git.saurik.com Git - apple/xnu.git/commitdiff
xnu-6153.11.26.tar.gz (tags: macos-1015, v6153.11.26)
author    Apple <opensource@apple.com>
          Wed, 5 Feb 2020 22:25:23 +0000 (22:25 +0000)
committer Apple <opensource@apple.com>
          Wed, 5 Feb 2020 22:25:23 +0000 (22:25 +0000)
1628 files changed:
.gitignore
EXTERNAL_HEADERS/Makefile
EXTERNAL_HEADERS/corecrypto/cc.h
EXTERNAL_HEADERS/corecrypto/cc_config.h
EXTERNAL_HEADERS/corecrypto/cc_error.h
EXTERNAL_HEADERS/corecrypto/cc_priv.h
EXTERNAL_HEADERS/corecrypto/cc_runtime_config.h
EXTERNAL_HEADERS/corecrypto/ccaes.h
EXTERNAL_HEADERS/corecrypto/cccmac.h
EXTERNAL_HEADERS/corecrypto/ccdigest.h
EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h
EXTERNAL_HEADERS/corecrypto/ccdrbg.h
EXTERNAL_HEADERS/corecrypto/cchmac.h
EXTERNAL_HEADERS/corecrypto/cckprng.h
EXTERNAL_HEADERS/corecrypto/ccmode.h
EXTERNAL_HEADERS/corecrypto/ccmode_factory.h
EXTERNAL_HEADERS/corecrypto/ccmode_siv.h
EXTERNAL_HEADERS/corecrypto/ccmode_siv_hmac.h [new file with mode: 0644]
EXTERNAL_HEADERS/corecrypto/ccn.h
EXTERNAL_HEADERS/corecrypto/ccrng.h
EXTERNAL_HEADERS/corecrypto/ccrng_system.h [deleted file]
EXTERNAL_HEADERS/corecrypto/ccrsa.h
EXTERNAL_HEADERS/corecrypto/ccsha1.h
EXTERNAL_HEADERS/corecrypto/ccsha2.h
EXTERNAL_HEADERS/corecrypto/cczp.h
EXTERNAL_HEADERS/img4/api.h
EXTERNAL_HEADERS/img4/environment.h
EXTERNAL_HEADERS/img4/img4.h
EXTERNAL_HEADERS/img4/nonce.h
EXTERNAL_HEADERS/img4/payload.h
EXTERNAL_HEADERS/mach-o/loader.h
EXTERNAL_HEADERS/ptrauth.h
EXTERNAL_HEADERS/stdatomic.h
EXTERNAL_HEADERS/stddef.h
EXTERNAL_HEADERS/sys/Makefile [new file with mode: 0644]
EXTERNAL_HEADERS/sys/_pthread/Makefile [new file with mode: 0644]
EXTERNAL_HEADERS/sys/_pthread/_pthread_types.h [new file with mode: 0644]
Makefile
README.md
SETUP/config/Makefile
SETUP/config/mkmakefile.c
SETUP/decomment/Makefile
SETUP/installfile/Makefile
SETUP/json_compilation_db/Makefile
SETUP/kextsymboltool/Makefile
SETUP/replacecontents/Makefile
SETUP/setsegname/Makefile
bsd/Makefile
bsd/arm/exec.h [deleted file]
bsd/arm/fasttrap_isa.h
bsd/arm/reboot.h [deleted file]
bsd/bsm/audit_fcntl.h
bsd/bsm/audit_kevents.h
bsd/conf/Makefile
bsd/conf/Makefile.template
bsd/conf/files
bsd/conf/param.c
bsd/dev/arm/conf.c
bsd/dev/arm/dtrace_isa.c
bsd/dev/arm/dtrace_subr_arm.c
bsd/dev/arm/fasttrap_isa.c
bsd/dev/arm/fbt_arm.c
bsd/dev/arm/kern_machdep.c
bsd/dev/arm/munge.c
bsd/dev/arm/pci_device.h [deleted file]
bsd/dev/arm/pio.h [deleted file]
bsd/dev/arm/sdt_arm.c
bsd/dev/arm/stubs.c
bsd/dev/arm/systemcalls.c
bsd/dev/arm/table_inline.h [deleted file]
bsd/dev/arm/unix_signal.c
bsd/dev/arm64/conf.c
bsd/dev/arm64/cpu_in_cksum.s
bsd/dev/arm64/disassembler.c
bsd/dev/arm64/dtrace_isa.c
bsd/dev/arm64/dtrace_subr_arm.c
bsd/dev/arm64/fasttrap_isa.c
bsd/dev/arm64/fbt_arm.c
bsd/dev/arm64/sdt_arm.c
bsd/dev/arm64/sysctl.c
bsd/dev/dtrace/blist.c
bsd/dev/dtrace/dtrace.c
bsd/dev/dtrace/dtrace_glue.c
bsd/dev/dtrace/dtrace_subr.c
bsd/dev/dtrace/dtrace_xoroshiro128_plus.c [new file with mode: 0644]
bsd/dev/dtrace/dtrace_xoroshiro128_plus.h [new file with mode: 0644]
bsd/dev/dtrace/fasttrap.c
bsd/dev/dtrace/fbt.c
bsd/dev/dtrace/fbt_blacklist.c [new file with mode: 0644]
bsd/dev/dtrace/lockprof.c
bsd/dev/dtrace/lockstat.c
bsd/dev/dtrace/profile_prvd.c
bsd/dev/dtrace/scripts/Makefile
bsd/dev/dtrace/scripts/ptrauth_arm64.d [new file with mode: 0644]
bsd/dev/dtrace/scripts/regs_arm.d
bsd/dev/dtrace/scripts/regs_arm64.d
bsd/dev/dtrace/scripts/regs_x86_64.d
bsd/dev/dtrace/scripts/unistd.d
bsd/dev/dtrace/sdt.c
bsd/dev/dtrace/sdt_subr.c
bsd/dev/dtrace/systrace.c
bsd/dev/dtrace/systrace.h
bsd/dev/i386/conf.c
bsd/dev/i386/dis_tables.c
bsd/dev/i386/dtrace_isa.c
bsd/dev/i386/dtrace_subr_x86.c
bsd/dev/i386/fasttrap_isa.c
bsd/dev/i386/fasttrap_regset.h
bsd/dev/i386/fbt_x86.c
bsd/dev/i386/instr_size.c
bsd/dev/i386/kern_machdep.c
bsd/dev/i386/sdt_x86.c
bsd/dev/i386/sysctl.c
bsd/dev/i386/systemcalls.c
bsd/dev/i386/unix_signal.c
bsd/dev/memdev.c
bsd/dev/monotonic.c
bsd/dev/vn/vn.c
bsd/i386/Makefile
bsd/i386/dis_tables.h
bsd/i386/exec.h [deleted file]
bsd/i386/fasttrap_isa.h
bsd/i386/limits.h
bsd/i386/reboot.h [deleted file]
bsd/kern/ast.h
bsd/kern/bsd_init.c
bsd/kern/bsd_stubs.c
bsd/kern/chunklist.c [new file with mode: 0644]
bsd/kern/chunklist.h
bsd/kern/decmpfs.c
bsd/kern/imageboot.c
bsd/kern/kdebug.c
bsd/kern/kern_aio.c
bsd/kern/kern_asl.c
bsd/kern/kern_backtrace.c
bsd/kern/kern_clock.c
bsd/kern/kern_credential.c
bsd/kern/kern_cs.c
bsd/kern/kern_descrip.c
bsd/kern/kern_event.c
bsd/kern/kern_exec.c
bsd/kern/kern_exit.c
bsd/kern/kern_fork.c
bsd/kern/kern_guarded.c
bsd/kern/kern_lockf.c
bsd/kern/kern_memorystatus.c
bsd/kern/kern_memorystatus_freeze.c [new file with mode: 0644]
bsd/kern/kern_memorystatus_notify.c [new file with mode: 0644]
bsd/kern/kern_mib.c
bsd/kern/kern_mman.c
bsd/kern/kern_newsysctl.c
bsd/kern/kern_ntptime.c
bsd/kern/kern_pcsamples.c [deleted file]
bsd/kern/kern_persona.c
bsd/kern/kern_proc.c
bsd/kern/kern_prot.c
bsd/kern/kern_resource.c
bsd/kern/kern_shutdown.c
bsd/kern/kern_sig.c
bsd/kern/kern_subr.c
bsd/kern/kern_symfile.c
bsd/kern/kern_sysctl.c
bsd/kern/kern_xxx.c
bsd/kern/kpi_mbuf.c
bsd/kern/kpi_socketfilter.c
bsd/kern/mach_fat.c
bsd/kern/mach_fat.h
bsd/kern/mach_loader.c
bsd/kern/mach_loader.h
bsd/kern/mach_process.c
bsd/kern/makesyscalls.sh
bsd/kern/mcache.c
bsd/kern/netboot.c
bsd/kern/policy_check.c
bsd/kern/posix_sem.c
bsd/kern/posix_shm.c
bsd/kern/proc_info.c
bsd/kern/stackshot.c
bsd/kern/subr_eventhandler.c
bsd/kern/subr_log.c
bsd/kern/subr_prf.c
bsd/kern/subr_prof.c [deleted file]
bsd/kern/subr_xxx.c
bsd/kern/sys_coalition.c
bsd/kern/sys_generic.c
bsd/kern/sys_persona.c
bsd/kern/sys_pipe.c
bsd/kern/sys_reason.c
bsd/kern/sys_socket.c
bsd/kern/sys_ulock.c
bsd/kern/syscalls.master
bsd/kern/sysv_msg.c
bsd/kern/sysv_sem.c
bsd/kern/sysv_shm.c
bsd/kern/trace_codes
bsd/kern/tty.c
bsd/kern/tty_compat.c
bsd/kern/tty_dev.c
bsd/kern/tty_ptmx.c
bsd/kern/ubc_subr.c
bsd/kern/uipc_domain.c
bsd/kern/uipc_mbuf.c
bsd/kern/uipc_mbuf2.c
bsd/kern/uipc_socket.c
bsd/kern/uipc_socket2.c
bsd/kern/uipc_syscalls.c
bsd/kern/uipc_usrreq.c
bsd/libkern/copyio.h
bsd/libkern/libkern.h
bsd/machine/Makefile
bsd/machine/exec.h
bsd/machine/reboot.h [deleted file]
bsd/man/man2/access.2
bsd/man/man2/chflags.2
bsd/man/man2/fcntl.2
bsd/man/man2/fs_snapshot_create.2
bsd/man/man2/fsgetpath.2
bsd/man/man2/getattrlist.2
bsd/man/man2/getattrlistbulk.2
bsd/man/man2/getdirentriesattr.2
bsd/man/man2/kqueue.2
bsd/man/man2/mkdir.2
bsd/man/man2/mkfifo.2
bsd/man/man2/open.2
bsd/man/man2/read.2
bsd/man/man2/rename.2
bsd/man/man2/shmctl.2
bsd/man/man2/stat.2
bsd/man/man2/symlink.2
bsd/man/man2/vfork.2
bsd/man/man3/Makefile
bsd/man/man3/getiopolicy_np.3
bsd/man/man3/posix_spawn_file_actions_addclose.3
bsd/man/man3/posix_spawnattr_setflags.3
bsd/miscfs/deadfs/dead_vnops.c
bsd/miscfs/devfs/devfs_fdesc_support.c
bsd/miscfs/devfs/devfs_tree.c
bsd/miscfs/devfs/devfs_vfsops.c
bsd/miscfs/devfs/devfs_vnops.c
bsd/miscfs/devfs/devfsdefs.h
bsd/miscfs/fifofs/fifo_vnops.c
bsd/miscfs/mockfs/mockfs_vnops.c
bsd/miscfs/nullfs/null_vfsops.c
bsd/miscfs/nullfs/null_vnops.c
bsd/miscfs/nullfs/nullfs.h
bsd/miscfs/routefs/routefs_ops.c
bsd/miscfs/specfs/spec_vnops.c
bsd/miscfs/specfs/specdev.h
bsd/net/Makefile
bsd/net/bpf.c
bsd/net/bpf_filter.c
bsd/net/cc.h [new file with mode: 0644]
bsd/net/classq/classq.c
bsd/net/classq/classq.h
bsd/net/classq/classq_fq_codel.c
bsd/net/classq/classq_fq_codel.h
bsd/net/classq/classq_sfb.c
bsd/net/classq/classq_subr.c
bsd/net/classq/if_classq.h
bsd/net/content_filter.c
bsd/net/content_filter.h
bsd/net/content_filter_crypto.c [new file with mode: 0644]
bsd/net/content_filter_crypto.h [new file with mode: 0644]
bsd/net/contiki-conf.h [new file with mode: 0644]
bsd/net/contiki-default-conf.h [new file with mode: 0644]
bsd/net/contiki-lib.h [new file with mode: 0644]
bsd/net/contiki-net.h [new file with mode: 0644]
bsd/net/contiki-version.h [new file with mode: 0644]
bsd/net/contiki.h [new file with mode: 0644]
bsd/net/dlil.c
bsd/net/dlil.h
bsd/net/ether_if_module.c
bsd/net/ethernet.h
bsd/net/flowadv.c
bsd/net/frame802154.c [new file with mode: 0644]
bsd/net/frame802154.h [new file with mode: 0644]
bsd/net/if.c
bsd/net/if.h
bsd/net/if_6lowpan.c [new file with mode: 0644]
bsd/net/if_6lowpan_var.h [new file with mode: 0644]
bsd/net/if_bond.c
bsd/net/if_bridge.c
bsd/net/if_fake.c
bsd/net/if_headless.c [new file with mode: 0644]
bsd/net/if_ipsec.c
bsd/net/if_ipsec.h
bsd/net/if_low_power_mode.c
bsd/net/if_media.h
bsd/net/if_mib.c
bsd/net/if_mib.h
bsd/net/if_types.h
bsd/net/if_utun.c
bsd/net/if_utun.h
bsd/net/if_var.h
bsd/net/if_vlan.c
bsd/net/kpi_interface.c
bsd/net/kpi_interface.h
bsd/net/linkaddr.c [new file with mode: 0644]
bsd/net/linkaddr.h [new file with mode: 0644]
bsd/net/multi_layer_pkt_log.c [new file with mode: 0644]
bsd/net/multi_layer_pkt_log.h [new file with mode: 0644]
bsd/net/nat464_utils.c
bsd/net/necp.c
bsd/net/necp.h
bsd/net/necp_client.c
bsd/net/net_log_common.h [new file with mode: 0644]
bsd/net/net_str_id.c
bsd/net/net_stubs.c
bsd/net/netsrc.c
bsd/net/network_agent.c
bsd/net/network_agent.h
bsd/net/ntstat.c
bsd/net/ntstat.h
bsd/net/nwk_wq.c
bsd/net/packet_mangler.c
bsd/net/pf.c
bsd/net/pf_if.c
bsd/net/pf_ioctl.c
bsd/net/pf_norm.c
bsd/net/pf_pbuf.c
bsd/net/pf_pbuf.h
bsd/net/pf_table.c
bsd/net/pfkeyv2.h
bsd/net/pfvar.h
bsd/net/pktap.c
bsd/net/pktsched/pktsched.c
bsd/net/pktsched/pktsched.h
bsd/net/pktsched/pktsched_fq_codel.c
bsd/net/pktsched/pktsched_netem.c [new file with mode: 0644]
bsd/net/pktsched/pktsched_netem.h [new file with mode: 0644]
bsd/net/pktsched/pktsched_qfq.c
bsd/net/pktsched/pktsched_tcq.c
bsd/net/restricted_in_port.c [new file with mode: 0644]
bsd/net/restricted_in_port.h [new file with mode: 0644]
bsd/net/route.c
bsd/net/route.h
bsd/net/rtsock.c
bsd/net/sixxlowpan.c [new file with mode: 0644]
bsd/net/sixxlowpan.h [new file with mode: 0644]
bsd/net/skywalk_stubs.c
bsd/netinet/Makefile
bsd/netinet/cbrtf.c
bsd/netinet/dhcp_options.c
bsd/netinet/flow_divert.c
bsd/netinet/flow_divert_proto.h
bsd/netinet/igmp.c
bsd/netinet/in.c
bsd/netinet/in.h
bsd/netinet/in_arp.c
bsd/netinet/in_mcast.c
bsd/netinet/in_pcb.c
bsd/netinet/in_pcb.h
bsd/netinet/in_pcblist.c
bsd/netinet/in_proto.c
bsd/netinet/in_stat.c
bsd/netinet/in_systm.h
bsd/netinet/in_tclass.c
bsd/netinet/in_tclass.h
bsd/netinet/in_var.h
bsd/netinet/ip_divert.c
bsd/netinet/ip_dummynet.c
bsd/netinet/ip_dummynet.h
bsd/netinet/ip_encap.c
bsd/netinet/ip_flowid.h
bsd/netinet/ip_fw2.c
bsd/netinet/ip_fw2_compat.c
bsd/netinet/ip_icmp.c
bsd/netinet/ip_input.c
bsd/netinet/ip_output.c
bsd/netinet/ip_var.h
bsd/netinet/kpi_ipfilter.c
bsd/netinet/kpi_ipfilter.h
bsd/netinet/mp_pcb.c
bsd/netinet/mp_pcb.h
bsd/netinet/mptcp.c
bsd/netinet/mptcp.h
bsd/netinet/mptcp_opt.c
bsd/netinet/mptcp_opt.h
bsd/netinet/mptcp_subr.c
bsd/netinet/mptcp_timer.c
bsd/netinet/mptcp_timer.h
bsd/netinet/mptcp_usrreq.c
bsd/netinet/mptcp_var.h
bsd/netinet/raw_ip.c
bsd/netinet/tcp.h
bsd/netinet/tcp_cache.c
bsd/netinet/tcp_cache.h
bsd/netinet/tcp_cc.c
bsd/netinet/tcp_cc.h
bsd/netinet/tcp_cubic.c
bsd/netinet/tcp_input.c
bsd/netinet/tcp_log.c [new file with mode: 0644]
bsd/netinet/tcp_log.h [new file with mode: 0644]
bsd/netinet/tcp_lro.c
bsd/netinet/tcp_output.c
bsd/netinet/tcp_subr.c
bsd/netinet/tcp_timer.c
bsd/netinet/tcp_usrreq.c
bsd/netinet/tcp_var.h
bsd/netinet/udp_usrreq.c
bsd/netinet6/Makefile
bsd/netinet6/ah_core.c
bsd/netinet6/ah_input.c
bsd/netinet6/ah_output.c
bsd/netinet6/esp.h
bsd/netinet6/esp_core.c
bsd/netinet6/esp_input.c
bsd/netinet6/esp_output.c
bsd/netinet6/frag6.c
bsd/netinet6/icmp6.c
bsd/netinet6/in6.c
bsd/netinet6/in6.h
bsd/netinet6/in6_ifattach.c
bsd/netinet6/in6_mcast.c
bsd/netinet6/in6_pcb.c
bsd/netinet6/in6_proto.c
bsd/netinet6/in6_src.c
bsd/netinet6/in6_var.h
bsd/netinet6/ip6_forward.c
bsd/netinet6/ip6_id.c
bsd/netinet6/ip6_input.c
bsd/netinet6/ip6_output.c
bsd/netinet6/ip6_var.h
bsd/netinet6/ip6protosw.h
bsd/netinet6/ipcomp.h
bsd/netinet6/ipcomp6.h [deleted file]
bsd/netinet6/ipcomp_core.c [deleted file]
bsd/netinet6/ipcomp_input.c [deleted file]
bsd/netinet6/ipcomp_output.c [deleted file]
bsd/netinet6/ipsec.c
bsd/netinet6/ipsec.h
bsd/netinet6/mld6.c
bsd/netinet6/nd6.c
bsd/netinet6/nd6.h
bsd/netinet6/nd6_nbr.c
bsd/netinet6/nd6_prproxy.c
bsd/netinet6/nd6_rtr.c
bsd/netinet6/raw_ip6.c
bsd/netinet6/udp6_output.c
bsd/netinet6/udp6_usrreq.c
bsd/netkey/key.c
bsd/netkey/key_debug.c
bsd/netkey/keydb.h
bsd/netkey/keysock.c
bsd/nfs/gss/gss_krb5_mech.c
bsd/nfs/nfs.h
bsd/nfs/nfs4_subs.c
bsd/nfs/nfs4_vnops.c
bsd/nfs/nfs_bio.c
bsd/nfs/nfs_boot.c
bsd/nfs/nfs_gss.c
bsd/nfs/nfs_ioctl.h
bsd/nfs/nfs_node.c
bsd/nfs/nfs_serv.c
bsd/nfs/nfs_socket.c
bsd/nfs/nfs_subs.c
bsd/nfs/nfs_syscalls.c
bsd/nfs/nfs_upcall.c
bsd/nfs/nfs_vfsops.c
bsd/nfs/nfs_vnops.c
bsd/nfs/nfsm_subs.h
bsd/nfs/nfsmount.h
bsd/nfs/nfsnode.h
bsd/nfs/nfsproto.h
bsd/nfs/rpcv2.h
bsd/nfs/xdr_subs.h
bsd/pthread/Makefile
bsd/pthread/bsdthread_private.h
bsd/pthread/priority_private.h
bsd/pthread/pthread_shims.c
bsd/pthread/pthread_workqueue.c
bsd/pthread/workqueue_internal.h
bsd/pthread/workqueue_syscalls.h
bsd/security/audit/audit_bsm.c
bsd/security/audit/audit_bsm_domain.c
bsd/security/audit/audit_bsm_errno.c
bsd/security/audit/audit_bsm_fcntl.c
bsd/security/audit/audit_pipe.c
bsd/security/audit/audit_private.h
bsd/security/audit/audit_session.c
bsd/security/audit/audit_syscalls.c
bsd/sys/Makefile
bsd/sys/_types.h
bsd/sys/_types/Makefile
bsd/sys/_types/_guid_t.h
bsd/sys/attr.h
bsd/sys/bitstring.h
bsd/sys/bsdtask_info.h
bsd/sys/buf.h
bsd/sys/buf_internal.h
bsd/sys/cdefs.h
bsd/sys/coalition.h
bsd/sys/codesign.h
bsd/sys/commpage.h
bsd/sys/decmpfs.h
bsd/sys/dirent.h
bsd/sys/disk.h
bsd/sys/domain.h
bsd/sys/dtrace.h
bsd/sys/dtrace_glue.h
bsd/sys/dtrace_impl.h
bsd/sys/errno.h
bsd/sys/event.h
bsd/sys/eventhandler.h
bsd/sys/eventvar.h
bsd/sys/fasttrap.h
bsd/sys/fasttrap_impl.h
bsd/sys/fbt.h
bsd/sys/fcntl.h
bsd/sys/file.h
bsd/sys/file_internal.h
bsd/sys/filedesc.h
bsd/sys/fsctl.h
bsd/sys/fsevents.h
bsd/sys/fsgetpath.h
bsd/sys/gmon.h
bsd/sys/guarded.h
bsd/sys/imageboot.h
bsd/sys/imgact.h
bsd/sys/kasl.h
bsd/sys/kauth.h
bsd/sys/kdebug.h
bsd/sys/kdebug_kernel.h [new file with mode: 0644]
bsd/sys/kdebug_signpost.h
bsd/sys/kern_memorystatus.h
bsd/sys/kern_memorystatus_freeze.h [new file with mode: 0644]
bsd/sys/kern_memorystatus_notify.h [new file with mode: 0644]
bsd/sys/kern_sysctl.h [new file with mode: 0644]
bsd/sys/kernel.h
bsd/sys/kernel_types.h
bsd/sys/kpi_mbuf.h
bsd/sys/kpi_socket.h
bsd/sys/kpi_socketfilter.h
bsd/sys/lockf.h
bsd/sys/lockstat.h
bsd/sys/log_data.h [new file with mode: 0644]
bsd/sys/make_symbol_aliasing.sh
bsd/sys/mbuf.h
bsd/sys/mcache.h
bsd/sys/mman.h
bsd/sys/monotonic.h
bsd/sys/mount.h
bsd/sys/mount_internal.h
bsd/sys/namei.h
bsd/sys/persona.h
bsd/sys/pipe.h
bsd/sys/priv.h
bsd/sys/proc.h
bsd/sys/proc_info.h
bsd/sys/proc_internal.h
bsd/sys/process_policy.h
bsd/sys/protosw.h
bsd/sys/pthread_shims.h
bsd/sys/queue.h
bsd/sys/quota.h
bsd/sys/reason.h
bsd/sys/reboot.h
bsd/sys/resource.h
bsd/sys/resourcevar.h
bsd/sys/sdt_impl.h
bsd/sys/select.h
bsd/sys/signalvar.h
bsd/sys/socket.h
bsd/sys/socketvar.h
bsd/sys/sockio.h
bsd/sys/spawn.h
bsd/sys/spawn_internal.h
bsd/sys/stat.h
bsd/sys/sysctl.h
bsd/sys/sysent.h
bsd/sys/systm.h
bsd/sys/tty.h
bsd/sys/ttycom.h
bsd/sys/ubc.h
bsd/sys/ucred.h
bsd/sys/uio_internal.h
bsd/sys/ulock.h
bsd/sys/user.h
bsd/sys/vnode.h
bsd/sys/vnode_if.h
bsd/sys/vnode_internal.h
bsd/sys/work_interval.h
bsd/sys_private/Makefile [new file with mode: 0644]
bsd/sys_private/kdebug_private.h [new file with mode: 0644]
bsd/tests/bsd_tests.c
bsd/tests/copyio_tests.c [new file with mode: 0644]
bsd/tests/pmap_test_sysctl.c
bsd/vfs/kpi_vfs.c
bsd/vfs/vfs_attrlist.c
bsd/vfs/vfs_bio.c
bsd/vfs/vfs_cache.c
bsd/vfs/vfs_cluster.c
bsd/vfs/vfs_conf.c
bsd/vfs/vfs_disk_conditioner.c
bsd/vfs/vfs_fsevents.c
bsd/vfs/vfs_init.c
bsd/vfs/vfs_lookup.c
bsd/vfs/vfs_subr.c
bsd/vfs/vfs_syscalls.c
bsd/vfs/vfs_vnops.c
bsd/vfs/vnode_if.c
bsd/vfs/vnode_if.sh
bsd/vm/vm_unix.c
bsd/vm/vnode_pager.c
config/BSDKernel.exports
config/IOKit.arm.exports
config/IOKit.arm64.exports
config/IOKit.exports
config/IOKit.x86_64.exports
config/Libkern.arm.exports
config/Libkern.arm64.exports
config/Libkern.exports
config/Libkern.x86_64.exports
config/MACFramework.exports
config/MASTER
config/MASTER.arm
config/MASTER.arm64
config/MASTER.arm64.bcm2837
config/MASTER.x86_64
config/Mach.exports
config/Makefile
config/MasterVersion
config/Private.arm.exports
config/Private.arm64.exports
config/Private.exports
config/Private.x86_64.exports
config/Unsupported.exports
config/Unused.arm.exports [new file with mode: 0644]
config/Unused.arm64.exports [new file with mode: 0644]
config/Unused.exports
config/generate_linker_exports.sh
doc/atomics.md [new file with mode: 0644]
iokit/DriverKit/IOBufferMemoryDescriptor.iig [new file with mode: 0644]
iokit/DriverKit/IODataQueueDispatchSource.iig [new file with mode: 0644]
iokit/DriverKit/IODispatchQueue.iig [new file with mode: 0644]
iokit/DriverKit/IODispatchSource.iig [new file with mode: 0644]
iokit/DriverKit/IOInterruptDispatchSource.iig [new file with mode: 0644]
iokit/DriverKit/IOMemoryDescriptor.iig [new file with mode: 0644]
iokit/DriverKit/IOMemoryMap.iig [new file with mode: 0644]
iokit/DriverKit/IORPC.h [new file with mode: 0644]
iokit/DriverKit/IOReturn.h [new file with mode: 0644]
iokit/DriverKit/IOService.iig [new file with mode: 0644]
iokit/DriverKit/IOTypes.h [new file with mode: 0644]
iokit/DriverKit/IOUserClient.iig [new file with mode: 0644]
iokit/DriverKit/IOUserServer.iig [new file with mode: 0644]
iokit/DriverKit/Makefile [new file with mode: 0644]
iokit/DriverKit/OSAction.iig [new file with mode: 0644]
iokit/DriverKit/OSObject.iig [new file with mode: 0644]
iokit/Families/IOSystemManagement/IOWatchDogTimer.cpp
iokit/IOKit/IOBSD.h
iokit/IOKit/IOBufferMemoryDescriptor.h
iokit/IOKit/IOCPU.h
iokit/IOKit/IOCatalogue.h
iokit/IOKit/IOCommand.h
iokit/IOKit/IOCommandGate.h
iokit/IOKit/IOCommandPool.h
iokit/IOKit/IOCommandQueue.h
iokit/IOKit/IOConditionLock.h
iokit/IOKit/IODMACommand.h
iokit/IOKit/IODMAEventSource.h
iokit/IOKit/IODataQueue.h
iokit/IOKit/IODeviceMemory.h
iokit/IOKit/IODeviceTreeSupport.h
iokit/IOKit/IOEventSource.h
iokit/IOKit/IOFilterInterruptEventSource.h
iokit/IOKit/IOInterruptAccountingPrivate.h
iokit/IOKit/IOInterruptEventSource.h
iokit/IOKit/IOKitDebug.h
iokit/IOKit/IOKitDiagnosticsUserClient.h
iokit/IOKit/IOKitKeys.h
iokit/IOKit/IOKitServer.h
iokit/IOKit/IOLib.h
iokit/IOKit/IOLocks.h
iokit/IOKit/IOMemoryCursor.h
iokit/IOKit/IOMemoryDescriptor.h
iokit/IOKit/IOMessage.h
iokit/IOKit/IOMultiMemoryDescriptor.h
iokit/IOKit/IONVRAM.h
iokit/IOKit/IONotifier.h
iokit/IOKit/IOPlatformExpert.h
iokit/IOKit/IOPolledInterface.h
iokit/IOKit/IORPC.h [new file with mode: 0644]
iokit/IOKit/IORangeAllocator.h
iokit/IOKit/IORegistryEntry.h
iokit/IOKit/IOReturn.h
iokit/IOKit/IOService.h
iokit/IOKit/IOServicePM.h
iokit/IOKit/IOSharedDataQueue.h
iokit/IOKit/IOSubMemoryDescriptor.h
iokit/IOKit/IOSyncer.h
iokit/IOKit/IOTimerEventSource.h
iokit/IOKit/IOTypes.h
iokit/IOKit/IOUserClient.h
iokit/IOKit/IOUserServer.h [new file with mode: 0644]
iokit/IOKit/IOWorkLoop.h
iokit/IOKit/Makefile
iokit/IOKit/machine/Makefile
iokit/IOKit/nvram/Makefile
iokit/IOKit/perfcontrol/IOPerfControl.h
iokit/IOKit/perfcontrol/Makefile
iokit/IOKit/platform/Makefile
iokit/IOKit/power/IOPwrController.h
iokit/IOKit/power/Makefile
iokit/IOKit/pwr_mgt/IOPM.h
iokit/IOKit/pwr_mgt/IOPMPowerSource.h
iokit/IOKit/pwr_mgt/IOPMPowerSourceList.h
iokit/IOKit/pwr_mgt/IOPMPrivate.h
iokit/IOKit/pwr_mgt/IOPMinformee.h
iokit/IOKit/pwr_mgt/IOPMinformeeList.h
iokit/IOKit/pwr_mgt/IOPowerConnection.h
iokit/IOKit/pwr_mgt/Makefile
iokit/IOKit/pwr_mgt/RootDomain.h
iokit/IOKit/rtc/IORTCController.h
iokit/IOKit/rtc/Makefile
iokit/IOKit/system_management/Makefile
iokit/IOKitUser/IOBlockStorageDevice.h [new file with mode: 0644]
iokit/IOKitUser/IOBufferMemoryDescriptor.h [new file with mode: 0644]
iokit/IOKitUser/IODataQueueDispatchSource.h [new file with mode: 0644]
iokit/IOKitUser/IODispatchQueue.h [new file with mode: 0644]
iokit/IOKitUser/IODispatchSource.h [new file with mode: 0644]
iokit/IOKitUser/IOInterruptDispatchSource.h [new file with mode: 0644]
iokit/IOKitUser/IOMemoryDescriptor.h [new file with mode: 0644]
iokit/IOKitUser/IOMemoryMap.h [new file with mode: 0644]
iokit/IOKitUser/IOService.h [new file with mode: 0644]
iokit/IOKitUser/IOTimerDispatchSource.h [new file with mode: 0644]
iokit/IOKitUser/IOUserServer.h [new file with mode: 0644]
iokit/IOKitUser/Makefile [new file with mode: 0644]
iokit/IOKitUser/OSAction.h [new file with mode: 0644]
iokit/IOKitUser/OSArray.h [new file with mode: 0644]
iokit/IOKitUser/OSBoolean.h [new file with mode: 0644]
iokit/IOKitUser/OSCollection.h [new file with mode: 0644]
iokit/IOKitUser/OSContainer.h [new file with mode: 0644]
iokit/IOKitUser/OSData.h [new file with mode: 0644]
iokit/IOKitUser/OSDictionary.h [new file with mode: 0644]
iokit/IOKitUser/OSNumber.h [new file with mode: 0644]
iokit/IOKitUser/OSObject.h [new file with mode: 0644]
iokit/IOKitUser/OSSerialization.h [new file with mode: 0644]
iokit/IOKitUser/OSString.h [new file with mode: 0644]
iokit/Kernel/IOBufferMemoryDescriptor.cpp
iokit/Kernel/IOCPU.cpp
iokit/Kernel/IOCatalogue.cpp
iokit/Kernel/IOCommandGate.cpp
iokit/Kernel/IOCommandPool.cpp
iokit/Kernel/IOCommandQueue.cpp
iokit/Kernel/IOConditionLock.cpp
iokit/Kernel/IODMACommand.cpp
iokit/Kernel/IODMAController.cpp
iokit/Kernel/IODMAEventSource.cpp
iokit/Kernel/IODataQueue.cpp
iokit/Kernel/IODeviceMemory.cpp
iokit/Kernel/IODeviceTreeSupport.cpp
iokit/Kernel/IOEventSource.cpp
iokit/Kernel/IOFilterInterruptEventSource.cpp
iokit/Kernel/IOHibernateIO.cpp
iokit/Kernel/IOHistogramReporter.cpp
iokit/Kernel/IOInterleavedMemoryDescriptor.cpp
iokit/Kernel/IOInterruptController.cpp
iokit/Kernel/IOInterruptEventSource.cpp
iokit/Kernel/IOKitDebug.cpp
iokit/Kernel/IOKitKernelInternal.h
iokit/Kernel/IOLib.cpp
iokit/Kernel/IOLocks.cpp
iokit/Kernel/IOMemoryCursor.cpp
iokit/Kernel/IOMemoryDescriptor.cpp
iokit/Kernel/IOMultiMemoryDescriptor.cpp
iokit/Kernel/IONVRAM.cpp
iokit/Kernel/IOPMPowerSource.cpp
iokit/Kernel/IOPMPowerStateQueue.h
iokit/Kernel/IOPMrootDomain.cpp
iokit/Kernel/IOPerfControl.cpp
iokit/Kernel/IOPlatformExpert.cpp
iokit/Kernel/IOPolledInterface.cpp
iokit/Kernel/IORangeAllocator.cpp
iokit/Kernel/IORegistryEntry.cpp
iokit/Kernel/IOService.cpp
iokit/Kernel/IOServicePM.cpp
iokit/Kernel/IOServicePMPrivate.h
iokit/Kernel/IOServicePrivate.h
iokit/Kernel/IOSharedDataQueue.cpp
iokit/Kernel/IOStartIOKit.cpp
iokit/Kernel/IOStatistics.cpp
iokit/Kernel/IOSubMemoryDescriptor.cpp
iokit/Kernel/IOSyncer.cpp
iokit/Kernel/IOTimerEventSource.cpp
iokit/Kernel/IOUserClient.cpp
iokit/Kernel/IOUserServer.cpp [new file with mode: 0644]
iokit/Kernel/IOWorkLoop.cpp
iokit/Kernel/RootDomainUserClient.cpp
iokit/Kernel/RootDomainUserClient.h
iokit/KernelConfigTables.cpp
iokit/Makefile
iokit/System/IODataQueueDispatchSourceShared.h [new file with mode: 0644]
iokit/System/Makefile [new file with mode: 0644]
iokit/Tests/Tests.cpp
iokit/bsddev/DINetBootHook.cpp
iokit/bsddev/IOKitBSDInit.cpp
iokit/bsddev/skywalk/IOSkywalkSupport.cpp
iokit/conf/Makefile
iokit/conf/Makefile.template
iokit/conf/files
libkdd/kcdata.h
libkdd/kcdtypes.c
libkdd/kdd.xcodeproj/project.pbxproj
libkdd/tests/Tests.swift
libkdd/tests/kdd_bridge.h
libkdd/tests/stackshot-sample-dispatch-queue-label [new file with mode: 0644]
libkdd/tests/stackshot-sample-dispatch-queue-label.plist.gz [new file with mode: 0644]
libkdd/tests/stackshot-sample-turnstileinfo [new file with mode: 0644]
libkdd/tests/stackshot-sample-turnstileinfo.plist.gz [new file with mode: 0644]
libkdd/xnu.libkdd.plist [new file with mode: 0644]
libkern/OSKextLib.cpp
libkern/c++/OSArray.cpp
libkern/c++/OSBoolean.cpp
libkern/c++/OSCollectionIterator.cpp
libkern/c++/OSCompat.cpp [new file with mode: 0644]
libkern/c++/OSData.cpp
libkern/c++/OSDictionary.cpp
libkern/c++/OSKext.cpp
libkern/c++/OSMetaClass.cpp
libkern/c++/OSNumber.cpp
libkern/c++/OSObject.cpp
libkern/c++/OSOrderedSet.cpp
libkern/c++/OSRuntime.cpp
libkern/c++/OSSerialize.cpp
libkern/c++/OSSerializeBinary.cpp
libkern/c++/OSSet.cpp
libkern/c++/OSString.cpp
libkern/c++/OSSymbol.cpp
libkern/c++/OSUnserialize.y
libkern/c++/OSUnserializeXML.cpp
libkern/c++/OSUnserializeXML.y
libkern/conf/Makefile
libkern/conf/Makefile.template
libkern/conf/files
libkern/crypto/corecrypto_aes.c
libkern/crypto/corecrypto_aesxts.c
libkern/crypto/corecrypto_des.c
libkern/firehose/Makefile
libkern/firehose/chunk_private.h
libkern/firehose/firehose_types_private.h
libkern/gen/OSAtomicOperations.c
libkern/gen/OSDebug.cpp
libkern/kxld/kxld.c
libkern/kxld/kxld_array.c
libkern/kxld/kxld_demangle.h
libkern/kxld/kxld_object.c
libkern/kxld/kxld_reloc.c
libkern/kxld/kxld_sym.c
libkern/kxld/kxld_util.c
libkern/kxld/kxld_util.h
libkern/libclosure/runtime.cpp
libkern/libkern/Makefile
libkern/libkern/OSAtomic.h
libkern/libkern/OSKextLib.h
libkern/libkern/OSKextLibPrivate.h
libkern/libkern/OSSerializeBinary.h
libkern/libkern/c++/Makefile
libkern/libkern/c++/OSArray.h
libkern/libkern/c++/OSBoolean.h
libkern/libkern/c++/OSCollection.h
libkern/libkern/c++/OSCollectionIterator.h
libkern/libkern/c++/OSData.h
libkern/libkern/c++/OSDictionary.h
libkern/libkern/c++/OSIterator.h
libkern/libkern/c++/OSKext.h
libkern/libkern/c++/OSLib.h
libkern/libkern/c++/OSMetaClass.h
libkern/libkern/c++/OSNumber.h
libkern/libkern/c++/OSObject.h
libkern/libkern/c++/OSOrderedSet.h
libkern/libkern/c++/OSPtr.h [new file with mode: 0644]
libkern/libkern/c++/OSSerialize.h
libkern/libkern/c++/OSSet.h
libkern/libkern/c++/OSString.h
libkern/libkern/c++/OSSymbol.h
libkern/libkern/c++/OSUnserialize.h
libkern/libkern/crypto/des.h
libkern/libkern/i386/Makefile
libkern/libkern/img4/interface.h
libkern/libkern/kext_panic_report.h
libkern/libkern/kext_request_keys.h
libkern/libkern/machine/Makefile
libkern/libkern/stack_protector.h
libkern/libkern/tree.h
libkern/os/Makefile
libkern/os/base.h
libkern/os/cpp_util.h [new file with mode: 0644]
libkern/os/log.c
libkern/os/log.h
libkern/os/log_encode_types.h
libkern/os/overflow.h
libkern/os/ptrtools.h [new file with mode: 0644]
libkern/os/reason_private.h
libkern/os/refcnt.c
libkern/os/refcnt.h
libkern/os/refcnt_internal.h [new file with mode: 0644]
libkern/os/smart_ptr.h [new file with mode: 0644]
libsa/bootstrap.cpp
libsa/conf/Makefile
libsa/conf/Makefile.template
libsa/nonlto.c [new file with mode: 0644]
libsyscall/Libsyscall.xcconfig
libsyscall/Libsyscall.xcodeproj/project.pbxproj
libsyscall/Platforms/DriverKit/x86_64/syscall.map [new file with mode: 0644]
libsyscall/custom/SYS.h
libsyscall/mach/abort.h
libsyscall/mach/err_iokit.sub
libsyscall/mach/err_mach_ipc.sub
libsyscall/mach/error_codes.c
libsyscall/mach/errorlib.h
libsyscall/mach/exc_catcher.c
libsyscall/mach/exc_catcher_state.c
libsyscall/mach/exc_catcher_state_identity.c
libsyscall/mach/host.c
libsyscall/mach/mach_msg.c
libsyscall/mach/mach_port.c
libsyscall/mach/mach_vm.c
libsyscall/mach/memory_entry.defs [new file with mode: 0644]
libsyscall/mach/port_descriptions.c
libsyscall/mach/restartable.defs [new file with mode: 0644]
libsyscall/mach/slot_name.c
libsyscall/os/log_data.c [new file with mode: 0644]
libsyscall/os/proc.h [new file with mode: 0644]
libsyscall/os/tsd.h
libsyscall/wrappers/cancelable/fcntl-base.c
libsyscall/wrappers/coalition.c
libsyscall/wrappers/gethostuuid.c
libsyscall/wrappers/getiopolicy_np.c
libsyscall/wrappers/kdebug_trace.c
libsyscall/wrappers/libproc/libproc.c
libsyscall/wrappers/libproc/libproc_internal.h
libsyscall/wrappers/mach_absolute_time.s
libsyscall/wrappers/mach_continuous_time.c
libsyscall/wrappers/mach_get_times.c
libsyscall/wrappers/persona.c
libsyscall/wrappers/proc.c [new file with mode: 0644]
libsyscall/wrappers/quota_obsolete.c
libsyscall/wrappers/skywalk/os_channel.c
libsyscall/wrappers/skywalk/os_channel_event.c [new file with mode: 0644]
libsyscall/wrappers/spawn/posix_spawn.c
libsyscall/wrappers/spawn/spawn.h
libsyscall/wrappers/spawn/spawn_private.h
libsyscall/wrappers/terminate_with_reason.c
libsyscall/xcodescripts/compile-syscalls.pl
libsyscall/xcodescripts/create-syscalls.pl
libsyscall/xcodescripts/mach_install_mig.sh
makedefs/MakeInc.cmd
makedefs/MakeInc.def
makedefs/MakeInc.kernel
makedefs/MakeInc.rule
makedefs/MakeInc.top
osfmk/Makefile
osfmk/UserNotification/KUNCUserNotifications.c
osfmk/UserNotification/Makefile
osfmk/arm/Makefile
osfmk/arm/arm_init.c
osfmk/arm/arm_timer.c
osfmk/arm/arm_vm_init.c
osfmk/arm/atomic.h
osfmk/arm/caches.c
osfmk/arm/caches_asm.s
osfmk/arm/commpage/commpage.c
osfmk/arm/commpage/commpage.h
osfmk/arm/cpu.c
osfmk/arm/cpu_capabilities.h
osfmk/arm/cpu_common.c
osfmk/arm/cpu_data.h
osfmk/arm/cpu_data_internal.h
osfmk/arm/cpu_internal.h
osfmk/arm/cpuid.c
osfmk/arm/cpuid.h
osfmk/arm/genassym.c
osfmk/arm/io_map.c
osfmk/arm/io_map_entries.h
osfmk/arm/kpc_arm.c
osfmk/arm/locks.h
osfmk/arm/locks_arm.c
osfmk/arm/locore.s
osfmk/arm/loose_ends.c
osfmk/arm/machine_cpuid.c
osfmk/arm/machine_routines.c
osfmk/arm/machine_routines.h
osfmk/arm/machine_routines_asm.s
osfmk/arm/machine_routines_common.c
osfmk/arm/memory_types.h [new file with mode: 0644]
osfmk/arm/misc_protos.h
osfmk/arm/model_dep.c
osfmk/arm/monotonic_arm.c
osfmk/arm/pal_routines.h
osfmk/arm/pcb.c
osfmk/arm/pmap.c
osfmk/arm/pmap.h
osfmk/arm/proc_reg.h
osfmk/arm/simple_lock.h
osfmk/arm/start.s
osfmk/arm/status.c
osfmk/arm/task.h
osfmk/arm/thread.h
osfmk/arm/tlb.h [new file with mode: 0644]
osfmk/arm/trap.c
osfmk/arm/trap.h
osfmk/arm/xpr.h [deleted file]
osfmk/arm64/Makefile
osfmk/arm64/arm_vm_init.c
osfmk/arm64/asm.h
osfmk/arm64/bsd_arm64.c
osfmk/arm64/caches_asm.s
osfmk/arm64/copyio.c
osfmk/arm64/cpu.c
osfmk/arm64/cswitch.s
osfmk/arm64/dbgwrap.c
osfmk/arm64/exception_asm.h [new file with mode: 0644]
osfmk/arm64/genassym.c
osfmk/arm64/gxf_exceptions.s [new file with mode: 0644]
osfmk/arm64/kpc.c
osfmk/arm64/locore.s
osfmk/arm64/loose_ends.c
osfmk/arm64/machine_remote_time.c
osfmk/arm64/machine_remote_time.h
osfmk/arm64/machine_routines.c
osfmk/arm64/machine_routines_asm.h [new file with mode: 0644]
osfmk/arm64/machine_routines_asm.s
osfmk/arm64/monotonic_arm64.c
osfmk/arm64/pcb.c
osfmk/arm64/platform_tests.c
osfmk/arm64/platform_tests_asm.s [new file with mode: 0644]
osfmk/arm64/proc_reg.h
osfmk/arm64/sleh.c
osfmk/arm64/start.s
osfmk/arm64/status.c
osfmk/arm64/tlb.h [new file with mode: 0644]
osfmk/atm/Makefile
osfmk/atm/atm.c
osfmk/atm/atm_internal.h
osfmk/bank/Makefile
osfmk/bank/bank.c
osfmk/bank/bank_internal.h
osfmk/bank/bank_types.h
osfmk/conf/Makefile
osfmk/conf/Makefile.template
osfmk/conf/Makefile.x86_64
osfmk/conf/files
osfmk/conf/files.arm
osfmk/conf/files.arm64
osfmk/conf/files.x86_64
osfmk/console/serial_console.c
osfmk/console/serial_general.c
osfmk/console/serial_protos.h
osfmk/console/video_console.c
osfmk/corecrypto/cc/src/cc_abort.c [new file with mode: 0644]
osfmk/corecrypto/cc/src/cc_clear.c
osfmk/corecrypto/cc/src/cc_cmp_safe.c
osfmk/corecrypto/cc/src/cc_try_abort.c [deleted file]
osfmk/corecrypto/ccdbrg/src/ccdrbg_nisthmac.c
osfmk/corecrypto/ccdigest/src/ccdigest_init.c
osfmk/corecrypto/ccdigest/src/ccdigest_update.c
osfmk/corecrypto/cchmac/src/cchmac_final.c
osfmk/corecrypto/cchmac/src/cchmac_init.c
osfmk/corecrypto/ccn/src/ccn_set.c [deleted file]
osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c
osfmk/corecrypto/ccsha1/src/ccdigest_internal.h
osfmk/corecrypto/ccsha1/src/ccsha1_eay.c [deleted file]
osfmk/corecrypto/ccsha1/src/ccsha1_initial_state.c [deleted file]
osfmk/corecrypto/ccsha1/src/ccsha1_internal.h [deleted file]
osfmk/corecrypto/ccsha2/src/ccdigest_internal.h
osfmk/corecrypto/ccsha2/src/ccsha256_di.c
osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c
osfmk/corecrypto/ccsha2/src/ccsha2_internal.h
osfmk/corpses/corpse.c
osfmk/corpses/task_corpse.h
osfmk/default_pager/Makefile
osfmk/device/Makefile
osfmk/device/device_init.c
osfmk/device/device_port.h
osfmk/device/device_types.h
osfmk/device/iokit_rpc.c
osfmk/gssd/Makefile
osfmk/i386/AT386/model_dep.c
osfmk/i386/Makefile
osfmk/i386/acpi.c
osfmk/i386/asm.h
osfmk/i386/atomic.h
osfmk/i386/bsd_i386.c
osfmk/i386/commpage/commpage.c
osfmk/i386/commpage/commpage.h
osfmk/i386/cpu.c
osfmk/i386/cpu_capabilities.h
osfmk/i386/cpu_data.h
osfmk/i386/cpuid.c
osfmk/i386/cpuid.h
osfmk/i386/fpu.c
osfmk/i386/genassym.c
osfmk/i386/hibernate_i386.c
osfmk/i386/i386_init.c
osfmk/i386/i386_vm_init.c
osfmk/i386/lapic.h
osfmk/i386/locks.h
osfmk/i386/locks_i386.c
osfmk/i386/locks_i386_inlines.h
osfmk/i386/locks_i386_opt.c
osfmk/i386/machine_routines.c
osfmk/i386/machine_routines.h
osfmk/i386/memory_types.h [new file with mode: 0644]
osfmk/i386/misc_protos.h
osfmk/i386/mp.c
osfmk/i386/mp.h
osfmk/i386/mp_desc.c
osfmk/i386/pal_routines.h
osfmk/i386/pcb.c
osfmk/i386/pmCPU.c
osfmk/i386/pmap.h
osfmk/i386/pmap_common.c
osfmk/i386/pmap_internal.h
osfmk/i386/pmap_x86_common.c
osfmk/i386/rtclock.c
osfmk/i386/rtclock_native.c
osfmk/i386/simple_lock.h
osfmk/i386/thread.h
osfmk/i386/trap.c
osfmk/i386/trap.h
osfmk/i386/trap_native.c
osfmk/i386/user_ldt.c
osfmk/i386/xpr.h [deleted file]
osfmk/ipc/ipc_entry.h
osfmk/ipc/ipc_importance.c
osfmk/ipc/ipc_importance.h
osfmk/ipc/ipc_init.c
osfmk/ipc/ipc_kmsg.c
osfmk/ipc/ipc_kmsg.h
osfmk/ipc/ipc_mqueue.c
osfmk/ipc/ipc_mqueue.h
osfmk/ipc/ipc_notify.c
osfmk/ipc/ipc_object.c
osfmk/ipc/ipc_object.h
osfmk/ipc/ipc_port.c
osfmk/ipc/ipc_port.h
osfmk/ipc/ipc_pset.c
osfmk/ipc/ipc_pset.h
osfmk/ipc/ipc_right.c
osfmk/ipc/ipc_right.h
osfmk/ipc/ipc_space.h
osfmk/ipc/ipc_voucher.c
osfmk/ipc/mach_debug.c
osfmk/ipc/mach_kernelrpc.c
osfmk/ipc/mach_msg.c
osfmk/ipc/mach_port.c
osfmk/kdp/kdp_callout.h
osfmk/kdp/kdp_core.c
osfmk/kdp/kdp_core.h
osfmk/kdp/kdp_en_debugger.h
osfmk/kdp/kdp_udp.c
osfmk/kdp/ml/arm/kdp_machdep.c
osfmk/kdp/processor_core.c
osfmk/kdp/processor_core.h
osfmk/kern/Makefile
osfmk/kern/arcade.c [new file with mode: 0644]
osfmk/kern/arcade.h [new file with mode: 0644]
osfmk/kern/assert.h
osfmk/kern/ast.c
osfmk/kern/ast.h
osfmk/kern/audit_sessionport.c
osfmk/kern/backtrace.c
osfmk/kern/backtrace.h
osfmk/kern/bits.h
osfmk/kern/block_hint.h
osfmk/kern/bsd_kern.c
osfmk/kern/btlog.c
osfmk/kern/circle_queue.h [new file with mode: 0644]
osfmk/kern/clock.c
osfmk/kern/clock_oldops.c
osfmk/kern/coalition.c
osfmk/kern/coalition.h
osfmk/kern/cpu_quiesce.c
osfmk/kern/cpu_quiesce.h
osfmk/kern/debug.c
osfmk/kern/debug.h
osfmk/kern/exc_guard.h
osfmk/kern/exception.c
osfmk/kern/exception.h
osfmk/kern/hibernate.c
osfmk/kern/host.c
osfmk/kern/host.h
osfmk/kern/host_notify.c
osfmk/kern/hv_support.c
osfmk/kern/hv_support.h
osfmk/kern/ipc_clock.c
osfmk/kern/ipc_host.c
osfmk/kern/ipc_kobject.c
osfmk/kern/ipc_kobject.h
osfmk/kern/ipc_mig.c
osfmk/kern/ipc_mig.h
osfmk/kern/ipc_misc.c
osfmk/kern/ipc_sync.c
osfmk/kern/ipc_tt.c
osfmk/kern/ipc_tt.h
osfmk/kern/kalloc.c
osfmk/kern/kcdata.h
osfmk/kern/kern_stackshot.c
osfmk/kern/kern_types.h
osfmk/kern/kmod.c
osfmk/kern/ledger.c
osfmk/kern/ledger.h
osfmk/kern/lock_group.h
osfmk/kern/locks.c
osfmk/kern/locks.h
osfmk/kern/ltable.h
osfmk/kern/machine.c
osfmk/kern/machine.h
osfmk/kern/memset_s.c
osfmk/kern/misc_protos.h
osfmk/kern/mk_timer.c
osfmk/kern/monotonic.h
osfmk/kern/mpsc_queue.c [new file with mode: 0644]
osfmk/kern/mpsc_queue.h [new file with mode: 0644]
osfmk/kern/policy_internal.h
osfmk/kern/printf.c
osfmk/kern/priority.c
osfmk/kern/priority_queue.h
osfmk/kern/processor.c
osfmk/kern/processor.h
osfmk/kern/processor_data.c
osfmk/kern/processor_data.h
osfmk/kern/queue.h
osfmk/kern/remote_time.c
osfmk/kern/remote_time.h
osfmk/kern/restartable.c [new file with mode: 0644]
osfmk/kern/restartable.h [new file with mode: 0644]
osfmk/kern/sched.h
osfmk/kern/sched_average.c
osfmk/kern/sched_clutch.c [new file with mode: 0644]
osfmk/kern/sched_clutch.h [new file with mode: 0644]
osfmk/kern/sched_clutch.md [new file with mode: 0644]
osfmk/kern/sched_dualq.c
osfmk/kern/sched_grrr.c
osfmk/kern/sched_multiq.c
osfmk/kern/sched_prim.c
osfmk/kern/sched_prim.h
osfmk/kern/sched_proto.c
osfmk/kern/sched_traditional.c
osfmk/kern/simple_lock.h
osfmk/kern/stack.c
osfmk/kern/startup.c
osfmk/kern/sync_sema.c
osfmk/kern/syscall_subr.c
osfmk/kern/syscall_sw.c
osfmk/kern/task.c
osfmk/kern/task.h
osfmk/kern/task_policy.c
osfmk/kern/telemetry.c
osfmk/kern/test_lock.c
osfmk/kern/test_mpsc_queue.c [new file with mode: 0644]
osfmk/kern/thread.c
osfmk/kern/thread.h
osfmk/kern/thread_act.c
osfmk/kern/thread_call.c
osfmk/kern/thread_group.c
osfmk/kern/thread_policy.c
osfmk/kern/timer_queue.h
osfmk/kern/tlock.c
osfmk/kern/trustcache.h
osfmk/kern/turnstile.c
osfmk/kern/turnstile.h
osfmk/kern/ux_handler.c
osfmk/kern/waitq.c
osfmk/kern/waitq.h
osfmk/kern/work_interval.c
osfmk/kern/xpr.c [deleted file]
osfmk/kern/xpr.h [deleted file]
osfmk/kern/zalloc.c
osfmk/kern/zalloc.h
osfmk/kern/zcache.c
osfmk/kextd/Makefile
osfmk/kperf/action.c
osfmk/kperf/callstack.c
osfmk/kperf/callstack.h
osfmk/kperf/kperf.c
osfmk/kperf/kperf_kpc.c
osfmk/kperf/kperf_kpc.h
osfmk/kperf/kperf_timer.c
osfmk/kperf/sample.h
osfmk/kperf/thread_samplers.c
osfmk/kperf/thread_samplers.h
osfmk/libsa/string.h
osfmk/libsa/types.h
osfmk/lockd/Makefile
osfmk/mach/Makefile
osfmk/mach/arcade_register.defs [new file with mode: 0644]
osfmk/mach/arcade_upcall.defs [new file with mode: 0644]
osfmk/mach/arm/_structs.h
osfmk/mach/arm/exception.h
osfmk/mach/arm/sdt_isa.h
osfmk/mach/arm/thread_state.h
osfmk/mach/arm/thread_status.h
osfmk/mach/arm/vm_param.h
osfmk/mach/coalition.h
osfmk/mach/exception_types.h
osfmk/mach/fairplayd_notification.defs [new file with mode: 0644]
osfmk/mach/host_info.h
osfmk/mach/host_special_ports.h
osfmk/mach/i386/_structs.h
osfmk/mach/i386/thread_status.h
osfmk/mach/kmod.h
osfmk/mach/mach_param.h
osfmk/mach/mach_port.defs
osfmk/mach/mach_time.h
osfmk/mach/mach_traps.h
osfmk/mach/mach_types.defs
osfmk/mach/mach_types.h
osfmk/mach/mach_voucher_types.h
osfmk/mach/machine.h
osfmk/mach/machine/sdt.h
osfmk/mach/memory_entry.defs
osfmk/mach/memory_object_types.h
osfmk/mach/message.h
osfmk/mach/mig.h
osfmk/mach/port.h
osfmk/mach/restartable.defs [new file with mode: 0644]
osfmk/mach/shared_region.h
osfmk/mach/syscall_sw.h
osfmk/mach/task.defs
osfmk/mach/task_info.h
osfmk/mach/task_policy.h
osfmk/mach/task_special_ports.h
osfmk/mach/thread_policy.h
osfmk/mach/thread_status.h
osfmk/mach/vfs_nspace.defs [new file with mode: 0644]
osfmk/mach/vm_param.h
osfmk/mach/vm_region.h
osfmk/mach/vm_statistics.h
osfmk/mach/vm_types.h
osfmk/machine/Makefile
osfmk/machine/atomic.h
osfmk/machine/atomic_impl.h [new file with mode: 0644]
osfmk/machine/memory_types.h [new file with mode: 0644]
osfmk/machine/monotonic.h
osfmk/machine/xpr.h [deleted file]
osfmk/prng/prng_random.c
osfmk/prng/random.h
osfmk/profiling/Makefile [deleted file]
osfmk/profiling/i386/Makefile [deleted file]
osfmk/profiling/i386/profile-md.h [deleted file]
osfmk/profiling/machine/Makefile [deleted file]
osfmk/profiling/machine/profile-md.h [deleted file]
osfmk/profiling/profile-internal.h [deleted file]
osfmk/profiling/profile-kgmon.c [deleted file]
osfmk/profiling/profile-mk.c [deleted file]
osfmk/profiling/profile-mk.h [deleted file]
osfmk/tests/kernel_tests.c
osfmk/tests/pmap_tests.c
osfmk/tests/xnupost.h
osfmk/vm/Makefile
osfmk/vm/bsd_vm.c
osfmk/vm/device_vm.c
osfmk/vm/lz4.h
osfmk/vm/memory_object.c
osfmk/vm/memory_object.h
osfmk/vm/memory_types.h [new file with mode: 0644]
osfmk/vm/pmap.h
osfmk/vm/vm_apple_protect.c
osfmk/vm/vm_compressor.c
osfmk/vm/vm_compressor_backing_store.c
osfmk/vm/vm_compressor_pager.c
osfmk/vm/vm_fault.c
osfmk/vm/vm_fault.h
osfmk/vm/vm_fourk_pager.c
osfmk/vm/vm_init.c
osfmk/vm/vm_init.h
osfmk/vm/vm_kern.c
osfmk/vm/vm_kern.h
osfmk/vm/vm_map.c
osfmk/vm/vm_map.h
osfmk/vm/vm_map_store.c
osfmk/vm/vm_map_store_rb.c
osfmk/vm/vm_object.c
osfmk/vm/vm_object.h
osfmk/vm/vm_page.h
osfmk/vm/vm_pageout.c
osfmk/vm/vm_pageout.h
osfmk/vm/vm_protos.h
osfmk/vm/vm_purgeable.c
osfmk/vm/vm_purgeable_internal.h
osfmk/vm/vm_resident.c
osfmk/vm/vm_shared_region.c
osfmk/vm/vm_shared_region.h
osfmk/vm/vm_shared_region_pager.c
osfmk/vm/vm_swapfile_pager.c
osfmk/vm/vm_user.c
osfmk/voucher/Makefile
osfmk/voucher/ipc_pthread_priority.c
osfmk/x86_64/copyio.c
osfmk/x86_64/cswitch.s
osfmk/x86_64/idt64.s
osfmk/x86_64/kpc_x86.c
osfmk/x86_64/locore.s
osfmk/x86_64/loose_ends.c
osfmk/x86_64/machine_remote_time.c
osfmk/x86_64/monotonic_x86_64.c
osfmk/x86_64/pmap.c
pexpert/arm/pe_consistent_debug.c
pexpert/arm/pe_identify_machine.c
pexpert/arm/pe_init.c
pexpert/arm/pe_kprintf.c
pexpert/arm/pe_serial.c
pexpert/conf/Makefile
pexpert/conf/Makefile.template
pexpert/conf/files.arm
pexpert/conf/files.arm64
pexpert/conf/files.x86_64
pexpert/gen/bootargs.c
pexpert/i386/pe_identify_machine.c
pexpert/i386/pe_init.c
pexpert/i386/pe_kprintf.c
pexpert/pexpert/arm/Makefile
pexpert/pexpert/arm/S7002.h
pexpert/pexpert/arm/T8002.h
pexpert/pexpert/arm/consistent_debug.h
pexpert/pexpert/arm/dockchannel.h [new file with mode: 0644]
pexpert/pexpert/arm64/BCM2837.h
pexpert/pexpert/arm64/Makefile
pexpert/pexpert/arm64/S5L8960X.h [deleted file]
pexpert/pexpert/arm64/T8010.h
pexpert/pexpert/arm64/arm64_common.h
pexpert/pexpert/arm64/board_config.h
pexpert/pexpert/arm64/boot.h
pexpert/pexpert/arm64/cyclone.h [deleted file]
pexpert/pexpert/arm64/hurricane.h
pexpert/pexpert/arm64/spr_locks.h [new file with mode: 0644]
pexpert/pexpert/arm64/twister.h
pexpert/pexpert/arm64/typhoon.h
pexpert/pexpert/pexpert.h
san/Kasan_kasan.exports
san/Makefile
san/conf/Makefile
san/conf/Makefile.template
san/conf/files
san/conf/files.arm64
san/conf/files.x86_64
san/kasan-arm64.c
san/kasan-blacklist
san/kasan-blacklist-x86_64
san/kasan.c
san/kasan.h
san/kasan_dynamic_blacklist.c
san/kasan_internal.h
san/ksancov.c [new file with mode: 0644]
san/ksancov.h [new file with mode: 0644]
san/memintrinsics.h
san/tools/ksancov.c [new file with mode: 0644]
san/ubsan-blacklist
san/ubsan.c
san/ubsan.h
san/ubsan_log.c
security/Makefile
security/conf/Makefile
security/conf/Makefile.template
security/mac_audit.c
security/mac_base.c
security/mac_framework.h
security/mac_internal.h
security/mac_policy.h
security/mac_process.c
security/mac_system.c
security/mac_vfs.c
security/mac_vfs_subr.c
tests/Makefile
tests/avx.c
tests/backtracing.c
tests/coalition_info.c [new file with mode: 0644]
tests/cpucount.c
tests/data_protection.c
tests/debug_control_port_for_pid.c [new file with mode: 0644]
tests/debug_control_port_for_pid_entitlement.plist [new file with mode: 0644]
tests/disk_mount_conditioner.c
tests/exc_resource_threads.c
tests/extended_getdirentries64.c [new file with mode: 0644]
tests/fp_exception.c [new file with mode: 0644]
tests/freebsd_waitpid_nohang.c
tests/gettimeofday.c
tests/host_statistics_rate_limiting.c
tests/immovable_rights.c [new file with mode: 0644]
tests/immovable_send.c [new file with mode: 0644]
tests/immovable_send_client.c [new file with mode: 0644]
tests/in_cksum_test.c [new file with mode: 0644]
tests/jumbo_va_spaces_28530648.c
tests/kdebug.c
tests/kernel_mtx_perf.c
tests/kernel_uuid_match.c
tests/kevent_pty.c
tests/kevent_qos.c
tests/kpc.c
tests/kperf.c
tests/kperf_backtracing.c
tests/kqueue_add_and_trigger.c
tests/kqueue_close.c
tests/kqueue_fifo_18776047.c
tests/ktrace_helpers.h [new file with mode: 0644]
tests/ldt.c
tests/ldt_entitlement.plist [new file with mode: 0644]
tests/mach_boottime_usec.c
tests/mach_continuous_time.c
tests/mach_exception_reply.c [new file with mode: 0644]
tests/mach_get_times.c
tests/mach_port_deallocate_21692215.c
tests/mach_port_insert_right.c
tests/mach_port_mod_refs.c
tests/mach_timebase_info.c
tests/memorystatus_assertion_helpers.c [new file with mode: 0644]
tests/memorystatus_assertion_helpers.h [new file with mode: 0644]
tests/memorystatus_freeze_test.c
tests/memorystatus_is_assertion.c [new file with mode: 0644]
tests/memorystatus_zone_test.c
tests/mktimer_kobject.c
tests/mo_immovable_receive.c [new file with mode: 0644]
tests/mpsc.c [new file with mode: 0644]
tests/net_tun_pr_35136664.c
tests/net_tuntests.c
tests/no32exec_35914211.c
tests/no32exec_35914211_helper_binprefs.c [new file with mode: 0644]
tests/os_proc.c [new file with mode: 0644]
tests/os_refcnt.c [new file with mode: 0644]
tests/os_unaligned.c [new file with mode: 0644]
tests/osptr.cpp [new file with mode: 0644]
tests/osptr_dumb.cpp [new file with mode: 0644]
tests/osptr_helper.cpp [new file with mode: 0644]
tests/perf_compressor.c
tests/perf_kdebug.c [deleted file]
tests/perf_vmfault.c
tests/phys_footprint_interval_max.c
tests/pipe_drain.c [new file with mode: 0644]
tests/pipe_kevent.c [new file with mode: 0644]
tests/poll.c
tests/port_descriptions.c
tests/posix_spawn_file_actions.c [new file with mode: 0644]
tests/posix_spawn_file_actions_add_fileportdup2_np.c [new file with mode: 0644]
tests/posix_spawn_posix_cred.c [new file with mode: 0644]
tests/prioritize_process_launch.c [new file with mode: 0644]
tests/prioritize_process_launch_helper.c [new file with mode: 0644]
tests/prng.c [new file with mode: 0644]
tests/proc_info.c
tests/proc_info_44873309.c [new file with mode: 0644]
tests/proc_info_44873309.entitlements [new file with mode: 0644]
tests/proc_info_list_kthreads.c
tests/proc_info_udata.c
tests/proc_uuid_policy_26567533.c
tests/processor_info.c
tests/pwrite_avoid_sigxfsz_28581610.c
tests/quiesce_counter.c
tests/remote_time.c
tests/restart.c [new file with mode: 0644]
tests/settimeofday_29193041.c
tests/shared_cache_tests.c [new file with mode: 0644]
tests/sigchld_return.c
tests/sigcont_return.c
tests/socket_bind_35685803.c
tests/socket_poll_close_25786011.c
tests/stackshot.m [deleted file]
tests/stackshot_block_owner_14362384.m
tests/stackshot_spawn_exit_stress.c
tests/stackshot_tests.m [new file with mode: 0644]
tests/suspended_spawn_26184412.c
tests/sysctl_get_owned_vmobjects.c [new file with mode: 0644]
tests/task_info.c
tests/task_info_28439149.c
tests/task_inspect.c
tests/task_vm_info_decompressions.c [new file with mode: 0644]
tests/telemetry.c
tests/testposixshm.c
tests/thread_group_set_32261625.c
tests/time.c [new file with mode: 0644]
tests/turnstile_multihop.c
tests/turnstile_multihop_helper.h
tests/turnstiles_test.c
tests/utimensat.c
tests/verify_kalloc_config.c
tests/vm_phys_footprint.c
tests/vm_phys_footprint_legacy.c [deleted file]
tests/vm_set_max_addr_test.c
tests/voucher_entry_18826844.c
tests/voucher_traps.c
tests/work_interval_test.c
tests/workq_sigprof.c
tests/xnu_quick_test.c
tests/xnu_quick_test_entitled.c
tests/xnu_quick_test_getsetpriority.c
tools/cocci/OSAtomic_rewrite.cocci [new file with mode: 0644]
tools/cocci/c11_atomic_builtin_rewrite.cocci [new file with mode: 0644]
tools/cocci/hw_atomic_rewrite.cocci [new file with mode: 0644]
tools/cocci/mcache_atomic_rewrite.cocci [new file with mode: 0644]
tools/cocci/os_atomic_normalize.cocci [new file with mode: 0644]
tools/lldbmacros/Makefile
tools/lldbmacros/bank.py
tools/lldbmacros/core/cvalue.py
tools/lldbmacros/core/kernelcore.py
tools/lldbmacros/core/operating_system.py
tools/lldbmacros/core/standard.py
tools/lldbmacros/core/xnu_lldb_init.py
tools/lldbmacros/ioreg.py
tools/lldbmacros/ipc.py
tools/lldbmacros/kcdata.py
tools/lldbmacros/kevent.py
tools/lldbmacros/memory.py
tools/lldbmacros/misc.py
tools/lldbmacros/net.py
tools/lldbmacros/netdefines.py
tools/lldbmacros/pmap.py
tools/lldbmacros/process.py
tools/lldbmacros/scheduler.py
tools/lldbmacros/skywalk.py
tools/lldbmacros/structanalyze.py
tools/lldbmacros/sysreg.py [new file with mode: 0755]
tools/lldbmacros/sysregdoc/AArch64-esr_el1.xml [new file with mode: 0644]
tools/lldbmacros/turnstile.py
tools/lldbmacros/ulock.py [new file with mode: 0755]
tools/lldbmacros/userspace.py
tools/lldbmacros/usertaskdebugging/userprocess.py
tools/lldbmacros/utils.py
tools/lldbmacros/workqueue.py
tools/lldbmacros/xnu.py
tools/lldbmacros/xnudefines.py
tools/tests/perf_index/Makefile
tools/tests/perf_index/test_fault_helper.c
tools/tests/personas/Makefile
tools/tests/personas/persona-entitlements.plist [new file with mode: 0644]
tools/tests/personas/persona_test_run.sh [deleted file]
tools/tests/personas/persona_test_run_src.sh [new file with mode: 0755]
tools/tests/zero-to-n/zero-to-n.c
tools/trace/kqtrace.lua
tools/trace/wqtrace.lua

diff --git a/.gitignore b/.gitignore
index f5ad2c6fde93674fa5ddde9265c00f160796ecb4..70d6a40147c3f9d040ce16d2ff0cf3af8e9a057c 100644
@@ -25,6 +25,9 @@ compile_commands.json
 # /libkern/kmod/libkmod.xcodeproj/
 /libkern/kmod/libkmod.xcodeproj/xcuserdata
 
+# /libkdd/kdd.xcodeproj/
+/libkdd/kdd.xcodeproj/xcuserdata
+
 # /libsyscall/Libsyscall.xcodeproj/
 /libsyscall/Libsyscall.xcodeproj/xcuserdata
 /libsyscall/Libsyscall.xcodeproj/project.xcworkspace
@@ -44,14 +47,25 @@ compile_commands.json
 # /tools/tests/testkext/testkext.xcodeproj/
 /tools/tests/testkext/testkext.xcodeproj/xcuserdata
 
+#/tools/tests/unit_tests/cpu_monitor_tests_11646922_src/CatchRN/CatchRN.xcodeproj/
+/tools/tests/unit_tests/cpu_monitor_tests_11646922_src/CatchRN/CatchRN.xcodeproj/xcuserdata
+
 # /tools/tests/unit_tests/cpu_monitor_tests_11646922_src/cpu_hog/cpu_hog.xcodeproj/
 /tools/tests/unit_tests/cpu_monitor_tests_11646922_src/cpu_hog/cpu_hog.xcodeproj/xcuserdata
 
+# /tools/tests/unit_tests/mach_test_15789220_src/mach_test.xcodeproj/
+/tools/tests/unit_tests/mach_test_15789220_src/mach_test.xcodeproj/xcuserdata
+
 # /tools/tests/unit_tests/monitor_stress_12901965_src/monitor_stress.xcodeproj/
 /tools/tests/unit_tests/monitor_stress_12901965_src/monitor_stress.xcodeproj/xcuserdata
 
 # /tools/tests/unit_tests/monitor_stress_12901965_src/monitor_stress.xcodeproj/project.xcworkspace/
 /tools/tests/unit_tests/monitor_stress_12901965_src/monitor_stress.xcodeproj/project.xcworkspace/xcuserdata
 
+#/tools/tests/unit_tests/test_14395574/test_14395574.xcodeproj/
+/tools/tests/unit_tests/test_14395574/test_14395574.xcodeproj/xcuserdata
+
 # /tools/tests/zero-to-n
 /tools/tests/zero-to-n/zn*
+
+# do not add *.orig, *.rej, use `git clean` instead
diff --git a/EXTERNAL_HEADERS/Makefile b/EXTERNAL_HEADERS/Makefile
index a8db883a3cccec08dd11570851893816a8a27dd8..770e156ec0e7dbd7bcb4690868d60a7b911189e9 100644
@@ -8,7 +8,8 @@ include $(MakeInc_def)
 
 INSTINC_SUBDIRS =      \
        architecture    \
-       mach-o
+       mach-o                  \
+       sys
 
 INSTINC_SUBDIRS_X86_64 =       \
        architecture
@@ -32,9 +33,7 @@ KERNEL_FILES = \
        stdatomic.h     \
        stdbool.h       \
        stddef.h        \
-       stdint.h
-
-KERNEL_FILES += \
+       stdint.h        \
        ptrauth.h
 
 INSTALL_MI_LIST =
diff --git a/EXTERNAL_HEADERS/corecrypto/cc.h b/EXTERNAL_HEADERS/corecrypto/cc.h
index 5493e41c91987ed3da55d580bcf4ab80f5471878..4b2a6dec239f80557b4897cf6672a4e246982b37 100644
 #include <string.h>
 #include <stdint.h>
 
+#if __has_feature(attribute_availability_with_replacement)
+#if __has_feature(attribute_availability_bridgeos)
+  #ifndef __CC_BRIDGE_OS_DEPRECATED
+    #define __CC_BRIDGEOS_DEPRECATED(_dep, _msg) __attribute__((availability(bridgeos,deprecated=_dep, replacement=_msg)))
+  #endif
+#endif
+
+#ifndef __CC_BRIDGEOS_DEPRECATED
+  #define __CC_BRIDGEOS_DEPRECATED(_dep, _msg)
+#endif
+
+#define cc_deprecate_with_replacement(replacement_message, ios_version, macos_version, tvos_version, watchos_version, bridgeos_version) \
+__attribute__((availability(macos,deprecated=macos_version,       replacement=replacement_message)))\
+__attribute__((availability(ios,deprecated=ios_version,           replacement=replacement_message)))\
+__attribute__((availability(watchos,deprecated=watchos_version,   replacement=replacement_message)))\
+__attribute__((availability(tvos,deprecated=tvos_version,         replacement=replacement_message)))\
+__CC_BRIDGEOS_DEPRECATED(bridgeos_version, replacement_message)
+
+#else /* !__has_feature(attribute_availability_with_replacement) */
+
+#define cc_deprecate_with_replacement(replacement_message, ios_version, macos_version, tvos_version, watchos_version, bridgeos_version)
+
+#endif /* __has_feature(attribute_availability_with_replacement) */
+
 /* Provide a general purpose macro concat method. */
 #define cc_concat_(a, b) a##b
 #define cc_concat(a, b) cc_concat_(a, b)
 
 /* Manage asserts here because a few functions in header public files do use asserts */
+#if CORECRYPTO_DEBUG
 #define cc_assert(x) assert(x)
+#else
+#define cc_assert(x)
+#endif
+
 #if CC_KERNEL
 #include <kern/assert.h>
 #elif CC_USE_S3
@@ -32,7 +61,7 @@
 
 /* Provide a static assert that can be used to create compile-type failures. */
 #define cc_static_assert(e,m)                                               \
-    ;enum { cc_concat(static_assert_, __COUNTER__) = 1/(int)(!!(e)) }
+    enum { cc_concat(static_assert_, __COUNTER__) = 1/(int)(!!(e)) }
 
 /* Declare a struct element with a guarenteed alignment of _alignment_.
    The resulting struct can be used to create arrays that are aligned by
@@ -42,6 +71,15 @@ typedef struct { \
 uint8_t b[_alignment_]; \
 } CC_ALIGNED(_alignment_)
 
+#if defined(__BIGGEST_ALIGNMENT__)
+#define CC_MAX_ALIGNMENT __BIGGEST_ALIGNMENT__
+#else
+#define CC_MAX_ALIGNMENT 16
+#endif
+
+/* pads a given size to be a multiple of the biggest alignment for any type */
+#define cc_pad_align(_size_) ((_size_ + CC_MAX_ALIGNMENT - 1) & (~(CC_MAX_ALIGNMENT - 1)))
+
 /* number of array elements used in a cc_ctx_decl */
 #define cc_ctx_n(_type_, _size_) ((_size_ + sizeof(_type_) - 1) / sizeof(_type_))
 
@@ -55,14 +93,14 @@ uint8_t b[_alignment_]; \
   3. Never use sizeof() operator for the variables declared with cc_ctx_decl(), because it is not be compatible with the _MSC_VER version of cc_ctx_decl().
  */
 #if defined(_MSC_VER)
+#include <malloc.h>
 #define cc_ctx_decl(_type_, _size_, _name_)  _type_ * _name_ = (_type_ *) _alloca(sizeof(_type_) * cc_ctx_n(_type_, _size_) )
 #else
 #define cc_ctx_decl(_type_, _size_, _name_)  _type_ _name_ [cc_ctx_n(_type_, _size_)]
 #endif
 
-/* bzero is deprecated. memset is the way to go */
-/* FWIW, L4, HEXAGON and ARMCC even with gnu compatibility mode don't have bzero */
-#define cc_zero(_size_,_data_) memset((_data_),0 ,(_size_))
+// cc_zero is deprecated, please use cc_clear instead.
+#define cc_zero(_size_,_data_) _Pragma ("corecrypto deprecation warning \"'cc_zero' macro is deprecated. Use 'cc_clear' instead.\"") cc_clear(_size_,_data_)
 
 /*!
  @brief cc_clear(len, dst) zeroizes array dst and it will not be optimized out.
@@ -99,12 +137,16 @@ int cc_cmp_safe (size_t num, const void * ptr1, const void * ptr2);
 /* Exchange S and T of any type.  NOTE: Both and S and T are evaluated
    mutliple times and MUST NOT be expressions. */
 #define CC_SWAP(S,T)  do { \
-    __typeof__(S) _cc_swap_tmp = S; S = T; T = _cc_swap_tmp; \
+    volatile __typeof__(S) _cc_swap_tmp = S; S = T; T = _cc_swap_tmp; \
+    _cc_swap_tmp = 0;\
 } while(0)
 
 /* Return the maximum value between S and T. */
 #define CC_MAX(S, T) ({__typeof__(S) _cc_max_s = S; __typeof__(T) _cc_max_t = T; _cc_max_s > _cc_max_t ? _cc_max_s : _cc_max_t;})
 
+/* Clone of CC_MAX() that evalutes S and T multiple times to allow nesting. */
+#define CC_MAX_EVAL(S, T) ((S) > (T) ? (S) : (T))
+
 /* Return the minimum value between S and T. */
 #define CC_MIN(S, T) ({__typeof__(S) _cc_min_s = S; __typeof__(T) _cc_min_t = T; _cc_min_s <= _cc_min_t ? _cc_min_s : _cc_min_t;})
 
diff --git a/EXTERNAL_HEADERS/corecrypto/cc_config.h b/EXTERNAL_HEADERS/corecrypto/cc_config.h
index fbdb2c61c7e7afbe9e71731c55876e23265dc33e..5fb1832884add5c815938e38237f9b5533efcc5d 100644
  #define CCN_OSX                                  1
 #endif
 
-#if CC_USE_L4 || CC_USE_S3
-/* No dynamic linking allowed in L4, e.g. avoid nonlazy symbols */
+#if CC_USE_S3
 /* For corecrypto kext, CC_STATIC should be undefined */
  #define CC_STATIC              1
 #endif
 
 // see rdar://problem/26636018
 #if (CCN_UNIT_SIZE == 8) && !( defined(_MSC_VER) && defined(__clang__))
-#define CCEC25519_CURVE25519DONNA_64BIT 1
+#define CCEC25519_CURVE25519_64BIT 1
 #else
-#define CCEC25519_CURVE25519DONNA_64BIT 0
+#define CCEC25519_CURVE25519_64BIT 0
 #endif
 
 //- functions implemented in assembly ------------------------------------------
  #warning "You are using the default corecrypto configuration, assembly optimizations may not be available for your platform"
 #endif
 
+// Enable assembler in Linux if CC_LINUX_ASM is defined
+#if CC_LINUX && defined(CC_LINUX_ASM) && CC_LINUX_ASM
+#define CC_USE_ASM 1
+#endif
+
 // Use this macro to strictly disable assembly regardless of cpu/os/compiler/etc.
 // Our assembly code is not gcc compatible. Clang defines the __GNUC__ macro as well.
 #if !defined(CC_USE_ASM)
- #if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__) || CC_RTKIT || CC_RTKITROM
+ #if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__) || CC_LINUX
   #define CC_USE_ASM 0
  #else
   #define CC_USE_ASM 1
  #define CCN_ADDMUL1_ASM        1
  #define CCN_MUL1_ASM           1
  #define CCN_CMP_ASM            1
- #define CCN_ADD1_ASM           0
- #define CCN_SUB1_ASM           0
+ #define CCN_ADD1_ASM           1
+ #define CCN_SUB1_ASM           1
  #define CCN_N_ASM              1
  #define CCN_SET_ASM            1
  #define CCN_SHIFT_RIGHT_ASM    1
+ #if defined(__ARM_NEON__) 
+ #define CCN_SHIFT_LEFT_ASM     1
+ #else
+ #define CCN_SHIFT_LEFT_ASM     0
+ #endif
+ #define CCN_MOD_224_ASM        1
+ #define CCN_MULMOD_256_ASM     1
  #define CCAES_ARM_ASM          1
  #define CCAES_INTEL_ASM        0
  #if CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3
  #define CCSHA2_VNG_INTEL       0
 
  #if defined(__ARM_NEON__) || CC_KERNEL
-  #define CCSHA1_VNG_ARMV7NEON   1
-  #define CCSHA2_VNG_ARMV7NEON   1
+  #define CCSHA1_VNG_ARM        1
+  #define CCSHA2_VNG_ARM        1
  #else /* !defined(__ARM_NEON__) */
-  #define CCSHA1_VNG_ARMV7NEON   0
-  #define CCSHA2_VNG_ARMV7NEON   0
+  #define CCSHA1_VNG_ARM        0
+  #define CCSHA2_VNG_ARM        0
  #endif /* !defined(__ARM_NEON__) */
  #define CCSHA256_ARMV6M_ASM 0
 
+ #define CC_ACCELERATECRYPTO    1
+
 //-(2) ARM 64
 #elif defined(__arm64__) && __clang__ && CC_USE_ASM
  #define CCN_DEDICATED_SQR      CC_SMALL_CODE
  #define CCN_N_ASM              1
  #define CCN_SET_ASM            0
  #define CCN_SHIFT_RIGHT_ASM    1
+ #define CCN_SHIFT_LEFT_ASM     1
+ #define CCN_MOD_224_ASM        0
+ #define CCN_MULMOD_256_ASM     1
  #define CCAES_ARM_ASM          1
  #define CCAES_INTEL_ASM        0
  #define CCAES_MUX              0        // On 64bit SoC, asm is much faster than HW
  #define CCN_USE_BUILTIN_CLZ    1
  #define CCSHA1_VNG_INTEL       0
  #define CCSHA2_VNG_INTEL       0
- #define CCSHA1_VNG_ARMV7NEON   1              // reused this to avoid making change to xcode project, put arm64 assembly code with armv7 code
- #define CCSHA2_VNG_ARMV7NEON   1
+ #define CCSHA1_VNG_ARM         1
+ #define CCSHA2_VNG_ARM         1
  #define CCSHA256_ARMV6M_ASM    0
 
+ #define CC_ACCELERATECRYPTO    1
+
 //-(3) Intel 32/64
 #elif (defined(__x86_64__) || defined(__i386__)) && __clang__ && CC_USE_ASM
  #define CCN_DEDICATED_SQR      1
   #define CCN_CMP_ASM            1
   #define CCN_N_ASM              1
   #define CCN_SHIFT_RIGHT_ASM    1
+  #define CCN_SHIFT_LEFT_ASM     1
  #else
   #define CCN_CMP_ASM            0
   #define CCN_N_ASM              0
   #define CCN_SHIFT_RIGHT_ASM    0
+  #define CCN_SHIFT_LEFT_ASM     0
  #endif
 
+ #define CCN_MOD_224_ASM        0
+ #define CCN_MULMOD_256_ASM     0
  #define CCN_ADDMUL1_ASM        0
  #define CCN_MUL1_ASM           0
  #define CCN_ADD1_ASM           0
  #define CCN_USE_BUILTIN_CLZ    0
  #define CCSHA1_VNG_INTEL       1
  #define CCSHA2_VNG_INTEL       1
- #define CCSHA1_VNG_ARMV7NEON   0
- #define CCSHA2_VNG_ARMV7NEON   0
+ #define CCSHA1_VNG_ARM         0
+ #define CCSHA2_VNG_ARM         0
  #define CCSHA256_ARMV6M_ASM    0
 
+ #define CC_ACCELERATECRYPTO    1
+
 //-(4) disable assembly
 #else
  #if CCN_UINT128_SUPPORT_FOR_64BIT_ARCH
  #define CCN_N_ASM              0
  #define CCN_SET_ASM            0
  #define CCN_SHIFT_RIGHT_ASM    0
+ #define CCN_SHIFT_LEFT_ASM     0
+ #define CCN_MOD_224_ASM        0
+ #define CCN_MULMOD_256_ASM     0
  #define CCAES_ARM_ASM          0
  #define CCAES_INTEL_ASM        0
  #define CCAES_MUX              0
  #define CCN_USE_BUILTIN_CLZ    0
  #define CCSHA1_VNG_INTEL       0
  #define CCSHA2_VNG_INTEL       0
- #define CCSHA1_VNG_ARMV7NEON   0
- #define CCSHA2_VNG_ARMV7NEON   0
+ #define CCSHA1_VNG_ARM         0
+ #define CCSHA2_VNG_ARM         0
  #define CCSHA256_ARMV6M_ASM    0
 
+ #define CC_ACCELERATECRYPTO    0
+
 #endif
 
 #define CC_INLINE static inline
  #define CC_NONNULL4 CC_NONNULL((4))
  #define CC_NONNULL_ALL __attribute__((__nonnull__))
  #define CC_SENTINEL __attribute__((__sentinel__))
+ // Only apply the `CC_CONST` attribute to functions with no side effects whose output is a strict function of their pass-by-value arguments.
+ // Specifically, do not apply CC_CONST if the function has any arguments that are pointers (directly or indirectly).
  #define CC_CONST __attribute__((__const__))
  #define CC_PURE __attribute__((__pure__))
  #define CC_WARN_RESULT __attribute__((__warn_unused_result__))
- #define CC_MALLOC __attribute__((__malloc__))
+ #define CC_MALLOC_CLEAR __attribute__((__malloc__))
  #define CC_UNUSED __attribute__((unused))
 #else /* !__GNUC__ */
 /*! @parseOnly */
 /*! @parseOnly */
  #define CC_WARN_RESULT
 /*! @parseOnly */
- #define CC_MALLOC
+ #define CC_MALLOC_CLEAR
 #endif /* !__GNUC__ */
 
+
+// Bridge differences between MachO and ELF compilers/assemblers.
+#if CC_USE_ASM
+#if CC_LINUX
+#define CC_ASM_SECTION_CONST .rodata
+#define CC_ASM_PRIVATE_EXTERN .hidden
+#define CC_C_LABEL(_sym) _sym
+#else /* !CC_LINUX */
+#define CC_ASM_SECTION_CONST .const
+#define CC_ASM_PRIVATE_EXTERN .private_extern
+#define CC_C_LABEL(_sym) _##_sym
+#endif /* !CC_LINUX */
+#endif /* CC_USE_ASM */
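A hedged sketch of how an assembly source can use these to build under both object formats (the symbol name is hypothetical):

    #if CC_USE_ASM
            CC_ASM_SECTION_CONST                    // .const on MachO, .rodata on ELF
            .globl CC_C_LABEL(ccn_example)          // _ccn_example vs. ccn_example
            CC_ASM_PRIVATE_EXTERN CC_C_LABEL(ccn_example)
    #endif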
+
+
 // Enable FIPSPOST function tracing only when supported.
 #ifdef CORECRYPTO_POST_TRACE
 #define CC_FIPSPOST_TRACE 1
index 57b8ec70c4ef18d6b1a5e735ec83b3d0508f0bd4..b382cc5c1cc8ed4d5c7e2f872427d5a766f6f9df 100644 (file)
@@ -116,6 +116,30 @@ enum {
     CCPOST_INTEGRITY_ERROR = -74,
     // Output of the algo is not as expected
     CCPOST_KAT_FAILURE = -75,
+
+    CCKPRNG_SEEDFILE_OPEN = -76,
+    CCKPRNG_SEEDFILE_READ = -78,
+    CCKPRNG_SEEDFILE_WRITE = -79,
+    CCKPRNG_SEEDFILE_CHMOD = -80,
+    CCKPRNG_SEEDFILE_CHOWN = -81,
+    CCKPRNG_RANDOMDEV_OPEN = -82,
+    CCKPRNG_RANDOMDEV_WRITE = -83,
+    CCKPRNG_GETENTROPY = -84,
+
+    CCSAE_HUNTPECK_EXCEEDED_MAX_TRIALS = -85,
+
+    CCERR_CALL_SEQUENCE = -86,
+
+    CCVRF_POINT_DECODE_FAILURE = -87,
+    CCVRF_POINT_INVALID_PUBLIC_KEY = -88,
+    CCVRF_VERIFY_FAILURE = -89,
+
+    // Error codes for Authenticated Encryption Modes
+    CCMODE_TAG_LENGTH_REQUEST_TOO_LONG = -100,
+    CCMODE_TAG_LENGTH_TOO_SHORT = -101,
+    CCMODE_NONCE_EMPTY = -102,
+    CCMODE_AD_EMPTY = -103,
+    CCMODE_DECRYPTION_OR_VERIFICATION_ERR = -104,
 };
 
 #define CCDRBG_STATUS_OK CCERR_OK
index 0a51e66eec069b9221e7d5c9f0d9fa4296f2a056..6a201eade4b59191d874ffcae755cca6ec000b9c 100644 (file)
 #include <corecrypto/cc.h>
 #include <stdint.h>
 
+// Fork handlers for the stateful components of corecrypto.
+void cc_atfork_prepare(void);
+void cc_atfork_parent(void);
+void cc_atfork_child(void);
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#ifndef __DECONST
+#define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
+#endif
+
 /* defines the following macros :
 
- CC_MEMCPY  : optimized memcpy.
- CC_MEMMOVE : optimized memmove.
- CC_MEMSET  : optimized memset.
+ CC_ARRAY_LEN: returns the number of elements in an array
 
  CC_STORE32_BE : store 32 bit value in big endian in unaligned buffer.
  CC_STORE32_LE : store 32 bit value in little endian in unaligned buffer.
  CC_H2BE32 : convert a 32 bits value between host and big endian order.
  CC_H2LE32 : convert a 32 bits value between host and little endian order.
 
-The following are not defined yet... define them if needed.
-
- CC_BSWAPc   : byte swap a 32 bits constant
-
  CC_BSWAP64  : byte swap a 64 bits variable
- CC_BSWAP64c : byte swap a 64 bits constant
 
  CC_READ_LE32 : read a 32 bits little endian value
 
@@ -62,10 +68,32 @@ The following are not defined yet... define them if needed.
 
 */
 
-/* TODO: optimized versions */
-#define CC_MEMCPY(D,S,L) memcpy((D),(S),(L))
-#define CC_MEMMOVE(D,S,L) memmove((D),(S),(L))
-#define CC_MEMSET(D,V,L) memset((D),(V),(L))
+// <rdar://problem/40683103> RTKitOSPlatform should replace CC_MEMCPY with memcpy
+#define CC_MEMCPY(D,S,L) cc_memcpy((D),(S),(L))
+#define CC_MEMMOVE(D,S,L) cc_memmove((D),(S),(L))
+#define CC_MEMSET(D,V,L) cc_memset((D),(V),(L))
+
+#if __has_builtin(__builtin___memcpy_chk) && !CC_RTKIT
+#define cc_memcpy(dst, src, len) __builtin___memcpy_chk((dst), (src), (len), __builtin_object_size((dst), 1))
+#define cc_memcpy_nochk(dst, src, len) __builtin___memcpy_chk((dst), (src), (len), __builtin_object_size((dst), 0))
+#else
+#define cc_memcpy(dst, src, len) memcpy((dst), (src), (len))
+#define cc_memcpy_nochk(dst, src, len) memcpy((dst), (src), (len))
+#endif
+
+#if __has_builtin(__builtin___memmove_chk) && !CC_RTKIT
+#define cc_memmove(dst, src, len) __builtin___memmove_chk((dst), (src), (len), __builtin_object_size((dst), 1))
+#else
+#define cc_memmove(dst, src, len) memmove((dst), (src), (len))
+#endif
+
+#if __has_builtin(__builtin___memset_chk) && !CC_RTKIT
+#define cc_memset(dst, val, len) __builtin___memset_chk((dst), (val), (len), __builtin_object_size((dst), 1))
+#else
+#define cc_memset(dst, val, len) memset((dst), (val), (len))
+#endif
+
+#define CC_ARRAY_LEN(x) (sizeof((x))/sizeof((x)[0]))
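A small sketch of the fortified copy in action (the arrays are illustrative):

    uint32_t src[4] = {1, 2, 3, 4};
    uint32_t dst[CC_ARRAY_LEN(src)];
    /* When the _chk builtins exist, this copy is checked against the
     * compiler-known size of dst (object-size type 1); cc_memcpy_nochk()
     * relaxes the check to the enclosing allocation (type 0). */
    cc_memcpy(dst, src, sizeof(src));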
 
 // MARK: - Loads and Store
 
@@ -327,32 +355,46 @@ CC_INLINE uint64_t CC_ROR64(uint64_t word, int i)
 
 // MARK: - Byte Swaps
 
-CC_INLINE uint32_t CC_BSWAP(uint32_t x)
+#if __has_builtin(__builtin_bswap32)
+#define CC_BSWAP32(x) __builtin_bswap32(x)
+#else
+CC_INLINE uint32_t CC_BSWAP32(uint32_t x)
 {
-    return (
-        ((x>>24)&0x000000FF) |
-        ((x<<24)&0xFF000000) |
-        ((x>>8) &0x0000FF00) |
-        ((x<<8) &0x00FF0000)
-    );
+    return
+        ((x & 0xff000000) >> 24) |
+        ((x & 0x00ff0000) >>  8) |
+        ((x & 0x0000ff00) <<  8) |
+        ((x & 0x000000ff) << 24);
 }
+#endif
 
-#define CC_BSWAP64(x) \
-((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
-(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
-(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
-(((uint64_t)(x) & 0x000000ff00000000ULL) >>  8) | \
-(((uint64_t)(x) & 0x00000000ff000000ULL) <<  8) | \
-(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
-(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
-(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
+#if __has_builtin(__builtin_bswap64)
+#define CC_BSWAP64(x) __builtin_bswap64(x)
+#else
+CC_INLINE uint64_t CC_BSWAP64(uint64_t x)
+{
+    return
+        ((x & 0xff00000000000000ULL) >> 56) |
+        ((x & 0x00ff000000000000ULL) >> 40) |
+        ((x & 0x0000ff0000000000ULL) >> 24) |
+        ((x & 0x000000ff00000000ULL) >>  8) |
+        ((x & 0x00000000ff000000ULL) <<  8) |
+        ((x & 0x0000000000ff0000ULL) << 24) |
+        ((x & 0x000000000000ff00ULL) << 40) |
+        ((x & 0x00000000000000ffULL) << 56);
+}
+#endif
 
 #ifdef __LITTLE_ENDIAN__
-#define CC_H2BE32(x) CC_BSWAP(x)
+#define CC_H2BE32(x) CC_BSWAP32(x)
 #define CC_H2LE32(x) (x)
+#define CC_H2BE64(x) CC_BSWAP64(x)
+#define CC_H2LE64(x) (x)
 #else
 #define CC_H2BE32(x) (x)
-#define CC_H2LE32(x) CC_BSWAP(x)
+#define CC_H2LE32(x) CC_BSWAP32(x)
+#define CC_H2BE64(x) (x)
+#define CC_H2LE64(x) CC_BSWAP64(x)
 #endif
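A usage sketch: putting a 64-bit counter on the wire in big-endian order, independent of host byte order.

    uint64_t ctr = 0x0102030405060708ULL;
    uint64_t wire = CC_H2BE64(ctr);   /* identity on BE hosts, CC_BSWAP64 on LE */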
 
 #define        CC_READ_LE32(ptr) \
@@ -389,54 +431,156 @@ do { \
 #define cc_byte(x, n) (((x) >> (8 * (n))) & 255)
 #endif
 
+/* Count leading zeros (for nonzero inputs) */
+
+/*
+ *  On i386 and x86_64, we know clang and GCC will generate BSR for
+ *  __builtin_clzl.  This instruction IS NOT constant time on all micro-
+ *  architectures, but it *is* constant time on all micro-architectures that
+ *  have been used by Apple, and we expect that to continue to be the case.
+ *
+ *  When building for x86_64h with clang, this produces LZCNT, which is exactly
+ *  what we want.
+ *
+ *  On arm and arm64, we know that clang and GCC generate the constant-time CLZ
+ *  instruction from __builtin_clzl( ).
+ */
+
+#if defined(_WIN32)
+/* We use the Windows implementations below. */
+#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__)
+/* We use a thought-to-be-good version of __builtin_clz. */
+#elif defined __GNUC__
+#warning Using __builtin_clz() on an unknown architecture; it may not be constant-time.
+/* If you find yourself seeing this warning, file a radar for someone to
+ * check whether or not __builtin_clz() generates a constant-time
+ * implementation on the architecture you are targeting.  If it does, append
+ * the name of that architecture to the list of "safe" architectures above.  */
+#endif
+
+
+#if defined(_WIN32)
+
+#include <windows.h>
+#include <intrin.h>
+
+CC_INLINE CC_CONST unsigned clz64_win(uint64_t value)
+{
+    DWORD leading_zero;
+    _BitScanReverse64(&leading_zero, value);
+    return 63 - leading_zero;
+}
+
+
+CC_INLINE CC_CONST unsigned clz32_win(uint32_t value)
+{
+    DWORD leading_zero;
+    _BitScanReverse(&leading_zero, value);
+    return 31 - leading_zero;
+}
+
+#endif
+
+CC_INLINE CC_CONST unsigned cc_clz32_fallback(uint32_t data)
+{
+    unsigned int b = 0;
+    unsigned int bit = 0;
+    // Work from LSB to MSB
+    for (int i = 0; i < 32; i++) {
+        bit = (data >> i) & 1;
+        // If the bit is 0, update the "leading bits are zero" counter "b".
+        b += (1 - bit);
+        /* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
+         * If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
+         */
+        b &= (bit - 1);
+    }
+    return b;
+}
+
+CC_INLINE CC_CONST unsigned cc_clz64_fallback(uint64_t data)
+{
+    unsigned int b = 0;
+    unsigned int bit = 0;
+    // Work from LSB to MSB
+    for (int i = 0; i < 64; i++) {
+        bit = (data >> i) & 1;
+        // If the bit is 0, update the "leading bits are zero" counter.
+        b += (1 - bit);
+        /* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
+         * If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
+         */
+        b &= (bit - 1);
+    }
+    return b;
+}
+
+/*!
+  @function cc_clz32
+  @abstract Count leading zeros of a nonzero 32-bit value
+
+  @param data A nonzero 32-bit value
+
+  @result Count of leading zeros of @p data
+
+  @discussion @p data is assumed to be nonzero.
+*/
+CC_INLINE CC_CONST unsigned cc_clz32(uint32_t data) {
+#if defined(_WIN32)
+    return clz32_win(data);
+#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
+    cc_static_assert(sizeof(unsigned) == 4, "clz relies on an unsigned int being 4 bytes");
+    return (unsigned)__builtin_clz(data);
+#else
+    return cc_clz32_fallback(data);
+#endif
+}
+
+/*!
+  @function cc_clz64
+  @abstract Count leading zeros of a nonzero 64-bit value
+
+  @param data A nonzero 64-bit value
+
+  @result Count of leading zeros of @p data
+
+  @discussion @p data is assumed to be nonzero.
+*/
+CC_INLINE CC_CONST unsigned cc_clz64(uint64_t data) {
+#if defined(_WIN32)
+    return clz64_win(data);
+#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
+    return (unsigned)__builtin_clzll(data);
+#else
+    return cc_clz64_fallback(data);
+#endif
+}
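A quick sketch: deriving the bit length of a nonzero value (zero inputs are undefined, matching __builtin_clz()).

    uint32_t x = 0x00010000;            /* bit 16 set */
    unsigned bits = 32 - cc_clz32(x);   /* cc_clz32(x) == 15, bits == 17 */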
+
 /* HEAVISIDE_STEP (shifted by one)
-   function f(x): x->0, when x=0 
+   function f(x): x->0, when x=0
                   x->1, when x>0
-   Can also be seen as a bitwise operation: 
+   Can also be seen as a bitwise operation:
       f(x): x -> y
         y[0]=(OR x[i]) for all i (all bits)
         y[i]=0 for all i>0
-   Run in constant time (log2(<bitsize of x>))  
+   Runs in constant time (log2(<bitsize of x>))
    Useful for constant-time checks
 */
-#define HEAVISIDE_STEP_UINT64(r,s) {uint64_t _t=s; \
-    _t=(((_t)>>32) | (_t)); \
-    _t=(0xFFFFFFFF + (_t & 0xFFFFFFFF)); \
-    r=_t >> 32;}
-
-#define HEAVISIDE_STEP_UINT32(r,s) {uint32_t _t=s; \
-    _t=(((_t)>>16) | (_t)); \
-    _t=(0xFFFF + (_t & 0xFFFF)); \
-    r=_t >> 16;}
-
-#define HEAVISIDE_STEP_UINT16(r,s) {uint32_t _t=s; \
-    _t=(0xFFFF + ((_t) & 0xFFFF)); \
-    r=_t >> 16;}
-
-#define HEAVISIDE_STEP_UINT8(r,s) {uint16_t _t=s; \
-    _t=(0xFF + ((_t) & 0xFF)); \
-    r=_t >> 8;}
-
-#define CC_HEAVISIDE_STEP(r,s) { \
-    if (sizeof(s) == 1)      {HEAVISIDE_STEP_UINT8(r,s);}  \
-    else if (sizeof(s) == 2) {HEAVISIDE_STEP_UINT16(r,s);} \
-    else if (sizeof(s) == 4) {HEAVISIDE_STEP_UINT32(r,s);} \
-    else if (sizeof(s) == 8) {HEAVISIDE_STEP_UINT64(r,s);} \
-    else {r=(((s)==0)?0:1);} \
-    }
+#define CC_HEAVISIDE_STEP(r, s) {                       \
+    const uint64_t _s = (uint64_t)s;                    \
+    const uint64_t _t = (_s & 0xffffffff) | (_s >> 32); \
+    r = (__typeof__(r))((0xffffffff + _t) >> 32);       \
+}
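A sketch of the usual pattern: folding a comparison into a 0/1 flag without a data-dependent branch.

    uint64_t diff = 0;   /* e.g. accumulated XOR over two buffers */
    unsigned neq;
    CC_HEAVISIDE_STEP(neq, diff);   /* neq == 0 iff diff == 0, else neq == 1 */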
 
 /* Return 1 if x mod 4 =1,2,3, 0 otherwise */
 #define CC_CARRY_2BITS(x) (((x>>1) | x) & 0x1)
 #define CC_CARRY_3BITS(x) (((x>>2) | (x>>1) | x) & 0x1)
 
-/* Set a variable to the biggest power of 2 which can be represented */ 
+/* Set a variable to the biggest power of 2 which can be represented */
 #define MAX_POWER_OF_2(x)   ((__typeof__(x))1<<(8*sizeof(x)-1))
 #define cc_ceiling(a,b)  (((a)+((b)-1))/(b))
 #define CC_BITLEN_TO_BYTELEN(x) cc_ceiling((x), 8)
 
-//cc_try_abort() is implemented to comply with FIPS 140-2. See radar 19129408
-void cc_try_abort(const char * msg , ...);
-
 /*!
  @brief     cc_muxp(s, a, b) is equivalent to z = s ? a : b, but it executes in constant time
  @param a      input pointer
@@ -447,30 +591,56 @@ void cc_try_abort(const char * msg , ...);
 void *cc_muxp(int s, const void *a, const void *b);
 
 /*!
- @brief     cc_mux2p
- @param a      input pointer
- @param b      input pointer
- @param r_true output pointer: if s is integer 1 r_true=a  is returned, otherwise r_true=b
- @param r_false        output pointer: if s is integer 1 r_false=b is returned, otherwise r_false=a
- @param s      The selection parameter s must be 0 or 1.
- @discussion Executes in constant time
+ @brief     CC_MUXU(r, s, a, b) is equivalent to r = s ? a : b, but executes in constant time
+ @param a   Input a
+ @param b   Input b
+ @param s   Selection parameter s. Must be 0 or 1.
+ @param r   Output, set to a if s=1, or b if s=0.
  */
-void cc_mux2p(int s, void **r_true, void **r_false, const void *a, const void *b);
+#define CC_MUXU(r, s, a, b)                                      \
+{                                                                \
+    __typeof__(r) _cond = ((__typeof__(r))(s)-(__typeof__(r))1); \
+    r = (~_cond&(a))|(_cond&(b));                                \
+}
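A sketch of a branch-free select (s must be exactly 0 or 1, per the contract):

    uint32_t r, s = 1, a = 0xAAAAAAAAu, b = 0xBBBBBBBBu;
    CC_MUXU(r, s, a, b);   /* r == a when s == 1, r == b when s == 0 */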
+
+#define CC_PROVIDES_ABORT (!(CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKITROM))
 
 /*!
- @brief     CC_MUXU(s, a, b) is equivalent to z = s ? a : b, but it executes in constant time
- @param a      input unsigned type
- @param b      input unsigned type
- @param s      The selection parameter s must be 0 or 1. if s is integer 1 a is returned. If s is integer 0, b is returned. Otherwise, the output is undefined.
- @param r      output
- @return    r = a, if s is 1 and b if s is 0
+ @function cc_abort
+ @abstract Abort execution unconditionally
  */
-#define CC_MUXU(r, s, a, b)   \
-{                       \
-    __typeof__(r) _cond = ((__typeof__(r))(s)-(__typeof__(r))1); \
-    r = (~_cond&(a))|(_cond&(b)); \
+CC_NORETURN
+void cc_abort(const char *msg);
+
+/*!
+  @function cc_try_abort
+  @abstract Abort execution iff the platform provides a function like @p abort() or @p panic()
+
+  @discussion If the platform does not provide a means to abort execution, this function does nothing; therefore, callers should return an error code after calling this function.
+*/
+#if CC_PROVIDES_ABORT
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmissing-noreturn"
+
+CC_INLINE
+void cc_try_abort(const char *msg)
+{
+    cc_abort(msg);
 }
 
+#pragma clang diagnostic pop
+
+#else
+
+CC_INLINE
+void cc_try_abort(CC_UNUSED const char *msg)
+{
+
+}
+
+#endif
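The intended calling pattern, sketched (the helper is hypothetical; the error codes are from cc_error.h):

    static int require(int ok)
    {
        if (!ok) {
            cc_try_abort("invariant violated");
            /* reached only on platforms where abort is unavailable */
            return CCERR_INTERNAL;
        }
        return CCERR_OK;
    }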
+
 /*
   Unfortunately, since we export this symbol, this declaration needs
   to be in a public header to satisfy TAPI.
index 0d7ac528987de50a17136622c7594fbc4f9fd4f9..996accee162f2af7a3bb31f9edfa925d35cae3fb 100644 (file)
 #include <corecrypto/cc_config.h>
 
 /* Only Intel systems have these runtime switches today. */
-#if (CCSHA1_VNG_INTEL || CCSHA2_VNG_INTEL || CCAES_INTEL_ASM) \
-    && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#if CC_KERNEL
+    #include <i386/cpuid.h>
+    #define CC_HAS_RDRAND() ((cpuid_features() & CPUID_FEATURE_RDRAND) != 0)
+#elif CC_XNU_KERNEL_AVAILABLE
+    #include <System/i386/cpu_capabilities.h>
+
+    extern int _cpu_capabilities;
+    #define CC_HAS_RDRAND() (_cpu_capabilities & kHasRDRAND)
+#else
+    #define CC_HAS_RDRAND() 0
+#endif
+
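A minimal dispatch sketch using the feature test (the helper is hypothetical):

    static int entropy_has_rdrand(void)
    {
        /* kernel: cpuid_features(); userland XNU clients: _cpu_capabilities;
         * everywhere else the macro is hardwired to 0 */
        return CC_HAS_RDRAND() ? 1 : 0;
    }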
+#if (CCSHA1_VNG_INTEL || CCSHA2_VNG_INTEL || CCAES_INTEL_ASM)
 
 #if CC_KERNEL
     #include <i386/cpuid.h>
     #define CC_HAS_AVX512_AND_IN_KERNEL()    ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX512F) !=0)
 
 #elif CC_XNU_KERNEL_AVAILABLE
-    # include <System/i386/cpu_capabilities.h>
-
-    #ifndef kHasAVX2_0 /* 10.8 doesn't have kHasAVX2_0 defined */
-    #define kHasAVX2_0 0
-    #endif
+    #include <System/i386/cpu_capabilities.h>
 
     extern int _cpu_capabilities;
     #define CC_HAS_AESNI() (_cpu_capabilities & kHasAES)
@@ -46,6 +56,8 @@
     #define CC_HAS_AVX512_AND_IN_KERNEL()  0
 #endif
 
-#endif /* !(defined(__x86_64__) || defined(__i386__)) */
+#endif  // (CCSHA1_VNG_INTEL || CCSHA2_VNG_INTEL || CCAES_INTEL_ASM)
+
+#endif  // defined(__x86_64__) || defined(__i386__)
 
 #endif /* CORECRYPTO_CC_RUNTIME_CONFIG_H_ */
index 281c99d22aa277b22592c8dedd93ada57f75316c..9c664b842c22d6952151783f22fbe6cc1c909feb 100644 (file)
@@ -122,4 +122,7 @@ const struct ccmode_ofb *ccaes_ofb_crypt_mode(void);
 const struct ccmode_siv *ccaes_siv_encrypt_mode(void);
 const struct ccmode_siv *ccaes_siv_decrypt_mode(void);
 
+const struct ccmode_siv_hmac *ccaes_siv_hmac_sha256_encrypt_mode(void);
+const struct ccmode_siv_hmac *ccaes_siv_hmac_sha256_decrypt_mode(void);
+
 #endif /* _CORECRYPTO_CCAES_H_ */
index d2e01814357a0596b9b2fc5e5101078ded7ff9ec..e29e543dd16257d6f2a36d35dca595ab9045d268 100644 (file)
@@ -24,7 +24,7 @@ struct cccmac_ctx {
     size_t  block_nbytes; // Number of bytes occupied in the block
     size_t  cumulated_nbytes;  // Total size processed
     const struct ccmode_cbc *cbc;
-    uint8_t ctx[8];
+    uint8_t ctx[1];
 } CC_ALIGNED(8);// cccmac_ctx_hdr;
 
 typedef struct cccmac_ctx* cccmac_ctx_t;
index 52ee15123b6e0542514f79414b871fa559fd5414..fa2b765f937fa86476655ec730519c4810dceb21 100644 (file)
@@ -85,15 +85,6 @@ struct ccdigest_info {
 #define ccdigest_u64(_state_)            (&((ccdigest_state_t)(_state_))->state.u64)
 #define ccdigest_ccn(_state_)            (&((ccdigest_state_t)(_state_))->state.ccn)
 
-/* We could just use memcpy instead of this special macro, but this allows us
-   to use the optimized ccn_set() assembly routine if we have one, which for
-   32 bit arm is about 200% quicker than generic memcpy(). */
-#if CCN_SET_ASM && CCN_UNIT_SIZE <= 4
-#define ccdigest_copy_state(_di_, _dst_, _src_) ccn_set((_di_)->state_size / CCN_UNIT_SIZE, _dst_, _src_)
-#else
-#define ccdigest_copy_state(_di_, _dst_, _src_) CC_MEMCPY(_dst_, _src_, (_di_)->state_size)
-#endif
-
 void ccdigest_init(const struct ccdigest_info *di, ccdigest_ctx_t ctx);
 void ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx,
                      size_t len, const void *data);
@@ -117,9 +108,6 @@ void ccdigest(const struct ccdigest_info *di, size_t len,
 #define CC_DIGEST_OID_SHA256    OID_DEF("\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01")
 #define CC_DIGEST_OID_SHA384    OID_DEF("\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x02")
 #define CC_DIGEST_OID_SHA512    OID_DEF("\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x03")
-#define CC_DIGEST_OID_RMD128    OID_DEF("\x06\x06\x28\xCF\x06\x03\x00\x32")
 #define CC_DIGEST_OID_RMD160    OID_DEF("\x06\x05\x2B\x24\x03\x02\x01")
-#define CC_DIGEST_OID_RMD256    OID_DEF("\x06\x05\x2B\x24\x03\x02\x03")
-#define CC_DIGEST_OID_RMD320    OID_DEF(NULL)
 
 #endif /* _CORECRYPTO_CCDIGEST_H_ */
index 9d42de51948f50e9d4ac310ffe5e1663abbf5812..8061c5faf94930533e8a9cdebde61dc64363fdc7 100644 (file)
@@ -11,6 +11,7 @@
 #ifndef _CORECRYPTO_CCDIGEST_PRIV_H_
 #define _CORECRYPTO_CCDIGEST_PRIV_H_
 
+#include <corecrypto/cc_priv.h>
 #include <corecrypto/ccdigest.h>
 #include <corecrypto/ccasn1.h>
 
@@ -26,4 +27,6 @@ typedef const struct ccdigest_info *(ccdigest_lookup)(ccoid_t oid);
 #include <stdarg.h>
 const struct ccdigest_info *ccdigest_oid_lookup(ccoid_t oid, ...);
 
+#define ccdigest_copy_state(_di_, _dst_, _src_) cc_memcpy_nochk(_dst_, _src_, (_di_)->state_size)
+
 #endif /* _CORECRYPTO_CCDIGEST_PRIV_H_ */
index 7717d0c036982c719d00c18464654b5d055bdd2a..14db0a16bc7de4d53c84d1524429aa5ab619ddd1 100644 (file)
@@ -32,7 +32,7 @@
 #define CCDRBG_MAX_ADDITIONALINPUT_SIZE ((uint32_t)1<<16)
 #define CCDRBG_MAX_PSINPUT_SIZE         ((uint32_t)1<<16)
 #define CCDRBG_MAX_REQUEST_SIZE         ((uint32_t)1<<16) //this is the absolute maximum in NIST 800-90A
-#define CCDRBG_RESEED_INTERVAL          ((uint64_t)1<<30) // must be able to fit the NIST maximum of 2^48
+#define CCDRBG_RESEED_INTERVAL          ((uint64_t)1<<48) // must be able to fit the NIST maximum of 2^48
 
 
 /*
index 048c0de14af88fa9e5df23d05e3482a92eb74c71..3b6ac339b7a6ba8da0f1dc8803c42544af633b92 100644 (file)
 
 /* An hmac_ctx_t is normally allocated as an array of these. */
 struct cchmac_ctx {
-    uint8_t b[8];
+    uint8_t b[1];
 } CC_ALIGNED(8);
 
 typedef struct cchmac_ctx* cchmac_ctx_t;
 
-#define cchmac_ctx_size(STATE_SIZE, BLOCK_SIZE)  (ccdigest_ctx_size(STATE_SIZE, BLOCK_SIZE) + (STATE_SIZE))
+#define cchmac_ctx_size(STATE_SIZE, BLOCK_SIZE) (cc_pad_align(ccdigest_ctx_size(STATE_SIZE, BLOCK_SIZE)) + (STATE_SIZE))
 #define cchmac_di_size(_di_)  (cchmac_ctx_size((_di_)->state_size, (_di_)->block_size))
 
 #define cchmac_ctx_n(STATE_SIZE, BLOCK_SIZE)  ccn_nof_size(cchmac_ctx_size((STATE_SIZE), (BLOCK_SIZE)))
@@ -35,7 +35,7 @@ typedef struct cchmac_ctx* cchmac_ctx_t;
 #define cchmac_digest_ctx(_di_, HC)    ((ccdigest_ctx_t)(HC))
 
 /* Accessors for ostate fields; this is all cchmac_ctx_t adds to the ccdigest_ctx_t. */
-#define cchmac_ostate(_di_, HC)    ((struct ccdigest_state *)(((cchmac_ctx_t)(HC))->b + ccdigest_di_size(_di_)))
+#define cchmac_ostate(_di_, HC)    ((struct ccdigest_state *)(((cchmac_ctx_t)(HC))->b + cc_pad_align(ccdigest_di_size(_di_))))
 #define cchmac_ostate8(_di_, HC)   (ccdigest_u8(cchmac_ostate(_di_, HC)))
 #define cchmac_ostate32(_di_, HC)  (ccdigest_u32(cchmac_ostate(_di_, HC)))
 #define cchmac_ostate64(_di_, HC)  (ccdigest_u64(cchmac_ostate(_di_, HC)))
index 5e5bfcacd3e7d6f6b13917166ef188078b0bcaaf..edcff9a61f7c9f292d7c23fff174444445c8aae8 100644 (file)
 #ifndef _CORECRYPTO_CCKPRNG_H_
 #define _CORECRYPTO_CCKPRNG_H_
 
+#include <stdbool.h>
+
 #include <corecrypto/cc.h>
 
+#define CCKPRNG_YARROW 0
+
+#if CCKPRNG_YARROW
+
 typedef struct PRNG *PrngRef;
-typedef struct cckprng_ctx *cckprng_ctx_t;
 
 struct cckprng_ctx {
     PrngRef prng;
@@ -25,59 +30,326 @@ struct cckprng_ctx {
 #define CCKPRNG_ENTROPY_INTERVAL (1 << 14)
 #define CCKPRNG_RESEED_NTICKS 50
 
+typedef struct cckprng_ctx *cckprng_ctx_t;
+
+#else
+
+// This is a Fortuna-inspired PRNG. While it differs from Fortuna in
+// many minor details, the biggest difference is its support for
+// multiple independent output generators. This is to make it suitable
+// for use in concurrent environments.
+//
+// This PRNG targets a 256-bit security level.
+//
+// First, the user should call cckprng_init. The user must specify the
+// maximum number of output generators that might be
+// needed. (Typically, users should align this argument with the
+// number of available CPUs.)
+//
+// The user must also provide a read-only handle to an entropy
+// source. This is a fixed-size buffer that will receive entropy
+// updates out of band from the PRNG (e.g. in an interrupt
+// handler). The PRNG will consume entropy from this buffer according
+// to an internal schedule driven by calls to cckprng_refresh (see
+// below).
+//
+// The user should call cckprng_initgen for as many output generators
+// as are needed. The numeric argument is an identifier to be reused
+// during calls to cckprng_generate (see below) and must be less than
+// the maximum number of generators specified to cckprng_init.
+//
+// After initialization, the user is free to call cckprng_generate to
+// generate random bytes. The user must specify the generator in this
+// call using a numeric identifier passed in the call to
+// cckprng_initgen.
+//
+// Output generation is limited to 256 bytes per request. Users should
+// make multiple requests if more output is needed.
+//
+// The user is expected to call cckprng_refresh regularly. This
+// function consumes entropy and mixes it into the output generators
+// according to an internal schedule.
+//
+// This implementation is thread-safe. Internally, a set of mutexes
+// guard access to internal state. Most functions rely on a single
+// mutex to protect shared state. The main exception is the
+// cckprng_generate function, which uses a per-generator mutex to
+// allow concurrent output generation on different threads.
+//
+// Another important exception is cckprng_refresh. While this function
+// relies on the shared mutex, it returns immediately if it cannot
+// acquire it.
+//
+// The PRNG also supports user-initiated reseeds. This is to support a
+// user-writable random device.
+//
+// This PRNG supports reseeds concurrent with output generation,
+// i.e. it is safe to call cckprng_reseed or cckprng_refresh while
+// another thread is calling cckprng_generate.
+
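A hedged end-to-end sketch of the flow described above (buffer names and sizes are illustrative; the entropy buffer and its sample counter live outside the PRNG, e.g. in xnu):

    static struct cckprng_ctx prng;
    cckprng_init(&prng, ncpus,
                 sizeof(entropy_buf), entropy_buf, &entropy_nsamples,
                 sizeof(seed), seed, sizeof(nonce), nonce);
    cckprng_initgen(&prng, 0);              /* one generator per CPU */
    /* ... */
    cckprng_refresh(&prng);                 /* on a regular schedule */
    cckprng_generate(&prng, 0, 32, out);    /* at most 256 bytes per request */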
+#define CCKPRNG_NPOOLS 32
+#define CCKPRNG_SEED_NBYTES 32
+#define CCKPRNG_POOL_NBYTES 32
+#define CCKPRNG_KEY_NBYTES 32
+
+struct cckprng_gen_diag {
+    // The number of times this generator has been rekeyed from the master seed
+    uint64_t nrekeys;
+
+    // The number of requests this generator has fulfilled
+    uint64_t out_nreqs;
+
+    // The total number of bytes this generator has generated over all requests
+    uint64_t out_nbytes;
+
+    // The maximum number of bytes this generator has generated in any one request
+    uint64_t out_nbytes_req_max;
+
+    // The total number of bytes this generator has generated since the last rekey
+    uint64_t out_nbytes_key;
+
+    // The maximum total number of bytes this generator has generated between two rekeys
+    uint64_t out_nbytes_key_max;
+};
+
+struct cckprng_pool_diag {
+    // The number of samples currently resident in the pool
+    uint64_t nsamples;
+
+    // The number of times this pool has been drained in a reseed
+    uint64_t ndrains;
+
+    // The maximum number of samples this pool has held at any one time
+    uint64_t nsamples_max;
+};
+
+struct cckprng_diag {
+    // The number of reseeds via user input (e.g. by writing to /dev/random)
+    uint64_t userreseed_nreseeds;
+
+    // The number of reseeds via the scheduler
+    uint64_t schedreseed_nreseeds;
+
+    // The maximum number of samples included in any one scheduler reseed
+    uint64_t schedreseed_nsamples_max;
+
+    // The maximum number of samples included in any one entropy input
+    uint64_t addentropy_nsamples_max;
+
+    // Diagnostics corresponding to individual output generators
+    unsigned ngens;
+    struct cckprng_gen_diag *gens;
+
+    // Diagnostics corresponding to internal entropy pools
+    struct cckprng_pool_diag pools[CCKPRNG_NPOOLS];
+};
+
+#if CC_KERNEL
+
+#include <kern/locks.h>
+
+typedef lck_grp_t *cckprng_lock_group;
+typedef lck_mtx_t *cckprng_lock_mutex;
+
+struct cckprng_lock_ctx {
+    cckprng_lock_group group;
+    cckprng_lock_mutex mutex;
+};
+
+#else
+
+#include <os/lock.h>
+
+typedef os_unfair_lock cckprng_lock_mutex;
+
+struct cckprng_lock_ctx {
+    cckprng_lock_mutex mutex;
+};
+
+#endif
+
+struct cckprng_key_ctx {
+    uint8_t data[CCKPRNG_KEY_NBYTES];
+};
+
+struct cckprng_gen_ctx {
+    // We maintain two keys (one live and one idle) to allow
+    // concurrent generation and reseeding
+    struct cckprng_key_ctx keys[2];
+    _Atomic unsigned swap;
+    unsigned key_live_idx;
+    unsigned key_idle_idx;
+
+    // A counter used in CTR mode
+    uint8_t ctr[16];
+
+    // Whether the generator has been initialized
+    bool init;
+
+    // A mutex governing this generator's state (but note the idle key
+    // context is under control of the PRNG's shared mutex)
+    struct {
+        cckprng_lock_mutex mutex;
+    } lock;
+};
+
+struct cckprng_pool_ctx {
+    uint8_t data[CCKPRNG_POOL_NBYTES];
+};
+
+// This is a handle to an "entropy buffer" to be managed externally
+// (i.e. in xnu). This is a non-cryptographic
+// accumulator. Practically, the buffer is filled with timestamps
+// collected during interrupts. The existing state of the buffer is
+// rotated and new timestamps are added in. A counter of raw timing
+// samples is also managed externally. The buffer and the counter are
+// both subject to data races, which we tolerate.
+
+struct cckprng_entropybuf {
+
+    // A read-only handle to an "entropy buffer" (a non-cryptographic accumulator) to be managed externally
+    const void *buf;
+
+    // The size of the entropy buffer
+    size_t nbytes;
+
+    // A read-only handle to a count of raw samples in the buffer
+    const uint32_t *nsamples;
+
+    // The count of raw samples in the buffer at time of last read
+    uint32_t nsamples_last;
+};
+
+struct cckprng_sched_ctx {
+    // A counter governing the set of entropy pools to drain
+    uint64_t reseed_sched;
+
+    // A timestamp from the last reseed
+    uint64_t reseed_last;
+
+    // An index used to add entropy to pools in a round-robin style
+    unsigned pool_idx;
+};
+
+struct cckprng_ctx {
+
+    // The master secret of the PRNG
+    uint8_t seed[CCKPRNG_SEED_NBYTES];
+
+    // State used to schedule entropy consumption and reseeds
+    struct cckprng_sched_ctx sched;
+
+    // A mutex governing access to shared state
+    struct cckprng_lock_ctx lock;
+
+    // The maximum number of generators that may be allocated
+    unsigned max_ngens;
+
+    // An array of output generators (allocated dynamically) of length max_ngens
+    struct cckprng_gen_ctx *gens;
+
+    // A set of entropy pools
+    struct cckprng_pool_ctx pools[CCKPRNG_NPOOLS];
+
+    // A handle to an entropy source managed externally
+    struct cckprng_entropybuf entropybuf;
+
+    // Diagnostics for the PRNG
+    struct cckprng_diag diag;
+};
+
+// This collection of function pointers is just a convenience for
+// registering the PRNG with xnu
+struct cckprng_funcs {
+    void (*init)(struct cckprng_ctx *ctx,
+                 unsigned max_ngens,
+                 size_t entropybuf_nbytes,
+                 const void *entropybuf,
+                 const uint32_t *entropybuf_nsamples,
+                 size_t seed_nbytes,
+                 const void *seed,
+                 size_t nonce_nbytes,
+                 const void *nonce);
+    void (*initgen)(struct cckprng_ctx *ctx, unsigned gen_idx);
+    void (*reseed)(struct cckprng_ctx *ctx, size_t nbytes, const void *seed);
+    void (*refresh)(struct cckprng_ctx *ctx);
+    void (*generate)(struct cckprng_ctx *ctx, unsigned gen_idx, size_t nbytes, void *out);
+};
+
+#endif
+
 /*
   @function cckprng_init
   @abstract Initialize a kernel PRNG context.
 
   @param ctx Context for this instance
-  @param nbytes Length of the seed in bytes
+  @param max_ngens Maximum count of generators that may be allocated
+  @param entropybuf_nbytes Length of the entropy buffer in bytes
+  @param entropybuf Read-only pointer to a long-lived entropy buffer
+  @param entropybuf_nsamples Read-only pointer to a counter of samples in the entropy buffer
+  @param seed_nbytes Length of the seed in bytes
   @param seed Pointer to a high-entropy seed
+  @param nonce_nbytes Length of the nonce in bytes
+  @param nonce Pointer to a single-use nonce
+
+  @discussion @p max_ngens should be set based on an upper bound of CPUs available on the device. The entropy buffer should be managed outside the PRNG and updated continuously (e.g. by an interrupt handler). The count of samples in the entropy buffer needn't be better than a rough estimate.
+*/
+void cckprng_init(struct cckprng_ctx *ctx,
+                  unsigned max_ngens,
+                  size_t entropybuf_nbytes,
+                  const void *entropybuf,
+                  const uint32_t *entropybuf_nsamples,
+                  size_t seed_nbytes,
+                  const void *seed,
+                  size_t nonce_nbytes,
+                  const void *nonce);
+
+/*
+  @function cckprng_initgen
+  @abstract Initialize an output generator.
+
+  @param ctx Context for this instance
+  @param gen_idx Index of the generator
 
-  @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT.
+  @discussion @p gen_idx must be less than @p max_ngens provided to @p cckprng_init and must be unique within the lifetime of a PRNG context. This function will abort if these contracts are violated.
 */
-int cckprng_init(cckprng_ctx_t ctx, size_t nbytes, const void *seed);
+void cckprng_initgen(struct cckprng_ctx *ctx, unsigned gen_idx);
 
 /*
   @function cckprng_reseed
-  @abstract Reseed a kernel PRNG context immediately.
+  @abstract Reseed a kernel PRNG context with a user-supplied seed.
 
   @param ctx Context for this instance
   @param nbytes Length of the seed in bytes
   @param seed Pointer to a high-entropy seed
 
-  @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT.
+  @discussion It is safe to expose this function to attacker-controlled requests (e.g. writes to /dev/random).
 */
-int cckprng_reseed(cckprng_ctx_t ctx, size_t nbytes, const void *seed);
+void cckprng_reseed(struct cckprng_ctx *ctx, size_t nbytes, const void *seed);
 
 /*
-  @function cckprng_addentropy
-  @abstract Add entropy to a kernel PRNG context.
+  @function cckprng_refresh
+  @abstract Consume entropy and reseed according to an internal schedule.
 
   @param ctx Context for this instance
-  @param nbytes Length of the input entropy in bytes
-  @param seed Pointer to input entropy
 
-  @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT.
-
-  @discussion Input entropy is stored internally and consumed at the
-  opportune moment. This will not necessarily be before the next call
-  to @p cckprng_generate. To force an immediate reseed, call @p
-  cckprng_reseed.
+  @discussion This function should be called on a regular basis. (For example, it is reasonable to call this inline before a call to @p cckprng_generate.) This function will not necessarily consume entropy or reseed the internal state on any given invocation. To force an immediate reseed, call @p cckprng_reseed.
 */
-int cckprng_addentropy(cckprng_ctx_t ctx, size_t nbytes, const void *entropy);
+void cckprng_refresh(struct cckprng_ctx *ctx);
+
+#define CCKPRNG_GENERATE_MAX_NBYTES 256
 
 /*
   @function cckprng_generate
   @abstract Generate random values for use in applications.
 
   @param ctx Context for this instance
+  @param gen_idx Index of the output generator
   @param nbytes Length of the desired output in bytes
-  @param seed Pointer to the output buffer
+  @param out Pointer to the output buffer
 
-  @result @p CCKPRNG_OK iff successful. Panic on @p
-  CCKPRNG_ABORT. Provide input to @p cckprng_addentropy on @p
-  CCKPRNG_NEED_ENTROPY.
+  @discussion @p gen_idx must be a previous argument to @p cckprng_initgen. @p nbytes must be less than or equal to @p CCKPRNG_GENERATE_MAX_NBYTES. (Callers may invoke this function in a loop to generate larger outputs.) This function will abort if these contracts are violated.
 */
-int cckprng_generate(cckprng_ctx_t ctx, size_t nbytes, void *out);
+void cckprng_generate(struct cckprng_ctx *ctx, unsigned gen_idx, size_t nbytes, void *out);
 
 #endif /* _CORECRYPTO_CCKPRNG_H_ */
index 191460b9b98973acb74776ded48c48b6a6455061..f4aa20a9911c4c9b4f7be4276f9da29cfe2e5364 100644 (file)
@@ -14,6 +14,7 @@
 #include <corecrypto/cc.h>
 #include <corecrypto/ccmode_impl.h>
 #include <corecrypto/ccmode_siv.h>
+#include <corecrypto/ccmode_siv_hmac.h>
 
 /* ECB mode. */
 
@@ -29,36 +30,35 @@ CC_INLINE size_t ccecb_context_size(const struct ccmode_ecb *mode)
 
 CC_INLINE size_t ccecb_block_size(const struct ccmode_ecb *mode)
 {
-       return mode->block_size;
+    return mode->block_size;
 }
 
-CC_INLINE int ccecb_init(const struct ccmode_ecb *mode, ccecb_ctx *ctx,
-                         size_t key_len, const void *key)
+CC_INLINE int ccecb_init(const struct ccmode_ecb *mode, ccecb_ctx *ctx, size_t key_len, const void *key)
 {
     return mode->init(mode, ctx, key_len, key);
 }
 
-CC_INLINE int ccecb_update(const struct ccmode_ecb *mode, const ccecb_ctx *ctx,
-                           size_t nblocks, const void *in, void *out)
+CC_INLINE int ccecb_update(const struct ccmode_ecb *mode, const ccecb_ctx *ctx, size_t nblocks, const void *in, void *out)
 {
-       return mode->ecb(ctx, nblocks, in, out);
+    return mode->ecb(ctx, nblocks, in, out);
 }
 
-CC_INLINE int ccecb_one_shot(const struct ccmode_ecb *mode,
-                             size_t key_len, const void *key,
-                             size_t nblocks, const void *in, void *out)
+CC_INLINE int
+ccecb_one_shot(const struct ccmode_ecb *mode, size_t key_len, const void *key, size_t nblocks, const void *in, void *out)
 {
     int rc;
-       ccecb_ctx_decl(mode->size, ctx);
-       rc = mode->init(mode, ctx, key_len, key);
-       mode->ecb(ctx, nblocks, in, out);
-       ccecb_ctx_clear(mode->size, ctx);
+    ccecb_ctx_decl(mode->size, ctx);
+    rc = mode->init(mode, ctx, key_len, key);
+    if (rc == 0) {
+        rc = mode->ecb(ctx, nblocks, in, out);
+    }
+    ccecb_ctx_clear(mode->size, ctx);
     return rc;
 }
 
 /* CBC mode. */
 
-/* The CBC interface changed due to rdar://11468135. This macros is to indicate 
+/* The CBC interface changed due to rdar://11468135. This macro indicates
    to clients which CBC API is implemented. Clients can support old versions of
    corecrypto at build time using this.
  */
@@ -89,36 +89,36 @@ CC_INLINE size_t cccbc_context_size(const struct ccmode_cbc *mode)
 
 CC_INLINE size_t cccbc_block_size(const struct ccmode_cbc *mode)
 {
-       return mode->block_size;
+    return mode->block_size;
 }
 
-CC_INLINE int cccbc_init(const struct ccmode_cbc *mode, cccbc_ctx *ctx,
-                         size_t key_len, const void *key)
+CC_INLINE int cccbc_init(const struct ccmode_cbc *mode, cccbc_ctx *ctx, size_t key_len, const void *key)
 {
     return mode->init(mode, ctx, key_len, key);
 }
 
-CC_INLINE int cccbc_set_iv(const struct ccmode_cbc *mode, cccbc_iv *iv_ctx,
-                           const void *iv)
+CC_INLINE int cccbc_set_iv(const struct ccmode_cbc *mode, cccbc_iv *iv_ctx, const void *iv)
 {
-    if (iv)
+    if (iv) {
         cc_copy(mode->block_size, iv_ctx, iv);
-    else
-        cc_zero(mode->block_size, iv_ctx);
+    } else {
+        cc_clear(mode->block_size, iv_ctx);
+    }
     return 0;
 }
 
-CC_INLINE int cccbc_update(const struct ccmode_cbc *mode,  cccbc_ctx *ctx,
-                           cccbc_iv *iv, size_t nblocks,
-                           const void *in, void *out)
+CC_INLINE int cccbc_update(const struct ccmode_cbc *mode, cccbc_ctx *ctx, cccbc_iv *iv, size_t nblocks, const void *in, void *out)
 {
-       return mode->cbc(ctx, iv, nblocks, in, out);
+    return mode->cbc(ctx, iv, nblocks, in, out);
 }
 
 int cccbc_one_shot(const struct ccmode_cbc *mode,
-                   size_t key_len, const void *key,
-                   const void *iv, size_t nblocks,
-                   const void *in, void *out);
+                   size_t key_len,
+                   const void *key,
+                   const void *iv,
+                   size_t nblocks,
+                   const void *in,
+                   void *out);
 
 /* CFB mode. */
 
@@ -134,31 +134,34 @@ CC_INLINE size_t cccfb_context_size(const struct ccmode_cfb *mode)
 
 CC_INLINE size_t cccfb_block_size(const struct ccmode_cfb *mode)
 {
-       return mode->block_size;
+    return mode->block_size;
 }
 
-CC_INLINE int cccfb_init(const struct ccmode_cfb *mode, cccfb_ctx *ctx,
-                         size_t key_len, const void *key,
-                         const void *iv)
+CC_INLINE int cccfb_init(const struct ccmode_cfb *mode, cccfb_ctx *ctx, size_t key_len, const void *key, const void *iv)
 {
     return mode->init(mode, ctx, key_len, key, iv);
 }
 
-CC_INLINE int cccfb_update(const struct ccmode_cfb *mode, cccfb_ctx *ctx,
-                           size_t nbytes, const void *in, void *out)
+CC_INLINE int cccfb_update(const struct ccmode_cfb *mode, cccfb_ctx *ctx, size_t nbytes, const void *in, void *out)
 {
-       return mode->cfb(ctx, nbytes, in, out);
+    return mode->cfb(ctx, nbytes, in, out);
 }
 
 CC_INLINE int cccfb_one_shot(const struct ccmode_cfb *mode,
-                             size_t key_len, const void *key, const void *iv,
-                             size_t nbytes, const void *in, void *out)
+                             size_t key_len,
+                             const void *key,
+                             const void *iv,
+                             size_t nbytes,
+                             const void *in,
+                             void *out)
 {
     int rc;
-       cccfb_ctx_decl(mode->size, ctx);
-       rc = mode->init(mode, ctx, key_len, key, iv);
-       mode->cfb(ctx, nbytes, in, out);
-       cccfb_ctx_clear(mode->size, ctx);
+    cccfb_ctx_decl(mode->size, ctx);
+    rc = mode->init(mode, ctx, key_len, key, iv);
+    if (rc == 0) {
+        rc = mode->cfb(ctx, nbytes, in, out);
+    }
+    cccfb_ctx_clear(mode->size, ctx);
     return rc;
 }
 
@@ -176,30 +179,34 @@ CC_INLINE size_t cccfb8_context_size(const struct ccmode_cfb8 *mode)
 
 CC_INLINE size_t cccfb8_block_size(const struct ccmode_cfb8 *mode)
 {
-       return mode->block_size;
+    return mode->block_size;
 }
 
-CC_INLINE int cccfb8_init(const struct ccmode_cfb8 *mode, cccfb8_ctx *ctx,
-                          size_t key_len, const void *key, const void *iv)
+CC_INLINE int cccfb8_init(const struct ccmode_cfb8 *mode, cccfb8_ctx *ctx, size_t key_len, const void *key, const void *iv)
 {
     return mode->init(mode, ctx, key_len, key, iv);
 }
 
-CC_INLINE int cccfb8_update(const struct ccmode_cfb8 *mode,  cccfb8_ctx *ctx,
-                            size_t nbytes, const void *in, void *out)
+CC_INLINE int cccfb8_update(const struct ccmode_cfb8 *mode, cccfb8_ctx *ctx, size_t nbytes, const void *in, void *out)
 {
-       return mode->cfb8(ctx, nbytes, in, out);
+    return mode->cfb8(ctx, nbytes, in, out);
 }
 
 CC_INLINE int cccfb8_one_shot(const struct ccmode_cfb8 *mode,
-                              size_t key_len, const void *key, const void *iv,
-                              size_t nbytes, const void *in, void *out)
+                              size_t key_len,
+                              const void *key,
+                              const void *iv,
+                              size_t nbytes,
+                              const void *in,
+                              void *out)
 {
     int rc;
-       cccfb8_ctx_decl(mode->size, ctx);
-       rc = mode->init(mode, ctx, key_len, key, iv);
-       mode->cfb8(ctx, nbytes, in, out);
-       cccfb8_ctx_clear(mode->size, ctx);
+    cccfb8_ctx_decl(mode->size, ctx);
+    rc = mode->init(mode, ctx, key_len, key, iv);
+    if (rc == 0) {
+        rc = mode->cfb8(ctx, nbytes, in, out);
+    }
+    cccfb8_ctx_clear(mode->size, ctx);
     return rc;
 }
 
@@ -221,35 +228,37 @@ CC_INLINE size_t ccctr_context_size(const struct ccmode_ctr *mode)
 
 CC_INLINE size_t ccctr_block_size(const struct ccmode_ctr *mode)
 {
-       return mode->block_size;
+    return mode->block_size;
 }
 
-CC_INLINE int ccctr_init(const struct ccmode_ctr *mode, ccctr_ctx *ctx,
-                         size_t key_len, const void *key, const void *iv)
+CC_INLINE int ccctr_init(const struct ccmode_ctr *mode, ccctr_ctx *ctx, size_t key_len, const void *key, const void *iv)
 {
     return mode->init(mode, ctx, key_len, key, iv);
 }
 
-CC_INLINE int ccctr_update(const struct ccmode_ctr *mode, ccctr_ctx *ctx,
-                           size_t nbytes, const void *in, void *out)
+CC_INLINE int ccctr_update(const struct ccmode_ctr *mode, ccctr_ctx *ctx, size_t nbytes, const void *in, void *out)
 {
-       return mode->ctr(ctx, nbytes, in, out);
+    return mode->ctr(ctx, nbytes, in, out);
 }
 
 CC_INLINE int ccctr_one_shot(const struct ccmode_ctr *mode,
-                             size_t key_len, const void *key, const void *iv,
-                             size_t nbytes, const void *in, void *out)
+                             size_t key_len,
+                             const void *key,
+                             const void *iv,
+                             size_t nbytes,
+                             const void *in,
+                             void *out)
 {
     int rc;
-       ccctr_ctx_decl(mode->size, ctx);
-       rc = mode->init(mode, ctx, key_len, key, iv);
-    if (rc) return rc;
-       rc = mode->ctr(ctx, nbytes, in, out);
-       ccctr_ctx_clear(mode->size, ctx);
+    ccctr_ctx_decl(mode->size, ctx);
+    rc = mode->init(mode, ctx, key_len, key, iv);
+    if (rc == 0) {
+        rc = mode->ctr(ctx, nbytes, in, out);
+    }
+    ccctr_ctx_clear(mode->size, ctx);
     return rc;
 }
 
-
 /* OFB mode. */
 
 /* Declare a ofb key named _name_.  Pass the size field of a struct ccmode_ofb
@@ -264,30 +273,34 @@ CC_INLINE size_t ccofb_context_size(const struct ccmode_ofb *mode)
 
 CC_INLINE size_t ccofb_block_size(const struct ccmode_ofb *mode)
 {
-       return mode->block_size;
+    return mode->block_size;
 }
 
-CC_INLINE int ccofb_init(const struct ccmode_ofb *mode, ccofb_ctx *ctx,
-                         size_t key_len, const void *key, const void *iv)
+CC_INLINE int ccofb_init(const struct ccmode_ofb *mode, ccofb_ctx *ctx, size_t key_len, const void *key, const void *iv)
 {
     return mode->init(mode, ctx, key_len, key, iv);
 }
 
-CC_INLINE int ccofb_update(const struct ccmode_ofb *mode, ccofb_ctx *ctx,
-                           size_t nbytes, const void *in, void *out)
+CC_INLINE int ccofb_update(const struct ccmode_ofb *mode, ccofb_ctx *ctx, size_t nbytes, const void *in, void *out)
 {
-       return mode->ofb(ctx, nbytes, in, out);
+    return mode->ofb(ctx, nbytes, in, out);
 }
 
 CC_INLINE int ccofb_one_shot(const struct ccmode_ofb *mode,
-                             size_t key_len, const void *key, const void *iv,
-                             size_t nbytes, const void *in, void *out)
+                             size_t key_len,
+                             const void *key,
+                             const void *iv,
+                             size_t nbytes,
+                             const void *in,
+                             void *out)
 {
     int rc;
-       ccofb_ctx_decl(mode->size, ctx);
+    ccofb_ctx_decl(mode->size, ctx);
     rc = mode->init(mode, ctx, key_len, key, iv);
-       mode->ofb(ctx, nbytes, in, out);
-       ccofb_ctx_clear(mode->size, ctx);
+    if (rc == 0) {
+        rc = mode->ofb(ctx, nbytes, in, out);
+    }
+    ccofb_ctx_clear(mode->size, ctx);
     return rc;
 }
 
@@ -323,26 +336,25 @@ CC_INLINE size_t ccxts_context_size(const struct ccmode_xts *mode)
 
 CC_INLINE size_t ccxts_block_size(const struct ccmode_xts *mode)
 {
-       return mode->block_size;
+    return mode->block_size;
 }
 
 /*!
  @function   ccxts_init
  @abstract   Initialize an XTS context.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for this instance
  @param      key_nbytes Length of the key arguments in bytes
  @param      data_key   Key for data encryption
  @param      tweak_key  Key for tweak generation
+
  @result     0 iff successful.
+
  @discussion For security reasons, the two keys must be different.
  */
-CC_INLINE int ccxts_init(const struct ccmode_xts *mode, ccxts_ctx *ctx,
-                          size_t key_nbytes, const void *data_key,
-                          const void *tweak_key)
+CC_INLINE int
+ccxts_init(const struct ccmode_xts *mode, ccxts_ctx *ctx, size_t key_nbytes, const void *data_key, const void *tweak_key)
 {
     return mode->init(mode, ctx, key_nbytes, data_key, tweak_key);
 }
@@ -350,43 +362,42 @@ CC_INLINE int ccxts_init(const struct ccmode_xts *mode, ccxts_ctx *ctx,
 /*!
  @function   ccxts_set_tweak
  @abstract   Initialize the tweak for a sector.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for this instance
  @param      tweak      Context for the tweak for this sector
  @param      iv         Data used to generate the tweak
+
  @discussion The IV must be exactly one block in length.
  */
-CC_INLINE int ccxts_set_tweak(const struct ccmode_xts *mode, ccxts_ctx *ctx,
-                              ccxts_tweak *tweak, const void *iv)
+CC_INLINE int ccxts_set_tweak(const struct ccmode_xts *mode, ccxts_ctx *ctx, ccxts_tweak *tweak, const void *iv)
 {
-       return mode->set_tweak(ctx, tweak, iv);
+    return mode->set_tweak(ctx, tweak, iv);
 }
 
 /*!
  @function   ccxts_update
  @abstract   Encrypt or decrypt data.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for an instance
  @param      tweak      Context for the tweak for this sector
  @param      nblocks    Length of the data in blocks
  @param      in         Input data
  @param      out        Output buffer
+
  @result     The updated internal buffer of the tweak context. May be ignored.
   */
-CC_INLINE void *ccxts_update(const struct ccmode_xts *mode, ccxts_ctx *ctx,
-                             ccxts_tweak *tweak, size_t nblocks, const void *in, void *out)
+CC_INLINE void *
+ccxts_update(const struct ccmode_xts *mode, ccxts_ctx *ctx, ccxts_tweak *tweak, size_t nblocks, const void *in, void *out)
 {
-       return mode->xts(ctx, tweak, nblocks, in, out);
+    return mode->xts(ctx, tweak, nblocks, in, out);
 }
 
 /*!
  @function   ccxts_one_shot
  @abstract   Encrypt or decrypt data in XTS mode.
+
  @param      mode       Descriptor for the mode
  @param      key_nbytes Length of the key arguments in bytes
  @param      data_key   Key for data encryption
@@ -395,15 +406,19 @@ CC_INLINE void *ccxts_update(const struct ccmode_xts *mode, ccxts_ctx *ctx,
  @param      nblocks    Length of the data in blocks
  @param      in         Input data
  @param      out        Output buffer
+
  @result     0 iff successful.
+
  @discussion For security reasons, the two keys must be different.
  */
 int ccxts_one_shot(const struct ccmode_xts *mode,
-                   size_t key_nbytes, const void *data_key,
-                   const void *tweak_key, const void *iv,
-                   size_t nblocks, const void *in, void *out);
+                   size_t key_nbytes,
+                   const void *data_key,
+                   const void *tweak_key,
+                   const void *iv,
+                   size_t nblocks,
+                   const void *in,
+                   void *out);
 
 /* Authenticated cipher modes. */
 
@@ -430,44 +445,44 @@ CC_INLINE size_t ccgcm_context_size(const struct ccmode_gcm *mode)
 
 CC_INLINE size_t ccgcm_block_size(const struct ccmode_gcm *mode)
 {
-       return mode->block_size;
+    return mode->block_size;
 }
 
 /*!
  @function   ccgcm_init
  @abstract   Initialize a GCM context.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for this instance
  @param      key_nbytes Length of the key in bytes
  @param      key        Key for the underlying blockcipher (AES)
+
  @result     0 iff successful.
+
  @discussion The correct sequence of calls is:
+
  @code ccgcm_init(...)
  ccgcm_set_iv(...)
  ccgcm_aad(...)       (may be called zero or more times)
  ccgcm_update(...)    (may be called zero or more times)
  ccgcm_finalize(...)
+
  To reuse the context for additional encryptions, follow this sequence:
+
  @code ccgcm_reset(...)
  ccgcm_set_iv(...)
  ccgcm_aad(...)       (may be called zero or more times)
  ccgcm_update(...)    (may be called zero or more times)
  ccgcm_finalize(...)
+
  @warning The key-IV pair must be unique per encryption. The IV must be nonzero in length.
- @warning It is not permitted to call @p ccgcm_inc_iv after initializing the cipher via the @p ccgcm_init interface. Nonzero is returned in the event of an improper call sequence.
+
+ @warning It is not permitted to call @p ccgcm_inc_iv after initializing the cipher via the @p ccgcm_init interface. Nonzero is
+ returned in the event of an improper call sequence.
 
  @warning This function is not FIPS-compliant. Use @p ccgcm_init_with_iv instead.
  */
-CC_INLINE int ccgcm_init(const struct ccmode_gcm *mode, ccgcm_ctx *ctx,
-                         size_t key_nbytes, const void *key)
+CC_INLINE int ccgcm_init(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t key_nbytes, const void *key)
 {
     return mode->init(mode, ctx, key_nbytes, key);
 }
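 
 /*
  A minimal sketch of the sequence above (assuming an AES-GCM descriptor such as
  ccaes_gcm_encrypt_mode() from <corecrypto/ccaes.h> and the ccgcm_ctx_decl macro
  declared earlier in this header; error checks omitted):
 
      const struct ccmode_gcm *gcm = ccaes_gcm_encrypt_mode();
      ccgcm_ctx_decl(ccgcm_context_size(gcm), ctx);
      uint8_t tag[16];
 
      ccgcm_init(gcm, ctx, 16, key);             // 128-bit AES key
      ccgcm_set_iv(gcm, ctx, 12, iv);            // key-IV pair unique per encryption
      ccgcm_aad(gcm, ctx, aad_nbytes, aad);      // zero or more calls
      ccgcm_update(gcm, ctx, nbytes, pt, ct);    // zero or more calls
      ccgcm_finalize(gcm, ctx, 16, tag);         // emits the authentication tag
  */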
@@ -475,200 +490,204 @@ CC_INLINE int ccgcm_init(const struct ccmode_gcm *mode, ccgcm_ctx *ctx,
 /*!
  @function   ccgcm_init_with_iv
  @abstract   Initialize a GCM context to manage IVs internally.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for this instance
  @param      key_nbytes Length of the key in bytes
  @param      key        Key for the underlying blockcipher (AES)
  @param      iv         IV for the first encryption
+
  @result     0 iff successful.
+
  @discussion The correct sequence of calls is:
+
  @code ccgcm_init_with_iv(...)
  ccgcm_aad(...)       (may be called zero or more times)
  ccgcm_update(...)    (may be called zero or more times)
  ccgcm_finalize(...)
+
  To reuse the context for additional encryptions, follow this sequence:
+
  @code ccgcm_reset(...)
  ccgcm_inc_iv(...)
  ccgcm_aad(...)       (may be called zero or more times)
  ccgcm_update(...)    (may be called zero or more times)
  ccgcm_finalize(...)
+
  The IV must be exactly 12 bytes in length.
- Internally, the IV is treated as a four-byte salt followed by an eight-byte counter. This is to match the behavior of certain protocols (e.g. TLS). In the call to @p ccgcm_inc_iv, the counter component will be interpreted as a big-endian, unsigned value and incremented in place.
- @warning It is not permitted to call @p ccgcm_set_iv after initializing the cipher via the @p ccgcm_init_with_iv interface. Nonzero is returned in the event of an improper call sequence.
- @warning The security of GCM depends on the uniqueness of key-IV pairs. To avoid key-IV repetition, callers should not initialize multiple contexts with the same key material via the @p ccgcm_init_with_iv interface.
+
+ Internally, the IV is treated as a four-byte salt followed by an eight-byte counter. This is to match the behavior of certain
+ protocols (e.g. TLS). In the call to @p ccgcm_inc_iv, the counter component will be interpreted as a big-endian, unsigned value
+ and incremented in place.
+
+ @warning It is not permitted to call @p ccgcm_set_iv after initializing the cipher via the @p ccgcm_init_with_iv interface.
+ Nonzero is returned in the event of an improper call sequence.
+
+ @warning The security of GCM depends on the uniqueness of key-IV pairs. To avoid key-IV repetition, callers should not initialize
+ multiple contexts with the same key material via the @p ccgcm_init_with_iv interface.
  */
-int ccgcm_init_with_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx,
-                       size_t key_nbytes, const void *key,
-                       const void *iv);
+int ccgcm_init_with_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t key_nbytes, const void *key, const void *iv);
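 
 /*
  A sketch of the managed-IV reuse pattern described above (same assumptions as
  the sketch under ccgcm_init; error checks omitted):
 
      uint8_t iv[12];                            // 4-byte salt || 8-byte counter
      ccgcm_init_with_iv(gcm, ctx, 16, key, iv);
      // ... ccgcm_aad / ccgcm_update / ccgcm_finalize as usual ...
 
      ccgcm_reset(gcm, ctx);
      ccgcm_inc_iv(gcm, ctx, iv);                // counter incremented in place, copied back to iv
      // ... aad / update / finalize for the next message ...
  */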
 
 /*!
  @function   ccgcm_set_iv
  @abstract   Set the IV for encryption.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for this instance
  @param      iv_nbytes  Length of the IV in bytes
  @param      iv         Initialization vector
+
  @result     0 iff successful.
+
  @discussion Set the initialization vector for encryption.
+
  @warning The key-IV pair must be unique per encryption. The IV must be nonzero in length.
- In stateful protocols, if each packet exposes a guaranteed-unique value, it is recommended to format this as a 12-byte value for use as the IV.
- In stateless protocols, it is recommended to choose a 16-byte value using a cryptographically-secure pseudorandom number generator (e.g. @p ccrng).
- @warning This function may not be used after initializing the cipher via @p ccgcm_init_with_iv. Nonzero is returned in the event of an improper call sequence.
+
+ In stateful protocols, if each packet exposes a guaranteed-unique value, it is recommended to format this as a 12-byte value for
+ use as the IV.
+
+ In stateless protocols, it is recommended to choose a 16-byte value using a cryptographically-secure pseudorandom number
+ generator (e.g. @p ccrng).
+
+ @warning This function may not be used after initializing the cipher via @p ccgcm_init_with_iv. Nonzero is returned in the event
+ of an improper call sequence.
+
  @warning This function is not FIPS-compliant. Use @p ccgcm_init_with_iv instead.
  */
-CC_INLINE int ccgcm_set_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx,
-                            size_t iv_nbytes, const void *iv)
+CC_INLINE int ccgcm_set_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t iv_nbytes, const void *iv)
 {
-       return mode->set_iv(ctx, iv_nbytes, iv);
+    return mode->set_iv(ctx, iv_nbytes, iv);
 }
 
 /*!
  @function   ccgcm_set_iv_legacy
  @abstract   Set the IV for encryption.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for this instance
  @param      iv_nbytes  Length of the IV in bytes
  @param      iv         Initialization vector
+
  @result     0 iff successful.
+
  @discussion Identical to @p ccgcm_set_iv except that it allows zero-length IVs.
+
  @warning Zero-length IVs nullify the authenticity guarantees of GCM.
+
  @warning Do not use this function in new applications.
  */
-int ccgcm_set_iv_legacy(const struct ccmode_gcm *mode, ccgcm_ctx *ctx,
-                        size_t iv_nbytes, const void *iv);
+int ccgcm_set_iv_legacy(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t iv_nbytes, const void *iv);
 
 /*!
  @function   ccgcm_inc_iv
  @abstract   Increment the IV for another encryption.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for this instance
  @param      iv         Updated initialization vector
+
  @result     0 iff successful.
+
  @discussion Updates the IV internally for another encryption.
- Internally, the IV is treated as a four-byte salt followed by an eight-byte counter. This is to match the behavior of certain protocols (e.g. TLS). The counter component is interpreted as a big-endian, unsigned value and incremented in place.
- The updated IV is copied to @p iv. This is to support protocols that require part of the IV to be specified explicitly in each packet (e.g. TLS).
+
+ Internally, the IV is treated as a four-byte salt followed by an eight-byte counter. This is to match the behavior of certain
+ protocols (e.g. TLS). The counter component is interpreted as a big-endian, unsigned value and incremented in place.
+
+ The updated IV is copied to @p iv. This is to support protocols that require part of the IV to be specified explicitly in each
+ packet (e.g. TLS).
+
  @warning This function may be used only after initializing the cipher via @p ccgcm_init_with_iv.
  */
 int ccgcm_inc_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, void *iv);
 
-
 /*!
  @function   ccgcm_aad
  @abstract   Authenticate additional data.
+
  @param      mode               Descriptor for the mode
  @param      ctx                Context for this instance
  @param      nbytes             Length of the additional data in bytes
  @param      additional_data    Additional data to authenticate
+
  @result     0 iff successful.
+
  @discussion This is typically used to authenticate data that cannot be encrypted (e.g. packet headers).
+
  This function may be called zero or more times.
  */
-CC_INLINE int ccgcm_aad(const struct ccmode_gcm *mode, ccgcm_ctx *ctx,
-                         size_t nbytes, const void *additional_data)
+CC_INLINE int ccgcm_aad(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t nbytes, const void *additional_data)
 {
     return mode->gmac(ctx, nbytes, additional_data);
 }
 
 /*!
  @function   ccgcm_gmac
- @discussion See @p ccgcm_aad.
+
+ @discussion ccgcm_gmac is deprecated. Use the drop-in replacement 'ccgcm_aad' instead.
  */
-CC_INLINE int ccgcm_gmac(const struct ccmode_gcm *mode, ccgcm_ctx *ctx,
-                          size_t nbytes, const void *in)
+CC_INLINE int ccgcm_gmac(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t nbytes, const void *in)
+cc_deprecate_with_replacement("ccgcm_aad", 13.0, 10.15, 13.0, 6.0, 4.0)
 {
-       return mode->gmac(ctx, nbytes, in);
+    return mode->gmac(ctx, nbytes, in);
 }
 
 /*!
  @function   ccgcm_update
  @abstract   Encrypt or decrypt data.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for this instance
  @param      nbytes     Length of the data in bytes
  @param      in         Input plaintext or ciphertext
  @param      out        Output ciphertext or plaintext
+
  @result     0 iff successful.
+
  @discussion In-place processing is supported.
+
  This function may be called zero or more times.
  */
-CC_INLINE int ccgcm_update(const struct ccmode_gcm *mode, ccgcm_ctx *ctx,
-                            size_t nbytes, const void *in, void *out)
+CC_INLINE int ccgcm_update(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t nbytes, const void *in, void *out)
 {
-       return mode->gcm(ctx, nbytes, in, out);
+    return mode->gcm(ctx, nbytes, in, out);
 }
 
 /*!
  @function   ccgcm_finalize
  @abstract   Finish processing and authenticate.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for this instance
  @param      tag_nbytes Length of the tag in bytes
  @param      tag        Authentication tag
+
  @result     0 iff successful.
+
  @discussion Finish processing a packet and generate the authentication tag.
+
  On encryption, @p tag is purely an output parameter. The generated tag is written to @p tag.
- On decryption, @p tag is both an input and an output parameter. Well-behaved callers should provide the authentication tag generated during encryption. The function will return nonzero if the input tag does not match the generated tag. The generated tag will be written into the @p tag buffer whether authentication succeeds or fails.
- @warning The generated tag is written to @p tag to support legacy applications that perform authentication manually. Do not follow this usage pattern in new applications. Rely on the function's error code to verify authenticity.
+
+ On decryption, @p tag is both an input and an output parameter. Well-behaved callers should provide the authentication tag
+ generated during encryption. The function will return nonzero if the input tag does not match the generated tag. The generated
+ tag will be written into the @p tag buffer whether authentication succeeds or fails.
+
+ @warning The generated tag is written to @p tag to support legacy applications that perform authentication manually. Do not
+ follow this usage pattern in new applications. Rely on the function's error code to verify authenticity.
  */
-CC_INLINE int ccgcm_finalize(const struct ccmode_gcm *mode, ccgcm_ctx *ctx,
-                              size_t tag_nbytes, void *tag)
+CC_INLINE int ccgcm_finalize(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t tag_nbytes, void *tag)
 {
-       return mode->finalize(ctx, tag_nbytes, tag);
+    return mode->finalize(ctx, tag_nbytes, tag);
 }
 
 /*!
  @function   ccgcm_reset
  @abstract   Reset the context for another encryption.
+
  @param      mode       Descriptor for the mode
  @param      ctx        Context for this instance
+
  @result     0 iff successful.
+
  @discussion Refer to @p ccgcm_init for correct usage.
  */
 CC_INLINE int ccgcm_reset(const struct ccmode_gcm *mode, ccgcm_ctx *ctx)
@@ -676,11 +695,10 @@ CC_INLINE int ccgcm_reset(const struct ccmode_gcm *mode, ccgcm_ctx *ctx)
     return mode->reset(ctx);
 }
 
-
 /*!
  @function   ccgcm_one_shot
  @abstract   Encrypt or decrypt with GCM.
+
  @param      mode           Descriptor for the mode
  @param      key_nbytes     Length of the key in bytes
  @param      key            Key for the underlying blockcipher (AES)
@@ -693,37 +711,47 @@ CC_INLINE int ccgcm_reset(const struct ccmode_gcm *mode, ccgcm_ctx *ctx)
  @param      out            Output ciphertext or plaintext
  @param      tag_nbytes     Length of the tag in bytes
  @param      tag            Authentication tag
+
  @result     0 iff successful.
+
  @discussion Perform GCM encryption or decryption.
+
  @warning The key-IV pair must be unique per encryption. The IV must be nonzero in length.
- In stateful protocols, if each packet exposes a guaranteed-unique value, it is recommended to format this as a 12-byte value for use as the IV.
- In stateless protocols, it is recommended to choose a 16-byte value using a cryptographically-secure pseudorandom number generator (e.g. @p ccrng).
+
+ In stateful protocols, if each packet exposes a guaranteed-unique value, it is recommended to format this as a 12-byte value for
+ use as the IV.
+
+ In stateless protocols, it is recommended to choose a 16-byte value using a cryptographically-secure pseudorandom number
+ generator (e.g. @p ccrng).
+
  In-place processing is supported.
+
  On encryption, @p tag is purely an output parameter. The generated tag is written to @p tag.
- On decryption, @p tag is primarily an input parameter. The caller should provide the authentication tag generated during encryption. The function will return nonzero if the input tag does not match the generated tag.
- @warning To support legacy applications, @p tag is also an output parameter during decryption. The generated tag is written to @p tag. Legacy callers may choose to compare this to the tag generated during encryption. Do not follow this usage pattern in new applications.
+
+ On decryption, @p tag is primarily an input parameter. The caller should provide the authentication tag generated during
+ encryption. The function will return nonzero if the input tag does not match the generated tag.
+
+ @warning To support legacy applications, @p tag is also an output parameter during decryption. The generated tag is written to @p
+ tag. Legacy callers may choose to compare this to the tag generated during encryption. Do not follow this usage pattern in new
+ applications.
  */
 int ccgcm_one_shot(const struct ccmode_gcm *mode,
-                   size_t key_nbytes, const void *key,
-                   size_t iv_nbytes, const void *iv,
-                   size_t adata_nbytes, const void *adata,
-                   size_t nbytes, const void *in, void *out,
-                   size_t tag_nbytes, void *tag);
-
+                   size_t key_nbytes,
+                   const void *key,
+                   size_t iv_nbytes,
+                   const void *iv,
+                   size_t adata_nbytes,
+                   const void *adata,
+                   size_t nbytes,
+                   const void *in,
+                   void *out,
+                   size_t tag_nbytes,
+                   void *tag);
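 
 /*
  A minimal one-shot encryption sketch (same AES-GCM descriptor assumption as the
  sketches above; error check omitted):
 
      uint8_t tag[16];
      int rc = ccgcm_one_shot(gcm,
                              16, key,           // AES-128 key
                              12, iv,            // unique key-IV pair
                              aad_nbytes, aad,
                              nbytes, pt, ct,
                              16, tag);
  */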
 
 /*!
  @function   ccgcm_one_shot_legacy
  @abstract   Encrypt or decrypt with GCM.
+
  @param      mode           Descriptor for the mode
  @param      key_nbytes     Length of the key in bytes
  @param      key            Key for the underlying blockcipher (AES)
@@ -736,22 +764,27 @@ int ccgcm_one_shot(const struct ccmode_gcm *mode,
  @param      out            Output ciphertext or plaintext
  @param      tag_nbytes     Length of the tag in bytes
  @param      tag            Authentication tag
+
  @result     0 iff successful.
+
  @discussion Identical to @p ccgcm_one_shot except that it allows zero-length IVs.
+
  @warning Zero-length IVs nullify the authenticity guarantees of GCM.
+
  @warning Do not use this function in new applications.
  */
 int ccgcm_one_shot_legacy(const struct ccmode_gcm *mode,
-                          size_t key_nbytes, const void *key,
-                          size_t iv_nbytes, const void *iv,
-                          size_t adata_nbytes, const void *adata,
-                          size_t nbytes, const void *in, void *out,
-                          size_t tag_nbytes, void *tag);
-
+                          size_t key_nbytes,
+                          const void *key,
+                          size_t iv_nbytes,
+                          const void *iv,
+                          size_t adata_nbytes,
+                          const void *adata,
+                          size_t nbytes,
+                          const void *in,
+                          void *out,
+                          size_t tag_nbytes,
+                          void *tag);
 
 /* CCM */
 
@@ -762,7 +795,6 @@ int ccgcm_one_shot_legacy(const struct ccmode_gcm *mode,
 #define ccccm_nonce_decl(_size_, _name_) cc_ctx_decl(ccccm_nonce, _size_, _name_)
 #define ccccm_nonce_clear(_size_, _name_) cc_clear(_size_, _name_)
 
-
 CC_INLINE size_t ccccm_context_size(const struct ccmode_ccm *mode)
 {
     return mode->size;
@@ -770,38 +802,40 @@ CC_INLINE size_t ccccm_context_size(const struct ccmode_ccm *mode)
 
 CC_INLINE size_t ccccm_block_size(const struct ccmode_ccm *mode)
 {
-       return mode->block_size;
+    return mode->block_size;
 }
 
-CC_INLINE int ccccm_init(const struct ccmode_ccm *mode, ccccm_ctx *ctx,
-                          size_t key_len, const void *key)
+CC_INLINE int ccccm_init(const struct ccmode_ccm *mode, ccccm_ctx *ctx, size_t key_len, const void *key)
 {
     return mode->init(mode, ctx, key_len, key);
 }
 
-CC_INLINE int ccccm_set_iv(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx,
-                            size_t nonce_len, const void *nonce,
-                            size_t mac_size, size_t auth_len, size_t data_len)
+CC_INLINE int ccccm_set_iv(const struct ccmode_ccm *mode,
+                           ccccm_ctx *ctx,
+                           ccccm_nonce *nonce_ctx,
+                           size_t nonce_len,
+                           const void *nonce,
+                           size_t mac_size,
+                           size_t auth_len,
+                           size_t data_len)
 {
-       return mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, auth_len, data_len);
+    return mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, auth_len, data_len);
 }
 
-CC_INLINE int ccccm_cbcmac(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx,
-                          size_t nbytes, const void *in)
+CC_INLINE int ccccm_cbcmac(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in)
 {
-       return mode->cbcmac(ctx, nonce_ctx, nbytes, in);
+    return mode->cbcmac(ctx, nonce_ctx, nbytes, in);
 }
 
-CC_INLINE int ccccm_update(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx,
-                            size_t nbytes, const void *in, void *out)
+CC_INLINE int
+ccccm_update(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, void *out)
 {
-       return mode->ccm(ctx, nonce_ctx, nbytes, in, out);
+    return mode->ccm(ctx, nonce_ctx, nbytes, in, out);
 }
 
-CC_INLINE int ccccm_finalize(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx,
-                              void *mac)
+CC_INLINE int ccccm_finalize(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, void *mac)
 {
-       return mode->finalize(ctx, nonce_ctx, mac);
+    return mode->finalize(ctx, nonce_ctx, mac);
 }
 
 CC_INLINE int ccccm_reset(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_nonce *nonce_ctx)
@@ -809,32 +843,43 @@ CC_INLINE int ccccm_reset(const struct ccmode_ccm *mode, ccccm_ctx *ctx, ccccm_n
     return mode->reset(ctx, nonce_ctx);
 }
 
-
 CC_INLINE int ccccm_one_shot(const struct ccmode_ccm *mode,
-                              size_t key_len, const void *key,
-                              size_t nonce_len, const void *nonce,
-                              size_t nbytes, const void *in, void *out,
-                              size_t adata_len, const void* adata,
-                              size_t mac_size, void *mac)
-{
-    int rc=0;
-       ccccm_ctx_decl(mode->size, ctx);
-       ccccm_nonce_decl(mode->nonce_size, nonce_ctx);
-       rc = mode->init(mode, ctx, key_len, key);
-       if(rc==0) rc=mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, adata_len, nbytes);
-       if(rc==0) rc=mode->cbcmac(ctx, nonce_ctx, adata_len, adata);
-       if(rc==0) rc=mode->ccm(ctx, nonce_ctx, nbytes, in, out);
-       if(rc==0) rc=mode->finalize(ctx, nonce_ctx, mac);
-       ccccm_ctx_clear(mode->size, ctx);
+                             size_t key_len,
+                             const void *key,
+                             size_t nonce_len,
+                             const void *nonce,
+                             size_t nbytes,
+                             const void *in,
+                             void *out,
+                             size_t adata_len,
+                             const void *adata,
+                             size_t mac_size,
+                             void *mac)
+{
+    int rc;
+    ccccm_ctx_decl(mode->size, ctx);
+    ccccm_nonce_decl(mode->nonce_size, nonce_ctx);
+    rc = mode->init(mode, ctx, key_len, key);
+    if (rc == 0) {
+        rc = mode->set_iv(ctx, nonce_ctx, nonce_len, nonce, mac_size, adata_len, nbytes);
+    }
+    if (rc == 0) {
+        rc = mode->cbcmac(ctx, nonce_ctx, adata_len, adata);
+    }
+    if (rc == 0) {
+        rc = mode->ccm(ctx, nonce_ctx, nbytes, in, out);
+    }
+    if (rc == 0) {
+        rc = mode->finalize(ctx, nonce_ctx, mac);
+    }
+    ccccm_ctx_clear(mode->size, ctx);
     ccccm_nonce_clear(mode->nonce_size, nonce_ctx);
 
     return rc;
 }
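 
 /*
  Note that CCM, unlike GCM, needs the total auth-data and payload lengths up
  front: the one-shot body above passes adata_len and nbytes to set_iv before any
  data is processed. A minimal call sketch (assuming an AES-CCM mode descriptor;
  error check omitted):
 
      uint8_t mac[16];
      int rc = ccccm_one_shot(mode,
                              16, key,           // AES-128 key
                              13, nonce,         // CCM nonces are 7-13 bytes long
                              nbytes, in, out,
                              adata_len, adata,
                              16, mac);
  */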
 
-
 /* OMAC mode. */
 
-
 /* Declare a omac key named _name_.  Pass the size field of a struct ccmode_omac
  for _size_. */
 #define ccomac_ctx_decl(_size_, _name_) cc_ctx_decl(ccomac_ctx, _size_, _name_)
@@ -847,32 +892,37 @@ CC_INLINE size_t ccomac_context_size(const struct ccmode_omac *mode)
 
 CC_INLINE size_t ccomac_block_size(const struct ccmode_omac *mode)
 {
-       return mode->block_size;
+    return mode->block_size;
 }
 
-CC_INLINE int ccomac_init(const struct ccmode_omac *mode, ccomac_ctx *ctx,
-                          size_t tweak_len, size_t key_len, const void *key)
+CC_INLINE int ccomac_init(const struct ccmode_omac *mode, ccomac_ctx *ctx, size_t tweak_len, size_t key_len, const void *key)
 {
     return mode->init(mode, ctx, tweak_len, key_len, key);
 }
 
-CC_INLINE int ccomac_update(const struct ccmode_omac *mode, ccomac_ctx *ctx,
-       size_t nblocks, const void *tweak, const void *in, void *out)
+CC_INLINE int
+ccomac_update(const struct ccmode_omac *mode, ccomac_ctx *ctx, size_t nblocks, const void *tweak, const void *in, void *out)
 {
-       return mode->omac(ctx, nblocks, tweak, in, out);
+    return mode->omac(ctx, nblocks, tweak, in, out);
 }
 
 CC_INLINE int ccomac_one_shot(const struct ccmode_omac *mode,
-       size_t tweak_len, size_t key_len, const void *key,
-       const void *tweak, size_t nblocks, const void *in, void *out)
+                              size_t tweak_len,
+                              size_t key_len,
+                              const void *key,
+                              const void *tweak,
+                              size_t nblocks,
+                              const void *in,
+                              void *out)
 {
     int rc;
-       ccomac_ctx_decl(mode->size, ctx);
-       rc = mode->init(mode, ctx, tweak_len, key_len, key);
-       if (rc == 0) rc = mode->omac(ctx, nblocks, tweak, in, out);
-       ccomac_ctx_clear(mode->size, ctx);
+    ccomac_ctx_decl(mode->size, ctx);
+    rc = mode->init(mode, ctx, tweak_len, key_len, key);
+    if (rc == 0) {
+        rc = mode->omac(ctx, nblocks, tweak, in, out);
+    }
+    ccomac_ctx_clear(mode->size, ctx);
     return rc;
 }
 
-
 #endif /* _CORECRYPTO_CCMODE_H_ */
index a9498d1f7a03fce1ba0e11a5d61a7dc57822fc75..aa8cb052772d10924b58d66355be498ee70c2893 100644 (file)
 #include <corecrypto/ccn.h>  /* TODO: Remove dependency on this header. */
 #include <corecrypto/ccmode_impl.h>
 
-/* Function and macros defined in this file are only to be used
+/* Functions defined in this file are only to be used
  within corecrypto files.
  */
 
-/* For CBC, direction of underlying ecb is the same as the cbc direction */
-#define CCMODE_CBC_FACTORY(_cipher_, _dir_)                                     \
-static struct ccmode_cbc cbc_##_cipher_##_##_dir_;                              \
-                                                                                \
-const struct ccmode_cbc *cc##_cipher_##_cbc_##_dir_##_mode(void)                \
-{                                                                               \
-    const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_##_dir_##_mode();           \
-    ccmode_factory_cbc_##_dir_(&cbc_##_cipher_##_##_dir_, ecb);                 \
-    return &cbc_##_cipher_##_##_dir_;                                           \
-}
-
-/* For CTR, only one direction, underlying ecb is always encrypt */
-#define CCMODE_CTR_FACTORY(_cipher_)                                            \
-static struct ccmode_ctr ctr_##_cipher_;                                        \
-                                                                                \
-const struct ccmode_ctr *cc##_cipher_##_ctr_crypt_mode(void)                    \
-{                                                                               \
-    const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_encrypt_mode();             \
-    ccmode_factory_ctr_crypt(&ctr_##_cipher_, ecb);                             \
-    return &ctr_##_cipher_;                                                     \
-}
-
-/* OFB, same as CTR */
-#define CCMODE_OFB_FACTORY(_cipher_)                                            \
-static struct ccmode_ofb ofb_##_cipher_;                                        \
-                                                                                \
-const struct ccmode_ofb *cc##_cipher_##_ofb_crypt_mode(void)                    \
-{                                                                               \
-    const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_encrypt_mode();             \
-    ccmode_factory_ofb_crypt(&ofb_##_cipher_, ecb);                             \
-    return &ofb_##_cipher_;                                                     \
-}
-
-
-/* For CFB, the underlying ecb operation is encrypt for both directions */
-#define CCMODE_CFB_FACTORY(_cipher_, _mode_, _dir_)                             \
-static struct ccmode_##_mode_ _mode_##_##_cipher_##_##_dir_;                    \
-                                                                                \
-const struct ccmode_##_mode_ *cc##_cipher_##_##_mode_##_##_dir_##_mode(void)    \
-{                                                                               \
-    const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_encrypt_mode();             \
-    ccmode_factory_##_mode_##_##_dir_(&_mode_##_##_cipher_##_##_dir_, ecb);     \
-    return &_mode_##_##_cipher_##_##_dir_;                                      \
-}
-
-/* For GCM, same as CFB */
-#define CCMODE_GCM_FACTORY(_cipher_, _dir_) CCMODE_CFB_FACTORY(_cipher_, gcm, _dir_)
-
-/* For CCM, same as CFB */
-#define CCMODE_CCM_FACTORY(_cipher_, _dir_) CCMODE_CFB_FACTORY(_cipher_, ccm, _dir_)
-
-
-/* Fot XTS, you always need an ecb encrypt */
-#define CCMODE_XTS_FACTORY(_cipher_ , _dir_)                                    \
-static struct ccmode_xts xts##_cipher_##_##_dir_;                               \
-                                                                                \
-const struct ccmode_xts *cc##_cipher_##_xts_##_dir_##_mode(void)                \
-{                                                                               \
-    const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_##_dir_##_mode();           \
-    const struct ccmode_ecb *ecb_enc=cc##_cipher_##_ecb_encrypt_mode();         \
-                                                                                \
-    ccmode_factory_xts_##_dir_(&xts##_cipher_##_##_dir_, ecb, ecb_enc);         \
-    return &xts##_cipher_##_##_dir_;                                            \
-}
-
 /* Use these functions to initialize a ccmode_cbc decrypt object at runtime (for
  example if it's part of a larger structure). Normally you would pass an
  ecb decrypt mode implementation of some underlying algorithm as the ecb
index 99322ad2de82e6047067c94d928ac4a77a049a4c..1b05c638ea2eb2a6b67c5740d8c34051700bf0c5 100644 (file)
@@ -56,7 +56,7 @@ CC_INLINE size_t ccsiv_block_size(const struct ccmode_siv *mode)
 CC_INLINE size_t ccsiv_ciphertext_size(const struct ccmode_siv *mode,
                                        size_t plaintext_size)
 {
-    return plaintext_size+mode->cbc->block_size;
+    return plaintext_size + mode->cbc->block_size;
 }
 
 CC_INLINE size_t ccsiv_plaintext_size(const struct ccmode_siv *mode,
@@ -65,7 +65,7 @@ CC_INLINE size_t ccsiv_plaintext_size(const struct ccmode_siv *mode,
     if (ciphertext_size<mode->cbc->block_size) {
         return 0; // error
     }
-    return ciphertext_size-mode->cbc->block_size;
+    return ciphertext_size - mode->cbc->block_size;
 }
 
 // Supported key sizes are 32, 48, 64 bytes
@@ -99,7 +99,6 @@ CC_INLINE int ccsiv_crypt(const struct ccmode_siv *mode, ccsiv_ctx *ctx,
 }
 
 // Clear all context for reuse.
-// Key is clear to avoid leaking it
 CC_INLINE int ccsiv_reset(const struct ccmode_siv *mode, ccsiv_ctx *ctx)
 {
     return mode->reset(ctx);
diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_siv_hmac.h b/EXTERNAL_HEADERS/corecrypto/ccmode_siv_hmac.h
new file mode 100644 (file)
index 0000000..2cbc9a1
--- /dev/null
@@ -0,0 +1,205 @@
+//
+//  ccmode_siv_hmac.h
+//  corecrypto
+//
+//  Created by Apple on 12/10/18.
+//
+
+#ifndef ccmode_siv_hmac_h
+#define ccmode_siv_hmac_h
+
+#include <corecrypto/cc.h>
+#include <corecrypto/ccmode.h>
+#include <corecrypto/ccmode_impl.h>
+#include <corecrypto/ccdigest.h>
+#include <corecrypto/cchmac.h>
+#include <corecrypto/ccsha2.h>
+
+/* This provides an implementation of SIV using AES CTR mode with HMAC as the MAC,
+ allowing for a tagging mechanism with collision-resistant tags. This is a modification of the
+ standard specified in https://tools.ietf.org/html/rfc5297
+ (also in http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/siv/siv.pdf):
+ counter mode where the IV is derived via HMAC.
+ */
+
+cc_aligned_struct(16) ccsiv_hmac_ctx;
+
+struct ccmode_siv_hmac {
+    size_t size; /* first argument to ccsiv_hmac_ctx_decl(). */
+    size_t block_size;
+    
+    int (*init)(const struct ccmode_siv_hmac *sivhmac,
+                ccsiv_hmac_ctx *ctx,
+                size_t key_len,
+                const uint8_t *key,
+                const size_t tag_size);
+    int (*set_nonce)(ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in);
+    int (*auth)(ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in);      
+    int (*crypt)(ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in, uint8_t *out);
+    int (*reset)(ccsiv_hmac_ctx *ctx);
+    const struct ccdigest_info *hmac_digest; // Digest to be used in HMAC
+    const struct ccmode_ctr *ctr;
+};
+
+#define ccsiv_hmac_ctx_decl(_size_, _name_) cc_ctx_decl(ccsiv_hmac_ctx, _size_, _name_)
+#define ccsiv_hmac_ctx_clear(_size_, _name_) cc_clear(_size_, _name_)
+
+/*!
+ @function   ccsiv_hmac_context_size
+ @abstract   Return size of context
+ @param      mode       Descriptor for the mode
+ */
+CC_INLINE size_t ccsiv_hmac_context_size(const struct ccmode_siv_hmac *mode)
+{
+    return mode->size;
+}
+
+/*!
+ @function   ccsiv_hmac_block_size
+ @abstract   Return block size of the mode
+ @param      mode       Descriptor for the mode
+ */
+CC_INLINE size_t ccsiv_hmac_block_size(const struct ccmode_siv_hmac *mode)
+{
+    return mode->block_size;
+}
+
+/*!
+ @function   ccsiv_hmac_ciphertext_size
+ @abstract   Return the size of the ciphertext (the encrypted plaintext and corresponding tag) given the context and plaintext length
+ @param      ctx               Current siv_hmac context that has been previously initialized
+ @param      plaintext_size    Size of the plaintext
+ @discussion Returns the length of the AEAD ciphertext that the context will generate, which includes both the encrypted plaintext
+ and the tag.
+ */
+size_t ccsiv_hmac_ciphertext_size(ccsiv_hmac_ctx *ctx, size_t plaintext_size);
+
+/*!
+ @function   ccsiv_hmac_plaintext_size
+ @abstract   Return the size of the plaintext given a ciphertext length and context.
+
+ @param      ctx               Current siv_hmac context that has been previously initialized
+ @param      ciphertext_size   Size of the ciphertext
+
+ @discussion Returns the length of the plaintext recovered from an AEAD ciphertext of the given size (the ciphertext comprises the
+ encrypted plaintext and the tag).
+ */
+size_t ccsiv_hmac_plaintext_size(ccsiv_hmac_ctx *ctx, size_t ciphertext_size);
+
+/*!
+ @function   ccsiv_hmac_init
+ @abstract   Initialize a context for siv_hmac with an associated mode, given key and specifying output tag size.
+ @param      mode               Descriptor for the mode
+ @param      ctx                Allocated context to be initialized
+ @param      key_byte_len       Length of the key:  Supported key sizes are 32, 48, 64 bytes
+ @param      key                key for siv_hmac
+ @param      tag_size           The length of the output tag requested. Must be at least 20 bytes, and can be as large as the
+ associated digest's output
+ @discussion In order to compute HMAC_SIV_Enc_k(a1,...,am, n, x) where ai is the ith piece of associated data, n is a nonce and x
+ is a plaintext, we first initialize the context with this call, and then use it to call ccsiv_hmac_aad for each ai, followed by
+ ccsiv_hmac_set_nonce for nonce n, and finally a call to ccsiv_hmac_crypt for the plaintext x. Note the order of the calls to aad,
+ nonce and then crypt is critical. If a second encryption is needed then a call to ccsiv_hmac_reset can be used to reset state,
+ and begin again.
+ */
+int ccsiv_hmac_init(const struct ccmode_siv_hmac *mode, ccsiv_hmac_ctx *ctx, size_t key_byte_len, const uint8_t *key, size_t tag_size);
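+
+/*
+ A minimal sketch of the sequence described above (assuming a SHA-256-based mode
+ descriptor so a 32-byte tag is valid; error checks omitted):
+
+     ccsiv_hmac_ctx_decl(ccsiv_hmac_context_size(mode), ctx);
+     ccsiv_hmac_init(mode, ctx, 32, key, 32);                // 32-byte key and tag
+     ccsiv_hmac_aad(mode, ctx, a1_nbytes, a1);               // each ai, in order
+     ccsiv_hmac_set_nonce(mode, ctx, nonce_nbytes, nonce);   // optional
+     ccsiv_hmac_crypt(mode, ctx, pt_nbytes, pt, ct);         // ct = tag || encrypted pt
+ */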
+
+/*!
+ @function   ccsiv_hmac_aad
+ @abstract   Add the next piece of associated data to the hmac_siv's computation of the tag. Note this call is optional and no
+ associated data needs to be provided. Multiple pieces of associated data can be provided by multiple calls to this
+ function. Each input is regarded as a separate piece of associated data, and the mac is NOT simply computed on the
+ concatenation of all of the associated data inputs. Therefore on decryption the same inputs must be provided and in
+ the same order.
+ @param      mode               Descriptor for the mode
+ @param      ctx                Initialized ctx
+ @param      nbytes             Length of the current associated data being added
+ @param      in                 Associated data to be authenticated.
+ @discussion Adds the associated data given by in to the computation of the tag in the associated data.
+ */
+int ccsiv_hmac_aad(const struct ccmode_siv_hmac *mode, ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in);
+
+/*!
+ @function   ccsiv_hmac_set_nonce
+ @abstract   Add the nonce to the hmac_siv's computation of the tag. Changes the internal state of the context
+ so that after the call only a crypt or reset call is permitted.
+ @param      mode               Descriptor for the mode
+ @param      ctx                Initialized ctx
+ @param      nbytes             Length of the current nonce data being added
+ @param      in                 Nonce data to be authenticated.
+ @discussion The nonce is a special form of authenticated data. If provided (a call to ccsiv_hmac_set_nonce is optional) it allows
+ randomization of the ciphertext (preventing deterministic encryption). While the length of the nonce is not limited, the
+ amount of entropy that can be provided is limited by the number of bits in the block of the associated block-cipher in mode.
+ */
+int ccsiv_hmac_set_nonce(const struct ccmode_siv_hmac *mode, ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in);
+
+/*!
+ @function   ccsiv_hmac_crypt
+ @abstract   Depending on whether mode has been set up to encrypt or decrypt, this function
+ 1) Encrypts the plaintext given as input in, and provides the ciphertext (which is a concatenation of the tag
+ followed by the encrypted plaintext) as output out. 2) Decrypts plaintext using the input ciphertext at in (which again is the
+ tag, followed by encrypted plaintext), and then verifies that the computed tag and provided tag match.
+ @param      mode               Descriptor for the mode
+ @param      ctx                Initialized ctx
+ @param      nbytes             Case 1) Length of the current plaintext
+ Case 2) Length of the current ciphertext (tag length + plaintext length)
+ @param      in                 Case 1) Plaintext
+ Case 2) Ciphertext
+ @discussion This function is only called once. If one wishes to compute another (en)/(de)cryption, one resets the state with
+ ccsiv_hmac_reset, and then begins the process again. There is no way to stream large plaintext/ciphertext inputs into the
+ function.
+ In the case of a decryption, if there is a failure in verifying the computed tag against the provided tag (embedded in the
+ ciphertext), then a decryption/verification failure is returned, and any internally computed plaintexts and tags are zeroed out.
+ Lastly, the context's internal state is reset, so that a new decryption/encryption can be commenced.
+ */
+int ccsiv_hmac_crypt(const struct ccmode_siv_hmac *mode, ccsiv_hmac_ctx *ctx, size_t nbytes, const uint8_t *in, uint8_t *out);
+
+/*!
+ @function   ccsiv_hmac_reset
+ @abstract   Resets the state of the siv_hmac ctx, maintaining the key, but preparing the
+ ctx to perform a new Associated Data Authenticated (En)/(De)cryption.
+ @param      mode               Descriptor for the mode
+ @param      ctx                Initialized ctx
+ */
+int ccsiv_hmac_reset(const struct ccmode_siv_hmac *mode, ccsiv_hmac_ctx *ctx);
+
+/*!
+ @function   ccsiv_hmac_one_shot
+ @abstract   A simplified but more constrained way of performing an AEAD SIV HMAC (en)/(de)cryption. It is limited because only
+ one piece of associated data may be provided.
+ @param      mode               Descriptor for the mode
+ @param      key_len            Length of the key:  Supported key sizes are 32, 48, 64 bytes
+ @param      key                key for siv_hmac
+ @param      tag_length         The length of the tag to produce or accept as input. Must be at least 20
+ bytes, and can be as large as the hmac's digest's output
+ @param      nonce_nbytes       Length of the current nonce data being added
+ @param      nonce              Nonce data to be authenticated.
+ @param      adata_nbytes       Length of the associated data.
+ @param      adata              Associated data to be authenticated.
+ @param      in_nbytes          Length of either the plaintext (for encryption) or ciphertext (for decryption)
+ @param      in                 Plaintext or ciphertext. Note that the ciphertext includes a tag of length tag_length prepended to
+ it.
+ @param      out                Output buffer: ciphertext (tag prepended) on encryption, plaintext on decryption.
+ */
+
+// One shot AEAD with only one input for adata, and a nonce.
+int ccsiv_hmac_one_shot(const struct ccmode_siv_hmac *mode,
+                        size_t key_len,
+                        const uint8_t *key,
+                        size_t tag_length,
+                        unsigned nonce_nbytes,
+                        const uint8_t *nonce,
+                        unsigned adata_nbytes,
+                        const uint8_t *adata,
+                        size_t in_nbytes,
+                        const uint8_t *in,
+                        uint8_t *out);
+
+#endif /* ccmode_siv_hmac_h */
index 2d3e847c9cc6fe683ec91d65c839635632518f3f..778f3e5cf6f2c69ae1ca4870cf50232467add3af 100644 (file)
@@ -62,18 +62,10 @@ typedef uint16_t cc_dunit;         // 16 bit double width unit
 #error invalid CCN_UNIT_SIZE
 #endif
 
-// All mp types have units in little endian unit order.
-typedef cc_unit *ccn_t;                // n unit long mp
-typedef cc_unit *ccnp1_t;              // n + 1 unit long mp
-typedef cc_unit *cc2n_t;               // 2 * n unit long mp
-typedef cc_unit *cc2np2_t;             // 2 * n + 2 unit long mp
-typedef const cc_unit *ccn_in_t;       // n unit long mp
-typedef const cc_unit *ccnp1_in_t;     // n + 1 unit long mp
-typedef const cc_unit *cc2n_in_t;      // 2 * n unit long mp
-typedef const cc_unit *cc2np2_in_t;    // 2 * n + 2 unit long mp
-
 #define CCN_UNIT_BITS  (sizeof(cc_unit) * 8)
 #define CCN_UNIT_MASK  ((cc_unit)~0)
+#define CCN_UNIT_LOWER_HALF_MASK  ((CCN_UNIT_MASK) >> (CCN_UNIT_BITS/2))
+#define CCN_UNIT_UPPER_HALF_MASK  (~CCN_UNIT_LOWER_HALF_MASK)
 
 typedef struct {
     cc_unit *start;      // First cc_unit of the workspace
@@ -233,6 +225,7 @@ typedef struct {
 
 /* Macros to construct fixed size ccn arrays from 64 or 32 bit quantities. */
 #define ccn192_64(a2,a1,a0) ccn64_64(a0),ccn64_64(a1),ccn64_64(a2)
+#define ccn192_32(a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4)
 #define ccn224_32(a6,a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4),ccn32_32(a6)
 #define ccn256_32(a7,a6,a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4),ccn64_32(a7,a6)
 #define ccn384_32(a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4),ccn64_32(a7,a6),ccn64_32(a9,a8),ccn64_32(a11,a10)
@@ -286,18 +279,23 @@ typedef struct {
 
 /* Return the number of used units after stripping leading 0 units.  */
 CC_PURE CC_NONNULL((2))
-cc_size ccn_n(cc_size n, const cc_unit *s);
+cc_size ccn_n(cc_size n, const cc_unit *s) __asm__("_ccn_n");
 
-/* s >> k -> r return bits shifted out of least significant word in bits [0, n>
+/* s >> k -> r return bits shifted out of least significant word in the highest order bits of
+ the returned value. For example if CCN_UNIT_SIZE == 1, then (0b1101 1110)>>4 returns (0b1110 0000)
+ and sets r==(0b0000 1101).
  { N bit, scalar -> N bit } N = n * sizeof(cc_unit) * 8
  the _multi version doesn't return the shifted bits, but does support multiple
  word shifts.  */
 CC_NONNULL((2, 3))
-cc_unit ccn_shift_right(cc_size n, cc_unit *r, const cc_unit *s, size_t k);
+cc_unit ccn_shift_right(cc_size n, cc_unit *r, const cc_unit *s, size_t k) __asm__("_ccn_shift_right");
 
 /* s == 0 -> return 0 | s > 0 -> return index (starting at 1) of most
- significant bit that is 1.
- { N bit } N = n * sizeof(cc_unit) * 8 */
+ * significant bit that is 1.
+ * { N bit } N = n * sizeof(cc_unit) * 8
+ *
+ * Runs in constant time, independent of the value of `s`.
+ */
 CC_NONNULL((2))
 size_t ccn_bitlen(cc_size n, const cc_unit *s);
 
@@ -314,7 +312,7 @@ size_t ccn_bitlen(cc_size n, const cc_unit *s);
 /* s < t -> return - 1 | s == t -> return 0 | s > t -> return 1
  { N bit, N bit -> int } N = n * sizeof(cc_unit) * 8 */
 CC_PURE CC_NONNULL((2, 3))
-int ccn_cmp(cc_size n, const cc_unit *s, const cc_unit *t);
+int ccn_cmp(cc_size n, const cc_unit *s, const cc_unit *t) __asm__("_ccn_cmp");
 
 /* s < t -> return - 1 | s == t -> return 0 | s > t -> return 1
  { N bit, M bit -> int } N = ns * sizeof(cc_unit) * 8  M = nt * sizeof(cc_unit) * 8 */
@@ -332,7 +330,7 @@ int ccn_cmpn(cc_size ns, const cc_unit *s,
 /* s - t -> r return 1 iff t > s
  { N bit, N bit -> N bit } N = n * sizeof(cc_unit) * 8 */
 CC_NONNULL((2, 3, 4))
-cc_unit ccn_sub(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t);
+cc_unit ccn_sub(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t) __asm__("_ccn_sub");
 
 /* s - v -> r return 1 iff v > s return 0 otherwise.
  { N bit, sizeof(cc_unit) * 8 bit -> N bit } N = n * sizeof(cc_unit) * 8 */
@@ -353,7 +351,7 @@ cc_unit ccn_subn(cc_size n, cc_unit *r, const cc_unit *s,
 /* s + t -> r return carry if result doesn't fit in n bits.
  { N bit, N bit -> N bit } N = n * sizeof(cc_unit) * 8 */
 CC_NONNULL((2, 3, 4))
-cc_unit ccn_add(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t);
+cc_unit ccn_add(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t) __asm__("_ccn_add");
 
 /* s + v -> r return carry if result doesn't fit in n bits.
  { N bit, sizeof(cc_unit) * 8 bit -> N bit } N = n * sizeof(cc_unit) * 8 */
@@ -375,7 +373,7 @@ cc_unit ccn_addn(cc_size n, cc_unit *r, const cc_unit *s,
  { n bit, n bit -> 2 * n bit } n = count * sizeof(cc_unit) * 8
  { N bit, N bit -> 2N bit } N = ccn_bitsof(n) */
 CC_NONNULL((2, 3, 4))
-void ccn_mul(cc_size n, cc_unit *r_2n, const cc_unit *s, const cc_unit *t);
+void ccn_mul(cc_size n, cc_unit *r_2n, const cc_unit *s, const cc_unit *t) __asm__("_ccn_mul");
 
 /* s[0..n) * v -> r[0..n)+return value
  { N bit, sizeof(cc_unit) * 8 bit -> N + sizeof(cc_unit) * 8 bit } N = n * sizeof(cc_unit) * 8 */
@@ -387,50 +385,120 @@ cc_unit ccn_mul1(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit v);
 CC_NONNULL((2, 3))
 cc_unit ccn_addmul1(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit v);
 
-#if 0
-/* a % d -> n
-   {2 * n bit, n bit -> n bit } n = count * sizeof(cc_unit) * 8 */
-CC_NONNULL((2, 3, 4))
-void ccn_mod(cc_size n, cc_unit *r, const cc_unit *a_2n, const cc_unit *d);
-#endif
 
-/* r = (data, len) treated as a big endian byte array, return -1 if data
- doesn't fit in r, return 0 otherwise. */
+/*!
+ @function   ccn_read_uint
+ @abstract   Copy big endian integer and represent it in cc_units
+
+ @param n           Input allocated size of the cc_unit output array r
+ @param r           Output cc_unit array for unsigned integer
+ @param data_nbytes Input byte size of data
+ @param data        Input unsigned integer represented in big endian
+
+ @result r is initialized with the big unsigned number
+
+ @return 0 if no error, !=0 if the big number cannot be represented in the allocated cc_unit array.
+
+ @discussion The execution pattern of this function depends on both n and data_nbytes but not on data values, except in the
+ handling of the error case.
+ */
+
 CC_NONNULL((2, 4))
-int ccn_read_uint(cc_size n, cc_unit *r, size_t data_size, const uint8_t *data);
+int ccn_read_uint(cc_size n, cc_unit *r, size_t data_nbytes, const uint8_t *data);
 
 /* r = (data, len) treated as a big endian byte array, return -1 if data
  doesn't fit in r, return 0 otherwise.
  ccn_read_uint strips leading zeroes and doesn't care about sign. */
 #define ccn_read_int(n, r, data_size, data) ccn_read_uint(n, r, data_size, data)
 
-/* Return actual size in bytes needed to serialize s. */
-CC_PURE CC_NONNULL((2))
-size_t ccn_write_uint_size(cc_size n, const cc_unit *s);
+/*!
+ @function   ccn_write_uint_size
+ @abstract   Compute the minimum size required to store a big integer
+
+ @param n           Input size of the cc_unit array representing the input
+ @param s           Input cc_unit array
+
+ @result Return value is the exact byte size of the big integer
+
+ @discussion
+ The execution flow is independent of the value of the big integer.
+ However, the use of the returned value may leak the position of the most significant byte.
+ */
+CC_PURE CC_NONNULL((2)) size_t ccn_write_uint_size(cc_size n, const cc_unit *s);
 
-/* Serialize s, to out.
-   First byte of byte stream is the m.s. byte of s,
-   regardless of the size of cc_unit.
+/*!
+ @function   ccn_write_uint
+ @abstract   Serialize the big integer into a big endian byte buffer
 
-   No assumption is made about the alignment of out.
+ @param n           Input size of the cc_unit array representing the input
+ @param s           Input cc_unit array
+ @param out_size    Size of the output buffer
+ @param out         Output byte array of size at least out_size
+
+ @discussion This function writes exactly
+ MIN(out_size, ccn_write_uint_size(n,s)) bytes, truncating to keep the
+ most significant bytes when out_size < ccn_write_uint_size(n,s). The
+ execution flow of the function is based on the position of the most
+ significant byte as well as input sizes.
+
+ */
 
-   The out_size argument should be the value returned from ccn_write_uint_size,
-   and is also the exact number of bytes this function will write to out.
-   If out_size if less than the value returned by ccn_write_uint_size, only the
-   first out_size non-zero most significant octets of s will be written. */
 CC_NONNULL((2, 4))
 void ccn_write_uint(cc_size n, const cc_unit *s, size_t out_size, void *out);
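 
 /*
  A round-trip sketch for ccn_read_uint/ccn_write_uint (be_in and be_out are
  caller-supplied big endian byte buffers; a 256-bit value is assumed):
 
      cc_unit x[ccn_nof(256)];
      int err = ccn_read_uint(ccn_nof(256), x, sizeof(be_in), be_in);  // nonzero if it doesn't fit
      size_t nbytes = ccn_write_uint_size(ccn_nof(256), x);            // exact serialized size
      ccn_write_uint(ccn_nof(256), x, nbytes, be_out);                 // big endian, no leading zeroes
  */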
 
+/*!
+ @function   ccn_write_uint_padded_ct
+ @abstract   Serialize the big integer into a big endian byte buffer
 
-CC_INLINE CC_NONNULL((2, 4))
-cc_size ccn_write_uint_padded(cc_size n, const cc_unit* s, size_t out_size, uint8_t* to)
-{
-    size_t bytesInKey = ccn_write_uint_size(n, s);
-    cc_size offset = (out_size > bytesInKey) ? out_size - bytesInKey : 0;
+ @param n           Input size of the cc_unit array representing the input
+ @param s           Input cc_unit array
+ @param out_size    Size of the output buffer
+ @param out         Output byte array of size at least out_size
+
+ @return number of leading zero bytes in case of success, a negative error value in case of failure
+
+ @result  This function writes exactly out_size bytes, padding with zeroes when necessary.
+ This function DOES NOT support truncation and returns an error if out_size < ccn_write_uint_size
+
+ @discussion The execution flow of the function is independent of the value of the big integer.
+ However, the processing of the return value by the caller may expose the position of
+ the most significant byte.
+ */
+CC_NONNULL((2, 4))
+int ccn_write_uint_padded_ct(cc_size n, const cc_unit *s, size_t out_size, uint8_t *out);
+
+/*!
+ @function   ccn_write_uint_padded
+ @abstract   Serialize the big integer into a big endian byte buffer.
+ Not recommended; for most cases ccn_write_uint_padded_ct is more appropriate.
+ Sensitive big integers are exposed since the processing exposes the position of the MS byte.
+
+ @param n           Input size of the cc_unit array representing the input
+ @param s           Input cc_unit array
+ @param out_size    Size of the output buffer
+ @param out         Output byte array of size at least out_size
 
-    cc_zero(offset, to);
-    ccn_write_uint(n, s, out_size - offset, to + offset);
+ @return number of leading zero bytes
 
+ @result  This function writes exactly out_size bytes, padding with zeroes when necessary.
+ This function DOES support truncation when out_size < ccn_write_uint_size().
+
+ @discussion The execution flow of this function DEPENDS on the position of the most significant byte in
+ case truncation is required.
+ */
+
+CC_INLINE CC_NONNULL((2, 4)) size_t ccn_write_uint_padded(cc_size n, const cc_unit *s, size_t out_size, uint8_t *out)
+{
+    size_t offset = 0;
+    // Try first the non-truncation case
+    int offset_int = ccn_write_uint_padded_ct(n, s, out_size, out);
+    if (offset_int >= 0) {
+        // It worked
+        offset = (size_t)offset_int;
+    } else {
+        // Truncation case, execution depends on the position of the MSByte
+        ccn_write_uint(n, s, out_size, out);
+    }
     return offset;
 }
 
@@ -456,11 +524,11 @@ void ccn_write_int(cc_size n, const cc_unit *s, size_t out_size, void *out);
 /* s -> r
  { n bit -> n bit } */
 CC_NONNULL((2, 3))
-void ccn_set(cc_size n, cc_unit *r, const cc_unit *s);
+void ccn_set(cc_size n, cc_unit *r, const cc_unit *s) __asm__("_ccn_set");
 
 CC_INLINE CC_NONNULL((2))
 void ccn_zero(cc_size n, cc_unit *r) {
-    cc_zero(ccn_sizeof_n(n),r);
+    cc_clear(ccn_sizeof_n(n),r);
 }
 
 CC_INLINE CC_NONNULL((2))
index c6bc18a90b339c3a511cdfdb5f6b05ff168f43ca..731f3e7bca8564d4873d7af20cc370f6c0877da8 100644 (file)
 
 #include <corecrypto/cc.h>
 
-#define CCRNG_STATE_COMMON                                                          \
+#define CCRNG_STATE_COMMON \
     int (*generate)(struct ccrng_state *rng, size_t outlen, void *out);
 
-/* default state structure. Do not instantiate, ccrng() returns a reference to this structure */
+/*!
+ @type      struct ccrng_state
+ @abstract  Default state structure. Do not instantiate. ccrng() returns a reference to this structure
+ */
 struct ccrng_state {
     CCRNG_STATE_COMMON
 };
 
 /*!
  @function   ccrng
- @abstract   initializes a AES-CTR mode cryptographic random number generator and returns the statically alocated rng object. 
-             Getting a pointer to a ccrng has never been simpler! 
+ @abstract   Initializes an AES-CTR mode cryptographic random number generator and returns the statically-allocated rng object.
+             Getting a pointer to a ccrng has never been simpler!
              Call this function, get an rng object and then pass the object to ccrng_generate() to generate randoms.
              ccrng() may be called more than once. It returns pointer to the same object on all calls.
 
  @result  a cryptographically secure random number generator or NULL if fails
- @discussion 
+
+ @discussion
  - It is significantly faster than using the system /dev/random
  - FIPS Compliant: NIST SP800-80A + FIPS 140-2
  - Seeded from the system entropy.
@@ -42,7 +45,29 @@ struct ccrng_state {
 
 struct ccrng_state *ccrng(int *error);
 
-//call this macro with the rng argument set to output of the call to the ccrng() function
-#define ccrng_generate(rng, outlen, out) ((rng)->generate((struct ccrng_state *)(rng), (outlen), (out)))
+/*!
+ @function   ccrng_generate
+ @abstract   Generate `outlen` bytes of output, stored in `out`, using ccrng_state `rng`.
+
+ @param rng  `struct ccrng_state` representing the state of the RNG.
+ @param outlen  Amount of random bytes to generate.
+ @param out  Pointer to memory where random bytes are stored, of size at least `outlen`.
+
+ @result 0 on success and nonzero on failure.
+ */
+#define ccrng_generate(rng, outlen, out) \
+    ((rng)->generate((struct ccrng_state *)(rng), (outlen), (out)))
+
+/*!
+  @function ccrng_uniform
+  @abstract Generate a random value in @p [0, bound).
+
+  @param rng   The state of the RNG.
+  @param bound The exclusive upper bound on the output.
+  @param rand  A pointer to a single @p uint64_t to store the result.
+
+  @result Returns zero iff the operation is successful.
+ */
+int ccrng_uniform(struct ccrng_state *rng, uint64_t bound, uint64_t *rand);
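+
+/*
+ A minimal usage sketch (error handling abbreviated):
+
+     int status;
+     struct ccrng_state *rng = ccrng(&status);
+     if (rng == NULL) { return status; }
+
+     uint8_t buf[32];
+     status = ccrng_generate(rng, sizeof(buf), buf);   // 32 random bytes
+
+     uint64_t r;
+     status = ccrng_uniform(rng, 6, &r);               // uniform in [0, 6)
+ */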
 
 #endif /* _CORECRYPTO_CCRNG_H_ */
diff --git a/EXTERNAL_HEADERS/corecrypto/ccrng_system.h b/EXTERNAL_HEADERS/corecrypto/ccrng_system.h
deleted file mode 100644 (file)
index a5aab7e..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- *  ccrng_system.h
- *  corecrypto
- *
- *  Created on 12/13/2010
- *
- *  Copyright (c) 2010,2013,2014,2015 Apple Inc. All rights reserved.
- *
- */
-
-#ifndef _CORECRYPTO_CCRNG_SYSTEM_H_
-#define _CORECRYPTO_CCRNG_SYSTEM_H_
-
-#include <corecrypto/ccrng.h>
-
-struct ccrng_system_state {
-    CCRNG_STATE_COMMON
-    int fd;
-};
-
-/*!
- @function   ccrng_system_init - DEPRECATED
- @abstract   Default ccrng.
-    Please transition to ccrng() which is easier to use and with provide the fastest, most secure option
-
- @param  rng   Structure containing the state of the RNG, must remain allocated as
- long as the rng is used.
- @result 0 iff successful
-
- @discussion
-        This RNG require call to "init" AND "done", otherwise it may leak a file descriptor.
- */
-
-// Initialize ccrng
-// Deprecated, if you need a rng, just call the function ccrng()
-int ccrng_system_init(struct ccrng_system_state *rng);
-
-// Close the system RNG
-// Mandatory step to avoid leaking file descriptor
-void ccrng_system_done(struct ccrng_system_state *rng);
-
-#endif /* _CORECRYPTO_CCRNG_SYSTEM_H_ */
index 0f70c3740221198fa8e4012ea584c4ac9bda0997..a2baa932be4e3d380d0f0b91c295d01982c38f6d 100644 (file)
@@ -56,7 +56,7 @@ typedef struct ccrsa_priv_ctx* ccrsa_priv_ctx_t;
 
 /* Declare a fully scheduled rsa key.  Size is the size in bytes each ccn in
    the key.  For example to declare (on the stack or in a struct) a 1021 bit
-   rsa public key named foo use ccrsa_pub_ctx_decl(ccn_sizeof(1021), foo).
+   rsa public key named foo use ccrsa_pub_ctx_decl(ccn_sizeof(1021), foo). 
  */
 #define ccrsa_full_ctx_decl(_size_, _name_)   cc_ctx_decl(struct ccrsa_full_ctx, ccrsa_full_ctx_size(_size_), _name_)
 #define ccrsa_full_ctx_clear(_size_, _name_)  cc_clear(ccrsa_full_ctx_size(_size_), _name_)
@@ -84,7 +84,7 @@ typedef struct ccrsa_priv_ctx* ccrsa_priv_ctx_t;
 #define ccrsa_ctx_private_qinv(FK) ((ccrsa_get_private_ctx_ptr(FK))->pv_ccn + 6 * ccrsa_ctx_private_zp(FK)->n + 2 + ccn_nof_size(sizeof(struct cczp)))
 
 /* rvalue accessors to ccrsa_key fields. */
-CC_CONST CC_INLINE
+CC_INLINE
 ccrsa_priv_ctx_t ccrsa_get_private_ctx_ptr(ccrsa_full_ctx_t fk) {
     ccrsa_priv_ctx_t priv = (ccrsa_priv_ctx_t)(ccrsa_ctx_d(fk)+ccrsa_ctx_n(fk));
     return priv;
@@ -96,16 +96,14 @@ ccrsa_priv_ctx_t ccrsa_get_private_ctx_ptr(ccrsa_full_ctx_t fk) {
  @param      fk      RSA full key
 @result     Returns RSA public key
  */
-CC_CONST CC_INLINE
+CC_INLINE
 ccrsa_pub_ctx_t ccrsa_ctx_public(ccrsa_full_ctx_t fk) {
     return (ccrsa_pub_ctx_t) fk;
 }
 
 /* Return exact key bit size */
-static inline size_t
-ccrsa_pubkeylength(ccrsa_pub_ctx_t pubk) {
-    return cczp_bitlen(ccrsa_ctx_zm(pubk));
-}
+CC_NONNULL_ALL
+size_t ccrsa_pubkeylength(ccrsa_pub_ctx_t pubk);
 
 /* PKCS1 pad_markers */
 #define CCRSA_PKCS1_PAD_SIGN     1
@@ -116,6 +114,33 @@ CC_NONNULL((1, 2, 3))
 int ccrsa_init_pub(ccrsa_pub_ctx_t key, const cc_unit *modulus,
                     const cc_unit *e);
 
+/*!
+ @function   ccrsa_make_priv
+ @abstract   Initialize public and private keys from the exponent e and the primes p and q, given as big-endian byte arrays.
+ @param      full_ctx   Initialized context with full_ctx->zp.n already set to 2*ccn_nof_size(p_mbytes)
+ @param      exp_mbytes Number of bytes in big endian e.
+ @param      exp_in     pointer to big endian exponent e (may have leading 0's).
+ @param      p_mbytes   Number of bytes in big endian p.
+ @param      p_in       Pointer to the rsa p.
+ @param      q_mbytes   Number of bytes in big endian q.
+ @param      q_in       Pointer to the rsa q.
+ @result     0          iff successful.
+ @discussion  full_ctx->zp.n must already be set to 2*ccn_nof_size(p_mbytes), with the expectation that p_mbytes > q_mbytes.
+ e is the public exponent, and exp_mbytes <= 2*p_mbytes.
+ The output is a fully formed RSA context with N=pq, d=e^{-1} mod phi(N), and the appropriate
+ inverses of associated values precomputed to speed computation.
+ */
+
+int ccrsa_make_priv(ccrsa_full_ctx_t full_ctx,
+                    size_t exp_mbytes,
+                    const uint8_t *exp_in,
+                    size_t p_mbytes,
+                    const uint8_t *p_in,
+                    size_t q_mbytes,
+                    const uint8_t *q_in);
+
 /* Initialize key based on modulus and e as big endian byte array
     key->zp.n must already be set. */
 CC_NONNULL((1, 3, 5))
@@ -139,12 +164,15 @@ CC_NONNULL((2, 4, 5))
 int ccrsa_generate_key(size_t nbits, ccrsa_full_ctx_t rsa_ctx,
                        size_t e_size, const void *e, struct ccrng_state *rng) CC_WARN_RESULT;
 
-/* Generate RSA key in conformance with FIPS186-4 standard */
+/* Generate RSA key in conformance with FIPS186-4 standard.
+   The first RNG `rng` will be used to generate p and q.
+   The second RNG `rng_mr` will be used only for primality testing.
+   The distinction matters only for testing; in normal use, pass the same RNG twice. */
 CC_NONNULL((2, 4, 5, 6))
 int
 ccrsa_generate_fips186_key(size_t nbits, ccrsa_full_ctx_t fk,
                            size_t e_size, const void *eBytes,
-                           struct ccrng_state *rng1, struct ccrng_state *rng2) CC_WARN_RESULT;
+                           struct ccrng_state *rng, struct ccrng_state *rng_mr) CC_WARN_RESULT;
 
 /* Construct RSA key from fix input in conformance with FIPS186-4 standard */
 CC_NONNULL((3, 5, 7, 9, 11, 13, 15, 16))
@@ -221,7 +249,7 @@ int ccrsa_verify_pss(ccrsa_pub_ctx_t key,
                         for the output signature
 
  @result     0 iff successful.
-
   @discussion Null OID is a special case, required to support RFC 4346 where the padding
  is based on SHA1+MD5. In general it is not recommended to use a NULL OID,
  except when strictly required for interoperability.
@@ -261,9 +289,9 @@ int ccrsa_verify_pkcs1v15(ccrsa_pub_ctx_t key, const uint8_t *oid,
 /*!
  @function   ccder_encode_rsa_pub_size
  @abstract   Calculate size of public key export format data package.
-
  @param      key        Public key
-
  @result     Returns size required for encoding.
  */
 
@@ -273,7 +301,7 @@ size_t ccder_encode_rsa_pub_size(const ccrsa_pub_ctx_t key);
 /*!
  @function   ccder_encode_rsa_pub
  @abstract   Export a public key.
-
  @param      key        Public key
  @param      der        Beginning of output DER buffer
  @param      der_end    End of output DER buffer
@@ -286,9 +314,9 @@ uint8_t *ccder_encode_rsa_pub(const ccrsa_pub_ctx_t key, uint8_t *der, uint8_t *
 /*!
  @function   ccder_encode_rsa_priv_size
  @abstract   Calculate size of full key exported in PKCS#1 format.
-
  @param      key        Full key
-
  @result     Returns size required for encoding.
  */
 
@@ -298,7 +326,7 @@ size_t ccder_encode_rsa_priv_size(const ccrsa_full_ctx_t key);
 /*!
  @function   ccder_encode_rsa_priv
  @abstract   Export a full key in PKCS#1 format.
-
  @param      key        Full key
  @param      der        Beginning of output DER buffer
  @param      der_end    End of output DER buffer
@@ -311,10 +339,10 @@ uint8_t *ccder_encode_rsa_priv(const ccrsa_full_ctx_t key, const uint8_t *der, u
  @function   ccder_decode_rsa_pub_n
  @abstract   Calculate "n" for a public key imported from a data package.
         PKCS #1 format
-
  @param      der        Beginning of input DER buffer
  @param      der_end    End of input DER buffer
-
  @result the "n" of the RSA key that would result from the import.  This can be used
  to declare the key itself.
  */
@@ -326,11 +354,11 @@ cc_size ccder_decode_rsa_pub_n(const uint8_t *der, const uint8_t *der_end);
  @function   ccder_decode_rsa_pub
  @abstract   Import a public RSA key from a package in public key format.
         PKCS #1 format
-
  @param      key          Public key (n must be set)
  @param      der        Beginning of input DER buffer
  @param      der_end    End of input DER buffer
-
  @result     Key is initialized using the data in the public key message.
  */
 
@@ -369,10 +397,10 @@ const uint8_t *ccder_decode_rsa_pub_x509(const ccrsa_pub_ctx_t key, const uint8_
 /*!
  @function   ccder_decode_rsa_priv_n
  @abstract   Calculate "n" for a private key imported from a data package.
-
  @param      der        Beginning of input DER buffer
  @param      der_end    End of input DER buffer
-
  @result the "n" of the RSA key that would result from the import.  This can be used
  to declare the key itself.
  */
@@ -383,11 +411,11 @@ cc_size ccder_decode_rsa_priv_n(const uint8_t *der, const uint8_t *der_end);
 /*!
  @function   ccder_decode_rsa_priv
  @abstract   Import a private RSA key from a package in PKCS#1 format.
-
  @param      key          Full key (n must be set)
  @param      der        Beginning of input DER buffer
  @param      der_end    End of input DER buffer
-
  @result     Key is initialized using the data in the public key message.
  */
 
@@ -397,13 +425,13 @@ const uint8_t *ccder_decode_rsa_priv(const ccrsa_full_ctx_t key, const uint8_t *
 /*!
  @function   ccrsa_export_pub_size
  @abstract   Calculate size of public key exported data package.
-
  @param      key        Public key
-
  @result     Returns size required for encoding.
  */
 
-CC_CONST CC_INLINE CC_NONNULL((1))
+CC_INLINE CC_NONNULL((1))
 size_t ccrsa_export_pub_size(const ccrsa_pub_ctx_t key) {
     return ccder_encode_rsa_pub_size(key);
 }
@@ -411,7 +439,7 @@ size_t ccrsa_export_pub_size(const ccrsa_pub_ctx_t key) {
 /*!
  @function   ccrsa_export_pub
  @abstract   Export a public key in public key format.
-
  @param      key        Public key
  @param      out_len    Allocated size
  @param      out        Output buffer
@@ -422,15 +450,15 @@ int ccrsa_export_pub(const ccrsa_pub_ctx_t key, size_t out_len, uint8_t *out);
 /*!
  @function   ccrsa_import_pub_n
  @abstract   Calculate "n" for a public key imported from a data package.
-
  @param      inlen        Length of public key package data
  @param      der          pointer to public key package data
-
  @result the "n" of the RSA key that would result from the import.  This can be used
  to declare the key itself.
  */
 
-CC_CONST CC_INLINE CC_NONNULL((2))
+CC_INLINE CC_NONNULL((2))
 cc_size ccrsa_import_pub_n(size_t inlen, const uint8_t *der) {
     cc_size size = ccder_decode_rsa_pub_x509_n(der, der + inlen);
     if(size == 0) {
@@ -442,11 +470,11 @@ cc_size ccrsa_import_pub_n(size_t inlen, const uint8_t *der) {
 /*!
  @function   ccrsa_import_pub
  @abstract   Import a public RSA key from a package in public key format.
-
  @param      key          Public key (n must be set)
  @param      inlen        Length of public key package data
  @param      der           pointer to public key package data
-
  @result     Key is initialized using the data in the public key message.
  */
 
@@ -456,13 +484,13 @@ int ccrsa_import_pub(ccrsa_pub_ctx_t key, size_t inlen, const uint8_t *der);
 /*!
  @function   ccrsa_export_priv_size
  @abstract   Calculate size of full key exported in PKCS#1 format.
-
  @param      key        Full key
-
  @result     Returns size required for encoding.
  */
 
-CC_CONST CC_INLINE CC_NONNULL((1))
+CC_INLINE CC_NONNULL((1))
 size_t ccrsa_export_priv_size(const ccrsa_full_ctx_t key) {
     return ccder_encode_rsa_priv_size(key);
 }
@@ -470,13 +498,13 @@ size_t ccrsa_export_priv_size(const ccrsa_full_ctx_t key) {
 /*!
  @function   ccrsa_export_priv
  @abstract   Export a full key in PKCS#1 format.
-
  @param      key        Full key
  @param      out_len    Allocated size
  @param      out        Output buffer
  */
 
-CC_CONST CC_INLINE CC_NONNULL((1, 3))
+CC_INLINE CC_NONNULL((1, 3))
 int ccrsa_export_priv(const ccrsa_full_ctx_t key, size_t out_len, uint8_t *out) {
     return (ccder_encode_rsa_priv(key, out, out+out_len) != out);
 }
@@ -484,15 +512,15 @@ int ccrsa_export_priv(const ccrsa_full_ctx_t key, size_t out_len, uint8_t *out)
 /*!
  @function   ccrsa_import_priv_n
  @abstract   Calculate size of full key exported in PKCS#1 format.
-
  @param      inlen        Length of PKCS#1 package data
  @param      der           pointer to PKCS#1 package data
-
  @result the "n" of the RSA key that would result from the import.  This can be used
  to declare the key itself.
  */
 
-CC_CONST CC_INLINE CC_NONNULL((2))
+CC_INLINE CC_NONNULL((2))
 cc_size ccrsa_import_priv_n(size_t inlen, const uint8_t *der) {
     return ccder_decode_rsa_priv_n(der, der + inlen);
 }
@@ -500,15 +528,15 @@ cc_size ccrsa_import_priv_n(size_t inlen, const uint8_t *der) {
 /*!
  @function   ccrsa_import_priv
  @abstract   Import a full RSA key from a package in PKCS#1 format.
-
  @param      key          Full key (n must be set)
  @param      inlen        Length of PKCS#1 package data
  @param      der           pointer to PKCS#1 package data
-
  @result     Key is initialized using the data in the PKCS#1 message.
  */
 
-CC_CONST CC_INLINE CC_NONNULL((1, 3))
+CC_INLINE CC_NONNULL((1, 3))
 int ccrsa_import_priv(ccrsa_full_ctx_t key, size_t inlen, const uint8_t *der) {
     return (ccder_decode_rsa_priv(key, der, der+inlen) == NULL);
 }
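
The import helpers above follow a two-pass pattern: a sizing call yields "n", which is used to declare the context, and a second call decodes into it. A sketch under the assumption that ccrsa_ctx_n() is an assignable accessor, as in corecrypto sample code (load_pub_key is a hypothetical name):

    #include <corecrypto/ccrsa.h>

    /* Hypothetical helper using the two-pass import pattern. */
    static int load_pub_key(size_t der_len, const uint8_t *der)
    {
        cc_size n = ccrsa_import_pub_n(der_len, der); /* sizing pass */
        if (n == 0) {
            return -1; /* not a usable key blob */
        }
        ccrsa_pub_ctx_decl(ccn_sizeof_n(n), key); /* stack-allocated context */
        ccrsa_ctx_n(key) = n;                     /* assumed assignable */
        return ccrsa_import_pub(key, der_len, der); /* decode pass */
    }
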
index 3f343401e62934d7b2406ea7d1c9d51a773a7181..4dc3c51942675a81b4a40008d0470e95540835de 100644 (file)
@@ -29,8 +29,8 @@ extern const struct ccdigest_info ccsha1_eay_di;
 extern const struct ccdigest_info ccsha1_vng_intel_SupplementalSSE3_di;
 #endif
 
-#if  CCSHA1_VNG_ARMV7NEON
-extern const struct ccdigest_info ccsha1_vng_armv7neon_di;
+#if  CCSHA1_VNG_ARM
+extern const struct ccdigest_info ccsha1_vng_arm_di;
 #endif
 
 /* TODO: Placeholders */
index 995ef7e268cfbfae5f84cf24bcdafaf893234b1f..e80c70e9e26891cb607006d1b798609f6ecf0265 100644 (file)
@@ -42,9 +42,14 @@ extern const struct ccdigest_info ccsha256_ltc_di;
 extern const struct ccdigest_info ccsha224_vng_intel_SupplementalSSE3_di;
 extern const struct ccdigest_info ccsha256_vng_intel_SupplementalSSE3_di;
 #endif
-#if  CCSHA2_VNG_ARMV7NEON
-extern const struct ccdigest_info ccsha224_vng_armv7neon_di;
-extern const struct ccdigest_info ccsha256_vng_armv7neon_di;
+#if  CCSHA2_VNG_ARM
+extern const struct ccdigest_info ccsha224_vng_arm_di;
+extern const struct ccdigest_info ccsha256_vng_arm_di;
+#if CC_ACCELERATECRYPTO && defined(__arm64__) && CCSHA2_VNG_ARM
+extern const struct ccdigest_info ccsha256_vng_arm64neon_di;
+#endif  // CC_ACCELERATECRYPTO
+extern const struct ccdigest_info ccsha384_vng_arm_di;
+extern const struct ccdigest_info ccsha512_vng_arm_di;
 #endif
 
 /* SHA224 */
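
Each of the ccdigest_info descriptors above plugs into the generic one-shot entry point from <corecrypto/ccdigest.h>. A sketch, assuming the usual ccsha256_di() selection helper from this header:

    #include <corecrypto/ccdigest.h>
    #include <corecrypto/ccsha2.h>

    /* One-shot SHA-256; ccsha256_di() is assumed to pick the best backend
     * (e.g. ccsha256_vng_arm_di) for the running CPU. */
    static void sha256_buf(size_t len, const void *data,
                           uint8_t out[CCSHA256_OUTPUT_SIZE])
    {
        const struct ccdigest_info *di = ccsha256_di();
        ccdigest(di, len, data, out);
    }
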
index d392432dc20171ead1a33b660a195ac4e30d5028..e77f6b863bddfe60e8659aef2f0c38f8a10f5625 100644 (file)
@@ -19,7 +19,7 @@
  definitions.
 
  Declare cczp objects using cczp_decl_n(). It allocates cc_unit arrays of the length returned by
-either cczp_nof_n() or cczp_short_nof_n().
+cczp_nof_n().
 */
 
 struct cczp;
@@ -27,7 +27,7 @@ struct cczp;
 typedef struct cczp *cczp_t;
 typedef const struct cczp *cczp_const_t;
 
-typedef void (*ccmod_func_t)(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s);
+typedef void (*ccmod_func_t)(cc_ws_t ws, cczp_const_t zp, cc_unit *t, const cc_unit *x, const cc_unit *y);
 
 // keep cczp_hd and cczp structures consistent
 // cczp_hd is typecasted to cczp to read EC curve params
@@ -36,7 +36,7 @@ typedef void (*ccmod_func_t)(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_u
 #define __CCZP_HEADER_ELEMENTS_DEFINITIONS(pre) \
     cc_size pre##n;                             \
     cc_unit pre##options;                       \
-    ccmod_func_t pre##mod_prime;
+    ccmod_func_t pre##mulmod_prime;
 
 #define __CCZP_ELEMENTS_DEFINITIONS(pre)    \
     __CCZP_HEADER_ELEMENTS_DEFINITIONS(pre) \
@@ -60,85 +60,44 @@ struct cczp {
 #define cczp_nof_n(_n_) (ccn_nof_size(sizeof(struct cczp)) + 1 + 2 * (_n_))
 
 /* Return the number of units that a struct cczp needs for a prime
-   size of _n_ units.  The _short variant does not have room for CCZP_RECIP,
-   so it can not be used with cczp_mod, cczp_mul, cczp_sqr. It can be used
-   with cczp_add, cczp_sub, cczp_div2, cczp_mod_inv. */
-#define cczp_short_nof_n(_n_) (ccn_nof_size(sizeof(struct cczp)) + (_n_))
-
+   size of _n_ units. */
 #define cczp_decl_n(_n_, _name_) cc_ctx_decl(struct cczp, ccn_sizeof_n(cczp_nof_n(_n_)), _name_)
-#define cczp_short_decl_n(_n_, _name_) \
-    cc_ctx_decl(struct cczp_short, ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_)
-
 #define cczp_clear_n(_n_, _name_) cc_clear(ccn_sizeof_n(cczp_nof_n(_n_)), _name_)
-#define cczp_short_clear_n(_n_, _name_) cc_clear(ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_)
 
 #define CCZP_N(ZP) ((ZP)->n)
-#define CCZP_MOD(ZP) ((ZP)->mod_prime)
-#define CCZP_MOD_PRIME(ZP) CCZP_MOD(ZP)
 #define CCZP_PRIME(ZP) ((ZP)->ccn)
 #define CCZP_RECIP(ZP) ((ZP)->ccn + CCZP_N(ZP))
-#define CCZP_OPS(ZP) ((ZP)->options)
-CC_CONST CC_NONNULL((1)) static inline cc_size cczp_n(cczp_const_t zp)
+CC_NONNULL((1)) CC_INLINE cc_size cczp_n(cczp_const_t zp)
 {
     return zp->n;
 }
 
-CC_CONST CC_NONNULL((1)) static inline cc_unit cczp_options(cczp_const_t zp)
-{
-    return zp->options;
-}
-
-CC_CONST CC_NONNULL((1)) static inline ccmod_func_t cczp_mod_prime(cczp_const_t zp)
-{
-    return zp->mod_prime;
-}
-
-CC_CONST CC_NONNULL((1)) static inline const cc_unit *cczp_prime(cczp_const_t zp)
+CC_NONNULL((1)) CC_INLINE const cc_unit *cczp_prime(cczp_const_t zp)
 {
     return zp->ccn;
 }
 
 /* Return a pointer to the Reciprocal or Montgomery constant of zp, which is
  allocated cczp_n(zp) + 1 units long. */
-CC_CONST CC_NONNULL((1))
-
-    static inline const cc_unit *cczp_recip(cczp_const_t zp)
+CC_NONNULL((1)) CC_INLINE const cc_unit *cczp_recip(cczp_const_t zp)
 {
     return zp->ccn + zp->n;
 }
 
-CC_CONST CC_NONNULL((1)) CC_INLINE size_t cczp_bitlen(cczp_const_t zp)
-{
-    return ccn_bitlen(cczp_n(zp), cczp_prime(zp));
-}
-
 /* Ensure both cczp_mod_prime(zp) and cczp_recip(zp) are valid. cczp_n and
-   cczp_prime must have been previously initialized. */
+   cczp_prime must have been previously initialized. The reciprocal will
+   be computed and set. */
 CC_NONNULL((1))
 int cczp_init(cczp_t zp);
 
-/* Compute r = s2n mod cczp_prime(zp). Will write cczp_n(zp)
- units to r and reads 2 * cczp_n(zp) units units from s2n. If r and s2n are not
- identical they must not overlap.  Before calling this function either
- cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp)
- and CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */
-CC_NONNULL((1, 2, 3)) void cczp_mod(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s2n);
-
-/* Compute r = sn mod cczp_prime(zp), Will write cczp_n(zp)
- units to r and reads sn units units from s. If r and s are not
- identical they must not overlap.  Before calling this function either
- cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp)
- and CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */
-CC_NONNULL((1, 2, 4)) int cczp_modn(cczp_const_t zp, cc_unit *r, cc_size ns, const cc_unit *s);
-
-/* Compute r = x * y mod cczp_prime(zp). Will write cczp_n(zp) units to r
-   and reads cczp_n(zp) units units from both x and y. If r and x are not
-   identical they must not overlap, The same holds for r and y.  Before
-   calling this function either cczp_init(zp) must have been called or both
-   CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must be
-   initialized some other way. */
-CC_NONNULL((1, 2, 3, 4))
-void cczp_mul(cczp_const_t zp, cc_unit *t, const cc_unit *x, const cc_unit *y);
+/*! @function cczp_init_with_recip
+ @abstract Initializes a cczp struct with a given reciprocal.
+
+ @param zp    Pointer to a cczp struct.
+ @param recip Reciprocal for zp's prime.
+ */
+CC_NONNULL((1, 2))
+void cczp_init_with_recip(cczp_t zp, const cc_unit *recip);
 
 /* Compute r = m ^ e mod cczp_prime(zp), using Montgomery ladder.
    - writes cczp_n(zp) units to r
@@ -152,21 +111,6 @@ void cczp_mul(cczp_const_t zp, cc_unit *t, const cc_unit *x, const cc_unit *y);
 CC_NONNULL((1, 2, 3, 4))
 int cczp_power(cczp_const_t zp, cc_unit *r, const cc_unit *m, const cc_unit *e);
 
-/* Compute r = m ^ e mod cczp_prime(zp), using Square Square Multiply Always.
- - writes cczp_n(zp) units to r
- - reads  cczp_n(zp) units units from m and e
- - if r and m are not identical they must not overlap.
- - r and e must not overlap nor be identical.
- - before calling this function either cczp_init(zp) must have been called
- or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must
- be initialized some other way.
-
- Important: This function is intented to be constant time but is more likely
-    to leak information due to memory cache. Only used with randomized input
- */
-CC_NONNULL((1, 2, 3, 4))
-int cczp_power_ssma(cczp_const_t zp, cc_unit *r, const cc_unit *m, const cc_unit *e);
-
 /*!
  @brief cczp_inv(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp).
  @discussion It is a general function and works for any p. It validates the inputs. r and x can
@@ -182,32 +126,4 @@ int cczp_power_ssma(cczp_const_t zp, cc_unit *r, const cc_unit *m, const cc_unit
 CC_NONNULL((1, 2, 3))
 int cczp_inv(cczp_const_t zp, cc_unit *r, const cc_unit *x);
 
-/*!
- @brief cczp_inv_odd(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp) is an odd number.
- @discussion  r and x can overlap.
- @param zp  The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to
- be called before invoking.
- @param x input big integer
- @param r output big integer
- @return  0 if successful
- */
-CC_NONNULL((1, 2, 3)) int cczp_inv_odd(cczp_const_t zp, cc_unit *r, const cc_unit *x);
-
-/*!
- @brief cczp_inv_field(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp) is a prime
- number number.
- @discussion r and x must NOT overlap. The excution time of the function is independent to the value
- of the input x. It works only if p is a field. That is, when p is a prime. It supports Montgomery
- and non-Montgomery form of zp. It leaks the value of the prime and should only be used be used for
- public (not secret) primes (ex. Elliptic Curves)
-
- @param zp  The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to
- be called before invoking cczp_inv_field().
- @param x input big unteger
- @param r output big integer
- @return  0 if inverse exists and correctly computed.
- */
-CC_NONNULL((1, 2, 3))
-int cczp_inv_field(cczp_const_t zp, cc_unit *r, const cc_unit *x);
-
 #endif /* _CORECRYPTO_CCZP_H_ */
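
With the _short variants gone, every cczp is declared via cczp_decl_n() and run through cczp_init() before use. A sketch of that lifecycle, assuming CCZP_N and CCZP_PRIME are assignable lvalues as their definitions above suggest:

    #include <corecrypto/ccn.h>
    #include <corecrypto/cczp.h>

    /* Sketch: compute r = x^-1 mod p for an n-unit prime p. */
    static int invert_mod_p(cc_size n, const cc_unit *p,
                            const cc_unit *x, cc_unit *r)
    {
        cczp_decl_n(n, zp);            /* room for the prime plus reciprocal */
        CCZP_N(zp) = n;
        ccn_set(n, CCZP_PRIME(zp), p); /* copy the prime into the structure */
        if (cczp_init(zp) != 0) {      /* computes and caches the reciprocal */
            return -1;
        }
        int status = cczp_inv(zp, r, x); /* 0 iff the inverse was computed */
        cczp_clear_n(n, zp);             /* scrub the temporary */
        return status;
    }
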
index ecaf2efed413dc2c5166eb01e880724009285949..9cd1d4e4b6055d70b553848fa4bba9c26c8a80cf 100644 (file)
@@ -40,7 +40,7 @@
  * individual preprocessor macros in this header that declare new behavior as
  * required.
  */
-#define IMG4_API_VERSION (20181106u)
+#define IMG4_API_VERSION (20190125u)
 
 #if !defined(KERNEL) && !IMG4_PROJECT_BUILD
 #define IMG4_API_AVAILABLE_20180112 \
 #define IMG4_API_AVAILABLE_20181106 \
                __API_UNAVAILABLE(macos) \
                API_AVAILABLE(ios(12.2), tvos(12.2), watchos(5.2))
-#define IMG4_API_AVAILABLE_20181106
+#define IMG4_API_AVAILABLE_20190125 \
+               API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0))
 #else
 #define IMG4_API_AVAILABLE_20180112
 #define IMG4_API_AVAILABLE_20181004
 #define IMG4_API_AVAILABLE_20181106
+#define IMG4_API_AVAILABLE_20190125
+#endif // !defined(KERNEL) && !IMG4_PROJECT_BUILD
+
+#if !defined(OS_CLOSED_ENUM)
+#define OS_CLOSED_ENUM(_name, _type, ...) \
+               OS_ENUM(_name, _type, ## __VA_ARGS__)
+#endif
+
+#if !defined(OS_CLOSED_OPTIONS)
+#define OS_CLOSED_OPTIONS(_name, _type, ...) \
+               OS_ENUM(_name, _type, ## __VA_ARGS__)
 #endif
 
 /*!
index 5f5ba1d0248bb4215470c9b2a0906fa9d6771a91..6942de840528b08af3a6d83de0e570dd48bd9778 100644 (file)
@@ -9,6 +9,10 @@
 #error "Please #include <img4/img4.h> instead of this file directly"
 #endif // __IMG4_INDIRECT
 
+#if IMG4_TAPI
+#include "tapi.h"
+#endif
+
 /*!
  * @typedef img4_environment_t
  * An opaque type describing an Image4 environment.
@@ -21,7 +25,7 @@ typedef struct _img4_environment img4_environment_t;
  * resolve the environment. This is the environment against which manifests are
  * personalized.
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20180112
 OS_EXPORT
 const struct _img4_environment _img4_environment_platform;
@@ -37,7 +41,7 @@ const struct _img4_environment _img4_environment_platform;
  * environment should be used as a fallback when validation against the platform
  * fails, and the caller is handling a loadable trust cache.
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20181004
 OS_EXPORT
 const struct _img4_environment _img4_environment_trust_cache;
index c3faf5a28351c5c47e35a3c4e74d50cb04a35182..cb68c4645e41c364f826df071a5941944d6d2f81 100644 (file)
 #include <stdbool.h>
 #include <sys/cdefs.h>
 
+#if KERNEL
+#if !defined(OS_CLOSED_ENUM)
+#define OS_CLOSED_ENUM(...) OS_ENUM(__VA_ARGS__)
+#endif
+
+#if !defined(OS_OPTIONS)
+#define OS_OPTIONS(...) OS_ENUM(__VA_ARGS__)
+#endif
+
+#if !defined(OS_CLOSED_OPTIONS)
+#define OS_CLOSED_OPTIONS(...) OS_ENUM(__VA_ARGS__)
+#endif
+#endif
+
 #define __IMG4_INDIRECT 1
 
 /*
- * This header is used in the pmap layer in xnu, which is in osfmk, which does
- * not have access to most of the BSD headers. (But for some reason it does have
- * access to sys/cdefs.h.) The only thing we need from that header is the
- * errno_t typedef though, so if we can't get to it, then just typedef it
- * ourselves.
+ * When used from the pmap layer, this header pulls in the types from libsa,
+ * which conflict with the BSD sys/types.h header that we need to pull in. But
+ * we only need it for the errno_t typedef and the vnode_t typedef. So when
+ * building MACH_KERNEL_PRIVATE, we do two things:
+ *
+ *     1. Explicitly pull in <sys/_types/_errno_t.h>, so we get errno_t and
+ *        nothing else (no transitive #include's)
+ *     2. #define _SYS_TYPES_H_ before #include-ing <sys/kernel_types.h> so that
+ *        we don't get the transitive #include of <sys/types.h> but we still get
+ *        the definitions we need
  */
 #if MACH_KERNEL_PRIVATE
-typedef int errno_t;
+#define _SYS_TYPES_H_ 1
+#include <sys/kernel_types.h>
+#include <sys/_types/_errno_t.h>
 #else
+#include <sys/kernel_types.h>
 #include <sys/types.h>
 #endif
 
@@ -238,7 +260,7 @@ typedef void (*img4_destructor_t)(
  * It is illegal to use a manifest which possesses a CHMH tag as a first-stage
  * manifest.
  */
-OS_ENUM(img4_flags, uint64_t,
+OS_CLOSED_OPTIONS(img4_flags, uint64_t,
        I4F_INIT = 0,
        I4F_TRUST_MANIFEST = (1 << 0),
        I4F_FORCE_MIXNMATCH = (1 << 1),
@@ -264,12 +286,13 @@ typedef struct _img4 {
 #endif
 } img4_t;
 
-typedef char _img4_payload_opaque_data_64[496];
+typedef char _img4_payload_opaque_data_64[504];
 
-#if __ARM_ARCH_7S__ || __i386__
-typedef char _img4_payload_opaque_data_32[324];
-#else
+#if __ARM_ARCH_7A__ || __ARM_ARCH_7S__ || __ARM_ARCH_7K__ || \
+               __ARM64_ARCH_8_32__ || __i386__
 typedef char _img4_payload_opaque_data_32[328];
+#else
+typedef char _img4_payload_opaque_data_32[332];
 #endif
 
 /*!
@@ -330,7 +353,7 @@ typedef struct _img4_payload {
  * The bytes given to this routine must represent an Image4 manifest. They may
  * optionally also represent an Image4 payload.
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20180112
 OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3
 errno_t
@@ -361,7 +384,7 @@ img4_init(img4_t *i4, img4_flags_t flags, const uint8_t *bytes, size_t len,
  * though there is no nonce in the environment. Therefore, any manifests which
  * have a BNCH property constraint will fail to validate.
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20180112
 OS_EXPORT OS_NONNULL1 OS_NONNULL2
 void
@@ -384,7 +407,7 @@ img4_set_nonce(img4_t *i4, const void *bytes, size_t len);
  * @discussion
  * See discussion for {@link img4_set_nonce}.
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20181106
 OS_EXPORT OS_NONNULL1 OS_NONNULL2
 void
@@ -446,7 +469,7 @@ img4_set_nonce_domain(img4_t *i4, const img4_nonce_domain_t *nd);
  * If any one of these validation checks fails, the payload is considered
  * untrustworthy and is not returned.
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20180112
 OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3 OS_NONNULL4 OS_NONNULL5
 errno_t
@@ -475,10 +498,18 @@ img4_get_trusted_payload(img4_t *i4, img4_tag_t tag,
  * A pointer to the storage where the pointer to the payload buffer will be
  * written on success.
  *
+ * If the payload object was initialized with
+ * {@link img4_payload_init_with_vnode_4xnu}, this parameter should be NULL, as
+ * there will be no in-memory buffer to return.
+ *
  * @param len
  * A pointer to the storage where the length of the payload buffer will be
  * written on success.
  *
+ * If the payload object was initialized with
+ * {@link img4_payload_init_with_vnode_4xnu}, this parameter should be NULL, as
+ * there will be no in-memory buffer to return.
+ *
  * @result
  * Upon success, zero is returned. The implementation may also return one of the
  * following error codes directly:
@@ -494,6 +525,18 @@ img4_get_trusted_payload(img4_t *i4, img4_tag_t tag,
  *     [EILSEQ]     The payload for the given tag does not match its description
  *                  in the manifest
  *     [EIO]        The payload could not be fetched
+ *     [EIO]        The payload was initialized with
+ *                  {@link img4_payload_init_with_vnode_4xnu}, and reading from
+ *                  the vnode stalled repeatedly beyond the implementation's
+ *                  tolerance
+ *
+ * If the payload was initialized with
+ * {@link img4_payload_init_with_vnode_4xnu}, any error returned by
+ * {@link vnode_getattr} or {@link vn_rdwr} may be returned.
+ *
+ * If the payload was initialized with
+ * {@link img4_payload_init_with_fd_4MSM}, any error returned by stat(2),
+ * read(2), or malloc(3) may be returned.
  *
  * Otherwise, an error from the underlying Image4 implementation will be
  * returned.
@@ -502,10 +545,9 @@ img4_get_trusted_payload(img4_t *i4, img4_tag_t tag,
  * This routine performs the same validation steps as
  * {@link img4_get_trusted_payload}.
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20180112
-OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL2 OS_NONNULL3 OS_NONNULL4
-OS_NONNULL5
+OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL2 OS_NONNULL3
 errno_t
 img4_get_trusted_external_payload(img4_t *i4, img4_payload_t *payload,
                const img4_environment_t *env, const uint8_t **bytes, size_t *len);
@@ -525,7 +567,7 @@ img4_get_trusted_external_payload(img4_t *i4, img4_payload_t *payload,
  * The destructor passed to {@link img4_init} is called as a result of this
  * routine, if any was set.
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20180112
 OS_EXPORT OS_NONNULL1
 void
index c9571f704bcd5422d4f1536ce97d982d7598b713..93c10c0c93e9e2c6b2b4f2c0b729658d651705c8 100644 (file)
@@ -1,8 +1,8 @@
 /*!
  * @header
  * Provides an interface for managing nonces to govern the lifetime of a
- * personalization performed with Tatsu. A nonce managed by this interface may
- * be used in a Tatsu signing request as the value for the BNCH tag.
+ * personalization performed with TSS. A nonce managed by this interface may
+ * be used in a TSS signing request as the value for the BNCH tag.
  *
  * These interfaces require the caller to possess the
  *
 #error "Please #include <img4/img4.h> instead of this file directly"
 #endif // __IMG4_INDIRECT
 
+#if IMG4_TAPI
+#include "tapi.h"
+#endif
+
 /*!
  * @typedef img4_nonce_domain_t
  * An opaque type describing a nonce domain.
@@ -116,7 +120,7 @@ typedef struct _img4_nonce {
  *
  * entitlement.
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20181106
 OS_EXPORT
 const struct _img4_nonce_domain _img4_nonce_domain_trust_cache;
@@ -125,6 +129,42 @@ const struct _img4_nonce_domain _img4_nonce_domain_trust_cache;
 #define IMG4_NONCE_DOMAIN_TRUST_CACHE (img4if->i4if_v1.nonce_domain_trust_cache)
 #endif
 
+/*!
+ * @const IMG4_NONCE_DOMAIN_PDI
+ * The nonce domain governing disk image personalizations. Use of this domain
+ * requires the
+ *
+ *     com.apple.private.img4.nonce.pdi
+ *
+ * entitlement. The nonce for this domain is regenerated once every boot.
+ */
+#if !XNU_KERNEL_PRIVATE
+IMG4_API_AVAILABLE_20181106
+OS_EXPORT
+const struct _img4_nonce_domain _img4_nonce_domain_pdi;
+#define IMG4_NONCE_DOMAIN_PDI (&_img4_nonce_domain_pdi)
+#else
+#define IMG4_NONCE_DOMAIN_PDI (img4if->i4if_v3.nonce_domain_pdi)
+#endif
+
+/*!
+ * @const IMG4_NONCE_DOMAIN_CRYPTEX
+ * The nonce domain governing cryptex personalizations. Use of this domain
+ * requires the
+ *
+ *     com.apple.private.img4.nonce.cryptex
+ *
+ * entitlement.
+ */
+#if !XNU_KERNEL_PRIVATE
+IMG4_API_AVAILABLE_20181106
+OS_EXPORT
+const struct _img4_nonce_domain _img4_nonce_domain_cryptex;
+#define IMG4_NONCE_DOMAIN_CRYPTEX (&_img4_nonce_domain_cryptex)
+#else
+#define IMG4_NONCE_DOMAIN_CRYPTEX (img4if->i4if_v1.nonce_domain_cryptex)
+#endif
+
 /*!
  * @function img4_nonce_domain_copy_nonce
  * Copies the current value of the nonce in the given domain.
@@ -146,7 +186,7 @@ const struct _img4_nonce_domain _img4_nonce_domain_trust_cache;
  *     [EPERM]      The caller lacked the entitlement necessary to read the
  *                  given nonce
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20181106
 OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL2
 errno_t
@@ -172,7 +212,7 @@ img4_nonce_domain_copy_nonce(const img4_nonce_domain_t *nd, img4_nonce_t *n);
  *     [EPERM]      The caller lacked the entitlement necessary to roll the
  *                  given nonce
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20181106
 OS_EXPORT OS_NONNULL1
 errno_t
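
Reading one of the nonce domains above reduces to a single call. A hypothetical sketch; the entitlement is enforced at runtime, so EPERM is the interesting failure mode:

    #include <img4/img4.h>

    /* Hypothetical: copy the per-boot PDI nonce. Requires the
     * com.apple.private.img4.nonce.pdi entitlement at runtime. */
    static errno_t read_pdi_nonce(img4_nonce_t *n)
    {
        return img4_nonce_domain_copy_nonce(IMG4_NONCE_DOMAIN_PDI, n);
    }
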
index 4a1d119d3e00cbd874de329bbe80f71158c8cb8e..8196742f00cd2b7b7a7477f8388d1c9e4bb3e7e5 100644 (file)
 #error "Please #include <img4/img4.h> instead of this file directly"
 #endif // __IMG4_INDIRECT
 
+#if IMG4_TAPI
+#include "tapi.h"
+#endif
+
 /*!
  * @typedef img4_payload_flags_t
  * Flags modifying the behavior of an Image4 payload object.
@@ -32,7 +36,7 @@
  * describe portable executable files which must be fed directly to the firmware
  * and cannot tolerate being wrapped in an intermediary format.
  */
-OS_ENUM(img4_payload_flags, uint64_t,
+OS_CLOSED_OPTIONS(img4_payload_flags, uint64_t,
        I4PLF_INIT = 0,
        I4PLF_UNWRAPPED = (1 << 0),
 );
@@ -67,7 +71,7 @@ OS_ENUM(img4_payload_flags, uint64_t,
  *     [EFTYPE]     The data does not contain an Image4 payload
  *     [ENOENT]     The bytes do not contain a payload for the specified tag
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20180112
 OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL4
 errno_t
@@ -78,6 +82,92 @@ img4_payload_init(img4_payload_t *i4p, img4_tag_t tag,
 #define img4_payload_init(...) img4if->i4if_payload_init(__VA_ARGS__)
 #endif
 
+/*!
+ * @function img4_payload_init_with_vnode_4xnu
+ * Initializes an Image4 payload object from a vnode.
+ *
+ * @param i4p
+ * A pointer to the payload object to initialize.
+ *
+ * @param tag
+ * The expected tag for the payload.
+ *
+ * @param vn
+ * The vnode from which to initialize the payload.
+ *
+ * @param flags
+ * Flags modifying the behavior of the payload object.
+ *
+ * @result
+ * Upon success, zero is returned. Otherwise, one of the following error codes:
+ *
+ *     [ENOENT]     The vnode is either dead or in the process of being
+ *                  recycled
+ *     [EIO]        Reading from the vnode stalled repeatedly beyond the
+ *                  implementation's tolerance
+ *
+ * Additionally, the implementation may return any error that vnode_ref() may
+ * return.
+ *
+ * @discussion
+ * Verification of a vnode is performed by reading in chunks of data, updating
+ * an ongoing hash operation with that data, and then discarding it. Therefore,
+ * payload objects created in this manner can only guarantee their validity at
+ * the time the check was performed since the vnode's contents are not kept in
+ * memory and may be tampered with after validation has been performed.
+ *
+ * Additionally, this operation requires the payload to be unwrapped, as it does
+ * not parse or recognize any Image4 payload wrapper. Payloads created with this
+ * interface are therefore implicitly created with the {@link I4PLF_UNWRAPPED}
+ * flag.
+ */
+
+#if KERNEL
+#if !XNU_KERNEL_PRIVATE
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3
+errno_t
+img4_payload_init_with_vnode_4xnu(img4_payload_t *i4p, img4_tag_t tag,
+               vnode_t vn, img4_payload_flags_t flags);
+#else
+#define img4_payload_init_with_vnode_4xnu(...) \
+               (img4if->i4if_v2.payload_init_with_vnode_4xnu(__VA_ARGS__))
+#endif // !XNU_KERNEL_PRIVATE
+#endif // KERNEL
+
+/*!
+ * @function img4_payload_init_with_fd_4MSM
+ * Initializes an Image4 payload object from a file descriptor.
+ *
+ * @param i4p
+ * A pointer to the payload object to initialize.
+ *
+ * @param tag
+ * The expected tag for the payload.
+ *
+ * @param fd
+ * The file descriptor from which to initialize the payload.
+ *
+ * @param flags
+ * Flags modifying the behavior of the payload object.
+ *
+ * @result
+ * Upon success, zero is returned. Otherwise, the implementation may return any
+ * errno that is set by the dup(2) system call.
+ *
+ * @discussion
+ * This interface is a userspace equivalent to
+ * {@link img4_payload_init_with_vnode_4xnu}, and all the same caveats apply.
+ */
+
+#if !KERNEL
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_WARN_RESULT OS_NONNULL1
+errno_t
+img4_payload_init_with_fd_4MSM(img4_payload_t *i4p, img4_tag_t tag,
+               int fd, img4_payload_flags_t flags);
+#endif // KERNEL
+
 /*!
  * @function img4_payload_destroy
  * Disposes of the resources associated with the payload object.
@@ -90,7 +180,7 @@ img4_payload_init(img4_payload_t *i4p, img4_tag_t tag,
  * only the associated resources. This routine will cause the destructor given
  * in {@link img4_payload_init} to be called, if any.
  */
-#if !MACH_KERNEL_PRIVATE
+#if !XNU_KERNEL_PRIVATE
 IMG4_API_AVAILABLE_20180112
 OS_EXPORT OS_NONNULL1
 void
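
For userspace callers, the fd-based initializer pairs with the destroy routine documented above. A hypothetical sketch; check_payload and its error handling are illustrative only:

    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <img4/img4.h>

    /* Hypothetical: probe whether a file can back an Image4 payload object.
     * The object is assumed to dup(2) the descriptor, per the errno note. */
    static errno_t check_payload(const char *path, img4_tag_t tag)
    {
        int fd = open(path, O_RDONLY);
        if (fd < 0) {
            return errno;
        }
        img4_payload_t pl;
        errno_t rc = img4_payload_init_with_fd_4MSM(&pl, tag, fd, I4PLF_INIT);
        if (rc == 0) {
            img4_payload_destroy(&pl);
        }
        close(fd);
        return rc;
    }
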
index d6bc7e0cdf069726da9ae7948244f3f78ff59da5..64f6bac73d79ac48cb99111221058942e5494e78 100644 (file)
@@ -210,6 +210,22 @@ struct mach_header_64 {
 #define MH_APP_EXTENSION_SAFE 0x02000000 /* The code was linked for use in an
                                            application extension. */
 
+#define        MH_NLIST_OUTOFSYNC_WITH_DYLDINFO 0x04000000 /* The external symbols
+                                          listed in the nlist symbol table do
+                                          not include all the symbols listed in
+                                          the dyld info. */
+
+#define        MH_SIM_SUPPORT 0x08000000       /* Allow LC_MIN_VERSION_MACOS and
+                                          LC_BUILD_VERSION load commands with
+                                          the platforms macOS, iOSMac,
+                                          iOSSimulator, tvOSSimulator and
+                                          watchOSSimulator. */
+
+#define MH_DYLIB_IN_CACHE 0x80000000   /* Only for use on dylibs. When this bit
+                                          is set, the dylib is part of the dyld
+                                          shared cache, rather than loose in
+                                          the filesystem. */
+
 /*
  * The load commands directly follow the mach_header.  The total size of all
  * of the commands is given by the sizeofcmds field in the mach_header.  All
@@ -304,6 +320,8 @@ struct load_command {
 #define LC_VERSION_MIN_WATCHOS 0x30 /* build for Watch min OS version */
 #define LC_NOTE 0x31 /* arbitrary data included within a Mach-O file */
 #define LC_BUILD_VERSION 0x32 /* build for platform min OS version */
+#define LC_DYLD_EXPORTS_TRIE (0x33 | LC_REQ_DYLD) /* used with linkedit_data_command, payload is trie */
+#define LC_DYLD_CHAINED_FIXUPS (0x34 | LC_REQ_DYLD) /* used with linkedit_data_command */
 
 /*
  * A variable length string in a load command is represented by an lc_str
@@ -381,6 +399,9 @@ struct segment_command_64 { /* for 64-bit architectures */
                                       first page of the segment is not
                                       protected.  All other pages of the
                                       segment are protected. */
+#define SG_READ_ONLY    0x10 /* This segment is made read-only after fixups */
+
+
 
 /*
  * A segment is made up of zero or more sections.  Non-MH_OBJECT files have
@@ -506,6 +527,8 @@ struct section_64 { /* for 64-bit architectures */
 #define S_THREAD_LOCAL_INIT_FUNCTION_POINTERS    0x15  /* functions to call
                                                          to initialize TLV
                                                          values */
+#define S_INIT_FUNC_OFFSETS                      0x16  /* 32-bit offsets to
+                                                         initializers */
 
 /*
  * Constants for the section attributes part of the flags field of a section
@@ -767,14 +790,14 @@ struct dylinker_command {
  * Thread commands contain machine-specific data structures suitable for
  * use in the thread state primitives.  The machine specific data structures
  * follow the struct thread_command as follows.
- * Each flavor of machine specific data structure is preceded by an unsigned
- * long constant for the flavor of that data structure, an uint32_t
- * that is the count of longs of the size of the state data structure and then
+ * Each flavor of machine specific data structure is preceded by a uint32_t
+ * constant for the flavor of that data structure, a uint32_t that is the
+ * count of uint32_t's in the state data structure, and then
  * the state data structure follows.  This triple may be repeated for many
  * flavors.  The constants for the flavors, counts and state data structure
  * definitions are expected to be in the header file <machine/thread_status.h>.
  * These machine specific data structures sizes must be multiples of
- * 4 bytes  The cmdsize reflects the total size of the thread_command
+ * 4 bytes.  The cmdsize reflects the total size of the thread_command
  * and all of the sizes of the constants for the flavors, counts and state
  * data structures.
  *
@@ -788,7 +811,7 @@ struct thread_command {
        uint32_t        cmd;            /* LC_THREAD or  LC_UNIXTHREAD */
        uint32_t        cmdsize;        /* total size of this command */
        /* uint32_t flavor                 flavor of thread state */
-       /* uint32_t count                  count of longs in thread state */
+       /* uint32_t count                  count of uint32_t's in thread state */
        /* struct XXX_thread_state state   thread state for this flavor */
        /* ... */
 };
@@ -1164,8 +1187,10 @@ struct rpath_command {
 struct linkedit_data_command {
     uint32_t   cmd;            /* LC_CODE_SIGNATURE, LC_SEGMENT_SPLIT_INFO,
                                    LC_FUNCTION_STARTS, LC_DATA_IN_CODE,
-                                  LC_DYLIB_CODE_SIGN_DRS or
-                                  LC_LINKER_OPTIMIZATION_HINT. */
+                                  LC_DYLIB_CODE_SIGN_DRS,
+                                  LC_LINKER_OPTIMIZATION_HINT,
+                                  LC_DYLD_EXPORTS_TRIE, or
+                                  LC_DYLD_CHAINED_FIXUPS. */
     uint32_t   cmdsize;        /* sizeof(struct linkedit_data_command) */
     uint32_t   dataoff;        /* file offset of data in __LINKEDIT segment */
     uint32_t   datasize;       /* file size of data in __LINKEDIT segment  */
@@ -1238,6 +1263,12 @@ struct build_tool_version {
 #define PLATFORM_IOS 2
 #define PLATFORM_TVOS 3
 #define PLATFORM_WATCHOS 4
+#define PLATFORM_BRIDGEOS 5
+#define PLATFORM_IOSMAC 6
+#define PLATFORM_IOSSIMULATOR 7
+#define PLATFORM_TVOSSIMULATOR 8
+#define PLATFORM_WATCHOSSIMULATOR 9
+#define PLATFORM_DRIVERKIT 10
 
 /* Known values for the tool field above. */
 #define TOOL_CLANG 1
@@ -1385,6 +1416,7 @@ struct dyld_info_command {
 #define BIND_SPECIAL_DYLIB_SELF                                         0
 #define BIND_SPECIAL_DYLIB_MAIN_EXECUTABLE                     -1
 #define BIND_SPECIAL_DYLIB_FLAT_LOOKUP                         -2
+#define BIND_SPECIAL_DYLIB_WEAK_LOOKUP                         -3
 
 #define BIND_SYMBOL_FLAGS_WEAK_IMPORT                          0x1
 #define BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION                  0x8
@@ -1404,6 +1436,9 @@ struct dyld_info_command {
 #define BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB                      0xA0
 #define BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED                        0xB0
 #define BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB           0xC0
+#define        BIND_OPCODE_THREADED                                    0xD0
+#define        BIND_SUBOPCODE_THREADED_SET_BIND_ORDINAL_TABLE_SIZE_ULEB 0x00
+#define        BIND_SUBOPCODE_THREADED_APPLY                            0x01
 
 
 /*
@@ -1413,6 +1448,7 @@ struct dyld_info_command {
 #define EXPORT_SYMBOL_FLAGS_KIND_MASK                          0x03
 #define EXPORT_SYMBOL_FLAGS_KIND_REGULAR                       0x00
 #define EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL                  0x01
+#define EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE                      0x02
 #define EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION                    0x04
 #define EXPORT_SYMBOL_FLAGS_REEXPORT                           0x08
 #define EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER                  0x10
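
The new load commands and PLATFORM_* constants are consumed with the standard load-command walk. A sketch for a 64-bit image already mapped at mh; find_build_version is a hypothetical helper:

    #include <stddef.h>
    #include <stdint.h>
    #include <mach-o/loader.h>

    /* Walk the load commands that follow the mach_header_64 and return the
     * LC_BUILD_VERSION command, whose platform field holds a PLATFORM_* value. */
    static const struct build_version_command *
    find_build_version(const struct mach_header_64 *mh)
    {
        const struct load_command *lc =
            (const struct load_command *)(const void *)(mh + 1);
        for (uint32_t i = 0; i < mh->ncmds; i++) {
            if (lc->cmd == LC_BUILD_VERSION) {
                return (const struct build_version_command *)lc;
            }
            lc = (const struct load_command *)((const char *)lc + lc->cmdsize);
        }
        return NULL;
    }
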
index b6db0fb14acf9f1b78a3ad3ba1abbaab61c1e8e1..e7d7dfd9b0c110f6c127f4f32f4b60213807d097 100644 (file)
@@ -152,6 +152,17 @@ typedef uintptr_t ptrauth_generic_signature_t;
 #define ptrauth_blend_discriminator(__pointer, __integer) \
   __builtin_ptrauth_blend_discriminator(__pointer, __integer)
 
+/* Compute the 16-bit integer discriminator of the given type.
+
+   The argument must be a type.
+*/
+#if __has_feature(ptrauth_type_discriminator)
+#define ptrauth_type_discriminator(__type) \
+  __builtin_ptrauth_type_discriminator(__type)
+#else
+#define ptrauth_type_discriminator(__type) ((uintptr_t)0)
+#endif
+
 /* Add a signature to the given pointer value using a specific key,
    using the given extra data as a salt to the signing process.
 
@@ -308,6 +319,7 @@ typedef uintptr_t ptrauth_generic_signature_t;
 
 #define ptrauth_strip(__value, __key) __value
 #define ptrauth_blend_discriminator(__pointer, __integer) ((uintptr_t)0)
+#define ptrauth_type_discriminator(__type) ((uintptr_t)0)
 #define ptrauth_sign_constant(__value, __key, __data) __value
 #define ptrauth_sign_unauthenticated(__value, __key, __data) __value
 #define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, __new_data) __value
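
The new type discriminator is meant to be blended with a storage address so signatures are diversified by both. A sketch; the handler_fn type, the sign_handler helper, and the ptrauth_key_function_pointer key name are assumptions, not taken from this diff:

    #include <ptrauth.h>
    #include <stdint.h>

    typedef void (*handler_fn)(int);

    /* Hypothetical: sign a function pointer with address+type diversity.
     * On targets without pointer authentication the macros above fall back
     * to no-ops, so this compiles everywhere. */
    static handler_fn sign_handler(handler_fn fn, handler_fn *slot)
    {
        uintptr_t disc = ptrauth_blend_discriminator(
            slot, ptrauth_type_discriminator(handler_fn));
        return ptrauth_sign_unauthenticated(fn, ptrauth_key_function_pointer,
                                            disc);
    }
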
index 261c653af58be32440ae358dd2e40aef0951e42b..2ce9fa540b2574fdaa05c1c18f6aff34abde661c 100644 (file)
@@ -44,16 +44,16 @@ extern "C" {
 
 /* 7.17.1 Introduction */
 
-#define ATOMIC_BOOL_LOCK_FREE       __GCC_ATOMIC_BOOL_LOCK_FREE
-#define ATOMIC_CHAR_LOCK_FREE       __GCC_ATOMIC_CHAR_LOCK_FREE
-#define ATOMIC_CHAR16_T_LOCK_FREE   __GCC_ATOMIC_CHAR16_T_LOCK_FREE
-#define ATOMIC_CHAR32_T_LOCK_FREE   __GCC_ATOMIC_CHAR32_T_LOCK_FREE
-#define ATOMIC_WCHAR_T_LOCK_FREE    __GCC_ATOMIC_WCHAR_T_LOCK_FREE
-#define ATOMIC_SHORT_T_LOCK_FREE    __GCC_ATOMIC_SHORT_T_LOCK_FREE
-#define ATOMIC_INT_T_LOCK_FREE      __GCC_ATOMIC_INT_T_LOCK_FREE
-#define ATOMIC_LONG_T_LOCK_FREE     __GCC_ATOMIC_LONG_T_LOCK_FREE
-#define ATOMIC_LLONG_T_LOCK_FREE    __GCC_ATOMIC_LLONG_T_LOCK_FREE
-#define ATOMIC_POINTER_T_LOCK_FREE  __GCC_ATOMIC_POINTER_T_LOCK_FREE
+#define ATOMIC_BOOL_LOCK_FREE       __CLANG_ATOMIC_BOOL_LOCK_FREE
+#define ATOMIC_CHAR_LOCK_FREE       __CLANG_ATOMIC_CHAR_LOCK_FREE
+#define ATOMIC_CHAR16_T_LOCK_FREE   __CLANG_ATOMIC_CHAR16_T_LOCK_FREE
+#define ATOMIC_CHAR32_T_LOCK_FREE   __CLANG_ATOMIC_CHAR32_T_LOCK_FREE
+#define ATOMIC_WCHAR_T_LOCK_FREE    __CLANG_ATOMIC_WCHAR_T_LOCK_FREE
+#define ATOMIC_SHORT_LOCK_FREE      __CLANG_ATOMIC_SHORT_LOCK_FREE
+#define ATOMIC_INT_LOCK_FREE        __CLANG_ATOMIC_INT_LOCK_FREE
+#define ATOMIC_LONG_LOCK_FREE       __CLANG_ATOMIC_LONG_LOCK_FREE
+#define ATOMIC_LLONG_LOCK_FREE      __CLANG_ATOMIC_LLONG_LOCK_FREE
+#define ATOMIC_POINTER_LOCK_FREE    __CLANG_ATOMIC_POINTER_LOCK_FREE
 
 /* 7.17.2 Initialization */
 
index d9bb51e83205c050ff17bd57ca69bc679369bfa5..9678b998fa4d9828b7464b522f4a7ab8f8dbf626 100644 (file)
 #ifndef __STDDEF_H
 #define __STDDEF_H
 
+#undef NULL
+#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#else
+#undef __null  // VC++ hack.
+#define NULL __null
+#endif
+#else
+#define NULL ((void*)0)
+#endif
+
 #ifndef _PTRDIFF_T
 #define _PTRDIFF_T
-typedef __typeof__(((int*)0)-((int*)0)) ptrdiff_t;
+typedef __typeof__(((int*)NULL)-((int*)NULL)) ptrdiff_t;
 #endif
 #ifndef _SIZE_T
 #define _SIZE_T
@@ -41,14 +53,6 @@ typedef __WCHAR_TYPE__ wchar_t;
 #endif
 #endif
 
-#undef NULL
-#ifdef __cplusplus
-#undef __null  // VC++ hack.
-#define NULL __null
-#else
-#define NULL ((void*)0)
-#endif
-
 #ifndef offsetof
 #define offsetof(t, d) __builtin_offsetof(t, d)
 #endif
diff --git a/EXTERNAL_HEADERS/sys/Makefile b/EXTERNAL_HEADERS/sys/Makefile
new file mode 100644 (file)
index 0000000..978ac41
--- /dev/null
@@ -0,0 +1,13 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+INSTINC_SUBDIRS =      \
+       _pthread
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
diff --git a/EXTERNAL_HEADERS/sys/_pthread/Makefile b/EXTERNAL_HEADERS/sys/_pthread/Makefile
new file mode 100644 (file)
index 0000000..0f81544
--- /dev/null
@@ -0,0 +1,21 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+EXPORT_FILES = \
+       _pthread_types.h
+
+EXPORT_MI_LIST = ${EXPORT_FILES}
+
+EXPORT_MI_DIR = sys/_pthread
+
+INSTALL_KF_MI_LCL_LIST = $(empty)
+
+INSTALL_KF_MI_LIST = $(empty)
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
diff --git a/EXTERNAL_HEADERS/sys/_pthread/_pthread_types.h b/EXTERNAL_HEADERS/sys/_pthread/_pthread_types.h
new file mode 100644 (file)
index 0000000..d9d51b8
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2003-2013 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _SYS__PTHREAD_TYPES_H_
+#define _SYS__PTHREAD_TYPES_H_
+
+#include <sys/cdefs.h>
+
+// pthread opaque structures
+#if defined(__LP64__)
+#define __PTHREAD_SIZE__               8176
+#define __PTHREAD_ATTR_SIZE__          56
+#define __PTHREAD_MUTEXATTR_SIZE__     8
+#define __PTHREAD_MUTEX_SIZE__         56
+#define __PTHREAD_CONDATTR_SIZE__      8
+#define __PTHREAD_COND_SIZE__          40
+#define __PTHREAD_ONCE_SIZE__          8
+#define __PTHREAD_RWLOCK_SIZE__                192
+#define __PTHREAD_RWLOCKATTR_SIZE__    16
+#else // !__LP64__
+#define __PTHREAD_SIZE__               4088
+#define __PTHREAD_ATTR_SIZE__          36
+#define __PTHREAD_MUTEXATTR_SIZE__     8
+#define __PTHREAD_MUTEX_SIZE__         40
+#define __PTHREAD_CONDATTR_SIZE__      4
+#define __PTHREAD_COND_SIZE__          24
+#define __PTHREAD_ONCE_SIZE__          4
+#define __PTHREAD_RWLOCK_SIZE__                124
+#define __PTHREAD_RWLOCKATTR_SIZE__    12
+#endif // !__LP64__
+
+struct __darwin_pthread_handler_rec {
+       void (*__routine)(void *);      // Routine to call
+       void *__arg;                    // Argument to pass
+       struct __darwin_pthread_handler_rec *__next;
+};
+
+struct _opaque_pthread_attr_t {
+       long __sig;
+       char __opaque[__PTHREAD_ATTR_SIZE__];
+};
+
+struct _opaque_pthread_cond_t {
+       long __sig;
+       char __opaque[__PTHREAD_COND_SIZE__];
+};
+
+struct _opaque_pthread_condattr_t {
+       long __sig;
+       char __opaque[__PTHREAD_CONDATTR_SIZE__];
+};
+
+struct _opaque_pthread_mutex_t {
+       long __sig;
+       char __opaque[__PTHREAD_MUTEX_SIZE__];
+};
+
+struct _opaque_pthread_mutexattr_t {
+       long __sig;
+       char __opaque[__PTHREAD_MUTEXATTR_SIZE__];
+};
+
+struct _opaque_pthread_once_t {
+       long __sig;
+       char __opaque[__PTHREAD_ONCE_SIZE__];
+};
+
+struct _opaque_pthread_rwlock_t {
+       long __sig;
+       char __opaque[__PTHREAD_RWLOCK_SIZE__];
+};
+
+struct _opaque_pthread_rwlockattr_t {
+       long __sig;
+       char __opaque[__PTHREAD_RWLOCKATTR_SIZE__];
+};
+
+struct _opaque_pthread_t {
+       long __sig;
+       struct __darwin_pthread_handler_rec  *__cleanup_stack;
+       char __opaque[__PTHREAD_SIZE__];
+};
+
+typedef struct _opaque_pthread_attr_t __darwin_pthread_attr_t;
+typedef struct _opaque_pthread_cond_t __darwin_pthread_cond_t;
+typedef struct _opaque_pthread_condattr_t __darwin_pthread_condattr_t;
+typedef unsigned long __darwin_pthread_key_t;
+typedef struct _opaque_pthread_mutex_t __darwin_pthread_mutex_t;
+typedef struct _opaque_pthread_mutexattr_t __darwin_pthread_mutexattr_t;
+typedef struct _opaque_pthread_once_t __darwin_pthread_once_t;
+typedef struct _opaque_pthread_rwlock_t __darwin_pthread_rwlock_t;
+typedef struct _opaque_pthread_rwlockattr_t __darwin_pthread_rwlockattr_t;
+typedef struct _opaque_pthread_t *__darwin_pthread_t;
+
+#endif // _SYS__PTHREAD_TYPES_H_
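
Because these are opaque blobs whose ABI is pinned by the size macros above, a common consumer-side guard is a compile-time size check. A sketch; the include path is an assumption for code built inside xnu:

    #include <sys/_pthread/_pthread_types.h>

    /* The signature word plus the opaque blob must keep its ABI size. */
    _Static_assert(sizeof(struct _opaque_pthread_mutex_t) ==
                   sizeof(long) + __PTHREAD_MUTEX_SIZE__,
                   "pthread mutex ABI size changed");
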
index 38cb749358eac4cf556b0cbb875506a0e37d60a5..64822cdf0298033718e64020bbb8dbb79f8e51ed 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -17,6 +17,12 @@ endif
 ifndef SYMROOT
 export SYMROOT = $(SRCROOT)/BUILD/sym
 endif
+ifndef MallocNanoZone
+export MallocNanoZone := 1
+endif
+
+# Avoid make's built-in default rules, which makes make faster
+MAKEFLAGS+=r
 
 export MakeInc_top=${VERSDIR}/makedefs/MakeInc.top
 export MakeInc_kernel=${VERSDIR}/makedefs/MakeInc.kernel
@@ -32,8 +38,10 @@ export MakeInc_dir=${VERSDIR}/makedefs/MakeInc.dir
 
 ifeq ($(findstring Libsyscall,$(RC_ProjectName)),Libsyscall)
 
-ifeq ($(RC_ProjectName),Libsyscall_headers_Sim)
-TARGET=-target Libsyscall_headers_Sim
+include $(MakeInc_cmd)
+
+ifneq ($(findstring Libsyscall_,$(RC_ProjectName)),)
+TARGET=-target $(RC_ProjectName)
 endif
 
 default: install
@@ -44,12 +52,17 @@ SDKROOT ?= macosx.internal
 installhdrs install:
        cd libsyscall ; \
                xcodebuild $@ $(TARGET) \
+                       $(MAKEOVERRIDES)        \
                        "SRCROOT=$(SRCROOT)/libsyscall"                                 \
                        "OBJROOT=$(OBJROOT)"                                            \
                        "SYMROOT=$(SYMROOT)"                                            \
                        "DSTROOT=$(DSTROOT)"                                            \
                        "SDKROOT=$(SDKROOT)"
 
+Libsyscall_driverkit: install
+
+.PHONY: Libsyscall_driverkit
+
 clean:
 
 installsrc:
@@ -90,6 +103,7 @@ default: install
 installhdrs install:
        cd libkern/kmod ; \
                xcodebuild $@   \
+                       $(MAKEOVERRIDES)        \
                        "SRCROOT=$(SRCROOT)/libkern/kmod"                               \
                        "OBJROOT=$(OBJROOT)"                                            \
                        "SYMROOT=$(SYMROOT)"                                            \
@@ -105,11 +119,7 @@ else ifeq ($(RC_ProjectName),xnu_tests)
 
 export SYSCTL_HW_PHYSICALCPU := $(shell /usr/sbin/sysctl -n hw.physicalcpu)
 export SYSCTL_HW_LOGICALCPU  := $(shell /usr/sbin/sysctl -n hw.logicalcpu)
-ifeq ($(SYSCTL_HW_PHYSICALCPU),$(SYSCTL_HW_LOGICALCPU))
-MAKEJOBS := --jobs=$(shell expr $(SYSCTL_HW_PHYSICALCPU) + 1)
-else
-MAKEJOBS := --jobs=$(SYSCTL_HW_LOGICALCPU)
-endif
+MAKEJOBS := --jobs=$(shell expr $(SYSCTL_HW_LOGICALCPU) + 1)
 
 default: install
 
@@ -142,11 +152,7 @@ endif
 #
 export SYSCTL_HW_PHYSICALCPU := $(shell /usr/sbin/sysctl -n hw.physicalcpu)
 export SYSCTL_HW_LOGICALCPU  := $(shell /usr/sbin/sysctl -n hw.logicalcpu)
-ifeq ($(SYSCTL_HW_PHYSICALCPU),$(SYSCTL_HW_LOGICALCPU))
-MAKEJOBS := --jobs=$(shell expr $(SYSCTL_HW_PHYSICALCPU) + 1)
-else
-MAKEJOBS := --jobs=$(SYSCTL_HW_LOGICALCPU)
-endif
+MAKEJOBS := --jobs=$(shell expr $(SYSCTL_HW_LOGICALCPU) + 1)
 
 TOP_TARGETS =                                                          \
        clean                                                           \
@@ -220,7 +226,7 @@ EXPINC_SUBDIRS_X86_64H = $(EXPINC_SUBDIRS)
 EXPINC_SUBDIRS_ARM = $(EXPINC_SUBDIRS)
 EXPINC_SUBDIRS_ARM64 = $(EXPINC_SUBDIRS)
 
-SETUP_SUBDIRS = SETUP osfmk san
+SETUP_SUBDIRS = SETUP san bsd
 
 COMP_SUBDIRS_X86_64 = $(ALL_SUBDIRS)
 COMP_SUBDIRS_X86_64H = $(ALL_SUBDIRS)
@@ -241,6 +247,7 @@ endif # all other RC_ProjectName
 installapi_libkdd installhdrs_libkdd install_libkdd:
        cd libkdd; \
                xcodebuild -target Default $(subst _libkdd,,$@) \
+                       $(MAKEOVERRIDES)        \
                        "SRCROOT=$(SRCROOT)/libkdd"             \
                        "OBJROOT=$(OBJROOT)"                    \
                        "SYMROOT=$(SYMROOT)"                    \
@@ -251,6 +258,7 @@ installapi_libkdd installhdrs_libkdd install_libkdd:
 installapi_libkdd_tests installhdrs_libkdd_tests install_libkdd_tests:
        cd libkdd; \
                xcodebuild -target tests $(subst _libkdd_tests,,$@)     \
+                       $(MAKEOVERRIDES)        \
                        "SRCROOT=$(SRCROOT)/libkdd"             \
                        "OBJROOT=$(OBJROOT)"                    \
                        "SYMROOT=$(SYMROOT)"                    \
@@ -261,6 +269,7 @@ installapi_libkdd_tests installhdrs_libkdd_tests install_libkdd_tests:
 installapi_libkdd_host installhdrs_libkdd_host install_libkdd_host:
        cd libkdd; \
                xcodebuild -configuration ReleaseHost -target kdd.framework $(subst _libkdd_host,,$@)   \
+                       $(MAKEOVERRIDES)        \
                        "SRCROOT=$(SRCROOT)/libkdd"             \
                        "OBJROOT=$(OBJROOT)"                    \
                        "SYMROOT=$(SYMROOT)"                    \
index a65afae99ffc6235dcf2c444d23f71301f300714..bb146bea3be273274c118f41e10f1e44ac1ffd01 100644 (file)
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ kernel together into a single bootable image.
 To build a kernelcache you can use the following mechanisms:
 
   * Using automatic kernelcache generation with `kextd`.
-    The kextd daemon keeps watching for changing in `/System/Library/Extensions` directory. 
+    The kextd daemon watches for changes in the `/System/Library/Extensions` directory.
     So you can set up a new kernel as
 
         $ cp BUILD/obj/DEVELOPMENT/X86_64/kernel.development /System/Library/Kernels/
@@ -178,10 +178,12 @@ XNU installs header files at the following locations -
     a. $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers
     b. $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders
     c. $(DSTROOT)/usr/include/
-    d. $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
+    d. $(DSTROOT)/System/DriverKit/usr/include/
+    e. $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
 
 `Kernel.framework` is used by kernel extensions.\
 The `System.framework` and `/usr/include` are used by user level applications. \
+`/System/DriverKit/usr/include` is used by userspace drivers. \
 The header files in a framework's `PrivateHeaders` are only available for **Apple Internal Development**.
 
 The directory containing the header file should have a Makefile that
@@ -196,15 +198,18 @@ from each file list are -
     a. `DATAFILES` : To make header file available in user level -
        `$(DSTROOT)/usr/include`
 
-    b. `PRIVATE_DATAFILES` : To make header file available to Apple internal in
+    b. `DRIVERKIT_DATAFILES` : To make header file available to DriverKit userspace drivers -
+       `$(DSTROOT)/System/DriverKit/usr/include`
+
+    c. `PRIVATE_DATAFILES` : To make header file available to Apple internal in
        user level -
        `$(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders`
 
-    c. `KERNELFILES` : To make header file available in kernel level -
+    d. `KERNELFILES` : To make header file available in kernel level -
        `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers`
        `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders`
 
-    d. `PRIVATE_KERNELFILES` : To make header file available to Apple internal
+    e. `PRIVATE_KERNELFILES` : To make header file available to Apple internal
        for kernel extensions -
        `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders`
 
@@ -227,28 +232,35 @@ member file lists and their default location are described below -
        Definition -
            INSTALL_MI_LIST = ${DATAFILES}
 
-    b.  `INSTALL_MI_LCL_LIST` : Installs header file to a location that is available
+    b. `INSTALL_DRIVERKIT_MI_LIST` : Installs header file to a location that is
+       available to DriverKit userspace drivers.
+       Locations -
+           $(DSTROOT)/System/DriverKit/usr/include
+       Definition -
+           INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES}
+
+    c.  `INSTALL_MI_LCL_LIST` : Installs header file to a location that is available
        for Apple internal in user level.
        Locations -
            $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
        Definition -
            INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES}
 
-    c. `INSTALL_KF_MI_LIST` : Installs header file to location that is available
+    d. `INSTALL_KF_MI_LIST` : Installs header file to location that is available
        to everyone for kernel extensions.
        Locations -
             $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers
        Definition -
             INSTALL_KF_MI_LIST = ${KERNELFILES}
 
-    d. `INSTALL_KF_MI_LCL_LIST` : Installs header file to location that is
+    e. `INSTALL_KF_MI_LCL_LIST` : Installs header file to location that is
        available for Apple internal for kernel extensions.
        Locations -
             $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders
        Definition -
             INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES}
 
-    e. `EXPORT_MI_LIST` : Exports header file to all of xnu (bsd/, osfmk/, etc.)
+    f. `EXPORT_MI_LIST` : Exports header file to all of xnu (bsd/, osfmk/, etc.)
        for compilation only. Does not install anything into the SDK.
        Definition -
             EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES}
@@ -291,6 +303,8 @@ want to export a function only to kernel level but not user level.
 
             $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers
             $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders
+    g. `DRIVERKIT`: If defined, enclosed code is visible exclusively in the
+    DriverKit SDK headers used by userspace drivers.
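A minimal sketch of the intent (hypothetical header fragment; the real
gating happens when headers are installed into the DriverKit SDK):

        #ifdef DRIVERKIT
        /* declarations visible only in the DriverKit SDK headers,
         * i.e. to userspace drivers */
        #endif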
 
 Conditional compilation
 =======================
@@ -317,8 +331,9 @@ does not define the platform macros from `TargetConditionals.h`
 (`TARGET_OS_OSX`, `TARGET_OS_IOS`, etc.).
 
 
-There is a `TARGET_OS_EMBEDDED` macro, but this should be avoided as it is in
-general too broad a definition for most functionality.
+There is a deprecated `TARGET_OS_EMBEDDED` macro, but it should be avoided:
+it is generally too broad a definition for most functionality.
+Please refer to `TargetConditionals.h` for the full picture.
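As a minimal illustration (hypothetical userspace fragment, since the
kernel build does not define these macros), prefer the specific
platform checks:

        #include <TargetConditionals.h>

        #if TARGET_OS_OSX
        /* macOS-specific code */
        #elif TARGET_OS_IOS
        /* iOS-specific code */
        #endif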
 
 How to add a new syscall
 ========================
@@ -375,7 +390,7 @@ common options.
 To debug a panicked kernel, use the LLVM debugger (lldb) along with an unstripped, symbol-rich kernel binary.
 
     sh$ lldb kernel.development.unstripped
-    
+
 You can then connect to the panicked machine with the `kdp_remote [ip addr]` or `gdb_remote [hostip : port]` commands.
 
 Each kernel is packaged with kernel-specific debug scripts as part of the build process. For security reasons, these special commands
index 56032b45d61a09a194b731ea25edf7743a2db9f3..fb79f3fcd63fdcb847b9f849af6fc78667191019 100644 (file)
@@ -17,21 +17,21 @@ WARNFLAGS = -Wall
 LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION)
 
 config: $(OBJS)
-       @echo "$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^
-       @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0))
        $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@
 
 %.o: %.c
-       @echo "$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $<
 
 parser.c: parser.y
-       @echo "$(ColorH)HOST_BISON$(Color0)    $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_BISON$(Color0)    $(ColorF)$@$(Color0))
        $(_v)$(HOST_BISON) -y -d -d -o $@ $<
 
 lexer.yy.c: lexer.l
-       @echo "$(ColorH)HOST_FLEX$(Color0)     $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_FLEX$(Color0)     $(ColorF)$@$(Color0))
        $(_v)env M4=$(HOST_GM4) $(HOST_FLEX) --header-file=lexer.yy.h -o $@ $<
 
 main.o mkheaders.o mkioconf.o mkmakefile.o lexer.yy.c: parser.c
index a32236fd1c0bb169a675ee71e588582f17a7ad50..9614ba1952fd586e474dbece9b39f74656f41570 100644 (file)
@@ -738,25 +738,14 @@ common:
                        fprintf(f, "%s%.*s${%c_RULE_1B%s}%s\n",
                            source_dir, (int)(tp - np), np, och_upper, extras, nl);
 
-                       /* While we are still using CTF, any build that normally does not support CTF will
-                        * a "standard" compile done as well that we can harvest CTF information from; do
-                        * that here.
-                        */
-                       fprintf(f, "\t${%c_CTFRULE_1A%s}", och_upper, extras);
-                       if (ftp->f_extra) {
-                               fprintf(f, "%s", ftp->f_extra);
-                       }
-                       fprintf(f, "%s%.*s${%c_CTFRULE_1B%s}%s\n",
-                           source_dir, (int)(tp - np), np, och_upper, extras, nl);
-
                        fprintf(f, "\t${%c_RULE_2%s}%s\n", och_upper, extras, nl);
-                       fprintf(f, "\t${%c_CTFRULE_2%s}%s\n", och_upper, extras, nl);
                        fprintf(f, "\t${%c_RULE_3%s}%s\n", och_upper, extras, nl);
-                       fprintf(f, "\t${%c_RULE_4A%s}", och_upper, extras);
+                       fprintf(f, "\t$(if ${%c_RULE_4A%s},${%c_RULE_4A%s}",
+                           och_upper, extras, och_upper, extras);
                        if (ftp->f_extra) {
                                fprintf(f, "%s", ftp->f_extra);
                        }
-                       fprintf(f, "%s%.*s${%c_RULE_4B%s}%s\n",
+                       fprintf(f, "%s%.*s${%c_RULE_4B%s}%s)\n",
                            source_dir, (int)(tp - np), np, och_upper, extras, nl);
                        break;
 
index 7018eb19eb81a5a8866ff2527ed8e51a39b38ee4..a22212f6e6c1814cb0a6831b008ceb5d5b3631b5 100644 (file)
@@ -15,13 +15,13 @@ WARNFLAGS = -Wall
 LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION)
 
 decomment: $(OBJS)
-       @echo "$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^
-       @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0))
        $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@
 
 %.o: %.c
-       @echo "$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $<
 
 do_build_setup:: decomment
index eb1f3afbb467fd127ab27ade3b2334fe30e9febe..4ad7a7498f935bef786dab11e70b3b207ebc95b7 100644 (file)
@@ -15,13 +15,13 @@ WARNFLAGS = -Wall
 LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION)
 
 installfile: $(OBJS)
-       @echo "$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^
-       @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0))
        $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@
 
 %.o: %.c
-       @echo "$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $<
 
 do_build_setup:: installfile
index 518644cb5bd05e60d8673e7d539969ab743e712e..18af26bdd9d19535adfa1937acefd4dc277bf846 100644 (file)
@@ -15,13 +15,13 @@ WARNFLAGS = -Wall
 LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION)
 
 json_compilation_db: $(OBJS)
-       @echo "$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^
-       @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0))
        $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@
 
 %.o: %.c
-       @echo "$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $<
 
 do_build_setup:: json_compilation_db
index af6cdcafd20316a4ef099cc69733e8c8beace746..dde295bae5e0734d0d328769a86c5335cf9c1cd6 100644 (file)
@@ -15,13 +15,13 @@ WARNFLAGS = -Wall
 LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION) -lstdc++
 
 kextsymboltool: $(OBJS)
-       @echo "$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^
-       @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0))
        $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@
 
 %.o: %.c
-       @echo "$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $<
 
 do_build_setup:: kextsymboltool
index e1e84844e2c58fa4f477d126147b61edb1318e3b..45459e48b2932342e13c993a669753bd53beace6 100644 (file)
@@ -15,13 +15,13 @@ WARNFLAGS = -Wall
 LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION)
 
 replacecontents: $(OBJS)
-       @echo "$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^
-       @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0))
        $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@
 
 %.o: %.c
-       @echo "$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $<
 
 do_build_setup:: replacecontents
index 7e9224ef0e64090393ac1942e2e549ebebaf26c3..70a55a7b5900f329423a017b4805c992251bbd1a 100644 (file)
@@ -15,13 +15,13 @@ WARNFLAGS = -Wall
 LDFLAGS = -isysroot $(HOST_SDKROOT) -mmacosx-version-min=$(HOST_OS_VERSION)
 
 setsegname: $(OBJS)
-       @echo "$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_LD$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^
-       @echo "$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CODESIGN$(Color0) $(ColorF)$@$(Color0))
        $(_v)env CODESIGN_ALLOCATE=$(HOST_CODESIGN_ALLOCATE) $(HOST_CODESIGN) -s - $@
 
 %.o: %.c
-       @echo "$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorH)HOST_CC$(Color0)       $(ColorF)$@$(Color0))
        $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $<
 
 do_build_setup:: setsegname
index f79dc7046f61288d4a3d927721ba3844e9772952..22db7cb4dd597c8cdf9f9057fb06db8ad4a45aa3 100644 (file)
@@ -6,6 +6,8 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
+SETUP_SUBDIRS = sys
+
 INSTINC_SUBDIRS = \
        bsm \
        crypto \
@@ -21,6 +23,7 @@ INSTINC_SUBDIRS = \
        security \
        pthread \
        sys \
+       sys_private \
        uuid \
        vfs
 
@@ -52,6 +55,7 @@ EXPINC_SUBDIRS = \
        security \
        pthread \
        sys \
+       sys_private \
        uuid \
        vfs \
        vm
diff --git a/bsd/arm/exec.h b/bsd/arm/exec.h
deleted file mode 100644 (file)
index ed29b14..0000000
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
- */
-/*-
- * Copyright (c) 1992, 1993
- *     The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *     This product includes software developed by the University of
- *     California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *     @(#)exec.h      8.1 (Berkeley) 6/11/93
- */
-
-#ifndef _BSD_ARM_EXEC_H_
-#define _BSD_ARM_EXEC_H_
-
-
-#ifdef BSD_KERNEL_PRIVATE
-/* Size of a page in an object file. */
-#define __LDPGSZ        4096
-
-/* Valid magic number check. */
-#define N_BADMAG(ex) \
-       ((ex).a_magic != NMAGIC && (ex).a_magic != OMAGIC && \
-           (ex).a_magic != ZMAGIC)
-
-/* Address of the bottom of the text segment. */
-#define N_TXTADDR(X)    0
-
-/* Address of the bottom of the data segment. */
-#define N_DATADDR(ex) \
-       (N_TXTADDR(ex) + ((ex).a_magic == OMAGIC ? (ex).a_text \
-       : __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1))))
-
-/* Text segment offset. */
-#define N_TXTOFF(ex) \
-       ((ex).a_magic == ZMAGIC ? __LDPGSZ : sizeof(struct exec))
-
-/* Data segment offset. */
-#define N_DATOFF(ex) \
-       (N_TXTOFF(ex) + ((ex).a_magic != ZMAGIC ? (ex).a_text : \
-       __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1))))
-
-/* Symbol table offset. */
-#define N_SYMOFF(ex) \
-       (N_TXTOFF(ex) + (ex).a_text + (ex).a_data + (ex).a_trsize + \
-           (ex).a_drsize)
-
-/* String table offset. */
-#define N_STROFF(ex)    (N_SYMOFF(ex) + (ex).a_syms)
-
-/* Description of the object file header (a.out format). */
-struct exec {
-#define OMAGIC  0407            /* old impure format */
-#define NMAGIC  0410            /* read-only text */
-#define ZMAGIC  0413            /* demand load format */
-#define QMAGIC  0314            /* demand load format. Header in text. */
-       unsigned int    a_magic;        /* magic number */
-
-       unsigned int    a_text;         /* text segment size */
-       unsigned int    a_data;         /* initialized data size */
-       unsigned int    a_bss;          /* uninitialized data size */
-       unsigned int    a_syms;         /* symbol table size */
-       unsigned int    a_entry;        /* entry point */
-       unsigned int    a_trsize;       /* text relocation size */
-       unsigned int    a_drsize;       /* data relocation size */
-};
-
-#endif /* BSD_KERNEL_PRIVATE */
-
-#endif /* _BSD_ARM_EXEC_H_ */
index 823ecc5838edb38b4fbe98ef1de605e8668be051..69a777f41925de3c487bbae62b847548b076ede2 100644 (file)
@@ -30,8 +30,6 @@
 #ifndef _FASTTRAP_ISA_H
 #define _FASTTRAP_ISA_H
 
-/* #pragma ident       "@(#)fasttrap_isa.h     1.4     05/06/08 SMI" */
-
 #include <sys/types.h>
 #include <stdint.h>
 
diff --git a/bsd/arm/reboot.h b/bsd/arm/reboot.h
deleted file mode 100644 (file)
index 0bb3b5a..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
- */
-
-#ifndef _BSD_ARM_REBOOT_H_
-#define _BSD_ARM_REBOOT_H_
-
-/*
- * Empty file (publicly)
- */
-
-#include <sys/appleapiopts.h>
-
-#ifdef  BSD_KERNEL_PRIVATE
-
-/*
- *     Use most significant 16 bits to avoid collisions with
- *     machine independent flags.
- */
-#define RB_POWERDOWN    0x00010000      /* power down on halt */
-#define RB_NOBOOTRC     0x00020000      /* don't run '/etc/rc.boot' */
-#define RB_DEBUG        0x00040000      /* drop into mini monitor on panic */
-#define RB_EJECT        0x00080000      /* eject disks on halt */
-#define RB_COMMAND      0x00100000      /* new boot command specified */
-#define RB_NOFP         0x00200000      /* don't use floating point */
-#define RB_BOOTNEXT     0x00400000      /* reboot into NeXT */
-#define RB_BOOTDOS      0x00800000      /* reboot into DOS */
-#define RB_PRETTY       0x01000000      /* shutdown with pretty graphics */
-
-#endif  /* BSD_KERNEL_PRIVATE */
-
-#endif  /* _BSD_ARM_REBOOT_H_ */
index 20b73988aef2d547146320f6f82f9078f6ad78ac..b23d91bc9792f1011fee92c6c0743c51eb960f7f 100644 (file)
 #define BSM_F_MARKDEPENDENCY    360     /* Darwin-specific. */
 #define BSM_F_BARRIERFSYNC      361     /* Darwin-specific. */
 #define BSM_F_PUNCHHOLE         362     /* Darwin-specific. */
-#define BSM_F_TRIM_ACTIVE_FILE          363     /* Darwin-specific. */
+#define BSM_F_TRIM_ACTIVE_FILE  363     /* Darwin-specific. */
+#define BSM_F_SPECULATIVE_READ  364     /* Darwin-specific. */
 
 /*
  * Darwin file system specific (400-499).
index 3f4ddea6360e6951a6b6260282fbcd63728b3767..37dc16b53f2e86a6cfb1025743cca6d4a364b981 100644 (file)
 #define AUE_PIDFORTASK          43049   /* Darwin-specific. */
 #define AUE_SYSCTL_NONADMIN     43050
 #define AUE_COPYFILE            43051   /* Darwin-specific. */
-
+#define AUE_DBGPORTFORPID       43052   /* Darwin-specific. */
 /*
  * Events added to OpenBSM for FreeBSD and Linux; may also be used by Darwin
  * in the future.
 #define AUE_FCLONEFILEAT        43211   /* Darwin. */
 #define AUE_SETATTRLISTAT       43212   /* Darwin. */
 #define AUE_FMOUNT              43213   /* Darwin. */
+#define AUE_FSGETPATH_EXTENDED  43214   /* Darwin. */
 
 #define AUE_SESSION_START       44901   /* Darwin. */
 #define AUE_SESSION_UPDATE      44902   /* Darwin. */
index 7bd79d9ae78b20528784ccb5d5684d4a6bcb579f..05c4b79cf0cdde381cf245f419ccc25af4fec458 100644 (file)
@@ -23,7 +23,7 @@ endif
 
 $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS)
        $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG)
-       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG);
+       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG)
 
 do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
        $(_v)${MAKE} \
@@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
                SOURCE=$(subst conf/,,$(SOURCE))                        \
                TARGET=${TARGET}                                        \
                OBJPATH=${OBJPATH}                                      \
-               build_all;
+               build_all
 
 do_build_all:: do_all
 
index c38c2ffb6fb8ada0f7b9d97dbe1331e7d85ff7a5..fa831c803f344123b6aa0aef55304be053aedc0c 100644 (file)
@@ -127,8 +127,6 @@ OBJS_NO_SIGN_COMPARE =              \
                in6_ifattach.o  \
                ip6_input.o     \
                ip6_output.o    \
-               ipcomp_input.o  \
-               ipcomp_output.o \
                in6_proto.o     \
                mld6.o  \
                nd6.o   \
@@ -250,6 +248,7 @@ OBJS_NO_PACKED_ADDRESS =    \
                nd6_prproxy.o       \
                nd6_rtr.o           \
                necp.o              \
+               packet_mangler.o    \
                pf.o                \
                pf_norm.o           \
                pktap.o             \
@@ -258,7 +257,8 @@ OBJS_NO_PACKED_ADDRESS =    \
                tcp_subr.o          \
                udp6_output.o       \
                udp6_usrreq.o       \
-               udp_usrreq.o
+               udp_usrreq.o        \
+               sixxlowpan.o
 
 $(foreach file,$(OBJS_NO_PACKED_ADDRESS),$(eval $(call add_perfile_cflags,$(file),-Wno-address-of-packed-member)))
 
@@ -284,27 +284,27 @@ $(SOBJS): .SFLAGS
        $(_v)$(REPLACECONTENTS) $@ $(S_KCC) $(SFLAGS) $(INCFLAGS)
 
 $(COMPONENT).filelist: $(OBJS)
-       @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)"
+       $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0))
        $(_v)for obj in ${OBJS}; do     \
-                echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
+                $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
        done > $(COMPONENT).filelist
 
 MAKESYSCALLS = $(SRCROOT)/bsd/kern/makesyscalls.sh
 
 init_sysent.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS)
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0))
        $(_v)$(MAKESYSCALLS) $< table > /dev/null
 
 syscalls.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS)
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0))
        $(_v)$(MAKESYSCALLS) $< names > /dev/null
 
 audit_kevents.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS)
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0))
        $(_v)$(MAKESYSCALLS) $< audit > /dev/null
 
 systrace_args.c:  $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS)
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0))
        $(_v)$(MAKESYSCALLS) $< systrace > /dev/null
 
 do_all: $(COMPONENT).filelist
index e5a34f7945062ffd325ab5e87a4c7e68aadfc6df..99c4f51bc872fa6e217a29272fa6683d72c09468 100644 (file)
@@ -28,10 +28,11 @@ OPTIONS/mach_vm_debug                       optional mach_vm_debug
 OPTIONS/mach_xp                                optional mach_xp
 OPTIONS/mach_xp_fpd                    optional mach_xp_fpd
 OPTIONS/quota                          optional quota
-OPTIONS/xpr_debug                      optional xpr_debug
 OPTIONS/kdebug                         optional kdebug
 OPTIONS/nfsclient                      optional nfsclient
 OPTIONS/nfsserver                      optional nfsserver
+OPTIONS/config_nfs4                    optional config_nfs4
+OPTIONS/config_triggers                        optional config_triggers
 OPTIONS/kernremote                     optional kernremote
 OPTIONS/compat_43                      optional compat_43
 OPTIONS/diagnostic                     optional diagnostic
@@ -58,8 +59,10 @@ OPTIONS/inet6                                optional inet6
 OPTIONS/ipv6send                       optional ipv6send
 OPTIONS/ether                          optional ether
 OPTIONS/vlan                           optional vlan
+OPTIONS/sixlowpan                      optional sixlowpan
 OPTIONS/bond                           optional bond
 OPTIONS/if_fake                                optional if_fake
+OPTIONS/if_headless                    optional if_headless
 OPTIONS/bpfilter                       optional bpfilter
 OPTIONS/multipath                      optional multipath
 OPTIONS/mptcp                          optional mptcp
@@ -72,6 +75,7 @@ OPTIONS/sendfile                      optional sendfile
 OPTIONS/pf                             optional pf
 OPTIONS/pflog                          optional pflog pf
 OPTIONS/zlib                           optional zlib
+OPTIONS/sixlowpan                      optional sixlowpan
 
 
 #
@@ -87,39 +91,49 @@ OPTIONS/fs_compression              optional fs_compression
 OPTIONS/config_imageboot              optional config_imageboot
 
 bsd/nfs/nfs4_subs.c                    optional nfsclient
+bsd/nfs/nfs4_subs.c                    optional config_nfs4
 bsd/nfs/nfs4_vnops.c                   optional nfsclient
+bsd/nfs/nfs4_vnops.c                   optional config_nfs4
 bsd/nfs/krpc_subr.c                    optional nfsclient
 bsd/nfs/nfs_bio.c                      optional nfsclient
-bsd/nfs/nfs_boot.c                     optional nfsclient
-bsd/nfs/nfs_gss.c                      optional nfsclient
-bsd/nfs/nfs_gss.c                      optional nfsserver
+bsd/nfs/nfs_bio.c                      optional config_nfs4
+bsd/nfs/nfs_boot.c                     optional config_netboot
+bsd/nfs/nfs_gss.c                      optional config_nfs_gss
 bsd/nfs/nfs_lock.c                     optional nfsclient
 bsd/nfs/nfs_node.c                     optional nfsclient
+bsd/nfs/nfs_node.c                     optional config_nfs4
 bsd/nfs/nfs_serv.c                     optional nfsserver
 bsd/nfs/nfs_socket.c                   optional nfsclient
 bsd/nfs/nfs_socket.c                   optional nfsserver
+bsd/nfs/nfs_socket.c                   optional config_nfs4
 bsd/nfs/nfs_srvcache.c                 optional nfsserver
 bsd/nfs/nfs_subs.c                     optional nfsclient
 bsd/nfs/nfs_subs.c                     optional nfsserver
+bsd/nfs/nfs_subs.c                     optional config_nfs4
 bsd/nfs/nfs_syscalls.c                 optional nfsclient
 bsd/nfs/nfs_syscalls.c                 optional nfsserver
+bsd/nfs/nfs_syscalls.c                 optional config_nfs4
 bsd/nfs/nfs_vfsops.c                   optional nfsclient
+bsd/nfs/nfs_vfsops.c                   optional config_nfs4
 bsd/nfs/nfs_vnops.c                    optional nfsclient
+bsd/nfs/nfs_vnops.c                    optional config_nfs4
 bsd/nfs/nfs_upcall.c                   optional nfsserver
-bsd/nfs/gss/gss_krb5_mech.c            optional nfsclient
-bsd/nfs/gss/gss_krb5_mech.c            optional nfsserver
-bsd/nfs/gss/ccrypto.c                  optional nfsclient
-bsd/nfs/gss/ccrypto.c                  optional nfsserver
-bsd/kern/netboot.c                     optional nfsclient
+bsd/nfs/gss/gss_krb5_mech.c            optional config_nfs_gss
+bsd/nfs/gss/ccrypto.c                  optional config_nfs_gss
+bsd/kern/netboot.c                     optional config_netboot
+
+# NFS v4 is on for macOS builds
 
 bsd/dev/dtrace/dtrace.c                        optional config_dtrace
 bsd/dev/dtrace/lockprof.c              optional config_dtrace
 bsd/dev/dtrace/lockstat.c              optional config_dtrace
 bsd/dev/dtrace/dtrace_ptss.c           optional config_dtrace
 bsd/dev/dtrace/dtrace_subr.c           optional config_dtrace
-bsd/dev/dtrace/dtrace_glue.c           standard
+bsd/dev/dtrace/dtrace_glue.c           optional config_dtrace
+bsd/dev/dtrace/dtrace_xoroshiro128_plus.c      optional config_dtrace
 bsd/dev/dtrace/blist.c                 optional config_dtrace
 bsd/dev/dtrace/fbt.c                   optional config_dtrace
+bsd/dev/dtrace/fbt_blacklist.c         optional config_dtrace
 bsd/dev/dtrace/sdt.c                   optional config_dtrace
 bsd/dev/dtrace/sdt_subr.c              optional config_dtrace
 bsd/dev/dtrace/systrace.c              optional config_dtrace
@@ -154,6 +168,7 @@ bsd/vfs/vfs_bio.c                   standard
 bsd/vfs/vfs_cache.c                    standard
 bsd/vfs/vfs_cluster.c                  standard
 bsd/vfs/vfs_conf.c                     standard
+bsd/vfs/vfs_conf.c                     optional config_nfs4
 bsd/vfs/vfs_fslog.c                    standard
 bsd/vfs/vfs_init.c                     standard
 bsd/vfs/vfs_lookup.c                   standard
@@ -203,6 +218,8 @@ bsd/net/if_loop.c                   optional loop
 bsd/net/if_mib.c                       optional networking
 bsd/net/if_vlan.c                      optional vlan
 bsd/net/if_fake.c                      optional if_fake
+bsd/net/if_headless.c                  optional if_headless
+bsd/net/if_6lowpan.c                   optional sixlowpan
 bsd/net/multicast_list.c               optional networking
 bsd/net/if_bond.c                      optional bond
 bsd/net/devtimer.c                     optional bond
@@ -244,10 +261,12 @@ bsd/net/if_llreach.c                      optional networking
 bsd/net/flowhash.c                     optional networking
 bsd/net/flowadv.c                      optional networking
 bsd/net/content_filter.c               optional content_filter
+bsd/net/content_filter_crypto.c         optional content_filter
 bsd/net/packet_mangler.c               optional packet_mangler
 bsd/net/if_llatbl.c                    optional networking
 bsd/net/nwk_wq.c                       optional networking
 bsd/net/skmem_sysctl.c         optional skywalk
+bsd/net/restricted_in_port.c           optional networking
 
 bsd/net/classq/classq.c                        optional networking
 bsd/net/classq/classq_sfb.c            optional networking
@@ -259,6 +278,7 @@ bsd/net/pktsched/pktsched.c         optional networking
 bsd/net/pktsched/pktsched_qfq.c                optional networking
 bsd/net/pktsched/pktsched_tcq.c                optional networking
 bsd/net/pktsched/pktsched_fq_codel.c   optional networking
+bsd/net/pktsched/pktsched_netem.c      optional networking
 
 bsd/netinet/cpu_in_cksum_gen.c         standard
 bsd/netinet/in_cksum.c                 optional inet
@@ -293,6 +313,7 @@ bsd/netinet/tcp_cubic.c                     optional inet
 bsd/netinet/cbrtf.c                    optional inet
 bsd/netinet/tcp_lro.c                  optional inet
 bsd/netinet/tcp_ledbat.c               optional inet
+bsd/netinet/tcp_log.c                  optional inet
 bsd/netinet/udp_usrreq.c               optional inet
 bsd/netinet/in_gif.c                   optional gif inet
 bsd/netinet/ip_ecn.c                   optional inet
@@ -327,9 +348,6 @@ bsd/netinet6/in6_ifattach.c                 optional inet6
 bsd/netinet6/ip6_input.c               optional inet6
 bsd/netinet6/ip6_output.c              optional inet6
 bsd/netinet6/in6_src.c                 optional inet6
-bsd/netinet6/ipcomp_core.c             optional ipsec
-bsd/netinet6/ipcomp_input.c            optional ipsec
-bsd/netinet6/ipcomp_output.c           optional ipsec
 bsd/netinet6/in6_mcast.c               optional inet6
 bsd/netinet6/in6_pcb.c                 optional inet6
 bsd/netinet6/in6_proto.c               optional inet6
@@ -347,11 +365,17 @@ bsd/netinet6/udp6_output.c                optional inet6
 bsd/netinet6/udp6_usrreq.c             optional inet6
 bsd/netinet6/ip6_id.c                          optional inet6
 
+bsd/net/sixxlowpan.c                   optional sixlowpan
+bsd/net/frame802154.c                  optional sixlowpan
+bsd/net/linkaddr.c                     optional sixlowpan
+
 bsd/netkey/key.c                       optional ipsec
 bsd/netkey/key_debug.c                 optional ipsec
 bsd/netkey/keysock.c                   optional ipsec
 bsd/netkey/keydb.c                     optional ipsec
 
+bsd/net/multi_layer_pkt_log.c          optional inet inet6 ipsec ipsec_esp
+
 bsd/crypto/rc4/rc4.c                   optional crypto
 
 #bsd/netpm/pm_aTT.c        optional pm
@@ -424,6 +448,8 @@ bsd/kern/kern_synch.c                       standard
 bsd/kern/kern_sysctl.c                 standard
 bsd/kern/kern_newsysctl.c              standard
 bsd/kern/kern_memorystatus.c   optional config_memorystatus
+bsd/kern/kern_memorystatus_freeze.c    optional config_memorystatus
+bsd/kern/kern_memorystatus_notify.c    optional config_memorystatus
 bsd/kern/kern_mib.c                    standard
 bsd/kern/kpi_mbuf.c                    optional sockets
 bsd/kern/kern_sfi.c                    standard
@@ -434,7 +460,6 @@ bsd/kern/mcache.c                   optional sockets
 bsd/kern/stackshot.c                   standard
 bsd/kern/subr_log.c                    standard
 bsd/kern/subr_prf.c                    standard
-bsd/kern/subr_prof.c                   standard
 bsd/kern/subr_sbuf.c                   standard
 bsd/kern/subr_xxx.c                    standard
 bsd/kern/sys_generic.c                 standard
@@ -495,6 +520,7 @@ bsd/conf/param.c                    standard
 ./ioconf.c                             standard
 
 bsd/kern/imageboot.c                   optional config_imageboot
+bsd/kern/chunklist.c                   optional config_imageboot_chunklist
 
 osfmk/kperf/kperfbsd.c                 optional kperf
 bsd/kern/kern_kpc.c                    optional kpc
@@ -509,6 +535,7 @@ bsd/miscfs/nullfs/null_vfsops.c     optional nullfs
 bsd/miscfs/nullfs/null_vnops.c      optional nullfs
 
 bsd/tests/bsd_tests.c                  optional config_xnupost
+bsd/tests/copyio_tests.c               optional config_xnupost
 bsd/tests/pmap_test_sysctl.c           optional config_xnupost
 
 bsd/net/skywalk_stubs.c                                standard
index 401b05a3d635baf04151b85f400267e7d9aa6a6f..6878221776718ee28dac88dccc113a61882a831a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <sys/shm_internal.h>
 #include <sys/aio_kern.h>
 
-struct  timezone tz = { 0, 0 };
+struct  timezone tz = { .tz_minuteswest = 0, .tz_dsttime = 0 };
 
 #if CONFIG_EMBEDDED
 #define NPROC 1000          /* Account for TOTAL_CORPSES_ALLOWED by making this slightly lower than we can. */
 #define NPROC_PER_UID 950
 #else
-#define NPROC (20 + 16 * 32)
+#define NPROC (20 + 32 * 32)
 #define NPROC_PER_UID (NPROC/2)
 #endif
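The hunk above also replaces a positional initializer with C99
designated initializers. A standalone sketch of the difference
(illustrative only, not this file's code):

        #include <sys/time.h>

        /* Same values either way, but the designated form stays
         * correct and self-documenting if fields are ever reordered. */
        struct timezone tz_a = { 0, 0 };
        struct timezone tz_b = { .tz_minuteswest = 0, .tz_dsttime = 0 };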
 
index 0e0e8a435c24065009497c6773613621acf43483..8925f90709b48e381000b9a087f4e0085a3a6deb 100644 (file)
@@ -215,12 +215,7 @@ struct cdevsw cdevsw[] = {
                kmioctl, nullstop, nullreset, km_tty, ttselect,
                eno_mmap, eno_strat, eno_getc, eno_putc, 0
        },
-       [13 ... 41] = NO_CDEVICE,
-       [42] = {
-               volopen, volclose, eno_rdwrt, eno_rdwrt,
-               volioctl, eno_stop, eno_reset, 0, (select_fcn_t *) seltrue,
-               eno_mmap, eno_strat, eno_getc, eno_putc, 0
-       }
+       [13 ... 42] = NO_CDEVICE,
 };
 const int nchrdev = sizeof(cdevsw) / sizeof(cdevsw[0]);
 
@@ -237,7 +232,7 @@ isdisk(dev_t dev, int type)
 
        switch (type) {
        case VCHR:
-               maj = chrtoblk(maj);
+               maj = chrtoblk(dev);
                if (maj == NODEV) {
                        break;
                }
@@ -251,32 +246,7 @@ isdisk(dev_t dev, int type)
        return 0;
 }
 
-static int      chrtoblktab[] = {
-       /* CHR *//* BLK *//* CHR *//* BLK */
-       /* 0 */ NODEV, /* 1 */ NODEV,
-       /* 2 */ NODEV, /* 3 */ NODEV,
-       /* 4 */ NODEV, /* 5 */ NODEV,
-       /* 6 */ NODEV, /* 7 */ NODEV,
-       /* 8 */ NODEV, /* 9 */ NODEV,
-       /* 10 */ NODEV, /* 11 */ NODEV,
-       /* 12 */ NODEV, /* 13 */ NODEV,
-       /* 14 */ NODEV, /* 15 */ NODEV,
-       /* 16 */ NODEV, /* 17 */ NODEV,
-       /* 18 */ NODEV, /* 19 */ NODEV,
-       /* 20 */ NODEV, /* 21 */ NODEV,
-       /* 22 */ NODEV, /* 23 */ NODEV,
-       /* 24 */ NODEV, /* 25 */ NODEV,
-       /* 26 */ NODEV, /* 27 */ NODEV,
-       /* 28 */ NODEV, /* 29 */ NODEV,
-       /* 30 */ NODEV, /* 31 */ NODEV,
-       /* 32 */ NODEV, /* 33 */ NODEV,
-       /* 34 */ NODEV, /* 35 */ NODEV,
-       /* 36 */ NODEV, /* 37 */ NODEV,
-       /* 38 */ NODEV, /* 39 */ NODEV,
-       /* 40 */ NODEV, /* 41 */ NODEV,
-       /* 42 */ NODEV, /* 43 */ NODEV,
-       /* 44 */ NODEV,
-};
+static int chrtoblktab[] = {[0 ... nchrdev] = NODEV };
 
 /*
  * convert chr dev to blk dev
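Both tables in this hunk now rely on range designators
([first ... last] = value), a GCC/Clang extension that applies one
initializer to every element in the range. A standalone sketch in the
spirit of the 45-entry table deleted above:

        #define NODEV (-1)

        /* Elements 0 through 44 are all initialized to NODEV. */
        static int chr_to_blk[45] = { [0 ... 44] = NODEV };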
index 23d09f6a0d7d8b2969f2749d837788badc7b5adc..1f8dbd2ef5cab9524cfdbbcb7ca6bd24ae6df666 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005-2008 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2005-2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -26,8 +26,6 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#define MACH__POSIX_C_SOURCE_PRIVATE 1  /* pulls in suitable savearea from
-                                        * mach/ppc/thread_status.h */
 #include <arm/caches_internal.h>
 #include <arm/proc_reg.h>
 
@@ -44,7 +42,7 @@
 #include <sys/kauth.h>
 #include <sys/dtrace.h>
 #include <sys/dtrace_impl.h>
-#include <libkern/OSAtomic.h>
+#include <machine/atomic.h>
 #include <kern/simple_lock.h>
 #include <kern/sched_prim.h>            /* for thread_wakeup() */
 #include <kern/thread_call.h>
@@ -123,7 +121,7 @@ xcRemote(void *foo)
                (pArg->f)(pArg->arg);
        }
 
-       if (hw_atomic_sub(&dt_xc_sync, 1) == 0) {
+       if (os_atomic_dec(&dt_xc_sync, relaxed) == 0) {
                thread_wakeup((event_t) &dt_xc_sync);
        }
 }
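os_atomic_dec, as used above, yields the decremented value, so
comparing against 0 detects the last cross-call in flight. A rough C11
equivalent of the decrement-and-wake pattern (wakeup() here is a
hypothetical stand-in for thread_wakeup()):

        #include <stdatomic.h>

        extern void wakeup(void *event);

        static atomic_uint xc_sync;

        static void
        xc_done(void)
        {
                /* fetch_sub returns the old value; old == 1 means this
                 * was the last outstanding cross-call. */
                if (atomic_fetch_sub_explicit(&xc_sync, 1,
                    memory_order_relaxed) == 1) {
                        wakeup(&xc_sync);
                }
        }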
index f2958312993a734870e870a42b7749e1a91053a6..f227223a547cce7707a9241cea49d8be0dced7ba 100644 (file)
  * Use is subject to license terms.
  */
 
-/*
- * #pragma ident       "@(#)dtrace_subr.c      1.12    05/06/08 SMI"
- */
-
 #include <sys/dtrace.h>
 #include <sys/dtrace_glue.h>
 #include <sys/dtrace_impl.h>
@@ -172,16 +168,3 @@ dtrace_user_probe(arm_saved_state_t *regs, unsigned int instr)
 
        return KERN_FAILURE;
 }
-
-void
-dtrace_safe_synchronous_signal(void)
-{
-       /* Not implemented */
-}
-
-int
-dtrace_safe_defer_signal(void)
-{
-       /* Not implemented */
-       return 0;
-}
index c45a952882baa38a2fa6d0db1785f27a3a9c63f6..08f831a01c31f2fd84d84994193775047cca529b 100644 (file)
  * Use is subject to license terms.
  */
 
-/*
- * #pragma ident       "@(#)fasttrap_isa.c     1.19    05/09/14 SMI"
- */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL                 /* Solaris vs. Darwin */
-#endif
-#endif
-
 #include <sys/fasttrap_isa.h>
 #include <sys/fasttrap_impl.h>
 #include <sys/dtrace.h>
@@ -293,8 +283,8 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_
                }
 
                if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) {
-                       uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1);
-                       if (already_triggered) {
+                       if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) {
+                               /* already triggered */
                                continue;
                        }
                }
@@ -326,6 +316,9 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_
        lck_mtx_unlock(pid_mtx);
 }
 
+#if DEBUG
+__dead2
+#endif
 static void
 fasttrap_sigsegv(proc_t *p, uthread_t t, user_addr_t addr, arm_saved_state_t *regs)
 {
@@ -522,8 +515,8 @@ fasttrap_pid_probe(arm_saved_state_t *regs)
 #endif
                        } else {
                                if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) {
-                                       uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1);
-                                       if (already_triggered) {
+                                       if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) {
+                                               /* already triggered */
                                                continue;
                                        }
                                }
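Both fasttrap hunks fold atomic_or_8 plus a separate flag test into a
single os_atomic_xchg. The one-shot idiom in portable C11 (sketch
only):

        #include <stdatomic.h>

        static atomic_uchar triggered;

        /* atomic_exchange returns the previous value, so exactly one
         * caller observes 0 and wins the race to fire the probe. */
        static int
        fire_once(void)
        {
                if (atomic_exchange_explicit(&triggered, 1,
                    memory_order_relaxed)) {
                        return 0;       /* already triggered */
                }
                /* ... one-shot work ... */
                return 1;
        }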
index 95ee1dfe2f2d4c5073dc7a927e908240d5e070d7..9986da8e415ae4c58c45f81bd5bd88d3f10c1a30 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  */
 /*
  * CDDL HEADER START
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)fbt.c      1.15    05/09/19 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL                 /* Solaris vs. Darwin */
-#endif
-#endif
-
-#define MACH__POSIX_C_SOURCE_PRIVATE 1  /* pulls in suitable savearea from
-                                        * mach/ppc/thread_status.h */
 #include <kern/thread.h>
 #include <mach/thread_status.h>
 #include <arm/proc_reg.h>
index 695c74ff0ec8de794dd9cab61a0b70201351ec69..076f3abd8e6930e2551b2717160cc558c5d23273 100644 (file)
@@ -17,7 +17,6 @@
 #include        <pexpert/arm64/board_config.h>
 
 #if __arm64__
-extern int bootarg_no64exec;    /* bsd_init.c */
 static cpu_subtype_t cpu_subtype32(void);
 #endif /* __arm64__ */
 
@@ -47,7 +46,7 @@ cpu_subtype32()
 *              not acceptable.
 **********************************************************************/
 int
-grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype)
+grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, bool allow_simulator_binary __unused)
 {
 #if __arm64__
        cpu_subtype_t hostsubtype =
@@ -59,10 +58,6 @@ grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype)
        switch (exectype) {
 #if __arm64__
        case CPU_TYPE_ARM64:
-               if (bootarg_no64exec) {
-                       return 0;
-               }
-
                switch (hostsubtype) {
                case CPU_SUBTYPE_ARM64_V8:
                        switch (execsubtype) {
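Per the comment above, grade_binary returns a preference grade, with 0
meaning "not acceptable". A hypothetical caller (types simplified from
the kernel's) that keeps the best-graded slice of a fat binary:

        extern int grade_binary(int exectype, int execsubtype, _Bool allow_sim);

        struct slice { int cputype; int cpusubtype; };

        static int
        pick_best_slice(const struct slice *s, int n)
        {
                int best = -1, best_grade = 0;

                for (int i = 0; i < n; i++) {
                        int g = grade_binary(s[i].cputype, s[i].cpusubtype, 0);
                        if (g > best_grade) {   /* grade 0 never wins */
                                best_grade = g;
                                best = i;
                        }
                }
                return best;    /* -1 if no slice is acceptable */
        }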
index 65eb5a2ae6c78806cd33c4c98071593c02ef8085..094970e285759dc2e9b99089aca8eede4929d138 100644 (file)
@@ -166,7 +166,7 @@ int
 munge_wwl(const void *regs, void *args)
 {
        if (REGS_TO_STYLE(regs) == kDirect) {
-               return marshal_no_pad(regs, args, 3);
+               return marshal_no_pad(regs, args, 4);
        } else {
                DECLARE_AND_CAST(regs, args, ss, uu_args);
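In the munge naming scheme each letter encodes one argument: 'w' is a
32-bit word and 'l' a 64-bit long, so "wwl" spans 1 + 1 + 2 = 4
register slots, which is presumably why the slot count passed to
marshal_no_pad is corrected from 3 to 4. A hypothetical counter for
that convention:

        /* 'w' contributes one 32-bit slot, 'l' two. */
        static int
        munge_slot_count(const char *sig)
        {
                int slots = 0;

                for (; *sig != '\0'; sig++) {
                        slots += (*sig == 'l') ? 2 : 1;
                }
                return slots;   /* munge_slot_count("wwl") == 4 */
        }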
 
diff --git a/bsd/dev/arm/pci_device.h b/bsd/dev/arm/pci_device.h
deleted file mode 100644 (file)
index f624a42..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
- */
-/*
- * @OSF_FREE_COPYRIGHT@
- *
- */
-/*
- * HISTORY
- *
- * Revision 1.2  1998/09/30 21:20:44  wsanchez
- * Merged in IntelMerge1 (mburg: Intel support)
- *
- * Revision 1.1.2.1  1998/09/30 18:18:50  mburg
- * Changes for Intel port
- *
- * Revision 1.1.1.1  1998/03/07 02:25:45  wsanchez
- * Import of OSF Mach kernel (~mburg)
- *
- * Revision 1.1.6.2  1995/12/15  10:52:14  bernadat
- *      Split dev and vendor ids.
- *      [95/11/15            bernadat]
- *
- * Revision 1.1.6.1  1995/02/23  17:22:27  alanl
- *      Taken from DIPC2_SHARED
- *      [1995/01/03  19:09:31  alanl]
- *
- * Revision 1.1.2.1  1994/10/11  18:24:42  rwd
- *      Created.
- *      [1994/10/11  18:15:31  rwd]
- *
- * $EndLog$
- */
-/*
- * Taken from
- *
- *  Copyright (c) 1994 Wolfgang Stanglmeier, Koeln, Germany
- *                     <wolf@dentaro.GUN.de>
- */
-
-#ifndef __PCI_DEVICE_H__
-#define __PCI_DEVICE_H__
-
-/*------------------------------------------------------------
- *
- *  Per driver structure.
- *
- *------------------------------------------------------------
- */
-
-typedef unsigned short pci_vendor_id_t;
-typedef unsigned short pci_dev_id_t;
-
-typedef union {
-       unsigned long cfg1;
-       struct {
-               unsigned char   enable;
-               unsigned char   forward;
-               unsigned short  port;
-       } cfg2;
-} pcici_t;
-
-struct pci_driver {
-       int                 (*probe )(pcici_t pci_ident);/* test whether device
-                                                         *  is present */
-       int                 (*attach)(pcici_t pci_ident);/* setup driver for a
-                                                         *  device */
-       pci_vendor_id_t     vendor_id;                  /* vendor pci id */
-       pci_dev_id_t        device_id;                  /* device pci id */
-       char                *name;                      /* device name */
-       char                *vendor;                    /* device long name */
-       void                (*intr)(int);               /* interupt handler */
-};
-
-/*-----------------------------------------------------------
- *
- *  Per device structure.
- *
- *  It is initialized by the config utility and should live in
- *  "ioconf.c". At the moment there is only one field.
- *
- *  This is a first attempt to include the pci bus to 386bsd.
- *  So this structure may grow ..
- *
- *-----------------------------------------------------------
- */
-
-struct pci_device {
-       struct pci_driver * pd_driver;
-};
-
-/*-----------------------------------------------------------
- *
- *  This functions may be used by drivers to map devices
- *  to virtual and physical addresses. The va and pa
- *  addresses are "in/out" parameters. If they are 0
- *  on entry, the mapping function assigns an address.
- *
- *-----------------------------------------------------------
- */
-
-int pci_map_mem(pcici_t tag,
-    unsigned long entry,
-    vm_offset_t *va,
-    vm_offset_t *pa);
-#endif /*__PCI_DEVICE_H__*/
diff --git a/bsd/dev/arm/pio.h b/bsd/dev/arm/pio.h
deleted file mode 100644 (file)
index 9cbdc65..0000000
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright (c) 2000-2007 AppleInc. All rights reserved.
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * HISTORY
- *
- * Revision 1.2  1998/09/30 21:20:45  wsanchez
- * Merged in IntelMerge1 (mburg: Intel support)
- *
- * Revision 1.1.2.1  1998/09/30 18:18:50  mburg
- * Changes for Intel port
- *
- * Revision 1.1.1.1  1998/03/07 02:25:38  wsanchez
- * Import of OSF Mach kernel (~mburg)
- *
- * Revision 1.1.8.2  1996/07/31  09:46:36  paire
- *      Merged with nmk20b7_shared (1.1.11.2 -> 1.1.11.1)
- *      [96/06/10            paire]
- *
- * Revision 1.1.11.2  1996/06/13  12:38:25  bernadat
- *      Do not use inline macros when MACH_ASSERT is configured.
- *      [96/05/24            bernadat]
- *
- * Revision 1.1.11.1  1996/05/14  13:50:23  paire
- *      Added new linl and loutl __inline__.
- *      Added conditional compilation for [l]{in|oub}[bwl]() __inline__.
- *      [95/11/24            paire]
- *
- * Revision 1.1.8.1  1994/09/23  02:00:28  ezf
- *      change marker to not FREE
- *      [1994/09/22  21:25:52  ezf]
- *
- * Revision 1.1.4.5  1993/08/09  19:40:41  dswartz
- *      Add ANSI prototypes - CR#9523
- *      [1993/08/06  17:45:57  dswartz]
- *
- * Revision 1.1.4.4  1993/06/11  15:17:37  jeffc
- *      CR9176 - ANSI C violations: inb/outb macros must be changed from
- *      ({ ... }) to inline functions, with proper type definitions. Callers
- *      must pass proper types to these functions: 386 I/O port addresses
- *      are unsigned shorts (not pointers).
- *      [1993/06/10  14:26:10  jeffc]
- *
- * Revision 1.1.4.3  1993/06/07  22:09:28  jeffc
- *      CR9176 - ANSI C violations: trailing tokens on CPP
- *      directives, extra semicolons after decl_ ..., asm keywords
- *      [1993/06/07  19:00:26  jeffc]
- *
- * Revision 1.1.4.2  1993/06/04  15:28:45  jeffc
- *      CR9176 - ANSI problems -
- *      Added casts to get macros to take caddr_t as an I/O space address.
- *      [1993/06/04  13:45:55  jeffc]
- *
- * Revision 1.1  1992/09/30  02:25:51  robert
- *      Initial revision
- *
- * $EndLog$
- */
-/* CMU_HIST */
-/*
- * Revision 2.5  91/05/14  16:14:20  mrt
- *      Correcting copyright
- *
- * Revision 2.4  91/02/05  17:13:56  mrt
- *      Changed to new Mach copyright
- *      [91/02/01  17:37:08  mrt]
- *
- * Revision 2.3  90/12/20  16:36:37  jeffreyh
- *      changes for __STDC__
- *      [90/12/07            jeffreyh]
- *
- * Revision 2.2  90/11/26  14:48:41  rvb
- *      Pulled from 2.5
- *      [90/11/22  10:09:38  rvb]
- *
- *      [90/08/14            mg32]
- *
- *      Now we know how types are factor in.
- *      Cleaned up a bunch: eliminated ({ for output and flushed unused
- *      output variables.
- *      [90/08/14            rvb]
- *
- *      This is how its done in gcc:
- *              Created.
- *      [90/03/26            rvb]
- *
- */
-/* CMU_ENDHIST */
-/*
- * Mach Operating System
- * Copyright (c) 1991,1990 Carnegie Mellon University
- * All Rights Reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
- * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
- * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- *  School of Computer Science
- *  Carnegie Mellon University
- *  Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- */
-#ifndef ARM_PIO_H
-#define ARM_PIO_H
-
-typedef unsigned short i386_ioport_t;
-
-/* read a longword */
-extern unsigned long    inl(
-       i386_ioport_t   port);
-/* read a shortword */
-extern unsigned short   inw(
-       i386_ioport_t   port);
-/* read a byte */
-extern unsigned char    inb(
-       i386_ioport_t   port);
-/* write a longword */
-extern void             outl(
-       i386_ioport_t   port,
-       unsigned long   datum);
-/* write a word */
-extern void             outw(
-       i386_ioport_t   port,
-       unsigned short  datum);
-/* write a longword */
-extern void             outb(
-       i386_ioport_t   port,
-       unsigned char   datum);
-
-/* input an array of longwords */
-extern void             linl(
-       i386_ioport_t   port,
-       int             * data,
-       int             count);
-/* output an array of longwords */
-extern void             loutl(
-       i386_ioport_t   port,
-       int             * data,
-       int             count);
-
-/* input an array of words */
-extern void             linw(
-       i386_ioport_t   port,
-       int             * data,
-       int             count);
-/* output an array of words */
-extern void             loutw(
-       i386_ioport_t   port,
-       int             * data,
-       int             count);
-
-/* input an array of bytes */
-extern void             linb(
-       i386_ioport_t   port,
-       char            * data,
-       int             count);
-/* output an array of bytes */
-extern void             loutb(
-       i386_ioport_t   port,
-       char            * data,
-       int             count);
-
-extern __inline__ unsigned long
-inl(
-       i386_ioport_t port)
-{
-       unsigned long datum;
-       __asm__ volatile ("inl %1, %0" : "=a" (datum) : "d" (port));
-       return datum;
-}
-
-extern __inline__ unsigned short
-inw(
-       i386_ioport_t port)
-{
-       unsigned short datum;
-       __asm__ volatile (".byte 0x66; inl %1, %0" : "=a" (datum) : "d" (port));
-       return datum;
-}
-
-extern __inline__ unsigned char
-inb(
-       i386_ioport_t port)
-{
-       unsigned char datum;
-       __asm__ volatile ("inb %1, %0" : "=a" (datum) : "d" (port));
-       return datum;
-}
-
-extern __inline__ void
-outl(
-       i386_ioport_t port,
-       unsigned long datum)
-{
-       __asm__ volatile ("outl %0, %1" : : "a" (datum), "d" (port));
-}
-
-extern __inline__ void
-outw(
-       i386_ioport_t port,
-       unsigned short datum)
-{
-       __asm__ volatile (".byte 0x66; outl %0, %1" : : "a" (datum), "d" (port));
-}
-
-extern __inline__ void
-outb(
-       i386_ioport_t port,
-       unsigned char datum)
-{
-       __asm__ volatile ("outb %0, %1" : : "a" (datum), "d" (port));
-}
-
-#endif /* ARM_PIO_H */
index 938aa048b1f22942f6070cadd12dd9b9eeb5e351..2fa0b7d87c4692e066848b1e972da8fbe8034037 100644 (file)
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)sdt.c      1.6     06/03/24 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
-#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */
 #include <kern/cpu_data.h>
 #include <kern/thread.h>
 #include <mach/thread_status.h>
index 298450d4ea6512e1e284b7bb8d4ca30239c36f8b..a76f54b60e5dfe0fbb57ba65bfd3d1b915a85523 100644 (file)
@@ -7,6 +7,7 @@
  *
  */
 
+#include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/ioctl.h>
@@ -33,7 +34,11 @@ copyoutstr(const void *from, user_addr_t to, size_t maxlen, size_t * lencopied)
 {
        size_t          slen;
        size_t          len;
-       int             error = 0;
+       int             error = copyoutstr_prevalidate(from, to, maxlen);
+
+       if (__improbable(error)) {
+               return error;
+       }
 
        slen = strlen(from) + 1;
        if (slen > maxlen) {
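
The change above threads a prevalidation step through copyoutstr(): the destination range is vetted once, up front, and __improbable() marks the failure arm as cold for the compiler. A minimal sketch of the same shape, where validate_user_range() is a hypothetical stand-in for copyoutstr_prevalidate() and copyout() is the real primitive:

int
copyout_checked(const void *from, user_addr_t to, size_t maxlen)
{
        /* hypothetical range check; returns 0 or an errno */
        int error = validate_user_range(to, maxlen);
        if (__improbable(error)) {
                return error;   /* fail fast, nothing copied */
        }
        return copyout(from, to, maxlen);
}
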
index 5ac5fcde257e2be9e106106cbb99116c0ec775a3..36deb9bff84b60c94c43749a38a8fbebeb863e21 100644 (file)
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/kauth.h>
+#include <sys/bitstring.h>
 
 #include <security/audit/audit.h>
 
+#if CONFIG_MACF
+#include <security/mac_framework.h>
+#endif
+
 #if CONFIG_DTRACE
 extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
 extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
@@ -88,8 +93,8 @@ unix_syscall(
 {
        struct sysent  *callp;
        int             error;
-       unsigned short  code;
-       pid_t           pid;
+       unsigned short  code, syscode;
+       pid_t                   pid;
 
 #if defined(__arm__)
        assert(is_saved_state32(state));
@@ -101,16 +106,15 @@ unix_syscall(
 
 #define unix_syscall_kprintf(x...)     /* kprintf("unix_syscall: " x) */
 
-#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
        if (kdebug_enable && !code_is_kdebug_trace(code)) {
                arm_trace_unix_syscall(code, state);
        }
-#endif
 
        if ((uthread->uu_flag & UT_VFORK))
                proc = current_proc();
 
-       callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];
+       syscode = (code < nsysent) ? code : SYS_invalid;
+       callp   = &sysent[syscode];
 
        /*
         * sy_narg is inaccurate on ARM if a 64 bit parameter is specified. Since user_addr_t
@@ -157,10 +161,22 @@ unix_syscall(
        unix_syscall_kprintf("code %d (pid %d - %s, tid %lld)\n", code,
                        pid, proc->p_comm, thread_tid(current_thread()));
 
+#if CONFIG_MACF
+       if (__improbable(proc->syscall_filter_mask != NULL && !bitstr_test(proc->syscall_filter_mask, syscode))) {
+               error = mac_proc_check_syscall_unix(proc, syscode);
+               if (error)
+                       goto skip_syscall;
+       }
+#endif /* CONFIG_MACF */
+
        AUDIT_SYSCALL_ENTER(code, proc, uthread);
        error = (*(callp->sy_call)) (proc, &uthread->uu_arg[0], &(uthread->uu_rval[0]));
        AUDIT_SYSCALL_EXIT(code, proc, uthread, error);
 
+#if CONFIG_MACF
+skip_syscall:
+#endif /* CONFIG_MACF */
+
        unix_syscall_kprintf("code %d, error %d, results %x, %x (pid %d - %s, tid %lld)\n", code, error, 
                        uthread->uu_rval[0], uthread->uu_rval[1], 
                        pid, get_bsdtask_info(current_task()) ? proc->p_comm : "unknown" , thread_tid(current_thread()));
@@ -194,13 +210,10 @@ unix_syscall(
                 */
                throttle_lowpri_io(1);
        }
-#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
        if (kdebug_enable && !code_is_kdebug_trace(code)) {
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
-                       error, uthread->uu_rval[0], uthread->uu_rval[1], pid, 0);
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+                       error, uthread->uu_rval[0], uthread->uu_rval[1], pid);
        }
-#endif
 
 #if PROC_REF_DEBUG
        if (__improbable(uthread_get_proc_refcount(uthread) != 0)) {
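
Two hardening changes land in this hunk: the syscall number is clamped to SYS_invalid before sysent[] is indexed, and, under CONFIG_MACF, a per-process bit vector (syscall_filter_mask) routes any call whose bit is clear through mac_proc_check_syscall_unix(), which may veto it. The commit tests the vector with bitstr_test() from the newly included <sys/bitstring.h>; a self-contained model of such a bit vector (sizes and names illustrative, not the kernel's) looks like:

#include <stdint.h>

#define NSYSENT_DEMO 600        /* illustrative size, not nsysent */

typedef struct {
        uint8_t bits[(NSYSENT_DEMO + 7) / 8];
} filter_mask_t;

/* nonzero when syscall `code` is allowed by the mask */
static int
mask_test(const filter_mask_t *m, unsigned short code)
{
        return (m->bits[code >> 3] >> (code & 7)) & 1;
}

static void
mask_allow(filter_mask_t *m, unsigned short code)
{
        m->bits[code >> 3] |= (uint8_t)(1u << (code & 7));
}
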
@@ -264,13 +277,10 @@ unix_syscall_return(int error)
                 */
                throttle_lowpri_io(1);
        }
-#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
        if (kdebug_enable && !code_is_kdebug_trace(code)) {
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
-                       error, uthread->uu_rval[0], uthread->uu_rval[1], proc->p_pid, 0);
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+                   error, uthread->uu_rval[0], uthread->uu_rval[1], proc->p_pid);
        }
-#endif
 
        thread_exception_return();
        /* NOTREACHED */
@@ -321,15 +331,14 @@ arm_prepare_u32_syscall_return(struct sysent *callp, arm_saved_state_t *regs, ut
 static void
 arm_trace_u32_unix_syscall(int code, arm_saved_state32_t *regs) 
 {
-       boolean_t indirect = (regs->save_r12 == 0);
-       if (indirect)
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
-                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                       regs->save_r1, regs->save_r2, regs->save_r3, regs->save_r4, 0);
-       else
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
-                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                       regs->save_r0, regs->save_r1, regs->save_r2, regs->save_r3, 0);
+       bool indirect = (regs->save_r12 == 0);
+       if (indirect) {
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+                   regs->save_r1, regs->save_r2, regs->save_r3, regs->save_r4);
+       } else {
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+                   regs->save_r0, regs->save_r1, regs->save_r2, regs->save_r3);
+       }
 }
 
 static void
@@ -597,7 +606,7 @@ arm_prepare_u64_syscall_return(struct sysent *callp, arm_saved_state_t *regs, ut
        arm_saved_state64_t *ss64 = saved_state64(regs);
 
        if (error == ERESTART) {
-               ss64->pc -= 4;
+               add_saved_state_pc(regs, -4);
        } else if (error != EJUSTRETURN) {
                if (error) {
                        ss64->x[0] = error;
@@ -642,15 +651,14 @@ arm_prepare_u64_syscall_return(struct sysent *callp, arm_saved_state_t *regs, ut
 static void
 arm_trace_u64_unix_syscall(int code, arm_saved_state64_t *regs) 
 {
-       boolean_t indirect = (regs->x[ARM64_SYSCALL_CODE_REG_NUM] == 0);
-       if (indirect)
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
-                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                       regs->x[1], regs->x[2], regs->x[3], regs->x[4], 0);
-       else
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
-                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                       regs->x[0], regs->x[1], regs->x[2], regs->x[3], 0);
+       bool indirect = (regs->x[ARM64_SYSCALL_CODE_REG_NUM] == 0);
+       if (indirect) {
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+                   regs->x[1], regs->x[2], regs->x[3], regs->x[4]);
+       } else {
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+                   regs->x[0], regs->x[1], regs->x[2], regs->x[3]);
+       }
 }
 
 static void
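
Both trace helpers special-case the indirect form, syscall(2): when the syscall-code register is zero, the real code sits in the first argument register and every user argument shifts over by one, which is why the indirect branch traces r1..r4 / x1..x4 instead of r0..r3 / x0..x3. The convention in one helper, sketched with the register layout visible in the diff (the function itself is hypothetical):

static inline const uint64_t *
first_syscall_arg(const arm_saved_state64_t *regs)
{
        /* a zero code register marks an indirect syscall: x0 carries
         * the code, so user arguments start at x1 rather than x0 */
        bool indirect = (regs->x[ARM64_SYSCALL_CODE_REG_NUM] == 0);
        return indirect ? &regs->x[1] : &regs->x[0];
}
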
diff --git a/bsd/dev/arm/table_inline.h b/bsd/dev/arm/table_inline.h
deleted file mode 100644 (file)
index f599613..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
- */
-/*
- * Copyright (c) 1992 NeXT Computer, Inc.
- *
- * Intel386 Family:    Selector based access to descriptor tables.
- *
- * HISTORY
- *
- * 2 April 1992 ? at NeXT
- *     Created.
- */
-
-#include <architecture/i386/table.h>
-
-#include <machdep/i386/gdt.h>
-#include <machdep/i386/idt.h>
-
-static inline gdt_entry_t *
-sel_to_gdt_entry(sel_t sel)
-{
-       return &gdt[sel.index];
-}
-
-static inline idt_entry_t *
-sel_to_idt_entry(sel_t sel)
-{
-       return &idt[sel.index];
-}
-
-static inline ldt_entry_t *
-sel_to_ldt_entry(ldt_t *tbl, sel_t sel)
-{
-       return &tbl[sel.index];
-}
index 12d7b69f7dddf180d2d0b51a1f9abf376131e642..1e3bb03f9b356accc214bb9403d672305719a71e 100644 (file)
@@ -24,6 +24,7 @@
 #include <arm/proc_reg.h>
 
 #include <kern/assert.h>
+#include <kern/ast.h>
 #include <pexpert/pexpert.h>
 
 extern struct arm_saved_state *get_user_regs(thread_t);
@@ -264,7 +265,8 @@ sendsig(
        user_addr_t catcher,
        int sig,
        int mask,
-       __unused uint32_t code
+       __unused uint32_t code,
+       sigset_t siginfo
        )
 {
        union {
@@ -300,7 +302,7 @@ sendsig(
        bzero(&ts, sizeof(ts));
        bzero(&user_frame, sizeof(user_frame));
 
-       if (p->p_sigacts->ps_siginfo & sigmask(sig)) {
+       if (siginfo & sigmask(sig)) {
                infostyle = UC_FLAVOR;
        } else {
                infostyle = UC_TRAD;
@@ -409,6 +411,30 @@ sendsig(
                break;
 
        case SIGFPE:
+               switch (ut->uu_code) {
+               case EXC_ARM_FP_UF:
+                       sinfo.si_code = FPE_FLTUND;
+                       break;
+               case EXC_ARM_FP_OF:
+                       sinfo.si_code = FPE_FLTOVF;
+                       break;
+               case EXC_ARM_FP_IO:
+                       sinfo.si_code = FPE_FLTINV;
+                       break;
+               case EXC_ARM_FP_DZ:
+                       sinfo.si_code = FPE_FLTDIV;
+                       break;
+               case EXC_ARM_FP_ID:
+                       sinfo.si_code = FPE_FLTINV;
+                       break;
+               case EXC_ARM_FP_IX:
+                       sinfo.si_code = FPE_FLTRES;
+                       break;
+               default:
+                       sinfo.si_code = FPE_NOOP;
+                       break;
+               }
+
                break;
 
        case SIGBUS:
@@ -730,6 +756,9 @@ sigreturn(
        th_act = current_thread();
        ut = (struct uthread *) get_bsdthread_info(th_act);
 
+       /* see osfmk/kern/restartable.c */
+       act_set_ast_reset_pcs(th_act);
+
        if (proc_is64bit_data(p)) {
 #if defined(__arm64__)
                error = sigreturn_copyin_ctx64(&uctx.uc64, &mctx.mc64, uap->uctx);
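
Earlier in this file, the SIGFPE case now derives si_code from the Mach exception subcode left in uu_code, mapping each ARM floating-point exception to its POSIX counterpart. The same mapping as a standalone function (codes taken from the hunk above; the function form is only an illustration):

static int
fp_exception_si_code(int exc_code)
{
        switch (exc_code) {
        case EXC_ARM_FP_UF: return FPE_FLTUND;  /* underflow */
        case EXC_ARM_FP_OF: return FPE_FLTOVF;  /* overflow */
        case EXC_ARM_FP_IO: return FPE_FLTINV;  /* invalid operation */
        case EXC_ARM_FP_DZ: return FPE_FLTDIV;  /* divide by zero */
        case EXC_ARM_FP_ID: return FPE_FLTINV;  /* input denormal */
        case EXC_ARM_FP_IX: return FPE_FLTRES;  /* inexact */
        default:            return FPE_NOOP;
        }
}
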
index e40f4340a4c8f2771eab836daa65c91bf16fc6aa..06062ce28b7e32dbc7a2ccc70836f5c286f5daa5 100644 (file)
@@ -215,12 +215,7 @@ struct cdevsw cdevsw[] = {
                kmioctl, nullstop, nullreset, km_tty, ttselect,
                eno_mmap, eno_strat, eno_getc, eno_putc, 0
        },
-       [13 ... 41] = NO_CDEVICE,
-       [42] = {
-               volopen, volclose, eno_rdwrt, eno_rdwrt,
-               volioctl, eno_stop, eno_reset, 0, (select_fcn_t *) seltrue,
-               eno_mmap, eno_strat, eno_getc, eno_putc, 0
-       }
+       [13 ... 42] = NO_CDEVICE,
 };
 const int nchrdev = sizeof(cdevsw) / sizeof(cdevsw[0]);
 
@@ -237,7 +232,7 @@ isdisk(dev_t dev, int type)
 
        switch (type) {
        case VCHR:
-               maj = chrtoblk(maj);
+               maj = chrtoblk(dev);
                if (maj == NODEV) {
                        break;
                }
@@ -251,32 +246,7 @@ isdisk(dev_t dev, int type)
        return 0;
 }
 
-static int      chrtoblktab[] = {
-       /* CHR *//* BLK *//* CHR *//* BLK */
-       /* 0 */ NODEV, /* 1 */ NODEV,
-       /* 2 */ NODEV, /* 3 */ NODEV,
-       /* 4 */ NODEV, /* 5 */ NODEV,
-       /* 6 */ NODEV, /* 7 */ NODEV,
-       /* 8 */ NODEV, /* 9 */ NODEV,
-       /* 10 */ NODEV, /* 11 */ NODEV,
-       /* 12 */ NODEV, /* 13 */ NODEV,
-       /* 14 */ NODEV, /* 15 */ NODEV,
-       /* 16 */ NODEV, /* 17 */ NODEV,
-       /* 18 */ NODEV, /* 19 */ NODEV,
-       /* 20 */ NODEV, /* 21 */ NODEV,
-       /* 22 */ NODEV, /* 23 */ NODEV,
-       /* 24 */ NODEV, /* 25 */ NODEV,
-       /* 26 */ NODEV, /* 27 */ NODEV,
-       /* 28 */ NODEV, /* 29 */ NODEV,
-       /* 30 */ NODEV, /* 31 */ NODEV,
-       /* 32 */ NODEV, /* 33 */ NODEV,
-       /* 34 */ NODEV, /* 35 */ NODEV,
-       /* 36 */ NODEV, /* 37 */ NODEV,
-       /* 38 */ NODEV, /* 39 */ NODEV,
-       /* 40 */ NODEV, /* 41 */ NODEV,
-       /* 42 */ NODEV, /* 43 */ NODEV,
-       /* 44 */ NODEV,
-};
+static int chrtoblktab[] = {[0 ... nchrdev] = NODEV };
 
 /*
  * convert chr dev to blk dev
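
Both tables in this file collapse onto GNU C range designators: [13 ... 42] = NO_CDEVICE fills the tail of cdevsw, and [0 ... nchrdev] = NODEV sizes chrtoblktab to cover indices 0 through nchrdev inclusive. The hunk also fixes isdisk() to pass the full dev_t to chrtoblk() rather than the already-extracted major number. A standalone illustration of the range-designator extension:

/* GNU C range designators: initialize whole spans in one stroke */
static int demo[8] = { [0 ... 3] = -1, [4 ... 7] = 0 };
/* demo == { -1, -1, -1, -1, 0, 0, 0, 0 } */
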
index 86d892aa35bc27a4742ad16ec9fa1d3d7e445ddc..35f317a445e08bb62ff15972526450c87a4f6947 100644 (file)
@@ -221,6 +221,26 @@ L_post_initial_offset:
        add     w7, w7, w9
 1:
 
+/*
+ *             if ((uintptr_t)data & 4) {
+ *                     if (mlen < 4)
+ *                             goto L2_bytes;
+ *                     partial += *(uint32_t *)(void *)data;
+ *                     data += 4;
+ *                     mlen -= 4;
+ *             }
+ */
+       // align on 8-bytes boundary if applicable
+       tst     data, #4
+       b.eq    1f
+       cmp     mlen, #4
+       b.lt    L2_bytes
+       ldr     w9, [data], #4
+       sub     mlen, mlen, #4
+       adds    w7, w7, w9
+       adc     x7, x7, x10 // assumes x10 still is #0 as set above
+1:
+
 /*
  *             while (mlen >= 64) {
  *                     __builtin_prefetch(data + 32);
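
The added prologue aligns the running pointer to an 8-byte boundary before the 64-byte main loop, using an adds/adc pair so the carry out of the 32-bit add is folded into the 64-bit partial rather than dropped (x10 is relied on to still be zero). Once the loop ends, a ones'-complement partial of this kind is conventionally folded back to 16 bits; a standard idiom, not code from this file:

static inline uint16_t
csum_fold(uint64_t partial)
{
        /* repeatedly add the high half into the low half; each pass
         * re-absorbs the carry the addition produces */
        partial = (partial >> 32) + (partial & 0xffffffffu);
        partial = (partial >> 32) + (partial & 0xffffffffu);
        partial = (partial >> 16) + (partial & 0xffffu);
        partial = (partial >> 16) + (partial & 0xffffu);
        return (uint16_t)partial;
}
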
index 48bf43cb82b4a52134194f80963938496cd39ab5..c9cb735821deb74b931c5866a05746783ecf1901 100644 (file)
 /*
- * Copyright (c) 2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2017-2018 Apple Inc. All rights reserved.
  *
- * Disassemblers for ARM (arm), Thumb (thumb16), and Thumb2 (thumb32).
- *
- * Each disassembly begins with a call to dtrace_decode_arm or dtrace_decode_thumb. The thumb
- * decoder will then call dtrace_decode_thumb16 or dtrace_decode_thumb32 as appropriate.
- *
- * The respective disassembly functions are all of the form {arm,thumb16,thumb32}_type. They
- * follow the ordering and breakdown in the ARMv7 Architecture Reference Manual.
- */
-
-#include  <sys/fasttrap_isa.h>
-
-#define BITS(x, n, mask) (((x) >> (n)) & (mask))
-
-static uint32_t
-thumb32_instword_to_arm(uint16_t hw1, uint16_t hw2)
-{
-       return (hw1 << 16) | hw2;
-}
-
-int dtrace_decode_arm(uint32_t instr);
-int dtrace_decode_arm64(uint32_t instr);
-int dtrace_decode_thumb(uint32_t instr);
-
-/*
- * VFP decoder - shared between ARM and THUMB32 mode
- */
-
-static
-int
-vfp_struct_loadstore(uint32_t instr)
-{
-       if (ARM_RM(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-vfp_64transfer(uint32_t instr)
-{
-       /* These instructions all use RD and RN */
-       if (ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-vfp_transfer(uint32_t instr)
-{
-       /* These instructions all use RD only */
-       if (ARM_RD(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-vfp_loadstore(uint32_t instr)
-{
-       int opcode = BITS(instr, 20, 0x1F);
-
-       /* Instrument VLDR */
-       if ((opcode & 0x13) == 0x11 && ARM_RN(instr) == REG_PC) {
-               return FASTTRAP_T_VLDR_PC_IMMED;
-       }
-
-       /* These instructions all use RN only */
-       if (ARM_RN(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-/*
- * ARM decoder
- */
-
-static
-int
-arm_unconditional_misc(uint32_t instr)
-{
-       int op = BITS(instr, 20, 0x7F);
-
-       if ((op & 0x60) == 0x20) {
-               /* VFP data processing uses its own registers */
-               return FASTTRAP_T_COMMON;
-       }
-
-       if ((op & 0x71) == 0x40) {
-               return vfp_struct_loadstore(instr);
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_unconditional(uint32_t instr)
-{
-       if (BITS(instr, 27, 0x1) == 0) {
-               return arm_unconditional_misc(instr);
-       }
-
-       /* The rest are privileged or BL/BLX, do not instrument */
-
-       /* Do not need to instrument BL/BLX either, see comment in arm_misc(uint32_t) */
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_syscall_coproc(uint32_t instr)
-{
-       /* Instrument any VFP data processing instructions, ignore the rest */
-
-       int op1 = BITS(instr, 20, 0x3F), coproc = BITS(instr, 8, 0xF), op = BITS(instr, 4, 0x1);
-
-       if ((op1 & 0x3E) == 0 || (op1 & 0x30) == 0x30) {
-               /* Undefined or swi */
-               return FASTTRAP_T_INV;
-       }
-
-       if ((coproc & 0xE) == 0xA) {
-               /* VFP instruction */
-
-               if ((op1 & 0x20) == 0 && (op1 & 0x3A) != 0) {
-                       return vfp_loadstore(instr);
-               }
-
-               if ((op1 & 0x3E) == 0x04) {
-                       return vfp_64transfer(instr);
-               }
-
-               if ((op1 & 0x30) == 0x20) {
-                       /* VFP data processing or 8, 16, or 32 bit move between ARM reg and VFP reg */
-                       if (op == 0) {
-                               /* VFP data processing uses its own registers */
-                               return FASTTRAP_T_COMMON;
-                       } else {
-                               return vfp_transfer(instr);
-                       }
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_branch_link_blockdata(uint32_t instr)
-{
-       int branch = BITS(instr, 25, 0x1), link = BITS(instr, 24, 0x1), op = BITS(instr, 20, 0x1F), uses_pc = BITS(instr, 15, 0x1), uses_lr = BITS(instr, 14, 0x1);
-
-       if (branch == 1) {
-               if (link == 0) {
-                       return FASTTRAP_T_B_COND;
-               }
-               return FASTTRAP_T_INV;
-       } else {
-               /* Only emulate a use of the pc if it's a return from function: ldmia sp!, { ... pc } */
-               if (op == 0x0B && ARM_RN(instr) == REG_SP && uses_pc == 1) {
-                       return FASTTRAP_T_LDM_PC;
-               }
-
-               /* stmia sp!, { ... lr } doesn't touch the pc, but it is very common, so special case it */
-               if (op == 0x12 && ARM_RN(instr) == REG_SP && uses_lr == 1) {
-                       return FASTTRAP_T_STM_LR;
-               }
-
-               if (ARM_RN(instr) != REG_PC && uses_pc == 0) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_signed_multiplies(uint32_t instr)
-{
-       int op1 = BITS(instr, 20, 0x7), op2 = BITS(instr, 5, 0x7);
-
-       /* smlald, smlsld, smmls use RD in addition to RM, RS, and RN */
-       if ((op1 == 0x4 && (op2 & 0x4) == 0) || (op1 == 0x5 && (op2 & 0x6) == 0x6)) {
-               if (ARM_RD(instr) == REG_PC) {
-                       return FASTTRAP_T_INV;
-               }
-       }
-
-       if (ARM_RM(instr) != REG_PC && ARM_RS(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_pack_unpack_sat_reversal(uint32_t instr)
-{
-       int op1 = BITS(instr, 20, 0x7), op2 = BITS(instr, 5, 0x7);
-
-       /* pkh, sel use RN in addition to RD and RM */
-       if ((op1 == 0 && (op2 & 0x1) == 0) || (op1 == 0 && op2 == 0x5)) {
-               if (ARM_RN(instr) == REG_PC) {
-                       return FASTTRAP_T_INV;
-               }
-       }
-
-       if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_parallel_addsub_unsigned(uint32_t instr)
-{
-       if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_parallel_addsub_signed(uint32_t instr)
-{
-       if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_media(uint32_t instr)
-{
-       int op1 = BITS(instr, 20, 0x1F), op2 = BITS(instr, 5, 0x7);
-
-       if ((op1 & 0x1C) == 0) {
-               return arm_parallel_addsub_signed(instr);
-       }
-
-       if ((op1 & 0x1C) == 0x04) {
-               return arm_parallel_addsub_unsigned(instr);
-       }
-
-       if ((op1 & 0x18) == 0x08) {
-               return arm_pack_unpack_sat_reversal(instr);
-       }
-
-       if ((op1 & 0x18) == 0x10) {
-               return arm_signed_multiplies(instr);
-       }
-
-       if (op1 == 0x1F && op2 == 0x7) {
-               /* Undefined instruction */
-               return FASTTRAP_T_INV;
-       }
-
-       if (op1 == 0x18 && op2 == 0) {
-               /* usad8 usada8 */
-               /* The registers are named differently in the reference manual for this instruction
-                * but the following positions are correct */
-
-               if (ARM_RM(instr) != REG_PC && ARM_RS(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-
-               return FASTTRAP_T_INV;
-       }
-
-       if ((op1 & 0x1E) == 0x1C && (op2 & 0x3) == 0) {
-               /* bfc bfi */
-               if (ARM_RD(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-
-               return FASTTRAP_T_INV;
-       }
-
-       if (((op1 & 0x1E) == 0x1A || (op1 & 0x1E) == 0x1E) && ((op2 & 0x3) == 0x2)) {
-               /* sbfx ubfx */
-               if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-
-               return FASTTRAP_T_INV;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_loadstore_wordbyte(uint32_t instr)
-{
-       /* Instrument PC relative load with immediate, ignore any other uses of the PC */
-       int R = BITS(instr, 25, 0x1), L = BITS(instr, 20, 0x1);
-
-       if (R == 1) {
-               /* Three register load/store */
-               if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       } else {
-               /* Immediate load/store, but still do not support ldr pc, [pc...] */
-               if (L == 1 && ARM_RN(instr) == REG_PC && ARM_RD(instr) != REG_PC) {
-                       return FASTTRAP_T_LDR_PC_IMMED;
-               }
-
-               if (ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_saturating(uint32_t instr)
-{
-       if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_misc(uint32_t instr)
-{
-       int op = BITS(instr, 21, 0x3), __unused op1 = BITS(instr, 16, 0xF), op2 = BITS(instr, 4, 0x7);
-
-       if (op2 == 1 && op == 1) {
-               return FASTTRAP_T_BX_REG;
-       }
-
-       /* We do not need to emulate BLX for entry/return probes; if we eventually support full offset
-        * tracing, then we will. This is because BLX overwrites the link register, so a function that
-        * can execute this as its first instruction is a special function indeed.
-        */
-
-       if (op2 == 0x5) {
-               return arm_saturating(instr);
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_msr_hints(__unused uint32_t instr)
-{
-       /* These deal with the psr, not instrumented */
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_sync_primitive(__unused uint32_t instr)
-{
-       /* TODO will instrumenting these interfere with any kernel usage of these instructions? */
-       /* Don't instrument for now */
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_extra_loadstore_unpriv(uint32_t instr)
-{
-       int op = BITS(instr, 20, 0x1), __unused op2 = BITS(instr, 5, 0x3), immed = BITS(instr, 22, 0x1);
-
-       if (op == 0 && (op2 & 0x2) == 0x2) {
-               /* Unpredictable or undefined */
-               return FASTTRAP_T_INV;
-       }
-
-       if (immed == 1) {
-               if (ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       } else {
-               if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_extra_loadstore(uint32_t instr)
-{
-       int op1 = BITS(instr, 20, 0x1F);
-
-       /* There are two variants, and we do not instrument either of them that use the PC */
-
-       if ((op1 & 0x4) == 0) {
-               /* Variant 1, register */
-               if (ARM_RM(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       } else {
-               /* Variant 2, immediate */
-               if (ARM_RD(instr) != REG_PC && ARM_RN(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_halfword_multiply(uint32_t instr)
-{
-       /* Not all multiply instructions use all four registers. The ones that don't should have those
-        * register locations set to 0, so we can test them anyway.
-        */
-
-       if (ARM_RN(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RS(instr) != REG_PC && ARM_RM(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_multiply(uint32_t instr)
-{
-       /* Not all multiply instructions use all four registers. The ones that don't should have those
-        * register locations set to 0, so we can test them anyway.
-        */
-
-       if (ARM_RN(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RS(instr) != REG_PC && ARM_RM(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_dataproc_immed(uint32_t instr)
-{
-       /* All these instructions are either two registers, or one register and have 0 where the other reg would be used */
-       if (ARM_RN(instr) != REG_PC && ARM_RD(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_dataproc_regshift(uint32_t instr)
-{
-       /* All these instructions are either four registers, or three registers and have 0 where the last reg would be used */
-       if (ARM_RN(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RS(instr) != REG_PC && ARM_RM(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_dataproc_reg(uint32_t instr)
-{
-       int op1 = BITS(instr, 20, 0x1F), op2 = BITS(instr, 7, 0x1F), op3 = BITS(instr, 5, 0x3);
-
-       if (op1 == 0x11 || op1 == 0x13 || op1 == 0x15 || op1 == 0x17) {
-               /* These are comparison flag setting instructions and do not have RD */
-               if (ARM_RN(instr) != REG_PC && ARM_RM(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-
-               return FASTTRAP_T_INV;
-       }
-
-       /* The rest can, in theory, write or use the PC. The only one we instrument is mov pc, reg.
-        * movs pc, reg is a privileged instruction so we don't instrument that variant. The s bit
-        * is bit 0 of op1 and should be zero.
-        */
-       if (op1 == 0x1A && op2 == 0 && op3 == 0 && ARM_RD(instr) == REG_PC) {
-               return FASTTRAP_T_MOV_PC_REG;
-       }
-
-       /* Any instruction at this point is a three register instruction or two register instruction with RN = 0 */
-       if (ARM_RN(instr) != REG_PC && ARM_RD(instr) != REG_PC && ARM_RM(instr) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-arm_dataproc_misc(uint32_t instr)
-{
-       int op = BITS(instr, 25, 0x1), op1 = BITS(instr, 20, 0x1F), op2 = BITS(instr, 4, 0xF);
-
-       if (op == 0) {
-               if ((op1 & 0x19) != 0x10 && (op2 & 0x1) == 0) {
-                       return arm_dataproc_reg(instr);
-               }
-
-               if ((op1 & 0x19) != 0x10 && (op2 & 0x9) == 0x1) {
-                       return arm_dataproc_regshift(instr);
-               }
-
-               if ((op1 & 0x19) == 0x10 && (op2 & 0x8) == 0) {
-                       return arm_misc(instr);
-               }
-
-               if ((op1 & 0x19) == 0x19 && (op2 & 0x9) == 0x8) {
-                       return arm_halfword_multiply(instr);
-               }
-
-               if ((op1 & 0x10) == 0 && op2 == 0x9) {
-                       return arm_multiply(instr);
-               }
-
-               if ((op1 & 0x10) == 0x10 && op2 == 0x9) {
-                       return arm_sync_primitive(instr);
-               }
-
-               if ((op1 & 0x12) != 0x02 && (op2 == 0xB || (op2 & 0xD) == 0xD)) {
-                       return arm_extra_loadstore(instr);
-               }
-
-               if ((op1 & 0x12) == 0x02 && (op2 == 0xB || (op2 & 0xD) == 0xD)) {
-                       return arm_extra_loadstore_unpriv(instr);
-               }
-       } else {
-               if ((op1 & 0x19) != 0x10) {
-                       return arm_dataproc_immed(instr);
-               }
-
-               if (op1 == 0x10) {
-                       /* 16 bit immediate load (mov (immed)) [encoding A2] */
-                       if (ARM_RD(instr) != REG_PC) {
-                               return FASTTRAP_T_COMMON;
-                       }
-
-                       return FASTTRAP_T_INV;
-               }
-
-               if (op1 == 0x14) {
-                       /* high halfword 16 bit immediate load (movt) [encoding A1] */
-                       if (ARM_RD(instr) != REG_PC) {
-                               return FASTTRAP_T_COMMON;
-                       }
-
-                       return FASTTRAP_T_INV;
-               }
-
-               if ((op1 & 0x1B) == 0x12) {
-                       return arm_msr_hints(instr);
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-int
-dtrace_decode_arm(uint32_t instr)
-{
-       int cond = BITS(instr, 28, 0xF), op1 = BITS(instr, 25, 0x7), op = BITS(instr, 4, 0x1);
-
-       if (cond == 0xF) {
-               return arm_unconditional(instr);
-       }
-
-       if ((op1 & 0x6) == 0) {
-               return arm_dataproc_misc(instr);
-       }
-
-       if (op1 == 0x2) {
-               return arm_loadstore_wordbyte(instr);
-       }
-
-       if (op1 == 0x3 && op == 0) {
-               return arm_loadstore_wordbyte(instr);
-       }
-
-       if (op1 == 0x3 && op == 1) {
-               return arm_media(instr);
-       }
-
-       if ((op1 & 0x6) == 0x4) {
-               return arm_branch_link_blockdata(instr);
-       }
-
-       if ((op1 & 0x6) == 0x6) {
-               return arm_syscall_coproc(instr);
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-/*
- * Thumb 16-bit decoder
- */
-
-static
-int
-thumb16_cond_supervisor(uint16_t instr)
-{
-       int opcode = BITS(instr, 8, 0xF);
-
-       if ((opcode & 0xE) != 0xE) {
-               return FASTTRAP_T_B_COND;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb16_misc(uint16_t instr)
-{
-       int opcode = BITS(instr, 5, 0x7F);
-
-       if ((opcode & 0x70) == 0x30 || (opcode & 0x70) == 0x70) {
-               /* setend, cps, breakpoint, or if-then, not instrumentable */
-               return FASTTRAP_T_INV;
-       } else if ((opcode & 0x78) == 0x28) {
-               /* Doesn't modify pc, but this happens a lot so make this a special case for emulation */
-               return FASTTRAP_T_PUSH_LR;
-       } else if ((opcode & 0x78) == 0x68) {
-               return FASTTRAP_T_POP_PC;
-       } else if ((opcode & 0x28) == 0x08) {
-               return FASTTRAP_T_CB_N_Z;
-       }
-
-       /* All other instructions work on low regs only and are instrumentable */
-       return FASTTRAP_T_COMMON;
-}
-
-static
-int
-thumb16_loadstore_single(__unused uint16_t instr)
-{
-       /* These all access the low registers or SP only */
-       return FASTTRAP_T_COMMON;
-}
-
-static
-int
-thumb16_data_special_and_branch(uint16_t instr)
-{
-       int opcode = BITS(instr, 6, 0xF);
-
-       if (opcode == 0x4) {
-               /* Unpredictable */
-               return FASTTRAP_T_INV;
-       } else if ((opcode & 0xC) == 0xC) {
-               /* bx or blx */
-               /* Only instrument the bx */
-               if ((opcode & 0x2) == 0) {
-                       return FASTTRAP_T_BX_REG;
-               }
-               return FASTTRAP_T_INV;
-       } else {
-               /* Data processing on high registers, only instrument mov pc, reg */
-               if ((opcode & 0xC) == 0x8 && THUMB16_HRD(instr) == REG_PC) {
-                       return FASTTRAP_T_CPY_PC;
-               }
-
-               if (THUMB16_HRM(instr) != REG_PC && THUMB16_HRD(instr) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb16_data_proc(__unused uint16_t instr)
-{
-       /* These all access the low registers only */
-       return FASTTRAP_T_COMMON;
-}
-
-static
-int
-thumb16_shift_addsub_move_compare(__unused uint16_t instr)
-{
-       /* These all access the low registers only */
-       return FASTTRAP_T_COMMON;
-}
-
-static
-int
-dtrace_decode_thumb16(uint16_t instr)
-{
-       int opcode = BITS(instr, 10, 0x3F);
-
-       if ((opcode & 0x30) == 0) {
-               return thumb16_shift_addsub_move_compare(instr);
-       }
-
-       if (opcode == 0x10) {
-               return thumb16_data_proc(instr);
-       }
-
-       if (opcode == 0x11) {
-               return thumb16_data_special_and_branch(instr);
-       }
-
-       if ((opcode & 0x3E) == 0x12) {
-               /* ldr (literal) */
-               return FASTTRAP_T_LDR_PC_IMMED;
-       }
-
-       if ((opcode & 0x3C) == 0x14 || (opcode & 0x38) == 0x18 || (opcode & 0x38) == 0x20) {
-               return thumb16_loadstore_single(instr);
-       }
-
-       if ((opcode & 0x3E) == 0x28) {
-               /* adr, uses the pc */
-               return FASTTRAP_T_INV;
-       }
-
-       if ((opcode & 0x3E) == 0x2A) {
-               /* add (sp plus immediate) */
-               return FASTTRAP_T_COMMON;
-       }
-
-       if ((opcode & 0x3C) == 0x2C) {
-               return thumb16_misc(instr);
-       }
-
-       if ((opcode & 0x3E) == 0x30) {
-               /* stm - can't access high registers */
-               return FASTTRAP_T_COMMON;
-       }
-
-       if ((opcode & 0x3E) == 0x32) {
-               /* ldm - can't access high registers */
-               return FASTTRAP_T_COMMON;
-       }
-
-       if ((opcode & 0x3C) == 0x34) {
-               return thumb16_cond_supervisor(instr);
-       }
-
-       if ((opcode & 0x3E) == 0x38) {
-               /* b unconditional */
-               return FASTTRAP_T_B_UNCOND;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-/*
- * Thumb 32-bit decoder
+ * Disassemblers for ARM64 (AArch64)
  */
 
-static
-int
-thumb32_coproc(uint16_t instr1, uint16_t instr2)
-{
-       /* Instrument any VFP data processing instructions, ignore the rest */
-
-       int op1 = BITS(instr1, 4, 0x3F), coproc = BITS(instr2, 8, 0xF), op = BITS(instr2, 4, 0x1);
-
-       if ((op1 & 0x3E) == 0) {
-               /* Undefined */
-               return FASTTRAP_T_INV;
-       }
-
-       if ((coproc & 0xE) == 0xA || (op1 & 0x30) == 0x30) {
-               /* VFP instruction */
-               uint32_t instr = thumb32_instword_to_arm(instr1, instr2);
-
-               if ((op1 & 0x30) == 0x30) {
-                       /* VFP data processing uses its own registers */
-                       return FASTTRAP_T_COMMON;
-               }
-
-               if ((op1 & 0x3A) == 0x02 || (op1 & 0x38) == 0x08 || (op1 & 0x30) == 0x10) {
-                       return vfp_loadstore(instr);
-               }
-
-               if ((op1 & 0x3E) == 0x04) {
-                       return vfp_64transfer(instr);
-               }
-
-               if ((op1 & 0x30) == 0x20) {
-                       /* VFP data processing or 8, 16, or 32 bit move between ARM reg and VFP reg */
-                       if (op == 0) {
-                               /* VFP data processing uses its own registers */
-                               return FASTTRAP_T_COMMON;
-                       } else {
-                               return vfp_transfer(instr);
-                       }
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_longmultiply(uint16_t instr1, uint16_t instr2)
-{
-       int op1 = BITS(instr1, 4, 0x7), op2 = BITS(instr2, 4, 0xF);
-
-       if ((op1 == 1 && op2 == 0xF) || (op1 == 0x3 && op2 == 0xF)) {
-               /* Three register instruction */
-               if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       } else {
-               /* Four register instruction */
-               if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC &&
-                   THUMB32_RT(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_multiply(uint16_t instr1, uint16_t instr2)
-{
-       int op1 = BITS(instr1, 4, 0x7), op2 = BITS(instr2, 4, 0x3);
-
-       if ((op1 == 0 && op2 == 1) || (op1 == 0x6 && (op2 & 0x2) == 0)) {
-               if (THUMB32_RT(instr1, instr2) == REG_PC) {
-                       return FASTTRAP_T_INV;
-               }
-       }
-
-       if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_misc(uint16_t instr1, uint16_t instr2)
-{
-       if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_parallel_addsub_unsigned(uint16_t instr1, uint16_t instr2)
-{
-       if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_parallel_addsub_signed(uint16_t instr1, uint16_t instr2)
-{
-       if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_dataproc_reg(uint16_t instr1, uint16_t instr2)
-{
-       int op1 = BITS(instr1, 4, 0xF), op2 = BITS(instr2, 4, 0xF);
-
-       if (((0 <= op1) && (op1 <= 5)) && (op2 & 0x8) == 0x8) {
-               if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       if ((op1 & 0x8) == 0 && op2 == 0) {
-               if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       if ((op1 & 0x8) == 0x8 && (op2 & 0xC) == 0) {
-               return thumb32_parallel_addsub_signed(instr1, instr2);
-       }
-
-       if ((op1 & 0x8) == 0x8 && (op2 & 0xC) == 0x4) {
-               return thumb32_parallel_addsub_unsigned(instr1, instr2);
-       }
-
-       if ((op1 & 0xC) == 0x8 && (op2 & 0xC) == 0x8) {
-               return thumb32_misc(instr1, instr2);
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_dataproc_regshift(uint16_t instr1, uint16_t instr2)
-{
-       int op = BITS(instr1, 5, 0xF), S = BITS(instr1, 4, 0x1);
-
-       if (op == 0 || op == 0x4 || op == 0x8 || op == 0xD) {
-               /* These become test instructions if S is 1 and Rd is PC, otherwise they are data instructions. */
-               if (S == 1) {
-                       if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-                               return FASTTRAP_T_COMMON;
-                       }
-               } else {
-                       if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC &&
-                           THUMB32_RN(instr1, instr2) != REG_PC) {
-                               return FASTTRAP_T_COMMON;
-                       }
-               }
-       } else if (op == 0x2 || op == 0x3) {
-               /* These become moves if RN is PC, otherwise they are data insts. We don't instrument mov pc, reg here */
-               if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       } else {
-               /* Normal three register instruction */
-               if (THUMB32_RM(instr1, instr2) != REG_PC && THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_store_single(uint16_t instr1, uint16_t instr2)
-{
-       int op1 = BITS(instr1, 5, 0x7), op2 = BITS(instr2, 6, 0x3F);
-
-       /* Do not support any use of the pc yet */
-       if ((op1 == 0 || op1 == 1 || op1 == 2) && (op2 & 0x20) == 0) {
-               /* str (register) uses RM */
-               if (THUMB32_RM(instr1, instr2) == REG_PC) {
-                       return FASTTRAP_T_INV;
-               }
-       }
-
-       if (THUMB32_RT(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_loadbyte_memhint(uint16_t instr1, uint16_t instr2)
-{
-       int op1 = BITS(instr1, 7, 0x3), __unused op2 = BITS(instr2, 6, 0x3F);
-
-       /* Do not support any use of the pc yet */
-       if ((op1 == 0 || op1 == 0x2) && THUMB32_RM(instr1, instr2) == REG_PC) {
-               return FASTTRAP_T_INV;
-       }
-
-       if (THUMB32_RT(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_loadhalfword_memhint(uint16_t instr1, uint16_t instr2)
-{
-       int op1 = BITS(instr1, 7, 0x3), op2 = BITS(instr2, 6, 0x3F);
-
-       /* Do not support any use of the PC yet */
-       if (op1 == 0 && op2 == 0 && THUMB32_RM(inst1, instr2) == REG_PC) {
-               return FASTTRAP_T_INV;
-       }
-
-       if (THUMB32_RT(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_loadword(uint16_t instr1, uint16_t instr2)
-{
-       int op1 = BITS(instr1, 7, 0x3), op2 = BITS(instr2, 6, 0x3F);
-
-       if ((op1 & 0x2) == 0 && THUMB32_RN(instr1, instr2) == REG_PC && THUMB32_RT(instr1, instr2) != REG_PC) {
-               return FASTTRAP_T_LDR_PC_IMMED;
-       }
-
-       if (op1 == 0 && op2 == 0) {
-               /* ldr (register) uses an additional reg */
-               if (THUMB32_RM(instr1, instr2) == REG_PC) {
-                       return FASTTRAP_T_INV;
-               }
-       }
-
-       if (THUMB32_RT(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_loadstore_double_exclusive_table(__unused uint16_t instr1, __unused uint16_t instr2)
-{
-       /* Don't instrument any of these */
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_loadstore_multiple(uint16_t instr1, uint16_t instr2)
-{
-       int op = BITS(instr1, 7, 0x3), L = BITS(instr1, 4, 0x1), uses_pc = BITS(instr2, 15, 0x1), uses_lr = BITS(instr2, 14, 0x1);
-
-       if (op == 0 || op == 0x3) {
-               /* Privileged instructions: srs, rfe */
-               return FASTTRAP_T_INV;
-       }
-
-       /* Only emulate a use of the pc if it's a return from function: ldmia sp!, { ... pc }, aka pop { ... pc } */
-       if (op == 0x1 && L == 1 && THUMB32_RN(instr1, instr2) == REG_SP && uses_pc == 1) {
-               return FASTTRAP_T_LDM_PC;
-       }
-
-       /* stmia sp!, { ... lr }, aka push { ... lr } doesn't touch the pc, but it is very common, so special case it */
-       if (op == 0x2 && L == 0 && THUMB32_RN(instr1, instr2) == REG_SP && uses_lr == 1) {
-               return FASTTRAP_T_STM_LR;
-       }
-
-       if (THUMB32_RN(instr1, instr2) != REG_PC && uses_pc == 0) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_misc_control(__unused uint16_t instr1, __unused uint16_t instr2)
-{
-       /* Privileged, and instructions dealing with ThumbEE */
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_cps_hints(__unused uint16_t instr1, __unused uint16_t instr2)
-{
-       /* Privileged */
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_b_misc_control(uint16_t instr1, uint16_t instr2)
-{
-       int op = BITS(instr1, 4, 0x7F), op1 = BITS(instr2, 12, 0x7), __unused op2 = BITS(instr2, 8, 0xF);
-
-       if ((op1 & 0x5) == 0) {
-               if ((op & 0x38) != 0x38) {
-                       return FASTTRAP_T_B_COND;
-               }
-
-               if (op == 0x3A) {
-                       return thumb32_cps_hints(instr1, instr2);
-               }
-
-               if (op == 0x3B) {
-                       return thumb32_misc_control(instr1, instr2);
-               }
-       }
-
-       if ((op1 & 0x5) == 1) {
-               return FASTTRAP_T_B_UNCOND;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_dataproc_plain_immed(uint16_t instr1, uint16_t instr2)
-{
-       int op = BITS(instr1, 4, 0x1F);
-
-       if (op == 0x04 || op == 0x0C || op == 0x16) {
-               /* mov, movt, bfi, bfc */
-               /* These use only RD */
-               if (THUMB32_RD(instr1, instr2) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       } else {
-               if (THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-thumb32_dataproc_mod_immed(uint16_t instr1, uint16_t instr2)
-{
-       int op = BITS(instr1, 5, 0xF), S = BITS(instr1, 4, 0x1);
-
-       if (op == 0x2 || op == 0x3) {
-               /* These allow REG_PC in RN, but it doesn't mean use the PC! */
-               if (THUMB32_RD(instr1, instr2) != REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       if (op == 0 || op == 0x4 || op == 0x8 || op == 0xD) {
-               /* These become test instructions if the S bit is set and RD is the PC. */
-               if (S && THUMB32_RD(instr1, instr2) == REG_PC) {
-                       return FASTTRAP_T_COMMON;
-               }
-       }
-
-       if (THUMB32_RD(instr1, instr2) != REG_PC && THUMB32_RN(instr1, instr2) != REG_PC) {
-               return FASTTRAP_T_COMMON;
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-static
-int
-dtrace_decode_thumb32(uint16_t instr1, uint16_t instr2)
-{
-       int op1 = BITS(instr1, 11, 0x3), op2 = BITS(instr1, 4, 0x7F), op = BITS(instr2, 15, 0x1);
-
-       if (op1 == 0x1) {
-               if ((op2 & 0x64) == 0) {
-                       return thumb32_loadstore_multiple(instr1, instr2);
-               }
-
-               if ((op2 & 0x64) == 0x04) {
-                       return thumb32_loadstore_double_exclusive_table(instr1, instr2);
-               }
-
-               if ((op2 & 0x60) == 0x20) {
-                       return thumb32_dataproc_regshift(instr1, instr2);
-               }
-
-               if ((op2 & 0x40) == 0x40) {
-                       return thumb32_coproc(instr1, instr2);
-               }
-       }
-
-       if (op1 == 0x2) {
-               if ((op2 & 0x20) == 0 && op == 0) {
-                       return thumb32_dataproc_mod_immed(instr1, instr2);
-               }
-
-               if ((op2 & 0x20) == 0x20 && op == 0) {
-                       return thumb32_dataproc_plain_immed(instr1, instr2);
-               }
-
-               if (op == 1) {
-                       return thumb32_b_misc_control(instr1, instr2);
-               }
-       }
-
-       if (op1 == 0x3) {
-               if ((op2 & 0x71) == 0) {
-                       return thumb32_store_single(instr1, instr2);
-               }
-
-               if ((op2 & 0x71) == 0x10) {
-                       return vfp_struct_loadstore(thumb32_instword_to_arm(instr1, instr2));
-               }
-
-               if ((op2 & 0x67) == 0x01) {
-                       return thumb32_loadbyte_memhint(instr1, instr2);
-               }
-
-               if ((op2 & 0x67) == 0x03) {
-                       return thumb32_loadhalfword_memhint(instr1, instr2);
-               }
-
-               if ((op2 & 0x67) == 0x05) {
-                       return thumb32_loadword(instr1, instr2);
-               }
-
-               if ((op2 & 0x67) == 0x07) {
-                       /* Undefined instruction */
-                       return FASTTRAP_T_INV;
-               }
-
-               if ((op2 & 0x70) == 0x20) {
-                       return thumb32_dataproc_reg(instr1, instr2);
-               }
-
-               if ((op2 & 0x78) == 0x30) {
-                       return thumb32_multiply(instr1, instr2);
-               }
-
-               if ((op2 & 0x78) == 0x38) {
-                       return thumb32_longmultiply(instr1, instr2);
-               }
-
-               if ((op2 & 0x40) == 0x40) {
-                       return thumb32_coproc(instr1, instr2);
-               }
-       }
-
-       return FASTTRAP_T_INV;
-}
-
-int
-dtrace_decode_thumb(uint32_t instr)
-{
-       uint16_t* pInstr = (uint16_t*) &instr;
-       uint16_t hw1 = pInstr[0], hw2 = pInstr[1];
-
-       int size = BITS(hw1, 11, 0x1F);
+#include  <sys/fasttrap_isa.h>
 
-       if (size == 0x1D || size == 0x1E || size == 0x1F) {
-               return dtrace_decode_thumb32(hw1, hw2);
-       } else {
-               return dtrace_decode_thumb16(hw1);
-       }
-}
+int dtrace_decode_arm64(uint32_t instr);
 
 struct arm64_decode_entry {
        uint32_t mask;
@@ -1311,8 +43,6 @@ struct arm64_decode_entry arm64_decode_table[] = {
 
 #define NUM_DECODE_ENTRIES (sizeof(arm64_decode_table) / sizeof(struct arm64_decode_entry))
 
-
-
 int
 dtrace_decode_arm64(uint32_t instr)
 {
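
What survives of this file is the table-driven AArch64 decoder; the hunk shows only the mask field of arm64_decode_entry and cuts off before the loop. One conventional way such a table is consumed, sketched under the assumption that each entry also carries value and type fields (not a claim about the elided body):

/* hypothetical consumer of a mask/value decode table: an instruction
 * matches the first entry where (instr & mask) == value */
for (unsigned i = 0; i < NUM_DECODE_ENTRIES; i++) {
        if ((instr & arm64_decode_table[i].mask) ==
            arm64_decode_table[i].value) {
                return arm64_decode_table[i].type;
        }
}
return FASTTRAP_T_INV;   /* nothing matched */
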
index 6a9296fb0530283242fbcd705e7cf5f9429a9242..5714f7971009eb63ba13d94a5ce5971e101e7026 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005-2008 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2005-2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#define MACH__POSIX_C_SOURCE_PRIVATE 1  /* pulls in suitable savearea from
-                                        * mach/ppc/thread_status.h */
 #include <arm/caches_internal.h>
-#include <arm/proc_reg.h>
-
 #include <kern/thread.h>
-#include <mach/thread_status.h>
 
 #if __has_include(<ptrauth.h>)
 #include <ptrauth.h>
 #endif
 #include <stdarg.h>
-#include <string.h>
-#include <sys/malloc.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/kauth.h>
 #include <sys/dtrace.h>
 #include <sys/dtrace_impl.h>
-#include <libkern/OSAtomic.h>
+#include <machine/atomic.h>
 #include <kern/simple_lock.h>
 #include <kern/sched_prim.h>            /* for thread_wakeup() */
 #include <kern/thread_call.h>
 #include <kern/task.h>
-#include <miscfs/devfs/devfs.h>
-#include <mach/vm_param.h>
 
 extern struct arm_saved_state *find_kern_regs(thread_t);
 
@@ -130,7 +121,7 @@ xcRemote(void *foo)
                (pArg->f)(pArg->arg);
        }
 
-       if (hw_atomic_sub(&dt_xc_sync, 1) == 0) {
+       if (os_atomic_dec(&dt_xc_sync, relaxed) == 0) {
                thread_wakeup((event_t) &dt_xc_sync);
        }
 }
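
hw_atomic_sub() gives way to os_atomic_dec() from <machine/atomic.h>: each remote CPU decrements dt_xc_sync as it finishes, and the last one through wakes whoever sleeps on the counter's address. The waiter side is outside this hunk; a sketch of its usual shape (assert_wait, thread_block, and clear_wait are real Mach primitives; the loop itself is an assumption, not this commit's code):

for (;;) {
        assert_wait((event_t)&dt_xc_sync, THREAD_UNINT);
        if (os_atomic_load(&dt_xc_sync, relaxed) == 0) {
                /* counter already hit zero; cancel the wait */
                clear_wait(current_thread(), THREAD_AWAKENED);
                break;
        }
        thread_block(THREAD_CONTINUE_NULL);
}
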
@@ -180,12 +171,6 @@ dtrace_isa_init(void)
 /**
  * Register definitions
  */
-#define ARM_FP 7
-#define ARM_SP 13
-#define ARM_LR 14
-#define ARM_PC 15
-#define ARM_CPSR 16
-
 #define ARM64_FP 29
 #define ARM64_LR 30
 #define ARM64_SP 31
@@ -205,27 +190,6 @@ dtrace_getreg(struct regs * savearea, uint_t reg)
                return 0;
        }
 
-       if (is_saved_state32(regs)) {
-               // Fix special registers if user is 32 bits
-               switch (reg) {
-               case ARM64_FP:
-                       reg = ARM_FP;
-                       break;
-               case ARM64_SP:
-                       reg = ARM_SP;
-                       break;
-               case ARM64_LR:
-                       reg = ARM_LR;
-                       break;
-               case ARM64_PC:
-                       reg = ARM_PC;
-                       break;
-               case ARM64_CPSR:
-                       reg = ARM_CPSR;
-                       break;
-               }
-       }
-
        if (!check_saved_state_reglimit(regs, reg)) {
                DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
                return 0;
@@ -234,7 +198,6 @@ dtrace_getreg(struct regs * savearea, uint_t reg)
        return (uint64_t)get_saved_state_reg(regs, reg);
 }
 
-#define RETURN_OFFSET 4
 #define RETURN_OFFSET64 8
 
 static int
@@ -242,7 +205,6 @@ dtrace_getustack_common(uint64_t * pcstack, int pcstack_limit, user_addr_t pc,
     user_addr_t sp)
 {
        int ret = 0;
-       boolean_t is64bit = proc_is64bit_data(current_proc());
 
        ASSERT(pcstack == NULL || pcstack_limit > 0);
 
@@ -260,13 +222,8 @@ dtrace_getustack_common(uint64_t * pcstack, int pcstack_limit, user_addr_t pc,
                        break;
                }
 
-               if (is64bit) {
-                       pc = dtrace_fuword64((sp + RETURN_OFFSET64));
-                       sp = dtrace_fuword64(sp);
-               } else {
-                       pc = dtrace_fuword32((sp + RETURN_OFFSET));
-                       sp = dtrace_fuword32(sp);
-               }
+               pc = dtrace_fuword64((sp + RETURN_OFFSET64));
+               sp = dtrace_fuword64(sp);
        }
 
        return ret;
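
With 32-bit processes gone from the arm64 tree, the user stack walk needs only the AArch64 frame-record layout: each frame stores {previous fp, return lr}, with the saved lr 8 bytes above the fp, which is why the code reads dtrace_fuword64(sp + RETURN_OFFSET64) for the pc and dtrace_fuword64(sp) for the next frame. The same walk in miniature, using plain copyin() for illustration where probe context must use the fault-tolerant dtrace_fuword64():

typedef struct frame_record {
        uint64_t fp;    /* caller's frame pointer */
        uint64_t lr;    /* return address into the caller */
} frame_record_t;

static int
walk_user_stack(uint64_t fp, uint64_t *out, int limit)
{
        int n = 0;
        while (fp != 0 && n < limit) {
                frame_record_t rec;
                if (copyin((user_addr_t)fp, &rec, sizeof(rec)) != 0) {
                        break;  /* unreadable frame: stop the walk */
                }
                out[n++] = rec.lr;
                fp = rec.fp;
        }
        return n;
}
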
@@ -387,10 +344,6 @@ dtrace_getufpstack(uint64_t * pcstack, uint64_t * fpstack, int pcstack_limit)
        user_addr_t     pc, sp;
        volatile        uint16_t  *flags = (volatile uint16_t *) &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
 
-#if 0
-       uintptr_t oldcontext;
-       size_t          s1, s2;
-#endif
 
        if (*flags & CPU_DTRACE_FAULT) {
                return;
@@ -478,13 +431,8 @@ dtrace_getufpstack(uint64_t * pcstack, uint64_t * fpstack, int pcstack_limit)
                } else
 #endif
                {
-                       if (is64bit) {
-                               pc = dtrace_fuword64((sp + RETURN_OFFSET64));
-                               sp = dtrace_fuword64(sp);
-                       } else {
-                               pc = dtrace_fuword32((sp + RETURN_OFFSET));
-                               sp = dtrace_fuword32(sp);
-                       }
+                       pc = dtrace_fuword64((sp + RETURN_OFFSET64));
+                       sp = dtrace_fuword64(sp);
                }
 
 #if 0
@@ -606,28 +554,6 @@ dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes,
        }
 }
 
-/*
- * On arm64, we support both 32bit and 64bit user processes.
- * This routine is only called when handling 32bit processes
- * where thumb_mode is pertinent.
- * If this routine is called when handling 64bit processes
- * thumb_mode should always be zero.
- */
-int
-dtrace_instr_size(uint32_t instr, int thumb_mode)
-{
-       if (thumb_mode) {
-               uint16_t instr16 = *(uint16_t*) &instr;
-               if (((instr16 >> 11) & 0x1F) > 0x1C) {
-                       return 4;
-               } else {
-                       return 2;
-               }
-       } else {
-               return 4;
-       }
-}
-
 uint64_t
 dtrace_getarg(int arg, int aframes, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
 {
index 584317298d4f14f69f71d4d87e52fc55e82f01b9..c5c427d62310d71b5037831cc459e16e6271a293 100644 (file)
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2007 Apple Inc. All rights reserved.
+ *  Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  */
 /*
  * CDDL HEADER START
  * Use is subject to license terms.
  */
 
-/*
- * #pragma ident       "@(#)dtrace_subr.c      1.12    05/06/08 SMI"
- */
-
 #include <sys/dtrace.h>
 #include <sys/dtrace_glue.h>
 #include <sys/dtrace_impl.h>
@@ -65,27 +61,11 @@ dtrace_user_probe(arm_saved_state_t *regs)
 
        kauth_cred_uthread_update(uthread, p);
 
-       if (is_saved_state32(regs)) {
-               if (saved_state32(regs)->cpsr & PSR_TF) {
-                       uint16_t pc;
-                       if (copyin((user_addr_t)saved_state32(regs)->pc, &pc, sizeof(uint16_t))) {
-                               return KERN_FAILURE;
-                       }
-                       is_fasttrap = (pc == FASTTRAP_THUMB32_RET_INSTR);
-               } else {
-                       uint32_t pc;
-                       if (copyin((user_addr_t)saved_state32(regs)->pc, &pc, sizeof(uint32_t))) {
-                               return KERN_FAILURE;
-                       }
-                       is_fasttrap = (pc == FASTTRAP_ARM32_RET_INSTR);
-               }
-       } else {
-               uint32_t pc;
-               if (copyin((user_addr_t)saved_state64(regs)->pc, &pc, sizeof(uint32_t))) {
-                       return KERN_FAILURE;
-               }
-               is_fasttrap = (pc == FASTTRAP_ARM64_RET_INSTR);
+       uint32_t pc;
+       if (copyin((user_addr_t)saved_state64(regs)->pc, &pc, sizeof(uint32_t))) {
+               return KERN_FAILURE;
        }
+       is_fasttrap = (pc == FASTTRAP_ARM64_RET_INSTR);
 
        if (is_fasttrap) {
                uint8_t step = uthread->t_dtrace_step;
@@ -183,38 +163,11 @@ dtrace_user_probe(arm_saved_state_t *regs)
                 *
                 * Note that the PC points to the instruction that caused the fault.
                 */
-               if (is_saved_state32(regs)) {
-                       if (saved_state32(regs)->cpsr & PSR_TF) {
-                               uint16_t instr;
-                               if (fuword16(saved_state32(regs)->pc, &instr) == 0 && instr != FASTTRAP_THUMB32_INSTR) {
-                                       return KERN_SUCCESS;
-                               }
-                       } else {
-                               uint32_t instr;
-                               if (fuword32(saved_state32(regs)->pc, &instr) == 0 && instr != FASTTRAP_ARM32_INSTR) {
-                                       return KERN_SUCCESS;
-                               }
-                       }
-               } else {
-                       uint32_t instr;
-                       if (fuword32(saved_state64(regs)->pc, &instr) == 0 && instr != FASTTRAP_ARM64_INSTR) {
-                               return KERN_SUCCESS;
-                       }
+               uint32_t instr;
+               if (fuword32(saved_state64(regs)->pc, &instr) == 0 && instr != FASTTRAP_ARM64_INSTR) {
+                       return KERN_SUCCESS;
                }
        }
 
        return KERN_FAILURE;
 }
-
-void
-dtrace_safe_synchronous_signal(void)
-{
-       /* Not implemented */
-}
-
-int
-dtrace_safe_defer_signal(void)
-{
-       /* Not implemented */
-       return 0;
-}
index 50f980f2c9129833af92e0aa8c74a494168c4bd5..b547aa99204963792ae0163710c71e03eaadfede 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  */
 /*
  * CDDL HEADER START
  * Use is subject to license terms.
  */
 
-/*
- * #pragma ident       "@(#)fasttrap_isa.c     1.19    05/09/14 SMI"
- */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL                 /* Solaris vs. Darwin */
-#endif
-#endif
 #include <sys/fasttrap_isa.h>
 #include <sys/fasttrap_impl.h>
 #include <sys/dtrace.h>
 #include <sys/dtrace_impl.h>
 #include <kern/task.h>
-#include <vm/pmap.h>
-#include <vm/vm_map.h>
-#include <mach/mach_vm.h>
-#include <arm/proc_reg.h>
 #include <arm/thread.h>
-#include <arm/caches_internal.h>
 
 #include <sys/dtrace_ptss.h>
-#include <kern/debug.h>
-
-#include <pexpert/pexpert.h>
 
 #if __has_include(<ptrauth.h>)
 #include <ptrauth.h>
 #endif
 
-
 extern dtrace_id_t dtrace_probeid_error;
 
 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
 
 extern int dtrace_decode_arm64(uint32_t instr);
-extern int dtrace_decode_arm(uint32_t instr);
-extern int dtrace_decode_thumb(uint32_t instr);
-
-/*
- * Lossless User-Land Tracing on ARM
- * ---------------------------------
- *
- * The details here will be fleshed out as more of this is implemented. The
- * basic design will be the same way as tracing works in x86.
- *
- * Some ARM specific issues:
- *
- * We need to patch differently for ARM instructions and Thumb instructions.
- * When we hit a probe, we check to see if the mode we're currently in is the
- * same as the mode we're patching for. If not, we remove the tracepoint and
- * abort. This ARM/Thumb information is pulled in from the arch specific
- * information in the fasttrap probe.
- *
- * On ARM, any instruction that uses registers can also use the pc as a
- * register. This presents problems during emulation because we have copied
- * the instruction and thus the pc can be different. Currently we've emulated
- * any instructions that use the pc if they can be used in a return probe.
- * Eventually we will want to support all instructions that use the pc, but
- * to do so requires disassembling the instruction and reconstituting it by
- * substituting a different register.
- *
- */
-
-#define THUMB_INSTR(x) (*(uint16_t*) &(x))
-
-#define SIGNEXTEND(x, v) ((((int) (x)) << (32-(v))) >> (32-(v)))
-#define ALIGNADDR(x, v) (((x) >> (v)) << (v))
-#define GETITSTATE(x) ((((x) >> 8) & 0xFC) | (((x) >> 25) & 0x3))
-#define ISLASTINIT(x) (((x) & 0xF) == 8)
-
-#define SET16(x, w) *((uint16_t*) (x)) = (w)
-#define SET32(x, w) *((uint32_t*) (x)) = (w)
-
-#define IS_ARM32_NOP(x) ((x) == 0xE1A00000)
-/* Marker for is-enabled probes */
-#define IS_ARM32_IS_ENABLED(x) ((x) == 0xE0200000)
 
 #define IS_ARM64_NOP(x) ((x) == 0xD503201F)
 /* Marker for is-enabled probes */
 #define IS_ARM64_IS_ENABLED(x) ((x) == 0xD2800000)
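
For context, 0xD503201F is the A64 encoding of nop and 0xD2800000 is movz x0, #0: USDT probe sites are linked as the former, is-enabled sites as the latter. When an enabled is-enabled probe fires, no emulation is needed; the kernel writes 1 into x0 and steps past the marker, mirroring the regs32->r[0] = 1 fast path visible in the deleted 32-bit handler further down. A hedged sketch of the 64-bit equivalent, whose body lies outside these hunks:

    /* Sketch, by analogy with the deleted 32-bit code; not the literal
     * 64-bit handler. */
    if (is_enabled) {
            saved_state64(state)->x[0] = 1;     /* report "probe on" */
            set_saved_state_pc(state, pc + 4);  /* skip the 4-byte marker */
            return;
    }
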
 
-#define IS_THUMB32_NOP(x) ((x) == 0x46C0)
-/* Marker for is-enabled probes */
-#define IS_THUMB32_IS_ENABLED(x) ((x) == 0x4040)
-
-#define ARM_LDM_UF (1 << 23)
-#define ARM_LDM_PF (1 << 24)
-#define ARM_LDM_WF (1 << 21)
-
-#define ARM_LDR_UF (1 << 23)
-#define ARM_LDR_BF (1 << 22)
-
-static int fasttrap_tracepoint_init32(proc_t *, fasttrap_tracepoint_t *, user_addr_t, fasttrap_probe_type_t);
-static int fasttrap_tracepoint_init64(proc_t *, fasttrap_tracepoint_t *, user_addr_t, fasttrap_probe_type_t);
-
 int
 fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp,
     user_addr_t pc, fasttrap_probe_type_t type)
-{
-       if (proc_is64bit_data(p)) {
-               return fasttrap_tracepoint_init64(p, tp, pc, type);
-       } else {
-               return fasttrap_tracepoint_init32(p, tp, pc, type);
-       }
-}
-
-static int
-fasttrap_tracepoint_init32(proc_t *p, fasttrap_tracepoint_t *tp,
-    user_addr_t pc, fasttrap_probe_type_t type)
-{
-#pragma unused(type)
-       uint32_t instr;
-
-       /*
-        * Read the instruction at the given address out of the process's
-        * address space. We don't have to worry about a debugger
-        * changing this instruction before we overwrite it with our trap
-        * instruction since P_PR_LOCK is set. Since instructions can span
-        * pages, we potentially read the instruction in two parts. If the
-        * second part fails, we just zero out that part of the instruction.
-        */
-       /*
-        * APPLE NOTE: Of course, we do not have a P_PR_LOCK, so this is racey...
-        */
-
-       if (uread(p, &instr, 4, pc) != 0) {
-               return -1;
-       }
-
-       /* We want &instr to always point to the saved instruction, so just copy the
-        * whole thing When cast to a pointer to a uint16_t, that will give us a
-        * pointer to the first two bytes, which is the thumb instruction.
-        */
-       tp->ftt_instr = instr;
-
-       if (tp->ftt_fntype != FASTTRAP_FN_DONE_INIT) {
-               switch (tp->ftt_fntype) {
-               case FASTTRAP_FN_UNKNOWN:
-                       /* Can't instrument without any information. We can add some heuristics later if necessary. */
-                       return -1;
-
-               case FASTTRAP_FN_USDT:
-                       if (IS_ARM32_NOP(instr) || IS_ARM32_IS_ENABLED(instr)) {
-                               tp->ftt_thumb = 0;
-                       } else if (IS_THUMB32_NOP(THUMB_INSTR(instr)) || IS_THUMB32_IS_ENABLED(THUMB_INSTR(instr))) {
-                               tp->ftt_thumb = 1;
-                       } else {
-                               /* Shouldn't reach here - this means we don't recognize
-                                * the instruction at one of the USDT probe locations
-                                */
-                               return -1;
-                       }
-                       tp->ftt_fntype = FASTTRAP_FN_DONE_INIT;
-                       break;
-
-               case FASTTRAP_FN_ARM:
-                       tp->ftt_thumb = 0;
-                       tp->ftt_fntype = FASTTRAP_FN_DONE_INIT;
-                       break;
-
-               case FASTTRAP_FN_THUMB:
-                       tp->ftt_thumb = 1;
-                       tp->ftt_fntype = FASTTRAP_FN_DONE_INIT;
-                       break;
-
-               default:
-                       return -1;
-               }
-       }
-
-       if (tp->ftt_thumb) {
-               tp->ftt_type = dtrace_decode_thumb(instr);
-       } else {
-               tp->ftt_type = dtrace_decode_arm(instr);
-       }
-
-       if (tp->ftt_type == FASTTRAP_T_INV) {
-               /* This is an instruction we either don't recognize or can't instrument */
-               printf("dtrace: fasttrap init32: Unrecognized instruction: %08x at %08llx\n",
-                   (tp->ftt_thumb && dtrace_instr_size(tp->ftt_instr, tp->ftt_thumb) == 2) ? tp->ftt_instr1 : instr, pc);
-               return -1;
-       }
-
-       return 0;
-}
-
-
-static int
-fasttrap_tracepoint_init64(proc_t *p, fasttrap_tracepoint_t *tp,
-    user_addr_t pc, fasttrap_probe_type_t type)
 {
 #pragma unused(type)
        uint32_t instr = 0;
@@ -240,7 +75,6 @@ fasttrap_tracepoint_init64(proc_t *p, fasttrap_tracepoint_t *tp,
        }
 
        tp->ftt_instr = instr;
-       tp->ftt_thumb = 0;      /* Always zero on 64bit */
 
        if (tp->ftt_fntype != FASTTRAP_FN_DONE_INIT) {
                switch (tp->ftt_fntype) {
@@ -299,7 +133,6 @@ fasttrap_tracepoint_init64(proc_t *p, fasttrap_tracepoint_t *tp,
 int
 fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
 {
-       /* The thumb patch is a 2 byte instruction regardless of the size of the original instruction */
        uint32_t instr;
        int size;
 
@@ -307,12 +140,7 @@ fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
                size = 4;
                instr = FASTTRAP_ARM64_INSTR;
        } else {
-               size = tp->ftt_thumb ? 2 : 4;
-               if (tp->ftt_thumb) {
-                       *((uint16_t*) &instr) = FASTTRAP_THUMB32_INSTR;
-               } else {
-                       instr = FASTTRAP_ARM32_INSTR;
-               }
+               return -1;
        }
 
        if (uwrite(p, &instr, size, tp->ftt_pc) != 0) {
@@ -327,16 +155,14 @@ fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
 int
 fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
 {
-       /* The thumb patch is a 2 byte instruction regardless of the size of the original instruction */
        uint32_t instr;
-       int size;
+       int size = 4;
 
        if (proc_is64bit_data(p)) {
                /*
                 * Distinguish between read or write failures and a changed
                 * instruction.
                 */
-               size = 4;
                if (uread(p, &instr, size, tp->ftt_pc) != 0) {
                        goto end;
                }
@@ -345,24 +171,7 @@ fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
                        goto end;
                }
        } else {
-               /*
-                * Distinguish between read or write failures and a changed
-                * instruction.
-                */
-               size = tp->ftt_thumb ? 2 : 4;
-               if (uread(p, &instr, size, tp->ftt_pc) != 0) {
-                       goto end;
-               }
-
-               if (tp->ftt_thumb) {
-                       if (*((uint16_t*) &instr) != FASTTRAP_THUMB32_INSTR) {
-                               goto end;
-                       }
-               } else {
-                       if (instr != FASTTRAP_ARM32_INSTR) {
-                               goto end;
-                       }
-               }
+               return -1;
        }
 
        if (uwrite(p, &tp->ftt_instr, size, tp->ftt_pc) != 0) {
@@ -407,27 +216,14 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_
 
        for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
                fasttrap_probe_t *probe = id->fti_probe;
-               /*
-                * If there's a branch that could act as a return site, we
-                * need to trace it, and check here if the program counter is
-                * external to the function.
-                */
-               if (is_saved_state32(regs)) {
-                       if (tp->ftt_type != FASTTRAP_T_LDM_PC &&
-                           tp->ftt_type != FASTTRAP_T_POP_PC &&
-                           new_pc - probe->ftp_faddr < probe->ftp_fsize) {
-                               continue;
-                       }
-               } else {
-                       /* ARM64_TODO  - check for FASTTRAP_T_RET */
-                       if ((tp->ftt_type != FASTTRAP_T_ARM64_RET || tp->ftt_type != FASTTRAP_T_ARM64_RETAB) &&
-                           new_pc - probe->ftp_faddr < probe->ftp_fsize) {
-                               continue;
-                       }
+               /* ARM64_TODO - check for FASTTRAP_T_RET */
+               if ((tp->ftt_type != FASTTRAP_T_ARM64_RET && tp->ftt_type != FASTTRAP_T_ARM64_RETAB) &&
+                   new_pc - probe->ftp_faddr < probe->ftp_fsize) {
+                       continue;
                }
                if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) {
-                       uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1);
-                       if (already_triggered) {
+                       if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) {
+                               /* already triggered */
                                continue;
                        }
                }
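
The one-shot guard here moves from atomic_or_8() to os_atomic_xchg() with relaxed ordering; either primitive guarantees that exactly one thread observes the prior value 0 and fires the probe. The same idiom in portable C11, as a standalone illustration:

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned char triggered;

    static void
    fire_once(void)
    {
            /* The exchange returns the previous value: only the first
             * caller sees 0; every later caller sees 1 and bails out. */
            if (atomic_exchange_explicit(&triggered, 1, memory_order_relaxed)) {
                    return;
            }
            puts("one-shot probe fired");
    }
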
@@ -448,15 +244,9 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_
                if (FALSE) {
 #endif
                } else {
-                       if (is_saved_state32(regs)) {
-                               dtrace_probe(probe->ftp_id,
-                                   pc - id->fti_probe->ftp_faddr,
-                                   saved_state32(regs)->r[0], 0, 0, 0);
-                       } else {
-                               dtrace_probe(probe->ftp_id,
-                                   pc - id->fti_probe->ftp_faddr,
-                                   saved_state64(regs)->x[0], 0, 0, 0);
-                       }
+                       dtrace_probe(probe->ftp_id,
+                           pc - id->fti_probe->ftp_faddr,
+                           saved_state64(regs)->x[0], 0, 0, 0);
                }
        }
        if (retire_tp) {
@@ -466,6 +256,9 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_
        lck_mtx_unlock(pid_mtx);
 }
 
+#if DEBUG
+__dead2
+#endif
 static void
 fasttrap_sigsegv(proc_t *p, uthread_t t, user_addr_t addr, arm_saved_state_t *regs)
 {
@@ -503,31 +296,6 @@ fasttrap_sigsegv(proc_t *p, uthread_t t, user_addr_t addr, arm_saved_state_t *re
 #endif
 }
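
__dead2 is the sys/cdefs.h spelling of __attribute__((noreturn)). Applying it to fasttrap_sigsegv() only on DEBUG kernels pairs with the #ifndef DEBUG hunks later in this file, where the "bit of a hack" recovery assignments become unreachable; presumably the DEBUG build panics rather than posting a signal. The pattern in miniature, with hypothetical names:

    #include <sys/cdefs.h>

    extern void panic(const char *fmt, ...) __dead2;

    #if DEBUG
    __dead2
    #endif
    static void
    fault_handler(void)
    {
    #if DEBUG
            panic("unexpected fault");  /* never returns on DEBUG */
    #else
            /* deliver the signal; the caller patches up its state */
    #endif
    }
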
 
-static void
-fasttrap_usdt_args32(fasttrap_probe_t *probe, arm_saved_state32_t *regs32, int argc,
-    uint64_t *argv)
-{
-       int i, x, cap = MIN(argc, probe->ftp_nargs);
-
-       for (i = 0; i < cap; i++) {
-               x = probe->ftp_argmap[i];
-
-               /* Up to 4 args are passed in registers on arm */
-               if (x < 4) {
-                       argv[i] = regs32->r[x];
-               } else {
-                       uint32_t arg;
-                       fasttrap_fuword32_noerr(regs32->sp + (x - 4) * sizeof(uint32_t), &arg);
-
-                       argv[i] = arg;
-               }
-       }
-
-       for (; i < argc; i++) {
-               argv[i] = 0;
-       }
-}
-
 static void
 fasttrap_usdt_args64(fasttrap_probe_t *probe, arm_saved_state64_t *regs64, int argc,
     uint64_t *argv)
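
fasttrap_usdt_args64()'s body falls outside this hunk. By analogy with the deleted fasttrap_usdt_args32() above, it presumably maps each argument through ftp_argmap, taking the first eight from x0-x7 (arm64 passes eight arguments in registers versus four on arm) and the rest from the stack at 8-byte strides. A hedged reconstruction, in which fasttrap_fuword64_noerr() is assumed to mirror the 32-bit helper:

    int i, x, cap = MIN(argc, probe->ftp_nargs);

    for (i = 0; i < cap; i++) {
            x = probe->ftp_argmap[i];
            if (x < 8) {        /* up to 8 args in registers on arm64 */
                    argv[i] = regs64->x[x];
            } else {
                    uint64_t arg;
                    fasttrap_fuword64_noerr(regs64->sp + (x - 8) * sizeof(uint64_t), &arg);
                    argv[i] = arg;
            }
    }
    for (; i < argc; i++) {
            argv[i] = 0;
    }
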
@@ -581,591 +349,6 @@ condition_true(int cond, int cpsr)
        return taken;
 }
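
Only the tail of condition_true() is visible here; the routine survives the 32-bit purge because emulated conditional instructions still need their condition code checked against the NZCV flags of the saved cpsr. A self-contained sketch of the standard evaluation, assuming the usual NZCV bit positions (31..28):

    #include <stdint.h>

    static int
    cond_holds(unsigned cond, uint32_t cpsr)
    {
            unsigned n = (cpsr >> 31) & 1, z = (cpsr >> 30) & 1;
            unsigned c = (cpsr >> 29) & 1, v = (cpsr >> 28) & 1;
            unsigned taken;

            switch (cond >> 1) {
            case 0: taken = z; break;               /* EQ / NE */
            case 1: taken = c; break;               /* CS / CC */
            case 2: taken = n; break;               /* MI / PL */
            case 3: taken = v; break;               /* VS / VC */
            case 4: taken = c && !z; break;         /* HI / LS */
            case 5: taken = n == v; break;          /* GE / LT */
            case 6: taken = !z && n == v; break;    /* GT / LE */
            default: return 1;                      /* AL: always */
            }
            return (cond & 1) ? !taken : taken;
    }
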
 
-static void
-set_thumb_flag(arm_saved_state32_t *regs32, user_addr_t pc)
-{
-       if (pc & 1) {
-               regs32->cpsr |= PSR_TF;
-       } else {
-               regs32->cpsr &= ~PSR_TF;
-       }
-}
-
-static int
-fasttrap_pid_probe_thumb_state_valid(arm_saved_state32_t *state32, fasttrap_tracepoint_t *tp)
-{
-       uint32_t cpsr = state32->cpsr;
-       uint32_t itstate = GETITSTATE(cpsr);
-
-       /* If in IT block, make sure it's the last statement in the block */
-       if ((itstate != 0) && !ISLASTINIT(itstate)) {
-               printf("dtrace: fasttrap: Tried to trace instruction %08x at %08x but not at end of IT block\n",
-                   (tp->ftt_thumb && dtrace_instr_size(tp->ftt_instr, tp->ftt_thumb) == 2) ? tp->ftt_instr1 : tp->ftt_instr, state32->pc);
-               return 0;
-       }
-
-       if (!(cpsr & PSR_TF)) {
-               return 0;
-       }
-
-       return 1;
-}
-
-static int
-fasttrap_get_condition_code(arm_saved_state32_t *regs32, fasttrap_tracepoint_t *tp)
-{
-       /* Default to always execute */
-       int condition_code = 0xE;
-       if (tp->ftt_thumb) {
-               uint32_t itstate = GETITSTATE(regs32->cpsr);
-               if (itstate != 0) {
-                       /* In IT block, make sure it's the last statement in the block */
-                       assert(ISLASTINIT(itstate));
-                       condition_code = itstate >> 4;
-               }
-       } else {
-               condition_code = ARM_CONDCODE(tp->ftt_instr);
-       }
-
-       return condition_code;
-}
-
-static void
-fasttrap_pid_probe_handle_patched_instr32(arm_saved_state_t *state, fasttrap_tracepoint_t *tp, uthread_t uthread,
-    proc_t *p, uint_t is_enabled, int *was_simulated)
-{
-       arm_saved_state32_t *regs32 = saved_state32(state);
-       uint32_t new_pc = 0;
-       uint32_t pc = regs32->pc;
-       int instr_size;
-       int condition_code;
-
-       *was_simulated = 1;
-
-       /*
-        * If there's an is-enabled probe connected to this tracepoint it
-        * means that there was a 'eor r0,r0,r0'
-        * instruction that was placed there by DTrace when the binary was
-        * linked. As this probe is, in fact, enabled, we need to stuff 1
-        * into R0. Accordingly, we can bypass all the instruction
-        * emulation logic since we know the inevitable result. It's possible
-        * that a user could construct a scenario where the 'is-enabled'
-        * probe was on some other instruction, but that would be a rather
-        * exotic way to shoot oneself in the foot.
-        */
-
-       if (is_enabled) {
-               regs32->r[0] = 1;
-               new_pc = regs32->pc + (tp->ftt_thumb ? 2 : 4);
-               goto done;
-       }
-
-       /* For USDT probes, bypass all the emulation logic for the nop instruction */
-       if ((tp->ftt_thumb && IS_THUMB32_NOP(THUMB_INSTR(tp->ftt_instr))) ||
-           (!tp->ftt_thumb && IS_ARM32_NOP(tp->ftt_instr))) {
-               new_pc = regs32->pc + (tp->ftt_thumb ? 2 : 4);
-               goto done;
-       }
-
-       condition_code = fasttrap_get_condition_code(regs32, tp);
-       instr_size = dtrace_instr_size(tp->ftt_instr, tp->ftt_thumb);
-
-       switch (tp->ftt_type) {
-       case FASTTRAP_T_MOV_PC_REG:
-       case FASTTRAP_T_CPY_PC:
-       {
-               if (!condition_true(condition_code, regs32->cpsr)) {
-                       new_pc = pc + instr_size;
-                       break;
-               }
-
-               int rm;
-               if (tp->ftt_thumb) {
-                       rm = THUMB16_HRM(tp->ftt_instr1);
-               } else {
-                       rm = tp->ftt_instr & 0xF;
-               }
-               new_pc = regs32->r[rm];
-
-               /* This instruction does not change the Thumb state */
-
-               break;
-       }
-
-       case FASTTRAP_T_STM_LR:
-       case FASTTRAP_T_PUSH_LR:
-       {
-               /*
-                * This is a very common case, so we want to emulate this instruction if
-                * possible. However, on a push, it is possible that we might reach the end
-                * of a page and have to allocate a new page. Most of the time this will not
-                * happen, and we know that the push instruction can store at most 16 words,
-                * so check to see if we are far from the boundary, and if so, emulate. This
-                * can be made more aggressive by checking the actual number of words being
-                * pushed, but we won't do that for now.
-                *
-                * Some of the same issues that apply to POP_PC probably apply here also.
-                */
-
-               int reglist;
-               int ret;
-               uint32_t base;
-
-               if (!condition_true(condition_code, regs32->cpsr)) {
-                       new_pc = pc + instr_size;
-                       break;
-               }
-
-               base = regs32->sp;
-               if (((base - 16 * 4) >> PAGE_SHIFT) != (base >> PAGE_SHIFT)) {
-                       /* Crosses the page boundary, go to emulation */
-                       goto instr_emulate;
-               }
-
-               if (tp->ftt_thumb) {
-                       if (instr_size == 4) {
-                               /* We know we have to push lr, never push sp or pc */
-                               reglist = tp->ftt_instr2 & 0x1FFF;
-                       } else {
-                               reglist = tp->ftt_instr1 & 0xFF;
-                       }
-               } else {
-                       /* We know we have to push lr, never push sp or pc */
-                       reglist = tp->ftt_instr & 0x1FFF;
-               }
-
-               /* Push the link register */
-               base -= 4;
-               ret = fasttrap_suword32(base, regs32->lr);
-               if (ret == -1) {
-                       fasttrap_sigsegv(p, uthread, (user_addr_t) base, state);
-                       new_pc = regs32->pc;
-                       break;
-               }
-
-               /* Start pushing from $r12 */
-               int regmask = 1 << 12;
-               int regnum = 12;
-
-               while (regmask) {
-                       if (reglist & regmask) {
-                               base -= 4;
-                               ret = fasttrap_suword32(base, regs32->r[regnum]);
-                               if (ret == -1) {
-                                       fasttrap_sigsegv(p, uthread, (user_addr_t) base, state);
-                                       new_pc = regs32->pc;
-                                       break;
-                               }
-                       }
-                       regmask >>= 1;
-                       regnum--;
-               }
-
-               regs32->sp = base;
-
-               new_pc = pc + instr_size;
-
-               break;
-       }
-
-
-       case FASTTRAP_T_LDM_PC:
-       case FASTTRAP_T_POP_PC:
-       {
-               /* TODO Two issues that will eventually need to be resolved:
-                *
-                * 1. Understand what the hardware does if we have to segfault (data abort) in
-                * the middle of a load multiple. We currently don't have a working segfault
-                * handler anyway, and with no swapfile we should never segfault on this load.
-                * If we do, we'll just kill the process by setting the pc to 0.
-                *
-                * 2. The emulation is no longer atomic. We currently only emulate pop for
-                * function epilogues, and so we should never have a race here because one
-                * thread should never be trying to manipulate another thread's stack frames.
-                * That is almost certainly a bug in the program.
-                *
-                * This will need to be fixed if we ever:
-                *   a. Ship dtrace externally, as this could be a potential attack vector
-                *   b. Support instruction level tracing, as we might then pop/ldm non epilogues.
-                *
-                */
-
-               /* Assume ldmia! sp/pop ... pc */
-
-               int regnum = 0, reglist;
-               int ret;
-               uint32_t base;
-
-               if (!condition_true(condition_code, regs32->cpsr)) {
-                       new_pc = pc + instr_size;
-                       break;
-               }
-
-               if (tp->ftt_thumb) {
-                       if (instr_size == 4) {
-                               /* We know we have to load the pc, don't do it twice */
-                               reglist = tp->ftt_instr2 & 0x7FFF;
-                       } else {
-                               reglist = tp->ftt_instr1 & 0xFF;
-                       }
-               } else {
-                       /* We know we have to load the pc, don't do it twice */
-                       reglist = tp->ftt_instr & 0x7FFF;
-               }
-
-               base = regs32->sp;
-               while (reglist) {
-                       if (reglist & 1) {
-                               ret = fasttrap_fuword32((user_addr_t)base, &regs32->r[regnum]);
-                               if (ret == -1) {
-                                       fasttrap_sigsegv(p, uthread, (user_addr_t) base, state);
-                                       new_pc = regs32->pc;
-                                       break;
-                               }
-                               base += 4;
-                       }
-                       reglist >>= 1;
-                       regnum++;
-               }
-
-               ret = fasttrap_fuword32((user_addr_t)base, &new_pc);
-               if (ret == -1) {
-                       fasttrap_sigsegv(p, uthread, (user_addr_t) base, state);
-                       new_pc = regs32->pc;
-                       break;
-               }
-               base += 4;
-
-               regs32->sp = base;
-
-               set_thumb_flag(regs32, new_pc);
-
-               break;
-       }
-
-       case FASTTRAP_T_CB_N_Z:
-       {
-               /* Thumb mode instruction, and not permitted in IT block, so skip the condition code check */
-               int rn = tp->ftt_instr1 & 0x7;
-               int offset = (((tp->ftt_instr1 & 0x00F8) >> 2) | ((tp->ftt_instr1 & 0x0200) >> 3)) + 4;
-               int nonzero = tp->ftt_instr1 & 0x0800;
-               if (!nonzero != !(regs32->r[rn] == 0)) {
-                       new_pc = pc + offset;
-               } else {
-                       new_pc = pc + instr_size;
-               }
-               break;
-       }
-
-       case FASTTRAP_T_B_COND:
-       {
-               /* Use the condition code in the instruction and ignore the ITSTATE */
-
-               int code, offset;
-               if (tp->ftt_thumb) {
-                       if (instr_size == 4) {
-                               code = (tp->ftt_instr1 >> 6) & 0xF;
-                               if (code == 14 || code == 15) {
-                                       panic("fasttrap: Emulation of invalid branch");
-                               }
-                               int S = (tp->ftt_instr1 >> 10) & 1,
-                                   J1 = (tp->ftt_instr2 >> 13) & 1,
-                                   J2 = (tp->ftt_instr2 >> 11) & 1;
-                               offset = 4 + SIGNEXTEND(
-                                       (S << 20) | (J2 << 19) | (J1 << 18) |
-                                       ((tp->ftt_instr1 & 0x003F) << 12) |
-                                       ((tp->ftt_instr2 & 0x07FF) << 1),
-                                       21);
-                       } else {
-                               code = (tp->ftt_instr1 >> 8) & 0xF;
-                               if (code == 14 || code == 15) {
-                                       panic("fasttrap: Emulation of invalid branch");
-                               }
-                               offset = 4 + (SIGNEXTEND(tp->ftt_instr1 & 0xFF, 8) << 1);
-                       }
-               } else {
-                       code = ARM_CONDCODE(tp->ftt_instr);
-                       if (code == 15) {
-                               panic("fasttrap: Emulation of invalid branch");
-                       }
-                       offset = 8 + (SIGNEXTEND(tp->ftt_instr & 0x00FFFFFF, 24) << 2);
-               }
-
-               if (condition_true(code, regs32->cpsr)) {
-                       new_pc = pc + offset;
-               } else {
-                       new_pc = pc + instr_size;
-               }
-
-               break;
-       }
-
-       case FASTTRAP_T_B_UNCOND:
-       {
-               int offset;
-
-               /* Unconditional branches can only be taken from Thumb mode */
-               /* (This is different from an ARM branch with condition code "always") */
-               ASSERT(tp->ftt_thumb == 1);
-
-               if (!condition_true(condition_code, regs32->cpsr)) {
-                       new_pc = pc + instr_size;
-                       break;
-               }
-
-               if (instr_size == 4) {
-                       int S = (tp->ftt_instr1 >> 10) & 1,
-                           J1 = (tp->ftt_instr2 >> 13) & 1,
-                           J2 = (tp->ftt_instr2 >> 11) & 1;
-                       int I1 = (J1 != S) ? 0 : 1, I2 = (J2 != S) ? 0 : 1;
-                       offset = 4 + SIGNEXTEND(
-                               (S << 24) | (I1 << 23) | (I2 << 22) |
-                               ((tp->ftt_instr1 & 0x03FF) << 12) |
-                               ((tp->ftt_instr2 & 0x07FF) << 1),
-                               25);
-               } else {
-                       uint32_t instr1 = tp->ftt_instr1;
-                       offset = 4 + (SIGNEXTEND(instr1 & 0x7FF, 11) << 1);
-               }
-
-               new_pc = pc + offset;
-
-               break;
-       }
-
-       case FASTTRAP_T_BX_REG:
-       {
-               int reg;
-
-               if (!condition_true(condition_code, regs32->cpsr)) {
-                       new_pc = pc + instr_size;
-                       break;
-               }
-
-               if (tp->ftt_thumb) {
-                       reg = THUMB16_HRM(tp->ftt_instr1);
-               } else {
-                       reg = ARM_RM(tp->ftt_instr);
-               }
-               new_pc = regs32->r[reg];
-               set_thumb_flag(regs32, new_pc);
-
-               break;
-       }
-
-       case FASTTRAP_T_LDR_PC_IMMED:
-       case FASTTRAP_T_VLDR_PC_IMMED:
-               /* Handle these instructions by replacing the PC in the instruction with another
-                * register. They are common, so we'd like to support them, and this way we do so
-                * without any risk of having to simulate a segfault.
-                */
-
-               /* Fall through */
-
-instr_emulate:
-       case FASTTRAP_T_COMMON:
-       {
-               user_addr_t addr;
-               uint8_t scratch[32];
-               uint_t i = 0;
-               fasttrap_instr_t emul_instr;
-               emul_instr.instr32 = tp->ftt_instr;
-               int emul_instr_size;
-
-               /*
-                * Unfortunately sometimes when we emulate the instruction and have to replace the
-                * PC, there is no longer a thumb mode equivalent. We end up having to run the
-                * modified instruction in ARM mode. We use this variable to keep track of which
-                * mode we should emulate in. We still use the original variable to determine
-                * what mode to return to.
-                */
-               uint8_t emul_thumb = tp->ftt_thumb;
-               int save_reg = -1;
-               uint32_t save_val = 0;
-
-               /*
-                * Dealing with condition codes and emulation:
-                * We can't just uniformly do a condition code check here because not all instructions
-                * have condition codes. We currently do not support an instruction by instruction trace,
-                * so we can assume that either: 1. We are executing a Thumb instruction, in which case
-                * we either are not in an IT block and should execute always, or we are last in an IT
-                * block. Either way, the traced instruction will run correctly, and we won't have any
-                * problems when we return to the original code, because we will no longer be in the IT
-                * block. 2. We are executing an ARM instruction, in which case we are ok as long as
-                * we don't attempt to change the condition code.
-                */
-               if (tp->ftt_type == FASTTRAP_T_LDR_PC_IMMED) {
-                       /* We know we always have a free register (the one we plan to write the
-                        * result value to!). So we'll replace the pc with that one.
-                        */
-                       int new_reg;
-                       if (tp->ftt_thumb) {
-                               /* Check to see if thumb or thumb2 */
-                               if (instr_size == 2) {
-                                       /*
-                                        * Sadness. We need to emulate this instruction in ARM mode
-                                        * because it has an 8 bit immediate offset. Instead of having
-                                        * to deal with condition codes in the ARM instruction, we'll
-                                        * just check the condition and abort if the condition is false.
-                                        */
-                                       if (!condition_true(condition_code, regs32->cpsr)) {
-                                               new_pc = pc + instr_size;
-                                               break;
-                                       }
-
-                                       new_reg = (tp->ftt_instr1 >> 8) & 0x7;
-                                       regs32->r[new_reg] = ALIGNADDR(regs32->pc + 4, 2);
-                                       emul_thumb = 0;
-                                       emul_instr.instr32 = 0xE5900000 | (new_reg << 16) | (new_reg << 12) | ((tp->ftt_instr1 & 0xFF) << 2);
-                               } else {
-                                       /* Thumb2. Just replace the register. */
-                                       new_reg = (tp->ftt_instr2 >> 12) & 0xF;
-                                       regs32->r[new_reg] = ALIGNADDR(regs32->pc + 4, 2);
-                                       emul_instr.instr16.instr1 &= ~0x000F;
-                                       emul_instr.instr16.instr1 |= new_reg;
-                               }
-                       } else {
-                               /* ARM. Just replace the register. */
-                               new_reg = (tp->ftt_instr >> 12) & 0xF;
-                               regs32->r[new_reg] = ALIGNADDR(regs32->pc + 8, 2);
-                               emul_instr.instr32 &= ~0x000F0000;
-                               emul_instr.instr32 |= new_reg << 16;
-                       }
-               } else if (tp->ftt_type == FASTTRAP_T_VLDR_PC_IMMED) {
-                       /* This instruction only uses one register, and if we're here, we know
-                        * it must be the pc. So we'll just replace it with R0.
-                        */
-                       save_reg = 0;
-                       save_val = regs32->r[0];
-                       regs32->r[save_reg] = ALIGNADDR(regs32->pc + (tp->ftt_thumb ? 4 : 8), 2);
-                       if (tp->ftt_thumb) {
-                               emul_instr.instr16.instr1 &= ~0x000F;
-                       } else {
-                               emul_instr.instr32 &= ~0x000F0000;
-                       }
-               }
-
-               emul_instr_size = dtrace_instr_size(emul_instr.instr32, emul_thumb);
-
-               /*
-                * At this point:
-                *   tp->ftt_thumb = thumb mode of original instruction
-                *   emul_thumb = thumb mode for emulation
-                *   emul_instr = instruction we are using to emulate original instruction
-                *   emul_instr_size = size of emulating instruction
-                */
-
-               addr = uthread->t_dtrace_scratch->addr;
-
-               if (addr == 0LL) {
-                       fasttrap_sigtrap(p, uthread, pc);         // Should be killing target proc
-                       new_pc = pc;
-                       break;
-               }
-
-               uthread->t_dtrace_scrpc = addr;
-               if (emul_thumb) {
-                       /*
-                        * No way to do an unconditional branch in Thumb mode, shove the address
-                        * onto the user stack and go to the next location with a pop. This can
-                        * segfault if this push happens to cross a stack page, but that's ok, since
-                        * we are running in userland, and the kernel knows how to handle userland
-                        * stack expansions correctly.
-                        *
-                        * Layout of scratch space for Thumb mode:
-                        *   Emulated instruction
-                        *   ldr save_reg, [pc, #16] (if necessary, restore any register we clobbered)
-                        *   push { r0, r1 }
-                        *   ldr r0, [pc, #4]
-                        *   str r0, [sp, #4]
-                        *   pop { r0, pc }
-                        *   Location we should return to in original program
-                        *   Saved value of clobbered register (if necessary)
-                        */
-
-                       bcopy(&emul_instr, &scratch[i], emul_instr_size); i += emul_instr_size;
-
-                       if (save_reg != -1) {
-                               uint16_t restore_inst = 0x4803;
-                               restore_inst |= (save_reg & 0x7) << 8;
-                               SET16(scratch + i, restore_inst); i += 2;               // ldr reg, [pc , #16]
-                       }
-
-                       SET16(scratch + i, 0xB403); i += 2;                             // push { r0, r1 }
-                       SET16(scratch + i, 0x4801); i += 2;                             // ldr r0, [pc, #4]
-                       SET16(scratch + i, 0x9001); i += 2;                             // str r0, [sp, #4]
-                       SET16(scratch + i, 0xBD01); i += 2;                             // pop { r0, pc }
-
-                       if (i % 4) {
-                               SET16(scratch + i, 0); i += 2;                          // padding - saved 32 bit words must be aligned
-                       }
-                       SET32(scratch + i, pc + instr_size + (tp->ftt_thumb ? 1 : 0)); i += 4;          // Return address
-                       if (save_reg != -1) {
-                               SET32(scratch + i, save_val); i += 4;                   // saved value of clobbered register
-                       }
-
-                       uthread->t_dtrace_astpc = addr + i;
-                       bcopy(&emul_instr, &scratch[i], emul_instr_size); i += emul_instr_size;
-                       SET16(scratch + i, FASTTRAP_THUMB32_RET_INSTR); i += 2;
-               } else {
-                       /*
-                        * Layout of scratch space for ARM mode:
-                        *   Emulated instruction
-                        *   ldr save_reg, [pc, #12] (if necessary, restore any register we clobbered)
-                        *   ldr pc, [pc, #4]
-                        *   Location we should return to in original program
-                        *   Saved value of clobbered register (if necessary)
-                        */
-
-                       bcopy(&emul_instr, &scratch[i], emul_instr_size); i += emul_instr_size;
-
-                       if (save_reg != -1) {
-                               uint32_t restore_inst = 0xE59F0004;
-                               restore_inst |= save_reg << 12;
-                               SET32(scratch + i, restore_inst); i += 4;               // ldr reg, [pc, #12]
-                       }
-                       SET32(scratch + i, 0xE51FF004); i += 4;                         // ldr pc, [pc, #4]
-
-                       SET32(scratch + i, pc + instr_size + (tp->ftt_thumb ? 1 : 0)); i += 4;          // Return address
-                       if (save_reg != -1) {
-                               SET32(scratch + i, save_val); i += 4;                   // Saved value of clobbered register
-                       }
-
-                       uthread->t_dtrace_astpc = addr + i;
-                       bcopy(&emul_instr, &scratch[i], emul_instr_size); i += emul_instr_size;
-                       SET32(scratch + i, FASTTRAP_ARM32_RET_INSTR); i += 4;
-               }
-
-               if (uwrite(p, scratch, i, uthread->t_dtrace_scratch->write_addr) != KERN_SUCCESS) {
-                       fasttrap_sigtrap(p, uthread, pc);
-                       new_pc = pc;
-                       break;
-               }
-
-               if (tp->ftt_retids != NULL) {
-                       uthread->t_dtrace_step = 1;
-                       uthread->t_dtrace_ret = 1;
-                       new_pc = uthread->t_dtrace_astpc + (emul_thumb ? 1 : 0);
-               } else {
-                       new_pc = uthread->t_dtrace_scrpc + (emul_thumb ? 1 : 0);
-               }
-
-               uthread->t_dtrace_pc = pc;
-               uthread->t_dtrace_npc = pc + instr_size;
-               uthread->t_dtrace_on = 1;
-               *was_simulated = 0;
-               set_thumb_flag(regs32, new_pc);
-               break;
-       }
-
-       default:
-               panic("fasttrap: mishandled an instruction");
-       }
-done:
-       set_saved_state_pc(state, new_pc);
-       return;
-}
-
 /*
  * Copy out an instruction for execution in userland.
  * Trap back to kernel to handle return to original flow of execution, because
@@ -1408,8 +591,10 @@ fasttrap_pid_probe_handle_patched_instr64(arm_saved_state_t *state, fasttrap_tra
                res2 = fasttrap_suword64(regs64->sp - 8, regs64->lr);
                if (res1 != 0 || res2 != 0) {
                        fasttrap_sigsegv(p, uthread, regs64->sp - (res1 ? 16 : 8), state);
+#ifndef DEBUG
                        new_pc = regs64->pc;         /* Bit of a hack */
                        break;
+#endif
                }
 
                /* Move stack pointer */
@@ -1479,8 +664,10 @@ fasttrap_pid_probe_handle_patched_instr64(arm_saved_state_t *state, fasttrap_tra
 
                if (copyin(address, &value, valsize) != 0) {
                        fasttrap_sigsegv(p, uthread, address, state);
+#ifndef DEBUG
                        new_pc = regs64->pc;         /* Bit of a hack, we know about update in fasttrap_sigsegv() */
                        break;
+#endif
                }
 
                /* Stash in correct register slot */
@@ -1750,7 +937,7 @@ fasttrap_pid_probe(arm_saved_state_t *state)
 
        uint64_t pc = get_saved_state_pc(state);
 
-       assert(is_64_bit || (pc <= UINT32_MAX));
+       assert(is_64_bit);
 
        uthread_t uthread = (uthread_t) get_bsdthread_info(current_thread());
 
@@ -1814,15 +1001,6 @@ fasttrap_pid_probe(arm_saved_state_t *state)
                return -1;
        }
 
-       /* Validation of THUMB-related state */
-       if (tp->ftt_thumb) {
-               if (!fasttrap_pid_probe_thumb_state_valid(saved_state32(state), tp)) {
-                       fasttrap_tracepoint_remove(p, tp);
-                       lck_mtx_unlock(pid_mtx);
-                       return -1;
-               }
-       }
-
        /* Execute the actual probe */
        if (tp->ftt_ids != NULL) {
                fasttrap_id_t *id;
@@ -1831,11 +1009,7 @@ fasttrap_pid_probe(arm_saved_state_t *state)
                if (is_saved_state64(state)) {
                        arg4 = get_saved_state_reg(state, 4);
                } else {
-                       uint32_t arg;
-                       user_addr_t stack = (user_addr_t)get_saved_state_sp(state);
-
-                       fasttrap_fuword32_noerr(stack, &arg);
-                       arg4 = arg;
+                       return -1;
                }
 
 
@@ -1853,8 +1027,8 @@ fasttrap_pid_probe(arm_saved_state_t *state)
 #endif
                        } else {
                                if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) {
-                                       uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1);
-                                       if (already_triggered) {
+                                       if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) {
+                                               /* already triggered */
                                                continue;
                                        }
                                }
@@ -1901,11 +1075,7 @@ fasttrap_pid_probe(arm_saved_state_t *state)
                                } else {
                                        uint64_t t[5];
 
-                                       if (is_64_bit) {
-                                               fasttrap_usdt_args64(probe, saved_state64(state), 5, t);
-                                       } else {
-                                               fasttrap_usdt_args32(probe, saved_state32(state), 5, t);
-                                       }
+                                       fasttrap_usdt_args64(probe, saved_state64(state), 5, t);
                                        dtrace_probe(probe->ftp_id, t[0], t[1], t[2], t[3], t[4]);
                                }
                        }
@@ -1932,11 +1102,7 @@ fasttrap_pid_probe(arm_saved_state_t *state)
         * reported at: d, b, a. The new way gives c, b, a, which is closer
         * to correct, as the return instruction has already executed.
         */
-       if (is_64_bit) {
-               fasttrap_pid_probe_handle_patched_instr64(state, tp, uthread, p, is_enabled, &was_simulated);
-       } else {
-               fasttrap_pid_probe_handle_patched_instr32(state, tp, uthread, p, is_enabled, &was_simulated);
-       }
+       fasttrap_pid_probe_handle_patched_instr64(state, tp, uthread, p, is_enabled, &was_simulated);
 
        /*
         * If there were no return probes when we first found the tracepoint,
@@ -2018,44 +1184,24 @@ fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
 #pragma unused(arg, id, parg, aframes)
        arm_saved_state_t* regs = find_user_regs(current_thread());
 
-       if (is_saved_state32(regs)) {
-               /* First four arguments are in registers */
-               if (argno < 4) {
-                       return saved_state32(regs)->r[argno];
-               }
-
-               /* Look on the stack for the rest */
-               uint32_t value;
-               uint32_t* sp = (uint32_t*)(uintptr_t) saved_state32(regs)->sp;
-               DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
-               value = dtrace_fuword32((user_addr_t) (sp + argno - 4));
-               DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
-
-               return value;
-       } else {
-               /* First eight arguments are in registers */
-               if (argno < 8) {
-                       return saved_state64(regs)->x[argno];
-               }
+       /* First eight arguments are in registers */
+       if (argno < 8) {
+               return saved_state64(regs)->x[argno];
+       }
 
-               /* Look on the stack for the rest */
-               uint64_t value;
-               uint64_t* sp = (uint64_t*) saved_state64(regs)->sp;
-               DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
-               value = dtrace_fuword64((user_addr_t) (sp + argno - 8));
-               DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+       /* Look on the stack for the rest */
+       uint64_t value;
+       uint64_t* sp = (uint64_t*) saved_state64(regs)->sp;
+       DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+       value = dtrace_fuword64((user_addr_t) (sp + argno - 8));
+       DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
 
-               return value;
-       }
+       return value;
 }
 
 uint64_t
 fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
 {
 #pragma unused(arg, id, parg, argno, aframes)
-#if 0
-       return fasttrap_anarg(ttolwp(curthread)->lwp_regs, 0, argno);
-#endif
-
        return 0;
 }
index 083f98665aff2eea07e0ec934a8f7814cd94521f..4cff0d3f6104a2b9df3f3c1c93faeba77a5fdcdc 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  */
 /*
  * CDDL HEADER START
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)fbt.c      1.15    05/09/19 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL                 /* Solaris vs. Darwin */
-#endif
-#endif
-
-#define MACH__POSIX_C_SOURCE_PRIVATE 1  /* pulls in suitable savearea from
-                                        * mach/ppc/thread_status.h */
 #include <kern/thread.h>
 #include <mach/thread_status.h>
 #include <arm/proc_reg.h>
@@ -231,7 +221,7 @@ fbt_perfCallback(
        if (FBT_EXCEPTION_CODE == trapno && !IS_USER_TRAP(regs)) {
                boolean_t oldlevel = 0;
                machine_inst_t emul = 0;
-               uint64_t sp, pc, lr, imm;
+               uint64_t sp, lr, imm;
 
                oldlevel = ml_set_interrupts_enabled(FALSE);
 
@@ -259,8 +249,7 @@ fbt_perfCallback(
                        /*
                         * Skip over the patched NOP planted by sdt
                         */
-                       pc = get_saved_state_pc(regs);
-                       set_saved_state_pc(regs, pc + DTRACE_INVOP_NOP_SKIP);
+                       add_saved_state_pc(regs, DTRACE_INVOP_NOP_SKIP);
                        retval = KERN_SUCCESS;
                } else if (FBT_IS_ARM64_ADD_FP_SP(emul)) {
                        /* retrieve the value to add */
@@ -278,8 +267,7 @@ fbt_perfCallback(
                        set_saved_state_fp(regs, sp + val);
 
                        /* skip over the bytes of the patched instruction */
-                       pc = get_saved_state_pc(regs);
-                       set_saved_state_pc(regs, pc + DTRACE_INVOP_ADD_FP_SP_SKIP);
+                       add_saved_state_pc(regs, DTRACE_INVOP_ADD_FP_SP_SKIP);
 
                        retval = KERN_SUCCESS;
                } else if (FBT_IS_ARM64_RET(emul)) {
@@ -290,9 +278,8 @@ fbt_perfCallback(
                        set_saved_state_pc(regs, lr);
                        retval = KERN_SUCCESS;
                } else if (FBT_IS_ARM64_B_INSTR(emul)) {
-                       pc = get_saved_state_pc(regs);
                        imm = FBT_GET_ARM64_B_IMM(emul);
-                       set_saved_state_pc(regs, pc + imm);
+                       add_saved_state_pc(regs, imm);
                        retval = KERN_SUCCESS;
                } else if (emul == FBT_PATCHVAL) {
                        /* Means we encountered an error but handled it, try same inst again */
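
The repeated get_saved_state_pc()/set_saved_state_pc() pairs collapse into add_saved_state_pc(), which advances the saved pc in one call. Beyond brevity, routing every pc update through one accessor is convenient on arm64 variants where the saved pc may carry a pointer-authentication signature, so any strip/re-sign logic lives in a single place (a plausible motivation; the diff itself does not say). The helper's shape, sketched:

    /* Sketch; the real accessor lives with the arm64 thread-state code. */
    static inline void
    add_saved_state_pc_sketch(arm_saved_state_t *regs, int diff)
    {
            set_saved_state_pc(regs, get_saved_state_pc(regs) + diff);
    }
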
index 598bd05b7f192d1a97583614ac81d62ed2408d1d..a4b65f887c2d0797e082470a5864494a2b03a7dd 100644 (file)
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)sdt.c      1.6     06/03/24 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
 #include <kern/cpu_data.h>
 #include <kern/debug.h>
 #include <kern/thread.h>
index e2715281d2132e024a88d102452db9b8792ff11b..d67aa4a0b386025db2c5e27af85319511b869e7e 100644 (file)
@@ -24,14 +24,8 @@ static int
 sysctl_time_since_reset SYSCTL_HANDLER_ARGS
 {
 #pragma unused(arg1, arg2, oidp)
-       int error = 0;
-       uint64_t return_value = 0;
-
-       return_value = ml_get_time_since_reset();
-
-       SYSCTL_OUT(req, &return_value, sizeof(return_value));
-
-       return error;
+       uint64_t return_value = ml_get_time_since_reset();
+       return SYSCTL_OUT(req, &return_value, sizeof(return_value));
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, time_since_reset,
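
The rewritten handlers drop the dead error variable: SYSCTL_OUT() already yields the errno-style result, so returning it directly is equivalent and shorter. A user-space consumer might read the node like this (a sketch):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/sysctl.h>

    int
    main(void)
    {
            uint64_t v = 0;
            size_t len = sizeof(v);

            if (sysctlbyname("machdep.time_since_reset", &v, &len, NULL, 0) != 0) {
                    perror("sysctlbyname");
                    return 1;
            }
            printf("time since reset: %llu\n", (unsigned long long)v);
            return 0;
    }
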
@@ -43,14 +37,8 @@ static int
 sysctl_wake_conttime SYSCTL_HANDLER_ARGS
 {
 #pragma unused(arg1, arg2, oidp)
-       int error = 0;
-       uint64_t return_value = 0;
-
-       return_value = ml_get_conttime_wake_time();
-
-       SYSCTL_OUT(req, &return_value, sizeof(return_value));
-
-       return error;
+       uint64_t return_value = ml_get_conttime_wake_time();
+       return SYSCTL_OUT(req, &return_value, sizeof(return_value));
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, wake_conttime,
@@ -185,9 +173,125 @@ SYSCTL_INT(_machdep, OID_AUTO, lck_mtx_adaptive_spin_mode,
     CTLFLAG_RW, &lck_mtx_adaptive_spin_mode, 0,
     "Enable adaptive spin behavior for kernel mutexes");
 
+
 #if DEVELOPMENT || DEBUG
 extern uint64_t TLockTimeOut;
 SYSCTL_QUAD(_machdep, OID_AUTO, tlto,
     CTLFLAG_RW | CTLFLAG_LOCKED, &TLockTimeOut,
     "Ticket spinlock timeout (MATUs): use with care");
+
+static int
+sysctl_sysreg_vbar_el1 SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       uint64_t return_value = __builtin_arm_rsr64("VBAR_EL1");
+       return SYSCTL_OUT(req, &return_value, sizeof(return_value));
+}
+
+/*
+ * machdep.cpu.sysreg_vbar_el1
+ *
+ * ARM64: Vector Base Address Register.
+ * Read from the current CPU's system registers.
+ */
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_vbar_el1,
+    CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED,
+    0, 0, sysctl_sysreg_vbar_el1, "Q",
+    "VBAR_EL1 register on the current CPU");
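
Each of these DEVELOPMENT/DEBUG-only nodes follows the same template. __builtin_arm_rsr64() is the clang builtin that compiles to a single mrs read of the named system register, so the reported value describes whichever CPU happens to service the sysctl, as the comments note. In isolation:

    #include <stdint.h>

    /* EL1 (kernel) context only: EL0 reads of these registers trap.
     * Requires clang targeting arm64. */
    static inline uint64_t
    read_vbar_el1(void)
    {
            return __builtin_arm_rsr64("VBAR_EL1");     /* mrs xN, VBAR_EL1 */
    }
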
+
+static int
+sysctl_sysreg_mair_el1 SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       uint64_t return_value = __builtin_arm_rsr64("MAIR_EL1");
+       return SYSCTL_OUT(req, &return_value, sizeof(return_value));
+}
+
+/*
+ * machdep.cpu.sysreg_mair_el1
+ *
+ * ARM64: Memory Attribute Indirection Register.
+ * Read from the current CPU's system registers.
+ */
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_mair_el1,
+    CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED,
+    0, 0, sysctl_sysreg_mair_el1, "Q",
+    "MAIR_EL1 register on the current CPU");
+
+static int
+sysctl_sysreg_ttbr1_el1 SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       uint64_t return_value = __builtin_arm_rsr64("TTBR1_EL1");
+       return SYSCTL_OUT(req, &return_value, sizeof(return_value));
+}
+
+/*
+ * machdep.cpu.sysreg_ttbr1_el1
+ *
+ * ARM64: Translation table base register 1.
+ * Read from the current CPU's system registers.
+ */
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_ttbr1_el1,
+    CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED,
+    0, 0, sysctl_sysreg_ttbr1_el1, "Q",
+    "TTBR1_EL1 register on the current CPU");
+
+static int
+sysctl_sysreg_sctlr_el1 SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       uint64_t return_value = __builtin_arm_rsr64("SCTLR_EL1");
+       return SYSCTL_OUT(req, &return_value, sizeof(return_value));
+}
+
+/*
+ * machdep.cpu.sysreg_sctlr_el1
+ *
+ * ARM64: System Control Register.
+ * Read from the current CPU's system registers.
+ */
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_sctlr_el1,
+    CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED,
+    0, 0, sysctl_sysreg_sctlr_el1, "Q",
+    "SCTLR_EL1 register on the current CPU");
+
+static int
+sysctl_sysreg_tcr_el1 SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       uint64_t return_value = __builtin_arm_rsr64("TCR_EL1");
+       return SYSCTL_OUT(req, &return_value, sizeof(return_value));
+}
+
+/*
+ * machdep.cpu.sysreg_tcr_el1
+ *
+ * ARM64: Translation Control Register.
+ * Read from the current CPU's system registers.
+ */
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_tcr_el1,
+    CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED,
+    0, 0, sysctl_sysreg_tcr_el1, "Q",
+    "TCR_EL1 register on the current CPU");
+
+static int
+sysctl_sysreg_id_aa64mmfr0_el1 SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       uint64_t return_value = __builtin_arm_rsr64("ID_AA64MMFR0_EL1");
+       return SYSCTL_OUT(req, &return_value, sizeof(return_value));
+}
+
+/*
+ * machdep.cpu.sysreg_id_aa64mmfr0_el1
+ *
+ * ARM64: AArch64 Memory Model Feature Register 0.
+ * Read from the current CPU's system registers.
+ */
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, sysreg_id_aa64mmfr0_el1,
+    CTLFLAG_RD | CTLTYPE_QUAD | CTLFLAG_LOCKED,
+    0, 0, sysctl_sysreg_id_aa64mmfr0_el1, "Q",
+    "ID_AA64MMFR0_EL1 register on the current CPU");
+
 #endif
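
Note: these sysreg nodes are compiled in only under DEVELOPMENT || DEBUG, per
the surrounding #if. A userland sketch for reading one of them (node name
taken from the diff above; expect ENOENT on kernels built without these
nodes):

    #include <stdio.h>
    #include <stdint.h>
    #include <sys/sysctl.h>

    int
    main(void)
    {
        uint64_t vbar = 0;
        size_t len = sizeof(vbar);

        if (sysctlbyname("machdep.cpu.sysreg_vbar_el1", &vbar, &len, NULL, 0) == 0)
            printf("VBAR_EL1 = 0x%llx\n", (unsigned long long)vbar);
        return 0;
    }
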
index 180d30ffb9b39cbe1a90fdc935cb2af1c1a3c664..6d219f95de5c0c05f6b15b85f41100c5b1864b5c 100644 (file)
  * $FreeBSD: src/sys/kern/subr_blist.c,v 1.5.2.1 2000/03/17 10:47:29 ps Exp $
  */
 
-#if !defined(__APPLE__)
-#ifdef _KERNEL
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/lock.h>
-#include <sys/kernel.h>
-#include <sys/blist.h>
-#include <sys/malloc.h>
-#include <vm/vm.h>
-#include <vm/vm_object.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_extern.h>
-#include <vm/vm_page.h>
-
-#else
-
-#ifndef BLIST_NO_DEBUG
-#define BLIST_DEBUG
-#endif
-
-#define SWAPBLK_NONE ((daddr_t)-1)
-
-#include <sys/types.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <stdarg.h>
-
-#define malloc(a, b, c)   malloc(a)
-#define free(a, b)       free(a)
-
-typedef unsigned int u_daddr_t;
-
-#include <sys/blist.h>
-
-void panic(const char *ctl, ...);
-
-#endif
-#else /* is MacOS X */
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
 typedef unsigned int u_daddr_t;
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
-/* #include <sys/blist.h> */
 #include "blist.h"
 #include <sys/malloc.h>
 
@@ -123,8 +76,6 @@ typedef unsigned int u_daddr_t;
 #define free _FREE
 #define M_SWAP M_TEMP
 
-#endif /* __APPLE__ */
-
 /*
  * static support functions
  */
@@ -139,16 +90,6 @@ static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
     daddr_t skip, blist_t dest, daddr_t count);
 static daddr_t  blst_radix_init(blmeta_t *scan, daddr_t radix,
     int skip, daddr_t count);
-#ifndef _KERNEL
-static void     blst_radix_print(blmeta_t *scan, daddr_t blk,
-    daddr_t radix, int skip, int tab);
-#endif
-
-#if !defined(__APPLE__)
-#ifdef _KERNEL
-static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
-#endif
-#endif /* __APPLE__ */
 
 /*
  * blist_create() - create a blist capable of handling up to the specified
index 6de3c98c45a0315939148d531895e83a2bf5374c..9cc3b6094762fd468df658ceb333d3aba4bc458f 100644 (file)
@@ -29,8 +29,6 @@
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)dtrace.c   1.65    08/07/02 SMI" */
-
 /*
  * DTrace - Dynamic Tracing for Solaris
  *
@@ -75,6 +73,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/conf.h>
+#include <sys/random.h>
 #include <sys/systm.h>
 #include <sys/dtrace_impl.h>
 #include <sys/param.h>
 #include <machine/monotonic.h>
 #endif /* MONOTONIC */
 
+#include "dtrace_xoroshiro128_plus.h"
+
 #include <IOKit/IOPlatformExpert.h>
 
 #include <kern/cpu_data.h>
@@ -112,6 +113,7 @@ extern uint32_t pmap_find_phys(void *, uint64_t);
 extern boolean_t pmap_valid_page(uint32_t);
 extern void OSKextRegisterKextsWithDTrace(void);
 extern kmod_info_t g_kernel_kmod_info;
+extern void commpage_update_dof(boolean_t enabled);
 
 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
@@ -182,12 +184,12 @@ dtrace_optval_t dtrace_jstackstrsize_default = 512;
 dtrace_optval_t dtrace_buflimit_default = 75;
 dtrace_optval_t dtrace_buflimit_min = 1;
 dtrace_optval_t dtrace_buflimit_max = 99;
+size_t         dtrace_nprobes_default = 4;
 int            dtrace_msgdsize_max = 128;
 hrtime_t       dtrace_chill_max = 500 * (NANOSEC / MILLISEC);  /* 500 ms */
 hrtime_t       dtrace_chill_interval = NANOSEC;                /* 1000 ms */
 int            dtrace_devdepth_max = 32;
 int            dtrace_err_verbose;
-int            dtrace_provide_private_probes = 0;
 hrtime_t       dtrace_deadman_interval = NANOSEC;
 hrtime_t       dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
 hrtime_t       dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
@@ -855,46 +857,16 @@ SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
        &dtrace_statvar_maxsize, 0,
        sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");
 
-static int
-sysctl_dtrace_provide_private_probes SYSCTL_HANDLER_ARGS
-{
-#pragma unused(oidp, arg2)
-       int error;
-       int value = *(int *) arg1;
-
-       error = sysctl_io_number(req, value, sizeof(value), &value, NULL);
-       if (error)
-               return (error);
-
-       if (req->newptr) {
-               if (value != 0 && value != 1)
-                       return (ERANGE);
-
-               /*
-                * We do not allow changing this back to zero, as private probes
-                * would still be left registered
-                */
-               if (value != 1)
-                       return (EPERM);
-
-               lck_mtx_lock(&dtrace_lock);
-               dtrace_provide_private_probes = value;
-               lck_mtx_unlock(&dtrace_lock);
-       }
-       return (0);
-}
 
 /*
  * kern.dtrace.provide_private_probes
  *
  * Set whether the providers must provide the private probes.  This is
- * mainly used by the FBT provider to request probes for the private/static
- * symbols.
+ * kept for compatibility; private probes are now always provided.
  */
-SYSCTL_PROC(_kern_dtrace, OID_AUTO, provide_private_probes,
-       CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
-       &dtrace_provide_private_probes, 0,
-       sysctl_dtrace_provide_private_probes, "I", "provider must provide the private probes");
+SYSCTL_INT(_kern_dtrace, OID_AUTO, provide_private_probes,
+       CTLFLAG_RD | CTLFLAG_LOCKED,
+       (int *)NULL, 1, "provider must provide the private probes");
 
 /*
  * kern.dtrace.dof_mode
@@ -1293,11 +1265,72 @@ dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
                vstate));
 }
 
+#define        isdigit(ch)     ((ch) >= '0' && (ch) <= '9')
+#define        islower(ch)     ((ch) >= 'a' && (ch) <= 'z')
+#define        isspace(ch)     (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \
+                       ((ch) == '\t') || ((ch) == '\f'))
+#define        isxdigit(ch)    (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
+                       ((ch) >= 'A' && (ch) <= 'F'))
+#define        lisalnum(x)     \
+       (isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z'))
+
+#define        DIGIT(x)        \
+       (isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')
+
+/*
+ * Convert a string to a signed integer using safe loads.
+ */
+static int64_t
+dtrace_strtoll(char *input, int base, size_t limit)
+{
+       uintptr_t pos = (uintptr_t)input;
+       int64_t val = 0;
+       int x;
+       boolean_t neg = B_FALSE;
+       char c, cc, ccc;
+       uintptr_t end = pos + limit;
+
+       /*
+        * Consume any whitespace preceding digits.
+        */
+       while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
+               pos++;
+
+       /*
+        * Handle an explicit sign if one is present.
+        */
+       if (c == '-' || c == '+') {
+               if (c == '-')
+                       neg = B_TRUE;
+               c = dtrace_load8(++pos);
+       }
+
+       /*
+        * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
+        * if present.
+        */
+       if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
+           cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
+               pos += 2;
+               c = ccc;
+       }
+
+       /*
+        * Read in contiguous digits until the first non-digit character.
+        */
+       for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
+           c = dtrace_load8(++pos))
+               val = val * base + x;
+
+       return (neg ? -val : val);
+}
+
+
 /*
  * Compare two strings using safe loads.
  */
 static int
-dtrace_strncmp(char *s1, char *s2, size_t limit)
+dtrace_strncmp(const char *s1, const char *s2, size_t limit)
 {
        uint8_t c1, c2;
        volatile uint16_t *flags;
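
Note: the DIGIT() macro above maps '0'-'9', 'a'-'z' and 'A'-'Z' onto 0-35,
which is what lets the strtoll() subroutine added later in this diff accept
any base up to 36. A standalone sketch, with the macros re-declared for
illustration:

    #include <assert.h>

    /* Re-declared from the diff above, for illustration only. */
    #define isdigit(ch) ((ch) >= '0' && (ch) <= '9')
    #define islower(ch) ((ch) >= 'a' && (ch) <= 'z')
    #define DIGIT(x) \
        (isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')

    int
    main(void)
    {
        assert(DIGIT('7') == 7);
        assert(DIGIT('a') == 10);
        assert(DIGIT('F') == 15);
        /* In base 36, "z1" would parse as 35 * 36 + 1 == 1261. */
        return 0;
    }
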
@@ -3273,10 +3306,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
                ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
                if (ndx >= sizeof (mstate->dtms_arg) /
                    sizeof (mstate->dtms_arg[0])) {
-                       /*
-                        * APPLE NOTE: Account for introduction of __dtrace_probe()
-                        */
-                       int aframes = mstate->dtms_probe->dtpr_aframes + 3;
+                       int aframes = mstate->dtms_probe->dtpr_aframes + 2;
                        dtrace_vstate_t *vstate = &state->dts_vstate;
                        dtrace_provider_t *pv;
                        uint64_t val;
@@ -3382,10 +3412,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
                if (!dtrace_priv_kernel(state))
                        return (0);
                if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
-                       /*
-                        * APPLE NOTE: Account for introduction of __dtrace_probe()
-                        */
-                       int aframes = mstate->dtms_probe->dtpr_aframes + 3;
+                       int aframes = mstate->dtms_probe->dtpr_aframes + 2;
 
                        mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
                        mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
@@ -3416,10 +3443,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
                if (!dtrace_priv_kernel(state))
                        return (0);
                if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
-                       /*
-                        * APPLE NOTE: Account for introduction of __dtrace_probe()
-                        */
-                       int aframes = mstate->dtms_probe->dtpr_aframes + 3;
+                       int aframes = mstate->dtms_probe->dtpr_aframes + 2;
 
                        if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
                                /*
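
Note: the aframes adjustments in the three hunks above (from +3 back to the
upstream +2) track the removal of the __dtrace_probe() wrapper later in this
diff: with the wrapper folded into dtrace_probe() itself, one artificial
frame between the provider and the stack walk presumably disappears.
Schematically:

    /* Before: provider -> dtrace_probe() -> __dtrace_probe() -> stack walk
     * After:  provider -> dtrace_probe() ----------------------> stack walk */
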
@@ -3663,6 +3687,458 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
        }
 }
 
+typedef enum dtrace_json_state {
+       DTRACE_JSON_REST = 1,
+       DTRACE_JSON_OBJECT,
+       DTRACE_JSON_STRING,
+       DTRACE_JSON_STRING_ESCAPE,
+       DTRACE_JSON_STRING_ESCAPE_UNICODE,
+       DTRACE_JSON_COLON,
+       DTRACE_JSON_COMMA,
+       DTRACE_JSON_VALUE,
+       DTRACE_JSON_IDENTIFIER,
+       DTRACE_JSON_NUMBER,
+       DTRACE_JSON_NUMBER_FRAC,
+       DTRACE_JSON_NUMBER_EXP,
+       DTRACE_JSON_COLLECT_OBJECT
+} dtrace_json_state_t;
+
+/*
+ * This function possesses just enough knowledge about JSON to extract a single
+ * value from a JSON string and store it in the scratch buffer.  It is able
+ * to extract nested object values, and members of arrays by index.
+ *
+ * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
+ * be looked up as we descend into the object tree.  e.g.
+ *
+ *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
+ *       with nelems = 5.
+ *
+ * The run time of this function must be bounded above by strsize to limit the
+ * amount of work done in probe context.  As such, it is implemented as a
+ * simple state machine, reading one character at a time using safe loads
+ * until we find the requested element, hit a parsing error or run off the
+ * end of the object or string.
+ *
+ * As there is no way for a subroutine to return an error without interrupting
+ * clause execution, we simply return NULL in the event of a missing key or any
+ * other error condition.  Each NULL return in this function is commented with
+ * the error condition it represents -- parsing or otherwise.
+ *
+ * The set of states for the state machine closely matches the JSON
+ * specification (http://json.org/).  Briefly:
+ *
+ *   DTRACE_JSON_REST:
+ *     Skip whitespace until we find either a top-level Object, moving
+ *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
+ *
+ *   DTRACE_JSON_OBJECT:
+ *     Locate the next key String in an Object.  Sets a flag to denote
+ *     the next String as a key string and moves to DTRACE_JSON_STRING.
+ *
+ *   DTRACE_JSON_COLON:
+ *     Skip whitespace until we find the colon that separates key Strings
+ *     from their values.  Once found, move to DTRACE_JSON_VALUE.
+ *
+ *   DTRACE_JSON_VALUE:
+ *     Detects the type of the next value (String, Number, Identifier, Object
+ *     or Array) and routes to the states that process that type.  Here we also
+ *     deal with the element selector list if we are requested to traverse down
+ *     into the object tree.
+ *
+ *   DTRACE_JSON_COMMA:
+ *     Skip whitespace until we find the comma that separates key-value pairs
+ *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
+ *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
+ *     states return to this state at the end of their value, unless otherwise
+ *     noted.
+ *
+ *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
+ *     Processes a Number literal from the JSON, including any exponent
+ *     component that may be present.  Numbers are returned as strings, which
+ *     may be passed to strtoll() if an integer is required.
+ *
+ *   DTRACE_JSON_IDENTIFIER:
+ *     Processes a "true", "false" or "null" literal in the JSON.
+ *
+ *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
+ *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
+ *     Processes a String literal from the JSON, whether the String denotes
+ *     a key, a value or part of a larger Object.  Handles all escape sequences
+ *     present in the specification, including four-digit unicode characters,
+ *     but merely includes the escape sequence without converting it to the
+ *     actual escaped character.  If the String is flagged as a key, we
+ *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
+ *
+ *   DTRACE_JSON_COLLECT_OBJECT:
+ *     This state collects an entire Object (or Array), correctly handling
+ *     embedded strings.  If the full element selector list matches this nested
+ *     object, we return the Object in full as a string.  If not, we use this
+ *     state to skip to the next value at this level and continue processing.
+ */
+static char *
+dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
+    char *dest)
+{
+       dtrace_json_state_t state = DTRACE_JSON_REST;
+       int64_t array_elem = INT64_MIN;
+       int64_t array_pos = 0;
+       uint8_t escape_unicount = 0;
+       boolean_t string_is_key = B_FALSE;
+       boolean_t collect_object = B_FALSE;
+       boolean_t found_key = B_FALSE;
+       boolean_t in_array = B_FALSE;
+       uint32_t braces = 0, brackets = 0;
+       char *elem = elemlist;
+       char *dd = dest;
+       uintptr_t cur;
+
+       for (cur = json; cur < json + size; cur++) {
+               char cc = dtrace_load8(cur);
+               if (cc == '\0')
+                       return (NULL);
+
+               switch (state) {
+               case DTRACE_JSON_REST:
+                       if (isspace(cc))
+                               break;
+
+                       if (cc == '{') {
+                               state = DTRACE_JSON_OBJECT;
+                               break;
+                       }
+
+                       if (cc == '[') {
+                               in_array = B_TRUE;
+                               array_pos = 0;
+                               array_elem = dtrace_strtoll(elem, 10, size);
+                               found_key = array_elem == 0 ? B_TRUE : B_FALSE;
+                               state = DTRACE_JSON_VALUE;
+                               break;
+                       }
+
+                       /*
+                        * ERROR: expected to find a top-level object or array.
+                        */
+                       return (NULL);
+               case DTRACE_JSON_OBJECT:
+                       if (isspace(cc))
+                               break;
+
+                       if (cc == '"') {
+                               state = DTRACE_JSON_STRING;
+                               string_is_key = B_TRUE;
+                               break;
+                       }
+
+                       /*
+                        * ERROR: either the object did not start with a key
+                        * string, or we've run off the end of the object
+                        * without finding the requested key.
+                        */
+                       return (NULL);
+               case DTRACE_JSON_STRING:
+                       if (cc == '\\') {
+                               *dd++ = '\\';
+                               state = DTRACE_JSON_STRING_ESCAPE;
+                               break;
+                       }
+
+                       if (cc == '"') {
+                               if (collect_object) {
+                                       /*
+                                        * We don't reset the dest here, as
+                                        * the string is part of a larger
+                                        * object being collected.
+                                        */
+                                       *dd++ = cc;
+                                       collect_object = B_FALSE;
+                                       state = DTRACE_JSON_COLLECT_OBJECT;
+                                       break;
+                               }
+                               *dd = '\0';
+                               dd = dest; /* reset string buffer */
+                               if (string_is_key) {
+                                       if (dtrace_strncmp(dest, elem,
+                                           size) == 0)
+                                               found_key = B_TRUE;
+                               } else if (found_key) {
+                                       if (nelems > 1) {
+                                               /*
+                                                * We expected an object, not
+                                                * this string.
+                                                */
+                                               return (NULL);
+                                       }
+                                       return (dest);
+                               }
+                               state = string_is_key ? DTRACE_JSON_COLON :
+                                   DTRACE_JSON_COMMA;
+                               string_is_key = B_FALSE;
+                               break;
+                       }
+
+                       *dd++ = cc;
+                       break;
+               case DTRACE_JSON_STRING_ESCAPE:
+                       *dd++ = cc;
+                       if (cc == 'u') {
+                               escape_unicount = 0;
+                               state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
+                       } else {
+                               state = DTRACE_JSON_STRING;
+                       }
+                       break;
+               case DTRACE_JSON_STRING_ESCAPE_UNICODE:
+                       if (!isxdigit(cc)) {
+                               /*
+                                * ERROR: invalid unicode escape, expected
+                                * four valid hexadecimal digits.
+                                */
+                               return (NULL);
+                       }
+
+                       *dd++ = cc;
+                       if (++escape_unicount == 4)
+                               state = DTRACE_JSON_STRING;
+                       break;
+               case DTRACE_JSON_COLON:
+                       if (isspace(cc))
+                               break;
+
+                       if (cc == ':') {
+                               state = DTRACE_JSON_VALUE;
+                               break;
+                       }
+
+                       /*
+                        * ERROR: expected a colon.
+                        */
+                       return (NULL);
+               case DTRACE_JSON_COMMA:
+                       if (isspace(cc))
+                               break;
+
+                       if (cc == ',') {
+                               if (in_array) {
+                                       state = DTRACE_JSON_VALUE;
+                                       if (++array_pos == array_elem)
+                                               found_key = B_TRUE;
+                               } else {
+                                       state = DTRACE_JSON_OBJECT;
+                               }
+                               break;
+                       }
+
+                       /*
+                        * ERROR: either we hit an unexpected character, or
+                        * we reached the end of the object or array without
+                        * finding the requested key.
+                        */
+                       return (NULL);
+               case DTRACE_JSON_IDENTIFIER:
+                       if (islower(cc)) {
+                               *dd++ = cc;
+                               break;
+                       }
+
+                       *dd = '\0';
+                       dd = dest; /* reset string buffer */
+
+                       if (dtrace_strncmp(dest, "true", 5) == 0 ||
+                           dtrace_strncmp(dest, "false", 6) == 0 ||
+                           dtrace_strncmp(dest, "null", 5) == 0) {
+                               if (found_key) {
+                                       if (nelems > 1) {
+                                               /*
+                                                * ERROR: We expected an object,
+                                                * not this identifier.
+                                                */
+                                               return (NULL);
+                                       }
+                                       return (dest);
+                               } else {
+                                       cur--;
+                                       state = DTRACE_JSON_COMMA;
+                                       break;
+                               }
+                       }
+
+                       /*
+                        * ERROR: we did not recognise the identifier as one
+                        * of those in the JSON specification.
+                        */
+                       return (NULL);
+               case DTRACE_JSON_NUMBER:
+                       if (cc == '.') {
+                               *dd++ = cc;
+                               state = DTRACE_JSON_NUMBER_FRAC;
+                               break;
+                       }
+
+                       if (cc == 'x' || cc == 'X') {
+                               /*
+                                * ERROR: specification explicitly excludes
+                                * hexadecimal or octal numbers.
+                                */
+                               return (NULL);
+                       }
+
+                       /* FALLTHRU */
+               case DTRACE_JSON_NUMBER_FRAC:
+                       if (cc == 'e' || cc == 'E') {
+                               *dd++ = cc;
+                               state = DTRACE_JSON_NUMBER_EXP;
+                               break;
+                       }
+
+                       if (cc == '+' || cc == '-') {
+                               /*
+                                * ERROR: expect sign as part of exponent only.
+                                */
+                               return (NULL);
+                       }
+                       /* FALLTHRU */
+               case DTRACE_JSON_NUMBER_EXP:
+                       if (isdigit(cc) || cc == '+' || cc == '-') {
+                               *dd++ = cc;
+                               break;
+                       }
+
+                       *dd = '\0';
+                       dd = dest; /* reset string buffer */
+                       if (found_key) {
+                               if (nelems > 1) {
+                                       /*
+                                        * ERROR: We expected an object, not
+                                        * this number.
+                                        */
+                                       return (NULL);
+                               }
+                               return (dest);
+                       }
+
+                       cur--;
+                       state = DTRACE_JSON_COMMA;
+                       break;
+               case DTRACE_JSON_VALUE:
+                       if (isspace(cc))
+                               break;
+
+                       if (cc == '{' || cc == '[') {
+                               if (nelems > 1 && found_key) {
+                                       in_array = cc == '[' ? B_TRUE : B_FALSE;
+                                       /*
+                                        * If our element selector directs us
+                                        * to descend into this nested object,
+                                        * then move to the next selector
+                                        * element in the list and restart the
+                                        * state machine.
+                                        */
+                                       while (*elem != '\0')
+                                               elem++;
+                                       elem++; /* skip the inter-element NUL */
+                                       nelems--;
+                                       dd = dest;
+                                       if (in_array) {
+                                               state = DTRACE_JSON_VALUE;
+                                               array_pos = 0;
+                                               array_elem = dtrace_strtoll(
+                                                   elem, 10, size);
+                                               found_key = array_elem == 0 ?
+                                                   B_TRUE : B_FALSE;
+                                       } else {
+                                               found_key = B_FALSE;
+                                               state = DTRACE_JSON_OBJECT;
+                                       }
+                                       break;
+                               }
+
+                               /*
+                                * Otherwise, we wish to either skip this
+                                * nested object or return it in full.
+                                */
+                               if (cc == '[')
+                                       brackets = 1;
+                               else
+                                       braces = 1;
+                               *dd++ = cc;
+                               state = DTRACE_JSON_COLLECT_OBJECT;
+                               break;
+                       }
+
+                       if (cc == '"') {
+                               state = DTRACE_JSON_STRING;
+                               break;
+                       }
+
+                       if (islower(cc)) {
+                               /*
+                                * Here we deal with true, false and null.
+                                */
+                               *dd++ = cc;
+                               state = DTRACE_JSON_IDENTIFIER;
+                               break;
+                       }
+
+                       if (cc == '-' || isdigit(cc)) {
+                               *dd++ = cc;
+                               state = DTRACE_JSON_NUMBER;
+                               break;
+                       }
+
+                       /*
+                        * ERROR: unexpected character at start of value.
+                        */
+                       return (NULL);
+               case DTRACE_JSON_COLLECT_OBJECT:
+                       if (cc == '\0')
+                               /*
+                                * ERROR: unexpected end of input.
+                                */
+                               return (NULL);
+
+                       *dd++ = cc;
+                       if (cc == '"') {
+                               collect_object = B_TRUE;
+                               state = DTRACE_JSON_STRING;
+                               break;
+                       }
+
+                       if (cc == ']') {
+                               if (brackets-- == 0) {
+                                       /*
+                                        * ERROR: unbalanced brackets.
+                                        */
+                                       return (NULL);
+                               }
+                       } else if (cc == '}') {
+                               if (braces-- == 0) {
+                                       /*
+                                        * ERROR: unbalanced braces.
+                                        */
+                                       return (NULL);
+                               }
+                       } else if (cc == '{') {
+                               braces++;
+                       } else if (cc == '[') {
+                               brackets++;
+                       }
+
+                       if (brackets == 0 && braces == 0) {
+                               if (found_key) {
+                                       *dd = '\0';
+                                       return (dest);
+                               }
+                               dd = dest; /* reset string buffer */
+                               state = DTRACE_JSON_COMMA;
+                       }
+                       break;
+               }
+       }
+       return (NULL);
+}
+
 /*
  * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
  * Notice that we don't bother validating the proper number of arguments or
@@ -3695,7 +4171,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
 
        switch (subr) {
        case DIF_SUBR_RAND:
-               regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
+               regs[rd] = dtrace_xoroshiro128_plus_next(
+                   state->dts_rstate[CPU->cpu_id]);
                break;
 
 #if !defined(__APPLE__)
@@ -4421,6 +4898,29 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
                break;
        }
 
+       case DIF_SUBR_STRTOLL: {
+               uintptr_t s = tupregs[0].dttk_value;
+               uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+               size_t lim;
+               int base = 10;
+
+               if (nargs > 1) {
+                       if ((base = tupregs[1].dttk_value) <= 1 ||
+                           base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
+                               *flags |= CPU_DTRACE_ILLOP;
+                               break;
+                       }
+               }
+
+               if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
+                       regs[rd] = INT64_MIN;
+                       break;
+               }
+
+               regs[rd] = dtrace_strtoll((char *)s, base, lim);
+               break;
+       }
+
        case DIF_SUBR_LLTOSTR: {
                int64_t i = (int64_t)tupregs[0].dttk_value;
                uint64_t val, digit;
@@ -4976,6 +5476,65 @@ inetout: regs[rd] = (uintptr_t)end + 1;
                break;
        }
 
+       case DIF_SUBR_JSON: {
+               uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+               uintptr_t json = tupregs[0].dttk_value;
+               size_t jsonlen = dtrace_strlen((char *)json, size);
+               uintptr_t elem = tupregs[1].dttk_value;
+               size_t elemlen = dtrace_strlen((char *)elem, size);
+
+               char *dest = (char *)mstate->dtms_scratch_ptr;
+               char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
+               char *ee = elemlist;
+               int nelems = 1;
+               uintptr_t cur;
+
+               if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
+                   !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
+                       regs[rd] = 0;
+                       break;
+               }
+
+               if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
+                       DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+                       regs[rd] = 0;
+                       break;
+               }
+
+               /*
+                * Read the element selector and split it up into a packed list
+                * of strings.
+                */
+               for (cur = elem; cur < elem + elemlen; cur++) {
+                       char cc = dtrace_load8(cur);
+
+                       if (cur == elem && cc == '[') {
+                               /*
+                                * If the first element selector key is
+                                * actually an array index then ignore the
+                                * bracket.
+                                */
+                               continue;
+                       }
+
+                       if (cc == ']')
+                               continue;
+
+                       if (cc == '.' || cc == '[') {
+                               nelems++;
+                               cc = '\0';
+                       }
+
+                       *ee++ = cc;
+               }
+               *ee++ = '\0';
+
+               if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
+                   nelems, dest)) != 0)
+                       mstate->dtms_scratch_ptr += jsonlen + 1;
+               break;
+       }
+
        case DIF_SUBR_TOUPPER:
        case DIF_SUBR_TOLOWER: {
                uintptr_t src = tupregs[0].dttk_value;
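
Note: the splitting loop above is what feeds dtrace_json(): a selector such
as "foo[0].bar" becomes a packed list of NUL-terminated keys. Illustratively
(literal layout per the loop above):

    /* "foo[0].bar" after the split: three packed keys, nelems == 3. */
    const char elemlist[] = "foo\0" "0\0" "bar";

    /* Against {"foo": [{"bar": 42}]}, dtrace_json() returns the string
     * "42"; a missing key or malformed input yields NULL instead. */

At the D level this surfaces as the json() subroutine, e.g.
json(this->payload, "foo[0].bar").
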
@@ -5016,6 +5575,14 @@ inetout: regs[rd] = (uintptr_t)end + 1;
 
                break;
        }
+       case DIF_SUBR_STRIP:
+               if (!dtrace_is_valid_ptrauth_key(tupregs[1].dttk_value)) {
+                       DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+                       break;
+               }
+               regs[rd] = (uint64_t)dtrace_ptrauth_strip(
+                   (void*)tupregs[0].dttk_value, tupregs[1].dttk_value);
+               break;
 
 #if defined(__APPLE__)
        case DIF_SUBR_VM_KERNEL_ADDRPERM: {
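
Note: dtrace_ptrauth_strip() and dtrace_is_valid_ptrauth_key() are
xnu-internal helpers. For orientation, the analogous compiler-level operation
via Clang's <ptrauth.h> (a sketch; on targets without pointer authentication
the strip is a no-op):

    #include <ptrauth.h>

    static void *
    strip_fn_ptr(void *signed_ptr)
    {
        /* Drop the PAC signature bits, keeping the raw address. */
        return ptrauth_strip(signed_ptr, ptrauth_key_function_pointer);
    }
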
@@ -5890,6 +6457,10 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
                        }
                        *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
                        break;
+               case DIF_OP_STRIP:
+                       regs[rd] = (uint64_t)dtrace_ptrauth_strip(
+                           (void*)regs[r1], r2);
+                       break;
                }
        }
 
@@ -6287,13 +6858,64 @@ dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
        *valoffsp = valoffs;
 }
 
+/*
+ * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
+ * defined, we also assert that we are not recursing unless the probe ID is an
+ * error probe.
+ */
+static dtrace_icookie_t
+dtrace_probe_enter(dtrace_id_t id)
+{
+       thread_t thread = current_thread();
+       uint16_t inprobe;
+
+       dtrace_icookie_t cookie;
+
+       cookie = dtrace_interrupt_disable();
+
+       /*
+        * Unless this is an ERROR probe, we are not allowed to recurse in
+        * dtrace_probe(). Recursing into DTrace probe usually means that a
+        * function is instrumented that should not have been instrumented or
+        * that the ordering guarantee of the records will be violated,
+        * resulting in unexpected output. If there is an exception to this
+        * assertion, a new case should be added.
+        */
+       inprobe = dtrace_get_thread_inprobe(thread);
+       VERIFY(inprobe == 0 ||
+           id == dtrace_probeid_error);
+       ASSERT(inprobe < UINT16_MAX);
+       dtrace_set_thread_inprobe(thread, inprobe + 1);
+
+       return (cookie);
+}
+
+/*
+ * Clears the per-thread inprobe flag and enables interrupts.
+ */
+static void
+dtrace_probe_exit(dtrace_icookie_t cookie)
+{
+       thread_t thread = current_thread();
+       uint16_t inprobe = dtrace_get_thread_inprobe(thread);
+
+       ASSERT(inprobe > 0);
+       dtrace_set_thread_inprobe(thread, inprobe - 1);
+
+#if INTERRUPT_MASKED_DEBUG
+       ml_spin_debug_reset(thread);
+#endif /* INTERRUPT_MASKED_DEBUG */
+
+       dtrace_interrupt_enable(cookie);
+}
+
 /*
  * If you're looking for the epicenter of DTrace, you just found it.  This
  * is the function called by the provider to fire a probe -- from which all
  * subsequent probe-context DTrace activity emanates.
  */
-static void
-__dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
+void
+dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
     uint64_t arg2, uint64_t arg3, uint64_t arg4)
 {
        processorid_t cpuid;
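
Note: from here on every exit path of dtrace_probe() must pair
dtrace_probe_enter() with dtrace_probe_exit(), which is why the early returns
below swap the bare dtrace_interrupt_enable() calls for dtrace_probe_exit().
The shape, in sketch form:

    dtrace_icookie_t cookie = dtrace_probe_enter(id); /* irqs off, inprobe++ */
    /* ... probe-context work, exiting via dtrace_probe_exit() everywhere ... */
    dtrace_probe_exit(cookie);                        /* inprobe--, irqs on */
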
@@ -6308,7 +6930,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
        volatile uint16_t *flags;
        hrtime_t now;
 
-       cookie = dtrace_interrupt_disable();
+       cookie = dtrace_probe_enter(id);
        probe = dtrace_probes[id - 1];
        cpuid = CPU->cpu_id;
        onintr = CPU_ON_INTR(CPU);
@@ -6319,7 +6941,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
                 * We have hit in the predicate cache; we know that
                 * this predicate would evaluate to be false.
                 */
-               dtrace_interrupt_enable(cookie);
+               dtrace_probe_exit(cookie);
                return;
        }
 
@@ -6327,7 +6949,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
                /*
                 * We don't trace anything if we're panicking.
                 */
-               dtrace_interrupt_enable(cookie);
+               dtrace_probe_exit(cookie);
                return;
        }
 
@@ -6999,45 +7621,16 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
                thread_t thread = current_thread();
                int64_t t = dtrace_get_thread_tracing(thread);
                
-               if (t >= 0) { 
+               if (t >= 0) {
                        /* Usual case, accumulate time spent here into t_dtrace_tracing */
                        dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
-               } else { 
+               } else {
                        /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
-                       dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t); 
+                       dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
                }
        }
 
-       dtrace_interrupt_enable(cookie);
-}
-
-/*
- * APPLE NOTE:  Don't allow a thread to re-enter dtrace_probe().
- * This could occur if a probe is encountered on some function in the
- * transitive closure of the call to dtrace_probe().
- * Solaris has some strong guarantees that this won't happen.
- * The Darwin implementation is not so mature as to make those guarantees.
- * Hence, the introduction of __dtrace_probe() on xnu.
- */
-
-void
-dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
-    uint64_t arg2, uint64_t arg3, uint64_t arg4)
-{
-       thread_t thread = current_thread();
-       disable_preemption();
-       if (id == dtrace_probeid_error) {
-               __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
-               dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */
-       } else if (!dtrace_get_thread_reentering(thread)) {
-               dtrace_set_thread_reentering(thread, TRUE);
-               __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
-               dtrace_set_thread_reentering(thread, FALSE);
-       }
-#if DEBUG
-       else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN);
-#endif
-       enable_preemption();
+       dtrace_probe_exit(cookie);
 }
 
 /*
@@ -8355,36 +8948,24 @@ dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
 
        if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
                size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
-               size_t nsize = osize << 1;
-
-               if (nsize == 0) {
-                       ASSERT(osize == 0);
-                       ASSERT(dtrace_probes == NULL);
-                       nsize = sizeof (dtrace_probe_t *);
-               }
+               size_t nsize = osize * 2;
 
                probes = kmem_zalloc(nsize, KM_SLEEP);
 
-               if (dtrace_probes == NULL) {
-                       ASSERT(osize == 0);
-                       dtrace_probes = probes;
-                       dtrace_nprobes = 1;
-               } else {
-                       dtrace_probe_t **oprobes = dtrace_probes;
+               dtrace_probe_t **oprobes = dtrace_probes;
 
-                       bcopy(oprobes, probes, osize);
-                       dtrace_membar_producer();
-                       dtrace_probes = probes;
+               bcopy(oprobes, probes, osize);
+               dtrace_membar_producer();
+               dtrace_probes = probes;
 
-                       dtrace_sync();
+               dtrace_sync();
 
-                       /*
-                        * All CPUs are now seeing the new probes array; we can
-                        * safely free the old array.
-                        */
-                       kmem_free(oprobes, osize);
-                       dtrace_nprobes <<= 1;
-               }
+               /*
+                * All CPUs are now seeing the new probes array; we can
+                * safely free the old array.
+                */
+               kmem_free(oprobes, osize);
+               dtrace_nprobes *= 2;
 
                ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
        }
@@ -9020,7 +9601,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rd >= nregs)
                                err += efunc(pc, "invalid register %u\n", rd);
                        if (rd == 0)
-                               err += efunc(pc, "cannot write to %r0\n");
+                               err += efunc(pc, "cannot write to %%r0\n");
                        break;
                case DIF_OP_NOT:
                case DIF_OP_MOV:
@@ -9032,7 +9613,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rd >= nregs)
                                err += efunc(pc, "invalid register %u\n", rd);
                        if (rd == 0)
-                               err += efunc(pc, "cannot write to %r0\n");
+                               err += efunc(pc, "cannot write to %%r0\n");
                        break;
                case DIF_OP_LDSB:
                case DIF_OP_LDSH:
@@ -9048,7 +9629,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rd >= nregs)
                                err += efunc(pc, "invalid register %u\n", rd);
                        if (rd == 0)
-                               err += efunc(pc, "cannot write to %r0\n");
+                               err += efunc(pc, "cannot write to %%r0\n");
                        if (kcheckload)
                                dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
                                    DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
@@ -9067,7 +9648,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rd >= nregs)
                                err += efunc(pc, "invalid register %u\n", rd);
                        if (rd == 0)
-                               err += efunc(pc, "cannot write to %r0\n");
+                               err += efunc(pc, "cannot write to %%r0\n");
                        break;
                case DIF_OP_ULDSB:
                case DIF_OP_ULDSH:
@@ -9083,7 +9664,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rd >= nregs)
                                err += efunc(pc, "invalid register %u\n", rd);
                        if (rd == 0)
-                               err += efunc(pc, "cannot write to %r0\n");
+                               err += efunc(pc, "cannot write to %%r0\n");
                        break;
                case DIF_OP_STB:
                case DIF_OP_STH:
@@ -9153,7 +9734,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rd >= nregs)
                                err += efunc(pc, "invalid register %u\n", rd);
                        if (rd == 0)
-                               err += efunc(pc, "cannot write to %r0\n");
+                               err += efunc(pc, "cannot write to %%r0\n");
                        break;
                case DIF_OP_SETS:
                        if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
@@ -9163,7 +9744,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rd >= nregs)
                                err += efunc(pc, "invalid register %u\n", rd);
                        if (rd == 0)
-                               err += efunc(pc, "cannot write to %r0\n");
+                               err += efunc(pc, "cannot write to %%r0\n");
                        break;
                case DIF_OP_LDGA:
                case DIF_OP_LDTA:
@@ -9174,7 +9755,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rd >= nregs)
                                err += efunc(pc, "invalid register %u\n", rd);
                        if (rd == 0)
-                               err += efunc(pc, "cannot write to %r0\n");
+                               err += efunc(pc, "cannot write to %%r0\n");
                        break;
                case DIF_OP_LDGS:
                case DIF_OP_LDTS:
@@ -9186,7 +9767,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rd >= nregs)
                                err += efunc(pc, "invalid register %u\n", rd);
                        if (rd == 0)
-                               err += efunc(pc, "cannot write to %r0\n");
+                               err += efunc(pc, "cannot write to %%r0\n");
                        break;
                case DIF_OP_STGS:
                case DIF_OP_STTS:
@@ -9205,7 +9786,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rd >= nregs)
                                err += efunc(pc, "invalid register %u\n", rd);
                        if (rd == 0)
-                               err += efunc(pc, "cannot write to %r0\n");
+                               err += efunc(pc, "cannot write to %%r0\n");
 
                        if (subr == DIF_SUBR_COPYOUT ||
                            subr == DIF_SUBR_COPYOUTSTR ||
@@ -9230,6 +9811,16 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
                        if (rs >= nregs)
                                err += efunc(pc, "invalid register %u\n", rs);
                        break;
+               case DIF_OP_STRIP:
+                       if (r1 >= nregs)
+                               err += efunc(pc, "invalid register %u\n", r1);
+                       if (!dtrace_is_valid_ptrauth_key(r2))
+                               err += efunc(pc, "invalid key\n");
+                       if (rd >= nregs)
+                               err += efunc(pc, "invalid register %u\n", rd);
+                       if (rd == 0)
+                               err += efunc(pc, "cannot write to %%r0\n");
+                       break;
                default:
                        err += efunc(pc, "invalid opcode %u\n",
                            DIF_INSTR_OP(instr));
@@ -9532,7 +10123,9 @@ dtrace_difo_validate_helper(dtrace_difo_t *dp)
                            subr == DIF_SUBR_INET_NTOA ||
                            subr == DIF_SUBR_INET_NTOA6 ||
                            subr == DIF_SUBR_INET_NTOP ||
+                           subr == DIF_SUBR_JSON ||
                            subr == DIF_SUBR_LLTOSTR ||
+                           subr == DIF_SUBR_STRTOLL ||
                            subr == DIF_SUBR_RINDEX ||
                            subr == DIF_SUBR_STRCHR ||
                            subr == DIF_SUBR_STRJOIN ||
@@ -11419,7 +12012,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
                        if (buf->dtb_cur_limit == buf->dtb_limit) {
                                buf->dtb_cur_limit = buf->dtb_size;
 
-                               atomic_add_32(&state->dts_buf_over_limit, 1);
+                               os_atomic_inc(&state->dts_buf_over_limit, relaxed);
                                /**
                                 * Set an AST on the current processor
                                 * so that we can wake up the process
@@ -11429,7 +12022,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
                                minor_t minor = getminor(state->dts_dev);
                                ASSERT(minor < 32);
 
-                               atomic_or_32(&dtrace_wake_clients, 1 << minor);
+                               os_atomic_or(&dtrace_wake_clients, 1 << minor, relaxed);
                                ast_dtrace_on();
                        }
                        if ((uint64_t)soffs > buf->dtb_size) {
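
Note: the legacy atomic_*_32() calls are swapped for the os_atomic family
from <machine/atomic.h>, which takes an explicit memory-ordering argument. A
sketch of the mapping used throughout this diff:

    #include <machine/atomic.h>

    static uint32_t counter;
    static uint32_t flags;

    static void
    example(void)
    {
        os_atomic_inc(&counter, relaxed);            /* was atomic_add_32(&counter, 1) */
        os_atomic_or(&flags, 1u << 3, relaxed);      /* was atomic_or_32(&flags, 1 << 3) */
        (void)os_atomic_dec_orig(&counter, relaxed); /* returns the pre-decrement value */
    }
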
@@ -13359,6 +13952,7 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
        dtrace_state_t *state;
        dtrace_optval_t *opt;
        int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
+       unsigned int cpu_it;
 
        LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
        LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
@@ -13405,6 +13999,25 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
        state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
        state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
        state->dts_buf_over_limit = 0;
+
+       /*
+        * Allocate and initialise the per-process per-CPU random state.
+        * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON, so the entropy device is
+        * assumed to be seeded at this point (if from Fortuna seed file).
+        */
+       state->dts_rstate = kmem_zalloc(NCPU * sizeof(uint64_t*), KM_SLEEP);
+       state->dts_rstate[0] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
+       (void) read_random(state->dts_rstate[0], 2 * sizeof(uint64_t));
+       for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
+               state->dts_rstate[cpu_it] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
+               /*
+                * Each CPU is assigned a 2^64 period, non-overlapping
+                * subsequence.
+                */
+               dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1],
+                   state->dts_rstate[cpu_it]);
+       }
+
        state->dts_cleaner = CYCLIC_NONE;
        state->dts_deadman = CYCLIC_NONE;
        state->dts_vstate.dtvs_state = state;
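
Note: each DTrace state thus gets NCPU independent generators, and the jump
function advances a state by 2^64 steps, so the per-CPU subsequences cannot
overlap. For orientation, the reference xoroshiro128+ step (Blackman &
Vigna); the kernel's copy lives in the new dtrace_xoroshiro128_plus.c and may
differ in detail:

    #include <stdint.h>

    static inline uint64_t
    rotl(uint64_t x, int k)
    {
        return (x << k) | (x >> (64 - k));
    }

    /* One xoroshiro128+ step: the output is s[0] + s[1]; the state is then
     * mixed with shifts and rotations. */
    static uint64_t
    xoroshiro128plus_next(uint64_t s[2])
    {
        const uint64_t s0 = s[0];
        uint64_t s1 = s[1];
        const uint64_t result = s0 + s1;

        s1 ^= s0;
        s[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14);
        s[1] = rotl(s1, 36);
        return result;
    }
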
@@ -14178,6 +14791,11 @@ dtrace_state_destroy(dtrace_state_t *state)
        dtrace_buffer_free(state->dts_buffer);
        dtrace_buffer_free(state->dts_aggbuffer);
 
+       for (i = 0; i < (int)NCPU; i++) {
+               kmem_free(state->dts_rstate[i], 2 * sizeof(uint64_t));
+       }
+       kmem_free(state->dts_rstate, NCPU * sizeof(uint64_t*));
+
        for (i = 0; i < nspec; i++)
                dtrace_buffer_free(spec[i].dtsp_buffer);
 
@@ -16518,6 +17136,10 @@ dtrace_attach(dev_info_t *devi)
 
        LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
 
+       dtrace_nprobes = dtrace_nprobes_default;
+       dtrace_probes = kmem_zalloc(sizeof(dtrace_probe_t*) * dtrace_nprobes,
+           KM_SLEEP);
+
        dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
            0, /* unused */
            offsetof(dtrace_probe_t, dtpr_nextprov),
@@ -17664,7 +18286,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                 * checking the buffer over limit count  at this point.
                 */
                if (over_limit) {
-                       uint32_t old = atomic_add_32(&state->dts_buf_over_limit, -1);
+                       uint32_t old = os_atomic_dec_orig(&state->dts_buf_over_limit, relaxed);
                        #pragma unused(old)
 
                        /*
@@ -17888,10 +18510,6 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                        lck_mtx_lock(&mod_lock);
                        struct modctl* ctl = dtrace_modctl_list;
                        while (ctl) {
-                               /* Update the private probes bit */
-                               if (dtrace_provide_private_probes)
-                                       ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
-
                                ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
                                if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
                                        dtmul_count++;
@@ -17939,10 +18557,6 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                
                struct modctl* ctl = dtrace_modctl_list;
                while (ctl) {
-                       /* Update the private probes bit */
-                       if (dtrace_provide_private_probes)
-                               ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
-
                        /*
                         * We assume that userspace symbols will be "better" than kernel level symbols,
                         * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
@@ -18060,10 +18674,6 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                
                struct modctl* ctl = dtrace_modctl_list;
                while (ctl) {
-                       /* Update the private probes bit */
-                       if (dtrace_provide_private_probes)
-                               ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
-
                        ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
                        if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
                                dtrace_provider_t *prv;
@@ -18427,7 +19037,7 @@ void
 dtrace_ast(void)
 {
        int i;
-       uint32_t clients = atomic_and_32(&dtrace_wake_clients, 0);
+       uint32_t clients = os_atomic_xchg(&dtrace_wake_clients, 0, relaxed);
        if (clients == 0)
                return;
        /**
@@ -18649,6 +19259,11 @@ dtrace_init( void )
                                break;
                }
 
+#if CONFIG_DTRACE
+        if (dtrace_dof_mode != DTRACE_DOF_MODE_NEVER)
+            commpage_update_dof(true);
+#endif
+
                gDTraceInited = 1;
 
        } else
@@ -18679,10 +19294,6 @@ dtrace_postinit(void)
        if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
                printf("dtrace_postinit: Could not register mach_kernel modctl\n");
        }
-
-       if (!PE_parse_boot_argn("dtrace_provide_private_probes", &dtrace_provide_private_probes, sizeof (dtrace_provide_private_probes))) {
-                       dtrace_provide_private_probes = 0;
-       }
        
        (void)OSKextRegisterKextsWithDTrace();
 }
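Throughout this commit the legacy atomic_add_32()/atomic_and_32()/atomic_or_32() helpers from <libkern/OSAtomic.h> give way to the os_atomic family from <machine/atomic.h>, which makes both the value returned (original vs. new) and the memory ordering explicit. A minimal sketch of the read-and-clear idiom used by dtrace_ast() above, assuming a plain uint32_t flag word:

    #include <machine/atomic.h>

    static uint32_t pending_flags;  /* stand-in for dtrace_wake_clients */

    static uint32_t
    drain_pending_flags(void)
    {
            /*
             * Fetch the current flags and reset them to zero in a single
             * atomic step; this expresses the old atomic_and_32(&x, 0)
             * trick with an explicit (relaxed) ordering.
             */
            return os_atomic_xchg(&pending_flags, 0, relaxed);
    }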
index d33a8f03084c6b81ad2b130d91aa4ca5c42e9d13..cd047e8d96296b28b4434c83bbe21074950f3070 100644 (file)
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-
-/*
- * APPLE NOTE: This file is compiled even if dtrace is unconfig'd. A symbol
- * from this file (_dtrace_register_anon_DOF) always needs to be exported for
- * an external kext to link against.
- */
-
-#if CONFIG_DTRACE
-
-#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */
 #include <kern/thread.h>
-#include <mach/thread_status.h>
 
-#include <stdarg.h>
-#include <string.h>
-#include <sys/malloc.h>
 #include <sys/time.h>
 #include <sys/proc.h>
-#include <sys/proc_internal.h>
 #include <sys/kauth.h>
 #include <sys/user.h>
 #include <sys/systm.h>
 #include <sys/dtrace.h>
 #include <sys/dtrace_impl.h>
-#include <libkern/OSAtomic.h>
+#include <machine/atomic.h>
 #include <libkern/OSKextLibPrivate.h>
 #include <kern/kern_types.h>
 #include <kern/timer_call.h>
 #include <kern/thread_call.h>
 #include <kern/task.h>
 #include <kern/sched_prim.h>
-#include <kern/queue.h>
 #include <miscfs/devfs/devfs.h>
 #include <kern/kalloc.h>
 
 #include <mach/vm_param.h>
 #include <mach/mach_vm.h>
 #include <mach/task.h>
-#include <vm/pmap.h>
 #include <vm/vm_map.h> /* All the bits we care about are guarded by MACH_KERNEL_PRIVATE :-( */
 
 /*
@@ -76,7 +59,6 @@
 void
 dtrace_sprlock(proc_t *p)
 {
-       lck_mtx_assert(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED);
        lck_mtx_lock(&p->p_dtrace_sprlock);
 }
 
@@ -100,8 +82,6 @@ sprlock(pid_t pid)
 
        dtrace_sprlock(p);
 
-       proc_lock(p);
-
        return p;
 }
 
@@ -110,8 +90,6 @@ void
 sprunlock(proc_t *p)
 {
        if (p != PROC_NULL) {
-               proc_unlock(p);
-
                dtrace_sprunlock(p);
 
                task_resume_internal(p->task);
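With this change, sprlock()/sprunlock() only suspend/resume the task and take the per-process DTrace sprlock; the proc lock is no longer bundled in. Callers that still need it, such as fasttrap_pid_enable() further down, now take it explicitly. A sketch of the assumed caller pattern (not a verbatim excerpt):

    static void
    with_sprlock(pid_t pid)
    {
            proc_t *p = sprlock(pid);  /* suspends the task, takes the sprlock */

            if (p == PROC_NULL) {
                    return;
            }
            proc_lock(p);
            /* ... inspect or update state that needs the proc lock ... */
            proc_unlock(p);
            sprunlock(p);              /* drops the sprlock, resumes the task */
    }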
@@ -273,11 +251,6 @@ dtrace_CRED(void)
        }
 }
 
-#define HAS_ALLPRIVS(cr)        priv_isfullset(&CR_OEPRIV(cr))
-#define HAS_PRIVILEGE(cr, pr)   ((pr) == PRIV_ALL ? \
-                                       HAS_ALLPRIVS(cr) : \
-                                       PRIV_ISASSERT(&CR_OEPRIV(cr), pr))
-
 int
 PRIV_POLICY_CHOICE(void* cred, int priv, int all)
 {
@@ -605,15 +578,6 @@ cyclic_remove(cyclic_id_t cyclic)
        }
 }
 
-kern_return_t _dtrace_register_anon_DOF(char *, uchar_t *, uint_t);
-
-kern_return_t
-_dtrace_register_anon_DOF(char *name, uchar_t *data, uint_t nelements)
-{
-#pragma unused(name, data, nelements)
-       return KERN_FAILURE;
-}
-
 int
 ddi_driver_major(dev_info_t     *devi)
 {
@@ -1503,25 +1467,3 @@ void
 dtrace_vtime_disable(void)
 {
 }
-
-#else /* else ! CONFIG_DTRACE */
-
-#include <sys/types.h>
-#include <mach/vm_types.h>
-#include <mach/kmod.h>
-
-/*
- * This exists to prevent build errors when dtrace is unconfigured.
- */
-
-kern_return_t _dtrace_register_anon_DOF(char *, unsigned char *, uint32_t);
-
-kern_return_t
-_dtrace_register_anon_DOF(char *arg1, unsigned char *arg2, uint32_t arg3)
-{
-#pragma unused(arg1, arg2, arg3)
-
-       return KERN_FAILURE;
-}
-
-#endif /* CONFIG_DTRACE */
index 3bc601af83d994c97eb0219fb394510ef0cee11c..5f28ca8107e44762b24b508a563b6d4f29a4a7f4 100644 (file)
  * Use is subject to license terms.
  */
 
-/*
- * #pragma ident       "@(#)dtrace_subr.c      1.8     07/06/05 SMI"
- */
-
 #include <stdarg.h>
 #include <string.h>
 #include <sys/malloc.h>
@@ -295,6 +291,44 @@ dtrace_invop_remove(int (*func)(uintptr_t, uintptr_t *, uintptr_t))
        kmem_free(hdlr, sizeof (dtrace_invop_hdlr_t));
 }
 
+void*
+dtrace_ptrauth_strip(void *ptr, uint64_t key)
+{
+#pragma unused(key)
+#if __has_feature(ptrauth_calls)
+       /*
+        * The key argument to ptrauth_strip needs to be a compile-time
+        * constant
+        */
+       switch (key) {
+       case ptrauth_key_asia:
+               return ptrauth_strip(ptr, ptrauth_key_asia);
+       case ptrauth_key_asib:
+               return ptrauth_strip(ptr, ptrauth_key_asib);
+       case ptrauth_key_asda:
+               return ptrauth_strip(ptr, ptrauth_key_asda);
+       case ptrauth_key_asdb:
+               return ptrauth_strip(ptr, ptrauth_key_asdb);
+       default:
+               return ptr;
+       }
+#else
+       return ptr;
+#endif // __has_feature(ptrauth_calls)
+}
+
+int
+dtrace_is_valid_ptrauth_key(uint64_t key)
+{
+#pragma unused(key)
+#if __has_feature(ptrauth_calls)
+       return (key == ptrauth_key_asia) || (key == ptrauth_key_asib) ||
+           (key == ptrauth_key_asda) || (key == ptrauth_key_asdb);
+#else
+       return (0);
+#endif /* __has_feature(ptrauth_calls) */
+}
+
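The switch is required because ptrauth_strip() takes its key as a compile-time constant, so a key arriving at runtime has to be dispatched case by case. A hypothetical caller (strip_pc is illustrative, not part of this commit):

    static uint64_t
    strip_pc(uint64_t pc, uint64_t key)
    {
            /* Leave unrecognized keys alone rather than stripping blindly. */
            if (!dtrace_is_valid_ptrauth_key(key)) {
                    return pc;
            }
            return (uint64_t)dtrace_ptrauth_strip((void *)(uintptr_t)pc, key);
    }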
 static minor_t next_minor = 0;
 static dtrace_state_t* dtrace_clients[DTRACE_NCLIENTS] = {NULL};
 
@@ -303,7 +337,7 @@ minor_t
 dtrace_state_reserve(void)
 {
        for (int i = 0; i < DTRACE_NCLIENTS; i++) {
-               minor_t minor = atomic_add_32(&next_minor, 1) % DTRACE_NCLIENTS;
+               minor_t minor = os_atomic_inc_orig(&next_minor, relaxed) % DTRACE_NCLIENTS;
                if (dtrace_clients[minor] == NULL)
                        return minor;
        }
diff --git a/bsd/dev/dtrace/dtrace_xoroshiro128_plus.c b/bsd/dev/dtrace/dtrace_xoroshiro128_plus.c
new file mode 100644 (file)
index 0000000..d29d58e
--- /dev/null
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2016 (Graeme Jenkinson)
+ * All rights reserved.
+ *
+ * This software was developed by BAE Systems, the University of Cambridge
+ * Computer Laboratory, and Memorial University under DARPA/AFRL contract
+ * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing
+ * (TC) research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/types.h>
+
+#include "dtrace_xoroshiro128_plus.h"
+
+static __inline uint64_t
+rotl(const uint64_t x, int k)
+{
+       return (x << k) | (x >> (64 - k));
+}
+
+/*
+ * This is the jump function for the generator. It is equivalent to 2^64 calls
+ * to next(); it can be used to generate 2^64 non-overlapping subsequences for
+ * parallel computations.
+ */
+void
+dtrace_xoroshiro128_plus_jump(uint64_t * const state,
+    uint64_t * const jump_state)
+{
+       static const uint64_t JUMP[] = { 0xbeac0467eba5facb,
+                                        0xd86b048b86aa9922 };
+
+       uint64_t s0 = 0;
+       uint64_t s1 = 0;
+       size_t i = 0;
+       int b = 0;
+       for (i = 0; i < sizeof JUMP / sizeof *JUMP; i++) {
+               for (b = 0; b < 64; b++) {
+                       if (JUMP[i] & 1ULL << b) {
+                               s0 ^= state[0];
+                               s1 ^= state[1];
+                       }
+                       dtrace_xoroshiro128_plus_next(state);
+               }
+       }
+       jump_state[0] = s0;
+       jump_state[1] = s1;
+}
+
+/*
+ * xoroshiro128+ - XOR/rotate/shift/rotate
+ * xorshift.di.unimi.it
+ */
+uint64_t
+dtrace_xoroshiro128_plus_next(uint64_t * const state)
+{
+       const uint64_t s0 = state[0];
+       uint64_t s1 = state[1];
+       uint64_t result;
+       result = s0 + s1;
+
+       s1 ^= s0;
+       state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14);
+       state[1] = rotl(s1, 36);
+
+       return result;
+}
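A short usage sketch (illustrative seed values): the 128-bit state must be initialized to anything other than all zeros, after which each call yields 64 fresh pseudo-random bits.

    static uint64_t
    example_draw(void)
    {
            /* Any nonzero 128-bit value is a valid seed. */
            static uint64_t st[2] = { 0x243f6a8885a308d3ULL,
                                      0x13198a2e03707344ULL };

            return dtrace_xoroshiro128_plus_next(st);
    }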
diff --git a/bsd/dev/dtrace/dtrace_xoroshiro128_plus.h b/bsd/dev/dtrace/dtrace_xoroshiro128_plus.h
new file mode 100644 (file)
index 0000000..c1dafcd
--- /dev/null
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2016 (Graeme Jenkinson)
+ * All rights reserved.
+ *
+ * This software was developed by BAE Systems, the University of Cambridge
+ * Computer Laboratory, and Memorial University under DARPA/AFRL contract
+ * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing
+ * (TC) research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef _DTRACE_XOROSHIRO128_PLUS_H
+#define _DTRACE_XOROSHIRO128_PLUS_H
+
+#include <sys/types.h>
+#include <stdint.h>
+
+void dtrace_xoroshiro128_plus_jump(uint64_t * const, uint64_t * const);
+uint64_t dtrace_xoroshiro128_plus_next(uint64_t * const);
+
+#endif
index 9ce7ccc9cd1e16847cffc20e6356f881b460553d..e90e109f08189ff6fc0530341cd20be73bba202a 100644 (file)
  * Use is subject to license terms.
  */
 
-/*
- * #pragma ident       "@(#)fasttrap.c 1.26    08/04/21 SMI"
- */
-
 #include <sys/types.h>
 #include <sys/time.h>
 
@@ -519,7 +515,7 @@ fasttrap_pid_cleanup_cb(void)
        while (1) {
                unsigned int later = 0;
 
-               work = atomic_and_32(&fasttrap_cleanup_work, 0);
+               work = os_atomic_xchg(&fasttrap_cleanup_work, 0, relaxed);
                lck_mtx_unlock(&fasttrap_cleanup_mtx);
                if (work & FASTTRAP_CLEANUP_PROVIDER) {
                        later = fasttrap_pid_cleanup_providers();
@@ -542,7 +538,7 @@ fasttrap_pid_cleanup_cb(void)
                         * (if detach fails).
                         */
                        if (later > 0) {
-                               struct timespec t = {1, 0};
+                               struct timespec t = {.tv_sec = 1, .tv_nsec = 0};
                                msleep(&fasttrap_pid_cleanup_cb, &fasttrap_cleanup_mtx, PRIBIO, "fasttrap_pid_cleanup_cb", &t);
                        }
                        else
@@ -559,7 +555,7 @@ static void
 fasttrap_pid_cleanup(uint32_t work)
 {
        lck_mtx_lock(&fasttrap_cleanup_mtx);
-       atomic_or_32(&fasttrap_cleanup_work, work);
+       os_atomic_or(&fasttrap_cleanup_work, work, relaxed);
        fasttrap_pid_cleanup_compute_priority();
        wakeup(&fasttrap_pid_cleanup_cb);
        lck_mtx_unlock(&fasttrap_cleanup_mtx);
@@ -601,7 +597,6 @@ fasttrap_fork(proc_t *p, proc_t *cp)
                printf("fasttrap_fork: sprlock(%d) returned a different proc\n", cp->p_pid);
                return;
        }
-       proc_unlock(cp);
 
        /*
         * Iterate over every tracepoint looking for ones that belong to the
@@ -635,7 +630,6 @@ fasttrap_fork(proc_t *p, proc_t *cp)
         */
        dtrace_ptss_fork(p, cp);
 
-       proc_lock(cp);
        sprunlock(cp);
 }
 
@@ -656,9 +650,9 @@ fasttrap_exec_exit(proc_t *p)
         * explaining. This method is always called with the proc_lock held.
         * We must drop the proc_lock before calling fasttrap_provider_retire
         * to avoid a deadlock when it takes the bucket lock.
-        * 
+        *
         * Next, the dtrace_ptss_exec_exit function requires the sprlock
-        * be held, but not the proc_lock. 
+        * be held, but not the proc_lock.
         *
         * Finally, we must re-acquire the proc_lock
         */
@@ -922,13 +916,13 @@ fasttrap_tracepoint_disable(proc_t *p, fasttrap_probe_t *probe, uint_t index)
                        ASSERT(tp->ftt_ids != NULL);
                        idp = &tp->ftt_ids;
                        break;
-                       
+
                case DTFTP_RETURN:
                case DTFTP_POST_OFFSETS:
                        ASSERT(tp->ftt_retids != NULL);
                        idp = &tp->ftt_retids;
                        break;
-                       
+
                default:
                        /* Fix compiler warning... */
                        idp = NULL;
@@ -1151,6 +1145,8 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
            return(0);
        }
 
+       proc_lock(p);
+
        if ((p->p_csflags & (CS_KILL|CS_HARD))) {
                proc_unlock(p);
                for (i = 0; i < DTRACE_NCLIENTS; i++) {
@@ -1162,12 +1158,12 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
                        mac_proc_check_get_task(state->dts_cred.dcr_cred, p);
                }
                rc = cs_allow_invalid(p);
-               proc_lock(p);
                if (rc == 0) {
                        sprunlock(p);
                        cmn_err(CE_WARN, "process doesn't allow invalid code pages, failing to install fasttrap probe\n");
                        return (0);
                }
+               proc_lock(p);
        }
 
        /*
@@ -1217,7 +1213,6 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
                                i--;
                        }
 
-                       proc_lock(p);
                        sprunlock(p);
 
                        /*
@@ -1229,7 +1224,6 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
                }
        }
 
-       proc_lock(p);
        sprunlock(p);
 
        probe->ftp_enabled = 1;
@@ -1256,7 +1250,6 @@ fasttrap_pid_disable(void *arg, dtrace_id_t id, void *parg)
         */
        if ((p = sprlock(probe->ftp_pid)) != PROC_NULL) {
                // ASSERT(!(p->p_flag & SVFORK));
-               proc_unlock(p);
        }
 
        lck_mtx_lock(&provider->ftp_mtx);
@@ -1283,7 +1276,6 @@ fasttrap_pid_disable(void *arg, dtrace_id_t id, void *parg)
                        whack = provider->ftp_marked = 1;
                lck_mtx_unlock(&provider->ftp_mtx);
 
-               proc_lock(p);
                sprunlock(p);
        } else {
                /*
@@ -1360,8 +1352,8 @@ fasttrap_pid_destroy(void *arg, dtrace_id_t id, void *parg)
        ASSERT(!probe->ftp_enabled);
        ASSERT(fasttrap_total >= probe->ftp_ntps);
 
-       atomic_add_32(&fasttrap_total, -probe->ftp_ntps);
-       atomic_add_32(&fasttrap_retired, -probe->ftp_ntps);
+       os_atomic_sub(&fasttrap_total, probe->ftp_ntps, relaxed);
+       os_atomic_sub(&fasttrap_retired, probe->ftp_ntps, relaxed);
 
        if (probe->ftp_gen + 1 >= fasttrap_mod_gen)
                fasttrap_mod_barrier(probe->ftp_gen);
@@ -1427,7 +1419,7 @@ fasttrap_proc_lookup(pid_t pid)
                        lck_mtx_lock(&fprc->ftpc_mtx);
                        lck_mtx_unlock(&bucket->ftb_mtx);
                        fprc->ftpc_rcount++;
-                       atomic_add_64(&fprc->ftpc_acount, 1);
+                       os_atomic_inc(&fprc->ftpc_acount, relaxed);
                        ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
                        lck_mtx_unlock(&fprc->ftpc_mtx);
 
@@ -1458,7 +1450,7 @@ fasttrap_proc_lookup(pid_t pid)
                        lck_mtx_lock(&fprc->ftpc_mtx);
                        lck_mtx_unlock(&bucket->ftb_mtx);
                        fprc->ftpc_rcount++;
-                       atomic_add_64(&fprc->ftpc_acount, 1);
+                       os_atomic_inc(&fprc->ftpc_acount, relaxed);
                        ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
                        lck_mtx_unlock(&fprc->ftpc_mtx);
 
@@ -1686,7 +1678,7 @@ fasttrap_provider_free(fasttrap_provider_t *provider)
         * count of active providers on the associated process structure.
         */
        if (!provider->ftp_retired) {
-               atomic_add_64(&provider->ftp_proc->ftpc_acount, -1);
+               os_atomic_dec(&provider->ftp_proc->ftpc_acount, relaxed);
                ASSERT(provider->ftp_proc->ftpc_acount <
                provider->ftp_proc->ftpc_rcount);
        }
@@ -1716,7 +1708,7 @@ fasttrap_provider_free(fasttrap_provider_t *provider)
        proc_lock(p);
        p->p_dtrace_probes--;
        proc_unlock(p);
-       
+
        proc_rele(p);
 }
 
@@ -1765,14 +1757,14 @@ fasttrap_provider_retire(proc_t *p, const char *name, int mprov)
         * bucket lock therefore protects the integrity of the provider hash
         * table.
         */
-       atomic_add_64(&fp->ftp_proc->ftpc_acount, -1);
+       os_atomic_dec(&fp->ftp_proc->ftpc_acount, relaxed);
        ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount);
 
        /*
         * Add this provider's probes to the retired count and
         * make sure we don't add them twice
         */
-       atomic_add_32(&fasttrap_retired, fp->ftp_pcount);
+       os_atomic_add(&fasttrap_retired, fp->ftp_pcount, relaxed);
        fp->ftp_pcount = 0;
 
        fp->ftp_retired = 1;
@@ -1892,9 +1884,9 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
                            pdata->ftps_mod, pdata->ftps_func, name_str) != 0)
                                continue;
 
-                       atomic_add_32(&fasttrap_total, 1);
+                       os_atomic_inc(&fasttrap_total, relaxed);
                        if (fasttrap_total > fasttrap_max) {
-                               atomic_add_32(&fasttrap_total, -1);
+                               os_atomic_dec(&fasttrap_total, relaxed);
                                goto no_mem;
                        }
                        provider->ftp_pcount++;
@@ -1908,7 +1900,7 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
                        pp->ftp_pid = pdata->ftps_pid;
                        pp->ftp_ntps = 1;
 
-                       tp = zalloc(fasttrap_tracepoint_t_zone);                        
+                       tp = zalloc(fasttrap_tracepoint_t_zone);
                        bzero(tp, sizeof (fasttrap_tracepoint_t));
 
                        tp->ftt_proc = provider->ftp_proc;
@@ -1935,10 +1927,10 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
 
        } else if (dtrace_probe_lookup(provider->ftp_provid, pdata->ftps_mod,
            pdata->ftps_func, name) == 0) {
-               atomic_add_32(&fasttrap_total, pdata->ftps_noffs);
+               os_atomic_add(&fasttrap_total, pdata->ftps_noffs, relaxed);
 
                if (fasttrap_total > fasttrap_max) {
-                       atomic_add_32(&fasttrap_total, -pdata->ftps_noffs);
+                       os_atomic_sub(&fasttrap_total, pdata->ftps_noffs, relaxed);
                        goto no_mem;
                }
 
@@ -1953,7 +1945,7 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
                        if (pdata->ftps_offs[i] > pdata->ftps_offs[i - 1])
                                continue;
 
-                       atomic_add_32(&fasttrap_total, -pdata->ftps_noffs);
+                       os_atomic_sub(&fasttrap_total, pdata->ftps_noffs, relaxed);
                        goto no_mem;
                }
                provider->ftp_pcount += pdata->ftps_noffs;
@@ -1985,7 +1977,7 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
                         * this field is simply initialized to 0 on its way
                         * into the kernel.
                         */
-                       
+
                        tp->ftt_fntype = pdata->ftps_arch_subinfo;
 #endif
                        pp->ftp_tps[i].fit_tp = tp;
@@ -2177,7 +2169,7 @@ fasttrap_meta_create_probe(void *arg, void *parg,
 
 #if 0
        /*
-        * APPLE NOTE: This is hideously expensive. See note in 
+        * APPLE NOTE: This is hideously expensive. See note in
         * fasttrap_meta_provide() for why we can get away without
         * checking here.
         */
@@ -2191,10 +2183,10 @@ fasttrap_meta_create_probe(void *arg, void *parg,
        ntps = dhpb->dthpb_noffs + dhpb->dthpb_nenoffs;
        ASSERT(ntps > 0);
 
-       atomic_add_32(&fasttrap_total, ntps);
+       os_atomic_add(&fasttrap_total, ntps, relaxed);
 
        if (fasttrap_total > fasttrap_max) {
-               atomic_add_32(&fasttrap_total, -ntps);
+               os_atomic_sub(&fasttrap_total, ntps, relaxed);
                lck_mtx_unlock(&provider->ftp_cmtx);
                return;
        }
@@ -2239,7 +2231,7 @@ fasttrap_meta_create_probe(void *arg, void *parg,
                 * All ARM and ARM64 probes are zero offset. We need to zero out the
                 * thumb bit because we still support 32bit user processes.
                 * On 64bit user processes, bit zero won't be set anyway.
-                */             
+                */
                tp->ftt_pc = (dhpb->dthpb_base + (int64_t)dhpb->dthpb_offs[i]) & ~0x1UL;
                tp->ftt_fntype = FASTTRAP_FN_USDT;
 #else
@@ -2277,7 +2269,7 @@ fasttrap_meta_create_probe(void *arg, void *parg,
                 * All ARM and ARM64 probes are zero offset. We need to zero out the
                 * thumb bit because we still support 32bit user processes.
                 * On 64bit user processes, bit zero won't be set anyway.
-                */                             
+                */
                tp->ftt_pc = (dhpb->dthpb_base + (int64_t)dhpb->dthpb_enoffs[j]) & ~0x1UL;
                tp->ftt_fntype = FASTTRAP_FN_USDT;
 #else
@@ -2613,7 +2605,7 @@ fasttrap_attach(void)
            &fasttrap_meta_id);
 }
 
-static int 
+static int
 _fasttrap_open(dev_t dev, int flags, int devtype, struct proc *p)
 {
 #pragma unused(dev, flags, devtype, p)
@@ -2640,7 +2632,7 @@ _fasttrap_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
        } else if (rv != 0) {
                ASSERT( (rv & 0xfff00000) == 0 );
                return (((rv & 0xfffff) << 12)); /* ioctl returns -1 and errno set to a return value >= 4096 */
-       } else 
+       } else
                return 0;
 }
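The shifted-errno convention above lets a userspace caller recover the handler's return value. A hedged userspace sketch, assuming exactly the (rv & 0xfffff) << 12 encoding shown:

    #include <errno.h>
    #include <sys/ioctl.h>

    /* Returns 0 on success, the driver's rv when one was encoded,
     * or -1 for an ordinary errno. */
    static int
    fasttrap_ioctl_rv(int fd, unsigned long cmd, void *data)
    {
            if (ioctl(fd, cmd, data) == 0) {
                    return 0;
            }
            return (errno >= 4096) ? (errno >> 12) : -1;
    }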
 
@@ -2717,12 +2709,12 @@ fasttrap_init( void )
                                                          fasttrap_probe_t_zone_names[i]);
                }
 
-               
+
                /*
                 * Create the fasttrap lock group. Must be done before fasttrap_attach()!
                 */
                fasttrap_lck_attr = lck_attr_alloc_init();
-               fasttrap_lck_grp_attr= lck_grp_attr_alloc_init();               
+               fasttrap_lck_grp_attr= lck_grp_attr_alloc_init();
                fasttrap_lck_grp = lck_grp_alloc_init("fasttrap",  fasttrap_lck_grp_attr);
 
                /*
index 036d85bcbc3c819199fbb84c3026cb247b727ed8..fe2918435b3c8bdafb5fe1de5e31a9c8a8f7a2a6 100644 (file)
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)fbt.c      1.18    07/01/10 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
 #include <mach-o/loader.h>
 #include <libkern/kernel_mach_header.h>
 
@@ -80,471 +72,13 @@ fbt_probe_t                             **fbt_probetab;
 int                                             fbt_probetab_mask;
 static int                              fbt_verbose = 0;
 
-int ignore_fbt_blacklist = 0;
+extern int ignore_fbt_blacklist;
 
 extern int dtrace_kernel_symbol_mode;
 
 
 void fbt_init( void );
 
-/*
- * Critical routines that must not be probed. PR_5221096, PR_5379018.
- * The blacklist must be kept in alphabetic order for purposes of bsearch().
- */
-static const char * critical_blacklist[] =
-{
-       "Call_DebuggerC",
-       "DebuggerCall",
-       "DebuggerTrapWithState",
-       "DebuggerXCallEnter",
-       "IOCPURunPlatformPanicActions",
-       "PEARMDebugPanicHook",
-       "PEHaltRestart",
-       "SavePanicInfo",
-       "SysChoked",
-       "_ZN9IOService14newTemperatureElPS_", /* IOService::newTemperature */
-       "_ZN9IOService26temperatureCriticalForZoneEPS_", /* IOService::temperatureCriticalForZone */
-       "_ZNK6OSData14getBytesNoCopyEv", /* Data::getBytesNoCopy, IOHibernateSystemWake path */
-       "__ZN16IOPlatformExpert11haltRestartEj",
-       "__ZN18IODTPlatformExpert11haltRestartEj",
-       "__ZN9IODTNVRAM13savePanicInfoEPhy",
-       "_disable_preemption",
-       "_enable_preemption",
-       "alternate_debugger_enter",
-       "bcopy_phys",
-       "console_cpu_alloc",
-       "console_cpu_free",
-       "cpu_IA32e_disable",
-       "cpu_IA32e_enable",
-       "cpu_NMI_interrupt",
-       "cpu_control",
-       "cpu_data_alloc",
-       "cpu_desc_init",
-       "cpu_desc_init64",
-       "cpu_desc_load",
-       "cpu_desc_load64",
-       "cpu_exit_wait",
-       "cpu_info",
-       "cpu_info_count",
-       "cpu_init",
-       "cpu_interrupt",
-       "cpu_machine_init",
-       "cpu_mode_init",
-       "cpu_processor_alloc",
-       "cpu_processor_free",
-       "cpu_signal_handler",
-       "cpu_sleep",
-       "cpu_start",
-       "cpu_subtype",
-       "cpu_thread_alloc",
-       "cpu_thread_halt",
-       "cpu_thread_init",
-       "cpu_threadtype",
-       "cpu_to_processor",
-       "cpu_topology_sort",
-       "cpu_topology_start_cpu",
-       "cpu_type",
-       "cpuid_cpu_display",
-       "cpuid_extfeatures",
-       "dtrace_invop",
-       "enter_lohandler",
-       "fbt_invop",
-       "fbt_perfCallback",
-       "get_preemption_level"
-       "get_threadtask",
-       "handle_pending_TLB_flushes",
-       "hw_compare_and_store",
-       "interrupt",
-       "is_saved_state32",
-       "kernel_preempt_check",
-       "kernel_trap",
-       "kprintf",
-       "ks_dispatch_kernel",
-       "ks_dispatch_user",
-       "ks_kernel_trap",
-       "lo_alltraps",
-       "lock_debugger",
-       "machine_idle_cstate",
-       "machine_thread_get_kern_state",
-       "mca_cpu_alloc",
-       "mca_cpu_init",
-       "ml_nofault_copy",
-       "nanoseconds_to_absolutetime",
-       "nanotime_to_absolutetime",
-       "packA",
-       "panic",
-       "phystokv",
-       "phystokv_range",
-       "pltrace",
-       "pmKextRegister",
-       "pmMarkAllCPUsOff",
-       "pmSafeMode",
-       "pmTimerRestore",
-       "pmTimerSave",
-       "pmUnRegister",
-       "pmap_cpu_alloc",
-       "pmap_cpu_free",
-       "pmap_cpu_high_map_vaddr",
-       "pmap_cpu_high_shared_remap",
-       "pmap_cpu_init",
-       "power_management_init",
-       "preemption_underflow_panic",
-       "register_cpu_setup_func",
-       "ret64_iret"
-       "ret_to_user"
-       "return_to_kernel",
-       "return_to_user",
-       "saved_state64",
-       "sdt_invop",
-       "sprlock",
-       "sprunlock",
-       "strlen",
-       "strncmp",
-       "t_invop",
-       "tmrCvt",
-       "trap_from_kernel",
-       "uart_putc",
-       "unlock_debugger",
-       "unpackA",
-       "unregister_cpu_setup_func",
-       "uread",
-       "uwrite",
-       "vstart"
-};
-
-#define CRITICAL_BLACKLIST_COUNT (sizeof(critical_blacklist)/sizeof(critical_blacklist[0]))
-
-/*
- * The transitive closure of entry points that can be reached from probe context.
- * (Apart from routines whose names begin with dtrace_).
- */
-static const char * probe_ctx_closure[] =
-{
-       "ClearIdlePop",
-       "Debugger",
-       "IS_64BIT_PROCESS",
-       "OSCompareAndSwap",
-       "SetIdlePop",
-       "__dtrace_probe",
-       "absolutetime_to_microtime",
-       "act_set_astbsd",
-       "arm_init_idle_cpu",
-       "ast_dtrace_on",
-       "ast_pending",
-       "clean_dcache",
-       "clean_mmu_dcache",
-       "clock_get_calendar_nanotime_nowait",
-       "copyin",
-       "copyin_kern",
-       "copyin_user",
-       "copyinstr",
-       "copyout",
-       "copyoutstr",
-       "cpu_number",
-       "current_proc",
-       "current_processor",
-       "current_task",
-       "current_thread",
-       "debug_enter",
-       "drain_write_buffer",
-       "find_user_regs",
-       "flush_dcache",
-       "flush_tlb64",
-       "get_bsdtask_info",
-       "get_bsdthread_info",
-       "hertz_tick",
-       "hw_atomic_and",
-       "invalidate_mmu_icache",
-       "kauth_cred_get",
-       "kauth_getgid",
-       "kauth_getuid",
-       "kernel_preempt_check",
-       "kvtophys",
-       "mach_absolute_time",
-       "max_valid_stack_address",
-       "memcpy",
-       "memmove",
-       "ml_at_interrupt_context",
-       "ml_phys_write_byte_64",
-       "ml_phys_write_half_64",
-       "ml_phys_write_word_64",
-       "ml_set_interrupts_enabled",
-       "mt_core_snap",
-       "mt_cur_cpu_cycles",
-       "mt_cur_cpu_instrs",
-       "mt_cur_thread_cycles",
-       "mt_cur_thread_instrs",
-       "mt_fixed_counts",
-       "mt_fixed_counts_internal",
-       "mt_mtc_update_count",
-       "mt_update_thread",
-       "ovbcopy",
-       "panic",
-       "pmap64_pdpt",
-       "pmap_find_phys",
-       "pmap_get_mapwindow",
-       "pmap_pde",
-       "pmap_pde_internal0",
-       "pmap_pde_internal1",
-       "pmap_pte",
-       "pmap_pte_internal",
-       "pmap_put_mapwindow",
-       "pmap_valid_page",
-       "prf",
-       "proc_is64bit",
-       "proc_selfname",
-       "psignal_lock",
-       "rtc_nanotime_load",
-       "rtc_nanotime_read",
-       "sdt_getargdesc",
-       "setPop",
-       "strlcpy",
-       "sync_iss_to_iks_unconditionally",
-       "systrace_stub",
-       "timer_grab"
-};
-#define PROBE_CTX_CLOSURE_COUNT (sizeof(probe_ctx_closure)/sizeof(probe_ctx_closure[0]))
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wcast-qual"
-static int
-_cmp(const void *a, const void *b)
-{
-       return strncmp((const char *)a, *(const char **)b, strlen((const char *)a) + 1);
-}
-#pragma clang diagnostic pop
-/*
- * Module validation
- */
-int
-fbt_module_excluded(struct modctl* ctl)
-{
-       ASSERT(!MOD_FBT_DONE(ctl));
-
-       if (ctl->mod_address == 0 || ctl->mod_size == 0) {
-               return TRUE;
-       }
-
-       if (ctl->mod_loaded == 0) {
-               return TRUE;
-       }
-
-       /*
-        * If the user sets this, trust they know what they are doing.
-        */
-       if (ignore_fbt_blacklist) {
-               return FALSE;
-       }
-
-       /*
-        * These drivers control low level functions that when traced
-        * cause problems often in the sleep/wake paths as well as
-        * critical debug and panic paths.
-        * If somebody really wants to drill in on one of these kexts, then
-        * they can override blacklisting using the boot-arg above.
-        */
-
-#ifdef __x86_64__
-       if (strstr(ctl->mod_modname, "AppleACPIEC") != NULL) {
-               return TRUE;
-       }
-
-       if (strstr(ctl->mod_modname, "AppleACPIPlatform") != NULL) {
-               return TRUE;
-       }
-
-       if (strstr(ctl->mod_modname, "AppleRTC") != NULL) {
-               return TRUE;
-       }
-
-       if (strstr(ctl->mod_modname, "IOACPIFamily") != NULL) {
-               return TRUE;
-       }
-
-       if (strstr(ctl->mod_modname, "AppleIntelCPUPowerManagement") != NULL) {
-               return TRUE;
-       }
-
-       if (strstr(ctl->mod_modname, "AppleProfile") != NULL) {
-               return TRUE;
-       }
-
-       if (strstr(ctl->mod_modname, "AppleIntelProfile") != NULL) {
-               return TRUE;
-       }
-
-       if (strstr(ctl->mod_modname, "AppleEFI") != NULL) {
-               return TRUE;
-       }
-
-#elif __arm__ || __arm64__
-       if (LIT_STRNEQL(ctl->mod_modname, "com.apple.driver.AppleARMPlatform") ||
-           LIT_STRNEQL(ctl->mod_modname, "com.apple.driver.AppleARMPL192VIC") ||
-           LIT_STRNEQL(ctl->mod_modname, "com.apple.driver.AppleInterruptController")) {
-               return TRUE;
-       }
-#endif
-
-       return FALSE;
-}
-
-/*
- * FBT probe name validation
- */
-int
-fbt_excluded(const char* name)
-{
-       /*
-        * If the user set this, trust they know what they are doing.
-        */
-       if (ignore_fbt_blacklist) {
-               return FALSE;
-       }
-
-       if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) {
-               /*
-                * Anything beginning with "dtrace_" may be called
-                * from probe context unless it explicitly indicates
-                * that it won't be called from probe context by
-                * using the prefix "dtrace_safe_".
-                */
-               return TRUE;
-       }
-
-       /*
-        * Place no probes on critical routines (5221096)
-        */
-       if (bsearch( name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp ) != NULL) {
-               return TRUE;
-       }
-
-       /*
-        * Place no probes that could be hit in probe context.
-        */
-       if (bsearch( name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp ) != NULL) {
-               return TRUE;
-       }
-
-       /*
-        * Place no probes that could be hit in probe context.
-        * In the interests of safety, some of these may be overly cautious.
-        * Also exclude very low-level "firmware" class calls.
-        */
-       if (LIT_STRNSTART(name, "cpu_") ||      /* Coarse */
-           LIT_STRNSTART(name, "platform_") ||         /* Coarse */
-           LIT_STRNSTART(name, "machine_") ||          /* Coarse */
-           LIT_STRNSTART(name, "ml_") ||       /* Coarse */
-           LIT_STRNSTART(name, "PE_") ||       /* Coarse */
-           LIT_STRNSTART(name, "rtc_") ||      /* Coarse */
-           LIT_STRNSTART(name, "_rtc_") ||
-           LIT_STRNSTART(name, "rtclock_") ||
-           LIT_STRNSTART(name, "clock_") ||
-           LIT_STRNSTART(name, "bcopy") ||
-           LIT_STRNSTART(name, "pmap_") ||
-           LIT_STRNSTART(name, "hw_") ||       /* Coarse */
-           LIT_STRNSTART(name, "lapic_") ||            /* Coarse */
-           LIT_STRNSTART(name, "OSAdd") ||
-           LIT_STRNSTART(name, "OSBit") ||
-           LIT_STRNSTART(name, "OSDecrement") ||
-           LIT_STRNSTART(name, "OSIncrement") ||
-           LIT_STRNSTART(name, "OSCompareAndSwap") ||
-           LIT_STRNSTART(name, "etimer_") ||
-           LIT_STRNSTART(name, "dtxnu_kern_") ||
-           LIT_STRNSTART(name, "flush_mmu_tlb_")) {
-               return TRUE;
-       }
-       /*
-        * Fasttrap inner-workings we can't instrument
-        * on Intel (6230149)
-        */
-       if (LIT_STRNSTART(name, "fasttrap_") ||
-           LIT_STRNSTART(name, "fuword") ||
-           LIT_STRNSTART(name, "suword")) {
-               return TRUE;
-       }
-
-       if (LIT_STRNSTART(name, "_dtrace")) {
-               return TRUE; /* Shims in dtrace.c */
-       }
-       if (LIT_STRNSTART(name, "hibernate_")) {
-               return TRUE;
-       }
-
-       /*
-        * Place no probes in the exception handling path
-        */
-#if __arm__ || __arm64__
-       if (LIT_STRNSTART(name, "fleh_") ||
-           LIT_STRNSTART(name, "sleh_") ||
-           LIT_STRNSTART(name, "timer_state_event") ||
-           LIT_STRNEQL(name, "get_vfp_enabled")) {
-               return TRUE;
-       }
-
-       if (LIT_STRNSTART(name, "_ZNK15OSMetaClassBase8metaCastEPK11OSMetaClass") ||
-           LIT_STRNSTART(name, "_ZN15OSMetaClassBase12safeMetaCastEPKS_PK11OSMetaClass") ||
-           LIT_STRNSTART(name, "_ZNK11OSMetaClass13checkMetaCastEPK15OSMetaClassBase")) {
-               return TRUE;
-       }
-#endif
-
-#ifdef __x86_64__
-       if (LIT_STRNSTART(name, "machine_") ||
-           LIT_STRNSTART(name, "idt64") ||
-           LIT_STRNSTART(name, "ks_") ||
-           LIT_STRNSTART(name, "hndl_") ||
-           LIT_STRNSTART(name, "_intr_") ||
-           LIT_STRNSTART(name, "mapping_") ||
-           LIT_STRNSTART(name, "tsc_") ||
-           LIT_STRNSTART(name, "pmCPU") ||
-           LIT_STRNSTART(name, "pms") ||
-           LIT_STRNSTART(name, "usimple_") ||
-           LIT_STRNSTART(name, "lck_spin_lock") ||
-           LIT_STRNSTART(name, "lck_spin_unlock") ||
-           LIT_STRNSTART(name, "absolutetime_to_") ||
-           LIT_STRNSTART(name, "commpage_") ||
-           LIT_STRNSTART(name, "ml_") ||
-           LIT_STRNSTART(name, "PE_") ||
-           LIT_STRNSTART(name, "act_machine") ||
-           LIT_STRNSTART(name, "acpi_") ||
-           LIT_STRNSTART(name, "pal_")) {
-               return TRUE;
-       }
-       // Don't Steal Mac OS X
-       if (LIT_STRNSTART(name, "dsmos_")) {
-               return TRUE;
-       }
-
-#endif
-
-       /*
-        * Place no probes that could be hit on the way to the debugger.
-        */
-       if (LIT_STRNSTART(name, "kdp_") ||
-           LIT_STRNSTART(name, "kdb_") ||
-           LIT_STRNSTART(name, "debug_")) {
-               return TRUE;
-       }
-
-#if KASAN
-       if (LIT_STRNSTART(name, "kasan") ||
-           LIT_STRNSTART(name, "__kasan") ||
-           LIT_STRNSTART(name, "__asan")) {
-               return TRUE;
-       }
-#endif
-
-       /*
-        * Place no probes that could be hit on the way to a panic.
-        */
-       if (NULL != strstr(name, "panic_")) {
-               return TRUE;
-       }
-
-       return FALSE;
-}
-
-
 /*ARGSUSED*/
 static void
 fbt_destroy(void *arg, dtrace_id_t id, void *parg)
@@ -785,7 +319,7 @@ fbt_provide_module_user_syms(struct modctl *ctl)
                                name += 1;
                        }
 
-                       if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name)) {
+                       if (fbt_excluded(name)) {
                                continue;
                        }
 
@@ -848,7 +382,7 @@ fbt_provide_kernel_section(struct modctl *ctl, kernel_section_t *sect, kernel_nl
                }
 #endif /* defined(__arm__) */
 
-               if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name)) {
+               if (fbt_excluded(name)) {
                        continue;
                }
 
@@ -970,9 +504,6 @@ fbt_provide_module(void *arg, struct modctl *ctl)
        if (MOD_HAS_USERSPACE_SYMBOLS(ctl)) {
                fbt_provide_module_user_syms(ctl);
                ctl->mod_flags |= MODCTL_FBT_PROBES_PROVIDED;
-               if (MOD_FBT_PROVIDE_PRIVATE_PROBES(ctl)) {
-                       ctl->mod_flags |= MODCTL_FBT_PRIVATE_PROBES_PROVIDED;
-               }
                if (MOD_FBT_PROVIDE_BLACKLISTED_PROBES(ctl)) {
                        ctl->mod_flags |= MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED;
                }
@@ -1046,49 +577,6 @@ _fbt_open(dev_t dev, int flags, int devtype, struct proc *p)
 
 #define FBT_MAJOR  -24 /* let the kernel pick the device number */
 
-SYSCTL_DECL(_kern_dtrace);
-
-static int
-sysctl_dtrace_ignore_fbt_blacklist SYSCTL_HANDLER_ARGS
-{
-#pragma unused(oidp, arg2)
-       int err;
-       int value = *(int*)arg1;
-
-       err = sysctl_io_number(req, value, sizeof(value), &value, NULL);
-       if (err) {
-               return err;
-       }
-       if (req->newptr) {
-               if (!(value == 0 || value == 1)) {
-                       return ERANGE;
-               }
-
-               /*
-                * We do not allow setting the blacklist back to on, as we have no way
-                * of knowing if those unsafe probes are still used.
-                *
-                * If we are using kernel symbols, we also do not allow any change,
-                * since the symbols are jettison'd after the first pass.
-                *
-                * We do not need to take any locks here because those symbol modes
-                * are permanent and do not change after boot.
-                */
-               if (value != 1 || dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
-                   dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
-                       return EPERM;
-               }
-
-               ignore_fbt_blacklist = 1;
-       }
-
-       return 0;
-}
-
-SYSCTL_PROC(_kern_dtrace, OID_AUTO, ignore_fbt_blacklist,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
-    &ignore_fbt_blacklist, 0,
-    sysctl_dtrace_ignore_fbt_blacklist, "I", "fbt provider ignore blacklist");
 
 /*
  * A struct describing which functions will get invoked for certain
@@ -1116,6 +604,7 @@ static struct cdevsw fbt_cdevsw =
 #undef kmem_free /* from its binding to dt_kmem_free glue */
 #include <vm/vm_kern.h>
 
+
 void
 fbt_init( void )
 {
@@ -1126,8 +615,7 @@ fbt_init( void )
                return;
        }
 
-       PE_parse_boot_argn("IgnoreFBTBlacklist", &ignore_fbt_blacklist, sizeof(ignore_fbt_blacklist));
-
+       fbt_blacklist_init();
        fbt_attach((dev_info_t*)(uintptr_t)majdevno);
 }
 #undef FBT_MAJOR
diff --git a/bsd/dev/dtrace/fbt_blacklist.c b/bsd/dev/dtrace/fbt_blacklist.c
new file mode 100644 (file)
index 0000000..f8f34ae
--- /dev/null
@@ -0,0 +1,367 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/dtrace_impl.h>
+#include <sys/fbt.h>
+#include <sys/sysctl.h>
+
+#define CLOSURE(s) #s,
+#define CRITICAL(s) #s,
+
+#if KASAN
+#define KASAN_ONLY(s) #s,
+#else
+#define KASAN_ONLY(s)
+#endif /* KASAN */
+
+#if defined(__arm__) || defined(__arm64__)
+#define ARM_ONLY(s) #s,
+#else
+#define ARM_ONLY(s)
+#endif /* defined(__arm__) || defined(__arm64__) */
+#if defined(__x86_64__)
+#define X86_ONLY(s) #s,
+#else
+#define X86_ONLY(s)
+#endif /* defined(__x86_64__) */
+
+/*
+ * Routine prefixes that must not be probed, either because they are used in
+ * the exception path, by dtrace code in probe context, or are general
+ * critical routines that must never be probed.
+ *
+ * All routines whose name start with one of these will be ignored.
+ *
+ * This must be kept in asciibetical order for purposes of bsearch().
+ */
+const char * fbt_blacklist[] =
+{
+       CRITICAL(Call_DebuggerC)
+       CLOSURE(ClearIdlePop)
+       CLOSURE(Debugger)
+       CRITICAL(IOCPURunPlatformPanicActions)
+       CLOSURE(IS_64BIT_PROCESS)
+       CRITICAL(OSAdd)
+       CRITICAL(OSBit)
+       CLOSURE(OSCompareAndSwap)
+       CRITICAL(OSDecrement)
+       CRITICAL(OSIncrement)
+       CRITICAL(PEARMDebugPanicHook)
+       CRITICAL(PEHaltRestart)
+       CRITICAL(PE_)
+       CRITICAL(SavePanicInfo)
+       CLOSURE(SetIdlePop)
+       CRITICAL(SysChoked)
+       CRITICAL(_ZN15OSMetaClassBase12safeMetaCastEPKS_PK11OSMetaClass) /* OSMetaClassBase::safeMetaCast */
+       CRITICAL(_ZN16IOPlatformExpert11haltRestartEj) /* IOPlatformExpert::haltRestart */
+       CRITICAL(_ZN18IODTPlatformExpert11haltRestartEj) /* IODTPlatformExpert::haltRestart */
+       ARM_ONLY(_ZN8ASPNVRAM4syncEv) /* ASPNVRAM::sync */
+       CRITICAL(_ZN9IODTNVRAM13savePanicInfoEPhy) /* IODTNVRAM::savePanicInfo */
+       CRITICAL(_ZN9IOService14newTemperatureElPS_) /* IOService::newTemperature */
+       CRITICAL(_ZN9IOService26temperatureCriticalForZoneEPS_) /* IOService::temperatureCriticalForZone */
+       CRITICAL(_ZNK11OSMetaClass13checkMetaCastEPK15OSMetaClassBase) /* OSMetaClass::checkMetaCast */
+       CRITICAL(_ZNK15OSMetaClassBase8metaCastEPK11OSMetaClass) /* OSMetaClassBase::metaCast */
+       CRITICAL(_ZNK6OSData14getBytesNoCopyEv) /* OSData::getBytesNoCopy, IOHibernateSystemWake path */
+       KASAN_ONLY(__asan)
+       ARM_ONLY(__div)
+       CLOSURE(__dtrace_probe)
+       KASAN_ONLY(__kasan)
+       ARM_ONLY(__mod)
+       CRITICAL(__strlcpy_chk)
+       ARM_ONLY(__udiv)
+       ARM_ONLY(__umod)
+       CRITICAL(_disable_preemption)
+       CRITICAL(_enable_preemption)
+       CLOSURE(absolutetime_to_microtime)
+       X86_ONLY(acpi_)
+       X86_ONLY(act_machine)
+       CLOSURE(act_set_astbsd)
+       ARM_ONLY(alternate_debugger_enter)
+       ARM_ONLY(arm_init_idle_cpu)
+       CLOSURE(ast_dtrace_on)
+       CLOSURE(ast_pending)
+       CRITICAL(bcopy)
+       CLOSURE(clean_dcache)
+       CLOSURE(clean_mmu_dcache)
+       CRITICAL(clock_)
+       X86_ONLY(commpage_)
+       CRITICAL(console_cpu_alloc)
+       CRITICAL(console_cpu_free)
+       CLOSURE(copyin)
+       CLOSURE(copyout)
+       CRITICAL(cpu_)
+       CLOSURE(current_proc)
+       CLOSURE(current_processor)
+       CLOSURE(current_task)
+       CLOSURE(current_thread)
+       CLOSURE(debug_)
+       X86_ONLY(dsmos_)
+       CLOSURE(dtrace_)
+       CRITICAL(enter_lohandler)
+       CRITICAL(fasttrap_)
+       CRITICAL(fbt_invop)
+       CRITICAL(fbt_perfCallback)
+       CLOSURE(find_user_regs)
+       ARM_ONLY(fleh_)
+       CLOSURE(flush_dcache)
+       ARM_ONLY(flush_mmu_tlb_)
+       CLOSURE(flush_tlb64)
+       CRITICAL(fuword)
+       CLOSURE(get_bsdtask_info)
+       CLOSURE(get_bsdthread_info)
+       CRITICAL(get_preemption_level)
+       CRITICAL(get_threadtask)
+       ARM_ONLY(get_vfp_enabled)
+       CRITICAL(getminor)
+       CRITICAL(handle_pending_TLB_flushes)
+       CRITICAL(hibernate_)
+       X86_ONLY(hndl_)
+       CRITICAL(hw_)
+       X86_ONLY(idt64)
+       CRITICAL(interrupt)
+       CRITICAL(invalidate_mmu_icache)
+       CRITICAL(is_saved_state32)
+       KASAN_ONLY(kasan)
+       CLOSURE(kauth_cred_get)
+       CLOSURE(kauth_getgid)
+       CLOSURE(kauth_getuid)
+       CRITICAL(kdb_)
+       CRITICAL(kdp_)
+       CRITICAL(kernel_preempt_check)
+       CRITICAL(kernel_trap)
+       CRITICAL(kprintf)
+       CRITICAL(ks_)
+       CLOSURE(kvtophys)
+       X86_ONLY(lapic_)
+       CRITICAL(lo_alltraps)
+       CRITICAL(lock_debugger)
+       CLOSURE(mach_absolute_time)
+       CRITICAL(machine_)
+       X86_ONLY(mapping_)
+       CRITICAL(mca_cpu_alloc)
+       CRITICAL(mca_cpu_init)
+       CLOSURE(memcpy)
+       CLOSURE(memmove)
+       CRITICAL(ml_)
+       CLOSURE(mt_core_snap)
+       CLOSURE(mt_cur_cpu_cycles)
+       CLOSURE(mt_cur_cpu_instrs)
+       CLOSURE(mt_cur_thread_cycles)
+       CLOSURE(mt_cur_thread_instrs)
+       CLOSURE(mt_fixed_counts)
+       CLOSURE(mt_fixed_counts_internal)
+       CLOSURE(mt_mtc_update_count)
+       CLOSURE(mt_update_thread)
+       CRITICAL(nanoseconds_to_absolutetime)
+       CRITICAL(nanotime_to_absolutetime)
+       CRITICAL(ovbcopy)
+       CRITICAL(packA)
+       X86_ONLY(pal_)
+       CLOSURE(panic)
+       CRITICAL(phystokv)
+       CRITICAL(platform_)
+       X86_ONLY(pltrace)
+       X86_ONLY(pmCPU)
+       X86_ONLY(pmKextRegister)
+       X86_ONLY(pmMarkAllCPUsOff)
+       X86_ONLY(pmSafeMode)
+       X86_ONLY(pmTimerRestore)
+       X86_ONLY(pmTimerSave)
+       X86_ONLY(pmUnRegister)
+       X86_ONLY(pmap64_pdpt)
+       CLOSURE(pmap_find_phys)
+       CLOSURE(pmap_get_mapwindow)
+       CLOSURE(pmap_pde)
+       CLOSURE(pmap_pde_internal0)
+       CLOSURE(pmap_pde_internal1)
+       CLOSURE(pmap_pte)
+       CLOSURE(pmap_pte_internal)
+       CLOSURE(pmap_put_mapwindow)
+       CLOSURE(pmap_valid_page)
+       X86_ONLY(pms)
+       CRITICAL(power_management_init)
+       CRITICAL(preemption_underflow_panic)
+       CLOSURE(prf)
+       CLOSURE(proc_is64bit)
+       CLOSURE(proc_selfname)
+       CRITICAL(register_cpu_setup_func)
+       CRITICAL(ret64_iret)
+       CRITICAL(ret_to_user)
+       CRITICAL(return_to_kernel)
+       CRITICAL(return_to_user)
+       CRITICAL(rtc_)
+       CRITICAL(rtclock_)
+       CRITICAL(saved_state64)
+       CLOSURE(sdt_getargdesc)
+       CRITICAL(sdt_invop)
+       CLOSURE(setPop)
+       ARM_ONLY(sleh_)
+       CRITICAL(sprlock)
+       CRITICAL(sprunlock)
+       CLOSURE(strlcpy)
+       CRITICAL(strlen)
+       CRITICAL(strncmp)
+       CRITICAL(suword)
+       X86_ONLY(sync_iss_to_iks_unconditionally)
+       CLOSURE(systrace_stub)
+       CRITICAL(t_invop)
+       CLOSURE(timer_grab)
+       ARM_ONLY(timer_state_event)
+       CRITICAL(tmrCvt)
+       CRITICAL(trap_from_kernel)
+       CRITICAL(tsc_)
+       CRITICAL(uart_putc)
+       CRITICAL(unlock_debugger)
+       CRITICAL(unpackA)
+       CRITICAL(unregister_cpu_setup_func)
+       CRITICAL(uread)
+       CRITICAL(uwrite)
+       CRITICAL(vstart)
+};
+#define BLACKLIST_COUNT (sizeof(fbt_blacklist)/sizeof(fbt_blacklist[0]))
+
+/*
+ * Modules that should not be probed.
+ *
+ * This must be kept in asciibetical order for purposes of bsearch().
+ */
+static const char* fbt_module_blacklist[] = {
+       X86_ONLY(com.apple.driver.AppleACPIEC)
+       X86_ONLY(com.apple.driver.AppleACPIPlatform)
+       ARM_ONLY(com.apple.driver.AppleARMPlatform)
+       X86_ONLY(com.apple.driver.AppleEFI)
+       X86_ONLY(com.apple.driver.AppleIntelCPUPowerManagement)
+       ARM_ONLY(com.apple.driver.AppleInterruptController)
+       X86_ONLY(com.apple.driver.AppleRTC)
+       X86_ONLY(com.apple.iokit.IOACPIFamily)
+};
+#define MODULE_BLACKLIST_COUNT (sizeof(fbt_module_blacklist)/sizeof(fbt_module_blacklist[0]))
+
+int ignore_fbt_blacklist = 0;
+extern int dtrace_kernel_symbol_mode;
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wcast-qual"
+static int
+_cmp(const void *a, const void *b)
+{
+       const char *v = *(const char **)b;
+       return strncmp((const char *)a, v, strlen(v));
+}
+
+#pragma clang diagnostic pop
+/*
+ * Module validation
+ */
+bool
+fbt_module_excluded(struct modctl* ctl)
+{
+       const char *excluded;
+
+       ASSERT(!MOD_FBT_DONE(ctl));
+
+       if (ctl->mod_address == 0 || ctl->mod_size == 0 || !ctl->mod_loaded) {
+               return true;
+       }
+
+       if (ignore_fbt_blacklist) {
+               return false;
+       }
+
+       excluded = bsearch(ctl->mod_modname, fbt_module_blacklist,
+           MODULE_BLACKLIST_COUNT, sizeof(fbt_module_blacklist[0]), _cmp);
+       return excluded;
+}
+
+/*
+ * FBT probe name validation
+ */
+bool
+fbt_excluded(const char* name)
+{
+       const char *excluded;
+
+       if (ignore_fbt_blacklist) {
+               return false;
+       }
+
+       excluded = bsearch(name, fbt_blacklist, BLACKLIST_COUNT, sizeof(name),
+           _cmp );
+       return excluded;
+}
+
+SYSCTL_DECL(_kern_dtrace);
+
+static int
+sysctl_dtrace_ignore_fbt_blacklist SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg2)
+       int err;
+       int value = *(int*)arg1;
+
+       err = sysctl_io_number(req, value, sizeof(value), &value, NULL);
+       if (err) {
+               return err;
+       }
+       if (req->newptr) {
+               if (!(value == 0 || value == 1)) {
+                       return ERANGE;
+               }
+
+               /*
+                * We do not allow setting the blacklist back to on, as we have no way
+                * of knowing if those unsafe probes are still used.
+                *
+                * If we are using kernel symbols, we also do not allow any change,
+                * since the symbols are jettison'd after the first pass.
+                *
+                * We do not need to take any locks here because those symbol modes
+                * are permanent and do not change after boot.
+                */
+               if (value != 1 || dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
+                   dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
+                       return EPERM;
+               }
+
+               ignore_fbt_blacklist = 1;
+       }
+
+       return 0;
+}
+
+SYSCTL_PROC(_kern_dtrace, OID_AUTO, ignore_fbt_blacklist,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    &ignore_fbt_blacklist, 0,
+    sysctl_dtrace_ignore_fbt_blacklist, "I", "fbt provider ignore blacklist");
+
+void
+fbt_blacklist_init(void)
+{
+       PE_parse_boot_argn("IgnoreFBTBlacklist", &ignore_fbt_blacklist, sizeof(ignore_fbt_blacklist));
+#if DEBUG || DEVELOPMENT
+       for (size_t i = 1; i < BLACKLIST_COUNT; i++) {
+               if (strcmp(fbt_blacklist[i - 1], fbt_blacklist[i]) > 0) {
+                       panic("unordered fbt blacklist %s > %s", fbt_blacklist[i - 1], fbt_blacklist[i]);
+               }
+       }
+#endif /* DEBUG || DEVELOPMENT */
+}
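Two properties of this table are worth spelling out. First, because _cmp() compares only strlen(entry) bytes, every entry acts as a prefix: "ml_" excludes all ml_* symbols, not just a literal "ml_". Second, the blacklist can only be switched off, one way, via the IgnoreFBTBlacklist boot-arg or the kern.dtrace.ignore_fbt_blacklist sysctl. An illustrative check, assuming the table contents above:

    static void
    blacklist_examples(void)
    {
            bool a = fbt_excluded("ml_set_interrupts_enabled"); /* true: prefix "ml_" */
            bool b = fbt_excluded("mbuf_copydata");             /* false: no entry matches */
            (void)a;
            (void)b;
    }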
index f7ea6085ef5cf5ab1a3466ed61031c94582a60e0..12f777ae2233019975216db417c6a7d4984fc8ff 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -31,6 +31,7 @@
 #include <kern/lock_stat.h>
 
 #if LOCK_STATS
+
 #define SPIN_HELD 0
 #define SPIN_MISS 1
 #define SPIN_SPIN 2
@@ -44,7 +45,7 @@
 
 static dtrace_provider_id_t lockprof_id;
 
-decl_lck_mtx_data(extern, lck_grp_lock)
+decl_lck_mtx_data(extern, lck_grp_lock);
 extern queue_head_t lck_grp_queue;
 extern unsigned int lck_grp_cnt;
 
index f28db3a3937b67e594057a311b1fd2c48d89b1b0..8c44121a64c2abae7dce09f83b45d7bb387601ec 100644 (file)
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)lockstat.c 1.12    08/01/16 SMI" */
-
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
index 6d36e4cde7f72765b2bdc44e615ec507a45e89de..2294eedfd7891675acf4bc1ffdbee10de18548a9 100644 (file)
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)profile.c  1.7     07/01/10 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
 #include <kern/cpu_data.h>
 #include <kern/thread.h>
 #include <kern/assert.h>
@@ -324,9 +316,9 @@ profile_create(hrtime_t interval, const char *name, int kind)
                return;
        }
 
-       atomic_add_32(&profile_total, 1);
+       os_atomic_inc(&profile_total, relaxed);
        if (profile_total > profile_max) {
-               atomic_add_32(&profile_total, -1);
+               os_atomic_dec(&profile_total, relaxed);
                return;
        }
 
@@ -503,7 +495,7 @@ profile_destroy(void *arg, dtrace_id_t id, void *parg)
        }
 
        ASSERT(profile_total >= 1);
-       atomic_add_32(&profile_total, -1);
+       os_atomic_dec(&profile_total, relaxed);
 }
 
 /*ARGSUSED*/
index 1957fb2b05c9fe547814e6258291fd0a25c56a20..58fc8b3046eae407520226c835b9a8d74a2d233d 100644 (file)
@@ -27,7 +27,7 @@ endif
 
 
 ifeq ($(CURRENT_ARCH_CONFIG),ARM64)
-INSTALL_DTRACE_SCRIPTS_LIST += regs_arm64.d
+INSTALL_DTRACE_SCRIPTS_LIST += regs_arm64.d ptrauth_arm64.d
 else ifeq ($(CURRENT_ARCH_CONFIG),ARM)
 INSTALL_DTRACE_SCRIPTS_LIST += regs_arm.d
 else
@@ -39,7 +39,7 @@ INSTALL_DTRACE_SCRIPTS_FILES = \
 
 $(INSTALL_DTRACE_SCRIPTS_FILES): $(DSTROOT)/$(INSTALL_DTRACE_SCRIPTS_DIR)/% : %
        $(_v)$(MKDIR) $(DSTROOT)/$(INSTALL_DTRACE_SCRIPTS_DIR)
-       @echo INSTALL $(@F)
+       $(call makelog,INSTALL $(@F))
        $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@
 
 INSTALL_DTRACE_LIBEXEC_FILES = \
@@ -47,7 +47,7 @@ INSTALL_DTRACE_LIBEXEC_FILES = \
 
 $(INSTALL_DTRACE_LIBEXEC_FILES): $(DSTROOT)/$(INSTALL_DTRACE_LIBEXEC_DIR)/% : %
        $(_v)$(MKDIR) $(DSTROOT)/$(INSTALL_DTRACE_LIBEXEC_DIR)
-       @echo INSTALL $(@F)
+       $(call makelog,INSTALL $(@F))
        $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@
 
 do_textfiles_install:: $(INSTALL_DTRACE_SCRIPTS_FILES) $(INSTALL_DTRACE_LIBEXEC_FILES)
diff --git a/bsd/dev/dtrace/scripts/ptrauth_arm64.d b/bsd/dev/dtrace/scripts/ptrauth_arm64.d
new file mode 100644 (file)
index 0000000..184c1bf
--- /dev/null
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ * 
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ * 
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ * 
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+enum ptrauth_key {
+  ptrauth_key_asia = 0,
+  ptrauth_key_asib = 1,
+  ptrauth_key_asda = 2,
+  ptrauth_key_asdb = 3,
+
+  /* A process-independent key which can be used to sign code pointers.
+     Signing and authenticating with this key is a no-op in processes
+     which disable ABI pointer authentication. */
+  ptrauth_key_process_independent_code = ptrauth_key_asia,
+
+  /* A process-specific key which can be used to sign code pointers.
+     Signing and authenticating with this key is enforced even in processes
+     which disable ABI pointer authentication. */
+  ptrauth_key_process_dependent_code = ptrauth_key_asib,
+
+  /* A process-independent key which can be used to sign data pointers.
+     Signing and authenticating with this key is a no-op in processes
+     which disable ABI pointer authentication. */
+  ptrauth_key_process_independent_data = ptrauth_key_asda,
+
+  /* A process-specific key which can be used to sign data pointers.
+     Signing and authenticating with this key is a no-op in processes
+     which disable ABI pointer authentication. */
+  ptrauth_key_process_dependent_data = ptrauth_key_asdb,
+
+  /* The key used to sign C function pointers.
+     The extra data is always 0. */
+  ptrauth_key_function_pointer = ptrauth_key_process_independent_code,
+
+  /* The key used to sign return addresses on the stack.
+     The extra data is based on the storage address of the return address.
+     On ARM64, that is always the storage address of the return address plus 8
+     (or, in other words, the value of the stack pointer on function entry) */
+  ptrauth_key_return_address = ptrauth_key_process_dependent_code,
+
+  /* The key used to sign frame pointers on the stack.
+     The extra data is based on the storage address of the frame pointer.
+     On ARM64, that is always the storage address of the frame pointer plus 16
+     (or, in other words, the value of the stack pointer on function entry) */
+  ptrauth_key_frame_pointer = ptrauth_key_process_dependent_data,
+
+  /* The key used to sign block function pointers, including:
+       invocation functions,
+       block object copy functions,
+       block object destroy functions,
+       __block variable copy functions, and
+       __block variable destroy functions.
+     The extra data is always the address at which the function pointer
+     is stored.
+
+     Note that block object pointers themselves (i.e. the direct
+     representations of values of block-pointer type) are not signed. */
+  ptrauth_key_block_function = ptrauth_key_asia,
+
+  /* The key used to sign C++ v-table pointers.
+     The extra data is always 0. */
+  ptrauth_key_cxx_vtable_pointer = ptrauth_key_asda
+};
+
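
This enum mirrors the pointer-authentication key assignments from the compiler's <ptrauth.h> so D scripts can interpret signed kernel pointers. For comparison, C code built with an arm64e-targeting clang would use the matching intrinsics; a sketch (ptrauth_strip() degrades to a no-op on targets without pointer authentication):

	#include <ptrauth.h>

	/*
	 * Sketch: strip the PAC bits from a signed return address so it can
	 * be compared against unsigned symbol addresses.
	 */
	static void *
	strip_return_address(void *signed_ra)
	{
		return ptrauth_strip(signed_ra, ptrauth_key_return_address);
	}
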
index 23d3b5387a88ffbb240d07541786828f1cb5da28..885f9ecfd461b8705840863b427bbc42af513c18 100644 (file)
@@ -3,8 +3,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident  "@(#)regs.d.in  1.0     04/09/28 SMI"
-
 inline int R_R0 = 0;
 #pragma D binding "1.0" R_R0
 inline int R_R1 = 1;
index 8979dea7704602731868f5d9a995d63b093d759b..528b96ce6f900b43bd1af392412fd9f46b8b44b3 100644 (file)
@@ -3,8 +3,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident  "@(#)regs.d.in  1.0     04/09/28 SMI"
-
 inline int R_R0 = 0;
 #pragma D binding "1.0" R_R0
 inline int R_R1 = 1;
index 8a5acc6992806cc804b6e6c701fc25e9433716ce..b18333392a0422a277edebe503d0f6a277bfd525 100644 (file)
@@ -3,8 +3,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident  "@(#)regs.d.in  1.1     04/09/28 SMI"
-
 inline int R_GS = 0;
 #pragma D binding "1.0" R_GS
 inline int R_FS = 1;
index 7279b311886ce4626890365d5bf72f0a408ed30a..ead9d23ce6707795fc0a9ac8b90f5b5a1f89b403 100644 (file)
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident  "@(#)unistd.d   1.4     07/02/20 SMI"
-
 inline int DTRACEFLT_UNKNOWN = 0;      /* Unknown fault */
 #pragma D binding "1.0" DTRACEFLT_UNKNOWN
 
index d851fb65927be56d2ad14157791d0934d2671429..1a38e614b31f101a1bda6ec1cf2a58ea0d367209 100644 (file)
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)sdt.c      1.9     08/07/01 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
index 65ae963fd7cd07a6fad8b5c95c1ebfc7f58c0be5..c9c52fb6a60c3f0190d1fb9401b60d2351124010 100644 (file)
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)sdt_subr.c 1.13    08/06/13 SMI" */
-
 #include <sys/sdt_impl.h>
 
 static dtrace_pattr_t vtrace_attr = {
@@ -93,6 +91,10 @@ sdt_provider_t sdt_providers[] = {
        { "sysevent", "__sysevent____", &stab_attr, 0 },
        { "sdt", "__sdt____", &sdt_attr, 0 },
        { "boost", "__boost____", &stab_attr, 0},
+       { "route", "__route____", &stab_attr, 0 },
+#if KASAN
+       { "kasan", "__kasan____", &stab_attr, 0 },
+#endif
        { NULL, NULL, NULL, 0 }
 };
 
index 27d199eebc7b98d375769ae0d3a2ee683b6edea3..ef85a1fca3bbb4d8894368ad13ab54a3af8f8138 100644 (file)
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)systrace.c 1.6     06/09/19 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
 #include <kern/thread.h>
 #include <mach/thread_status.h>
 
index f7b92bc9face09fa5362c62c9ca52e3fb55573f3..b8976d2d0ef80a9af2a7e4de5287e96eb70d4f07 100644 (file)
 #ifndef _SYS_SYSTRACE_H
 #define _SYS_SYSTRACE_H
 
-/* #pragma ident       "@(#)systrace.h 1.3     06/09/19 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
 #include <sys/dtrace.h>
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
-#ifdef _KERNEL
-
 typedef struct systrace_sysent {
        dtrace_id_t     stsy_entry;
        dtrace_id_t     stsy_return;
@@ -62,8 +52,6 @@ extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
 
 extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
 
-#endif /* _KERNEL */
-
 #ifdef  __cplusplus
 }
 #endif
index 094506014f089a343547abfbe52c241d1b3c8ba3..e81719bf6ae4f211b8db2ac9c71691fdaf83d8f8 100644 (file)
@@ -238,12 +238,7 @@ struct cdevsw cdevsw[] = {
                kmioctl, nullstop, nullreset, km_tty, ttselect,
                eno_mmap, eno_strat, eno_getc, eno_putc, 0
        },
-       [13 ... 41] = NO_CDEVICE,
-       [42] = {
-               volopen, volclose, eno_rdwrt, eno_rdwrt,
-               volioctl, eno_stop, eno_reset, 0, (select_fcn_t *) seltrue,
-               eno_mmap, eno_strat, eno_getc, eno_putc, 0
-       }
+       [13 ... 63] = NO_CDEVICE,
 };
 const int nchrdev = sizeof(cdevsw) / sizeof(cdevsw[0]);
 
@@ -260,7 +255,7 @@ isdisk(dev_t dev, int type)
 
        switch (type) {
        case VCHR:
-               maj = chrtoblk(maj);
+               maj = chrtoblk(dev);
                if (maj == NODEV) {
                        break;
                }
@@ -274,32 +269,7 @@ isdisk(dev_t dev, int type)
        return 0;
 }
 
-static int chrtoblktab[] = {
-       /* CHR*/        /* BLK*/        /* CHR*/        /* BLK*/
-       /*  0 */ NODEV, /*  1 */ NODEV,
-       /*  2 */ NODEV, /*  3 */ NODEV,
-       /*  4 */ NODEV, /*  5 */ NODEV,
-       /*  6 */ NODEV, /*  7 */ NODEV,
-       /*  8 */ NODEV, /*  9 */ NODEV,
-       /* 10 */ NODEV, /* 11 */ NODEV,
-       /* 12 */ NODEV, /* 13 */ NODEV,
-       /* 14 */ NODEV, /* 15 */ NODEV,
-       /* 16 */ NODEV, /* 17 */ NODEV,
-       /* 18 */ NODEV, /* 19 */ NODEV,
-       /* 20 */ NODEV, /* 21 */ NODEV,
-       /* 22 */ NODEV, /* 23 */ NODEV,
-       /* 24 */ NODEV, /* 25 */ NODEV,
-       /* 26 */ NODEV, /* 27 */ NODEV,
-       /* 28 */ NODEV, /* 29 */ NODEV,
-       /* 30 */ NODEV, /* 31 */ NODEV,
-       /* 32 */ NODEV, /* 33 */ NODEV,
-       /* 34 */ NODEV, /* 35 */ NODEV,
-       /* 36 */ NODEV, /* 37 */ NODEV,
-       /* 38 */ NODEV, /* 39 */ NODEV,
-       /* 40 */ NODEV, /* 41 */ NODEV,
-       /* 42 */ NODEV, /* 43 */ NODEV,
-       /* 44 */ NODEV,
-};
+static int chrtoblktab[] = {[0 ... nchrdev] = NODEV };
 
 /*
  * convert chr dev to blk dev
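
The one-line chrtoblktab replacement above leans on the GCC/clang range-designator extension, and the range is inclusive on both ends, so the table gets nchrdev + 1 entries, all preset to NODEV. A toy illustration of the semantics:

	/* Range designators are a GNU C extension; both bounds are included. */
	enum { SLOTS = 4 };
	static const int table[] = { [0 ... SLOTS] = -1 };   /* 5 entries, all -1 */
	_Static_assert(sizeof(table) / sizeof(table[0]) == SLOTS + 1,
	    "range designators are inclusive");
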
index c67273b79b4dc50aa31065af746d31953a765b79..f167167ca0ee79d0d975ed19ce81c01ff0c3e88d 100644 (file)
@@ -40,9 +40,6 @@
  * It needs to be in sync with this file.
  */
 
-/*
- * #pragma ident       "@(#)dis_tables.c       1.18    08/05/24 SMI"
- */
 #include <sys/dtrace.h>
 #include <sys/dtrace_glue.h>
 #include <sys/dis_tables.h>
index 458fc15b3625efedbea5db8f09baeecd0ca99f81..6785dc53650866ce0735d343c38c444a06af77b3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2005-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -26,7 +26,6 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */
 #include <kern/thread.h>
 #include <mach/thread_status.h>
 
index a5064d68898fec7f79268a987559b82b3448e591..e78af6efc28b44ef5551fa32cc20860890050579 100644 (file)
  * Use is subject to license terms.
  */
 
-/*
- * #pragma ident       "@(#)dtrace_subr.c      1.16    07/09/18 SMI"
- */
-
 #include <sys/dtrace.h>
 #include <sys/dtrace_glue.h>
 #include <sys/dtrace_impl.h>
@@ -204,110 +200,6 @@ dtrace_user_probe(x86_saved_state_t *regs)
        return KERN_FAILURE;
 }
 
-void
-dtrace_safe_synchronous_signal(void)
-{
-#if 0
-       kthread_t *t = curthread;
-       struct regs *rp = lwptoregs(ttolwp(t));
-       size_t isz = t->t_dtrace_npc - t->t_dtrace_pc;
-
-       ASSERT(t->t_dtrace_on);
-
-       /*
-        * If we're not in the range of scratch addresses, we're not actually
-        * tracing user instructions so turn off the flags. If the instruction
-        * we copied out caused a synchonous trap, reset the pc back to its
-        * original value and turn off the flags.
-        */
-       if (rp->r_pc < t->t_dtrace_scrpc ||
-                       rp->r_pc > t->t_dtrace_astpc + isz) {
-               t->t_dtrace_ft = 0;
-       } else if (rp->r_pc == t->t_dtrace_scrpc ||
-                       rp->r_pc == t->t_dtrace_astpc) {
-               rp->r_pc = t->t_dtrace_pc;
-               t->t_dtrace_ft = 0;
-       }
-#endif /* 0 */
-}
-
-int
-dtrace_safe_defer_signal(void)
-{
-#if 0
-       kthread_t *t = curthread;
-       struct regs *rp = lwptoregs(ttolwp(t));
-       size_t isz = t->t_dtrace_npc - t->t_dtrace_pc;
-
-       ASSERT(t->t_dtrace_on);
-
-       /*
-        * If we're not in the range of scratch addresses, we're not actually
-        * tracing user instructions so turn off the flags.
-        */
-       if (rp->r_pc < t->t_dtrace_scrpc ||
-                       rp->r_pc > t->t_dtrace_astpc + isz) {
-               t->t_dtrace_ft = 0;
-               return (0);
-       }
-
-       /*
-        * If we've executed the original instruction, but haven't performed
-        * the jmp back to t->t_dtrace_npc or the clean up of any registers
-        * used to emulate %rip-relative instructions in 64-bit mode, do that
-        * here and take the signal right away. We detect this condition by
-        * seeing if the program counter is the range [scrpc + isz, astpc).
-        */
-       if (t->t_dtrace_astpc - rp->r_pc <
-                       t->t_dtrace_astpc - t->t_dtrace_scrpc - isz) {
-#ifdef __sol64
-               /*
-                * If there is a scratch register and we're on the
-                * instruction immediately after the modified instruction,
-                * restore the value of that scratch register.
-                */
-               if (t->t_dtrace_reg != 0 &&
-                               rp->r_pc == t->t_dtrace_scrpc + isz) {
-                       switch (t->t_dtrace_reg) {
-                               case REG_RAX:
-                                       rp->r_rax = t->t_dtrace_regv;
-                                       break;
-                               case REG_RCX:
-                                       rp->r_rcx = t->t_dtrace_regv;
-                                       break;
-                               case REG_R8:
-                                       rp->r_r8 = t->t_dtrace_regv;
-                                       break;
-                               case REG_R9:
-                                       rp->r_r9 = t->t_dtrace_regv;
-                                       break;
-                       }
-               }
-#endif
-               rp->r_pc = t->t_dtrace_npc;
-               t->t_dtrace_ft = 0;
-               return (0);
-       }
-
-       /*
-        * Otherwise, make sure we'll return to the kernel after executing
-        * the copied out instruction and defer the signal.
-        */
-       if (!t->t_dtrace_step) {
-               ASSERT(rp->r_pc < t->t_dtrace_astpc);
-               rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc;
-               t->t_dtrace_step = 1;
-       }
-
-       t->t_dtrace_ast = 1;
-
-       return (1);
-
-#endif /* 0 */
-
-       return 0;
-}
-
 void
 dtrace_flush_caches(void)
 {
index 0e9e9784979cb718d9247983e17a00c2aef394fa..6801862e0d71df44fa8d003b11e7ace99db2dd4e 100644 (file)
  * Use is subject to license terms.
  */
 
-/*
- * #pragma ident       "@(#)fasttrap_isa.c     1.27    08/04/09 SMI"
- */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
 #include <sys/fasttrap_isa.h>
 #include <sys/fasttrap_impl.h>
 #include <sys/dtrace.h>
@@ -235,7 +225,7 @@ fasttrap_anarg(x86_saved_state_t *regs, int function_entry, int argno)
 
        if (p_model == DATAMODEL_LP64) {
                user_addr_t stack;
-               
+
                /*
                 * In 64-bit mode, the first six arguments are stored in
                 * registers.
@@ -725,8 +715,8 @@ fasttrap_return_common(x86_saved_state_t *regs, user_addr_t pc, pid_t pid,
                        continue;
 
                if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) {
-                       uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1);
-                       if (already_triggered) {
+                       if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) {
+                               /* already triggered */
                                continue;
                        }
                }
@@ -767,14 +757,14 @@ fasttrap_return_common(x86_saved_state_t *regs, user_addr_t pc, pid_t pid,
 
 static void
 fasttrap_sigsegv(proc_t *p, uthread_t t, user_addr_t addr)
-{      
+{
        proc_lock(p);
 
        /* Set fault address and mark signal */
        t->uu_code = addr;
        t->uu_siglist |= sigmask(SIGSEGV);
 
-       /* 
+       /*
         * XXX These two lines may be redundant; if not, then we need
         * XXX to potentially set the data address in the machine
         * XXX specific thread state structure to indicate the address.
@@ -1041,10 +1031,10 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
 
        if (tp->ftt_ids != NULL) {
                fasttrap_id_t *id;
-               
+
                uint32_t s0, s1, s2, s3, s4, s5;
                uint32_t *stack = (uint32_t *)(uintptr_t)(regs32->uesp);
-               
+
                /*
                 * In 32-bit mode, all arguments are passed on the
                 * stack. If this is a function entry probe, we need
@@ -1058,17 +1048,17 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                fasttrap_fuword32_noerr((user_addr_t)(unsigned long)&stack[3], &s3);
                fasttrap_fuword32_noerr((user_addr_t)(unsigned long)&stack[4], &s4);
                fasttrap_fuword32_noerr((user_addr_t)(unsigned long)&stack[5], &s5);
-               
+
                for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
                        fasttrap_probe_t *probe = id->fti_probe;
-                       
+
                        if (ISSET(current_proc()->p_lflag, P_LNOATTACH)) {
-                               dtrace_probe(dtrace_probeid_error, 0 /* state */, probe->ftp_id, 
+                               dtrace_probe(dtrace_probeid_error, 0 /* state */, probe->ftp_id,
                                             1 /* ndx */, -1 /* offset */, DTRACEFLT_UPRIV);
                        } else {
                                if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) {
-                                       uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1);
-                                       if (already_triggered) {
+                                       if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) {
+                                               /* already triggered */
                                                continue;
                                        }
                                }
@@ -1182,10 +1172,10 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                                new_pc = pc;
                                break;
                        }
-                       
+
                        if (tp->ftt_type == FASTTRAP_T_RET16)
                                addr += tp->ftt_dest;
-                       
+
                        regs32->uesp = addr;
                        new_pc = dst;
                        break;
@@ -1194,7 +1184,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                case FASTTRAP_T_JCC:
                {
                        uint_t taken;
-                       
+
                        switch (tp->ftt_code) {
                                case FASTTRAP_JO:
                                        taken = (regs32->efl & FASTTRAP_EFLAGS_OF) != 0;
@@ -1255,7 +1245,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                                default:
                                        taken = FALSE;
                        }
-                       
+
                        if (taken)
                                new_pc = tp->ftt_dest;
                        else
@@ -1283,7 +1273,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                                default:
                                        taken = FALSE;
                        }
-                       
+
                        if (taken)
                                new_pc = tp->ftt_dest;
                        else
@@ -1294,7 +1284,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                case FASTTRAP_T_JCXZ:
                {
                        greg_t cx = regs32->ecx;
-                       
+
                        if (cx == 0)
                                new_pc = tp->ftt_dest;
                        else
@@ -1306,18 +1296,18 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                {
                        user_addr_t addr = regs32->uesp - sizeof (uint32_t);
                        int ret = fasttrap_suword32(addr, (uint32_t)regs32->ebp);
-                       
+
                        if (ret == -1) {
                                fasttrap_sigsegv(p, uthread, addr);
                                new_pc = pc;
                                break;
                        }
-                       
+
                        regs32->uesp = addr;
                        new_pc = pc + tp->ftt_size;
                        break;
                }
-               
+
                case FASTTRAP_T_NOP:
                        new_pc = pc + tp->ftt_size;
                        break;
@@ -1334,7 +1324,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                                if (tp->ftt_index != FASTTRAP_NOREG)
                                        addr += fasttrap_getreg(regs, tp->ftt_index) <<
                                                tp->ftt_scale;
-                               
+
                                if (tp->ftt_code == 1) {
                                        /*
                                         * If there's a segment prefix for this
@@ -1348,7 +1338,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                                                new_pc = pc;
                                                break;
                                        }
-                                       
+
                                        uint32_t value32;
                                        addr = (user_addr_t)(uint32_t)addr;
                                        if (fasttrap_fuword32(addr, &value32) == -1) {
@@ -1371,13 +1361,13 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                        if (tp->ftt_type == FASTTRAP_T_CALL) {
                                user_addr_t addr = regs32->uesp - sizeof (uint32_t);
                                int ret = fasttrap_suword32(addr, (uint32_t)(pc + tp->ftt_size));
-                               
+
                                if (ret == -1) {
                                        fasttrap_sigsegv(p, uthread, addr);
                                        new_pc = pc;
                                        break;
                                }
-                               
+
                                regs32->uesp = addr;
                        }
                        break;
@@ -1456,7 +1446,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                        i += tp->ftt_size;
                        scratch[i++] = FASTTRAP_INT;
                        scratch[i++] = T_DTRACE_RET;
-                       
+
                        ASSERT(i <= sizeof (scratch));
 
                        if (fasttrap_copyout(scratch, write_addr, i)) {
@@ -1464,7 +1454,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                                new_pc = pc;
                                break;
                        }
-                       
+
                        if (tp->ftt_retids != NULL) {
                                uthread->t_dtrace_step = 1;
                                uthread->t_dtrace_ret = 1;
@@ -1472,17 +1462,17 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                        } else {
                                new_pc = uthread->t_dtrace_scrpc;
                        }
-                       
+
                        uthread->t_dtrace_pc = pc;
                        uthread->t_dtrace_npc = pc + tp->ftt_size;
                        uthread->t_dtrace_on = 1;
                        break;
                }
-               
+
                default:
                        panic("fasttrap: mishandled an instruction");
        }
-       
+
 done:
        /*
         * APPLE NOTE:
@@ -1619,10 +1609,10 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
 
                for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
                        fasttrap_probe_t *probe = id->fti_probe;
-                       
+
                        if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) {
-                               uint8_t already_triggered = atomic_or_8(&probe->ftp_triggered, 1);
-                               if (already_triggered) {
+                               if (os_atomic_xchg(&probe->ftp_triggered, 1, relaxed)) {
+                                       /* already triggered */
                                        continue;
                                }
                        }
@@ -1635,7 +1625,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                                retire_tp = 0;
                        }
                        if (ISSET(current_proc()->p_lflag, P_LNOATTACH)) {
-                               dtrace_probe(dtrace_probeid_error, 0 /* state */, probe->ftp_id, 
+                               dtrace_probe(dtrace_probeid_error, 0 /* state */, probe->ftp_id,
                                             1 /* ndx */, -1 /* offset */, DTRACEFLT_UPRIV);
                        } else if (id->fti_ptype == DTFTP_ENTRY) {
                                /*
@@ -1665,10 +1655,10 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                                             regs64->r8);
                        } else {
                                uint64_t t[5];
-                               
+
                                fasttrap_usdt_args64(probe, regs64,
                                                     sizeof (t) / sizeof (t[0]), t);
-                               
+
                                dtrace_probe(probe->ftp_id, t[0], t[1],
                                             t[2], t[3], t[4]);
                        }
@@ -1725,7 +1715,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                        user_addr_t dst;
                        user_addr_t addr;
                        int ret;
-                       
+
                        /*
                         * We have to emulate _every_ facet of the behavior of a ret
                         * instruction including what happens if the load from %rsp
@@ -1733,25 +1723,25 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                         */
                        ret = fasttrap_fuword64((user_addr_t)regs64->isf.rsp, &dst);
                        addr = regs64->isf.rsp + sizeof (uint64_t);
-                       
+
                        if (ret == -1) {
                                fasttrap_sigsegv(p, uthread, (user_addr_t)regs64->isf.rsp);
                                new_pc = pc;
                                break;
                        }
-                       
+
                        if (tp->ftt_type == FASTTRAP_T_RET16)
                                addr += tp->ftt_dest;
-                       
+
                        regs64->isf.rsp = addr;
                        new_pc = dst;
                        break;
                }
-               
+
                case FASTTRAP_T_JCC:
                {
                        uint_t taken;
-                       
+
                        switch (tp->ftt_code) {
                                case FASTTRAP_JO:
                                        taken = (regs64->isf.rflags & FASTTRAP_EFLAGS_OF) != 0;
@@ -1812,7 +1802,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                                default:
                                        taken = FALSE;
                        }
-                       
+
                        if (taken)
                                new_pc = tp->ftt_dest;
                        else
@@ -1824,7 +1814,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                {
                        uint_t taken;
                        uint64_t cx = regs64->rcx--;
-                       
+
                        switch (tp->ftt_code) {
                                case FASTTRAP_LOOPNZ:
                                        taken = (regs64->isf.rflags & FASTTRAP_EFLAGS_ZF) == 0 &&
@@ -1840,14 +1830,14 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                                default:
                                        taken = FALSE;
                        }
-                       
+
                        if (taken)
                                new_pc = tp->ftt_dest;
                        else
                                new_pc = pc + tp->ftt_size;
                        break;
                }
-               
+
                case FASTTRAP_T_JCXZ:
                {
                        uint64_t cx = regs64->rcx;
@@ -1863,18 +1853,18 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                {
                        user_addr_t addr = regs64->isf.rsp - sizeof (uint64_t);
                        int ret = fasttrap_suword64(addr, (uint64_t)regs64->rbp);
-                       
+
                        if (ret == -1) {
                                fasttrap_sigsegv(p, uthread, addr);
                                new_pc = pc;
                                break;
                        }
-                       
+
                        regs64->isf.rsp = addr;
                        new_pc = pc + tp->ftt_size;
                        break;
                }
-               
+
                case FASTTRAP_T_NOP:
                        new_pc = pc + tp->ftt_size;
                        break;
@@ -1885,13 +1875,13 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                                new_pc = tp->ftt_dest;
                        } else {
                                user_addr_t value, addr = tp->ftt_dest;
-                               
+
                                if (tp->ftt_base != FASTTRAP_NOREG)
                                        addr += fasttrap_getreg(regs, tp->ftt_base);
                                if (tp->ftt_index != FASTTRAP_NOREG)
                                        addr += fasttrap_getreg(regs, tp->ftt_index) <<
                                                tp->ftt_scale;
-                               
+
                                if (tp->ftt_code == 1) {
                                        /*
                                         * If there's a segment prefix for this
@@ -1905,7 +1895,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                                                new_pc = pc;
                                                break;
                                        }
-                                       
+
                                        if (fasttrap_fuword64(addr, &value) == -1) {
                                                fasttrap_sigsegv(p, uthread, addr);
                                                new_pc = pc;
@@ -1926,13 +1916,13 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                        if (tp->ftt_type == FASTTRAP_T_CALL) {
                                user_addr_t addr = regs64->isf.rsp - sizeof (uint64_t);
                                int ret = fasttrap_suword64(addr, pc + tp->ftt_size);
-                               
+
                                if (ret == -1) {
                                        fasttrap_sigsegv(p, uthread, addr);
                                        new_pc = pc;
                                        break;
                                }
-                               
+
                                regs64->isf.rsp = addr;
                        }
                        break;
@@ -1942,7 +1932,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                        user_addr_t addr, write_addr;
                        uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 22];
                        uint_t i = 0;
-                       
+
                        /*
                         * Generic Instruction Tracing
                         * ---------------------------
@@ -2043,10 +2033,10 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
 
                        if (tp->ftt_ripmode != 0) {
                                uint64_t* reg;
-                               
+
                                ASSERT(tp->ftt_ripmode &
                                       (FASTTRAP_RIP_1 | FASTTRAP_RIP_2));
-                               
+
                                /*
                                 * If this was a %rip-relative instruction, we change
                                 * it to be either a %rax- or %rcx-relative
@@ -2060,12 +2050,12 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                                        scratch[i++] = FASTTRAP_REX(1, 0, 0, 1);
                                else
                                        scratch[i++] = FASTTRAP_REX(1, 0, 0, 0);
-                               
+
                                if (tp->ftt_ripmode & FASTTRAP_RIP_1)
                                        scratch[i++] = FASTTRAP_MOV_EAX;
                                else
                                        scratch[i++] = FASTTRAP_MOV_ECX;
-                               
+
                                switch (tp->ftt_ripmode) {
                                        case FASTTRAP_RIP_1:
                                                reg = &regs64->rax;
@@ -2087,7 +2077,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                                                reg = NULL;
                                                panic("unhandled ripmode in fasttrap_pid_probe64");
                                }
-                               
+
                                /* LINTED - alignment */
                                *(uint64_t *)&scratch[i] = *reg;
                                uthread->t_dtrace_regv = *reg;
@@ -2132,17 +2122,17 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                        } else {
                                new_pc = uthread->t_dtrace_scrpc;
                        }
-                       
+
                        uthread->t_dtrace_pc = pc;
                        uthread->t_dtrace_npc = pc + tp->ftt_size;
                        uthread->t_dtrace_on = 1;
                        break;
                }
-               
+
                default:
                        panic("fasttrap: mishandled an instruction");
        }
-       
+
 done:
        /*
         * APPLE NOTE:
index 348e04a30ebe1a99efd3cc71815264956d2a986d..e286708cbfe5be6372773b1e9a157b3216d580d7 100644 (file)
  * APPLE NOTE: This file was originally uts/intel/sys/regset.h
  */
 
-/*
- * #pragma ident       "@(#)regset.h   1.11    05/06/08 SMI"
- */
-
 #ifdef __cplusplus
 extern "C" {
 #endif
index 63d1a843015983165b4ac9b8235ec3a54345b356..0b7d9516eba32c824508f623891a5690c24acfbd 100644 (file)
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)fbt.c      1.15    05/09/19 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
-#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */
 #include <kern/thread.h>
 #include <mach/thread_status.h>
 #include <mach/vm_param.h>
index 14f7ea974ca5b333b8ffb0ffc88293e2833d371d..6d7c48b7a2b977d63ae9962365b7e9be93ea3771 100644 (file)
 /*     Copyright (c) 1988 AT&T */
 /*       All Rights Reserved   */
 
-
-/*
- * #pragma ident       "@(#)instr_size.c       1.14    05/07/08 SMI"
- */
-
 #include <sys/dtrace.h>
 #include <sys/dtrace_glue.h>
 
index 1b3d774f3a8e1e249d29f00135bc9cf9de1d8d6d..1512e6b0c4abd615e933b768f67785aee36723e4 100644 (file)
 #include        <machine/exec.h>
 #include        <machine/machine_routines.h>
 
+#if __x86_64__
+extern int bootarg_no32exec;    /* bsd_init.c */
+#endif
+
 /**********************************************************************
 * Routine:     grade_binary()
 *
@@ -48,7 +52,7 @@
 *              by 32-bit binaries. 0 means unsupported.
 **********************************************************************/
 int
-grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype)
+grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, bool allow_simulator_binary __unused)
 {
        cpu_subtype_t hostsubtype = cpu_subtype();
 
@@ -72,6 +76,11 @@ grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype)
                }
                break;
        case CPU_TYPE_X86:              /* native */
+#if __x86_64__
+               if (bootarg_no32exec && !allow_simulator_binary) {
+                       return 0;
+               }
+#endif
                return 1;
        }
 
index b5c244cd8ae5ce6710ff983050c49990d5c0b927..4b78fe79101ce618e7a0aea6ec28757159f05d69 100644 (file)
  * Use is subject to license terms.
  */
 
-/* #pragma ident       "@(#)sdt.c      1.9     08/07/01 SMI" */
-
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
-#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */
 #include <kern/cpu_data.h>
 #include <kern/thread.h>
 #include <mach/thread_status.h>
index 39dd111107ac1a9273bd900a1d957b0140799c6d..2300e0b7f67e2f88c25bda03257d451bb3155778 100644 (file)
@@ -1027,9 +1027,28 @@ SYSCTL_INT(_machdep, OID_AUTO, fpsimd_fault_popc,
     CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
     &fpsimd_fault_popc, 0, "");
 
-extern int allow_64bit_proc_LDT_ops;
-SYSCTL_INT(_machdep, OID_AUTO, ldt64,
-    CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
-    &allow_64bit_proc_LDT_ops, 0, "");
+volatile int stop_spinning;
+static int
+spin_in_the_kernel(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int new = 0, old = 0, changed = 0, error;
+
+       error = sysctl_io_number(req, old, sizeof(int), &new, &changed);
+       if (error == 0 && changed) {
+               stop_spinning = FALSE;
+               while (stop_spinning == FALSE) {
+                       __builtin_ia32_pause();
+               }
+       } else if (error == 0) {
+               stop_spinning = TRUE;
+       }
+
+       return error;
+}
+
+SYSCTL_PROC(_machdep_misc, OID_AUTO, spin_forever,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+    0, 0,
+    spin_in_the_kernel, "I", "Spin forever");
 
 #endif /* DEVELOPMENT || DEBUG */
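
The replacement sysctl gives DEVELOPMENT/DEBUG kernels a way to wedge a thread in the kernel on demand (handy for exercising watchdogs and debugger attach): a write parks the calling thread in a pause loop, and a later read flips stop_spinning to release it. Assuming the usual naming for the machdep.misc node declared above, usage would look like:

	sysctl machdep.misc.spin_forever=1   # this shell now spins in-kernel
	sysctl machdep.misc.spin_forever     # from another shell: releases it

Note that CTLFLAG_MASKED keeps the OID out of a plain sysctl -a listing.
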
index ebbca2d75d7a9ada3925cf5b37deeabaf94a1e64..a5a7255bdf20b7e3623ea688c6d9771e1920c169 100644 (file)
@@ -47,6 +47,7 @@
 #include <sys/sysproto.h>
 #include <sys/kauth.h>
 #include <sys/systm.h>
+#include <sys/bitstring.h>
 
 #include <security/audit/audit.h>
 
 
 #include <machine/pal_routines.h>
 
+#if CONFIG_MACF
+#include <security/mac_framework.h>
+#endif
+
 #if CONFIG_DTRACE
 extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
 extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
@@ -85,7 +90,7 @@ unix_syscall(x86_saved_state_t *state)
 {
        thread_t                thread;
        void                    *vt;
-       unsigned int            code;
+       unsigned int            code, syscode;
        struct sysent           *callp;
 
        int                     error;
@@ -116,19 +121,21 @@ unix_syscall(x86_saved_state_t *state)
                p = (struct proc *)get_bsdtask_info(current_task());
        }
 
-       code = regs->eax & I386_SYSCALL_NUMBER_MASK;
+       code    = regs->eax & I386_SYSCALL_NUMBER_MASK;
+       syscode = (code < nsysent) ? code : SYS_invalid;
        DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
-           code, syscallnames[code >= nsysent ? SYS_invalid : code], (uint32_t)regs->eip);
+           code, syscallnames[syscode], (uint32_t)regs->eip);
        params = (vm_offset_t) (regs->uesp + sizeof(int));
 
        regs->efl &= ~(EFL_CF);
 
-       callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];
+       callp = &sysent[syscode];
 
        if (__improbable(callp == sysent)) {
                code = fuword(params);
                params += sizeof(int);
-               callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];
+               syscode = (code < nsysent) ? code : SYS_invalid;
+               callp = &sysent[syscode];
        }
 
        vt = (void *)uthread->uu_arg;
@@ -152,11 +159,9 @@ unix_syscall(x86_saved_state_t *state)
                }
 
                if (__probable(!code_is_kdebug_trace(code))) {
-                       int *ip = (int *)vt;
-
-                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                           BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                           *ip, *(ip + 1), *(ip + 2), *(ip + 3), 0);
+                       uint32_t *uip = vt;
+                       KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+                           uip[0], uip[1], uip[2], uip[3]);
                }
 
 #if CONFIG_REQUIRES_U32_MUNGING
@@ -167,9 +172,7 @@ unix_syscall(x86_saved_state_t *state)
                }
 #endif
        } else {
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                   BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                   0, 0, 0, 0, 0);
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START);
        }
 
        /*
@@ -189,10 +192,23 @@ unix_syscall(x86_saved_state_t *state)
        uthread->uu_vpindex = 0;
 #endif
 
+#if CONFIG_MACF
+       if (__improbable(p->syscall_filter_mask != NULL && !bitstr_test(p->syscall_filter_mask, syscode))) {
+               error = mac_proc_check_syscall_unix(p, syscode);
+               if (error) {
+                       goto skip_syscall;
+               }
+       }
+#endif /* CONFIG_MACF */
+
        AUDIT_SYSCALL_ENTER(code, p, uthread);
        error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
        AUDIT_SYSCALL_EXIT(code, p, uthread, error);
 
+#if CONFIG_MACF
+skip_syscall:
+#endif /* CONFIG_MACF */
+
 #ifdef JOE_DEBUG
        if (uthread->uu_iocount) {
                printf("system call returned with uu_iocount != 0\n");
@@ -250,9 +266,8 @@ unix_syscall(x86_saved_state_t *state)
                throttle_lowpri_io(1);
        }
        if (__probable(!code_is_kdebug_trace(code))) {
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                   BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
-                   error, uthread->uu_rval[0], uthread->uu_rval[1], pid, 0);
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+                   error, uthread->uu_rval[0], uthread->uu_rval[1], pid);
        }
 
        if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) {
@@ -275,7 +290,7 @@ unix_syscall64(x86_saved_state_t *state)
 {
        thread_t        thread;
        void                    *vt;
-       unsigned int    code;
+       unsigned int    code, syscode;
        struct sysent   *callp;
        int             args_in_regs;
        boolean_t       args_start_at_rdi;
@@ -313,11 +328,12 @@ unix_syscall64(x86_saved_state_t *state)
                /* NOTREACHED */
        }
 
-       code = regs->rax & SYSCALL_NUMBER_MASK;
+       code    = regs->rax & SYSCALL_NUMBER_MASK;
+       syscode = (code < nsysent) ? code : SYS_invalid;
        DEBUG_KPRINT_SYSCALL_UNIX(
                "unix_syscall64: code=%d(%s) rip=%llx\n",
-               code, syscallnames[code >= nsysent ? SYS_invalid : code], regs->isf.rip);
-       callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];
+               code, syscallnames[syscode], regs->isf.rip);
+       callp = &sysent[syscode];
 
        vt = (void *)uthread->uu_arg;
 
@@ -326,8 +342,9 @@ unix_syscall64(x86_saved_state_t *state)
                 * indirect system call... system call number
                 * passed as 'arg0'
                 */
-               code = regs->rdi;
-               callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];
+               code    = regs->rdi;
+               syscode = (code < nsysent) ? code : SYS_invalid;
+               callp   = &sysent[syscode];
                args_start_at_rdi = FALSE;
                args_in_regs = 5;
        } else {
@@ -341,13 +358,11 @@ unix_syscall64(x86_saved_state_t *state)
                args_in_regs = MIN(args_in_regs, callp->sy_narg);
                memcpy(vt, args_start_at_rdi ? &regs->rdi : &regs->rsi, args_in_regs * sizeof(syscall_arg_t));
 
-
                if (!code_is_kdebug_trace(code)) {
-                       uint64_t *ip = (uint64_t *)vt;
+                       uint64_t *uip = vt;
 
-                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                           BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                           (int)(*ip), (int)(*(ip + 1)), (int)(*(ip + 2)), (int)(*(ip + 3)), 0);
+                       KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+                           uip[0], uip[1], uip[2], uip[3]);
                }
 
                if (__improbable(callp->sy_narg > args_in_regs)) {
@@ -364,9 +379,7 @@ unix_syscall64(x86_saved_state_t *state)
                        }
                }
        } else {
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                   BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                   0, 0, 0, 0, 0);
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START);
        }
 
        /*
@@ -386,10 +399,23 @@ unix_syscall64(x86_saved_state_t *state)
        uthread->uu_vpindex = 0;
 #endif
 
+#if CONFIG_MACF
+       if (__improbable(p->syscall_filter_mask != NULL && !bitstr_test(p->syscall_filter_mask, syscode))) {
+               error = mac_proc_check_syscall_unix(p, syscode);
+               if (error) {
+                       goto skip_syscall;
+               }
+       }
+#endif /* CONFIG_MACF */
+
        AUDIT_SYSCALL_ENTER(code, p, uthread);
        error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0]));
        AUDIT_SYSCALL_EXIT(code, p, uthread, error);
 
+#if CONFIG_MACF
+skip_syscall:
+#endif /* CONFIG_MACF */
+
 #ifdef JOE_DEBUG
        if (uthread->uu_iocount) {
                printf("system call returned with uu_iocount != 0\n");
@@ -463,9 +489,8 @@ unix_syscall64(x86_saved_state_t *state)
                throttle_lowpri_io(1);
        }
        if (__probable(!code_is_kdebug_trace(code))) {
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                   BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
-                   error, uthread->uu_rval[0], uthread->uu_rval[1], pid, 0);
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+                   error, uthread->uu_rval[0], uthread->uu_rval[1], pid);
        }
 
 #if PROC_REF_DEBUG
@@ -602,9 +627,8 @@ unix_syscall_return(int error)
                throttle_lowpri_io(1);
        }
        if (!code_is_kdebug_trace(code)) {
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                   BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
-                   error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);
+               KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+                   error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid);
        }
 
        thread_exception_return();
index 603b216142413035da1576fbb04286f6ee9bf85b..724a1d210d791f6b7c15b96e6bfebce4107f6f87 100644 (file)
@@ -37,6 +37,7 @@
 #include <mach/exception.h>
 
 #include <kern/thread.h>
+#include <kern/ast.h>
 
 #include <sys/systm.h>
 #include <sys/param.h>
@@ -160,7 +161,7 @@ siginfo_user_to_user64_x86(user_siginfo_t *in, user64_siginfo_t *out)
 }
 
 void
-sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint32_t code)
+sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint32_t code, sigset_t siginfo)
 {
        union {
                struct mcontext_avx32           mctx_avx32;
@@ -198,7 +199,7 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint
        thread = current_thread();
        ut = get_bsdthread_info(thread);
 
-       if (p->p_sigacts->ps_siginfo & sigmask(sig)) {
+       if (siginfo & sigmask(sig)) {
                infostyle = UC_FLAVOR;
        }
 
@@ -802,6 +803,9 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval)
                return 0;
        }
 
+       /* see osfmk/kern/restartable.c */
+       act_set_ast_reset_pcs(thread);
+
        bzero(mctxp, sizeof(*mctxp));
 
        sig_xstate = current_xstate();
index ffac54d040b1af511af242afe46fc1478f795216..184862aa211f1bf1ef3c797d863154abf3d8b98e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2004-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -131,30 +131,30 @@ extern ppnum_t  pmap_find_phys(pmap_t pmap, addr64_t va);
  */
 
 static struct bdevsw mdevbdevsw = {
-       /* open */ mdevopen,
-       /* close */ mdevclose,
-       /* strategy */ mdevstrategy,
-       /* ioctl */ mdevbioctl,
-       /* dump */ eno_dump,
-       /* psize */ mdevsize,
-       /* flags */ D_DISK,
+       .d_open     = mdevopen,
+       .d_close    = mdevclose,
+       .d_strategy = mdevstrategy,
+       .d_ioctl    = mdevbioctl,
+       .d_dump     = eno_dump,
+       .d_psize    = mdevsize,
+       .d_type     = D_DISK,
 };
 
 static struct cdevsw mdevcdevsw = {
-       /* open */ mdevopen,
-       /* close */ mdevclose,
-       /* read */ mdevrw,
-       /* write */ mdevrw,
-       /* ioctl */ mdevcioctl,
-       /* stop */ eno_stop,
-       /* reset */ eno_reset,
-       /* ttys */ NULL,
-       /* select */ eno_select,
-       /* mmap */ eno_mmap,
-       /* strategy */ eno_strat,
-       /* getc */ eno_getc,
-       /* putc */ eno_putc,
-       /* flags */ D_DISK,
+       .d_open       = mdevopen,
+       .d_close      = mdevclose,
+       .d_read       = mdevrw,
+       .d_write      = mdevrw,
+       .d_ioctl      = mdevcioctl,
+       .d_stop       = eno_stop,
+       .d_reset      = eno_reset,
+       .d_ttys       = NULL,
+       .d_select     = eno_select,
+       .d_mmap       = eno_mmap,
+       .d_strategy   = eno_strat,
+       .d_reserved_1 = eno_getc,
+       .d_reserved_2 = eno_putc,
+       .d_type       = D_DISK,
 };
 
 struct mdev {
index 4a320cbbb76293458e877295663f6d0ab60ed0fb..6c445d7a3e9dd53d6e59fb89ba754fcd44cd7475 100644 (file)
@@ -346,9 +346,11 @@ mt_sysctl SYSCTL_HANDLER_ARGS
        case MT_SUPPORTED:
                return sysctl_io_number(req, (int)mt_core_supported, sizeof(int), NULL, NULL);
        case MT_PMIS:
-               return sysctl_io_number(req, mt_pmis, sizeof(mt_pmis), NULL, NULL);
-       case MT_RETROGRADE:
-               return sysctl_io_number(req, mt_retrograde, sizeof(mt_retrograde), NULL, NULL);
+               return sysctl_io_number(req, mt_count_pmis(), sizeof(uint64_t), NULL, NULL);
+       case MT_RETROGRADE: {
+               uint64_t value = os_atomic_load_wide(&mt_retrograde, relaxed);
+               return sysctl_io_number(req, value, sizeof(mt_retrograde), NULL, NULL);
+       }
        case MT_TASK_THREAD:
                return sysctl_io_number(req, (int)mt_core_supported, sizeof(int), NULL, NULL);
        case MT_DEBUG: {
index c0819facb1708633ca4add08812eb7319998078b..a3926c5db0ec19d11197164a043816d66f0ad4d3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -142,30 +142,30 @@ static int      vndevice_cdev_major;
  */
 
 static struct bdevsw vn_bdevsw = {
-       /* open */ vnopen,
-       /* close */ vnclose,
-       /* strategy */ vnstrategy,
-       /* ioctl */ vnioctl_blk,
-       /* dump */ eno_dump,
-       /* psize */ vnsize,
-       /* flags */ D_DISK,
+       .d_open     = vnopen,
+       .d_close    = vnclose,
+       .d_strategy = vnstrategy,
+       .d_ioctl    = vnioctl_blk,
+       .d_dump     = eno_dump,
+       .d_psize    = vnsize,
+       .d_type     = D_DISK,
 };
 
 static struct cdevsw vn_cdevsw = {
-       /* open */ vnopen,
-       /* close */ vnclose,
-       /* read */ vnread,
-       /* write */ vnwrite,
-       /* ioctl */ vnioctl_chr,
-       /* stop */ eno_stop,
-       /* reset */ eno_reset,
-       /* ttys */ NULL,
-       /* select */ eno_select,
-       /* mmap */ eno_mmap,
-       /* strategy */ eno_strat,
-       /* getc */ eno_getc,
-       /* putc */ eno_putc,
-       /* flags */ D_DISK,
+       .d_open       = vnopen,
+       .d_close      = vnclose,
+       .d_read       = vnread,
+       .d_write      = vnwrite,
+       .d_ioctl      = vnioctl_chr,
+       .d_stop       = eno_stop,
+       .d_reset      = eno_reset,
+       .d_ttys       = NULL,
+       .d_select     = eno_select,
+       .d_mmap       = eno_mmap,
+       .d_strategy   = eno_strat,
+       .d_reserved_1 = eno_getc,
+       .d_reserved_2 = eno_putc,
+       .d_type       = D_DISK,
 };
 
 struct vn_softc {
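
The vn_bdevsw/vn_cdevsw tables here and the mdev tables above are rewritten from positional comment-style initializers to field designators, which is what makes renames like getc/putc → d_reserved_1/d_reserved_2 safe: every value binds to a field name rather than a slot position. A toy illustration:

	/* Designated initializers survive field renames and reordering. */
	struct ops {
		int (*op_open)(void);
		int (*op_close)(void);
		int  op_reserved;        /* slot formerly known as "getc" */
	};

	static struct ops example = {
		.op_open     = NULL,
		.op_close    = NULL,
		.op_reserved = 0,        /* bound by name, not by position */
	};
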
index 5763410f84f6801e29f0609c7c55203f6bbf1edb..ab4a4ac8644317720ff1dcf377cbcc33b7ce74bf 100644 (file)
@@ -12,6 +12,9 @@ DATAFILES = \
        types.h vmparam.h _types.h _param.h \
        _mcontext.h
 
+DRIVERKIT_DATAFILES = \
+       limits.h types.h _types.h
+
 PRIVATE_DATAFILES = \
        disklabel.h
 
@@ -22,6 +25,7 @@ KERNELFILES = \
        _mcontext.h
 
 INSTALL_MD_LIST = ${DATAFILES}
+INSTALL_DRIVERKIT_MD_LIST = ${DRIVERKIT_DATAFILES}
 INSTALL_MD_LCL_LIST = ${PRIVATE_DATAFILES}
 
 INSTALL_MD_DIR = i386
index b627e5201614f95519df644f2d13ab26eee75985..1f6f2c7813036c49952507044ace6965ce4c8f6a 100644 (file)
@@ -30,8 +30,6 @@
 #ifndef _DIS_TABLES_H
 #define _DIS_TABLES_H
 
-/* #pragma ident       "@(#)dis_tables.h       1.10    07/07/10 SMI" */
-
 /*
  * Constants and prototypes for the IA32 disassembler backend.  See dis_tables.c
  * for usage information and documentation.
diff --git a/bsd/i386/exec.h b/bsd/i386/exec.h
deleted file mode 100644 (file)
index 24de864..0000000
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*-
- * Copyright (c) 1992, 1993
- *     The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *     This product includes software developed by the University of
- *     California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *     @(#)exec.h      8.1 (Berkeley) 6/11/93
- */
-
-#ifndef _BSD_I386_EXEC_H_
-#define _BSD_I386_EXEC_H_
-
-
-#ifdef BSD_KERNEL_PRIVATE
-/* Size of a page in an object file. */
-#define __LDPGSZ        4096
-
-/* Valid magic number check. */
-#define N_BADMAG(ex) \
-       ((ex).a_magic != NMAGIC && (ex).a_magic != OMAGIC && \
-           (ex).a_magic != ZMAGIC)
-
-/* Address of the bottom of the text segment. */
-#define N_TXTADDR(X)    0
-
-/* Address of the bottom of the data segment. */
-#define N_DATADDR(ex) \
-       (N_TXTADDR(ex) + ((ex).a_magic == OMAGIC ? (ex).a_text \
-       : __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1))))
-
-/* Text segment offset. */
-#define N_TXTOFF(ex) \
-       ((ex).a_magic == ZMAGIC ? __LDPGSZ : sizeof(struct exec))
-
-/* Data segment offset. */
-#define N_DATOFF(ex) \
-       (N_TXTOFF(ex) + ((ex).a_magic != ZMAGIC ? (ex).a_text : \
-       __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1))))
-
-/* Symbol table offset. */
-#define N_SYMOFF(ex) \
-       (N_TXTOFF(ex) + (ex).a_text + (ex).a_data + (ex).a_trsize + \
-           (ex).a_drsize)
-
-/* String table offset. */
-#define N_STROFF(ex)    (N_SYMOFF(ex) + (ex).a_syms)
-
-/* Description of the object file header (a.out format). */
-struct exec {
-#define OMAGIC  0407            /* old impure format */
-#define NMAGIC  0410            /* read-only text */
-#define ZMAGIC  0413            /* demand load format */
-#define QMAGIC  0314            /* demand load format. Header in text. */
-       unsigned int    a_magic;        /* magic number */
-
-       unsigned int    a_text;         /* text segment size */
-       unsigned int    a_data;         /* initialized data size */
-       unsigned int    a_bss;          /* uninitialized data size */
-       unsigned int    a_syms;         /* symbol table size */
-       unsigned int    a_entry;        /* entry point */
-       unsigned int    a_trsize;       /* text relocation size */
-       unsigned int    a_drsize;       /* data relocation size */
-};
-
-#endif /* BSD_KERNEL_PRIVATE */
-
-#endif /* _BSD_I386_EXEC_H_ */
index 974b59c5b10796ee2bb28ea5b6566293c34a7634..512d55512d235b307892853cd5e1718af89d1cf5 100644 (file)
 #ifndef _FASTTRAP_ISA_H
 #define _FASTTRAP_ISA_H
 
-/*
- * #pragma ident       "@(#)fasttrap_isa.h     1.6     06/09/19 SMI"
- */
-
 #include <sys/types.h>
 #include <stdint.h>
 
index 9bc2e5718409aa2460b035d66089ab88f4b771c9..f6cafd9c952cb7362f95e2e0832ee58f3edb5c77 100644 (file)
 #define _I386_LIMITS_H_
 
 #include <sys/cdefs.h>
+#ifndef DRIVERKIT
 #include <i386/_limits.h>
+#endif
 
 #define CHAR_BIT        8               /* number of bits in a char */
 #define MB_LEN_MAX      6               /* Allow 31 bit UTF2 */
 
+#ifndef DRIVERKIT
 #if !defined(_ANSI_SOURCE) && (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE))
 #define CLK_TCK         __DARWIN_CLK_TCK        /* ticks per second */
 #endif /* !_ANSI_SOURCE && (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */
+#endif
 
 /*
  * According to ANSI (section 2.2.4.2), the values below must be usable by
diff --git a/bsd/i386/reboot.h b/bsd/i386/reboot.h
deleted file mode 100644 (file)
index 0fbfa53..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#ifndef _BSD_I386_REBOOT_H_
-#define _BSD_I386_REBOOT_H_
-
-/*
- * Empty file (publicly)
- */
-
-#include <sys/appleapiopts.h>
-
-#ifdef  BSD_KERNEL_PRIVATE
-
-/*
- *     Use most significant 16 bits to avoid collisions with
- *     machine independent flags.
- */
-#define RB_POWERDOWN    0x00010000      /* power down on halt */
-#define RB_NOBOOTRC     0x00020000      /* don't run '/etc/rc.boot' */
-#define RB_DEBUG        0x00040000      /* drop into mini monitor on panic */
-#define RB_EJECT        0x00080000      /* eject disks on halt */
-#define RB_COMMAND      0x00100000      /* new boot command specified */
-#define RB_NOFP         0x00200000      /* don't use floating point */
-#define RB_BOOTNEXT     0x00400000      /* reboot into NeXT */
-#define RB_BOOTDOS      0x00800000      /* reboot into DOS */
-#define RB_PRETTY       0x01000000      /* shutdown with pretty graphics */
-
-#endif  /* BSD_KERNEL_PRIVATE */
-
-#endif  /* _BSD_I386_REBOOT_H_ */
index 7fc56d21798cd24f3dea9cb2efe5a3502b5c6f42..19183cbbf3749f05c8d4bb1baf42c11d9e86ef5f 100644 (file)
@@ -44,6 +44,8 @@ extern void bsd_ast(thread_t);
 
 extern void kevent_ast(thread_t thread, uint16_t bits);
 extern void act_set_astkevent(thread_t thread, uint16_t bits);
+extern uint16_t act_clear_astkevent(thread_t thread, uint16_t bits);
+extern void act_set_ast_reset_pcs(thread_t thread);
 
 #if CONFIG_DTRACE
 extern void ast_dtrace_on(void);
index 73be8cd43533cab875278789457fbf04e42707de..3e2052fb03016c41249ca11603dbf12e321af472 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <sys/event.h>                  /* for knote_init() */
 #include <sys/eventhandler.h>           /* for eventhandler_init() */
 #include <sys/kern_memorystatus.h>      /* for memorystatus_init() */
+#include <sys/kern_memorystatus_freeze.h> /* for memorystatus_freeze_init() */
 #include <sys/aio_kern.h>               /* for aio_init() */
 #include <sys/semaphore.h>              /* for psem_cache_init() */
 #include <net/dlil.h>                   /* for dlil_init() */
 #include <netinet/tcp_cc.h>                     /* for tcp_cc_init() */
 #include <netinet/mptcp_var.h>          /* for mptcp_control_register() */
 #include <net/nwk_wq.h>                 /* for nwk_wq_init */
+#include <net/restricted_in_port.h> /* for restricted_in_port_init() */
 #include <kern/assert.h>                /* for assert() */
 #include <sys/kern_overrides.h>         /* for init_system_override() */
 
 
 #include <machine/exec.h>
 
-#if NFSCLIENT
+#if CONFIG_NETBOOT
 #include <sys/netboot.h>
 #endif
 
@@ -236,9 +238,10 @@ dev_t   dumpdev;                /* device to take dumps on */
 long    dumplo;                 /* offset into dumpdev */
 long    hostid;
 char    hostname[MAXHOSTNAMELEN];
-int             hostnamelen;
+lck_mtx_t hostname_lock;
+lck_grp_t *hostname_lck_grp;
 char    domainname[MAXDOMNAMELEN];
-int             domainnamelen;
+lck_mtx_t domainname_lock;
 
 char rootdevice[DEVMAXNAMESIZE];
 
@@ -247,12 +250,16 @@ struct  kmemstats kmemstats[M_LAST];
 #endif
 
 struct  vnode *rootvp;
-int boothowto = RB_DEBUG;
+int boothowto;
 int minimalboot = 0;
 #if CONFIG_EMBEDDED
 int darkboot = 0;
 #endif
 
+#if __arm64__
+int legacy_footprint_entitlement_mode = LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE;
+#endif /* __arm64__ */
+
 #if PROC_REF_DEBUG
 __private_extern__ int proc_ref_tracking_disabled = 0; /* disable panics on leaked proc refs across syscall boundary */
 #endif
@@ -272,8 +279,19 @@ extern void oslog_setsize(int size);
 extern void throttle_init(void);
 extern void acct_init(void);
 
+#if CONFIG_LOCKERBOOT
+#define LOCKER_PROTOBOOT_MOUNT "/protoboot"
+
+const char kernel_protoboot_mount[] = LOCKER_PROTOBOOT_MOUNT;
+extern int mount_locker_protoboot(const char *fsname, const char *mntpoint,
+    const char *pbdevpath);
+#endif
+
 extern int serverperfmode;
 extern int ncl;
+#if DEVELOPMENT || DEBUG
+extern int syscallfilter_disable;
+#endif // DEVELOPMENT || DEBUG
 
 vm_map_t        bsd_pageable_map;
 vm_map_t        mb_map;
@@ -286,11 +304,10 @@ __private_extern__ vm_offset_t * execargs_cache = NULL;
 
 void bsd_exec_setup(int);
 
-#if __arm64__
-__private_extern__ int bootarg_no64exec = 0;
-#endif
+__private_extern__ int bootarg_execfailurereports = 0;
+
 #if __x86_64__
-__private_extern__ int bootarg_no32exec = 0;
+__private_extern__ int bootarg_no32exec = 1;
 #endif
 __private_extern__ int bootarg_vnode_cache_defeat = 0;
 
@@ -312,6 +329,7 @@ __private_extern__ int bootarg_disable_aslr = 0;
 #if DEVELOPMENT || DEBUG
 char dyld_alt_path[MAXPATHLEN];
 int use_alt_dyld = 0;
+extern uint64_t dyld_flags;
 #endif
 
 int     cmask = CMASK;
@@ -380,9 +398,9 @@ process_name(const char *s, proc_t p)
 
 /* To allow these values to be patched, they're globals here */
 #include <machine/vmparam.h>
-struct rlimit vm_initial_limit_stack = { DFLSSIZ, MAXSSIZ - PAGE_MAX_SIZE };
-struct rlimit vm_initial_limit_data = { DFLDSIZ, MAXDSIZ };
-struct rlimit vm_initial_limit_core = { DFLCSIZ, MAXCSIZ };
+struct rlimit vm_initial_limit_stack = { .rlim_cur = DFLSSIZ, .rlim_max = MAXSSIZ - PAGE_MAX_SIZE };
+struct rlimit vm_initial_limit_data = { .rlim_cur = DFLDSIZ, .rlim_max = MAXDSIZ };
+struct rlimit vm_initial_limit_core = { .rlim_cur = DFLCSIZ, .rlim_max = MAXCSIZ };
 
 extern thread_t cloneproc(task_t, coalition_t, proc_t, int, int);
 extern int      (*mountroot)(void);
@@ -445,11 +463,25 @@ bsd_init(void)
        kern_return_t   ret;
        struct ucred temp_cred;
        struct posix_cred temp_pcred;
-#if NFSCLIENT || CONFIG_IMAGEBOOT
+#if CONFIG_NETBOOT || CONFIG_IMAGEBOOT
        boolean_t       netboot = FALSE;
 #endif
+#if CONFIG_LOCKERBOOT
+       vnode_t pbvn = NULLVP;
+       mount_t pbmnt = NULL;
+       char *pbdevp = NULL;
+       char pbdevpath[64];
+       char pbfsname[MFSNAMELEN];
+       char *slash_dev = NULL;
+#endif
 
-#define bsd_init_kprintf(x...) /* kprintf("bsd_init: " x) */
+#define DEBUG_BSDINIT 0
+
+#if DEBUG_BSDINIT
+#define bsd_init_kprintf(x, ...) kprintf("bsd_init: " x, ## __VA_ARGS__)
+#else
+#define bsd_init_kprintf(x, ...)
+#endif
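
The replacement macro relies on the ## __VA_ARGS__ GNU/Clang extension, which deletes the trailing comma when the variadic list is empty, so calls with only a format string still compile. A standalone sketch of the same pattern (names are illustrative):

    #include <stdio.h>

    #define DEBUG_SKETCH 1

    #if DEBUG_SKETCH
    #define dbg_printf(fmt, ...) printf("debug: " fmt, ## __VA_ARGS__)
    #else
    #define dbg_printf(fmt, ...)
    #endif

    int main(void) {
        dbg_printf("no extra args\n");     /* ## swallows the comma */
        dbg_printf("value = %d\n", 42);
        return 0;
    }
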
 
        throttle_init();
 
@@ -546,6 +578,10 @@ bsd_init(void)
 
        ulock_initialize();
 
+       hostname_lck_grp = lck_grp_alloc_init("hostname", LCK_GRP_ATTR_NULL);
+       lck_mtx_init(&hostname_lock, hostname_lck_grp, LCK_ATTR_NULL);
+       lck_mtx_init(&domainname_lock, hostname_lck_grp, LCK_ATTR_NULL);
+
        /*
         * Create process 0.
         */
@@ -646,7 +682,7 @@ bsd_init(void)
        /* Create the file descriptor table. */
        kernproc->p_fd = &filedesc0;
        filedesc0.fd_cmask = cmask;
-       filedesc0.fd_knlistsize = -1;
+       filedesc0.fd_knlistsize = 0;
        filedesc0.fd_knlist = NULL;
        filedesc0.fd_knhash = NULL;
        filedesc0.fd_knhashmask = 0;
@@ -738,6 +774,7 @@ bsd_init(void)
        bsd_init_kprintf("calling mbinit\n");
        mbinit();
        net_str_id_init(); /* for mbuf tags */
+       restricted_in_port_init();
 #endif /* SOCKETS */
 
        /*
@@ -839,13 +876,8 @@ bsd_init(void)
        bsd_init_kprintf("calling acct_init\n");
        acct_init();
 
-#ifdef GPROF
-       /* Initialize kernel profiling. */
-       kmstartup();
-#endif
-
        bsd_init_kprintf("calling sysctl_mib_init\n");
-       sysctl_mib_init()
+       sysctl_mib_init();
 
        bsd_init_kprintf("calling bsd_autoconf\n");
        bsd_autoconf();
@@ -928,7 +960,7 @@ bsd_init(void)
 
                bsd_init_kprintf("calling setconf\n");
                setconf();
-#if NFSCLIENT
+#if CONFIG_NETBOOT
                netboot = (mountroot == netboot_mountroot);
 #endif
 
@@ -937,7 +969,7 @@ bsd_init(void)
                        break;
                }
                rootdevice[0] = '\0';
-#if NFSCLIENT
+#if CONFIG_NETBOOT
                if (netboot) {
                        PE_display_icon( 0, "noroot");  /* XXX a netboot-specific icon would be nicer */
                        vc_progress_set(FALSE, 0);
@@ -970,7 +1002,7 @@ bsd_init(void)
        (void)vnode_put(rootvnode);
        filedesc0.fd_cdir = rootvnode;
 
-#if NFSCLIENT
+#if CONFIG_NETBOOT
        if (netboot) {
                int err;
 
@@ -992,17 +1024,60 @@ bsd_init(void)
 
 
 #if CONFIG_IMAGEBOOT
+#if CONFIG_LOCKERBOOT
+       /*
+        * Stash the protoboot vnode, mount, filesystem name, and device name for
+        * later use. Note that the mount-from name may not have the "/dev/"
+        * component, so we must sniff out this condition and add it as needed.
+        */
+       pbvn = rootvnode;
+       pbmnt = pbvn->v_mount;
+       pbdevp = vfs_statfs(pbmnt)->f_mntfromname;
+       slash_dev = strnstr(pbdevp, "/dev/", strlen(pbdevp));
+       if (slash_dev) {
+               /*
+                * If the old root is a snapshot mount, it will have the form:
+                *
+                *     com.apple.os.update-<boot manifest hash>@<dev node path>
+                *
+                * So we just search the mntfromname for any occurrence of "/dev/" and
+                * grab that as the device path. The image boot code needs a dev node to
+                * do the re-mount, so we cannot directly mount the snapshot as the
+                * protoboot volume currently.
+                */
+               strlcpy(pbdevpath, slash_dev, sizeof(pbdevpath));
+       } else {
+               snprintf(pbdevpath, sizeof(pbdevpath), "/dev/%s", pbdevp);
+       }
+
+       bsd_init_kprintf("protoboot mount-from: %s\n", pbdevp);
+       bsd_init_kprintf("protoboot dev path: %s\n", pbdevpath);
+
+       strlcpy(pbfsname, pbmnt->mnt_vtable->vfc_name, sizeof(pbfsname));
+#endif
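
The comment above describes the two shapes the mount-from name can take. A hedged userspace sketch of the same sniffing logic, using the BSD strnstr/strlcpy found in macOS libc (the input string is hypothetical):

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const char *mntfrom = "com.apple.os.update-abc123@/dev/disk1s1";
        char devpath[64];

        const char *slash_dev = strnstr(mntfrom, "/dev/", strlen(mntfrom));
        if (slash_dev) {
            /* snapshot form: take everything from "/dev/" onward */
            strlcpy(devpath, slash_dev, sizeof(devpath));
        } else {
            /* plain form: prepend the missing "/dev/" component */
            snprintf(devpath, sizeof(devpath), "/dev/%s", mntfrom);
        }
        printf("%s\n", devpath); /* prints /dev/disk1s1 */
        return 0;
    }
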
        /*
         * See if a system disk image is present. If so, mount it and
         * switch the root vnode to point to it
         */
-       if (netboot == FALSE && imageboot_needed()) {
+       imageboot_type_t imageboot_type = imageboot_needed();
+       if (netboot == FALSE && imageboot_type) {
                /*
                 * An image was found.  No turning back: we're booted
                 * with a kernel from the disk image.
                 */
-               imageboot_setup();
+               bsd_init_kprintf("doing image boot: type = %d\n", imageboot_type);
+               imageboot_setup(imageboot_type);
        }
+
+#if CONFIG_LOCKERBOOT
+       if (imageboot_type == IMAGEBOOT_LOCKER) {
+               bsd_init_kprintf("booting from locker\n");
+               if (vnode_tag(rootvnode) != VT_LOCKERFS) {
+                       panic("root filesystem not a locker: fsname = %s",
+                           rootvnode->v_mount->mnt_vtable->vfc_name);
+               }
+       }
+#endif /* CONFIG_LOCKERBOOT */
 #endif /* CONFIG_IMAGEBOOT */
 
        /* set initial time; all other resource data is already zeroed */
@@ -1017,6 +1092,30 @@ bsd_init(void)
        }
 #endif /* DEVFS */
 
+       if (vfs_mount_rosv_data()) {
+               panic("failed to mount data volume!");
+       }
+
+       if (vfs_mount_vm()) {
+               printf("failed to mount vm volume!");
+       }
+
+#if CONFIG_LOCKERBOOT
+       /*
+        * We need to wait until devfs is up before remounting the protoboot volume
+        * within the locker so that it can have a real devfs vnode backing it.
+        */
+       if (imageboot_type == IMAGEBOOT_LOCKER) {
+               bsd_init_kprintf("re-mounting protoboot volume\n");
+               int error = mount_locker_protoboot(pbfsname, LOCKER_PROTOBOOT_MOUNT,
+                   pbdevpath);
+               if (error) {
+                       panic("failed to mount protoboot volume: dev path = %s, error = %d",
+                           pbdevpath, error);
+               }
+       }
+#endif /* CONFIG_LOCKERBOOT */
+
        /* Initialize signal state for process 0. */
        bsd_init_kprintf("calling siginit\n");
        siginit(kernproc);
@@ -1111,7 +1210,7 @@ setconf(void)
                flags = 0;
        }
 
-#if NFSCLIENT
+#if CONFIG_NETBOOT
        if (flags & 1) {
                /* network device */
                mountroot = netboot_mountroot;
@@ -1119,7 +1218,7 @@ setconf(void)
 #endif
        /* otherwise have vfs determine root filesystem */
        mountroot = NULL;
-#if NFSCLIENT
+#if CONFIG_NETBOOT
 }
 #endif
 }
@@ -1153,23 +1252,19 @@ bsd_utaskbootstrap(void)
        ut = (struct uthread *)get_bsdthread_info(thread);
        ut->uu_sigmask = 0;
        act_set_astbsd(thread);
-       task_clear_return_wait(get_threadtask(thread));
+       task_clear_return_wait(get_threadtask(thread), TCRW_CLEAR_ALL_WAIT);
 }
 
 static void
 parse_bsd_args(void)
 {
-       char namep[16];
+       char namep[48];
        int msgbuf;
 
        if (PE_parse_boot_argn("-s", namep, sizeof(namep))) {
                boothowto |= RB_SINGLE;
        }
 
-       if (PE_parse_boot_argn("-b", namep, sizeof(namep))) {
-               boothowto |= RB_NOBOOTRC;
-       }
-
        if (PE_parse_boot_argn("-x", namep, sizeof(namep))) { /* safe boot */
                boothowto |= RB_SAFEBOOT;
        }
@@ -1183,19 +1278,21 @@ parse_bsd_args(void)
                minimalboot = 1;
        }
 
-#if __arm64__
-       /* disable 64 bit grading */
-       if (PE_parse_boot_argn("-no64exec", namep, sizeof(namep))) {
-               bootarg_no64exec = 1;
-       }
-#endif
 #if __x86_64__
+       int no32exec;
+
        /* disable 32 bit grading */
-       if (PE_parse_boot_argn("-no32exec", namep, sizeof(namep))) {
-               bootarg_no32exec = 1;
+       if (PE_parse_boot_argn("no32exec", &no32exec, sizeof(no32exec))) {
+               bootarg_no32exec = !!no32exec;
        }
 #endif
 
+       int execfailure_crashreports;
+       /* enable crash reports on various exec failures */
+       if (PE_parse_boot_argn("execfailurecrashes", &execfailure_crashreports, sizeof(execfailure_crashreports))) {
+               bootarg_execfailurereports = !!execfailure_crashreports;
+       }
+
        /* disable vnode_cache_is_authorized() by setting vnode_cache_defeat */
        if (PE_parse_boot_argn("-vnode_cache_defeat", namep, sizeof(namep))) {
                bootarg_vnode_cache_defeat = 1;
@@ -1266,15 +1363,48 @@ parse_bsd_args(void)
        if (PE_parse_boot_argn("-no_sigsys", namep, sizeof(namep))) {
                send_sigsys = false;
        }
-#endif
 
-#if (DEVELOPMENT || DEBUG)
        if (PE_parse_boot_argn("alt-dyld", dyld_alt_path, sizeof(dyld_alt_path))) {
                if (strlen(dyld_alt_path) > 0) {
                        use_alt_dyld = 1;
                }
        }
-#endif
+       PE_parse_boot_argn("dyld_flags", &dyld_flags, sizeof(dyld_flags));
+
+       if (PE_parse_boot_argn("-disable_syscallfilter", &namep, sizeof(namep))) {
+               syscallfilter_disable = 1;
+       }
+
+#if __arm64__
+       if (PE_parse_boot_argn("legacy_footprint_entitlement_mode", &legacy_footprint_entitlement_mode, sizeof(legacy_footprint_entitlement_mode))) {
+               /*
+                * legacy_footprint_entitlement_mode specifies the behavior we want associated
+                * with the entitlement. The supported modes are:
+                *
+                * LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE:
+                *      Indicates that we want every process to have the memory accounting
+                *      that is available in iOS 12.0 and beyond.
+                *
+                * LEGACY_FOOTPRINT_ENTITLEMENT_IOS11_ACCT:
+                *      Indicates that for every process that has the 'legacy footprint entitlement',
+                *      we want to give it the old iOS 11.0 accounting behavior which accounted some
+                *      of the process's memory to the kernel.
+                *
+                * LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE:
+                *      Indicates that for every process that has the 'legacy footprint entitlement',
+                *      we want it to have a higher memory limit which will help it acclimate to the
+                *      iOS 12.0 (& beyond) accounting behavior that does the right accounting.
+                *      The bonus added to the system-wide task limit to calculate this higher memory limit
+                *      is available in legacy_footprint_bonus_mb.
+                */
+
+               if (legacy_footprint_entitlement_mode < LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE ||
+                   legacy_footprint_entitlement_mode > LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE) {
+                       legacy_footprint_entitlement_mode = LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE;
+               }
+       }
+#endif /* __arm64__ */
+#endif /* DEVELOPMENT || DEBUG */
 }
 
 void
@@ -1304,7 +1434,7 @@ bsd_exec_setup(int scale)
        bsd_pageable_map_size = (bsd_simul_execs * BSD_PAGEABLE_SIZE_PER_EXEC);
 }
 
-#if !NFSCLIENT
+#if !CONFIG_NETBOOT
 int
 netboot_root(void);
 
index fb33955de5cac4019429234a6d7c7df6bd2788e0..f738345987418d3c0695399828f2186967cc7331 100644 (file)
@@ -348,16 +348,21 @@ cdevsw_setkqueueok(int maj, struct cdevsw * csw, int extra_flags)
 int
 bsd_hostname(char * buf, int bufsize, int * len)
 {
+       int ret, hnlen;
        /*
-        * "hostname" is null-terminated, and "hostnamelen" is equivalent to strlen(hostname).
+        * "hostname" is null-terminated
         */
-       if (hostnamelen < bufsize) {
+       lck_mtx_lock(&hostname_lock);
+       hnlen = strlen(hostname);
+       if (hnlen < bufsize) {
                strlcpy(buf, hostname, bufsize);
-               *len = hostnamelen;
-               return 0;
+               *len = hnlen;
+               ret = 0;
        } else {
-               return ENAMETOOLONG;
+               ret = ENAMETOOLONG;
        }
+       lck_mtx_unlock(&hostname_lock);
+       return ret;
 }
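
bsd_hostname() now computes the length and copies out under hostname_lock instead of trusting a separately maintained hostnamelen. A minimal userspace sketch of the same read-under-lock pattern, assuming pthreads and the BSD strlcpy (all names here are illustrative):

    #include <errno.h>
    #include <pthread.h>
    #include <string.h>

    static pthread_mutex_t name_lock = PTHREAD_MUTEX_INITIALIZER;
    static char name[256] = "example-host";   /* hypothetical shared state */

    static int get_name(char *buf, size_t bufsize, size_t *len) {
        int ret = 0;
        pthread_mutex_lock(&name_lock);
        size_t n = strlen(name);              /* length computed under the lock */
        if (n < bufsize) {
            strlcpy(buf, name, bufsize);
            *len = n;
        } else {
            ret = ENAMETOOLONG;
        }
        pthread_mutex_unlock(&name_lock);
        return ret;
    }
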
 
 void
diff --git a/bsd/kern/chunklist.c b/bsd/kern/chunklist.c
new file mode 100644 (file)
index 0000000..ed93a2f
--- /dev/null
@@ -0,0 +1,676 @@
+/*
+ * Copyright (c) 2019 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/proc_internal.h>
+#include <sys/systm.h>
+#include <sys/mount_internal.h>
+#include <sys/filedesc.h>
+#include <sys/vnode_internal.h>
+#include <sys/imageboot.h>
+#include <kern/assert.h>
+
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/sysproto.h>
+#include <sys/csr.h>
+#include <miscfs/devfs/devfsdefs.h>
+#include <libkern/crypto/sha2.h>
+#include <libkern/crypto/rsa.h>
+#include <libkern/OSKextLibPrivate.h>
+
+#include <kern/chunklist.h>
+#include <kern/kalloc.h>
+
+#include <pexpert/pexpert.h>
+
+extern int read_file(const char *path, void **bufp, size_t *bufszp); /* implemented in imageboot.c */
+extern vnode_t imgboot_get_image_file(const char *path, off_t *fsize, int *errp); /* implemented in imageboot.c */
+
+#define AUTHDBG(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0)
+#define AUTHPRNT(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0)
+#define kfree_safe(x) do { if ((x)) { kfree_addr((x)); (x) = NULL; } } while (0)
+
+static const char *libkern_path = "/System/Library/Extensions/System.kext/PlugIns/Libkern.kext/Libkern";
+static const char *libkern_bundle = "com.apple.kpi.libkern";
+
+/*
+ * Rev1 chunklist handling
+ */
+const struct chunklist_pubkey rev1_chunklist_pubkeys[] = {
+};
+const size_t rev1_chunklist_num_pubkeys = sizeof(rev1_chunklist_pubkeys) / sizeof(rev1_chunklist_pubkeys[0]);
+
+static void
+key_byteswap(void *_dst, const void *_src, size_t len)
+{
+       uint32_t *dst __attribute__((align_value(1))) = _dst;
+       const uint32_t *src __attribute__((align_value(1))) = _src;
+
+       assert(len % sizeof(uint32_t) == 0);
+
+       len = len / sizeof(uint32_t);
+       for (size_t i = 0; i < len; i++) {
+               dst[len - i - 1] = OSSwapInt32(src[i]);
+       }
+}
+
+static int
+construct_chunklist_path(const char *root_path, char **bufp)
+{
+       int err = 0;
+       char *path = NULL;
+       size_t len = 0;
+
+       path = kalloc(MAXPATHLEN);
+       if (path == NULL) {
+               AUTHPRNT("failed to allocate space for chunklist path");
+               err = ENOMEM;
+               goto out;
+       }
+
+       len = strnlen(root_path, MAXPATHLEN);
+       if (len < MAXPATHLEN && len > strlen(".dmg")) {
+               /* correctly terminated string with space for extension */
+       } else {
+               AUTHPRNT("malformed root path");
+               err = EOVERFLOW;
+               goto out;
+       }
+
+       len = strlcpy(path, root_path, MAXPATHLEN);
+       if (len >= MAXPATHLEN) {
+               AUTHPRNT("root path is too long");
+               err = EOVERFLOW;
+               goto out;
+       }
+
+       path[len - strlen(".dmg")] = '\0';
+       len = strlcat(path, ".chunklist", MAXPATHLEN);
+       if (len >= MAXPATHLEN) {
+               AUTHPRNT("chunklist path is too long");
+               err = EOVERFLOW;
+               goto out;
+       }
+
+out:
+       if (err) {
+               kfree_safe(path);
+       } else {
+               *bufp = path;
+       }
+       return err;
+}
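
The transformation is purely textual: the ".dmg" suffix is cut off and ".chunklist" appended, with every step length-checked against MAXPATHLEN. A userspace sketch of the same rewrite (the input path is hypothetical):

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        char path[1024];
        const char *root_path = "/private/var/root.dmg";

        strlcpy(path, root_path, sizeof(path));
        path[strlen(path) - strlen(".dmg")] = '\0';   /* drop ".dmg" */
        strlcat(path, ".chunklist", sizeof(path));    /* add extension */
        printf("%s\n", path);  /* prints /private/var/root.chunklist */
        return 0;
    }
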
+
+static int
+validate_signature(const uint8_t *key_msb, size_t keylen, uint8_t *sig_msb, size_t siglen, uint8_t *digest)
+{
+       int err = 0;
+       bool sig_valid = false;
+       uint8_t *sig = NULL;
+
+       const uint8_t exponent[] = { 0x01, 0x00, 0x01 };
+       uint8_t *modulus = kalloc(keylen);
+       rsa_pub_ctx *rsa_ctx = kalloc(sizeof(rsa_pub_ctx));
+       sig = kalloc(siglen);
+
+       if (modulus == NULL || rsa_ctx == NULL || sig == NULL) {
+               err = ENOMEM;
+               goto out;
+       }
+
+       bzero(rsa_ctx, sizeof(rsa_pub_ctx));
+       key_byteswap(modulus, key_msb, keylen);
+       key_byteswap(sig, sig_msb, siglen);
+
+       err = rsa_make_pub(rsa_ctx,
+           sizeof(exponent), exponent,
+           CHUNKLIST_PUBKEY_LEN, modulus);
+       if (err) {
+               AUTHPRNT("rsa_make_pub() failed");
+               goto out;
+       }
+
+       err = rsa_verify_pkcs1v15(rsa_ctx, CC_DIGEST_OID_SHA256,
+           SHA256_DIGEST_LENGTH, digest,
+           siglen, sig,
+           &sig_valid);
+       if (err) {
+               sig_valid = false;
+               AUTHPRNT("rsa_verify() failed");
+               goto out;
+       }
+
+out:
+       kfree_safe(sig);
+       kfree_safe(rsa_ctx);
+       kfree_safe(modulus);
+
+       if (err) {
+               return err;
+       } else if (sig_valid == true) {
+               return 0; /* success */
+       } else {
+               return EAUTH;
+       }
+}
+
+static int
+validate_root_image(const char *root_path, void *chunklist)
+{
+       int err = 0;
+       struct chunklist_hdr *hdr = chunklist;
+       struct chunklist_chunk *chk = NULL;
+       size_t ch = 0;
+       struct vnode *vp = NULL;
+       off_t fsize = 0;
+       off_t offset = 0;
+       bool doclose = false;
+       size_t bufsz = 0;
+       void *buf = NULL;
+
+       vfs_context_t ctx = vfs_context_kernel();
+       kauth_cred_t kerncred = vfs_context_ucred(ctx);
+       proc_t p = vfs_context_proc(ctx);
+
+       AUTHDBG("validating root dmg %s", root_path);
+
+       vp = imgboot_get_image_file(root_path, &fsize, &err);
+       if (vp == NULL) {
+               goto out;
+       }
+
+       if ((err = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
+               AUTHPRNT("failed to open vnode");
+               goto out;
+       }
+       doclose = true;
+
+       /*
+        * Iterate the chunk list and check each chunk
+        */
+       chk = chunklist + hdr->cl_chunk_offset;
+       for (ch = 0; ch < hdr->cl_chunk_count; ch++) {
+               int resid = 0;
+
+               if (!buf) {
+                       /* allocate buffer based on first chunk size */
+                       buf = kalloc(chk->chunk_size);
+                       if (buf == NULL) {
+                               err = ENOMEM;
+                               goto out;
+                       }
+                       bufsz = chk->chunk_size;
+               }
+
+               if (chk->chunk_size > bufsz) {
+                       AUTHPRNT("chunk size too big");
+                       err = EINVAL;
+                       goto out;
+               }
+
+               err = vn_rdwr(UIO_READ, vp, (caddr_t)buf, chk->chunk_size, offset, UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p);
+               if (err) {
+                       AUTHPRNT("vn_rdwr fail (err = %d, resid = %d)", err, resid);
+                       goto out;
+               }
+               if (resid) {
+                       err = EINVAL;
+                       AUTHPRNT("chunk covered non-existent part of image");
+                       goto out;
+               }
+
+               /* calculate the SHA256 of this chunk */
+               uint8_t sha_digest[SHA256_DIGEST_LENGTH];
+               SHA256_CTX sha_ctx;
+               SHA256_Init(&sha_ctx);
+               SHA256_Update(&sha_ctx, buf, chk->chunk_size);
+               SHA256_Final(sha_digest, &sha_ctx);
+
+               /* Check the calculated SHA matches the chunk list */
+               if (bcmp(sha_digest, chk->chunk_sha256, SHA256_DIGEST_LENGTH) != 0) {
+                       AUTHPRNT("SHA mismatch on chunk %lu (offset %lld, size %u)", ch, offset, chk->chunk_size);
+                       err = EINVAL;
+                       goto out;
+               }
+
+               if (os_add_overflow(offset, chk->chunk_size, &offset)) {
+                       err = EINVAL;
+                       goto out;
+               }
+               chk++;
+       }
+
+       if (offset != fsize) {
+               AUTHPRNT("chunklist did not cover entire file (offset = %lld, fsize = %lld)", offset, fsize);
+               err = EINVAL;
+               goto out;
+       }
+
+out:
+       kfree_safe(buf);
+       if (doclose) {
+               VNOP_CLOSE(vp, FREAD, ctx);
+       }
+       if (vp) {
+               vnode_put(vp);
+               vp = NULL;
+       }
+
+       return err;
+}
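
Each chunk is read at its accumulated offset, hashed with SHA-256, and compared against the digest recorded in the chunklist; the offsets must sum (overflow-checked) to exactly the file size. A hedged userspace sketch of the per-chunk check using CommonCrypto on macOS, assuming the chunk metadata was already parsed out of a chunklist:

    #include <CommonCrypto/CommonDigest.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct chunk { uint32_t size; uint8_t sha256[CC_SHA256_DIGEST_LENGTH]; };

    /* Returns 0 if the next chk->size bytes of img hash to chk->sha256. */
    static int verify_chunk(FILE *img, const struct chunk *chk) {
        uint8_t buf[64 * 1024], digest[CC_SHA256_DIGEST_LENGTH];
        CC_SHA256_CTX ctx;
        CC_SHA256_Init(&ctx);
        for (uint32_t left = chk->size; left > 0; ) {
            size_t want = left < sizeof(buf) ? left : sizeof(buf);
            size_t n = fread(buf, 1, want, img);
            if (n == 0)
                return -1;               /* chunk runs past end of image */
            CC_SHA256_Update(&ctx, buf, (CC_LONG)n);
            left -= (uint32_t)n;
        }
        CC_SHA256_Final(digest, &ctx);
        return memcmp(digest, chk->sha256, sizeof(digest)) == 0 ? 0 : -1;
    }
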
+
+static const uuid_t *
+getuuidfromheader_safe(const void *buf, size_t bufsz, size_t *uuidsz)
+{
+       const struct uuid_command *cmd = NULL;
+       const kernel_mach_header_t *mh = buf;
+
+       /* space for the header and at least one load command? */
+       if (bufsz < sizeof(kernel_mach_header_t) + sizeof(struct uuid_command)) {
+               AUTHPRNT("libkern image too small");
+               return NULL;
+       }
+
+       /* validate the mach header */
+       if (mh->magic != MH_MAGIC_64 || (mh->sizeofcmds > bufsz - sizeof(kernel_mach_header_t))) {
+               AUTHPRNT("invalid MachO header");
+               return NULL;
+       }
+
+       /* iterate the load commands */
+       size_t offset = sizeof(kernel_mach_header_t);
+       for (size_t i = 0; i < mh->ncmds; i++) {
+               cmd = buf + offset;
+
+               if (cmd->cmd == LC_UUID) {
+                       *uuidsz = sizeof(cmd->uuid);
+                       return &cmd->uuid;
+               }
+
+               if (os_add_overflow(cmd->cmdsize, offset, &offset) ||
+                   offset > bufsz - sizeof(struct uuid_command)) {
+                       return NULL;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * Rev2 chunklist handling
+ */
+const struct chunklist_pubkey rev2_chunklist_pubkeys[] = {
+};
+const size_t rev2_chunklist_num_pubkeys = sizeof(rev2_chunklist_pubkeys) / sizeof(rev2_chunklist_pubkeys[0]);
+
+static const struct efi_guid_t gEfiSignAppleCertTypeGuid = CHUNKLIST_REV2_SIG_HASH_GUID;
+static const struct efi_guid_t gEfiSignCertTypeRsa2048Sha256Guid = EFI_CERT_TYPE_RSA2048_SHA256;
+
+static boolean_t
+validate_rev2_certificate(struct rev2_chunklist_certificate *certificate)
+{
+       /* Default value of current security epoch MUST be CHUNKLIST_MIN_SECURITY_EPOCH */
+       uint8_t current_security_epoch = CHUNKLIST_MIN_SECURITY_EPOCH;
+
+       /* Certificate.Length must be equal to sizeof(CERTIFICATE) */
+       if (certificate->length != sizeof(struct rev2_chunklist_certificate)) {
+               AUTHDBG("invalid certificate length");
+               return FALSE;
+       }
+
+       /* Certificate.Revision MUST be equal to 2 */
+       if (certificate->revision != 2) {
+               AUTHDBG("invalid certificate revision");
+               return FALSE;
+       }
+
+       /* Certificate.SecurityEpoch MUST be current or higher */
+       if (PE_parse_boot_argn(CHUNKLIST_SECURITY_EPOCH, &current_security_epoch, sizeof(current_security_epoch)) &&
+           certificate->security_epoch < current_security_epoch) {
+               AUTHDBG("invalid certificate security epoch");
+               return FALSE;
+       }
+
+       /* Certificate.CertificateType MUST be equal to WIN_CERT_TYPE_EFI_GUID (0x0EF1) */
+       if (certificate->certificate_type != WIN_CERT_TYPE_EFI_GUID) {
+               AUTHDBG("invalid certificate type");
+               return FALSE;
+       }
+
+       /* Certificate.CertificateGuid MUST be equal to 45E7BC51-913C-42AC-96A2-10712FFBEBA7 */
+       if (0 != memcmp(&certificate->certificate_guid, &gEfiSignAppleCertTypeGuid, sizeof(struct efi_guid_t))) {
+               AUTHDBG("invalid certificate GUID");
+               return FALSE;
+       }
+
+       /* Certificate.HashTypeGuid MUST be equal to A7717414-C616-4977-9420-844712A735BF */
+       if (0 != memcmp(&certificate->hash_type_guid, &gEfiSignCertTypeRsa2048Sha256Guid, sizeof(struct efi_guid_t))) {
+               AUTHDBG("invalid hash type GUID");
+               return FALSE;
+       }
+
+       return TRUE;
+}
+
+static int
+validate_rev2_chunklist(uint8_t *buffer, size_t buffer_size)
+{
+       struct rev2_chunklist_certificate *certificate;
+       size_t security_data_offset;
+
+       /* Check input parameters to be sane */
+       if (buffer == NULL || buffer_size == 0) {
+               AUTHDBG("invalid parameter");
+               return EINVAL;
+       }
+
+       /* Check for existing signature */
+       if (buffer_size < sizeof(struct rev2_chunklist_certificate)) {
+               AUTHDBG("no space for certificate");
+               return EINVAL;
+       }
+
+       security_data_offset = buffer_size - sizeof(struct rev2_chunklist_certificate);
+       certificate = (struct rev2_chunklist_certificate*)(buffer + security_data_offset);
+
+       /* Check signature candidate to be a valid rev2 chunklist certificate */
+       if (TRUE != validate_rev2_certificate(certificate)) {
+               return EINVAL;
+       }
+
+       /* Check public key to be trusted */
+       for (size_t i = 0; i < rev2_chunklist_num_pubkeys; i++) {
+               const struct chunklist_pubkey *key = &rev2_chunklist_pubkeys[i];
+               /* Production keys are always trusted */
+               if (key->is_production != TRUE) {
+                       uint8_t no_rev2_dev = 0;
+                       /* Do not trust rev2 development keys if CHUNKLIST_NO_REV2_DEV is present */
+                       if (PE_parse_boot_argn(CHUNKLIST_NO_REV2_DEV, &no_rev2_dev, sizeof(no_rev2_dev))) {
+                               AUTHDBG("rev2 development key is not trusted");
+                               continue;
+                       }
+               }
+
+               /* Check certificate public key to be the trusted one */
+               if (0 == memcmp(key->key, certificate->rsa_public_key, sizeof(certificate->rsa_public_key))) {
+                       AUTHDBG("certificate public key is trusted");
+
+                       /* Hash everything but signature */
+                       SHA256_CTX hash_ctx;
+                       SHA256_Init(&hash_ctx);
+                       SHA256_Update(&hash_ctx, buffer, security_data_offset);
+
+                       /* Include Certificate.SecurityEpoch value */
+                       SHA256_Update(&hash_ctx, &certificate->security_epoch, sizeof(certificate->security_epoch));
+
+                       /* Finalize hashing into the output buffer */
+                       uint8_t sha_digest[SHA256_DIGEST_LENGTH];
+                       SHA256_Final(sha_digest, &hash_ctx);
+
+                       /* Validate signature */
+                       return validate_signature(certificate->rsa_public_key,
+                                  sizeof(certificate->rsa_public_key),
+                                  certificate->rsa_signature,
+                                  sizeof(certificate->rsa_signature),
+                                  sha_digest);
+               }
+       }
+
+       AUTHDBG("certificate public key is not trusted");
+       return EINVAL;
+}
+
+/*
+ * Main chunklist validation routine
+ */
+static int
+validate_chunklist(void *buf, size_t len)
+{
+       int err = 0;
+       size_t sigsz = 0;
+       size_t sig_end = 0;
+       size_t chunks_end = 0;
+       size_t sig_len = 0;
+       boolean_t valid_sig = FALSE;
+       struct chunklist_hdr *hdr = buf;
+
+       if (len < sizeof(struct chunklist_hdr)) {
+               AUTHPRNT("no space for header");
+               return EINVAL;
+       }
+
+       /* recognized file format? */
+       if (hdr->cl_magic != CHUNKLIST_MAGIC ||
+           hdr->cl_file_ver != CHUNKLIST_FILE_VERSION_10 ||
+           hdr->cl_chunk_method != CHUNKLIST_CHUNK_METHOD_10) {
+               AUTHPRNT("unrecognized chunklist format");
+               return EINVAL;
+       }
+
+       /* determine signature length based on signature method */
+       if (hdr->cl_sig_method == CHUNKLIST_SIGNATURE_METHOD_REV1) {
+               AUTHPRNT("rev1 chunklist");
+               sig_len = CHUNKLIST_REV1_SIG_LEN;
+       } else if (hdr->cl_sig_method == CHUNKLIST_SIGNATURE_METHOD_REV2) {
+               AUTHPRNT("rev2 chunklist");
+               sig_len = CHUNKLIST_REV2_SIG_LEN;
+       } else {
+               AUTHPRNT("unrecognized chunklist signature method");
+               return EINVAL;
+       }
+
+       /* does the chunk list fall within the bounds of the buffer? */
+       if (os_mul_and_add_overflow(hdr->cl_chunk_count, sizeof(struct chunklist_chunk), hdr->cl_chunk_offset, &chunks_end) ||
+           hdr->cl_chunk_offset < sizeof(struct chunklist_hdr) || chunks_end > len) {
+               AUTHPRNT("invalid chunk_count (%llu) or chunk_offset (%llu)",
+                   hdr->cl_chunk_count, hdr->cl_chunk_offset);
+               return EINVAL;
+       }
+
+       /* does the signature fall within the bounds of the buffer? */
+       if (os_add_overflow(hdr->cl_sig_offset, sig_len, &sig_end) ||
+           hdr->cl_sig_offset < sizeof(struct chunklist_hdr) ||
+           hdr->cl_sig_offset < chunks_end ||
+           hdr->cl_sig_offset > len) {
+               AUTHPRNT("invalid signature offset (%llu)", hdr->cl_sig_offset);
+               return EINVAL;
+       }
+
+       if (sig_end > len ||
+           os_sub_overflow(len, hdr->cl_sig_offset, &sigsz) ||
+           sigsz != sig_len) {
+               /* missing or incorrect signature size */
+               return EINVAL;
+       }
+
+       /* validate rev1 chunklist */
+       if (hdr->cl_sig_method == CHUNKLIST_SIGNATURE_METHOD_REV1) {
+               /* Do not trust rev1 chunklists if CHUNKLIST_NO_REV1 is present */
+               uint8_t no_rev1;
+               if (PE_parse_boot_argn(CHUNKLIST_NO_REV1, &no_rev1, sizeof(no_rev1))) {
+                       AUTHDBG("rev1 chunklists are not trusted");
+                       return EINVAL;
+               }
+
+               /* hash the chunklist (excluding the signature) */
+               AUTHDBG("hashing rev1 chunklist");
+               uint8_t sha_digest[SHA256_DIGEST_LENGTH];
+               SHA256_CTX sha_ctx;
+               SHA256_Init(&sha_ctx);
+               SHA256_Update(&sha_ctx, buf, hdr->cl_sig_offset);
+               SHA256_Final(sha_digest, &sha_ctx);
+
+               AUTHDBG("validating rev1 chunklist signature against rev1 pub keys");
+               for (size_t i = 0; i < rev1_chunklist_num_pubkeys; i++) {
+                       const struct chunklist_pubkey *key = &rev1_chunklist_pubkeys[i];
+                       err = validate_signature(key->key, CHUNKLIST_PUBKEY_LEN, buf + hdr->cl_sig_offset, CHUNKLIST_SIGNATURE_LEN, sha_digest);
+                       if (err == 0) {
+                               AUTHDBG("validated rev1 chunklist signature with rev1 key %lu (prod=%d)", i, key->is_production);
+                               valid_sig = key->is_production;
+#if IMAGEBOOT_ALLOW_DEVKEYS
+                               if (!key->is_production) {
+                                       /* allow dev keys in dev builds only */
+                                       AUTHDBG("*** allowing DEV rev1 key: this will fail in customer builds ***");
+                                       valid_sig = TRUE;
+                               }
+#endif
+                               goto out;
+                       }
+               }
+
+               /* At this point we tried all the keys: nothing went wrong but none of them
+                * signed our chunklist. */
+               AUTHPRNT("rev1 signature did not verify against any known rev1 public key");
+       } else if (hdr->cl_sig_method == CHUNKLIST_SIGNATURE_METHOD_REV2) {
+               AUTHDBG("validating rev2 chunklist signature against rev2 pub keys");
+               err = validate_rev2_chunklist(buf, len);
+               if (err) {
+                       goto out;
+               }
+               valid_sig = TRUE;
+       }
+
+out:
+       if (err) {
+               return err;
+       } else if (valid_sig == TRUE) {
+               return 0; /* signed, and everything checked out */
+       } else {
+               return EINVAL;
+       }
+}
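
The bounds checks above use overflow-checked arithmetic rather than raw additions, so a hostile header cannot wrap chunks_end or sig_end around the address space. The os_*_overflow macros wrap compiler builtins; a minimal sketch of the equivalent check in plain C:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Is [offset, offset + count * entry_size) inside a buffer of length len? */
    static bool chunks_fit(uint64_t count, size_t entry_size,
                           uint64_t offset, size_t len) {
        uint64_t bytes, end;
        if (__builtin_mul_overflow(count, entry_size, &bytes))
            return false;                 /* count * entry_size wrapped */
        if (__builtin_add_overflow(bytes, offset, &end))
            return false;                 /* adding the offset wrapped */
        return end <= len;
    }
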
+
+/*
+ * Authenticate a given DMG file using chunklist
+ */
+int
+authenticate_root_with_chunklist(const char *root_path)
+{
+       char *chunklist_path = NULL;
+       void *chunklist_buf = NULL;
+       size_t chunklist_len = 32 * 1024 * 1024UL;
+       int err = 0;
+
+       err = construct_chunklist_path(root_path, &chunklist_path);
+       if (err) {
+               AUTHPRNT("failed creating chunklist path");
+               goto out;
+       }
+
+       AUTHDBG("validating root against chunklist %s", chunklist_path);
+
+       /*
+        * Read and authenticate the chunklist, then validate the root image against
+        * the chunklist.
+        */
+       AUTHDBG("reading chunklist");
+       err = read_file(chunklist_path, &chunklist_buf, &chunklist_len);
+       if (err) {
+               AUTHPRNT("failed to read chunklist");
+               goto out;
+       }
+
+       AUTHDBG("validating chunklist");
+       err = validate_chunklist(chunklist_buf, chunklist_len);
+       if (err) {
+               AUTHPRNT("failed to validate chunklist");
+               goto out;
+       }
+       AUTHDBG("successfully validated chunklist");
+
+       AUTHDBG("validating root image against chunklist");
+       err = validate_root_image(root_path, chunklist_buf);
+       if (err) {
+               AUTHPRNT("failed to validate root image against chunklist (%d)", err);
+               goto out;
+       }
+
+       /* everything checked out - go ahead and mount this */
+       AUTHDBG("root image authenticated");
+
+out:
+       kfree_safe(chunklist_buf);
+       kfree_safe(chunklist_path);
+       return err;
+}
+
+/*
+ * Check that the UUID of the libkern currently loaded matches the one on disk.
+ */
+int
+authenticate_root_version_check(void)
+{
+       int err = 0;
+       void *buf = NULL;
+       size_t bufsz = 4 * 1024 * 1024UL;
+
+       /* get the UUID of the libkern in /S/L/E */
+       err = read_file(libkern_path, &buf, &bufsz);
+       if (err) {
+               goto out;
+       }
+
+       unsigned long uuidsz = 0;
+       const uuid_t *img_uuid = getuuidfromheader_safe(buf, bufsz, &uuidsz);
+       if (img_uuid == NULL || uuidsz != sizeof(uuid_t)) {
+               AUTHPRNT("invalid UUID (sz = %lu)", uuidsz);
+               err = EINVAL;
+               goto out;
+       }
+
+       /* Get the UUID of the loaded libkern */
+       uuid_t live_uuid;
+       err = OSKextGetUUIDForName(libkern_bundle, live_uuid);
+       if (err) {
+               AUTHPRNT("could not find loaded libkern");
+               goto out;
+       }
+
+       /* ... and compare them */
+       if (bcmp(live_uuid, img_uuid, uuidsz) != 0) {
+               AUTHPRNT("UUID of running libkern does not match %s", libkern_path);
+
+               uuid_string_t img_uuid_str, live_uuid_str;
+               uuid_unparse(*img_uuid, img_uuid_str);
+               uuid_unparse(live_uuid, live_uuid_str);
+               AUTHPRNT("loaded libkern UUID =  %s", live_uuid_str);
+               AUTHPRNT("on-disk libkern UUID = %s", img_uuid_str);
+
+               err = EINVAL;
+               goto out;
+       }
+
+       /* UUID matches! */
+out:
+       kfree_safe(buf);
+       return err;
+}
index b4fe59d01000184d17d060f8e1261751a1faa411..7a1042e75c755854d0ee7e1d72132dba68309026 100644 (file)
@@ -1,19 +1,60 @@
 #ifndef _CHUNKLIST_H
 #define _CHUNKLIST_H
 
-
 #include <libkern/crypto/sha2.h>
 
+/*
+ * Boot argument for disabling trust in rev2 development key(s)
+ * Set by boot.efi
+ */
+#define CHUNKLIST_NO_REV2_DEV           "-chunklist-no-rev2-dev"
+
+/*
+ * Boot argument for disabling trust in rev1 chunklists
+ * Set by boot.efi
+ */
+#define CHUNKLIST_NO_REV1               "-chunklist-no-rev1"
+
+/*
+ * Boot argument for obtaining current security epoch
+ * Set by boot.efi
+ */
+#define CHUNKLIST_SECURITY_EPOCH        "chunklist-security-epoch"
+#define CHUNKLIST_MIN_SECURITY_EPOCH    0
+
 /*
  * Chunklist file format
  */
+#define CHUNKLIST_MAGIC                 0x4C4B4E43
+#define CHUNKLIST_FILE_VERSION_10       1
+#define CHUNKLIST_CHUNK_METHOD_10       1
+#define CHUNKLIST_SIGNATURE_METHOD_REV1 1
+#define CHUNKLIST_SIGNATURE_METHOD_REV2 3
+#define CHUNKLIST_REV1_SIG_LEN          256
+#define CHUNKLIST_REV2_SIG_LEN          808
+#define CHUNKLIST_PUBKEY_LEN            (2048/8)
+#define CHUNKLIST_SIGNATURE_LEN         (2048/8)
+
+struct efi_guid_t {
+       uint32_t data1;
+       uint16_t data2;
+       uint16_t data3;
+       uint8_t  data4[8];
+} __attribute__((packed));
 
-#define CHUNKLIST_MAGIC               0x4C4B4E43
-#define CHUNKLIST_FILE_VERSION_10     1
-#define CHUNKLIST_CHUNK_METHOD_10     1
-#define CHUNKLIST_SIGNATURE_METHOD_10 1
-#define CHUNKLIST_SIG_LEN             256
-#define CHUNKLIST_PUBKEY_LEN          (2048/8)
+// 45E7BC51-913C-42AC-96A2-10712FFBEBA7
+#define CHUNKLIST_REV2_SIG_HASH_GUID \
+{ \
+    0x45E7BC51, 0x913C, 0x42AC, { 0x96, 0xA2, 0x10, 0x71, 0x2F, 0xFB, 0xEB, 0xA7 } \
+};
+
+// A7717414-C616-4977-9420-844712A735BF
+#define EFI_CERT_TYPE_RSA2048_SHA256 \
+{ \
+    0xa7717414, 0xc616, 0x4977, { 0x94, 0x20, 0x84, 0x47, 0x12, 0xa7, 0x35, 0xbf } \
+}
+
+#define WIN_CERT_TYPE_EFI_GUID 0x0EF1
 
 struct chunklist_hdr {
        uint32_t cl_magic;
@@ -32,23 +73,22 @@ struct chunklist_chunk {
        uint8_t  chunk_sha256[SHA256_DIGEST_LENGTH];
 } __attribute__((packed));
 
-struct chunklist_sig {
-       uint8_t  cl_sig[CHUNKLIST_SIG_LEN];
-};
-
-
-/*
- * Chunklist signing public keys
- */
+struct rev2_chunklist_certificate {
+       uint32_t length;
+       uint8_t  revision;
+       uint8_t  security_epoch;
+       uint16_t certificate_type;
+       guid_t   certificate_guid;
+       guid_t   hash_type_guid;
+       uint8_t  rsa_public_key[CHUNKLIST_PUBKEY_LEN];
+       uint8_t  rsa_signature[CHUNKLIST_SIGNATURE_LEN];
+} __attribute__((packed));
 
 struct chunklist_pubkey {
-       const bool isprod;
+       const boolean_t is_production;
        const uint8_t key[CHUNKLIST_PUBKEY_LEN];
 };
 
-const struct chunklist_pubkey chunklist_pubkeys[] = {
-};
-
-#define CHUNKLIST_NPUBKEYS (sizeof(chunklist_pubkeys)/sizeof(chunklist_pubkeys[0]))
-
-#endif
+int authenticate_root_with_chunklist(const char *root_path);
+int authenticate_root_version_check(void);
+#endif /* _CHUNKLIST_H */
index 84866b9e2f8d59f027b53b12c41b0f03b37194b3..fd532f1007667f78707cc61881c593004bd0fa64 100644 (file)
@@ -38,6 +38,8 @@ UNUSED_SYMBOL(decmpfs_read_compressed)
 UNUSED_SYMBOL(decmpfs_cnode_cmp_type)
 UNUSED_SYMBOL(decmpfs_cnode_get_vnode_state)
 UNUSED_SYMBOL(decmpfs_cnode_get_vnode_cached_size)
+UNUSED_SYMBOL(decmpfs_cnode_get_vnode_cached_nchildren)
+UNUSED_SYMBOL(decmpfs_cnode_get_vnode_cached_total_size)
 UNUSED_SYMBOL(decmpfs_lock_compressed_data)
 UNUSED_SYMBOL(decmpfs_cnode_free)
 UNUSED_SYMBOL(decmpfs_cnode_alloc)
@@ -457,7 +459,19 @@ decmpfs_cnode_get_vnode_cached_size(decmpfs_cnode *cp)
        return cp->uncompressed_size;
 }
 
-static void
+uint64_t
+decmpfs_cnode_get_vnode_cached_nchildren(decmpfs_cnode *cp)
+{
+       return cp->nchildren;
+}
+
+uint64_t
+decmpfs_cnode_get_vnode_cached_total_size(decmpfs_cnode *cp)
+{
+       return cp->total_size;
+}
+
+void
 decmpfs_cnode_set_vnode_cached_size(decmpfs_cnode *cp, uint64_t size)
 {
        while (1) {
@@ -470,6 +484,32 @@ decmpfs_cnode_set_vnode_cached_size(decmpfs_cnode *cp, uint64_t size)
        }
 }
 
+void
+decmpfs_cnode_set_vnode_cached_nchildren(decmpfs_cnode *cp, uint64_t nchildren)
+{
+       while (1) {
+               uint64_t old = cp->nchildren;
+               if (OSCompareAndSwap64(old, nchildren, (UInt64*)&cp->nchildren)) {
+                       return;
+               } else {
+                       /* failed to write our value, so loop */
+               }
+       }
+}
+
+void
+decmpfs_cnode_set_vnode_cached_total_size(decmpfs_cnode *cp, uint64_t total_sz)
+{
+       while (1) {
+               uint64_t old = cp->total_size;
+               if (OSCompareAndSwap64(old, total_sz, (UInt64*)&cp->total_size)) {
+                       return;
+               } else {
+                       /* failed to write our value, so loop */
+               }
+       }
+}
+
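
The new setters reuse the existing compare-and-swap retry loop: read the current value, attempt the swap, and loop until our value lands. The same pattern expressed with C11 atomics, as a standalone sketch:

    #include <stdatomic.h>
    #include <stdint.h>

    /* Store val into *p via CAS, retrying on contention. */
    static void cas_store_u64(_Atomic uint64_t *p, uint64_t val) {
        uint64_t old = atomic_load(p);
        while (!atomic_compare_exchange_weak(p, &old, val)) {
            /* on failure, old is refreshed with the current value; retry */
        }
    }
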
 static uint64_t
 decmpfs_cnode_get_decompression_flags(decmpfs_cnode *cp)
 {
@@ -539,7 +579,19 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **
                hdr->attr_size = sizeof(decmpfs_disk_header);
                hdr->compression_magic = DECMPFS_MAGIC;
                hdr->compression_type  = cp->cmp_type;
-               hdr->uncompressed_size = decmpfs_cnode_get_vnode_cached_size(cp);
+               if (hdr->compression_type == DATALESS_PKG_CMPFS_TYPE) {
+                       if (!vnode_isdir(vp)) {
+                               err = EINVAL;
+                               goto out;
+                       }
+                       hdr->_size.value = DECMPFS_PKG_VALUE_FROM_SIZE_COUNT(
+                               decmpfs_cnode_get_vnode_cached_size(cp),
+                               decmpfs_cnode_get_vnode_cached_nchildren(cp));
+               } else if (vnode_isdir(vp)) {
+                       hdr->_size.value = decmpfs_cnode_get_vnode_cached_nchildren(cp);
+               } else {
+                       hdr->_size.value = decmpfs_cnode_get_vnode_cached_size(cp);
+               }
        } else {
                /* figure out how big the xattr is on disk */
                err = vn_getxattr(vp, DECMPFS_XATTR_NAME, NULL, &attr_size, XATTR_NOSECURITY, decmpfs_ctx);
@@ -585,7 +637,14 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **
                goto out;
        }
 
-       if (hdr->compression_type >= CMP_MAX) {
+       /*
+        * Special-case the DATALESS compressor here; that is a valid type,
+        * even though there will never be an entry in the decompressor
+        * handler table for it.  If we don't do this, then the cmp_state
+        * for this cnode will end up being marked NOT_COMPRESSED, and
+        * we'll be stuck in limbo.
+        */
+       if (hdr->compression_type >= CMP_MAX && !decmpfs_type_is_dataless(hdr->compression_type)) {
                if (returnInvalid) {
                        /* return the header even though the type is out of range */
                        err = ERANGE;
@@ -686,19 +745,21 @@ decmpfs_validate_compressed_file(vnode_t vp, decmpfs_cnode *cp)
                goto out;
        }
 
-       lck_rw_lock_shared(decompressorsLock);
-       decmpfs_validate_compressed_file_func validate = decmp_get_func(vp, hdr->compression_type, validate);
-       if (validate) { /* make sure this validation function is valid */
-               /* is the data okay? */
-               err = validate(vp, decmpfs_ctx, hdr);
-       } else if (decmp_get_func(vp, hdr->compression_type, fetch) == NULL) {
-               /* the type isn't registered */
-               err = EIO;
-       } else {
-               /* no validate registered, so nothing to do */
-               err = 0;
+       if (!decmpfs_type_is_dataless(hdr->compression_type)) {
+               lck_rw_lock_shared(decompressorsLock);
+               decmpfs_validate_compressed_file_func validate = decmp_get_func(vp, hdr->compression_type, validate);
+               if (validate) { /* make sure this validation function is valid */
+                       /* is the data okay? */
+                       err = validate(vp, decmpfs_ctx, hdr);
+               } else if (decmp_get_func(vp, hdr->compression_type, fetch) == NULL) {
+                       /* the type isn't registered */
+                       err = EIO;
+               } else {
+                       /* no validate registered, so nothing to do */
+                       err = 0;
+               }
+               lck_rw_unlock_shared(decompressorsLock);
        }
-       lck_rw_unlock_shared(decompressorsLock);
 out:
        if (hdr) {
                FREE(hdr, M_TEMP);
@@ -761,12 +822,6 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp)
                return 0;
        }
 
-       if (!vnode_isreg(vp)) {
-               /* only regular files can be compressed */
-               ret = FILE_IS_NOT_COMPRESSED;
-               goto done;
-       }
-
        is_mounted = false;
        is_local_fs = false;
        mp = vnode_mount(vp);
@@ -825,7 +880,16 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp)
                        ret = FILE_IS_NOT_COMPRESSED;
                        goto done;
                }
-               /* we got the xattr, so the file is compressed */
+               /*
+                * We got the xattr, so the file is at least tagged compressed.
+                * For DATALESS, regular files and directories can be "compressed".
+                * For all other types, only files are allowed.
+                */
+               if (!vnode_isreg(vp) &&
+                   !(decmpfs_type_is_dataless(hdr->compression_type) && vnode_isdir(vp))) {
+                       ret = FILE_IS_NOT_COMPRESSED;
+                       goto done;
+               }
                ret = FILE_IS_COMPRESSED;
                goto done;
        }
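
The acceptance rule introduced above reads more easily as a single predicate; a sketch with a hypothetical helper name, restating exactly the check in this hunk:

static bool
decmpfs_tag_allowed_sketch(vnode_t vp, uint32_t cmp_type)
{
        if (vnode_isreg(vp)) {
                return true;    /* any compressor type may tag a regular file */
        }
        /* only the dataless types may tag a directory */
        return vnode_isdir(vp) && decmpfs_type_is_dataless(cmp_type);
}
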
@@ -847,7 +911,15 @@ done:
                        cnode_locked = 1;
                }
 
-               decmpfs_cnode_set_vnode_cached_size(cp, hdr->uncompressed_size);
+               if (vnode_isdir(vp)) {
+                       decmpfs_cnode_set_vnode_cached_size(cp, 64);
+                       decmpfs_cnode_set_vnode_cached_nchildren(cp, decmpfs_get_directory_entries(hdr));
+                       if (hdr->compression_type == DATALESS_PKG_CMPFS_TYPE) {
+                               decmpfs_cnode_set_vnode_cached_total_size(cp, DECMPFS_PKG_SIZE(hdr->_size));
+                       }
+               } else {
+                       decmpfs_cnode_set_vnode_cached_size(cp, hdr->uncompressed_size);
+               }
                decmpfs_cnode_set_vnode_state(cp, ret, 1);
                decmpfs_cnode_set_vnode_cmp_type(cp, hdr->compression_type, 1);
                /* remember if the xattr's size was equal to the minimal xattr */
@@ -941,11 +1013,19 @@ decmpfs_update_attributes(vnode_t vp, struct vnode_attr *vap)
                                error = decmpfs_fetch_compressed_header(vp, NULL, &hdr, 1);
                                if (error == 0) {
                                        /*
-                                        *  allow the flag to be set since the decmpfs attribute is present
-                                        *  in that case, we also want to truncate the data fork of the file
+                                        * Allow the flag to be set since the decmpfs attribute
+                                        * is present.
+                                        *
+                                        * If we're creating a dataless file we do not want to
+                                        * truncate it to zero which allows the file resolver to
+                                        * have more control over when truncation should happen.
+                                        * All other types of compressed files are truncated to
+                                        * zero.
                                         */
-                                       VATTR_SET_ACTIVE(vap, va_data_size);
-                                       vap->va_data_size = 0;
+                                       if (!decmpfs_type_is_dataless(hdr->compression_type)) {
+                                               VATTR_SET_ACTIVE(vap, va_data_size);
+                                               vap->va_data_size = 0;
+                                       }
                                } else if (error == ERANGE) {
                                        /* the file had a decmpfs attribute but the type was out of range, so don't muck with the file's data size */
                                } else {
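
For context, the UF_COMPRESSED branch above is what a user-space file resolver drives. A hypothetical sequence (error handling and construction of the decmpfs header payload elided; whether an unprivileged caller may write this xattr depends on policy not shown here):

#include <sys/stat.h>
#include <sys/xattr.h>
#include <unistd.h>

static int
tag_dataless_sketch(const char *path, const void *hdr, size_t hdrlen)
{
        /* write the decmpfs header xattr, then mark the node compressed;
         * with a dataless type the kernel now leaves va_data_size alone */
        if (setxattr(path, "com.apple.decmpfs", hdr, hdrlen, 0, 0) != 0) {
                return -1;
        }
        return chflags(path, UF_COMPRESSED);
}
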
index 96c0a1e734b57c462f8734663f2772ac651791e6..207d1fe0e1a2ad4ef25a5ceb611975dbed3c1899 100644 (file)
 #include <libkern/crypto/rsa.h>
 #include <libkern/OSKextLibPrivate.h>
 
+#if CONFIG_IMAGEBOOT_IMG4
+#include <libkern/img4/interface.h>
+#include <img4/img4.h>
+#endif
+
 #include <kern/kalloc.h>
 
 #include <pexpert/pexpert.h>
@@ -57,18 +62,32 @@ extern struct filedesc filedesc0;
 extern int (*mountroot)(void);
 extern char rootdevice[DEVMAXNAMESIZE];
 
+#if CONFIG_LOCKERBOOT
+typedef struct _locker_mount_args {
+       char lmnt_path[PATH_MAX];
+       uint16_t lmnt_preferred_hash;
+} locker_mount_args_t;
+#endif
+
 #define DEBUG_IMAGEBOOT 0
 
 #if DEBUG_IMAGEBOOT
-#define DBG_TRACE(...) printf(__VA_ARGS__)
+#define DBG_TRACE(...) printf("imageboot: " __VA_ARGS__)
 #else
 #define DBG_TRACE(...) do {} while(0)
 #endif
 
+#define AUTHDBG(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0)
+#define AUTHPRNT(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0)
+#define kfree_safe(x) do { if ((x)) { kfree_addr((x)); (x) = NULL; } } while (0)
+
 extern int di_root_image(const char *path, char *devname, size_t devsz, dev_t *dev_p);
 extern int di_root_ramfile_buf(void *buf, size_t bufsz, char *devname, size_t devsz, dev_t *dev_p);
 
-static boolean_t imageboot_setup_new(void);
+static boolean_t imageboot_setup_new(imageboot_type_t type);
+
+vnode_t imgboot_get_image_file(const char *path, off_t *fsize, int *errp); /* may be required by chunklist.c */
+int read_file(const char *path, void **bufp, size_t *bufszp); /* may be required by chunklist.c */
 
 #define kIBFilePrefix "file://"
 
@@ -87,10 +106,10 @@ vnode_get_and_drop_always(vnode_t vp)
        vnode_put(vp);
 }
 
-__private_extern__ int
+__private_extern__ imageboot_type_t
 imageboot_needed(void)
 {
-       int result = 0;
+       imageboot_type_t result = IMAGEBOOT_NONE;
        char *root_path = NULL;
 
        DBG_TRACE("%s: checking for presence of root path\n", __FUNCTION__);
@@ -100,8 +119,18 @@ imageboot_needed(void)
                panic("%s: M_NAMEI zone exhausted", __FUNCTION__);
        }
 
+#if CONFIG_LOCKERBOOT
+       if (PE_parse_boot_argn(IMAGEBOOT_LOCKER_ARG, root_path, MAXPATHLEN)) {
+               result = IMAGEBOOT_LOCKER;
+               goto out;
+       }
+#endif
+
        /* Check for first layer */
        if (!(PE_parse_boot_argn("rp0", root_path, MAXPATHLEN) ||
+#if CONFIG_IMAGEBOOT_IMG4
+           PE_parse_boot_argn("arp0", root_path, MAXPATHLEN) ||
+#endif
            PE_parse_boot_argn("rp", root_path, MAXPATHLEN) ||
            PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN) ||
            PE_parse_boot_argn(IMAGEBOOT_AUTHROOT_ARG, root_path, MAXPATHLEN))) {
@@ -115,7 +144,7 @@ imageboot_needed(void)
                goto out;
        }
 
-       result = 1;
+       result = IMAGEBOOT_DMG;
 
        /* Check for second layer */
        if (!(PE_parse_boot_argn("rp1", root_path, MAXPATHLEN) ||
@@ -146,25 +175,61 @@ out:
  * is returned with usecount (no iocount).
  */
 __private_extern__ int
-imageboot_mount_image(const char *root_path, int height)
+imageboot_mount_image(const char *root_path, int height, imageboot_type_t type)
 {
        dev_t           dev;
        int             error;
-       vnode_t         old_rootvnode = NULL;
+       /*
+        * Need to stash this here since we may do a kernel_mount() on /, which will
+        * automatically update the rootvnode global. Note that vfs_mountroot() does
+        * not update that global, which is a bit weird.
+        */
+       vnode_t         old_rootvnode = rootvnode;
        vnode_t         newdp;
        mount_t         new_rootfs;
+       boolean_t update_rootvnode = FALSE;
 
-       error = di_root_image(root_path, rootdevice, DEVMAXNAMESIZE, &dev);
-       if (error) {
-               panic("%s: di_root_image failed: %d\n", __FUNCTION__, error);
+       if (type == IMAGEBOOT_DMG) {
+               error = di_root_image(root_path, rootdevice, DEVMAXNAMESIZE, &dev);
+               if (error) {
+                       panic("%s: di_root_image failed: %d\n", __FUNCTION__, error);
+               }
+
+               rootdev = dev;
+               mountroot = NULL;
+               printf("%s: root device 0x%x\n", __FUNCTION__, rootdev);
+               error = vfs_mountroot();
+               if (error != 0) {
+                       panic("vfs_mountroot() failed.\n");
+               }
+
+               update_rootvnode = TRUE;
        }
+#if CONFIG_LOCKERBOOT
+       else if (type == IMAGEBOOT_LOCKER) {
+               locker_mount_args_t *mntargs = kalloc(sizeof(*mntargs));
+               if (!mntargs) {
+                       panic("could not alloc mount args");
+               }
 
-       rootdev = dev;
-       mountroot = NULL;
-       printf("%s: root device 0x%x\n", __FUNCTION__, rootdev);
-       error = vfs_mountroot();
-       if (error != 0) {
-               panic("vfs_mountroot() failed.\n");
+               strlcpy(mntargs->lmnt_path, root_path, sizeof(mntargs->lmnt_path));
+               mntargs->lmnt_preferred_hash = 0;
+
+               DBG_TRACE("%s: mounting locker: %s\n", __FUNCTION__, root_path);
+               error = kernel_mount(LOCKERFS_NAME, NULLVP, NULLVP, "/",
+                   mntargs, sizeof(*mntargs), 0, 0, vfs_context_kernel());
+               if (error) {
+                       panic("failed to mount locker: %d", error);
+               }
+               kfree(mntargs, sizeof(*mntargs));
+
+               /* Clear the old mount association. */
+               old_rootvnode->v_mountedhere = NULL;
+               rootvnode->v_mount->mnt_vnodecovered = NULL;
+       }
+#endif
+       else {
+               panic("invalid imageboot type: %d", type);
        }
 
        /*
@@ -174,16 +239,13 @@ imageboot_mount_image(const char *root_path, int height)
        if (VFS_ROOT(TAILQ_LAST(&mountlist, mntlist), &newdp, vfs_context_kernel())) {
                panic("%s: cannot find root vnode", __FUNCTION__);
        }
+       DBG_TRACE("%s: old root fsname: %s\n", __FUNCTION__, old_rootvnode->v_mount->mnt_vtable->vfc_name);
 
-       if (rootvnode != NULL) {
+       if (old_rootvnode != NULL) {
                /* remember the old rootvnode, but remove it from mountlist */
-               mount_t         old_rootfs;
-
-               old_rootvnode = rootvnode;
-               old_rootfs = rootvnode->v_mount;
+               mount_t old_rootfs = old_rootvnode->v_mount;
 
                mount_list_remove(old_rootfs);
-
                mount_lock(old_rootfs);
 #ifdef CONFIG_IMGSRC_ACCESS
                old_rootfs->mnt_kern_flag |= MNTK_BACKS_ROOT;
@@ -193,7 +255,9 @@ imageboot_mount_image(const char *root_path, int height)
        }
 
        /* switch to the new rootvnode */
-       rootvnode = newdp;
+       if (update_rootvnode) {
+               rootvnode = newdp;
+       }
 
        new_rootfs = rootvnode->v_mount;
        mount_lock(new_rootfs);
@@ -213,43 +277,14 @@ imageboot_mount_image(const char *root_path, int height)
                        vnode_get_and_drop_always(old_rootvnode);
                }
 #else
-               height = 0; /* keep the compiler from complaining */
+#pragma unused(height)
                vnode_get_and_drop_always(old_rootvnode);
 #endif /* CONFIG_IMGSRC_ACCESS */
        }
        return 0;
 }
 
-
-/*
- * Authenticated root-dmg support
- */
-
-#define AUTHDBG(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0)
-#define AUTHPRNT(fmt, args...) do { printf("%s: " fmt "\n", __func__, ##args); } while (0)
-
-#define kfree_safe(x) do { if ((x)) { kfree_addr((x)); (x) = NULL; } } while (0)
-
-enum {
-       MISSING_SIG = -1,
-       INVALID_SIG = -2
-};
-
-static void
-key_byteswap(void *_dst, const void *_src, size_t len)
-{
-       uint32_t *dst __attribute__((align_value(1))) = _dst;
-       const uint32_t *src __attribute__((align_value(1))) = _src;
-
-       assert(len % sizeof(uint32_t) == 0);
-
-       len = len / sizeof(uint32_t);
-       for (size_t i = 0; i < len; i++) {
-               dst[len - i - 1] = OSSwapInt32(src[i]);
-       }
-}
-
-static int
+int
 read_file(const char *path, void **bufp, size_t *bufszp)
 {
        int err = 0;
@@ -266,14 +301,14 @@ read_file(const char *path, void **bufp, size_t *bufszp)
 
        NDINIT(&ndp, LOOKUP, OP_OPEN, LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
        if ((err = namei(&ndp)) != 0) {
-               AUTHPRNT("namei failed (%s)", path);
+               AUTHPRNT("namei failed (%s) - %d", path, err);
                goto out;
        }
        nameidone(&ndp);
        vp = ndp.ni_vp;
 
        if ((err = vnode_size(vp, &fsize, ctx)) != 0) {
-               AUTHPRNT("failed to get vnode size");
+               AUTHPRNT("failed to get vnode size of %s - %d", path, err);
                goto out;
        }
        if (fsize < 0) {
@@ -281,7 +316,7 @@ read_file(const char *path, void **bufp, size_t *bufszp)
        }
 
        if ((err = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
-               AUTHPRNT("failed to open vnode");
+               AUTHPRNT("failed to open %s - %d", path, err);
                goto out;
        }
        doclose = true;
@@ -298,13 +333,13 @@ read_file(const char *path, void **bufp, size_t *bufszp)
        }
 
        if ((err = vn_rdwr(UIO_READ, vp, (caddr_t)buf, fsize, 0, UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p)) != 0) {
-               AUTHPRNT("vn_rdwr() failed");
+               AUTHPRNT("Cannot read %d bytes from %s - %d", (int)fsize, path, err);
                goto out;
        }
 
        if (resid) {
               /* didn't get everything we wanted */
-               AUTHPRNT("vn_rdwr resid = %d", resid);
+               AUTHPRNT("Short read of %d bytes from %s - %d", (int)fsize, path, resid);
                err = EINVAL;
                goto out;
        }
@@ -328,513 +363,131 @@ out:
        return err;
 }
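
A usage sketch for the now-exported read_file(). Judging from its callers (the deleted chunklist path passed a 32 MB value, the img4 path below passes 0), a nonzero *bufszp is assumed here to act as a size cap, with the actual length returned in place:

void *buf = NULL;
size_t bufsz = 0;       /* assumed: 0 = no cap; receives bytes read */
int err = read_file("/hypothetical/manifest.der", &buf, &bufsz);
if (err == 0) {
        /* ... consume bufsz bytes at buf ... */
        kfree_safe(buf);        /* read_file() allocates with kalloc() */
}
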
 
-static int
-validate_signature(const uint8_t *key_msb, size_t keylen, uint8_t *sig_msb, size_t siglen, uint8_t *digest)
-{
-       int err = 0;
-       bool sig_valid = false;
-       uint8_t *sig = NULL;
-
-       const uint8_t exponent[] = { 0x01, 0x00, 0x01 };
-       uint8_t *modulus = kalloc(keylen);
-       rsa_pub_ctx *rsa_ctx = kalloc(sizeof(rsa_pub_ctx));
-       sig = kalloc(siglen);
-
-       if (modulus == NULL || rsa_ctx == NULL || sig == NULL) {
-               err = ENOMEM;
-               goto out;
-       }
-
-       bzero(rsa_ctx, sizeof(rsa_pub_ctx));
-       key_byteswap(modulus, key_msb, keylen);
-       key_byteswap(sig, sig_msb, siglen);
-
-       err = rsa_make_pub(rsa_ctx,
-           sizeof(exponent), exponent,
-           CHUNKLIST_PUBKEY_LEN, modulus);
-       if (err) {
-               AUTHPRNT("rsa_make_pub() failed");
-               goto out;
-       }
-
-       err = rsa_verify_pkcs1v15(rsa_ctx, CC_DIGEST_OID_SHA256,
-           SHA256_DIGEST_LENGTH, digest,
-           siglen, sig,
-           &sig_valid);
-       if (err) {
-               sig_valid = false;
-               AUTHPRNT("rsa_verify() failed");
-               err = EINVAL;
-               goto out;
-       }
-
-out:
-       kfree_safe(sig);
-       kfree_safe(rsa_ctx);
-       kfree_safe(modulus);
-
-       if (err) {
-               return err;
-       } else if (sig_valid == true) {
-               return 0; /* success */
-       } else {
-               return INVALID_SIG;
-       }
-}
-
-static int
-validate_chunklist(void *buf, size_t len)
+#if CONFIG_IMAGEBOOT_IMG4 || CONFIG_IMAGEBOOT_CHUNKLIST
+vnode_t
+imgboot_get_image_file(const char *path, off_t *fsize, int *errp)
 {
-       int err = 0;
-       size_t sigsz = 0;
-       size_t sig_end = 0;
-       size_t chunks_end = 0;
-       bool valid_sig = false;
-       struct chunklist_hdr *hdr = buf;
-
-       if (len < sizeof(struct chunklist_hdr)) {
-               AUTHPRNT("no space for header");
-               return EINVAL;
-       }
-
-       /* recognized file format? */
-       if (hdr->cl_magic != CHUNKLIST_MAGIC ||
-           hdr->cl_file_ver != CHUNKLIST_FILE_VERSION_10 ||
-           hdr->cl_chunk_method != CHUNKLIST_SIGNATURE_METHOD_10 ||
-           hdr->cl_sig_method != CHUNKLIST_SIGNATURE_METHOD_10) {
-               AUTHPRNT("unrecognized chunklist format");
-               return EINVAL;
-       }
-
-       /* does the chunk list fall within the bounds of the buffer? */
-       if (os_mul_and_add_overflow(hdr->cl_chunk_count, sizeof(struct chunklist_chunk), hdr->cl_chunk_offset, &chunks_end) ||
-           hdr->cl_chunk_offset < sizeof(struct chunklist_hdr) || chunks_end > len) {
-               AUTHPRNT("invalid chunk_count (%llu) or chunk_offset (%llu)",
-                   hdr->cl_chunk_count, hdr->cl_chunk_offset);
-               return EINVAL;
-       }
-
-       /* does the signature fall within the bounds of the buffer? */
-       if (os_add_overflow(hdr->cl_sig_offset, sizeof(struct chunklist_sig), &sig_end) ||
-           hdr->cl_sig_offset < sizeof(struct chunklist_hdr) ||
-           hdr->cl_sig_offset < chunks_end ||
-           hdr->cl_sig_offset > len) {
-               AUTHPRNT("invalid signature offset (%llu)", hdr->cl_sig_offset);
-               return EINVAL;
-       }
-
-       if (sig_end > len || os_sub_overflow(len, hdr->cl_sig_offset, &sigsz) || sigsz != CHUNKLIST_SIG_LEN) {
-               /* missing or incorrect signature size */
-               return MISSING_SIG;
-       }
-
-       AUTHDBG("hashing chunklist");
-
-       /* hash the chunklist (excluding the signature) */
-       uint8_t sha_digest[SHA256_DIGEST_LENGTH];
-       SHA256_CTX sha_ctx;
-       SHA256_Init(&sha_ctx);
-       SHA256_Update(&sha_ctx, buf, hdr->cl_sig_offset);
-       SHA256_Final(sha_digest, &sha_ctx);
-
-       AUTHDBG("validating chunklist signature against pub keys");
-       for (size_t i = 0; i < CHUNKLIST_NPUBKEYS; i++) {
-               const struct chunklist_pubkey *key = &chunklist_pubkeys[i];
-               err = validate_signature(key->key, CHUNKLIST_PUBKEY_LEN,
-                   buf + hdr->cl_sig_offset, sigsz, sha_digest);
-               if (err == 0) {
-                       AUTHDBG("validated chunklist signature with key %lu (prod=%d)", i, key->isprod);
-                       valid_sig = key->isprod;
-#if IMAGEBOOT_ALLOW_DEVKEYS
-                       if (!key->isprod) {
-                               /* allow dev keys in dev builds only */
-                               AUTHDBG("*** allowing DEV key: this will fail in customer builds ***");
-                               valid_sig = true;
-                       }
-#endif
-                       goto out;
-               } else if (err == INVALID_SIG) {
-                       /* try the next key */
-               } else {
-                       goto out; /* something bad happened */
-               }
-       }
-
-       /* At this point we tried all the keys: nothing went wrong but none of them
-        * signed our chunklist. */
-       AUTHPRNT("signature did not verify against any known public key");
-
-out:
-       if (err) {
-               return err;
-       } else if (valid_sig == true) {
-               return 0; /* signed, and everything checked out */
-       } else {
-               return EINVAL;
-       }
-}
-
-static int
-validate_root_image(const char *root_path, void *chunklist)
-{
-       int err = 0;
-       struct chunklist_hdr *hdr = chunklist;
-       struct chunklist_chunk *chk = NULL;
-       size_t ch = 0;
        struct nameidata ndp = {};
-       struct vnode *vp = NULL;
-       off_t fsize = 0;
-       off_t offset = 0;
-       bool doclose = false;
-       size_t bufsz = 0;
-       void *buf = NULL;
-
+       vnode_t vp = NULL;
        vfs_context_t ctx = vfs_context_kernel();
-       kauth_cred_t kerncred = vfs_context_ucred(ctx);
-       proc_t p = vfs_context_proc(ctx);
-
-       AUTHDBG("validating root dmg %s", root_path);
+       int err;
 
-       /*
-        * Open the DMG
-        */
-       NDINIT(&ndp, LOOKUP, OP_OPEN, LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(root_path), ctx);
+       NDINIT(&ndp, LOOKUP, OP_OPEN, LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
        if ((err = namei(&ndp)) != 0) {
-               AUTHPRNT("namei failed (%s)", root_path);
-               goto out;
-       }
-       nameidone(&ndp);
-       vp = ndp.ni_vp;
-
-       if (vp->v_type != VREG) {
-               err = EINVAL;
-               goto out;
-       }
-
-       if ((err = vnode_size(vp, &fsize, ctx)) != 0) {
-               AUTHPRNT("failed to get vnode size");
-               goto out;
-       }
-
-       if ((err = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
-               AUTHPRNT("failed to open vnode");
-               goto out;
-       }
-       doclose = true;
-
-       /*
-        * Iterate the chunk list and check each chunk
-        */
-       chk = chunklist + hdr->cl_chunk_offset;
-       for (ch = 0; ch < hdr->cl_chunk_count; ch++) {
-               int resid = 0;
-
-               if (!buf) {
-                       /* allocate buffer based on first chunk size */
-                       buf = kalloc(chk->chunk_size);
-                       if (buf == NULL) {
-                               err = ENOMEM;
-                               goto out;
-                       }
-                       bufsz = chk->chunk_size;
-               }
-
-               if (chk->chunk_size > bufsz) {
-                       AUTHPRNT("chunk size too big");
-                       err = EINVAL;
-                       goto out;
-               }
-
-               err = vn_rdwr(UIO_READ, vp, (caddr_t)buf, chk->chunk_size, offset, UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p);
-               if (err) {
-                       AUTHPRNT("vn_rdrw fail (err = %d, resid = %d)", err, resid);
-                       goto out;
-               }
-               if (resid) {
-                       err = EINVAL;
-                       AUTHPRNT("chunk covered non-existant part of image");
-                       goto out;
-               }
-
-               /* calculate the SHA256 of this chunk */
-               uint8_t sha_digest[SHA256_DIGEST_LENGTH];
-               SHA256_CTX sha_ctx;
-               SHA256_Init(&sha_ctx);
-               SHA256_Update(&sha_ctx, buf, chk->chunk_size);
-               SHA256_Final(sha_digest, &sha_ctx);
-
-               /* Check the calculated SHA matches the chunk list */
-               if (bcmp(sha_digest, chk->chunk_sha256, SHA256_DIGEST_LENGTH) != 0) {
-                       AUTHPRNT("SHA mismatch on chunk %lu (offset %lld, size %u)", ch, offset, chk->chunk_size);
-                       err = EINVAL;
-                       goto out;
-               }
+               AUTHPRNT("Cannot find %s - error %d", path, err);
+       } else {
+               nameidone(&ndp);
+               vp = ndp.ni_vp;
 
-               if (os_add_overflow(offset, chk->chunk_size, &offset)) {
+               if (vp->v_type != VREG) {
                        err = EINVAL;
-                       goto out;
+                       AUTHPRNT("%s is not a regular file", path);
+               } else if (fsize) {
+                       if ((err = vnode_size(vp, fsize, ctx)) != 0) {
+                               AUTHPRNT("Cannot get file size of %s - error %d", path, err);
+                       }
                }
-               chk++;
        }
 
-       if (offset != fsize) {
-               AUTHPRNT("chunklist did not cover entire file (offset = %lld, fsize = %lld)", offset, fsize);
-               err = EINVAL;
-               goto out;
-       }
-
-out:
-       kfree_safe(buf);
-       if (doclose) {
-               VNOP_CLOSE(vp, FREAD, ctx);
-       }
-       if (vp) {
-               vnode_put(vp);
+       if (err) {
+               *errp = err;
                vp = NULL;
        }
-
-       return err;
+       return vp;
 }
+#endif /* CONFIG_IMAGEBOOT_IMG4 || CONFIG_IMAGEBOOT_CHUNKLIST */
 
-static int
-construct_chunklist_path(const char *root_path, char **bufp)
-{
-       int err = 0;
-       char *path = NULL;
-       size_t len = 0;
-
-       path = kalloc(MAXPATHLEN);
-       if (path == NULL) {
-               AUTHPRNT("failed to allocate space for chunklist path");
-               err = ENOMEM;
-               goto out;
-       }
+#if CONFIG_IMAGEBOOT_IMG4
 
-       len = strnlen(root_path, MAXPATHLEN);
-       if (len < MAXPATHLEN && len > strlen(".dmg")) {
-               /* correctly terminated string with space for extension */
-       } else {
-               AUTHPRNT("malformed root path");
-               err = EINVAL;
-               goto out;
-       }
+#define APTICKET_NAME "apticket.der"
 
-       len = strlcpy(path, root_path, MAXPATHLEN);
-       if (len >= MAXPATHLEN) {
-               AUTHPRNT("root path is too long");
-               err = EINVAL;
-               goto out;
-       }
+static char *
+imgboot_get_apticket_path(const char *rootpath)
+{
+       size_t plen = strlen(rootpath) + sizeof(APTICKET_NAME);
+       char *path = kalloc(plen);
 
-       path[len - strlen(".dmg")] = '\0';
-       len = strlcat(path, ".chunklist", MAXPATHLEN);
-       if (len >= MAXPATHLEN) {
-               AUTHPRNT("chunklist path is too long");
-               err = EINVAL;
-               goto out;
-       }
+       if (path) {
+               char *slash;
 
-out:
-       if (err) {
-               kfree_safe(path);
-       } else {
-               *bufp = path;
+               strlcpy(path, rootpath, plen);
+               slash = strrchr(path, '/');
+               if (slash == NULL) {
+                       slash = path;
+               } else {
+                       slash++;
+               }
+               strlcpy(slash, APTICKET_NAME, sizeof(APTICKET_NAME) + 1);
        }
-       return err;
+       return path;
 }
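
Illustratively, for a hypothetical root path:

char *tkt = imgboot_get_apticket_path("/private/var/img/root.dmg");
/* tkt == "/private/var/img/apticket.der"; NULL on allocation failure.
 * The buffer is kalloc'd; release it with kfree_safe(). */
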
 
 static int
-authenticate_root(const char *root_path)
+authenticate_root_with_img4(const char *rootpath)
 {
-       char *chunklist_path = NULL;
-       void *chunklist_buf = NULL;
-       size_t chunklist_len = 32 * 1024 * 1024UL;
-       int err = 0;
-
-       err = construct_chunklist_path(root_path, &chunklist_path);
-       if (err) {
-               AUTHPRNT("failed creating chunklist path");
-               goto out;
-       }
-
-       AUTHDBG("validating root against chunklist %s", chunklist_path);
+       errno_t rv;
+       img4_t i4;
+       img4_payload_t i4pl;
+       vnode_t vp;
+       char *ticket_path;
+       size_t tcksz = 0;
+       void *tckbuf = NULL;
 
-       /*
-        * Read and authenticate the chunklist, then validate the root image against
-        * the chunklist.
-        */
-
-       AUTHDBG("reading chunklist");
-       err = read_file(chunklist_path, &chunklist_buf, &chunklist_len);
-       if (err) {
-               AUTHPRNT("failed to read chunklist");
-               goto out;
-       }
-
-       AUTHDBG("validating chunklist");
-       err = validate_chunklist(chunklist_buf, chunklist_len);
-       if (err < 0) {
-               AUTHDBG("missing or incorrect signature on chunklist");
-               goto out;
-       } else if (err) {
-               AUTHPRNT("failed to validate chunklist");
-               goto out;
-       } else {
-               AUTHDBG("successfully validated chunklist");
-       }
+       DBG_TRACE("Check %s\n", rootpath);
 
-       AUTHDBG("validating root image against chunklist");
-       err = validate_root_image(root_path, chunklist_buf);
-       if (err) {
-               AUTHPRNT("failed to validate root image against chunklist (%d)", err);
-               goto out;
+       if (img4if == NULL) {
+               AUTHPRNT("AppleImage4 is not ready");
+               return EAGAIN;
        }
 
-       /* everything checked out - go ahead and mount this */
-       AUTHDBG("root image authenticated");
-
-out:
-       kfree_safe(chunklist_buf);
-       kfree_safe(chunklist_path);
-       return err;
-}
-
-static const uuid_t *
-getuuidfromheader_safe(const void *buf, size_t bufsz, size_t *uuidsz)
-{
-       const struct uuid_command *cmd = NULL;
-       const kernel_mach_header_t *mh = buf;
-
-       /* space for the header and at least one load command? */
-       if (bufsz < sizeof(kernel_mach_header_t) + sizeof(struct uuid_command)) {
-               AUTHPRNT("libkern image too small");
-               return NULL;
+       ticket_path = imgboot_get_apticket_path(rootpath);
+       if (ticket_path == NULL) {
+               AUTHPRNT("Cannot construct ticket path - out of memory");
+               return ENOMEM;
        }
 
-       /* validate the mach header */
-       if (mh->magic != MH_MAGIC_64 || (mh->sizeofcmds > bufsz - sizeof(kernel_mach_header_t))) {
-               AUTHPRNT("invalid MachO header");
-               return NULL;
+       rv = read_file(ticket_path, &tckbuf, &tcksz);
+       if (rv) {
+               AUTHPRNT("Cannot get a ticket from %s - %d", ticket_path, rv);
+               goto out_with_ticket_path;
        }
 
-       /* iterate the load commands */
-       size_t offset = sizeof(kernel_mach_header_t);
-       for (size_t i = 0; i < mh->ncmds; i++) {
-               cmd = buf + offset;
-
-               if (cmd->cmd == LC_UUID) {
-                       *uuidsz = sizeof(cmd->uuid);
-                       return &cmd->uuid;
-               }
+       DBG_TRACE("Got %d bytes of manifest from %s\n", (int)tcksz, ticket_path);
 
-               if (os_add_overflow(cmd->cmdsize, offset, &offset) ||
-                   offset > bufsz - sizeof(struct uuid_command)) {
-                       return NULL;
-               }
+       rv = img4_init(&i4, 0, tckbuf, tcksz, NULL);
+       if (rv) {
+               AUTHPRNT("Cannot initialise verification handle - error %d", rv);
+               goto out_with_ticket_bytes;
        }
 
-       return NULL;
-}
-
-static const char *libkern_path = "/System/Library/Extensions/System.kext/PlugIns/Libkern.kext/Libkern";
-static const char *libkern_bundle = "com.apple.kpi.libkern";
-
-/*
- * Check that the UUID of the libkern currently loaded matches the one on disk.
- */
-static int
-auth_version_check(void)
-{
-       int err = 0;
-       void *buf = NULL;
-       size_t bufsz = 4 * 1024 * 1024UL;
-
-       /* get the UUID of the libkern in /S/L/E */
-
-       err = read_file(libkern_path, &buf, &bufsz);
-       if (err) {
+       vp = imgboot_get_image_file(rootpath, NULL, &rv);
+       if (vp == NULL) {
+               /* Error message has already been printed */
                goto out;
        }
 
-       unsigned long uuidsz = 0;
-       const uuid_t *img_uuid = getuuidfromheader_safe(buf, bufsz, &uuidsz);
-       if (img_uuid == NULL || uuidsz != sizeof(uuid_t)) {
-               AUTHPRNT("invalid UUID (sz = %lu)", uuidsz);
-               err = EINVAL;
+       rv = img4_payload_init_with_vnode_4xnu(&i4pl, 'rosi', vp, I4PLF_UNWRAPPED);
+       if (rv) {
+               AUTHPRNT("failed to init payload: %d", rv);
                goto out;
        }
 
-       /* Get the UUID of the loaded libkern */
-       uuid_t live_uuid;
-       err = OSKextGetUUIDForName(libkern_bundle, live_uuid);
-       if (err) {
-               AUTHPRNT("could not find loaded libkern");
-               goto out;
+       rv = img4_get_trusted_external_payload(&i4, &i4pl, IMG4_ENVIRONMENT_PPL, NULL, NULL);
+       if (rv) {
+               AUTHPRNT("failed to validate root image %s: %d", rootpath, rv);
        }
 
-       /* ... and compare them */
-       if (bcmp(live_uuid, img_uuid, uuidsz) != 0) {
-               AUTHPRNT("UUID of running libkern does not match %s", libkern_path);
-
-               uuid_string_t img_uuid_str, live_uuid_str;
-               uuid_unparse(*img_uuid, img_uuid_str);
-               uuid_unparse(live_uuid, live_uuid_str);
-               AUTHPRNT("loaded libkern UUID =  %s", live_uuid_str);
-               AUTHPRNT("on-disk libkern UUID = %s", img_uuid_str);
-
-               err = EINVAL;
-               goto out;
-       }
-
-       /* UUID matches! */
-
+       img4_payload_destroy(&i4pl);
 out:
-       kfree_safe(buf);
-       return err;
+       img4_destroy(&i4);
+out_with_ticket_bytes:
+       kfree_safe(tckbuf);
+out_with_ticket_path:
+       kfree_safe(ticket_path);
+       return rv;
 }
+#endif /* CONFIG_IMAGEBOOT_IMG4 */
 
-#if 0
-int
-auth_imgboot_test(proc_t __unused ap, struct auth_imgboot_test_args *uap, int32_t *retval)
-{
-       int ret = 0;
-       int err;
-       char path[MAXPATHLEN];
-       vm_size_t len;
-       *retval = 0;
-
-       err = copyinstr(uap->path, path, MAXPATHLEN, &len);
-       if (err) {
-               return err;
-       }
-       if (len >= MAXPATHLEN) {
-               return ENAMETOOLONG;
-       }
-
-       AUTHDBG("authenticating root image at %s", path);
-       err = authenticate_root(path);
-       if (err) {
-               AUTHPRNT("root authentication FAIL (%d)", err);
-               ret = err;
-       } else {
-               AUTHDBG("successfully authenticated %s", path);
-       }
-
-       AUTHDBG("checking root image version");
-       err = auth_version_check();
-       if (err) {
-               AUTHPRNT("root image version check FAIL (%d)", err);
-               err = err ?: ret;
-       } else {
-               AUTHPRNT("root version check success (%d)", err);
-       }
-
-       if (ret < 0) {
-               return EINVAL; /* negative return values have special meaning */
-       }
-       return ret;
-}
-#endif
 
 /*
  * Attach the image at 'path' as a ramdisk and mount it as our new rootfs.
@@ -926,19 +579,77 @@ out:
        return err;
 }
 
+/*
+ * If the path is in <file://> URL format then we allocate memory and decode it,
+ * otherwise return the same pointer.
+ *
+ * Caller is expected to check if the pointers are different.
+ */
+static char *
+url_to_path(char *url_path)
+{
+       char *path = url_path;
+       size_t len = strlen(kIBFilePrefix);
+
+       if (strncmp(kIBFilePrefix, url_path, len) == 0) {
+               /* it's a URL - remove the file:// prefix and percent-decode */
+               url_path += len;
+
+               len = strlen(url_path);
+               if (len) {
+                       /* Make a copy of the path to URL-decode */
+                       path = kalloc(len + 1);
+                       if (path == NULL) {
+                               panic("imageboot path allocation failed - cannot allocate %d bytes\n", (int)len);
+                       }
+
+                       strlcpy(path, url_path, len + 1);
+                       url_decode(path);
+               } else {
+                       panic("Bogus imageboot path URL - missing path\n");
+               }
+
+               DBG_TRACE("%s: root image URL <%s> becomes %s\n", __func__, url_path, path);
+       }
+
+       return path;
+}
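
A usage sketch (hypothetical input):

char url[] = "file:///System/Volumes/root%20image.dmg";
char *path = url_to_path(url);
/* path == "/System/Volumes/root image.dmg", a fresh allocation */
if (path != url) {
        kfree_safe(path);
}
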
+
 static boolean_t
-imageboot_setup_new()
+imageboot_setup_new(imageboot_type_t type)
 {
        int error;
        char *root_path = NULL;
        int height = 0;
        boolean_t done = FALSE;
-       boolean_t auth_root = FALSE;
+       boolean_t auth_root = TRUE;
        boolean_t ramdisk_root = FALSE;
 
        MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
        assert(root_path != NULL);
 
+#if CONFIG_LOCKERBOOT
+       if (type == IMAGEBOOT_LOCKER) {
+               if (!PE_parse_boot_argn(IMAGEBOOT_LOCKER_ARG, root_path, MAXPATHLEN)) {
+                       panic("locker boot with no locker given");
+               }
+
+               DBG_TRACE("%s: root fsname: %s\n", __FUNCTION__, rootvnode->v_mount->mnt_vtable->vfc_name);
+
+               /*
+                * The locker path is a path, not a URL, so just pass it directly to
+                * imageboot_mount_image().
+                */
+               error = imageboot_mount_image(root_path, 0, type);
+               if (error) {
+                       panic("failed to mount system locker: %d", error);
+               }
+
+               done = TRUE;
+               goto out;
+       }
+#endif /* CONFIG_LOCKERBOOT */
+
        unsigned imgboot_arg;
        if (PE_parse_boot_argn("-rootdmg-ramdisk", &imgboot_arg, sizeof(imgboot_arg))) {
                ramdisk_root = TRUE;
@@ -946,7 +657,7 @@ imageboot_setup_new()
 
        if (PE_parse_boot_argn(IMAGEBOOT_CONTAINER_ARG, root_path, MAXPATHLEN) == TRUE) {
                printf("%s: container image url is %s\n", __FUNCTION__, root_path);
-               error = imageboot_mount_image(root_path, height);
+               error = imageboot_mount_image(root_path, height, type);
                if (error != 0) {
                        panic("Failed to mount container image.");
                }
@@ -954,71 +665,65 @@ imageboot_setup_new()
                height++;
        }
 
-       if (PE_parse_boot_argn(IMAGEBOOT_AUTHROOT_ARG, root_path, MAXPATHLEN) == TRUE) {
-               auth_root = TRUE;
-       } else if (PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN) == FALSE) {
+       if (PE_parse_boot_argn(IMAGEBOOT_AUTHROOT_ARG, root_path, MAXPATHLEN) == FALSE &&
+           PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN) == FALSE) {
                if (height > 0) {
-                       panic("%s specified without %s?\n", IMAGEBOOT_CONTAINER_ARG, IMAGEBOOT_ROOT_ARG);
+                       panic("%s specified without %s or %s?\n", IMAGEBOOT_CONTAINER_ARG, IMAGEBOOT_AUTHROOT_ARG, IMAGEBOOT_ROOT_ARG);
                }
                goto out;
        }
 
-       printf("%s: root image url is %s\n", __func__, root_path);
+       printf("%s: root image URL is '%s'\n", __func__, root_path);
 
 #if CONFIG_CSR
        if (auth_root && (csr_check(CSR_ALLOW_ANY_RECOVERY_OS) == 0)) {
                AUTHPRNT("CSR_ALLOW_ANY_RECOVERY_OS set, skipping root image authentication");
-               auth_root = false;
+               auth_root = FALSE;
        }
 #endif
 
        /* Make a copy of the path to URL-decode */
-       char *path_alloc = kalloc(MAXPATHLEN);
-       if (path_alloc == NULL) {
-               panic("imageboot path allocation failed\n");
-       }
-       char *path = path_alloc;
-
-       size_t len = strlen(kIBFilePrefix);
-       strlcpy(path, root_path, MAXPATHLEN);
-       if (strncmp(kIBFilePrefix, path, len) == 0) {
-               /* its a URL - remove the file:// prefix and percent-decode */
-               path += len;
-               url_decode(path);
-       }
+       char *path = url_to_path(root_path);
+       assert(path);
 
+#if CONFIG_IMAGEBOOT_CHUNKLIST
        if (auth_root) {
                AUTHDBG("authenticating root image at %s", path);
-               error = authenticate_root(path);
+               error = authenticate_root_with_chunklist(path);
                if (error) {
                        panic("root image authentication failed (err = %d)\n", error);
                }
                AUTHDBG("successfully authenticated %s", path);
        }
+#endif
 
        if (ramdisk_root) {
                error = imageboot_mount_ramdisk(path);
        } else {
-               error = imageboot_mount_image(root_path, height);
+               error = imageboot_mount_image(root_path, height, type);
        }
 
-       kfree_safe(path_alloc);
+       if (path != root_path) {
+               kfree_safe(path);
+       }
 
        if (error) {
                panic("Failed to mount root image (err=%d, auth=%d, ramdisk=%d)\n",
                    error, auth_root, ramdisk_root);
        }
 
+#if CONFIG_IMAGEBOOT_CHUNKLIST
        if (auth_root) {
                /* check that the image version matches the running kernel */
                AUTHDBG("checking root image version");
-               error = auth_version_check();
+               error = authenticate_root_version_check();
                if (error) {
                        panic("root image version check failed");
                } else {
                        AUTHDBG("root image version matches kernel");
                }
        }
+#endif
 
        done = TRUE;
 
@@ -1028,7 +733,7 @@ out:
 }
 
 __private_extern__ void
-imageboot_setup()
+imageboot_setup(imageboot_type_t type)
 {
        int         error = 0;
        char *root_path = NULL;
@@ -1041,11 +746,13 @@ imageboot_setup()
 
        /*
         * New boot-arg scheme:
-        *      root-dmg : the dmg that will be the root filesystem.
-        *      auth-root-dmg : same as root-dmg but with image authentication.
+        *      root-dmg : the dmg that will be the root filesystem, authenticated by default.
+        *      auth-root-dmg : same as root-dmg.
         *      container-dmg : an optional dmg that contains the root-dmg.
+        *  locker : the locker that will be the root filesystem -- mutually
+        *           exclusive with any other boot-arg.
         */
-       if (imageboot_setup_new()) {
+       if (imageboot_setup_new(type)) {
                return;
        }
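
Putting the scheme together, an illustrative boot-args configuration for the DMG case (paths hypothetical):

    root-dmg=file:///System/Library/CoreServices/BaseSystem.dmg
    container-dmg=file:///BaseContainer.dmg    (optional outer image, mounted first)
    -rootdmg-ramdisk                           (copy the root DMG into a ramdisk before rooting)
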
 
@@ -1059,14 +766,28 @@ imageboot_setup()
         * device vnode created for it, and should not show up in getfsstat() until exposed
         * with MNT_IMGSRC. We just make it the temporary root.
         */
+#if CONFIG_IMAGEBOOT_IMG4
+       if (PE_parse_boot_argn("arp0", root_path, MAXPATHLEN)) {
+               char *path = url_to_path(root_path);
+
+               assert(path);
+
+               if (authenticate_root_with_img4(path)) {
+                       panic("Root image %s does not match the manifest\n", root_path);
+               }
+               if (path != root_path) {
+                       kfree_safe(path);
+               }
+       } else
+#endif /* CONFIG_IMAGEBOOT_IMG4 */
        if ((PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == FALSE) &&
            (PE_parse_boot_argn("rp0", root_path, MAXPATHLEN) == FALSE)) {
                panic("%s: no valid path to image.\n", __FUNCTION__);
        }
 
-       printf("%s: root image url is %s\n", __FUNCTION__, root_path);
+       DBG_TRACE("%s: root image url is %s\n", __FUNCTION__, root_path);
 
-       error = imageboot_mount_image(root_path, 0);
+       error = imageboot_mount_image(root_path, 0, type);
        if (error) {
                panic("Failed on first stage of imageboot.");
        }
@@ -1084,7 +805,7 @@ imageboot_setup()
         * If we fail to set up second image, it's not a given that we
         * can safely root off the first.
         */
-       error = imageboot_mount_image(root_path, 1);
+       error = imageboot_mount_image(root_path, 1, type);
        if (error) {
                panic("Failed on second stage of imageboot.");
        }
index f901211e07789f798821d8e6fa2d36a46a616279..0110e71142597e053ab92fe47d10556455a49b9f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @Apple_LICENSE_HEADER_START@
  *
@@ -37,6 +37,7 @@
 #include <mach/mach_types.h>
 #include <mach/mach_time.h>
 #include <mach/mach_vm.h>
+#include <machine/atomic.h>
 #include <machine/machine_routines.h>
 
 #include <mach/machine.h>
@@ -282,16 +283,16 @@ static int kdbg_setreg(kd_regtype *);
 static int kdbg_setpidex(kd_regtype *);
 static int kdbg_setpid(kd_regtype *);
 static void kdbg_thrmap_init(void);
-static int kdbg_reinit(boolean_t);
-static int kdbg_bootstrap(boolean_t);
+static int kdbg_reinit(bool);
+static int kdbg_bootstrap(bool);
 static int kdbg_test(size_t flavor);
 
-static int kdbg_write_v1_header(boolean_t write_thread_map, vnode_t vp, vfs_context_t ctx);
+static int kdbg_write_v1_header(bool write_thread_map, vnode_t vp, vfs_context_t ctx);
 static int kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx);
 static int kdbg_copyout_thread_map(user_addr_t buffer, size_t *buffer_size);
 static void kdbg_clear_thread_map(void);
 
-static boolean_t kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait);
+static bool kdbg_wait(uint64_t timeout_ms, bool locked_wait);
 static void kdbg_wakeup(void);
 
 int kdbg_cpumap_init_internal(kd_iop_t* iops, uint32_t cpu_count,
@@ -301,7 +302,7 @@ static kd_threadmap *kdbg_thrmap_init_internal(unsigned int count,
     unsigned int *mapsize,
     unsigned int *mapcount);
 
-static boolean_t kdebug_current_proc_enabled(uint32_t debugid);
+static bool kdebug_current_proc_enabled(uint32_t debugid);
 static errno_t kdebug_check_trace_string(uint32_t debugid, uint64_t str_id);
 
 int kdbg_write_v3_header(user_addr_t, size_t *, int);
@@ -315,7 +316,7 @@ user_addr_t kdbg_write_v3_event_chunk_header(user_addr_t buffer, uint32_t tag,
 
 // Helper functions
 
-static int create_buffers(boolean_t);
+static int create_buffers(bool);
 static void delete_buffers(void);
 
 extern int tasks_count;
@@ -365,7 +366,7 @@ struct kd_storage {
        uint32_t kds_bufindx;
        uint32_t kds_bufcnt;
        uint32_t kds_readlast;
-       boolean_t kds_lostevents;
+       bool kds_lostevents;
        uint64_t  kds_timestamp;
 
        kd_buf  kds_records[EVENTS_PER_STORAGE_UNIT];
@@ -392,7 +393,7 @@ int kds_waiter = 0;
 struct kd_bufinfo {
        union  kds_ptr kd_list_head;
        union  kds_ptr kd_list_tail;
-       boolean_t kd_lostevents;
+       bool kd_lostevents;
        uint32_t _pad;
        uint64_t kd_prev_timebase;
        uint32_t num_bufs;
@@ -496,10 +497,129 @@ struct krt {
        struct tts *atts;
 };
 
+/*
+ * TRACE file formats...
+ *
+ * RAW_VERSION0
+ *
+ * uint32_t #threadmaps
+ * kd_threadmap[]
+ * kd_buf[]
+ *
+ * RAW_VERSION1
+ *
+ * RAW_header, with version_no set to RAW_VERSION1
+ * kd_threadmap[]
+ * Empty space to pad alignment to the nearest page boundary.
+ * kd_buf[]
+ *
+ * RAW_VERSION1+
+ *
+ * RAW_header, with version_no set to RAW_VERSION1
+ * kd_threadmap[]
+ * kd_cpumap_header, with version_no set to RAW_VERSION1
+ * kd_cpumap[]
+ * Empty space to pad alignment to the nearest page boundary.
+ * kd_buf[]
+ *
+ * V1+ implementation details...
+ *
+ * It would have been nice to add the cpumap data "correctly", but there were
+ * several obstacles. Existing code attempts to parse both V1 and V0 files.
+ * Due to the fact that V0 has no versioning or header, the test looks like
+ * this:
+ *
+ * // Read header
+ * if (header.version_no != RAW_VERSION1) { // Assume V0 }
+ *
+ * If we add a VERSION2 file format, all existing code is going to treat that
+ * as a VERSION0 file when reading it, and crash terribly when trying to read
+ * RAW_VERSION2 threadmap entries.
+ *
+ * To differentiate between a V1 and V1+ file, read as V1 until you reach
+ * the padding bytes. Then:
+ *
+ * boolean_t is_v1plus = FALSE;
+ * if (padding_bytes >= sizeof(kd_cpumap_header)) {
+ *     kd_cpumap_header header = // read header;
+ *     if (header.version_no == RAW_VERSION1) {
+ *         is_v1plus = TRUE;
+ *     }
+ * }
+ *
+ */
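
A reader-side sketch of the probe described above. The struct layouts (RAW_header, kd_threadmap, kd_cpumap_header) and round_page() are assumed from the public kdebug headers; error handling is elided:

RAW_header hdr;
fread(&hdr, sizeof(hdr), 1, f);
if (hdr.version_no != RAW_VERSION1) {
        rewind(f);      /* treat as RAW_VERSION0: threadmap count comes first */
} else {
        /* V1: thread map follows, then padding to a page boundary */
        long map_end = sizeof(hdr) + hdr.thread_count * sizeof(kd_threadmap);
        long padding = round_page(map_end) - map_end;
        bool is_v1plus = false;
        if (padding >= (long)sizeof(kd_cpumap_header)) {
                kd_cpumap_header chdr;
                fseek(f, map_end, SEEK_SET);
                fread(&chdr, sizeof(chdr), 1, f);
                is_v1plus = (chdr.version_no == RAW_VERSION1);
        }
}
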
+
+#define RAW_VERSION3    0x00001000
+
+// Version 3 header
+// The header chunk has the tag 0x00001000 which also serves as a magic word
+// that identifies the file as a version 3 trace file. The header payload is
+// a set of fixed fields followed by a variable number of sub-chunks:
+/*
+ *  ____________________________________________________________________________
+ | Offset | Size | Field                                                    |
+ |  ----------------------------------------------------------------------------
+ |    0   |  4   | Tag (0x00001000)                                         |
+ |    4   |  4   | Sub-tag. Represents the version of the header.           |
+ |    8   |  8   | Length of header payload (40+8x)                         |
+ |   16   |  8   | Time base info. Two 32-bit numbers, numer/denom,         |
+ |        |      | for converting timestamps to nanoseconds.                |
+ |   24   |  8   | Timestamp of trace start.                                |
+ |   32   |  8   | Wall time seconds since Unix epoch.                      |
+ |        |      | As returned by gettimeofday().                           |
+ |   40   |  4   | Wall time microseconds. As returned by gettimeofday().   |
+ |   44   |  4   | Local time zone offset in minutes. ( " )                 |
+ |   48   |  4   | Type of daylight savings time correction to apply. ( " ) |
+ |   52   |  4   | Flags. 1 = 64-bit. Remaining bits should be written      |
+ |        |      | as 0 and ignored when reading.                           |
+ |   56   |  8x  | Variable number of sub-chunks. None are required.        |
+ |        |      | Ignore unknown chunks.                                   |
+ |  ----------------------------------------------------------------------------
+ */
+// NOTE: The header sub-chunks are considered part of the header chunk,
+// so they must be included in the header chunk's length field.
+// The CPU map is an optional sub-chunk of the header chunk. It provides
+// information about the CPUs that are referenced from the trace events.
+typedef struct {
+       uint32_t tag;
+       uint32_t sub_tag;
+       uint64_t length;
+       uint32_t timebase_numer;
+       uint32_t timebase_denom;
+       uint64_t timestamp;
+       uint64_t walltime_secs;
+       uint32_t walltime_usecs;
+       uint32_t timezone_minuteswest;
+       uint32_t timezone_dst;
+       uint32_t flags;
+} __attribute__((packed)) kd_header_v3;
+
+typedef struct {
+       uint32_t tag;
+       uint32_t sub_tag;
+       uint64_t length;
+} __attribute__((packed)) kd_chunk_header_v3;
+
+#define V3_CONFIG       0x00001b00
+#define V3_CPU_MAP      0x00001c00
+#define V3_THREAD_MAP   0x00001d00
+#define V3_RAW_EVENTS   0x00001e00
+#define V3_NULL_CHUNK   0x00002000
+
+// The current version of all kernel managed chunks is 1. The
+// V3_CURRENT_CHUNK_VERSION is added to ease the simple case
+// when most/all the kernel managed chunks have the same version.
+
+#define V3_CURRENT_CHUNK_VERSION 1
+#define V3_HEADER_VERSION     V3_CURRENT_CHUNK_VERSION
+#define V3_CPUMAP_VERSION     V3_CURRENT_CHUNK_VERSION
+#define V3_THRMAP_VERSION     V3_CURRENT_CHUNK_VERSION
+#define V3_EVENT_DATA_VERSION V3_CURRENT_CHUNK_VERSION
+
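
A sketch of filling one kernel-managed sub-chunk header using the structures above, assuming length counts only the payload bytes that follow the 16-byte chunk header (consistent with the note that header sub-chunks count toward the header chunk's length):

static void
emit_thread_map_header_sketch(kd_chunk_header_v3 *ch, uint64_t payload_bytes)
{
        ch->tag     = V3_THREAD_MAP;
        ch->sub_tag = V3_THRMAP_VERSION;
        ch->length  = payload_bytes;    /* bytes of kd_threadmap[] that follow */
}
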
 typedef struct krt krt_t;
 
 static uint32_t
-kdbg_cpu_count(boolean_t early_trace)
+kdbg_cpu_count(bool early_trace)
 {
        if (early_trace) {
 #if CONFIG_EMBEDDED
@@ -518,7 +638,7 @@ kdbg_cpu_count(boolean_t early_trace)
 
 #if MACH_ASSERT
 #if CONFIG_EMBEDDED
-static boolean_t
+static bool
 kdbg_iop_list_is_valid(kd_iop_t* iop)
 {
        if (iop) {
@@ -526,7 +646,7 @@ kdbg_iop_list_is_valid(kd_iop_t* iop)
                kd_iop_t* temp = iop;
                do {
                        assert(!temp->next || temp->next->cpu_id == temp->cpu_id - 1);
-                       assert(temp->next || (temp->cpu_id == kdbg_cpu_count(FALSE) || temp->cpu_id == kdbg_cpu_count(TRUE)));
+                       assert(temp->next || (temp->cpu_id == kdbg_cpu_count(false) || temp->cpu_id == kdbg_cpu_count(true)));
                } while ((temp = temp->next));
 
                /* Does each entry have a function and a name? */
@@ -537,20 +657,20 @@ kdbg_iop_list_is_valid(kd_iop_t* iop)
                } while ((temp = temp->next));
        }
 
-       return TRUE;
+       return true;
 }
 
-static boolean_t
+static bool
 kdbg_iop_list_contains_cpu_id(kd_iop_t* list, uint32_t cpu_id)
 {
        while (list) {
                if (list->cpu_id == cpu_id) {
-                       return TRUE;
+                       return true;
                }
                list = list->next;
        }
 
-       return FALSE;
+       return false;
 }
 #endif /* CONFIG_EMBEDDED */
 #endif /* MACH_ASSERT */
@@ -564,16 +684,25 @@ kdbg_iop_list_callback(kd_iop_t* iop, kd_callback_type type, void* arg)
        }
 }
 
-static lck_grp_t      *kdebug_lck_grp      = NULL;
+static lck_grp_t *kdebug_lck_grp = NULL;
 
 static void
-kdbg_set_tracing_enabled(boolean_t enabled, uint32_t trace_type)
+kdbg_set_tracing_enabled(bool enabled, uint32_t trace_type)
 {
-       int s = ml_set_interrupts_enabled(FALSE);
+       /*
+        * Drain any events from IOPs before making the state change.  On
+        * enabling, this removes any stale events from before tracing.  On
+        * disabling, this saves any events up to the point tracing is disabled.
+        */
+       kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_SYNC_FLUSH,
+           NULL);
+
+       int s = ml_set_interrupts_enabled(false);
        lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
+
        if (enabled) {
                /*
-                * The oldest valid time is now; reject old events from IOPs.
+                * The oldest valid time is now; reject past events from IOPs.
                 */
                kd_ctrl_page.oldest_time = kdbg_timestamp();
                kdebug_enable |= trace_type;
@@ -590,22 +719,18 @@ kdbg_set_tracing_enabled(boolean_t enabled, uint32_t trace_type)
        ml_set_interrupts_enabled(s);
 
        if (enabled) {
-               kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_KDEBUG_ENABLED, NULL);
+               kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops,
+                   KD_CALLBACK_KDEBUG_ENABLED, NULL);
        } else {
-               /*
-                * If you do not flush the IOP trace buffers, they can linger
-                * for a considerable period; consider code which disables and
-                * deallocates without a final sync flush.
-                */
-               kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_KDEBUG_DISABLED, NULL);
-               kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_SYNC_FLUSH, NULL);
+               kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops,
+                   KD_CALLBACK_KDEBUG_DISABLED, NULL);
        }
 }
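
From the IOP side, the new ordering means a handler registered via kernel_debug_register_callback() sees SYNC_FLUSH before both the enabled and disabled notifications. A hypothetical handler body:

static void
my_iop_callback_sketch(void *context, kd_callback_type reason, void *arg)
{
#pragma unused(context, arg)
        switch (reason) {
        case KD_CALLBACK_SYNC_FLUSH:
                /* push buffered events now, e.g. via kernel_debug_enter() */
                break;
        case KD_CALLBACK_KDEBUG_ENABLED:
                /* begin emitting events */
                break;
        case KD_CALLBACK_KDEBUG_DISABLED:
                /* stop emitting events */
                break;
        default:
                break;
        }
}
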
 
 static void
-kdbg_set_flags(int slowflag, int enableflag, boolean_t enabled)
+kdbg_set_flags(int slowflag, int enableflag, bool enabled)
 {
-       int s = ml_set_interrupts_enabled(FALSE);
+       int s = ml_set_interrupts_enabled(false);
        lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
 
        if (enabled) {
@@ -623,11 +748,11 @@ kdbg_set_flags(int slowflag, int enableflag, boolean_t enabled)
 /*
  * Disable wrapping and return true if trace wrapped, false otherwise.
  */
-static boolean_t
+static bool
 disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags)
 {
-       boolean_t wrapped;
-       int s = ml_set_interrupts_enabled(FALSE);
+       bool wrapped;
+       int s = ml_set_interrupts_enabled(false);
        lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
 
        *old_slowcheck = kd_ctrl_page.kdebug_slowcheck;
@@ -646,7 +771,7 @@ disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags)
 static void
 enable_wrap(uint32_t old_slowcheck)
 {
-       int s = ml_set_interrupts_enabled(FALSE);
+       int s = ml_set_interrupts_enabled(false);
        lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
 
        kd_ctrl_page.kdebug_flags &= ~KDBG_NOWRAP;
@@ -660,7 +785,7 @@ enable_wrap(uint32_t old_slowcheck)
 }
 
 static int
-create_buffers(boolean_t early_trace)
+create_buffers(bool early_trace)
 {
        unsigned int i;
        unsigned int p_buffer_size;
@@ -769,7 +894,7 @@ create_buffers(boolean_t early_trace)
        for (i = 0; i < kd_ctrl_page.kdebug_cpus; i++) {
                kdbip[i].kd_list_head.raw = KDS_PTR_NULL;
                kdbip[i].kd_list_tail.raw = KDS_PTR_NULL;
-               kdbip[i].kd_lostevents = FALSE;
+               kdbip[i].kd_lostevents = false;
                kdbip[i].num_bufs = 0;
        }
 
@@ -828,7 +953,7 @@ release_storage_unit(int cpu, uint32_t kdsp_raw)
 
        kdsp.raw = kdsp_raw;
 
-       s = ml_set_interrupts_enabled(FALSE);
+       s = ml_set_interrupts_enabled(false);
        lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
 
        kdbp = &kdbip[cpu];
@@ -856,18 +981,17 @@ release_storage_unit(int cpu, uint32_t kdsp_raw)
        ml_set_interrupts_enabled(s);
 }
 
-
-boolean_t
+bool
 allocate_storage_unit(int cpu)
 {
        union kds_ptr kdsp;
        struct kd_storage *kdsp_actual, *kdsp_next_actual;
        struct kd_bufinfo *kdbp, *kdbp_vict, *kdbp_try;
        uint64_t oldest_ts, ts;
-       boolean_t retval = TRUE;
+       bool retval = true;
        int s = 0;
 
-       s = ml_set_interrupts_enabled(FALSE);
+       s = ml_set_interrupts_enabled(false);
        lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
 
        kdbp = &kdbip[cpu];
@@ -896,8 +1020,8 @@ allocate_storage_unit(int cpu)
                 */
                if (kd_ctrl_page.kdebug_flags & KDBG_NOWRAP) {
                        kd_ctrl_page.kdebug_slowcheck |= SLOW_NOLOG;
-                       kdbp->kd_lostevents = TRUE;
-                       retval = FALSE;
+                       kdbp->kd_lostevents = true;
+                       retval = false;
                        goto out;
                }
                kdbp_vict = NULL;
@@ -941,7 +1065,7 @@ allocate_storage_unit(int cpu)
                        kdebug_enable = 0;
                        kd_ctrl_page.enabled = 0;
                        commpage_update_kdebug_state();
-                       retval = FALSE;
+                       retval = false;
                        goto out;
                }
                kdsp = kdbp_vict->kd_list_head;
@@ -950,9 +1074,9 @@ allocate_storage_unit(int cpu)
 
                if (kdbp_vict->kd_list_head.raw != KDS_PTR_NULL) {
                        kdsp_next_actual = POINTER_FROM_KDS_PTR(kdbp_vict->kd_list_head);
-                       kdsp_next_actual->kds_lostevents = TRUE;
+                       kdsp_next_actual->kds_lostevents = true;
                } else {
-                       kdbp_vict->kd_lostevents = TRUE;
+                       kdbp_vict->kd_lostevents = true;
                }
 
                if (kd_ctrl_page.oldest_time < oldest_ts) {
@@ -966,7 +1090,7 @@ allocate_storage_unit(int cpu)
        kdsp_actual->kds_readlast = 0;
 
        kdsp_actual->kds_lostevents = kdbp->kd_lostevents;
-       kdbp->kd_lostevents = FALSE;
+       kdbp->kd_lostevents = false;
        kdsp_actual->kds_bufindx = 0;
 
        if (kdbp->kd_list_head.raw == KDS_PTR_NULL) {
@@ -995,7 +1119,7 @@ kernel_debug_register_callback(kd_callback_t callback)
                 * Remove when fixed.
                 */
                {
-                       boolean_t is_valid_name = FALSE;
+                       bool is_valid_name = false;
                        for (uint32_t length = 0; length < sizeof(callback.iop_name); ++length) {
                                /* This is roughly isprintable(c) */
                                if (callback.iop_name[length] > 0x20 && callback.iop_name[length] < 0x7F) {
@@ -1003,7 +1127,7 @@ kernel_debug_register_callback(kd_callback_t callback)
                                }
                                if (callback.iop_name[length] == 0) {
                                        if (length) {
-                                               is_valid_name = TRUE;
+                                               is_valid_name = true;
                                        }
                                        break;
                                }
@@ -1026,7 +1150,7 @@ kernel_debug_register_callback(kd_callback_t callback)
                         * TLDR; Must not read kd_iops more than once per loop.
                         */
                        iop->next = kd_iops;
-                       iop->cpu_id = iop->next ? (iop->next->cpu_id + 1) : kdbg_cpu_count(FALSE);
+                       iop->cpu_id = iop->next ? (iop->next->cpu_id + 1) : kdbg_cpu_count(false);
 
                        /*
                         * Header says OSCompareAndSwapPtr has a memory barrier
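
The single-read rule matters because registration is lock-free: snapshot the head once, derive the new node entirely from that snapshot, then publish it with a compare-and-swap and retry on contention. A sketch of the full loop (the hunk shows one iteration's body; the retry structure and the snapshot local are assumptions):

    kd_iop_t *snap;
    do {
            snap = kd_iops;                 /* read the shared head exactly once */
            iop->next = snap;               /* derive everything from the snapshot */
            iop->cpu_id = snap ? (snap->cpu_id + 1) : kdbg_cpu_count(false);
    } while (!OSCompareAndSwapPtr(snap, iop, (void * volatile *)&kd_iops));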
@@ -1128,7 +1252,7 @@ retry_q:
        }
 
        if (kdsp_actual == NULL || bindx >= EVENTS_PER_STORAGE_UNIT) {
-               if (allocate_storage_unit(coreid) == FALSE) {
+               if (allocate_storage_unit(coreid) == false) {
                        /*
                         * this can only happen if wrapping
                         * has been disabled
@@ -1308,7 +1432,7 @@ retry_q:
        }
 
        if (kdsp_actual == NULL || bindx >= EVENTS_PER_STORAGE_UNIT) {
-               if (allocate_storage_unit(cpu) == FALSE) {
+               if (allocate_storage_unit(cpu) == false) {
                        /*
                         * this can only happen if wrapping
                         * has been disabled
@@ -1357,6 +1481,7 @@ out1:
        }
 }
 
+__attribute__((noinline))
 void
 kernel_debug(
        uint32_t        debugid,
@@ -1370,6 +1495,7 @@ kernel_debug(
            (uintptr_t)thread_tid(current_thread()), 0);
 }
 
+__attribute__((noinline))
 void
 kernel_debug1(
        uint32_t        debugid,
@@ -1382,6 +1508,7 @@ kernel_debug1(
        kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5, 0);
 }
 
+__attribute__((noinline))
 void
 kernel_debug_flags(
        uint32_t debugid,
@@ -1395,6 +1522,7 @@ kernel_debug_flags(
            (uintptr_t)thread_tid(current_thread()), flags);
 }
 
+__attribute__((noinline))
 void
 kernel_debug_filtered(
        uint32_t debugid,
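
These four hunks add __attribute__((noinline)) to the thin kernel_debug* wrappers. One plausible motivation, offered as an assumption since the diff does not state it: a guaranteed call frame lets return-address-based attribution blame the wrapper's caller rather than an inlining site. Hypothetical illustration:

    __attribute__((noinline))
    static void
    trace_wrapper(uint32_t debugid)
    {
            /* with inlining suppressed, this is always the true caller */
            void *caller = __builtin_return_address(0);
            (void)caller; (void)debugid;
    }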
@@ -1560,7 +1688,7 @@ void
 kernel_debug_disable(void)
 {
        if (kdebug_enable) {
-               kdbg_set_tracing_enabled(FALSE, 0);
+               kdbg_set_tracing_enabled(false, 0);
        }
 }
 
@@ -1624,7 +1752,7 @@ kdebug_typefilter(__unused struct proc* p,
         * the first atomic load test of Global Typefilter Ptr, this function
         * can then safely use the remaining global state without atomic checks.
         */
-       if (!__c11_atomic_load((_Atomic typefilter_t *)&kdbg_typefilter, memory_order_acquire)) {
+       if (!os_atomic_load(&kdbg_typefilter, acquire)) {
                return EINVAL;
        }
 
@@ -1643,7 +1771,7 @@ kdebug_typefilter(__unused struct proc* p,
                VM_KERN_MEMORY_NONE,
                kdbg_typefilter_memory_entry,                           // port (memory entry!)
                0,                                                      // offset (in memory entry)
-               FALSE,                                                  // should copy
+               false,                                                  // should copy
                VM_PROT_READ,                                           // cur_prot
                VM_PROT_READ,                                           // max_prot
                VM_INHERIT_SHARE));                                     // inherit behavior on fork
@@ -1787,18 +1915,18 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr,
  * Trace system and scheduling events circumvent this check, as do events
  * emitted in interrupt context.
  */
-static boolean_t
+static bool
 kdebug_current_proc_enabled(uint32_t debugid)
 {
        /* can't determine current process in interrupt context */
        if (ml_at_interrupt_context()) {
-               return TRUE;
+               return true;
        }
 
        /* always emit trace system and scheduling events */
        if ((KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE ||
            (debugid & KDBG_CSC_MASK) == MACHDBG_CODE(DBG_MACH_SCHED, 0))) {
-               return TRUE;
+               return true;
        }
 
        if (kd_ctrl_page.kdebug_flags & KDBG_PIDCHECK) {
@@ -1806,52 +1934,58 @@ kdebug_current_proc_enabled(uint32_t debugid)
 
                /* only the process with the kdebug bit set is allowed */
                if (cur_proc && !(cur_proc->p_kdebug)) {
-                       return FALSE;
+                       return false;
                }
        } else if (kd_ctrl_page.kdebug_flags & KDBG_PIDEXCLUDE) {
                proc_t cur_proc = current_proc();
 
                /* every process except the one with the kdebug bit set is allowed */
                if (cur_proc && cur_proc->p_kdebug) {
-                       return FALSE;
+                       return false;
                }
        }
 
-       return TRUE;
+       return true;
 }
 
-boolean_t
+bool
 kdebug_debugid_enabled(uint32_t debugid)
 {
        /* if no filtering is enabled */
        if (!kd_ctrl_page.kdebug_slowcheck) {
-               return TRUE;
+               return true;
        }
 
        return kdebug_debugid_explicitly_enabled(debugid);
 }
 
-boolean_t
+bool
 kdebug_debugid_explicitly_enabled(uint32_t debugid)
 {
        if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) {
                return typefilter_is_debugid_allowed(kdbg_typefilter, debugid);
        } else if (KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE) {
-               return TRUE;
+               return true;
        } else if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) {
                if (debugid < kdlog_beg || debugid > kdlog_end) {
-                       return FALSE;
+                       return false;
                }
        } else if (kd_ctrl_page.kdebug_flags & KDBG_VALCHECK) {
                if ((debugid & KDBG_EVENTID_MASK) != kdlog_value1 &&
                    (debugid & KDBG_EVENTID_MASK) != kdlog_value2 &&
                    (debugid & KDBG_EVENTID_MASK) != kdlog_value3 &&
                    (debugid & KDBG_EVENTID_MASK) != kdlog_value4) {
-                       return FALSE;
+                       return false;
                }
        }
 
-       return TRUE;
+       return true;
+}
+
+bool
+kdebug_using_continuous_time(void)
+{
+       return kdebug_enable & KDEBUG_ENABLE_CONT_TIME;
 }
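
kdebug_using_continuous_time() lets timestamp producers pick the matching clock. A sketch of the selection it enables (the body here is an assumption; kdbg_timestamp() is referenced elsewhere in this diff, and mach_absolute_time()/mach_continuous_time() are the standard kernel timebases):

    static uint64_t
    kdbg_timestamp(void)
    {
            if (kdebug_using_continuous_time()) {
                    /* continuous time keeps advancing across system sleep */
                    return mach_continuous_time();
            }
            return mach_absolute_time();
    }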
 
 /*
@@ -2006,7 +2140,7 @@ kdbg_lock_init(void)
 }
 
 int
-kdbg_bootstrap(boolean_t early_trace)
+kdbg_bootstrap(bool early_trace)
 {
        kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED;
 
@@ -2014,7 +2148,7 @@ kdbg_bootstrap(boolean_t early_trace)
 }
 
 int
-kdbg_reinit(boolean_t early_trace)
+kdbg_reinit(bool early_trace)
 {
        int ret = 0;
 
@@ -2060,12 +2194,9 @@ kdbg_trace_data(struct proc *proc, long *arg_pid, long *arg_uniqueid)
 
 
 void
-kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4)
+kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3,
+    long *arg4)
 {
-       char *dbg_nameptr;
-       int dbg_namelen;
-       long dbg_parms[4];
-
        if (!proc) {
                *arg1 = 0;
                *arg2 = 0;
@@ -2073,26 +2204,22 @@ kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *a
                *arg4 = 0;
                return;
        }
-       /*
-        * Collect the pathname for tracing
-        */
-       dbg_nameptr = proc->p_comm;
-       dbg_namelen = (int)strlen(proc->p_comm);
-       dbg_parms[0] = 0L;
-       dbg_parms[1] = 0L;
-       dbg_parms[2] = 0L;
-       dbg_parms[3] = 0L;
 
-       if (dbg_namelen > (int)sizeof(dbg_parms)) {
-               dbg_namelen = (int)sizeof(dbg_parms);
+       const char *procname = proc_best_name(proc);
+       size_t namelen = strlen(procname);
+
+       long args[4] = { 0 };
+
+       if (namelen > sizeof(args)) {
+               namelen = sizeof(args);
        }
 
-       strncpy((char *)dbg_parms, dbg_nameptr, dbg_namelen);
+       strncpy((char *)args, procname, namelen);
 
-       *arg1 = dbg_parms[0];
-       *arg2 = dbg_parms[1];
-       *arg3 = dbg_parms[2];
-       *arg4 = dbg_parms[3];
+       *arg1 = args[0];
+       *arg2 = args[1];
+       *arg3 = args[2];
+       *arg4 = args[3];
 }
 
 static void
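
The rewrite above packs the (possibly truncated) process name straight into the four trace arguments. A worked example with a hypothetical name; strncpy() is asked for only namelen bytes, so the zero initializer supplies all padding:

    long args[4] = { 0 };                       /* 32 bytes on LP64 */
    strncpy((char *)args, "mediaserverd", 12);  /* 12-byte name, clamp is 32 */
    /* args[0] carries "mediaser" and args[1] "verd" (as raw little-endian
     * bytes); args[2] and args[3] stay zero. */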
@@ -2394,7 +2521,7 @@ kdbg_setpid(kd_regtype *kdr)
                                 */
                                kd_ctrl_page.kdebug_flags |= KDBG_PIDCHECK;
                                kd_ctrl_page.kdebug_flags &= ~KDBG_PIDEXCLUDE;
-                               kdbg_set_flags(SLOW_CHECKS, 0, TRUE);
+                               kdbg_set_flags(SLOW_CHECKS, 0, true);
 
                                p->p_kdebug = 1;
                        } else {
@@ -2436,7 +2563,7 @@ kdbg_setpidex(kd_regtype *kdr)
                                 */
                                kd_ctrl_page.kdebug_flags |= KDBG_PIDEXCLUDE;
                                kd_ctrl_page.kdebug_flags &= ~KDBG_PIDCHECK;
-                               kdbg_set_flags(SLOW_CHECKS, 0, TRUE);
+                               kdbg_set_flags(SLOW_CHECKS, 0, true);
 
                                p->p_kdebug = 1;
                        } else {
@@ -2490,7 +2617,7 @@ kdbg_initialize_typefilter(typefilter_t tf)
         * that any non-null kdbg_typefilter means a
         * valid memory_entry is available.
         */
-       __c11_atomic_store(((_Atomic typefilter_t*)&kdbg_typefilter), tf, memory_order_release);
+       os_atomic_store(&kdbg_typefilter, tf, release);
 
        return KERN_SUCCESS;
 }
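
This store-release pairs with the load-acquire in kdebug_typefilter() earlier in the diff: publish the fully built object last, and any consumer that observes a non-NULL pointer also sees its initialization. The pattern in isolation, using the os_atomic macros this commit adopts (the initialized field is hypothetical):

    /* producer: initialize first, publish last */
    tf->initialized = true;
    os_atomic_store(&kdbg_typefilter, tf, release);

    /* consumer: observe, then dereference */
    if (os_atomic_load(&kdbg_typefilter, acquire)) {
            /* everything written before the store-release is visible here */
    }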
@@ -2552,7 +2679,7 @@ kdbg_enable_typefilter(void)
        assert(kdbg_typefilter);
        kd_ctrl_page.kdebug_flags &= ~(KDBG_RANGECHECK | KDBG_VALCHECK);
        kd_ctrl_page.kdebug_flags |= KDBG_TYPEFILTER_CHECK;
-       kdbg_set_flags(SLOW_CHECKS, 0, TRUE);
+       kdbg_set_flags(SLOW_CHECKS, 0, true);
        commpage_update_kdebug_state();
 }
 
@@ -2567,9 +2694,9 @@ kdbg_disable_typefilter(void)
        kd_ctrl_page.kdebug_flags &= ~KDBG_TYPEFILTER_CHECK;
 
        if ((kd_ctrl_page.kdebug_flags & (KDBG_PIDCHECK | KDBG_PIDEXCLUDE))) {
-               kdbg_set_flags(SLOW_CHECKS, 0, TRUE);
+               kdbg_set_flags(SLOW_CHECKS, 0, true);
        } else {
-               kdbg_set_flags(SLOW_CHECKS, 0, FALSE);
+               kdbg_set_flags(SLOW_CHECKS, 0, false);
        }
        commpage_update_kdebug_state();
 
@@ -2613,7 +2740,7 @@ kdbg_setreg(kd_regtype * kdr)
                kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES;
                kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK;       /* Turn off specific value check  */
                kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE);
-               kdbg_set_flags(SLOW_CHECKS, 0, TRUE);
+               kdbg_set_flags(SLOW_CHECKS, 0, true);
                break;
        case KDBG_SUBCLSTYPE:
                val_1 = (kdr->value1 & 0xff);
@@ -2624,7 +2751,7 @@ kdbg_setreg(kd_regtype * kdr)
                kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES;
                kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK;       /* Turn off specific value check  */
                kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE);
-               kdbg_set_flags(SLOW_CHECKS, 0, TRUE);
+               kdbg_set_flags(SLOW_CHECKS, 0, true);
                break;
        case KDBG_RANGETYPE:
                kdlog_beg = (kdr->value1);
@@ -2632,7 +2759,7 @@ kdbg_setreg(kd_regtype * kdr)
                kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES;
                kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK;       /* Turn off specific value check  */
                kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE);
-               kdbg_set_flags(SLOW_CHECKS, 0, TRUE);
+               kdbg_set_flags(SLOW_CHECKS, 0, true);
                break;
        case KDBG_VALCHECK:
                kdlog_value1 = (kdr->value1);
@@ -2642,7 +2769,7 @@ kdbg_setreg(kd_regtype * kdr)
                kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES;
                kd_ctrl_page.kdebug_flags &= ~KDBG_RANGECHECK;    /* Turn off range check */
                kd_ctrl_page.kdebug_flags |= KDBG_VALCHECK;       /* Turn on specific value check  */
-               kdbg_set_flags(SLOW_CHECKS, 0, TRUE);
+               kdbg_set_flags(SLOW_CHECKS, 0, true);
                break;
        case KDBG_TYPENONE:
                kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES;
@@ -2650,9 +2777,9 @@ kdbg_setreg(kd_regtype * kdr)
                if ((kd_ctrl_page.kdebug_flags & (KDBG_RANGECHECK | KDBG_VALCHECK   |
                    KDBG_PIDCHECK   | KDBG_PIDEXCLUDE |
                    KDBG_TYPEFILTER_CHECK))) {
-                       kdbg_set_flags(SLOW_CHECKS, 0, TRUE);
+                       kdbg_set_flags(SLOW_CHECKS, 0, true);
                } else {
-                       kdbg_set_flags(SLOW_CHECKS, 0, FALSE);
+                       kdbg_set_flags(SLOW_CHECKS, 0, false);
                }
 
                kdlog_beg = 0;
@@ -2705,25 +2832,7 @@ write_error:
        return ret;
 }
 
-int
-kdbg_write_v3_chunk_header_to_buffer(void * buffer, uint32_t tag, uint32_t sub_tag, uint64_t length)
-{
-       kd_chunk_header_v3 header = {
-               .tag = tag,
-               .sub_tag = sub_tag,
-               .length = length,
-       };
-
-       if (!buffer) {
-               return 0;
-       }
-
-       memcpy(buffer, &header, sizeof(kd_chunk_header_v3));
-
-       return sizeof(kd_chunk_header_v3);
-}
-
-int
+static int
 kdbg_write_v3_chunk_to_fd(uint32_t tag, uint32_t sub_tag, uint64_t length, void *payload, uint64_t payload_size, int fd)
 {
        proc_t p;
@@ -3006,7 +3115,7 @@ kdbg_readcurthrmap(user_addr_t buffer, size_t *bufsize)
 }
 
 static int
-kdbg_write_v1_header(boolean_t write_thread_map, vnode_t vp, vfs_context_t ctx)
+kdbg_write_v1_header(bool write_thread_map, vnode_t vp, vfs_context_t ctx)
 {
        int ret = 0;
        RAW_header header;
@@ -3175,7 +3284,7 @@ static int
 kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx)
 {
        int ret = 0;
-       boolean_t map_initialized;
+       bool map_initialized;
 
        ktrace_assert_lock_held();
        assert(ctx != NULL);
@@ -3204,7 +3313,7 @@ kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx)
 static int
 kdbg_copyout_thread_map(user_addr_t buffer, size_t *buffer_size)
 {
-       boolean_t map_initialized;
+       bool map_initialized;
        size_t map_size;
        int ret = 0;
 
@@ -3233,7 +3342,7 @@ int
 kdbg_readthrmap_v3(user_addr_t buffer, size_t buffer_size, int fd)
 {
        int ret = 0;
-       boolean_t map_initialized;
+       bool map_initialized;
        size_t map_size;
 
        ktrace_assert_lock_held();
@@ -3278,8 +3387,8 @@ kdbg_set_nkdbufs(unsigned int req_nkdbufs)
  *
  * Called with `ktrace_lock` locked and interrupts enabled.
  */
-static boolean_t
-kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait)
+static bool
+kdbg_wait(uint64_t timeout_ms, bool locked_wait)
 {
        int wait_result = THREAD_AWAKENED;
        uint64_t abstime = 0;
@@ -3292,7 +3401,7 @@ kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait)
                clock_absolutetime_interval_to_deadline(abstime, &abstime);
        }
 
-       boolean_t s = ml_set_interrupts_enabled(FALSE);
+       bool s = ml_set_interrupts_enabled(false);
        if (!s) {
                panic("kdbg_wait() called with interrupts disabled");
        }
@@ -3317,7 +3426,7 @@ kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait)
        }
 
        /* check the count under the spinlock */
-       boolean_t threshold_exceeded = (kd_ctrl_page.kds_inuse_count >= n_storage_threshold);
+       bool threshold_exceeded = (kd_ctrl_page.kds_inuse_count >= n_storage_threshold);
 
        lck_spin_unlock(kdw_spin_lock);
        ml_set_interrupts_enabled(s);
@@ -3338,7 +3447,7 @@ kdbg_wait(uint64_t timeout_ms, boolean_t locked_wait)
 static void
 kdbg_wakeup(void)
 {
-       boolean_t need_kds_wakeup = FALSE;
+       bool need_kds_wakeup = false;
 
        /*
         * Try to take the lock here to synchronize with the waiter entering
@@ -3348,20 +3457,20 @@ kdbg_wakeup(void)
         * conditions.  No problem if we fail, there will be lots of additional
         * events coming in that will eventually succeed in grabbing this lock.
         */
-       boolean_t s = ml_set_interrupts_enabled(FALSE);
+       bool s = ml_set_interrupts_enabled(false);
 
        if (lck_spin_try_lock(kdw_spin_lock)) {
                if (kds_waiter &&
                    (kd_ctrl_page.kds_inuse_count >= n_storage_threshold)) {
                        kds_waiter = 0;
-                       need_kds_wakeup = TRUE;
+                       need_kds_wakeup = true;
                }
                lck_spin_unlock(kdw_spin_lock);
        }
 
        ml_set_interrupts_enabled(s);
 
-       if (need_kds_wakeup == TRUE) {
+       if (need_kds_wakeup == true) {
                wakeup(&kds_waiter);
        }
 }
@@ -3493,7 +3602,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep)
                        }
                        kdbg_thrmap_init();
 
-                       kdbg_set_tracing_enabled(TRUE, value);
+                       kdbg_set_tracing_enabled(true, value);
                } else {
                        if (!kdebug_enable) {
                                break;
@@ -3508,7 +3617,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep)
                break;
 
        case KERN_KDSETUP:
-               ret = kdbg_reinit(FALSE);
+               ret = kdbg_reinit(false);
                break;
 
        case KERN_KDREMOVE:
@@ -3548,7 +3657,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep)
                int     fd;
 
                if (name[0] == KERN_KDWRITETR || name[0] == KERN_KDWRITETR_V3) {
-                       (void)kdbg_wait(size, TRUE);
+                       (void)kdbg_wait(size, true);
                }
                p = current_proc();
                fd = value;
@@ -3601,7 +3710,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep)
                break;
        }
        case KERN_KDBUFWAIT:
-               *sizep = kdbg_wait(size, FALSE);
+               *sizep = kdbg_wait(size, false);
                break;
 
        case KERN_KDPIDTR:
@@ -3683,8 +3792,8 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin
        uint32_t tempbuf_number;
        uint32_t old_kdebug_flags;
        uint32_t old_kdebug_slowcheck;
-       boolean_t out_of_events = FALSE;
-       boolean_t wrapped = FALSE;
+       bool out_of_events = false;
+       bool wrapped = false;
 
        assert(number);
        count = *number / sizeof(kd_buf);
@@ -3701,22 +3810,19 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin
        memset(&lostevent, 0, sizeof(lostevent));
        lostevent.debugid = TRACE_LOST_EVENTS;
 
-       /*
-        * Capture the current time. Only sort events that have occured
-        * before now.  Since the IOPs are being flushed here, it is possible
-        * that events occur on the AP while running live tracing. If we are
-        * disabled, no new events should occur on the AP.
-        */
-       if (kd_ctrl_page.enabled) {
-               barrier_max = kdbg_timestamp() & KDBG_TIMESTAMP_MASK;
-       }
-
        /*
         * Request each IOP to provide us with up to date entries before merging
         * buffers together.
         */
        kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, KD_CALLBACK_SYNC_FLUSH, NULL);
 
+       /*
+        * Capture the current time.  Only sort events that have occurred
+        * before now.  Since the IOPs are being flushed here, it is possible
+        * that events occur on the AP while running live tracing.
+        */
+       barrier_max = kdbg_timestamp() & KDBG_TIMESTAMP_MASK;
+
        /*
         * Disable wrap so storage units cannot be stolen out from underneath us
         * while merging events.
@@ -3749,7 +3855,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin
                                continue;
                        }
                        kdsp_actual = POINTER_FROM_KDS_PTR(kdsp);
-                       kdsp_actual->kds_lostevents = FALSE;
+                       kdsp_actual->kds_lostevents = false;
                }
        }
        /*
@@ -3771,7 +3877,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin
                         */
                        kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, 0);
                        *tempbuf = lostevent;
-                       wrapped = FALSE;
+                       wrapped = false;
                        goto nextevent;
                }
 
@@ -3809,7 +3915,7 @@ next_event:
                                 */
                                if (kdsp_actual->kds_lostevents) {
                                        lostevents = true;
-                                       kdsp_actual->kds_lostevents = FALSE;
+                                       kdsp_actual->kds_lostevents = false;
 
                                        /*
                                         * The earliest event we can trust is the first one in this
@@ -3831,7 +3937,7 @@ next_event:
 
                                t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]);
 
-                               if ((t > barrier_max) && (barrier_max > 0)) {
+                               if (t > barrier_max) {
                                        if (kdbg_debug) {
                                                printf("kdebug: FUTURE EVENT: debugid %#8x: "
                                                    "time %lld from CPU %u "
@@ -3839,12 +3945,7 @@ next_event:
                                                    kdsp_actual->kds_records[rcursor].debugid,
                                                    t, cpu, barrier_max, *number + tempbuf_number);
                                        }
-                                       /*
-                                        * Need to flush IOPs again before we can sort any more
-                                        * data from the buffers.
-                                        */
-                                       out_of_events = TRUE;
-                                       break;
+                                       goto next_cpu;
                                }
                                if (t < kdsp_actual->kds_timestamp) {
                                        /*
@@ -3857,7 +3958,7 @@ next_event:
                                         * Bail out so we don't get out-of-order events by
                                         * continuing to read events from other CPUs' events.
                                         */
-                                       out_of_events = TRUE;
+                                       out_of_events = true;
                                        break;
                                }
 
@@ -3867,6 +3968,13 @@ next_event:
                                 */
                                if (t < barrier_min) {
                                        kdsp_actual->kds_readlast++;
+                                       if (kdbg_debug) {
+                                               printf("kdebug: PAST EVENT: debugid %#8x: "
+                                                   "time %lld from CPU %u "
+                                                   "(barrier at time %lld)\n",
+                                                   kdsp_actual->kds_records[rcursor].debugid,
+                                                   t, cpu, barrier_min);
+                                       }
 
                                        if (kdsp_actual->kds_readlast >= EVENTS_PER_STORAGE_UNIT) {
                                                release_storage_unit(cpu, kdsp.raw);
@@ -3906,7 +4014,7 @@ next_event:
                        }
                        if (min_kdbp == NULL) {
                                /* All buffers ran empty. */
-                               out_of_events = TRUE;
+                               out_of_events = true;
                        }
                        if (out_of_events) {
                                break;
@@ -4000,7 +4108,7 @@ check_error:
                        count   -= tempbuf_number;
                        *number += tempbuf_number;
                }
-               if (out_of_events == TRUE) {
+               if (out_of_events == true) {
                        /*
                         * all trace buffers are empty
                         */
@@ -4018,13 +4126,37 @@ check_error:
        return error;
 }
 
+#define KDEBUG_TEST_CODE(code) BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, (code))
+
+/*
+ * A test IOP for the SYNC_FLUSH callback.
+ */
+
+static int sync_flush_iop = 0;
+
+static void
+sync_flush_callback(void * __unused context, kd_callback_type reason,
+    void * __unused arg)
+{
+       assert(sync_flush_iop > 0);
+
+       if (reason == KD_CALLBACK_SYNC_FLUSH) {
+               kernel_debug_enter(sync_flush_iop, KDEBUG_TEST_CODE(0xff),
+                   kdbg_timestamp(), 0, 0, 0, 0, 0);
+       }
+}
+
+static struct kd_callback sync_flush_kdcb = {
+       .func = sync_flush_callback,
+       .iop_name = "test_sf",
+};
+
 static int
 kdbg_test(size_t flavor)
 {
        int code = 0;
        int dummy_iop = 0;
 
-#define KDEBUG_TEST_CODE(code) BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, (code))
        switch (flavor) {
        case 1:
                /* try each macro */
@@ -4067,25 +4199,40 @@ kdbg_test(size_t flavor)
 
                /* ensure old timestamps are not emitted from kernel_debug_enter */
                kernel_debug_enter(dummy_iop, KDEBUG_TEST_CODE(code),
-                   100 /* very old timestamp */, 0, 0, 0,
-                   0, (uintptr_t)thread_tid(current_thread()));
+                   100 /* very old timestamp */, 0, 0, 0, 0, 0);
                code++;
                kernel_debug_enter(dummy_iop, KDEBUG_TEST_CODE(code),
-                   kdbg_timestamp(), 0, 0, 0, 0,
-                   (uintptr_t)thread_tid(current_thread()));
+                   kdbg_timestamp(), 0, 0, 0, 0, 0);
                code++;
                break;
 
+       case 3:
+               if (kd_ctrl_page.kdebug_iops) {
+                       dummy_iop = kd_ctrl_page.kdebug_iops[0].cpu_id;
+               }
+               kernel_debug_enter(dummy_iop, KDEBUG_TEST_CODE(code),
+                   kdbg_timestamp() * 2 /* deliberately a future timestamp */, 0, 0, 0, 0, 0);
+               break;
+
+       case 4:
+               if (!sync_flush_iop) {
+                       sync_flush_iop = kernel_debug_register_callback(
+                               sync_flush_kdcb);
+                       assert(sync_flush_iop > 0);
+               }
+               break;
+
        default:
                return ENOTSUP;
        }
-#undef KDEBUG_TEST_CODE
 
        return 0;
 }
 
+#undef KDEBUG_TEST_CODE
+
 void
-kdebug_init(unsigned int n_events, char *filter_desc, boolean_t wrapping)
+kdebug_init(unsigned int n_events, char *filter_desc, bool wrapping)
 {
        assert(filter_desc != NULL);
 
@@ -4105,7 +4252,7 @@ kdebug_init(unsigned int n_events, char *filter_desc, boolean_t wrapping)
                n_events = 200000;
        }
 
-       kdebug_trace_start(n_events, filter_desc, wrapping, FALSE);
+       kdebug_trace_start(n_events, filter_desc, wrapping, false);
 }
 
 static void
@@ -4179,7 +4326,7 @@ kdbg_set_typefilter_string(const char *filter_desc)
  */
 void
 kdebug_trace_start(unsigned int n_events, const char *filter_desc,
-    boolean_t wrapping, boolean_t at_wake)
+    bool wrapping, bool at_wake)
 {
        if (!n_events) {
                kd_early_done = true;
@@ -4196,7 +4343,7 @@ kdebug_trace_start(unsigned int n_events, const char *filter_desc,
 
        kernel_debug_string_early("start_kern_tracing");
 
-       if (kdbg_reinit(TRUE)) {
+       if (kdbg_reinit(true)) {
                printf("error from kdbg_reinit, kernel tracing not started\n");
                goto out;
        }
@@ -4221,13 +4368,13 @@ kdebug_trace_start(unsigned int n_events, const char *filter_desc,
         * Hold off interrupts between getting a thread map and enabling trace
         * and until the early traces are recorded.
         */
-       boolean_t s = ml_set_interrupts_enabled(FALSE);
+       bool s = ml_set_interrupts_enabled(false);
 
        if (at_wake) {
                kdbg_thrmap_init();
        }
 
-       kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE | (kdebug_serial ?
+       kdbg_set_tracing_enabled(true, KDEBUG_ENABLE_TRACE | (kdebug_serial ?
            KDEBUG_ENABLE_SERIAL : 0));
 
        if (!at_wake) {
@@ -4369,7 +4516,7 @@ binary_search(uint32_t id)
        low = 0;
        high = (int)(sizeof(kd_events) / sizeof(kd_event_t)) - 1;
 
-       while (TRUE) {
+       while (true) {
                mid = (low + high) / 2;
 
                if (low > high) {
index ee523dff5807d13bca633d7cce52eb3584232735..3ad8a516da753807a270b500fe27aabc79135dbe 100644 (file)
@@ -1339,6 +1339,44 @@ out:
        return result;
 }
 
+/*
+ * validate user_sigevent.  at this point we only support
+ * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
+ * sigev_value, sigev_notify_function, and sigev_notify_attributes
+ * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
+ * with no [RTS] (Realtime Signal) option group support.
+ */
+static int
+aio_sigev_validate( const struct user_sigevent *sigev )
+{
+       switch (sigev->sigev_notify) {
+       case SIGEV_SIGNAL:
+       {
+               int signum;
+
+               /* make sure we have a valid signal number */
+               signum = sigev->sigev_signo;
+               if (signum <= 0 || signum >= NSIG ||
+                   signum == SIGKILL || signum == SIGSTOP) {
+                       return EINVAL;
+               }
+       }
+       break;
+
+       case SIGEV_NONE:
+               break;
+
+       case SIGEV_THREAD:
+       /* Unsupported [RTS] */
+
+       default:
+               return EINVAL;
+       }
+
+       return 0;
+}
+
+
 /*
  * aio_enqueue_work
  *
@@ -1517,6 +1555,10 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
                if (call_result) {
                        goto ExitRoutine;
                }
+               call_result = aio_sigev_validate(&aiosigev);
+               if (call_result) {
+                       goto ExitRoutine;
+               }
        }
 
        /* process list of aio requests */
@@ -1603,9 +1645,9 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
                    0 );
        }
 
+       aio_proc_lock_spin(p);
        switch (uap->mode) {
        case LIO_WAIT:
-               aio_proc_lock_spin(p);
                while (lio_context->io_completed < lio_context->io_issued) {
                        result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);
 
@@ -1622,12 +1664,16 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
                        free_context = TRUE;
                }
 
-               aio_proc_unlock(p);
                break;
 
        case LIO_NOWAIT:
+               /* If no IOs were issued, we must free the context (rdar://problem/45717887) */
+               if (lio_context->io_issued == 0) {
+                       free_context = TRUE;
+               }
                break;
        }
+       aio_proc_unlock(p);
 
        /* call_result == -1 means we had no trouble queueing up requests */
        if (call_result == -1) {
@@ -2128,35 +2174,9 @@ aio_validate( aio_workq_entry *entryp )
                }
        }
 
-       /*
-        * validate aiocb.aio_sigevent.  at this point we only support
-        * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
-        * sigev_value, sigev_notify_function, and sigev_notify_attributes
-        * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
-        * with no [RTS] (RalTime Signal) option group support.
-        */
-       switch (entryp->aiocb.aio_sigevent.sigev_notify) {
-       case SIGEV_SIGNAL:
-       {
-               int             signum;
-
-               /* make sure we have a valid signal number */
-               signum = entryp->aiocb.aio_sigevent.sigev_signo;
-               if (signum <= 0 || signum >= NSIG ||
-                   signum == SIGKILL || signum == SIGSTOP) {
-                       return EINVAL;
-               }
-       }
-       break;
-
-       case SIGEV_NONE:
-               break;
-
-       case SIGEV_THREAD:
-       /* Unsupported [RTS] */
-
-       default:
-               return EINVAL;
+       result = aio_sigev_validate(&entryp->aiocb.aio_sigevent);
+       if (result) {
+               return result;
        }
 
        /* validate the file descriptor and that the file was opened
index 09e1cd059134f10ccab62da19bce38e5ba052776..a005c1055cdd9b459567350ad0b1c1f4ba739ab8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 /* Function to print input values as key-value pairs in format
  * identifiable by Apple system log (ASL) facility. All key-value pairs
- * are assumed to be pointer to strings and are provided using two ways -
- * (a) va_list argument which is a list of varying number of arguments
- *     created by the caller of this function.
- * (b) variable number of arguments passed to this function.
+ * are assumed to be pointers to strings and are provided via the va_list
+ * argument, a list of a varying number of arguments created by the
+ * caller of this function.
  *
  * Parameters -
  *      level     - Priority level for this ASL message
  *     facility  - Facility for this ASL message.
  *     num_pairs - Number of key-value pairs provided by vargs argument.
  *     vargs     - List of key-value pairs.
- *     ...       - Additional key-value pairs (apart from vargs) as variable
- *                 argument list.  A NULL value indicates the end of the
- *                 variable argument list.
  *
  * Returns -
 *     zero    - On success, when it prints all key-value pairs provided.
  *     E2BIG   - When it cannot print all key-value pairs provided and had
  *               to truncate the output.
  */
-int
-kern_asl_msg_va(int level, const char *facility, int num_pairs, va_list vargs, ...)
+static int
+kern_asl_msg_va(int level, const char *facility, int num_pairs, va_list vargs)
 {
        int err = 0;
        char fmt[MAX_FMT_LEN];  /* Format string to use with vaddlog */
        int calc_pairs = 0;
        size_t len;
        int i;
-       va_list ap;
-       char *ptr;
 
        /* Mask extra bits, if any, from priority level */
        level = LOG_PRI(level);
@@ -130,60 +124,6 @@ kern_asl_msg_va(int level, const char *facility, int num_pairs, va_list vargs, .
                (void) strlcat(fmt, KASL_KEYVAL_FMT, len);
        }
 
-       /* Count number of variable arguments provided to this function
-        * and determine total number of key-value pairs.
-        */
-       calc_pairs = 0;
-       va_start(ap, vargs);
-       ptr = va_arg(ap, char *);
-       while (ptr) {
-               calc_pairs++;
-               ptr = va_arg(ap, char *);
-       }
-       calc_pairs /= 2;
-       va_end(ap);
-
-       /* If user provided variable number of arguments, append them as
-        * as real key-value "[k v]" into the format string.  If the format
-        * string is too small, ignore the key-value pair completely.
-        */
-       if (calc_pairs) {
-               char *key, *val;
-               size_t pairlen;
-               int offset;
-
-               /* Calculate bytes available for key-value pairs after reserving
-                * bytes for newline character and NULL terminator
-                */
-               len = MAX_FMT_LEN - strlen(fmt) - KASL_NEWLINE_CHAR_LEN - 1;
-               offset = strlen(fmt);
-
-               va_start(ap, vargs);
-               for (i = 0; i < calc_pairs; i++) {
-                       key = va_arg(ap, char *);
-                       val = va_arg(ap, char *);
-
-                       /* Calculate bytes required to store next key-value pair
-                        * as "[key val] " including space for '[', ']', and
-                        * two spaces.
-                        */
-                       pairlen = strlen(key) + strlen(val) + 4;
-                       if (pairlen > len) {
-                               err = E2BIG;
-                               break;
-                       }
-
-                       /* len + 1 because one byte has been set aside for NULL
-                        * terminator in calculation of 'len' above
-                        */
-                       snprintf((fmt + offset), len + 1, KASL_KEYVAL_FMT,
-                           key, val);
-                       offset += pairlen;
-                       len -= pairlen;
-               }
-               va_end(ap);
-       }
-
        /* Append newline */
        (void) strlcat(fmt, KASL_NEWLINE_CHAR, MAX_FMT_LEN);
 
@@ -208,7 +148,7 @@ kern_asl_msg(int level, const char *facility, int num_pairs, ...)
 
        va_start(ap, num_pairs);
        err = kern_asl_msg_va(level, facility,
-           num_pairs, ap, NULL);
+           num_pairs, ap);
        va_end(ap);
 
        return err;
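
With the variadic tail removed from kern_asl_msg_va(), every pair now arrives through this wrapper. A hypothetical call (facility, keys, and values invented for illustration; num_pairs counts pairs, so two pairs means four string arguments):

    kern_asl_msg(LOG_ERR, "com.example.fs", 2,
        "ErrNo", "35", "Op", "read");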
index 29329bf7797c60928c16cf7bc3878519b9ab935c..d5b5ca727bb4c79849d4a5b49f43a18303ce48bc 100644 (file)
@@ -57,7 +57,6 @@ backtrace_sysctl SYSCTL_HANDLER_ARGS
        uint32_t bt_len = 0, bt_filled = 0;
        size_t bt_size = 0;
        int error = 0;
-       bool user_64 = false;
 
        if (type != BACKTRACE_USER) {
                return EINVAL;
@@ -74,7 +73,7 @@ backtrace_sysctl SYSCTL_HANDLER_ARGS
                return ENOBUFS;
        }
        memset(bt, 0, bt_size);
-       error = backtrace_user(bt, bt_len, &bt_filled, &user_64);
+       error = backtrace_user(bt, bt_len, &bt_filled, NULL, NULL);
        if (error) {
                goto out;
        }
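
backtrace_user() has grown two optional out-parameters, declined here with NULL. A caller that still wants the information can pass pointers; judging from the deleted user_64 local, the fourth reports the task's word size, and the names below are assumptions:

    bool user_64 = false, truncated = false;
    error = backtrace_user(bt, bt_len, &bt_filled, &user_64, &truncated);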
index fd776ee2f0316320d905e9c8fdf1679e7e806346..a9c778a6458cf445e6f54c010ec78cef844a83c2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <sys/vm.h>
 #include <sys/sysctl.h>
 
-#ifdef GPROF
-#include <sys/gmon.h>
-#endif
-
 #include <kern/thread.h>
 #include <kern/ast.h>
 #include <kern/assert.h>
@@ -332,61 +328,6 @@ tvtohz(struct timeval *tv)
        return (int)ticks;
 }
 
-
-/*
- * Start profiling on a process.
- *
- * Kernel profiling passes kernel_proc which never exits and hence
- * keeps the profile clock running constantly.
- */
-void
-startprofclock(struct proc *p)
-{
-       if ((p->p_flag & P_PROFIL) == 0) {
-               OSBitOrAtomic(P_PROFIL, &p->p_flag);
-       }
-}
-
-/*
- * Stop profiling on a process.
- */
-void
-stopprofclock(struct proc *p)
-{
-       if (p->p_flag & P_PROFIL) {
-               OSBitAndAtomic(~((uint32_t)P_PROFIL), &p->p_flag);
-       }
-}
-
-/* TBD locking user profiling is not resolved yet */
-void
-bsd_uprofil(struct time_value *syst, user_addr_t pc)
-{
-       struct proc *p = current_proc();
-       int             ticks;
-       struct timeval  *tv;
-       struct timeval st;
-
-       if (p == NULL) {
-               return;
-       }
-       if (!(p->p_flag & P_PROFIL)) {
-               return;
-       }
-
-       st.tv_sec = syst->seconds;
-       st.tv_usec = syst->microseconds;
-
-       tv = &(p->p_stats->p_ru.ru_stime);
-
-       ticks = ((tv->tv_sec - st.tv_sec) * 1000 +
-           (tv->tv_usec - st.tv_usec) / 1000) /
-           (tick / 1000);
-       if (ticks) {
-               addupc_task(p, pc, ticks);
-       }
-}
-
 /* TBD locking user profiling is not resolved yet */
 void
 get_procrustime(time_value_t *tv)
index 34b65a24fc57664a6f7a199be32db48129d76347..643b1cebbdafe3fcd4d33edc90d3874c82981648 100644 (file)
@@ -95,7 +95,7 @@ void mach_kauth_cred_uthread_update( void );
 #endif
 
 # define K_UUID_FMT "%08x:%08x:%08x:%08x"
-# define K_UUID_ARG(_u) *(int *)&_u.g_guid[0],*(int *)&_u.g_guid[4],*(int *)&_u.g_guid[8],*(int *)&_u.g_guid[12]
+# define K_UUID_ARG(_u) &_u.g_guid_asint[0],&_u.g_guid_asint[1],&_u.g_guid_asint[2],&_u.g_guid_asint[3]
 # define KAUTH_DEBUG(fmt, args...)      do { printf("%s:%d: " fmt "\n", __PRETTY_FUNCTION__, __LINE__ , ##args); } while (0)
 #endif
 
@@ -1089,7 +1089,7 @@ kauth_resolver_complete(user_addr_t message)
                                } else if (extl.el_flags &  (KAUTH_EXTLOOKUP_VALID_PWNAM | KAUTH_EXTLOOKUP_VALID_GRNAM)) {
                                        error = EFAULT;
                                        KAUTH_DEBUG("RESOLVER - resolver returned mismatching extension flags (%d), request contained (%d)",
-                                           extl.el_flags, request_flags);
+                                           extl.el_flags, want_extend_data);
                                }
 
                                /*
@@ -4744,7 +4744,7 @@ kauth_proc_setlabel(__unused struct proc *p, __unused void *label)
 
 #define KAUTH_CRED_REF_MAX 0x0ffffffful
 
-__attribute__((noinline, cold, not_tail_called, noreturn))
+__attribute__((noinline, cold, noreturn))
 static void
 kauth_cred_panic_resurrection(kauth_cred_t cred)
 {
@@ -4752,7 +4752,7 @@ kauth_cred_panic_resurrection(kauth_cred_t cred)
        __builtin_unreachable();
 }
 
-__attribute__((noinline, cold, not_tail_called, noreturn))
+__attribute__((noinline, cold, noreturn))
 static void
 kauth_cred_panic_over_released(kauth_cred_t cred)
 {
@@ -4760,7 +4760,7 @@ kauth_cred_panic_over_released(kauth_cred_t cred)
        __builtin_unreachable();
 }
 
-__attribute__((noinline, cold, not_tail_called, noreturn))
+__attribute__((noinline, cold, noreturn))
 static void
 kauth_cred_panic_over_retain(kauth_cred_t cred)
 {
index c6ab1e5bf086fef758f2c295acde1b4007112f3f..4a9fbc3ffddd784274a9ec31ef37b32e2aa399e4 100644 (file)
@@ -224,6 +224,9 @@ cs_allow_invalid(struct proc *p)
        }
        proc_unlock(p);
 
+       /* allow a debugged process to hide some (debug-only!) memory */
+       task_set_memory_ownership_transfer(p->task, TRUE);
+
        vm_map_switch_protect(get_task_map(p->task), FALSE);
 #endif
        return (p->p_csflags & (CS_KILL | CS_HARD)) == 0;
@@ -1137,6 +1140,39 @@ cs_entitlements_blob_get(proc_t p, void **out_start, size_t *out_length)
        return csblob_get_entitlements(csblob, out_start, out_length);
 }
 
+
+/* Retrieve the cached entitlements for a process
+ * Returns:
+ *   EINVAL    no text vnode associated with the process
+ *   EBADEXEC   invalid code signing data
+ *   0         no error occurred
+ *
+ * Note: the entitlements may be NULL if there is nothing cached.
+ */
+
+int
+cs_entitlements_dictionary_copy(proc_t p, void **entitlements)
+{
+       struct cs_blob *csblob;
+
+       *entitlements = NULL;
+
+       if ((p->p_csflags & CS_SIGNED) == 0) {
+               return 0;
+       }
+
+       if (NULL == p->p_textvp) {
+               return EINVAL;
+       }
+
+       if ((csblob = ubc_cs_blob_get(p->p_textvp, -1, p->p_textoff)) == NULL) {
+               return 0;
+       }
+
+       *entitlements = csblob_entitlements_dictionary_copy(csblob);
+       return 0;
+}
+
 /* Retrieve the codesign identity for a process.
  * Returns:
 *   NULL      an error occurred
index 9d68de20e1b7dae4003c3b825ac37855fdb2495f..320c27b2c1be8fe18547a807098907c6df8f2038 100644 (file)
 #include <security/mac_framework.h>
 #endif
 
+#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1
 kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t,
-    mach_msg_type_name_t, ipc_port_t *);
+    mach_msg_type_name_t, ipc_port_t *, mach_port_context_t, mach_msg_guard_flags_t *, uint32_t);
 void ipc_port_release_send(ipc_port_t);
 
 struct psemnode;
@@ -908,7 +909,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                        error = 0;
                        goto out;
                }
-               error = fo_ioctl(fp, (int)TIOCGPGRP, (caddr_t)retval, &context);
+               error = fo_ioctl(fp, TIOCGPGRP, (caddr_t)retval, &context);
                *retval = -*retval;
                goto out;
 
@@ -936,7 +937,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                        tmp = (int)p1->p_pgrpid;
                        proc_rele(p1);
                }
-               error =  fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
+               error =  fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
                goto out;
 
        case F_SETNOSIGPIPE:
@@ -1398,6 +1399,50 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
 
                goto outdrop;
        }
+       case F_SPECULATIVE_READ: {
+               fspecread_t args;
+
+               if (fp->f_type != DTYPE_VNODE) {
+                       error = EBADF;
+                       goto out;
+               }
+
+               vp = (struct vnode *)fp->f_data;
+               proc_fdunlock(p);
+
+               if ((error = copyin(argp, (caddr_t)&args, sizeof(args)))) {
+                       goto outdrop;
+               }
+
+               /* Discard invalid offsets or lengths */
+               if ((args.fsr_offset < 0) || (args.fsr_length < 0)) {
+                       error = EINVAL;
+                       goto outdrop;
+               }
+
+               /*
+                * Round the file offset down to a page-size boundary (or to 0).
+                * The filesystem will need to round the length up to the next page
+                * boundary or to the EOF of the file.
+                */
+               uint64_t foff = (((uint64_t)args.fsr_offset) & ~((uint64_t)PAGE_MASK));
+               uint64_t foff_delta = args.fsr_offset - foff;
+               args.fsr_offset = (off_t) foff;
+
+               /*
+                * Now add the delta back into the supplied length: since the offset
+                * may have moved down, grow the length by the same amount.
+                */
+               args.fsr_length += foff_delta;
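                /*
                 * Worked example of the two adjustments above, with hypothetical
                 * values and a 4 KiB page (PAGE_MASK == 0xfff):
                 *     fsr_offset = 0x12345, fsr_length = 0x100
                 *     foff       = 0x12345 & ~0xfff  = 0x12000
                 *     foff_delta = 0x12345 - 0x12000 = 0x345
                 *     result:      fsr_offset = 0x12000, fsr_length = 0x445
                 * so the page-aligned request still covers the caller's range.
                 */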
+
+               if ((error = vnode_getwithref(vp))) {
+                       goto outdrop;
+               }
+               error = VNOP_IOCTL(vp, F_SPECULATIVE_READ, (caddr_t)&args, 0, &context);
+               (void)vnode_put(vp);
+
+               goto outdrop;
+       }
        case F_SETSIZE:
                if (fp->f_type != DTYPE_VNODE) {
                        error = EBADF;
@@ -1657,7 +1702,8 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                }
                goto outdrop;
        }
-       case F_GETPATH: {
+       case F_GETPATH:
+       case F_GETPATH_NOFIRMLINK: {
                char *pathbufp;
                int pathlen;
 
@@ -1675,7 +1721,11 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                        goto outdrop;
                }
                if ((error = vnode_getwithref(vp)) == 0) {
-                       error = vn_getpath(vp, pathbufp, &pathlen);
+                       if (uap->cmd == F_GETPATH_NOFIRMLINK) {
+                               error = vn_getpath_ext(vp, NULL, pathbufp, &pathlen, VN_GETPATH_NO_FIRMLINK);
+                       } else {
+                               error = vn_getpath(vp, pathbufp, &pathlen);
+                       }
                        (void)vnode_put(vp);
 
                        if (error == 0) {
@@ -2202,9 +2252,12 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                        goto out;
                }
 
-               /* For now, special case HFS+ only, since this is SPI. */
+               /*
+                * For now, special case HFS+ and APFS only, since this
+                * is SPI.
+                */
                src_vp = (struct vnode *)fp->f_data;
-               if (src_vp->v_tag != VT_HFS) {
+               if (src_vp->v_tag != VT_HFS && src_vp->v_tag != VT_APFS) {
                        error = ENOTSUP;
                        goto out;
                }
@@ -2223,7 +2276,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                        goto out;
                }
                dst_vp = (struct vnode *)fp2->f_data;
-               if (dst_vp->v_tag != VT_HFS) {
+               if (dst_vp->v_tag != VT_HFS && dst_vp->v_tag != VT_APFS) {
                        fp_drop(p, fd2, fp2, 1);
                        error = ENOTSUP;
                        goto out;
@@ -2592,10 +2645,12 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                case (int)APFSIOC_REVERT_TO_SNAPSHOT:
                case (int)FSIOC_FIOSEEKHOLE:
                case (int)FSIOC_FIOSEEKDATA:
+               case (int)FSIOC_CAS_BSDFLAGS:
                case HFS_GET_BOOT_INFO:
                case HFS_SET_BOOT_INFO:
                case FIOPINSWAP:
                case F_MARKDEPENDENCY:
+               case TIOCREVOKE:
                        error = EINVAL;
                        goto out;
                default:
@@ -2933,6 +2988,8 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags)
                knote_fdclose(p, fd);
        }
 
+       /* release the ref returned from fp_lookup before calling drain */
+       (void) os_ref_release_locked(&fp->f_iocount);
        fileproc_drain(p, fp);
 
        if (fp->f_flags & FP_WAITEVENT) {
@@ -3051,10 +3108,10 @@ fstat1(proc_t p, int fd, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsec
                         * going to let them get the basic stat information.
                         */
                        if (xsecurity == USER_ADDR_NULL) {
-                               error = vn_stat_noauth((vnode_t)data, sbptr, NULL, isstat64, ctx,
+                               error = vn_stat_noauth((vnode_t)data, sbptr, NULL, isstat64, 0, ctx,
                                    fp->f_fglob->fg_cred);
                        } else {
-                               error = vn_stat((vnode_t)data, sbptr, &fsec, isstat64, ctx);
+                               error = vn_stat((vnode_t)data, sbptr, &fsec, isstat64, 0, ctx);
                        }
 
                        AUDIT_ARG(vnpath, (struct vnode *)data, ARG_VNODE1);
@@ -3573,7 +3630,7 @@ fp_getfvp(proc_t p, int fd, struct fileproc **resultfp, struct vnode **resultvp)
                proc_fdunlock(p);
                return ENOTSUP;
        }
-       fp->f_iocount++;
+       os_ref_retain_locked(&fp->f_iocount);
 
        if (resultfp) {
                *resultfp = fp;
@@ -3634,7 +3691,7 @@ fp_getfvpandvid(proc_t p, int fd, struct fileproc **resultfp,
                proc_fdunlock(p);
                return ENOTSUP;
        }
-       fp->f_iocount++;
+       os_ref_retain_locked(&fp->f_iocount);
 
        if (resultfp) {
                *resultfp = fp;
@@ -3694,7 +3751,7 @@ fp_getfsock(proc_t p, int fd, struct fileproc **resultfp,
                proc_fdunlock(p);
                return EOPNOTSUPP;
        }
-       fp->f_iocount++;
+       os_ref_retain_locked(&fp->f_iocount);
 
        if (resultfp) {
                *resultfp = fp;
@@ -3751,7 +3808,7 @@ fp_getfkq(proc_t p, int fd, struct fileproc **resultfp,
                proc_fdunlock(p);
                return EBADF;
        }
-       fp->f_iocount++;
+       os_ref_retain_locked(&fp->f_iocount);
 
        if (resultfp) {
                *resultfp = fp;
@@ -3810,7 +3867,7 @@ fp_getfpshm(proc_t p, int fd, struct fileproc **resultfp,
                proc_fdunlock(p);
                return EBADF;
        }
-       fp->f_iocount++;
+       os_ref_retain_locked(&fp->f_iocount);
 
        if (resultfp) {
                *resultfp = fp;
@@ -3878,7 +3935,7 @@ fp_getfpsem(proc_t p, int fd, struct fileproc **resultfp,
                proc_fdunlock(p);
                return EBADF;
        }
-       fp->f_iocount++;
+       os_ref_retain_locked(&fp->f_iocount);
 
        if (resultfp) {
                *resultfp = fp;
@@ -3935,7 +3992,7 @@ fp_getfpipe(proc_t p, int fd, struct fileproc **resultfp,
                proc_fdunlock(p);
                return EBADF;
        }
-       fp->f_iocount++;
+       os_ref_retain_locked(&fp->f_iocount);
 
        if (resultfp) {
                *resultfp = fp;
@@ -3990,7 +4047,7 @@ fp_lookup(proc_t p, int fd, struct fileproc **resultfp, int locked)
                }
                return EBADF;
        }
-       fp->f_iocount++;
+       os_ref_retain_locked(&fp->f_iocount);
 
        if (resultfp) {
                *resultfp = fp;
@@ -4009,6 +4066,7 @@ fp_lookup(proc_t p, int fd, struct fileproc **resultfp, int locked)
  * Description: Swap the fileproc pointer for a given fd with a new
  *             fileproc pointer in the per-process open file table of
  *             the specified process.  The fdlock must be held at entry.
+ *             Iff the swap is successful, the old fileproc pointer is freed.
  *
  * Parameters:  p              Process containing the fd
  *             fd              The fd of interest
@@ -4017,7 +4075,7 @@ fp_lookup(proc_t p, int fd, struct fileproc **resultfp, int locked)
  * Returns:    0               Success
  *             EBADF           Bad file descriptor
  *             EINTR           Interrupted
- *             EKEEPLOOKING    f_iocount changed while lock was dropped.
+ *             EKEEPLOOKING    Other references were active, try again.
  */
 int
 fp_tryswap(proc_t p, int fd, struct fileproc *nfp)
@@ -4034,20 +4092,28 @@ fp_tryswap(proc_t p, int fd, struct fileproc *nfp)
         * At this point, our caller (change_guardedfd_np) has
         * one f_iocount reference, and we just took another
         * one to begin the replacement.
+        * fp and nfp each have a +1 reference from allocation.  Thus if
+        * no one else is looking, fp's f_iocount should be 3 and nfp's 1.
         */
-       if (fp->f_iocount < 2) {
-               panic("f_iocount too small %d", fp->f_iocount);
-       } else if (2 == fp->f_iocount) {
+       if (os_ref_get_count(&fp->f_iocount) < 3 ||
+           1 != os_ref_get_count(&nfp->f_iocount)) {
+               panic("%s: f_iocount", __func__);
+       } else if (3 == os_ref_get_count(&fp->f_iocount)) {
                /* Copy the contents of *fp, preserving the "type" of *nfp */
 
                nfp->f_flags = (nfp->f_flags & FP_TYPEMASK) |
                    (fp->f_flags & ~FP_TYPEMASK);
-               nfp->f_iocount = fp->f_iocount;
+               os_ref_retain_locked(&nfp->f_iocount);
+               os_ref_retain_locked(&nfp->f_iocount);
                nfp->f_fglob = fp->f_fglob;
                nfp->f_wset = fp->f_wset;
 
                p->p_fd->fd_ofiles[fd] = nfp;
-               (void) fp_drop(p, fd, nfp, 1);
+               fp_drop(p, fd, nfp, 1);
+
+               os_ref_release_live(&fp->f_iocount);
+               os_ref_release_live(&fp->f_iocount);
+               fileproc_free(fp);
        } else {
                /*
                 * Wait for all other active references to evaporate.
@@ -4061,7 +4127,6 @@ fp_tryswap(proc_t p, int fd, struct fileproc *nfp)
                         * reevaluation of the change-guard attempt.
                         */
                        error = EKEEPLOOKING;
-                       printf("%s: lookup collision fd %d\n", __func__, fd);
                }
                (void) fp_drop(p, fd, fp, 1);
        }
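
The hunks above and below replace the hand-rolled f_iocount++/f_iocount-- arithmetic with the checked os_refcnt API. A minimal sketch of the converted lifecycle, assuming only the <os/refcnt.h> primitives this diff itself uses (os_ref_init, os_ref_retain_locked, os_ref_release_locked); "obj" and "obj_refgrp" are illustrative names, not from this change:

	#include <os/refcnt.h>

	os_refgrp_decl(static, obj_refgrp, "obj", NULL);

	struct obj {
		os_refcnt_t ref;                        /* plays the role of f_iocount */
	};

	static void
	obj_setup(struct obj *o)
	{
		os_ref_init(&o->ref, &obj_refgrp);      /* count = 1 (allocation) */
	}

	/* caller holds the lock that protects the table the object lives in */
	static void
	obj_take_io_ref(struct obj *o)
	{
		os_ref_retain_locked(&o->ref);          /* was: o->iocount++ */
	}

	/* true when only the allocation reference remains */
	static bool
	obj_drop_io_ref(struct obj *o)
	{
		return os_ref_release_locked(&o->ref) == 1;  /* was: --o->iocount == 0 */
	}

Unlike a raw integer, os_refcnt panics on over-release and can attribute leaks to a refgrp, which is why fp_drop() below now tests 1 == os_ref_release_locked(...): the +1 taken at allocation in fileproc_alloc_init() is the surviving reference.
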
@@ -4182,9 +4247,8 @@ fp_drop(proc_t p, int fd, struct fileproc *fp, int locked)
                }
                return EBADF;
        }
-       fp->f_iocount--;
 
-       if (fp->f_iocount == 0) {
+       if (1 == os_ref_release_locked(&fp->f_iocount)) {
                if (fp->f_flags & FP_SELCONFLICT) {
                        fp->f_flags &= ~FP_SELCONFLICT;
                }
@@ -4487,9 +4551,8 @@ file_drop(int fd)
                proc_fdunlock(p);
                return EBADF;
        }
-       fp->f_iocount--;
 
-       if (fp->f_iocount == 0) {
+       if (1 == os_ref_release_locked(&fp->f_iocount)) {
                if (fp->f_flags & FP_SELCONFLICT) {
                        fp->f_flags &= ~FP_SELCONFLICT;
                }
@@ -4630,22 +4693,22 @@ falloc_withalloc_locked(proc_t p, struct fileproc **resultfp, int *resultfd,
        struct fileglob *fg;
        int error, nfd;
 
+       if (nfiles >= maxfiles) {
+               tablefull("file");
+               return ENFILE;
+       }
+
        if (!locked) {
                proc_fdlock(p);
        }
+
        if ((error = fdalloc(p, 0, &nfd))) {
                if (!locked) {
                        proc_fdunlock(p);
                }
                return error;
        }
-       if (nfiles >= maxfiles) {
-               if (!locked) {
-                       proc_fdunlock(p);
-               }
-               tablefull("file");
-               return ENFILE;
-       }
+
 #if CONFIG_MACF
        error = mac_file_check_create(proc_ucred(p));
        if (error) {
@@ -4682,7 +4745,7 @@ falloc_withalloc_locked(proc_t p, struct fileproc **resultfp, int *resultfd,
        bzero(fg, sizeof(struct fileglob));
        lck_mtx_init(&fg->fg_lock, file_lck_grp, file_lck_attr);
 
-       fp->f_iocount = 1;
+       os_ref_retain_locked(&fp->f_iocount);
        fg->fg_count = 1;
        fg->fg_ops = &uninitops;
        fp->f_fglob = fg;
@@ -4753,6 +4816,27 @@ fg_free(struct fileglob *fg)
 }
 
 
+/*
+ * fg_get_vnode
+ *
+ * Description:        Return vnode associated with the file structure, if
+ *             any.  The lifetime of the returned vnode is bound to
+ *             the lifetime of the file structure.
+ *
+ * Parameters: fg                              Pointer to fileglob to
+ *                                             inspect
+ *
+ * Returns:    vnode_t
+ */
+vnode_t
+fg_get_vnode(struct fileglob *fg)
+{
+       if (FILEGLOB_DTYPE(fg) == DTYPE_VNODE) {
+               return (vnode_t)fg->fg_data;
+       } else {
+               return NULL;
+       }
+}
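
fg_get_vnode() folds the FILEGLOB_DTYPE check and the fg_data cast into one helper. A hedged usage sketch (the caller is illustrative, not from this diff); the returned vnode borrows the fileglob's lifetime, so it must not be used after the fileglob reference is dropped:

	static int
	inspect_fileglob(struct fileglob *fg)
	{
		vnode_t vp = fg_get_vnode(fg);

		if (vp == NULL) {
			/* not vnode-backed (socket, pipe, kqueue, ...) */
			return ENOTSUP;
		}
		/* ... use vp while the fileglob is held ... */
		return 0;
	}
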
 
 /*
  * fdexec
@@ -4782,7 +4866,7 @@ fdexec(proc_t p, short flags, int self_exec)
        boolean_t cloexec_default = (flags & POSIX_SPAWN_CLOEXEC_DEFAULT) != 0;
        thread_t self = current_thread();
        struct uthread *ut = get_bsdthread_info(self);
-       struct kqueue *dealloc_kq = NULL;
+       struct kqworkq *dealloc_kqwq = NULL;
 
        /*
         * If the current thread is bound as a workq/workloop
@@ -4800,7 +4884,7 @@ fdexec(proc_t p, short flags, int self_exec)
         * subsequent kqueue closes go faster.
         */
        knotes_dealloc(p);
-       assert(fdp->fd_knlistsize == -1);
+       assert(fdp->fd_knlistsize == 0);
        assert(fdp->fd_knhashmask == 0);
 
        for (i = fdp->fd_lastfile; i >= 0; i--) {
@@ -4838,7 +4922,7 @@ fdexec(proc_t p, short flags, int self_exec)
                         * Wait for any third party viewers (e.g., lsof)
                         * to release their references to this fileproc.
                         */
-                       while (fp->f_iocount > 0) {
+                       while (os_ref_get_count(&fp->f_iocount) > 1) {
                                p->p_fpdrainwait = 1;
                                msleep(&p->p_fpdrainwait, &p->p_fdmlock, PRIBIO,
                                    "fpdrain", NULL);
@@ -4854,15 +4938,15 @@ fdexec(proc_t p, short flags, int self_exec)
 
        /* release the per-process workq kq */
        if (fdp->fd_wqkqueue) {
-               dealloc_kq = fdp->fd_wqkqueue;
+               dealloc_kqwq = fdp->fd_wqkqueue;
                fdp->fd_wqkqueue = NULL;
        }
 
        proc_fdunlock(p);
 
        /* Anything to free? */
-       if (dealloc_kq) {
-               kqueue_dealloc(dealloc_kq);
+       if (dealloc_kqwq) {
+               kqworkq_dealloc(dealloc_kqwq);
        }
 }
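
Here and in fdfree() below, the per-process workq kqueue is detached while the fd lock is held but destroyed only after proc_fdunlock(), since the deallocation can block and must not run under the fd lock. A minimal sketch of that detach-then-free pattern, with hypothetical names ("owner", "res", resource_dealloc()):

	struct resource *dealloc_res = NULL;

	lck_mtx_lock(&owner->lock);
	if (owner->res != NULL) {
		dealloc_res = owner->res;       /* detach under the lock */
		owner->res = NULL;
	}
	lck_mtx_unlock(&owner->lock);

	if (dealloc_res != NULL) {
		resource_dealloc(dealloc_res);  /* may block; safe without the lock */
	}
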
 
@@ -5087,7 +5171,7 @@ fdcopy(proc_t p, vnode_t uth_cdir)
         * Initialize knote and kqueue tracking structs
         */
        newfdp->fd_knlist = NULL;
-       newfdp->fd_knlistsize = -1;
+       newfdp->fd_knlistsize = 0;
        newfdp->fd_knhash = NULL;
        newfdp->fd_knhashmask = 0;
        newfdp->fd_kqhash = NULL;
@@ -5119,7 +5203,7 @@ fdfree(proc_t p)
 {
        struct filedesc *fdp;
        struct fileproc *fp;
-       struct kqueue *dealloc_kq = NULL;
+       struct kqworkq *dealloc_kqwq = NULL;
        int i;
 
        proc_fdlock(p);
@@ -5140,7 +5224,7 @@ fdfree(proc_t p)
         * tables to make any subsequent kqueue closes faster.
         */
        knotes_dealloc(p);
-       assert(fdp->fd_knlistsize == -1);
+       assert(fdp->fd_knlistsize == 0);
        assert(fdp->fd_knhashmask == 0);
 
        /*
@@ -5157,6 +5241,7 @@ fdfree(proc_t p)
                                        panic("fdfree: found fp with UF_RESERVED");
                                }
 
+                               fileproc_drain(p, fp);
                                procfdtbl_reservefd(p, i);
 
                                if (fp->f_flags & FP_WAITEVENT) {
@@ -5172,16 +5257,15 @@ fdfree(proc_t p)
        }
 
        if (fdp->fd_wqkqueue) {
-               dealloc_kq = fdp->fd_wqkqueue;
+               dealloc_kqwq = fdp->fd_wqkqueue;
                fdp->fd_wqkqueue = NULL;
        }
 
        proc_fdunlock(p);
 
-       if (dealloc_kq) {
-               kqueue_dealloc(dealloc_kq);
+       if (dealloc_kqwq) {
+               kqworkq_dealloc(dealloc_kqwq);
        }
-
        if (fdp->fd_cdir) {
                vnode_rele(fdp->fd_cdir);
        }
@@ -5195,7 +5279,7 @@ fdfree(proc_t p)
 
        if (fdp->fd_kqhash) {
                for (uint32_t j = 0; j <= fdp->fd_kqhashmask; j++) {
-                       assert(SLIST_EMPTY(&fdp->fd_kqhash[j]));
+                       assert(LIST_EMPTY(&fdp->fd_kqhash[j]));
                }
                FREE(fdp->fd_kqhash, M_KQUEUE);
        }
@@ -5337,14 +5421,13 @@ fileproc_drain(proc_t p, struct fileproc * fp)
        context.vc_thread = proc_thread(p);     /* XXX */
        context.vc_ucred = fp->f_fglob->fg_cred;
 
-       fp->f_iocount--;  /* (the one the close holds) */
+       /* Set the vflag for drain */
+       fileproc_modify_vflags(fp, FPV_DRAIN, FALSE);
 
-       while (fp->f_iocount) {
+       while (os_ref_get_count(&fp->f_iocount) > 1) {
                lck_mtx_convert_spin(&p->p_fdmlock);
 
-               if (fp->f_fglob->fg_ops->fo_drain) {
-                       (*fp->f_fglob->fg_ops->fo_drain)(fp, &context);
-               }
+               fo_drain(fp, &context);
                if ((fp->f_flags & FP_INSELECT) == FP_INSELECT) {
                        if (waitq_wakeup64_all((struct waitq *)fp->f_wset, NO_EVENT64,
                            THREAD_INTERRUPTED, WAITQ_ALL_PRIORITIES) == KERN_INVALID_ARGUMENT) {
@@ -5382,13 +5465,8 @@ fileproc_drain(proc_t p, struct fileproc * fp)
  * Parameters: p                               Process containing fd
  *             fd                              fd to be released
  *             fp                              fileproc to be freed
- *
- * Returns:    0                               Success
- *
- * Notes:      XXX function should be void - no one interprets the returns
- *             XXX code
  */
-int
+void
 fp_free(proc_t p, int fd, struct fileproc * fp)
 {
        proc_fdlock_spin(p);
@@ -5396,8 +5474,8 @@ fp_free(proc_t p, int fd, struct fileproc * fp)
        proc_fdunlock(p);
 
        fg_free(fp->f_fglob);
+       os_ref_release_live(&fp->f_iocount);
        fileproc_free(fp);
-       return 0;
 }
 
 
@@ -5584,15 +5662,11 @@ fileport_releasefg(struct fileglob *fg)
        return;
 }
 
-
 /*
- * fileport_makefd
+ * fileport_makefd_internal
  *
  * Description: Obtain the file descriptor for a given Mach send right.
  *
- * Parameters: p               Process calling fileport
- *              uap->port      Name of send right to file port.
- *
  * Returns:    0               Success
  *             EINVAL          Invalid Mach port name, or port is not for a file.
  *     fdalloc:EMFILE
@@ -5602,24 +5676,13 @@ fileport_releasefg(struct fileglob *fg)
  *             *retval (modified)              The new descriptor
  */
 int
-fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval)
+fileport_makefd_internal(proc_t p, ipc_port_t port, int uf_flags, int *retval)
 {
        struct fileglob *fg;
        struct fileproc *fp = FILEPROC_NULL;
-       ipc_port_t port = IPC_PORT_NULL;
-       mach_port_name_t send = uap->port;
-       kern_return_t res;
        int fd;
        int err;
 
-       res = ipc_object_copyin(get_task_ipcspace(p->task),
-           send, MACH_MSG_TYPE_COPY_SEND, &port);
-
-       if (res != KERN_SUCCESS) {
-               err = EINVAL;
-               goto out;
-       }
-
        fg = fileport_port_to_fileglob(port);
        if (fg == NULL) {
                err = EINVAL;
@@ -5642,7 +5705,9 @@ fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval)
                fg_drop(fp);
                goto out;
        }
-       *fdflags(p, fd) |= UF_EXCLOSE;
+       if (uf_flags) {
+               *fdflags(p, fd) |= uf_flags;
+       }
 
        procfdtbl_releasefd(p, fd, fp);
        proc_fdunlock(p);
@@ -5654,6 +5719,42 @@ out:
                fileproc_free(fp);
        }
 
+       return err;
+}
+
+/*
+ * fileport_makefd
+ *
+ * Description: Obtain the file descriptor for a given Mach send right.
+ *
+ * Parameters: p               Process calling fileport
+ *              uap->port      Name of send right to file port.
+ *
+ * Returns:    0               Success
+ *             EINVAL          Invalid Mach port name, or port is not for a file.
+ *     fdalloc:EMFILE
+ *     fdalloc:ENOMEM          Unable to allocate fileproc or extend file table.
+ *
+ * Implicit returns:
+ *             *retval (modified)              The new descriptor
+ */
+int
+fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval)
+{
+       ipc_port_t port = IPC_PORT_NULL;
+       mach_port_name_t send = uap->port;
+       kern_return_t res;
+       int err;
+
+       res = ipc_object_copyin(get_task_ipcspace(p->task),
+           send, MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
+
+       if (res == KERN_SUCCESS) {
+               err = fileport_makefd_internal(p, port, UF_EXCLOSE, retval);
+       } else {
+               err = EINVAL;
+       }
+
        if (IPC_PORT_NULL != port) {
                ipc_port_release_send(port);
        }
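
fileport_makefd() is now a thin wrapper: it copies in the send right (refusing immovable send rights via IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND) and defers to fileport_makefd_internal() with UF_EXCLOSE. From user space, the fileport pair round-trips a descriptor through a Mach port. A sketch assuming the private Libsystem syscall wrappers fileport_makeport() and fileport_makefd() (not public SDK API):

	#include <mach/mach.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* private syscall wrappers; declarations assumed here for the sketch */
	extern int fileport_makeport(int fd, mach_port_t *port);
	extern int fileport_makefd(mach_port_t port);

	int
	main(void)
	{
		int fd = open("/etc/hosts", O_RDONLY);
		mach_port_t port = MACH_PORT_NULL;

		if (fd < 0 || fileport_makeport(fd, &port) != 0) {
			perror("fileport_makeport");
			return 1;
		}
		/* the send right could now travel to another task via Mach IPC */

		int newfd = fileport_makefd(port);      /* close-on-exec per UF_EXCLOSE */
		printf("fd %d redeemed as fd %d\n", fd, newfd);

		close(fd);
		if (newfd >= 0) {
			close(newfd);
		}
		mach_port_deallocate(mach_task_self(), port);
		return 0;
	}
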
@@ -5979,6 +6080,13 @@ fo_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
        return (*fp->f_ops->fo_read)(fp, uio, flags, ctx);
 }
 
+int
+fo_no_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
+{
+#pragma unused(fp, uio, flags, ctx)
+       return ENXIO;
+}
+
 
 /*
  * fo_write
@@ -6000,6 +6108,13 @@ fo_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
        return (*fp->f_ops->fo_write)(fp, uio, flags, ctx);
 }
 
+int
+fo_no_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
+{
+#pragma unused(fp, uio, flags, ctx)
+       return ENXIO;
+}
+
 
 /*
  * fo_ioctl
@@ -6034,6 +6149,13 @@ fo_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx)
        return error;
 }
 
+int
+fo_no_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx)
+{
+#pragma unused(fp, com, data, ctx)
+       return ENOTTY;
+}
+
 
 /*
  * fo_select
@@ -6055,6 +6177,13 @@ fo_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
        return (*fp->f_ops->fo_select)(fp, which, wql, ctx);
 }
 
+int
+fo_no_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
+{
+#pragma unused(fp, which, wql, ctx)
+       return ENOTSUP;
+}
+
 
 /*
  * fo_close
@@ -6076,6 +6205,32 @@ fo_close(struct fileglob *fg, vfs_context_t ctx)
 }
 
 
+/*
+ * fo_drain
+ *
+ * Description:        Generic fileops drain indirected through the fileops
+ *             pointer in the fileproc structure
+ *
+ * Parameters: fp                              fileproc structure pointer
+ *             ctx                             VFS context for operation
+ *
+ * Returns:    0                               Success
+ *             !0                              errno from drain
+ */
+int
+fo_drain(struct fileproc *fp, vfs_context_t ctx)
+{
+       return (*fp->f_ops->fo_drain)(fp, ctx);
+}
+
+int
+fo_no_drain(struct fileproc *fp, vfs_context_t ctx)
+{
+#pragma unused(fp, ctx)
+       return ENOTSUP;
+}
+
+
 /*
  * fo_kqfilter
  *
@@ -6084,19 +6239,26 @@ fo_close(struct fileglob *fg, vfs_context_t ctx)
  *
  * Parameters: fp                              fileproc structure pointer
  *             kn                              pointer to knote to filter on
- *             ctx                             VFS context for operation
  *
  * Returns:    (kn->kn_flags & EV_ERROR)       error in kn->kn_data
  *             0                               Filter is not active
  *             !0                              Filter is active
  */
 int
-fo_kqfilter(struct fileproc *fp, struct knote *kn,
-    struct kevent_internal_s *kev, vfs_context_t ctx)
+fo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
 {
-       return (*fp->f_ops->fo_kqfilter)(fp, kn, kev, ctx);
+       return (*fp->f_ops->fo_kqfilter)(fp, kn, kev);
 }
 
+int
+fo_no_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
+{
+#pragma unused(fp, kev)
+       knote_set_error(kn, ENOTSUP);
+       return 0;
+}
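
With the fo_no_* stubs exported, a file type only has to supply the operations it actually implements; the rewritten kqueueops table below is the in-tree example. A hedged sketch for a hypothetical file type (example_read/example_close are illustrative, and the fo_type is a stand-in):

	static const struct fileops example_ops = {
		.fo_type     = DTYPE_PIPE,         /* stand-in type for the sketch */
		.fo_read     = example_read,       /* hypothetical implementation */
		.fo_write    = fo_no_write,        /* ENXIO */
		.fo_ioctl    = fo_no_ioctl,        /* ENOTTY */
		.fo_select   = fo_no_select,       /* ENOTSUP */
		.fo_close    = example_close,      /* hypothetical implementation */
		.fo_drain    = fo_no_drain,        /* ENOTSUP */
		.fo_kqfilter = fo_no_kqfilter,     /* knote_set_error(ENOTSUP) */
	};
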
+
+
 /*
  * The ability to send a file descriptor to another
  * process is opt-in by file type.
@@ -6119,6 +6281,7 @@ file_issendable(proc_t p, struct fileproc *fp)
        }
 }
 
+os_refgrp_decl(, f_iocount_refgrp, "f_iocount", NULL);
 
 struct fileproc *
 fileproc_alloc_init(__unused void *arg)
@@ -6128,14 +6291,23 @@ fileproc_alloc_init(__unused void *arg)
        MALLOC_ZONE(fp, struct fileproc *, sizeof(*fp), M_FILEPROC, M_WAITOK);
        if (fp) {
                bzero(fp, sizeof(*fp));
+               os_ref_init(&fp->f_iocount, &f_iocount_refgrp);
        }
 
        return fp;
 }
 
+
 void
 fileproc_free(struct fileproc *fp)
 {
+       os_ref_count_t __unused refc = os_ref_release(&fp->f_iocount);
+#if DEVELOPMENT || DEBUG
+       if (0 != refc) {
+               panic("%s: pid %d refc: %u != 0",
+                   __func__, proc_pid(current_proc()), refc);
+       }
+#endif
        switch (FILEPROC_TYPE(fp)) {
        case FTYPE_SIMPLE:
                FREE_ZONE(fp, sizeof(*fp), M_FILEPROC);
@@ -6147,3 +6319,19 @@ fileproc_free(struct fileproc *fp)
                panic("%s: corrupt fp %p flags %x", __func__, fp, fp->f_flags);
        }
 }
+
+void
+fileproc_modify_vflags(struct fileproc *fp, fileproc_vflags_t vflags, boolean_t clearflags)
+{
+       if (clearflags) {
+               os_atomic_andnot(&fp->f_vflags, vflags, relaxed);
+       } else {
+               os_atomic_or(&fp->f_vflags, vflags, relaxed);
+       }
+}
+
+fileproc_vflags_t
+fileproc_get_vflags(struct fileproc *fp)
+{
+       return os_atomic_load(&fp->f_vflags, relaxed);
+}
index c45fbcffa81d391e9285cf41b0708e1ea2bd0193..725f96d1e7c38e8cce37d0255258b34204973c7c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -86,8 +86,7 @@
 #include <sys/codesign.h>
 #include <sys/pthread_shims.h>
 #include <sys/kdebug.h>
-#include <sys/reason.h>
-#include <os/reason_private.h>
+#include <os/base.h>
 #include <pexpert/pexpert.h>
 
 #include <kern/locks.h>
 #include <pthread/workqueue_syscalls.h>
 #include <pthread/workqueue_internal.h>
 #include <libkern/libkern.h>
-#include <libkern/OSAtomic.h>
 
 #include "net/net_str_id.h"
 
 #include <sys/kern_memorystatus.h>
 #endif
 
-extern thread_t port_name_to_thread(mach_port_name_t    port_name); /* osfmk/kern/ipc_tt.h   */
 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
 
 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
@@ -128,73 +125,49 @@ MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 
 #define KQ_EVENT        NO_EVENT64
 
-static int kqueue_read(struct fileproc *fp, struct uio *uio,
-    int flags, vfs_context_t ctx);
-static int kqueue_write(struct fileproc *fp, struct uio *uio,
-    int flags, vfs_context_t ctx);
-static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
-    vfs_context_t ctx);
 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
     vfs_context_t ctx);
 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
-    struct kevent_internal_s *kev, vfs_context_t ctx);
+    struct kevent_qos_s *kev);
 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
 
 static const struct fileops kqueueops = {
-       .fo_type = DTYPE_KQUEUE,
-       .fo_read = kqueue_read,
-       .fo_write = kqueue_write,
-       .fo_ioctl = kqueue_ioctl,
-       .fo_select = kqueue_select,
-       .fo_close = kqueue_close,
+       .fo_type     = DTYPE_KQUEUE,
+       .fo_read     = fo_no_read,
+       .fo_write    = fo_no_write,
+       .fo_ioctl    = fo_no_ioctl,
+       .fo_select   = kqueue_select,
+       .fo_close    = kqueue_close,
+       .fo_drain    = kqueue_drain,
        .fo_kqfilter = kqueue_kqfilter,
-       .fo_drain = kqueue_drain,
 };
 
-static void kevent_put_kq(struct proc *p, kqueue_id_t id, struct fileproc *fp, struct kqueue *kq);
-static int kevent_internal(struct proc *p,
-    kqueue_id_t id, kqueue_id_t *id_out,
-    user_addr_t changelist, int nchanges,
-    user_addr_t eventlist, int nevents,
-    user_addr_t data_out, uint64_t data_available,
-    unsigned int flags, user_addr_t utimeout,
-    kqueue_continue_t continuation,
-    int32_t *retval);
-static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp,
-    struct proc *p, unsigned int flags);
-static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp,
-    struct proc *p, unsigned int flags);
-char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n);
-
-static int kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev);
+static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
+static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
 static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
-    struct knote_lock_ctx *knlc, thread_continue_t cont,
-    struct _kevent_register *cont_args) __dead2;
+    thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
 static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
 static void kevent_register_wait_cleanup(struct knote *kn);
-static inline void kqueue_release_last(struct proc *p, kqueue_t kqu);
-static void kqueue_interrupt(struct kqueue *kq);
-static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp,
-    void *data);
-static void kevent_continue(struct kqueue *kq, void *data, int error);
-static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
-static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data,
-    struct filt_process_s *process_data, int *countp);
-static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index);
 
 static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
-static void kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos, int flags);
+static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);
+
+static void kqworkq_unbind(proc_t p, workq_threadreq_t);
+static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
+static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
 
-static void kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn, kq_index_t qos);
-static void kqworkq_unbind(proc_t p, struct kqrequest *kqr);
-static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, struct kqrequest *kqr, thread_t thread);
-static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
+static void kqworkloop_unbind(struct kqworkloop *kwql);
 
-static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index);
-static void kqworkloop_unbind(proc_t p, struct kqworkloop *kwql);
-static thread_qos_t kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread);
-static kq_index_t kqworkloop_owner_override(struct kqworkloop *kqwl);
+enum kqwl_unbind_locked_mode {
+       KQWL_OVERRIDE_DROP_IMMEDIATELY,
+       KQWL_OVERRIDE_DROP_DELAYED,
+};
+static void kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread,
+    enum kqwl_unbind_locked_mode how);
+static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
+static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
+static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
 enum {
        KQWL_UTQ_NONE,
        /*
@@ -202,7 +175,7 @@ enum {
         *
         * This QoS is accounted for with the events override in the
         * kqr_override_index field. It is raised each time a new knote is queued at
-        * a given QoS. The kqr_wakeup_indexes field is a superset of the non empty
+        * a given QoS. The kqwl_wakeup_indexes field is a superset of the non empty
         * knote buckets and is recomputed after each event delivery.
         */
        KQWL_UTQ_UPDATE_WAKEUP_QOS,
@@ -227,40 +200,28 @@ enum {
        KQWL_UTQ_REDRIVE_EVENTS,
 };
 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
-static void kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index);
 static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
 
-static int knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data,
-    struct filt_process_s *process_data);
-
-static int kq_add_knote(struct kqueue *kq, struct knote *kn,
-    struct knote_lock_ctx *knlc, struct proc *p);
-static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev, bool is_fd, struct proc *p);
-
-static void knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc);
 static struct knote *knote_alloc(void);
 static void knote_free(struct knote *kn);
+static int kq_add_knote(struct kqueue *kq, struct knote *kn,
+    struct knote_lock_ctx *knlc, struct proc *p);
+static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
+    struct kevent_qos_s *kev, bool is_fd, struct proc *p);
 
-static void knote_activate(struct knote *kn);
-static void knote_deactivate(struct knote *kn);
-
-static void knote_enable(struct knote *kn);
-static void knote_disable(struct knote *kn);
-
-static int knote_enqueue(struct knote *kn);
-static void knote_dequeue(struct knote *kn);
+static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
+static void knote_dequeue(kqueue_t kqu, struct knote *kn);
 
-static void knote_suppress(struct knote *kn);
-static void knote_unsuppress(struct knote *kn);
-static void knote_wakeup(struct knote *kn);
+static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
+    struct kevent_qos_s *kev, int result);
+static void knote_suppress(kqueue_t kqu, struct knote *kn);
+static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
+static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);
 
-static bool knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn,
-    int result, thread_qos_t *qos_out);
-static void knote_apply_qos_override(struct knote *kn, kq_index_t qos_index);
+// Both of these functions may dequeue the knote; it is up to the caller
+// to enqueue the knote again.
 static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
-static void knote_reset_priority(struct knote *kn, pthread_priority_t pp);
-static kq_index_t knote_get_qos_override_index(struct knote *kn);
-static void knote_set_qos_overcommit(struct knote *kn);
+static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);
 
 static zone_t knote_zone;
 static zone_t kqfile_zone;
@@ -291,11 +252,18 @@ kevent_debug_flags(void)
 
 #define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
 
-/* placeholder for not-yet-implemented filters */
-static int filt_badattach(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_badevent(struct knote *kn, long hint);
+static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
+static void filt_no_detach(struct knote *kn);
+static int filt_bad_event(struct knote *kn, long hint);
+static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
+static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);
+
 SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
-       .f_attach = filt_badattach,
+       .f_attach  = filt_no_attach,
+       .f_detach  = filt_no_detach,
+       .f_event   = filt_bad_event,
+       .f_touch   = filt_bad_touch,
+       .f_process = filt_bad_process,
 };
 
 #if CONFIG_MEMORYSTATUS
@@ -304,6 +272,7 @@ extern const struct filterops memorystatus_filtops;
 extern const struct filterops fs_filtops;
 extern const struct filterops sig_filtops;
 extern const struct filterops machport_filtops;
+extern const struct filterops pipe_nfiltops;
 extern const struct filterops pipe_rfiltops;
 extern const struct filterops pipe_wfiltops;
 extern const struct filterops ptsd_kqops;
@@ -342,7 +311,8 @@ const static struct filterops workloop_filtops;
  * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
  *   the Private filters section of the array.
  */
-SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = {
+static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
+static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
        /* Public Filters */
        [~EVFILT_READ]                  = &file_filtops,
        [~EVFILT_WRITE]                 = &file_filtops,
@@ -354,7 +324,7 @@ SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] =
        [~EVFILT_MACHPORT]              = &machport_filtops,
        [~EVFILT_FS]                    = &fs_filtops,
        [~EVFILT_USER]                  = &user_filtops,
-       &bad_filtops,
+       [~EVFILT_UNUSED_11]             = &bad_filtops,
        [~EVFILT_VM]                    = &bad_filtops,
        [~EVFILT_SOCK]                  = &file_filtops,
 #if CONFIG_MEMORYSTATUS
@@ -367,6 +337,7 @@ SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] =
 
        /* Private filters */
        [EVFILTID_KQREAD]               = &kqread_filtops,
+       [EVFILTID_PIPE_N]               = &pipe_nfiltops,
        [EVFILTID_PIPE_R]               = &pipe_rfiltops,
        [EVFILTID_PIPE_W]               = &pipe_wfiltops,
        [EVFILTID_PTSD]                 = &ptsd_kqops,
@@ -381,40 +352,69 @@ SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] =
        [EVFILTID_VN]                   = &vnode_filtops,
        [EVFILTID_TTY]                  = &tty_filtops,
        [EVFILTID_PTMX]                 = &ptmx_kqops,
+
+       /* fake filter for detached knotes, keep last */
+       [EVFILTID_DETACHED]             = &bad_filtops,
 };
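
The public-filter initializers index with bitwise NOT because the EVFILT_* constants from <sys/event.h> are small negative numbers: ~x == -x - 1, so the public filters occupy the low slots of sysfilt_ops and the positive EVFILTID_* values the rest. Spelled out:

	/* EVFILT_READ is -1 and EVFILT_WRITE is -2 in <sys/event.h> */
	_Static_assert(~EVFILT_READ  == 0, "public filters map to low slots");
	_Static_assert(~EVFILT_WRITE == 1, "public filters map to low slots");
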
 
 /* waitq prepost callback */
-void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos);
+void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *kq_hook);
+
+static inline bool
+kqr_thread_bound(workq_threadreq_t kqr)
+{
+       return kqr->tr_state == WORKQ_TR_STATE_BOUND;
+}
+
+static inline bool
+kqr_thread_requested_pending(workq_threadreq_t kqr)
+{
+       workq_tr_state_t tr_state = kqr->tr_state;
+       return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
+}
+
+static inline bool
+kqr_thread_requested(workq_threadreq_t kqr)
+{
+       return kqr->tr_state != WORKQ_TR_STATE_IDLE;
+}
+
+static inline thread_t
+kqr_thread_fast(workq_threadreq_t kqr)
+{
+       assert(kqr_thread_bound(kqr));
+       return kqr->tr_thread;
+}
+
+static inline thread_t
+kqr_thread(workq_threadreq_t kqr)
+{
+       return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
+}
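
These predicates rely on workq_tr_state_t being ordered so that every state strictly between IDLE and BOUND means "thread requested but not yet bound". A sketch of the assumed ordering (the real enum lives in the workqueue headers; the intermediate names here are illustrative):

	enum sketch_tr_state {
		SKETCH_TR_IDLE = 0,     /* no thread requested */
		SKETCH_TR_QUEUED,       /* request posted, no thread picked yet */
		SKETCH_TR_BINDING,      /* thread picked, bind in progress */
		SKETCH_TR_BOUND,        /* tr_thread is valid */
	};

	/* "requested but pending" is exactly the open interval (IDLE, BOUND) */
	static inline bool
	sketch_tr_pending(enum sketch_tr_state s)
	{
		return s > SKETCH_TR_IDLE && s < SKETCH_TR_BOUND;
	}
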
 
 static inline struct kqworkloop *
-kqr_kqworkloop(struct kqrequest *kqr)
+kqr_kqworkloop(workq_threadreq_t kqr)
 {
-       if (kqr->kqr_state & KQR_WORKLOOP) {
+       if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
                return __container_of(kqr, struct kqworkloop, kqwl_request);
        }
        return NULL;
 }
 
 static inline kqueue_t
-kqr_kqueue(proc_t p, struct kqrequest *kqr)
+kqr_kqueue(proc_t p, workq_threadreq_t kqr)
 {
        kqueue_t kqu;
-       if (kqr->kqr_state & KQR_WORKLOOP) {
+       if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
                kqu.kqwl = kqr_kqworkloop(kqr);
        } else {
-               kqu.kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
+               kqu.kqwq = p->p_fd->fd_wqkqueue;
                assert(kqr >= kqu.kqwq->kqwq_request &&
                    kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
        }
        return kqu;
 }
 
-static inline boolean_t
-is_workqueue_thread(thread_t thread)
-{
-       return thread_get_tag(thread) & THREAD_TAG_WORKQUEUE;
-}
-
 /*
  * kqueue/note lock implementations
  *
@@ -456,36 +456,56 @@ kqunlock(kqueue_t kqu)
 }
 
 static inline void
-kq_req_lock(kqueue_t kqu)
+knhash_lock(struct filedesc *fdp)
 {
-       assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
-       lck_spin_lock(&kqu.kq->kq_reqlock);
+       lck_mtx_lock(&fdp->fd_knhashlock);
 }
 
 static inline void
-kq_req_unlock(kqueue_t kqu)
+knhash_unlock(struct filedesc *fdp)
 {
-       assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
-       lck_spin_unlock(&kqu.kq->kq_reqlock);
+       lck_mtx_unlock(&fdp->fd_knhashlock);
 }
 
-static inline void
-kq_req_held(__assert_only kqueue_t kqu)
+/* wait event for knote locks */
+static inline event_t
+knote_lock_wev(struct knote *kn)
 {
-       assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
-       LCK_SPIN_ASSERT(&kqu.kq->kq_reqlock, LCK_ASSERT_OWNED);
+       return (event_t)(&kn->kn_hook);
 }
 
-static inline void
-knhash_lock(proc_t p)
+/* wait event for kevent_register_wait_* */
+static inline event64_t
+knote_filt_wev64(struct knote *kn)
 {
-       lck_mtx_lock(&p->p_fd->fd_knhashlock);
+       /* kdp_workloop_sync_wait_find_owner knows about this */
+       return CAST_EVENT64_T(kn);
 }
 
-static inline void
-knhash_unlock(proc_t p)
+/* wait event for knote_post/knote_drop */
+static inline event64_t
+knote_post_wev64(struct knote *kn)
+{
+       return CAST_EVENT64_T(&kn->kn_kevent);
+}
+
+/*!
+ * @function knote_has_qos
+ *
+ * @brief
+ * Whether the knote has a regular QoS.
+ *
+ * @discussion
+ * kn_qos_override is:
+ * - 0 on kqfiles
+ * - THREAD_QOS_LAST for special buckets (stayactive, manager)
+ *
+ * Other values mean the knote participates in QoS propagation.
+ */
+static inline bool
+knote_has_qos(struct knote *kn)
 {
-       lck_mtx_unlock(&p->p_fd->fd_knhashlock);
+       return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
 }
 
 #pragma mark knote locks
@@ -496,37 +516,29 @@ knhash_unlock(proc_t p)
  * KNOTE_KQ_LOCK_ALWAYS
  *   The function will always return with the kq lock held.
  *
- * KNOTE_KQ_UNLOCK_ON_SUCCESS
+ * KNOTE_KQ_LOCK_ON_SUCCESS
  *   The function will return with the kq lock held if it was successful
  *   (knote_lock() is the only function that can fail).
  *
- * KNOTE_KQ_UNLOCK_ON_FAILURE
+ * KNOTE_KQ_LOCK_ON_FAILURE
  *   The function will return with the kq lock held if it was unsuccessful
  *   (knote_lock() is the only function that can fail).
  *
  * KNOTE_KQ_UNLOCK:
  *   The function returns with the kq unlocked.
  */
-#define KNOTE_KQ_LOCK_ALWAYS      0x0
-#define KNOTE_KQ_LOCK_ON_SUCCESS  0x1
-#define KNOTE_KQ_LOCK_ON_FAILURE  0x2
-#define KNOTE_KQ_UNLOCK           0x3
-
-#if DEBUG || DEVELOPMENT
-__attribute__((noinline, not_tail_called, disable_tail_calls))
-void
-knote_lock_ctx_chk(struct knote_lock_ctx *knlc)
-{
-       /* evil hackery to make sure no one forgets to unlock */
-       assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
-}
-#endif
+enum kqlocking {
+       KNOTE_KQ_LOCK_ALWAYS,
+       KNOTE_KQ_LOCK_ON_SUCCESS,
+       KNOTE_KQ_LOCK_ON_FAILURE,
+       KNOTE_KQ_UNLOCK,
+};
 
 static struct knote_lock_ctx *
-knote_lock_ctx_find(struct kqueue *kq, struct knote *kn)
+knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
 {
        struct knote_lock_ctx *ctx;
-       LIST_FOREACH(ctx, &kq->kq_knlocks, knlc_le) {
+       LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
                if (ctx->knlc_knote == kn) {
                        return ctx;
                }
@@ -538,42 +550,60 @@ knote_lock_ctx_find(struct kqueue *kq, struct knote *kn)
 /* slowpath of knote_lock() */
 __attribute__((noinline))
 static bool __result_use_check
-knote_lock_slow(struct kqueue *kq, struct knote *kn,
+knote_lock_slow(kqueue_t kqu, struct knote *kn,
     struct knote_lock_ctx *knlc, int kqlocking)
 {
-       kqlock_held(kq);
+       struct knote_lock_ctx *owner_lc;
+       struct uthread *uth = current_uthread();
+       wait_result_t wr;
 
-       struct knote_lock_ctx *owner_lc = knote_lock_ctx_find(kq, kn);
-       thread_t owner_thread = owner_lc->knlc_thread;
+       kqlock_held(kqu);
 
+       owner_lc = knote_lock_ctx_find(kqu, kn);
 #if DEBUG || DEVELOPMENT
        knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
 #endif
+       owner_lc->knlc_waiters++;
 
-       thread_reference(owner_thread);
-       TAILQ_INSERT_TAIL(&owner_lc->knlc_head, knlc, knlc_tqe);
-       assert_wait(&kn->kn_status, THREAD_UNINT | THREAD_WAIT_NOREPORT);
-       kqunlock(kq);
+       /*
+        * Make our lock context visible to knote_unlock()
+        */
+       uth->uu_knlock = knlc;
 
-       if (thread_handoff_deallocate(owner_thread) == THREAD_RESTART) {
-               if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
-                   kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
-                       kqlock(kq);
-               }
+       wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
+           knote_lock_wev(kn), owner_lc->knlc_thread,
+           THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
+
+       if (wr == THREAD_RESTART) {
+               /*
+                * We were woken up by knote_unlock_cancel(), not knote_unlock().
+                * We need to clean up the state since no one else did.
+                */
+               uth->uu_knlock = NULL;
 #if DEBUG || DEVELOPMENT
                assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
                knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
 #endif
+
+               if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
+                   kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
+                       kqlock(kqu);
+               }
                return false;
-       }
+       } else {
+               if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
+                   kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
+                       kqlock(kqu);
 #if DEBUG || DEVELOPMENT
-       assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
+                       /*
+                        * This state is set under the lock so we can't
+                        * really assert this unless we hold the lock.
+                        */
+                       assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
 #endif
-       if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
-           kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
-               kqlock(kq);
+               }
+               return true;
        }
-       return true;
 }
 
 /*
@@ -584,20 +614,20 @@ knote_lock_slow(struct kqueue *kq, struct knote *kn,
  * Returns true if the knote lock is acquired, false if it has been dropped
  */
 static bool __result_use_check
-knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
-    int kqlocking)
+knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
+    enum kqlocking kqlocking)
 {
-       kqlock_held(kq);
+       kqlock_held(kqu);
 
 #if DEBUG || DEVELOPMENT
        assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
 #endif
        knlc->knlc_knote = kn;
        knlc->knlc_thread = current_thread();
-       TAILQ_INIT(&knlc->knlc_head);
+       knlc->knlc_waiters = 0;
 
        if (__improbable(kn->kn_status & KN_LOCKED)) {
-               return knote_lock_slow(kq, kn, knlc, kqlocking);
+               return knote_lock_slow(kqu, kn, knlc, kqlocking);
        }
 
        /*
@@ -606,7 +636,7 @@ knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
         * hash table that references it before the lock is canceled.
         */
        assert((kn->kn_status & KN_DROPPING) == 0);
-       LIST_INSERT_HEAD(&kq->kq_knlocks, knlc, knlc_le);
+       LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
        kn->kn_status |= KN_LOCKED;
 #if DEBUG || DEVELOPMENT
        knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
@@ -614,7 +644,7 @@ knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
 
        if (kqlocking == KNOTE_KQ_UNLOCK ||
            kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
-               kqunlock(kq);
+               kqunlock(kqu);
        }
        return true;
 }
@@ -624,13 +654,13 @@ knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
  *
  * Called with the kqueue lock held.
  *
- * Returns with the kqueue lock held according to KNOTE_KQ_* flags
+ * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
  */
 static void
-knote_unlock(struct kqueue *kq, struct knote *kn,
-    struct knote_lock_ctx *knlc, int flags)
+knote_unlock(kqueue_t kqu, struct knote *kn,
+    struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
 {
-       kqlock_held(kq);
+       kqlock_held(kqu);
 
        assert(knlc->knlc_knote == kn);
        assert(kn->kn_status & KN_LOCKED);
@@ -638,36 +668,45 @@ knote_unlock(struct kqueue *kq, struct knote *kn,
        assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
 #endif
 
-       struct knote_lock_ctx *next_owner_lc = TAILQ_FIRST(&knlc->knlc_head);
+       LIST_REMOVE(knlc, knlc_link);
 
-       LIST_REMOVE(knlc, knlc_le);
+       if (knlc->knlc_waiters) {
+               thread_t thread = THREAD_NULL;
 
-       if (next_owner_lc) {
-               assert(next_owner_lc->knlc_knote == kn);
-               TAILQ_REMOVE(&knlc->knlc_head, next_owner_lc, knlc_tqe);
+               wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
+                   LCK_WAKE_DEFAULT, &thread);
+
+               /*
+                * knote_lock_slow() publishes the lock context of waiters
+                * in uthread::uu_knlock.
+                *
+                * Reach out and make this context the new owner.
+                */
+               struct uthread *ut = get_bsdthread_info(thread);
+               struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;
 
-               assert(TAILQ_EMPTY(&next_owner_lc->knlc_head));
-               TAILQ_CONCAT(&next_owner_lc->knlc_head, &knlc->knlc_head, knlc_tqe);
-               LIST_INSERT_HEAD(&kq->kq_knlocks, next_owner_lc, knlc_le);
+               assert(next_owner_lc->knlc_knote == kn);
+               next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
+               LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
 #if DEBUG || DEVELOPMENT
                next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
 #endif
+               ut->uu_knlock = NULL;
+               thread_deallocate_safe(thread);
        } else {
                kn->kn_status &= ~KN_LOCKED;
        }
-       if (kn->kn_inuse == 0) {
+
+       if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
                /*
                 * No f_event() in flight anymore, we can leave QoS "Merge" mode
                 *
-                * See knote_should_apply_qos_override()
+                * See knote_adjust_qos()
                 */
                kn->kn_status &= ~KN_MERGE_QOS;
        }
-       if (flags & KNOTE_KQ_UNLOCK) {
-               kqunlock(kq);
-       }
-       if (next_owner_lc) {
-               thread_wakeup_thread(&kn->kn_status, next_owner_lc->knlc_thread);
+       if (kqlocking == KNOTE_KQ_UNLOCK) {
+               kqunlock(kqu);
        }
 #if DEBUG || DEVELOPMENT
        knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
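
The old explicit handoff (thread_handoff_deallocate plus a wait queue threaded through knlc_head) is replaced by the turnstile-backed sleep-with-inheritor primitives, which keep pushing each waiter's priority onto the current lock owner. A minimal sketch of pairing them on a spinlock-protected flag, assuming only the two calls used above; "gate", GATE_HELD, and g->owner are illustrative:

	static void
	gate_wait(struct gate *g)
	{
		lck_spin_lock(&g->lock);
		while (g->flags & GATE_HELD) {
			/* drops the lock and pushes our priority onto g->owner;
			 * LCK_SLEEP_DEFAULT re-takes the lock before returning */
			(void)lck_spin_sleep_with_inheritor(&g->lock,
			    LCK_SLEEP_DEFAULT, (event_t)g, g->owner,
			    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
		}
		lck_spin_unlock(&g->lock);
	}

	static void
	gate_release(struct gate *g)
	{
		thread_t woken = THREAD_NULL;

		lck_spin_lock(&g->lock);
		g->flags &= ~GATE_HELD;
		lck_spin_unlock(&g->lock);

		/* wake one waiter, transferring the inheritance to it */
		if (wakeup_one_with_inheritor((event_t)g, THREAD_AWAKENED,
		    LCK_WAKE_DEFAULT, &woken) == KERN_SUCCESS) {
			thread_deallocate(woken);   /* drop the returned +1 ref */
		}
	}
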
@@ -679,11 +718,11 @@ knote_unlock(struct kqueue *kq, struct knote *kn,
  *
  * Called with the kqueue lock held.
  *
- * Returns with the kqueue lock held according to KNOTE_KQ_* flags
+ * Returns with the kqueue unlocked.
  */
 static void
 knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
-    struct knote_lock_ctx *knlc, int kqlocking)
+    struct knote_lock_ctx *knlc)
 {
        kqlock_held(kq);
 
@@ -691,15 +730,12 @@ knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
        assert(kn->kn_status & KN_LOCKED);
        assert(kn->kn_status & KN_DROPPING);
 
-       LIST_REMOVE(knlc, knlc_le);
+       LIST_REMOVE(knlc, knlc_link);
        kn->kn_status &= ~KN_LOCKED;
+       kqunlock(kq);
 
-       if (kqlocking == KNOTE_KQ_UNLOCK ||
-           kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
-               kqunlock(kq);
-       }
-       if (!TAILQ_EMPTY(&knlc->knlc_head)) {
-               thread_wakeup_with_result(&kn->kn_status, THREAD_RESTART);
+       if (knlc->knlc_waiters) {
+               wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
        }
 #if DEBUG || DEVELOPMENT
        knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
@@ -712,17 +748,23 @@ knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
  * Takes a use count to protect against concurrent drops.
  */
 static void
-knote_call_filter_event(struct kqueue *kq, struct knote *kn, long hint)
+knote_post(struct knote *kn, long hint)
 {
-       int result, dropping = 0;
+       struct kqueue *kq = knote_get_kq(kn);
+       int dropping, result;
 
-       kqlock_held(kq);
+       kqlock(kq);
 
-       if (kn->kn_status & (KN_DROPPING | KN_VANISHED)) {
-               return;
+       if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
+               return kqunlock(kq);
+       }
+
+       if (__improbable(kn->kn_status & KN_POSTING)) {
+               panic("KNOTE() called concurrently on knote %p", kn);
        }
 
-       kn->kn_inuse++;
+       kn->kn_status |= KN_POSTING;
+
        kqunlock(kq);
        result = filter_call(knote_fops(kn), f_event(kn, hint));
        kqlock(kq);
@@ -730,28 +772,26 @@ knote_call_filter_event(struct kqueue *kq, struct knote *kn, long hint)
        dropping = (kn->kn_status & KN_DROPPING);
 
        if (!dropping && (result & FILTER_ACTIVE)) {
-               if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
-                       knote_adjust_qos(kq, kn, result);
-               }
-               knote_activate(kn);
+               knote_activate(kq, kn, result);
        }
 
-       if (--kn->kn_inuse == 0) {
-               if ((kn->kn_status & KN_LOCKED) == 0) {
-                       /*
-                        * We're the last f_event() call and there's no other f_* call in
-                        * flight, we can leave QoS "Merge" mode.
-                        *
-                        * See knote_should_apply_qos_override()
-                        */
-                       kn->kn_status &= ~KN_MERGE_QOS;
-               }
-               if (dropping) {
-                       waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
-                           CAST_EVENT64_T(&kn->kn_inuse),
-                           THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
-               }
+       if ((kn->kn_status & KN_LOCKED) == 0) {
+               /*
+                * There's no other f_* call in flight, we can leave QoS "Merge" mode.
+                *
+                * See knote_adjust_qos()
+                */
+               kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
+       } else {
+               kn->kn_status &= ~KN_POSTING;
        }
+
+       if (__improbable(dropping)) {
+               waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, knote_post_wev64(kn),
+                   THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
+       }
+
+       kqunlock(kq);
 }
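
knote_post() collapses the old kn_inuse use count into a single KN_POSTING state bit: set it under the kq lock, drop the lock around the filter's f_event() callout, then reacquire, clear it, and wake any concurrent dropper. A hedged sketch of this guard-bit-around-callout pattern, with illustrative names ("obj", OBJ_BUSY, OBJ_GONE, do_callout()):

	static void
	obj_post(struct obj *o)
	{
		lck_mtx_lock(&o->lock);
		if (o->state & OBJ_GONE) {
			lck_mtx_unlock(&o->lock);
			return;
		}
		assert((o->state & OBJ_BUSY) == 0);     /* posts must not race */
		o->state |= OBJ_BUSY;
		lck_mtx_unlock(&o->lock);

		do_callout(o);                          /* runs unlocked */

		lck_mtx_lock(&o->lock);
		o->state &= ~OBJ_BUSY;
		if (o->state & OBJ_GONE) {
			wakeup(o);                      /* a dropper is waiting */
		}
		lck_mtx_unlock(&o->lock);
	}
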
 
 /*
@@ -761,7 +801,7 @@ knote_call_filter_event(struct kqueue *kq, struct knote *kn, long hint)
  *     - kq unlocked at exit
  */
 static void
-knote_wait_for_filter_events(struct kqueue *kq, struct knote *kn)
+knote_wait_for_post(struct kqueue *kq, struct knote *kn)
 {
        wait_result_t wr = THREAD_NOT_WAITING;
 
@@ -769,10 +809,10 @@ knote_wait_for_filter_events(struct kqueue *kq, struct knote *kn)
 
        assert(kn->kn_status & KN_DROPPING);
 
-       if (kn->kn_inuse) {
+       if (kn->kn_status & KN_POSTING) {
                wr = waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
-                   CAST_EVENT64_T(&kn->kn_inuse),
-                   THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
+                   knote_post_wev64(kn), THREAD_UNINT | THREAD_WAIT_NOREPORT,
+                   TIMEOUT_WAIT_FOREVER);
        }
        kqunlock(kq);
        if (wr == THREAD_WAITING) {
@@ -780,12 +820,107 @@ knote_wait_for_filter_events(struct kqueue *kq, struct knote *kn)
        }
 }
 
+#pragma mark knote helpers for filters
+
+OS_ALWAYS_INLINE
+void
+knote_set_error(struct knote *kn, int error)
+{
+       kn->kn_flags |= EV_ERROR;
+       kn->kn_sdata = error;
+}
+
+OS_ALWAYS_INLINE
+int64_t
+knote_low_watermark(const struct knote *kn)
+{
+       return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
+}
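
knote_low_watermark() centralizes the NOTE_LOWAT rule: a filter should not report readiness until at least kn_sdata units are available, defaulting to 1. From user space the watermark arrives in the kevent's fflags/data pair; a small runnable example on a socketpair (error handling elided for brevity):

	#include <sys/event.h>
	#include <sys/socket.h>
	#include <sys/time.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int sv[2];
		socketpair(AF_UNIX, SOCK_STREAM, 0, sv);

		int kq = kqueue();
		struct kevent kev;

		/* don't fire EVFILT_READ until >= 128 bytes are buffered */
		EV_SET(&kev, sv[0], EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
		kevent(kq, &kev, 1, NULL, 0, NULL);

		write(sv[1], "hi", 2);                  /* below the watermark */

		struct timespec ts = { 0, 0 };
		int n = kevent(kq, NULL, 0, &kev, 1, &ts);
		printf("events ready: %d\n", n);        /* expect 0 */

		close(kq);
		close(sv[0]);
		close(sv[1]);
		return 0;
	}
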
+
+/*!
+ * @function knote_fill_kevent_with_sdata
+ *
+ * @brief
+ * Fills in a kevent from the current content of a knote.
+ *
+ * @discussion
+ * This is meant to be called from filter's f_event hooks.
+ * The kevent data is filled with kn->kn_sdata.
+ *
+ * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
+ *
+ * Using knote_fill_kevent is typically preferred.
+ */
+OS_ALWAYS_INLINE
+void
+knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
+{
+#define knote_assert_aliases(name1, offs1, name2) \
+       static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
+           offsetof(struct kevent_internal_s, name2), \
+               "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias")
+               "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 " need to alias")
+        * All the code makes assumptions on these aliasing,
+        * All the code makes assumptions about this aliasing,
+        * so make sure we fail the build if we ever break it.
+       knote_assert_aliases(ident, 0, kei_ident);
+#ifdef __LITTLE_ENDIAN__
+       knote_assert_aliases(filter, 0, kei_filter);  // non trivial overlap
+       knote_assert_aliases(filter, 1, kei_filtid);  // non trivial overlap
+#else
+       knote_assert_aliases(filter, 0, kei_filtid);  // non trivial overlap
+       knote_assert_aliases(filter, 1, kei_filter);  // non trivial overlap
+#endif
+       knote_assert_aliases(flags, 0, kei_flags);
+       knote_assert_aliases(qos, 0, kei_qos);
+       knote_assert_aliases(udata, 0, kei_udata);
+       knote_assert_aliases(fflags, 0, kei_fflags);
+       knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap
+       knote_assert_aliases(data, 0, kei_sdata);     // non trivial overlap
+       knote_assert_aliases(ext, 0, kei_ext);
+#undef knote_assert_aliases
+
+       /*
+        * Fix the differences between kevent_qos_s and kevent_internal_s:
+        * - xflags is where kn_sfflags lives; we need to zero it
+        * - fix up the high bits of `filter` where kn_filtid lives
+        */
+       *kev = *(struct kevent_qos_s *)&kn->kn_kevent;
+       kev->xflags = 0;
+       kev->filter |= 0xff00;
+       if (kn->kn_flags & EV_CLEAR) {
+               kn->kn_fflags = 0;
+       }
+}
+
+/*!
+ * @function knote_fill_kevent
+ *
+ * @brief
+ * Fills in a kevent from the current content of a knote.
+ *
+ * @discussion
+ * This is meant to be called from filter's f_event hooks.
+ * The kevent data is filled with the passed in data.
+ *
+ * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
+ */
+OS_ALWAYS_INLINE
+void
+knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
+{
+       knote_fill_kevent_with_sdata(kn, kev);
+       kev->filter = kn->kn_filter;
+       kev->data = data;
+}
+
+
 #pragma mark file_filtops
 
 static int
-filt_fileattach(struct knote *kn, struct kevent_internal_s *kev)
+filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
 {
-       return fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current());
+       return fo_kqfilter(kn->kn_fp, kn, kev);
 }
 
 SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
@@ -820,36 +955,29 @@ filt_kqueue(struct knote *kn, __unused long hint)
 }
 
 static int
-filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
 {
 #pragma unused(kev)
        struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
        int res;
 
        kqlock(kq);
-       kn->kn_data = kq->kq_count;
-       res = (kn->kn_data > 0);
-
+       res = (kq->kq_count > 0);
        kqunlock(kq);
 
        return res;
 }
 
 static int
-filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
        struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
-       int res;
+       int res = 0;
 
        kqlock(kq);
-       kn->kn_data = kq->kq_count;
-       res = (kn->kn_data > 0);
-       if (res) {
-               *kev = kn->kn_kevent;
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_data = 0;
-               }
+       if (kq->kq_count) {
+               knote_fill_kevent(kn, kev, kq->kq_count);
+               res = 1;
        }
        kqunlock(kq);
 
@@ -867,7 +995,7 @@ SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
 #pragma mark proc_filtops
 
 static int
-filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev)
+filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
 {
        struct proc *p;
 
@@ -884,7 +1012,7 @@ filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev)
                return 0;
        }
 
-       const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
+       const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
 
        if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
                do {
@@ -903,9 +1031,11 @@ filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev)
                } while (0);
        }
 
-       proc_klist_lock();
+       kn->kn_proc = p;
+       kn->kn_flags |= EV_CLEAR;       /* automatically set */
+       kn->kn_sdata = 0;               /* incoming data is ignored */
 
-       kn->kn_ptr.p_proc = p;          /* store the proc handle */
+       proc_klist_lock();
 
        KNOTE_ATTACH(&p->p_klist, kn);
 
@@ -933,9 +1063,9 @@ filt_procdetach(struct knote *kn)
 
        proc_klist_lock();
 
-       p = kn->kn_ptr.p_proc;
+       p = kn->kn_proc;
        if (p != PROC_NULL) {
-               kn->kn_ptr.p_proc = PROC_NULL;
+               kn->kn_proc = PROC_NULL;
                KNOTE_DETACH(&p->p_klist, kn);
        }
 
@@ -943,7 +1073,7 @@ filt_procdetach(struct knote *kn)
 }
 
 static int
-filt_proc(struct knote *kn, long hint)
+filt_procevent(struct knote *kn, long hint)
 {
        u_int event;
 
@@ -952,7 +1082,7 @@ filt_proc(struct knote *kn, long hint)
        /*
         * Note: a lot of bits in hint may be obtained from the knote
         * To free some of those bits, see <rdar://problem/12592988> Freeing up
-        * bits in hint for filt_proc
+        * bits in hint for filt_procevent
         *
         * mask off extra data
         */
@@ -967,8 +1097,8 @@ filt_proc(struct knote *kn, long hint)
         * parent and these knotes re-fired.
         */
        if (event & NOTE_EXIT) {
-               if ((kn->kn_ptr.p_proc->p_oppid != 0)
-                   && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
+               if ((kn->kn_proc->p_oppid != 0)
+                   && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_proc->p_ppid)) {
                        /*
                         * This knote is not for the current ptrace(2) parent, ignore.
                         */
@@ -993,52 +1123,52 @@ filt_proc(struct knote *kn, long hint)
 
        /*
         * The kernel has a wrapper in place that returns the same data
-        * as is collected here, in kn_data.  Any changes to how
+        * as is collected here, in kn_hook64.  Any changes to how
         * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
         * should also be reflected in the proc_pidnoteexit() wrapper.
         */
        if (event == NOTE_EXIT) {
-               kn->kn_data = 0;
+               kn->kn_hook64 = 0;
                if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
                        kn->kn_fflags |= NOTE_EXITSTATUS;
-                       kn->kn_data |= (hint & NOTE_PDATAMASK);
+                       kn->kn_hook64 |= (hint & NOTE_PDATAMASK);
                }
                if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
                        kn->kn_fflags |= NOTE_EXIT_DETAIL;
-                       if ((kn->kn_ptr.p_proc->p_lflag &
+                       if ((kn->kn_proc->p_lflag &
                            P_LTERM_DECRYPTFAIL) != 0) {
-                               kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
+                               kn->kn_hook64 |= NOTE_EXIT_DECRYPTFAIL;
                        }
-                       if ((kn->kn_ptr.p_proc->p_lflag &
+                       if ((kn->kn_proc->p_lflag &
                            P_LTERM_JETSAM) != 0) {
-                               kn->kn_data |= NOTE_EXIT_MEMORY;
-                               switch (kn->kn_ptr.p_proc->p_lflag & P_JETSAM_MASK) {
+                               kn->kn_hook64 |= NOTE_EXIT_MEMORY;
+                               switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
                                case P_JETSAM_VMPAGESHORTAGE:
-                                       kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
+                                       kn->kn_hook64 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
                                        break;
                                case P_JETSAM_VMTHRASHING:
-                                       kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
+                                       kn->kn_hook64 |= NOTE_EXIT_MEMORY_VMTHRASHING;
                                        break;
                                case P_JETSAM_FCTHRASHING:
-                                       kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING;
+                                       kn->kn_hook64 |= NOTE_EXIT_MEMORY_FCTHRASHING;
                                        break;
                                case P_JETSAM_VNODE:
-                                       kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
+                                       kn->kn_hook64 |= NOTE_EXIT_MEMORY_VNODE;
                                        break;
                                case P_JETSAM_HIWAT:
-                                       kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
+                                       kn->kn_hook64 |= NOTE_EXIT_MEMORY_HIWAT;
                                        break;
                                case P_JETSAM_PID:
-                                       kn->kn_data |= NOTE_EXIT_MEMORY_PID;
+                                       kn->kn_hook64 |= NOTE_EXIT_MEMORY_PID;
                                        break;
                                case P_JETSAM_IDLEEXIT:
-                                       kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
+                                       kn->kn_hook64 |= NOTE_EXIT_MEMORY_IDLE;
                                        break;
                                }
                        }
-                       if ((kn->kn_ptr.p_proc->p_csflags &
+                       if ((kn->kn_proc->p_csflags &
                            CS_KILLED) != 0) {
-                               kn->kn_data |= NOTE_EXIT_CSERROR;
+                               kn->kn_hook64 |= NOTE_EXIT_CSERROR;
                        }
                }
        }
@@ -1048,7 +1178,7 @@ filt_proc(struct knote *kn, long hint)
 }
 
 static int
-filt_proctouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        int res;
 
@@ -1072,28 +1202,25 @@ filt_proctouch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
-       int res;
+       int res = 0;
 
        proc_klist_lock();
-       res = (kn->kn_fflags != 0);
-       if (res) {
-               *kev = kn->kn_kevent;
-               kn->kn_flags |= EV_CLEAR;       /* automatically set */
-               kn->kn_fflags = 0;
-               kn->kn_data = 0;
+       if (kn->kn_fflags) {
+               knote_fill_kevent(kn, kev, kn->kn_hook64);
+               kn->kn_hook64 = 0;
+               res = 1;
        }
        proc_klist_unlock();
        return res;
 }
 
 SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
-       .f_attach = filt_procattach,
-       .f_detach = filt_procdetach,
-       .f_event = filt_proc,
-       .f_touch = filt_proctouch,
+       .f_attach  = filt_procattach,
+       .f_detach  = filt_procdetach,
+       .f_event   = filt_procevent,
+       .f_touch   = filt_proctouch,
        .f_process = filt_procprocess,
 };
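For context on what this filter delivers to userspace: an observer registers an EVFILT_PROC knote on a pid, and the exit status staged in kn_hook64 above is copied out through the event's data field by filt_procprocess. A minimal userspace sketch (standard kqueue API, assuming the caller is the child's parent so NOTE_EXITSTATUS is permitted):

    #include <sys/event.h>
    #include <sys/wait.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t pid = fork();
            if (pid == 0) {
                    _exit(7);               /* child exits with status 7 */
            }

            int kq = kqueue();
            struct kevent kev;

            /* Watch the child; NOTE_EXITSTATUS asks for the wait(2)-style
             * status to be reported back in the event's data field. */
            EV_SET(&kev, pid, EVFILT_PROC, EV_ADD,
                NOTE_EXIT | NOTE_EXITSTATUS, 0, NULL);
            kevent(kq, &kev, 1, NULL, 0, NULL);

            struct kevent out;
            if (kevent(kq, NULL, 0, &out, 1, NULL) == 1 &&
                (out.fflags & NOTE_EXIT)) {
                    printf("pid %d exited with status %d\n", (int)out.ident,
                        WEXITSTATUS((int)out.data));
            }
            return 0;
    }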
 
@@ -1109,12 +1236,12 @@ struct filt_timer_params {
 /*
  * Values stored in the knote at rest (using Mach absolute time units)
  *
- * kn->kn_hook          where the thread_call object is stored
+ * kn->kn_thcall        where the thread_call object is stored
  * kn->kn_ext[0]        next deadline or 0 if immediate expiration
  * kn->kn_ext[1]        leeway value
  * kn->kn_sdata         interval timer: the interval
  *                      absolute/deadline timer: 0
- * kn->kn_hookid        timer state
+ * kn->kn_hook32        timer state
  *
  * TIMER_IDLE:
  *   The timer has either never been scheduled or been cancelled.
@@ -1164,7 +1291,7 @@ filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
  * Called with timer filter lock held.
  */
 static int
-filt_timervalidate(const struct kevent_internal_s *kev,
+filt_timervalidate(const struct kevent_qos_s *kev,
     struct filt_timer_params *params)
 {
        /*
@@ -1354,13 +1481,13 @@ filt_timerexpire(void *knx, __unused void *spare)
        struct knote *kn = knx;
        int v;
 
-       if (os_atomic_cmpxchgv(&kn->kn_hookid, TIMER_ARMED, TIMER_FIRED,
+       if (os_atomic_cmpxchgv(&kn->kn_hook32, TIMER_ARMED, TIMER_FIRED,
            &v, relaxed)) {
                // our f_event always would say FILTER_ACTIVE,
                // so be leaner and just do it.
                struct kqueue *kq = knote_get_kq(kn);
                kqlock(kq);
-               knote_activate(kn);
+               knote_activate(kq, kn, FILTER_ACTIVE);
                kqunlock(kq);
        } else {
                /*
@@ -1377,9 +1504,9 @@ filt_timerexpire(void *knx, __unused void *spare)
 static void
 filt_timercancel(struct knote *kn)
 {
-       if (os_atomic_xchg(&kn->kn_hookid, TIMER_IDLE, relaxed) == TIMER_ARMED) {
+       if (os_atomic_xchg(&kn->kn_hook32, TIMER_IDLE, relaxed) == TIMER_ARMED) {
                /* cancel the thread call and wait for any filt_timerexpire in flight */
-               thread_call_cancel_wait((thread_call_t)kn->kn_hook);
+               thread_call_cancel_wait(kn->kn_thcall);
        }
 }
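filt_timerexpire and filt_timercancel (with filt_timerarm below) cooperate through a small lock-free state machine over kn_hook32. A sketch of the same protocol using C11 atomics in place of the kernel's os_atomic macros; the names here are illustrative, not the kernel's:

    #include <stdatomic.h>
    #include <stdbool.h>

    enum { TIMER_IDLE, TIMER_ARMED, TIMER_FIRED, TIMER_IMMEDIATE };
    static _Atomic int timer_state = TIMER_IDLE;

    /* Expire path: CAS ARMED -> FIRED. Losing the race means a concurrent
     * cancel already moved the state to IDLE, so the event is dropped. */
    static bool on_expire(void)
    {
            int expected = TIMER_ARMED;
            return atomic_compare_exchange_strong(&timer_state,
                &expected, TIMER_FIRED);
    }

    /* Cancel path: unconditionally return to IDLE, but only wait for the
     * callout if we were the ones to take it out of ARMED. */
    static bool on_cancel(void)
    {
            return atomic_exchange(&timer_state, TIMER_IDLE) == TIMER_ARMED;
    }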
 
@@ -1418,7 +1545,7 @@ filt_timerarm(struct knote *kn)
        int filter_flags = kn->kn_sfflags;
        unsigned int timer_flags = 0;
 
-       assert(os_atomic_load(&kn->kn_hookid, relaxed) == TIMER_IDLE);
+       assert(os_atomic_load(&kn->kn_hook32, relaxed) == TIMER_IDLE);
 
        if (filter_flags & NOTE_CRITICAL) {
                timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
@@ -1436,8 +1563,8 @@ filt_timerarm(struct knote *kn)
                timer_flags |= THREAD_CALL_CONTINUOUS;
        }
 
-       os_atomic_store(&kn->kn_hookid, TIMER_ARMED, relaxed);
-       thread_call_enter_delayed_with_leeway((thread_call_t)kn->kn_hook, NULL,
+       os_atomic_store(&kn->kn_hook32, TIMER_ARMED, relaxed);
+       thread_call_enter_delayed_with_leeway(kn->kn_thcall, NULL,
            deadline, leeway, timer_flags);
 }
 
@@ -1445,7 +1572,7 @@ filt_timerarm(struct knote *kn)
  * Allocate a thread call for the knote's lifetime, and kick off the timer.
  */
 static int
-filt_timerattach(struct knote *kn, struct kevent_internal_s *kev)
+filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
 {
        thread_call_t callout;
        struct filt_timer_params params;
@@ -1466,9 +1593,9 @@ filt_timerattach(struct knote *kn, struct kevent_internal_s *kev)
        }
 
        filt_timer_set_params(kn, &params);
-       kn->kn_hook = callout;
+       kn->kn_thcall = callout;
        kn->kn_flags |= EV_CLEAR;
-       os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed);
+       os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);
 
        /* NOTE_ABSOLUTE implies EV_ONESHOT */
        if (kn->kn_sfflags & NOTE_ABSOLUTE) {
@@ -1476,7 +1603,7 @@ filt_timerattach(struct knote *kn, struct kevent_internal_s *kev)
        }
 
        if (filt_timer_is_ready(kn)) {
-               os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed);
+               os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
                return FILTER_ACTIVE;
        } else {
                filt_timerarm(kn);
@@ -1496,8 +1623,8 @@ filt_timerdetach(struct knote *kn)
         * Unconditionally cancel to make sure there can't be any filt_timerexpire()
         * running anymore.
         */
-       thread_call_cancel_wait((thread_call_t)kn->kn_hook);
-       freed = thread_call_free((thread_call_t)kn->kn_hook);
+       thread_call_cancel_wait(kn->kn_thcall);
+       freed = thread_call_free(kn->kn_thcall);
        assert(freed);
 }
 
@@ -1509,7 +1636,7 @@ filt_timerdetach(struct knote *kn)
  * pops have gone off (in kn_data).
  */
 static int
-filt_timertouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        struct filt_timer_params params;
        uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
@@ -1533,7 +1660,7 @@ filt_timertouch(struct knote *kn, struct kevent_internal_s *kev)
        kn->kn_sfflags = kev->fflags;
 
        if (filt_timer_is_ready(kn)) {
-               os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed);
+               os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
                return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
        } else {
                filt_timerarm(kn);
@@ -1549,10 +1676,7 @@ filt_timertouch(struct knote *kn, struct kevent_internal_s *kev)
  * counters for the next time.
  */
 static int
-filt_timerprocess(
-       struct knote *kn,
-       __unused struct filt_process_s *data,
-       struct kevent_internal_s *kev)
+filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
        /*
         * filt_timerprocess is serialized with any filter routine except for
@@ -1563,7 +1687,7 @@ filt_timerprocess(
         * whether we see any of the "FIRED" state, and if we do, it is safe to
         * do simple state machine transitions.
         */
-       switch (os_atomic_load(&kn->kn_hookid, relaxed)) {
+       switch (os_atomic_load(&kn->kn_hook32, relaxed)) {
        case TIMER_IDLE:
        case TIMER_ARMED:
                /*
@@ -1573,7 +1697,7 @@ filt_timerprocess(
                return 0;
        }
 
-       os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed);
+       os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);
 
        /*
         * Copy out the interesting kevent state,
@@ -1584,13 +1708,11 @@ filt_timerprocess(
         *      - return kn_sfflags in the fflags field so the client can know
         *        under what flags the timer fired
         */
-       *kev = kn->kn_kevent;
+       knote_fill_kevent(kn, kev, 1);
        kev->ext[0] = 0;
        /* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */
 
-       if (kn->kn_sdata == 0) {
-               kev->data = 1;
-       } else {
+       if (kn->kn_sdata != 0) {
                /*
                 * This is a 'repeating' timer, so we have to emit
                 * how many intervals expired between the arm
@@ -1654,7 +1776,7 @@ SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
        .f_extended_codes = true,
        .f_attach   = filt_timerattach,
        .f_detach   = filt_timerdetach,
-       .f_event    = filt_badevent,
+       .f_event    = filt_bad_event,
        .f_touch    = filt_timertouch,
        .f_process  = filt_timerprocess,
 };
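The "repeating timer" accounting in filt_timerprocess is visible from userspace: for an interval timer, the data field of a delivered event counts how many intervals elapsed since the event was last read. A minimal sketch:

    #include <sys/event.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int kq = kqueue();
            struct kevent kev;

            /* 500ms repeating timer (EVFILT_TIMER defaults to milliseconds). */
            EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
            kevent(kq, &kev, 1, NULL, 0, NULL);

            sleep(2);       /* let several intervals elapse before collecting */

            struct kevent out;
            if (kevent(kq, NULL, 0, &out, 1, NULL) == 1) {
                    /* data counts intervals fired since the last read */
                    printf("timer fired %lld time(s)\n", (long long)out.data);
            }
            return 0;
    }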
@@ -1662,24 +1784,18 @@ SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
 #pragma mark user_filtops
 
 static int
-filt_userattach(struct knote *kn, __unused struct kevent_internal_s *kev)
+filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
 {
        if (kn->kn_sfflags & NOTE_TRIGGER) {
-               kn->kn_hookid = FILTER_ACTIVE;
+               kn->kn_hook32 = FILTER_ACTIVE;
        } else {
-               kn->kn_hookid = 0;
+               kn->kn_hook32 = 0;
        }
-       return kn->kn_hookid;
-}
-
-static void
-filt_userdetach(__unused struct knote *kn)
-{
-       /* EVFILT_USER knotes are not attached to anything in the kernel */
+       return kn->kn_hook32;
 }
 
 static int
-filt_usertouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        uint32_t ffctrl;
        int fflags;
@@ -1702,27 +1818,23 @@ filt_usertouch(struct knote *kn, struct kevent_internal_s *kev)
        kn->kn_sdata = kev->data;
 
        if (kev->fflags & NOTE_TRIGGER) {
-               kn->kn_hookid = FILTER_ACTIVE;
+               kn->kn_hook32 = FILTER_ACTIVE;
        }
-       return (int)kn->kn_hookid;
+       return (int)kn->kn_hook32;
 }
 
 static int
-filt_userprocess(
-       struct knote *kn,
-       __unused struct filt_process_s *data,
-       struct kevent_internal_s *kev)
+filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-       int result = (int)kn->kn_hookid;
+       int result = (int)kn->kn_hook32;
 
        if (result) {
-               *kev = kn->kn_kevent;
+               /* EVFILT_USER returns the data that was passed in */
+               knote_fill_kevent_with_sdata(kn, kev);
                kev->fflags = kn->kn_sfflags;
-               kev->data = kn->kn_sdata;
                if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_hookid = 0;
-                       kn->kn_data = 0;
-                       kn->kn_fflags = 0;
+                       /* knote_fill_kevent cleared kn_fflags */
+                       kn->kn_hook32 = 0;
                }
        }
 
@@ -1732,24 +1844,26 @@ filt_userprocess(
 SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
        .f_extended_codes = true,
        .f_attach  = filt_userattach,
-       .f_detach  = filt_userdetach,
-       .f_event   = filt_badevent,
+       .f_detach  = filt_no_detach,
+       .f_event   = filt_bad_event,
        .f_touch   = filt_usertouch,
        .f_process = filt_userprocess,
 };
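EVFILT_USER knotes never attach to a kernel object (hence filt_no_detach above); they fire only when userspace posts NOTE_TRIGGER. A minimal round-trip sketch:

    #include <sys/event.h>
    #include <stdio.h>

    int main(void)
    {
            int kq = kqueue();
            struct kevent kev;

            /* Register a user event; EV_CLEAR so it rearms after delivery. */
            EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
            kevent(kq, &kev, 1, NULL, 0, NULL);

            /* Trigger it (typically done from another thread). */
            EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
            kevent(kq, &kev, 1, NULL, 0, NULL);

            struct kevent out;
            if (kevent(kq, NULL, 0, &out, 1, NULL) == 1) {
                    printf("user event %lu fired\n", (unsigned long)out.ident);
            }
            return 0;
    }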
 
 #pragma mark workloop_filtops
 
+#define EPREEMPTDISABLED (-1)
+
 static inline void
 filt_wllock(struct kqworkloop *kqwl)
 {
-       lck_mtx_lock(&kqwl->kqwl_statelock);
+       lck_spin_lock(&kqwl->kqwl_statelock);
 }
 
 static inline void
 filt_wlunlock(struct kqworkloop *kqwl)
 {
-       lck_mtx_unlock(&kqwl->kqwl_statelock);
+       lck_spin_unlock(&kqwl->kqwl_statelock);
 }
 
 /*
@@ -1766,9 +1880,7 @@ filt_wlunlock(struct kqworkloop *kqwl)
 static inline bool
 filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
 {
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       return (kqr->kqr_state & KQR_THREQUESTED) &&
-              (kqr->kqr_thread == THREAD_NULL);
+       return kqr_thread_requested_pending(&kqwl->kqwl_request);
 }
 
 static void
@@ -1776,7 +1888,7 @@ filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
     turnstile_update_flags_t flags)
 {
        turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
-       struct kqrequest *kqr = &kqwl->kqwl_request;
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
 
        /*
         * binding to the workq should always happen through
@@ -1786,13 +1898,14 @@ filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
 
        if ((inheritor = kqwl->kqwl_owner)) {
                flags |= TURNSTILE_INHERITOR_THREAD;
-       } else if ((inheritor = kqr->kqr_thread)) {
+       } else if ((inheritor = kqr_thread(kqr))) {
                flags |= TURNSTILE_INHERITOR_THREAD;
        }
 
        turnstile_update_inheritor(ts, inheritor, flags);
 }
 
+#define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
 #define FILT_WLATTACH 0
 #define FILT_WLTOUCH  1
 #define FILT_WLDROP   2
@@ -1800,43 +1913,24 @@ filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
 __result_use_check
 static int
 filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
-    struct kevent_internal_s *kev, kq_index_t qos_index, int op)
+    struct kevent_qos_s *kev, kq_index_t qos_index, int op)
 {
        user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
-       struct kqrequest *kqr = &kqwl->kqwl_request;
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
        thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
-       kq_index_t cur_owner_override = THREAD_QOS_UNSPECIFIED;
+       kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
+       int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
        int action = KQWL_UTQ_NONE, error = 0;
-       bool needs_wake = false, needs_wllock = false;
+       bool wl_inheritor_updated = false, needs_wake = false;
        uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
        uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
        uint64_t udata = 0;
+       struct turnstile *ts = TURNSTILE_NULL;
 
-       if (kev->fflags & (NOTE_WL_END_OWNERSHIP | NOTE_WL_DISCOVER_OWNER)) {
-               /*
-                * If we're maybe going to change the kqwl_owner,
-                * then we need to hold the filt_wllock().
-                */
-               needs_wllock = true;
-       } else if (kqr->kqr_thread == current_thread()) {
-               /*
-                * <rdar://problem/41531764> Servicer updates need to be serialized with
-                * any ownership change too, as the kqr_thread value influences the
-                * outcome of handling NOTE_WL_DISCOVER_OWNER.
-                */
-               needs_wllock = true;
-       }
+       filt_wllock(kqwl);
 
-       if (needs_wllock) {
-               filt_wllock(kqwl);
-               /*
-                * The kqwl owner is set under both the req and filter lock,
-                * meaning it's fine to look at it under any.
-                */
-               new_owner = cur_owner = kqwl->kqwl_owner;
-       } else {
-               new_owner = cur_owner = THREAD_NULL;
-       }
+again:
+       new_owner = cur_owner = kqwl->kqwl_owner;
 
        /*
         * Phase 1:
@@ -1853,8 +1947,33 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
         * Lastly decide whether we need to perform a QoS update.
         */
        if (uaddr) {
-               error = copyin_word(uaddr, &udata, sizeof(udata));
-               if (error) {
+               /*
+                * Until <rdar://problem/24999882> exists,
+                * doing the copyin with preemption disabled
+                * forces any vm_fault we encounter to fail.
+                */
+               error = copyin_atomic64(uaddr, &udata);
+
+               /*
+                * If we get EFAULT, drop the lock and retry.
+                * If we still get an error, report it;
+                * otherwise assume the memory has been faulted in
+                * and attempt the copyin under the lock again.
+                */
+               switch (error) {
+               case 0:
+                       break;
+               case EFAULT:
+                       if (efault_retry-- > 0) {
+                               filt_wlunlock(kqwl);
+                               error = copyin_atomic64(uaddr, &udata);
+                               filt_wllock(kqwl);
+                               if (error == 0) {
+                                       goto again;
+                               }
+                       }
+               /* FALLTHROUGH */
+               default:
                        goto out;
                }
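The switch above implements a copyin-under-spinlock idiom: with the state spinlock held (preemption off), the copy fails fast with EFAULT instead of faulting; the code then drops the lock, repeats the copy with faults allowed, and on success retakes the lock and restarts the debounce, since the state may have changed while unlocked. A sketch of the shape, where lock()/unlock() and copy_word() are hypothetical stand-ins for filt_wllock()/filt_wlunlock() and copyin_atomic64():

    #include <stdint.h>

    extern void lock(void), unlock(void);
    /* Fails fast with EFAULT under the spinlock; may fault the page
     * in when called with the lock dropped (preemption enabled). */
    extern int copy_word(uint64_t uaddr, uint64_t *out);

    #define EFAULT_RETRY_COUNT 100

    static int read_debounce_word(uint64_t uaddr, uint64_t *udata)
    {
            int retries = EFAULT_RETRY_COUNT;
            int error;

            lock();
    again:
            error = copy_word(uaddr, udata);
            if (error == EFAULT && retries-- > 0) {
                    unlock();               /* allow the fault to be serviced */
                    error = copy_word(uaddr, udata);
                    lock();
                    if (error == 0) {
                            goto again;     /* re-validate under the lock */
                    }
            }
            unlock();
            return error;
    }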
 
@@ -1873,7 +1992,8 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
                        mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
                        if (name != MACH_PORT_NULL) {
                                name = ipc_entry_name_mask(name);
-                               extra_thread_ref = port_name_to_thread(name);
+                               extra_thread_ref = port_name_to_thread(name,
+                                   PORT_TO_THREAD_IN_CURRENT_TASK);
                                if (extra_thread_ref == THREAD_NULL) {
                                        error = EOWNERDEAD;
                                        goto out;
@@ -1890,7 +2010,7 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
        if (error == 0) {
                if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
                        action = KQWL_UTQ_SET_QOS_INDEX;
-               } else if (qos_index && kqr->kqr_qos_index != qos_index) {
+               } else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
                        action = KQWL_UTQ_SET_QOS_INDEX;
                }
 
@@ -1902,9 +2022,8 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
                         */
                        kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
                        kn->kn_sfflags |= kev->fflags;
-                       kn->kn_sdata = kev->data;
                        if (kev->fflags & NOTE_WL_SYNC_WAKE) {
-                               needs_wake = (kn->kn_hook != THREAD_NULL);
+                               needs_wake = (kn->kn_thread != THREAD_NULL);
                        }
                } else if (op == FILT_WLDROP) {
                        if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
@@ -1914,7 +2033,7 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
                                 * explicitly, issue a wake up.
                                 */
                                kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
-                               needs_wake = (kn->kn_hook != THREAD_NULL);
+                               needs_wake = (kn->kn_thread != THREAD_NULL);
                        }
                }
        }
@@ -1929,10 +2048,10 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
                goto out;
        }
 
-       kq_req_lock(kqwl);
+       kqlock(kqwl);
 
        /* If already tracked as servicer, don't track as owner */
-       if (new_owner == kqr->kqr_thread) {
+       if (new_owner == kqr_thread(kqr)) {
                new_owner = THREAD_NULL;
        }
 
@@ -1942,25 +2061,20 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
                        /* we just transferred this ref to kqwl_owner */
                        extra_thread_ref = THREAD_NULL;
                }
-               cur_owner_override = kqworkloop_owner_override(kqwl);
-
-               if (cur_owner) {
-                       thread_ends_owning_workloop(cur_owner);
-               }
+               cur_override = kqworkloop_override(kqwl);
 
                if (new_owner) {
                        /* override it before we drop the old */
-                       if (cur_owner_override != THREAD_QOS_UNSPECIFIED) {
-                               thread_add_ipc_override(new_owner, cur_owner_override);
+                       if (cur_override != THREAD_QOS_UNSPECIFIED) {
+                               thread_add_kevent_override(new_owner, cur_override);
                        }
-                       thread_starts_owning_workloop(new_owner);
-                       if ((kqr->kqr_state & KQR_THREQUESTED) && !kqr->kqr_thread) {
+                       if (kqr_thread_requested_pending(kqr)) {
                                if (action == KQWL_UTQ_NONE) {
                                        action = KQWL_UTQ_REDRIVE_EVENTS;
                                }
                        }
                } else {
-                       if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_WAKEUP)) == KQR_WAKEUP) {
+                       if (!kqr_thread_requested(kqr) && kqr->tr_kq_wakeup) {
                                if (action == KQWL_UTQ_NONE) {
                                        action = KQWL_UTQ_REDRIVE_EVENTS;
                                }
@@ -1968,13 +2082,11 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
                }
        }
 
-       struct turnstile *ts = kqwl->kqwl_turnstile;
-       bool wl_inheritor_updated = false;
-
        if (action != KQWL_UTQ_NONE) {
                kqworkloop_update_threads_qos(kqwl, action, qos_index);
        }
 
+       ts = kqwl->kqwl_turnstile;
        if (cur_owner != new_owner && ts) {
                if (action == KQWL_UTQ_REDRIVE_EVENTS) {
                        /*
@@ -2012,16 +2124,15 @@ filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
        }
 
        if (needs_wake && ts) {
-               waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T((event_t)kn),
-                   (thread_t)kn->kn_hook, THREAD_AWAKENED);
+               waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
+                   kn->kn_thread, THREAD_AWAKENED);
+               if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
+                       disable_preemption();
+                       error = EPREEMPTDISABLED;
+               }
        }
 
-       kq_req_unlock(kqwl);
-
-       if (wl_inheritor_updated) {
-               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
-               turnstile_deallocate(ts);
-       }
+       kqunlock(kqwl);
 
 out:
        /*
@@ -2029,14 +2140,12 @@ out:
         *
         * Unlock and clean up various lingering references and things.
         */
-       if (needs_wllock) {
-               filt_wlunlock(kqwl);
-       }
+       filt_wlunlock(kqwl);
 
 #if CONFIG_WORKLOOP_DEBUG
        KQWL_HISTORY_WRITE_ENTRY(kqwl, {
                .updater = current_thread(),
-               .servicer = kqr->kqr_thread, /* Note: racy */
+               .servicer = kqr_thread(kqr), /* Note: racy */
                .old_owner = cur_owner,
                .new_owner = new_owner,
 
@@ -2051,15 +2160,19 @@ out:
        });
 #endif // CONFIG_WORKLOOP_DEBUG
 
+       if (wl_inheritor_updated) {
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+               turnstile_deallocate_safe(ts);
+       }
+
        if (cur_owner && new_owner != cur_owner) {
-               if (cur_owner_override != THREAD_QOS_UNSPECIFIED) {
-                       thread_drop_ipc_override(cur_owner);
+               if (cur_override != THREAD_QOS_UNSPECIFIED) {
+                       thread_drop_kevent_override(cur_owner);
                }
-               thread_deallocate(cur_owner);
+               thread_deallocate_safe(cur_owner);
        }
-
        if (extra_thread_ref) {
-               thread_deallocate(extra_thread_ref);
+               thread_deallocate_safe(extra_thread_ref);
        }
        return error;
 }
@@ -2072,67 +2185,122 @@ out:
  * - data is set to the error if any
  */
 static inline void
-filt_wlremember_last_update(struct knote *kn, struct kevent_internal_s *kev,
+filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
     int error)
 {
        kn->kn_fflags = kev->fflags;
-       kn->kn_data = error;
+       kn->kn_sdata = error;
        memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
 }
 
 static int
-filt_wlattach(struct knote *kn, struct kevent_internal_s *kev)
+filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
+    struct kevent_qos_s *kev, int op)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+       uint64_t uaddr = kev->ext[EV_EXTIDX_WL_ADDR];
+       uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
+       uint64_t mask  = kev->ext[EV_EXTIDX_WL_MASK];
+       uint64_t udata = 0;
+       int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
        int error = 0;
-       kq_index_t qos_index = 0;
 
-       if ((kq->kq_state & KQ_WORKLOOP) == 0) {
-               error = ENOTSUP;
-               goto out;
+       if (op == FILT_WLATTACH) {
+               (void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
+       } else if (uaddr == 0) {
+               return 0;
        }
 
-#if DEVELOPMENT || DEBUG
-       if (kev->ident == 0 && kev->udata == 0 && kev->fflags == 0) {
-               struct kqrequest *kqr = &kqwl->kqwl_request;
-
-               kq_req_lock(kqwl);
-               kev->fflags = 0;
-               if (kqr->kqr_dsync_waiters) {
-                       kev->fflags |= NOTE_WL_SYNC_WAIT;
-               }
-               if (kqr->kqr_qos_index) {
-                       kev->fflags |= NOTE_WL_THREAD_REQUEST;
-               }
-               kev->ext[0] = thread_tid(kqwl->kqwl_owner);
-               kev->ext[1] = thread_tid(kqwl->kqwl_request.kqr_thread);
-               kev->ext[2] = thread_owned_workloops_count(current_thread());
-               kev->ext[3] = kn->kn_kevent.ext[3];
-               kq_req_unlock(kqwl);
-               error = EBUSY;
-               goto out;
-       }
-#endif
+       filt_wllock(kqwl);
 
-       int command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
-       switch (command) {
-       case NOTE_WL_THREAD_REQUEST:
-               if (kn->kn_id != kqwl->kqwl_dynamicid) {
-                       error = EINVAL;
-                       goto out;
-               }
-               qos_index = _pthread_priority_thread_qos(kn->kn_qos);
-               if (qos_index == THREAD_QOS_UNSPECIFIED) {
-                       error = ERANGE;
+again:
+
+       /*
+        * Do the debounce check; the lock serializing the state is the knote lock.
+        */
+       if (uaddr) {
+               /*
+                * Until <rdar://problem/24999882> exists,
+                * doing the copyin with preemption disabled
+                * forces any vm_fault we encounter to fail.
+                */
+               error = copyin_atomic64(uaddr, &udata);
+
+               /*
+                * If we get EFAULT, drop the lock and retry.
+                * If we still get an error, report it;
+                * otherwise assume the memory has been faulted in
+                * and attempt the copyin under the lock again.
+                */
+               switch (error) {
+               case 0:
+                       break;
+               case EFAULT:
+                       if (efault_retry-- > 0) {
+                               filt_wlunlock(kqwl);
+                               error = copyin_atomic64(uaddr, &udata);
+                               filt_wllock(kqwl);
+                               if (error == 0) {
+                                       goto again;
+                               }
+                       }
+               /* FALLTHROUGH */
+               default:
                        goto out;
                }
-               if (kqwl->kqwl_request.kqr_qos_index) {
-                       /*
-                        * There already is a thread request, and well, you're only allowed
-                        * one per workloop, so fail the attach.
-                        */
-                       error = EALREADY;
+
+               kev->ext[EV_EXTIDX_WL_VALUE] = udata;
+               kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;
+
+               if ((udata & mask) != (kdata & mask)) {
+                       error = ESTALE;
+                       goto out;
+               }
+       }
+
+       if (op == FILT_WLATTACH) {
+               error = filt_wlattach_sync_ipc(kn);
+               if (error == 0) {
+                       disable_preemption();
+                       error = EPREEMPTDISABLED;
+               }
+       }
+
+out:
+       filt_wlunlock(kqwl);
+       return error;
+}
+
+static int
+filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
+{
+       struct kqueue *kq = knote_get_kq(kn);
+       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+       int error = 0, result = 0;
+       kq_index_t qos_index = 0;
+
+       if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {
+               error = ENOTSUP;
+               goto out;
+       }
+
+       uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
+       switch (command) {
+       case NOTE_WL_THREAD_REQUEST:
+               if (kn->kn_id != kqwl->kqwl_dynamicid) {
+                       error = EINVAL;
+                       goto out;
+               }
+               qos_index = _pthread_priority_thread_qos(kn->kn_qos);
+               if (qos_index == THREAD_QOS_UNSPECIFIED) {
+                       error = ERANGE;
+                       goto out;
+               }
+               if (kqwl->kqwl_request.tr_kq_qos_index) {
+                       /*
+                        * There already is a thread request, and well, you're only allowed
+                        * one per workloop, so fail the attach.
+                        */
+                       error = EALREADY;
                        goto out;
                }
                break;
@@ -2151,13 +2319,32 @@ filt_wlattach(struct knote *kn, struct kevent_internal_s *kev)
                        goto out;
                }
                break;
+
+       case NOTE_WL_SYNC_IPC:
+               if ((kn->kn_flags & EV_DISABLE) == 0) {
+                       error = EINVAL;
+                       goto out;
+               }
+               if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {
+                       error = EINVAL;
+                       goto out;
+               }
+               break;
        default:
                error = EINVAL;
                goto out;
        }
 
-       error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
+       if (command == NOTE_WL_SYNC_IPC) {
+               error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
+       } else {
+               error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
+       }
 
+       if (error == EPREEMPTDISABLED) {
+               error = 0;
+               result = FILTER_THREADREQ_NODEFEER;
+       }
 out:
        if (error) {
                /* If userland wants ESTALE to be hidden, fail the attach anyway */
@@ -2165,10 +2352,10 @@ out:
                        error = 0;
                }
                knote_set_error(kn, error);
-               return 0;
+               return result;
        }
        if (command == NOTE_WL_SYNC_WAIT) {
-               return kevent_register_wait_prepare(kn, kev);
+               return kevent_register_wait_prepare(kn, kev, result);
        }
        /* Just attaching the thread request successfully will fire it */
        if (command == NOTE_WL_THREAD_REQUEST) {
@@ -2177,28 +2364,26 @@ out:
                 * so delivering an event needs to also consume it.
                 */
                kn->kn_flags |= EV_CLEAR;
-               return FILTER_ACTIVE;
+               return result | FILTER_ACTIVE;
        }
-       return 0;
+       return result;
 }
 
 static void __dead2
 filt_wlwait_continue(void *parameter, wait_result_t wr)
 {
        struct _kevent_register *cont_args = parameter;
-       struct kqworkloop *kqwl = (struct kqworkloop *)cont_args->kq;
-       struct kqrequest *kqr = &kqwl->kqwl_request;
+       struct kqworkloop *kqwl = cont_args->kqwl;
 
-       kq_req_lock(kqwl);
-       kqr->kqr_dsync_waiters--;
+       kqlock(kqwl);
        if (filt_wlturnstile_interlock_is_workq(kqwl)) {
                workq_kern_threadreq_lock(kqwl->kqwl_p);
-               turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL);
+               turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
                workq_kern_threadreq_unlock(kqwl->kqwl_p);
        } else {
-               turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL);
+               turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
        }
-       kq_req_unlock(kqwl);
+       kqunlock(kqwl);
 
        turnstile_cleanup();
 
@@ -2217,17 +2402,15 @@ filt_wlwait_continue(void *parameter, wait_result_t wr)
  * calls filt_wlwait_continue through a continuation.
  */
 static void __dead2
-filt_wlpost_register_wait(struct uthread *uth, struct knote_lock_ctx *knlc,
+filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
     struct _kevent_register *cont_args)
 {
-       struct kqworkloop *kqwl = (struct kqworkloop *)cont_args->kq;
-       struct kqrequest *kqr = &kqwl->kqwl_request;
+       struct kqworkloop *kqwl = cont_args->kqwl;
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
        struct turnstile *ts;
        bool workq_locked = false;
 
-       kq_req_lock(kqwl);
-
-       kqr->kqr_dsync_waiters++;
+       kqlock_held(kqwl);
 
        if (filt_wlturnstile_interlock_is_workq(kqwl)) {
                workq_kern_threadreq_lock(kqwl->kqwl_p);
@@ -2259,20 +2442,19 @@ filt_wlpost_register_wait(struct uthread *uth, struct knote_lock_ctx *knlc,
        }
 
        thread_set_pending_block_hint(uth->uu_thread, kThreadWaitWorkloopSyncWait);
-       waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(cont_args->knote),
+       waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
            THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
 
        if (workq_locked) {
                workq_kern_threadreq_unlock(kqwl->kqwl_p);
        }
 
-       thread_t thread = kqwl->kqwl_owner ?: kqr->kqr_thread;
+       thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
        if (thread) {
                thread_reference(thread);
        }
-       kq_req_unlock(kqwl);
 
-       kevent_register_wait_block(ts, thread, knlc, filt_wlwait_continue, cont_args);
+       kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
 }
 
 /* called in stackshot context to report the thread responsible for blocking this thread */
@@ -2283,28 +2465,26 @@ kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
        struct knote *kn = (struct knote *)event;
        assert(kdp_is_in_zone(kn, "knote zone"));
 
-       assert(kn->kn_hook == thread);
+       assert(kn->kn_thread == thread);
 
        struct kqueue *kq = knote_get_kq(kn);
        assert(kdp_is_in_zone(kq, "kqueue workloop zone"));
        assert(kq->kq_state & KQ_WORKLOOP);
 
        struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-       struct kqrequest *kqr = &kqwl->kqwl_request;
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
 
        thread_t kqwl_owner = kqwl->kqwl_owner;
-       thread_t servicer = kqr->kqr_thread;
 
        if (kqwl_owner != THREAD_NULL) {
                assert(kdp_is_in_zone(kqwl_owner, "threads"));
 
                waitinfo->owner = thread_tid(kqwl->kqwl_owner);
-       } else if (servicer != THREAD_NULL) {
-               assert(kdp_is_in_zone(servicer, "threads"));
-
-               waitinfo->owner = thread_tid(servicer);
-       } else if (kqr->kqr_state & KQR_THREQUESTED) {
+       } else if (kqr_thread_requested_pending(kqr)) {
                waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
+       } else if (kqr->tr_state >= WORKQ_TR_STATE_BINDING) {
+               assert(kdp_is_in_zone(kqr->tr_thread, "threads"));
+               waitinfo->owner = thread_tid(kqr->tr_thread);
        } else {
                waitinfo->owner = 0;
        }
@@ -2313,20 +2493,21 @@ kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
 }
 
 static void
-filt_wldetach(__assert_only struct knote *kn)
+filt_wldetach(struct knote *kn)
 {
-       assert(knote_get_kq(kn)->kq_state & KQ_WORKLOOP);
-       if (kn->kn_hook) {
+       if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
+               filt_wldetach_sync_ipc(kn);
+       } else if (kn->kn_thread) {
                kevent_register_wait_cleanup(kn);
        }
 }
 
 static int
-filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_internal_s *kev,
+filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
     thread_qos_t *qos_index)
 {
-       int new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
-       int sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
+       uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
+       uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
 
        if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
                return EINVAL;
@@ -2367,6 +2548,15 @@ sync_checks:
                }
                break;
 
+       case NOTE_WL_SYNC_IPC:
+               if (sav_commands != NOTE_WL_SYNC_IPC) {
+                       return EINVAL;
+               }
+               if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
+                       return EINVAL;
+               }
+               break;
+
        default:
                return EINVAL;
        }
@@ -2374,48 +2564,54 @@ sync_checks:
 }
 
 static int
-filt_wltouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
        thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
+       int result = 0;
 
        int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
        if (error) {
                goto out;
        }
 
-       error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
-       filt_wlremember_last_update(kn, kev, error);
-       if (error) {
-               goto out;
+       uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
+       if (command == NOTE_WL_SYNC_IPC) {
+               error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
+       } else {
+               error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
+               filt_wlremember_last_update(kn, kev, error);
+       }
+       if (error == EPREEMPTDISABLED) {
+               error = 0;
+               result = FILTER_THREADREQ_NODEFEER;
        }
 
 out:
        if (error) {
                if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
                        /* If userland wants ESTALE to be hidden, do not activate */
-                       return 0;
+                       return result;
                }
                kev->flags |= EV_ERROR;
                kev->data = error;
-               return 0;
+               return result;
        }
-       int command = kev->fflags & NOTE_WL_COMMANDS_MASK;
        if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
-               return kevent_register_wait_prepare(kn, kev);
+               return kevent_register_wait_prepare(kn, kev, result);
        }
        /* Just touching the thread request successfully will fire it */
        if (command == NOTE_WL_THREAD_REQUEST) {
                if (kev->fflags & NOTE_WL_UPDATE_QOS) {
-                       return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
+                       result |= FILTER_UPDATE_REQ_QOS;
                }
-               return FILTER_ACTIVE;
+               result |= FILTER_ACTIVE;
        }
-       return 0;
+       return result;
 }
 
 static bool
-filt_wlallow_drop(struct knote *kn, struct kevent_internal_s *kev)
+filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
 {
        struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
 
@@ -2424,11 +2620,14 @@ filt_wlallow_drop(struct knote *kn, struct kevent_internal_s *kev)
                goto out;
        }
 
-       error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
-       filt_wlremember_last_update(kn, kev, error);
-       if (error) {
-               goto out;
+       uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
+       if (command == NOTE_WL_SYNC_IPC) {
+               error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
+       } else {
+               error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
+               filt_wlremember_last_update(kn, kev, error);
        }
+       assert(error != EPREEMPTDISABLED);
 
 out:
        if (error) {
@@ -2443,17 +2642,14 @@ out:
 }
 
 static int
-filt_wlprocess(
-       struct knote *kn,
-       __unused struct filt_process_s *data,
-       struct kevent_internal_s *kev)
+filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
        struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
        int rc = 0;
 
        assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
 
-       filt_wllock(kqwl);
+       kqlock(kqwl);
 
        if (kqwl->kqwl_owner) {
                /*
@@ -2464,9 +2660,7 @@ filt_wlprocess(
                 * When that happens, the automatic deactivation due to process
                 * would swallow the event, so we have to activate the knote again.
                 */
-               kqlock(kqwl);
-               knote_activate(kn);
-               kqunlock(kqwl);
+               knote_activate(kqwl, kn, FILTER_ACTIVE);
        } else {
 #if DEBUG || DEVELOPMENT
                if (kevent_debug_flags() & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
@@ -2478,7 +2672,7 @@ filt_wlprocess(
                        task_t t = current_task();
                        uint64_t val;
                        if (addr && task_is_active(t) && !task_is_halting(t) &&
-                           copyin_word(addr, &val, sizeof(val)) == 0 &&
+                           copyin_atomic64(addr, &val) == 0 &&
                            val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
                            (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
                                panic("kevent: workloop %#016llx is not enqueued "
@@ -2487,14 +2681,12 @@ filt_wlprocess(
                        }
                }
 #endif
-               *kev = kn->kn_kevent;
+               knote_fill_kevent(kn, kev, 0);
                kev->fflags = kn->kn_sfflags;
-               kev->data = kn->kn_sdata;
-               kev->qos = kn->kn_qos;
                rc |= FILTER_ACTIVE;
        }
 
-       filt_wlunlock(kqwl);
+       kqunlock(kqwl);
 
        if (rc & FILTER_ACTIVE) {
                workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
@@ -2506,410 +2698,292 @@ SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
        .f_extended_codes = true,
        .f_attach  = filt_wlattach,
        .f_detach  = filt_wldetach,
-       .f_event   = filt_badevent,
+       .f_event   = filt_bad_event,
        .f_touch   = filt_wltouch,
        .f_process = filt_wlprocess,
        .f_allow_drop = filt_wlallow_drop,
        .f_post_register_wait = filt_wlpost_register_wait,
 };
 
-#pragma mark kevent / knotes
+#pragma mark - kqueues allocation and deallocation
 
-/*
- * JMM - placeholder for not-yet-implemented filters
+/*!
+ * @enum kqworkloop_dealloc_flags_t
+ *
+ * @brief
+ * Flags that alter kqworkloop_dealloc() behavior.
+ *
+ * @const KQWL_DEALLOC_NONE
+ * Convenient name for "no flags".
+ *
+ * @const KQWL_DEALLOC_SKIP_HASH_REMOVE
+ * Do not remove the workloop from the hash table.
+ * This is used for process tear-down codepaths as the workloops have been
+ * removed by the caller already.
  */
-static int
-filt_badevent(struct knote *kn, long hint)
+OS_OPTIONS(kqworkloop_dealloc_flags, unsigned,
+    KQWL_DEALLOC_NONE               = 0x0000,
+    KQWL_DEALLOC_SKIP_HASH_REMOVE   = 0x0001,
+    );
+
+static void
+kqworkloop_dealloc(struct kqworkloop *, kqworkloop_dealloc_flags_t, uint32_t);
+
+OS_NOINLINE OS_COLD OS_NORETURN
+static void
+kqworkloop_retain_panic(struct kqworkloop *kqwl, uint32_t previous)
 {
-       panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
-       return 0;
+       if (previous == 0) {
+               panic("kq(%p) resurrection", kqwl);
+       } else {
+               panic("kq(%p) retain overflow", kqwl);
+       }
 }
 
-static int
-filt_badattach(__unused struct knote *kn, __unused struct kevent_internal_s *kev)
+OS_NOINLINE OS_COLD OS_NORETURN
+static void
+kqworkloop_release_panic(struct kqworkloop *kqwl)
 {
-       knote_set_error(kn, ENOTSUP);
-       return 0;
+       panic("kq(%p) over-release", kqwl);
 }
 
-struct kqueue *
-kqueue_alloc(struct proc *p, unsigned int flags)
+OS_ALWAYS_INLINE
+static inline bool
+kqworkloop_try_retain(struct kqworkloop *kqwl)
 {
-       struct filedesc *fdp = p->p_fd;
-       struct kqueue *kq = NULL;
-       int policy;
-       void *hook = NULL;
-
-       if (flags & KEVENT_FLAG_WORKQ) {
-               struct kqworkq *kqwq;
-               int i;
-
-               kqwq = (struct kqworkq *)zalloc(kqworkq_zone);
-               if (kqwq == NULL) {
-                       return NULL;
-               }
-
-               kq = &kqwq->kqwq_kqueue;
-               bzero(kqwq, sizeof(struct kqworkq));
-
-               kqwq->kqwq_state = KQ_WORKQ;
-
-               for (i = 0; i < KQWQ_NBUCKETS; i++) {
-                       TAILQ_INIT(&kqwq->kqwq_queue[i]);
-               }
-               for (i = 0; i < KQWQ_NBUCKETS; i++) {
-                       if (i != KQWQ_QOS_MANAGER) {
-                               /*
-                                * Because of how the bucketized system works, we mix overcommit
-                                * sources with not overcommit: each time we move a knote from
-                                * one bucket to the next due to overrides, we'd had to track
-                                * overcommitness, and it's really not worth it in the workloop
-                                * enabled world that track this faithfully.
-                                *
-                                * Incidentally, this behaves like the original manager-based
-                                * kqwq where event delivery always happened (hence is
-                                * "overcommit")
-                                */
-                               kqwq->kqwq_request[i].kqr_state |= KQR_THOVERCOMMIT;
-                       }
-                       kqwq->kqwq_request[i].kqr_qos_index = i;
-                       TAILQ_INIT(&kqwq->kqwq_request[i].kqr_suppressed);
+       uint32_t old_ref, new_ref;
+       os_atomic_rmw_loop(&kqwl->kqwl_retains, old_ref, new_ref, relaxed, {
+               if (__improbable(old_ref == 0)) {
+                       os_atomic_rmw_loop_give_up(return false);
                }
-
-               policy = SYNC_POLICY_FIFO;
-               hook = (void *)kqwq;
-       } else if (flags & KEVENT_FLAG_WORKLOOP) {
-               struct kqworkloop *kqwl;
-               int i;
-
-               kqwl = (struct kqworkloop *)zalloc(kqworkloop_zone);
-               if (kqwl == NULL) {
-                       return NULL;
+               if (__improbable(old_ref >= KQ_WORKLOOP_RETAINS_MAX)) {
+                       kqworkloop_retain_panic(kqwl, old_ref);
                }
+               new_ref = old_ref + 1;
+       });
+       return true;
+}
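os_atomic_rmw_loop above expands to a compare-and-swap loop. The equivalent shape in C11 atomics, where RETAINS_MAX is a hypothetical bound standing in for KQ_WORKLOOP_RETAINS_MAX:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define RETAINS_MAX 0x00ffffffu   /* hypothetical overflow guard */

    /* Take a reference only if the count is still nonzero; a zero count
     * means the object is already being deallocated and must not be
     * resurrected. */
    static bool try_retain(_Atomic uint32_t *retains)
    {
            uint32_t old_ref = atomic_load_explicit(retains,
                memory_order_relaxed);
            do {
                    if (old_ref == 0) {
                            return false;   /* give up: no resurrection */
                    }
                    if (old_ref >= RETAINS_MAX) {
                            abort();        /* refcount overflow */
                    }
            } while (!atomic_compare_exchange_weak_explicit(retains,
                &old_ref, old_ref + 1,
                memory_order_relaxed, memory_order_relaxed));
            return true;
    }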
 
-               bzero(kqwl, sizeof(struct kqworkloop));
-
-               kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC;
-               kqwl->kqwl_retains = 1; /* donate a retain to creator */
-               kqwl->kqwl_request.kqr_state = KQR_WORKLOOP;
+OS_ALWAYS_INLINE
+static inline void
+kqworkloop_retain(struct kqworkloop *kqwl)
+{
+       uint32_t previous = os_atomic_inc_orig(&kqwl->kqwl_retains, relaxed);
+       if (__improbable(previous == 0 || previous >= KQ_WORKLOOP_RETAINS_MAX)) {
+               kqworkloop_retain_panic(kqwl, previous);
+       }
+}
 
-               kq = &kqwl->kqwl_kqueue;
-               for (i = 0; i < KQWL_NBUCKETS; i++) {
-                       TAILQ_INIT(&kqwl->kqwl_queue[i]);
-               }
-               TAILQ_INIT(&kqwl->kqwl_request.kqr_suppressed);
+OS_ALWAYS_INLINE
+static inline void
+kqueue_retain(kqueue_t kqu)
+{
+       if (kqu.kq->kq_state & KQ_DYNAMIC) {
+               kqworkloop_retain(kqu.kqwl);
+       }
+}
 
-               lck_mtx_init(&kqwl->kqwl_statelock, kq_lck_grp, kq_lck_attr);
+OS_ALWAYS_INLINE
+static inline void
+kqworkloop_release_live(struct kqworkloop *kqwl)
+{
+       uint32_t refs = os_atomic_dec_orig(&kqwl->kqwl_retains, relaxed);
+       if (__improbable(refs <= 1)) {
+               kqworkloop_release_panic(kqwl);
+       }
+}
 
-               policy = SYNC_POLICY_FIFO;
-               hook = (void *)kqwl;
-       } else {
-               struct kqfile *kqf;
+OS_ALWAYS_INLINE
+static inline void
+kqueue_release_live(kqueue_t kqu)
+{
+       if (kqu.kq->kq_state & KQ_DYNAMIC) {
+               kqworkloop_release_live(kqu.kqwl);
+       }
+}
 
-               kqf = (struct kqfile *)zalloc(kqfile_zone);
-               if (kqf == NULL) {
-                       return NULL;
-               }
+OS_ALWAYS_INLINE
+static inline void
+kqworkloop_release(struct kqworkloop *kqwl)
+{
+       uint32_t refs = os_atomic_dec_orig(&kqwl->kqwl_retains, relaxed);
 
-               kq = &kqf->kqf_kqueue;
-               bzero(kqf, sizeof(struct kqfile));
-               TAILQ_INIT(&kqf->kqf_queue);
-               TAILQ_INIT(&kqf->kqf_suppressed);
+       if (__improbable(refs <= 1)) {
+               kqworkloop_dealloc(kqwl, KQWL_DEALLOC_NONE, refs - 1);
+       }
+}
 
-               policy = SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST;
+OS_ALWAYS_INLINE
+static inline void
+kqueue_release(kqueue_t kqu)
+{
+       if (kqu.kq->kq_state & KQ_DYNAMIC) {
+               kqworkloop_release(kqu.kqwl);
        }
+}
 
-       waitq_set_init(&kq->kq_wqs, policy, NULL, hook);
-       lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
-       lck_spin_init(&kq->kq_reqlock, kq_lck_grp, kq_lck_attr);
-       kq->kq_p = p;
+/*!
+ * @function kqueue_destroy
+ *
+ * @brief
+ * Common part to all kqueue dealloc functions.
+ */
+OS_NOINLINE
+static void
+kqueue_destroy(kqueue_t kqu, zone_t zone)
+{
+       /*
+        * waitq_set_deinit() removes the KQ's waitq set from
+        * any select sets to which it may belong.
+        *
+        * The order of these deinits matters: before waitq_set_deinit() returns,
+        * waitq_set__CALLING_PREPOST_HOOK__ may be called and it will take the
+        * kq_lock.
+        */
+       waitq_set_deinit(&kqu.kq->kq_wqs);
+       lck_spin_destroy(&kqu.kq->kq_lock, kq_lck_grp);
 
-       if (fdp->fd_knlistsize < 0) {
-               proc_fdlock(p);
-               if (fdp->fd_knlistsize < 0) {
-                       fdp->fd_knlistsize = 0; /* this process has had a kq */
-               }
-               proc_fdunlock(p);
-       }
+       zfree(zone, kqu.kq);
+}
 
-       return kq;
+/*!
+ * @function kqueue_init
+ *
+ * @brief
+ * Common part to all kqueue alloc functions.
+ */
+static kqueue_t
+kqueue_init(kqueue_t kqu, waitq_set_prepost_hook_t *hook, int policy)
+{
+       waitq_set_init(&kqu.kq->kq_wqs, policy, NULL, hook);
+       lck_spin_init(&kqu.kq->kq_lock, kq_lck_grp, kq_lck_attr);
+       return kqu;
 }
 
-/*
- * knotes_dealloc - detach all knotes for the process and drop them
+#pragma mark kqfile allocation and deallocation
+
+/*!
+ * @function kqueue_dealloc
  *
- *             Called with proc_fdlock held.
- *             Returns with it locked.
- *             May drop it temporarily.
- *             Process is in such a state that it will not try to allocate
- *             any more knotes during this process (stopped for exit or exec).
+ * @brief
+ * Detach all knotes from a kqfile and free it.
+ *
+ * @discussion
+ * We walk each list looking for knotes referencing
+ * this kqueue.  If we find one, we try to drop it.  But
+ * if we fail to get a drop reference, that will wait
+ * until it is dropped.  So, we can just restart again
+ * safe in the assumption that the list will eventually
+ * not contain any more references to this kqueue (either
+ * we dropped them all, or someone else did).
+ *
+ * Assumes no new events are being added to the kqueue.
+ * Nothing locked on entry or exit.
  */
 void
-knotes_dealloc(proc_t p)
+kqueue_dealloc(struct kqueue *kq)
 {
+       KNOTE_LOCK_CTX(knlc);
+       struct proc *p = kq->kq_p;
        struct filedesc *fdp = p->p_fd;
-       struct kqueue *kq;
        struct knote *kn;
-       struct  klist *kn_hash = NULL;
-       int i;
 
-       /* Close all the fd-indexed knotes up front */
-       if (fdp->fd_knlistsize > 0) {
-               for (i = 0; i < fdp->fd_knlistsize; i++) {
-                       while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
-                               kq = knote_get_kq(kn);
+       assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
+
+       proc_fdlock(p);
+       for (int i = 0; i < fdp->fd_knlistsize; i++) {
+               kn = SLIST_FIRST(&fdp->fd_knlist[i]);
+               while (kn != NULL) {
+                       if (kq == knote_get_kq(kn)) {
                                kqlock(kq);
                                proc_fdunlock(p);
-                               knote_drop(kq, kn, NULL);
+                               if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
+                                       knote_drop(kq, kn, &knlc);
+                               }
                                proc_fdlock(p);
+                               /* start over at beginning of list */
+                               kn = SLIST_FIRST(&fdp->fd_knlist[i]);
+                               continue;
                        }
+                       kn = SLIST_NEXT(kn, kn_link);
                }
-               /* free the table */
-               FREE(fdp->fd_knlist, M_KQUEUE);
-               fdp->fd_knlist = NULL;
        }
-       fdp->fd_knlistsize = -1;
 
-       knhash_lock(p);
+       knhash_lock(fdp);
        proc_fdunlock(p);
 
-       /* Clean out all the hashed knotes as well */
        if (fdp->fd_knhashmask != 0) {
-               for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
-                       while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
-                               kq = knote_get_kq(kn);
-                               kqlock(kq);
-                               knhash_unlock(p);
-                               knote_drop(kq, kn, NULL);
-                               knhash_lock(p);
+               for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
+                       kn = SLIST_FIRST(&fdp->fd_knhash[i]);
+                       while (kn != NULL) {
+                               if (kq == knote_get_kq(kn)) {
+                                       kqlock(kq);
+                                       knhash_unlock(fdp);
+                                       if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
+                                               knote_drop(kq, kn, &knlc);
+                                       }
+                                       knhash_lock(fdp);
+                                       /* start over at beginning of list */
+                                       kn = SLIST_FIRST(&fdp->fd_knhash[i]);
+                                       continue;
+                               }
+                               kn = SLIST_NEXT(kn, kn_link);
                        }
                }
-               kn_hash = fdp->fd_knhash;
-               fdp->fd_knhashmask = 0;
-               fdp->fd_knhash = NULL;
-       }
-
-       knhash_unlock(p);
-
-       /* free the kn_hash table */
-       if (kn_hash) {
-               FREE(kn_hash, M_KQUEUE);
        }
+       knhash_unlock(fdp);
 
-       proc_fdlock(p);
+       kqueue_destroy(kq, kqfile_zone);
 }
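
The restart-at-head idiom above, re-reading SLIST_FIRST() after every drop, is what makes it safe to release proc_fdlock/knhash_lock inside the loop: a next pointer cached across the unlock could be freed by a concurrent drop. A distilled pthread sketch of the same shape (generic names, not the kernel API):

    #include <pthread.h>
    #include <stdlib.h>
    #include <sys/queue.h>

    struct node { SLIST_ENTRY(node) link; int key; };
    SLIST_HEAD(nodelist, node);

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Remove and free every node matching `key`.  The per-node work may
     * drop the list lock, so after each removal we restart from the head
     * rather than trusting a cached next pointer. */
    static void
    drop_matching(struct nodelist *head, int key)
    {
            struct node *n;

            pthread_mutex_lock(&list_lock);
    restart:
            SLIST_FOREACH(n, head, link) {
                    if (n->key == key) {
                            SLIST_REMOVE(head, n, node, link);
                            pthread_mutex_unlock(&list_lock); /* work may block */
                            free(n);
                            pthread_mutex_lock(&list_lock);
                            goto restart;   /* list may have changed meanwhile */
                    }
            }
            pthread_mutex_unlock(&list_lock);
    }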
 
-/*
- * kqworkloop_invalidate
- *
- * Invalidate ownership of a workloop.
- *
- * This is meant to be used so that any remnant of overrides and ownership
- * information is dropped before a kqworkloop can no longer be found in the
- * global hash table and have ghost workloop ownership left over.
+/*!
+ * @function kqueue_alloc
  *
- * Possibly returns a thread to deallocate in a safe context.
+ * @brief
+ * Allocate a kqfile.
  */
-static thread_t
-kqworkloop_invalidate(struct kqworkloop *kqwl)
+struct kqueue *
+kqueue_alloc(struct proc *p)
 {
-       thread_t cur_owner = kqwl->kqwl_owner;
+       struct kqfile *kqf;
 
-       assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed));
-       if (cur_owner) {
-               /*
-                * If the kqueue had an owner that prevented the thread request to
-                * go through, then no unbind happened, and we may have lingering
-                * overrides to drop.
-                */
-               if (kqworkloop_owner_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
-                       thread_drop_ipc_override(cur_owner);
-               }
-               thread_ends_owning_workloop(cur_owner);
-               kqwl->kqwl_owner = THREAD_NULL;
+       kqf = (struct kqfile *)zalloc(kqfile_zone);
+       if (__improbable(kqf == NULL)) {
+               return NULL;
        }
+       bzero(kqf, sizeof(struct kqfile));
+
+       /*
+        * kqfiles are created with kqueue(), so we need to wait for
+        * the first kevent syscall to know which bit among
+        * KQ_KEV_{32,64,QOS} will be set in kqf_state.
+        */
+       kqf->kqf_p = p;
+       TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
+       TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);
 
-       return cur_owner;
+       return kqueue_init(kqf, NULL, SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST).kq;
 }
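
TAILQ_INIT_AFTER_BZERO appears to be an XNU <sys/queue.h> helper that finishes initializing a queue head whose storage was just zeroed: a NULL tqh_first is already the correct empty value, so only the back pointer remains to be set. Assumed definition:

    /* Assumed definition: TAILQ_INIT() minus the tqh_first = NULL store
     * that bzero() has already performed. */
    #define TAILQ_INIT_AFTER_BZERO(head) do {                           \
            (head)->tqh_last = &(head)->tqh_first;                      \
    } while (0)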
 
-/*
- * kqueue_dealloc - detach all knotes from a kqueue and free it
- *
- *     We walk each list looking for knotes referencing this
- *     this kqueue.  If we find one, we try to drop it.  But
- *     if we fail to get a drop reference, that will wait
- *     until it is dropped.  So, we can just restart again
- *     safe in the assumption that the list will eventually
- *     not contain any more references to this kqueue (either
- *     we dropped them all, or someone else did).
- *
- *     Assumes no new events are being added to the kqueue.
- *     Nothing locked on entry or exit.
- *
- * Workloop kqueues cant get here unless all the knotes
- * are already gone and all requested threads have come
- * and gone (cancelled or arrived).
+/*!
+ * @function kqueue_internal
+ *
+ * @brief
+ * Core implementation for kqueue() and guarded_kqueue_np().
  */
-void
-kqueue_dealloc(struct kqueue *kq)
+int
+kqueue_internal(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
 {
-       struct proc *p;
-       struct filedesc *fdp;
-       struct knote *kn;
-       int i;
+       struct kqueue *kq;
+       struct fileproc *fp;
+       int fd, error;
 
-       if (kq == NULL) {
-               return;
-       }
-
-       p = kq->kq_p;
-       fdp = p->p_fd;
-
-       /*
-        * Workloops are refcounted by their knotes, so there's no point
-        * spending a lot of time under these locks just to deallocate one.
-        */
-       if ((kq->kq_state & KQ_WORKLOOP) == 0) {
-               KNOTE_LOCK_CTX(knlc);
-
-               proc_fdlock(p);
-               for (i = 0; i < fdp->fd_knlistsize; i++) {
-                       kn = SLIST_FIRST(&fdp->fd_knlist[i]);
-                       while (kn != NULL) {
-                               if (kq == knote_get_kq(kn)) {
-                                       kqlock(kq);
-                                       proc_fdunlock(p);
-                                       if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
-                                               knote_drop(kq, kn, &knlc);
-                                       }
-                                       proc_fdlock(p);
-                                       /* start over at beginning of list */
-                                       kn = SLIST_FIRST(&fdp->fd_knlist[i]);
-                                       continue;
-                               }
-                               kn = SLIST_NEXT(kn, kn_link);
-                       }
-               }
-
-               knhash_lock(p);
-               proc_fdunlock(p);
-
-               if (fdp->fd_knhashmask != 0) {
-                       for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
-                               kn = SLIST_FIRST(&fdp->fd_knhash[i]);
-                               while (kn != NULL) {
-                                       if (kq == knote_get_kq(kn)) {
-                                               kqlock(kq);
-                                               knhash_unlock(p);
-                                               if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
-                                                       knote_drop(kq, kn, &knlc);
-                                               }
-                                               knhash_lock(p);
-                                               /* start over at beginning of list */
-                                               kn = SLIST_FIRST(&fdp->fd_knhash[i]);
-                                               continue;
-                                       }
-                                       kn = SLIST_NEXT(kn, kn_link);
-                               }
-                       }
-               }
-               knhash_unlock(p);
-       }
-
-       if (kq->kq_state & KQ_WORKLOOP) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-               thread_t cur_owner = kqworkloop_invalidate(kqwl);
-
-               if (cur_owner) {
-                       thread_deallocate(cur_owner);
-               }
-
-               if (kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) {
-                       struct turnstile *ts;
-                       turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, &ts);
-                       turnstile_cleanup();
-                       turnstile_deallocate(ts);
-               } else {
-                       assert(kqwl->kqwl_turnstile == NULL);
-               }
-       }
-
-       /*
-        * waitq_set_deinit() remove the KQ's waitq set from
-        * any select sets to which it may belong.
-        */
-       waitq_set_deinit(&kq->kq_wqs);
-       lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
-       lck_spin_destroy(&kq->kq_reqlock, kq_lck_grp);
-
-       if (kq->kq_state & KQ_WORKQ) {
-               zfree(kqworkq_zone, kq);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-
-               assert(kqwl->kqwl_retains == 0);
-               lck_mtx_destroy(&kqwl->kqwl_statelock, kq_lck_grp);
-               zfree(kqworkloop_zone, kqwl);
-       } else {
-               zfree(kqfile_zone, kq);
-       }
-}
-
-static inline void
-kqueue_retain(struct kqueue *kq)
-{
-       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-       uint32_t previous;
-
-       if ((kq->kq_state & KQ_DYNAMIC) == 0) {
-               return;
-       }
-
-       previous = OSIncrementAtomic(&kqwl->kqwl_retains);
-       if (previous == KQ_WORKLOOP_RETAINS_MAX) {
-               panic("kq(%p) retain overflow", kq);
-       }
-
-       if (previous == 0) {
-               panic("kq(%p) resurrection", kq);
-       }
-}
-
-#define KQUEUE_CANT_BE_LAST_REF  0
-#define KQUEUE_MIGHT_BE_LAST_REF 1
-
-static inline int
-kqueue_release(kqueue_t kqu, __assert_only int possibly_last)
-{
-       if ((kqu.kq->kq_state & KQ_DYNAMIC) == 0) {
-               return 0;
-       }
-
-       assert(kqu.kq->kq_state & KQ_WORKLOOP); /* for now */
-       uint32_t refs = OSDecrementAtomic(&kqu.kqwl->kqwl_retains);
-       if (__improbable(refs == 0)) {
-               panic("kq(%p) over-release", kqu.kq);
-       }
-       if (refs == 1) {
-               assert(possibly_last);
-       }
-       return refs == 1;
-}
-
-int
-kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
-{
-       struct kqueue *kq;
-       struct fileproc *fp;
-       int fd, error;
-
-       error = falloc_withalloc(p,
-           &fp, &fd, vfs_context_current(), fp_zalloc, cra);
+       error = falloc_withalloc(p, &fp, &fd, vfs_context_current(), fp_zalloc, cra);
        if (error) {
                return error;
        }
 
-       kq = kqueue_alloc(p, 0);
+       kq = kqueue_alloc(p);
        if (kq == NULL) {
                fp_free(p, fd, fp);
                return ENOMEM;
@@ -2930,631 +3004,489 @@ kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
        return error;
 }
 
+/*!
+ * @function kqueue
+ *
+ * @brief
+ * The kqueue syscall.
+ */
 int
 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
 {
-       return kqueue_body(p, fileproc_alloc_init, NULL, retval);
+       return kqueue_internal(p, fileproc_alloc_init, NULL, retval);
 }
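
For context, the userspace side of this entry point: a minimal program that registers interest in stdin and blocks for one event, using only the public kqueue API:

    #include <sys/event.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            int kq = kqueue();              /* lands in kqueue_internal() above */
            if (kq < 0) {
                    perror("kqueue");
                    return 1;
            }

            struct kevent change, event;
            EV_SET(&change, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, NULL);

            /* register the change and wait for one event (NULL = no timeout) */
            if (kevent(kq, &change, 1, &event, 1, NULL) < 0) {
                    perror("kevent");
                    return 1;
            }
            printf("fd %lu is readable (%ld bytes pending)\n",
                (unsigned long)event.ident, (long)event.data);
            close(kq);
            return 0;
    }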
 
-static int
-kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p,
-    unsigned int flags)
-{
-       int advance;
-       int error;
-
-       if (flags & KEVENT_FLAG_LEGACY32) {
-               bzero(kevp, sizeof(*kevp));
+#pragma mark kqworkq allocation and deallocation
 
-               if (IS_64BIT_PROCESS(p)) {
-                       struct user64_kevent kev64;
+/*!
+ * @function kqworkq_dealloc
+ *
+ * @brief
+ * Deallocates a workqueue kqueue.
+ *
+ * @discussion
+ * This only happens at process death, or for races with concurrent
+ * kevent_get_kqwq calls, hence we don't have to care about knotes
+ * referencing this kqueue: either there are none, or someone else
+ * took care of them.
+ */
+void
+kqworkq_dealloc(struct kqworkq *kqwq)
+{
+       kqueue_destroy(kqwq, kqworkq_zone);
+}
 
-                       advance = sizeof(kev64);
-                       error = copyin(*addrp, (caddr_t)&kev64, advance);
-                       if (error) {
-                               return error;
-                       }
-                       kevp->ident = kev64.ident;
-                       kevp->filter = kev64.filter;
-                       kevp->flags = kev64.flags;
-                       kevp->udata = kev64.udata;
-                       kevp->fflags = kev64.fflags;
-                       kevp->data = kev64.data;
-               } else {
-                       struct user32_kevent kev32;
+/*!
+ * @function kqworkq_alloc
+ *
+ * @brief
+ * Allocates a workqueue kqueue.
+ *
+ * @discussion
+ * This is the slow path of kevent_get_kqwq.
+ * It takes care of making sure each proc has a single workq kqueue.
+ */
+OS_NOINLINE
+static struct kqworkq *
+kqworkq_alloc(struct proc *p, unsigned int flags)
+{
+       struct kqworkq *kqwq, *tmp;
 
-                       advance = sizeof(kev32);
-                       error = copyin(*addrp, (caddr_t)&kev32, advance);
-                       if (error) {
-                               return error;
-                       }
-                       kevp->ident = (uintptr_t)kev32.ident;
-                       kevp->filter = kev32.filter;
-                       kevp->flags = kev32.flags;
-                       kevp->udata = CAST_USER_ADDR_T(kev32.udata);
-                       kevp->fflags = kev32.fflags;
-                       kevp->data = (intptr_t)kev32.data;
-               }
-       } else if (flags & KEVENT_FLAG_LEGACY64) {
-               struct kevent64_s kev64;
+       kqwq = (struct kqworkq *)zalloc(kqworkq_zone);
+       if (__improbable(kqwq == NULL)) {
+               return NULL;
+       }
+       bzero(kqwq, sizeof(struct kqworkq));
 
-               bzero(kevp, sizeof(*kevp));
+       assert((flags & KEVENT_FLAG_LEGACY32) == 0);
+       if (flags & KEVENT_FLAG_LEGACY64) {
+               kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
+       } else {
+               kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;
+       }
+       kqwq->kqwq_p = p;
 
-               advance = sizeof(struct kevent64_s);
-               error = copyin(*addrp, (caddr_t)&kev64, advance);
-               if (error) {
-                       return error;
+       for (int i = 0; i < KQWQ_NBUCKETS; i++) {
+               TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
+               TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
+       }
+       for (int i = 0; i < KQWQ_NBUCKETS; i++) {
+               /*
+                * Because of how the bucketized system works, we mix overcommit
+                * sources with non-overcommit ones: each time we move a knote
+                * from one bucket to the next due to overrides, we would have
+                * to track overcommitness, and it's really not worth it in a
+                * workloop-enabled world to track this faithfully.
+                *
+                * Incidentally, this behaves like the original manager-based
+                * kqwq, where event delivery always happened (hence is
+                * "overcommit").
+                */
+               kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
+               kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
+               if (i != KQWQ_QOS_MANAGER) {
+                       kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
                }
-               kevp->ident = kev64.ident;
-               kevp->filter = kev64.filter;
-               kevp->flags = kev64.flags;
-               kevp->udata = kev64.udata;
-               kevp->fflags = kev64.fflags;
-               kevp->data = kev64.data;
-               kevp->ext[0] = kev64.ext[0];
-               kevp->ext[1] = kev64.ext[1];
-       } else {
-               struct kevent_qos_s kevqos;
+               kqwq->kqwq_request[i].tr_kq_qos_index = i;
+       }
 
-               bzero(kevp, sizeof(*kevp));
+       kqueue_init(kqwq, &kqwq->kqwq_waitq_hook, SYNC_POLICY_FIFO);
 
-               advance = sizeof(struct kevent_qos_s);
-               error = copyin(*addrp, (caddr_t)&kevqos, advance);
-               if (error) {
-                       return error;
-               }
-               kevp->ident = kevqos.ident;
-               kevp->filter = kevqos.filter;
-               kevp->flags = kevqos.flags;
-               kevp->qos = kevqos.qos;
-//             kevp->xflags = kevqos.xflags;
-               kevp->udata = kevqos.udata;
-               kevp->fflags = kevqos.fflags;
-               kevp->data = kevqos.data;
-               kevp->ext[0] = kevqos.ext[0];
-               kevp->ext[1] = kevqos.ext[1];
-               kevp->ext[2] = kevqos.ext[2];
-               kevp->ext[3] = kevqos.ext[3];
-       }
-       if (!error) {
-               *addrp += advance;
+       if (!os_atomic_cmpxchgv(&p->p_fd->fd_wqkqueue, NULL, kqwq, &tmp, release)) {
+               kqworkq_dealloc(kqwq);
+               return tmp;
        }
-       return error;
-}
 
-static int
-kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p,
-    unsigned int flags)
-{
-       user_addr_t addr = *addrp;
-       int advance;
-       int error;
+       return kqwq;
+}
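
The os_atomic_cmpxchgv(..., release) above publishes the fully initialized kqwq into fd_wqkqueue only if that field is still NULL; a racing loser frees its copy and adopts the winner's, which cmpxchgv deposited in tmp. The same publish-or-discard idiom in portable C11 (generic names):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct thing { int ready; };

    static _Atomic(struct thing *) shared;

    /* Return the process-wide singleton, creating it on first use. */
    static struct thing *
    get_singleton(void)
    {
            struct thing *cur = atomic_load_explicit(&shared, memory_order_acquire);
            if (cur != NULL) {
                    return cur;
            }
            struct thing *fresh = calloc(1, sizeof(*fresh));
            if (fresh == NULL) {
                    return NULL;
            }
            fresh->ready = 1;
            struct thing *expected = NULL;
            /* release: all stores to *fresh become visible to acquirers */
            if (atomic_compare_exchange_strong_explicit(&shared, &expected,
                fresh, memory_order_release, memory_order_acquire)) {
                    return fresh;   /* won the race: ours is published */
            }
            free(fresh);            /* lost: discard ours, adopt the winner's */
            return expected;        /* cmpxchg left the observed value here */
    }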
 
-       /*
-        * fully initialize the differnt output event structure
-        * types from the internal kevent (and some universal
-        * defaults for fields not represented in the internal
-        * form).
-        */
-       if (flags & KEVENT_FLAG_LEGACY32) {
-               assert((flags & KEVENT_FLAG_STACK_EVENTS) == 0);
+#pragma mark kqworkloop allocation and deallocation
 
-               if (IS_64BIT_PROCESS(p)) {
-                       struct user64_kevent kev64;
+#define KQ_HASH(val, mask)  (((val) ^ (val >> 8)) & (mask))
+#define CONFIG_KQ_HASHSIZE  CONFIG_KN_HASHSIZE
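
KQ_HASH folds bits from above the mask into the bucket index, so ids differing only in their higher bytes still spread across buckets. A worked example, assuming a hypothetical mask of 0xff:

    /* KQ_HASH(0x1200, 0xff) = (0x1200 ^ 0x12) & 0xff = 0x12
     * KQ_HASH(0x3400, 0xff) = (0x3400 ^ 0x34) & 0xff = 0x34
     * A plain (id & mask) would have sent both ids to bucket 0. */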
 
-                       advance = sizeof(kev64);
-                       bzero(&kev64, advance);
+OS_ALWAYS_INLINE
+static inline void
+kqhash_lock(struct filedesc *fdp)
+{
+       lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);
+}
 
-                       /*
-                        * deal with the special case of a user-supplied
-                        * value of (uintptr_t)-1.
-                        */
-                       kev64.ident = (kevp->ident == (uintptr_t)-1) ?
-                           (uint64_t)-1LL : (uint64_t)kevp->ident;
-
-                       kev64.filter = kevp->filter;
-                       kev64.flags = kevp->flags;
-                       kev64.fflags = kevp->fflags;
-                       kev64.data = (int64_t) kevp->data;
-                       kev64.udata = kevp->udata;
-                       error = copyout((caddr_t)&kev64, addr, advance);
-               } else {
-                       struct user32_kevent kev32;
-
-                       advance = sizeof(kev32);
-                       bzero(&kev32, advance);
-                       kev32.ident = (uint32_t)kevp->ident;
-                       kev32.filter = kevp->filter;
-                       kev32.flags = kevp->flags;
-                       kev32.fflags = kevp->fflags;
-                       kev32.data = (int32_t)kevp->data;
-                       kev32.udata = kevp->udata;
-                       error = copyout((caddr_t)&kev32, addr, advance);
-               }
-       } else if (flags & KEVENT_FLAG_LEGACY64) {
-               struct kevent64_s kev64;
+OS_ALWAYS_INLINE
+static inline void
+kqhash_unlock(struct filedesc *fdp)
+{
+       lck_mtx_unlock(&fdp->fd_kqhashlock);
+}
 
-               advance = sizeof(struct kevent64_s);
-               if (flags & KEVENT_FLAG_STACK_EVENTS) {
-                       addr -= advance;
-               }
-               bzero(&kev64, advance);
-               kev64.ident = kevp->ident;
-               kev64.filter = kevp->filter;
-               kev64.flags = kevp->flags;
-               kev64.fflags = kevp->fflags;
-               kev64.data = (int64_t) kevp->data;
-               kev64.udata = kevp->udata;
-               kev64.ext[0] = kevp->ext[0];
-               kev64.ext[1] = kevp->ext[1];
-               error = copyout((caddr_t)&kev64, addr, advance);
-       } else {
-               struct kevent_qos_s kevqos;
-
-               advance = sizeof(struct kevent_qos_s);
-               if (flags & KEVENT_FLAG_STACK_EVENTS) {
-                       addr -= advance;
-               }
-               bzero(&kevqos, advance);
-               kevqos.ident = kevp->ident;
-               kevqos.filter = kevp->filter;
-               kevqos.flags = kevp->flags;
-               kevqos.qos = kevp->qos;
-               kevqos.udata = kevp->udata;
-               kevqos.fflags = kevp->fflags;
-               kevqos.xflags = 0;
-               kevqos.data = (int64_t) kevp->data;
-               kevqos.ext[0] = kevp->ext[0];
-               kevqos.ext[1] = kevp->ext[1];
-               kevqos.ext[2] = kevp->ext[2];
-               kevqos.ext[3] = kevp->ext[3];
-               error = copyout((caddr_t)&kevqos, addr, advance);
-       }
-       if (!error) {
-               if (flags & KEVENT_FLAG_STACK_EVENTS) {
-                       *addrp = addr;
-               } else {
-                       *addrp = addr + advance;
-               }
-       }
-       return error;
+OS_ALWAYS_INLINE
+static inline void
+kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
+    struct kqworkloop *kqwl)
+{
+       struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
+       LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
 }
 
-static int
-kevent_get_data_size(
-       struct proc *p,
-       uint64_t data_available,
-       unsigned int flags,
-       user_size_t *residp)
+OS_ALWAYS_INLINE
+static inline struct kqworkloop *
+kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
 {
-       user_size_t resid;
-       int error = 0;
+       struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
+       struct kqworkloop *kqwl;
 
-       if (data_available != USER_ADDR_NULL) {
-               if (flags & KEVENT_FLAG_KERNEL) {
-                       resid = *(user_size_t *)(uintptr_t)data_available;
-               } else if (IS_64BIT_PROCESS(p)) {
-                       user64_size_t usize;
-                       error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
-                       resid = (user_size_t)usize;
-               } else {
-                       user32_size_t usize;
-                       error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
-                       resid = (user_size_t)usize;
-               }
-               if (error) {
-                       return error;
+       LIST_FOREACH(kqwl, list, kqwl_hashlink) {
+               if (kqwl->kqwl_dynamicid == id) {
+                       return kqwl;
                }
-       } else {
-               resid = 0;
        }
-       *residp = resid;
-       return 0;
+       return NULL;
 }
 
-static int
-kevent_put_data_size(
-       struct proc *p,
-       uint64_t data_available,
-       unsigned int flags,
-       user_size_t resid)
+static struct kqworkloop *
+kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
 {
-       int error = 0;
+       struct kqworkloop *kqwl = NULL;
 
-       if (data_available) {
-               if (flags & KEVENT_FLAG_KERNEL) {
-                       *(user_size_t *)(uintptr_t)data_available = resid;
-               } else if (IS_64BIT_PROCESS(p)) {
-                       user64_size_t usize = (user64_size_t)resid;
-                       error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
-               } else {
-                       user32_size_t usize = (user32_size_t)resid;
-                       error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
+       kqhash_lock(fdp);
+       if (__probable(fdp->fd_kqhash)) {
+               kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
+               if (kqwl && !kqworkloop_try_retain(kqwl)) {
+                       kqwl = NULL;
                }
        }
-       return error;
+       kqhash_unlock(fdp);
+       return kqwl;
 }
 
-/*
- * kevent_continue - continue a kevent syscall after blocking
- *
- *     assume we inherit a use count on the kq fileglob.
- */
-__attribute__((noreturn))
+OS_NOINLINE
 static void
-kevent_continue(__unused struct kqueue *kq, void *data, int error)
+kqworkloop_hash_init(struct filedesc *fdp)
 {
-       struct _kevent *cont_args;
-       struct fileproc *fp;
-       uint64_t data_available;
-       user_size_t data_size;
-       user_size_t data_resid;
-       unsigned int flags;
-       int32_t *retval;
-       int noutputs;
-       int fd;
-       struct proc *p = current_proc();
-
-       cont_args = (struct _kevent *)data;
-       data_available = cont_args->data_available;
-       flags = cont_args->process_data.fp_flags;
-       data_size = cont_args->process_data.fp_data_size;
-       data_resid = cont_args->process_data.fp_data_resid;
-       noutputs = cont_args->eventout;
-       retval = cont_args->retval;
-       fd = cont_args->fd;
-       fp = cont_args->fp;
-
-       kevent_put_kq(p, fd, fp, kq);
-
-       /* don't abandon other output just because of residual copyout failures */
-       if (error == 0 && data_available && data_resid != data_size) {
-               (void)kevent_put_data_size(p, data_available, flags, data_resid);
-       }
+       struct kqwllist *alloc_hash;
+       u_long alloc_mask;
 
-       /* don't restart after signals... */
-       if (error == ERESTART) {
-               error = EINTR;
-       } else if (error == EWOULDBLOCK) {
-               error = 0;
-       }
-       if (error == 0) {
-               *retval = noutputs;
+       kqhash_unlock(fdp);
+       alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
+       kqhash_lock(fdp);
+
+       /* See if we won the race */
+       if (__probable(fdp->fd_kqhashmask == 0)) {
+               fdp->fd_kqhash = alloc_hash;
+               fdp->fd_kqhashmask = alloc_mask;
+       } else {
+               kqhash_unlock(fdp);
+               FREE(alloc_hash, M_KQUEUE);
+               kqhash_lock(fdp);
        }
-       unix_syscall_return(error);
 }
 
-/*
- * kevent - [syscall] register and wait for kernel events
+/*!
+ * @function kqworkloop_dealloc
+ *
+ * @brief
+ * Deallocates a workloop kqueue.
+ *
+ * @discussion
+ * Knotes hold references on the workloop, so we can't really reach this
+ * function unless all of these are already gone.
+ *
+ * Nothing locked on entry or exit.
  *
+ * @param flags
+ * Unless KQWL_DEALLOC_SKIP_HASH_REMOVE is set, the workloop is removed
+ * from its hash table.
+ *
+ * @param current_ref
+ * This function is also called to undo a kqworkloop_alloc in case of
+ * allocation races; current_ref is the refcount that is expected on the
+ * workloop object at that point, usually 0, and 1 when a dealloc race is
+ * resolved.
  */
-int
-kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
+static void
+kqworkloop_dealloc(struct kqworkloop *kqwl, kqworkloop_dealloc_flags_t flags,
+    uint32_t current_ref)
 {
-       unsigned int flags = KEVENT_FLAG_LEGACY32;
+       thread_t cur_owner;
 
-       return kevent_internal(p,
-                  (kqueue_id_t)uap->fd, NULL,
-                  uap->changelist, uap->nchanges,
-                  uap->eventlist, uap->nevents,
-                  0ULL, 0ULL,
-                  flags,
-                  uap->timeout,
-                  kevent_continue,
-                  retval);
-}
+       if (__improbable(current_ref > 1)) {
+               kqworkloop_release_panic(kqwl);
+       }
+       assert(kqwl->kqwl_retains == current_ref);
 
-int
-kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
-{
-       unsigned int flags;
+       /* pair with kqunlock() and other kq locks */
+       os_atomic_thread_fence(acquire);
 
-       /* restrict to user flags and set legacy64 */
-       flags = uap->flags & KEVENT_FLAG_USER;
-       flags |= KEVENT_FLAG_LEGACY64;
+       cur_owner = kqwl->kqwl_owner;
+       if (cur_owner) {
+               if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
+                       thread_drop_kevent_override(cur_owner);
+               }
+               thread_deallocate(cur_owner);
+               kqwl->kqwl_owner = THREAD_NULL;
+       }
 
-       return kevent_internal(p,
-                  (kqueue_id_t)uap->fd, NULL,
-                  uap->changelist, uap->nchanges,
-                  uap->eventlist, uap->nevents,
-                  0ULL, 0ULL,
-                  flags,
-                  uap->timeout,
-                  kevent_continue,
-                  retval);
-}
+       if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
+               struct turnstile *ts;
+               turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
+                   &ts, TURNSTILE_WORKLOOPS);
+               turnstile_cleanup();
+               turnstile_deallocate(ts);
+       }
 
-int
-kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
-{
-       /* restrict to user flags */
-       uap->flags &= KEVENT_FLAG_USER;
-
-       return kevent_internal(p,
-                  (kqueue_id_t)uap->fd, NULL,
-                  uap->changelist, uap->nchanges,
-                  uap->eventlist, uap->nevents,
-                  uap->data_out, (uint64_t)uap->data_available,
-                  uap->flags,
-                  0ULL,
-                  kevent_continue,
-                  retval);
-}
+       if ((flags & KQWL_DEALLOC_SKIP_HASH_REMOVE) == 0) {
+               struct filedesc *fdp = kqwl->kqwl_p->p_fd;
 
-int
-kevent_qos_internal(struct proc *p, int fd,
-    user_addr_t changelist, int nchanges,
-    user_addr_t eventlist, int nevents,
-    user_addr_t data_out, user_size_t *data_available,
-    unsigned int flags,
-    int32_t *retval)
-{
-       return kevent_internal(p,
-                  (kqueue_id_t)fd, NULL,
-                  changelist, nchanges,
-                  eventlist, nevents,
-                  data_out, (uint64_t)data_available,
-                  (flags | KEVENT_FLAG_KERNEL),
-                  0ULL,
-                  NULL,
-                  retval);
-}
+               kqhash_lock(fdp);
+               LIST_REMOVE(kqwl, kqwl_hashlink);
+               kqhash_unlock(fdp);
+       }
 
-int
-kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
-{
-       /* restrict to user flags */
-       uap->flags &= KEVENT_FLAG_USER;
-
-       return kevent_internal(p,
-                  (kqueue_id_t)uap->id, NULL,
-                  uap->changelist, uap->nchanges,
-                  uap->eventlist, uap->nevents,
-                  uap->data_out, (uint64_t)uap->data_available,
-                  (uap->flags | KEVENT_FLAG_DYNAMIC_KQUEUE),
-                  0ULL,
-                  kevent_continue,
-                  retval);
-}
+       assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
+       assert(kqwl->kqwl_owner == THREAD_NULL);
+       assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);
 
-int
-kevent_id_internal(struct proc *p, kqueue_id_t *id,
-    user_addr_t changelist, int nchanges,
-    user_addr_t eventlist, int nevents,
-    user_addr_t data_out, user_size_t *data_available,
-    unsigned int flags,
-    int32_t *retval)
-{
-       return kevent_internal(p,
-                  *id, id,
-                  changelist, nchanges,
-                  eventlist, nevents,
-                  data_out, (uint64_t)data_available,
-                  (flags | KEVENT_FLAG_KERNEL | KEVENT_FLAG_DYNAMIC_KQUEUE),
-                  0ULL,
-                  NULL,
-                  retval);
+       lck_spin_destroy(&kqwl->kqwl_statelock, kq_lck_grp);
+       kqueue_destroy(kqwl, kqworkloop_zone);
 }
 
-static int
-kevent_get_timeout(struct proc *p,
-    user_addr_t utimeout,
-    unsigned int flags,
-    struct timeval *atvp)
+/*!
+ * @function kqworkloop_init
+ *
+ * @brief
+ * Initializes a freshly allocated workloop kqueue.
+ */
+static void
+kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
+    kqueue_id_t id, workq_threadreq_param_t *trp)
 {
-       struct timeval atv;
-       int error = 0;
+       bzero(kqwl, sizeof(struct kqworkloop));
 
-       if (flags & KEVENT_FLAG_IMMEDIATE) {
-               getmicrouptime(&atv);
-       } else if (utimeout != USER_ADDR_NULL) {
-               struct timeval rtv;
-               if (flags & KEVENT_FLAG_KERNEL) {
-                       struct timespec *tsp = (struct timespec *)utimeout;
-                       TIMESPEC_TO_TIMEVAL(&rtv, tsp);
-               } else if (IS_64BIT_PROCESS(p)) {
-                       struct user64_timespec ts;
-                       error = copyin(utimeout, &ts, sizeof(ts));
-                       if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0) {
-                               error = EINVAL;
-                       } else {
-                               TIMESPEC_TO_TIMEVAL(&rtv, &ts);
-                       }
-               } else {
-                       struct user32_timespec ts;
-                       error = copyin(utimeout, &ts, sizeof(ts));
-                       TIMESPEC_TO_TIMEVAL(&rtv, &ts);
-               }
-               if (error) {
-                       return error;
+       kqwl->kqwl_state     = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
+       kqwl->kqwl_retains   = 1; /* donate a retain to creator */
+       kqwl->kqwl_dynamicid = id;
+       kqwl->kqwl_p         = p;
+       if (trp) {
+               kqwl->kqwl_params = trp->trp_value;
+       }
+
+       workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
+       if (trp) {
+               if (trp->trp_flags & TRP_PRIORITY) {
+                       tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
                }
-               if (itimerfix(&rtv)) {
-                       return EINVAL;
+               if (trp->trp_flags) {
+                       tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
                }
-               getmicrouptime(&atv);
-               timevaladd(&atv, &rtv);
-       } else {
-               /* wait forever value */
-               atv.tv_sec = 0;
-               atv.tv_usec = 0;
        }
-       *atvp = atv;
-       return 0;
+       kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
+       kqwl->kqwl_request.tr_flags = tr_flags;
+
+       for (int i = 0; i < KQWL_NBUCKETS; i++) {
+               TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
+       }
+       TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);
+
+       lck_spin_init(&kqwl->kqwl_statelock, kq_lck_grp, kq_lck_attr);
+
+       kqueue_init(kqwl, &kqwl->kqwl_waitq_hook, SYNC_POLICY_FIFO);
 }
 
+/*!
+ * @function kqworkloop_get_or_create
+ *
+ * @brief
+ * Wrapper around kqworkloop_init that handles the uniquing of workloops.
+ *
+ * @returns
+ * 0:      success
+ * EINVAL: invalid parameters
+ * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
+ * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
+ * ENOMEM: allocation failed
+ */
 static int
-kevent_set_kq_mode(struct kqueue *kq, unsigned int flags)
+kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
+    workq_threadreq_param_t *trp, unsigned int flags, struct kqworkloop **kqwlp)
 {
-       /* each kq should only be used for events of one type */
-       kqlock(kq);
-       if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) {
-               if (flags & KEVENT_FLAG_LEGACY32) {
-                       if ((kq->kq_state & KQ_KEV32) == 0) {
-                               kqunlock(kq);
-                               return EINVAL;
-                       }
-               } else if (kq->kq_state & KQ_KEV32) {
-                       kqunlock(kq);
-                       return EINVAL;
-               }
-       } else if (flags & KEVENT_FLAG_LEGACY32) {
-               kq->kq_state |= KQ_KEV32;
-       } else if (flags & KEVENT_FLAG_LEGACY64) {
-               kq->kq_state |= KQ_KEV64;
-       } else {
-               kq->kq_state |= KQ_KEV_QOS;
-       }
-       kqunlock(kq);
-       return 0;
-}
+       struct filedesc *fdp = p->p_fd;
+       struct kqworkloop *alloc_kqwl = NULL;
+       struct kqworkloop *kqwl = NULL;
+       int error = 0;
 
-#define KQ_HASH(val, mask)  (((val) ^ (val >> 8)) & (mask))
-#define CONFIG_KQ_HASHSIZE  CONFIG_KN_HASHSIZE
+       assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
 
-static inline void
-kqhash_lock(proc_t p)
-{
-       lck_mtx_lock_spin_always(&p->p_fd->fd_kqhashlock);
-}
+       if (id == 0 || id == (kqueue_id_t)-1) {
+               return EINVAL;
+       }
 
-static inline void
-kqhash_lock_held(__assert_only proc_t p)
-{
-       LCK_MTX_ASSERT(&p->p_fd->fd_kqhashlock, LCK_MTX_ASSERT_OWNED);
-}
+       for (;;) {
+               kqhash_lock(fdp);
+               if (__improbable(fdp->fd_kqhash == NULL)) {
+                       kqworkloop_hash_init(fdp);
+               }
 
-static inline void
-kqhash_unlock(proc_t p)
-{
-       lck_mtx_unlock(&p->p_fd->fd_kqhashlock);
-}
+               kqwl = kqworkloop_hash_lookup_locked(fdp, id);
+               if (kqwl) {
+                       if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
+                               /*
+                                * If MUST_NOT_EXIST was passed, even if we would have failed
+                                * the try_retain, it could have gone the other way, and
+                                * userspace can't tell. Let'em fix their race.
+                                */
+                               error = EEXIST;
+                               break;
+                       }
 
-static void
-kqueue_hash_init_if_needed(proc_t p)
-{
-       struct filedesc *fdp = p->p_fd;
+                       if (__probable(kqworkloop_try_retain(kqwl))) {
+                               /*
+                                * This is a valid live workloop !
+                                */
+                               *kqwlp = kqwl;
+                               error = 0;
+                               break;
+                       }
+               }
 
-       kqhash_lock_held(p);
+               if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
+                       error = ENOENT;
+                       break;
+               }
 
-       if (__improbable(fdp->fd_kqhash == NULL)) {
-               struct kqlist *alloc_hash;
-               u_long alloc_mask;
+               /*
+                * We didn't find what we were looking for.
+                *
+                * If this is the second time we reach this point (alloc_kqwl != NULL),
+                * then we're done.
+                *
+                * If this is the first time we reach this point (alloc_kqwl == NULL),
+                * then try to allocate one without blocking.
+                */
+               if (__probable(alloc_kqwl == NULL)) {
+                       alloc_kqwl = (struct kqworkloop *)zalloc_noblock(kqworkloop_zone);
+               }
+               if (__probable(alloc_kqwl)) {
+                       kqworkloop_init(alloc_kqwl, p, id, trp);
+                       kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
+                       kqhash_unlock(fdp);
+                       *kqwlp = alloc_kqwl;
+                       return 0;
+               }
 
-               kqhash_unlock(p);
-               alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
-               kqhash_lock(p);
+               /*
+                * We have to block to allocate a workloop: drop the lock,
+                * allocate one, but then retry the lookup, as someone else
+                * could have raced with us.
+                */
+               kqhash_unlock(fdp);
 
-               /* See if we won the race */
-               if (fdp->fd_kqhashmask == 0) {
-                       fdp->fd_kqhash = alloc_hash;
-                       fdp->fd_kqhashmask = alloc_mask;
-               } else {
-                       kqhash_unlock(p);
-                       FREE(alloc_hash, M_KQUEUE);
-                       kqhash_lock(p);
+               alloc_kqwl = (struct kqworkloop *)zalloc(kqworkloop_zone);
+               if (__improbable(!alloc_kqwl)) {
+                       return ENOMEM;
                }
        }
-}
-
-/*
- * Called with the kqhash_lock() held
- */
-static void
-kqueue_hash_insert(
-       struct proc *p,
-       kqueue_id_t id,
-       struct kqueue *kq)
-{
-       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-       struct filedesc *fdp = p->p_fd;
-       struct kqlist *list;
 
-       /* should hold the kq hash lock */
-       kqhash_lock_held(p);
+       kqhash_unlock(fdp);
 
-       if ((kq->kq_state & KQ_DYNAMIC) == 0) {
-               assert(kq->kq_state & KQ_DYNAMIC);
-               return;
+       if (__improbable(alloc_kqwl)) {
+               zfree(kqworkloop_zone, alloc_kqwl);
        }
 
-       /* only dynamically allocate workloop kqs for now */
-       assert(kq->kq_state & KQ_WORKLOOP);
-       assert(fdp->fd_kqhash);
+       return error;
+}
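
Distilled, kqworkloop_get_or_create is the classic lookup-or-insert loop: look up under the lock, retain on a hit, publish a pre-allocated object on a miss, and only drop the lock to allocate, retrying afterwards since another thread may have inserted meanwhile. A compact single-bucket sketch (generic names; the kernel version additionally tries zalloc_noblock while still holding the lock):

    #include <pthread.h>
    #include <stdlib.h>

    struct obj { unsigned long id; int refs; struct obj *next; };

    static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct obj *bucket;      /* one bucket stands in for the hash */

    static struct obj *
    lookup_locked(unsigned long id)
    {
            for (struct obj *o = bucket; o != NULL; o = o->next) {
                    if (o->id == id) {
                            return o;
                    }
            }
            return NULL;
    }

    /* Find the object for `id`, creating it on demand. */
    static struct obj *
    get_or_create(unsigned long id)
    {
            struct obj *spare = NULL, *o;

            for (;;) {
                    pthread_mutex_lock(&hash_lock);
                    o = lookup_locked(id);
                    if (o != NULL) {
                            o->refs++;      /* found: take a reference */
                            pthread_mutex_unlock(&hash_lock);
                            free(spare);    /* lost the race: discard ours */
                            return o;
                    }
                    if (spare != NULL) {    /* won the race: publish ours */
                            spare->id = id;
                            spare->refs = 1;
                            spare->next = bucket;
                            bucket = spare;
                            pthread_mutex_unlock(&hash_lock);
                            return spare;
                    }
                    pthread_mutex_unlock(&hash_lock);
                    spare = calloc(1, sizeof(*spare));  /* may block: retry */
                    if (spare == NULL) {
                            return NULL;
                    }
            }
    }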
 
-       kqwl->kqwl_dynamicid = id;
+#pragma mark - knotes
 
-       list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
-       SLIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
+static int
+filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
+{
+       knote_set_error(kn, ENOTSUP);
+       return 0;
 }
 
-/* Called with kqhash_lock held */
 static void
-kqueue_hash_remove(
-       struct proc *p,
-       struct kqueue *kq)
+filt_no_detach(__unused struct knote *kn)
 {
-       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-       struct filedesc *fdp = p->p_fd;
-       struct kqlist *list;
+}
 
-       /* should hold the kq hash lock */
-       kqhash_lock_held(p);
+static int __dead2
+filt_bad_event(struct knote *kn, long hint)
+{
+       panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
+}
 
-       if ((kq->kq_state & KQ_DYNAMIC) == 0) {
-               assert(kq->kq_state & KQ_DYNAMIC);
-               return;
-       }
-       assert(kq->kq_state & KQ_WORKLOOP); /* for now */
-       list = &fdp->fd_kqhash[KQ_HASH(kqwl->kqwl_dynamicid, fdp->fd_kqhashmask)];
-       SLIST_REMOVE(list, kqwl, kqworkloop, kqwl_hashlink);
+static int __dead2
+filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
+{
+       panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
 }
 
-/* Called with kqhash_lock held */
-static struct kqueue *
-kqueue_hash_lookup(struct proc *p, kqueue_id_t id)
+static int __dead2
+filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
 {
-       struct filedesc *fdp = p->p_fd;
-       struct kqlist *list;
-       struct kqworkloop *kqwl;
+       panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
+}
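
These stubs back filter slots that must never fire: attach fails with ENOTSUP, detach is a no-op, and the remaining callbacks panic if ever reached. They would be wired into a filter definition roughly as follows (a sketch with an assumed initializer; see xnu's struct filterops for the authoritative field list):

    /* Sketch of how the stubs plug into a filter definition. */
    static const struct filterops bad_filtops = {
            .f_attach  = filt_no_attach,    /* refuse registration: ENOTSUP */
            .f_detach  = filt_no_detach,    /* nothing to tear down */
            .f_event   = filt_bad_event,    /* must never be reached */
            .f_touch   = filt_bad_touch,
            .f_process = filt_bad_process,
    };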
 
-       /* should hold the kq hash lock */
-       kqhash_lock_held(p);
+/*
+ * knotes_dealloc - detach all knotes for the process and drop them
+ *
+ *             Called with proc_fdlock held.
+ *             Returns with it locked.
+ *             May drop it temporarily.
+ *             Process is in such a state that it will not try to allocate
+ *             any more knotes during this process (stopped for exit or exec).
+ */
+void
+knotes_dealloc(proc_t p)
+{
+       struct filedesc *fdp = p->p_fd;
+       struct kqueue *kq;
+       struct knote *kn;
+       struct  klist *kn_hash = NULL;
+       int i;
 
-       if (fdp->fd_kqhashmask == 0) {
-               return NULL;
+       /* Close all the fd-indexed knotes up front */
+       if (fdp->fd_knlistsize > 0) {
+               for (i = 0; i < fdp->fd_knlistsize; i++) {
+                       while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
+                               kq = knote_get_kq(kn);
+                               kqlock(kq);
+                               proc_fdunlock(p);
+                               knote_drop(kq, kn, NULL);
+                               proc_fdlock(p);
+                       }
+               }
+               /* free the table */
+               FREE(fdp->fd_knlist, M_KQUEUE);
+               fdp->fd_knlist = NULL;
        }
+       fdp->fd_knlistsize = 0;
 
-       list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
-       SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
-               if (kqwl->kqwl_dynamicid == id) {
-                       struct kqueue *kq = (struct kqueue *)kqwl;
+       knhash_lock(fdp);
+       proc_fdunlock(p);
 
-                       assert(kq->kq_state & KQ_DYNAMIC);
-                       assert(kq->kq_state & KQ_WORKLOOP); /* for now */
-                       return kq;
+       /* Clean out all the hashed knotes as well */
+       if (fdp->fd_knhashmask != 0) {
+               for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
+                       while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
+                               kq = knote_get_kq(kn);
+                               kqlock(kq);
+                               knhash_unlock(fdp);
+                               knote_drop(kq, kn, NULL);
+                               knhash_lock(fdp);
+                       }
                }
+               kn_hash = fdp->fd_knhash;
+               fdp->fd_knhashmask = 0;
+               fdp->fd_knhash = NULL;
        }
-       return NULL;
-}
 
-static inline void
-kqueue_release_last(struct proc *p, kqueue_t kqu)
-{
-       struct kqueue *kq = kqu.kq;
-       if (kq->kq_state & KQ_DYNAMIC) {
-               kqhash_lock(p);
-               if (kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF)) {
-                       thread_t cur_owner = kqworkloop_invalidate(kqu.kqwl);
-                       kqueue_hash_remove(p, kq);
-                       kqhash_unlock(p);
-                       if (cur_owner) {
-                               thread_deallocate(cur_owner);
-                       }
-                       kqueue_dealloc(kq);
-               } else {
-                       kqhash_unlock(p);
-               }
+       knhash_unlock(fdp);
+
+       /* free the kn_hash table */
+       if (kn_hash) {
+               FREE(kn_hash, M_KQUEUE);
        }
+
+       proc_fdlock(p);
 }
 
 /*
 kqworkloops_dealloc(proc_t p)
 {
        struct filedesc *fdp = p->p_fd;
-       struct kqlist *list;
        struct kqworkloop *kqwl, *kqwln;
-       struct kqlist tofree;
-       int i;
+       struct kqwllist tofree;
 
        if (!(fdp->fd_flags & FD_WORKLOOP)) {
                return;
        }
 
-       SLIST_INIT(&tofree);
+       kqhash_lock(fdp);
+
+       if (fdp->fd_kqhashmask == 0) {
+               kqhash_unlock(fdp);
+               return;
+       }
 
-       kqhash_lock(p);
-       assert(fdp->fd_kqhashmask != 0);
+       LIST_INIT(&tofree);
 
-       for (i = 0; i <= (int)fdp->fd_kqhashmask; i++) {
-               list = &fdp->fd_kqhash[i];
-               SLIST_FOREACH_SAFE(kqwl, list, kqwl_hashlink, kqwln) {
+       for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
+               LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
                        /*
                         * kqworkloops that have scheduling parameters have an
                         * implicit retain from kqueue_workloop_ctl that needs
                         * to be balanced on process exit.
                         */
                        assert(kqwl->kqwl_params);
-                       SLIST_REMOVE(list, kqwl, kqworkloop, kqwl_hashlink);
-                       SLIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
+                       LIST_REMOVE(kqwl, kqwl_hashlink);
+                       LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
                }
        }
 
-       kqhash_unlock(p);
+       kqhash_unlock(fdp);
 
-       SLIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
-               struct kqueue *kq = (struct kqueue *)kqwl;
-               __assert_only bool released;
-               released = kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF);
-               assert(released);
-               kqueue_dealloc(kq);
+       LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
+               kqworkloop_dealloc(kqwl, KQWL_DEALLOC_SKIP_HASH_REMOVE, 1);
        }
 }
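
kqworkloops_dealloc above shows another teardown idiom: unlink every entry onto a private list while holding the hash lock, then run the destructor, which may itself take locks, with no lock held. A generic sketch:

    #include <pthread.h>
    #include <stdlib.h>
    #include <sys/queue.h>

    struct wl { LIST_ENTRY(wl) link; };
    LIST_HEAD(wllist, wl);

    static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Move every entry to a private list under the lock, then free them
     * outside it, since the per-entry teardown may block or take locks. */
    static void
    drain_all(struct wllist *hash_bucket)
    {
            struct wllist tofree = LIST_HEAD_INITIALIZER(tofree);
            struct wl *w, *tmp;

            pthread_mutex_lock(&hash_lock);
            LIST_FOREACH_SAFE(w, hash_bucket, link, tmp) {
                    LIST_REMOVE(w, link);
                    LIST_INSERT_HEAD(&tofree, w, link);
            }
            pthread_mutex_unlock(&hash_lock);

            LIST_FOREACH_SAFE(w, &tofree, link, tmp) {
                    free(w);        /* stand-in for the real destructor */
            }
    }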
 
-static struct kqueue *
-kevent_get_bound_kqworkloop(thread_t thread)
-{
-       struct uthread *ut = get_bsdthread_info(thread);
-       struct kqrequest *kqr = ut->uu_kqr_bound;
-
-       return kqr ? (struct kqueue *)kqr_kqworkloop(kqr) : NULL;
-}
-
 static int
-kevent_get_kq(struct proc *p, kqueue_id_t id, workq_threadreq_param_t *trp,
-    unsigned int flags, struct fileproc **fpp, int *fdp,
-    struct kqueue **kqp)
-{
-       struct filedesc *descp = p->p_fd;
-       struct fileproc *fp = NULL;
-       struct kqueue *kq = NULL;
-       int fd = 0;
-       int error = 0;
-       thread_t th = current_thread();
-
-       assert(!trp || (flags & KEVENT_FLAG_WORKLOOP));
-
-       /* Was the workloop flag passed?  Then it is for sure only a workloop */
-       if (flags & KEVENT_FLAG_DYNAMIC_KQUEUE) {
-               assert(flags & KEVENT_FLAG_WORKLOOP);
-               assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
-               kq = kevent_get_bound_kqworkloop(th);
+kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
+    struct kevent_qos_s *kev)
+{
+       /* We don't care about the priority of a disabled or deleted knote */
+       if (kev->flags & (EV_DISABLE | EV_DELETE)) {
+               return 0;
+       }
 
+       if (kq->kq_state & KQ_WORKLOOP) {
                /*
-                * when kevent_id_internal is called from within the
-                * kernel, and the passed 'id' value is '-1' then we
-                * look for the currently bound workloop kq.
+                * Workloops need valid priorities with a QOS (excluding manager) for
+                * any enabled knote.
+                *
+                * When it is pre-existing, just make sure it has a valid QoS, as
+                * kevent_register() will not use the incoming priority (filters do
+                * have the responsibility to validate it again; see filt_wltouch).
+                *
+                * If the knote is being made, validate the incoming priority.
                 */
-               if (id == (kqueue_id_t)-1 &&
-                   (flags & KEVENT_FLAG_KERNEL) &&
-                   (flags & KEVENT_FLAG_WORKLOOP)) {
-                       if (!is_workqueue_thread(th) || !kq) {
-                               return EINVAL;
-                       }
-
-                       kqueue_retain(kq);
-                       goto out;
-               }
-
-               if (id == 0 || id == (kqueue_id_t)-1) {
-                       return EINVAL;
-               }
-
-               /* try shortcut on kq lookup for bound threads */
-               if (kq != NULL && ((struct kqworkloop *)kq)->kqwl_dynamicid == id) {
-                       if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
-                               return EEXIST;
-                       }
-
-                       /* retain a reference while working with this kq. */
-                       assert(kq->kq_state & KQ_DYNAMIC);
-                       kqueue_retain(kq);
-                       goto out;
+               if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
+                       return ERANGE;
                }
+       }
 
-               /* look for the kq on the hash table */
-               kqhash_lock(p);
-               kq = kqueue_hash_lookup(p, id);
-               if (kq == NULL) {
-                       kqhash_unlock(p);
-
-                       if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST) {
-                               return ENOENT;
-                       }
-
-                       struct kqueue *alloc_kq;
-                       alloc_kq = kqueue_alloc(p, flags);
-                       if (!alloc_kq) {
-                               return ENOMEM;
-                       }
+       return 0;
+}
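
_pthread_priority_thread_qos() extracts the QoS class from a pthread_priority_t, so the ERANGE path above rejects priorities that encode no QoS at all. A toy version of such a check, with an illustrative bit layout (the masks below are assumptions for the example, not libpthread's actual encoding):

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative layout: a one-hot QoS class bit in [8,13] and a
     * relative priority in the low byte (assumed, for the example). */
    #define PRIO_QOS_MASK   0x00003f00u
    #define PRIO_QOS_SHIFT  8

    static inline unsigned
    prio_thread_qos(uint32_t pp)
    {
            uint32_t bits = (pp & PRIO_QOS_MASK) >> PRIO_QOS_SHIFT;
            /* map the one-hot class bit to a small index; 0 means "none" */
            return bits ? (unsigned)__builtin_ctz(bits) + 1 : 0;
    }

    static inline bool
    prio_valid_for_workloop(uint32_t pp)
    {
            return prio_thread_qos(pp) != 0;        /* the ERANGE check */
    }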
 
-                       kqhash_lock(p);
-                       kqueue_hash_init_if_needed(p);
-                       kq = kqueue_hash_lookup(p, id);
-                       if (kq == NULL) {
-                               /* insert our new one */
-                               kq = alloc_kq;
-                               if (trp) {
-                                       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-                                       kqwl->kqwl_params = trp->trp_value;
-                               }
-                               kqueue_hash_insert(p, id, kq);
-                               kqhash_unlock(p);
-                       } else if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
-                               /* lost race and caller wants an error */
-                               kqhash_unlock(p);
-                               kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF);
-                               kqueue_dealloc(alloc_kq);
-                               return EEXIST;
-                       } else {
-                               /* lost race, retain existing workloop */
-                               kqueue_retain(kq);
-                               kqhash_unlock(p);
-                               kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF);
-                               kqueue_dealloc(alloc_kq);
-                       }
-               } else {
-                       if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
-                               kqhash_unlock(p);
-                               return EEXIST;
-                       }
+/*
+ * Prepare a filter for waiting after register.
+ *
+ * The f_post_register_wait hook will be called later by kevent_register()
+ * and should call kevent_register_wait_block().
+ */
+static int
+kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
+{
+       thread_t thread = current_thread();
 
-                       /* retain a reference while working with this kq. */
-                       assert(kq->kq_state & KQ_DYNAMIC);
-                       kqueue_retain(kq);
-                       kqhash_unlock(p);
-               }
-       } else if (flags & KEVENT_FLAG_WORKQ) {
-               /* must already exist for bound threads. */
-               if (flags & KEVENT_FLAG_KERNEL) {
-                       assert(descp->fd_wqkqueue != NULL);
-               }
+       assert(knote_fops(kn)->f_extended_codes);
 
+       if (kn->kn_thread == NULL) {
+               thread_reference(thread);
+               kn->kn_thread = thread;
+       } else if (kn->kn_thread != thread) {
                /*
-                * use the private kq associated with the proc workq.
-                * Just being a thread within the process (and not
-                * being the exit/exec thread) is enough to hold a
-                * reference on this special kq.
+                * kn_thread may be set from a previous aborted wait.
+                * However, it has to be from the same thread.
                 */
-               kq = descp->fd_wqkqueue;
-               if (kq == NULL) {
-                       struct kqueue *alloc_kq = kqueue_alloc(p, KEVENT_FLAG_WORKQ);
-                       if (alloc_kq == NULL) {
-                               return ENOMEM;
-                       }
-
-                       knhash_lock(p);
-                       if (descp->fd_wqkqueue == NULL) {
-                               kq = descp->fd_wqkqueue = alloc_kq;
-                               knhash_unlock(p);
-                       } else {
-                               knhash_unlock(p);
-                               kq = descp->fd_wqkqueue;
-                               kqueue_dealloc(alloc_kq);
-                       }
-               }
-       } else {
-               /* get a usecount for the kq itself */
-               fd = (int)id;
-               if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0) {
-                       return error;
-               }
-       }
-       if ((error = kevent_set_kq_mode(kq, flags)) != 0) {
-               /* drop the usecount */
-               if (fp != NULL) {
-                       fp_drop(p, fd, fp, 0);
-               }
-               return error;
+               kev->flags |= EV_ERROR;
+               kev->data = EXDEV;
+               return 0;
        }
 
-out:
-       *fpp = fp;
-       *fdp = fd;
-       *kqp = kq;
-
-       return error;
+       return FILTER_REGISTER_WAIT | rc;
 }
 
+/*
+ * Cleanup a kevent_register_wait_prepare() effect for threads that have been
+ * aborted instead of properly woken up with thread_wakeup_thread().
+ */
 static void
-kevent_put_kq(
-       struct proc *p,
-       kqueue_id_t id,
-       struct fileproc *fp,
-       struct kqueue *kq)
+kevent_register_wait_cleanup(struct knote *kn)
 {
-       kqueue_release_last(p, kq);
-       if (fp != NULL) {
-               assert((kq->kq_state & KQ_WORKQ) == 0);
-               fp_drop(p, (int)id, fp, 0);
-       }
+       thread_t thread = kn->kn_thread;
+       kn->kn_thread = NULL;
+       thread_deallocate(thread);
 }
 
-static uint64_t
-kevent_workloop_serial_no_copyin(proc_t p, uint64_t workloop_id)
+/*
+ * Must be called at the end of a f_post_register_wait call from a filter.
+ */
+static void
+kevent_register_wait_block(struct turnstile *ts, thread_t thread,
+    thread_continue_t cont, struct _kevent_register *cont_args)
 {
-       uint64_t serial_no = 0;
-       user_addr_t addr;
-       int rc;
-
-       if (workloop_id == 0 || p->p_dispatchqueue_serialno_offset == 0) {
-               return 0;
-       }
-       addr = (user_addr_t)(workloop_id + p->p_dispatchqueue_serialno_offset);
-
-       if (proc_is64bit(p)) {
-               rc = copyin(addr, (caddr_t)&serial_no, sizeof(serial_no));
-       } else {
-               uint32_t serial_no32 = 0;
-               rc = copyin(addr, (caddr_t)&serial_no32, sizeof(serial_no32));
-               serial_no = serial_no32;
-       }
-       return rc == 0 ? serial_no : 0;
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+       kqunlock(cont_args->kqwl);
+       cont_args->handoff_thread = thread;
+       thread_handoff_parameter(thread, cont, cont_args);
 }
 
-int
-kevent_exit_on_workloop_ownership_leak(thread_t thread)
+/*
+ * Called by filters using an f_post_register_wait hook to return from their wait.
+ */
+static void
+kevent_register_wait_return(struct _kevent_register *cont_args)
 {
-       proc_t p = current_proc();
-       struct filedesc *fdp = p->p_fd;
-       kqueue_id_t workloop_id = 0;
-       os_reason_t reason = OS_REASON_NULL;
-       mach_vm_address_t addr;
-       uint32_t reason_size;
+       struct kqworkloop *kqwl = cont_args->kqwl;
+       struct kevent_qos_s *kev = &cont_args->kev;
+       int error = 0;
 
-       kqhash_lock(p);
-       if (fdp->fd_kqhashmask > 0) {
-               for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
-                       struct kqworkloop *kqwl;
-
-                       SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
-                               struct kqueue *kq = &kqwl->kqwl_kqueue;
-                               if ((kq->kq_state & KQ_DYNAMIC) && kqwl->kqwl_owner == thread) {
-                                       workloop_id = kqwl->kqwl_dynamicid;
-                                       break;
-                               }
-                       }
-               }
-       }
-       kqhash_unlock(p);
-
-       reason = os_reason_create(OS_REASON_LIBSYSTEM,
-           OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK);
-       if (reason == OS_REASON_NULL) {
-               goto out;
-       }
-
-       reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
-       reason_size = 2 * sizeof(uint64_t);
-       reason_size = kcdata_estimate_required_buffer_size(2, reason_size);
-       if (os_reason_alloc_buffer(reason, reason_size) != 0) {
-               goto out;
+       if (cont_args->handoff_thread) {
+               thread_deallocate(cont_args->handoff_thread);
        }
 
-       if (workloop_id) {
-               struct kcdata_descriptor *kcd = &reason->osr_kcd_descriptor;
-
-               if (kcdata_get_memory_addr(kcd, EXIT_REASON_WORKLOOP_ID,
-                   sizeof(workloop_id), &addr) == KERN_SUCCESS) {
-                       kcdata_memcpy(kcd, addr, &workloop_id, sizeof(workloop_id));
+       if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
+               if ((kev->flags & EV_ERROR) == 0) {
+                       kev->flags |= EV_ERROR;
+                       kev->data = 0;
                }
-
-               uint64_t serial_no = kevent_workloop_serial_no_copyin(p, workloop_id);
-               if (serial_no && kcdata_get_memory_addr(kcd, EXIT_REASON_DISPATCH_QUEUE_NO,
-                   sizeof(serial_no), &addr) == KERN_SUCCESS) {
-                       kcdata_memcpy(kcd, addr, &serial_no, sizeof(serial_no));
+               error = kevent_modern_copyout(kev, &cont_args->ueventlist);
+               if (error == 0) {
+                       cont_args->eventout++;
                }
        }
-out:
-#if DEVELOPMENT || DEBUG
-       if (kevent_debug_flags() & KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK) {
-               panic("thread %p in task %p is leaked workloop 0x%016llx ownership",
-                   thread, p->task, workloop_id);
+
+       kqworkloop_release(kqwl);
+       if (error == 0) {
+               *(int32_t *)&current_uthread()->uu_rval = cont_args->eventout;
        }
-       psignal_try_thread_with_reason(p, thread, SIGABRT, reason);
-       return 0;
-#else
-       return exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL,
-                  FALSE, FALSE, 0, reason);
-#endif
+       unix_syscall_return(error);
 }
 
-static inline boolean_t
-kevent_args_requesting_events(unsigned int flags, int nevents)
-{
-       return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
-}
+/*
+ * kevent_register - add a new event to a kqueue
+ *
+ *     Creates a mapping between the event source and
+ *     the kqueue via a knote data structure.
+ *
+ *     Because many/most of the event sources are file
+ *     descriptor related, the knote is linked off
+ *     the file descriptor table for quick access.
+ *
+ *     called with nothing locked
+ *     caller holds a reference on the kqueue
+ */
 
-static int
-kevent_internal(struct proc *p,
-    kqueue_id_t id, kqueue_id_t *id_out,
-    user_addr_t changelist, int nchanges,
-    user_addr_t ueventlist, int nevents,
-    user_addr_t data_out, uint64_t data_available,
-    unsigned int flags,
-    user_addr_t utimeout,
-    kqueue_continue_t continuation,
-    int32_t *retval)
+int
+kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
+    struct knote **kn_out)
 {
-       uthread_t ut;
-       struct kqueue *kq;
-       struct fileproc *fp = NULL;
-       int fd = 0;
-       struct kevent_internal_s kev;
-       int error, noutputs, register_rc;
-       bool needs_end_processing = false;
-       struct timeval atv;
-       user_size_t data_size;
-       user_size_t data_resid;
-       thread_t thread = current_thread();
+       struct proc *p = kq->kq_p;
+       const struct filterops *fops;
+       struct knote *kn = NULL;
+       int result = 0, error = 0;
+       unsigned short kev_flags = kev->flags;
        KNOTE_LOCK_CTX(knlc);
 
-       /* Don't allow user-space threads to process output events from the workq kqs */
-       if (((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL)) == KEVENT_FLAG_WORKQ) &&
-           kevent_args_requesting_events(flags, nevents)) {
-               return EINVAL;
-       }
-
-       if (flags & KEVENT_FLAG_PARKING) {
-               if (!kevent_args_requesting_events(flags, nevents) || id != (kqueue_id_t)-1) {
-                       return EINVAL;
-               }
-       }
-
-       /* restrict dynamic kqueue allocation to workloops (for now) */
-       if ((flags & (KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP)) == KEVENT_FLAG_DYNAMIC_KQUEUE) {
-               return EINVAL;
+       if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
+               fops = sysfilt_ops[~kev->filter];       /* to 0-base index */
+       } else {
+               error = EINVAL;
+               goto out;
        }
 
-       if ((flags & (KEVENT_FLAG_WORKLOOP)) && (flags & (KEVENT_FLAG_WORKQ))) {
-               return EINVAL;
+       /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
+       if (__improbable((kev->flags & EV_VANISHED) &&
+           (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {
+               error = EINVAL;
+               goto out;
        }
 
-       if (flags & (KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
-               /* allowed only on workloops when calling kevent_id from user-space */
-               if (!(flags & KEVENT_FLAG_WORKLOOP) || (flags & KEVENT_FLAG_KERNEL) || !(flags & KEVENT_FLAG_DYNAMIC_KQUEUE)) {
-                       return EINVAL;
-               }
+       /* Simplify the flags: EV_DELETE overrules EV_ADD, EV_DISABLE overrules EV_ENABLE */
+       if (kev->flags & EV_DELETE) {
+               kev->flags &= ~EV_ADD;
        }
-
-       /* prepare to deal with stack-wise allocation of out events */
-       if (flags & KEVENT_FLAG_STACK_EVENTS) {
-               int scale = ((flags & KEVENT_FLAG_LEGACY32) ?
-                   (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) :
-                   sizeof(struct user32_kevent)) :
-                   ((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) :
-                   sizeof(struct kevent_qos_s)));
-               ueventlist += nevents * scale;
+       if (kev->flags & EV_DISABLE) {
+               kev->flags &= ~EV_ENABLE;
        }
 
-       /* convert timeout to absolute - if we have one (and not immediate) */
-       error = kevent_get_timeout(p, utimeout, flags, &atv);
-       if (error) {
-               return error;
+       if (kq->kq_state & KQ_WORKLOOP) {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
+                   ((struct kqworkloop *)kq)->kqwl_dynamicid,
+                   kev->udata, kev->flags, kev->filter);
+       } else if (kq->kq_state & KQ_WORKQ) {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
+                   0, kev->udata, kev->flags, kev->filter);
+       } else {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
+                   VM_KERNEL_UNSLIDE_OR_PERM(kq),
+                   kev->udata, kev->flags, kev->filter);
        }
 
-       /* copyin initial value of data residual from data_available */
-       error = kevent_get_data_size(p, data_available, flags, &data_size);
+restart:
+       /* find the matching knote from the fd tables/hashes */
+       kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
+       error = kevent_register_validate_priority(kq, kn, kev);
+       result = 0;
        if (error) {
-               return error;
+               goto out;
        }
 
-       /* get the kq we are going to be working on */
-       error = kevent_get_kq(p, id, NULL, flags, &fp, &fd, &kq);
-#if CONFIG_WORKLOOP_DEBUG
-       ut = (uthread_t)get_bsdthread_info(thread);
-       UU_KEVENT_HISTORY_WRITE_ENTRY(ut, {
-               .uu_kqid = id,
-               .uu_kq = error ? NULL : kq,
-               .uu_error = error,
-               .uu_nchanges = nchanges,
-               .uu_nevents = nevents,
-               .uu_flags = flags,
-       });
-#endif // CONFIG_WORKLOOP_DEBUG
-       if (error) {
-               return error;
-       }
+       if (kn == NULL && (kev->flags & EV_ADD) == 0) {
+               /*
+                * No knote found, EV_ADD wasn't specified
+                */
 
-       /* only bound threads can receive events on workloops */
-       if (flags & KEVENT_FLAG_WORKLOOP) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-               struct kqrequest *kqr = &kqwl->kqwl_request;
+               if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
+                   (kq->kq_state & KQ_WORKLOOP)) {
+                       /*
+                        * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
+                        * that doesn't care about ENOENT, so just pretend the deletion
+                        * happened.
+                        */
+               } else {
+                       error = ENOENT;
+               }
+               goto out;
+       } else if (kn == NULL) {
+               /*
+                * No knote found, need to attach a new one (attach)
+                */
 
-               assert(kq->kq_state & KQ_WORKLOOP);
+               struct fileproc *knote_fp = NULL;
 
-               if (kevent_args_requesting_events(flags, nevents)) {
-                       if (kq != kevent_get_bound_kqworkloop(thread)) {
-                               error = EXDEV;
+               /* grab a file reference for the new knote */
+               if (fops->f_isfd) {
+                       if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) {
                                goto out;
                        }
-
-                       kq_req_lock(kqwl);
-                       /*
-                        * Disable the R2K notification while doing a register, if the
-                        * caller wants events too, we don't want the AST to be set if we
-                        * will process these events soon.
-                        */
-                       kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED;
-                       needs_end_processing = true;
-                       kq_req_unlock(kq);
                }
 
-               if (id_out) {
-                       *id_out = kqwl->kqwl_dynamicid;
+               kn = knote_alloc();
+               if (kn == NULL) {
+                       error = ENOMEM;
+                       if (knote_fp != NULL) {
+                               fp_drop(p, kev->ident, knote_fp, 0);
+                       }
+                       goto out;
                }
-       }
 
-       /* register all the change requests the user provided... */
-       noutputs = 0;
-       while (nchanges > 0 && error == 0) {
-               error = kevent_copyin(&changelist, &kev, p, flags);
-               if (error) {
-                       break;
+               kn->kn_fp = knote_fp;
+               kn->kn_is_fd = fops->f_isfd;
+               kn->kn_kq_packed = (intptr_t)(struct kqueue *)kq;
+               kn->kn_status = 0;
+
+               /* was vanish support requested */
+               if (kev->flags & EV_VANISHED) {
+                       kev->flags &= ~EV_VANISHED;
+                       kn->kn_status |= KN_REQVANISH;
                }
 
-               /* Make sure user doesn't pass in any system flags */
-               kev.flags &= ~EV_SYSFLAGS;
+               /* snapshot matching/dispatching protocol flags into knote */
+               if (kev->flags & EV_DISABLE) {
+                       kn->kn_status |= KN_DISABLED;
+               }
 
-               register_rc = kevent_register(kq, &kev, &knlc);
-               if (register_rc & FILTER_REGISTER_WAIT) {
-                       kqlock_held(kq);
+               /*
+                * copy the kevent state into knote
+                * protocol is that fflags and data
+                * are saved off, and cleared before
+                * calling the attach routine.
+                *
+                * - kn->kn_sfflags aliases with kev->xflags
+                * - kn->kn_sdata   aliases with kev->data
+                * - kn->kn_filtid  is the top 8 bits of kev->filter
+                */
+               kn->kn_kevent  = *(struct kevent_internal_s *)kev;
+               kn->kn_sfflags = kev->fflags;
+               kn->kn_filtid  = (uint8_t)~kev->filter;
+               kn->kn_fflags  = 0;
+               knote_reset_priority(kq, kn, kev->qos);
 
-                       // f_post_register_wait is meant to call a continuation and not to
-                       // return, which is why we don't support FILTER_REGISTER_WAIT if
-                       // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
-                       // waits isn't the last.
-                       //
-                       // It is implementable, but not used by any userspace code at the
-                       // moment, so for now return ENOTSUP if someone tries to do it.
-                       if (nchanges == 1 && nevents >= 1 && (flags & KEVENT_FLAG_ERROR_EVENTS)) {
-                               struct _kevent_register *cont_args;
-                               /* store the continuation/completion data in the uthread */
-                               ut = (uthread_t)get_bsdthread_info(thread);
-                               cont_args = &ut->uu_save.uus_kevent_register;
-                               cont_args->kev = kev;
-                               cont_args->kq = kq;
-                               cont_args->fp = fp;
-                               cont_args->fd = fd;
-                               cont_args->ueventlist = ueventlist;
-                               cont_args->flags = flags;
-                               cont_args->retval = retval;
-                               cont_args->eventcount = nevents;
-                               cont_args->eventout = noutputs;
-                               knote_fops(cont_args->knote)->f_post_register_wait(ut, &knlc, cont_args);
-                               panic("f_post_register_wait returned (kev: %p)", &kev);
+               /* Add the knote for lookup thru the fd table */
+               error = kq_add_knote(kq, kn, &knlc, p);
+               if (error) {
+                       knote_free(kn);
+                       if (knote_fp != NULL) {
+                               fp_drop(p, kev->ident, knote_fp, 0);
                        }
 
-                       kev.flags |= EV_ERROR;
-                       kev.data = ENOTSUP;
-                       knote_unlock(kq, knlc.knlc_knote, &knlc, KNOTE_KQ_UNLOCK);
-               }
-
-               // keep in sync with kevent_register_wait_return()
-               if (nevents > 0 && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
-                       if ((kev.flags & EV_ERROR) == 0) {
-                               kev.flags |= EV_ERROR;
-                               kev.data = 0;
-                       }
-                       error = kevent_copyout(&kev, &ueventlist, p, flags);
-                       if (error == 0) {
-                               nevents--;
-                               noutputs++;
+                       if (error == ERESTART) {
+                               goto restart;
                        }
-               } else if (kev.flags & EV_ERROR) {
-                       error = kev.data;
+                       goto out;
                }
-               nchanges--;
-       }
 
-       /* short-circuit the scan if we only want error events */
-       if (flags & KEVENT_FLAG_ERROR_EVENTS) {
-               nevents = 0;
-       }
-
-       /* process pending events */
-       if (nevents > 0 && noutputs == 0 && error == 0) {
-               struct _kevent *cont_args;
-               /* store the continuation/completion data in the uthread */
-               ut = (uthread_t)get_bsdthread_info(thread);
-               cont_args = &ut->uu_save.uus_kevent;
-               cont_args->fp = fp;
-               cont_args->fd = fd;
-               cont_args->retval = retval;
-               cont_args->eventlist = ueventlist;
-               cont_args->eventcount = nevents;
-               cont_args->eventout = noutputs;
-               cont_args->data_available = data_available;
-               cont_args->process_data.fp_fd = (int)id;
-               cont_args->process_data.fp_flags = flags;
-               cont_args->process_data.fp_data_out = data_out;
-               cont_args->process_data.fp_data_size = data_size;
-               cont_args->process_data.fp_data_resid = data_size;
+               /* fp reference count now applies to knote */
 
                /*
-                * kqworkloop_end_processing() will happen at the end of kqueue_scan()
+                * we can't use filter_call() because f_attach can change the filter ops
+                * for a filter that supports f_extended_codes, so we need to reload
+                * knote_fops() and not use `fops`.
                 */
-               needs_end_processing = false;
+               result = fops->f_attach(kn, kev);
+               if (result && !knote_fops(kn)->f_extended_codes) {
+                       result = FILTER_ACTIVE;
+               }
 
-               error = kqueue_scan(kq, kevent_callback,
-                   continuation, cont_args,
-                   &cont_args->process_data,
-                   &atv, p);
+               kqlock(kq);
 
-               /* process remaining outputs */
-               noutputs = cont_args->eventout;
-               data_resid = cont_args->process_data.fp_data_resid;
+               if (result & FILTER_THREADREQ_NODEFEER) {
+                       enable_preemption();
+               }
 
-               /* copyout residual data size value (if it needs to be copied out) */
-               /* don't abandon other output just because of residual copyout failures */
-               if (error == 0 && data_available && data_resid != data_size) {
-                       (void)kevent_put_data_size(p, data_available, flags, data_resid);
+               if (kn->kn_flags & EV_ERROR) {
+                       /*
+                        * Failed to attach correctly, so drop.
+                        */
+                       kn->kn_filtid = EVFILTID_DETACHED;
+                       error = kn->kn_sdata;
+                       knote_drop(kq, kn, &knlc);
+                       result = 0;
+                       goto out;
                }
-       }
 
-out:
-       if (__improbable(needs_end_processing)) {
                /*
-                * If we didn't through kqworkloop_end_processing(),
-                * we need to do it here.
+                * end "attaching" phase - now just attached
+                *
+                * Mark the thread request overcommit, if apropos
+                *
+                * If the attach routine indicated that an
+                * event is already fired, activate the knote.
+                */
+               if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
+                   (kq->kq_state & KQ_WORKLOOP)) {
+                       kqworkloop_set_overcommit((struct kqworkloop *)kq);
+               }
+       } else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
+               /*
+                * The knote was dropped while we were waiting for the lock,
+                * we need to re-evaluate entirely
                 */
-               kqlock(kq);
-               kqworkloop_end_processing((struct kqworkloop *)kq, 0, 0);
-               kqunlock(kq);
-       }
-       kevent_put_kq(p, id, fp, kq);
-
-       /* don't restart after signals... */
-       if (error == ERESTART) {
-               error = EINTR;
-       } else if (error == EWOULDBLOCK) {
-               error = 0;
-       }
-       if (error == 0) {
-               *retval = noutputs;
-       }
-       return error;
-}
-
 
-/*
- * kevent_callback - callback for each individual event
- *
- * called with nothing locked
- * caller holds a reference on the kqueue
- */
-static int
-kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp,
-    void *data)
-{
-       struct _kevent *cont_args;
-       int error;
-
-       cont_args = (struct _kevent *)data;
-       assert(cont_args->eventout < cont_args->eventcount);
-
-       /*
-        * Copy out the appropriate amount of event data for this user.
-        */
-       error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(),
-           cont_args->process_data.fp_flags);
-
-       /*
-        * If there isn't space for additional events, return
-        * a harmless error to stop the processing here
-        */
-       if (error == 0 && ++cont_args->eventout == cont_args->eventcount) {
-               error = EWOULDBLOCK;
-       }
-       return error;
-}
+               goto restart;
+       } else if (kev->flags & EV_DELETE) {
+               /*
+                * Deletion of a knote (drop)
+                *
+                * If the filter wants to filter drop events, let it do so.
+                *
+                * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
+                * we must wait for the knote to be re-enabled (unless it is being
+                * re-enabled atomically here).
+                */
 
-/*
- * kevent_description - format a description of a kevent for diagnostic output
- *
- * called with a 256-byte string buffer
- */
+               if (knote_fops(kn)->f_allow_drop) {
+                       bool drop;
 
-char *
-kevent_description(struct kevent_internal_s *kevp, char *s, size_t n)
-{
-       snprintf(s, n,
-           "kevent="
-           "{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
-           kevp->ident,
-           kevp->filter,
-           kevp->flags,
-           kevp->udata,
-           kevp->fflags,
-           kevp->data,
-           kevp->ext[0],
-           kevp->ext[1] );
+                       kqunlock(kq);
+                       drop = knote_fops(kn)->f_allow_drop(kn, kev);
+                       kqlock(kq);
 
-       return s;
-}
+                       if (!drop) {
+                               goto out_unlock;
+                       }
+               }
 
-static int
-kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
-    struct kevent_internal_s *kev)
-{
-       /* We don't care about the priority of a disabled or deleted knote */
-       if (kev->flags & (EV_DISABLE | EV_DELETE)) {
-               return 0;
-       }
+               if ((kev->flags & EV_ENABLE) == 0 &&
+                   (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
+                   (kn->kn_status & KN_DISABLED) != 0) {
+                       kn->kn_status |= KN_DEFERDELETE;
+                       error = EINPROGRESS;
+                       goto out_unlock;
+               }
 
-       if (kq->kq_state & KQ_WORKLOOP) {
+               knote_drop(kq, kn, &knlc);
+               goto out;
+       } else {
                /*
-                * Workloops need valid priorities with a QOS (excluding manager) for
-                * any enabled knote.
+                * Regular update of a knote (touch)
                 *
-                * When it is pre-existing, just make sure it has a valid QoS as
-                * kevent_register() will not use the incoming priority (filters who do
-                * have the responsibility to validate it again, see filt_wltouch).
+                * Call touch routine to notify filter of changes in filter values
+                * (and to re-determine if any events are fired).
                 *
-                * If the knote is being made, validate the incoming priority.
+                * If the knote is in defer-delete, avoid calling the filter touch
+                * routine (it has delivered its last event already).
+                *
+                * If the touch routine had no failure,
+                * apply the requested side effects to the knote.
                 */
-               if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
-                       return ERANGE;
+
+               if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
+                       if (kev->flags & EV_ENABLE) {
+                               result = FILTER_ACTIVE;
+                       }
+               } else {
+                       kqunlock(kq);
+                       result = filter_call(knote_fops(kn), f_touch(kn, kev));
+                       kqlock(kq);
+                       if (result & FILTER_THREADREQ_NODEFEER) {
+                               enable_preemption();
+                       }
+               }
+
+               if (kev->flags & EV_ERROR) {
+                       result = 0;
+                       goto out_unlock;
+               }
+
+               if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
+                   kn->kn_udata != kev->udata) {
+                       // this allows klist_copy_udata() not to take locks
+                       os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
+               }
+               if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
+                       kn->kn_status |= KN_DISABLED;
+                       knote_dequeue(kq, kn);
                }
        }
 
-       return 0;
+       /* accept new kevent state */
+       knote_apply_touch(kq, kn, kev, result);
+
+out_unlock:
+       /*
+        * When the filter asked for a post-register wait,
+        * we leave the kqueue locked for kevent_register()
+        * to call the filter's f_post_register_wait hook.
+        */
+       if (result & FILTER_REGISTER_WAIT) {
+               knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
+               *kn_out = kn;
+       } else {
+               knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
+       }
+
+out:
+       /* output local errors through the kevent */
+       if (error) {
+               kev->flags |= EV_ERROR;
+               kev->data = error;
+       }
+       return result;
 }
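
/*
 * Editor's note - a hedged userspace sketch, not part of the diff, showing
 * the register-side semantics kevent_register() implements: EV_RECEIPT
 * turns each change into an EV_ERROR receipt (data == 0 on success), and
 * EV_DELETE of a knote that was never added reports the ENOENT chosen
 * above.  Standard kqueue API only; nothing here is kernel-internal.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        int kq = kqueue();
        struct kevent chg[2], out[2];

        /* EV_ADD with EV_RECEIPT: success comes back as EV_ERROR, data == 0 */
        EV_SET(&chg[0], STDIN_FILENO, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);
        /* EV_DELETE of a knote that does not exist: receipt carries ENOENT */
        EV_SET(&chg[1], STDOUT_FILENO, EVFILT_READ, EV_DELETE | EV_RECEIPT, 0, 0, NULL);

        int n = kevent(kq, chg, 2, out, 2, NULL);
        for (int i = 0; i < n; i++) {
                printf("ident %lu: error %ld\n",
                    (unsigned long)out[i].ident, (long)out[i].data);
        }
        close(kq);
        return 0;
}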
 
 /*
- * Prepare a filter for waiting after register.
+ * knote_process - process a triggered event
  *
- * The f_post_register_wait hook will be called later by kevent_register()
- * and should call kevent_register_wait_block()
+ *     Validate that it is really still a triggered event
+ *     by calling the filter routines (if necessary).  Hold
+ *     a use reference on the knote to avoid it being detached.
+ *
+ *     If it is still considered triggered, we will have taken
+ *     a copy of the state under the filter lock.  We use that
+ *     snapshot to dispatch the knote for future processing (or
+ *     not, if this was a lost event).
+ *
+ *     Our caller assures us that nobody else can be processing
+ *     events from this knote during the whole operation. But
+ *     others can be touching or posting events to the knote
+ *     interspersed with our processing it.
+ *
+ *     caller holds a reference on the kqueue.
+ *     kqueue locked on entry and exit - but may be dropped
  */
 static int
-kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev)
+knote_process(struct knote *kn, kevent_ctx_t kectx,
+    kevent_callback_t callback)
 {
-       thread_t thread = current_thread();
-       struct uthread *uth = get_bsdthread_info(thread);
+       struct kevent_qos_s kev;
+       struct kqueue *kq = knote_get_kq(kn);
+       KNOTE_LOCK_CTX(knlc);
+       int result = FILTER_ACTIVE;
+       int error = 0;
+       bool drop = false;
 
-       assert(knote_fops(kn)->f_extended_codes);
+       /*
+        * Must be active or stayactive
+        * Must be queued and not disabled/suppressed or dropping
+        */
+       assert(kn->kn_status & KN_QUEUED);
+       assert(kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE));
+       assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));
 
-       if (kn->kn_hook == NULL) {
-               thread_reference(thread);
-               kn->kn_hook = thread;
-       } else if (kn->kn_hook != thread) {
+       if (kq->kq_state & KQ_WORKLOOP) {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
+                   ((struct kqworkloop *)kq)->kqwl_dynamicid,
+                   kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
+                   kn->kn_filtid);
+       } else if (kq->kq_state & KQ_WORKQ) {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
+                   0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
+                   kn->kn_filtid);
+       } else {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
+                   VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
+                   kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
+       }
+
+       if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
                /*
-                * kn_hook may be set from a previous aborted wait
-                * However, it has to be from the same thread.
+                * When the knote is dropping or has dropped,
+                * then there's nothing we want to process.
                 */
-               kev->flags |= EV_ERROR;
-               kev->data = EXDEV;
-               return 0;
+               return EJUSTRETURN;
        }
 
-       uth->uu_save.uus_kevent_register.knote = kn;
-       return FILTER_REGISTER_WAIT;
-}
-
-/*
- * Cleanup a kevent_register_wait_prepare() effect for threads that have been
- * aborted instead of properly woken up with thread_wakeup_thread().
- */
-static void
-kevent_register_wait_cleanup(struct knote *kn)
-{
-       thread_t thread = kn->kn_hook;
-       kn->kn_hook = NULL;
-       thread_deallocate(thread);
-}
+       /*
+        * While waiting for the knote lock, we may have dropped the kq lock,
+        * and a touch may have disabled and dequeued the knote.
+        */
+       if (!(kn->kn_status & KN_QUEUED)) {
+               knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
+               return EJUSTRETURN;
+       }
 
-/*
- * Must be called at the end of a f_post_register_wait call from a filter.
- */
-static void
-kevent_register_wait_block(struct turnstile *ts, thread_t thread,
-    struct knote_lock_ctx *knlc, thread_continue_t cont,
-    struct _kevent_register *cont_args)
-{
-       knote_unlock(cont_args->kq, cont_args->knote, knlc, KNOTE_KQ_UNLOCK);
-       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
-       cont_args->handoff_thread = thread;
-       thread_handoff_parameter(thread, cont, cont_args);
-}
+       /*
+        * For deferred-drop or vanished events, we just create a fake
+        * event to acknowledge end-of-life.  Otherwise, we call the
+        * filter's process routine to snapshot the kevent state under
+        * the filter's locking protocol.
+        *
+        * Suppress the knote to avoid returning the same event multiple
+        * times in a single call.
+        */
+       knote_suppress(kq, kn);
 
-/*
- * Called by Filters using a f_post_register_wait to return from their wait.
- */
-static void
-kevent_register_wait_return(struct _kevent_register *cont_args)
-{
-       struct kqueue *kq = cont_args->kq;
-       proc_t p = kq->kq_p;
-       struct kevent_internal_s *kev = &cont_args->kev;
-       int error = 0;
+       if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
+               int kev_flags = EV_DISPATCH2 | EV_ONESHOT;
+               if (kn->kn_status & KN_DEFERDELETE) {
+                       kev_flags |= EV_DELETE;
+               } else {
+                       kev_flags |= EV_VANISHED;
+               }
 
-       if (cont_args->handoff_thread) {
-               thread_deallocate(cont_args->handoff_thread);
+               /* create fake event */
+               kev = (struct kevent_qos_s){
+                       .filter = kn->kn_filter,
+                       .ident  = kn->kn_id,
+                       .flags  = kev_flags,
+                       .udata  = kn->kn_udata,
+               };
+       } else {
+               kqunlock(kq);
+               kev = (struct kevent_qos_s) { };
+               result = filter_call(knote_fops(kn), f_process(kn, &kev));
+               kqlock(kq);
        }
 
-       if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
-               if ((kev->flags & EV_ERROR) == 0) {
-                       kev->flags |= EV_ERROR;
-                       kev->data = 0;
+       /*
+        * Determine how to dispatch the knote for future event handling.
+        * not-fired: just return (do not callout, leave deactivated).
+        * One-shot:  If dispatch2, enter deferred-delete mode (unless this
+        *            is the deferred delete event delivery itself).  Otherwise,
+        *            drop it.
+        * Dispatch:  don't clear state, just mark it disabled.
+        * Cleared:   just leave it deactivated.
+        * Others:    re-activate as there may be more events to handle.
+        *            This will not wake up more handlers right now, but
+        *            at the completion of handling events it may trigger
+        *            more handler threads (TODO: optimize based on more than
+        *            just this one event being detected by the filter).
+        */
+       if ((result & FILTER_ACTIVE) == 0) {
+               if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
+                       /*
+                        * Stay active knotes should not be unsuppressed or we'd create an
+                        * infinite loop.
+                        *
+                        * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
+                        * within f_process() but that doesn't necessarily make them
+                        * ready to process, so we should leave them be.
+                        *
+                        * For other knotes, since we will not return an event,
+                        * there's no point keeping the knote suppressed.
+                        */
+                       knote_unsuppress(kq, kn);
                }
-               error = kevent_copyout(kev, &cont_args->ueventlist, p, cont_args->flags);
-               if (error == 0) {
-                       cont_args->eventout++;
+               knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
+               return EJUSTRETURN;
+       }
+
+       if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
+               knote_adjust_qos(kq, kn, result);
+       }
+       kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);
+
+       if (kev.flags & EV_ONESHOT) {
+               if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
+                   (kn->kn_status & KN_DEFERDELETE) == 0) {
+                       /* defer dropping non-delete oneshot dispatch2 events */
+                       kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
+               } else {
+                       drop = true;
                }
+       } else if (kn->kn_flags & EV_DISPATCH) {
+               /* disable all dispatch knotes */
+               kn->kn_status |= KN_DISABLED;
+       } else if ((kn->kn_flags & EV_CLEAR) == 0) {
+               /* re-activate in case there are more events */
+               knote_activate(kq, kn, FILTER_ACTIVE);
        }
 
-       kevent_put_kq(p, cont_args->fd, cont_args->fp, kq);
-       if (error == 0) {
-               *cont_args->retval = cont_args->eventout;
+       /*
+        * callback to handle each event as we find it.
+        * If we have to detach and drop the knote, do
+        * it while we have the kq unlocked.
+        */
+       if (drop) {
+               knote_drop(kq, kn, &knlc);
+       } else {
+               knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
        }
-       unix_syscall_return(error);
-}
 
-/*
- * kevent_register - add a new event to a kqueue
- *
- *     Creates a mapping between the event source and
- *     the kqueue via a knote data structure.
- *
- *     Because many/most the event sources are file
- *     descriptor related, the knote is linked off
- *     the filedescriptor table for quick access.
- *
- *     called with nothing locked
- *     caller holds a reference on the kqueue
- */
+       if (kev.flags & EV_VANISHED) {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
+                   kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
+                   kn->kn_filtid);
+       }
 
-int
-kevent_register(struct kqueue *kq, struct kevent_internal_s *kev,
-    struct knote_lock_ctx *knlc)
+       error = (callback)(&kev, kectx);
+       kqlock(kq);
+       return error;
+}
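
/*
 * Editor's note - a hedged userspace sketch, not part of the diff, of the
 * dispatch decision knote_process() makes after delivery: EV_ONESHOT
 * knotes are dropped, EV_DISPATCH knotes are disabled until re-enabled,
 * EV_CLEAR knotes stay registered but deactivated, and plain level-
 * triggered knotes are re-activated.  Standard kqueue API only.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        int kq = kqueue(), fds[2];
        struct kevent kev;
        struct timespec nowait = { 0, 0 };

        pipe(fds);
        write(fds[1], "x", 1);          /* make the read end ready */

        EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, NULL);
        kevent(kq, &kev, 1, NULL, 0, NULL);

        /* first scan delivers the event, then the knote is auto-disabled */
        printf("first:  %d\n", kevent(kq, NULL, 0, &kev, 1, &nowait));  /* 1 */
        printf("second: %d\n", kevent(kq, NULL, 0, &kev, 1, &nowait));  /* 0 */

        /* EV_ENABLE re-arms it, mirroring the KN_DISABLED handling above */
        EV_SET(&kev, fds[0], EVFILT_READ, EV_ENABLE, 0, 0, NULL);
        kevent(kq, &kev, 1, NULL, 0, NULL);
        printf("third:  %d\n", kevent(kq, NULL, 0, &kev, 1, &nowait));  /* 1 */
        return 0;
}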
+
+/*
+ * Returns -1 if the kqueue was unbound and processing should not happen
+ */
+#define KQWQAE_BEGIN_PROCESSING 1
+#define KQWQAE_END_PROCESSING   2
+#define KQWQAE_UNBIND           3
+static int
+kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
+    int kevent_flags, int kqwqae_op)
 {
-       struct proc *p = kq->kq_p;
-       const struct filterops *fops;
-       struct knote *kn = NULL;
-       int result = 0, error = 0;
-       unsigned short kev_flags = kev->flags;
+       thread_qos_t old_override = THREAD_QOS_UNSPECIFIED;
+       thread_t thread = kqr_thread_fast(kqr);
+       struct knote *kn;
+       int rc = 0;
+       bool unbind;
+       struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index];
 
-       if (kev->filter < 0) {
-               if (kev->filter + EVFILT_SYSCOUNT < 0) {
-                       error = EINVAL;
-                       goto out;
+       kqlock_held(&kqwq->kqwq_kqueue);
+
+       if (!TAILQ_EMPTY(suppressq)) {
+               /*
+                * Return suppressed knotes to their original state.
+                * For workq kqueues, suppressed ones that are still
+                * truly active (not just forced into the queue) will
+                * set flags we check below to see if anything got
+                * woken up.
+                */
+               while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
+                       assert(kn->kn_status & KN_SUPPRESSED);
+                       knote_unsuppress(kqwq, kn);
                }
-               fops = sysfilt_ops[~kev->filter];       /* to 0-base index */
-       } else {
-               error = EINVAL;
-               goto out;
        }
 
-       /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
-       if ((kev->flags & EV_VANISHED) &&
-           (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) {
-               error = EINVAL;
-               goto out;
-       }
+#if DEBUG || DEVELOPMENT
+       thread_t self = current_thread();
+       struct uthread *ut = get_bsdthread_info(self);
 
-       /* Simplify the flags - delete and disable overrule */
-       if (kev->flags & EV_DELETE) {
-               kev->flags &= ~EV_ADD;
-       }
-       if (kev->flags & EV_DISABLE) {
-               kev->flags &= ~EV_ENABLE;
-       }
+       assert(thread == self);
+       assert(ut->uu_kqr_bound == kqr);
+#endif // DEBUG || DEVELOPMENT
 
-       if (kq->kq_state & KQ_WORKLOOP) {
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
-                   ((struct kqworkloop *)kq)->kqwl_dynamicid,
-                   kev->udata, kev->flags, kev->filter);
-       } else if (kq->kq_state & KQ_WORKQ) {
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
-                   0, kev->udata, kev->flags, kev->filter);
+       if (kqwqae_op == KQWQAE_UNBIND) {
+               unbind = true;
+       } else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
+               unbind = false;
        } else {
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
-                   VM_KERNEL_UNSLIDE_OR_PERM(kq),
-                   kev->udata, kev->flags, kev->filter);
-       }
-
-restart:
-       /* find the matching knote from the fd tables/hashes */
-       kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
-       error = kevent_register_validate_priority(kq, kn, kev);
-       result = 0;
-       if (error) {
-               goto out;
+               unbind = !kqr->tr_kq_wakeup;
        }
-
-       if (kn == NULL && (kev->flags & EV_ADD) == 0) {
+       if (unbind) {
+               old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
+               rc = -1;
                /*
-                * No knote found, EV_ADD wasn't specified
+                * Request a new thread if we didn't process the whole queue, or if
+                * real events have happened (not just stay-active events being put back).
                 */
-
-               if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
-                   (kq->kq_state & KQ_WORKLOOP)) {
-                       /*
-                        * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
-                        * that doesn't care about ENOENT, so just pretend the deletion
-                        * happened.
-                        */
-               } else {
-                       error = ENOENT;
+               if (kqr->tr_kq_wakeup) {
+                       kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
+                           kqr->tr_kq_qos_index, 0);
                }
-               goto out;
-       } else if (kn == NULL) {
+       }
+
+       if (rc == 0) {
                /*
-                * No knote found, need to attach a new one (attach)
+                * Reset wakeup bit to notice events firing while we are processing,
+                * as we cannot rely on the bucket queue emptiness because of stay
+                * active knotes.
                 */
+               kqr->tr_kq_wakeup = false;
+       }
 
-               struct fileproc *knote_fp = NULL;
-
-               /* grab a file reference for the new knote */
-               if (fops->f_isfd) {
-                       if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) {
-                               goto out;
-                       }
-               }
+       if (old_override) {
+               thread_drop_kevent_override(thread);
+       }
 
-               kn = knote_alloc();
-               if (kn == NULL) {
-                       error = ENOMEM;
-                       if (knote_fp != NULL) {
-                               fp_drop(p, kev->ident, knote_fp, 0);
-                       }
-                       goto out;
-               }
+       return rc;
+}
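
/*
 * Editor's note - a hedged sketch, not part of the diff: the unbind
 * decision above in isolation.  The helper name is hypothetical.
 */
static bool
kqwqae_should_unbind(int kqwqae_op, int kevent_flags, bool tr_kq_wakeup)
{
        if (kqwqae_op == KQWQAE_UNBIND) {
                return true;                    /* explicit unbind request */
        }
        if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
                return false;                   /* caller is not parking */
        }
        return !tr_kq_wakeup;                   /* park only if nothing fired */
}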
 
-               kn->kn_fp = knote_fp;
-               kn->kn_kq_packed = (intptr_t)(struct kqueue *)kq;
-               kqueue_retain(kq); /* retain a kq ref */
-               kn->kn_filtid = ~kev->filter;
-               kn->kn_status = KN_ATTACHING | KN_ATTACHED;
+/*
+ * Return 0 to indicate that processing should proceed,
+ * -1 if there is nothing to process.
+ *
+ * Called with kqueue locked and returns the same way,
+ * but may drop lock temporarily.
+ */
+static int
+kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
+    int kevent_flags)
+{
+       int rc = 0;
 
-               /* was vanish support requested */
-               if (kev->flags & EV_VANISHED) {
-                       kev->flags &= ~EV_VANISHED;
-                       kn->kn_status |= KN_REQVANISH;
-               }
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
+           0, kqr->tr_kq_qos_index);
 
-               /* snapshot matching/dispatching protcol flags into knote */
-               if (kev->flags & EV_DISPATCH) {
-                       kn->kn_status |= KN_DISPATCH;
-               }
-               if (kev->flags & EV_UDATA_SPECIFIC) {
-                       kn->kn_status |= KN_UDATA_SPECIFIC;
-               }
-               if (kev->flags & EV_DISABLE) {
-                       kn->kn_status |= KN_DISABLED;
-               }
+       rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
+           KQWQAE_BEGIN_PROCESSING);
 
-               /*
-                * copy the kevent state into knote
-                * protocol is that fflags and data
-                * are saved off, and cleared before
-                * calling the attach routine.
-                */
-               kn->kn_kevent = *kev;
-               kn->kn_sfflags = kev->fflags;
-               kn->kn_sdata = kev->data;
-               kn->kn_fflags = 0;
-               kn->kn_data = 0;
-               knote_reset_priority(kn, kev->qos);
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
+           thread_tid(kqr_thread(kqr)), kqr->tr_kq_wakeup);
 
-               /* Add the knote for lookup thru the fd table */
-               error = kq_add_knote(kq, kn, knlc, p);
-               if (error) {
-                       (void)kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
-                       knote_free(kn);
-                       if (knote_fp != NULL) {
-                               fp_drop(p, kev->ident, knote_fp, 0);
-                       }
+       return rc;
+}
 
-                       if (error == ERESTART) {
-                               goto restart;
-                       }
-                       goto out;
-               }
+static thread_qos_t
+kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
+{
+       kq_index_t qos = THREAD_QOS_UNSPECIFIED;
+       struct knote *kn, *tmp;
 
-               /* fp reference count now applies to knote */
+       kqlock_held(kqwl);
 
+       TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
                /*
-                * we can't use filter_call() because f_attach can change the filter ops
-                * for a filter that supports f_extended_codes, so we need to reload
-                * knote_fops() and not use `fops`.
+                * If a knote that can adjust QoS is disabled because of the automatic
+                * behavior of EV_DISPATCH, the knote should stay suppressed so that
+                * further overrides keep pushing.
                 */
-               result = fops->f_attach(kn, kev);
-               if (result && !knote_fops(kn)->f_extended_codes) {
-                       result = FILTER_ACTIVE;
-               }
-
-               kqlock(kq);
-
-               if (kn->kn_flags & EV_ERROR) {
-                       /*
-                        * Failed to attach correctly, so drop.
-                        */
-                       kn->kn_status &= ~(KN_ATTACHED | KN_ATTACHING);
-                       error = kn->kn_data;
-                       knote_drop(kq, kn, knlc);
-                       result = 0;
-                       goto out;
+               if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
+                   (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
+                   (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
+                       qos = MAX(qos, kn->kn_qos_override);
+                       continue;
                }
+               knote_unsuppress(kqwl, kn);
+       }
 
-               /*
-                * end "attaching" phase - now just attached
-                *
-                * Mark the thread request overcommit, if appropos
-                *
-                * If the attach routine indicated that an
-                * event is already fired, activate the knote.
-                */
-               kn->kn_status &= ~KN_ATTACHING;
-               knote_set_qos_overcommit(kn);
+       return qos;
+}
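
/*
 * Editor's note - a hedged sketch, not part of the diff: the condition that
 * keeps a suppressed knote suppressed above, in isolation.  The helper name
 * is hypothetical.
 */
static bool
kqwl_keep_suppressed(struct knote *kn)
{
        /* a QoS-adjusting knote auto-disabled by EV_DISPATCH, not dropping */
        return knote_fops(kn)->f_adjusts_qos &&
            (kn->kn_status & KN_DISABLED) != 0 &&
            (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
            (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH;
}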
 
-               if (result & FILTER_ACTIVE) {
-                       if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
-                               knote_adjust_qos(kq, kn, result);
-                       }
-                       knote_activate(kn);
-               }
-       } else if (!knote_lock(kq, kn, knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
-               /*
-                * The knote was dropped while we were waiting for the lock,
-                * we need to re-evaluate entirely
-                */
+static int
+kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
+{
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
+       struct kqueue *kq = &kqwl->kqwl_kqueue;
+       thread_qos_t qos_override;
+       thread_t thread = kqr_thread_fast(kqr);
+       int rc = 0, op = KQWL_UTQ_NONE;
 
-               goto restart;
-       } else if (kev->flags & EV_DELETE) {
-               /*
-                * Deletion of a knote (drop)
-                *
-                * If the filter wants to filter drop events, let it do so.
-                *
-                * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
-                * we must wait for the knote to be re-enabled (unless it is being
-                * re-enabled atomically here).
-                */
+       kqlock_held(kq);
 
-               if (knote_fops(kn)->f_allow_drop) {
-                       bool drop;
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
+           kqwl->kqwl_dynamicid, 0, 0);
 
-                       kqunlock(kq);
-                       drop = knote_fops(kn)->f_allow_drop(kn, kev);
-                       kqlock(kq);
+       /* nobody else should still be processing */
+       assert((kq->kq_state & KQ_PROCESSING) == 0);
 
-                       if (!drop) {
-                               goto out_unlock;
-                       }
-               }
+       kq->kq_state |= KQ_PROCESSING;
 
-               if ((kev->flags & EV_ENABLE) == 0 &&
-                   (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) ==
-                   (KN_DISPATCH2 | KN_DISABLED)) {
-                       kn->kn_status |= KN_DEFERDELETE;
-                       error = EINPROGRESS;
-                       goto out_unlock;
-               }
+       if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
+               op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
+       }
 
-               knote_drop(kq, kn, knlc);
-               goto out;
-       } else {
+       if (kevent_flags & KEVENT_FLAG_PARKING) {
                /*
-                * Regular update of a knote (touch)
-                *
-                * Call touch routine to notify filter of changes in filter values
-                * (and to re-determine if any events are fired).
-                *
-                * If the knote is in defer-delete, avoid calling the filter touch
-                * routine (it has delivered its last event already).
+                * When "parking" we want to process events and if no events are found
+                * unbind.
                 *
-                * If the touch routine had no failure,
-                * apply the requested side effects to the knote.
+                * However, non-overcommit threads sometimes park even when they have
+                * more work so that the pool can narrow.  For these, we need to unbind
+                * early, so that calling kqworkloop_update_threads_qos() can ask the
+                * workqueue subsystem whether the thread should park despite having
+                * pending events.
                 */
-
-               if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
-                       if (kev->flags & EV_ENABLE) {
-                               result = FILTER_ACTIVE;
-                       }
+               if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
+                       op = KQWL_UTQ_PARKING;
                } else {
-                       kqunlock(kq);
-                       result = filter_call(knote_fops(kn), f_touch(kn, kev));
-                       kqlock(kq);
+                       op = KQWL_UTQ_UNBINDING;
                }
+       }
+       if (op == KQWL_UTQ_NONE) {
+               goto done;
+       }
 
-               if (kev->flags & EV_ERROR) {
-                       result = 0;
+       qos_override = kqworkloop_acknowledge_events(kqwl);
+
+       if (op == KQWL_UTQ_UNBINDING) {
+               kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_IMMEDIATELY);
+               kqworkloop_release_live(kqwl);
+       }
+       kqworkloop_update_threads_qos(kqwl, op, qos_override);
+       if (op == KQWL_UTQ_PARKING) {
+               if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
+                       /*
+                        * We cannot trust tr_kq_wakeup when looking at stay active knotes.
+                        * We need to process once, and kqworkloop_end_processing will
+                        * handle the unbind.
+                        */
+               } else if (!kqr->tr_kq_wakeup || kqwl->kqwl_owner) {
+                       kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
+                       kqworkloop_release_live(kqwl);
+                       rc = -1;
+               }
+       } else if (op == KQWL_UTQ_UNBINDING) {
+               if (kqr_thread(kqr) == thread) {
+                       /*
+                        * The thread request fired again, passed the admission check and
+                        * got bound to the current thread again.
+                        */
                } else {
-                       /* accept new kevent state */
-                       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
-                               kn->kn_udata = kev->udata;
-                       }
-                       if (kev->flags & EV_DISABLE) {
-                               knote_disable(kn);
-                       }
-                       if (result & (FILTER_UPDATE_REQ_QOS | FILTER_ADJUST_EVENT_QOS_BIT)) {
-                               knote_dequeue(kn);
-                       }
-                       if ((result & FILTER_UPDATE_REQ_QOS) &&
-                           kev->qos && kev->qos != kn->kn_qos) {
-                               knote_reset_priority(kn, kev->qos);
-                       }
-                       if (result & FILTER_ACTIVE) {
-                               thread_qos_t qos;
-                               if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
-                                       if (knote_should_apply_qos_override(kq, kn, result, &qos)) {
-                                               knote_apply_qos_override(kn, qos);
-                                       }
-                               }
-                               knote_activate(kn);
-                       }
-                       if (result & (FILTER_UPDATE_REQ_QOS | FILTER_ADJUST_EVENT_QOS_BIT)) {
-                               if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
-                                       knote_wakeup(kn);
-                               }
-                       }
-                       if (kev->flags & EV_ENABLE) {
-                               knote_enable(kn);
-                       }
+                       rc = -1;
                }
        }
 
-out_unlock:
-       if ((result & FILTER_REGISTER_WAIT) == 0) {
+       if (rc == 0) {
                /*
-                * When the filter asked for a post-register wait,
-                * we leave the knote and kqueue locked for kevent_register()
-                * to call the filter's f_post_register_wait hook.
+                * Reset wakeup bit to notice stay active events firing while we are
+                * processing, as we cannot rely on the stayactive bucket being empty.
                 */
-               knote_unlock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
+               kqwl->kqwl_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
+       } else {
+               kq->kq_state &= ~KQ_PROCESSING;
        }
 
-out:
-       /* output local errors through the kevent */
-       if (error) {
-               kev->flags |= EV_ERROR;
-               kev->data = error;
+       if (rc == -1) {
+               kqworkloop_unbind_delayed_override_drop(thread);
        }
-       return result;
+
+done:
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
+           kqwl->kqwl_dynamicid, 0, 0);
+
+       return rc;
 }
 
 /*
- * knote_process - process a triggered event
- *
- *     Validate that it is really still a triggered event
- *     by calling the filter routines (if necessary).  Hold
- *     a use reference on the knote to avoid it being detached.
- *
- *     If it is still considered triggered, we will have taken
- *     a copy of the state under the filter lock.  We use that
- *     snapshot to dispatch the knote for future processing (or
- *     not, if this was a lost event).
- *
- *     Our caller assures us that nobody else can be processing
- *     events from this knote during the whole operation. But
- *     others can be touching or posting events to the knote
- *     interspersed with our processing it.
+ * Return 0 to indicate that processing should proceed,
+ * -1 if there is nothing to process, or
+ * EBADF if the kqueue is draining.
  *
- *     caller holds a reference on the kqueue.
- *     kqueue locked on entry and exit - but may be dropped
+ * Called with kqueue locked and returns the same way,
+ * but may drop lock temporarily.
+ * May block.
  */
 static int
-knote_process(struct knote *kn,
-    kevent_callback_t callback,
-    void *callback_data,
-    struct filt_process_s *process_data)
+kqfile_begin_processing(struct kqfile *kq)
 {
-       struct kevent_internal_s kev;
-       struct kqueue *kq = knote_get_kq(kn);
-       KNOTE_LOCK_CTX(knlc);
-       int result = FILTER_ACTIVE;
-       int error = 0;
-       bool drop = false;
-
-       bzero(&kev, sizeof(kev));
+       struct kqtailq *suppressq;
 
-       /*
-        * Must be active or stayactive
-        * Must be queued and not disabled/suppressed
-        */
-       assert(kn->kn_status & KN_QUEUED);
-       assert(kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE));
-       assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));
+       kqlock_held(kq);
 
-       if (kq->kq_state & KQ_WORKLOOP) {
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
-                   ((struct kqworkloop *)kq)->kqwl_dynamicid,
-                   kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
-                   kn->kn_filtid);
-       } else if (kq->kq_state & KQ_WORKQ) {
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
-                   0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
-                   kn->kn_filtid);
-       } else {
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
-                   VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
-                   kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
-       }
+       assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
+           VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
 
-       if ((kn->kn_status & KN_DROPPING) ||
-           !knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
-               /*
-                * When the knote is dropping or has dropped,
-                * then there's nothing we want to process.
-                */
-               return EJUSTRETURN;
-       }
+       /* wait to become the exclusive processing thread */
+       for (;;) {
+               if (kq->kqf_state & KQ_DRAIN) {
+                       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
+                           VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
+                       return EBADF;
+               }
 
-       /*
-        * For deferred-drop or vanished events, we just create a fake
-        * event to acknowledge end-of-life.  Otherwise, we call the
-        * filter's process routine to snapshot the kevent state under
-        * the filter's locking protocol.
-        *
-        * suppress knotes to avoid returning the same event multiple times in
-        * a single call.
-        */
-       knote_suppress(kn);
+               if ((kq->kqf_state & KQ_PROCESSING) == 0) {
+                       break;
+               }
 
-       if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
-               /* create fake event */
-               kev.filter = kn->kn_filter;
-               kev.ident = kn->kn_id;
-               kev.flags = (kn->kn_status & KN_DEFERDELETE) ? EV_DELETE : EV_VANISHED;
-               kev.flags |= (EV_DISPATCH2 | EV_ONESHOT);
-               kev.udata = kn->kn_udata;
-       } else {
-               /* deactivate - so new activations indicate a wakeup */
-               knote_deactivate(kn);
+               /* if someone else is processing the queue, wait */
+               kq->kqf_state |= KQ_PROCWAIT;
+               suppressq = &kq->kqf_suppressed;
+               waitq_assert_wait64((struct waitq *)&kq->kqf_wqs,
+                   CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT,
+                   TIMEOUT_WAIT_FOREVER);
 
                kqunlock(kq);
-               result = filter_call(knote_fops(kn), f_process(kn, process_data, &kev));
+               thread_block(THREAD_CONTINUE_NULL);
                kqlock(kq);
        }
 
-       /*
-        * Determine how to dispatch the knote for future event handling.
-        * not-fired: just return (do not callout, leave deactivated).
-        * One-shot:  If dispatch2, enter deferred-delete mode (unless this is
-        *            is the deferred delete event delivery itself).  Otherwise,
-        *            drop it.
-        * Dispatch:  don't clear state, just mark it disabled.
-        * Cleared:   just leave it deactivated.
-        * Others:    re-activate as there may be more events to handle.
-        *            This will not wake up more handlers right now, but
-        *            at the completion of handling events it may trigger
-        *            more handler threads (TODO: optimize based on more than
-        *            just this one event being detected by the filter).
-        */
-       if ((result & FILTER_ACTIVE) == 0) {
-               if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
-                       /*
-                        * Stay active knotes should not be unsuppressed or we'd create an
-                        * infinite loop.
-                        *
-                        * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
-                        * within f_process() but that doesn't necessarily make them
-                        * ready to process, so we should leave them be.
-                        *
-                        * For other knotes, since we will not return an event,
-                        * there's no point keeping the knote suppressed.
-                        */
-                       knote_unsuppress(kn);
-               }
-               knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
-               return EJUSTRETURN;
-       }
+       /* Nobody else processing */
 
-       if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
-               knote_adjust_qos(kq, kn, result);
-       }
-       kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);
+       /* clear pre-posts and KQ_WAKEUP now, in case we bail early */
+       waitq_set_clear_preposts(&kq->kqf_wqs);
+       kq->kqf_state &= ~KQ_WAKEUP;
 
-       if (kev.flags & EV_ONESHOT) {
-               if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) {
-                       /* defer dropping non-delete oneshot dispatch2 events */
-                       kn->kn_status |= KN_DEFERDELETE;
-                       knote_disable(kn);
-               } else {
-                       drop = true;
-               }
-       } else if (kn->kn_status & KN_DISPATCH) {
-               /* disable all dispatch knotes */
-               knote_disable(kn);
-       } else if ((kev.flags & EV_CLEAR) == 0) {
-               /* re-activate in case there are more events */
-               knote_activate(kn);
+       /* anything left to process? */
+       if (TAILQ_EMPTY(&kq->kqf_queue)) {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
+                   VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
+               return -1;
        }
 
-       /*
-        * callback to handle each event as we find it.
-        * If we have to detach and drop the knote, do
-        * it while we have the kq unlocked.
-        */
-       if (drop) {
-               knote_drop(kq, kn, &knlc);
-       } else {
-               knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
-       }
+       /* convert to processing mode */
+       kq->kqf_state |= KQ_PROCESSING;
 
-       if (kev.flags & EV_VANISHED) {
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
-                   kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
-                   kn->kn_filtid);
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
+           VM_KERNEL_UNSLIDE_OR_PERM(kq));
+
+       return 0;
+}
+
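+/*
+ * A minimal sketch of the intended pairing with kqfile_end_processing()
+ * (illustrative caller only; see kqueue_select() below for a real one):
+ *
+ *     kqlock(kq);
+ *     if (kqfile_begin_processing(kq) == 0) {
+ *             <dequeue and deliver queued knotes>
+ *             kqfile_end_processing(kq);
+ *     }
+ *     kqunlock(kq);
+ */
+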
+/*
+ * Try to end the processing; only called when a workq thread is attempting to
+ * park (KEVENT_FLAG_PARKING is set).
+ *
+ * When returning -1, the kqworkq is set up again so that it is ready to be
+ * processed.
+ */
+static int
+kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
+    int kevent_flags)
+{
+       if (!TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index])) {
+               /* remember we didn't process everything */
+               kqr->tr_kq_wakeup = true;
+       }
+
+       if (kevent_flags & KEVENT_FLAG_PARKING) {
+               /*
+                * If acknowledging events "succeeds", it means there are events,
+                * which is a failure condition for end_processing.
+                */
+               int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
+                   KQWQAE_END_PROCESSING);
+               if (rc == 0) {
+                       return -1;
+               }
        }
 
-       error = (callback)(kq, &kev, callback_data);
-       kqlock(kq);
-       return error;
+       return 0;
 }
 
 /*
- * Returns -1 if the kqueue was unbound and processing should not happen
+ * Try to end the processing; only called when a workq thread is attempting to
+ * park (KEVENT_FLAG_PARKING is set).
+ *
+ * When returning -1, the kqworkloop is set up again so that it is ready to be
+ * processed (as if kqworkloop_begin_processing had just been called).
+ *
+ * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
+ * the kqworkloop is unbound from its servicer as a side effect.
  */
-#define KQWQAE_BEGIN_PROCESSING 1
-#define KQWQAE_END_PROCESSING   2
-#define KQWQAE_UNBIND           3
 static int
-kqworkq_acknowledge_events(struct kqworkq *kqwq, struct kqrequest *kqr,
-    int kevent_flags, int kqwqae_op)
+kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
 {
-       thread_qos_t old_override = THREAD_QOS_UNSPECIFIED;
-       thread_t thread = kqr->kqr_thread;
-       struct knote *kn;
+       struct kqueue *kq = &kqwl->kqwl_kqueue;
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
+       thread_qos_t qos_override;
+       thread_t thread = kqr_thread_fast(kqr);
        int rc = 0;
-       bool seen_stayactive = false, unbind;
-
-       kqlock_held(&kqwq->kqwq_kqueue);
-
-       if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
-               /*
-                * Return suppressed knotes to their original state.
-                * For workq kqueues, suppressed ones that are still
-                * truly active (not just forced into the queue) will
-                * set flags we check below to see if anything got
-                * woken up.
-                */
-               while ((kn = TAILQ_FIRST(&kqr->kqr_suppressed)) != NULL) {
-                       assert(kn->kn_status & KN_SUPPRESSED);
-                       knote_unsuppress(kn);
-                       if (kn->kn_status & KN_STAYACTIVE) {
-                               seen_stayactive = true;
-                       }
-               }
-       }
 
-       kq_req_lock(kqwq);
+       kqlock_held(kq);
 
-#if DEBUG || DEVELOPMENT
-       thread_t self = current_thread();
-       struct uthread *ut = get_bsdthread_info(self);
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
+           kqwl->kqwl_dynamicid, 0, 0);
 
-       assert(kqr->kqr_state & KQR_THREQUESTED);
-       assert(kqr->kqr_thread == self);
-       assert(ut->uu_kqr_bound == kqr);
-#endif // DEBUG || DEVELOPMENT
+       if (flags & KQ_PROCESSING) {
+               assert(kq->kq_state & KQ_PROCESSING);
 
-       if (kqwqae_op == KQWQAE_UNBIND) {
-               unbind = true;
-       } else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
-               unbind = false;
-       } else if (kqwqae_op == KQWQAE_BEGIN_PROCESSING && seen_stayactive) {
-               /*
-                * When we unsuppress stayactive knotes, for the kind that are hooked
-                * through select, we need to process once before we can assert there's
-                * no event pending. Hence we can't unbind during BEGIN PROCESSING.
-                */
-               unbind = false;
-       } else {
-               unbind = ((kqr->kqr_state & KQR_WAKEUP) == 0);
-       }
-       if (unbind) {
-               old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
-               rc = -1;
                /*
-                * request a new thread if we didn't process the whole queue or real events
-                * have happened (not just putting stay-active events back).
+                * If we still have queued stayactive knotes, remember we didn't finish
+                * processing all of them.  This should be extremely rare and would
+                * require having a lot of them registered and fired.
                 */
-               if (kqr->kqr_state & KQR_WAKEUP) {
-                       kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
-                           kqr->kqr_qos_index, 0);
+               if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
+                       kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS,
+                           KQWL_BUCKET_STAYACTIVE);
                }
-       }
 
-       if (rc == 0) {
                /*
-                * Reset wakeup bit to notice events firing while we are processing,
-                * as we cannot rely on the bucket queue emptiness because of stay
-                * active knotes.
+                * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while
+                * still under the lock.
+                *
+                * So we do everything kqworkloop_unbind() would do, but because we're
+                * inside kqueue_process(), if the workloop actually received events
+                * while our locks were dropped, we have the opportunity to fail the end
+                * processing and loop again.
+                *
+                * This avoids going through the process-wide workqueue lock hence
+                * scales better.
                 */
-               kqr->kqr_state &= ~KQR_WAKEUP;
+               if (kevent_flags & KEVENT_FLAG_PARKING) {
+                       qos_override = kqworkloop_acknowledge_events(kqwl);
+               }
        }
 
-       kq_req_unlock(kqwq);
+       if (kevent_flags & KEVENT_FLAG_PARKING) {
+               kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
+               if (kqr->tr_kq_wakeup && !kqwl->kqwl_owner) {
+                       /*
+                        * Reset wakeup bit to notice stay active events firing while we are
+                        * processing, as we cannot rely on the stayactive bucket being empty.
+                        */
+                       kqwl->kqwl_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
+                       rc = -1;
+               } else {
+                       kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
+                       kqworkloop_release_live(kqwl);
+                       kq->kq_state &= ~flags;
+               }
+       } else {
+               kq->kq_state &= ~flags;
+               kq->kq_state |= KQ_R2K_ARMED;
+               kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
+       }
 
-       if (old_override) {
-               thread_drop_ipc_override(thread);
+       if ((kevent_flags & KEVENT_FLAG_PARKING) && rc == 0) {
+               kqworkloop_unbind_delayed_override_drop(thread);
        }
 
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
+           kqwl->kqwl_dynamicid, 0, 0);
+
        return rc;
 }
 
 /*
- * Return 0 to indicate that processing should proceed,
- * -1 if there is nothing to process.
+ * Called with kqueue lock held.
  *
- * Called with kqueue locked and returns the same way,
- * but may drop lock temporarily.
+ * 0: no more events
+ * -1: has more events
+ * EBADF: kqueue is in draining mode
  */
 static int
-kqworkq_begin_processing(struct kqworkq *kqwq, struct kqrequest *kqr,
-    int kevent_flags)
+kqfile_end_processing(struct kqfile *kq)
 {
-       int rc = 0;
+       struct kqtailq *suppressq = &kq->kqf_suppressed;
+       struct knote *kn;
+       int procwait;
 
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
-           0, kqr->kqr_qos_index);
+       kqlock_held(kq);
 
-       rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
-           KQWQAE_BEGIN_PROCESSING);
+       assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
 
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
-           thread_tid(kqr->kqr_thread), kqr->kqr_state);
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
+           VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
 
-       return rc;
-}
+       /*
+        * Return suppressed knotes to their original state.
+        */
+       while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
+               assert(kn->kn_status & KN_SUPPRESSED);
+               knote_unsuppress(kq, kn);
+       }
 
-static inline bool
-kqworkloop_is_processing_on_current_thread(struct kqworkloop *kqwl)
-{
-       struct kqueue *kq = &kqwl->kqwl_kqueue;
+       procwait = (kq->kqf_state & KQ_PROCWAIT);
+       kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
 
-       kqlock_held(kq);
+       if (procwait) {
+               /* first wake up any thread already waiting to process */
+               waitq_wakeup64_all((struct waitq *)&kq->kqf_wqs,
+                   CAST_EVENT64_T(suppressq), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
+       }
 
-       if (kq->kq_state & KQ_PROCESSING) {
-               /*
-                * KQ_PROCESSING is unset with the kqlock held, and the kqr thread is
-                * never modified while KQ_PROCESSING is set, meaning that peeking at
-                * its value is safe from this context.
-                */
-               return kqwl->kqwl_request.kqr_thread == current_thread();
+       if (kq->kqf_state & KQ_DRAIN) {
+               return EBADF;
        }
-       return false;
+       return (kq->kqf_state & KQ_WAKEUP) ? -1 : 0;
 }
 
-static thread_qos_t
-kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
+static int
+kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
+    struct kqueue_workloop_params *params, int *retval)
 {
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       kq_index_t qos = THREAD_QOS_UNSPECIFIED;
-       struct knote *kn, *tmp;
+       int error = 0;
+       struct kqworkloop *kqwl;
+       struct filedesc *fdp = p->p_fd;
+       workq_threadreq_param_t trp = { };
+
+       switch (cmd) {
+       case KQ_WORKLOOP_CREATE:
+               if (!params->kqwlp_flags) {
+                       error = EINVAL;
+                       break;
+               }
 
-       kqlock_held(&kqwl->kqwl_kqueue);
+               if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
+                   (params->kqwlp_sched_pri < 1 ||
+                   params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
+                       error = EINVAL;
+                       break;
+               }
 
-       TAILQ_FOREACH_SAFE(kn, &kqr->kqr_suppressed, kn_tqe, tmp) {
-               /*
-                * If a knote that can adjust QoS is disabled because of the automatic
-                * behavior of EV_DISPATCH, the knotes should stay suppressed so that
-                * further overrides keep pushing.
-                */
-               if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
-                   (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
-                   (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
-                       qos = MAX(qos, knote_get_qos_override_index(kn));
-                       continue;
+               if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
+                   invalid_policy(params->kqwlp_sched_pol)) {
+                       error = EINVAL;
+                       break;
+               }
+
+               if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
+                   (params->kqwlp_cpu_percent <= 0 ||
+                   params->kqwlp_cpu_percent > 100 ||
+                   params->kqwlp_cpu_refillms <= 0 ||
+                   params->kqwlp_cpu_refillms > 0x00ffffff)) {
+                       error = EINVAL;
+                       break;
                }
-               knote_unsuppress(kn);
+
+               if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
+                       trp.trp_flags |= TRP_PRIORITY;
+                       trp.trp_pri = params->kqwlp_sched_pri;
+               }
+               if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
+                       trp.trp_flags |= TRP_POLICY;
+                       trp.trp_pol = params->kqwlp_sched_pol;
+               }
+               if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
+                       trp.trp_flags |= TRP_CPUPERCENT;
+                       trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
+                       trp.trp_refillms = params->kqwlp_cpu_refillms;
+               }
+
+               error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
+                   KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
+                   KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
+               if (error) {
+                       break;
+               }
+
+               if (!(fdp->fd_flags & FD_WORKLOOP)) {
+                       /* FD_WORKLOOP indicates we've ever created a workloop
+                        * via this syscall but its only ever added to a process, never
+                        * via this syscall, but it's only ever added to a process, never
+                        */
+                       proc_fdlock(p);
+                       fdp->fd_flags |= FD_WORKLOOP;
+                       proc_fdunlock(p);
+               }
+               break;
+       case KQ_WORKLOOP_DESTROY:
+               error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL,
+                   KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
+                   KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
+               if (error) {
+                       break;
+               }
+               kqlock(kqwl);
+               trp.trp_value = kqwl->kqwl_params;
+               if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
+                       trp.trp_flags |= TRP_RELEASED;
+                       kqworkloop_release_live(kqwl);
+               } else {
+                       error = EINVAL;
+               }
+               kqunlock(kqwl);
+               kqworkloop_release(kqwl);
+               break;
+       }
+       *retval = 0;
+       return error;
+}
+
+int
+kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
+{
+       struct kqueue_workloop_params params = {
+               .kqwlp_id = 0,
+       };
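+
+       /*
+        * The params structure is versioned by its size: the caller stores the
+        * sizeof() it knows in kqwlp_version and passes the same value as sz,
+        * and we only copy in as much as both sides understand.
+        */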
+       if (uap->sz < sizeof(params.kqwlp_version)) {
+               return EINVAL;
+       }
+
+       size_t copyin_sz = MIN(sizeof(params), uap->sz);
+       int rv = copyin(uap->addr, &params, copyin_sz);
+       if (rv) {
+               return rv;
+       }
+
+       if (params.kqwlp_version != (int)uap->sz) {
+               return EINVAL;
        }
 
-       return qos;
+       return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
+                  retval);
 }
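+
+/*
+ * Sketch of a KQ_WORKLOOP_CREATE request that passes the checks above
+ * (illustrative userspace-side values; the exact syscall wrapper shape is
+ * assumed, not defined in this file):
+ *
+ *     struct kqueue_workloop_params p = {
+ *             .kqwlp_version = sizeof(p),           // must equal sz below
+ *             .kqwlp_id = 0x1234,                   // hypothetical identifier
+ *             .kqwlp_flags = KQ_WORKLOOP_CREATE_SCHED_PRI,
+ *             .kqwlp_sched_pri = 31,                // valid range is 1..63
+ *     };
+ *     kqueue_workloop_ctl(KQ_WORKLOOP_CREATE, 0, &p, sizeof(p));
+ */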
 
+/*ARGSUSED*/
 static int
-kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
+kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
+    __unused vfs_context_t ctx)
 {
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       struct kqueue *kq = &kqwl->kqwl_kqueue;
-       thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override;
-       thread_t thread = kqr->kqr_thread;
-       int rc = 0, op = KQWL_UTQ_NONE;
+       struct kqfile *kq = (struct kqfile *)fp->f_data;
+       struct kqtailq *suppressq = &kq->kqf_suppressed;
+       struct kqtailq *queue = &kq->kqf_queue;
+       struct knote *kn;
+       int retnum = 0;
 
-       kqlock_held(kq);
+       if (which != FREAD) {
+               return 0;
+       }
 
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
-           kqwl->kqwl_dynamicid, 0, 0);
+       kqlock(kq);
 
-       /* nobody else should still be processing */
-       assert((kq->kq_state & KQ_PROCESSING) == 0);
+       assert((kq->kqf_state & KQ_WORKQ) == 0);
 
-       kq->kq_state |= KQ_PROCESSING;
+       /*
+        * If this is the first pass, link the wait queue associated with the
+        * kqueue onto the wait queue set for the select().  Normally we
+        * use selrecord() for this, but it uses the wait queue within the
+        * selinfo structure and we need to use the main one for the kqueue to
+        * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
+        * (The select() call will unlink them when it ends).
+        */
+       if (wq_link_id != NULL) {
+               thread_t cur_act = current_thread();
+               struct uthread * ut = get_bsdthread_info(cur_act);
 
-       if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
-               op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
-       }
+               kq->kqf_state |= KQ_SEL;
+               waitq_link((struct waitq *)&kq->kqf_wqs, ut->uu_wqset,
+                   WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
+
+               /* always consume the reserved link object */
+               waitq_link_release(*(uint64_t *)wq_link_id);
+               *(uint64_t *)wq_link_id = 0;
 
-       if (kevent_flags & KEVENT_FLAG_PARKING) {
                /*
-                * When "parking" we want to process events and if no events are found
-                * unbind.
-                *
-                * However, non overcommit threads sometimes park even when they have
-                * more work so that the pool can narrow.  For these, we need to unbind
-                * early, so that calling kqworkloop_update_threads_qos() can ask the
-                * workqueue subsystem whether the thread should park despite having
-                * pending events.
+                * selprocess() is expecting that we send it back the waitq
+                * that was just added to the thread's waitq set. In order
+                * to not change the selrecord() API (which is exported to
+                * kexts), we pass this value back through the
+                * void *wq_link_id pointer we were passed. We need to use
+                * memcpy here because the pointer may not be properly aligned
+                * on 32-bit systems.
                 */
-               if (kqr->kqr_state & KQR_THOVERCOMMIT) {
-                       op = KQWL_UTQ_PARKING;
-               } else {
-                       op = KQWL_UTQ_UNBINDING;
-               }
-       }
-       if (op == KQWL_UTQ_NONE) {
-               goto done;
+               void *wqptr = &kq->kqf_wqs;
+               memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
        }
 
-       qos_override = kqworkloop_acknowledge_events(kqwl);
-
-       kq_req_lock(kqwl);
-
-       if (op == KQWL_UTQ_UNBINDING) {
-               old_override = kqworkloop_unbind_locked(kqwl, thread);
-               (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
+       if (kqfile_begin_processing(kq) == -1) {
+               kqunlock(kq);
+               return 0;
        }
-       kqworkloop_update_threads_qos(kqwl, op, qos_override);
-       if (op == KQWL_UTQ_PARKING) {
-               if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
-                       /*
-                        * We cannot trust KQR_WAKEUP when looking at stay active knotes.
-                        * We need to process once, and kqworkloop_end_processing will
-                        * handle the unbind.
-                        */
-               } else if ((kqr->kqr_state & KQR_WAKEUP) == 0 || kqwl->kqwl_owner) {
-                       old_override = kqworkloop_unbind_locked(kqwl, thread);
-                       (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
-                       rc = -1;
-               }
-       } else if (op == KQWL_UTQ_UNBINDING) {
-               if (kqr->kqr_thread == thread) {
-                       /*
-                        * The thread request fired again, passed the admission check and
-                        * got bound to the current thread again.
-                        */
-               } else {
-                       rc = -1;
+
+       if (!TAILQ_EMPTY(queue)) {
+               /*
+                * There is something queued, but it might be a
+                * KN_STAYACTIVE knote, which may or may not have
+                * any events pending.  So we have to walk
+                * the list of knotes to see, and peek at the
+                * (non-vanished) stay-active ones to be really sure.
+                */
+               while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
+                       if (kn->kn_status & KN_ACTIVE) {
+                               retnum = 1;
+                               goto out;
+                       }
+                       assert(kn->kn_status & KN_STAYACTIVE);
+                       knote_suppress(kq, kn);
                }
-       }
 
-       if (rc == 0) {
                /*
-                * Reset wakeup bit to notice stay active events firing while we are
-                * processing, as we cannot rely on the stayactive bucket emptiness.
+                * There were no regular events on the queue, so take
+                * a deeper look at the stay-queued ones we suppressed.
                 */
-               kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
-       } else {
-               kq->kq_state &= ~KQ_PROCESSING;
-       }
+               while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
+                       KNOTE_LOCK_CTX(knlc);
+                       int result = 0;
 
-       kq_req_unlock(kqwl);
+                       /* If it didn't vanish while suppressed, peek at it */
+                       if ((kn->kn_status & KN_DROPPING) || !knote_lock(kq, kn, &knlc,
+                           KNOTE_KQ_LOCK_ON_FAILURE)) {
+                               continue;
+                       }
 
-       if (old_override) {
-               thread_drop_ipc_override(thread);
-       }
+                       result = filter_call(knote_fops(kn), f_peek(kn));
 
-done:
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
-           kqwl->kqwl_dynamicid, 0, 0);
+                       kqlock(kq);
+                       knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
 
-       return rc;
+                       /* unsuppress it */
+                       knote_unsuppress(kq, kn);
+
+                       /* it has data, or it has to report a vanish */
+                       if (result & FILTER_ACTIVE) {
+                               retnum = 1;
+                               goto out;
+                       }
+               }
+       }
+
+out:
+       kqfile_end_processing(kq);
+       kqunlock(kq);
+       return retnum;
 }
 
 /*
- * Return 0 to indicate that processing should proceed,
- * -1 if there is nothing to process.
- *
- * Called with kqueue locked and returns the same way,
- * but may drop lock temporarily.
- * May block.
+ * kqueue_close -
  */
+/*ARGSUSED*/
 static int
-kqfile_begin_processing(struct kqueue *kq)
+kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
 {
-       struct kqtailq *suppressq;
-
-       kqlock_held(kq);
-
-       assert((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
-           VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
-
-       /* wait to become the exclusive processing thread */
-       for (;;) {
-               if (kq->kq_state & KQ_DRAIN) {
-                       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
-                           VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
-                       return -1;
-               }
-
-               if ((kq->kq_state & KQ_PROCESSING) == 0) {
-                       break;
-               }
-
-               /* if someone else is processing the queue, wait */
-               kq->kq_state |= KQ_PROCWAIT;
-               suppressq = kqueue_get_suppressed_queue(kq, NULL);
-               waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
-                   CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT,
-                   TIMEOUT_WAIT_FOREVER);
-
-               kqunlock(kq);
-               thread_block(THREAD_CONTINUE_NULL);
-               kqlock(kq);
-       }
-
-       /* Nobody else processing */
-
-       /* clear pre-posts and KQ_WAKEUP now, in case we bail early */
-       waitq_set_clear_preposts(&kq->kq_wqs);
-       kq->kq_state &= ~KQ_WAKEUP;
-
-       /* anything left to process? */
-       if (kqueue_queue_empty(kq, QOS_INDEX_KQFILE)) {
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
-                   VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
-               return -1;
-       }
-
-       /* convert to processing mode */
-       kq->kq_state |= KQ_PROCESSING;
-
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
-           VM_KERNEL_UNSLIDE_OR_PERM(kq));
+       struct kqfile *kqf = (struct kqfile *)fg->fg_data;
 
+       assert((kqf->kqf_state & KQ_WORKQ) == 0);
+       kqueue_dealloc(&kqf->kqf_kqueue);
+       fg->fg_data = NULL;
        return 0;
 }
 
 /*
- * Try to end the processing, only called when a workq thread is attempting to
- * park (KEVENT_FLAG_PARKING is set).
- *
- * When returning -1, the kqworkq is setup again so that it is ready to be
- * processed.
+ * Max depth of the nested kq path that can be created.
+ * Note that this has to be less than the maximum value of kq_level
+ * to avoid wrapping around and mislabeling the level.
+ */
+#define MAX_NESTED_KQ 1000
+
+/*ARGSUSED*/
+/*
 * The caller has taken a use-count reference on this kqueue and will donate it
+ * to the kqueue we are being added to.  This keeps the kqueue from closing until
+ * that relationship is torn down.
  */
 static int
-kqworkq_end_processing(struct kqworkq *kqwq, struct kqrequest *kqr,
-    int kevent_flags)
+kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
+    __unused struct kevent_qos_s *kev)
 {
-       if (!kqueue_queue_empty(&kqwq->kqwq_kqueue, kqr->kqr_qos_index)) {
-               /* remember we didn't process everything */
-               kq_req_lock(kqwq);
-               kqr->kqr_state |= KQR_WAKEUP;
-               kq_req_unlock(kqwq);
+       struct kqfile *kqf = (struct kqfile *)fp->f_data;
+       struct kqueue *kq = &kqf->kqf_kqueue;
+       struct kqueue *parentkq = knote_get_kq(kn);
+
+       assert((kqf->kqf_state & KQ_WORKQ) == 0);
+
+       if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
+               knote_set_error(kn, EINVAL);
+               return 0;
        }
 
-       if (kevent_flags & KEVENT_FLAG_PARKING) {
-               /*
-                * if acknowledge events "succeeds" it means there are events,
-                * which is a failure condition for end_processing.
-                */
-               int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
-                   KQWQAE_END_PROCESSING);
-               if (rc == 0) {
-                       return -1;
+       /*
+        * We have to avoid creating a cycle when nesting kqueues
+        * inside another.  Rather than trying to walk the whole
+        * potential DAG of nested kqueues, we just use a simple
+        * ceiling protocol.  When a kqueue is inserted into another,
+        * we check that the (future) parent is not already nested
+        * into another kqueue at a lower level than the potential
+        * child (because it could indicate a cycle).  If that test
+        * passes, we just mark the nesting levels accordingly.
+        *
+        * Only up to MAX_NESTED_KQ can be nested.
+        *
+        * Note: kqworkq and kqworkloop cannot be nested and have reused their
+        *       kq_level field, so ignore these as parent.
+        */
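+
+       /*
+        * Illustrative values: a fresh child (level 0) under a fresh parent
+        * (level 0) leaves the parent at level 2 and the child at level 1; a
+        * child already at level 3 raises a fresh parent to level 4; but a
+        * parent already nested at level 2 refuses a level-3 child, since
+        * 2 < 3 could indicate a cycle.
+        */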
+
+       kqlock(parentkq);
+
+       if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
+               if (parentkq->kq_level > 0 &&
+                   parentkq->kq_level < kq->kq_level) {
+                       kqunlock(parentkq);
+                       knote_set_error(kn, EINVAL);
+                       return 0;
+               }
+
+               /* set parent level appropriately */
+               uint16_t plevel = (parentkq->kq_level == 0) ? 2 : parentkq->kq_level;
+               if (plevel < kq->kq_level + 1) {
+                       if (kq->kq_level + 1 > MAX_NESTED_KQ) {
+                               kqunlock(parentkq);
+                               knote_set_error(kn, EINVAL);
+                               return 0;
+                       }
+                       plevel = kq->kq_level + 1;
                }
+
+               parentkq->kq_level = plevel;
        }
 
-       return 0;
+       kqunlock(parentkq);
+
+       kn->kn_filtid = EVFILTID_KQREAD;
+       kqlock(kq);
+       KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
+       /* indicate nesting in child, if needed */
+       if (kq->kq_level == 0) {
+               kq->kq_level = 1;
+       }
+
+       int count = kq->kq_count;
+       kqunlock(kq);
+       return count > 0;
 }
 
 /*
- * Try to end the processing, only called when a workq thread is attempting to
- * park (KEVENT_FLAG_PARKING is set).
- *
- * When returning -1, the kqworkq is setup again so that it is ready to be
- * processed (as if kqworkloop_begin_processing had just been called).
- *
- * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
- * the kqworkloop is unbound from its servicer as a side effect.
+ * kqueue_drain - called when kq is closed
  */
+/*ARGSUSED*/
 static int
-kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
+kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
 {
-       struct kqueue *kq = &kqwl->kqwl_kqueue;
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override;
-       thread_t thread = kqr->kqr_thread;
-       int rc = 0;
+       struct kqfile *kqf = (struct kqfile *)fp->f_fglob->fg_data;
 
-       kqlock_held(kq);
+       assert((kqf->kqf_state & KQ_WORKQ) == 0);
 
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
-           kqwl->kqwl_dynamicid, 0, 0);
+       kqlock(kqf);
+       kqf->kqf_state |= KQ_DRAIN;
 
-       if (flags & KQ_PROCESSING) {
-               assert(kq->kq_state & KQ_PROCESSING);
+       /* wakeup sleeping threads */
+       if ((kqf->kqf_state & (KQ_SLEEP | KQ_SEL)) != 0) {
+               kqf->kqf_state &= ~(KQ_SLEEP | KQ_SEL);
+               (void)waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs,
+                   KQ_EVENT,
+                   THREAD_RESTART,
+                   WAITQ_ALL_PRIORITIES);
+       }
 
-               /*
-                * If we still have queued stayactive knotes, remember we didn't finish
-                * processing all of them.  This should be extremely rare and would
-                * require to have a lot of them registered and fired.
-                */
-               if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
-                       kq_req_lock(kqwl);
-                       kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS,
-                           KQWL_BUCKET_STAYACTIVE);
-                       kq_req_unlock(kqwl);
-               }
+       /* wakeup threads waiting their turn to process */
+       if (kqf->kqf_state & KQ_PROCWAIT) {
+               assert(kqf->kqf_state & KQ_PROCESSING);
 
-               /*
-                * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while
-                * still under the lock.
-                *
-                * So we do everything kqworkloop_unbind() would do, but because we're
-                * inside kqueue_process(), if the workloop actually received events
-                * while our locks were dropped, we have the opportunity to fail the end
-                * processing and loop again.
-                *
-                * This avoids going through the process-wide workqueue lock hence
-                * scales better.
-                */
-               if (kevent_flags & KEVENT_FLAG_PARKING) {
-                       qos_override = kqworkloop_acknowledge_events(kqwl);
-               }
+               kqf->kqf_state &= ~KQ_PROCWAIT;
+               (void)waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs,
+                   CAST_EVENT64_T(&kqf->kqf_suppressed),
+                   THREAD_RESTART, WAITQ_ALL_PRIORITIES);
        }
 
-       kq_req_lock(kqwl);
+       kqunlock(kqf);
+       return 0;
+}
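+
+/*
+ * Once KQ_DRAIN is set here, kqfile_begin_processing() and
+ * kqfile_end_processing() above both fail with EBADF, so no new processing
+ * pass can start on this kqueue.
+ */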
+
+/*ARGSUSED*/
+int
+kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
+{
+       assert((kq->kq_state & KQ_WORKQ) == 0);
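+
+       /*
+        * st_size reports the number of pending events, and st_blksize the
+        * size of the kevent structure variant this kqueue traffics in, so
+        * callers can tell how large each event record is.
+        */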
 
-       if (kevent_flags & KEVENT_FLAG_PARKING) {
-               kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
-               if ((kqr->kqr_state & KQR_WAKEUP) && !kqwl->kqwl_owner) {
-                       /*
-                        * Reset wakeup bit to notice stay active events firing while we are
-                        * processing, as we cannot rely on the stayactive bucket emptiness.
-                        */
-                       kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
-                       rc = -1;
+       kqlock(kq);
+       if (isstat64 != 0) {
+               struct stat64 *sb64 = (struct stat64 *)ub;
+
+               bzero((void *)sb64, sizeof(*sb64));
+               sb64->st_size = kq->kq_count;
+               if (kq->kq_state & KQ_KEV_QOS) {
+                       sb64->st_blksize = sizeof(struct kevent_qos_s);
+               } else if (kq->kq_state & KQ_KEV64) {
+                       sb64->st_blksize = sizeof(struct kevent64_s);
+               } else if (IS_64BIT_PROCESS(p)) {
+                       sb64->st_blksize = sizeof(struct user64_kevent);
                } else {
-                       old_override = kqworkloop_unbind_locked(kqwl, thread);
-                       (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
-                       kq->kq_state &= ~flags;
+                       sb64->st_blksize = sizeof(struct user32_kevent);
                }
+               sb64->st_mode = S_IFIFO;
        } else {
-               kq->kq_state &= ~flags;
-               kqr->kqr_state |= KQR_R2K_NOTIF_ARMED;
-               kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
-       }
-
-       kq_req_unlock(kqwl);
+               struct stat *sb = (struct stat *)ub;
 
-       if (old_override) {
-               thread_drop_ipc_override(thread);
+               bzero((void *)sb, sizeof(*sb));
+               sb->st_size = kq->kq_count;
+               if (kq->kq_state & KQ_KEV_QOS) {
+                       sb->st_blksize = sizeof(struct kevent_qos_s);
+               } else if (kq->kq_state & KQ_KEV64) {
+                       sb->st_blksize = sizeof(struct kevent64_s);
+               } else if (IS_64BIT_PROCESS(p)) {
+                       sb->st_blksize = sizeof(struct user64_kevent);
+               } else {
+                       sb->st_blksize = sizeof(struct user32_kevent);
+               }
+               sb->st_mode = S_IFIFO;
        }
+       kqunlock(kq);
+       return 0;
+}
 
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
-           kqwl->kqwl_dynamicid, 0, 0);
-
-       return rc;
+static inline bool
+kqueue_threadreq_can_use_ast(struct kqueue *kq)
+{
+       if (current_proc() == kq->kq_p) {
+               /*
+                * Setting an AST from a non-BSD syscall is unsafe: mach_msg_trap() can
+                * do combined send/receive and, in the case of self-IPC, the AST may be
+                * set on a thread that will not return to userspace and needs the
+                * thread the AST would create to unblock itself.
+                *
+                * At this time, we really want to target:
+                *
+                * - kevent variants that can cause thread creations, and dispatch
+                *   really only uses kevent_qos and kevent_id,
+                *
+                * - workq_kernreturn (directly about thread creations)
+                *
+                * - bsdthread_ctl which is used for qos changes and has direct impact
+                *   on the creator thread scheduling decisions.
+                */
+               switch (current_uthread()->syscall_code) {
+               case SYS_kevent_qos:
+               case SYS_kevent_id:
+               case SYS_workq_kernreturn:
+               case SYS_bsdthread_ctl:
+                       return true;
+               }
+       }
+       return false;
 }
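+
+/*
+ * Note: kqueue_threadreq_initiate() and kqueue_threadreq_modify() below both
+ * consult this check before passing WORKQ_THREADREQ_SET_AST_ON_FAILURE down
+ * to the workq subsystem.
+ */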
 
 /*
- * Called with kqueue lock held.
+ * Interact with the pthread kext to request servicing there at a specific QoS
+ * level.
+ *
+ * - Caller holds the workq request lock
+ *
+ * - May be called with the kqueue's wait queue set locked,
+ *   so cannot do anything that could recurse on that.
  */
 static void
-kqfile_end_processing(struct kqueue *kq)
+kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t kqr,
+    kq_index_t qos, int flags)
 {
-       struct knote *kn;
-       struct kqtailq *suppressq;
-       int procwait;
+       assert(kqr->tr_kq_wakeup);
+       assert(kqr_thread(kqr) == THREAD_NULL);
+       assert(!kqr_thread_requested(kqr));
+       struct turnstile *ts = TURNSTILE_NULL;
+
+       if (workq_is_exiting(kq->kq_p)) {
+               return;
+       }
 
        kqlock_held(kq);
 
-       assert((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
+       if (kq->kq_state & KQ_WORKLOOP) {
+               __assert_only struct kqworkloop *kqwl = (struct kqworkloop *)kq;
 
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
-           VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
+               assert(kqwl->kqwl_owner == THREAD_NULL);
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
+                   kqwl->kqwl_dynamicid, 0, qos, kqr->tr_kq_wakeup);
+               ts = kqwl->kqwl_turnstile;
+               /* Add a thread request reference on the kqueue. */
+               kqworkloop_retain(kqwl);
+       } else {
+               assert(kq->kq_state & KQ_WORKQ);
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST),
+                   -1, 0, qos, kqr->tr_kq_wakeup);
+       }
 
        /*
-        * Return suppressed knotes to their original state.
+        * New-style thread request supported.
+        * Provide the pthread kext with a pointer to a workq_threadreq_s structure for
+        * its use until a corresponding kqueue_threadreq_bind callback.
         */
-       suppressq = kqueue_get_suppressed_queue(kq, NULL);
-       while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
-               assert(kn->kn_status & KN_SUPPRESSED);
-               knote_unsuppress(kn);
+       if (kqueue_threadreq_can_use_ast(kq)) {
+               flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
+       }
+       if (qos == KQWQ_QOS_MANAGER) {
+               qos = WORKQ_THREAD_QOS_MANAGER;
+       }
+       if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) {
+               /*
+                * Process is shutting down or exec'ing.
+                * All the kqueues are going to be cleaned up
+                * soon. Forget we even asked for a thread -
+                * and make sure we don't ask for more.
+                */
+               kq->kq_state &= ~KQ_R2K_ARMED;
+               kqueue_release_live(kq);
        }
+}
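+
+/*
+ * Thread request life cycle, as seen from this file (sketch):
+ *
+ *     kqueue_threadreq_initiate()          kq asks workq for a servicer
+ *     -> kqueue_threadreq_bind_prepost()   workq chose a thread, but binding
+ *                                          immediately could invert lock order
+ *     -> kqueue_threadreq_bind_commit()    binding made visible under kqlock
+ *
+ * or kqueue_threadreq_bind() directly, when no lock inversion is possible.
+ */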
 
-       procwait = (kq->kq_state & KQ_PROCWAIT);
-       kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
+/*
+ * kqueue_threadreq_bind_prepost - prepost the bind to kevent
+ *
+ * This is used when kqueue_threadreq_bind may cause a lock inversion.
+ */
+__attribute__((always_inline))
+void
+kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
+    struct uthread *ut)
+{
+       ut->uu_kqr_bound = kqr;
+       kqr->tr_thread = ut->uu_thread;
+       kqr->tr_state = WORKQ_TR_STATE_BINDING;
+}
 
-       if (procwait) {
-               /* first wake up any thread already waiting to process */
-               waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
-                   CAST_EVENT64_T(suppressq),
-                   THREAD_AWAKENED,
-                   WAITQ_ALL_PRIORITIES);
+/*
+ * kqueue_threadreq_bind_commit - commit a bind prepost
+ *
+ * The workq code has to commit any binding prepost before the thread has
+ * a chance to come back to userspace (and do kevent syscalls) or be aborted.
+ */
+void
+kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
+{
+       struct uthread *ut = get_bsdthread_info(thread);
+       workq_threadreq_t kqr = ut->uu_kqr_bound;
+       kqueue_t kqu = kqr_kqueue(p, kqr);
+
+       kqlock(kqu);
+       if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
+               kqueue_threadreq_bind(p, kqr, thread, 0);
        }
+       kqunlock(kqu);
 }
 
-static int
-kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
-    struct kqueue_workloop_params *params, int *retval)
+static void
+kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
+    workq_kern_threadreq_flags_t flags)
 {
-       int error = 0;
-       int fd;
-       struct fileproc *fp;
-       struct kqueue *kq;
-       struct kqworkloop *kqwl;
-       struct filedesc *fdp = p->p_fd;
-       workq_threadreq_param_t trp = { };
+       assert(kqr_thread_requested_pending(kqr));
 
-       switch (cmd) {
-       case KQ_WORKLOOP_CREATE:
-               if (!params->kqwlp_flags) {
-                       error = EINVAL;
-                       break;
-               }
+       kqlock_held(kqu);
 
-               if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
-                   (params->kqwlp_sched_pri < 1 ||
-                   params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
-                       error = EINVAL;
-                       break;
-               }
+       if (kqueue_threadreq_can_use_ast(kqu.kq)) {
+               flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
+       }
+       workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
+}
+
+/*
+ * kqueue_threadreq_bind - bind thread to processing kqrequest
+ *
+ * The provided thread will be responsible for delivering events
+ * associated with the given kqrequest.  Bind it and get ready for
+ * the thread to eventually arrive.
+ */
+void
+kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
+    unsigned int flags)
+{
+       kqueue_t kqu = kqr_kqueue(p, kqr);
+       struct uthread *ut = get_bsdthread_info(thread);
 
-               if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
-                   invalid_policy(params->kqwlp_sched_pol)) {
-                       error = EINVAL;
-                       break;
-               }
+       kqlock_held(kqu);
 
-               if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
-                   (params->kqwlp_cpu_percent <= 0 ||
-                   params->kqwlp_cpu_percent > 100 ||
-                   params->kqwlp_cpu_refillms <= 0 ||
-                   params->kqwlp_cpu_refillms > 0x00ffffff)) {
-                       error = EINVAL;
-                       break;
-               }
+       assert(ut->uu_kqueue_override == 0);
 
-               if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
-                       trp.trp_flags |= TRP_PRIORITY;
-                       trp.trp_pri = params->kqwlp_sched_pri;
-               }
-               if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
-                       trp.trp_flags |= TRP_POLICY;
-                       trp.trp_pol = params->kqwlp_sched_pol;
-               }
-               if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
-                       trp.trp_flags |= TRP_CPUPERCENT;
-                       trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
-                       trp.trp_refillms = params->kqwlp_cpu_refillms;
-               }
+       if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
+               assert(ut->uu_kqr_bound == kqr);
+               assert(kqr->tr_thread == thread);
+       } else {
+               assert(kqr_thread_requested_pending(kqr));
+               assert(kqr->tr_thread == THREAD_NULL);
+               assert(ut->uu_kqr_bound == NULL);
+               ut->uu_kqr_bound = kqr;
+               kqr->tr_thread = thread;
+       }
 
-               error = kevent_get_kq(p, params->kqwlp_id, &trp,
-                   KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
-                   KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &fp, &fd, &kq);
-               if (error) {
-                       break;
-               }
+       kqr->tr_state = WORKQ_TR_STATE_BOUND;
 
-               if (!(fdp->fd_flags & FD_WORKLOOP)) {
-                       /* FD_WORKLOOP indicates we've ever created a workloop
-                        * via this syscall but it's only ever added to a process, never
-                        * removed.
+       if (kqu.kq->kq_state & KQ_WORKLOOP) {
+               struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
+
+               if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
+                       /*
+                        * <rdar://problem/38626999> shows that asserting here is not ok.
+                        *
+                        * This is not supposed to happen for correct use of the interface,
+                        * but it is sadly possible for userspace (with the help of memory
+                        * corruption, such as over-release of a dispatch queue) to make
+                        * the creator thread the "owner" of a workloop.
+                        *
+                        * Once that happens, and that creator thread picks up the same
+                        * workloop as a servicer, we trip this codepath. We need to fixup
+                        * the state to forget about this thread being the owner, as the
+                        * entire workloop state machine expects servicers to never be
+                        * owners and everything would basically go downhill from here.
                         */
-                       proc_fdlock(p);
-                       fdp->fd_flags |= FD_WORKLOOP;
-                       proc_fdunlock(p);
+                       kqu.kqwl->kqwl_owner = THREAD_NULL;
+                       if (kqworkloop_override(kqu.kqwl)) {
+                               thread_drop_kevent_override(thread);
+                       }
                }
-               break;
-       case KQ_WORKLOOP_DESTROY:
-               error = kevent_get_kq(p, params->kqwlp_id, NULL,
-                   KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
-                   KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &fp, &fd, &kq);
-               if (error) {
-                       break;
+
+               if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) {
+                       /*
+                        * Past this point, the interlock is the kq req lock again,
+                        * so we can fix the inheritor for good.
+                        */
+                       filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
+                       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
                }
-               kqlock(kq);
-               kqwl = (struct kqworkloop *)kq;
-               trp.trp_value = kqwl->kqwl_params;
-               if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
-                       trp.trp_flags |= TRP_RELEASED;
-                       kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
-               } else {
-                       error = EINVAL;
+
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
+                   thread_tid(thread), kqr->tr_kq_qos_index,
+                   (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
+
+               ut->uu_kqueue_override = kqr->tr_kq_override_index;
+               if (kqr->tr_kq_override_index) {
+                       thread_add_servicer_override(thread, kqr->tr_kq_override_index);
                }
-               kqunlock(kq);
-               kqueue_release_last(p, kq);
-               break;
+       } else {
+               assert(kqr->tr_kq_override_index == 0);
+
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
+                   thread_tid(thread), kqr->tr_kq_qos_index,
+                   (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
        }
-       *retval = 0;
-       return error;
 }
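
[Editorial aside] kqueue_threadreq_bind() above tolerates two entry states: the request may already be in WORKQ_TR_STATE_BINDING (the workq layer preposted the bind) or still pending. A minimal sketch of that state progression, using a hypothetical struct in place of the real workq_threadreq_s:

#include <assert.h>
#include <stddef.h>

/* Hypothetical, simplified model of the bind states used above. */
enum tr_state { TR_IDLE, TR_NEW, TR_BINDING, TR_BOUND };

struct threadreq {
        enum tr_state state;
        void         *thread;    /* stands in for thread_t */
};

static void
threadreq_bind(struct threadreq *tr, void *thread)
{
        if (tr->state == TR_BINDING) {
                /* prepost already recorded the servicer; just confirm it */
                assert(tr->thread == thread);
        } else {
                /* still-pending request: record the servicer now */
                assert(tr->state == TR_NEW && tr->thread == NULL);
                tr->thread = thread;
        }
        tr->state = TR_BOUND;
}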
 
-int
-kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
+/*
+ * kqueue_threadreq_cancel - abort a pending thread request
+ *
+ * Called when exiting/exec'ing. Forget our pending request.
+ */
+void
+kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
 {
-       struct kqueue_workloop_params params = {
-               .kqwlp_id = 0,
-       };
-       if (uap->sz < sizeof(params.kqwlp_version)) {
-               return EINVAL;
-       }
-
-       size_t copyin_sz = MIN(sizeof(params), uap->sz);
-       int rv = copyin(uap->addr, &params, copyin_sz);
-       if (rv) {
-               return rv;
-       }
+       kqueue_release(kqr_kqueue(p, kqr));
+}
 
-       if (params.kqwlp_version != (int)uap->sz) {
-               return EINVAL;
-       }
+workq_threadreq_param_t
+kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
+{
+       struct kqworkloop *kqwl;
+       workq_threadreq_param_t trp;
 
-       return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
-                  retval);
+       assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
+       kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
+       trp.trp_value = kqwl->kqwl_params;
+       return trp;
 }
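
[Editorial aside] kqueue_threadreq_workloop_param() above recovers the enclosing kqworkloop from a pointer to its embedded kqwl_request via __container_of. For readers unfamiliar with the idiom, a portable sketch (not XNU's macro):

#include <stddef.h>

/* Portable container_of: recover the struct that embeds a member from
 * a pointer to that member. */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct outer {
        int header;
        int request;            /* embedded member, like kqwl_request */
};

static struct outer *
outer_from_request(int *req)
{
        return container_of(req, struct outer, request);
}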
 
 /*
- * kqueue_process - process the triggered events in a kqueue
- *
- *     Walk the queued knotes and validate that they are really still triggered
- *     events by calling the filter routines (if necessary).
+ *     kqueue_threadreq_unbind - unbind thread from processing kqueue
  *
- *     For each event that is still considered triggered, invoke the callback
- *     routine provided.
+ *     End processing the per-QoS bucket of events and allow other threads
+ *     to be requested for future servicing.
  *
  *     caller holds a reference on the kqueue.
- *     kqueue locked on entry and exit - but may be dropped
- *     kqueue list locked (held for duration of call)
  */
-static int
-kqueue_process(struct kqueue *kq,
-    kevent_callback_t callback,
-    void *callback_data,
-    struct filt_process_s *process_data,
-    int *countp)
+void
+kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
 {
-       struct uthread *ut = get_bsdthread_info(current_thread());
-       struct kqrequest *kqr = ut->uu_kqr_bound;
-       struct knote *kn;
-       unsigned int flags = process_data ? process_data->fp_flags : 0;
-       int nevents = 0, error = 0, rc = 0;
-       struct kqtailq *base_queue, *queue;
-       kqueue_t kqu = { .kq = kq };
-#if DEBUG || DEVELOPMENT
-       int retries = 64;
-#endif
-
-       if (kq->kq_state & KQ_WORKQ) {
-               if (kqr == NULL || (kqr->kqr_state & KQR_WORKLOOP)) {
-                       return EJUSTRETURN;
-               }
-               rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               if (ut->uu_kqr_bound != &kqu.kqwl->kqwl_request) {
-                       return EJUSTRETURN;
-               }
-               rc = kqworkloop_begin_processing(kqu.kqwl, flags);
+       if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
+               kqworkloop_unbind(kqr_kqworkloop(kqr));
        } else {
-               rc = kqfile_begin_processing(kq);
+               kqworkq_unbind(p, kqr);
        }
+}
 
-       if (rc == -1) {
-               /* Nothing to process */
-               *countp = 0;
-               return 0;
+/*
+ * If we aren't already busy processing events [for this QoS],
+ * request workq thread support as appropriate.
+ *
+ * TBD - for now, we don't segregate out processing by QoS.
+ *
+ * - May be called with the kqueue's wait queue set locked,
+ *   so cannot do anything that could recurse on that.
+ */
+static void
+kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
+{
+       workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);
+
+       /* convert to thread qos value */
+       assert(qos_index < KQWQ_NBUCKETS);
+
+       if (!kqr->tr_kq_wakeup) {
+               kqr->tr_kq_wakeup = true;
+               if (!kqr_thread_requested(kqr)) {
+                       kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
+               }
        }
+}
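
[Editorial aside] kqworkq_wakeup() above is edge-triggered: tr_kq_wakeup latches that the bucket has pending events, and only the first wakeup after an idle period pays for a thread request. A minimal sketch of the pattern, assuming the caller holds the relevant lock (all names hypothetical):

#include <stdbool.h>

/* Hypothetical bucket state mirroring tr_kq_wakeup / the request state. */
struct bucket {
        bool wakeup;      /* latched: events pending in this bucket   */
        bool requested;   /* a servicer thread has already been asked */
};

/* Called with the lock held: only the first wakeup after an idle
 * period initiates a thread request; later wakeups coalesce. */
static void
bucket_wakeup(struct bucket *b)
{
        if (!b->wakeup) {
                b->wakeup = true;
                if (!b->requested) {
                        b->requested = true;  /* stands in for kqueue_threadreq_initiate() */
                }
        }
}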
 
-       /*
-        * loop through the enqueued knotes associated with this request,
-        * processing each one. Each request may have several queues
-        * of knotes to process (depending on the type of kqueue) so we
-        * have to loop through all the queues as long as we have additional
-        * space.
-        */
+/*
+ * This represents the asynchronous QoS a given workloop contributes,
+ * hence is the max of the current active knotes (override index)
+ * and the workloop max qos (userspace async qos).
+ */
+static kq_index_t
+kqworkloop_override(struct kqworkloop *kqwl)
+{
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
+       return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
+}
 
-process_again:
-       if (kq->kq_state & KQ_WORKQ) {
-               base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->kqr_qos_index];
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               base_queue = &kqu.kqwl->kqwl_queue[0];
-               queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
-       } else {
-               base_queue = queue = &kq->kq_queue[QOS_INDEX_KQFILE];
+static inline void
+kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
+{
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
+
+       kqlock_held(kqwl);
+
+       if (kqwl->kqwl_state & KQ_R2K_ARMED) {
+               kqwl->kqwl_state &= ~KQ_R2K_ARMED;
+               act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
        }
+}
 
-       do {
-               while (error == 0 && (kn = TAILQ_FIRST(queue)) != NULL) {
-                       error = knote_process(kn, callback, callback_data, process_data);
-                       if (error == EJUSTRETURN) {
-                               error = 0;
-                       } else {
-                               nevents++;
-                       }
-                       /* error is EWOULDBLOCK when the out event array is full */
-               }
+static void
+kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
+{
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
+       struct kqueue *kq = &kqwl->kqwl_kqueue;
+       kq_index_t old_override = kqworkloop_override(kqwl);
+       kq_index_t i;
 
-               if (error == EWOULDBLOCK) {
-                       /* break out if no more space for additional events */
-                       error = 0;
+       kqlock_held(kqwl);
+
+       switch (op) {
+       case KQWL_UTQ_UPDATE_WAKEUP_QOS:
+               if (qos == KQWL_BUCKET_STAYACTIVE) {
+                       /*
+                        * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket; we only remember
+                        * a high watermark (kqwl_stayactive_qos) of any stay active knote
+                        * that was ever registered with this workloop.
+                        *
+                        * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active
+                        * knote, we use this high-watermark as a wakeup-index, and also set
+                        * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember
+                        * there is at least one stay active knote fired until the next full
+                        * processing of this bucket.
+                        */
+                       kqwl->kqwl_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT;
+                       qos = kqwl->kqwl_stayactive_qos;
+                       assert(qos);
+               }
+               if (kqwl->kqwl_wakeup_indexes & (1 << qos)) {
+                       assert(kqr->tr_kq_wakeup);
                        break;
                }
-       } while (queue-- > base_queue);
 
-       *countp = nevents;
+               kqwl->kqwl_wakeup_indexes |= (1 << qos);
+               kqr->tr_kq_wakeup = true;
+               kqworkloop_request_fire_r2k_notification(kqwl);
+               goto recompute;
 
-       /*
-        * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
-        * we want to unbind the kqrequest from the thread.
-        *
-        * However, because the kq locks are dropped several times during process,
-        * new knotes may have fired again, in which case, we want to fail the end
-        * processing and process again, until it converges.
-        *
-        * If we returned events however, end processing never fails.
-        */
-       if (error || nevents) {
-               flags &= ~KEVENT_FLAG_PARKING;
-       }
-       if (kq->kq_state & KQ_WORKQ) {
-               rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
-       } else {
-               kqfile_end_processing(kq);
-               rc = 0;
-       }
-       if (rc == -1) {
-               assert(flags & KEVENT_FLAG_PARKING);
-#if DEBUG || DEVELOPMENT
-               if (retries-- == 0) {
-                       panic("kevent: way too many knote_process retries, kq: %p (0x%02x)",
-                           kq, kq->kq_state);
+       case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
+               assert(qos);
+               if (kqwl->kqwl_stayactive_qos < qos) {
+                       kqwl->kqwl_stayactive_qos = qos;
+                       if (kqwl->kqwl_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) {
+                               assert(kqr->tr_kq_wakeup);
+                               kqwl->kqwl_wakeup_indexes |= (1 << qos);
+                               goto recompute;
+                       }
+               }
+               break;
+
+       case KQWL_UTQ_PARKING:
+       case KQWL_UTQ_UNBINDING:
+               kqr->tr_kq_override_index = qos;
+       /* FALLTHROUGH */
+       case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
+               if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
+                       assert(qos == THREAD_QOS_UNSPECIFIED);
+               }
+               i = KQWL_BUCKET_STAYACTIVE;
+               if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
+                       kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
+               }
+               if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) &&
+                   (kqwl->kqwl_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) {
+                       /*
+                        * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
+                        * knote may have fired, so we need to merge in kqwl_stayactive_qos.
+                        *
+                        * Unlike other buckets, this one is never empty but could be idle.
+                        */
+                       kqwl->kqwl_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT;
+                       kqwl->kqwl_wakeup_indexes |= (1 << kqwl->kqwl_stayactive_qos);
+               } else {
+                       kqwl->kqwl_wakeup_indexes = 0;
                }
-#endif
-               goto process_again;
-       }
-       return error;
-}
-
-static void
-kqueue_scan_continue(void *data, wait_result_t wait_result)
-{
-       thread_t self = current_thread();
-       uthread_t ut = (uthread_t)get_bsdthread_info(self);
-       struct _kqueue_scan * cont_args = &ut->uu_save.uus_kqueue_scan;
-       struct kqueue *kq = (struct kqueue *)data;
-       struct filt_process_s *process_data = cont_args->process_data;
-       int error;
-       int count;
-
-       /* convert the (previous) wait_result to a proper error */
-       switch (wait_result) {
-       case THREAD_AWAKENED: {
-               kqlock(kq);
-retry:
-               error = kqueue_process(kq, cont_args->call, cont_args->data,
-                   process_data, &count);
-               if (error == 0 && count == 0) {
-                       if (kq->kq_state & KQ_DRAIN) {
-                               kqunlock(kq);
-                               goto drain;
+               for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) {
+                       if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) {
+                               kqwl->kqwl_wakeup_indexes |= (1 << i);
                        }
+               }
+               if (kqwl->kqwl_wakeup_indexes) {
+                       kqr->tr_kq_wakeup = true;
+                       kqworkloop_request_fire_r2k_notification(kqwl);
+               } else {
+                       kqr->tr_kq_wakeup = false;
+               }
+               goto recompute;
 
-                       if (kq->kq_state & KQ_WAKEUP) {
-                               goto retry;
-                       }
+       case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
+               kqr->tr_kq_override_index = qos;
+               goto recompute;
 
-                       waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
-                           KQ_EVENT, THREAD_ABORTSAFE,
-                           cont_args->deadline);
-                       kq->kq_state |= KQ_SLEEP;
-                       kqunlock(kq);
-                       thread_block_parameter(kqueue_scan_continue, kq);
-                       /* NOTREACHED */
+       case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
+recompute:
+               /*
+                * When modifying the wakeup QoS or the override QoS, we always need to
+                * maintain our invariant that kqr_override_index is at least as large
+                * as the highest QoS for which an event is fired.
+                *
+                * However, this override index can be larger when there is an overridden
+                * suppressed knote pushing on the kqueue.
+                */
+               if (kqwl->kqwl_wakeup_indexes > (1 << qos)) {
+                       qos = fls(kqwl->kqwl_wakeup_indexes) - 1; /* fls is 1-based */
+               }
+               if (kqr->tr_kq_override_index < qos) {
+                       kqr->tr_kq_override_index = qos;
                }
-               kqunlock(kq);
-       } break;
-       case THREAD_TIMED_OUT:
-               error = EWOULDBLOCK;
                break;
-       case THREAD_INTERRUPTED:
-               error = EINTR;
+
+       case KQWL_UTQ_REDRIVE_EVENTS:
                break;
-       case THREAD_RESTART:
-drain:
-               error = EBADF;
+
+       case KQWL_UTQ_SET_QOS_INDEX:
+               kqr->tr_kq_qos_index = qos;
                break;
+
        default:
-               panic("%s: - invalid wait_result (%d)", __func__,
-                   wait_result);
-               error = 0;
+               panic("unknown kqwl thread qos update operation: %d", op);
        }
 
-       /* call the continuation with the results */
-       assert(cont_args->cont != NULL);
-       (cont_args->cont)(kq, cont_args->data, error);
-}
-
-
-/*
- * kqueue_scan - scan and wait for events in a kqueue
- *
- *     Process the triggered events in a kqueue.
- *
- *     If there are no events triggered arrange to
- *     wait for them. If the caller provided a
- *     continuation routine, then kevent_scan will
- *     also.
- *
- *     The callback routine must be valid.
- *     The caller must hold a use-count reference on the kq.
- */
-int
-kqueue_scan(struct kqueue *kq,
-    kevent_callback_t callback,
-    kqueue_continue_t continuation,
-    void *callback_data,
-    struct filt_process_s *process_data,
-    struct timeval *atvp,
-    __unused struct proc *p)
-{
-       thread_continue_t cont = THREAD_CONTINUE_NULL;
-       unsigned int flags;
-       uint64_t deadline;
-       int error;
-       int first;
-       int fd;
-
-       assert(callback != NULL);
+       thread_t kqwl_owner = kqwl->kqwl_owner;
+       thread_t servicer = kqr_thread(kqr);
+       boolean_t qos_changed = FALSE;
+       kq_index_t new_override = kqworkloop_override(kqwl);
 
        /*
-        * Determine which QoS index we are servicing
+        * Apply the diffs to the owner if applicable
         */
-       flags = (process_data) ? process_data->fp_flags : 0;
-       fd = (process_data) ? process_data->fp_fd : -1;
-
-       first = 1;
-       for (;;) {
-               wait_result_t wait_result;
-               int count;
+       if (kqwl_owner) {
+#if 0
+               /* JMM - need new trace hooks for owner overrides */
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
+                   kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
+                   (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
+#endif
+               if (new_override == old_override) {
+                       // nothing to do
+               } else if (old_override == THREAD_QOS_UNSPECIFIED) {
+                       thread_add_kevent_override(kqwl_owner, new_override);
+               } else if (new_override == THREAD_QOS_UNSPECIFIED) {
+                       thread_drop_kevent_override(kqwl_owner);
+               } else { /*  old_override != new_override */
+                       thread_update_kevent_override(kqwl_owner, new_override);
+               }
+       }
 
+       /*
+        * apply the diffs to the servicer
+        */
+       if (!kqr_thread_requested(kqr)) {
                /*
-                * Make a pass through the kq to find events already
-                * triggered.
+                * No servicer, nor thread-request
+                *
+                * Make a new thread request, unless there is an owner (or the workloop
+                * is suspended in userland) or if there is no asynchronous work in the
+                * first place.
                 */
-               kqlock(kq);
-               error = kqueue_process(kq, callback, callback_data,
-                   process_data, &count);
-               if (error || count) {
-                       break; /* lock still held */
-               }
-               /* looks like we have to consider blocking */
-               if (first) {
-                       first = 0;
-                       /* convert the timeout to a deadline once */
-                       if (atvp->tv_sec || atvp->tv_usec) {
-                               uint64_t now;
-
-                               clock_get_uptime(&now);
-                               nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
-                                   atvp->tv_usec * (long)NSEC_PER_USEC,
-                                   &deadline);
-                               if (now >= deadline) {
-                                       /* non-blocking call */
-                                       error = EWOULDBLOCK;
-                                       break; /* lock still held */
-                               }
-                               deadline -= now;
-                               clock_absolutetime_interval_to_deadline(deadline, &deadline);
-                       } else {
-                               deadline = 0;   /* block forever */
-                       }
 
-                       if (continuation) {
-                               uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
-                               struct _kqueue_scan *cont_args = &ut->uu_save.uus_kqueue_scan;
-
-                               cont_args->call = callback;
-                               cont_args->cont = continuation;
-                               cont_args->deadline = deadline;
-                               cont_args->data = callback_data;
-                               cont_args->process_data = process_data;
-                               cont = kqueue_scan_continue;
+               if (kqwl_owner == NULL && kqr->tr_kq_wakeup) {
+                       int initiate_flags = 0;
+                       if (op == KQWL_UTQ_UNBINDING) {
+                               initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
                        }
+                       kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
                }
-
-               if (kq->kq_state & KQ_DRAIN) {
-                       kqunlock(kq);
-                       return EBADF;
-               }
-
-               /* If awakened during processing, try again */
-               if (kq->kq_state & KQ_WAKEUP) {
-                       kqunlock(kq);
-                       continue;
+       } else if (servicer) {
+               /*
+                * Servicer in flight
+                *
+                * Just apply the diff to the servicer
+                */
+               struct uthread *ut = get_bsdthread_info(servicer);
+               if (ut->uu_kqueue_override != new_override) {
+                       if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
+                               thread_add_servicer_override(servicer, new_override);
+                       } else if (new_override == THREAD_QOS_UNSPECIFIED) {
+                               thread_drop_servicer_override(servicer);
+                       } else { /* ut->uu_kqueue_override != new_override */
+                               thread_update_servicer_override(servicer, new_override);
+                       }
+                       ut->uu_kqueue_override = new_override;
+                       qos_changed = TRUE;
                }
+       } else if (new_override == THREAD_QOS_UNSPECIFIED) {
+               /*
+                * No events to deliver anymore.
+                *
+                * However canceling with turnstiles is challenging, so the fact that
+                * the request isn't useful will be discovered by the servicer itself
+                * later on.
+                */
+       } else if (old_override != new_override) {
+               /*
+                * Request is in flight
+                *
+                * Apply the diff to the thread request
+                */
+               kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
+               qos_changed = TRUE;
+       }
 
-               /* go ahead and wait */
-               waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
-                   KQ_EVENT, THREAD_ABORTSAFE,
-                   TIMEOUT_URGENCY_USER_NORMAL,
-                   deadline, TIMEOUT_NO_LEEWAY);
-               kq->kq_state |= KQ_SLEEP;
-               kqunlock(kq);
-               wait_result = thread_block_parameter(cont, kq);
-               /* NOTREACHED if (continuation != NULL) */
-
-               switch (wait_result) {
-               case THREAD_AWAKENED:
-                       continue;
-               case THREAD_TIMED_OUT:
-                       return EWOULDBLOCK;
-               case THREAD_INTERRUPTED:
-                       return EINTR;
-               case THREAD_RESTART:
-                       return EBADF;
-               default:
-                       panic("%s: - bad wait_result (%d)", __func__,
-                           wait_result);
-                       error = 0;
-               }
+       if (qos_changed) {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
+                   thread_tid(servicer), kqr->tr_kq_qos_index,
+                   (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
        }
-       kqunlock(kq);
-       return error;
 }
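
[Editorial aside] In the recompute path above, kqwl_wakeup_indexes is a bitmask with one bit per QoS bucket, and fls() (1-based "find last set", from <strings.h> on BSD/Darwin) yields the highest pending bucket. A worked sketch:

#include <strings.h>    /* fls(): find last set bit, 1-based (BSD/Darwin) */

/* Events pending at QoS 2 and QoS 5 give a mask of
 * (1 << 2) | (1 << 5) == 0x24; fls(0x24) == 6, so the highest
 * pending bucket, and hence the override QoS, is 6 - 1 == 5. */
static int
highest_pending_qos(unsigned int wakeup_indexes)
{
        return wakeup_indexes ? fls((int)wakeup_indexes) - 1 : 0;
}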
 
-
-/*
- * XXX
- * This could be expanded to call kqueue_scan, if desired.
- */
-/*ARGSUSED*/
-static int
-kqueue_read(__unused struct fileproc *fp,
-    __unused struct uio *uio,
-    __unused int flags,
-    __unused vfs_context_t ctx)
+static void
+kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
 {
-       return ENXIO;
-}
+       if ((kqwl->kqwl_state & KQ_PROCESSING) &&
+           kqr_thread(&kqwl->kqwl_request) == current_thread()) {
+               /*
+                * kqworkloop_end_processing() will perform the required QoS
+                * computations when it unsets the processing mode.
+                */
+               return;
+       }
 
-/*ARGSUSED*/
-static int
-kqueue_write(__unused struct fileproc *fp,
-    __unused struct uio *uio,
-    __unused int flags,
-    __unused vfs_context_t ctx)
-{
-       return ENXIO;
+       kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
 }
 
-/*ARGSUSED*/
-static int
-kqueue_ioctl(__unused struct fileproc *fp,
-    __unused u_long com,
-    __unused caddr_t data,
-    __unused vfs_context_t ctx)
+static struct kqtailq *
+kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
 {
-       return ENOTTY;
+       if (kq.kq->kq_state & KQ_WORKLOOP) {
+               return &kq.kqwl->kqwl_suppressed;
+       } else if (kq.kq->kq_state & KQ_WORKQ) {
+               return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index];
+       } else {
+               return &kq.kqf->kqf_suppressed;
+       }
 }
 
-/*ARGSUSED*/
-static int
-kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
-    __unused vfs_context_t ctx)
+struct turnstile *
+kqueue_alloc_turnstile(kqueue_t kqu)
 {
-       struct kqueue *kq = (struct kqueue *)fp->f_data;
-       struct kqtailq *queue;
-       struct kqtailq *suppressq;
-       struct knote *kn;
-       int retnum = 0;
+       struct kqworkloop *kqwl = kqu.kqwl;
+       kq_state_t kq_state;
 
-       if (which != FREAD) {
-               return 0;
+       kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
+       if (kq_state & KQ_HAS_TURNSTILE) {
+               /* force a dependency to pair with the atomic or with release below */
+               return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
+                          (uintptr_t)kq_state);
        }
 
-       kqlock(kq);
+       if (!(kq_state & KQ_WORKLOOP)) {
+               return TURNSTILE_NULL;
+       }
 
-       assert((kq->kq_state & KQ_WORKQ) == 0);
+       struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
+       bool workq_locked = false;
 
-       /*
-        * If this is the first pass, link the wait queue associated with the
-        * the kqueue onto the wait queue set for the select().  Normally we
-        * use selrecord() for this, but it uses the wait queue within the
-        * selinfo structure and we need to use the main one for the kqueue to
-        * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
-        * (The select() call will unlink them when it ends).
-        */
-       if (wq_link_id != NULL) {
-               thread_t cur_act = current_thread();
-               struct uthread * ut = get_bsdthread_info(cur_act);
+       kqlock(kqu);
 
-               kq->kq_state |= KQ_SEL;
-               waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset,
-                   WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
+       if (filt_wlturnstile_interlock_is_workq(kqwl)) {
+               workq_locked = true;
+               workq_kern_threadreq_lock(kqwl->kqwl_p);
+       }
 
-               /* always consume the reserved link object */
-               waitq_link_release(*(uint64_t *)wq_link_id);
-               *(uint64_t *)wq_link_id = 0;
+       if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
+               free_ts = ts;
+               ts = kqwl->kqwl_turnstile;
+       } else {
+               ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
+                   ts, TURNSTILE_WORKLOOPS);
 
-               /*
-                * selprocess() is expecting that we send it back the waitq
-                * that was just added to the thread's waitq set. In order
-                * to not change the selrecord() API (which is exported to
-                * kexts), we pass this value back through the
-                * void *wq_link_id pointer we were passed. We need to use
-                * memcpy here because the pointer may not be properly aligned
-                * on 32-bit systems.
-                */
-               void *wqptr = &kq->kq_wqs;
-               memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
+               /* release-barrier to pair with the unlocked load of kqwl_turnstile above */
+               os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);
+
+               if (filt_wlturnstile_interlock_is_workq(kqwl)) {
+                       workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
+                           &kqwl->kqwl_request, kqwl->kqwl_owner,
+                           ts, TURNSTILE_IMMEDIATE_UPDATE);
+                       /*
+                        * The workq may no longer be the interlock after this,
+                        * in which case the inheritor wasn't updated.
+                        */
+               }
+               if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
+                       filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
+               }
+       }
+
+       if (workq_locked) {
+               workq_kern_threadreq_unlock(kqwl->kqwl_p);
+       }
+
+       kqunlock(kqu);
+
+       if (free_ts) {
+               turnstile_deallocate(free_ts);
+       } else {
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+       }
+       return ts;
+}
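
[Editorial aside] kqueue_alloc_turnstile() above is an allocate-then-publish, double-checked pattern: an unlocked dependency-ordered load on the fast path, then, under the lock, either adopt the already-published turnstile and free the spare, or publish the new one behind a release barrier. A portable C11 sketch of the same shape, assuming stdatomic/pthread/malloc in place of os_atomic, kqlock, and turnstile_alloc (and using the published pointer itself rather than a separate KQ_HAS_TURNSTILE state bit):

#include <stdatomic.h>
#include <stdlib.h>
#include <pthread.h>

struct thing { int payload; };

static _Atomic(struct thing *) g_thing;
static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;

static struct thing *
thing_get(void)
{
        /* fast path: acquire pairs with the release store below */
        struct thing *t = atomic_load_explicit(&g_thing, memory_order_acquire);
        if (t) {
                return t;
        }

        struct thing *fresh = calloc(1, sizeof(*fresh));   /* error handling elided */

        pthread_mutex_lock(&g_lock);
        t = atomic_load_explicit(&g_thing, memory_order_relaxed);
        if (t) {
                /* somebody beat us to it: adopt theirs, free the spare */
                free(fresh);
        } else {
                t = fresh;
                /* publish: release pairs with the acquire load above */
                atomic_store_explicit(&g_thing, t, memory_order_release);
        }
        pthread_mutex_unlock(&g_lock);
        return t;
}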
+
+__attribute__((always_inline))
+struct turnstile *
+kqueue_turnstile(kqueue_t kqu)
+{
+       kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
+       if (kq_state & KQ_WORKLOOP) {
+               return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
+       }
+       return TURNSTILE_NULL;
+}
+
+__attribute__((always_inline))
+struct turnstile *
+kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
+{
+       struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
+       if (kqwl) {
+               return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
        }
+       return TURNSTILE_NULL;
+}
+
+static void
+kqworkloop_set_overcommit(struct kqworkloop *kqwl)
+{
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
 
-       if (kqfile_begin_processing(kq) == -1) {
-               kqunlock(kq);
-               return 0;
+       /*
+        * This test is racy, but since we never remove this bit,
+        * it allows us to avoid taking a lock.
+        */
+       if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
+               return;
        }
 
-       queue = &kq->kq_queue[QOS_INDEX_KQFILE];
-       if (!TAILQ_EMPTY(queue)) {
-               /*
-                * there is something queued - but it might be a
-                * KN_STAYACTIVE knote, which may or may not have
-                * any events pending.  Otherwise, we have to walk
-                * the list of knotes to see, and peek at the
-                * (non-vanished) stay-active ones to be really sure.
-                */
-               while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
-                       if (kn->kn_status & KN_ACTIVE) {
-                               retnum = 1;
-                               goto out;
-                       }
-                       assert(kn->kn_status & KN_STAYACTIVE);
-                       knote_suppress(kn);
-               }
+       kqlock_held(kqwl);
 
-               /*
-                * There were no regular events on the queue, so take
-                * a deeper look at the stay-queued ones we suppressed.
-                */
-               suppressq = kqueue_get_suppressed_queue(kq, NULL);
-               while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
-                       KNOTE_LOCK_CTX(knlc);
-                       int result = 0;
+       if (kqr_thread_requested_pending(kqr)) {
+               kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
+                   WORKQ_THREADREQ_MAKE_OVERCOMMIT);
+       } else {
+               kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
+       }
+}
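
[Editorial aside] The unlocked test in kqworkloop_set_overcommit() is safe only because WORKQ_TR_FLAG_OVERCOMMIT is sticky: a stale read can only produce a false "not set", and the slow path sets the bit idempotently. A small C11 sketch of that pattern (the real code performs the set under the kqlock rather than with an atomic or):

#include <stdatomic.h>

static atomic_uint g_flags;
#define FLAG_STICKY 0x1u

static void
set_sticky(void)
{
        /* Unlocked peek: safe only because the bit is never cleared.
         * A stale read merely falls through to the idempotent set. */
        if (atomic_load_explicit(&g_flags, memory_order_relaxed) & FLAG_STICKY) {
                return;
        }
        atomic_fetch_or_explicit(&g_flags, FLAG_STICKY, memory_order_relaxed);
}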
 
-                       /* If didn't vanish while suppressed - peek at it */
-                       if ((kn->kn_status & KN_DROPPING) || !knote_lock(kq, kn, &knlc,
-                           KNOTE_KQ_LOCK_ON_FAILURE)) {
-                               continue;
-                       }
+static void
+kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
+    kq_index_t override_index)
+{
+       workq_threadreq_t kqr;
+       kq_index_t old_override_index;
+       kq_index_t queue_index = kn->kn_qos_index;
 
-                       result = filter_call(knote_fops(kn), f_peek(kn));
+       if (override_index <= queue_index) {
+               return;
+       }
 
-                       kqlock(kq);
-                       knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
+       kqr = kqworkq_get_request(kqwq, queue_index);
 
-                       /* unsuppress it */
-                       knote_unsuppress(kn);
+       kqlock_held(kqwq);
 
-                       /* has data or it has to report a vanish */
-                       if (result & FILTER_ACTIVE) {
-                               retnum = 1;
-                               goto out;
+       old_override_index = kqr->tr_kq_override_index;
+       if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
+               thread_t servicer = kqr_thread(kqr);
+               kqr->tr_kq_override_index = override_index;
+
+               /* apply the override to [incoming?] servicing thread */
+               if (servicer) {
+                       if (old_override_index) {
+                               thread_update_kevent_override(servicer, override_index);
+                       } else {
+                               thread_add_kevent_override(servicer, override_index);
                        }
                }
        }
-
-out:
-       kqfile_end_processing(kq);
-       kqunlock(kq);
-       return retnum;
 }
 
-/*
- * kqueue_close -
- */
-/*ARGSUSED*/
-static int
-kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
+static void
+kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
 {
-       struct kqfile *kqf = (struct kqfile *)fg->fg_data;
-
-       assert((kqf->kqf_state & KQ_WORKQ) == 0);
-       kqueue_dealloc(&kqf->kqf_kqueue);
-       fg->fg_data = NULL;
-       return 0;
+       if (kqu.kq->kq_state & KQ_WORKLOOP) {
+               kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
+                   qos);
+       } else {
+               kqworkq_update_override(kqu.kqwq, kn, qos);
+       }
 }
 
-/*
- * Max depth of the nested kq path that can be created.
- * Note that this has to be less than the size of kq_level
- * to avoid wrapping around and mislabeling the level.
- */
-#define MAX_NESTED_KQ 1000
-
-/*ARGSUSED*/
-/*
- * The callers has taken a use-count reference on this kqueue and will donate it
- * to the kqueue we are being added to.  This keeps the kqueue from closing until
- * that relationship is torn down.
- */
-static int
-kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn,
-    __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx)
+static void
+kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
+    enum kqwl_unbind_locked_mode how)
 {
-       struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
-       struct kqueue *kq = &kqf->kqf_kqueue;
-       struct kqueue *parentkq = knote_get_kq(kn);
-       uint16_t plevel = 0;
-
-       assert((kqf->kqf_state & KQ_WORKQ) == 0);
+       struct uthread *ut = get_bsdthread_info(thread);
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
 
-       if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
-               knote_set_error(kn, EINVAL);
-               return 0;
-       }
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
+           thread_tid(thread), 0, 0);
 
-       /*
-        * We have to avoid creating a cycle when nesting kqueues
-        * inside another.  Rather than trying to walk the whole
-        * potential DAG of nested kqueues, we just use a simple
-        * ceiling protocol.  When a kqueue is inserted into another,
-        * we check that the (future) parent is not already nested
-        * into another kqueue at a lower level than the potential
-        * child (because it could indicate a cycle).  If that test
-        * passes, we just mark the nesting levels accordingly.
-        *
-        * Only up to MAX_NESTED_KQ can be nested.
-        */
+       kqlock_held(kqwl);
 
-       kqlock(parentkq);
-       if (parentkq->kq_level > 0 &&
-           parentkq->kq_level < kq->kq_level) {
-               kqunlock(parentkq);
-               knote_set_error(kn, EINVAL);
-               return 0;
-       } else {
-               /* set parent level appropriately */
-               plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
-               if (plevel < kq->kq_level + 1) {
-                       if (kq->kq_level + 1 > MAX_NESTED_KQ) {
-                               kqunlock(parentkq);
-                               knote_set_error(kn, EINVAL);
-                               return 0;
-                       }
-                       plevel = kq->kq_level + 1;
-               }
+       assert(ut->uu_kqr_bound == kqr);
+       ut->uu_kqr_bound = NULL;
+       if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
+           ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
+               thread_drop_servicer_override(thread);
+               ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
+       }
 
-               parentkq->kq_level = plevel;
-               kqunlock(parentkq);
+       if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
+               turnstile_update_inheritor(kqwl->kqwl_turnstile,
+                   TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
+               turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
+                   TURNSTILE_INTERLOCK_HELD);
+       }
 
-               kn->kn_filtid = EVFILTID_KQREAD;
-               kqlock(kq);
-               KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
-               /* indicate nesting in child, if needed */
-               if (kq->kq_level == 0) {
-                       kq->kq_level = 1;
-               }
+       kqr->tr_thread = THREAD_NULL;
+       kqr->tr_state = WORKQ_TR_STATE_IDLE;
+       kqwl->kqwl_state &= ~KQ_R2K_ARMED;
+}
 
-               int count = kq->kq_count;
-               kqunlock(kq);
-               return count > 0;
+static void
+kqworkloop_unbind_delayed_override_drop(thread_t thread)
+{
+       struct uthread *ut = get_bsdthread_info(thread);
+       assert(ut->uu_kqr_bound == NULL);
+       if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
+               thread_drop_servicer_override(thread);
+               ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
        }
 }
 
 /*
- * kqueue_drain - called when kq is closed
+ *     kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
+ *
+ *     It will acknowledge events, and possibly request a new thread if:
+ *     - there were active events left
+ *     - we pended waitq hook callouts during processing
+ *     - we pended wakeups while processing (or unsuppressing)
+ *
+ *     Called with kqueue lock held.
  */
-/*ARGSUSED*/
-static int
-kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
+static void
+kqworkloop_unbind(struct kqworkloop *kqwl)
 {
-       struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;
+       struct kqueue *kq = &kqwl->kqwl_kqueue;
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
+       thread_t thread = kqr_thread_fast(kqr);
+       int op = KQWL_UTQ_PARKING;
+       kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;
 
-       assert((kq->kq_state & KQ_WORKQ) == 0);
+       assert(thread == current_thread());
 
-       kqlock(kq);
-       kq->kq_state |= KQ_DRAIN;
-       kqueue_interrupt(kq);
-       kqunlock(kq);
-       return 0;
-}
+       kqlock(kqwl);
 
-/*ARGSUSED*/
-int
-kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
-{
-       assert((kq->kq_state & KQ_WORKQ) == 0);
+       /*
+        * Forcing the KQ_PROCESSING flag causes QoS updates generated by
+        * unsuppressing knotes to be deferred until the eventual call to
+        * kqworkloop_update_threads_qos() below.
+        */
+       assert((kq->kq_state & KQ_PROCESSING) == 0);
+       if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
+               kq->kq_state |= KQ_PROCESSING;
+               qos_override = kqworkloop_acknowledge_events(kqwl);
+               kq->kq_state &= ~KQ_PROCESSING;
+       }
 
-       kqlock(kq);
-       if (isstat64 != 0) {
-               struct stat64 *sb64 = (struct stat64 *)ub;
+       kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
+       kqworkloop_update_threads_qos(kqwl, op, qos_override);
 
-               bzero((void *)sb64, sizeof(*sb64));
-               sb64->st_size = kq->kq_count;
-               if (kq->kq_state & KQ_KEV_QOS) {
-                       sb64->st_blksize = sizeof(struct kevent_qos_s);
-               } else if (kq->kq_state & KQ_KEV64) {
-                       sb64->st_blksize = sizeof(struct kevent64_s);
-               } else if (IS_64BIT_PROCESS(p)) {
-                       sb64->st_blksize = sizeof(struct user64_kevent);
-               } else {
-                       sb64->st_blksize = sizeof(struct user32_kevent);
-               }
-               sb64->st_mode = S_IFIFO;
-       } else {
-               struct stat *sb = (struct stat *)ub;
+       kqunlock(kqwl);
 
-               bzero((void *)sb, sizeof(*sb));
-               sb->st_size = kq->kq_count;
-               if (kq->kq_state & KQ_KEV_QOS) {
-                       sb->st_blksize = sizeof(struct kevent_qos_s);
-               } else if (kq->kq_state & KQ_KEV64) {
-                       sb->st_blksize = sizeof(struct kevent64_s);
-               } else if (IS_64BIT_PROCESS(p)) {
-                       sb->st_blksize = sizeof(struct user64_kevent);
-               } else {
-                       sb->st_blksize = sizeof(struct user32_kevent);
-               }
-               sb->st_mode = S_IFIFO;
-       }
-       kqunlock(kq);
-       return 0;
+       /*
+        * Drop the override on the current thread last, after the call to
+        * kqworkloop_update_threads_qos above.
+        */
+       kqworkloop_unbind_delayed_override_drop(thread);
+
+       /* If last reference, dealloc the workloop kq */
+       kqworkloop_release(kqwl);
 }
 
-static inline bool
-kqueue_threadreq_can_use_ast(struct kqueue *kq)
-{
-       if (current_proc() == kq->kq_p) {
-               /*
-                * Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can
-                * do combined send/receive and in the case of self-IPC, the AST may be
-                * set on a thread that will not return to userspace and needs the
-                * thread the AST would create to unblock itself.
-                *
-                * At this time, we really want to target:
-                *
-                * - kevent variants that can cause thread creations, and dispatch
-                *   really only uses kevent_qos and kevent_id,
-                *
-                * - workq_kernreturn (directly about thread creations)
-                *
-                * - bsdthread_ctl which is used for qos changes and has direct impact
-                *   on the creator thread scheduling decisions.
-                */
-               switch (current_uthread()->syscall_code) {
-               case SYS_kevent_qos:
-               case SYS_kevent_id:
-               case SYS_workq_kernreturn:
-               case SYS_bsdthread_ctl:
-                       return true;
-               }
-       }
-       return false;
+static thread_qos_t
+kqworkq_unbind_locked(struct kqworkq *kqwq,
+    workq_threadreq_t kqr, thread_t thread)
+{
+       struct uthread *ut = get_bsdthread_info(thread);
+       kq_index_t old_override = kqr->tr_kq_override_index;
+
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
+           thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);
+
+       kqlock_held(kqwq);
+
+       assert(ut->uu_kqr_bound == kqr);
+       ut->uu_kqr_bound = NULL;
+       kqr->tr_thread = THREAD_NULL;
+       kqr->tr_state = WORKQ_TR_STATE_IDLE;
+       kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
+       kqwq->kqwq_state &= ~KQ_R2K_ARMED;
+
+       return old_override;
 }
 
 /*
- * Interact with the pthread kext to request a servicing there at a specific QoS
- * level.
- *
- * - Caller holds the workq request lock
+ *     kqworkq_unbind - unbind of a workq kqueue from a thread
  *
- * - May be called with the kqueue's wait queue set locked,
- *   so cannot do anything that could recurse on that.
+ *     We may have to request new threads.
+ *     This can happen when there are no waiting processing threads and:
+ *     - there were active events we never got to (count > 0)
+ *     - we pended waitq hook callouts during processing
+ *     - we pended wakeups while processing (or unsuppressing)
  */
 static void
-kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr,
-    kq_index_t qos, int flags)
+kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
 {
-       assert(kqr->kqr_state & KQR_WAKEUP);
-       assert(kqr->kqr_thread == THREAD_NULL);
-       assert((kqr->kqr_state & KQR_THREQUESTED) == 0);
-       struct turnstile *ts = TURNSTILE_NULL;
-
-       if (workq_is_exiting(kq->kq_p)) {
-               return;
-       }
+       struct kqworkq *kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
+       __assert_only int rc;
 
-       /* Add a thread request reference on the kqueue. */
-       kqueue_retain(kq);
+       kqlock(kqwq);
+       rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
+       assert(rc == -1);
+       kqunlock(kqwq);
+}
 
-       kq_req_held(kq);
+workq_threadreq_t
+kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
+{
+       assert(qos_index < KQWQ_NBUCKETS);
+       return &kqwq->kqwq_request[qos_index];
+}
 
-       if (kq->kq_state & KQ_WORKLOOP) {
-               __assert_only struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+static void
+knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
+{
+       kq_index_t qos = _pthread_priority_thread_qos(pp);
 
-               assert(kqwl->kqwl_owner == THREAD_NULL);
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
-                   kqwl->kqwl_dynamicid, 0, qos, kqr->kqr_state);
-               ts = kqwl->kqwl_turnstile;
+       if (kqu.kq->kq_state & KQ_WORKLOOP) {
+               assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
+               pp = _pthread_priority_normalize(pp);
+       } else if (kqu.kq->kq_state & KQ_WORKQ) {
+               if (qos == THREAD_QOS_UNSPECIFIED) {
+                       /* On workqueues, outside of QoS means MANAGER */
+                       qos = KQWQ_QOS_MANAGER;
+                       pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
+               } else {
+                       pp = _pthread_priority_normalize(pp);
+               }
        } else {
-               assert(kq->kq_state & KQ_WORKQ);
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST),
-                   -1, 0, qos, kqr->kqr_state);
+               pp = _pthread_unspecified_priority();
+               qos = THREAD_QOS_UNSPECIFIED;
+       }
+
+       kn->kn_qos = pp;
+
+       if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
+               /* Never lower QoS when in "Merge" mode */
+               kn->kn_qos_override = qos;
+       }
+
+       /* only adjust in-use qos index when not suppressed */
+       if (kn->kn_status & KN_SUPPRESSED) {
+               kqueue_update_override(kqu, kn, qos);
+       } else if (kn->kn_qos_index != qos) {
+               knote_dequeue(kqu, kn);
+               kn->kn_qos_index = qos;
        }
+}
+
+static void
+knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
+{
+       thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;
 
-       kqr->kqr_state |= KQR_THREQUESTED;
+       kqlock_held(kq);
+
+       assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
+       assert(qos_index < THREAD_QOS_LAST);
 
        /*
-        * New-style thread request supported.
-        * Provide the pthread kext a pointer to a workq_threadreq_s structure for
-        * its use until a corresponding kqueue_threadreq_bind callback.
+        * Early exit for knotes that should not change QoS
         */
-       if (kqueue_threadreq_can_use_ast(kq)) {
-               flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
+       if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
+               panic("filter %d cannot change QoS", kn->kn_filtid);
+       } else if (__improbable(!knote_has_qos(kn))) {
+               return;
        }
-       if (qos == KQWQ_QOS_MANAGER) {
-               qos = WORKQ_THREAD_QOS_MANAGER;
+
+       /*
+        * knotes with the FALLBACK flag will only use their registration QoS if the
+        * incoming event has no QoS; otherwise the registration QoS acts as a floor.
+        */
+       thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
+       if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
+               if (qos_index == THREAD_QOS_UNSPECIFIED) {
+                       qos_index = req_qos;
+               }
+       } else {
+               if (qos_index < req_qos) {
+                       qos_index = req_qos;
+               }
        }
-       if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) {
+       if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
+               /* Never lower QoS when in "Merge" mode */
+               return;
+       }
+
+       if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
                /*
-                * Process is shutting down or exec'ing.
-                * All the kqueues are going to be cleaned up
-                * soon. Forget we even asked for a thread -
-                * and make sure we don't ask for more.
+                * When we're trying to update the QoS override while both an
+                * f_event() and other f_* calls are running concurrently, any of these
+                * in flight calls may want to perform overrides that aren't properly
+                * serialized with each other.
+                *
+                * The first update that observes this racy situation enters a "Merge"
+                * mode which causes subsequent override requests to saturate the
+                * override instead of replacing its value.
+                *
+                * This mode is left when knote_unlock() or knote_post()
+                * observe that no other f_* routine is in flight.
                 */
-               kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
-               kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
+               kn->kn_status |= KN_MERGE_QOS;
        }
-}
 
-/*
- * kqueue_threadreq_bind_prepost - prepost the bind to kevent
- *
- * This is used when kqueue_threadreq_bind may cause a lock inversion.
- */
-void
-kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t req,
-    thread_t thread)
-{
-       struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
-       struct uthread *ut = get_bsdthread_info(thread);
+       /*
+        * Now apply the override if it changed.
+        */
 
-       req->tr_binding_thread = thread;
-       ut->uu_kqr_bound = kqr;
-       req->tr_state = TR_STATE_BINDING;
+       if (kn->kn_qos_override == qos_index) {
+               return;
+       }
 
-       struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
-       if (kqwl && kqwl->kqwl_turnstile) {
-               struct turnstile *ts = kqwl->kqwl_turnstile;
+       kn->kn_qos_override = qos_index;
+
+       if (kn->kn_status & KN_SUPPRESSED) {
                /*
-                * While a thread request is in flight, the workqueue
-                * is the interlock for the turnstile and can update the inheritor.
+                * For suppressed events, the kn_qos_index field cannot be touched as it
+                * allows us to know on which suppress queue the knote is for a kqworkq.
+                *
+                * Also, there's no natural push applied on the kqueues when this field
+                * changes anyway. We hence need to apply manual overrides in this case,
+                * which will be cleared when the events are later acknowledged.
                 */
-               turnstile_update_inheritor(ts, thread, TURNSTILE_IMMEDIATE_UPDATE |
-                   TURNSTILE_INHERITOR_THREAD);
-               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+               kqueue_update_override(kq, kn, qos_index);
+       } else if (kn->kn_qos_index != qos_index) {
+               knote_dequeue(kq, kn);
+               kn->kn_qos_index = qos_index;
        }
 }
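
A minimal userspace sketch of the "Merge" rule described above, in which
override updates saturate (only ever raise the value) instead of replacing
it while concurrent f_* calls are in flight. merge_override is an invented
name for illustration, not an XNU identifier:

#include <stdbool.h>
#include <stdio.h>

static unsigned
merge_override(unsigned cur, unsigned req, bool merge_mode)
{
        if (merge_mode && req < cur) {
                return cur;                  /* never lower QoS while merging */
        }
        return req;                          /* otherwise replace the value */
}

int
main(void)
{
        unsigned ov = 2;
        ov = merge_override(ov, 4, true);    /* raises to 4 */
        ov = merge_override(ov, 1, true);    /* saturates: stays 4 */
        ov = merge_override(ov, 1, false);   /* merge mode left: drops to 1 */
        printf("%u\n", ov);                  /* prints 1 */
        return 0;
}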
 
 /*
- * kqueue_threadreq_bind_commit - commit a bind prepost
+ * Called back from waitq code when no threads are waiting and the hook was set.
  *
- * The workq code has to commit any binding prepost before the thread has
- * a chance to come back to userspace (and do kevent syscalls) or be aborted.
+ * Preemption is disabled - minimal work can be done in this context!!!
  */
 void
-kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
+waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *kq_hook)
 {
-       struct uthread *ut = get_bsdthread_info(thread);
-       struct kqrequest *kqr = ut->uu_kqr_bound;
-       kqueue_t kqu = kqr_kqueue(p, kqr);
+       kqueue_t kqu;
 
-       kq_req_lock(kqu);
-       if (kqr->kqr_req.tr_state == TR_STATE_BINDING) {
-               kqueue_threadreq_bind(p, &kqr->kqr_req, thread, 0);
+       kqu.kq = __container_of(kq_hook, struct kqueue, kq_waitq_hook);
+       assert(kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
+
+       kqlock(kqu);
+
+       if (kqu.kq->kq_count > 0) {
+               if (kqu.kq->kq_state & KQ_WORKLOOP) {
+                       kqworkloop_wakeup(kqu.kqwl, KQWL_BUCKET_STAYACTIVE);
+               } else {
+                       kqworkq_wakeup(kqu.kqwq, KQWQ_QOS_MANAGER);
+               }
        }
-       kq_req_unlock(kqu);
+
+       kqunlock(kqu);
 }
 
-static void
-kqueue_threadreq_modify(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos)
+void
+klist_init(struct klist *list)
 {
-       assert(kqr->kqr_state & KQR_THREQUESTED);
-       assert(kqr->kqr_thread == THREAD_NULL);
-
-       kq_req_held(kq);
-
-       int flags = 0;
-       if (kqueue_threadreq_can_use_ast(kq)) {
-               flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
-       }
-       workq_kern_threadreq_modify(kq->kq_p, kqr, qos, flags);
+       SLIST_INIT(list);
 }
 
+
 /*
- * kqueue_threadreq_bind - bind thread to processing kqrequest
+ * Query/Post each knote in the object's list
  *
- * The provided thread will be responsible for delivering events
- * associated with the given kqrequest.  Bind it and get ready for
- * the thread to eventually arrive.
+ *     The object lock protects the list. It is assumed
+ *     that the filter/event routine for the object can
+ *     determine that the object is already locked (via
+ *     the hint) and not deadlock itself.
+ *
+ *     The object lock should also hold off pending
+ *     detach/drop operations.
  */
 void
-kqueue_threadreq_bind(struct proc *p, workq_threadreq_t req, thread_t thread,
-    unsigned int flags)
+knote(struct klist *list, long hint)
 {
-       struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
-       kqueue_t kqu = kqr_kqueue(p, kqr);
-       struct uthread *ut = get_bsdthread_info(thread);
-
-       kq_req_held(kqu);
-
-       assert(kqr->kqr_state & KQR_THREQUESTED);
-       assert(kqr->kqr_thread == THREAD_NULL);
-       assert(ut->uu_kqueue_override == 0);
+       struct knote *kn;
 
-       if (kqr->kqr_req.tr_state == TR_STATE_BINDING) {
-               assert(ut->uu_kqr_bound == kqr);
-               assert(kqr->kqr_req.tr_binding_thread == thread);
-               kqr->kqr_req.tr_state = TR_STATE_IDLE;
-               kqr->kqr_req.tr_binding_thread = NULL;
-       } else {
-               assert(ut->uu_kqr_bound == NULL);
+       SLIST_FOREACH(kn, list, kn_selnext) {
+               knote_post(kn, hint);
        }
+}
 
-       ut->uu_kqr_bound = kqr;
-       kqr->kqr_thread = thread;
+/*
+ * attach a knote to the specified list.  Return true if this is the first entry.
+ * The list is protected by whatever lock the object it is associated with uses.
+ */
+int
+knote_attach(struct klist *list, struct knote *kn)
+{
+       int ret = SLIST_EMPTY(list);
+       SLIST_INSERT_HEAD(list, kn, kn_selnext);
+       return ret;
+}
 
-       if (kqu.kq->kq_state & KQ_WORKLOOP) {
-               struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
+/*
+ * detach a knote from the specified list.  Return true if that was the last entry.
+ * The list is protected by whatever lock the object it is associated with uses.
+ */
+int
+knote_detach(struct klist *list, struct knote *kn)
+{
+       SLIST_REMOVE(list, kn, knote, kn_selnext);
+       return SLIST_EMPTY(list);
+}
 
-               if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
-                       /*
-                        * <rdar://problem/38626999> shows that asserting here is not ok.
-                        *
-                        * This is not supposed to happen for correct use of the interface,
-                        * but it is sadly possible for userspace (with the help of memory
-                        * corruption, such as over-release of a dispatch queue) to make
-                        * the creator thread the "owner" of a workloop.
-                        *
-                        * Once that happens, and that creator thread picks up the same
-                        * workloop as a servicer, we trip this codepath. We need to fixup
-                        * the state to forget about this thread being the owner, as the
-                        * entire workloop state machine expects servicers to never be
-                        * owners and everything would basically go downhill from here.
-                        */
-                       kqu.kqwl->kqwl_owner = THREAD_NULL;
-                       if (kqworkloop_owner_override(kqu.kqwl)) {
-                               thread_drop_ipc_override(thread);
-                       }
-                       thread_ends_owning_workloop(thread);
-               }
+/*
+ * knote_vanish - Indicate that the source has vanished
+ *
+ * If the knote has requested EV_VANISHED delivery,
+ * arrange for that. Otherwise, deliver a NOTE_REVOKE
+ * event for backward compatibility.
+ *
+ * The knote is marked as having vanished, but is not
+ * actually detached from the source in this instance.
+ * The actual detach is deferred until the knote drop.
+ *
+ * Our caller already has the object lock held. Calling
+ * the detach routine would try to take that lock
+ * recursively - which likely is not supported.
+ */
+void
+knote_vanish(struct klist *list, bool make_active)
+{
+       struct knote *kn;
+       struct knote *kn_next;
 
-               if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) {
+       SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
+               struct kqueue *kq = knote_get_kq(kn);
+
+               kqlock(kq);
+               if (__probable(kn->kn_status & KN_REQVANISH)) {
                        /*
-                        * Past this point, the interlock is the kq req lock again,
-                        * so we can fix the inheritor for good.
+                        * If EV_VANISHED delivery was requested - prepare to deliver one
                         */
-                       filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
-                       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+                       kn->kn_status |= KN_VANISHED;
+               } else {
+                       /*
+                        * Handle the legacy way to indicate that the port/portset was
+                        * deallocated or left the current Mach portspace (modern technique
+                        * is with an EV_VANISHED protocol).
+                        *
+                        * Deliver an EV_EOF event for these changes (hopefully it will get
+                        * delivered before the port name recycles to the same generation
+                        * count and someone tries to re-register a kevent for it, or the
+                        * events are udata-specific - avoiding a conflict).
+                        */
+                       kn->kn_flags |= EV_EOF | EV_ONESHOT;
                }
-
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
-                   thread_tid(thread), kqr->kqr_qos_index,
-                   (kqr->kqr_override_index << 16) | kqr->kqr_state);
-
-               ut->uu_kqueue_override = kqr->kqr_override_index;
-               if (kqr->kqr_override_index) {
-                       thread_add_ipc_override(thread, kqr->kqr_override_index);
+               if (make_active) {
+                       knote_activate(kq, kn, FILTER_ACTIVE);
                }
-       } else {
-               assert(kqr->kqr_override_index == 0);
-
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
-                   thread_tid(thread), kqr->kqr_qos_index,
-                   (kqr->kqr_override_index << 16) | kqr->kqr_state);
+               kqunlock(kq);
        }
 }
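
A compact userspace model of the dichotomy above, with invented flag values
standing in for the kn_status/kn_flags bits: registrations that asked for
EV_VANISHED delivery get the vanished mark, legacy ones get
EV_EOF | EV_ONESHOT instead:

#include <stdio.h>

#define REQVANISH 0x1    /* KN_REQVANISH stand-in */
#define VANISHED  0x2    /* KN_VANISHED stand-in */
#define EOFFLAG   0x4    /* EV_EOF stand-in */
#define ONESHOT   0x8    /* EV_ONESHOT stand-in */

static unsigned
vanish(unsigned bits)
{
        if (bits & REQVANISH) {
                return bits | VANISHED;          /* modern: EV_VANISHED */
        }
        return bits | EOFFLAG | ONESHOT;         /* legacy: EV_EOF, one shot */
}

int
main(void)
{
        printf("%#x %#x\n", vanish(REQVANISH), vanish(0));   /* 0x3 0xc */
        return 0;
}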
 
 /*
- * kqueue_threadreq_cancel - abort a pending thread request
+ * Force a lazy allocation of the waitqset link
+ * of the kq_wqs associated with the kn
+ * if it wasn't already allocated.
  *
- * Called when exiting/exec'ing. Forget our pending request.
+ * This allows knote_link_waitq to never block
+ * if reserved_link is not NULL.
  */
 void
-kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req)
+knote_link_waitqset_lazy_alloc(struct knote *kn)
 {
-       struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
-       kqueue_t kqu = kqr_kqueue(p, kqr);
-
-       kq_req_lock(kqu);
-
-       assert(kqr->kqr_thread == THREAD_NULL);
-       assert(kqr->kqr_state & KQR_THREQUESTED);
-       kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
-
-       kq_req_unlock(kqu);
-
-       kqueue_release_last(p, kqu); /* may dealloc kqu */
+       struct kqueue *kq = knote_get_kq(kn);
+       waitq_set_lazy_init_link(&kq->kq_wqs);
 }
 
-workq_threadreq_param_t
-kqueue_threadreq_workloop_param(workq_threadreq_t req)
+/*
+ * Check if a lazy allocation for the waitqset link
+ * of the kq_wqs is needed.
+ */
+boolean_t
+knote_link_waitqset_should_lazy_alloc(struct knote *kn)
 {
-       struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
-       struct kqworkloop *kqwl;
-       workq_threadreq_param_t trp;
-
-       assert(kqr->kqr_state & KQR_WORKLOOP);
-       kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
-       trp.trp_value = kqwl->kqwl_params;
-       return trp;
+       struct kqueue *kq = knote_get_kq(kn);
+       return waitq_set_should_lazy_init_link(&kq->kq_wqs);
 }
 
 /*
- *     kqueue_threadreq_unbind - unbind thread from processing kqueue
+ * For a given knote, link a provided wait queue directly with the kqueue.
+ * Wakeups will happen via recursive wait queue support.  But nothing will move
+ * the knote to the active list at wakeup (nothing calls knote()).  Instead,
+ * we permanently enqueue it here.
  *
- *     End processing the per-QoS bucket of events and allow other threads
- *     to be requested for future servicing.
+ * kqueue and knote references are held by caller.
+ * waitq locked by caller.
  *
- *     caller holds a reference on the kqueue.
+ * caller provides the wait queue link structure and ensures that the kq->kq_wqs
+ * is linked by previously calling knote_link_waitqset_lazy_alloc.
  */
-void
-kqueue_threadreq_unbind(struct proc *p, struct kqrequest *kqr)
+int
+knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
 {
-       if (kqr->kqr_state & KQR_WORKLOOP) {
-               kqworkloop_unbind(p, kqr_kqworkloop(kqr));
+       struct kqueue *kq = knote_get_kq(kn);
+       kern_return_t kr;
+
+       kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
+       if (kr == KERN_SUCCESS) {
+               knote_markstayactive(kn);
+               return 0;
        } else {
-               kqworkq_unbind(p, kqr);
+               return EINVAL;
        }
 }
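
The lazy-alloc/link pair follows a reserve-then-commit pattern: everything
that may block happens before the waitq lock is taken, so the link step
under the lock never blocks. A hedged userspace sketch of that pattern,
where link_reserve/link_commit are invented and a pthread mutex stands in
for the waitq lock:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct link { uint64_t id; };

static pthread_mutex_t waitq_lock = PTHREAD_MUTEX_INITIALIZER;

/* reserve: may allocate (and thus block), done before any lock is held */
static struct link *
link_reserve(void)
{
        return malloc(sizeof(struct link));
}

/* commit: consumes the reservation, guaranteed not to block */
static void
link_commit(struct link *l, uint64_t id)
{
        l->id = id;                        /* no allocation under the lock */
}

int
main(void)
{
        struct link *l = link_reserve();   /* lazy_alloc analogue */
        if (l == NULL) {
                return 1;
        }
        pthread_mutex_lock(&waitq_lock);   /* caller holds the waitq lock */
        link_commit(l, 42);                /* knote_link_waitq analogue */
        pthread_mutex_unlock(&waitq_lock);
        printf("linked %llu\n", (unsigned long long)l->id);
        free(l);
        return 0;
}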
 
 /*
- * If we aren't already busy processing events [for this QoS],
- * request workq thread support as appropriate.
+ * Unlink the provided wait queue from the kqueue associated with a knote.
+ * Also remove it from the magic list of directly attached knotes.
  *
- * TBD - for now, we don't segregate out processing by QoS.
+ * Note that the unlink may have already happened from the other side, so
+ * ignore any failures to unlink and just remove it from the kqueue list.
  *
- * - May be called with the kqueue's wait queue set locked,
- *   so cannot do anything that could recurse on that.
+ * On success, caller is responsible for the link structure
  */
-static void
-kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index)
+int
+knote_unlink_waitq(struct knote *kn, struct waitq *wq)
 {
-       struct kqrequest *kqr;
-
-       /* convert to thread qos value */
-       assert(qos_index < KQWQ_NBUCKETS);
-
-       kq_req_lock(kqwq);
-       kqr = kqworkq_get_request(kqwq, qos_index);
-
-       if ((kqr->kqr_state & KQR_WAKEUP) == 0) {
-               kqr->kqr_state |= KQR_WAKEUP;
-               if ((kqr->kqr_state & KQR_THREQUESTED) == 0) {
-                       kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
-               }
-       }
-       kq_req_unlock(kqwq);
-}
+       struct kqueue *kq = knote_get_kq(kn);
+       kern_return_t kr;
 
-static kq_index_t
-kqworkloop_owner_override(struct kqworkloop *kqwl)
-{
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       return MAX(kqr->kqr_qos_index, kqr->kqr_override_index);
+       kr = waitq_unlink(wq, &kq->kq_wqs);
+       knote_clearstayactive(kn);
+       return (kr != KERN_SUCCESS) ? EINVAL : 0;
 }
 
-static inline void
-kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
+/*
+ * remove all knotes referencing a specified fd
+ *
+ * Entered with the proc_fd lock already held.
+ * It returns the same way, but may drop it temporarily.
+ */
+void
+knote_fdclose(struct proc *p, int fd)
 {
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-
-       kq_req_held(kqwl);
-
-       if (kqr->kqr_state & KQR_R2K_NOTIF_ARMED) {
-               assert(kqr->kqr_thread);
-               kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED;
-               act_set_astkevent(kqr->kqr_thread, AST_KEVENT_RETURN_TO_KERNEL);
-       }
-}
+       struct klist *list;
+       struct knote *kn;
+       KNOTE_LOCK_CTX(knlc);
 
-static void
-kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
-{
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       struct kqueue *kq = &kqwl->kqwl_kqueue;
-       kq_index_t old_owner_override = kqworkloop_owner_override(kqwl);
-       kq_index_t i;
+restart:
+       list = &p->p_fd->fd_knlist[fd];
+       SLIST_FOREACH(kn, list, kn_link) {
+               struct kqueue *kq = knote_get_kq(kn);
 
-       /* must hold the kqr lock */
-       kq_req_held(kqwl);
+               kqlock(kq);
 
-       switch (op) {
-       case KQWL_UTQ_UPDATE_WAKEUP_QOS:
-               if (qos == KQWL_BUCKET_STAYACTIVE) {
-                       /*
-                        * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket, we only remember
-                        * a high watermark (kqr_stayactive_qos) of any stay active knote
-                        * that was ever registered with this workloop.
-                        *
-                        * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active
-                        * knote, we use this high-watermark as a wakeup-index, and also set
-                        * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember
-                        * there is at least one stay active knote fired until the next full
-                        * processing of this bucket.
-                        */
-                       kqr->kqr_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT;
-                       qos = kqr->kqr_stayactive_qos;
-                       assert(qos);
+               if (kq->kq_p != p) {
+                       panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
+                           __func__, kq->kq_p, p);
                }
-               if (kqr->kqr_wakeup_indexes & (1 << qos)) {
-                       assert(kqr->kqr_state & KQR_WAKEUP);
-                       break;
+
+               /*
+                * If the knote supports EV_VANISHED delivery,
+                * transition it to vanished mode (or skip over
+                * it if already vanished).
+                */
+               if (kn->kn_status & KN_VANISHED) {
+                       kqunlock(kq);
+                       continue;
                }
 
-               kqr->kqr_wakeup_indexes |= (1 << qos);
-               kqr->kqr_state |= KQR_WAKEUP;
-               kqworkloop_request_fire_r2k_notification(kqwl);
-               goto recompute;
+               proc_fdunlock(p);
+               if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
+                       /* the knote was dropped by someone, nothing to do */
+               } else if (kn->kn_status & KN_REQVANISH) {
+                       kn->kn_status |= KN_VANISHED;
 
-       case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
-               assert(qos);
-               if (kqr->kqr_stayactive_qos < qos) {
-                       kqr->kqr_stayactive_qos = qos;
-                       if (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) {
-                               assert(kqr->kqr_state & KQR_WAKEUP);
-                               kqr->kqr_wakeup_indexes |= (1 << qos);
-                               goto recompute;
+                       kqunlock(kq);
+                       knote_fops(kn)->f_detach(kn);
+                       if (kn->kn_is_fd) {
+                               fp_drop(p, kn->kn_id, kn->kn_fp, 0);
                        }
-               }
-               break;
+                       kn->kn_filtid = EVFILTID_DETACHED;
+                       kqlock(kq);
 
-       case KQWL_UTQ_PARKING:
-       case KQWL_UTQ_UNBINDING:
-               kqr->kqr_override_index = qos;
-       /* FALLTHROUGH */
-       case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
-               if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
-                       assert(qos == THREAD_QOS_UNSPECIFIED);
-               }
-               kqlock_held(kqwl); // to look at kq_queues
-               i = KQWL_BUCKET_STAYACTIVE;
-               if (TAILQ_EMPTY(&kqr->kqr_suppressed)) {
-                       kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
-               }
-               if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) &&
-                   (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) {
-                       /*
-                        * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
-                        * knote may have fired, so we need to merge in kqr_stayactive_qos.
-                        *
-                        * Unlike other buckets, this one is never empty but could be idle.
-                        */
-                       kqr->kqr_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT;
-                       kqr->kqr_wakeup_indexes |= (1 << kqr->kqr_stayactive_qos);
-               } else {
-                       kqr->kqr_wakeup_indexes = 0;
-               }
-               for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) {
-                       if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) {
-                               kqr->kqr_wakeup_indexes |= (1 << i);
-                       }
-               }
-               if (kqr->kqr_wakeup_indexes) {
-                       kqr->kqr_state |= KQR_WAKEUP;
-                       kqworkloop_request_fire_r2k_notification(kqwl);
+                       knote_activate(kq, kn, FILTER_ACTIVE);
+                       knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
                } else {
-                       kqr->kqr_state &= ~KQR_WAKEUP;
-               }
-               goto recompute;
-
-       case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
-               kqr->kqr_override_index = qos;
-               goto recompute;
-
-       case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
-recompute:
-               /*
-                * When modifying the wakeup QoS or the override QoS, we always need to
-                * maintain our invariant that kqr_override_index is at least as large
-                * as the highest QoS for which an event is fired.
-                *
-                * However this override index can be larger when there is an overriden
-                * suppressed knote pushing on the kqueue.
-                */
-               if (kqr->kqr_wakeup_indexes > (1 << qos)) {
-                       qos = fls(kqr->kqr_wakeup_indexes) - 1; /* fls is 1-based */
-               }
-               if (kqr->kqr_override_index < qos) {
-                       kqr->kqr_override_index = qos;
+                       knote_drop(kq, kn, &knlc);
                }
-               break;
-
-       case KQWL_UTQ_REDRIVE_EVENTS:
-               break;
 
-       case KQWL_UTQ_SET_QOS_INDEX:
-               kqr->kqr_qos_index = qos;
-               break;
-
-       default:
-               panic("unknown kqwl thread qos update operation: %d", op);
+               proc_fdlock(p);
+               goto restart;
        }
+}
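
knote_fdclose() uses the classic drop-the-lock-and-restart idiom: because
proc_fdlock is released around the detach work, the list may have changed,
so the walk restarts from the head after every mutation. A simplified
userspace sketch with invented types and no real locking:

#include <sys/queue.h>
#include <stdio.h>

struct node { SLIST_ENTRY(node) link; int busy; };
SLIST_HEAD(nlist, node);

static void
close_all(struct nlist *list)
{
        struct node *n;
restart:
        SLIST_FOREACH(n, list, link) {
                if (n->busy) {          /* "already vanished": skip over */
                        continue;
                }
                /* the real code drops the lock, detaches, relocks here */
                SLIST_REMOVE(list, n, node, link);
                goto restart;           /* list may have changed: start over */
        }
}

int
main(void)
{
        struct nlist list = SLIST_HEAD_INITIALIZER(list);
        struct node a = { .busy = 0 }, b = { .busy = 1 }, c = { .busy = 0 };
        struct node *n;

        SLIST_INSERT_HEAD(&list, &a, link);
        SLIST_INSERT_HEAD(&list, &b, link);
        SLIST_INSERT_HEAD(&list, &c, link);
        close_all(&list);
        SLIST_FOREACH(n, &list, link) {
                printf("left: busy=%d\n", n->busy);   /* only the busy node */
        }
        return 0;
}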
 
-       thread_t kqwl_owner = kqwl->kqwl_owner;
-       thread_t servicer = kqr->kqr_thread;
-       boolean_t qos_changed = FALSE;
-       kq_index_t new_owner_override = kqworkloop_owner_override(kqwl);
+/*
+ * knote_fdfind - lookup a knote in the fd table for process
+ *
+ * If the filter is file-based, lookup based on fd index.
+ * Otherwise use a hash based on the ident.
+ *
+ * Matching is based on kq, filter, and ident. Optionally,
+ * it may also be based on the udata field in the kevent -
+ * allowing multiple event registration for the file object
+ * per kqueue.
+ *
+ * fd_knhashlock or fdlock held on entry (and exit)
+ */
+static struct knote *
+knote_fdfind(struct kqueue *kq,
+    const struct kevent_internal_s *kev,
+    bool is_fd,
+    struct proc *p)
+{
+       struct filedesc *fdp = p->p_fd;
+       struct klist *list = NULL;
+       struct knote *kn = NULL;
 
        /*
-        * Apply the diffs to the owner if applicable
+        * determine where to look for the knote
         */
-       if (kqwl_owner) {
-#if 0
-               /* JMM - need new trace hooks for owner overrides */
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
-                   kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->kqr_qos_index,
-                   (kqr->kqr_override_index << 16) | kqr->kqr_state);
-#endif
-               if (new_owner_override == old_owner_override) {
-                       // nothing to do
-               } else if (old_owner_override == THREAD_QOS_UNSPECIFIED) {
-                       thread_add_ipc_override(kqwl_owner, new_owner_override);
-               } else if (new_owner_override == THREAD_QOS_UNSPECIFIED) {
-                       thread_drop_ipc_override(kqwl_owner);
-               } else { /*  old_owner_override != new_owner_override */
-                       thread_update_ipc_override(kqwl_owner, new_owner_override);
+       if (is_fd) {
+               /* fd-based knotes are linked off the fd table */
+               if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
+                       list = &fdp->fd_knlist[kev->kei_ident];
                }
+       } else if (fdp->fd_knhashmask != 0) {
+               /* hash non-fd knotes here too */
+               list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
        }
 
        /*
-        * apply the diffs to the servicer
+        * scan the selected list looking for a match
         */
-       if ((kqr->kqr_state & KQR_THREQUESTED) == 0) {
-               /*
-                * No servicer, nor thread-request
-                *
-                * Make a new thread request, unless there is an owner (or the workloop
-                * is suspended in userland) or if there is no asynchronous work in the
-                * first place.
-                */
-
-               if (kqwl_owner == NULL && (kqr->kqr_state & KQR_WAKEUP)) {
-                       int initiate_flags = 0;
-                       if (op == KQWL_UTQ_UNBINDING) {
-                               initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
-                       }
-                       kqueue_threadreq_initiate(kq, kqr, new_owner_override,
-                           initiate_flags);
-               }
-       } else if (servicer) {
-               /*
-                * Servicer in flight
-                *
-                * Just apply the diff to the servicer
-                */
-               struct uthread *ut = get_bsdthread_info(servicer);
-               if (ut->uu_kqueue_override != kqr->kqr_override_index) {
-                       if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
-                               thread_add_ipc_override(servicer, kqr->kqr_override_index);
-                       } else if (kqr->kqr_override_index == THREAD_QOS_UNSPECIFIED) {
-                               thread_drop_ipc_override(servicer);
-                       } else { /* ut->uu_kqueue_override != kqr->kqr_override_index */
-                               thread_update_ipc_override(servicer, kqr->kqr_override_index);
+       if (list != NULL) {
+               SLIST_FOREACH(kn, list, kn_link) {
+                       if (kq == knote_get_kq(kn) &&
+                           kev->kei_ident == kn->kn_id &&
+                           kev->kei_filter == kn->kn_filter) {
+                               if (kev->kei_flags & EV_UDATA_SPECIFIC) {
+                                       if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
+                                           kev->kei_udata == kn->kn_udata) {
+                                               break; /* matching udata-specific knote */
+                                       }
+                               } else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
+                                       break; /* matching non-udata-specific knote */
+                               }
                        }
-                       ut->uu_kqueue_override = kqr->kqr_override_index;
-                       qos_changed = TRUE;
                }
-       } else if (new_owner_override == THREAD_QOS_UNSPECIFIED) {
-               /*
-                * No events to deliver anymore.
-                *
-                * However canceling with turnstiles is challenging, so the fact that
-                * the request isn't useful will be discovered by the servicer himself
-                * later on.
-                */
-       } else if (old_owner_override != new_owner_override) {
-               /*
-                * Request is in flight
-                *
-                * Apply the diff to the thread request
-                */
-               kqueue_threadreq_modify(kq, kqr, new_owner_override);
-               qos_changed = TRUE;
-       }
-
-       if (qos_changed) {
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
-                   thread_tid(kqr->kqr_thread), kqr->kqr_qos_index,
-                   (kqr->kqr_override_index << 16) | kqr->kqr_state);
        }
+       return kn;
 }
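
The matching rule condenses to: (kq, ident, filter) must always match, and
udata must match as well when, and only when, both the registration and the
lookup carry EV_UDATA_SPECIFIC. A small userspace model (mknote and matches
are invented; the EV_UDATA_SPECIFIC value is the <sys/event.h> one):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EV_UDATA_SPECIFIC 0x0100    /* unique kevent per udata value */

struct mknote {
        void     *kq;
        uint64_t ident;
        int16_t  filter;
        uint16_t flags;
        uint64_t udata;
};

static bool
matches(const struct mknote *kn, void *kq, uint64_t ident, int16_t filter,
    uint16_t flags, uint64_t udata)
{
        if (kn->kq != kq || kn->ident != ident || kn->filter != filter) {
                return false;
        }
        if (flags & EV_UDATA_SPECIFIC) {
                return (kn->flags & EV_UDATA_SPECIFIC) && kn->udata == udata;
        }
        return (kn->flags & EV_UDATA_SPECIFIC) == 0;
}

int
main(void)
{
        struct mknote kn = {
                .kq = (void *)1, .ident = 3, .filter = -1,
                .flags = EV_UDATA_SPECIFIC, .udata = 7,
        };
        printf("%d %d\n",
            matches(&kn, (void *)1, 3, -1, EV_UDATA_SPECIFIC, 7),   /* 1 */
            matches(&kn, (void *)1, 3, -1, 0, 0));                  /* 0 */
        return 0;
}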
 
-static void
-kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index)
+/*
+ * kq_add_knote - Add knote to the fd table for process
+ * while checking for duplicates.
+ *
+ * All file-based filters associate a list of knotes by file
+ * descriptor index. All other filters hash the knote by ident.
+ *
+ * May have to grow the table of knote lists to cover the
+ * file descriptor index presented.
+ *
+ * fd_knhashlock and fdlock unheld on entry (and exit).
+ *
+ * Takes a rwlock boost if inserting the knote is successful.
+ */
+static int
+kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
+    struct proc *p)
 {
-       /* convert to thread qos value */
-       assert(qos_index < KQWL_NBUCKETS);
-
-       kq_req_lock(kqwl);
-       kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos_index);
-       kq_req_unlock(kqwl);
-}
+       struct filedesc *fdp = p->p_fd;
+       struct klist *list = NULL;
+       int ret = 0;
+       bool is_fd = kn->kn_is_fd;
 
-static struct kqtailq *
-kqueue_get_queue(struct kqueue *kq, kq_index_t qos_index)
-{
-       if (kq->kq_state & KQ_WORKQ) {
-               assert(qos_index < KQWQ_NBUCKETS);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               assert(qos_index < KQWL_NBUCKETS);
+       if (is_fd) {
+               proc_fdlock(p);
        } else {
-               assert(qos_index == QOS_INDEX_KQFILE);
+               knhash_lock(fdp);
        }
-       static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue),
-           "struct kqueue::kq_queue must be exactly at the end");
-       return &kq->kq_queue[qos_index];
-}
-
-static int
-kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index)
-{
-       return TAILQ_EMPTY(kqueue_get_queue(kq, qos_index));
-}
 
-static struct kqtailq *
-kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
-{
-       if (kq.kq->kq_state & KQ_WORKQ) {
-               return &kqworkq_get_request(kq.kqwq, kn->kn_qos_index)->kqr_suppressed;
-       } else if (kq.kq->kq_state & KQ_WORKLOOP) {
-               return &kq.kqwl->kqwl_request.kqr_suppressed;
-       } else {
-               return &kq.kqf->kqf_suppressed;
+       if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
+               /* found an existing knote: we can't add this one */
+               ret = ERESTART;
+               goto out_locked;
        }
-}
 
-static struct turnstile *
-kqueue_get_turnstile(kqueue_t kqu, bool can_alloc)
-{
-       uint8_t kqr_state;
+       /* knote was not found: add it now */
+       if (!is_fd) {
+               if (fdp->fd_knhashmask == 0) {
+                       u_long size = 0;
 
-       if ((kqu.kq->kq_state & KQ_WORKLOOP) == 0) {
-               return TURNSTILE_NULL;
-       }
+                       list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
+                       if (list == NULL) {
+                               ret = ENOMEM;
+                               goto out_locked;
+                       }
 
-       kqr_state = os_atomic_load(&kqu.kqwl->kqwl_request.kqr_state, relaxed);
-       if (kqr_state & KQR_ALLOCATED_TURNSTILE) {
-               /* force a dependency to pair with the atomic or with release below */
-               return os_atomic_load_with_dependency_on(&kqu.kqwl->kqwl_turnstile,
-                          kqr_state);
-       }
+                       fdp->fd_knhash = list;
+                       fdp->fd_knhashmask = size;
+               }
 
-       if (!can_alloc) {
-               return TURNSTILE_NULL;
-       }
+               list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
+               SLIST_INSERT_HEAD(list, kn, kn_link);
+               ret = 0;
+               goto out_locked;
+       } else {
+               /* knote is fd based */
 
-       struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
+               if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
+                       u_int size = 0;
 
-       kq_req_lock(kqu);
-       if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) {
-               workq_kern_threadreq_lock(kqu.kqwl->kqwl_p);
-       }
+                       if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
+                           || kn->kn_id >= (uint64_t)maxfiles) {
+                               ret = EINVAL;
+                               goto out_locked;
+                       }
+                       /* have to grow the fd_knlist */
+                       size = fdp->fd_knlistsize;
+                       while (size <= kn->kn_id) {
+                               size += KQEXTENT;
+                       }
 
-       if (kqu.kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) {
-               free_ts = ts;
-               ts = kqu.kqwl->kqwl_turnstile;
-       } else {
-               ts = turnstile_prepare((uintptr_t)kqu.kqwl, &kqu.kqwl->kqwl_turnstile,
-                   ts, TURNSTILE_WORKLOOPS);
+                       if (size >= (UINT_MAX / sizeof(struct klist *))) {
+                               ret = EINVAL;
+                               goto out_locked;
+                       }
 
-               /* release-barrier to pair with the unlocked load of kqwl_turnstile above */
-               os_atomic_or(&kqu.kqwl->kqwl_request.kqr_state,
-                   KQR_ALLOCATED_TURNSTILE, release);
-       }
+                       MALLOC(list, struct klist *,
+                           size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
+                       if (list == NULL) {
+                               ret = ENOMEM;
+                               goto out_locked;
+                       }
 
-       if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) {
-               workq_kern_threadreq_unlock(kqu.kqwl->kqwl_p);
-       }
-       kq_req_unlock(kqu.kqwl);
+                       bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
+                           fdp->fd_knlistsize * sizeof(struct klist *));
+                       bzero((caddr_t)list +
+                           fdp->fd_knlistsize * sizeof(struct klist *),
+                           (size - fdp->fd_knlistsize) * sizeof(struct klist *));
+                       FREE(fdp->fd_knlist, M_KQUEUE);
+                       fdp->fd_knlist = list;
+                       fdp->fd_knlistsize = size;
+               }
 
-       if (free_ts) {
-               turnstile_deallocate(free_ts);
+               list = &fdp->fd_knlist[kn->kn_id];
+               SLIST_INSERT_HEAD(list, kn, kn_link);
+               ret = 0;
+               goto out_locked;
        }
-       return ts;
-}
 
-struct turnstile *
-kqueue_turnstile(struct kqueue *kq)
-{
-       return kqueue_get_turnstile(kq, false);
-}
-
-struct turnstile *
-kqueue_alloc_turnstile(struct kqueue *kq)
-{
-       return kqueue_get_turnstile(kq, true);
-}
+out_locked:
+       if (ret == 0) {
+               kqlock(kq);
+               assert((kn->kn_status & KN_LOCKED) == 0);
+               (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
+               kqueue_retain(kq); /* retain a kq ref */
+       }
+       if (is_fd) {
+               proc_fdunlock(p);
+       } else {
+               knhash_unlock(fdp);
+       }
 
-static struct kqtailq *
-knote_get_queue(struct knote *kn)
-{
-       return kqueue_get_queue(knote_get_kq(kn), kn->kn_qos_index);
+       return ret;
 }
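
The fd_knlist growth loop rounds the table size up in fixed KQEXTENT steps
until it covers the fd being registered. A worked example, assuming the
customary KQEXTENT of 256:

#include <stdio.h>

#define KQEXTENT 256

int
main(void)
{
        unsigned size = 0;       /* fdp->fd_knlistsize to start */
        unsigned kn_id = 700;    /* fd being registered */

        while (size <= kn_id) {
                size += KQEXTENT;
        }
        printf("%u slots cover fd %u\n", size, kn_id);   /* 768 slots */
        return 0;
}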
 
+/*
+ * kq_remove_knote - remove a knote from the fd table for process
+ *
+ * If the filter is file-based, remove based on fd index.
+ * Otherwise remove from the hash based on the ident.
+ *
+ * fd_knhashlock and fdlock unheld on entry (and exit).
+ */
 static void
-knote_reset_priority(struct knote *kn, pthread_priority_t pp)
+kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
+    struct knote_lock_ctx *knlc)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       kq_index_t qos = _pthread_priority_thread_qos(pp);
-
-       assert((kn->kn_status & KN_QUEUED) == 0);
+       struct filedesc *fdp = p->p_fd;
+       struct klist *list = NULL;
+       uint16_t kq_state;
+       bool is_fd = kn->kn_is_fd;
 
-       if (kq->kq_state & KQ_WORKQ) {
-               if (qos == THREAD_QOS_UNSPECIFIED) {
-                       /* On workqueues, outside of QoS means MANAGER */
-                       qos = KQWQ_QOS_MANAGER;
-                       pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
-               } else {
-                       pp = _pthread_priority_normalize(pp);
-               }
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
-               pp = _pthread_priority_normalize(pp);
+       if (is_fd) {
+               proc_fdlock(p);
        } else {
-               pp = _pthread_unspecified_priority();
-               qos = THREAD_QOS_UNSPECIFIED;
+               knhash_lock(fdp);
        }
 
-       kn->kn_qos = pp;
-       kn->kn_req_index = qos;
+       if (is_fd) {
+               assert((u_int)fdp->fd_knlistsize > kn->kn_id);
+               list = &fdp->fd_knlist[kn->kn_id];
+       } else {
+               list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
+       }
+       SLIST_REMOVE(list, kn, knote, kn_link);
 
-       if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
-               /* Never lower QoS when in "Merge" mode */
-               kn->kn_qos_override = qos;
+       kqlock(kq);
+       kq_state = kq->kq_state;
+       if (knlc) {
+               knote_unlock_cancel(kq, kn, knlc);
+       } else {
+               kqunlock(kq);
+       }
+       if (is_fd) {
+               proc_fdunlock(p);
+       } else {
+               knhash_unlock(fdp);
        }
 
-       /* only adjust in-use qos index when not suppressed */
-       if ((kn->kn_status & KN_SUPPRESSED) == 0) {
-               kn->kn_qos_index = qos;
-       } else if (kq->kq_state & KQ_WORKQ) {
-               kqworkq_update_override((struct kqworkq *)kq, kn, qos);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               kqworkloop_update_override((struct kqworkloop *)kq, qos);
+       if (kq_state & KQ_DYNAMIC) {
+               kqworkloop_release((struct kqworkloop *)kq);
        }
 }
 
-static void
-knote_set_qos_overcommit(struct knote *kn)
+/*
+ * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
+ * and, if the knote is found, acquire the kqlock while holding the fd table lock/spinlock.
+ *
+ * fd_knhashlock or fdlock unheld on entry (and exit)
+ */
+
+static struct knote *
+kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
+    bool is_fd, struct proc *p)
 {
-       struct kqueue *kq = knote_get_kq(kn);
+       struct filedesc *fdp = p->p_fd;
+       struct knote *kn;
 
-       /* turn overcommit on for the appropriate thread request? */
-       if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
-           (kq->kq_state & KQ_WORKLOOP)) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-               struct kqrequest *kqr = &kqwl->kqwl_request;
+       if (is_fd) {
+               proc_fdlock(p);
+       } else {
+               knhash_lock(fdp);
+       }
 
-               /*
-                * This test is racy, but since we never remove this bit,
-                * it allows us to avoid taking a lock.
-                */
-               if (kqr->kqr_state & KQR_THOVERCOMMIT) {
-                       return;
-               }
+       /*
+        * Temporary horrible hack:
+        * this cast is gross and will go away in a future change.
+        * It is OK to do because we don't look at xflags/s_fflags,
+        * and when we cast the kev down this way,
+        * the truncated filter field still works.
+        */
+       kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);
 
-               kq_req_lock(kqwl);
-               kqr->kqr_state |= KQR_THOVERCOMMIT;
-               if (!kqr->kqr_thread && (kqr->kqr_state & KQR_THREQUESTED)) {
-                       kqueue_threadreq_modify(kq, kqr, kqr->kqr_req.tr_qos);
-               }
-               kq_req_unlock(kqwl);
+       if (kn) {
+               kqlock(kq);
+               assert(knote_get_kq(kn) == kq);
        }
-}
 
-static kq_index_t
-knote_get_qos_override_index(struct knote *kn)
-{
-       return kn->kn_qos_override;
+       if (is_fd) {
+               proc_fdunlock(p);
+       } else {
+               knhash_unlock(fdp);
+       }
+
+       return kn;
 }
 
+__attribute__((noinline))
 static void
-kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
-    kq_index_t override_index)
+kqfile_wakeup(struct kqfile *kqf, __unused kq_index_t qos)
 {
-       struct kqrequest *kqr;
-       kq_index_t old_override_index;
-       kq_index_t queue_index = kn->kn_qos_index;
-
-       if (override_index <= queue_index) {
-               return;
+       /* flag wakeups during processing */
+       if (kqf->kqf_state & KQ_PROCESSING) {
+               kqf->kqf_state |= KQ_WAKEUP;
        }
 
-       kqr = kqworkq_get_request(kqwq, queue_index);
-
-       kq_req_lock(kqwq);
-       old_override_index = kqr->kqr_override_index;
-       if (override_index > MAX(kqr->kqr_qos_index, old_override_index)) {
-               kqr->kqr_override_index = override_index;
-
-               /* apply the override to [incoming?] servicing thread */
-               if (kqr->kqr_thread) {
-                       if (old_override_index) {
-                               thread_update_ipc_override(kqr->kqr_thread, override_index);
-                       } else {
-                               thread_add_ipc_override(kqr->kqr_thread, override_index);
-                       }
-               }
+       /* wakeup a thread waiting on this queue */
+       if (kqf->kqf_state & (KQ_SLEEP | KQ_SEL)) {
+               kqf->kqf_state &= ~(KQ_SLEEP | KQ_SEL);
+               waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs, KQ_EVENT,
+                   THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
        }
-       kq_req_unlock(kqwq);
-}
 
-static void
-kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index)
-{
-       kq_req_lock(kqwl);
-       kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
-           override_index);
-       kq_req_unlock(kqwl);
+       /* wakeup other kqueues/select sets we're inside */
+       KNOTE(&kqf->kqf_sel.si_note, 0);
 }
 
-static thread_qos_t
-kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread)
+static struct kqtailq *
+knote_get_tailq(kqueue_t kqu, struct knote *kn)
 {
-       struct uthread *ut = get_bsdthread_info(thread);
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       kq_index_t ipc_override = ut->uu_kqueue_override;
-
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
-           thread_tid(thread), 0, 0);
-
-       kq_req_held(kqwl);
-       assert(ut->uu_kqr_bound == kqr);
-       ut->uu_kqr_bound = NULL;
-       ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
+       kq_index_t qos_index = kn->kn_qos_index;
 
-       if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
-               turnstile_update_inheritor(kqwl->kqwl_turnstile,
-                   TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
-               turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
-                   TURNSTILE_INTERLOCK_HELD);
+       if (kqu.kq->kq_state & KQ_WORKLOOP) {
+               assert(qos_index < KQWL_NBUCKETS);
+       } else if (kqu.kq->kq_state & KQ_WORKQ) {
+               assert(qos_index < KQWQ_NBUCKETS);
+       } else {
+               assert(qos_index == QOS_INDEX_KQFILE);
        }
-
-       kqr->kqr_thread = NULL;
-       kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
-       return ipc_override;
+       static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue),
+           "struct kqueue::kq_queue must be exactly at the end");
+       return &kqu.kq->kq_queue[qos_index];
 }
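
The static_assert above guards a layout trick: the queue array must begin
exactly where struct kqueue ends, so each variant (kqfile, kqworkq,
kqworkloop) can size the trailing array differently while sharing the same
indexing code. A standalone sketch of the idea using a C99 flexible array
member and invented types:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct q { int head; };
struct base {
        int      state;
        struct q queue[];    /* sized per variant, right past the base */
};

int
main(void)
{
        _Static_assert(offsetof(struct base, queue) == sizeof(struct base),
            "queue[] must start exactly at the end of the base struct");

        enum { NBUCKETS = 6 };    /* e.g. a workq-like variant */
        struct base *kq = malloc(sizeof(*kq) + NBUCKETS * sizeof(struct q));
        if (kq == NULL) {
                return 1;
        }
        kq->queue[5].head = 42;   /* index by qos bucket, as above */
        printf("%d\n", kq->queue[5].head);
        free(kq);
        return 0;
}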
 
-/*
- *     kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
- *
- *     It will acknowledge events, and possibly request a new thread if:
- *     - there were active events left
- *     - we pended waitq hook callouts during processing
- *     - we pended wakeups while processing (or unsuppressing)
- *
- *     Called with kqueue lock held.
- */
 static void
-kqworkloop_unbind(proc_t p, struct kqworkloop *kqwl)
+knote_enqueue(kqueue_t kqu, struct knote *kn, kn_status_t wakeup_mask)
 {
-       struct kqueue *kq = &kqwl->kqwl_kqueue;
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       thread_t thread = kqr->kqr_thread;
-       int op = KQWL_UTQ_PARKING;
-       kq_index_t ipc_override, qos_override = THREAD_QOS_UNSPECIFIED;
-
-       assert(thread == current_thread());
-
-       kqlock(kqwl);
+       kqlock_held(kqu);
 
-       /*
-        * Forcing the KQ_PROCESSING flag allows for QoS updates because of
-        * unsuppressing knotes not to be applied until the eventual call to
-        * kqworkloop_update_threads_qos() below.
-        */
-       assert((kq->kq_state & KQ_PROCESSING) == 0);
-       if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
-               kq->kq_state |= KQ_PROCESSING;
-               qos_override = kqworkloop_acknowledge_events(kqwl);
-               kq->kq_state &= ~KQ_PROCESSING;
+       if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
+               return;
        }
 
-       kq_req_lock(kqwl);
-
-       ipc_override = kqworkloop_unbind_locked(kqwl, thread);
-       kqworkloop_update_threads_qos(kqwl, op, qos_override);
-
-       kq_req_unlock(kqwl);
+       if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)) {
+               return;
+       }
 
-       kqunlock(kqwl);
+       if ((kn->kn_status & KN_QUEUED) == 0) {
+               struct kqtailq *queue = knote_get_tailq(kqu, kn);
 
-       /*
-        * Drop the override on the current thread last, after the call to
-        * kqworkloop_update_threads_qos above.
-        */
-       if (ipc_override) {
-               thread_drop_ipc_override(thread);
+               TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
+               kn->kn_status |= KN_QUEUED;
+               kqu.kq->kq_count++;
+       } else if ((kn->kn_status & KN_STAYACTIVE) == 0) {
+               return;
        }
 
-       /* If last reference, dealloc the workloop kq */
-       kqueue_release_last(p, kqwl);
+       if (kn->kn_status & wakeup_mask) {
+               if (kqu.kq->kq_state & KQ_WORKLOOP) {
+                       kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
+               } else if (kqu.kq->kq_state & KQ_WORKQ) {
+                       kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
+               } else {
+                       kqfile_wakeup(kqu.kqf, kn->kn_qos_index);
+               }
+       }
 }
 
-static thread_qos_t
-kqworkq_unbind_locked(__assert_only struct kqworkq *kqwq,
-    struct kqrequest *kqr, thread_t thread)
+__attribute__((always_inline))
+static inline void
+knote_dequeue(kqueue_t kqu, struct knote *kn)
 {
-       struct uthread *ut = get_bsdthread_info(thread);
-       kq_index_t old_override = kqr->kqr_override_index;
+       if (kn->kn_status & KN_QUEUED) {
+               struct kqtailq *queue = knote_get_tailq(kqu, kn);
 
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
-           thread_tid(kqr->kqr_thread), kqr->kqr_qos_index, 0);
-
-       kq_req_held(kqwq);
-       assert(ut->uu_kqr_bound == kqr);
-       ut->uu_kqr_bound = NULL;
-       kqr->kqr_thread = NULL;
-       kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
-       kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
+               // attaching the knote calls knote_reset_priority() without
+               // the kqlock, which is fine, so we can't assert kqlock_held()
+               // unless the knote is queued.
+               kqlock_held(kqu);
 
-       return old_override;
+               TAILQ_REMOVE(queue, kn, kn_tqe);
+               kn->kn_status &= ~KN_QUEUED;
+               kqu.kq->kq_count--;
+       }
 }
 
-/*
- *     kqworkq_unbind - unbind of a workq kqueue from a thread
- *
- *     We may have to request new threads.
- *     This can happen there are no waiting processing threads and:
- *     - there were active events we never got to (count > 0)
- *     - we pended waitq hook callouts during processing
- *     - we pended wakeups while processing (or unsuppressing)
- */
+/* called with kqueue lock held */
 static void
-kqworkq_unbind(proc_t p, struct kqrequest *kqr)
-{
-       struct kqworkq *kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
-       __assert_only int rc;
-
-       kqlock(kqwq);
-       rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
-       assert(rc == -1);
-       kqunlock(kqwq);
-}
-
-struct kqrequest *
-kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
+knote_suppress(kqueue_t kqu, struct knote *kn)
 {
-       assert(qos_index < KQWQ_NBUCKETS);
-       return &kqwq->kqwq_request[qos_index];
-}
+       struct kqtailq *suppressq;
 
-static void
-knote_apply_qos_override(struct knote *kn, kq_index_t qos_index)
-{
-       assert((kn->kn_status & KN_QUEUED) == 0);
+       kqlock_held(kqu);
 
-       kn->kn_qos_override = qos_index;
+       assert((kn->kn_status & KN_SUPPRESSED) == 0);
+       assert(kn->kn_status & KN_QUEUED);
 
-       if (kn->kn_status & KN_SUPPRESSED) {
-               struct kqueue *kq = knote_get_kq(kn);
-               /*
-                * For suppressed events, the kn_qos_index field cannot be touched as it
-                * allows us to know on which supress queue the knote is for a kqworkq.
-                *
-                * Also, there's no natural push applied on the kqueues when this field
-                * changes anyway. We hence need to apply manual overrides in this case,
-                * which will be cleared when the events are later acknowledged.
-                */
-               if (kq->kq_state & KQ_WORKQ) {
-                       kqworkq_update_override((struct kqworkq *)kq, kn, qos_index);
-               } else {
-                       kqworkloop_update_override((struct kqworkloop *)kq, qos_index);
-               }
-       } else {
-               kn->kn_qos_index = qos_index;
-       }
+       knote_dequeue(kqu, kn);
+       /* deactivate - so new activations indicate a wakeup */
+       kn->kn_status &= ~KN_ACTIVE;
+       kn->kn_status |= KN_SUPPRESSED;
+       suppressq = kqueue_get_suppressed_queue(kqu, kn);
+       TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
 }
 
-static bool
-knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn, int result,
-    thread_qos_t *qos_out)
+__attribute__((always_inline))
+static inline void
+knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
 {
-       thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;
+       struct kqtailq *suppressq;
 
-       kqlock_held(kq);
+       kqlock_held(kqu);
 
-       assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
-       assert(qos_index < THREAD_QOS_LAST);
+       assert(kn->kn_status & KN_SUPPRESSED);
 
-       /*
-        * Early exit for knotes that should not change QoS
-        *
-        * It is safe to test kn_req_index against MANAGER / STAYACTIVE because
-        * knotes with such kn_req_index values never change for their entire
-        * lifetime.
-        */
-       if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
-               panic("filter %d cannot change QoS", kn->kn_filtid);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               if (kn->kn_req_index == KQWL_BUCKET_STAYACTIVE) {
-                       return false;
-               }
-       } else if (kq->kq_state & KQ_WORKQ) {
-               if (kn->kn_req_index == KQWQ_QOS_MANAGER) {
-                       return false;
-               }
-       } else {
-               return false;
-       }
+       kn->kn_status &= ~KN_SUPPRESSED;
+       suppressq = kqueue_get_suppressed_queue(kqu, kn);
+       TAILQ_REMOVE(suppressq, kn, kn_tqe);
 
        /*
-        * knotes with the FALLBACK flag will only use their registration QoS if the
-        * incoming event has no QoS, else, the registration QoS acts as a floor.
+        * If the knote is no longer active, reset its push,
+        * and resynchronize kn_qos_index with kn_qos_override
+        * for knotes with a real qos.
         */
-       if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
-               if (qos_index == THREAD_QOS_UNSPECIFIED) {
-                       qos_index = kn->kn_req_index;
-               }
-       } else {
-               if (qos_index < kn->kn_req_index) {
-                       qos_index = kn->kn_req_index;
-               }
-       }
-       if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
-               /* Never lower QoS when in "Merge" mode */
-               return false;
+       if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
+               kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
        }
+       kn->kn_qos_index = kn->kn_qos_override;
+}
 
-       if ((kn->kn_status & KN_LOCKED) && kn->kn_inuse) {
-               /*
-                * When we're trying to update the QoS override and that both an
-                * f_event() and other f_* calls are running concurrently, any of these
-                * in flight calls may want to perform overrides that aren't properly
-                * serialized with each other.
-                *
-                * The first update that observes this racy situation enters a "Merge"
-                * mode which causes subsequent override requests to saturate the
-                * override instead of replacing its value.
-                *
-                * This mode is left when knote_unlock() or knote_call_filter_event()
-                * observe that no other f_* routine is in flight.
-                */
-               kn->kn_status |= KN_MERGE_QOS;
+/* called with kqueue lock held */
+static void
+knote_unsuppress(kqueue_t kqu, struct knote *kn)
+{
+       if (kn->kn_status & KN_SUPPRESSED) {
+               knote_unsuppress_noqueue(kqu, kn);
+
+               /* don't wakeup if unsuppressing just a stay-active knote */
+               knote_enqueue(kqu, kn, KN_ACTIVE);
        }
+}
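
A compact userspace model of the suppression state machine above, with
invented flag values: suppress parks the knote and clears the active bit so
a fresh activation reads as a new wakeup; unsuppress resynchronizes the
in-use QoS index with the override before requeueing:

#include <stdio.h>

#define ACTIVE     0x1    /* KN_ACTIVE stand-in */
#define SUPPRESSED 0x2    /* KN_SUPPRESSED stand-in */

struct note { unsigned status; int qos_index, qos_override; };

static void
suppress(struct note *n)
{
        n->status &= ~ACTIVE;     /* new activations now mean a wakeup */
        n->status |= SUPPRESSED;  /* parked on the suppress queue */
}

static void
unsuppress(struct note *n)
{
        n->status &= ~SUPPRESSED;
        n->qos_index = n->qos_override;   /* resync the in-use push */
}

int
main(void)
{
        struct note n = { .status = ACTIVE, .qos_index = 2, .qos_override = 4 };
        suppress(&n);
        unsuppress(&n);
        printf("status=%#x qos_index=%d\n", n.status, n.qos_index);  /* 0 4 */
        return 0;
}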
 
-       if (kn->kn_qos_override == qos_index) {
-               return false;
+__attribute__((always_inline))
+static inline void
+knote_mark_active(struct knote *kn)
+{
+       if ((kn->kn_status & KN_ACTIVE) == 0) {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
+                   kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
+                   kn->kn_filtid);
        }
 
-       *qos_out = qos_index;
-       return true;
+       kn->kn_status |= KN_ACTIVE;
 }
 
+/* called with kqueue lock held */
 static void
-knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
+knote_activate(kqueue_t kqu, struct knote *kn, int result)
 {
-       thread_qos_t qos;
-       if (knote_should_apply_qos_override(kq, kn, result, &qos)) {
-               knote_dequeue(kn);
-               knote_apply_qos_override(kn, qos);
-               if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
-                       knote_wakeup(kn);
-               }
+       assert(result & FILTER_ACTIVE);
+       if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
+               // may dequeue the knote
+               knote_adjust_qos(kqu.kq, kn, result);
        }
+       knote_mark_active(kn);
+       knote_enqueue(kqu, kn, KN_ACTIVE | KN_STAYACTIVE);
 }
 
+/*
+ * This function applies changes requested by f_attach or f_touch for
+ * a given filter. It proceeds in a carefully chosen order to help
+ * every single transition do the minimal amount of work possible.
+ */
 static void
-knote_wakeup(struct knote *kn)
+knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
+    int result)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-
-       kqlock_held(kq);
+       kn_status_t wakeup_mask = KN_ACTIVE;
 
-       if (kq->kq_state & KQ_WORKQ) {
-               struct kqworkq *kqwq = (struct kqworkq *)kq;
-
-               kqworkq_request_help(kqwq, kn->kn_qos_index);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+       if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
+               /*
+                * When a stayactive knote is reenabled, we may have missed wakeups
+                * while it was disabled, so we need to poll it. To do so, ask
+                * knote_enqueue() below to reenqueue it.
+                */
+               wakeup_mask |= KN_STAYACTIVE;
+               kn->kn_status &= ~KN_DISABLED;
 
                /*
-                * kqworkloop_end_processing() will perform the required QoS
-                * computations when it unsets the processing mode.
+                * it is possible for userland to have knotes registered for a given
+                * workloop `wl_orig` but really handled on another workloop `wl_new`.
+                *
+                * In that case, rearming will happen from the servicer thread of
+                * `wl_new`. If `wl_orig` is no longer being serviced, the knote
+                * would stay suppressed forever if we only relied on
+                * kqworkloop_acknowledge_events() being called by `wl_orig`.
+                *
+                * However, if we see the KQ_PROCESSING bit on `wl_orig` set, we
+                * can't unsuppress because that would mess with the processing
+                * phase of `wl_orig`; it also means kqworkloop_acknowledge_events()
+                * will be called.
                 */
-               if (!kqworkloop_is_processing_on_current_thread(kqwl)) {
-                       kqworkloop_request_help(kqwl, kn->kn_qos_index);
+               if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
+                       if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
+                               knote_unsuppress_noqueue(kqu, kn);
+                       }
                }
-       } else {
-               struct kqfile *kqf = (struct kqfile *)kq;
+       }
 
-               /* flag wakeups during processing */
-               if (kq->kq_state & KQ_PROCESSING) {
-                       kq->kq_state |= KQ_WAKEUP;
-               }
+       if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
+               // may dequeue the knote
+               knote_reset_priority(kqu, kn, kev->qos);
+       }
 
-               /* wakeup a thread waiting on this queue */
-               if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) {
-                       kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
-                       waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, KQ_EVENT,
-                           THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
-               }
+       /*
+        * The unsuppress above, or knote_reset_priority(), may have dequeued
+        * the knote. Now that we're done applying changes, restore the
+        * invariant that an active knote is queued.
+        */
+       if (result & FILTER_ACTIVE) {
+               knote_activate(kqu, kn, result);
+       } else {
+               knote_enqueue(kqu, kn, wakeup_mask);
+       }
 
-               /* wakeup other kqueues/select sets we're inside */
-               KNOTE(&kqf->kqf_sel.si_note, 0);
+       if ((result & FILTER_THREADREQ_NODEFEER) &&
+           act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
+               workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
        }
 }
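
For context, this touch path is what a userspace EV_ENABLE turns into once the change reaches the filter's f_touch. A minimal, hedged sketch against the public <sys/event.h> API (the fd choice and the lack of error handling are mine):

    #include <sys/event.h>
    #include <unistd.h>

    int
    main(void)
    {
        int kq = kqueue();
        struct kevent64_s kev;

        /* add the knote disabled... */
        EV_SET64(&kev, STDIN_FILENO, EVFILT_READ, EV_ADD | EV_DISABLE,
            0, 0, 0, 0, 0);
        kevent64(kq, &kev, 1, NULL, 0, 0, NULL);

        /* ...then re-enable it: EV_ENABLE is applied by knote_apply_touch() */
        EV_SET64(&kev, STDIN_FILENO, EVFILT_READ, EV_ENABLE, 0, 0, 0, 0, 0);
        kevent64(kq, &kev, 1, NULL, 0, 0, NULL);

        close(kq);
        return 0;
    }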
 
 /*
- * Called with the kqueue locked
+ * knote_drop - disconnect and drop the knote
+ *
+ * Called with the kqueue locked, returns with the kqueue unlocked.
+ *
+ * If a knote locking context is passed, it is canceled.
+ *
+ * The knote may have already been detached from
+ * (or not yet attached to) its source object.
  */
 static void
-kqueue_interrupt(struct kqueue *kq)
+knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
 {
-       assert((kq->kq_state & KQ_WORKQ) == 0);
+       struct proc *p = kq->kq_p;
 
-       /* wakeup sleeping threads */
-       if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0) {
-               kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
-               (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
-                   KQ_EVENT,
-                   THREAD_RESTART,
-                   WAITQ_ALL_PRIORITIES);
+       kqlock_held(kq);
+
+       assert((kn->kn_status & KN_DROPPING) == 0);
+       if (knlc == NULL) {
+               assert((kn->kn_status & KN_LOCKED) == 0);
        }
+       kn->kn_status |= KN_DROPPING;
 
-       /* wakeup threads waiting their turn to process */
-       if (kq->kq_state & KQ_PROCWAIT) {
-               struct kqtailq *suppressq;
+       if (kn->kn_status & KN_SUPPRESSED) {
+               knote_unsuppress_noqueue(kq, kn);
+       } else {
+               knote_dequeue(kq, kn);
+       }
+       knote_wait_for_post(kq, kn);
 
-               assert(kq->kq_state & KQ_PROCESSING);
+       knote_fops(kn)->f_detach(kn);
 
-               kq->kq_state &= ~KQ_PROCWAIT;
-               suppressq = kqueue_get_suppressed_queue(kq, NULL);
-               (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
-                   CAST_EVENT64_T(suppressq),
-                   THREAD_RESTART,
-                   WAITQ_ALL_PRIORITIES);
+       /* kq may be freed when kq_remove_knote() returns */
+       kq_remove_knote(kq, kn, p, knlc);
+       if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
+               fp_drop(p, kn->kn_id, kn->kn_fp, 0);
        }
+
+       knote_free(kn);
 }
 
-/*
- * Called back from waitq code when no threads waiting and the hook was set.
- *
- * Interrupts are likely disabled and spin locks are held - minimal work
- * can be done in this context!!!
- *
- * JMM - in the future, this will try to determine which knotes match the
- * wait queue wakeup and apply these wakeups against those knotes themselves.
- * For now, all the events dispatched this way are dispatch-manager handled,
- * so hard-code that for now.
- */
 void
-waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos)
+knote_init(void)
 {
-#pragma unused(knote_hook, qos)
+       knote_zone = zinit(sizeof(struct knote), 8192 * sizeof(struct knote),
+           8192, "knote zone");
+       zone_change(knote_zone, Z_CACHING_ENABLED, TRUE);
 
-       struct kqueue *kq = (struct kqueue *)kq_hook;
+       kqfile_zone = zinit(sizeof(struct kqfile), 8192 * sizeof(struct kqfile),
+           8192, "kqueue file zone");
 
-       if (kq->kq_state & KQ_WORKQ) {
-               struct kqworkq *kqwq = (struct kqworkq *)kq;
+       kqworkq_zone = zinit(sizeof(struct kqworkq), 8192 * sizeof(struct kqworkq),
+           8192, "kqueue workq zone");
 
-               kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+       kqworkloop_zone = zinit(sizeof(struct kqworkloop), 8192 * sizeof(struct kqworkloop),
+           8192, "kqueue workloop zone");
+       zone_change(kqworkloop_zone, Z_CACHING_ENABLED, TRUE);
 
-               kqworkloop_request_help(kqwl, KQWL_BUCKET_STAYACTIVE);
-       }
+       /* allocate kq lock group attribute and group */
+       kq_lck_grp_attr = lck_grp_attr_alloc_init();
+
+       kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr);
+
+       /* Allocate kq lock attribute */
+       kq_lck_attr = lck_attr_alloc_init();
+
+#if CONFIG_MEMORYSTATUS
+       /* Initialize the memorystatus list lock */
+       memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
+#endif
 }
+SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
 
-void
-klist_init(struct klist *list)
+const struct filterops *
+knote_fops(struct knote *kn)
 {
-       SLIST_INIT(list);
+       return sysfilt_ops[kn->kn_filtid];
 }
 
+static struct knote *
+knote_alloc(void)
+{
+       struct knote *kn = ((struct knote *)zalloc(knote_zone));
+       bzero(kn, sizeof(struct knote));
+       return kn;
+}
 
-/*
- * Query/Post each knote in the object's list
- *
- *     The object lock protects the list. It is assumed
- *     that the filter/event routine for the object can
- *     determine that the object is already locked (via
- *     the hint) and not deadlock itself.
- *
- *     The object lock should also hold off pending
- *     detach/drop operations.
- */
-void
-knote(struct klist *list, long hint)
+static void
+knote_free(struct knote *kn)
 {
-       struct knote *kn;
+       assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
+       zfree(knote_zone, kn);
+}
 
-       SLIST_FOREACH(kn, list, kn_selnext) {
-               struct kqueue *kq = knote_get_kq(kn);
-               kqlock(kq);
-               knote_call_filter_event(kq, kn, hint);
-               kqunlock(kq);
-       }
+#pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
+
+kevent_ctx_t
+kevent_get_context(thread_t thread)
+{
+       uthread_t ut = get_bsdthread_info(thread);
+       return &ut->uu_save.uus_kevent;
 }
 
-/*
- * attach a knote to the specified list.  Return true if this is the first entry.
- * The list is protected by whatever lock the object it is associated with uses.
- */
-int
-knote_attach(struct klist *list, struct knote *kn)
+static inline bool
+kevent_args_requesting_events(unsigned int flags, int nevents)
 {
-       int ret = SLIST_EMPTY(list);
-       SLIST_INSERT_HEAD(list, kn, kn_selnext);
-       return ret;
+       return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
 }
 
-/*
- * detach a knote from the specified list.  Return true if that was the last entry.
- * The list is protected by whatever lock the object it is associated with uses.
- */
-int
-knote_detach(struct klist *list, struct knote *kn)
+static inline int
+kevent_adjust_flags_for_proc(proc_t p, int flags)
 {
-       SLIST_REMOVE(list, kn, knote, kn_selnext);
-       return SLIST_EMPTY(list);
+       __builtin_assume(p);
+       return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
 }
 
-/*
- * knote_vanish - Indicate that the source has vanished
+/*!
+ * @function kevent_get_kqfile
  *
- * If the knote has requested EV_VANISHED delivery,
- * arrange for that. Otherwise, deliver a NOTE_REVOKE
- * event for backward compatibility.
+ * @brief
+ * Lookup a kqfile by fd.
  *
- * The knote is marked as having vanished, but is not
- * actually detached from the source in this instance.
- * The actual detach is deferred until the knote drop.
+ * @discussion
+ * Callers: kevent, kevent64, kevent_qos
  *
- * Our caller already has the object lock held. Calling
- * the detach routine would try to take that lock
- * recursively - which likely is not supported.
+ * This is not assumed to be a fastpath (kqfile interfaces are legacy)
  */
-void
-knote_vanish(struct klist *list, bool make_active)
+OS_NOINLINE
+static int
+kevent_get_kqfile(struct proc *p, int fd, int flags,
+    struct fileproc **fp, struct kqueue **kqp)
 {
-       struct knote *kn;
-       struct knote *kn_next;
+       int error = 0;
+       struct kqueue *kq;
 
-       SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
-               struct kqueue *kq = knote_get_kq(kn);
+       error = fp_getfkq(p, fd, fp, &kq);
+       if (__improbable(error)) {
+               return error;
+       }
 
+       uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
+       if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
                kqlock(kq);
-               if (__probable(kn->kn_status & KN_REQVANISH)) {
-                       /*
-                        * If EV_VANISH supported - prepare to deliver one
-                        */
-                       kn->kn_status |= KN_VANISHED;
-               } else {
-                       /*
-                        * Handle the legacy way to indicate that the port/portset was
-                        * deallocated or left the current Mach portspace (modern technique
-                        * is with an EV_VANISHED protocol).
-                        *
-                        * Deliver an EV_EOF event for these changes (hopefully it will get
-                        * delivered before the port name recycles to the same generation
-                        * count and someone tries to re-register a kevent for it or the
-                        * events are udata-specific - avoiding a conflict).
-                        */
-                       kn->kn_flags |= EV_EOF | EV_ONESHOT;
-               }
-               if (make_active) {
-                       knote_activate(kn);
+               kq_state = kq->kq_state;
+               if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
+                       if (flags & KEVENT_FLAG_LEGACY32) {
+                               kq_state |= KQ_KEV32;
+                       } else if (flags & KEVENT_FLAG_LEGACY64) {
+                               kq_state |= KQ_KEV64;
+                       } else {
+                               kq_state |= KQ_KEV_QOS;
+                       }
+                       kq->kq_state = kq_state;
                }
                kqunlock(kq);
        }
+
+       /*
+        * kqfiles can't be used through the legacy kevent()
+        * and other interfaces at the same time.
+        */
+       if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
+           (bool)(kq_state & KQ_KEV32))) {
+               fp_drop(p, fd, *fp, 0);
+               return EINVAL;
+       }
+
+       *kqp = kq;
+       return 0;
 }
 
-/*
- * Force a lazy allocation of the waitqset link
- * of the kq_wqs associated with the kn
- * if it wasn't already allocated.
+/*!
+ * @function kevent_get_kqwq
  *
- * This allows knote_link_waitq to never block
- * if reserved_link is not NULL.
+ * @brief
+ * Lookup or create the process kqwq (fastpath).
+ *
+ * @discussion
+ * Callers: kevent64, kevent_qos
  */
-void
-knote_link_waitqset_lazy_alloc(struct knote *kn)
+OS_ALWAYS_INLINE
+static int
+kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       waitq_set_lazy_init_link(&kq->kq_wqs);
-}
+       struct kqworkq *kqwq = p->p_fd->fd_wqkqueue;
 
-/*
- * Check if a lazy allocation for the waitqset link
- * of the kq_wqs is needed.
- */
-boolean_t
-knote_link_waitqset_should_lazy_alloc(struct knote *kn)
-{
-       struct kqueue *kq = knote_get_kq(kn);
-       return waitq_set_should_lazy_init_link(&kq->kq_wqs);
+       if (__improbable(kevent_args_requesting_events(flags, nevents))) {
+               return EINVAL;
+       }
+       if (__improbable(kqwq == NULL)) {
+               kqwq = kqworkq_alloc(p, flags);
+               if (__improbable(kqwq == NULL)) {
+                       return ENOMEM;
+               }
+       }
+
+       *kqp = &kqwq->kqwq_kqueue;
+       return 0;
 }
 
-/*
- * For a given knote, link a provided wait queue directly with the kqueue.
- * Wakeups will happen via recursive wait queue support.  But nothing will move
- * the knote to the active list at wakeup (nothing calls knote()).  Instead,
- * we permanently enqueue them here.
- *
- * kqueue and knote references are held by caller.
- * waitq locked by caller.
+#pragma mark kevent copyio
+
+/*!
+ * @function kevent_get_data_size
  *
- * caller provides the wait queue link structure and insures that the kq->kq_wqs
- * is linked by previously calling knote_link_waitqset_lazy_alloc.
+ * @brief
+ * Copies in the extra data size from user-space.
  */
-int
-knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
-{
-       struct kqueue *kq = knote_get_kq(kn);
-       kern_return_t kr;
-
-       kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
-       if (kr == KERN_SUCCESS) {
-               knote_markstayactive(kn);
-               return 0;
+static int
+kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
+    kevent_ctx_t kectx)
+{
+       if (!data_avail || !data_out) {
+               kectx->kec_data_size  = 0;
+               kectx->kec_data_resid = 0;
+       } else if (flags & KEVENT_FLAG_PROC64) {
+               user64_size_t usize = 0;
+               int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
+               if (__improbable(error)) {
+                       return error;
+               }
+               kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
        } else {
-               return EINVAL;
+               user32_size_t usize = 0;
+               int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
+               if (__improbable(error)) {
+                       return error;
+               }
+               kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
        }
+       kectx->kec_data_out   = data_out;
+       kectx->kec_data_avail = data_avail;
+       return 0;
 }
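
The size copied in here is the userspace "data available" word that the modern syscalls take alongside a data_out buffer. A hedged sketch of the calling convention, assuming the PRIVATE kevent_qos() declaration from xnu's bsd/sys/event.h (not part of the public SDK):

    #include <sys/event.h>

    int
    main(void)
    {
        int kq = kqueue();
        char buf[4096];
        size_t avail = sizeof(buf);       /* read by kevent_get_data_size() */
        struct kevent_qos_s ev[4];

        /* if any extra data is consumed, the residual size is written back
         * to &avail by kevent_put_data_size() on the way out */
        int n = kevent_qos(kq, NULL, 0, ev, 4, buf, &avail,
            KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE);
        (void)n;
        return 0;
    }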
 
-/*
- * Unlink the provided wait queue from the kqueue associated with a knote.
- * Also remove it from the magic list of directly attached knotes.
- *
- * Note that the unlink may have already happened from the other side, so
- * ignore any failures to unlink and just remove it from the kqueue list.
+/*!
+ * @function kevent_put_data_size
  *
- * On success, caller is responsible for the link structure
+ * @brief
+ * Copies out the residual data size to user-space if any has been used.
  */
-int
-knote_unlink_waitq(struct knote *kn, struct waitq *wq)
+static int
+kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       kern_return_t kr;
-
-       kr = waitq_unlink(wq, &kq->kq_wqs);
-       knote_clearstayactive(kn);
-       return (kr != KERN_SUCCESS) ? EINVAL : 0;
+       if (kectx->kec_data_resid == kectx->kec_data_size) {
+               return 0;
+       }
+       if (flags & KEVENT_FLAG_KERNEL) {
+               *(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
+               return 0;
+       }
+       if (flags & KEVENT_FLAG_PROC64) {
+               user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
+               return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
+       } else {
+               user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
+               return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
+       }
 }
 
-/*
- * remove all knotes referencing a specified fd
+/*!
+ * @function kevent_legacy_copyin
  *
- * Entered with the proc_fd lock already held.
- * It returns the same way, but may drop it temporarily.
+ * @brief
+ * Handles the copyin of a kevent/kevent64 event.
  */
-void
-knote_fdclose(struct proc *p, int fd)
+static int
+kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
 {
-       struct klist *list;
-       struct knote *kn;
-       KNOTE_LOCK_CTX(knlc);
+       int error;
 
-restart:
-       list = &p->p_fd->fd_knlist[fd];
-       SLIST_FOREACH(kn, list, kn_link) {
-               struct kqueue *kq = knote_get_kq(kn);
+       assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
 
-               kqlock(kq);
+       if (flags & KEVENT_FLAG_LEGACY64) {
+               struct kevent64_s kev64;
 
-               if (kq->kq_p != p) {
-                       panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
-                           __func__, kq->kq_p, p);
+               error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
+               if (__improbable(error)) {
+                       return error;
+               }
+               *addrp += sizeof(kev64);
+               *kevp = (struct kevent_qos_s){
+                       .ident  = kev64.ident,
+                       .filter = kev64.filter,
+                       /* Make sure user doesn't pass in any system flags */
+                       .flags  = kev64.flags & ~EV_SYSFLAGS,
+                       .udata  = kev64.udata,
+                       .fflags = kev64.fflags,
+                       .data   = kev64.data,
+                       .ext[0] = kev64.ext[0],
+                       .ext[1] = kev64.ext[1],
+               };
+       } else if (flags & KEVENT_FLAG_PROC64) {
+               struct user64_kevent kev64;
+
+               error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
+               if (__improbable(error)) {
+                       return error;
                }
+               *addrp += sizeof(kev64);
+               *kevp = (struct kevent_qos_s){
+                       .ident  = kev64.ident,
+                       .filter = kev64.filter,
+                       /* Make sure user doesn't pass in any system flags */
+                       .flags  = kev64.flags & ~EV_SYSFLAGS,
+                       .udata  = kev64.udata,
+                       .fflags = kev64.fflags,
+                       .data   = kev64.data,
+               };
+       } else {
+               struct user32_kevent kev32;
 
-               /*
-                * If the knote supports EV_VANISHED delivery,
-                * transition it to vanished mode (or skip over
-                * it if already vanished).
-                */
-               if (kn->kn_status & KN_VANISHED) {
-                       kqunlock(kq);
-                       continue;
+               error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
+               if (__improbable(error)) {
+                       return error;
                }
+               *addrp += sizeof(kev32);
+               *kevp = (struct kevent_qos_s){
+                       .ident  = (uintptr_t)kev32.ident,
+                       .filter = kev32.filter,
+                       /* Make sure user doesn't pass in any system flags */
+                       .flags  = kev32.flags & ~EV_SYSFLAGS,
+                       .udata  = CAST_USER_ADDR_T(kev32.udata),
+                       .fflags = kev32.fflags,
+                       .data   = (intptr_t)kev32.data,
+               };
+       }
 
-               proc_fdunlock(p);
-               if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
-                       /* the knote was dropped by someone, nothing to do */
-               } else if (kn->kn_status & KN_REQVANISH) {
-                       kn->kn_status |= KN_VANISHED;
-                       kn->kn_status &= ~KN_ATTACHED;
+       return 0;
+}
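
Note the asymmetric casts in the 32-bit branch above: ident widens as an unsigned value while data widens as a signed one. A standalone illustration of the difference (the values are hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint32_t ident32 = 0xffffffffu;   /* e.g. a user-supplied -1 */
        int32_t  data32  = -1;

        uintptr_t ident = (uintptr_t)ident32;   /* zero-extends */
        intptr_t  data  = (intptr_t)data32;     /* sign-extends */

        /* on LP64: ident=0xffffffff data=-1 */
        printf("ident=0x%lx data=%ld\n", (unsigned long)ident, (long)data);
        return 0;
    }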
 
-                       kqunlock(kq);
-                       knote_fops(kn)->f_detach(kn);
-                       if (knote_fops(kn)->f_isfd) {
-                               fp_drop(p, kn->kn_id, kn->kn_fp, 0);
-                       }
-                       kqlock(kq);
+/*!
+ * @function kevent_modern_copyin
+ *
+ * @brief
+ * Handles the copyin of a kevent_qos/kevent_id event.
+ */
+static int
+kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
+{
+       int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
+       if (__probable(!error)) {
+               /* Make sure user doesn't pass in any system flags */
+               *addrp += sizeof(struct kevent_qos_s);
+               kevp->flags &= ~EV_SYSFLAGS;
+       }
+       return error;
+}
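
The EV_SYSFLAGS scrub above means reserved kernel bits can never be smuggled in from userspace. A tiny self-contained illustration (flag values come from the public <sys/event.h>):

    #include <sys/event.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* EV_SYSFLAGS bits are reserved to the kernel; the copyin above
         * silently strips them from whatever userspace passes in */
        uint16_t flags = EV_ADD | EV_SYSFLAGS;
        printf("0x%x\n", (unsigned)(uint16_t)(flags & ~EV_SYSFLAGS)); /* 0x1 */
        return 0;
    }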
 
-                       knote_activate(kn);
-                       knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
-               } else {
-                       knote_drop(kq, kn, &knlc);
-               }
+/*!
+ * @function kevent_legacy_copyout
+ *
+ * @brief
+ * Handles the copyout of a kevent/kevent64 event.
+ */
+static int
+kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
+{
+       int advance;
+       int error;
 
-               proc_fdlock(p);
-               goto restart;
+       assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
+
+       /*
+        * fully initialize the different output event structure
+        * types from the internal kevent (and some universal
+        * defaults for fields not represented in the internal
+        * form).
+        *
+        * Note: these structures have no padding hence the C99
+        *       initializers below do not leak kernel info.
+        */
+       if (flags & KEVENT_FLAG_LEGACY64) {
+               struct kevent64_s kev64 = {
+                       .ident  = kevp->ident,
+                       .filter = kevp->filter,
+                       .flags  = kevp->flags,
+                       .fflags = kevp->fflags,
+                       .data   = (int64_t)kevp->data,
+                       .udata  = kevp->udata,
+                       .ext[0] = kevp->ext[0],
+                       .ext[1] = kevp->ext[1],
+               };
+               advance = sizeof(struct kevent64_s);
+               error = copyout((caddr_t)&kev64, *addrp, advance);
+       } else if (flags & KEVENT_FLAG_PROC64) {
+               /*
+                * deal with the special case of a user-supplied
+                * value of (uintptr_t)-1.
+                */
+               uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
+                   (uint64_t)-1LL : (uint64_t)kevp->ident;
+               struct user64_kevent kev64 = {
+                       .ident  = ident,
+                       .filter = kevp->filter,
+                       .flags  = kevp->flags,
+                       .fflags = kevp->fflags,
+                       .data   = (int64_t) kevp->data,
+                       .udata  = kevp->udata,
+               };
+               advance = sizeof(kev64);
+               error = copyout((caddr_t)&kev64, *addrp, advance);
+       } else {
+               struct user32_kevent kev32 = {
+                       .ident  = (uint32_t)kevp->ident,
+                       .filter = kevp->filter,
+                       .flags  = kevp->flags,
+                       .fflags = kevp->fflags,
+                       .data   = (int32_t)kevp->data,
+                       .udata  = kevp->udata,
+               };
+               advance = sizeof(kev32);
+               error = copyout((caddr_t)&kev32, *addrp, advance);
+       }
+       if (__probable(!error)) {
+               *addrp += advance;
        }
+       return error;
 }
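
The "no padding" remark is what makes these designated initializers leak-free, and it can be checked at compile time. A sketch (the 48-byte figure is my arithmetic over the public struct, not from the source):

    #include <sys/event.h>

    /* 8 (ident) + 2 + 2 (filter, flags) + 4 (fflags) + 8 (data)
     * + 8 (udata) + 16 (ext[2]) = 48 bytes, with no implicit padding */
    _Static_assert(sizeof(struct kevent64_s) == 48,
        "kevent64_s must stay padding-free for copyout to leak nothing");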
 
-/*
- * knote_fdfind - lookup a knote in the fd table for process
+/*!
+ * @function kevent_modern_copyout
  *
- * If the filter is file-based, lookup based on fd index.
- * Otherwise use a hash based on the ident.
+ * @brief
+ * Handles the copyout of a kevent_qos/kevent_id event.
+ */
+OS_ALWAYS_INLINE
+static inline int
+kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
+{
+       int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
+       if (__probable(!error)) {
+               *addrp += sizeof(struct kevent_qos_s);
+       }
+       return error;
+}
+
+#pragma mark kevent core implementation
+
+/*!
+ * @function kevent_callback_inline
  *
- * Matching is based on kq, filter, and ident. Optionally,
- * it may also be based on the udata field in the kevent -
- * allowing multiple event registration for the file object
- * per kqueue.
+ * @brief
+ * Callback for each individual event
  *
- * fd_knhashlock or fdlock held on entry (and exit)
+ * @discussion
+ * This is meant to be inlined in kevent_modern_callback and
+ * kevent_legacy_callback.
  */
-static struct knote *
-knote_fdfind(struct kqueue *kq,
-    struct kevent_internal_s *kev,
-    bool is_fd,
-    struct proc *p)
+OS_ALWAYS_INLINE
+static inline int
+kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
 {
-       struct filedesc *fdp = p->p_fd;
-       struct klist *list = NULL;
-       struct knote *kn = NULL;
+       int error;
+
+       assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);
 
        /*
-        * determine where to look for the knote
+        * Copy out the appropriate amount of event data for this user.
         */
-       if (is_fd) {
-               /* fd-based knotes are linked off the fd table */
-               if (kev->ident < (u_int)fdp->fd_knlistsize) {
-                       list = &fdp->fd_knlist[kev->ident];
-               }
-       } else if (fdp->fd_knhashmask != 0) {
-               /* hash non-fd knotes here too */
-               list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
+       if (legacy) {
+               error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
+                   kectx->kec_process_flags);
+       } else {
+               error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
        }
 
        /*
-        * scan the selected list looking for a match
+        * If there isn't space for additional events, return
+        * a harmless error to stop the processing here
         */
-       if (list != NULL) {
-               SLIST_FOREACH(kn, list, kn_link) {
-                       if (kq == knote_get_kq(kn) &&
-                           kev->ident == kn->kn_id &&
-                           kev->filter == kn->kn_filter) {
-                               if (kev->flags & EV_UDATA_SPECIFIC) {
-                                       if ((kn->kn_status & KN_UDATA_SPECIFIC) &&
-                                           kev->udata == kn->kn_udata) {
-                                               break; /* matching udata-specific knote */
-                                       }
-                               } else if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
-                                       break; /* matching non-udata-specific knote */
-                               }
-                       }
-               }
+       if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
+               error = EWOULDBLOCK;
        }
-       return kn;
+       return error;
 }
 
-/*
- * kq_add_knote- Add knote to the fd table for process
- * while checking for duplicates.
+/*!
+ * @function kevent_modern_callback
  *
- * All file-based filters associate a list of knotes by file
- * descriptor index. All other filters hash the knote by ident.
+ * @brief
+ * Callback for each individual modern event.
  *
- * May have to grow the table of knote lists to cover the
- * file descriptor index presented.
+ * @discussion
+ * This callback handles kevent_qos/kevent_id events.
+ */
+static int
+kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
+{
+       return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
+}
+
+/*!
+ * @function kevent_legacy_callback
  *
- * fd_knhashlock and fdlock unheld on entry (and exit).
+ * @brief
+ * Callback for each individual legacy event.
  *
- * Takes a rwlock boost if inserting the knote is successful.
+ * @discussion
+ * This callback handles kevent/kevent64 events.
  */
 static int
-kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
-    struct proc *p)
+kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
 {
-       struct filedesc *fdp = p->p_fd;
-       struct klist *list = NULL;
-       int ret = 0;
-       bool is_fd = knote_fops(kn)->f_isfd;
+       return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
+}
 
-       if (is_fd) {
-               proc_fdlock(p);
+/*!
+ * @function kevent_cleanup
+ *
+ * @brief
+ * Handles the cleanup returning from a kevent call.
+ *
+ * @discussion
+ * kevent entry points will take a reference on workloops,
+ * and a usecount on the fileglob of kqfiles.
+ *
+ * This function undoes this on the exit paths of kevents.
+ *
+ * @returns
+ * The error to return to userspace.
+ */
+static int
+kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
+{
+       // poll should not call any codepath leading to this
+       assert((flags & KEVENT_FLAG_POLL) == 0);
+
+       if (flags & KEVENT_FLAG_WORKLOOP) {
+               kqworkloop_release(kqu.kqwl);
+       } else if (flags & KEVENT_FLAG_WORKQ) {
+               /* nothing held */
        } else {
-               knhash_lock(p);
+               fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
        }
 
-       if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
-               /* found an existing knote: we can't add this one */
-               ret = ERESTART;
-               goto out_locked;
+       /* don't restart after signals... */
+       if (error == ERESTART) {
+               error = EINTR;
+       } else if (error == 0) {
+               /* don't abandon other output just because of residual copyout failures */
+               (void)kevent_put_data_size(flags, kectx);
        }
 
-       /* knote was not found: add it now */
-       if (!is_fd) {
-               if (fdp->fd_knhashmask == 0) {
-                       u_long size = 0;
-
-                       list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
-                       if (list == NULL) {
-                               ret = ENOMEM;
-                               goto out_locked;
-                       }
-
-                       fdp->fd_knhash = list;
-                       fdp->fd_knhashmask = size;
+       if (flags & KEVENT_FLAG_PARKING) {
+               thread_t th = current_thread();
+               struct uthread *uth = get_bsdthread_info(th);
+               if (uth->uu_kqr_bound) {
+                       thread_unfreeze_base_pri(th);
                }
+       }
+       return error;
+}
 
-               list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
-               SLIST_INSERT_HEAD(list, kn, kn_link);
-               ret = 0;
-               goto out_locked;
+/*!
+ * @function kqueue_process
+ *
+ * @brief
+ * Process the triggered events in a kqueue.
+ *
+ * @discussion
+ * Walk the queued knotes and validate that they are really still triggered
+ * events by calling the filter routines (if necessary).
+ *
+ * For each event that is still considered triggered, invoke the callback
+ * routine provided.
+ *
+ * caller holds a reference on the kqueue.
+ * kqueue locked on entry and exit - but may be dropped
+ * kqueue list locked (held for duration of call)
+ *
+ * This is only called by kqueue_scan() so that the compiler can inline it.
+ *
+ * @returns
+ * - 0:            no event was returned, no other error occurred
+ * - EBADF:        the kqueue is being destroyed (KQ_DRAIN is set)
+ * - EWOULDBLOCK:  (not an error) events have been found and we should return
+ * - EFAULT:       copyout failed
+ * - filter specific errors
+ */
+static int
+kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
+    kevent_callback_t callback)
+{
+       workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
+       struct knote *kn;
+       int error = 0, rc = 0;
+       struct kqtailq *base_queue, *queue;
+#if DEBUG || DEVELOPMENT
+       int retries = 64;
+#endif
+       uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
+
+       if (kq_type & KQ_WORKQ) {
+               rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
+       } else if (kq_type & KQ_WORKLOOP) {
+               rc = kqworkloop_begin_processing(kqu.kqwl, flags);
        } else {
-               /* knote is fd based */
+kqfile_retry:
+               rc = kqfile_begin_processing(kqu.kqf);
+               if (rc == EBADF) {
+                       return EBADF;
+               }
+       }
 
-               if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
-                       u_int size = 0;
+       if (rc == -1) {
+               /* Nothing to process */
+               return 0;
+       }
 
-                       if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
-                           || kn->kn_id >= (uint64_t)maxfiles) {
-                               ret = EINVAL;
-                               goto out_locked;
-                       }
-                       /* have to grow the fd_knlist */
-                       size = fdp->fd_knlistsize;
-                       while (size <= kn->kn_id) {
-                               size += KQEXTENT;
-                       }
+       /*
+        * loop through the enqueued knotes associated with this request,
+        * processing each one. Each request may have several queues
+        * of knotes to process (depending on the type of kqueue) so we
+        * have to loop through all the queues as long as we have additional
+        * space.
+        */
 
-                       if (size >= (UINT_MAX / sizeof(struct klist *))) {
-                               ret = EINVAL;
-                               goto out_locked;
-                       }
+process_again:
+       if (kq_type & KQ_WORKQ) {
+               base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index];
+       } else if (kq_type & KQ_WORKLOOP) {
+               base_queue = &kqu.kqwl->kqwl_queue[0];
+               queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
+       } else {
+               base_queue = queue = &kqu.kqf->kqf_queue;
+       }
 
-                       MALLOC(list, struct klist *,
-                           size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
-                       if (list == NULL) {
-                               ret = ENOMEM;
-                               goto out_locked;
+       do {
+               while ((kn = TAILQ_FIRST(queue)) != NULL) {
+                       error = knote_process(kn, kectx, callback);
+                       if (error == EJUSTRETURN) {
+                               error = 0;
+                       } else if (__improbable(error)) {
+                               /* error is EWOULDBLOCK when the out event array is full */
+                               goto stop_processing;
                        }
-
-                       bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
-                           fdp->fd_knlistsize * sizeof(struct klist *));
-                       bzero((caddr_t)list +
-                           fdp->fd_knlistsize * sizeof(struct klist *),
-                           (size - fdp->fd_knlistsize) * sizeof(struct klist *));
-                       FREE(fdp->fd_knlist, M_KQUEUE);
-                       fdp->fd_knlist = list;
-                       fdp->fd_knlistsize = size;
                }
+       } while (queue-- > base_queue);
 
-               list = &fdp->fd_knlist[kn->kn_id];
-               SLIST_INSERT_HEAD(list, kn, kn_link);
-               ret = 0;
-               goto out_locked;
+       if (kectx->kec_process_noutputs) {
+               /* callers will transform this into no error */
+               error = EWOULDBLOCK;
        }
 
-out_locked:
-       if (ret == 0) {
-               kqlock(kq);
-               assert((kn->kn_status & KN_LOCKED) == 0);
-               (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
+stop_processing:
+       /*
+        * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
+        * we want to unbind the kqrequest from the thread.
+        *
+        * However, because the kq locks are dropped several times during
+        * processing, new knotes may have fired again, in which case we want
+        * to fail the end processing and process again, until it converges.
+        *
+        * If we have an error or returned events, end processing never fails.
+        */
+       if (error) {
+               flags &= ~KEVENT_FLAG_PARKING;
        }
-       if (is_fd) {
-               proc_fdunlock(p);
+       if (kq_type & KQ_WORKQ) {
+               rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
+       } else if (kq_type & KQ_WORKLOOP) {
+               rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
        } else {
-               knhash_unlock(p);
+               rc = kqfile_end_processing(kqu.kqf);
        }
 
-       return ret;
+       if (__probable(error)) {
+               return error;
+       }
+
+       if (__probable(rc >= 0)) {
+               assert(rc == 0 || rc == EBADF);
+               return rc;
+       }
+
+#if DEBUG || DEVELOPMENT
+       if (retries-- == 0) {
+               panic("kevent: way too many knote_process retries, kq: %p (0x%04x)",
+                   kqu.kq, kqu.kq->kq_state);
+       }
+#endif
+       if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
+               assert(flags & KEVENT_FLAG_PARKING);
+               goto process_again;
+       } else {
+               goto kqfile_retry;
+       }
 }
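
The do/while above walks the queues by pointer, visiting the highest-index bucket first and base_queue last. A standalone sketch of the same decrement pattern (the array contents are hypothetical):

    #include <stdio.h>

    int
    main(void)
    {
        int buckets[6] = {0, 1, 2, 3, 4, 5};
        int *base_queue = &buckets[0];
        int *queue = &buckets[5];

        do {
            /* in kqueue_process() this is the TAILQ drain of one queue */
            printf("bucket %d\n", *queue);
        } while (queue-- > base_queue);   /* prints 5, 4, 3, 2, 1, 0 */
        return 0;
    }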
 
-/*
- * kq_remove_knote - remove a knote from the fd table for process
+/*!
+ * @function kqueue_scan_continue
  *
- * If the filter is file-based, remove based on fd index.
- * Otherwise remove from the hash based on the ident.
+ * @brief
+ * The continuation used by kqueue_scan for kevent entry points.
  *
- * fd_knhashlock and fdlock unheld on entry (and exit).
+ * @discussion
+ * Assumes we inherit a use/ref count on the kq or its fileglob.
+ *
+ * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
+ * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
  */
+OS_NORETURN OS_NOINLINE
 static void
-kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
-    struct knote_lock_ctx *knlc)
+kqueue_scan_continue(void *data, wait_result_t wait_result)
 {
-       struct filedesc *fdp = p->p_fd;
-       struct klist *list = NULL;
-       uint16_t kq_state;
-       bool is_fd;
+       uthread_t ut = current_uthread();
+       kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
+       int error = 0, flags = kectx->kec_process_flags;
+       struct kqueue *kq = data;
 
-       is_fd = knote_fops(kn)->f_isfd;
-
-       if (is_fd) {
-               proc_fdlock(p);
-       } else {
-               knhash_lock(p);
-       }
+       /*
+        * only kevent variants call in here, so we know the callback is
+        * kevent_legacy_callback or kevent_modern_callback.
+        */
+       assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);
 
-       if (is_fd) {
-               assert((u_int)fdp->fd_knlistsize > kn->kn_id);
-               list = &fdp->fd_knlist[kn->kn_id];
-       } else {
-               list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
+       switch (wait_result) {
+       case THREAD_AWAKENED:
+               if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
+                       error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
+               } else {
+                       error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
+               }
+               break;
+       case THREAD_TIMED_OUT:
+               error = 0;
+               break;
+       case THREAD_INTERRUPTED:
+               error = EINTR;
+               break;
+       case THREAD_RESTART:
+               error = EBADF;
+               break;
+       default:
+               panic("%s: - invalid wait_result (%d)", __func__, wait_result);
        }
-       SLIST_REMOVE(list, kn, knote, kn_link);
 
-       kqlock(kq);
-       kq_state = kq->kq_state;
-       if (knlc) {
-               knote_unlock_cancel(kq, kn, knlc, KNOTE_KQ_UNLOCK);
-       } else {
-               kqunlock(kq);
-       }
-       if (is_fd) {
-               proc_fdunlock(p);
-       } else {
-               knhash_unlock(p);
-       }
 
-       if (kq_state & KQ_DYNAMIC) {
-               kqueue_release_last(p, kq);
-       }
+       error = kevent_cleanup(kq, flags, error, kectx);
+       *(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
+       unix_syscall_return(error);
 }
 
-/*
- * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
- * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
+/*!
+ * @function kqueue_scan
  *
- * fd_knhashlock or fdlock unheld on entry (and exit)
+ * @brief
+ * Scan and wait for events in a kqueue (used by poll & kevent).
+ *
+ * @discussion
+ * Process the triggered events in a kqueue.
+ *
+ * If there are no events triggered, arrange to wait for them:
+ * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
+ * - possibly until kectx->kec_deadline expires
+ *
+ * When it waits, and neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL is
+ * set, it will wait in the kqueue_scan_continue continuation.
+ *
+ * poll() will block in place, and KEVENT_FLAG_KERNEL calls
+ * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
+ *
+ * @param kq
+ * The kqueue being scanned.
+ *
+ * @param flags
+ * The KEVENT_FLAG_* flags for this call.
+ *
+ * @param kectx
+ * The context used for this scan.
+ * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
+ *
+ * @param callback
+ * The callback to be called on events successfully processed.
+ * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
  */
-
-static struct knote *
-kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev,
-    bool is_fd, struct proc *p)
+int
+kqueue_scan(struct kqueue *kq, int flags, kevent_ctx_t kectx,
+    kevent_callback_t callback)
 {
-       struct knote * ret;
+       int error;
 
-       if (is_fd) {
-               proc_fdlock(p);
-       } else {
-               knhash_lock(p);
-       }
+       for (;;) {
+               kqlock(kq);
+               error = kqueue_process(kq, flags, kectx, callback);
 
-       ret = knote_fdfind(kq, kev, is_fd, p);
+               /*
+                * If we got an error, events returned (EWOULDBLOCK)
+                * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
+                * just return.
+                */
+               if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
+                       kqunlock(kq);
+                       return error == EWOULDBLOCK ? 0 : error;
+               }
 
-       if (ret) {
-               kqlock(kq);
-       }
+               waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
+                   KQ_EVENT, THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL,
+                   kectx->kec_deadline, TIMEOUT_NO_LEEWAY);
+               kq->kq_state |= KQ_SLEEP;
 
-       if (is_fd) {
-               proc_fdunlock(p);
-       } else {
-               knhash_unlock(p);
-       }
+               kqunlock(kq);
 
-       return ret;
+               if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
+                       thread_block_parameter(kqueue_scan_continue, kq);
+                       __builtin_unreachable();
+               }
+
+               wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
+               switch (wr) {
+               case THREAD_AWAKENED:
+                       break;
+               case THREAD_TIMED_OUT:
+                       return 0;
+               case THREAD_INTERRUPTED:
+                       return EINTR;
+               case THREAD_RESTART:
+                       return EBADF;
+               default:
+                       panic("%s: - bad wait_result (%d)", __func__, wr);
+               }
+       }
 }
-/*
- * knote_drop - disconnect and drop the knote
+
+/*!
+ * @function kevent_internal
  *
- * Called with the kqueue locked, returns with the kqueue unlocked.
+ * @brief
+ * Common kevent code.
  *
- * If a knote locking context is passed, it is canceled.
+ * @discussion
+ * Needs to be inlined to specialize for legacy or modern and
+ * eliminate dead code.
  *
- * The knote may have already been detached from
- * (or not yet attached to) its source object.
+ * This is the core logic of kevent entry points, that will:
+ * - register kevents
+ * - optionally scan the kqueue for events
+ *
+ * The caller is giving kevent_internal a reference on the kqueue
+ * or its fileproc that needs to be cleaned up by kevent_cleanup().
  */
-static void
-knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
+OS_ALWAYS_INLINE
+static inline int
+kevent_internal(kqueue_t kqu,
+    user_addr_t changelist, int nchanges,
+    user_addr_t ueventlist, int nevents,
+    int flags, kevent_ctx_t kectx, int32_t *retval,
+    bool legacy)
 {
-       struct proc *p = kq->kq_p;
+       int error = 0, noutputs = 0, register_rc;
 
-       kqlock_held(kq);
+       /* only bound threads can receive events on workloops */
+       if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
+#if CONFIG_WORKLOOP_DEBUG
+               UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
+                       .uu_kqid = kqu.kqwl->kqwl_dynamicid,
+                       .uu_kq = error ? NULL : kqu.kq,
+                       .uu_error = error,
+                       .uu_nchanges = nchanges,
+                       .uu_nevents = nevents,
+                       .uu_flags = flags,
+               });
+#endif // CONFIG_WORKLOOP_DEBUG
 
-       assert((kn->kn_status & KN_DROPPING) == 0);
-       if (knlc == NULL) {
-               assert((kn->kn_status & KN_LOCKED) == 0);
+               if (flags & KEVENT_FLAG_KERNEL) {
+                       /* see kevent_workq_internal */
+                       error = copyout(&kqu.kqwl->kqwl_dynamicid,
+                           ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
+                       kectx->kec_data_resid -= sizeof(kqueue_id_t);
+                       if (__improbable(error)) {
+                               goto out;
+                       }
+               }
+
+               if (kevent_args_requesting_events(flags, nevents)) {
+                       /*
+                        * Disable the R2K notification while doing a register, if the
+                        * caller wants events too, we don't want the AST to be set if we
+                        * will process these events soon.
+                        */
+                       kqlock(kqu);
+                       kqu.kq->kq_state &= ~KQ_R2K_ARMED;
+                       kqunlock(kqu);
+                       flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
+               }
        }
-       kn->kn_status |= KN_DROPPING;
 
-       knote_unsuppress(kn);
-       knote_dequeue(kn);
-       knote_wait_for_filter_events(kq, kn);
+       /* register all the change requests the user provided... */
+       while (nchanges > 0 && error == 0) {
+               struct kevent_qos_s kev;
+               struct knote *kn = NULL;
+
+               if (legacy) {
+                       error = kevent_legacy_copyin(&changelist, &kev, flags);
+               } else {
+                       error = kevent_modern_copyin(&changelist, &kev);
+               }
+               if (error) {
+                       break;
+               }
+
+               register_rc = kevent_register(kqu.kq, &kev, &kn);
+               if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
+                       thread_t thread = current_thread();
+
+                       kqlock_held(kqu);
+
+                       if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
+                               workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
+                       }
+
+                       // f_post_register_wait is meant to call a continuation and not to
+                       // return, which is why we don't support FILTER_REGISTER_WAIT if
+                       // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
+                       // waits isn't the last.
+                       //
+                       // It is implementable, but not used by any userspace code at the
+                       // moment, so for now return ENOTSUP if someone tries to do it.
+                       if (nchanges == 1 && noutputs < nevents &&
+                           (flags & KEVENT_FLAG_KERNEL) == 0 &&
+                           (flags & KEVENT_FLAG_PARKING) == 0 &&
+                           (flags & KEVENT_FLAG_ERROR_EVENTS) &&
+                           (flags & KEVENT_FLAG_WORKLOOP)) {
+                               uthread_t ut = get_bsdthread_info(thread);
+
+                               /*
+                                * store the continuation/completion data in the uthread
+                                *
+                                * Note: the kectx aliases with this,
+                                * and is destroyed in the process.
+                                */
+                               ut->uu_save.uus_kevent_register = (struct _kevent_register){
+                                       .kev        = kev,
+                                       .kqwl       = kqu.kqwl,
+                                       .eventout   = noutputs,
+                                       .ueventlist = ueventlist,
+                               };
+                               knote_fops(kn)->f_post_register_wait(ut, kn,
+                                   &ut->uu_save.uus_kevent_register);
+                               __builtin_unreachable();
+                       }
+                       kqunlock(kqu);
+
+                       kev.flags |= EV_ERROR;
+                       kev.data = ENOTSUP;
+               } else {
+                       assert((register_rc & FILTER_REGISTER_WAIT) == 0);
+               }
 
-       /* If we are attached, disconnect from the source first */
-       if (kn->kn_status & KN_ATTACHED) {
-               knote_fops(kn)->f_detach(kn);
+               // keep in sync with kevent_register_wait_return()
+               if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
+                       if ((kev.flags & EV_ERROR) == 0) {
+                               kev.flags |= EV_ERROR;
+                               kev.data = 0;
+                       }
+                       if (legacy) {
+                               error = kevent_legacy_copyout(&kev, &ueventlist, flags);
+                       } else {
+                               error = kevent_modern_copyout(&kev, &ueventlist);
+                       }
+                       if (error == 0) {
+                               noutputs++;
+                       }
+               } else if (kev.flags & EV_ERROR) {
+                       error = kev.data;
+               }
+               nchanges--;
        }
 
-       /* kq may be freed when kq_remove_knote() returns */
-       kq_remove_knote(kq, kn, p, knlc);
-       if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == 0)) {
-               fp_drop(p, kn->kn_id, kn->kn_fp, 0);
+       if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
+           nevents > 0 && noutputs == 0 && error == 0) {
+               kectx->kec_process_flags = flags;
+               kectx->kec_process_nevents = nevents;
+               kectx->kec_process_noutputs = 0;
+               kectx->kec_process_eventlist = ueventlist;
+
+               if (legacy) {
+                       error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
+               } else {
+                       error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
+               }
+
+               noutputs = kectx->kec_process_noutputs;
+       } else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
+               /*
+                * If we didn't go through kqworkloop_end_processing(),
+                * we need to do it here.
+                *
+                * kqueue_scan will call kqworkloop_end_processing(),
+                * so we only need to do it if we didn't scan.
+                */
+               kqlock(kqu);
+               kqworkloop_end_processing(kqu.kqwl, 0, 0);
+               kqunlock(kqu);
        }
 
-       knote_free(kn);
+       *retval = noutputs;
+out:
+       return kevent_cleanup(kqu.kq, flags, error, kectx);
 }
 
-/* called with kqueue lock held */
-static void
-knote_activate(struct knote *kn)
-{
-       if (kn->kn_status & KN_ACTIVE) {
-               return;
-       }
-
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
-           kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
-           kn->kn_filtid);
+#pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
 
-       kn->kn_status |= KN_ACTIVE;
-       if (knote_enqueue(kn)) {
-               knote_wakeup(kn);
-       }
+/*!
+ * @function kevent_modern_internal
+ *
+ * @brief
+ * The backend of the kevent_id and kevent_workq_internal entry points.
+ *
+ * @discussion
+ * kevent_internal() is inlined into this noinline wrapper so that the
+ * modern code path is specialized exactly once, despite the number of
+ * arguments.
+ */
+OS_NOINLINE
+static int
+kevent_modern_internal(kqueue_t kqu,
+    user_addr_t changelist, int nchanges,
+    user_addr_t ueventlist, int nevents,
+    int flags, kevent_ctx_t kectx, int32_t *retval)
+{
+       return kevent_internal(kqu.kq, changelist, nchanges,
+                  ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
 }
 
-/* called with kqueue lock held */
-static void
-knote_deactivate(struct knote *kn)
+/*!
+ * @function kevent_id
+ *
+ * @brief
+ * The kevent_id() syscall.
+ */
+int
+kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
 {
-       kn->kn_status &= ~KN_ACTIVE;
-       if ((kn->kn_status & KN_STAYACTIVE) == 0) {
-               knote_dequeue(kn);
+       int error, flags = uap->flags & KEVENT_FLAG_USER;
+       uthread_t uth = current_uthread();
+       workq_threadreq_t kqr = uth->uu_kqr_bound;
+       kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
+       kqueue_t kqu;
+
+       flags = kevent_adjust_flags_for_proc(p, flags);
+       flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;
+
+       if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
+           KEVENT_FLAG_WORKLOOP)) {
+               return EINVAL;
        }
-}
 
-/* called with kqueue lock held */
-static void
-knote_enable(struct knote *kn)
-{
-       if ((kn->kn_status & KN_DISABLED) == 0) {
-               return;
+       error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
+       if (__improbable(error)) {
+               return error;
        }
 
-       kn->kn_status &= ~KN_DISABLED;
+       kectx->kec_deadline = 0;
+       kectx->kec_fp       = NULL;
+       kectx->kec_fd       = -1;
+       /* the kec_process_* fields are only filled if kqueue_scan is called */
 
-       if (kn->kn_status & KN_SUPPRESSED) {
-               /*
-                * it is possible for userland to have knotes registered for a given
-                * workloop `wl_orig` but really handled on another workloop `wl_new`.
-                *
-                * In that case, rearming will happen from the servicer thread of
-                * `wl_new` which if `wl_orig` is no longer being serviced, would cause
-                * this knote to stay suppressed forever if we only relied on
-                * kqworkloop_acknowledge_events to be called by `wl_orig`.
-                *
-                * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
-                * unsuppress because that would mess with the processing phase of
-                * `wl_orig`, however it also means kqworkloop_acknowledge_events()
-                * will be called.
-                */
-               struct kqueue *kq = knote_get_kq(kn);
-               if ((kq->kq_state & KQ_PROCESSING) == 0) {
-                       knote_unsuppress(kn);
+       /*
+        * Get the kq we are going to be working on
+        * As a fastpath, look at the currently bound workloop.
+        */
+       kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
+       if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
+               if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
+                       return EEXIST;
+               }
+               kqworkloop_retain(kqu.kqwl);
+       } else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
+               return EXDEV;
+       } else {
+               error = kqworkloop_get_or_create(p, uap->id, NULL, flags, &kqu.kqwl);
+               if (__improbable(error)) {
+                       return error;
                }
-       } else if (knote_enqueue(kn)) {
-               knote_wakeup(kn);
        }
+
+       return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
+                  uap->eventlist, uap->nevents, flags, kectx, retval);
 }
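
For orientation, a hedged sketch of the private userspace declaration this syscall backs; the authoritative copy lives in the PRIVATE section of <sys/event.h>, so treat the details here as approximate:

    /* approximate private prototype for the kevent_id() syscall */
    int kevent_id(kqueue_id_t id,
        const struct kevent_qos_s *changelist, int nchanges,
        struct kevent_qos_s *eventlist, int nevents,
        void *data_out, size_t *data_available, unsigned int flags);
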
 
-/* called with kqueue lock held */
-static void
-knote_disable(struct knote *kn)
+/*!
+ * @function kevent_workq_internal
+ *
+ * @discussion
+ * This function is exported for the sake of the workqueue subsystem.
+ *
+ * It is called in two ways:
+ * - when a thread is about to go to userspace to ask for pending events
+ * - when a thread is returning from userspace, bringing events back
+ *
+ * The workqueue subsystem will only use the following flags:
+ * - KEVENT_FLAG_STACK_DATA (always)
+ * - KEVENT_FLAG_IMMEDIATE (always)
+ * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
+ *   userspace).
+ *
+ * It implicitly acts on the bound kqueue, and for the case of workloops
+ * will copy out the kqueue ID before anything else.
+ *
+ * Pthread will have set up the various arguments to fit this stack layout:
+ *
+ * +-------....----+--------------+-----------+--------------------+
+ * |  user stack   |  data avail  |  nevents  |   pthread_self()   |
+ * +-------....----+--------------+-----------+--------------------+
+ *                 ^              ^
+ *             data_out       eventlist
+ *
+ * When a workloop is used, the workloop ID is copied out right before
+ * the eventlist and is taken from the data buffer.
+ *
+ * @warning
+ * This function is carefully tailored to make no call except the final tail
+ * call into kevent_modern_internal(). (LTO inlines current_uthread().)
+ *
+ * This function is performance sensitive due to the workq subsystem.
+ */
+int
+kevent_workq_internal(struct proc *p,
+    user_addr_t changelist, int nchanges,
+    user_addr_t eventlist, int nevents,
+    user_addr_t data_out, user_size_t *data_available,
+    unsigned int flags, int32_t *retval)
 {
-       if (kn->kn_status & KN_DISABLED) {
-               return;
-       }
+       uthread_t uth = current_uthread();
+       workq_threadreq_t kqr = uth->uu_kqr_bound;
+       kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
+       kqueue_t kqu;
 
-       kn->kn_status |= KN_DISABLED;
-       knote_dequeue(kn);
-}
+       assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
+           flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));
 
-/* called with kqueue lock held */
-static void
-knote_suppress(struct knote *kn)
-{
-       struct kqtailq *suppressq;
-       struct kqueue *kq = knote_get_kq(kn);
+       kectx->kec_data_out   = data_out;
+       kectx->kec_data_avail = (uint64_t)data_available;
+       kectx->kec_data_size  = *data_available;
+       kectx->kec_data_resid = *data_available;
+       kectx->kec_deadline   = 0;
+       kectx->kec_fp         = NULL;
+       kectx->kec_fd         = -1;
+       /* the kec_process_* fields are only filled if kqueue_scan is called */
 
-       kqlock_held(kq);
+       flags = kevent_adjust_flags_for_proc(p, flags);
 
-       if (kn->kn_status & KN_SUPPRESSED) {
-               return;
+       if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
+               kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
+               kqworkloop_retain(kqu.kqwl);
+
+               flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
+                   KEVENT_FLAG_KERNEL;
+       } else {
+               kqu.kqwq = p->p_fd->fd_wqkqueue;
+
+               flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
        }
 
-       knote_dequeue(kn);
-       kn->kn_status |= KN_SUPPRESSED;
-       suppressq = kqueue_get_suppressed_queue(kq, kn);
-       TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
+       return kevent_modern_internal(kqu, changelist, nchanges,
+                  eventlist, nevents, flags, kectx, retval);
 }
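
To make the flag contract above concrete, a minimal sketch of a call as the workq subsystem might issue it; everything except the flag combination (which mirrors the assert) is hypothetical:

    /* hypothetical call site; flags mirror the assert in kevent_workq_internal */
    static int
    example_workq_scan(struct proc *p, user_addr_t eventlist, int nevents,
        user_addr_t data_out, user_size_t *availp)
    {
            int32_t nout = 0;
            /* the parking leg of the round-trip: with KEVENT_FLAG_PARKING the
             * thread may park in the kernel instead of returning when idle */
            int err = kevent_workq_internal(p, USER_ADDR_NULL, 0,
                eventlist, nevents, data_out, availp,
                KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING,
                &nout);
            return err ? err : nout;
    }
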
 
-/* called with kqueue lock held */
-static void
-knote_unsuppress(struct knote *kn)
+/*!
+ * @function kevent_qos
+ *
+ * @brief
+ * The kevent_qos() syscall.
+ */
+int
+kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
 {
-       struct kqtailq *suppressq;
-       struct kqueue *kq = knote_get_kq(kn);
-
-       kqlock_held(kq);
+       uthread_t uth = current_uthread();
+       kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
+       int error, flags = uap->flags & KEVENT_FLAG_USER;
+       struct kqueue *kq;
 
-       if ((kn->kn_status & KN_SUPPRESSED) == 0) {
-               return;
+       if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
+               return EINVAL;
        }
 
-       kn->kn_status &= ~KN_SUPPRESSED;
-       suppressq = kqueue_get_suppressed_queue(kq, kn);
-       TAILQ_REMOVE(suppressq, kn, kn_tqe);
-
-       /*
-        * If the knote is no longer active, reset its push,
-        * and resynchronize kn_qos_index with kn_qos_override
-        */
-       if ((kn->kn_status & KN_ACTIVE) == 0) {
-               kn->kn_qos_override = kn->kn_req_index;
-       }
-       kn->kn_qos_index = kn->kn_qos_override;
+       flags = kevent_adjust_flags_for_proc(p, flags);
 
-       /* don't wakeup if unsuppressing just a stay-active knote */
-       if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
-               knote_wakeup(kn);
+       error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
+       if (__improbable(error)) {
+               return error;
        }
 
-       if ((kq->kq_state & KQ_WORKLOOP) && TAILQ_EMPTY(suppressq)) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+       kectx->kec_deadline = 0;
+       kectx->kec_fp       = NULL;
+       kectx->kec_fd       = uap->fd;
+       /* the kec_process_* fields are only filled if kqueue_scan is called */
 
-               if (kqworkloop_is_processing_on_current_thread(kqwl)) {
-                       /*
-                        * kqworkloop_end_processing() or kqworkloop_begin_processing()
-                        * will perform the required QoS computations when it unsets the
-                        * processing mode.
-                        */
-               } else {
-                       kq_req_lock(kqwl);
-                       kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RESET_WAKEUP_OVERRIDE, 0);
-                       kq_req_unlock(kqwl);
-               }
+       /* get the kq we are going to be working on */
+       if (__probable(flags & KEVENT_FLAG_WORKQ)) {
+               error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
+       } else {
+               error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
        }
-}
-
-/* called with kqueue lock held */
-static int
-knote_enqueue(struct knote *kn)
-{
-       if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0 ||
-           (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING))) {
-               return 0;
+       if (__improbable(error)) {
+               return error;
        }
 
-       if ((kn->kn_status & KN_QUEUED) == 0) {
-               struct kqtailq *queue = knote_get_queue(kn);
-               struct kqueue *kq = knote_get_kq(kn);
-
-               kqlock_held(kq);
-               TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
-               kn->kn_status |= KN_QUEUED;
-               kq->kq_count++;
-               return 1;
-       }
-       return (kn->kn_status & KN_STAYACTIVE) != 0;
+       return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
+                  uap->eventlist, uap->nevents, flags, kectx, retval);
 }
 
+#pragma mark legacy syscalls: kevent, kevent64
 
-/* called with kqueue lock held */
-static void
-knote_dequeue(struct knote *kn)
+/*!
+ * @function kevent_legacy_get_deadline
+ *
+ * @brief
+ * Compute the deadline for the legacy kevent syscalls.
+ *
+ * @discussion
+ * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
+ * as that flag takes precedence over the deadline.
+ *
+ * This function will fail if utimeout is USER_ADDR_NULL
+ * (the caller should check).
+ */
+static int
+kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       struct kqtailq *queue;
-
-       kqlock_held(kq);
+       struct timespec ts;
 
-       if ((kn->kn_status & KN_QUEUED) == 0) {
-               return;
+       if (flags & KEVENT_FLAG_PROC64) {
+               struct user64_timespec ts64;
+               int error = copyin(utimeout, &ts64, sizeof(ts64));
+               if (__improbable(error)) {
+                       return error;
+               }
+               ts.tv_sec = ts64.tv_sec;
+               ts.tv_nsec = ts64.tv_nsec;
+       } else {
+               struct user32_timespec ts32;
+               int error = copyin(utimeout, &ts32, sizeof(ts32));
+               if (__improbable(error)) {
+                       return error;
+               }
+               ts.tv_sec = ts32.tv_sec;
+               ts.tv_nsec = ts32.tv_nsec;
+       }
+       if (!timespec_is_valid(&ts)) {
+               return EINVAL;
        }
 
-       queue = knote_get_queue(kn);
-       TAILQ_REMOVE(queue, kn, kn_tqe);
-       kn->kn_status &= ~KN_QUEUED;
-       kq->kq_count--;
+       clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
+       return 0;
 }
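
As a worked example of the conversion above, using the same two kernel primitives (the 1.5 s value is illustrative):

    /* sketch: a relative 1.5 s timespec becomes an absolute mach deadline */
    static uint64_t
    example_deadline(void)
    {
            struct timespec ts = { .tv_sec = 1, .tv_nsec = 500000000 };
            uint64_t deadline;

            /* tstoabstime() scales to mach absolute-time units;
             * clock_absolutetime_interval_to_deadline() adds mach_absolute_time() */
            clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &deadline);
            return deadline;
    }
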
 
-void
-knote_init(void)
+/*!
+ * @function kevent_legacy_internal
+ *
+ * @brief
+ * The core implementation for kevent and kevent64.
+ */
+OS_NOINLINE
+static int
+kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
+    int32_t *retval, int flags)
 {
-       knote_zone = zinit(sizeof(struct knote), 8192 * sizeof(struct knote),
-           8192, "knote zone");
-
-       kqfile_zone = zinit(sizeof(struct kqfile), 8192 * sizeof(struct kqfile),
-           8192, "kqueue file zone");
+       uthread_t uth = current_uthread();
+       kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
+       struct kqueue *kq;
+       int error;
 
-       kqworkq_zone = zinit(sizeof(struct kqworkq), 8192 * sizeof(struct kqworkq),
-           8192, "kqueue workq zone");
+       if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
+               return EINVAL;
+       }
 
-       kqworkloop_zone = zinit(sizeof(struct kqworkloop), 8192 * sizeof(struct kqworkloop),
-           8192, "kqueue workloop zone");
+       flags = kevent_adjust_flags_for_proc(p, flags);
 
-       /* allocate kq lock group attribute and group */
-       kq_lck_grp_attr = lck_grp_attr_alloc_init();
+       kectx->kec_data_out   = 0;
+       kectx->kec_data_avail = 0;
+       kectx->kec_data_size  = 0;
+       kectx->kec_data_resid = 0;
+       kectx->kec_deadline   = 0;
+       kectx->kec_fp         = NULL;
+       kectx->kec_fd         = uap->fd;
+       /* the kec_process_* fields are only filled if kqueue_scan is called */
 
-       kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr);
+       /* convert timeout to absolute - if we have one (and not immediate) */
+       if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
+               error = kevent_legacy_get_deadline(flags, uap->timeout,
+                   &kectx->kec_deadline);
+               if (__improbable(error)) {
+                       return error;
+               }
+       }
 
-       /* Allocate kq lock attribute */
-       kq_lck_attr = lck_attr_alloc_init();
+       /* get the kq we are going to be working on */
+       if (flags & KEVENT_FLAG_WORKQ) {
+               error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
+       } else {
+               error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
+       }
+       if (__improbable(error)) {
+               return error;
+       }
 
-#if CONFIG_MEMORYSTATUS
-       /* Initialize the memorystatus list lock */
-       memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
-#endif
+       return kevent_internal(kq, uap->changelist, uap->nchanges,
+                  uap->eventlist, uap->nevents, flags, kectx, retval,
+                  /*legacy*/ true);
 }
-SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
 
-const struct filterops *
-knote_fops(struct knote *kn)
+/*!
+ * @function kevent
+ *
+ * @brief
+ * The legacy kevent() syscall.
+ */
+int
+kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
 {
-       return sysfilt_ops[kn->kn_filtid];
-}
+       struct kevent64_args args = {
+               .fd         = uap->fd,
+               .changelist = uap->changelist,
+               .nchanges   = uap->nchanges,
+               .eventlist  = uap->eventlist,
+               .nevents    = uap->nevents,
+               .timeout    = uap->timeout,
+       };
 
-static struct knote *
-knote_alloc(void)
-{
-       struct knote *kn = ((struct knote *)zalloc(knote_zone));
-       bzero(kn, sizeof(struct knote));
-       return kn;
+       return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
 }
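
For reference, the standard userspace shape of the call wrapped above (public <sys/event.h> API; the fd argument stands in for any kevent-able descriptor):

    #include <sys/event.h>
    #include <unistd.h>

    /* wait up to 5 seconds for fd to become readable */
    int
    wait_readable(int fd)
    {
            int kq = kqueue();
            struct kevent ev;
            EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);   /* register */

            struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
            int n = kevent(kq, &ev, 1, &ev, 1, &ts);  /* 0 on timeout, 1 on event */
            close(kq);
            return n;
    }
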
 
-static void
-knote_free(struct knote *kn)
+/*!
+ * @function kevent64
+ *
+ * @brief
+ * The legacy kevent64() syscall.
+ */
+int
+kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
 {
-       assert(kn->kn_inuse == 0);
-       assert((kn->kn_status & KN_LOCKED) == 0);
-       zfree(knote_zone, kn);
+       int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
+       return kevent_legacy_internal(p, uap, retval, flags);
 }
 
+#pragma mark - socket interface
+
 #if SOCKETS
 #include <sys/param.h>
 #include <sys/socket.h>
@@ -8266,7 +8394,7 @@ kev_msg_post(struct kev_msg *event_msg)
         */
        if (event_msg->vendor_code < min_vendor ||
            event_msg->vendor_code > max_vendor) {
-               OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
+               os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
                return EINVAL;
        }
        return kev_post_msg(event_msg);
@@ -8293,13 +8421,13 @@ kev_post_msg(struct kev_msg *event_msg)
        }
 
        if (total_size > MLEN) {
-               OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig);
+               os_atomic_inc(&kevtstat.kes_toobig, relaxed);
                return EMSGSIZE;
        }
 
        m = m_get(M_WAIT, MT_DATA);
        if (m == 0) {
-               OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
+               os_atomic_inc(&kevtstat.kes_nomem, relaxed);
                return ENOMEM;
        }
        ev = mtod(m, struct kern_event_msg *);
@@ -8358,7 +8486,7 @@ kev_post_msg(struct kev_msg *event_msg)
 
                m2 = m_copym(m, 0, m->m_len, M_WAIT);
                if (m2 == 0) {
-                       OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
+                       os_atomic_inc(&kevtstat.kes_nomem, relaxed);
                        m_free(m);
                        lck_mtx_unlock(&ev_pcb->evp_mtx);
                        lck_rw_done(kev_rwlock);
@@ -8373,9 +8501,9 @@ kev_post_msg(struct kev_msg *event_msg)
                            1, m->m_len, MBUF_TC_BE);
 
                        sorwakeup(ev_pcb->evp_socket);
-                       OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted);
+                       os_atomic_inc(&kevtstat.kes_posted, relaxed);
                } else {
-                       OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock);
+                       os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
                }
                lck_mtx_unlock(&ev_pcb->evp_mtx);
        }
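
The replacement pattern used throughout these hunks, sketched in isolation; in this kernel tree the os_atomic_* macros come from <machine/atomic.h>, and the counter name here is made up:

    #include <machine/atomic.h>

    static uint64_t example_counter;

    static void
    bump_stat(void)
    {
            /* "relaxed": a statistics-only counter needs atomicity, not ordering */
            os_atomic_inc(&example_counter, relaxed);
    }
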
@@ -8590,29 +8718,28 @@ fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
 }
 
 static int
-fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi)
+fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
 {
-       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-       struct kqrequest *kqr = &kqwl->kqwl_request;
+       workq_threadreq_t kqr = &kqwl->kqwl_request;
        workq_threadreq_param_t trp = {};
        int err;
 
-       if ((kq->kq_state & KQ_WORKLOOP) == 0) {
+       if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
                return EINVAL;
        }
 
-       if ((err = fill_kqueueinfo(kq, &kqdi->kqdi_info))) {
+       if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
                return err;
        }
 
-       kq_req_lock(kqwl);
+       kqlock(kqwl);
 
-       kqdi->kqdi_servicer = thread_tid(kqr->kqr_thread);
+       kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
        kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
-       kqdi->kqdi_request_state = kqr->kqr_state;
-       kqdi->kqdi_async_qos = kqr->kqr_qos_index;
-       kqdi->kqdi_events_qos = kqr->kqr_override_index;
-       kqdi->kqdi_sync_waiters = kqr->kqr_dsync_waiters;
+       kqdi->kqdi_request_state = kqr->tr_state;
+       kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
+       kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
+       kqdi->kqdi_sync_waiters = 0;
        kqdi->kqdi_sync_waiter_qos = 0;
 
        trp.trp_value = kqwl->kqwl_params;
@@ -8634,7 +8761,7 @@ fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi)
                kqdi->kqdi_cpupercent = 0;
        }
 
-       kq_req_unlock(kqwl);
+       kqunlock(kqwl);
 
        return 0;
 }
@@ -8653,40 +8780,37 @@ knote_markstayactive(struct knote *kn)
         * Making a knote stay active is a property of the knote that must be
         * established before it is fully attached.
         */
-       assert(kn->kn_status & KN_ATTACHING);
        assert((kn->kn_status & (KN_QUEUED | KN_SUPPRESSED)) == 0);
 
        /* handle all stayactive knotes on the (appropriate) manager */
-       if (kq->kq_state & KQ_WORKQ) {
-               qos = KQWQ_QOS_MANAGER;
-       } else if (kq->kq_state & KQ_WORKLOOP) {
+       if (kq->kq_state & KQ_WORKLOOP) {
                struct kqworkloop *kqwl = (struct kqworkloop *)kq;
 
                qos = _pthread_priority_thread_qos(kn->kn_qos);
                assert(qos && qos < THREAD_QOS_LAST);
-               kq_req_lock(kq);
                kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, qos);
-               kq_req_unlock(kq);
                qos = KQWL_BUCKET_STAYACTIVE;
+       } else if (kq->kq_state & KQ_WORKQ) {
+               qos = KQWQ_QOS_MANAGER;
        } else {
                qos = THREAD_QOS_UNSPECIFIED;
        }
 
-       kn->kn_req_index = qos;
        kn->kn_qos_override = qos;
        kn->kn_qos_index = qos;
 
-       knote_activate(kn);
+       knote_activate(kq, kn, FILTER_ACTIVE);
        kqunlock(kq);
 }
 
 void
 knote_clearstayactive(struct knote *kn)
 {
-       kqlock(knote_get_kq(kn));
-       kn->kn_status &= ~KN_STAYACTIVE;
-       knote_deactivate(kn);
-       kqunlock(knote_get_kq(kn));
+       struct kqueue *kq = knote_get_kq(kn);
+       kqlock(kq);
+       kn->kn_status &= ~(KN_STAYACTIVE | KN_ACTIVE);
+       knote_dequeue(kq, kn);
+       kqunlock(kq);
 }
 
 static unsigned long
@@ -8697,26 +8821,22 @@ kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *
                if (kq == knote_get_kq(kn)) {
                        if (nknotes < buflen) {
                                struct kevent_extinfo *info = &buf[nknotes];
-                               struct kevent_internal_s *kevp = &kn->kn_kevent;
 
                                kqlock(kq);
 
-                               info->kqext_kev = (struct kevent_qos_s){
-                                       .ident = kevp->ident,
-                                       .filter = kevp->filter,
-                                       .flags = kevp->flags,
-                                       .fflags = kevp->fflags,
-                                       .data = (int64_t)kevp->data,
-                                       .udata = kevp->udata,
-                                       .ext[0] = kevp->ext[0],
-                                       .ext[1] = kevp->ext[1],
-                                       .ext[2] = kevp->ext[2],
-                                       .ext[3] = kevp->ext[3],
-                                       .qos = kn->kn_req_index,
-                               };
-                               info->kqext_sdata = kn->kn_sdata;
-                               info->kqext_status = kn->kn_status;
-                               info->kqext_sfflags = kn->kn_sfflags;
+                               info->kqext_kev         = *(struct kevent_qos_s *)&kn->kn_kevent;
+                               if (knote_has_qos(kn)) {
+                                       info->kqext_kev.qos =
+                                           _pthread_priority_thread_qos_fast(kn->kn_qos);
+                               } else {
+                                       info->kqext_kev.qos = kn->kn_qos_override;
+                               }
+                               info->kqext_kev.filter |= 0xff00; /* sign extend filter */
+                               info->kqext_kev.xflags  = 0; /* this is where sfflags lives */
+                               info->kqext_kev.data    = 0; /* this is where sdata lives */
+                               info->kqext_sdata       = kn->kn_sdata;
+                               info->kqext_status      = kn->kn_status;
+                               info->kqext_sfflags     = kn->kn_sfflags;
 
                                kqunlock(kq);
                        }
@@ -8763,13 +8883,13 @@ kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
                bzero(kq_ids, bufsize);
        }
 
-       kqhash_lock(p);
+       kqhash_lock(fdp);
 
        if (fdp->fd_kqhashmask > 0) {
                for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
                        struct kqworkloop *kqwl;
 
-                       SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
+                       LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
                                /* report the number of kqueues, even if they don't all fit */
                                if (nkqueues < buflen) {
                                        kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
@@ -8779,7 +8899,7 @@ kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
                }
        }
 
-       kqhash_unlock(p);
+       kqhash_unlock(fdp);
 
        if (kq_ids) {
                size_t copysize;
@@ -8808,7 +8928,7 @@ kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
     uint32_t ubufsize, int32_t *size_out)
 {
        proc_t p = (proc_t)proc;
-       struct kqueue *kq;
+       struct kqworkloop *kqwl;
        int err = 0;
        struct kqueue_dyninfo kqdi = { };
 
@@ -8818,14 +8938,10 @@ kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
                return ENOBUFS;
        }
 
-       kqhash_lock(p);
-       kq = kqueue_hash_lookup(p, kq_id);
-       if (!kq) {
-               kqhash_unlock(p);
+       kqwl = kqworkloop_hash_lookup_and_retain(p->p_fd, kq_id);
+       if (!kqwl) {
                return ESRCH;
        }
-       kqueue_retain(kq);
-       kqhash_unlock(p);
 
        /*
         * backward compatibility: allow the argument to this call to only be
         * the size of the smaller kqueue_info, not the full kqueue_dyninfo
         */
         */
        if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
                ubufsize = sizeof(struct kqueue_dyninfo);
-               err = fill_kqueue_dyninfo(kq, &kqdi);
+               err = fill_kqueue_dyninfo(kqwl, &kqdi);
        } else {
                ubufsize = sizeof(struct kqueue_info);
-               err = fill_kqueueinfo(kq, &kqdi.kqdi_info);
+               err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
        }
        if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
                *size_out = ubufsize;
        }
-       kqueue_release_last(p, kq);
+       kqworkloop_release(kqwl);
        return err;
 }
 
@@ -8850,22 +8966,16 @@ kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
     uint32_t ubufsize, int32_t *nknotes_out)
 {
        proc_t p = (proc_t)proc;
-       struct kqueue *kq;
+       struct kqworkloop *kqwl;
        int err;
 
-       assert(p != NULL);
-
-       kqhash_lock(p);
-       kq = kqueue_hash_lookup(p, kq_id);
-       if (!kq) {
-               kqhash_unlock(p);
+       kqwl = kqworkloop_hash_lookup_and_retain(p->p_fd, kq_id);
+       if (!kqwl) {
                return ESRCH;
        }
-       kqueue_retain(kq);
-       kqhash_unlock(p);
 
-       err = pid_kqueue_extinfo(p, kq, ubuf, ubufsize, nknotes_out);
-       kqueue_release_last(p, kq);
+       err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
+       kqworkloop_release(kqwl);
        return err;
 }
 
@@ -8900,10 +9010,10 @@ pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
 
        if (fdp->fd_knhashmask != 0) {
                for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
-                       kqhash_lock(p);
+                       knhash_lock(fdp);
                        kn = SLIST_FIRST(&fdp->fd_knhash[i]);
                        nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
-                       kqhash_unlock(p);
+                       knhash_unlock(fdp);
                }
        }
 
@@ -8926,15 +9036,14 @@ static unsigned int
 klist_copy_udata(struct klist *list, uint64_t *buf,
     unsigned int buflen, unsigned int nknotes)
 {
-       struct kevent_internal_s *kev;
        struct knote *kn;
        SLIST_FOREACH(kn, list, kn_link) {
                if (nknotes < buflen) {
-                       struct kqueue *kq = knote_get_kq(kn);
-                       kqlock(kq);
-                       kev = &(kn->kn_kevent);
-                       buf[nknotes] = kev->udata;
-                       kqunlock(kq);
+                       /*
+                        * kevent_register will always set kn_udata atomically
+                        * so that we don't have to take any kqlock here.
+                        */
+                       buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
                }
                /* we return total number of knotes, which may be more than requested */
                nknotes++;
@@ -8943,21 +9052,6 @@ klist_copy_udata(struct klist *list, uint64_t *buf,
        return nknotes;
 }
 
-static unsigned int
-kqlist_copy_dynamicids(__assert_only proc_t p, struct kqlist *list,
-    uint64_t *buf, unsigned int buflen, unsigned int nids)
-{
-       kqhash_lock_held(p);
-       struct kqworkloop *kqwl;
-       SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
-               if (nids < buflen) {
-                       buf[nids] = kqwl->kqwl_dynamicid;
-               }
-               nids++;
-       }
-       return nids;
-}
-
 int
 kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize)
 {
@@ -8965,6 +9059,7 @@ kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize)
        struct filedesc *fdp = p->p_fd;
        unsigned int nuptrs = 0;
        unsigned long buflen = bufsize / sizeof(uint64_t);
+       struct kqworkloop *kqwl;
 
        if (buflen > 0) {
                assert(buf != NULL);
@@ -8974,23 +9069,28 @@ kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize)
        for (int i = 0; i < fdp->fd_knlistsize; i++) {
                nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs);
        }
-       knhash_lock(p);
        proc_fdunlock(p);
+
+       knhash_lock(fdp);
        if (fdp->fd_knhashmask != 0) {
-               for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
+               for (size_t i = 0; i < fdp->fd_knhashmask + 1; i++) {
                        nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
                }
        }
-       knhash_unlock(p);
+       knhash_unlock(fdp);
 
-       kqhash_lock(p);
+       kqhash_lock(fdp);
        if (fdp->fd_kqhashmask != 0) {
-               for (int i = 0; i < (int)fdp->fd_kqhashmask + 1; i++) {
-                       nuptrs = kqlist_copy_dynamicids(p, &fdp->fd_kqhash[i], buf, buflen,
-                           nuptrs);
+               for (size_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
+                       LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
+                               if (nuptrs < buflen) {
+                                       buf[nuptrs] = kqwl->kqwl_dynamicid;
+                               }
+                               nuptrs++;
+                       }
                }
        }
-       kqhash_unlock(p);
+       kqhash_unlock(fdp);
 
        return (int)nuptrs;
 }
@@ -9068,9 +9168,9 @@ kevent_sysctl SYSCTL_HANDLER_ARGS
                return EFAULT;
        }
 
-       struct kqrequest *kqr = ut->uu_kqr_bound;
+       workq_threadreq_t kqr = ut->uu_kqr_bound;
        if (kqr) {
-               if (kqr->kqr_state & KQR_WORKLOOP) {
+               if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
                        bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
                } else {
                        bound_id = -1;
index fea51a17250fa9fab4a580963ef3e88012ab9bbb..03bcf7896c58579ee416cc8a483ce1beac28c0bb 100644 (file)
 
 #include <ipc/ipc_types.h>
 
+#include <mach/mach_param.h>
 #include <mach/mach_types.h>
 #include <mach/port.h>
 #include <mach/task.h>
 #include <security/mac_mach_internal.h>
 #endif
 
+#if CONFIG_ARCADE
+#include <kern/arcade.h>
+#endif
+
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_protos.h>
 
 #include <kdp/kdp_dyld.h>
 
+#include <machine/machine_routines.h>
 #include <machine/pal_routines.h>
 
 #include <pexpert/pexpert.h>
 
 extern boolean_t vm_darkwake_mode;
 
+extern int bootarg_execfailurereports; /* bsd_init.c */
+
 #if CONFIG_DTRACE
 /* Do not include dtrace.h, it redefines kmem_[alloc/free] */
 extern void dtrace_proc_exec(proc_t);
@@ -198,10 +206,12 @@ boolean_t thread_is_active(thread_t thread);
 void thread_copy_resource_info(thread_t dst_thread, thread_t src_thread);
 void *ipc_importance_exec_switch_task(task_t old_task, task_t new_task);
 extern void ipc_importance_release(void *elem);
+extern boolean_t task_has_watchports(task_t task);
 
 /*
  * Mach things for which prototypes are unavailable from Mach headers
  */
+#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1
 void            ipc_task_reset(
        task_t          task);
 void            ipc_thread_reset(
@@ -210,7 +220,10 @@ kern_return_t ipc_object_copyin(
        ipc_space_t             space,
        mach_port_name_t        name,
        mach_msg_type_name_t    msgt_name,
-       ipc_object_t            *objectp);
+       ipc_object_t            *objectp,
+       mach_port_context_t     context,
+       mach_msg_guard_flags_t  *guard_flags,
+       uint32_t                kmsg_flags);
 void ipc_port_release_send(ipc_port_t);
 
 #if DEVELOPMENT || DEBUG
@@ -265,6 +278,13 @@ SYSCTL_INT(_security_mac, OID_AUTO, platform_exec_logging, CTLFLAG_RW, &platform
 
 static os_log_t peLog = OS_LOG_DEFAULT;
 
+struct exec_port_actions {
+       uint32_t portwatch_count;
+       uint32_t registered_count;
+       ipc_port_t *portwatch_array;
+       ipc_port_t *registered_array;
+};
+
 struct image_params;    /* Forward */
 static int exec_activate_image(struct image_params *imgp);
 static int exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp);
@@ -282,9 +302,11 @@ static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size);
 static void exec_resettextvp(proc_t, struct image_params *);
 static int check_for_signature(proc_t, struct image_params *);
 static void exec_prefault_data(proc_t, struct image_params *, load_result_t *);
-static errno_t exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_present, ipc_port_t * portwatch_ports);
-static errno_t exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, uint64_t psa_darwin_role,
-    ipc_port_t * portwatch_ports, int portwatch_count);
+static errno_t exec_handle_port_actions(struct image_params *imgp,
+    struct exec_port_actions *port_actions);
+static errno_t exec_handle_spawnattr_policy(proc_t p, thread_t thread, int psa_apptype, uint64_t psa_qos_clamp,
+    uint64_t psa_darwin_role, struct exec_port_actions *port_actions);
+static void exec_port_actions_destroy(struct exec_port_actions *port_actions);
 
 /*
  * exec_add_user_string
@@ -689,6 +711,7 @@ exec_fat_imgact(struct image_params *imgp)
                        lret = fatfile_getbestarch_for_cputype(pref,
                            (vm_offset_t)fat_header,
                            PAGE_SIZE,
+                           imgp,
                            &fat_arch);
                        if (lret == LOAD_SUCCESS) {
                                goto use_arch;
@@ -704,6 +727,7 @@ regular_grading:
        /* Look up our preferred architecture in the fat file. */
        lret = fatfile_getbestarch((vm_offset_t)fat_header,
            PAGE_SIZE,
+           imgp,
            &fat_arch);
        if (lret != LOAD_SUCCESS) {
                error = load_return_to_errno(lret);
@@ -748,6 +772,7 @@ activate_exec_state(task_t task, proc_t p, thread_t thread, load_result_t *resul
        } else {
                OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag);
        }
+       task_set_mach_header_address(task, result->mach_header);
 
        ret = thread_state_initialize(thread);
        if (ret != KERN_SUCCESS) {
@@ -914,11 +939,34 @@ exec_mach_imgact(struct image_params *imgp)
                goto bad;
        }
 grade:
-       if (!grade_binary(imgp->ip_origcputype, imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK)) {
+       if (!grade_binary(imgp->ip_origcputype, imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK, TRUE)) {
+               error = EBADARCH;
+               goto bad;
+       }
+
+       if (validate_potential_simulator_binary(imgp->ip_origcputype, imgp,
+           imgp->ip_arch_offset, imgp->ip_arch_size) != LOAD_SUCCESS) {
+#if __x86_64__
+               const char *excpath;
+               error = exec_save_path(imgp, imgp->ip_user_fname, imgp->ip_seg, &excpath);
+               os_log_error(OS_LOG_DEFAULT, "Unsupported 32-bit executable: \"%s\"", (error) ? imgp->ip_vp->v_name : excpath);
+#endif
                error = EBADARCH;
                goto bad;
        }
 
+#if defined(HAS_APPLE_PAC)
+       assert(mach_header->cputype == CPU_TYPE_ARM64);
+
+       if ((mach_header->cputype == CPU_TYPE_ARM64 &&
+           (mach_header->cpusubtype & ~CPU_SUBTYPE_MASK) == CPU_SUBTYPE_ARM64E) &&
+           CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(mach_header->cpusubtype) == 0) {
+               imgp->ip_flags &= ~IMGPF_NOJOP;
+       } else {
+               imgp->ip_flags |= IMGPF_NOJOP;
+       }
+#endif
 
        /* Copy in arguments/environment from the old process */
        error = exec_extract_strings(imgp);
@@ -981,29 +1029,28 @@ grade:
                KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
                    p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO, 0, 0);
                if (lret == LOAD_BADMACHO_UPX) {
-                       /* set anything that might be useful in the crash report */
                        set_proc_name(imgp, p);
-
                        exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_UPX);
                        exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
-                       exec_failure_reason->osr_flags |= OS_REASON_FLAG_CONSISTENT_FAILURE;
-               } else if (lret == LOAD_BADARCH_X86) {
-                       /* set anything that might be useful in the crash report */
-                       set_proc_name(imgp, p);
-
-                       exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_NO32EXEC);
-                       exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
-                       exec_failure_reason->osr_flags |= OS_REASON_FLAG_CONSISTENT_FAILURE;
                } else {
                        exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO);
+
+                       if (bootarg_execfailurereports) {
+                               set_proc_name(imgp, p);
+                               exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
+                       }
                }
 
+               exec_failure_reason->osr_flags |= OS_REASON_FLAG_CONSISTENT_FAILURE;
+
                goto badtoolate;
        }
 
        proc_lock(p);
        p->p_cputype = imgp->ip_origcputype;
        p->p_cpusubtype = imgp->ip_origcpusubtype;
+       p->p_platform = load_result.ip_platform;
+       p->p_sdk = load_result.lr_sdk;
        proc_unlock(p);
 
        vm_map_set_user_wire_limit(map, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
@@ -1049,6 +1096,19 @@ grade:
         */
        int cpu_subtype;
        cpu_subtype = 0; /* all cpu_subtypes use the same shared region */
+#if defined(HAS_APPLE_PAC)
+       if (cpu_type() == CPU_TYPE_ARM64 &&
+           (p->p_cpusubtype & ~CPU_SUBTYPE_MASK) == CPU_SUBTYPE_ARM64E) {
+               assertf(p->p_cputype == CPU_TYPE_ARM64,
+                   "p %p cpu_type() 0x%x p->p_cputype 0x%x p->p_cpusubtype 0x%x",
+                   p, cpu_type(), p->p_cputype, p->p_cpusubtype);
+               /*
+                * arm64e uses pointer authentication, so request a separate
+                * shared region for this CPU subtype.
+                */
+               cpu_subtype = p->p_cpusubtype & ~CPU_SUBTYPE_MASK;
+       }
+#endif /* HAS_APPLE_PAC */
        vm_map_exec(map, task, load_result.is_64bit_addr, (void *)p->p_fd->fd_rdir, cpu_type(), cpu_subtype);
 
        /*
@@ -1065,7 +1125,13 @@ grade:
 
                KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
                    p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_SUGID_FAILURE, 0, 0);
+
                exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_SUGID_FAILURE);
+               if (bootarg_execfailurereports) {
+                       set_proc_name(imgp, p);
+                       exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
+               }
+
                goto badtoolate;
        }
 
@@ -1089,7 +1155,13 @@ grade:
        if (lret != KERN_SUCCESS) {
                KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
                    p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_ACTV_THREADSTATE, 0, 0);
+
                exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_ACTV_THREADSTATE);
+               if (bootarg_execfailurereports) {
+                       set_proc_name(imgp, p);
+                       exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
+               }
+
                goto badtoolate;
        }
 
@@ -1113,7 +1185,13 @@ grade:
 
                KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
                    p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_STACK_ALLOC, 0, 0);
+
                exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_STACK_ALLOC);
+               if (bootarg_execfailurereports) {
+                       set_proc_name(imgp, p);
+                       exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
+               }
+
                goto badtoolate;
        }
 
@@ -1121,7 +1199,12 @@ grade:
        if (error) {
                KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
                    p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_APPLE_STRING_INIT, 0, 0);
+
                exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_APPLE_STRING_INIT);
+               if (bootarg_execfailurereports) {
+                       set_proc_name(imgp, p);
+                       exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
+               }
                goto badtoolate;
        }
 
@@ -1142,7 +1225,12 @@ grade:
 
                        KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
                            p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_STRINGS, 0, 0);
+
                        exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_STRINGS);
+                       if (bootarg_execfailurereports) {
+                               set_proc_name(imgp, p);
+                               exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
+                       }
                        goto badtoolate;
                }
                /* Set the stack */
@@ -1162,7 +1250,12 @@ grade:
 
                        KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
                            p->p_pid, OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_DYNLINKER, 0, 0);
+
                        exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_DYNLINKER);
+                       if (bootarg_execfailurereports) {
+                               set_proc_name(imgp, p);
+                               exec_failure_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
+                       }
                        goto badtoolate;
                }
                task_set_dyld_info(task, load_result.all_image_info_addr,
@@ -1174,9 +1267,6 @@ grade:
 
        vm_map_switch(old_map);
 
-       /* Stop profiling */
-       stopprofclock(p);
-
        /*
         * Reset signal state.
         */
@@ -1226,11 +1316,7 @@ grade:
 
 #if __arm64__
        if (load_result.legacy_footprint) {
-#if DEVELOPMENT || DEBUG
-               printf("%s: %d[%s] legacy footprint (mach-o)\n",
-                   __FUNCTION__, p->p_pid, p->p_name);
-#endif /* DEVELOPMENT || DEBUG */
-               task_set_legacy_footprint(task, TRUE);
+               task_set_legacy_footprint(task);
        }
 #endif /* __arm64__ */
 
@@ -1382,9 +1468,9 @@ bad:
  * XXX hardcoded, for now; should use linker sets
  */
 struct execsw {
-       int (*ex_imgact)(struct image_params *);
+       int (*const ex_imgact)(struct image_params *);
        const char *ex_name;
-} execsw[] = {
+} const execsw[] = {
        { exec_mach_imgact, "Mach-o Binary" },
        { exec_fat_imgact, "Fat Binary" },
        { exec_shell_imgact, "Interpreter Script" },
@@ -1597,6 +1683,30 @@ bad_notrans:
        return error;
 }
 
+/*
+ * exec_validate_spawnattr_policy
+ *
+ * Description: Validates the entitlements required to set the apptype.
+ *
+ * Parameters:  int psa_apptype         posix spawn attribute apptype
+ *
+ * Returns:     0                       Success
+ *              EPERM                   Failure
+ */
+static errno_t
+exec_validate_spawnattr_policy(int psa_apptype)
+{
+       if ((psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK) != 0) {
+               int proctype = psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK;
+               if (proctype == POSIX_SPAWN_PROC_TYPE_DRIVER) {
+                       if (!IOTaskHasEntitlement(current_task(), POSIX_SPAWN_ENTITLEMENT_DRIVER)) {
+                               return EPERM;
+                       }
+               }
+       }
+
+       return 0;
+}
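
A hedged sketch of the userspace side that this check gates; posix_spawnattr_setprocesstype_np() and POSIX_SPAWN_PROC_TYPE_DRIVER are private spawn interfaces, so the exact spellings are assumptions:

    posix_spawnattr_t attr;
    posix_spawnattr_init(&attr);
    /* only tasks holding POSIX_SPAWN_ENTITLEMENT_DRIVER pass the check above */
    posix_spawnattr_setprocesstype_np(&attr, POSIX_SPAWN_PROC_TYPE_DRIVER);
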
 
 /*
  * exec_handle_spawnattr_policy
@@ -1609,8 +1719,8 @@ bad_notrans:
  * Returns:     0                       Success
  */
 static errno_t
-exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, uint64_t psa_darwin_role,
-    ipc_port_t * portwatch_ports, int portwatch_count)
+exec_handle_spawnattr_policy(proc_t p, thread_t thread, int psa_apptype, uint64_t psa_qos_clamp,
+    uint64_t psa_darwin_role, struct exec_port_actions *port_actions)
 {
        int apptype     = TASK_APPTYPE_NONE;
        int qos_clamp   = THREAD_QOS_UNSPECIFIED;
@@ -1640,6 +1750,9 @@ exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp,
                        apptype = TASK_APPTYPE_APP_TAL;
                        break;
 #endif /* !CONFIG_EMBEDDED */
+               case POSIX_SPAWN_PROC_TYPE_DRIVER:
+                       apptype = TASK_APPTYPE_DRIVER;
+                       break;
                default:
                        apptype = TASK_APPTYPE_NONE;
                        /* TODO: Should an invalid value here fail the spawn? */
@@ -1671,14 +1784,50 @@ exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp,
 
        if (apptype != TASK_APPTYPE_NONE ||
            qos_clamp != THREAD_QOS_UNSPECIFIED ||
-           role != TASK_UNSPECIFIED) {
-               proc_set_task_spawnpolicy(p->task, apptype, qos_clamp, role,
-                   portwatch_ports, portwatch_count);
+           role != TASK_UNSPECIFIED ||
+           port_actions->portwatch_count) {
+               proc_set_task_spawnpolicy(p->task, thread, apptype, qos_clamp, role,
+                   port_actions->portwatch_array, port_actions->portwatch_count);
+       }
+
+       if (port_actions->registered_count) {
+               if (mach_ports_register(p->task, port_actions->registered_array,
+                   port_actions->registered_count)) {
+                       return EINVAL;
+               }
+               /* mach_ports_register() consumed the array */
+               port_actions->registered_array = NULL;
+               port_actions->registered_count = 0;
        }
 
        return 0;
 }
 
+static void
+exec_port_actions_destroy(struct exec_port_actions *port_actions)
+{
+       if (port_actions->portwatch_array) {
+               for (uint32_t i = 0; i < port_actions->portwatch_count; i++) {
+                       ipc_port_t port = NULL;
+                       if ((port = port_actions->portwatch_array[i]) != NULL) {
+                               ipc_port_release_send(port);
+                       }
+               }
+               kfree(port_actions->portwatch_array,
+                   port_actions->portwatch_count * sizeof(ipc_port_t *));
+       }
+
+       if (port_actions->registered_array) {
+               for (uint32_t i = 0; i < port_actions->registered_count; i++) {
+                       ipc_port_t port = NULL;
+                       if ((port = port_actions->registered_array[i]) != NULL) {
+                               ipc_port_release_send(port);
+                       }
+               }
+               kfree(port_actions->registered_array,
+                   port_actions->registered_count * sizeof(ipc_port_t *));
+       }
+}
 
 /*
  * exec_handle_port_actions
@@ -1694,8 +1843,8 @@ exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp,
  *              ENOTSUP                        Illegal posix_spawn attr flag was set
  */
 static errno_t
-exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_present,
-    ipc_port_t * portwatch_ports)
+exec_handle_port_actions(struct image_params *imgp,
+    struct exec_port_actions *actions)
 {
        _posix_spawn_port_actions_t pacts = imgp->ip_px_spa;
 #if CONFIG_AUDIT
@@ -1705,10 +1854,64 @@ exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_presen
        task_t task = get_threadtask(imgp->ip_new_thread);
        ipc_port_t port = NULL;
        errno_t ret = 0;
-       int i;
+       int i, portwatch_i = 0, registered_i = 0;
        kern_return_t kr;
+       boolean_t task_has_watchport_boost = task_has_watchports(current_task());
+       boolean_t in_exec = (imgp->ip_flags & IMGPF_EXEC);
+
+       for (i = 0; i < pacts->pspa_count; i++) {
+               act = &pacts->pspa_actions[i];
+
+               switch (act->port_type) {
+               case PSPA_SPECIAL:
+               case PSPA_EXCEPTION:
+#if CONFIG_AUDIT
+               case PSPA_AU_SESSION:
+#endif
+                       break;
+               case PSPA_IMP_WATCHPORTS:
+                       if (++actions->portwatch_count > TASK_MAX_WATCHPORT_COUNT) {
+                               ret = EINVAL;
+                               goto done;
+                       }
+                       break;
+               case PSPA_REGISTERED_PORTS:
+                       if (++actions->registered_count > TASK_PORT_REGISTER_MAX) {
+                               ret = EINVAL;
+                               goto done;
+                       }
+                       break;
+               default:
+                       ret = EINVAL;
+                       goto done;
+               }
+       }
+
+       if (actions->portwatch_count) {
+               if (in_exec && task_has_watchport_boost) {
+                       ret = EINVAL;
+                       goto done;
+               }
+               actions->portwatch_array =
+                   kalloc(sizeof(ipc_port_t *) * actions->portwatch_count);
+               if (actions->portwatch_array == NULL) {
+                       ret = ENOMEM;
+                       goto done;
+               }
+               bzero(actions->portwatch_array,
+                   sizeof(ipc_port_t *) * actions->portwatch_count);
+       }
 
-       *portwatch_present = FALSE;
+       if (actions->registered_count) {
+               actions->registered_array =
+                   kalloc(sizeof(ipc_port_t *) * actions->registered_count);
+               if (actions->registered_array == NULL) {
+                       ret = ENOMEM;
+                       goto done;
+               }
+               bzero(actions->registered_array,
+                   sizeof(ipc_port_t *) * actions->registered_count);
+       }
 
        for (i = 0; i < pacts->pspa_count; i++) {
                act = &pacts->pspa_actions[i];
@@ -1716,7 +1919,7 @@ exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_presen
                if (MACH_PORT_VALID(act->new_port)) {
                        kr = ipc_object_copyin(get_task_ipcspace(current_task()),
                            act->new_port, MACH_MSG_TYPE_COPY_SEND,
-                           (ipc_object_t *) &port);
+                           (ipc_object_t *) &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
 
                        if (kr != KERN_SUCCESS) {
                                ret = EINVAL;
@@ -1754,14 +1957,16 @@ exec_handle_port_actions(struct image_params *imgp, boolean_t * portwatch_presen
                        break;
 #endif
                case PSPA_IMP_WATCHPORTS:
-                       if (portwatch_ports != NULL && IPC_PORT_VALID(port)) {
-                               *portwatch_present = TRUE;
+                       if (actions->portwatch_array) {
                                /* hold on to this till end of spawn */
-                               portwatch_ports[i] = port;
+                               actions->portwatch_array[portwatch_i++] = port;
                        } else {
                                ipc_port_release_send(port);
                        }
-
+                       break;
+               case PSPA_REGISTERED_PORTS:
+                       /* hold on to this till end of spawn */
+                       actions->registered_array[registered_i++] = port;
                        break;
                default:
                        ret = EINVAL;
@@ -1900,7 +2105,7 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags)
                        struct dup2_args dup2a;
 
                        dup2a.from = psfa->psfaa_filedes;
-                       dup2a.to = psfa->psfaa_openargs.psfao_oflag;
+                       dup2a.to = psfa->psfaa_dup2args.psfad_newfiledes;
 
                        /*
                         * The dup2() system call implementation sets
@@ -1912,6 +2117,47 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags)
                }
                break;
 
+               case PSFA_FILEPORT_DUP2: {
+                       ipc_port_t port;
+                       kern_return_t kr;
+                       struct dup2_args dup2a;
+                       struct close_nocancel_args ca;
+
+                       if (!MACH_PORT_VALID(psfa->psfaa_fileport)) {
+                               error = EINVAL;
+                               break;
+                       }
+
+                       kr = ipc_object_copyin(get_task_ipcspace(current_task()),
+                           psfa->psfaa_fileport, MACH_MSG_TYPE_COPY_SEND,
+                           (ipc_object_t *) &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
+
+                       if (kr != KERN_SUCCESS) {
+                               error = EINVAL;
+                               break;
+                       }
+
+                       error = fileport_makefd_internal(p, port, 0, ival);
+
+                       if (IPC_PORT_NULL != port) {
+                               ipc_port_release_send(port);
+                       }
+
+                       if (error || ival[0] == psfa->psfaa_dup2args.psfad_newfiledes) {
+                               break;
+                       }
+
+                       dup2a.from = ca.fd = ival[0];
+                       dup2a.to = psfa->psfaa_dup2args.psfad_newfiledes;
+                       error = dup2(p, &dup2a, ival);
+                       if (error) {
+                               break;
+                       }
+
+                       error = close_nocancel(p, &ca, ival);
+               }
+               break;
+
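
A heavily hedged sketch of the userspace flow the new PSFA_FILEPORT_DUP2 action supports; fileport_makeport() and the file-actions wrapper named below are private interfaces whose exact spellings are assumptions:

    /* all names assumed; shown only to illustrate intent */
    mach_port_t fp = MACH_PORT_NULL;
    fileport_makeport(fd, &fp);      /* wrap fd in a fileport send right */

    posix_spawn_file_actions_t fa;
    posix_spawn_file_actions_init(&fa);
    /* records a PSFA_FILEPORT_DUP2: the child sees the port's file as fd 1 */
    posix_spawn_file_actions_addfileportdup2_np(&fa, fp, STDOUT_FILENO);
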
                case PSFA_CLOSE: {
                        struct close_nocancel_args ca;
 
@@ -1946,6 +2192,34 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags)
                }
                break;
 
+               case PSFA_CHDIR: {
+                       /*
+                        * Chdir is different, in that it requires the use of
+                        * a path argument, which is normally copied in from
+                        * user space; because of this, we have to support a
+                        * chdir from kernel space that passes an address space
+                        * context of UIO_SYSSPACE, and casts the address
+                        * argument to a user_addr_t.
+                        */
+                       struct nameidata nd;
+
+                       NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1, UIO_SYSSPACE,
+                           CAST_USER_ADDR_T(psfa->psfaa_chdirargs.psfac_path),
+                           imgp->ip_vfs_context);
+
+                       error = chdir_internal(p, imgp->ip_vfs_context, &nd, 0);
+               }
+               break;
+
+               case PSFA_FCHDIR: {
+                       struct fchdir_args fchdira;
+
+                       fchdira.fd = psfa->psfaa_filedes;
+
+                       error = fchdir(p, &fchdira, ival);
+               }
+               break;
+
                default:
                        error = EINVAL;
                        break;
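
For context, these two actions back the chdir-style posix_spawn file actions
added to userspace in the same release. A minimal caller sketch in C, assuming
the posix_spawn_file_actions_addchdir_np() wrapper that Apple's libc exposes
for PSFA_CHDIR (the fd-based posix_spawn_file_actions_addfchdir_np() variant
maps to PSFA_FCHDIR in the same way; neither name appears in this diff):

    #include <spawn.h>

    /* Spawn argv[0] with its working directory set to path before exec. */
    static int
    spawn_in_dir(const char *path, char *const argv[], char *const envp[])
    {
            posix_spawn_file_actions_t fa;
            pid_t pid;
            int err;

            posix_spawn_file_actions_init(&fa);
            /* recorded as a PSFA_CHDIR action and applied in the child */
            err = posix_spawn_file_actions_addchdir_np(&fa, path);
            if (err == 0) {
                    err = posix_spawn(&pid, argv[0], &fa, NULL, argv, envp);
            }
            posix_spawn_file_actions_destroy(&fa);
            return err;
    }
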
@@ -1984,7 +2258,8 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags)
 
                switch (psfa->psfaa_type) {
                case PSFA_DUP2:
-                       fd = psfa->psfaa_openargs.psfao_oflag;
+               case PSFA_FILEPORT_DUP2:
+                       fd = psfa->psfaa_dup2args.psfad_newfiledes;
                /*FALLTHROUGH*/
                case PSFA_OPEN:
                case PSFA_INHERIT:
@@ -1992,6 +2267,15 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags)
                        break;
 
                case PSFA_CLOSE:
+               case PSFA_CHDIR:
+               case PSFA_FCHDIR:
+                       /*
+                        * Although PSFA_FCHDIR does have a file descriptor, it is not
+                        * *creating* one; thus we do not automatically mark it for
+                        * inheritance under POSIX_SPAWN_CLOEXEC_DEFAULT. A client that
+                        * wishes it to be inherited should use the PSFA_INHERIT action
+                        * explicitly.
+                        */
                        break;
                }
        }
@@ -2126,14 +2410,16 @@ spawn_validate_persona(struct _posix_spawn_persona_info *px_persona)
        struct persona *persona = NULL;
        int verify = px_persona->pspi_flags & POSIX_SPAWN_PERSONA_FLAGS_VERIFY;
 
-       /*
-        * TODO: rdar://problem/19981151
-        * Add entitlement check!
-        */
-       if (!kauth_cred_issuser(kauth_cred_get())) {
+       if (!IOTaskHasEntitlement(current_task(), PERSONA_MGMT_ENTITLEMENT)) {
                return EPERM;
        }
 
+       if (px_persona->pspi_flags & POSIX_SPAWN_PERSONA_GROUPS) {
+               if (px_persona->pspi_ngroups > NGROUPS_MAX) {
+                       return EINVAL;
+               }
+       }
+
        persona = persona_lookup(px_persona->pspi_id);
        if (!persona) {
                error = ESRCH;
@@ -2245,21 +2531,119 @@ out:
 #endif
 
 #if __arm64__
+extern int legacy_footprint_entitlement_mode;
 static inline void
-proc_legacy_footprint(proc_t p, task_t task, const char *caller)
+proc_legacy_footprint_entitled(proc_t p, task_t task, const char *caller)
 {
+#pragma unused(p, caller)
        boolean_t legacy_footprint_entitled;
 
-       legacy_footprint_entitled = IOTaskHasEntitlement(task,
-           "com.apple.private.memory.legacy_footprint");
-       if (legacy_footprint_entitled) {
-               printf("%s: %d[%s] legacy footprint (entitled)\n",
-                   caller, p->p_pid, p->p_name);
-               task_set_legacy_footprint(task, TRUE);
+       switch (legacy_footprint_entitlement_mode) {
+       case LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE:
+               /* the entitlement is ignored */
+               break;
+       case LEGACY_FOOTPRINT_ENTITLEMENT_IOS11_ACCT:
+               /* the entitlement grants iOS11 legacy accounting */
+               legacy_footprint_entitled = IOTaskHasEntitlement(task,
+                   "com.apple.private.memory.legacy_footprint");
+               if (legacy_footprint_entitled) {
+                       task_set_legacy_footprint(task);
+               }
+               break;
+       case LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE:
+               /* the entitlement grants a footprint limit increase */
+               legacy_footprint_entitled = IOTaskHasEntitlement(task,
+                   "com.apple.private.memory.legacy_footprint");
+               if (legacy_footprint_entitled) {
+                       task_set_extra_footprint_limit(task);
+               }
+               break;
+       default:
+               break;
        }
 }
 #endif /* __arm64__ */
 
+/*
+ * Apply a modification on the proc's kauth cred until it converges.
+ *
+ * `update` consumes its argument to return a new kauth cred.
+ */
+static void
+apply_kauth_cred_update(proc_t p,
+    kauth_cred_t (^update)(kauth_cred_t orig_cred))
+{
+       kauth_cred_t my_cred, my_new_cred;
+
+       my_cred = kauth_cred_proc_ref(p);
+       for (;;) {
+               my_new_cred = update(my_cred);
+               if (my_cred == my_new_cred) {
+                       kauth_cred_unref(&my_new_cred);
+                       break;
+               }
+
+               /* try to update the cred on the proc */
+               proc_ucred_lock(p);
+
+               if (p->p_ucred == my_cred) {
+                       /* base pointer didn't change, donate our ref */
+                       p->p_ucred = my_new_cred;
+                       PROC_UPDATE_CREDS_ONPROC(p);
+                       proc_ucred_unlock(p);
+
+                       /* drop p->p_ucred reference */
+                       kauth_cred_unref(&my_cred);
+                       break;
+               }
+
+               /* base pointer changed, retry */
+               my_cred = p->p_ucred;
+               kauth_cred_ref(my_cred);
+               proc_ucred_unlock(p);
+
+               kauth_cred_unref(&my_new_cred);
+       }
+}
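
A minimal sketch of the calling pattern (new_uid and new_gid are placeholder
values; the real callers appear later in this change):

    /* retries internally until the proc's credential stops changing */
    apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t cred) {
            return kauth_cred_setuidgid(cred, new_uid, new_gid);
    });
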
+
+static int
+spawn_posix_cred_adopt(proc_t p,
+    struct _posix_spawn_posix_cred_info *px_pcred_info)
+{
+       int error = 0;
+
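+       /*
+        * Adopt the gid and supplementary groups before the uid: once the
+        * uid drops, the process may no longer have the privilege to change
+        * its groups.
+        */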
+       if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_GID) {
+               struct setgid_args args = {
+                       .gid = px_pcred_info->pspci_gid,
+               };
+               error = setgid(p, &args, NULL);
+               if (error) {
+                       return error;
+               }
+       }
+
+       if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_GROUPS) {
+               error = setgroups_internal(p,
+                   px_pcred_info->pspci_ngroups,
+                   px_pcred_info->pspci_groups,
+                   px_pcred_info->pspci_gmuid);
+               if (error) {
+                       return error;
+               }
+       }
+
+       if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_UID) {
+               struct setuid_args args = {
+                       .uid = px_pcred_info->pspci_uid,
+               };
+               error = setuid(p, &args, NULL);
+               if (error) {
+                       return error;
+               }
+       }
+       return 0;
+}
+
 /*
  * posix_spawn
  *
@@ -2280,6 +2664,7 @@ proc_legacy_footprint(proc_t p, task_t task, const char *caller)
  *     exec_activate_image:ENAMETOOLONG        Filename too long
  *     exec_activate_image:ENOEXEC     Executable file format error
  *     exec_activate_image:ETXTBSY     Text file busy [misuse of error code]
+ *     exec_activate_image:EAUTH       Image decryption failed
  *     exec_activate_image:EBADEXEC    The executable is corrupt/unknown
  *     exec_activate_image:???
  *     mac_execve_enter:???
@@ -2310,8 +2695,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
        boolean_t spawn_no_exec = FALSE;
        boolean_t proc_transit_set = TRUE;
        boolean_t exec_done = FALSE;
-       int portwatch_count = 0;
-       ipc_port_t * portwatch_ports = NULL;
+       struct exec_port_actions port_actions = { };
        vm_size_t px_sa_offset = offsetof(struct _posix_spawnattr, psa_ports);
        task_t old_task = current_task();
        task_t new_task = NULL;
@@ -2320,6 +2704,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
 #if CONFIG_PERSONAS
        struct _posix_spawn_persona_info *px_persona = NULL;
 #endif
+       struct _posix_spawn_posix_cred_info *px_pcred_info = NULL;
 
        /*
         * Allocate a big chunk for locals instead of using stack since these
@@ -2345,7 +2730,9 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
        imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
        imgp->ip_mac_return = 0;
        imgp->ip_px_persona = NULL;
+       imgp->ip_px_pcred_info = NULL;
        imgp->ip_cs_error = OS_REASON_NULL;
+       imgp->ip_simulator_binary = IMGPF_SB_DEFAULT;
 
        if (uap->adesc != USER_ADDR_NULL) {
                if (is_64) {
@@ -2371,6 +2758,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
                        px_args.coal_info = CAST_USER_ADDR_T(px_args32.coal_info);
                        px_args.persona_info_size = px_args32.persona_info_size;
                        px_args.persona_info = CAST_USER_ADDR_T(px_args32.persona_info);
+                       px_args.posix_cred_info_size = px_args32.posix_cred_info_size;
+                       px_args.posix_cred_info = CAST_USER_ADDR_T(px_args32.posix_cred_info);
                }
                if (error) {
                        goto bad;
@@ -2472,6 +2861,39 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
                        }
                }
 #endif
+               /* copy in the posix cred info */
+               if (px_args.posix_cred_info_size != 0 && px_args.posix_cred_info != 0) {
+                       /* for now, we need the exact same struct in user space */
+                       if (px_args.posix_cred_info_size != sizeof(*px_pcred_info)) {
+                               error = ERANGE;
+                               goto bad;
+                       }
+
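+                       /* only superuser callers may give the child explicit POSIX credentials */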
+                       if (!kauth_cred_issuser(kauth_cred_get())) {
+                               error = EPERM;
+                               goto bad;
+                       }
+
+                       MALLOC(px_pcred_info, struct _posix_spawn_posix_cred_info *,
+                           px_args.posix_cred_info_size, M_TEMP, M_WAITOK | M_ZERO);
+                       if (px_pcred_info == NULL) {
+                               error = ENOMEM;
+                               goto bad;
+                       }
+                       imgp->ip_px_pcred_info = px_pcred_info;
+
+                       if ((error = copyin(px_args.posix_cred_info, px_pcred_info,
+                           px_args.posix_cred_info_size)) != 0) {
+                               goto bad;
+                       }
+
+                       if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_GROUPS) {
+                               if (px_pcred_info->pspci_ngroups > NGROUPS_MAX) {
+                                       error = EINVAL;
+                                       goto bad;
+                               }
+                       }
+               }
 #if CONFIG_MACF
                if (px_args.mac_extensions_size != 0) {
                        if ((error = spawn_copyin_macpolicyinfo(&px_args, (_posix_spawn_mac_policy_extensions_t *)&imgp->ip_px_smpx)) != 0) {
@@ -2494,6 +2916,13 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
                goto bad;
        }
 
+       if (imgp->ip_px_sa != NULL) {
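+               /* reject unsupported apptype values before any new task state is created */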
+               struct _posix_spawnattr *psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
+               if ((error = exec_validate_spawnattr_policy(psa->psa_apptype)) != 0) {
+                       goto bad;
+               }
+       }
+
        /*
         * If we don't have the extension flag that turns "posix_spawn()"
         * into "execve() with options", then we will be creating a new
@@ -2633,31 +3062,6 @@ do_fork1:
                }
                imgp->ip_flags |= IMGPF_SPAWN;  /* spawn w/o exec */
                spawn_no_exec = TRUE;           /* used in later tests */
-
-#if CONFIG_PERSONAS
-               /*
-                * If the parent isn't in a persona (launchd), and
-                * hasn't specified a new persona for the process,
-                * then we'll put the process into the system persona
-                *
-                * TODO: this will have to be re-worked because as of
-                *       now, without any launchd adoption, the resulting
-                *       xpcproxy process will not have sufficient
-                *       privileges to setuid/gid.
-                */
-#if 0
-               if (!proc_has_persona(p) && imgp->ip_px_persona == NULL) {
-                       MALLOC(px_persona, struct _posix_spawn_persona_info *,
-                           sizeof(*px_persona), M_TEMP, M_WAITOK | M_ZERO);
-                       if (px_persona == NULL) {
-                               error = ENOMEM;
-                               goto bad;
-                       }
-                       px_persona->pspi_id = persona_get_id(g_system_persona);
-                       imgp->ip_px_persona = px_persona;
-               }
-#endif /* 0 */
-#endif /* CONFIG_PERSONAS */
        } else {
                /*
                 * For execve case, create a new task and thread
@@ -2737,56 +3141,13 @@ do_fork1:
 
        /* Has spawn port actions? */
        if (imgp->ip_px_spa != NULL) {
-               boolean_t is_adaptive = FALSE;
-               boolean_t portwatch_present = FALSE;
-
-               /* Will this process become adaptive? The apptype isn't ready yet, so we can't look there. */
-               if (imgp->ip_px_sa != NULL && px_sa.psa_apptype == POSIX_SPAWN_PROC_TYPE_DAEMON_ADAPTIVE) {
-                       is_adaptive = TRUE;
-               }
-
-               /*
-                * portwatch only:
-                * Allocate a place to store the ports we want to bind to the new task
-                * We can't bind them until after the apptype is set.
-                */
-               if (px_spap->pspa_count != 0 && is_adaptive) {
-                       portwatch_count = px_spap->pspa_count;
-                       MALLOC(portwatch_ports, ipc_port_t *, (sizeof(ipc_port_t) * portwatch_count), M_TEMP, M_WAITOK | M_ZERO);
-               } else {
-                       portwatch_ports = NULL;
-               }
-
-               if ((error = exec_handle_port_actions(imgp, &portwatch_present, portwatch_ports)) != 0) {
+               if ((error = exec_handle_port_actions(imgp, &port_actions)) != 0) {
                        goto bad;
                }
-
-               if (portwatch_present == FALSE && portwatch_ports != NULL) {
-                       FREE(portwatch_ports, M_TEMP);
-                       portwatch_ports = NULL;
-                       portwatch_count = 0;
-               }
        }
 
        /* Has spawn attr? */
        if (imgp->ip_px_sa != NULL) {
-               /*
-                * Set the process group ID of the child process; this has
-                * to happen before the image activation.
-                */
-               if (px_sa.psa_flags & POSIX_SPAWN_SETPGROUP) {
-                       struct setpgid_args spga;
-                       spga.pid = p->p_pid;
-                       spga.pgid = px_sa.psa_pgroup;
-                       /*
-                        * Effectively, call setpgid() system call; works
-                        * because there are no pointer arguments.
-                        */
-                       if ((error = setpgid(p, &spga, ival)) != 0) {
-                               goto bad;
-                       }
-               }
-
                /*
                 * Reset UID/GID to parent's RUID/RGID; This works only
                 * because the operation occurs *after* the vfork() and
@@ -2800,35 +3161,33 @@ do_fork1:
                 * proc's ucred lock. This prevents others from accessing
                 * a garbage credential.
                 */
-               while (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) {
-                       kauth_cred_t my_cred = kauth_cred_proc_ref(p);
-                       kauth_cred_t my_new_cred = kauth_cred_setuidgid(my_cred, kauth_cred_getruid(my_cred), kauth_cred_getrgid(my_cred));
+               if (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) {
+                       apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t my_cred){
+                               return kauth_cred_setuidgid(my_cred,
+                               kauth_cred_getruid(my_cred),
+                               kauth_cred_getrgid(my_cred));
+                       });
+               }
 
-                       if (my_cred == my_new_cred) {
-                               kauth_cred_unref(&my_cred);
-                               break;
+               if (imgp->ip_px_pcred_info) {
+                       if (!spawn_no_exec) {
+                               error = ENOTSUP;
+                               goto bad;
                        }
 
-                       /* update cred on proc */
-                       proc_ucred_lock(p);
-
-                       if (p->p_ucred != my_cred) {
-                               proc_ucred_unlock(p);
-                               kauth_cred_unref(&my_new_cred);
-                               continue;
+                       error = spawn_posix_cred_adopt(p, imgp->ip_px_pcred_info);
+                       if (error != 0) {
+                               goto bad;
                        }
-
-                       /* donate cred reference on my_new_cred to p->p_ucred */
-                       p->p_ucred = my_new_cred;
-                       PROC_UPDATE_CREDS_ONPROC(p);
-                       proc_ucred_unlock(p);
-
-                       /* drop additional reference that was taken on the previous cred */
-                       kauth_cred_unref(&my_cred);
                }
 
 #if CONFIG_PERSONAS
-               if (spawn_no_exec && imgp->ip_px_persona != NULL) {
+               if (imgp->ip_px_persona != NULL) {
+                       if (!spawn_no_exec) {
+                               error = ENOTSUP;
+                               goto bad;
+                       }
+
                        /*
                         * If we were asked to spawn a process into a new persona,
                         * do the credential switch now (which may override the UID/GID
@@ -2864,6 +3223,7 @@ do_fork1:
                        imgp->ip_flags |= IMGPF_HIGH_BITS_ASLR;
                }
 
+#if !SECURE_KERNEL
                /*
                 * Forcibly disallow execution from data pages for the spawned process
                 * even if it would otherwise be permitted by the architecture default.
@@ -2871,6 +3231,12 @@ do_fork1:
                if (px_sa.psa_flags & _POSIX_SPAWN_ALLOW_DATA_EXEC) {
                        imgp->ip_flags |= IMGPF_ALLOW_DATA_EXEC;
                }
+#endif /* !SECURE_KERNEL */
+
+               if ((px_sa.psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK) ==
+                   POSIX_SPAWN_PROC_TYPE_DRIVER) {
+                       imgp->ip_flags |= IMGPF_DRIVER;
+               }
        }
 
        /*
@@ -2906,6 +3272,10 @@ do_fork1:
         * Activate the image
         */
        error = exec_activate_image(imgp);
+#if defined(HAS_APPLE_PAC)
+       ml_task_set_disable_user_jop(new_task, imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE);
+       ml_thread_set_disable_user_jop(imgp->ip_new_thread, imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE);
+#endif
 
        if (error == 0 && !spawn_no_exec) {
                p = proc_exec_switch_task(p, old_task, new_task, imgp->ip_new_thread);
@@ -2930,19 +3300,45 @@ do_fork1:
                error = ENOEXEC;
        }
 
-       /*
-        * If we have a spawn attr, and it contains signal related flags,
-        * the we need to process them in the "context" of the new child
-        * process, so we have to process it following image activation,
-        * prior to making the thread runnable in user space.  This is
-        * necessitated by some signal information being per-thread rather
-        * than per-process, and we don't have the new allocation in hand
-        * until after the image is activated.
-        */
        if (!error && imgp->ip_px_sa != NULL) {
                thread_t child_thread = imgp->ip_new_thread;
                uthread_t child_uthread = get_bsdthread_info(child_thread);
 
+               /*
+                * Because of POSIX_SPAWN_SETEXEC, we need to handle this after image
+                * activation; otherwise a failed image activation (before the point of
+                * no return) would leave the parent process in a modified state.
+                */
+               if (px_sa.psa_flags & POSIX_SPAWN_SETPGROUP) {
+                       struct setpgid_args spga;
+                       spga.pid = p->p_pid;
+                       spga.pgid = px_sa.psa_pgroup;
+                       /*
+                        * Effectively, call setpgid() system call; works
+                        * because there are no pointer arguments.
+                        */
+                       if ((error = setpgid(p, &spga, ival)) != 0) {
+                               goto bad;
+                       }
+               }
+
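+               /*
+                * POSIX_SPAWN_SETSID makes the child a session leader, as if
+                * it had called setsid(2) itself before the new image runs.
+                */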
+               if (px_sa.psa_flags & POSIX_SPAWN_SETSID) {
+                       error = setsid_internal(p);
+                       if (error != 0) {
+                               goto bad;
+                       }
+               }
+
+               /*
+                * If we have a spawn attr, and it contains signal related flags,
+                * then we need to process them in the "context" of the new child
+                * process, so we have to process it following image activation,
+                * prior to making the thread runnable in user space.  This is
+                * necessitated by some signal information being per-thread rather
+                * than per-process, and we don't have the new allocation in hand
+                * until after the image is activated.
+                */
+
                /*
                 * Mask a list of signals, instead of them being unmasked, if
                 * they were unmasked in the parent; note that some signals
@@ -2989,6 +3385,15 @@ do_fork1:
                            px_sa.psa_cpumonitor_interval * NSEC_PER_SEC,
                            0, TRUE);
                }
+
+
+               if (px_pcred_info &&
+                   (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_LOGIN)) {
+                       /*
+                        * setlogin() must happen after setsid()
+                        */
+                       setlogin_internal(p, px_pcred_info->pspci_login);
+               }
        }
 
 bad:
@@ -3022,6 +3427,11 @@ bad:
                exec_resettextvp(p, imgp);
 
 #if CONFIG_MEMORYSTATUS
+               /* Set jetsam priority for DriverKit processes */
+               if (px_sa.psa_apptype == POSIX_SPAWN_PROC_TYPE_DRIVER) {
+                       px_sa.psa_priority = JETSAM_PRIORITY_DRIVER_APPLE;
+               }
+
                /* Has jetsam attributes? */
                if (imgp->ip_px_sa != NULL && (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_SET)) {
                        /*
@@ -3032,14 +3442,15 @@ bad:
                         * we attempt to mimic previous behavior by forcing the BG limit data into the
                         * inactive/non-fatal mode and force the active slots to hold system_wide/fatal mode.
                         */
+
                        if (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND) {
-                               memorystatus_update(p, px_sa.psa_priority, 0,
+                               memorystatus_update(p, px_sa.psa_priority, 0, FALSE, /* assertion priority */
                                    (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY),
                                    TRUE,
                                    -1, TRUE,
                                    px_sa.psa_memlimit_inactive, FALSE);
                        } else {
-                               memorystatus_update(p, px_sa.psa_priority, 0,
+                               memorystatus_update(p, px_sa.psa_priority, 0, FALSE, /* assertion priority */
                                    (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY),
                                    TRUE,
                                    px_sa.psa_memlimit_active,
@@ -3048,6 +3459,31 @@ bad:
                                    (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_MEMLIMIT_INACTIVE_FATAL));
                        }
                }
+
+               /* Has jetsam relaunch behavior? */
+               if (imgp->ip_px_sa != NULL && (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MASK)) {
+                       /*
+                        * Launchd has passed in data indicating the behavior of this process in response to jetsam.
+                        * This data is used by the jetsam subsystem to determine the position and protection
+                        * offered to this process on dirty -> clean transitions.
+                        */
+                       int relaunch_flags = P_MEMSTAT_RELAUNCH_UNKNOWN;
+                       switch (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MASK) {
+                       case POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW:
+                               relaunch_flags = P_MEMSTAT_RELAUNCH_LOW;
+                               break;
+                       case POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED:
+                               relaunch_flags = P_MEMSTAT_RELAUNCH_MED;
+                               break;
+                       case POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH:
+                               relaunch_flags = P_MEMSTAT_RELAUNCH_HIGH;
+                               break;
+                       default:
+                               break;
+                       }
+                       memorystatus_relaunch_flags_update(p, relaunch_flags);
+               }
+
 #endif /* CONFIG_MEMORYSTATUS */
                if (imgp->ip_px_sa != NULL && px_sa.psa_thread_limit > 0) {
                        task_set_thread_limit(new_task, (uint16_t)px_sa.psa_thread_limit);
@@ -3099,7 +3535,7 @@ bad:
                }
 
 #if __arm64__
-               proc_legacy_footprint(p, new_task, __FUNCTION__);
+               proc_legacy_footprint_entitled(p, new_task, __FUNCTION__);
 #endif /* __arm64__ */
        }
 
@@ -3108,6 +3544,21 @@ bad:
                proc_inherit_task_role(new_task, old_task);
        }
 
+#if CONFIG_ARCADE
+       if (error == 0) {
+               /*
+                * Check to see if we need to trigger an arcade upcall AST now
+                * that the vnode has been reset on the task.
+                */
+               arcade_prepare(new_task, imgp->ip_new_thread);
+       }
+#endif /* CONFIG_ARCADE */
+
+       /* Clear the initial wait on the thread before handling spawn policy */
+       if (imgp && imgp->ip_new_thread) {
+               task_clear_return_wait(get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_INITIAL_WAIT);
+       }
+
        /*
         * Apply the spawnattr policy, apptype (which primes the task for importance donation),
         * and bind any portwatch ports to the new task.
@@ -3120,8 +3571,13 @@ bad:
        if (error == 0 && imgp->ip_px_sa != NULL) {
                struct _posix_spawnattr *psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
 
-               exec_handle_spawnattr_policy(p, psa->psa_apptype, psa->psa_qos_clamp, psa->psa_darwin_role,
-                   portwatch_ports, portwatch_count);
+               error = exec_handle_spawnattr_policy(p, imgp->ip_new_thread, psa->psa_apptype, psa->psa_qos_clamp,
+                   psa->psa_darwin_role, &port_actions);
+       }
+
+       /* Transfer the turnstile watchport boost to new task if in exec */
+       if (error == 0 && !spawn_no_exec) {
+               task_transfer_turnstile_watchports(old_task, new_task, imgp->ip_new_thread);
        }
 
        /*
@@ -3147,6 +3603,7 @@ bad:
                 */
                if (mac_proc_check_map_anon(p, 0, 0, 0, MAP_JIT, NULL) == 0) {
                        vm_map_set_jumbo(get_task_map(new_task));
+                       vm_map_set_jit_entitled(get_task_map(new_task));
                }
 #endif /* CONFIG_MACF */
        }
@@ -3155,16 +3612,8 @@ bad:
         * Release any ports we kept around for binding to the new task
         * We need to release the rights even if the posix_spawn has failed.
         */
-       if (portwatch_ports != NULL) {
-               for (int i = 0; i < portwatch_count; i++) {
-                       ipc_port_t port = NULL;
-                       if ((port = portwatch_ports[i]) != NULL) {
-                               ipc_port_release_send(port);
-                       }
-               }
-               FREE(portwatch_ports, M_TEMP);
-               portwatch_ports = NULL;
-               portwatch_count = 0;
+       if (imgp->ip_px_spa != NULL) {
+               exec_port_actions_destroy(&port_actions);
        }
 
        /*
@@ -3212,6 +3661,9 @@ bad:
                        FREE(imgp->ip_px_persona, M_TEMP);
                }
 #endif
+               if (imgp->ip_px_pcred_info != NULL) {
+                       FREE(imgp->ip_px_pcred_info, M_TEMP);
+               }
 #if CONFIG_MACF
                if (imgp->ip_px_smpx != NULL) {
                        spawn_free_macpolicyinfo(imgp->ip_px_smpx);
@@ -3301,7 +3753,13 @@ bad:
                 * If the parent wants the pid, copy it out
                 */
                if (pid != USER_ADDR_NULL) {
-                       (void)suword(pid, p->p_pid);
+                       _Static_assert(sizeof(p->p_pid) == 4, "posix_spawn() assumes a 32-bit pid_t");
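+                       /*
+                        * Use a single atomic 32-bit copyout when the user
+                        * address is 4-byte aligned; otherwise fall back to
+                        * suword().
+                        */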
+                       bool aligned = (pid & 3) == 0;
+                       if (aligned) {
+                               (void)copyout_atomic32(p->p_pid, pid);
+                       } else {
+                               (void)suword(pid, p->p_pid);
+                       }
                }
                retval[0] = error;
 
@@ -3339,7 +3797,7 @@ bad:
        /* Release the thread ref returned by fork_create_child/fork1 */
        if (imgp != NULL && imgp->ip_new_thread) {
                /* wake up the new thread */
-               task_clear_return_wait(get_threadtask(imgp->ip_new_thread));
+               task_clear_return_wait(get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_FINAL_WAIT);
                thread_deallocate(imgp->ip_new_thread);
                imgp->ip_new_thread = NULL;
        }
@@ -3438,6 +3896,7 @@ proc_exec_switch_task(proc_t p, task_t old_task, task_t new_task, thread_t new_t
                        /* Clear dispatchqueue and workloop ast offset */
                        p->p_dispatchqueue_offset = 0;
                        p->p_dispatchqueue_serialno_offset = 0;
+                       p->p_dispatchqueue_label_offset = 0;
                        p->p_return_to_kernel_offset = 0;
 
                        /* Copy the signal state, dtrace state and set bsd ast on new thread */
@@ -3600,6 +4059,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
        imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
        imgp->ip_mac_return = 0;
        imgp->ip_cs_error = OS_REASON_NULL;
+       imgp->ip_simulator_binary = IMGPF_SB_DEFAULT;
 
 #if CONFIG_MACF
        if (uap->mac_p != USER_ADDR_NULL) {
@@ -3668,6 +4128,10 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
                 * for vfexec.
                 */
                new_task = get_threadtask(imgp->ip_new_thread);
+#if defined(HAS_APPLE_PAC)
+               ml_task_set_disable_user_jop(new_task, imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE);
+               ml_thread_set_disable_user_jop(imgp->ip_new_thread, imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE);
+#endif
        }
 
        if (!error && !in_vfexec) {
@@ -3742,7 +4206,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
                proc_transend(p, 0);
 
 #if __arm64__
-               proc_legacy_footprint(p, new_task, __FUNCTION__);
+               proc_legacy_footprint_entitled(p, new_task, __FUNCTION__);
 #endif /* __arm64__ */
 
                /* Sever any extant thread affinity */
@@ -3757,6 +4221,14 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
 
                task_set_main_thread_qos(new_task, main_thread);
 
+#if CONFIG_ARCADE
+               /*
+                * Check to see if we need to trigger an arcade upcall AST now
+                * that the vnode has been reset on the task.
+                */
+               arcade_prepare(new_task, imgp->ip_new_thread);
+#endif /* CONFIG_ARCADE */
+
 #if CONFIG_MACF
                /*
                 * Processes with the MAP_JIT entitlement are permitted to have
@@ -3764,6 +4236,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
                 */
                if (mac_proc_check_map_anon(p, 0, 0, 0, MAP_JIT, NULL) == 0) {
                        vm_map_set_jumbo(get_task_map(new_task));
+                       vm_map_set_jit_entitled(get_task_map(new_task));
                }
 #endif /* CONFIG_MACF */
 
@@ -3817,6 +4290,16 @@ exit_with_error:
        }
 
        if (imgp != NULL) {
+               /* Clear the initial wait on the thread transferring watchports */
+               if (imgp->ip_new_thread) {
+                       task_clear_return_wait(get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_INITIAL_WAIT);
+               }
+
+               /* Transfer the watchport boost to new task */
+               if (!error && !in_vfexec) {
+                       task_transfer_turnstile_watchports(old_task,
+                           new_task, imgp->ip_new_thread);
+               }
                /*
                 * Do not terminate the current task, if proc_exec_switch_task did not
                 * switch the tasks, terminating the current task without the switch would
@@ -3830,7 +4313,7 @@ exit_with_error:
                /* Release the thread ref returned by fork_create_child */
                if (imgp->ip_new_thread) {
                        /* wake up the new exec thread */
-                       task_clear_return_wait(get_threadtask(imgp->ip_new_thread));
+                       task_clear_return_wait(get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_FINAL_WAIT);
                        thread_deallocate(imgp->ip_new_thread);
                        imgp->ip_new_thread = NULL;
                }
@@ -3881,7 +4364,7 @@ copyinptr(user_addr_t froma, user_addr_t *toptr, int ptr_size)
 
        if (ptr_size == 4) {
                /* 64 bit value containing 32 bit address */
-               unsigned int i;
+               unsigned int i = 0;
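+               /* zero-initialized so a failed copyin() cannot propagate stack garbage */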
 
                error = copyin(froma, &i, 4);
                *toptr = CAST_USER_ADDR_T(i);   /* SAFE */
@@ -4438,6 +4921,7 @@ extern user64_addr_t commpage_text64_location;
 #define FSID_KEY "executable_file="
 #define DYLD_FSID_KEY "dyld_file="
 #define CDHASH_KEY "executable_cdhash="
+#define DYLD_FLAGS_KEY "dyld_flags="
 
 #define FSID_MAX_STRING "0x1234567890abcdef,0x1234567890abcdef"
 
@@ -4476,6 +4960,10 @@ exec_add_entropy_key(struct image_params *imgp,
 /*
  * Build up the contents of the apple[] string vector
  */
+#if (DEVELOPMENT || DEBUG)
+uint64_t dyld_flags = 0;
+#endif
+
 static int
 exec_add_apple_strings(struct image_params *imgp,
     const load_result_t *load_result)
@@ -4611,6 +5099,17 @@ exec_add_apple_strings(struct image_params *imgp,
                }
                imgp->ip_applec++;
        }
+#if (DEVELOPMENT || DEBUG)
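+       /*
+        * DEVELOPMENT || DEBUG only: when the global dyld_flags is non-zero,
+        * pass it to dyld as a "dyld_flags=0x..." entry in the apple[] vector.
+        */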
+       if (dyld_flags) {
+               char dyld_flags_string[strlen(DYLD_FLAGS_KEY) + HEX_STR_LEN + 1];
+               snprintf(dyld_flags_string, sizeof(dyld_flags_string), DYLD_FLAGS_KEY "0x%llx", dyld_flags);
+               error = exec_add_user_string(imgp, CAST_USER_ADDR_T(dyld_flags_string), UIO_SYSSPACE, FALSE);
+               if (error) {
+                       goto bad;
+               }
+               imgp->ip_applec++;
+       }
+#endif
 
        /* Align the tail of the combined applev area */
        while (imgp->ip_strspace % img_ptr_size != 0) {
@@ -4763,7 +5262,6 @@ exec_handle_sugid(struct image_params *imgp)
 {
        proc_t                  p = vfs_context_proc(imgp->ip_vfs_context);
        kauth_cred_t            cred = vfs_context_ucred(imgp->ip_vfs_context);
-       kauth_cred_t            my_cred, my_new_cred;
        int                     i;
        int                     leave_sugid_clear = 0;
        int                     mac_reset_ipc = 0;
@@ -4840,62 +5338,23 @@ handle_mac_transition:
                 * proc's ucred lock. This prevents others from accessing
                 * a garbage credential.
                 */
-               while (imgp->ip_origvattr->va_mode & VSUID) {
-                       my_cred = kauth_cred_proc_ref(p);
-                       my_new_cred = kauth_cred_setresuid(my_cred, KAUTH_UID_NONE, imgp->ip_origvattr->va_uid, imgp->ip_origvattr->va_uid, KAUTH_UID_NONE);
-
-                       if (my_new_cred == my_cred) {
-                               kauth_cred_unref(&my_cred);
-                               break;
-                       }
-
-                       /* update cred on proc */
-                       proc_ucred_lock(p);
-
-                       if (p->p_ucred != my_cred) {
-                               proc_ucred_unlock(p);
-                               kauth_cred_unref(&my_new_cred);
-                               continue;
-                       }
-
-                       /* donate cred reference on my_new_cred to p->p_ucred */
-                       p->p_ucred = my_new_cred;
-                       PROC_UPDATE_CREDS_ONPROC(p);
-                       proc_ucred_unlock(p);
-
-                       /* drop additional reference that was taken on the previous cred */
-                       kauth_cred_unref(&my_cred);
-
-                       break;
-               }
-
-               while (imgp->ip_origvattr->va_mode & VSGID) {
-                       my_cred = kauth_cred_proc_ref(p);
-                       my_new_cred = kauth_cred_setresgid(my_cred, KAUTH_GID_NONE, imgp->ip_origvattr->va_gid, imgp->ip_origvattr->va_gid);
-
-                       if (my_new_cred == my_cred) {
-                               kauth_cred_unref(&my_cred);
-                               break;
-                       }
-
-                       /* update cred on proc */
-                       proc_ucred_lock(p);
-
-                       if (p->p_ucred != my_cred) {
-                               proc_ucred_unlock(p);
-                               kauth_cred_unref(&my_new_cred);
-                               continue;
-                       }
-
-                       /* donate cred reference on my_new_cred to p->p_ucred */
-                       p->p_ucred = my_new_cred;
-                       PROC_UPDATE_CREDS_ONPROC(p);
-                       proc_ucred_unlock(p);
-
-                       /* drop additional reference that was taken on the previous cred */
-                       kauth_cred_unref(&my_cred);
-
-                       break;
+               if (imgp->ip_origvattr->va_mode & VSUID) {
+                       apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t my_cred) {
+                               return kauth_cred_setresuid(my_cred,
+                               KAUTH_UID_NONE,
+                               imgp->ip_origvattr->va_uid,
+                               imgp->ip_origvattr->va_uid,
+                               KAUTH_UID_NONE);
+                       });
+               }
+
+               if (imgp->ip_origvattr->va_mode & VSGID) {
+                       apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t my_cred) {
+                               return kauth_cred_setresgid(my_cred,
+                               KAUTH_GID_NONE,
+                               imgp->ip_origvattr->va_gid,
+                               imgp->ip_origvattr->va_gid);
+                       });
                }
 #endif /* !SECURE_KERNEL */
 
@@ -5072,35 +5531,11 @@ handle_mac_transition:
         * proc's ucred lock. This prevents others from accessing
         * a garbage credential.
         */
-       for (;;) {
-               my_cred = kauth_cred_proc_ref(p);
-               my_new_cred = kauth_cred_setsvuidgid(my_cred, kauth_cred_getuid(my_cred), kauth_cred_getgid(my_cred));
-
-               if (my_new_cred == my_cred) {
-                       kauth_cred_unref(&my_cred);
-                       break;
-               }
-
-               /* update cred on proc */
-               proc_ucred_lock(p);
-
-               if (p->p_ucred != my_cred) {
-                       proc_ucred_unlock(p);
-                       kauth_cred_unref(&my_new_cred);
-                       continue;
-               }
-
-               /* donate cred reference on my_new_cred to p->p_ucred */
-               p->p_ucred = my_new_cred;
-               PROC_UPDATE_CREDS_ONPROC(p);
-               proc_ucred_unlock(p);
-
-               /* drop additional reference that was taken on the previous cred */
-               kauth_cred_unref(&my_cred);
-
-               break;
-       }
-
+       apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t my_cred) {
+               return kauth_cred_setsvuidgid(my_cred,
+               kauth_cred_getuid(my_cred),
+               kauth_cred_getgid(my_cred));
+       });
 
        /* Update the process' identity version and set the security token */
        p->p_idversion = OSIncrementAtomic(&nextpidversion);
@@ -5442,7 +5877,6 @@ load_return_to_errno(load_return_t lrtn)
        case LOAD_SUCCESS:
                return 0;
        case LOAD_BADARCH:
-       case LOAD_BADARCH_X86:
                return EBADARCH;
        case LOAD_BADMACHO:
        case LOAD_BADMACHO_UPX:
@@ -5458,8 +5892,9 @@ load_return_to_errno(load_return_t lrtn)
                return ENOENT;
        case LOAD_IOERROR:
                return EIO;
-       case LOAD_FAILURE:
        case LOAD_DECRYPTFAIL:
+               return EAUTH;
+       case LOAD_FAILURE:
        default:
                return EBADEXEC;
        }
@@ -5737,7 +6172,7 @@ __EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__(mach_port_t task_access_port
 static int
 check_for_signature(proc_t p, struct image_params *imgp)
 {
-       mach_port_t port = NULL;
+       mach_port_t port = IPC_PORT_NULL;
        kern_return_t kr = KERN_FAILURE;
        int error = EACCES;
        boolean_t unexpected_failure = FALSE;
@@ -5905,6 +6340,10 @@ done:
                }
        }
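+       /* release the send right on the task access port obtained above */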
 
+       if (port != IPC_PORT_NULL) {
+               ipc_port_release_send(port);
+       }
+
        /* If we hit this, we likely would have leaked an exit reason */
        assert(signature_failure_reason == OS_REASON_NULL);
        return error;
index e958587b4333bb65799847c0e110a0b1eb280129..825508bf37ee77752a2b734ca0adcf255ce15d71 100644
 #include <kern/exc_guard.h>
 
 #include <vm/vm_protos.h>
+#include <os/log.h>
 
 #include <pexpert/pexpert.h>
 
@@ -152,6 +153,9 @@ void dtrace_proc_exit(proc_t p);
 #include <sys/syscall.h>
 #endif /* CONFIG_MACF */
 
+#if CONFIG_MEMORYSTATUS
+static void proc_memorystatus_remove(proc_t p);
+#endif /* CONFIG_MEMORYSTATUS */
 void proc_prepareexit(proc_t p, int rv, boolean_t perf_notify);
 void gather_populate_corpse_crashinfo(proc_t p, task_t corpse_task,
     mach_exception_data_type_t code, mach_exception_data_type_t subcode,
@@ -509,6 +513,11 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset *
                kcdata_memcpy(crash_info_ptr, uaddr, &p->p_responsible_pid, sizeof(p->p_responsible_pid));
        }
 
+       if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_PROC_PERSONA_ID, sizeof(uid_t), &uaddr)) {
+               uid_t persona_id = proc_persona_id(p);
+               kcdata_memcpy(crash_info_ptr, uaddr, &persona_id, sizeof(persona_id));
+       }
+
 #if CONFIG_COALITIONS
        if (KERN_SUCCESS == kcdata_get_memory_addr_for_array(crash_info_ptr, TASK_CRASHINFO_COALITION_ID, sizeof(uint64_t), COALITION_NUM_TYPES, &uaddr)) {
                uint64_t coalition_ids[COALITION_NUM_TYPES];
@@ -518,12 +527,16 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset *
 #endif /* CONFIG_COALITIONS */
 
 #if CONFIG_MEMORYSTATUS
-       memstat_dirty_flags = memorystatus_dirty_get(p);
+       memstat_dirty_flags = memorystatus_dirty_get(p, FALSE);
        if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_DIRTY_FLAGS, sizeof(memstat_dirty_flags), &uaddr)) {
                kcdata_memcpy(crash_info_ptr, uaddr, &memstat_dirty_flags, sizeof(memstat_dirty_flags));
        }
 #endif
 
+       if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_MEMORY_LIMIT_INCREASE, sizeof(p->p_memlimit_increase), &uaddr)) {
+               kcdata_memcpy(crash_info_ptr, uaddr, &p->p_memlimit_increase, sizeof(p->p_memlimit_increase));
+       }
+
        if (p->p_exit_reason != OS_REASON_NULL && reason == OS_REASON_NULL) {
                reason = p->p_exit_reason;
        }
@@ -596,7 +609,8 @@ launchd_exit_reason_get_string_desc(os_reason_t exit_reason)
        return (char *)kcdata_iter_payload(iter);
 }
 
-static __attribute__((noinline)) void
+__abortlike
+static void
 launchd_crashed_panic(proc_t p, int rv)
 {
        char *launchd_exit_reason_desc = launchd_exit_reason_get_string_desc(p->p_exit_reason);
@@ -921,6 +935,25 @@ exit_with_reason(proc_t p, int rv, int *retval, boolean_t thread_can_terminate,
        return 0;
 }
 
+#if CONFIG_MEMORYSTATUS
+/*
+ * Remove this process from the jetsam bands for freezing or exiting. Note that
+ * this will block if the process is currently being frozen.
+ * The proc_list_lock is held by the caller.
+ * NB: If the process should be ineligible for future freezing or jetsam, the
+ * caller should first set the p_listflag P_LIST_EXITED bit.
+ */
+static void
+proc_memorystatus_remove(proc_t p)
+{
+       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       while (memorystatus_remove(p) == EAGAIN) {
+               os_log(OS_LOG_DEFAULT, "memorystatus_remove: Process[%d] tried to exit while being frozen. Blocking exit until freeze completes.", p->p_pid);
+               msleep(&p->p_memstat_state, proc_list_mlock, PWAIT, "proc_memorystatus_remove", NULL);
+       }
+}
+#endif
+
 void
 proc_prepareexit(proc_t p, int rv, boolean_t perf_notify)
 {
@@ -1056,7 +1089,7 @@ skipcheck:
        proc_list_lock();
 
 #if CONFIG_MEMORYSTATUS
-       memorystatus_remove(p, TRUE);
+       proc_memorystatus_remove(p);
 #endif
 
        LIST_REMOVE(p, p_list);
@@ -1066,7 +1099,6 @@ skipcheck:
 
        proc_list_unlock();
 
-
 #ifdef PGINPROF
        vmsizmon();
 #endif
@@ -1140,8 +1172,6 @@ proc_exit(proc_t p)
        dtrace_proc_exit(p);
 #endif
 
-       nspace_proc_exit(p);
-
        /*
         * need to cancel async IO requests that can be cancelled and wait for those
         * already active.  MAY BLOCK!
@@ -1179,6 +1209,14 @@ proc_exit(proc_t p)
                throttle_lowpri_io(0);
        }
 
+       if (p->p_lflag & P_LNSPACE_RESOLVER) {
+               /*
+                * The namespace resolver is exiting; there may be
+                * outstanding materialization requests to clean up.
+                */
+               nspace_resolver_exited(p);
+       }
+
 #if SYSV_SHM
        /* Close ref SYSV Shared memory*/
        if (p->vm_shm) {
@@ -2327,7 +2365,7 @@ proc_reparentlocked(proc_t child, proc_t parent, int signallable, int locked)
        }
 #endif
        oldparent->p_childrencnt--;
-#if __PROC_INTERNAL_DEBUG1
+#if __PROC_INTERNAL_DEBUG
        if (oldparent->p_childrencnt < 0) {
                panic("process children count -ve\n");
        }
@@ -2411,7 +2449,7 @@ vfork_exit_internal(proc_t p, int rv, int forceexit)
        proc_list_lock();
 
 #if CONFIG_MEMORYSTATUS
-       memorystatus_remove(p, TRUE);
+       proc_memorystatus_remove(p);
 #endif
 
        LIST_REMOVE(p, p_list);
index 772c163554e40c0d21dcb1af64d85d883ce73dbe..4b0f0e9a4bb9cab9cb4a7dba27203bef28a426eb 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007, 2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -720,7 +720,7 @@ vfork_return(proc_t child_proc, int32_t *retval, int rval)
        /* restore thread-set-id state */
        if (uth->uu_flag & UT_WASSETUID) {
                uth->uu_flag |= UT_SETUID;
-               uth->uu_flag &= UT_WASSETUID;
+               uth->uu_flag &= ~UT_WASSETUID;
        }
        uth->uu_proc = 0;
        uth->uu_sigmask = uth->uu_vforkmask;
@@ -792,8 +792,9 @@ fork_create_child(task_t parent_task,
            inherit_memory,
            is_64bit_addr,
            is_64bit_data,
-           TF_LRETURNWAIT | TF_LRETURNWAITER,                                     /* All created threads will wait in task_wait_to_return */
-           in_exec ? TPF_EXEC_COPY : TPF_NONE,                               /* Mark the task exec copy if in execve */
+           TF_NONE,
+           in_exec ? TPF_EXEC_COPY : TPF_NONE,                        /* Mark the task exec copy if in execve */
+           (TRW_LRETURNWAIT | TRW_LRETURNWAITER),                     /* All created threads will wait in task_wait_to_return */
            &child_task);
        if (result != KERN_SUCCESS) {
                printf("%s: task_create_internal failed.  Code: %d\n",
@@ -922,7 +923,7 @@ fork(proc_t parent_proc, __unused struct fork_args *uap, int32_t *retval)
 #endif
 
                /* "Return" to the child */
-               task_clear_return_wait(get_threadtask(child_thread));
+               task_clear_return_wait(get_threadtask(child_thread), TCRW_CLEAR_ALL_WAIT);
 
                /* drop the extra references we got during the creation */
                if ((child_task = (task_t)get_threadtask(child_thread)) != NULL) {
@@ -1107,9 +1108,6 @@ forkproc_free(proc_t p)
                p->p_textvp = NULL;
        }
 
-       /* Stop the profiling clock */
-       stopprofclock(p);
-
        /* Update the audit session proc count */
        AUDIT_SESSION_PROCEXIT(p);
 
@@ -1246,7 +1244,6 @@ retry:
        }
        nprocs++;
        child_proc->p_pid = nextpid;
-       child_proc->p_responsible_pid = nextpid;        /* initially responsible for self */
        child_proc->p_idversion = OSIncrementAtomic(&nextpidversion);
        /* kernel process is handcrafted and not from fork, so start from 1 */
        child_proc->p_uniqueid = ++nextuniqueid;
@@ -1282,7 +1279,7 @@ retry:
         * for insertion to hash.  Copy the section that is to be copied
         * directly from the parent.
         */
-       bcopy(&parent_proc->p_startcopy, &child_proc->p_startcopy,
+       __nochk_bcopy(&parent_proc->p_startcopy, &child_proc->p_startcopy,
            (unsigned) ((caddr_t)&child_proc->p_endcopy - (caddr_t)&child_proc->p_startcopy));
 
        /*
@@ -1296,12 +1293,11 @@ retry:
 #else /*  !CONFIG_EMBEDDED */
        child_proc->p_flag = (parent_proc->p_flag & (P_LP64 | P_DISABLE_ASLR | P_SUGID));
 #endif /* !CONFIG_EMBEDDED */
-       if (parent_proc->p_flag & P_PROFIL) {
-               startprofclock(child_proc);
-       }
 
        child_proc->p_vfs_iopolicy = (parent_proc->p_vfs_iopolicy & (P_VFS_IOPOLICY_VALID_MASK));
 
+       child_proc->p_responsible_pid = parent_proc->p_responsible_pid;
+
        /*
         * Note that if the current thread has an assumed identity, this
         * credential will be granted to the new process.
@@ -1414,6 +1410,7 @@ retry:
        }
        child_proc->p_dispatchqueue_offset = parent_proc->p_dispatchqueue_offset;
        child_proc->p_dispatchqueue_serialno_offset = parent_proc->p_dispatchqueue_serialno_offset;
+       child_proc->p_dispatchqueue_label_offset = parent_proc->p_dispatchqueue_label_offset;
        child_proc->p_return_to_kernel_offset = parent_proc->p_return_to_kernel_offset;
        child_proc->p_mach_thread_self_offset = parent_proc->p_mach_thread_self_offset;
        child_proc->p_pth_tsd_offset = parent_proc->p_pth_tsd_offset;
@@ -1437,12 +1434,14 @@ retry:
        child_proc->p_memstat_state = 0;
        child_proc->p_memstat_effectivepriority = JETSAM_PRIORITY_DEFAULT;
        child_proc->p_memstat_requestedpriority = JETSAM_PRIORITY_DEFAULT;
+       child_proc->p_memstat_assertionpriority = 0;
        child_proc->p_memstat_userdata          = 0;
        child_proc->p_memstat_idle_start        = 0;
        child_proc->p_memstat_idle_delta        = 0;
        child_proc->p_memstat_memlimit          = 0;
        child_proc->p_memstat_memlimit_active   = 0;
        child_proc->p_memstat_memlimit_inactive = 0;
+       child_proc->p_memstat_relaunch_flags    = P_MEMSTAT_RELAUNCH_UNKNOWN;
 #if CONFIG_FREEZE
        child_proc->p_memstat_freeze_sharedanon_pages = 0;
 #endif
index dc29cb531eede2195b58b51fd4aec31488535139..c78c646733079f72da166751a21ef17c7df46ab1 100644
@@ -104,6 +104,10 @@ struct gfp_crarg {
        u_int gca_attrs;
 };
 
+#ifdef OS_REFCNT_DEBUG
+extern struct os_refgrp f_iocount_refgrp;
+#endif
+
 static struct fileproc *
 guarded_fileproc_alloc_init(void *crarg)
 {
@@ -115,7 +119,11 @@ guarded_fileproc_alloc_init(void *crarg)
        }
 
        bzero(gfp, sizeof(*gfp));
-       gfp->gf_fileproc.f_flags = FTYPE_GUARDED;
+
+       struct fileproc *fp = &gfp->gf_fileproc;
+       os_ref_init(&fp->f_iocount, &f_iocount_refgrp);
+       fp->f_flags = FTYPE_GUARDED;
+
        gfp->gf_magic = GUARDED_FILEPROC_MAGIC;
        gfp->gf_guard = aarg->gca_guard;
        gfp->gf_attrs = aarg->gca_attrs;
@@ -172,7 +180,7 @@ fp_lookup_guarded(proc_t p, int fd, guardid_t guard,
  * if (FP_ISGUARDED(fp, GUARD_CLOSE)) {
  *      error = fp_guard_exception(p, fd, fp, kGUARD_EXC_CLOSE);
  *      proc_fdunlock(p);
- *      return (error);
+ *      return error;
  * }
  */
 
@@ -211,7 +219,7 @@ fp_guard_exception(proc_t p, int fd, struct fileproc *fp, u_int flavor)
        mach_exception_subcode_t subcode = gfp->gf_guard;
 
        thread_t t = current_thread();
-       thread_guard_violation(t, code, subcode);
+       thread_guard_violation(t, code, subcode, TRUE);
        return EPERM;
 }
 
@@ -413,7 +421,7 @@ guarded_kqueue_np(proc_t p, struct guarded_kqueue_np_args *uap, int32_t *retval)
                return EINVAL;
        }
 
-       return kqueue_body(p, guarded_fileproc_alloc_init, &crarg, retval);
+       return kqueue_internal(p, guarded_fileproc_alloc_init, &crarg, retval);
 }
 
 /*
@@ -636,14 +644,14 @@ restart:
                        proc_fdlock(p);
 
                        switch (error = fp_tryswap(p, fd, nfp)) {
-                       case 0: /* guarded-ness comes with side-effects */
+                       case 0: /* success; guarded-ness comes with side-effects */
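+                               /*
+                                * On success the old fileproc is no longer ours
+                                * to free; clear the local pointer so it is not
+                                * freed again below.
+                                */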
+                               fp = NULL;
                                gfp = FP_TO_GFP(nfp);
                                if (gfp->gf_attrs & GUARD_CLOSE) {
                                        FDFLAGS_SET(p, fd, UF_FORKCLOSE);
                                }
                                FDFLAGS_SET(p, fd, UF_EXCLOSE);
                                (void) fp_drop(p, fd, nfp, 1);
-                               fileproc_free(fp);
                                break;
                        case EKEEPLOOKING: /* f_iocount indicates a collision */
                                (void) fp_drop(p, fd, fp, 1);
@@ -688,7 +696,8 @@ restart:
                        proc_fdlock(p);
 
                        switch (error = fp_tryswap(p, fd, nfp)) {
-                       case 0: /* undo side-effects of guarded-ness */
+                       case 0: /* success; undo side-effects of guarded-ness */
+                               fp = NULL;
                                FDFLAGS_CLR(p, fd, UF_FORKCLOSE | UF_EXCLOSE);
                                FDFLAGS_SET(p, fd,
                                    (nfdflags & FD_CLOFORK) ? UF_FORKCLOSE : 0);
@@ -696,7 +705,6 @@ restart:
                                FDFLAGS_SET(p, fd,
                                    (nfdflags & FD_CLOEXEC) ? UF_EXCLOSE : 0);
                                (void) fp_drop(p, fd, nfp, 1);
-                               fileproc_free(fp);
                                break;
                        case EKEEPLOOKING: /* f_iocount indicates collision */
                                (void) fp_drop(p, fd, fp, 1);
@@ -1077,6 +1085,59 @@ vng_lbl_set(struct label *label, void *data)
        mac_label_set(label, label_slot, (intptr_t)data);
 }
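+/*
+ * Return the guard attributes recorded on the vnode backing vga->vga_fd,
+ * but only when the caller supplies the matching guard value: a guard
+ * mismatch fails with EPERM; non-vnode or non-regular files with EBADF.
+ */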
 
+static int
+vnguard_sysc_getguardattr(proc_t p, struct vnguard_getattr *vga)
+{
+       const int fd = vga->vga_fd;
+
+       if (0 == vga->vga_guard) {
+               return EINVAL;
+       }
+
+       int error;
+       struct fileproc *fp;
+       if (0 != (error = fp_lookup(p, fd, &fp, 0))) {
+               return error;
+       }
+       do {
+               struct fileglob *fg = fp->f_fglob;
+               if (FILEGLOB_DTYPE(fg) != DTYPE_VNODE) {
+                       error = EBADF;
+                       break;
+               }
+               struct vnode *vp = fg->fg_data;
+               if (!vnode_isreg(vp) || NULL == vp->v_mount) {
+                       error = EBADF;
+                       break;
+               }
+               error = vnode_getwithref(vp);
+               if (0 != error) {
+                       break;
+               }
+
+               vga->vga_attrs = 0;
+
+               lck_rw_lock_shared(&llock);
+
+               if (NULL != vp->v_label) {
+                       const struct vng_info *vgi = vng_lbl_get(vp->v_label);
+                       if (NULL != vgi) {
+                               if (vgi->vgi_guard != vga->vga_guard) {
+                                       error = EPERM;
+                               } else {
+                                       vga->vga_attrs = vgi->vgi_attrs;
+                               }
+                       }
+               }
+
+               lck_rw_unlock_shared(&llock);
+               vnode_put(vp);
+       } while (0);
+
+       fp_drop(p, fd, fp, 0);
+       return error;
+}
+
 static int
 vnguard_sysc_setguard(proc_t p, const struct vnguard_set *vns)
 {
@@ -1122,9 +1183,9 @@ vnguard_sysc_setguard(proc_t p, const struct vnguard_set *vns)
                }
                error = vnode_getwithref(vp);
                if (0 != error) {
-                       fp_drop(p, fd, fp, 0);
                        break;
                }
+
                /* Ensure the target vnode -has- a label */
                struct vfs_context *ctx = vfs_context_current();
                mac_vnode_label_update(ctx, vp, NULL);
@@ -1165,7 +1226,16 @@ vnguard_sysc_setguard(proc_t p, const struct vnguard_set *vns)
                                if (vgi->vgi_guard != vns->vns_guard) {
                                        error = EPERM; /* guard mismatch */
                                } else if (vgi->vgi_attrs != vns->vns_attrs) {
-                                       error = EACCES; /* attr mismatch */
+                                       /*
+                                        * Temporary workaround for older versions of SQLite:
+                                        * allow newer guard attributes to be silently cleared.
+                                        */
+                                       const unsigned mask = ~(VNG_WRITE_OTHER | VNG_TRUNC_OTHER);
+                                       if ((vgi->vgi_attrs & mask) == (vns->vns_attrs & mask)) {
+                                               vgi->vgi_attrs &= vns->vns_attrs;
+                                       } else {
+                                               error = EACCES; /* attr mismatch */
+                                       }
                                }
                                if (0 != error || NULL != vgo) {
                                        free_vgo(nvgo);
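
Concretely, the workaround above ignores only VNG_WRITE_OTHER and
VNG_TRUNC_OTHER when comparing attribute sets, so a re-guard that merely
drops those two bits succeeds and clears them in place, while any other
difference still fails. A worked example (the specific attribute values
are illustrative):

        /* existing guard: VNG_RENAME_TO | VNG_WRITE_OTHER           */
        /* older SQLite re-guards with just VNG_RENAME_TO            */
        const unsigned mask = ~(VNG_WRITE_OTHER | VNG_TRUNC_OTHER);
        /* (old & mask) == (new & mask) -- both reduce to VNG_RENAME_TO,
         * so vgi_attrs &= vns_attrs silently drops VNG_WRITE_OTHER.
         * A request differing in any other bit (say VNG_UNLINK) would
         * still fail the masked comparison and return EACCES.       */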
@@ -1205,6 +1275,19 @@ vng_policy_syscall(proc_t p, int cmd, user_addr_t arg)
                error = vnguard_sysc_setguard(p, &vns);
                break;
        }
+       case VNG_SYSC_GET_ATTR: {
+               struct vnguard_getattr vga;
+               error = copyin(arg, (void *)&vga, sizeof(vga));
+               if (error) {
+                       break;
+               }
+               error = vnguard_sysc_getguardattr(p, &vga);
+               if (error) {
+                       break;
+               }
+               error = copyout((void *)&vga, arg, sizeof(vga));
+               break;
+       }
        default:
                break;
        }
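
As a usage sketch only: userspace would reach the new VNG_SYSC_GET_ATTR
command through the MAC policy syscall path, roughly as below (the
"vnguard" policy-name string is an assumption here, as is the header
providing the struct):

        #include <security/mac.h>       /* __mac_syscall() */

        struct vnguard_getattr vga = {
                .vga_fd    = fd,        /* guarded descriptor     */
                .vga_guard = guard,     /* must match, else EPERM */
        };
        if (__mac_syscall("vnguard", VNG_SYSC_GET_ATTR, &vga) == 0) {
                /* vga.vga_attrs now holds the guard's attribute bits */
        }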
@@ -1281,6 +1364,11 @@ vng_reason_from_pathname(const char *path, uint32_t pathlen)
 
 static int vng_policy_flags;
 
+/*
+ * Note: if an EXC_GUARD is generated, llock will be dropped and
+ * subsequently reacquired by this routine. Data derived from
+ * any label in the caller should be regenerated.
+ */
 static int
 vng_guard_violation(const struct vng_info *vgi,
     unsigned opval, vnode_t vp)
@@ -1364,6 +1452,8 @@ vng_guard_violation(const struct vng_info *vgi,
                EXC_GUARD_ENCODE_TARGET(code, pid);
                subcode = vgi->vgi_guard;
 
+               lck_rw_unlock_shared(&llock);
+
                if (vng_policy_flags & kVNG_POLICY_EXC_CORPSE) {
                        char *path;
                        int len = MAXPATHLEN;
@@ -1384,8 +1474,10 @@ vng_guard_violation(const struct vng_info *vgi,
                        }
                } else {
                        thread_t t = current_thread();
-                       thread_guard_violation(t, code, subcode);
+                       thread_guard_violation(t, code, subcode, TRUE);
                }
+
+               lck_rw_lock_shared(&llock);
        } else if (vng_policy_flags & kVNG_POLICY_SIGKILL) {
                proc_t p = current_proc();
                psignal(p, SIGKILL);
@@ -1614,7 +1706,7 @@ SECURITY_READ_ONLY_LATE(static struct mac_policy_conf) vng_policy_conf = {
        .mpc_runtime_flags = 0
 };
 
-static mac_policy_handle_t vng_policy_handle;
+SECURITY_READ_ONLY_LATE(static mac_policy_handle_t) vng_policy_handle;
 
 void
 vnguard_policy_init(void)
index 92ecc8164f6411317ef5b90c9172f047c43e51d7..21edbc5d97dc9631b1328e0d26de7c61fcd3aab9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 #include <sys/file_internal.h>
 
-/*
- * This variable controls the maximum number of processes that will
- * be checked in doing deadlock detection.
- */
-static int maxlockdepth = MAXDEPTH;
-
 #if (DEVELOPMENT || DEBUG)
 #define LOCKF_DEBUGGING 1
 #endif
@@ -99,6 +93,7 @@ void lf_printlist(const char *tag, struct lockf *lock);
 #define LF_DBG_LIST     (1 << 1)        /* split, coalesce */
 #define LF_DBG_IMPINH   (1 << 2)        /* importance inheritance */
 #define LF_DBG_TRACE    (1 << 3)        /* errors, exit */
+#define LF_DBG_DEADLOCK (1 << 4)        /* deadlock detection */
 
 static int      lockf_debug = 0;        /* was 2, could be 3 ;-) */
 SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");
@@ -109,10 +104,16 @@ SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_de
  */
 #define LOCKF_DEBUG(mask, ...)                                  \
        do {                                                    \
-               if( !(mask) || ((mask) & lockf_debug)) {        \
+               if (!(mask) || ((mask) & lockf_debug)) {        \
+                       printf("%s>", __FUNCTION__);            \
                        printf(__VA_ARGS__);                    \
                }                                               \
        } while(0)
+
+#define LOCKF_DEBUGP(mask)                                      \
+       ({                                                      \
+               ((mask) & lockf_debug);                         \
+       })
 #else   /* !LOCKF_DEBUGGING */
 #define LOCKF_DEBUG(mask, ...)          /* mask */
 #endif  /* !LOCKF_DEBUGGING */
@@ -503,11 +504,12 @@ lf_setlock(struct lockf *lock, struct timespec *timeout)
        overlap_t ovcase;
 
 #ifdef LOCKF_DEBUGGING
-       if (lockf_debug & LF_DBG_LOCKOP) {
+       if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                lf_print("lf_setlock", lock);
                lf_printlist("lf_setlock(in)", lock);
        }
 #endif /* LOCKF_DEBUGGING */
+       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p Looking for deadlock, vnode %p\n", lock, lock->lf_vnode);
 
        /*
         * Set the priority
@@ -517,6 +519,7 @@ lf_setlock(struct lockf *lock, struct timespec *timeout)
                priority += 4;
        }
        priority |= PCATCH;
+scan:
        /*
         * Scan lock list for this file looking for locks that would block us.
         */
@@ -530,6 +533,8 @@ lf_setlock(struct lockf *lock, struct timespec *timeout)
                        return EAGAIN;
                }
 
+               LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p found blocking lock %p\n", lock, block);
+
                /*
                 * We are blocked. Since flock style locks cover
                 * the whole file, there is no chance for deadlock.
@@ -541,36 +546,59 @@ lf_setlock(struct lockf *lock, struct timespec *timeout)
                 *
                 * Deadlock detection is done by looking through the
                 * wait channels to see if there are any cycles that
-                * involve us. MAXDEPTH is set just to make sure we
-                * do not go off into neverland.
+                * involve us.
                 */
                if ((lock->lf_flags & F_POSIX) &&
                    (block->lf_flags & F_POSIX)) {
-                       struct proc *wproc, *bproc;
+                       struct proc *wproc;
                        struct uthread *ut;
-                       struct lockf *waitblock;
-                       int i = 0;
 
                        /* The block is waiting on something */
                        wproc = block->lf_owner;
                        proc_lock(wproc);
+                       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(wproc));
                        TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
                                /*
-                                * While the thread is asleep (uu_wchan != 0)
+                                * If the thread is asleep (uu_wchan != 0)
                                 * in this code (uu_wmesg == lockstr)
-                                * and we have not exceeded the maximum cycle
-                                * depth (i < maxlockdepth), then check for a
-                                * cycle to see if the lock is blocked behind
+                                * then check whether the lock is blocked behind
                                 * someone blocked behind us.
                                 */
-                               while (((waitblock = (struct lockf *)ut->uu_wchan) != NULL) &&
-                                   ut->uu_wmesg == lockstr &&
-                                   (i++ < maxlockdepth)) {
-                                       waitblock = (struct lockf *)ut->uu_wchan;
+                               if ((ut->uu_wchan != NULL) && (ut->uu_wmesg == lockstr)) {
+                                       struct lockf *waitblock = (struct lockf *)ut->uu_wchan;
+                                       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode);
+
+                                       vnode_t othervp = NULL;
+                                       if (waitblock->lf_vnode != vp) {
+                                               /*
+                                                * This thread in wproc is waiting for a lock
+                                                * on a different vnode; grab the lock on it
+                                                * that protects lf_next while we examine it.
+                                                */
+                                               othervp = waitblock->lf_vnode;
+                                               if (!lck_mtx_try_lock(&othervp->v_lock)) {
+                                                       /*
+                                                        * avoid kernel deadlock: drop all
+                                                        * locks, pause for a bit to let the
+                                                        * other thread do what it needs to do,
+                                                        * then (because we drop and retake
+                                                        * v_lock) retry the scan.
+                                                        */
+                                                       proc_unlock(wproc);
+                                                       static struct timespec ts = {
+                                                               .tv_sec = 0,
+                                                               .tv_nsec = 10 * NSEC_PER_MSEC,
+                                                       };
+                                                       (void) msleep(lock, &vp->v_lock, priority, lockstr, &ts);
+                                                       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p contention for vp %p => restart\n", lock, othervp);
+                                                       goto scan;
+                                               }
+                                       }
+
                                        /*
                                         * Get the lock blocking the lock
                                         * which would block us, and make
-                                        * certain it hasn't come unblocked
+                                        * certain it hasn't become unblocked
                                         * (been granted, e.g. between the time
                                         * we called lf_getblock, and the time
                                         * we successfully acquired the
@@ -578,8 +606,13 @@ lf_setlock(struct lockf *lock, struct timespec *timeout)
                                         */
                                        waitblock = waitblock->lf_next;
                                        if (waitblock == NULL) {
-                                               break;
+                                               if (othervp) {
+                                                       lck_mtx_unlock(&othervp->v_lock);
+                                               }
+                                               LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p with no lf_next\n", lock);
+                                               continue;
                                        }
+                                       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode);
 
                                        /*
                                         * Make sure it's an advisory range
@@ -588,7 +621,10 @@ lf_setlock(struct lockf *lock, struct timespec *timeout)
                                         * fault.
                                         */
                                        if ((waitblock->lf_flags & F_POSIX) == 0) {
-                                               break;
+                                               if (othervp) {
+                                                       lck_mtx_unlock(&othervp->v_lock);
+                                               }
+                                               continue;
                                        }
 
                                        /*
@@ -597,13 +633,21 @@ lf_setlock(struct lockf *lock, struct timespec *timeout)
                                         * getting the requested lock, then we
                                         * would deadlock, so error out.
                                         */
-                                       bproc = waitblock->lf_owner;
-                                       if (bproc == lock->lf_owner) {
+                                       struct proc *bproc = waitblock->lf_owner;
+                                       const boolean_t deadlocked = bproc == lock->lf_owner;
+
+                                       if (othervp) {
+                                               lck_mtx_unlock(&othervp->v_lock);
+                                       }
+                                       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(bproc));
+                                       if (deadlocked) {
+                                               LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n", lock);
                                                proc_unlock(wproc);
                                                FREE(lock, M_LOCKF);
                                                return EDEADLK;
                                        }
                                }
+                               LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p bottom of thread loop\n", lock);
                        }
                        proc_unlock(wproc);
                }
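
The lck_mtx_try_lock()/msleep() dance above is a textbook lock-ordering
escape hatch: rather than blocking on the other vnode's v_lock (and
risking a lock-order deadlock inside the kernel itself), the scan backs
out completely, naps ~10ms, and restarts from the scan: label. The same
pattern in minimal userspace form (pthreads; names illustrative, not
from xnu):

        #include <pthread.h>
        #include <unistd.h>

        static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

        /* Take both mutexes without deadlocking against a thread
         * that acquires them in the opposite order. */
        static void
        lock_both(void)
        {
                for (;;) {
                        pthread_mutex_lock(&a);
                        if (pthread_mutex_trylock(&b) == 0) {
                                return;                   /* got both */
                        }
                        pthread_mutex_unlock(&a);         /* back out fully */
                        usleep(10 * 1000);                /* ~10ms backoff */
                        /* retry: the holder of b can now take a and finish */
                }
        }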
@@ -658,7 +702,7 @@ lf_setlock(struct lockf *lock, struct timespec *timeout)
 #endif /* IMPORTANCE_INHERITANCE */
 
 #ifdef LOCKF_DEBUGGING
-               if (lockf_debug & LF_DBG_LOCKOP) {
+               if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                        lf_print("lf_setlock: blocking on", block);
                        lf_printlist("lf_setlock(block)", block);
                }
@@ -853,7 +897,7 @@ lf_setlock(struct lockf *lock, struct timespec *timeout)
        /* Coalesce adjacent locks with identical attributes */
        lf_coalesce_adjacent(lock);
 #ifdef LOCKF_DEBUGGING
-       if (lockf_debug & LF_DBG_LOCKOP) {
+       if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                lf_print("lf_setlock: got the lock", lock);
                lf_printlist("lf_setlock(out)", lock);
        }
@@ -893,7 +937,7 @@ lf_clearlock(struct lockf *unlock)
        if (unlock->lf_type != F_UNLCK) {
                panic("lf_clearlock: bad type");
        }
-       if (lockf_debug & LF_DBG_LOCKOP) {
+       if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                lf_print("lf_clearlock", unlock);
        }
 #endif /* LOCKF_DEBUGGING */
@@ -952,7 +996,7 @@ lf_clearlock(struct lockf *unlock)
                break;
        }
 #ifdef LOCKF_DEBUGGING
-       if (lockf_debug & LF_DBG_LOCKOP) {
+       if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                lf_printlist("lf_clearlock", unlock);
        }
 #endif /* LOCKF_DEBUGGING */
@@ -988,7 +1032,7 @@ lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
        struct lockf *block;
 
 #ifdef LOCKF_DEBUGGING
-       if (lockf_debug & LF_DBG_LOCKOP) {
+       if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                lf_print("lf_getlock", lock);
        }
 #endif /* LOCKF_DEBUGGING */
@@ -1121,7 +1165,7 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
                return 0;
        }
 #ifdef LOCKF_DEBUGGING
-       if (lockf_debug & LF_DBG_LIST) {
+       if (LOCKF_DEBUGP(LF_DBG_LIST)) {
                lf_print("lf_findoverlap: looking for overlap in", lock);
        }
 #endif /* LOCKF_DEBUGGING */
@@ -1153,7 +1197,7 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
                }
 
 #ifdef LOCKF_DEBUGGING
-               if (lockf_debug & LF_DBG_LIST) {
+               if (LOCKF_DEBUGP(LF_DBG_LIST)) {
                        lf_print("\tchecking", lf);
                }
 #endif /* LOCKF_DEBUGGING */
@@ -1238,7 +1282,7 @@ lf_split(struct lockf *lock1, struct lockf *lock2)
        struct lockf *splitlock;
 
 #ifdef LOCKF_DEBUGGING
-       if (lockf_debug & LF_DBG_LIST) {
+       if (LOCKF_DEBUGP(LF_DBG_LIST)) {
                lf_print("lf_split", lock1);
                lf_print("splitting from", lock2);
        }
@@ -1314,7 +1358,7 @@ lf_wakelock(struct lockf *listhead, boolean_t force_all)
 
                wakelock->lf_next = NOLOCKF;
 #ifdef LOCKF_DEBUGGING
-               if (lockf_debug & LF_DBG_LOCKOP) {
+               if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                        lf_print("lf_wakelock: awakening", wakelock);
                }
 #endif /* LOCKF_DEBUGGING */
index 5c3410624cf99db7f39d92748d3646caab3403f9..15512dd418c87b9182dda9118f13bf5078d68a24 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -40,6 +40,7 @@
 
 #include <IOKit/IOBSD.h>
 
+#include <corpses/task_corpse.h>
 #include <libkern/libkern.h>
 #include <mach/coalition.h>
 #include <mach/mach_time.h>
 #include <sys/priv.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_protos.h>
+#include <mach/machine/sdt.h>
+#include <libkern/section_keywords.h>
+#include <stdatomic.h>
 
 #if CONFIG_FREEZE
 #include <vm/vm_map.h>
 #endif /* CONFIG_FREEZE */
 
 #include <sys/kern_memorystatus.h>
-
-#include <mach/machine/sdt.h>
-#include <libkern/section_keywords.h>
-#include <stdatomic.h>
+#include <sys/kern_memorystatus_freeze.h>
+#include <sys/kern_memorystatus_notify.h>
 
 /* For logging clarity */
 static const char *memorystatus_kill_cause_name[] = {
@@ -100,6 +102,8 @@ memorystatus_priority_band_name(int32_t priority)
                return "AUDIO_AND_ACCESSORY";
        case JETSAM_PRIORITY_CONDUCTOR:
                return "CONDUCTOR";
+       case JETSAM_PRIORITY_DRIVER_APPLE:
+               return "DRIVER_APPLE";
        case JETSAM_PRIORITY_HOME:
                return "HOME";
        case JETSAM_PRIORITY_EXECUTIVE:
@@ -149,18 +153,6 @@ extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity);
  */
 extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size);
 
-/* These are very verbose printfs(), enable with
- * MEMORYSTATUS_DEBUG_LOG
- */
-#if MEMORYSTATUS_DEBUG_LOG
-#define MEMORYSTATUS_DEBUG(cond, format, ...)      \
-do {                                              \
-       if (cond) { printf(format, ##__VA_ARGS__); } \
-} while(0)
-#else
-#define MEMORYSTATUS_DEBUG(cond, format, ...)
-#endif
-
 /*
  * Active / Inactive limit support
  * proc list must be locked
@@ -221,120 +213,147 @@ MACRO_END
 
 unsigned long delta_percentage = 5;
 unsigned long critical_threshold_percentage = 5;
+// On embedded devices with more than 3GB of memory, we lower the critical percentage.
+uint64_t config_jetsam_large_memory_cutoff = 3UL * (1UL << 30);
+unsigned long critical_threshold_percentage_larger_devices = 4;
+unsigned long delta_percentage_larger_devices = 4;
 unsigned long idle_offset_percentage = 5;
 unsigned long pressure_threshold_percentage = 15;
-unsigned long freeze_threshold_percentage = 50;
 unsigned long policy_more_free_offset_percentage = 5;
-
-/* General memorystatus stuff */
-
-struct klist memorystatus_klist;
-static lck_mtx_t memorystatus_klist_mutex;
-
-static void memorystatus_klist_lock(void);
-static void memorystatus_klist_unlock(void);
-
-static uint64_t memorystatus_sysprocs_idle_delay_time = 0;
-static uint64_t memorystatus_apps_idle_delay_time = 0;
+unsigned long sysproc_aging_aggr_threshold_percentage = 7;
 
 /*
- * Memorystatus kevents
+ * default jetsam snapshot support
  */
+memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
+memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_copy;
+unsigned int memorystatus_jetsam_snapshot_count = 0;
+unsigned int memorystatus_jetsam_snapshot_copy_count = 0;
+unsigned int memorystatus_jetsam_snapshot_max = 0;
+unsigned int memorystatus_jetsam_snapshot_size = 0;
+uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
+uint64_t memorystatus_jetsam_snapshot_timeout = 0;
 
-static int filt_memorystatusattach(struct knote *kn, struct kevent_internal_s *kev);
-static void filt_memorystatusdetach(struct knote *kn);
-static int filt_memorystatus(struct knote *kn, long hint);
-static int filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_memorystatusprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
-
-SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
-       .f_attach = filt_memorystatusattach,
-       .f_detach = filt_memorystatusdetach,
-       .f_event = filt_memorystatus,
-       .f_touch = filt_memorystatustouch,
-       .f_process = filt_memorystatusprocess,
-};
+/* General memorystatus stuff */
 
-enum {
-       kMemorystatusNoPressure = 0x1,
-       kMemorystatusPressure = 0x2,
-       kMemorystatusLowSwap = 0x4,
-       kMemorystatusProcLimitWarn = 0x8,
-       kMemorystatusProcLimitCritical = 0x10
-};
+uint64_t memorystatus_sysprocs_idle_delay_time = 0;
+uint64_t memorystatus_apps_idle_delay_time = 0;
+
+static lck_grp_attr_t *memorystatus_jetsam_fg_band_lock_grp_attr;
+static lck_grp_t *memorystatus_jetsam_fg_band_lock_grp;
+lck_mtx_t memorystatus_jetsam_fg_band_lock;
 
 /* Idle guard handling */
 
 static int32_t memorystatus_scheduled_idle_demotions_sysprocs = 0;
 static int32_t memorystatus_scheduled_idle_demotions_apps = 0;
 
-static thread_call_t memorystatus_idle_demotion_call;
-
 static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
 static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state);
-static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state);
 static void memorystatus_reschedule_idle_demotion_locked(void);
-
-static void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check);
-
 int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap);
-
 vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
-
 boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
 void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
 void memorystatus_send_low_swap_note(void);
+int memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index);
+boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count,
+    uint32_t *errors, uint64_t *memory_reclaimed);
+uint64_t memorystatus_available_memory_internal(proc_t p);
 
 unsigned int memorystatus_level = 0;
-
 static int memorystatus_list_count = 0;
-
-
-#define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1)
-
-typedef struct memstat_bucket {
-       TAILQ_HEAD(, proc) list;
-       int count;
-} memstat_bucket_t;
-
 memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
-
-int memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index);
-
+static thread_call_t memorystatus_idle_demotion_call;
 uint64_t memstat_idle_demotion_deadline = 0;
-
 int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
 int applications_aging_band = JETSAM_PRIORITY_IDLE;
 
 #define isProcessInAgingBands(p)        ((isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) || (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)))
 
-/*
- * Checking the p_memstat_state almost always requires the proc_list_lock
- * because the jetsam thread could be on the other core changing the state.
- *
- * App -- almost always managed by a system process. Always have dirty tracking OFF. Can include extensions too.
- * System Processes -- not managed by anybody. Always have dirty tracking ON. Can include extensions (here) too.
- */
-#define isApp(p)                        ((p->p_memstat_state & P_MEMSTAT_MANAGED) || ! (p->p_memstat_dirty & P_DIRTY_TRACK))
-#define isSysProc(p)                    ( ! (p->p_memstat_state & P_MEMSTAT_MANAGED) || (p->p_memstat_dirty & P_DIRTY_TRACK))
-
 #define kJetsamAgingPolicyNone                          (0)
 #define kJetsamAgingPolicyLegacy                        (1)
 #define kJetsamAgingPolicySysProcsReclaimedFirst        (2)
 #define kJetsamAgingPolicyAppsReclaimedFirst            (3)
 #define kJetsamAgingPolicyMax                           kJetsamAgingPolicyAppsReclaimedFirst
 
-unsigned int jetsam_aging_policy = kJetsamAgingPolicyLegacy;
+unsigned int jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst;
 
 extern int corpse_for_fatal_memkill;
-extern unsigned long total_corpses_count(void) __attribute__((pure));
-extern void task_purge_all_corpses(void);
 extern uint64_t vm_purgeable_purge_task_owned(task_t task);
 boolean_t memorystatus_allowed_vm_map_fork(task_t);
 #if DEVELOPMENT || DEBUG
 void memorystatus_abort_vm_map_fork(task_t);
 #endif
 
+/*
+ * Idle delay timeout factors for daemons based on relaunch behavior. Only used in
+ * kJetsamAgingPolicySysProcsReclaimedFirst aging policy.
+ */
+#define kJetsamSysProcsIdleDelayTimeLowRatio    (5)
+#define kJetsamSysProcsIdleDelayTimeMedRatio    (2)
+#define kJetsamSysProcsIdleDelayTimeHighRatio   (1)
+static_assert(kJetsamSysProcsIdleDelayTimeLowRatio <= DEFERRED_IDLE_EXIT_TIME_SECS, "sysproc idle delay time for low relaunch daemons would be 0");
+
+/*
+ * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, treat apps as well
+ * behaved daemons for aging purposes.
+ */
+#define kJetsamAppsIdleDelayTimeRatio   (kJetsamSysProcsIdleDelayTimeLowRatio)
+
+static uint64_t
+memorystatus_sysprocs_idle_time(proc_t p)
+{
+       /*
+        * The kJetsamAgingPolicySysProcsReclaimedFirst aging policy uses the relaunch behavior to
+        * determine the exact idle deferred time provided to the daemons. For all other aging
+        * policies, simply return the default aging idle time.
+        */
+       if (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst) {
+               return memorystatus_sysprocs_idle_delay_time;
+       }
+
+       uint64_t idle_delay_time = 0;
+       /*
+        * For system processes, base the idle delay time on the
+        * jetsam relaunch behavior specified by launchd. The idea
+        * is to provide extra protection to the daemons which would
+        * relaunch immediately after jetsam.
+        */
+       switch (p->p_memstat_relaunch_flags) {
+       case P_MEMSTAT_RELAUNCH_UNKNOWN:
+       case P_MEMSTAT_RELAUNCH_LOW:
+               idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeLowRatio;
+               break;
+       case P_MEMSTAT_RELAUNCH_MED:
+               idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeMedRatio;
+               break;
+       case P_MEMSTAT_RELAUNCH_HIGH:
+               idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeHighRatio;
+               break;
+       default:
+               panic("Unknown relaunch flags on process!");
+               break;
+       }
+       return idle_delay_time;
+}
+
+static uint64_t
+memorystatus_apps_idle_time(__unused proc_t p)
+{
+       /*
+        * For kJetsamAgingPolicySysProcsReclaimedFirst, apps are treated as low
+        * relaunch candidates, so they are only given limited protection. Under the
+        * other aging policies, return the default aging idle time.
+        */
+       if (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst) {
+               return memorystatus_apps_idle_delay_time;
+       }
+
+       return memorystatus_apps_idle_delay_time / kJetsamAppsIdleDelayTimeRatio;
+}
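
Working the ratios through with a concrete base (10 seconds, chosen
purely for illustration; the real base derives from
DEFERRED_IDLE_EXIT_TIME_SECS, and the static_assert above guarantees the
low-ratio division can never truncate to zero):

        relaunch UNKNOWN/LOW:  10s / 5 = 2s  of idle-band protection
        relaunch MED:          10s / 2 = 5s
        relaunch HIGH:         10s / 1 = 10s (most protection)
        apps:                  10s / 5 = 2s  (always treated as low relaunch)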
+
+
 #if 0
 
 /* Keeping around for future use if we need a utility that can do this OR an app that needs a dynamic adjustment. */
@@ -518,6 +537,103 @@ static unsigned int memorystatus_dirty_count = 0;
 
 SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, &max_task_footprint_mb, 0, "");
 
+static int memorystatus_highwater_enabled = 1;  /* Update the cached memlimit data. */
+static boolean_t proc_jetsam_state_is_active_locked(proc_t);
+
+#if __arm64__
+#if CONFIG_MEMORYSTATUS
+int legacy_footprint_bonus_mb = 50; /* This value was chosen after looking at the top 30 apps
+                                     * that needed the additional room in their footprint when
+                                     * the 'correct' accounting methods were applied to them.
+                                     */
+
+#if DEVELOPMENT || DEBUG
+SYSCTL_INT(_kern, OID_AUTO, legacy_footprint_bonus_mb, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_footprint_bonus_mb, 0, "");
+#endif /* DEVELOPMENT || DEBUG */
+
+void
+memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase)
+{
+       int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
+       boolean_t memlimit_active_is_fatal = FALSE, memlimit_inactive_is_fatal = FALSE, use_active_limit = FALSE;
+
+       if (p == NULL) {
+               return;
+       }
+
+       proc_list_lock();
+
+       if (p->p_memstat_memlimit_active > 0) {
+               memlimit_mb_active = p->p_memstat_memlimit_active;
+       } else if (p->p_memstat_memlimit_active == -1) {
+               memlimit_mb_active = max_task_footprint_mb;
+       } else {
+               /*
+                * Nothing to do for '0' which is
+                * a special value only used internally
+                * to test 'no limits'.
+                */
+               proc_list_unlock();
+               return;
+       }
+
+       if (p->p_memstat_memlimit_inactive > 0) {
+               memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
+       } else if (p->p_memstat_memlimit_inactive == -1) {
+               memlimit_mb_inactive = max_task_footprint_mb;
+       } else {
+               /*
+                * Nothing to do for '0' which is
+                * a special value only used internally
+                * to test 'no limits'.
+                */
+               proc_list_unlock();
+               return;
+       }
+
+       if (footprint_increase) {
+               memlimit_mb_active += legacy_footprint_bonus_mb;
+               memlimit_mb_inactive += legacy_footprint_bonus_mb;
+       } else {
+               memlimit_mb_active -= legacy_footprint_bonus_mb;
+               if (memlimit_mb_active == max_task_footprint_mb) {
+                       memlimit_mb_active = -1; /* reverting back to default system limit */
+               }
+
+               memlimit_mb_inactive -= legacy_footprint_bonus_mb;
+               if (memlimit_mb_inactive == max_task_footprint_mb) {
+                       memlimit_mb_inactive = -1; /* reverting back to default system limit */
+               }
+       }
+
+       memlimit_active_is_fatal = (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL);
+       memlimit_inactive_is_fatal = (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL);
+
+       SET_ACTIVE_LIMITS_LOCKED(p, memlimit_mb_active, memlimit_active_is_fatal);
+       SET_INACTIVE_LIMITS_LOCKED(p, memlimit_mb_inactive, memlimit_inactive_is_fatal);
+
+       if (proc_jetsam_state_is_active_locked(p) == TRUE) {
+               use_active_limit = TRUE;
+               CACHE_ACTIVE_LIMITS_LOCKED(p, memlimit_active_is_fatal);
+       } else {
+               CACHE_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive_is_fatal);
+       }
+
+
+       if (memorystatus_highwater_enabled) {
+               task_set_phys_footprint_limit_internal(p->task,
+                   (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
+                   NULL,                /* return old value */
+                   use_active_limit,    /* active limit? */
+                   (use_active_limit ? memlimit_active_is_fatal : memlimit_inactive_is_fatal));
+       }
+
+       proc_list_unlock();
+}
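
Numerically, with legacy_footprint_bonus_mb at its default of 50 (the
limit values below are illustrative):

        entitlement granted:  active/inactive limits 150 MB -> 200 MB
        entitlement revoked:  200 MB -> 150 MB; if the result lands exactly
                              on max_task_footprint_mb, the limit is stored
                              as -1, i.e. it reverts to the system default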
+
+#endif /* CONFIG_MEMORYSTATUS */
+#endif /* __arm64__ */
+
 #if CONFIG_EMBEDDED
 
 SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_level, 0, "");
@@ -538,16 +654,10 @@ memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_ar
        return 0;
 }
 
-static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search);
-static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search);
-
 static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);
 
 /* Memory Limits */
 
-static int memorystatus_highwater_enabled = 1;  /* Update the cached memlimit data. */
-
-static boolean_t proc_jetsam_state_is_active_locked(proc_t);
 static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
 static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
 
@@ -560,6 +670,9 @@ static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffe
 
 static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
 
+static void memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry);
+static int memorystatus_set_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry);
+
 int proc_get_memstat_priority(proc_t, boolean_t);
 
 static boolean_t memorystatus_idle_snapshot = 0;
@@ -601,20 +714,6 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_priority_band_max,
 
 static uint32_t kill_under_pressure_cause = 0;
 
-/*
- * default jetsam snapshot support
- */
-static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
-static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_copy;
-#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries
-static unsigned int memorystatus_jetsam_snapshot_count = 0;
-static unsigned int memorystatus_jetsam_snapshot_copy_count = 0;
-static unsigned int memorystatus_jetsam_snapshot_max = 0;
-static unsigned int memorystatus_jetsam_snapshot_size = 0;
-static uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
-static uint64_t memorystatus_jetsam_snapshot_timeout = 0;
-#define JETSAM_SNAPSHOT_TIMEOUT_SECS 30
-
 /*
  * snapshot support for memstats collected at boot.
  */
@@ -625,7 +724,6 @@ static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memory
 static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime);
 
 static void memorystatus_clear_errors(void);
-static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages);
 static void memorystatus_get_task_phys_footprint_page_counts(task_t task,
     uint64_t *internal_pages, uint64_t *internal_compressed_pages,
     uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
@@ -637,10 +735,10 @@ static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *cou
 static uint32_t memorystatus_build_state(proc_t p);
 //static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);
 
-static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority, uint32_t *errors);
-static boolean_t memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors);
-static boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors);
-static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged);
+static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority,
+    uint32_t *errors, uint64_t *memory_reclaimed);
+static boolean_t memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors, uint64_t *memory_reclaimed);
+static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed);
 
 static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);
 
@@ -665,14 +763,18 @@ extern unsigned int    vm_page_purgeable_count;
 extern unsigned int    vm_page_wire_count;
 #if CONFIG_SECLUDED_MEMORY
 extern unsigned int     vm_page_secluded_count;
+extern unsigned int     vm_page_secluded_count_over_target;
 #endif /* CONFIG_SECLUDED_MEMORY */
 
+/* Aggressive jetsam pages threshold for sysproc aging policy */
+unsigned int memorystatus_sysproc_aging_aggr_pages = 0;
+
 #if CONFIG_JETSAM
 unsigned int memorystatus_available_pages = (unsigned int)-1;
 unsigned int memorystatus_available_pages_pressure = 0;
 unsigned int memorystatus_available_pages_critical = 0;
-static unsigned int memorystatus_available_pages_critical_base = 0;
-static unsigned int memorystatus_available_pages_critical_idle_offset = 0;
+unsigned int memorystatus_available_pages_critical_base = 0;
+unsigned int memorystatus_available_pages_critical_idle_offset = 0;
 
 #if DEVELOPMENT || DEBUG
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
@@ -688,6 +790,15 @@ static unsigned int memorystatus_thread_wasted_wakeup = 0;
 /* Callback into vm_compressor.c to signal that thrashing has been mitigated. */
 extern void vm_thrashing_jetsam_done(void);
 static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);
+#if DEVELOPMENT || DEBUG
+static inline uint32_t
+roundToNearestMB(uint32_t in)
+{
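+       /* Converts a byte count to MB, rounding up (ceiling divide by 1 MiB). */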
+       return (in + ((1 << 20) - 1)) >> 20;
+}
+
+static int memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase);
+#endif
 
 int32_t max_kill_priority = JETSAM_PRIORITY_MAX;
 
@@ -700,56 +811,6 @@ uint64_t memorystatus_available_pages_critical = (uint64_t)-1;
 int32_t max_kill_priority = JETSAM_PRIORITY_IDLE;
 #endif /* CONFIG_JETSAM */
 
-unsigned int memorystatus_frozen_count = 0;
-unsigned int memorystatus_frozen_processes_max = 0;
-unsigned int memorystatus_frozen_shared_mb = 0;
-unsigned int memorystatus_frozen_shared_mb_max = 0;
-unsigned int memorystatus_freeze_shared_mb_per_process_max = 0; /* Max. MB allowed per process to be freezer-eligible. */
-unsigned int memorystatus_freeze_private_shared_pages_ratio = 2; /* Ratio of private:shared pages for a process to be freezer-eligible. */
-unsigned int memorystatus_suspended_count = 0;
-unsigned int memorystatus_thaw_count = 0;
-unsigned int memorystatus_refreeze_eligible_count = 0; /* # of processes currently thawed i.e. have state on disk & in-memory */
-
-#if VM_PRESSURE_EVENTS
-
-boolean_t memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t exceeded);
-
-vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;
-
-/*
- * We use this flag to signal if we have any HWM offenders
- * on the system. This way we can reduce the number of wakeups
- * of the memorystatus_thread when the system is between the
- * "pressure" and "critical" threshold.
- *
- * The (re-)setting of this variable is done without any locks
- * or synchronization simply because it is not possible (currently)
- * to keep track of HWM offenders that drop down below their memory
- * limit and/or exit. So, we choose to burn a couple of wasted wakeups
- * by allowing the unguarded modification of this variable.
- */
-boolean_t memorystatus_hwm_candidates = 0;
-
-static int memorystatus_send_note(int event_code, void *data, size_t data_length);
-
-/*
- * This value is the threshold that a process must meet to be considered for scavenging.
- */
-#if CONFIG_EMBEDDED
-#define VM_PRESSURE_MINIMUM_RSIZE               6       /* MB */
-#else /* CONFIG_EMBEDDED */
-#define VM_PRESSURE_MINIMUM_RSIZE               10      /* MB */
-#endif /* CONFIG_EMBEDDED */
-
-uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;
-
-#if DEVELOPMENT || DEBUG
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
-#endif /* DEVELOPMENT || DEBUG */
-
-#endif /* VM_PRESSURE_EVENTS */
-
-
 #if DEVELOPMENT || DEBUG
 
 lck_grp_attr_t *disconnect_page_mappings_lck_grp_attr;
@@ -760,80 +821,6 @@ extern boolean_t kill_on_no_paging_space;
 #endif /* DEVELOPMENT || DEBUG */
 
 
-/*
- * Table that expresses the probability of a process
- * being used in the next hour.
- */
-typedef struct memorystatus_internal_probabilities {
-       char proc_name[MAXCOMLEN + 1];
-       int use_probability;
-} memorystatus_internal_probabilities_t;
-
-static memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL;
-static size_t memorystatus_global_probabilities_size = 0;
-
-/* Freeze */
-
-#if CONFIG_FREEZE
-boolean_t memorystatus_freeze_enabled = FALSE;
-int memorystatus_freeze_wakeup = 0;
-int memorystatus_freeze_jetsam_band = 0; /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */
-
-lck_grp_attr_t *freezer_lck_grp_attr;
-lck_grp_t *freezer_lck_grp;
-static lck_mtx_t freezer_mutex;
-
-static inline boolean_t memorystatus_can_freeze_processes(void);
-static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low);
-static boolean_t memorystatus_is_process_eligible_for_freeze(proc_t p);
-static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused);
-static boolean_t memorystatus_freeze_thread_should_run(void);
-
-void memorystatus_disable_freeze(void);
-
-/* Thresholds */
-static unsigned int memorystatus_freeze_threshold = 0;
-
-static unsigned int memorystatus_freeze_pages_min = 0;
-static unsigned int memorystatus_freeze_pages_max = 0;
-
-static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
-
-static unsigned int memorystatus_freeze_daily_mb_max = FREEZE_DAILY_MB_MAX_DEFAULT;
-static uint64_t  memorystatus_freeze_budget_pages_remaining = 0; //remaining # of pages that can be frozen to disk
-static boolean_t memorystatus_freeze_degradation = FALSE; //protected by the freezer mutex. Signals we are in a degraded freeze mode.
-
-static unsigned int memorystatus_max_frozen_demotions_daily = 0;
-static unsigned int memorystatus_thaw_count_demotion_threshold = 0;
-
-/* Stats */
-static uint64_t memorystatus_freeze_pageouts = 0;
-
-/* Throttling */
-#define DEGRADED_WINDOW_MINS    (30)
-#define NORMAL_WINDOW_MINS      (24 * 60)
-
-static throttle_interval_t throttle_intervals[] = {
-       { DEGRADED_WINDOW_MINS, 1, 0, 0, { 0, 0 }},
-       { NORMAL_WINDOW_MINS, 1, 0, 0, { 0, 0 }},
-};
-throttle_interval_t *degraded_throttle_window = &throttle_intervals[0];
-throttle_interval_t *normal_throttle_window = &throttle_intervals[1];
-
-extern uint64_t vm_swap_get_free_space(void);
-extern boolean_t vm_swap_max_budget(uint64_t *);
-
-static void memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed);
-
-static uint64_t memorystatus_freezer_thread_next_run_ts = 0;
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_frozen_count, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_thaw_count, 0, "");
-SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
-SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_budget_pages_remaining, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_budget_pages_remaining, "");
-
-#endif /* CONFIG_FREEZE */
-
 /* Debug */
 
 extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);
@@ -870,16 +857,17 @@ memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)
         */
 
        printf("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64);
-       printf("bucket [pid]       [pages / MB]     [state]      [EP / RP]   dirty     deadline [L-limit / C-limit / A-limit / IA-limit] name\n");
+       printf("bucket [pid]       [pages / MB]     [state]      [EP / RP / AP]   dirty     deadline [L-limit / C-limit / A-limit / IA-limit] name\n");
        p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
        while (p) {
                bytes = get_task_phys_footprint(p->task);
                task_get_phys_footprint_limit(p->task, &ledger_limit);
-               printf("%2d     [%5d]     [%5lld /%3lldMB]   0x%-8x   [%2d / %2d]   0x%-3x   %10lld    [%3d / %3d%s / %3d%s / %3d%s]   %s\n",
+               printf("%2d     [%5d]     [%5lld /%3lldMB]   0x%-8x   [%2d / %2d / %2d]   0x%-3x   %10lld    [%3d / %3d%s / %3d%s / %3d%s]   %s\n",
                    b, p->p_pid,
                    (bytes / PAGE_SIZE_64),             /* task's footprint converted from bytes to pages     */
                    (bytes / (1024ULL * 1024ULL)),      /* task's footprint converted from bytes to MB */
-                   p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_dirty, p->p_memstat_idledeadline,
+                   p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_assertionpriority,
+                   p->p_memstat_dirty, p->p_memstat_idledeadline,
                    ledger_limit,
                    p->p_memstat_memlimit,
                    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
@@ -999,7525 +987,5078 @@ sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
 
 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");
 
+SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");
+
+#if CONFIG_JETSAM
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_policy_more_free_offset_pages, CTLFLAG_RW, &memorystatus_policy_more_free_offset_pages, 0, "");
+
+static unsigned int memorystatus_jetsam_panic_debug = 0;
+
 #if VM_PRESSURE_EVENTS
 
-/*
- * This routine is used for targeted notifications regardless of system memory pressure
- * and regardless of whether or not the process has already been notified.
- * It bypasses and has no effect on the only-one-notification per soft-limit policy.
- *
- * "memnote" is the current user.
- */
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");
 
-static int
-sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
-{
-#pragma unused(arg1, arg2)
+#endif /* VM_PRESSURE_EVENTS */
 
-       int error = 0, pid = 0;
-       struct knote *kn = NULL;
-       boolean_t found_knote = FALSE;
-       int fflags = 0;         /* filter flags for EVFILT_MEMORYSTATUS */
-       uint64_t value = 0;
+#endif /* CONFIG_JETSAM */
 
-       error = sysctl_handle_quad(oidp, &value, 0, req);
-       if (error || !req->newptr) {
-               return error;
-       }
+#endif /* DEVELOPMENT || DEBUG */
 
-       /*
-        * Find the pid in the low 32 bits of value passed in.
-        */
-       pid = (int)(value & 0xFFFFFFFF);
+extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
+    void *parameter,
+    integer_t priority,
+    thread_t *new_thread);
 
-       /*
-        * Find notification in the high 32 bits of the value passed in.
-        */
-       fflags = (int)((value >> 32) & 0xFFFFFFFF);
+#if DEVELOPMENT || DEBUG
 
-       /*
-        * For backwards compatibility, when no notification is
-        * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN
-        */
-       if (fflags == 0) {
-               fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
-               // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
-       }
+static int
+sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int     error = 0, pid = 0;
+       proc_t  p;
 
-       /*
-        * See event.h ... fflags for EVFILT_MEMORYSTATUS
-        */
-       if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
-           (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
-           (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
-           (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
-           (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
-           (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
-           (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
-           ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
-               printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags);
-               error = 1;
+       error = sysctl_handle_int(oidp, &pid, 0, req);
+       if (error || !req->newptr) {
                return error;
        }
 
-       /*
-        * Forcibly send pid a memorystatus notification.
-        */
+       lck_mtx_lock(&disconnect_page_mappings_mutex);
+
+       if (pid == -1) {
+               vm_pageout_disconnect_all_pages();
+       } else {
+               p = proc_find(pid);
 
-       memorystatus_klist_lock();
+               if (p != NULL) {
+                       error = task_disconnect_page_mappings(p->task);
 
-       SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
-               proc_t knote_proc = knote_get_kq(kn)->kq_p;
-               pid_t knote_pid = knote_proc->p_pid;
+                       proc_rele(p);
 
-               if (knote_pid == pid) {
-                       /*
-                        * Forcibly send this pid a memorystatus notification.
-                        */
-                       kn->kn_fflags = fflags;
-                       found_knote = TRUE;
+                       if (error) {
+                               error = EIO;
+                       }
+               } else {
+                       error = EINVAL;
                }
        }
-
-       if (found_knote) {
-               KNOTE(&memorystatus_klist, 0);
-               printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid);
-               error = 0;
-       } else {
-               printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
-               error = 1;
-       }
-
-       memorystatus_klist_unlock();
+       lck_mtx_unlock(&disconnect_page_mappings_mutex);
 
        return error;
 }
 
-SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
-    0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
-
-#endif /* VM_PRESSURE_EVENTS */
-
-SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");
-
-#if CONFIG_JETSAM
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_policy_more_free_offset_pages, CTLFLAG_RW, &memorystatus_policy_more_free_offset_pages, 0, "");
-
-static unsigned int memorystatus_jetsam_panic_debug = 0;
-static unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0;
-
-/* Diagnostic code */
+SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+    0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", "");
 
-enum {
-       kJetsamDiagnosticModeNone =              0,
-       kJetsamDiagnosticModeAll  =              1,
-       kJetsamDiagnosticModeStopAtFirstActive = 2,
-       kJetsamDiagnosticModeCount
-} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone;
+#endif /* DEVELOPMENT || DEBUG */
 
-static int jetsam_diagnostic_suspended_one_active_proc = 0;
 
+/*
+ * Picks the sorting routine for a given jetsam priority band.
+ *
+ * Input:
+ *     bucket_index - jetsam priority band to be sorted.
+ *     sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
+ *             Currently sort_order is only meaningful when handling
+ *             coalitions.
+ *
+ * Return:
+ *     0     on success
+ *     non-0 on failure
+ */
 static int
-sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS
+memorystatus_sort_bucket(unsigned int bucket_index, int sort_order)
 {
-#pragma unused(arg1, arg2)
-
-       const char *diagnosticStrings[] = {
-               "jetsam: diagnostic mode: resetting critical level.",
-               "jetsam: diagnostic mode: will examine all processes",
-               "jetsam: diagnostic mode: will stop at first active process"
-       };
+       int coal_sort_order;
 
-       int error, val = jetsam_diagnostic_mode;
-       boolean_t changed = FALSE;
+       /*
+        * Verify the jetsam priority
+        */
+       if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
+               return EINVAL;
+       }
 
-       error = sysctl_handle_int(oidp, &val, 0, req);
-       if (error || !req->newptr) {
-               return error;
+#if DEVELOPMENT || DEBUG
+       if (sort_order == JETSAM_SORT_DEFAULT) {
+               coal_sort_order = COALITION_SORT_DEFAULT;
+       } else {
+               coal_sort_order = sort_order;           /* only used for testing scenarios */
        }
-       if ((val < 0) || (val >= kJetsamDiagnosticModeCount)) {
-               printf("jetsam: diagnostic mode: invalid value - %d\n", val);
+#else
+       /* Verify default */
+       if (sort_order == JETSAM_SORT_DEFAULT) {
+               coal_sort_order = COALITION_SORT_DEFAULT;
+       } else {
                return EINVAL;
        }
+#endif
 
        proc_list_lock();
 
-       if ((unsigned int) val != jetsam_diagnostic_mode) {
-               jetsam_diagnostic_mode = val;
-
-               memorystatus_jetsam_policy &= ~kPolicyDiagnoseActive;
+       if (memstat_bucket[bucket_index].count == 0) {
+               proc_list_unlock();
+               return 0;
+       }
 
-               switch (jetsam_diagnostic_mode) {
-               case kJetsamDiagnosticModeNone:
-                       /* Already cleared */
-                       break;
-               case kJetsamDiagnosticModeAll:
-                       memorystatus_jetsam_policy |= kPolicyDiagnoseAll;
-                       break;
-               case kJetsamDiagnosticModeStopAtFirstActive:
-                       memorystatus_jetsam_policy |= kPolicyDiagnoseFirst;
-                       break;
-               default:
-                       /* Already validated */
-                       break;
+       switch (bucket_index) {
+       case JETSAM_PRIORITY_FOREGROUND:
+               if (memorystatus_sort_by_largest_coalition_locked(bucket_index, coal_sort_order) == 0) {
+                       /*
+                        * Fall back to per-process sorting when no coalitions are found.
+                        */
+                       memorystatus_sort_by_largest_process_locked(bucket_index);
                }
-
-               memorystatus_update_levels_locked(FALSE);
-               changed = TRUE;
+               break;
+       default:
+               memorystatus_sort_by_largest_process_locked(bucket_index);
+               break;
        }
-
        proc_list_unlock();
 
-       if (changed) {
-               printf("%s\n", diagnosticStrings[val]);
-       }
-
        return 0;
 }
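+
+/*
+ * Illustrative sketch (not part of this change): how a caller might drive the
+ * routine above. The wrapper name is hypothetical; JETSAM_PRIORITY_FOREGROUND
+ * and JETSAM_SORT_DEFAULT come from kern_memorystatus.h, as the block comment
+ * notes.
+ */
+static int
+example_sort_foreground_band(void)
+{
+       /* coalition sorting falls back to per-process sorting when no coalitions exist */
+       return memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
+}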
 
-SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
-    &jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode");
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jetsam_policy_offset_pages_diagnostic, 0, "");
-
-#if VM_PRESSURE_EVENTS
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");
-
-#endif /* VM_PRESSURE_EVENTS */
-
-#endif /* CONFIG_JETSAM */
-
-#if CONFIG_FREEZE
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_jetsam_band, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_jetsam_band, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_degraded_mode, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_degradation, 0, "");
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_refreeze_eligible_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_refreeze_eligible_count, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_processes_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_frozen_processes_max, 0, "");
-
-/*
- * Max. shared-anonymous memory in MB that can be held by frozen processes in the high jetsam band.
- * "0" means no limit.
- * Default is 10% of system-wide task limit.
- */
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb_max, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb, 0, "");
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_per_process_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_shared_mb_per_process_max, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_private_shared_pages_ratio, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_private_shared_pages_ratio, 0, "");
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");
-
-/*
- * max. # of frozen process demotions we will allow in our daily cycle.
- */
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_max_freeze_demotions_daily, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_max_frozen_demotions_daily, 0, "");
-/*
- * min # of thaws needed by a process to protect it from getting demoted into the IDLE band.
- */
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count_demotion_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_thaw_count_demotion_threshold, 0, "");
-
-boolean_t memorystatus_freeze_throttle_enabled = TRUE;
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");
-
 /*
- * When set to true, this keeps frozen processes in the compressor pool in memory, instead of swapping them out to disk.
- * Exposed via the sysctl kern.memorystatus_freeze_to_memory.
+ * Sort processes by size for a single jetsam bucket.
  */
-boolean_t memorystatus_freeze_to_memory = FALSE;
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_to_memory, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_to_memory, 0, "");
 
-#define VM_PAGES_FOR_ALL_PROCS  (2)
-/*
- * Manual trigger of freeze and thaw for dev / debug kernels only.
- */
-static int
-sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
+static void
+memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
 {
-#pragma unused(arg1, arg2)
-       int error, pid = 0;
-       proc_t p;
-       int freezer_error_code = 0;
-
-       if (memorystatus_freeze_enabled == FALSE) {
-               printf("sysctl_freeze: Freeze is DISABLED\n");
-               return ENOTSUP;
-       }
-
-       error = sysctl_handle_int(oidp, &pid, 0, req);
-       if (error || !req->newptr) {
-               return error;
-       }
-
-       if (pid == VM_PAGES_FOR_ALL_PROCS) {
-               vm_pageout_anonymous_pages();
+       proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
+       proc_t next_p = NULL, prev_max_proc = NULL;
+       uint32_t pages = 0, max_pages = 0;
+       memstat_bucket_t *current_bucket;
 
-               return 0;
+       if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
+               return;
        }
 
-       lck_mtx_lock(&freezer_mutex);
-
-       p = proc_find(pid);
-       if (p != NULL) {
-               uint32_t purgeable, wired, clean, dirty, shared;
-               uint32_t max_pages = 0, state = 0;
-
-               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-                       /*
-                        * Freezer backed by the compressor and swap file(s)
-                        * will hold compressed data.
-                        *
-                        * Set the sysctl kern.memorystatus_freeze_to_memory to true to keep compressed data from
-                        * being swapped out to disk. Note that this disables freezer swap support globally,
-                        * not just for the process being frozen.
-                        *
-                        *
-                        * We don't care about the global freezer budget or the process's (min/max) budget here.
-                        * The freeze sysctl is meant to force-freeze a process.
-                        *
-                        * We also don't update any global or process stats on this path, so that the jetsam/ freeze
-                        * logic remains unaffected. The tasks we're performing here are: freeze the process, set the
-                        * P_MEMSTAT_FROZEN bit, and elevate the process to a higher band (if the freezer is active).
-                        */
-                       max_pages = memorystatus_freeze_pages_max;
-               } else {
-                       /*
-                        * We only have the compressor without any swap.
-                        */
-                       max_pages = UINT32_MAX - 1;
-               }
-
-               proc_list_lock();
-               state = p->p_memstat_state;
-               proc_list_unlock();
-
-               /*
-                * The jetsam path also verifies that the process is a suspended App. We don't care about that here.
-                * We simply ensure that jetsam is not already working on the process and that the process has not
-                * explicitly disabled freezing.
-                */
-               if (state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED)) {
-                       printf("sysctl_freeze: p_memstat_state check failed, process is%s%s%s\n",
-                           (state & P_MEMSTAT_TERMINATED) ? " terminated" : "",
-                           (state & P_MEMSTAT_LOCKED) ? " locked" : "",
-                           (state & P_MEMSTAT_FREEZE_DISABLED) ? " unfreezable" : "");
-
-                       proc_rele(p);
-                       lck_mtx_unlock(&freezer_mutex);
-                       return EPERM;
-               }
-
-               error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */);
-
-               if (error) {
-                       char reason[128];
-                       if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) {
-                               strlcpy(reason, "too much shared memory", 128);
-                       }
+       current_bucket = &memstat_bucket[bucket_index];
 
-                       if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) {
-                               strlcpy(reason, "low private-shared pages ratio", 128);
-                       }
+       p = TAILQ_FIRST(&current_bucket->list);
 
-                       if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) {
-                               strlcpy(reason, "no compressor space", 128);
-                       }
+       while (p) {
+               memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
+               max_pages = pages;
+               max_proc = p;
+               prev_max_proc = p;
 
-                       if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) {
-                               strlcpy(reason, "no swap space", 128);
+               while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
+                       /* scan the remainder of the list for the largest remaining process */
+                       p = next_p;
+                       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
+                       if (pages > max_pages) {
+                               max_pages = pages;
+                               max_proc = p;
                        }
+               }
 
-                       printf("sysctl_freeze: task_freeze failed: %s\n", reason);
-
-                       if (error == KERN_NO_SPACE) {
-                               /* Make it easy to distinguish between failures due to low compressor/ swap space and other failures. */
-                               error = ENOSPC;
+               if (prev_max_proc != max_proc) {
+                       /* found a larger process, place it in the list */
+                       TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
+                       if (insert_after_proc == NULL) {
+                               TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
                        } else {
-                               error = EIO;
-                       }
-               } else {
-                       proc_list_lock();
-                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) {
-                               p->p_memstat_state |= P_MEMSTAT_FROZEN;
-                               memorystatus_frozen_count++;
-                       }
-                       p->p_memstat_frozen_count++;
-
-
-                       proc_list_unlock();
-
-                       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-                               /*
-                                * We elevate only if we are going to swap out the data.
-                                */
-                               error = memorystatus_update_inactive_jetsam_priority_band(pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE,
-                                   memorystatus_freeze_jetsam_band, TRUE);
-
-                               if (error) {
-                                       printf("sysctl_freeze: Elevating frozen process to higher jetsam band failed with %d\n", error);
-                               }
+                               TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
                        }
+                       prev_max_proc = max_proc;
                }
 
-               proc_rele(p);
+               insert_after_proc = max_proc;
 
-               lck_mtx_unlock(&freezer_mutex);
-               return error;
-       } else {
-               printf("sysctl_freeze: Invalid process\n");
+               p = TAILQ_NEXT(max_proc, p_memstat_list);
        }
-
-
-       lck_mtx_unlock(&freezer_mutex);
-       return EINVAL;
 }
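+
+/*
+ * Illustrative trace (not part of this change), assuming four processes with
+ * page counts [A:10, B:40, C:20, D:30]: pass 1 splices B to the head, pass 2
+ * places D after B, pass 3 places C after D, and the final pass leaves A where
+ * it is, yielding B, D, C, A. The routine is an O(n^2) selection sort that
+ * orders the bucket largest-first by relinking the existing TAILQ entries,
+ * with no allocation.
+ */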
 
-SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
-    0, 0, &sysctl_memorystatus_freeze, "I", "");
-
-static int
-sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
+proc_t
+memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search)
 {
-#pragma unused(arg1, arg2)
-
-       int error, pid = 0;
-       proc_t p;
-
-       if (memorystatus_freeze_enabled == FALSE) {
-               return ENOTSUP;
-       }
+       memstat_bucket_t *current_bucket;
+       proc_t next_p;
 
-       error = sysctl_handle_int(oidp, &pid, 0, req);
-       if (error || !req->newptr) {
-               return error;
+       if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
+               return NULL;
        }
 
-       if (pid == VM_PAGES_FOR_ALL_PROCS) {
-               do_fastwake_warmup_all();
-               return 0;
-       } else {
-               p = proc_find(pid);
-               if (p != NULL) {
-                       error = task_thaw(p->task);
-
-                       if (error) {
-                               error = EIO;
-                       } else {
-                               /*
-                                * task_thaw() succeeded.
-                                *
-                                * We increment memorystatus_frozen_count on the sysctl freeze path.
-                                * And so we need the P_MEMSTAT_FROZEN to decrement the frozen count
-                                * when this process exits.
-                                *
-                                * proc_list_lock();
-                                * p->p_memstat_state &= ~P_MEMSTAT_FROZEN;
-                                * proc_list_unlock();
-                                */
-                       }
-                       proc_rele(p);
-                       return error;
+       current_bucket = &memstat_bucket[*bucket_index];
+       next_p = TAILQ_FIRST(&current_bucket->list);
+       if (!next_p && search) {
+               while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
+                       current_bucket = &memstat_bucket[*bucket_index];
+                       next_p = TAILQ_FIRST(&current_bucket->list);
                }
        }
 
-       return EINVAL;
+       return next_p;
 }
 
-SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
-    0, 0, &sysctl_memorystatus_available_pages_thaw, "I", "");
-
-typedef struct _global_freezable_status {
-       boolean_t       freeze_pages_threshold_crossed;
-       boolean_t       freeze_eligible_procs_available;
-       boolean_t       freeze_scheduled_in_future;
-}global_freezable_status_t;
-
-typedef struct _proc_freezable_status {
-       boolean_t       freeze_has_memstat_state;
-       boolean_t       freeze_has_pages_min;
-       int             freeze_has_probability;
-       boolean_t       freeze_attempted;
-       uint32_t        p_memstat_state;
-       uint32_t        p_pages;
-       int             p_freeze_error_code;
-       int             p_pid;
-       char            p_name[MAXCOMLEN + 1];
-}proc_freezable_status_t;
-
-#define MAX_FREEZABLE_PROCESSES 100
-
-static int
-memorystatus_freezer_get_status(user_addr_t buffer, size_t buffer_size, int32_t *retval)
+proc_t
+memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search)
 {
-       uint32_t                        proc_count = 0, i = 0;
-       global_freezable_status_t       *list_head;
-       proc_freezable_status_t         *list_entry;
-       size_t                          list_size = 0;
-       proc_t                          p;
-       memstat_bucket_t                *bucket;
-       uint32_t                        state = 0, pages = 0, entry_count = 0;
-       boolean_t                       try_freeze = TRUE;
-       int                             error = 0, probability_of_use = 0;
-
-
-       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
-               return ENOTSUP;
-       }
-
-       list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES);
+       memstat_bucket_t *current_bucket;
+       proc_t next_p;
 
-       if (buffer_size < list_size) {
-               return EINVAL;
+       if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
+               return NULL;
        }
 
-       list_head = (global_freezable_status_t*)kalloc(list_size);
-       if (list_head == NULL) {
-               return ENOMEM;
+       next_p = TAILQ_NEXT(p, p_memstat_list);
+       while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
+               current_bucket = &memstat_bucket[*bucket_index];
+               next_p = TAILQ_FIRST(&current_bucket->list);
        }
 
-       memset(list_head, 0, list_size);
+       return next_p;
+}
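+
+/*
+ * Illustrative sketch (not part of this change): the canonical walk over every
+ * process in ascending band order using the two iterators above, a pattern
+ * used elsewhere in this file. The caller must already hold proc_list_lock,
+ * as the _locked suffix requires; the function name is hypothetical.
+ */
+static void
+example_walk_all_bands_locked(void)
+{
+       unsigned int band = JETSAM_PRIORITY_IDLE;
+       proc_t p = memorystatus_get_first_proc_locked(&band, TRUE);
+
+       while (p != NULL) {
+               /* ... examine the process currently at priority 'band' ... */
+               p = memorystatus_get_next_proc_locked(&band, p, TRUE);
+       }
+}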
 
-       list_size = sizeof(global_freezable_status_t);
+/*
+ * Structure to hold state for a jetsam thread.
+ * Typically there should be a single jetsam thread
+ * unless parallel jetsam is enabled.
+ */
+struct jetsam_thread_state {
+       uint8_t       inited; /* boolean: whether the thread has been initialized */
+       uint8_t       limit_to_low_bands; /* boolean */
+       int           memorystatus_wakeup; /* wake channel */
+       int           index; /* jetsam thread index */
+       thread_t      thread; /* jetsam thread pointer */
+} *jetsam_threads;
 
-       proc_list_lock();
+/* Maximum number of jetsam threads allowed */
+#define JETSAM_THREADS_LIMIT   3
 
-       uint64_t curr_time = mach_absolute_time();
+/* Number of active jetsam threads */
+_Atomic int active_jetsam_threads = 1;
 
-       list_head->freeze_pages_threshold_crossed = (memorystatus_available_pages < memorystatus_freeze_threshold);
-       list_head->freeze_eligible_procs_available = ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold);
-       list_head->freeze_scheduled_in_future = (curr_time < memorystatus_freezer_thread_next_run_ts);
+/* Configured maximum number of jetsam threads */
+int max_jetsam_threads = JETSAM_THREADS_LIMIT;
 
-       list_entry = (proc_freezable_status_t*) ((uintptr_t)list_head + sizeof(global_freezable_status_t));
+/*
+ * Global switch for enabling fast jetsam. Fast jetsam is
+ * hooked up via the system_override() system call. It has the
+ * following effects:
+ * - Raise the jetsam threshold ("clear-the-deck")
+ * - Enable parallel jetsam on eligible devices
+ */
+int fast_jetsam_enabled = 0;
 
-       bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
+/* Routine to find the jetsam state structure for the current jetsam thread */
+static inline struct jetsam_thread_state *
+jetsam_current_thread(void)
+{
+       for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) {
+               if (jetsam_threads[thr_id].thread == current_thread()) {
+                       return &(jetsam_threads[thr_id]);
+               }
+       }
+       return NULL;
+}
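+
+/*
+ * Illustrative sketch (not part of this change): unlike the version of this
+ * routine removed further down, which panicked, the lookup above returns NULL
+ * when called from a thread that is not in the jetsam_threads array, so
+ * callers are expected to check. The function below is a hypothetical caller
+ * showing that pattern.
+ */
+static void
+example_jetsam_thread_entry(void)
+{
+       struct jetsam_thread_state *jts = jetsam_current_thread();
+
+       if (jts == NULL) {
+               /* not a jetsam thread; bail out instead of panicking */
+               return;
+       }
+       if (!jts->inited) {
+               /* first wakeup: one-time per-thread setup would go here */
+       }
+}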
 
-       entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t));
 
-       p = memorystatus_get_first_proc_locked(&i, FALSE);
-       proc_count++;
-
-       while ((proc_count <= MAX_FREEZABLE_PROCESSES) &&
-           (p) &&
-           (list_size < buffer_size)) {
-               if (isApp(p) == FALSE) {
-                       p = memorystatus_get_next_proc_locked(&i, p, FALSE);
-                       proc_count++;
-                       continue;
-               }
+__private_extern__ void
+memorystatus_init(void)
+{
+       kern_return_t result;
+       int i;
 
-               strlcpy(list_entry->p_name, p->p_name, MAXCOMLEN + 1);
+#if CONFIG_FREEZE
+       memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_UI_SUPPORT;
+       memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX;
+       memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system-wide task limit */
+       memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4);
+       memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
+       memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
+       memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS;
+       memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD;
+#endif
 
-               list_entry->p_pid = p->p_pid;
+#if DEVELOPMENT || DEBUG
+       disconnect_page_mappings_lck_grp_attr = lck_grp_attr_alloc_init();
+       disconnect_page_mappings_lck_grp = lck_grp_alloc_init("disconnect_page_mappings", disconnect_page_mappings_lck_grp_attr);
 
-               state = p->p_memstat_state;
+       lck_mtx_init(&disconnect_page_mappings_mutex, disconnect_page_mappings_lck_grp, NULL);
 
-               if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) ||
-                   !(state & P_MEMSTAT_SUSPENDED)) {
-                       try_freeze = list_entry->freeze_has_memstat_state = FALSE;
-               } else {
-                       try_freeze = list_entry->freeze_has_memstat_state = TRUE;
-               }
+       if (kill_on_no_paging_space == TRUE) {
+               max_kill_priority = JETSAM_PRIORITY_MAX;
+       }
+#endif
 
-               list_entry->p_memstat_state = state;
+       memorystatus_jetsam_fg_band_lock_grp_attr = lck_grp_attr_alloc_init();
+       memorystatus_jetsam_fg_band_lock_grp =
+           lck_grp_alloc_init("memorystatus_jetsam_fg_band", memorystatus_jetsam_fg_band_lock_grp_attr);
+       lck_mtx_init(&memorystatus_jetsam_fg_band_lock, memorystatus_jetsam_fg_band_lock_grp, NULL);
 
-               memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
-               if (pages < memorystatus_freeze_pages_min) {
-                       try_freeze = list_entry->freeze_has_pages_min = FALSE;
-               } else {
-                       list_entry->freeze_has_pages_min = TRUE;
-                       if (try_freeze != FALSE) {
-                               try_freeze = TRUE;
-                       }
-               }
+       /* Init buckets */
+       for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
+               TAILQ_INIT(&memstat_bucket[i].list);
+               memstat_bucket[i].count = 0;
+               memstat_bucket[i].relaunch_high_count = 0;
+       }
+       memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);
 
-               list_entry->p_pages = pages;
+       nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
+       nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
 
-               if (entry_count) {
-                       uint32_t j = 0;
-                       for (j = 0; j < entry_count; j++) {
-                               if (strncmp(memorystatus_global_probabilities_table[j].proc_name,
-                                   p->p_name,
-                                   MAXCOMLEN + 1) == 0) {
-                                       probability_of_use = memorystatus_global_probabilities_table[j].use_probability;
-                                       break;
-                               }
-                       }
+#if CONFIG_JETSAM
+       /* Apply overrides */
+       if (!PE_parse_boot_argn("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage))) {
+               PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
+       }
+       if (delta_percentage == 0) {
+               delta_percentage = 5;
+       }
+       if (max_mem > config_jetsam_large_memory_cutoff) {
+               critical_threshold_percentage = critical_threshold_percentage_larger_devices;
+               delta_percentage = delta_percentage_larger_devices;
+       }
+       assert(delta_percentage < 100);
+       if (!PE_parse_boot_argn("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage))) {
+               PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
+       }
+       assert(critical_threshold_percentage < 100);
+       PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
+       assert(idle_offset_percentage < 100);
+       PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
+       assert(pressure_threshold_percentage < 100);
+       PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
+       assert(freeze_threshold_percentage < 100);
 
-                       list_entry->freeze_has_probability = probability_of_use;
 
-                       if (probability_of_use && try_freeze != FALSE) {
-                               try_freeze = TRUE;
-                       } else {
-                               try_freeze = FALSE;
-                       }
-               } else {
-                       if (try_freeze != FALSE) {
-                               try_freeze = TRUE;
-                       }
-                       list_entry->freeze_has_probability = -1;
+       if (!PE_parse_boot_argn("jetsam_aging_policy", &jetsam_aging_policy,
+           sizeof(jetsam_aging_policy))) {
+               if (!PE_get_default("kern.jetsam_aging_policy", &jetsam_aging_policy,
+                   sizeof(jetsam_aging_policy))) {
+                       jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst;
                }
+       }
 
-               if (try_freeze) {
-                       uint32_t purgeable, wired, clean, dirty, shared;
-                       uint32_t max_pages = 0;
-                       int freezer_error_code = 0;
-
-                       error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, TRUE /* eval only */);
+       if (jetsam_aging_policy > kJetsamAgingPolicyMax) {
+               jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst;
+       }
 
-                       if (error) {
-                               list_entry->p_freeze_error_code = freezer_error_code;
-                       }
+       switch (jetsam_aging_policy) {
+       case kJetsamAgingPolicyNone:
+               system_procs_aging_band = JETSAM_PRIORITY_IDLE;
+               applications_aging_band = JETSAM_PRIORITY_IDLE;
+               break;
 
-                       list_entry->freeze_attempted = TRUE;
-               }
+       case kJetsamAgingPolicyLegacy:
+               /*
+                * Legacy behavior: some daemons get a one-time 10s protection,
+                * applied only before their first clean->dirty->clean transition,
+                * before dropping into the IDLE band.
+                */
+               system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
+               applications_aging_band = JETSAM_PRIORITY_IDLE;
+               break;
 
-               list_entry++;
+       case kJetsamAgingPolicySysProcsReclaimedFirst:
+               system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
+               applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
+               break;
 
-               list_size += sizeof(proc_freezable_status_t);
+       case kJetsamAgingPolicyAppsReclaimedFirst:
+               system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2;
+               applications_aging_band = JETSAM_PRIORITY_AGING_BAND1;
+               break;
 
-               p = memorystatus_get_next_proc_locked(&i, p, FALSE);
-               proc_count++;
+       default:
+               break;
        }
 
-       proc_list_unlock();
+       /*
+        * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE
+        * band and must be below it in priority. This is so that we don't have to make
+        * our 'aging' code worry about a mix of processes, some of which need to age
+        * and some others that need to stay elevated in the jetsam bands.
+        */
+       assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band);
+       assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band);
+
+       /* Take snapshots for idle-exit kills by default? First check the boot-arg... */
+       if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) {
+               /* ...no boot-arg, so check the device tree */
+               PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));
+       }
 
-       buffer_size = list_size;
+       memorystatus_delta = delta_percentage * atop_64(max_mem) / 100;
+       memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100;
+       memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta;
+       memorystatus_policy_more_free_offset_pages = (policy_more_free_offset_percentage / delta_percentage) * memorystatus_delta;
+       memorystatus_sysproc_aging_aggr_pages = sysproc_aging_aggr_threshold_percentage * atop_64(max_mem) / 100;
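+       /*
+        * Worked example (illustrative): on a hypothetical device with 4GB of
+        * DRAM and 16KB pages, atop_64(max_mem) == 262144 pages, so the 5%
+        * default delta gives memorystatus_delta == 5 * 262144 / 100 == 13107
+        * pages. The critical base divides the two percentages first, so a
+        * critical_threshold_percentage of 15 yields (15 / 5) * 13107 == 39321
+        * pages.
+        */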
 
-       error = copyout(list_head, buffer, buffer_size);
-       if (error == 0) {
-               *retval = buffer_size;
+       /* Jetsam Loop Detection */
+       if (max_mem <= (512 * 1024 * 1024)) {
+               /* 512 MB devices */
+               memorystatus_jld_eval_period_msecs = 8000;      /* 8000 msecs == 8 second window */
        } else {
-               *retval = 0;
+               /* 1GB and larger devices */
+               memorystatus_jld_eval_period_msecs = 6000;      /* 6000 msecs == 6 second window */
        }
 
-       list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES);
-       kfree(list_head, list_size);
-
-       MEMORYSTATUS_DEBUG(1, "memorystatus_freezer_get_status: returning %d (%lu - size)\n", error, (unsigned long)*list_size);
+       memorystatus_jld_enabled = TRUE;
 
-       return error;
-}
+       /* No contention at this point */
+       memorystatus_update_levels_locked(FALSE);
 
-static int
-memorystatus_freezer_control(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval)
-{
-       int err = ENOTSUP;
+#endif /* CONFIG_JETSAM */
 
-       if (flags == FREEZER_CONTROL_GET_STATUS) {
-               err = memorystatus_freezer_get_status(buffer, buffer_size, retval);
-       }
+       memorystatus_jetsam_snapshot_max = maxproc;
 
-       return err;
-}
+       memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
+           (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
 
-#endif /* CONFIG_FREEZE */
+       memorystatus_jetsam_snapshot =
+           (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size);
+       if (!memorystatus_jetsam_snapshot) {
+               panic("Could not allocate memorystatus_jetsam_snapshot");
+       }
 
-#endif /* DEVELOPMENT || DEBUG */
+       memorystatus_jetsam_snapshot_copy =
+           (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size);
+       if (!memorystatus_jetsam_snapshot_copy) {
+               panic("Could not allocate memorystatus_jetsam_snapshot_copy");
+       }
 
-extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
-    void *parameter,
-    integer_t priority,
-    thread_t *new_thread);
+       nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);
 
-#if DEVELOPMENT || DEBUG
+       memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));
 
-static int
-sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS
-{
-#pragma unused(arg1, arg2)
-       int     error = 0, pid = 0;
-       proc_t  p;
+#if CONFIG_FREEZE
+       memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta;
+#endif
 
-       error = sysctl_handle_int(oidp, &pid, 0, req);
-       if (error || !req->newptr) {
-               return error;
+       /* Check the boot-arg to see if fast jetsam is allowed */
+       if (!PE_parse_boot_argn("fast_jetsam_enabled", &fast_jetsam_enabled, sizeof(fast_jetsam_enabled))) {
+               fast_jetsam_enabled = 0;
        }
 
-       lck_mtx_lock(&disconnect_page_mappings_mutex);
+       /* Check the boot-arg to configure the maximum number of jetsam threads */
+       if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof(max_jetsam_threads))) {
+               max_jetsam_threads = JETSAM_THREADS_LIMIT;
+       }
 
-       if (pid == -1) {
-               vm_pageout_disconnect_all_pages();
-       } else {
-               p = proc_find(pid);
+       /* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */
+       if (max_jetsam_threads > JETSAM_THREADS_LIMIT) {
+               max_jetsam_threads = JETSAM_THREADS_LIMIT;
+       }
 
-               if (p != NULL) {
-                       error = task_disconnect_page_mappings(p->task);
+       /* For low-CPU systems, disable the fast jetsam mechanism */
+       if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
+               max_jetsam_threads = 1;
+               fast_jetsam_enabled = 0;
+       }
 
-                       proc_rele(p);
+       /* Initialize the jetsam_threads state array */
+       jetsam_threads = kalloc(sizeof(struct jetsam_thread_state) * max_jetsam_threads);
 
-                       if (error) {
-                               error = EIO;
-                       }
-               } else {
-                       error = EINVAL;
+       /* Initialize all the jetsam threads */
+       for (i = 0; i < max_jetsam_threads; i++) {
+               jetsam_threads[i].inited = FALSE;
+               jetsam_threads[i].index = i;
+               result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread);
+               if (result != KERN_SUCCESS) {
+                       panic("Could not create memorystatus_thread %d", i);
                }
+               thread_deallocate(jetsam_threads[i].thread);
        }
-       lck_mtx_unlock(&disconnect_page_mappings_mutex);
-
-       return error;
 }
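+
+/*
+ * Illustrative only (not part of this change): the boot-args consulted in
+ * memorystatus_init() can be supplied via nvram on development systems, e.g.
+ *
+ *     nvram boot-args="max_jetsam_threads=2 fast_jetsam_enabled=1"
+ *
+ * (hypothetical values). max_jetsam_threads is clamped to JETSAM_THREADS_LIMIT,
+ * and systems restricted to a single processor are forced back to one thread
+ * with fast jetsam disabled, as the code above shows.
+ */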
 
-SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
-    0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", "");
-
-#endif /* DEVELOPMENT || DEBUG */
-
+/* Centralised for the purposes of allowing panic-on-jetsam */
+extern void
+vm_run_compactor(void);
 
 /*
- * Picks the sorting routine for a given jetsam priority band.
- *
- * Input:
- *     bucket_index - jetsam priority band to be sorted.
- *     sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
- *             Currently sort_order is only meaningful when handling
- *             coalitions.
- *
- * Return:
- *     0     on success
- *      non-0 on failure
+ * The jetsam no-frills kill call
+ *      Return: 0 on success
+ *             error code on failure (EINVAL...)
  */
 static int
-memorystatus_sort_bucket(unsigned int bucket_index, int sort_order)
+jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason)
 {
-       int coal_sort_order;
+       int error = 0;
+       error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason);
+       return error;
+}
 
-       /*
-        * Verify the jetsam priority
-        */
-       if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
-               return EINVAL;
-       }
+/*
+ * Wrapper for processes exiting with memorystatus details
+ */
+static boolean_t
+memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64_t *footprint_of_killed_proc)
+{
+       int error = 0;
+       __unused pid_t victim_pid = p->p_pid;
+       uint64_t footprint = get_task_phys_footprint(p->task);
+#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
+       int32_t memstat_effectivepriority = p->p_memstat_effectivepriority;
+#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
 
-#if DEVELOPMENT || DEBUG
-       if (sort_order == JETSAM_SORT_DEFAULT) {
-               coal_sort_order = COALITION_SORT_DEFAULT;
-       } else {
-               coal_sort_order = sort_order;           /* only used for testing scenarios */
+       KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START,
+           victim_pid, cause, vm_page_free_count, footprint, 0);
+       DTRACE_MEMORYSTATUS4(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause, uint64_t, footprint);
+#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
+       if (memorystatus_jetsam_panic_debug & (1 << cause)) {
+               panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause);
        }
 #else
-       /* Verify default */
-       if (sort_order == JETSAM_SORT_DEFAULT) {
-               coal_sort_order = COALITION_SORT_DEFAULT;
-       } else {
-               return EINVAL;
-       }
+#pragma unused(cause)
 #endif
 
-       proc_list_lock();
-
-       if (memstat_bucket[bucket_index].count == 0) {
-               proc_list_unlock();
-               return 0;
+       if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
+               printf("memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n", p->p_pid,
+                   (*p->p_name ? p->p_name : "unknown"),
+                   memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority,
+                   (uint64_t)memorystatus_available_pages);
        }
 
-       switch (bucket_index) {
-       case JETSAM_PRIORITY_FOREGROUND:
-               if (memorystatus_sort_by_largest_coalition_locked(bucket_index, coal_sort_order) == 0) {
-                       /*
-                        * Fall back to per process sorting when zero coalitions are found.
-                        */
-                       memorystatus_sort_by_largest_process_locked(bucket_index);
-               }
-               break;
-       default:
-               memorystatus_sort_by_largest_process_locked(bucket_index);
-               break;
+       /*
+        * The jetsam_reason (os_reason_t) has enough information about the kill cause.
+        * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped.
+        */
+       int jetsam_flags = P_LTERM_JETSAM;
+       switch (cause) {
+       case kMemorystatusKilledHiwat:                      jetsam_flags |= P_JETSAM_HIWAT; break;
+       case kMemorystatusKilledVnodes:                     jetsam_flags |= P_JETSAM_VNODE; break;
+       case kMemorystatusKilledVMPageShortage:             jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
+       case kMemorystatusKilledVMCompressorThrashing:
+       case kMemorystatusKilledVMCompressorSpaceShortage:  jetsam_flags |= P_JETSAM_VMTHRASHING; break;
+       case kMemorystatusKilledFCThrashing:                jetsam_flags |= P_JETSAM_FCTHRASHING; break;
+       case kMemorystatusKilledPerProcessLimit:            jetsam_flags |= P_JETSAM_PID; break;
+       case kMemorystatusKilledIdleExit:                   jetsam_flags |= P_JETSAM_IDLEEXIT; break;
        }
-       proc_list_unlock();
+       error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
+       *footprint_of_killed_proc = ((error == 0) ? footprint : 0);
 
-       return 0;
+       KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END,
+           victim_pid, memstat_effectivepriority, vm_page_free_count, error, 0);
+
+       KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_START,
+           victim_pid, cause, vm_page_free_count, *footprint_of_killed_proc, 0);
+
+       vm_run_compactor();
+
+       KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_END,
+           victim_pid, cause, vm_page_free_count, 0, 0);
+
+       return error == 0;
 }
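+
+/*
+ * Illustrative sketch (not part of this change): the shape of a kill site.
+ * A caller creates an os_reason_t (consumed by the exit path) and pairs it
+ * with the matching kMemorystatusKilled* cause; this wrapper is hypothetical.
+ */
+static boolean_t
+example_kill_for_highwater(proc_t p)
+{
+       uint64_t footprint = 0;
+       os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
+
+       /* jetsam_reason may be OS_REASON_NULL on allocation failure; the kill proceeds regardless */
+       return memorystatus_do_kill(p, kMemorystatusKilledHiwat, jetsam_reason, &footprint);
+}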
 
 /*
- * Sort processes by size for a single jetsam bucket.
+ * Node manipulation
  */
 
 static void
-memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
+memorystatus_check_levels_locked(void)
 {
-       proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
-       proc_t next_p = NULL, prev_max_proc = NULL;
-       uint32_t pages = 0, max_pages = 0;
-       memstat_bucket_t *current_bucket;
+#if CONFIG_JETSAM
+       /* Update levels */
+       memorystatus_update_levels_locked(TRUE);
+#else /* CONFIG_JETSAM */
+       /*
+        * Nothing to do here currently since we update
+        * memorystatus_available_pages in vm_pressure_response.
+        */
+#endif /* CONFIG_JETSAM */
+}
 
-       if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
-               return;
-       }
+/*
+ * Pin a process to a particular jetsam band when it is in the background, i.e. not doing active work.
+ * For an application, that means it is no longer in the FG band.
+ * For a daemon, that means it is no longer in its 'requested' jetsam priority band.
+ */
 
-       current_bucket = &memstat_bucket[bucket_index];
+int
+memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now)
+{
+       int error = 0;
+       boolean_t enable = FALSE;
+       proc_t  p = NULL;
 
-       p = TAILQ_FIRST(&current_bucket->list);
+       if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) {
+               enable = TRUE;
+       } else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) {
+               enable = FALSE;
+       } else {
+               return EINVAL;
+       }
 
-       while (p) {
-               memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
-               max_pages = pages;
-               max_proc = p;
-               prev_max_proc = p;
+       p = proc_find(pid);
+       if (p != NULL) {
+               if ((enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) ||
+                   (!enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == 0))) {
+                       /*
+                        * No change in state.
+                        */
+               } else {
+                       proc_list_lock();
 
-               while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
-                       /* traversing list until we find next largest process */
-                       p = next_p;
-                       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
-                       if (pages > max_pages) {
-                               max_pages = pages;
-                               max_proc = p;
-                       }
-               }
+                       if (enable) {
+                               p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
+                               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
 
-               if (prev_max_proc != max_proc) {
-                       /* found a larger process, place it in the list */
-                       TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
-                       if (insert_after_proc == NULL) {
-                               TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
+                               if (effective_now) {
+                                       if (p->p_memstat_effectivepriority < jetsam_prio) {
+                                               if (memorystatus_highwater_enabled) {
+                                                       /*
+                                                        * Process is about to transition from
+                                                        * inactive --> active
+                                                        * assign active state
+                                                        */
+                                                       boolean_t is_fatal;
+                                                       boolean_t use_active = TRUE;
+                                                       CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
+                                                       task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
+                                               }
+                                               memorystatus_update_priority_locked(p, jetsam_prio, FALSE, FALSE);
+                                       }
+                               } else {
+                                       if (isProcessInAgingBands(p)) {
+                                               memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
+                                       }
+                               }
                        } else {
-                               TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
-                       }
-                       prev_max_proc = max_proc;
-               }
+                               p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
+                               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
 
-               insert_after_proc = max_proc;
+                               if (effective_now) {
+                                       if (p->p_memstat_effectivepriority == jetsam_prio) {
+                                               memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
+                                       }
+                               } else {
+                                       if (isProcessInAgingBands(p)) {
+                                               memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
+                                       }
+                               }
+                       }
 
-               p = TAILQ_NEXT(max_proc, p_memstat_list);
+                       proc_list_unlock();
+               }
+               proc_rele(p);
+               error = 0;
+       } else {
+               error = ESRCH;
        }
+
+       return error;
 }
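+
+/*
+ * Illustrative sketch (not part of this change): the freezer path elevates a
+ * newly frozen process exactly the way the deleted sysctl_memorystatus_freeze
+ * above did; the wrapper name is hypothetical.
+ */
+static int
+example_elevate_frozen_process(pid_t pid)
+{
+       return memorystatus_update_inactive_jetsam_priority_band(pid,
+           MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE,
+           memorystatus_freeze_jetsam_band, TRUE /* effective_now */);
+}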
 
-static proc_t
-memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search)
+static void
+memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
 {
-       memstat_bucket_t *current_bucket;
-       proc_t next_p;
+       proc_t p;
+       uint64_t current_time = 0, idle_delay_time = 0;
+       int demote_prio_band = 0;
+       memstat_bucket_t *demotion_bucket;
 
-       if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
-               return NULL;
-       }
+       MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");
 
-       current_bucket = &memstat_bucket[*bucket_index];
-       next_p = TAILQ_FIRST(&current_bucket->list);
-       if (!next_p && search) {
-               while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
-                       current_bucket = &memstat_bucket[*bucket_index];
-                       next_p = TAILQ_FIRST(&current_bucket->list);
-               }
-       }
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);
 
-       return next_p;
-}
+       current_time = mach_absolute_time();
 
-static proc_t
-memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search)
-{
-       memstat_bucket_t *current_bucket;
-       proc_t next_p;
+       proc_list_lock();
 
-       if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
-               return NULL;
-       }
+       demote_prio_band = JETSAM_PRIORITY_IDLE + 1;
 
-       next_p = TAILQ_NEXT(p, p_memstat_list);
-       while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
-               current_bucket = &memstat_bucket[*bucket_index];
-               next_p = TAILQ_FIRST(&current_bucket->list);
-       }
+       for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) {
+               if (demote_prio_band != system_procs_aging_band && demote_prio_band != applications_aging_band) {
+                       continue;
+               }
 
-       return next_p;
-}
-
-/*
- * Structure to hold state for a jetsam thread.
- * Typically there should be a single jetsam thread
- * unless parallel jetsam is enabled.
- */
-struct jetsam_thread_state {
-       boolean_t       inited; /* if the thread is initialized */
-       int             memorystatus_wakeup; /* wake channel */
-       int             index; /* jetsam thread index */
-       thread_t        thread; /* jetsam thread pointer */
-} *jetsam_threads;
+               demotion_bucket = &memstat_bucket[demote_prio_band];
+               p = TAILQ_FIRST(&demotion_bucket->list);
 
-/* Maximum number of jetsam threads allowed */
-#define JETSAM_THREADS_LIMIT   3
+               while (p) {
+                       MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid);
 
-/* Number of active jetsam threads */
-_Atomic int active_jetsam_threads = 1;
+                       assert(p->p_memstat_idledeadline);
 
-/* Number of maximum jetsam threads configured */
-int max_jetsam_threads = JETSAM_THREADS_LIMIT;
+                       assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
 
-/*
- * Global switch for enabling fast jetsam. Fast jetsam is
- * hooked up via the system_override() system call. It has the
- * following effects:
- * - Raise the jetsam threshold ("clear-the-deck")
- * - Enabled parallel jetsam on eligible devices
- */
-int fast_jetsam_enabled = 0;
+                       if (current_time >= p->p_memstat_idledeadline) {
+                               if ((isSysProc(p) &&
+                                   ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) != P_DIRTY_IDLE_EXIT_ENABLED)) || /* system proc marked dirty*/
+                                   task_has_assertions((struct task *)(p->task))) {     /* has outstanding assertions which might indicate outstanding work too */
+                                       idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p);
 
-/* Routine to find the jetsam state structure for the current jetsam thread */
-static inline struct jetsam_thread_state *
-jetsam_current_thread(void)
-{
-       for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) {
-               if (jetsam_threads[thr_id].thread == current_thread()) {
-                       return &(jetsam_threads[thr_id]);
-               }
-       }
-       panic("jetsam_current_thread() is being called from a non-jetsam thread\n");
-       /* Control should not reach here */
-       return NULL;
-}
+                                       p->p_memstat_idledeadline += idle_delay_time;
+                                       p = TAILQ_NEXT(p, p_memstat_list);
+                               } else {
+                                       proc_t next_proc = NULL;
 
+                                       next_proc = TAILQ_NEXT(p, p_memstat_list);
+                                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
 
-__private_extern__ void
-memorystatus_init(void)
-{
-       kern_return_t result;
-       int i;
+                                       memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false, true);
 
-#if CONFIG_FREEZE
-       memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_UI_SUPPORT;
-       memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX;
-       memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */
-       memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4);
-       memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
-       memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
-       memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS;
-       memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD;
-#endif
+                                       p = next_proc;
+                                       continue;
+                               }
+                       } else {
+                               // No further candidates
+                               break;
+                       }
+               }
+       }
 
-#if DEVELOPMENT || DEBUG
-       disconnect_page_mappings_lck_grp_attr = lck_grp_attr_alloc_init();
-       disconnect_page_mappings_lck_grp = lck_grp_alloc_init("disconnect_page_mappings", disconnect_page_mappings_lck_grp_attr);
+       memorystatus_reschedule_idle_demotion_locked();
 
-       lck_mtx_init(&disconnect_page_mappings_mutex, disconnect_page_mappings_lck_grp, NULL);
+       proc_list_unlock();
 
-       if (kill_on_no_paging_space == TRUE) {
-               max_kill_priority = JETSAM_PRIORITY_MAX;
-       }
-#endif
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
+}
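
The loop above shows the list-mutation idiom this file relies on: when the current element may be demoted (and therefore moved out of the bucket) mid-walk, the successor is captured with TAILQ_NEXT() before the mutation. A minimal, self-contained userland sketch of the same idiom follows (the node type, expire_nodes(), and the deadline values are hypothetical, not part of this diff):

#include <stdlib.h>
#include <sys/queue.h>

struct node {
        int deadline;
        TAILQ_ENTRY(node) link;
};

TAILQ_HEAD(node_list, node);

/* Walk the list, removing every entry whose deadline has passed.
 * TAILQ_NEXT() is read *before* TAILQ_REMOVE(), exactly as the
 * demotion loop saves next_proc before invalidating the current proc. */
static void
expire_nodes(struct node_list *head, int now)
{
        struct node *n = TAILQ_FIRST(head);

        while (n) {
                struct node *next = TAILQ_NEXT(n, link);

                if (now >= n->deadline) {
                        TAILQ_REMOVE(head, n, link);
                        free(n);
                }
                n = next;
        }
}

int
main(void)
{
        struct node_list head = TAILQ_HEAD_INITIALIZER(head);

        for (int i = 0; i < 4; i++) {
                struct node *n = malloc(sizeof(*n));
                n->deadline = i;
                TAILQ_INSERT_TAIL(&head, n, link);
        }
        expire_nodes(&head, 2);  /* drops the entries with deadlines 0, 1, 2 */
        return 0;
}
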
 
+static void
+memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
+{
+       boolean_t present_in_sysprocs_aging_bucket = FALSE;
+       boolean_t present_in_apps_aging_bucket = FALSE;
+       uint64_t  idle_delay_time = 0;
 
-       /* Init buckets */
-       for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
-               TAILQ_INIT(&memstat_bucket[i].list);
-               memstat_bucket[i].count = 0;
+       if (jetsam_aging_policy == kJetsamAgingPolicyNone) {
+               return;
        }
-       memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);
-
-#if CONFIG_JETSAM
-       nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
-       nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
 
-       /* Apply overrides */
-       PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
-       if (delta_percentage == 0) {
-               delta_percentage = 5;
+       if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) ||
+           (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION)) {
+               /*
+                * This process isn't going to be making the trip to the lower bands.
+                */
+               return;
        }
-       assert(delta_percentage < 100);
-       PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
-       assert(critical_threshold_percentage < 100);
-       PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
-       assert(idle_offset_percentage < 100);
-       PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
-       assert(pressure_threshold_percentage < 100);
-       PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
-       assert(freeze_threshold_percentage < 100);
 
-       if (!PE_parse_boot_argn("jetsam_aging_policy", &jetsam_aging_policy,
-           sizeof(jetsam_aging_policy))) {
-               if (!PE_get_default("kern.jetsam_aging_policy", &jetsam_aging_policy,
-                   sizeof(jetsam_aging_policy))) {
-                       jetsam_aging_policy = kJetsamAgingPolicyLegacy;
+       if (isProcessInAgingBands(p)) {
+               if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
+                       assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) != P_DIRTY_AGING_IN_PROGRESS);
                }
-       }
 
-       if (jetsam_aging_policy > kJetsamAgingPolicyMax) {
-               jetsam_aging_policy = kJetsamAgingPolicyLegacy;
+               if (isSysProc(p) && system_procs_aging_band) {
+                       present_in_sysprocs_aging_bucket = TRUE;
+               } else if (isApp(p) && applications_aging_band) {
+                       present_in_apps_aging_bucket = TRUE;
+               }
        }
 
-       switch (jetsam_aging_policy) {
-       case kJetsamAgingPolicyNone:
-               system_procs_aging_band = JETSAM_PRIORITY_IDLE;
-               applications_aging_band = JETSAM_PRIORITY_IDLE;
-               break;
-
-       case kJetsamAgingPolicyLegacy:
-               /*
-                * Legacy behavior where some daemons get a 10s protection once
-                * AND only before the first clean->dirty->clean transition before
-                * going into IDLE band.
-                */
-               system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
-               applications_aging_band = JETSAM_PRIORITY_IDLE;
-               break;
+       assert(!present_in_sysprocs_aging_bucket);
+       assert(!present_in_apps_aging_bucket);
 
-       case kJetsamAgingPolicySysProcsReclaimedFirst:
-               system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
-               applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
-               break;
+       MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n",
+           p->p_pid, p->p_memstat_dirty, set_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));
 
-       case kJetsamAgingPolicyAppsReclaimedFirst:
-               system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2;
-               applications_aging_band = JETSAM_PRIORITY_AGING_BAND1;
-               break;
+       if (isSysProc(p)) {
+               assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED);
+       }
 
-       default:
-               break;
+       idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p);
+       if (set_state) {
+               p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS;
+               p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time;
        }
 
-       /*
-        * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE
-        * band and must be below it in priority. This is so that we don't have to make
-        * our 'aging' code worry about a mix of processes, some of which need to age
-        * and some others that need to stay elevated in the jetsam bands.
-        */
-       assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band);
-       assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band);
+       assert(p->p_memstat_idledeadline);
 
-       /* Take snapshots for idle-exit kills by default? First check the boot-arg... */
-       if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) {
-               /* ...no boot-arg, so check the device tree */
-               PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));
+       if (isSysProc(p) && present_in_sysprocs_aging_bucket == FALSE) {
+               memorystatus_scheduled_idle_demotions_sysprocs++;
+       } else if (isApp(p) && present_in_apps_aging_bucket == FALSE) {
+               memorystatus_scheduled_idle_demotions_apps++;
        }
+}
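
p_memstat_idledeadline is kept in mach absolute-time units; the kernel derives the delay via nanoseconds_to_absolutetime() and anchors it at mach_absolute_time(). A hedged userland sketch of the equivalent deadline arithmetic, using mach_timebase_info() since the kernel conversion routine is not available outside the kernel (the 10-second delay is an illustrative stand-in, not necessarily the configured value):

#include <mach/mach_time.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);

        /* Convert 10 seconds of nanoseconds into absolute-time ticks
         * (abs = ns * denom / numer), then anchor the deadline at "now",
         * mirroring memorystatus_schedule_idle_demotion_locked(). */
        uint64_t delay_ns  = 10ULL * 1000000000ULL;
        uint64_t delay_abs = delay_ns * tb.denom / tb.numer;
        uint64_t deadline  = mach_absolute_time() + delay_abs;

        printf("deadline (abs ticks): %llu\n", (unsigned long long)deadline);
        return 0;
}
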
 
-       memorystatus_delta = delta_percentage * atop_64(max_mem) / 100;
-       memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100;
-       memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta;
-       memorystatus_policy_more_free_offset_pages = (policy_more_free_offset_percentage / delta_percentage) * memorystatus_delta;
+void
+memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
+{
+       boolean_t present_in_sysprocs_aging_bucket = FALSE;
+       boolean_t present_in_apps_aging_bucket = FALSE;
 
-       /* Jetsam Loop Detection */
-       if (max_mem <= (512 * 1024 * 1024)) {
-               /* 512 MB devices */
-               memorystatus_jld_eval_period_msecs = 8000;      /* 8000 msecs == 8 second window */
-       } else {
-               /* 1GB and larger devices */
-               memorystatus_jld_eval_period_msecs = 6000;      /* 6000 msecs == 6 second window */
+       if (!system_procs_aging_band && !applications_aging_band) {
+               return;
        }
 
-       memorystatus_jld_enabled = TRUE;
+       if ((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == 0) {
+               return;
+       }
 
-       /* No contention at this point */
-       memorystatus_update_levels_locked(FALSE);
+       if (isProcessInAgingBands(p)) {
+               if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
+                       assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == P_DIRTY_AGING_IN_PROGRESS);
+               }
 
-#endif /* CONFIG_JETSAM */
+               if (isSysProc(p) && system_procs_aging_band) {
+                       assert(p->p_memstat_effectivepriority == system_procs_aging_band);
+                       assert(p->p_memstat_idledeadline);
+                       present_in_sysprocs_aging_bucket = TRUE;
+               } else if (isApp(p) && applications_aging_band) {
+                       assert(p->p_memstat_effectivepriority == applications_aging_band);
+                       assert(p->p_memstat_idledeadline);
+                       present_in_apps_aging_bucket = TRUE;
+               }
+       }
 
-       memorystatus_jetsam_snapshot_max = maxproc;
+       MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for pid %d (clear_state %d, demotions %d).\n",
+           p->p_pid, clear_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));
 
-       memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
-           (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
 
-       memorystatus_jetsam_snapshot =
-           (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size);
-       if (!memorystatus_jetsam_snapshot) {
-               panic("Could not allocate memorystatus_jetsam_snapshot");
+       if (clear_state) {
+               p->p_memstat_idledeadline = 0;
+               p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS;
        }
 
-       memorystatus_jetsam_snapshot_copy =
-           (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size);
-       if (!memorystatus_jetsam_snapshot_copy) {
-               panic("Could not allocate memorystatus_jetsam_snapshot_copy");
+       if (isSysProc(p) && present_in_sysprocs_aging_bucket == TRUE) {
+               memorystatus_scheduled_idle_demotions_sysprocs--;
+               assert(memorystatus_scheduled_idle_demotions_sysprocs >= 0);
+       } else if (isApp(p) && present_in_apps_aging_bucket == TRUE) {
+               memorystatus_scheduled_idle_demotions_apps--;
+               assert(memorystatus_scheduled_idle_demotions_apps >= 0);
        }
 
-       nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);
-
-       memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));
-
-#if CONFIG_FREEZE
-       memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta;
-#endif
+       assert((memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps) >= 0);
+}
 
-       /* Check the boot-arg to see if fast jetsam is allowed */
-       if (!PE_parse_boot_argn("fast_jetsam_enabled", &fast_jetsam_enabled, sizeof(fast_jetsam_enabled))) {
-               fast_jetsam_enabled = 0;
-       }
+static void
+memorystatus_reschedule_idle_demotion_locked(void)
+{
+       if (0 == (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)) {
+               if (memstat_idle_demotion_deadline) {
+                       /* Transitioned 1->0, so cancel next call */
+                       thread_call_cancel(memorystatus_idle_demotion_call);
+                       memstat_idle_demotion_deadline = 0;
+               }
+       } else {
+               memstat_bucket_t *demotion_bucket;
+               proc_t p = NULL, p1 = NULL, p2 = NULL;
 
-       /* Check the boot-arg to configure the maximum number of jetsam threads */
-       if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof(max_jetsam_threads))) {
-               max_jetsam_threads = JETSAM_THREADS_LIMIT;
-       }
+               if (system_procs_aging_band) {
+                       demotion_bucket = &memstat_bucket[system_procs_aging_band];
+                       p1 = TAILQ_FIRST(&demotion_bucket->list);
 
-       /* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */
-       if (max_jetsam_threads > JETSAM_THREADS_LIMIT) {
-               max_jetsam_threads = JETSAM_THREADS_LIMIT;
-       }
+                       p = p1;
+               }
 
-       /* For low CPU systems disable fast jetsam mechanism */
-       if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
-               max_jetsam_threads = 1;
-               fast_jetsam_enabled = 0;
-       }
+               if (applications_aging_band) {
+                       demotion_bucket = &memstat_bucket[applications_aging_band];
+                       p2 = TAILQ_FIRST(&demotion_bucket->list);
 
-       /* Initialize the jetsam_threads state array */
-       jetsam_threads = kalloc(sizeof(struct jetsam_thread_state) * max_jetsam_threads);
+                       if (p1 && p2) {
+                               p = (p1->p_memstat_idledeadline > p2->p_memstat_idledeadline) ? p2 : p1;
+                       } else {
+                               p = (p1 == NULL) ? p2 : p1;
+                       }
+               }
 
-       /* Initialize all the jetsam threads */
-       for (i = 0; i < max_jetsam_threads; i++) {
-               result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread);
-               if (result == KERN_SUCCESS) {
-                       jetsam_threads[i].inited = FALSE;
-                       jetsam_threads[i].index = i;
-                       thread_deallocate(jetsam_threads[i].thread);
-               } else {
-                       panic("Could not create memorystatus_thread %d", i);
+               assert(p);
+
+               if (p != NULL) {
+                       assert(p && p->p_memstat_idledeadline);
+                       if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline) {
+                               thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
+                               memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
+                       }
                }
        }
 }
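
The else-branch above arms a single thread call with the soonest deadline across the two aging buckets. That head-of-bucket comparison reduces to the small helper below (a sketch with a hypothetical fake_proc type; the kernel inlines this logic rather than calling a helper):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct fake_proc {
        uint64_t idledeadline;  /* stand-in for p_memstat_idledeadline */
};

/* Pick whichever bucket head expires sooner, tolerating an empty
 * bucket on either side, just as the reschedule path does when only
 * one aging band is configured. */
static struct fake_proc *
earliest_deadline(struct fake_proc *p1, struct fake_proc *p2)
{
        if (p1 && p2) {
                return (p1->idledeadline > p2->idledeadline) ? p2 : p1;
        }
        return (p1 == NULL) ? p2 : p1;
}

int
main(void)
{
        struct fake_proc a = { .idledeadline = 100 };
        struct fake_proc b = { .idledeadline =  50 };

        assert(earliest_deadline(&a, &b) == &b);
        assert(earliest_deadline(&a, NULL) == &a);
        assert(earliest_deadline(NULL, NULL) == NULL);
        return 0;
}
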
 
-/* Centralised for the purposes of allowing panic-on-jetsam */
-extern void
-vm_run_compactor(void);
-
 /*
- * The jetsam no frills kill call
- *      Return: 0 on success
- *             error code on failure (EINVAL...)
+ * List manipulation
  */
-static int
-jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason)
-{
-       int error = 0;
-       error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason);
-       return error;
-}
 
-/*
- * Wrapper for processes exiting with memorystatus details
- */
-static boolean_t
-memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason)
+int
+memorystatus_add(proc_t p, boolean_t locked)
 {
-       int error = 0;
-       __unused pid_t victim_pid = p->p_pid;
+       memstat_bucket_t *bucket;
 
-       KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START,
-           victim_pid, cause, vm_page_free_count, 0, 0);
+       MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority);
 
-       DTRACE_MEMORYSTATUS3(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause);
-#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
-       if (memorystatus_jetsam_panic_debug & (1 << cause)) {
-               panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause);
+       if (!locked) {
+               proc_list_lock();
        }
-#else
-#pragma unused(cause)
-#endif
 
-       if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
-               printf("memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n", p->p_pid,
-                   (*p->p_name ? p->p_name : "unknown"),
-                   memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority,
-                   (uint64_t)memorystatus_available_pages);
+       DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority);
+
+       /* Processes marked internal do not have priority tracked */
+       if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
+               goto exit;
        }
 
        /*
-        * The jetsam_reason (os_reason_t) has enough information about the kill cause.
-        * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped.
+        * Opt out system processes from being frozen by default.
+        * For coalition-based freezing, we only want to freeze sysprocs that have specifically opted in.
         */
-       int jetsam_flags = P_LTERM_JETSAM;
-       switch (cause) {
-       case kMemorystatusKilledHiwat:                                          jetsam_flags |= P_JETSAM_HIWAT; break;
-       case kMemorystatusKilledVnodes:                                         jetsam_flags |= P_JETSAM_VNODE; break;
-       case kMemorystatusKilledVMPageShortage:                         jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
-       case kMemorystatusKilledVMCompressorThrashing:
-       case kMemorystatusKilledVMCompressorSpaceShortage:      jetsam_flags |= P_JETSAM_VMTHRASHING; break;
-       case kMemorystatusKilledFCThrashing:                            jetsam_flags |= P_JETSAM_FCTHRASHING; break;
-       case kMemorystatusKilledPerProcessLimit:                        jetsam_flags |= P_JETSAM_PID; break;
-       case kMemorystatusKilledIdleExit:                                       jetsam_flags |= P_JETSAM_IDLEEXIT; break;
+       if (isSysProc(p)) {
+               p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED;
        }
-       error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
 
-       KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END,
-           victim_pid, cause, vm_page_free_count, error, 0);
+       bucket = &memstat_bucket[p->p_memstat_effectivepriority];
 
-       vm_run_compactor();
+       if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
+               assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs - 1);
+       } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
+               assert(bucket->count == memorystatus_scheduled_idle_demotions_apps - 1);
+       } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
+               /*
+                * Entering the idle band.
+                * Record idle start time.
+                */
+               p->p_memstat_idle_start = mach_absolute_time();
+       }
 
-       return error == 0;
-}
+       TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
+       bucket->count++;
+       if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
+               bucket->relaunch_high_count++;
+       }
 
-/*
- * Node manipulation
- */
+       memorystatus_list_count++;
 
-static void
-memorystatus_check_levels_locked(void)
-{
-#if CONFIG_JETSAM
-       /* Update levels */
-       memorystatus_update_levels_locked(TRUE);
-#else /* CONFIG_JETSAM */
-       /*
-        * Nothing to do here currently since we update
-        * memorystatus_available_pages in vm_pressure_response.
-        */
-#endif /* CONFIG_JETSAM */
+       memorystatus_check_levels_locked();
+
+exit:
+       if (!locked) {
+               proc_list_unlock();
+       }
+
+       return 0;
 }
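
memorystatus_add() keeps each band's count and relaunch_high_count in lockstep with the contents of its TAILQ. A compact userland sketch of that bucket bookkeeping (the mproc and bucket types are hypothetical, and BUCKET_COUNT stands in for the real MEMSTAT_BUCKET_COUNT):

#include <stdbool.h>
#include <sys/queue.h>

#define BUCKET_COUNT 16  /* stand-in for MEMSTAT_BUCKET_COUNT */

struct mproc {
        int  priority;
        bool relaunch_high;
        TAILQ_ENTRY(mproc) list;
};

struct bucket {
        TAILQ_HEAD(, mproc) list;
        int count;
        int relaunch_high_count;
};

static struct bucket buckets[BUCKET_COUNT];

/* Append at the tail (oldest-first LRU order within a band) and keep
 * both counters consistent, as memorystatus_add() does. */
static void
bucket_add(struct mproc *p)
{
        struct bucket *b = &buckets[p->priority];

        TAILQ_INSERT_TAIL(&b->list, p, list);
        b->count++;
        if (p->relaunch_high) {
                b->relaunch_high_count++;
        }
}

int
main(void)
{
        for (int i = 0; i < BUCKET_COUNT; i++) {
                TAILQ_INIT(&buckets[i].list);
        }
        struct mproc p = { .priority = 3, .relaunch_high = true };
        bucket_add(&p);
        return 0;
}
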
 
 /*
- * Pin a process to a particular jetsam band when it is in the background i.e. not doing active work.
- * For an application: that means no longer in the FG band
- * For a daemon: that means no longer in its 'requested' jetsam priority band
+ * Description:
+ *     Moves a process from one jetsam bucket to another,
+ *     which changes the LRU position of the process.
+ *
+ *     Monitors transition between buckets and if necessary
+ *     will update cached memory limits accordingly.
+ *
+ *     skip_demotion_check:
+ *     - if the 'jetsam aging policy' is NOT 'legacy':
+ *             When this flag is TRUE, it means we are going
+ *             to age the ripe processes out of the aging bands and into the
+ *             IDLE band and apply their inactive memory limits.
+ *
+ *     - if the 'jetsam aging policy' is 'legacy':
+ *             When this flag is TRUE, it might mean the above aging mechanism
+ *             OR
+ *             It might be that we have a process that has used up its 'idle deferral'
+ *             stay that is given to it once per lifetime. And in this case, the process
+ *             won't be going through any aging codepaths. But we still need to apply
+ *             the right inactive limits and so we explicitly set this to TRUE if the
+ *             new priority for the process is the IDLE band.
  */
-
-int
-memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now)
+void
+memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check)
 {
-       int error = 0;
-       boolean_t enable = FALSE;
-       proc_t  p = NULL;
+       memstat_bucket_t *old_bucket, *new_bucket;
 
-       if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) {
-               enable = TRUE;
-       } else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) {
-               enable = FALSE;
-       } else {
-               return EINVAL;
+       assert(priority < MEMSTAT_BUCKET_COUNT);
+
+       /* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
+       if ((p->p_listflag & P_LIST_EXITED) != 0) {
+               return;
        }
 
-       p = proc_find(pid);
-       if (p != NULL) {
-               if ((enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) ||
-                   (!enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == 0))) {
+       MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting %s(%d) to priority %d, inserting at %s\n",
+           (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, head_insert ? "head" : "tail");
+
+       DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority);
+
+       old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
+
+       if (skip_demotion_check == FALSE) {
+               if (isSysProc(p)) {
                        /*
-                        * No change in state.
+                        * For system processes, the memorystatus_dirty_* routines take care of adding/removing
+                        * the processes from the aging bands and balancing the demotion counts.
+                        * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute.
                         */
-               } else {
-                       proc_list_lock();
-
-                       if (enable) {
-                               p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
-                               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
 
-                               if (effective_now) {
-                                       if (p->p_memstat_effectivepriority < jetsam_prio) {
-                                               if (memorystatus_highwater_enabled) {
-                                                       /*
-                                                        * Process is about to transition from
-                                                        * inactive --> active
-                                                        * assign active state
-                                                        */
-                                                       boolean_t is_fatal;
-                                                       boolean_t use_active = TRUE;
-                                                       CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
-                                                       task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
-                                               }
-                                               memorystatus_update_priority_locked(p, jetsam_prio, FALSE, FALSE);
-                                       }
-                               } else {
-                                       if (isProcessInAgingBands(p)) {
-                                               memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
+                       if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
+                               /*
+                                * 2 types of processes can use the non-standard elevated inactive band:
+                                * - Frozen processes that always land in memorystatus_freeze_jetsam_band
+                                * OR
+                                * - processes that specifically opt-in to the elevated inactive support e.g. docked processes.
+                                */
+#if CONFIG_FREEZE
+                               if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
+                                       if (priority <= memorystatus_freeze_jetsam_band) {
+                                               priority = memorystatus_freeze_jetsam_band;
+                                       }
+                               } else
+#endif /* CONFIG_FREEZE */
+                               {
+                                       if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
+                                               priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
                                        }
                                }
-                       } else {
-                               p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
-                               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+                               assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
+                       }
+               } else if (isApp(p)) {
+                       /*
+                        * Check to see if the application is being lowered in jetsam priority. If so, and:
+                        * - it has an 'elevated inactive jetsam band' attribute, then put it in the appropriate band.
+                        * - it is a normal application, then let it age in the aging band if that policy is in effect.
+                        */
 
-                               if (effective_now) {
-                                       if (p->p_memstat_effectivepriority == jetsam_prio) {
-                                               memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
+                       if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
+#if CONFIG_FREEZE
+                               if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
+                                       if (priority <= memorystatus_freeze_jetsam_band) {
+                                               priority = memorystatus_freeze_jetsam_band;
                                        }
-                               } else {
-                                       if (isProcessInAgingBands(p)) {
-                                               memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
+                               } else
+#endif /* CONFIG_FREEZE */
+                               {
+                                       if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
+                                               priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
                                        }
                                }
-                       }
+                       } else {
+                               if (applications_aging_band) {
+                                       if (p->p_memstat_effectivepriority == applications_aging_band) {
+                                               assert(old_bucket->count == (memorystatus_scheduled_idle_demotions_apps + 1));
+                                       }
 
-                       proc_list_unlock();
+                                       if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && (priority <= applications_aging_band)) {
+                                               assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
+                                               priority = applications_aging_band;
+                                               memorystatus_schedule_idle_demotion_locked(p, TRUE);
+                                       }
+                               }
+                       }
                }
-               proc_rele(p);
-               error = 0;
-       } else {
-               error = ESRCH;
        }
 
-       return error;
-}
-
-static void
-memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
-{
-       proc_t p;
-       uint64_t current_time = 0, idle_delay_time = 0;
-       int demote_prio_band = 0;
-       memstat_bucket_t *demotion_bucket;
-
-       MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);
-
-       current_time = mach_absolute_time();
-
-       proc_list_lock();
-
-       demote_prio_band = JETSAM_PRIORITY_IDLE + 1;
-
-       for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) {
-               if (demote_prio_band != system_procs_aging_band && demote_prio_band != applications_aging_band) {
-                       continue;
-               }
-
-               demotion_bucket = &memstat_bucket[demote_prio_band];
-               p = TAILQ_FIRST(&demotion_bucket->list);
-
-               while (p) {
-                       MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid);
+       if ((system_procs_aging_band && (priority == system_procs_aging_band)) || (applications_aging_band && (priority == applications_aging_band))) {
+               assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
+       }
 
-                       assert(p->p_memstat_idledeadline);
+#if DEVELOPMENT || DEBUG
+       if (priority == JETSAM_PRIORITY_IDLE && /* if the process is on its way into the IDLE band */
+           skip_demotion_check == FALSE &&     /* and it isn't via the path that will set the INACTIVE memlimits */
+           (p->p_memstat_dirty & P_DIRTY_TRACK) && /* and it has 'DIRTY' tracking enabled */
+           ((p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) || /* and we notice that the current limit isn't the right value (inactive) */
+           ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) ? (!(p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)) : (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)))) { /* OR type (fatal vs non-fatal) */
+               printf("memorystatus_update_priority_locked: on %s with 0x%x, prio: %d and %d\n", p->p_name, p->p_memstat_state, priority, p->p_memstat_memlimit); /* then we must catch this */
+       }
+#endif /* DEVELOPMENT || DEBUG */
 
-                       assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
+       TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
+       old_bucket->count--;
+       if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
+               old_bucket->relaunch_high_count--;
+       }
 
-                       if (current_time >= p->p_memstat_idledeadline) {
-                               if ((isSysProc(p) &&
-                                   ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) != P_DIRTY_IDLE_EXIT_ENABLED)) || /* system proc marked dirty*/
-                                   task_has_assertions((struct task *)(p->task))) {     /* has outstanding assertions which might indicate outstanding work too */
-                                       idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_delay_time : memorystatus_apps_idle_delay_time;
+       new_bucket = &memstat_bucket[priority];
+       if (head_insert) {
+               TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
+       } else {
+               TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
+       }
+       new_bucket->count++;
+       if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
+               new_bucket->relaunch_high_count++;
+       }
 
-                                       p->p_memstat_idledeadline += idle_delay_time;
-                                       p = TAILQ_NEXT(p, p_memstat_list);
-                               } else {
-                                       proc_t next_proc = NULL;
+       if (memorystatus_highwater_enabled) {
+               boolean_t is_fatal;
+               boolean_t use_active;
 
-                                       next_proc = TAILQ_NEXT(p, p_memstat_list);
-                                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+               /*
+                * If cached limit data is updated, then the limits
+                * will be enforced by writing to the ledgers.
+                */
+               boolean_t ledger_update_needed = TRUE;
 
-                                       memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false, true);
+               /*
+                * Here, we must update the cached memory limit if the task
+                * is transitioning between:
+                *      active <--> inactive
+                *      FG     <-->       BG
+                * but:
+                *      dirty  <-->    clean   is ignored
+                *
+                * We bypass non-idle processes that have opted into dirty tracking because
+                * a move between buckets does not imply a transition between the
+                * dirty <--> clean state.
+                */
 
-                                       p = next_proc;
-                                       continue;
-                               }
+               if (p->p_memstat_dirty & P_DIRTY_TRACK) {
+                       if (skip_demotion_check == TRUE && priority == JETSAM_PRIORITY_IDLE) {
+                               CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
+                               use_active = FALSE;
                        } else {
-                               // No further candidates
-                               break;
+                               ledger_update_needed = FALSE;
                        }
+               } else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) {
+                       /*
+                        *      inactive --> active
+                        *      BG       -->     FG
+                        *      assign active state
+                        */
+                       CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
+                       use_active = TRUE;
+               } else if ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
+                       /*
+                        *      active --> inactive
+                        *      FG     -->       BG
+                        *      assign inactive state
+                        */
+                       CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
+                       use_active = FALSE;
+               } else {
+                       /*
+                        * The transition between jetsam priority buckets apparently did
+                        * not affect active/inactive state.
+                        * This is not unusual... especially during startup when
+                        * processes are getting established in their respective bands.
+                        */
+                       ledger_update_needed = FALSE;
                }
-       }
-
-       memorystatus_reschedule_idle_demotion_locked();
-
-       proc_list_unlock();
 
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
-}
-
-static void
-memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
-{
-       boolean_t present_in_sysprocs_aging_bucket = FALSE;
-       boolean_t present_in_apps_aging_bucket = FALSE;
-       uint64_t  idle_delay_time = 0;
+               /*
+                * Enforce the new limits by writing to the ledger
+                */
+               if (ledger_update_needed) {
+                       task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
 
-       if (jetsam_aging_policy == kJetsamAgingPolicyNone) {
-               return;
+                       MEMORYSTATUS_DEBUG(3, "memorystatus_update_priority_locked: new limit on pid %d (%dMB %s) priority old --> new (%d --> %d) dirty?=0x%x %s\n",
+                           p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
+                           (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, priority, p->p_memstat_dirty,
+                           (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
+               }
        }
 
-       if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
+       /*
+        * Record idle start or idle delta.
+        */
+       if (p->p_memstat_effectivepriority == priority) {
                /*
-                * This process isn't going to be making the trip to the lower bands.
+                * This process is not transitioning between
+                * jetsam priority buckets.  Do nothing.
                 */
-               return;
-       }
-
-       if (isProcessInAgingBands(p)) {
-               if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
-                       assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) != P_DIRTY_AGING_IN_PROGRESS);
+       } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
+               uint64_t now;
+               /*
+                * Transitioning out of the idle priority bucket.
+                * Record idle delta.
+                */
+               assert(p->p_memstat_idle_start != 0);
+               now = mach_absolute_time();
+               if (now > p->p_memstat_idle_start) {
+                       p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
                }
 
-               if (isSysProc(p) && system_procs_aging_band) {
-                       present_in_sysprocs_aging_bucket = TRUE;
-               } else if (isApp(p) && applications_aging_band) {
-                       present_in_apps_aging_bucket = TRUE;
+               /*
+                * About to become active and so memory footprint could change.
+                * So mark it eligible for freeze-considerations next time around.
+                */
+               if (p->p_memstat_state & P_MEMSTAT_FREEZE_IGNORE) {
+                       p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE;
                }
+       } else if (priority == JETSAM_PRIORITY_IDLE) {
+               /*
+                * Transitioning into the idle priority bucket.
+                * Record idle start.
+                */
+               p->p_memstat_idle_start = mach_absolute_time();
        }
 
-       assert(!present_in_sysprocs_aging_bucket);
-       assert(!present_in_apps_aging_bucket);
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), p->p_pid, priority, p->p_memstat_effectivepriority, 0, 0);
 
-       MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n",
-           p->p_pid, p->p_memstat_dirty, set_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));
+       p->p_memstat_effectivepriority = priority;
 
-       if (isSysProc(p)) {
-               assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED);
+#if CONFIG_SECLUDED_MEMORY
+       if (secluded_for_apps &&
+           task_could_use_secluded_mem(p->task)) {
+               task_set_can_use_secluded_mem(
+                       p->task,
+                       (priority >= JETSAM_PRIORITY_FOREGROUND));
        }
+#endif /* CONFIG_SECLUDED_MEMORY */
 
-       idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_delay_time : memorystatus_apps_idle_delay_time;
+       memorystatus_check_levels_locked();
+}
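
The highwater branch above rewrites the ledger only when a bucket move actually crosses the active/inactive (foreground) boundary, with dirty-tracked processes special-cased. Collapsed into a standalone predicate, the decision looks roughly like this (a sketch: FG stands in for JETSAM_PRIORITY_FOREGROUND, and demoting_to_idle abbreviates the skip_demotion_check && priority == JETSAM_PRIORITY_IDLE case):

#include <assert.h>
#include <stdbool.h>

enum limit_kind { LIMIT_NONE, LIMIT_ACTIVE, LIMIT_INACTIVE };

#define FG 10  /* stand-in for JETSAM_PRIORITY_FOREGROUND */

/* Mirrors the ledger_update_needed logic: dirty-tracked processes only
 * take the inactive limit on the explicit idle-demotion path; everyone
 * else updates the ledger only when crossing the foreground boundary. */
static enum limit_kind
limit_for_move(int old_prio, int new_prio, bool dirty_track, bool demoting_to_idle)
{
        if (dirty_track) {
                return demoting_to_idle ? LIMIT_INACTIVE : LIMIT_NONE;
        }
        if (new_prio >= FG && old_prio < FG) {
                return LIMIT_ACTIVE;    /* inactive --> active */
        }
        if (new_prio < FG && old_prio >= FG) {
                return LIMIT_INACTIVE;  /* active --> inactive */
        }
        return LIMIT_NONE;              /* no active/inactive change */
}

int
main(void)
{
        assert(limit_for_move(0, FG, false, false) == LIMIT_ACTIVE);
        assert(limit_for_move(FG, 0, false, false) == LIMIT_INACTIVE);
        assert(limit_for_move(5, 6, false, false) == LIMIT_NONE);
        assert(limit_for_move(5, 0, true, true) == LIMIT_INACTIVE);
        return 0;
}
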
 
-       if (set_state) {
-               p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS;
-               p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time;
-       }
-
-       assert(p->p_memstat_idledeadline);
-
-       if (isSysProc(p) && present_in_sysprocs_aging_bucket == FALSE) {
-               memorystatus_scheduled_idle_demotions_sysprocs++;
-       } else if (isApp(p) && present_in_apps_aging_bucket == FALSE) {
-               memorystatus_scheduled_idle_demotions_apps++;
-       }
+int
+memorystatus_relaunch_flags_update(proc_t p, int relaunch_flags)
+{
+       p->p_memstat_relaunch_flags = relaunch_flags;
+       KDBG(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_RELAUNCH_FLAGS), p->p_pid, relaunch_flags, 0, 0, 0);
+       return 0;
 }
 
-static void
-memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
-{
-       boolean_t present_in_sysprocs_aging_bucket = FALSE;
-       boolean_t present_in_apps_aging_bucket = FALSE;
+/*
+ *
+ * Description: Update the jetsam priority and memory limit attributes for a given process.
+ *
+ * Parameters:
+ *     p                 The process whose jetsam attributes are updated.
+ *     priority          The jetsam priority band
+ *     user_data         user-specific data, unused by the kernel
+ *     is_assertion      When true, a priority update is driven by an assertion.
+ *     effective         Guards against a race if the process's update has already occurred.
+ *     update_memlimit   When true we know this is the init step via the posix_spawn path.
+ *
+ *     memlimit_active   Value in megabytes; the monitored footprint level while the
+ *                       process is active.  Exceeding it may result in termination
+ *                       based on its associated fatal flag.
+ *
+ *     memlimit_active_is_fatal  When a process is active and exceeds its memory footprint,
+ *                               this describes whether or not it should be immediately fatal.
+ *
+ *     memlimit_inactive Value in megabytes; the monitored footprint level while the
+ *                       process is inactive.  Exceeding it may result in termination
+ *                       based on its associated fatal flag.
+ *
+ *     memlimit_inactive_is_fatal  When a process is inactive and exceeds its memory footprint,
+ *                                 this describes whether or not it should be immediately fatal.
+ *
+ * Returns:     0      Success
+ *             non-0   Failure
+ */
 
-       if (!system_procs_aging_band && !applications_aging_band) {
-               return;
-       }
+int
+memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t is_assertion, boolean_t effective, boolean_t update_memlimit,
+    int32_t memlimit_active, boolean_t memlimit_active_is_fatal,
+    int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal)
+{
+       int ret;
+       boolean_t head_insert = false;
 
-       if ((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == 0) {
-               return;
-       }
+       MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing (%s) pid %d: priority %d, user_data 0x%llx\n", (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, user_data);
 
-       if (isProcessInAgingBands(p)) {
-               if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
-                       assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == P_DIRTY_AGING_IN_PROGRESS);
-               }
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0);
 
-               if (isSysProc(p) && system_procs_aging_band) {
-                       assert(p->p_memstat_effectivepriority == system_procs_aging_band);
-                       assert(p->p_memstat_idledeadline);
-                       present_in_sysprocs_aging_bucket = TRUE;
-               } else if (isApp(p) && applications_aging_band) {
-                       assert(p->p_memstat_effectivepriority == applications_aging_band);
-                       assert(p->p_memstat_idledeadline);
-                       present_in_apps_aging_bucket = TRUE;
-               }
+       if (priority == -1) {
+               /* Use as shorthand for default priority */
+               priority = JETSAM_PRIORITY_DEFAULT;
+       } else if ((priority == system_procs_aging_band) || (priority == applications_aging_band)) {
+               /* Both the aging bands are reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
+               priority = JETSAM_PRIORITY_IDLE;
+       } else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
+               /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
+               priority = JETSAM_PRIORITY_IDLE;
+               head_insert = TRUE;
+       } else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
+               /* Sanity check */
+               ret = EINVAL;
+               goto out;
        }
 
-       MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for pid %d (clear_state %d, demotions %d).\n",
-           p->p_pid, clear_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));
+       proc_list_lock();
 
+       assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
 
-       if (clear_state) {
-               p->p_memstat_idledeadline = 0;
-               p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS;
+       if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
+               ret = EALREADY;
+               proc_list_unlock();
+               MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid);
+               goto out;
        }
 
-       if (isSysProc(p) && present_in_sysprocs_aging_bucket == TRUE) {
-               memorystatus_scheduled_idle_demotions_sysprocs--;
-               assert(memorystatus_scheduled_idle_demotions_sysprocs >= 0);
-       } else if (isApp(p) && present_in_apps_aging_bucket == TRUE) {
-               memorystatus_scheduled_idle_demotions_apps--;
-               assert(memorystatus_scheduled_idle_demotions_apps >= 0);
+       if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) {
+               /*
+                * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
+                */
+               ret = EBUSY;
+               proc_list_unlock();
+               goto out;
        }
 
-       assert((memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps) >= 0);
-}
+       p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
+       p->p_memstat_userdata = user_data;
 
-static void
-memorystatus_reschedule_idle_demotion_locked(void)
-{
-       if (0 == (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)) {
-               if (memstat_idle_demotion_deadline) {
-                       /* Transitioned 1->0, so cancel next call */
-                       thread_call_cancel(memorystatus_idle_demotion_call);
-                       memstat_idle_demotion_deadline = 0;
+       if (is_assertion) {
+               if (priority == JETSAM_PRIORITY_IDLE) {
+                       /*
+                        * Assertions relinquish control when the process is heading to IDLE.
+                        */
+                       if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
+                               /*
+                                * Mark the process as no longer being managed by assertions.
+                                */
+                               p->p_memstat_state &= ~P_MEMSTAT_PRIORITY_ASSERTION;
+                       } else {
+                               /*
+                                * Ignore an idle priority transition if the process is not
+                                * already managed by assertions.  We won't treat this as
+                                * an error, but we will log the unexpected behavior and bail.
+                                */
+                               os_log(OS_LOG_DEFAULT, "memorystatus: Ignore assertion driven idle priority. Process not previously controlled %s:%d\n",
+                                   (*p->p_name ? p->p_name : "unknown"), p->p_pid);
+
+                               ret = 0;
+                               proc_list_unlock();
+                               goto out;
+                       }
+               } else {
+                       /*
+                        * Process is now being managed by assertions,
+                        * Process is now being managed by assertions.
+                       p->p_memstat_state |= P_MEMSTAT_PRIORITY_ASSERTION;
                }
-       } else {
-               memstat_bucket_t *demotion_bucket;
-               proc_t p = NULL, p1 = NULL, p2 = NULL;
 
-               if (system_procs_aging_band) {
-                       demotion_bucket = &memstat_bucket[system_procs_aging_band];
-                       p1 = TAILQ_FIRST(&demotion_bucket->list);
+               /* Always update the assertion priority in this path */
 
-                       p = p1;
-               }
+               p->p_memstat_assertionpriority = priority;
 
-               if (applications_aging_band) {
-                       demotion_bucket = &memstat_bucket[applications_aging_band];
-                       p2 = TAILQ_FIRST(&demotion_bucket->list);
+               int memstat_dirty_flags = memorystatus_dirty_get(p, TRUE);  /* proc_list_lock is held */
 
-                       if (p1 && p2) {
-                               p = (p1->p_memstat_idledeadline > p2->p_memstat_idledeadline) ? p2 : p1;
+               if (memstat_dirty_flags != 0) {
+                       /*
+                        * Calculate maximum priority only when dirty tracking processes are involved.
+                        */
+                       int maxpriority;
+                       if (memstat_dirty_flags & PROC_DIRTY_IS_DIRTY) {
+                               maxpriority = MAX(p->p_memstat_assertionpriority, p->p_memstat_requestedpriority);
                        } else {
-                               p = (p1 == NULL) ? p2 : p1;
-                       }
-               }
+                               /* clean */
 
-               assert(p);
+                               if (memstat_dirty_flags & PROC_DIRTY_ALLOWS_IDLE_EXIT) {
+                                       /*
+                                        * The aging policy must be evaluated and applied here because runningboardd
+                                        * has relinquished its hold on the jetsam priority by attempting to move a
+                                        * clean process to the idle band.
+                                        */
 
-               if (p != NULL) {
-                       assert(p && p->p_memstat_idledeadline);
-                       if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline) {
-                               thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
-                               memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
+                                       int newpriority = JETSAM_PRIORITY_IDLE;
+                                       if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
+                                               newpriority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE;
+                                       }
+
+                                       maxpriority = MAX(p->p_memstat_assertionpriority, newpriority);
+
+                                       if (newpriority == system_procs_aging_band) {
+                                               memorystatus_schedule_idle_demotion_locked(p, FALSE);
+                                       }
+                               } else {
+                                       /*
+                                        * Preserves requestedpriority when the process does not support pressured exit.
+                                        */
+                                       maxpriority = MAX(p->p_memstat_assertionpriority, p->p_memstat_requestedpriority);
+                               }
                        }
+                       priority = maxpriority;
                }
+       } else {
+               p->p_memstat_requestedpriority = priority;
        }
-}
 
-/*
- * List manipulation
- */
+       if (update_memlimit) {
+               boolean_t is_fatal;
+               boolean_t use_active;
 
-int
-memorystatus_add(proc_t p, boolean_t locked)
-{
-       memstat_bucket_t *bucket;
+               /*
+                * Posix_spawn'd processes come through this path to instantiate ledger limits.
+                * Forked processes do not come through this path, so no ledger limits exist.
+                * (That's why forked processes can consume unlimited memory.)
+                */
 
-       MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority);
+               MEMORYSTATUS_DEBUG(3, "memorystatus_update(enter): pid %d, priority %d, dirty=0x%x, Active(%dMB %s), Inactive(%dMB, %s)\n",
+                   p->p_pid, priority, p->p_memstat_dirty,
+                   memlimit_active, (memlimit_active_is_fatal ? "F " : "NF"),
+                   memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF"));
 
-       if (!locked) {
-               proc_list_lock();
-       }
+               if (memlimit_active <= 0) {
+                       /*
+                        * This process will have a system_wide task limit when active.
+                        * System_wide task limit is always fatal.
+                        * It's quite common to see a non-fatal flag passed in here.
+                        * It's not an error, we just ignore it.
+                        */
 
-       DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority);
+                       /*
+                        * For backward compatibility with some unexplained launchd behavior,
+                        * we allow a zero sized limit.  But we still enforce system_wide limit
+                        * when written to the ledgers.
+                        */
 
-       /* Processes marked internal do not have priority tracked */
-       if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
-               goto exit;
-       }
+                       if (memlimit_active < 0) {
+                               memlimit_active = -1;  /* enforces system_wide task limit */
+                       }
+                       memlimit_active_is_fatal = TRUE;
+               }
 
-       bucket = &memstat_bucket[p->p_memstat_effectivepriority];
+               if (memlimit_inactive <= 0) {
+                       /*
+                        * This process will have a system_wide task limit when inactive.
+                        * System_wide task limit is always fatal.
+                        */
+
+                       memlimit_inactive = -1;
+                       memlimit_inactive_is_fatal = TRUE;
+               }
 
-       if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
-               assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs - 1);
-       } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
-               assert(bucket->count == memorystatus_scheduled_idle_demotions_apps - 1);
-       } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
                /*
-                * Entering the idle band.
-                * Record idle start time.
+                * Initialize the active limit variants for this process.
                 */
-               p->p_memstat_idle_start = mach_absolute_time();
-       }
-
-       TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
-       bucket->count++;
+               SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);
 
-       memorystatus_list_count++;
+               /*
+                * Initialize the inactive limit variants for this process.
+                */
+               SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);
 
-       memorystatus_check_levels_locked();
+               /*
+                * Initialize the cached limits for the target process.
+                * When the target process is dirty-tracked, it's typically
+                * in a clean state.  Non-dirty-tracked processes are
+                * typically active (Foreground or above).
+                * But just in case, we don't make assumptions...
+                */
 
-exit:
-       if (!locked) {
-               proc_list_unlock();
+               if (proc_jetsam_state_is_active_locked(p) == TRUE) {
+                       CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
+                       use_active = TRUE;
+               } else {
+                       CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
+                       use_active = FALSE;
+               }
+
+               /*
+                * Enforce the cached limit by writing to the ledger.
+                */
+               if (memorystatus_highwater_enabled) {
+                       /* apply now */
+                       task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal);
+
+                       MEMORYSTATUS_DEBUG(3, "memorystatus_update: init: limit on pid %d (%dMB %s) targeting priority(%d) dirty?=0x%x %s\n",
+                           p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
+                           (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), priority, p->p_memstat_dirty,
+                           (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
+               }
        }
 
-       return 0;
+       /*
+        * We can't add the process to the aging bands' buckets here,
+        * but we could be removing it from those buckets.
+        * Check and take appropriate steps if so.
+        */
+
+       if (isProcessInAgingBands(p)) {
+               if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && isApp(p) && (priority > applications_aging_band)) {
+                       /*
+                        * Runningboardd is pulling up an application that is in the aging band.
+                        * We reset the app's state here so that it'll get a fresh stay in the
+                        * aging band on the way back.
+                        *
+                        * We have always handled app 'aging' in the memorystatus_update_priority_locked()
+                        * function. Daemons used to be handled via the dirty 'set/clear/track' path.
+                        * But with extensions (daemon-app hybrid), runningboardd is now going through
+                        * this routine for daemons too and things have gotten a bit tangled. This should
+                        * be simplified/untangled at some point and might require some assistance from
+                        * runningboardd.
+                        */
+                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+               } else {
+                       memorystatus_invalidate_idle_demotion_locked(p, FALSE);
+               }
+               memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
+       } else {
+               if (jetsam_aging_policy == kJetsamAgingPolicyLegacy && priority == JETSAM_PRIORITY_IDLE) {
+                       /*
+                        * Daemons with 'inactive' limits will go through the dirty tracking codepath.
+                        * This path deals with apps that may have 'inactive' limits, e.g. WebContent processes.
+                        * If this is the legacy aging policy we explicitly need to apply those limits. If it
+                        * is any other aging policy, then we don't need to worry because all processes
+                        * will go through the aging bands and then the demotion thread will take care to
+                        * move them into the IDLE band and apply the required limits.
+                        */
+                       memorystatus_update_priority_locked(p, priority, head_insert, TRUE);
+               }
+       }
+
+       memorystatus_update_priority_locked(p, priority, head_insert, FALSE);
+
+       proc_list_unlock();
+       ret = 0;
+
+out:
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);
+
+       return ret;
 }
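
For illustration, here is a minimal user-space sketch (not kernel code) of the
limit normalization performed in memorystatus_update() above: a requested limit
<= 0 collapses to the -1 sentinel, which the ledger write treats as "use the
system-wide task limit", and that limit is always fatal. The struct and
function names below are illustrative only.

#include <stdbool.h>
#include <stdio.h>

struct memlimit {
	int  mb;       /* limit in MB; -1 => system-wide task limit */
	bool is_fatal; /* kill the process on exceeding the limit? */
};

/* Mirrors the memlimit_active/memlimit_inactive normalization above. */
static struct memlimit
normalize_limit(int requested_mb, bool requested_fatal)
{
	struct memlimit l = { requested_mb, requested_fatal };
	if (l.mb <= 0) {           /* zero allowed for launchd compatibility */
		l.mb = -1;         /* enforce the system-wide task limit... */
		l.is_fatal = true; /* ...which is always fatal */
	}
	return l;
}

int
main(void)
{
	struct memlimit active = normalize_limit(0, false);
	struct memlimit inactive = normalize_limit(350, false);
	printf("Active(%dMB %s), Inactive(%dMB %s)\n",
	    active.mb, active.is_fatal ? "F" : "NF",
	    inactive.mb, inactive.is_fatal ? "F" : "NF");
	return 0;
}

Run as written, this would print "Active(-1MB F), Inactive(350MB NF)": the
zero active limit was promoted to the fatal system-wide limit, while the
explicit inactive limit kept its non-fatal flag.
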
 
-/*
- * Description:
- *     Moves a process from one jetsam bucket to another,
- *     which changes the LRU position of the process.
- *
- *     Monitors transition between buckets and if necessary
- *     will update cached memory limits accordingly.
- *
- *     skip_demotion_check:
- *     - if the 'jetsam aging policy' is NOT 'legacy':
- *             When this flag is TRUE, it means we are going
- *             to age the ripe processes out of the aging bands and into the
- *             IDLE band and apply their inactive memory limits.
- *
- *     - if the 'jetsam aging policy' is 'legacy':
- *             When this flag is TRUE, it might mean the above aging mechanism
- *             OR
- *             It might be that we have a process that has used up its 'idle deferral'
- *             stay that is given to it once per lifetime. And in this case, the process
- *             won't be going through any aging codepaths. But we still need to apply
- *             the right inactive limits and so we explicitly set this to TRUE if the
- *             new priority for the process is the IDLE band.
- */
-void
-memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check)
+int
+memorystatus_remove(proc_t p)
 {
-       memstat_bucket_t *old_bucket, *new_bucket;
+       int ret;
+       memstat_bucket_t *bucket;
+       boolean_t       reschedule = FALSE;
 
-       assert(priority < MEMSTAT_BUCKET_COUNT);
+       MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", p->p_pid);
 
-       /* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
-       if ((p->p_listflag & P_LIST_EXITED) != 0) {
-               return;
+       /*
+        * Check if this proc is locked (because we're performing a freeze).
+        * If so, we fail and instruct the caller to try again later.
+        */
+       if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
+               return EAGAIN;
        }
 
-       MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting %s(%d) to priority %d, inserting at %s\n",
-           (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, head_insert ? "head" : "tail");
+       assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
 
-       DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority);
+       bucket = &memstat_bucket[p->p_memstat_effectivepriority];
 
-#if DEVELOPMENT || DEBUG
-       if (priority == JETSAM_PRIORITY_IDLE && /* if the process is on its way into the IDLE band */
-           skip_demotion_check == FALSE &&     /* and it isn't via the path that will set the INACTIVE memlimits */
-           (p->p_memstat_dirty & P_DIRTY_TRACK) && /* and it has 'DIRTY' tracking enabled */
-           ((p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) || /* and we notice that the current limit isn't the right value (inactive) */
-           ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) ? (!(p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)) : (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)))) { /* OR type (fatal vs non-fatal) */
-               panic("memorystatus_update_priority_locked: on %s with 0x%x, prio: %d and %d\n", p->p_name, p->p_memstat_state, priority, p->p_memstat_memlimit); /* then we must catch this */
+       if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
+               assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs);
+               reschedule = TRUE;
+       } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
+               assert(bucket->count == memorystatus_scheduled_idle_demotions_apps);
+               reschedule = TRUE;
        }
-#endif /* DEVELOPMENT || DEBUG */
 
-       old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
+       /*
+        * Record idle delta
+        */
 
-       if (skip_demotion_check == FALSE) {
-               if (isSysProc(p)) {
-                       /*
-                        * For system processes, the memorystatus_dirty_* routines take care of adding/removing
-                        * the processes from the aging bands and balancing the demotion counts.
-                        * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute.
-                        */
+       if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
+               uint64_t now = mach_absolute_time();
+               if (now > p->p_memstat_idle_start) {
+                       p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
+               }
+       }
 
-                       if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
-                               /*
-                                * 2 types of processes can use the non-standard elevated inactive band:
-                                * - Frozen processes that always land in memorystatus_freeze_jetsam_band
-                                * OR
-                                * - processes that specifically opt-in to the elevated inactive support e.g. docked processes.
-                                */
-#if CONFIG_FREEZE
-                               if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
-                                       if (priority <= memorystatus_freeze_jetsam_band) {
-                                               priority = memorystatus_freeze_jetsam_band;
-                                       }
-                               } else
-#endif /* CONFIG_FREEZE */
-                               {
-                                       if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
-                                               priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
-                                       }
-                               }
-                               assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
-                       }
-               } else if (isApp(p)) {
-                       /*
-                        * Check to see if the application is being lowered in jetsam priority. If so, and:
-                        * - it has an 'elevated inactive jetsam band' attribute, then put it in the appropriate band.
-                        * - it is a normal application, then let it age in the aging band if that policy is in effect.
-                        */
+       TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
+       bucket->count--;
+       if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
+               bucket->relaunch_high_count--;
+       }
 
-                       if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
-#if CONFIG_FREEZE
-                               if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
-                                       if (priority <= memorystatus_freeze_jetsam_band) {
-                                               priority = memorystatus_freeze_jetsam_band;
-                                       }
-                               } else
-#endif /* CONFIG_FREEZE */
-                               {
-                                       if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
-                                               priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
-                                       }
-                               }
-                       } else {
-                               if (applications_aging_band) {
-                                       if (p->p_memstat_effectivepriority == applications_aging_band) {
-                                               assert(old_bucket->count == (memorystatus_scheduled_idle_demotions_apps + 1));
-                                       }
+       memorystatus_list_count--;
 
-                                       if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && (priority <= applications_aging_band)) {
-                                               assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
-                                               priority = applications_aging_band;
-                                               memorystatus_schedule_idle_demotion_locked(p, TRUE);
-                                       }
-                               }
-                       }
-               }
+       /* If awaiting demotion to the idle band, clean up */
+       if (reschedule) {
+               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+               memorystatus_reschedule_idle_demotion_locked();
        }
 
-       if ((system_procs_aging_band && (priority == system_procs_aging_band)) || (applications_aging_band && (priority == applications_aging_band))) {
-               assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
+       memorystatus_check_levels_locked();
+
+#if CONFIG_FREEZE
+       if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
+               if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
+                       p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
+                       memorystatus_refreeze_eligible_count--;
+               }
+
+               memorystatus_frozen_count--;
+               memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
+               p->p_memstat_freeze_sharedanon_pages = 0;
        }
 
-       TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
-       old_bucket->count--;
+       if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
+               memorystatus_suspended_count--;
+       }
+#endif
 
-       new_bucket = &memstat_bucket[priority];
-       if (head_insert) {
-               TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
+       if (p) {
+               ret = 0;
        } else {
-               TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
+               ret = ESRCH;
        }
-       new_bucket->count++;
-
-       if (memorystatus_highwater_enabled) {
-               boolean_t is_fatal;
-               boolean_t use_active;
 
-               /*
-                * If cached limit data is updated, then the limits
-                * will be enforced by writing to the ledgers.
-                */
-               boolean_t ledger_update_needed = TRUE;
+       return ret;
+}
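
A compact sketch of the idle-residency bookkeeping above: the kernel stamps
p_memstat_idle_start on entry to the IDLE band and computes
p_memstat_idle_delta on the way out (or at removal, as here). In this sketch,
clock_gettime(CLOCK_MONOTONIC) stands in for mach_absolute_time(), and the
struct and field names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t
now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

struct tracked {
	uint64_t idle_start; /* stamped on entry to the idle band */
	uint64_t idle_delta; /* last completed idle residency */
};

static void
enter_idle(struct tracked *t)
{
	t->idle_start = now_ns();
}

static void
leave_idle(struct tracked *t)
{
	uint64_t now = now_ns();
	if (now > t->idle_start) { /* same guard as the kernel code above */
		t->idle_delta = now - t->idle_start;
	}
}

int
main(void)
{
	struct tracked t = { 0, 0 };
	enter_idle(&t);
	leave_idle(&t);
	printf("idle residency: %llu ns\n", (unsigned long long)t.idle_delta);
	return 0;
}
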
 
-               /*
-                * Here, we must update the cached memory limit if the task
-                * is transitioning between:
-                *      active <--> inactive
-                *      FG     <-->       BG
-                * but:
-                *      dirty  <-->    clean   is ignored
-                *
-                * We bypass non-idle processes that have opted into dirty tracking because
-                * a move between buckets does not imply a transition between the
-                * dirty <--> clean state.
-                */
+/*
+ * Validate dirty tracking flags with process state.
+ *
+ * Return:
+ *     0     on success
+ *      non-0 on failure
+ *
+ * The proc_list_lock is held by the caller.
+ */
 
-               if (p->p_memstat_dirty & P_DIRTY_TRACK) {
-                       if (skip_demotion_check == TRUE && priority == JETSAM_PRIORITY_IDLE) {
-                               CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
-                               use_active = FALSE;
-                       } else {
-                               ledger_update_needed = FALSE;
-                       }
-               } else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) {
-                       /*
-                        *      inactive --> active
-                        *      BG       -->     FG
-                        *      assign active state
-                        */
-                       CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
-                       use_active = TRUE;
-               } else if ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
-                       /*
-                        *      active --> inactive
-                        *      FG     -->       BG
-                        *      assign inactive state
-                        */
-                       CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
-                       use_active = FALSE;
-               } else {
-                       /*
-                        * The transition between jetsam priority buckets apparently did
-                        * not affect active/inactive state.
-                        * This is not unusual... especially during startup when
-                        * processes are getting established in their respective bands.
-                        */
-                       ledger_update_needed = FALSE;
-               }
+static int
+memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol)
+{
+       /* See that the process isn't marked for termination */
+       if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
+               return EBUSY;
+       }
 
-               /*
-                * Enforce the new limits by writing to the ledger
-                */
-               if (ledger_update_needed) {
-                       task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
+       /* Idle exit requires that process be tracked */
+       if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
+           !(pcontrol & PROC_DIRTY_TRACK)) {
+               return EINVAL;
+       }
 
-                       MEMORYSTATUS_DEBUG(3, "memorystatus_update_priority_locked: new limit on pid %d (%dMB %s) priority old --> new (%d --> %d) dirty?=0x%x %s\n",
-                           p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
-                           (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, priority, p->p_memstat_dirty,
-                           (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
-               }
+       /* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
+       if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
+           !(pcontrol & PROC_DIRTY_TRACK)) {
+               return EINVAL;
        }
 
-       /*
-        * Record idle start or idle delta.
-        */
-       if (p->p_memstat_effectivepriority == priority) {
-               /*
-                * This process is not transitioning between
-                * jetsam priority buckets.  Do nothing.
-                */
-       } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
-               uint64_t now;
-               /*
-                * Transitioning out of the idle priority bucket.
-                * Record idle delta.
-                */
-               assert(p->p_memstat_idle_start != 0);
-               now = mach_absolute_time();
-               if (now > p->p_memstat_idle_start) {
-                       p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
-               }
+       /* Only one type of DEFER behavior is allowed. */
+       if ((pcontrol & PROC_DIRTY_DEFER) &&
+           (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) {
+               return EINVAL;
+       }
+
+       /* Deferral is only relevant if idle exit is specified */
+       if (((pcontrol & PROC_DIRTY_DEFER) ||
+           (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) &&
+           !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
+               return EINVAL;
+       }
+
+       return 0;
+}
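
The pcontrol flag rules in memorystatus_validate_track_flags() can be exercised
in user space; the following sketch re-implements the same flag checks (the
process-state check is omitted) with illustrative flag values that are
stand-ins, not the kernel's definitions:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define DIRTY_TRACK              0x01 /* enable dirty tracking */
#define DIRTY_ALLOW_IDLE_EXIT    0x02 /* may be killed when idle */
#define DIRTY_DEFER              0x04 /* one-shot idle deferral */
#define DIRTY_LAUNCH_IN_PROGRESS 0x08 /* launch-in-progress tracking */
#define DIRTY_DEFER_ALWAYS       0x10 /* deferral on every clean pass */

static int
validate_track_flags(uint32_t pcontrol)
{
	/* Idle exit requires that the process be tracked. */
	if ((pcontrol & DIRTY_ALLOW_IDLE_EXIT) && !(pcontrol & DIRTY_TRACK)) {
		return EINVAL;
	}
	/* 'Launch in progress' requires dirty tracking too. */
	if ((pcontrol & DIRTY_LAUNCH_IN_PROGRESS) && !(pcontrol & DIRTY_TRACK)) {
		return EINVAL;
	}
	/* Only one type of DEFER behavior is allowed. */
	if ((pcontrol & DIRTY_DEFER) && (pcontrol & DIRTY_DEFER_ALWAYS)) {
		return EINVAL;
	}
	/* Deferral is only relevant if idle exit is specified. */
	if ((pcontrol & (DIRTY_DEFER | DIRTY_DEFER_ALWAYS)) &&
	    !(pcontrol & DIRTY_ALLOW_IDLE_EXIT)) {
		return EINVAL;
	}
	return 0;
}

int
main(void)
{
	/* 0: track + idle exit + one-shot deferral is a valid combination. */
	printf("%d\n", validate_track_flags(
	    DIRTY_TRACK | DIRTY_ALLOW_IDLE_EXIT | DIRTY_DEFER));
	/* EINVAL: deferral without idle exit. */
	printf("%d\n", validate_track_flags(DIRTY_TRACK | DIRTY_DEFER));
	/* EINVAL: both DEFER flavors at once. */
	printf("%d\n", validate_track_flags(DIRTY_TRACK |
	    DIRTY_ALLOW_IDLE_EXIT | DIRTY_DEFER | DIRTY_DEFER_ALWAYS));
	return 0;
}
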
+
+static void
+memorystatus_update_idle_priority_locked(proc_t p)
+{
+       int32_t priority;
+
+       MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty);
+
+       assert(isSysProc(p));
+
+       if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
+               priority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE;
+       } else {
+               priority = p->p_memstat_requestedpriority;
+       }
 
+       if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
                /*
-                * About to become active and so memory footprint could change.
-                * So mark it eligible for freeze-considerations next time around.
+                * This process has a jetsam priority managed by an assertion.
+                * Policy is to choose the max priority.
                 */
-               if (p->p_memstat_state & P_MEMSTAT_FREEZE_IGNORE) {
-                       p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE;
+               if (p->p_memstat_assertionpriority > priority) {
+                       os_log(OS_LOG_DEFAULT, "memorystatus: assertion priority %d overrides priority %d for %s:%d\n",
+                           p->p_memstat_assertionpriority, priority,
+                           (*p->p_name ? p->p_name : "unknown"), p->p_pid);
+                       priority = p->p_memstat_assertionpriority;
                }
-       } else if (priority == JETSAM_PRIORITY_IDLE) {
-               /*
-                * Transitioning into the idle priority bucket.
-                * Record idle start.
-                */
-               p->p_memstat_idle_start = mach_absolute_time();
        }
 
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), p->p_pid, priority, p->p_memstat_effectivepriority, 0, 0);
+       if (priority != p->p_memstat_effectivepriority) {
+               if ((jetsam_aging_policy == kJetsamAgingPolicyLegacy) &&
+                   (priority == JETSAM_PRIORITY_IDLE)) {
+                       /*
+                        * This process is on its way into the IDLE band. The system is
+                        * using 'legacy' jetsam aging policy. That means, this process
+                        * has already used up its idle-deferral aging time that is given
+                        * once per its lifetime. So we need to set the INACTIVE limits
+                        * explicitly because it won't be going through the demotion paths
+                        * that take care to apply the limits appropriately.
+                        */
 
-       p->p_memstat_effectivepriority = priority;
+                       if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
+                               /*
+                                * This process has the 'elevated inactive jetsam band' attribute.
+                                * So, there will be no trip to IDLE after all.
+                                * Instead, we pin the process in the elevated band,
+                                * where its ACTIVE limits will apply.
+                                */
 
-#if CONFIG_SECLUDED_MEMORY
-       if (secluded_for_apps &&
-           task_could_use_secluded_mem(p->task)) {
-               task_set_can_use_secluded_mem(
-                       p->task,
-                       (priority >= JETSAM_PRIORITY_FOREGROUND));
-       }
-#endif /* CONFIG_SECLUDED_MEMORY */
+                               priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
+                       }
 
-       memorystatus_check_levels_locked();
+                       memorystatus_update_priority_locked(p, priority, false, true);
+               } else {
+                       memorystatus_update_priority_locked(p, priority, false, false);
+               }
+       }
 }
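
The band selection above reduces to: a clean, idle-exit-enabled process parks
in the aging band (while aging is in progress) or in IDLE, anything else holds
its requested priority, and a held assertion can only raise the result. A
stand-alone sketch of that policy follows; the constants and names are
illustrative, not the kernel's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BAND_IDLE 0
#define BAND_AGED 3 /* stand-in for system_procs_aging_band */

static int32_t
select_idle_band(bool idle_exit_enabled, bool is_dirty, bool aging_in_progress,
    int32_t requested, bool has_assertion, int32_t assertion)
{
	int32_t band;

	if (idle_exit_enabled && !is_dirty) {
		/* Clean and idle-exit enabled: park low. */
		band = aging_in_progress ? BAND_AGED : BAND_IDLE;
	} else {
		/* Dirty (or not opted in): hold the requested priority. */
		band = requested;
	}
	if (has_assertion && assertion > band) {
		band = assertion; /* assertion-managed priority wins by max */
	}
	return band;
}

int
main(void)
{
	/* Clean idle-exit daemon, no aging, with a priority-10 assertion. */
	printf("band=%d\n", select_idle_band(true, false, false, 8, true, 10));
	return 0;
}
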
 
 /*
+ * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
+ * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
+ * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the
+ * low-priority idle band when clean (and killed earlier, protecting higher-priority processes).
  *
- * Description: Update the jetsam priority and memory limit attributes for a given process.
- *
- * Parameters:
- *     p       init this process's jetsam information.
- *     priority          The jetsam priority band
- *     user_data         user specific data, unused by the kernel
- *     effective         guards against race if process's update already occurred
- *     update_memlimit   When true we know this is the init step via the posix_spawn path.
- *
- *     memlimit_active   Value in megabytes; The monitored footprint level while the
- *                       process is active.  Exceeding it may result in termination
- *                       based on its associated fatal flag.
- *
- *     memlimit_active_is_fatal  When a process is active and exceeds its memory footprint,
- *                               this describes whether or not it should be immediately fatal.
- *
- *     memlimit_inactive Value in megabytes; The monitored footprint level while the
- *                       process is inactive.  Exceeding it may result in termination
- *                       based on its associated fatal flag.
- *
- *     memlimit_inactive_is_fatal  When a process is inactive and exceeds its memory footprint,
- *                                 this describes whether or not it should be immediately fatal.
+ * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
+ * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
+ * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
+ * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
+ * band. The deferral can be cleared early by clearing the appropriate flag.
  *
- * Returns:     0      Success
- *             non-0   Failure
+ * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
+ * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
+ * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
  */
 
 int
-memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective, boolean_t update_memlimit,
-    int32_t memlimit_active, boolean_t memlimit_active_is_fatal,
-    int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal)
+memorystatus_dirty_track(proc_t p, uint32_t pcontrol)
 {
-       int ret;
-       boolean_t head_insert = false;
-
-       MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing (%s) pid %d: priority %d, user_data 0x%llx\n", (*p->p_name ? p->p_name : "unknown"), p->p_pid, priority, user_data);
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0);
+       unsigned int old_dirty;
+       boolean_t reschedule = FALSE;
+       boolean_t already_deferred = FALSE;
+       boolean_t defer_now = FALSE;
+       int ret = 0;
 
-       if (priority == -1) {
-               /* Use as shorthand for default priority */
-               priority = JETSAM_PRIORITY_DEFAULT;
-       } else if ((priority == system_procs_aging_band) || (priority == applications_aging_band)) {
-               /* Both the aging bands are reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
-               priority = JETSAM_PRIORITY_IDLE;
-       } else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
-               /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
-               priority = JETSAM_PRIORITY_IDLE;
-               head_insert = TRUE;
-       } else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
-               /* Sanity check */
-               ret = EINVAL;
-               goto out;
-       }
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK),
+           p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0);
 
        proc_list_lock();
 
-       assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
-
-       if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
-               ret = EALREADY;
-               proc_list_unlock();
-               MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid);
-               goto out;
-       }
-
-       if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) {
+       if ((p->p_listflag & P_LIST_EXITED) != 0) {
                /*
-                * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
+                * Process is on its way out.
                 */
                ret = EBUSY;
-               proc_list_unlock();
-               goto out;
+               goto exit;
        }
 
-       p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
-       p->p_memstat_userdata = user_data;
-       p->p_memstat_requestedpriority = priority;
-
-       if (update_memlimit) {
-               boolean_t is_fatal;
-               boolean_t use_active;
+       if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
+               ret = EPERM;
+               goto exit;
+       }
 
-               /*
-                * Posix_spawn'd processes come through this path to instantiate ledger limits.
-                * Forked processes do not come through this path, so no ledger limits exist.
-                * (That's why forked processes can consume unlimited memory.)
-                */
+       if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
+               /* error  */
+               goto exit;
+       }
 
-               MEMORYSTATUS_DEBUG(3, "memorystatus_update(enter): pid %d, priority %d, dirty=0x%x, Active(%dMB %s), Inactive(%dMB, %s)\n",
-                   p->p_pid, priority, p->p_memstat_dirty,
-                   memlimit_active, (memlimit_active_is_fatal ? "F " : "NF"),
-                   memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF"));
+       old_dirty = p->p_memstat_dirty;
 
-               if (memlimit_active <= 0) {
-                       /*
-                        * This process will have a system_wide task limit when active.
-                        * System_wide task limit is always fatal.
-                        * It's quite common to see the non-fatal flag passed in here.
-                        * That's not an error; we just ignore it.
-                        */
+       /* These bits are cumulative, as per <rdar://problem/11159924> */
+       if (pcontrol & PROC_DIRTY_TRACK) {
+               p->p_memstat_dirty |= P_DIRTY_TRACK;
+       }
 
-                       /*
-                        * For backward compatibility with some unexplained launchd behavior,
-                        * we allow a zero-sized limit.  But we still enforce the system_wide limit
-                        * when written to the ledgers.
-                        */
+       if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
+               p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
+       }
 
-                       if (memlimit_active < 0) {
-                               memlimit_active = -1;  /* enforces system_wide task limit */
-                       }
-                       memlimit_active_is_fatal = TRUE;
-               }
+       if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
+               p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
+       }
 
-               if (memlimit_inactive <= 0) {
-                       /*
-                        * This process will have a system_wide task limit when inactive.
-                        * System_wide task limit is always fatal.
-                        */
+       if (old_dirty & P_DIRTY_AGING_IN_PROGRESS) {
+               already_deferred = TRUE;
+       }
 
-                       memlimit_inactive = -1;
-                       memlimit_inactive_is_fatal = TRUE;
+
+       /* This can be set and cleared exactly once. */
+       if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
+               if ((pcontrol & (PROC_DIRTY_DEFER)) &&
+                   !(old_dirty & P_DIRTY_DEFER)) {
+                       p->p_memstat_dirty |= P_DIRTY_DEFER;
                }
 
-               /*
-                * Initialize the active limit variants for this process.
-                */
-               SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);
+               if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) &&
+                   !(old_dirty & P_DIRTY_DEFER_ALWAYS)) {
+                       p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS;
+               }
 
-               /*
-                * Initialize the inactive limit variants for this process.
-                */
-               SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);
+               defer_now = TRUE;
+       }
 
-               /*
-                * Initialize the cached limits for the target process.
-                * When the target process is dirty-tracked, it's typically
-                * in a clean state.  Non-dirty-tracked processes are
-                * typically active (Foreground or above).
-                * But just in case, we don't make assumptions...
-                */
+       MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for pid %d\n",
+           ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
+           defer_now ? "Y" : "N",
+           p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
+           p->p_pid);
 
-               if (proc_jetsam_state_is_active_locked(p) == TRUE) {
-                       CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
-                       use_active = TRUE;
-               } else {
-                       CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
-                       use_active = FALSE;
-               }
+       /* Kick off or invalidate the idle exit deferment if there's a state transition. */
+       if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
+               if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
+                       if (defer_now && !already_deferred) {
+                               /*
+                                * Request to defer a clean process that's idle-exit enabled
+                                * and not already in the jetsam deferred band. Most likely a
+                                * new launch.
+                                */
+                               memorystatus_schedule_idle_demotion_locked(p, TRUE);
+                               reschedule = TRUE;
+                       } else if (!defer_now) {
+                               /*
+                                * The process isn't asking for the 'aging' facility.
+                                * Could be that it is:
+                                */
+
+                               if (already_deferred) {
+                                       /*
+                                        * already in the aging bands. Traditionally,
+                                        * some processes have tried to use this to
+                                        * opt out of the 'aging' facility.
+                                        */
+
+                                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+                               } else {
+                                       /*
+                                        * agnostic to the 'aging' facility. In that case,
+                                        * we'll go ahead and opt it in because this is likely
+                                        * a new launch (clean process, dirty tracking enabled)
+                                        */
+
+                                       memorystatus_schedule_idle_demotion_locked(p, TRUE);
+                               }
 
+                               reschedule = TRUE;
+                       }
+               }
+       } else {
                /*
-                * Enforce the cached limit by writing to the ledger.
+                * be removed from the deferred band. The question is: do we reset the
+                * deferred state or not?
+                * deferred state or not?
+                *
+                * This could be a legal request like:
+                * - this process had opted into the 'aging' band
+                * - but it's now dirty and requests to opt out.
+                * In this case, we remove the process from the band and reset its
+                * state too. It'll opt back in properly when needed.
+                *
+                * OR, this request could be a user-space bug. E.g.:
+                * - this process had opted into the 'aging' band when clean
+                * - and, then issues another request to again put it into the band except
+                *   this time the process is dirty.
+                * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of
+                * the deferred band with its state intact, so our request below is a no-op.
+                * But we do it here anyway, for coverage.
+                *
+                * memorystatus_update_idle_priority_locked()
+                * single-mindedly treats a dirty process as "cannot be in the aging band".
                 */
-               if (memorystatus_highwater_enabled) {
-                       /* apply now */
-                       task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal);
 
-                       MEMORYSTATUS_DEBUG(3, "memorystatus_update: init: limit on pid %d (%dMB %s) targeting priority(%d) dirty?=0x%x %s\n",
-                           p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
-                           (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), priority, p->p_memstat_dirty,
-                           (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
+               if (!defer_now && already_deferred) {
+                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+                       reschedule = TRUE;
+               } else {
+                       boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
+
+                       memorystatus_invalidate_idle_demotion_locked(p, reset_state);
+                       reschedule = TRUE;
                }
        }
 
-       /*
-        * We can't add the process to the aging bands' buckets here,
-        * but we could be removing it from those buckets.
-        * Check and take appropriate steps if so.
-        */
+       memorystatus_update_idle_priority_locked(p);
 
-       if (isProcessInAgingBands(p)) {
-               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
-               memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
-       } else {
-               if (jetsam_aging_policy == kJetsamAgingPolicyLegacy && priority == JETSAM_PRIORITY_IDLE) {
-                       /*
-                        * Daemons with 'inactive' limits will go through the dirty tracking codepath.
-                        * This path deals with apps that may have 'inactive' limits, e.g. WebContent processes.
-                        * If this is the legacy aging policy we explicitly need to apply those limits. If it
-                        * is any other aging policy, then we don't need to worry because all processes
-                        * will go through the aging bands and then the demotion thread will take care to
-                        * move them into the IDLE band and apply the required limits.
-                        */
-                       memorystatus_update_priority_locked(p, priority, head_insert, TRUE);
-               }
+       if (reschedule) {
+               memorystatus_reschedule_idle_demotion_locked();
        }
 
-       memorystatus_update_priority_locked(p, priority, head_insert, FALSE);
-
-       proc_list_unlock();
        ret = 0;
 
-out:
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);
+exit:
+       proc_list_unlock();
 
        return ret;
 }
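
The defer/dirty interplay that memorystatus_dirty_track() and the comments
above describe is essentially a small state machine: deferral runs only while
the process is clean, going dirty cancels the demotion timer, and the timer
firing drops the process to the idle band. A minimal sketch follows, with
illustrative names and the timer reduced to a flag.

#include <stdbool.h>
#include <stdio.h>

enum band { BAND_IDLE, BAND_DEFERRED, BAND_REQUESTED };

struct dirty_state {
	enum band band;
	bool dirty;
	bool timer_armed; /* is an idle-demotion deadline pending? */
};

static void
mark_dirty(struct dirty_state *s)
{
	s->dirty = true;
	s->timer_armed = false;   /* going dirty cancels the deferral timer */
	s->band = BAND_REQUESTED; /* promoted to its requested band while busy */
}

static void
mark_clean_deferred(struct dirty_state *s)
{
	s->dirty = false;
	s->timer_armed = true;    /* clean + defer: arm the demotion timer */
	s->band = BAND_DEFERRED;  /* guarded stay slightly above IDLE */
}

static void
demotion_timer_fired(struct dirty_state *s)
{
	if (s->timer_armed && !s->dirty) {
		s->timer_armed = false;
		s->band = BAND_IDLE; /* guard expired: standard idle band */
	}
}

int
main(void)
{
	struct dirty_state s = { BAND_IDLE, false, false };

	mark_dirty(&s);           /* process goes busy */
	mark_clean_deferred(&s);  /* goes clean with deferral requested */
	demotion_timer_fired(&s); /* deadline passes: demoted to IDLE */
	printf("band=%d dirty=%d\n", (int)s.band, (int)s.dirty);
	return 0;
}
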
 
 int
-memorystatus_remove(proc_t p, boolean_t locked)
+memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol)
 {
        int ret;
-       memstat_bucket_t *bucket;
-       boolean_t       reschedule = FALSE;
-
-       MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", p->p_pid);
+       boolean_t kill = false;
+       boolean_t reschedule = FALSE;
+       boolean_t was_dirty = FALSE;
+       boolean_t now_dirty = FALSE;
 
-       if (!locked) {
-               proc_list_lock();
-       }
+       MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty);
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0);
 
-       assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
+       proc_list_lock();
 
-       bucket = &memstat_bucket[p->p_memstat_effectivepriority];
+       if ((p->p_listflag & P_LIST_EXITED) != 0) {
+               /*
+                * Process is on its way out.
+                */
+               ret = EBUSY;
+               goto exit;
+       }
 
-       if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
-               assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs);
-               reschedule = TRUE;
-       } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
-               assert(bucket->count == memorystatus_scheduled_idle_demotions_apps);
-               reschedule = TRUE;
+       if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
+               ret = EPERM;
+               goto exit;
        }
 
-       /*
-        * Record idle delta
-        */
+       if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
+               was_dirty = TRUE;
+       }
 
-       if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
-               uint64_t now = mach_absolute_time();
-               if (now > p->p_memstat_idle_start) {
-                       p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
+       if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
+               /* Dirty tracking not enabled */
+               ret = EINVAL;
+       } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
+               /*
+                * Process is set to be terminated and we're attempting to mark it dirty.
+                * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
+                */
+               ret = EBUSY;
+       } else {
+               int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
+               if (pcontrol && !(p->p_memstat_dirty & flag)) {
+                       /* Mark the process as having been dirtied at some point */
+                       p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
+                       memorystatus_dirty_count++;
+                       ret = 0;
+               } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
+                       if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
+                               /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
+                               p->p_memstat_dirty |= P_DIRTY_TERMINATED;
+                               kill = true;
+                       } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
+                               /* Kill previously terminated processes if set clean */
+                               kill = true;
+                       }
+                       p->p_memstat_dirty &= ~flag;
+                       memorystatus_dirty_count--;
+                       ret = 0;
+               } else {
+                       /* Already set */
+                       ret = EALREADY;
                }
        }
 
-       TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
-       bucket->count--;
-
-       memorystatus_list_count--;
-
-       /* If awaiting demotion to the idle band, clean up */
-       if (reschedule) {
-               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
-               memorystatus_reschedule_idle_demotion_locked();
-       }
-
-       memorystatus_check_levels_locked();
-
-#if CONFIG_FREEZE
-       if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
-               if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
-                       p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
-                       memorystatus_refreeze_eligible_count--;
-               }
-
-               memorystatus_frozen_count--;
-               memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
-               p->p_memstat_freeze_sharedanon_pages = 0;
-       }
-
-       if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
-               memorystatus_suspended_count--;
-       }
-#endif
-
-       if (!locked) {
-               proc_list_unlock();
-       }
-
-       if (p) {
-               ret = 0;
-       } else {
-               ret = ESRCH;
-       }
-
-       return ret;
-}
-
-/*
- * Validate dirty tracking flags with process state.
- *
- * Return:
- *     0     on success
- *      non-0 on failure
- *
- * The proc_list_lock is held by the caller.
- */
-
-static int
-memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol)
-{
-       /* See that the process isn't marked for termination */
-       if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
-               return EBUSY;
-       }
-
-       /* Idle exit requires that process be tracked */
-       if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
-           !(pcontrol & PROC_DIRTY_TRACK)) {
-               return EINVAL;
-       }
-
-       /* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
-       if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
-           !(pcontrol & PROC_DIRTY_TRACK)) {
-               return EINVAL;
-       }
-
-       /* Only one type of DEFER behavior is allowed. */
-       if ((pcontrol & PROC_DIRTY_DEFER) &&
-           (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) {
-               return EINVAL;
-       }
-
-       /* Deferral is only relevant if idle exit is specified */
-       if (((pcontrol & PROC_DIRTY_DEFER) ||
-           (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) &&
-           !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
-               return EINVAL;
-       }
-
-       return 0;
-}
-
-static void
-memorystatus_update_idle_priority_locked(proc_t p)
-{
-       int32_t priority;
-
-       MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty);
-
-       assert(isSysProc(p));
-
-       if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
-               priority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE;
-       } else {
-               priority = p->p_memstat_requestedpriority;
-       }
-
-       if (priority != p->p_memstat_effectivepriority) {
-               if ((jetsam_aging_policy == kJetsamAgingPolicyLegacy) &&
-                   (priority == JETSAM_PRIORITY_IDLE)) {
-                       /*
-                        * This process is on its way into the IDLE band. The system is
-                        * using 'legacy' jetsam aging policy. That means, this process
-                        * has already used up its idle-deferral aging time that is given
-                        * once per its lifetime. So we need to set the INACTIVE limits
-                        * explicitly because it won't be going through the demotion paths
-                        * that take care to apply the limits appropriately.
-                        */
-
-                       if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
-                               /*
-                                * This process has the 'elevated inactive jetsam band' attribute.
-                                * So, there will be no trip to IDLE after all.
-                                * Instead, we pin the process in the elevated band,
-                                * where its ACTIVE limits will apply.
-                                */
-
-                               priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
-                       }
-
-                       memorystatus_update_priority_locked(p, priority, false, true);
-               } else {
-                       memorystatus_update_priority_locked(p, priority, false, false);
-               }
-       }
-}
-
-/*
- * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
- * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
- * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the
- * low-priority idle band when clean (and killed earlier, protecting higher-priority processes).
- *
- * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
- * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
- * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
- * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
- * band. The deferral can be cleared early by clearing the appropriate flag.
- *
- * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
- * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
- * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
- */
-
-int
-memorystatus_dirty_track(proc_t p, uint32_t pcontrol)
-{
-       unsigned int old_dirty;
-       boolean_t reschedule = FALSE;
-       boolean_t already_deferred = FALSE;
-       boolean_t defer_now = FALSE;
-       int ret = 0;
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK),
-           p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0);
-
-       proc_list_lock();
-
-       if ((p->p_listflag & P_LIST_EXITED) != 0) {
-               /*
-                * Process is on its way out.
-                */
-               ret = EBUSY;
-               goto exit;
-       }
-
-       if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
-               ret = EPERM;
-               goto exit;
-       }
-
-       if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
-               /* error  */
-               goto exit;
-       }
-
-       old_dirty = p->p_memstat_dirty;
-
-       /* These bits are cumulative, as per <rdar://problem/11159924> */
-       if (pcontrol & PROC_DIRTY_TRACK) {
-               p->p_memstat_dirty |= P_DIRTY_TRACK;
-       }
-
-       if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
-               p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
-       }
-
-       if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
-               p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
-       }
-
-       if (old_dirty & P_DIRTY_AGING_IN_PROGRESS) {
-               already_deferred = TRUE;
-       }
-
-
-       /* This can be set and cleared exactly once. */
-       if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
-               if ((pcontrol & (PROC_DIRTY_DEFER)) &&
-                   !(old_dirty & P_DIRTY_DEFER)) {
-                       p->p_memstat_dirty |= P_DIRTY_DEFER;
-               }
-
-               if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) &&
-                   !(old_dirty & P_DIRTY_DEFER_ALWAYS)) {
-                       p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS;
-               }
-
-               defer_now = TRUE;
-       }
-
-       MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for pid %d\n",
-           ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
-           defer_now ? "Y" : "N",
-           p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
-           p->p_pid);
-
-       /* Kick off or invalidate the idle exit deferment if there's a state transition. */
-       if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
-               if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
-                       if (defer_now && !already_deferred) {
-                               /*
-                                * Request to defer a clean process that's idle-exit enabled
-                                * and not already in the jetsam deferred band. Most likely a
-                                * new launch.
-                                */
-                               memorystatus_schedule_idle_demotion_locked(p, TRUE);
-                               reschedule = TRUE;
-                       } else if (!defer_now) {
-                               /*
-                                * The process isn't asking for the 'aging' facility.
-                                * Could be that it is:
-                                */
-
-                               if (already_deferred) {
-                                       /*
-                                        * already in the aging bands. Traditionally,
-                                        * some processes have tried to use this to
-                                        * opt out of the 'aging' facility.
-                                        */
-
-                                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
-                               } else {
-                                       /*
-                                        * agnostic to the 'aging' facility. In that case,
-                                        * we'll go ahead and opt it in because this is likely
-                                        * a new launch (clean process, dirty tracking enabled)
-                                        */
-
-                                       memorystatus_schedule_idle_demotion_locked(p, TRUE);
-                               }
-
-                               reschedule = TRUE;
-                       }
-               }
-       } else {
-               /*
-                * We are trying to operate on a dirty process. Dirty processes have to
-                * be removed from the deferred band. The question is do we reset the
-                * deferred state or not?
-                *
-                * This could be a legal request like:
-                * - this process had opted into the 'aging' band
-                * - but it's now dirty and requests to opt out.
-                * In this case, we remove the process from the band and reset its
-                * state too. It'll opt back in properly when needed.
-                *
-                * OR, this request could be a user-space bug. E.g.:
-                * - this process had opted into the 'aging' band when clean
-                * - and, then issues another request to again put it into the band except
-                *   this time the process is dirty.
-                * The dirty transition in memorystatus_dirty_set() will have already pulled
-                * the process out of the deferred band with its state intact, so the request
-                * below is a no-op. But we do it here anyway for coverage.
-                *
-                * memorystatus_update_idle_priority_locked()
-                * single-mindedly treats a dirty process as "cannot be in the aging band".
-                */
-
-               if (!defer_now && already_deferred) {
-                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
-                       reschedule = TRUE;
-               } else {
-                       boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
-
-                       memorystatus_invalidate_idle_demotion_locked(p, reset_state);
-                       reschedule = TRUE;
-               }
-       }
-
-       memorystatus_update_idle_priority_locked(p);
-
-       if (reschedule) {
-               memorystatus_reschedule_idle_demotion_locked();
-       }
-
-       ret = 0;
-
-exit:
-       proc_list_unlock();
-
-       return ret;
-}
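-
-/*
- * Illustrative sketch, not part of the original file: how a daemon might
- * drive the dirty-tracking opt-in handled above. This assumes the private
- * libproc wrappers (proc_track_dirty/proc_set_dirty) and the PROC_DIRTY_*
- * control bits from <sys/proc_info.h>; the exact signatures are assumptions.
- */
-#if 0   /* userspace-only example, compiled out */
-#include <unistd.h>
-#include <stdbool.h>
-#include <libproc.h>
-
-static void
-dirty_tracking_example(void)
-{
-       /* Opt in: track dirty state, allow idle exit, and request the
-        * one-shot launch deferral (P_DIRTY_DEFER in the kernel). */
-       (void)proc_track_dirty(getpid(),
-           PROC_DIRTY_TRACK | PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER);
-
-       (void)proc_set_dirty(getpid(), true);   /* busy: not idle-exitable */
-       /* ... do work ... */
-       (void)proc_set_dirty(getpid(), false);  /* quiesced: clean, aging begins */
-}
-#endif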
-
-int
-memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol)
-{
-       int ret;
-       boolean_t kill = false;
-       boolean_t reschedule = FALSE;
-       boolean_t was_dirty = FALSE;
-       boolean_t now_dirty = FALSE;
-
-       MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty);
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0);
-
-       proc_list_lock();
-
-       if ((p->p_listflag & P_LIST_EXITED) != 0) {
-               /*
-                * Process is on its way out.
-                */
-               ret = EBUSY;
-               goto exit;
-       }
-
-       if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
-               ret = EPERM;
-               goto exit;
-       }
-
-       if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
-               was_dirty = TRUE;
-       }
-
-       if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
-               /* Dirty tracking not enabled */
-               ret = EINVAL;
-       } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
-               /*
-                * Process is set to be terminated and we're attempting to mark it dirty.
-                * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
-                */
-               ret = EBUSY;
-       } else {
-               int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
-               if (pcontrol && !(p->p_memstat_dirty & flag)) {
-                       /* Mark the process as having been dirtied at some point */
-                       p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
-                       memorystatus_dirty_count++;
-                       ret = 0;
-               } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
-                       if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
-                               /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
-                               p->p_memstat_dirty |= P_DIRTY_TERMINATED;
-                               kill = true;
-                       } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
-                               /* Kill previously terminated processes if set clean */
-                               kill = true;
-                       }
-                       p->p_memstat_dirty &= ~flag;
-                       memorystatus_dirty_count--;
-                       ret = 0;
-               } else {
-                       /* Already set */
-                       ret = EALREADY;
-               }
-       }
-
-       if (ret != 0) {
-               goto exit;
-       }
-
-       if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
-               now_dirty = TRUE;
-       }
-
-       if ((was_dirty == TRUE && now_dirty == FALSE) ||
-           (was_dirty == FALSE && now_dirty == TRUE)) {
-               /* Manage idle exit deferral, if applied */
-               if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
-                       /*
-                        * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
-                        * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
-                        * P_DIRTY_DEFER: one-time protection window given at launch
-                        * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode.
-                        *
-                        * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
-                        * in that band on its way to IDLE.
-                        */
-
-                       if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
-                               /*
-                                * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE"
-                                *
-                                * The process will move from its aging band to its higher requested
-                                * jetsam band.
-                                */
-                               boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
-
-                               memorystatus_invalidate_idle_demotion_locked(p, reset_state);
-                               reschedule = TRUE;
-                       } else {
-                               /*
-                                * Process is back from "dirty" to "clean".
-                                */
-
-                               if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) {
-                                       if (((p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) == FALSE) &&
-                                           (mach_absolute_time() >= p->p_memstat_idledeadline)) {
-                                               /*
-                                                * The process hasn't enrolled in the "always defer after dirty"
-                                                * mode and its deadline has expired. It currently
-                                                * does not reside in any of the aging buckets.
-                                                *
-                                                * It's on its way to the JETSAM_PRIORITY_IDLE
-                                                * bucket via memorystatus_update_idle_priority_locked()
-                                                * below.
-                                                *
-                                                * So all we need to do is reset all the state on the
-                                                * process that's related to the aging bucket i.e.
-                                                * the AGING_IN_PROGRESS flag and the timer deadline.
-                                                */
-
-                                               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
-                                               reschedule = TRUE;
-                                       } else {
-                                               /*
-                                                * The process is enrolled in "always stop in deferral band
-                                                * after dirty", OR it still has some protection window left,
-                                                * so we just re-arm the timer without modifying any state on
-                                                * the process, provided it still wants into that band.
-                                                */
-
-                                               if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
-                                                       memorystatus_schedule_idle_demotion_locked(p, TRUE);
-                                                       reschedule = TRUE;
-                                               } else if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) {
-                                                       memorystatus_schedule_idle_demotion_locked(p, FALSE);
-                                                       reschedule = TRUE;
-                                               }
-                                       }
-                               } else {
-                                       memorystatus_schedule_idle_demotion_locked(p, TRUE);
-                                       reschedule = TRUE;
-                               }
-                       }
-               }
-
-               memorystatus_update_idle_priority_locked(p);
-
-               if (memorystatus_highwater_enabled) {
-                       boolean_t ledger_update_needed = TRUE;
-                       boolean_t use_active;
-                       boolean_t is_fatal;
-                       /*
-                        * We are in this path because this process transitioned between
-                        * dirty <--> clean state.  Update the cached memory limits.
-                        */
-
-                       if (proc_jetsam_state_is_active_locked(p) == TRUE) {
-                               /*
-                                * process is pinned in elevated band
-                                * or
-                                * process is dirty
-                                */
-                               CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
-                               use_active = TRUE;
-                               ledger_update_needed = TRUE;
-                       } else {
-                               /*
-                                * process is clean... but if it has opted into pressured-exit,
-                                * we don't apply the INACTIVE limit until the process has aged
-                                * out and is entering the IDLE band.
-                                * See memorystatus_update_priority_locked() for that.
-                                */
-
-                               if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
-                                       ledger_update_needed = FALSE;
-                               } else {
-                                       CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
-                                       use_active = FALSE;
-                                       ledger_update_needed = TRUE;
-                               }
-                       }
-
-                       /*
-                        * Enforce the new limits by writing to the ledger.
-                        *
-                        * This is a hot path, and holding the proc_list_lock while writing to the ledgers
-                        * (where the task lock is taken) is bad.  So we temporarily drop the proc_list_lock.
-                        * We aren't traversing the jetsam bucket list here, so we should be safe.
-                        * See rdar://21394491.
-                        */
-
-                       if (ledger_update_needed && proc_ref_locked(p) == p) {
-                               int ledger_limit;
-                               if (p->p_memstat_memlimit > 0) {
-                                       ledger_limit = p->p_memstat_memlimit;
-                               } else {
-                                       ledger_limit = -1;
-                               }
-                               proc_list_unlock();
-                               task_set_phys_footprint_limit_internal(p->task, ledger_limit, NULL, use_active, is_fatal);
-                               proc_list_lock();
-                               proc_rele_locked(p);
-
-                               MEMORYSTATUS_DEBUG(3, "memorystatus_dirty_set: new limit on pid %d (%dMB %s) priority(%d) dirty?=0x%x %s\n",
-                                   p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
-                                   (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
-                                   (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
-                       }
-               }
-
-               /* If the deferral state changed, reschedule the demotion timer */
-               if (reschedule) {
-                       memorystatus_reschedule_idle_demotion_locked();
-               }
-       }
-
-       if (kill) {
-               if (proc_ref_locked(p) == p) {
-                       proc_list_unlock();
-                       psignal(p, SIGKILL);
-                       proc_list_lock();
-                       proc_rele_locked(p);
-               }
-       }
-
-exit:
-       proc_list_unlock();
-
-       return ret;
-}
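-
-/*
- * A condensed restatement of the dirty -> clean policy above (sketch only,
- * compiled out): whether the aging-band demotion timer is re-armed on a
- * clean transition depends on the aging policy and the deferral flags.
- */
-#if 0
-static boolean_t
-rearm_aging_timer_on_clean(boolean_t legacy_policy, uint32_t dirty_flags,
-    uint64_t now, uint64_t idle_deadline)
-{
-       if (!legacy_policy) {
-               return TRUE;    /* always stop over in the aging band */
-       }
-       if (dirty_flags & P_DIRTY_DEFER_ALWAYS) {
-               return TRUE;    /* fresh protection window on every transition */
-       }
-       if (now < idle_deadline) {
-               /* window still open: keep it iff the process is in the band */
-               return (dirty_flags & P_DIRTY_AGING_IN_PROGRESS) != 0;
-       }
-       return FALSE;           /* window expired: state gets reset instead */
-}
-#endif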
-
-int
-memorystatus_dirty_clear(proc_t p, uint32_t pcontrol)
-{
-       int ret = 0;
-
-       MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty);
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0);
-
-       proc_list_lock();
-
-       if ((p->p_listflag & P_LIST_EXITED) != 0) {
-               /*
-                * Process is on its way out.
-                */
-               ret = EBUSY;
-               goto exit;
-       }
-
-       if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
-               ret = EPERM;
-               goto exit;
-       }
-
-       if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
-               /* Dirty tracking not enabled */
-               ret = EINVAL;
-               goto exit;
-       }
-
-       if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) {
-               ret = EINVAL;
-               goto exit;
-       }
-
-       if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
-               p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
-       }
-
-       /* This can be set and cleared exactly once. */
-       if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
-               if (p->p_memstat_dirty & P_DIRTY_DEFER) {
-                       p->p_memstat_dirty &= ~(P_DIRTY_DEFER);
-               }
-
-               if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
-                       p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS);
-               }
-
-               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
-               memorystatus_update_idle_priority_locked(p);
-               memorystatus_reschedule_idle_demotion_locked();
-       }
-
-       ret = 0;
-exit:
-       proc_list_unlock();
-
-       return ret;
-}
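-
-/*
- * Sketch of the corresponding userspace call for the clear path above,
- * assuming the private libproc wrapper proc_clear_dirty() (signature
- * assumed): e.g. dropping the launch flag and the one-shot deferral once
- * launch completes.
- */
-#if 0   /* userspace-only example, compiled out */
-#include <unistd.h>
-#include <libproc.h>
-
-static void
-launch_complete_example(void)
-{
-       (void)proc_clear_dirty(getpid(),
-           PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER);
-}
-#endif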
-
-int
-memorystatus_dirty_get(proc_t p)
-{
-       int ret = 0;
-
-       proc_list_lock();
-
-       if (p->p_memstat_dirty & P_DIRTY_TRACK) {
-               ret |= PROC_DIRTY_TRACKED;
-               if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
-                       ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
-               }
-               if (p->p_memstat_dirty & P_DIRTY) {
-                       ret |= PROC_DIRTY_IS_DIRTY;
-               }
-               if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
-                       ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
-               }
-       }
-
-       proc_list_unlock();
-
-       return ret;
-}
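-
-/*
- * Sketch (compiled out) of reading these bits back from userspace,
- * assuming the private libproc wrapper proc_get_dirty(); the PROC_DIRTY_*
- * result flags mirror the values assembled above.
- */
-#if 0   /* userspace-only example */
-#include <stdbool.h>
-#include <libproc.h>
-
-static bool
-is_idle_exitable(pid_t pid)
-{
-       uint32_t flags = 0;
-
-       if (proc_get_dirty(pid, &flags) != 0) {
-               return false;
-       }
-       return (flags & PROC_DIRTY_TRACKED) &&
-              (flags & PROC_DIRTY_ALLOWS_IDLE_EXIT) &&
-              !(flags & PROC_DIRTY_IS_DIRTY);
-}
-#endif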
-
-int
-memorystatus_on_terminate(proc_t p)
-{
-       int sig;
-
-       proc_list_lock();
-
-       p->p_memstat_dirty |= P_DIRTY_TERMINATED;
-
-       if ((p->p_memstat_dirty & (P_DIRTY_TRACK | P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) {
-               /* Clean; mark as terminated and issue SIGKILL */
-               sig = SIGKILL;
-       } else {
-               /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
-               sig = SIGTERM;
-       }
-
-       proc_list_unlock();
-
-       return sig;
-}
-
-void
-memorystatus_on_suspend(proc_t p)
-{
-#if CONFIG_FREEZE
-       uint32_t pages;
-       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
-#endif
-       proc_list_lock();
-#if CONFIG_FREEZE
-       memorystatus_suspended_count++;
-#endif
-       p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
-       proc_list_unlock();
-}
-
-void
-memorystatus_on_resume(proc_t p)
-{
-#if CONFIG_FREEZE
-       boolean_t frozen;
-       pid_t pid;
-#endif
-
-       proc_list_lock();
-
-#if CONFIG_FREEZE
-       frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
-       if (frozen) {
-               /*
-                * Now that we don't _thaw_ a process completely,
-                * resuming it (and having some on-demand swapins)
-                * shouldn't preclude it from being counted as frozen.
-                *
-                * memorystatus_frozen_count--;
-                *
-                * We preserve the P_MEMSTAT_FROZEN state since the process
-                * could have state on disk and so deserves some protection
-                * in the jetsam bands.
-                */
-               if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) {
-                       p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE;
-                       memorystatus_refreeze_eligible_count++;
-               }
-               p->p_memstat_thaw_count++;
-
-               memorystatus_thaw_count++;
-       }
-
-       memorystatus_suspended_count--;
-
-       pid = p->p_pid;
-#endif
-
-       /*
-        * P_MEMSTAT_FROZEN will remain unchanged. This used to be:
-        * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
-        */
-       p->p_memstat_state &= ~P_MEMSTAT_SUSPENDED;
-
-       proc_list_unlock();
-
-#if CONFIG_FREEZE
-       if (frozen) {
-               memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
-               memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
-       }
-#endif
-}
-
-void
-memorystatus_on_inactivity(proc_t p)
-{
-#pragma unused(p)
-#if CONFIG_FREEZE
-       /* Wake the freeze thread */
-       thread_wakeup((event_t)&memorystatus_freeze_wakeup);
-#endif
-}
-
-/*
- * The proc_list_lock is held by the caller.
- */
-static uint32_t
-memorystatus_build_state(proc_t p)
-{
-       uint32_t snapshot_state = 0;
-
-       /* General */
-       if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
-               snapshot_state |= kMemorystatusSuspended;
-       }
-       if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
-               snapshot_state |= kMemorystatusFrozen;
-       }
-       if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
-               snapshot_state |= kMemorystatusWasThawed;
-       }
-
-       /* Tracking */
-       if (p->p_memstat_dirty & P_DIRTY_TRACK) {
-               snapshot_state |= kMemorystatusTracked;
-       }
-       if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
-               snapshot_state |= kMemorystatusSupportsIdleExit;
-       }
-       if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
-               snapshot_state |= kMemorystatusDirty;
-       }
-
-       return snapshot_state;
-}
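-
-/*
- * Sketch (compiled out) of how a userspace consumer of the jetsam snapshot
- * might decode the bits assembled above; the kMemorystatus* constants come
- * from <sys/kern_memorystatus.h>.
- */
-#if 0
-#include <stdio.h>
-
-static void
-print_snapshot_state(uint32_t state)
-{
-       printf("%s%s%s%s%s%s\n",
-           (state & kMemorystatusSuspended)        ? "suspended "  : "",
-           (state & kMemorystatusFrozen)           ? "frozen "     : "",
-           (state & kMemorystatusWasThawed)        ? "was-thawed " : "",
-           (state & kMemorystatusTracked)          ? "tracked "    : "",
-           (state & kMemorystatusSupportsIdleExit) ? "idle-exit "  : "",
-           (state & kMemorystatusDirty)            ? "dirty"       : "");
-}
-#endif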
-
-static boolean_t
-kill_idle_exit_proc(void)
-{
-       proc_t p, victim_p = PROC_NULL;
-       uint64_t current_time;
-       boolean_t killed = FALSE;
-       unsigned int i = 0;
-       os_reason_t jetsam_reason = OS_REASON_NULL;
-
-       /* Pick next idle exit victim. */
-       current_time = mach_absolute_time();
-
-       jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT);
-       if (jetsam_reason == OS_REASON_NULL) {
-               printf("kill_idle_exit_proc: failed to allocate jetsam reason\n");
-       }
-
-       proc_list_lock();
-
-       p = memorystatus_get_first_proc_locked(&i, FALSE);
-       while (p) {
-               /* No need to look beyond the idle band */
-               if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
-                       break;
-               }
-
-               if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
-                       if (current_time >= p->p_memstat_idledeadline) {
-                               p->p_memstat_dirty |= P_DIRTY_TERMINATED;
-                               victim_p = proc_ref_locked(p);
-                               break;
-                       }
-               }
-
-               p = memorystatus_get_next_proc_locked(&i, p, FALSE);
-       }
-
-       proc_list_unlock();
-
-       if (victim_p) {
-               printf("memorystatus: killing_idle_process pid %d [%s]\n", victim_p->p_pid, (*victim_p->p_name ? victim_p->p_name : "unknown"));
-               killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason);
-               proc_rele(victim_p);
-       } else {
-               os_reason_free(jetsam_reason);
-       }
-
-       return killed;
-}
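-
-/*
- * The victim predicate used in the walk above, restated as a sketch
- * (compiled out): idle band, opted into idle exit, currently clean,
- * not already terminated, and past its deferral deadline.
- */
-#if 0
-static boolean_t
-is_idle_exit_victim(proc_t p, uint64_t now)
-{
-       return p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE &&
-           (p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY |
-           P_DIRTY_TERMINATED)) == P_DIRTY_ALLOW_IDLE_EXIT &&
-           now >= p->p_memstat_idledeadline;
-}
-#endif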
-
-static void
-memorystatus_thread_wake(void)
-{
-       int thr_id = 0;
-       int active_thr = atomic_load(&active_jetsam_threads);
-
-       /* Wakeup all the jetsam threads */
-       for (thr_id = 0; thr_id < active_thr; thr_id++) {
-               thread_wakeup((event_t)&jetsam_threads[thr_id].memorystatus_wakeup);
-       }
-}
-
-#if CONFIG_JETSAM
-
-static void
-memorystatus_thread_pool_max()
-{
-       /* Increase the jetsam thread pool to max_jetsam_threads */
-       int max_threads = max_jetsam_threads;
-       printf("Expanding memorystatus pool to %d!\n", max_threads);
-       atomic_store(&active_jetsam_threads, max_threads);
-}
-
-static void
-memorystatus_thread_pool_default()
-{
-       /* Restore the jetsam thread pool to a single thread */
-       printf("Reverting memorystatus pool back to 1\n");
-       atomic_store(&active_jetsam_threads, 1);
-}
-
-#endif /* CONFIG_JETSAM */
-
-extern void vm_pressure_response(void);
-
-static int
-memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
-{
-       struct jetsam_thread_state *jetsam_thread = jetsam_current_thread();
-
-       if (interval_ms) {
-               assert_wait_timeout(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT, interval_ms, NSEC_PER_MSEC);
-       } else {
-               assert_wait(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT);
-       }
-
-       return thread_block(continuation);
-}
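-
-/*
- * The continuation idiom used above, as a sketch (compiled out): the thread
- * parks on its per-thread wakeup channel, and on wakeup execution restarts
- * at the continuation function from the top rather than returning from
- * thread_block().
- */
-#if 0
-static void
-example_worker(void *param __unused, wait_result_t wr __unused)
-{
-       /* ... one pass of work ... */
-
-       /* Park; a later thread_wakeup() on this thread's channel resumes
-        * execution at example_worker(), not at the line after this call. */
-       (void)memorystatus_thread_block(0, example_worker);
-}
-#endif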
-
-static boolean_t
-memorystatus_avail_pages_below_pressure(void)
-{
-#if CONFIG_EMBEDDED
-/*
- * Instead of CONFIG_EMBEDDED for these *avail_pages* routines, we should
- * key off of the system having dynamic swap support. With full swap support,
- * the system shouldn't really need to worry about various page thresholds.
- */
-       return memorystatus_available_pages <= memorystatus_available_pages_pressure;
-#else /* CONFIG_EMBEDDED */
-       return FALSE;
-#endif /* CONFIG_EMBEDDED */
-}
-
-static boolean_t
-memorystatus_avail_pages_below_critical(void)
-{
-#if CONFIG_EMBEDDED
-       return memorystatus_available_pages <= memorystatus_available_pages_critical;
-#else /* CONFIG_EMBEDDED */
-       return FALSE;
-#endif /* CONFIG_EMBEDDED */
-}
-
-static boolean_t
-memorystatus_post_snapshot(int32_t priority, uint32_t cause)
-{
-#if CONFIG_EMBEDDED
-#pragma unused(cause)
-       /*
-        * Don't generate logs for steady-state idle-exit kills,
-        * unless it is overridden for debug or by the device
-        * tree.
-        */
-
-       return (priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot;
-
-#else /* CONFIG_EMBEDDED */
-       /*
-        * Don't generate logs for steady-state idle-exit kills,
-        * unless:
-        * - it is overridden for debug or by the device tree,
-        * OR
-        * - the kill cause is important, i.e. not kMemorystatusKilledIdleExit.
-        */
-
-       boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause));
-       return (priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot || snapshot_eligible_kill_cause;
-#endif /* CONFIG_EMBEDDED */
-}
-
-static boolean_t
-memorystatus_action_needed(void)
-{
-#if CONFIG_EMBEDDED
-       return is_reason_thrashing(kill_under_pressure_cause) ||
-              is_reason_zone_map_exhaustion(kill_under_pressure_cause) ||
-              memorystatus_available_pages <= memorystatus_available_pages_pressure;
-#else /* CONFIG_EMBEDDED */
-       return is_reason_thrashing(kill_under_pressure_cause) ||
-              is_reason_zone_map_exhaustion(kill_under_pressure_cause);
-#endif /* CONFIG_EMBEDDED */
-}
-
-#if CONFIG_FREEZE
-extern void             vm_swap_consider_defragmenting(int);
-
-/*
- * This routine will _jetsam_ all frozen processes
- * and reclaim the swap space immediately.
- *
- * So freeze has to be DISABLED when we call this routine.
- */
-
-void
-memorystatus_disable_freeze(void)
-{
-       memstat_bucket_t *bucket;
-       int bucket_count = 0, retries = 0;
-       boolean_t retval = FALSE, killed = FALSE;
-       uint32_t errors = 0, errors_over_prev_iteration = 0;
-       os_reason_t jetsam_reason = 0;
-       unsigned int band = 0;
-       proc_t p = PROC_NULL, next_p = PROC_NULL;
-
-       assert(memorystatus_freeze_enabled == FALSE);
-
-       jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE);
-       if (jetsam_reason == OS_REASON_NULL) {
-               printf("memorystatus_disable_freeze: failed to allocate jetsam reason\n");
-       }
-
-       /*
-        * Let's relocate all frozen processes into band 8. Demoted frozen processes
-        * are sitting in band 0 currently and it's possible to have a frozen process
-        * in the FG band being actively used. We don't reset its frozen state when
-        * it is resumed because it has state on disk.
-        *
-        * We choose to do this relocation rather than implement a new 'kill frozen'
-        * process function for these reasons:
-        * - duplication of code: too many kill functions exist and we need to rework them better.
-        * - disk-space-shortage kills are rare
-        * - not having the 'real' jetsam band at the time of this frozen kill won't preclude us
-        *   from answering any important questions re. jetsam policy/effectiveness.
-        *
-        * This is essentially what memorystatus_update_inactive_jetsam_priority_band() does while
-        * avoiding the application of memory limits.
-        */
-
-again:
-       proc_list_lock();
-
-       band = JETSAM_PRIORITY_IDLE;
-       p = PROC_NULL;
-       next_p = PROC_NULL;
-
-       next_p = memorystatus_get_first_proc_locked(&band, TRUE);
-       while (next_p) {
-               p = next_p;
-               next_p = memorystatus_get_next_proc_locked(&band, p, TRUE);
-
-               if (p->p_memstat_effectivepriority > JETSAM_PRIORITY_FOREGROUND) {
-                       break;
-               }
-
-               if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) {
-                       continue;
-               }
-
-               if (p->p_memstat_state & P_MEMSTAT_ERROR) {
-                       p->p_memstat_state &= ~P_MEMSTAT_ERROR;
-               }
-
-               if (p->p_memstat_effectivepriority == memorystatus_freeze_jetsam_band) {
-                       continue;
-               }
-
-               /*
-                * We explicitly add this flag here so the process looks like a normal
-                * frozen process i.e. P_MEMSTAT_FROZEN and P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND.
-                * We don't bother with assigning the 'active' memory
-                * limits at this point because we are going to be killing it soon below.
-                */
-               p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
-               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
-
-               memorystatus_update_priority_locked(p, memorystatus_freeze_jetsam_band, FALSE, TRUE);
-       }
-
-       bucket = &memstat_bucket[memorystatus_freeze_jetsam_band];
-       bucket_count = bucket->count;
-       proc_list_unlock();
-
-       /*
-        * The bucket count is already stale at this point. But we don't expect
-        * freezing to continue since we have already disabled the freeze functionality.
-        * However, an existing freeze might be in progress. So we might miss that process
-        * in the first go-around. We hope to catch it in the next.
-        */
-
-       errors_over_prev_iteration = 0;
-       while (bucket_count) {
-               bucket_count--;
-
-               /*
-                * memorystatus_kill_elevated_process() drops a reference,
-                * so take another one so we can continue to use this exit reason
-                * even after it returns.
-                */
-
-               os_reason_ref(jetsam_reason);
-               retval = memorystatus_kill_elevated_process(
-                       kMemorystatusKilledDiskSpaceShortage,
-                       jetsam_reason,
-                       memorystatus_freeze_jetsam_band,
-                       0, /* the iteration of aggressive jetsam; ignored here */
-                       &errors);
-
-               if (errors > 0) {
-                       printf("memorystatus_disable_freeze: memorystatus_kill_elevated_process returned %d error(s)\n", errors);
-                       errors_over_prev_iteration += errors;
-                       errors = 0;
-               }
-
-               if (retval == 0) {
-                       /*
-                        * No frozen processes left to kill.
-                        */
-                       break;
-               }
-
-               killed = TRUE;
-       }
-
-       proc_list_lock();
-
-       if (memorystatus_frozen_count) {
-               /*
-                * A frozen process snuck in, so go
-                * back around to kill it. That
-                * process may have been resumed and
-                * put into the FG band too. So we
-                * have to do the relocation again.
-                */
-               assert(memorystatus_freeze_enabled == FALSE);
-
-               retries++;
-               if (retries < 3) {
-                       proc_list_unlock();
-                       goto again;
-               }
-#if DEVELOPMENT || DEBUG
-               panic("memorystatus_disable_freeze: Failed to kill all frozen processes, memorystatus_frozen_count = %d, errors = %d",
-                   memorystatus_frozen_count, errors_over_prev_iteration);
-#endif /* DEVELOPMENT || DEBUG */
-       }
-       proc_list_unlock();
-
-       os_reason_free(jetsam_reason);
-
-       if (killed) {
-               vm_swap_consider_defragmenting(VM_SWAP_FLAGS_FORCE_DEFRAG | VM_SWAP_FLAGS_FORCE_RECLAIM);
-
-               proc_list_lock();
-               size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
-                   sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
-               uint64_t timestamp_now = mach_absolute_time();
-               memorystatus_jetsam_snapshot->notification_time = timestamp_now;
-               memorystatus_jetsam_snapshot->js_gencount++;
-               if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
-                   timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
-                       proc_list_unlock();
-                       int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
-                       if (!ret) {
-                               proc_list_lock();
-                               memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
-                               proc_list_unlock();
-                       }
-               } else {
-                       proc_list_unlock();
-               }
-       }
-
-       return;
-}
-#endif /* CONFIG_FREEZE */
-
-static boolean_t
-memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, boolean_t *post_snapshot, __unused boolean_t *is_critical)
-{
-       boolean_t purged = FALSE;
-       boolean_t killed = memorystatus_kill_hiwat_proc(errors, &purged);
-
-       if (killed) {
-               *hwm_kill = *hwm_kill + 1;
-               *post_snapshot = TRUE;
-               return TRUE;
-       } else {
-               if (purged == FALSE) {
-                       /* couldn't purge and couldn't kill */
-                       memorystatus_hwm_candidates = FALSE;
-               }
-       }
-
-#if CONFIG_JETSAM
-       /* No highwater processes to kill. Continue or stop for now? */
-       if (!is_reason_thrashing(kill_under_pressure_cause) &&
-           !is_reason_zone_map_exhaustion(kill_under_pressure_cause) &&
-           (memorystatus_available_pages > memorystatus_available_pages_critical)) {
-               /*
-                * We are _not_ out of pressure but we are above the critical threshold and there's:
-                * - no compressor thrashing
-                * - enough zone memory
-                * - no more HWM processes left.
-                * For now, don't kill any other processes.
-                */
-
-               if (*hwm_kill == 0) {
-                       memorystatus_thread_wasted_wakeup++;
-               }
-
-               *is_critical = FALSE;
-
-               return TRUE;
-       }
-#endif /* CONFIG_JETSAM */
-
-       return FALSE;
-}
-
-static boolean_t
-memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_idle_kills, boolean_t *corpse_list_purged, boolean_t *post_snapshot)
-{
-       if (memorystatus_jld_enabled == TRUE) {
-               boolean_t killed;
-               uint32_t errors = 0;
-
-               /* Jetsam Loop Detection - locals */
-               memstat_bucket_t *bucket;
-               int             jld_bucket_count = 0;
-               struct timeval  jld_now_tstamp = {0, 0};
-               uint64_t        jld_now_msecs = 0;
-               int             elevated_bucket_count = 0;
-
-               /* Jetsam Loop Detection - statics */
-               static uint64_t  jld_timestamp_msecs = 0;
-               static int       jld_idle_kill_candidates = 0;  /* Number of available processes in band 0,1 at start */
-               static int       jld_eval_aggressive_count = 0;         /* Bumps the max priority in aggressive loop */
-               static int32_t   jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
-               /*
-                * Jetsam Loop Detection: attempt to detect
-                * rapid daemon relaunches in the lower bands.
-                */
-
-               microuptime(&jld_now_tstamp);
-
-               /*
-                * Ignore usecs in this calculation.
-                * msecs granularity is close enough.
-                */
-               jld_now_msecs = (jld_now_tstamp.tv_sec * 1000);
-
-               proc_list_lock();
-               switch (jetsam_aging_policy) {
-               case kJetsamAgingPolicyLegacy:
-                       bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
-                       jld_bucket_count = bucket->count;
-                       bucket = &memstat_bucket[JETSAM_PRIORITY_AGING_BAND1];
-                       jld_bucket_count += bucket->count;
-                       break;
-               case kJetsamAgingPolicySysProcsReclaimedFirst:
-               case kJetsamAgingPolicyAppsReclaimedFirst:
-                       bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
-                       jld_bucket_count = bucket->count;
-                       bucket = &memstat_bucket[system_procs_aging_band];
-                       jld_bucket_count += bucket->count;
-                       bucket = &memstat_bucket[applications_aging_band];
-                       jld_bucket_count += bucket->count;
-                       break;
-               case kJetsamAgingPolicyNone:
-               default:
-                       bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
-                       jld_bucket_count = bucket->count;
-                       break;
-               }
-
-               bucket = &memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE];
-               elevated_bucket_count = bucket->count;
-
-               proc_list_unlock();
-
-               /*
-                * memorystatus_jld_eval_period_msecs is a tunable
-                * memorystatus_jld_eval_aggressive_count is a tunable
-                * memorystatus_jld_eval_aggressive_priority_band_max is a tunable
-                */
-               if ((jld_bucket_count == 0) ||
-                   (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) {
-                       /*
-                        * Refresh evaluation parameters
-                        */
-                       jld_timestamp_msecs      = jld_now_msecs;
-                       jld_idle_kill_candidates = jld_bucket_count;
-                       *jld_idle_kills          = 0;
-                       jld_eval_aggressive_count = 0;
-                       jld_priority_band_max   = JETSAM_PRIORITY_UI_SUPPORT;
-               }
-
-               if (*jld_idle_kills > jld_idle_kill_candidates) {
-                       jld_eval_aggressive_count++;
-
-#if DEVELOPMENT || DEBUG
-                       printf("memorystatus: aggressive%d: beginning of window: %lld ms, timestamp now: %lld ms\n",
-                           jld_eval_aggressive_count,
-                           jld_timestamp_msecs,
-                           jld_now_msecs);
-                       printf("memorystatus: aggressive%d: idle candidates: %d, idle kills: %d\n",
-                           jld_eval_aggressive_count,
-                           jld_idle_kill_candidates,
-                           *jld_idle_kills);
-#endif /* DEVELOPMENT || DEBUG */
-
-                       if ((jld_eval_aggressive_count == memorystatus_jld_eval_aggressive_count) &&
-                           (total_corpses_count() > 0) && (*corpse_list_purged == FALSE)) {
-                               /*
-                                * If we reach this aggressive cycle, corpses might be causing memory pressure.
-                                * So, in an effort to avoid jetsams in the FG band, we will attempt to purge
-                                * corpse memory prior to this final march through JETSAM_PRIORITY_UI_SUPPORT.
-                                */
-                               task_purge_all_corpses();
-                               *corpse_list_purged = TRUE;
-                       } else if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) {
-                               /*
-                                * Bump up the jetsam priority limit (e.g. the bucket index).
-                                * Enforce bucket index sanity.
-                                */
-                               if ((memorystatus_jld_eval_aggressive_priority_band_max < 0) ||
-                                   (memorystatus_jld_eval_aggressive_priority_band_max >= MEMSTAT_BUCKET_COUNT)) {
-                                       /*
-                                        * Do nothing.  Stick with the default level.
-                                        */
-                               } else {
-                                       jld_priority_band_max = memorystatus_jld_eval_aggressive_priority_band_max;
-                               }
-                       }
-
-                       /* Visit elevated processes first */
-                       while (elevated_bucket_count) {
-                               elevated_bucket_count--;
-
-                               /*
-                                * memorystatus_kill_elevated_process() drops a reference,
-                                * so take another one so we can continue to use this exit reason
-                                * even after it returns.
-                                */
-
-                               os_reason_ref(jetsam_reason);
-                               killed = memorystatus_kill_elevated_process(
-                                       cause,
-                                       jetsam_reason,
-                                       JETSAM_PRIORITY_ELEVATED_INACTIVE,
-                                       jld_eval_aggressive_count,
-                                       &errors);
-
-                               if (killed) {
-                                       *post_snapshot = TRUE;
-                                       if (memorystatus_avail_pages_below_pressure()) {
-                                               /*
-                                                * Still under pressure.
-                                                * Find another pinned process.
-                                                */
-                                               continue;
-                                       } else {
-                                               return TRUE;
-                                       }
-                               } else {
-                                       /*
-                                        * No pinned processes left to kill.
-                                        * Abandon elevated band.
-                                        */
-                                       break;
-                               }
-                       }
-
-                       /*
-                        * memorystatus_kill_top_process_aggressive() allocates its own
-                        * jetsam_reason so the kMemorystatusKilledProcThrashing cause
-                        * is consistent throughout the aggressive march.
-                        */
-                       killed = memorystatus_kill_top_process_aggressive(
-                               kMemorystatusKilledProcThrashing,
-                               jld_eval_aggressive_count,
-                               jld_priority_band_max,
-                               &errors);
-
-                       if (killed) {
-                               /* Always generate logs after aggressive kill */
-                               *post_snapshot = TRUE;
-                               *jld_idle_kills = 0;
-                               return TRUE;
-                       }
-               }
-
-               return FALSE;
-       }
-
-       return FALSE;
-}
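-
-/*
- * The jetsam-loop-detection trigger above, condensed into a sketch
- * (compiled out): within one evaluation window, seeing more idle-band
- * kills than there were idle candidates at the window's start implies
- * rapid relaunch churn, and the aggressive path escalates.
- */
-#if 0
-static boolean_t
-jld_window_tripped(uint64_t now_msecs, uint64_t window_start_msecs,
-    uint64_t period_msecs, int idle_kills, int candidates_at_start)
-{
-       if (now_msecs > window_start_msecs + period_msecs) {
-               return FALSE;   /* window expired; counters get refreshed */
-       }
-       return idle_kills > candidates_at_start;
-}
-#endif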
-
-
-static void
-memorystatus_thread(void *param __unused, wait_result_t wr __unused)
-{
-       boolean_t post_snapshot = FALSE;
-       uint32_t errors = 0;
-       uint32_t hwm_kill = 0;
-       boolean_t sort_flag = TRUE;
-       boolean_t corpse_list_purged = FALSE;
-       int     jld_idle_kills = 0;
-       struct jetsam_thread_state *jetsam_thread = jetsam_current_thread();
-
-       if (jetsam_thread->inited == FALSE) {
-               /*
-                * It's the first time the thread has run, so just mark the thread as privileged and block.
-                * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
-                */
-
-               char name[32];
-               thread_wire(host_priv_self(), current_thread(), TRUE);
-               snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1);
-
-               if (jetsam_thread->index == 0) {
-                       if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
-                               thread_vm_bind_group_add();
-                       }
-               }
-               thread_set_thread_name(current_thread(), name);
-               jetsam_thread->inited = TRUE;
-               memorystatus_thread_block(0, memorystatus_thread);
-       }
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
-           memorystatus_available_pages, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, memorystatus_jld_eval_aggressive_count, 0);
-
-       /*
-        * Jetsam aware version.
-        *
-        * The VM pressure notification thread is working its way through clients in parallel.
-        *
-        * So, while the pressure notification thread is targeting processes in order of
-        * increasing jetsam priority, we can hopefully reduce / stop its work by killing
-        * any processes that have exceeded their highwater mark.
-        *
-        * If we run out of HWM processes and our available pages drops below the critical threshold, then,
-        * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
-        */
-       while (memorystatus_action_needed()) {
-               boolean_t killed;
-               int32_t priority;
-               uint32_t cause;
-               uint64_t jetsam_reason_code = JETSAM_REASON_INVALID;
-               os_reason_t jetsam_reason = OS_REASON_NULL;
-
-               cause = kill_under_pressure_cause;
-               switch (cause) {
-               case kMemorystatusKilledFCThrashing:
-                       jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING;
-                       break;
-               case kMemorystatusKilledVMCompressorThrashing:
-                       jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING;
-                       break;
-               case kMemorystatusKilledVMCompressorSpaceShortage:
-                       jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE;
-                       break;
-               case kMemorystatusKilledZoneMapExhaustion:
-                       jetsam_reason_code = JETSAM_REASON_ZONE_MAP_EXHAUSTION;
-                       break;
-               case kMemorystatusKilledVMPageShortage:
-               /* falls through */
-               default:
-                       jetsam_reason_code = JETSAM_REASON_MEMORY_VMPAGESHORTAGE;
-                       cause = kMemorystatusKilledVMPageShortage;
-                       break;
-               }
-
-               /* Highwater */
-               boolean_t is_critical = TRUE;
-               if (memorystatus_act_on_hiwat_processes(&errors, &hwm_kill, &post_snapshot, &is_critical)) {
-                       if (is_critical == FALSE) {
-                               /*
-                                * For now, don't kill any other processes.
-                                */
-                               break;
-                       } else {
-                               goto done;
-                       }
-               }
-
-               jetsam_reason = os_reason_create(OS_REASON_JETSAM, jetsam_reason_code);
-               if (jetsam_reason == OS_REASON_NULL) {
-                       printf("memorystatus_thread: failed to allocate jetsam reason\n");
-               }
-
-               if (memorystatus_act_aggressive(cause, jetsam_reason, &jld_idle_kills, &corpse_list_purged, &post_snapshot)) {
-                       goto done;
-               }
-
-               /*
-                * memorystatus_kill_top_process() drops a reference,
-                * so take another one so we can continue to use this exit reason
-                * even after it returns
-                */
-               os_reason_ref(jetsam_reason);
-
-               /* LRU */
-               killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, jetsam_reason, &priority, &errors);
-               sort_flag = FALSE;
-
-               if (killed) {
-                       if (memorystatus_post_snapshot(priority, cause) == TRUE) {
-                               post_snapshot = TRUE;
-                       }
-
-                       /* Jetsam Loop Detection */
-                       if (memorystatus_jld_enabled == TRUE) {
-                               if ((priority == JETSAM_PRIORITY_IDLE) || (priority == system_procs_aging_band) || (priority == applications_aging_band)) {
-                                       jld_idle_kills++;
-                               } else {
-                                       /*
-                                        * We've reached into bands beyond idle deferred.
-                                        * We make no attempt to monitor them.
-                                        */
-                               }
-                       }
-
-                       if ((priority >= JETSAM_PRIORITY_UI_SUPPORT) && (total_corpses_count() > 0) && (corpse_list_purged == FALSE)) {
-                               /*
-                                * If we have jetsammed a process in or above JETSAM_PRIORITY_UI_SUPPORT
-                                * then we attempt to relieve pressure by purging corpse memory.
-                                */
-                               task_purge_all_corpses();
-                               corpse_list_purged = TRUE;
-                       }
-                       goto done;
-               }
-
-               if (memorystatus_avail_pages_below_critical()) {
-                       /*
-                        * Still under pressure and unable to kill a process - purge corpse memory
-                        */
-                       if (total_corpses_count() > 0) {
-                               task_purge_all_corpses();
-                               corpse_list_purged = TRUE;
-                       }
-
-                       if (memorystatus_avail_pages_below_critical()) {
-                               /*
-                                * Still under pressure and unable to kill a process - panic
-                                */
-                               panic("memorystatus_jetsam_thread: no victim! available pages:%llu\n", (uint64_t)memorystatus_available_pages);
-                       }
-               }
-
-done:
-
-               /*
-                * We do not want to over-kill when thrashing has been detected.
-                * To avoid that, we reset the flag here and notify the
-                * compressor.
-                */
-               if (is_reason_thrashing(kill_under_pressure_cause)) {
-                       kill_under_pressure_cause = 0;
-#if CONFIG_JETSAM
-                       vm_thrashing_jetsam_done();
-#endif /* CONFIG_JETSAM */
-               } else if (is_reason_zone_map_exhaustion(kill_under_pressure_cause)) {
-                       kill_under_pressure_cause = 0;
-               }
-
-               os_reason_free(jetsam_reason);
-       }
-
-       kill_under_pressure_cause = 0;
-
-       if (errors) {
-               memorystatus_clear_errors();
-       }
-
-       if (post_snapshot) {
-               proc_list_lock();
-               size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
-                   sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
-               uint64_t timestamp_now = mach_absolute_time();
-               memorystatus_jetsam_snapshot->notification_time = timestamp_now;
-               memorystatus_jetsam_snapshot->js_gencount++;
-               if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
-                   timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
-                       proc_list_unlock();
-                       int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
-                       if (!ret) {
-                               proc_list_lock();
-                               memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
-                               proc_list_unlock();
-                       }
-               } else {
-                       proc_list_unlock();
-               }
-       }
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
-           memorystatus_available_pages, 0, 0, 0, 0);
-
-       memorystatus_thread_block(0, memorystatus_thread);
-}
-
-/*
- * Returns TRUE:
- *     when an idle-exitable proc was killed
- * Returns FALSE:
- *     when there are no more idle-exitable procs found
- *     when the attempt to kill an idle-exitable proc failed
- */
-boolean_t
-memorystatus_idle_exit_from_VM(void)
-{
-       /*
-        * This routine should no longer be needed since we are
-        * now using jetsam bands on all platforms and so will deal
-        * with IDLE processes within the memorystatus thread itself.
-        *
-        * But we still use it because we observed that macOS systems
-        * started heavy compression/swapping with a bunch of
-        * idle-exitable processes alive and doing nothing. We decided
-        * to kill those processes rather than start swapping earlier.
-        */
-
-       return kill_idle_exit_proc();
-}
-
-/*
- * Callback invoked when allowable physical memory footprint exceeded
- * (dirty pages + IOKit mappings)
- *
- * This is invoked for both advisory, non-fatal per-task high watermarks,
- * as well as the fatal task memory limits.
- */
-void
-memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
-{
-       os_reason_t jetsam_reason = OS_REASON_NULL;
-
-       proc_t p = current_proc();
-
-#if VM_PRESSURE_EVENTS
-       if (warning == TRUE) {
-               /*
-                * This is a warning path which implies that the current process is close, but has
-                * not yet exceeded its per-process memory limit.
-                */
-               if (memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) {
-                       /* Print warning, since it's possible that task has not registered for pressure notifications */
-                       os_log(OS_LOG_DEFAULT, "memorystatus_on_ledger_footprint_exceeded: failed to warn the current task (%d exiting, or no handler registered?).\n", p->p_pid);
-               }
-               return;
-       }
-#endif /* VM_PRESSURE_EVENTS */
-
-       if (memlimit_is_fatal) {
-               /*
-                * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
-                * has violated either the system-wide per-task memory limit OR its own task limit.
-                */
-               jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
-               if (jetsam_reason == OS_REASON_NULL) {
-                       printf("task_exceeded_footprint: failed to allocate jetsam reason\n");
-               } else if (corpse_for_fatal_memkill != 0 && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
-                       /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
-                       jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
-               }
-
-               if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) {
-                       printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
-               }
-       } else {
-               /*
-                * HWM offender exists. Done without locks or synchronization.
-                * See comment near its declaration for more details.
-                */
-               memorystatus_hwm_candidates = TRUE;
-
-#if VM_PRESSURE_EVENTS
-               /*
-                * The current process is not in the warning path.
-                * This path implies the current process has exceeded a non-fatal (soft) memory limit.
-                * Failure to send note is ignored here.
-                */
-               (void)memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */);
-
-#endif /* VM_PRESSURE_EVENTS */
-       }
-}
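
Editorial summary (derived from the code above, not part of this commit): the
three outcomes of this callback.

    /*
     * warning (soft limit approaching) -> memorystatus_warn_process(); no kill
     * memlimit_is_fatal                -> synchronous jetsam kill with reason
     *                                     JETSAM_REASON_MEMORY_PERPROCESSLIMIT
     *                                     (corpse requested unless a synchronous
     *                                     EXC_RESOURCE was already delivered)
     * soft limit exceeded (non-fatal)  -> set memorystatus_hwm_candidates and
     *                                     send the "exceeded" notification
     */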
-
-void
-memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
-{
-       proc_t p = current_proc();
-
-       /*
-        * The limit violation is logged here, but only once per process per limit.
-        * A soft memory limit is a non-fatal high-water mark.
-        * A hard memory limit is a fatal custom task limit or the system-wide per-task memory limit.
-        */
-
-       os_log_with_startup_serial(OS_LOG_DEFAULT, "EXC_RESOURCE -> %s[%d] exceeded mem limit: %s%s %d MB (%s)\n",
-           (*p->p_name ? p->p_name : "unknown"), p->p_pid, (memlimit_is_active ? "Active" : "Inactive"),
-           (memlimit_is_fatal  ? "Hard" : "Soft"), max_footprint_mb,
-           (memlimit_is_fatal  ? "fatal" : "non-fatal"));
-
-       return;
-}
-
-
-/*
- * Description:
- *     Evaluates process state to determine which limit
- *     should be applied (active vs. inactive limit).
- *
- *     Processes that have the 'elevated inactive jetsam band' attribute
- *     are first evaluated based on their current priority band.
- *     presently elevated ==> active
- *
- *     Processes that opt into dirty tracking are evaluated
- *     based on clean vs dirty state.
- *     dirty ==> active
- *     clean ==> inactive
- *
- *     Processes that do not opt into dirty tracking are
- *     evaluated based on priority level.
- *     Foreground or above ==> active
- *     Below Foreground    ==> inactive
- *
- *     Return: TRUE if active
- *             FALSE if inactive
- */
-
-static boolean_t
-proc_jetsam_state_is_active_locked(proc_t p)
-{
-       if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) &&
-           (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE)) {
-               /*
-                * process has the 'elevated inactive jetsam band' attribute
-                * and process is present in the elevated band
-                * implies active state
-                */
-               return TRUE;
-       } else if (p->p_memstat_dirty & P_DIRTY_TRACK) {
-               /*
-                * process has opted into dirty tracking
-                * active state is based on dirty vs. clean
-                */
-               if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
-                       /*
-                        * process is dirty
-                        * implies active state
-                        */
-                       return TRUE;
-               } else {
-                       /*
-                        * process is clean
-                        * implies inactive state
-                        */
-                       return FALSE;
-               }
-       } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
-               /*
-                * process is Foreground or higher
-                * implies active state
-                */
-               return TRUE;
-       } else {
-               /*
-                * process found below Foreground
-                * implies inactive state
-                */
-               return FALSE;
-       }
-}
-
-static boolean_t
-memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
-{
-       boolean_t res;
-
-       uint32_t errors = 0;
-
-       if (victim_pid == -1) {
-               /* No pid, so kill first process */
-               res = memorystatus_kill_top_process(TRUE, TRUE, cause, jetsam_reason, NULL, &errors);
-       } else {
-               res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason);
-       }
-
-       if (errors) {
-               memorystatus_clear_errors();
-       }
-
-       if (res == TRUE) {
-               /* Fire off snapshot notification */
-               proc_list_lock();
-               size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
-                   sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
-               uint64_t timestamp_now = mach_absolute_time();
-               memorystatus_jetsam_snapshot->notification_time = timestamp_now;
-               if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
-                   timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
-                       proc_list_unlock();
-                       int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
-                       if (!ret) {
-                               proc_list_lock();
-                               memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
-                               proc_list_unlock();
-                       }
-               } else {
-                       proc_list_unlock();
-               }
-       }
-
-       return res;
-}
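
Editorial note (derived from the code above, not part of this commit): the
snapshot notification is rate-limited. It is sent only when the snapshot is
non-empty and either no notification has been sent yet (last_timestamp == 0)
or more than memorystatus_jetsam_snapshot_timeout has elapsed since the
previous one.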
-
-/*
- * Jetsam a specific process.
- */
-static boolean_t
-memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
-{
-       boolean_t killed;
-       proc_t p;
-       uint64_t killtime = 0;
-       clock_sec_t     tv_sec;
-       clock_usec_t    tv_usec;
-       uint32_t        tv_msec;
-
-       /* TODO - add a victim queue and push this into the main jetsam thread */
-
-       p = proc_find(victim_pid);
-       if (!p) {
-               os_reason_free(jetsam_reason);
-               return FALSE;
-       }
-
-       proc_list_lock();
-
-       if (memorystatus_jetsam_snapshot_count == 0) {
-               memorystatus_init_jetsam_snapshot_locked(NULL, 0);
-       }
-
-       killtime = mach_absolute_time();
-       absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
-       tv_msec = tv_usec / 1000;
-
-       memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
-
-       proc_list_unlock();
-
-       os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
-           (unsigned long)tv_sec, tv_msec, victim_pid, (*p->p_name ? p->p_name : "unknown"),
-           memorystatus_kill_cause_name[cause], p->p_memstat_effectivepriority, (uint64_t)memorystatus_available_pages);
-
-       killed = memorystatus_do_kill(p, cause, jetsam_reason);
-       proc_rele(p);
-
-       return killed;
-}
-
-
-/*
- * Toggle the P_MEMSTAT_TERMINATED state.
- * Takes the proc_list_lock.
- */
-void
-proc_memstat_terminated(proc_t p, boolean_t set)
-{
-#if DEVELOPMENT || DEBUG
-       if (p) {
-               proc_list_lock();
-               if (set == TRUE) {
-                       p->p_memstat_state |= P_MEMSTAT_TERMINATED;
-               } else {
-                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
-               }
-               proc_list_unlock();
-       }
-#else
-#pragma unused(p, set)
-       /*
-        * do nothing
-        */
-#endif /* DEVELOPMENT || DEBUG */
-       return;
-}
-
-
-#if CONFIG_JETSAM
-/*
- * This is invoked when cpulimits have been exceeded while in fatal mode.
- * The jetsam_flags do not apply as those are for memory related kills.
- * We call this routine so that the offending process is killed with
- * a non-zero exit status.
- */
-void
-jetsam_on_ledger_cpulimit_exceeded(void)
-{
-       int retval = 0;
-       int jetsam_flags = 0;  /* make it obvious */
-       proc_t p = current_proc();
-       os_reason_t jetsam_reason = OS_REASON_NULL;
-
-       printf("task_exceeded_cpulimit: killing pid %d [%s]\n",
-           p->p_pid, (*p->p_name ? p->p_name : "(unknown)"));
-
-       jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT);
-       if (jetsam_reason == OS_REASON_NULL) {
-               printf("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n");
-       }
-
-       retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
-
-       if (retval) {
-               printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
-       }
-}
-
-#endif /* CONFIG_JETSAM */
-
-static void
-memorystatus_get_task_memory_region_count(task_t task, uint64_t *count)
-{
-       assert(task);
-       assert(count);
-
-       *count = get_task_memory_region_count(task);
-}
-
-
-#define MEMORYSTATUS_VM_MAP_FORK_ALLOWED     0x100000000
-#define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000
-
-#if DEVELOPMENT || DEBUG
-
-/*
- * Sysctl only used to test memorystatus_allowed_vm_map_fork() path.
- *   set a new pidwatch value
- *     or
- *   get the current pidwatch value
- *
- * The pidwatch_val starts out with a PID to watch for in the map_fork path.
- * Its value is:
- * - OR'd with MEMORYSTATUS_VM_MAP_FORK_ALLOWED if we allow the map_fork.
- * - OR'd with MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED if we disallow the map_fork.
- * - set to -1ull if the map_fork() is aborted for other reasons.
- */
-
-uint64_t memorystatus_vm_map_fork_pidwatch_val = 0;
-
-static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS {
-#pragma unused(oidp, arg1, arg2)
-
-       uint64_t new_value = 0;
-       uint64_t old_value = 0;
-       int error = 0;
-
-       /*
-        * The pid is held in the low 32 bits.
-        * The 'allowed' flags are in the upper 32 bits.
-        */
-       old_value = memorystatus_vm_map_fork_pidwatch_val;
-
-       error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL);
-
-       if (error || !req->newptr) {
-               /*
-                * No new value passed in.
-                */
-               return error;
-       }
-
-       /*
-        * A new pid was passed in via req->newptr.
-        * Ignore any attempt to set the higher order bits.
-        */
-       memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF;
-       printf("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx\n", old_value, new_value);
-
-       return error;
-}
-
-SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED,
-    0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork");
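
For illustration only (not part of this commit): a minimal userspace sketch of
exercising this test sysctl via sysctlbyname(3) on a DEVELOPMENT/DEBUG kernel;
the OID name follows from the SYSCTL_PROC declaration above, and the helper
name is hypothetical.

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/sysctl.h>

    /* Hypothetical helper: arm the pidwatch with a pid, then read back the flags. */
    static int
    pidwatch_demo(pid_t pid, uint64_t *result)
    {
        uint64_t new_value = (uint64_t)pid;   /* pid goes in the low 32 bits */
        size_t len = sizeof(*result);

        /* Set: the kernel masks off the upper 32 bits, as seen above. */
        if (sysctlbyname("kern.memorystatus_vm_map_fork_pidwatch",
            NULL, NULL, &new_value, sizeof(new_value)) != 0) {
            return -1;
        }

        /* ... trigger a corpse fork for that pid, then read the result:
         * MEMORYSTATUS_VM_MAP_FORK_ALLOWED / _NOT_ALLOWED land in bits 32+. */
        return sysctlbyname("kern.memorystatus_vm_map_fork_pidwatch",
            result, &len, NULL, 0);
    }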
-
-
-/*
- * Record if a watched process fails to qualify for a vm_map_fork().
- */
-void
-memorystatus_abort_vm_map_fork(task_t task)
-{
-       if (memorystatus_vm_map_fork_pidwatch_val != 0) {
-               proc_t p = get_bsdtask_info(task);
-               if (p != NULL && memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid) {
-                       memorystatus_vm_map_fork_pidwatch_val = -1ull;
-               }
-       }
-}
-
-static void
-set_vm_map_fork_pidwatch(task_t task, uint64_t x)
-{
-       if (memorystatus_vm_map_fork_pidwatch_val != 0) {
-               proc_t p = get_bsdtask_info(task);
-               if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid)) {
-                       memorystatus_vm_map_fork_pidwatch_val |= x;
-               }
-       }
-}
-
-#else /* DEVELOPMENT || DEBUG */
-
-
-static void
-set_vm_map_fork_pidwatch(task_t task, uint64_t x)
-{
-#pragma unused(task)
-#pragma unused(x)
-}
-
-#endif /* DEVELOPMENT || DEBUG */
-
-/*
- * Called during EXC_RESOURCE handling when a process exceeds a soft
- * memory limit.  This is the corpse fork path and here we decide if
- * vm_map_fork will be allowed when creating the corpse.
- * The task being considered is suspended.
- *
- * By default, a vm_map_fork is allowed to proceed.
- *
- * A few simple policy assumptions:
- *     On desktop platforms this path is not taken;
- *     the vm_map_fork is always allowed.
- *
- *     If the device has a zero system-wide task limit,
- *     then the vm_map_fork is allowed.
- *
- *     And if a process's memory footprint is less than
- *     or equal to a quarter of the system-wide task limit,
- *     then the vm_map_fork is allowed.  This calculation
- *     is based on the assumption that a process can
- *     consume memory up to the system-wide task limit.
- */
-boolean_t
-memorystatus_allowed_vm_map_fork(task_t task)
-{
-       boolean_t is_allowed = TRUE;   /* default */
-
-#if CONFIG_EMBEDDED
-
-       uint64_t footprint_in_bytes;
-       uint64_t max_allowed_bytes;
-
-       if (max_task_footprint_mb == 0) {
-               set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
-               return is_allowed;
-       }
-
-       footprint_in_bytes = get_task_phys_footprint(task);
-
-       /*
-        * Maximum is 1/4 of the system-wide task limit.
-        */
-       max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2;
-
-       if (footprint_in_bytes > max_allowed_bytes) {
-               printf("memorystatus disallowed vm_map_fork %lld > %lld\n", footprint_in_bytes, max_allowed_bytes);
-               set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED);
-               return !is_allowed;
-       }
-#endif /* CONFIG_EMBEDDED */
-
-       set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
-       return is_allowed;
-}
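
A worked example of the threshold above (illustrative numbers, not from the
source): with a system-wide task limit of 2048 MB, a corpse fork is allowed
only while the process footprint is at most a quarter of that.

    uint64_t max_task_footprint_mb = 2048;  /* example system-wide task limit */
    uint64_t max_allowed_bytes =
        ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2;  /* 536870912 == 512 MB */
    /* footprint_in_bytes <= 512 MB  ==>  vm_map_fork allowed */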
-
-static void
-memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
-{
-       assert(task);
-       assert(footprint);
-
-       uint64_t pages;
-
-       pages = (get_task_phys_footprint(task) / PAGE_SIZE_64);
-       assert(((uint32_t)pages) == pages);
-       *footprint = (uint32_t)pages;
-
-       if (max_footprint_lifetime) {
-               pages = (get_task_phys_footprint_lifetime_max(task) / PAGE_SIZE_64);
-               assert(((uint32_t)pages) == pages);
-               *max_footprint_lifetime = (uint32_t)pages;
-       }
-       if (purgeable_pages) {
-               pages = (get_task_purgeable_size(task) / PAGE_SIZE_64);
-               assert(((uint32_t)pages) == pages);
-               *purgeable_pages = (uint32_t)pages;
-       }
-}
-
-static void
-memorystatus_get_task_phys_footprint_page_counts(task_t task,
-    uint64_t *internal_pages, uint64_t *internal_compressed_pages,
-    uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
-    uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
-    uint64_t *iokit_mapped_pages, uint64_t *page_table_pages)
-{
-       assert(task);
-
-       if (internal_pages) {
-               *internal_pages = (get_task_internal(task) / PAGE_SIZE_64);
-       }
-
-       if (internal_compressed_pages) {
-               *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64);
-       }
-
-       if (purgeable_nonvolatile_pages) {
-               *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64);
-       }
-
-       if (purgeable_nonvolatile_compressed_pages) {
-               *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64);
-       }
-
-       if (alternate_accounting_pages) {
-               *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64);
-       }
-
-       if (alternate_accounting_compressed_pages) {
-               *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64);
-       }
-
-       if (iokit_mapped_pages) {
-               *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64);
-       }
-
-       if (page_table_pages) {
-               *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64);
-       }
-}
-
-/*
- * This routine only acts on the global jetsam event snapshot.
- * Updating the process's entry can race when the memorystatus_thread
- * has chosen to kill a process that is racing to exit on another core.
- */
-static void
-memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime)
-{
-       memorystatus_jetsam_snapshot_entry_t *entry = NULL;
-       memorystatus_jetsam_snapshot_t *snapshot    = NULL;
-       memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
-
-       unsigned int i;
-
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
-
-       if (memorystatus_jetsam_snapshot_count == 0) {
-               /*
-                * No active snapshot.
-                * Nothing to do.
-                */
-               return;
-       }
-
-       /*
-        * Sanity check as this routine should only be called
-        * from a jetsam kill path.
-        */
-       assert(kill_cause != 0 && killtime != 0);
-
-       snapshot       = memorystatus_jetsam_snapshot;
-       snapshot_list  = memorystatus_jetsam_snapshot->entries;
-
-       for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
-               if (snapshot_list[i].pid == p->p_pid) {
-                       entry = &snapshot_list[i];
-
-                       if (entry->killed || entry->jse_killtime) {
-                               /*
-                                * for this process, as its snapshot entry
-                                * for this process, as it's snapshot entry
-                                * has already recorded a kill.
-                                */
-                               assert(entry->killed && entry->jse_killtime);
-                               break;
-                       }
-
-                       /*
-                        * Update the entry we just found in the snapshot.
-                        */
-
-                       entry->killed       = kill_cause;
-                       entry->jse_killtime = killtime;
-                       entry->jse_gencount = snapshot->js_gencount;
-                       entry->jse_idle_delta = p->p_memstat_idle_delta;
-#if CONFIG_FREEZE
-                       entry->jse_thaw_count = p->p_memstat_thaw_count;
-#else /* CONFIG_FREEZE */
-                       entry->jse_thaw_count = 0;
-#endif /* CONFIG_FREEZE */
-
-                       /*
-                        * If a process has moved between bands since the snapshot was
-                        * initialized, then these fields have likely changed too.
-                        */
-                       if (entry->priority != p->p_memstat_effectivepriority) {
-                               strlcpy(entry->name, p->p_name, sizeof(entry->name));
-                               entry->priority  = p->p_memstat_effectivepriority;
-                               entry->state     = memorystatus_build_state(p);
-                               entry->user_data = p->p_memstat_userdata;
-                               entry->fds       = p->p_fd->fd_nfiles;
-                       }
-
-                       /*
-                        * Always update the page counts on a kill.
-                        */
-
-                       uint32_t pages              = 0;
-                       uint32_t max_pages_lifetime = 0;
-                       uint32_t purgeable_pages    = 0;
-
-                       memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages);
-                       entry->pages              = (uint64_t)pages;
-                       entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
-                       entry->purgeable_pages    = (uint64_t)purgeable_pages;
-
-                       uint64_t internal_pages                        = 0;
-                       uint64_t internal_compressed_pages             = 0;
-                       uint64_t purgeable_nonvolatile_pages           = 0;
-                       uint64_t purgeable_nonvolatile_compressed_pages = 0;
-                       uint64_t alternate_accounting_pages            = 0;
-                       uint64_t alternate_accounting_compressed_pages = 0;
-                       uint64_t iokit_mapped_pages                    = 0;
-                       uint64_t page_table_pages                      = 0;
-
-                       memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
-                           &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
-                           &alternate_accounting_pages, &alternate_accounting_compressed_pages,
-                           &iokit_mapped_pages, &page_table_pages);
-
-                       entry->jse_internal_pages = internal_pages;
-                       entry->jse_internal_compressed_pages = internal_compressed_pages;
-                       entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
-                       entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
-                       entry->jse_alternate_accounting_pages = alternate_accounting_pages;
-                       entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
-                       entry->jse_iokit_mapped_pages = iokit_mapped_pages;
-                       entry->jse_page_table_pages = page_table_pages;
-
-                       uint64_t region_count = 0;
-                       memorystatus_get_task_memory_region_count(p->task, &region_count);
-                       entry->jse_memory_region_count = region_count;
-
-                       goto exit;
-               }
-       }
-
-       if (entry == NULL) {
-               /*
-                * The entry was not found in the snapshot, so the process must have
-                * launched after the snapshot was initialized.
-                * Let's try to append the new entry.
-                */
-               if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) {
-                       /*
-                        * A populated snapshot buffer exists
-                        * and there is room to init a new entry.
-                        */
-                       assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count);
-
-                       unsigned int next = memorystatus_jetsam_snapshot_count;
-
-                       if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[next], (snapshot->js_gencount)) == TRUE) {
-                               entry = &snapshot_list[next];
-                               entry->killed       = kill_cause;
-                               entry->jse_killtime = killtime;
-
-                               snapshot->entry_count = ++next;
-                               memorystatus_jetsam_snapshot_count = next;
-
-                               if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) {
-                                       /*
-                                        * We just used the last slot in the snapshot buffer.
-                                        * We only want to log it once... so we do it here
-                                        * when we notice we've hit the max.
-                                        */
-                                       printf("memorystatus: WARNING snapshot buffer is full, count %d\n",
-                                           memorystatus_jetsam_snapshot_count);
-                               }
-                       }
-               }
-       }
-
-exit:
-       if (entry == NULL) {
-               /*
-                * If we reach here, the snapshot buffer could not be updated.
-                * Most likely, the buffer is full, in which case we would have
-                * logged a warning in the previous call.
-                *
-                * For now, we will stop appending snapshot entries.
-                * When the buffer is consumed, the snapshot state will reset.
-                */
-
-               MEMORYSTATUS_DEBUG(4, "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n",
-                   p->p_pid, p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count);
-       }
-
-       return;
-}
-
-#if CONFIG_JETSAM
-void
-memorystatus_pages_update(unsigned int pages_avail)
-{
-       memorystatus_available_pages = pages_avail;
-
-#if VM_PRESSURE_EVENTS
-       /*
-        * Since memorystatus_available_pages changes, we should
-        * re-evaluate the pressure levels on the system and
-        * check if we need to wake the pressure thread.
-        * We also update memorystatus_level in that routine.
-        */
-       vm_pressure_response();
-
-       if (memorystatus_available_pages <= memorystatus_available_pages_pressure) {
-               if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) {
-                       memorystatus_thread_wake();
-               }
-       }
-#if CONFIG_FREEZE
-       /*
-        * We can't grab the freezer_mutex here, even though that synchronization would be the correct
-        * way to inspect the number of frozen processes and wake up the freezer thread. We may enter
-        * this code with (possibly) the page-queue locks held and preemption disabled, so trying to
-        * grab a mutex here would result in the "mutex with preemption disabled" panic.
-        */
-
-       if (memorystatus_freeze_thread_should_run() == TRUE) {
-               /*
-                * The freezer thread is usually woken up by a user-space call, e.g. pid_hibernate(any process).
-                * That trigger isn't invoked often enough, so we add this explicit wakeup here.
-                */
-               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-                       thread_wakeup((event_t)&memorystatus_freeze_wakeup);
-               }
-       }
-#endif /* CONFIG_FREEZE */
-
-#else /* VM_PRESSURE_EVENTS */
-
-       boolean_t critical, delta;
-
-       if (!memorystatus_delta) {
-               return;
-       }
-
-       critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
-       delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
-           || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;
-
-       if (critical || delta) {
-               unsigned int total_pages;
-
-               total_pages = (unsigned int) atop_64(max_mem);
-#if CONFIG_SECLUDED_MEMORY
-               total_pages -= vm_page_secluded_count;
-#endif /* CONFIG_SECLUDED_MEMORY */
-               memorystatus_level = memorystatus_available_pages * 100 / total_pages;
-               memorystatus_thread_wake();
-       }
-#endif /* VM_PRESSURE_EVENTS */
-}
-#endif /* CONFIG_JETSAM */
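
Editorial aside with example numbers (illustrative, not from the source): the
!VM_PRESSURE_EVENTS branch above recomputes the memory level as a percentage
of usable pages before waking the jetsam thread.

    /*
     * Example: 4 GB device, 16 KB pages
     *   total_pages        = atop_64(4ULL << 30) = 262144
     *   pages_avail        = 52429
     *   memorystatus_level = 52429 * 100 / 262144 = 20   (~20% available)
     * A swing of memorystatus_delta pages in either direction, or falling
     * below memorystatus_available_pages_critical, wakes the thread.
     */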
-
-static boolean_t
-memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount)
-{
-       clock_sec_t                     tv_sec;
-       clock_usec_t                    tv_usec;
-       uint32_t pages = 0;
-       uint32_t max_pages_lifetime = 0;
-       uint32_t purgeable_pages = 0;
-       uint64_t internal_pages                         = 0;
-       uint64_t internal_compressed_pages              = 0;
-       uint64_t purgeable_nonvolatile_pages            = 0;
-       uint64_t purgeable_nonvolatile_compressed_pages = 0;
-       uint64_t alternate_accounting_pages             = 0;
-       uint64_t alternate_accounting_compressed_pages  = 0;
-       uint64_t iokit_mapped_pages                     = 0;
-       uint64_t page_table_pages                       = 0;
-       uint64_t region_count                           = 0;
-       uint64_t cids[COALITION_NUM_TYPES];
+       if (ret != 0) {
+               goto exit;
+       }
 
-       memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
+       if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
+               now_dirty = TRUE;
+       }
 
-       entry->pid = p->p_pid;
-       strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
-       entry->priority = p->p_memstat_effectivepriority;
+       if ((was_dirty == TRUE && now_dirty == FALSE) ||
+           (was_dirty == FALSE && now_dirty == TRUE)) {
+               /* Manage idle exit deferral, if applied */
+               if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
+                       /*
+                        * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
+                        * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
+                        * P_DIRTY_DEFER: one-time protection window given at launch
+                        * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode.
+                        *
+                        * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
+                        * in that band on its way to IDLE.
+                        */
 
-       memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages);
-       entry->pages              = (uint64_t)pages;
-       entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
-       entry->purgeable_pages    = (uint64_t)purgeable_pages;
+                       if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
+                               /*
+                                * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE"
+                                *
+                                * The process will move from its aging band to its higher requested
+                                * jetsam band.
+                                */
+                               boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
 
-       memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
-           &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
-           &alternate_accounting_pages, &alternate_accounting_compressed_pages,
-           &iokit_mapped_pages, &page_table_pages);
+                               memorystatus_invalidate_idle_demotion_locked(p, reset_state);
+                               reschedule = TRUE;
+                       } else {
+                               /*
+                                * Process is back from "dirty" to "clean".
+                                */
 
-       entry->jse_internal_pages = internal_pages;
-       entry->jse_internal_compressed_pages = internal_compressed_pages;
-       entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
-       entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
-       entry->jse_alternate_accounting_pages = alternate_accounting_pages;
-       entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
-       entry->jse_iokit_mapped_pages = iokit_mapped_pages;
-       entry->jse_page_table_pages = page_table_pages;
+                               if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) {
+                                       if (((p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) == FALSE) &&
+                                           (mach_absolute_time() >= p->p_memstat_idledeadline)) {
+                                               /*
+                                                * The process hasn't enrolled in the "always defer after dirty"
+                                                * mode and its deadline has expired. It currently
+                                                * does not reside in any of the aging buckets.
+                                                *
+                                                * It's on its way to the JETSAM_PRIORITY_IDLE
+                                                * bucket via memorystatus_update_idle_priority_locked()
+                                                * below.
+                                                *
+                                                * So all we need to do is reset all the state on the
+                                                * process that's related to the aging bucket i.e.
+                                                * the AGING_IN_PROGRESS flag and the timer deadline.
+                                                */
 
-       memorystatus_get_task_memory_region_count(p->task, &region_count);
-       entry->jse_memory_region_count = region_count;
+                                               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+                                               reschedule = TRUE;
+                                       } else {
+                                               /*
+                                                * The process enrolled in "always stop in deferral band after dirty",
+                                                * OR it still has some protection window left; so
+                                                * we just re-arm the timer without modifying any
+                                                * state on the process, provided it still wants into that band.
+                                                */
 
-       entry->state     = memorystatus_build_state(p);
-       entry->user_data = p->p_memstat_userdata;
-       memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid));
-       entry->fds       = p->p_fd->fd_nfiles;
+                                               if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
+                                                       memorystatus_schedule_idle_demotion_locked(p, TRUE);
+                                                       reschedule = TRUE;
+                                               } else if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) {
+                                                       memorystatus_schedule_idle_demotion_locked(p, FALSE);
+                                                       reschedule = TRUE;
+                                               }
+                                       }
+                               } else {
+                                       memorystatus_schedule_idle_demotion_locked(p, TRUE);
+                                       reschedule = TRUE;
+                               }
+                       }
+               }
 
-       absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
-       entry->cpu_time.tv_sec = (int64_t)tv_sec;
-       entry->cpu_time.tv_usec = (int64_t)tv_usec;
+               memorystatus_update_idle_priority_locked(p);
 
-       assert(p->p_stats != NULL);
-       entry->jse_starttime =  p->p_stats->ps_start;   /* abstime process started */
-       entry->jse_killtime = 0;                        /* abstime jetsam chose to kill process */
-       entry->killed       = 0;                        /* the jetsam kill cause */
-       entry->jse_gencount = gencount;                 /* indicates a pass through jetsam thread, when process was targeted to be killed */
+               if (memorystatus_highwater_enabled) {
+                       boolean_t ledger_update_needed = TRUE;
+                       boolean_t use_active;
+                       boolean_t is_fatal;
+                       /*
+                        * We are in this path because this process transitioned between
+                        * dirty <--> clean state.  Update the cached memory limits.
+                        */
 
-       entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */
+                       if (proc_jetsam_state_is_active_locked(p) == TRUE) {
+                               /*
+                                * process is pinned in elevated band
+                                * or
+                                * process is dirty
+                                */
+                               CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
+                               use_active = TRUE;
+                               ledger_update_needed = TRUE;
+                       } else {
+                               /*
+                                * process is clean... but if it has opted into pressured-exit,
+                                * we don't apply the INACTIVE limit until the process has aged
+                                * out and is entering the IDLE band.
+                                * See memorystatus_update_priority_locked() for that.
+                                */
 
-#if CONFIG_FREEZE
-       entry->jse_thaw_count = p->p_memstat_thaw_count;
-#else /* CONFIG_FREEZE */
-       entry->jse_thaw_count = 0;
-#endif /* CONFIG_FREEZE */
+                               if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
+                                       ledger_update_needed = FALSE;
+                               } else {
+                                       CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
+                                       use_active = FALSE;
+                                       ledger_update_needed = TRUE;
+                               }
+                       }
 
-       proc_coalitionids(p, cids);
-       entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
+                       /*
+                        * Enforce the new limits by writing to the ledger.
+                        *
+                        * This is a hot path, and holding the proc_list_lock while writing to the ledgers
+                        * (where the task lock is taken) is bad.  So we temporarily drop the proc_list_lock.
+                        * We aren't traversing the jetsam bucket list here, so we should be safe.
+                        * See rdar://21394491.
+                        */
 
-       return TRUE;
-}
+                       if (ledger_update_needed && proc_ref_locked(p) == p) {
+                               int ledger_limit;
+                               if (p->p_memstat_memlimit > 0) {
+                                       ledger_limit = p->p_memstat_memlimit;
+                               } else {
+                                       ledger_limit = -1;
+                               }
+                               proc_list_unlock();
+                               task_set_phys_footprint_limit_internal(p->task, ledger_limit, NULL, use_active, is_fatal);
+                               proc_list_lock();
+                               proc_rele_locked(p);
 
-static void
-memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
-{
-       kern_return_t kr = KERN_SUCCESS;
-       mach_msg_type_number_t  count = HOST_VM_INFO64_COUNT;
-       vm_statistics64_data_t  vm_stat;
+                               MEMORYSTATUS_DEBUG(3, "memorystatus_dirty_set: new limit on pid %d (%dMB %s) priority(%d) dirty?=0x%x %s\n",
+                                   p->p_pid, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
+                                   (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
+                                   (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
+                       }
+               }
 
-       if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
-               printf("memorystatus_init_snapshot_vmstats: host_statistics64 failed with %d\n", kr);
-               memset(&snapshot->stats, 0, sizeof(snapshot->stats));
-       } else {
-               snapshot->stats.free_pages      = vm_stat.free_count;
-               snapshot->stats.active_pages    = vm_stat.active_count;
-               snapshot->stats.inactive_pages  = vm_stat.inactive_count;
-               snapshot->stats.throttled_pages = vm_stat.throttled_count;
-               snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
-               snapshot->stats.wired_pages     = vm_stat.wire_count;
+               /* If the deferral state changed, reschedule the demotion timer */
+               if (reschedule) {
+                       memorystatus_reschedule_idle_demotion_locked();
+               }
+       }
 
-               snapshot->stats.speculative_pages = vm_stat.speculative_count;
-               snapshot->stats.filebacked_pages  = vm_stat.external_page_count;
-               snapshot->stats.anonymous_pages   = vm_stat.internal_page_count;
-               snapshot->stats.compressions      = vm_stat.compressions;
-               snapshot->stats.decompressions    = vm_stat.decompressions;
-               snapshot->stats.compressor_pages  = vm_stat.compressor_page_count;
-               snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
+       if (kill) {
+               if (proc_ref_locked(p) == p) {
+                       proc_list_unlock();
+                       psignal(p, SIGKILL);
+                       proc_list_lock();
+                       proc_rele_locked(p);
+               }
        }
 
-       get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity);
-       get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name),
-           &snapshot->stats.largest_zone_size);
-}
+exit:
+       proc_list_unlock();
 
-/*
- * Collect vm statistics at boot.
- * Called only once (see kern_exec.c)
- * Data can be consumed at any time.
- */
-void
-memorystatus_init_at_boot_snapshot()
-{
-       memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
-       memorystatus_at_boot_snapshot.entry_count = 0;
-       memorystatus_at_boot_snapshot.notification_time = 0;   /* updated when consumed */
-       memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
+       return ret;
 }
 
-static void
-memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count )
+int
+memorystatus_dirty_clear(proc_t p, uint32_t pcontrol)
 {
-       proc_t p, next_p;
-       unsigned int b = 0, i = 0;
+       int ret = 0;
 
-       memorystatus_jetsam_snapshot_t *snapshot = NULL;
-       memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
-       unsigned int snapshot_max = 0;
+       MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty);
 
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0);
 
-       if (od_snapshot) {
-               /*
-                * This is an on_demand snapshot
-                */
-               snapshot      = od_snapshot;
-               snapshot_list = od_snapshot->entries;
-               snapshot_max  = ods_list_count;
-       } else {
+       proc_list_lock();
+
+       if ((p->p_listflag & P_LIST_EXITED) != 0) {
                /*
-                * This is a jetsam event snapshot
-                */
-               snapshot      = memorystatus_jetsam_snapshot;
-               snapshot_list = memorystatus_jetsam_snapshot->entries;
-               snapshot_max  = memorystatus_jetsam_snapshot_max;
+                * Process is on its way out.
+                */
+               ret = EBUSY;
+               goto exit;
        }
 
-       /*
-        * Init the snapshot header information
-        */
-       memorystatus_init_snapshot_vmstats(snapshot);
-       snapshot->snapshot_time = mach_absolute_time();
-       snapshot->notification_time = 0;
-       snapshot->js_gencount = 0;
+       if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
+               ret = EPERM;
+               goto exit;
+       }
 
-       next_p = memorystatus_get_first_proc_locked(&b, TRUE);
-       while (next_p) {
-               p = next_p;
-               next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);
+       if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
+               /* Dirty tracking not enabled */
+               ret = EINVAL;
+               goto exit;
+       }
 
-               if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) {
-                       continue;
-               }
+       if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) {
+               ret = EINVAL;
+               goto exit;
+       }
 
-               MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
-                   p->p_pid,
-                   p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7],
-                   p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]);
+       if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
+               p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
+       }
 
-               if (++i == snapshot_max) {
-                       break;
+       /* This can be set and cleared exactly once. */
+       if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
+               if (p->p_memstat_dirty & P_DIRTY_DEFER) {
+                       p->p_memstat_dirty &= ~(P_DIRTY_DEFER);
                }
-       }
 
-       snapshot->entry_count = i;
+               if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
+                       p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS);
+               }
 
-       if (!od_snapshot) {
-               /* update the system buffer count */
-               memorystatus_jetsam_snapshot_count = i;
+               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+               memorystatus_update_idle_priority_locked(p);
+               memorystatus_reschedule_idle_demotion_locked();
        }
-}
 
-#if DEVELOPMENT || DEBUG
+       ret = 0;
+exit:
+       proc_list_unlock();
 
-#if CONFIG_JETSAM
-static int
-memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size)
+       return ret;
+}
+
+int
+memorystatus_dirty_get(proc_t p, boolean_t locked)
 {
-       int ret;
-       memorystatus_jetsam_panic_options_t debug;
+       int ret = 0;
 
-       if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) {
-               return EINVAL;
+       if (!locked) {
+               proc_list_lock();
        }
 
-       ret = copyin(buffer, &debug, buffer_size);
-       if (ret) {
-               return ret;
+       if (p->p_memstat_dirty & P_DIRTY_TRACK) {
+               ret |= PROC_DIRTY_TRACKED;
+               if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
+                       ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
+               }
+               if (p->p_memstat_dirty & P_DIRTY) {
+                       ret |= PROC_DIRTY_IS_DIRTY;
+               }
+               if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
+                       ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
+               }
        }
 
-       /* Panic bits match kMemorystatusKilled* enum */
-       memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask);
-
-       /* Copyout new value */
-       debug.data = memorystatus_jetsam_panic_debug;
-       ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t));
+       if (!locked) {
+               proc_list_unlock();
+       }
 
        return ret;
 }
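
Illustrative caller-side sketch (not part of this commit): decoding the
bitmask that memorystatus_dirty_get() returns; the PROC_DIRTY_* names are the
flags tested above.

    uint32_t flags = memorystatus_dirty_get(p, FALSE /* not already locked */);

    if (flags & PROC_DIRTY_TRACKED) {
        boolean_t allows_idle_exit = (flags & PROC_DIRTY_ALLOWS_IDLE_EXIT) != 0;
        boolean_t is_dirty         = (flags & PROC_DIRTY_IS_DIRTY) != 0;

        /* A tracked, clean process that allows idle exit is a candidate
         * for idle-exit kill once its deferral window expires. */
        if (allows_idle_exit && !is_dirty) {
            /* ... */
        }
    }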
-#endif /* CONFIG_JETSAM */
 
-/*
- * Triggers a sort_order on a specified jetsam priority band.
- * This is for testing only, used to force a path through the sort
- * function.
- */
-static int
-memorystatus_cmd_test_jetsam_sort(int priority, int sort_order)
+int
+memorystatus_on_terminate(proc_t p)
 {
-       int error = 0;
+       int sig;
 
-       unsigned int bucket_index = 0;
+       proc_list_lock();
 
-       if (priority == -1) {
-               /* Use as shorthand for default priority */
-               bucket_index = JETSAM_PRIORITY_DEFAULT;
+       p->p_memstat_dirty |= P_DIRTY_TERMINATED;
+
+       if ((p->p_memstat_dirty & (P_DIRTY_TRACK | P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) {
+               /* Clean; mark as terminated and issue SIGKILL */
+               sig = SIGKILL;
        } else {
-               bucket_index = (unsigned int)priority;
+               /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
+               sig = SIGTERM;
        }
 
-       error = memorystatus_sort_bucket(bucket_index, sort_order);
+       proc_list_unlock();
 
-       return error;
+       return sig;
 }
 
-#endif /* DEVELOPMENT || DEBUG */
-
-/*
- * Prepare the process to be killed (set state, update snapshot) and kill it.
- */
-static uint64_t memorystatus_purge_before_jetsam_success = 0;
-
-static boolean_t
-memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, boolean_t *killed)
+void
+memorystatus_on_suspend(proc_t p)
 {
-       pid_t aPid = 0;
-       uint32_t aPid_ep = 0;
+#if CONFIG_FREEZE
+       uint32_t pages;
+       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
+#endif
+       proc_list_lock();
+#if CONFIG_FREEZE
+       memorystatus_suspended_count++;
+#endif
+       p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
+       proc_list_unlock();
+}
 
-       uint64_t        killtime = 0;
-       clock_sec_t     tv_sec;
-       clock_usec_t    tv_usec;
-       uint32_t        tv_msec;
-       boolean_t       retval = FALSE;
-       uint64_t        num_pages_purged = 0;
+void
+memorystatus_on_resume(proc_t p)
+{
+#if CONFIG_FREEZE
+       boolean_t frozen;
+       pid_t pid;
+#endif
 
-       aPid = p->p_pid;
-       aPid_ep = p->p_memstat_effectivepriority;
+       proc_list_lock();
 
-       if (cause != kMemorystatusKilledVnodes && cause != kMemorystatusKilledZoneMapExhaustion) {
+#if CONFIG_FREEZE
+       frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
+       if (frozen) {
                /*
-                * Genuine memory pressure and not other (vnode/zone) resource exhaustion.
+                * Now that we don't _thaw_ a process completely,
+                * resuming it (and having some on-demand swapins)
+                * shouldn't preclude it from being counted as frozen.
+                *
+                * memorystatus_frozen_count--;
+                *
+                * We preserve the P_MEMSTAT_FROZEN state since the process
+                * could have state on disk AND so will deserve some protection
+                * in the jetsam bands.
                 */
-               boolean_t success = FALSE;
+               if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) {
+                       p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE;
+                       memorystatus_refreeze_eligible_count++;
+               }
+               p->p_memstat_thaw_count++;
 
-               networking_memstatus_callout(p, cause);
-               num_pages_purged = vm_purgeable_purge_task_owned(p->task);
+               memorystatus_thaw_count++;
+       }
 
-               if (num_pages_purged) {
-                       /*
-                        * We actually purged something and so let's
-                        * check if we need to continue with the kill.
-                        */
-                       if (cause == kMemorystatusKilledHiwat) {
-                               uint64_t footprint_in_bytes = get_task_phys_footprint(p->task);
-                               uint64_t memlimit_in_bytes  = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL);  /* convert MB to bytes */
-                               success = (footprint_in_bytes <= memlimit_in_bytes);
-                       } else {
-                               success = (memorystatus_avail_pages_below_pressure() == FALSE);
-                       }
+       memorystatus_suspended_count--;
 
-                       if (success) {
-                               memorystatus_purge_before_jetsam_success++;
+       pid = p->p_pid;
+#endif
+
+       /*
+        * P_MEMSTAT_FROZEN will remain unchanged. This used to be:
+        * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
+        */
+       p->p_memstat_state &= ~P_MEMSTAT_SUSPENDED;
 
-                               os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: purged %llu pages from pid %d [%s] and avoided %s\n",
-                                   num_pages_purged, aPid, (*p->p_name ? p->p_name : "unknown"), memorystatus_kill_cause_name[cause]);
+       proc_list_unlock();
 
-                               *killed = FALSE;
+#if CONFIG_FREEZE
+       if (frozen) {
+               memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
+               memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
+       }
+#endif
+}
 
-                               return TRUE;
-                       }
-               }
+void
+memorystatus_on_inactivity(proc_t p)
+{
+#pragma unused(p)
+#if CONFIG_FREEZE
+       /* Wake the freeze thread */
+       thread_wakeup((event_t)&memorystatus_freeze_wakeup);
+#endif
+}
+
+/*
+ * The proc_list_lock is held by the caller.
+ */
+static uint32_t
+memorystatus_build_state(proc_t p)
+{
+       uint32_t snapshot_state = 0;
+
+       /* General */
+       if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
+               snapshot_state |= kMemorystatusSuspended;
+       }
+       if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
+               snapshot_state |= kMemorystatusFrozen;
+       }
+       if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
+               snapshot_state |= kMemorystatusWasThawed;
+       }
+       if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
+               snapshot_state |= kMemorystatusAssertion;
+       }
+
+       /* Tracking */
+       if (p->p_memstat_dirty & P_DIRTY_TRACK) {
+               snapshot_state |= kMemorystatusTracked;
+       }
+       if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
+               snapshot_state |= kMemorystatusSupportsIdleExit;
+       }
+       if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
+               snapshot_state |= kMemorystatusDirty;
        }
 
-#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
-       MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %lld Mb > 1 (%d Mb)\n",
-           (memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing",
-           aPid, (*p->p_name ? p->p_name : "unknown"),
-           (footprint_in_bytes / (1024ULL * 1024ULL)),                 /* converted bytes to MB */
-           p->p_memstat_memlimit);
-#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
+       return snapshot_state;
+}
 
-       killtime = mach_absolute_time();
-       absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
-       tv_msec = tv_usec / 1000;
+static boolean_t
+kill_idle_exit_proc(void)
+{
+       proc_t p, victim_p = PROC_NULL;
+       uint64_t current_time, footprint_of_killed_proc;
+       boolean_t killed = FALSE;
+       unsigned int i = 0;
+       os_reason_t jetsam_reason = OS_REASON_NULL;
 
-#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
-       if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
-               if (cause == kMemorystatusKilledHiwat) {
-                       MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] for diagnosis - memorystatus_available_pages: %d\n",
-                           aPid, (*p->p_name ? p->p_name: "(unknown)"), memorystatus_available_pages);
-               } else {
-                       int activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
-                       if (activeProcess) {
-                               MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memorystatus_available_pages: %d\n",
-                                   aPid, (*p->p_name ? p->p_name: "(unknown)"), memorystatus_available_pages);
-
-                               if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) {
-                                       jetsam_diagnostic_suspended_one_active_proc = 1;
-                                       printf("jetsam: returning after suspending first active proc - %d\n", aPid);
-                               }
-                       }
-               }
+       /* Pick next idle exit victim. */
+       current_time = mach_absolute_time();
 
-               proc_list_lock();
-               /* This diagnostic code is going away soon. Ignore the kMemorystatusInvalid cause here. */
-               memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusInvalid, killtime);
-               proc_list_unlock();
+       jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT);
+       if (jetsam_reason == OS_REASON_NULL) {
+               printf("kill_idle_exit_proc: failed to allocate jetsam reason\n");
+       }
 
-               p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
+       proc_list_lock();
 
-               if (p) {
-                       task_suspend(p->task);
-                       *killed = TRUE;
+       p = memorystatus_get_first_proc_locked(&i, FALSE);
+       while (p) {
+               /* No need to look beyond the idle band */
+               if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
+                       break;
                }
-       } else
-#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
-       {
-               proc_list_lock();
-               memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
-               proc_list_unlock();
 
-               char kill_reason_string[128];
-
-               if (cause == kMemorystatusKilledHiwat) {
-                       strlcpy(kill_reason_string, "killing_highwater_process", 128);
-               } else {
-                       if (aPid_ep == JETSAM_PRIORITY_IDLE) {
-                               strlcpy(kill_reason_string, "killing_idle_process", 128);
-                       } else {
-                               strlcpy(kill_reason_string, "killing_top_process", 128);
+               if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
+                       if (current_time >= p->p_memstat_idledeadline) {
+                               p->p_memstat_dirty |= P_DIRTY_TERMINATED;
+                               victim_p = proc_ref_locked(p);
+                               break;
                        }
                }
 
-               os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: %s pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
-                   (unsigned long)tv_sec, tv_msec, kill_reason_string,
-                   aPid, (*p->p_name ? p->p_name : "unknown"),
-                   memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages);
-
-               /*
-                * memorystatus_do_kill drops a reference, so take another one so we can
-                * continue to use this exit reason even after memorystatus_do_kill()
-                * returns
-                */
-               os_reason_ref(jetsam_reason);
+               p = memorystatus_get_next_proc_locked(&i, p, FALSE);
+       }
 
-               retval = memorystatus_do_kill(p, cause, jetsam_reason);
+       proc_list_unlock();
 
-               *killed = retval;
+       if (victim_p) {
+               printf("memorystatus: killing_idle_process pid %d [%s] jetsam_reason->osr_code: %llu\n", victim_p->p_pid, (*victim_p->p_name ? victim_p->p_name : "unknown"), jetsam_reason->osr_code);
+               killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason, &footprint_of_killed_proc);
+               proc_rele(victim_p);
+       } else {
+               os_reason_free(jetsam_reason);
        }
 
-       return retval;
+       return killed;
 }
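
/*
 * Illustrative sketch (not part of the change): the victim-selection rule in
 * kill_idle_exit_proc() reduced to plain C. A process is eligible only when
 * it allows idle exit, is currently clean, is not already terminating, and
 * its idle deadline has passed. The names, flag values, and candidate array
 * are invented for the example.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_ALLOW_IDLE_EXIT 0x1
#define EX_IS_DIRTY        0x2
#define EX_TERMINATED      0x4

struct ex_proc {
        int      pid;
        uint32_t dirty_flags;
        uint64_t idle_deadline;
};

static int
pick_idle_exit_victim(struct ex_proc *band, int count, uint64_t now)
{
        for (int i = 0; i < count; i++) {
                uint32_t f = band[i].dirty_flags &
                    (EX_ALLOW_IDLE_EXIT | EX_IS_DIRTY | EX_TERMINATED);
                /* Eligible: allows idle exit, and neither dirty nor terminated. */
                if (f == EX_ALLOW_IDLE_EXIT && now >= band[i].idle_deadline) {
                        return band[i].pid;
                }
        }
        return -1; /* no eligible victim in the idle band */
}

int
main(void)
{
        struct ex_proc band[] = {
                { 100, EX_ALLOW_IDLE_EXIT | EX_IS_DIRTY, 50 },  /* dirty: skipped */
                { 101, EX_ALLOW_IDLE_EXIT, 200 },               /* deadline not reached */
                { 102, EX_ALLOW_IDLE_EXIT, 90 },                /* eligible */
        };
        printf("victim pid: %d\n", pick_idle_exit_victim(band, 3, 120)); /* 102 */
        return 0;
}
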
 
-/*
- * Jetsam the first process in the queue.
- */
-static boolean_t
-memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason,
-    int32_t *priority, uint32_t *errors)
+static void
+memorystatus_thread_wake(void)
 {
-       pid_t aPid;
-       proc_t p = PROC_NULL, next_p = PROC_NULL;
-       boolean_t new_snapshot = FALSE, force_new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE;
-       unsigned int i = 0;
-       uint32_t aPid_ep;
-       int32_t         local_max_kill_prio = JETSAM_PRIORITY_IDLE;
-
-#ifndef CONFIG_FREEZE
-#pragma unused(any)
-#endif
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
-           memorystatus_available_pages, 0, 0, 0, 0);
-
+       int thr_id = 0;
+       int active_thr = atomic_load(&active_jetsam_threads);
 
-#if CONFIG_JETSAM
-       if (sort_flag == TRUE) {
-               (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
+       /* Wakeup all the jetsam threads */
+       for (thr_id = 0; thr_id < active_thr; thr_id++) {
+               thread_wakeup((event_t)&jetsam_threads[thr_id].memorystatus_wakeup);
        }
+}
 
-       local_max_kill_prio = max_kill_priority;
-
-       force_new_snapshot = FALSE;
-
-#else /* CONFIG_JETSAM */
-
-       if (sort_flag == TRUE) {
-               (void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT);
-       }
+#if CONFIG_JETSAM
 
-       /*
-        * On macos, we currently only have 2 reasons to be here:
-        *
-        * kMemorystatusKilledZoneMapExhaustion
-        * AND
-        * kMemorystatusKilledVMCompressorSpaceShortage
-        *
-        * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider
-        * any and all processes as eligible kill candidates since we need to avoid a panic.
-        *
-        * Since this function can be called async. it is harder to toggle the max_kill_priority
-        * value before and after a call. And so we use this local variable to set the upper band
-        * on the eligible kill bands.
-        */
-       if (cause == kMemorystatusKilledZoneMapExhaustion) {
-               local_max_kill_prio = JETSAM_PRIORITY_MAX;
-       } else {
-               local_max_kill_prio = max_kill_priority;
-       }
+static void
+memorystatus_thread_pool_max()
+{
+       /* Increase the jetsam thread pool to max_jetsam_threads */
+       int max_threads = max_jetsam_threads;
+       printf("Expanding memorystatus pool to %d!\n", max_threads);
+       atomic_store(&active_jetsam_threads, max_threads);
+}
 
-       /*
-        * And, because we are here under extreme circumstances, we force a snapshot even for
-        * IDLE kills.
-        */
-       force_new_snapshot = TRUE;
+static void
+memorystatus_thread_pool_default()
+{
+       /* Restore the jetsam thread pool to a single thread */
+       printf("Reverting memorystatus pool back to 1\n");
+       atomic_store(&active_jetsam_threads, 1);
+}
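
/*
 * Illustrative sketch (not part of the change): the pool-sizing pattern used
 * above, expressed with portable C11 atomics. Writers publish the active
 * count with atomic_store(); the waker samples it with atomic_load(), so a
 * concurrent resize is observed as either the old or the new count, never a
 * torn value. The constants and names are invented for the example.
 */
#include <stdatomic.h>
#include <stdio.h>

#define EX_MAX_THREADS 3

static atomic_int ex_active_threads = 1;

static void
ex_pool_max(void)
{
        atomic_store(&ex_active_threads, EX_MAX_THREADS);
}

static void
ex_pool_default(void)
{
        atomic_store(&ex_active_threads, 1);
}

static void
ex_wake_all(void)
{
        int active = atomic_load(&ex_active_threads);
        for (int i = 0; i < active; i++) {
                printf("wake worker %d\n", i); /* stands in for thread_wakeup() */
        }
}

int
main(void)
{
        ex_pool_max();
        ex_wake_all();     /* wakes workers 0..2 */
        ex_pool_default();
        ex_wake_all();     /* wakes worker 0 only */
        return 0;
}
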
 
 #endif /* CONFIG_JETSAM */
 
-       proc_list_lock();
-
-       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
-       while (next_p && (next_p->p_memstat_effectivepriority <= local_max_kill_prio)) {
-#if DEVELOPMENT || DEBUG
-               int procSuspendedForDiagnosis;
-#endif /* DEVELOPMENT || DEBUG */
+extern void vm_pressure_response(void);
 
-               p = next_p;
-               next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
+static int
+memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
+{
+       struct jetsam_thread_state *jetsam_thread = jetsam_current_thread();
 
-#if DEVELOPMENT || DEBUG
-               procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
-#endif /* DEVELOPMENT || DEBUG */
+       assert(jetsam_thread != NULL);
+       if (interval_ms) {
+               assert_wait_timeout(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT, interval_ms, NSEC_PER_MSEC);
+       } else {
+               assert_wait(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT);
+       }
 
-               aPid = p->p_pid;
-               aPid_ep = p->p_memstat_effectivepriority;
+       return thread_block(continuation);
+}
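
/*
 * Illustrative sketch (not part of the change): a rough userspace analog of
 * the assert_wait_timeout()/assert_wait() + thread_block() pairing above,
 * using a pthread condition variable. interval_ms == 0 means "wait until
 * explicitly woken", mirroring the kernel helper. This is only an analog:
 * Mach wait events and continuations have no direct pthread equivalent, and
 * all names here are invented for the example.
 */
#include <pthread.h>
#include <time.h>

static pthread_mutex_t ex_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  ex_wakeup = PTHREAD_COND_INITIALIZER;

static int
ex_thread_block(unsigned int interval_ms)
{
        int rc;

        pthread_mutex_lock(&ex_lock);
        if (interval_ms) {
                struct timespec deadline;
                clock_gettime(CLOCK_REALTIME, &deadline);
                deadline.tv_sec += interval_ms / 1000;
                deadline.tv_nsec += (long)(interval_ms % 1000) * 1000000L;
                if (deadline.tv_nsec >= 1000000000L) {
                        deadline.tv_sec += 1;
                        deadline.tv_nsec -= 1000000000L;
                }
                rc = pthread_cond_timedwait(&ex_wakeup, &ex_lock, &deadline);
        } else {
                rc = pthread_cond_wait(&ex_wakeup, &ex_lock);
        }
        pthread_mutex_unlock(&ex_lock);
        return rc; /* 0 on wakeup, ETIMEDOUT on timeout */
}

static void *
ex_waker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&ex_lock);
        pthread_cond_signal(&ex_wakeup);
        pthread_mutex_unlock(&ex_lock);
        return NULL;
}

int
main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, ex_waker, NULL);
        /* Woken early, or (if the signal raced ahead of the wait) times out after 1s. */
        ex_thread_block(1000);
        pthread_join(t, NULL);
        return 0;
}
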
 
-               if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
-                       continue;   /* with lock held */
-               }
+static boolean_t
+memorystatus_avail_pages_below_pressure(void)
+{
+#if CONFIG_EMBEDDED
+       /*
+        * Instead of CONFIG_EMBEDDED for these *avail_pages* routines, we should
+        * key off of the system having dynamic swap support. With full swap support,
+        * the system shouldn't really need to worry about various page thresholds.
+        */
+       return memorystatus_available_pages <= memorystatus_available_pages_pressure;
+#else /* CONFIG_EMBEDDED */
+       return FALSE;
+#endif /* CONFIG_EMBEDDED */
+}
 
-#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
-               if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
-                       printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
-                       continue;
-               }
-#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
+static boolean_t
+memorystatus_avail_pages_below_critical(void)
+{
+#if CONFIG_EMBEDDED
+       return memorystatus_available_pages <= memorystatus_available_pages_critical;
+#else /* CONFIG_EMBEDDED */
+       return FALSE;
+#endif /* CONFIG_EMBEDDED */
+}
 
-               if (cause == kMemorystatusKilledVnodes) {
-                       /*
-                        * If the system runs out of vnodes, we systematically jetsam
-                        * processes in hopes of stumbling onto a vnode gain that helps
-                        * the system recover.  The process that happens to trigger
-                        * this path has no known relationship to the vnode shortage.
-                        * Deadlock avoidance: attempt to safeguard the caller.
-                        */
+static boolean_t
+memorystatus_post_snapshot(int32_t priority, uint32_t cause)
+{
+       boolean_t is_idle_priority;
 
-                       if (p == current_proc()) {
-                               /* do not jetsam the current process */
-                               continue;
-                       }
-               }
+       if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) {
+               is_idle_priority = (priority == JETSAM_PRIORITY_IDLE);
+       } else {
+               is_idle_priority = (priority == JETSAM_PRIORITY_IDLE || priority == JETSAM_PRIORITY_IDLE_DEFERRED);
+       }
+#if CONFIG_EMBEDDED
+#pragma unused(cause)
+       /*
+        * Don't generate logs for steady-state idle-exit kills,
+        * unless it is overridden for debug or by the device
+        * tree.
+        */
 
-#if CONFIG_FREEZE
-               boolean_t skip;
-               boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED);
-               if (any || reclaim_proc) {
-                       skip = FALSE;
-               } else {
-                       skip = TRUE;
-               }
+       return !is_idle_priority || memorystatus_idle_snapshot;
 
-               if (skip) {
-                       continue;
-               } else
-#endif
-               {
-                       if (proc_ref_locked(p) == p) {
-                               /*
-                                * Mark as terminated so that if exit1() indicates success, but the process (for example)
-                                * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
-                                * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
-                                * acquisition of the proc lock.
-                                */
-                               p->p_memstat_state |= P_MEMSTAT_TERMINATED;
-                       } else {
-                               /*
-                                * We need to restart the search again because
-                                * proc_ref_locked _can_ drop the proc_list lock
-                                * and we could have lost our stored next_p via
-                                * an exit() on another core.
-                                */
-                               i = 0;
-                               next_p = memorystatus_get_first_proc_locked(&i, TRUE);
-                               continue;
-                       }
+#else /* CONFIG_EMBEDDED */
+       /*
+        * Don't generate logs for steady-state idle-exit kills, unless:
+        * - it is overridden for debug or by the device tree, OR
+        * - the kill cause is important, i.e. not kMemorystatusKilledIdleExit
+        */
 
-                       /*
-                        * Capture a snapshot if none exists and:
-                        * - we are forcing a new snapshot creation, either because:
-                        *      - on a particular platform we need these snapshots every time, OR
-                        *      - a boot-arg/embedded device tree property has been set.
-                        * - priority was not requested (this is something other than an ambient kill)
-                        * - the priority was requested *and* the targeted process is not at idle priority
-                        */
-                       if ((memorystatus_jetsam_snapshot_count == 0) &&
-                           (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) {
-                               memorystatus_init_jetsam_snapshot_locked(NULL, 0);
-                               new_snapshot = TRUE;
-                       }
+       boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause));
+       return !is_idle_priority || memorystatus_idle_snapshot || snapshot_eligible_kill_cause;
+#endif /* CONFIG_EMBEDDED */
+}
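
/*
 * Illustrative sketch (not part of the change): the desktop-side decision in
 * memorystatus_post_snapshot() as a standalone predicate. The boolean
 * parameters below are stand-ins for memorystatus_idle_snapshot and the
 * is_reason_thrashing()/is_reason_zone_map_exhaustion() checks.
 */
#include <stdbool.h>
#include <stdio.h>

static bool
ex_post_snapshot(bool is_idle_priority, bool idle_snapshot_override,
    bool snapshot_eligible_cause)
{
        /* Post unless this is a steady-state idle kill with nothing overriding it. */
        return !is_idle_priority || idle_snapshot_override || snapshot_eligible_cause;
}

int
main(void)
{
        /* Idle-exit kill, no override, mundane cause: suppressed. */
        printf("%d\n", ex_post_snapshot(true, false, false));  /* 0 */
        /* Idle-exit kill, but the cause is thrashing: posted. */
        printf("%d\n", ex_post_snapshot(true, false, true));   /* 1 */
        /* Non-idle kill: always posted. */
        printf("%d\n", ex_post_snapshot(false, false, false)); /* 1 */
        return 0;
}
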
 
-                       proc_list_unlock();
+static boolean_t
+memorystatus_action_needed(void)
+{
+#if CONFIG_EMBEDDED
+       return is_reason_thrashing(kill_under_pressure_cause) ||
+              is_reason_zone_map_exhaustion(kill_under_pressure_cause) ||
+              memorystatus_available_pages <= memorystatus_available_pages_pressure;
+#else /* CONFIG_EMBEDDED */
+       return is_reason_thrashing(kill_under_pressure_cause) ||
+              is_reason_zone_map_exhaustion(kill_under_pressure_cause);
+#endif /* CONFIG_EMBEDDED */
+}
 
-                       freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed); /* purged and/or killed 'p' */
-                       /* Success? */
-                       if (freed_mem) {
-                               if (killed) {
-                                       if (priority) {
-                                               *priority = aPid_ep;
-                                       }
-                               } else {
-                                       /* purged */
-                                       proc_list_lock();
-                                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
-                                       proc_list_unlock();
-                               }
-                               proc_rele(p);
-                               goto exit;
-                       }
+static boolean_t
+memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, boolean_t *post_snapshot, __unused boolean_t *is_critical, uint64_t *memory_reclaimed)
+{
+       boolean_t purged = FALSE, killed = FALSE;
 
-                       /*
-                        * Failure - first unwind the state,
-                        * then fall through to restart the search.
-                        */
-                       proc_list_lock();
-                       proc_rele_locked(p);
-                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
-                       p->p_memstat_state |= P_MEMSTAT_ERROR;
-                       *errors += 1;
+       *memory_reclaimed = 0;
+       killed = memorystatus_kill_hiwat_proc(errors, &purged, memory_reclaimed);
 
-                       i = 0;
-                       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+       if (killed) {
+               *hwm_kill = *hwm_kill + 1;
+               *post_snapshot = TRUE;
+               return TRUE;
+       } else {
+               if (purged == FALSE) {
+                       /* couldn't purge and couldn't kill */
+                       memorystatus_hwm_candidates = FALSE;
                }
        }
 
-       proc_list_unlock();
+#if CONFIG_JETSAM
+       /* No highwater processes to kill. Continue or stop for now? */
+       if (!is_reason_thrashing(kill_under_pressure_cause) &&
+           !is_reason_zone_map_exhaustion(kill_under_pressure_cause) &&
+           (memorystatus_available_pages > memorystatus_available_pages_critical)) {
+               /*
+                * We are _not_ out of pressure but we are above the critical threshold and there's:
+                * - no compressor thrashing
+                * - enough zone memory
+                * - no more HWM processes left.
+                * For now, don't kill any other processes.
+                */
 
-exit:
-       os_reason_free(jetsam_reason);
+               if (*hwm_kill == 0) {
+                       memorystatus_thread_wasted_wakeup++;
+               }
 
-       /* Clear snapshot if freshly captured and no target was found */
-       if (new_snapshot && !killed) {
-               proc_list_lock();
-               memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
-               proc_list_unlock();
-       }
+               *is_critical = FALSE;
 
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
-           memorystatus_available_pages, killed ? aPid : 0, 0, 0, 0);
+               return TRUE;
+       }
+#endif /* CONFIG_JETSAM */
 
-       return killed;
+       return FALSE;
 }
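
/*
 * Illustrative sketch (not part of the change): the control flow of the
 * highwater pass above, reduced to a standalone decision. EX_DONE_NOT_CRITICAL
 * corresponds to the "stop for now" break in the caller; EX_CONTINUE means
 * the thread should move on to LRU kills. All names here are invented for
 * the example.
 */
#include <stdbool.h>
#include <stdio.h>

enum ex_hwm_result {
        EX_CONTINUE,          /* no HWM relief; consider other kills */
        EX_DONE_KILLED,       /* killed an over-limit process; re-evaluate */
        EX_DONE_NOT_CRITICAL, /* no kill needed; pressure is not critical */
};

static enum ex_hwm_result
ex_hiwat_pass(bool killed_hwm_proc, bool thrashing, bool zone_exhausted,
    unsigned long avail_pages, unsigned long critical_pages)
{
        if (killed_hwm_proc) {
                return EX_DONE_KILLED;
        }
        if (!thrashing && !zone_exhausted && avail_pages > critical_pages) {
                return EX_DONE_NOT_CRITICAL;
        }
        return EX_CONTINUE;
}

int
main(void)
{
        /* Above critical, no thrashing, nothing over HWM: stop for now. */
        printf("%d\n", ex_hiwat_pass(false, false, false, 5000, 2000)); /* 2 */
        /* Below critical and nothing over HWM: fall through to LRU kills. */
        printf("%d\n", ex_hiwat_pass(false, false, false, 1000, 2000)); /* 0 */
        return 0;
}
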
 
 /*
- * Jetsam aggressively
+ * kJetsamHighRelaunchCandidatesThreshold defines the percentage of candidates
+ * in the idle & deferred bands that need to be bad candidates in order to trigger
+ * aggressive jetsam.
  */
+#define kJetsamHighRelaunchCandidatesThreshold  (100)
+
+/* kJetsamMinCandidatesThreshold defines the minimum number of candidates in the
+ * idle/deferred bands to trigger aggressive jetsam. This value basically decides
+ * how much memory the system is ready to hold in the lower bands without triggering
+ * aggressive jetsam. This number should ideally be tuned based on the memory config
+ * of the device.
+ */
+#define kJetsamMinCandidatesThreshold           (5)
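
/*
 * Worked example (not part of the change) of how the two thresholds above
 * combine. With 20 candidates in the idle + deferred bands, all 20 must be
 * marked high-relaunch-probability to reach the 100% trigger; 19/20 = 95%
 * does not. And with fewer than 5 candidates total, aggressive jetsam is
 * never triggered regardless of the percentage. The ex_* names mirror, but
 * are not, the kernel symbols.
 */
#include <stdbool.h>
#include <stdio.h>

#define EX_HIGH_RELAUNCH_THRESHOLD 100 /* percent, mirrors kJetsamHighRelaunchCandidatesThreshold */
#define EX_MIN_CANDIDATES          5   /* mirrors kJetsamMinCandidatesThreshold */

static bool
ex_aggressive_needed(int total_candidates, int bad_candidates)
{
        if (total_candidates < EX_MIN_CANDIDATES) {
                return false; /* too few processes for the ratio to mean much */
        }
        return ((bad_candidates * 100) / total_candidates) >= EX_HIGH_RELAUNCH_THRESHOLD;
}

int
main(void)
{
        printf("%d\n", ex_aggressive_needed(20, 20)); /* 1: 100% bad */
        printf("%d\n", ex_aggressive_needed(20, 19)); /* 0: 95% bad */
        printf("%d\n", ex_aggressive_needed(4, 4));   /* 0: below min candidate count */
        return 0;
}
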
+
 static boolean_t
-memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count,
-    int32_t priority_max, uint32_t *errors)
+memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int jld_eval_aggressive_count, __unused int *jld_idle_kills, __unused int jld_idle_kill_candidates, int *total_candidates, int *elevated_bucket_count)
 {
-       pid_t aPid;
-       proc_t p = PROC_NULL, next_p = PROC_NULL;
-       boolean_t new_snapshot = FALSE, killed = FALSE;
-       int kill_count = 0;
-       unsigned int i = 0;
-       int32_t aPid_ep = 0;
-       unsigned int memorystatus_level_snapshot = 0;
-       uint64_t killtime = 0;
-       clock_sec_t     tv_sec;
-       clock_usec_t    tv_usec;
-       uint32_t        tv_msec;
-       os_reason_t jetsam_reason = OS_REASON_NULL;
+       boolean_t aggressive_jetsam_needed = false;
 
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
-           memorystatus_available_pages, priority_max, 0, 0, 0);
+       /*
+        * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, we maintain the jetsam
+        * relaunch behavior for all daemons. Also, daemons and apps are aged in deferred bands on
+        * every dirty->clean transition. For this aging policy, the best way to determine if
+        * aggressive jetsam is needed, is to see if the kill candidates are mostly bad candidates.
+        * If yes, then we need to go to higher bands to reclaim memory.
+        */
+       proc_list_lock();
+       /* Get total candidate counts for idle and idle deferred bands */
+       *total_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].count + memstat_bucket[system_procs_aging_band].count;
+       /* Get counts of bad kill candidates in idle and idle deferred bands */
+       int bad_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].relaunch_high_count + memstat_bucket[system_procs_aging_band].relaunch_high_count;
 
-       memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
+       *elevated_bucket_count = memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE].count;
 
-       jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause);
-       if (jetsam_reason == OS_REASON_NULL) {
-               printf("memorystatus_kill_top_process_aggressive: failed to allocate exit reason\n");
-       }
+       proc_list_unlock();
 
-       proc_list_lock();
+       /* Check if the number of bad candidates is at least kJetsamHighRelaunchCandidatesThreshold %, guarding the empty-band case to avoid a divide-by-zero */
+       aggressive_jetsam_needed = (*total_candidates > 0) &&
+           (((bad_candidates * 100) / *total_candidates) >= kJetsamHighRelaunchCandidatesThreshold);
+
+       /*
+        * Since the new aging policy bases the aggressive jetsam trigger on percentage of
+        * bad candidates, it is prone to being overly aggressive. In order to mitigate that,
+        * make sure the system is really under memory pressure before triggering aggressive
+        * jetsam.
+        */
+       if (memorystatus_available_pages > memorystatus_sysproc_aging_aggr_pages) {
+               aggressive_jetsam_needed = false;
+       }
 
-       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
-       while (next_p) {
 #if DEVELOPMENT || DEBUG
-               int activeProcess;
-               int procSuspendedForDiagnosis;
+       printf("memorystatus: aggressive%d: [%s] Bad Candidate Threshold Check (total: %d, bad: %d, threshold: %d %%); Memory Pressure Check (available_pgs: %llu, threshold_pgs: %llu)\n",
+           jld_eval_aggressive_count, aggressive_jetsam_needed ? "PASSED" : "FAILED", *total_candidates, bad_candidates,
+           kJetsamHighRelaunchCandidatesThreshold, (uint64_t)memorystatus_available_pages, (uint64_t)memorystatus_sysproc_aging_aggr_pages);
 #endif /* DEVELOPMENT || DEBUG */
+       return aggressive_jetsam_needed;
+}
 
-               if (((next_p->p_listflag & P_LIST_EXITED) != 0) ||
-                   ((unsigned int)(next_p->p_memstat_effectivepriority) != i)) {
-                       /*
-                        * We have raced with next_p running on another core.
-                        * It may be exiting or it may have moved to a different
-                        * jetsam priority band.  This means we have lost our
-                        * place in line while traversing the jetsam list.  We
-                        * attempt to recover by rewinding to the beginning of the band
-                        * we were already traversing.  By doing this, we do not guarantee
-                        * that no process escapes this aggressive march, but we can make
-                        * skipping an entire range of processes less likely. (PR-21069019)
-                        */
-
-                       MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n",
-                           aggr_count, i, (*next_p->p_name ? next_p->p_name : "unknown"), next_p->p_pid);
+static boolean_t
+memorystatus_aggressive_jetsam_needed_default(__unused int jld_eval_aggressive_count, int *jld_idle_kills, int jld_idle_kill_candidates, int *total_candidates, int *elevated_bucket_count)
+{
+       boolean_t aggressive_jetsam_needed = false;
+       /* Jetsam Loop Detection - locals */
+       memstat_bucket_t *bucket;
+       int             jld_bucket_count = 0;
 
-                       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
-                       continue;
-               }
+       proc_list_lock();
+       switch (jetsam_aging_policy) {
+       case kJetsamAgingPolicyLegacy:
+               bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
+               jld_bucket_count = bucket->count;
+               bucket = &memstat_bucket[JETSAM_PRIORITY_AGING_BAND1];
+               jld_bucket_count += bucket->count;
+               break;
+       case kJetsamAgingPolicyAppsReclaimedFirst:
+               bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
+               jld_bucket_count = bucket->count;
+               bucket = &memstat_bucket[system_procs_aging_band];
+               jld_bucket_count += bucket->count;
+               bucket = &memstat_bucket[applications_aging_band];
+               jld_bucket_count += bucket->count;
+               break;
+       case kJetsamAgingPolicyNone:
+       default:
+               bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
+               jld_bucket_count = bucket->count;
+               break;
+       }
 
-               p = next_p;
-               next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
+       bucket = &memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE];
+       *elevated_bucket_count = bucket->count;
+       *total_candidates = jld_bucket_count;
+       proc_list_unlock();
 
-               if (p->p_memstat_effectivepriority > priority_max) {
-                       /*
-                        * Bail out of this killing spree if we have
-                        * reached beyond the priority_max jetsam band.
-                        * That is, we kill up to and through the
-                        * priority_max jetsam band.
-                        */
-                       proc_list_unlock();
-                       goto exit;
-               }
+       aggressive_jetsam_needed = (*jld_idle_kills > jld_idle_kill_candidates);
 
 #if DEVELOPMENT || DEBUG
-               activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
-               procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
+       if (aggressive_jetsam_needed) {
+               printf("memorystatus: aggressive%d: idle candidates: %d, idle kills: %d\n",
+                   jld_eval_aggressive_count,
+                   jld_idle_kill_candidates,
+                   *jld_idle_kills);
+       }
 #endif /* DEVELOPMENT || DEBUG */
+       return aggressive_jetsam_needed;
+}
 
-               aPid = p->p_pid;
-               aPid_ep = p->p_memstat_effectivepriority;
-
-               if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
-                       continue;
-               }
-
-#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
-               if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
-                       printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
-                       continue;
-               }
-#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
-
-               /*
-                * Capture a snapshot if none exists.
-                */
-               if (memorystatus_jetsam_snapshot_count == 0) {
-                       memorystatus_init_jetsam_snapshot_locked(NULL, 0);
-                       new_snapshot = TRUE;
-               }
+static boolean_t
+memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_idle_kills, boolean_t *corpse_list_purged, boolean_t *post_snapshot, uint64_t *memory_reclaimed)
+{
+       boolean_t aggressive_jetsam_needed = false;
+       boolean_t killed;
+       uint32_t errors = 0;
+       uint64_t footprint_of_killed_proc = 0;
+       int elevated_bucket_count = 0;
+       int total_candidates = 0;
+       *memory_reclaimed = 0;
 
-               /*
-                * Mark as terminated so that if exit1() indicates success, but the process (for example)
-                * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
-                * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
-                * acquisition of the proc lock.
-                */
-               p->p_memstat_state |= P_MEMSTAT_TERMINATED;
+       /*
+        * The aggressive jetsam logic looks at the number of times it has been in the
+        * aggressive loop to determine the max priority band it should kill up to. The
+        * static variables below are used to track that property.
+        *
+        * To reset those values, the implementation checks whether
+        * memorystatus_jld_eval_period_msecs have elapsed since the parameters were last reset.
+        */
+       static int       jld_eval_aggressive_count = 0;
+       static int32_t   jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
+       static uint64_t  jld_timestamp_msecs = 0;
+       static int       jld_idle_kill_candidates = 0;
 
-               killtime = mach_absolute_time();
-               absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
-               tv_msec = tv_usec / 1000;
+       if (memorystatus_jld_enabled == FALSE) {
+               /* If aggressive jetsam is disabled, nothing to do here */
+               return FALSE;
+       }
 
-               /* Shift queue, update stats */
-               memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
+       /* Get current timestamp (msecs only) */
+       struct timeval  jld_now_tstamp = {0, 0};
+       uint64_t        jld_now_msecs = 0;
+       microuptime(&jld_now_tstamp);
+       jld_now_msecs = (jld_now_tstamp.tv_sec * 1000);
 
+       /*
+        * The aggressive jetsam logic looks at the number of candidates and their
+        * properties to decide if aggressive jetsam should be engaged.
+        */
+       if (jetsam_aging_policy == kJetsamAgingPolicySysProcsReclaimedFirst) {
                /*
-                * In order to kill the target process, we will drop the proc_list_lock.
-                * To guarantee that p and next_p don't disappear out from under the lock,
-                * we must take a ref on both.
-                * If we cannot get a reference, then it's likely we've raced with
-                * that process exiting on another core.
-                */
-               if (proc_ref_locked(p) == p) {
-                       if (next_p) {
-                               while (next_p && (proc_ref_locked(next_p) != next_p)) {
-                                       proc_t temp_p;
-
-                                       /*
-                                        * We must have raced with next_p exiting on another core.
-                                        * Recover by getting the next eligible process in the band.
-                                        */
-
-                                       MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
-                                           aggr_count, next_p->p_pid, (*next_p->p_name ? next_p->p_name : "(unknown)"));
-
-                                       temp_p = next_p;
-                                       next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE);
-                               }
-                       }
-                       proc_list_unlock();
-
-                       printf("%lu.%03d memorystatus: %s%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
-                           (unsigned long)tv_sec, tv_msec,
-                           ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"),
-                           aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"),
-                           memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages);
-
-                       memorystatus_level_snapshot = memorystatus_level;
+                * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, the logic looks at the number of
+                * candidates in the idle and deferred band and how many out of them are marked as high relaunch
+                * probability.
+                */
+               aggressive_jetsam_needed = memorystatus_aggressive_jetsam_needed_sysproc_aging(jld_eval_aggressive_count,
+                   jld_idle_kills, jld_idle_kill_candidates, &total_candidates, &elevated_bucket_count);
+       } else {
+               /*
+                * The other aging policies look at number of candidate processes over a specific time window and
+                * evaluate if the system is in a jetsam loop. If yes, aggressive jetsam is triggered.
+                */
+               aggressive_jetsam_needed = memorystatus_aggressive_jetsam_needed_default(jld_eval_aggressive_count,
+                   jld_idle_kills, jld_idle_kill_candidates, &total_candidates, &elevated_bucket_count);
+       }
 
-                       /*
-                        * memorystatus_do_kill() drops a reference, so take another one so we can
-                        * continue to use this exit reason even after memorystatus_do_kill()
-                        * returns.
-                        */
-                       os_reason_ref(jetsam_reason);
-                       killed = memorystatus_do_kill(p, cause, jetsam_reason);
+       /*
+        * Check if it has been long enough since the aggressive jetsam evaluation
+        * parameters were last refreshed. This logic also resets the jld_eval_aggressive_count
+        * counter to make sure we reset the aggressive jetsam severity.
+        */
+       boolean_t param_reval = false;
 
-                       /* Success? */
-                       if (killed) {
-                               proc_rele(p);
-                               kill_count++;
-                               p = NULL;
-                               killed = FALSE;
+       if ((total_candidates == 0) ||
+           (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) {
+               jld_timestamp_msecs      = jld_now_msecs;
+               jld_idle_kill_candidates = total_candidates;
+               *jld_idle_kills          = 0;
+               jld_eval_aggressive_count = 0;
+               jld_priority_band_max   = JETSAM_PRIORITY_UI_SUPPORT;
+               param_reval = true;
+       }
 
-                               /*
-                                * Continue the killing spree.
-                                */
-                               proc_list_lock();
-                               if (next_p) {
-                                       proc_rele_locked(next_p);
-                               }
+       /*
+        * If the parameters have been updated, re-evaluate the aggressive_jetsam_needed condition for
+        * the non-kJetsamAgingPolicySysProcsReclaimedFirst policies, since it's based on jld_idle_kill_candidates etc.
+        */
+       if ((param_reval == true) && (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst)) {
+               aggressive_jetsam_needed = (*jld_idle_kills > jld_idle_kill_candidates);
+       }
 
-                               if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) {
-                                       if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) {
+       /*
+        * It is also possible that the system is down to a very small number of processes in the candidate
+        * bands. In that case, the decisions made by the memorystatus_aggressive_jetsam_needed_* routines
+        * would not be useful; do not trigger aggressive jetsam.
+        */
+       if (total_candidates < kJetsamMinCandidatesThreshold) {
 #if DEVELOPMENT || DEBUG
-                                               printf("Disabling Lenient mode after one-time deployment.\n");
+               printf("memorystatus: aggressive: [FAILED] Low Candidate Count (current: %d, threshold: %d)\n", total_candidates, kJetsamMinCandidatesThreshold);
 #endif /* DEVELOPMENT || DEBUG */
-                                               memorystatus_aggressive_jetsam_lenient = FALSE;
-                                               break;
-                                       }
-                               }
+               aggressive_jetsam_needed = false;
+       }
 
-                               continue;
-                       }
+       if (aggressive_jetsam_needed == false) {
+               /* Either the aging policy or the candidate count decided that aggressive jetsam is not needed. Nothing more to do here. */
+               return FALSE;
+       }
 
-                       /*
-                        * Failure - first unwind the state,
-                        * then fall through to restart the search.
-                        */
-                       proc_list_lock();
-                       proc_rele_locked(p);
-                       if (next_p) {
-                               proc_rele_locked(next_p);
-                       }
-                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
-                       p->p_memstat_state |= P_MEMSTAT_ERROR;
-                       *errors += 1;
-                       p = NULL;
-               }
+       /* Looks like aggressive jetsam is needed */
+       jld_eval_aggressive_count++;
+
+       if (jld_eval_aggressive_count == memorystatus_jld_eval_aggressive_count) {
+               memorystatus_issue_fg_band_notify();
 
                /*
-                * Failure - restart the search at the beginning of
-                * the band we were already traversing.
-                *
-                * We might have raced with "p" exiting on another core, resulting in no
-                * ref on "p".  Or, we may have failed to kill "p".
-                *
-                * Either way, we fall thru to here, leaving the proc in the
-                * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
-                *
-                * And, we hold the proc_list_lock at this point.
+                * If we reach this aggressive cycle, corpses might be causing memory pressure.
+                * So, in an effort to avoid jetsams in the FG band, we will attempt to purge
+                * corpse memory prior to this final march through JETSAM_PRIORITY_UI_SUPPORT.
                 */
-
-               next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+               if (total_corpses_count() > 0 && !*corpse_list_purged) {
+                       task_purge_all_corpses();
+                       *corpse_list_purged = TRUE;
+               }
+       } else if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) {
+               /*
+                * Bump up the jetsam priority limit (e.g. the bucket index)
+                * and enforce bucket index sanity.
+                */
+               if ((memorystatus_jld_eval_aggressive_priority_band_max < 0) ||
+                   (memorystatus_jld_eval_aggressive_priority_band_max >= MEMSTAT_BUCKET_COUNT)) {
+                       /*
+                        * Do nothing.  Stick with the default level.
+                        */
+               } else {
+                       jld_priority_band_max = memorystatus_jld_eval_aggressive_priority_band_max;
+               }
        }
 
-       proc_list_unlock();
+       /* Visit elevated processes first */
+       while (elevated_bucket_count) {
+               elevated_bucket_count--;
 
-exit:
-       os_reason_free(jetsam_reason);
+               /*
+                * memorystatus_kill_elevated_process() drops a reference,
+                * so take another one so we can continue to use this exit reason
+                * even after it returns.
+                */
 
-       /* Clear snapshot if freshly captured and no target was found */
-       if (new_snapshot && (kill_count == 0)) {
-               proc_list_lock();
-               memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
-               proc_list_unlock();
+               os_reason_ref(jetsam_reason);
+               killed = memorystatus_kill_elevated_process(
+                       cause,
+                       jetsam_reason,
+                       JETSAM_PRIORITY_ELEVATED_INACTIVE,
+                       jld_eval_aggressive_count,
+                       &errors, &footprint_of_killed_proc);
+               if (killed) {
+                       *post_snapshot = TRUE;
+                       *memory_reclaimed += footprint_of_killed_proc;
+                       if (memorystatus_avail_pages_below_pressure()) {
+                               /*
+                                * Still under pressure.
+                                * Find another pinned process.
+                                */
+                               continue;
+                       } else {
+                               return TRUE;
+                       }
+               } else {
+                       /*
+                        * No pinned processes left to kill.
+                        * Abandon elevated band.
+                        */
+                       break;
+               }
        }
 
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
-           memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
+       /*
+        * memorystatus_kill_processes_aggressive() allocates its own
+        * jetsam_reason so the kMemorystatusKilledProcThrashing cause
+        * is consistent throughout the aggressive march.
+        */
+       killed = memorystatus_kill_processes_aggressive(
+               kMemorystatusKilledProcThrashing,
+               jld_eval_aggressive_count,
+               jld_priority_band_max,
+               &errors, &footprint_of_killed_proc);
 
-       if (kill_count > 0) {
+       if (killed) {
+               /* Always generate logs after aggressive kill */
+               *post_snapshot = TRUE;
+               *memory_reclaimed += footprint_of_killed_proc;
+               *jld_idle_kills = 0;
                return TRUE;
-       } else {
-               return FALSE;
        }
+
+       return FALSE;
 }
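
/*
 * Illustrative sketch (not part of the change): the evaluation-window reset
 * performed above with static variables, as a standalone helper. Counters
 * accumulate within one window of eval_period_msecs; once the window ages
 * out (or there are no candidates), the baseline and counters are rebuilt.
 * All names are invented for the example.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ex_jld_state {
        uint64_t window_start_msecs;  /* mirrors jld_timestamp_msecs */
        int      kill_candidates;     /* mirrors jld_idle_kill_candidates */
        int      idle_kills;          /* mirrors *jld_idle_kills */
        int      aggressive_count;    /* mirrors jld_eval_aggressive_count */
};

static bool
ex_jld_refresh(struct ex_jld_state *s, uint64_t now_msecs,
    uint64_t eval_period_msecs, int total_candidates)
{
        if (total_candidates == 0 ||
            now_msecs > s->window_start_msecs + eval_period_msecs) {
                s->window_start_msecs = now_msecs;
                s->kill_candidates = total_candidates;
                s->idle_kills = 0;
                s->aggressive_count = 0;
                return true; /* parameters were re-baselined */
        }
        return false;
}

int
main(void)
{
        struct ex_jld_state s = { 0, 0, 0, 0 };
        ex_jld_refresh(&s, 7000, 6000, 12);      /* window aged out: baseline 12 candidates */
        s.idle_kills = 14;                       /* kills outpace candidates -> loop suspected */
        printf("loop: %d\n", s.idle_kills > s.kill_candidates); /* 1 */
        ex_jld_refresh(&s, 14000, 6000, 10);     /* window aged out again: counters reset */
        printf("kills after reset: %d\n", s.idle_kills);        /* 0 */
        return 0;
}
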
 
-static boolean_t
-memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged)
+
+static void
+memorystatus_thread(void *param __unused, wait_result_t wr __unused)
 {
-       pid_t aPid = 0;
-       proc_t p = PROC_NULL, next_p = PROC_NULL;
-       boolean_t new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE;
-       unsigned int i = 0;
-       uint32_t aPid_ep;
-       os_reason_t jetsam_reason = OS_REASON_NULL;
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
-           memorystatus_available_pages, 0, 0, 0, 0);
+       boolean_t post_snapshot = FALSE;
+       uint32_t errors = 0;
+       uint32_t hwm_kill = 0;
+       boolean_t sort_flag = TRUE;
+       boolean_t corpse_list_purged = FALSE;
+       int     jld_idle_kills = 0;
+       struct jetsam_thread_state *jetsam_thread = jetsam_current_thread();
+       uint64_t total_memory_reclaimed = 0;
 
-       jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
-       if (jetsam_reason == OS_REASON_NULL) {
-               printf("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n");
+       assert(jetsam_thread != NULL);
+       if (jetsam_thread->inited == FALSE) {
+               /*
+                * It's the first time the thread has run, so just mark the thread as privileged and block.
+                * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
+                */
+
+               char name[32];
+               thread_wire(host_priv_self(), current_thread(), TRUE);
+               snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1);
+
+               /* Limit all but one thread to the lower jetsam bands, as that's where most of the victims are. */
+               if (jetsam_thread->index == 0) {
+                       if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
+                               thread_vm_bind_group_add();
+                       }
+                       jetsam_thread->limit_to_low_bands = FALSE;
+               } else {
+                       jetsam_thread->limit_to_low_bands = TRUE;
+               }
+               thread_set_thread_name(current_thread(), name);
+               jetsam_thread->inited = TRUE;
+               memorystatus_thread_block(0, memorystatus_thread);
        }
 
-       proc_list_lock();
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
+           memorystatus_available_pages, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, memorystatus_jld_eval_aggressive_count, 0);
 
-       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
-       while (next_p) {
-               uint64_t footprint_in_bytes = 0;
-               uint64_t memlimit_in_bytes  = 0;
-               boolean_t skip = 0;
+       /*
+        * Jetsam aware version.
+        *
+        * The VM pressure notification thread is working its way through clients in parallel.
+        *
+        * So, while the pressure notification thread is targeting processes in order of
+        * increasing jetsam priority, we can hopefully reduce or stop its work by killing
+        * any processes that have exceeded their highwater mark.
+        *
+        * If we run out of HWM processes and our available pages drops below the critical threshold, then,
+        * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
+        */
+       while (memorystatus_action_needed()) {
+               boolean_t killed;
+               int32_t priority;
+               uint32_t cause;
+               uint64_t memory_reclaimed = 0;
+               uint64_t jetsam_reason_code = JETSAM_REASON_INVALID;
+               os_reason_t jetsam_reason = OS_REASON_NULL;
 
-               p = next_p;
-               next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
+               cause = kill_under_pressure_cause;
+               switch (cause) {
+               case kMemorystatusKilledFCThrashing:
+                       jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING;
+                       break;
+               case kMemorystatusKilledVMCompressorThrashing:
+                       jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING;
+                       break;
+               case kMemorystatusKilledVMCompressorSpaceShortage:
+                       jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE;
+                       break;
+               case kMemorystatusKilledZoneMapExhaustion:
+                       jetsam_reason_code = JETSAM_REASON_ZONE_MAP_EXHAUSTION;
+                       break;
+               case kMemorystatusKilledVMPageShortage:
+               /* falls through */
+               default:
+                       jetsam_reason_code = JETSAM_REASON_MEMORY_VMPAGESHORTAGE;
+                       cause = kMemorystatusKilledVMPageShortage;
+                       break;
+               }
 
-               aPid = p->p_pid;
-               aPid_ep = p->p_memstat_effectivepriority;
+               /* Highwater */
+               boolean_t is_critical = TRUE;
+               if (memorystatus_act_on_hiwat_processes(&errors, &hwm_kill, &post_snapshot, &is_critical, &memory_reclaimed)) {
+                       total_memory_reclaimed += memory_reclaimed;
+                       if (is_critical == FALSE) {
+                               /*
+                                * For now, don't kill any other processes.
+                                */
+                               break;
+                       } else {
+                               goto done;
+                       }
+               }
 
-               if (p->p_memstat_state  & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
-                       continue;
+               jetsam_reason = os_reason_create(OS_REASON_JETSAM, jetsam_reason_code);
+               if (jetsam_reason == OS_REASON_NULL) {
+                       printf("memorystatus_thread: failed to allocate jetsam reason\n");
                }
 
-               /* skip if no limit set */
-               if (p->p_memstat_memlimit <= 0) {
-                       continue;
+               /* Only unlimited jetsam threads should act aggressive */
+               if (!jetsam_thread->limit_to_low_bands &&
+                   memorystatus_act_aggressive(cause, jetsam_reason, &jld_idle_kills, &corpse_list_purged, &post_snapshot, &memory_reclaimed)) {
+                       total_memory_reclaimed += memory_reclaimed;
+                       goto done;
                }
 
-               footprint_in_bytes = get_task_phys_footprint(p->task);
-               memlimit_in_bytes  = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL);   /* convert MB to bytes */
-               skip = (footprint_in_bytes <= memlimit_in_bytes);
+               /*
+                * memorystatus_kill_top_process() drops a reference,
+                * so take another one so we can continue to use this exit reason
+                * even after it returns
+                */
+               os_reason_ref(jetsam_reason);
 
-#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
-               if (!skip && (memorystatus_jetsam_policy & kPolicyDiagnoseActive)) {
-                       if (p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED) {
-                               continue;
+               /* LRU */
+               killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, jetsam_reason, &priority, &errors, &memory_reclaimed);
+               sort_flag = FALSE;
+
+               if (killed) {
+                       total_memory_reclaimed += memory_reclaimed;
+                       if (memorystatus_post_snapshot(priority, cause) == TRUE) {
+                               post_snapshot = TRUE;
+                       }
+
+                       /* Jetsam Loop Detection */
+                       if (memorystatus_jld_enabled == TRUE) {
+                               if ((priority == JETSAM_PRIORITY_IDLE) || (priority == system_procs_aging_band) || (priority == applications_aging_band)) {
+                                       jld_idle_kills++;
+                               } else {
+                                       /*
+                                        * We've reached into bands beyond idle deferred.
+                                        * We make no attempt to monitor them.
+                                        */
+                               }
                        }
-               }
-#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
 
-#if CONFIG_FREEZE
-               if (!skip) {
-                       if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
-                               skip = TRUE;
-                       } else {
-                               skip = FALSE;
+                       /*
+                        * If we have jetsammed a process in or above JETSAM_PRIORITY_UI_SUPPORT
+                        * then we attempt to relieve pressure by purging corpse memory and notifying
+                        * anybody wanting to know this.
+                        */
+                       if (priority >= JETSAM_PRIORITY_UI_SUPPORT) {
+                               memorystatus_issue_fg_band_notify();
+                               if (total_corpses_count() > 0 && !corpse_list_purged) {
+                                       task_purge_all_corpses();
+                                       corpse_list_purged = TRUE;
+                               }
                        }
+                       goto done;
                }
-#endif
 
-               if (skip) {
-                       continue;
-               } else {
-                       if (memorystatus_jetsam_snapshot_count == 0) {
-                               memorystatus_init_jetsam_snapshot_locked(NULL, 0);
-                               new_snapshot = TRUE;
+               if (memorystatus_avail_pages_below_critical()) {
+                       /*
+                        * Still under pressure and unable to kill a process - purge corpse memory
+                        */
+                       if (total_corpses_count() > 0) {
+                               task_purge_all_corpses();
+                               corpse_list_purged = TRUE;
                        }
 
-                       if (proc_ref_locked(p) == p) {
-                               /*
-                                * Mark as terminated so that if exit1() indicates success, but the process (for example)
-                                * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
-                                * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
-                                * acquisition of the proc lock.
-                                */
-                               p->p_memstat_state |= P_MEMSTAT_TERMINATED;
-
-                               proc_list_unlock();
-                       } else {
+                       if (!jetsam_thread->limit_to_low_bands && memorystatus_avail_pages_below_critical()) {
                                /*
-                                * We need to restart the search again because
-                                * proc_ref_locked _can_ drop the proc_list lock
-                                * and we could have lost our stored next_p via
-                                * an exit() on another core.
+                                * Still under pressure and unable to kill a process - panic
                                 */
-                               i = 0;
-                               next_p = memorystatus_get_first_proc_locked(&i, TRUE);
-                               continue;
+                               panic("memorystatus_jetsam_thread: no victim! available pages:%llu\n", (uint64_t)memorystatus_available_pages);
                        }
+               }
 
-                       freed_mem = memorystatus_kill_proc(p, kMemorystatusKilledHiwat, jetsam_reason, &killed); /* purged and/or killed 'p' */
-
-                       /* Success? */
-                       if (freed_mem) {
-                               if (killed == FALSE) {
-                                       /* purged 'p'..don't reset HWM candidate count */
-                                       *purged = TRUE;
-
-                                       proc_list_lock();
-                                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
-                                       proc_list_unlock();
-                               }
-                               proc_rele(p);
-                               goto exit;
-                       }
-                       /*
-                        * Failure - first unwind the state,
-                        * then fall through to restart the search.
-                        */
-                       proc_list_lock();
-                       proc_rele_locked(p);
-                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
-                       p->p_memstat_state |= P_MEMSTAT_ERROR;
-                       *errors += 1;
+done:
 
-                       i = 0;
-                       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+               /*
+                * We do not want to over-kill when thrashing has been detected.
+                * To avoid that, we reset the flag here and notify the
+                * compressor.
+                */
+               if (is_reason_thrashing(kill_under_pressure_cause)) {
+                       kill_under_pressure_cause = 0;
+#if CONFIG_JETSAM
+                       vm_thrashing_jetsam_done();
+#endif /* CONFIG_JETSAM */
+               } else if (is_reason_zone_map_exhaustion(kill_under_pressure_cause)) {
+                       kill_under_pressure_cause = 0;
                }
+
+               os_reason_free(jetsam_reason);
        }
 
-       proc_list_unlock();
+       kill_under_pressure_cause = 0;
 
-exit:
-       os_reason_free(jetsam_reason);
+       if (errors) {
+               memorystatus_clear_errors();
+       }
 
-       /* Clear snapshot if freshly captured and no target was found */
-       if (new_snapshot && !killed) {
+       if (post_snapshot) {
                proc_list_lock();
-               memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
-               proc_list_unlock();
+               size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
+                   sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
+               uint64_t timestamp_now = mach_absolute_time();
+               memorystatus_jetsam_snapshot->notification_time = timestamp_now;
+               memorystatus_jetsam_snapshot->js_gencount++;
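+               /*
+                * Rate-limit the snapshot note: it is re-posted only once
+                * memorystatus_jetsam_snapshot_timeout has elapsed since the
+                * last note that was successfully delivered.
+                */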
+               if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
+                   timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
+                       proc_list_unlock();
+                       int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
+                       if (!ret) {
+                               proc_list_lock();
+                               memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
+                               proc_list_unlock();
+                       }
+               } else {
+                       proc_list_unlock();
+               }
        }
 
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
-           memorystatus_available_pages, killed ? aPid : 0, 0, 0, 0);
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
+           memorystatus_available_pages, total_memory_reclaimed, 0, 0, 0);
 
-       return killed;
+       memorystatus_thread_block(0, memorystatus_thread);
 }
 
 /*
- * Jetsam a process pinned in the elevated band.
- *
- * Return:  true -- at least one pinned process was jetsammed
- *         false -- no pinned process was jetsammed
+ * Returns TRUE:
+ *      when an idle-exitable proc was killed
+ * Returns FALSE:
+ *      when there are no more idle-exitable procs found
+ *      when the attempt to kill an idle-exitable proc failed
  */
-static boolean_t
-memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors)
+boolean_t
+memorystatus_idle_exit_from_VM(void)
 {
-       pid_t aPid = 0;
-       proc_t p = PROC_NULL, next_p = PROC_NULL;
-       boolean_t new_snapshot = FALSE, killed = FALSE;
-       int kill_count = 0;
-       uint32_t aPid_ep;
-       uint64_t killtime = 0;
-       clock_sec_t     tv_sec;
-       clock_usec_t    tv_usec;
-       uint32_t        tv_msec;
-
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
-           memorystatus_available_pages, 0, 0, 0, 0);
-
-#if CONFIG_FREEZE
-       boolean_t consider_frozen_only = FALSE;
-
-       if (band == (unsigned int) memorystatus_freeze_jetsam_band) {
-               consider_frozen_only = TRUE;
-       }
-#endif /* CONFIG_FREEZE */
+       /*
+        * This routine should no longer be needed since we are
+        * now using jetsam bands on all platforms and so will deal
+        * with IDLE processes within the memorystatus thread itself.
+        *
+        * But we still use it because we observed that macOS systems
+        * started heavy compression/swapping with a bunch of
+        * idle-exitable processes alive and doing nothing. We decided
+        * to kill those processes rather than start swapping earlier.
+        */
 
-       proc_list_lock();
+       return kill_idle_exit_proc();
+}
 
-       next_p = memorystatus_get_first_proc_locked(&band, FALSE);
-       while (next_p) {
-               p = next_p;
-               next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
+/*
+ * Callback invoked when the allowable physical memory footprint is exceeded
+ * (dirty pages + IOKit mappings).
+ *
+ * This is invoked both for advisory, non-fatal per-task high watermarks
+ * and for fatal task memory limits.
+ */
+void
+memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
+{
+       os_reason_t jetsam_reason = OS_REASON_NULL;
 
-               aPid = p->p_pid;
-               aPid_ep = p->p_memstat_effectivepriority;
+       proc_t p = current_proc();
 
+#if VM_PRESSURE_EVENTS
+       if (warning == TRUE) {
                /*
-                * Only pick a process pinned in this elevated band
-                */
-               if (!(p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
-                       continue;
-               }
-
-               if (p->p_memstat_state  & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
-                       continue;
-               }
-
-#if CONFIG_FREEZE
-               if (consider_frozen_only && !(p->p_memstat_state & P_MEMSTAT_FROZEN)) {
-                       continue;
-               }
-
-               if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
-                       continue;
-               }
-#endif /* CONFIG_FREEZE */
-
-#if DEVELOPMENT || DEBUG
-               MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
-                   aggr_count,
-                   aPid, (*p->p_name ? p->p_name : "unknown"),
-                   memorystatus_available_pages);
-#endif /* DEVELOPMENT || DEBUG */
-
-               if (memorystatus_jetsam_snapshot_count == 0) {
-                       memorystatus_init_jetsam_snapshot_locked(NULL, 0);
-                       new_snapshot = TRUE;
-               }
-
-               p->p_memstat_state |= P_MEMSTAT_TERMINATED;
-
-               killtime = mach_absolute_time();
-               absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
-               tv_msec = tv_usec / 1000;
-
-               memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
-
-               if (proc_ref_locked(p) == p) {
-                       proc_list_unlock();
-
-                       os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
-                           (unsigned long)tv_sec, tv_msec,
-                           aggr_count,
-                           aPid, (*p->p_name ? p->p_name : "unknown"),
-                           memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages);
-
-                       /*
-                        * memorystatus_do_kill drops a reference, so take another one so we can
-                        * continue to use this exit reason even after memorystatus_do_kill()
-                        * returns
-                        */
-                       os_reason_ref(jetsam_reason);
-                       killed = memorystatus_do_kill(p, cause, jetsam_reason);
-
-                       /* Success? */
-                       if (killed) {
-                               proc_rele(p);
-                               kill_count++;
-                               goto exit;
-                       }
-
-                       /*
-                        * Failure - first unwind the state,
-                        * then fall through to restart the search.
-                        */
-                       proc_list_lock();
-                       proc_rele_locked(p);
-                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
-                       p->p_memstat_state |= P_MEMSTAT_ERROR;
-                       *errors += 1;
+                * This is a warning path which implies that the current process is close to,
+                * but has not yet exceeded, its per-process memory limit.
+                */
+               if (memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) {
+                       /* Print a warning, since it's possible that the task has not registered for pressure notifications */
+                       os_log(OS_LOG_DEFAULT, "memorystatus_on_ledger_footprint_exceeded: failed to warn the current task (%d exiting, or no handler registered?).\n", p->p_pid);
                }
+               return;
+       }
+#endif /* VM_PRESSURE_EVENTS */
 
+       if (memlimit_is_fatal) {
                /*
-                * Failure - restart the search.
-                *
-                * We might have raced with "p" exiting on another core, resulting in no
-                * ref on "p".  Or, we may have failed to kill "p".
-                *
-                * Either way, we fall thru to here, leaving the proc in the
-                * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state.
-                *
-                * And, we hold the the proc_list_lock at this point.
+                * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
+                * has violated either the system-wide per-task memory limit OR its own task limit.
                 */
+               jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
+               if (jetsam_reason == NULL) {
+                       printf("task_exceeded_footprint: failed to allocate jetsam reason\n");
+               } else if (corpse_for_fatal_memkill != 0 && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
+                       /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
+                       jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
+               }
 
-               next_p = memorystatus_get_first_proc_locked(&band, FALSE);
-       }
-
-       proc_list_unlock();
+               if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) {
+                       printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
+               }
+       } else {
+               /*
+                * HWM offender exists. Done without locks or synchronization.
+                * See comment near its declaration for more details.
+                */
+               memorystatus_hwm_candidates = TRUE;
 
-exit:
-       os_reason_free(jetsam_reason);
+#if VM_PRESSURE_EVENTS
+               /*
+                * The current process is not in the warning path.
+                * This path implies the current process has exceeded a non-fatal (soft) memory limit.
+                * Failure to send the note is ignored here.
+                */
+               (void)memorystatus_warn_process(p->p_pid, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */);
 
-       /* Clear snapshot if freshly captured and no target was found */
-       if (new_snapshot && (kill_count == 0)) {
-               proc_list_lock();
-               memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
-               proc_list_unlock();
+#endif /* VM_PRESSURE_EVENTS */
        }
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
-           memorystatus_available_pages, killed ? aPid : 0, kill_count, 0, 0);
-
-       return killed;
 }
 
-static boolean_t
-memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause)
+void
+memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
 {
+       proc_t p = current_proc();
+
        /*
-        * TODO: allow a general async path
-        *
-        * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to
-        * add the appropriate exit reason code mapping.
+        * The limit violation is logged here, but only once per process per limit.
+        * A soft memory limit is a non-fatal high-water mark.
+        * A hard memory limit is a fatal custom task limit or the system-wide per-task memory limit.
         */
-       if ((victim_pid != -1) ||
-           (cause != kMemorystatusKilledVMPageShortage &&
-           cause != kMemorystatusKilledVMCompressorThrashing &&
-           cause != kMemorystatusKilledVMCompressorSpaceShortage &&
-           cause != kMemorystatusKilledFCThrashing &&
-           cause != kMemorystatusKilledZoneMapExhaustion)) {
-               return FALSE;
-       }
 
-       kill_under_pressure_cause = cause;
-       memorystatus_thread_wake();
-       return TRUE;
+       os_log_with_startup_serial(OS_LOG_DEFAULT, "EXC_RESOURCE -> %s[%d] exceeded mem limit: %s%s %d MB (%s)\n",
+           ((p && *p->p_name) ? p->p_name : "unknown"), (p ? p->p_pid : -1), (memlimit_is_active ? "Active" : "Inactive"),
+           (memlimit_is_fatal  ? "Hard" : "Soft"), max_footprint_mb,
+           (memlimit_is_fatal  ? "fatal" : "non-fatal"));
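+       /*
+        * Example output (illustrative; "MyApp" and pid 123 are hypothetical):
+        *   EXC_RESOURCE -> MyApp[123] exceeded mem limit: ActiveSoft 100 MB (non-fatal)
+        */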
+
+       return;
 }
 
-boolean_t
-memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async)
-{
-       if (async) {
-               return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorSpaceShortage);
-       } else {
-               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE);
-               if (jetsam_reason == OS_REASON_NULL) {
-                       printf("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n");
-               }
 
-               return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason);
-       }
-}
+/*
+ * Description:
+ *     Evaluates process state to determine which limit
+ *     should be applied (active vs. inactive limit).
+ *
+ *     Processes that have the 'elevated inactive jetsam band' attribute
+ *     are first evaluated based on their current priority band.
+ *     presently elevated ==> active
+ *
+ *     Processes that opt into dirty tracking are evaluated
+ *     based on clean vs dirty state.
+ *     dirty ==> active
+ *     clean ==> inactive
+ *
+ *     Processes that do not opt into dirty tracking are
+ *     evaluated based on priority level.
+ *     Foreground or above ==> active
+ *     Below Foreground    ==> inactive
+ *
+ *     Return: TRUE if active
+ *             FALSE if inactive
+ */
 
-#if CONFIG_JETSAM
-boolean_t
-memorystatus_kill_on_VM_compressor_thrashing(boolean_t async)
+static boolean_t
+proc_jetsam_state_is_active_locked(proc_t p)
 {
-       if (async) {
-               return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorThrashing);
-       } else {
-               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING);
-               if (jetsam_reason == OS_REASON_NULL) {
-                       printf("memorystatus_kill_on_VM_compressor_thrashing -- sync: failed to allocate jetsam reason\n");
+       if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) &&
+           (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE)) {
+               /*
+                * process has the 'elevated inactive jetsam band' attribute
+                * and process is present in the elevated band
+                * implies active state
+                */
+               return TRUE;
+       } else if (p->p_memstat_dirty & P_DIRTY_TRACK) {
+               /*
+                * process has opted into dirty tracking
+                * active state is based on dirty vs. clean
+                */
+               if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
+                       /*
+                        * process is dirty
+                        * implies active state
+                        */
+                       return TRUE;
+               } else {
+                       /*
+                        * process is clean
+                        * implies inactive state
+                        */
+                       return FALSE;
                }
-
-               return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorThrashing, jetsam_reason);
+       } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
+               /*
+                * process is Foreground or higher
+                * implies active state
+                */
+               return TRUE;
+       } else {
+               /*
+                * process found below Foreground
+                * implies inactive state
+                */
+               return FALSE;
        }
 }
 
-boolean_t
-memorystatus_kill_on_VM_page_shortage(boolean_t async)
+static boolean_t
+memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
 {
-       if (async) {
-               return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
-       } else {
-               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMPAGESHORTAGE);
-               if (jetsam_reason == OS_REASON_NULL) {
-                       printf("memorystatus_kill_on_VM_page_shortage -- sync: failed to allocate jetsam reason\n");
-               }
+       boolean_t res;
 
-               return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage, jetsam_reason);
-       }
-}
+       uint32_t errors = 0;
+       uint64_t memory_reclaimed = 0;
 
-boolean_t
-memorystatus_kill_on_FC_thrashing(boolean_t async)
-{
-       if (async) {
-               return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing);
+       if (victim_pid == -1) {
+               /* No pid, so kill first process */
+               res = memorystatus_kill_top_process(TRUE, TRUE, cause, jetsam_reason, NULL, &errors, &memory_reclaimed);
        } else {
-               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_FCTHRASHING);
-               if (jetsam_reason == OS_REASON_NULL) {
-                       printf("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n");
-               }
+               res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason);
+       }
 
-               return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing, jetsam_reason);
+       if (errors) {
+               memorystatus_clear_errors();
        }
-}
 
-boolean_t
-memorystatus_kill_on_vnode_limit(void)
-{
-       os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE);
-       if (jetsam_reason == OS_REASON_NULL) {
-               printf("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n");
+       if (res == TRUE) {
+               /* Fire off snapshot notification */
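+               /* (mirrors the rate-limited post_snapshot path in the memorystatus thread above) */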
+               proc_list_lock();
+               size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
+                   sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
+               uint64_t timestamp_now = mach_absolute_time();
+               memorystatus_jetsam_snapshot->notification_time = timestamp_now;
+               if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
+                   timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
+                       proc_list_unlock();
+                       int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
+                       if (!ret) {
+                               proc_list_lock();
+                               memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
+                               proc_list_unlock();
+                       }
+               } else {
+                       proc_list_unlock();
+               }
        }
 
-       return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason);
+       return res;
 }
 
-#endif /* CONFIG_JETSAM */
-
-boolean_t
-memorystatus_kill_on_zone_map_exhaustion(pid_t pid)
+/*
+ * Jetsam a specific process.
+ */
+static boolean_t
+memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
 {
-       boolean_t res = FALSE;
-       if (pid == -1) {
-               res = memorystatus_kill_process_async(-1, kMemorystatusKilledZoneMapExhaustion);
-       } else {
-               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION);
-               if (jetsam_reason == OS_REASON_NULL) {
-                       printf("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n");
-               }
+       boolean_t killed;
+       proc_t p;
+       uint64_t killtime = 0;
+       uint64_t footprint_of_killed_proc;
+       clock_sec_t     tv_sec;
+       clock_usec_t    tv_usec;
+       uint32_t        tv_msec;
+
+       /* TODO - add a victim queue and push this into the main jetsam thread */
 
-               res = memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason);
+       p = proc_find(victim_pid);
+       if (!p) {
+               os_reason_free(jetsam_reason);
+               return FALSE;
        }
-       return res;
-}
 
-#if CONFIG_FREEZE
+       proc_list_lock();
 
-__private_extern__ void
-memorystatus_freeze_init(void)
-{
-       kern_return_t result;
-       thread_t thread;
+       if (memorystatus_jetsam_snapshot_count == 0) {
+               memorystatus_init_jetsam_snapshot_locked(NULL, 0);
+       }
 
-       freezer_lck_grp_attr = lck_grp_attr_alloc_init();
-       freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr);
+       killtime = mach_absolute_time();
+       absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
+       tv_msec = tv_usec / 1000;
 
-       lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL);
+       memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
 
-       /*
-        * This is just the default value if the underlying
-        * storage device doesn't have any specific budget.
-        * We check with the storage layer in memorystatus_freeze_update_throttle()
-        * before we start our freezing the first time.
-        */
-       memorystatus_freeze_budget_pages_remaining = (memorystatus_freeze_daily_mb_max * 1024 * 1024) / PAGE_SIZE;
+       proc_list_unlock();
 
-       result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread);
-       if (result == KERN_SUCCESS) {
-               proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);
-               proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
-               thread_set_thread_name(thread, "VM_freezer");
+       killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
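+       /* footprint_of_killed_proc is in bytes; the log below reports it in KB (>> 10) */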
 
-               thread_deallocate(thread);
-       } else {
-               panic("Could not create memorystatus_freeze_thread");
-       }
+       os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n",
+           (unsigned long)tv_sec, tv_msec, victim_pid, ((p && *p->p_name) ? p->p_name : "unknown"),
+           memorystatus_kill_cause_name[cause], (p ? p->p_memstat_effectivepriority: -1),
+           footprint_of_killed_proc >> 10, (uint64_t)memorystatus_available_pages);
+
+       proc_rele(p);
+
+       return killed;
 }
 
-static boolean_t
-memorystatus_is_process_eligible_for_freeze(proc_t p)
+
+/*
+ * Toggle the P_MEMSTAT_TERMINATED state.
+ * Takes the proc_list_lock.  No-op on non-DEVELOPMENT/DEBUG kernels.
+ */
+void
+proc_memstat_terminated(proc_t p, boolean_t set)
 {
+#if DEVELOPMENT || DEBUG
+       if (p) {
+               proc_list_lock();
+               if (set == TRUE) {
+                       p->p_memstat_state |= P_MEMSTAT_TERMINATED;
+               } else {
+                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
+               }
+               proc_list_unlock();
+       }
+#else
+#pragma unused(p, set)
        /*
-        * Called with proc_list_lock held.
+        * do nothing
         */
+#endif /* DEVELOPMENT || DEBUG */
+       return;
+}
 
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
 
-       boolean_t should_freeze = FALSE;
-       uint32_t state = 0, entry_count = 0, pages = 0, i = 0;
-       int probability_of_use = 0;
+#if CONFIG_JETSAM
+/*
+ * This is invoked when a CPU limit has been exceeded while in fatal mode.
+ * The jetsam_flags do not apply, as those are for memory-related kills.
+ * We call this routine so that the offending process is killed with
+ * a non-zero exit status.
+ */
+void
+jetsam_on_ledger_cpulimit_exceeded(void)
+{
+       int retval = 0;
+       int jetsam_flags = 0;  /* make it obvious */
+       proc_t p = current_proc();
+       os_reason_t jetsam_reason = OS_REASON_NULL;
+
+       printf("task_exceeded_cpulimit: killing pid %d [%s]\n",
+           p->p_pid, (*p->p_name ? p->p_name : "(unknown)"));
 
-       if (isApp(p) == FALSE) {
-               goto out;
+       jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT);
+       if (jetsam_reason == OS_REASON_NULL) {
+               printf("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n");
        }
 
-       state = p->p_memstat_state;
+       retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
 
-       if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) ||
-           !(state & P_MEMSTAT_SUSPENDED)) {
-               goto out;
+       if (retval) {
+               printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
        }
+}
 
-       /* Only freeze processes meeting our minimum resident page criteria */
-       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
-       if (pages < memorystatus_freeze_pages_min) {
-               goto out;
-       }
+#endif /* CONFIG_JETSAM */
 
-       entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t));
+static void
+memorystatus_get_task_memory_region_count(task_t task, uint64_t *count)
+{
+       assert(task);
+       assert(count);
 
-       if (entry_count) {
-               for (i = 0; i < entry_count; i++) {
-                       if (strncmp(memorystatus_global_probabilities_table[i].proc_name,
-                           p->p_name,
-                           MAXCOMLEN + 1) == 0) {
-                               probability_of_use = memorystatus_global_probabilities_table[i].use_probability;
-                               break;
-                       }
-               }
+       *count = get_task_memory_region_count(task);
+}
 
-               if (probability_of_use == 0) {
-                       goto out;
-               }
-       }
 
-       should_freeze = TRUE;
-out:
-       return should_freeze;
-}
+#define MEMORYSTATUS_VM_MAP_FORK_ALLOWED     0x100000000
+#define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000
+
+#if DEVELOPMENT || DEBUG
 
 /*
- * Synchronously freeze the passed proc. Called with a reference to the proc held.
- *
- * Doesn't deal with re-freezing because this is called on a specific process and
- * not by the freezer thread. If that changes, we'll have to teach it about
- * refreezing a frozen process.
+ * Sysctl used only to test the memorystatus_allowed_vm_map_fork() path:
+ *   set a new pidwatch value
+ *     or
+ *   get the current pidwatch value
  *
- * Returns EINVAL or the value returned by task_freeze().
+ * The pidwatch_val starts out with a PID to watch for in the map_fork path.
+ * Its value is:
+ * - OR'd with MEMORYSTATUS_VM_MAP_FORK_ALLOWED if we allow the map_fork.
+ * - OR'd with MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED if we disallow the map_fork.
+ * - set to -1ull if the map_fork() is aborted for other reasons.
  */
-int
-memorystatus_freeze_process_sync(proc_t p)
-{
-       int ret = EINVAL;
-       pid_t aPid = 0;
-       boolean_t memorystatus_freeze_swap_low = FALSE;
-       int     freezer_error_code = 0;
 
-       lck_mtx_lock(&freezer_mutex);
+uint64_t memorystatus_vm_map_fork_pidwatch_val = 0;
 
-       if (p == NULL) {
-               printf("memorystatus_freeze_process_sync: Invalid process\n");
-               goto exit;
-       }
+static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS {
+#pragma unused(oidp, arg1, arg2)
 
-       if (memorystatus_freeze_enabled == FALSE) {
-               printf("memorystatus_freeze_process_sync: Freezing is DISABLED\n");
-               goto exit;
-       }
+       uint64_t new_value = 0;
+       uint64_t old_value = 0;
+       int error = 0;
 
-       if (!memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
-               printf("memorystatus_freeze_process_sync: Low compressor and/or low swap space...skipping freeze\n");
-               goto exit;
-       }
+       /*
+        * The pid is held in the low 32 bits.
+        * The 'allowed' flags are in the upper 32 bits.
+        */
+       old_value = memorystatus_vm_map_fork_pidwatch_val;
 
-       memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining);
-       if (!memorystatus_freeze_budget_pages_remaining) {
-               printf("memorystatus_freeze_process_sync: exit with NO available budget\n");
-               goto exit;
+       error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL);
+
+       if (error || !req->newptr) {
+               /*
+                * No new value passed in.
+                */
+               return error;
        }
 
-       proc_list_lock();
+       /*
+        * A new pid was passed in via req->newptr.
+        * Ignore any attempt to set the higher order bits.
+        */
+       memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF;
+       printf("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx\n", old_value, new_value);
 
-       if (p != NULL) {
-               uint32_t purgeable, wired, clean, dirty, shared;
-               uint32_t max_pages, i;
+       return error;
+}
 
-               aPid = p->p_pid;
+SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+    0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork");
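+
+/*
+ * Example usage (illustrative; DEVELOPMENT || DEBUG kernels only):
+ *
+ *   # watch pid 1234 in the map_fork path
+ *   sysctl kern.memorystatus_vm_map_fork_pidwatch=1234
+ *
+ *   # read it back later: 0x1000004d2 (pid 1234 | MEMORYSTATUS_VM_MAP_FORK_ALLOWED)
+ *   # means the corpse fork was allowed, 0x2000004d2 that it was disallowed.
+ *   sysctl kern.memorystatus_vm_map_fork_pidwatch
+ */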
 
-               /* Ensure the process is eligible for freezing */
-               if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) {
-                       proc_list_unlock();
-                       goto exit;
-               }
 
-               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-                       max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining);
-               } else {
-                       /*
-                        * We only have the compressor without any swap.
-                        */
-                       max_pages = UINT32_MAX - 1;
+/*
+ * Record if a watched process fails to qualify for a vm_map_fork().
+ */
+void
+memorystatus_abort_vm_map_fork(task_t task)
+{
+       if (memorystatus_vm_map_fork_pidwatch_val != 0) {
+               proc_t p = get_bsdtask_info(task);
+               if (p != NULL && memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid) {
+                       memorystatus_vm_map_fork_pidwatch_val = -1ull;
                }
+       }
+}
 
-               /* Mark as locked temporarily to avoid kill */
-               p->p_memstat_state |= P_MEMSTAT_LOCKED;
-               proc_list_unlock();
-
-               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
-                   memorystatus_available_pages, 0, 0, 0, 0);
-
-               ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */);
+static void
+set_vm_map_fork_pidwatch(task_t task, uint64_t x)
+{
+       if (memorystatus_vm_map_fork_pidwatch_val != 0) {
+               proc_t p = get_bsdtask_info(task);
+               if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)p->p_pid)) {
+                       memorystatus_vm_map_fork_pidwatch_val |= x;
+               }
+       }
+}
 
-               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
-                   memorystatus_available_pages, aPid, 0, 0, 0);
+#else /* DEVELOPMENT || DEBUG */
 
-               DTRACE_MEMORYSTATUS6(memorystatus_freeze, proc_t, p, unsigned int, memorystatus_available_pages, boolean_t, purgeable, unsigned int, wired, uint32_t, clean, uint32_t, dirty);
 
-               MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_process_sync: task_freeze %s for pid %d [%s] - "
-                   "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n",
-                   (ret == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"),
-                   memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared);
+static void
+set_vm_map_fork_pidwatch(task_t task, uint64_t x)
+{
+#pragma unused(task)
+#pragma unused(x)
+}
 
-               proc_list_lock();
+#endif /* DEVELOPMENT || DEBUG */
 
-               if (ret == KERN_SUCCESS) {
-                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s]...done",
-                           aPid, (*p->p_name ? p->p_name : "unknown"));
+/*
+ * Called during EXC_RESOURCE handling when a process exceeds a soft
+ * memory limit.  This is the corpse fork path and here we decide if
+ * vm_map_fork will be allowed when creating the corpse.
+ * The task being considered is suspended.
+ *
+ * By default, a vm_map_fork is allowed to proceed.
+ *
+ * A few simple policy assumptions:
+ *     Desktop platforms are not restricted by this path;
+ *     there the vm_map_fork is always allowed.
+ *
+ *     If the device has a zero system-wide task limit,
+ *     then the vm_map_fork is allowed.
+ *
+ *     And if a process's memory footprint is less than or
+ *     equal to a quarter of the system-wide task limit,
+ *     then the vm_map_fork is allowed.  This calculation
+ *     is based on the assumption that a process can
+ *     munch memory up to the system-wide task limit.
+ */
+boolean_t
+memorystatus_allowed_vm_map_fork(task_t task)
+{
+       boolean_t is_allowed = TRUE;   /* default */
 
-                       memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
+#if CONFIG_EMBEDDED
 
-                       p->p_memstat_freeze_sharedanon_pages += shared;
+       uint64_t footprint_in_bytes;
+       uint64_t max_allowed_bytes;
 
-                       memorystatus_frozen_shared_mb += shared;
+       if (max_task_footprint_mb == 0) {
+               set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
+               return is_allowed;
+       }
 
-                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) {
-                               p->p_memstat_state |= P_MEMSTAT_FROZEN;
-                               memorystatus_frozen_count++;
-                       }
+       footprint_in_bytes = get_task_phys_footprint(task);
 
-                       p->p_memstat_frozen_count++;
+       /*
+        * Maximum is 1/4 of the system-wide task limit.
+        */
+       max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2;
 
-                       /*
-                        * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process
-                        * to its higher jetsam band.
-                        */
-                       proc_list_unlock();
+       if (footprint_in_bytes > max_allowed_bytes) {
+               printf("memorystatus disallowed vm_map_fork %lld  %lld\n", footprint_in_bytes, max_allowed_bytes);
+               set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED);
+               return !is_allowed;
+       }
+#endif /* CONFIG_EMBEDDED */
 
-                       memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
+       set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
+       return is_allowed;
+}
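+
+/*
+ * Worked example (illustrative): with max_task_footprint_mb = 2048,
+ * max_allowed_bytes = (2048 MB) >> 2 = 512 MB.  A suspended task with a
+ * 600 MB physical footprint would be denied the corpse fork, while one
+ * at 400 MB would be allowed.
+ */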
 
-                       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-                               ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE,
-                                   memorystatus_freeze_jetsam_band, TRUE);
+void
+memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
+{
+       assert(task);
+       assert(footprint);
 
-                               if (ret) {
-                                       printf("Elevating the frozen process failed with %d\n", ret);
-                                       /* not fatal */
-                                       ret = 0;
-                               }
+       uint64_t pages;
 
-                               proc_list_lock();
+       pages = (get_task_phys_footprint(task) / PAGE_SIZE_64);
+       assert(((uint32_t)pages) == pages);
+       *footprint = (uint32_t)pages;
 
-                               /* Update stats */
-                               for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
-                                       throttle_intervals[i].pageouts += dirty;
-                               }
-                       } else {
-                               proc_list_lock();
-                       }
+       if (max_footprint_lifetime) {
+               pages = (get_task_phys_footprint_lifetime_max(task) / PAGE_SIZE_64);
+               assert(((uint32_t)pages) == pages);
+               *max_footprint_lifetime = (uint32_t)pages;
+       }
+       if (purgeable_pages) {
+               pages = (get_task_purgeable_size(task) / PAGE_SIZE_64);
+               assert(((uint32_t)pages) == pages);
+               *purgeable_pages = (uint32_t)pages;
+       }
+}
 
-                       memorystatus_freeze_pageouts += dirty;
+static void
+memorystatus_get_task_phys_footprint_page_counts(task_t task,
+    uint64_t *internal_pages, uint64_t *internal_compressed_pages,
+    uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
+    uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
+    uint64_t *iokit_mapped_pages, uint64_t *page_table_pages)
+{
+       assert(task);
 
-                       if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) {
-                               /*
-                                * Add some eviction logic here? At some point should we
-                                * jetsam a process to get back its swap space so that we
-                                * can freeze a more eligible process at this moment in time?
-                                */
-                       }
-               } else {
-                       char reason[128];
-                       if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) {
-                               strlcpy(reason, "too much shared memory", 128);
-                       }
+       if (internal_pages) {
+               *internal_pages = (get_task_internal(task) / PAGE_SIZE_64);
+       }
 
-                       if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) {
-                               strlcpy(reason, "low private-shared pages ratio", 128);
-                       }
+       if (internal_compressed_pages) {
+               *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64);
+       }
 
-                       if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) {
-                               strlcpy(reason, "no compressor space", 128);
-                       }
+       if (purgeable_nonvolatile_pages) {
+               *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64);
+       }
 
-                       if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) {
-                               strlcpy(reason, "no swap space", 128);
-                       }
+       if (purgeable_nonvolatile_compressed_pages) {
+               *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64);
+       }
 
-                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s]...skipped (%s)",
-                           aPid, (*p->p_name ? p->p_name : "unknown"), reason);
-                       p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE;
-               }
+       if (alternate_accounting_pages) {
+               *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64);
+       }
 
-               p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
-               proc_list_unlock();
+       if (alternate_accounting_compressed_pages) {
+               *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64);
        }
 
-exit:
-       lck_mtx_unlock(&freezer_mutex);
+       if (iokit_mapped_pages) {
+               *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64);
+       }
 
-       return ret;
+       if (page_table_pages) {
+               *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64);
+       }
 }
 
-static int
-memorystatus_freeze_top_process(void)
+/*
+ * This routine only acts on the global jetsam event snapshot.
+ * Updating the process's entry can race when the memorystatus_thread
+ * has chosen to kill a process that is racing to exit on another core.
+ */
+static void
+memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime)
 {
-       pid_t aPid = 0;
-       int ret = -1;
-       proc_t p = PROC_NULL, next_p = PROC_NULL;
-       unsigned int i = 0;
-       unsigned int band = JETSAM_PRIORITY_IDLE;
-       boolean_t refreeze_processes = FALSE;
+       memorystatus_jetsam_snapshot_entry_t *entry = NULL;
+       memorystatus_jetsam_snapshot_t *snapshot    = NULL;
+       memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
 
-       proc_list_lock();
+       unsigned int i;
+
+       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
 
-       if (memorystatus_frozen_count >= memorystatus_frozen_processes_max) {
+       if (memorystatus_jetsam_snapshot_count == 0) {
                /*
-                * Freezer is already full but we are here and so let's
-                * try to refreeze any processes we might have thawed
-                * in the past and push out their compressed state out.
+                * No active snapshot.
+                * Nothing to do.
                 */
-               refreeze_processes = TRUE;
-               band = (unsigned int) memorystatus_freeze_jetsam_band;
+               return;
        }
 
-freeze_process:
-
-       next_p = memorystatus_get_first_proc_locked(&band, FALSE);
-       while (next_p) {
-               kern_return_t kr;
-               uint32_t purgeable, wired, clean, dirty, shared;
-               uint32_t max_pages = 0;
-               int     freezer_error_code = 0;
-
-               p = next_p;
-               next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
+       /*
+        * Sanity check as this routine should only be called
+        * from a jetsam kill path.
+        */
+       assert(kill_cause != 0 && killtime != 0);
 
-               aPid = p->p_pid;
+       snapshot       = memorystatus_jetsam_snapshot;
+       snapshot_list  = memorystatus_jetsam_snapshot->entries;
 
-               if (p->p_memstat_effectivepriority != (int32_t) band) {
-                       /*
-                        * We shouldn't be freezing processes outside the
-                        * prescribed band.
-                        */
-                       break;
-               }
+       for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
+               if (snapshot_list[i].pid == p->p_pid) {
+                       entry = &snapshot_list[i];
 
-               /* Ensure the process is eligible for (re-)freezing */
-               if (refreeze_processes) {
-                       /*
-                        * Has to have been frozen once before.
-                        */
-                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) {
-                               continue;
+                       if (entry->killed || entry->jse_killtime) {
+                               /*
+                                * We apparently raced on the exit path
+                                * for this process, as its snapshot entry
+                                * has already recorded a kill.
+                                */
+                               assert(entry->killed && entry->jse_killtime);
+                               break;
                        }
 
                        /*
-                        * Has to have been resumed once before.
+                        * Update the entry we just found in the snapshot.
                         */
-                       if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == FALSE) {
-                               continue;
-                       }
 
-                       /*
-                        * Not currently being looked at for something.
-                        */
-                       if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
-                               continue;
-                       }
+                       entry->killed       = kill_cause;
+                       entry->jse_killtime = killtime;
+                       entry->jse_gencount = snapshot->js_gencount;
+                       entry->jse_idle_delta = p->p_memstat_idle_delta;
+#if CONFIG_FREEZE
+                       entry->jse_thaw_count = p->p_memstat_thaw_count;
+#else /* CONFIG_FREEZE */
+                       entry->jse_thaw_count = 0;
+#endif /* CONFIG_FREEZE */
 
                        /*
-                        * We are going to try and refreeze and so re-evaluate
-                        * the process. We don't want to double count the shared
-                        * memory. So deduct the old snapshot here.
+                        * If a process has moved between bands since snapshot was
+                        * initialized, then likely these fields changed too.
                         */
-                       memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
-                       p->p_memstat_freeze_sharedanon_pages = 0;
-
-                       p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
-                       memorystatus_refreeze_eligible_count--;
-               } else {
-                       if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) {
-                               continue; // with lock held
+                       if (entry->priority != p->p_memstat_effectivepriority) {
+                               strlcpy(entry->name, p->p_name, sizeof(entry->name));
+                               entry->priority  = p->p_memstat_effectivepriority;
+                               entry->state     = memorystatus_build_state(p);
+                               entry->user_data = p->p_memstat_userdata;
+                               entry->fds       = p->p_fd->fd_nfiles;
                        }
-               }
-
-               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-                       /*
-                        * Freezer backed by the compressor and swap file(s)
-                        * will hold compressed data.
-                        */
 
-                       max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining);
-               } else {
                        /*
-                        * We only have the compressor pool.
+                        * Always update the page counts on a kill.
                         */
-                       max_pages = UINT32_MAX - 1;
-               }
-
-               /* Mark as locked temporarily to avoid kill */
-               p->p_memstat_state |= P_MEMSTAT_LOCKED;
-
-               p = proc_ref_locked(p);
-               if (!p) {
-                       break;
-               }
-
-               proc_list_unlock();
-
-               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
-                   memorystatus_available_pages, 0, 0, 0, 0);
 
-               kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */);
-
-               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
-                   memorystatus_available_pages, aPid, 0, 0, 0);
-
-               MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - "
-                   "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n",
-                   (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"),
-                   memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared);
-
-               proc_list_lock();
+                       uint32_t pages              = 0;
+                       uint32_t max_pages_lifetime = 0;
+                       uint32_t purgeable_pages    = 0;
 
-               /* Success? */
-               if (KERN_SUCCESS == kr) {
-                       if (refreeze_processes) {
-                               os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Refreezing (general) pid %d [%s]...done",
-                                   aPid, (*p->p_name ? p->p_name : "unknown"));
-                       } else {
-                               os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (general) pid %d [%s]...done",
-                                   aPid, (*p->p_name ? p->p_name : "unknown"));
-                       }
+                       memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages);
+                       entry->pages              = (uint64_t)pages;
+                       entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
+                       entry->purgeable_pages    = (uint64_t)purgeable_pages;
 
-                       memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
+                       uint64_t internal_pages                        = 0;
+                       uint64_t internal_compressed_pages             = 0;
+                       uint64_t purgeable_nonvolatile_pages           = 0;
+                       uint64_t purgeable_nonvolatile_compressed_pages = 0;
+                       uint64_t alternate_accounting_pages            = 0;
+                       uint64_t alternate_accounting_compressed_pages = 0;
+                       uint64_t iokit_mapped_pages                    = 0;
+                       uint64_t page_table_pages                      = 0;
 
-                       p->p_memstat_freeze_sharedanon_pages += shared;
+                       memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
+                           &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
+                           &alternate_accounting_pages, &alternate_accounting_compressed_pages,
+                           &iokit_mapped_pages, &page_table_pages);
 
-                       memorystatus_frozen_shared_mb += shared;
+                       entry->jse_internal_pages = internal_pages;
+                       entry->jse_internal_compressed_pages = internal_compressed_pages;
+                       entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
+                       entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
+                       entry->jse_alternate_accounting_pages = alternate_accounting_pages;
+                       entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
+                       entry->jse_iokit_mapped_pages = iokit_mapped_pages;
+                       entry->jse_page_table_pages = page_table_pages;
 
-                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) {
-                               p->p_memstat_state |= P_MEMSTAT_FROZEN;
-                               memorystatus_frozen_count++;
-                       }
+                       uint64_t region_count = 0;
+                       memorystatus_get_task_memory_region_count(p->task, &region_count);
+                       entry->jse_memory_region_count = region_count;
 
-                       p->p_memstat_frozen_count++;
+                       goto exit;
+               }
+       }
 
+       if (entry == NULL) {
+               /*
+                * The entry was not found in the snapshot, so the process must have
+                * launched after the snapshot was initialized.
+                * Let's try to append the new entry.
+                */
+               if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) {
                        /*
-                        * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process
-                        * to its higher jetsam band.
+                        * A populated snapshot buffer exists
+                        * and there is room to init a new entry.
                         */
-                       proc_list_unlock();
-
-                       memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
-
-                       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-                               ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, memorystatus_freeze_jetsam_band, TRUE);
-
-                               if (ret) {
-                                       printf("Elevating the frozen process failed with %d\n", ret);
-                                       /* not fatal */
-                                       ret = 0;
-                               }
-
-                               proc_list_lock();
-
-                               /* Update stats */
-                               for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
-                                       throttle_intervals[i].pageouts += dirty;
-                               }
-                       } else {
-                               proc_list_lock();
-                       }
-
-                       memorystatus_freeze_pageouts += dirty;
-
-                       if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) {
-                               /*
-                                * Add some eviction logic here? At some point should we
-                                * jetsam a process to get back its swap space so that we
-                                * can freeze a more eligible process at this moment in time?
-                                */
-                       }
-
-                       /* Return KERN_SUCCESS */
-                       ret = kr;
+                       assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count);
 
-                       p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
-                       proc_rele_locked(p);
+                       unsigned int next = memorystatus_jetsam_snapshot_count;
 
-                       /*
-                        * We froze a process successfully. We can stop now
-                        * and see if that helped.
-                        */
+                       if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[next], (snapshot->js_gencount)) == TRUE) {
+                               entry = &snapshot_list[next];
+                               entry->killed       = kill_cause;
+                               entry->jse_killtime = killtime;
 
-                       break;
-               } else {
-                       p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
+                               snapshot->entry_count = ++next;
+                               memorystatus_jetsam_snapshot_count = next;
 
-                       if (refreeze_processes == TRUE) {
-                               if ((freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) ||
-                                   (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO)) {
+                               if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) {
                                        /*
-                                        * Keeping this prior-frozen process in this high band when
-                                        * we failed to re-freeze it due to bad shared memory usage
-                                        * could cause excessive pressure on the lower bands.
-                                        * We need to demote it for now. It'll get re-evaluated next
-                                        * time because we don't set the P_MEMSTAT_FREEZE_IGNORE
-                                        * bit.
+                                        * We just used the last slot in the snapshot buffer.
+                                        * We only want to log this once, so we do it here,
+                                        * when we notice we've hit the max.
                                         */
-
-                                       p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
-                                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
-                                       memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, TRUE, TRUE);
+                                       printf("memorystatus: WARNING snapshot buffer is full, count %d\n",
+                                           memorystatus_jetsam_snapshot_count);
                                }
-                       } else {
-                               p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE;
                        }
+               }
+       }
 
-                       proc_rele_locked(p);
-
-                       char reason[128];
-                       if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) {
-                               strlcpy(reason, "too much shared memory", 128);
-                       }
+exit:
+       if (entry == NULL) {
+               /*
+                * If we reach here, the snapshot buffer could not be updated.
+                * Most likely, the buffer is full, in which case we would have
+                * logged a warning in the previous call.
+                *
+                * For now, we will stop appending snapshot entries.
+                * When the buffer is consumed, the snapshot state will reset.
+                */
 
-                       if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) {
-                               strlcpy(reason, "low private-shared pages ratio", 128);
-                       }
+               MEMORYSTATUS_DEBUG(4, "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n",
+                   p->p_pid, p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count);
+       }
 
-                       if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) {
-                               strlcpy(reason, "no compressor space", 128);
-                       }
+       return;
+}
 
-                       if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) {
-                               strlcpy(reason, "no swap space", 128);
-                       }
+#if CONFIG_JETSAM
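+/*
+ * Called from the VM layer as the count of available pages changes.
+ * May be entered with the page-queue locks held and preemption disabled,
+ * so it must not block (see the freezer_mutex note below).
+ */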
+void
+memorystatus_pages_update(unsigned int pages_avail)
+{
+       memorystatus_available_pages = pages_avail;
 
-                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (general) pid %d [%s]...skipped (%s)",
-                           aPid, (*p->p_name ? p->p_name : "unknown"), reason);
+#if VM_PRESSURE_EVENTS
+       /*
+        * Since memorystatus_available_pages changes, we should
+        * re-evaluate the pressure levels on the system and
+        * check if we need to wake the pressure thread.
+        * We also update memorystatus_level in that routine.
+        */
+       vm_pressure_response();
 
-                       if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
-                               break;
-                       }
+       if (memorystatus_available_pages <= memorystatus_available_pages_pressure) {
+               if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) {
+                       memorystatus_thread_wake();
                }
        }
+#if CONFIG_FREEZE
+       /*
+        * We can't grab the freezer_mutex here, even though taking it would be the
+        * correct way to synchronize before inspecting the number of frozen processes
+        * and waking the freezer thread. We may arrive here with the page-queue locks
+        * held and preemption disabled, so taking a mutex would trigger the
+        * "mutex with preemption disabled" panic.
+        */
 
-       if ((ret == -1) &&
-           (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD) &&
-           (refreeze_processes == FALSE)) {
+       if (memorystatus_freeze_thread_should_run() == TRUE) {
                /*
-                * We failed to freeze a process from the IDLE
-                * band AND we have some thawed  processes
-                * AND haven't tried refreezing as yet.
-                * Let's try and re-freeze processes in the
-                * frozen band that have been resumed in the past
-                * and so have brought in state from disk.
+                * The freezer thread is usually woken by a user-space trigger,
+                * e.g. pid_hibernate() on any process. That trigger isn't invoked
+                * often enough, so we also wake the thread explicitly here.
                 */
+               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                       thread_wakeup((event_t)&memorystatus_freeze_wakeup);
+               }
+       }
+#endif /* CONFIG_FREEZE */
 
-               band = (unsigned int) memorystatus_freeze_jetsam_band;
+#else /* VM_PRESSURE_EVENTS */
 
-               refreeze_processes = TRUE;
+       boolean_t critical, delta;
 
-               goto freeze_process;
+       if (!memorystatus_delta) {
+               return;
        }
 
-       proc_list_unlock();
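+       /*
+        * critical: available pages fell below the critical threshold.
+        * delta: available pages moved by at least memorystatus_delta in
+        * either direction since the last update. For illustration, with a
+        * delta of 256 pages (the real value is sized at init), a swing
+        * from 1,000 to 1,300 available pages counts as a change.
+        */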
+       critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
+       delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
+           || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;
 
-       return ret;
+       if (critical || delta) {
+               unsigned int total_pages;
+
+               total_pages = (unsigned int) atop_64(max_mem);
+#if CONFIG_SECLUDED_MEMORY
+               total_pages -= vm_page_secluded_count;
+#endif /* CONFIG_SECLUDED_MEMORY */
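+               /*
+                * Recompute the memory level as a percentage of total pages
+                * (excluding secluded pages where applicable), e.g. 2,000
+                * available pages out of 8,000 total yields a level of 25.
+                */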
+               memorystatus_level = memorystatus_available_pages * 100 / total_pages;
+               memorystatus_thread_wake();
+       }
+#endif /* VM_PRESSURE_EVENTS */
 }
+#endif /* CONFIG_JETSAM */
 
-static inline boolean_t
-memorystatus_can_freeze_processes(void)
+static boolean_t
+memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount)
 {
-       boolean_t ret;
+       clock_sec_t                     tv_sec;
+       clock_usec_t                    tv_usec;
+       uint32_t pages = 0;
+       uint32_t max_pages_lifetime = 0;
+       uint32_t purgeable_pages = 0;
+       uint64_t internal_pages                         = 0;
+       uint64_t internal_compressed_pages              = 0;
+       uint64_t purgeable_nonvolatile_pages            = 0;
+       uint64_t purgeable_nonvolatile_compressed_pages = 0;
+       uint64_t alternate_accounting_pages             = 0;
+       uint64_t alternate_accounting_compressed_pages  = 0;
+       uint64_t iokit_mapped_pages                     = 0;
+       uint64_t page_table_pages                       = 0;
+       uint64_t region_count                           = 0;
+       uint64_t cids[COALITION_NUM_TYPES];
 
-       proc_list_lock();
+       memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
 
-       if (memorystatus_suspended_count) {
-               memorystatus_freeze_suspended_threshold = MIN(memorystatus_freeze_suspended_threshold, FREEZE_SUSPENDED_THRESHOLD_DEFAULT);
+       entry->pid = p->p_pid;
+       strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
+       entry->priority = p->p_memstat_effectivepriority;
 
-               if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) {
-                       ret = TRUE;
-               } else {
-                       ret = FALSE;
-               }
-       } else {
-               ret = FALSE;
-       }
+       memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages);
+       entry->pages              = (uint64_t)pages;
+       entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
+       entry->purgeable_pages    = (uint64_t)purgeable_pages;
 
-       proc_list_unlock();
+       memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
+           &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
+           &alternate_accounting_pages, &alternate_accounting_compressed_pages,
+           &iokit_mapped_pages, &page_table_pages);
 
-       return ret;
-}
+       entry->jse_internal_pages = internal_pages;
+       entry->jse_internal_compressed_pages = internal_compressed_pages;
+       entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
+       entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
+       entry->jse_alternate_accounting_pages = alternate_accounting_pages;
+       entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
+       entry->jse_iokit_mapped_pages = iokit_mapped_pages;
+       entry->jse_page_table_pages = page_table_pages;
 
-static boolean_t
-memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
-{
-       boolean_t can_freeze = TRUE;
+       memorystatus_get_task_memory_region_count(p->task, &region_count);
+       entry->jse_memory_region_count = region_count;
 
-       /* Only freeze if we're sufficiently low on memory; this holds off freeze right
-       *  after boot,  and is generally is a no-op once we've reached steady state. */
-       if (memorystatus_available_pages > memorystatus_freeze_threshold) {
-               return FALSE;
-       }
+       entry->state     = memorystatus_build_state(p);
+       entry->user_data = p->p_memstat_userdata;
+       memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid));
+       entry->fds       = p->p_fd->fd_nfiles;
 
-       /* Check minimum suspended process threshold. */
-       if (!memorystatus_can_freeze_processes()) {
-               return FALSE;
-       }
-       assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
+       absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
+       entry->cpu_time.tv_sec = (int64_t)tv_sec;
+       entry->cpu_time.tv_usec = (int64_t)tv_usec;
 
-       if (!VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-               /*
-                * In-core compressor used for freezing WITHOUT on-disk swap support.
-                */
-               if (vm_compressor_low_on_space()) {
-                       if (*memorystatus_freeze_swap_low) {
-                               *memorystatus_freeze_swap_low = TRUE;
-                       }
+       assert(p->p_stats != NULL);
+       entry->jse_starttime =  p->p_stats->ps_start;   /* abstime process started */
+       entry->jse_killtime = 0;                        /* abstime jetsam chose to kill process */
+       entry->killed       = 0;                        /* the jetsam kill cause */
+       entry->jse_gencount = gencount;                 /* indicates a pass through jetsam thread, when process was targeted to be killed */
 
-                       can_freeze = FALSE;
-               } else {
-                       if (*memorystatus_freeze_swap_low) {
-                               *memorystatus_freeze_swap_low = FALSE;
-                       }
+       entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */
 
-                       can_freeze = TRUE;
-               }
+#if CONFIG_FREEZE
+       entry->jse_thaw_count = p->p_memstat_thaw_count;
+#else /* CONFIG_FREEZE */
+       entry->jse_thaw_count = 0;
+#endif /* CONFIG_FREEZE */
+
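+       /* Record the jetsam coalition this process belongs to, if any. */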
+       proc_coalitionids(p, cids);
+       entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
+
+       return TRUE;
+}
+
+static void
+memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
+{
+       kern_return_t kr = KERN_SUCCESS;
+       mach_msg_type_number_t  count = HOST_VM_INFO64_COUNT;
+       vm_statistics64_data_t  vm_stat;
+
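+       /*
+        * Query the system-wide VM counters. For reference, the same query
+        * can be issued from user space (illustrative sketch, assuming
+        * <mach/mach.h>):
+        *
+        *      vm_statistics64_data_t vm_stat;
+        *      mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
+        *      kern_return_t kr = host_statistics64(mach_host_self(),
+        *          HOST_VM_INFO64, (host_info64_t)&vm_stat, &count);
+        */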
+       if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
+               printf("memorystatus_init_snapshot_vmstats: host_statistics64 failed with %d\n", kr);
+               memset(&snapshot->stats, 0, sizeof(snapshot->stats));
        } else {
-               /*
-                * Freezing WITH on-disk swap support.
-                *
-                * In-core compressor fronts the swap.
-                */
-               if (vm_swap_low_on_space()) {
-                       if (*memorystatus_freeze_swap_low) {
-                               *memorystatus_freeze_swap_low = TRUE;
-                       }
+               snapshot->stats.free_pages      = vm_stat.free_count;
+               snapshot->stats.active_pages    = vm_stat.active_count;
+               snapshot->stats.inactive_pages  = vm_stat.inactive_count;
+               snapshot->stats.throttled_pages = vm_stat.throttled_count;
+               snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
+               snapshot->stats.wired_pages     = vm_stat.wire_count;
 
-                       can_freeze = FALSE;
-               }
+               snapshot->stats.speculative_pages = vm_stat.speculative_count;
+               snapshot->stats.filebacked_pages  = vm_stat.external_page_count;
+               snapshot->stats.anonymous_pages   = vm_stat.internal_page_count;
+               snapshot->stats.compressions      = vm_stat.compressions;
+               snapshot->stats.decompressions    = vm_stat.decompressions;
+               snapshot->stats.compressor_pages  = vm_stat.compressor_page_count;
+               snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
        }
 
-       return can_freeze;
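+       /* Capture zone-map usage and the largest zone alongside the VM counters. */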
+       get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity);
+
+       bzero(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name));
+       get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name),
+           &snapshot->stats.largest_zone_size);
 }
 
 /*
- * This function evaluates if the currently frozen processes deserve
- * to stay in the higher jetsam band. If the # of thaws of a process
- * is below our threshold, then we will demote that process into the IDLE
- * band and put it at the head. We don't immediately kill the process here
- * because it  already has state on disk and so it might be worth giving
- * it another shot at getting thawed/resumed and used.
+ * Collect VM statistics at boot.
+ * Called only once (see kern_exec.c).
+ * Data can be consumed at any time.
  */
+void
+memorystatus_init_at_boot_snapshot(void)
+{
+       memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
+       memorystatus_at_boot_snapshot.entry_count = 0;
+       memorystatus_at_boot_snapshot.notification_time = 0;   /* updated when consumed */
+       memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
+}
+
 static void
-memorystatus_demote_frozen_processes(void)
+memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count)
 {
-       unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band;
-       unsigned int demoted_proc_count = 0;
-       proc_t p = PROC_NULL, next_p = PROC_NULL;
+       proc_t p, next_p;
+       unsigned int b = 0, i = 0;
 
-       proc_list_lock();
+       memorystatus_jetsam_snapshot_t *snapshot = NULL;
+       memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
+       unsigned int snapshot_max = 0;
+
+       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
 
-       if (memorystatus_freeze_enabled == FALSE) {
+       if (od_snapshot) {
                /*
-                * Freeze has been disabled likely to
-                * reclaim swap space. So don't change
-                * any state on the frozen processes.
+                * This is an on-demand snapshot
                 */
-               proc_list_unlock();
-               return;
+               snapshot      = od_snapshot;
+               snapshot_list = od_snapshot->entries;
+               snapshot_max  = ods_list_count;
+       } else {
+               /*
+                * This is a jetsam event snapshot
+                */
+               snapshot      = memorystatus_jetsam_snapshot;
+               snapshot_list = memorystatus_jetsam_snapshot->entries;
+               snapshot_max  = memorystatus_jetsam_snapshot_max;
        }
 
-       next_p = memorystatus_get_first_proc_locked(&band, FALSE);
+       /*
+        * Init the snapshot header information
+        */
+       memorystatus_init_snapshot_vmstats(snapshot);
+       snapshot->snapshot_time = mach_absolute_time();
+       snapshot->notification_time = 0;
+       snapshot->js_gencount = 0;
+
+       next_p = memorystatus_get_first_proc_locked(&b, TRUE);
        while (next_p) {
                p = next_p;
-               next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
-
-               if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) {
-                       continue;
-               }
+               next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);
 
-               if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
+               if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) {
                        continue;
                }
 
-               if (p->p_memstat_thaw_count < memorystatus_thaw_count_demotion_threshold) {
-                       p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
-                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
-
-                       memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, TRUE, TRUE);
-#if DEVELOPMENT || DEBUG
-                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus_demote_frozen_process pid %d [%s]",
-                           p->p_pid, (*p->p_name ? p->p_name : "unknown"));
-#endif /* DEVELOPMENT || DEBUG */
-
-                       /*
-                        * The freezer thread will consider this a normal app to be frozen
-                        * because it is in the IDLE band. So we don't need the
-                        * P_MEMSTAT_REFREEZE_ELIGIBLE state here. Also, if it gets resumed
-                        * we'll correctly count it as eligible for re-freeze again.
-                        *
-                        * We don't drop the frozen count because this process still has
-                        * state on disk. So there's a chance it gets resumed and then it
-                        * should land in the higher jetsam band. For that it needs to
-                        * remain marked frozen.
-                        */
-                       if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
-                               p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
-                               memorystatus_refreeze_eligible_count--;
-                       }
-
-                       demoted_proc_count++;
-               }
+               MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+                   p->p_pid,
+                   p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7],
+                   p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]);
 
-               if (demoted_proc_count == memorystatus_max_frozen_demotions_daily) {
+               if (++i == snapshot_max) {
                        break;
                }
        }
 
-       memorystatus_thaw_count = 0;
-       proc_list_unlock();
+       snapshot->entry_count = i;
+
+       if (!od_snapshot) {
+               /* update the system buffer count */
+               memorystatus_jetsam_snapshot_count = i;
+       }
 }
 
+#if DEVELOPMENT || DEBUG
 
-/*
- * This function will do 4 things:
- *
- * 1) check to see if we are currently in a degraded freezer mode, and if so:
- *     - check to see if our window has expired and we should exit this mode, OR,
- *     - return a budget based on the degraded throttle window's max. pageouts vs current pageouts.
- *
- * 2) check to see if we are in a NEW normal window and update the normal throttle window's params.
- *
- * 3) check what the current normal window allows for a budget.
- *
- * 4) calculate the current rate of pageouts for DEGRADED_WINDOW_MINS duration. If that rate is below
- *    what we would normally expect, then we are running low on our daily budget and need to enter
- *    degraded perf. mode.
- */
-
-static void
-memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed)
+#if CONFIG_JETSAM
+static int
+memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size)
 {
-       clock_sec_t sec;
-       clock_nsec_t nsec;
-       mach_timespec_t ts;
+       int ret;
+       memorystatus_jetsam_panic_options_t debug;
 
-       unsigned int freeze_daily_pageouts_max = 0;
+       if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) {
+               return EINVAL;
+       }
 
-#if DEVELOPMENT || DEBUG
-       if (!memorystatus_freeze_throttle_enabled) {
-               /*
-                * No throttling...we can use the full budget everytime.
-                */
-               *budget_pages_allowed = UINT64_MAX;
-               return;
+       ret = copyin(buffer, &debug, buffer_size);
+       if (ret) {
+               return ret;
        }
-#endif
 
-       clock_get_system_nanotime(&sec, &nsec);
-       ts.tv_sec = sec;
-       ts.tv_nsec = nsec;
+       /* Panic bits match kMemorystatusKilled* enum */
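+       /*
+        * Read-modify-write: only the bits selected by debug.mask change,
+        * e.g. mask 0x6 with data 0x2 sets bit 1, clears bit 2, and leaves
+        * every other bit untouched.
+        */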
+       memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask);
+
+       /* Copyout new value */
+       debug.data = memorystatus_jetsam_panic_debug;
+       ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t));
+
+       return ret;
+}
+#endif /* CONFIG_JETSAM */
 
-       struct throttle_interval_t *interval = NULL;
+/*
+ * Sorts the specified jetsam priority band using the given sort order.
+ * This is for testing only, used to force a path through the sort
+ * function.
+ */
+static int
+memorystatus_cmd_test_jetsam_sort(int priority, int sort_order)
+{
+       int error = 0;
 
-       if (memorystatus_freeze_degradation == TRUE) {
-               interval = degraded_throttle_window;
+       unsigned int bucket_index = 0;
 
-               if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) {
-                       memorystatus_freeze_degradation = FALSE;
-                       interval->pageouts = 0;
-                       interval->max_pageouts = 0;
-               } else {
-                       *budget_pages_allowed = interval->max_pageouts - interval->pageouts;
-               }
+       if (priority == -1) {
+               /* Use as shorthand for default priority */
+               bucket_index = JETSAM_PRIORITY_DEFAULT;
+       } else {
+               bucket_index = (unsigned int)priority;
        }
 
-       interval = normal_throttle_window;
+       error = memorystatus_sort_bucket(bucket_index, sort_order);
 
-       if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) {
-               /*
-                * New throttle window.
-                * Rollover any unused budget.
-                * Also ask the storage layer what the new budget needs to be.
-                */
-               uint64_t freeze_daily_budget = 0;
-               unsigned int daily_budget_pageouts = 0;
+       return error;
+}
 
-               if (vm_swap_max_budget(&freeze_daily_budget)) {
-                       memorystatus_freeze_daily_mb_max = (freeze_daily_budget / (1024 * 1024));
-                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: memorystatus_freeze_daily_mb_max set to %dMB\n", memorystatus_freeze_daily_mb_max);
-               }
+#endif /* DEVELOPMENT || DEBUG */
 
-               freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE);
+/*
+ * Prepare the process to be killed (set state, update snapshot) and kill it.
+ */
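+
+/* Count of kills avoided because purging alone relieved the pressure. */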
+static uint64_t memorystatus_purge_before_jetsam_success = 0;
 
-               daily_budget_pageouts =  (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS));
-               interval->max_pageouts = (interval->max_pageouts - interval->pageouts) + daily_budget_pageouts;
+static boolean_t
+memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, boolean_t *killed, uint64_t *footprint_of_killed_proc)
+{
+       pid_t aPid = 0;
+       uint32_t aPid_ep = 0;
 
-               interval->ts.tv_sec = interval->mins * 60;
-               interval->ts.tv_nsec = 0;
-               ADD_MACH_TIMESPEC(&interval->ts, &ts);
-               /* Since we update the throttle stats pre-freeze, adjust for overshoot here */
-               if (interval->pageouts > interval->max_pageouts) {
-                       interval->pageouts -= interval->max_pageouts;
-               } else {
-                       interval->pageouts = 0;
-               }
-               *budget_pages_allowed = interval->max_pageouts;
+       uint64_t        killtime = 0;
+       clock_sec_t     tv_sec;
+       clock_usec_t    tv_usec;
+       uint32_t        tv_msec;
+       boolean_t       retval = FALSE;
 
-               memorystatus_demote_frozen_processes();
-       } else {
-               /*
-                * Current throttle window.
-                * Deny freezing if we have no budget left.
-                * Try graceful degradation if we are within 25% of:
-                * - the daily budget, and
-                * - the current budget left is below our normal budget expectations.
-                */
+       aPid = p->p_pid;
+       aPid_ep = p->p_memstat_effectivepriority;
 
-#if DEVELOPMENT || DEBUG
+       if (cause != kMemorystatusKilledVnodes && cause != kMemorystatusKilledZoneMapExhaustion) {
                /*
-                * This can only happen in the INTERNAL configs because we allow modifying the daily budget for testing.
+                * Genuine memory pressure and not other (vnode/zone) resource exhaustion.
                 */
+               boolean_t success = FALSE;
+               uint64_t num_pages_purged;
+               uint64_t num_pages_reclaimed = 0;
+               uint64_t num_pages_unsecluded = 0;
 
-               if (freeze_daily_pageouts_max > interval->max_pageouts) {
+               networking_memstatus_callout(p, cause);
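+               /*
+                * Purge the process's purgeable memory up front; if that alone
+                * relieves the pressure (checked below), the kill is avoided.
+                */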
+               num_pages_purged = vm_purgeable_purge_task_owned(p->task);
+               num_pages_reclaimed += num_pages_purged;
+#if CONFIG_SECLUDED_MEMORY
+               if (cause == kMemorystatusKilledVMPageShortage &&
+                   vm_page_secluded_count > 0 &&
+                   task_can_use_secluded_mem(p->task, FALSE)) {
                        /*
-                        * We just bumped the daily budget. Re-evaluate our normal window params.
+                        * We're about to kill a process that has access
+                        * to the secluded pool.  Drain that pool into the
+                        * free or active queues to make these pages re-appear
+                        * as "available", which might make us no longer need
+                        * to kill that process.
+                        * Since the secluded pool does not get refilled while
+                        * a process has access to it, it should remain
+                        * drained.
                         */
-                       interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS));
-                       memorystatus_freeze_degradation = FALSE; //we'll re-evaluate this below...
+                       num_pages_unsecluded = vm_page_secluded_drain();
+                       num_pages_reclaimed += num_pages_unsecluded;
                }
-#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_SECLUDED_MEMORY */
 
-               if (memorystatus_freeze_degradation == FALSE) {
-                       if (interval->pageouts >= interval->max_pageouts) {
-                               *budget_pages_allowed = 0;
+               if (num_pages_reclaimed) {
+                       /*
+                        * We actually reclaimed something and so let's
+                        * check if we need to continue with the kill.
+                        */
+                       if (cause == kMemorystatusKilledHiwat) {
+                               uint64_t footprint_in_bytes = get_task_phys_footprint(p->task);
+                               uint64_t memlimit_in_bytes  = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL);  /* convert MB to bytes */
+                               success = (footprint_in_bytes <= memlimit_in_bytes);
                        } else {
-                               int budget_left = interval->max_pageouts - interval->pageouts;
-                               int budget_threshold = (freeze_daily_pageouts_max * FREEZE_DEGRADATION_BUDGET_THRESHOLD) / 100;
-
-                               mach_timespec_t time_left = {0, 0};
-
-                               time_left.tv_sec = interval->ts.tv_sec;
-                               time_left.tv_nsec = 0;
-
-                               SUB_MACH_TIMESPEC(&time_left, &ts);
-
-                               if (budget_left <= budget_threshold) {
+                               success = (memorystatus_avail_pages_below_pressure() == FALSE);
+#if CONFIG_SECLUDED_MEMORY
+                               if (!success && num_pages_unsecluded) {
                                        /*
-                                        * For the current normal window, calculate how much we would pageout in a DEGRADED_WINDOW_MINS duration.
-                                        * And also calculate what we would pageout for the same DEGRADED_WINDOW_MINS duration if we had the full
-                                        * daily pageout budget.
+                                        * We just drained the secluded pool
+                                        * because we're about to kill a
+                                        * process that has access to it.
+                                        * This is an important process and
+                                        * we'd rather not kill it unless
+                                        * absolutely necessary, so declare
+                                        * success even if draining the pool
+                                        * did not quite get us out of the
+                                        * "pressure" level but still got
+                                        * us out of the "critical" level.
                                         */
+                                       success = (memorystatus_avail_pages_below_critical() == FALSE);
+                               }
+#endif /* CONFIG_SECLUDED_MEMORY */
+                       }
 
-                                       unsigned int current_budget_rate_allowed = ((budget_left / time_left.tv_sec) / 60) * DEGRADED_WINDOW_MINS;
-                                       unsigned int normal_budget_rate_allowed = (freeze_daily_pageouts_max / NORMAL_WINDOW_MINS) * DEGRADED_WINDOW_MINS;
-
-                                       /*
-                                        * The current rate of pageouts is below what we would expect for
-                                        * the normal rate i.e. we have below normal budget left and so...
-                                        */
+                       if (success) {
+                               memorystatus_purge_before_jetsam_success++;
 
-                                       if (current_budget_rate_allowed < normal_budget_rate_allowed) {
-                                               memorystatus_freeze_degradation = TRUE;
-                                               degraded_throttle_window->max_pageouts = current_budget_rate_allowed;
-                                               degraded_throttle_window->pageouts = 0;
+                               os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: reclaimed %llu pages (%llu purged, %llu unsecluded) from pid %d [%s] and avoided %s\n",
+                                   num_pages_reclaimed, num_pages_purged, num_pages_unsecluded, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_kill_cause_name[cause]);
 
-                                               /*
-                                                * Switch over to the degraded throttle window so the budget
-                                                * doled out is based on that window.
-                                                */
-                                               interval = degraded_throttle_window;
-                                       }
-                               }
+                               *killed = FALSE;
 
-                               *budget_pages_allowed = interval->max_pageouts - interval->pageouts;
+                               return TRUE;
                        }
                }
        }
 
-       MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
-           interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60,
-           interval->throttle ? "on" : "off");
-}
+#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
+       MEMORYSTATUS_DEBUG(1, "jetsam: killing pid %d [%s] - %lld Mb > 1 (%d Mb)\n",
+           aPid, (*p->p_name ? p->p_name : "unknown"),
+           (get_task_phys_footprint(p->task) / (1024ULL * 1024ULL)),  /* converted bytes to MB */
+           p->p_memstat_memlimit);
+#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
 
-static void
-memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
-{
-       static boolean_t memorystatus_freeze_swap_low = FALSE;
+       killtime = mach_absolute_time();
+       absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
+       tv_msec = tv_usec / 1000;
 
-       lck_mtx_lock(&freezer_mutex);
+       proc_list_lock();
+       memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
+       proc_list_unlock();
 
-       if (memorystatus_freeze_enabled) {
-               if ((memorystatus_frozen_count < memorystatus_frozen_processes_max) ||
-                   (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD)) {
-                       if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
-                               /* Only freeze if we've not exceeded our pageout budgets.*/
-                               memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining);
+       char kill_reason_string[128];
 
-                               if (memorystatus_freeze_budget_pages_remaining) {
-                                       memorystatus_freeze_top_process();
-                               }
-                       }
+       if (cause == kMemorystatusKilledHiwat) {
+               strlcpy(kill_reason_string, "killing_highwater_process", 128);
+       } else {
+               if (aPid_ep == JETSAM_PRIORITY_IDLE) {
+                       strlcpy(kill_reason_string, "killing_idle_process", 128);
+               } else {
+                       strlcpy(kill_reason_string, "killing_top_process", 128);
                }
        }
 
        /*
-        * We use memorystatus_apps_idle_delay_time because if/when we adopt aging for applications,
-        * it'll tie neatly into running the freezer once we age an application.
-        *
-        * Till then, it serves as a good interval that can be tuned via a sysctl too.
+        * memorystatus_do_kill drops a reference, so take another one so we can
+        * continue to use this exit reason even after memorystatus_do_kill()
+        * returns
         */
-       memorystatus_freezer_thread_next_run_ts = mach_absolute_time() + memorystatus_apps_idle_delay_time;
+       os_reason_ref(jetsam_reason);
 
-       assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
-       lck_mtx_unlock(&freezer_mutex);
+       retval = memorystatus_do_kill(p, cause, jetsam_reason, footprint_of_killed_proc);
+       *killed = retval;
 
-       thread_block((thread_continue_t) memorystatus_freeze_thread);
+       os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: %s pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu",
+           (unsigned long)tv_sec, tv_msec, kill_reason_string,
+           aPid, ((p && *p->p_name) ? p->p_name : "unknown"),
+           memorystatus_kill_cause_name[cause], aPid_ep,
+           (*footprint_of_killed_proc) >> 10, (uint64_t)memorystatus_available_pages);
+
+       return retval;
 }
 
+/*
+ * Jetsam the first process in the queue.
+ */
 static boolean_t
-memorystatus_freeze_thread_should_run(void)
+memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason,
+    int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed)
 {
-       /*
-        * No freezer_mutex held here...see why near call-site
-        * within memorystatus_pages_update().
-        */
-
-       boolean_t should_run = FALSE;
-
-       if (memorystatus_freeze_enabled == FALSE) {
-               goto out;
-       }
-
-       if (memorystatus_available_pages > memorystatus_freeze_threshold) {
-               goto out;
-       }
+       pid_t aPid;
+       proc_t p = PROC_NULL, next_p = PROC_NULL;
+       boolean_t new_snapshot = FALSE, force_new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE;
+       unsigned int i = 0;
+       uint32_t aPid_ep;
+       int32_t local_max_kill_prio = JETSAM_PRIORITY_IDLE;
+       uint64_t footprint_of_killed_proc = 0;
 
-       if ((memorystatus_frozen_count >= memorystatus_frozen_processes_max) &&
-           (memorystatus_refreeze_eligible_count < MIN_THAW_REFREEZE_THRESHOLD)) {
-               goto out;
-       }
+#ifndef CONFIG_FREEZE
+#pragma unused(any)
+#endif
 
-       if (memorystatus_frozen_shared_mb_max && (memorystatus_frozen_shared_mb >= memorystatus_frozen_shared_mb_max)) {
-               goto out;
-       }
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
+           memorystatus_available_pages, 0, 0, 0, 0);
 
-       uint64_t curr_time = mach_absolute_time();
 
-       if (curr_time < memorystatus_freezer_thread_next_run_ts) {
-               goto out;
+#if CONFIG_JETSAM
+       if (sort_flag == TRUE) {
+               (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
        }
 
-       should_run = TRUE;
+       local_max_kill_prio = max_kill_priority;
 
-out:
-       return should_run;
-}
+       force_new_snapshot = FALSE;
 
-static int
-sysctl_memorystatus_do_fastwake_warmup_all  SYSCTL_HANDLER_ARGS
-{
-#pragma unused(oidp, req, arg1, arg2)
+#else /* CONFIG_JETSAM */
 
-       /* Need to be root or have entitlement */
-       if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
-               return EPERM;
+       if (sort_flag == TRUE) {
+               (void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT);
        }
 
-       if (memorystatus_freeze_enabled == FALSE) {
-               return ENOTSUP;
+       /*
+        * On macOS, we currently have only two reasons to be here:
+        *
+        * kMemorystatusKilledZoneMapExhaustion
+        * AND
+        * kMemorystatusKilledVMCompressorSpaceShortage
+        *
+        * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider
+        * any and all processes as eligible kill candidates since we need to avoid a panic.
+        *
+        * Since this function can be called asynchronously, it is difficult to
+        * toggle the max_kill_priority value before and after a call, so we use
+        * this local variable to cap the eligible kill bands.
+        */
+       if (cause == kMemorystatusKilledZoneMapExhaustion) {
+               local_max_kill_prio = JETSAM_PRIORITY_MAX;
+       } else {
+               local_max_kill_prio = max_kill_priority;
        }
 
-       do_fastwake_warmup_all();
-
-       return 0;
-}
-
-SYSCTL_PROC(_kern, OID_AUTO, memorystatus_do_fastwake_warmup_all, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
-    0, 0, &sysctl_memorystatus_do_fastwake_warmup_all, "I", "");
-
-#endif /* CONFIG_FREEZE */
-
-#if VM_PRESSURE_EVENTS
-
-#if CONFIG_MEMORYSTATUS
-
-static int
-memorystatus_send_note(int event_code, void *data, size_t data_length)
-{
-       int ret;
-       struct kev_msg ev_msg;
-
-       ev_msg.vendor_code    = KEV_VENDOR_APPLE;
-       ev_msg.kev_class      = KEV_SYSTEM_CLASS;
-       ev_msg.kev_subclass   = KEV_MEMORYSTATUS_SUBCLASS;
-
-       ev_msg.event_code     = event_code;
+       /*
+        * And, because we are here under extreme circumstances, we force a snapshot even for
+        * IDLE kills.
+        */
+       force_new_snapshot = TRUE;
 
-       ev_msg.dv[0].data_length = data_length;
-       ev_msg.dv[0].data_ptr = data;
-       ev_msg.dv[1].data_length = 0;
+#endif /* CONFIG_JETSAM */
 
-       ret = kev_post_msg(&ev_msg);
-       if (ret) {
-               printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
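+       /*
+        * A jetsam thread restricted to the low bands must not kill above
+        * JETSAM_PRIORITY_BACKGROUND, except when avoiding a zone-map
+        * exhaustion panic.
+        */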
+       if (cause != kMemorystatusKilledZoneMapExhaustion &&
+           jetsam_current_thread() != NULL &&
+           jetsam_current_thread()->limit_to_low_bands &&
+           local_max_kill_prio > JETSAM_PRIORITY_BACKGROUND) {
+               local_max_kill_prio = JETSAM_PRIORITY_BACKGROUND;
        }
 
-       return ret;
-}
+       proc_list_lock();
 
-boolean_t
-memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
-{
-       boolean_t ret = FALSE;
-       boolean_t found_knote = FALSE;
-       struct knote *kn = NULL;
-       int send_knote_count = 0;
+       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+       while (next_p && (next_p->p_memstat_effectivepriority <= local_max_kill_prio)) {
+               p = next_p;
+               next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
 
-       /*
-        * See comment in sysctl_memorystatus_vm_pressure_send.
-        */
 
-       memorystatus_klist_lock();
+               aPid = p->p_pid;
+               aPid_ep = p->p_memstat_effectivepriority;
 
-       SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
-               proc_t knote_proc = knote_get_kq(kn)->kq_p;
-               pid_t knote_pid = knote_proc->p_pid;
+               if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
+                       continue;   /* with lock held */
+               }
 
-               if (knote_pid == pid) {
+               if (cause == kMemorystatusKilledVnodes) {
                        /*
-                        * By setting the "fflags" here, we are forcing
-                        * a process to deal with the case where it's
-                        * bumping up into its memory limits. If we don't
-                        * do this here, we will end up depending on the
-                        * system pressure snapshot evaluation in
-                        * filt_memorystatus().
+                        * If the system runs out of vnodes, we systematically jetsam
+                        * processes in hopes of stumbling onto a vnode gain that helps
+                        * the system recover.  The process that happens to trigger
+                        * this path has no known relationship to the vnode shortage.
+                        * Deadlock avoidance: attempt to safeguard the caller.
                         */
 
-#if CONFIG_EMBEDDED
-                       if (!limit_exceeded) {
+                       if (p == current_proc()) {
+                               /* do not jetsam the current process */
+                               continue;
+                       }
+               }
+
+#if CONFIG_FREEZE
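+               /*
+                * Skip processes the freezer is currently operating on
+                * (P_MEMSTAT_LOCKED), unless the caller asked to consider
+                * any process.
+                */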
+               boolean_t skip;
+               boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED);
+               if (any || reclaim_proc) {
+                       skip = FALSE;
+               } else {
+                       skip = TRUE;
+               }
+
+               if (skip) {
+                       continue;
+               } else
+#endif
+               {
+                       if (proc_ref_locked(p) == p) {
                                /*
-                                * Intentionally set either the unambiguous limit warning,
-                                * the system-wide critical or the system-wide warning
-                                * notification bit.
+                                * Mark as terminated so that if exit1() indicates success, but the process (for example)
+                                * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
+                                * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
+                                * acquisition of the proc lock.
                                 */
-
-                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
-                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
-                                       found_knote = TRUE;
-                                       send_knote_count++;
-                               } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
-                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
-                                       found_knote = TRUE;
-                                       send_knote_count++;
-                               } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
-                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
-                                       found_knote = TRUE;
-                                       send_knote_count++;
-                               }
+                               p->p_memstat_state |= P_MEMSTAT_TERMINATED;
                        } else {
                                /*
-                                * Send this notification when a process has exceeded a soft limit.
+                                * We need to restart the search because
+                                * proc_ref_locked _can_ drop the proc_list lock
+                                * and we could have lost our stored next_p via
+                                * an exit() on another core.
                                 */
-                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
-                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
-                                       found_knote = TRUE;
-                                       send_knote_count++;
-                               }
+                               i = 0;
+                               next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+                               continue;
                        }
-#else /* CONFIG_EMBEDDED */
-                       if (!limit_exceeded) {
-                               /*
-                                * Processes on desktop are not expecting to handle a system-wide
-                                * critical or system-wide warning notification from this path.
-                                * Intentionally set only the unambiguous limit warning here.
-                                *
-                                * If the limit is soft, however, limit this to one notification per
-                                * active/inactive limit (per each registered listener).
-                                */
 
-                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
-                                       found_knote = TRUE;
-                                       if (!is_fatal) {
-                                               /*
-                                                * Restrict proc_limit_warn notifications when
-                                                * non-fatal (soft) limit is at play.
-                                                */
-                                               if (is_active) {
-                                                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
-                                                               /*
-                                                                * Mark this knote for delivery.
-                                                                */
-                                                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
-                                                               /*
-                                                                * And suppress it from future notifications.
-                                                                */
-                                                               kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
-                                                               send_knote_count++;
-                                                       }
-                                               } else {
-                                                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
-                                                               /*
-                                                                * Mark this knote for delivery.
-                                                                */
-                                                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
-                                                               /*
-                                                                * And suppress it from future notifications.
-                                                                */
-                                                               kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
-                                                               send_knote_count++;
-                                                       }
-                                               }
-                                       } else {
-                                               /*
-                                                * No restriction on proc_limit_warn notifications when
-                                                * fatal (hard) limit is at play.
-                                                */
-                                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
-                                               send_knote_count++;
-                                       }
-                               }
-                       } else {
-                               /*
-                                * Send this notification when a process has exceeded a soft limit,
-                                */
+                       /*
+                        * Capture a snapshot if none exists and any of the following holds:
+                        * - we are forcing a new snapshot creation, either because:
+                        *      - on a particular platform we need these snapshots every time, OR
+                        *      - a boot-arg/embedded device tree property has been set.
+                        * - priority was not requested (this is something other than an ambient kill)
+                        * - the priority was requested *and* the targeted process is not at idle priority
+                        */
+                       if ((memorystatus_jetsam_snapshot_count == 0) &&
+                           (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) {
+                               memorystatus_init_jetsam_snapshot_locked(NULL, 0);
+                               new_snapshot = TRUE;
+                       }
 
-                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
-                                       found_knote = TRUE;
-                                       if (!is_fatal) {
-                                               /*
-                                                * Restrict critical notifications for soft limits.
-                                                */
+                       proc_list_unlock();
 
-                                               if (is_active) {
-                                                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
-                                                               /*
-                                                                * Suppress future proc_limit_critical notifications
-                                                                * for the active soft limit.
-                                                                */
-                                                               kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
-                                                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
-                                                               send_knote_count++;
-                                                       }
-                                               } else {
-                                                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
-                                                               /*
-                                                                * Suppress future proc_limit_critical_notifications
-                                                                * for the inactive soft limit.
-                                                                */
-                                                               kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
-                                                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
-                                                               send_knote_count++;
-                                                       }
-                                               }
-                                       } else {
-                                               /*
-                                                * We should never be trying to send a critical notification for
-                                                * a hard limit... the process would be killed before it could be
-                                                * received.
-                                                */
-                                               panic("Caught sending pid %d a critical warning for a fatal limit.\n", pid);
+                       freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed, &footprint_of_killed_proc); /* purged and/or killed 'p' */
+                       /* Success? */
+                       if (freed_mem) {
+                               if (killed) {
+                                       *memory_reclaimed = footprint_of_killed_proc;
+                                       if (priority) {
+                                               *priority = aPid_ep;
                                        }
+                               } else {
+                                       /* purged */
+                                       proc_list_lock();
+                                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
+                                       proc_list_unlock();
                                }
+                               proc_rele(p);
+                               goto exit;
                        }
-#endif /* CONFIG_EMBEDDED */
-               }
-       }
 
-       if (found_knote) {
-               if (send_knote_count > 0) {
-                       KNOTE(&memorystatus_klist, 0);
+                       /*
+                        * Failure - first unwind the state,
+                        * then fall through to restart the search.
+                        */
+                       proc_list_lock();
+                       proc_rele_locked(p);
+                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
+                       p->p_memstat_state |= P_MEMSTAT_ERROR;
+                       *errors += 1;
+
+                       i = 0;
+                       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
                }
-               ret = TRUE;
        }
 
-       memorystatus_klist_unlock();
-
-       return ret;
-}
-
-/*
- * Can only be set by the current task on itself.
- */
-int
-memorystatus_low_mem_privileged_listener(uint32_t op_flags)
-{
-       boolean_t set_privilege = FALSE;
-       /*
-        * Need an entitlement check here?
-        */
-       if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
-               set_privilege = TRUE;
-       } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
-               set_privilege = FALSE;
-       } else {
-               return EINVAL;
-       }
+       proc_list_unlock();
 
-       return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
-}
+exit:
+       os_reason_free(jetsam_reason);
 
-int
-memorystatus_send_pressure_note(pid_t pid)
-{
-       MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
-       return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
-}
+       if (!killed) {
+               *memory_reclaimed = 0;
 
-void
-memorystatus_send_low_swap_note(void)
-{
-       struct knote *kn = NULL;
-
-       memorystatus_klist_lock();
-       SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
-               /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
-                * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
-                * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
-                * kMemoryStatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
-               if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
-                       KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
-                       break;
+               /* Clear snapshot if freshly captured and no target was found */
+               if (new_snapshot) {
+                       proc_list_lock();
+                       memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
+                       proc_list_unlock();
                }
        }
 
-       memorystatus_klist_unlock();
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
+           memorystatus_available_pages, killed ? aPid : 0, killed, *memory_reclaimed, 0);
+
+       return killed;
 }
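
The failure path above is the unwind discipline these kill loops share: clear the tentative P_MEMSTAT_TERMINATED mark, set P_MEMSTAT_ERROR so later passes skip the proc, bump the caller's error count, and restart the search, here from the first proc in the lowest band. A minimal standalone sketch of that pattern follows; the types, flag values, and list helper are simplified stand-ins, not the kernel's definitions.

#include <stdint.h>

#define P_MEMSTAT_TERMINATED 0x1        /* stand-in value */
#define P_MEMSTAT_ERROR      0x2        /* stand-in value */

struct proc { uint32_t p_memstat_state; };

/* Illustrative stand-in for memorystatus_get_first_proc_locked(). */
extern struct proc *first_proc_locked(unsigned int *band);

static struct proc *
kill_failed_unwind(struct proc *p, uint32_t *errors, unsigned int *band)
{
        /* Undo the tentative TERMINATED mark and flag the proc as errored
         * so this pass will not retry it endlessly. */
        p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
        p->p_memstat_state |= P_MEMSTAT_ERROR;
        *errors += 1;

        /* Restart the scan from the beginning of the lowest band. */
        *band = 0;
        return first_proc_locked(band);
}
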
 
-boolean_t
-memorystatus_bg_pressure_eligible(proc_t p)
+/*
+ * Jetsam aggressively
+ */
+static boolean_t
+memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count,
+    int32_t priority_max, uint32_t *errors, uint64_t *memory_reclaimed)
 {
-       boolean_t eligible = FALSE;
-
-       proc_list_lock();
+       pid_t aPid;
+       proc_t p = PROC_NULL, next_p = PROC_NULL;
+       boolean_t new_snapshot = FALSE, killed = FALSE;
+       int kill_count = 0;
+       unsigned int i = 0;
+       int32_t aPid_ep = 0;
+       unsigned int memorystatus_level_snapshot = 0;
+       uint64_t killtime = 0;
+       clock_sec_t     tv_sec;
+       clock_usec_t    tv_usec;
+       uint32_t        tv_msec;
+       os_reason_t jetsam_reason = OS_REASON_NULL;
+       uint64_t footprint_of_killed_proc = 0;
 
-       MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);
+       *memory_reclaimed = 0;
 
-       /* Foreground processes have already been dealt with at this point, so just test for eligibility */
-       if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
-               eligible = TRUE;
-       }
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
+           memorystatus_available_pages, priority_max, 0, 0, 0);
 
-       if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
+       if (priority_max >= JETSAM_PRIORITY_FOREGROUND) {
                /*
-                * IDLE and IDLE_DEFERRED bands contain processes
-                * that have dropped memory to be under their inactive
-                * memory limits. And so they can't really give back
-                * anything.
+                * Check if aggressive jetsam has been asked to kill up to or beyond the
+                * JETSAM_PRIORITY_FOREGROUND bucket. If yes, sort the FG band based on
+                * coalition footprint.
                 */
-               eligible = FALSE;
+               memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
        }
 
-       proc_list_unlock();
-
-       return eligible;
-}
-
-boolean_t
-memorystatus_is_foreground_locked(proc_t p)
-{
-       return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
-              (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
-}
-
-/*
- * This is meant for stackshot and kperf -- it does not take the proc_list_lock
- * to access the p_memstat_dirty field.
- */
-void
-memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
-{
-       if (!v) {
-               *is_dirty = FALSE;
-               *is_dirty_tracked = FALSE;
-               *allow_idle_exit = FALSE;
-       } else {
-               proc_t p = (proc_t)v;
-               *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
-               *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
-               *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
+       jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause);
+       if (jetsam_reason == OS_REASON_NULL) {
+               printf("memorystatus_kill_processes_aggressive: failed to allocate exit reason\n");
        }
-}
-
-#endif /* CONFIG_MEMORYSTATUS */
-
-/*
- * Trigger levels to test the mechanism.
- * Can be used via a sysctl.
- */
-#define TEST_LOW_MEMORY_TRIGGER_ONE             1
-#define TEST_LOW_MEMORY_TRIGGER_ALL             2
-#define TEST_PURGEABLE_TRIGGER_ONE              3
-#define TEST_PURGEABLE_TRIGGER_ALL              4
-#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE   5
-#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL   6
-
-boolean_t               memorystatus_manual_testing_on = FALSE;
-vm_pressure_level_t     memorystatus_manual_testing_level = kVMPressureNormal;
-
-extern struct knote *
-vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t);
 
+       proc_list_lock();
 
-#define VM_PRESSURE_NOTIFY_WAIT_PERIOD          10000   /* milliseconds */
-
-#if DEBUG
-#define VM_PRESSURE_DEBUG(cond, format, ...)      \
-do {                                              \
-       if (cond) { printf(format, ##__VA_ARGS__); } \
-} while(0)
-#else
-#define VM_PRESSURE_DEBUG(cond, format, ...)
-#endif
-
-#define INTER_NOTIFICATION_DELAY        (250000)        /* .25 second */
-
-void
-memorystatus_on_pageout_scan_end(void)
-{
-       /* No-op */
-}
+       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+       while (next_p) {
+               if (((next_p->p_listflag & P_LIST_EXITED) != 0) ||
+                   ((unsigned int)(next_p->p_memstat_effectivepriority) != i)) {
+                       /*
+                        * We have raced with next_p running on another core.
+                        * It may be exiting or it may have moved to a different
+                        * jetsam priority band.  This means we have lost our
+                        * place in line while traversing the jetsam list.  We
+                        * attempt to recover by rewinding to the beginning of the band
+                        * we were already traversing.  By doing this, we do not guarantee
+                        * that no process escapes this aggressive march, but we can make
+                        * skipping an entire range of processes less likely. (PR-21069019)
+                        */
 
-/*
- * kn_max - knote
- *
- * knote_pressure_level - to check if the knote is registered for this notification level.
- *
- * task        - task whose bits we'll be modifying
- *
- * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
- *
- * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
- *
- */
+                       MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n",
+                           aggr_count, i, (*next_p->p_name ? next_p->p_name : "unknown"), next_p->p_pid);
 
-boolean_t
-is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
-{
-       if (kn_max->kn_sfflags & knote_pressure_level) {
-               if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
-                       task_clear_has_been_notified(task, pressure_level_to_clear);
+                       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+                       continue;
                }
 
-               task_mark_has_been_notified(task, pressure_level_to_set);
-               return TRUE;
-       }
-
-       return FALSE;
-}
-
-void
-memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
-{
-       struct knote *kn = NULL;
-
-       memorystatus_klist_lock();
-       SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
-               proc_t                  p = PROC_NULL;
-               struct task*            t = TASK_NULL;
+               p = next_p;
+               next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
 
-               p = knote_get_kq(kn)->kq_p;
-               proc_list_lock();
-               if (p != proc_ref_locked(p)) {
-                       p = PROC_NULL;
+               if (p->p_memstat_effectivepriority > priority_max) {
+                       /*
+                        * Bail out of this killing spree if we have
+                        * reached beyond the priority_max jetsam band.
+                        * That is, we kill up to and through the
+                        * priority_max jetsam band.
+                        */
                        proc_list_unlock();
-                       continue;
+                       goto exit;
                }
-               proc_list_unlock();
-
-               t = (struct task *)(p->task);
-
-               task_clear_has_been_notified(t, pressure_level_to_clear);
-
-               proc_rele(p);
-       }
-
-       memorystatus_klist_unlock();
-}
-
-extern kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process);
-
-struct knote *
-vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process);
-
-/*
- * Used by the vm_pressure_thread which is
- * signalled from within vm_pageout_scan().
- */
-static void vm_dispatch_memory_pressure(void);
-void consider_vm_pressure_events(void);
 
-void
-consider_vm_pressure_events(void)
-{
-       vm_dispatch_memory_pressure();
-}
-static void
-vm_dispatch_memory_pressure(void)
-{
-       memorystatus_update_vm_pressure(FALSE);
-}
-
-extern vm_pressure_level_t
-convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
+               aPid = p->p_pid;
+               aPid_ep = p->p_memstat_effectivepriority;
 
-struct knote *
-vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process)
-{
-       struct knote    *kn = NULL, *kn_max = NULL;
-       uint64_t        resident_max = 0;       /* MB */
-       struct timeval  curr_tstamp = {0, 0};
-       int             elapsed_msecs = 0;
-       int             selected_task_importance = 0;
-       static int      pressure_snapshot = -1;
-       boolean_t       pressure_increase = FALSE;
+               if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
+                       continue;
+               }
 
-       if (pressure_snapshot == -1) {
                /*
-                * Initial snapshot.
+                * Capture a snapshot if none exists.
                 */
-               pressure_snapshot = level;
-               pressure_increase = TRUE;
-       } else {
-               if (level && (level >= pressure_snapshot)) {
-                       pressure_increase = TRUE;
-               } else {
-                       pressure_increase = FALSE;
+               if (memorystatus_jetsam_snapshot_count == 0) {
+                       memorystatus_init_jetsam_snapshot_locked(NULL, 0);
+                       new_snapshot = TRUE;
                }
 
-               pressure_snapshot = level;
-       }
-
-       if (pressure_increase == TRUE) {
-               /*
-                * We'll start by considering the largest
-                * unimportant task in our list.
-                */
-               selected_task_importance = INT_MAX;
-       } else {
                /*
-                * We'll start by considering the largest
-                * important task in our list.
+                * Mark as terminated so that if exit1() indicates success, but the process (for example)
+                * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
+                * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
+                * acquisition of the proc lock.
                 */
-               selected_task_importance = 0;
-       }
+               p->p_memstat_state |= P_MEMSTAT_TERMINATED;
 
-       microuptime(&curr_tstamp);
+               killtime = mach_absolute_time();
+               absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
+               tv_msec = tv_usec / 1000;
 
-       SLIST_FOREACH(kn, candidate_list, kn_selnext) {
-               uint64_t                resident_size = 0;      /* MB */
-               proc_t                  p = PROC_NULL;
-               struct task*            t = TASK_NULL;
-               int                     curr_task_importance = 0;
-               boolean_t               consider_knote = FALSE;
-               boolean_t               privileged_listener = FALSE;
+               /* Shift queue, update stats */
+               memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
 
-               p = knote_get_kq(kn)->kq_p;
-               proc_list_lock();
-               if (p != proc_ref_locked(p)) {
-                       p = PROC_NULL;
-                       proc_list_unlock();
-                       continue;
-               }
-               proc_list_unlock();
+               /*
+                * In order to kill the target process, we will drop the proc_list_lock.
+                * To guarantee that p and next_p don't disappear out from under the lock,
+                * we must take a ref on both.
+                * If we cannot get a reference, then it's likely we've raced with
+                * that process exiting on another core.
+                */
+               if (proc_ref_locked(p) == p) {
+                       if (next_p) {
+                               while (next_p && (proc_ref_locked(next_p) != next_p)) {
+                                       proc_t temp_p;
 
-#if CONFIG_MEMORYSTATUS
-               if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
-                       /*
-                        * Skip process not marked foreground.
-                        */
-                       proc_rele(p);
-                       continue;
-               }
-#endif /* CONFIG_MEMORYSTATUS */
+                                       /*
+                                        * We must have raced with next_p exiting on another core.
+                                        * Recover by getting the next eligible process in the band.
+                                        */
 
-               t = (struct task *)(p->task);
+                                       MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
+                                           aggr_count, next_p->p_pid, (*next_p->p_name ? next_p->p_name : "(unknown)"));
 
-               timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp);
-               elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
+                                       temp_p = next_p;
+                                       next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE);
+                               }
+                       }
+                       proc_list_unlock();
 
-               vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);
+                       printf("%lu.%03d memorystatus: %s%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
+                           (unsigned long)tv_sec, tv_msec,
+                           ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"),
+                           aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"),
+                           memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)memorystatus_available_pages);
 
-               if ((kn->kn_sfflags & dispatch_level) == 0) {
-                       proc_rele(p);
-                       continue;
-               }
+                       memorystatus_level_snapshot = memorystatus_level;
 
-#if CONFIG_MEMORYSTATUS
-               if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
-                       VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid);
-                       proc_rele(p);
-                       continue;
-               }
-#endif /* CONFIG_MEMORYSTATUS */
+                       /*
+                        * memorystatus_do_kill() drops a reference, so take another one so we can
+                        * continue to use this exit reason even after memorystatus_do_kill()
+                        * returns.
+                        */
+                       os_reason_ref(jetsam_reason);
+                       killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
 
-#if CONFIG_EMBEDDED
-               curr_task_importance = p->p_memstat_effectivepriority;
-#else /* CONFIG_EMBEDDED */
-               curr_task_importance = task_importance_estimate(t);
-#endif /* CONFIG_EMBEDDED */
+                       /* Success? */
+                       if (killed) {
+                               *memory_reclaimed += footprint_of_killed_proc;
+                               proc_rele(p);
+                               kill_count++;
+                               p = NULL;
+                               killed = FALSE;
 
-               /*
-                * Privileged listeners are only considered in the multi-level pressure scheme
-                * AND only if the pressure is increasing.
-                */
-               if (level > 0) {
-                       if (task_has_been_notified(t, level) == FALSE) {
                                /*
-                                * Is this a privileged listener?
+                                * Continue the killing spree.
                                 */
-                               if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
-                                       if (privileged_listener) {
-                                               kn_max = kn;
-                                               proc_rele(p);
-                                               goto done_scanning;
+                               proc_list_lock();
+                               if (next_p) {
+                                       proc_rele_locked(next_p);
+                               }
+
+                               if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) {
+                                       if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) {
+#if DEVELOPMENT || DEBUG
+                                               printf("Disabling Lenient mode after one-time deployment.\n");
+#endif /* DEVELOPMENT || DEBUG */
+                                               memorystatus_aggressive_jetsam_lenient = FALSE;
+                                               break;
                                        }
                                }
-                       } else {
-                               proc_rele(p);
+
                                continue;
                        }
-               } else if (level == 0) {
+
                        /*
-                        * Task wasn't notified when the pressure was increasing and so
-                        * no need to notify it that the pressure is decreasing.
+                        * Failure - first unwind the state,
+                        * then fall through to restart the search.
                         */
-                       if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
-                               proc_rele(p);
-                               continue;
+                       proc_list_lock();
+                       proc_rele_locked(p);
+                       if (next_p) {
+                               proc_rele_locked(next_p);
                        }
+                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
+                       p->p_memstat_state |= P_MEMSTAT_ERROR;
+                       *errors += 1;
+                       p = NULL;
                }
 
                /*
-                * We don't want a small process to block large processes from
-                * being notified again. <rdar://problem/7955532>
+                * Failure - restart the search at the beginning of
+                * the band we were already traversing.
+                *
+                * We might have raced with "p" exiting on another core, resulting in no
+                * ref on "p".  Or, we may have failed to kill "p".
+                *
+                * Either way, we fall thru to here, leaving the proc in the
+                * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
+                *
+                * And, we hold the proc_list_lock at this point.
                 */
-               resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL);  /* MB */
 
-               if (resident_size >= vm_pressure_task_footprint_min) {
-                       if (level > 0) {
-                               /*
-                                * Warning or Critical Pressure.
-                                */
-                               if (pressure_increase) {
-                                       if ((curr_task_importance < selected_task_importance) ||
-                                           ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
-                                               /*
-                                                * We have found a candidate process which is:
-                                                * a) at a lower importance than the current selected process
-                                                * OR
-                                                * b) has importance equal to that of the current selected process but is larger
-                                                */
+               next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+       }
 
-                                               consider_knote = TRUE;
-                                       }
-                               } else {
-                                       if ((curr_task_importance > selected_task_importance) ||
-                                           ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
-                                               /*
-                                                * We have found a candidate process which is:
-                                                * a) at a higher importance than the current selected process
-                                                * OR
-                                                * b) has importance equal to that of the current selected process but is larger
-                                                */
+       proc_list_unlock();
 
-                                               consider_knote = TRUE;
-                                       }
-                               }
-                       } else if (level == 0) {
-                               /*
-                                * Pressure back to normal.
-                                */
-                               if ((curr_task_importance > selected_task_importance) ||
-                                   ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
-                                       consider_knote = TRUE;
-                               }
-                       }
+exit:
+       os_reason_free(jetsam_reason);
 
-                       if (consider_knote) {
-                               resident_max = resident_size;
-                               kn_max = kn;
-                               selected_task_importance = curr_task_importance;
-                               consider_knote = FALSE; /* reset for the next candidate */
-                       }
-               } else {
-                       /* There was no candidate with enough resident memory to scavenge */
-                       VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", p->p_pid, resident_size);
-               }
-               proc_rele(p);
+       /* Clear snapshot if freshly captured and no target was found */
+       if (new_snapshot && (kill_count == 0)) {
+               proc_list_lock();
+               memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
+               proc_list_unlock();
        }
 
-done_scanning:
-       if (kn_max) {
-               VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, knote_get_kq(kn_max)->kq_p->p_pid, resident_max, 0, 0);
-               VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", knote_get_kq(kn_max)->kq_p->p_pid, resident_max);
-       }
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
+           memorystatus_available_pages, 0, kill_count, *memory_reclaimed, 0);
 
-       return kn_max;
+       if (kill_count > 0) {
+               return TRUE;
+       } else {
+               return FALSE;
+       }
 }
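
One subtlety in the loop above: lenient-mode aggressive jetsam is self-limiting. memorystatus_level is sampled just before each kill, and once the level has risen by at least AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD while killing in the FOREGROUND band, lenient mode disables itself and the spree breaks out. A small sketch of that stop predicate, assuming a placeholder threshold value (the real tunable is defined elsewhere in this file):

#include <stdbool.h>

#define AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD 25     /* placeholder value */

/*
 * 'level_now' and 'level_at_kill' model memorystatus_level, a measure of
 * available memory; returns true once enough memory has been recovered
 * since the kill that a lenient-mode spree should stand down.
 */
static bool
lenient_spree_should_stop(unsigned int level_now, unsigned int level_at_kill)
{
        return level_now > level_at_kill &&
            (level_now - level_at_kill) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD;
}
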
 
-#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD          5000    /* milliseconds */
-#define WARNING_NOTIFICATION_RESTING_PERIOD             25      /* seconds */
-#define CRITICAL_NOTIFICATION_RESTING_PERIOD            25      /* seconds */
+static boolean_t
+memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed)
+{
+       pid_t aPid = 0;
+       proc_t p = PROC_NULL, next_p = PROC_NULL;
+       boolean_t new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE;
+       unsigned int i = 0;
+       uint32_t aPid_ep;
+       os_reason_t jetsam_reason = OS_REASON_NULL;
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
+           memorystatus_available_pages, 0, 0, 0, 0);
 
-uint64_t next_warning_notification_sent_at_ts = 0;
-uint64_t next_critical_notification_sent_at_ts = 0;
+       jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
+       if (jetsam_reason == OS_REASON_NULL) {
+               printf("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n");
+       }
 
-kern_return_t
-memorystatus_update_vm_pressure(boolean_t target_foreground_process)
-{
-       struct knote                    *kn_max = NULL;
-       struct knote                    *kn_cur = NULL, *kn_temp = NULL;  /* for safe list traversal */
-       pid_t                           target_pid = -1;
-       struct klist                    dispatch_klist = { NULL };
-       proc_t                          target_proc = PROC_NULL;
-       struct task                     *task = NULL;
-       boolean_t                       found_candidate = FALSE;
+       proc_list_lock();
 
-       static vm_pressure_level_t      level_snapshot = kVMPressureNormal;
-       static vm_pressure_level_t      prev_level_snapshot = kVMPressureNormal;
-       boolean_t                       smoothing_window_started = FALSE;
-       struct timeval                  smoothing_window_start_tstamp = {0, 0};
-       struct timeval                  curr_tstamp = {0, 0};
-       int                             elapsed_msecs = 0;
-       uint64_t                        curr_ts = mach_absolute_time();
+       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+       while (next_p) {
+               uint64_t footprint_in_bytes = 0;
+               uint64_t memlimit_in_bytes  = 0;
+               boolean_t skip = 0;
 
-#if !CONFIG_JETSAM
-#define MAX_IDLE_KILLS 100      /* limit the number of idle kills allowed */
+               p = next_p;
+               next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
 
-       int     idle_kill_counter = 0;
+               aPid = p->p_pid;
+               aPid_ep = p->p_memstat_effectivepriority;
 
-       /*
-        * On desktop we take this opportunity to free up memory pressure
-        * by immediately killing idle exitable processes. We use a delay
-        * to avoid overkill.  And we impose a max counter as a fail safe
-        * in case daemons re-launch too fast.
-        */
-       while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
-               if (memorystatus_idle_exit_from_VM() == FALSE) {
-                       /* No idle exitable processes left to kill */
-                       break;
+               if (p->p_memstat_state  & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
+                       continue;
                }
-               idle_kill_counter++;
 
-               if (memorystatus_manual_testing_on == TRUE) {
-                       /*
-                        * Skip the delay when testing
-                        * the pressure notification scheme.
-                        */
-               } else {
-                       delay(1000000);    /* 1 second */
+               /* skip if no limit set */
+               if (p->p_memstat_memlimit <= 0) {
+                       continue;
                }
-       }
-#endif /* !CONFIG_JETSAM */
 
-       if (level_snapshot != kVMPressureNormal) {
-               /*
-                * Check to see if we are still in the 'resting' period
-                * after having notified all clients interested in
-                * a particular pressure level.
-                */
+               footprint_in_bytes = get_task_phys_footprint(p->task);
+               memlimit_in_bytes  = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL);   /* convert MB to bytes */
+               skip = (footprint_in_bytes <= memlimit_in_bytes);
+
+#if CONFIG_FREEZE
+               if (!skip && (p->p_memstat_state & P_MEMSTAT_LOCKED)) {
+                       skip = TRUE;
+               }
+#endif
+
+               if (skip) {
+                       continue;
+               } else {
+                       if (memorystatus_jetsam_snapshot_count == 0) {
+                               memorystatus_init_jetsam_snapshot_locked(NULL, 0);
+                               new_snapshot = TRUE;
+                       }
+
+                       if (proc_ref_locked(p) == p) {
+                               /*
+                                * Mark as terminated so that if exit1() indicates success, but the process (for example)
+                                * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
+                                * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
+                                * acquisition of the proc lock.
+                                */
+                               p->p_memstat_state |= P_MEMSTAT_TERMINATED;
+
+                               proc_list_unlock();
+                       } else {
+                               /*
+                                * We need to restart the search because
+                                * proc_ref_locked _can_ drop the proc_list lock
+                                * and we could have lost our stored next_p via
+                                * an exit() on another core.
+                                */
+                               i = 0;
+                               next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+                               continue;
+                       }
 
-               level_snapshot = memorystatus_vm_pressure_level;
+                       footprint_in_bytes = 0;
+                       freed_mem = memorystatus_kill_proc(p, kMemorystatusKilledHiwat, jetsam_reason, &killed, &footprint_in_bytes); /* purged and/or killed 'p' */
+
+                       /* Success? */
+                       if (freed_mem) {
+                               if (killed == FALSE) {
+                                       /* purged 'p'; don't reset HWM candidate count */
+                                       *purged = TRUE;
 
-               if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
-                       if (next_warning_notification_sent_at_ts) {
-                               if (curr_ts < next_warning_notification_sent_at_ts) {
-                                       delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
-                                       return KERN_SUCCESS;
+                                       proc_list_lock();
+                                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
+                                       proc_list_unlock();
+                               } else {
+                                       *memory_reclaimed = footprint_in_bytes;
                                }
+                               proc_rele(p);
+                               goto exit;
+                       }
+                       /*
+                        * Failure - first unwind the state,
+                        * then fall through to restart the search.
+                        */
+                       proc_list_lock();
+                       proc_rele_locked(p);
+                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
+                       p->p_memstat_state |= P_MEMSTAT_ERROR;
+                       *errors += 1;
 
-                               next_warning_notification_sent_at_ts = 0;
-                               memorystatus_klist_reset_all_for_level(kVMPressureWarning);
-                       }
-               } else if (level_snapshot == kVMPressureCritical) {
-                       if (next_critical_notification_sent_at_ts) {
-                               if (curr_ts < next_critical_notification_sent_at_ts) {
-                                       delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
-                                       return KERN_SUCCESS;
-                               }
-                               next_critical_notification_sent_at_ts = 0;
-                               memorystatus_klist_reset_all_for_level(kVMPressureCritical);
-                       }
+                       i = 0;
+                       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
                }
        }
 
-       while (1) {
-               /*
-                * There is a race window here. But it's not clear
-                * how much we benefit from having extra synchronization.
-                */
-               level_snapshot = memorystatus_vm_pressure_level;
+       proc_list_unlock();
 
-               if (prev_level_snapshot > level_snapshot) {
-                       /*
-                        * Pressure decreased? Let's take a little breather
-                        * and see if this condition stays.
-                        */
-                       if (smoothing_window_started == FALSE) {
-                               smoothing_window_started = TRUE;
-                               microuptime(&smoothing_window_start_tstamp);
-                       }
+exit:
+       os_reason_free(jetsam_reason);
 
-                       microuptime(&curr_tstamp);
-                       timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
-                       elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
+       if (!killed) {
+               *memory_reclaimed = 0;
 
-                       if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
-                               delay(INTER_NOTIFICATION_DELAY);
-                               continue;
-                       }
+               /* Clear snapshot if freshly captured and no target was found */
+               if (new_snapshot) {
+                       proc_list_lock();
+                       memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
+                       proc_list_unlock();
                }
+       }
+
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
+           memorystatus_available_pages, killed ? aPid : 0, killed, *memory_reclaimed, 0);
 
-               prev_level_snapshot = level_snapshot;
-               smoothing_window_started = FALSE;
+       return killed;
+}
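
Candidate selection in memorystatus_kill_hiwat_proc() reduces to a single comparison: the per-process limit (p_memstat_memlimit, stored in MB) scaled to bytes against the task's live physical footprint, with non-positive limits meaning no limit is set. A standalone sketch of that test; the function name is illustrative:

#include <stdbool.h>
#include <stdint.h>

/*
 * Model of the high-watermark test above: the stored limit is in MB and
 * the live footprint is in bytes, so scale the limit before comparing.
 */
static bool
over_highwater(int32_t memlimit_mb, uint64_t footprint_in_bytes)
{
        if (memlimit_mb <= 0) {
                return false;   /* no limit set: skip this proc */
        }
        uint64_t memlimit_in_bytes = (uint64_t)memlimit_mb * 1024ULL * 1024ULL;
        return footprint_in_bytes > memlimit_in_bytes;
}
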
 
-               memorystatus_klist_lock();
-               kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);
+/*
+ * Jetsam a process pinned in the elevated band.
+ *
+ * Return:  true -- a pinned process was jetsammed
+ *         false -- no pinned process was jetsammed
+ */
+boolean_t
+memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors, uint64_t *memory_reclaimed)
+{
+       pid_t aPid = 0;
+       proc_t p = PROC_NULL, next_p = PROC_NULL;
+       boolean_t new_snapshot = FALSE, killed = FALSE;
+       int kill_count = 0;
+       uint32_t aPid_ep;
+       uint64_t killtime = 0;
+       clock_sec_t     tv_sec;
+       clock_usec_t    tv_usec;
+       uint32_t        tv_msec;
+       uint64_t footprint_of_killed_proc = 0;
 
-               if (kn_max == NULL) {
-                       memorystatus_klist_unlock();
 
-                       /*
-                        * No more level-based clients to notify.
-                        *
-                        * Start the 'resting' window within which clients will not be re-notified.
-                        */
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
+           memorystatus_available_pages, 0, 0, 0, 0);
 
-                       if (level_snapshot != kVMPressureNormal) {
-                               if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
-                                       nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
+#if CONFIG_FREEZE
+       boolean_t consider_frozen_only = FALSE;
 
-                                       /* Next warning notification (if nothing changes) won't be sent before...*/
-                                       next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
-                               }
+       if (band == (unsigned int) memorystatus_freeze_jetsam_band) {
+               consider_frozen_only = TRUE;
+       }
+#endif /* CONFIG_FREEZE */
 
-                               if (level_snapshot == kVMPressureCritical) {
-                                       nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
+       proc_list_lock();
 
-                                       /* Next critical notification (if nothing changes) won't be sent before...*/
-                                       next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
-                               }
-                       }
-                       return KERN_FAILURE;
-               }
+       next_p = memorystatus_get_first_proc_locked(&band, FALSE);
+       while (next_p) {
+               p = next_p;
+               next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
 
-               target_proc = knote_get_kq(kn_max)->kq_p;
+               aPid = p->p_pid;
+               aPid_ep = p->p_memstat_effectivepriority;
 
-               proc_list_lock();
-               if (target_proc != proc_ref_locked(target_proc)) {
-                       target_proc = PROC_NULL;
-                       proc_list_unlock();
-                       memorystatus_klist_unlock();
+               /*
+                * Only pick a process pinned in this elevated band
+                */
+               if (!(p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
                        continue;
                }
-               proc_list_unlock();
-
-               target_pid = target_proc->p_pid;
-
-               task = (struct task *)(target_proc->task);
-
-               if (level_snapshot != kVMPressureNormal) {
-                       if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
-                               if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
-                                       found_candidate = TRUE;
-                               }
-                       } else {
-                               if (level_snapshot == kVMPressureCritical) {
-                                       if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
-                                               found_candidate = TRUE;
-                                       }
-                               }
-                       }
-               } else {
-                       if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
-                               task_clear_has_been_notified(task, kVMPressureWarning);
-                               task_clear_has_been_notified(task, kVMPressureCritical);
 
-                               found_candidate = TRUE;
-                       }
+               if (p->p_memstat_state  & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
+                       continue;
                }
 
-               if (found_candidate == FALSE) {
-                       proc_rele(target_proc);
-                       memorystatus_klist_unlock();
+#if CONFIG_FREEZE
+               if (consider_frozen_only && !(p->p_memstat_state & P_MEMSTAT_FROZEN)) {
                        continue;
                }
 
-               SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
-                       int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);
-
-                       if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
-                               proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
-                               pid_t knote_pid = knote_proc->p_pid;
-                               if (knote_pid == target_pid) {
-                                       KNOTE_DETACH(&memorystatus_klist, kn_cur);
-                                       KNOTE_ATTACH(&dispatch_klist, kn_cur);
-                               }
-                       }
+               if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
+                       continue;
                }
+#endif /* CONFIG_FREEZE */
 
-               KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
+#if DEVELOPMENT || DEBUG
+               MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
+                   aggr_count,
+                   aPid, (*p->p_name ? p->p_name : "unknown"),
+                   memorystatus_available_pages);
+#endif /* DEVELOPMENT || DEBUG */
 
-               SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
-                       KNOTE_DETACH(&dispatch_klist, kn_cur);
-                       KNOTE_ATTACH(&memorystatus_klist, kn_cur);
+               if (memorystatus_jetsam_snapshot_count == 0) {
+                       memorystatus_init_jetsam_snapshot_locked(NULL, 0);
+                       new_snapshot = TRUE;
                }
 
-               memorystatus_klist_unlock();
+               p->p_memstat_state |= P_MEMSTAT_TERMINATED;
+
+               killtime = mach_absolute_time();
+               absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
+               tv_msec = tv_usec / 1000;
 
-               microuptime(&target_proc->vm_pressure_last_notify_tstamp);
-               proc_rele(target_proc);
+               memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
 
-               if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
-                       break;
-               }
+               if (proc_ref_locked(p) == p) {
+                       proc_list_unlock();
 
-               if (memorystatus_manual_testing_on == TRUE) {
                        /*
-                        * Testing out the pressure notification scheme.
-                        * No need for delays etc.
+                        * memorystatus_do_kill() drops a reference, so take another one so we can
+                        * continue to use this exit reason even after memorystatus_do_kill()
+                        * returns.
                         */
-               } else {
-                       uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
-#if CONFIG_JETSAM
-                       unsigned int page_delta = 0;
-                       unsigned int skip_delay_page_threshold = 0;
-
-                       assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
+                       os_reason_ref(jetsam_reason);
+                       killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
 
-                       page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
-                       skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n",
+                           (unsigned long)tv_sec, tv_msec,
+                           aggr_count,
+                           aPid, ((p && *p->p_name) ? p->p_name : "unknown"),
+                           memorystatus_kill_cause_name[cause], aPid_ep,
+                           footprint_of_killed_proc >> 10, (uint64_t)memorystatus_available_pages);
 
-                       if (memorystatus_available_pages <= skip_delay_page_threshold) {
-                               /*
-                                * We are nearing the critcal mark fast and can't afford to wait between
-                                * notifications.
-                                */
-                               sleep_interval = 0;
+                       /* Success? */
+                       if (killed) {
+                               *memory_reclaimed = footprint_of_killed_proc;
+                               proc_rele(p);
+                               kill_count++;
+                               goto exit;
                        }
-#endif /* CONFIG_JETSAM */
 
-                       if (sleep_interval) {
-                               delay(sleep_interval);
-                       }
+                       /*
+                        * Failure - first unwind the state,
+                        * then fall through to restart the search.
+                        */
+                       proc_list_lock();
+                       proc_rele_locked(p);
+                       p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
+                       p->p_memstat_state |= P_MEMSTAT_ERROR;
+                       *errors += 1;
                }
-       }
-
-       return KERN_SUCCESS;
-}
 
-vm_pressure_level_t
-convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
-{
-       vm_pressure_level_t     dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
+               /*
+                * Failure - restart the search.
+                *
+                * We might have raced with "p" exiting on another core, resulting in no
+                * ref on "p".  Or, we may have failed to kill "p".
+                *
+                * Either way, we fall thru to here, leaving the proc in the
+                * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state.
+                *
+                * And, we hold the proc_list_lock at this point.
+                */
 
-       switch (internal_pressure_level) {
-       case kVMPressureNormal:
-       {
-               dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
-               break;
+               next_p = memorystatus_get_first_proc_locked(&band, FALSE);
        }
 
-       case kVMPressureWarning:
-       case kVMPressureUrgent:
-       {
-               dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
-               break;
-       }
+       proc_list_unlock();
 
-       case kVMPressureCritical:
-       {
-               dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
-               break;
-       }
+exit:
+       os_reason_free(jetsam_reason);
 
-       default:
-               break;
+       if (kill_count == 0) {
+               *memory_reclaimed = 0;
+
+               /* Clear snapshot if freshly captured and no target was found */
+               if (new_snapshot) {
+                       proc_list_lock();
+                       memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
+                       proc_list_unlock();
+               }
        }
 
-       return dispatch_level;
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
+           memorystatus_available_pages, killed ? aPid : 0, kill_count, *memory_reclaimed, 0);
+
+       return killed;
 }
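
The eligibility filter at the top of this loop is worth restating: only procs explicitly pinned into the elevated band are considered, anything already errored or terminated is skipped, and under CONFIG_FREEZE a scan of the freezer's band takes frozen procs only and never touches locked ones. A compact sketch of that check; the flag values are stand-ins, not the kernel's definitions, and the freeze-related tests are unconditional here rather than compiled in under CONFIG_FREEZE:

#include <stdbool.h>
#include <stdint.h>

/* Stand-in flag values, for illustration only. */
#define P_MEMSTAT_ERROR                      0x02
#define P_MEMSTAT_TERMINATED                 0x04
#define P_MEMSTAT_LOCKED                     0x08
#define P_MEMSTAT_FROZEN                     0x10
#define P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND 0x20

static bool
elevated_band_candidate(uint32_t state, bool consider_frozen_only)
{
        if (!(state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
                return false;   /* not pinned in the elevated band */
        }
        if (state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
                return false;   /* errored out, or already being torn down */
        }
        if (consider_frozen_only && !(state & P_MEMSTAT_FROZEN)) {
                return false;   /* freezer-band scans want frozen procs only */
        }
        if (state & P_MEMSTAT_LOCKED) {
                return false;   /* locked (e.g. mid-freeze): leave it alone */
        }
        return true;
}
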
 
-static int
-sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
+static boolean_t
+memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause)
 {
-#pragma unused(arg1, arg2, oidp)
-#if CONFIG_EMBEDDED
-       int error = 0;
-
-       error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
-       if (error) {
-               return error;
+       /*
+        * TODO: allow a general async path
+        *
+        * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to
+        * add the appropriate exit reason code mapping.
+        */
+       if ((victim_pid != -1) ||
+           (cause != kMemorystatusKilledVMPageShortage &&
+           cause != kMemorystatusKilledVMCompressorThrashing &&
+           cause != kMemorystatusKilledVMCompressorSpaceShortage &&
+           cause != kMemorystatusKilledFCThrashing &&
+           cause != kMemorystatusKilledZoneMapExhaustion)) {
+               return FALSE;
        }
 
-#endif /* CONFIG_EMBEDDED */
-       vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
-
-       return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
+       kill_under_pressure_cause = cause;
+       memorystatus_thread_wake();
+       return TRUE;
 }
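
Note that the async path never kills inline: it validates the (pid, cause) pair, records the cause in kill_under_pressure_cause, and wakes the memorystatus thread, which maps the cause to an exit reason and performs the kill on its own context. A hypothetical in-kernel caller, assuming this file's declarations are in scope (vm_page_shortage_hint is not a real function):

static void
vm_page_shortage_hint(void)
{
        /*
         * Only victim_pid == -1 with one of the whitelisted VM causes is
         * accepted; FALSE means the request was rejected up front, not
         * that a kill was attempted and failed.
         */
        if (!memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage)) {
                printf("vm_page_shortage_hint: async jetsam request rejected\n");
        }
}
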
 
-#if DEBUG || DEVELOPMENT
-
-SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
-    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
-
-#else /* DEBUG || DEVELOPMENT */
-
-SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
-    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
-
-#endif /* DEBUG || DEVELOPMENT */
-
-
-static int
-sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
+boolean_t
+memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async)
 {
-#pragma unused(arg1, arg2)
-
-       int level = 0;
-       int error = 0;
-       int pressure_level = 0;
-       int trigger_request = 0;
-       int force_purge;
+       if (async) {
+               return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorSpaceShortage);
+       } else {
+               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE);
+               if (jetsam_reason == OS_REASON_NULL) {
+                       printf("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n");
+               }
 
-       error = sysctl_handle_int(oidp, &level, 0, req);
-       if (error || !req->newptr) {
-               return error;
+               return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason);
        }
+}
 
-       memorystatus_manual_testing_on = TRUE;
-
-       trigger_request = (level >> 16) & 0xFFFF;
-       pressure_level = (level & 0xFFFF);
+#if CONFIG_JETSAM
+boolean_t
+memorystatus_kill_on_VM_compressor_thrashing(boolean_t async)
+{
+       if (async) {
+               return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorThrashing);
+       } else {
+               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING);
+               if (jetsam_reason == OS_REASON_NULL) {
+                       printf("memorystatus_kill_on_VM_compressor_thrashing -- sync: failed to allocate jetsam reason\n");
+               }
 
-       if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
-           trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
-               return EINVAL;
-       }
-       switch (pressure_level) {
-       case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
-       case NOTE_MEMORYSTATUS_PRESSURE_WARN:
-       case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
-               break;
-       default:
-               return EINVAL;
+               return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorThrashing, jetsam_reason);
        }
+}
 
-       /*
-        * The pressure level is being set from user-space.
-        * And user-space uses the constants in sys/event.h
-        * So we translate those events to our internal levels here.
-        */
-       if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
-               memorystatus_manual_testing_level = kVMPressureNormal;
-               force_purge = 0;
-       } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
-               memorystatus_manual_testing_level = kVMPressureWarning;
-               force_purge = vm_pageout_state.memorystatus_purge_on_warning;
-       } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
-               memorystatus_manual_testing_level = kVMPressureCritical;
-               force_purge = vm_pageout_state.memorystatus_purge_on_critical;
-       }
-
-       memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
-
-       /* purge according to the new pressure level */
-       switch (trigger_request) {
-       case TEST_PURGEABLE_TRIGGER_ONE:
-       case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
-               if (force_purge == 0) {
-                       /* no purging requested */
-                       break;
-               }
-               vm_purgeable_object_purge_one_unlocked(force_purge);
-               break;
-       case TEST_PURGEABLE_TRIGGER_ALL:
-       case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
-               if (force_purge == 0) {
-                       /* no purging requested */
-                       break;
-               }
-               while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
-                       ;
+boolean_t
+memorystatus_kill_on_VM_page_shortage(boolean_t async)
+{
+       if (async) {
+               return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
+       } else {
+               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMPAGESHORTAGE);
+               if (jetsam_reason == OS_REASON_NULL) {
+                       printf("memorystatus_kill_on_VM_page_shortage -- sync: failed to allocate jetsam reason\n");
                }
-               break;
-       }
 
-       if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
-           (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
-               memorystatus_update_vm_pressure(TRUE);
+               return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage, jetsam_reason);
        }
+}
 
-       if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
-           (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
-               while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
-                       continue;
+boolean_t
+memorystatus_kill_on_FC_thrashing(boolean_t async)
+{
+       if (async) {
+               return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing);
+       } else {
+               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_FCTHRASHING);
+               if (jetsam_reason == OS_REASON_NULL) {
+                       printf("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n");
                }
+
+               return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing, jetsam_reason);
        }
+}
 
-       if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
-               memorystatus_manual_testing_on = FALSE;
+boolean_t
+memorystatus_kill_on_vnode_limit(void)
+{
+       os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE);
+       if (jetsam_reason == OS_REASON_NULL) {
+               printf("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n");
        }
 
-       return 0;
+       return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason);
 }
 
-SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
-    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
-
+#endif /* CONFIG_JETSAM */
 
-SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
-SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
-SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");
+boolean_t
+memorystatus_kill_on_zone_map_exhaustion(pid_t pid)
+{
+       boolean_t res = FALSE;
+       if (pid == -1) {
+               res = memorystatus_kill_process_async(-1, kMemorystatusKilledZoneMapExhaustion);
+       } else {
+               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION);
+               if (jetsam_reason == OS_REASON_NULL) {
+                       printf("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n");
+               }
 
-#if DEBUG || DEVELOPMENT
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");
-#endif
+               res = memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason);
+       }
+       return res;
+}
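
A hedged usage sketch of the wrappers above (the call site and helper name are hypothetical): VM-layer code takes the async path when it cannot block, and the sync path when it can absorb the kill inline.

/* Hypothetical call site, e.g. compressor code detecting space exhaustion. */
static void
compressor_space_shortage_sketch(boolean_t can_block)
{
        if (!can_block) {
                /* Records the cause and wakes memorystatus_thread(). */
                (void)memorystatus_kill_on_VM_compressor_space_shortage(TRUE);  /* async */
        } else {
                /* Allocates the exit reason and performs the kill before returning. */
                (void)memorystatus_kill_on_VM_compressor_space_shortage(FALSE); /* sync */
        }
}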
 
-#endif /* VM_PRESSURE_EVENTS */
+void
+memorystatus_on_pageout_scan_end(void)
+{
+       /* No-op */
+}
 
 /* Return both allocated and actual size, since there's a race between allocation and list compilation */
 static int
@@ -8586,6 +6127,7 @@ memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size)
 {
        int error = 0;
        memorystatus_priority_entry_t mp_entry;
+       kern_return_t ret;
 
        /* Validate inputs */
        if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_entry_t))) {
@@ -8603,7 +6145,11 @@ memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size)
        mp_entry.priority = p->p_memstat_effectivepriority;
        mp_entry.user_data = p->p_memstat_userdata;
        if (p->p_memstat_memlimit <= 0) {
-               task_get_phys_footprint_limit(p->task, &mp_entry.limit);
+               ret = task_get_phys_footprint_limit(p->task, &mp_entry.limit);
+               if (ret != KERN_SUCCESS) {
+                       proc_rele(p);
+                       return EINVAL;
+               }
        } else {
                mp_entry.limit = p->p_memstat_memlimit;
        }
@@ -8701,19 +6247,6 @@ memorystatus_update_levels_locked(boolean_t critical_only)
                }
        }
 
-#if DEBUG || DEVELOPMENT
-       if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
-               memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_diagnostic;
-
-               if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
-                       /*
-                        * The critical threshold must never exceed the pressure threshold
-                        */
-                       memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
-               }
-       }
-#endif /* DEBUG || DEVELOPMENT */
-
        if (memorystatus_jetsam_policy & kPolicyMoreFree) {
                memorystatus_available_pages_critical += memorystatus_policy_more_free_offset_pages;
        }
@@ -8724,11 +6257,6 @@ memorystatus_update_levels_locked(boolean_t critical_only)
 
 #if VM_PRESSURE_EVENTS
        memorystatus_available_pages_pressure = (pressure_threshold_percentage / delta_percentage) * memorystatus_delta;
-#if DEBUG || DEVELOPMENT
-       if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
-               memorystatus_available_pages_pressure += memorystatus_jetsam_policy_offset_pages_diagnostic;
-       }
-#endif
 #endif
 }
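
To make the integer math in the pressure formula above concrete, a worked example with assumed tunable values (the real percentages and delta come from boot-args/defaults elsewhere in this file):

/* Assumed values, for illustration only. */
unsigned int pressure_threshold_percentage = 15;
unsigned int delta_percentage = 5;
unsigned int memorystatus_delta = 2048;            /* pages; assumed */

/* (15 / 5) * 2048 = 3 deltas = 6144 pages. */
unsigned int pages_pressure =
    (pressure_threshold_percentage / delta_percentage) * memorystatus_delta;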
 
@@ -9236,6 +6764,9 @@ out:
        return error;
 }
 
+memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL;
+size_t memorystatus_global_probabilities_size = 0;
+
 static int
 memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size)
 {
@@ -9353,12 +6884,24 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu
  * This routine is used to update a process's jetsam priority position and stored user_data.
  * It is not used for the setting of memory limits, which is why the last 6 args to the
  * memorystatus_update() call are 0 or FALSE.
+ *
+ * Flags passed into this call are used to distinguish the motivation behind a jetsam priority
+ * transition.  By default, the kernel updates the process's original requested priority when
+ * no flag is passed.  But when the MEMORYSTATUS_SET_PRIORITY_ASSERTION flag is used, the kernel
+ * updates the process's assertion driven priority.
+ *
+ * The assertion flag was introduced for use by the device's assertion mediator (e.g., runningboardd).
+ * When an assertion is controlling a process's jetsam priority, it may conflict with that process's
+ * dirty/clean (active/inactive) jetsam state.  The kernel attempts to resolve a priority transition
+ * conflict by reviewing the process state and then choosing the maximum jetsam band at play,
+ * e.g., requested priority versus assertion priority.
  */
 
 static int
-memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
+memorystatus_cmd_set_priority_properties(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
 {
        int error = 0;
+       boolean_t is_assertion = FALSE;         /* priority is driven by an assertion */
        memorystatus_priority_properties_t mpp_entry;
 
        /* Validate inputs */
@@ -9366,6 +6909,22 @@ memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t b
                return EINVAL;
        }
 
+       /* Validate flags */
+       if (flags == 0) {
+               /*
+                * Default. This path updates requestedpriority.
+                */
+       } else {
+               if (flags & ~(MEMORYSTATUS_SET_PRIORITY_ASSERTION)) {
+                       /*
+                        * Unsupported bit set in flag.
+                        */
+                       return EINVAL;
+               } else if (flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) {
+                       is_assertion = TRUE;
+               }
+       }
+
        error = copyin(buffer, &mpp_entry, buffer_size);
 
        if (error == 0) {
@@ -9381,7 +6940,12 @@ memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t b
                        return EPERM;
                }
 
-               error = memorystatus_update(p, mpp_entry.priority, mpp_entry.user_data, FALSE, FALSE, 0, 0, FALSE, FALSE);
+               if (is_assertion) {
+                       os_log(OS_LOG_DEFAULT, "memorystatus: set assertion priority(%d) target %s:%d\n",
+                           mpp_entry.priority, (*p->p_name ? p->p_name : "unknown"), p->p_pid);
+               }
+
+               error = memorystatus_update(p, mpp_entry.priority, mpp_entry.user_data, is_assertion, FALSE, FALSE, 0, 0, FALSE, FALSE);
                proc_rele(p);
        }
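
A minimal userspace sketch of the assertion path described in the block comment above, assuming the memorystatus_control() wrapper declared in <sys/kern_memorystatus.h>; the pid and target band are hypothetical:

#include <sys/kern_memorystatus.h>

pid_t target_pid = 1234;                         /* hypothetical */
memorystatus_priority_properties_t props = {
        .priority  = JETSAM_PRIORITY_FOREGROUND, /* assumed target band */
        .user_data = 0,
};
/* A non-zero flags word selects the assertion-driven priority path. */
int rc = memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES,
    target_pid, MEMORYSTATUS_SET_PRIORITY_ASSERTION, &props, sizeof(props));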
 
@@ -9408,6 +6972,34 @@ memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t b
        return error;
 }
 
+static void
+memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t* p_entry)
+{
+       memset(p_entry, 0, sizeof(memorystatus_memlimit_properties_t));
+
+       if (p->p_memstat_memlimit_active > 0) {
+               p_entry->memlimit_active = p->p_memstat_memlimit_active;
+       } else {
+               task_convert_phys_footprint_limit(-1, &p_entry->memlimit_active);
+       }
+
+       if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {
+               p_entry->memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
+       }
+
+       /*
+        * Get the inactive limit and attributes
+        */
+       if (p->p_memstat_memlimit_inactive <= 0) {
+               task_convert_phys_footprint_limit(-1, &p_entry->memlimit_inactive);
+       } else {
+               p_entry->memlimit_inactive = p->p_memstat_memlimit_inactive;
+       }
+       if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {
+               p_entry->memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
+       }
+}
+
 /*
  * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
  * That gets the proc's cached memlimit and there is no guarantee that the active/inactive
@@ -9418,15 +7010,16 @@ memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t b
 static int
 memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
 {
-       int error = 0;
-       memorystatus_memlimit_properties_t mmp_entry;
+       memorystatus_memlimit_properties2_t mmp_entry;
 
        /* Validate inputs */
-       if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
+       if ((pid == 0) || (buffer == USER_ADDR_NULL) ||
+           ((buffer_size != sizeof(memorystatus_memlimit_properties_t)) &&
+           (buffer_size != sizeof(memorystatus_memlimit_properties2_t)))) {
                return EINVAL;
        }
 
-       memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties_t));
+       memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties2_t));
 
        proc_t p = proc_find(pid);
        if (!p) {
@@ -9438,30 +7031,21 @@ memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t b
         * No locks taken since we hold a reference to the proc.
         */
 
-       if (p->p_memstat_memlimit_active > 0) {
-               mmp_entry.memlimit_active = p->p_memstat_memlimit_active;
-       } else {
-               task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_active);
-       }
-
-       if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {
-               mmp_entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
-       }
+       memorystatus_get_memlimit_properties_internal(p, &mmp_entry.v1);
 
+#if CONFIG_JETSAM
+#if DEVELOPMENT || DEBUG
        /*
-        * Get the inactive limit and attributes
+        * Get the limit increased via SPI
         */
-       if (p->p_memstat_memlimit_inactive <= 0) {
-               task_convert_phys_footprint_limit(-1, &mmp_entry.memlimit_inactive);
-       } else {
-               mmp_entry.memlimit_inactive = p->p_memstat_memlimit_inactive;
-       }
-       if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {
-               mmp_entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
-       }
+       mmp_entry.memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
+       mmp_entry.memlimit_increase_bytes = p->p_memlimit_increase;
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_JETSAM */
+
        proc_rele(p);
 
-       error = copyout(&mmp_entry, buffer, buffer_size);
+       int error = copyout(&mmp_entry, buffer, buffer_size);
 
        return error;
 }
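
A userspace sketch of the dual-size contract above (hypothetical caller): passing the v2 struct also returns the DEVELOPMENT || DEBUG limit-increase fields, while older callers can keep passing the v1 size.

#include <stdio.h>
#include <sys/kern_memorystatus.h>

pid_t target_pid = 1234;                         /* hypothetical */
memorystatus_memlimit_properties2_t mmp2 = {0};
if (memorystatus_control(MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES,
    target_pid, 0, &mmp2, sizeof(mmp2)) == 0) {
        int fatal = (mmp2.v1.memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) != 0;
        printf("active limit %d MB (%sfatal)\n",
            mmp2.v1.memlimit_active, fatal ? "" : "non-");
}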
@@ -9586,87 +7170,21 @@ memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __u
 #endif /* CONFIG_JETSAM */
 
 static int
-memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry)
+memorystatus_set_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry)
 {
-       int32_t  memlimit_active;
-       boolean_t memlimit_active_is_fatal;
-       int32_t  memlimit_inactive;
-       boolean_t memlimit_inactive_is_fatal;
-       uint32_t valid_attrs = 0;
-       int       error = 0;
-
-       proc_t p = proc_find(pid);
-       if (!p) {
-               return ESRCH;
-       }
-
-       /*
-        * Check for valid attribute flags.
-        */
-       valid_attrs |= (MEMORYSTATUS_MEMLIMIT_ATTR_FATAL);
-       if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
-               proc_rele(p);
-               return EINVAL;
-       }
-       if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
-               proc_rele(p);
-               return EINVAL;
-       }
-
-       /*
-        * Setup the active memlimit properties
-        */
-       memlimit_active = entry->memlimit_active;
-       if (entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
-               memlimit_active_is_fatal = TRUE;
-       } else {
-               memlimit_active_is_fatal = FALSE;
-       }
-
-       /*
-        * Setup the inactive memlimit properties
-        */
-       memlimit_inactive = entry->memlimit_inactive;
-       if (entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
-               memlimit_inactive_is_fatal = TRUE;
-       } else {
-               memlimit_inactive_is_fatal = FALSE;
-       }
-
-       /*
-        * Setting a limit of <= 0 implies that the process has no
-        * high-water-mark and has no per-task-limit.  That means
-        * the system_wide task limit is in place, which by the way,
-        * is always fatal.
-        */
-
-       if (memlimit_active <= 0) {
-               /*
-                * Enforce the fatal system_wide task limit while process is active.
-                */
-               memlimit_active = -1;
-               memlimit_active_is_fatal = TRUE;
-       }
-
-       if (memlimit_inactive <= 0) {
-               /*
-                * Enforce the fatal system_wide task limit while process is inactive.
-                */
-               memlimit_inactive = -1;
-               memlimit_inactive_is_fatal = TRUE;
-       }
+       int error = 0;
 
-       proc_list_lock();
+       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
 
        /*
         * Store the active limit variants in the proc.
         */
-       SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);
+       SET_ACTIVE_LIMITS_LOCKED(p, p_entry->memlimit_active, p_entry->memlimit_active_attr);
 
        /*
         * Store the inactive limit variants in the proc.
         */
-       SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);
+       SET_INACTIVE_LIMITS_LOCKED(p, p_entry->memlimit_inactive, p_entry->memlimit_inactive_attr);
 
        /*
         * Enforce appropriate limit variant by updating the cached values
@@ -9696,84 +7214,116 @@ memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties
                DTRACE_MEMORYSTATUS2(memorystatus_set_memlimit, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
        }
 
-       proc_list_unlock();
-       proc_rele(p);
-
        return error;
 }
 
-/*
- * Returns the jetsam priority (effective or requested) of the process
- * associated with this task.
- */
-int
-proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
-{
-       if (p) {
-               if (effective_priority) {
-                       return p->p_memstat_effectivepriority;
-               } else {
-                       return p->p_memstat_requestedpriority;
-               }
-       }
-       return 0;
-}
-
 static int
-memorystatus_get_process_is_managed(pid_t pid, int *is_managed)
+memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry)
 {
-       proc_t p = NULL;
-
-       /* Validate inputs */
-       if (pid == 0) {
-               return EINVAL;
-       }
+       memorystatus_memlimit_properties_t set_entry;
 
-       p = proc_find(pid);
+       proc_t p = proc_find(pid);
        if (!p) {
                return ESRCH;
        }
 
-       proc_list_lock();
-       *is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0);
-       proc_rele_locked(p);
-       proc_list_unlock();
+       /*
+        * Check for valid attribute flags.
+        */
+       const uint32_t valid_attrs = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
+       if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
+               proc_rele(p);
+               return EINVAL;
+       }
+       if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
+               proc_rele(p);
+               return EINVAL;
+       }
 
-       return 0;
-}
+       /*
+        * Setup the active memlimit properties
+        */
+       set_entry.memlimit_active = entry->memlimit_active;
+       set_entry.memlimit_active_attr = entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
 
-static int
-memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed)
-{
-       proc_t p = NULL;
+       /*
+        * Setup the inactive memlimit properties
+        */
+       set_entry.memlimit_inactive = entry->memlimit_inactive;
+       set_entry.memlimit_inactive_attr = entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
 
-       /* Validate inputs */
-       if (pid == 0) {
-               return EINVAL;
-       }
+       /*
+        * Setting a limit of <= 0 implies that the process has no
+        * high-water-mark and has no per-task-limit.  That means
+        * the system_wide task limit is in place, which by the way,
+        * is always fatal.
+        */
 
-       p = proc_find(pid);
-       if (!p) {
-               return ESRCH;
+       if (set_entry.memlimit_active <= 0) {
+               /*
+                * Enforce the fatal system_wide task limit while process is active.
+                */
+               set_entry.memlimit_active = -1;
+               set_entry.memlimit_active_attr = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
+       }
+#if CONFIG_JETSAM
+#if DEVELOPMENT || DEBUG
+       else {
+               /* add the current increase to it, for roots */
+               set_entry.memlimit_active += roundToNearestMB(p->p_memlimit_increase);
        }
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_JETSAM */
 
-       proc_list_lock();
-       if (set_managed == TRUE) {
-               p->p_memstat_state |= P_MEMSTAT_MANAGED;
-       } else {
-               p->p_memstat_state &= ~P_MEMSTAT_MANAGED;
+       if (set_entry.memlimit_inactive <= 0) {
+               /*
+                * Enforce the fatal system_wide task limit while process is inactive.
+                */
+               set_entry.memlimit_inactive = -1;
+               set_entry.memlimit_inactive_attr = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
        }
-       proc_rele_locked(p);
+#if CONFIG_JETSAM
+#if DEVELOPMENT || DEBUG
+       else {
+               /* add the current increase to it, for roots */
+               set_entry.memlimit_inactive += roundToNearestMB(p->p_memlimit_increase);
+       }
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_JETSAM */
+
+       proc_list_lock();
+
+       int error = memorystatus_set_memlimit_properties_internal(p, &set_entry);
+
        proc_list_unlock();
+       proc_rele(p);
+
+       return error;
+}
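
A hedged distillation of the <= 0 convention enforced above (the helper name is hypothetical):

static void
normalize_memlimit_sketch(int32_t *limit, uint32_t *attr)
{
        if (*limit <= 0) {
                *limit = -1;                               /* fall back to the system-wide task limit... */
                *attr  = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL; /* ...which is always fatal */
        }
}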
 
+/*
+ * Returns the jetsam priority (effective or requested) of the process
+ * associated with this task.
+ */
+int
+proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
+{
+       if (p) {
+               if (effective_priority) {
+                       return p->p_memstat_effectivepriority;
+               } else {
+                       return p->p_memstat_requestedpriority;
+               }
+       }
        return 0;
 }
 
 static int
-memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable)
+memorystatus_get_process_is_managed(pid_t pid, int *is_managed)
 {
-       proc_t p = PROC_NULL;
+       proc_t p = NULL;
 
+       /* Validate inputs */
        if (pid == 0) {
                return EINVAL;
        }
@@ -9783,17 +7333,8 @@ memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable)
                return ESRCH;
        }
 
-       /*
-        * Only allow this on the current proc for now.
-        * We can check for privileges and allow targeting another process in the future.
-        */
-       if (p != current_proc()) {
-               proc_rele(p);
-               return EPERM;
-       }
-
        proc_list_lock();
-       *is_freezable = ((p->p_memstat_state & P_MEMSTAT_FREEZE_DISABLED) ? 0 : 1);
+       *is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0);
        proc_rele_locked(p);
        proc_list_unlock();
 
@@ -9801,10 +7342,11 @@ memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable)
 }
 
 static int
-memorystatus_set_process_is_freezable(pid_t pid, boolean_t is_freezable)
+memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed)
 {
-       proc_t p = PROC_NULL;
+       proc_t p = NULL;
 
+       /* Validate inputs */
        if (pid == 0) {
                return EINVAL;
        }
@@ -9814,25 +7356,17 @@ memorystatus_set_process_is_freezable(pid_t pid, boolean_t is_freezable)
                return ESRCH;
        }
 
-       /*
-        * Only allow this on the current proc for now.
-        * We can check for privileges and allow targeting another process in the future.
-        */
-       if (p != current_proc()) {
-               proc_rele(p);
-               return EPERM;
-       }
-
        proc_list_lock();
-       if (is_freezable == FALSE) {
-               /* Freeze preference set to FALSE. Set the P_MEMSTAT_FREEZE_DISABLED bit. */
-               p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED;
-               printf("memorystatus_set_process_is_freezable: disabling freeze for pid %d [%s]\n",
-                   p->p_pid, (*p->p_name ? p->p_name : "unknown"));
-       } else {
+       if (set_managed == TRUE) {
+               p->p_memstat_state |= P_MEMSTAT_MANAGED;
+               /*
+                * The P_MEMSTAT_MANAGED bit is set by assertiond for Apps.
+                * We also opt them in to being frozen (they might have
+                * started off with the P_MEMSTAT_FREEZE_DISABLED bit set).
+                */
                p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED;
-               printf("memorystatus_set_process_is_freezable: enabling freeze for pid %d [%s]\n",
-                   p->p_pid, (*p->p_name ? p->p_name : "unknown"));
+       } else {
+               p->p_memstat_state &= ~P_MEMSTAT_MANAGED;
        }
        proc_rele_locked(p);
        proc_list_unlock();
@@ -9848,8 +7382,8 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *
        os_reason_t jetsam_reason = OS_REASON_NULL;
 
 #if !CONFIG_JETSAM
-       #pragma unused(ret)
-       #pragma unused(jetsam_reason)
+    #pragma unused(ret)
+    #pragma unused(jetsam_reason)
 #endif
 
        /* We don't need entitlements if we're setting/ querying the freeze preference for a process. Skip the check below. */
@@ -9879,7 +7413,7 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *
                error = memorystatus_cmd_get_priority_list(args->pid, args->buffer, args->buffersize, ret);
                break;
        case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
-               error = memorystatus_cmd_set_priority_properties(args->pid, args->buffer, args->buffersize, ret);
+               error = memorystatus_cmd_set_priority_properties(args->pid, args->flags, args->buffer, args->buffersize, ret);
                break;
        case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
                error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
@@ -9957,6 +7491,10 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *
                memorystatus_aggressive_jetsam_lenient = FALSE;
                error = 0;
                break;
+       case MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE:
+               *ret = (memorystatus_aggressive_jetsam_lenient ? 1 : 0);
+               error = 0;
+               break;
        case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
        case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
                error = memorystatus_low_mem_privileged_listener(args->command);
@@ -9974,6 +7512,7 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *
                error = memorystatus_get_process_is_managed(args->pid, ret);
                break;
 
+#if CONFIG_FREEZE
        case MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE:
                error = memorystatus_set_process_is_freezable(args->pid, args->flags ? TRUE : FALSE);
                break;
@@ -9982,7 +7521,6 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *
                error = memorystatus_get_process_is_freezable(args->pid, ret);
                break;
 
-#if CONFIG_FREEZE
 #if DEVELOPMENT || DEBUG
        case MEMORYSTATUS_CMD_FREEZER_CONTROL:
                error = memorystatus_freezer_control(args->flags, args->buffer, args->buffersize, ret);
@@ -9990,6 +7528,14 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *
 #endif /* DEVELOPMENT || DEBUG */
 #endif /* CONFIG_FREEZE */
 
+#if CONFIG_JETSAM
+#if DEVELOPMENT || DEBUG
+       case MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT:
+               error = memorystatus_cmd_increase_jetsam_task_limit(args->pid, args->flags);
+               break;
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_JETSAM */
+
        default:
                break;
        }
@@ -9998,263 +7544,6 @@ out:
        return error;
 }
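
A userspace sketch for the new lenient-mode query handled above (hypothetical caller). The value comes back through the syscall's return-value slot (*ret), so the wrapper returns 0 or 1, or -1 with errno on failure:

#include <sys/kern_memorystatus.h>

int lenient = memorystatus_control(
    MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE, 0, 0, NULL, 0);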
 
-
-static int
-filt_memorystatusattach(struct knote *kn, __unused struct kevent_internal_s *kev)
-{
-       int error;
-
-       kn->kn_flags |= EV_CLEAR;
-       error = memorystatus_knote_register(kn);
-       if (error) {
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = error;
-       }
-       return 0;
-}
-
-static void
-filt_memorystatusdetach(struct knote *kn)
-{
-       memorystatus_knote_unregister(kn);
-}
-
-static int
-filt_memorystatus(struct knote *kn __unused, long hint)
-{
-       if (hint) {
-               switch (hint) {
-               case kMemorystatusNoPressure:
-                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
-                               kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
-                       }
-                       break;
-               case kMemorystatusPressure:
-                       if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
-                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
-                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
-                               }
-                       } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
-                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
-                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
-                               }
-                       }
-                       break;
-               case kMemorystatusLowSwap:
-                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
-                               kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
-                       }
-                       break;
-
-               case kMemorystatusProcLimitWarn:
-                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
-                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
-                       }
-                       break;
-
-               case kMemorystatusProcLimitCritical:
-                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
-                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
-                       }
-                       break;
-
-               default:
-                       break;
-               }
-       }
-
-#if 0
-       if (kn->kn_fflags != 0) {
-               proc_t knote_proc = knote_get_kq(kn)->kq_p;
-               pid_t knote_pid = knote_proc->p_pid;
-
-               printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
-                   (unsigned long)kn, kn->kn_fflags, knote_pid);
-       }
-#endif
-
-       return kn->kn_fflags != 0;
-}
-
-static int
-filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev)
-{
-       int res;
-       int prev_kn_sfflags = 0;
-
-       memorystatus_klist_lock();
-
-       /*
-        * copy in new kevent settings
-        * (saving the "desired" data and fflags).
-        */
-
-       prev_kn_sfflags = kn->kn_sfflags;
-       kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
-
-#if !CONFIG_EMBEDDED
-       /*
-        * Only on desktop do we restrict notifications to
-        * one per active/inactive state (soft limits only).
-        */
-       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
-               /*
-                * Is there previous state to preserve?
-                */
-               if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
-                       /*
-                        * This knote was previously interested in proc_limit_warn,
-                        * so yes, preserve previous state.
-                        */
-                       if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
-                               kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
-                       }
-                       if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
-                               kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
-                       }
-               } else {
-                       /*
-                        * This knote was not previously interested in proc_limit_warn,
-                        * but it is now.  Set both states.
-                        */
-                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
-                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
-               }
-       }
-
-       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
-               /*
-                * Is there previous state to preserve?
-                */
-               if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
-                       /*
-                        * This knote was previously interested in proc_limit_critical,
-                        * so yes, preserve previous state.
-                        */
-                       if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
-                               kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
-                       }
-                       if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
-                               kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
-                       }
-               } else {
-                       /*
-                        * This knote was not previously interested in proc_limit_critical,
-                        * but it is now.  Set both states.
-                        */
-                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
-                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
-               }
-       }
-#endif /* !CONFIG_EMBEDDED */
-
-       /*
-        * reset the output flags based on a
-        * combination of the old events and
-        * the new desired event list.
-        */
-       //kn->kn_fflags &= kn->kn_sfflags;
-
-       res = (kn->kn_fflags != 0);
-
-       memorystatus_klist_unlock();
-
-       return res;
-}
-
-static int
-filt_memorystatusprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
-{
-#pragma unused(data)
-       int res;
-
-       memorystatus_klist_lock();
-       res = (kn->kn_fflags != 0);
-       if (res) {
-               *kev = kn->kn_kevent;
-               kn->kn_flags |= EV_CLEAR; /* automatic */
-               kn->kn_fflags = 0;
-               kn->kn_data = 0;
-       }
-       memorystatus_klist_unlock();
-
-       return res;
-}
-
-static void
-memorystatus_klist_lock(void)
-{
-       lck_mtx_lock(&memorystatus_klist_mutex);
-}
-
-static void
-memorystatus_klist_unlock(void)
-{
-       lck_mtx_unlock(&memorystatus_klist_mutex);
-}
-
-void
-memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
-{
-       lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
-       klist_init(&memorystatus_klist);
-}
-
-int
-memorystatus_knote_register(struct knote *kn)
-{
-       int error = 0;
-
-       memorystatus_klist_lock();
-
-       /*
-        * Support only userspace visible flags.
-        */
-       if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
-#if !CONFIG_EMBEDDED
-               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
-                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
-                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
-               }
-
-               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
-                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
-                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
-               }
-#endif /* !CONFIG_EMBEDDED */
-
-               KNOTE_ATTACH(&memorystatus_klist, kn);
-       } else {
-               error = ENOTSUP;
-       }
-
-       memorystatus_klist_unlock();
-
-       return error;
-}
-
-void
-memorystatus_knote_unregister(struct knote *kn __unused)
-{
-       memorystatus_klist_lock();
-       KNOTE_DETACH(&memorystatus_klist, kn);
-       memorystatus_klist_unlock();
-}
-
-
-#if 0
-#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
-static boolean_t
-memorystatus_issue_pressure_kevent(boolean_t pressured)
-{
-       memorystatus_klist_lock();
-       KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
-       memorystatus_klist_unlock();
-       return TRUE;
-}
-#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
-#endif /* 0 */
-
 /* Coalition support */
 
 /* sorting info for a particular priority bucket */
@@ -10322,7 +7611,8 @@ memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coa
 
        p = memorystatus_get_first_proc_locked(&b, FALSE);
        while (p) {
-               if (coalition_is_leader(p->task, COALITION_TYPE_JETSAM, &coal)) {
+               coal = task_get_coalition(p->task, COALITION_TYPE_JETSAM);
+               if (coalition_is_leader(p->task, coal)) {
                        if (nleaders < MAX_COAL_LEADERS) {
                                int coal_ntasks = 0;
                                uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks);
@@ -10610,10 +7900,14 @@ memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap)
 
        TAILQ_REMOVE(&current_bucket->list, p, p_memstat_list);
        current_bucket->count--;
-
+       if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
+               current_bucket->relaunch_high_count--;
+       }
        TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
        new_bucket->count++;
-
+       if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
+               new_bucket->relaunch_high_count++;
+       }
        /*
         * Record idle start or idle delta.
         */
@@ -10655,3 +7949,95 @@ memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap)
        return -1;
 #endif /* !CONFIG_JETSAM */
 }
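
A hedged distillation of the relaunch_high_count bookkeeping added above (helper name assumed; the bucket type and field names are taken from the hunk): whenever a proc carrying P_MEMSTAT_RELAUNCH_HIGH moves buckets, both counters must move with it.

static void
bucket_move_relaunch_high_sketch(memstat_bucket_t *from, memstat_bucket_t *to, proc_t p)
{
        if (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_HIGH) {
                from->relaunch_high_count--;
                to->relaunch_high_count++;
        }
}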
+
+uint64_t
+memorystatus_available_memory_internal(proc_t p)
+{
+#ifdef XNU_TARGET_OS_OSX
+       #pragma unused(p)
+       return 0;
+#else
+       const uint64_t footprint_in_bytes = get_task_phys_footprint(p->task);
+       int32_t memlimit_mb;
+       int64_t memlimit_bytes;
+       int64_t rc;
+
+       if (isApp(p) == FALSE) {
+               return 0;
+       }
+
+       if (p->p_memstat_memlimit > 0) {
+               memlimit_mb = p->p_memstat_memlimit;
+       } else if (task_convert_phys_footprint_limit(-1, &memlimit_mb) != KERN_SUCCESS) {
+               return 0;
+       }
+
+       if (memlimit_mb <= 0) {
+               memlimit_bytes = INT_MAX & ~((1 << 20) - 1); /* no explicit limit: clamp to INT_MAX rounded down to a 1 MB boundary */
+       } else {
+               memlimit_bytes = ((int64_t) memlimit_mb) << 20;
+       }
+
+       rc = memlimit_bytes - footprint_in_bytes;
+
+       return (rc >= 0) ? rc : 0;
+#endif
+}
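
A worked example of the headroom computation above, with assumed numbers:

int64_t  memlimit_bytes     = (int64_t)50 << 20;   /* 50 MB limit  = 52428800 */
uint64_t footprint_in_bytes = (uint64_t)30 << 20;  /* 30 MB in use = 31457280 */
int64_t  rc = memlimit_bytes - (int64_t)footprint_in_bytes;
uint64_t available = (rc >= 0) ? (uint64_t)rc : 0; /* 20971520 bytes (20 MB) */
/* A footprint over the limit clamps to 0 rather than going negative. */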
+
+int
+memorystatus_available_memory(struct proc *p, __unused struct memorystatus_available_memory_args *args, uint64_t *ret)
+{
+       *ret = memorystatus_available_memory_internal(p);
+
+       return 0;
+}
+
+#if CONFIG_JETSAM
+#if DEVELOPMENT || DEBUG
+static int
+memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase)
+{
+       memorystatus_memlimit_properties_t mmp_entry;
+
+       /* Validate inputs */
+       if ((pid == 0) || (byte_increase == 0)) {
+               return EINVAL;
+       }
+
+       proc_t p = proc_find(pid);
+
+       if (!p) {
+               return ESRCH;
+       }
+
+       const uint32_t current_memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
+       const uint32_t page_aligned_increase = round_page(p->p_memlimit_increase + byte_increase); /* round to page */
+
+       proc_list_lock();
+
+       memorystatus_get_memlimit_properties_internal(p, &mmp_entry);
+
+       if (mmp_entry.memlimit_active > 0) {
+               mmp_entry.memlimit_active -= current_memlimit_increase;
+               mmp_entry.memlimit_active += roundToNearestMB(page_aligned_increase);
+       }
+
+       if (mmp_entry.memlimit_inactive > 0) {
+               mmp_entry.memlimit_inactive -= current_memlimit_increase;
+               mmp_entry.memlimit_inactive += roundToNearestMB(page_aligned_increase);
+       }
+
+       /*
+        * Store the updated delta limit in the proc.
+        */
+       p->p_memlimit_increase = page_aligned_increase;
+
+       int error = memorystatus_set_memlimit_properties_internal(p, &mmp_entry);
+
+       proc_list_unlock();
+       proc_rele(p);
+
+       return error;
+}
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_JETSAM */
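
A worked example of the delta bookkeeping above, with assumed numbers: an existing 2 MB increase and a 5 MB byte_increase.

uint32_t old_increase  = 2 << 20;                      /* prior p->p_memlimit_increase */
uint32_t byte_increase = 5 << 20;
uint32_t new_total     = old_increase + byte_increase; /* 7 MB; already page-aligned */
/*
 * Each non-zero limit is rebased:
 *   limit' = limit - roundToNearestMB(old_increase) + roundToNearestMB(new_total)
 *          = limit - 2 MB + 7 MB = limit + 5 MB.
 */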
diff --git a/bsd/kern/kern_memorystatus_freeze.c b/bsd/kern/kern_memorystatus_freeze.c
new file mode 100644 (file)
index 0000000..c83a80d
--- /dev/null
+++ b/bsd/kern/kern_memorystatus_freeze.c
@@ -0,0 +1,2196 @@
+/*
+ * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ *
+ */
+
+#include <kern/sched_prim.h>
+#include <kern/kalloc.h>
+#include <kern/assert.h>
+#include <kern/debug.h>
+#include <kern/locks.h>
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <kern/host.h>
+#include <kern/policy_internal.h>
+#include <kern/thread_group.h>
+
+#include <IOKit/IOBSD.h>
+
+#include <libkern/libkern.h>
+#include <mach/coalition.h>
+#include <mach/mach_time.h>
+#include <mach/task.h>
+#include <mach/host_priv.h>
+#include <mach/mach_host.h>
+#include <os/log.h>
+#include <pexpert/pexpert.h>
+#include <sys/coalition.h>
+#include <sys/kern_event.h>
+#include <sys/proc.h>
+#include <sys/proc_info.h>
+#include <sys/reason.h>
+#include <sys/signal.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/wait.h>
+#include <sys/tree.h>
+#include <sys/priv.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_protos.h>
+#include <mach/machine/sdt.h>
+#include <libkern/section_keywords.h>
+#include <stdatomic.h>
+
+#if CONFIG_FREEZE
+#include <vm/vm_map.h>
+#endif /* CONFIG_FREEZE */
+
+#include <sys/kern_memorystatus.h>
+#include <sys/kern_memorystatus_freeze.h>
+#include <sys/kern_memorystatus_notify.h>
+
+#if CONFIG_JETSAM
+
+extern unsigned int memorystatus_available_pages;
+extern unsigned int memorystatus_available_pages_pressure;
+extern unsigned int memorystatus_available_pages_critical;
+extern unsigned int memorystatus_available_pages_critical_base;
+extern unsigned int memorystatus_available_pages_critical_idle_offset;
+
+#else /* CONFIG_JETSAM */
+
+extern uint64_t memorystatus_available_pages;
+extern uint64_t memorystatus_available_pages_pressure;
+extern uint64_t memorystatus_available_pages_critical;
+
+#endif /* CONFIG_JETSAM */
+
+unsigned int memorystatus_frozen_count = 0;
+unsigned int memorystatus_suspended_count = 0;
+unsigned long freeze_threshold_percentage = 50;
+
+#if CONFIG_FREEZE
+
+lck_grp_attr_t *freezer_lck_grp_attr;
+lck_grp_t *freezer_lck_grp;
+static lck_mtx_t freezer_mutex;
+
+/* Thresholds */
+unsigned int memorystatus_freeze_threshold = 0;
+unsigned int memorystatus_freeze_pages_min = 0;
+unsigned int memorystatus_freeze_pages_max = 0;
+unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
+unsigned int memorystatus_freeze_daily_mb_max = FREEZE_DAILY_MB_MAX_DEFAULT;
+uint64_t     memorystatus_freeze_budget_pages_remaining = 0; /* remaining # of pages that can be frozen to disk */
+boolean_t memorystatus_freeze_degradation = FALSE; /* protected by the freezer mutex; signals that we are in a degraded freeze mode */
+
+unsigned int memorystatus_max_frozen_demotions_daily = 0;
+unsigned int memorystatus_thaw_count_demotion_threshold = 0;
+
+boolean_t memorystatus_freeze_enabled = FALSE;
+int memorystatus_freeze_wakeup = 0;
+int memorystatus_freeze_jetsam_band = 0; /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */
+
+#define MAX_XPC_SERVICE_PIDS 10 /* Max. # of XPC services per coalition we'll consider freezing. */
+
+#ifdef XNU_KERNEL_PRIVATE
+
+unsigned int memorystatus_frozen_processes_max = 0;
+unsigned int memorystatus_frozen_shared_mb = 0;
+unsigned int memorystatus_frozen_shared_mb_max = 0;
+unsigned int memorystatus_freeze_shared_mb_per_process_max = 0; /* Max. MB allowed per process to be freezer-eligible. */
+unsigned int memorystatus_freeze_private_shared_pages_ratio = 2; /* Ratio of private:shared pages for a process to be freezer-eligible. */
+unsigned int memorystatus_thaw_count = 0;
+unsigned int memorystatus_refreeze_eligible_count = 0; /* # of processes currently thawed, i.e. with state both on disk and in memory */
+
+#endif /* XNU_KERNEL_PRIVATE */
+
+static inline boolean_t memorystatus_can_freeze_processes(void);
+static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low);
+static boolean_t memorystatus_is_process_eligible_for_freeze(proc_t p);
+static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused);
+
+void memorystatus_disable_freeze(void);
+
+/* Stats */
+static uint64_t memorystatus_freeze_pageouts = 0;
+
+/* Throttling */
+#define DEGRADED_WINDOW_MINS    (30)
+#define NORMAL_WINDOW_MINS      (24 * 60)
+
+static throttle_interval_t throttle_intervals[] = {
+       { DEGRADED_WINDOW_MINS, 1, 0, 0, { 0, 0 }},
+       { NORMAL_WINDOW_MINS, 1, 0, 0, { 0, 0 }},
+};
+throttle_interval_t *degraded_throttle_window = &throttle_intervals[0];
+throttle_interval_t *normal_throttle_window = &throttle_intervals[1];
+
+extern uint64_t vm_swap_get_free_space(void);
+extern boolean_t vm_swap_max_budget(uint64_t *);
+extern int i_coal_jetsam_get_taskrole(coalition_t coal, task_t task);
+
+static void memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed);
+static void memorystatus_demote_frozen_processes(boolean_t force_one);
+
+static uint64_t memorystatus_freezer_thread_next_run_ts = 0;
+
+/* Sysctls needed for aggd stats */
+
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_frozen_count, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_thaw_count, 0, "");
+SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
+SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_budget_pages_remaining, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_budget_pages_remaining, "");
+
+
+#if DEVELOPMENT || DEBUG
+
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_jetsam_band, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_jetsam_band, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_degraded_mode, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_degradation, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_refreeze_eligible_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_refreeze_eligible_count, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_processes_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_frozen_processes_max, 0, "");
+
+/*
+ * Max. shared-anonymous memory in MB that can be held by frozen processes in the high jetsam band.
+ * "0" means no limit.
+ * Default is 10% of system-wide task limit.
+ */
+
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb_max, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb, 0, "");
+
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_per_process_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_shared_mb_per_process_max, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_private_shared_pages_ratio, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_private_shared_pages_ratio, 0, "");
+
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");
+
+/*
+ * max. # of frozen process demotions we will allow in our daily cycle.
+ */
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_max_freeze_demotions_daily, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_max_frozen_demotions_daily, 0, "");
+/*
+ * min # of thaws needed by a process to protect it from getting demoted into the IDLE band.
+ */
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count_demotion_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_thaw_count_demotion_threshold, 0, "");
+
+boolean_t memorystatus_freeze_throttle_enabled = TRUE;
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");
+
+/*
+ * When set to true, this keeps frozen processes in the compressor pool in memory, instead of swapping them out to disk.
+ * Exposed via the sysctl kern.memorystatus_freeze_to_memory.
+ */
+boolean_t memorystatus_freeze_to_memory = FALSE;
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_to_memory, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_to_memory, 0, "");
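
A userspace sketch of flipping this knob via sysctlbyname() (DEVELOPMENT || DEBUG kernels only; error handling omitted):

#include <sys/sysctl.h>

unsigned int one = 1;
/* Keeps frozen compressor data resident instead of swapping it out. */
(void)sysctlbyname("kern.memorystatus_freeze_to_memory",
    NULL, NULL, &one, sizeof(one));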
+
+#define VM_PAGES_FOR_ALL_PROCS    (2)
+/*
+ * Manual trigger of freeze and thaw for dev / debug kernels only.
+ */
+static int
+sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int error, pid = 0;
+       proc_t p;
+       int freezer_error_code = 0;
+       pid_t pid_list[MAX_XPC_SERVICE_PIDS];
+       int ntasks = 0;
+       coalition_t coal = COALITION_NULL;
+
+       if (memorystatus_freeze_enabled == FALSE) {
+               printf("sysctl_freeze: Freeze is DISABLED\n");
+               return ENOTSUP;
+       }
+
+       error = sysctl_handle_int(oidp, &pid, 0, req);
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       if (pid == VM_PAGES_FOR_ALL_PROCS) {
+               vm_pageout_anonymous_pages();
+
+               return 0;
+       }
+
+       lck_mtx_lock(&freezer_mutex);
+
+again:
+       p = proc_find(pid);
+       if (p != NULL) {
+               uint32_t purgeable, wired, clean, dirty, shared;
+               uint32_t max_pages = 0, state = 0;
+
+               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                       /*
+                        * Freezer backed by the compressor and swap file(s)
+                        * will hold compressed data.
+                        *
+                        * Set the sysctl kern.memorystatus_freeze_to_memory to true to keep compressed data from
+                        * being swapped out to disk. Note that this disables freezer swap support globally,
+                        * not just for the process being frozen.
+                        *
+                        * We don't care about the global freezer budget or the process's (min/max) budget here.
+                        * The freeze sysctl is meant to force-freeze a process.
+                        *
+                        * We also don't update any global or process stats on this path, so that the jetsam/freeze
+                        * logic remains unaffected. The tasks we're performing here are: freeze the process, set the
+                        * P_MEMSTAT_FROZEN bit, and elevate the process to a higher band (if the freezer is active).
+                        */
+                       max_pages = memorystatus_freeze_pages_max;
+               } else {
+                       /*
+                        * We only have the compressor without any swap.
+                        */
+                       max_pages = UINT32_MAX - 1;
+               }
+
+               proc_list_lock();
+               state = p->p_memstat_state;
+               proc_list_unlock();
+
+               /*
+                * The jetsam path also verifies that the process is a suspended App. We don't care about that here.
+                * We simply ensure that jetsam is not already working on the process and that the process has not
+                * explicitly disabled freezing.
+                */
+               if (state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED)) {
+                       printf("sysctl_freeze: p_memstat_state check failed, process is%s%s%s\n",
+                           (state & P_MEMSTAT_TERMINATED) ? " terminated" : "",
+                           (state & P_MEMSTAT_LOCKED) ? " locked" : "",
+                           (state & P_MEMSTAT_FREEZE_DISABLED) ? " unfreezable" : "");
+
+                       proc_rele(p);
+                       lck_mtx_unlock(&freezer_mutex);
+                       return EPERM;
+               }
+
+               error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */);
+
+               if (error) {
+                       char reason[128] = "unknown error"; /* default so 'reason' is never printed uninitialized */
+                       if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) {
+                               strlcpy(reason, "too much shared memory", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) {
+                               strlcpy(reason, "low private-shared pages ratio", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) {
+                               strlcpy(reason, "no compressor space", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) {
+                               strlcpy(reason, "no swap space", 128);
+                       }
+
+                       printf("sysctl_freeze: task_freeze failed: %s\n", reason);
+
+                       if (error == KERN_NO_SPACE) {
+                               /* Make it easy to distinguish between failures due to low compressor/swap space and other failures. */
+                               error = ENOSPC;
+                       } else {
+                               error = EIO;
+                       }
+               } else {
+                       proc_list_lock();
+                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) {
+                               p->p_memstat_state |= P_MEMSTAT_FROZEN;
+                               memorystatus_frozen_count++;
+                       }
+                       p->p_memstat_frozen_count++;
+
+                       proc_list_unlock();
+
+                       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                               /*
+                                * We elevate only if we are going to swap out the data.
+                                */
+                               error = memorystatus_update_inactive_jetsam_priority_band(pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE,
+                                   memorystatus_freeze_jetsam_band, TRUE);
+
+                               if (error) {
+                                       printf("sysctl_freeze: Elevating frozen process to higher jetsam band failed with %d\n", error);
+                               }
+                       }
+               }
+
+               if ((error == 0) && (coal == NULL)) {
+                       /*
+                        * We froze a process and so we check to see if it was
+                        * a coalition leader and if it has XPC services that
+                        * might need freezing.
+                        * Only one leader can be frozen at a time and so we shouldn't
+                        * enter this block more than once per call. Hence the
+                        * check that 'coal' has to be NULL. We should make this an
+                        * assert() or panic() once we have a much more concrete way
+                        * to detect an app vs a daemon.
+                        */
+
+                       task_t          curr_task = NULL;
+
+                       curr_task = proc_task(p);
+                       coal = task_get_coalition(curr_task, COALITION_TYPE_JETSAM);
+                       if (coalition_is_leader(curr_task, coal)) {
+                               ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_XPC,
+                                   COALITION_SORT_DEFAULT, pid_list, MAX_XPC_SERVICE_PIDS);
+
+                               if (ntasks > MAX_XPC_SERVICE_PIDS) {
+                                       ntasks = MAX_XPC_SERVICE_PIDS;
+                               }
+                       }
+               }
+
+               proc_rele(p);
+
+               while (ntasks) {
+                       pid = pid_list[--ntasks];
+                       goto again;
+               }
+
+               lck_mtx_unlock(&freezer_mutex);
+               return error;
+       } else {
+               printf("sysctl_freeze: Invalid process\n");
+       }
+
+       lck_mtx_unlock(&freezer_mutex);
+       return EINVAL;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+    0, 0, &sysctl_memorystatus_freeze, "I", "");
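+
+/*
+ * Illustrative usage (assumes the standard sysctl(8) CLI; dev/debug kernels
+ * only, per the comment above):
+ *
+ *   sysctl -w kern.memorystatus_freeze=<pid>   # force-freeze one process
+ *   sysctl -w kern.memorystatus_freeze=2       # VM_PAGES_FOR_ALL_PROCS: page out
+ *                                              # anonymous pages for all processes
+ */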
+
+/*
+ * Manual trigger of aggressive frozen demotion for dev / debug kernels only.
+ */
+static int
+sysctl_memorystatus_demote_frozen_process SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp, req)
+       memorystatus_demote_frozen_processes(false);
+       return 0;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, memorystatus_demote_frozen_processes, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_memorystatus_demote_frozen_process, "I", "");
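+
+/*
+ * Illustrative usage (assumes the standard sysctl(8) CLI): the handler ignores
+ * the written value and simply runs one demotion pass, e.g.
+ *
+ *   sysctl -w kern.memorystatus_demote_frozen_processes=1
+ */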
+
+static int
+sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+
+       int error, pid = 0;
+       proc_t p;
+
+       if (memorystatus_freeze_enabled == FALSE) {
+               return ENOTSUP;
+       }
+
+       error = sysctl_handle_int(oidp, &pid, 0, req);
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       if (pid == VM_PAGES_FOR_ALL_PROCS) {
+               do_fastwake_warmup_all();
+               return 0;
+       } else {
+               p = proc_find(pid);
+               if (p != NULL) {
+                       error = task_thaw(p->task);
+
+                       if (error) {
+                               error = EIO;
+                       } else {
+                               /*
+                                * task_thaw() succeeded.
+                                *
+                                * We increment memorystatus_frozen_count on the sysctl freeze path.
+                                * And so we need the P_MEMSTAT_FROZEN to decrement the frozen count
+                                * when this process exits.
+                                *
+                                * proc_list_lock();
+                                * p->p_memstat_state &= ~P_MEMSTAT_FROZEN;
+                                * proc_list_unlock();
+                                */
+                       }
+                       proc_rele(p);
+                       return error;
+               }
+       }
+
+       return EINVAL;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+    0, 0, &sysctl_memorystatus_available_pages_thaw, "I", "");
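+
+/*
+ * Illustrative usage (assumes the standard sysctl(8) CLI; dev/debug kernels
+ * only):
+ *
+ *   sysctl -w kern.memorystatus_thaw=<pid>   # thaw one frozen process
+ *   sysctl -w kern.memorystatus_thaw=2       # VM_PAGES_FOR_ALL_PROCS: fastwake warmup of all frozen processes
+ */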
+
+typedef struct _global_freezable_status {
+       boolean_t       freeze_pages_threshold_crossed;
+       boolean_t       freeze_eligible_procs_available;
+       boolean_t       freeze_scheduled_in_future;
+} global_freezable_status_t;
+
+typedef struct _proc_freezable_status {
+       boolean_t    freeze_has_memstat_state;
+       boolean_t    freeze_has_pages_min;
+       int        freeze_has_probability;
+       int        freeze_leader_eligible;
+       boolean_t    freeze_attempted;
+       uint32_t    p_memstat_state;
+       uint32_t    p_pages;
+       int        p_freeze_error_code;
+       int        p_pid;
+       int        p_leader_pid;
+       char        p_name[MAXCOMLEN + 1];
+} proc_freezable_status_t;
+
+#define MAX_FREEZABLE_PROCESSES 200 /* Total # of processes in band 0 that we evaluate for freezability */
+
+/*
+ * For coalition based freezing evaluations, we proceed as follows:
+ *  - detect that the process is a coalition member and a XPC service
+ *  - mark its 'freeze_leader_eligible' field with FREEZE_PROC_LEADER_FREEZABLE_UNKNOWN
+ *  - continue its freezability evaluation assuming its leader will be freezable too
+ *
+ * Once we are done evaluating all processes, we do a quick run through all
+ * processes and for a coalition member XPC service we look up the 'freezable'
+ * status of its leader and iff:
+ *  - the xpc service is freezable i.e. its individual freeze evaluation worked
+ *  - and, its leader is also marked freezable
+ * we update its 'freeze_leader_eligible' to FREEZE_PROC_LEADER_FREEZABLE_SUCCESS.
+ */
+
+#define FREEZE_PROC_LEADER_FREEZABLE_UNKNOWN   (-1)
+#define FREEZE_PROC_LEADER_FREEZABLE_SUCCESS    (1)
+#define FREEZE_PROC_LEADER_FREEZABLE_FAILURE    (2)
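+
+/*
+ * Worked example (illustrative; the pids are hypothetical): consider an app
+ * leader L (pid 100) with an XPC service X (pid 200) in its jetsam coalition.
+ *
+ *   Pass 1: X is detected as a coalition-member XPC service, so its entry gets
+ *           freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_UNKNOWN and
+ *           p_leader_pid = 100, and X's own eligibility is evaluated normally.
+ *   Pass 2: L is looked up. If L is already frozen, or L's freeze evaluation
+ *           succeeded, X is promoted to FREEZE_PROC_LEADER_FREEZABLE_SUCCESS;
+ *           otherwise X gets FREEZE_PROC_LEADER_FREEZABLE_FAILURE and
+ *           FREEZER_ERROR_GENERIC.
+ */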
+
+static int
+memorystatus_freezer_get_status(user_addr_t buffer, size_t buffer_size, int32_t *retval)
+{
+       uint32_t            proc_count = 0, freeze_eligible_proc_considered = 0, band = 0, xpc_index = 0, leader_index = 0;
+       global_freezable_status_t    *list_head;
+       proc_freezable_status_t     *list_entry, *list_entry_start;
+       size_t                list_size = 0;
+       proc_t                p, leader_proc;
+       memstat_bucket_t        *bucket;
+       uint32_t            state = 0, pages = 0, entry_count = 0;
+       boolean_t            try_freeze = TRUE, xpc_skip_size_probability_check = FALSE;
+       int                error = 0, probability_of_use = 0;
+       pid_t              leader_pid = 0;
+
+       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
+               return ENOTSUP;
+       }
+
+       list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES);
+
+       if (buffer_size < list_size) {
+               return EINVAL;
+       }
+
+       list_head = (global_freezable_status_t*)kalloc(list_size);
+       if (list_head == NULL) {
+               return ENOMEM;
+       }
+
+       memset(list_head, 0, list_size);
+
+       list_size = sizeof(global_freezable_status_t);
+
+       proc_list_lock();
+
+       uint64_t curr_time = mach_absolute_time();
+
+       list_head->freeze_pages_threshold_crossed = (memorystatus_available_pages < memorystatus_freeze_threshold);
+       list_head->freeze_eligible_procs_available = ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold);
+       list_head->freeze_scheduled_in_future = (curr_time < memorystatus_freezer_thread_next_run_ts);
+
+       list_entry_start = (proc_freezable_status_t*) ((uintptr_t)list_head + sizeof(global_freezable_status_t));
+       list_entry = list_entry_start;
+
+       bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
+
+       entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t));
+
+       p = memorystatus_get_first_proc_locked(&band, FALSE);
+       proc_count++;
+
+       while ((proc_count <= MAX_FREEZABLE_PROCESSES) &&
+           (p) &&
+           (list_size < buffer_size)) {
+               if (isSysProc(p)) {
+                       /*
+                        * Daemon: we will consider freezing it iff:
+                        * - it belongs to a coalition and the leader is freeze-eligible (delayed evaluation)
+                        * - its role in the coalition is XPC service.
+                        *
+                        * We skip memory size requirements in this case.
+                        */
+
+                       coalition_t     coal = COALITION_NULL;
+                       task_t          leader_task = NULL, curr_task = NULL;
+                       int             task_role_in_coalition = 0;
+
+                       curr_task = proc_task(p);
+                       coal = task_get_coalition(curr_task, COALITION_TYPE_JETSAM);
+
+                       if (coal == COALITION_NULL || coalition_is_leader(curr_task, coal)) {
+                               /*
+                                * By default, XPC services without an app
+                                * will be the leader of their own single-member
+                                * coalition.
+                                */
+                               goto skip_ineligible_xpc;
+                       }
+
+                       leader_task = coalition_get_leader(coal);
+                       if (leader_task == TASK_NULL) {
+                               /*
+                                * This jetsam coalition is currently leader-less.
+                                * This could happen if the app died, but XPC services
+                                * have not yet exited.
+                                */
+                               goto skip_ineligible_xpc;
+                       }
+
+                       leader_proc = (proc_t)get_bsdtask_info(leader_task);
+                       task_deallocate(leader_task);
+
+                       if (leader_proc == PROC_NULL) {
+                               /* leader task is exiting */
+                               goto skip_ineligible_xpc;
+                       }
+
+                       task_role_in_coalition = i_coal_jetsam_get_taskrole(coal, curr_task);
+
+                       if (task_role_in_coalition == COALITION_TASKROLE_XPC) {
+                               xpc_skip_size_probability_check = TRUE;
+                               leader_pid = leader_proc->p_pid;
+                               goto continue_eval;
+                       }
+
+skip_ineligible_xpc:
+                       p = memorystatus_get_next_proc_locked(&band, p, FALSE);
+                       proc_count++;
+                       continue;
+               }
+
+continue_eval:
+               strlcpy(list_entry->p_name, p->p_name, MAXCOMLEN + 1);
+
+               list_entry->p_pid = p->p_pid;
+
+               state = p->p_memstat_state;
+
+               if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) ||
+                   !(state & P_MEMSTAT_SUSPENDED)) {
+                       try_freeze = list_entry->freeze_has_memstat_state = FALSE;
+               } else {
+                       try_freeze = list_entry->freeze_has_memstat_state = TRUE;
+               }
+
+               list_entry->p_memstat_state = state;
+
+               if (xpc_skip_size_probability_check == TRUE) {
+                       /*
+                        * Assuming the coalition leader is freezable,
+                        * we don't care about minimum pages or probability
+                        * as long as the process isn't marked P_MEMSTAT_FREEZE_DISABLED.
+                        * XPC services have to be explicitly opted out via that
+                        * disabled state, and we checked for that state above.
+                        */
+                       list_entry->freeze_has_pages_min = TRUE;
+                       list_entry->p_pages = -1;
+                       list_entry->freeze_has_probability = -1;
+
+                       list_entry->freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_UNKNOWN;
+                       list_entry->p_leader_pid = leader_pid;
+
+                       xpc_skip_size_probability_check = FALSE;
+               } else {
+                       list_entry->freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_SUCCESS; /* Apps are freeze eligible and their own leaders. */
+                       list_entry->p_leader_pid = 0; /* Setting this to 0 signifies this isn't a coalition driven freeze. */
+
+                       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
+                       if (pages < memorystatus_freeze_pages_min) {
+                               try_freeze = list_entry->freeze_has_pages_min = FALSE;
+                       } else {
+                               list_entry->freeze_has_pages_min = TRUE;
+                       }
+
+                       list_entry->p_pages = pages;
+
+                       if (entry_count) {
+                               uint32_t j = 0;
+                               for (j = 0; j < entry_count; j++) {
+                                       if (strncmp(memorystatus_global_probabilities_table[j].proc_name,
+                                           p->p_name,
+                                           MAXCOMLEN + 1) == 0) {
+                                               probability_of_use = memorystatus_global_probabilities_table[j].use_probability;
+                                               break;
+                                       }
+                               }
+
+                               list_entry->freeze_has_probability = probability_of_use;
+
+                               try_freeze = ((probability_of_use > 0) && try_freeze);
+                       } else {
+                               list_entry->freeze_has_probability = -1;
+                       }
+               }
+
+               if (try_freeze) {
+                       uint32_t purgeable, wired, clean, dirty, shared;
+                       uint32_t max_pages = 0;
+                       int freezer_error_code = 0;
+
+                       error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, TRUE /* eval only */);
+
+                       if (error) {
+                               list_entry->p_freeze_error_code = freezer_error_code;
+                       }
+
+                       list_entry->freeze_attempted = TRUE;
+               }
+
+               list_entry++;
+               freeze_eligible_proc_considered++;
+
+               list_size += sizeof(proc_freezable_status_t);
+
+               p = memorystatus_get_next_proc_locked(&band, p, FALSE);
+               proc_count++;
+       }
+
+       proc_list_unlock();
+
+       list_entry = list_entry_start;
+
+       for (xpc_index = 0; xpc_index < freeze_eligible_proc_considered; xpc_index++) {
+               if (list_entry[xpc_index].freeze_leader_eligible == FREEZE_PROC_LEADER_FREEZABLE_UNKNOWN) {
+                       leader_pid = list_entry[xpc_index].p_leader_pid;
+
+                       leader_proc = proc_find(leader_pid);
+
+                       if (leader_proc) {
+                               if (leader_proc->p_memstat_state & P_MEMSTAT_FROZEN) {
+                                       /*
+                                        * Leader has already been frozen.
+                                        */
+                                       list_entry[xpc_index].freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_SUCCESS;
+                                       proc_rele(leader_proc);
+                                       continue;
+                               }
+                               proc_rele(leader_proc);
+                       }
+
+                       for (leader_index = 0; leader_index < freeze_eligible_proc_considered; leader_index++) {
+                               if (list_entry[leader_index].p_pid == leader_pid) {
+                                       if (list_entry[leader_index].freeze_attempted && list_entry[leader_index].p_freeze_error_code == 0) {
+                                               list_entry[xpc_index].freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_SUCCESS;
+                                       } else {
+                                               list_entry[xpc_index].freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_FAILURE;
+                                               list_entry[xpc_index].p_freeze_error_code = FREEZER_ERROR_GENERIC;
+                                       }
+                                       break;
+                               }
+                       }
+
+                       /*
+                        * Didn't find the leader entry. This is likely because
+                        * the leader never made it down to band 0.
+                        */
+                       if (leader_index == freeze_eligible_proc_considered) {
+                               list_entry[xpc_index].freeze_leader_eligible = FREEZE_PROC_LEADER_FREEZABLE_FAILURE;
+                               list_entry[xpc_index].p_freeze_error_code = FREEZER_ERROR_GENERIC;
+                       }
+               }
+       }
+
+       buffer_size = list_size;
+
+       error = copyout(list_head, buffer, buffer_size);
+       if (error == 0) {
+               *retval = buffer_size;
+       } else {
+               *retval = 0;
+       }
+
+       list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES);
+       kfree(list_head, list_size);
+
+       MEMORYSTATUS_DEBUG(1, "memorystatus_freezer_get_status: returning %d (%lu - size)\n", error, (unsigned long)buffer_size);
+
+       return error;
+}
+
+int
+memorystatus_freezer_control(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval)
+{
+       int err = ENOTSUP;
+
+       if (flags == FREEZER_CONTROL_GET_STATUS) {
+               err = memorystatus_freezer_get_status(buffer, buffer_size, retval);
+       }
+
+       return err;
+}
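+
+/*
+ * Userspace sketch (illustrative, not part of this file): on dev/debug kernels
+ * the status buffer above is reachable through the private memorystatus_control()
+ * wrapper; the wrapper signature and MEMORYSTATUS_CMD_FREEZER_CONTROL command
+ * are assumptions based on the private <sys/kern_memorystatus.h> interface.
+ *
+ *   size_t sz = sizeof(global_freezable_status_t) +
+ *       (MAX_FREEZABLE_PROCESSES * sizeof(proc_freezable_status_t));
+ *   void *buf = calloc(1, sz);
+ *   int bytes = memorystatus_control(MEMORYSTATUS_CMD_FREEZER_CONTROL, 0,
+ *       FREEZER_CONTROL_GET_STATUS, buf, sz);
+ *   // On success, 'bytes' is the populated size: one global_freezable_status_t
+ *   // header followed by a proc_freezable_status_t per evaluated process.
+ */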
+
+#endif /* DEVELOPMENT || DEBUG */
+
+extern void        vm_swap_consider_defragmenting(int);
+extern boolean_t memorystatus_kill_elevated_process(uint32_t, os_reason_t, unsigned int, int, uint32_t *, uint64_t *);
+
+/*
+ * This routine will _jetsam_ all frozen processes
+ * and reclaim the swap space immediately.
+ *
+ * So freeze has to be DISABLED when we call this routine.
+ */
+
+void
+memorystatus_disable_freeze(void)
+{
+       memstat_bucket_t *bucket;
+       int bucket_count = 0, retries = 0;
+       boolean_t retval = FALSE, killed = FALSE;
+       uint32_t errors = 0, errors_over_prev_iteration = 0;
+       os_reason_t jetsam_reason = 0;
+       unsigned int band = 0;
+       proc_t p = PROC_NULL, next_p = PROC_NULL;
+       uint64_t memory_reclaimed = 0, footprint = 0;
+
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE_DISABLE) | DBG_FUNC_START,
+           memorystatus_available_pages, 0, 0, 0, 0);
+
+       assert(memorystatus_freeze_enabled == FALSE);
+
+       jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE);
+       if (jetsam_reason == OS_REASON_NULL) {
+               printf("memorystatus_disable_freeze: failed to allocate jetsam reason\n");
+       }
+
+       /*
+        * Let's relocate all frozen processes into band 8. Demoted frozen processes
+        * are sitting in band 0 currently and it's possible to have a frozen process
+        * in the FG band being actively used. We don't reset its frozen state when
+        * it is resumed because it has state on disk.
+        *
+        * We choose to do this relocation rather than implement a new 'kill frozen'
+        * process function for these reasons:
+        * - duplication of code: too many kill functions exist and we need to rework them better.
+        * - disk-space-shortage kills are rare
+        * - not having the 'real' jetsam band at the time of this frozen kill won't preclude us
+        *   from answering any important questions re. jetsam policy/effectiveness.
+        *
+        * This is essentially what memorystatus_update_inactive_jetsam_priority_band() does while
+        * avoiding the application of memory limits.
+        */
+
+again:
+       proc_list_lock();
+
+       band = JETSAM_PRIORITY_IDLE;
+       p = PROC_NULL;
+       next_p = PROC_NULL;
+
+       next_p = memorystatus_get_first_proc_locked(&band, TRUE);
+       while (next_p) {
+               p = next_p;
+               next_p = memorystatus_get_next_proc_locked(&band, p, TRUE);
+
+               if (p->p_memstat_effectivepriority > JETSAM_PRIORITY_FOREGROUND) {
+                       break;
+               }
+
+               if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) {
+                       continue;
+               }
+
+               if (p->p_memstat_state & P_MEMSTAT_ERROR) {
+                       p->p_memstat_state &= ~P_MEMSTAT_ERROR;
+               }
+
+               if (p->p_memstat_effectivepriority == memorystatus_freeze_jetsam_band) {
+                       continue;
+               }
+
+               /*
+                * We explicitly add this flag here so the process looks like a normal
+                * frozen process i.e. P_MEMSTAT_FROZEN and P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND.
+                * We don't bother with assigning the 'active' memory
+                * limits at this point because we are going to be killing it soon below.
+                */
+               p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
+               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+
+               memorystatus_update_priority_locked(p, memorystatus_freeze_jetsam_band, FALSE, TRUE);
+       }
+
+       bucket = &memstat_bucket[memorystatus_freeze_jetsam_band];
+       bucket_count = bucket->count;
+       proc_list_unlock();
+
+       /*
+        * Bucket count is already stale at this point. But, we don't expect
+        * freezing to continue since we have already disabled the freeze functionality.
+        * However, an existing freeze might be in progress. So we might miss that process
+        * in the first go-around. We hope to catch it in the next.
+        */
+
+       errors_over_prev_iteration = 0;
+       while (bucket_count) {
+               bucket_count--;
+
+               /*
+                * memorystatus_kill_elevated_process() drops a reference,
+                * so take another one so we can continue to use this exit reason
+                * even after it returns.
+                */
+
+               os_reason_ref(jetsam_reason);
+               retval = memorystatus_kill_elevated_process(
+                       kMemorystatusKilledDiskSpaceShortage,
+                       jetsam_reason,
+                       memorystatus_freeze_jetsam_band,
+                       0,                             /* the iteration of aggressive jetsam; ignored here */
+                       &errors,
+                       &footprint);
+
+               if (errors > 0) {
+                       printf("memorystatus_disable_freeze: memorystatus_kill_elevated_process returned %d error(s)\n", errors);
+                       errors_over_prev_iteration += errors;
+                       errors = 0;
+               }
+
+               if (retval == 0) {
+                       /*
+                        * No frozen processes left to kill.
+                        */
+                       break;
+               }
+
+               killed = TRUE;
+               memory_reclaimed += footprint;
+       }
+
+       proc_list_lock();
+
+       if (memorystatus_frozen_count) {
+               /*
+                * A frozen process snuck in and so
+                * go back around to kill it. That
+                * process may have been resumed and
+                * put into the FG band too. So we
+                * have to do the relocation again.
+                */
+               assert(memorystatus_freeze_enabled == FALSE);
+
+               retries++;
+               if (retries < 3) {
+                       proc_list_unlock();
+                       goto again;
+               }
+#if DEVELOPMENT || DEBUG
+               panic("memorystatus_disable_freeze: Failed to kill all frozen processes, memorystatus_frozen_count = %d, errors = %d",
+                   memorystatus_frozen_count, errors_over_prev_iteration);
+#endif /* DEVELOPMENT || DEBUG */
+       }
+       proc_list_unlock();
+
+       os_reason_free(jetsam_reason);
+
+       if (killed) {
+               vm_swap_consider_defragmenting(VM_SWAP_FLAGS_FORCE_DEFRAG | VM_SWAP_FLAGS_FORCE_RECLAIM);
+
+               proc_list_lock();
+               size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
+                   sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
+               uint64_t timestamp_now = mach_absolute_time();
+               memorystatus_jetsam_snapshot->notification_time = timestamp_now;
+               memorystatus_jetsam_snapshot->js_gencount++;
+               if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
+                   timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
+                       proc_list_unlock();
+                       int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
+                       if (!ret) {
+                               proc_list_lock();
+                               memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
+                               proc_list_unlock();
+                       }
+               } else {
+                       proc_list_unlock();
+               }
+       }
+
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE_DISABLE) | DBG_FUNC_END,
+           memorystatus_available_pages, memory_reclaimed, 0, 0, 0);
+
+       return;
+}
+
+__private_extern__ void
+memorystatus_freeze_init(void)
+{
+       kern_return_t result;
+       thread_t thread;
+
+       freezer_lck_grp_attr = lck_grp_attr_alloc_init();
+       freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr);
+
+       lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL);
+
+       /*
+        * This is just the default value if the underlying
+        * storage device doesn't have any specific budget.
+        * We check with the storage layer in memorystatus_freeze_update_throttle()
+        * before we start freezing for the first time.
+        */
+       memorystatus_freeze_budget_pages_remaining = (memorystatus_freeze_daily_mb_max * 1024 * 1024) / PAGE_SIZE;
+
+       result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread);
+       if (result == KERN_SUCCESS) {
+               proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);
+               proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
+               thread_set_thread_name(thread, "VM_freezer");
+
+               thread_deallocate(thread);
+       } else {
+               panic("Could not create memorystatus_freeze_thread");
+       }
+}
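+
+/*
+ * Budget sizing example (illustrative figures, not defaults taken from this
+ * file): with memorystatus_freeze_daily_mb_max = 1024 and a 16 KB page size,
+ * the fallback computed in memorystatus_freeze_init() above is
+ *
+ *   (1024 * 1024 * 1024) / 16384 = 65536 pages
+ *
+ * and it only applies when the storage layer reports no budget of its own.
+ */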
+
+static boolean_t
+memorystatus_is_process_eligible_for_freeze(proc_t p)
+{
+       /*
+        * Called with proc_list_lock held.
+        */
+
+       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+
+       boolean_t should_freeze = FALSE;
+       uint32_t state = 0, entry_count = 0, pages = 0, i = 0;
+       int probability_of_use = 0;
+
+       state = p->p_memstat_state;
+
+       if (state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) {
+               goto out;
+       }
+
+       if (isSysProc(p)) {
+               /*
+                * Daemon: we consider freezing it if:
+                * - it belongs to a coalition and the leader is frozen, and,
+                * - its role in the coalition is XPC service.
+                *
+                * We skip memory size requirements in this case.
+                */
+
+               coalition_t     coal = COALITION_NULL;
+               task_t          leader_task = NULL, curr_task = NULL;
+               proc_t          leader_proc = NULL;
+               int             task_role_in_coalition = 0;
+
+               curr_task = proc_task(p);
+               coal = task_get_coalition(curr_task, COALITION_TYPE_JETSAM);
+
+               if (coal == NULL || coalition_is_leader(curr_task, coal)) {
+                       /*
+                        * By default, XPC services without an app
+                        * will be the leader of their own single-member
+                        * coalition.
+                        */
+                       goto out;
+               }
+
+               leader_task = coalition_get_leader(coal);
+               if (leader_task == TASK_NULL) {
+                       /*
+                        * This jetsam coalition is currently leader-less.
+                        * This could happen if the app died, but XPC services
+                        * have not yet exited.
+                        */
+                       goto out;
+               }
+
+               leader_proc = (proc_t)get_bsdtask_info(leader_task);
+               task_deallocate(leader_task);
+
+               if (leader_proc == PROC_NULL) {
+                       /* leader task is exiting */
+                       goto out;
+               }
+
+               if (!(leader_proc->p_memstat_state & P_MEMSTAT_FROZEN)) {
+                       goto out;
+               }
+
+               task_role_in_coalition = i_coal_jetsam_get_taskrole(coal, curr_task);
+
+               if (task_role_in_coalition == COALITION_TASKROLE_XPC) {
+                       should_freeze = TRUE;
+               }
+
+               goto out;
+       } else {
+               /*
+                * Application. In addition to the above states we need to make
+                * sure we only consider suspended applications for freezing.
+                */
+               if (!(state & P_MEMSTAT_SUSPENDED)) {
+                       goto out;
+               }
+       }
+
+       /* Only freeze applications meeting our minimum resident page criteria */
+       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
+       if (pages < memorystatus_freeze_pages_min) {
+               goto out;
+       }
+
+       /* Don't freeze a process that is already exiting on core. It may have started exiting
+        * after we chose it for freeze, but before we obtained the proc_list_lock.
+        * NB: This is only possible if we're coming in from memorystatus_freeze_process_sync.
+        * memorystatus_freeze_top_process holds the proc_list_lock while it traverses the bands.
+        */
+       if ((p->p_listflag & P_LIST_EXITED) != 0) {
+               goto out;
+       }
+
+       entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t));
+
+       if (entry_count) {
+               for (i = 0; i < entry_count; i++) {
+                       if (strncmp(memorystatus_global_probabilities_table[i].proc_name,
+                           p->p_name,
+                           MAXCOMLEN + 1) == 0) {
+                               probability_of_use = memorystatus_global_probabilities_table[i].use_probability;
+                               break;
+                       }
+               }
+
+               if (probability_of_use == 0) {
+                       goto out;
+               }
+       }
+
+       should_freeze = TRUE;
+out:
+       return should_freeze;
+}
+
+/*
+ * Synchronously freeze the passed proc. Called with a reference to the proc held.
+ *
+ * Doesn't deal with:
+ * - re-freezing because this is called on a specific process and
+ *   not by the freezer thread. If that changes, we'll have to teach it about
+ *   refreezing a frozen process.
+ *
+ * - grouped/coalition freezing because we are hoping to deprecate this
+ *   interface as it was used by user-space to freeze particular processes. But
+ *   we have moved away from that approach to having the kernel choose the optimal
+ *   candidates to be frozen.
+ *
+ * Returns EINVAL or the value returned by task_freeze().
+ */
+int
+memorystatus_freeze_process_sync(proc_t p)
+{
+       int ret = EINVAL;
+       pid_t aPid = 0;
+       boolean_t memorystatus_freeze_swap_low = FALSE;
+       int    freezer_error_code = 0;
+
+       lck_mtx_lock(&freezer_mutex);
+
+       if (p == NULL) {
+               printf("memorystatus_freeze_process_sync: Invalid process\n");
+               goto exit;
+       }
+
+       if (memorystatus_freeze_enabled == FALSE) {
+               printf("memorystatus_freeze_process_sync: Freezing is DISABLED\n");
+               goto exit;
+       }
+
+       if (!memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
+               printf("memorystatus_freeze_process_sync: Low compressor and/or low swap space...skipping freeze\n");
+               goto exit;
+       }
+
+       memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining);
+       if (!memorystatus_freeze_budget_pages_remaining) {
+               printf("memorystatus_freeze_process_sync: exit with NO available budget\n");
+               goto exit;
+       }
+
+       proc_list_lock();
+
+       if (p != NULL) {
+               uint32_t purgeable, wired, clean, dirty, shared;
+               uint32_t max_pages, i;
+
+               aPid = p->p_pid;
+
+               /* Ensure the process is eligible for freezing */
+               if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) {
+                       proc_list_unlock();
+                       goto exit;
+               }
+
+               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                       max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining);
+               } else {
+                       /*
+                        * We only have the compressor without any swap.
+                        */
+                       max_pages = UINT32_MAX - 1;
+               }
+
+               /* Mark as locked temporarily to avoid kill */
+               p->p_memstat_state |= P_MEMSTAT_LOCKED;
+               proc_list_unlock();
+
+               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
+                   memorystatus_available_pages, 0, 0, 0, 0);
+
+               ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */);
+
+               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
+                   memorystatus_available_pages, aPid, 0, 0, 0);
+
+               DTRACE_MEMORYSTATUS6(memorystatus_freeze, proc_t, p, unsigned int, memorystatus_available_pages, boolean_t, purgeable, unsigned int, wired, uint32_t, clean, uint32_t, dirty);
+
+               MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_process_sync: task_freeze %s for pid %d [%s] - "
+                   "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n",
+                   (ret == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"),
+                   memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared);
+
+               proc_list_lock();
+
+               if (ret == KERN_SUCCESS) {
+                       memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
+
+                       p->p_memstat_freeze_sharedanon_pages += shared;
+
+                       memorystatus_frozen_shared_mb += shared;
+
+                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) {
+                               p->p_memstat_state |= P_MEMSTAT_FROZEN;
+                               memorystatus_frozen_count++;
+                       }
+
+                       p->p_memstat_frozen_count++;
+
+                       /*
+                        * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process
+                        * to its higher jetsam band.
+                        */
+                       proc_list_unlock();
+
+                       memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
+
+                       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                               ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE,
+                                   memorystatus_freeze_jetsam_band, TRUE);
+
+                               if (ret) {
+                                       printf("Elevating the frozen process failed with %d\n", ret);
+                                       /* not fatal */
+                                       ret = 0;
+                               }
+
+                               proc_list_lock();
+
+                               /* Update stats */
+                               for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
+                                       throttle_intervals[i].pageouts += dirty;
+                               }
+                       } else {
+                               proc_list_lock();
+                       }
+
+                       memorystatus_freeze_pageouts += dirty;
+
+                       if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) {
+                               /*
+                                * Add some eviction logic here? At some point should we
+                                * jetsam a process to get back its swap space so that we
+                                * can freeze a more eligible process at this moment in time?
+                                */
+                       }
+
+                       memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining);
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s] done memorystatus_freeze_budget_pages_remaining %llu froze %u pages",
+                           aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_freeze_budget_pages_remaining, dirty);
+               } else {
+                       char reason[128] = "unknown error"; /* default so 'reason' is never printed uninitialized */
+                       if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) {
+                               strlcpy(reason, "too much shared memory", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) {
+                               strlcpy(reason, "low private-shared pages ratio", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) {
+                               strlcpy(reason, "no compressor space", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) {
+                               strlcpy(reason, "no swap space", 128);
+                       }
+
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s]...skipped (%s)",
+                           aPid, ((p && *p->p_name) ? p->p_name : "unknown"), reason);
+                       p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE;
+               }
+
+               p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
+               wakeup(&p->p_memstat_state);
+               proc_list_unlock();
+       }
+
+exit:
+       lck_mtx_unlock(&freezer_mutex);
+
+       return ret;
+}
+
+static int
+memorystatus_freeze_top_process(void)
+{
+       pid_t aPid = 0, coal_xpc_pid = 0;
+       int ret = -1;
+       proc_t p = PROC_NULL, next_p = PROC_NULL;
+       unsigned int i = 0;
+       unsigned int band = JETSAM_PRIORITY_IDLE;
+       boolean_t refreeze_processes = FALSE;
+       task_t curr_task = NULL;
+       coalition_t coal = COALITION_NULL;
+       pid_t pid_list[MAX_XPC_SERVICE_PIDS];
+       unsigned int    ntasks = 0;
+
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE_SCAN) | DBG_FUNC_START, memorystatus_available_pages, 0, 0, 0, 0);
+
+       proc_list_lock();
+
+       if (memorystatus_frozen_count >= memorystatus_frozen_processes_max) {
+               /*
+                * Freezer is already full but we are here and so let's
+                * try to refreeze any processes we might have thawed
+                * in the past and push their compressed state out.
+                */
+               refreeze_processes = TRUE;
+               band = (unsigned int) memorystatus_freeze_jetsam_band;
+       }
+
+freeze_process:
+
+       next_p = memorystatus_get_first_proc_locked(&band, FALSE);
+       while (next_p) {
+               kern_return_t kr;
+               uint32_t purgeable, wired, clean, dirty, shared;
+               uint32_t max_pages = 0;
+               int    freezer_error_code = 0;
+
+               p = next_p;
+
+               if (coal == NULL) {
+                       next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
+               } else {
+                       /*
+                        * We have frozen a coalition leader and now are
+                        * dealing with its XPC services. We get our
+                        * next_p for each XPC service from the pid_list
+                        * acquired after a successful task_freeze call
+                        * on the coalition leader.
+                        */
+
+                       if (ntasks > 0) {
+                               coal_xpc_pid = pid_list[--ntasks];
+                               next_p = proc_findinternal(coal_xpc_pid, 1 /* proc_list_lock held */);
+                               /*
+                                * We grab a reference when we are about to freeze the process. So, drop
+                                * the reference that proc_findinternal() grabbed for us.
+                                * We also have the proc_list_lock and so this process is stable.
+                                */
+                               if (next_p) {
+                                       proc_rele_locked(next_p);
+                               }
+                       } else {
+                               next_p = NULL;
+                       }
+               }
+
+               aPid = p->p_pid;
+
+               if (p->p_memstat_effectivepriority != (int32_t) band) {
+                       /*
+                        * We shouldn't be freezing processes outside the
+                        * prescribed band.
+                        */
+                       break;
+               }
+
+               /* Ensure the process is eligible for (re-)freezing */
+               if (refreeze_processes) {
+                       /*
+                        * Has to have been frozen once before.
+                        */
+                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) {
+                               continue;
+                       }
+
+                       /*
+                        * Has to have been resumed once before.
+                        */
+                       if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == FALSE) {
+                               continue;
+                       }
+
+                       /*
+                        * Not currently being looked at for something.
+                        */
+                       if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
+                               continue;
+                       }
+
+                       /*
+                        * We are going to try and refreeze and so re-evaluate
+                        * the process. We don't want to double count the shared
+                        * memory. So deduct the old snapshot here.
+                        */
+                       memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
+                       p->p_memstat_freeze_sharedanon_pages = 0;
+
+                       p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
+                       memorystatus_refreeze_eligible_count--;
+               } else {
+                       if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) {
+                               continue; // with lock held
+                       }
+               }
+
+               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                       /*
+                        * Freezer backed by the compressor and swap file(s)
+                        * will hold compressed data.
+                        */
+
+                       max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining);
+               } else {
+                       /*
+                        * We only have the compressor pool.
+                        */
+                       max_pages = UINT32_MAX - 1;
+               }
+
+               /* Mark as locked temporarily to avoid kill */
+               p->p_memstat_state |= P_MEMSTAT_LOCKED;
+
+               p = proc_ref_locked(p);
+               if (!p) {
+                       break;
+               }
+
+               proc_list_unlock();
+
+               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
+                   memorystatus_available_pages, 0, 0, 0, 0);
+
+               kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */);
+
+               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
+                   memorystatus_available_pages, aPid, 0, 0, 0);
+
+               MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - "
+                   "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n",
+                   (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"),
+                   memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared);
+
+               proc_list_lock();
+
+               /* Success? */
+               if (KERN_SUCCESS == kr) {
+                       memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
+
+                       p->p_memstat_freeze_sharedanon_pages += shared;
+
+                       memorystatus_frozen_shared_mb += shared;
+
+                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) {
+                               p->p_memstat_state |= P_MEMSTAT_FROZEN;
+                               memorystatus_frozen_count++;
+                       }
+
+                       p->p_memstat_frozen_count++;
+
+                       /*
+                        * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process
+                        * to its higher jetsam band.
+                        */
+                       proc_list_unlock();
+
+                       memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
+
+                       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                               ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, memorystatus_freeze_jetsam_band, TRUE);
+
+                               if (ret) {
+                                       printf("Elevating the frozen process failed with %d\n", ret);
+                                       /* not fatal */
+                                       ret = 0;
+                               }
+
+                               proc_list_lock();
+
+                               /* Update stats */
+                               for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
+                                       throttle_intervals[i].pageouts += dirty;
+                               }
+                       } else {
+                               proc_list_lock();
+                       }
+
+                       memorystatus_freeze_pageouts += dirty;
+
+                       if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) {
+                               /*
+                                * Add some eviction logic here? At some point should we
+                                * jetsam a process to get back its swap space so that we
+                                * can freeze a more eligible process at this moment in time?
+                                */
+                       }
+
+                       memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining);
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: %sfreezing (%s) pid %d [%s] done, memorystatus_freeze_budget_pages_remaining %llu %sfroze %u pages\n",
+                           refreeze_processes? "re" : "", (coal == NULL ? "general" : "coalition-driven"), aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_freeze_budget_pages_remaining, refreeze_processes? "Re" : "", dirty);
+
+                       /* Return KERN_SUCCESS */
+                       ret = kr;
+
+                       /*
+                        * We froze a process successfully. We can stop now
+                        * and see if that helped if this process isn't part
+                        * of a coalition.
+                        *
+                        * Else:
+                        * - if it is a leader, get the list of XPC services
+                        *   that need to be frozen.
+                        * - if it is an XPC service whose leader was frozen
+                        *   here, continue on to the next XPC service in the list.
+                        */
+
+                       if (coal == NULL) {
+                               curr_task = proc_task(p);
+                               coal = task_get_coalition(curr_task, COALITION_TYPE_JETSAM);
+                               if (coalition_is_leader(curr_task, coal)) {
+                                       ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_XPC,
+                                           COALITION_SORT_DEFAULT, pid_list, MAX_XPC_SERVICE_PIDS);
+
+                                       if (ntasks > MAX_XPC_SERVICE_PIDS) {
+                                               ntasks = MAX_XPC_SERVICE_PIDS;
+                                       }
+                               }
+
+                               next_p = NULL;
+
+                               if (ntasks > 0) {
+                                       /*
+                                        * Start off with our first next_p in this list.
+                                        */
+                                       coal_xpc_pid = pid_list[--ntasks];
+                                       next_p = proc_findinternal(coal_xpc_pid, 1 /* proc_list_lock held */);
+
+                                       /*
+                                        * We grab a reference when we are about to freeze the process. So drop
+                                        * the reference that proc_findinternal() grabbed for us.
+                                        * We also have the proc_list_lock and so this process is stable.
+                                        */
+                                       if (next_p) {
+                                               proc_rele_locked(next_p);
+                                       }
+                               }
+                       }
+
+                       p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
+                       wakeup(&p->p_memstat_state);
+                       proc_rele_locked(p);
+
+                       if (coal && next_p) {
+                               continue;
+                       }
+
+                       /*
+                        * No coalition leader was frozen. So we don't
+                        * need to evaluate any XPC services.
+                        *
+                        * OR
+                        *
+                        * We have frozen all eligible XPC services for
+                        * the current coalition leader.
+                        *
+                        * Either way, we can break here and see if freezing
+                        * helped.
+                        */
+
+                       break;
+               } else {
+                       p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
+                       wakeup(&p->p_memstat_state);
+
+                       if (refreeze_processes == TRUE) {
+                               if ((freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) ||
+                                   (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO)) {
+                                       /*
+                                        * Keeping this prior-frozen process in this high band when
+                                        * we failed to re-freeze it due to bad shared memory usage
+                                        * could cause excessive pressure on the lower bands.
+                                        * We need to demote it for now. It'll get re-evaluated next
+                                        * time because we don't set the P_MEMSTAT_FREEZE_IGNORE
+                                        * bit.
+                                        */
+
+                                       p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
+                                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+                                       memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, TRUE, TRUE);
+                               }
+                       } else {
+                               p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE;
+                       }
+
+                       char reason[128] = "unknown error";
+                       if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) {
+                               strlcpy(reason, "too much shared memory", sizeof(reason));
+                       } else if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) {
+                               strlcpy(reason, "low private-shared pages ratio", sizeof(reason));
+                       } else if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) {
+                               strlcpy(reason, "no compressor space", sizeof(reason));
+                       } else if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) {
+                               strlcpy(reason, "no swap space", sizeof(reason));
+                       }
+
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (%s) pid %d [%s]...skipped (%s)\n",
+                           (coal == NULL ? "general" : "coalition-driven"), aPid, ((p && *p->p_name) ? p->p_name : "unknown"), reason);
+
+                       proc_rele_locked(p);
+
+                       if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
+                               break;
+                       }
+               }
+       }
+
+       if ((ret == -1) &&
+           (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD) &&
+           (refreeze_processes == FALSE)) {
+               /*
+                * We failed to freeze a process from the IDLE
+                * band AND we have some thawed processes
+                * AND haven't tried refreezing yet.
+                * Let's try and re-freeze processes in the
+                * frozen band that have been resumed in the past
+                * and so have brought in state from disk.
+                */
+
+               band = (unsigned int) memorystatus_freeze_jetsam_band;
+
+               refreeze_processes = TRUE;
+
+               goto freeze_process;
+       }
+
+       proc_list_unlock();
+
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE_SCAN) | DBG_FUNC_END, memorystatus_available_pages, aPid, 0, 0, 0);
+
+       return ret;
+}
+
+static inline boolean_t
+memorystatus_can_freeze_processes(void)
+{
+       boolean_t ret;
+
+       proc_list_lock();
+
+       if (memorystatus_suspended_count) {
+               memorystatus_freeze_suspended_threshold = MIN(memorystatus_freeze_suspended_threshold, FREEZE_SUSPENDED_THRESHOLD_DEFAULT);
+
+               if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) {
+                       ret = TRUE;
+               } else {
+                       ret = FALSE;
+               }
+       } else {
+               ret = FALSE;
+       }
+
+       proc_list_unlock();
+
+       return ret;
+}
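
As a worked example of the check above (the numbers are illustrative, and FREEZE_SUSPENDED_THRESHOLD_DEFAULT is assumed here to be 4): with 12 suspended processes of which 4 are already frozen, 12 - 4 = 8 exceeds the threshold of 4, so the function returns TRUE and the freezer keeps looking for candidates; once the suspended-but-not-yet-frozen count falls to the threshold, it returns FALSE.
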
+
+static boolean_t
+memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
+{
+       boolean_t can_freeze = TRUE;
+
+       /* Only freeze if we're sufficiently low on memory; this holds off freeze right
+        * after boot, and is generally a no-op once we've reached steady state. */
+       if (memorystatus_available_pages > memorystatus_freeze_threshold) {
+               return FALSE;
+       }
+
+       /* Check minimum suspended process threshold. */
+       if (!memorystatus_can_freeze_processes()) {
+               return FALSE;
+       }
+       assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
+
+       if (!VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+               /*
+                * In-core compressor used for freezing WITHOUT on-disk swap support.
+                */
+               if (vm_compressor_low_on_space()) {
+                       if (memorystatus_freeze_swap_low) {
+                               *memorystatus_freeze_swap_low = TRUE;
+                       }
+
+                       can_freeze = FALSE;
+               } else {
+                       if (memorystatus_freeze_swap_low) {
+                               *memorystatus_freeze_swap_low = FALSE;
+                       }
+
+                       can_freeze = TRUE;
+               }
+       } else {
+               /*
+                * Freezing WITH on-disk swap support.
+                *
+                * In-core compressor fronts the swap.
+                */
+               if (vm_swap_low_on_space()) {
+               if (memorystatus_freeze_swap_low) {
+                               *memorystatus_freeze_swap_low = TRUE;
+                       }
+
+                       can_freeze = FALSE;
+               }
+       }
+
+       return can_freeze;
+}
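
Condensed into one expression, the gate above amounts to the following sketch (same logic as the function, not verbatim source):

	/*
	 * can_freeze =  (available_pages <= freeze_threshold)
	 *            && (enough suspended-but-not-yet-frozen processes)
	 *            && (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE
	 *                    ? !vm_swap_low_on_space()
	 *                    : !vm_compressor_low_on_space());
	 */
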
+
+/*
+ * This function evaluates if the currently frozen processes deserve
+ * to stay in the higher jetsam band. There are 2 modes:
+ * - 'force_one == TRUE': (urgent mode)
+ *      We are out of budget and can't refreeze a process. Any state such
+ * a process brought in when it was resumed will stay in compressed memory.
+ * If we let it remain up in the higher frozen jetsam band, it'll put a lot
+ * of pressure on the lower bands. So we force-demote the
+ * least-recently-used-and-thawed process.
+ *
+ * - 'force_one == FALSE': (normal mode)
+ *      If the # of thaws of a process is below our threshold, then we
+ * will demote that process into the IDLE band.
+ * We don't immediately kill the process here because it already has
+ * state on disk and so it might be worth giving it another shot at
+ * getting thawed/resumed and used.
+ */
+static void
+memorystatus_demote_frozen_processes(boolean_t force_one)
+{
+       unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band;
+       unsigned int demoted_proc_count = 0;
+       proc_t p = PROC_NULL, next_p = PROC_NULL;
+       /* We demote to IDLE unless someone has asserted a higher priority on this process. */
+       int maxpriority = JETSAM_PRIORITY_IDLE;
+
+       proc_list_lock();
+
+       if (memorystatus_freeze_enabled == FALSE) {
+               /*
+                * Freeze has likely been disabled to
+                * reclaim swap space. So don't change
+                * any state on the frozen processes.
+                */
+               proc_list_unlock();
+               return;
+       }
+
+       next_p = memorystatus_get_first_proc_locked(&band, FALSE);
+       while (next_p) {
+               p = next_p;
+               next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
+
+               if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) {
+                       continue;
+               }
+
+               if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
+                       continue;
+               }
+
+               if (force_one == TRUE) {
+                       if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) {
+                               /*
+                                * This process hasn't been thawed recently and so most of
+                                * its state sits on NAND and so we skip it -- jetsamming it
+                                * won't help with memory pressure.
+                                */
+                               continue;
+                       }
+               } else {
+                       if (p->p_memstat_thaw_count >= memorystatus_thaw_count_demotion_threshold) {
+                               /*
+                                * This process has met or exceeded our thaw count demotion threshold
+                                * and so we let it live in the higher bands.
+                                */
+                               continue;
+                       }
+               }
+
+               p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
+               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+
+               maxpriority = MAX(p->p_memstat_assertionpriority, maxpriority);
+               memorystatus_update_priority_locked(p, maxpriority, FALSE, FALSE);
+#if DEVELOPMENT || DEBUG
+               os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus_demote_frozen_process(%s) pid %d [%s]",
+                   (force_one ? "urgent" : "normal"), (p ? p->p_pid : -1), ((p && *p->p_name) ? p->p_name : "unknown"));
+#endif /* DEVELOPMENT || DEBUG */
+
+               /*
+                * The freezer thread will consider this a normal app to be frozen
+                * because it is in the IDLE band. So we don't need the
+                * P_MEMSTAT_REFREEZE_ELIGIBLE state here. Also, if it gets resumed
+                * we'll correctly count it as eligible for re-freeze again.
+                *
+                * We don't drop the frozen count because this process still has
+                * state on disk. So there's a chance it gets resumed and then it
+                * should land in the higher jetsam band. For that it needs to
+                * remain marked frozen.
+                */
+               if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
+                       p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
+                       memorystatus_refreeze_eligible_count--;
+               }
+
+               demoted_proc_count++;
+
+               if ((force_one == TRUE) || (demoted_proc_count == memorystatus_max_frozen_demotions_daily)) {
+                       break;
+               }
+       }
+
+       if (force_one == FALSE) {
+               /*
+                * We use this counter to track daily thaws.
+                * So we only reset it to 0 under the normal
+                * mode.
+                */
+               memorystatus_thaw_count = 0;
+       }
+
+       proc_list_unlock();
+}
+
+
+/*
+ * This function will do 4 things:
+ *
+ * 1) check to see if we are currently in a degraded freezer mode, and if so:
+ *    - check to see if our window has expired and we should exit this mode, OR,
+ *    - return a budget based on the degraded throttle window's max pageouts vs. current pageouts.
+ *
+ * 2) check to see if we are in a NEW normal window and update the normal throttle window's params.
+ *
+ * 3) check what the current normal window allows for a budget.
+ *
+ * 4) calculate the current rate of pageouts for DEGRADED_WINDOW_MINS duration. If that rate is below
+ *    what we would normally expect, then we are running low on our daily budget and need to enter
+ *    degraded perf. mode.
+ */
+
+static void
+memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed)
+{
+       clock_sec_t sec;
+       clock_nsec_t nsec;
+       mach_timespec_t ts;
+
+       unsigned int freeze_daily_pageouts_max = 0;
+
+#if DEVELOPMENT || DEBUG
+       if (!memorystatus_freeze_throttle_enabled) {
+               /*
+                * No throttling...we can use the full budget every time.
+                */
+               *budget_pages_allowed = UINT64_MAX;
+               return;
+       }
+#endif
+
+       clock_get_system_nanotime(&sec, &nsec);
+       ts.tv_sec = sec;
+       ts.tv_nsec = nsec;
+
+       struct throttle_interval_t *interval = NULL;
+
+       if (memorystatus_freeze_degradation == TRUE) {
+               interval = degraded_throttle_window;
+
+               if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) {
+                       memorystatus_freeze_degradation = FALSE;
+                       interval->pageouts = 0;
+                       interval->max_pageouts = 0;
+               } else {
+                       *budget_pages_allowed = interval->max_pageouts - interval->pageouts;
+               }
+       }
+
+       interval = normal_throttle_window;
+
+       if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) {
+               /*
+                * New throttle window.
+                * Rollover any unused budget.
+                * Also ask the storage layer what the new budget needs to be.
+                */
+               uint64_t freeze_daily_budget = 0;
+               unsigned int daily_budget_pageouts = 0;
+
+               if (vm_swap_max_budget(&freeze_daily_budget)) {
+                       memorystatus_freeze_daily_mb_max = (freeze_daily_budget / (1024 * 1024));
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: memorystatus_freeze_daily_mb_max set to %dMB\n", memorystatus_freeze_daily_mb_max);
+               }
+
+               freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE);
+
+               daily_budget_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS));
+               interval->max_pageouts = (interval->max_pageouts - interval->pageouts) + daily_budget_pageouts;
+
+               interval->ts.tv_sec = interval->mins * 60;
+               interval->ts.tv_nsec = 0;
+               ADD_MACH_TIMESPEC(&interval->ts, &ts);
+               /* Since we update the throttle stats pre-freeze, adjust for overshoot here */
+               if (interval->pageouts > interval->max_pageouts) {
+                       interval->pageouts -= interval->max_pageouts;
+               } else {
+                       interval->pageouts = 0;
+               }
+               *budget_pages_allowed = interval->max_pageouts;
+
+               memorystatus_demote_frozen_processes(FALSE); /* normal mode...don't force a demotion */
+       } else {
+               /*
+                * Current throttle window.
+                * Deny freezing if we have no budget left.
+                * Try graceful degradation if the budget left is within 25% of
+                * the daily budget and is below what our normal pageout rate
+                * would have left us with at this point in the window.
+                */
+
+               freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE);
+
+#if DEVELOPMENT || DEBUG
+               /*
+                * This can only happen in the INTERNAL configs because we allow modifying the daily budget for testing.
+                */
+
+               if (freeze_daily_pageouts_max > interval->max_pageouts) {
+                       /*
+                        * We just bumped the daily budget. Re-evaluate our normal window params.
+                        */
+                       interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS));
+                       memorystatus_freeze_degradation = FALSE; //we'll re-evaluate this below...
+               }
+#endif /* DEVELOPMENT || DEBUG */
+
+               if (memorystatus_freeze_degradation == FALSE) {
+                       if (interval->pageouts >= interval->max_pageouts) {
+                               *budget_pages_allowed = 0;
+                       } else {
+                               int budget_left = interval->max_pageouts - interval->pageouts;
+                               int budget_threshold = (freeze_daily_pageouts_max * FREEZE_DEGRADATION_BUDGET_THRESHOLD) / 100;
+
+                               mach_timespec_t time_left = {0, 0};
+
+                               time_left.tv_sec = interval->ts.tv_sec;
+                               time_left.tv_nsec = 0;
+
+                               SUB_MACH_TIMESPEC(&time_left, &ts);
+
+                               if (budget_left <= budget_threshold) {
+                                       /*
+                                        * For the current normal window, calculate how much we would pageout in a DEGRADED_WINDOW_MINS duration.
+                                        * And also calculate what we would pageout for the same DEGRADED_WINDOW_MINS duration if we had the full
+                                        * daily pageout budget.
+                                        */
+
+                                       unsigned int current_budget_rate_allowed = ((budget_left / time_left.tv_sec) / 60) * DEGRADED_WINDOW_MINS;
+                                       unsigned int normal_budget_rate_allowed = (freeze_daily_pageouts_max / NORMAL_WINDOW_MINS) * DEGRADED_WINDOW_MINS;
+
+                                       /*
+                                        * The current rate of pageouts is below what we would expect for
+                                        * the normal rate i.e. we have below normal budget left and so...
+                                        */
+
+                                       if (current_budget_rate_allowed < normal_budget_rate_allowed) {
+                                               memorystatus_freeze_degradation = TRUE;
+                                               degraded_throttle_window->max_pageouts = current_budget_rate_allowed;
+                                               degraded_throttle_window->pageouts = 0;
+
+                                               /*
+                                                * Switch over to the degraded throttle window so the budget
+                                                * doled out is based on that window.
+                                                */
+                                               interval = degraded_throttle_window;
+                                       }
+                               }
+
+                               *budget_pages_allowed = interval->max_pageouts - interval->pageouts;
+                       }
+               }
+       }
+
+       MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
+           interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts.tv_sec) / 60,
+           interval->throttle ? "on" : "off");
+}
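
To make the budget arithmetic above concrete, here is a hypothetical walk-through of the new-window path; all numbers are assumptions for illustration (NORMAL_WINDOW_MINS taken as 24 * 60, burst_multiple of 1, a 1 GB daily budget reported by vm_swap_max_budget(), 16 KB pages, and no unused budget to roll over):

	memorystatus_freeze_daily_mb_max = 1 GB / 1 MB                  = 1024
	freeze_daily_pageouts_max        = 1024 * (1024 * 1024 / 16384) = 65536 pages
	daily_budget_pageouts            = 1 * ((1440 * 65536) / 1440)  = 65536 pages
	interval->max_pageouts           = (0 - 0) + 65536              = 65536 pages

So with nothing rolled over, the window grants 65536 pageouts, i.e. the full 1 GB.
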
+
+static void
+memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
+{
+       static boolean_t memorystatus_freeze_swap_low = FALSE;
+
+       lck_mtx_lock(&freezer_mutex);
+
+       if (memorystatus_freeze_enabled) {
+               if ((memorystatus_frozen_count < memorystatus_frozen_processes_max) ||
+                   (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD)) {
+                       if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
+                               /* Only freeze if we've not exceeded our pageout budgets.*/
+                               memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining);
+
+                               if (memorystatus_freeze_budget_pages_remaining) {
+                                       memorystatus_freeze_top_process();
+                               } else {
+                                       memorystatus_demote_frozen_processes(TRUE); /* urgent mode..force one demotion */
+                               }
+                       }
+               }
+       }
+
+       /*
+        * We use memorystatus_apps_idle_delay_time because if/when we adopt aging for applications,
+        * it'll tie neatly into running the freezer once we age an application.
+        *
+        * Till then, it serves as a good interval that can be tuned via a sysctl too.
+        */
+       memorystatus_freezer_thread_next_run_ts = mach_absolute_time() + memorystatus_apps_idle_delay_time;
+
+       assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
+       lck_mtx_unlock(&freezer_mutex);
+
+       thread_block((thread_continue_t) memorystatus_freeze_thread);
+}
+
+boolean_t
+memorystatus_freeze_thread_should_run(void)
+{
+       /*
+        * No freezer_mutex held here...see why near call-site
+        * within memorystatus_pages_update().
+        */
+
+       boolean_t should_run = FALSE;
+
+       if (memorystatus_freeze_enabled == FALSE) {
+               goto out;
+       }
+
+       if (memorystatus_available_pages > memorystatus_freeze_threshold) {
+               goto out;
+       }
+
+       if ((memorystatus_frozen_count >= memorystatus_frozen_processes_max) &&
+           (memorystatus_refreeze_eligible_count < MIN_THAW_REFREEZE_THRESHOLD)) {
+               goto out;
+       }
+
+       if (memorystatus_frozen_shared_mb_max && (memorystatus_frozen_shared_mb >= memorystatus_frozen_shared_mb_max)) {
+               goto out;
+       }
+
+       uint64_t curr_time = mach_absolute_time();
+
+       if (curr_time < memorystatus_freezer_thread_next_run_ts) {
+               goto out;
+       }
+
+       should_run = TRUE;
+
+out:
+       return should_run;
+}
+
+int
+memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable)
+{
+       proc_t p = PROC_NULL;
+
+       if (pid == 0) {
+               return EINVAL;
+       }
+
+       p = proc_find(pid);
+       if (!p) {
+               return ESRCH;
+       }
+
+       /*
+        * Only allow this on the current proc for now.
+        * We can check for privileges and allow targeting another process in the future.
+        */
+       if (p != current_proc()) {
+               proc_rele(p);
+               return EPERM;
+       }
+
+       proc_list_lock();
+       *is_freezable = ((p->p_memstat_state & P_MEMSTAT_FREEZE_DISABLED) ? 0 : 1);
+       proc_rele_locked(p);
+       proc_list_unlock();
+
+       return 0;
+}
+
+int
+memorystatus_set_process_is_freezable(pid_t pid, boolean_t is_freezable)
+{
+       proc_t p = PROC_NULL;
+
+       if (pid == 0) {
+               return EINVAL;
+       }
+
+       /*
+        * To enable freezable status, you need to be root or have an entitlement.
+        */
+       if (is_freezable &&
+           !kauth_cred_issuser(kauth_cred_get()) &&
+           !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
+               return EPERM;
+       }
+
+       p = proc_find(pid);
+       if (!p) {
+               return ESRCH;
+       }
+
+       /*
+        * A process can change its own status. A coalition leader can
+        * change the status of coalition members.
+        */
+       if (p != current_proc()) {
+               coalition_t coal = task_get_coalition(proc_task(p), COALITION_TYPE_JETSAM);
+               if (!coalition_is_leader(proc_task(current_proc()), coal)) {
+                       proc_rele(p);
+                       return EPERM;
+               }
+       }
+
+       proc_list_lock();
+       if (is_freezable == FALSE) {
+               /* Freeze preference set to FALSE. Set the P_MEMSTAT_FREEZE_DISABLED bit. */
+               p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED;
+               printf("memorystatus_set_process_is_freezable: disabling freeze for pid %d [%s]\n",
+                   p->p_pid, (*p->p_name ? p->p_name : "unknown"));
+       } else {
+               p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED;
+               printf("memorystatus_set_process_is_freezable: enabling freeze for pid %d [%s]\n",
+                   p->p_pid, (*p->p_name ? p->p_name : "unknown"));
+       }
+       proc_rele_locked(p);
+       proc_list_unlock();
+
+       return 0;
+}
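
Both entry points above are plausibly reached from user space through the private memorystatus_control() syscall; the sketch below assumes the MEMORYSTATUS_CMD_{GET,SET}_PROCESS_IS_FREEZABLE commands map onto them and that the GET result comes back through the syscall's return value (the dispatcher is not part of this diff):

	#include <stdint.h>
	#include <unistd.h>
	#include <sys/kern_memorystatus.h>   /* private header */

	extern int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags,
	    void *buffer, size_t buffersize);

	static void
	freezable_example(void)
	{
		/* Opt the current process out of freezing (flags carries is_freezable). */
		memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE,
		    getpid(), 0, NULL, 0);

		/* Read the preference back: 0 = not freezable, 1 = freezable. */
		int is_freezable = memorystatus_control(
		    MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0);
		(void)is_freezable;
	}
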
+
+static int
+sysctl_memorystatus_do_fastwake_warmup_all SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+
+       if (!req->newptr) {
+               return EINVAL;
+       }
+
+       /* Need to be root or have entitlement */
+       if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
+               return EPERM;
+       }
+
+       if (memorystatus_freeze_enabled == FALSE) {
+               return ENOTSUP;
+       }
+
+       do_fastwake_warmup_all();
+
+       return 0;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, memorystatus_do_fastwake_warmup_all, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+    0, 0, &sysctl_memorystatus_do_fastwake_warmup_all, "I", "");
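
For reference, a root or entitled process could poke this write-only sysctl along these lines; a minimal sketch using the standard sysctlbyname() interface:

	#include <stdio.h>
	#include <sys/sysctl.h>

	static void
	trigger_fastwake_warmup(void)
	{
		int one = 1;

		/* Write-only: kicks do_fastwake_warmup_all() when freeze is enabled. */
		if (sysctlbyname("kern.memorystatus_do_fastwake_warmup_all",
		    NULL, NULL, &one, sizeof(one)) != 0) {
			perror("kern.memorystatus_do_fastwake_warmup_all");
		}
	}
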
+
+#endif /* CONFIG_FREEZE */
diff --git a/bsd/kern/kern_memorystatus_notify.c b/bsd/kern/kern_memorystatus_notify.c
new file mode 100644 (file)
index 0000000..c5be3d0
--- /dev/null
@@ -0,0 +1,1585 @@
+/*
+ * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ *
+ */
+
+#include <sys/kern_event.h>
+#include <kern/sched_prim.h>
+#include <kern/kalloc.h>
+#include <kern/assert.h>
+#include <kern/debug.h>
+#include <kern/locks.h>
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <kern/host.h>
+#include <kern/policy_internal.h>
+#include <kern/thread_group.h>
+
+#include <IOKit/IOBSD.h>
+
+#include <libkern/libkern.h>
+#include <mach/coalition.h>
+#include <mach/mach_time.h>
+#include <mach/task.h>
+#include <mach/host_priv.h>
+#include <mach/mach_host.h>
+#include <os/log.h>
+#include <pexpert/pexpert.h>
+#include <sys/coalition.h>
+#include <sys/kern_event.h>
+#include <sys/proc.h>
+#include <sys/proc_info.h>
+#include <sys/reason.h>
+#include <sys/signal.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/tree.h>
+#include <sys/priv.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_protos.h>
+#include <mach/machine/sdt.h>
+#include <libkern/section_keywords.h>
+#include <stdatomic.h>
+
+#if CONFIG_FREEZE
+#include <vm/vm_map.h>
+#endif /* CONFIG_FREEZE */
+
+#include <sys/kern_memorystatus.h>
+#include <sys/kern_memorystatus_notify.h>
+
+/*
+ * Memorystatus klist structures
+ */
+struct klist memorystatus_klist;
+static lck_mtx_t memorystatus_klist_mutex;
+static void memorystatus_klist_lock(void);
+static void memorystatus_klist_unlock(void);
+
+/*
+ * Memorystatus kevent filter routines
+ */
+static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
+static void filt_memorystatusdetach(struct knote *kn);
+static int filt_memorystatus(struct knote *kn, long hint);
+static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
+static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);
+
+SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
+       .f_attach = filt_memorystatusattach,
+       .f_detach = filt_memorystatusdetach,
+       .f_event = filt_memorystatus,
+       .f_touch = filt_memorystatustouch,
+       .f_process = filt_memorystatusprocess,
+};
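
These filter routines implement the private EVFILT_MEMORYSTATUS kqueue filter. A minimal user-space sketch of a listener, assuming the NOTE_MEMORYSTATUS_* flags from the private <sys/kern_memorystatus.h> header (the flag choice and error handling are illustrative):

	#include <stdio.h>
	#include <sys/event.h>
	#include <sys/kern_memorystatus.h>   /* private: NOTE_MEMORYSTATUS_* */

	static void
	watch_memory_pressure(void)
	{
		struct kevent reg, ev;
		int kq = kqueue();

		if (kq < 0) {
			perror("kqueue");
			return;
		}

		/* Registration lands in filt_memorystatusattach(), defined later in this file. */
		EV_SET(&reg, 0, EVFILT_MEMORYSTATUS, EV_ADD | EV_ENABLE,
		    NOTE_MEMORYSTATUS_PRESSURE_NORMAL |
		    NOTE_MEMORYSTATUS_PRESSURE_WARN |
		    NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, 0, NULL);
		if (kevent(kq, &reg, 1, NULL, 0, NULL) < 0) {
			perror("kevent(register)");
			return;
		}

		/* Deliveries are drained through filt_memorystatusprocess(). */
		while (kevent(kq, NULL, 0, &ev, 1, NULL) == 1) {
			printf("memorystatus event: fflags=0x%x\n", ev.fflags);
		}
	}
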
+
+/*
+ * Memorystatus notification events
+ */
+enum {
+       kMemorystatusNoPressure = 0x1,
+       kMemorystatusPressure = 0x2,
+       kMemorystatusLowSwap = 0x4,
+       kMemorystatusProcLimitWarn = 0x8,
+       kMemorystatusProcLimitCritical = 0x10
+};
+
+#define INTER_NOTIFICATION_DELAY    (250000)    /* .25 second */
+#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD        5000    /* milliseconds */
+#define WARNING_NOTIFICATION_RESTING_PERIOD        25    /* seconds */
+#define CRITICAL_NOTIFICATION_RESTING_PERIOD        25    /* seconds */
+
+/*
+ * Memorystatus notification helper routines
+ */
+static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
+static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
+static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
+static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process);
+static void vm_dispatch_memory_pressure(void);
+kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
+
+#if VM_PRESSURE_EVENTS
+
+/*
+ * This value is the threshold that a process must meet to be considered for scavenging.
+ */
+#if CONFIG_EMBEDDED
+#define VM_PRESSURE_MINIMUM_RSIZE        6    /* MB */
+#else /* CONFIG_EMBEDDED */
+#define VM_PRESSURE_MINIMUM_RSIZE        10    /* MB */
+#endif /* CONFIG_EMBEDDED */
+
+static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;
+
+#if DEVELOPMENT || DEBUG
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
+#endif /* DEVELOPMENT || DEBUG */
+
+vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;
+
+/*
+ * We use this flag to signal if we have any HWM offenders
+ * on the system. This way we can reduce the number of wakeups
+ * of the memorystatus_thread when the system is between the
+ * "pressure" and "critical" threshold.
+ *
+ * The (re-)setting of this variable is done without any locks
+ * or synchronization simply because it is not possible (currently)
+ * to keep track of HWM offenders that drop down below their memory
+ * limit and/or exit. So, we choose to burn a couple of wasted wakeups
+ * by allowing the unguarded modification of this variable.
+ */
+boolean_t memorystatus_hwm_candidates = 0;
+
+#endif /* VM_PRESSURE_EVENTS */
+
+#if CONFIG_JETSAM
+
+extern unsigned int memorystatus_available_pages;
+extern unsigned int memorystatus_available_pages_pressure;
+extern unsigned int memorystatus_available_pages_critical;
+extern unsigned int memorystatus_available_pages_critical_base;
+extern unsigned int memorystatus_available_pages_critical_idle_offset;
+
+#else /* CONFIG_JETSAM */
+
+extern uint64_t memorystatus_available_pages;
+extern uint64_t memorystatus_available_pages_pressure;
+extern uint64_t memorystatus_available_pages_critical;
+
+#endif /* CONFIG_JETSAM */
+
+extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
+uint32_t memorystatus_jetsam_fg_band_waiters = 0;
+static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
+static uint64_t memorystatus_jetsam_fg_band_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */
+
+extern boolean_t(*volatile consider_buffer_cache_collect)(int);
+
+#if DEVELOPMENT || DEBUG
+SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_fg_band_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &memorystatus_jetsam_fg_band_delay_ns, "");
+#endif
+
+static int
+filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
+{
+       int error;
+
+       kn->kn_flags |= EV_CLEAR; /* automatically set */
+       kn->kn_sdata = 0;         /* incoming data is ignored */
+
+       error = memorystatus_knote_register(kn);
+       if (error) {
+               knote_set_error(kn, error);
+       }
+       return 0;
+}
+
+static void
+filt_memorystatusdetach(struct knote *kn)
+{
+       memorystatus_knote_unregister(kn);
+}
+
+static int
+filt_memorystatus(struct knote *kn, long hint)
+{
+       if (hint) {
+               switch (hint) {
+               case kMemorystatusNoPressure:
+                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
+                               kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
+                       }
+                       break;
+               case kMemorystatusPressure:
+                       if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
+                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
+                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
+                               }
+                       } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
+                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
+                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
+                               }
+                       }
+                       break;
+               case kMemorystatusLowSwap:
+                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
+                               kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
+                       }
+                       break;
+
+               case kMemorystatusProcLimitWarn:
+                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
+                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
+                       }
+                       break;
+
+               case kMemorystatusProcLimitCritical:
+                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
+                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
+                       }
+                       break;
+
+               default:
+                       break;
+               }
+       }
+
+#if 0
+       if (kn->kn_fflags != 0) {
+               proc_t knote_proc = knote_get_kq(kn)->kq_p;
+               pid_t knote_pid = knote_proc->p_pid;
+
+               printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
+                   (unsigned long)kn, kn->kn_fflags, knote_pid);
+       }
+#endif
+
+       return kn->kn_fflags != 0;
+}
+
+static int
+filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
+{
+       int res;
+       int prev_kn_sfflags = 0;
+
+       memorystatus_klist_lock();
+
+       /*
+        * copy in new kevent settings
+        * (saving the "desired" data and fflags).
+        */
+
+       prev_kn_sfflags = kn->kn_sfflags;
+       kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
+
+#if !CONFIG_EMBEDDED
+       /*
+        * Only on desktop do we restrict notifications to
+        * one per active/inactive state (soft limits only).
+        */
+       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
+               /*
+                * Is there previous state to preserve?
+                */
+               if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
+                       /*
+                        * This knote was previously interested in proc_limit_warn,
+                        * so yes, preserve previous state.
+                        */
+                       if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
+                               kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
+                       }
+                       if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
+                               kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
+                       }
+               } else {
+                       /*
+                        * This knote was not previously interested in proc_limit_warn,
+                        * but it is now.  Set both states.
+                        */
+                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
+                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
+               }
+       }
+
+       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
+               /*
+                * Is there previous state to preserve?
+                */
+               if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
+                       /*
+                        * This knote was previously interested in proc_limit_critical,
+                        * so yes, preserve previous state.
+                        */
+                       if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
+                               kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
+                       }
+                       if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
+                               kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
+                       }
+               } else {
+                       /*
+                        * This knote was not previously interested in proc_limit_critical,
+                        * but it is now.  Set both states.
+                        */
+                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
+                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
+               }
+       }
+#endif /* !CONFIG_EMBEDDED */
+
+       /*
+        * reset the output flags based on a
+        * combination of the old events and
+        * the new desired event list.
+        */
+       //kn->kn_fflags &= kn->kn_sfflags;
+
+       res = (kn->kn_fflags != 0);
+
+       memorystatus_klist_unlock();
+
+       return res;
+}
+
+static int
+filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
+{
+       int res = 0;
+
+       memorystatus_klist_lock();
+       if (kn->kn_fflags) {
+               knote_fill_kevent(kn, kev, 0);
+               res = 1;
+       }
+       memorystatus_klist_unlock();
+
+       return res;
+}
+
+static void
+memorystatus_klist_lock(void)
+{
+       lck_mtx_lock(&memorystatus_klist_mutex);
+}
+
+static void
+memorystatus_klist_unlock(void)
+{
+       lck_mtx_unlock(&memorystatus_klist_mutex);
+}
+
+void
+memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
+{
+       lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
+       klist_init(&memorystatus_klist);
+}
+
+int
+memorystatus_knote_register(struct knote *kn)
+{
+       int error = 0;
+
+       memorystatus_klist_lock();
+
+       /*
+        * Support only userspace visible flags.
+        */
+       if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
+#if !CONFIG_EMBEDDED
+               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
+                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
+                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
+               }
+
+               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
+                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
+                       kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
+               }
+#endif /* !CONFIG_EMBEDDED */
+
+               KNOTE_ATTACH(&memorystatus_klist, kn);
+       } else {
+               error = ENOTSUP;
+       }
+
+       memorystatus_klist_unlock();
+
+       return error;
+}
+
+void
+memorystatus_knote_unregister(struct knote *kn __unused)
+{
+       memorystatus_klist_lock();
+       KNOTE_DETACH(&memorystatus_klist, kn);
+       memorystatus_klist_unlock();
+}
+
+#if VM_PRESSURE_EVENTS
+
+#if CONFIG_MEMORYSTATUS
+
+int
+memorystatus_send_note(int event_code, void *data, size_t data_length)
+{
+       int ret;
+       struct kev_msg ev_msg;
+
+       ev_msg.vendor_code    = KEV_VENDOR_APPLE;
+       ev_msg.kev_class      = KEV_SYSTEM_CLASS;
+       ev_msg.kev_subclass   = KEV_MEMORYSTATUS_SUBCLASS;
+
+       ev_msg.event_code     = event_code;
+
+       ev_msg.dv[0].data_length = data_length;
+       ev_msg.dv[0].data_ptr = data;
+       ev_msg.dv[1].data_length = 0;
+
+       ret = kev_post_msg(&ev_msg);
+       if (ret) {
+               printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
+       }
+
+       return ret;
+}
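
The note posted above goes out on the kernel event socket. A sketch of the receiving side, assuming the standard PF_SYSTEM/SYSPROTO_EVENT socket plus the SIOCSKEVFILT filter ioctl from <sys/kern_event.h>:

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <sys/kern_event.h>
	#include <sys/kern_memorystatus.h>   /* private: KEV_MEMORYSTATUS_SUBCLASS */

	static void
	listen_for_memorystatus_notes(void)
	{
		struct kev_request req;
		char buf[1024];

		int s = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
		if (s < 0) {
			perror("socket");
			return;
		}

		/* Match only KEV_SYSTEM_CLASS / KEV_MEMORYSTATUS_SUBCLASS events. */
		memset(&req, 0, sizeof(req));
		req.vendor_code = KEV_VENDOR_APPLE;
		req.kev_class = KEV_SYSTEM_CLASS;
		req.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;
		ioctl(s, SIOCSKEVFILT, &req);

		while (recv(s, buf, sizeof(buf), 0) > 0) {
			struct kern_event_msg *msg = (struct kern_event_msg *)buf;
			printf("memorystatus event_code %u\n", msg->event_code);
		}
	}
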
+
+boolean_t
+memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
+{
+       boolean_t ret = FALSE;
+       boolean_t found_knote = FALSE;
+       struct knote *kn = NULL;
+       int send_knote_count = 0;
+
+       /*
+        * See comment in sysctl_memorystatus_vm_pressure_send.
+        */
+
+       memorystatus_klist_lock();
+
+       SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
+               proc_t knote_proc = knote_get_kq(kn)->kq_p;
+               pid_t knote_pid = knote_proc->p_pid;
+
+               if (knote_pid == pid) {
+                       /*
+                        * By setting the "fflags" here, we are forcing
+                        * a process to deal with the case where it's
+                        * bumping up into its memory limits. If we don't
+                        * do this here, we will end up depending on the
+                        * system pressure snapshot evaluation in
+                        * filt_memorystatus().
+                        */
+
+#if CONFIG_EMBEDDED
+                       if (!limit_exceeded) {
+                               /*
+                                * Intentionally set either the unambiguous limit warning,
+                                * the system-wide critical or the system-wide warning
+                                * notification bit.
+                                */
+
+                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
+                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
+                                       found_knote = TRUE;
+                                       send_knote_count++;
+                               } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
+                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
+                                       found_knote = TRUE;
+                                       send_knote_count++;
+                               } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
+                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
+                                       found_knote = TRUE;
+                                       send_knote_count++;
+                               }
+                       } else {
+                               /*
+                                * Send this notification when a process has exceeded a soft limit.
+                                */
+                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
+                                       kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
+                                       found_knote = TRUE;
+                                       send_knote_count++;
+                               }
+                       }
+#else /* CONFIG_EMBEDDED */
+                       if (!limit_exceeded) {
+                               /*
+                                * Processes on desktop are not expecting to handle a system-wide
+                                * critical or system-wide warning notification from this path.
+                                * Intentionally set only the unambiguous limit warning here.
+                                *
+                                * If the limit is soft, however, limit this to one notification per
+                                * active/inactive limit (per registered listener).
+                                */
+
+                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
+                                       found_knote = TRUE;
+                                       if (!is_fatal) {
+                                               /*
+                                                * Restrict proc_limit_warn notifications when
+                                                * non-fatal (soft) limit is at play.
+                                                */
+                                               if (is_active) {
+                                                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
+                                                               /*
+                                                                * Mark this knote for delivery.
+                                                                */
+                                                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
+                                                               /*
+                                                                * And suppress it from future notifications.
+                                                                */
+                                                               kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
+                                                               send_knote_count++;
+                                                       }
+                                               } else {
+                                                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
+                                                               /*
+                                                                * Mark this knote for delivery.
+                                                                */
+                                                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
+                                                               /*
+                                                                * And suppress it from future notifications.
+                                                                */
+                                                               kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
+                                                               send_knote_count++;
+                                                       }
+                                               }
+                                       } else {
+                                               /*
+                                                * No restriction on proc_limit_warn notifications when
+                                                * fatal (hard) limit is at play.
+                                                */
+                                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
+                                               send_knote_count++;
+                                       }
+                               }
+                       } else {
+                               /*
+                                * Send this notification when a process has exceeded a soft limit.
+                                */
+
+                               if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
+                                       found_knote = TRUE;
+                                       if (!is_fatal) {
+                                               /*
+                                                * Restrict critical notifications for soft limits.
+                                                */
+
+                                               if (is_active) {
+                                                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
+                                                               /*
+                                                                * Suppress future proc_limit_critical notifications
+                                                                * for the active soft limit.
+                                                                */
+                                                               kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
+                                                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
+                                                               send_knote_count++;
+                                                       }
+                                               } else {
+                                                       if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
+                                                               /*
+                                                                * Suppress future proc_limit_critical notifications
+                                                                * for the inactive soft limit.
+                                                                */
+                                                               kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
+                                                               kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
+                                                               send_knote_count++;
+                                                       }
+                                               }
+                                       } else {
+                                               /*
+                                                * We should never be trying to send a critical notification for
+                                                * a hard limit... the process would be killed before it could be
+                                                * received.
+                                                */
+                                               panic("Caught sending pid %d a critical warning for a fatal limit.\n", pid);
+                                       }
+                               }
+                       }
+#endif /* CONFIG_EMBEDDED */
+               }
+       }
+
+       if (found_knote) {
+               if (send_knote_count > 0) {
+                       KNOTE(&memorystatus_klist, 0);
+               }
+               ret = TRUE;
+       }
+
+       memorystatus_klist_unlock();
+
+       return ret;
+}
+
+/*
+ * Can only be set by the current task on itself.
+ */
+int
+memorystatus_low_mem_privileged_listener(uint32_t op_flags)
+{
+       boolean_t set_privilege = FALSE;
+       /*
+        * Need an entitlement check here?
+        */
+       if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
+               set_privilege = TRUE;
+       } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
+               set_privilege = FALSE;
+       } else {
+               return EINVAL;
+       }
+
+       return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
+}
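
Assuming the memorystatus_control() dispatcher (not part of this diff) passes its command straight through as op_flags here, a caller would opt itself in roughly like this, reusing the memorystatus_control() declaration from the freezable sketch earlier:

	/* Mark the calling task as a privileged low-memory listener (sketch). */
	memorystatus_control(MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE,
	    getpid(), 0, NULL, 0);
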
+
+int
+memorystatus_send_pressure_note(pid_t pid)
+{
+       MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
+       return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
+}
+
+boolean_t
+memorystatus_is_foreground_locked(proc_t p)
+{
+       return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
+              (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
+}
+
+/*
+ * This is meant for stackshot and kperf -- it does not take the proc_list_lock
+ * to access the p_memstat_dirty field.
+ */
+void
+memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
+{
+       if (!v) {
+               *is_dirty = FALSE;
+               *is_dirty_tracked = FALSE;
+               *allow_idle_exit = FALSE;
+       } else {
+               proc_t p = (proc_t)v;
+               *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
+               *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
+               *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
+       }
+}
+
+boolean_t
+memorystatus_bg_pressure_eligible(proc_t p)
+{
+       boolean_t eligible = FALSE;
+
+       proc_list_lock();
+
+       MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);
+
+       /* Foreground processes have already been dealt with at this point, so just test for eligibility */
+       if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
+               eligible = TRUE;
+       }
+
+       if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
+               /*
+                * IDLE and IDLE_DEFERRED bands contain processes
+                * that have dropped memory to be under their inactive
+                * memory limits, and so they can't really give back
+                * anything.
+                */
+               eligible = FALSE;
+       }
+
+       proc_list_unlock();
+
+       return eligible;
+}
+
+void
+memorystatus_send_low_swap_note(void)
+{
+       struct knote *kn = NULL;
+
+       memorystatus_klist_lock();
+       SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
+               /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
+                * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
+                * that has the NOTE_MEMORYSTATUS_LOW_SWAP flag set in its sfflags, we call KNOTE with
+                * kMemorystatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
+               if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
+                       KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
+                       break;
+               }
+       }
+
+       memorystatus_klist_unlock();
+}
+
+#endif /* CONFIG_MEMORYSTATUS */
+
+/*
+ * kn_max - knote
+ *
+ * knote_pressure_level - to check if the knote is registered for this notification level.
+ *
+ * task    - task whose bits we'll be modifying
+ *
+ * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
+ *
+ * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
+ *
+ */
+
+static boolean_t
+is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
+{
+       if (kn_max->kn_sfflags & knote_pressure_level) {
+               if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
+                       task_clear_has_been_notified(task, pressure_level_to_clear);
+               }
+
+               task_mark_has_been_notified(task, pressure_level_to_set);
+               return TRUE;
+       }
+
+       return FALSE;
+}
+
+static void
+memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
+{
+       struct knote *kn = NULL;
+
+       memorystatus_klist_lock();
+       SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
+               proc_t            p = PROC_NULL;
+               struct task*        t = TASK_NULL;
+
+               p = knote_get_kq(kn)->kq_p;
+               proc_list_lock();
+               if (p != proc_ref_locked(p)) {
+                       p = PROC_NULL;
+                       proc_list_unlock();
+                       continue;
+               }
+               proc_list_unlock();
+
+               t = (struct task *)(p->task);
+
+               task_clear_has_been_notified(t, pressure_level_to_clear);
+
+               proc_rele(p);
+       }
+
+       memorystatus_klist_unlock();
+}
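Both this function and the klist walks below use the same reference-or-skip idiom around proc_ref_locked(). A sketch isolating it, with knote_take_proc_ref as a hypothetical name rather than a kernel symbol:

static proc_t
knote_take_proc_ref(struct knote *kn)
{
        proc_t p = knote_get_kq(kn)->kq_p;

        proc_list_lock();
        if (p != proc_ref_locked(p)) {
                /* The proc is exiting; don't touch it. */
                p = PROC_NULL;
        }
        proc_list_unlock();

        /* Caller must proc_rele(p) when p != PROC_NULL. */
        return p;
}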
+
+/*
+ * Used by the vm_pressure_thread which is
+ * signalled from within vm_pageout_scan().
+ */
+
+void
+consider_vm_pressure_events(void)
+{
+       vm_dispatch_memory_pressure();
+}
+
+static void
+vm_dispatch_memory_pressure(void)
+{
+       memorystatus_update_vm_pressure(FALSE);
+}
+
+static struct knote *
+vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process)
+{
+       struct knote    *kn = NULL, *kn_max = NULL;
+       uint64_t    resident_max = 0; /* MB */
+       struct timeval    curr_tstamp = {0, 0};
+       int        elapsed_msecs = 0;
+       int        selected_task_importance = 0;
+       static int    pressure_snapshot = -1;
+       boolean_t    pressure_increase = FALSE;
+
+       if (pressure_snapshot == -1) {
+               /*
+                * Initial snapshot.
+                */
+               pressure_snapshot = level;
+               pressure_increase = TRUE;
+       } else {
+               if (level && (level >= pressure_snapshot)) {
+                       pressure_increase = TRUE;
+               } else {
+                       pressure_increase = FALSE;
+               }
+
+               pressure_snapshot = level;
+       }
+
+       if (pressure_increase == TRUE) {
+               /*
+                * We'll start by considering the largest
+                * unimportant task in our list.
+                */
+               selected_task_importance = INT_MAX;
+       } else {
+               /*
+                * We'll start by considering the largest
+                * important task in our list.
+                */
+               selected_task_importance = 0;
+       }
+
+       microuptime(&curr_tstamp);
+
+       SLIST_FOREACH(kn, candidate_list, kn_selnext) {
+               uint64_t        resident_size = 0; /* MB */
+               proc_t            p = PROC_NULL;
+               struct task*        t = TASK_NULL;
+               int            curr_task_importance = 0;
+               boolean_t        consider_knote = FALSE;
+               boolean_t        privileged_listener = FALSE;
+
+               p = knote_get_kq(kn)->kq_p;
+               proc_list_lock();
+               if (p != proc_ref_locked(p)) {
+                       p = PROC_NULL;
+                       proc_list_unlock();
+                       continue;
+               }
+               proc_list_unlock();
+
+#if CONFIG_MEMORYSTATUS
+               if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
+                       /*
+                        * Skip processes not marked foreground.
+                        */
+                       proc_rele(p);
+                       continue;
+               }
+#endif /* CONFIG_MEMORYSTATUS */
+
+               t = (struct task *)(p->task);
+
+               timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp);
+               elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
+
+               vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);
+
+               if ((kn->kn_sfflags & dispatch_level) == 0) {
+                       proc_rele(p);
+                       continue;
+               }
+
+#if CONFIG_MEMORYSTATUS
+               if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
+                       VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid);
+                       proc_rele(p);
+                       continue;
+               }
+#endif /* CONFIG_MEMORYSTATUS */
+
+#if CONFIG_EMBEDDED
+               curr_task_importance = p->p_memstat_effectivepriority;
+#else /* CONFIG_EMBEDDED */
+               curr_task_importance = task_importance_estimate(t);
+#endif /* CONFIG_EMBEDDED */
+
+               /*
+                * Privileged listeners are only considered in the multi-level pressure scheme
+                * AND only if the pressure is increasing.
+                */
+               if (level > 0) {
+                       if (task_has_been_notified(t, level) == FALSE) {
+                               /*
+                                * Is this a privileged listener?
+                                */
+                               if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
+                                       if (privileged_listener) {
+                                               kn_max = kn;
+                                               proc_rele(p);
+                                               goto done_scanning;
+                                       }
+                               }
+                       } else {
+                               proc_rele(p);
+                               continue;
+                       }
+               } else if (level == 0) {
+                       /*
+                        * The task wasn't notified when the pressure was increasing, so
+                        * there's no need to notify it that the pressure is decreasing.
+                        */
+                       if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
+                               proc_rele(p);
+                               continue;
+                       }
+               }
+
+               /*
+                * We don't want a small process to block large processes from
+                * being notified again. <rdar://problem/7955532>
+                */
+               resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */
+
+               if (resident_size >= vm_pressure_task_footprint_min) {
+                       if (level > 0) {
+                               /*
+                                * Warning or Critical Pressure.
+                                */
+                               if (pressure_increase) {
+                                       if ((curr_task_importance < selected_task_importance) ||
+                                           ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
+                                               /*
+                                                * We have found a candidate process which is:
+                                                * a) at a lower importance than the current selected process
+                                                * OR
+                                                * b) has importance equal to that of the current selected process but is larger
+                                                */
+
+                                               consider_knote = TRUE;
+                                       }
+                               } else {
+                                       if ((curr_task_importance > selected_task_importance) ||
+                                           ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
+                                               /*
+                                                * We have found a candidate process which is:
+                                                * a) at a higher importance than the current selected process
+                                                * OR
+                                                * b) has importance equal to that of the current selected process but is larger
+                                                */
+
+                                               consider_knote = TRUE;
+                                       }
+                               }
+                       } else if (level == 0) {
+                               /*
+                                * Pressure back to normal.
+                                */
+                               if ((curr_task_importance > selected_task_importance) ||
+                                   ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
+                                       consider_knote = TRUE;
+                               }
+                       }
+
+                       if (consider_knote) {
+                               resident_max = resident_size;
+                               kn_max = kn;
+                               selected_task_importance = curr_task_importance;
+                               consider_knote = FALSE; /* reset for the next candidate */
+                       }
+               } else {
+                       /* There was no candidate with enough resident memory to scavenge */
+                       VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", p->p_pid, resident_size);
+               }
+               proc_rele(p);
+       }
+
+done_scanning:
+       if (kn_max) {
+               VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, knote_get_kq(kn_max)->kq_p->p_pid, resident_max, 0, 0);
+               VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", knote_get_kq(kn_max)->kq_p->p_pid, resident_max);
+       }
+
+       return kn_max;
+}
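The candidate ordering above reduces to one comparison rule: on rising pressure prefer lower importance, on falling (or normal) pressure prefer higher importance, and break ties by larger resident size. Restated as a standalone predicate, for illustration only:

static boolean_t
candidate_is_better(int curr_importance, uint64_t curr_resident_mb,
    int best_importance, uint64_t best_resident_mb, boolean_t pressure_increase)
{
        if (curr_importance == best_importance) {
                /* Equal importance: the larger process wins. */
                return curr_resident_mb > best_resident_mb;
        }
        return pressure_increase ? (curr_importance < best_importance)
                                 : (curr_importance > best_importance);
}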
+
+static uint64_t next_warning_notification_sent_at_ts = 0;
+static uint64_t next_critical_notification_sent_at_ts = 0;
+
+boolean_t        memorystatus_manual_testing_on = FALSE;
+vm_pressure_level_t    memorystatus_manual_testing_level = kVMPressureNormal;
+
+kern_return_t
+memorystatus_update_vm_pressure(boolean_t target_foreground_process)
+{
+       struct knote            *kn_max = NULL;
+       struct knote            *kn_cur = NULL, *kn_temp = NULL; /* for safe list traversal */
+       pid_t                target_pid = -1;
+       struct klist            dispatch_klist = { NULL };
+       proc_t                target_proc = PROC_NULL;
+       struct task            *task = NULL;
+       boolean_t            found_candidate = FALSE;
+
+       static vm_pressure_level_t     level_snapshot = kVMPressureNormal;
+       static vm_pressure_level_t    prev_level_snapshot = kVMPressureNormal;
+       boolean_t            smoothing_window_started = FALSE;
+       struct timeval            smoothing_window_start_tstamp = {0, 0};
+       struct timeval            curr_tstamp = {0, 0};
+       int                elapsed_msecs = 0;
+       uint64_t             curr_ts = mach_absolute_time();
+
+#if !CONFIG_JETSAM
+#define MAX_IDLE_KILLS 100    /* limit the number of idle kills allowed */
+
+       int    idle_kill_counter = 0;
+
+       /*
+        * On desktop we take this opportunity to relieve memory pressure
+        * by immediately killing idle exitable processes. We use a delay
+        * to avoid overkill, and we impose a max counter as a failsafe
+        * in case daemons re-launch too fast.
+        */
+       while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
+               if (memorystatus_idle_exit_from_VM() == FALSE) {
+                       /* No idle exitable processes left to kill */
+                       break;
+               }
+               idle_kill_counter++;
+
+               if (memorystatus_manual_testing_on == TRUE) {
+                       /*
+                        * Skip the delay when testing
+                        * the pressure notification scheme.
+                        */
+               } else {
+                       delay(1000000); /* 1 second */
+               }
+       }
+#endif /* !CONFIG_JETSAM */
+
+       if (level_snapshot != kVMPressureNormal) {
+               /*
+                * Check to see if we are still in the 'resting' period
+                * after having notified all clients interested in
+                * a particular pressure level.
+                */
+
+               level_snapshot = memorystatus_vm_pressure_level;
+
+               if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
+                       if (next_warning_notification_sent_at_ts) {
+                               if (curr_ts < next_warning_notification_sent_at_ts) {
+                                       delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
+                                       return KERN_SUCCESS;
+                               }
+
+                               next_warning_notification_sent_at_ts = 0;
+                               memorystatus_klist_reset_all_for_level(kVMPressureWarning);
+                       }
+               } else if (level_snapshot == kVMPressureCritical) {
+                       if (next_critical_notification_sent_at_ts) {
+                               if (curr_ts < next_critical_notification_sent_at_ts) {
+                                       delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
+                                       return KERN_SUCCESS;
+                               }
+                               next_critical_notification_sent_at_ts = 0;
+                               memorystatus_klist_reset_all_for_level(kVMPressureCritical);
+                       }
+               }
+       }
+
+       while (1) {
+               /*
+                * There is a race window here. But it's not clear
+                * how much we benefit from having extra synchronization.
+                */
+               level_snapshot = memorystatus_vm_pressure_level;
+
+               if (prev_level_snapshot > level_snapshot) {
+                       /*
+                        * Pressure decreased? Let's take a little breather
+                        * and see if this condition stays.
+                        */
+                       if (smoothing_window_started == FALSE) {
+                               smoothing_window_started = TRUE;
+                               microuptime(&smoothing_window_start_tstamp);
+                       }
+
+                       microuptime(&curr_tstamp);
+                       timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
+                       elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
+
+                       if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
+                               delay(INTER_NOTIFICATION_DELAY);
+                               continue;
+                       }
+               }
+
+               prev_level_snapshot = level_snapshot;
+               smoothing_window_started = FALSE;
+
+               memorystatus_klist_lock();
+               kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);
+
+               if (kn_max == NULL) {
+                       memorystatus_klist_unlock();
+
+                       /*
+                        * No more level-based clients to notify.
+                        *
+                        * Start the 'resting' window within which clients will not be re-notified.
+                        */
+
+                       if (level_snapshot != kVMPressureNormal) {
+                               if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
+                                       nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
+
+                                       /* Next warning notification (if nothing changes) won't be sent before...*/
+                                       next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
+                               }
+
+                               if (level_snapshot == kVMPressureCritical) {
+                                       nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
+
+                                       /* Next critical notification (if nothing changes) won't be sent before...*/
+                                       next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
+                               }
+                       }
+                       return KERN_FAILURE;
+               }
+
+               target_proc = knote_get_kq(kn_max)->kq_p;
+
+               proc_list_lock();
+               if (target_proc != proc_ref_locked(target_proc)) {
+                       target_proc = PROC_NULL;
+                       proc_list_unlock();
+                       memorystatus_klist_unlock();
+                       continue;
+               }
+               proc_list_unlock();
+
+               target_pid = target_proc->p_pid;
+
+               task = (struct task *)(target_proc->task);
+
+               if (level_snapshot != kVMPressureNormal) {
+                       if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
+                               if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
+                                       found_candidate = TRUE;
+                               }
+                       } else {
+                               if (level_snapshot == kVMPressureCritical) {
+                                       if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
+                                               found_candidate = TRUE;
+                                       }
+                               }
+                       }
+               } else {
+                       if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
+                               task_clear_has_been_notified(task, kVMPressureWarning);
+                               task_clear_has_been_notified(task, kVMPressureCritical);
+
+                               found_candidate = TRUE;
+                       }
+               }
+
+               if (found_candidate == FALSE) {
+                       proc_rele(target_proc);
+                       memorystatus_klist_unlock();
+                       continue;
+               }
+
+               SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
+                       int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);
+
+                       if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
+                               proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
+                               pid_t knote_pid = knote_proc->p_pid;
+                               if (knote_pid == target_pid) {
+                                       KNOTE_DETACH(&memorystatus_klist, kn_cur);
+                                       KNOTE_ATTACH(&dispatch_klist, kn_cur);
+                               }
+                       }
+               }
+
+               KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
+
+               SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
+                       KNOTE_DETACH(&dispatch_klist, kn_cur);
+                       KNOTE_ATTACH(&memorystatus_klist, kn_cur);
+               }
+
+               memorystatus_klist_unlock();
+
+               microuptime(&target_proc->vm_pressure_last_notify_tstamp);
+               proc_rele(target_proc);
+
+               if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
+                       break;
+               }
+
+               if (memorystatus_manual_testing_on == TRUE) {
+                       /*
+                        * Testing out the pressure notification scheme.
+                        * No need for delays etc.
+                        */
+               } else {
+                       uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
+#if CONFIG_JETSAM
+                       unsigned int page_delta = 0;
+                       unsigned int skip_delay_page_threshold = 0;
+
+                       assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
+
+                       page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
+                       skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
+
+                       if (memorystatus_available_pages <= skip_delay_page_threshold) {
+                               /*
+                                * We are nearing the critical mark fast and can't afford to wait between
+                                * notifications.
+                                */
+                               sleep_interval = 0;
+                       }
+#endif /* CONFIG_JETSAM */
+
+                       if (sleep_interval) {
+                               delay(sleep_interval);
+                       }
+               }
+       }
+
+       return KERN_SUCCESS;
+}
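The resting-window bookkeeping above mixes two time bases: a period expressed in seconds and deadlines expressed in mach_absolute_time() units. The conversion, isolated as a sketch (WARNING_NOTIFICATION_RESTING_PERIOD is defined elsewhere in this file):

        uint64_t resting_abs, deadline;

        /* Convert the resting period from seconds to mach_absolute_time() units. */
        nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &resting_abs);

        /* No further warning notifications are sent until this absolute time. */
        deadline = mach_absolute_time() + resting_abs;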
+
+static uint32_t
+convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
+{
+       uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
+
+       switch (internal_pressure_level) {
+       case kVMPressureNormal:
+       {
+               dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
+               break;
+       }
+
+       case kVMPressureWarning:
+       case kVMPressureUrgent:
+       {
+               dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
+               break;
+       }
+
+       case kVMPressureCritical:
+       {
+               dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
+               break;
+       }
+
+       default:
+               break;
+       }
+
+       return dispatch_level;
+}
+
+/*
+ * Notify any kexts that are waiting for notification that jetsam
+ * is approaching the foreground bands. They should use this notification
+ * to free cached memory.
+ */
+void
+memorystatus_issue_fg_band_notify(void)
+{
+       uint64_t now;
+
+       lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
+       absolutetime_to_nanoseconds(mach_absolute_time(), &now);
+       if (now - memorystatus_jetsam_fg_band_timestamp_ns < memorystatus_jetsam_fg_band_delay_ns) {
+               lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
+               return;
+       }
+
+       if (memorystatus_jetsam_fg_band_waiters > 0) {
+               thread_wakeup(&memorystatus_jetsam_fg_band_waiters);
+               memorystatus_jetsam_fg_band_waiters = 0;
+               memorystatus_jetsam_fg_band_timestamp_ns = now;
+       }
+       lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
+
+       /* Notify the buffer cache, file systems, etc. to jettison everything they can. */
+       if (consider_buffer_cache_collect != NULL) {
+               (void)(*consider_buffer_cache_collect)(1);
+       }
+}
+
+
+/*
+ * Memorystatus notification debugging support
+ */
+
+static int
+sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+#if CONFIG_EMBEDDED
+       int error = 0;
+
+       error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
+       if (error) {
+               return error;
+       }
+
+#endif /* CONFIG_EMBEDDED */
+       uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
+
+       return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
+}
+
+#if DEBUG || DEVELOPMENT
+
+SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
+    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
+
+#else /* DEBUG || DEVELOPMENT */
+
+SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
+
+#endif /* DEBUG || DEVELOPMENT */
+
+/*
+ * Trigger levels to test the mechanism.
+ * Can be used via a sysctl.
+ */
+#define TEST_LOW_MEMORY_TRIGGER_ONE        1
+#define TEST_LOW_MEMORY_TRIGGER_ALL        2
+#define TEST_PURGEABLE_TRIGGER_ONE        3
+#define TEST_PURGEABLE_TRIGGER_ALL        4
+#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE    5
+#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL    6
+
+static int
+sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+
+       int level = 0;
+       int error = 0;
+       int pressure_level = 0;
+       int trigger_request = 0;
+       int force_purge;
+
+       error = sysctl_handle_int(oidp, &level, 0, req);
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       memorystatus_manual_testing_on = TRUE;
+
+       trigger_request = (level >> 16) & 0xFFFF;
+       pressure_level = (level & 0xFFFF);
+
+       if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
+           trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
+               return EINVAL;
+       }
+       switch (pressure_level) {
+       case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
+       case NOTE_MEMORYSTATUS_PRESSURE_WARN:
+       case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
+               break;
+       default:
+               return EINVAL;
+       }
+
+       /*
+        * The pressure level is being set from user space, which uses
+        * the constants in sys/event.h, so we translate those events
+        * to our internal levels here.
+        */
+       if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
+               memorystatus_manual_testing_level = kVMPressureNormal;
+               force_purge = 0;
+       } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
+               memorystatus_manual_testing_level = kVMPressureWarning;
+               force_purge = vm_pageout_state.memorystatus_purge_on_warning;
+       } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
+               memorystatus_manual_testing_level = kVMPressureCritical;
+               force_purge = vm_pageout_state.memorystatus_purge_on_critical;
+       }
+
+       memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
+
+       /* purge according to the new pressure level */
+       switch (trigger_request) {
+       case TEST_PURGEABLE_TRIGGER_ONE:
+       case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
+               if (force_purge == 0) {
+                       /* no purging requested */
+                       break;
+               }
+               vm_purgeable_object_purge_one_unlocked(force_purge);
+               break;
+       case TEST_PURGEABLE_TRIGGER_ALL:
+       case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
+               if (force_purge == 0) {
+                       /* no purging requested */
+                       break;
+               }
+               while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
+                       ;
+               }
+               break;
+       }
+
+       if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
+           (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
+               memorystatus_update_vm_pressure(TRUE);
+       }
+
+       if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
+           (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
+               while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
+                       continue;
+               }
+       }
+
+       if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
+               memorystatus_manual_testing_on = FALSE;
+       }
+
+       return 0;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
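The trigger request rides in the upper 16 bits of the written int and the pressure level, expressed as a NOTE_MEMORYSTATUS_PRESSURE_* constant, in the lower 16. A hypothetical user-space helper showing the packing:

#include <sys/event.h>
#include <sys/sysctl.h>

/* trigger_request: one of the TEST_* values above (1-6).
 * pressure_level: a NOTE_MEMORYSTATUS_PRESSURE_* constant. */
static int
trigger_manual_pressure(int trigger_request, int pressure_level)
{
        int level = (trigger_request << 16) | (pressure_level & 0xFFFF);

        return sysctlbyname("kern.memorypressure_manual_trigger",
                   NULL, NULL, &level, sizeof(level));
}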
+
+
+SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
+SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
+SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");
+
+#if DEBUG || DEVELOPMENT
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");
+
+#if 0
+#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
+static boolean_t
+memorystatus_issue_pressure_kevent(boolean_t pressured)
+{
+       memorystatus_klist_lock();
+       KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
+       memorystatus_klist_unlock();
+       return TRUE;
+}
+#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
+#endif /* 0 */
+
+/*
+ * This routine is used for targeted notifications regardless of system memory pressure
+ * and regardless of whether or not the process has already been notified.
+ * It bypasses, and has no effect on, the only-one-notification-per-soft-limit policy.
+ *
+ * "memnote" is the current user.
+ */
+
+static int
+sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       /* Need to be root or have memorystatus entitlement */
+       if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
+               return EPERM;
+       }
+
+       int error = 0, pid = 0;
+       struct knote *kn = NULL;
+       boolean_t found_knote = FALSE;
+       int fflags = 0;    /* filter flags for EVFILT_MEMORYSTATUS */
+       uint64_t value = 0;
+
+       error = sysctl_handle_quad(oidp, &value, 0, req);
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       /*
+        * Find the pid in the low 32 bits of the value passed in.
+        */
+       pid = (int)(value & 0xFFFFFFFF);
+
+       /*
+        * Find the notification in the high 32 bits of the value passed in.
+        */
+       fflags = (int)((value >> 32) & 0xFFFFFFFF);
+
+       /*
+        * For backwards compatibility, when no notification is
+        * passed in, default to NOTE_MEMORYSTATUS_PRESSURE_WARN.
+        */
+       if (fflags == 0) {
+               fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
+               // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
+       }
+
+       /* wake up everybody waiting for kVMPressureJetsam */
+       if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
+               memorystatus_issue_fg_band_notify();
+               return error;
+       }
+
+       /*
+        * See event.h ... fflags for EVFILT_MEMORYSTATUS
+        */
+       if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
+           (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
+           (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
+           (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
+           (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
+           (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
+           (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
+           ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
+               printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags);
+               error = 1;
+               return error;
+       }
+
+       /*
+        * Forcibly send pid a memorystatus notification.
+        */
+
+       memorystatus_klist_lock();
+
+       SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
+               proc_t knote_proc = knote_get_kq(kn)->kq_p;
+               pid_t knote_pid = knote_proc->p_pid;
+
+               if (knote_pid == pid) {
+                       /*
+                        * Forcibly send this pid a memorystatus notification.
+                        */
+                       kn->kn_fflags = fflags;
+                       found_knote = TRUE;
+               }
+       }
+
+       if (found_knote) {
+               KNOTE(&memorystatus_klist, 0);
+               printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid);
+               error = 0;
+       } else {
+               printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
+               error = 1;
+       }
+
+       memorystatus_klist_unlock();
+
+       return error;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
+    0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
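Here the quad written from user space packs the target pid into the low 32 bits and the fflags into the high 32. A hypothetical helper showing the encoding (DEBUG/DEVELOPMENT kernels only, and the caller must be root or hold MEMORYSTATUS_ENTITLEMENT):

#include <stdint.h>
#include <sys/types.h>
#include <sys/event.h>
#include <sys/sysctl.h>

static int
send_pressure_note(pid_t pid, uint32_t fflags)
{
        /* pid in the low 32 bits, filter flags in the high 32 bits. */
        uint64_t value = ((uint64_t)fflags << 32) | (uint32_t)pid;

        return sysctlbyname("kern.memorystatus_vm_pressure_send",
                   NULL, NULL, &value, sizeof(value));
}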
+
+#endif /* DEBUG || DEVELOPMENT */
+
+#endif /* VM_PRESSURE_EVENTS */
index dd0cc669bf4b2381eeb363b00dff9e05c062e7fa..667b17a7d7b3aad2397317ec376da5242a4dd2cc 100644 (file)
 #include <vm/vm_protos.h>
 #include <mach/host_info.h>
 #include <kern/pms.h>
+#include <pexpert/device_tree.h>
 
 extern vm_map_t bsd_pageable_map;
 
@@ -135,6 +136,16 @@ static int      cputype, cpusubtype, cputhreadtype, cpufamily, cpu64bit;
 static uint64_t cacheconfig[10], cachesize[10];
 static int      packages;
 
+static char *   osenvironment;
+static uint32_t osenvironment_size = 0;
+static uint32_t ephemeral_storage = 0;
+static uint32_t use_recovery_securityd = 0;
+
+static struct {
+       uint32_t ephemeral_storage:1;
+       uint32_t use_recovery_securityd:1;
+} property_existence = {0, 0};
+
 SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "Sysctl internal magic");
 SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
@@ -163,8 +174,8 @@ SYSCTL_NODE(_kern, OID_AUTO, bridge, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
  * hw.* MIB
  */
 
-#define CTLHW_RETQUAD   (1 << 31)
-#define CTLHW_LOCAL     (1 << 30)
+#define CTLHW_RETQUAD   (1U << 31)
+#define CTLHW_LOCAL     (1U << 30)
 
 #define HW_LOCAL_CPUTHREADTYPE  (1 | CTLHW_LOCAL)
 #define HW_LOCAL_PHYSICALCPU    (2 | CTLHW_LOCAL)
@@ -366,6 +377,83 @@ sysctl_tbfrequency
        return sysctl_io_number(req, l, sizeof(l), NULL, NULL);
 }
 
+/*
+ * Create sysctl entries coming from device tree.
+ *
+ * Entries from the device tree are loaded here because DTLookupEntry() only works before
+ * PE_init_iokit(). Doing this also avoids the extern-C hackery to access these entries
+ * from IORegistry (which requires C++).
+ */
+void
+sysctl_load_devicetree_entries(void)
+{
+       DTEntry chosen;
+       void *value;
+       unsigned int size;
+
+       if (kSuccess != DTLookupEntry(0, "/chosen", &chosen)) {
+               return;
+       }
+
+       /* load osenvironment */
+       if (kSuccess == DTGetProperty(chosen, "osenvironment", (void **) &value, &size)) {
+               MALLOC(osenvironment, char *, size, M_TEMP, M_WAITOK);
+               if (osenvironment) {
+                       memcpy(osenvironment, value, size);
+                       osenvironment_size = size;
+               }
+       }
+
+       /* load ephemeral_storage */
+       if (kSuccess == DTGetProperty(chosen, "ephemeral-storage", (void **) &value, &size)) {
+               if (size == sizeof(uint32_t)) {
+                       ephemeral_storage = *(uint32_t *)value;
+                       property_existence.ephemeral_storage = 1;
+               }
+       }
+
+       /* load use_recovery_securityd */
+       if (kSuccess == DTGetProperty(chosen, "use-recovery-securityd", (void **) &value, &size)) {
+               if (size == sizeof(uint32_t)) {
+                       use_recovery_securityd = *(uint32_t *)value;
+                       property_existence.use_recovery_securityd = 1;
+               }
+       }
+}
+
+static int
+sysctl_osenvironment
+(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       if (osenvironment_size > 0) {
+               return SYSCTL_OUT(req, osenvironment, osenvironment_size);
+       } else {
+               return EINVAL;
+       }
+}
+
+static int
+sysctl_ephemeral_storage
+(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       if (property_existence.ephemeral_storage) {
+               return SYSCTL_OUT(req, &ephemeral_storage, sizeof(ephemeral_storage));
+       } else {
+               return EINVAL;
+       }
+}
+
+static int
+sysctl_use_recovery_securityd
+(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       if (property_existence.use_recovery_securityd) {
+               return SYSCTL_OUT(req, &use_recovery_securityd, sizeof(use_recovery_securityd));
+       } else {
+               return EINVAL;
+       }
+}
+
 /*
  * hw.* MIB variables.
  */
@@ -409,6 +497,9 @@ SYSCTL_QUAD(_hw, OID_AUTO, fixfrequency, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOC
 SYSCTL_PROC(_hw, OID_AUTO, tbfrequency, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_tbfrequency, "Q", "");
 SYSCTL_QUAD(_hw, HW_MEMSIZE, memsize, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &max_mem, "");
 SYSCTL_INT(_hw, OID_AUTO, packages, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &packages, 0, "");
+SYSCTL_PROC(_hw, OID_AUTO, osenvironment, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_osenvironment, "A", "");
+SYSCTL_PROC(_hw, OID_AUTO, ephemeral_storage, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_ephemeral_storage, "I", "");
+SYSCTL_PROC(_hw, OID_AUTO, use_recovery_securityd, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_use_recovery_securityd, "I", "");
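From the consuming side these are ordinary read-only sysctls. A user-space sketch reading the first of them; note that hw.osenvironment returns EINVAL unless the device tree's /chosen node carried an osenvironment property:

#include <stdio.h>
#include <sys/sysctl.h>

int
main(void)
{
        char buf[64];
        size_t len = sizeof(buf);

        /* The value is copied straight from the device tree and may not be
         * NUL-terminated, so bound the print by the returned length. */
        if (sysctlbyname("hw.osenvironment", buf, &len, NULL, 0) == 0) {
                printf("osenvironment: %.*s\n", (int)len, buf);
        }
        return 0;
}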
 
 /*
  * Optional CPU features can register nodes below hw.optional.
@@ -512,6 +603,7 @@ int gNeonHpfp = -1;
 int gNeonFp16 = -1;
 int gARMv81Atomics = 0;
 int gARMv8Crc32 = 0;
+int gARMv82FHM = 0;
 
 #if defined (__arm__)
 int arm64_flag = 0;
@@ -528,6 +620,7 @@ SYSCTL_INT(_hw_optional, OID_AUTO, neon_hpfp, CTLFLAG_RD | CTLFLAG_KERN | CTLFLA
 SYSCTL_INT(_hw_optional, OID_AUTO, neon_fp16, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gNeonFp16, 0, "");
 SYSCTL_INT(_hw_optional, OID_AUTO, armv8_1_atomics, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gARMv81Atomics, 0, "");
 SYSCTL_INT(_hw_optional, OID_AUTO, armv8_crc32, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gARMv8Crc32, 0, "");
+SYSCTL_INT(_hw_optional, OID_AUTO, armv8_2_fhm, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gARMv82FHM, 0, "");
 
 /*
  * Without this little ifdef dance, the preprocessor replaces "arm64" with "1",
@@ -627,6 +720,7 @@ sysctl_mib_init(void)
        cachesize[4] = 0;
 
        packages = 1;
+
 #else
 #error unknown architecture
 #endif /* !__i386__ && !__x86_64 && !__arm__ && !__arm64__ */
index 29853fd43f7bb268b42a751045a0d4d7819d2b10..0fd0cc33662786ff6fe422cc9b98fba721aa9cf4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Apple Inc. All Rights Reserved.
+ * Copyright (c) 2007-2019 Apple Inc. All Rights Reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #if CONFIG_MACF
 #include <security/mac_framework.h>
 #endif
+#include <os/overflow.h>
+
+#ifndef CONFIG_EMBEDDED
+#include <IOKit/IOBSD.h> /* for IOTaskHasEntitlement */
+#include <sys/csr.h> /* for csr_check */
+#define MAP_32BIT_ENTITLEMENT "com.apple.security.mmap-map-32bit"
+#endif
 
 /*
  * XXX Internally, we use VM_PROT_* somewhat interchangeably, but the correct
@@ -151,6 +158,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
        vm_map_t                user_map;
        kern_return_t           result;
        vm_map_offset_t         user_addr;
+       vm_map_offset_t         sum;
        vm_map_size_t           user_size;
        vm_object_offset_t      pageoff;
        vm_object_offset_t      file_pos;
@@ -183,6 +191,9 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
        AUDIT_ARG(len, user_size);
        AUDIT_ARG(fd, uap->fd);
 
+       if (vm_map_range_overflows(user_addr, user_size)) {
+               return EINVAL;
+       }
        prot = (uap->prot & VM_PROT_ALL);
 #if 3777787
        /*
@@ -200,7 +211,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
        vp = NULLVP;
 
        /*
-        * The vm code does not have prototypes & compiler doesn't do the'
+        * The vm code does not have prototypes & compiler doesn't do
         * the right thing when you cast 64bit value and pass it in function
         * call. So here it is.
         */
@@ -208,7 +219,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
 
 
        /* make sure mapping fits into numeric range etc */
-       if (file_pos + user_size > (vm_object_offset_t)-PAGE_SIZE_64) {
+       if (os_add3_overflow(file_pos, user_size, PAGE_SIZE_64 - 1, &sum)) {
                return EINVAL;
        }
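os_add3_overflow() from <os/overflow.h> is a checked three-operand add: the mapping size, rounded up to a page boundary, must fit in the 64-bit offset space without wrapping. A hand-rolled equivalent using the compiler builtin, for illustration:

#include <stdbool.h>
#include <stdint.h>

/* Returns true if a + b + c wraps; otherwise stores the sum in *r.
 * This mirrors what os_add3_overflow() does for 64-bit operands. */
static bool
add3_overflow_u64(uint64_t a, uint64_t b, uint64_t c, uint64_t *r)
{
        uint64_t t;

        return __builtin_add_overflow(a, b, &t) ||
               __builtin_add_overflow(t, c, r);
}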
 
@@ -241,10 +252,31 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
                    (flags & MAP_JIT)) {
                        return EINVAL;
                }
+       }
+       if (flags & MAP_RESILIENT_CODESIGN) {
                if (prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
                        return EPERM;
                }
        }
+       if (flags & MAP_SHARED) {
+               /*
+                * MAP_RESILIENT_MEDIA is not valid with MAP_SHARED because
+                * there is no place to inject zero-filled pages without
+                * actually adding them to the file.
+                * Since we didn't reject that combination before, there might
+                * already be callers using it and getting a valid MAP_SHARED
+                * mapping but without the resilience.
+                * For backwards compatibility's sake, let's keep ignoring
+                * MAP_RESILIENT_MEDIA in that case.
+                */
+               flags &= ~MAP_RESILIENT_MEDIA;
+       }
+       if (flags & MAP_RESILIENT_MEDIA) {
+               if ((flags & MAP_ANON) ||
+                   (flags & MAP_SHARED)) {
+                       return EINVAL;
+               }
+       }
 
        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
@@ -450,7 +482,21 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
                                goto bad;
                        }
 #endif /* MAC */
+                       /*
+                        * Consult the file system to determine if this
+                        * particular file object can be mapped.
+                        */
+                       error = VNOP_MMAP_CHECK(vp, prot, ctx);
+                       if (error) {
+                               (void)vnode_put(vp);
+                               goto bad;
+                       }
                }
+
+               /*
+                * No copy-on-read for mmap() mappings themselves.
+                */
+               vmk_flags.vmkf_no_copy_on_read = 1;
        }
 
        if (user_size == 0) {
@@ -514,6 +560,21 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
        if (flags & MAP_RESILIENT_CODESIGN) {
                alloc_flags |= VM_FLAGS_RESILIENT_CODESIGN;
        }
+       if (flags & MAP_RESILIENT_MEDIA) {
+               alloc_flags |= VM_FLAGS_RESILIENT_MEDIA;
+       }
+
+#ifndef CONFIG_EMBEDDED
+       if (flags & MAP_32BIT) {
+               if (csr_check(CSR_ALLOW_UNTRUSTED_KEXTS) == 0 ||
+                   IOTaskHasEntitlement(current_task(), MAP_32BIT_ENTITLEMENT)) {
+                       vmk_flags.vmkf_32bit_map_va = TRUE;
+               } else {
+                       error = EPERM;
+                       goto bad;
+               }
+       }
+#endif
 
        /*
         * Lookup/allocate object.
@@ -616,8 +677,7 @@ map_anon_retry:
 #endif  /* radar 3777787 */
 
 map_file_retry:
-               if ((flags & MAP_RESILIENT_CODESIGN) ||
-                   (flags & MAP_RESILIENT_MEDIA)) {
+               if (flags & MAP_RESILIENT_CODESIGN) {
                        if (prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
                                assert(!mapanon);
                                vnode_put(vp);
@@ -716,10 +776,13 @@ msync_nocancel(__unused proc_t p, struct msync_nocancel_args *uap, __unused int3
 
        user_map = current_map();
        addr = (mach_vm_offset_t) uap->addr;
-       size = (mach_vm_size_t)uap->len;
+       size = (mach_vm_size_t) uap->len;
 #ifndef CONFIG_EMBEDDED
        KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_msync) | DBG_FUNC_NONE), (uint32_t)(addr >> 32), (uint32_t)(size >> 32), 0, 0, 0);
 #endif
+       if (mach_vm_range_overflows(addr, size)) {
+               return EINVAL;
+       }
        if (addr & vm_map_page_mask(user_map)) {
                /* UNIX SPEC: user address is not page-aligned, return EINVAL */
                return EINVAL;
@@ -797,7 +860,7 @@ munmap(__unused proc_t p, struct munmap_args *uap, __unused int32_t *retval)
                return EINVAL;
        }
 
-       if (user_addr + user_size < user_addr) {
+       if (mach_vm_range_overflows(user_addr, user_size)) {
                return EINVAL;
        }
 
@@ -834,6 +897,9 @@ mprotect(__unused proc_t p, struct mprotect_args *uap, __unused int32_t *retval)
        user_size = (mach_vm_size_t) uap->len;
        prot = (vm_prot_t)(uap->prot & (VM_PROT_ALL | VM_PROT_TRUSTED | VM_PROT_STRIP_READ));
 
+       if (mach_vm_range_overflows(user_addr, user_size)) {
+               return EINVAL;
+       }
        if (user_addr & vm_map_page_mask(user_map)) {
                /* UNIX SPEC: user address is not page-aligned, return EINVAL */
                return EINVAL;
@@ -939,7 +1005,9 @@ minherit(__unused proc_t p, struct minherit_args *uap, __unused int32_t *retval)
        addr = (mach_vm_offset_t)uap->addr;
        size = (mach_vm_size_t)uap->len;
        inherit = uap->inherit;
-
+       if (mach_vm_range_overflows(addr, size)) {
+               return EINVAL;
+       }
        user_map = current_map();
        result = mach_vm_inherit(user_map, addr, size,
            inherit);
@@ -1009,7 +1077,9 @@ madvise(__unused proc_t p, struct madvise_args *uap, __unused int32_t *retval)
 
        start = (mach_vm_offset_t) uap->addr;
        size = (mach_vm_size_t) uap->len;
-
+       if (mach_vm_range_overflows(start, size)) {
+               return EINVAL;
+       }
 #if __arm64__
        if (start == 0 &&
            size != 0 &&
@@ -1203,8 +1273,7 @@ mlock(__unused proc_t p, struct mlock_args *uap, __unused int32_t *retvalval)
        addr = (vm_map_offset_t) uap->addr;
        size = (vm_map_size_t)uap->len;
 
-       /* disable wrap around */
-       if (addr + size < addr) {
+       if (vm_map_range_overflows(addr, size)) {
                return EINVAL;
        }
 
@@ -1240,12 +1309,14 @@ munlock(__unused proc_t p, struct munlock_args *uap, __unused int32_t *retval)
        kern_return_t   result;
 
        AUDIT_ARG(addr, uap->addr);
-       AUDIT_ARG(addr, uap->len);
+       AUDIT_ARG(len, uap->len);
 
        addr = (mach_vm_offset_t) uap->addr;
        size = (mach_vm_size_t)uap->len;
        user_map = current_map();
-
+       if (mach_vm_range_overflows(addr, size)) {
+               return EINVAL;
+       }
        /* JMM - need to remove all wirings by spec - this just removes one */
        result = mach_vm_wire_kernel(host_priv_self(), user_map, addr, size, VM_PROT_NONE, VM_KERN_MEMORY_MLOCK);
        return result == KERN_SUCCESS ? 0 : ENOMEM;
@@ -1295,6 +1366,9 @@ mremap_encrypted(__unused struct proc *p, struct mremap_encrypted_args *uap, __u
        cputype = uap->cputype;
        cpusubtype = uap->cpusubtype;
 
+       if (mach_vm_range_overflows(user_addr, user_size)) {
+               return EINVAL;
+       }
        if (user_addr & vm_map_page_mask(user_map)) {
                /* UNIX SPEC: user address is not page-aligned, return EINVAL */
                return EINVAL;
index 746752d34dac2baa5e40fd1a6472d2226e8b5d6f..07cd0e08289a3ee91bb4f927abb15ae168c66b7f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -84,6 +84,9 @@
 #include <security/mac_framework.h>
 #endif
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif /* defined(HAS_APPLE_PAC) */
 
 lck_grp_t * sysctl_lock_group = NULL;
 lck_rw_t * sysctl_geometry_lock = NULL;
@@ -209,13 +212,43 @@ sysctl_register_oid(struct sysctl_oid *new_oidp)
                }
        }
 
+#if defined(HAS_APPLE_PAC)
+       if (oidp->oid_handler) {
+               /*
+                * Dereference function-pointer-signed oid_handler to prevent an
+                * attacker with the ability to observe the result of the
+                * auth_and_resign below from trying all possible inputs until an auth
+                * succeeds.
+                */
+               if (__builtin_expect(!*(uintptr_t*)ptrauth_auth_data((void*)
+                   oidp->oid_handler, ptrauth_key_function_pointer, 0), 0)) {
+                       /*
+                        * This is necessary to force the dereference but will never
+                        * actually be reached, dereferencing an invalidly signed pointer
+                        * will trap before getting here (and the codegen is nicer than
+                        * with a panic).
+                        */
+                       __builtin_trap();
+               }
+               /*
+                * Sign oid_handler address-discriminated upon installation to make it
+                * harder to replace with an arbitrary function pointer.
+                */
+               oidp->oid_handler = ptrauth_auth_and_resign(oidp->oid_handler,
+                   ptrauth_key_function_pointer, 0, ptrauth_key_function_pointer,
+                   ptrauth_blend_discriminator(&oidp->oid_handler,
+                   ptrauth_string_discriminator("oid_handler")));
+       }
+#endif /* defined(HAS_APPLE_PAC) */
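The same sign-with-blended-discriminator pattern in isolation, assuming an arm64e target where the <ptrauth.h> intrinsics are available (store_handler is a hypothetical name, not kernel code):

#include <ptrauth.h>

typedef int (*handler_t)(int);

/* Re-sign fn with a discriminator blended from the destination address, so
 * a signed pointer copied to a different slot fails authentication. */
static void
store_handler(handler_t *slot, handler_t fn)
{
        *slot = ptrauth_auth_and_resign(fn,
            ptrauth_key_function_pointer, 0,
            ptrauth_key_function_pointer,
            ptrauth_blend_discriminator(slot,
            ptrauth_string_discriminator("oid_handler")));
}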
 
        /*
         * Insert the oid into the parent's list in order.
         */
        q = NULL;
        SLIST_FOREACH(p, parent, oid_link) {
-               if (oidp->oid_number < p->oid_number) {
+               if (oidp->oid_number == p->oid_number) {
+                       panic("attempting to register a sysctl at previously registered slot : %d", oidp->oid_number);
+               } else if (oidp->oid_number < p->oid_number) {
                        break;
                }
                q = p;
@@ -269,6 +302,34 @@ sysctl_unregister_oid(struct sysctl_oid *oidp)
                }
        }
 
+#if defined(HAS_APPLE_PAC)
+       if (removed_oidp && removed_oidp->oid_handler && old_oidp == NULL) {
+               /*
+                * Revert address-discriminated signing performed by
+                * sysctl_register_oid() (in case this oid is registered again).
+                */
+               removed_oidp->oid_handler = ptrauth_auth_function(removed_oidp->oid_handler,
+                   ptrauth_key_function_pointer,
+                   ptrauth_blend_discriminator(&removed_oidp->oid_handler,
+                   ptrauth_string_discriminator("oid_handler")));
+               /*
+                * Dereference the function-pointer-signed result to prevent an
+                * attacker with the ability to observe the result of the
+                * auth_and_resign above from trying all possible inputs until an auth
+                * succeeds.
+                */
+               if (__builtin_expect(!*(uintptr_t*)ptrauth_auth_data((void*)
+                   removed_oidp->oid_handler, ptrauth_key_function_pointer, 0), 0)) {
+                       /*
+                        * This is necessary to force the dereference but will never
+                        * actually be reached, dereferencing an invalidly signed pointer
+                        * will trap before getting here (and the codegen is nicer than
+                        * with a panic).
+                        */
+                       __builtin_trap();
+               }
+       }
+#endif /* defined(HAS_APPLE_PAC) */
 
        /*
         * We've removed it from the list at this point, but we don't want
@@ -349,6 +410,7 @@ sysctl_early_init(void)
        sysctl_unlocked_node_lock = lck_mtx_alloc_init(sysctl_lock_group, NULL);
 
        sysctl_register_set("__sysctl_set");
+       sysctl_load_devicetree_entries();
 }
 
 /*
@@ -441,10 +503,10 @@ sysctl_io_string(struct sysctl_req *req, char *pValue, size_t valueSize, int tru
                 * returned string to the buffer size.  This preserves the semantics
                 * of some library routines implemented via sysctl, which truncate
                 * their returned data, rather than simply returning an error. The
-                * returned string is always NUL terminated. */
+                * returned string is always nul (ascii '\0') terminated. */
                error = SYSCTL_OUT(req, pValue, req->oldlen - 1);
                if (!error) {
-                       char c = 0;
+                       char c = '\0';
                        error = SYSCTL_OUT(req, &c, 1);
                }
        } else {
@@ -467,7 +529,7 @@ sysctl_io_string(struct sysctl_req *req, char *pValue, size_t valueSize, int tru
                return EINVAL;
        }
 
-       /* copy the string in and force NUL termination */
+       /* copy the string in and force nul termination */
        error = SYSCTL_IN(req, pValue, req->newlen);
        pValue[req->newlen] = '\0';
 
@@ -1589,6 +1651,15 @@ found:
                lck_mtx_lock(sysctl_unlocked_node_lock);
        }
 
+#if defined(HAS_APPLE_PAC)
+       /*
+        * oid_handler is signed address-discriminated by sysctl_register_oid().
+        */
+       oid_handler = ptrauth_auth_function(oid_handler,
+           ptrauth_key_function_pointer,
+           ptrauth_blend_discriminator(&oid->oid_handler,
+           ptrauth_string_discriminator("oid_handler")));
+#endif /* defined(HAS_APPLE_PAC) */
 
        if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
                i = oid_handler(oid, name + indx, namelen - indx, req);
@@ -1656,7 +1727,7 @@ sysctl_create_user_req(struct sysctl_req *req, struct proc *p, user_addr_t oldp,
 int
 sysctl(proc_t p, struct sysctl_args *uap, __unused int32_t *retval)
 {
-       int error;
+       int error, new_error;
        size_t oldlen = 0, newlen;
        int name[CTL_MAXNAME];
        struct sysctl_req req;
@@ -1721,16 +1792,25 @@ sysctl(proc_t p, struct sysctl_args *uap, __unused int32_t *retval)
 
 err:
        if (uap->oldlenp != USER_ADDR_NULL) {
-               error = suulong(uap->oldlenp, oldlen);
+               /*
+                * Only overwrite the old error value on a new error
+                */
+               new_error = suulong(uap->oldlenp, oldlen);
+
+               if (new_error) {
+                       error = new_error;
+               }
        }
 
        return error;
 }
 
+// sysctlbyname is also exported as KPI to kexts
+// and the syscall name cannot conflict with it
 int
-sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retval)
+sys_sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retval)
 {
-       int error;
+       int error, new_error;
        size_t oldlen = 0, newlen;
        char *name;
        size_t namelen = 0;
@@ -1788,7 +1868,14 @@ sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retval)
        }
 
        if (uap->oldlenp != USER_ADDR_NULL) {
-               error = suulong(uap->oldlenp, oldlen);
+               /*
+                * Only overwrite the old error value on a new error
+                */
+               new_error = suulong(uap->oldlenp, oldlen);
+
+               if (new_error) {
+                       error = new_error;
+               }
        }
 
        return error;
index 589ff9d97e0a1e2d05b03d018789ea98d24975be..2ad397e7430d304fcb1a0c77e2322d0495df61d9 100644 (file)
@@ -334,7 +334,7 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, int32_t *retval)
                return error;
        }
 
-#if DEVELOPEMNT || DEBUG
+#if DEVELOPMENT || DEBUG
        if (g_should_log_clock_adjustments) {
                os_log(OS_LOG_DEFAULT, "%s: BEFORE modes %u offset %ld freq %ld status %d constant %ld time_adjtime %lld\n",
                    __func__, ntv.modes, ntv.offset, ntv.freq, ntv.status, ntv.constant, time_adjtime);
@@ -438,7 +438,7 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, int32_t *retval)
 
        ret = ntp_is_time_error(time_status) ? TIME_ERROR : time_state;
 
-#if DEVELOPEMNT || DEBUG
+#if DEVELOPMENT || DEBUG
        if (g_should_log_clock_adjustments) {
                os_log(OS_LOG_DEFAULT, "%s: AFTER modes %u offset %lld freq %lld status %d constant %ld time_adjtime %lld\n",
                    __func__, modes, time_offset, time_freq, time_status, time_constant, time_adjtime);
@@ -572,7 +572,7 @@ ntp_update_second(int64_t *adjustment, clock_sec_t secs)
                updated = 0;
        }
 
-#if DEVELOPEMNT || DEBUG
+#if DEVELOPMENT || DEBUG
        if (g_should_log_clock_adjustments) {
                int64_t nano = (time_adj > 0)? time_adj >> 32 : -((-time_adj) >> 32);
                int64_t frac = (time_adj > 0)? ((uint32_t) time_adj) : -((uint32_t) (-time_adj));
@@ -675,7 +675,7 @@ kern_adjtime(struct timeval *delta)
        NTP_LOCK(enable);
        ltr = time_adjtime;
        time_adjtime = ltw;
-#if DEVELOPEMNT || DEBUG
+#if DEVELOPMENT || DEBUG
        if (g_should_log_clock_adjustments) {
                os_log(OS_LOG_DEFAULT, "%s:AFTER offset %lld freq %lld status %d constant %ld time_adjtime %lld\n",
                    __func__, time_offset, time_freq, time_status, time_constant, time_adjtime);
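This spelling fix is more than cosmetic: in a #if expression the preprocessor evaluates any identifier that is not a defined macro as 0, so the misspelled guard compiled silently but permanently disabled the logging, even on DEVELOPMENT kernels. A minimal illustration:

#define DEVELOPMENT 1

#if DEVELOPEMNT || DEBUG   /* both undefined: 0 || 0, block is dead code */
#error "never reached, despite DEVELOPMENT being defined"
#endif

#if DEVELOPMENT || DEBUG   /* 1 || 0, block is compiled */
/* clock-adjustment logging belongs here */
#endif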
diff --git a/bsd/kern/kern_pcsamples.c b/bsd/kern/kern_pcsamples.c
deleted file mode 100644 (file)
index 69694bc..0000000
+++ /dev/null
@@ -1,424 +0,0 @@
-/*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include <sys/kdebug.h>
-#include <sys/errno.h>
-#include <sys/param.h>
-#include <sys/proc_internal.h>
-#include <sys/vm.h>
-#include <sys/sysctl.h>
-#include <sys/systm.h>
-#include <vm/vm_kern.h>
-#include <machine/machine_routines.h>
-
-vm_offset_t pc_buftomem = 0;
-unsigned int *  pc_buffer   = 0;   /* buffer that holds each pc */
-unsigned int *  pc_bufptr   = 0;
-unsigned int *  pc_buflast  = 0;
-unsigned int npcbufs         = 8192;      /* number of pc entries in buffer */
-unsigned int pc_bufsize      = 0;
-unsigned int pcsample_flags  = 0;
-unsigned int pcsample_enable = 0;
-
-pid_t pc_sample_pid = 0;
-boolean_t pc_trace_frameworks = FALSE;
-
-char pcsample_comm[MAXCOMLEN + 1];
-
-/* Set the default framework boundaries */
-unsigned int pcsample_beg    = 0;
-unsigned int pcsample_end    = 0;
-
-static pid_t global_state_pid = -1;       /* Used to control exclusive use of pc_buffer */
-
-extern unsigned int pc_trace_buf[];
-extern int pc_trace_cnt;
-
-void add_pcbuffer(void);
-int branch_tracing_enabled(void);
-int disable_branch_tracing(void);
-int enable_branch_tracing(void);
-int pcsamples_bootstrap(void);
-void pcsamples_clear(void);
-int pcsamples_control(int *name, u_int namelen, user_addr_t where, size_t *sizep);
-int pcsamples_read(user_addr_t buffer, size_t *number);
-int pcsamples_reinit(void);
-
-int
-enable_branch_tracing(void)
-{
-       struct proc *p;
-       if (-1 != pc_sample_pid) {
-               p = proc_find(pc_sample_pid);
-               if (p) {
-                       p->p_btrace = 1;
-                       proc_rele(p);
-               }
-       } else {
-               pc_trace_frameworks = TRUE;
-       }
-
-       return 1;
-}
-
-int
-disable_branch_tracing(void)
-{
-       struct proc *p;
-       switch (pc_sample_pid) {
-       case -1:
-               pc_trace_frameworks = FALSE;
-               break;
-       case 0:
-               break;
-       default:
-               p = proc_find(pc_sample_pid);
-               if (p) {
-                       p->p_btrace = 0;
-                       proc_rele(p);
-               }
-               break;
-       }
-       clr_be_bit();
-       return 1;
-}
-
-/*
- * this only works for the current proc as it
- * is called from context_switch in the scheduler
- */
-int
-branch_tracing_enabled(void)
-{
-       struct proc *p = current_proc();
-       if (TRUE == pc_trace_frameworks) {
-               return TRUE;
-       }
-       if (p) {
-               return p->p_btrace;
-       }
-       return 0;
-}
-
-
-void
-add_pcbuffer(void)
-{
-       int      i;
-       unsigned int  pc;
-
-       if (!pcsample_enable) {
-               return;
-       }
-
-       for (i = 0; i < pc_trace_cnt; i++) {
-               pc = pc_trace_buf[i];
-
-               if ((pcsample_beg <= pc) && (pc < pcsample_end)) {
-                       if (pc_bufptr > pc_buffer) {
-                               if ((*(pc_bufptr - 1)) == pc) {
-                                       continue; /* Ignore, probably spinning */
-                               }
-                       }
-
-                       /* Then the sample is in our range */
-                       *pc_bufptr = pc;
-                       pc_bufptr++;
-               }
-       }
-
-       /* We never wrap the buffer */
-       if ((pc_bufptr + pc_trace_cnt) >= pc_buflast) {
-               pcsample_enable = 0;
-               (void)disable_branch_tracing();
-               wakeup(&pcsample_enable);
-       }
-       return;
-}
-
-int
-pcsamples_bootstrap(void)
-{
-       if (!disable_branch_tracing()) {
-               return ENOTSUP;
-       }
-
-       pc_bufsize = npcbufs * sizeof(*pc_buffer);
-       if (kmem_alloc(kernel_map, &pc_buftomem,
-           (vm_size_t)pc_bufsize) == KERN_SUCCESS) {
-               pc_buffer = (unsigned int *) pc_buftomem;
-       } else {
-               pc_buffer = NULL;
-       }
-
-       if (pc_buffer) {
-               pc_bufptr = pc_buffer;
-               pc_buflast = &pc_bufptr[npcbufs];
-               pcsample_enable = 0;
-               return 0;
-       } else {
-               pc_bufsize = 0;
-               return EINVAL;
-       }
-}
-
-int
-pcsamples_reinit(void)
-{
-       int ret = 0;
-
-       pcsample_enable = 0;
-
-       if (pc_bufsize && pc_buffer) {
-               kmem_free(kernel_map, (vm_offset_t)pc_buffer, pc_bufsize);
-       }
-
-       ret = pcsamples_bootstrap();
-       return ret;
-}
-
-void
-pcsamples_clear(void)
-{
-       /* Clean up the sample buffer, set defaults */
-       global_state_pid = -1;
-       pcsample_enable = 0;
-       if (pc_bufsize && pc_buffer) {
-               kmem_free(kernel_map, (vm_offset_t)pc_buffer, pc_bufsize);
-       }
-       pc_buffer   = NULL;
-       pc_bufptr   = NULL;
-       pc_buflast  = NULL;
-       pc_bufsize  = 0;
-       pcsample_beg = 0;
-       pcsample_end = 0;
-       bzero((void *)pcsample_comm, sizeof(pcsample_comm));
-       (void)disable_branch_tracing();
-       pc_sample_pid = 0;
-       pc_trace_frameworks = FALSE;
-}
-
-int
-pcsamples_control(int *name, __unused u_int namelen, user_addr_t where, size_t *sizep)
-{
-       int ret = 0;
-       size_t size = *sizep;
-       int value = name[1];
-       pcinfo_t pc_bufinfo = {};
-       pid_t *pidcheck;
-
-       pid_t curpid;
-       struct proc *p, *curproc;
-
-       if (name[0] != PCSAMPLE_GETNUMBUF) {
-               curproc = current_proc();
-               if (curproc) {
-                       curpid = curproc->p_pid;
-               } else {
-                       return ESRCH;
-               }
-
-               if (global_state_pid == -1) {
-                       global_state_pid = curpid;
-               } else if (global_state_pid != curpid) {
-                       if ((p = proc_find(global_state_pid)) == NULL) {
-                               /* The global pid no longer exists */
-                               global_state_pid = curpid;
-                       } else {
-                               proc_rele(p);
-                               /* The global pid exists, deny this request */
-                               return EBUSY;
-                       }
-               }
-       }
-
-
-       switch (name[0]) {
-       case PCSAMPLE_DISABLE: /* used to disable */
-               pcsample_enable = 0;
-               break;
-       case PCSAMPLE_SETNUMBUF:
-               /* The buffer size is bounded by a min and max number of samples */
-               if (value < pc_trace_cnt) {
-                       ret = EINVAL;
-                       break;
-               }
-               if (value <= MAX_PCSAMPLES) {
-                       /*      npcbufs = value & ~(PC_TRACE_CNT-1); */
-                       npcbufs = value;
-               } else {
-                       npcbufs = MAX_PCSAMPLES;
-               }
-               break;
-       case PCSAMPLE_GETNUMBUF:
-               if (size < sizeof(pc_bufinfo)) {
-                       ret = EINVAL;
-                       break;
-               }
-               pc_bufinfo.npcbufs = npcbufs;
-               pc_bufinfo.bufsize = pc_bufsize;
-               pc_bufinfo.enable = pcsample_enable;
-               pc_bufinfo.pcsample_beg = pcsample_beg;
-               pc_bufinfo.pcsample_end = pcsample_end;
-               if (copyout(&pc_bufinfo, where, sizeof(pc_bufinfo))) {
-                       ret = EINVAL;
-               }
-               break;
-       case PCSAMPLE_SETUP:
-               ret = pcsamples_reinit();
-               break;
-       case PCSAMPLE_REMOVE:
-               pcsamples_clear();
-               break;
-       case PCSAMPLE_READBUF:
-               /* A nonzero value says enable and wait on the buffer */
-               /* A zero value says read up the buffer immediately */
-               if (value == 0) {
-                       /* Do not wait on the buffer */
-                       pcsample_enable = 0;
-                       (void)disable_branch_tracing();
-                       ret = pcsamples_read(where, sizep);
-                       break;
-               } else if ((pc_bufsize <= 0) || (!pc_buffer)) {
-                       /* enable only if buffer is initialized */
-                       ret = EINVAL;
-                       break;
-               }
-
-               /* Turn on branch tracing */
-               if (!enable_branch_tracing()) {
-                       ret = ENOTSUP;
-                       break;
-               }
-
-               /* Enable sampling */
-               pcsample_enable = 1;
-
-               ret = tsleep(&pcsample_enable, PRIBIO | PCATCH, "pcsample", 0);
-               pcsample_enable = 0;
-               (void)disable_branch_tracing();
-
-               if (ret) {
-                       /*      Eventually fix this...  if (ret != EINTR) */
-                       if (ret) {
-                               /* On errors, except EINTR, we want to cleanup buffer ptrs */
-                               /* pc_bufptr = pc_buffer; */
-                               *sizep = 0;
-                       }
-               } else {
-                       /* The only way to get here is if the buffer is full */
-                       ret = pcsamples_read(where, sizep);
-               }
-
-               break;
-       case PCSAMPLE_SETREG:
-               if (size < sizeof(pc_bufinfo)) {
-                       ret = EINVAL;
-                       break;
-               }
-               if (copyin(where, &pc_bufinfo, sizeof(pc_bufinfo))) {
-                       ret = EINVAL;
-                       break;
-               }
-
-               pcsample_beg = pc_bufinfo.pcsample_beg;
-               pcsample_end = pc_bufinfo.pcsample_end;
-               break;
-       case PCSAMPLE_COMM:
-               if (!(sizeof(pcsample_comm) > size)) {
-                       ret = EINVAL;
-                       break;
-               }
-               bzero((void *)pcsample_comm, sizeof(pcsample_comm));
-               if (copyin(where, pcsample_comm, size)) {
-                       ret = EINVAL;
-                       break;
-               }
-
-               /* Check for command name or pid */
-               if (pcsample_comm[0] != '\0') {
-                       ret = ENOTSUP;
-                       break;
-               } else {
-                       if (size != (2 * sizeof(pid_t))) {
-                               ret = EINVAL;
-                               break;
-                       } else {
-                               pidcheck = (pid_t *)pcsample_comm;
-                               pc_sample_pid = pidcheck[1];
-                       }
-               }
-               break;
-       default:
-               ret = ENOTSUP;
-               break;
-       }
-       return ret;
-}
-
-
-/*
- *  This buffer must be read up in one call.
- *  If the buffer isn't big enough to hold
- *  all the samples, it will copy up enough
- *  to fill the buffer and throw the rest away.
- *  This buffer never wraps.
- */
-int
-pcsamples_read(user_addr_t buffer, size_t *number)
-{
-       size_t count = 0;
-       size_t copycount;
-
-       count = (*number) / sizeof(*pc_buffer);
-
-       if (count && pc_bufsize && pc_buffer) {
-               copycount = pc_bufptr - pc_buffer;
-
-               if (copycount <= 0) {
-                       *number = 0;
-                       return 0;
-               }
-
-               if (copycount > count) {
-                       copycount = count;
-               }
-
-               /* We actually have data to send up */
-               if (copyout(pc_buffer, buffer, copycount * sizeof(*pc_buffer))) {
-                       *number = 0;
-                       return EINVAL;
-               }
-               *number = copycount;
-               pc_bufptr = pc_buffer;
-               return 0;
-       } else {
-               *number = 0;
-               return 0;
-       }
-}
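The deleted facility was driven from user space through sysctl(2). A speculative reconstruction of a client query, assuming the legacy CTL_KERN/KERN_PCSAMPLES MIB, the PCSAMPLE_* selectors, and the pcinfo_t layout used above; none of these names appear in the replacement code:

#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    /* PCSAMPLE_GETNUMBUF was the only selector usable without first
     * claiming the buffer (see pcsamples_control() above). */
    int mib[3] = { CTL_KERN, KERN_PCSAMPLES, PCSAMPLE_GETNUMBUF };
    pcinfo_t info = { 0 };
    size_t len = sizeof(info);

    if (sysctl(mib, 3, &info, &len, NULL, 0) == 0) {
        printf("npcbufs=%u bufsize=%u enabled=%u\n",
            (unsigned)info.npcbufs, (unsigned)info.bufsize,
            (unsigned)info.enable);
    }
    return 0;
}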
diff --git a/bsd/kern/kern_persona.c b/bsd/kern/kern_persona.c
index c9c846717e186aa1928990dca888de7bf4f89845..7fc207026a5cdc6bf607b58d900bbfb8f826e50a 100644 (file)
 #include <sys/kernel.h>
 #include <sys/kernel_types.h>
 #include <sys/persona.h>
+#include <pexpert/pexpert.h>
 
 #if CONFIG_PERSONAS
+#include <machine/atomic.h>
+
 #include <kern/assert.h>
 #include <kern/simple_lock.h>
 #include <kern/task.h>
 #include <kern/zalloc.h>
+#include <mach/thread_act.h>
+#include <kern/thread.h>
 
 #include <sys/param.h>
 #include <sys/proc_internal.h>
@@ -52,9 +57,6 @@
 #define FIRST_PERSONA_ID 501
 #define PERSONA_ID_STEP   10
 
-#define PERSONA_SYSTEM_UID    ((uid_t)99)
-#define PERSONA_SYSTEM_LOGIN  "system"
-
 #define PERSONA_ALLOC_TOKEN   (0x7a0000ae)
 #define PERSONA_INIT_TOKEN    (0x7500005e)
 #define PERSONA_MAGIC         (0x0aa55aa0)
 static LIST_HEAD(personalist, persona) all_personas;
 static uint32_t g_total_personas;
 uint32_t g_max_personas = MAX_PERSONAS;
-
-struct persona *g_system_persona = NULL;
+struct persona *system_persona = NULL;
+struct persona *proxy_system_persona = NULL;
+#if CONFIG_EMBEDDED
+int unique_persona = 1;
+#else
+int unique_persona = 0;
+#endif
 
 static uid_t g_next_persona_id;
 
@@ -80,17 +87,23 @@ os_refgrp_decl(static, persona_refgrp, "persona", NULL);
 static zone_t persona_zone;
 
 kauth_cred_t g_default_persona_cred;
+extern struct auditinfo_addr *audit_default_aia_p;
 
 #define lock_personas()    lck_mtx_lock(&all_personas_lock)
 #define unlock_personas()  lck_mtx_unlock(&all_personas_lock)
 
-
 extern void mach_kauth_cred_uthread_update(void);
 
+extern kern_return_t bank_get_bank_ledger_thread_group_and_persona(void *voucher,
+    void *bankledger, void **banktg, uint32_t *persona_id);
+void
+ipc_voucher_release(void *voucher);
+
 void
 personas_bootstrap(void)
 {
        struct posix_cred pcred;
+       int unique_persona_bootarg;
 
        persona_dbg("Initializing persona subsystem");
        LIST_INIT(&all_personas);
@@ -126,20 +139,17 @@ personas_bootstrap(void)
        if (!g_default_persona_cred) {
                panic("couldn't create default persona credentials!");
        }
-
-       g_system_persona = persona_alloc(PERSONA_SYSTEM_UID,
-           PERSONA_SYSTEM_LOGIN,
-           PERSONA_SYSTEM, NULL);
-       int err = persona_init_begin(g_system_persona);
-       assert(err == 0);
-
-       persona_init_end(g_system_persona, err);
-
-       assert(g_system_persona != NULL);
+#if CONFIG_AUDIT
+       /* posix_cred_create() sets this value to NULL */
+       g_default_persona_cred->cr_audit.as_aia_p = audit_default_aia_p;
+#endif
+       if (PE_parse_boot_argn("unique_persona", &unique_persona_bootarg, sizeof(unique_persona_bootarg))) {
+               unique_persona = !!unique_persona_bootarg;
+       }
 }
 
 struct persona *
-persona_alloc(uid_t id, const char *login, int type, int *error)
+persona_alloc(uid_t id, const char *login, int type, char *path, int *error)
 {
        struct persona *persona;
        int err = 0;
@@ -170,7 +180,7 @@ persona_alloc(uid_t id, const char *login, int type, int *error)
 
        bzero(persona, sizeof(*persona));
 
-       if (hw_atomic_add(&g_total_personas, 1) > MAX_PERSONAS) {
+       if (os_atomic_inc(&g_total_personas, relaxed) > MAX_PERSONAS) {
                /* too many personas! */
                pna_err("too many active personas!");
                err = EBUSY;
@@ -199,6 +209,7 @@ persona_alloc(uid_t id, const char *login, int type, int *error)
        persona->pna_type = type;
        persona->pna_id = id;
        persona->pna_valid = PERSONA_ALLOC_TOKEN;
+       persona->pna_path = path;
 
        /*
         * NOTE: this persona has not been fully initialized. A subsequent
@@ -211,7 +222,7 @@ persona_alloc(uid_t id, const char *login, int type, int *error)
        return persona;
 
 out_error:
-       (void)hw_atomic_add(&g_total_personas, -1);
+       os_atomic_dec(&g_total_personas, relaxed);
        zfree(persona_zone, persona);
        if (error) {
                *error = err;
@@ -375,7 +386,7 @@ persona_init_end(struct persona *persona, int error)
        if (error != 0 || persona->pna_valid == PERSONA_ALLOC_TOKEN) {
                persona_dbg("ERROR:%d after initialization of %d (%s)", error, persona->pna_id, persona->pna_login);
                /* remove this persona from the global count */
-               (void)hw_atomic_add(&g_total_personas, -1);
+               os_atomic_dec(&g_total_personas, relaxed);
        } else if (error == 0 &&
            persona->pna_valid == PERSONA_INIT_TOKEN) {
                persona->pna_valid = PERSONA_MAGIC;
@@ -386,6 +397,76 @@ persona_init_end(struct persona *persona, int error)
        unlock_personas();
 }
 
+/**
+ * persona_verify_and_set_uniqueness
+ *
+ * This function checks that the persona being spawned, if it is of type
+ * PERSONA_SYSTEM or PERSONA_SYSTEM_PROXY, is unique in the system.
+ *
+ * Conditions:
+ *      global persona list is locked on entry and return.
+ *
+ * Returns:
+ *      EEXIST: if persona is system/system-proxy and is not unique.
+ *      0: Otherwise.
+ */
+int
+persona_verify_and_set_uniqueness(struct persona *persona)
+{
+       if (persona == NULL) {
+               return EINVAL;
+       }
+
+       if (!unique_persona) {
+               return 0;
+       }
+
+       if (persona->pna_type == PERSONA_SYSTEM) {
+               if (system_persona != NULL) {
+                       return EEXIST;
+               }
+               system_persona = persona;
+               return 0;
+       }
+
+       if (persona->pna_type == PERSONA_SYSTEM_PROXY) {
+               if (proxy_system_persona != NULL) {
+                       return EEXIST;
+               }
+               proxy_system_persona = persona;
+               return 0;
+       }
+       return 0;
+}
+
+/**
+ * persona_is_unique
+ *
+ * This function checks whether the given persona must be unique.
+ *
+ * Returns:
+ *      TRUE: if unique.
+ *      FALSE: otherwise.
+ */
+boolean_t
+persona_is_unique(struct persona *persona)
+{
+       if (persona == NULL) {
+               return FALSE;
+       }
+
+       if (!unique_persona) {
+               return FALSE;
+       }
+
+       if (persona->pna_type == PERSONA_SYSTEM ||
+           persona->pna_type == PERSONA_SYSTEM_PROXY) {
+               return TRUE;
+       }
+
+       return FALSE;
+}
+
 static struct persona *
 persona_get_locked(struct persona *persona)
 {
@@ -438,11 +519,14 @@ persona_put(struct persona *persona)
        persona_lock(persona);
        if (persona_valid(persona)) {
                LIST_REMOVE(persona, pna_list);
-               if (hw_atomic_add(&g_total_personas, -1) == UINT_MAX) {
+               if (os_atomic_dec_orig(&g_total_personas, relaxed) == 0) {
                        panic("persona count underflow!\n");
                }
                persona_mkinvalid(persona);
        }
+       if (persona->pna_path != NULL) {
+               FREE_ZONE(persona->pna_path, MAXPATHLEN, M_NAMEI);
+       }
        persona_unlock(persona);
        unlock_personas();
 
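The hw_atomic_add() to os_atomic_* conversion also changes which value the underflow check observes: hw_atomic_add(&x, -1) returned the post-decrement value, so 0 - 1 showed up as UINT_MAX, while os_atomic_dec_orig() returns the pre-decrement value, so underflow shows up as 0. A sketch of the equivalence, using C11 atomics in place of the kernel's os_atomic wrappers:

#include <stdatomic.h>
#include <assert.h>

static _Atomic unsigned int g_total;

static void
release_one(void)
{
    /* old style: post-decrement value; underflow check was == UINT_MAX
     *   unsigned int newval = atomic_fetch_sub(&g_total, 1) - 1;
     *   assert(newval != UINT_MAX);
     */

    /* new style: pre-decrement value; underflow check is == 0 */
    unsigned int orig = atomic_fetch_sub_explicit(&g_total, 1,
        memory_order_relaxed);
    assert(orig != 0 && "persona count underflow");
}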
@@ -497,11 +581,11 @@ persona_lookup_and_invalidate(uid_t id)
        LIST_FOREACH_SAFE(entry, &all_personas, pna_list, tmp) {
                persona_lock(entry);
                if (entry->pna_id == id) {
-                       if (persona_valid(entry)) {
+                       if (persona_valid(entry) && !persona_is_unique(entry)) {
                                persona = persona_get_locked(entry);
                                assert(persona != NULL);
                                LIST_REMOVE(persona, pna_list);
-                               if (hw_atomic_add(&g_total_personas, -1) == UINT_MAX) {
+                               if (os_atomic_dec_orig(&g_total_personas, relaxed) == 0) {
                                        panic("persona ref count underflow!\n");
                                }
                                persona_mkinvalid(persona);
@@ -516,9 +600,22 @@ persona_lookup_and_invalidate(uid_t id)
        return persona;
 }
 
+int
+persona_find_by_type(int persona_type, struct persona **persona, size_t *plen)
+{
+       return persona_find_all(NULL, PERSONA_ID_NONE, persona_type, persona, plen);
+}
+
 int
 persona_find(const char *login, uid_t uid,
     struct persona **persona, size_t *plen)
+{
+       return persona_find_all(login, uid, PERSONA_INVALID, persona, plen);
+}
+
+int
+persona_find_all(const char *login, uid_t uid, int persona_type,
+    struct persona **persona, size_t *plen)
 {
        struct persona *tmp;
        int match = 0;
@@ -530,6 +627,11 @@ persona_find(const char *login, uid_t uid,
        if (uid != PERSONA_ID_NONE) {
                match++;
        }
+       if ((persona_type > PERSONA_INVALID) && (persona_type <= PERSONA_TYPE_MAX)) {
+               match++;
+       } else if (persona_type != PERSONA_INVALID) {
+               return EINVAL;
+       }
 
        if (match == 0) {
                return EINVAL;
@@ -548,6 +650,9 @@ persona_find(const char *login, uid_t uid,
                if (uid != PERSONA_ID_NONE && uid == tmp->pna_id) {
                        m++;
                }
+               if (persona_type != PERSONA_INVALID && persona_type == tmp->pna_type) {
+                       m++;
+               }
                if (m == match) {
                        if (persona && *plen > found) {
                                persona[found] = persona_get_locked(tmp);
@@ -593,13 +698,29 @@ persona_proc_get(pid_t pid)
 struct persona *
 current_persona_get(void)
 {
-       proc_t p = current_proc();
-       struct persona *persona;
-
-       proc_lock(p);
-       persona = persona_get(p->p_persona);
-       proc_unlock(p);
+       struct persona *persona = NULL;
+       uid_t current_persona_id = PERSONA_ID_NONE;
+       ipc_voucher_t voucher;
 
+       thread_get_mach_voucher(current_thread(), 0, &voucher);
+       /* returns a voucher ref */
+       if (voucher != IPC_VOUCHER_NULL) {
+               /*
+                * If the voucher doesn't contain a bank attribute, it uses
+                * the default bank task value to determine the persona id
+                * which is the same as the proc's persona id
+                */
+               bank_get_bank_ledger_thread_group_and_persona(voucher, NULL,
+                   NULL, &current_persona_id);
+               ipc_voucher_release(voucher);
+               persona = persona_lookup(current_persona_id);
+       } else {
+               /* Fallback - get the proc's persona */
+               proc_t p = current_proc();
+               proc_lock(p);
+               persona = persona_get(p->p_persona);
+               proc_unlock(p);
+       }
        return persona;
 }
 
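Both paths above return a referenced persona (or NULL), so every caller must balance the lookup with persona_put(). A minimal sketch of the expected pairing; the function name is illustrative:

static void
example_use_current_persona(void)
{
    struct persona *persona = current_persona_get();

    if (persona != NULL) {
        uid_t id = persona_get_id(persona);
        (void)id;              /* ... act on the persona ... */
        persona_put(persona);  /* drop the reference from the lookup */
    }
}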
@@ -852,7 +973,6 @@ persona_proc_adopt(proc_t p, struct persona *persona, kauth_cred_t auth_override
 {
        int error;
        struct persona *old_persona;
-       struct session * sessp;
 
        if (!persona) {
                return EINVAL;
@@ -886,15 +1006,21 @@ persona_proc_adopt(proc_t p, struct persona *persona, kauth_cred_t auth_override
                enterpgrp(p, persona->pna_pgid, persona->pna_pgid == uid);
        }
 
+       /* Only Multiuser Mode needs to update the session login name to the persona name */
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+       volatile uint32_t *multiuser_flag_address = (volatile uint32_t *)(uintptr_t)(_COMM_PAGE_MULTIUSER_CONFIG);
+       uint32_t multiuser_flags = *multiuser_flag_address;
        /* set the login name of the session */
-       sessp = proc_session(p);
-       if (sessp != SESSION_NULL) {
-               session_lock(sessp);
-               bcopy(persona->pna_login, sessp->s_login, MAXLOGNAME);
-               session_unlock(sessp);
-               session_rele(sessp);
+       if (multiuser_flags) {
+               struct session * sessp = proc_session(p);
+               if (sessp != SESSION_NULL) {
+                       session_lock(sessp);
+                       bcopy(persona->pna_login, sessp->s_login, MAXLOGNAME);
+                       session_unlock(sessp);
+                       session_rele(sessp);
+               }
        }
-
+#endif
        persona_unlock(persona);
 
        set_security_token(p);
@@ -1259,8 +1385,6 @@ persona_get_login(struct persona *persona, char login[MAXLOGNAME + 1])
 
 out_unlock:
        persona_unlock(persona);
-       login[MAXLOGNAME] = 0;
-
        return ret;
 }
 
@@ -1270,6 +1394,10 @@ out_unlock:
  * symbol exports for kext compatibility
  */
 
+struct persona *system_persona = NULL;
+struct persona *proxy_system_persona = NULL;
+int unique_persona = 0;
+
 uid_t
 persona_get_id(__unused struct persona *persona)
 {
@@ -1303,6 +1431,20 @@ persona_find(__unused const char *login,
        return ENOTSUP;
 }
 
+int
+persona_find_by_type(__unused int persona_type,
+    __unused struct persona **persona,
+    __unused size_t *plen)
+{
+       return ENOTSUP;
+}
+
+struct persona *
+persona_proc_get(__unused pid_t pid)
+{
+       return NULL;
+}
+
 struct persona *
 current_persona_get(void)
 {
diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c
index 1d76892320971380bf7c67d4e075dc2fa49a3ddb..1efcb56746637f90d5a32204f2bab4d5d471ed4f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <sys/proc_info.h>
 #include <sys/bsdtask_info.h>
 #include <sys/persona.h>
+#include <sys/sysent.h>
+#include <sys/reason.h>
 
 #ifdef CONFIG_32BIT_TELEMETRY
 #include <sys/kasl.h>
@@ -162,6 +164,10 @@ extern struct tty cons;
 
 extern int cs_debug;
 
+#if DEVELOPMENT || DEBUG
+int syscallfilter_disable = 0;
+#endif // DEVELOPMENT || DEBUG
+
 #if DEBUG
 #define __PROC_INTERNAL_DEBUG 1
 #endif
@@ -184,6 +190,7 @@ typedef uint64_t unaligned_u64 __attribute__((aligned(1)));
 
 static void orphanpg(struct pgrp * pg);
 void proc_name_kdp(task_t t, char * buf, int size);
+boolean_t proc_binary_uuid_kdp(task_t task, uuid_t uuid);
 int proc_threadname_kdp(void * uth, char * buf, size_t size);
 void proc_starttime_kdp(void * p, unaligned_u64 *tv_sec, unaligned_u64 *tv_usec, unaligned_u64 *abstime);
 char * proc_name_address(void * p);
@@ -463,13 +470,12 @@ record_procref(proc_t p __unused, int count)
                return;
        }
 
-       if (count == 1) {
-               if (uth->uu_pindex < NUM_PROC_REFS_TO_TRACK) {
-                       backtrace((uintptr_t *) &uth->uu_proc_pcs[uth->uu_pindex], PROC_REF_STACK_DEPTH);
+       if (uth->uu_pindex < NUM_PROC_REFS_TO_TRACK) {
+               backtrace((uintptr_t *) &uth->uu_proc_pcs[uth->uu_pindex],
+                   PROC_REF_STACK_DEPTH, NULL);
 
-                       uth->uu_proc_ps[uth->uu_pindex] = p;
-                       uth->uu_pindex++;
-               }
+               uth->uu_proc_ps[uth->uu_pindex] = p;
+               uth->uu_pindex++;
        }
 #endif
 }
@@ -808,6 +814,15 @@ proc_ppid(proc_t p)
        return -1;
 }
 
+int
+proc_original_ppid(proc_t p)
+{
+       if (p != NULL) {
+               return p->p_original_ppid;
+       }
+       return -1;
+}
+
 int
 proc_selfpid(void)
 {
@@ -826,6 +841,24 @@ proc_selfcsflags(void)
        return current_proc()->p_csflags;
 }
 
+uint32_t
+proc_platform(proc_t p)
+{
+       if (p != NULL) {
+               return p->p_platform;
+       }
+       return (uint32_t)-1;
+}
+
+uint32_t
+proc_sdk(proc_t p)
+{
+       if (p != NULL) {
+               return p->p_sdk;
+       }
+       return (uint32_t)-1;
+}
+
 #if CONFIG_DTRACE
 static proc_t
 dtrace_current_proc_vforking(void)
@@ -923,6 +956,19 @@ proc_name_kdp(task_t t, char * buf, int size)
        }
 }
 
+boolean_t
+proc_binary_uuid_kdp(task_t task, uuid_t uuid)
+{
+       proc_t p = get_bsdtask_info(task);
+       if (p == PROC_NULL) {
+               return FALSE;
+       }
+
+       proc_getexecutableuuid(p, uuid, sizeof(uuid_t));
+
+       return TRUE;
+}
+
 int
 proc_threadname_kdp(void * uth, char * buf, size_t size)
 {
@@ -1191,6 +1237,12 @@ proc_getcdhash(proc_t p, unsigned char *cdhash)
        return vn_getcdhash(p->p_textvp, p->p_textoff, cdhash);
 }
 
+int
+proc_exitstatus(proc_t p)
+{
+       return p->p_xstat & 0xffff;
+}
+
 void
 proc_getexecutableuuid(proc_t p, unsigned char *uuidbuf, unsigned long size)
 {
@@ -1214,6 +1266,49 @@ proc_getexecutablevnode(proc_t p)
        return NULLVP;
 }
 
+int
+proc_selfexecutableargs(uint8_t *buf, size_t *buflen)
+{
+       proc_t p = current_proc();
+
+       // buflen must always be provided
+       if (buflen == NULL) {
+               return EINVAL;
+       }
+
+       // If a buf is provided, there must be at least enough room to fit argc
+       if (buf && *buflen < sizeof(p->p_argc)) {
+               return EINVAL;
+       }
+
+       if (!p->user_stack) {
+               return EINVAL;
+       }
+
+       if (buf == NULL) {
+               *buflen = p->p_argslen + sizeof(p->p_argc);
+               return 0;
+       }
+
+       // Copy in argc to the first 4 bytes
+       memcpy(buf, &p->p_argc, sizeof(p->p_argc));
+
+       if (*buflen > sizeof(p->p_argc) && p->p_argslen > 0) {
+               // See memory layout comment in kern_exec.c:exec_copyout_strings()
+               // We want to copy starting from `p_argslen` bytes away from top of stack
+               return copyin(p->user_stack - p->p_argslen,
+                          buf + sizeof(p->p_argc),
+                          MIN(p->p_argslen, *buflen - sizeof(p->p_argc)));
+       } else {
+               return 0;
+       }
+}
+
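proc_selfexecutableargs() follows the usual two-call sizing convention: a NULL buffer reports the required length, and a second call fills the buffer with argc followed by the packed, nul-separated argument strings. A hedged kernel-side sketch; kalloc()/kfree() stand in for whatever allocator a real caller would use:

static int
copy_own_args(void)
{
    size_t len = 0;
    int error = proc_selfexecutableargs(NULL, &len);  /* query size */
    if (error) {
        return error;
    }

    uint8_t *buf = kalloc(len);
    if (buf == NULL) {
        return ENOMEM;
    }

    error = proc_selfexecutableargs(buf, &len);
    if (!error) {
        uint32_t argc;
        memcpy(&argc, buf, sizeof(argc));  /* first four bytes: argc */
        /* argument strings follow, each nul-terminated */
    }
    kfree(buf, len);
    return error;
}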
+off_t
+proc_getexecutableoffset(proc_t p)
+{
+       return p->p_textoff;
+}
 
 void
 bsd_set_dependency_capable(task_t task)
@@ -1387,6 +1482,7 @@ pinsertchild(proc_t parent, proc_t child)
        TAILQ_INIT(&child->p_evlist);
        child->p_pptr = parent;
        child->p_ppid = parent->p_pid;
+       child->p_original_ppid = parent->p_pid;
        child->p_puniqueid = parent->p_uniqueid;
        child->p_xhighbits = 0;
 
@@ -1765,6 +1861,95 @@ fixjobc(proc_t p, struct pgrp *pgrp, int entering)
        proc_childrenwalk(p, fixjob_callback, &fjarg);
 }
 
+/*
+ * The pidlist_* routines support the functions in this file that
+ * walk lists of processes applying filters and callouts to the
+ * elements of the list.
+ *
+ * A prior implementation used a single linear array, which can be
+ * tricky to allocate on large systems. This implementation creates
+ * an SLIST of modestly sized arrays of PIDS_PER_ENTRY elements.
+ *
+ * The array should be sized large enough to keep the overhead of
+ * walking the list low, but small enough that blocking allocations of
+ * pidlist_entry_t structures always succeed.
+ */
+
+#define PIDS_PER_ENTRY 1021
+
+typedef struct pidlist_entry {
+       SLIST_ENTRY(pidlist_entry) pe_link;
+       u_int pe_nused;
+       pid_t pe_pid[PIDS_PER_ENTRY];
+} pidlist_entry_t;
+
+typedef struct {
+       SLIST_HEAD(, pidlist_entry) pl_head;
+       struct pidlist_entry *pl_active;
+       u_int pl_nalloc;
+} pidlist_t;
+
+static __inline__ pidlist_t *
+pidlist_init(pidlist_t *pl)
+{
+       SLIST_INIT(&pl->pl_head);
+       pl->pl_active = NULL;
+       pl->pl_nalloc = 0;
+       return pl;
+}
+
+static u_int
+pidlist_alloc(pidlist_t *pl, u_int needed)
+{
+       while (pl->pl_nalloc < needed) {
+               pidlist_entry_t *pe = kalloc(sizeof(*pe));
+               if (NULL == pe) {
+                       panic("no space for pidlist entry");
+               }
+               pe->pe_nused = 0;
+               SLIST_INSERT_HEAD(&pl->pl_head, pe, pe_link);
+               pl->pl_nalloc += (sizeof(pe->pe_pid) / sizeof(pe->pe_pid[0]));
+       }
+       return pl->pl_nalloc;
+}
+
+static void
+pidlist_free(pidlist_t *pl)
+{
+       pidlist_entry_t *pe;
+       while (NULL != (pe = SLIST_FIRST(&pl->pl_head))) {
+               SLIST_FIRST(&pl->pl_head) = SLIST_NEXT(pe, pe_link);
+               kfree(pe, sizeof(*pe));
+       }
+       pl->pl_nalloc = 0;
+}
+
+static __inline__ void
+pidlist_set_active(pidlist_t *pl)
+{
+       pl->pl_active = SLIST_FIRST(&pl->pl_head);
+       assert(pl->pl_active);
+}
+
+static void
+pidlist_add_pid(pidlist_t *pl, pid_t pid)
+{
+       pidlist_entry_t *pe = pl->pl_active;
+       if (pe->pe_nused >= sizeof(pe->pe_pid) / sizeof(pe->pe_pid[0])) {
+               if (NULL == (pe = SLIST_NEXT(pe, pe_link))) {
+                       panic("pidlist allocation exhausted");
+               }
+               pl->pl_active = pe;
+       }
+       pe->pe_pid[pe->pe_nused++] = pid;
+}
+
+static __inline__ u_int
+pidlist_nalloc(const pidlist_t *pl)
+{
+       return pl->pl_nalloc;
+}
+
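The comment block above states the intent; the call sequence the iterators below all share is: init on the stack, size the list under the lock, allocate outside it, retry until the list fits, then set_active, add pids, and free. Condensed into one sketch, mirroring proc_iterate():

static void
example_pidlist_walk(void)
{
    pidlist_t pid_list, *pl = pidlist_init(&pid_list);
    u_int needed;

    for (;;) {
        proc_list_lock();
        needed = (u_int)nprocs + 1;
        if (pidlist_nalloc(pl) > needed) {
            break;                      /* big enough; keep the lock */
        }
        proc_list_unlock();
        pidlist_alloc(pl, needed);      /* may block; lock is dropped */
    }
    pidlist_set_active(pl);
    /* ... pidlist_add_pid(pl, proc_pid(p)) for each matching proc ... */
    proc_list_unlock();

    /* ... SLIST_FOREACH over pl->pl_head, proc_find() each pid ... */
    pidlist_free(pl);
}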
 /*
  * A process group has become orphaned; if there are any stopped processes in
  * the group, hang-up all process in that group.
@@ -1772,14 +1957,9 @@ fixjobc(proc_t p, struct pgrp *pgrp, int entering)
 static void
 orphanpg(struct pgrp *pgrp)
 {
-       pid_t *pid_list;
+       pidlist_t pid_list, *pl = pidlist_init(&pid_list);
+       u_int pid_count_available = 0;
        proc_t p;
-       vm_size_t pid_list_size = 0;
-       vm_size_t pid_list_size_needed = 0;
-       int pid_count = 0;
-       int pid_count_available = 0;
-
-       assert(pgrp != NULL);
 
        /* allocate outside of the pgrp_lock */
        for (;;) {
@@ -1790,71 +1970,52 @@ orphanpg(struct pgrp *pgrp)
 
                PGMEMBERS_FOREACH(pgrp, p) {
                        pid_count_available++;
-
                        if (p->p_stat == SSTOP) {
                                should_iterate = TRUE;
                        }
                }
-
                if (pid_count_available == 0 || !should_iterate) {
                        pgrp_unlock(pgrp);
-                       return;
+                       goto out; /* no orphaned processes OR nothing stopped */
                }
-
-               pid_list_size_needed = pid_count_available * sizeof(pid_t);
-               if (pid_list_size >= pid_list_size_needed) {
+               if (pidlist_nalloc(pl) >= pid_count_available) {
                        break;
                }
                pgrp_unlock(pgrp);
 
-               if (pid_list_size != 0) {
-                       kfree(pid_list, pid_list_size);
-               }
-               pid_list = kalloc(pid_list_size_needed);
-               if (!pid_list) {
-                       return;
-               }
-               pid_list_size = pid_list_size_needed;
-       }
-
-       /* no orphaned processes */
-       if (pid_list_size == 0) {
-               pgrp_unlock(pgrp);
-               return;
+               pidlist_alloc(pl, pid_count_available);
        }
+       pidlist_set_active(pl);
 
+       u_int pid_count = 0;
        PGMEMBERS_FOREACH(pgrp, p) {
-               pid_list[pid_count++] = proc_pid(p);
-               if (pid_count >= pid_count_available) {
+               pidlist_add_pid(pl, proc_pid(p));
+               if (++pid_count >= pid_count_available) {
                        break;
                }
        }
        pgrp_unlock(pgrp);
 
-       if (pid_count == 0) {
-               goto out;
-       }
-
-       for (int i = 0; i < pid_count; i++) {
-               /* do not handle kernproc */
-               if (pid_list[i] == 0) {
-                       continue;
-               }
-               p = proc_find(pid_list[i]);
-               if (!p) {
-                       continue;
+       const pidlist_entry_t *pe;
+       SLIST_FOREACH(pe, &(pl->pl_head), pe_link) {
+               for (u_int i = 0; i < pe->pe_nused; i++) {
+                       const pid_t pid = pe->pe_pid[i];
+                       if (0 == pid) {
+                               continue; /* skip kernproc */
+                       }
+                       p = proc_find(pid);
+                       if (!p) {
+                               continue;
+                       }
+                       proc_transwait(p, 0);
+                       pt_setrunnable(p);
+                       psignal(p, SIGHUP);
+                       psignal(p, SIGCONT);
+                       proc_rele(p);
                }
-
-               proc_transwait(p, 0);
-               pt_setrunnable(p);
-               psignal(p, SIGHUP);
-               psignal(p, SIGCONT);
-               proc_rele(p);
        }
-
 out:
-       kfree(pid_list, pid_list_size);
-       return;
+       pidlist_free(pl);
 }
 
 int
@@ -2344,7 +2505,7 @@ out:
        return error;
 }
 
-int
+void
 proc_iterate(
        unsigned int flags,
        proc_iterate_fn_t callout,
@@ -2352,40 +2513,28 @@ proc_iterate(
        proc_iterate_fn_t filterfn,
        void *filterarg)
 {
-       pid_t *pid_list = NULL;
-       vm_size_t pid_list_size = 0;
-       vm_size_t pid_list_size_needed = 0;
-       int pid_count = 0;
-       int pid_count_available = 0;
+       pidlist_t pid_list, *pl = pidlist_init(&pid_list);
+       u_int pid_count_available = 0;
 
        assert(callout != NULL);
 
        /* allocate outside of the proc_list_lock */
        for (;;) {
                proc_list_lock();
-
-               pid_count_available = nprocs + 1 /* kernel_task not counted in nprocs */;
+               pid_count_available = nprocs + 1; /* kernel_task not counted in nprocs */
                assert(pid_count_available > 0);
-
-               pid_list_size_needed = pid_count_available * sizeof(pid_t);
-               if (pid_list_size >= pid_list_size_needed) {
+               if (pidlist_nalloc(pl) > pid_count_available) {
                        break;
                }
                proc_list_unlock();
 
-               if (pid_list_size != 0) {
-                       kfree(pid_list, pid_list_size);
-               }
-               pid_list = kalloc(pid_list_size_needed);
-               if (!pid_list) {
-                       return 1;
-               }
-               pid_list_size = pid_list_size_needed;
+               pidlist_alloc(pl, pid_count_available);
        }
-       assert(pid_list != NULL);
+       pidlist_set_active(pl);
 
-       /* filter pids into pid_list */
+       /* filter pids into the pid_list */
 
+       u_int pid_count = 0;
        if (flags & PROC_ALLPROCLIST) {
                proc_t p;
                ALLPROC_FOREACH(p) {
@@ -2396,9 +2545,8 @@ proc_iterate(
                        if ((filterfn != NULL) && (filterfn(p, filterarg) == 0)) {
                                continue;
                        }
-
-                       pid_list[pid_count++] = proc_pid(p);
-                       if (pid_count >= pid_count_available) {
+                       pidlist_add_pid(pl, proc_pid(p));
+                       if (++pid_count >= pid_count_available) {
                                break;
                        }
                }
@@ -2411,9 +2559,8 @@ proc_iterate(
                        if ((filterfn != NULL) && (filterfn(p, filterarg) == 0)) {
                                continue;
                        }
-
-                       pid_list[pid_count++] = proc_pid(p);
-                       if (pid_count >= pid_count_available) {
+                       pidlist_add_pid(pl, proc_pid(p));
+                       if (++pid_count >= pid_count_available) {
                                break;
                        }
                }
@@ -2423,63 +2570,63 @@ proc_iterate(
 
        /* call callout on processes in the pid_list */
 
-       for (int i = 0; i < pid_count; i++) {
-               proc_t p = proc_find(pid_list[i]);
-               if (p) {
-                       if ((flags & PROC_NOWAITTRANS) == 0) {
-                               proc_transwait(p, 0);
-                       }
-                       int callout_ret = callout(p, arg);
-
-                       switch (callout_ret) {
-                       case PROC_RETURNED_DONE:
-                               proc_rele(p);
-                       /* FALLTHROUGH */
-                       case PROC_CLAIMED_DONE:
-                               goto out;
-
-                       case PROC_RETURNED:
-                               proc_rele(p);
-                       /* FALLTHROUGH */
-                       case PROC_CLAIMED:
-                               break;
-
-                       default:
-                               panic("proc_iterate: callout returned %d for pid %d",
-                                   callout_ret, pid_list[i]);
-                               break;
-                       }
-               } else if (flags & PROC_ZOMBPROCLIST) {
-                       p = proc_find_zombref(pid_list[i]);
-                       if (!p) {
-                               continue;
-                       }
-                       int callout_ret = callout(p, arg);
-
-                       switch (callout_ret) {
-                       case PROC_RETURNED_DONE:
-                               proc_drop_zombref(p);
-                       /* FALLTHROUGH */
-                       case PROC_CLAIMED_DONE:
-                               goto out;
-
-                       case PROC_RETURNED:
-                               proc_drop_zombref(p);
-                       /* FALLTHROUGH */
-                       case PROC_CLAIMED:
-                               break;
-
-                       default:
-                               panic("proc_iterate: callout returned %d for zombie pid %d",
-                                   callout_ret, pid_list[i]);
-                               break;
+       const pidlist_entry_t *pe;
+       SLIST_FOREACH(pe, &(pl->pl_head), pe_link) {
+               for (u_int i = 0; i < pe->pe_nused; i++) {
+                       const pid_t pid = pe->pe_pid[i];
+                       proc_t p = proc_find(pid);
+                       if (p) {
+                               if ((flags & PROC_NOWAITTRANS) == 0) {
+                                       proc_transwait(p, 0);
+                               }
+                               const int callout_ret = callout(p, arg);
+
+                               switch (callout_ret) {
+                               case PROC_RETURNED_DONE:
+                                       proc_rele(p);
+                               /* FALLTHROUGH */
+                               case PROC_CLAIMED_DONE:
+                                       goto out;
+
+                               case PROC_RETURNED:
+                                       proc_rele(p);
+                               /* FALLTHROUGH */
+                               case PROC_CLAIMED:
+                                       break;
+                               default:
+                                       panic("%s: callout =%d for pid %d",
+                                           __func__, callout_ret, pid);
+                                       break;
+                               }
+                       } else if (flags & PROC_ZOMBPROCLIST) {
+                               p = proc_find_zombref(pid);
+                               if (!p) {
+                                       continue;
+                               }
+                               const int callout_ret = callout(p, arg);
+
+                               switch (callout_ret) {
+                               case PROC_RETURNED_DONE:
+                                       proc_drop_zombref(p);
+                               /* FALLTHROUGH */
+                               case PROC_CLAIMED_DONE:
+                                       goto out;
+
+                               case PROC_RETURNED:
+                                       proc_drop_zombref(p);
+                               /* FALLTHROUGH */
+                               case PROC_CLAIMED:
+                                       break;
+                               default:
+                                       panic("%s: callout =%d for zombie %d",
+                                           __func__, callout_ret, pid);
+                                       break;
+                               }
                        }
                }
        }
-
 out:
-       kfree(pid_list, pid_list_size);
-       return 0;
+       pidlist_free(pl);
 }
 
 void
@@ -2520,93 +2667,82 @@ restart_foreach:
        proc_list_unlock();
 }
 
-int
+void
 proc_childrenwalk(
        proc_t parent,
        proc_iterate_fn_t callout,
        void *arg)
 {
-       pid_t *pid_list;
-       vm_size_t pid_list_size = 0;
-       vm_size_t pid_list_size_needed = 0;
-       int pid_count = 0;
-       int pid_count_available = 0;
+       pidlist_t pid_list, *pl = pidlist_init(&pid_list);
+       u_int pid_count_available = 0;
 
        assert(parent != NULL);
        assert(callout != NULL);
 
        for (;;) {
                proc_list_lock();
-
                pid_count_available = parent->p_childrencnt;
                if (pid_count_available == 0) {
                        proc_list_unlock();
-                       return 0;
+                       goto out;
                }
-
-               pid_list_size_needed = pid_count_available * sizeof(pid_t);
-               if (pid_list_size >= pid_list_size_needed) {
+               if (pidlist_nalloc(pl) > pid_count_available) {
                        break;
                }
                proc_list_unlock();
 
-               if (pid_list_size != 0) {
-                       kfree(pid_list, pid_list_size);
-               }
-               pid_list = kalloc(pid_list_size_needed);
-               if (!pid_list) {
-                       return 1;
-               }
-               pid_list_size = pid_list_size_needed;
+               pidlist_alloc(pl, pid_count_available);
        }
+       pidlist_set_active(pl);
 
+       u_int pid_count = 0;
        proc_t p;
        PCHILDREN_FOREACH(parent, p) {
                if (p->p_stat == SIDL) {
                        continue;
                }
-
-               pid_list[pid_count++] = proc_pid(p);
-               if (pid_count >= pid_count_available) {
+               pidlist_add_pid(pl, proc_pid(p));
+               if (++pid_count >= pid_count_available) {
                        break;
                }
        }
 
        proc_list_unlock();
 
-       for (int i = 0; i < pid_count; i++) {
-               p = proc_find(pid_list[i]);
-               if (!p) {
-                       continue;
-               }
-
-               int callout_ret = callout(p, arg);
+       const pidlist_entry_t *pe;
+       SLIST_FOREACH(pe, &(pl->pl_head), pe_link) {
+               for (u_int i = 0; i < pe->pe_nused; i++) {
+                       const pid_t pid = pe->pe_pid[i];
+                       p = proc_find(pid);
+                       if (!p) {
+                               continue;
+                       }
+                       const int callout_ret = callout(p, arg);
 
-               switch (callout_ret) {
-               case PROC_RETURNED_DONE:
-                       proc_rele(p);
-               /* FALLTHROUGH */
-               case PROC_CLAIMED_DONE:
-                       goto out;
+                       switch (callout_ret) {
+                       case PROC_RETURNED_DONE:
+                               proc_rele(p);
+                       /* FALLTHROUGH */
+                       case PROC_CLAIMED_DONE:
+                               goto out;
 
-               case PROC_RETURNED:
-                       proc_rele(p);
-               /* FALLTHROUGH */
-               case PROC_CLAIMED:
-                       break;
-               default:
-                       panic("proc_childrenwalk: callout returned %d for pid %d",
-                           callout_ret, pid_list[i]);
-                       break;
+                       case PROC_RETURNED:
+                               proc_rele(p);
+                       /* FALLTHROUGH */
+                       case PROC_CLAIMED:
+                               break;
+                       default:
+                               panic("%s: callout =%d for pid %d",
+                                   __func__, callout_ret, pid);
+                               break;
+                       }
                }
        }
-
 out:
-       kfree(pid_list, pid_list_size);
-       return 0;
+       pidlist_free(pl);
 }
 
-int
+void
 pgrp_iterate(
        struct pgrp *pgrp,
        unsigned int flags,
@@ -2615,51 +2751,40 @@ pgrp_iterate(
        proc_iterate_fn_t filterfn,
        void * filterarg)
 {
-       pid_t *pid_list;
-       proc_t p;
-       vm_size_t pid_list_size = 0;
-       vm_size_t pid_list_size_needed = 0;
-       int pid_count = 0;
-       int pid_count_available = 0;
-
-       pid_t pgid;
+       pidlist_t pid_list, *pl = pidlist_init(&pid_list);
+       u_int pid_count_available = 0;
 
        assert(pgrp != NULL);
        assert(callout != NULL);
 
        for (;;) {
                pgrp_lock(pgrp);
-
                pid_count_available = pgrp->pg_membercnt;
                if (pid_count_available == 0) {
                        pgrp_unlock(pgrp);
-                       return 0;
+                       if (flags & PGRP_DROPREF) {
+                               pg_rele(pgrp);
+                       }
+                       goto out;
                }
-
-               pid_list_size_needed = pid_count_available * sizeof(pid_t);
-               if (pid_list_size >= pid_list_size_needed) {
+               if (pidlist_nalloc(pl) > pid_count_available) {
                        break;
                }
                pgrp_unlock(pgrp);
 
-               if (pid_list_size != 0) {
-                       kfree(pid_list, pid_list_size);
-               }
-               pid_list = kalloc(pid_list_size_needed);
-               if (!pid_list) {
-                       return 1;
-               }
-               pid_list_size = pid_list_size_needed;
+               pidlist_alloc(pl, pid_count_available);
        }
+       pidlist_set_active(pl);
 
-       pgid = pgrp->pg_id;
-
+       const pid_t pgid = pgrp->pg_id;
+       u_int pid_count = 0;
+       proc_t p;
        PGMEMBERS_FOREACH(pgrp, p) {
                if ((filterfn != NULL) && (filterfn(p, filterarg) == 0)) {
                        continue;
                }
-               pid_list[pid_count++] = proc_pid(p);
-               if (pid_count >= pid_count_available) {
+               pidlist_add_pid(pl, proc_pid(p));
+               if (++pid_count >= pid_count_available) {
                        break;
                }
        }
@@ -2670,44 +2795,44 @@ pgrp_iterate(
                pg_rele(pgrp);
        }
 
-       for (int i = 0; i < pid_count; i++) {
-               /* do not handle kernproc */
-               if (pid_list[i] == 0) {
-                       continue;
-               }
-               p = proc_find(pid_list[i]);
-               if (!p) {
-                       continue;
-               }
-               if (p->p_pgrpid != pgid) {
-                       proc_rele(p);
-                       continue;
-               }
-
-               int callout_ret = callout(p, arg);
-
-               switch (callout_ret) {
-               case PROC_RETURNED:
-                       proc_rele(p);
-               /* FALLTHROUGH */
-               case PROC_CLAIMED:
-                       break;
+       const pidlist_entry_t *pe;
+       SLIST_FOREACH(pe, &(pl->pl_head), pe_link) {
+               for (u_int i = 0; i < pe->pe_nused; i++) {
+                       const pid_t pid = pe->pe_pid[i];
+                       if (0 == pid) {
+                               continue; /* skip kernproc */
+                       }
+                       p = proc_find(pid);
+                       if (!p) {
+                               continue;
+                       }
+                       if (p->p_pgrpid != pgid) {
+                               proc_rele(p);
+                               continue;
+                       }
+                       const int callout_ret = callout(p, arg);
 
-               case PROC_RETURNED_DONE:
-                       proc_rele(p);
-               /* FALLTHROUGH */
-               case PROC_CLAIMED_DONE:
-                       goto out;
+                       switch (callout_ret) {
+                       case PROC_RETURNED:
+                               proc_rele(p);
+                       /* FALLTHROUGH */
+                       case PROC_CLAIMED:
+                               break;
+                       case PROC_RETURNED_DONE:
+                               proc_rele(p);
+                       /* FALLTHROUGH */
+                       case PROC_CLAIMED_DONE:
+                               goto out;
 
-               default:
-                       panic("pgrp_iterate: callout returned %d for pid %d",
-                           callout_ret, pid_list[i]);
+                       default:
+                               panic("%s: callout =%d for pid %d",
+                                   __func__, callout_ret, pid);
+                       }
                }
        }
 
 out:
-       kfree(pid_list, pid_list_size);
-       return 0;
+       pidlist_free(pl);
 }
 
 static void
@@ -3110,7 +3235,7 @@ proc_knote_drain(struct proc *p)
         */
        proc_klist_lock();
        while ((kn = SLIST_FIRST(&p->p_klist))) {
-               kn->kn_ptr.p_proc = PROC_NULL;
+               kn->kn_proc = PROC_NULL;
                KNOTE_DETACH(&p->p_klist, kn);
        }
        proc_klist_unlock();
@@ -3138,6 +3263,20 @@ proc_pgrpid(proc_t p)
        return p->p_pgrpid;
 }
 
+pid_t
+proc_sessionid(proc_t p)
+{
+       pid_t sid = -1;
+       struct session * sessp = proc_session(p);
+
+       if (sessp != SESSION_NULL) {
+               sid = sessp->s_sid;
+               session_rele(sessp);
+       }
+
+       return sid;
+}
+
 pid_t
 proc_selfpgrpid()
 {
@@ -3167,6 +3306,7 @@ int
 proc_dopcontrol(proc_t p)
 {
        int pcontrol;
+       os_reason_t kill_reason;
 
        proc_lock(p);
 
@@ -3191,7 +3331,8 @@ proc_dopcontrol(proc_t p)
                        PROC_SETACTION_STATE(p);
                        proc_unlock(p);
                        printf("low swap: killing pid %d (%s)\n", p->p_pid, p->p_comm);
-                       psignal(p, SIGKILL);
+                       kill_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_LOWSWAP);
+                       psignal_with_reason(p, SIGKILL, kill_reason);
                        break;
 
                default:
@@ -3348,7 +3489,7 @@ proc_pcontrol_null(__unused proc_t p, __unused void *arg)
 
 extern uint64_t vm_compressor_pages_compressed(void);
 
-struct timeval  last_no_space_action = {0, 0};
+struct timeval  last_no_space_action = {.tv_sec = 0, .tv_usec = 0};
 
 #if DEVELOPMENT || DEBUG
 extern boolean_t kill_on_no_paging_space;
@@ -3366,6 +3507,7 @@ no_paging_space_action()
        proc_t          p;
        struct no_paging_space nps;
        struct timeval  now;
+       os_reason_t kill_reason;
 
        /*
         * Throttle how often we come through here.  Once every 5 seconds should be plenty.
@@ -3413,7 +3555,8 @@ no_paging_space_action()
                                last_no_space_action = now;
 
                                printf("low swap: killing largest compressed process with pid %d (%s) and size %llu MB\n", p->p_pid, p->p_comm, (nps.pcs_max_size / MB_SIZE));
-                               psignal(p, SIGKILL);
+                               kill_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_LOWSWAP);
+                               psignal_with_reason(p, SIGKILL, kill_reason);
 
                                proc_rele(p);
 
@@ -3575,6 +3718,36 @@ proc_send_synchronous_EXC_RESOURCE(proc_t p)
        return FALSE;
 }
 
+size_t
+proc_get_syscall_filter_mask_size(int which)
+{
+       if (which == SYSCALL_MASK_UNIX) {
+               return nsysent;
+       }
+
+       return 0;
+}
+
+int
+proc_set_syscall_filter_mask(proc_t p, int which, unsigned char *maskptr, size_t masklen)
+{
+#if DEVELOPMENT || DEBUG
+       if (syscallfilter_disable) {
+               printf("proc_set_syscall_filter_mask: attempt to set policy for pid %d, but disabled by boot-arg\n", proc_pid(p));
+               return KERN_SUCCESS;
+       }
+#endif // DEVELOPMENT || DEBUG
+
+       if (which != SYSCALL_MASK_UNIX ||
+           (maskptr != NULL && masklen != nsysent)) {
+               return EINVAL;
+       }
+
+       p->syscall_filter_mask = maskptr;
+
+       return KERN_SUCCESS;
+}
+
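proc_set_syscall_filter_mask() only stores the mask; the consumer lives elsewhere in the kernel and is not part of this hunk. The masklen == nsysent check implies one byte per sysent slot. A hypothetical dispatcher-side check under that layout (the allow/deny polarity of a byte is an assumption, not taken from this diff):

#include <stddef.h>

int
syscall_allowed(const unsigned char *mask, size_t masklen, unsigned int code)
{
	if (mask == NULL) {
		return 1;       /* no filter installed: everything passes */
	}
	if (code >= masklen) {
		return 0;       /* out-of-range syscall number: deny */
	}
	return mask[code] != 0; /* polarity assumed: nonzero == allowed */
}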
 #ifdef CONFIG_32BIT_TELEMETRY
 void
 proc_log_32bit_telemetry(proc_t p)
index 7840d8a4bf700d4faa067e995d076fbace9e7be7..4a4a662edb986f6f89313bb6ebefcdeadc5620ca 100644 (file)
@@ -501,6 +501,27 @@ getwgroups(__unused proc_t p, __unused struct getwgroups_args *uap, __unused int
        return ENOTSUP;
 }
 
+/*
+ * setsid_internal
+ *
+ * Description:        Core implementation of setsid().
+ */
+int
+setsid_internal(proc_t p)
+{
+       struct pgrp * pg = PGRP_NULL;
+
+       if (p->p_pgrpid == p->p_pid || (pg = pgfind(p->p_pid)) || p->p_lflag & P_LINVFORK) {
+               if (pg != PGRP_NULL) {
+                       pg_rele(pg);
+               }
+               return EPERM;
+       } else {
+               /* enter pgrp works with its own pgrp refcount */
+               (void)enterpgrp(p, p->p_pid, 1);
+               return 0;
+       }
+}
 
 /*
  * setsid
@@ -529,19 +550,11 @@ getwgroups(__unused proc_t p, __unused struct getwgroups_args *uap, __unused int
 int
 setsid(proc_t p, __unused struct setsid_args *uap, int32_t *retval)
 {
-       struct pgrp * pg = PGRP_NULL;
-
-       if (p->p_pgrpid == p->p_pid || (pg = pgfind(p->p_pid)) || p->p_lflag & P_LINVFORK) {
-               if (pg != PGRP_NULL) {
-                       pg_rele(pg);
-               }
-               return EPERM;
-       } else {
-               /* enter pgrp works with its own pgrp refcount */
-               (void)enterpgrp(p, p->p_pid, 1);
+       int rc = setsid_internal(p);
+       if (rc == 0) {
                *retval = p->p_pid;
-               return 0;
        }
+       return rc;
 }
 
 
@@ -1640,30 +1653,34 @@ settid_with_pid(proc_t p, struct settid_with_pid_args *uap, __unused int32_t *re
  *             flag the process as having set privilege since the last exec.
  */
 static int
-setgroups1(proc_t p, u_int gidsetsize, user_addr_t gidset, uid_t gmuid, __unused int32_t *retval)
+setgroups1(proc_t p, u_int ngrp, user_addr_t gidset, uid_t gmuid, __unused int32_t *retval)
 {
-       u_int ngrp;
        gid_t   newgroups[NGROUPS] = { 0 };
        int     error;
-       kauth_cred_t my_cred, my_new_cred;
-       struct uthread *uthread = get_bsdthread_info(current_thread());
 
-       DEBUG_CRED_ENTER("setgroups1 (%d/%d): %d 0x%016x %d\n", p->p_pid, (p->p_pptr ? p->p_pptr->p_pid : 0), gidsetsize, gidset, gmuid);
+       DEBUG_CRED_ENTER("setgroups1 (%d/%d): %d 0x%016x %d\n", p->p_pid,
+           (p->p_pptr ? p->p_pptr->p_pid : 0), ngrp, gidset, gmuid);
 
-       ngrp = gidsetsize;
        if (ngrp > NGROUPS) {
                return EINVAL;
        }
 
-       if (ngrp < 1) {
-               ngrp = 1;
-       } else {
+       if (ngrp >= 1) {
                error = copyin(gidset,
                    (caddr_t)newgroups, ngrp * sizeof(gid_t));
                if (error) {
                        return error;
                }
        }
+       return setgroups_internal(p, ngrp, newgroups, gmuid);
+}
+
+int
+setgroups_internal(proc_t p, u_int ngrp, gid_t *newgroups, uid_t gmuid)
+{
+       struct uthread *uthread = get_bsdthread_info(current_thread());
+       kauth_cred_t my_cred, my_new_cred;
+       int     error;
 
        my_cred = kauth_cred_proc_ref(p);
        if ((error = suser(my_cred, &p->p_acflag))) {
@@ -1671,6 +1688,11 @@ setgroups1(proc_t p, u_int gidsetsize, user_addr_t gidset, uid_t gmuid, __unused
                return error;
        }
 
+       if (ngrp < 1) {
+               ngrp = 1;
+               newgroups[0] = 0;
+       }
+
        if ((uthread->uu_flag & UT_SETUID) != 0) {
 #if DEBUG_CRED
                int my_cred_flags = uthread->uu_ucred->cr_flags;
@@ -1942,6 +1964,18 @@ getlogin(proc_t p, struct getlogin_args *uap, __unused int32_t *retval)
        return copyout((caddr_t)buffer, uap->namebuf, uap->namelen);
 }
 
+void
+setlogin_internal(proc_t p, const char login[static MAXLOGNAME])
+{
+       struct session *sessp = proc_session(p);
+
+       if (sessp != SESSION_NULL) {
+               session_lock(sessp);
+               bcopy(login, sessp->s_login, MAXLOGNAME);
+               session_unlock(sessp);
+               session_rele(sessp);
+       }
+}
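Note the parameter declaration const char login[static MAXLOGNAME]: the C99 `static` in an array declarator is a promise that the caller supplies at least MAXLOGNAME accessible elements, which is exactly what the bcopy() relies on. A standalone illustration (LOGNAME_MAX_DEMO is a stand-in constant):

#include <string.h>

#define LOGNAME_MAX_DEMO 255    /* stand-in for MAXLOGNAME */

/* 'static' here lets the compiler warn when a caller passes a provably
 * shorter array, and documents the minimum the callee may read. */
static void
copy_login(char dst[static LOGNAME_MAX_DEMO],
    const char src[static LOGNAME_MAX_DEMO])
{
	memcpy(dst, src, LOGNAME_MAX_DEMO);
}

int
main(void)
{
	char a[LOGNAME_MAX_DEMO] = "alice";     /* remainder is zero-filled */
	char b[LOGNAME_MAX_DEMO];

	copy_login(b, a);
	return (int)b[0];
}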
 
 /*
  * setlogin
@@ -1965,7 +1999,6 @@ setlogin(proc_t p, struct setlogin_args *uap, __unused int32_t *retval)
        int error;
        size_t dummy = 0;
        char buffer[MAXLOGNAME + 1];
-       struct session * sessp;
 
        if ((error = proc_suser(p))) {
                return error;
@@ -1978,15 +2011,7 @@ setlogin(proc_t p, struct setlogin_args *uap, __unused int32_t *retval)
            (caddr_t) &buffer[0],
            MAXLOGNAME - 1, (size_t *)&dummy);
 
-       sessp = proc_session(p);
-
-       if (sessp != SESSION_NULL) {
-               session_lock(sessp);
-               bcopy(buffer, sessp->s_login, MAXLOGNAME);
-               session_unlock(sessp);
-               session_rele(sessp);
-       }
-
+       setlogin_internal(p, buffer);
 
        if (!error) {
                AUDIT_ARG(text, buffer);
index 839a190b5b61a92f9c84127f89f3b2a1f3acc0b9..7978cff02ed0b314e8c672a2c3e6c56915cf540c 100644 (file)
@@ -133,7 +133,7 @@ int fill_task_rusage(task_t task, rusage_info_current *ri);
 void fill_task_billed_usage(task_t task, rusage_info_current *ri);
 int fill_task_io_rusage(task_t task, rusage_info_current *ri);
 int fill_task_qos_rusage(task_t task, rusage_info_current *ri);
-uint64_t get_task_logical_writes(task_t task);
+uint64_t get_task_logical_writes(task_t task, boolean_t external);
 void fill_task_monotonic_rusage(task_t task, rusage_info_current *ri);
 
 int proc_get_rusage(proc_t p, int flavor, user_addr_t buffer, __unused int is_zombie);
@@ -780,7 +780,11 @@ do_background_socket(struct proc *p, thread_t thread)
 #if SOCKETS
        struct filedesc                     *fdp;
        struct fileproc                     *fp;
-       int                                 i, background;
+       int                                 i = 0;
+       int                                 background = false;
+#if NECP
+       int                                 update_necp = false;
+#endif /* NECP */
 
        proc_fdlock(p);
 
@@ -811,7 +815,9 @@ do_background_socket(struct proc *p, thread_t thread)
                                }
 #if NECP
                                else if (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_NETPOLICY) {
-                                       necp_set_client_as_background(p, fp, background);
+                                       if (necp_set_client_as_background(p, fp, background)) {
+                                               update_necp = true;
+                                       }
                                }
 #endif /* NECP */
                        }
@@ -841,13 +847,21 @@ do_background_socket(struct proc *p, thread_t thread)
                        }
 #if NECP
                        else if (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_NETPOLICY) {
-                               necp_set_client_as_background(p, fp, background);
+                               if (necp_set_client_as_background(p, fp, background)) {
+                                       update_necp = true;
+                               }
                        }
 #endif /* NECP */
                }
        }
 
        proc_fdunlock(p);
+
+#if NECP
+       if (update_necp) {
+               necp_update_all_clients();
+       }
+#endif /* NECP */
 #else
 #pragma unused(p, thread)
 #endif
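The NECP change above follows a common kernel pattern: while proc_fdlock is held, the loop merely records that at least one client changed; the heavyweight necp_update_all_clients() call is made only after proc_fdunlock(). A userland sketch of the same shape, with illustrative stub names:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t fd_lock = PTHREAD_MUTEX_INITIALIZER;

static bool
needs_policy_update(int fd)             /* stand-in predicate */
{
	return fd % 2 == 0;
}

static void
policy_update_all(void)                 /* necp_update_all_clients() analogue */
{
}

void
rescan_descriptors(int nfiles)
{
	bool update_needed = false;

	pthread_mutex_lock(&fd_lock);
	for (int fd = 0; fd < nfiles; fd++) {
		if (needs_policy_update(fd)) {
			update_needed = true;   /* cheap: just take note */
		}
	}
	pthread_mutex_unlock(&fd_lock);

	if (update_needed) {
		policy_update_all();    /* heavy call runs without the lock */
	}
}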
@@ -1480,6 +1494,10 @@ static int
 iopolicysys_vfs_hfs_case_sensitivity(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
 static int
 iopolicysys_vfs_atime_updates(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
+static int
+iopolicysys_vfs_materialize_dataless_files(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
+static int
+iopolicysys_vfs_statfs_no_data_volume(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
 
 /*
  * iopolicysys
@@ -1526,6 +1544,17 @@ iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval)
                        goto out;
                }
                break;
+       case IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES:
+               error = iopolicysys_vfs_materialize_dataless_files(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param);
+               if (error) {
+                       goto out;
+               }
+               break;
+       case IOPOL_TYPE_VFS_STATFS_NO_DATA_VOLUME:
+               error = iopolicysys_vfs_statfs_no_data_volume(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param);
+               if (error) {
+                       goto out;
+               }
+               break;
        default:
                error = EINVAL;
                goto out;
@@ -1823,6 +1852,184 @@ out:
        return error;
 }
 
+static inline int
+get_thread_materialize_policy(struct uthread *ut)
+{
+       if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
+               return IOPOL_MATERIALIZE_DATALESS_FILES_OFF;
+       } else if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
+               return IOPOL_MATERIALIZE_DATALESS_FILES_ON;
+       }
+       /* Default thread behavior is "inherit process behavior". */
+       return IOPOL_MATERIALIZE_DATALESS_FILES_DEFAULT;
+}
+
+static inline void
+set_thread_materialize_policy(struct uthread *ut, int policy)
+{
+       if (policy == IOPOL_MATERIALIZE_DATALESS_FILES_OFF) {
+               ut->uu_flag &= ~UT_NSPACE_FORCEDATALESSFAULTS;
+               ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
+       } else if (policy == IOPOL_MATERIALIZE_DATALESS_FILES_ON) {
+               ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
+               ut->uu_flag |= UT_NSPACE_FORCEDATALESSFAULTS;
+       } else {
+               ut->uu_flag &= ~(UT_NSPACE_NODATALESSFAULTS | UT_NSPACE_FORCEDATALESSFAULTS);
+       }
+}
+
+static inline void
+set_proc_materialize_policy(struct proc *p, int policy)
+{
+       if (policy == IOPOL_MATERIALIZE_DATALESS_FILES_DEFAULT) {
+               /*
+                * Caller has specified "use the default policy".
+                * The default policy is to NOT materialize dataless
+                * files.
+                */
+               policy = IOPOL_MATERIALIZE_DATALESS_FILES_OFF;
+       }
+       if (policy == IOPOL_MATERIALIZE_DATALESS_FILES_ON) {
+               OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
+       } else {
+               OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
+       }
+}
+
+static int
+get_proc_materialize_policy(struct proc *p)
+{
+       return (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) ? IOPOL_MATERIALIZE_DATALESS_FILES_ON : IOPOL_MATERIALIZE_DATALESS_FILES_OFF;
+}
+
+static int
+iopolicysys_vfs_materialize_dataless_files(struct proc *p __unused, int cmd, int scope, int policy, struct _iopol_param_t *iop_param)
+{
+       int                     error = 0;
+       thread_t                thread;
+
+       /* Validate scope */
+       switch (scope) {
+       case IOPOL_SCOPE_THREAD:
+               thread = current_thread();
+               break;
+       case IOPOL_SCOPE_PROCESS:
+               thread = THREAD_NULL;
+               break;
+       default:
+               error = EINVAL;
+               goto out;
+       }
+
+       /* Validate policy */
+       if (cmd == IOPOL_CMD_SET) {
+               switch (policy) {
+               case IOPOL_MATERIALIZE_DATALESS_FILES_DEFAULT:
+               case IOPOL_MATERIALIZE_DATALESS_FILES_OFF:
+               case IOPOL_MATERIALIZE_DATALESS_FILES_ON:
+                       break;
+               default:
+                       error = EINVAL;
+                       goto out;
+               }
+       }
+
+       /* Perform command */
+       switch (cmd) {
+       case IOPOL_CMD_SET:
+               if (thread != THREAD_NULL) {
+                       set_thread_materialize_policy(get_bsdthread_info(thread), policy);
+               } else {
+                       set_proc_materialize_policy(p, policy);
+               }
+               break;
+       case IOPOL_CMD_GET:
+               if (thread != THREAD_NULL) {
+                       policy = get_thread_materialize_policy(get_bsdthread_info(thread));
+               } else {
+                       policy = get_proc_materialize_policy(p);
+               }
+               iop_param->iop_policy = policy;
+               break;
+       default:
+               error = EINVAL;         /* unknown command */
+               break;
+       }
+
+out:
+       return error;
+}
+
+static int
+iopolicysys_vfs_statfs_no_data_volume(struct proc *p __unused, int cmd,
+    int scope, int policy, struct _iopol_param_t *iop_param)
+{
+       int error = 0;
+
+       /* Validate scope */
+       switch (scope) {
+       case IOPOL_SCOPE_PROCESS:
+               /* Only process OK */
+               break;
+       default:
+               error = EINVAL;
+               goto out;
+       }
+
+       /* Validate policy */
+       if (cmd == IOPOL_CMD_SET) {
+               switch (policy) {
+               case IOPOL_VFS_STATFS_NO_DATA_VOLUME_DEFAULT:
+               /* fall-through */
+               case IOPOL_VFS_STATFS_FORCE_NO_DATA_VOLUME:
+                       /* These policies are OK */
+                       break;
+               default:
+                       error = EINVAL;
+                       goto out;
+               }
+       }
+
+       /* Perform command */
+       switch (cmd) {
+       case IOPOL_CMD_SET:
+               if (0 == kauth_cred_issuser(kauth_cred_get())) {
+                       /* If it's a non-root process, it needs to have the entitlement to set the policy */
+                       boolean_t entitled = FALSE;
+                       entitled = IOTaskHasEntitlement(current_task(), "com.apple.private.iopol.case_sensitivity");
+                       if (!entitled) {
+                               error = EPERM;
+                               goto out;
+                       }
+               }
+
+               switch (policy) {
+               case IOPOL_VFS_STATFS_NO_DATA_VOLUME_DEFAULT:
+                       OSBitAndAtomic16(~((uint32_t)P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME), &p->p_vfs_iopolicy);
+                       break;
+               case IOPOL_VFS_STATFS_FORCE_NO_DATA_VOLUME:
+                       OSBitOrAtomic16((uint32_t)P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME, &p->p_vfs_iopolicy);
+                       break;
+               default:
+                       error = EINVAL;
+                       goto out;
+               }
+
+               break;
+       case IOPOL_CMD_GET:
+               iop_param->iop_policy = (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)
+                   ? IOPOL_VFS_STATFS_FORCE_NO_DATA_VOLUME
+                   : IOPOL_VFS_STATFS_NO_DATA_VOLUME_DEFAULT;
+               break;
+       default:
+               error = EINVAL;         /* unknown command */
+               break;
+       }
+
+out:
+       return error;
+}
+
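Both new policy types are reachable from userspace through the existing setiopolicy_np(3)/getiopolicy_np(3) wrappers. A sketch of a process opting in to dataless-file materialization, assuming the IOPOL_* constants exported with this change:

#include <sys/resource.h>
#include <stdio.h>

int
main(void)
{
	if (setiopolicy_np(IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES,
	    IOPOL_SCOPE_PROCESS, IOPOL_MATERIALIZE_DATALESS_FILES_ON) != 0) {
		perror("setiopolicy_np");
		return 1;
	}
	printf("materialize policy now: %d\n",
	    getiopolicy_np(IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES,
	    IOPOL_SCOPE_PROCESS));
	return 0;
}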
 /* BSD call back function for task_policy networking changes */
 void
 proc_apply_task_networkbg(void * bsd_info, thread_t thread)
@@ -1850,7 +2057,7 @@ gather_rusage_info(proc_t p, rusage_info_current *ru, int flavor)
        memset(ru, 0, sizeof(*ru));
        switch (flavor) {
        case RUSAGE_INFO_V4:
-               ru->ri_logical_writes = get_task_logical_writes(p->task);
+               ru->ri_logical_writes = get_task_logical_writes(p->task, FALSE);
                ru->ri_lifetime_max_phys_footprint = get_task_phys_footprint_lifetime_max(p->task);
 #if CONFIG_LEDGER_INTERVAL_MAX
                ru->ri_interval_max_phys_footprint = get_task_phys_footprint_interval_max(p->task, FALSE);
index c9b334c8daeb00b6b650ca624406fd824c62d072..1e0027f953f3c98458328c2988450766c2da7950 100644 (file)
@@ -145,6 +145,7 @@ get_system_inshutdown()
        return system_inshutdown;
 }
 
+__abortlike
 static void
 panic_kernel(int howto, char *message)
 {
@@ -154,6 +155,11 @@ panic_kernel(int howto, char *message)
        panic("userspace panic: %s", message);
 }
 
+extern boolean_t compressor_store_stop_compaction;
+extern lck_mtx_t vm_swap_data_lock;
+extern int vm_swapfile_create_thread_running;
+extern int vm_swapfile_gc_thread_running;
+
 int
 reboot_kernel(int howto, char *message)
 {
@@ -170,6 +176,25 @@ reboot_kernel(int howto, char *message)
                }
                return EBUSY;
        }
+
+       lck_mtx_lock(&vm_swap_data_lock);
+
+       /* Turn OFF future swapfile reclamation / compaction etc. */
+       compressor_store_stop_compaction = TRUE;
+
+       /* wait for any current swapfile work to end */
+       while (vm_swapfile_create_thread_running || vm_swapfile_gc_thread_running) {
+               assert_wait((event_t)&compressor_store_stop_compaction, THREAD_UNINT);
+
+               lck_mtx_unlock(&vm_swap_data_lock);
+
+               thread_block(THREAD_CONTINUE_NULL);
+
+               lck_mtx_lock(&vm_swap_data_lock);
+       }
+
+       lck_mtx_unlock(&vm_swap_data_lock);
+
        /*
         * Notify the power management root domain that the system will shut down.
         */
@@ -263,9 +288,6 @@ force_reboot:
                panic_kernel(howto, message);
        }
 
-       if (howto & RB_POWERDOWN) {
-               hostboot_option = HOST_REBOOT_HALT;
-       }
        if (howto & RB_HALT) {
                hostboot_option = HOST_REBOOT_HALT;
        }
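The new block in reboot_kernel() is the Mach event-wait idiom: set a flag under the mutex, then assert_wait() + unlock + thread_block() until the worker threads drain, re-taking the lock on each wakeup. In POSIX terms it is the standard condition-variable loop; an illustrative equivalent with stand-in names, where the workers (not shown) would decrement the counter and broadcast on exit:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t swap_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  swap_idle = PTHREAD_COND_INITIALIZER;
static bool stop_compaction;
static int  swap_workers_running;

void
quiesce_swap_workers(void)
{
	pthread_mutex_lock(&swap_lock);
	stop_compaction = true;                 /* forbid new work */
	while (swap_workers_running > 0) {      /* drain in-flight work */
		/* cond_wait = assert_wait + unlock + thread_block + relock */
		pthread_cond_wait(&swap_idle, &swap_lock);
	}
	pthread_mutex_unlock(&swap_lock);
}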
index 254f60066d2955ee3f157da60bb22aa7d4744aff..dc26af6f31ee49ba12ddffd642d3c5dd1b40b195 100644 (file)
  * +++
  */
 extern int thread_enable_fpe(thread_t act, int onoff);
-extern thread_t port_name_to_thread(mach_port_name_t port_name);
 extern kern_return_t get_signalact(task_t, thread_t *, int);
 extern unsigned int get_useraddr(void);
 extern boolean_t task_did_exec(task_t task);
@@ -154,11 +153,11 @@ kern_return_t semaphore_timedwait_trap_internal(mach_port_name_t, unsigned int,
 kern_return_t semaphore_wait_signal_trap_internal(mach_port_name_t, mach_port_name_t, void (*)(kern_return_t));
 kern_return_t semaphore_wait_trap_internal(mach_port_name_t, void (*)(kern_return_t));
 
-static int      filt_sigattach(struct knote *kn, struct kevent_internal_s *kev);
+static int      filt_sigattach(struct knote *kn, struct kevent_qos_s *kev);
 static void     filt_sigdetach(struct knote *kn);
 static int      filt_signal(struct knote *kn, long hint);
-static int      filt_signaltouch(struct knote *kn, struct kevent_internal_s *kev);
-static int      filt_signalprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int      filt_signaltouch(struct knote *kn, struct kevent_qos_s *kev);
+static int      filt_signalprocess(struct knote *kn, struct kevent_qos_s *kev);
 
 SECURITY_READ_ONLY_EARLY(struct filterops) sig_filtops = {
        .f_attach = filt_sigattach,
@@ -996,7 +995,8 @@ __pthread_markcancel(__unused proc_t p,
        int error = 0;
        struct uthread *uth;
 
-       target_act = (thread_act_t)port_name_to_thread(uap->thread_port);
+       target_act = (thread_act_t)port_name_to_thread(uap->thread_port,
+           PORT_TO_THREAD_IN_CURRENT_TASK);
 
        if (target_act == THR_ACT_NULL) {
                return ESRCH;
@@ -1264,7 +1264,8 @@ __pthread_kill(__unused proc_t p, struct __pthread_kill_args *uap,
        int signum = uap->sig;
        struct uthread *uth;
 
-       target_act = (thread_t)port_name_to_thread(uap->thread_port);
+       target_act = (thread_t)port_name_to_thread(uap->thread_port,
+           PORT_TO_THREAD_NONE);
 
        if (target_act == THREAD_NULL) {
                return ESRCH;
@@ -1281,6 +1282,11 @@ __pthread_kill(__unused proc_t p, struct __pthread_kill_args *uap,
                goto out;
        }
 
+       if ((thread_get_tag(target_act) & THREAD_TAG_WORKQUEUE) && !uth->uu_workq_pthread_kill_allowed) {
+               error = ENOTSUP;
+               goto out;
+       }
+
        if (signum) {
                psignal_uthread(target_act, signum);
        }
@@ -2048,6 +2054,7 @@ get_signalthread(proc_t p, int signum, thread_t * thr)
        thread_t sig_thread;
        struct task * sig_task = p->task;
        kern_return_t kret;
+       bool skip_wqthreads = true;
 
        *thr = THREAD_NULL;
 
@@ -2062,15 +2069,25 @@ get_signalthread(proc_t p, int signum, thread_t * thr)
                }
        }
 
+again:
        TAILQ_FOREACH(uth, &p->p_uthlist, uu_list) {
                if (((uth->uu_flag & UT_NO_SIGMASK) == 0) &&
                    (((uth->uu_sigmask & mask) == 0) || (uth->uu_sigwait & mask))) {
-                       if (check_actforsig(p->task, uth->uu_context.vc_thread, 1) == KERN_SUCCESS) {
-                               *thr = uth->uu_context.vc_thread;
+                       thread_t th = uth->uu_context.vc_thread;
+                       if (skip_wqthreads && (thread_get_tag(th) & THREAD_TAG_WORKQUEUE)) {
+                               /* Workqueue threads may be parked in the kernel unable to
+                                * deliver signals for an extended period of time, so skip them
+                                * in favor of pthreads in a first pass. (rdar://50054475). */
+                       } else if (check_actforsig(p->task, th, 1) == KERN_SUCCESS) {
+                               *thr = th;
                                return KERN_SUCCESS;
                        }
                }
        }
+       if (skip_wqthreads) {
+               skip_wqthreads = false;
+               goto again;
+       }
        if (get_signalact(p->task, thr, 1) == KERN_SUCCESS) {
                return KERN_SUCCESS;
        }
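get_signalthread() now makes two passes: the first skips workqueue threads (which may be parked in the kernel and slow to deliver a signal), the second accepts any eligible thread. The selection pattern in isolation, with illustrative predicates:

static int
pick(const int *cand, int n, int (*preferred)(int), int (*eligible)(int))
{
	for (int pass = 0; pass < 2; pass++) {
		for (int i = 0; i < n; i++) {
			if (pass == 0 && !preferred(cand[i])) {
				continue;   /* pass 1: skip wq-like candidates */
			}
			if (eligible(cand[i])) {
				return cand[i];
			}
		}
	}
	return -1;      /* caller falls back, like get_signalact() above */
}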
@@ -2689,6 +2706,12 @@ psignal_with_reason(proc_t p, int signum, struct os_reason *signal_reason)
        psignal_internal(p, NULL, NULL, 0, signum, signal_reason);
 }
 
+void
+psignal_sigkill_with_reason(proc_t p, struct os_reason *signal_reason)
+{
+       psignal_internal(p, NULL, NULL, 0, SIGKILL, signal_reason);
+}
+
 void
 psignal_locked(proc_t p, int signum)
 {
@@ -3269,6 +3292,7 @@ postsig_locked(int signum)
                if ((ps->ps_signodefer & mask) == 0) {
                        ut->uu_sigmask |= mask;
                }
+               sigset_t siginfo = ps->ps_siginfo;
                if ((signum != SIGILL) && (signum != SIGTRAP) && (ps->ps_sigreset & mask)) {
                        if ((signum != SIGCONT) && (sigprop[signum] & SA_IGNORE)) {
                                p->p_sigignore |= mask;
@@ -3285,7 +3309,7 @@ postsig_locked(int signum)
                        ps->ps_code = 0;
                }
                OSIncrementAtomicLong(&p->p_stats->p_ru.ru_nsignals);
-               sendsig(p, catcher, signum, returnmask, code);
+               sendsig(p, catcher, signum, returnmask, code, siginfo);
        }
        proc_signalend(p, 1);
 }
@@ -3299,13 +3323,15 @@ postsig_locked(int signum)
  */
 
 static int
-filt_sigattach(struct knote *kn, __unused struct kevent_internal_s *kev)
+filt_sigattach(struct knote *kn, __unused struct kevent_qos_s *kev)
 {
        proc_t p = current_proc();  /* can attach only to oneself */
 
        proc_klist_lock();
 
-       kn->kn_ptr.p_proc = p;
+       kn->kn_proc = p;
+       kn->kn_flags |= EV_CLEAR; /* automatically set */
+       kn->kn_sdata = 0;         /* incoming data is ignored */
 
        KNOTE_ATTACH(&p->p_klist, kn);
 
@@ -3323,10 +3349,10 @@ filt_sigattach(struct knote *kn, __unused struct kevent_internal_s *kev)
 static void
 filt_sigdetach(struct knote *kn)
 {
-       proc_t p = kn->kn_ptr.p_proc;
+       proc_t p = kn->kn_proc;
 
        proc_klist_lock();
-       kn->kn_ptr.p_proc = NULL;
+       kn->kn_proc = NULL;
        KNOTE_DETACH(&p->p_klist, kn);
        proc_klist_unlock();
 }
@@ -3347,19 +3373,17 @@ filt_signal(struct knote *kn, long hint)
                hint &= ~NOTE_SIGNAL;
 
                if (kn->kn_id == (unsigned int)hint) {
-                       kn->kn_data++;
+                       kn->kn_hook32++;
                }
        } else if (hint & NOTE_EXIT) {
                panic("filt_signal: detected NOTE_EXIT event");
        }
 
-       return kn->kn_data != 0;
+       return kn->kn_hook32 != 0;
 }
 
 static int
-filt_signaltouch(
-       struct knote *kn,
-       struct kevent_internal_s *kev)
+filt_signaltouch(struct knote *kn, struct kevent_qos_s *kev)
 {
 #pragma unused(kev)
 
@@ -3370,7 +3394,7 @@ filt_signaltouch(
        /*
         * No data to save - just capture if it is already fired
         */
-       res = (kn->kn_data > 0);
+       res = (kn->kn_hook32 > 0);
 
        proc_klist_unlock();
 
@@ -3378,29 +3402,22 @@ filt_signaltouch(
 }
 
 static int
-filt_signalprocess(
-       struct knote *kn,
-       __unused struct filt_process_s *data,
-       struct kevent_internal_s *kev)
+filt_signalprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-       proc_klist_lock();
-
-       if (kn->kn_data == 0) {
-               proc_klist_unlock();
-               return 0;
-       }
+       int res = 0;
 
        /*
         * Snapshot the event data.
-        * All signal events are EV_CLEAR, so
-        * add that and clear out the data field.
         */
-       *kev = kn->kn_kevent;
-       kev->flags |= EV_CLEAR;
-       kn->kn_data = 0;
 
+       proc_klist_lock();
+       if (kn->kn_hook32) {
+               knote_fill_kevent(kn, kev, kn->kn_hook32);
+               kn->kn_hook32 = 0;
+               res = 1;
+       }
        proc_klist_unlock();
-       return 1;
+       return res;
 }
 
 void
@@ -3409,7 +3426,6 @@ bsd_ast(thread_t thread)
        proc_t p = current_proc();
        struct uthread *ut = get_bsdthread_info(thread);
        int     signum;
-       user_addr_t pc;
        static int bsd_init_done = 0;
 
        if (p == NULL) {
@@ -3421,12 +3437,6 @@ bsd_ast(thread_t thread)
                return;
        }
 
-       if ((p->p_flag & P_OWEUPC) && (p->p_flag & P_PROFIL)) {
-               pc = get_useraddr();
-               addupc_task(p, pc, 1);
-               OSBitAndAtomic(~((uint32_t)P_OWEUPC), &p->p_flag);
-       }
-
        if (timerisset(&p->p_vtimer_user.it_value)) {
                uint32_t        microsecs;
 
index 62c5990723eec0df8ba6c1c49478e6a3833158d2..9988a3a3d671dd34f37638282ae2281434012749 100644 (file)
@@ -66,6 +66,8 @@
  *     @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
  */
 
+#include <machine/atomic.h>
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc_internal.h>
@@ -601,7 +603,7 @@ uio_create( int a_iovcount,                     /* number of iovecs */
                /* leave a note that we allocated this uio_t */
                my_uio->uio_flags |= UIO_FLAGS_WE_ALLOCED;
 #if DEBUG
-               (void)hw_atomic_add(&uio_t_count, 1);
+               os_atomic_inc(&uio_t_count, relaxed);
 #endif
        }
 
@@ -826,7 +828,7 @@ uio_free( uio_t a_uio )
 
        if (a_uio != NULL && (a_uio->uio_flags & UIO_FLAGS_WE_ALLOCED) != 0) {
 #if DEBUG
-               if (hw_atomic_sub(&uio_t_count, 1) == UINT_MAX) {
+               if (os_atomic_dec_orig(&uio_t_count, relaxed) == 0) {
                        panic("%s :%d - uio_t_count underflow\n", __FILE__, __LINE__);
                }
 #endif
@@ -843,12 +845,20 @@ uio_free( uio_t a_uio )
 int
 uio_addiov( uio_t a_uio, user_addr_t a_baseaddr, user_size_t a_length )
 {
-       int                     i;
+       int i;
+       user_size_t resid;
 
        if (a_uio == NULL) {
 #if DEBUG
                panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__);
-#endif /* LP64_DEBUG */
+#endif
+               return -1;
+       }
+
+       if (os_add_overflow(a_length, a_uio->uio_resid_64, &resid)) {
+#if DEBUG
+               panic("%s :%d - invalid length %lu\n", __FILE__, __LINE__, (unsigned long)a_length);
+#endif
                return -1;
        }
 
@@ -858,7 +868,7 @@ uio_addiov( uio_t a_uio, user_addr_t a_baseaddr, user_size_t a_length )
                                a_uio->uio_iovs.uiovp[i].iov_len = a_length;
                                a_uio->uio_iovs.uiovp[i].iov_base = a_baseaddr;
                                a_uio->uio_iovcnt++;
-                               a_uio->uio_resid_64 += a_length;
+                               a_uio->uio_resid_64 = resid;
                                return 0;
                        }
                }
@@ -868,7 +878,7 @@ uio_addiov( uio_t a_uio, user_addr_t a_baseaddr, user_size_t a_length )
                                a_uio->uio_iovs.kiovp[i].iov_len = (u_int64_t)a_length;
                                a_uio->uio_iovs.kiovp[i].iov_base = (u_int64_t)a_baseaddr;
                                a_uio->uio_iovcnt++;
-                               a_uio->uio_resid_64 += a_length;
+                               a_uio->uio_resid_64 = resid;
                                return 0;
                        }
                }
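uio_addiov() now computes the new residual with os_add_overflow() before committing it, so a wrapping length is rejected with no state changed. The same guard written with the underlying compiler builtin:

#include <stdbool.h>
#include <stdint.h>

bool
grow_resid(uint64_t *resid, uint64_t len)
{
	uint64_t sum;

	if (__builtin_add_overflow(len, *resid, &sum)) {
		return false;   /* would wrap: refuse, leave *resid intact */
	}
	*resid = sum;           /* commit only the checked sum */
	return true;
}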
@@ -1161,7 +1171,7 @@ uio_duplicate( uio_t a_uio )
 
        my_uio->uio_flags = UIO_FLAGS_WE_ALLOCED | UIO_FLAGS_INITED;
 #if DEBUG
-       (void)hw_atomic_add(&uio_t_count, 1);
+       os_atomic_inc(&uio_t_count, relaxed);
 #endif
 
 
index 2ec92f29a12fa1ed779341acf2c7ee92779d5432..94b6a8975cd2c008d4352d39e32609e631a89226 100644 (file)
@@ -62,8 +62,9 @@
 #include <pexpert/pexpert.h>
 #include <IOKit/IOPolledInterface.h>
 
-#define HIBERNATE_MIN_PHYSICAL_LBA    (34)
-#define HIBERNATE_MIN_FILE_SIZE       (1024*1024)
+#define HIBERNATE_MIN_PHYSICAL_LBA_512    (34)
+#define HIBERNATE_MIN_PHYSICAL_LBA_4096   (6)
+#define HIBERNATE_MIN_FILE_SIZE           (1024*1024)
 
 /* This function is called from kern_sysctl in the current process context;
  * it is exported with the System6.0.exports, but this appears to be a legacy
@@ -379,7 +380,11 @@ kern_open_file_for_direct_io(const char * name,
                goto out;
        }
 
-       minoffset = HIBERNATE_MIN_PHYSICAL_LBA * ref->blksize;
+       if (ref->blksize == 4096) {
+               minoffset = HIBERNATE_MIN_PHYSICAL_LBA_4096 * ref->blksize;
+       } else {
+               minoffset = HIBERNATE_MIN_PHYSICAL_LBA_512 * ref->blksize;
+       }
 
        if (ref->vp->v_type != VREG) {
                error = do_ioctl(p1, p2, DKIOCGETBLOCKCOUNT, (caddr_t) &fileblk);
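The two constants serve the same purpose at different sector sizes: 34 is the first usable LBA on a 512-byte-sector GPT disk (protective MBR, header, and 32 sectors of partition entries), while 6 is the first usable LBA at 4096 bytes per sector (34 * 512 = 17408 bytes vs 6 * 4096 = 24576 bytes). A sketch of the selection above:

#include <stdint.h>

#define HIBERNATE_MIN_PHYSICAL_LBA_512   34
#define HIBERNATE_MIN_PHYSICAL_LBA_4096  6

uint64_t
hibernate_min_offset(uint64_t blksize)
{
	/* 34 * 512 = 17408 bytes, 6 * 4096 = 24576 bytes */
	return (blksize == 4096 ? HIBERNATE_MIN_PHYSICAL_LBA_4096
	    : HIBERNATE_MIN_PHYSICAL_LBA_512) * blksize;
}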
index 566ab26060bd56ee1c3b5e7059d192ba5788d5b2..ff66292387a163cfa2db31e3ca822ed7f3438624 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 #include <sys/mount_internal.h>
 #include <sys/kdebug.h>
+#include <sys/kern_sysctl.h>
 
 #include <IOKit/IOPlatformExpert.h>
 #include <pexpert/pexpert.h>
@@ -196,6 +197,10 @@ extern uint32_t vm_page_creation_throttled_hard;
 extern uint32_t vm_page_creation_throttled_soft;
 #endif /* DEVELOPMENT || DEBUG */
 
+#if CONFIG_LOCKERBOOT
+extern const char kernel_protoboot_mount[];
+#endif
+
 /*
  * Conditionally allow dtrace to see these functions for debugging purposes.
  */
@@ -232,9 +237,6 @@ extern int
 netboot_root(void);
 #endif
 int
-pcsamples_ops(int *name, u_int namelen, user_addr_t where, size_t *sizep,
-    proc_t p);
-int
 sysctl_procargs(int *name, u_int namelen, user_addr_t where,
     size_t *sizep, proc_t cur_proc);
 STATIC int
@@ -402,14 +404,19 @@ sysctl_handle_kern_threadname(  __unused struct sysctl_oid *oidp, __unused void
                        return ENAMETOOLONG;
                }
                if (!ut->pth_name) {
-                       ut->pth_name = (char*)kalloc( MAXTHREADNAMESIZE );
-                       if (!ut->pth_name) {
+                       char *tmp_pth_name = (char *)kalloc(MAXTHREADNAMESIZE);
+                       if (!tmp_pth_name) {
                                return ENOMEM;
                        }
+                       bzero(tmp_pth_name, MAXTHREADNAMESIZE);
+                       if (!OSCompareAndSwapPtr(NULL, tmp_pth_name, &ut->pth_name)) {
+                               kfree(tmp_pth_name, MAXTHREADNAMESIZE);
+                               return EBUSY;
+                       }
                } else {
                        kernel_debug_string_simple(TRACE_STRING_THREADNAME_PREV, ut->pth_name);
+                       bzero(ut->pth_name, MAXTHREADNAMESIZE);
                }
-               bzero(ut->pth_name, MAXTHREADNAMESIZE);
                error = copyin(newp, ut->pth_name, newlen);
                if (error) {
                        return error;
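The thread-name fix allocates first and publishes with a compare-and-swap, so two racing sysctl writers cannot both install a buffer; the loser frees its copy (the kernel variant above returns EBUSY rather than adopting the winner's buffer). The idiom with C11 atomics:

#include <stdatomic.h>
#include <stdlib.h>

char *
get_name_buffer(_Atomic(char *)*slot, size_t size)
{
	char *cur = atomic_load(slot);
	if (cur != NULL) {
		return cur;             /* already published */
	}
	char *fresh = calloc(1, size);
	if (fresh == NULL) {
		return NULL;
	}
	char *expected = NULL;
	if (!atomic_compare_exchange_strong(slot, &expected, fresh)) {
		free(fresh);            /* lost the race */
		return expected;        /* (the kernel returns EBUSY here) */
	}
	return fresh;
}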
@@ -502,7 +509,7 @@ extern int get_kernel_symfile(proc_t, char **);
 #if COUNT_SYSCALLS
 #define KERN_COUNT_SYSCALLS (KERN_OSTYPE + 1000)
 
-extern unsigned int     nsysent;
+extern const unsigned int     nsysent;
 extern int syscalls_log[];
 extern const char *syscallnames[];
 
@@ -790,7 +797,6 @@ sysctl_prochandle SYSCTL_HANDLER_ARGS
        int uidcheck = 0;
        int ruidcheck = 0;
        int ttycheck = 0;
-       int success = 0;
 
        if (namelen != 1 && !(namelen == 0 && cmd == KERN_PROC_ALL)) {
                return EINVAL;
@@ -848,18 +854,9 @@ sysctl_prochandle SYSCTL_HANDLER_ARGS
                args.uidval = name[0];
        }
 
-       success = proc_iterate((PROC_ALLPROCLIST | PROC_ZOMBPROCLIST),
+       proc_iterate((PROC_ALLPROCLIST | PROC_ZOMBPROCLIST),
            sysdoproc_callback, &args, filterfn, name);
 
-       /*
-        * rdar://problem/28433391: if we can't iterate over the processes,
-        * make sure to return an error.
-        */
-
-       if (success != 0) {
-               return ENOMEM;
-       }
-
        if (error) {
                return error;
        }
@@ -1732,6 +1729,33 @@ SYSCTL_STRING(_kern, OID_AUTO, osbuildconfig,
     CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_MASKED,
     &osbuild_config[0], 0, "");
 
+STATIC int
+sysctl_protoboot(__unused struct sysctl_oid *oidp,
+    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int error = -1;
+#if CONFIG_LOCKERBOOT
+       char protoboot_buff[24];
+       size_t protoboot_len = sizeof(protoboot_buff);
+
+       if (vnode_tag(rootvnode) == VT_LOCKERFS) {
+               strlcpy(protoboot_buff, kernel_protoboot_mount, protoboot_len);
+               error = sysctl_io_string(req, protoboot_buff, protoboot_len, 0, NULL);
+       } else {
+               error = EFTYPE;
+       }
+
+#else
+#pragma unused(req)
+       error = ENOTSUP;
+#endif
+
+       return error;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, protoboot,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
+    0, 0, sysctl_protoboot, "A", "");
 
 #if DEBUG
 #ifndef DKPR
@@ -1827,6 +1851,28 @@ SYSCTL_PROC(_kern, OID_AUTO, osproductversion,
     osproductversion, sizeof(osproductversion),
     sysctl_osproductversion, "A", "The ProductVersion from SystemVersion.plist");
 
+static uint64_t iossupportversion_string[48];
+
+STATIC int
+sysctl_iossupportversion(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
+{
+       if (req->newptr != 0) {
+               /*
+                * Can only ever be set by launchd, and only once at boot.
+                */
+               if (req->p->p_pid != 1 || iossupportversion_string[0] != '\0') {
+                       return EPERM;
+               }
+       }
+
+       return sysctl_handle_string(oidp, arg1, arg2, req);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, iossupportversion,
+    CTLFLAG_RW | CTLFLAG_KERN | CTLTYPE_STRING | CTLFLAG_LOCKED,
+    iossupportversion_string, sizeof(iossupportversion_string),
+    sysctl_iossupportversion, "A", "The iOSSupportVersion from SystemVersion.plist");
+
 static uint64_t osvariant_status = 0;
 
 STATIC int
@@ -1849,6 +1895,32 @@ SYSCTL_PROC(_kern, OID_AUTO, osvariant_status,
     &osvariant_status, sizeof(osvariant_status),
     sysctl_osvariant_status, "Q", "Opaque flags used to cache OS variant information");
 
+extern void commpage_update_dyld_flags(uint64_t);
+static uint64_t dyld_system_flags = 0;
+
+STATIC int
+sysctl_dyld_system_flags(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
+{
+       /*
+        * Can only ever be set by launchd, possibly several times
+        * as dyld may change its mind after a userspace reboot.
+        */
+       if (req->newptr != 0 && req->p->p_pid != 1) {
+               return EPERM;
+       }
+
+       int res = sysctl_handle_quad(oidp, arg1, arg2, req);
+       if (req->newptr && res == 0) {
+               commpage_update_dyld_flags(osvariant_status);
+       }
+       return res;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, dyld_system_flags,
+    CTLFLAG_RW | CTLTYPE_QUAD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+    &dyld_system_flags, sizeof(dyld_system_flags),
+    sysctl_dyld_system_flags, "Q", "Opaque flags used to cache dyld system-wide configuration");
+
 #if defined(XNU_TARGET_OS_BRIDGE)
 char macosproductversion[MACOS_VERS_LEN] = { '\0' };
 
@@ -1868,16 +1940,10 @@ sysctl_sysctl_bootargs
 (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
 {
        int error;
-       /* BOOT_LINE_LENGTH */
-#if CONFIG_EMBEDDED
-       size_t boot_args_len = 256;
-#else
-       size_t boot_args_len = 1024;
-#endif
-       char buf[boot_args_len];
+       char buf[BOOT_LINE_LENGTH];
 
-       strlcpy(buf, PE_boot_args(), boot_args_len);
-       error = sysctl_io_string(req, buf, boot_args_len, 0, NULL);
+       strlcpy(buf, PE_boot_args(), BOOT_LINE_LENGTH);
+       error = sysctl_io_string(req, buf, BOOT_LINE_LENGTH, 0, NULL);
        return error;
 }
 
@@ -2067,6 +2133,14 @@ SYSCTL_PROC(_kern_perfcontrol_callout, OID_AUTO, update_cycles,
     sysctl_perfcontrol_callout_stat, "I", "");
 
 #endif /* __arm__ || __arm64__ */
+
+#if __arm64__
+extern int legacy_footprint_entitlement_mode;
+SYSCTL_INT(_kern, OID_AUTO, legacy_footprint_entitlement_mode,
+    CTLFLAG_KERN | CTLFLAG_RD | CTLFLAG_LOCKED,
+    &legacy_footprint_entitlement_mode, 0, "");
+#endif /* __arm64__ */
+
 #endif /* (DEVELOPMENT || DEBUG) */
 
 STATIC int
@@ -2097,9 +2171,17 @@ sysctl_domainname
 (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
 {
        int error, changed;
-       error = sysctl_io_string(req, domainname, sizeof(domainname), 0, &changed);
-       if (changed) {
-               domainnamelen = strlen(domainname);
+       char tmpname[MAXHOSTNAMELEN] = {};
+
+       lck_mtx_lock(&domainname_lock);
+       strlcpy(tmpname, domainname, sizeof(tmpname));
+       lck_mtx_unlock(&domainname_lock);
+
+       error = sysctl_io_string(req, tmpname, sizeof(tmpname), 0, &changed);
+       if (!error && changed) {
+               lck_mtx_lock(&hostname_lock);
+               strlcpy(domainname, tmpname, sizeof(domainname));
+               lck_mtx_unlock(&hostname_lock);
        }
        return error;
 }
@@ -2117,14 +2199,21 @@ sysctl_hostname
 (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
 {
        int error, changed;
-       error = sysctl_io_string(req, hostname, sizeof(hostname), 1, &changed);
-       if (changed) {
-               hostnamelen = req->newlen;
+       char tmpname[MAXHOSTNAMELEN] = {};
+
+       lck_mtx_lock(&hostname_lock);
+       strlcpy(tmpname, hostname, sizeof(tmpname));
+       lck_mtx_unlock(&hostname_lock);
+
+       error = sysctl_io_string(req, tmpname, sizeof(tmpname), 1, &changed);
+       if (!error && changed) {
+               lck_mtx_lock(&hostname_lock);
+               strlcpy(hostname, tmpname, sizeof(hostname));
+               lck_mtx_unlock(&hostname_lock);
        }
        return error;
 }
 
-
 SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
     0, 0, sysctl_hostname, "A", "");
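Both handlers now stabilize the string: copy it to a stack buffer under the lock, let sysctl_io_string() fault in user memory with no lock held, and write the result back under the lock. The shape of that pattern, with stand-in names:

#include <pthread.h>
#include <string.h>

#define NAMELEN 256

static pthread_mutex_t name_lock = PTHREAD_MUTEX_INITIALIZER;
static char shared_name[NAMELEN];

int
exchange_name(char *user_visible, size_t ulen, const char *candidate)
{
	char tmp[NAMELEN];

	pthread_mutex_lock(&name_lock);
	strlcpy(tmp, shared_name, sizeof(tmp));
	pthread_mutex_unlock(&name_lock);

	/* Slow, possibly-faulting transfer touches only the private copy. */
	strlcpy(user_visible, tmp, ulen);

	if (candidate != NULL) {
		pthread_mutex_lock(&name_lock);
		strlcpy(shared_name, candidate, sizeof(shared_name));
		pthread_mutex_unlock(&name_lock);
	}
	return 0;
}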
@@ -2571,7 +2660,7 @@ sysctl_rage_vnode
 
        error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
 
-       if (error == 0) {
+       if ((error == 0) && (changed != 0)) {
                switch (new_value) {
                case KERN_RAGE_PROC:
                        proc_lock(p);
@@ -2600,6 +2689,21 @@ SYSCTL_PROC(_kern, KERN_RAGEVNODE, rage_vnode,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
     0, 0, sysctl_rage_vnode, "I", "");
 
+/* XXX until filecoordinationd fixes a bit of inverted logic. */
+STATIC int
+sysctl_vfsnspace
+(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int old_value = 0, new_value, changed;
+
+       return sysctl_io_number(req, old_value, sizeof(int), &new_value,
+                  &changed);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, vfsnspace,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+    0, 0, sysctl_vfsnspace, "I", "");
+
 /* XXX move this interface into libproc and remove this sysctl */
 STATIC int
 sysctl_setthread_cpupercent
@@ -2658,7 +2762,7 @@ sysctl_kern_check_openevt
 
        error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
 
-       if (error == 0) {
+       if ((error == 0) && (changed != 0)) {
                switch (new_value) {
                case KERN_OPENEVT_PROC:
                        OSBitOrAtomic(P_CHECKOPENEVT, &p->p_flag);
@@ -3675,6 +3779,7 @@ extern uint32_t vm_pageout_memorystatus_fb_factor_dr;
 SYSCTL_INT(_vm, OID_AUTO, vm_pageout_memorystatus_fb_factor_nr, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_memorystatus_fb_factor_nr, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, vm_pageout_memorystatus_fb_factor_dr, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_memorystatus_fb_factor_dr, 0, "");
 
+extern uint32_t vm_grab_anon_nops;
 
 SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_overrides, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_debug.vm_grab_anon_overrides, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_nops, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_debug.vm_grab_anon_nops, 0, "");
@@ -3756,12 +3861,14 @@ sysctl_cpu_quiescent_counter_interval SYSCTL_HANDLER_ARGS
 {
 #pragma unused(arg1, arg2)
 
-       int error = sysctl_handle_int(oidp, &cpu_checkin_min_interval_us, 0, req);
+       uint32_t local_min_interval_us = cpu_quiescent_counter_get_min_interval_us();
+
+       int error = sysctl_handle_int(oidp, &local_min_interval_us, 0, req);
        if (error || !req->newptr) {
                return error;
        }
 
-       cpu_quiescent_counter_set_min_interval_us(cpu_checkin_min_interval_us);
+       cpu_quiescent_counter_set_min_interval_us(local_min_interval_us);
 
        return 0;
 }
@@ -4130,7 +4237,7 @@ sysctl_debugger_test SYSCTL_HANDLER_ARGS
        return rval;
 }
 
-decl_lck_spin_data(, spinlock_panic_test_lock)
+decl_lck_spin_data(, spinlock_panic_test_lock);
 
 __attribute__((noreturn))
 static void
@@ -4258,7 +4365,7 @@ sysctl_grade_cputype SYSCTL_HANDLER_ARGS
                return error;
        }
 
-       return_value = grade_binary(type_tuple[0], type_tuple[1]);
+       return_value = grade_binary(type_tuple[0], type_tuple[1], FALSE);
 
        error = SYSCTL_OUT(req, &return_value, sizeof(return_value));
 
@@ -4295,11 +4402,9 @@ unwedge_thread SYSCTL_HANDLER_ARGS
 
 SYSCTL_PROC(_kern, OID_AUTO, unwedge_thread, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, unwedge_thread, "I", "unwedge the thread wedged by kern.wedge_thread");
 
-extern uintptr_t phys_carveout_pa;
 SYSCTL_LONG(_kern, OID_AUTO, phys_carveout_pa, CTLFLAG_RD | CTLFLAG_LOCKED,
     &phys_carveout_pa,
     "base physical address of the phys_carveout_mb boot-arg region");
-extern size_t phys_carveout_size;
 SYSCTL_LONG(_kern, OID_AUTO, phys_carveout_size, CTLFLAG_RD | CTLFLAG_LOCKED,
     &phys_carveout_size,
     "size in bytes of the phys_carveout_mb boot-arg region");
@@ -4337,9 +4442,6 @@ tstile_test_prim_lock(boolean_t use_hashtable);
 int
 tstile_test_prim_unlock(boolean_t use_hashtable);
 
-#define SYSCTL_TURNSTILE_TEST_DEFAULT                   1
-#define SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE          2
-
 static int
 sysctl_turnstile_test_prim_lock SYSCTL_HANDLER_ARGS
 {
@@ -4349,8 +4451,15 @@ sysctl_turnstile_test_prim_lock SYSCTL_HANDLER_ARGS
        if (error || val == 0) {
                return error;
        }
-       boolean_t use_hashtable = (val == SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE) ? true : false;
-       return tstile_test_prim_lock(use_hashtable);
+       switch (val) {
+       case SYSCTL_TURNSTILE_TEST_USER_DEFAULT:
+       case SYSCTL_TURNSTILE_TEST_USER_HASHTABLE:
+       case SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT:
+       case SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE:
+               return tstile_test_prim_lock(val);
+       default:
+               return error;
+       }
 }
 
 static int
@@ -4362,8 +4471,15 @@ sysctl_turnstile_test_prim_unlock SYSCTL_HANDLER_ARGS
        if (error || val == 0) {
                return error;
        }
-       boolean_t use_hashtable = (val == SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE) ? true : false;
-       return tstile_test_prim_unlock(use_hashtable);
+       switch (val) {
+       case SYSCTL_TURNSTILE_TEST_USER_DEFAULT:
+       case SYSCTL_TURNSTILE_TEST_USER_HASHTABLE:
+       case SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT:
+       case SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE:
+               return tstile_test_prim_unlock(val);
+       default:
+               return error;
+       }
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, turnstiles_test_lock, CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
@@ -4408,42 +4524,6 @@ SYSCTL_QUAD(_kern, OID_AUTO, thread_block_count_on_reg_waitq,
     CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
     &thread_block_on_regular_waitq_count, "thread blocked on regular waitq count");
 
-static int
-sysctl_lck_mtx_test_lock SYSCTL_HANDLER_ARGS
-{
-#pragma unused(arg1, arg2)
-       int error, val = 0;
-       error = sysctl_handle_int(oidp, &val, 0, req);
-       if (error || val == 0) {
-               return error;
-       }
-
-       if (val == 1) {
-               lck_mtx_test_init();
-               lck_mtx_test_lock();
-       }
-
-       return 0;
-}
-
-static int
-sysctl_lck_mtx_test_unlock SYSCTL_HANDLER_ARGS
-{
-#pragma unused(arg1, arg2)
-       int error, val = 0;
-       error = sysctl_handle_int(oidp, &val, 0, req);
-       if (error || val == 0) {
-               return error;
-       }
-
-       if (val == 1) {
-               lck_mtx_test_init();
-               lck_mtx_test_unlock();
-       }
-
-       return 0;
-}
-
 static int
 sysctl_erase_all_test_mtx_stats SYSCTL_HANDLER_ARGS
 {
@@ -4512,7 +4592,12 @@ sysctl_test_mtx_uncontended SYSCTL_HANDLER_ARGS
        }
        input_val[req->newlen] = '\0';
 
-       sscanf(input_val, "%d", &iter);
+       iter = 0;
+       error = sscanf(input_val, "%d", &iter);
+       if (error != 1) {
+               printf("%s invalid input\n", __func__);
+               return EINVAL;
+       }
 
        if (iter <= 0) {
                printf("%s requested %d iterations, not starting the test\n", __func__, iter);
@@ -4551,8 +4636,6 @@ sysctl_test_mtx_contended SYSCTL_HANDLER_ARGS
        int buffer_size, offset, error, iter;
        char input_val[40];
 
-       printf("%s called\n", __func__);
-
        if (!req->newptr) {
                return 0;
        }
@@ -4571,7 +4654,12 @@ sysctl_test_mtx_contended SYSCTL_HANDLER_ARGS
        }
        input_val[req->newlen] = '\0';
 
-       sscanf(input_val, "%d", &iter);
+       iter = 0;
+       error = sscanf(input_val, "%d", &iter);
+       if (error != 1) {
+               printf("%s invalid input\n", __func__);
+               return EINVAL;
+       }
 
        if (iter <= 0) {
                printf("%s requested %d iterations, not starting the test\n", __func__, iter);
@@ -4582,7 +4670,7 @@ sysctl_test_mtx_contended SYSCTL_HANDLER_ARGS
 
        erase_all_test_mtx_stats();
 
-       buffer_size = 1000;
+       buffer_size = 2000;
        offset = 0;
        buffer = kalloc(buffer_size);
        if (!buffer) {
@@ -4590,29 +4678,34 @@ sysctl_test_mtx_contended SYSCTL_HANDLER_ARGS
        }
        memset(buffer, 0, buffer_size);
 
-       printf("%s starting contended mutex test with %d iterations\n", __func__, iter);
+       printf("%s starting contended mutex test with %d iterations FULL_CONTENDED\n", __func__, iter);
 
        offset = snprintf(buffer, buffer_size, "STATS INNER LOOP");
-       offset += lck_mtx_test_mtx_contended(iter, &buffer[offset], buffer_size - offset);
+       offset += lck_mtx_test_mtx_contended(iter, &buffer[offset], buffer_size - offset, FULL_CONTENDED);
+
+       printf("%s starting contended mutex loop test with %d iterations FULL_CONTENDED\n", __func__, iter);
+
+       offset += snprintf(&buffer[offset], buffer_size - offset, "\nSTATS OUTER LOOP");
+       offset += lck_mtx_test_mtx_contended_loop_time(iter, &buffer[offset], buffer_size - offset, FULL_CONTENDED);
+
+       printf("%s starting contended mutex test with %d iterations HALF_CONTENDED\n", __func__, iter);
+
+       offset += snprintf(&buffer[offset], buffer_size - offset, "STATS INNER LOOP");
+       offset += lck_mtx_test_mtx_contended(iter, &buffer[offset], buffer_size - offset, HALF_CONTENDED);
 
-       printf("%s starting contended mutex loop test with %d iterations\n", __func__, iter);
+       printf("%s starting contended mutex loop test with %d iterations HALF_CONTENDED\n", __func__, iter);
 
        offset += snprintf(&buffer[offset], buffer_size - offset, "\nSTATS OUTER LOOP");
-       offset += lck_mtx_test_mtx_contended_loop_time(iter, &buffer[offset], buffer_size - offset);
+       offset += lck_mtx_test_mtx_contended_loop_time(iter, &buffer[offset], buffer_size - offset, HALF_CONTENDED);
 
        error = SYSCTL_OUT(req, buffer, offset);
 
+       printf("\n%s\n", buffer);
        kfree(buffer, buffer_size);
 
        return error;
 }
 
-SYSCTL_PROC(_kern, OID_AUTO, lck_mtx_test_lock, CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    0, 0, sysctl_lck_mtx_test_lock, "I", "lck mtx test lock");
-
-SYSCTL_PROC(_kern, OID_AUTO, lck_mtx_test_unlock, CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    0, 0, sysctl_lck_mtx_test_unlock, "I", "lck mtx test unlock");
-
 SYSCTL_PROC(_kern, OID_AUTO, erase_all_test_mtx_stats, CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
     0, 0, sysctl_erase_all_test_mtx_stats, "I", "erase test_mtx statistics");
 
@@ -4706,4 +4799,78 @@ sysctl_test_panic_with_thread SYSCTL_HANDLER_ARGS
 SYSCTL_PROC(_kern, OID_AUTO, test_panic_with_thread, CTLFLAG_MASKED | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_WR | CTLTYPE_STRING,
     0, 0, sysctl_test_panic_with_thread, "A", "test panic flow for backtracing a different thread");
 #endif /* defined (__x86_64__) */
+
 #endif /* DEVELOPMENT || DEBUG */
+
+static int
+sysctl_get_owned_vmobjects SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+
+       /* validate */
+       if (req->newlen != sizeof(mach_port_name_t) || req->newptr == USER_ADDR_NULL ||
+           req->oldidx != 0 || req->newidx != 0 || req->p == NULL) {
+               return EINVAL;
+       }
+
+       int error;
+       mach_port_name_t task_port_name;
+       task_t task;
+       int buffer_size = (req->oldptr != USER_ADDR_NULL) ? req->oldlen : 0;
+       vmobject_list_output_t buffer;
+       size_t output_size;
+
+       if (buffer_size) {
+               const int min_size = sizeof(vm_object_query_data_t) + sizeof(int64_t);
+
+               if (buffer_size < min_size) {
+                       buffer_size = min_size;
+               }
+
+               buffer = kalloc(buffer_size);
+
+               if (!buffer) {
+                       error = ENOMEM;
+                       goto sysctl_get_vmobject_list_exit;
+               }
+       } else {
+               buffer = NULL;
+       }
+
+       /* We have a "newptr" (for write): the caller passed us a task port name. */
+       error = SYSCTL_IN(req, &task_port_name, sizeof(mach_port_name_t));
+
+       if (error != 0) {
+               goto sysctl_get_vmobject_list_exit;
+       }
+
+       task = port_name_to_task(task_port_name);
+       if (task == TASK_NULL) {
+               error = ESRCH;
+               goto sysctl_get_vmobject_list_exit;
+       }
+
+       /* copy the vmobjects and vmobject data out of the task */
+       if (buffer_size == 0) {
+               int64_t __size;
+               task_copy_vmobjects(task, NULL, 0, &__size);
+               output_size = (__size > 0) ? __size * sizeof(vm_object_query_data_t) + sizeof(int64_t) : 0;
+       } else {
+               task_copy_vmobjects(task, &buffer->data[0], buffer_size - sizeof(int64_t), &buffer->entries);
+               output_size = buffer->entries * sizeof(vm_object_query_data_t) + sizeof(int64_t);
+       }
+
+       task_deallocate(task);
+
+       error = SYSCTL_OUT(req, (char*) buffer, output_size);
+
+sysctl_get_vmobject_list_exit:
+       if (buffer) {
+               kfree(buffer, buffer_size);
+       }
+
+       return error;
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, get_owned_vmobjects, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
+    0, 0, sysctl_get_owned_vmobjects, "A", "get owned vmobjects in task");
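From userspace the handler is driven with a write of a task port name and a read of a count-prefixed array; a NULL old pointer sizes the buffer. A sketch, assuming the int64_t-count-then-entries layout used by the handler above:

#include <mach/mach.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	mach_port_name_t port = mach_task_self();
	size_t len = 0;

	/* Sizing pass: oldptr NULL, the handler reports the needed length. */
	if (sysctlbyname("vm.get_owned_vmobjects", NULL, &len,
	    &port, sizeof(port)) != 0 || len == 0) {
		perror("vm.get_owned_vmobjects (size)");
		return 1;
	}

	void *buf = malloc(len);
	if (buf == NULL) {
		return 1;
	}
	if (sysctlbyname("vm.get_owned_vmobjects", buf, &len,
	    &port, sizeof(port)) != 0) {
		perror("vm.get_owned_vmobjects (fetch)");
		return 1;
	}
	printf("%lld owned VM objects\n", *(long long *)buf);
	free(buf);
	return 0;
}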
index 386d06971508d20e9ee966b57e612948000877ad..ee65deb6c97bf6f7ce27dc6439afb261cb940023 100644 (file)
@@ -104,8 +104,10 @@ reboot(struct proc *p, struct reboot_args *uap, __unused int32_t *retval)
 
        if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
 #if (DEVELOPMENT || DEBUG)
-               /* allow non-root user to call panic on dev/debug kernels */
-               if (!(uap->opt & RB_PANIC)) {
+               if (uap->opt & RB_PANIC) {
+                       /* clear 'error' to allow non-root users to call panic on dev/debug kernels */
+                       error = 0;
+               } else {
                        return error;
                }
 #else
@@ -113,10 +115,6 @@ reboot(struct proc *p, struct reboot_args *uap, __unused int32_t *retval)
 #endif
        }
 
-       if (uap->opt & RB_COMMAND) {
-               return ENOSYS;
-       }
-
        if (uap->opt & RB_PANIC && uap->msg != USER_ADDR_NULL) {
                if (copyinstr(uap->msg, (void *)message, sizeof(message), (size_t *)&dummy)) {
                        strncpy(message, "user space RB_PANIC message copyin failed", sizeof(message) - 1);
index 7fb3d23d1ced8c397dad42bea54254461fd8258d..27d01a4a7887877c71ebc04cc6f0a52467d88db5 100644 (file)
@@ -528,9 +528,13 @@ errno_t
 mbuf_copydata(const mbuf_t m0, size_t off, size_t len, void *out_data)
 {
        /* Copied m_copydata, added error handling (don't just panic) */
-       int count;
+       size_t count;
        mbuf_t  m = m0;
 
+       if (off >= INT_MAX || len >= INT_MAX) {
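+               /* lengths inside an mbuf chain are ints; refuse values that could overflow */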
+               return EINVAL;
+       }
+
        while (off > 0) {
                if (m == 0) {
                        return EINVAL;
@@ -2014,3 +2018,31 @@ m_do_tx_compl_callback(struct mbuf *m, struct ifnet *ifp)
        }
 #endif /* (DEBUG || DEVELOPMENT) */
 }
+
+errno_t
+mbuf_get_keepalive_flag(mbuf_t m, boolean_t *is_keepalive)
+{
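+       /* the keepalive flag lives in the packet header, so m must have a pkthdr */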
+       if (m == NULL || is_keepalive == NULL || !(m->m_flags & M_PKTHDR)) {
+               return EINVAL;
+       }
+
+       *is_keepalive = (m->m_pkthdr.pkt_flags & PKTF_KEEPALIVE);
+
+       return 0;
+}
+
+errno_t
+mbuf_set_keepalive_flag(mbuf_t m, boolean_t is_keepalive)
+{
+       if (m == NULL || !(m->m_flags & M_PKTHDR)) {
+               return EINVAL;
+       }
+
+       if (is_keepalive) {
+               m->m_pkthdr.pkt_flags |= PKTF_KEEPALIVE;
+       } else {
+               m->m_pkthdr.pkt_flags &= ~PKTF_KEEPALIVE;
+       }
+
+       return 0;
+}
index 597045a88fb7767f974ac5a92187a9e19c056391..4492cf15faf5024d8a7003af557b1a3ef0fe3d86 100644 (file)
@@ -106,21 +106,40 @@ errno_t sflt_register(const struct sflt_filter *filter, int domain,
 __private_extern__ int
 sflt_permission_check(struct inpcb *inp)
 {
-       /*
-        * All these permissions only apply to the co-processor interface,
-        * so ignore IPv4.
-        */
-       if (!(inp->inp_vflag & INP_IPV6)) {
+       /* Only IPv4 or IPv6 sockets can bypass filters */
+       if (!(inp->inp_vflag & INP_IPV4) &&
+           !(inp->inp_vflag & INP_IPV6)) {
                return 0;
        }
        /* Sockets that have this entitlement bypass socket filters. */
        if (INP_INTCOPROC_ALLOWED(inp)) {
                return 1;
        }
+       /* Sockets bound to an intcoproc interface bypass socket filters. */
        if ((inp->inp_flags & INP_BOUND_IF) &&
            IFNET_IS_INTCOPROC(inp->inp_boundifp)) {
                return 1;
        }
+#if NECP
+       /*
+        * Make sure that the NECP policy is populated.
+        * If result is not populated, the policy ID will be
+        * NECP_KERNEL_POLICY_ID_NONE. Note that if the result
+        * is populated, but there was no match, it will be
+        * NECP_KERNEL_POLICY_ID_NO_MATCH.
+        * Do not call inp_update_necp_policy() to avoid scoping
+        * a socket prior to calls to bind().
+        */
+       if (inp->inp_policyresult.policy_id == NECP_KERNEL_POLICY_ID_NONE) {
+               necp_socket_find_policy_match(inp, NULL, NULL, 0);
+       }
+
+       /* If the filter unit is marked to be "no filter", bypass filters */
+       if (inp->inp_policyresult.results.filter_control_unit ==
+           NECP_FILTER_UNIT_NO_FILTER) {
+               return 1;
+       }
+#endif /* NECP */
        return 0;
 }
 
index ffb26e9bf5849089bca11213f55def60951b3936..ce9ab133d02a375eddd757b588222af77bfefabf 100644 (file)
@@ -50,6 +50,7 @@
 *              req_cpu_type:   The required cpu type.
 *              mask_bits:      Bits to mask from the sub-image type when
 *                              grading it vs. the req_cpu_type
+*              imgp:           Image params
 *              archret (out):  Pointer to fat_arch structure to hold
 *                              the results.
 *
 **********************************************************************/
 static load_return_t
 fatfile_getarch(
-       vm_offset_t     data_ptr,
-       vm_size_t       data_size,
-       cpu_type_t      req_cpu_type,
-       cpu_type_t      mask_bits,
-       struct fat_arch *archret)
+       vm_offset_t              data_ptr,
+       vm_size_t                data_size,
+       cpu_type_t               req_cpu_type,
+       cpu_type_t               mask_bits,
+       struct image_params      *imgp,
+       struct fat_arch          *archret)
 {
        load_return_t           lret;
        struct fat_arch         *arch;
@@ -106,7 +108,7 @@ fatfile_getarch(
                /*
                 *      Get the grade of the cpu subtype (without feature flags)
                 */
-               grade = grade_binary(testtype, testsubtype);
+               grade = grade_binary(testtype, testsubtype, TRUE);
 
                /*
                 *      Remember it if it's the best we've seen.
@@ -117,6 +119,18 @@ fatfile_getarch(
                }
        }
 
+       /* On X86_64, allow 32 bit exec only for simulator binaries.
+        * Failing here without re-running the grading algorithm is safe because i386
+        * has the lowest possible grade value (so there can't be a lower best grade
+        * that would be allowed if this check denied the i386 slice). */
+       if (best_arch != NULL &&
+           validate_potential_simulator_binary(OSSwapBigToHostInt32(best_arch->cputype),
+           imgp, OSSwapBigToHostInt32(best_arch->offset),
+           OSSwapBigToHostInt32(best_arch->size)) != LOAD_SUCCESS) {
+               best_arch = NULL;
+               best_grade = 0;
+       }
+
        /*
         *      Return our results.
         */
@@ -147,13 +161,14 @@ load_return_t
 fatfile_getbestarch(
        vm_offset_t             data_ptr,
        vm_size_t               data_size,
+       struct image_params     *imgp,
        struct fat_arch *archret)
 {
        /*
         * Ignore all architectural bits when determining if an image
         * in a fat file should be skipped or graded.
         */
-       return fatfile_getarch(data_ptr, data_size, cpu_type(), CPU_ARCH_MASK, archret);
+       return fatfile_getarch(data_ptr, data_size, cpu_type(), CPU_ARCH_MASK, imgp, archret);
 }
 
 load_return_t
@@ -161,12 +176,13 @@ fatfile_getbestarch_for_cputype(
        cpu_type_t cputype,
        vm_offset_t data_ptr,
        vm_size_t data_size,
+       struct image_params *imgp,
        struct fat_arch *archret)
 {
        /*
         * Scan the fat_arch array for exact matches for this cpu_type_t only
         */
-       return fatfile_getarch(data_ptr, data_size, cputype, 0, archret);
+       return fatfile_getarch(data_ptr, data_size, cputype, 0, imgp, archret);
 }
 
 /**********************************************************************
@@ -187,7 +203,7 @@ fatfile_getbestarch_for_cputype(
 load_return_t
 fatfile_getarch_with_bits(
        integer_t               archbits,
-       vm_offset_t     data_ptr,
+       vm_offset_t             data_ptr,
        vm_size_t               data_size,
        struct fat_arch         *archret)
 {
@@ -195,7 +211,7 @@ fatfile_getarch_with_bits(
         * Scan the fat_arch array for matches with the requested
         * architectural bits set, and for the current hardware cpu CPU.
         */
-       return fatfile_getarch(data_ptr, data_size, (archbits & CPU_ARCH_MASK) | (cpu_type() & ~CPU_ARCH_MASK), 0, archret);
+       return fatfile_getarch(data_ptr, data_size, (archbits & CPU_ARCH_MASK) | (cpu_type() & ~CPU_ARCH_MASK), 0, NULL, archret);
 }
 
 /*
index 6d108d1ec45888289bcd6e2a0ee04c8d332d9428..885fb32ee1e35bf12afe31fb87826eecaed874e9 100644 (file)
@@ -36,9 +36,9 @@
 
 load_return_t fatfile_validate_fatarches(vm_offset_t data_ptr, vm_size_t data_size);
 
-load_return_t fatfile_getbestarch(vm_offset_t data_ptr, vm_size_t data_size, struct fat_arch *archret);
+load_return_t fatfile_getbestarch(vm_offset_t data_ptr, vm_size_t data_size, struct image_params *imgp, struct fat_arch *archret);
 load_return_t fatfile_getbestarch_for_cputype(cpu_type_t cputype,
-    vm_offset_t data_ptr, vm_size_t data_size, struct fat_arch *archret);
+    vm_offset_t data_ptr, vm_size_t data_size, struct image_params *imgp, struct fat_arch *archret);
 load_return_t fatfile_getarch_with_bits(integer_t archbits,
     vm_offset_t data_ptr, vm_size_t data_size, struct fat_arch *archret);
 
index 82ee10f0f8ec3a3bd10a61e4fefad5ae43da2fe7..d51e05c70f515aa613006c2d7da861227245dd83 100644 (file)
 
 #include <os/overflow.h>
 
-#if __x86_64__
-extern int bootarg_no32exec;    /* bsd_init.c */
-#endif
-
 /*
  * XXX vm/pmap.h should not treat these prototypes as MACH_KERNEL_PRIVATE
  * when KERNEL is defined.
  */
-extern pmap_t   pmap_create(ledger_t ledger, vm_map_size_t size,
-    boolean_t is_64bit);
+extern pmap_t   pmap_create_options(ledger_t ledger, vm_map_size_t size,
+    unsigned int flags);
 
 /* XXX should have prototypes in a shared header file */
 extern int      get_map_nentries(vm_map_t);
@@ -173,6 +169,13 @@ load_uuid(
        load_result_t                   *result
        );
 
+static load_return_t
+load_version(
+       struct version_min_command     *vmc,
+       boolean_t               *found_version_cmd,
+       load_result_t           *result
+       );
+
 static load_return_t
 load_code_signature(
        struct linkedit_data_command    *lcp,
@@ -205,6 +208,14 @@ load_main(
        load_result_t           *result
        );
 
+static
+load_return_t
+setup_driver_main(
+       thread_t                thread,
+       int64_t                         slide,
+       load_result_t           *result
+       );
+
 static load_return_t
 load_unixthread(
        struct thread_command   *tcp,
@@ -251,6 +262,15 @@ load_dylinker(
        struct image_params     *imgp
        );
 
+#if __x86_64__
+extern int bootarg_no32exec;
+static boolean_t
+check_if_simulator_binary(
+       struct image_params     *imgp,
+       off_t                   file_offset,
+       off_t                   macho_size);
+#endif
+
 struct macho_data;
 
 static load_return_t
@@ -341,12 +361,12 @@ load_machfile(
        boolean_t enforce_hard_pagezero = TRUE;
        int in_exec = (imgp->ip_flags & IMGPF_EXEC);
        task_t task = current_task();
-       proc_t p = current_proc();
        int64_t                 aslr_page_offset = 0;
        int64_t                 dyld_aslr_page_offset = 0;
        int64_t                 aslr_section_size = 0;
        int64_t                 aslr_section_offset = 0;
        kern_return_t           kret;
+       unsigned int            pmap_flags = 0;
 
        if (macho_size > file_size) {
                return LOAD_BADMACHO;
@@ -354,6 +374,10 @@ load_machfile(
 
        result->is_64bit_addr = ((imgp->ip_flags & IMGPF_IS_64BIT_ADDR) == IMGPF_IS_64BIT_ADDR);
        result->is_64bit_data = ((imgp->ip_flags & IMGPF_IS_64BIT_DATA) == IMGPF_IS_64BIT_DATA);
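+       /* derive pmap-creation flags from the image: JOP control and address-space width */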
+#if defined(HAS_APPLE_PAC)
+       pmap_flags |= (imgp->ip_flags & IMGPF_NOJOP) ? PMAP_CREATE_DISABLE_JOP : 0;
+#endif /* defined(HAS_APPLE_PAC) */
+       pmap_flags |= result->is_64bit_addr ? PMAP_CREATE_64BIT : 0;
 
        task_t ledger_task;
        if (imgp->ip_new_thread) {
@@ -361,9 +385,12 @@ load_machfile(
        } else {
                ledger_task = task;
        }
-       pmap = pmap_create(get_task_ledger(ledger_task),
+       pmap = pmap_create_options(get_task_ledger(ledger_task),
            (vm_map_size_t) 0,
-           result->is_64bit_addr);
+           pmap_flags);
+       if (pmap == NULL) {
+               return LOAD_RESOURCE;
+       }
        map = vm_map_create(pmap,
            0,
            vm_compute_max_offset(result->is_64bit_addr),
@@ -497,6 +524,7 @@ load_machfile(
         * task is not yet running, and it makes no sense.
         */
        if (in_exec) {
+               proc_t p = vfs_context_proc(imgp->ip_vfs_context);
                /*
                 * Mark the task as halting and start the other
                 * threads towards terminating themselves.  Then
@@ -597,14 +625,17 @@ parse_machfile(
        size_t                  offset;
        size_t                  oldoffset;      /* for overflow check */
        int                     pass;
-       proc_t                  p = current_proc();             /* XXXX */
+       proc_t                  p = vfs_context_proc(imgp->ip_vfs_context);
        int                     error;
        int                     resid = 0;
+       int                     spawn = (imgp->ip_flags & IMGPF_SPAWN);
+       int                     vfexec = (imgp->ip_flags & IMGPF_VFORK_EXEC);
        size_t                  mach_header_sz = sizeof(struct mach_header);
        boolean_t               abi64;
        boolean_t               got_code_signatures = FALSE;
        boolean_t               found_header_segment = FALSE;
        boolean_t               found_xhdr = FALSE;
+       boolean_t               found_version_cmd = FALSE;
        int64_t                 slide = 0;
        boolean_t               dyld_no_load_addr = FALSE;
        boolean_t               is_dyld = FALSE;
@@ -637,16 +668,10 @@ parse_machfile(
         */
        if (((cpu_type_t)(header->cputype & ~CPU_ARCH_MASK) != (cpu_type() & ~CPU_ARCH_MASK)) ||
            !grade_binary(header->cputype,
-           header->cpusubtype & ~CPU_SUBTYPE_MASK)) {
+           header->cpusubtype & ~CPU_SUBTYPE_MASK, TRUE)) {
                return LOAD_BADARCH;
        }
 
-#if __x86_64__
-       if (bootarg_no32exec && (header->cputype == CPU_TYPE_X86)) {
-               return LOAD_BADARCH_X86;
-       }
-#endif
-
        abi64 = ((header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64);
 
        switch (header->filetype) {
@@ -702,7 +727,7 @@ parse_machfile(
        }
 
        error = vn_rdwr(UIO_READ, vp, addr, alloc_size, file_offset,
-           UIO_SYSSPACE, 0, kauth_cred_get(), &resid, p);
+           UIO_SYSSPACE, 0, vfs_context_ucred(imgp->ip_vfs_context), &resid, p);
        if (error) {
                kfree(addr, alloc_size);
                return LOAD_IOERROR;
@@ -811,10 +836,22 @@ parse_machfile(
                /*
                 * Check that the entry point is contained in an executable segments
                 */
-               if ((pass == 3) && (!result->using_lcmain && result->validentry == 0)) {
-                       thread_state_initialize(thread);
-                       ret = LOAD_FAILURE;
-                       break;
+               if (pass == 3) {
+                       if (depth == 1 && imgp && (imgp->ip_flags & IMGPF_DRIVER)) {
+                               /* Driver binaries must have driverkit platform */
+                               if (result->ip_platform == PLATFORM_DRIVERKIT) {
+                                       /* Driver binaries have no entry point */
+                                       ret = setup_driver_main(thread, slide, result);
+                               } else {
+                                       ret = LOAD_FAILURE;
+                               }
+                       } else if (!result->using_lcmain && result->validentry == 0) {
+                               ret = LOAD_FAILURE;
+                       }
+                       if (ret != KERN_SUCCESS) {
+                               thread_state_initialize(thread);
+                               break;
+                       }
                }
 
                /*
@@ -866,10 +903,17 @@ parse_machfile(
                        /*
                         * Act on struct load_command's for which kernel
                         * intervention is required.
+                        * Note that each load command implementation is expected to validate
+                        * that lcp->cmdsize is large enough to fit its specific struct type
+                        * before dereferencing fields not covered by struct load_command.
                         */
                        switch (lcp->cmd) {
                        case LC_SEGMENT: {
                                struct segment_command *scp = (struct segment_command *) lcp;
+                               if (scp->cmdsize < sizeof(*scp)) {
+                                       ret = LOAD_BADMACHO;
+                                       break;
+                               }
                                if (pass == 0) {
                                        if (is_dyld && scp->vmaddr == 0 && scp->fileoff == 0) {
                                                dyld_no_load_addr = TRUE;
@@ -948,7 +992,10 @@ parse_machfile(
                        }
                        case LC_SEGMENT_64: {
                                struct segment_command_64 *scp64 = (struct segment_command_64 *) lcp;
-
+                               if (scp64->cmdsize < sizeof(*scp64)) {
+                                       ret = LOAD_BADMACHO;
+                                       break;
+                               }
                                if (pass == 0) {
                                        if (is_dyld && scp64->vmaddr == 0 && scp64->fileoff == 0) {
                                                dyld_no_load_addr = TRUE;
@@ -1142,27 +1189,56 @@ parse_machfile(
                                                load_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_DECRYPT);
                                        }
 
-                                       assert(load_failure_reason != OS_REASON_NULL);
-                                       psignal_with_reason(p, SIGKILL, load_failure_reason);
+                                       /*
+                                        * Don't signal the process if it was forked and in a partially constructed
+                                        * state as part of a spawn -- it will just be torn down when the exec fails.
+                                        */
+                                       if (!spawn) {
+                                               assert(load_failure_reason != OS_REASON_NULL);
+                                               if (vfexec) {
+                                                       psignal_vfork_with_reason(p, get_threadtask(imgp->ip_new_thread), imgp->ip_new_thread, SIGKILL, load_failure_reason);
+                                                       load_failure_reason = OS_REASON_NULL;
+                                               } else {
+                                                       psignal_with_reason(p, SIGKILL, load_failure_reason);
+                                                       load_failure_reason = OS_REASON_NULL;
+                                               }
+                                       } else {
+                                               os_reason_free(load_failure_reason);
+                                               load_failure_reason = OS_REASON_NULL;
+                                       }
                                }
                                break;
 #endif
-#if __arm64__
-                       case LC_VERSION_MIN_IPHONEOS: {
+                       case LC_VERSION_MIN_IPHONEOS:
+                       case LC_VERSION_MIN_MACOSX:
+                       case LC_VERSION_MIN_WATCHOS:
+                       case LC_VERSION_MIN_TVOS: {
                                struct version_min_command *vmc;
 
-                               if (pass != 1) {
+                               if (depth != 1 || pass != 1) {
                                        break;
                                }
                                vmc = (struct version_min_command *) lcp;
-                               if (vmc->sdk < (12 << 16)) {
-                                       /* app built with a pre-iOS12 SDK: apply legacy footprint mitigation */
-                                       result->legacy_footprint = TRUE;
+                               ret = load_version(vmc, &found_version_cmd, result);
+                               break;
+                       }
+                       case LC_BUILD_VERSION: {
+                               if (depth != 1 || pass != 1) {
+                                       break;
+                               }
+                               struct build_version_command* bvc = (struct build_version_command*)lcp;
+                               if (bvc->cmdsize < sizeof(*bvc)) {
+                                       ret = LOAD_BADMACHO;
+                                       break;
                                }
-//                             printf("FBDP %s:%d vp %p (%s) sdk %d.%d.%d -> legacy_footprint=%d\n", __FUNCTION__, __LINE__, vp, vp->v_name, (vmc->sdk >> 16), ((vmc->sdk & 0xFF00) >> 8), (vmc->sdk & 0xFF), result->legacy_footprint);
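+                               /* at most one version/build-version load command is allowed */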
+                               if (found_version_cmd == TRUE) {
+                                       ret = LOAD_BADMACHO;
+                                       break;
+                               }
+                               result->ip_platform = bvc->platform;
+                               found_version_cmd = TRUE;
                                break;
                        }
-#endif /* __arm64__ */
                        default:
                                /* Other commands are ignored by the kernel */
                                ret = LOAD_SUCCESS;
@@ -1217,6 +1293,190 @@ parse_machfile(
        return ret;
 }
 
+load_return_t
+validate_potential_simulator_binary(
+       cpu_type_t               exectype __unused,
+       struct image_params      *imgp __unused,
+       off_t                    file_offset __unused,
+       off_t                    macho_size __unused)
+{
+#if __x86_64__
+       /* Allow 32 bit exec only for simulator binaries */
+       if (bootarg_no32exec && imgp != NULL && exectype == CPU_TYPE_X86) {
+               if (imgp->ip_simulator_binary == IMGPF_SB_DEFAULT) {
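+               /* not yet classified: inspect the binary once and cache the verdict */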
+                       boolean_t simulator_binary = check_if_simulator_binary(imgp, file_offset, macho_size);
+                       imgp->ip_simulator_binary = simulator_binary ? IMGPF_SB_TRUE : IMGPF_SB_FALSE;
+               }
+
+               if (imgp->ip_simulator_binary != IMGPF_SB_TRUE) {
+                       return LOAD_BADARCH;
+               }
+       }
+#endif
+       return LOAD_SUCCESS;
+}
+
+#if __x86_64__
+static boolean_t
+check_if_simulator_binary(
+       struct image_params     *imgp,
+       off_t                   file_offset,
+       off_t                   macho_size)
+{
+       struct mach_header      *header;
+       char                    *ip_vdata = NULL;
+       kauth_cred_t            cred = NULL;
+       uint32_t                ncmds;
+       struct load_command     *lcp;
+       boolean_t               simulator_binary = FALSE;
+       void *                  addr = NULL;
+       vm_size_t               alloc_size, cmds_size;
+       size_t                  offset;
+       proc_t                  p = current_proc();             /* XXXX */
+       int                     error;
+       int                     resid = 0;
+       size_t                  mach_header_sz = sizeof(struct mach_header);
+
+
+       cred =  kauth_cred_proc_ref(p);
+
+       /* Allocate page to copyin mach header */
+       ip_vdata = kalloc(PAGE_SIZE);
+       if (ip_vdata == NULL) {
+               goto bad;
+       }
+
+       /* Read the Mach-O header */
+       error = vn_rdwr(UIO_READ, imgp->ip_vp, ip_vdata,
+           PAGE_SIZE, file_offset,
+           UIO_SYSSPACE, (IO_UNIT | IO_NODELOCKED),
+           cred, &resid, p);
+       if (error) {
+               goto bad;
+       }
+
+       header = (struct mach_header *)ip_vdata;
+
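+       /* 64-bit images carry a larger mach header; account for it before walking the load commands */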
+       if (header->magic == MH_MAGIC_64 ||
+           header->magic == MH_CIGAM_64) {
+               mach_header_sz = sizeof(struct mach_header_64);
+       }
+
+       /* ensure header + sizeofcmds falls within the file */
+       if (os_add_overflow(mach_header_sz, header->sizeofcmds, &cmds_size) ||
+           (off_t)cmds_size > macho_size ||
+           round_page_overflow(cmds_size, &alloc_size)) {
+               goto bad;
+       }
+
+       /*
+        * Map the load commands into kernel memory.
+        */
+       addr = kalloc(alloc_size);
+       if (addr == NULL) {
+               goto bad;
+       }
+
+       error = vn_rdwr(UIO_READ, imgp->ip_vp, addr, alloc_size, file_offset,
+           UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, p);
+       if (error) {
+               goto bad;
+       }
+
+       if (resid) {
+               /* We must be able to read in as much as the mach_header indicated */
+               goto bad;
+       }
+
+       /*
+        * Loop through each of the load_commands indicated by the
+        * Mach-O header; if an absurd value is provided, we just
+        * run off the end of the reserved section by incrementing
+        * the offset too far, so we are implicitly fail-safe.
+        */
+       offset = mach_header_sz;
+       ncmds = header->ncmds;
+
+       while (ncmds--) {
+               /* ensure enough space for a minimal load command */
+               if (offset + sizeof(struct load_command) > cmds_size) {
+                       break;
+               }
+
+               /*
+                *      Get a pointer to the command.
+                */
+               lcp = (struct load_command *)(addr + offset);
+
+               /*
+                * Perform prevalidation of the struct load_command
+                * before we attempt to use its contents.  Invalid
+                * values are ones which result in an overflow, or
+                * which can not possibly be valid commands, or which
+                * straddle or exist past the reserved section at the
+                * start of the image.
+                */
+               if (os_add_overflow(offset, lcp->cmdsize, &offset) ||
+                   lcp->cmdsize < sizeof(struct load_command) ||
+                   offset > cmds_size) {
+                       break;
+               }
+
+               /* Check if it's a simulator binary. */
+               switch (lcp->cmd) {
+               case LC_VERSION_MIN_WATCHOS:
+                       simulator_binary = TRUE;
+                       break;
+
+               case LC_BUILD_VERSION: {
+                       struct build_version_command *bvc;
+
+                       bvc = (struct build_version_command *) lcp;
+                       if (bvc->cmdsize < sizeof(*bvc)) {
+                               /* unsafe to use this command struct if cmdsize
+                                * validated above is too small for it to fit */
+                               break;
+                       }
+                       if (bvc->platform == PLATFORM_IOSSIMULATOR ||
+                           bvc->platform == PLATFORM_WATCHOSSIMULATOR) {
+                               simulator_binary = TRUE;
+                       }
+
+                       break;
+               }
+
+               case LC_VERSION_MIN_IPHONEOS: {
+                       simulator_binary = TRUE;
+                       break;
+               }
+
+               default:
+                       /* ignore other load commands */
+                       break;
+               }
+
+               if (simulator_binary == TRUE) {
+                       break;
+               }
+       }
+
+bad:
+       if (ip_vdata) {
+               kfree(ip_vdata, PAGE_SIZE);
+       }
+
+       if (cred) {
+               kauth_cred_unref(&cred);
+       }
+
+       if (addr) {
+               kfree(addr, alloc_size);
+       }
+
+       return simulator_binary;
+}
+#endif /* __x86_64__ */
+
 #if CONFIG_CODE_DECRYPTION
 
 #define APPLE_UNPROTECTED_HEADER_SIZE   (3 * 4096)
@@ -1390,6 +1650,8 @@ map_segment(
                        cur_end = vm_start + (file_end - file_start);
                }
                if (control != MEMORY_OBJECT_CONTROL_NULL) {
+                       /* no copy-on-read for mapped binaries */
+                       vmk_flags.vmkf_no_copy_on_read = 1;
                        ret = vm_map_enter_mem_object_control(
                                map,
                                &cur_start,
@@ -1463,6 +1725,8 @@ map_segment(
                    file_start),
                    effective_page_mask);
                if (control != MEMORY_OBJECT_CONTROL_NULL) {
+                       /* no copy-on-read for mapped binaries */
+                       cur_vmk_flags.vmkf_no_copy_on_read = 1;
                        ret = vm_map_enter_mem_object_control(
                                map,
                                &cur_start,
@@ -1507,6 +1771,8 @@ map_segment(
                /* one 4K pager for the last page */
                cur_end = vm_start + (file_end - file_start);
                if (control != MEMORY_OBJECT_CONTROL_NULL) {
+                       /* no copy-on-read for mapped binaries */
+                       vmk_flags.vmkf_no_copy_on_read = 1;
                        ret = vm_map_enter_mem_object_control(
                                map,
                                &cur_start,
@@ -1687,7 +1953,13 @@ load_segment(
                return LOAD_BADMACHO;
        }
 
-       vm_offset = scp->vmaddr + slide;
+       if (os_add_overflow(scp->vmaddr, slide, &vm_offset)) {
+               if (cs_debug) {
+                       printf("vmaddr too large\n");
+               }
+               return LOAD_BADMACHO;
+       }
+
        vm_size = scp->vmsize;
 
        if (vm_size == 0) {
@@ -1973,6 +2245,68 @@ load_uuid(
        return LOAD_SUCCESS;
 }
 
+static
+load_return_t
+load_version(
+       struct version_min_command     *vmc,
+       boolean_t               *found_version_cmd,
+       load_result_t           *result
+       )
+{
+       uint32_t platform = 0;
+       uint32_t sdk;
+
+       if (vmc->cmdsize < sizeof(*vmc)) {
+               return LOAD_BADMACHO;
+       }
+       if (*found_version_cmd == TRUE) {
+               return LOAD_BADMACHO;
+       }
+       *found_version_cmd = TRUE;
+       sdk = vmc->sdk;
+       switch (vmc->cmd) {
+       case LC_VERSION_MIN_MACOSX:
+               platform = PLATFORM_MACOS;
+               break;
+#if __x86_64__ /* __x86_64__ */
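+       /* on x86_64, these embedded LC_VERSION_MIN_* variants identify simulator binaries */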
+       case LC_VERSION_MIN_IPHONEOS:
+               platform = PLATFORM_IOSSIMULATOR;
+               break;
+       case LC_VERSION_MIN_WATCHOS:
+               platform = PLATFORM_WATCHOSSIMULATOR;
+               break;
+       case LC_VERSION_MIN_TVOS:
+               platform = PLATFORM_TVOSSIMULATOR;
+               break;
+#else
+       case LC_VERSION_MIN_IPHONEOS: {
+#if __arm64__
+               extern int legacy_footprint_entitlement_mode;
+               if (vmc->sdk < (12 << 16)) {
+                       /* app built with a pre-iOS12 SDK: apply legacy footprint mitigation */
+                       result->legacy_footprint = TRUE;
+               }
+#endif /* __arm64__ */
+               platform = PLATFORM_IOS;
+               break;
+       }
+       case LC_VERSION_MIN_WATCHOS:
+               platform = PLATFORM_WATCHOS;
+               break;
+       case LC_VERSION_MIN_TVOS:
+               platform = PLATFORM_TVOS;
+               break;
+#endif /* __x86_64__ */
+       /* All LC_VERSION_MIN_* load commands are legacy and we will not be adding any more */
+       default:
+               sdk = (uint32_t)-1;
+               __builtin_unreachable();
+       }
+       result->ip_platform = platform;
+       result->lr_sdk = sdk;
+       return LOAD_SUCCESS;
+}
+
 static
 load_return_t
 load_main(
@@ -2049,6 +2383,52 @@ load_main(
        return LOAD_SUCCESS;
 }
 
+static
+load_return_t
+setup_driver_main(
+       thread_t                thread,
+       int64_t                         slide,
+       load_result_t           *result
+       )
+{
+       mach_vm_offset_t addr;
+       kern_return_t   ret;
+
+       /* Driver binaries have no LC_MAIN, use defaults */
+
+       if (thread == THREAD_NULL) {
+               return LOAD_SUCCESS;
+       }
+
+       result->user_stack_alloc_size = MAXSSIZ;
+
+       /* use default location for stack */
+       ret = thread_userstackdefault(&addr, result->is_64bit_addr);
+       if (ret != KERN_SUCCESS) {
+               return LOAD_FAILURE;
+       }
+
+       /* The stack slides down from the default location */
+       result->user_stack = addr;
+       result->user_stack -= slide;
+
+       if (result->using_lcmain || result->entry_point != MACH_VM_MIN_ADDRESS) {
+               /* Already processed LC_MAIN or LC_UNIXTHREAD */
+               return LOAD_FAILURE;
+       }
+
+       result->needs_dynlinker = TRUE;
+
+       ret = thread_state_initialize( thread );
+       if (ret != KERN_SUCCESS) {
+               return LOAD_FAILURE;
+       }
+
+       result->unixproc = TRUE;
+       result->thread_count++;
+
+       return LOAD_SUCCESS;
+}
 
 static
 load_return_t
@@ -2426,12 +2806,18 @@ load_code_signature(
        struct cs_blob  *blob;
        int             error;
        vm_size_t       blob_size;
+       uint32_t        sum;
 
        addr = 0;
        blob = NULL;
 
-       if (lcp->cmdsize != sizeof(struct linkedit_data_command) ||
-           lcp->dataoff + lcp->datasize > macho_size) {
+       if (lcp->cmdsize != sizeof(struct linkedit_data_command)) {
+               ret = LOAD_BADMACHO;
+               goto out;
+       }
+
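+       /* dataoff + datasize must neither overflow nor extend past the Mach-O image */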
+       sum = 0;
+       if (os_add_overflow(lcp->dataoff, lcp->datasize, &sum) || sum > macho_size) {
                ret = LOAD_BADMACHO;
                goto out;
        }
index 5a0f66ceb00bb4d9151c5723c74f693817990969..606d733fbcae749c3201cf9c35de59bd5b6f40c9 100644 (file)
@@ -86,6 +86,8 @@ typedef struct _load_result {
        off_t                   cs_end_offset;
        void                    *threadstate;
        size_t                  threadstate_sz;
+       uint32_t                ip_platform;
+       uint32_t                lr_sdk;
 } load_result_t;
 
 struct image_params;
@@ -96,6 +98,13 @@ load_return_t load_machfile(
        vm_map_t                *mapp,
        load_result_t           *result);
 
+load_return_t
+validate_potential_simulator_binary(
+       cpu_type_t               exectype,
+       struct image_params      *imgp,
+       off_t                    file_offset,
+       off_t                    macho_size);
+
 #define LOAD_SUCCESS            0
 #define LOAD_BADARCH            1       /* CPU type/subtype not found */
 #define LOAD_BADMACHO           2       /* malformed mach-o file */
index bce1b97847ff7451c3ef64c571d000d91f528bcb..915a8cb45f3e1f6d47f3488ba91ee1f3d561c297 100644 (file)
@@ -107,7 +107,6 @@ int get_task_userstop(task_t);
 #define CLR(t, f)       (t) &= ~(f)
 #define ISSET(t, f)     ((t) & (f))
 
-extern thread_t port_name_to_thread(mach_port_name_t port_name);
 extern thread_t get_firstthread(task_t);
 
 
@@ -451,7 +450,8 @@ resume:
                        error = EINVAL;
                        goto out;
                }
-               th_act = port_name_to_thread(CAST_MACH_PORT_TO_NAME(uap->addr));
+               th_act = port_name_to_thread(CAST_MACH_PORT_TO_NAME(uap->addr),
+                   PORT_TO_THREAD_NONE);
                if (th_act == THREAD_NULL) {
                        error = ESRCH;
                        goto out;
index 411d5ce6183a2e8c8b452b70eff22f8f82260a91..ffdacf957769351a6efd2f3cd3aed1a6d73c9d38 100755 (executable)
@@ -409,6 +409,9 @@ s/\$//g
                current_field++
                funcname = $current_field
                argalias = funcname "_args"
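+               # strip any "sys_" prefix so generated arg-struct names keep their historical form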
+               if (substr(argalias, 1, 4) == "sys_") {
+                       argalias = substr(argalias, 5)
+               }
                current_field++ # bump past function name
 
                if ($current_field != "(")
@@ -681,6 +684,9 @@ s/\$//g
                # output to syscalls.c
                if (add_sysnames_entry == 1) {
                        tempname = funcname
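+                       # likewise drop any "sys_" prefix from the name recorded in syscalls.c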
+                       if (substr(tempname, 1, 4) == "sys_") {
+                               tempname = substr(tempname, 5)
+                       }
                        if (funcname == "nosys" || funcname == "enosys") {
                                if (syscall_num == 0)
                                        tempname = "syscall"
@@ -701,6 +707,9 @@ s/\$//g
                # output to syscalls.h
                if (add_sysheader_entry == 1) {
                        tempname = funcname
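+                       # and from the name emitted into syscalls.h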
+                       if (substr(tempname, 1, 4) == "sys_") {
+                               tempname = substr(tempname, 5)
+                       }
                        if (syscall_num == 0) {
                                tempname = "syscall"
                        }
@@ -762,7 +771,7 @@ s/\$//g
                printf("\n#endif /* !%s */\n", sysproto_h) > sysprotoend
 
                printf("};\n") > sysent
-               printf("unsigned int    nsysent = sizeof(sysent) / sizeof(sysent[0]);\n") > sysent
+               printf("const unsigned int      nsysent = sizeof(sysent) / sizeof(sysent[0]);\n") > sysent
 
                printf("};\n") > syscallnamestempfile
                printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall_num) \
index a2263417d41c1209be3f17d3b4ec25848cfbc34b..03326e009b99b6b5b61600d356c906e130ccfc47 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -146,13 +146,11 @@ static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
 static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
 static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
 static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
-static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *,
-    mcache_bkttype_t **);
+static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *);
 static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
 static void mcache_cache_bkt_enable(mcache_t *);
 static void mcache_bkt_purge(mcache_t *);
-static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *,
-    mcache_bkt_t *, int);
+static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int);
 static void mcache_bkt_ws_update(mcache_t *);
 static void mcache_bkt_ws_zero(mcache_t *);
 static void mcache_bkt_ws_reap(mcache_t *);
@@ -201,12 +199,16 @@ mcache_init(void)
        mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
        if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
                panic("mcache_init: thread_call_allocate failed");
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
            PAGE_SIZE, "mcache");
        if (mcache_zone == NULL) {
                panic("mcache_init: failed to allocate mcache zone\n");
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
        zone_change(mcache_zone, Z_CALLERACCT, FALSE);
 
@@ -346,6 +348,8 @@ mcache_create_common(const char *name, size_t bufsize, size_t align,
 
        if ((align & (align - 1)) != 0) {
                panic("mcache_create: bad alignment %lu", align);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        cp->mc_align = align;
@@ -548,7 +552,7 @@ retry_alloc:
                 * bucket from the bucket layer.  Upon success, refill this
                 * CPU and place any empty bucket into the empty list.
                 */
-               bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
+               bkt = mcache_bkt_alloc(cp, &cp->mc_full);
                if (bkt != NULL) {
                        if (ccp->cc_pfilled != NULL) {
                                mcache_bkt_free(cp, &cp->mc_empty,
@@ -616,6 +620,8 @@ debug_alloc:
                        panic("mcache_alloc_ext: %s cp %p corrupted list "
                            "(got %d actual %d)\n", cp->mc_name,
                            (void *)cp, num - need, n);
+                       /* NOTREACHED */
+                       __builtin_unreachable();
                }
        }
 
@@ -810,7 +816,7 @@ mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
                 * bucket from the bucket layer.  Upon success, empty this
                 * CPU and place any full bucket into the full list.
                 */
-               bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
+               bkt = mcache_bkt_alloc(cp, &cp->mc_empty);
                if (bkt != NULL) {
                        if (ccp->cc_pfilled != NULL) {
                                mcache_bkt_free(cp, &cp->mc_full,
@@ -819,6 +825,7 @@ mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
                        mcache_cpu_refill(ccp, bkt, 0);
                        continue;
                }
+               btp = cp->cache_bkttype;
 
                /*
                 * We need an empty bucket to put our freed objects into
@@ -844,6 +851,14 @@ mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
                                continue;
                        }
 
+                       /*
+                        * Store it in the bucket object since we'll
+                        * need to refer to it during bucket destroy;
+                        * we can't safely refer to cache_bkttype as
+                        * the bucket lock may not be acquired then.
+                        */
+                       bkt->bkt_type = btp;
+
                        /*
                         * We have an empty bucket of the right size;
                         * add it to the bucket layer and try again.
@@ -1082,7 +1097,7 @@ mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
  * Allocate a bucket from the bucket layer.
  */
 static mcache_bkt_t *
-mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
+mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp)
 {
        mcache_bkt_t *bkt;
 
@@ -1104,10 +1119,6 @@ mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
                blp->bl_alloc++;
        }
 
-       if (btp != NULL) {
-               *btp = cp->cache_bkttype;
-       }
-
        MCACHE_UNLOCK(&cp->mc_bkt_lock);
 
        return bkt;
@@ -1157,7 +1168,6 @@ mcache_bkt_purge(mcache_t *cp)
 {
        mcache_cpu_t *ccp;
        mcache_bkt_t *bp, *pbp;
-       mcache_bkttype_t *btp;
        int cpu, objs, pobjs;
 
        for (cpu = 0; cpu < ncpu; cpu++) {
@@ -1165,7 +1175,6 @@ mcache_bkt_purge(mcache_t *cp)
 
                MCACHE_LOCK(&ccp->cc_lock);
 
-               btp = cp->cache_bkttype;
                bp = ccp->cc_filled;
                pbp = ccp->cc_pfilled;
                objs = ccp->cc_objs;
@@ -1179,10 +1188,10 @@ mcache_bkt_purge(mcache_t *cp)
                MCACHE_UNLOCK(&ccp->cc_lock);
 
                if (bp != NULL) {
-                       mcache_bkt_destroy(cp, btp, bp, objs);
+                       mcache_bkt_destroy(cp, bp, objs);
                }
                if (pbp != NULL) {
-                       mcache_bkt_destroy(cp, btp, pbp, pobjs);
+                       mcache_bkt_destroy(cp, pbp, pobjs);
                }
        }
 
@@ -1195,8 +1204,7 @@ mcache_bkt_purge(mcache_t *cp)
  * and also free the bucket itself.
  */
 static void
-mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
-    int nobjs)
+mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs)
 {
        if (nobjs > 0) {
                mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
@@ -1219,6 +1227,8 @@ mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
                                    "list in bkt %p (nobjs %d actual %d)\n",
                                    cp->mc_name, (void *)cp, (void *)bkt,
                                    nobjs, cnt);
+                               /* NOTREACHED */
+                               __builtin_unreachable();
                        }
                }
 
@@ -1226,7 +1236,7 @@ mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
                (*cp->mc_slab_free)(cp->mc_private, top,
                    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
        }
-       mcache_free(btp->bt_cache, bkt);
+       mcache_free(bkt->bkt_type->bt_cache, bkt);
 }
 
 /*
@@ -1269,18 +1279,17 @@ mcache_bkt_ws_reap(mcache_t *cp)
 {
        long reap;
        mcache_bkt_t *bkt;
-       mcache_bkttype_t *btp;
 
        reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
        while (reap-- &&
-           (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL) {
-               mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);
+           (bkt = mcache_bkt_alloc(cp, &cp->mc_full)) != NULL) {
+               mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize);
        }
 
        reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
        while (reap-- &&
-           (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL) {
-               mcache_bkt_destroy(cp, btp, bkt, 0);
+           (bkt = mcache_bkt_alloc(cp, &cp->mc_empty)) != NULL) {
+               mcache_bkt_destroy(cp, bkt, 0);
        }
 }
 
@@ -1487,7 +1496,7 @@ __private_extern__ void
 mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
     struct timeval *base_ts)
 {
-       struct timeval now, base = { 0, 0 };
+       struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 };
        void *stack[MCACHE_STACK_DEPTH + 1];
        struct mca_trn *transaction;
 
@@ -1670,17 +1679,21 @@ mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
                    "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
                    offset, got, expected);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
            "(0x%llx instead of 0x%llx)\n%s\n",
            addr, offset, got, expected, mcache_dump_mca(mca));
        /* NOTREACHED */
+       __builtin_unreachable();
 }
 
+__attribute__((noinline, cold, not_tail_called, noreturn))
 __private_extern__ int
 assfail(const char *a, const char *f, int l)
 {
        panic("assertion failed: %s, file: %s, line: %d", a, f, l);
-       return 0;
+       /* NOTREACHED */
+       __builtin_unreachable();
 }
index 71362c2f27e3f3043155b1bf694a0a0cb5171aad..f0cfb10370ca37d4f21d1d978a64cd91512b41b5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2001-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -561,7 +561,7 @@ static int
 default_route_add(struct in_addr router, boolean_t proxy_arp)
 {
        uint32_t                    flags = 0;
-       struct in_addr              zeroes = { 0 };
+       struct in_addr              zeroes = { .s_addr = 0 };
 
        if (proxy_arp == FALSE) {
                flags |= RTF_GATEWAY;
@@ -572,7 +572,7 @@ default_route_add(struct in_addr router, boolean_t proxy_arp)
 static int
 host_route_delete(struct in_addr host, unsigned int ifscope)
 {
-       struct in_addr              zeroes = { 0 };
+       struct in_addr              zeroes = { .s_addr = 0 };
 
        return route_cmd(RTM_DELETE, host, zeroes, zeroes, RTF_HOST, ifscope);
 }
@@ -599,11 +599,11 @@ find_interface(void)
 }
 
 static const struct sockaddr_in blank_sin = {
-       sizeof(struct sockaddr_in),
-       AF_INET,
-       0,
-       { 0 },
-       { 0, 0, 0, 0, 0, 0, 0, 0 }
+       .sin_len = sizeof(struct sockaddr_in),
+       .sin_family = AF_INET,
+       .sin_port = 0,
+       .sin_addr = { .s_addr = 0 },
+       .sin_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }
 };
 
 static int
@@ -636,12 +636,12 @@ int
 netboot_mountroot(void)
 {
        int                         error = 0;
-       struct in_addr              iaddr = { 0 };
+       struct in_addr              iaddr = { .s_addr = 0 };
        struct ifreq                ifr;
        struct ifnet *              ifp;
-       struct in_addr              netmask = { 0 };
+       struct in_addr              netmask = { .s_addr = 0 };
        proc_t                      procp = current_proc();
-       struct in_addr              router = { 0 };
+       struct in_addr              router = { .s_addr = 0 };
        struct socket *             so = NULL;
        unsigned int                try;
 
@@ -770,11 +770,11 @@ netboot_setup()
                goto done;
        }
        printf("netboot_setup: calling imageboot_mount_image\n");
-       error = imageboot_mount_image(S_netboot_info_p->image_path, -1);
+       error = imageboot_mount_image(S_netboot_info_p->image_path, -1, IMAGEBOOT_DMG);
        if (error != 0) {
                printf("netboot: failed to mount root image, %d\n", error);
        } else if (S_netboot_info_p->second_image_path != NULL) {
-               error = imageboot_mount_image(S_netboot_info_p->second_image_path, 0);
+               error = imageboot_mount_image(S_netboot_info_p->second_image_path, 0, IMAGEBOOT_DMG);
                if (error != 0) {
                        printf("netboot: failed to mount second root image, %d\n", error);
                }
index 06ea2dcfc012adf4074ff07c0d4a957a1846f1e2..de77a23be622a9211f113e50446e1e5ec2a8e8f8 100644 (file)
@@ -121,7 +121,7 @@ common_hook(void)
        return rv;
 }
 
-#if (MAC_POLICY_OPS_VERSION != 55)
+#if (MAC_POLICY_OPS_VERSION != 58)
 # error "struct mac_policy_ops doesn't match definition in mac_policy.h"
 #endif
 /*
@@ -271,8 +271,8 @@ const static struct mac_policy_ops policy_ops = {
        CHECK_SET_HOOK(vnode_check_rename)
        CHECK_SET_HOOK(kext_check_query)
        CHECK_SET_HOOK(proc_notify_exec_complete)
-       .mpo_reserved5 = (mpo_reserved_hook_t *)common_hook,
-       .mpo_reserved6 = (mpo_reserved_hook_t *)common_hook,
+       .mpo_reserved4 = (mpo_reserved_hook_t *)common_hook,
+       CHECK_SET_HOOK(proc_check_syscall_unix)
        CHECK_SET_HOOK(proc_check_expose_task)
        CHECK_SET_HOOK(proc_check_set_host_special_port)
        CHECK_SET_HOOK(proc_check_set_host_exception_port)
@@ -284,9 +284,9 @@ const static struct mac_policy_ops policy_ops = {
        CHECK_SET_HOOK(exc_action_label_update)
 
        CHECK_SET_HOOK(vnode_check_trigger_resolve)
+       CHECK_SET_HOOK(mount_check_mount_late)
        .mpo_reserved1 = (mpo_reserved_hook_t *)common_hook,
        .mpo_reserved2 = (mpo_reserved_hook_t *)common_hook,
-       .mpo_reserved3 = (mpo_reserved_hook_t *)common_hook,
        CHECK_SET_HOOK(skywalk_flow_check_connect)
        CHECK_SET_HOOK(skywalk_flow_check_listen)
 
@@ -322,8 +322,9 @@ const static struct mac_policy_ops policy_ops = {
        CHECK_SET_HOOK(proc_check_setlcid)
        CHECK_SET_HOOK(proc_check_signal)
        CHECK_SET_HOOK(proc_check_wait)
-       CHECK_SET_HOOK(proc_label_destroy)
-       CHECK_SET_HOOK(proc_label_init)
+
+       .mpo_reserved5 = (mpo_reserved_hook_t *)common_hook,
+       .mpo_reserved6 = (mpo_reserved_hook_t *)common_hook,
 
        CHECK_SET_HOOK(socket_check_accept)
        CHECK_SET_HOOK(socket_check_accepted)
@@ -473,6 +474,8 @@ const static struct mac_policy_ops policy_ops = {
 
        CHECK_SET_HOOK(iokit_check_set_properties)
 
+       .mpo_reserved3 = (mpo_reserved_hook_t *)common_hook,
+
        CHECK_SET_HOOK(vnode_check_searchfs)
 
        CHECK_SET_HOOK(priv_check)
index 5aa96d0f11cdfdf8112919d21962f88d45985245..b1119d15381f0860b6e3dea3c44e2a56d45ee342 100644 (file)
@@ -171,28 +171,18 @@ static int psem_cache_search(struct pseminfo **,
     struct psemname *, struct psemcache **);
 static int psem_delete(struct pseminfo * pinfo);
 
-static int psem_read(struct fileproc *fp, struct uio *uio,
-    int flags, vfs_context_t ctx);
-static int psem_write(struct fileproc *fp, struct uio *uio,
-    int flags, vfs_context_t ctx);
-static int psem_ioctl(struct fileproc *fp, u_long com,
-    caddr_t data, vfs_context_t ctx);
-static int psem_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx);
 static int psem_closefile(struct fileglob *fp, vfs_context_t ctx);
 static int psem_unlink_internal(struct pseminfo *pinfo, struct psemcache *pcache);
 
-static int psem_kqfilter(struct fileproc *fp, struct knote *kn,
-    struct kevent_internal_s *kev, vfs_context_t ctx);
-
 static const struct fileops psemops = {
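+       /* only close is implemented for POSIX semaphores; the rest use the generic fo_no_* stubs */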
-       .fo_type = DTYPE_PSXSEM,
-       .fo_read = psem_read,
-       .fo_write = psem_write,
-       .fo_ioctl = psem_ioctl,
-       .fo_select = psem_select,
-       .fo_close = psem_closefile,
-       .fo_kqfilter = psem_kqfilter,
-       .fo_drain = NULL,
+       .fo_type     = DTYPE_PSXSEM,
+       .fo_read     = fo_no_read,
+       .fo_write    = fo_no_write,
+       .fo_ioctl    = fo_no_ioctl,
+       .fo_select   = fo_no_select,
+       .fo_close    = psem_closefile,
+       .fo_drain    = fo_no_drain,
+       .fo_kqfilter = fo_no_kqfilter,
 };
 
 static lck_grp_t       *psx_sem_subsys_lck_grp;
@@ -797,7 +787,7 @@ sem_unlink(__unused proc_t p, struct sem_unlink_args *uap, __unused int32_t *ret
 
        if (error != PSEMCACHE_FOUND) {
                PSEM_SUBSYS_UNLOCK();
-               error = EINVAL;
+               error = ENOENT;
                goto bad;
        }
 
@@ -842,6 +832,8 @@ sem_close(proc_t p, struct sem_close_args *uap, __unused int32_t *retval)
                return EBADF;
        }
        procfdtbl_markclosefd(p, fd);
+       /* release the ref returned from fp_lookup before calling drain */
+       (void) os_ref_release_locked(&fp->f_iocount);
        fileproc_drain(p, fp);
        fdrelse(p, fd);
        error = closef_locked(fp, fp->f_fglob, p);
@@ -1117,43 +1109,6 @@ psem_delete(struct pseminfo * pinfo)
        }
 }
 
-static int
-psem_read(__unused struct fileproc *fp, __unused struct uio *uio,
-    __unused int flags, __unused vfs_context_t ctx)
-{
-       return ENOTSUP;
-}
-
-static int
-psem_write(__unused struct fileproc *fp, __unused struct uio *uio,
-    __unused int flags, __unused vfs_context_t ctx)
-{
-       return ENOTSUP;
-}
-
-static int
-psem_ioctl(__unused struct fileproc *fp, __unused u_long com,
-    __unused caddr_t data, __unused vfs_context_t ctx)
-{
-       return ENOTSUP;
-}
-
-static int
-psem_select(__unused struct fileproc *fp, __unused int which,
-    __unused void *wql, __unused vfs_context_t ctx)
-{
-       return ENOTSUP;
-}
-
-static int
-psem_kqfilter(__unused struct fileproc *fp, struct knote *kn,
-    __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx)
-{
-       kn->kn_flags = EV_ERROR;
-       kn->kn_data = ENOTSUP;
-       return 0;
-}
-
 int
 fill_pseminfo(struct psemnode *pnode, struct psem_info * info)
 {
index 3cd6aebd140b79d13802d528af4f5aa5bb98f466..29c89efb93bba6b668479a07c5950ee047ac27d5 100644 (file)
@@ -187,29 +187,22 @@ static pshm_info_t *pshm_cache_search(pshm_info_t * look);
 static void pshm_cache_add(pshm_info_t *entry);
 static void pshm_cache_delete(pshm_info_t *entry);
 
-static int pshm_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx);
-static int pshm_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx);
-static int pshm_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx);
-static int pshm_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx);
 static int pshm_closefile(struct fileglob *fg, vfs_context_t ctx);
 
-static int pshm_kqfilter(struct fileproc *fp, struct knote *kn,
-    struct kevent_internal_s *kev, vfs_context_t ctx);
-
 static int pshm_access(pshm_info_t *pinfo, int mode, kauth_cred_t cred, proc_t p);
 int pshm_cache_purge_all(proc_t p);
 
 static int pshm_unlink_internal(pshm_info_t *pinfo);
 
 static const struct fileops pshmops = {
-       .fo_type = DTYPE_PSXSHM,
-       .fo_read = pshm_read,
-       .fo_write = pshm_write,
-       .fo_ioctl = pshm_ioctl,
-       .fo_select = pshm_select,
-       .fo_close = pshm_closefile,
-       .fo_kqfilter = pshm_kqfilter,
-       .fo_drain = NULL,
+       .fo_type     = DTYPE_PSXSHM,
+       .fo_read     = fo_no_read,
+       .fo_write    = fo_no_write,
+       .fo_ioctl    = fo_no_ioctl,
+       .fo_select   = fo_no_select,
+       .fo_close    = pshm_closefile,
+       .fo_drain    = fo_no_drain,
+       .fo_kqfilter = fo_no_kqfilter,
 };
 
 /*
@@ -1128,43 +1121,6 @@ pshm_closefile(struct fileglob *fg, __unused vfs_context_t ctx)
        return error;
 }
 
-static int
-pshm_read(__unused struct fileproc *fp, __unused struct uio *uio,
-    __unused int flags, __unused vfs_context_t ctx)
-{
-       return ENOTSUP;
-}
-
-static int
-pshm_write(__unused struct fileproc *fp, __unused struct uio *uio,
-    __unused int flags, __unused vfs_context_t ctx)
-{
-       return ENOTSUP;
-}
-
-static int
-pshm_ioctl(__unused struct fileproc *fp, __unused u_long com,
-    __unused caddr_t data, __unused vfs_context_t ctx)
-{
-       return ENOTSUP;
-}
-
-static int
-pshm_select(__unused struct fileproc *fp, __unused int which, __unused void *wql,
-    __unused vfs_context_t ctx)
-{
-       return ENOTSUP;
-}
-
-static int
-pshm_kqfilter(__unused struct fileproc *fp, struct knote *kn,
-    __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx)
-{
-       kn->kn_flags = EV_ERROR;
-       kn->kn_data = ENOTSUP;
-       return 0;
-}
-
 int
 fill_pshminfo(pshmnode_t * pshm, struct pshm_info * info)
 {
index 8d026e5db020c37e1063fd80441fab4ff0e45654..d4bc5e794c1d04687a2fd6c6836ea70cadbe061e 100644 (file)
@@ -106,6 +106,7 @@ struct atalk;
 
 uint64_t get_dispatchqueue_offset_from_proc(void *);
 uint64_t get_dispatchqueue_serialno_offset_from_proc(void *);
+uint64_t get_dispatchqueue_label_offset_from_proc(void *p);
 uint64_t get_return_to_kernel_offset_from_proc(void *p);
 int proc_info_internal(int callnum, int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t * retval);
 
@@ -174,6 +175,8 @@ int __attribute__ ((noinline)) proc_pidexitreasoninfo(proc_t p, struct proc_exit
 int __attribute__ ((noinline)) proc_pidoriginatorpid_uuid(uuid_t uuid, uint32_t buffersize, pid_t *pid);
 int __attribute__ ((noinline)) proc_pidlistuptrs(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval);
 int __attribute__ ((noinline)) proc_piddynkqueueinfo(pid_t pid, int flavor, kqueue_id_t id, user_addr_t buffer, uint32_t buffersize, int32_t *retval);
+int __attribute__ ((noinline)) proc_pidregionpath(proc_t p, uint64_t arg, user_addr_t buffer, __unused uint32_t buffersize, int32_t *retval);
+int __attribute__ ((noinline)) proc_pidipctableinfo(proc_t p, struct proc_ipctableinfo *table_info);
 
 #if !CONFIG_EMBEDDED
 int __attribute__ ((noinline)) proc_udata_info(pid_t pid, int flavor, user_addr_t buffer, uint32_t buffersize, int32_t *retval);
@@ -192,7 +195,7 @@ int __attribute__ ((noinline)) pid_atalkinfo(struct atalk  * at, struct fileproc
 
 /* protos for misc */
 
-int fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo);
+int fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo, boolean_t check_fsgetpath);
 void  fill_fileinfo(struct fileproc * fp, proc_t proc, int fd, struct proc_fileinfo * finfo);
 int proc_security_policy(proc_t targetp, int callnum, int flavor, boolean_t check_same_user);
 static void munge_vinfo_stat(struct stat64 *sbp, struct vinfo_stat *vsbp);
@@ -227,6 +230,17 @@ get_dispatchqueue_serialno_offset_from_proc(void *p)
        }
 }
 
+uint64_t
+get_dispatchqueue_label_offset_from_proc(void *p)
+{
+       if (p != NULL) {
+               proc_t pself = (proc_t)p;
+               return pself->p_dispatchqueue_label_offset;
+       } else {
+               return (uint64_t)0;
+       }
+}
+
 uint64_t
 get_return_to_kernel_offset_from_proc(void *p)
 {
@@ -968,7 +982,7 @@ proc_pidthreadpathinfo(proc_t p, uint64_t arg, struct proc_threadwithpathinfo *p
        }
 
        if ((vp != NULLVP) && ((vnode_getwithvid(vp, vid)) == 0)) {
-               error = fill_vnodeinfo(vp, &pinfo->pvip.vip_vi);
+               error = fill_vnodeinfo(vp, &pinfo->pvip.vip_vi, FALSE);
                if (error == 0) {
                        count = MAXPATHLEN;
                        vn_getpath(vp, &pinfo->pvip.vip_path[0], &count);
@@ -1057,7 +1071,7 @@ proc_pidregionpathinfo(proc_t p, uint64_t arg, user_addr_t buffer, __unused uint
                vp = (vnode_t)vnodeaddr;
                if ((vnode_getwithvid(vp, vnodeid)) == 0) {
                        /* FILL THE VNODEINFO */
-                       error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi);
+                       error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi, FALSE);
                        count = MAXPATHLEN;
                        vn_getpath(vp, &preginfo.prp_vip.vip_path[0], &count);
                        /* Always make sure it is null terminated */
@@ -1095,7 +1109,7 @@ proc_pidregionpathinfo2(proc_t p, uint64_t arg, user_addr_t buffer, __unused uin
        vp = (vnode_t)vnodeaddr;
        if ((vnode_getwithvid(vp, vnodeid)) == 0) {
                /* FILL THE VNODEINFO */
-               error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi);
+               error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi, FALSE);
                count = MAXPATHLEN;
                vn_getpath(vp, &preginfo.prp_vip.vip_path[0], &count);
                /* Always make sure it is null terminated */
@@ -1112,6 +1126,45 @@ proc_pidregionpathinfo2(proc_t p, uint64_t arg, user_addr_t buffer, __unused uin
        return error;
 }
 
+int
+proc_pidregionpath(proc_t p, uint64_t arg, user_addr_t buffer, __unused uint32_t buffersize, int32_t *retval)
+{
+       struct proc_regionpath path;
+       int ret, error = 0;
+       uintptr_t vnodeaddr = 0;
+       uint32_t vnodeid = 0;
+       vnode_t vp;
+
+       bzero(&path, sizeof(struct proc_regionpath));
+
+       ret = find_region_details(p->task, (vm_map_offset_t) arg,
+           (uintptr_t *)&vnodeaddr, (uint32_t *)&vnodeid,
+           &path.prpo_addr, &path.prpo_regionlength);
+       if (ret == 0) {
+               return EINVAL;
+       }
+       if (!vnodeaddr) {
+               return EINVAL;
+       }
+
+       vp = (vnode_t)vnodeaddr;
+       if ((vnode_getwithvid(vp, vnodeid)) == 0) {
+               int count = MAXPATHLEN;
+               vn_getpath(vp, &path.prpo_path[0], &count);
+               /* Always make sure it is null terminated */
+               path.prpo_path[MAXPATHLEN - 1] = 0;
+               vnode_put(vp);
+       } else {
+               return EINVAL;
+       }
+
+       error = copyout(&path, buffer, sizeof(struct proc_regionpath));
+       if (error == 0) {
+               *retval = sizeof(struct proc_regionpath);
+       }
+       return error;
+}
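
proc_pidregionpath() backs the new PROC_PIDREGIONPATH flavor wired into proc_pidinfo() later in this file. A hedged userspace sketch, assuming the standard libproc wrapper and the struct proc_regionpath fields used above:

    #include <libproc.h>
    #include <stdio.h>

    /* map an address in a target process back to the file it is backed by */
    static void
    print_region_path(pid_t pid, uint64_t addr)
    {
            struct proc_regionpath path;
            int ret = proc_pidinfo(pid, PROC_PIDREGIONPATH, addr,
                &path, sizeof(path));
            if (ret == sizeof(path)) {
                    printf("0x%llx..+0x%llx -> %s\n", path.prpo_addr,
                        path.prpo_regionlength, path.prpo_path);
            }
    }

Unlike PROC_PIDREGIONPATHINFO, this flavor returns only the path and the region bounds, skipping the full vnode_info_path payload.
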
+
 int
 proc_pidregionpathinfo3(proc_t p, uint64_t arg, user_addr_t buffer, __unused uint32_t buffersize, int32_t *retval)
 {
@@ -1155,7 +1208,7 @@ proc_pidregionpathinfo3(proc_t p, uint64_t arg, user_addr_t buffer, __unused uin
 
                        if (vnode_get_va_fsid(&va) == arg) {
                                /* FILL THE VNODEINFO */
-                               error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi);
+                               error = fill_vnodeinfo(vp, &preginfo.prp_vip.vip_vi, FALSE);
                                count = MAXPATHLEN;
                                vn_getpath(vp, &preginfo.prp_vip.vip_path[0], &count);
                                /* Always make sure it is null terminated */
@@ -1209,7 +1262,7 @@ proc_pidvnodepathinfo(proc_t p, __unused uint64_t arg, user_addr_t buffer, __unu
        if (vncdirvp != NULLVP) {
                if ((error = vnode_getwithvid(vncdirvp, vncdirid)) == 0) {
                        /* FILL THE VNODEINFO */
-                       error = fill_vnodeinfo(vncdirvp, &pvninfo.pvi_cdir.vip_vi);
+                       error = fill_vnodeinfo(vncdirvp, &pvninfo.pvi_cdir.vip_vi, TRUE);
                        if (error == 0) {
                                count = MAXPATHLEN;
                                vn_getpath(vncdirvp, &pvninfo.pvi_cdir.vip_path[0], &count);
@@ -1224,7 +1277,7 @@ proc_pidvnodepathinfo(proc_t p, __unused uint64_t arg, user_addr_t buffer, __unu
        if ((error == 0) && (vnrdirvp != NULLVP)) {
                if ((error = vnode_getwithvid(vnrdirvp, vnrdirid)) == 0) {
                        /* FILL THE VNODEINFO */
-                       error = fill_vnodeinfo(vnrdirvp, &pvninfo.pvi_rdir.vip_vi);
+                       error = fill_vnodeinfo(vnrdirvp, &pvninfo.pvi_rdir.vip_vi, TRUE);
                        if (error == 0) {
                                count = MAXPATHLEN;
                                vn_getpath(vnrdirvp, &pvninfo.pvi_rdir.vip_path[0], &count);
@@ -1403,6 +1456,27 @@ proc_pidoriginatoruuid(uuid_t uuid, uint32_t buffersize)
        return proc_pidoriginatorpid_uuid(uuid, buffersize, &originator_pid);
 }
 
+/*
+ * Function to get the task ipc table size.
+ */
+int
+proc_pidipctableinfo(proc_t p, struct proc_ipctableinfo *table_info)
+{
+       task_t task;
+       int error = 0;
+
+       task = p->task;
+
+       bzero(table_info, sizeof(struct proc_ipctableinfo));
+       error = fill_taskipctableinfo(task, &(table_info->table_size), &(table_info->table_free));
+
+       if (error) {
+               error = EINVAL;
+       }
+
+       return error;
+}
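
This feeds the new PROC_PIDIPCTABLEINFO flavor added to proc_pidinfo() below. A sketch of a caller, assuming struct proc_ipctableinfo carries just the table_size/table_free pair filled in here:

    #include <libproc.h>

    struct proc_ipctableinfo info;
    int ret = proc_pidinfo(pid, PROC_PIDIPCTABLEINFO, 0, &info, sizeof(info));
    if (ret == sizeof(info)) {
            /* info.table_size: allocated ipc entries; info.table_free: unused */
    }

Note that fill_taskipctableinfo()'s failure is collapsed to EINVAL, so callers cannot distinguish a dead task from a bad argument.
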
+
 /***************************** proc_pidoriginatorinfo ***************************/
 
 int
@@ -1628,10 +1702,10 @@ proc_can_use_foreground_hw(int pid, user_addr_t u_reason, uint32_t reasonsize, i
        }
 
        task = p->task;
-       task_reference(task);
-       if (coalition_is_leader(task, COALITION_TYPE_JETSAM, &coal) == FALSE) {
+       if (coalition_is_leader(task, task_get_coalition(task, COALITION_TYPE_JETSAM))) {
+               task_reference(task);
+       } else {
                /* current task is not a coalition leader: find the leader */
-               task_deallocate(task);
                task = coalition_get_leader(coal);
        }
 
@@ -1892,6 +1966,16 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t  bu
                        size = 0;
                }
                break;
+       case PROC_PIDPLATFORMINFO:
+               size = PROC_PIDPLATFORMINFO_SIZE;
+               findzomb = 1;
+               break;
+       case PROC_PIDREGIONPATH:
+               size = PROC_PIDREGIONPATH_SIZE;
+               break;
+       case PROC_PIDIPCTABLEINFO:
+               size = PROC_PIDIPCTABLEINFO_SIZE;
+               break;
        default:
                return EINVAL;
        }
@@ -1931,6 +2015,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t  bu
        case PROC_PIDUNIQIDENTIFIERINFO:
        case PROC_PIDPATHINFO:
        case PROC_PIDCOALITIONINFO:
+       case PROC_PIDPLATFORMINFO:
                check_same_user = NO_CHECK_SAME_USER;
                break;
        default:
@@ -2232,6 +2317,31 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t  bu
                kfree(vmrtfbuf, kbufsz);
        }
        break;
+       case PROC_PIDPLATFORMINFO: {
+               proc_lock(p);
+               uint32_t platform = p->p_platform;
+               proc_unlock(p);
+               error = copyout(&platform, buffer, sizeof(uint32_t));
+               if (error == 0) {
+                       *retval = sizeof(uint32_t);
+               }
+       } break;
+       case PROC_PIDREGIONPATH: {
+               error = proc_pidregionpath(p, arg, buffer, buffersize, retval);
+       }
+       break;
+       case PROC_PIDIPCTABLEINFO: {
+               struct proc_ipctableinfo table_info;
+
+               error = proc_pidipctableinfo(p, &table_info);
+               if (error == 0) {
+                       error = copyout(&table_info, buffer, sizeof(struct proc_ipctableinfo));
+                       if (error == 0) {
+                               *retval = sizeof(struct proc_ipctableinfo);
+                       }
+               }
+       }
+       break;
        default:
                error = ENOTSUP;
                break;
@@ -2258,7 +2368,7 @@ pid_vnodeinfo(vnode_t vp, uint32_t vid, struct fileproc * fp, proc_t proc, int f
        }
        bzero(&vfi, sizeof(struct vnode_fdinfo));
        fill_fileinfo(fp, proc, fd, &vfi.pfi);
-       error = fill_vnodeinfo(vp, &vfi.pvi);
+       error = fill_vnodeinfo(vp, &vfi.pvi, FALSE);
        vnode_put(vp);
        if (error == 0) {
                error = copyout((caddr_t)&vfi, buffer, sizeof(struct vnode_fdinfo));
@@ -2280,7 +2390,7 @@ pid_vnodeinfopath(vnode_t vp, uint32_t vid, struct fileproc * fp, proc_t proc, i
        }
        bzero(&vfip, sizeof(struct vnode_fdinfowithpath));
        fill_fileinfo(fp, proc, fd, &vfip.pfi);
-       error = fill_vnodeinfo(vp, &vfip.pvip.vip_vi);
+       error = fill_vnodeinfo(vp, &vfip.pvip.vip_vi, TRUE);
        if (error == 0) {
                count = MAXPATHLEN;
                vn_getpath(vp, &vfip.pvip.vip_path[0], &count);
@@ -2335,7 +2445,7 @@ fill_fileinfo(struct fileproc * fp, proc_t proc, int fd, struct proc_fileinfo *
 
 
 int
-fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo)
+fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo, __unused boolean_t check_fsgetpath)
 {
        vfs_context_t context;
        struct stat64 sb;
@@ -2343,11 +2453,17 @@ fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo)
 
        bzero(&sb, sizeof(struct stat64));
        context = vfs_context_create((vfs_context_t)0);
-       error = vn_stat(vp, &sb, NULL, 1, context);
+#if CONFIG_MACF
+       /* Called when vnode info is used by the caller to get vnode's path */
+       if (check_fsgetpath) {
+               error = mac_vnode_check_fsgetpath(context, vp);
+       }
+#endif
+       if (!error) {
+               error = vn_stat(vp, &sb, NULL, 1, 0, context);
+               munge_vinfo_stat(&sb, &vinfo->vi_stat);
+       }
        (void)vfs_context_rele(context);
-
-       munge_vinfo_stat(&sb, &vinfo->vi_stat);
-
        if (error != 0) {
                goto out;
        }
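
fill_vnodeinfo() now takes a check_fsgetpath flag: call sites that also return the vnode's path to userspace (the cdir/rdir and fdinfowithpath cases) pass TRUE so MACF can veto path disclosure, while stat-only flavors pass FALSE. The gate, condensed:

    int error = 0;
    #if CONFIG_MACF
    if (check_fsgetpath) {
            error = mac_vnode_check_fsgetpath(ctx, vp);   /* policy may deny */
    }
    #endif
    if (error == 0) {
            /* proceed to vn_stat() and munge_vinfo_stat() */
    }

Two incidental changes ride along: vn_stat() grew an extra argument (the 0 before the context; its meaning is not visible in this hunk), and munge_vinfo_stat() is now skipped entirely when the MAC check denies access, instead of munging a zeroed stat buffer.
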
@@ -2598,36 +2714,36 @@ proc_pidfdinfo(int pid, int flavor, int fd, user_addr_t buffer, uint32_t buffers
        break;
 
        case PROC_PIDFDKQUEUEINFO: {
-               struct kqueue * kq;
+               kqueue_t kqu;
 
                if (fd == -1) {
-                       if ((kq = p->p_fd->fd_wqkqueue) == NULL) {
+                       if ((kqu.kqwq = p->p_fd->fd_wqkqueue) == NULL) {
                                /* wqkqueue is initialized on-demand */
                                error = 0;
                                break;
                        }
-               } else if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0) {
+               } else if ((error = fp_getfkq(p, fd, &fp, &kqu.kq)) != 0) {
                        goto out1;
                }
 
                /* no need to be under the fdlock */
-               error = pid_kqueueinfo(kq, fp, p, fd, buffer, buffersize, retval);
+               error = pid_kqueueinfo(kqu.kq, fp, p, fd, buffer, buffersize, retval);
        }
        break;
 
        case PROC_PIDFDKQUEUE_EXTINFO: {
-               struct kqueue * kq;
+               kqueue_t kqu;
 
                if (fd == -1) {
-                       if ((kq = p->p_fd->fd_wqkqueue) == NULL) {
+                       if ((kqu.kqwq = p->p_fd->fd_wqkqueue) == NULL) {
                                /* wqkqueue is initialized on-demand */
                                error = 0;
                                break;
                        }
-               } else if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0) {
+               } else if ((error = fp_getfkq(p, fd, &fp, &kqu.kq)) != 0) {
                        goto out1;
                }
-               error = pid_kqueue_extinfo(p, kq, buffer, buffersize, retval);
+               error = pid_kqueue_extinfo(p, kqu.kq, buffer, buffersize, retval);
        }
        break;
 
@@ -3041,7 +3157,7 @@ proc_dirtycontrol(int pid, int flavor, uint64_t arg, int32_t *retval)
        case PROC_DIRTYCONTROL_GET: {
                /* No permissions check - dirty state is freely available */
                if (retval) {
-                       *retval = memorystatus_dirty_get(target_p);
+                       *retval = memorystatus_dirty_get(target_p, FALSE);
                } else {
                        error = EINVAL;
                }
index 3ef4acd7b8c3e22f0373435f5f409f6da276517b..d30037c200f8c8c08c713efc2df95f968e722f9a 100644 (file)
@@ -179,8 +179,8 @@ kern_stack_snapshot_with_reason(__unused char *reason)
 
        config.sc_pid = -1;
        config.sc_flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS | STACKSHOT_SAVE_IN_KERNEL_BUFFER |
-           STACKSHOT_KCDATA_FORMAT | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_THREAD_WAITINFO |
-           STACKSHOT_NO_IO_STATS);
+           STACKSHOT_KCDATA_FORMAT | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_THREAD_WAITINFO |
+           STACKSHOT_NO_IO_STATS | STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT);
        config.sc_delta_timestamp = 0;
        config.sc_out_buffer_addr = 0;
        config.sc_out_size_addr = 0;
index 41c57380fe210c4d2983e57a556b3202bae24e54..0fd805173b472ab2b01d4d48177c132ecc968eb0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -72,7 +72,7 @@ SYSCTL_NODE(_kern, OID_AUTO, eventhandler, CTLFLAG_RW | CTLFLAG_LOCKED,
 SYSCTL_INT(_kern_eventhandler, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
     &evh_debug, 0, "Eventhandler debug mode");
 
-struct eventhandler_entry_arg eventhandler_entry_dummy_arg = { { 0 }, { 0 } };
+struct eventhandler_entry_arg eventhandler_entry_dummy_arg = { .ee_fm_uuid = { 0 }, .ee_fr_uuid = { 0 } };
 
 /* List of 'slow' lists */
 static struct eventhandler_lists_ctxt evthdlr_lists_ctxt_glb;
@@ -177,6 +177,11 @@ eventhandler_register_internal(
                if (list == NULL) {
                        lck_mtx_convert_spin(&evthdlr_lists_ctxt->eventhandler_mutex);
                        new_list = mcache_alloc(el_cache, MCR_SLEEP);
+                       if (new_list == NULL) {
+                               evhlog((LOG_DEBUG, "%s: Can't allocate list \"%s\"", __func__, name));
+                               lck_mtx_unlock(&evthdlr_lists_ctxt->eventhandler_mutex);
+                               return NULL;
+                       }
                        bzero(new_list, el_size);
                        evhlog((LOG_DEBUG, "%s: creating list \"%s\"", __func__, name));
                        list = new_list;
@@ -224,6 +229,11 @@ eventhandler_register(struct eventhandler_lists_ctxt *evthdlr_lists_ctxt,
 
        /* allocate an entry for this handler, populate it */
        eg = mcache_alloc(eg_cache, MCR_SLEEP);
+       if (eg == NULL) {
+               evhlog((LOG_DEBUG, "%s: Can't allocate entry to register for event list "
+                   "\"%s\"", __func__, name));
+               return NULL;
+       }
        bzero(eg, eg_size);
        eg->func = func;
        eg->ee.ee_arg = arg;
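
Both eventhandler allocation sites gain NULL checks. mcache_alloc() with MCR_SLEEP is generally expected to block until memory is available, so these checks are defensive; note that the list-allocation site must also drop the eventhandler mutex on its error path. The pattern:

    eg = mcache_alloc(eg_cache, MCR_SLEEP);
    if (eg == NULL) {
            /* log, unwind any locks taken above, and bail */
            return NULL;
    }
    bzero(eg, eg_size);     /* mcache buffers are not zero-filled */

Callers of eventhandler_register() must now tolerate a NULL return where previously the call could not fail.
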
index 46475b683dd1e7aa179b0bab23f063867880e67c..b5a78b6b9489435ff0e90c0b044aa9bebca1a680 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -65,6 +65,7 @@
  * Error log buffer for kernel printf's.
  */
 
+#include <machine/atomic.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc_internal.h>
@@ -120,7 +121,7 @@ extern uint32_t oslog_s_error_count;
 /* All globals should be accessed under LOG_LOCK() */
 
 static char amsg_bufc[1024];
-static struct msgbuf aslbuf = {MSG_MAGIC, sizeof(amsg_bufc), 0, 0, amsg_bufc};
+static struct msgbuf aslbuf = {.msg_magic = MSG_MAGIC, .msg_size = sizeof(amsg_bufc), .msg_bufx = 0, .msg_bufr = 0, .msg_bufc = amsg_bufc};
 struct msgbuf *aslbufp __attribute__((used)) = &aslbuf;
 
 /* logsoftc only valid while log_open=1 */
@@ -144,8 +145,8 @@ struct firehose_chunk_s oslog_boot_buf = {
        },
 }; /* static buffer */
 firehose_chunk_t firehose_boot_chunk = &oslog_boot_buf;
-struct msgbuf msgbuf = {MSG_MAGIC, sizeof(smsg_bufc), 0, 0, smsg_bufc};
-struct msgbuf oslog_stream_buf = {MSG_MAGIC, 0, 0, 0, NULL};
+struct msgbuf msgbuf = {.msg_magic  = MSG_MAGIC, .msg_size = sizeof(smsg_bufc), .msg_bufx = 0, .msg_bufr = 0, .msg_bufc = smsg_bufc};
+struct msgbuf oslog_stream_buf = {.msg_magic = MSG_MAGIC, .msg_size = 0, .msg_bufx = 0, .msg_bufr = 0, .msg_bufc = NULL};
 struct msgbuf *msgbufp __attribute__((used)) = &msgbuf;
 struct msgbuf *oslog_streambufp __attribute__((used)) = &oslog_stream_buf;
 
@@ -195,7 +196,7 @@ void bsd_log_init(void);
  * Ideally this file would define this lock, but bsd doesn't have the definition
  * for lock groups.
  */
-decl_lck_spin_data(extern, oslog_stream_lock)
+decl_lck_spin_data(extern, oslog_stream_lock);
 #define stream_lock() lck_spin_lock(&oslog_stream_lock)
 #define stream_unlock() lck_spin_unlock(&oslog_stream_lock)
 
@@ -609,7 +610,7 @@ oslog_streamread(__unused dev_t dev, struct uio *uio, int flag)
        if (copy_size != 0) {
                error = uiomove((caddr_t)logline, copy_size, uio);
        }
-       (void)hw_atomic_add(&oslog_s_streamed_msgcount, 1);
+       os_atomic_inc(&oslog_s_streamed_msgcount, relaxed);
 
        return error;
 }
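
The hw_atomic_add(&x, 1) calls become os_atomic_inc(&x, relaxed), part of the <machine/atomic.h> family that takes an explicit memory-ordering token. A sketch of the idiom:

    #include <machine/atomic.h>

    static uint32_t counter;

    /* relaxed: atomicity only, no ordering against surrounding accesses */
    os_atomic_inc(&counter, relaxed);

Both counters touched here are diagnostics, so relaxed ordering suffices; the legacy hw_atomic_* interface offered no way to state that.
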
@@ -1057,7 +1058,7 @@ oslog_streamwrite_locked(firehose_tracepoint_id_u ftid,
 
        mbp = oslog_streambufp;
        if (ft_length > mbp->msg_size) {
-               (void)hw_atomic_add(&oslog_s_error_count, 1);
+               os_atomic_inc(&oslog_s_error_count, relaxed);
                return;
        }
 
index d090a242923d1739a30c173187cd0e1b5219b1a4..a7f73781a9afadf3f0332f4603e6af5dac56edc2 100644 (file)
@@ -394,13 +394,16 @@ putchar(int c, void *arg)
 }
 
 int
-vprintf_log_locked(const char *fmt, va_list ap)
+vprintf_log_locked(const char *fmt, va_list ap, bool addcr)
 {
        struct putchar_args pca;
 
        pca.flags = TOLOGLOCKED;
        pca.tty   = NULL;
        __doprnt(fmt, ap, putchar, &pca, 10, TRUE);
+       if (addcr) {
+               putchar('\n', &pca);
+       }
        return 0;
 }
 
diff --git a/bsd/kern/subr_prof.c b/bsd/kern/subr_prof.c
deleted file mode 100644 (file)
index a638d87..0000000
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
-/*-
- * Copyright (c) 1982, 1986, 1993
- *     The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *     This product includes software developed by the University of
- *     California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *     @(#)subr_prof.c 8.3 (Berkeley) 9/23/93
- */
-
-#ifdef GPROF
-#include <libkern/kernel_mach_header.h>
-#endif
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/proc_internal.h>
-#include <sys/user.h>
-#include <machine/machine_routines.h>
-
-#include <sys/mount_internal.h>
-#include <sys/sysproto.h>
-
-#include <mach/mach_types.h>
-#include <kern/kern_types.h>
-#include <kern/cpu_number.h>
-#include <kern/kalloc.h>
-
-#ifdef GPROF
-#include <sys/malloc.h>
-#include <sys/gmon.h>
-
-extern int sysctl_doprof(int *, u_int, user_addr_t, size_t *,
-    user_addr_t, size_t newlen);
-extern int sysctl_struct(user_addr_t, size_t *,
-    user_addr_t, size_t, void *, int);
-
-lck_spin_t * mcount_lock;
-lck_grp_t * mcount_lock_grp;
-lck_attr_t * mcount_lock_attr;
-
-/*
- * Froms is actually a bunch of unsigned shorts indexing tos
- */
-struct gmonparam _gmonparam = { .state = GMON_PROF_OFF };
-
-/*
- * This code uses 32 bit mach object segment information from the currently
- * running kernel.
- */
-void
-kmstartup(void)
-{
-       tostruct_t *cp;
-       kernel_segment_command_t        *sgp;   /* 32 bit mach object file segment */
-       struct gmonparam *p = &_gmonparam;
-
-       sgp = getsegbyname("__TEXT");
-       p->lowpc = (u_int32_t)sgp->vmaddr;
-       p->highpc = (u_int32_t)(sgp->vmaddr + sgp->vmsize);
-
-       /*
-        * Round lowpc and highpc to multiples of the density we're using
-        * so the rest of the scaling (here and in gprof) stays in ints.
-        */
-       p->lowpc = ROUNDDOWN(p->lowpc, HISTFRACTION * sizeof(HISTCOUNTER));
-       p->highpc = ROUNDUP(p->highpc, HISTFRACTION * sizeof(HISTCOUNTER));
-       p->textsize = p->highpc - p->lowpc;
-       printf("Profiling kernel, textsize=%lu [0x%016lx..0x%016lx]\n",
-           p->textsize, p->lowpc, p->highpc);
-       p->kcountsize = p->textsize / HISTFRACTION;
-       p->hashfraction = HASHFRACTION;
-       p->fromssize = p->textsize / HASHFRACTION;
-       p->tolimit = p->textsize * ARCDENSITY / 100;
-       if (p->tolimit < MINARCS) {
-               p->tolimit = MINARCS;
-       } else if (p->tolimit > MAXARCS) {
-               p->tolimit = MAXARCS;
-       }
-       p->tossize = p->tolimit * sizeof(tostruct_t);
-       /* Why not use MALLOC with M_GPROF ? */
-       cp = (tostruct_t *)kalloc(p->kcountsize + p->fromssize + p->tossize);
-       if (cp == 0) {
-               printf("No memory for profiling.\n");
-               return;
-       }
-       bzero(cp, p->kcountsize + p->tossize + p->fromssize);
-       p->tos = cp;
-       cp = (tostruct_t *)((vm_offset_t)cp + p->tossize);
-       p->kcount = (u_short *)cp;
-       cp = (tostruct_t *)((vm_offset_t)cp + p->kcountsize);
-       p->froms = (u_short *)cp;
-
-       mcount_lock_grp = lck_grp_alloc_init("MCOUNT", LCK_GRP_ATTR_NULL);
-       mcount_lock_attr = lck_attr_alloc_init();
-       mcount_lock = lck_spin_alloc_init(mcount_lock_grp, mcount_lock_attr);
-}
-
-/*
- * XXX         These should be broken out into per-argument OID values,
- * XXX         since there are no sub-OID parameter values, but unfortunately
- * XXX         there is barely enough time for an initial conversion.
- *
- * Note:       These items appear to be read/write.
- */
-STATIC int
-sysctl_doprofhandle SYSCTL_HANDLER_ARGS
-{
-       sysctl_doprof(int *name, u_int namelen, user_addr_t oldp, size_t * oldlenp,
-    user_addr_t newp, size_t newlen)
-       {
-               __unused int cmd = oidp->oid_arg2; /* subcommand*/
-               int *name = arg1;       /* oid element argument vector */
-               int namelen = arg2;     /* number of oid element arguments */
-               user_addr_t oldp = req->oldptr; /* user buffer copy out address */
-               size_t *oldlenp = req->oldlen; /* user buffer copy out size */
-               user_addr_t newp = req->newptr; /* user buffer copy in address */
-               size_t newlen = req->newlen; /* user buffer copy in size */
-
-               struct gmonparam *gp = &_gmonparam;
-               int error = 0;
-
-               /* all sysctl names at this level are terminal */
-               if (namelen != 1) {
-                       return ENOTDIR;       /* overloaded */
-               }
-               switch (name[0]) {
-               case GPROF_STATE:
-                       error = sysctl_int(oldp, oldlenp, newp, newlen, &gp->state);
-                       if (error) {
-                               break;
-                       }
-                       if (gp->state == GMON_PROF_OFF) {
-                               stopprofclock(kernproc);
-                       } else {
-                               startprofclock(kernproc);
-                       }
-                       break;
-               case GPROF_COUNT:
-                       error = sysctl_struct(oldp, oldlenp, newp, newlen,
-                           gp->kcount, gp->kcountsize);
-                       break;
-               case GPROF_FROMS:
-                       error = sysctl_struct(oldp, oldlenp, newp, newlen,
-                           gp->froms, gp->fromssize);
-                       break;
-               case GPROF_TOS:
-                       error = sysctl_struct(oldp, oldlenp, newp, newlen,
-                           gp->tos, gp->tossize);
-                       break;
-               case GPROF_GMONPARAM:
-                       error = sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp);
-                       break;
-               default:
-                       error = ENOTSUP;
-                       break;
-               }
-
-               /* adjust index so we return the right required/consumed amount */
-               if (!error) {
-                       req->oldidx += req->oldlen;
-               }
-
-               return error;
-       }
-       SYSCTL_PROC(_kern, KERN_PROF, prof, STLFLAG_NODE | CTLFLAG_RW | CTLFLAG_LOCKED,
-           0,                  /* Pointer argument (arg1) */
-           0,                  /* Integer argument (arg2) */
-           sysctl_doprofhandle, /* Handler function */
-           NULL,               /* No explicit data */
-           "");
-
-
-/*
- * mcount() called with interrupts disabled.
- */
-       void
-       mcount(
-               uintptr_t frompc,
-               uintptr_t selfpc
-               )
-       {
-               unsigned short *frompcindex;
-               tostruct_t *top, *prevtop;
-               struct gmonparam *p = &_gmonparam;
-               long toindex;
-
-               /*
-                * check that we are profiling
-                * and that we aren't recursively invoked.
-                */
-               if (p->state != GMON_PROF_ON) {
-                       return;
-               }
-
-               lck_spin_lock(mcount_lock);
-
-               /*
-                *      check that frompcindex is a reasonable pc value.
-                *      for example:    signal catchers get called from the stack,
-                *                      not from text space.  too bad.
-                */
-               frompc -= p->lowpc;
-               if (frompc > p->textsize) {
-                       goto done;
-               }
-
-               frompcindex = &p->froms[frompc / (p->hashfraction * sizeof(*p->froms))];
-               toindex = *frompcindex;
-               if (toindex == 0) {
-                       /*
-                        *      first time traversing this arc
-                        */
-                       toindex = ++p->tos[0].link;
-                       if (toindex >= p->tolimit) {
-                               /* halt further profiling */
-                               goto overflow;
-                       }
-                       *frompcindex = toindex;
-                       top = &p->tos[toindex];
-                       top->selfpc = selfpc;
-                       top->count = 1;
-                       top->link = 0;
-                       goto done;
-               }
-               top = &p->tos[toindex];
-               if (top->selfpc == selfpc) {
-                       /*
-                        *      arc at front of chain; usual case.
-                        */
-                       top->count++;
-                       goto done;
-               }
-               /*
-                *      have to go looking down chain for it.
-                *      top points to what we are looking at,
-                *      prevtop points to previous top.
-                *      we know it is not at the head of the chain.
-                */
-               for (; /* goto done */;) {
-                       if (top->link == 0) {
-                               /*
-                                *      top is end of the chain and none of the chain
-                                *      had top->selfpc == selfpc.
-                                *      so we allocate a new tostruct
-                                *      and link it to the head of the chain.
-                                */
-                               toindex = ++p->tos[0].link;
-                               if (toindex >= p->tolimit) {
-                                       goto overflow;
-                               }
-                               top = &p->tos[toindex];
-                               top->selfpc = selfpc;
-                               top->count = 1;
-                               top->link = *frompcindex;
-                               *frompcindex = toindex;
-                               goto done;
-                       }
-                       /*
-                        *      otherwise, check the next arc on the chain.
-                        */
-                       prevtop = top;
-                       top = &p->tos[top->link];
-                       if (top->selfpc == selfpc) {
-                               /*
-                                *      there it is.
-                                *      increment its count
-                                *      move it to the head of the chain.
-                                */
-                               top->count++;
-                               toindex = prevtop->link;
-                               prevtop->link = top->link;
-                               top->link = *frompcindex;
-                               *frompcindex = toindex;
-                               goto done;
-                       }
-               }
-done:
-               lck_spin_unlock(mcount_lock);
-               return;
-
-overflow:
-               p->state = GMON_PROF_ERROR;
-               lck_spin_unlock(mcount_lock);
-               printf("mcount: tos overflow\n");
-               return;
-       }
-
-#endif /* GPROF */
-
-#define PROFILE_LOCK(x)
-#define PROFILE_UNLOCK(x)
-
-
-/*
- * Scale is a fixed-point number with the binary point 16 bits
- * into the value, and is <= 1.0.  pc is at most 32 bits, so the
- * intermediate result is at most 48 bits.
- */
-//K64todo - this doesn't fit into 64 bit any more, it needs 64+16
-#define PC_TO_INDEX(pc, prof) \
-       ((user_addr_t)(((u_quad_t)((pc) - (prof)->pr_off) * \
-                       (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
-
-/*
- * Collect user-level profiling statistics; called on a profiling tick,
- * when a process is running in user-mode. We use
- * an AST that will vector us to trap() with a context in which copyin
- * and copyout will work.  Trap will then call addupc_task().
- *
- * Note that we may (rarely) not get around to the AST soon enough, and
- * lose profile ticks when the next tick overwrites this one, but in this
- * case the system is overloaded and the profile is probably already
- * inaccurate.
- *
- * We can afford to take faults here.  If the
- * update fails, we simply turn off profiling.
- */
-void
-addupc_task(struct proc *p, user_addr_t pc, u_int ticks)
-{
-       user_addr_t off;
-       u_short count;
-
-       /* Testing P_PROFIL may be unnecessary, but is certainly safe. */
-       if ((p->p_flag & P_PROFIL) == 0 || ticks == 0) {
-               return;
-       }
-
-       if (proc_is64bit(p)) {
-               struct user_uprof *prof;
-               user_addr_t cell;
-
-               for (prof = &p->p_stats->user_p_prof; prof; prof = prof->pr_next) {
-                       off = PC_TO_INDEX(pc, prof);
-                       cell = (prof->pr_base + off);
-                       if (cell >= prof->pr_base &&
-                           cell < (prof->pr_size + prof->pr_base)) {
-                               if (copyin(cell, (caddr_t) &count, sizeof(count)) == 0) {
-                                       count += ticks;
-                                       if (copyout((caddr_t) &count, cell, sizeof(count)) == 0) {
-                                               return;
-                                       }
-                               }
-                               p->p_stats->user_p_prof.pr_scale = 0;
-                               stopprofclock(p);
-                               break;
-                       }
-               }
-       } else {
-               struct uprof *prof;
-               short *cell;
-
-               for (prof = &p->p_stats->p_prof; prof; prof = prof->pr_next) {
-                       off = PC_TO_INDEX(pc, prof);
-                       cell = (short *)(prof->pr_base + off);
-                       if (cell >= (short *)prof->pr_base &&
-                           cell < (short*)(prof->pr_size + prof->pr_base)) {
-                               if (copyin(CAST_USER_ADDR_T(cell), (caddr_t) &count, sizeof(count)) == 0) {
-                                       count += ticks;
-                                       if (copyout((caddr_t) &count, CAST_USER_ADDR_T(cell), sizeof(count)) == 0) {
-                                               return;
-                                       }
-                               }
-                               p->p_stats->p_prof.pr_scale = 0;
-                               stopprofclock(p);
-                               break;
-                       }
-               }
-       }
-}
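
Deleting subr_prof.c removes the GPROF kernel-profiling code (long unbuildable as written inside #ifdef GPROF) along with addupc_task(), the user-level profiling tick accounting. For reference, the PC_TO_INDEX macro is 16.16 fixed-point arithmetic: with pr_off = 0x1000 and pr_scale = 0x8000 (that is, 0.5), a tick at pc = 0x1010 yields ((0x10 * 0x8000) >> 16) & ~1 = 8, and the & ~1 keeps the offset aligned for the u_short counters in the profile buffer.
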
index 879963f19d7e5630f8a2d3968a8ed86ad6545ea9..c3f69f22f410bbefa6a4d9a119a5148e03af4e24 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <sys/signalvar.h>              /* for psignal() */
 #include <kern/debug.h>
 
-#ifdef GPROF
-#include <sys/gmon.h>
-#endif
-
 #if DEVELOPMENT || DEBUG
 bool send_sigsys = true;
 #else
@@ -192,17 +188,6 @@ nosys(__unused struct proc *p, __unused struct nosys_args *args, __unused int32_
        return ENOSYS;
 }
 
-#ifdef  GPROF
-/*
- * Stub routine in case it is ever possible to free space.
- */
-void
-cfreemem(caddr_t cp, int size)
-{
-       printf("freeing %p, size %d\n", cp, size);
-}
-#endif
-
 #if !CRYPTO
 #include <crypto/rc4/rc4.h>
 
index bfbd9c9ca60744ad2460af5d843a430f6593bc67..5b1d7d7ac426c5a0c9d2879e0faede5175951045 100644 (file)
@@ -7,6 +7,7 @@
 
 #include <sys/coalition.h>
 #include <sys/errno.h>
+#include <sys/kauth.h>
 #include <sys/kernel.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
@@ -260,6 +261,27 @@ coalition_info_efficiency(coalition_t coal, user_addr_t buffer, user_size_t bufs
        return error;
 }
 
+static int
+coalition_ledger_logical_writes_limit(coalition_t coal, user_addr_t buffer, user_size_t bufsize)
+{
+       int error = 0;
+       int64_t limit = 0;
+
+       if (coalition_type(coal) != COALITION_TYPE_RESOURCE) {
+               error = EINVAL;
+               goto out;
+       }
+       error = copyin(buffer, &limit, MIN(bufsize, sizeof(limit)));
+       if (error) {
+               goto out;
+       }
+
+
+       error = coalition_ledger_set_logical_writes_limit(coal, limit);
+out:
+       return error;
+}
+
 int
 coalition_info(proc_t p, struct coalition_info_args *uap, __unused int32_t *retval)
 {
@@ -315,6 +337,60 @@ bad:
        return error;
 }
 
+int
+coalition_ledger(__unused proc_t p, __unused struct coalition_ledger_args *uap, __unused int32_t *retval)
+{
+       user_addr_t cidp = uap->cid;
+       user_addr_t buffer = uap->buffer;
+       user_addr_t bufsizep = uap->bufsize;
+       user_size_t bufsize;
+       uint32_t operation = uap->operation;
+       int error;
+       uint64_t cid;
+       coalition_t coal = COALITION_NULL;
+
+       if (!kauth_cred_issuser(kauth_cred_get())) {
+               error = EPERM;
+               goto out;
+       }
+
+       error = copyin(cidp, &cid, sizeof(cid));
+       if (error) {
+               goto out;
+       }
+
+       coal = coalition_find_by_id(cid);
+       if (coal == COALITION_NULL) {
+               error = ESRCH;
+               goto out;
+       }
+
+       if (IS_64BIT_PROCESS(p)) {
+               user64_size_t size64;
+               error = copyin(bufsizep, &size64, sizeof(size64));
+               bufsize = (user_size_t)size64;
+       } else {
+               user32_size_t size32;
+               error = copyin(bufsizep, &size32, sizeof(size32));
+               bufsize = (user_size_t)size32;
+       }
+       if (error) {
+               goto out;
+       }
+
+       switch (operation) {
+       case COALITION_LEDGER_SET_LOGICAL_WRITES_LIMIT:
+               error = coalition_ledger_logical_writes_limit(coal, buffer, bufsize);
+               break;
+       default:
+               error = EINVAL;
+       }
+out:
+       if (coal != COALITION_NULL) {
+               coalition_release(coal);
+       }
+       return error;
+}
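
coalition_ledger() is a new, root-only syscall; its only operation so far installs a logical-writes limit on a resource coalition's ledger. A hypothetical caller, assuming a raw wrapper whose argument order mirrors the uap fields above (the actual libsyscall stub is not part of this diff):

    /* hypothetical: int coalition_ledger(uint64_t *cid, void *buffer,
     *                                    size_t *bufsize, uint32_t operation); */
    uint64_t cid = some_coalition_id;          /* e.g. from coalition_info */
    int64_t  limit = 512LL * 1024 * 1024;      /* logical-write bytes */
    size_t   bufsize = sizeof(limit);

    int err = coalition_ledger(&cid, &limit, &bufsize,
        COALITION_LEDGER_SET_LOGICAL_WRITES_LIMIT);

Non-resource coalitions are rejected with EINVAL, and the limit copyin is clamped to MIN(bufsize, sizeof(limit)), so a short buffer quietly truncates.
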
 #if DEVELOPMENT || DEBUG
 static int sysctl_coalition_get_ids SYSCTL_HANDLER_ARGS
 {
@@ -418,8 +494,7 @@ static int sysctl_coalition_get_page_count SYSCTL_HANDLER_ARGS
        memset(pgcount, 0, sizeof(pgcount));
 
        for (int t = 0; t < COALITION_NUM_TYPES; t++) {
-               coal = COALITION_NULL;
-               coalition_is_leader(tproc->task, t, &coal);
+               coal = task_get_coalition(tproc->task, t);
                if (coal != COALITION_NULL) {
                        int ntasks = 0;
                        pgcount[t] = coalition_get_page_count(coal, &ntasks);
@@ -484,7 +559,7 @@ static int sysctl_coalition_get_pid_list SYSCTL_HANDLER_ARGS
                return ESRCH;
        }
 
-       (void)coalition_is_leader(tproc->task, type, &coal);
+       coal = task_get_coalition(tproc->task, type);
        if (coal == COALITION_NULL) {
                goto out;
        }
index ec07b11fc7986774541547d34935607f69257c96..d9fb9d1f9cec0fb2ebf1a188b0bf928767506a30 100644 (file)
 #include <kern/telemetry.h>
 #include <kern/waitq.h>
 #include <kern/sched_prim.h>
+#include <kern/mpsc_queue.h>
 
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/vnode_internal.h>
 /* for remote time api*/
 #include <kern/remote_time.h>
+#include <os/log.h>
+#include <sys/log_data.h>
 
 #if CONFIG_MACF
 #include <security/mac_framework.h>
 #endif
 
+/* for entitlement check */
+#include <IOKit/IOBSD.h>
+
 /* XXX should be in a header file somewhere */
 void evsofree(struct socket *);
 void evpipefree(struct pipe *);
@@ -353,7 +359,7 @@ dofileread(vfs_context_t ctx, struct fileproc *fp,
 {
        uio_t auio;
        user_ssize_t bytecnt;
-       long error = 0;
+       int error = 0;
        char uio_buf[UIO_SIZEOF(1)];
 
        if (nbyte > INT_MAX) {
@@ -367,7 +373,10 @@ dofileread(vfs_context_t ctx, struct fileproc *fp,
                auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
                    &uio_buf[0], sizeof(uio_buf));
        }
-       uio_addiov(auio, bufp, nbyte);
+       if (uio_addiov(auio, bufp, nbyte) != 0) {
+               *retval = 0;
+               return EINVAL;
+       }
 
        bytecnt = nbyte;
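
dofileread() now rejects a failing uio_addiov() instead of ignoring its return value; the same check lands in dofilewrite() below. uio_addiov() returns non-zero when the address/length pair cannot be added to the uio (for example an invalid or overflowing range), which previously could slip through to fo_read()/fo_write() with an empty uio and read as silent success. The shape:

    if (uio_addiov(auio, bufp, nbyte) != 0) {
            *retval = 0;
            return EINVAL;  /* bogus iovec: fail fast, report nothing done */
    }
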
 
@@ -590,7 +599,7 @@ dofilewrite(vfs_context_t ctx, struct fileproc *fp,
     user_ssize_t *retval)
 {
        uio_t auio;
-       long error = 0;
+       int error = 0;
        user_ssize_t bytecnt;
        char uio_buf[UIO_SIZEOF(1)];
 
@@ -606,7 +615,10 @@ dofilewrite(vfs_context_t ctx, struct fileproc *fp,
                auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
                    &uio_buf[0], sizeof(uio_buf));
        }
-       uio_addiov(auio, bufp, nbyte);
+       if (uio_addiov(auio, bufp, nbyte) != 0) {
+               *retval = 0;
+               return EINVAL;
+       }
 
        bytecnt = nbyte;
        if ((error = fo_write(fp, auio, flags, ctx))) {
@@ -911,7 +923,7 @@ ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
                        break;
                }
                if (fp->f_type == DTYPE_PIPE) {
-                       error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
+                       error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
                        break;
                }
                if (tmp <= 0) {
@@ -925,7 +937,7 @@ ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
                        tmp = p1->p_pgrpid;
                        proc_rele(p1);
                }
-               error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
+               error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
                break;
 
        case FIOGETOWN:
@@ -1623,7 +1635,7 @@ selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
                        bits = iptr[i / NFDBITS];
 
                        while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
-                               bits &= ~(1 << j);
+                               bits &= ~(1U << j);
 
                                if (fd < fdp->fd_nfiles) {
                                        fp = fdp->fd_ofiles[fd];
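
The recurring change from (1 << j) to (1U << j) fixes a signed-shift hazard: j can reach 31 (NFDBITS - 1 with 32-bit mask words), and left-shifting a signed 1 into the sign bit is undefined behavior; complementing the promoted result is also sign-extension-prone. With an unsigned constant both operations are well defined:

    bits &= ~(1U << 31);    /* well defined: masks the top bit */
    /* bits &= ~(1 << 31);     1 << 31 overflows a 32-bit int: UB */

The same fix recurs in the next selscan() hunk and in selcount()/seldrop_locked() further down.
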
@@ -1667,7 +1679,7 @@ selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
                                /* The select; set the bit, if true */
                                if (fp->f_ops && fp->f_type
                                    && fo_select(fp, flag[msk], rl_ptr, &context)) {
-                                       optr[fd / NFDBITS] |= (1 << (fd % NFDBITS));
+                                       optr[fd / NFDBITS] |= (1U << (fd % NFDBITS));
                                        n++;
                                }
                                if (sel_pass == SEL_FIRSTPASS) {
@@ -1699,13 +1711,7 @@ selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
        return 0;
 }
 
-int poll_callback(struct kqueue *, struct kevent_internal_s *, void *);
-
-struct poll_continue_args {
-       user_addr_t pca_fds;
-       u_int pca_nfds;
-       u_int pca_rfds;
-};
+static int poll_callback(struct kevent_qos_s *, kevent_ctx_t);
 
 int
 poll(struct proc *p, struct poll_args *uap, int32_t *retval)
@@ -1718,15 +1724,11 @@ poll(struct proc *p, struct poll_args *uap, int32_t *retval)
 int
 poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
 {
-       struct poll_continue_args *cont;
-       struct pollfd *fds;
-       struct kqueue *kq;
-       struct timeval atv;
+       struct pollfd *fds = NULL;
+       struct kqueue *kq = NULL;
        int ncoll, error = 0;
        u_int nfds = uap->nfds;
        u_int rfds = 0;
-       u_int i;
-       size_t ni;
 
        /*
         * This is kinda bogus.  We have fd limits, but that is not
@@ -1740,46 +1742,30 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
                return EINVAL;
        }
 
-       kq = kqueue_alloc(p, 0);
+       kq = kqueue_alloc(p);
        if (kq == NULL) {
                return EAGAIN;
        }
 
-       ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
-       MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
-       if (NULL == cont) {
-               error = EAGAIN;
-               goto out;
-       }
-
-       fds = (struct pollfd *)&cont[1];
-       error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
-       if (error) {
-               goto out;
-       }
-
-       if (uap->timeout != -1) {
-               struct timeval rtv;
+       if (nfds) {
+               size_t ni = nfds * sizeof(struct pollfd);
+               MALLOC(fds, struct pollfd *, ni, M_TEMP, M_WAITOK);
+               if (NULL == fds) {
+                       error = EAGAIN;
+                       goto out;
+               }
 
-               atv.tv_sec = uap->timeout / 1000;
-               atv.tv_usec = (uap->timeout % 1000) * 1000;
-               if (itimerfix(&atv)) {
-                       error = EINVAL;
+               error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
+               if (error) {
                        goto out;
                }
-               getmicrouptime(&rtv);
-               timevaladd(&atv, &rtv);
-       } else {
-               atv.tv_sec = 0;
-               atv.tv_usec = 0;
        }
 
        /* JMM - all this P_SELECT stuff is bogus */
        ncoll = nselcoll;
        OSBitOrAtomic(P_SELECT, &p->p_flag);
-       for (i = 0; i < nfds; i++) {
+       for (u_int i = 0; i < nfds; i++) {
                short events = fds[i].events;
-               KNOTE_LOCK_CTX(knlc);
                __assert_only int rc;
 
                /* per spec, ignore fd values below zero */
@@ -1789,7 +1775,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
                }
 
                /* convert the poll event into a kqueue kevent */
-               struct kevent_internal_s kev = {
+               struct kevent_qos_s kev = {
                        .ident = fds[i].fd,
                        .flags = EV_ADD | EV_ONESHOT | EV_POLL,
                        .udata = CAST_USER_ADDR_T(&fds[i])
@@ -1801,7 +1787,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
                        if (events & (POLLPRI | POLLRDBAND)) {
                                kev.flags |= EV_OOBAND;
                        }
-                       rc = kevent_register(kq, &kev, &knlc);
+                       rc = kevent_register(kq, &kev, NULL);
                        assert((rc & FILTER_REGISTER_WAIT) == 0);
                }
 
@@ -1809,7 +1795,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
                if ((kev.flags & EV_ERROR) == 0 &&
                    (events & (POLLOUT | POLLWRNORM | POLLWRBAND))) {
                        kev.filter = EVFILT_WRITE;
-                       rc = kevent_register(kq, &kev, &knlc);
+                       rc = kevent_register(kq, &kev, NULL);
                        assert((rc & FILTER_REGISTER_WAIT) == 0);
                }
 
@@ -1830,7 +1816,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
                        if (events & POLLWRITE) {
                                kev.fflags |= NOTE_WRITE;
                        }
-                       rc = kevent_register(kq, &kev, &knlc);
+                       rc = kevent_register(kq, &kev, NULL);
                        assert((rc & FILTER_REGISTER_WAIT) == 0);
                }
 
@@ -1854,21 +1840,27 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
                goto done;
        }
 
+       /* scan for, and possibly wait for, the kevents to trigger */
+       kevent_ctx_t kectx = kevent_get_context(current_thread());
+       *kectx = (struct kevent_ctx_s){
+               .kec_process_noutputs = rfds,
+               .kec_process_flags    = KEVENT_FLAG_POLL,
+               .kec_deadline         = 0, /* wait forever */
+       };
+
        /*
         * If any events have trouble registering, an event has fired and we
-        * shouldn't wait for events in kqueue_scan -- use the current time as
-        * the deadline.
+        * shouldn't wait for events in kqueue_scan.
         */
        if (rfds) {
-               getmicrouptime(&atv);
+               kectx->kec_process_flags |= KEVENT_FLAG_IMMEDIATE;
+       } else if (uap->timeout != -1) {
+               clock_interval_to_deadline(uap->timeout, NSEC_PER_MSEC,
+                   &kectx->kec_deadline);
        }
 
-       /* scan for, and possibly wait for, the kevents to trigger */
-       cont->pca_fds = uap->fds;
-       cont->pca_nfds = nfds;
-       cont->pca_rfds = rfds;
-       error = kqueue_scan(kq, poll_callback, NULL, cont, NULL, &atv, p);
-       rfds = cont->pca_rfds;
+       error = kqueue_scan(kq, kectx->kec_process_flags, kectx, poll_callback);
+       rfds = kectx->kec_process_noutputs;
 
 done:
        OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
@@ -1876,27 +1868,23 @@ done:
        if (error == ERESTART) {
                error = EINTR;
        }
-       if (error == EWOULDBLOCK) {
-               error = 0;
-       }
        if (error == 0) {
                error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
                *retval = rfds;
        }
 
 out:
-       if (NULL != cont) {
-               FREE(cont, M_TEMP);
+       if (NULL != fds) {
+               FREE(fds, M_TEMP);
        }
 
        kqueue_dealloc(kq);
        return error;
 }
 
-int
-poll_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, void *data)
+static int
+poll_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
 {
-       struct poll_continue_args *cont = (struct poll_continue_args *)data;
        struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
        short prev_revents = fds->revents;
        short mask = 0;
@@ -1945,7 +1933,7 @@ poll_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, void *
        }
 
        if (fds->revents != 0 && prev_revents == 0) {
-               cont->pca_rfds++;
+               kectx->kec_process_noutputs++;
        }
 
        return 0;
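
poll_nocancel() now drives kqueue_scan() through the shared kevent_ctx_s (outputs, flags, deadline) instead of a private poll_continue_args, and poll_callback() takes the kevent_qos_s type used throughout the rewritten kevent code. The pollfd-to-kevent translation in the registration loop above is roughly:

    /*
     * POLLIN/POLLRDNORM/POLLRDBAND/POLLPRI      -> EVFILT_READ
     *     (POLLPRI or POLLRDBAND also sets EV_OOBAND)
     * POLLOUT/POLLWRNORM/POLLWRBAND             -> EVFILT_WRITE
     * POLLEXTEND/POLLATTRIB/POLLNLINK/POLLWRITE -> EVFILT_VNODE + NOTE_*
     */

Each kevent carries its pollfd's address in udata so the callback can update revents in place; registrations that fire at EV_ADD time count as outputs, which is why rfds seeds kec_process_noutputs and forces KEVENT_FLAG_IMMEDIATE.
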
@@ -2011,7 +1999,7 @@ selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
                for (i = 0; i < nfd; i += NFDBITS) {
                        bits = iptr[i / NFDBITS];
                        while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
-                               bits &= ~(1 << j);
+                               bits &= ~(1U << j);
 
                                if (fd < fdp->fd_nfiles) {
                                        fp = fdp->fd_ofiles[fd];
@@ -2025,7 +2013,7 @@ selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
                                        error = EBADF;
                                        goto bad;
                                }
-                               fp->f_iocount++;
+                               os_ref_retain_locked(&fp->f_iocount);
                                n++;
                        }
                }
@@ -2111,7 +2099,7 @@ seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wak
                for (i = 0; i < nfd; i += NFDBITS) {
                        bits = iptr[i / NFDBITS];
                        while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
-                               bits &= ~(1 << j);
+                               bits &= ~(1U << j);
                                fp = fdp->fd_ofiles[fd];
                                /*
                                 * If we've already dropped as many as were
@@ -2138,12 +2126,12 @@ seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wak
                                        continue;
                                }
 
-                               fp->f_iocount--;
-                               if (fp->f_iocount < 0) {
+                               const os_ref_count_t refc = os_ref_release_locked(&fp->f_iocount);
+                               if (0 == refc) {
                                        panic("f_iocount overdecrement!");
                                }
 
-                               if (fp->f_iocount == 0) {
+                               if (1 == refc) {
                                        /*
                                         * The last iocount is responsible for clearing
                                         * selconfict flag - even if we didn't set it -
@@ -3184,7 +3172,6 @@ waitevent_close(struct proc *p, struct fileproc *fp)
  *
  * Parameters: uuid_buf                Pointer to buffer to receive UUID
  *             timeout                 Timespec for timeout
- *             spi                             SPI, skip sandbox check (temporary)
  *
  * Returns:    0                       Success
  *             EWOULDBLOCK             Timeout is too short
@@ -3202,7 +3189,8 @@ gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retv
        mach_timespec_t mach_ts;        /* for IOKit call */
        __darwin_uuid_t uuid_kern = {}; /* for IOKit call */
 
-       if (!uap->spi) {
+       /* Check entitlement */
+       if (!IOTaskHasEntitlement(current_task(), "com.apple.private.getprivatesysid")) {
 #if CONFIG_EMBEDDED
 #if CONFIG_MACF
                if ((error = mac_system_check_info(kauth_cred_get(), "hw.uuid")) != 0) {
@@ -3403,6 +3391,86 @@ telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t
        return error;
 }
 
+/*
+ * Logging
+ *
+ * Description: syscall to access kernel logging from userspace
+ *
+ * Args:
+ *     tag - synchronizes the syscall version with userspace.
+ *     flags - flags used by the syscall.
+ *     buffer - userspace address of string to copy.
+ *     size - size of buffer.
+ */
+int
+log_data(__unused struct proc *p, struct log_data_args *args, int *retval)
+{
+       unsigned int tag = args->tag;
+       unsigned int flags = args->flags;
+       user_addr_t buffer = args->buffer;
+       unsigned int size = args->size;
+       int ret = 0;
+       char *log_msg = NULL;
+       int error;
+       *retval = 0;
+
+       /*
+        * The tag synchronizes the syscall version with userspace.
+        * Tag == 0 => flags == OS_LOG_TYPE
+        */
+       if (tag != 0) {
+               return EINVAL;
+       }
+
+       /*
+        * The OS_LOG_TYPE values are defined in libkern/os/log.h;
+        * in userspace they are defined in libtrace/os/log.h.
+        */
+       if (flags != OS_LOG_TYPE_DEFAULT &&
+           flags != OS_LOG_TYPE_INFO &&
+           flags != OS_LOG_TYPE_DEBUG &&
+           flags != OS_LOG_TYPE_ERROR &&
+           flags != OS_LOG_TYPE_FAULT) {
+               return EINVAL;
+       }
+
+       if (size == 0) {
+               return EINVAL;
+       }
+
+       /* truncate to OS_LOG_DATA_MAX_SIZE */
+       if (size > OS_LOG_DATA_MAX_SIZE) {
+               printf("%s: WARNING msg is going to be truncated from %u to %u\n", __func__, size, OS_LOG_DATA_MAX_SIZE);
+               size = OS_LOG_DATA_MAX_SIZE;
+       }
+
+       log_msg = kalloc(size);
+       if (!log_msg) {
+               return ENOMEM;
+       }
+
+       error = copyin(buffer, log_msg, size);
+       if (error) {
+               ret = EFAULT;
+               goto out;
+       }
+       log_msg[size - 1] = '\0';
+
+       /*
+        * This will log to dmesg and logd.
+        * The call will fail if the current
+        * process is not a DriverKit process.
+        */
+       os_log_driverKit(&ret, OS_LOG_DEFAULT, flags, "%s", log_msg);
+
+out:
+       if (log_msg != NULL) {
+               kfree(log_msg, size);
+       }
+
+       return ret;
+}
+
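A minimal userspace sketch of driving this syscall, under two assumptions: that a SYS_log_data number is exported in the generated syscall tables, and that the caller is a DriverKit process (os_log() remains the supported path; this is illustration only):

    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Hypothetical direct invocation; SYS_log_data and the DriverKit
     * caller requirement are assumptions, per the note above. */
    static int
    log_data_example(void)
    {
        const char msg[] = "hello from a dext";
        /* tag 0 selects the OS_LOG_TYPE flags convention; 0x00 is
         * OS_LOG_TYPE_DEFAULT per libkern/os/log.h */
        return (int)syscall(SYS_log_data, 0u, 0x00u, msg,
            (unsigned int)sizeof(msg));
    }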
 #if DEVELOPMENT || DEBUG
 #if CONFIG_WAITQ_DEBUG
 static uint64_t g_wqset_num = 0;
@@ -3835,6 +3903,30 @@ SYSCTL_PROC(_kern, OID_AUTO, n_ltable_entries, CTLFLAG_RD | CTLFLAG_LOCKED,
     0, 0, sysctl_waitq_set_nelem, "I", "ltable elements currently used");
 
 
+static int
+sysctl_mpsc_test_pingpong SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       uint64_t value = 0;
+       int error;
+
+       error = SYSCTL_IN(req, &value, sizeof(value));
+       if (error) {
+               return error;
+       }
+
+       if (error == 0 && req->newptr) {
+               error = mpsc_test_pingpong(value, &value);
+               if (error == 0) {
+                       error = SYSCTL_OUT(req, &value, sizeof(value));
+               }
+       }
+
+       return error;
+}
+SYSCTL_PROC(_kern, OID_AUTO, mpsc_test_pingpong, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0, sysctl_mpsc_test_pingpong, "Q", "MPSC tests: pingpong");
+
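The handler round-trips a value: a write triggers the ping-pong test, and the measured result is copied back out in the same call. A sketch of exercising it from userspace (the sysctl name comes from the SYSCTL_PROC above; it only exists on DEVELOPMENT/DEBUG kernels):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/sysctl.h>

    /* Write an iteration count to kern.mpsc_test_pingpong and read the
     * result back in one call; fails with ENOENT on RELEASE kernels. */
    static int
    mpsc_pingpong_example(void)
    {
        uint64_t value = 100000;        /* requested iterations */
        size_t len = sizeof(value);

        if (sysctlbyname("kern.mpsc_test_pingpong",
            &value, &len, &value, sizeof(value)) != 0) {
            perror("sysctlbyname");
            return -1;
        }
        printf("pingpong: %llu\n", (unsigned long long)value);
        return 0;
    }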
 #endif /* DEVELOPMENT || DEBUG */
 
 /* Remote Time API */
@@ -3858,7 +3950,7 @@ static int sysctl_mach_bridge_timer_enable SYSCTL_HANDLER_ARGS
                req->oldidx = sizeof(value);
                return 0;
        }
-       if (bt_init_flag) {
+       if (os_atomic_load(&bt_init_flag, acquire)) {
                if (req->newptr) {
                        int new_value = 0;
                        error = SYSCTL_IN(req, &new_value, sizeof(new_value));
@@ -3931,6 +4023,13 @@ SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params,
 
 #endif /* CONFIG_MACH_BRIDGE_RECV_TIME */
 
+#if DEVELOPMENT || DEBUG
+#endif /* DEVELOPMENT || DEBUG */
+
+extern uint32_t task_exc_guard_default;
+
+SYSCTL_INT(_kern, OID_AUTO, task_exc_guard_default,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &task_exc_guard_default, 0, "");
 
 
 static int
@@ -4022,4 +4121,4 @@ sysctl_kern_sched_thread_set_no_smt(__unused struct sysctl_oid *oidp, __unused v
 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_set_no_smt,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
     0, 0, sysctl_kern_sched_thread_set_no_smt, "I", "");
-#endif
+#endif /* DEVELOPMENT || DEBUG */
index e2964c118b580c4183e620e0eecea17c51ccce92..186f829933998111314dde9d4968be3da3337f33 100644 (file)
 #include <sys/persona.h>
 #include <sys/proc.h>
 
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <mach/thread_act.h>
+#include <mach/mach_types.h>
+
 #include <libkern/libkern.h>
+#include <IOKit/IOBSD.h>
+
+extern kern_return_t bank_get_bank_ledger_thread_group_and_persona(void *voucher,
+    void *bankledger, void **banktg, uint32_t *persona_id);
 
 static int
 kpersona_copyin(user_addr_t infop, struct kpersona_info *kinfo)
@@ -84,19 +93,16 @@ kpersona_copyout(struct kpersona_info *kinfo, user_addr_t infop)
 
 
 static int
-kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp)
+kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp, user_addr_t path)
 {
        int error;
        struct kpersona_info kinfo;
-       struct persona *persona;
+       struct persona *persona = NULL;
        uid_t id = PERSONA_ID_NONE;
        const char *login;
+       char *pna_path = NULL;
 
-       /*
-        * TODO: rdar://problem/19981151
-        * Add entitlement check!
-        */
-       if (!kauth_cred_issuser(kauth_cred_get())) {
+       if (!IOTaskHasEntitlement(current_task(), PERSONA_MGMT_ENTITLEMENT)) {
                return EPERM;
        }
 
@@ -110,12 +116,31 @@ kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp)
                id = kinfo.persona_id;
        }
 
+       if (path) {
+               MALLOC_ZONE(pna_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK | M_ZERO);
+               if (pna_path == NULL) {
+                       return ENOMEM;
+               }
+               size_t pathlen;
+               error = copyinstr(path, (void *)pna_path, MAXPATHLEN, &pathlen);
+               if (error) {
+                       FREE_ZONE(pna_path, MAXPATHLEN, M_NAMEI);
+                       return error;
+               }
+       }
+
        error = 0;
-       persona = persona_alloc(id, login, kinfo.persona_type, &error);
+       persona = persona_alloc(id, login, kinfo.persona_type, pna_path, &error);
        if (!persona) {
+               if (pna_path != NULL) {
+                       FREE_ZONE(pna_path, MAXPATHLEN, M_NAMEI);
+               }
                return error;
        }
 
+       /* persona struct contains a reference to pna_path */
+       pna_path = NULL;
+
        error = persona_init_begin(persona);
        if (error) {
                goto out_persona_err;
@@ -153,6 +178,11 @@ kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp)
                goto out_persona_err;
        }
 
+       error = persona_verify_and_set_uniqueness(persona);
+       if (error) {
+               goto out_persona_err;
+       }
+
        persona_init_end(persona, error);
 
        /*
@@ -182,7 +212,7 @@ kpersona_dealloc_syscall(user_addr_t idp)
        uid_t persona_id;
        struct persona *persona;
 
-       if (!kauth_cred_issuser(kauth_cred_get())) {
+       if (!IOTaskHasEntitlement(current_task(), PERSONA_MGMT_ENTITLEMENT)) {
                return EPERM;
        }
 
@@ -211,7 +241,9 @@ static int
 kpersona_get_syscall(user_addr_t idp)
 {
        int error;
-       struct persona *persona = current_persona_get();
+       struct persona *persona;
+
+       persona = current_persona_get();
 
        if (!persona) {
                return ESRCH;
@@ -223,10 +255,60 @@ kpersona_get_syscall(user_addr_t idp)
        return error;
 }
 
+static int
+kpersona_getpath_syscall(user_addr_t idp, user_addr_t path)
+{
+       int error;
+       uid_t persona_id;
+       struct persona *persona;
+       size_t pathlen;
+       uid_t current_persona_id = PERSONA_ID_NONE;
+
+       if (!path) {
+               return EINVAL;
+       }
+
+       error = copyin(idp, &persona_id, sizeof(persona_id));
+       if (error) {
+               return error;
+       }
+
+       /* Get the current thread's persona id to check whether the
+        * input persona_id matches the current persona id.
+        */
+       persona = current_persona_get();
+       if (persona) {
+               current_persona_id = persona->pna_id;
+       }
+
+       if (persona_id && persona_id != current_persona_id) {
+               /* Release the reference on the current persona id's persona */
+               persona_put(persona);
+               if (!kauth_cred_issuser(kauth_cred_get()) &&
+                   !IOTaskHasEntitlement(current_task(), PERSONA_MGMT_ENTITLEMENT)) {
+                       return EPERM;
+               }
+               persona = persona_lookup(persona_id);
+       }
+
+       if (!persona) {
+               return ESRCH;
+       }
+
+       if (persona->pna_path) {
+               error = copyoutstr((void *)persona->pna_path, path, MAXPATHLEN, &pathlen);
+       }
+
+       persona_put(persona);
+
+       return error;
+}
+
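A hedged userspace sketch of the new GETPATH operation; the wrapper name and prototype below are assumptions, modeled on the existing private kpersona_* wrappers, not a confirmed API:

    #include <limits.h>     /* PATH_MAX as a stand-in for MAXPATHLEN */
    #include <stdio.h>
    #include <sys/types.h>

    /* Assumed private wrapper around persona(PERSONA_OP_GETPATH, ...) */
    extern int kpersona_getpath(uid_t *id, char *path);

    static void
    print_persona_path(uid_t persona_id)
    {
        char path[PATH_MAX] = { 0 };
        if (kpersona_getpath(&persona_id, path) == 0) {
            printf("persona %u path: %s\n", persona_id, path);
        }
    }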
 static int
 kpersona_info_syscall(user_addr_t idp, user_addr_t infop)
 {
        int error;
+       uid_t current_persona_id = PERSONA_ID_NONE;
        uid_t persona_id;
        struct persona *persona;
        struct kpersona_info kinfo;
@@ -236,12 +318,24 @@ kpersona_info_syscall(user_addr_t idp, user_addr_t infop)
                return error;
        }
 
-       /*
-        * TODO: rdar://problem/19981151
-        * Add entitlement check!
+       /* Get the current thread's persona id to check whether the
+        * input persona_id matches the current persona id.
         */
+       persona = current_persona_get();
+       if (persona) {
+               current_persona_id = persona->pna_id;
+       }
+
+       if (persona_id && persona_id != current_persona_id) {
+               /* Release the reference on the current persona id's persona */
+               persona_put(persona);
+               if (!kauth_cred_issuser(kauth_cred_get()) &&
+                   !IOTaskHasEntitlement(current_task(), PERSONA_MGMT_ENTITLEMENT)) {
+                       return EPERM;
+               }
+               persona = persona_lookup(persona_id);
+       }
 
-       persona = persona_lookup(persona_id);
        if (!persona) {
                return ESRCH;
        }
@@ -350,7 +444,7 @@ kpersona_find_syscall(user_addr_t infop, user_addr_t idp, user_addr_t idlenp)
        }
 
        k_idlen = u_idlen;
-       error = persona_find(login, kinfo.persona_id, persona, &k_idlen);
+       error = persona_find_all(login, kinfo.persona_id, kinfo.persona_type, persona, &k_idlen);
        if (error) {
                goto out;
        }
@@ -381,7 +475,6 @@ out:
        return error;
 }
 
-
 /*
  * Syscall entry point / demux.
  */
@@ -393,10 +486,14 @@ persona(__unused proc_t p, struct persona_args *pargs, __unused int32_t *retval)
        /* uint32_t flags = pargs->flags; */
        user_addr_t infop = pargs->info;
        user_addr_t idp = pargs->id;
+       user_addr_t path = pargs->path;
 
        switch (op) {
        case PERSONA_OP_ALLOC:
-               error = kpersona_alloc_syscall(infop, idp);
+               error = kpersona_alloc_syscall(infop, idp, USER_ADDR_NULL);
+               break;
+       case PERSONA_OP_PALLOC:
+               error = kpersona_alloc_syscall(infop, idp, path);
                break;
        case PERSONA_OP_DEALLOC:
                error = kpersona_dealloc_syscall(idp);
@@ -404,6 +501,9 @@ persona(__unused proc_t p, struct persona_args *pargs, __unused int32_t *retval)
        case PERSONA_OP_GET:
                error = kpersona_get_syscall(idp);
                break;
+       case PERSONA_OP_GETPATH:
+               error = kpersona_getpath_syscall(idp, path);
+               break;
        case PERSONA_OP_INFO:
                error = kpersona_info_syscall(idp, infop);
                break;
@@ -411,6 +511,7 @@ persona(__unused proc_t p, struct persona_args *pargs, __unused int32_t *retval)
                error = kpersona_pidinfo_syscall(idp, infop);
                break;
        case PERSONA_OP_FIND:
+       case PERSONA_OP_FIND_BY_TYPE:
                error = kpersona_find_syscall(infop, idp, pargs->idlen);
                break;
        default:
index cf0e5f2b02a26d55ee9abd05e69141c7dacbc287..ef7dcbab1f17905db33ed2ade735b919bd44dda9 100644 (file)
 #define f_offset f_fglob->fg_offset
 #define f_data f_fglob->fg_data
 
+struct pipepair {
+       lck_mtx_t     pp_mtx;
+       struct pipe   pp_rpipe;
+       struct pipe   pp_wpipe;
+};
+
+#define PIPE_PAIR(pipe) \
+               __container_of(PIPE_MTX(pipe), struct pipepair, pp_mtx)
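PIPE_PAIR recovers the enclosing pair from either half's shared mutex pointer. A minimal sketch of the underlying container_of idiom, using stand-in types rather than the kernel's definitions:

    #include <stddef.h>

    struct pair_sketch {
        long pp_mtx;        /* stand-in for lck_mtx_t */
        int  pp_rpipe;      /* stand-in for struct pipe */
        int  pp_wpipe;
    };

    /* Subtract the member's offset to get back to the containing
     * struct; this is what __container_of does for PIPE_PAIR. */
    #define container_of_sketch(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    static struct pair_sketch *
    pair_from_mtx(long *mtx)
    {
        return container_of_sketch(mtx, struct pair_sketch, pp_mtx);
    }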
+
 /*
  * interfaces to the outside world exported through file operations
  */
@@ -170,45 +179,57 @@ static int pipe_close(struct fileglob *fg, vfs_context_t ctx);
 static int pipe_select(struct fileproc *fp, int which, void * wql,
     vfs_context_t ctx);
 static int pipe_kqfilter(struct fileproc *fp, struct knote *kn,
-    struct kevent_internal_s *kev, vfs_context_t ctx);
+    struct kevent_qos_s *kev);
 static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
     vfs_context_t ctx);
 static int pipe_drain(struct fileproc *fp, vfs_context_t ctx);
 
 static const struct fileops pipeops = {
-       .fo_type = DTYPE_PIPE,
-       .fo_read = pipe_read,
-       .fo_write = pipe_write,
-       .fo_ioctl = pipe_ioctl,
-       .fo_select = pipe_select,
-       .fo_close = pipe_close,
+       .fo_type     = DTYPE_PIPE,
+       .fo_read     = pipe_read,
+       .fo_write    = pipe_write,
+       .fo_ioctl    = pipe_ioctl,
+       .fo_select   = pipe_select,
+       .fo_close    = pipe_close,
+       .fo_drain    = pipe_drain,
        .fo_kqfilter = pipe_kqfilter,
-       .fo_drain = pipe_drain,
 };
 
 static void filt_pipedetach(struct knote *kn);
 
+static int filt_pipenotsup(struct knote *kn, long hint);
+static int filt_pipenotsuptouch(struct knote *kn, struct kevent_qos_s *kev);
+static int filt_pipenotsupprocess(struct knote *kn, struct kevent_qos_s *kev);
+
 static int filt_piperead(struct knote *kn, long hint);
-static int filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_pipereadprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int filt_pipereadtouch(struct knote *kn, struct kevent_qos_s *kev);
+static int filt_pipereadprocess(struct knote *kn, struct kevent_qos_s *kev);
 
 static int filt_pipewrite(struct knote *kn, long hint);
-static int filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_pipewriteprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int filt_pipewritetouch(struct knote *kn, struct kevent_qos_s *kev);
+static int filt_pipewriteprocess(struct knote *kn, struct kevent_qos_s *kev);
+
+SECURITY_READ_ONLY_EARLY(struct filterops) pipe_nfiltops = {
+       .f_isfd    = 1,
+       .f_detach  = filt_pipedetach,
+       .f_event   = filt_pipenotsup,
+       .f_touch   = filt_pipenotsuptouch,
+       .f_process = filt_pipenotsupprocess,
+};
 
 SECURITY_READ_ONLY_EARLY(struct filterops) pipe_rfiltops = {
-       .f_isfd = 1,
-       .f_detach = filt_pipedetach,
-       .f_event = filt_piperead,
-       .f_touch = filt_pipereadtouch,
+       .f_isfd    = 1,
+       .f_detach  = filt_pipedetach,
+       .f_event   = filt_piperead,
+       .f_touch   = filt_pipereadtouch,
        .f_process = filt_pipereadprocess,
 };
 
 SECURITY_READ_ONLY_EARLY(struct filterops) pipe_wfiltops = {
-       .f_isfd = 1,
-       .f_detach = filt_pipedetach,
-       .f_event = filt_pipewrite,
-       .f_touch = filt_pipewritetouch,
+       .f_isfd    = 1,
+       .f_detach  = filt_pipedetach,
+       .f_event   = filt_pipewrite,
+       .f_touch   = filt_pipewritetouch,
        .f_process = filt_pipewriteprocess,
 };
 
@@ -235,9 +256,9 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD | CTLFLAG_LOCKED,
     &amountpipekvawired, 0, "Pipe wired KVA usage");
 #endif
 
+static int pipepair_alloc(struct pipe **rpipe, struct pipe **wpipe);
 static void pipeclose(struct pipe *cpipe);
 static void pipe_free_kmem(struct pipe *cpipe);
-static int pipe_create(struct pipe **cpipep);
 static int pipespace(struct pipe *cpipe, int size);
 static int choose_pipespace(unsigned long current, unsigned long expected);
 static int expand_pipespace(struct pipe *p, int target_size);
@@ -256,23 +277,6 @@ static zone_t pipe_zone;
 
 #define MAX_PIPESIZE(pipe)              ( MAX(PIPE_SIZE, (pipe)->pipe_buffer.size) )
 
-#define PIPE_GARBAGE_AGE_LIMIT          5000    /* In milliseconds */
-#define PIPE_GARBAGE_QUEUE_LIMIT        32000
-
-struct pipe_garbage {
-       struct pipe             *pg_pipe;
-       struct pipe_garbage     *pg_next;
-       uint64_t                pg_timestamp;
-};
-
-static zone_t pipe_garbage_zone;
-static struct pipe_garbage *pipe_garbage_head = NULL;
-static struct pipe_garbage *pipe_garbage_tail = NULL;
-static uint64_t pipe_garbage_age_limit = PIPE_GARBAGE_AGE_LIMIT;
-static int pipe_garbage_count = 0;
-static lck_mtx_t *pipe_garbage_lock;
-static void pipe_garbage_collect(struct pipe *cpipe);
-
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
 
 /* initial setup done at time of sysinit */
@@ -282,8 +286,8 @@ pipeinit(void)
        nbigpipe = 0;
        vm_size_t zone_size;
 
-       zone_size = 8192 * sizeof(struct pipe);
-       pipe_zone = zinit(sizeof(struct pipe), zone_size, 4096, "pipe zone");
+       zone_size = 8192 * sizeof(struct pipepair);
+       pipe_zone = zinit(sizeof(struct pipepair), zone_size, 4096, "pipe zone");
 
 
        /* allocate lock group attribute and group for pipe mutexes */
@@ -292,15 +296,6 @@ pipeinit(void)
 
        /* allocate the lock attribute for pipe mutexes */
        pipe_mtx_attr = lck_attr_alloc_init();
-
-       /*
-        * Set up garbage collection for dead pipes
-        */
-       zone_size = (PIPE_GARBAGE_QUEUE_LIMIT + 20) *
-           sizeof(struct pipe_garbage);
-       pipe_garbage_zone = (zone_t)zinit(sizeof(struct pipe_garbage),
-           zone_size, 4096, "pipe garbage zone");
-       pipe_garbage_lock = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr);
 }
 
 #ifndef CONFIG_EMBEDDED
@@ -422,46 +417,27 @@ pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval)
 {
        struct fileproc *rf, *wf;
        struct pipe *rpipe, *wpipe;
-       lck_mtx_t   *pmtx;
-       int fd, error;
+       int error;
 
-       if ((pmtx = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr)) == NULL) {
-               return ENOMEM;
+       error = pipepair_alloc(&rpipe, &wpipe);
+       if (error) {
+               return error;
        }
 
-       rpipe = wpipe = NULL;
-       if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
-               error = ENFILE;
-               goto freepipes;
-       }
        /*
-        * allocate the space for the normal I/O direction up
-        * front... we'll delay the allocation for the other
-        * direction until a write actually occurs (most likely it won't)...
+        * for now we'll create half-duplex pipes (refer to the returns section above);
+        * this is what we've always supported.
         */
-       error = pipespace(rpipe, choose_pipespace(rpipe->pipe_buffer.size, 0));
-       if (error) {
-               goto freepipes;
-       }
-
-       TAILQ_INIT(&rpipe->pipe_evlist);
-       TAILQ_INIT(&wpipe->pipe_evlist);
 
-       error = falloc(p, &rf, &fd, vfs_context_current());
+       error = falloc(p, &rf, &retval[0], vfs_context_current());
        if (error) {
                goto freepipes;
        }
-       retval[0] = fd;
-
-       /*
-        * for now we'll create half-duplex pipes(refer returns section above).
-        * this is what we've always supported..
-        */
        rf->f_flag = FREAD;
        rf->f_data = (caddr_t)rpipe;
        rf->f_ops = &pipeops;
 
-       error = falloc(p, &wf, &fd, vfs_context_current());
+       error = falloc(p, &wf, &retval[1], vfs_context_current());
        if (error) {
                fp_free(p, retval[0], rf);
                goto freepipes;
@@ -472,10 +448,7 @@ pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval)
 
        rpipe->pipe_peer = wpipe;
        wpipe->pipe_peer = rpipe;
-       /* both structures share the same mutex */
-       rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
 
-       retval[1] = fd;
 #if CONFIG_MACF
        /*
         * XXXXXXXX SHOULD NOT HOLD FILE_LOCK() XXXXXXXXXXXX
@@ -495,15 +468,11 @@ pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval)
        fp_drop(p, retval[0], rf, 1);
        fp_drop(p, retval[1], wf, 1);
        proc_fdunlock(p);
-
-
        return 0;
 
 freepipes:
        pipeclose(rpipe);
        pipeclose(wpipe);
-       lck_mtx_free(pmtx, pipe_mtx_grp);
-
        return error;
 }
 
@@ -577,7 +546,7 @@ pipe_stat(struct pipe *cpipe, void *ub, int isstat64)
                 * address of this pipe's struct pipe.  This number may be recycled
                 * relatively quickly.
                 */
-               sb64->st_ino = (ino64_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
+               sb64->st_ino = (ino64_t)VM_KERNEL_ADDRHASH((uintptr_t)cpipe);
        } else {
                sb = (struct stat *)ub;
 
@@ -604,7 +573,7 @@ pipe_stat(struct pipe *cpipe, void *ub, int isstat64)
                 * address of this pipe's struct pipe.  This number may be recycled
                 * relatively quickly.
                 */
-               sb->st_ino = (ino_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
+               sb->st_ino = (ino_t)VM_KERNEL_ADDRHASH((uintptr_t)cpipe);
        }
        PIPE_UNLOCK(cpipe);
 
@@ -657,12 +626,13 @@ pipespace(struct pipe *cpipe, int size)
  * initialize and allocate VM and memory for pipe
  */
 static int
-pipe_create(struct pipe **cpipep)
+pipepair_alloc(struct pipe **rp_out, struct pipe **wp_out)
 {
-       struct pipe *cpipe;
-       cpipe = (struct pipe *)zalloc(pipe_zone);
+       struct pipepair *pp = zalloc(pipe_zone);
+       struct pipe *rpipe = &pp->pp_rpipe;
+       struct pipe *wpipe = &pp->pp_wpipe;
 
-       if ((*cpipep = cpipe) == NULL) {
+       if (pp == NULL) {
                return ENOMEM;
        }
 
@@ -670,15 +640,61 @@ pipe_create(struct pipe **cpipep)
         * protect so pipespace or pipeclose don't follow a junk pointer
         * if pipespace() fails.
         */
-       bzero(cpipe, sizeof *cpipe);
+       bzero(pp, sizeof(struct pipepair));
+       lck_mtx_init(&pp->pp_mtx, pipe_mtx_grp, pipe_mtx_attr);
+
+       rpipe->pipe_mtxp = &pp->pp_mtx;
+       wpipe->pipe_mtxp = &pp->pp_mtx;
+
+       TAILQ_INIT(&rpipe->pipe_evlist);
+       TAILQ_INIT(&wpipe->pipe_evlist);
 
 #ifndef CONFIG_EMBEDDED
        /* Initial times are all the time of creation of the pipe */
-       pipe_touch(cpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME);
+       pipe_touch(rpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME);
+       pipe_touch(wpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME);
 #endif
+
+       /*
+        * allocate the space for the normal I/O direction up
+        * front... we'll delay the allocation for the other
+        * direction until a write actually occurs (most likely it won't)...
+        */
+       int error = pipespace(rpipe, choose_pipespace(rpipe->pipe_buffer.size, 0));
+       if (__improbable(error)) {
+               lck_mtx_destroy(&pp->pp_mtx, pipe_mtx_grp);
+               zfree(pipe_zone, pp);
+               return error;
+       }
+
+       *rp_out = rpipe;
+       *wp_out = wpipe;
        return 0;
 }
 
+static void
+pipepair_destroy_pipe(struct pipepair *pp, struct pipe *cpipe)
+{
+       bool can_free;
+
+       pipe_free_kmem(cpipe);
+
+       lck_mtx_lock(&pp->pp_mtx);
+       if (__improbable(cpipe->pipe_state & PIPE_DEAD)) {
+               panic("double free of pipe %p in pair %p", cpipe, pp);
+       }
+
+       cpipe->pipe_state |= PIPE_DEAD;
+
+       can_free = (pp->pp_rpipe.pipe_state & PIPE_DEAD) &&
+           (pp->pp_wpipe.pipe_state & PIPE_DEAD);
+       lck_mtx_unlock(&pp->pp_mtx);
+
+       if (can_free) {
+               lck_mtx_destroy(&pp->pp_mtx, pipe_mtx_grp);
+               zfree(pipe_zone, pp);
+       }
+}
 
 /*
  * lock a pipe for I/O, blocking other access
@@ -722,9 +738,8 @@ pipeselwakeup(struct pipe *cpipe, struct pipe *spipe)
                cpipe->pipe_state &= ~PIPE_SEL;
                selwakeup(&cpipe->pipe_sel);
        }
-       if (cpipe->pipe_state & PIPE_KNOTE) {
-               KNOTE(&cpipe->pipe_sel.si_note, 1);
-       }
+
+       KNOTE(&cpipe->pipe_sel.si_note, 1);
 
        postpipeevent(cpipe, EV_RWBYTES);
 
@@ -817,7 +832,8 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags,
                         * detect EOF condition
                         * read returns 0 on EOF, no need to set error
                         */
-                       if (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
+                       if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
+                           (fileproc_get_vflags(fp) & FPV_DRAIN)) {
                                break;
                        }
 
@@ -923,7 +939,8 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
        /*
         * detect loss of pipe read side, issue SIGPIPE if lost.
         */
-       if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
+       if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
+           (fileproc_get_vflags(fp) & FPV_DRAIN)) {
                PIPE_UNLOCK(rpipe);
                return EPIPE;
        }
@@ -999,7 +1016,8 @@ retrywrite:
                                int size;       /* Transfer size */
                                int segsize;    /* first segment to transfer */
 
-                               if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
+                               if ((wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
+                                   (fileproc_get_vflags(fp) & FPV_DRAIN)) {
                                        pipeio_unlock(wpipe);
                                        error = EPIPE;
                                        break;
@@ -1099,21 +1117,23 @@ retrywrite:
                                wpipe->pipe_state &= ~PIPE_WANTR;
                                wakeup(wpipe);
                        }
+
                        /*
-                        * don't block on non-blocking I/O
-                        * we'll do the pipeselwakeup on the way out
+                        * If the read side wants to go away, we just issue a signal
+                        * to ourselves.
                         */
-                       if (fp->f_flag & FNONBLOCK) {
-                               error = EAGAIN;
+                       if ((wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
+                           (fileproc_get_vflags(fp) & FPV_DRAIN)) {
+                               error = EPIPE;
                                break;
                        }
 
                        /*
-                        * If read side wants to go away, we just issue a signal
-                        * to ourselves.
+                        * don't block on non-blocking I/O;
+                        * we'll do the pipeselwakeup on the way out
                         */
-                       if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
-                               error = EPIPE;
+                       if (fp->f_flag & FNONBLOCK) {
+                               error = EAGAIN;
                                break;
                        }
 
@@ -1254,7 +1274,8 @@ pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
        case FREAD:
                if ((rpipe->pipe_state & PIPE_DIRECTW) ||
                    (rpipe->pipe_buffer.cnt > 0) ||
-                   (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
+                   (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
+                   (fileproc_get_vflags(fp) & FPV_DRAIN)) {
                        retnum = 1;
                } else {
                        rpipe->pipe_state |= PIPE_SEL;
@@ -1267,6 +1288,7 @@ pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
                        wpipe->pipe_state |= PIPE_WSELECT;
                }
                if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
+                   (fileproc_get_vflags(fp) & FPV_DRAIN) ||
                    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
                    (MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) {
                        retnum = 1;
@@ -1324,14 +1346,7 @@ pipeclose(struct pipe *cpipe)
 {
        struct pipe *ppipe;
 
-       if (cpipe == NULL) {
-               return;
-       }
-       /* partially created pipes won't have a valid mutex. */
-       if (PIPE_MTX(cpipe) != NULL) {
-               PIPE_LOCK(cpipe);
-       }
-
+       PIPE_LOCK(cpipe);
 
        /*
         * If the other side is blocked, wake it up saying that
@@ -1367,9 +1382,7 @@ pipeclose(struct pipe *cpipe)
                pipeselwakeup(ppipe, ppipe);
                wakeup(ppipe);
 
-               if (cpipe->pipe_state & PIPE_KNOTE) {
-                       KNOTE(&ppipe->pipe_sel.si_note, 1);
-               }
+               KNOTE(&ppipe->pipe_sel.si_note, 1);
 
                postpipeevent(ppipe, EV_RCLOSED);
 
@@ -1380,76 +1393,114 @@ pipeclose(struct pipe *cpipe)
        /*
         * free resources
         */
-       if (PIPE_MTX(cpipe) != NULL) {
-               if (ppipe != NULL) {
-                       /*
-                        * since the mutex is shared and the peer is still
-                        * alive, we need to release the mutex, not free it
-                        */
-                       PIPE_UNLOCK(cpipe);
-               } else {
-                       /*
-                        * peer is gone, so we're the sole party left with
-                        * interest in this mutex... unlock and free it
-                        */
-                       PIPE_UNLOCK(cpipe);
-                       lck_mtx_free(PIPE_MTX(cpipe), pipe_mtx_grp);
-               }
+
+       PIPE_UNLOCK(cpipe);
+
+       pipepair_destroy_pipe(PIPE_PAIR(cpipe), cpipe);
+}
+
+static int64_t
+filt_pipelowwat(struct knote *kn, struct pipe *rpipe, int64_t def_lowwat)
+{
+       if ((kn->kn_sfflags & NOTE_LOWAT) == 0) {
+               return def_lowwat;
        }
-       pipe_free_kmem(cpipe);
-       if (cpipe->pipe_state & PIPE_WSELECT) {
-               pipe_garbage_collect(cpipe);
-       } else {
-               zfree(pipe_zone, cpipe);
-               pipe_garbage_collect(NULL);
+       if (rpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(rpipe)) {
+               return MAX_PIPESIZE(rpipe);
        }
+       return MAX(kn->kn_sdata, def_lowwat);
 }
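filt_pipelowwat() honors a user-supplied NOTE_LOWAT threshold, clamped to the pipe's buffer size. A userspace sketch of registering such a filter with the standard kqueue API:

    #include <sys/event.h>

    /* Fire the read filter only once at least 4 KiB is buffered;
     * the kernel clamps oversized thresholds to MAX_PIPESIZE(). */
    static int
    watch_pipe_lowat(int kq, int pipe_read_fd)
    {
        struct kevent kev;
        EV_SET(&kev, pipe_read_fd, EVFILT_READ, EV_ADD, NOTE_LOWAT,
            4096, NULL);
        return kevent(kq, &kev, 1, NULL, 0, NULL);
    }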
 
-/*ARGSUSED*/
 static int
-filt_piperead_common(struct knote *kn, struct pipe *rpipe)
+filt_pipe_draincommon(struct knote *kn, struct pipe *rpipe)
 {
-       struct pipe *wpipe;
-       int    retval;
-
-       /*
-        * we're being called back via the KNOTE post
-        * we made in pipeselwakeup, and we already hold the mutex...
-        */
+       struct pipe *wpipe = rpipe->pipe_peer;
 
-       wpipe = rpipe->pipe_peer;
-       kn->kn_data = rpipe->pipe_buffer.cnt;
        if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
            (wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
                kn->kn_flags |= EV_EOF;
-               retval = 1;
+               return 1;
+       }
+
+       return 0;
+}
+
+static int
+filt_pipenotsup(struct knote *kn, long hint)
+{
+#pragma unused(hint)
+       struct pipe *rpipe = kn->kn_hook;
+
+       return filt_pipe_draincommon(kn, rpipe);
+}
+
+static int
+filt_pipenotsuptouch(struct knote *kn, struct kevent_qos_s *kev)
+{
+       struct pipe *rpipe = kn->kn_hook;
+       int res;
+
+       PIPE_LOCK(rpipe);
+
+       /* accept new kevent data (and save off lowat threshold and flag) */
+       kn->kn_sfflags = kev->fflags;
+       kn->kn_sdata = kev->data;
+
+       /* determine if any event is now deemed fired */
+       res = filt_pipe_draincommon(kn, rpipe);
+
+       PIPE_UNLOCK(rpipe);
+
+       return res;
+}
+
+static int
+filt_pipenotsupprocess(struct knote *kn, struct kevent_qos_s *kev)
+{
+       struct pipe *rpipe = kn->kn_hook;
+       int res;
+
+       PIPE_LOCK(rpipe);
+       res = filt_pipe_draincommon(kn, rpipe);
+       if (res) {
+               knote_fill_kevent(kn, kev, 0);
+       }
+       PIPE_UNLOCK(rpipe);
+
+       return res;
+}
+
+/*ARGSUSED*/
+static int
+filt_piperead_common(struct knote *kn, struct kevent_qos_s *kev, struct pipe *rpipe)
+{
+       int64_t data = rpipe->pipe_buffer.cnt;
+       int res = 0;
+
+       if (filt_pipe_draincommon(kn, rpipe)) {
+               res = 1;
        } else {
-               int64_t lowwat = 1;
-               if (kn->kn_sfflags & NOTE_LOWAT) {
-                       if (rpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(rpipe)) {
-                               lowwat = MAX_PIPESIZE(rpipe);
-                       } else if (kn->kn_sdata > lowwat) {
-                               lowwat = kn->kn_sdata;
-                       }
-               }
-               retval = kn->kn_data >= lowwat;
+               res = data >= filt_pipelowwat(kn, rpipe, 1);
        }
-       return retval;
+       if (res && kev) {
+               knote_fill_kevent(kn, kev, data);
+       }
+       return res;
 }
 
 static int
 filt_piperead(struct knote *kn, long hint)
 {
 #pragma unused(hint)
-       struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
+       struct pipe *rpipe = kn->kn_hook;
 
-       return filt_piperead_common(kn, rpipe);
+       return filt_piperead_common(kn, NULL, rpipe);
 }
 
 static int
-filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_pipereadtouch(struct knote *kn, struct kevent_qos_s *kev)
 {
-       struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
+       struct pipe *rpipe = kn->kn_hook;
        int retval;
 
        PIPE_LOCK(rpipe);
@@ -1459,7 +1510,7 @@ filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev)
        kn->kn_sfflags = kev->fflags;
 
        /* identify if any events are now fired */
-       retval = filt_piperead_common(kn, rpipe);
+       retval = filt_piperead_common(kn, NULL, rpipe);
 
        PIPE_UNLOCK(rpipe);
 
@@ -1467,21 +1518,13 @@ filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_pipereadprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_pipereadprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
-       struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
+       struct pipe *rpipe = kn->kn_hook;
        int    retval;
 
        PIPE_LOCK(rpipe);
-       retval = filt_piperead_common(kn, rpipe);
-       if (retval) {
-               *kev = kn->kn_kevent;
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_fflags = 0;
-                       kn->kn_data = 0;
-               }
-       }
+       retval = filt_piperead_common(kn, kev, rpipe);
        PIPE_UNLOCK(rpipe);
 
        return retval;
@@ -1489,33 +1532,21 @@ filt_pipereadprocess(struct knote *kn, struct filt_process_s *data, struct keven
 
 /*ARGSUSED*/
 static int
-filt_pipewrite_common(struct knote *kn, struct pipe *rpipe)
+filt_pipewrite_common(struct knote *kn, struct kevent_qos_s *kev, struct pipe *rpipe)
 {
-       struct pipe *wpipe;
-
-       /*
-        * we're being called back via the KNOTE post
-        * we made in pipeselwakeup, and we already hold the mutex...
-        */
-       wpipe = rpipe->pipe_peer;
+       int64_t data = 0;
+       int res = 0;
 
-       if ((wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
-               kn->kn_data = 0;
-               kn->kn_flags |= EV_EOF;
-               return 1;
+       if (filt_pipe_draincommon(kn, rpipe)) {
+               res = 1;
+       } else {
+               data = MAX_PIPESIZE(rpipe) - rpipe->pipe_buffer.cnt;
+               res = data >= filt_pipelowwat(kn, rpipe, PIPE_BUF);
        }
-       kn->kn_data = MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt;
-
-       int64_t lowwat = PIPE_BUF;
-       if (kn->kn_sfflags & NOTE_LOWAT) {
-               if (wpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(wpipe)) {
-                       lowwat = MAX_PIPESIZE(wpipe);
-               } else if (kn->kn_sdata > lowwat) {
-                       lowwat = kn->kn_sdata;
-               }
+       if (res && kev) {
+               knote_fill_kevent(kn, kev, data);
        }
-
-       return kn->kn_data >= lowwat;
+       return res;
 }
 
 /*ARGSUSED*/
@@ -1523,16 +1554,16 @@ static int
 filt_pipewrite(struct knote *kn, long hint)
 {
 #pragma unused(hint)
-       struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
+       struct pipe *rpipe = kn->kn_hook;
 
-       return filt_pipewrite_common(kn, rpipe);
+       return filt_pipewrite_common(kn, NULL, rpipe);
 }
 
 
 static int
-filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_pipewritetouch(struct knote *kn, struct kevent_qos_s *kev)
 {
-       struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
+       struct pipe *rpipe = kn->kn_hook;
        int res;
 
        PIPE_LOCK(rpipe);
@@ -1542,7 +1573,7 @@ filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev)
        kn->kn_sdata = kev->data;
 
        /* determine if any event is now deemed fired */
-       res = filt_pipewrite_common(kn, rpipe);
+       res = filt_pipewrite_common(kn, NULL, rpipe);
 
        PIPE_UNLOCK(rpipe);
 
@@ -1550,21 +1581,13 @@ filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_pipewriteprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_pipewriteprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
-       struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
+       struct pipe *rpipe = kn->kn_hook;
        int res;
 
        PIPE_LOCK(rpipe);
-       res = filt_pipewrite_common(kn, rpipe);
-       if (res) {
-               *kev = kn->kn_kevent;
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_fflags = 0;
-                       kn->kn_data = 0;
-               }
-       }
+       res = filt_pipewrite_common(kn, kev, rpipe);
        PIPE_UNLOCK(rpipe);
 
        return res;
@@ -1572,10 +1595,11 @@ filt_pipewriteprocess(struct knote *kn, struct filt_process_s *data, struct keve
 
 /*ARGSUSED*/
 static int
-pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn,
-    __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx)
+pipe_kqfilter(struct fileproc *fp, struct knote *kn,
+    __unused struct kevent_qos_s *kev)
 {
-       struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
+       struct pipe *cpipe = (struct pipe *)fp->f_data;
+       struct pipe *rpipe = &PIPE_PAIR(cpipe)->pp_rpipe;
        int res;
 
        PIPE_LOCK(cpipe);
@@ -1585,51 +1609,56 @@ pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn,
         * XXX process credential should have a persistent reference on it
         * XXX before being passed in here.
         */
-       if (mac_pipe_check_kqfilter(vfs_context_ucred(ctx), kn, cpipe) != 0) {
+       kauth_cred_t cred = vfs_context_ucred(vfs_context_current());
+       if (mac_pipe_check_kqfilter(cred, kn, cpipe) != 0) {
                PIPE_UNLOCK(cpipe);
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = EPERM;
+               knote_set_error(kn, EPERM);
                return 0;
        }
 #endif
 
+       /*
+        * FreeBSD will fail the attach with EPIPE if the peer pipe is detached;
+        * however, this isn't a programming error, as the other side closing
+        * could race with the kevent registration.
+        *
+        * Attach should only fail for programming mistakes, else it will break
+        * libdispatch.
+        *
+        * Like FreeBSD, have a "Neutered" filter that, when the wrong filter
+        * is attached to the wrong end, will not fire until the pipe dies.
+        *
+        * Knotes are always attached to the "rpipe".
+        */
        switch (kn->kn_filter) {
        case EVFILT_READ:
-               kn->kn_filtid = EVFILTID_PIPE_R;
-
-               /* determine initial state */
-               res = filt_piperead_common(kn, cpipe);
+               if (fp->f_flag & FREAD) {
+                       kn->kn_filtid = EVFILTID_PIPE_R;
+                       res = filt_piperead_common(kn, NULL, rpipe);
+               } else {
+                       kn->kn_filtid = EVFILTID_PIPE_N;
+                       res = filt_pipe_draincommon(kn, rpipe);
+               }
                break;
 
        case EVFILT_WRITE:
-               kn->kn_filtid = EVFILTID_PIPE_W;
-
-               if (cpipe->pipe_peer == NULL) {
-                       /*
-                        * other end of pipe has been closed
-                        */
-                       PIPE_UNLOCK(cpipe);
-                       kn->kn_flags = EV_ERROR;
-                       kn->kn_data = EPIPE;
-                       return 0;
-               }
-               if (cpipe->pipe_peer) {
-                       cpipe = cpipe->pipe_peer;
+               if (fp->f_flag & FWRITE) {
+                       kn->kn_filtid = EVFILTID_PIPE_W;
+                       res = filt_pipewrite_common(kn, NULL, rpipe);
+               } else {
+                       kn->kn_filtid = EVFILTID_PIPE_N;
+                       res = filt_pipe_draincommon(kn, rpipe);
                }
-
-               /* determine inital state */
-               res = filt_pipewrite_common(kn, cpipe);
                break;
+
        default:
                PIPE_UNLOCK(cpipe);
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = EINVAL;
+               knote_set_error(kn, EINVAL);
                return 0;
        }
 
-       if (KNOTE_ATTACH(&cpipe->pipe_sel.si_note, kn)) {
-               cpipe->pipe_state |= PIPE_KNOTE;
-       }
+       kn->kn_hook = rpipe;
+       KNOTE_ATTACH(&rpipe->pipe_sel.si_note, kn);
 
        PIPE_UNLOCK(cpipe);
        return res;
@@ -1639,21 +1668,10 @@ static void
 filt_pipedetach(struct knote *kn)
 {
        struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
+       struct pipe *rpipe = &PIPE_PAIR(cpipe)->pp_rpipe;
 
        PIPE_LOCK(cpipe);
-
-       if (kn->kn_filter == EVFILT_WRITE) {
-               if (cpipe->pipe_peer == NULL) {
-                       PIPE_UNLOCK(cpipe);
-                       return;
-               }
-               cpipe = cpipe->pipe_peer;
-       }
-       if (cpipe->pipe_state & PIPE_KNOTE) {
-               if (KNOTE_DETACH(&cpipe->pipe_sel.si_note, kn)) {
-                       cpipe->pipe_state &= ~PIPE_KNOTE;
-               }
-       }
+       KNOTE_DETACH(&rpipe->pipe_sel.si_note, kn);
        PIPE_UNLOCK(cpipe);
 }
 
@@ -1734,8 +1752,8 @@ fill_pipeinfo(struct pipe * cpipe, struct pipe_info * pinfo)
         * XXX (st_dev, st_ino) should be unique.
         */
 
-       pinfo->pipe_handle = (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
-       pinfo->pipe_peerhandle = (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(cpipe->pipe_peer));
+       pinfo->pipe_handle = (uint64_t)VM_KERNEL_ADDRHASH((uintptr_t)cpipe);
+       pinfo->pipe_peerhandle = (uint64_t)VM_KERNEL_ADDRHASH((uintptr_t)(cpipe->pipe_peer));
        pinfo->pipe_status = cpipe->pipe_state;
 
        PIPE_UNLOCK(cpipe);
@@ -1749,17 +1767,30 @@ pipe_drain(struct fileproc *fp, __unused vfs_context_t ctx)
 {
        /* Note: fdlock already held */
        struct pipe *ppipe, *cpipe = (struct pipe *)(fp->f_fglob->fg_data);
+       boolean_t drain_pipe = FALSE;
+
+       /* Check if the pipe is going away */
+       lck_mtx_lock_spin(&fp->f_fglob->fg_lock);
+       if (fp->f_fglob->fg_count == 1) {
+               drain_pipe = TRUE;
+       }
+       lck_mtx_unlock(&fp->f_fglob->fg_lock);
 
        if (cpipe) {
                PIPE_LOCK(cpipe);
-               cpipe->pipe_state |= PIPE_DRAIN;
-               cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
+
+               if (drain_pipe) {
+                       cpipe->pipe_state |= PIPE_DRAIN;
+                       cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
+               }
                wakeup(cpipe);
 
                /* Must wake up peer: a writer sleeps on the read side */
                if ((ppipe = cpipe->pipe_peer)) {
-                       ppipe->pipe_state |= PIPE_DRAIN;
-                       ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
+                       if (drain_pipe) {
+                               ppipe->pipe_state |= PIPE_DRAIN;
+                               ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
+                       }
                        wakeup(ppipe);
                }
 
@@ -1769,80 +1800,3 @@ pipe_drain(struct fileproc *fp, __unused vfs_context_t ctx)
 
        return 1;
 }
-
-
-/*
- * When a thread sets a write-select on a pipe, it creates an implicit,
- * untracked dependency between that thread and the peer of the pipe
- * on which the select is set.  If the peer pipe is closed and freed
- * before the select()ing thread wakes up, the system will panic as
- * it attempts to unwind the dangling select().  To avoid that panic,
- * we notice whenever a dangerous select() is set on a pipe, and
- * defer the final deletion of the pipe until that select()s are all
- * resolved.  Since we can't currently detect exactly when that
- * resolution happens, we use a simple garbage collection queue to
- * reap the at-risk pipes 'later'.
- */
-static void
-pipe_garbage_collect(struct pipe *cpipe)
-{
-       uint64_t old, now;
-       struct pipe_garbage *pgp;
-
-       /* Convert msecs to nsecs and then to abstime */
-       old = pipe_garbage_age_limit * 1000000;
-       nanoseconds_to_absolutetime(old, &old);
-
-       lck_mtx_lock(pipe_garbage_lock);
-
-       /* Free anything that's been on the queue for <mumble> seconds */
-       now = mach_absolute_time();
-       old = now - old;
-       while ((pgp = pipe_garbage_head) && pgp->pg_timestamp < old) {
-               pipe_garbage_head = pgp->pg_next;
-               if (pipe_garbage_head == NULL) {
-                       pipe_garbage_tail = NULL;
-               }
-               pipe_garbage_count--;
-               zfree(pipe_zone, pgp->pg_pipe);
-               zfree(pipe_garbage_zone, pgp);
-       }
-
-       /* Add the new pipe (if any) to the tail of the garbage queue */
-       if (cpipe) {
-               cpipe->pipe_state = PIPE_DEAD;
-               pgp = (struct pipe_garbage *)zalloc(pipe_garbage_zone);
-               if (pgp == NULL) {
-                       /*
-                        * We're too low on memory to garbage collect the
-                        * pipe.  Freeing it runs the risk of panicing the
-                        * system.  All we can do is leak it and leave
-                        * a breadcrumb behind.  The good news, such as it
-                        * is, is that this will probably never happen.
-                        * We will probably hit the panic below first.
-                        */
-                       printf("Leaking pipe %p - no room left in the queue",
-                           cpipe);
-                       lck_mtx_unlock(pipe_garbage_lock);
-                       return;
-               }
-
-               pgp->pg_pipe = cpipe;
-               pgp->pg_timestamp = now;
-               pgp->pg_next = NULL;
-
-               if (pipe_garbage_tail) {
-                       pipe_garbage_tail->pg_next = pgp;
-               }
-               pipe_garbage_tail = pgp;
-               if (pipe_garbage_head == NULL) {
-                       pipe_garbage_head = pipe_garbage_tail;
-               }
-
-               if (pipe_garbage_count++ >= PIPE_GARBAGE_QUEUE_LIMIT) {
-                       panic("Length of pipe garbage queue exceeded %d",
-                           PIPE_GARBAGE_QUEUE_LIMIT);
-               }
-       }
-       lck_mtx_unlock(pipe_garbage_lock);
-}
index 518df53ab2b3fe21bc6ead368f023647147d3be3..70493f9741c3096e9b9b0b4f9ba2b6135e5c0f6f 100644 (file)
@@ -50,6 +50,8 @@ lck_grp_attr_t  *os_reason_lock_grp_attr;
 lck_grp_t       *os_reason_lock_grp;
 lck_attr_t      *os_reason_lock_attr;
 
+os_refgrp_decl(static, os_reason_refgrp, "os_reason", NULL);
+
 #define OS_REASON_RESERVE_COUNT 100
 #define OS_REASON_MAX_COUNT     (maxproc + 100)
 
@@ -131,7 +133,7 @@ os_reason_create(uint32_t osr_namespace, uint64_t osr_code)
        new_reason->osr_kcd_buf = NULL;
 
        lck_mtx_init(&new_reason->osr_lock, os_reason_lock_grp, os_reason_lock_attr);
-       new_reason->osr_refcount = 1;
+       os_ref_init(&new_reason->osr_refcount, &os_reason_refgrp);
 
        return new_reason;
 }
@@ -276,14 +278,8 @@ os_reason_ref(os_reason_t cur_reason)
        }
 
        lck_mtx_lock(&cur_reason->osr_lock);
-
-       assert(cur_reason->osr_refcount > 0);
-       if (os_add_overflow(cur_reason->osr_refcount, 1, &cur_reason->osr_refcount)) {
-               panic("os reason refcount overflow");
-       }
-
+       os_ref_retain_locked(&cur_reason->osr_refcount);
        lck_mtx_unlock(&cur_reason->osr_lock);
-
        return;
 }
 
@@ -300,12 +296,7 @@ os_reason_free(os_reason_t cur_reason)
 
        lck_mtx_lock(&cur_reason->osr_lock);
 
-       if (cur_reason->osr_refcount == 0) {
-               panic("os_reason_free called on reason with zero refcount");
-       }
-
-       cur_reason->osr_refcount--;
-       if (cur_reason->osr_refcount != 0) {
+       if (os_ref_release_locked(&cur_reason->osr_refcount) > 0) {
                lck_mtx_unlock(&cur_reason->osr_lock);
                return;
        }
@@ -317,3 +308,44 @@ os_reason_free(os_reason_t cur_reason)
 
        zfree(os_reason_zone, cur_reason);
 }
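The conversion above replaces hand-rolled counting with the os_refcnt API, which panics on over-retain and over-release instead of silently corrupting the count. A sketch of the same externally-locked pattern (os_ref_* names are the real API from the kernel's os_refcnt header; the object below is illustrative):

    /* Illustrative object guarded by its own mutex, counted with os_refcnt */
    struct obj {
        lck_mtx_t   o_lock;
        os_refcnt_t o_refcount;     /* os_ref_init()ed to 1 at creation */
    };

    static void
    obj_retain(struct obj *o)
    {
        lck_mtx_lock(&o->o_lock);
        os_ref_retain_locked(&o->o_refcount);   /* panics if already zero */
        lck_mtx_unlock(&o->o_lock);
    }

    /* Returns true when the caller must free the object */
    static bool
    obj_release(struct obj *o)
    {
        lck_mtx_lock(&o->o_lock);
        os_ref_count_t rc = os_ref_release_locked(&o->o_refcount);
        lck_mtx_unlock(&o->o_lock);
        return rc == 0;
    }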
+
+/*
+ * Sets flags on the passed reason.
+ */
+void
+os_reason_set_flags(os_reason_t cur_reason, uint64_t flags)
+{
+       if (cur_reason == OS_REASON_NULL) {
+               return;
+       }
+
+       lck_mtx_lock(&cur_reason->osr_lock);
+       cur_reason->osr_flags = flags;
+       lck_mtx_unlock(&cur_reason->osr_lock);
+}
+
+/*
+ * Allocates space and sets description data in kcd_descriptor on the passed reason.
+ */
+void
+os_reason_set_description_data(os_reason_t cur_reason, uint32_t type, void *reason_data, uint32_t reason_data_len)
+{
+       mach_vm_address_t osr_data_addr = 0;
+
+       if (cur_reason == OS_REASON_NULL) {
+               return;
+       }
+
+       if (0 != os_reason_alloc_buffer(cur_reason, kcdata_estimate_required_buffer_size(1, reason_data_len))) {
+               panic("os_reason failed to allocate");
+       }
+
+       lck_mtx_lock(&cur_reason->osr_lock);
+       if (KERN_SUCCESS != kcdata_get_memory_addr(&cur_reason->osr_kcd_descriptor, type, reason_data_len, &osr_data_addr)) {
+               panic("os_reason failed to get data address");
+       }
+       if (KERN_SUCCESS != kcdata_memcpy(&cur_reason->osr_kcd_descriptor, osr_data_addr, reason_data, reason_data_len)) {
+               panic("os_reason failed to copy description data");
+       }
+       lck_mtx_unlock(&cur_reason->osr_lock);
+}
index 9988ba7c68db4a9c91d5413df71def023aaf389c..53e8f07b58167e89ee3b2261893b2fcf61fb962a 100644 (file)
@@ -98,14 +98,14 @@ static int soo_close(struct fileglob *, vfs_context_t ctx);
 static int soo_drain(struct fileproc *, vfs_context_t ctx);
 
 const struct fileops socketops = {
-       .fo_type = DTYPE_SOCKET,
-       .fo_read = soo_read,
-       .fo_write = soo_write,
-       .fo_ioctl = soo_ioctl,
-       .fo_select = soo_select,
-       .fo_close = soo_close,
+       .fo_type     = DTYPE_SOCKET,
+       .fo_read     = soo_read,
+       .fo_write    = soo_write,
+       .fo_ioctl    = soo_ioctl,
+       .fo_select   = soo_select,
+       .fo_close    = soo_close,
+       .fo_drain    = soo_drain,
        .fo_kqfilter = soo_kqfilter,
-       .fo_drain = soo_drain,
 };
 
 /* ARGSUSED */
index b8046c66e0c8b065b8db3b38c82e718ca1ac9209..dce4c3aec295b95ce31879006a5de221cef3471f 100644 (file)
@@ -26,6 +26,8 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+#include <machine/atomic.h>
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/ioctl.h>
@@ -101,16 +103,44 @@ typedef lck_spin_t ull_lock_t;
 #define ULOCK_TO_EVENT(ull)   ((event_t)ull)
 #define EVENT_TO_ULOCK(event) ((ull_t *)event)
 
-typedef struct __attribute__((packed)) {
-       user_addr_t     ulk_addr;
-       pid_t           ulk_pid;
+typedef enum {
+       ULK_INVALID = 0,
+       ULK_UADDR,
+       ULK_XPROC,
+} ulk_type;
+
+typedef struct {
+       union {
+               struct __attribute__((packed)) {
+                       user_addr_t     ulk_addr;
+                       pid_t           ulk_pid;
+               };
+               struct __attribute__((packed)) {
+                       uint64_t        ulk_object;
+                       uint64_t        ulk_offset;
+               };
+       };
+       ulk_type        ulk_key_type;
 } ulk_t;
 
+#define ULK_UADDR_LEN   (sizeof(user_addr_t) + sizeof(pid_t))
+#define ULK_XPROC_LEN   (sizeof(uint64_t) + sizeof(uint64_t))
+
 inline static bool
 ull_key_match(ulk_t *a, ulk_t *b)
 {
-       return (a->ulk_pid == b->ulk_pid) &&
-              (a->ulk_addr == b->ulk_addr);
+       if (a->ulk_key_type != b->ulk_key_type) {
+               return false;
+       }
+
+       if (a->ulk_key_type == ULK_UADDR) {
+               return (a->ulk_pid == b->ulk_pid) &&
+                      (a->ulk_addr == b->ulk_addr);
+       }
+
+       assert(a->ulk_key_type == ULK_XPROC);
+       return (a->ulk_object == b->ulk_object) &&
+              (a->ulk_offset == b->ulk_offset);
 }
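With the union key, only the bytes that are actually populated may feed the hash, hence the ULK_UADDR_LEN/ULK_XPROC_LEN constants consumed by ULL_INDEX further down. A sketch of constructing each flavor, using the ulk_t fields defined above:

    /* Process-local key: (pid, user address) identifies the lock word */
    static void
    ulk_init_uaddr(ulk_t *key, pid_t pid, user_addr_t addr)
    {
        key->ulk_pid = pid;
        key->ulk_addr = addr;
        key->ulk_key_type = ULK_UADDR;
    }

    /* Cross-process key: (VM object, offset), so every mapping of the
     * same shared page resolves to the same ull_t */
    static void
    ulk_init_xproc(ulk_t *key, uint64_t object, uint64_t offset)
    {
        key->ulk_object = object;
        key->ulk_offset = offset;
        key->ulk_key_type = ULK_XPROC;
    }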
 
 typedef struct ull {
@@ -120,11 +150,9 @@ typedef struct ull {
         */
        thread_t        ull_owner; /* holds +1 thread reference */
        ulk_t           ull_key;
-       ulk_t           ull_saved_key;
        ull_lock_t      ull_lock;
        uint            ull_bucket_index;
        int32_t         ull_nwaiters;
-       int32_t         ull_max_nwaiters;
        int32_t         ull_refcount;
        uint8_t         ull_opcode;
        struct turnstile *ull_turnstile;
@@ -134,9 +162,13 @@ typedef struct ull {
 extern void ulock_initialize(void);
 
 #define ULL_MUST_EXIST  0x0001
-static ull_t *ull_get(ulk_t *, uint32_t, ull_t **);
 static void ull_put(ull_t *);
 
+static uint32_t ulock_adaptive_spin_usecs = 20;
+
+SYSCTL_INT(_kern, OID_AUTO, ulock_adaptive_spin_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &ulock_adaptive_spin_usecs, 0, "ulock adaptive spin duration");
+
 #if DEVELOPMENT || DEBUG
 static int ull_simulate_copyin_fault = 0;
 
@@ -144,12 +176,22 @@ static void
 ull_dump(ull_t *ull)
 {
        kprintf("ull\t%p\n", ull);
-       kprintf("ull_key.ulk_pid\t%d\n", ull->ull_key.ulk_pid);
-       kprintf("ull_key.ulk_addr\t%p\n", (void *)(ull->ull_key.ulk_addr));
-       kprintf("ull_saved_key.ulk_pid\t%d\n", ull->ull_saved_key.ulk_pid);
-       kprintf("ull_saved_key.ulk_addr\t%p\n", (void *)(ull->ull_saved_key.ulk_addr));
+       switch (ull->ull_key.ulk_key_type) {
+       case ULK_UADDR:
+               kprintf("ull_key.ulk_key_type\tULK_UADDR\n");
+               kprintf("ull_key.ulk_pid\t%d\n", ull->ull_key.ulk_pid);
+               kprintf("ull_key.ulk_addr\t%p\n", (void *)(ull->ull_key.ulk_addr));
+               break;
+       case ULK_XPROC:
+               kprintf("ull_key.ulk_key_type\tULK_XPROC\n");
+               kprintf("ull_key.ulk_object\t%p\n", (void *)(ull->ull_key.ulk_object));
+               kprintf("ull_key.ulk_offset\t%p\n", (void *)(ull->ull_key.ulk_offset));
+               break;
+       default:
+               kprintf("ull_key.ulk_key_type\tUNKNOWN %d\n", ull->ull_key.ulk_key_type);
+               break;
+       }
        kprintf("ull_nwaiters\t%d\n", ull->ull_nwaiters);
-       kprintf("ull_max_nwaiters\t%d\n", ull->ull_max_nwaiters);
        kprintf("ull_refcount\t%d\n", ull->ull_refcount);
        kprintf("ull_opcode\t%d\n\n", ull->ull_opcode);
        kprintf("ull_owner\t0x%llx\n\n", thread_tid(ull->ull_owner));
@@ -180,13 +222,7 @@ ull_hash_index(const void *key, size_t length)
        return hash;
 }
 
-/* Ensure that the key structure is packed,
- * so that no undefined memory is passed to
- * ull_hash_index()
- */
-static_assert(sizeof(ulk_t) == sizeof(user_addr_t) + sizeof(pid_t));
-
-#define ULL_INDEX(keyp) ull_hash_index(keyp, sizeof *keyp)
+#define ULL_INDEX(keyp) ull_hash_index(keyp, keyp->ulk_key_type == ULK_UADDR ? ULK_UADDR_LEN : ULK_XPROC_LEN)
 
 void
 ulock_initialize(void)
@@ -215,6 +251,7 @@ ulock_initialize(void)
            0, "ulocks");
 
        zone_change(ull_zone, Z_NOENCRYPT, TRUE);
+       zone_change(ull_zone, Z_CACHING_ENABLED, TRUE);
 }
 
 #if DEVELOPMENT || DEBUG
@@ -237,7 +274,7 @@ ull_hash_dump(pid_t pid)
                                kprintf("%s>index %d:\n", __FUNCTION__, i);
                        }
                        qe_foreach_element(elem, &ull_bucket[i].ulb_head, ull_hash_link) {
-                               if ((pid == 0) || (pid == elem->ull_key.ulk_pid)) {
+                               if ((pid == 0) || ((elem->ull_key.ulk_key_type == ULK_UADDR) && (pid == elem->ull_key.ulk_pid))) {
                                        ull_dump(elem);
                                        count++;
                                }
@@ -261,10 +298,8 @@ ull_alloc(ulk_t *key)
 
        ull->ull_refcount = 1;
        ull->ull_key = *key;
-       ull->ull_saved_key = *key;
        ull->ull_bucket_index = ULL_INDEX(key);
        ull->ull_nwaiters = 0;
-       ull->ull_max_nwaiters = 0;
        ull->ull_opcode = 0;
 
        ull->ull_owner = THREAD_NULL;
@@ -351,7 +386,7 @@ ull_put(ull_t *ull)
 {
        ull_assert_owned(ull);
        int refcount = --ull->ull_refcount;
-       assert(refcount == 0 ? (ull->ull_key.ulk_pid == 0 && ull->ull_key.ulk_addr == 0) : 1);
+       assert(refcount == 0 ? (ull->ull_key.ulk_key_type == ULK_INVALID) : 1);
        ull_unlock(ull);
 
        if (refcount > 0) {
@@ -365,6 +400,31 @@ ull_put(ull_t *ull)
        ull_free(ull);
 }
 
+extern kern_return_t vm_map_page_info(vm_map_t map, vm_map_offset_t offset, vm_page_info_flavor_t flavor, vm_page_info_t info, mach_msg_type_number_t *count);
+extern vm_map_t current_map(void);
+extern boolean_t machine_thread_on_core(thread_t thread);
+
+static int
+uaddr_findobj(user_addr_t uaddr, uint64_t *objectp, uint64_t *offsetp)
+{
+       kern_return_t ret;
+       vm_page_info_basic_data_t info;
+       mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT;
+       ret = vm_map_page_info(current_map(), uaddr, VM_PAGE_INFO_BASIC, (vm_page_info_t)&info, &count);
+       if (ret != KERN_SUCCESS) {
+               return EINVAL;
+       }
+
+       if (objectp != NULL) {
+               *objectp = (uint64_t)info.object_id;
+       }
+       if (offsetp != NULL) {
+               *offsetp = (uint64_t)info.offset;
+       }
+
+       return 0;
+}
+
 static void ulock_wait_continue(void *, wait_result_t);
 static void ulock_wait_cleanup(ull_t *, thread_t, thread_t, int32_t *);
 
@@ -389,6 +449,24 @@ wait_result_to_return_code(wait_result_t wr)
        return ret;
 }
 
+static int
+ulock_resolve_owner(uint32_t value, thread_t *owner)
+{
+       mach_port_name_t owner_name = ulock_owner_value_to_port_name(value);
+
+       *owner = port_name_to_thread(owner_name,
+           PORT_TO_THREAD_IN_CURRENT_TASK |
+           PORT_TO_THREAD_NOT_CURRENT_THREAD);
+       if (*owner == THREAD_NULL) {
+               /*
+                * Translation failed - even though the lock value is up to date,
+                * whatever was stored in the lock wasn't actually a thread port.
+                */
+               return owner_name == MACH_PORT_DEAD ? ESRCH : EOWNERDEAD;
+       }
+       return 0;
+}
+
 int
 ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
 {
@@ -414,29 +492,96 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
                goto munge_retval;
        }
 
-       boolean_t set_owner = FALSE;
+       bool set_owner = false;
+       bool xproc = false;
+       size_t lock_size = sizeof(uint32_t);
+       int copy_ret;
 
        switch (opcode) {
        case UL_UNFAIR_LOCK:
-               set_owner = TRUE;
+               set_owner = true;
                break;
        case UL_COMPARE_AND_WAIT:
                break;
+       case UL_COMPARE_AND_WAIT64:
+               lock_size = sizeof(uint64_t);
+               break;
+       case UL_COMPARE_AND_WAIT_SHARED:
+               xproc = true;
+               break;
+       case UL_COMPARE_AND_WAIT64_SHARED:
+               xproc = true;
+               lock_size = sizeof(uint64_t);
+               break;
        default:
                ret = EINVAL;
                goto munge_retval;
        }
 
-       /* 32-bit lock type for UL_COMPARE_AND_WAIT and UL_UNFAIR_LOCK */
-       uint32_t value = 0;
+       uint64_t value = 0;
 
-       if ((args->addr == 0) || (args->addr % _Alignof(_Atomic(typeof(value))))) {
+       if ((args->addr == 0) || (args->addr & (lock_size - 1))) {
                ret = EINVAL;
                goto munge_retval;
        }
 
-       key.ulk_pid = p->p_pid;
-       key.ulk_addr = args->addr;
+       if (xproc) {
+               uint64_t object = 0;
+               uint64_t offset = 0;
+
+               ret = uaddr_findobj(args->addr, &object, &offset);
+               if (ret) {
+                       ret = EINVAL;
+                       goto munge_retval;
+               }
+               key.ulk_key_type = ULK_XPROC;
+               key.ulk_object = object;
+               key.ulk_offset = offset;
+       } else {
+               key.ulk_key_type = ULK_UADDR;
+               key.ulk_pid = p->p_pid;
+               key.ulk_addr = args->addr;
+       }
+
+       if ((flags & ULF_WAIT_ADAPTIVE_SPIN) && set_owner) {
+               /*
+                * Attempt the copyin outside of the lock once.
+                *
+                * If it doesn't match (which is common), return right away.
+                *
+                * If it matches, resolve the current owner, and if it is on core,
+                * spin a bit waiting for the value to change. If the owner isn't on
+                * core, or if the value stays stable, then go on with the regular
+                * blocking code.
+                */
+               uint64_t end = 0;
+               uint32_t u32;
+
+               ret = copyin_atomic32(args->addr, &u32);
+               if (ret || u32 != args->value) {
+                       goto munge_retval;
+               }
+               for (;;) {
+                       if (owner_thread == NULL && ulock_resolve_owner(u32, &owner_thread) != 0) {
+                               break;
+                       }
+
+                       /* owner_thread may have a +1 starting here */
+
+                       if (!machine_thread_on_core(owner_thread)) {
+                               break;
+                       }
+                       if (end == 0) {
+                               clock_interval_to_deadline(ulock_adaptive_spin_usecs,
+                                   NSEC_PER_USEC, &end);
+                       } else if (mach_absolute_time() > end) {
+                               break;
+                       }
+                       if (copyin_atomic32_wait_if_equals(args->addr, u32) != 0) {
+                               goto munge_retval;
+                       }
+               }
+       }
 
        ull_t *ull = ull_get(&key, 0, &unused_ull);
        if (ull == NULL) {
@@ -447,10 +592,6 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
 
        ull->ull_nwaiters++;
 
-       if (ull->ull_nwaiters > ull->ull_max_nwaiters) {
-               ull->ull_max_nwaiters = ull->ull_nwaiters;
-       }
-
        if (ull->ull_opcode == 0) {
                ull->ull_opcode = opcode;
        } else if (ull->ull_opcode != opcode) {
@@ -466,17 +607,22 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
         * holding the ull spinlock across copyin forces any
         * vm_fault we encounter to fail.
         */
-       uint64_t val64; /* copyin_word always zero-extends to 64-bits */
 
-       int copy_ret = copyin_word(args->addr, &val64, sizeof(value));
+       /* copyin_atomicXX always checks alignment */
 
-       value = (uint32_t)val64;
+       if (lock_size == 4) {
+               uint32_t u32;
+               copy_ret = copyin_atomic32(args->addr, &u32);
+               value = u32;
+       } else {
+               copy_ret = copyin_atomic64(args->addr, &value);
+       }
 
 #if DEVELOPMENT || DEBUG
        /* Occasionally simulate copyin finding the user address paged out */
        if (((ull_simulate_copyin_fault == p->p_pid) || (ull_simulate_copyin_fault == 1)) && (copy_ret == 0)) {
                static _Atomic int fault_inject = 0;
-               if (__c11_atomic_fetch_add(&fault_inject, 1, __ATOMIC_RELAXED) % 73 == 0) {
+               if (os_atomic_inc_orig(&fault_inject, relaxed) % 73 == 0) {
                        copy_ret = EFAULT;
                }
        }
@@ -495,17 +641,17 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
        }
 
        if (set_owner) {
-               mach_port_name_t owner_name = ulock_owner_value_to_port_name(args->value);
-               owner_thread = port_name_to_thread_for_ulock(owner_name);
-
-               /* HACK: don't bail on MACH_PORT_DEAD, to avoid blowing up the no-tsd pthread lock */
-               if (owner_name != MACH_PORT_DEAD && owner_thread == THREAD_NULL) {
-                       /*
-                        * Translation failed - even though the lock value is up to date,
-                        * whatever was stored in the lock wasn't actually a thread port.
-                        */
-                       ret = EOWNERDEAD;
-                       goto out_locked;
+               if (owner_thread == THREAD_NULL) {
+                       ret = ulock_resolve_owner(args->value, &owner_thread);
+                       if (ret == EOWNERDEAD) {
+                               /*
+                                * Translation failed - even though the lock value is up to date,
+                                * whatever was stored in the lock wasn't actually a thread port.
+                                */
+                               goto out_locked;
+                       }
+                       /* HACK: don't bail on MACH_PORT_DEAD, to avoid blowing up the no-tsd pthread lock */
+                       ret = 0;
                }
                /* owner_thread has a +1 reference */
 
@@ -584,10 +730,11 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
        ret = wait_result_to_return_code(wr);
 
        ull_lock(ull);
-       turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL);
+       turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL, TURNSTILE_ULOCK);
 
 out_locked:
        ulock_wait_cleanup(ull, owner_thread, old_owner, retval);
+       owner_thread = NULL;
 
        if (unused_ull) {
                ull_free(unused_ull);
@@ -597,6 +744,12 @@ out_locked:
        assert(*retval >= 0);
 
 munge_retval:
+       if (owner_thread) {
+               thread_deallocate(owner_thread);
+       }
+       if (ret == ESTALE) {
+               ret = 0;
+       }
        if ((flags & ULF_NO_ERRNO) && (ret != 0)) {
                *retval = -ret;
                ret = 0;
@@ -624,8 +777,7 @@ ulock_wait_cleanup(ull_t *ull, thread_t owner_thread, thread_t old_owner, int32_
                old_lingering_owner = ull->ull_owner;
                ull->ull_owner = THREAD_NULL;
 
-               ull->ull_key.ulk_pid = 0;
-               ull->ull_key.ulk_addr = 0;
+               memset(&ull->ull_key, 0, sizeof ull->ull_key);
                ull->ull_refcount--;
                assert(ull->ull_refcount > 0);
        }
@@ -666,7 +818,7 @@ ulock_wait_continue(void * parameter, wait_result_t wr)
        ret = wait_result_to_return_code(wr);
 
        ull_lock(ull);
-       turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL);
+       turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL, TURNSTILE_ULOCK);
 
        ulock_wait_cleanup(ull, owner_thread, old_owner, retval);
 
@@ -688,12 +840,6 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva
 
        /* involved threads - each variable holds +1 ref if not null */
        thread_t wake_thread    = THREAD_NULL;
-       thread_t old_owner      = THREAD_NULL;
-
-       if ((flags & ULF_WAKE_MASK) != flags) {
-               ret = EINVAL;
-               goto munge_retval;
-       }
 
 #if DEVELOPMENT || DEBUG
        if (opcode == UL_DEBUG_HASH_DUMP_PID) {
@@ -708,120 +854,159 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva
        }
 #endif
 
+       bool set_owner = false;
+       bool xproc = false;
+
+       switch (opcode) {
+       case UL_UNFAIR_LOCK:
+               set_owner = true;
+               break;
+       case UL_COMPARE_AND_WAIT:
+       case UL_COMPARE_AND_WAIT64:
+               break;
+       case UL_COMPARE_AND_WAIT_SHARED:
+       case UL_COMPARE_AND_WAIT64_SHARED:
+               xproc = true;
+               break;
+       default:
+               ret = EINVAL;
+               goto munge_retval;
+       }
+
+       if ((flags & ULF_WAKE_MASK) != flags) {
+               ret = EINVAL;
+               goto munge_retval;
+       }
+
+       if ((flags & ULF_WAKE_THREAD) && ((flags & ULF_WAKE_ALL) || set_owner)) {
+               ret = EINVAL;
+               goto munge_retval;
+       }
+
        if (args->addr == 0) {
                ret = EINVAL;
                goto munge_retval;
        }
 
-       if (flags & ULF_WAKE_THREAD) {
-               if (flags & ULF_WAKE_ALL) {
+       if (xproc) {
+               uint64_t object = 0;
+               uint64_t offset = 0;
+
+               ret = uaddr_findobj(args->addr, &object, &offset);
+               if (ret) {
                        ret = EINVAL;
                        goto munge_retval;
                }
+               key.ulk_key_type = ULK_XPROC;
+               key.ulk_object = object;
+               key.ulk_offset = offset;
+       } else {
+               key.ulk_key_type = ULK_UADDR;
+               key.ulk_pid = p->p_pid;
+               key.ulk_addr = args->addr;
+       }
+
+       if (flags & ULF_WAKE_THREAD) {
                mach_port_name_t wake_thread_name = (mach_port_name_t)(args->wake_value);
-               wake_thread = port_name_to_thread_for_ulock(wake_thread_name);
+               wake_thread = port_name_to_thread(wake_thread_name,
+                   PORT_TO_THREAD_IN_CURRENT_TASK |
+                   PORT_TO_THREAD_NOT_CURRENT_THREAD);
                if (wake_thread == THREAD_NULL) {
                        ret = ESRCH;
                        goto munge_retval;
                }
        }
 
-       key.ulk_pid = p->p_pid;
-       key.ulk_addr = args->addr;
-
        ull_t *ull = ull_get(&key, ULL_MUST_EXIST, NULL);
+       thread_t new_owner = THREAD_NULL;
+       struct turnstile *ts = TURNSTILE_NULL;
+       thread_t cleanup_thread = THREAD_NULL;
+
        if (ull == NULL) {
-               if (wake_thread != THREAD_NULL) {
-                       thread_deallocate(wake_thread);
-               }
                ret = ENOENT;
                goto munge_retval;
        }
        /* ull is locked */
 
-       boolean_t clear_owner = FALSE; /* need to reset owner */
-
-       switch (opcode) {
-       case UL_UNFAIR_LOCK:
-               clear_owner = TRUE;
-               break;
-       case UL_COMPARE_AND_WAIT:
-               break;
-       default:
-               ret = EINVAL;
-               goto out_locked;
-       }
-
        if (opcode != ull->ull_opcode) {
                ret = EDOM;
-               goto out_locked;
+               goto out_ull_put;
        }
 
-       if (!clear_owner) {
+       if (set_owner) {
+               if (ull->ull_owner != current_thread()) {
+                       /*
+                        * If the current thread isn't the known owner,
+                        * then this wake call was late to the party,
+                        * and the kernel already knows who owns the lock.
+                        *
+                        * The current owner already knows the lock is contended
+                        * and will redrive wakes, so just bail out.
+                        */
+                       goto out_ull_put;
+               }
+       } else {
                assert(ull->ull_owner == THREAD_NULL);
        }
 
-       struct turnstile *ts;
        ts = turnstile_prepare((uintptr_t)ull, &ull->ull_turnstile,
            TURNSTILE_NULL, TURNSTILE_ULOCK);
+       assert(ts != TURNSTILE_NULL);
 
-       if (flags & ULF_WAKE_ALL) {
-               waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
-                   THREAD_AWAKENED, 0);
-       } else if (flags & ULF_WAKE_THREAD) {
-               kern_return_t kr = waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
+       if (flags & ULF_WAKE_THREAD) {
+               kern_return_t kr = waitq_wakeup64_thread(&ts->ts_waitq,
+                   CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
                    wake_thread, THREAD_AWAKENED);
                if (kr != KERN_SUCCESS) {
                        assert(kr == KERN_NOT_WAITING);
                        ret = EALREADY;
                }
-       } else {
+       } else if (flags & ULF_WAKE_ALL) {
+               if (set_owner) {
+                       turnstile_update_inheritor(ts, THREAD_NULL,
+                           TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
+               }
+               waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
+                   THREAD_AWAKENED, 0);
+       } else if (set_owner) {
                /*
-                * TODO: WAITQ_SELECT_MAX_PRI forces a linear scan of the (hashed) global waitq.
-                * Move to a ulock-private, priority sorted waitq (i.e. SYNC_POLICY_FIXED_PRIORITY) to avoid that.
-                *
-                * TODO: 'owner is not current_thread (or null)' likely means we can avoid this wakeup
-                * <rdar://problem/25487001>
+                * The turnstile waitq is priority-ordered,
+                * and will wake up the highest-priority waiter
+                * and set it as the inheritor for us.
                 */
+               new_owner = waitq_wakeup64_identify(&ts->ts_waitq,
+                   CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
+                   THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
+       } else {
                waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
-                   THREAD_AWAKENED, WAITQ_SELECT_MAX_PRI);
+                   THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
        }
 
-       /*
-        * Reaching this point means I previously moved the lock to 'unowned' state in userspace.
-        * Therefore I need to relinquish my promotion.
-        *
-        * However, someone else could have locked it after I unlocked, and then had a third thread
-        * block on the lock, causing a promotion of some other owner.
-        *
-        * I don't want to stomp over that, so only remove the promotion if I'm the current owner.
-        */
-
-       if (ull->ull_owner == current_thread()) {
-               turnstile_update_inheritor(ts, THREAD_NULL,
-                   (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+       if (set_owner) {
                turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
-               old_owner = ull->ull_owner;
-               ull->ull_owner = THREAD_NULL;
+               cleanup_thread = ull->ull_owner;
+               ull->ull_owner = new_owner;
        }
 
-       turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL);
+       turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL, TURNSTILE_ULOCK);
 
-out_locked:
+out_ull_put:
        ull_put(ull);
 
-       /* Need to be called after dropping the interlock */
-       turnstile_cleanup();
-
-       if (wake_thread != THREAD_NULL) {
-               thread_deallocate(wake_thread);
+       if (ts != TURNSTILE_NULL) {
+               /* Need to be called after dropping the interlock */
+               turnstile_cleanup();
        }
 
-       if (old_owner != THREAD_NULL) {
-               thread_deallocate(old_owner);
+       if (cleanup_thread != THREAD_NULL) {
+               thread_deallocate(cleanup_thread);
        }
 
 munge_retval:
+       if (wake_thread != THREAD_NULL) {
+               thread_deallocate(wake_thread);
+       }
+
        if ((flags & ULF_NO_ERRNO) && (ret != 0)) {
                *retval = -ret;
                ret = 0;
@@ -835,14 +1020,22 @@ kdp_ulock_find_owner(__unused struct waitq * waitq, event64_t event, thread_wait
        ull_t *ull = EVENT_TO_ULOCK(event);
        assert(kdp_is_in_zone(ull, "ulocks"));
 
-       if (ull->ull_opcode == UL_UNFAIR_LOCK) {// owner is only set if it's an os_unfair_lock
-               waitinfo->owner = thread_tid(ull->ull_owner);
+       switch (ull->ull_opcode) {
+       case UL_UNFAIR_LOCK:
+       case UL_UNFAIR_LOCK64_SHARED:
+               waitinfo->owner   = thread_tid(ull->ull_owner);
                waitinfo->context = ull->ull_key.ulk_addr;
-       } else if (ull->ull_opcode == UL_COMPARE_AND_WAIT) { // otherwise, this is a spinlock
-               waitinfo->owner = 0;
+               break;
+       case UL_COMPARE_AND_WAIT:
+       case UL_COMPARE_AND_WAIT64:
+       case UL_COMPARE_AND_WAIT_SHARED:
+       case UL_COMPARE_AND_WAIT64_SHARED:
+               waitinfo->owner   = 0;
                waitinfo->context = ull->ull_key.ulk_addr;
-       } else {
+               break;
+       default:
                panic("%s: Invalid ulock opcode %d addr %p", __FUNCTION__, ull->ull_opcode, (void*)ull);
+               break;
        }
        return;
 }
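
The sys_ulock.c rework above keys waiters two ways: ULK_UADDR (pid plus user address) for process-local locks, and ULK_XPROC (VM object id plus offset, obtained via vm_map_page_info() in uaddr_findobj()) for the new *_SHARED opcodes, so a shared lock word mapped at different addresses in different processes still lands in the same hash bucket. Both union variants stay packed so that ULL_INDEX() hashes only defined bytes and the type tag stays outside the hashed span. A self-contained sketch of that keying logic (user-space stand-ins, not the kernel code itself):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

typedef uint64_t user_addr_t;
typedef int32_t  upid_t;                /* stand-in for pid_t, to stay portable */

typedef enum { ULK_INVALID = 0, ULK_UADDR, ULK_XPROC } ulk_type;

typedef struct {
	union {
		struct __attribute__((packed)) {
			user_addr_t ulk_addr;   /* per-process: address... */
			upid_t      ulk_pid;    /* ...qualified by pid */
		};
		struct __attribute__((packed)) {
			uint64_t ulk_object;    /* shared: VM object id... */
			uint64_t ulk_offset;    /* ...plus page offset */
		};
	};
	ulk_type ulk_key_type;                  /* deliberately outside the hashed bytes */
} ulk_t;

#define ULK_UADDR_LEN (sizeof(user_addr_t) + sizeof(upid_t))
#define ULK_XPROC_LEN (sizeof(uint64_t) + sizeof(uint64_t))

/* Mirror ULL_INDEX(): only the packed bytes of the active variant are
 * hashed or compared, so padding never influences bucket selection. */
static size_t
ulk_hashable_len(const ulk_t *k)
{
	return k->ulk_key_type == ULK_UADDR ? ULK_UADDR_LEN : ULK_XPROC_LEN;
}

static bool
ulk_match(const ulk_t *a, const ulk_t *b)
{
	if (a->ulk_key_type != b->ulk_key_type) {
		return false;
	}
	/* packed variants: byte equality is field equality */
	return memcmp(a, b, ulk_hashable_len(a)) == 0;
}

int
main(void)
{
	ulk_t a = { .ulk_object = 42, .ulk_offset = 0x80, .ulk_key_type = ULK_XPROC };
	ulk_t b = a;
	assert(ulk_match(&a, &b));
	b.ulk_offset = 0x100;                   /* same object, different lock word */
	assert(!ulk_match(&a, &b));
	return 0;
}
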
index 811d42826a60f242ed700a4f284a9c67ff73e847..6878545e7e0d53d1641ef4145dddc9f9c47b7484 100644 (file)
 139    AUE_FUTIMES     ALL     { int futimes(int fd, struct timeval *tptr); } 
 140    AUE_ADJTIME     ALL     { int adjtime(struct timeval *delta, struct timeval *olddelta); } 
 141    AUE_NULL        ALL     { int nosys(void); }   { old getpeername }
-142    AUE_SYSCTL      ALL     { int gethostuuid(unsigned char *uuid_buf, const struct timespec *timeoutp, int spi) NO_SYSCALL_STUB; }
+142    AUE_SYSCTL      ALL     { int gethostuuid(unsigned char *uuid_buf, const struct timespec *timeoutp) NO_SYSCALL_STUB; }
 143    AUE_NULL        ALL     { int nosys(void); }   { old sethostid  }
 144    AUE_NULL        ALL     { int nosys(void); }   { old getrlimit }
 145    AUE_NULL        ALL     { int nosys(void); }   { old setrlimit }
 
 ; 216-> 219 used to be mkcomplex and {f,l}statv variants. They are gone now.
 216    AUE_NULL        ALL     { int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode) NO_SYSCALL_STUB; }
-217    AUE_NULL        ALL     { int nosys(void); }    { old statv }
+217    AUE_FSGETPATH_EXTENDED  ALL     { user_ssize_t fsgetpath_ext(user_addr_t buf, size_t bufsize, user_addr_t fsid, uint64_t objid, uint32_t options); }
 218    AUE_NULL        ALL     { int nosys(void); }    { old lstatv }
 219    AUE_NULL        ALL     { int nosys(void); }    { old fstatv }
 220    AUE_GETATTRLIST ALL     { int getattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options) NO_SYSCALL_STUB; } 
 271    AUE_SEMWAIT     ALL     { int sem_wait(sem_t *sem); } 
 272    AUE_SEMTRYWAIT  ALL     { int sem_trywait(sem_t *sem); } 
 273    AUE_SEMPOST     ALL     { int sem_post(sem_t *sem); } 
-274    AUE_SYSCTL      ALL     { int sysctlbyname(const char *name, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen) NO_SYSCALL_STUB; }
+274    AUE_SYSCTL      ALL     { int sys_sysctlbyname(const char *name, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen) NO_SYSCALL_STUB; }
 275    AUE_NULL        ALL     { int enosys(void); } { old sem_init }
 276    AUE_NULL        ALL     { int enosys(void); } { old sem_destroy }
 277    AUE_OPEN_EXTENDED_RWTC  ALL     { int open_extended(user_addr_t path, int flags, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity) NO_SYSCALL_STUB; } 
 493    AUE_NULL        ALL { int enosys(void); }
 #endif
 #if CONFIG_PERSONAS
-494    AUE_PERSONA     ALL     { int persona(uint32_t operation, uint32_t flags, struct kpersona_info *info, uid_t *id, size_t *idlen) NO_SYSCALL_STUB; }
+494    AUE_PERSONA     ALL     { int persona(uint32_t operation, uint32_t flags, struct kpersona_info *info, uid_t *id, size_t *idlen, char *path) NO_SYSCALL_STUB; }
 #else
 494    AUE_NULL        ALL     { int enosys(void); }
 #endif
 530    AUE_NULL        ALL     { int enosys(void); }
 #endif // CONFIG_WORKQUEUE
 531    AUE_NULL        ALL     { uint64_t __mach_bridge_remote_time(uint64_t local_timestamp); }
+#if CONFIG_COALITIONS
+532 AUE_NULL  ALL { int coalition_ledger(uint32_t operation, uint64_t *cid, void *buffer, size_t *bufsize) NO_SYSCALL_STUB; }
+#else
+532   AUE_NULL    ALL { int enosys(void); }
+#endif // CONFIG_COALITIONS
+533     AUE_NULL        ALL     { int log_data(unsigned int tag, unsigned int flags, void *buffer, unsigned int size) NO_SYSCALL_STUB; }
+534 AUE_NULL   ALL     { uint64_t memorystatus_available_memory(void) NO_SYSCALL_STUB; }
index 4e141439dc6588bf43bc487a5984403780b7687c..c7352a6ce630518a96bc56c0e29a6e591038e465 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -90,7 +90,7 @@ static void msg_freehdr(struct msg *msghdr);
 typedef int     sy_call_t(struct proc *, void *, int *);
 
 /* XXX casting to (sy_call_t *) is bogus, as usual. */
-static sy_call_t *msgcalls[] = {
+static sy_call_t* const msgcalls[] = {
        (sy_call_t *)msgctl, (sy_call_t *)msgget,
        (sy_call_t *)msgsnd, (sy_call_t *)msgrcv
 };
@@ -122,12 +122,12 @@ int     msgmax,                 /* max chars in a message */
     msgssz,                     /* size of a message segment (see notes above) */
     msgseg;                     /* number of message segments */
 struct msginfo msginfo = {
-       MSGMAX,                 /* = (MSGSSZ*MSGSEG) : max chars in a message */
-       MSGMNI,                 /* = 40 : max message queue identifiers */
-       MSGMNB,                 /* = 2048 : max chars in a queue */
-       MSGTQL,                 /* = 40 : max messages in system */
-       MSGSSZ,                 /* = 8 : size of a message segment (2^N long) */
-       MSGSEG                  /* = 2048 : number of message segments */
+       .msgmax = MSGMAX,               /* = (MSGSSZ*MSGSEG) : max chars in a message */
+       .msgmni = MSGMNI,               /* = 40 : max message queue identifiers */
+       .msgmnb = MSGMNB,               /* = 2048 : max chars in a queue */
+       .msgtql = MSGTQL,               /* = 40 : max messages in system */
+       .msgssz = MSGSSZ,               /* = 8 : size of a message segment (2^N long) */
+       .msgseg = MSGSEG                /* = 2048 : number of message segments */
 };
 #endif /* __APPLE_API_PRIVATE */
 
index 795fd6d022009cd7f49fbf2b72de34b31117aa21..b6cad0b1befb0862a7bca5bff80aaa212f47dbc7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  * These are not needed if we can make the semaphore pages swappable.
  */
 static struct seminfo limitseminfo = {
-       SEMMAP,        /* # of entries in semaphore map */
-       SEMMNI,        /* # of semaphore identifiers */
-       SEMMNS,        /* # of semaphores in system */
-       SEMMNU,        /* # of undo structures in system */
-       SEMMSL,        /* max # of semaphores per id */
-       SEMOPM,        /* max # of operations per semop call */
-       SEMUME,        /* max # of undo entries per process */
-       SEMUSZ,        /* size in bytes of undo structure */
-       SEMVMX,        /* semaphore maximum value */
-       SEMAEM         /* adjust on exit max value */
+       .semmap = SEMMAP,        /* # of entries in semaphore map */
+       .semmni = SEMMNI,        /* # of semaphore identifiers */
+       .semmns = SEMMNS,        /* # of semaphores in system */
+       .semmnu = SEMMNU,        /* # of undo structures in system */
+       .semmsl = SEMMSL,        /* max # of semaphores per id */
+       .semopm = SEMOPM,        /* max # of operations per semop call */
+       .semume = SEMUME,        /* max # of undo entries per process */
+       .semusz = SEMUSZ,        /* size in bytes of undo structure */
+       .semvmx = SEMVMX,        /* semaphore maximum value */
+       .semaem = SEMAEM         /* adjust on exit max value */
 };
 
 /* Current system allocations.  We use this structure to track how many
@@ -102,16 +102,16 @@ static struct seminfo limitseminfo = {
  * and not allocate the memory for them up front.
  */
 struct seminfo seminfo = {
-       SEMMAP, /* Unused, # of entries in semaphore map */
-       0,      /* # of semaphore identifiers */
-       0,      /* # of semaphores in system */
-       0,      /* # of undo entries in system */
-       SEMMSL, /* max # of semaphores per id */
-       SEMOPM, /* max # of operations per semop call */
-       SEMUME, /* max # of undo entries per process */
-       SEMUSZ, /* size in bytes of undo structure */
-       SEMVMX, /* semaphore maximum value */
-       SEMAEM  /* adjust on exit max value */
+       .semmap = SEMMAP,       /* Unused, # of entries in semaphore map */
+       .semmni = 0,            /* # of semaphore identifiers */
+       .semmns = 0,            /* # of semaphores in system */
+       .semmnu = 0,            /* # of undo entries in system */
+       .semmsl = SEMMSL,       /* max # of semaphores per id */
+       .semopm = SEMOPM,       /* max # of operations per semop call */
+       .semume = SEMUME,       /* max # of undo entries per process */
+       .semusz = SEMUSZ,       /* size in bytes of undo structure */
+       .semvmx = SEMVMX,       /* semaphore maximum value */
+       .semaem = SEMAEM        /* adjust on exit max value */
 };
 
 
@@ -121,7 +121,7 @@ static int semundo_adjust(struct proc *p, int *supidx,
 static void semundo_clear(int semid, int semnum);
 
 /* XXX casting to (sy_call_t *) is bogus, as usual. */
-static sy_call_t *semcalls[] = {
+static sy_call_t* const semcalls[] = {
        (sy_call_t *)semctl, (sy_call_t *)semget,
        (sy_call_t *)semop
 };
index 99ad6602e01129735a59ae581f0f715345b1971d..d31d1f57b1ec3c843fe1f0ac8b91ea9ccdd52eba 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -124,7 +124,7 @@ static void shmid_ds_64to32(struct user_shmid_ds *in, struct user32_shmid_ds *ou
 static void shmid_ds_32to64(struct user32_shmid_ds *in, struct user_shmid_ds *out);
 
 /* XXX casting to (sy_call_t *) is bogus, as usual. */
-static sy_call_t *shmcalls[] = {
+static sy_call_t* const shmcalls[] = {
        (sy_call_t *)shmat, (sy_call_t *)oshmctl,
        (sy_call_t *)shmdt, (sy_call_t *)shmget,
        (sy_call_t *)shmctl
@@ -170,11 +170,11 @@ static int shm_delete_mapping(struct proc *, struct shmmap_state *, int);
 #define DEFAULT_SHMALL  1024
 
 struct shminfo shminfo = {
-       DEFAULT_SHMMAX,
-       DEFAULT_SHMMIN,
-       DEFAULT_SHMMNI,
-       DEFAULT_SHMSEG,
-       DEFAULT_SHMALL
+       .shmmax = DEFAULT_SHMMAX,
+       .shmmin = DEFAULT_SHMMIN,
+       .shmmni = DEFAULT_SHMMNI,
+       .shmseg = DEFAULT_SHMSEG,
+       .shmall = DEFAULT_SHMALL
 };
 
 #define SHMID_IS_VALID(x) ((x) >= 0)
index 3cbac2c73c670c369946bb74f09380951a45addd..2da5b2b91065f4f472b907bfad2b21c64afd668c 100644 (file)
@@ -97,7 +97,7 @@
 0x10c0094      MSC_semaphore_wait_signal_trap
 0x10c0098      MSC_semaphore_timedwait_trap
 0x10c009c      MSC_semaphore_timedwait_signal_trap
-0x10c00a0      MSC_kern_invalid_#40
+0x10c00a0      MSC_mach_port_get_attributes_trap
 0x10c00a4      MSC_mach_port_guard_trap
 0x10c00a8      MSC_mach_port_unguard_trap
 0x10c00ac      MSC_mach_generate_activity_id
 0x10c0124      MSC_kern_invalid_#73
 0x10c0128      MSC_kern_invalid_#74
 0x10c012c      MSC_kern_invalid_#75
-0x10c0130      MSC_kern_invalid_#76
-0x10c0134      MSC_kern_invalid_#77
+0x10c0130      MSC_mach_port_type_trap
+0x10c0134      MSC_mach_port_request_notification_trap
 0x10c0138      MSC_kern_invalid_#78
 0x10c013c      MSC_kern_invalid_#79
 0x10c0140      MSC_kern_invalid_#80
 0x14000D8      MACH_UNPROMOTED
 0x14000DC      MACH_PROMOTED_UPDATE
 0x14000E0      MACH_QUIESCENT_COUNTER
+0x14000E4      MACH_TURNSTILE_USER_CHANGE
+0x14000E8      MACH_AMP_RECOMMENDATION_CHANGE
+0x1400100      MACH_TURNSTILE_KERNEL_CHANGE
 0x1500000      MACH_MSGID_INVALID
 0x1600000      MTX_SLEEP
 0x1600004      MTX_SLEEP_DEADLINE
 0x3120008      DECMPFS_fetch_uncmp_data
 0x3120010      DECMPFS_free_cmp_data
 0x3120014      DECMPFS_file_is_cmp
+0x3130000      VFS_devfsdirent_label_alloc
+0x3130004      VFS_mount_label_alloc
+0x3130008      VFS_label_alloc
+0x313000C      VFS_devfs_label_free
+0x3130010      VFS_mount_label_free
+0x3130014      VFS_label_free
+0x3130018      VFS_label_copy
+0x313001C      VFS_devfs_label_copy
+0x3130020      VFS_devfs_label_update
+0x3130024      VFS_label_associate_devfs
+0x3130028      VFS_label_associate_extattr
+0x313002C      VFS_label_associate_singlelabel
+0x3130030      VFS_notify_create
+0x3130034      VFS_notify_rename
+0x3130038      VFS_notify_open
+0x313003C      VFS_notify_link
+0x3130040      VFS_notify_deleteextattr
+0x3130044      VFS_notify_setacl
+0x3130048      VFS_notify_setattrlist
+0x313004C      VFS_notify_setextattr
+0x3130050      VFS_notify_setflags
+0x3130054      VFS_notify_setmode
+0x3130058      VFS_notify_setowner
+0x313005C      VFS_notify_setutimes
+0x3130060      VFS_notify_truncate
+0x3130064      VFS_label_update_extattr
+0x3130068      VFS_label_store
+0x313006C      VFS_cred_label_update_execve
+0x3130070      VFS_cred_check_label_update_execve
+0x3130074      VFS_check_access
+0x3130078      VFS_check_chdir
+0x313007C      VFS_check_chroot
+0x3130080      VFS_check_clone
+0x3130084      VFS_check_create
+0x3130088      VFS_check_unlink
+0x313008C      VFS_check_deleteacl
+0x3130090      VFS_check_deleteextattr
+0x3130094      VFS_check_exchangedata
+0x3130098      VFS_check_getacl
+0x313009C      VFS_check_getattr
+0x31300A0      VFS_check_getattrlist
+0x31300A4      VFS_check_exec
+0x31300A8      VFS_check_fsgetpath
+0x31300AC      VFS_check_signature
+0x31300B0      VFS_check_getacl
+0x31300B4      VFS_check_getextattr
+0x31300B8      VFS_check_ioctl
+0x31300BC      VFS_check_kqfilter
+0x31300C0      VFS_check_link
+0x31300C4      VFS_check_listextattr
+0x31300C8      VFS_check_lookup_preflight
+0x31300CC      VFS_check_lookup
+0x31300D0      VFS_check_open
+0x31300D4      VFS_check_read
+0x31300D8      VFS_check_readdir
+0x31300DC      VFS_check_readlink
+0x31300E0      VFS_check_label_update
+0x31300E4      VFS_check_rename
+0x31300E8      VFS_check_revoke
+0x31300EC      VFS_check_searchfs
+0x31300F0      VFS_check_select
+0x31300F4      VFS_check_setacl
+0x31300F8      VFS_check_setattrlist
+0x31300FC      VFS_check_setextattr
+0x3130100      VFS_check_setflags
+0x3130104      VFS_check_setmode
+0x3130108      VFS_check_setowner
+0x313010C      VFS_check_setutimes
+0x3130110      VFS_check_stat
+0x3130114      VFS_check_trigger_resolve
+0x3130118      VFS_check_truncate
+0x313011C      VFS_check_write
+0x3130120      VFS_check_uipc_bind
+0x3130124      VFS_check_uipc_connect
+0x3130128      VFS_label_update
+0x313012C      VFS_find_sigs
+0x3130130      VFS_mount_label_associate
+0x3130134      VFS_mount_check_mount
+0x3130138      VFS_mount_check_mount_late
+0x313013C      VFS_mount_check_snapshot_create
+0x3130140      VFS_mount_check_snapshot_delete
+0x3130144      VFS_mount_check_snapshot_revert
+0x3130148      VFS_mount_check_remount
+0x313014C      VFS_mount_check_umount
+0x3130150      VFS_mount_check_getattr
+0x3130154      VFS_mount_check_setattr
+0x3130158      VFS_mount_check_stat
+0x313015C      VFS_mount_check_label_update
+0x3130160      VFS_mount_check_fsctl
+0x3130164      VFS_devfs_label_associate_device
+0x3130168      VFS_devfs_label_associate_directory
+0x313016C      VFS_label_associate_fdesc
 0x3CF0000      CP_OFFSET_IO
 0x4010004      proc_exit
 0x4010008      force_exit
 0x4020008      MEMSTAT_jetsam
 0x402000C      MEMSTAT_jetsam_hiwat
 0x4020010      MEMSTAT_freeze
-0x4020014      MEMSTAT_latency_coalesce
+0x4020014      MEMSTAT_freeze_scan
 0x4020018      MEMSTAT_update
 0x402001C      MEMSTAT_idle_demote
 0x4020020      MEMSTAT_clear_errors
 0x4020034      MEMSTAT_do_kill
 0x4020038      MEMSTAT_change_priority
 0x402003C      MEMSTAT_fast_jetsam
+0x4020040      MEMSTAT_compactor_run
+0x4020044      MEMSTAT_freeze_disable
 0x4030004      KEVENT_kq_processing_begin
 0x4030008      KEVENT_kq_processing_end
 0x403000c      KEVENT_kqwq_processing_begin
 0x263b0028     imp_thread_qos_workq_override
 0x263c0028     imp_thread_qos_promote
 0x263d0028     imp_thread_qos_ipc_override
+0x263e0028     imp_thread_qos_servicer_override
 0x27000000     PERF_PCEVENT
 0x27001000     PERF_CPU_IDLE
 0x27001100     PERF_CPU_IDLE_TIMER
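
The new 0x313xxxx VFS_* codes above carve out a tracepoint subclass for the MAC/VFS hooks. Assuming the standard kdebug event-id layout (class in bits 31-24, subclass in bits 23-16, code shifted left by two, matching sys/kdebug.h's KDBG_EVENTID), the identifiers decompose as in this sketch:

#include <assert.h>
#include <stdint.h>

/* Standard kdebug encoding, as defined (modulo naming) in sys/kdebug.h. */
#define KDBG_EVENTID(class, subclass, code)          \
	((((uint32_t)(class)    & 0xff)   << 24) |   \
	 (((uint32_t)(subclass) & 0xff)   << 16) |   \
	 (((uint32_t)(code)     & 0x3fff) <<  2))

int
main(void)
{
	/* 0x3130004 VFS_mount_label_alloc: class 0x03 (FS), subclass 0x13, code 1 */
	assert(KDBG_EVENTID(0x03, 0x13, 1) == 0x3130004);
	/* 0x313016C VFS_label_associate_fdesc: same class/subclass, code 91 */
	assert(KDBG_EVENTID(0x03, 0x13, 91) == 0x313016C);
	return 0;
}
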
index 417357add10f375b235de1723b738356b6ae26fd..8148988033ad029b1f833ecdd54920a314daca1a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1997-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -1581,6 +1581,12 @@ ttioctl_locked(struct tty *tp, u_long cmd, caddr_t data, int flag, proc_t p)
        case TIOCGDRAINWAIT:
                *(int *)data = tp->t_timeout / hz;
                break;
+       case TIOCREVOKE:
+               if (ISSET(tp->t_state, TS_PGRPHUP)) {
+                       tp->t_gen++;
+                       wakeup(TSA_HUP_OR_INPUT(tp));
+               }
+               break;
        default:
                error = ttcompat(tp, cmd, data, flag, p);
                goto out;
@@ -2147,7 +2153,7 @@ loop:
                int m = cc[VMIN];
                long t = cc[VTIME];
                struct timeval timecopy;
-               struct timeval etime = {0, 0};  /* protected by !has_etime */
+               struct timeval etime = {.tv_sec = 0, .tv_usec = 0};     /* protected by !has_etime */
 
                /*
                 * Check each of the four combinations.
@@ -2806,6 +2812,16 @@ ttyecho(int c, struct tty *tp)
        (void)ttyoutput(c, tp);
 }
 
+static void
+ttwakeup_knote(struct selinfo *sip, long hint)
+{
+       if ((sip->si_flags & SI_KNPOSTING) == 0) {
+               sip->si_flags |= SI_KNPOSTING;
+               KNOTE(&sip->si_note, hint);
+               sip->si_flags &= ~SI_KNPOSTING;
+       }
+}
+
 
 /*
  * Wake up any readers on a tty.
@@ -2818,7 +2834,7 @@ ttwakeup(struct tty *tp)
        TTY_LOCK_OWNED(tp);     /* debug assert */
 
        selwakeup(&tp->t_rsel);
-       KNOTE(&tp->t_rsel.si_note, 1);
+       ttwakeup_knote(&tp->t_rsel, 0);
        if (ISSET(tp->t_state, TS_ASYNC)) {
                /*
                 * XXX: Callers may not revalidate if the tty is closed
@@ -2850,7 +2866,7 @@ ttwwakeup(struct tty *tp)
 
        if (tp->t_outq.c_cc <= tp->t_lowat) {
                selwakeup(&tp->t_wsel);
-               KNOTE(&tp->t_wsel.si_note, 1);
+               ttwakeup_knote(&tp->t_wsel, 0);
        }
        if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) ==
            TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) {
@@ -3030,7 +3046,6 @@ ttyinfo_locked(struct tty *tp)
                break;
        }
        calcru(pick, &utime, &stime, NULL);
-       proc_rele(pick);
 
        /* Print command, pid, state, utime, and stime */
        ttyprintf(tp, " cmd: %s %d %s %ld.%02du %ld.%02ds\n",
@@ -3039,6 +3054,8 @@ ttyinfo_locked(struct tty *tp)
            state,
            (long)utime.tv_sec, utime.tv_usec / 10000,
            (long)stime.tv_sec, stime.tv_usec / 10000);
+
+       proc_rele(pick);
        tp->t_rocount = 0;
 }
 
@@ -3311,11 +3328,11 @@ isctty_sp(proc_t p, struct tty  *tp, struct session *sessp)
 }
 
 
-static int  filt_ttyattach(struct knote *kn, struct kevent_internal_s *kev);
+static int  filt_ttyattach(struct knote *kn, struct kevent_qos_s *kev);
 static void filt_ttydetach(struct knote *kn);
 static int  filt_ttyevent(struct knote *kn, long hint);
-static int  filt_ttytouch(struct knote *kn, struct kevent_internal_s *kev);
-static int  filt_ttyprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int  filt_ttytouch(struct knote *kn, struct kevent_qos_s *kev);
+static int  filt_ttyprocess(struct knote *kn, struct kevent_qos_s *kev);
 
 SECURITY_READ_ONLY_EARLY(struct filterops) tty_filtops = {
        .f_isfd    = 1,
@@ -3331,31 +3348,35 @@ SECURITY_READ_ONLY_EARLY(struct filterops) tty_filtops = {
  * or written.
  */
 static int
-filt_tty_common(struct knote *kn, struct tty *tp)
+filt_tty_common(struct knote *kn, struct kevent_qos_s *kev, struct tty *tp)
 {
        int retval = 0;
+       int64_t data = 0;
 
        TTY_LOCK_OWNED(tp); /* debug assert */
 
-       if (tp->t_state & TS_ZOMBIE) {
-               kn->kn_flags |= EV_EOF;
-               return 1;
-       }
-
-       switch (knote_get_seltype(kn)) {
-       case FREAD:
-               retval = ttnread(tp);
+       switch (kn->kn_filter) {
+       case EVFILT_READ:
+               /*
+                * ttnread can change the tty state,
+                * hence must be done upfront, before any other check.
+                */
+               data = ttnread(tp);
+               retval = (data != 0);
                break;
-       case FWRITE:
+       case EVFILT_WRITE:
                if ((tp->t_outq.c_cc <= tp->t_lowat) &&
                    (tp->t_state & TS_CONNECTED)) {
-                       retval = tp->t_hiwat - tp->t_outq.c_cc;
+                       data = tp->t_hiwat - tp->t_outq.c_cc;
+                       retval = (data != 0);
                }
                break;
+       default:
+               panic("tty kevent: unexpected filter: %d, kn = %p, tty = %p",
+                   kn->kn_filter, kn, tp);
+               break;
        }
 
-       kn->kn_data = retval;
-
        /*
         * TODO(mwidmann, jandrus): For native knote low watermark support,
         * check the kn_sfflags for NOTE_LOWAT and check against kn_sdata.
@@ -3364,6 +3385,16 @@ filt_tty_common(struct knote *kn, struct tty *tp)
         *        (kn->kn_data >= kn->kn_sdata) : kn->kn_data;
         */
 
+       if (tp->t_state & TS_ZOMBIE) {
+               kn->kn_flags |= EV_EOF;
+       }
+       if (kn->kn_flags & EV_EOF) {
+               retval = 1;
+       }
+       if (retval && kev) {
+               knote_fill_kevent(kn, kev, data);
+       }
+
        return retval;
 }
 
@@ -3415,24 +3446,6 @@ tty_from_knote(struct knote *kn)
        return (struct tty *)kn->kn_hook;
 }
 
-/*
- * Try to lock the TTY structure associated with a knote.
- *
- * On success, this function returns a locked TTY structure.  Otherwise, NULL is
- * returned.
- */
-__attribute__((warn_unused_result))
-static struct tty *
-tty_lock_from_knote(struct knote *kn)
-{
-       struct tty *tp = tty_from_knote(kn);
-       if (tp) {
-               tty_lock(tp);
-       }
-
-       return tp;
-}
-
 /*
  * Set the knote's struct tty to the kn_hook field.
  *
@@ -3538,7 +3551,7 @@ out:
 }
 
 static int
-filt_ttyattach(struct knote *kn, __unused struct kevent_internal_s *kev)
+filt_ttyattach(struct knote *kn, __unused struct kevent_qos_s *kev)
 {
        int selres = 0;
        struct tty *tp;
@@ -3566,19 +3579,18 @@ filt_ttyattach(struct knote *kn, __unused struct kevent_internal_s *kev)
        /*
         * Attach the knote to selinfo's klist.
         */
-       tp = tty_lock_from_knote(kn);
-       if (!tp) {
-               knote_set_error(kn, ENOENT);
-               return 0;
-       }
+       tp = tty_from_knote(kn);
+       tty_lock(tp);
 
-       switch (knote_get_seltype(kn)) {
-       case FREAD:
+       switch (kn->kn_filter) {
+       case EVFILT_READ:
                KNOTE_ATTACH(&tp->t_rsel.si_note, kn);
                break;
-       case FWRITE:
+       case EVFILT_WRITE:
                KNOTE_ATTACH(&tp->t_wsel.si_note, kn);
                break;
+       default:
+               panic("invalid knote %p attach, filter: %d", kn, kn->kn_filter);
        }
 
        tty_unlock(tp);
@@ -3589,28 +3601,22 @@ filt_ttyattach(struct knote *kn, __unused struct kevent_internal_s *kev)
 static void
 filt_ttydetach(struct knote *kn)
 {
-       struct tty *tp;
+       struct tty *tp = tty_from_knote(kn);
 
-       tp = tty_lock_from_knote(kn);
-       if (!tp) {
-               knote_set_error(kn, ENOENT);
-               return;
-       }
+       tty_lock(tp);
 
-       struct selinfo *si = NULL;
-       switch (knote_get_seltype(kn)) {
-       case FREAD:
-               si = &tp->t_rsel;
+       switch (kn->kn_filter) {
+       case EVFILT_READ:
+               KNOTE_DETACH(&tp->t_rsel.si_note, kn);
                break;
-       case FWRITE:
-               si = &tp->t_wsel;
+       case EVFILT_WRITE:
+               KNOTE_DETACH(&tp->t_wsel.si_note, kn);
+               break;
+       default:
+               panic("invalid knote %p detach, filter: %d", kn, kn->kn_filter);
                break;
-               /* knote_get_seltype will panic on default */
        }
 
-       KNOTE_DETACH(&si->si_note, kn);
-       kn->kn_hook = NULL;
-
        tty_unlock(tp);
        ttyfree(tp);
 }
@@ -3618,52 +3624,34 @@ filt_ttydetach(struct knote *kn)
 static int
 filt_ttyevent(struct knote *kn, long hint)
 {
+       struct tty *tp = tty_from_knote(kn);
        int ret;
-       struct tty *tp;
-       bool revoked = hint & NOTE_REVOKE;
-       hint &= ~NOTE_REVOKE;
-
-       tp = tty_from_knote(kn);
-       if (!tp) {
-               knote_set_error(kn, ENOENT);
-               return 0;
-       }
 
-       if (!hint) {
-               tty_lock(tp);
-       }
+       TTY_LOCK_OWNED(tp);
 
-       if (revoked) {
+       if (hint & NOTE_REVOKE) {
                kn->kn_flags |= EV_EOF | EV_ONESHOT;
                ret = 1;
        } else {
-               ret = filt_tty_common(kn, tp);
-       }
-
-       if (!hint) {
-               tty_unlock(tp);
+               ret = filt_tty_common(kn, NULL, tp);
        }
 
        return ret;
 }
 
 static int
-filt_ttytouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_ttytouch(struct knote *kn, struct kevent_qos_s *kev)
 {
-       struct tty *tp;
+       struct tty *tp = tty_from_knote(kn);
        int res = 0;
 
-       tp = tty_lock_from_knote(kn);
-       if (!tp) {
-               knote_set_error(kn, ENOENT);
-               return 0;
-       }
+       tty_lock(tp);
 
        kn->kn_sdata = kev->data;
        kn->kn_sfflags = kev->fflags;
 
        if (kn->kn_vnode_kqok) {
-               res = filt_tty_common(kn, tp);
+               res = filt_tty_common(kn, NULL, tp);
        }
 
        tty_unlock(tp);
@@ -3672,26 +3660,14 @@ filt_ttytouch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_ttyprocess(struct knote *kn, __unused struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_ttyprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-       struct tty *tp;
+       struct tty *tp = tty_from_knote(kn);
        int res;
 
-       tp = tty_lock_from_knote(kn);
-       if (!tp) {
-               knote_set_error(kn, ENOENT);
-               return 0;
-       }
-
-       res = filt_tty_common(kn, tp);
+       tty_lock(tp);
 
-       if (res) {
-               *kev = kn->kn_kevent;
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_fflags = 0;
-                       kn->kn_data = 0;
-               }
-       }
+       res = filt_tty_common(kn, kev, tp);
 
        tty_unlock(tp);
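
tty.c above and the pty code below add matching helpers, ttwakeup_knote() and ptcwakeup_knote(), that wrap KNOTE() in an SI_KNPOSTING guard: if posting to the klist re-enters the wakeup path, the nested post is dropped rather than recursing. A minimal stand-alone sketch of the guard pattern (hypothetical names, single-threaded for illustration; the kernel relies on the tty lock for serialization):

#include <stdbool.h>
#include <stdio.h>

static bool posting;                    /* plays the role of SI_KNPOSTING */

static void post_event(long hint);

/* A filter that misbehaves by triggering another wakeup while running. */
static void
filter(long hint)
{
	post_event(hint + 1);
}

static void
post_event(long hint)
{
	if (!posting) {                 /* the guard ttwakeup_knote() adds */
		posting = true;
		filter(hint);           /* stands in for KNOTE(&si_note, hint) */
		posting = false;
	}
}

int
main(void)
{
	post_event(0);                  /* terminates: the nested post is a no-op */
	printf("no recursion\n");
	return 0;
}
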
 
index 4bd2d02730424bb09e1c9cbf1801608503eb7914..ac452a3d9b227a0c5495fce3ce5b3b643c89929b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -99,25 +99,25 @@ static int ttcompatspeedtab(int speed, struct speedtab *table);
  */
 static struct speedtab compatspeeds[] = {
 #define MAX_SPEED       17
-       { 115200, 17 },
-       { 57600, 16 },
-       { 38400, 15 },
-       { 19200, 14 },
-       { 9600, 13 },
-       { 4800, 12 },
-       { 2400, 11 },
-       { 1800, 10 },
-       { 1200, 9 },
-       { 600, 8 },
-       { 300, 7 },
-       { 200, 6 },
-       { 150, 5 },
-       { 134, 4 },
-       { 110, 3 },
-       { 75, 2 },
-       { 50, 1 },
-       { 0, 0 },
-       { -1, -1 },
+       { .sp_speed = 115200, .sp_code = 17 },
+       { .sp_speed = 57600, .sp_code = 16 },
+       { .sp_speed = 38400, .sp_code = 15 },
+       { .sp_speed = 19200, .sp_code = 14 },
+       { .sp_speed = 9600, .sp_code = 13 },
+       { .sp_speed = 4800, .sp_code = 12 },
+       { .sp_speed = 2400, .sp_code = 11 },
+       { .sp_speed = 1800, .sp_code = 10 },
+       { .sp_speed = 1200, .sp_code = 9 },
+       { .sp_speed = 600, .sp_code = 8 },
+       { .sp_speed = 300, .sp_code = 7 },
+       { .sp_speed = 200, .sp_code = 6 },
+       { .sp_speed = 150, .sp_code = 5 },
+       { .sp_speed = 134, .sp_code = 4 },
+       { .sp_speed = 110, .sp_code = 3 },
+       { .sp_speed = 75, .sp_code = 2 },
+       { .sp_speed = 50, .sp_code = 1 },
+       { .sp_speed = 0, .sp_code = 0 },
+       { .sp_speed = -1, .sp_code = -1 },
 };
 static int compatspcodes[] = {
        0, 50, 75, 110, 134, 150, 200, 300, 600, 1200,
index ccfd752fb142254bb2df75c0dd272ed887dd8d09..302b76aa53521f2879a69601126a16d7274c0ca6 100644 (file)
@@ -506,6 +506,16 @@ out:
        return;
 }
 
+static void
+ptcwakeup_knote(struct selinfo *sip, long hint)
+{
+       if ((sip->si_flags & SI_KNPOSTING) == 0) {
+               sip->si_flags |= SI_KNPOSTING;
+               KNOTE(&sip->si_note, hint);
+               sip->si_flags &= ~SI_KNPOSTING;
+       }
+}
+
 /*
  * Locks:      Assumes tty_lock() is held over this call.
  */
@@ -520,12 +530,12 @@ ptcwakeup(struct tty *tp, int flag)
        if (flag & FREAD) {
                selwakeup(&pti->pt_selr);
                wakeup(TSA_PTC_READ(tp));
-               KNOTE(&pti->pt_selr.si_note, 1);
+               ptcwakeup_knote(&pti->pt_selr, 1);
        }
        if (flag & FWRITE) {
                selwakeup(&pti->pt_selw);
                wakeup(TSA_PTC_WRITE(tp));
-               KNOTE(&pti->pt_selw.si_note, 1);
+               ptcwakeup_knote(&pti->pt_selw, 1);
        }
 }
 
@@ -1011,6 +1021,9 @@ block:
        goto again;
 }
 
+/*
+ * ptyioctl: Assumes dev was opened and lock was initialized
+ */
 __private_extern__ int
 ptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
 {
@@ -1020,9 +1033,10 @@ ptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
        int stop, error = 0;
        int allow_ext_ioctl = 1;
 
-       if (pti == NULL) {
+       if (pti == NULL || pti->pt_tty == NULL) {
                return ENXIO;
        }
+
        tp = pti->pt_tty;
        tty_lock(tp);
 
index 6da6e641abaf9a2b2d0729656718aec67c57af38..8f0ba28e43623188d9c27455cabf2bf3a598be73 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 1997-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -118,18 +118,38 @@ extern  d_select_t      ptcselect;
 
 static int ptmx_major;          /* dynamically assigned major number */
 static struct cdevsw ptmx_cdev = {
-       ptcopen, ptcclose, ptcread, ptcwrite,
-       ptyioctl, ptcstop, ptcreset, 0,
-       ptcselect, eno_mmap, eno_strat, eno_getc,
-       eno_putc, D_TTY
+       .d_open       = ptcopen,
+       .d_close      = ptcclose,
+       .d_read       = ptcread,
+       .d_write      = ptcwrite,
+       .d_ioctl      = ptyioctl,
+       .d_stop       = ptcstop,
+       .d_reset      = ptcreset,
+       .d_ttys       = NULL,
+       .d_select     = ptcselect,
+       .d_mmap       = eno_mmap,
+       .d_strategy   = eno_strat,
+       .d_reserved_1 = eno_getc,
+       .d_reserved_2 = eno_putc,
+       .d_type       = D_TTY
 };
 
 static int ptsd_major;          /* dynamically assigned major number */
 static struct cdevsw ptsd_cdev = {
-       ptsopen, ptsclose, ptsread, ptswrite,
-       ptyioctl, ptsstop, ptsreset, 0,
-       ptsselect, eno_mmap, eno_strat, eno_getc,
-       eno_putc, D_TTY
+       .d_open       = ptsopen,
+       .d_close      = ptsclose,
+       .d_read       = ptsread,
+       .d_write      = ptswrite,
+       .d_ioctl      = ptyioctl,
+       .d_stop       = ptsstop,
+       .d_reset      = ptsreset,
+       .d_ttys       = NULL,
+       .d_select     = ptsselect,
+       .d_mmap       = eno_mmap,
+       .d_strategy   = eno_strat,
+       .d_reserved_1 = eno_getc,
+       .d_reserved_2 = eno_putc,
+       .d_type       = D_TTY
 };
 
 /*
@@ -467,8 +487,8 @@ ptmx_clone(__unused dev_t dev, int action)
 int ptsd_kqfilter(dev_t dev, struct knote *kn);
 static void ptsd_kqops_detach(struct knote *);
 static int ptsd_kqops_event(struct knote *, long);
-static int ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev);
-static int ptsd_kqops_process(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int ptsd_kqops_touch(struct knote *kn, struct kevent_qos_s *kev);
+static int ptsd_kqops_process(struct knote *kn, struct kevent_qos_s *kev);
 
 SECURITY_READ_ONLY_EARLY(struct filterops) ptsd_kqops = {
        .f_isfd = 1,
@@ -491,10 +511,7 @@ SECURITY_READ_ONLY_EARLY(struct filterops) ptsd_kqops = {
 static void
 ptsd_kqops_detach(struct knote *kn)
 {
-       struct tty *tp;
-
-       tp = kn->kn_hook;
-       assert(tp != NULL);
+       struct tty *tp = kn->kn_hook;
 
        tty_lock(tp);
 
@@ -507,42 +524,41 @@ ptsd_kqops_detach(struct knote *kn)
                case EVFILT_READ:
                        KNOTE_DETACH(&tp->t_rsel.si_note, kn);
                        break;
-
                case EVFILT_WRITE:
                        KNOTE_DETACH(&tp->t_wsel.si_note, kn);
                        break;
-
                default:
                        panic("invalid knote %p detach, filter: %d", kn, kn->kn_filter);
                        break;
                }
        }
 
-       kn->kn_hook = NULL;
        tty_unlock(tp);
-
        ttyfree(tp);
 }
 
 static int
-ptsd_kqops_common(struct knote *kn, struct tty *tp)
+ptsd_kqops_common(struct knote *kn, struct kevent_qos_s *kev, struct tty *tp)
 {
        int retval = 0;
+       int64_t data = 0;
 
        TTY_LOCK_OWNED(tp);
 
        switch (kn->kn_filter) {
        case EVFILT_READ:
-               kn->kn_data = ttnread(tp);
-               if (kn->kn_data > 0) {
-                       retval = 1;
-               }
+               /*
+                * ttnread can change the tty state, so it must
+                * be called upfront, before any other check.
+                */
+               data = ttnread(tp);
+               retval = (data > 0);
                break;
 
        case EVFILT_WRITE:
                if ((tp->t_outq.c_cc <= tp->t_lowat) &&
                    (tp->t_state & TS_CONNECTED)) {
-                       kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc;
+                       data = tp->t_outq.c_cn - tp->t_outq.c_cc;
                        retval = 1;
                }
                break;
@@ -555,9 +571,13 @@ ptsd_kqops_common(struct knote *kn, struct tty *tp)
 
        if (tp->t_state & TS_ZOMBIE) {
                kn->kn_flags |= EV_EOF;
+       }
+       if (kn->kn_flags & EV_EOF) {
                retval = 1;
        }
-
+       if (retval && kev) {
+               knote_fill_kevent(kn, kev, data);
+       }
        return retval;
 }
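The detach/common rewrites above converge on one pattern: a single *_kqops_common() evaluator serves f_event, f_touch, and f_process, taking an optional output kevent. Callers that only need the fired/not-fired answer pass NULL; f_process passes a real kevent and the evaluator publishes the payload through knote_fill_kevent() instead of each call site copying kn_kevent and clearing state by hand. A hedged sketch of the shape (demo names; knote_fill_kevent and the types are xnu's):

    /* one evaluator; kev == NULL means "poll only, don't deliver" */
    static int
    demo_filt_common(struct knote *kn, struct kevent_qos_s *kev, int64_t avail)
    {
        int fired = (avail > 0);

        if (fired && kev) {
            /* copies the knote's identity/flags and stores avail as data */
            knote_fill_kevent(kn, kev, avail);
        }
        return fired;
    }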
 
@@ -566,35 +586,25 @@ ptsd_kqops_event(struct knote *kn, long hint)
 {
        struct tty *tp = kn->kn_hook;
        int ret;
-       bool revoked = hint & NOTE_REVOKE;
-       hint &= ~NOTE_REVOKE;
 
-       if (!hint) {
-               tty_lock(tp);
-       }
+       TTY_LOCK_OWNED(tp);
 
-       if (revoked) {
+       if (hint & NOTE_REVOKE) {
                kn->kn_flags |= EV_EOF | EV_ONESHOT;
                ret = 1;
        } else {
-               ret = ptsd_kqops_common(kn, tp);
-       }
-
-       if (!hint) {
-               tty_unlock(tp);
+               ret = ptsd_kqops_common(kn, NULL, tp);
        }
 
        return ret;
 }
 
 static int
-ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev)
+ptsd_kqops_touch(struct knote *kn, struct kevent_qos_s *kev)
 {
-       struct tty *tp;
+       struct tty *tp = kn->kn_hook;
        int ret;
 
-       tp = kn->kn_hook;
-
        tty_lock(tp);
 
        /* accept new kevent state */
@@ -602,7 +612,7 @@ ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev)
        kn->kn_sdata = kev->data;
 
        /* recapture fired state of knote */
-       ret = ptsd_kqops_common(kn, tp);
+       ret = ptsd_kqops_common(kn, NULL, tp);
 
        tty_unlock(tp);
 
@@ -610,21 +620,13 @@ ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-ptsd_kqops_process(struct knote *kn, __unused struct filt_process_s *data,
-    struct kevent_internal_s *kev)
+ptsd_kqops_process(struct knote *kn, struct kevent_qos_s *kev)
 {
        struct tty *tp = kn->kn_hook;
        int ret;
 
        tty_lock(tp);
-       ret = ptsd_kqops_common(kn, tp);
-       if (ret) {
-               *kev = kn->kn_kevent;
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_fflags = 0;
-                       kn->kn_data = 0;
-               }
-       }
+       ret = ptsd_kqops_common(kn, kev, tp);
        tty_unlock(tp);
 
        return ret;
@@ -672,7 +674,7 @@ ptsd_kqfilter(dev_t dev, struct knote *kn)
        }
 
        /* capture current event state */
-       ret = ptsd_kqops_common(kn, tp);
+       ret = ptsd_kqops_common(kn, NULL, tp);
 
        tty_unlock(tp);
 
@@ -688,10 +690,12 @@ ptsd_revoke_knotes(__unused int minor, struct tty *tp)
        tty_lock(tp);
 
        ttwakeup(tp);
-       KNOTE(&tp->t_rsel.si_note, NOTE_REVOKE | 1 /* the lock is already held */);
+       assert((tp->t_rsel.si_flags & SI_KNPOSTING) == 0);
+       KNOTE(&tp->t_rsel.si_note, NOTE_REVOKE);
 
        ttwwakeup(tp);
-       KNOTE(&tp->t_wsel.si_note, NOTE_REVOKE | 1);
+       assert((tp->t_wsel.si_flags & SI_KNPOSTING) == 0);
+       KNOTE(&tp->t_wsel.si_note, NOTE_REVOKE);
 
        tty_unlock(tp);
 }
@@ -706,9 +710,10 @@ ptsd_revoke_knotes(__unused int minor, struct tty *tp)
 int ptmx_kqfilter(dev_t dev, struct knote *kn);
 static void ptmx_kqops_detach(struct knote *);
 static int ptmx_kqops_event(struct knote *, long);
-static int ptmx_kqops_touch(struct knote *kn, struct kevent_internal_s *kev);
-static int ptmx_kqops_process(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
-static int ptmx_kqops_common(struct knote *kn, struct ptmx_ioctl *pti, struct tty *tp);
+static int ptmx_kqops_touch(struct knote *kn, struct kevent_qos_s *kev);
+static int ptmx_kqops_process(struct knote *kn, struct kevent_qos_s *kev);
+static int ptmx_kqops_common(struct knote *kn, struct kevent_qos_s *kev,
+    struct ptmx_ioctl *pti, struct tty *tp);
 
 SECURITY_READ_ONLY_EARLY(struct filterops) ptmx_kqops = {
        .f_isfd = 1,
@@ -728,8 +733,7 @@ ptmx_knote_ioctl(struct knote *kn)
 static struct tty *
 ptmx_knote_tty(struct knote *kn)
 {
-       struct ptmx_ioctl *pti = kn->kn_hook;
-       return pti->pt_tty;
+       return ptmx_knote_ioctl(kn)->pt_tty;
 }
 
 int
@@ -754,6 +758,8 @@ ptmx_kqfilter(dev_t dev, struct knote *kn)
        tty_lock(tp);
 
        kn->kn_filtid = EVFILTID_PTMX;
+       /* the tty will be freed when detaching the knote */
+       ttyhold(tp);
        kn->kn_hook = pti;
 
        /*
@@ -775,10 +781,8 @@ ptmx_kqfilter(dev_t dev, struct knote *kn)
        }
 
        /* capture current event state */
-       ret = ptmx_kqops_common(kn, pti, tp);
+       ret = ptmx_kqops_common(kn, NULL, pti, tp);
 
-       /* take a reference on the TTY */
-       ttyhold(tp);
        tty_unlock(tp);
 
        return ret;
@@ -790,49 +794,39 @@ ptmx_kqops_detach(struct knote *kn)
        struct ptmx_ioctl *pti = kn->kn_hook;
        struct tty *tp = pti->pt_tty;
 
-       assert(tp != NULL);
-
        tty_lock(tp);
 
        switch (kn->kn_filter) {
        case EVFILT_READ:
                KNOTE_DETACH(&pti->pt_selr.si_note, kn);
                break;
-
        case EVFILT_WRITE:
                KNOTE_DETACH(&pti->pt_selw.si_note, kn);
                break;
-
        default:
                panic("invalid knote %p detach, filter: %d", kn, kn->kn_filter);
                break;
        }
 
-       kn->kn_hook = NULL;
        tty_unlock(tp);
-
        ttyfree(tp);
 }
 
 static int
-ptmx_kqops_common(struct knote *kn, struct ptmx_ioctl *pti, struct tty *tp)
+ptmx_kqops_common(struct knote *kn, struct kevent_qos_s *kev,
+    struct ptmx_ioctl *pti, struct tty *tp)
 {
        int retval = 0;
+       int64_t data = 0;
 
        TTY_LOCK_OWNED(tp);
 
-       /* disconnects should force a wakeup (EOF) */
-       if (!(tp->t_state & TS_CONNECTED)) {
-               kn->kn_flags |= EV_EOF;
-               return 1;
-       }
-
        switch (kn->kn_filter) {
        case EVFILT_READ:
                /* there's data on the TTY and it's not stopped */
                if (tp->t_outq.c_cc && !(tp->t_state & TS_TTSTOP)) {
-                       retval = tp->t_outq.c_cc;
-                       kn->kn_data = retval;
+                       data = tp->t_outq.c_cc;
+                       retval = data > 0;
                } else if (((pti->pt_flags & PF_PKT) && pti->pt_send) ||
                    ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl)) {
                        retval = 1;
@@ -861,11 +855,16 @@ ptmx_kqops_common(struct knote *kn, struct ptmx_ioctl *pti, struct tty *tp)
                break;
        }
 
-       if (tp->t_state & TS_ZOMBIE) {
+       /* disconnects should force a wakeup (EOF) */
+       if (!(tp->t_state & TS_CONNECTED) || (tp->t_state & TS_ZOMBIE)) {
                kn->kn_flags |= EV_EOF;
+       }
+       if (kn->kn_flags & EV_EOF) {
                retval = 1;
        }
-
+       if (retval && kev) {
+               knote_fill_kevent(kn, kev, data);
+       }
        return retval;
 }
 
@@ -875,29 +874,21 @@ ptmx_kqops_event(struct knote *kn, long hint)
        struct ptmx_ioctl *pti = ptmx_knote_ioctl(kn);
        struct tty *tp = ptmx_knote_tty(kn);
        int ret;
-       bool revoked = hint & NOTE_REVOKE;
-       hint &= ~NOTE_REVOKE;
 
-       if (!hint) {
-               tty_lock(tp);
-       }
+       TTY_LOCK_OWNED(tp);
 
-       if (revoked) {
+       if (hint & NOTE_REVOKE) {
                kn->kn_flags |= EV_EOF | EV_ONESHOT;
                ret = 1;
        } else {
-               ret = ptmx_kqops_common(kn, pti, tp);
-       }
-
-       if (!hint) {
-               tty_unlock(tp);
+               ret = ptmx_kqops_common(kn, NULL, pti, tp);
        }
 
        return ret;
 }
 
 static int
-ptmx_kqops_touch(struct knote *kn, struct kevent_internal_s *kev)
+ptmx_kqops_touch(struct knote *kn, struct kevent_qos_s *kev)
 {
        struct ptmx_ioctl *pti = ptmx_knote_ioctl(kn);
        struct tty *tp = ptmx_knote_tty(kn);
@@ -910,7 +901,7 @@ ptmx_kqops_touch(struct knote *kn, struct kevent_internal_s *kev)
        kn->kn_sdata = kev->data;
 
        /* recapture fired state of knote */
-       ret = ptmx_kqops_common(kn, pti, tp);
+       ret = ptmx_kqops_common(kn, NULL, pti, tp);
 
        tty_unlock(tp);
 
@@ -918,22 +909,14 @@ ptmx_kqops_touch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-ptmx_kqops_process(struct knote *kn, __unused struct filt_process_s *data,
-    struct kevent_internal_s *kev)
+ptmx_kqops_process(struct knote *kn, struct kevent_qos_s *kev)
 {
        struct ptmx_ioctl *pti = ptmx_knote_ioctl(kn);
        struct tty *tp = ptmx_knote_tty(kn);
        int ret;
 
        tty_lock(tp);
-       ret = ptmx_kqops_common(kn, pti, tp);
-       if (ret) {
-               *kev = kn->kn_kevent;
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_fflags = 0;
-                       kn->kn_data = 0;
-               }
-       }
+       ret = ptmx_kqops_common(kn, kev, pti, tp);
        tty_unlock(tp);
 
        return ret;
index 01c615b283214cca85b56248103dcb85b5025f26..cc16291df411300a2a74ab622ed39b35ea5d4456 100644 (file)
@@ -82,6 +82,8 @@ extern kern_return_t memory_object_pages_resident(memory_object_control_t,
 extern kern_return_t    memory_object_signed(memory_object_control_t control,
     boolean_t is_signed);
 extern boolean_t        memory_object_is_signed(memory_object_control_t);
+extern void             memory_object_mark_trusted(
+       memory_object_control_t         control);
 
 /* XXX Same for those. */
 
@@ -1937,6 +1939,33 @@ ubc_map(vnode_t vp, int flags)
                        if (vnode_ref_ext(vp, 0, VNODE_REF_FORCE)) {
                                panic("%s : VNODE_REF_FORCE failed\n", __FUNCTION__);
                        }
+                       /*
+                        * Vnodes that are on "unreliable" media (like disk
+                        * images, network filesystems, 3rd-party filesystems,
+                        * and possibly external devices) could have their
+                        * contents changed via the backing store without
+                        * triggering copy-on-write, so we can't fully rely
+                        * on copy-on-write and might have to resort to
+                        * copy-on-read to protect "privileged" processes and
+                        * prevent privilege escalation.
+                        *
+                        * The root filesystem is considered "reliable" because
+                        * there's not much point in trying to protect
+                        * ourselves from such a vulnerability and the extra
+                        * cost of copy-on-read (CPU time and memory pressure)
+                        * could result in some serious regressions.
+                        */
+                       if (vp->v_mount != NULL &&
+                           ((vp->v_mount->mnt_flag & MNT_ROOTFS) ||
+                           vnode_on_reliable_media(vp))) {
+                               /*
+                                * This vnode is deemed "reliable" so mark
+                                * its VM object as "trusted".
+                                */
+                               memory_object_mark_trusted(uip->ui_control);
+                       } else {
+//                             printf("BUGGYCOW: %s:%d vp %p \"%s\" in mnt %p \"%s\" is untrusted\n", __FUNCTION__, __LINE__, vp, vp->v_name, vp->v_mount, vp->v_mount->mnt_vnodecovered->v_name);
+                       }
                }
        }
        return error;
index 6b798e832a63252ed94702ba98303620d7af6bf0..9321399dce0148eb971c90171a5c4c9df0042a86 100644 (file)
@@ -107,6 +107,7 @@ decl_lck_mtx_data(static, domain_proto_mtx);
 decl_lck_mtx_data(static, domain_timeout_mtx);
 
 u_int64_t _net_uptime;
+u_int64_t _net_uptime_ms;
 
 #if (DEVELOPMENT || DEBUG)
 
@@ -1003,6 +1004,10 @@ net_update_uptime_with_time(const struct timeval *tvp)
        if (tvp->tv_usec > 500000) {
                _net_uptime++;
        }
+
+       /* update milliseconds variant */
+       _net_uptime_ms = (((u_int64_t)tvp->tv_sec * 1000) +
+           ((u_int64_t)tvp->tv_usec / 1000));
 }
 
 void
@@ -1044,6 +1049,16 @@ net_uptime(void)
        return _net_uptime;
 }
 
+u_int64_t
+net_uptime_ms(void)
+{
+       if (_net_uptime_ms == 0) {
+               net_update_uptime();
+       }
+
+       return _net_uptime_ms;
+}
+
 void
 domain_proto_mtx_lock_assert_held(void)
 {
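net_uptime_ms() mirrors net_uptime() above: both lazily refresh the cached value on first use, and the millisecond variant is maintained alongside the seconds one in net_update_uptime_with_time(). The conversion is plain truncating arithmetic; an illustrative user-space analogue:

    #include <stdint.h>
    #include <sys/time.h>

    /* same math as the kernel's update: seconds and microseconds to ms */
    static uint64_t
    tv_to_ms(const struct timeval *tvp)
    {
        return ((uint64_t)tvp->tv_sec * 1000) +
               ((uint64_t)tvp->tv_usec / 1000);
    }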
index c5b86b7b82aceba280bf65fc3466c56df6e19133..db7acd2ed8d6fcae37b21130b76a69da67dc987c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -998,7 +998,7 @@ struct omb_stat *omb_stat;      /* For backwards compatibility */
 #define MB_STAT_SIZE(n) \
        __builtin_offsetof(mb_stat_t, mbs_class[n])
 #define OMB_STAT_SIZE(n) \
-       ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
+       __builtin_offsetof(struct omb_stat, mbs_class[n])
 
 /*
  * The legacy structure holding all of the mbuf allocation statistics.
@@ -1038,7 +1038,7 @@ typedef struct {
 static mbuf_mtypes_t *mbuf_mtypes;      /* per-CPU statistics */
 
 #define MBUF_MTYPES_SIZE(n) \
-       ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
+       __builtin_offsetof(mbuf_mtypes_t, mbs_cpu[n])
 
 #define MTYPES_CPU(p) \
        ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
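Both size macros above drop the classic null-pointer cast, ((size_t)&((T *)0)->field[n]), which formally dereferences a null pointer (undefined behavior, and a trap under UBSan), in favor of the compiler builtin. The portable spelling is offsetof from <stddef.h>; GCC and Clang also accept an array-element designator, which is what these macros rely on. An illustrative equivalent:

    #include <stddef.h>
    #include <stdio.h>

    struct demo_stat {
        unsigned count;
        long     per_class[8];
    };

    /* header plus n trailing elements, no null dereference involved */
    #define DEMO_STAT_SIZE(n) offsetof(struct demo_stat, per_class[n])

    int
    main(void)
    {
        printf("%zu\n", DEMO_STAT_SIZE(4));
        return 0;
    }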
@@ -1268,7 +1268,7 @@ m_incref(struct mbuf *m)
        do {
                old = *addr;
                new = old + 1;
-               ASSERT(new != 0);
+               VERIFY(new != 0);
        } while (!OSCompareAndSwap16(old, new, addr));
 
        /*
@@ -1290,7 +1290,7 @@ m_decref(struct mbuf *m)
        do {
                old = *addr;
                new = old - 1;
-               ASSERT(old != 0);
+               VERIFY(old != 0);
        } while (!OSCompareAndSwap16(old, new, addr));
 
        return new;
@@ -4686,7 +4686,7 @@ fail:
                mcache_free_ext(rcp, rmp_list);
        }
        if (wantall && top != NULL) {
-               m_freem(top);
+               m_freem_list(top);
                return NULL;
        }
        *numlist = num;
@@ -5576,6 +5576,8 @@ m_copyup(struct mbuf *n, int len, int dstoff)
        struct mbuf *m;
        int count, space;
 
+       VERIFY(len >= 0 && dstoff >= 0);
+
        if (len > (MHLEN - dstoff)) {
                goto bad;
        }
@@ -6348,6 +6350,9 @@ m_dup(struct mbuf *m, int how)
                                (void) m_free(n);
                                goto nospace;
                        }
+               } else {
+                       VERIFY((copyhdr == 1 && m->m_len <= MHLEN) ||
+                           (copyhdr == 0 && m->m_len <= MLEN));
                }
                *np = n;
                if (copyhdr) {
@@ -7455,6 +7460,7 @@ mcl_audit_scratch(mcache_audit_t *mca)
        }
 }
 
+__abortlike
 static void
 mcl_audit_mcheck_panic(struct mbuf *m)
 {
@@ -7535,7 +7541,7 @@ mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
 
        if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
                uintptr_t bt[MLEAK_STACK_DEPTH];
-               int logged = backtrace(bt, MLEAK_STACK_DEPTH);
+               int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL);
                mleak_log(bt, addr, logged, num);
        }
 }
@@ -8800,7 +8806,7 @@ mtracelarge_register(size_t size)
        uintptr_t bt[MLEAK_STACK_DEPTH];
        unsigned int depth;
 
-       depth = backtrace(bt, MLEAK_STACK_DEPTH);
+       depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL);
        /* Check if this entry is already on the list. */
        for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
                trace = &mtracelarge_table[i];
index 90bef57a037aaaaab5b97ed08b39c4f89d8db264..2efefbc3359f58dff91bb3953b3fa2ac615c2d1c 100644 (file)
@@ -133,12 +133,10 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp)
        struct mbuf *n = NULL, *o = NULL;
        int hlen = 0, tlen = 0, olen = 0;
        int sharedcluster = 0;
-#if defined(PULLDOWN_STAT) && INET6
-       static struct mbuf *prev = NULL;
-       int prevlen = 0, prevmlen = 0;
-#endif
 
        /* check invalid arguments. */
+       VERIFY(len >= 0 && off >= 0);
+
        if (m == NULL) {
                panic("m == NULL in m_pulldown()");
        }
@@ -146,73 +144,12 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp)
                m_freem(m);
                return NULL;    /* impossible */
        }
-
-#if defined(PULLDOWN_STAT) && INET6
-       ip6stat.ip6s_pulldown++;
-#endif
-
-#if defined(PULLDOWN_STAT) && INET6
-       /* statistics for m_pullup */
-       ip6stat.ip6s_pullup++;
-       if (off + len > MHLEN) {
-               ip6stat.ip6s_pullup_fail++;
-       } else {
-               int dlen, mlen;
-
-               dlen = (prev == m) ? prevlen : m->m_len;
-               mlen = (prev == m) ? prevmlen : m->m_len + M_TRAILINGSPACE(m);
-
-               if (dlen >= off + len) {
-                       ip6stat.ip6s_pullup--; /* call will not be made! */
-               } else if ((m->m_flags & M_EXT) != 0) {
-                       ip6stat.ip6s_pullup_alloc++;
-                       ip6stat.ip6s_pullup_copy++;
-               } else {
-                       if (mlen >= off + len) {
-                               ip6stat.ip6s_pullup_copy++;
-                       } else {
-                               ip6stat.ip6s_pullup_alloc++;
-                               ip6stat.ip6s_pullup_copy++;
-                       }
-               }
-
-               prevlen = off + len;
-               prevmlen = MHLEN;
-       }
-
-       /* statistics for m_pullup2 */
-       ip6stat.ip6s_pullup2++;
-       if (off + len > MCLBYTES) {
-               ip6stat.ip6s_pullup2_fail++;
-       } else {
-               int dlen, mlen;
-
-               dlen = (prev == m) ? prevlen : m->m_len;
-               mlen = (prev == m) ? prevmlen : m->m_len + M_TRAILINGSPACE(m);
-               prevlen = off + len;
-               prevmlen = mlen;
-
-               if (dlen >= off + len) {
-                       ip6stat.ip6s_pullup2--; /* call will not be made! */
-               } else if ((m->m_flags & M_EXT) != 0) {
-                       ip6stat.ip6s_pullup2_alloc++;
-                       ip6stat.ip6s_pullup2_copy++;
-                       prevmlen = (off + len > MHLEN) ? MCLBYTES : MHLEN;
-               } else {
-                       if (mlen >= off + len) {
-                               ip6stat.ip6s_pullup2_copy++;
-                       } else {
-                               ip6stat.ip6s_pullup2_alloc++;
-                               ip6stat.ip6s_pullup2_copy++;
-                               prevmlen = (off + len > MHLEN) ? MCLBYTES
-                                   : MHLEN;
-                       }
-               }
+       int tmp_len = 0;
+       if (os_add_overflow(off, len, &tmp_len)) {
+               m_free(m);
+               return NULL;
        }
 
-       prev = m;
-#endif
-
 #ifdef PULLDOWN_DEBUG
        {
                struct mbuf *t;
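The new os_add_overflow() guard rejects off/len pairs whose sum wraps before any of the mbuf walking below runs, closing an integer-overflow hole in the argument checks. os_add_overflow comes from xnu's <os/overflow.h> and wraps the compiler's checked-add builtin; an illustrative stand-alone equivalent:

    #include <stdbool.h>
    #include <stdio.h>

    /* __builtin_add_overflow returns true (with *sum wrapped) on overflow */
    static bool
    checked_add(int a, int b, int *sum)
    {
        return __builtin_add_overflow(a, b, sum);
    }

    int
    main(void)
    {
        int s;
        if (checked_add(2000000000, 2000000000, &s)) {
            puts("overflow detected, reject the request");
        }
        return 0;
    }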
@@ -267,10 +204,6 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp)
                goto ok;
        }
 
-#if defined(PULLDOWN_STAT) && INET6
-       ip6stat.ip6s_pulldown_copy++;
-#endif
-
        /*
         * when len <= n->m_len - off and off != 0, it is a special case.
         * len bytes from <n, off> sits in single mbuf, but the caller does
@@ -364,9 +297,6 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp)
         * now, we need to do the hard way.  don't m_copy as there's no room
         * on both end.
         */
-#if defined(PULLDOWN_STAT) && INET6
-       ip6stat.ip6s_pulldown_alloc++;
-#endif
        MGET(o, M_DONTWAIT, m->m_type);
        if (o == NULL) {
                m_freem(m);
index 4d6e12a7036bae05037a16fc5c55092140bcb6a6..dc2bd511cc9d8db0d9e33ce5440059328d8667cb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_tclass.h>
+#include <netinet/in_var.h>
 #include <netinet/tcp_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
@@ -159,23 +160,23 @@ static lck_mtx_t        *so_cache_mtx;
 
 #include <machine/limits.h>
 
-static int      filt_sorattach(struct knote *kn, struct kevent_internal_s *kev);
+static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
 static void     filt_sordetach(struct knote *kn);
 static int      filt_soread(struct knote *kn, long hint);
-static int      filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
-static int      filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
+static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
 
-static int      filt_sowattach(struct knote *kn, struct kevent_internal_s *kev);
+static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
 static void     filt_sowdetach(struct knote *kn);
 static int      filt_sowrite(struct knote *kn, long hint);
-static int      filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
-static int      filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
+static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
 
-static int      filt_sockattach(struct knote *kn, struct kevent_internal_s *kev);
+static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
 static void     filt_sockdetach(struct knote *kn);
 static int      filt_sockev(struct knote *kn, long hint);
-static int      filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
-static int      filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
+static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
 
 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
@@ -550,6 +551,9 @@ so_update_last_owner_locked(struct socket *so, proc_t self)
                        so->last_pid = proc_pid(self);
                        proc_getexecutableuuid(self, so->last_uuid,
                            sizeof(so->last_uuid));
+                       if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
+                               (*so->so_proto->pr_update_last_owner)(so, self, NULL);
+                       }
                }
                proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
        }
@@ -736,7 +740,7 @@ socreate_internal(int dom, struct socket **aso, int type, int proto,
                break;
        }
 
-       if (flags & SOCF_ASYNC) {
+       if (flags & SOCF_MPTCP) {
                so->so_state |= SS_NBIO;
        }
 
@@ -791,6 +795,13 @@ socreate_internal(int dom, struct socket **aso, int type, int proto,
                return error;
        }
 
+       /*
+        * Note: needs so_pcb, so this must be done after pru_attach
+        */
+       if (prp->pr_update_last_owner != NULL) {
+               (*prp->pr_update_last_owner)(so, p, ep);
+       }
+
        atomic_add_32(&prp->pr_domain->dom_refs, 1);
        TAILQ_INIT(&so->so_evlist);
 
@@ -807,8 +818,8 @@ socreate_internal(int dom, struct socket **aso, int type, int proto,
         * If this thread or task is marked to create backgrounded sockets,
         * mark the socket as background.
         */
-       if (proc_get_effective_thread_policy(current_thread(),
-           TASK_POLICY_NEW_SOCKETS_BG)) {
+       if (!(flags & SOCF_MPTCP) &&
+           proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
                socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
                so->so_background_thread = current_thread();
        }
@@ -1979,7 +1990,7 @@ defunct:
                            !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
                                return ENOTCONN;
                        }
-               } else if (addr == 0 && !(flags & MSG_HOLD)) {
+               } else if (addr == 0) {
                        return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
                               ENOTCONN : EDESTADDRREQ;
                }
@@ -2049,10 +2060,6 @@ defunct:
  * Returns nonzero on error, timeout or signal; callers
  * must check for short counts if EINTR/ERESTART are returned.
  * Data and control buffers are freed on return.
- * Experiment:
- * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
- * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
- *  point at the mbuf chain being constructed and go from there.
  *
  * Returns:    0                       Success
  *             EOPNOTSUPP
@@ -2446,29 +2453,6 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
                                }
                        }
 
-                       if (flags & (MSG_HOLD | MSG_SEND)) {
-                               /* Enqueue for later, go away if HOLD */
-                               struct mbuf *mb1;
-                               if (so->so_temp && (flags & MSG_FLUSH)) {
-                                       m_freem(so->so_temp);
-                                       so->so_temp = NULL;
-                               }
-                               if (so->so_temp) {
-                                       so->so_tail->m_next = top;
-                               } else {
-                                       so->so_temp = top;
-                               }
-                               mb1 = top;
-                               while (mb1->m_next) {
-                                       mb1 = mb1->m_next;
-                               }
-                               so->so_tail = mb1;
-                               if (flags & MSG_HOLD) {
-                                       top = NULL;
-                                       goto out_locked;
-                               }
-                               top = so->so_temp;
-                       }
                        if (dontroute) {
                                so->so_options |= SO_DONTROUTE;
                        }
@@ -2531,10 +2515,6 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
                        error = (*so->so_proto->pr_usrreqs->pru_send)
                            (so, sendflags, top, addr, control, p);
 
-                       if (flags & MSG_SEND) {
-                               so->so_temp = NULL;
-                       }
-
                        if (dontroute) {
                                so->so_options &= ~SO_DONTROUTE;
                        }
@@ -2587,7 +2567,7 @@ out_locked:
 int
 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
 {
-       struct mbuf *m0, *control_end;
+       struct mbuf *m0 = NULL, *control_end = NULL;
 
        socket_lock_assert_owned(so);
 
@@ -4566,6 +4546,33 @@ out:
        return error;
 }
 
+static int
+so_statistics_event_to_nstat_event(int64_t *input_options,
+    uint64_t *nstat_event)
+{
+       int error = 0;
+       switch (*input_options) {
+       case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
+               *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
+               break;
+       case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
+               *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
+               break;
+#if (DEBUG || DEVELOPMENT)
+       case SO_STATISTICS_EVENT_RESERVED_1:
+               *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
+               break;
+       case SO_STATISTICS_EVENT_RESERVED_2:
+               *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
+               break;
+#endif /* (DEBUG || DEVELOPMENT) */
+       default:
+               error = EINVAL;
+               break;
+       }
+       return error;
+}
+
 /*
  * Returns:    0                       Success
  *             EINVAL
@@ -4906,14 +4913,15 @@ sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
 }
 
 int
-soopt_cred_check(struct socket *so, int priv, boolean_t allow_root)
+soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
+    boolean_t ignore_delegate)
 {
        kauth_cred_t cred =  NULL;
        proc_t ep = PROC_NULL;
        uid_t uid;
        int error = 0;
 
-       if (so->so_flags & SOF_DELEGATED) {
+       if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
                ep = proc_find(so->e_pid);
                if (ep) {
                        cred = kauth_cred_proc_ref(ep);
@@ -4960,6 +4968,7 @@ int
 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
 {
        int     error, optval;
+       int64_t long_optval;
        struct  linger l;
        struct  timeval tv;
 #if CONFIG_MACF_SOCKET
@@ -5240,7 +5249,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                        }
                        if (optval != 0) {
                                error = soopt_cred_check(so,
-                                   PRIV_NET_RESTRICTED_AWDL, false);
+                                   PRIV_NET_RESTRICTED_AWDL, false, false);
                                if (error == 0) {
                                        inp_set_awdl_unrestricted(
                                                sotoinpcb(so));
@@ -5262,7 +5271,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                        if (optval != 0 &&
                            inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
                                error = soopt_cred_check(so,
-                                   PRIV_NET_RESTRICTED_INTCOPROC, false);
+                                   PRIV_NET_RESTRICTED_INTCOPROC, false, false);
                                if (error == 0) {
                                        inp_set_intcoproc_allowed(
                                                sotoinpcb(so));
@@ -5524,7 +5533,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                                break;
                        }
 
-                       error = so_set_effective_pid(so, optval, sopt->sopt_p);
+                       error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
                        break;
 
                case SO_DELEGATED_UUID: {
@@ -5535,7 +5544,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                                break;
                        }
 
-                       error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
+                       error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
                        break;
                }
 
@@ -5544,7 +5553,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                        error = necp_set_socket_attributes(so, sopt);
                        break;
 
-               case SO_NECP_CLIENTUUID:
+               case SO_NECP_CLIENTUUID: {
                        if (SOCK_DOM(so) == PF_MULTIPATH) {
                                /* Handled by MPTCP itself */
                                break;
@@ -5572,7 +5581,8 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                                goto out;
                        }
 
-                       error = necp_client_register_socket_flow(so->last_pid,
+                       pid_t current_pid = proc_pid(current_proc());
+                       error = necp_client_register_socket_flow(current_pid,
                            inp->necp_client_uuid, inp);
                        if (error != 0) {
                                uuid_clear(inp->necp_client_uuid);
@@ -5580,12 +5590,48 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                        }
 
                        if (inp->inp_lport != 0) {
-                               // There is bound local port, so this is not
+                               // There is a bound local port, so this is not
                                // a fresh socket. Assign to the client.
-                               necp_client_assign_from_socket(so->last_pid, inp->necp_client_uuid, inp);
+                               necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
                        }
 
                        break;
+               }
+               case SO_NECP_LISTENUUID: {
+                       if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
+                               error = EINVAL;
+                               goto out;
+                       }
+
+                       struct inpcb *inp = sotoinpcb(so);
+                       if (!uuid_is_null(inp->necp_client_uuid)) {
+                               error = EINVAL;
+                               goto out;
+                       }
+
+                       error = sooptcopyin(sopt, &inp->necp_client_uuid,
+                           sizeof(uuid_t), sizeof(uuid_t));
+                       if (error != 0) {
+                               goto out;
+                       }
+
+                       if (uuid_is_null(inp->necp_client_uuid)) {
+                               error = EINVAL;
+                               goto out;
+                       }
+
+                       error = necp_client_register_socket_listener(proc_pid(current_proc()),
+                           inp->necp_client_uuid, inp);
+                       if (error != 0) {
+                               uuid_clear(inp->necp_client_uuid);
+                               goto out;
+                       }
+
+                       // Mark that the port registration is held by NECP
+                       inp->inp_flags2 |= INP2_EXTERNAL_PORT;
+
+                       break;
+               }
 #endif /* NECP */
 
                case SO_EXTENDED_BK_IDLE:
@@ -5613,6 +5659,21 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                        }
                        break;
 
+               case SO_STATISTICS_EVENT:
+                       error = sooptcopyin(sopt, &long_optval,
+                           sizeof(long_optval), sizeof(long_optval));
+                       if (error != 0) {
+                               goto out;
+                       }
+                       u_int64_t nstat_event = 0;
+                       error = so_statistics_event_to_nstat_event(
+                               &long_optval, &nstat_event);
+                       if (error != 0) {
+                               goto out;
+                       }
+                       nstat_pcb_event(sotoinpcb(so), nstat_event);
+                       break;
+
                case SO_NET_SERVICE_TYPE: {
                        error = sooptcopyin(sopt, &optval, sizeof(optval),
                            sizeof(optval));
@@ -5641,6 +5702,24 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                        }
                        break;
 
+               case SO_MPKL_SEND_INFO: {
+                       struct so_mpkl_send_info so_mpkl_send_info;
+
+                       error = sooptcopyin(sopt, &so_mpkl_send_info,
+                           sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
+                       if (error != 0) {
+                               goto out;
+                       }
+                       uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
+                       so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
+
+                       if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
+                               so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
+                       } else {
+                               so->so_flags1 |= SOF1_MPKL_SEND_INFO;
+                       }
+                       break;
+               }
                default:
                        error = ENOPROTOOPT;
                        break;
@@ -5837,17 +5916,13 @@ integer:
 
                                m1 = so->so_rcv.sb_mb;
                                while (m1 != NULL) {
-                                       if (m1->m_type == MT_DATA ||
-                                           m1->m_type == MT_HEADER ||
-                                           m1->m_type == MT_OOBDATA) {
-                                               cnt += 1;
-                                       }
+                                       cnt += 1;
                                        m1 = m1->m_nextpkt;
                                }
                                optval = cnt;
                                goto integer;
                        } else {
-                               error = EINVAL;
+                               error = ENOPROTOOPT;
                                break;
                        }
 
@@ -6050,8 +6125,7 @@ integer:
                        error = necp_get_socket_attributes(so, sopt);
                        break;
 
-               case SO_NECP_CLIENTUUID:
-               {
+               case SO_NECP_CLIENTUUID: {
                        uuid_t *ncu;
 
                        if (SOCK_DOM(so) == PF_MULTIPATH) {
@@ -6066,6 +6140,25 @@ integer:
                        error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
                        break;
                }
+
+               case SO_NECP_LISTENUUID: {
+                       uuid_t *nlu;
+
+                       if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
+                               if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
+                                       nlu = &sotoinpcb(so)->necp_client_uuid;
+                               } else {
+                                       error = ENOENT;
+                                       goto out;
+                               }
+                       } else {
+                               error = EINVAL;
+                               goto out;
+                       }
+
+                       error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
+                       break;
+               }
 #endif /* NECP */
 
 #if CONTENT_FILTER
@@ -6099,6 +6192,15 @@ integer:
                        optval = so_get_netsvc_marking_level(so);
                        goto integer;
 
+               case SO_MPKL_SEND_INFO: {
+                       struct so_mpkl_send_info so_mpkl_send_info;
+
+                       uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
+                       so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
+                       error = sooptcopyout(sopt, &so_mpkl_send_info,
+                           sizeof(struct so_mpkl_send_info));
+                       break;
+               }
                default:
                        error = ENOPROTOOPT;
                        break;
@@ -6312,14 +6414,9 @@ sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
 }
 
 int
-soo_kqfilter(struct fileproc *fp, struct knote *kn,
-    struct kevent_internal_s *kev, vfs_context_t ctx)
+soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(fp)
-#if !CONFIG_MACF_SOCKET
-#pragma unused(ctx)
-#endif /* MAC_SOCKET */
-       struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
+       struct socket *so = (struct socket *)fp->f_fglob->fg_data;
        int result;
 
        socket_lock(so, 1);
@@ -6327,11 +6424,10 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn,
        so_update_policy(so);
 
 #if CONFIG_MACF_SOCKET
-       if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
-           kn, so) != 0) {
+       proc_t p = knote_get_kq(kn)->kq_p;
+       if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
                socket_unlock(so, 1);
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = EPERM;
+               knote_set_error(kn, EPERM);
                return 0;
        }
 #endif /* MAC_SOCKET */
@@ -6351,8 +6447,7 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn,
                break;
        default:
                socket_unlock(so, 1);
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = EINVAL;
+               knote_set_error(kn, EINVAL);
                return 0;
        }
 
@@ -6368,21 +6463,21 @@ soo_kqfilter(struct fileproc *fp, struct knote *kn,
 }
 
 static int
-filt_soread_common(struct knote *kn, struct socket *so)
+filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
 {
-       if (so->so_options & SO_ACCEPTCONN) {
-               int is_not_empty;
+       int retval = 0;
+       int64_t data = 0;
 
+       if (so->so_options & SO_ACCEPTCONN) {
                /*
                 * Radar 6615193 handle the listen case dynamically
                 * for the kqueue read filter. This allows calling listen()
                 * after registering the kqueue EVFILT_READ.
                 */
 
-               kn->kn_data = so->so_qlen;
-               is_not_empty = !TAILQ_EMPTY(&so->so_comp);
-
-               return is_not_empty;
+               retval = !TAILQ_EMPTY(&so->so_comp);
+               data = so->so_qlen;
+               goto out;
        }
 
        /* socket isn't a listener */
@@ -6391,13 +6486,14 @@ filt_soread_common(struct knote *kn, struct socket *so)
         * the bytes of protocol data. We therefore exclude any
         * control bytes.
         */
-       kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
+       data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
 
        if (kn->kn_sfflags & NOTE_OOB) {
                if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
                        kn->kn_fflags |= NOTE_OOB;
-                       kn->kn_data -= so->so_oobmark;
-                       return 1;
+                       data -= so->so_oobmark;
+                       retval = 1;
+                       goto out;
                }
        }
 
@@ -6408,11 +6504,13 @@ filt_soread_common(struct knote *kn, struct socket *so)
            ) {
                kn->kn_flags |= EV_EOF;
                kn->kn_fflags = so->so_error;
-               return 1;
+               retval = 1;
+               goto out;
        }
 
        if (so->so_error) {     /* temporary udp error */
-               return 1;
+               retval = 1;
+               goto out;
        }
 
        int64_t lowwat = so->so_rcv.sb_lowat;
@@ -6429,20 +6527,17 @@ filt_soread_common(struct knote *kn, struct socket *so)
                }
        }
 
-       /*
-        * The order below is important. Since NOTE_LOWAT
-        * overrides sb_lowat, check for NOTE_LOWAT case
-        * first.
-        */
-       if (kn->kn_sfflags & NOTE_LOWAT) {
-               return kn->kn_data >= lowwat;
-       }
+       retval = (data >= lowwat);
 
-       return so->so_rcv.sb_cc >= lowwat;
+out:
+       if (retval && kev) {
+               knote_fill_kevent(kn, kev, data);
+       }
+       return retval;
 }
 
 static int
-filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev)
+filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
 {
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
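The rewritten low-watermark logic in filt_soread_common() folds both thresholds into one comparison: lowwat starts at sb_lowat and is overridden (clamped to sb_hiwat) when the knote was registered with NOTE_LOWAT. From user space that registration is ordinary kevent usage; a hedged sketch (sock_fd, kq_fd, and the 1024-byte threshold are made up):

    #include <sys/event.h>

    /* ask for readability only once at least 1024 bytes are queued */
    struct kevent kev;
    EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 1024, NULL);
    kevent(kq_fd, &kev, 1, NULL, 0, NULL);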
 
@@ -6456,16 +6551,16 @@ filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev)
        if (kn->kn_filter == EVFILT_READ &&
            kn->kn_flags & EV_OOBAND) {
                kn->kn_flags &= ~EV_OOBAND;
-               kn->kn_hookid = EV_OOBAND;
+               kn->kn_hook32 = EV_OOBAND;
        } else {
-               kn->kn_hookid = 0;
+               kn->kn_hook32 = 0;
        }
        if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
                so->so_rcv.sb_flags |= SB_KNOTE;
        }
 
        /* indicate if event is already fired */
-       return filt_soread_common(kn, so);
+       return filt_soread_common(kn, NULL, so);
 }
 
 static void
@@ -6493,7 +6588,7 @@ filt_soread(struct knote *kn, long hint)
                socket_lock(so, 1);
        }
 
-       retval = filt_soread_common(kn, so);
+       retval = filt_soread_common(kn, NULL, so);
 
        if ((hint & SO_FILT_HINT_LOCKED) == 0) {
                socket_unlock(so, 1);
@@ -6503,7 +6598,7 @@ filt_soread(struct knote *kn, long hint)
 }
 
 static int
-filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
        int retval;
@@ -6515,7 +6610,7 @@ filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
        kn->kn_sdata = kev->data;
 
        /* determine if changes result in fired events */
-       retval = filt_soread_common(kn, so);
+       retval = filt_soread_common(kn, NULL, so);
 
        socket_unlock(so, 1);
 
@@ -6523,21 +6618,13 @@ filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
        int retval;
 
        socket_lock(so, 1);
-       retval = filt_soread_common(kn, so);
-       if (retval) {
-               *kev = kn->kn_kevent;
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_fflags = 0;
-                       kn->kn_data = 0;
-               }
-       }
+       retval = filt_soread_common(kn, kev, so);
        socket_unlock(so, 1);
 
        return retval;
@@ -6557,26 +6644,35 @@ so_wait_for_if_feedback(struct socket *so)
 }
 
 static int
-filt_sowrite_common(struct knote *kn, struct socket *so)
+filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
 {
        int ret = 0;
+       int64_t data = sbspace(&so->so_snd);
 
-       kn->kn_data = sbspace(&so->so_snd);
        if (so->so_state & SS_CANTSENDMORE) {
                kn->kn_flags |= EV_EOF;
                kn->kn_fflags = so->so_error;
-               return 1;
+               ret = 1;
+               goto out;
        }
+
        if (so->so_error) {     /* temporary udp error */
-               return 1;
+               ret = 1;
+               goto out;
        }
+
        if (!socanwrite(so)) {
-               return 0;
+               ret = 0;
+               goto out;
        }
+
        if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
-               return 1;
+               ret = 1;
+               goto out;
        }
+
        int64_t lowwat = so->so_snd.sb_lowat;
+
        if (kn->kn_sfflags & NOTE_LOWAT) {
                if (kn->kn_sdata > so->so_snd.sb_hiwat) {
                        lowwat = so->so_snd.sb_hiwat;
@@ -6584,7 +6680,8 @@ filt_sowrite_common(struct knote *kn, struct socket *so)
                        lowwat = kn->kn_sdata;
                }
        }
-       if (kn->kn_data >= lowwat) {
+
+       if (data >= lowwat) {
                if ((so->so_flags & SOF_NOTSENT_LOWAT)
 #if (DEBUG || DEVELOPMENT)
                    && so_notsent_lowat_check == 1
@@ -6602,7 +6699,8 @@ filt_sowrite_common(struct knote *kn, struct socket *so)
                        }
 #endif
                        else {
-                               return 1;
+                               ret = 1;
+                               goto out;
                        }
                } else {
                        ret = 1;
@@ -6611,11 +6709,16 @@ filt_sowrite_common(struct knote *kn, struct socket *so)
        if (so_wait_for_if_feedback(so)) {
                ret = 0;
        }
+
+out:
+       if (ret && kev) {
+               knote_fill_kevent(kn, kev, data);
+       }
        return ret;
 }
 
 static int
-filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev)
+filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
 {
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
 
@@ -6625,7 +6728,7 @@ filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev)
        }
 
        /* determine if its already fired */
-       return filt_sowrite_common(kn, so);
+       return filt_sowrite_common(kn, NULL, so);
 }
 
 static void
@@ -6653,7 +6756,7 @@ filt_sowrite(struct knote *kn, long hint)
                socket_lock(so, 1);
        }
 
-       ret = filt_sowrite_common(kn, so);
+       ret = filt_sowrite_common(kn, NULL, so);
 
        if ((hint & SO_FILT_HINT_LOCKED) == 0) {
                socket_unlock(so, 1);
@@ -6663,7 +6766,7 @@ filt_sowrite(struct knote *kn, long hint)
 }
 
 static int
-filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
        int ret;
@@ -6675,7 +6778,7 @@ filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
        kn->kn_sdata = kev->data;
 
        /* determine if these changes result in a triggered event */
-       ret = filt_sowrite_common(kn, so);
+       ret = filt_sowrite_common(kn, NULL, so);
 
        socket_unlock(so, 1);
 
@@ -6683,29 +6786,24 @@ filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
        int ret;
 
        socket_lock(so, 1);
-       ret = filt_sowrite_common(kn, so);
-       if (ret) {
-               *kev = kn->kn_kevent;
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_fflags = 0;
-                       kn->kn_data = 0;
-               }
-       }
+       ret = filt_sowrite_common(kn, kev, so);
        socket_unlock(so, 1);
+
        return ret;
 }
 
 static int
-filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
+filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
+    struct socket *so, long ev_hint)
 {
        int ret = 0;
+       int64_t data = 0;
        uint32_t level_trigger = 0;
 
        if (ev_hint & SO_FILT_HINT_CONNRESET) {
@@ -6770,7 +6868,7 @@ filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
                kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
 
                /* If resume event was delivered before, reset it */
-               kn->kn_hookid &= ~NOTE_RESUME;
+               kn->kn_hook32 &= ~NOTE_RESUME;
 
                kn->kn_fflags |= NOTE_SUSPEND;
                level_trigger |= NOTE_SUSPEND;
@@ -6781,7 +6879,7 @@ filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
                kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
 
                /* If suspend event was delivered before, reset it */
-               kn->kn_hookid &= ~NOTE_SUSPEND;
+               kn->kn_hook32 &= ~NOTE_SUSPEND;
 
                kn->kn_fflags |= NOTE_RESUME;
                level_trigger |= NOTE_RESUME;
@@ -6789,10 +6887,12 @@ filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
 
        if (so->so_error != 0) {
                ret = 1;
-               kn->kn_data = so->so_error;
+               data = so->so_error;
                kn->kn_flags |= EV_EOF;
        } else {
-               get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
+               u_int32_t data32;
+               get_sockev_state(so, &data32);
+               data = data32;
        }
 
        /* Reset any events that are not requested on this knote */
@@ -6800,7 +6900,7 @@ filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
        level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
 
        /* Find the level-triggered events that are already delivered */
-       level_trigger &= kn->kn_hookid;
+       level_trigger &= kn->kn_hook32;
        level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
 
        /* Do not deliver level-triggered events more than once */
@@ -6808,22 +6908,48 @@ filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
                ret = 1;
        }
 
+       if (ret && kev) {
+               /*
+                * Store the state of the events being delivered. This
+                * state can be used to deliver level triggered events
+                * state can be used to deliver level-triggered events
+                * at least once and still avoid waking up the application
+                */
+               if (kn->kn_fflags != 0) {
+                       kn->kn_hook32 |= (kn->kn_fflags &
+                           EVFILT_SOCK_LEVEL_TRIGGER_MASK);
+               }
+
+               /*
+                * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
+                * only one of them and remember which one was
+                * delivered last
+                */
+               if (kn->kn_fflags & NOTE_SUSPEND) {
+                       kn->kn_hook32 &= ~NOTE_RESUME;
+               }
+               if (kn->kn_fflags & NOTE_RESUME) {
+                       kn->kn_hook32 &= ~NOTE_SUSPEND;
+               }
+
+               knote_fill_kevent(kn, kev, data);
+       }
        return ret;
 }
 
 static int
-filt_sockattach(struct knote *kn, __unused struct kevent_internal_s *kev)
+filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
 {
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
 
        /* socket locked */
-       kn->kn_hookid = 0;
+       kn->kn_hook32 = 0;
        if (KNOTE_ATTACH(&so->so_klist, kn)) {
                so->so_flags |= SOF_KNOTE;
        }
 
        /* determine if event already fired */
-       return filt_sockev_common(kn, so, 0);
+       return filt_sockev_common(kn, NULL, so, 0);
 }
 
 static void
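Moving this bookkeeping into filt_sockev_common() means the delivered-state mask in kn_hook32 is updated at the single point where a kevent is actually filled, instead of only in f_process. The mechanism in isolation, as an illustrative bitmask sketch (not xnu's types):

    #include <stdint.h>

    /* return only conditions not yet delivered; record what we hand out */
    static uint32_t
    deliver_level_triggered(uint32_t active, uint32_t *delivered)
    {
        uint32_t fresh = active & ~*delivered;
        *delivered |= fresh;
        return fresh;
    }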
@@ -6852,7 +6978,7 @@ filt_sockev(struct knote *kn, long hint)
                locked = 1;
        }
 
-       ret = filt_sockev_common(kn, so, ev_hint);
+       ret = filt_sockev_common(kn, NULL, so, ev_hint);
 
        if (locked) {
                socket_unlock(so, 1);
@@ -6869,7 +6995,7 @@ filt_sockev(struct knote *kn, long hint)
 static int
 filt_socktouch(
        struct knote *kn,
-       struct kevent_internal_s *kev)
+       struct kevent_qos_s *kev)
 {
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
        uint32_t changed_flags;
@@ -6878,7 +7004,7 @@ filt_socktouch(
        socket_lock(so, 1);
 
        /* save off the [result] data and fflags */
-       changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
+       changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
 
        /* save off the new input fflags and data */
        kn->kn_sfflags = kev->fflags;
@@ -6896,11 +7022,10 @@ filt_socktouch(
         * delivered, if any of those events are not requested
         * anymore the state related to them can be reset
         */
-       kn->kn_hookid &=
-           ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
+       kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
 
        /* determine if we have events to deliver */
-       ret = filt_sockev_common(kn, so, 0);
+       ret = filt_sockev_common(kn, NULL, so, 0);
 
        socket_unlock(so, 1);
 
@@ -6911,50 +7036,14 @@ filt_socktouch(
  *     filt_sockprocess - query event fired state and return data
  */
 static int
-filt_sockprocess(
-       struct knote *kn,
-       struct filt_process_s *data,
-       struct kevent_internal_s *kev)
+filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
-
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
        int ret = 0;
 
        socket_lock(so, 1);
 
-       ret = filt_sockev_common(kn, so, 0);
-       if (ret) {
-               *kev = kn->kn_kevent;
-
-               /*
-                * Store the state of the events being delivered. This
-                * state can be used to deliver level triggered events
-                * ateast once and still avoid waking up the application
-                * multiple times as long as the event is active.
-                */
-               if (kn->kn_fflags != 0) {
-                       kn->kn_hookid |= (kn->kn_fflags &
-                           EVFILT_SOCK_LEVEL_TRIGGER_MASK);
-               }
-
-               /*
-                * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
-                * only one of them and remember the last one that was
-                * delivered last
-                */
-               if (kn->kn_fflags & NOTE_SUSPEND) {
-                       kn->kn_hookid &= ~NOTE_RESUME;
-               }
-               if (kn->kn_fflags & NOTE_RESUME) {
-                       kn->kn_hookid &= ~NOTE_SUSPEND;
-               }
-
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_data = 0;
-                       kn->kn_fflags = 0;
-               }
-       }
+       ret = filt_sockev_common(kn, kev, so, 0);
 
        socket_unlock(so, 1);
 
@@ -7003,6 +7092,16 @@ solockhistory_nr(struct socket *so)
        return lock_history_str;
 }
 
+lck_mtx_t *
+socket_getlock(struct socket *so, int flags)
+{
+       if (so->so_proto->pr_getlock != NULL) {
+               return (*so->so_proto->pr_getlock)(so, flags);
+       } else {
+               return so->so_proto->pr_domain->dom_mtx;
+       }
+}
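
The new socket_getlock() accessor wraps the usual pr_getlock/dom_mtx
fallback in one place. A minimal sketch of a hypothetical caller (not
part of this change; LCK_MTX_ASSERT comes from the kernel lock headers):

	/* Hypothetical caller: assert that the current thread owns a
	 * socket's mutex, whichever lock the protocol actually uses. */
	static void
	assert_socket_lock_owned(struct socket *so)
	{
		LCK_MTX_ASSERT(socket_getlock(so, 0), LCK_MTX_ASSERT_OWNED);
	}
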
+
 void
 socket_lock(struct socket *so, int refcount)
 {
@@ -7062,12 +7161,12 @@ socket_unlock(struct socket *so, int refcount)
 
        lr_saved = __builtin_return_address(0);
 
-       if (so->so_proto == NULL) {
+       if (so == NULL || so->so_proto == NULL) {
                panic("%s: null so_proto so=%p\n", __func__, so);
                /* NOTREACHED */
        }
 
-       if (so && so->so_proto->pr_unlock) {
+       if (so->so_proto->pr_unlock) {
                (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
        } else {
                mutex_held = so->so_proto->pr_domain->dom_mtx;
@@ -7625,6 +7724,7 @@ so_set_restrictions(struct socket *so, uint32_t vals)
 {
        int nocell_old, nocell_new;
        int noexpensive_old, noexpensive_new;
+       int noconstrained_old, noconstrained_new;
 
        /*
         * Deny-type restrictions are trapdoors; once set they cannot be
@@ -7641,15 +7741,18 @@ so_set_restrictions(struct socket *so, uint32_t vals)
         */
        nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
        noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
+       noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
        so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
            SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
-           SO_RESTRICT_DENY_EXPENSIVE));
+           SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
        nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
        noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
+       noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
 
        /* we can only set, not clear restrictions */
        if ((nocell_new - nocell_old) == 0 &&
-           (noexpensive_new - noexpensive_old) == 0) {
+           (noexpensive_new - noexpensive_old) == 0 &&
+           (noconstrained_new - noconstrained_old) == 0) {
                return 0;
        }
 #if INET6
@@ -7667,6 +7770,9 @@ so_set_restrictions(struct socket *so, uint32_t vals)
                if (noexpensive_new - noexpensive_old != 0) {
                        inp_set_noexpensive(sotoinpcb(so));
                }
+               if (noconstrained_new - noconstrained_old != 0) {
+                       inp_set_noconstrained(sotoinpcb(so));
+               }
        }
 
        if (SOCK_DOM(so) == PF_MULTIPATH) {
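
The deny-restriction "trapdoor" above reduces to a reusable pattern;
in a sketch (illustrative only, not part of this change): bits may be
ORed in but never cleared, and side effects run only for bits that
just transitioned from 0 to 1.

	static uint32_t
	trapdoor_set(uint32_t *state, uint32_t newbits, uint32_t mask)
	{
		uint32_t old = *state & mask;

		*state |= (newbits & mask);
		/* return only the bits that were just turned on */
		return (*state & mask) & ~old;
	}
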
@@ -7685,7 +7791,7 @@ so_get_restrictions(struct socket *so)
 }
 
 int
-so_set_effective_pid(struct socket *so, int epid, struct proc *p)
+so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
 {
        struct proc *ep = PROC_NULL;
        int error = 0;
@@ -7712,7 +7818,7 @@ so_set_effective_pid(struct socket *so, int epid, struct proc *p)
         * the process's own pid, then proceed.  Otherwise ensure
         * that the issuing process has the necessary privileges.
         */
-       if (epid != so->last_pid || epid != proc_pid(p)) {
+       if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
                if ((error = priv_check_cred(kauth_cred_get(),
                    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
                        error = EACCES;
@@ -7747,6 +7853,9 @@ so_set_effective_pid(struct socket *so, int epid, struct proc *p)
                so->e_pid = proc_pid(ep);
                proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
        }
+       if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
+               (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
+       }
 done:
        if (error == 0 && net_io_policy_log) {
                uuid_string_t buf;
@@ -7784,7 +7893,7 @@ done:
 }
 
 int
-so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
+so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
 {
        uuid_string_t buf;
        uuid_t uuid;
@@ -7815,8 +7924,9 @@ so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
         * the process's own uuid, then proceed.  Otherwise ensure
         * that the issuing process has the necessary privileges.
         */
-       if (uuid_compare(euuid, so->last_uuid) != 0 ||
-           uuid_compare(euuid, uuid) != 0) {
+       if (check_cred &&
+           (uuid_compare(euuid, so->last_uuid) != 0 ||
+           uuid_compare(euuid, uuid) != 0)) {
                if ((error = priv_check_cred(kauth_cred_get(),
                    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
                        error = EACCES;
@@ -7851,7 +7961,13 @@ so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
                so->e_pid = so->last_pid;
                uuid_copy(so->e_uuid, euuid);
        }
-
+       /*
+        * The following will clear the effective process name as it's the same
+        * as the real process
+        */
+       if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
+               (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
+       }
 done:
        if (error == 0 && net_io_policy_log) {
                uuid_unparse(so->e_uuid, buf);
index 34b2955901be87c3a89d0f8381ff2a7e6f5f15fb..cc3c37a5226dd5678ca7457636ba5bf48624c0b4 100644 (file)
@@ -83,6 +83,7 @@
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
+#include <sys/unpcb.h>
 #include <sys/ev.h>
 #include <kern/locks.h>
 #include <net/route.h>
@@ -185,6 +186,15 @@ soisconnecting(struct socket *so)
 void
 soisconnected(struct socket *so)
 {
+       /*
+        * If the socket is subject to a filter and is pending the initial
+        * verdict, delay marking it as connected and do not present the
+        * connected socket to the user just yet.
+        */
+       if (cfil_sock_connected_pending_verdict(so)) {
+               return;
+       }
+
        so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING | SS_ISCONFIRMING);
        so->so_state |= SS_ISCONNECTED;
 
@@ -381,6 +391,7 @@ sonewconn_internal(struct socket *head, int connstatus)
            SOF_NOTIFYCONFLICT | SOF_BINDRANDOMPORT | SOF_NPX_SETOPTSHUT |
            SOF_NODEFUNCT | SOF_PRIVILEGED_TRAFFIC_CLASS | SOF_NOTSENT_LOWAT |
            SOF_USELRO | SOF_DELEGATED);
+       so->so_flags1 |= SOF1_INBOUND;
        so->so_usecount = 1;
        so->next_lock_lr = 0;
        so->next_unlock_lr = 0;
@@ -395,9 +406,11 @@ sonewconn_internal(struct socket *head, int connstatus)
 
        /* inherit traffic management properties of listener */
        so->so_flags1 |=
-           head->so_flags1 & (SOF1_TRAFFIC_MGT_SO_BACKGROUND);
+           head->so_flags1 & (SOF1_TRAFFIC_MGT_SO_BACKGROUND | SOF1_TC_NET_SERV_TYPE |
+           SOF1_QOSMARKING_ALLOWED | SOF1_QOSMARKING_POLICY_OVERRIDE);
        so->so_background_thread = head->so_background_thread;
        so->so_traffic_class = head->so_traffic_class;
+       so->so_netsvctype = head->so_netsvctype;
 
        if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
                sodealloc(so);
@@ -434,6 +447,9 @@ sonewconn_internal(struct socket *head, int connstatus)
                }
        }
 
+       if (so->so_proto->pr_copy_last_owner != NULL) {
+               (*so->so_proto->pr_copy_last_owner)(so, head);
+       }
        atomic_add_32(&so->so_proto->pr_domain->dom_refs, 1);
 
        /* Insert in head appropriate lists */
@@ -605,7 +621,7 @@ sbwakeup(struct sockbuf *sb)
  * if the socket has the SS_ASYNC flag set.
  */
 void
-sowakeup(struct socket *so, struct sockbuf *sb)
+sowakeup(struct socket *so, struct sockbuf *sb, struct socket *so2)
 {
        if (so->so_flags & SOF_DEFUNCT) {
                SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] si 0x%x, "
@@ -640,11 +656,42 @@ sowakeup(struct socket *so, struct sockbuf *sb)
                so->so_upcallusecount++;
 
                if (lock) {
+                       if (so2) {
+                               struct unpcb *unp = sotounpcb(so2);
+                               unp->unp_flags |= UNP_DONTDISCONNECT;
+                               unp->rw_thrcount++;
+
+                               socket_unlock(so2, 0);
+                       }
                        socket_unlock(so, 0);
                }
                (*sb_upcall)(so, sb_upcallarg, M_DONTWAIT);
                if (lock) {
+                       if (so2 && so > so2) {
+                               struct unpcb *unp;
+                               socket_lock(so2, 0);
+
+                               unp = sotounpcb(so2);
+                               unp->rw_thrcount--;
+                               if (unp->rw_thrcount == 0) {
+                                       unp->unp_flags &= ~UNP_DONTDISCONNECT;
+                                       wakeup(unp);
+                               }
+                       }
+
                        socket_lock(so, 0);
+
+                       if (so2 && so < so2) {
+                               struct unpcb *unp;
+                               socket_lock(so2, 0);
+
+                               unp = sotounpcb(so2);
+                               unp->rw_thrcount--;
+                               if (unp->rw_thrcount == 0) {
+                                       unp->unp_flags &= ~UNP_DONTDISCONNECT;
+                                       wakeup(unp);
+                               }
+                       }
                }
 
                so->so_upcallusecount--;
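
The asymmetric so > so2 / so < so2 relock above is lock-address
ordering in disguise. Distilled into a sketch (illustrative, not part
of this change): of two socket locks, the lower-addressed one is always
taken first, so two threads locking the same pair cannot deadlock.

	static void
	socket_lock_pair(struct socket *a, struct socket *b)
	{
		if (a < b) {
			/* lower address first, matching the relock order above */
			socket_lock(a, 0);
			socket_lock(b, 0);
		} else {
			socket_lock(b, 0);
			socket_lock(a, 0);
		}
	}
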
@@ -1086,82 +1133,6 @@ sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
        return 1;
 }
 
-/*
- * As above except that OOB data
- * is inserted at the beginning of the sockbuf,
- * but after any other OOB data.
- */
-int
-sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
-{
-       struct mbuf *m;
-       struct mbuf **mp;
-
-       if (m0 == 0) {
-               return 0;
-       }
-
-       SBLASTRECORDCHK(sb, "sbinsertoob 1");
-
-       if ((sb->sb_flags & SB_RECV && !(m0->m_flags & M_SKIPCFIL)) != 0) {
-               int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
-                   sock_data_filt_flag_oob);
-
-               SBLASTRECORDCHK(sb, "sbinsertoob 2");
-
-#if CONTENT_FILTER
-               if (error == 0) {
-                       error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0);
-               }
-#endif /* CONTENT_FILTER */
-
-               if (error) {
-                       if (error != EJUSTRETURN) {
-                               m_freem(m0);
-                       }
-                       return 0;
-               }
-       } else if (m0) {
-               m0->m_flags &= ~M_SKIPCFIL;
-       }
-
-       for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
-               m = *mp;
-again:
-               switch (m->m_type) {
-               case MT_OOBDATA:
-                       continue;               /* WANT next train */
-
-               case MT_CONTROL:
-                       m = m->m_next;
-                       if (m) {
-                               goto again;     /* inspect THIS train further */
-                       }
-               }
-               break;
-       }
-       /*
-        * Put the first mbuf on the queue.
-        * Note this permits zero length records.
-        */
-       sballoc(sb, m0);
-       m0->m_nextpkt = *mp;
-       if (*mp == NULL) {
-               /* m0 is actually the new tail */
-               sb->sb_lastrecord = m0;
-       }
-       *mp = m0;
-       m = m0->m_next;
-       m0->m_next = 0;
-       if (m && (m0->m_flags & M_EOR)) {
-               m0->m_flags &= ~M_EOR;
-               m->m_flags |= M_EOR;
-       }
-       sbcompress(sb, m, m0);
-       SBLASTRECORDCHK(sb, "sbinsertoob 3");
-       return 1;
-}
-
 /*
  * Concatenate address (optional), control (optional) and data into one
  * single mbuf chain.  If sockbuf *sb is passed in, space check will be
@@ -2845,21 +2816,25 @@ sbunlock(struct sockbuf *sb, boolean_t keeplocked)
        }
 
        if (!keeplocked) {      /* unlock on exit */
-               lck_mtx_t *mutex_held;
-
-               if (so->so_proto->pr_getlock != NULL) {
-                       mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
+               if (so->so_flags & SOF_MP_SUBFLOW || SOCK_DOM(so) == PF_MULTIPATH) {
+                       (*so->so_proto->pr_unlock)(so, 1, lr_saved);
                } else {
-                       mutex_held = so->so_proto->pr_domain->dom_mtx;
-               }
+                       lck_mtx_t *mutex_held;
 
-               LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
+                       if (so->so_proto->pr_getlock != NULL) {
+                               mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
+                       } else {
+                               mutex_held = so->so_proto->pr_domain->dom_mtx;
+                       }
 
-               VERIFY(so->so_usecount > 0);
-               so->so_usecount--;
-               so->unlock_lr[so->next_unlock_lr] = lr_saved;
-               so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
-               lck_mtx_unlock(mutex_held);
+                       LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
+
+                       VERIFY(so->so_usecount > 0);
+                       so->so_usecount--;
+                       so->unlock_lr[so->next_unlock_lr] = lr_saved;
+                       so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
+                       lck_mtx_unlock(mutex_held);
+               }
        }
 }
 
@@ -2867,7 +2842,7 @@ void
 sorwakeup(struct socket *so)
 {
        if (sb_notify(&so->so_rcv)) {
-               sowakeup(so, &so->so_rcv);
+               sowakeup(so, &so->so_rcv, NULL);
        }
 }
 
@@ -2875,7 +2850,7 @@ void
 sowwakeup(struct socket *so)
 {
        if (sb_notify(&so->so_snd)) {
-               sowakeup(so, &so->so_snd);
+               sowakeup(so, &so->so_snd, NULL);
        }
 }
 
@@ -2895,7 +2870,8 @@ soevent(struct socket *so, long hint)
        if ((hint & SO_FILT_HINT_IFDENIED) &&
            !(so->so_flags & SOF_MP_SUBFLOW) &&
            !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR) &&
-           !(so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
+           !(so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE) &&
+           !(so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
                soevent_ifdenied(so);
        }
 }
index b89e2cc49367723b7c7b9251954eb2cc021fa21f..b903e4a18932055b92d7c1d49ebb0627b08ae3d6 100644 (file)
 #include <net/route.h>
 #include <netinet/in_pcb.h>
 
+#include <os/ptrtools.h>
+
 #if CONFIG_MACF_SOCKET_SUBSET
 #include <security/mac_framework.h>
 #endif /* MAC_SOCKET_SUBSET */
@@ -1012,6 +1014,12 @@ connectitx(struct socket *so, struct sockaddr *src,
        if ((error = mac_socket_check_connect(kauth_cred_get(), so, dst)) != 0) {
                return error;
        }
+
+       if (auio != NULL) {
+               if ((error = mac_socket_check_send(kauth_cred_get(), so, dst)) != 0) {
+                       return error;
+               }
+       }
 #endif /* MAC_SOCKET_SUBSET */
 
        socket_lock(so, 1);
@@ -1816,8 +1824,8 @@ copyout_control(struct proc *p, struct mbuf *m, user_addr_t control,
                                if (proc_is64bit(p)) {
                                        struct user64_timeval *tv64 = (struct user64_timeval *)(void *)CMSG_DATA(tmp_cp);
 
-                                       tv64->tv_sec = tv->tv_sec;
-                                       tv64->tv_usec = tv->tv_usec;
+                                       os_unaligned_deref(&tv64->tv_sec) = tv->tv_sec;
+                                       os_unaligned_deref(&tv64->tv_usec) = tv->tv_usec;
 
                                        tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user64_timeval));
                                        tmp_space = CMSG_SPACE(sizeof(struct user64_timeval));
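
os_unaligned_deref() comes from the newly included <os/ptrtools.h>:
the cmsg payload is not guaranteed to be 8-byte aligned, so the stores
above must not assume alignment. One conventional way to build such a
helper (an assumption; the real header may differ) is to route the
access through memcpy:

	#include <stdint.h>
	#include <string.h>

	/* Alignment-safe 64-bit store; the compiler lowers the memcpy
	 * to an unaligned store instruction where the ISA allows it. */
	static inline void
	unaligned_store64(void *dst, uint64_t value)
	{
		memcpy(dst, &value, sizeof(value));
	}
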
@@ -3440,9 +3448,11 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval)
        /*
         * Get number of bytes to send
         * Should it apply to the size of the header and trailer?
-        * JMM - error handling?
         */
-       copyin(uap->nbytes, &nbytes, sizeof(off_t));
+       error = copyin(uap->nbytes, &nbytes, sizeof(off_t));
+       if (error) {
+               goto done2;
+       }
 
        /*
         * If specified, get the pointer to the sf_hdtr struct for
index 2925a6fee059ddb6429917a3d7e3110b6b26f0c7..2c4434dc2cc491f0f56809aa7d8a9783a241482c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -161,7 +161,7 @@ struct mdns_ipc_msg_hdr {
  *     need a proper out-of-band
  *     lock pushdown
  */
-static struct   sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL, { 0 } };
+static struct   sockaddr sun_noname = { .sa_len = sizeof(sun_noname), .sa_family = AF_LOCAL, .sa_data = { 0 } };
 static ino_t    unp_ino;                /* prototype for fake inode numbers */
 
 static int      unp_attach(struct socket *);
@@ -392,7 +392,9 @@ uipc_rcvd(struct socket *so, __unused int flags)
                unp->unp_mbcnt = rcv->sb_mbcnt;
                snd->sb_hiwat += unp->unp_cc - rcv->sb_cc;
                unp->unp_cc = rcv->sb_cc;
-               sowwakeup(so2);
+               if (sb_notify(&so2->so_snd)) {
+                       sowakeup(so2, &so2->so_snd, so);
+               }
 
                socket_unlock(so2, 1);
 
@@ -495,7 +497,9 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
                 */
                if (sbappendaddr(&so2->so_rcv, from, m, control, &error)) {
                        control = NULL;
-                       sorwakeup(so2);
+                       if (sb_notify(&so2->so_rcv)) {
+                               sowakeup(so2, &so2->so_rcv, so);
+                       }
                } else if (control != NULL && error == 0) {
                        /* A socket filter took control; don't touch it */
                        control = NULL;
@@ -587,7 +591,9 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
                unp->unp_conn->unp_cc = rcv->sb_cc;
                if (didreceive) {
                        control = NULL;
-                       sorwakeup(so2);
+                       if (sb_notify(&so2->so_rcv)) {
+                               sowakeup(so2, &so2->so_rcv, so);
+                       }
                } else if (control != NULL && error == 0) {
                        /* A socket filter took control; don't touch it */
                        control = NULL;
@@ -1736,12 +1742,12 @@ unp_pcblist SYSCTL_HANDLER_ARGS
                         * connect/disconnect races for SMP.
                         */
                        if (unp->unp_addr) {
-                               bcopy(unp->unp_addr, &xu.xu_addr,
+                               bcopy(unp->unp_addr, &xu.xu_au,
                                    unp->unp_addr->sun_len);
                        }
                        if (unp->unp_conn && unp->unp_conn->unp_addr) {
                                bcopy(unp->unp_conn->unp_addr,
-                                   &xu.xu_caddr,
+                                   &xu.xu_cau,
                                    unp->unp_conn->unp_addr->sun_len);
                        }
                        unpcb_to_compat(unp, &xu.xu_unp);
@@ -1890,12 +1896,12 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS
                         * connect/disconnect races for SMP.
                         */
                        if (unp->unp_addr) {
-                               bcopy(unp->unp_addr, &xu.xunp_addr,
+                               bcopy(unp->unp_addr, &xu.xu_au,
                                    unp->unp_addr->sun_len);
                        }
                        if (unp->unp_conn && unp->unp_conn->unp_addr) {
                                bcopy(unp->unp_conn->unp_addr,
-                                   &xu.xunp_caddr,
+                                   &xu.xu_cau,
                                    unp->unp_conn->unp_addr->sun_len);
                        }
 
@@ -2019,14 +2025,13 @@ unp_externalize(struct mbuf *rights)
                if (fp == NULL) {
                        panic("unp_externalize: MALLOC_ZONE");
                }
-               fp->f_iocount = 0;
                fp->f_fglob = rp[i];
                if (fg_removeuipc_mark(rp[i])) {
                        /*
                         * Take an iocount on the fp for completing the
                         * removal from the global msg queue
                         */
-                       fp->f_iocount++;
+                       os_ref_retain_locked(&fp->f_iocount);
                        fileproc_l[i] = fp;
                } else {
                        fileproc_l[i] = NULL;
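
fp->f_iocount changes here from a bare integer to a managed reference
count, so the manual increment becomes os_ref_retain_locked(). A
minimal sketch of the pattern (simplified from <os/refcnt.h> in this
release):

	#include <os/refcnt.h>

	struct obj {
		struct os_refcnt ref;
	};

	static void
	obj_init(struct obj *o)
	{
		/* count starts at 1; NULL means no refgrp accounting */
		os_ref_init(&o->ref, NULL);
	}

	static void
	obj_retain_locked(struct obj *o)
	{
		/* caller must hold the lock protecting 'o' */
		os_ref_retain_locked(&o->ref);
	}
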
index 8162ded607dbc7a302893dc5b1948165469848a8..fc782cf05be0debf11a09967df909e30460dd084 100644 (file)
 
 __BEGIN_DECLS
 
-int     copyin(const user_addr_t uaddr, void *kaddr, size_t len);
+int     copyin(const user_addr_t uaddr, void *kaddr, size_t len) OS_WARN_RESULT;
 int     copyout(const void *kaddr, user_addr_t udaddr, size_t len);
 
 #if defined (_FORTIFY_SOURCE) && _FORTIFY_SOURCE == 0
 /* FORTIFY_SOURCE disabled */
 #else
-__attribute__((always_inline)) static inline int
+OS_ALWAYS_INLINE OS_WARN_RESULT static inline int
 __copyin_chk(const user_addr_t uaddr, void *kaddr, size_t len, size_t chk_size)
 {
        if (chk_size < len) {
@@ -47,7 +47,7 @@ __copyin_chk(const user_addr_t uaddr, void *kaddr, size_t len, size_t chk_size)
        return copyin(uaddr, kaddr, len);
 }
 
-__attribute__((always_inline)) static inline int
+OS_ALWAYS_INLINE static inline int
 __copyout_chk(const void *kaddr, user_addr_t uaddr, size_t len, size_t chk_size)
 {
        if (chk_size < len) {
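
OS_WARN_RESULT is the kernel's spelling of the compiler's
warn_unused_result attribute, which is what turns an unchecked
copyin() (like the old sendfile call above) into a build-time
warning. An illustration, not from this change:

	__attribute__((warn_unused_result))
	static int
	must_check(void)
	{
		return 0;
	}

	static void
	caller(void)
	{
		must_check();		/* warning: ignoring return value */
		(void)must_check();	/* clang accepts an explicit void cast */
	}
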
index 9a4ecb489b484824730fe805354b548adccbc38c..ab4dfd3ee583a86dfc1406e99fa6d763796088c1 100644 (file)
@@ -144,9 +144,9 @@ ulmin(u_int32_t a, u_int32_t b)
 
 
 /* Prototypes for non-quad routines. */
-extern int      ffs(int);
+extern int      ffs(unsigned int);
 extern int      ffsll(unsigned long long);
-extern int      fls(int);
+extern int      fls(unsigned int);
 extern int      flsll(unsigned long long);
 extern u_int32_t        random(void);
 extern int      scanc(u_int, u_char *, const u_char *, int);
@@ -194,10 +194,15 @@ __nosan_crc16(uint16_t crc, const void *bufp, size_t len)
 #endif
 
 int     copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done);
-int     copyinstr(const user_addr_t uaddr, void *kaddr, size_t len, size_t *done);
+int     copyinstr(const user_addr_t uaddr, void *kaddr, size_t len, size_t *done) OS_WARN_RESULT;
 int     copyoutstr(const void *kaddr, user_addr_t udaddr, size_t len, size_t *done);
 #if XNU_KERNEL_PRIVATE
-extern int copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes);
+int     copyin_atomic32(const user_addr_t user_addr, uint32_t *u32);
+int     copyin_atomic32_wait_if_equals(const user_addr_t user_addr, uint32_t u32);
+int     copyin_atomic64(const user_addr_t user_addr, uint64_t *u64);
+int     copyout_atomic32(uint32_t u32, user_addr_t user_addr);
+int     copyout_atomic64(uint64_t u64, user_addr_t user_addr);
+int     copyoutstr_prevalidate(const void *kaddr, user_addr_t uaddr, size_t len);
 #endif
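
copyin_word() gives way to fixed-width variants that copy in a single,
non-tearing access. A hedged caller sketch (illustrative only; the
alignment expectation is an assumption of this sketch):

	/* Read a 32-bit value from user space as one atomic load; the
	 * user address is expected to be naturally aligned. */
	static int
	read_user_u32(user_addr_t uaddr, uint32_t *out)
	{
		return copyin_atomic32(uaddr, out);
	}
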
 
 int vsscanf(const char *, char const *, va_list);
@@ -206,7 +211,7 @@ extern int      vprintf(const char *, va_list) __printflike(1, 0);
 extern int      vsnprintf(char *, size_t, const char *, va_list) __printflike(3, 0);
 
 #if XNU_KERNEL_PRIVATE
-extern int      vprintf_log_locked(const char *, va_list) __printflike(1, 0);
+extern int      vprintf_log_locked(const char *, va_list, bool addcr) __printflike(1, 0);
 extern void     osobject_retain(void * object);
 extern void     osobject_release(void * object);
 #endif
index bffe6a8142ee1534e40d3401ca6047f68c9724f5..cd91a9858d7ff5f3daf29671763d061df232de9a 100644 (file)
@@ -13,6 +13,9 @@ DATAFILES = \
        vmparam.h _types.h _limits.h _param.h \
        _mcontext.h
 
+DRIVERKIT_DATAFILES = \
+       limits.h types.h _types.h
+
 PRIVATE_DATAFILES = \
        disklabel.h
 
@@ -25,6 +28,7 @@ KERNELFILES = \
        _mcontext.h
 
 INSTALL_MI_LIST = ${DATAFILES}
+INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES}
 INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES}
 
 INSTALL_MI_DIR = machine
index f93f56bcf7f3092bcfd87b2f7d469bf2dbe8704c..cdbf3a2e363b8597e07308f1594af414b89867f8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -32,6 +32,7 @@
 #define _BSD_MACHINE_EXEC_H_
 
 #include <sys/param.h>
+#include <stdbool.h>
 
 struct exec_info {
        char    path[MAXPATHLEN];
@@ -41,15 +42,7 @@ struct exec_info {
        char    **ev;
 };
 
-int grade_binary(cpu_type_t, cpu_subtype_t);
+int grade_binary(cpu_type_t, cpu_subtype_t, bool allow_simulator_binary);
 boolean_t pie_required(cpu_type_t, cpu_subtype_t);
 
-#if defined (__i386__) || defined(__x86_64__)
-#include "i386/exec.h"
-#elif defined (__arm__) || defined (__arm64__)
-#include "arm/exec.h"
-#else
-#error architecture not supported
-#endif
-
 #endif /* _BSD_MACHINE_EXEC_H_ */
diff --git a/bsd/machine/reboot.h b/bsd/machine/reboot.h
deleted file mode 100644 (file)
index ae3b8bf..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-#ifndef _BSD_MACHINE_REBOOT_H_
-#define _BSD_MACHINE_REBOOT_H_
-
-#if defined (__i386__) || defined(__x86_64__)
-#include "i386/reboot.h"
-#elif defined (__arm__) || defined (__arm64__)
-#include "arm/reboot.h"
-#else
-#error architecture not supported
-#endif
-
-#endif /* _BSD_MACHINE_REBOOT_H_ */
index c4b2f92f66524330319a28c6da87603da2a2ea02..703ce97afd2b99aecb7b7bfa07d205e58dc4541b 100644 (file)
@@ -109,6 +109,12 @@ The checks for accessibility are performed using the effective user and group
 IDs instead of the real user and group ID as required in a call to
 .Fn access .
 .El
+.Bl -tag -width indent
+.It Dv AT_SYMLINK_NOFOLLOW
+If
+.Fa path
+names a symbolic link, the status of the symbolic link is returned.
+.El
 .Pp
 Even if a process has appropriate privileges and indicates success for
 .Dv X_OK ,
index 3463d71802270fb1e6baaa5bdae45585615f6ff5..6bea184c416eedde8c13c855748626e8f08fcc54 100644 (file)
@@ -77,6 +77,11 @@ The file has been archived.
 The file may not be changed.
 .It SF_APPEND
 The file may only be appended to.
+.It SF_DATALESSFAULT
+The file is a dataless placeholder.
+The system will attempt to materialize it when accessed according to the dataless file materialization policy of the accessing thread or process.
+See
+.Xr getiopolicy_np 3 .
 .El
 .Pp
 The
@@ -93,6 +98,10 @@ The
 and
 .Dq SF_APPEND
 flags may only be set or unset by the super-user.
+.Pp
+The
+.Dq SF_DATALESSFAULT
+flag is an internal flag and may not be set from user space.
 .Sh RETURN VALUES
 Upon successful completion, a value of 0 is returned.
 Otherwise, -1 is returned and the global variable
index abafe401760a9f326f903db9a23fc5f50086cb6e..7390b18e374796c0ed4e2dcacb412b6facfc8070 100644 (file)
@@ -146,7 +146,9 @@ as negative, otherwise
 .Fa arg
 is interpreted as a process ID.
 .It Dv F_GETPATH
-Get the path of the file descriptor 
+Get the path of the file descriptor
+.It Dv F_GETPATH_NOFIRMLINK
+Get the non-firmlinked path of the file descriptor
 .Fa Fildes .  
 The argument must be a buffer of size
 .Sy MAXPATHLEN
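
A user-space sketch of the new F_GETPATH_NOFIRMLINK command
(illustrative; error handling omitted):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/param.h>

	static void
	print_real_path(int fd)
	{
		char path[MAXPATHLEN];

		if (fcntl(fd, F_GETPATH_NOFIRMLINK, path) != -1) {
			printf("%s\n", path);
		}
	}
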
@@ -176,11 +178,6 @@ disables read ahead.
 A non-zero value in
 .Fa arg
 turns read ahead on.
-.It Dv F_READBOOTSTRAP
-Read bootstrap from disk.
-.It Dv F_WRITEBOOTSTRAP
-Write bootstrap on disk.
-The calling process must have root privileges.
 .It Dv F_NOCACHE
 Turns data caching off/on. A non-zero value in
 .Fa arg
@@ -199,6 +196,29 @@ to change.
 .It Dv F_LOG2PHYS_EXT
 Variant of F_LOG2PHYS that uses the passed in
 file offset and length.
+.It Dv F_BARRIERFSYNC
+Does the same thing as
+.Xr fsync 2
+then issues a barrier command to the drive
+.Fa ( arg
+is ignored).
+The barrier applies to I/O that have been flushed with
+.Xr fsync 2
+on the same device before.  These operations are guaranteed to
+be persisted before any other I/O that would follow the barrier,
+although no assumption should be made on what has been persisted
+or not when this call returns.  After the barrier has been issued,
+operations on other FDs that have been fsync'd before can still be
+re-ordered by the device, but not after the barrier.  This is
+typically useful to guarantee valid state on disk when ordering is a
+concern but durability is not.  A barrier can be used to order two phases of operations on
+a set of file descriptors and ensure that no file can possibly get persisted
+with the effect of the second phase without the effect of the first one. To do so,
+execute operations of phase one, then
+.Xr fsync 2
+each FD and issue a single barrier.  Finally execute operations of
+phase two, as sketched after this excerpt.
+This is currently implemented on HFS and APFS. It requires hardware support,
+which Apple SSDs are guaranteed to provide.
 .It Dv F_FULLFSYNC
 Does the same thing as
 .Xr fsync 2
@@ -207,8 +227,11 @@ flush all buffered data to
 the permanent storage device
 .Fa ( arg
 is ignored).
+As this drains the entire queue of the device and acts as a
+barrier, data that had been fsync'd on the same device before
+is guaranteed to be persisted when this call returns.
 This is currently implemented on HFS, MS-DOS (FAT),
-and Universal Disk Format (UDF) file systems.
+Universal Disk Format (UDF) and APFS file systems.
 The operation may take quite a while to complete.
 Certain FireWire drives have also been known
 to ignore the request to flush their buffered data.
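
A sketch of the two-phase F_BARRIERFSYNC pattern described above
(illustrative; error handling omitted):

	#include <fcntl.h>
	#include <unistd.h>

	static void
	two_phase(int journal_fd, int data_fd, const void *rec, size_t rlen,
	    const void *data, size_t dlen)
	{
		/* phase one: the record that must be persisted first */
		write(journal_fd, rec, rlen);
		/* fsync + barrier in one call: the record is ordered before
		 * any I/O issued after this point on the same device */
		fcntl(journal_fd, F_BARRIERFSYNC);
		/* phase two: may not be persisted ahead of phase one */
		write(data_fd, data, dlen);
		fsync(data_fd);
	}
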
index 6407f428e743d66427e51848146d48a92368a4d1..6218679feb07e87180d6ec298e685388db928cc7 100644 (file)
@@ -196,8 +196,12 @@ The
 ,
 .Fn fs_snapshot_delete
 ,
-.Fn fs_snapshot_delete
-and
 .Fn fs_snapshot_list
+,
+.Fn fs_snapshot_mount
+,
+.Fn fs_snapshot_rename
+and
+.Fn fs_snapshot_revert
 function calls appeared in macOS version 10.13
 .
index 317c45cb72773b51ca05466ec06c4945ca33480a..e8b71b76bd442cb006af79371f90296ac518a6bc 100644 (file)
@@ -67,7 +67,7 @@ multiple paths to that filesystem object may be returned.
 .Sh RETURN VALUES
 Upon successful completion,
 .Fn fsgetpath
-returns the path length. Otherwise, a value of -1 is returned and errno is set to indicate the error.
+returns the length of the path including the null terminator. Otherwise, a value of -1 is returned and errno is set to indicate the error.
 .Pp
 .Sh COMPATIBILITY
 Not all volumes support
index b790fe01330973d92c4af50e8c3ddcd028a8efe3..f37a137aa56c7f9be6a658df0c6e8b64536a055d 100644 (file)
@@ -245,6 +245,12 @@ and
 can be requested. When this option is used, forkattrs are reinterpreted as a
 set of extended common attributes.
 .
+.It FSOPT_RETURN_REALDEV
+If this bit is set, then ATTR_CMN_DEVID and ATTR_CMN_FSID will return
+the values corresponding to the physical volume they are on. When a
+filesystem supports VOL_CAP_INT_VOLUME_GROUPS, it may otherwise return
+a common logical value for these attributes.
+.
 .El
 .
 .Sh ATTRIBUTE BUFFER
@@ -433,7 +439,7 @@ in
 An
 .Vt fsobj_id_t
 structure that uniquely identifies the file system object within a mounted 
-volume for the duration of it's mount; this identifier is not guaranteed to be 
+volume for the duration of its mount; this identifier is not guaranteed to be 
 persistent for the volume and may change every time the volume is mounted.
 .Pp
 On HFS+ volumes, the ATTR_CMN_OBJID of a file system object is distinct from 
@@ -1181,6 +1187,41 @@ system object. Although the ATTR_CMNEXT_LINKID of a file system object may appea
 similar (in whole or in part) to its ATTR_CMN_FILEID (see description of
 ATTR_CMN_FILEID above), \fBno relation between the two attributes should ever be implied.\fP
 .
+.It ATTR_CMNEXT_NOFIRMLINKPATH
+An
+.Vt attrreference
+structure containing a path that does not have firmlinks of
+the file system object as
+a UTF-8 encoded, null terminated C string.
+The attribute data length will not be greater than
+.Dv PATH_MAX.
+Inconsistent behavior may be observed when this attribute is requested on
+hard-linked items, particularly when the file system does not support
+ATTR_CMN_PARENTID natively. Callers should be aware of this when requesting the
+canonical path of a hard-linked item.
+.It ATTR_CMNEXT_REALDEVID
+A
+.Vt dev_t
+containing the real device number of the device on which this
+file system object's volume is mounted.
+Equivalent to the
+.Fa st_dev
+field of the
+.Vt stat
+structure returned by
+.Xr stat 2 .
+.
+.It ATTR_CMNEXT_REALFSID
+An
+.Vt fsid_t
+structure containing the real file system identifier for the volume on which
+the file system object resides.
+Equivalent to the
+.Fa f_fsid
+field of the
+.Vt statfs
+structure returned by
+.Xr statfs 2 .
 .El
 .
 .Sh VOLUME CAPABILITIES
@@ -1424,6 +1465,14 @@ See ATTR_CMN_FLAGS for more details.
 If this bit is set, the volume format does not support setting file
 permissions.
 See ATTR_CMN_USERACCESS for more details.
+.It VOL_CAP_FMT_SHARED_SPACE
+If this bit is set, the volume format supports having multiple logical filesystems
+in a single "partition" which share space.
+.It VOL_CAP_FMT_VOL_GROUPS
+If this bit is set, the volume format supports having multiple logical filesystems
+which may be mounted and unmounted together and may present common filesystem
+identifier information.
+.
 .
 .El
 .Pp
@@ -1563,6 +1612,13 @@ operation. See
 .Xr rename 2
 for more details.
 .
+.It VOL_CAP_INT_RENAME_OPENFAIL
+If this bit is set, the file system may fail a rename operation
+of a directory if one of its descendents is open.
+See
+.Xr rename 2
+for more details.
+.
 .El
 .Pp
 .
index aaf91dd5aa93f137092fd9f2e916c313c4893f9a..60f7e45661360bbcfc38ee3c74ee84b5b6874b80 100644 (file)
@@ -86,7 +86,7 @@ structure must be set.
 Volume attributes  cannot be requested but all other supported getattrlist attributes can be used. For this function,
 .Dv ATTR_CMN_NAME
 and
-.Dv ATRR_CMN_RETURNED_ATTRS
+.Dv ATTR_CMN_RETURNED_ATTRS
 are required and the absence of these attributes in the attrList parameter results in an error. Note that 
 not all attributes supported by 
 .Fn getattrlist
@@ -160,8 +160,23 @@ attributes and then use the value of the
 .Dv ATTR_CMN_OBJTYPE 
 attribute to parse the resulting attribute buffer.
 .Pp
-A directory which is a mount point for a file system, will have a value of "DIR_MNTSTATUS_MNTPOINT" set for it's the
-ATTR_DIR_MOUNTSTATUS attribute entry. However the attributes for the mount point will be those from the (underlying) file system. The only way to get the attributes of mounted root directory is to call getattrlist(2) on the mount point.
+A directory which is a mount point for a file system will have a value of
+.Dq DIR_MNTSTATUS_MNTPOINT
+set for its ATTR_DIR_MOUNTSTATUS attribute entry.
+However, the attributes for the mount point will be those from the (underlying) file system.
+To get the attributes of the mounted root directory, call
+.Xr getattrlist 2
+on the mount point.
+.Pp
+A directory which is a firmlink will have the
+.Dq SF_FIRMLINK
+flag set in its ATTR_CMN_FLAGS attribute entry.
+However, the attributes returned by
+.Fn getattrlistbulk
+will be those from the firmlink, not the firmlink's target.
+To get the attribute of the firmlink's target, call
+.Xr getattrlist 2
+on the firmlink.
 .
 .Sh RETURN VALUES
 Upon successful completion the numbers of entries successfully read
index 6be39ee8726e6fd3a3d4132d88dbb098a85de55c..cc6c35ec70382917f525f36770030852979239ff 100644 (file)
@@ -254,7 +254,7 @@ you should be careful to support the behaviour specified by this document.
 .
 .Pp
 If the directory contains a mount point, then
-.Dv DIR_MNTSTATUS_MNTPOINT
+.Dq DIR_MNTSTATUS_MNTPOINT
 will be set in the
 .Dv ATTR_DIR_MOUNTSTATUS
 for that entry; all other attributes for that entry, however,
@@ -262,6 +262,17 @@ will be for the underlying file system (as opposed to the mounted
 file system).
 .Xr getattrlist 2
 should be used to get the attributes for the mount point.
+.Pp
+A directory which is a firmlink will have the
+.Dq SF_FIRMLINK
+flag set in its
+ATTR_CMN_FLAGS attribute entry.
+However, the attributes returned by
+.Fn getdirentriesattr
+will be those from the firmlink, not the firmlink's target.
+To get the attributes of the firmlink's target, call
+.Xr getattrlist 2
+on the firmlink.
 .Sh ERRORS
 .Fn getdirentriesattr
 will fail if:
index d1a5355422d8206b9ef2c23bc81d92587b34c645..be1a477c7a8835891ec590b064e39cc0a911da17 100644 (file)
@@ -145,6 +145,7 @@ The
 argument
 gives the size of
 .Fa changelist .
+.Pp
 The
 .Fa eventlist
 argument
@@ -158,11 +159,9 @@ The
 .Fa nevents
 argument determines the size of
 .Fa eventlist .
-If the KEVENT_FLAG_STACK_EVENTS flag is provided on the system call,
-the eventlist array is filled in in stack order (starting in the
-highest available index) instead of typical array order.
+.Pp
 The
-.Fa out_data
+.Fa data_out
 argument provides space for extra out data supplied by specific filters.
 The
 .Fa data_available
 argument's contents specify the space available in the data pool on input,
 and contain the amount still remaining on output.
 If the KEVENT_FLAG_STACK_DATA flag is specified on the system call,
 the data is allocated from the pool in stack order instead of typical heap order. 
+.Pp
 If
 .Fa timeout
 is a non-NULL pointer, it specifies a maximum interval to wait
index 24966225896de27db89e1a79f9d4c04134b7aaeb..acb59588ecd61a08d0fe4eaf20920952939fd1e2 100644 (file)
@@ -188,6 +188,8 @@ argument is not an absolute path and
 is neither
 .Dv AT_FDCWD
 nor a file descriptor associated with a directory.
+.It Bq Eq EILSEQ
+The filename does not match the encoding rules.
 .El
 .Sh EXAMPLE
 .Bd -literal -offset indent
index 69ecfd6166e45d3b1b45859e5537a09e7664cd58..29e2d9240105fd1f75696f8e85b891bb4d0bf612 100644 (file)
@@ -112,6 +112,8 @@ error occurred while reading from or writing to the file system.
 .It Bq Er EFAULT
 .Fa Path
 points outside the process's allocated address space.
+.It Bq Eq EILSEQ
+The filename does not match the encoding rules.
 .El
 .Sh SEE ALSO
 .Xr chmod 2 ,
index 40a94d7d16d12c7e800d79966d362788a2906a87..73eb670f7a4ca837abc7575480b34caca1f663b8 100644 (file)
@@ -129,14 +129,19 @@ and the behavior is identical to a call to
 The flags specified
 for the
 .Fa oflag
-argument are formed by
-.Em or Ns 'ing
-the following values:
+argument must include exactly one of the following file access modes:
 .Pp
 .Bd -literal -offset indent -compact
 O_RDONLY       open for reading only
 O_WRONLY       open for writing only
 O_RDWR         open for reading and writing
+.Ed
+.Pp
+In addition any combination of the following values can be
+.Em or Ns 'ed in
+.Fa oflag:
+.Pp
+.Bd -literal -offset indent -compact
 O_NONBLOCK     do not block on open or for data to become available
 O_APPEND       append on each write
 O_CREAT                create file if it does not exist
@@ -421,6 +426,8 @@ argument is not an absolute path and
 is neither
 .Dv AT_FDCWD
 nor a file descriptor associated with a directory.
+.It Bq Eq EILSEQ
+The filename does not match the encoding rules.
 .El
 .Sh COMPATIBILITY
 .Fn open
index d8df563cc79ceadde5651ba3486c746dff7a0134..cdf753cac9242f27a9c400b4b5bf875d44d867bd 100644 (file)
@@ -205,6 +205,15 @@ An action is requested of a device that does not exist..
 .\" ===========
 .It Bq Er ENXIO
 A requested action cannot be performed by the device.
+.\" ===========
+.It Bq Er ESTALE
+An attempt to read a remote file through NFS that has already been deleted on
+the server.
+.\" ===========
+.It Bq Er ETIMEDOUT
+The connection timed out while reading a remote file from a soft mounted NFS
+volume (see
+.Xr mount_nfs 8 ) .
 .El
 .Pp
 The
index 49a562912ce2b6391dcd94f378657cffe77a2029..5151ce64ce14debd2593cb4b0ada3fe3a157feab 100644 (file)
@@ -204,6 +204,14 @@ The requested operation requires writing in a directory
 .Fa new ,
 new/.., or old/..) whose modes disallow this.
 .\" ===========
+.It Bq Er EACCES
+.Fa old
+is a directory and it, or some descendent in the namespace, is open
+and the file system format does does not support renaming a directory
+with open descendents (see
+.Xr getattrlist 2
+.Dv VOL_CAP_INT_RENAME_OPENFAIL Ns ).
+.\" ===========
 .It Bq Er EDQUOT
 The directory in which the entry for the new name
 is being placed cannot be extended because the
index 0af2016edd8fed3a78ce99008344594ab06458c9..6f5c6e4bb2808f47d7845e0ddeccf00afb86180d 100644 (file)
@@ -60,7 +60,7 @@ This structure is defined as follows in
 .Bd -literal
 struct shmid_ds {
     struct ipc_perm  shm_perm;     /* operation permissions */
-    int              shm_segsz;    /* size of segment in bytes */
+    size_t           shm_segsz;    /* size of segment in bytes */
     pid_t            shm_lpid;     /* pid of last shm op */
     pid_t            shm_cpid;     /* pid of creator */
     short            shm_nattch;   /* # of current attaches */
index e6ed48bf0b2dc1e19d4b945adb14a372b53f064e..197313465e35acfcd5a810886960abe7a5c3edce 100644 (file)
@@ -324,8 +324,6 @@ and
 in
 .Fa struct statfs .
 Please refer to
-.Xr stat 2
-and
 .Xr dir 5
 for more detail on the specific changes to the other affected data structures.
 .Pp
index ae6f2ad6b147532d6e9c137b72caa7fc8b31acbb..0dd37fee0798746bc689e5077325a4ed352a7a8e 100644 (file)
@@ -203,6 +203,8 @@ argument is not an absolute path and
 is neither
 .Dv AT_FDCWD
 nor a file descriptor associated with a directory.
+.It Bq Eq EILSEQ
+The filename does not match the encoding rules.
 .El
 .Sh SEE ALSO
 .Xr ln 1 ,
index 3a07e996593661a573791757a5825dd854b9aad7..16e3dcdbcfbaffe06b5d3309aad0f9dc89af3143 100644 (file)
@@ -1,5 +1,3 @@
-.\"    $NetBSD: vfork.2,v 1.6 1995/02/27 12:39:30 cgd Exp $
-.\"
 .\" Copyright (c) 1980, 1991, 1993
 .\"    The Regents of the University of California.  All rights reserved.
 .\"
@@ -11,11 +9,7 @@
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
-.\" 3. All advertising materials mentioning features or use of this software
-.\"    must display the following acknowledgement:
-.\"    This product includes software developed by the University of
-.\"    California, Berkeley and its contributors.
-.\" 4. Neither the name of the University nor the names of its contributors
+.\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" SUCH DAMAGE.
 .\"
 .\"     @(#)vfork.2    8.1 (Berkeley) 6/4/93
+.\" $FreeBSD$
 .\"
-.Dd June 4, 1993
+.Dd May 22, 2016
 .Dt VFORK 2
-.Os BSD 4
+.Os
 .Sh NAME
 .Nm vfork
-.Nd spawn new process in a virtual memory efficient way
+.Nd create a new process without copying the address space
+.Sh LIBRARY
+.Lb libc
 .Sh SYNOPSIS
-.Fd #include <unistd.h>
+.In unistd.h
 .Ft pid_t
-.Fo vfork
-.Fa void
-.Fc
+.Fn vfork void
 .Sh DESCRIPTION
+.Bf -symbolic
+Since this function is hard to use correctly from application software,
+it is recommended to use
+.Xr posix_spawn 3
+or
+.Xr fork 2
+instead.
+.Ef
+.Pp
+The
 .Fn vfork
+system call
 can be used to create new processes without fully copying the address
-space of the old process, which is horrendously inefficient in a paged
-environment.  It is useful when the purpose of
+space of the old process, which is inefficient in a paged
+environment.
+It is useful when the purpose of
 .Xr fork 2
 would have been to create a new system context for an
-.Xr execve .
+.Xr execve 2 .
+The
 .Fn vfork
+system call
 differs from
-.Xr fork
-in that the child borrows the parent's memory and thread of
-control until a call to
+.Xr fork 2
+in that the child borrows the parent process's address space and the
+calling thread's stack
+until a call to
 .Xr execve 2
 or an exit (either by a call to
-.Xr exit 2
-or abnormally.)
-The parent process is suspended while the child is using its resources.
+.Xr _exit 2
+or abnormally).
+The calling thread is suspended while the child is using its resources.
+Other threads continue to run.
 .Pp
+The
 .Fn vfork
+system call
 returns 0 in the child's context and (later) the pid of the child in
 the parent's context.
 .Pp
-.Fn vfork
-can normally be used just like
-.Xr fork .
-It does not work, however, to return while running in the childs context
+Many problems can occur when replacing
+.Xr fork 2
+with
+.Fn vfork .
+For example, it does not work to return while running in the child's context
 from the procedure that called
 .Fn vfork
 since the eventual return from
 .Fn vfork
 would then return to a no longer existent stack frame.
+Also, changing process state which is partially implemented in user space
+such as signal handlers with
+.Xr libthr 3
+will corrupt the parent's state.
+.Pp
 Be careful, also, to call
-.Xr _exit
+.Xr _exit 2
 rather than
-.Xr exit
-if you can't
-.Xr execve ,
+.Xr exit 3
+if you cannot
+.Xr execve ,
 since
-.Xr exit
+.Xr exit 3
 will flush and close standard I/O channels, and thereby mess up the
 parent process's standard I/O data structures.
 (Even with
-.Xr fork
+.Xr fork 2
 it is wrong to call
-.Xr exit
+.Xr exit 3
 since buffered data would then be flushed twice.)
-.Sh SEE ALSO
-.Xr execve 2 ,
+.Pp
+Unlike
 .Xr fork 2 ,
-.Xr sigaction 2 ,
-.Xr wait 2 
+.Fn vfork
+does not run
+.Xr pthread_atfork 3
+handlers.
+.Sh RETURN VALUES
+Same as for
+.Xr fork 2 .
 .Sh ERRORS
 The
 .Fn vfork
@@ -117,15 +141,20 @@ is called following calling a
 .Fn vfork
 call.
 .El
+.Sh SEE ALSO
+.Xr _exit 2 ,
+.Xr execve 2 ,
+.Xr fork 2 ,
+.Xr sigaction 2 ,
+.Xr wait 2 ,
+.Xr exit 3 ,
+.Xr posix_spawn 3
+.Sh HISTORY
+The
+.Fn vfork
+system call appeared in
+.Bx 3 .
 .Sh BUGS
-This system call will be eliminated when proper system sharing
-mechanisms are implemented. 
-Users should not depend on the memory
-sharing semantics of
-.Xr vfork
-as it will, in that case, be made synonymous to
-.Xr fork .
-.Pp
 To avoid a possible deadlock situation,
 processes that are children in the middle
 of a
@@ -140,8 +169,15 @@ output or
 calls
 are allowed
 and input attempts result in an end-of-file indication.
-.Sh HISTORY
-The
+.Sh CAVEATS
+There are limits to what you can do in the child process.
+To be totally safe you should restrict yourself to only
+executing async-signal safe operations until such time
+as one of the exec functions is called.  All APIs, including
+global data symbols, in any framework or library should be
+assumed to be unsafe after a
 .Fn vfork
-function call appeared in
-.Bx 3.0 .
+unless explicitly documented to be safe or async-signal
+safe.  If you need to use these frameworks in the child
+process, you must exec.  In this situation it is reasonable
+to exec yourself.
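
A minimal pattern consistent with these caveats (sketch; only
async-signal-safe calls between vfork() and execve()/_exit()):

	#include <unistd.h>

	static pid_t
	spawn_ls(void)
	{
		char *const argv[] = { (char *)"ls", NULL };
		char *const envp[] = { NULL };
		pid_t pid = vfork();

		if (pid == 0) {			/* child: borrowed address space */
			execve("/bin/ls", argv, envp);
			_exit(127);		/* exec failed: _exit, never exit */
		}
		return pid;			/* parent resumes after exec/_exit */
	}
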
index fb44344cb64df652dbb9fc046ae11ab7f3f95fb5..d9eeb14aa45b03e3d4d5d0ad665ce6b039040da7 100644 (file)
@@ -47,6 +47,9 @@ INSTALL_MAN_LINKS = \
        queue.3 TAILQ_REMOVE.3 \
        posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addopen.3 \
        posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_adddup2.3 \
+       posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addinherit_np.3 \
+       posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addchdir_np.3 \
+       posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addfchdir_np.3 \
        posix_spawn_file_actions_init.3 posix_spawn_file_actions_destroy.3 \
        posix_spawnattr_init.3 posix_spawnattr_destroy.3 \
        posix_spawnattr_setbinpref_np.3 posix_spawnattr_getbinpref_np.3 \
index 8c7e697436deeef7bed45fcd9e1c46676f37cf34..20e6f754366df6044d4b40d22aa1ffcf65659bb3 100644 (file)
@@ -1,4 +1,4 @@
-.Dd April 30, 2013
+.Dd February 11, 2019
 .Dt getiopolicy_np 3
 .Os
 .Sh NAME
@@ -110,7 +110,7 @@ This
 lets users change the access time updates policy for the files accessed
 by the current thread or process.
 .Pp
-IOPOL_TYPE_VFS_ATIME_UPDATES supports following values for
+IOPOL_TYPE_VFS_ATIME_UPDATES supports the following values for
 .Fa policy:
 .Bl -tag -width IOPOL_ATIME_UPDATES_DEFAULT
 .It IOPOL_ATIME_UPDATES_OFF
@@ -120,12 +120,37 @@ to reduce the metadata I/O writes.
 .It IOPOL_ATIME_UPDATES_DEFAULT
 This is the default I/O policy for new threads.
 .El
-.El
 .Pp
 Like with IOPOL_TYPE_DISK, the I/O policy of a newly created process is
 inherited from its parent process.  Access time updates are turned off if the
 I/O policy is set to IOPOL_ATIME_UPDATES_OFF for the current thread or current
 process.
+.It IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES
+This
+.Fa iotype
+lets users change the materialization policy for dataless files accessed
+by the current thread or process.
+.Pp
+IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES supports the following values for
+.Fa policy:
+.Bl -tag -width IOPOL_MATERIALIZE_DATALESS
+.It IOPOL_MATERIALIZE_DATALESS_FILES_DEFAULT
+Selects the default materialization policy.
+For IOPOL_SCOPE_THREAD, all accesses by the current thread will follow the
+process policy.
+For IOPOL_SCOPE_PROCESS, all accesses will follow the system default
+policy
+.Pq IOPOL_MATERIALIZE_DATALESS_FILES_OFF .
+.It IOPOL_MATERIALIZE_DATALESS_FILES_OFF
+Disables materialization of dataless files by the current thread or
+process.
+.It IOPOL_MATERIALIZE_DATALESS_FILES_ON
+Enables materialization of dataless files by the current thread or
+process.
+.El
+.Pp
+New processes inherit the policy of their parent process.
+.El
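
A sketch of opting a thread out of materialization with the companion
setiopolicy_np() call (illustrative; declared in <sys/resource.h>):

	#include <sys/resource.h>

	/* e.g. before scanning a tree, so dataless placeholders are not
	 * faulted in by this thread */
	static int
	skip_dataless_for_this_thread(void)
	{
		return setiopolicy_np(IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES,
		    IOPOL_SCOPE_THREAD, IOPOL_MATERIALIZE_DATALESS_FILES_OFF);
	}
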
 .Sh RETURN VALUES
 The
 .Fn getiopolicy_np
index 36c64715d518f5595ac55f86dba45f7ff9293349..e2915720ace4c2f1cf05fa0cfc026ed671da187a 100644 (file)
 .Fa "posix_spawn_file_actions_t *file_actions"
 .Fa "int filedes"
 .Fc
+.Ft int
+.Fo posix_spawn_file_actions_addchdir_np
+.Fa "posix_spawn_file_actions_t *file_actions"
+.Fa "const char *restrict path"
+.Fc
+.Ft int
+.Fo posix_spawn_file_actions_addfchdir_np
+.Fa "posix_spawn_file_actions_t *file_actions"
+.Fa "int filedes"
+.Fc
 .Sh DESCRIPTION
 The
 .Fn posix_spawn_file_actions_addclose
@@ -156,6 +166,45 @@ are made available in the spawned process. In that case,
 can be used to make specific pre-existing file
 descriptors from the parent process be
 available in the spawned process.
+.Pp
+The
+.Fn posix_spawn_file_actions_addchdir_np
+function adds a chdir operation to the list of operations associated with
+the object referenced by
+.Em file_actions ,
+for subsequent use in a call to
+.Xr posix_spawn 2
+or 
+.Xr posix_spawnp 2 .
+The current working directory will be set as if
+.Fn chdir
+had been called with
+.Em path
+prior to the new child process starting execution.
+.Pp
+The
+.Fn posix_spawn_file_actions_addfchdir_np
+function adds an fchdir operation to the list of operations associated with
+the object referenced by
+.Em file_actions ,
+for subsequent use in a call to
+.Xr posix_spawn 2
+or 
+.Xr posix_spawnp 2 .
+The current working directory will be set as if
+.Fn fchdir
+had been called with
+.Em filedes
+prior to the new child process starting execution.
+When
+.Em POSIX_SPAWN_CLOEXEC_DEFAULT
+is set, the file descriptor
+.Em filedes
+will not be automatically inherited unless an explicit
+.Fn posix_spawn_file_actions_addinherit_np
+action for
+.Em filedes
+has been added.
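
A sketch tying the new chdir action together (illustrative; error
checks omitted):

	#include <spawn.h>

	extern char **environ;

	static int
	spawn_in_dir(const char *dir, char *const argv[], pid_t *pid)
	{
		posix_spawn_file_actions_t fa;
		int rc;

		posix_spawn_file_actions_init(&fa);
		/* the child chdir()s to 'dir' before it starts executing */
		posix_spawn_file_actions_addchdir_np(&fa, dir);
		rc = posix_spawn(pid, argv[0], &fa, NULL, argv, environ);
		posix_spawn_file_actions_destroy(&fa);
		return rc;
	}
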
 .Sh RETURN VALUES
 On success, these functions return 0; on failure they return an error
 number from
@@ -175,6 +224,12 @@ The value of
 .Fa file_actions
 is invalid.
 .\" ==========
+.It Bq Er ENAMETOOLONG
+The length of the value specified by
+.Fa path
+exceeds
+.Dv PATH_MAX.
+.\" ==========
 .It Bq Er ENOMEM
 Insufficient memory was available to add the new action to
 .Fa file_actions .
index 21666ad8d0f03c3a4996b3864cf03efaf9c2a52d..0077fa6225bd6e199c3f8c55ce5f1a0be999b03a 100644 (file)
@@ -125,7 +125,7 @@ To resume the child process, it must be sent a
 signal.
 .It Dv POSIX_SPAWN_CLOEXEC_DEFAULT
 .Em Apple Extension :
-If this bit is set, then only file descriptors explicitly described by the
+If this bit is set, then only file descriptors explicitly created by the
 .Fa file_actions
 argument are available in the spawned process; all of the other file descriptors
 are automatically closed in the spawned process.
index 18eade2faf6827c0be77a12fa6d034548be402d2..c3efcf70f622bf14f81c23e5530b834e8075b6ba 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -113,46 +113,46 @@ int dead_blockmap(struct vnop_blockmap_args *);
 
 #define VOPFUNC int (*)(void *)
 int(**dead_vnodeop_p)(void *);
-struct vnodeopv_entry_desc dead_vnodeop_entries[] = {
-       { &vnop_default_desc, (VOPFUNC)vn_default_error },
-       { &vnop_lookup_desc, (VOPFUNC)dead_lookup },    /* lookup */
-       { &vnop_create_desc, (VOPFUNC)dead_create },    /* create */
-       { &vnop_open_desc, (VOPFUNC)dead_open },                /* open */
-       { &vnop_mknod_desc, (VOPFUNC)dead_mknod },              /* mknod */
-       { &vnop_close_desc, (VOPFUNC)dead_close },      /* close */
-       { &vnop_access_desc, (VOPFUNC)dead_access },    /* access */
-       { &vnop_getattr_desc, (VOPFUNC)dead_getattr },  /* getattr */
-       { &vnop_setattr_desc, (VOPFUNC)dead_setattr },  /* setattr */
-       { &vnop_read_desc, (VOPFUNC)dead_read },                /* read */
-       { &vnop_write_desc, (VOPFUNC)dead_write },      /* write */
-       { &vnop_ioctl_desc, (VOPFUNC)dead_ioctl },      /* ioctl */
-       { &vnop_select_desc, (VOPFUNC)dead_select },    /* select */
-       { &vnop_mmap_desc, (VOPFUNC)dead_mmap },                /* mmap */
-       { &vnop_fsync_desc, (VOPFUNC)dead_fsync },      /* fsync */
-       { &vnop_remove_desc, (VOPFUNC)dead_remove },    /* remove */
-       { &vnop_link_desc, (VOPFUNC)dead_link },                /* link */
-       { &vnop_rename_desc, (VOPFUNC)dead_rename },    /* rename */
-       { &vnop_mkdir_desc, (VOPFUNC)dead_mkdir },      /* mkdir */
-       { &vnop_rmdir_desc, (VOPFUNC)dead_rmdir },      /* rmdir */
-       { &vnop_symlink_desc, (VOPFUNC)dead_symlink },  /* symlink */
-       { &vnop_readdir_desc, (VOPFUNC)dead_readdir },  /* readdir */
-       { &vnop_readlink_desc, (VOPFUNC)dead_readlink },        /* readlink */
-       { &vnop_inactive_desc, (VOPFUNC)dead_inactive },        /* inactive */
-       { &vnop_reclaim_desc, (VOPFUNC)dead_reclaim },  /* reclaim */
-       { &vnop_strategy_desc, (VOPFUNC)dead_strategy },        /* strategy */
-       { &vnop_pathconf_desc, (VOPFUNC)dead_pathconf },        /* pathconf */
-       { &vnop_advlock_desc, (VOPFUNC)dead_advlock },  /* advlock */
-       { &vnop_bwrite_desc, (VOPFUNC)dead_bwrite },    /* bwrite */
-       { &vnop_pagein_desc, (VOPFUNC)err_pagein },     /* Pagein */
-       { &vnop_pageout_desc, (VOPFUNC)err_pageout },   /* Pageout */
-       { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */
-       { &vnop_blktooff_desc, (VOPFUNC)dead_blktooff },        /* blktooff */
-       { &vnop_offtoblk_desc, (VOPFUNC)dead_offtoblk },        /* offtoblk */
-       { &vnop_blockmap_desc, (VOPFUNC)dead_blockmap },                /* blockmap */
-       { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL }
+const struct vnodeopv_entry_desc dead_vnodeop_entries[] = {
+       { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error },
+       { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)dead_lookup },    /* lookup */
+       { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)dead_create },    /* create */
+       { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)dead_open },                /* open */
+       { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)dead_mknod },              /* mknod */
+       { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)dead_close },      /* close */
+       { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)dead_access },    /* access */
+       { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)dead_getattr },  /* getattr */
+       { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)dead_setattr },  /* setattr */
+       { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)dead_read },                /* read */
+       { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)dead_write },      /* write */
+       { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)dead_ioctl },      /* ioctl */
+       { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)dead_select },    /* select */
+       { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)dead_mmap },                /* mmap */
+       { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)dead_fsync },      /* fsync */
+       { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)dead_remove },    /* remove */
+       { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)dead_link },                /* link */
+       { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)dead_rename },    /* rename */
+       { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)dead_mkdir },      /* mkdir */
+       { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)dead_rmdir },      /* rmdir */
+       { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)dead_symlink },  /* symlink */
+       { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)dead_readdir },  /* readdir */
+       { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)dead_readlink },        /* readlink */
+       { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)dead_inactive },        /* inactive */
+       { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)dead_reclaim },  /* reclaim */
+       { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)dead_strategy },        /* strategy */
+       { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)dead_pathconf },        /* pathconf */
+       { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)dead_advlock },  /* advlock */
+       { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)dead_bwrite },    /* bwrite */
+       { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },     /* Pagein */
+       { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },   /* Pageout */
+       { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile }, /* Copyfile */
+       { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)dead_blktooff },        /* blktooff */
+       { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)dead_offtoblk },        /* offtoblk */
+       { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)dead_blockmap },                /* blockmap */
+       { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (VOPFUNC)NULL }
 };
-struct vnodeopv_desc dead_vnodeop_opv_desc =
-{ &dead_vnodeop_p, dead_vnodeop_entries };
+const struct vnodeopv_desc dead_vnodeop_opv_desc =
+{ .opv_desc_vector_p = &dead_vnodeop_p, .opv_desc_ops = dead_vnodeop_entries };
 
 /*
  * Trivial lookup routine that always fails.
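
The table rewrite above, repeated for every filesystem touched by this commit, replaces positional aggregate initializers with C99 designated initializers. A toy sketch (hypothetical types, not the kernel's) of why the designated form is more robust:

    /* With positional initializers, inserting or reordering a struct
     * member silently shifts every table entry; designated initializers
     * bind each value to its field by name, so the table stays correct
     * across layout changes. */
    struct entry {
        const char *name;
        int (*fn)(void *);
    };

    static int do_open(void *a)  { (void)a; return 0; }
    static int do_close(void *a) { (void)a; return 0; }

    /* positional: breaks silently if 'struct entry' gains a member */
    static const struct entry table_positional[] = {
        { "open", do_open },
        { "close", do_close },
    };

    /* designated: robust against member insertion and reordering */
    static const struct entry table_designated[] = {
        { .name = "open",  .fn = do_open },
        { .name = "close", .fn = do_close },
    };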
index 28806babebfd40311ae356586413874dc3ade019..b83108b525a10387828dd345a92969d705a43aba 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -736,45 +736,45 @@ fdesc_badop(void)
 #define fdesc_blockmap (int (*) (struct  vnop_blockmap_args *))eopnotsupp
 
 int(**fdesc_vnodeop_p)(void *);
-struct vnodeopv_entry_desc devfs_fdesc_vnodeop_entries[] = {
-       { &vnop_default_desc, (VOPFUNC)vn_default_error },
-       { &vnop_lookup_desc, (VOPFUNC)vn_default_error},        /* lookup */
-       { &vnop_create_desc, (VOPFUNC)fdesc_create },   /* create */
-       { &vnop_mknod_desc, (VOPFUNC)fdesc_mknod },     /* mknod */
-       { &vnop_open_desc, (VOPFUNC)fdesc_open },       /* open */
-       { &vnop_close_desc, (VOPFUNC)fdesc_close },     /* close */
-       { &vnop_access_desc, (VOPFUNC)fdesc_access },   /* access */
-       { &vnop_getattr_desc, (VOPFUNC)fdesc_getattr }, /* getattr */
-       { &vnop_setattr_desc, (VOPFUNC)fdesc_setattr }, /* setattr */
-       { &vnop_read_desc, (VOPFUNC)fdesc_read },       /* read */
-       { &vnop_write_desc, (VOPFUNC)fdesc_write },     /* write */
-       { &vnop_ioctl_desc, (VOPFUNC)fdesc_ioctl },     /* ioctl */
-       { &vnop_select_desc, (VOPFUNC)fdesc_select },   /* select */
-       { &vnop_revoke_desc, (VOPFUNC)fdesc_revoke },   /* revoke */
-       { &vnop_mmap_desc, (VOPFUNC)fdesc_mmap },       /* mmap */
-       { &vnop_fsync_desc, (VOPFUNC)fdesc_fsync },     /* fsync */
-       { &vnop_remove_desc, (VOPFUNC)fdesc_remove },   /* remove */
-       { &vnop_link_desc, (VOPFUNC)fdesc_link },       /* link */
-       { &vnop_rename_desc, (VOPFUNC)fdesc_rename },   /* rename */
-       { &vnop_mkdir_desc, (VOPFUNC)fdesc_mkdir },     /* mkdir */
-       { &vnop_rmdir_desc, (VOPFUNC)fdesc_rmdir },     /* rmdir */
-       { &vnop_symlink_desc, (VOPFUNC)fdesc_symlink }, /* symlink */
-       { &vnop_readdir_desc, (VOPFUNC)vn_default_error},/* readdir */
-       { &vnop_readlink_desc, (VOPFUNC)err_readlink}, /* readlink */
-       { &vnop_inactive_desc, (VOPFUNC)fdesc_inactive },/* inactive */
-       { &vnop_reclaim_desc, (VOPFUNC)fdesc_reclaim }, /* reclaim */
-       { &vnop_strategy_desc, (VOPFUNC)fdesc_strategy },       /* strategy */
-       { &vnop_pathconf_desc, (VOPFUNC)fdesc_pathconf },       /* pathconf */
-       { &vnop_advlock_desc, (VOPFUNC)fdesc_advlock }, /* advlock */
-       { &vnop_bwrite_desc, (VOPFUNC)fdesc_bwrite },   /* bwrite */
-       { &vnop_pagein_desc, (VOPFUNC)err_pagein },     /* pagein */
-       { &vnop_pageout_desc, (VOPFUNC)err_pageout },   /* pageout */
-       { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */
-       { &vnop_blktooff_desc, (VOPFUNC)fdesc_blktooff },       /* blktooff */
-       { &vnop_blktooff_desc, (VOPFUNC)fdesc_offtoblk },       /* offtoblk */
-       { &vnop_blockmap_desc, (VOPFUNC)fdesc_blockmap },       /* blockmap */
-       { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL }
+const struct vnodeopv_entry_desc devfs_fdesc_vnodeop_entries[] = {
+       { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error },
+       { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)vn_default_error},        /* lookup */
+       { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)fdesc_create },   /* create */
+       { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)fdesc_mknod },     /* mknod */
+       { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)fdesc_open },       /* open */
+       { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)fdesc_close },     /* close */
+       { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)fdesc_access },   /* access */
+       { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)fdesc_getattr }, /* getattr */
+       { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)fdesc_setattr }, /* setattr */
+       { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)fdesc_read },       /* read */
+       { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)fdesc_write },     /* write */
+       { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)fdesc_ioctl },     /* ioctl */
+       { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)fdesc_select },   /* select */
+       { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)fdesc_revoke },   /* revoke */
+       { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)fdesc_mmap },       /* mmap */
+       { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)fdesc_fsync },     /* fsync */
+       { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)fdesc_remove },   /* remove */
+       { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)fdesc_link },       /* link */
+       { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)fdesc_rename },   /* rename */
+       { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)fdesc_mkdir },     /* mkdir */
+       { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)fdesc_rmdir },     /* rmdir */
+       { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)fdesc_symlink }, /* symlink */
+       { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)vn_default_error},/* readdir */
+       { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink}, /* readlink */
+       { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)fdesc_inactive },/* inactive */
+       { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)fdesc_reclaim }, /* reclaim */
+       { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)fdesc_strategy },       /* strategy */
+       { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)fdesc_pathconf },       /* pathconf */
+       { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)fdesc_advlock }, /* advlock */
+       { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)fdesc_bwrite },   /* bwrite */
+       { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },     /* pagein */
+       { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },   /* pageout */
+       { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile }, /* Copyfile */
+       { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)fdesc_blktooff },       /* blktooff */
+       { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)fdesc_offtoblk },       /* offtoblk */
+       { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)fdesc_blockmap },       /* blockmap */
+       { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (VOPFUNC)NULL }
 };
 
-struct vnodeopv_desc devfs_fdesc_vnodeop_opv_desc =
-{ &fdesc_vnodeop_p, devfs_fdesc_vnodeop_entries };
+const struct vnodeopv_desc devfs_fdesc_vnodeop_opv_desc =
+{ .opv_desc_vector_p = &fdesc_vnodeop_p, .opv_desc_ops = devfs_fdesc_vnodeop_entries };
index fe32a3c680b4cea0030b335634f51dec69367cb1..589472d570332840d155fd45d70a058b45c07663 100644 (file)
@@ -99,6 +99,7 @@
 #include <sys/vnode_internal.h>
 #include <stdarg.h>
 #include <libkern/OSAtomic.h>
+#include <os/refcnt.h>
 #define BSD_KERNEL_PRIVATE      1       /* devfs_make_link() prototype */
 #include "devfs.h"
 #include "devfsdefs.h"
@@ -150,6 +151,8 @@ lck_attr_t      * devfs_lck_attr;
 lck_mtx_t         devfs_mutex;
 lck_mtx_t         devfs_attr_mutex;
 
+os_refgrp_decl(static, devfs_refgrp, "devfs", NULL);
+
 devdirent_t *           dev_root = NULL;        /* root of backing tree */
 struct devfs_stats      devfs_stats;            /* hold stats */
 
@@ -515,6 +518,7 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto,
     devnode_t * *dn_pp, struct devfsmount *dvm)
 {
        devnode_t *     dnp = NULL;
+       int     error = 0;
 
 #if defined SPLIT_DEVS
        /*
@@ -587,7 +591,9 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto,
 #endif
        }
        dnp->dn_dvm = dvm;
-       dnp->dn_refcount = 0;
+
+       /* Note: this initializes the reference count to 1; a count of 1 is considered unreferenced */
+       os_ref_init_raw(&dnp->dn_refcount, &devfs_refgrp);
        dnp->dn_ino = devfs_unique_fileno;
        devfs_unique_fileno++;
 
@@ -627,8 +633,8 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto,
                    typeinfo->Slnk.namelen + 1,
                    M_DEVFSNODE, M_WAITOK);
                if (!dnp->dn_typeinfo.Slnk.name) {
-                       FREE(dnp, M_DEVFSNODE);
-                       return ENOMEM;
+                       error = ENOMEM;
+                       break;
                }
                strlcpy(dnp->dn_typeinfo.Slnk.name, typeinfo->Slnk.name,
                    typeinfo->Slnk.namelen + 1);
@@ -656,12 +662,17 @@ dev_add_node(int entrytype, devnode_type_t * typeinfo, devnode_t * proto,
 
        #endif /* FDESC */
        default:
-               return EINVAL;
+               error = EINVAL;
        }
 
-       *dn_pp = dnp;
-       DEVFS_INCR_NODES();
-       return 0;
+       if (error) {
+               FREE(dnp, M_DEVFSNODE);
+       } else {
+               *dn_pp = dnp;
+               DEVFS_INCR_NODES();
+       }
+
+       return error;
 }
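
A self-contained sketch of the single-exit error handling this hunk adopts; the node type and helpers are illustrative stand-ins for the devfs ones:

    #include <errno.h>
    #include <stdlib.h>
    #include <string.h>

    struct node { char *name; };

    /* Failures inside the switch record an error and break out; one
     * shared epilogue frees the node on failure and publishes it on
     * success, so no path can leak or double-free. */
    static int
    add_node_sketch(int is_symlink, const char *name, struct node **out)
    {
        struct node *n = calloc(1, sizeof(*n));
        int error = 0;

        if (n == NULL) {
            return ENOMEM;
        }

        switch (is_symlink) {
        case 1:
            n->name = strdup(name);
            if (n->name == NULL) {
                error = ENOMEM;   /* was: free + early return */
                break;
            }
            break;
        case 0:
            break;
        default:
            error = EINVAL;
        }

        if (error) {
            free(n);
        } else {
            *out = n;
        }
        return error;
    }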
 
 
@@ -698,7 +709,10 @@ devfs_dn_free(devnode_t * dnp)
                }
 
                /* Can only free if there are no references; otherwise, wait for last vnode to be reclaimed */
-               if (dnp->dn_refcount == 0) {
+               os_ref_count_t rc = os_ref_get_count_raw(&dnp->dn_refcount);
+               if (rc == 1) {
+                       /* release final reference from dev_add_node */
+                       (void) os_ref_release_locked_raw(&dnp->dn_refcount, &devfs_refgrp);
                        devnode_free(dnp);
                } else {
                        dnp->dn_lflags |= DN_DELETE;
@@ -1362,20 +1376,22 @@ out:
 void
 devfs_ref_node(devnode_t *dnp)
 {
-       dnp->dn_refcount++;
+       os_ref_retain_locked_raw(&dnp->dn_refcount, &devfs_refgrp);
 }
 
 /*
  * Release a reference on a devnode.  If the devnode is marked for
- * free and the refcount is dropped to zero, do the free.
+ * free and the refcount is dropped to one, do the free.
  */
 void
 devfs_rele_node(devnode_t *dnp)
 {
-       dnp->dn_refcount--;
-       if (dnp->dn_refcount < 0) {
-               panic("devfs_rele_node: devnode with a negative refcount!\n");
-       } else if ((dnp->dn_refcount == 0) && (dnp->dn_lflags & DN_DELETE)) {
+       os_ref_count_t rc = os_ref_release_locked_raw(&dnp->dn_refcount, &devfs_refgrp);
+       if (rc < 1) {
+               panic("devfs_rele_node: devnode without a refcount!\n");
+       } else if ((rc == 1) && (dnp->dn_lflags & DN_DELETE)) {
+               /* release final reference from dev_add_node */
+               (void) os_ref_release_locked_raw(&dnp->dn_refcount, &devfs_refgrp);
                devnode_free(dnp);
        }
 }
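
A user-space sketch of the refcount convention these hunks adopt; a plain int stands in for os_ref_atomic_t, and the devfs mutex that serializes the *_locked_raw calls is elided:

    #include <assert.h>
    #include <stdio.h>

    /* The count is initialized to 1 (a creation reference meaning
     * "unreferenced"); callers retain and release around it, and the
     * node is freed only when the count returns to 1 with a delete
     * pending, at which point the creation reference is dropped. */
    struct devnode_sketch {
        int refcount;           /* 1 == unreferenced */
        int delete_pending;
    };

    static void
    node_init(struct devnode_sketch *n)
    {
        n->refcount = 1;        /* mirrors os_ref_init_raw() */
        n->delete_pending = 0;
    }

    static void
    node_ref(struct devnode_sketch *n)
    {
        n->refcount++;          /* mirrors os_ref_retain_locked_raw() */
    }

    static void
    node_rele(struct devnode_sketch *n)
    {
        int rc = --n->refcount; /* mirrors os_ref_release_locked_raw() */
        assert(rc >= 1 && "released a node that held no reference");
        if (rc == 1 && n->delete_pending) {
            n->refcount = 0;    /* drop the creation reference */
            printf("freeing node\n");
        }
    }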
index ab3986460ff8bb308240f6d394d9286b9c7c35b9..a1392ce2a0b338b4380df1305ae78f49906fc1d4 100644 (file)
@@ -504,7 +504,7 @@ devfs_kernel_mount(char * mntname)
        return 0;
 }
 
-struct vfsops devfs_vfsops = {
+const struct vfsops devfs_vfsops = {
        .vfs_mount   = devfs_mount,
        .vfs_start   = devfs_start,
        .vfs_unmount = devfs_unmount,
index 3377d3a35ededae8e791b2ccb7f68381abeb4ec5..b9de4b101f06b01a7f7cf33f0a5f6c2d161d495e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -1583,118 +1583,118 @@ devfs_update(struct vnode *vp, struct timeval *access, struct timeval *modify)
 
 /* The following ops are used by directories and symlinks */
 int(**devfs_vnodeop_p)(void *);
-static struct vnodeopv_entry_desc devfs_vnodeop_entries[] = {
-       { &vnop_default_desc, (VOPFUNC)vn_default_error },
-       { &vnop_lookup_desc, (VOPFUNC)devfs_lookup },           /* lookup */
-       { &vnop_create_desc, (VOPFUNC)err_create },             /* create */
-       { &vnop_whiteout_desc, (VOPFUNC)err_whiteout },         /* whiteout */
-       { &vnop_mknod_desc, (VOPFUNC)devfs_mknod },             /* mknod */
-       { &vnop_open_desc, (VOPFUNC)nop_open },                 /* open */
-       { &vnop_close_desc, (VOPFUNC)devfs_close },             /* close */
-       { &vnop_getattr_desc, (VOPFUNC)devfs_getattr },         /* getattr */
-       { &vnop_setattr_desc, (VOPFUNC)devfs_setattr },         /* setattr */
-       { &vnop_read_desc, (VOPFUNC)devfs_read },               /* read */
-       { &vnop_write_desc, (VOPFUNC)devfs_write },             /* write */
-       { &vnop_ioctl_desc, (VOPFUNC)err_ioctl },               /* ioctl */
-       { &vnop_select_desc, (VOPFUNC)err_select },             /* select */
-       { &vnop_revoke_desc, (VOPFUNC)err_revoke },             /* revoke */
-       { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
-       { &vnop_fsync_desc, (VOPFUNC)nop_fsync },               /* fsync */
-       { &vnop_remove_desc, (VOPFUNC)devfs_vnop_remove },      /* remove */
-       { &vnop_link_desc, (VOPFUNC)devfs_link },               /* link */
-       { &vnop_rename_desc, (VOPFUNC)devfs_rename },           /* rename */
-       { &vnop_mkdir_desc, (VOPFUNC)devfs_mkdir },             /* mkdir */
-       { &vnop_rmdir_desc, (VOPFUNC)devfs_rmdir },             /* rmdir */
-       { &vnop_symlink_desc, (VOPFUNC)devfs_symlink },         /* symlink */
-       { &vnop_readdir_desc, (VOPFUNC)devfs_readdir },         /* readdir */
-       { &vnop_readlink_desc, (VOPFUNC)devfs_readlink },       /* readlink */
-       { &vnop_inactive_desc, (VOPFUNC)devfs_inactive },       /* inactive */
-       { &vnop_reclaim_desc, (VOPFUNC)devfs_reclaim },         /* reclaim */
-       { &vnop_strategy_desc, (VOPFUNC)err_strategy },         /* strategy */
-       { &vnop_pathconf_desc, (VOPFUNC)devs_vnop_pathconf },   /* pathconf */
-       { &vnop_advlock_desc, (VOPFUNC)err_advlock },           /* advlock */
-       { &vnop_bwrite_desc, (VOPFUNC)err_bwrite },
-       { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* Pagein */
-       { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
-       { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
-       { &vnop_blktooff_desc, (VOPFUNC)err_blktooff },         /* blktooff */
-       { &vnop_offtoblk_desc, (VOPFUNC)err_offtoblk },         /* offtoblk */
-       { &vnop_blockmap_desc, (VOPFUNC)err_blockmap },         /* blockmap */
+static const struct vnodeopv_entry_desc devfs_vnodeop_entries[] = {
+       { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error },
+       { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)devfs_lookup },           /* lookup */
+       { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create },             /* create */
+       { .opve_op = &vnop_whiteout_desc, .opve_impl = (VOPFUNC)err_whiteout },         /* whiteout */
+       { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)devfs_mknod },             /* mknod */
+       { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)nop_open },                 /* open */
+       { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)devfs_close },             /* close */
+       { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)devfs_getattr },         /* getattr */
+       { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)devfs_setattr },         /* setattr */
+       { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)devfs_read },               /* read */
+       { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)devfs_write },             /* write */
+       { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)err_ioctl },               /* ioctl */
+       { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)err_select },             /* select */
+       { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)err_revoke },             /* revoke */
+       { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap },                 /* mmap */
+       { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)nop_fsync },               /* fsync */
+       { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)devfs_vnop_remove },      /* remove */
+       { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)devfs_link },               /* link */
+       { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)devfs_rename },           /* rename */
+       { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)devfs_mkdir },             /* mkdir */
+       { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)devfs_rmdir },             /* rmdir */
+       { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)devfs_symlink },         /* symlink */
+       { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)devfs_readdir },         /* readdir */
+       { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)devfs_readlink },       /* readlink */
+       { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)devfs_inactive },       /* inactive */
+       { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)devfs_reclaim },         /* reclaim */
+       { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)err_strategy },         /* strategy */
+       { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)devs_vnop_pathconf },   /* pathconf */
+       { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)err_advlock },           /* advlock */
+       { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)err_bwrite },
+       { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },             /* Pagein */
+       { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },           /* Pageout */
+       { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile },         /* Copyfile */
+       { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)err_blktooff },         /* blktooff */
+       { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)err_offtoblk },         /* offtoblk */
+       { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)err_blockmap },         /* blockmap */
 #if CONFIG_MACF
-       { &vnop_setlabel_desc, (VOPFUNC)devfs_setlabel },       /* setlabel */
+       { .opve_op = &vnop_setlabel_desc, .opve_impl = (VOPFUNC)devfs_setlabel },       /* setlabel */
 #endif
-       { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL }
+       { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL }
 };
-struct vnodeopv_desc devfs_vnodeop_opv_desc =
-{ &devfs_vnodeop_p, devfs_vnodeop_entries };
+const struct vnodeopv_desc devfs_vnodeop_opv_desc =
+{ .opv_desc_vector_p = &devfs_vnodeop_p, .opv_desc_ops = devfs_vnodeop_entries };
 
 /* The following ops are used by the device nodes */
 int(**devfs_spec_vnodeop_p)(void *);
-static struct vnodeopv_entry_desc devfs_spec_vnodeop_entries[] = {
-       { &vnop_default_desc, (VOPFUNC)vn_default_error },
-       { &vnop_lookup_desc, (VOPFUNC)spec_lookup },            /* lookup */
-       { &vnop_create_desc, (VOPFUNC)spec_create },            /* create */
-       { &vnop_mknod_desc, (VOPFUNC)spec_mknod },              /* mknod */
-       { &vnop_open_desc, (VOPFUNC)spec_open },                        /* open */
-       { &vnop_close_desc, (VOPFUNC)devfsspec_close },         /* close */
-       { &vnop_getattr_desc, (VOPFUNC)devfs_getattr },         /* getattr */
-       { &vnop_setattr_desc, (VOPFUNC)devfs_setattr },         /* setattr */
-       { &vnop_read_desc, (VOPFUNC)devfsspec_read },           /* read */
-       { &vnop_write_desc, (VOPFUNC)devfsspec_write },         /* write */
-       { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },              /* ioctl */
-       { &vnop_select_desc, (VOPFUNC)spec_select },            /* select */
-       { &vnop_revoke_desc, (VOPFUNC)spec_revoke },            /* revoke */
-       { &vnop_mmap_desc, (VOPFUNC)spec_mmap },                        /* mmap */
-       { &vnop_fsync_desc, (VOPFUNC)spec_fsync },              /* fsync */
-       { &vnop_remove_desc, (VOPFUNC)devfs_vnop_remove },      /* remove */
-       { &vnop_link_desc, (VOPFUNC)devfs_link },               /* link */
-       { &vnop_rename_desc, (VOPFUNC)spec_rename },            /* rename */
-       { &vnop_mkdir_desc, (VOPFUNC)spec_mkdir },              /* mkdir */
-       { &vnop_rmdir_desc, (VOPFUNC)spec_rmdir },              /* rmdir */
-       { &vnop_symlink_desc, (VOPFUNC)spec_symlink },          /* symlink */
-       { &vnop_readdir_desc, (VOPFUNC)spec_readdir },          /* readdir */
-       { &vnop_readlink_desc, (VOPFUNC)spec_readlink },                /* readlink */
-       { &vnop_inactive_desc, (VOPFUNC)devfs_inactive },       /* inactive */
-       { &vnop_reclaim_desc, (VOPFUNC)devfs_reclaim },         /* reclaim */
-       { &vnop_strategy_desc, (VOPFUNC)spec_strategy },                /* strategy */
-       { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },                /* pathconf */
-       { &vnop_advlock_desc, (VOPFUNC)spec_advlock },          /* advlock */
-       { &vnop_bwrite_desc, (VOPFUNC)vn_bwrite },
-       { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* Pagein */
-       { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
-       { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
-       { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },        /* blktooff */
-       { &vnop_blktooff_desc, (VOPFUNC)spec_offtoblk  },       /* blkofftoblk */
-       { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },        /* blockmap */
+static const struct vnodeopv_entry_desc devfs_spec_vnodeop_entries[] = {
+       { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error },
+       { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)spec_lookup },            /* lookup */
+       { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)spec_create },            /* create */
+       { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)spec_mknod },              /* mknod */
+       { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)spec_open },                        /* open */
+       { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)devfsspec_close },         /* close */
+       { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)devfs_getattr },         /* getattr */
+       { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)devfs_setattr },         /* setattr */
+       { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)devfsspec_read },           /* read */
+       { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)devfsspec_write },         /* write */
+       { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)spec_ioctl },              /* ioctl */
+       { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)spec_select },            /* select */
+       { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)spec_revoke },            /* revoke */
+       { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)spec_mmap },                        /* mmap */
+       { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)spec_fsync },              /* fsync */
+       { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)devfs_vnop_remove },      /* remove */
+       { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)devfs_link },               /* link */
+       { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)spec_rename },            /* rename */
+       { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)spec_mkdir },              /* mkdir */
+       { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)spec_rmdir },              /* rmdir */
+       { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)spec_symlink },          /* symlink */
+       { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)spec_readdir },          /* readdir */
+       { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)spec_readlink },                /* readlink */
+       { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)devfs_inactive },       /* inactive */
+       { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)devfs_reclaim },         /* reclaim */
+       { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)spec_strategy },                /* strategy */
+       { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)spec_pathconf },                /* pathconf */
+       { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)spec_advlock },          /* advlock */
+       { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)vn_bwrite },
+       { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },             /* Pagein */
+       { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },           /* Pageout */
+       { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile },         /* Copyfile */
+       { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)spec_blktooff },        /* blktooff */
+       { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)spec_offtoblk },        /* offtoblk */
+       { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)spec_blockmap },        /* blockmap */
 #if CONFIG_MACF
-       { &vnop_setlabel_desc, (VOPFUNC)devfs_setlabel },       /* setlabel */
+       { .opve_op = &vnop_setlabel_desc, .opve_impl = (VOPFUNC)devfs_setlabel },       /* setlabel */
 #endif
-       { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL }
+       { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL }
 };
-struct vnodeopv_desc devfs_spec_vnodeop_opv_desc =
-{ &devfs_spec_vnodeop_p, devfs_spec_vnodeop_entries };
+const struct vnodeopv_desc devfs_spec_vnodeop_opv_desc =
+{ .opv_desc_vector_p = &devfs_spec_vnodeop_p, .opv_desc_ops = devfs_spec_vnodeop_entries };
 
 
 #if FDESC
 int(**devfs_devfd_vnodeop_p)(void*);
-static struct vnodeopv_entry_desc devfs_devfd_vnodeop_entries[] = {
-       { &vnop_default_desc, (VOPFUNC)vn_default_error },
-       { &vnop_lookup_desc, (VOPFUNC)devfs_devfd_lookup},      /* lookup */
-       { &vnop_open_desc, (VOPFUNC)nop_open },                 /* open */
-       { &vnop_close_desc, (VOPFUNC)devfs_close },             /* close */
-       { &vnop_getattr_desc, (VOPFUNC)devfs_getattr },         /* getattr */
-       { &vnop_setattr_desc, (VOPFUNC)devfs_setattr },         /* setattr */
-       { &vnop_revoke_desc, (VOPFUNC)err_revoke },             /* revoke */
-       { &vnop_fsync_desc, (VOPFUNC)nop_fsync },               /* fsync */
-       { &vnop_readdir_desc, (VOPFUNC)devfs_devfd_readdir},            /* readdir */
-       { &vnop_inactive_desc, (VOPFUNC)devfs_inactive },       /* inactive */
-       { &vnop_reclaim_desc, (VOPFUNC)devfs_reclaim },         /* reclaim */
-       { &vnop_pathconf_desc, (VOPFUNC)devs_vnop_pathconf },   /* pathconf */
+static const struct vnodeopv_entry_desc devfs_devfd_vnodeop_entries[] = {
+       { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error },
+       { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)devfs_devfd_lookup},      /* lookup */
+       { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)nop_open },                 /* open */
+       { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)devfs_close },             /* close */
+       { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)devfs_getattr },         /* getattr */
+       { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)devfs_setattr },         /* setattr */
+       { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)err_revoke },             /* revoke */
+       { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)nop_fsync },               /* fsync */
+       { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)devfs_devfd_readdir},            /* readdir */
+       { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)devfs_inactive },       /* inactive */
+       { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)devfs_reclaim },         /* reclaim */
+       { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)devs_vnop_pathconf },   /* pathconf */
 #if CONFIG_MACF
-       { &vnop_setlabel_desc, (VOPFUNC)devfs_setlabel },       /* setlabel */
+       { .opve_op = &vnop_setlabel_desc, .opve_impl = (VOPFUNC)devfs_setlabel },       /* setlabel */
 #endif
-       { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL }
+       { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL }
 };
-struct vnodeopv_desc devfs_devfd_vnodeop_opv_desc =
-{ &devfs_devfd_vnodeop_p, devfs_devfd_vnodeop_entries};
+const struct vnodeopv_desc devfs_devfd_vnodeop_opv_desc =
+{ .opv_desc_vector_p = &devfs_devfd_vnodeop_p, .opv_desc_ops = devfs_devfd_vnodeop_entries};
 #endif /* FDESC */
index 502e4daa521288159ac99690523698579331ce66..e2dee3842dfea60f22d44f115db3d1feeace9998 100644 (file)
@@ -89,7 +89,7 @@ typedef enum {
 
 extern int(**devfs_vnodeop_p)(void *);  /* our own vector array for dirs */
 extern int(**devfs_spec_vnodeop_p)(void *);  /* our own vector array for devs */
-extern struct vfsops devfs_vfsops;
+extern const struct vfsops devfs_vfsops;
 
 typedef struct devnode          devnode_t;
 typedef struct devdirent        devdirent_t;
@@ -132,7 +132,7 @@ struct devnode {
         * make sure that a deferred delete eventually happens if it is
         * blocked behind that reference.
         */
-       int                 dn_refcount;
+       os_ref_atomic_t     dn_refcount;
        u_short             dn_mode;
        uid_t               dn_uid;
        gid_t               dn_gid;
index 956824de4adfdcd667f1e42ce079c7f43c768a25..df13d0d96537ebdd0404e846bd5e990c6856df0e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define VOPFUNC int (*)(void *)
 
 int(**fifo_vnodeop_p)(void *);
-struct vnodeopv_entry_desc fifo_vnodeop_entries[] = {
-       { &vnop_default_desc, (VOPFUNC)vn_default_error },
-       { &vnop_lookup_desc, (VOPFUNC)fifo_lookup },            /* lookup */
-       { &vnop_create_desc, (VOPFUNC)err_create },             /* create */
-       { &vnop_mknod_desc, (VOPFUNC)err_mknod },               /* mknod */
-       { &vnop_open_desc, (VOPFUNC)fifo_open },                        /* open */
-       { &vnop_close_desc, (VOPFUNC)fifo_close },              /* close */
-       { &vnop_access_desc, (VOPFUNC)fifo_access },            /* access */
-       { &vnop_getattr_desc, (VOPFUNC)fifo_getattr },          /* getattr */
-       { &vnop_setattr_desc, (VOPFUNC)fifo_setattr },          /* setattr */
-       { &vnop_read_desc, (VOPFUNC)fifo_read },                        /* read */
-       { &vnop_write_desc, (VOPFUNC)fifo_write },              /* write */
-       { &vnop_ioctl_desc, (VOPFUNC)fifo_ioctl },              /* ioctl */
-       { &vnop_select_desc, (VOPFUNC)fifo_select },            /* select */
-       { &vnop_revoke_desc, (VOPFUNC)fifo_revoke },            /* revoke */
-       { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
-       { &vnop_fsync_desc, (VOPFUNC)fifo_fsync },              /* fsync */
-       { &vnop_remove_desc, (VOPFUNC)err_remove },             /* remove */
-       { &vnop_link_desc, (VOPFUNC)err_link },                 /* link */
-       { &vnop_rename_desc, (VOPFUNC)err_rename },             /* rename */
-       { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },               /* mkdir */
-       { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },               /* rmdir */
-       { &vnop_symlink_desc, (VOPFUNC)err_symlink },           /* symlink */
-       { &vnop_readdir_desc, (VOPFUNC)err_readdir },           /* readdir */
-       { &vnop_readlink_desc, (VOPFUNC)err_readlink },         /* readlink */
-       { &vnop_inactive_desc, (VOPFUNC)fifo_inactive },                /* inactive */
-       { &vnop_reclaim_desc, (VOPFUNC)fifo_reclaim },          /* reclaim */
-       { &vnop_strategy_desc, (VOPFUNC)err_strategy },         /* strategy */
-       { &vnop_pathconf_desc, (VOPFUNC)fifo_pathconf },                /* pathconf */
-       { &vnop_advlock_desc, (VOPFUNC)fifo_advlock },          /* advlock */
-       { &vnop_bwrite_desc, (VOPFUNC)fifo_bwrite },            /* bwrite */
-       { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* Pagein */
-       { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
-       { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
-       { &vnop_blktooff_desc, (VOPFUNC)err_blktooff },         /* blktooff */
-       { &vnop_offtoblk_desc, (VOPFUNC)err_offtoblk },         /* offtoblk */
-       { &vnop_blockmap_desc, (VOPFUNC)err_blockmap },                 /* blockmap */
-       { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL }
+const struct vnodeopv_entry_desc fifo_vnodeop_entries[] = {
+       { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error },
+       { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)fifo_lookup },            /* lookup */
+       { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create },             /* create */
+       { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)err_mknod },               /* mknod */
+       { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)fifo_open },                        /* open */
+       { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)fifo_close },              /* close */
+       { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)fifo_access },            /* access */
+       { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)fifo_getattr },          /* getattr */
+       { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)fifo_setattr },          /* setattr */
+       { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)fifo_read },                        /* read */
+       { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)fifo_write },              /* write */
+       { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)fifo_ioctl },              /* ioctl */
+       { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)fifo_select },            /* select */
+       { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)fifo_revoke },            /* revoke */
+       { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap },                 /* mmap */
+       { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)fifo_fsync },              /* fsync */
+       { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)err_remove },             /* remove */
+       { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)err_link },                 /* link */
+       { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)err_rename },             /* rename */
+       { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)err_mkdir },               /* mkdir */
+       { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)err_rmdir },               /* rmdir */
+       { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)err_symlink },           /* symlink */
+       { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)err_readdir },           /* readdir */
+       { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink },         /* readlink */
+       { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)fifo_inactive },                /* inactive */
+       { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)fifo_reclaim },          /* reclaim */
+       { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)err_strategy },         /* strategy */
+       { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)fifo_pathconf },                /* pathconf */
+       { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)fifo_advlock },          /* advlock */
+       { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)fifo_bwrite },            /* bwrite */
+       { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },             /* Pagein */
+       { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },           /* Pageout */
+       { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile },         /* Copyfile */
+       { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)err_blktooff },         /* blktooff */
+       { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)err_offtoblk },         /* offtoblk */
+       { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)err_blockmap },                 /* blockmap */
+       { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL }
 };
-struct vnodeopv_desc fifo_vnodeop_opv_desc =
-{ &fifo_vnodeop_p, fifo_vnodeop_entries };
+const struct vnodeopv_desc fifo_vnodeop_opv_desc =
+{ .opv_desc_vector_p = &fifo_vnodeop_p, .opv_desc_ops = fifo_vnodeop_entries };
 
 /*
  * Trivial lookup routine that always fails.
index 406eddfc2dd15c462e6991ed989cb6a3998068c3..5dbbbc6489bf36d4ec03de0288e524735a587b0f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -407,46 +407,46 @@ mockfs_blockmap(struct vnop_blockmap_args * ap)
 }
 
 int(**mockfs_vnodeop_p)(void *);
-struct vnodeopv_entry_desc mockfs_vnodeop_entries[] = {
-       { &vnop_default_desc, (VOPFUNC) vn_default_error }, /* default */
-       { &vnop_lookup_desc, (VOPFUNC) mockfs_lookup }, /* lookup */
-       { &vnop_create_desc, (VOPFUNC) err_create },/* create */
-       { &vnop_open_desc, (VOPFUNC) err_open }, /* open */
-       { &vnop_mknod_desc, (VOPFUNC) err_mknod }, /* mknod */
-       { &vnop_close_desc, (VOPFUNC) err_close }, /* close */
-       { &vnop_access_desc, (VOPFUNC) err_access }, /* access */
-       { &vnop_getattr_desc, (VOPFUNC) mockfs_getattr }, /* getattr */
-       { &vnop_setattr_desc, (VOPFUNC) err_setattr }, /* setattr */
-       { &vnop_read_desc, (VOPFUNC) mockfs_read }, /* read */
-       { &vnop_write_desc, (VOPFUNC) err_write }, /* write */
-       { &vnop_ioctl_desc, (VOPFUNC) err_ioctl }, /* ioctl */
-       { &vnop_select_desc, (VOPFUNC) err_select }, /* select */
-       { &vnop_mmap_desc, (VOPFUNC) err_mmap }, /* mmap */
-       { &vnop_fsync_desc, (VOPFUNC) nop_fsync }, /* fsync */
-       { &vnop_remove_desc, (VOPFUNC) err_remove }, /* remove */
-       { &vnop_link_desc, (VOPFUNC) err_link }, /* link */
-       { &vnop_rename_desc, (VOPFUNC) err_rename }, /* rename */
-       { &vnop_mkdir_desc, (VOPFUNC) err_mkdir }, /* mkdir */
-       { &vnop_rmdir_desc, (VOPFUNC) err_rmdir }, /* rmdir */
-       { &vnop_symlink_desc, (VOPFUNC) err_symlink }, /* symlink */
-       { &vnop_readdir_desc, (VOPFUNC) err_readdir }, /* readdir */
-       { &vnop_readlink_desc, (VOPFUNC) err_readlink }, /* readlink */
-       { &vnop_inactive_desc, (VOPFUNC) err_inactive }, /* inactive */
-       { &vnop_reclaim_desc, (VOPFUNC) mockfs_reclaim }, /* reclaim */
-       { &vnop_strategy_desc, (VOPFUNC) mockfs_strategy }, /* strategy */
-       { &vnop_pathconf_desc, (VOPFUNC) err_pathconf }, /* pathconf */
-       { &vnop_advlock_desc, (VOPFUNC) err_advlock }, /* advlock */
-       { &vnop_bwrite_desc, (VOPFUNC) err_bwrite }, /* bwrite */
-       { &vnop_pagein_desc, (VOPFUNC) mockfs_pagein }, /* pagein */
-       { &vnop_pageout_desc, (VOPFUNC) err_pageout }, /* pageout */
-       { &vnop_copyfile_desc, (VOPFUNC) err_copyfile }, /* copyfile */
-       { &vnop_blktooff_desc, (VOPFUNC) err_blktooff }, /* blktooff */
-       { &vnop_offtoblk_desc, (VOPFUNC) err_offtoblk }, /* offtoblk */
-       { &vnop_blockmap_desc, (VOPFUNC) mockfs_blockmap }, /* blockmap */
-       { (struct vnodeop_desc *) NULL, (VOPFUNC) NULL }
+const struct vnodeopv_entry_desc mockfs_vnodeop_entries[] = {
+       { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC) vn_default_error }, /* default */
+       { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC) mockfs_lookup }, /* lookup */
+       { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC) err_create },/* create */
+       { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC) err_open }, /* open */
+       { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC) err_mknod }, /* mknod */
+       { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC) err_close }, /* close */
+       { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC) err_access }, /* access */
+       { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC) mockfs_getattr }, /* getattr */
+       { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC) err_setattr }, /* setattr */
+       { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC) mockfs_read }, /* read */
+       { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC) err_write }, /* write */
+       { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC) err_ioctl }, /* ioctl */
+       { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC) err_select }, /* select */
+       { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC) err_mmap }, /* mmap */
+       { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC) nop_fsync }, /* fsync */
+       { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC) err_remove }, /* remove */
+       { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC) err_link }, /* link */
+       { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC) err_rename }, /* rename */
+       { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC) err_mkdir }, /* mkdir */
+       { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC) err_rmdir }, /* rmdir */
+       { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC) err_symlink }, /* symlink */
+       { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC) err_readdir }, /* readdir */
+       { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC) err_readlink }, /* readlink */
+       { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC) err_inactive }, /* inactive */
+       { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC) mockfs_reclaim }, /* reclaim */
+       { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC) mockfs_strategy }, /* strategy */
+       { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC) err_pathconf }, /* pathconf */
+       { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC) err_advlock }, /* advlock */
+       { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC) err_bwrite }, /* bwrite */
+       { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC) mockfs_pagein }, /* pagein */
+       { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC) err_pageout }, /* pageout */
+       { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC) err_copyfile }, /* copyfile */
+       { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC) err_blktooff }, /* blktooff */
+       { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC) err_offtoblk }, /* offtoblk */
+       { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC) mockfs_blockmap }, /* blockmap */
+       { .opve_op = (struct vnodeop_desc *) NULL, .opve_impl = (VOPFUNC) NULL }
 };
 
-struct vnodeopv_desc mockfs_vnodeop_opv_desc = {
-       &mockfs_vnodeop_p,
-       mockfs_vnodeop_entries
+const struct vnodeopv_desc mockfs_vnodeop_opv_desc = {
+       .opv_desc_vector_p = &mockfs_vnodeop_p,
+       .opv_desc_ops = mockfs_vnodeop_entries
 };
index 0e8330da7bacc2271be18312512477824a8607af..8305be1c60a0a16ad1c06e976887134c7a1d618f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -408,7 +408,7 @@ nullfs_vfs_getattr(struct mount * mp, struct vfs_attr * vfap, vfs_context_t ctx)
        vol_capabilities_attr_t capabilities;
        struct vfsstatfs * sp = vfs_statfs(mp);
 
-       struct timespec tzero = {0, 0};
+       struct timespec tzero = {.tv_sec = 0, .tv_nsec = 0};
 
        NULLFSDEBUG("%s\n", __FUNCTION__);
 
@@ -549,9 +549,9 @@ nullfs_vfs_start(__unused struct mount * mp, __unused int flags, __unused vfs_co
        return 0;
 }
 
-extern struct vnodeopv_desc nullfs_vnodeop_opv_desc;
+extern const struct vnodeopv_desc nullfs_vnodeop_opv_desc;
 
-struct vnodeopv_desc * nullfs_vnodeopv_descs[] = {
+const struct vnodeopv_desc * nullfs_vnodeopv_descs[] = {
        &nullfs_vnodeop_opv_desc,
 };
 
index ebfe2e7e160d20845e576ba2dc923076c29571fb..6afadbfab808bd2253271253f6f39636808e0b98 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -832,7 +832,7 @@ nullfs_getxattr(struct vnop_getxattr_args * args)
        NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp);
 
        if (nullfs_checkspecialvp(args->a_vp)) {
-               return 0; /* nothing extra needed */
+               return ENOATTR; /* no xattrs on the special vnodes */
        }
 
        vp    = args->a_vp;
@@ -855,7 +855,7 @@ nullfs_listxattr(struct vnop_listxattr_args * args)
        NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp);
 
        if (nullfs_checkspecialvp(args->a_vp)) {
-               return 0; /* nothing extra needed */
+               return 0; /* no xattrs on the special vnodes */
        }
 
        vp    = args->a_vp;
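
A minimal sketch of the distinction drawn by these two hunks, with illustrative signatures rather than the kernel's vnop argument structures: looking up a named attribute on the special vnodes must fail with ENOATTR, while enumerating their (empty) attribute list still succeeds:

    #include <errno.h>
    #include <stddef.h>

    #ifndef ENOATTR
    #define ENOATTR 93          /* Darwin's value; defined only for the sketch */
    #endif

    static int
    getxattr_sketch(const char *name, void *buf, size_t buflen)
    {
        (void)name; (void)buf; (void)buflen;
        return ENOATTR;         /* no xattrs on the special vnodes */
    }

    static int
    listxattr_sketch(char *namebuf, size_t *sizep)
    {
        (void)namebuf;
        *sizep = 0;             /* empty list, but the call succeeds */
        return 0;
    }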
@@ -1017,19 +1017,19 @@ end:
  * Global vfs data structures
  */
 
-static struct vnodeopv_entry_desc nullfs_vnodeop_entries[] = {
-       {&vnop_default_desc, (vop_t)nullfs_default}, {&vnop_getattr_desc, (vop_t)nullfs_getattr},
-       {&vnop_open_desc, (vop_t)nullfs_open}, {&vnop_close_desc, (vop_t)nullfs_close},
-       {&vnop_inactive_desc, (vop_t)null_inactive}, {&vnop_reclaim_desc, (vop_t)null_reclaim},
-       {&vnop_lookup_desc, (vop_t)null_lookup}, {&vnop_readdir_desc, (vop_t)nullfs_readdir},
-       {&vnop_readlink_desc, (vop_t)nullfs_readlink}, {&vnop_pathconf_desc, (vop_t)nullfs_pathconf},
-       {&vnop_fsync_desc, (vop_t)nullfs_fsync}, {&vnop_mmap_desc, (vop_t)nullfs_mmap},
-       {&vnop_mnomap_desc, (vop_t)nullfs_mnomap}, {&vnop_getxattr_desc, (vop_t)nullfs_getxattr},
-       {&vnop_pagein_desc, (vop_t)nullfs_pagein}, {&vnop_read_desc, (vop_t)nullfs_read},
-       {&vnop_listxattr_desc, (vop_t)nullfs_listxattr}, {NULL, NULL},
+static const struct vnodeopv_entry_desc nullfs_vnodeop_entries[] = {
+       {.opve_op = &vnop_default_desc, .opve_impl = (vop_t)nullfs_default}, {.opve_op = &vnop_getattr_desc, .opve_impl = (vop_t)nullfs_getattr},
+       {.opve_op = &vnop_open_desc, .opve_impl = (vop_t)nullfs_open}, {.opve_op = &vnop_close_desc, .opve_impl = (vop_t)nullfs_close},
+       {.opve_op = &vnop_inactive_desc, .opve_impl = (vop_t)null_inactive}, {.opve_op = &vnop_reclaim_desc, .opve_impl = (vop_t)null_reclaim},
+       {.opve_op = &vnop_lookup_desc, .opve_impl = (vop_t)null_lookup}, {.opve_op = &vnop_readdir_desc, .opve_impl = (vop_t)nullfs_readdir},
+       {.opve_op = &vnop_readlink_desc, .opve_impl = (vop_t)nullfs_readlink}, {.opve_op = &vnop_pathconf_desc, .opve_impl = (vop_t)nullfs_pathconf},
+       {.opve_op = &vnop_fsync_desc, .opve_impl = (vop_t)nullfs_fsync}, {.opve_op = &vnop_mmap_desc, .opve_impl = (vop_t)nullfs_mmap},
+       {.opve_op = &vnop_mnomap_desc, .opve_impl = (vop_t)nullfs_mnomap}, {.opve_op = &vnop_getxattr_desc, .opve_impl = (vop_t)nullfs_getxattr},
+       {.opve_op = &vnop_pagein_desc, .opve_impl = (vop_t)nullfs_pagein}, {.opve_op = &vnop_read_desc, .opve_impl = (vop_t)nullfs_read},
+       {.opve_op = &vnop_listxattr_desc, .opve_impl = (vop_t)nullfs_listxattr}, {.opve_op = NULL, .opve_impl = NULL},
 };
 
-struct vnodeopv_desc nullfs_vnodeop_opv_desc = {&nullfs_vnodeop_p, nullfs_vnodeop_entries};
+const struct vnodeopv_desc nullfs_vnodeop_opv_desc = {.opv_desc_vector_p = &nullfs_vnodeop_p, .opv_desc_ops = nullfs_vnodeop_entries};
 
 //NULLFS Specific helper function
 
index 766194f6d37712c3e7800c9f6d3711b43446d884..38b55fd53cb71c401306c5bbccafc675ce0356b7 100644 (file)
@@ -148,7 +148,7 @@ int nullfs_getbackingvnode(vnode_t in_vp, vnode_t* out_vpp);
 #define NULLVPTOLOWERVID(vp) (VTONULL(vp)->null_lowervid)
 #define NULLVPTOMYVID(vp) (VTONULL(vp)->null_myvid)
 
-extern struct vnodeopv_desc nullfs_vnodeop_opv_desc;
+extern const struct vnodeopv_desc nullfs_vnodeop_opv_desc;
 
 extern vop_t * nullfs_vnodeop_p;
 
index d066326e11785528e413a9338706d48b7f382100..664ae0e16607a62eceed8a66ff9fb912f4d9d0c7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -487,7 +487,7 @@ out:
        return error;
 }
 
-struct vfsops routefs_vfsops = {
+const struct vfsops routefs_vfsops = {
        .vfs_mount = routefs_mount,
        .vfs_start = routefs_start,
        .vfs_unmount = routefs_unmount,
@@ -518,47 +518,48 @@ routefserr_setlabel(__unused struct vnop_setlabel_args * args)
 
 /* The following ops are used by directories and symlinks */
 int(**routefs_vnodeop_p)(void *);
-static struct vnodeopv_entry_desc routefs_vnodeop_entries[] = {
-       { &vnop_default_desc, (VOPFUNC)vn_default_error },
-       { &vnop_lookup_desc, (VOPFUNC)routefserr_lookup },      /* lookup */
-       { &vnop_create_desc, (VOPFUNC)err_create },     /* create */
-       { &vnop_whiteout_desc, (VOPFUNC)err_whiteout },         /* whiteout */
-       { &vnop_mknod_desc, (VOPFUNC)err_mknod },       /* mknod */
-       { &vnop_open_desc, (VOPFUNC)err_open },                 /* open */
-       { &vnop_close_desc, (VOPFUNC)err_close },       /* close */
-       { &vnop_getattr_desc, (VOPFUNC)err_getattr },           /* getattr */
-       { &vnop_setattr_desc, (VOPFUNC)err_setattr },           /* setattr */
-       { &vnop_read_desc, (VOPFUNC)err_read },         /* read */
-       { &vnop_write_desc, (VOPFUNC)err_write },       /* write */
-       { &vnop_ioctl_desc, (VOPFUNC)err_ioctl },       /* ioctl */
-       { &vnop_select_desc, (VOPFUNC)err_select },     /* select */
-       { &vnop_revoke_desc, (VOPFUNC)err_revoke },     /* revoke */
-       { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
-       { &vnop_fsync_desc, (VOPFUNC)nop_fsync },       /* fsync */
-       { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */
-       { &vnop_link_desc, (VOPFUNC)err_link },         /* link */
-       { &vnop_rename_desc, (VOPFUNC)err_rename },     /* rename */
-       { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },       /* mkdir */
-       { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },       /* rmdir */
-       { &vnop_symlink_desc, (VOPFUNC)err_symlink },           /* symlink */
-       { &vnop_readdir_desc, (VOPFUNC)err_readdir },           /* readdir */
-       { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */
-       { &vnop_inactive_desc, (VOPFUNC)err_inactive }, /* inactive */
-       { &vnop_reclaim_desc, (VOPFUNC)err_reclaim },           /* reclaim */
-       { &vnop_strategy_desc, (VOPFUNC)err_strategy },         /* strategy */
-       { &vnop_pathconf_desc, (VOPFUNC)err_pathconf }, /* pathconf */
-       { &vnop_advlock_desc, (VOPFUNC)err_advlock },           /* advlock */
-       { &vnop_bwrite_desc, (VOPFUNC)err_bwrite },
-       { &vnop_pagein_desc, (VOPFUNC)err_pagein },     /* Pagein */
-       { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
-       { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
-       { &vnop_blktooff_desc, (VOPFUNC)err_blktooff },         /* blktooff */
-       { &vnop_offtoblk_desc, (VOPFUNC)err_offtoblk },         /* offtoblk */
-       { &vnop_blockmap_desc, (VOPFUNC)err_blockmap },         /* blockmap */
+static const struct vnodeopv_entry_desc routefs_vnodeop_entries[] = {
+       { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error },
+       { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)routefserr_lookup },      /* lookup */
+       { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create },     /* create */
+       { .opve_op = &vnop_whiteout_desc, .opve_impl = (VOPFUNC)err_whiteout },         /* whiteout */
+       { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)err_mknod },       /* mknod */
+       { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)err_open },                 /* open */
+       { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)err_close },       /* close */
+       { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)err_getattr },           /* getattr */
+       { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)err_setattr },           /* setattr */
+       { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)err_read },         /* read */
+       { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)err_write },       /* write */
+       { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)err_ioctl },       /* ioctl */
+       { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)err_select },     /* select */
+       { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)err_revoke },     /* revoke */
+       { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap },                 /* mmap */
+       { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)nop_fsync },       /* fsync */
+       { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)err_remove }, /* remove */
+       { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)err_link },         /* link */
+       { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)err_rename },     /* rename */
+       { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)err_mkdir },       /* mkdir */
+       { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)err_rmdir },       /* rmdir */
+       { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)err_symlink },           /* symlink */
+       { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)err_readdir },           /* readdir */
+       { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink }, /* readlink */
+       { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)err_inactive }, /* inactive */
+       { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)err_reclaim },           /* reclaim */
+       { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)err_strategy },         /* strategy */
+       { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)err_pathconf }, /* pathconf */
+       { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)err_advlock },           /* advlock */
+       { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)err_bwrite },
+       { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },     /* Pagein */
+       { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },           /* Pageout */
+       { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile },         /* Copyfile */
+       { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)err_blktooff },         /* blktooff */
+       { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)err_offtoblk },         /* offtoblk */
+       { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)err_blockmap },         /* blockmap */
 #if CONFIG_MACF
-       { &vnop_setlabel_desc, (VOPFUNC)routefserr_setlabel },   /* setlabel */
+       { .opve_op = &vnop_setlabel_desc, .opve_impl = (VOPFUNC)routefserr_setlabel },   /* setlabel */
 #endif
-       { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL }
+       { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL }
 };
-struct vnodeopv_desc routefs_vnodeop_opv_desc =
-{ &routefs_vnodeop_p, routefs_vnodeop_entries };
+
+const struct vnodeopv_desc routefs_vnodeop_opv_desc =
+{ .opv_desc_vector_p = &routefs_vnodeop_p, .opv_desc_ops = routefs_vnodeop_entries };
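
The nullfs and routefs conversions above (and the spec_vnodeop table below) share one pattern: positional initializer lists become C99 designated initializers, and the operation tables gain const so they can be placed in read-only memory. A minimal sketch of the shape, for a hypothetical examplefs (the examplefs names are illustrative, not from the tree):

    int (**examplefs_vnodeop_p)(void *);

    static const struct vnodeopv_entry_desc examplefs_vnodeop_entries[] = {
            { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error },
            { .opve_op = NULL, .opve_impl = NULL },         /* terminator */
    };

    const struct vnodeopv_desc examplefs_vnodeop_opv_desc = {
            .opv_desc_vector_p = &examplefs_vnodeop_p,
            .opv_desc_ops      = examplefs_vnodeop_entries,
    };

Designated initializers also keep these tables correct if the field order of struct vnodeopv_entry_desc ever changes.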
index 7b6f18d9ce54dfbb87e9102b252cdf2d987e7514..042363b4158605a58ad4c45b162395c8e60ccccc 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 /* XXX following three prototypes should be in a header file somewhere */
 extern dev_t    chrtoblk(dev_t dev);
 extern boolean_t        iskmemdev(dev_t dev);
-extern int      bpfkqfilter(dev_t dev, struct knote *kn);
+extern int bpfkqfilter(dev_t dev, struct knote *kn);
 extern int ptsd_kqfilter(dev_t, struct knote *);
 extern int ptmx_kqfilter(dev_t, struct knote *);
 
@@ -120,47 +120,47 @@ char    devcls[] = "devcls";
 #define VOPFUNC int (*)(void *)
 
 int(**spec_vnodeop_p)(void *);
-struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
-       { &vnop_default_desc, (VOPFUNC)vn_default_error },
-       { &vnop_lookup_desc, (VOPFUNC)spec_lookup },            /* lookup */
-       { &vnop_create_desc, (VOPFUNC)err_create },             /* create */
-       { &vnop_mknod_desc, (VOPFUNC)err_mknod },               /* mknod */
-       { &vnop_open_desc, (VOPFUNC)spec_open },                        /* open */
-       { &vnop_close_desc, (VOPFUNC)spec_close },              /* close */
-       { &vnop_access_desc, (VOPFUNC)spec_access },            /* access */
-       { &vnop_getattr_desc, (VOPFUNC)spec_getattr },          /* getattr */
-       { &vnop_setattr_desc, (VOPFUNC)spec_setattr },          /* setattr */
-       { &vnop_read_desc, (VOPFUNC)spec_read },                        /* read */
-       { &vnop_write_desc, (VOPFUNC)spec_write },              /* write */
-       { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },              /* ioctl */
-       { &vnop_select_desc, (VOPFUNC)spec_select },            /* select */
-       { &vnop_revoke_desc, (VOPFUNC)nop_revoke },             /* revoke */
-       { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
-       { &vnop_fsync_desc, (VOPFUNC)spec_fsync },              /* fsync */
-       { &vnop_remove_desc, (VOPFUNC)err_remove },             /* remove */
-       { &vnop_link_desc, (VOPFUNC)err_link },                 /* link */
-       { &vnop_rename_desc, (VOPFUNC)err_rename },             /* rename */
-       { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },               /* mkdir */
-       { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },               /* rmdir */
-       { &vnop_symlink_desc, (VOPFUNC)err_symlink },           /* symlink */
-       { &vnop_readdir_desc, (VOPFUNC)err_readdir },           /* readdir */
-       { &vnop_readlink_desc, (VOPFUNC)err_readlink },         /* readlink */
-       { &vnop_inactive_desc, (VOPFUNC)nop_inactive },         /* inactive */
-       { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },           /* reclaim */
-       { &vnop_strategy_desc, (VOPFUNC)spec_strategy },                /* strategy */
-       { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },                /* pathconf */
-       { &vnop_advlock_desc, (VOPFUNC)err_advlock },           /* advlock */
-       { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },            /* bwrite */
-       { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* Pagein */
-       { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
-       { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
-       { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },                /* blktooff */
-       { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },                /* offtoblk */
-       { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },                /* blockmap */
-       { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL }
+const struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
+       { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error },
+       { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)spec_lookup },            /* lookup */
+       { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create },             /* create */
+       { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)err_mknod },               /* mknod */
+       { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)spec_open },                        /* open */
+       { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)spec_close },              /* close */
+       { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)spec_access },            /* access */
+       { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)spec_getattr },          /* getattr */
+       { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)spec_setattr },          /* setattr */
+       { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)spec_read },                        /* read */
+       { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)spec_write },              /* write */
+       { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)spec_ioctl },              /* ioctl */
+       { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)spec_select },            /* select */
+       { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)nop_revoke },             /* revoke */
+       { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap },                 /* mmap */
+       { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)spec_fsync },              /* fsync */
+       { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)err_remove },             /* remove */
+       { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)err_link },                 /* link */
+       { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)err_rename },             /* rename */
+       { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)err_mkdir },               /* mkdir */
+       { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)err_rmdir },               /* rmdir */
+       { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)err_symlink },           /* symlink */
+       { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)err_readdir },           /* readdir */
+       { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink },         /* readlink */
+       { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)nop_inactive },         /* inactive */
+       { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)nop_reclaim },           /* reclaim */
+       { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)spec_strategy },                /* strategy */
+       { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)spec_pathconf },                /* pathconf */
+       { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)err_advlock },           /* advlock */
+       { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)spec_bwrite },            /* bwrite */
+       { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },             /* Pagein */
+       { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },           /* Pageout */
+       { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile },         /* Copyfile */
+       { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)spec_blktooff },                /* blktooff */
+       { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)spec_offtoblk },                /* offtoblk */
+       { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)spec_blockmap },                /* blockmap */
+       { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL }
 };
-struct vnodeopv_desc spec_vnodeop_opv_desc =
-{ &spec_vnodeop_p, spec_vnodeop_entries };
+const struct vnodeopv_desc spec_vnodeop_opv_desc =
+{ .opv_desc_vector_p = &spec_vnodeop_p, .opv_desc_ops = spec_vnodeop_entries };
 
 
 static void set_blocksize(vnode_t, dev_t);
@@ -315,6 +315,7 @@ spec_open(struct vnop_open_args *ap)
                        return ENXIO;
                }
                if (cred != FSCRED && (ap->a_mode & FWRITE)) {
+#if 0
                        /*
                         * When running in very secure mode, do not allow
                         * opens for writing of any disk character devices.
@@ -322,6 +323,7 @@ spec_open(struct vnop_open_args *ap)
                        if (securelevel >= 2 && isdisk(dev, VCHR)) {
                                return EPERM;
                        }
+#endif
 
                        /* Never allow writing to /dev/mem or /dev/kmem */
                        if (iskmemdev(dev)) {
@@ -485,13 +487,49 @@ spec_read(struct vnop_read_args *ap)
        {
                struct _throttle_io_info_t *throttle_info = NULL;
                int thread_throttle_level;
-               if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
+               uint64_t blkno = 0;
+               uint32_t iolen = 0;
+               int ddisk = 0;
+               int ktrace_code = DKIO_READ;
+               devBlockSize = vp->v_specsize;
+               uintptr_t our_id;
+
+               if (cdevsw[major(vp->v_rdev)].d_type == D_DISK) {
+                       ddisk = 1;
+               }
+
+               if (ddisk && vp->v_un.vu_specinfo->si_throttleable) {
                        throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
                        thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
                }
+
+               if (kdebug_enable && ddisk) {
+                       if (devBlockSize == 0) {
+                               devBlockSize = 512;  // default sector size
+                       }
+
+                       if (uio_offset(uio) && devBlockSize) {
+                               blkno = ((uint64_t) uio_offset(uio) / ((uint64_t)devBlockSize));
+                       }
+                       iolen = (int) uio_resid(uio);
+                       our_id = (uintptr_t)thread_tid(current_thread());
+                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
+                           (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
+                           vp->v_rdev, blkno, iolen, 0);
+               }
+
                error = (*cdevsw[major(vp->v_rdev)].d_read)
                    (vp->v_rdev, uio, ap->a_ioflag);
 
+
+               if (kdebug_enable && ddisk) {
+                       uint32_t residual = (uint32_t)uio_resid(uio);
+                       ktrace_code |= DKIO_DONE;
+                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
+                           (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
+                           (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0);
+               }
+
                if (throttle_info) {
                        throttle_info_end_io_internal(throttle_info, thread_throttle_level);
                }
@@ -589,16 +627,51 @@ spec_write(struct vnop_write_args *ap)
        {
                struct _throttle_io_info_t *throttle_info = NULL;
                int thread_throttle_level;
-               if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
+               dev = vp->v_rdev;
+               devBlockSize = vp->v_specsize;
+               uint32_t iolen = 0;
+               uint64_t blkno = 0;
+               int ddisk = 0;
+               int ktrace_code = 0;  // write is implied; read must be OR'd in.
+               uintptr_t our_id;
+
+               if (cdevsw[major(dev)].d_type == D_DISK) {
+                       ddisk = 1;
+               }
+
+               if (ddisk && vp->v_un.vu_specinfo->si_throttleable) {
                        throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
 
                        thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
 
                        microuptime(&throttle_info->throttle_last_write_timestamp);
                }
+
+               if (kdebug_enable && ddisk) {
+                       if (devBlockSize == 0) {
+                               devBlockSize = 512; // default sector size
+                       }
+                       if ((uio_offset(uio) != 0) && devBlockSize) {
+                               blkno = ((uint64_t)uio_offset(uio)) / ((uint64_t)devBlockSize);
+                       }
+                       iolen = (int)uio_resid(uio);
+                       our_id = (uintptr_t)thread_tid(current_thread());
+                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
+                           (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
+                           vp->v_rdev, blkno, iolen, 0);
+               }
                error = (*cdevsw[major(vp->v_rdev)].d_write)
                    (vp->v_rdev, uio, ap->a_ioflag);
 
+               if (kdebug_enable && ddisk) {
+                       // emit the I/O completion
+                       uint32_t residual = (uint32_t)uio_resid(uio);
+                       ktrace_code |= DKIO_DONE;
+                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
+                           (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
+                           (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0);
+               }
+
                if (throttle_info) {
                        throttle_info_end_io_internal(throttle_info, thread_throttle_level);
                }
@@ -746,10 +819,10 @@ spec_select(struct vnop_select_args *ap)
        }
 }
 
-static int filt_specattach(struct knote *kn, struct kevent_internal_s *kev);
+static int filt_specattach(struct knote *kn, struct kevent_qos_s *kev);
 
 int
-spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_internal_s *kev)
+spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_qos_s *kev)
 {
        dev_t dev;
 
@@ -765,7 +838,7 @@ spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_internal_s *kev)
         * other attaches.
         */
        int32_t tmp_flags = kn->kn_flags;
-       int64_t tmp_data = kn->kn_data;
+       int64_t tmp_sdata = kn->kn_sdata;
        int res;
 
        res = bpfkqfilter(dev, kn);
@@ -773,7 +846,7 @@ spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_internal_s *kev)
                return res;
        }
        kn->kn_flags = tmp_flags;
-       kn->kn_data = tmp_data;
+       kn->kn_sdata = tmp_sdata;
 #endif
 
        if (major(dev) > nchrdev) {
@@ -1975,6 +2048,50 @@ done:
        return sleep_cnt;
 }
 
+/*
+ *  Returns TRUE if throttle_lowpri_io(), called with the same sleep_amount, would have slept.
+ *  This function mimics most of the throttle_lowpri_io() checks, but without actually sleeping.
+ */
+int
+throttle_lowpri_io_will_be_throttled(int sleep_amount)
+{
+       if (sleep_amount == 0) {
+               return FALSE;
+       }
+
+       uthread_t ut = get_bsdthread_info(current_thread());
+       if (ut->uu_lowpri_window == 0) {
+               return FALSE;
+       }
+
+       struct _throttle_io_info_t *info = ut->uu_throttle_info;
+       if (info == NULL) {
+               return FALSE;
+       }
+
+       lck_mtx_lock(&info->throttle_lock);
+       assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);
+
+       if (sleep_amount == 1 && !ut->uu_throttle_bc) {
+               sleep_amount = 0;
+       }
+
+       int result = FALSE;
+
+       int throttle_type = throttle_io_will_be_throttled_internal(info, NULL, NULL);
+       if (throttle_type > THROTTLE_DISENGAGED) {
+               result = TRUE;
+               if ((throttle_type == THROTTLE_ENGAGED) && (sleep_amount == 0)) {
+                       result = FALSE;
+               }
+       }
+
+       lck_mtx_unlock(&info->throttle_lock);
+
+       return result;
+}
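
A hypothetical caller sketch: because this predicate never sleeps, a subsystem with deferrable background I/O can probe the throttle window first and requeue work instead of blocking in throttle_lowpri_io() (the defer/issue helpers are illustrative, not real KPIs):

    if (throttle_lowpri_io_will_be_throttled(1)) {
            defer_background_io();          /* illustrative: requeue and retry later */
    } else {
            throttle_lowpri_io(1);          /* existing KPI; may sleep */
            issue_background_io();          /* illustrative */
    }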
+
+
 /*
  * KPI routine
  *
@@ -2379,7 +2496,7 @@ spec_strategy(struct vnop_strategy_args *ap)
         * For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive
         */
        if (bap->ba_flags & BA_META) {
-               if (mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
+               if ((mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) || (bap->ba_flags & BA_IO_SCHEDULED)) {
                        if (bp->b_flags & B_READ) {
                                if (io_tier > IOSCHED_METADATA_TIER) {
                                        io_tier = IOSCHED_METADATA_TIER;
@@ -2748,8 +2865,8 @@ spec_offtoblk(struct vnop_offtoblk_args *ap)
 
 static void filt_specdetach(struct knote *kn);
 static int filt_specevent(struct knote *kn, long hint);
-static int filt_spectouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int filt_spectouch(struct knote *kn, struct kevent_qos_s *kev);
+static int filt_specprocess(struct knote *kn, struct kevent_qos_s *kev);
 static int filt_specpeek(struct knote *kn);
 
 SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = {
@@ -2789,7 +2906,7 @@ spec_knote_select_and_link(struct knote *kn)
        ctx = vfs_context_current();
        vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
 
-       int error = vnode_getwithvid(vp, kn->kn_hookid);
+       int error = vnode_getwithvid(vp, vnode_vid(vp));
        if (error != 0) {
                knote_set_error(kn, ENOENT);
                return 0;
@@ -2798,7 +2915,7 @@ spec_knote_select_and_link(struct knote *kn)
        /*
         * This function may be called many times to link or re-link the
         * underlying vnode to the kqueue.  If we've already linked the two,
-        * we will have a valid kn_hook_data which ties us to the underlying
+        * we will have a valid kn_hook64 which ties us to the underlying
         * device's waitq via the waitq's prepost table object. However,
         * devices can abort any select action by calling selthreadclear().
         * This is OK because the table object will be invalidated by the
@@ -2868,13 +2985,13 @@ spec_knote_select_and_link(struct knote *kn)
                 * the table object's ID to us.  It will also set the
                 * waitq_prepost_id field within the waitq structure.
                 *
-                * We can just overwrite kn_hook_data because it's simply a
+                * We can just overwrite kn_hook64 because it's simply a
                 * table ID used to grab a reference when needed.
                 *
                 * We have a reference on the vnode, so we know that the
                 * device won't go away while we get this ID.
                 */
-               kn->kn_hook_data = waitq_get_prepost_id(wq);
+               kn->kn_hook64 = waitq_get_prepost_id(wq);
        } else if (selres == 0) {
                /*
                 * The device indicated that there's no data to read, but didn't call
@@ -2890,22 +3007,33 @@ spec_knote_select_and_link(struct knote *kn)
        return selres;
 }
 
-static void
-filt_spec_common(struct knote *kn, int selres)
+static int
+filt_spec_common(struct knote *kn, struct kevent_qos_s *kev, int selres)
 {
+       int64_t data;
+       int ret;
+
        if (kn->kn_vnode_use_ofst) {
                if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
-                       kn->kn_data = 0;
+                       data = 0;
                } else {
-                       kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
+                       data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
                }
        } else {
-               kn->kn_data = selres;
+               data = selres;
+       }
+
+       ret = data >= knote_low_watermark(kn);
+
+       if (ret && kev) {
+               knote_fill_kevent(kn, kev, data);
        }
+
+       return ret;
 }
 
 static int
-filt_specattach(struct knote *kn, __unused struct kevent_internal_s *kev)
+filt_specattach(struct knote *kn, __unused struct kevent_qos_s *kev)
 {
        vnode_t vp;
        dev_t dev;
@@ -2941,8 +3069,7 @@ filt_specattach(struct knote *kn, __unused struct kevent_internal_s *kev)
        }
 
        kn->kn_filtid = EVFILTID_SPEC;
-       kn->kn_hook_data = 0;
-       kn->kn_hookid = vnode_vid(vp);
+       kn->kn_hook64 = 0;
 
        knote_markstayactive(kn);
        return spec_knote_select_and_link(kn);
@@ -2957,7 +3084,7 @@ filt_specdetach(struct knote *kn)
         * This is potentially tricky: the device's selinfo waitq that was
         * tricked into being part of this knote's waitq set may not be a part
         * of any other set, and the device itself may have revoked the memory
-        * in which the waitq was held. We use the knote's kn_hook_data field
+        * in which the waitq was held. We use the knote's kn_hook64 field
         * to keep the ID of the waitq's prepost table object. This
         * object keeps a pointer back to the waitq, and gives us a safe way
         * to decouple the dereferencing of driver allocated memory: if the
@@ -2965,9 +3092,9 @@ filt_specdetach(struct knote *kn)
         * object will be invalidated. The waitq details are handled in the
         * waitq API invoked here.
         */
-       if (kn->kn_hook_data) {
-               waitq_unlink_by_prepost_id(kn->kn_hook_data, &(knote_get_kq(kn)->kq_wqs));
-               kn->kn_hook_data = 0;
+       if (kn->kn_hook64) {
+               waitq_unlink_by_prepost_id(kn->kn_hook64, &(knote_get_kq(kn)->kq_wqs));
+               kn->kn_hook64 = 0;
        }
 }
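
Both ends of the kn_hook64 usage reduce to one pattern: the knote keeps a stable prepost-table ID instead of a pointer into driver-owned memory, so detach stays safe after the driver revokes its waitq. In sketch form, using the two calls that appear in this file:

    /* at link time: remember the ID, not the waitq pointer */
    kn->kn_hook64 = waitq_get_prepost_id(wq);

    /* at detach: unlink by ID; if the driver already tore the waitq down
     * (e.g. via selthreadclear()), the table object was invalidated and
     * the unlink degrades to a safe no-op */
    waitq_unlink_by_prepost_id(kn->kn_hook64, &(knote_get_kq(kn)->kq_wqs));
    kn->kn_hook64 = 0;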
 
@@ -2982,7 +3109,7 @@ filt_specevent(struct knote *kn, __unused long hint)
 }
 
 static int
-filt_spectouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_spectouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        kn->kn_sdata = kev->data;
        kn->kn_sfflags = kev->fflags;
@@ -2995,9 +3122,8 @@ filt_spectouch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_specprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
        vnode_t vp;
        uthread_t uth;
        vfs_context_t ctx;
@@ -3009,29 +3135,18 @@ filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_in
        ctx = vfs_context_current();
        vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
 
-       error = vnode_getwithvid(vp, kn->kn_hookid);
+       error = vnode_getwithvid(vp, vnode_vid(vp));
        if (error != 0) {
                kn->kn_flags |= (EV_EOF | EV_ONESHOT);
-               *kev = kn->kn_kevent;
+               knote_fill_kevent(kn, kev, 0);
                return 1;
        }
 
        selres = spec_knote_select_and_link(kn);
-       filt_spec_common(kn, selres);
+       res = filt_spec_common(kn, kev, selres);
 
        vnode_put(vp);
 
-       res = ((kn->kn_sfflags & NOTE_LOWAT) != 0) ?
-           (kn->kn_data >= kn->kn_sdata) : kn->kn_data;
-
-       if (res) {
-               *kev = kn->kn_kevent;
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_fflags = 0;
-                       kn->kn_data = 0;
-               }
-       }
-
        return res;
 }
 
@@ -3041,7 +3156,5 @@ filt_specpeek(struct knote *kn)
        int selres = 0;
 
        selres = spec_knote_select_and_link(kn);
-       filt_spec_common(kn, selres);
-
-       return kn->kn_data != 0;
+       return filt_spec_common(kn, NULL, selres);
 }
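
All of the filter changes in this file follow the kevent_internal_s to kevent_qos_s migration: rather than mutating kn->kn_data and copying kn_kevent out by hand (with per-filter EV_CLEAR resets), a filter now computes readiness locally and publishes only when an output event was requested. The resulting shape, as a sketch:

    /* kev == NULL means "peek only": report readiness, emit nothing */
    static int
    filt_example_common(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
    {
            int ret = (data >= knote_low_watermark(kn));

            if (ret && kev) {
                    /* copies kn_kevent plus the data value out to the caller */
                    knote_fill_kevent(kn, kev, data);
            }
            return ret;
    }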
index ae4e79ae0b696a169f38e0543067d5eb87d9beea..3a4b2aa5ca5b8ac0bc8e23ef0230fd7723d475f6 100644 (file)
@@ -133,9 +133,9 @@ __BEGIN_DECLS
 #ifdef BSD_KERNEL_PRIVATE
 int spec_blktooff(struct  vnop_blktooff_args *);
 int spec_offtoblk(struct  vnop_offtoblk_args *);
-int     spec_fsync_internal(vnode_t, int, vfs_context_t);
+int spec_fsync_internal(vnode_t, int, vfs_context_t);
 int spec_blockmap(struct  vnop_blockmap_args *);
-int spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_internal_s *kev);
+int spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_qos_s *kev);
 #endif /* BSD_KERNEL_PRIVATE */
 
 int     spec_ebadf(void *);
index 60c66a58b95134d3b77afe10a0efa152c06ad8c0..fb8e28cd1b5627f4b16cfaea4636abcee7ca7852 100644 (file)
@@ -52,11 +52,14 @@ PRIVATE_DATAFILES = \
        if_var.h \
        if_vlan_var.h \
        if_fake_var.h \
+       if_6lowpan_var.h \
        iptap.h \
        lacp.h \
+       multi_layer_pkt_log.h \
        ndrv_var.h \
        necp.h \
        net_api_stats.h \
+       net_log_common.h \
        netsrc.h \
        network_agent.h \
        ntstat.h \
@@ -70,6 +73,8 @@ PRIVATE_DATAFILES = \
        route.h \
        net_perf.h \
        net_kev.h \
+       sixxlowpan.h \
+       frame802154.h \
        nat464_utils.h
 
 PRIVATE_KERNELFILES = $(filter-out radix.h,${KERNELFILES}) \
index 52047280ceec25b017d08442470d9d72ced3a32a..b855f3a48a80caf01a151dc8377b60510c953b5d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -156,9 +156,15 @@ typedef void (*pktcopyfunc_t)(const void *, void *, size_t);
 static unsigned int bpf_bufsize = BPF_BUFSIZE;
 SYSCTL_INT(_debug, OID_AUTO, bpf_bufsize, CTLFLAG_RW | CTLFLAG_LOCKED,
     &bpf_bufsize, 0, "");
+
+static int sysctl_bpf_maxbufsize SYSCTL_HANDLER_ARGS;
+extern const int copysize_limit_panic;
+#define BPF_MAXSIZE_CAP (copysize_limit_panic >> 1)
 __private_extern__ unsigned int bpf_maxbufsize = BPF_MAXBUFSIZE;
-SYSCTL_INT(_debug, OID_AUTO, bpf_maxbufsize, CTLFLAG_RW | CTLFLAG_LOCKED,
-    &bpf_maxbufsize, 0, "");
+SYSCTL_PROC(_debug, OID_AUTO, bpf_maxbufsize, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    &bpf_maxbufsize, 0,
+    sysctl_bpf_maxbufsize, "I", "Default BPF max buffer size");
+
 static unsigned int bpf_maxdevices = 256;
 SYSCTL_UINT(_debug, OID_AUTO, bpf_maxdevices, CTLFLAG_RW | CTLFLAG_LOCKED,
     &bpf_maxdevices, 0, "");
@@ -248,20 +254,20 @@ select_fcn_t        bpfselect;
 /* Darwin's cdevsw struct differs slightly from BSDs */
 #define CDEV_MAJOR 23
 static struct cdevsw bpf_cdevsw = {
-       /* open */ bpfopen,
-       /* close */ bpfclose,
-       /* read */ bpfread,
-       /* write */ bpfwrite,
-       /* ioctl */ bpfioctl,
-       /* stop */ eno_stop,
-       /* reset */ eno_reset,
-       /* tty */ NULL,
-       /* select */ bpfselect,
-       /* mmap */ eno_mmap,
-       /* strategy */ eno_strat,
-       /* getc */ eno_getc,
-       /* putc */ eno_putc,
-       /* type */ 0
+       .d_open       = bpfopen,
+       .d_close      = bpfclose,
+       .d_read       = bpfread,
+       .d_write      = bpfwrite,
+       .d_ioctl      = bpfioctl,
+       .d_stop       = eno_stop,
+       .d_reset      = eno_reset,
+       .d_ttys       = NULL,
+       .d_select     = bpfselect,
+       .d_mmap       = eno_mmap,
+       .d_strategy   = eno_strat,
+       .d_reserved_1 = eno_getc,
+       .d_reserved_2 = eno_putc,
+       .d_type       = 0
 };
 
 #define SOCKADDR_HDR_LEN           offsetof(struct sockaddr, sa_data)
@@ -1221,8 +1227,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
                                }
                                if (found == 1) {
                                        ehp->bh_pid = soprocinfo.spi_pid;
-                                       proc_name(ehp->bh_pid, ehp->bh_comm,
-                                           MAXCOMLEN);
+                                       strlcpy(&ehp->bh_comm[0], &soprocinfo.spi_proc_name[0], sizeof(ehp->bh_comm));
                                }
                                ehp->bh_flowid = 0;
                        }
@@ -2526,9 +2531,8 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p)
 int bpfkqfilter(dev_t dev, struct knote *kn);
 static void filt_bpfdetach(struct knote *);
 static int filt_bpfread(struct knote *, long);
-static int filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_bpfprocess(struct knote *kn, struct filt_process_s *data,
-    struct kevent_internal_s *kev);
+static int filt_bpftouch(struct knote *kn, struct kevent_qos_s *kev);
+static int filt_bpfprocess(struct knote *kn, struct kevent_qos_s *kev);
 
 SECURITY_READ_ONLY_EARLY(struct filterops) bpfread_filtops = {
        .f_isfd = 1,
@@ -2539,9 +2543,10 @@ SECURITY_READ_ONLY_EARLY(struct filterops) bpfread_filtops = {
 };
 
 static int
-filt_bpfread_common(struct knote *kn, struct bpf_d *d)
+filt_bpfread_common(struct knote *kn, struct kevent_qos_s *kev, struct bpf_d *d)
 {
        int ready = 0;
+       int64_t data = 0;
 
        if (d->bd_immediate) {
                /*
@@ -2558,17 +2563,13 @@ filt_bpfread_common(struct knote *kn, struct bpf_d *d)
                 * If there's no data in either buffer, we're not
                 * ready to read.
                 */
-               kn->kn_data = (d->bd_hlen == 0 || d->bd_hbuf_read != 0 ?
+               data = (d->bd_hlen == 0 || d->bd_hbuf_read != 0 ?
                    d->bd_slen : d->bd_hlen);
-               int64_t lowwat = 1;
-               if (kn->kn_sfflags & NOTE_LOWAT) {
-                       if (kn->kn_sdata > d->bd_bufsize) {
-                               lowwat = d->bd_bufsize;
-                       } else if (kn->kn_sdata > lowwat) {
-                               lowwat = kn->kn_sdata;
-                       }
+               int64_t lowwat = knote_low_watermark(kn);
+               if (lowwat > d->bd_bufsize) {
+                       lowwat = d->bd_bufsize;
                }
-               ready = (kn->kn_data >= lowwat);
+               ready = (data >= lowwat);
        } else {
                /*
                 * If there's data in the hold buffer, it's the
@@ -2585,12 +2586,14 @@ filt_bpfread_common(struct knote *kn, struct bpf_d *d)
                 * no data in the hold buffer and the timer hasn't
                 * expired, we're not ready to read.
                 */
-               kn->kn_data = ((d->bd_hlen == 0 || d->bd_hbuf_read != 0) &&
+               data = ((d->bd_hlen == 0 || d->bd_hbuf_read != 0) &&
                    d->bd_state == BPF_TIMED_OUT ? d->bd_slen : d->bd_hlen);
-               ready = (kn->kn_data > 0);
+               ready = (data > 0);
        }
        if (!ready) {
                bpf_start_timer(d);
+       } else if (kev) {
+               knote_fill_kevent(kn, kev, data);
        }
 
        return ready;
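
From userland, the watermark consulted above is set with NOTE_LOWAT; values larger than the descriptor's buffer are clamped to bd_bufsize, as the code shows. A sketch of a registration that fires only once at least 4 KB is buffered (kq and bpf_fd are assumed to already exist):

    #include <sys/event.h>

    struct kevent kev;
    EV_SET(&kev, bpf_fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
    kevent(kq, &kev, 1, NULL, 0, NULL);     /* error handling elided */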
@@ -2605,10 +2608,8 @@ bpfkqfilter(dev_t dev, struct knote *kn)
        /*
         * Is this device a bpf?
         */
-       if (major(dev) != CDEV_MAJOR ||
-           kn->kn_filter != EVFILT_READ) {
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = EINVAL;
+       if (major(dev) != CDEV_MAJOR || kn->kn_filter != EVFILT_READ) {
+               knote_set_error(kn, EINVAL);
                return 0;
        }
 
@@ -2620,8 +2621,7 @@ bpfkqfilter(dev_t dev, struct knote *kn)
            (d->bd_flags & BPF_CLOSING) != 0 ||
            d->bd_bif == NULL) {
                lck_mtx_unlock(bpf_mlock);
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = ENXIO;
+               knote_set_error(kn, ENXIO);
                return 0;
        }
 
@@ -2631,7 +2631,7 @@ bpfkqfilter(dev_t dev, struct knote *kn)
        d->bd_flags |= BPF_KNOTE;
 
        /* capture the current state */
-       res = filt_bpfread_common(kn, d);
+       res = filt_bpfread_common(kn, NULL, d);
 
        lck_mtx_unlock(bpf_mlock);
 
@@ -2657,11 +2657,11 @@ filt_bpfread(struct knote *kn, long hint)
 #pragma unused(hint)
        struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
 
-       return filt_bpfread_common(kn, d);
+       return filt_bpfread_common(kn, NULL, d);
 }
 
 static int
-filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_bpftouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
        int res;
@@ -2673,7 +2673,7 @@ filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev)
        kn->kn_sfflags = kev->fflags;
 
        /* output data will be re-generated here */
-       res = filt_bpfread_common(kn, d);
+       res = filt_bpfread_common(kn, NULL, d);
 
        lck_mtx_unlock(bpf_mlock);
 
@@ -2681,18 +2681,13 @@ filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_bpfprocess(struct knote *kn, struct filt_process_s *data,
-    struct kevent_internal_s *kev)
+filt_bpfprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
        struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
        int res;
 
        lck_mtx_lock(bpf_mlock);
-       res = filt_bpfread_common(kn, d);
-       if (res) {
-               *kev = kn->kn_kevent;
-       }
+       res = filt_bpfread_common(kn, kev, d);
        lck_mtx_unlock(bpf_mlock);
 
        return res;
@@ -3233,7 +3228,7 @@ get_pkt_trunc_len(u_char *p, u_int len)
         * pre is the offset to the L3 header after the bpfp_header, or length
         * of L2 header after bpfp_header, if present.
         */
-       uint32_t pre = pktap->pth_frame_pre_length -
+       int32_t pre = pktap->pth_frame_pre_length -
            (pkt->bpfp_header_length - pktap->pth_length);
 
        /* Length of the input packet starting from L3 header */
@@ -3242,7 +3237,7 @@ get_pkt_trunc_len(u_char *p, u_int len)
            pktap->pth_protocol_family == AF_INET6) {
                /* Contains L2 header */
                if (pre > 0) {
-                       if (pre < sizeof(struct ether_header)) {
+                       if (pre < (int32_t)sizeof(struct ether_header)) {
                                goto too_short;
                        }
 
@@ -3720,7 +3715,7 @@ bpf_init(__unused void *unused)
 }
 
 #ifndef __APPLE__
-SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR, bpf_drvinit, NULL)
+SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR, bpf_drvinit, NULL);
 #endif
 
 #if CONFIG_MACF_NET
@@ -3736,3 +3731,24 @@ mac_bpfdesc_label_set(struct bpf_d *d, struct label *label)
        d->bd_label = label;
 }
 #endif
+
+static int
+sysctl_bpf_maxbufsize SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int i, err;
+
+       i = bpf_maxbufsize;
+
+       err = sysctl_handle_int(oidp, &i, 0, req);
+       if (err != 0 || req->newptr == USER_ADDR_NULL) {
+               return err;
+       }
+
+       if (i < 0 || i > BPF_MAXSIZE_CAP) {
+               i = BPF_MAXSIZE_CAP;
+       }
+
+       bpf_maxbufsize = i;
+       return err;
+}
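
With the SYSCTL_PROC conversion, writes to debug.bpf_maxbufsize are routed through this handler, which silently clamps any request above BPF_MAXSIZE_CAP (half of copysize_limit_panic) instead of accepting an arbitrary buffer ceiling. A userland sketch:

    #include <sys/sysctl.h>

    int newmax = 1 << 20;   /* ask for 1 MiB; clamped if over the cap */
    sysctlbyname("debug.bpf_maxbufsize", NULL, NULL, &newmax, sizeof(newmax));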
index 12d3a6e37b56d38c546a7e42b101ef873d51c421..6d5bcc587515246691ed581f598c5be14aa73976 100644 (file)
@@ -470,10 +470,16 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
                        continue;
 
                case BPF_LD | BPF_MEM:
+                       if (pc->k >= BPF_MEMWORDS) {
+                               return 0;
+                       }
                        A = mem[pc->k];
                        continue;
 
                case BPF_LDX | BPF_MEM:
+                       if (pc->k >= BPF_MEMWORDS) {
+                               return 0;
+                       }
                        X = mem[pc->k];
                        continue;
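
Both added checks bound the scratch-memory index before mem[] is dereferenced, so a filter program that loads from an out-of-range slot now fails closed (bpf_filter returns 0, dropping the packet) rather than reading past the array. A sketch of a program that trips the new check:

    #include <net/bpf.h>

    /* k == BPF_MEMWORDS is one past the last valid scratch slot */
    struct bpf_insn out_of_range[] = {
            BPF_STMT(BPF_LD | BPF_MEM, BPF_MEMWORDS),
            BPF_STMT(BPF_RET | BPF_K, (u_int)-1),   /* never reached */
    };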
 
diff --git a/bsd/net/cc.h b/bsd/net/cc.h
new file mode 100644 (file)
index 0000000..c5113b4
--- /dev/null
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright (c) 2003, Adam Dunkels.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials provided
+ *    with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This file is part of the Contiki desktop OS
+ *
+ *
+ */
+
+/**
+ * \file
+ * Default definitions of C compiler quirk work-arounds.
+ * \author Adam Dunkels <adam@dunkels.com>
+ *
+ * This file is used for making use of extra functionality of some C
+ * compilers used for Contiki, and defining work-arounds for various
+ * quirks and problems with some other C compilers.
+ */
+
+#ifndef CC_H_
+#define CC_H_
+
+#include "contiki-conf.h"
+
+/**
+ * Configure if the C compiler supports the "register" keyword for
+ * function arguments.
+ */
+#if CC_CONF_REGISTER_ARGS
+#define CC_REGISTER_ARG register
+#else /* CC_CONF_REGISTER_ARGS */
+#define CC_REGISTER_ARG
+#endif /* CC_CONF_REGISTER_ARGS */
+
+/**
+ * Configure if the C compiler supports the arguments for function
+ * pointers.
+ */
+#if CC_CONF_FUNCTION_POINTER_ARGS
+#define CC_FUNCTION_POINTER_ARGS 1
+#else /* CC_CONF_FUNCTION_POINTER_ARGS */
+#define CC_FUNCTION_POINTER_ARGS 0
+#endif /* CC_CONF_FUNCTION_POINTER_ARGS */
+
+/**
+ * Configure if the C compiler supports fastcall function
+ * declarations.
+ */
+#ifdef CC_CONF_FASTCALL
+#define CC_FASTCALL CC_CONF_FASTCALL
+#else /* CC_CONF_FASTCALL */
+#define CC_FASTCALL
+#endif /* CC_CONF_FASTCALL */
+
+/**
+ * Configure if the C compiler has problems with const function pointers.
+ */
+#ifdef CC_CONF_CONST_FUNCTION_BUG
+#define CC_CONST_FUNCTION
+#else /* CC_CONF_CONST_FUNCTION_BUG */
+#define CC_CONST_FUNCTION const
+#endif /* CC_CONF_CONST_FUNCTION_BUG */
+
+/**
+ * Configure work-around for unsigned char bugs with sdcc.
+ */
+#if CC_CONF_UNSIGNED_CHAR_BUGS
+#define CC_UNSIGNED_CHAR_BUGS 1
+#else /* CC_CONF_UNSIGNED_CHAR_BUGS */
+#define CC_UNSIGNED_CHAR_BUGS 0
+#endif /* CC_CONF_UNSIGNED_CHAR_BUGS */
+
+/**
+ * Configure if C compiler supports double hash marks in C macros.
+ */
+#if CC_CONF_DOUBLE_HASH
+#define CC_DOUBLE_HASH 1
+#else /* CC_CONF_DOUBLE_HASH */
+#define CC_DOUBLE_HASH 0
+#endif /* CC_CONF_DOUBLE_HASH */
+
+#ifdef CC_CONF_INLINE
+#define CC_INLINE CC_CONF_INLINE
+#else /* CC_CONF_INLINE */
+#define CC_INLINE
+#endif /* CC_CONF_INLINE */
+
+/**
+ * Configure if the C compiler supports the assignment of struct values.
+ */
+#ifdef CC_CONF_ASSIGN_AGGREGATE
+#define CC_ASSIGN_AGGREGATE(dest, src)  CC_CONF_ASSIGN_AGGREGATE(dest, src)
+#else /* CC_CONF_ASSIGN_AGGREGATE */
+#define CC_ASSIGN_AGGREGATE(dest, src)  *dest = *src
+#endif /* CC_CONF_ASSIGN_AGGREGATE */
+
+#if CC_CONF_NO_VA_ARGS
+#define CC_NO_VA_ARGS CC_CONF_VA_ARGS
+#endif
+
+#ifndef NULL
+#define NULL 0
+#endif /* NULL */
+
+#ifndef MAX
+#define MAX(n, m)   (((n) < (m)) ? (m) : (n))
+#endif
+
+#ifndef MIN
+#define MIN(n, m)   (((n) < (m)) ? (n) : (m))
+#endif
+
+#ifndef ABS
+#define ABS(n)      (((n) < 0) ? -(n) : (n))
+#endif
+
+
+#define CC_CONCAT2(s1, s2) s1##s2
+/**
+ * A C preprocessing macro for concatenating two preprocessor tokens.
+ *
+ * We need to use two macros (CC_CONCAT and CC_CONCAT2) in order to allow
+ * concatenation of two \#defined macros.
+ */
+#define CC_CONCAT(s1, s2) CC_CONCAT2(s1, s2)
+#define CC_CONCAT_EXT_2(s1, s2) CC_CONCAT2(s1, s2)
+
+/**
+ * A C preprocessing macro for concatenating three preprocessor tokens.
+ */
+#define CC_CONCAT3(s1, s2, s3) s1##s2##s3
+#define CC_CONCAT_EXT_3(s1, s2, s3) CC_CONCAT3(s1, s2, s3)
+
+#endif /* CC_H_ */
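
The two-level indirection matters because ## suppresses macro expansion of its operands: CC_CONCAT2 pastes the argument names literally, while CC_CONCAT expands them first. With hypothetical macros:

    #define PORT 2
    #define UART(n) CC_CONCAT(uart, n)

    UART(PORT)              /* -> CC_CONCAT2(uart, 2) -> uart2 */
    CC_CONCAT2(uart, PORT)  /* -> uartPORT: PORT is never expanded */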
index 434a18dfabb3498ce701e20a2bd6e4e2b568c393..dad2552c3212792de985f072d7464efa17414da8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -105,13 +105,15 @@ _qinit(class_queue_t *q, int type, int lim, classq_pkt_type_t ptype)
 
 /* add a packet at the tail of the queue */
 void
-_addq(class_queue_t *q, void *pkt)
+_addq(class_queue_t *q, classq_pkt_t *pkt)
 {
        uint32_t size = 0;
 
+       ASSERT(pkt->cp_ptype == qptype(q));
+
        switch (qptype(q)) {
        case QP_MBUF: {
-               struct mbuf *m = pkt;
+               struct mbuf *m = pkt->cp_mbuf;
                MBUFQ_ENQUEUE(&qmbufq(q), m);
                size = m_length(m);
                break;
@@ -121,6 +123,7 @@ _addq(class_queue_t *q, void *pkt)
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        qlen(q)++;
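
The __builtin_unreachable() calls added after each VERIFY(0) in this file are a compiler hint, not a behavior change: VERIFY panics, but the compiler cannot prove that, so the hint suppresses fall-through and missing-return diagnostics and lets dead code be dropped. A standalone sketch (verify_fail stands in for VERIFY(0) and is hypothetical):

    extern void verify_fail(void);   /* panics, but not declared noreturn */

    static int
    checked_type(int t)
    {
            switch (t) {
            case 0:
                    return 42;
            default:
                    verify_fail();
                    /* NOTREACHED */
                    __builtin_unreachable();  /* no missing-return warning */
            }
    }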
@@ -130,13 +133,15 @@ _addq(class_queue_t *q, void *pkt)
 
 /* add one or more packets at the tail of the queue */
 void
-_addq_multi(class_queue_t *q, void *pkt_head, void *pkt_tail,
+_addq_multi(class_queue_t *q, classq_pkt_t *pkt_head, classq_pkt_t *pkt_tail,
     u_int32_t cnt, u_int32_t size)
 {
+       ASSERT(pkt_head->cp_ptype == qptype(q));
+       ASSERT(pkt_tail->cp_ptype == qptype(q));
        switch (qptype(q)) {
        case QP_MBUF: {
-               struct mbuf *m_head = pkt_head;
-               struct mbuf *m_tail = pkt_tail;
+               struct mbuf *m_head = pkt_head->cp_mbuf;
+               struct mbuf *m_tail = pkt_tail->cp_mbuf;
                MBUFQ_ENQUEUE_MULTI(&qmbufq(q), m_head, m_tail);
                break;
        }
@@ -145,6 +150,7 @@ _addq_multi(class_queue_t *q, void *pkt_head, void *pkt_tail,
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        qlen(q) += cnt;
@@ -152,19 +158,17 @@ _addq_multi(class_queue_t *q, void *pkt_head, void *pkt_tail,
 }
 
 /* get a packet at the head of the queue */
-void *
-_getq(class_queue_t *q)
+void
+_getq(class_queue_t *q, classq_pkt_t *pkt)
 {
-       void *pkt = NULL;
        uint32_t pkt_len;
 
        switch (qptype(q)) {
        case QP_MBUF: {
-               struct mbuf *m;
-               MBUFQ_DEQUEUE(&qmbufq(q), m);
-               if (m != NULL) {
-                       pkt_len = m_length(m);
-                       pkt = m;
+               MBUFQ_DEQUEUE(&qmbufq(q), pkt->cp_mbuf);
+               if (__probable(pkt->cp_mbuf != NULL)) {
+                       CLASSQ_PKT_INIT_MBUF(pkt, pkt->cp_mbuf);
+                       pkt_len = m_length(pkt->cp_mbuf);
                }
                break;
        }
@@ -173,14 +177,15 @@ _getq(class_queue_t *q)
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
-       if (pkt == NULL) {
+       if (pkt->cp_mbuf == NULL) {
                VERIFY(qlen(q) == 0);
                if (qsize(q) > 0) {
                        qsize(q) = 0;
                }
-               return NULL;
+               return;
        }
        VERIFY(qlen(q) > 0);
        qlen(q)--;
@@ -191,14 +196,12 @@ _getq(class_queue_t *q)
        } else if (qsize(q) != 0) {
                qsize(q) = 0;
        }
-
-       return pkt;
 }
 
-static void *
-_getq_flow_or_scidx(class_queue_t *q, u_int32_t val, boolean_t isflowid)
+static void
+_getq_flow_or_scidx(class_queue_t *q, classq_pkt_t *pkt, u_int32_t val,
+    boolean_t isflowid)
 {
-       void *pkt = NULL;
        uint32_t pkt_len;
 
        switch (qptype(q)) {
@@ -217,8 +220,8 @@ _getq_flow_or_scidx(class_queue_t *q, u_int32_t val, boolean_t isflowid)
                                break;
                        }
                }
-               if (m != NULL) {
-                       pkt = m;
+               if (__probable(m != NULL)) {
+                       CLASSQ_PKT_INIT_MBUF(pkt, m);
                        pkt_len = m_length(m);
                }
                break;
@@ -228,9 +231,10 @@ _getq_flow_or_scidx(class_queue_t *q, u_int32_t val, boolean_t isflowid)
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
-       if (pkt != NULL) {
+       if (pkt->cp_mbuf != NULL) {
                VERIFY(qlen(q) > 0);
                qlen(q)--;
 
@@ -241,36 +245,38 @@ _getq_flow_or_scidx(class_queue_t *q, u_int32_t val, boolean_t isflowid)
                        qsize(q) = 0;
                }
        }
-
-       return pkt;
 }
 
 /* get a packet of a specific flow beginning from the head of the queue */
-void *
-_getq_flow(class_queue_t *q, u_int32_t flow)
+void
+_getq_flow(class_queue_t *q, classq_pkt_t *pkt, u_int32_t flow)
 {
-       return _getq_flow_or_scidx(q, flow, TRUE);
+       return _getq_flow_or_scidx(q, pkt, flow, TRUE);
 }
 
 /* Get a packet whose MBUF_SCIDX() < scidx from head of queue */
-void *
-_getq_scidx_lt(class_queue_t *q, u_int32_t scidx)
+void
+_getq_scidx_lt(class_queue_t *q, classq_pkt_t *pkt, u_int32_t scidx)
 {
-       return _getq_flow_or_scidx(q, scidx, FALSE);
+       return _getq_flow_or_scidx(q, pkt, scidx, FALSE);
 }
 
 /* get all packets (chained) starting from the head of the queue */
-void *
-_getq_all(class_queue_t *q, void **last, u_int32_t *qlenp,
-    u_int64_t *qsizep)
+void
+_getq_all(class_queue_t *q, classq_pkt_t *first, classq_pkt_t *last,
+    u_int32_t *qlenp, u_int64_t *qsizep)
 {
-       void *pkt = NULL;
-
        switch (qptype(q)) {
        case QP_MBUF:
-               pkt = MBUFQ_FIRST(&qmbufq(q));
+               first->cp_mbuf = MBUFQ_FIRST(&qmbufq(q));
+               if (__probable(first->cp_mbuf != NULL)) {
+                       CLASSQ_PKT_INIT_MBUF(first, first->cp_mbuf);
+               }
                if (last != NULL) {
-                       *last = MBUFQ_LAST(&qmbufq(q));
+                       last->cp_mbuf = MBUFQ_LAST(&qmbufq(q));
+                       if (__probable(last->cp_mbuf != NULL)) {
+                               CLASSQ_PKT_INIT_MBUF(last, last->cp_mbuf);
+                       }
                }
                MBUFQ_INIT(&qmbufq(q));
                break;
@@ -279,6 +285,7 @@ _getq_all(class_queue_t *q, void **last, u_int32_t *qlenp,
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        if (qlenp != NULL) {
@@ -290,8 +297,6 @@ _getq_all(class_queue_t *q, void **last, u_int32_t *qlenp,
 
        qlen(q) = 0;
        qsize(q) = 0;
-
-       return pkt;
 }
 
 static inline struct mbuf *
@@ -335,22 +340,22 @@ _getq_tail_mbuf(class_queue_t *q)
 }
 
 /* drop a packet at the tail of the queue */
-void *
-_getq_tail(class_queue_t *q)
+void
+_getq_tail(class_queue_t *q, classq_pkt_t *pkt)
 {
-       void *t = NULL;
-
        switch (qptype(q)) {
        case QP_MBUF:
-               t = _getq_tail_mbuf(q);
+               pkt->cp_mbuf = _getq_tail_mbuf(q);
+               if (__probable(pkt->cp_mbuf != NULL)) {
+                       CLASSQ_PKT_INIT_MBUF(pkt, pkt->cp_mbuf);
+               }
                break;
 
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
-
-       return t;
 }
 
 static inline struct mbuf *
@@ -415,22 +420,22 @@ _getq_random_mbuf(class_queue_t *q)
 }
 
 /* randomly select a packet in the queue */
-void *
-_getq_random(class_queue_t *q)
+void
+_getq_random(class_queue_t *q, classq_pkt_t *pkt)
 {
-       void *r = NULL;
-
        switch (qptype(q)) {
        case QP_MBUF:
-               r = _getq_random_mbuf(q);
+               pkt->cp_mbuf = _getq_random_mbuf(q);
+               if (__probable(pkt->cp_mbuf != NULL)) {
+                       CLASSQ_PKT_INIT_MBUF(pkt, pkt->cp_mbuf);
+               }
                break;
 
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
-
-       return r;
 }
 
 static inline void
@@ -445,12 +450,13 @@ _removeq_mbuf(class_queue_t *q, struct mbuf *m)
        }
 
        if (m0 != m) {
-               while (MBUFQ_NEXT(m0) != m) {
-                       if (m0 == NULL) {
-                               return;
-                       }
+               while (m0 != NULL && MBUFQ_NEXT(m0) != m) {
                        m0 = MBUFQ_NEXT(m0);
                }
+               if (m0 == NULL) {
+                       return;
+               }
+
                mtail = &MBUFQ_NEXT(m0);
        } else {
                mtail = &MBUFQ_FIRST(head);
@@ -476,16 +482,18 @@ _removeq_mbuf(class_queue_t *q, struct mbuf *m)
 
 /* remove a packet from the queue */
 void
-_removeq(class_queue_t *q, void *pkt)
+_removeq(class_queue_t *q, classq_pkt_t *pkt)
 {
        switch (qptype(q)) {
        case QP_MBUF:
-               _removeq_mbuf(q, pkt);
+               ASSERT(pkt->cp_ptype == QP_MBUF);
+               _removeq_mbuf(q, pkt->cp_mbuf);
                break;
 
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 }
 
@@ -545,6 +553,7 @@ _flushq_flow_mbuf(class_queue_t *q, u_int32_t flow, u_int32_t *cnt,
        }
 }
 
+
 void
 _flushq_flow(class_queue_t *q, u_int32_t flow, u_int32_t *cnt, u_int32_t *len)
 {
@@ -556,5 +565,6 @@ _flushq_flow(class_queue_t *q, u_int32_t flow, u_int32_t *cnt, u_int32_t *len)
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 }
index 93ded92bc1405cd3247a9c79c38bd81ef436657c..33d64b75d589f5e8f97e7ef0e4d8bddf31279c50 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -80,6 +80,25 @@ typedef enum classq_pkt_type {
        QP_MBUF,        /* mbuf packet */
 } classq_pkt_type_t;
 
+/*
+ * Packet
+ */
+typedef struct classq_pkt {
+       union {
+               struct mbuf             *cp_mbuf;       /* mbuf packet */
+       };
+       classq_pkt_type_t       cp_ptype;
+} classq_pkt_t;
+
+#define CLASSQ_PKT_INITIALIZER(_p)      \
+       (classq_pkt_t){ .cp_mbuf = NULL, .cp_ptype = QP_INVALID }
+
+#define CLASSQ_PKT_INIT_MBUF(_p, _m)    do {    \
+       (_p)->cp_ptype = QP_MBUF;               \
+       (_p)->cp_mbuf = (_m);                   \
+} while (0)
+
+
 /*
  * Packet Queue types
  */
@@ -168,15 +187,17 @@ extern u_int32_t classq_verbose;
 SYSCTL_DECL(_net_classq);
 
 extern void _qinit(class_queue_t *, int, int, classq_pkt_type_t);
-extern void _addq(class_queue_t *, void *);
-extern void _addq_multi(class_queue_t *, void *, void *, u_int32_t, u_int32_t);
-extern void *_getq(class_queue_t *);
-extern void *_getq_all(class_queue_t *, void **, u_int32_t *, u_int64_t *);
-extern void *_getq_tail(class_queue_t *);
-extern void *_getq_random(class_queue_t *);
-extern void *_getq_flow(class_queue_t *, u_int32_t);
-extern void *_getq_scidx_lt(class_queue_t *, u_int32_t);
-extern void _removeq(class_queue_t *, void *);
+extern void _addq(class_queue_t *, classq_pkt_t *);
+extern void _addq_multi(class_queue_t *, classq_pkt_t *, classq_pkt_t *,
+    u_int32_t, u_int32_t);
+extern void _getq(class_queue_t *, classq_pkt_t *);
+extern void _getq_all(class_queue_t *, classq_pkt_t *, classq_pkt_t *,
+    u_int32_t *, u_int64_t *);
+extern void _getq_tail(class_queue_t *, classq_pkt_t *);
+extern void _getq_random(class_queue_t *, classq_pkt_t *);
+extern void _getq_flow(class_queue_t *, classq_pkt_t *, u_int32_t);
+extern void _getq_scidx_lt(class_queue_t *, classq_pkt_t *, u_int32_t);
+extern void _removeq(class_queue_t *, classq_pkt_t *);
 extern void _flushq(class_queue_t *);
 extern void _flushq_flow(class_queue_t *, u_int32_t, u_int32_t *, u_int32_t *);
 
index 75a568d2c521be2ac2063ca5cf58fd98c458151f..912302bebb23fb30daccb46696d700abcd88fcf6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -69,6 +69,7 @@ fq_codel_init(void)
        if (flowq_cache == NULL) {
                panic("%s: failed to allocate flowq_cache", __func__);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 }
 
@@ -130,12 +131,13 @@ void
 fq_head_drop(fq_if_t *fqs, fq_t *fq)
 {
        pktsched_pkt_t pkt;
-       uint32_t *pkt_flags;
+       volatile uint32_t *pkt_flags;
        uint64_t *pkt_timestamp;
        struct ifclassq *ifq = fqs->fqs_ifq;
 
        _PKTSCHED_PKT_INIT(&pkt);
-       if (fq_getq_flow_internal(fqs, fq, &pkt) == NULL) {
+       fq_getq_flow_internal(fqs, fq, &pkt);
+       if (pkt.pktsched_pkt_mbuf == NULL) {
                return;
        }
 
@@ -143,8 +145,14 @@ fq_head_drop(fq_if_t *fqs, fq_t *fq)
            NULL, NULL);
 
        *pkt_timestamp = 0;
-       if (pkt.pktsched_ptype == QP_MBUF) {
+       switch (pkt.pktsched_ptype) {
+       case QP_MBUF:
                *pkt_flags &= ~PKTF_PRIV_GUARDED;
+               break;
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        IFCQ_DROP_ADD(ifq, 1, pktsched_get_pkt_len(&pkt));
@@ -159,17 +167,23 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
        u_int64_t now;
        fq_t *fq = NULL;
        uint64_t *pkt_timestamp;
-       uint32_t *pkt_flags;
+       volatile uint32_t *pkt_flags;
        uint32_t pkt_flowid, pkt_tx_start_seq;
        uint8_t pkt_proto, pkt_flowsrc;
 
        pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid,
            &pkt_flowsrc, &pkt_proto, &pkt_tx_start_seq);
 
-       if (pkt->pktsched_ptype == QP_MBUF) {
+       switch (pkt->pktsched_ptype) {
+       case QP_MBUF:
                /* See comments in <rdar://problem/14040693> */
                VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
                *pkt_flags |= PKTF_PRIV_GUARDED;
+               break;
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        if (*pkt_timestamp > 0) {
@@ -200,9 +214,10 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
                        fc_adv = 1;
                        /*
                         * If the flow is suspended or it is not
-                        * TCP, drop the packet
+                        * TCP/QUIC, drop the packet
                         */
-                       if (pkt_proto != IPPROTO_TCP) {
+                       if ((pkt_proto != IPPROTO_TCP) &&
+                           (pkt_proto != IPPROTO_QUIC)) {
                                droptype = DTYPE_EARLY;
                                fq_cl->fcl_stat.fcl_drop_early++;
                        }
@@ -312,20 +327,21 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
        return ret;
 }
 
-void *
+void
 fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
 {
-       void *p;
+       classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);
        uint32_t plen;
        fq_if_classq_t *fq_cl;
        struct ifclassq *ifq = fqs->fqs_ifq;
 
-       fq_dequeue(fq, p);
-       if (p == NULL) {
-               return NULL;
+       fq_dequeue(fq, &p);
+       if (p.cp_ptype == QP_INVALID) {
+               VERIFY(p.cp_mbuf == NULL);
+               return;
        }
 
-       pktsched_pkt_encap(pkt, fq->fq_ptype, p);
+       pktsched_pkt_encap(pkt, &p);
        plen = pktsched_get_pkt_len(pkt);
 
        VERIFY(fq->fq_bytes >= plen);
@@ -341,24 +357,23 @@ fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
        if (fq_empty(fq)) {
                fq->fq_getqtime = 0;
        }
-
-       return p;
 }
 
-void *
+void
 fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
 {
-       void *p;
        fq_if_classq_t *fq_cl;
        u_int64_t now;
        int64_t qdelay = 0;
        struct timespec now_ts;
-       uint32_t *pkt_flags, pkt_tx_start_seq;
+       volatile uint32_t *pkt_flags;
+       uint32_t pkt_tx_start_seq;
        uint64_t *pkt_timestamp;
 
-       p = fq_getq_flow_internal(fqs, fq, pkt);
-       if (p == NULL) {
-               return NULL;
+       fq_getq_flow_internal(fqs, fq, pkt);
+       if (pkt->pktsched_ptype == QP_INVALID) {
+               VERIFY(pkt->pktsched_pkt_mbuf == NULL);
+               return;
        }
 
        pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
@@ -385,8 +400,6 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
                } else {
                        FQ_CLEAR_DELAY_HIGH(fq);
                }
-
-
                /* Reset measured queue delay and update time */
                fq->fq_updatetime = now + fqs->fqs_update_interval;
                fq->fq_min_qdelay = 0;
@@ -407,9 +420,13 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
        fq_if_is_flow_heavy(fqs, fq);
 
        *pkt_timestamp = 0;
-       if (pkt->pktsched_ptype == QP_MBUF) {
+       switch (pkt->pktsched_ptype) {
+       case QP_MBUF:
                *pkt_flags &= ~PKTF_PRIV_GUARDED;
+               break;
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
-
-       return p;
 }
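
Two things change in the fq_codel path above: flow-controlled flows now exempt QUIC as well as TCP from early drop, since both transports back off on their own, and fq_getq_flow()/fq_getq_flow_internal() report an empty queue through pkt->pktsched_ptype == QP_INVALID instead of a NULL return. A small sketch of the drop test; IPPROTO_QUIC is private to this tree, so the value below is only a placeholder:

#include <netinet/in.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#ifndef IPPROTO_QUIC
#define IPPROTO_QUIC 253    /* placeholder value, not the kernel's definition */
#endif

/* A flow under flow-control advisory keeps only packets from transports
 * that reduce their rate themselves; everything else is dropped early. */
static bool
early_drop(uint8_t pkt_proto, bool flow_controlled)
{
    if (!flow_controlled) {
        return false;
    }
    return (pkt_proto != IPPROTO_TCP) && (pkt_proto != IPPROTO_QUIC);
}

int
main(void)
{
    printf("udp: %d\n", early_drop(IPPROTO_UDP, true));  /* 1: dropped */
    printf("tcp: %d\n", early_drop(IPPROTO_TCP, true));  /* 0: kept   */
    return 0;
}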
index 29b81a9db86222644000795d10f02db5e2624808..582e4a89910bb0835e32f9fbef0844e92a4d7646 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -74,12 +74,13 @@ typedef struct flowq {
 
 #define fq_empty(_q)    MBUFQ_EMPTY(&(_q)->fq_mbufq)
 
-#define fq_enqueue(_q, _p)      MBUFQ_ENQUEUE(&(_q)->fq_mbufq, (mbuf_t)_p)
+#define fq_enqueue(_q, _p)      MBUFQ_ENQUEUE(&(_q)->fq_mbufq, _p.cp_mbuf)
 
 #define fq_dequeue(_q, _p) do {                                         \
-       mbuf_t _m;                                                      \
-       MBUFQ_DEQUEUE(&(_q)->fq_mbufq, _m);                             \
-       (_p) = _m;                                                      \
+       MBUFQ_DEQUEUE(&(_q)->fq_mbufq, (_p)->cp_mbuf);                  \
+       if (__probable((_p)->cp_mbuf != NULL)) {                        \
+               CLASSQ_PKT_INIT_MBUF((_p), (_p)->cp_mbuf);              \
+       }                                                               \
 } while (0)
 
 struct fq_codel_sched_data;
@@ -92,9 +93,9 @@ extern fq_t *fq_alloc(classq_pkt_type_t);
 extern void fq_destroy(fq_t *);
 extern int fq_addq(struct fq_codel_sched_data *, pktsched_pkt_t *,
     struct fq_if_classq *);
-extern void *fq_getq_flow(struct fq_codel_sched_data *, fq_t *,
+extern void fq_getq_flow(struct fq_codel_sched_data *, fq_t *,
     pktsched_pkt_t *);
-extern void *fq_getq_flow_internal(struct fq_codel_sched_data *,
+extern void fq_getq_flow_internal(struct fq_codel_sched_data *,
     fq_t *, pktsched_pkt_t *);
 extern void fq_head_drop(struct fq_codel_sched_data *, fq_t *);
 
index 73f0ca03d89f9d195448b1944d8fb63fb8d8c26f..1e5963e5ae039ab8a95eb32c0816fbe0950948d4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2011-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define DEQUEUE_SPIKE(_new, _old)       \
        ((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11))
 
-#define ABS(v)  (((v) > 0) ? (v) : -(v))
-
 #define SFB_ZONE_MAX            32              /* maximum elements in zone */
 #define SFB_ZONE_NAME           "classq_sfb"    /* zone name */
 
@@ -301,13 +299,13 @@ struct sfb_time_tbl {
 };
 
 static struct sfb_time_tbl sfb_ttbl[] = {
-       {   1 * MBPS, HOLDTIME_BASE * 1000, PBOXTIME_BASE * 1000    },
-       {  10 * MBPS, HOLDTIME_BASE * 100, PBOXTIME_BASE * 100     },
-       { 100 * MBPS, HOLDTIME_BASE * 10, PBOXTIME_BASE * 10      },
-       {   1 * GBPS, HOLDTIME_BASE, PBOXTIME_BASE           },
-       {  10 * GBPS, HOLDTIME_BASE / 10, PBOXTIME_BASE / 10      },
-       { 100 * GBPS, HOLDTIME_BASE / 100, PBOXTIME_BASE / 100     },
-       { 0, 0, 0 }
+       { .speed =   1 * MBPS, .holdtime = HOLDTIME_BASE * 1000, .pboxtime = PBOXTIME_BASE * 1000},
+       { .speed =  10 * MBPS, .holdtime = HOLDTIME_BASE * 100, .pboxtime = PBOXTIME_BASE * 100 },
+       { .speed = 100 * MBPS, .holdtime = HOLDTIME_BASE * 10, .pboxtime = PBOXTIME_BASE * 10  },
+       { .speed =   1 * GBPS, .holdtime = HOLDTIME_BASE, .pboxtime = PBOXTIME_BASE       },
+       { .speed =  10 * GBPS, .holdtime = HOLDTIME_BASE / 10, .pboxtime = PBOXTIME_BASE / 10  },
+       { .speed = 100 * GBPS, .holdtime = HOLDTIME_BASE / 100, .pboxtime = PBOXTIME_BASE / 100 },
+       { .speed = 0, .holdtime = 0, .pboxtime = 0 }
 };
 
 void
@@ -326,7 +324,7 @@ sfb_init(void)
        zone_change(sfb_zone, Z_EXPAND, TRUE);
        zone_change(sfb_zone, Z_CALLERACCT, TRUE);
 
-       sfb_bins_size = sizeof(*((struct sfb *)0)->sfb_bins);
+       sfb_bins_size = sizeof(struct sfb_bins);
        sfb_bins_zone = zinit(sfb_bins_size, SFB_BINS_ZONE_MAX * sfb_bins_size,
            0, SFB_BINS_ZONE_NAME);
        if (sfb_bins_zone == NULL) {
@@ -336,7 +334,7 @@ sfb_init(void)
        zone_change(sfb_bins_zone, Z_EXPAND, TRUE);
        zone_change(sfb_bins_zone, Z_CALLERACCT, TRUE);
 
-       sfb_fcl_size = sizeof(*((struct sfb *)0)->sfb_fc_lists);
+       sfb_fcl_size = sizeof(struct sfb_fcl);
        sfb_fcl_zone = zinit(sfb_fcl_size, SFB_FCL_ZONE_MAX * sfb_fcl_size,
            0, SFB_FCL_ZONE_NAME);
        if (sfb_fcl_zone == NULL) {
@@ -722,7 +720,7 @@ static int
 sfb_penalize(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t *pkt_sfb_flags,
     struct timespec *now)
 {
-       struct timespec delta = { 0, 0 };
+       struct timespec delta = { .tv_sec = 0, .tv_nsec = 0 };
        uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;
 
        /* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */
@@ -1149,7 +1147,7 @@ sfb_addq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt,
        uint16_t *pkt_sfb_hash16;
        uint32_t *pkt_sfb_flags;
        uint32_t pkt_flowid;
-       uint32_t *pkt_flags;
+       volatile uint32_t *pkt_flags;
        uint8_t pkt_proto, pkt_flowsrc;
 
        s = sp->sfb_current;
@@ -1160,10 +1158,16 @@ sfb_addq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt,
        pkt_sfb_hash = pktsched_get_pkt_sfb_vars(pkt, &pkt_sfb_flags);
        pkt_sfb_hash16 = (uint16_t *)pkt_sfb_hash;
 
-       if (pkt->pktsched_ptype == QP_MBUF) {
+       switch (pkt->pktsched_ptype) {
+       case QP_MBUF:
                /* See comments in <rdar://problem/14040693> */
                VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
                *pkt_flags |= PKTF_PRIV_GUARDED;
+               break;
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        if (*pkt_timestamp > 0) {
@@ -1294,7 +1298,7 @@ sfb_addq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt,
        /* if successful enqueue this packet, else drop it */
        if (droptype == DTYPE_NODROP) {
                VERIFY(pkt->pktsched_ptype == qptype(q));
-               _addq(q, pkt->pktsched_pkt);
+               _addq(q, &pkt->pktsched_pkt);
        } else {
                IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
                return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP;
@@ -1316,12 +1320,11 @@ sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge,
     pktsched_pkt_t *pkt)
 {
        struct timespec now;
-       classq_pkt_type_t ptype;
        uint64_t *pkt_timestamp;
-       uint32_t *pkt_flags;
+       volatile uint32_t *pkt_flags;
        uint32_t *pkt_sfb_flags;
        uint32_t *pkt_sfb_hash;
-       void *p;
+       classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);
 
        if (!purge && (sp->sfb_flags & SFBF_SUSPENDED)) {
                return NULL;
@@ -1330,22 +1333,33 @@ sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge,
        nanouptime(&now);
 
        /* flow of 0 means head of queue */
-       if ((p = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) {
+       if (flow == 0) {
+               _getq(q, &p);
+       } else {
+               _getq_flow(q, &p, flow);
+       }
+
+       if (p.cp_ptype == QP_INVALID) {
                if (!purge) {
                        net_timerclear(&sp->sfb_getqtime);
                }
                return NULL;
        }
 
-       ptype = qptype(q);
-       pktsched_pkt_encap(pkt, ptype, p);
+       pktsched_pkt_encap(pkt, &p);
        pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL,
            NULL, NULL, NULL);
        pkt_sfb_hash = pktsched_get_pkt_sfb_vars(pkt, &pkt_sfb_flags);
 
        /* See comments in <rdar://problem/14040693> */
-       if (ptype == QP_MBUF) {
+       switch (p.cp_ptype) {
+       case QP_MBUF:
                VERIFY(*pkt_flags & PKTF_PRIV_GUARDED);
+               break;
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        if (!purge) {
@@ -1424,9 +1438,15 @@ sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge,
                    &now, qsize(q));
        }
 
-       /* See comments in <rdar://problem/14040693> */
-       if (ptype == QP_MBUF) {
+       switch (p.cp_ptype) {
+       case QP_MBUF:
+               /* See comments in <rdar://problem/14040693> */
                *pkt_flags &= ~PKTF_PRIV_GUARDED;
+               break;
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        /*
@@ -1440,7 +1460,7 @@ sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge,
                net_timerclear(&sp->sfb_update_time);
                net_timerclear(&sp->sfb_getqtime);
        }
-       return p;
+       return pkt->pktsched_pkt_mbuf;
 }
 
 void
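
In the SFB hunks the positional initializers for sfb_ttbl, struct timespec, and struct in_addr become designated initializers, the zone element sizes are taken from the named struct types rather than a dereference through a NULL struct sfb pointer, and the pkt_flags pointers become volatile uint32_t *. A short sketch of why the designated form is preferred for tables like sfb_ttbl; the field names are illustrative, matching the diff's shape:

#include <stdint.h>
#include <stdio.h>

#define MBPS (1ULL * 1000 * 1000)
#define HOLDTIME_BASE 10
#define PBOXTIME_BASE 50

struct sfb_time_tbl {
    uint64_t speed;     /* link speed */
    uint64_t holdtime;  /* hold time */
    uint64_t pboxtime;  /* penalty box time */
};

/* Designated initializers keep table rows valid even if the struct's
 * field order changes later; positional rows would silently misassign. */
static const struct sfb_time_tbl ttbl[] = {
    { .speed = 1 * MBPS,  .holdtime = HOLDTIME_BASE * 1000, .pboxtime = PBOXTIME_BASE * 1000 },
    { .speed = 10 * MBPS, .holdtime = HOLDTIME_BASE * 100,  .pboxtime = PBOXTIME_BASE * 100  },
    { .speed = 0,         .holdtime = 0,                    .pboxtime = 0 }, /* terminator */
};

int
main(void)
{
    for (const struct sfb_time_tbl *t = ttbl; t->speed != 0; t++) {
        printf("%llu bps -> hold %llu, pbox %llu\n",
            (unsigned long long)t->speed,
            (unsigned long long)t->holdtime,
            (unsigned long long)t->pboxtime);
    }
    return 0;
}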
index 1256b3c3eaeafa18951dccbf821bae625f01734f..d5af79b4ed3ca4394ffc65e392388e473113ba0c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2011-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 
 static errno_t ifclassq_dequeue_common(struct ifclassq *, mbuf_svc_class_t,
-    u_int32_t, u_int32_t, void **, void **, u_int32_t *, u_int32_t *,
-    boolean_t, classq_pkt_type_t *);
-static void *ifclassq_tbr_dequeue_common(struct ifclassq *, mbuf_svc_class_t,
-    boolean_t, classq_pkt_type_t *);
+    u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *,
+    u_int32_t *, boolean_t);
+static void ifclassq_tbr_dequeue_common(struct ifclassq *, mbuf_svc_class_t,
+    boolean_t, classq_pkt_t *);
 
 static u_int64_t ifclassq_target_qdelay = 0;
 SYSCTL_QUAD(_net_classq, OID_AUTO, target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED,
@@ -141,7 +141,7 @@ ifclassq_teardown(struct ifnet *ifp)
 
        if (IFCQ_IS_READY(ifq)) {
                if (IFCQ_TBR_IS_ENABLED(ifq)) {
-                       struct tb_profile tb = { 0, 0, 0 };
+                       struct tb_profile tb = { .rate = 0, .percent = 0, .depth = 0 };
                        (void) ifclassq_tbr_set(ifq, &tb, FALSE);
                }
                (void) pktsched_teardown(ifq);
@@ -244,20 +244,21 @@ ifclassq_get_len(struct ifclassq *ifq, mbuf_svc_class_t sc, u_int32_t *packets,
        }
        IFCQ_UNLOCK(ifq);
 
+
        return err;
 }
 
 inline void
 ifclassq_set_packet_metadata(struct ifclassq *ifq, struct ifnet *ifp,
-    void *p, classq_pkt_type_t ptype)
+    classq_pkt_t *p)
 {
        if (!IFNET_IS_CELLULAR(ifp)) {
                return;
        }
 
-       switch (ptype) {
+       switch (p->cp_ptype) {
        case QP_MBUF: {
-               struct mbuf *m = p;
+               struct mbuf *m = p->cp_mbuf;
                m->m_pkthdr.pkt_flags |= PKTF_VALID_UNSENT_DATA;
                m->m_pkthdr.bufstatus_if = IFCQ_BYTES(ifq);
                m->m_pkthdr.bufstatus_sndbuf = ifp->if_sndbyte_unsent;
@@ -268,16 +269,16 @@ ifclassq_set_packet_metadata(struct ifclassq *ifq, struct ifnet *ifp,
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 }
 
 errno_t
-ifclassq_enqueue(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
-    boolean_t *pdrop)
+ifclassq_enqueue(struct ifclassq *ifq, classq_pkt_t *p, boolean_t *pdrop)
 {
        errno_t err;
 
-       switch (ptype) {
+       switch (p->cp_ptype) {
        case QP_MBUF:
                IFCQ_LOCK_SPIN(ifq);
                break;
@@ -287,43 +288,41 @@ ifclassq_enqueue(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
                break;
        }
 
-       IFCQ_ENQUEUE(ifq, p, ptype, err, pdrop);
+       IFCQ_ENQUEUE(ifq, p, err, pdrop);
        IFCQ_UNLOCK(ifq);
        return err;
 }
 
 errno_t
 ifclassq_dequeue(struct ifclassq *ifq, u_int32_t pkt_limit,
-    u_int32_t byte_limit, void **head, void **tail,
-    u_int32_t *cnt, u_int32_t *len, classq_pkt_type_t *ptype)
+    u_int32_t byte_limit, classq_pkt_t *head, classq_pkt_t *tail,
+    u_int32_t *cnt, u_int32_t *len)
 {
        return ifclassq_dequeue_common(ifq, MBUF_SC_UNSPEC, pkt_limit,
-                  byte_limit, head, tail, cnt, len, FALSE, ptype);
+                  byte_limit, head, tail, cnt, len, FALSE);
 }
 
 errno_t
 ifclassq_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t sc,
-    u_int32_t pkt_limit, u_int32_t byte_limit, void **head, void **tail,
-    u_int32_t *cnt, u_int32_t *len, classq_pkt_type_t *ptype)
+    u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head,
+    classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len)
 {
        return ifclassq_dequeue_common(ifq, sc, pkt_limit, byte_limit,
-                  head, tail, cnt, len, TRUE, ptype);
+                  head, tail, cnt, len, TRUE);
 }
 
 static errno_t
-ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
-    u_int32_t pkt_limit, u_int32_t byte_limit, void **head,
-    void **tail, u_int32_t *cnt, u_int32_t *len, boolean_t drvmgt,
-    classq_pkt_type_t *ptype)
+ifclassq_dequeue_common_default(struct ifclassq *ifq, mbuf_svc_class_t sc,
+    u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head,
+    classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len, boolean_t drvmgt)
 {
        struct ifnet *ifp = ifq->ifcq_ifp;
        u_int32_t i = 0, l = 0, lock_spin = 1;
-       void **first, *last;
+       classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
+       classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
 
        VERIFY(!drvmgt || MBUF_VALID_SC(sc));
 
-       *ptype = 0;
-
 
        if (IFCQ_TBR_IS_ENABLED(ifq)) {
                goto dequeue_loop;
@@ -342,10 +341,10 @@ ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
                        IFCQ_LOCK(ifq);
                }
                err = ifq->ifcq_dequeue_sc_multi(ifq, sc, pkt_limit,
-                   byte_limit, head, tail, cnt, len, ptype);
+                   byte_limit, head, tail, cnt, len);
                IFCQ_UNLOCK(ifq);
 
-               if (err == 0 && (*head) == NULL) {
+               if (err == 0 && head->cp_mbuf == NULL) {
                        err = EAGAIN;
                }
                return err;
@@ -359,19 +358,16 @@ ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
                }
 
                err = ifq->ifcq_dequeue_multi(ifq, pkt_limit, byte_limit,
-                   head, tail, cnt, len, ptype);
+                   head, tail, cnt, len);
                IFCQ_UNLOCK(ifq);
 
-               if (err == 0 && (*head) == NULL) {
+               if (err == 0 && head->cp_mbuf == NULL) {
                        err = EAGAIN;
                }
                return err;
        }
 
 dequeue_loop:
-       *head = NULL;
-       first = &(*head);
-       last = NULL;
 
        if (lock_spin) {
                IFCQ_LOCK_SPIN(ifq);
@@ -380,42 +376,46 @@ dequeue_loop:
        }
 
        while (i < pkt_limit && l < byte_limit) {
-               classq_pkt_type_t tmp_ptype;
                if (drvmgt) {
                        if (IFCQ_TBR_IS_ENABLED(ifq)) {
-                               IFCQ_TBR_DEQUEUE_SC(ifq, sc, *head, &tmp_ptype);
+                               IFCQ_TBR_DEQUEUE_SC(ifq, sc, head);
                        } else {
-                               IFCQ_DEQUEUE_SC(ifq, sc, *head, &tmp_ptype);
+                               IFCQ_DEQUEUE_SC(ifq, sc, head);
                        }
                } else {
                        if (IFCQ_TBR_IS_ENABLED(ifq)) {
-                               IFCQ_TBR_DEQUEUE(ifq, *head, &tmp_ptype);
+                               IFCQ_TBR_DEQUEUE(ifq, head);
                        } else {
-                               IFCQ_DEQUEUE(ifq, *head, &tmp_ptype);
+                               IFCQ_DEQUEUE(ifq, head);
                        }
                }
 
-               if (*head == NULL) {
+               if (head->cp_mbuf == NULL) {
                        break;
                }
 
-               switch (tmp_ptype) {
+               if (first.cp_mbuf == NULL) {
+                       first = *head;
+               }
+
+               switch (head->cp_ptype) {
                case QP_MBUF:
-                       (*((mbuf_t *)head))->m_nextpkt = NULL;
-                       last = *head;
-                       l += (*((mbuf_t *)head))->m_pkthdr.len;
-                       ifclassq_set_packet_metadata(ifq, ifp, (*head),
-                           QP_MBUF);
-                       head = (void **)&(*((mbuf_t *)head))->m_nextpkt;
+                       head->cp_mbuf->m_nextpkt = NULL;
+                       l += head->cp_mbuf->m_pkthdr.len;
+                       ifclassq_set_packet_metadata(ifq, ifp, head);
+                       if (last.cp_mbuf != NULL) {
+                               last.cp_mbuf->m_nextpkt = head->cp_mbuf;
+                       }
                        break;
 
 
                default:
                        VERIFY(0);
                        /* NOTREACHED */
+                       __builtin_unreachable();
                }
 
-               *ptype = tmp_ptype;
+               last = *head;
                i++;
        }
 
@@ -431,7 +431,17 @@ dequeue_loop:
                *len = l;
        }
 
-       return (*first != NULL) ? 0 : EAGAIN;
+       *head = first;
+       return (first.cp_mbuf != NULL) ? 0 : EAGAIN;
+}
+
+static errno_t
+ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
+    u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head,
+    classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len, boolean_t drvmgt)
+{
+       return ifclassq_dequeue_common_default(ifq, sc,
+                  pkt_limit, byte_limit, head, tail, cnt, len, drvmgt);
 }
 
 void
@@ -570,25 +580,24 @@ ifclassq_ev2str(cqev_t ev)
 #define TBR_SCALE(x)    ((int64_t)(x) << TBR_SHIFT)
 #define TBR_UNSCALE(x)  ((x) >> TBR_SHIFT)
 
-void *
-ifclassq_tbr_dequeue(struct ifclassq *ifq, classq_pkt_type_t *ptype)
+void
+ifclassq_tbr_dequeue(struct ifclassq *ifq, classq_pkt_t *pkt)
 {
-       return ifclassq_tbr_dequeue_common(ifq, MBUF_SC_UNSPEC, FALSE, ptype);
+       ifclassq_tbr_dequeue_common(ifq, MBUF_SC_UNSPEC, FALSE, pkt);
 }
 
-void *
+void
 ifclassq_tbr_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t sc,
-    classq_pkt_type_t *ptype)
+    classq_pkt_t *pkt)
 {
-       return ifclassq_tbr_dequeue_common(ifq, sc, TRUE, ptype);
+       ifclassq_tbr_dequeue_common(ifq, sc, TRUE, pkt);
 }
 
-static void *
+static void
 ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
-    boolean_t drvmgt, classq_pkt_type_t *ptype)
+    boolean_t drvmgt, classq_pkt_t *pkt)
 {
        struct tb_regulator *tbr;
-       void *p;
        int64_t interval;
        u_int64_t now;
 
@@ -597,6 +606,7 @@ ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
        VERIFY(!drvmgt || MBUF_VALID_SC(sc));
        VERIFY(IFCQ_TBR_IS_ENABLED(ifq));
 
+       *pkt = CLASSQ_PKT_INITIALIZER(*pkt);
        tbr = &ifq->ifcq_tbr;
        /* update token only when it is negative */
        if (tbr->tbr_token <= 0) {
@@ -614,7 +624,7 @@ ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
        }
        /* if token is still negative, don't allow dequeue */
        if (tbr->tbr_token <= 0) {
-               return NULL;
+               return;
        }
 
        /*
@@ -622,15 +632,15 @@ ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
         * ifcq_drain count is adjusted by the caller.
         */
        if (drvmgt) {
-               IFCQ_DEQUEUE_SC(ifq, sc, p, ptype);
+               IFCQ_DEQUEUE_SC(ifq, sc, pkt);
        } else {
-               IFCQ_DEQUEUE(ifq, p, ptype);
+               IFCQ_DEQUEUE(ifq, pkt);
        }
 
-       if (p != NULL) {
-               switch (*ptype) {
+       if (pkt->cp_mbuf != NULL) {
+               switch (pkt->cp_ptype) {
                case QP_MBUF:
-                       tbr->tbr_token -= TBR_SCALE(m_pktlen((mbuf_t)p));
+                       tbr->tbr_token -= TBR_SCALE(m_pktlen(pkt->cp_mbuf));
                        break;
 
 
@@ -639,8 +649,6 @@ ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
                        /* NOTREACHED */
                }
        }
-
-       return p;
 }
 
 /*
@@ -678,7 +686,7 @@ ifclassq_tbr_set(struct ifclassq *ifq, struct tb_profile *profile,
 
        if (rate == 0) {
                if (!IFCQ_TBR_IS_ENABLED(ifq)) {
-                       return ENOENT;
+                       return 0;
                }
 
                if (pktsched_verbose) {
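
Above, ifclassq_dequeue_common() is split so the batching loop lives in ifclassq_dequeue_common_default(): packets are chained through m_nextpkt using local first/last classq_pkt_t trackers rather than a void ** cursor, and ifclassq_tbr_set() now treats disabling an already-disabled token-bucket regulator as success (0) rather than ENOENT. A condensed user-space sketch of the chain-building loop, with dequeue_one standing in for IFCQ_DEQUEUE:

#include <stddef.h>
#include <stdio.h>

struct mbuf { struct mbuf *m_nextpkt; int len; };

static struct mbuf *
dequeue_one(struct mbuf **q)            /* stand-in for IFCQ_DEQUEUE */
{
    struct mbuf *m = *q;
    if (m != NULL) {
        *q = m->m_nextpkt;
    }
    return m;
}

static struct mbuf *
dequeue_batch(struct mbuf **q, int pkt_limit, int *cnt)
{
    struct mbuf *first = NULL, *last = NULL, *head;
    int i = 0;

    while (i < pkt_limit && (head = dequeue_one(q)) != NULL) {
        head->m_nextpkt = NULL;
        if (first == NULL) {
            first = head;               /* remember the chain head once */
        }
        if (last != NULL) {
            last->m_nextpkt = head;     /* append to the growing chain */
        }
        last = head;
        i++;
    }
    *cnt = i;
    return first;                       /* head of the dequeued chain */
}

int
main(void)
{
    struct mbuf c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
    struct mbuf *q = &a;
    int cnt = 0;

    for (struct mbuf *m = dequeue_batch(&q, 2, &cnt); m; m = m->m_nextpkt) {
        printf("len=%d\n", m->len);
    }
    printf("cnt=%d\n", cnt);
    return 0;
}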
index 98f019796715c36c7b4b2a5cd47fd27775d71e54..2de9ac9b1ec8ee643009cde3328f50e1c7f3a53f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2011-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -100,16 +100,16 @@ struct ifclassq;
 enum cqdq_op;
 enum cqrq;
 
-typedef int (*ifclassq_enq_func)(struct ifclassq *, void *, classq_pkt_type_t,
+typedef int (*ifclassq_enq_func)(struct ifclassq *, classq_pkt_t *,
     boolean_t *);
-typedef void  *(*ifclassq_deq_func)(struct ifclassq *, classq_pkt_type_t *);
-typedef void *(*ifclassq_deq_sc_func)(struct ifclassq *,
-    mbuf_svc_class_t, classq_pkt_type_t *);
+typedef void  (*ifclassq_deq_func)(struct ifclassq *, classq_pkt_t *);
+typedef void (*ifclassq_deq_sc_func)(struct ifclassq *, mbuf_svc_class_t,
+    classq_pkt_t *);
 typedef int (*ifclassq_deq_multi_func)(struct ifclassq *, u_int32_t,
-    u_int32_t, void **, void **, u_int32_t *, u_int32_t *, classq_pkt_type_t *);
+    u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, u_int32_t *);
 typedef int (*ifclassq_deq_sc_multi_func)(struct ifclassq *,
-    mbuf_svc_class_t, u_int32_t, u_int32_t, void **, void **,
-    u_int32_t *, u_int32_t *, classq_pkt_type_t *);
+    mbuf_svc_class_t, u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *,
+    u_int32_t *, u_int32_t *);
 typedef int (*ifclassq_req_func)(struct ifclassq *, enum cqrq, void *);
 
 /*
@@ -249,24 +249,24 @@ struct if_ifclassq_stats {
 /*
  * For ifclassq operations
  */
-#define IFCQ_ENQUEUE(_ifq, _p, _t, _err, _drop) do {                    \
-       (_err) = (*(_ifq)->ifcq_enqueue)(_ifq, _p, _t, _drop);          \
+#define IFCQ_ENQUEUE(_ifq, _p, _err, _drop) do {                        \
+       (_err) = (*(_ifq)->ifcq_enqueue)(_ifq, _p, _drop);              \
 } while (0)
 
-#define IFCQ_DEQUEUE(_ifq, _p, _t) do {                                 \
-       (_p) = (*(_ifq)->ifcq_dequeue)(_ifq, _t);                       \
+#define IFCQ_DEQUEUE(_ifq, _p) do {                                     \
+       (*(_ifq)->ifcq_dequeue)(_ifq, _p);                              \
 } while (0)
 
-#define IFCQ_DEQUEUE_SC(_ifq, _sc, _p, _t) do {                         \
-       (_p) = (*(_ifq)->ifcq_dequeue_sc)(_ifq, _sc, _t);               \
+#define IFCQ_DEQUEUE_SC(_ifq, _sc, _p) do {                             \
+       (*(_ifq)->ifcq_dequeue_sc)(_ifq, _sc, _p);                      \
 } while (0)
 
-#define IFCQ_TBR_DEQUEUE(_ifcq, _p, _t) do {                            \
-       (_p) = ifclassq_tbr_dequeue(_ifcq, _t);                         \
+#define IFCQ_TBR_DEQUEUE(_ifcq, _p) do {                                \
+       ifclassq_tbr_dequeue(_ifcq, _p);                                \
 } while (0)
 
-#define IFCQ_TBR_DEQUEUE_SC(_ifcq, _sc, _p, _t) do {                    \
-       (_p) = ifclassq_tbr_dequeue_sc(_ifcq, _sc, _t);                 \
+#define IFCQ_TBR_DEQUEUE_SC(_ifcq, _sc, _p) do {                        \
+       ifclassq_tbr_dequeue_sc(_ifcq, _sc, _p);                        \
 } while (0)
 
 #define IFCQ_PURGE(_ifq) do {                                           \
@@ -338,13 +338,12 @@ extern void ifclassq_set_maxlen(struct ifclassq *, u_int32_t);
 extern u_int32_t ifclassq_get_maxlen(struct ifclassq *);
 extern int ifclassq_get_len(struct ifclassq *, mbuf_svc_class_t,
     u_int32_t *, u_int32_t *);
-extern errno_t ifclassq_enqueue(struct ifclassq *, void *, classq_pkt_type_t,
-    boolean_t *);
+extern errno_t ifclassq_enqueue(struct ifclassq *, classq_pkt_t *, boolean_t *);
 extern errno_t ifclassq_dequeue(struct ifclassq *, u_int32_t, u_int32_t,
-    void **, void **, u_int32_t *, u_int32_t *, classq_pkt_type_t *);
+    classq_pkt_t *, classq_pkt_t *, u_int32_t *, u_int32_t *);
 extern errno_t ifclassq_dequeue_sc(struct ifclassq *, mbuf_svc_class_t,
-    u_int32_t, u_int32_t, void **, void **, u_int32_t *, u_int32_t *,
-    classq_pkt_type_t *);
+    u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *,
+    u_int32_t *);
 extern void *ifclassq_poll(struct ifclassq *, classq_pkt_type_t *);
 extern void *ifclassq_poll_sc(struct ifclassq *, mbuf_svc_class_t,
     classq_pkt_type_t *);
@@ -357,14 +356,14 @@ extern int ifclassq_getqstats(struct ifclassq *, u_int32_t,
     void *, u_int32_t *);
 extern const char *ifclassq_ev2str(cqev_t);
 extern int ifclassq_tbr_set(struct ifclassq *, struct tb_profile *, boolean_t);
-extern void *ifclassq_tbr_dequeue(struct ifclassq *, classq_pkt_type_t *);
-extern void *ifclassq_tbr_dequeue_sc(struct ifclassq *, mbuf_svc_class_t,
-    classq_pkt_type_t *);
+extern void ifclassq_tbr_dequeue(struct ifclassq *, classq_pkt_t *);
+extern void ifclassq_tbr_dequeue_sc(struct ifclassq *, mbuf_svc_class_t,
+    classq_pkt_t *);
 extern void ifclassq_calc_target_qdelay(struct ifnet *ifp,
     u_int64_t *if_target_qdelay);
 extern void ifclassq_calc_update_interval(u_int64_t *update_interval);
 extern void ifclassq_set_packet_metadata(struct ifclassq *ifq,
-    struct ifnet *ifp, void *p, classq_pkt_type_t ptype);
+    struct ifnet *ifp, classq_pkt_t *p);
 extern void ifclassq_reap_caches(boolean_t);
 
 #endif /* BSD_KERNEL_PRIVATE */
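
The header-side counterpart: every dequeue callback typedef and IFCQ_* wrapper macro drops the classq_pkt_type_t out-parameter and the packet return value, taking a classq_pkt_t * instead, so the wrapper no longer assigns the result itself. A stand-in sketch of the new callback shape:

#include <stddef.h>
#include <stdio.h>

typedef struct { void *cp_mbuf; int cp_ptype; } classq_pkt_u;   /* stand-in */

struct ifclassq_s {
    void (*ifcq_dequeue)(struct ifclassq_s *, classq_pkt_u *);
};

/* Old shape:  (_p) = (*(_ifq)->ifcq_dequeue)(_ifq, _t);
 * New shape:  the callee fills *(_p); the macro just forwards. */
#define IFCQ_DEQUEUE(_ifq, _p) do {             \
    (*(_ifq)->ifcq_dequeue)(_ifq, _p);          \
} while (0)

static void
demo_dequeue(struct ifclassq_s *ifq, classq_pkt_u *p)
{
    (void)ifq;
    p->cp_mbuf = NULL;      /* empty queue in this demo */
    p->cp_ptype = 0;
}

int
main(void)
{
    struct ifclassq_s ifq = { .ifcq_dequeue = demo_dequeue };
    classq_pkt_u p;

    IFCQ_DEQUEUE(&ifq, &p);
    printf("empty=%d\n", p.cp_mbuf == NULL);
    return 0;
}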
index 49f16a7df30f54396ed78320032930c50bad86bb..62988b66b456454d8a7961a3bf95cb1b8ce5ae16 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
 #include <kern/debug.h>
 
 #include <net/content_filter.h>
+#include <net/content_filter_crypto.h>
 
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <string.h>
 #include <libkern/libkern.h>
 #include <kern/sched_prim.h>
+#include <kern/task.h>
+#include <mach/task_info.h>
 
+#if !TARGET_OS_OSX && !defined(XNU_TARGET_OS_OSX)
 #define MAX_CONTENT_FILTER 2
+#else
+#define MAX_CONTENT_FILTER 8
+#endif
 
 struct cfil_entry;
 
@@ -340,6 +347,8 @@ struct content_filter {
 
        uint32_t                cf_sock_count;
        TAILQ_HEAD(, cfil_entry) cf_sock_entries;
+
+       cfil_crypto_state_t cf_crypto_state;
 };
 
 #define CFF_ACTIVE              0x01
@@ -391,6 +400,7 @@ struct cfil_queue {
  */
 struct cfil_entry {
        TAILQ_ENTRY(cfil_entry) cfe_link;
+       SLIST_ENTRY(cfil_entry) cfe_order_link;
        struct content_filter   *cfe_filter;
 
        struct cfil_info        *cfe_cfil_info;
@@ -452,7 +462,14 @@ struct cfil_info {
        uint32_t                cfi_op_list_ctr;
        uint32_t                cfi_op_time[CFI_MAX_TIME_LOG_ENTRY];    /* time interval in microseconds since first event */
        unsigned char           cfi_op_list[CFI_MAX_TIME_LOG_ENTRY];
+       union sockaddr_in_4_6   cfi_so_attach_faddr;                    /* faddr at the time of attach */
+       union sockaddr_in_4_6   cfi_so_attach_laddr;                    /* laddr at the time of attach */
 
+       int                     cfi_dir;
+       uint64_t                cfi_byte_inbound_count;
+       uint64_t                cfi_byte_outbound_count;
+
+       boolean_t               cfi_isSignatureLatest;                  /* Indicates if signature covers latest flow attributes */
        struct cfi_buf {
                /*
                 * cfi_pending_first and cfi_pending_last describe the total
@@ -479,6 +496,7 @@ struct cfil_info {
 
        struct cfil_entry       cfi_entries[MAX_CONTENT_FILTER];
        struct cfil_hash_entry *cfi_hash_entry;
+       SLIST_HEAD(, cfil_entry) cfi_ordered_entries;
 } __attribute__((aligned(8)));
 
 #define CFIF_DROP               0x0001  /* drop action applied */
@@ -488,12 +506,16 @@ struct cfil_info {
 #define CFIF_RETRY_INJECT_OUT   0x0020  /* inject out failed */
 #define CFIF_SHUT_WR            0x0040  /* shutdown write */
 #define CFIF_SHUT_RD            0x0080  /* shutdown read */
+#define CFIF_SOCKET_CONNECTED   0x0100  /* socket is connected */
+#define CFIF_INITIAL_VERDICT    0x0200  /* received initial verdict */
 
 #define CFI_MASK_GENCNT         0xFFFFFFFF00000000      /* upper 32 bits */
 #define CFI_SHIFT_GENCNT        32
 #define CFI_MASK_FLOWHASH       0x00000000FFFFFFFF      /* lower 32 bits */
 #define CFI_SHIFT_FLOWHASH      0
 
+#define CFI_ENTRY_KCUNIT(i, e) (((e) - &((i)->cfi_entries[0])) + 1)
+
 TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head;
 
 #define CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x)
@@ -505,12 +527,16 @@ TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head;
 LIST_HEAD(cfilhashhead, cfil_hash_entry);
 #define CFILHASHSIZE 16
 #define CFIL_HASH(laddr, faddr, lport, fport) ((faddr) ^ ((laddr) >> 16) ^ (fport) ^ (lport))
-#define IS_UDP(so) (so && so->so_proto->pr_type == SOCK_DGRAM && so->so_proto->pr_protocol == IPPROTO_UDP)
+#define IS_UDP(so) (so && so->so_proto && so->so_proto->pr_type == SOCK_DGRAM && so->so_proto->pr_protocol == IPPROTO_UDP)
 #define UNCONNECTED(inp) (inp && (((inp->inp_vflag & INP_IPV4) && (inp->inp_faddr.s_addr == INADDR_ANY)) || \
                                                                  ((inp->inp_vflag & INP_IPV6) && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))))
 #define IS_ENTRY_ATTACHED(cfil_info, kcunit) (cfil_info != NULL && (kcunit <= MAX_CONTENT_FILTER) && \
                                                                                          cfil_info->cfi_entries[kcunit - 1].cfe_filter != NULL)
 #define IS_DNS(local, remote) (check_port(local, 53) || check_port(remote, 53) || check_port(local, 5353) || check_port(remote, 5353))
+#define IS_INITIAL_TFO_DATA(so) (so && (so->so_flags1 & SOF1_PRECONNECT_DATA) && (so->so_state & SS_ISCONNECTING))
+#define NULLADDRESS(addr) ((addr.sa.sa_len == 0) || \
+                          (addr.sa.sa_family == AF_INET && addr.sin.sin_addr.s_addr == 0) || \
+                          (addr.sa.sa_family == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&addr.sin6.sin6_addr)))
 
 /*
  * UDP Garbage Collection:
@@ -657,6 +683,7 @@ static int cfil_action_data_pass(struct socket *, struct cfil_info *, uint32_t,
     uint64_t, uint64_t);
 static int cfil_action_drop(struct socket *, struct cfil_info *, uint32_t);
 static int cfil_action_bless_client(uint32_t, struct cfil_msg_hdr *);
+static int cfil_action_set_crypto_key(uint32_t, struct cfil_msg_hdr *);
 static int cfil_dispatch_closed_event(struct socket *, struct cfil_info *, int);
 static int cfil_data_common(struct socket *, struct cfil_info *, int, struct sockaddr *,
     struct mbuf *, struct mbuf *, uint32_t);
@@ -666,8 +693,8 @@ static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *,
     struct in_addr, u_int16_t);
 static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *,
     struct in6_addr *, u_int16_t);
-;
-static int cfil_dispatch_attach_event(struct socket *, struct cfil_info *, uint32_t);
+
+static int cfil_dispatch_attach_event(struct socket *, struct cfil_info *, uint32_t, int);
 static void cfil_info_free(struct cfil_info *);
 static struct cfil_info * cfil_info_alloc(struct socket *, struct cfil_hash_entry *);
 static int cfil_info_attach_unit(struct socket *, uint32_t, struct cfil_info *);
@@ -722,6 +749,11 @@ bool cfil_info_buffer_threshold_exceeded(struct cfil_info *);
 struct m_tag *cfil_udp_save_socket_state(struct cfil_info *, struct mbuf *);
 static void cfil_udp_gc_thread_func(void *, wait_result_t);
 static void cfil_info_udp_expire(void *, wait_result_t);
+static bool fill_cfil_hash_entry_from_address(struct cfil_hash_entry *, bool, struct sockaddr *);
+static void cfil_sock_received_verdict(struct socket *so);
+static void cfil_fill_event_msg_addresses(struct cfil_hash_entry *, struct inpcb *,
+    union sockaddr_in_4_6 *, union sockaddr_in_4_6 *,
+    boolean_t, boolean_t);
 
 bool check_port(struct sockaddr *, u_short);
 
@@ -1059,7 +1091,6 @@ cfil_info_buf_verify(struct cfi_buf *cfi_buf)
        CFIL_QUEUE_VERIFY(&cfi_buf->cfi_inject_q);
 
        VERIFY(cfi_buf->cfi_pending_first <= cfi_buf->cfi_pending_last);
-       VERIFY(cfi_buf->cfi_pending_mbcnt >= 0);
 }
 
 static void
@@ -1312,6 +1343,11 @@ release:
        cfil_active_count--;
        cfil_rw_unlock_exclusive(&cfil_lck_rw);
 
+       if (cfc->cf_crypto_state != NULL) {
+               cfil_crypto_cleanup_state(cfc->cf_crypto_state);
+               cfc->cf_crypto_state = NULL;
+       }
+
        zfree(content_filter_zone, cfc);
 done:
        if (error == 0) {
@@ -1569,6 +1605,7 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
        struct cfil_msg_action *action_msg;
        struct cfil_entry *entry;
        struct cfil_info *cfil_info = NULL;
+       unsigned int data_len = 0;
 
        CFIL_LOG(LOG_INFO, "");
 
@@ -1583,9 +1620,15 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
                error = EINVAL;
                goto done;
        }
+       if (m == NULL) {
+               CFIL_LOG(LOG_ERR, "null mbuf");
+               error = EINVAL;
+               goto done;
+       }
+       data_len = m_length(m);
 
-       if (m_length(m) < sizeof(struct cfil_msg_hdr)) {
-               CFIL_LOG(LOG_ERR, "too short %u", m_length(m));
+       if (data_len < sizeof(struct cfil_msg_hdr)) {
+               CFIL_LOG(LOG_ERR, "too short %u", data_len);
                error = EINVAL;
                goto done;
        }
@@ -1600,6 +1643,12 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
                error = EINVAL;
                goto done;
        }
+       if (msghdr->cfm_len > data_len) {
+               CFIL_LOG(LOG_ERR, "bad length %u", msghdr->cfm_len);
+               error = EINVAL;
+               goto done;
+       }
+
        /* Validate action operation */
        switch (msghdr->cfm_op) {
        case CFM_OP_DATA_UPDATE:
@@ -1620,6 +1669,17 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
                }
                error = cfil_action_bless_client(kcunit, msghdr);
                goto done;
+       case CFM_OP_SET_CRYPTO_KEY:
+               if (msghdr->cfm_len != sizeof(struct cfil_msg_set_crypto_key)) {
+                       OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len);
+                       error = EINVAL;
+                       CFIL_LOG(LOG_ERR, "bad len: %u for op %u",
+                           msghdr->cfm_len,
+                           msghdr->cfm_op);
+                       goto done;
+               }
+               error = cfil_action_set_crypto_key(kcunit, msghdr);
+               goto done;
        default:
                OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_op);
                CFIL_LOG(LOG_ERR, "bad op %u", msghdr->cfm_op);
@@ -1699,6 +1759,13 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
                    action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset,
                    action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset);
 #endif
+               /*
+                * Received verdict; at this point we know this
+                * socket connection is allowed.  Unblock thread
+                * immediately before proceeding to process the verdict.
+                */
+               cfil_sock_received_verdict(so);
+
                if (action_msg->cfa_out_peek_offset != 0 ||
                    action_msg->cfa_out_pass_offset != 0) {
                        error = cfil_action_data_pass(so, cfil_info, kcunit, 1,
@@ -1723,7 +1790,15 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
                break;
 
        case CFM_OP_DROP:
+#if VERDICT_DEBUG
+               CFIL_LOG(LOG_ERR, "CFIL: VERDICT DROP RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
+                   (uint64_t)VM_KERNEL_ADDRPERM(so),
+                   cfil_info->cfi_sock_id,
+                   action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset,
+                   action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset);
+#endif
                error = cfil_action_drop(so, cfil_info, kcunit);
+               cfil_sock_received_verdict(so);
                break;
 
        default:
@@ -1852,7 +1927,7 @@ cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
                        fill_ip6_sockaddr_4_6(&sock_info->cfs_local, laddr, lport);
                        fill_ip6_sockaddr_4_6(&sock_info->cfs_remote, faddr, fport);
                } else if (inp->inp_vflag & INP_IPV4) {
-                       struct in_addr laddr = {0}, faddr = {0};
+                       struct in_addr laddr = {.s_addr = 0}, faddr = {.s_addr = 0};
                        u_int16_t lport = 0, fport = 0;
 
                        cfil_get_flow_address(cfil_info->cfi_hash_entry, inp,
@@ -2291,6 +2366,7 @@ cfil_info_alloc(struct socket *so, struct cfil_hash_entry *hash_entry)
        }
 
        TAILQ_INSERT_TAIL(&cfil_sock_head, cfil_info, cfi_link);
+       SLIST_INIT(&cfil_info->cfi_ordered_entries);
 
        cfil_sock_attached_count++;
 
@@ -2323,24 +2399,41 @@ cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit, struct cf
            kcunit++) {
                struct content_filter *cfc = content_filters[kcunit - 1];
                struct cfil_entry *entry;
+               struct cfil_entry *iter_entry;
+               struct cfil_entry *iter_prev;
 
                if (cfc == NULL) {
                        continue;
                }
-               if (cfc->cf_necp_control_unit != filter_control_unit) {
+               if (!(cfc->cf_necp_control_unit & filter_control_unit)) {
                        continue;
                }
 
                entry = &cfil_info->cfi_entries[kcunit - 1];
 
                entry->cfe_filter = cfc;
-               entry->cfe_necp_control_unit = filter_control_unit;
+               entry->cfe_necp_control_unit = cfc->cf_necp_control_unit;
                TAILQ_INSERT_TAIL(&cfc->cf_sock_entries, entry, cfe_link);
                cfc->cf_sock_count++;
+
+               /* Insert the entry into the list ordered by control unit */
+               iter_prev = NULL;
+               SLIST_FOREACH(iter_entry, &cfil_info->cfi_ordered_entries, cfe_order_link) {
+                       if (entry->cfe_necp_control_unit < iter_entry->cfe_necp_control_unit) {
+                               break;
+                       }
+                       iter_prev = iter_entry;
+               }
+
+               if (iter_prev == NULL) {
+                       SLIST_INSERT_HEAD(&cfil_info->cfi_ordered_entries, entry, cfe_order_link);
+               } else {
+                       SLIST_INSERT_AFTER(iter_prev, entry, cfe_order_link);
+               }
+
                verify_content_filter(cfc);
                attached = 1;
                entry->cfe_flags |= CFEF_CFIL_ATTACHED;
-               break;
        }
 
        cfil_rw_unlock_exclusive(&cfil_lck_rw);
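
cfil_info_attach_unit() previously matched a single filter by control-unit equality and stopped at the first attach; above, the test becomes a bitmask intersection so several content filters can attach to one socket, and each attached entry is inserted into a new SLIST kept in ascending control-unit order. A compilable sketch of the ordered insert using the same sys/queue.h macros:

#include <sys/queue.h>
#include <stdint.h>
#include <stdio.h>

struct entry {
    uint32_t unit;                  /* stand-in for cfe_necp_control_unit */
    SLIST_ENTRY(entry) link;
};
SLIST_HEAD(entry_head, entry);

static void
insert_ordered(struct entry_head *head, struct entry *e)
{
    struct entry *iter, *prev = NULL;

    SLIST_FOREACH(iter, head, link) {
        if (e->unit < iter->unit) {
            break;                  /* keep ascending control-unit order */
        }
        prev = iter;
    }
    if (prev == NULL) {
        SLIST_INSERT_HEAD(head, e, link);
    } else {
        SLIST_INSERT_AFTER(prev, e, link);
    }
}

int
main(void)
{
    struct entry_head head = SLIST_HEAD_INITIALIZER(head);
    struct entry a = { .unit = 4 }, b = { .unit = 1 }, c = { .unit = 2 };
    struct entry *e;

    insert_ordered(&head, &a);
    insert_ordered(&head, &b);
    insert_ordered(&head, &c);
    SLIST_FOREACH(e, &head, link) {
        printf("%u\n", e->unit);    /* prints 1 2 4 */
    }
    return 0;
}

Keeping the list ordered is what lets the attach-event dispatch below walk filters lowest control unit first.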
@@ -2417,12 +2510,69 @@ cfil_info_free(struct cfil_info *cfil_info)
        zfree(cfil_info_zone, cfil_info);
 }
 
+/*
+ * Received a verdict from userspace for a socket.
+ * Perform any delayed operation if needed.
+ */
+static void
+cfil_sock_received_verdict(struct socket *so)
+{
+       if (so == NULL || so->so_cfil == NULL) {
+               return;
+       }
+
+       so->so_cfil->cfi_flags |= CFIF_INITIAL_VERDICT;
+
+       /*
+        * If socket has already been connected, trigger
+        * soisconnected now.
+        */
+       if (so->so_cfil->cfi_flags & CFIF_SOCKET_CONNECTED) {
+               so->so_cfil->cfi_flags &= ~CFIF_SOCKET_CONNECTED;
+               soisconnected(so);
+               return;
+       }
+}
+
+/*
+ * Entry point from Sockets layer
+ * The socket is locked.
+ *
+ * Checks if a connected socket is subject to filter and
+ * pending the initial verdict.
+ */
+boolean_t
+cfil_sock_connected_pending_verdict(struct socket *so)
+{
+       if (so == NULL || so->so_cfil == NULL) {
+               return false;
+       }
+
+       if (so->so_cfil->cfi_flags & CFIF_INITIAL_VERDICT) {
+               return false;
+       } else {
+               /*
+                * Remember that this protocol is already connected, so
+                * we will trigger soisconnected() upon receipt of
+                * initial verdict later.
+                */
+               so->so_cfil->cfi_flags |= CFIF_SOCKET_CONNECTED;
+               return true;
+       }
+}
+
+boolean_t
+cfil_filter_present(void)
+{
+       return cfil_active_count > 0;
+}
+
 /*
  * Entry point from Sockets layer
  * The socket is locked.
  */
 errno_t
-cfil_sock_attach(struct socket *so)
+cfil_sock_attach(struct socket *so, struct sockaddr *local, struct sockaddr *remote, int dir)
 {
        errno_t error = 0;
        uint32_t filter_control_unit;
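
The new verdict plumbing defers the connected event: if the transport finishes connecting before the user-space filter's first verdict, cfil_sock_connected_pending_verdict() records CFIF_SOCKET_CONNECTED and suppresses soisconnected(); cfil_sock_received_verdict() replays it once CFIF_INITIAL_VERDICT arrives. A minimal model of the two-flag handshake; the socket struct is a stand-in:

#include <stdbool.h>
#include <stdio.h>

#define CFIF_SOCKET_CONNECTED 0x0100
#define CFIF_INITIAL_VERDICT  0x0200

struct sock { unsigned flags; };

static void
soisconnected(struct sock *so)
{
    (void)so;
    printf("soisconnected\n");
}

/* transport-level connect: hold the event until the first verdict */
static bool
connected_pending_verdict(struct sock *so)
{
    if (so->flags & CFIF_INITIAL_VERDICT) {
        return false;               /* verdict already in, report now */
    }
    so->flags |= CFIF_SOCKET_CONNECTED;
    return true;                    /* suppress for now */
}

/* first verdict from the user-space filter */
static void
received_verdict(struct sock *so)
{
    so->flags |= CFIF_INITIAL_VERDICT;
    if (so->flags & CFIF_SOCKET_CONNECTED) {
        so->flags &= ~CFIF_SOCKET_CONNECTED;
        soisconnected(so);          /* replay the deferred event */
    }
}

int
main(void)
{
    struct sock so = { 0 };

    if (!connected_pending_verdict(&so)) {
        soisconnected(&so);
    }
    received_verdict(&so);          /* prints soisconnected */
    return 0;
}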
@@ -2444,6 +2594,9 @@ cfil_sock_attach(struct socket *so)
                goto done;
        }
 
+       if (filter_control_unit == NECP_FILTER_UNIT_NO_FILTER) {
+               goto done;
+       }
        if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) {
                OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only);
                goto done;
@@ -2462,6 +2615,7 @@ cfil_sock_attach(struct socket *so)
                        OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
                        goto done;
                }
+               so->so_cfil->cfi_dir = dir;
        }
        if (cfil_info_attach_unit(so, filter_control_unit, so->so_cfil) == 0) {
                CFIL_LOG(LOG_ERR, "cfil_info_attach_unit(%u) failed",
@@ -2479,7 +2633,18 @@ cfil_sock_attach(struct socket *so)
        /* Hold a reference on the socket */
        so->so_usecount++;
 
-       error = cfil_dispatch_attach_event(so, so->so_cfil, filter_control_unit);
+       /*
+        * Save passed addresses for attach event msg (in case resend
+        * is needed).
+        */
+       if (remote != NULL) {
+               memcpy(&so->so_cfil->cfi_so_attach_faddr, remote, remote->sa_len);
+       }
+       if (local != NULL) {
+               memcpy(&so->so_cfil->cfi_so_attach_laddr, local, local->sa_len);
+       }
+
+       error = cfil_dispatch_attach_event(so, so->so_cfil, 0, dir);
        /* We can recover from flow control or out of memory errors */
        if (error == ENOBUFS || error == ENOMEM) {
                error = 0;
@@ -2517,14 +2682,215 @@ cfil_sock_detach(struct socket *so)
        return 0;
 }
 
+/*
+ * Fill in the address info of an event message from either
+ * the socket or passed in address info.
+ */
+static void
+cfil_fill_event_msg_addresses(struct cfil_hash_entry *entry, struct inpcb *inp,
+    union sockaddr_in_4_6 *sin_src, union sockaddr_in_4_6 *sin_dst,
+    boolean_t isIPv4, boolean_t outgoing)
+{
+       if (isIPv4) {
+               struct in_addr laddr = {0}, faddr = {0};
+               u_int16_t lport = 0, fport = 0;
+
+               cfil_get_flow_address(entry, inp, &laddr, &faddr, &lport, &fport);
+
+               if (outgoing) {
+                       fill_ip_sockaddr_4_6(sin_src, laddr, lport);
+                       fill_ip_sockaddr_4_6(sin_dst, faddr, fport);
+               } else {
+                       fill_ip_sockaddr_4_6(sin_src, faddr, fport);
+                       fill_ip_sockaddr_4_6(sin_dst, laddr, lport);
+               }
+       } else {
+               struct in6_addr *laddr = NULL, *faddr = NULL;
+               u_int16_t lport = 0, fport = 0;
+
+               cfil_get_flow_address_v6(entry, inp, &laddr, &faddr, &lport, &fport);
+               if (outgoing) {
+                       fill_ip6_sockaddr_4_6(sin_src, laddr, lport);
+                       fill_ip6_sockaddr_4_6(sin_dst, faddr, fport);
+               } else {
+                       fill_ip6_sockaddr_4_6(sin_src, faddr, fport);
+                       fill_ip6_sockaddr_4_6(sin_dst, laddr, lport);
+               }
+       }
+}
+
+static boolean_t
+cfil_dispatch_attach_event_sign(cfil_crypto_state_t crypto_state,
+    struct cfil_info *cfil_info,
+    struct cfil_msg_sock_attached *msg)
+{
+       struct cfil_crypto_data data = {};
+
+       if (crypto_state == NULL || msg == NULL || cfil_info == NULL) {
+               return false;
+       }
+
+       data.sock_id = msg->cfs_msghdr.cfm_sock_id;
+       data.direction = msg->cfs_conn_dir;
+
+       data.pid = msg->cfs_pid;
+       data.effective_pid = msg->cfs_e_pid;
+       uuid_copy(data.uuid, msg->cfs_uuid);
+       uuid_copy(data.effective_uuid, msg->cfs_e_uuid);
+       data.socketProtocol = msg->cfs_sock_protocol;
+       if (data.direction == CFS_CONNECTION_DIR_OUT) {
+               data.remote.sin6 = msg->cfs_dst.sin6;
+               data.local.sin6 = msg->cfs_src.sin6;
+       } else {
+               data.remote.sin6 = msg->cfs_src.sin6;
+               data.local.sin6 = msg->cfs_dst.sin6;
+       }
+
+       // At attach, if local address is already present, no need to re-sign subsequent data messages.
+       if (!NULLADDRESS(data.local)) {
+               cfil_info->cfi_isSignatureLatest = true;
+       }
+
+       msg->cfs_signature_length = sizeof(cfil_crypto_signature);
+       if (cfil_crypto_sign_data(crypto_state, &data, msg->cfs_signature, &msg->cfs_signature_length) != 0) {
+               msg->cfs_signature_length = 0;
+               CFIL_LOG(LOG_ERR, "CFIL: Failed to sign attached msg <sockID %llu>",
+                   msg->cfs_msghdr.cfm_sock_id);
+               return false;
+       }
+
+       return true;
+}
+
+static boolean_t
+cfil_dispatch_data_event_sign(cfil_crypto_state_t crypto_state,
+    struct socket *so, struct cfil_info *cfil_info,
+    struct cfil_msg_data_event *msg)
+{
+       struct cfil_crypto_data data = {};
+
+       if (crypto_state == NULL || msg == NULL ||
+           so == NULL || cfil_info == NULL) {
+               return false;
+       }
+
+       data.sock_id = cfil_info->cfi_sock_id;
+       data.direction = cfil_info->cfi_dir;
+       data.pid = so->last_pid;
+       memcpy(data.uuid, so->last_uuid, sizeof(uuid_t));
+       if (so->so_flags & SOF_DELEGATED) {
+               data.effective_pid = so->e_pid;
+               memcpy(data.effective_uuid, so->e_uuid, sizeof(uuid_t));
+       } else {
+               data.effective_pid = so->last_pid;
+               memcpy(data.effective_uuid, so->last_uuid, sizeof(uuid_t));
+       }
+       data.socketProtocol = so->so_proto->pr_protocol;
+
+       if (data.direction == CFS_CONNECTION_DIR_OUT) {
+               data.remote.sin6 = msg->cfc_dst.sin6;
+               data.local.sin6 = msg->cfc_src.sin6;
+       } else {
+               data.remote.sin6 = msg->cfc_src.sin6;
+               data.local.sin6 = msg->cfc_dst.sin6;
+       }
+
+       // At first data, the local address may show up for the first time; update the
+       // address cache so subsequent data messages no longer need re-signing.
+       if (!NULLADDRESS(data.local)) {
+               memcpy(&cfil_info->cfi_so_attach_laddr, &data.local, data.local.sa.sa_len);
+               cfil_info->cfi_isSignatureLatest = true;
+       }
+
+       msg->cfd_signature_length = sizeof(cfil_crypto_signature);
+       if (cfil_crypto_sign_data(crypto_state, &data, msg->cfd_signature, &msg->cfd_signature_length) != 0) {
+               msg->cfd_signature_length = 0;
+               CFIL_LOG(LOG_ERR, "CFIL: Failed to sign data msg <sockID %llu>",
+                   msg->cfd_msghdr.cfm_sock_id);
+               return false;
+       }
+
+       return true;
+}
+
+static boolean_t
+cfil_dispatch_closed_event_sign(cfil_crypto_state_t crypto_state,
+    struct socket *so, struct cfil_info *cfil_info,
+    struct cfil_msg_sock_closed *msg)
+{
+       struct cfil_crypto_data data = {};
+       struct cfil_hash_entry hash_entry = {};
+       struct cfil_hash_entry *hash_entry_ptr = NULL;
+       struct inpcb *inp = (struct inpcb *)so->so_pcb;
+
+       if (crypto_state == NULL || msg == NULL ||
+           so == NULL || inp == NULL || cfil_info == NULL) {
+               return false;
+       }
+
+       data.sock_id = cfil_info->cfi_sock_id;
+       data.direction = cfil_info->cfi_dir;
+
+       data.pid = so->last_pid;
+       memcpy(data.uuid, so->last_uuid, sizeof(uuid_t));
+       if (so->so_flags & SOF_DELEGATED) {
+               data.effective_pid = so->e_pid;
+               memcpy(data.effective_uuid, so->e_uuid, sizeof(uuid_t));
+       } else {
+               data.effective_pid = so->last_pid;
+               memcpy(data.effective_uuid, so->last_uuid, sizeof(uuid_t));
+       }
+       data.socketProtocol = so->so_proto->pr_protocol;
+
+       /*
+        * Fill in address info:
+        * For UDP, use the cfil_info hash entry directly.
+        * For TCP, compose a hash entry from the saved addresses.
+        */
+       if (cfil_info->cfi_hash_entry != NULL) {
+               hash_entry_ptr = cfil_info->cfi_hash_entry;
+       } else if (cfil_info->cfi_so_attach_faddr.sa.sa_len > 0 ||
+           cfil_info->cfi_so_attach_laddr.sa.sa_len > 0) {
+               fill_cfil_hash_entry_from_address(&hash_entry, TRUE, &cfil_info->cfi_so_attach_laddr.sa);
+               fill_cfil_hash_entry_from_address(&hash_entry, FALSE, &cfil_info->cfi_so_attach_faddr.sa);
+               hash_entry_ptr = &hash_entry;
+       }
+       if (hash_entry_ptr != NULL) {
+               boolean_t outgoing = (cfil_info->cfi_dir == CFS_CONNECTION_DIR_OUT);
+               union sockaddr_in_4_6 *src = outgoing ? &data.local : &data.remote;
+               union sockaddr_in_4_6 *dst = outgoing ? &data.remote : &data.local;
+               cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, inp->inp_vflag & INP_IPV4, outgoing);
+       }
+
+       data.byte_count_in = cfil_info->cfi_byte_inbound_count;
+       data.byte_count_out = cfil_info->cfi_byte_outbound_count;
+
+       msg->cfc_signature_length = sizeof(cfil_crypto_signature);
+       if (cfil_crypto_sign_data(crypto_state, &data, msg->cfc_signature, &msg->cfc_signature_length) != 0) {
+               msg->cfc_signature_length = 0;
+               CFIL_LOG(LOG_ERR, "CFIL: Failed to sign closed msg <sockID %llu>",
+                   msg->cfc_msghdr.cfm_sock_id);
+               return false;
+       }
+
+       return true;
+}
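
The attach, data, and closed signers above populate the credential fields of struct cfil_crypto_data identically. A minimal sketch of that shared step, factored into a hypothetical helper (not part of this change), makes the pattern explicit:

    static void
    cfil_crypto_data_fill_identity(struct cfil_crypto_data *data, struct socket *so)
    {
            /* Identity of the last process to use the socket */
            data->pid = so->last_pid;
            memcpy(data->uuid, so->last_uuid, sizeof(uuid_t));

            /* For delegated sockets, record the delegate as the effective
             * identity; otherwise it matches the last user. */
            if (so->so_flags & SOF_DELEGATED) {
                    data->effective_pid = so->e_pid;
                    memcpy(data->effective_uuid, so->e_uuid, sizeof(uuid_t));
            } else {
                    data->effective_pid = so->last_pid;
                    memcpy(data->effective_uuid, so->last_uuid, sizeof(uuid_t));
            }
            data->socketProtocol = so->so_proto->pr_protocol;
    }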
+
 static int
-cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint32_t filter_control_unit)
+cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info,
+    uint32_t kcunit, int conn_dir)
 {
        errno_t error = 0;
        struct cfil_entry *entry = NULL;
        struct cfil_msg_sock_attached msg_attached;
-       uint32_t kcunit;
        struct content_filter *cfc = NULL;
+       struct inpcb *inp = (struct inpcb *)so->so_pcb;
+       struct cfil_hash_entry *hash_entry_ptr = NULL;
+       struct cfil_hash_entry hash_entry;
+
+       memset(&hash_entry, 0, sizeof(struct cfil_hash_entry));
+       proc_t p = PROC_NULL;
+       task_t t = TASK_NULL;
 
        socket_lock_assert_owned(so);
 
@@ -2534,29 +2900,19 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint3
                error = EINVAL;
                goto done;
        }
-       /*
-        * Find the matching filter unit
-        */
-       for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
-               cfc = content_filters[kcunit - 1];
 
-               if (cfc == NULL) {
-                       continue;
-               }
-               if (cfc->cf_necp_control_unit != filter_control_unit) {
-                       continue;
-               }
+       if (kcunit == 0) {
+               entry = SLIST_FIRST(&cfil_info->cfi_ordered_entries);
+       } else {
                entry = &cfil_info->cfi_entries[kcunit - 1];
-               if (entry->cfe_filter == NULL) {
-                       continue;
-               }
-
-               VERIFY(cfc == entry->cfe_filter);
+       }
 
-               break;
+       if (entry == NULL) {
+               goto done;
        }
 
-       if (entry == NULL || entry->cfe_filter == NULL) {
+       cfc = entry->cfe_filter;
+       if (cfc == NULL) {
                goto done;
        }
 
@@ -2564,8 +2920,12 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint3
                goto done;
        }
 
+       if (kcunit == 0) {
+               kcunit = CFI_ENTRY_KCUNIT(cfil_info, entry);
+       }
+
        CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u kcunit %u",
-           (uint64_t)VM_KERNEL_ADDRPERM(so), filter_control_unit, kcunit);
+           (uint64_t)VM_KERNEL_ADDRPERM(so), entry->cfe_necp_control_unit, kcunit);
 
        /* Would be wasteful to try when flow controlled */
        if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
@@ -2593,6 +2953,46 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint3
                memcpy(msg_attached.cfs_e_uuid, so->last_uuid, sizeof(uuid_t));
        }
 
+       /*
+        * Fill in address info:
+        * For UDP, use the cfil_info hash entry directly.
+        * For TCP, compose a hash entry from the saved addresses.
+        */
+       if (cfil_info->cfi_hash_entry != NULL) {
+               hash_entry_ptr = cfil_info->cfi_hash_entry;
+       } else if (cfil_info->cfi_so_attach_faddr.sa.sa_len > 0 ||
+           cfil_info->cfi_so_attach_laddr.sa.sa_len > 0) {
+               fill_cfil_hash_entry_from_address(&hash_entry, TRUE, &cfil_info->cfi_so_attach_laddr.sa);
+               fill_cfil_hash_entry_from_address(&hash_entry, FALSE, &cfil_info->cfi_so_attach_faddr.sa);
+               hash_entry_ptr = &hash_entry;
+       }
+       if (hash_entry_ptr != NULL) {
+               cfil_fill_event_msg_addresses(hash_entry_ptr, inp,
+                   &msg_attached.cfs_src, &msg_attached.cfs_dst,
+                   inp->inp_vflag & INP_IPV4, conn_dir == CFS_CONNECTION_DIR_OUT);
+       }
+       msg_attached.cfs_conn_dir = conn_dir;
+
+       if (msg_attached.cfs_e_pid != 0) {
+               p = proc_find(msg_attached.cfs_e_pid);
+               if (p != PROC_NULL) {
+                       t = proc_task(p);
+                       if (t != TASK_NULL) {
+                               audit_token_t audit_token;
+                               mach_msg_type_number_t count = TASK_AUDIT_TOKEN_COUNT;
+                               if (task_info(t, TASK_AUDIT_TOKEN, (task_info_t)&audit_token, &count) == KERN_SUCCESS) {
+                                       memcpy(&msg_attached.cfs_audit_token, &audit_token, sizeof(msg_attached.cfs_audit_token));
+                               } else {
+                                       CFIL_LOG(LOG_ERR, "CFIL: Failed to get process audit token <sockID %llu> ",
+                                           entry->cfe_cfil_info->cfi_sock_id);
+                               }
+                       }
+                       proc_rele(p);
+               }
+       }
+
+       cfil_dispatch_attach_event_sign(entry->cfe_filter->cf_crypto_state, cfil_info, &msg_attached);
+
 #if LIFECYCLE_DEBUG
        CFIL_LOG(LOG_DEBUG, "CFIL: LIFECYCLE: SENDING ATTACH UP <sockID %llu> ",
            entry->cfe_cfil_info->cfi_sock_id);
@@ -2800,6 +3200,10 @@ cfil_dispatch_closed_event(struct socket *so, struct cfil_info *cfil_info, int k
        memcpy(msg_closed.cfc_op_time, cfil_info->cfi_op_time, sizeof(uint32_t) * CFI_MAX_TIME_LOG_ENTRY);
        memcpy(msg_closed.cfc_op_list, cfil_info->cfi_op_list, sizeof(unsigned char) * CFI_MAX_TIME_LOG_ENTRY);
        msg_closed.cfc_op_list_ctr = cfil_info->cfi_op_list_ctr;
+       msg_closed.cfc_byte_inbound_count = cfil_info->cfi_byte_inbound_count;
+       msg_closed.cfc_byte_outbound_count = cfil_info->cfi_byte_outbound_count;
+
+       cfil_dispatch_closed_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, &msg_closed);
 
 #if LIFECYCLE_DEBUG
        CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: SENDING CLOSED UP: <sock id %llu> op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, cfil_info->cfi_op_list_ctr, cfil_info->cfi_first_event.tv_sec, cfil_info->cfi_first_event.tv_usec);
@@ -2998,37 +3402,16 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_
        data_req->cfd_end_offset = entrybuf->cfe_peeked + copylen;
 
        /*
-        * TBD:
+        * Copy address/port into event msg.
         * For non-connected sockets, the addresses must be copied from the
         * passed-in parameters.
         */
-       if (inp->inp_vflag & INP_IPV6) {
-               struct in6_addr *laddr = NULL, *faddr = NULL;
-               u_int16_t lport = 0, fport = 0;
+       cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp,
+           &data_req->cfc_src, &data_req->cfc_dst,
+           inp->inp_vflag & INP_IPV4, outgoing);
 
-               cfil_get_flow_address_v6(cfil_info->cfi_hash_entry, inp,
-                   &laddr, &faddr, &lport, &fport);
-               if (outgoing) {
-                       fill_ip6_sockaddr_4_6(&data_req->cfc_src, laddr, lport);
-                       fill_ip6_sockaddr_4_6(&data_req->cfc_dst, faddr, fport);
-               } else {
-                       fill_ip6_sockaddr_4_6(&data_req->cfc_src, faddr, fport);
-                       fill_ip6_sockaddr_4_6(&data_req->cfc_dst, laddr, lport);
-               }
-       } else if (inp->inp_vflag & INP_IPV4) {
-               struct in_addr laddr = {0}, faddr = {0};
-               u_int16_t lport = 0, fport = 0;
-
-               cfil_get_flow_address(cfil_info->cfi_hash_entry, inp,
-                   &laddr, &faddr, &lport, &fport);
-
-               if (outgoing) {
-                       fill_ip_sockaddr_4_6(&data_req->cfc_src, laddr, lport);
-                       fill_ip_sockaddr_4_6(&data_req->cfc_dst, faddr, fport);
-               } else {
-                       fill_ip_sockaddr_4_6(&data_req->cfc_src, faddr, fport);
-                       fill_ip_sockaddr_4_6(&data_req->cfc_dst, laddr, lport);
-               }
+       if (cfil_info->cfi_isSignatureLatest == false) {
+               cfil_dispatch_data_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, data_req);
        }
 
        microuptime(&tv);
@@ -3105,7 +3488,8 @@ cfil_data_service_ctl_q(struct socket *so, struct cfil_info *cfil_info, uint32_t
 
        /* Send attached message if not yet done */
        if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
-               error = cfil_dispatch_attach_event(so, cfil_info, kcunit);
+               error = cfil_dispatch_attach_event(so, cfil_info, CFI_ENTRY_KCUNIT(cfil_info, entry),
+                   outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN);
                if (error != 0) {
                        /* We can recover from flow control */
                        if (error == ENOBUFS || error == ENOMEM) {
@@ -3566,6 +3950,7 @@ cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint3
         */
        curlen = 0;
        while ((data = cfil_queue_first(pending_q)) != NULL) {
+               struct cfil_entry *iter_entry;
                datalen = cfil_data_length(data, NULL, NULL);
 
 #if DATA_DEBUG
@@ -3583,10 +3968,10 @@ cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint3
 
                curlen += datalen;
 
-               for (kcunit += 1;
-                   kcunit <= MAX_CONTENT_FILTER;
-                   kcunit++) {
-                       error = cfil_data_filter(so, cfil_info, kcunit, outgoing,
+               for (iter_entry = SLIST_NEXT(entry, cfe_order_link);
+                   iter_entry != NULL;
+                   iter_entry = SLIST_NEXT(iter_entry, cfe_order_link)) {
+                       error = cfil_data_filter(so, cfil_info, CFI_ENTRY_KCUNIT(cfil_info, iter_entry), outgoing,
                            data, datalen);
                        /* 0 means passed so we can continue */
                        if (error != 0) {
@@ -3967,6 +4352,7 @@ cfil_action_bless_client(uint32_t kcunit, struct cfil_msg_hdr *msghdr)
                                    cfil_info->cfi_sock_id);
                        }
 #endif
+                       cfil_sock_received_verdict(so);
                        (void)cfil_action_data_pass(so, cfil_info, kcunit, 1, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
                        (void)cfil_action_data_pass(so, cfil_info, kcunit, 0, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
                } else {
@@ -3978,6 +4364,51 @@ cfil_action_bless_client(uint32_t kcunit, struct cfil_msg_hdr *msghdr)
        return error;
 }
 
+int
+cfil_action_set_crypto_key(uint32_t kcunit, struct cfil_msg_hdr *msghdr)
+{
+       struct content_filter *cfc = NULL;
+       cfil_crypto_state_t crypto_state = NULL;
+       struct cfil_msg_set_crypto_key *keymsg = (struct cfil_msg_set_crypto_key *)msghdr;
+
+       CFIL_LOG(LOG_NOTICE, "");
+
+       if (content_filters == NULL) {
+               CFIL_LOG(LOG_ERR, "no content filter");
+               return EINVAL;
+       }
+       if (kcunit > MAX_CONTENT_FILTER) {
+               CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
+                   kcunit, MAX_CONTENT_FILTER);
+               return EINVAL;
+       }
+       crypto_state = cfil_crypto_init_client((uint8_t *)keymsg->crypto_key);
+       if (crypto_state == NULL) {
+               CFIL_LOG(LOG_ERR, "failed to initialize crypto state for unit %u)",
+                   kcunit);
+               return EINVAL;
+       }
+
+       cfil_rw_lock_exclusive(&cfil_lck_rw);
+
+       cfc = content_filters[kcunit - 1];
+       if (cfc == NULL || cfc->cf_kcunit != kcunit) {
+               CFIL_LOG(LOG_ERR, "bad unit info %u",
+                   kcunit);
+               cfil_rw_unlock_exclusive(&cfil_lck_rw);
+               cfil_crypto_cleanup_state(crypto_state);
+               return EINVAL;
+       }
+       if (cfc->cf_crypto_state != NULL) {
+               cfil_crypto_cleanup_state(cfc->cf_crypto_state);
+               cfc->cf_crypto_state = NULL;
+       }
+       cfc->cf_crypto_state = crypto_state;
+
+       cfil_rw_unlock_exclusive(&cfil_lck_rw);
+       return 0;
+}
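
For context, a hedged sketch of the filter-agent side of this exchange: after attaching to the content-filter kernel control, the agent installs its signing key by sending a CFM_OP_SET_CRYPTO_KEY action message, which arrives in cfil_action_set_crypto_key() above. The cfm_* header fields and the control name are assumptions based on <net/content_filter.h>; this illustrates the message flow, not the actual NetworkExtension implementation.

    #include <stdint.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <net/content_filter.h>

    static int
    send_set_crypto_key(int cfil_fd, const uint8_t *key /* sizeof(cfil_crypto_key) == 32 bytes */)
    {
            struct cfil_msg_set_crypto_key msg;

            memset(&msg, 0, sizeof(msg));
            msg.cfb_msghdr.cfm_len = (uint32_t)sizeof(msg);
            msg.cfb_msghdr.cfm_version = CFM_VERSION_CURRENT;
            msg.cfb_msghdr.cfm_type = CFM_TYPE_ACTION;
            msg.cfb_msghdr.cfm_op = CFM_OP_SET_CRYPTO_KEY;
            memcpy(msg.crypto_key, key, sizeof(msg.crypto_key));

            /* cfil_fd: a connected PF_SYSTEM/SYSPROTO_CONTROL socket for the
             * content-filter kernel control (assumed "com.apple.content-filter"). */
            return send(cfil_fd, &msg, sizeof(msg), 0) == (ssize_t)sizeof(msg) ? 0 : -1;
    }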
+
 static int
 cfil_update_entry_offsets(struct socket *so, struct cfil_info *cfil_info, int outgoing, unsigned int datalen)
 {
@@ -4047,8 +4478,10 @@ cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, s
 
        if (outgoing) {
                cfi_buf = &cfil_info->cfi_snd;
+               cfil_info->cfi_byte_outbound_count += datalen;
        } else {
                cfi_buf = &cfil_info->cfi_rcv;
+               cfil_info->cfi_byte_inbound_count += datalen;
        }
 
        cfi_buf->cfi_pending_last += datalen;
@@ -4085,10 +4518,12 @@ cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, s
                CFIL_LOG(LOG_DEBUG, "CFIL: QUEUEING DATA: FAST PATH");
 #endif
        } else {
-               for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
+               struct cfil_entry *iter_entry;
+               SLIST_FOREACH(iter_entry, &cfil_info->cfi_ordered_entries, cfe_order_link) {
                        // Is cfil attached to this filter?
+                       kcunit = CFI_ENTRY_KCUNIT(cfil_info, iter_entry);
                        if (IS_ENTRY_ATTACHED(cfil_info, kcunit)) {
-                               if (IS_UDP(so)) {
+                               if (IS_UDP(so) && chain == NULL) {
                                        /* UDP only:
                                         * Chain addr (incoming only TDB), control (optional) and data into one chain.
                                         * This full chain will be reinjected into socket after recieving verdict.
@@ -4140,6 +4575,13 @@ cfil_sock_data_out(struct socket *so, struct sockaddr  *to,
                return 0;
        }
 
+       /*
+        * Pass initial data for TFO.
+        */
+       if (IS_INITIAL_TFO_DATA(so)) {
+               return 0;
+       }
+
        socket_lock_assert_owned(so);
 
        if (so->so_cfil->cfi_flags & CFIF_DROP) {
@@ -4188,6 +4630,13 @@ cfil_sock_data_in(struct socket *so, struct sockaddr *from,
                return 0;
        }
 
+       /*
+        * Pass initial data for TFO.
+        */
+       if (IS_INITIAL_TFO_DATA(so)) {
+               return 0;
+       }
+
        socket_lock_assert_owned(so);
 
        if (so->so_cfil->cfi_flags & CFIF_DROP) {
@@ -5311,7 +5760,7 @@ cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id)
 
        if (db == NULL || id == 0) {
                CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> NULL DB <id %llu>",
-                   (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), id);
+                   db ? (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so) : 0, id);
                return NULL;
        }
 
@@ -5331,7 +5780,6 @@ cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id)
 struct cfil_hash_entry *
 cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote)
 {
-#pragma unused(so, filter_control_unit, outgoing, local, remote)
        struct cfil_hash_entry *hash_entry = NULL;
 
        errno_t error = 0;
@@ -5364,6 +5812,7 @@ cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool out
                OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
                return NULL;
        }
+       hash_entry->cfentry_cfil->cfi_dir = outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN;
 
 #if LIFECYCLE_DEBUG
        cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED");
@@ -5387,7 +5836,8 @@ cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool out
        /* Hold a reference on the socket for each flow */
        so->so_usecount++;
 
-       error = cfil_dispatch_attach_event(so, hash_entry->cfentry_cfil, filter_control_unit);
+       error = cfil_dispatch_attach_event(so, hash_entry->cfentry_cfil, 0,
+           outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN);
        /* We can recover from flow control or out of memory errors */
        if (error != 0 && error != ENOBUFS && error != ENOMEM) {
                return NULL;
@@ -5416,12 +5866,21 @@ cfil_sock_udp_handle_data(bool outgoing, struct socket *so,
                return error;
        }
 
+       // Socket has been blessed
+       if ((so->so_flags1 & SOF1_CONTENT_FILTER_SKIP) != 0) {
+               return error;
+       }
+
        filter_control_unit = necp_socket_get_content_filter_control_unit(so);
        if (filter_control_unit == 0) {
                CFIL_LOG(LOG_DEBUG, "CFIL: UDP failed to get control unit");
                return error;
        }
 
+       if (filter_control_unit == NECP_FILTER_UNIT_NO_FILTER) {
+               return error;
+       }
+
        if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) {
                CFIL_LOG(LOG_DEBUG, "CFIL: UDP user space only");
                OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only);
index 6af66bb7034838d5abf74c8f90449b64ce38a8f0..b4f4485c53477cb12f7eb74cf3bf2e8ce45baf38 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -31,6 +31,7 @@
 #include <sys/syslog.h>
 #include <netinet/in.h>
 #include <stdint.h>
+#include <corecrypto/ccsha2.h>
 
 #ifdef BSD_KERNEL_PRIVATE
 #include <sys/mbuf.h>
@@ -91,7 +92,42 @@ struct cfil_opt_sock_info {
 /*
  * How many filter may be active simultaneously
  */
+#if !TARGET_OS_OSX && !defined(XNU_TARGET_OS_OSX)
 #define CFIL_MAX_FILTER_COUNT   2
+#else
+#define CFIL_MAX_FILTER_COUNT   8
+#endif
+
+
+/*
+ * Crypto Support
+ */
+#define CFIL_CRYPTO 1
+#define CFIL_CRYPTO_SIGNATURE_SIZE 32
+#define CFIL_CRYPTO_DATA_EVENT 1
+
+typedef uint8_t cfil_crypto_key[CCSHA256_OUTPUT_SIZE];
+typedef uint8_t cfil_crypto_signature[CFIL_CRYPTO_SIGNATURE_SIZE];
+
+typedef struct cfil_crypto_state {
+       const struct ccdigest_info *digest_info;
+       cfil_crypto_key key;
+} *cfil_crypto_state_t;
+
+typedef struct cfil_crypto_data {
+       uuid_t flow_id;
+       u_int64_t sock_id;
+       u_int32_t direction;
+       union sockaddr_in_4_6 remote;
+       union sockaddr_in_4_6 local;
+       u_int32_t socketProtocol;
+       pid_t pid;
+       pid_t effective_pid;
+       uuid_t uuid;
+       uuid_t effective_uuid;
+       u_int64_t byte_count_in;
+       u_int64_t byte_count_out;
+} *cfil_crypto_data_t;
 
 /*
  * Types of messages
@@ -120,6 +156,7 @@ struct cfil_opt_sock_info {
 #define CFM_OP_DATA_UPDATE 16           /* update pass or peek offsets */
 #define CFM_OP_DROP 17                  /* shutdown socket, no more data */
 #define CFM_OP_BLESS_CLIENT 18          /* mark a client flow as already filtered, passes a uuid */
+#define CFM_OP_SET_CRYPTO_KEY 19        /* assign client crypto key for message signing */
 
 /*
  * struct cfil_msg_hdr
@@ -136,6 +173,14 @@ struct cfil_msg_hdr {
 
 #define CFM_VERSION_CURRENT 1
 
+/*
+ * Connection Direction
+ */
+#define CFS_CONNECTION_DIR_IN  0
+#define CFS_CONNECTION_DIR_OUT 1
+
+#define CFS_AUDIT_TOKEN            1
+
 /*
  * struct cfil_msg_sock_attached
  *
@@ -158,6 +203,12 @@ struct cfil_msg_sock_attached {
        pid_t                   cfs_e_pid;
        uuid_t                  cfs_uuid;
        uuid_t                  cfs_e_uuid;
+       union sockaddr_in_4_6   cfs_src;
+       union sockaddr_in_4_6   cfs_dst;
+       int                     cfs_conn_dir;
+       unsigned int            cfs_audit_token[8];             /* Must match audit_token_t */
+       cfil_crypto_signature   cfs_signature;
+       uint32_t                cfs_signature_length;
 };
 
 /*
@@ -181,6 +232,8 @@ struct cfil_msg_data_event {
        union sockaddr_in_4_6   cfc_dst;
        uint64_t                cfd_start_offset;
        uint64_t                cfd_end_offset;
+       cfil_crypto_signature   cfd_signature;
+       uint32_t                cfd_signature_length;
        /* Actual content data immediately follows */
 };
 
@@ -203,6 +256,10 @@ struct cfil_msg_sock_closed {
        uint32_t                cfc_op_list_ctr;
        uint32_t                cfc_op_time[CFI_MAX_TIME_LOG_ENTRY];    /* time interval in microseconds since first event */
        unsigned char           cfc_op_list[CFI_MAX_TIME_LOG_ENTRY];
+       uint64_t                cfc_byte_inbound_count;
+       uint64_t                cfc_byte_outbound_count;
+       cfil_crypto_signature   cfc_signature;
+       uint32_t                cfc_signature_length;
 } __attribute__((aligned(8)));
 
 /*
@@ -244,6 +301,20 @@ struct cfil_msg_bless_client {
        uuid_t cfb_client_uuid;
 };
 
+/*
+ * struct cfil_msg_set_crypto_key
+ *
+ * Filter assigning client crypto key to CFIL for message signing
+ *
+ * Valid Type: CFM_TYPE_ACTION
+ *
+ * Valid Ops: CFM_OP_SET_CRYPTO_KEY
+ */
+struct cfil_msg_set_crypto_key {
+       struct cfil_msg_hdr     cfb_msghdr;
+       cfil_crypto_key         crypto_key;
+};
+
 #define CFM_MAX_OFFSET  UINT64_MAX
 
 /*
@@ -400,7 +471,10 @@ do { \
 
 extern void cfil_init(void);
 
-extern errno_t cfil_sock_attach(struct socket *so);
+extern boolean_t cfil_filter_present(void);
+extern boolean_t cfil_sock_connected_pending_verdict(struct socket *so);
+extern errno_t cfil_sock_attach(struct socket *so,
+    struct sockaddr *local, struct sockaddr *remote, int dir);
 extern errno_t cfil_sock_detach(struct socket *so);
 
 extern int cfil_sock_data_out(struct socket *so, struct sockaddr  *to,
diff --git a/bsd/net/content_filter_crypto.c b/bsd/net/content_filter_crypto.c
new file mode 100644 (file)
index 0000000..a0d8e64
--- /dev/null
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2019 Apple Inc.
+ * All rights reserved.
+ */
+
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <corecrypto/cchmac.h>
+#include <net/content_filter.h>
+#include <net/content_filter_crypto.h>
+
+extern int cfil_log_level;
+
+#define CFIL_CRYPTO_LOG(level, fmt, ...) \
+do { \
+    if (cfil_log_level >= level) \
+       printf("%s:%d " fmt "\n",\
+           __FUNCTION__, __LINE__, ##__VA_ARGS__); \
+} while (0)
+
+#define CFIL_CRYPTO_LOG_4BYTES(name) \
+    CFIL_CRYPTO_LOG(LOG_DEBUG, \
+                   "%s \t%s: %hhX %hhX %hhX %hhX", \
+                   prefix, name, ptr[0], ptr[1], ptr[2], ptr[3])
+
+#define CFIL_CRYPTO_LOG_8BYTES(name) \
+    CFIL_CRYPTO_LOG(LOG_DEBUG, \
+                   "%s \t%s: %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX", \
+                   prefix, name, ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7])
+
+#define CFIL_CRYPTO_LOG_16BYTES(name) \
+    CFIL_CRYPTO_LOG(LOG_DEBUG, \
+               "%s \t%s: %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX", \
+               prefix, name, ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7], ptr[8], ptr[9], ptr[10], ptr[11], ptr[12], ptr[13], ptr[14], ptr[15])
+
+#define CFIL_CRYPTO_LOG_28BYTES(name) \
+    CFIL_CRYPTO_LOG(LOG_DEBUG, \
+                   "%s \t%s: %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX", \
+                   prefix, name, ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7], ptr[8], ptr[9], ptr[10], ptr[11], ptr[12], ptr[13], ptr[14], ptr[15], ptr[16], ptr[17], ptr[18], ptr[19], ptr[20], ptr[21], ptr[22], ptr[23], ptr[24], ptr[25], ptr[26], ptr[27])
+
+#define CFIL_CRYPTO_LOG_32BYTES(name, prefix) \
+    CFIL_CRYPTO_LOG(LOG_DEBUG, \
+                   "%s \t%s: %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX %hhX", \
+                   prefix, name, ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7], ptr[8], ptr[9], ptr[10], ptr[11], ptr[12], ptr[13], ptr[14], ptr[15], ptr[16], ptr[17], ptr[18], ptr[19], ptr[20], ptr[21], ptr[22], ptr[23], ptr[24], ptr[25], ptr[26], ptr[27], ptr[28], ptr[29], ptr[30], ptr[31])
+
+static void
+cfil_crypto_print_data(cfil_crypto_data_t data, const char *prefix)
+{
+       u_int8_t *ptr = NULL;
+       CFIL_CRYPTO_LOG(LOG_DEBUG, "%s NE Filter crypto data:", prefix);
+
+       ptr = (u_int8_t *)&data->flow_id;
+       CFIL_CRYPTO_LOG_16BYTES("flow_id");
+
+       ptr = (u_int8_t *)&data->sock_id;
+       CFIL_CRYPTO_LOG_8BYTES("sock_id");
+
+       ptr = (u_int8_t *)&data->direction;
+       CFIL_CRYPTO_LOG_4BYTES("direction");
+
+       ptr = (u_int8_t *)&data->remote;
+       CFIL_CRYPTO_LOG_28BYTES("remote");
+       ptr = (u_int8_t *)&data->local;
+       CFIL_CRYPTO_LOG_28BYTES("local");
+
+       ptr = (u_int8_t *)&data->socketProtocol;
+       CFIL_CRYPTO_LOG_4BYTES("socketProtocol");
+
+       ptr = (u_int8_t *)&data->pid;
+       CFIL_CRYPTO_LOG_4BYTES("pid");
+
+       ptr = (u_int8_t *)&data->effective_pid;
+       CFIL_CRYPTO_LOG_4BYTES("effective_pid");
+
+       ptr = (u_int8_t *)&data->uuid;
+       CFIL_CRYPTO_LOG_16BYTES("uuid");
+       ptr = (u_int8_t *)&data->effective_uuid;
+       CFIL_CRYPTO_LOG_16BYTES("effective_uuid");
+
+       ptr = (u_int8_t *)&data->byte_count_in;
+       CFIL_CRYPTO_LOG_8BYTES("byte_count_in");
+
+       ptr = (u_int8_t *)&data->byte_count_out;
+       CFIL_CRYPTO_LOG_8BYTES("byte_count_out");
+}
+
+cfil_crypto_state_t
+cfil_crypto_init_client(cfil_crypto_key client_key)
+{
+       if (client_key == NULL) {
+               return NULL;
+       }
+
+       struct cfil_crypto_state *state;
+       MALLOC(state, struct cfil_crypto_state *, sizeof(struct cfil_crypto_state),
+           M_TEMP, M_WAITOK | M_ZERO);
+       if (state == NULL) {
+               return NULL;
+       }
+
+       memcpy(state->key, client_key, sizeof(cfil_crypto_key));
+       state->digest_info = ccsha256_di();
+
+       CFIL_CRYPTO_LOG(LOG_DEBUG, "Inited client key");
+       return state;
+}
+
+void
+cfil_crypto_cleanup_state(cfil_crypto_state_t state)
+{
+       if (state != NULL) {
+               FREE(state, M_TEMP);
+       }
+}
+
+static void
+cfil_crypto_update_context(const struct ccdigest_info *di,
+    cchmac_ctx_t ctx,
+    cfil_crypto_data_t data)
+{
+       const uint8_t context[32] = {[0 ... 31] = 0x20}; // 0x20 repeated 32 times
+       const char *context_string = "NEFilterCrypto";
+       uint8_t separator = 0;
+       cchmac_update(di, ctx, sizeof(context), context);
+       cchmac_update(di, ctx, strlen(context_string), context_string);
+       cchmac_update(di, ctx, sizeof(separator), &separator);
+       cchmac_update(di, ctx, sizeof(struct cfil_crypto_data), data);
+}
+
+int
+cfil_crypto_sign_data(cfil_crypto_state_t state, cfil_crypto_data_t data,
+    cfil_crypto_signature signature, u_int32_t *signature_length)
+{
+       u_int8_t *ptr = NULL;
+
+       if (state->digest_info == NULL) {
+               return EINVAL;
+       }
+
+       if (data == NULL ||
+           signature == NULL ||
+           signature_length == NULL) {
+               return EINVAL;
+       }
+
+       size_t required_tag_length = state->digest_info->output_size;
+       if (*signature_length < required_tag_length) {
+               return ERANGE;
+       }
+
+       *signature_length = (u_int32_t)required_tag_length;
+
+       cchmac_ctx_decl(state->digest_info->state_size,
+           state->digest_info->block_size, ctx);
+       cchmac_init(state->digest_info, ctx,
+           sizeof(state->key),
+           state->key);
+       cfil_crypto_update_context(state->digest_info, ctx, data);
+       cchmac_final(state->digest_info, ctx, signature);
+
+       if (cfil_log_level >= LOG_DEBUG) {
+               cfil_crypto_print_data(data, "SIGN");
+               CFIL_CRYPTO_LOG(LOG_DEBUG, "Signed data: datalen %lu", sizeof(struct cfil_crypto_data));
+               ptr = (u_int8_t *)signature;
+               CFIL_CRYPTO_LOG_32BYTES("Signature", "SIGN");
+       }
+
+       return 0;
+}
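
Because the HMAC above covers the raw bytes of struct cfil_crypto_data after a fixed domain-separation prefix, a verifier needs only the shared 32-byte key and an identically laid-out struct. A hedged userspace sketch using CommonCrypto, mirroring cfil_crypto_update_context() byte for byte (32 bytes of 0x20, "NEFilterCrypto" without its NUL, a zero separator, then the struct); the struct's layout and padding must match the kernel's exactly:

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>
    #include <CommonCrypto/CommonHMAC.h>
    #include <net/content_filter.h>

    static bool
    cfil_crypto_verify(const uint8_t key[32],
        const struct cfil_crypto_data *data, const uint8_t signature[32])
    {
            uint8_t msg[32 + 14 + 1 + sizeof(struct cfil_crypto_data)];
            uint8_t computed[CC_SHA256_DIGEST_LENGTH];
            size_t off = 0;

            memset(msg, 0x20, 32);                   /* 0x20 repeated 32 times */
            off += 32;
            memcpy(msg + off, "NEFilterCrypto", 14); /* context string, no NUL */
            off += 14;
            msg[off++] = 0;                          /* separator byte */
            memcpy(msg + off, data, sizeof(*data));
            off += sizeof(*data);

            CCHmac(kCCHmacAlgSHA256, key, 32, msg, off, computed);
            /* Prefer a constant-time compare (e.g. timingsafe_bcmp) in real
             * code; memcmp keeps the sketch minimal. */
            return memcmp(computed, signature, sizeof(computed)) == 0;
    }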
diff --git a/bsd/net/content_filter_crypto.h b/bsd/net/content_filter_crypto.h
new file mode 100644 (file)
index 0000000..fd56c0a
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2019 Apple Inc.
+ * All rights reserved.
+ */
+
+#ifndef __content_filter_crypto_h
+#define __content_filter_crypto_h
+
+#include <net/content_filter.h>
+
+extern cfil_crypto_state_t
+cfil_crypto_init_client(cfil_crypto_key client_key);
+
+extern void
+cfil_crypto_cleanup_state(cfil_crypto_state_t state);
+
+extern int
+cfil_crypto_sign_data(cfil_crypto_state_t state, cfil_crypto_data_t data,
+    cfil_crypto_signature signature, u_int32_t *signature_length);
+
+#endif // __content_filter_crypto_h
diff --git a/bsd/net/contiki-conf.h b/bsd/net/contiki-conf.h
new file mode 100644 (file)
index 0000000..2ee32b4
--- /dev/null
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef contiki_conf_h
+#define contiki_conf_h
+
+#define NETSTACK_CONF_NETWORK sicslowpan_driver
+
+#endif /* contiki_conf_h */
diff --git a/bsd/net/contiki-default-conf.h b/bsd/net/contiki-default-conf.h
new file mode 100644 (file)
index 0000000..9cf10f7
--- /dev/null
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright (c) 2012, Thingsquare, http://www.thingsquare.com/.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef CONTIKI_DEFAULT_CONF_H
+#define CONTIKI_DEFAULT_CONF_H
+
+/*---------------------------------------------------------------------------*/
+/* Netstack configuration
+ *
+ * The netstack configuration is typically overridden by the platform
+ * configuration, as defined in contiki-conf.h
+ */
+
+/* NETSTACK_CONF_RADIO specifies the radio driver. The radio driver
+ *  typically depends on the radio used on the target hardware. */
+#ifndef NETSTACK_CONF_RADIO
+#define NETSTACK_CONF_RADIO nullradio_driver
+/* #define NETSTACK_CONF_RADIO cc2420_driver */
+#endif /* NETSTACK_CONF_RADIO */
+
+/* NETSTACK_CONF_FRAMER specifies the over-the-air frame format used
+ *  by Contiki radio packets. For IEEE 802.15.4 radios, use the
+ *  framer_802154 driver. */
+#ifndef NETSTACK_CONF_FRAMER
+#define NETSTACK_CONF_FRAMER framer_nullmac
+/* #define NETSTACK_CONF_FRAMER framer_802154 */
+#endif /* NETSTACK_CONF_FRAMER */
+
+/* NETSTACK_CONF_RDC specifies the Radio Duty Cycling (RDC) layer. The
+ *  nullrdc_driver never turns the radio off and is compatible with all
+ *  radios, but consumes a lot of power. The contikimac_driver is
+ *  highly power-efficient and allows sleepy routers, but is not
+ *  compatible with all radios. */
+#ifndef NETSTACK_CONF_RDC
+#define NETSTACK_CONF_RDC   nullrdc_driver
+/* #define NETSTACK_CONF_RDC   contikimac_driver */
+#endif /* NETSTACK_CONF_RDC */
+
+/* NETSTACK_CONF_MAC specifies the Medium Access Control (MAC)
+ *  layer. The nullmac_driver does not provide any MAC
+ *  functionality. The csma_driver is the default CSMA MAC layer, but
+ *  is not compatible with all radios. */
+#ifndef NETSTACK_CONF_MAC
+#define NETSTACK_CONF_MAC   nullmac_driver
+/* #define NETSTACK_CONF_MAC   csma_driver */
+#endif /* NETSTACK_CONF_MAC */
+
+/* NETSTACK_CONF_LLSEC specifies the link layer security driver. */
+#ifndef NETSTACK_CONF_LLSEC
+#define NETSTACK_CONF_LLSEC nullsec_driver
+#endif /* NETSTACK_CONF_LLSEC */
+
+/* To avoid unnecessary complexity, we assume the common case of
+ *   a constant LoWPAN-wide IEEE 802.15.4 security level, which
+ *   can be specified by defining LLSEC802154_CONF_SECURITY_LEVEL. */
+#ifndef LLSEC802154_CONF_SECURITY_LEVEL
+#define LLSEC802154_CONF_SECURITY_LEVEL 0
+#endif /* LLSEC802154_CONF_SECURITY_LEVEL */
+
+/* NETSTACK_CONF_NETWORK specifies the network layer and can be either
+ *  sicslowpan_driver, for IPv6 networking, or rime_driver, for the
+ *  custom Rime network stack. */
+#ifndef NETSTACK_CONF_NETWORK
+#define NETSTACK_CONF_NETWORK rime_driver
+/* #define NETSTACK_CONF_NETWORK sicslowpan_driver */
+#endif /* NETSTACK_CONF_NETWORK */
+
+/* NETSTACK_CONF_RDC_CHANNEL_CHECK_RATE specifies the channel check
+ *  rate of the RDC layer. This defines how often the RDC will wake up
+ *  and check for radio channel activity. A higher check rate results
+ *  in higher communication performance at the cost of a higher power
+ *  consumption. */
+#ifndef NETSTACK_CONF_RDC_CHANNEL_CHECK_RATE
+#define NETSTACK_CONF_RDC_CHANNEL_CHECK_RATE 8
+#endif /* NETSTACK_CONF_RDC_CHANNEL_CHECK_RATE */
+
+/*---------------------------------------------------------------------------*/
+/* Packet buffer size options.
+ *
+ * The packet buffer size options can be tweaked on a per-project
+ * basis to reduce memory consumption.
+ */
+
+/* QUEUEBUF_CONF_NUM specifies the number of queue buffers. Queue
+ *  buffers are used throughout the Contiki netstack but the
+ *  configuration option can be tweaked to save memory. Performance can
+ *  suffer if the number of queue buffers is too low, though. */
+#ifndef QUEUEBUF_CONF_NUM
+#define QUEUEBUF_CONF_NUM 8
+#endif /* QUEUEBUF_CONF_NUM */
+/*---------------------------------------------------------------------------*/
+/* uIPv6 configuration options.
+ *
+ * Many of the uIPv6 configuration options can be overridden by a
+ * project-specific configuration to save memory.
+ */
+
+/* NETSTACK_CONF_WITH_IPV6 specifies whether or not IPv6 should be used. If IPv6
+ *  is not used, IPv4 is used instead. */
+#ifndef NETSTACK_CONF_WITH_IPV6
+#define NETSTACK_CONF_WITH_IPV6 0
+#endif /* NETSTACK_CONF_WITH_IPV6 */
+
+/* UIP_CONF_BUFFER_SIZE specifies how much memory should be reserved
+ *  for the uIP packet buffer. This sets an upper bound on the largest
+ *  IP packet that can be received by the system. */
+#ifndef UIP_CONF_BUFFER_SIZE
+#define UIP_CONF_BUFFER_SIZE 128
+#endif /* UIP_CONF_BUFFER_SIZE */
+
+/* UIP_CONF_ROUTER specifies if the IPv6 node should be a router or
+ *  not. By default, all Contiki nodes are routers. */
+#ifndef UIP_CONF_ROUTER
+#define UIP_CONF_ROUTER 1
+#endif /* UIP_CONF_ROUTER */
+
+/* UIP_CONF_IPV6_RPL specifies if RPL is to be used for IPv6
+ *  routing. */
+#ifndef UIP_CONF_IPV6_RPL
+#define UIP_CONF_IPV6_RPL 1
+#endif /* UIP_CONF_IPV6_RPL */
+
+/* UIP_CONF_MAX_ROUTES specifies the maximum number of routes that each
+ *  node will be able to handle. */
+#ifndef UIP_CONF_MAX_ROUTES
+#define UIP_CONF_MAX_ROUTES 20
+#endif /* UIP_CONF_MAX_ROUTES */
+
+/* UIP_CONF_UDP specifies if UDP support should be included or
+ *  not. Disabling UDP saves memory but breaks a lot of stuff. */
+#ifndef UIP_CONF_UDP
+#define UIP_CONF_UDP 1
+#endif /* UIP_CONF_UDP */
+
+/* UIP_CONF_MAX_CONNECTIONS specifies the maximum number of
+ *  simultaneous TCP connections. */
+#ifndef UIP_CONF_MAX_CONNECTIONS
+#define UIP_CONF_MAX_CONNECTIONS 8
+#endif /* UIP_CONF_MAX_CONNECTIONS */
+
+/* UIP_CONF_TCP specifies if TCP support should be included or
+ *  not. Disabling TCP saves memory. */
+#ifndef UIP_CONF_TCP
+#define UIP_CONF_TCP 1
+#endif /* UIP_CONF_TCP */
+
+/* UIP_CONF_TCP_SPLIT enables a performance optimization hack, where
+ *  each maximum-sized TCP segment is split into two, to avoid the
+ *  performance degradation that is caused by delayed ACKs. */
+#ifndef UIP_CONF_TCP_SPLIT
+#define UIP_CONF_TCP_SPLIT 0
+#endif /* UIP_CONF_TCP_SPLIT */
+
+/* NBR_TABLE_CONF_MAX_NEIGHBORS specifies the maximum number of neighbors
+ *  that each node will be able to handle. */
+#ifndef NBR_TABLE_CONF_MAX_NEIGHBORS
+#define NBR_TABLE_CONF_MAX_NEIGHBORS 8
+#endif /* NBR_TABLE_CONF_MAX_NEIGHBORS */
+
+/* UIP_CONF_ND6_SEND_NA enables standard IPv6 Neighbor Discovery Protocol.
+ *  This is unneeded when RPL is used. Disable to save ROM and a little RAM. */
+#ifndef UIP_CONF_ND6_SEND_NA
+#define UIP_CONF_ND6_SEND_NA 1
+#endif /* UIP_CONF_ND6_SEND_NA */
+
+/*---------------------------------------------------------------------------*/
+/* 6lowpan configuration options.
+ *
+ * These options change the behavior of the 6lowpan header compression
+ * code (sicslowpan). They typically depend on the type of radio used
+ * on the target platform, and are therefore platform-specific.
+ */
+
+/* SICSLOWPAN_CONF_MAX_MAC_TRANSMISSIONS specifies how many times the
+ *  MAC layer should resend packets if no link-layer ACK was
+ *  received. This only makes sense with the csma_driver
+ *  NETSTACK_CONF_MAC. */
+#ifndef SICSLOWPAN_CONF_MAX_MAC_TRANSMISSIONS
+#define SICSLOWPAN_CONF_MAX_MAC_TRANSMISSIONS 4
+#endif /* SICSLOWPAN_CONF_MAX_MAC_TRANSMISSIONS */
+
+/* SICSLOWPAN_CONF_FRAG specifies if 6lowpan fragmentation should be
+ *  used or not. Fragmentation is on by default. */
+#ifndef SICSLOWPAN_CONF_FRAG
+#define SICSLOWPAN_CONF_FRAG 1
+#endif /* SICSLOWPAN_CONF_FRAG */
+
+/* SICSLOWPAN_CONF_MAC_MAX_PAYLOAD is the maximum available size for
+ *  frame headers, link-layer security-related overhead, as well as
+ *  6LoWPAN payload. By default, SICSLOWPAN_CONF_MAC_MAX_PAYLOAD is
+ *  127 bytes (MTU of 802.15.4) - 2 bytes (Footer of 802.15.4). */
+#ifndef SICSLOWPAN_CONF_MAC_MAX_PAYLOAD
+#define SICSLOWPAN_CONF_MAC_MAX_PAYLOAD (127 - 2)
+#endif /* SICSLOWPAN_CONF_MAC_MAX_PAYLOAD */
+
+/* SICSLOWPAN_CONF_COMPRESSION_THRESHOLD sets a size threshold below
+ *  which packets are not compressed. This is used by ContikiMAC,
+ *  which requires packets to be larger than a given minimum size. */
+#ifndef SICSLOWPAN_CONF_COMPRESSION_THRESHOLD
+#define SICSLOWPAN_CONF_COMPRESSION_THRESHOLD 0
+/* #define SICSLOWPAN_CONF_COMPRESSION_THRESHOLD 63 */
+#endif /* SICSLOWPAN_CONF_COMPRESSION_THRESHOLD */
+
+/* SICSLOWPAN_CONF_COMPRESSION specifies what 6lowpan compression
+ *  mechanism to be used. 6lowpan hc06 is the default in Contiki. */
+#ifndef SICSLOWPAN_CONF_COMPRESSION
+#define SICSLOWPAN_CONF_COMPRESSION SICSLOWPAN_COMPRESSION_HC06
+#endif /* SICSLOWPAN_CONF_COMPRESSION */
+
+/*---------------------------------------------------------------------------*/
+/* ContikiMAC configuration options.
+ *
+ * These are typically configured on a per-platform basis.
+ */
+
+/* CONTIKIMAC_CONF_WITH_PHASE_OPTIMIZATION specifies if ContikiMAC
+ *  should optimize for the phase of neighbors. The phase optimization
+ *  may reduce power consumption but is not compatible with all timer
+ *  settings and is therefore off by default. */
+#ifndef CONTIKIMAC_CONF_WITH_PHASE_OPTIMIZATION
+#define CONTIKIMAC_CONF_WITH_PHASE_OPTIMIZATION 0
+#endif /* CONTIKIMAC_CONF_WITH_PHASE_OPTIMIZATION */
+
+
+#endif /* CONTIKI_DEFAULT_CONF_H */
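
A note on precedence: contiki.h (added later in this diff) includes contiki-conf.h before contiki-default-conf.h, so each #ifndef guard in this file yields to a platform definition. With the contiki-conf.h added above, the effect is:

    #include "contiki-conf.h"          /* #define NETSTACK_CONF_NETWORK sicslowpan_driver */
    #include "contiki-default-conf.h"  /* guard sees it defined; default rime_driver is skipped */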
diff --git a/bsd/net/contiki-lib.h b/bsd/net/contiki-lib.h
new file mode 100644 (file)
index 0000000..23e4daf
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright (c) 2005, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file is part of the Contiki operating system.
+ *
+ * Author: Adam Dunkels <adam@sics.se>
+ *
+ */
+#ifndef CONTIKI_LIB_H_
+#define CONTIKI_LIB_H_
+
+#include "contiki.h"
+#include "lib/list.h"
+#include "lib/memb.h"
+#include "lib/mmem.h"
+#include "lib/random.h"
+
+#endif /* CONTIKI_LIB_H_ */
diff --git a/bsd/net/contiki-net.h b/bsd/net/contiki-net.h
new file mode 100644 (file)
index 0000000..4b67351
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright (c) 2005, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file is part of the Contiki operating system.
+ *
+ * Author: Adam Dunkels <adam@sics.se>
+ *
+ */
+#ifndef CONTIKI_NET_H_
+#define CONTIKI_NET_H_
+
+#include "contiki.h"
+
+#include "net/ip/tcpip.h"
+#include "net/ip/uip.h"
+#include "net/ipv4/uip-fw.h"
+#include "net/ipv4/uip-fw-drv.h"
+#include "net/ipv4/uip_arp.h"
+#include "net/ip/uiplib.h"
+#include "net/ip/uip-udp-packet.h"
+#include "net/ip/simple-udp.h"
+#include "net/ip/uip-nameserver.h"
+
+#if NETSTACK_CONF_WITH_IPV6
+#include "net/ipv6/uip-icmp6.h"
+#include "net/ipv6/uip-ds6.h"
+#endif /* NETSTACK_CONF_WITH_IPV6 */
+
+#include "net/ip/resolv.h"
+
+#include "net/ip/psock.h"
+
+#include "net/ip/udp-socket.h"
+#include "net/ip/tcp-socket.h"
+
+#include "net/rime/rime.h"
+
+#include "net/netstack.h"
+
+#endif /* CONTIKI_NET_H_ */
diff --git a/bsd/net/contiki-version.h b/bsd/net/contiki-version.h
new file mode 100644 (file)
index 0000000..318d9ee
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright (c) 2004, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file is part of the Contiki operating system.
+ *
+ * Author: Adam Dunkels <adam@sics.se>
+ *
+ */
+#ifndef __CONTIKI_VERSION__
+#define __CONTIKI_VERSION__
+
+#ifndef CONTIKI_VERSION_STRING
+#define CONTIKI_VERSION_STRING "Contiki 3.x"
+#endif /* CONTIKI_VERSION_STRING */
+
+#endif /* __CONTIKI_VERSION__ */
diff --git a/bsd/net/contiki.h b/bsd/net/contiki.h
new file mode 100644 (file)
index 0000000..3cc2488
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright (c) 2004, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file is part of the Contiki operating system.
+ *
+ * Author: Adam Dunkels <adam@sics.se>
+ *
+ */
+#ifndef CONTIKI_H_
+#define CONTIKI_H_
+
+#include "contiki-version.h"
+#include "contiki-conf.h"
+#include "contiki-default-conf.h"
+
+#include "sys/process.h"
+#include "sys/autostart.h"
+
+#include "sys/timer.h"
+#include "sys/ctimer.h"
+#include "sys/etimer.h"
+#include "sys/rtimer.h"
+
+#include "sys/pt.h"
+
+#include "sys/procinit.h"
+
+#include "sys/loader.h"
+#include "sys/clock.h"
+
+#include "sys/energest.h"
+
+#endif /* CONTIKI_H_ */
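
These Contiki umbrella headers back the 6LoWPAN/802.15.4 support threaded through the rest of this commit (note the new PF_802154 and IFNET_FAMILY_6LOWPAN references in dlil.c below). A minimal, hypothetical consumer, assuming the header is reachable as <net/contiki.h>:

    #include <net/contiki.h>        /* umbrella: version, conf, sys/ headers */

    static const char *
    contiki_version_string(void)
    {
            /* "Contiki 3.x" unless CONTIKI_VERSION_STRING was overridden */
            return CONTIKI_VERSION_STRING;
    }
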
diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c
index 39ce6ac9ee4410efcd92b5a479a54a3cc0c3cdec..7a119911e9bcc3a5141e571de094da6c2d716caa 100644 (file)
 #include <net/pfvar.h>
 #endif /* PF */
 #include <net/pktsched/pktsched.h>
+#include <net/pktsched/pktsched_netem.h>
 
 #if NECP
 #include <net/necp.h>
 #endif /* NECP */
 
 
+#include <os/log.h>
+
 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
@@ -392,13 +395,20 @@ static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
 
 static void dlil_main_input_thread_func(void *, wait_result_t);
+static void dlil_main_input_thread_cont(void *, wait_result_t);
+
 static void dlil_input_thread_func(void *, wait_result_t);
+static void dlil_input_thread_cont(void *, wait_result_t);
+
 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
+static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
+
 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *);
 static void dlil_terminate_input_thread(struct dlil_threading_info *);
 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
-    struct dlil_threading_info *, boolean_t);
-static void dlil_input_stats_sync(struct ifnet *, struct dlil_threading_info *);
+    struct dlil_threading_info *, struct ifnet *, boolean_t);
+static boolean_t dlil_input_stats_sync(struct ifnet *,
+    struct dlil_threading_info *);
 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
     u_int32_t, ifnet_model_t, boolean_t);
 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
@@ -414,17 +424,23 @@ static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
     protocol_family_t);
 
+static void dlil_incr_pending_thread_count(void);
+static void dlil_decr_pending_thread_count(void);
+
 static void ifnet_detacher_thread_func(void *, wait_result_t);
 static int ifnet_detacher_thread_cont(int);
 static void ifnet_detach_final(struct ifnet *);
 static void ifnet_detaching_enqueue(struct ifnet *);
 static struct ifnet *ifnet_detaching_dequeue(void);
 
-static void ifnet_start_thread_fn(void *, wait_result_t);
-static void ifnet_poll_thread_fn(void *, wait_result_t);
-static void ifnet_poll(struct ifnet *);
-static errno_t ifnet_enqueue_common(struct ifnet *, void *,
-    classq_pkt_type_t, boolean_t, boolean_t *);
+static void ifnet_start_thread_func(void *, wait_result_t);
+static void ifnet_start_thread_cont(void *, wait_result_t);
+
+static void ifnet_poll_thread_func(void *, wait_result_t);
+static void ifnet_poll_thread_cont(void *, wait_result_t);
+
+static errno_t ifnet_enqueue_common(struct ifnet *, classq_pkt_t *,
+    boolean_t, boolean_t *);
 
 static void ifp_src_route_copyout(struct ifnet *, struct route *);
 static void ifp_src_route_copyin(struct ifnet *, struct route *);
@@ -526,7 +542,7 @@ int dlil_verbose = 0;
 static u_int32_t dlil_input_sanity_check = 0;
 #endif /* IFNET_INPUT_SANITY_CHK */
 /* rate limit debug messages */
-struct timespec dlil_dbgrate = { 1, 0 };
+struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
 
 SYSCTL_DECL(_net_link_generic_system);
 
@@ -547,7 +563,7 @@ SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
     sysctl_rcvq_maxlen, "I", "Default receive queue max length");
 
 #define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
-static u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
+u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
     "ilog2 of EWMA decay rate of avg inbound packets");
@@ -568,8 +584,6 @@ SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
     IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
     "Q", "input poll sampling time");
 
-#define IF_RXPOLL_INTERVALTIME_MIN      (1ULL * 1000)           /* 1 us */
-#define IF_RXPOLL_INTERVALTIME          (1ULL * 1000 * 1000)    /* 1 ms */
 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
@@ -577,22 +591,22 @@ SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
     "Q", "input poll interval (time)");
 
 #define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
-static u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
+u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
     IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
 
 #define IF_RXPOLL_WLOWAT        10
-static u_int32_t if_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
+static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_wlowat,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
     IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
     "I", "input poll wakeup low watermark");
 
 #define IF_RXPOLL_WHIWAT        100
-static u_int32_t if_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
+static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_whiwat,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
     IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
     "I", "input poll wakeup high watermark");
 
@@ -601,7 +615,7 @@ SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
     "max packets per poll call");
 
-static u_int32_t if_rxpoll = 1;
+u_int32_t if_rxpoll = 1;
 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
     sysctl_rxpoll, "I", "enable opportunistic input polling");
@@ -770,20 +784,16 @@ static  lck_attr_t      *dlil_lck_attributes = NULL;
 /* DLIL data threshold thread call */
 static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
 
-static void dlil_mit_tcall_fn(thread_call_param_t, thread_call_param_t);
-
-uint32_t dlil_rcv_mit_pkts_min = 5;
-uint32_t dlil_rcv_mit_pkts_max = 64;
-uint32_t dlil_rcv_mit_interval = (500 * 1000);
-
-#if (DEVELOPMENT || DEBUG)
-SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rcv_mit_pkts_min,
-    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_rcv_mit_pkts_min, 0, "");
-SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rcv_mit_pkts_max,
-    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_rcv_mit_pkts_max, 0, "");
-SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rcv_mit_interval,
-    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_rcv_mit_interval, 0, "");
-#endif /* DEVELOPMENT || DEBUG */
+void
+ifnet_filter_update_tso(boolean_t filter_enable)
+{
+       /*
+        * update the filter count and route_generation ID to let TCP
+        * know it should reevaluate whether to do TSO
+        */
+       OSAddAtomic(filter_enable ? 1 : -1, &dlil_filter_disable_tso_count);
+       routegenid_update();
+}
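
This helper centralizes the bookkeeping that the filter attach and detach paths previously open-coded; both call sites appear later in this diff and pair up as follows (excerpted from the hunks below, not new code):

    /* attach (dlil_attach_filter): one more TSO-disabling filter */
    if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
            ifnet_filter_update_tso(TRUE);
    }

    /* detach (dlil_detach_filter_internal): one fewer */
    if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
            ifnet_filter_update_tso(FALSE);
    }
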
 
 
 #define DLIL_INPUT_CHECK(m, ifp) {                                      \
@@ -816,14 +826,38 @@ struct rxpoll_time_tbl {
 };
 
 static struct rxpoll_time_tbl rxpoll_tbl[] = {
-       {  10 * MBPS, 2, 8, (1 * 1024), (6 * 1024)      },
-       { 100 * MBPS, 10, 40, (4 * 1024), (64 * 1024)     },
-       {   1 * GBPS, 10, 40, (4 * 1024), (64 * 1024)     },
-       {  10 * GBPS, 10, 40, (4 * 1024), (64 * 1024)     },
-       { 100 * GBPS, 10, 40, (4 * 1024), (64 * 1024)     },
-       { 0, 0, 0, 0, 0 }
+       { .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
+       { .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
+       { .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
+       { .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
+       { .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
+       { .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
 };
 
+decl_lck_mtx_data(static, dlil_thread_sync_lock);
+static uint32_t dlil_pending_thread_cnt = 0;
+static void
+dlil_incr_pending_thread_count(void)
+{
+       LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
+       lck_mtx_lock(&dlil_thread_sync_lock);
+       dlil_pending_thread_cnt++;
+       lck_mtx_unlock(&dlil_thread_sync_lock);
+}
+
+static void
+dlil_decr_pending_thread_count(void)
+{
+       LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
+       lck_mtx_lock(&dlil_thread_sync_lock);
+       VERIFY(dlil_pending_thread_cnt > 0);
+       dlil_pending_thread_cnt--;
+       if (dlil_pending_thread_cnt == 0) {
+               wakeup(&dlil_pending_thread_cnt);
+       }
+       lck_mtx_unlock(&dlil_thread_sync_lock);
+}
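
These two helpers implement a startup rendezvous with dlil_init(): the count is bumped once per kernel thread about to be spawned, each thread decrements it the first time it runs, and dlil_init() sleeps until the count drains to zero (its msleep() loop appears later in this diff). A condensed sketch of the handshake, with a hypothetical thread body:

    /* hypothetical thread body */
    static void
    example_thread_func(void *v, wait_result_t w)
    {
    #pragma unused(v, w)
            dlil_decr_pending_thread_count();   /* wakes the waiter at zero */
            /* ... park on a continuation, as the input threads below do ... */
    }

    /* spawner side, cf. dlil_init() later in this diff */
    thread_t thread;

    dlil_incr_pending_thread_count();           /* count it before it exists */
    if (kernel_thread_start(example_thread_func, NULL, &thread) == KERN_SUCCESS) {
            thread_deallocate(thread);          /* drop the start() reference */
    }

    lck_mtx_lock(&dlil_thread_sync_lock);
    while (dlil_pending_thread_cnt != 0) {
            (void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
                (PZERO - 1), __func__, NULL);
    }
    lck_mtx_unlock(&dlil_thread_sync_lock);
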
+
 int
 proto_hash_value(u_int32_t protocol_family)
 {
@@ -839,9 +873,11 @@ proto_hash_value(u_int32_t protocol_family)
                return 1;
        case PF_VLAN:
                return 2;
+       case PF_802154:
+               return 3;
        case PF_UNSPEC:
        default:
-               return 3;
+               return 4;
        }
 }
 
@@ -1107,7 +1143,7 @@ if_free_protolist(u_int32_t *list)
        _FREE(list, M_TEMP);
 }
 
-__private_extern__ void
+__private_extern__ int
 dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
     u_int32_t event_code, struct net_event_data *event_data,
     u_int32_t event_data_len)
@@ -1140,12 +1176,23 @@ dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
        ev_msg.dv[0].data_ptr    = event_data;
        ev_msg.dv[1].data_length = 0;
 
-       /* Don't update interface generation for quality and RRC state changess */
-       bool update_generation = (event_subclass != KEV_DL_SUBCLASS ||
-           (event_code != KEV_DL_LINK_QUALITY_METRIC_CHANGED &&
-           event_code != KEV_DL_RRC_STATE_CHANGED));
+       bool update_generation = true;
+       if (event_subclass == KEV_DL_SUBCLASS) {
+               /* Don't update interface generation for frequent link quality and state changes */
+               switch (event_code) {
+               case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
+               case KEV_DL_RRC_STATE_CHANGED:
+               case KEV_DL_NODE_PRESENCE:
+               case KEV_DL_NODE_ABSENCE:
+               case KEV_DL_MASTER_ELECTED:
+                       update_generation = false;
+                       break;
+               default:
+                       break;
+               }
+       }
 
-       dlil_event_internal(ifp, &ev_msg, update_generation);
+       return dlil_event_internal(ifp, &ev_msg, update_generation);
 }
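
Since dlil_post_msg() now returns the status from dlil_event_internal() instead of discarding it, callers can observe delivery failures. A hedged usage sketch (the event code and error handling are illustrative only, not from this diff):

    int err;

    err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_ON, NULL, 0);
    if (err != 0) {
            DLIL_PRINTF("%s: link event not posted (%d)\n", if_name(ifp), err);
    }
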
 
 __private_extern__ int
@@ -1227,7 +1274,7 @@ dlil_alloc_local_stats(struct ifnet *ifp)
                }
        }
 end:
-       if (ret != 0) {
+       if (ifp != NULL && ret != 0) {
                if (ifp->if_tcp_stat != NULL) {
                        pbuf = (void **)
                            ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
@@ -1253,20 +1300,43 @@ end:
        return ret;
 }
 
+static void
+dlil_reset_rxpoll_params(ifnet_t ifp)
+{
+       ASSERT(ifp != NULL);
+       ifnet_set_poll_cycle(ifp, NULL);
+       ifp->if_poll_update = 0;
+       ifp->if_poll_flags = 0;
+       ifp->if_poll_req = 0;
+       ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
+       bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
+       bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
+       bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
+       net_timerclear(&ifp->if_poll_mode_holdtime);
+       net_timerclear(&ifp->if_poll_mode_lasttime);
+       net_timerclear(&ifp->if_poll_sample_holdtime);
+       net_timerclear(&ifp->if_poll_sample_lasttime);
+       net_timerclear(&ifp->if_poll_dbg_lasttime);
+}
+
 static int
 dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp)
 {
+       boolean_t dlil_rxpoll_input;
        thread_continue_t func;
        u_int32_t limit;
        int error;
 
+       dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
+           (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));
+
        /* NULL ifp indicates the main input thread, called at dlil_init time */
        if (ifp == NULL) {
                func = dlil_main_input_thread_func;
                VERIFY(inp == dlil_main_input_thread);
                (void) strlcat(inp->input_name,
                    "main_input", DLIL_THREADNAME_LEN);
-       } else if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
+       } else if (dlil_rxpoll_input) {
                func = dlil_rxpoll_input_thread_func;
                VERIFY(inp != dlil_main_input_thread);
                (void) snprintf(inp->input_name, DLIL_THREADNAME_LEN,
@@ -1282,15 +1352,7 @@ dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp)
        inp->lck_grp = lck_grp_alloc_init(inp->input_name, dlil_grp_attributes);
        lck_mtx_init(&inp->input_lck, inp->lck_grp, dlil_lck_attributes);
 
-       inp->mode = IFNET_MODEL_INPUT_POLL_OFF;
-       inp->ifp = ifp;         /* NULL for main input thread */
-
-       net_timerclear(&inp->mode_holdtime);
-       net_timerclear(&inp->mode_lasttime);
-       net_timerclear(&inp->sample_holdtime);
-       net_timerclear(&inp->sample_lasttime);
-       net_timerclear(&inp->dbg_lasttime);
-
+       inp->ifp = ifp; /* NULL for main input thread */
        /*
         * For interfaces that support opportunistic polling, set the
         * low and high watermarks for outstanding inbound packets/bytes.
@@ -1299,7 +1361,9 @@ dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp)
         */
        if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
                limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
-               (void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
+               if (ifp->if_xflags & IFXF_LEGACY) {
+                       (void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
+               }
        } else {
                limit = (u_int32_t)-1;
        }
@@ -1390,18 +1454,6 @@ dlil_clean_threading_info(struct dlil_threading_info *inp)
        VERIFY(inp->wloop_thr == THREAD_NULL);
        VERIFY(inp->poll_thr == THREAD_NULL);
        VERIFY(inp->tag == 0);
-
-       inp->mode = IFNET_MODEL_INPUT_POLL_OFF;
-       bzero(&inp->tstats, sizeof(inp->tstats));
-       bzero(&inp->pstats, sizeof(inp->pstats));
-       bzero(&inp->sstats, sizeof(inp->sstats));
-
-       net_timerclear(&inp->mode_holdtime);
-       net_timerclear(&inp->mode_lasttime);
-       net_timerclear(&inp->sample_holdtime);
-       net_timerclear(&inp->sample_lasttime);
-       net_timerclear(&inp->dbg_lasttime);
-
 #if IFNET_INPUT_SANITY_CHK
        inp->input_mbuf_cnt = 0;
 #endif /* IFNET_INPUT_SANITY_CHK */
@@ -1411,6 +1463,7 @@ static void
 dlil_terminate_input_thread(struct dlil_threading_info *inp)
 {
        struct ifnet *ifp = inp->ifp;
+       classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
 
        VERIFY(current_thread() == inp->input_thr);
        VERIFY(inp != dlil_main_input_thread);
@@ -1425,21 +1478,27 @@ dlil_terminate_input_thread(struct dlil_threading_info *inp)
                    i++) {
                        v = (i + 1) * v;
                }
-               printf("the value is %d\n", v);
+               DLIL_PRINTF("the value is %d\n", v);
        }
 #endif /* TEST_INPUT_THREAD_TERMINATION */
 
        lck_mtx_lock_spin(&inp->input_lck);
+       _getq_all(&inp->rcvq_pkts, &pkt, NULL, NULL, NULL);
        VERIFY((inp->input_waiting & DLIL_INPUT_TERMINATE) != 0);
        inp->input_waiting |= DLIL_INPUT_TERMINATE_COMPLETE;
        wakeup_one((caddr_t)&inp->input_waiting);
        lck_mtx_unlock(&inp->input_lck);
 
+       /* free up pending packets */
+       if (pkt.cp_mbuf != NULL) {
+               mbuf_freem_list(pkt.cp_mbuf);
+       }
+
        /* for the extra refcnt from kernel_thread_start() */
        thread_deallocate(current_thread());
 
        if (dlil_verbose) {
-               printf("%s: input thread terminated\n",
+               DLIL_PRINTF("%s: input thread terminated\n",
                    if_name(ifp));
        }
 
@@ -1563,6 +1622,9 @@ dlil_init(void)
        _CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
        _CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
        _CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
+       _CASSERT(IFRTYPE_FAMILY_6LOWPAN == IFNET_FAMILY_6LOWPAN);
+       _CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
+       _CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
 
        _CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
        _CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
@@ -1571,6 +1633,8 @@ dlil_init(void)
        _CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
        _CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
        _CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
+       _CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
+       _CASSERT(IFRTYPE_SUBFAMILY_DEFAULT == IFNET_SUBFAMILY_DEFAULT);
 
        _CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
        _CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
@@ -1584,6 +1648,7 @@ dlil_init(void)
 
        PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));
 
+       VERIFY(dlil_pending_thread_cnt == 0);
        dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
            sizeof(struct dlil_ifnet_dbg);
        /* Enforce 64-bit alignment for dlil_ifnet structure */
@@ -1696,6 +1761,7 @@ dlil_init(void)
        lck_rw_init(&ifnet_head_lock, ifnet_head_lock_group,
            dlil_lck_attributes);
        lck_mtx_init(&dlil_ifnet_lock, dlil_lock_group, dlil_lck_attributes);
+       lck_mtx_init(&dlil_thread_sync_lock, dlil_lock_group, dlil_lck_attributes);
 
        /* Setup interface flow control related items */
        lck_mtx_init(&ifnet_fc_lock, dlil_lock_group, dlil_lck_attributes);
@@ -1752,14 +1818,39 @@ dlil_init(void)
         * Create and start up the main DLIL input thread and the interface
         * detacher threads once everything is initialized.
         */
+       dlil_incr_pending_thread_count();
        dlil_create_input_thread(NULL, dlil_main_input_thread);
 
+       /*
+        * Create the ifnet detacher thread.
+        * When an interface gets detached, part of the detach processing
+        * is delayed. The interface is added to the delayed detach list
+        * and this thread is woken up to call ifnet_detach_final
+        * on these interfaces.
+        */
+       dlil_incr_pending_thread_count();
        if (kernel_thread_start(ifnet_detacher_thread_func,
            NULL, &thread) != KERN_SUCCESS) {
                panic_plain("%s: couldn't create detacher thread", __func__);
                /* NOTREACHED */
        }
        thread_deallocate(thread);
+
+       /*
+        * Wait for the kernel threads created for dlil to be
+        * scheduled and run at least once before we proceed.
+        */
+       lck_mtx_lock(&dlil_thread_sync_lock);
+       while (dlil_pending_thread_cnt != 0) {
+               DLIL_PRINTF("%s: Waiting for all the create dlil kernel threads "
+                   "to get scheduled at least once.\n", __func__);
+               (void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock, (PZERO - 1),
+                   __func__, NULL);
+               LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
+       }
+       lck_mtx_unlock(&dlil_thread_sync_lock);
+       DLIL_PRINTF("%s: All the created dlil kernel threads have been scheduled "
+           "at least once. Proceeding.\n", __func__);
 }
 
 static void
@@ -1858,8 +1949,7 @@ dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
         * know it shouldn't do TSO on this connection
         */
        if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
-               OSAddAtomic(1, &dlil_filter_disable_tso_count);
-               routegenid_update();
+               ifnet_filter_update_tso(TRUE);
        }
        OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
@@ -1867,7 +1957,7 @@ dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
                INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
        }
        if (dlil_verbose) {
-               printf("%s: %s filter attached\n", if_name(ifp),
+               DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
                    if_filter->iff_name);
        }
 done:
@@ -1923,7 +2013,7 @@ dlil_detach_filter_internal(interface_filter_t  filter, int detached)
                                if_flt_monitor_leave(ifp);
                                lck_mtx_unlock(&ifp->if_flt_lock);
                                if (dlil_verbose) {
-                                       printf("%s: %s filter detached\n",
+                                       DLIL_PRINTF("%s: %s filter detached\n",
                                            if_name(ifp), filter->filt_name);
                                }
                                goto destroy;
@@ -1938,7 +2028,7 @@ dlil_detach_filter_internal(interface_filter_t  filter, int detached)
        }
 
        if (dlil_verbose) {
-               printf("%s filter detached\n", filter->filt_name);
+               DLIL_PRINTF("%s filter detached\n", filter->filt_name);
        }
 
 destroy:
@@ -1953,8 +2043,7 @@ destroy:
         * know it should reevaluate doing TSO or not
         */
        if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
-               OSAddAtomic(-1, &dlil_filter_disable_tso_count);
-               routegenid_update();
+               ifnet_filter_update_tso(FALSE);
        }
 
        VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
@@ -1980,6 +2069,27 @@ dlil_detach_filter(interface_filter_t filter)
        dlil_detach_filter_internal(filter, 0);
 }
 
+__attribute__((noreturn))
+static void
+dlil_main_input_thread_func(void *v, wait_result_t w)
+{
+#pragma unused(w)
+       struct dlil_threading_info *inp = v;
+
+       VERIFY(inp == dlil_main_input_thread);
+       VERIFY(inp->ifp == NULL);
+       VERIFY(current_thread() == inp->input_thr);
+
+       dlil_decr_pending_thread_count();
+       lck_mtx_lock(&inp->input_lck);
+       VERIFY(!(inp->input_waiting & DLIL_INPUT_RUNNING));
+       (void) assert_wait(&inp->input_waiting, THREAD_UNINT);
+       lck_mtx_unlock(&inp->input_lck);
+       (void) thread_block_parameter(dlil_main_input_thread_cont, inp);
+       /* NOTREACHED */
+       __builtin_unreachable();
+}
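
A pattern worth calling out, since this diff applies it to the main, legacy, and rxpoll input threads alike: the old msleep()-based wait loops become thread continuations, so an idle input thread parks via assert_wait()/thread_block_parameter() and gives up its kernel stack instead of sleeping on it. A minimal sketch of the idiom, with hypothetical names and all work/termination handling elided:

    static void example_cont(void *, wait_result_t);

    static void
    example_func(void *v, wait_result_t w)          /* entry: runs once */
    {
    #pragma unused(w)
            struct dlil_threading_info *inp = v;

            lck_mtx_lock(&inp->input_lck);
            (void) assert_wait(&inp->input_waiting, THREAD_UNINT);
            lck_mtx_unlock(&inp->input_lck);
            /* does not return; a wakeup resumes in example_cont, fresh stack */
            (void) thread_block_parameter(example_cont, inp);
            __builtin_unreachable();
    }

    static void
    example_cont(void *v, wait_result_t wres)       /* re-entered per wakeup */
    {
    #pragma unused(wres)
            struct dlil_threading_info *inp = v;

            /* ... drain inp->rcvq_pkts and process packets here ... */

            (void) assert_wait(&inp->input_waiting, THREAD_UNINT);
            (void) thread_block_parameter(example_cont, inp);   /* park again */
            __builtin_unreachable();
    }
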
+
 /*
  * Main input thread:
  *
@@ -1992,46 +2102,38 @@ dlil_detach_filter(interface_filter_t filter)
  */
 __attribute__((noreturn))
 static void
-dlil_main_input_thread_func(void *v, wait_result_t w)
+dlil_main_input_thread_cont(void *v, wait_result_t wres)
 {
-#pragma unused(w)
        struct dlil_main_threading_info *inpm = v;
        struct dlil_threading_info *inp = v;
 
-       VERIFY(inp == dlil_main_input_thread);
-       VERIFY(inp->ifp == NULL);
-       VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
+       /* main input thread is uninterruptible */
+       VERIFY(wres != THREAD_INTERRUPTED);
+       lck_mtx_lock_spin(&inp->input_lck);
+       VERIFY(!(inp->input_waiting & (DLIL_INPUT_TERMINATE |
+           DLIL_INPUT_RUNNING)));
+       inp->input_waiting |= DLIL_INPUT_RUNNING;
 
        while (1) {
                struct mbuf *m = NULL, *m_loop = NULL;
                u_int32_t m_cnt, m_cnt_loop;
+               classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
                boolean_t proto_req;
 
-               lck_mtx_lock_spin(&inp->input_lck);
-
-               /* Wait until there is work to be done */
-               while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
-                       inp->input_waiting &= ~DLIL_INPUT_RUNNING;
-                       (void) msleep(&inp->input_waiting, &inp->input_lck,
-                           (PZERO - 1) | PSPIN, inp->input_name, NULL);
-               }
-
-               inp->input_waiting |= DLIL_INPUT_RUNNING;
                inp->input_waiting &= ~DLIL_INPUT_WAITING;
 
-               /* Main input thread cannot be terminated */
-               VERIFY(!(inp->input_waiting & DLIL_INPUT_TERMINATE));
-
                proto_req = (inp->input_waiting &
                    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
 
                /* Packets for non-dedicated interfaces other than lo0 */
                m_cnt = qlen(&inp->rcvq_pkts);
-               m = _getq_all(&inp->rcvq_pkts, NULL, NULL, NULL);
+               _getq_all(&inp->rcvq_pkts, &pkt, NULL, NULL, NULL);
+               m = pkt.cp_mbuf;
 
                /* Packets exclusive to lo0 */
                m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
-               m_loop = _getq_all(&inpm->lo_rcvq_pkts, NULL, NULL, NULL);
+               _getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
+               m_loop = pkt.cp_mbuf;
 
                inp->wtot = 0;
 
@@ -2044,26 +2146,41 @@ dlil_main_input_thread_func(void *v, wait_result_t w)
                 */
                if (m_loop != NULL) {
                        dlil_input_packet_list_extended(lo_ifp, m_loop,
-                           m_cnt_loop, inp->mode);
+                           m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
                }
 
                if (m != NULL) {
                        dlil_input_packet_list_extended(NULL, m,
-                           m_cnt, inp->mode);
+                           m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
                }
 
                if (proto_req) {
                        proto_input_run();
                }
+
+               lck_mtx_lock_spin(&inp->input_lck);
+               VERIFY(inp->input_waiting & DLIL_INPUT_RUNNING);
+               /* main input thread cannot be terminated */
+               VERIFY(!(inp->input_waiting & DLIL_INPUT_TERMINATE));
+               if (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
+                       break;
+               }
        }
 
-       /* NOTREACHED */
+       inp->input_waiting &= ~DLIL_INPUT_RUNNING;
+       (void) assert_wait(&inp->input_waiting, THREAD_UNINT);
+       lck_mtx_unlock(&inp->input_lck);
+       (void) thread_block_parameter(dlil_main_input_thread_cont, inp);
+
        VERIFY(0);      /* we should never get here */
+       /* NOTREACHED */
+       __builtin_unreachable();
 }
 
 /*
  * Input thread for interfaces with legacy input model.
  */
+__attribute__((noreturn))
 static void
 dlil_input_thread_func(void *v, wait_result_t w)
 {
@@ -2072,30 +2189,52 @@ dlil_input_thread_func(void *v, wait_result_t w)
        struct dlil_threading_info *inp = v;
        struct ifnet *ifp = inp->ifp;
 
-       /* Construct the name for this thread, and then apply it. */
+       VERIFY(inp != dlil_main_input_thread);
+       VERIFY(ifp != NULL);
+       VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
+           !(ifp->if_xflags & IFXF_LEGACY));
+       VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
+           !(ifp->if_xflags & IFXF_LEGACY));
+       VERIFY(current_thread() == inp->input_thr);
+
+       /* construct the name for this thread, and then apply it */
        bzero(thread_name, sizeof(thread_name));
-       snprintf(thread_name, sizeof(thread_name), "dlil_input_%s", ifp->if_xname);
+       (void) snprintf(thread_name, sizeof(thread_name),
+           "dlil_input_%s", ifp->if_xname);
        thread_set_thread_name(inp->input_thr, thread_name);
+       ifnet_decr_pending_thread_count(ifp);
 
-       VERIFY(inp != dlil_main_input_thread);
-       VERIFY(ifp != NULL);
-       VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll);
-       VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
+       lck_mtx_lock(&inp->input_lck);
+       VERIFY(!(inp->input_waiting & DLIL_INPUT_RUNNING));
+       (void) assert_wait(&inp->input_waiting, THREAD_UNINT);
+       lck_mtx_unlock(&inp->input_lck);
+       (void) thread_block_parameter(dlil_input_thread_cont, inp);
+       /* NOTREACHED */
+       __builtin_unreachable();
+}
+
+__attribute__((noreturn))
+static void
+dlil_input_thread_cont(void *v, wait_result_t wres)
+{
+       struct dlil_threading_info *inp = v;
+       struct ifnet *ifp = inp->ifp;
+
+       lck_mtx_lock_spin(&inp->input_lck);
+       if (__improbable(wres == THREAD_INTERRUPTED ||
+           (inp->input_waiting & DLIL_INPUT_TERMINATE))) {
+               goto terminate;
+       }
+
+       VERIFY(!(inp->input_waiting & DLIL_INPUT_RUNNING));
+       inp->input_waiting |= DLIL_INPUT_RUNNING;
 
        while (1) {
                struct mbuf *m = NULL;
+               classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
+               boolean_t notify = FALSE;
                u_int32_t m_cnt;
 
-               lck_mtx_lock_spin(&inp->input_lck);
-
-               /* Wait until there is work to be done */
-               while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
-                       inp->input_waiting &= ~DLIL_INPUT_RUNNING;
-                       (void) msleep(&inp->input_waiting, &inp->input_lck,
-                           (PZERO - 1) | PSPIN, inp->input_name, NULL);
-               }
-
-               inp->input_waiting |= DLIL_INPUT_RUNNING;
                inp->input_waiting &= ~DLIL_INPUT_WAITING;
 
                /*
@@ -2110,27 +2249,19 @@ dlil_input_thread_func(void *v, wait_result_t w)
 
                /* Packets for this interface */
                m_cnt = qlen(&inp->rcvq_pkts);
-               m = _getq_all(&inp->rcvq_pkts, NULL, NULL, NULL);
-
-               if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
-                       lck_mtx_unlock(&inp->input_lck);
-
-                       /* Free up pending packets */
-                       if (m != NULL) {
-                               mbuf_freem_list(m);
-                       }
-
-                       dlil_terminate_input_thread(inp);
-                       /* NOTREACHED */
-                       return;
-               }
+               _getq_all(&inp->rcvq_pkts, &pkt, NULL, NULL, NULL);
+               m = pkt.cp_mbuf;
 
                inp->wtot = 0;
 
-               dlil_input_stats_sync(ifp, inp);
+               notify = dlil_input_stats_sync(ifp, inp);
 
                lck_mtx_unlock(&inp->input_lck);
 
+               if (notify) {
+                       ifnet_notify_data_threshold(ifp);
+               }
+
                /*
                 * NOTE warning %%% attention !!!!
                 * We should think about putting some thread starvation
@@ -2138,38 +2269,97 @@ dlil_input_thread_func(void *v, wait_result_t w)
                 */
                if (m != NULL) {
                        dlil_input_packet_list_extended(NULL, m,
-                           m_cnt, inp->mode);
+                           m_cnt, ifp->if_poll_mode);
+               }
+
+               lck_mtx_lock_spin(&inp->input_lck);
+               VERIFY(inp->input_waiting & DLIL_INPUT_RUNNING);
+               if (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
+                       break;
                }
        }
 
-       /* NOTREACHED */
+       inp->input_waiting &= ~DLIL_INPUT_RUNNING;
+
+       if (__improbable(inp->input_waiting & DLIL_INPUT_TERMINATE)) {
+terminate:
+               lck_mtx_unlock(&inp->input_lck);
+               dlil_terminate_input_thread(inp);
+               /* NOTREACHED */
+       } else {
+               (void) assert_wait(&inp->input_waiting, THREAD_UNINT);
+               lck_mtx_unlock(&inp->input_lck);
+               (void) thread_block_parameter(dlil_input_thread_cont, inp);
+               /* NOTREACHED */
+       }
+
        VERIFY(0);      /* we should never get here */
+       /* NOTREACHED */
+       __builtin_unreachable();
 }
 
 /*
  * Input thread for interfaces with opportunistic polling input model.
  */
+__attribute__((noreturn))
 static void
 dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
 {
 #pragma unused(w)
+       char thread_name[MAXTHREADNAMESIZE];
        struct dlil_threading_info *inp = v;
        struct ifnet *ifp = inp->ifp;
-       struct timespec ts;
 
        VERIFY(inp != dlil_main_input_thread);
-       VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL));
+       VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
+           (ifp->if_xflags & IFXF_LEGACY));
+       VERIFY(current_thread() == inp->input_thr);
+
+       /* construct the name for this thread, and then apply it */
+       bzero(thread_name, sizeof(thread_name));
+       (void) snprintf(thread_name, sizeof(thread_name),
+           "dlil_input_poll_%s", ifp->if_xname);
+       thread_set_thread_name(inp->input_thr, thread_name);
+       ifnet_decr_pending_thread_count(ifp);
+
+       lck_mtx_lock(&inp->input_lck);
+       VERIFY(!(inp->input_waiting & DLIL_INPUT_RUNNING));
+       (void) assert_wait(&inp->input_waiting, THREAD_UNINT);
+       lck_mtx_unlock(&inp->input_lck);
+       (void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
+       /* NOTREACHED */
+       __builtin_unreachable();
+}
+
+__attribute__((noreturn))
+static void
+dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
+{
+       struct dlil_threading_info *inp = v;
+       struct ifnet *ifp = inp->ifp;
+       struct timespec ts;
+
+       lck_mtx_lock_spin(&inp->input_lck);
+       if (__improbable(wres == THREAD_INTERRUPTED ||
+           (inp->input_waiting & DLIL_INPUT_TERMINATE))) {
+               goto terminate;
+       }
+
+       VERIFY(!(inp->input_waiting & DLIL_INPUT_RUNNING));
+       inp->input_waiting |= DLIL_INPUT_RUNNING;
 
        while (1) {
                struct mbuf *m = NULL;
                u_int32_t m_cnt, m_size, poll_req = 0;
                ifnet_model_t mode;
                struct timespec now, delta;
+               classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
+               boolean_t notify;
                u_int64_t ival;
 
-               lck_mtx_lock_spin(&inp->input_lck);
+               inp->input_waiting &= ~DLIL_INPUT_WAITING;
 
-               if ((ival = inp->rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
+               if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
                        ival = IF_RXPOLL_INTERVALTIME_MIN;
                }
 
@@ -2180,17 +2370,7 @@ dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
                }
 
                /* Current operating mode */
-               mode = inp->mode;
-
-               /* Wait until there is work to be done */
-               while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
-                       inp->input_waiting &= ~DLIL_INPUT_RUNNING;
-                       (void) msleep(&inp->input_waiting, &inp->input_lck,
-                           (PZERO - 1) | PSPIN, inp->input_name, NULL);
-               }
-
-               inp->input_waiting |= DLIL_INPUT_RUNNING;
-               inp->input_waiting &= ~DLIL_INPUT_WAITING;
+               mode = ifp->if_poll_mode;
 
                /*
                 * Protocol registration and injection must always use
@@ -2202,22 +2382,6 @@ dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
                VERIFY(!(inp->input_waiting &
                    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
 
-               if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
-                       /* Free up pending packets */
-                       lck_mtx_convert_spin(&inp->input_lck);
-                       _flushq(&inp->rcvq_pkts);
-                       if (inp->input_mit_tcall != NULL) {
-                               if (thread_call_isactive(inp->input_mit_tcall)) {
-                                       thread_call_cancel(inp->input_mit_tcall);
-                               }
-                       }
-                       lck_mtx_unlock(&inp->input_lck);
-
-                       dlil_terminate_input_thread(inp);
-                       /* NOTREACHED */
-                       return;
-               }
-
                /* Total count of all packets */
                m_cnt = qlen(&inp->rcvq_pkts);
 
@@ -2225,116 +2389,121 @@ dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
                m_size = qsize(&inp->rcvq_pkts);
 
                /* Packets for this interface */
-               m = _getq_all(&inp->rcvq_pkts, NULL, NULL, NULL);
+               _getq_all(&inp->rcvq_pkts, &pkt, NULL, NULL, NULL);
+               m = pkt.cp_mbuf;
                VERIFY(m != NULL || m_cnt == 0);
 
                nanouptime(&now);
-               if (!net_timerisset(&inp->sample_lasttime)) {
-                       *(&inp->sample_lasttime) = *(&now);
+               if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
+                       *(&ifp->if_poll_sample_lasttime) = *(&now);
                }
 
-               net_timersub(&now, &inp->sample_lasttime, &delta);
-               if (if_rxpoll && net_timerisset(&inp->sample_holdtime)) {
+               net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
+               if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
                        u_int32_t ptot, btot;
 
                        /* Accumulate statistics for current sampling */
-                       PKTCNTR_ADD(&inp->sstats, m_cnt, m_size);
+                       PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
 
-                       if (net_timercmp(&delta, &inp->sample_holdtime, <)) {
+                       if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
                                goto skip;
                        }
 
-                       *(&inp->sample_lasttime) = *(&now);
+                       *(&ifp->if_poll_sample_lasttime) = *(&now);
 
                        /* Calculate min/max of inbound bytes */
-                       btot = (u_int32_t)inp->sstats.bytes;
-                       if (inp->rxpoll_bmin == 0 || inp->rxpoll_bmin > btot) {
-                               inp->rxpoll_bmin = btot;
+                       btot = (u_int32_t)ifp->if_poll_sstats.bytes;
+                       if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
+                               ifp->if_rxpoll_bmin = btot;
                        }
-                       if (btot > inp->rxpoll_bmax) {
-                               inp->rxpoll_bmax = btot;
+                       if (btot > ifp->if_rxpoll_bmax) {
+                               ifp->if_rxpoll_bmax = btot;
                        }
 
                        /* Calculate EWMA of inbound bytes */
-                       DLIL_EWMA(inp->rxpoll_bavg, btot, if_rxpoll_decay);
+                       DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
 
                        /* Calculate min/max of inbound packets */
-                       ptot = (u_int32_t)inp->sstats.packets;
-                       if (inp->rxpoll_pmin == 0 || inp->rxpoll_pmin > ptot) {
-                               inp->rxpoll_pmin = ptot;
+                       ptot = (u_int32_t)ifp->if_poll_sstats.packets;
+                       if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
+                               ifp->if_rxpoll_pmin = ptot;
                        }
-                       if (ptot > inp->rxpoll_pmax) {
-                               inp->rxpoll_pmax = ptot;
+                       if (ptot > ifp->if_rxpoll_pmax) {
+                               ifp->if_rxpoll_pmax = ptot;
                        }
 
                        /* Calculate EWMA of inbound packets */
-                       DLIL_EWMA(inp->rxpoll_pavg, ptot, if_rxpoll_decay);
+                       DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
 
                        /* Reset sampling statistics */
-                       PKTCNTR_CLEAR(&inp->sstats);
+                       PKTCNTR_CLEAR(&ifp->if_poll_sstats);
 
                        /* Calculate EWMA of wakeup requests */
-                       DLIL_EWMA(inp->rxpoll_wavg, inp->wtot, if_rxpoll_decay);
+                       DLIL_EWMA(ifp->if_rxpoll_wavg, inp->wtot, if_rxpoll_decay);
                        inp->wtot = 0;
 
                        if (dlil_verbose) {
-                               if (!net_timerisset(&inp->dbg_lasttime)) {
-                                       *(&inp->dbg_lasttime) = *(&now);
+                               if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
+                                       *(&ifp->if_poll_dbg_lasttime) = *(&now);
                                }
-                               net_timersub(&now, &inp->dbg_lasttime, &delta);
+                               net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
                                if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
-                                       *(&inp->dbg_lasttime) = *(&now);
-                                       printf("%s: [%s] pkts avg %d max %d "
+                                       *(&ifp->if_poll_dbg_lasttime) = *(&now);
+                                       DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
                                            "limits [%d/%d], wreq avg %d "
                                            "limits [%d/%d], bytes avg %d "
                                            "limits [%d/%d]\n", if_name(ifp),
-                                           (inp->mode ==
+                                           (ifp->if_poll_mode ==
                                            IFNET_MODEL_INPUT_POLL_ON) ?
-                                           "ON" : "OFF", inp->rxpoll_pavg,
-                                           inp->rxpoll_pmax,
-                                           inp->rxpoll_plowat,
-                                           inp->rxpoll_phiwat,
-                                           inp->rxpoll_wavg,
-                                           inp->rxpoll_wlowat,
-                                           inp->rxpoll_whiwat,
-                                           inp->rxpoll_bavg,
-                                           inp->rxpoll_blowat,
-                                           inp->rxpoll_bhiwat);
+                                           "ON" : "OFF", ifp->if_rxpoll_pavg,
+                                           ifp->if_rxpoll_pmax,
+                                           ifp->if_rxpoll_plowat,
+                                           ifp->if_rxpoll_phiwat,
+                                           ifp->if_rxpoll_wavg,
+                                           ifp->if_rxpoll_wlowat,
+                                           ifp->if_rxpoll_whiwat,
+                                           ifp->if_rxpoll_bavg,
+                                           ifp->if_rxpoll_blowat,
+                                           ifp->if_rxpoll_bhiwat);
                                }
                        }
 
                        /* Perform mode transition, if necessary */
-                       if (!net_timerisset(&inp->mode_lasttime)) {
-                               *(&inp->mode_lasttime) = *(&now);
+                       if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
+                               *(&ifp->if_poll_mode_lasttime) = *(&now);
                        }
 
-                       net_timersub(&now, &inp->mode_lasttime, &delta);
-                       if (net_timercmp(&delta, &inp->mode_holdtime, <)) {
+                       net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
+                       if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
                                goto skip;
                        }
 
-                       if (inp->rxpoll_pavg <= inp->rxpoll_plowat &&
-                           inp->rxpoll_bavg <= inp->rxpoll_blowat &&
-                           inp->mode != IFNET_MODEL_INPUT_POLL_OFF) {
+                       if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
+                           ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
+                           ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
                                mode = IFNET_MODEL_INPUT_POLL_OFF;
-                       } else if (inp->rxpoll_pavg >= inp->rxpoll_phiwat &&
-                           (inp->rxpoll_bavg >= inp->rxpoll_bhiwat ||
-                           inp->rxpoll_wavg >= inp->rxpoll_whiwat) &&
-                           inp->mode != IFNET_MODEL_INPUT_POLL_ON) {
+                       } else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
+                           (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
+                           ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
+                           ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
                                mode = IFNET_MODEL_INPUT_POLL_ON;
                        }
 
-                       if (mode != inp->mode) {
-                               inp->mode = mode;
-                               *(&inp->mode_lasttime) = *(&now);
+                       if (mode != ifp->if_poll_mode) {
+                               ifp->if_poll_mode = mode;
+                               *(&ifp->if_poll_mode_lasttime) = *(&now);
                                poll_req++;
                        }
                }
 skip:
-               dlil_input_stats_sync(ifp, inp);
+               notify = dlil_input_stats_sync(ifp, inp);
 
                lck_mtx_unlock(&inp->input_lck);
 
+               if (notify) {
+                       ifnet_notify_data_threshold(ifp);
+               }
+
                /*
                 * If there's a mode change and interface is still attached,
                 * perform a downcall to the driver for the new mode.  Also
@@ -2342,27 +2511,29 @@ skip:
                 * being detached (will be released below.)
                 */
                if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
-                       struct ifnet_model_params p = { mode, { 0 } };
+                       struct ifnet_model_params p = {
+                               .model = mode, .reserved = { 0 }
+                       };
                        errno_t err;
 
                        if (dlil_verbose) {
-                               printf("%s: polling is now %s, "
+                               DLIL_PRINTF("%s: polling is now %s, "
                                    "pkts avg %d max %d limits [%d/%d], "
                                    "wreq avg %d limits [%d/%d], "
                                    "bytes avg %d limits [%d/%d]\n",
                                    if_name(ifp),
                                    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
-                                   "ON" : "OFF", inp->rxpoll_pavg,
-                                   inp->rxpoll_pmax, inp->rxpoll_plowat,
-                                   inp->rxpoll_phiwat, inp->rxpoll_wavg,
-                                   inp->rxpoll_wlowat, inp->rxpoll_whiwat,
-                                   inp->rxpoll_bavg, inp->rxpoll_blowat,
-                                   inp->rxpoll_bhiwat);
+                                   "ON" : "OFF", ifp->if_rxpoll_pavg,
+                                   ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
+                                   ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
+                                   ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
+                                   ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
+                                   ifp->if_rxpoll_bhiwat);
                        }
 
                        if ((err = ((*ifp->if_input_ctl)(ifp,
                            IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
-                               printf("%s: error setting polling mode "
+                               DLIL_PRINTF("%s: error setting polling mode "
                                    "to %s (%d)\n", if_name(ifp),
                                    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
                                    "ON" : "OFF", err);
@@ -2371,9 +2542,9 @@ skip:
                        switch (mode) {
                        case IFNET_MODEL_INPUT_POLL_OFF:
                                ifnet_set_poll_cycle(ifp, NULL);
-                               inp->rxpoll_offreq++;
+                               ifp->if_rxpoll_offreq++;
                                if (err != 0) {
-                                       inp->rxpoll_offerr++;
+                                       ifp->if_rxpoll_offerr++;
                                }
                                break;
 
@@ -2381,9 +2552,9 @@ skip:
                                net_nsectimer(&ival, &ts);
                                ifnet_set_poll_cycle(ifp, &ts);
                                ifnet_poll(ifp);
-                               inp->rxpoll_onreq++;
+                               ifp->if_rxpoll_onreq++;
                                if (err != 0) {
-                                       inp->rxpoll_onerr++;
+                                       ifp->if_rxpoll_onerr++;
                                }
                                break;
 
@@ -2404,28 +2575,37 @@ skip:
                if (m != NULL) {
                        dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
                }
+
+               lck_mtx_lock_spin(&inp->input_lck);
+               VERIFY(inp->input_waiting & DLIL_INPUT_RUNNING);
+               if (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
+                       break;
+               }
+       }
+
+       inp->input_waiting &= ~DLIL_INPUT_RUNNING;
+
+       if (__improbable(inp->input_waiting & DLIL_INPUT_TERMINATE)) {
+terminate:
+               lck_mtx_unlock(&inp->input_lck);
+               dlil_terminate_input_thread(inp);
+               /* NOTREACHED */
+       } else {
+               (void) assert_wait(&inp->input_waiting, THREAD_UNINT);
+               lck_mtx_unlock(&inp->input_lck);
+               (void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
+                   inp);
+               /* NOTREACHED */
        }
 
-       /* NOTREACHED */
        VERIFY(0);      /* we should never get here */
+       /* NOTREACHED */
+       __builtin_unreachable();
 }
 
-/*
- * Must be called on an attached ifnet (caller is expected to check.)
- * Caller may pass NULL for poll parameters to indicate "auto-tuning."
- */
 errno_t
-dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
-    boolean_t locked)
+dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
 {
-       struct dlil_threading_info *inp;
-       u_int64_t sample_holdtime, inbw;
-
-       VERIFY(ifp != NULL);
-       if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
-               return ENXIO;
-       }
-
        if (p != NULL) {
                if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
                    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
@@ -2448,33 +2628,22 @@ dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
                        p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
                }
        }
+       return 0;
+}
 
-       if (!locked) {
-               lck_mtx_lock(&inp->input_lck);
-       }
-
-       LCK_MTX_ASSERT(&inp->input_lck, LCK_MTX_ASSERT_OWNED);
-
-       /*
-        * Normally, we'd reset the parameters to the auto-tuned values
-        * if the the input thread detects a change in link rate.  If the
-        * driver provides its own parameters right after a link rate
-        * changes, but before the input thread gets to run, we want to
-        * make sure to keep the driver's values.  Clearing if_poll_update
-        * will achieve that.
-        */
-       if (p != NULL && !locked && ifp->if_poll_update != 0) {
-               ifp->if_poll_update = 0;
-       }
+void
+dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
+{
+       u_int64_t sample_holdtime, inbw;
 
        if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
                sample_holdtime = 0;    /* polling is disabled */
-               inp->rxpoll_wlowat = inp->rxpoll_plowat =
-                   inp->rxpoll_blowat = 0;
-               inp->rxpoll_whiwat = inp->rxpoll_phiwat =
-                   inp->rxpoll_bhiwat = (u_int32_t)-1;
-               inp->rxpoll_plim = 0;
-               inp->rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
+               ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
+                   ifp->if_rxpoll_blowat = 0;
+               ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
+                   ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
+               ifp->if_rxpoll_plim = 0;
+               ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
        } else {
                u_int32_t plowat, phiwat, blowat, bhiwat, plim;
                u_int64_t ival;
@@ -2505,33 +2674,71 @@ dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
                VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);
 
                sample_holdtime = if_rxpoll_sample_holdtime;
-               inp->rxpoll_wlowat = if_rxpoll_wlowat;
-               inp->rxpoll_whiwat = if_rxpoll_whiwat;
-               inp->rxpoll_plowat = plowat;
-               inp->rxpoll_phiwat = phiwat;
-               inp->rxpoll_blowat = blowat;
-               inp->rxpoll_bhiwat = bhiwat;
-               inp->rxpoll_plim = plim;
-               inp->rxpoll_ival = ival;
+               ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
+               ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
+               ifp->if_rxpoll_plowat = plowat;
+               ifp->if_rxpoll_phiwat = phiwat;
+               ifp->if_rxpoll_blowat = blowat;
+               ifp->if_rxpoll_bhiwat = bhiwat;
+               ifp->if_rxpoll_plim = plim;
+               ifp->if_rxpoll_ival = ival;
        }
 
-       net_nsectimer(&if_rxpoll_mode_holdtime, &inp->mode_holdtime);
-       net_nsectimer(&sample_holdtime, &inp->sample_holdtime);
+       net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
+       net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);
 
        if (dlil_verbose) {
-               printf("%s: speed %llu bps, sample per %llu nsec, "
+               DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
                    "poll interval %llu nsec, pkts per poll %u, "
                    "pkt limits [%u/%u], wreq limits [%u/%u], "
                    "bytes limits [%u/%u]\n", if_name(ifp),
-                   inbw, sample_holdtime, inp->rxpoll_ival, inp->rxpoll_plim,
-                   inp->rxpoll_plowat, inp->rxpoll_phiwat, inp->rxpoll_wlowat,
-                   inp->rxpoll_whiwat, inp->rxpoll_blowat, inp->rxpoll_bhiwat);
+                   inbw, sample_holdtime, ifp->if_rxpoll_ival,
+                   ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
+                   ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
+                   ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
+                   ifp->if_rxpoll_bhiwat);
+       }
+}
+
+/*
+ * Must be called on an attached ifnet (caller is expected to check.)
+ * Caller may pass NULL for poll parameters to indicate "auto-tuning."
+ */
+errno_t
+dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
+    boolean_t locked)
+{
+       errno_t err;
+       struct dlil_threading_info *inp;
+
+       VERIFY(ifp != NULL);
+       if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
+               return ENXIO;
+       }
+       err = dlil_rxpoll_validate_params(p);
+       if (err != 0) {
+               return err;
        }
 
+       if (!locked) {
+               lck_mtx_lock(&inp->input_lck);
+       }
+       LCK_MTX_ASSERT(&inp->input_lck, LCK_MTX_ASSERT_OWNED);
+       /*
+        * Normally, we'd reset the parameters to the auto-tuned values
+        * if the input thread detects a change in link rate.  If the
+        * driver provides its own parameters right after the link rate
+        * changes, but before the input thread gets to run, we want to
+        * make sure to keep the driver's values.  Clearing if_poll_update
+        * will achieve that.
+        */
+       if (p != NULL && !locked && ifp->if_poll_update != 0) {
+               ifp->if_poll_update = 0;
+       }
+       dlil_rxpoll_update_params(ifp, p);
        if (!locked) {
                lck_mtx_unlock(&inp->input_lck);
        }
-
        return 0;
 }
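
A minimal usage sketch (hypothetical driver code, not part of this commit)
of the refactored entry point above: per dlil_rxpoll_validate_params(),
packets_lowat and packets_hiwat must be both zero or both nonzero, and
interval_time is raised to IF_RXPOLL_INTERVALTIME_MIN when below the floor.

    static errno_t
    hypo_driver_set_poll_params(struct ifnet *ifp)
    {
            struct ifnet_poll_params p;

            bzero(&p, sizeof(p));
            p.packets_lowat = 8;            /* both watermarks nonzero ... */
            p.packets_hiwat = 64;           /* ... or both must be zero */
            p.interval_time = 1000 * 1000;  /* 1 ms, in nanoseconds */
            /* FALSE: let dlil_rxpoll_set_params() take input_lck itself */
            return dlil_rxpoll_set_params(ifp, &p, FALSE);
    }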
 
@@ -2551,12 +2758,12 @@ dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
        bzero(p, sizeof(*p));
 
        lck_mtx_lock(&inp->input_lck);
-       p->packets_limit = inp->rxpoll_plim;
-       p->packets_lowat = inp->rxpoll_plowat;
-       p->packets_hiwat = inp->rxpoll_phiwat;
-       p->bytes_lowat = inp->rxpoll_blowat;
-       p->bytes_hiwat = inp->rxpoll_bhiwat;
-       p->interval_time = inp->rxpoll_ival;
+       p->packets_limit = ifp->if_rxpoll_plim;
+       p->packets_lowat = ifp->if_rxpoll_plowat;
+       p->packets_hiwat = ifp->if_rxpoll_phiwat;
+       p->bytes_lowat = ifp->if_rxpoll_blowat;
+       p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
+       p->interval_time = ifp->if_rxpoll_ival;
        lck_mtx_unlock(&inp->input_lck);
 
        return 0;
@@ -2576,6 +2783,14 @@ ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
        return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
 }
 
+errno_t
+ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
+    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
+{
+       return ifnet_input_common(ifp, m_head, m_tail, s,
+                  (m_head != NULL), TRUE);
+}
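
A hedged sketch (names hypothetical) of how a polling driver might hand a
harvested chain back through the new wrapper; the stats parameter is
mandatory because the extended input path consumes it.

    static void
    hypo_deliver_polled(struct ifnet *ifp, struct mbuf *m_head,
        struct mbuf *m_tail, u_int32_t cnt, u_int32_t totlen)
    {
            struct ifnet_stat_increment_param s;

            bzero(&s, sizeof(s));
            s.packets_in = cnt;     /* required by the extended variant */
            s.bytes_in = totlen;
            (void) ifnet_input_poll(ifp, m_head, m_tail, &s);
    }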
+
 static errno_t
 ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
     const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
@@ -2602,7 +2817,7 @@ ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
         * interface is no longer attached; else hold an IO refcnt to
         * prevent it from being detached (will be released below.)
         */
-       if (ifp == NULL || (ifp != lo_ifp && !ifnet_is_attached(ifp, 1))) {
+       if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
                if (m_head != NULL) {
                        mbuf_freem_list(m_head);
                }
@@ -2685,7 +2900,7 @@ ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
 
        if (ifp != lo_ifp) {
                /* Release the IO refcnt */
-               ifnet_decr_iorefcnt(ifp);
+               ifnet_datamov_end(ifp);
        }
 
        return err;
@@ -2706,6 +2921,7 @@ dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
        struct dlil_threading_info *inp;
        u_int32_t m_cnt = s->packets_in;
        u_int32_t m_size = s->bytes_in;
+       boolean_t notify = FALSE;
 
        if ((inp = ifp->if_inp) == NULL) {
                inp = dlil_main_input_thread;
@@ -2753,13 +2969,16 @@ dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
         * dedicated input threads go to the regular list.
         */
        if (m_head != NULL) {
+               classq_pkt_t head, tail;
+               CLASSQ_PKT_INIT_MBUF(&head, m_head);
+               CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
                if (inp == dlil_main_input_thread && ifp == lo_ifp) {
                        struct dlil_main_threading_info *inpm =
                            (struct dlil_main_threading_info *)inp;
-                       _addq_multi(&inpm->lo_rcvq_pkts, m_head, m_tail,
+                       _addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
                            m_cnt, m_size);
                } else {
-                       _addq_multi(&inp->rcvq_pkts, m_head, m_tail,
+                       _addq_multi(&inp->rcvq_pkts, &head, &tail,
                            m_cnt, m_size);
                }
        }
@@ -2784,7 +3003,7 @@ dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
        }
 #endif /* IFNET_INPUT_SANITY_CHK */
 
-       dlil_input_stats_add(s, inp, poll);
+       dlil_input_stats_add(s, inp, ifp, poll);
        /*
         * If we're using the main input thread, synchronize the
         * stats now since we have the interface context.  All
@@ -2792,31 +3011,20 @@ dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
         * have their stats synchronized there.
         */
        if (inp == dlil_main_input_thread) {
-               dlil_input_stats_sync(ifp, inp);
-       }
-
-       if (inp->input_mit_tcall &&
-           qlen(&inp->rcvq_pkts) >= dlil_rcv_mit_pkts_min &&
-           qlen(&inp->rcvq_pkts) < dlil_rcv_mit_pkts_max &&
-           (ifp->if_family == IFNET_FAMILY_ETHERNET ||
-           ifp->if_type == IFT_CELLULAR)
-           ) {
-               if (!thread_call_isactive(inp->input_mit_tcall)) {
-                       uint64_t deadline;
-                       clock_interval_to_deadline(dlil_rcv_mit_interval,
-                           1, &deadline);
-                       (void) thread_call_enter_delayed(
-                               inp->input_mit_tcall, deadline);
-               }
-       } else {
-               inp->input_waiting |= DLIL_INPUT_WAITING;
-               if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) {
-                       inp->wtot++;
-                       wakeup_one((caddr_t)&inp->input_waiting);
-               }
+               notify = dlil_input_stats_sync(ifp, inp);
+       }
+
+       inp->input_waiting |= DLIL_INPUT_WAITING;
+       if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) {
+               inp->wtot++;
+               wakeup_one((caddr_t)&inp->input_waiting);
        }
        lck_mtx_unlock(&inp->input_lck);
 
+       if (notify) {
+               ifnet_notify_data_threshold(ifp);
+       }
+
        return 0;
 }
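
The notify flag distills a lock-ordering rule: dlil_input_stats_sync() now
only reports whether if_data_threshold is armed, and the potentially heavier
ifnet_notify_data_threshold() call runs only after input_lck is dropped.
A minimal sketch of the pattern:

    static void
    hypo_sync_and_notify(struct dlil_threading_info *inp, struct ifnet *ifp)
    {
            boolean_t notify;

            lck_mtx_lock(&inp->input_lck);
            notify = dlil_input_stats_sync(ifp, inp);  /* decide under lock */
            lck_mtx_unlock(&inp->input_lck);

            if (notify) {
                    ifnet_notify_data_threshold(ifp);  /* act after unlock */
            }
    }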
 
@@ -2858,22 +3066,20 @@ ifnet_start(struct ifnet *ifp)
        ifnet_start_common(ifp, FALSE);
 }
 
+__attribute__((noreturn))
 static void
-ifnet_start_thread_fn(void *v, wait_result_t w)
+ifnet_start_thread_func(void *v, wait_result_t w)
 {
 #pragma unused(w)
        struct ifnet *ifp = v;
-       char ifname[IFNAMSIZ + 1];
        char thread_name[MAXTHREADNAMESIZE];
-       struct timespec *ts = NULL;
-       struct ifclassq *ifq = &ifp->if_snd;
-       struct timespec delay_start_ts;
 
        /* Construct the name for this thread, and then apply it. */
        bzero(thread_name, sizeof(thread_name));
        (void) snprintf(thread_name, sizeof(thread_name),
            "ifnet_start_%s", ifp->if_xname);
-       thread_set_thread_name(ifp->if_start_thread, thread_name);
+       ASSERT(ifp->if_start_thread == current_thread());
+       thread_set_thread_name(current_thread(), thread_name);
 
        /*
         * Treat the dedicated starter thread for lo0 as equivalent to
@@ -2901,86 +3107,89 @@ ifnet_start_thread_fn(void *v, wait_result_t w)
                        lck_mtx_unlock(&inp->input_lck);
                }
        }
+       ifnet_decr_pending_thread_count(ifp);
 
-       (void) snprintf(ifname, sizeof(ifname), "%s_starter", if_name(ifp));
+       lck_mtx_lock(&ifp->if_start_lock);
+       VERIFY(!ifp->if_start_active);
+       (void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
+       lck_mtx_unlock(&ifp->if_start_lock);
+       (void) thread_block_parameter(ifnet_start_thread_cont, ifp);
+       /* NOTREACHED */
+       __builtin_unreachable();
+}
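
The msleep() loop is replaced by a stackless continuation: the thread parks
with assert_wait() and passes ifnet_start_thread_cont to
thread_block_parameter(), so a wakeup resumes execution in the continuation
function on a fresh stack instead of returning from a sleep.  The bare
pattern, as a sketch with hypothetical names:

    static int hypo_event;

    static void hypo_worker_cont(void *v, wait_result_t wres);

    __attribute__((noreturn))
    static void
    hypo_worker_setup(void *v, wait_result_t w)
    {
    #pragma unused(w)
            /* park; this stack frame is not kept across the block */
            (void) assert_wait(&hypo_event, THREAD_UNINT);
            /* wakeups resume in hypo_worker_cont(), never here */
            (void) thread_block_parameter(hypo_worker_cont, v);
            /* NOTREACHED */
            __builtin_unreachable();
    }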
 
-       lck_mtx_lock_spin(&ifp->if_start_lock);
+__attribute__((noreturn))
+static void
+ifnet_start_thread_cont(void *v, wait_result_t wres)
+{
+       struct ifnet *ifp = v;
+       struct ifclassq *ifq = &ifp->if_snd;
 
-       for (;;) {
-               if (ifp->if_start_thread != NULL) {
-                       (void) msleep(&ifp->if_start_thread,
-                           &ifp->if_start_lock,
-                           (PZERO - 1) | PSPIN, ifname, ts);
-               }
-               /* interface is detached? */
-               if (ifp->if_start_thread == THREAD_NULL) {
-                       ifnet_set_start_cycle(ifp, NULL);
-                       lck_mtx_unlock(&ifp->if_start_lock);
-                       ifnet_purge(ifp);
+       lck_mtx_lock(&ifp->if_start_lock);
+       if (__improbable(wres == THREAD_INTERRUPTED ||
+           ifp->if_start_thread == THREAD_NULL)) {
+               goto terminate;
+       }
 
-                       if (dlil_verbose) {
-                               printf("%s: starter thread terminated\n",
-                                   if_name(ifp));
-                       }
+       ifp->if_start_active = 1;
 
-                       /* for the extra refcnt from kernel_thread_start() */
-                       thread_deallocate(current_thread());
-                       /* this is the end */
-                       thread_terminate(current_thread());
-                       /* NOTREACHED */
-                       return;
+       /*
+        * Keep servicing until there are no more requests.
+        */
+       for (;;) {
+               u_int32_t req = ifp->if_start_req;
+               if (!IFCQ_IS_EMPTY(ifq) &&
+                   (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
+                   ifp->if_start_delayed == 0 &&
+                   IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
+                   (ifp->if_eflags & IFEF_DELAY_START)) {
+                       ifp->if_start_delayed = 1;
+                       ifnet_start_delayed++;
+                       break;
+               } else {
+                       ifp->if_start_delayed = 0;
                }
+               lck_mtx_unlock(&ifp->if_start_lock);
 
-               ifp->if_start_active = 1;
+               /*
+                * If no longer attached, don't call start because ifp
+                * is being destroyed; else hold an IO refcnt to
+                * prevent the interface from being detached (will be
+                * released below.)
+                */
+               if (!ifnet_datamov_begin(ifp)) {
+                       lck_mtx_lock_spin(&ifp->if_start_lock);
+                       break;
+               }
 
-               for (;;) {
-                       u_int32_t req = ifp->if_start_req;
-                       if (!IFCQ_IS_EMPTY(ifq) &&
-                           (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
-                           ifp->if_start_delayed == 0 &&
-                           IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
-                           (ifp->if_eflags & IFEF_DELAY_START)) {
-                               ifp->if_start_delayed = 1;
-                               ifnet_start_delayed++;
-                               break;
-                       } else {
-                               ifp->if_start_delayed = 0;
-                       }
-                       lck_mtx_unlock(&ifp->if_start_lock);
+               /* invoke the driver's start routine */
+               ((*ifp->if_start)(ifp));
 
-                       /*
-                        * If no longer attached, don't call start because ifp
-                        * is being destroyed; else hold an IO refcnt to
-                        * prevent the interface from being detached (will be
-                        * released below.)
-                        */
-                       if (!ifnet_is_attached(ifp, 1)) {
-                               lck_mtx_lock_spin(&ifp->if_start_lock);
-                               break;
-                       }
+               /*
+                * Release the io ref count taken above.
+                */
+               ifnet_datamov_end(ifp);
 
-                       /* invoke the driver's start routine */
-                       ((*ifp->if_start)(ifp));
+               lck_mtx_lock_spin(&ifp->if_start_lock);
 
-                       /*
-                        * Release the io ref count taken by ifnet_is_attached.
-                        */
-                       ifnet_decr_iorefcnt(ifp);
+               /*
+                * If there's no pending request or if the
+                * interface has been disabled, we're done.
+                */
+               if (req == ifp->if_start_req ||
+                   (ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) {
+                       break;
+               }
+       }
 
-                       lck_mtx_lock_spin(&ifp->if_start_lock);
+       ifp->if_start_req = 0;
+       ifp->if_start_active = 0;
 
-                       /*
-                        * If there's no pending request or if the
-                        * interface has been disabled, we're done.
-                        */
-                       if (req == ifp->if_start_req ||
-                           (ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) {
-                               break;
-                       }
-               }
 
-               ifp->if_start_req = 0;
-               ifp->if_start_active = 0;
+       if (__probable(ifp->if_start_thread != THREAD_NULL)) {
+               uint64_t deadline = TIMEOUT_WAIT_FOREVER;
+               struct timespec delay_start_ts;
+               struct timespec *ts;
 
                /*
                 * Wakeup N ns from now if rate-controlled by TBR, and if
@@ -3000,9 +3209,40 @@ ifnet_start_thread_fn(void *v, wait_result_t w)
                if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
                        ts = NULL;
                }
+
+               if (__improbable(ts != NULL)) {
+                       clock_interval_to_deadline((ts->tv_nsec +
+                           (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
+               }
+
+               (void) assert_wait_deadline(&ifp->if_start_thread,
+                   THREAD_UNINT, deadline);
+               lck_mtx_unlock(&ifp->if_start_lock);
+               (void) thread_block_parameter(ifnet_start_thread_cont, ifp);
+               /* NOTREACHED */
+       } else {
+terminate:
+               /* interface is detached? */
+               ifnet_set_start_cycle(ifp, NULL);
+               lck_mtx_unlock(&ifp->if_start_lock);
+               ifnet_purge(ifp);
+
+               if (dlil_verbose) {
+                       DLIL_PRINTF("%s: starter thread terminated\n",
+                           if_name(ifp));
+               }
+
+               /* for the extra refcnt from kernel_thread_start() */
+               thread_deallocate(current_thread());
+               /* this is the end */
+               thread_terminate(current_thread());
+               /* NOTREACHED */
        }
 
+       /* must never get here */
+       VERIFY(0);
        /* NOTREACHED */
+       __builtin_unreachable();
 }
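
A worked example of the deadline math above: for a restart interval of
ts = {0, 1000000} (1 ms), the interval handed to clock_interval_to_deadline()
is 0 * NSEC_PER_SEC + 1000000 = 1000000, and with a scale factor of 1
(nanoseconds) the resulting absolute deadline is "now + 1 ms", bounding the
assert_wait_deadline() sleep; when ts is NULL the deadline stays
TIMEOUT_WAIT_FOREVER and the thread sleeps until the next explicit wakeup.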
 
 void
@@ -3015,12 +3255,12 @@ ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
        }
 
        if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
-               printf("%s: restart interval set to %lu nsec\n",
+               DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
                    if_name(ifp), ts->tv_nsec);
        }
 }
 
-static void
+void
 ifnet_poll(struct ifnet *ifp)
 {
        /*
@@ -3028,134 +3268,149 @@ ifnet_poll(struct ifnet *ifp)
         */
        lck_mtx_lock_spin(&ifp->if_poll_lock);
        ifp->if_poll_req++;
-       if (!ifp->if_poll_active && ifp->if_poll_thread != THREAD_NULL) {
+       if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
+           ifp->if_poll_thread != THREAD_NULL) {
                wakeup_one((caddr_t)&ifp->if_poll_thread);
        }
        lck_mtx_unlock(&ifp->if_poll_lock);
 }
 
+__attribute__((noreturn))
 static void
-ifnet_poll_thread_fn(void *v, wait_result_t w)
+ifnet_poll_thread_func(void *v, wait_result_t w)
 {
 #pragma unused(w)
+       char thread_name[MAXTHREADNAMESIZE];
+       struct ifnet *ifp = v;
+
+       VERIFY(ifp->if_eflags & IFEF_RXPOLL);
+       VERIFY(current_thread() == ifp->if_poll_thread);
+
+       /* construct the name for this thread, and then apply it */
+       bzero(thread_name, sizeof(thread_name));
+       (void) snprintf(thread_name, sizeof(thread_name),
+           "ifnet_poller_%s", ifp->if_xname);
+       thread_set_thread_name(ifp->if_poll_thread, thread_name);
+       ifnet_decr_pending_thread_count(ifp);
+
+       lck_mtx_lock(&ifp->if_poll_lock);
+       (void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
+       lck_mtx_unlock(&ifp->if_poll_lock);
+       (void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
+       /* NOTREACHED */
+       __builtin_unreachable();
+}
+
+__attribute__((noreturn))
+static void
+ifnet_poll_thread_cont(void *v, wait_result_t wres)
+{
        struct dlil_threading_info *inp;
        struct ifnet *ifp = v;
-       char ifname[IFNAMSIZ + 1];
-       struct timespec *ts = NULL;
        struct ifnet_stat_increment_param s;
+       struct timespec start_time;
+
+       VERIFY(ifp->if_eflags & IFEF_RXPOLL);
 
-       snprintf(ifname, sizeof(ifname), "%s_poller",
-           if_name(ifp));
        bzero(&s, sizeof(s));
+       net_timerclear(&start_time);
 
        lck_mtx_lock_spin(&ifp->if_poll_lock);
+       if (__improbable(wres == THREAD_INTERRUPTED ||
+           ifp->if_poll_thread == THREAD_NULL)) {
+               goto terminate;
+       }
 
        inp = ifp->if_inp;
        VERIFY(inp != NULL);
 
+       ifp->if_poll_flags |= IF_POLLF_RUNNING;
+
+       /*
+        * Keep servicing until there are no more requests.
+        */
        for (;;) {
-               if (ifp->if_poll_thread != THREAD_NULL) {
-                       (void) msleep(&ifp->if_poll_thread, &ifp->if_poll_lock,
-                           (PZERO - 1) | PSPIN, ifname, ts);
-               }
+               struct mbuf *m_head, *m_tail;
+               u_int32_t m_lim, m_cnt, m_totlen;
+               u_int16_t req = ifp->if_poll_req;
 
-               /* interface is detached (maybe while asleep)? */
-               if (ifp->if_poll_thread == THREAD_NULL) {
-                       ifnet_set_poll_cycle(ifp, NULL);
-                       lck_mtx_unlock(&ifp->if_poll_lock);
+               m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
+                   MAX((qlimit(&inp->rcvq_pkts)), (ifp->if_rxpoll_phiwat << 2));
+               lck_mtx_unlock(&ifp->if_poll_lock);
 
-                       if (dlil_verbose) {
-                               printf("%s: poller thread terminated\n",
-                                   if_name(ifp));
-                       }
+               /*
+                * If no longer attached, there's nothing to do;
+                * else hold an IO refcnt to prevent the interface
+                * from being detached (will be released below.)
+                */
+               if (!ifnet_is_attached(ifp, 1)) {
+                       lck_mtx_lock_spin(&ifp->if_poll_lock);
+                       break;
+               }
 
-                       /* for the extra refcnt from kernel_thread_start() */
-                       thread_deallocate(current_thread());
-                       /* this is the end */
-                       thread_terminate(current_thread());
-                       /* NOTREACHED */
-                       return;
+               if (dlil_verbose > 1) {
+                       DLIL_PRINTF("%s: polling up to %d pkts, "
+                           "pkts avg %d max %d, wreq avg %d, "
+                           "bytes avg %d\n",
+                           if_name(ifp), m_lim,
+                           ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
+                           ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
                }
 
-               ifp->if_poll_active = 1;
-               for (;;) {
-                       struct mbuf *m_head, *m_tail;
-                       u_int32_t m_lim, m_cnt, m_totlen;
-                       u_int16_t req = ifp->if_poll_req;
+               /* invoke the driver's input poll routine */
+               ((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
+                   &m_cnt, &m_totlen));
 
-                       lck_mtx_unlock(&ifp->if_poll_lock);
+               if (m_head != NULL) {
+                       VERIFY(m_tail != NULL && m_cnt > 0);
 
-                       /*
-                        * If no longer attached, there's nothing to do;
-                        * else hold an IO refcnt to prevent the interface
-                        * from being detached (will be released below.)
-                        */
-                       if (!ifnet_is_attached(ifp, 1)) {
-                               lck_mtx_lock_spin(&ifp->if_poll_lock);
-                               break;
+                       if (dlil_verbose > 1) {
+                               DLIL_PRINTF("%s: polled %d pkts, "
+                                   "pkts avg %d max %d, wreq avg %d, "
+                                   "bytes avg %d\n",
+                                   if_name(ifp), m_cnt,
+                                   ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
+                                   ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
                        }
 
-                       m_lim = (inp->rxpoll_plim != 0) ? inp->rxpoll_plim :
-                           MAX((qlimit(&inp->rcvq_pkts)),
-                           (inp->rxpoll_phiwat << 2));
+                       /* stats are required for extended variant */
+                       s.packets_in = m_cnt;
+                       s.bytes_in = m_totlen;
 
+                       (void) ifnet_input_common(ifp, m_head, m_tail,
+                           &s, TRUE, TRUE);
+               } else {
                        if (dlil_verbose > 1) {
-                               printf("%s: polling up to %d pkts, "
+                               DLIL_PRINTF("%s: no packets, "
                                    "pkts avg %d max %d, wreq avg %d, "
                                    "bytes avg %d\n",
-                                   if_name(ifp), m_lim,
-                                   inp->rxpoll_pavg, inp->rxpoll_pmax,
-                                   inp->rxpoll_wavg, inp->rxpoll_bavg);
+                                   if_name(ifp), ifp->if_rxpoll_pavg,
+                                   ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
+                                   ifp->if_rxpoll_bavg);
                        }
 
-                       /* invoke the driver's input poll routine */
-                       ((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
-                       &m_cnt, &m_totlen));
-
-                       if (m_head != NULL) {
-                               VERIFY(m_tail != NULL && m_cnt > 0);
-
-                               if (dlil_verbose > 1) {
-                                       printf("%s: polled %d pkts, "
-                                           "pkts avg %d max %d, wreq avg %d, "
-                                           "bytes avg %d\n",
-                                           if_name(ifp), m_cnt,
-                                           inp->rxpoll_pavg, inp->rxpoll_pmax,
-                                           inp->rxpoll_wavg, inp->rxpoll_bavg);
-                               }
-
-                               /* stats are required for extended variant */
-                               s.packets_in = m_cnt;
-                               s.bytes_in = m_totlen;
+                       (void) ifnet_input_common(ifp, NULL, NULL,
+                           NULL, FALSE, TRUE);
+               }
 
-                               (void) ifnet_input_common(ifp, m_head, m_tail,
-                                   &s, TRUE, TRUE);
-                       } else {
-                               if (dlil_verbose > 1) {
-                                       printf("%s: no packets, "
-                                           "pkts avg %d max %d, wreq avg %d, "
-                                           "bytes avg %d\n",
-                                           if_name(ifp), inp->rxpoll_pavg,
-                                           inp->rxpoll_pmax, inp->rxpoll_wavg,
-                                           inp->rxpoll_bavg);
-                               }
+               /* Release the io ref count */
+               ifnet_decr_iorefcnt(ifp);
 
-                               (void) ifnet_input_common(ifp, NULL, NULL,
-                                   NULL, FALSE, TRUE);
-                       }
+               lck_mtx_lock_spin(&ifp->if_poll_lock);
 
-                       /* Release the io ref count */
-                       ifnet_decr_iorefcnt(ifp);
+               /* if there's no pending request, we're done */
+               if (req == ifp->if_poll_req ||
+                   ifp->if_poll_thread == THREAD_NULL) {
+                       break;
+               }
+       }
 
-                       lck_mtx_lock_spin(&ifp->if_poll_lock);
+       ifp->if_poll_req = 0;
+       ifp->if_poll_flags &= ~IF_POLLF_RUNNING;
 
-                       /* if there's no pending request, we're done */
-                       if (req == ifp->if_poll_req) {
-                               break;
-                       }
-               }
-               ifp->if_poll_req = 0;
-               ifp->if_poll_active = 0;
+       if (ifp->if_poll_thread != THREAD_NULL) {
+               uint64_t deadline = TIMEOUT_WAIT_FOREVER;
+               struct timespec *ts;
 
                /*
                 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
@@ -3165,9 +3420,39 @@ ifnet_poll_thread_fn(void *v, wait_result_t w)
                if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
                        ts = NULL;
                }
+
+               if (ts != NULL) {
+                       clock_interval_to_deadline((ts->tv_nsec +
+                           (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
+               }
+
+               (void) assert_wait_deadline(&ifp->if_poll_thread,
+                   THREAD_UNINT, deadline);
+               lck_mtx_unlock(&ifp->if_poll_lock);
+               (void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
+               /* NOTREACHED */
+       } else {
+terminate:
+               /* interface is detached (maybe while asleep)? */
+               ifnet_set_poll_cycle(ifp, NULL);
+               lck_mtx_unlock(&ifp->if_poll_lock);
+
+               if (dlil_verbose) {
+                       DLIL_PRINTF("%s: poller thread terminated\n",
+                           if_name(ifp));
+               }
+
+               /* for the extra refcnt from kernel_thread_start() */
+               thread_deallocate(current_thread());
+               /* this is the end */
+               thread_terminate(current_thread());
+               /* NOTREACHED */
        }
 
+       /* must never get here */
+       VERIFY(0);
        /* NOTREACHED */
+       __builtin_unreachable();
 }
 
 void
@@ -3180,7 +3465,7 @@ ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
        }
 
        if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
-               printf("%s: poll interval set to %lu nsec\n",
+               DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
                    if_name(ifp), ts->tv_nsec);
        }
 }
@@ -3203,8 +3488,10 @@ ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
        }
 
        if (IFCQ_TBR_IS_ENABLED(ifq)) {
-               struct tb_profile tb = { ifq->ifcq_tbr.tbr_rate_raw,
-                                        ifq->ifcq_tbr.tbr_percent, 0 };
+               struct tb_profile tb = {
+                       .rate = ifq->ifcq_tbr.tbr_rate_raw,
+                       .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
+               };
                (void) ifclassq_tbr_set(ifq, &tb, FALSE);
        }
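
The designated initializer adopted above makes the field mapping explicit;
the old positional form would silently misassign if struct tb_profile's
fields were ever reordered.  The same pattern in isolation (r and pct are
placeholders):

    struct tb_profile tb = { .rate = r, .percent = pct, .depth = 0 };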
 
@@ -3375,16 +3662,78 @@ ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
        }
 }
 
+/*
+ * This function clears the DSCP bits in the IPv4/IPv6 header pointed to
+ * by buf.  While buf need not be 32-bit aligned, the caller must ensure
+ * buf holds the full header.
+ */
+static __attribute__((noinline)) void
+ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
+{
+       struct ip *ip;
+       struct ip6_hdr *ip6;
+       uint8_t lbuf[64] __attribute__((aligned(8)));
+       uint8_t *p = buf;
+
+       if (ip_ver == IPVERSION) {
+               uint8_t old_tos;
+               uint32_t sum;
+
+               if (__improbable(!IP_HDR_ALIGNED_P(p))) {
+                       DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
+                       bcopy(buf, lbuf, sizeof(struct ip));
+                       p = lbuf;
+               }
+               ip = (struct ip *)(void *)p;
+               if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
+                       return;
+               }
+
+               DTRACE_IP1(clear__v4, struct ip *, ip);
+               old_tos = ip->ip_tos;
+               ip->ip_tos &= IPTOS_ECN_MASK;
+               sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
+               sum = (sum >> 16) + (sum & 0xffff);
+               ip->ip_sum = (uint16_t)(sum & 0xffff);
+
+               if (__improbable(p == lbuf)) {
+                       bcopy(lbuf, buf, sizeof(struct ip));
+               }
+       } else {
+               uint32_t flow;
+               ASSERT(ip_ver == IPV6_VERSION);
+
+               if (__improbable(!IP_HDR_ALIGNED_P(p))) {
+                       DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
+                       bcopy(buf, lbuf, sizeof(struct ip6_hdr));
+                       p = lbuf;
+               }
+               ip6 = (struct ip6_hdr *)(void *)p;
+               flow = ntohl(ip6->ip6_flow);
+               if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
+                       return;
+               }
+
+               DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
+               ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);
+
+               if (__improbable(p == lbuf)) {
+                       bcopy(lbuf, buf, sizeof(struct ip6_hdr));
+               }
+       }
+}
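
The IPv4 branch updates ip_sum incrementally instead of recomputing the
whole header checksum.  A distilled sketch (helper name hypothetical):
because DSCP bits are only ever cleared, htons(old_tos) >= htons(ip_tos)
afterward, so the 32-bit sum cannot underflow and a single carry fold
suffices (cf. RFC 1624).

    static uint16_t
    hypo_cksum_replace16(uint16_t cksum, uint16_t old_word, uint16_t new_word)
    {
            /* assumes new_word <= old_word, as in the DSCP-clearing path */
            uint32_t sum = cksum + old_word - new_word;

            sum = (sum >> 16) + (sum & 0xffff);     /* fold the carry */
            return (uint16_t)(sum & 0xffff);
    }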
+
 static inline errno_t
-ifnet_enqueue_common(struct ifnet *ifp, void *p, classq_pkt_type_t ptype,
-    boolean_t flush, boolean_t *pdrop)
+ifnet_enqueue_ifclassq(struct ifnet *ifp, classq_pkt_t *p, boolean_t flush,
+    boolean_t *pdrop)
 {
        volatile uint64_t *fg_ts = NULL;
        volatile uint64_t *rt_ts = NULL;
-       struct mbuf *m = p;
        struct timespec now;
        u_int64_t now_nsec = 0;
        int error = 0;
+       uint8_t *mcast_buf = NULL;
+       uint8_t ip_ver;
 
        ASSERT(ifp->if_eflags & IFEF_TXSTART);
 
@@ -3394,44 +3743,110 @@ ifnet_enqueue_common(struct ifnet *ifp, void *p, classq_pkt_type_t ptype,
         * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
         * the timestamp value is used internally there.
         */
-       switch (ptype) {
+       switch (p->cp_ptype) {
        case QP_MBUF:
-               ASSERT(m->m_flags & M_PKTHDR);
-               ASSERT(m->m_nextpkt == NULL);
+               ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
+               ASSERT(p->cp_mbuf->m_nextpkt == NULL);
 
-               if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
-                   m->m_pkthdr.pkt_timestamp == 0) {
+               if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
+                   p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
                        nanouptime(&now);
                        net_timernsec(&now, &now_nsec);
-                       m->m_pkthdr.pkt_timestamp = now_nsec;
+                       p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
                }
-               m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
+               p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
                /*
                 * If the packet service class is not background,
                 * update the timestamp to indicate recent activity
                 * on a foreground socket.
                 */
-               if ((m->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
-                   m->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
-                       if (!(m->m_pkthdr.pkt_flags & PKTF_SO_BACKGROUND)) {
+               if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
+                   p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
+                       if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
+                           PKTF_SO_BACKGROUND)) {
                                ifp->if_fg_sendts = _net_uptime;
                                if (fg_ts != NULL) {
                                        *fg_ts = _net_uptime;
                                }
                        }
-                       if (m->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
+                       if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
                                ifp->if_rt_sendts = _net_uptime;
                                if (rt_ts != NULL) {
                                        *rt_ts = _net_uptime;
                                }
                        }
                }
+
+               /*
+                * Some Wi-Fi AP implementations do not correctly handle
+                * multicast IP packets with DSCP bits set (radr://9331522).
+                * As a workaround, we clear the DSCP bits and set the service
+                * class to BE.
+                */
+               if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
+                   IFNET_IS_WIFI_INFRA(ifp)) {
+                       size_t len = mbuf_len(p->cp_mbuf), hlen;
+                       struct ether_header *eh;
+                       boolean_t pullup = FALSE;
+                       uint16_t etype;
+
+                       if (__improbable(len < sizeof(struct ether_header))) {
+                               DTRACE_IP1(small__ether, size_t, len);
+                               if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
+                                   sizeof(struct ether_header))) == NULL) {
+                                       return ENOMEM;
+                               }
+                       }
+                       eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
+                       etype = ntohs(eh->ether_type);
+                       if (etype == ETHERTYPE_IP) {
+                               hlen = sizeof(struct ether_header) +
+                                   sizeof(struct ip);
+                               if (len < hlen) {
+                                       DTRACE_IP1(small__v4, size_t, len);
+                                       pullup = TRUE;
+                               }
+                               ip_ver = IPVERSION;
+                       } else if (etype == ETHERTYPE_IPV6) {
+                               hlen = sizeof(struct ether_header) +
+                                   sizeof(struct ip6_hdr);
+                               if (len < hlen) {
+                                       DTRACE_IP1(small__v6, size_t, len);
+                                       pullup = TRUE;
+                               }
+                               ip_ver = IPV6_VERSION;
+                       } else {
+                               DTRACE_IP1(invalid__etype, uint16_t, etype);
+                               break;
+                       }
+                       if (pullup) {
+                               if ((p->cp_mbuf = m_pullup(p->cp_mbuf, hlen)) ==
+                                   NULL) {
+                                       return ENOMEM;
+                               }
+
+                               eh = (struct ether_header *)mbuf_data(
+                                       p->cp_mbuf);
+                       }
+                       mbuf_set_service_class(p->cp_mbuf, MBUF_SC_BE);
+                       mcast_buf = (uint8_t *)(eh + 1);
+                       /*
+                        * ifnet_mcast_clear_dscp() will finish the work below.
+                        * Note that the pullups above ensure that mcast_buf
+                        * points to a full IP header.
+                        */
+               }
                break;
 
 
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
+       }
+
+       if (mcast_buf != NULL) {
+               ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
        }
 
        if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
@@ -3474,7 +3889,8 @@ ifnet_enqueue_common(struct ifnet *ifp, void *p, classq_pkt_type_t ptype,
                                        ifp->if_start_delay_idle = 0;
                                } else {
                                        if (ifp->if_start_delay_idle >= 10) {
-                                               ifp->if_eflags &= ~(IFEF_DELAY_START);
+                                               ifp->if_eflags &=
+                                                   ~(IFEF_DELAY_START);
                                                ifnet_delay_start_disabled++;
                                        } else {
                                                ifp->if_start_delay_idle++;
@@ -3493,17 +3909,8 @@ ifnet_enqueue_common(struct ifnet *ifp, void *p, classq_pkt_type_t ptype,
                ifp->if_eflags &= ~(IFEF_DELAY_START);
        }
 
-       switch (ptype) {
-       case QP_MBUF:
-               /* enqueue the packet (caller consumes object) */
-               error = ifclassq_enqueue(&ifp->if_snd, m, QP_MBUF, pdrop);
-               m = NULL;
-               break;
-
-
-       default:
-               break;
-       }
+       /* enqueue the packet (caller consumes object) */
+       error = ifclassq_enqueue(&ifp->if_snd, p, pdrop);
 
        /*
         * Tell the driver to start dequeueing; do this even when the queue
@@ -3515,7 +3922,36 @@ ifnet_enqueue_common(struct ifnet *ifp, void *p, classq_pkt_type_t ptype,
                ifnet_start(ifp);
        }
 
-       return error;
+       return error;
+}
+
+int
+ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
+{
+       struct ifnet *ifp = handle;
+       boolean_t pdrop;        /* dummy */
+       uint32_t i;
+
+       ASSERT(n_pkts >= 1);
+       for (i = 0; i < n_pkts - 1; i++) {
+               (void) ifnet_enqueue_ifclassq(ifp, &pkts[i].pktsched_pkt,
+                   FALSE, &pdrop);
+       }
+       /* flush with the last packet */
+       (void) ifnet_enqueue_ifclassq(ifp, &pkts[i].pktsched_pkt, TRUE, &pdrop);
+
+       return 0;
+}
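
ifnet_enqueue_netem() follows a flush-on-last-packet convention: every
packet but the final one is enqueued with flush == FALSE, and only the last
call requests a flush, so a burst typically kicks the starter thread once
rather than once per packet.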
+
+static inline errno_t
+ifnet_enqueue_common(struct ifnet *ifp, classq_pkt_t *pkt, boolean_t flush,
+    boolean_t *pdrop)
+{
+       if (ifp->if_output_netem != NULL) {
+               return netem_enqueue(ifp->if_output_netem, pkt, pdrop);
+       } else {
+               return ifnet_enqueue_ifclassq(ifp, pkt, flush, pdrop);
+       }
 }
 
 errno_t
@@ -3529,6 +3965,8 @@ errno_t
 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
     boolean_t *pdrop)
 {
+       classq_pkt_t pkt;
+
        if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
            m->m_nextpkt != NULL) {
                if (m != NULL) {
@@ -3548,7 +3986,8 @@ ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
                return ENETDOWN;
        }
 
-       return ifnet_enqueue_common(ifp, m, QP_MBUF, flush, pdrop);
+       CLASSQ_PKT_INIT_MBUF(&pkt, m);
+       return ifnet_enqueue_common(ifp, &pkt, flush, pdrop);
 }
 
 
@@ -3556,7 +3995,8 @@ errno_t
 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
 {
        errno_t rc;
-       classq_pkt_type_t ptype;
+       classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
+
        if (ifp == NULL || mp == NULL) {
                return EINVAL;
        } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
@@ -3568,10 +4008,10 @@ ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
        }
 
        rc = ifclassq_dequeue(&ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
-           (void **)mp, NULL, NULL, NULL, &ptype);
-       VERIFY((*mp == NULL) || (ptype == QP_MBUF));
+           &pkt, NULL, NULL, NULL);
+       VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
        ifnet_decr_iorefcnt(ifp);
-
+       *mp = pkt.cp_mbuf;
        return rc;
 }
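
The classq_pkt_t plumbing stays hidden from KPI callers; a hedged sketch of
the consumer side (hypo_hw_transmit stands in for a real hardware path):

    static void hypo_hw_transmit(struct ifnet *, struct mbuf *);

    static void
    hypo_driver_start(struct ifnet *ifp)
    {
            struct mbuf *m;

            /* drain until the queue is empty (m comes back NULL) */
            while (ifnet_dequeue(ifp, &m) == 0 && m != NULL) {
                    hypo_hw_transmit(ifp, m);
            }
    }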
 
@@ -3580,7 +4020,8 @@ ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
     struct mbuf **mp)
 {
        errno_t rc;
-       classq_pkt_type_t ptype;
+       classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
+
        if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
                return EINVAL;
        } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
@@ -3592,10 +4033,10 @@ ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
        }
 
        rc = ifclassq_dequeue_sc(&ifp->if_snd, sc, 1,
-           CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, (void **)mp, NULL, NULL,
-           NULL, &ptype);
-       VERIFY((*mp == NULL) || (ptype == QP_MBUF));
+           CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL);
+       VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
        ifnet_decr_iorefcnt(ifp);
+       *mp = pkt.cp_mbuf;
        return rc;
 }
 
@@ -3604,7 +4045,9 @@ ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
 {
        errno_t rc;
-       classq_pkt_type_t ptype;
+       classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
+       classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
+
        if (ifp == NULL || head == NULL || pkt_limit < 1) {
                return EINVAL;
        } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
@@ -3616,10 +4059,13 @@ ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
        }
 
        rc = ifclassq_dequeue(&ifp->if_snd, pkt_limit,
-           CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, (void **)head, (void **)tail, cnt,
-           len, &ptype);
-       VERIFY((*head == NULL) || (ptype == QP_MBUF));
+           CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len);
+       VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
        ifnet_decr_iorefcnt(ifp);
+       *head = pkt_head.cp_mbuf;
+       if (tail != NULL) {
+               *tail = pkt_tail.cp_mbuf;
+       }
        return rc;
 }
 
@@ -3628,7 +4074,9 @@ ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
 {
        errno_t rc;
-       classq_pkt_type_t ptype;
+       classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
+       classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
+
        if (ifp == NULL || head == NULL || byte_limit < 1) {
                return EINVAL;
        } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
@@ -3640,9 +4088,13 @@ ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
        }
 
        rc = ifclassq_dequeue(&ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
-           byte_limit, (void **)head, (void **)tail, cnt, len, &ptype);
-       VERIFY((*head == NULL) || (ptype == QP_MBUF));
+           byte_limit, &pkt_head, &pkt_tail, cnt, len);
+       VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
        ifnet_decr_iorefcnt(ifp);
+       *head = pkt_head.cp_mbuf;
+       if (tail != NULL) {
+               *tail = pkt_tail.cp_mbuf;
+       }
        return rc;
 }
 
@@ -3652,7 +4104,9 @@ ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
     u_int32_t *len)
 {
        errno_t rc;
-       classq_pkt_type_t ptype;
+       classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
+       classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
+
        if (ifp == NULL || head == NULL || pkt_limit < 1 ||
            !MBUF_VALID_SC(sc)) {
                return EINVAL;
@@ -3665,10 +4119,14 @@ ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
        }
 
        rc = ifclassq_dequeue_sc(&ifp->if_snd, sc, pkt_limit,
-           CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, (void **)head,
-           (void **)tail, cnt, len, &ptype);
-       VERIFY((*head == NULL) || (ptype == QP_MBUF));
+           CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
+           cnt, len);
+       VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
        ifnet_decr_iorefcnt(ifp);
+       *head = pkt_head.cp_mbuf;
+       if (tail != NULL) {
+               *tail = pkt_tail.cp_mbuf;
+       }
        return rc;
 }
 
@@ -3689,11 +4147,30 @@ ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
 }
 #endif /* !CONFIG_EMBEDDED */
 
+static boolean_t
+packet_has_vlan_tag(struct mbuf *m)
+{
+       u_int   tag = 0;
+
+       if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
+               tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
+               if (tag == 0) {
+                       /* the packet is just priority-tagged, clear the bit */
+                       m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
+               }
+       }
+       return tag != 0;
+}
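
For reference, EVL_VLANOFTAG() extracts the 12-bit VLAN ID from the stored
802.1Q tag.  An ID of zero means the frame is merely priority-tagged
(802.1p), so packet_has_vlan_tag() clears CSUM_VLAN_TAG_VALID and reports
it as untagged; only frames on a real VLAN are excluded from external
filters below.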
+
 static int
 dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
     char **frame_header_p, protocol_family_t protocol_family)
 {
-       struct ifnet_filter *filter;
+       boolean_t               is_vlan_packet;
+       struct ifnet_filter     *filter;
+       struct mbuf             *m = *m_p;
+
+       is_vlan_packet = packet_has_vlan_tag(m);
 
        /*
         * Pass the inbound packet to the interface filters
@@ -3704,6 +4181,12 @@ dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
        TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
                int result;
 
+               /* exclude VLAN packets from external filters PR-3586856 */
+               if (is_vlan_packet &&
+                   (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
+                       continue;
+               }
+
                if (!filter->filt_skip && filter->filt_input != NULL &&
                    (filter->filt_protocol == 0 ||
                    filter->filt_protocol == protocol_family)) {
@@ -3740,7 +4223,11 @@ static int
 dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
     protocol_family_t protocol_family)
 {
-       struct ifnet_filter *filter;
+       boolean_t               is_vlan_packet;
+       struct ifnet_filter     *filter;
+       struct mbuf             *m = *m_p;
+
+       is_vlan_packet = packet_has_vlan_tag(m);
 
        /*
         * Pass the outbound packet to the interface filters
@@ -3751,6 +4238,12 @@ dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
        TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
                int result;
 
+               /* exclude VLAN packets from external filters PR-3586856 */
+               if (is_vlan_packet &&
+                   (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
+                       continue;
+               }
+
                if (!filter->filt_skip && filter->filt_output != NULL &&
                    (filter->filt_protocol == 0 ||
                    filter->filt_protocol == protocol_family)) {
@@ -3809,7 +4302,7 @@ dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
 
 static void
 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
-    struct dlil_threading_info *inp, boolean_t poll)
+    struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
 {
        struct ifnet_stat_increment_param *d = &inp->stats;
 
@@ -3841,11 +4334,11 @@ dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
        }
 
        if (poll) {
-               PKTCNTR_ADD(&inp->tstats, s->packets_in, s->bytes_in);
+               PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
        }
 }
 
-static void
+static boolean_t
 dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
 {
        struct ifnet_stat_increment_param *s = &inp->stats;
@@ -3889,23 +4382,20 @@ dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
                s->dropped = 0;
        }
 
-       if (ifp->if_data_threshold != 0) {
-               lck_mtx_convert_spin(&inp->input_lck);
-               ifnet_notify_data_threshold(ifp);
-       }
-
        /*
         * No need for atomic operations as they are modified here
         * only from within the DLIL input thread context.
         */
-       if (inp->tstats.packets != 0) {
-               inp->pstats.ifi_poll_packets += inp->tstats.packets;
-               inp->tstats.packets = 0;
+       if (ifp->if_poll_tstats.packets != 0) {
+               ifp->if_poll_pstats.ifi_poll_packets +=
+                   ifp->if_poll_tstats.packets;
+               ifp->if_poll_tstats.packets = 0;
        }
-       if (inp->tstats.bytes != 0) {
-               inp->pstats.ifi_poll_bytes += inp->tstats.bytes;
-               inp->tstats.bytes = 0;
+       if (ifp->if_poll_tstats.bytes != 0) {
+               ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
+               ifp->if_poll_tstats.bytes = 0;
        }
+
+       return ifp->if_data_threshold != 0;
 }
 
 __private_extern__ void
@@ -3952,7 +4442,8 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
                        ifp = m->m_pkthdr.rcvif;
                }
 
-               if ((ifp->if_eflags & IFEF_RXPOLL) && poll_thresh != 0 &&
+               if ((ifp->if_eflags & IFEF_RXPOLL) &&
+                   (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
                    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
                        ifnet_poll(ifp);
                }
@@ -3971,7 +4462,7 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
                 * away, so optimize for that.
                 */
                if (ifp != lo_ifp) {
-                       if (!ifnet_is_attached(ifp, 1)) {
+                       if (!ifnet_datamov_begin(ifp)) {
                                m_freem(m);
                                goto next;
                        }
@@ -4076,7 +4567,6 @@ skip_clat:
                        dlil_input_cksum_dbg(ifp, m, frame_header,
                            protocol_family);
                }
-
                /*
                 * For partial checksum offload, we expect the driver to
                 * set the start offset indicating the start of the span
@@ -4084,11 +4574,14 @@ skip_clat:
                 * adjust this start offset accordingly because the data
                 * pointer has been advanced beyond the link-layer header.
                 *
-                * Don't adjust if the interface is a bridge member, as
-                * the adjustment will occur from the context of the
-                * bridge interface during input.
+                * Virtual LAN interfaces (bridge, vlan, bond) can call
+                * dlil_input_packet_list() again with the same packet while
+                * its checksum flags are still set.  Set a flag indicating
+                * that the adjustment has already been done.
                 */
-               if (ifp->if_bridge == NULL && (m->m_pkthdr.csum_flags &
+               if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
+                       /* adjustment has already been done */
+               } else if ((m->m_pkthdr.csum_flags &
                    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
                    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
                        int adj;
@@ -4103,8 +4596,9 @@ skip_clat:
                        } else {
                                m->m_pkthdr.csum_rx_start -= adj;
                        }
+                       /* make sure we don't adjust more than once */
+                       m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
                }
-
                if (clat_debug) {
                        pktap_input(ifp, protocol_family, m, frame_header);
                }
@@ -4113,18 +4607,16 @@ skip_clat:
                        atomic_add_64(&ifp->if_imcasts, 1);
                }
 
-               /* run interface filters, exclude VLAN packets PR-3586856 */
-               if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) {
-                       error = dlil_interface_filters_input(ifp, &m,
-                           &frame_header, protocol_family);
-                       if (error != 0) {
-                               if (error != EJUSTRETURN) {
-                                       m_freem(m);
-                               }
-                               goto next;
+               /* run interface filters */
+               error = dlil_interface_filters_input(ifp, &m,
+                   &frame_header, protocol_family);
+               if (error != 0) {
+                       if (error != EJUSTRETURN) {
+                               m_freem(m);
                        }
+                       goto next;
                }
-               if (error != 0 || ((m->m_flags & M_PROMISC) != 0)) {
+               if ((m->m_flags & M_PROMISC) != 0) {
                        m_freem(m);
                        goto next;
                }
@@ -4187,7 +4679,7 @@ next:
                        ifp->if_updatemcasts = 0;
                }
                if (iorefcnt == 1) {
-                       ifnet_decr_iorefcnt(ifp);
+                       ifnet_datamov_end(ifp);
                }
        }
 
@@ -4203,7 +4695,7 @@ if_mcasts_update(struct ifnet *ifp)
        if (err == EAFNOSUPPORT) {
                err = 0;
        }
-       printf("%s: %s %d suspended link-layer multicast membership(s) "
+       DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
            "(err=%d)\n", if_name(ifp),
            (err == 0 ? "successfully restored" : "failed to restore"),
            ifp->if_updatemcasts, err);
@@ -4492,7 +4984,9 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
        u_int64_t now_nsec;
        boolean_t did_clat46 = FALSE;
        protocol_family_t old_proto_family = proto_family;
+       struct sockaddr_in6 dest6;
        struct rtentry *rt = NULL;
+       u_int32_t m_loop_set = 0;
 
        KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
 
@@ -4500,7 +4994,7 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
         * Get an io refcnt if the interface is attached to prevent ifnet_detach
         * from happening while this operation is in progress
         */
-       if (!ifnet_is_attached(ifp, 1)) {
+       if (!ifnet_datamov_begin(ifp)) {
                retval = ENXIO;
                goto cleanup;
        }
@@ -4564,7 +5058,6 @@ preout_again:
                 * performed address family translation.
                 */
                if (!did_clat46 && proto_family == PF_INET6) {
-                       struct sockaddr_in6 dest6;
                        did_clat46 = TRUE;
 
                        if (proto != NULL) {
@@ -4700,7 +5193,7 @@ preout_again:
                                m->m_pkthdr.rcvif = ifp;
                                rcvif_set = 1;
                        }
-
+                       m_loop_set = m->m_flags & M_LOOP;
                        retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
                            frame_type, &pre, &post);
                        if (retval != 0) {
@@ -4743,16 +5236,12 @@ preout_again:
                /*
                 * Let interface filters (if any) do their thing ...
                 */
-               /* Do not pass VLAN tagged packets to filters PR-3586856 */
-               if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) {
-                       retval = dlil_interface_filters_output(ifp,
-                           &m, proto_family);
-                       if (retval != 0) {
-                               if (retval != EJUSTRETURN) {
-                                       m_freem(m);
-                               }
-                               goto next;
+               retval = dlil_interface_filters_output(ifp, &m, proto_family);
+               if (retval != 0) {
+                       if (retval != EJUSTRETURN) {
+                               m_freem(m);
                        }
+                       goto next;
                }
                /*
                 * Strip away M_PROTO1 bit prior to sending packet
@@ -4850,7 +5339,7 @@ preout_again:
                                fpkts++;
                        }
                        if (retval != 0 && dlil_verbose) {
-                               printf("%s: output error on %s retval = %d\n",
+                               DLIL_PRINTF("%s: output error on %s retval = %d\n",
                                    __func__, if_name(ifp),
                                    retval);
                        }
@@ -4862,6 +5351,7 @@ preout_again:
 next:
                m = packetlist;
                if (m != NULL) {
+                       m->m_flags |= m_loop_set;
                        packetlist = packetlist->m_nextpkt;
                        m->m_nextpkt = NULL;
                }
@@ -4889,7 +5379,7 @@ next:
                                fpkts++;
                        }
                        if (retval != 0 && dlil_verbose) {
-                               printf("%s: output error on %s retval = %d\n",
+                               DLIL_PRINTF("%s: output error on %s retval = %d\n",
                                    __func__, if_name(ifp), retval);
                        }
                } else {
@@ -4916,7 +5406,7 @@ next:
                                        }
                                }
                                if (retval != 0 && dlil_verbose) {
-                                       printf("%s: output error on %s "
+                                       DLIL_PRINTF("%s: output error on %s "
                                            "retval = %d\n",
                                            __func__, if_name(ifp), retval);
                                }
@@ -4948,7 +5438,7 @@ cleanup:
                retval = 0;
        }
        if (iorefcnt == 1) {
-               ifnet_decr_iorefcnt(ifp);
+               ifnet_datamov_end(ifp);
        }
        if (rt != NULL) {
                rtfree(rt);
@@ -5593,9 +6083,10 @@ static __inline__ int
 _is_announcement(const struct sockaddr_in * sender_sin,
     const struct sockaddr_in * target_sin)
 {
-       if (sender_sin == NULL) {
+       if (target_sin == NULL || sender_sin == NULL) {
                return FALSE;
        }
+
        return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
 }
 
@@ -5610,8 +6101,11 @@ dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
        struct sockaddr_inarp target_proto_sinarp;
        struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;
 
-       if (target_proto == NULL || (sender_proto != NULL &&
-           sender_proto->sa_family != target_proto->sa_family)) {
+       if (target_proto == NULL || sender_proto == NULL) {
+               return EINVAL;
+       }
+
+       if (sender_proto->sa_family != target_proto->sa_family) {
                return EINVAL;
        }
 
@@ -5637,7 +6131,7 @@ dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
        if (target_proto->sa_family == AF_INET &&
            IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
            ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
-           !_is_announcement(target_sin, sender_sin)) {
+           !_is_announcement(sender_sin, target_sin)) {
                ifnet_t         *ifp_list;
                u_int32_t       count;
                u_int32_t       ifp_on;
@@ -5749,10 +6243,30 @@ ifnet_is_attached(struct ifnet *ifp, int refio)
        return ret;
 }
 
+void
+ifnet_incr_pending_thread_count(struct ifnet *ifp)
+{
+       lck_mtx_lock_spin(&ifp->if_ref_lock);
+       ifp->if_threads_pending++;
+       lck_mtx_unlock(&ifp->if_ref_lock);
+}
+
+void
+ifnet_decr_pending_thread_count(struct ifnet *ifp)
+{
+       lck_mtx_lock_spin(&ifp->if_ref_lock);
+       VERIFY(ifp->if_threads_pending > 0);
+       ifp->if_threads_pending--;
+       if (ifp->if_threads_pending == 0) {
+               wakeup(&ifp->if_threads_pending);
+       }
+       lck_mtx_unlock(&ifp->if_ref_lock);
+}
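These counters pair with thread creation in ifnet_attach(): the attach path bumps if_threads_pending before each kernel_thread_start(), and each newly created thread drops the count once it has actually been scheduled, unblocking the attach path. A minimal sketch of the intended pairing (the thread function and its body are hypothetical; the real start/poll/input thread bodies live elsewhere in dlil.c):

/* Hypothetical thread body, for illustration only. */
static void
example_ifnet_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
        struct ifnet *ifp = v;

        /*
         * Once scheduled, let ifnet_attach() -- which sleeps on
         * if_threads_pending -- make progress.
         */
        ifnet_decr_pending_thread_count(ifp);
        for (;;) {
                /* ... wait for and process work ... */
        }
}

/* Creation side, mirroring what ifnet_attach() does: */
thread_t thread;
ifnet_incr_pending_thread_count(ifp);
if (kernel_thread_start(example_ifnet_thread_func, ifp,
    &thread) != KERN_SUCCESS) {
        panic_plain("couldn't start thread");
}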
+
 /*
  * Caller must ensure the interface is attached; the assumption is that
  * there is at least an outstanding IO reference count held already.
- * Most callers would call ifnet_is_attached() instead.
+ * Most callers would call ifnet_is_{attached,data_ready}() instead.
  */
 void
 ifnet_incr_iorefcnt(struct ifnet *ifp)
@@ -5764,13 +6278,17 @@ ifnet_incr_iorefcnt(struct ifnet *ifp)
        lck_mtx_unlock(&ifp->if_ref_lock);
 }
 
-void
-ifnet_decr_iorefcnt(struct ifnet *ifp)
+__attribute__((always_inline))
+static void
+ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
 {
-       lck_mtx_lock_spin(&ifp->if_ref_lock);
+       LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
+
        VERIFY(ifp->if_refio > 0);
        VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
+
        ifp->if_refio--;
+       VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);
 
        /*
         * if there are no more outstanding io references, wakeup the
@@ -5779,7 +6297,95 @@ ifnet_decr_iorefcnt(struct ifnet *ifp)
        if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
                wakeup(&(ifp->if_refio));
        }
+}
+
+void
+ifnet_decr_iorefcnt(struct ifnet *ifp)
+{
+       lck_mtx_lock_spin(&ifp->if_ref_lock);
+       ifnet_decr_iorefcnt_locked(ifp);
+       lck_mtx_unlock(&ifp->if_ref_lock);
+}
+
+boolean_t
+ifnet_datamov_begin(struct ifnet *ifp)
+{
+       boolean_t ret;
+
+       lck_mtx_lock_spin(&ifp->if_ref_lock);
+       if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
+               ifp->if_refio++;
+               ifp->if_datamov++;
+       }
+       lck_mtx_unlock(&ifp->if_ref_lock);
+
+       return ret;
+}
+
+void
+ifnet_datamov_end(struct ifnet *ifp)
+{
+       lck_mtx_lock_spin(&ifp->if_ref_lock);
+       VERIFY(ifp->if_datamov > 0);
+       /*
+        * If there are no more threads moving data, wake up any
+        * drainers blocked waiting for this.
+        */
+       if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
+               wakeup(&(ifp->if_datamov));
+       }
+       ifnet_decr_iorefcnt_locked(ifp);
+       lck_mtx_unlock(&ifp->if_ref_lock);
+}
+
+void
+ifnet_datamov_suspend(struct ifnet *ifp)
+{
+       lck_mtx_lock_spin(&ifp->if_ref_lock);
+       VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
+       ifp->if_refio++;
+       if (ifp->if_suspend++ == 0) {
+               VERIFY(ifp->if_refflags & IFRF_READY);
+               ifp->if_refflags &= ~IFRF_READY;
+       }
+       lck_mtx_unlock(&ifp->if_ref_lock);
+}
+
+void
+ifnet_datamov_drain(struct ifnet *ifp)
+{
+       lck_mtx_lock(&ifp->if_ref_lock);
+       VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
+       /* data movement must already be suspended */
+       VERIFY(ifp->if_suspend > 0);
+       VERIFY(!(ifp->if_refflags & IFRF_READY));
+       ifp->if_drainers++;
+       while (ifp->if_datamov != 0) {
+               (void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
+                   (PZERO - 1), __func__, NULL);
+       }
+       VERIFY(!(ifp->if_refflags & IFRF_READY));
+       VERIFY(ifp->if_drainers > 0);
+       ifp->if_drainers--;
+       lck_mtx_unlock(&ifp->if_ref_lock);
+
+       /* purge the interface queues */
+       if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
+               if_qflush(ifp, 0);
+       }
+}
 
+void
+ifnet_datamov_resume(struct ifnet *ifp)
+{
+       lck_mtx_lock(&ifp->if_ref_lock);
+       /* data movement must already be suspended */
+       VERIFY(ifp->if_suspend > 0);
+       if (--ifp->if_suspend == 0) {
+               VERIFY(!(ifp->if_refflags & IFRF_READY));
+               ifp->if_refflags |= IFRF_READY;
+       }
+       ifnet_decr_iorefcnt_locked(ifp);
        lck_mtx_unlock(&ifp->if_ref_lock);
 }
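Together these routines form a small quiesce protocol over if_ref_lock: data paths bracket packet movement with ifnet_datamov_begin()/ifnet_datamov_end(), while a control path that must reconfigure the interface suspends new entries, drains those in flight, and then resumes. A minimal sketch of both sides (the reconfiguration step is a placeholder):

/* Data path: */
if (!ifnet_datamov_begin(ifp)) {
        return ENXIO;           /* not fully attached and ready */
}
/* ... move packets ... */
ifnet_datamov_end(ifp);

/* Control path: */
ifnet_datamov_suspend(ifp);     /* clears IFRF_READY; new begins fail */
ifnet_datamov_drain(ifp);       /* sleeps until if_datamov reaches 0 */
/* ... reconfigure the interface safely ... */
ifnet_datamov_resume(ifp);      /* restores IFRF_READY, drops ioref */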
 
@@ -5976,13 +6582,13 @@ ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
            &proto_count);
 
 end:
-       if (retval != 0 && retval != EEXIST && ifp != NULL) {
+       if (retval != 0 && retval != EEXIST) {
                DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
-                   if_name(ifp), protocol, retval);
+                   ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
        } else {
                if (dlil_verbose) {
-                       printf("%s: attached v1 protocol %d (count = %d)\n",
-                           if_name(ifp),
+                       DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
+                           ifp != NULL ? if_name(ifp) : "N/A",
                            protocol, proto_count);
                }
        }
@@ -6045,13 +6651,13 @@ ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
            &proto_count);
 
 end:
-       if (retval != 0 && retval != EEXIST && ifp != NULL) {
+       if (retval != 0 && retval != EEXIST) {
                DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
-                   if_name(ifp), protocol, retval);
+                   ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
        } else {
                if (dlil_verbose) {
-                       printf("%s: attached v2 protocol %d (count = %d)\n",
-                           if_name(ifp),
+                       DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
+                           ifp != NULL ? if_name(ifp) : "N/A",
                            protocol, proto_count);
                }
        }
@@ -6118,7 +6724,7 @@ ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
        ifnet_lock_done(ifp);
 
        if (dlil_verbose) {
-               printf("%s: detached %s protocol %d\n", if_name(ifp),
+               DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
                    (proto->proto_kpi == kProtoKPI_v1) ?
                    "v1" : "v2", proto_family);
        }
@@ -6243,6 +6849,7 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
        /* Sanity check */
        VERIFY(ifp->if_detaching_link.tqe_next == NULL);
        VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
+       VERIFY(ifp->if_threads_pending == 0);
 
        if (ll_addr != NULL) {
                if (ifp->if_addrlen == 0) {
@@ -6396,23 +7003,19 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
        VERIFY(dl_inp->wloop_thr == THREAD_NULL);
        VERIFY(dl_inp->poll_thr == THREAD_NULL);
        VERIFY(dl_inp->tag == 0);
-       VERIFY(dl_inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
-       bzero(&dl_inp->tstats, sizeof(dl_inp->tstats));
-       bzero(&dl_inp->pstats, sizeof(dl_inp->pstats));
-       bzero(&dl_inp->sstats, sizeof(dl_inp->sstats));
+
 #if IFNET_INPUT_SANITY_CHK
        VERIFY(dl_inp->input_mbuf_cnt == 0);
 #endif /* IFNET_INPUT_SANITY_CHK */
 
+       VERIFY(ifp->if_poll_thread == THREAD_NULL);
+       dlil_reset_rxpoll_params(ifp);
        /*
-        * A specific DLIL input thread is created per Ethernet/cellular
-        * interface or for an interface which supports opportunistic
-        * input polling.  Pseudo interfaces or other types of interfaces
-        * use the main input thread instead.
+        * A specific DLIL input thread is created per non-loopback interface.
         */
-       if ((net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) ||
-           ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR) {
+       if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
                ifp->if_inp = dl_inp;
+               ifnet_incr_pending_thread_count(ifp);
                err = dlil_create_input_thread(ifp, ifp->if_inp);
                if (err != 0) {
                        panic_plain("%s: ifp=%p couldn't get an input thread; "
@@ -6420,13 +7023,6 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
                        /* NOTREACHED */
                }
        }
-
-       if (ifp->if_inp != NULL && ifp->if_inp->input_mit_tcall == NULL) {
-               ifp->if_inp->input_mit_tcall =
-                   thread_call_allocate_with_priority(dlil_mit_tcall_fn,
-                   ifp, THREAD_CALL_PRIORITY_KERNEL);
-       }
-
        /*
         * If the driver supports the new transmit model, calculate flow hash
         * and create a workloop starter thread to invoke the if_start callback
@@ -6442,7 +7038,8 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
                ifp->if_start_req = 0;
                ifp->if_start_flags = 0;
                VERIFY(ifp->if_start != NULL);
-               if ((err = kernel_thread_start(ifnet_start_thread_fn,
+               ifnet_incr_pending_thread_count(ifp);
+               if ((err = kernel_thread_start(ifnet_start_thread_func,
                    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
                        panic_plain("%s: "
                            "ifp=%p couldn't get a start thread; "
@@ -6455,21 +7052,25 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
                ifp->if_flowhash = 0;
        }
 
+       /* Reset polling parameters */
+       ifnet_set_poll_cycle(ifp, NULL);
+       ifp->if_poll_update = 0;
+       ifp->if_poll_flags = 0;
+       ifp->if_poll_req = 0;
+       VERIFY(ifp->if_poll_thread == THREAD_NULL);
+
        /*
         * If the driver supports the new receive model, create a poller
         * thread to invoke if_input_poll callback where the packets may
         * be dequeued from the driver and processed for reception.
+        * If the interface is netif-compat, the poller thread is managed by netif.
         */
-       if (ifp->if_eflags & IFEF_RXPOLL) {
+       if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL) &&
+           (ifp->if_xflags & IFXF_LEGACY)) {
                VERIFY(ifp->if_input_poll != NULL);
                VERIFY(ifp->if_input_ctl != NULL);
-               VERIFY(ifp->if_poll_thread == THREAD_NULL);
-
-               ifnet_set_poll_cycle(ifp, NULL);
-               ifp->if_poll_update = 0;
-               ifp->if_poll_active = 0;
-               ifp->if_poll_req = 0;
-               if ((err = kernel_thread_start(ifnet_poll_thread_fn, ifp,
+               ifnet_incr_pending_thread_count(ifp);
+               if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
                    &ifp->if_poll_thread)) != KERN_SUCCESS) {
                        panic_plain("%s: ifp=%p couldn't get a poll thread; "
                            "err=%d", __func__, ifp, err);
@@ -6498,7 +7099,7 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
                        IFMA_UNLOCK(ifma);
                }
 
-               printf("%s: attached with %d suspended link-layer multicast "
+               DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
                    "membership(s)\n", if_name(ifp),
                    ifp->if_updatemcasts);
        }
@@ -6515,6 +7116,7 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
        VERIFY(ifp->if_delegated.family == 0);
        VERIFY(ifp->if_delegated.subfamily == 0);
        VERIFY(ifp->if_delegated.expensive == 0);
+       VERIFY(ifp->if_delegated.constrained == 0);
 
        VERIFY(ifp->if_agentids == NULL);
        VERIFY(ifp->if_agentcount == 0);
@@ -6553,12 +7155,12 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
                error = if_set_qosmarking_mode(ifp,
                    IFRTYPE_QOSMARKING_FASTLANE);
                if (error != 0) {
-                       printf("%s if_set_qosmarking_mode(%s) error %d\n",
+                       DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
                            __func__, ifp->if_xname, error);
                } else {
                        ifp->if_eflags |= IFEF_QOSMARKING_ENABLED;
 #if (DEVELOPMENT || DEBUG)
-                       printf("%s fastlane enabled on %s\n",
+                       DLIL_PRINTF("%s fastlane enabled on %s\n",
                            __func__, ifp->if_xname);
 #endif /* (DEVELOPMENT || DEBUG) */
                }
@@ -6614,12 +7216,28 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
        VERIFY(ifp->if_dt_tcall != NULL);
 
        /*
-        * Finally, mark this ifnet as attached.
+        * Wait for the created kernel threads for I/O to get
+        * scheduled and run at least once before we proceed
+        * to mark the interface as attached.
         */
+       lck_mtx_lock(&ifp->if_ref_lock);
+       while (ifp->if_threads_pending != 0) {
+               DLIL_PRINTF("%s: Waiting for all kernel threads created for "
+                   "interface %s to get scheduled at least once.\n",
+                   __func__, ifp->if_xname);
+               (void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
+                   __func__, NULL);
+               LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
+       }
+       lck_mtx_unlock(&ifp->if_ref_lock);
+       DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
+           "at least once. Proceeding.\n", __func__, ifp->if_xname);
+
+       /* Finally, mark this ifnet as attached. */
        lck_mtx_lock(rnh_lock);
        ifnet_lock_exclusive(ifp);
        lck_mtx_lock_spin(&ifp->if_ref_lock);
-       ifp->if_refflags = IFRF_ATTACHED;       /* clears embryonic */
+       ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
        lck_mtx_unlock(&ifp->if_ref_lock);
        if (net_rtref) {
                /* boot-args override; enable idle notification */
@@ -6644,7 +7262,7 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
        dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0);
 
        if (dlil_verbose) {
-               printf("%s: attached%s\n", if_name(ifp),
+               DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
                    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
        }
 
@@ -6802,6 +7420,11 @@ ifnet_detach(ifnet_t ifp)
        ifnet_head_lock_exclusive();
        ifnet_lock_exclusive(ifp);
 
+       if (ifp->if_output_netem != NULL) {
+               netem_destroy(ifp->if_output_netem);
+               ifp->if_output_netem = NULL;
+       }
+
        /*
         * Check to see if this interface has previously triggered
         * aggressive protocol draining; if so, decrement the global
@@ -6832,7 +7455,7 @@ ifnet_detach(ifnet_t ifp)
        lck_mtx_unlock(&ifp->if_ref_lock);
 
        if (dlil_verbose) {
-               printf("%s: detaching\n", if_name(ifp));
+               DLIL_PRINTF("%s: detaching\n", if_name(ifp));
        }
 
        /* clean up flow control entry object if there's any */
@@ -6847,6 +7470,17 @@ ifnet_detach(ifnet_t ifp)
        /* Reset CLAT46 flag */
        ifp->if_eflags &= ~IFEF_CLAT46;
 
+       /*
+        * We do not reset the TCP keep-alive counters in case
+        * a TCP connection stays connected after the interface
+        * goes down.
+        */
+       if (ifp->if_tcp_kao_cnt > 0) {
+               os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
+                   __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
+       }
+       ifp->if_tcp_kao_max = 0;
+
        /*
         * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
         * no longer be visible during lookups from this point.
@@ -7009,6 +7643,8 @@ ifnet_detacher_thread_cont(int err)
                        /* NOTREACHED */
                }
 
+               net_update_uptime();
+
                VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);
 
                /* Take care of detaching ifnet */
@@ -7021,10 +7657,12 @@ ifnet_detacher_thread_cont(int err)
        }
 }
 
+__dead2
 static void
 ifnet_detacher_thread_func(void *v, wait_result_t w)
 {
 #pragma unused(v, w)
+       dlil_decr_pending_thread_count();
        dlil_if_lock();
        (void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock,
            (PZERO - 1), "ifnet_detacher", 0, ifnet_detacher_thread_cont);
@@ -7059,11 +7697,16 @@ ifnet_detach_final(struct ifnet *ifp)
         * common case, so block without using a continuation.
         */
        while (ifp->if_refio > 0) {
-               printf("%s: Waiting for IO references on %s interface "
+               DLIL_PRINTF("%s: Waiting for IO references on %s interface "
                    "to be released\n", __func__, if_name(ifp));
                (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
                    (PZERO - 1), "ifnet_ioref_wait", NULL);
        }
+
+       VERIFY(ifp->if_datamov == 0);
+       VERIFY(ifp->if_drainers == 0);
+       VERIFY(ifp->if_suspend == 0);
+       ifp->if_refflags &= ~IFRF_READY;
        lck_mtx_unlock(&ifp->if_ref_lock);
 
        /* Drain and destroy send queue */
@@ -7181,6 +7824,7 @@ ifnet_detach_final(struct ifnet *ifp)
                        /* Tear down poll thread affinity */
                        if (ptp != NULL) {
                                VERIFY(ifp->if_eflags & IFEF_RXPOLL);
+                               VERIFY(ifp->if_xflags & IFXF_LEGACY);
                                (void) dlil_affinity_set(ptp,
                                    THREAD_AFFINITY_TAG_NULL);
                                thread_deallocate(ptp);
@@ -7222,6 +7866,9 @@ ifnet_detach_final(struct ifnet *ifp)
 
                /* clean-up input thread state */
                dlil_clean_threading_info(inp);
+               /* clean-up poll parameters */
+               VERIFY(ifp->if_poll_thread == THREAD_NULL);
+               dlil_reset_rxpoll_params(ifp);
        }
 
        /* The driver might unload, so point these to ourselves */
@@ -7257,6 +7904,7 @@ ifnet_detach_final(struct ifnet *ifp)
        VERIFY(ifp->if_delegated.family == 0);
        VERIFY(ifp->if_delegated.subfamily == 0);
        VERIFY(ifp->if_delegated.expensive == 0);
+       VERIFY(ifp->if_delegated.constrained == 0);
 
        /* QoS marking get cleared */
        ifp->if_eflags &= ~IFEF_QOSMARKING_ENABLED;
@@ -7317,7 +7965,7 @@ ifnet_detach_final(struct ifnet *ifp)
        }
 
        if (dlil_verbose) {
-               printf("%s: detached\n", if_name(ifp));
+               DLIL_PRINTF("%s: detached\n", if_name(ifp));
        }
 
        /* Release reference held during ifnet attach */
@@ -7475,6 +8123,7 @@ dlil_if_acquire(u_int32_t family, const void *uniqueid,
 {
        struct ifnet *ifp1 = NULL;
        struct dlil_ifnet *dlifp1 = NULL;
+       struct dlil_ifnet *dlifp1_saved = NULL;
        void *buf, *base, **pbuf;
        int ret = 0;
 
@@ -7513,10 +8162,10 @@ dlil_if_acquire(u_int32_t family, const void *uniqueid,
                                        ret = EBUSY;
                                        goto end;
                                } else {
-                                       dlifp1->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
                                        /* Cache the first interface that can be recycled */
                                        if (*ifp == NULL) {
                                                *ifp = ifp1;
+                                               dlifp1_saved = dlifp1;
                                        }
                                        /*
                                         * XXX Do not break or jump to end as we have to traverse
@@ -7530,6 +8179,12 @@ dlil_if_acquire(u_int32_t family, const void *uniqueid,
 
        /* If there's an interface that can be recycled, use that */
        if (*ifp != NULL) {
+               if (dlifp1_saved != NULL) {
+                       lck_mtx_lock(&dlifp1_saved->dl_if_lock);
+                       dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
+                       lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
+                       dlifp1_saved = NULL;
+               }
                goto end;
        }
 
@@ -7985,7 +8640,12 @@ if_state_update(struct ifnet *ifp,
 
                if (ifp->if_interface_state.interface_availability ==
                    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
+                       os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
+                           __func__, if_name(ifp), ifp->if_index);
                        if_index_available = ifp->if_index;
+               } else {
+                           os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable\n",
+                           __func__, if_name(ifp), ifp->if_index);
                }
        }
        ifnet_lock_done(ifp);
@@ -7993,8 +8653,8 @@ if_state_update(struct ifnet *ifp,
        /*
         * Check if the TCP connections going on this interface should be
         * forced to send probe packets instead of waiting for TCP timers
-        * to fire. This will be done when there is an explicit
-        * notification that the interface became available.
+        * to fire. This is done on an explicit notification such as
+        * SIOCSIFINTERFACESTATE which marks the interface as available.
         */
        if (if_index_available > 0) {
                tcp_interface_send_probe(if_index_available);
@@ -8060,30 +8720,76 @@ if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
 }
 
 /* for uuid.c */
-int
-uuid_get_ethernet(u_int8_t *node)
+static int
+get_ether_index(int * ret_other_index)
 {
        struct ifnet *ifp;
-       struct sockaddr_dl *sdl;
+       int en0_index = 0;
+       int other_en_index = 0;
+       int any_ether_index = 0;
+       short best_unit = 0;
 
-       ifnet_head_lock_shared();
+       *ret_other_index = 0;
        TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
+               /*
+                * Find en0; failing that, the lowest-unit en*; failing
+                * that, any Ethernet interface.
+                */
                ifnet_lock_shared(ifp);
-               IFA_LOCK_SPIN(ifp->if_lladdr);
-               sdl = (struct sockaddr_dl *)(void *)ifp->if_lladdr->ifa_addr;
-               if (sdl->sdl_type == IFT_ETHER) {
-                       memcpy(node, LLADDR(sdl), ETHER_ADDR_LEN);
-                       IFA_UNLOCK(ifp->if_lladdr);
-                       ifnet_lock_done(ifp);
-                       ifnet_head_done();
-                       return 0;
+               if (strcmp(ifp->if_name, "en") == 0) {
+                       if (ifp->if_unit == 0) {
+                               /* found en0, we're done */
+                               en0_index = ifp->if_index;
+                               ifnet_lock_done(ifp);
+                               break;
+                       }
+                       if (other_en_index == 0 || ifp->if_unit < best_unit) {
+                               other_en_index = ifp->if_index;
+                               best_unit = ifp->if_unit;
+                       }
+               } else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
+                       any_ether_index = ifp->if_index;
                }
-               IFA_UNLOCK(ifp->if_lladdr);
                ifnet_lock_done(ifp);
        }
-       ifnet_head_done();
+       if (en0_index == 0) {
+               if (other_en_index != 0) {
+                       *ret_other_index = other_en_index;
+               } else if (any_ether_index != 0) {
+                       *ret_other_index = any_ether_index;
+               }
+       }
+       return en0_index;
+}
+
+int
+uuid_get_ethernet(u_int8_t *node)
+{
+       static int en0_index;
+       struct ifnet *ifp;
+       int other_index = 0;
+       int the_index = 0;
+       int ret;
 
-       return -1;
+       ifnet_head_lock_shared();
+       if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
+               en0_index = get_ether_index(&other_index);
+       }
+       if (en0_index != 0) {
+               the_index = en0_index;
+       } else if (other_index != 0) {
+               the_index = other_index;
+       }
+       if (the_index != 0) {
+               ifp = ifindex2ifnet[the_index];
+               VERIFY(ifp != NULL);
+               memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
+               ret = 0;
+       } else {
+               ret = -1;
+       }
+       ifnet_head_done();
+       return ret;
 }
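For illustration, the preference order this gives uuid_get_ethernet() (interface inventory hypothetical): with no en0 but en1, en3, and another IFT_ETHER interface attached, the lookup settles on en1:

int other_index = 0;
int en0_index = get_ether_index(&other_index);
/*
 * en0_index == 0 (no en0); other_index == if_index of en1, since the
 * lowest "en" unit is preferred over any other IFT_ETHER interface.
 */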
 
 static int
@@ -8184,18 +8890,18 @@ sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
        uint32_t i;
        int err;
 
-       i = if_rxpoll_wlowat;
+       i = if_sysctl_rxpoll_wlowat;
 
        err = sysctl_handle_int(oidp, &i, 0, req);
        if (err != 0 || req->newptr == USER_ADDR_NULL) {
                return err;
        }
 
-       if (i == 0 || i >= if_rxpoll_whiwat) {
+       if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
                return EINVAL;
        }
 
-       if_rxpoll_wlowat = i;
+       if_sysctl_rxpoll_wlowat = i;
        return err;
 }
 
@@ -8206,18 +8912,18 @@ sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
        uint32_t i;
        int err;
 
-       i = if_rxpoll_whiwat;
+       i = if_sysctl_rxpoll_whiwat;
 
        err = sysctl_handle_int(oidp, &i, 0, req);
        if (err != 0 || req->newptr == USER_ADDR_NULL) {
                return err;
        }
 
-       if (i <= if_rxpoll_wlowat) {
+       if (i <= if_sysctl_rxpoll_wlowat) {
                return EINVAL;
        }
 
-       if_rxpoll_whiwat = i;
+       if_sysctl_rxpoll_whiwat = i;
        return err;
 }
 
@@ -8263,13 +8969,14 @@ sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
        return err;
 }
 
-void
+int
 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
 {
        struct kev_dl_node_presence kev;
        struct sockaddr_dl *sdl;
        struct sockaddr_in6 *sin6;
+       int ret = 0;
 
        VERIFY(ifp);
        VERIFY(sa);
@@ -8284,32 +8991,97 @@ dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
        kev.node_proximity_metric = npm;
        bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
 
-       nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
-       dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
-           &kev.link_data, sizeof(kev));
+       ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
+       if (ret == 0) {
+               int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
+                   &kev.link_data, sizeof(kev));
+               if (err != 0) {
+                       log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with "
+                           "error %d\n", __func__, err);
+               }
+       }
+       return ret;
 }
 
 void
 dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
 {
-       struct kev_dl_node_absence kev;
-       struct sockaddr_in6 *sin6;
-       struct sockaddr_dl *sdl;
+       struct kev_dl_node_absence kev = {};
+       struct sockaddr_in6 *kev_sin6 = NULL;
+       struct sockaddr_dl *kev_sdl = NULL;
 
-       VERIFY(ifp);
-       VERIFY(sa);
+       VERIFY(ifp != NULL);
+       VERIFY(sa != NULL);
        VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
 
-       bzero(&kev, sizeof(kev));
-       sin6 = &kev.sin6_node_address;
-       sdl = &kev.sdl_node_address;
-       nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
+       kev_sin6 = &kev.sin6_node_address;
+       kev_sdl = &kev.sdl_node_address;
+
+       if (sa->sa_family == AF_INET6) {
+               /*
+                * If an IPv6 address is given, get the link-layer
+                * address cached in the neighbor cache.
+                */
+               VERIFY(sa->sa_len <= sizeof(*kev_sin6));
+               bcopy(sa, kev_sin6, sa->sa_len);
+               nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
+       } else {
+               /*
+                * If the passed address is of AF_LINK type, derive the
+                * IPv6 address from the link-layer address.
+                */
+               nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
+               nd6_alt_node_absent(ifp, kev_sin6, NULL);
+       }
+
+       kev_sdl->sdl_type = ifp->if_type;
+       kev_sdl->sdl_index = ifp->if_index;
 
-       nd6_alt_node_absent(ifp, sin6);
        dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
            &kev.link_data, sizeof(kev));
 }
 
+int
+dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
+    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
+{
+       struct kev_dl_node_presence kev = {};
+       struct sockaddr_dl *kev_sdl = NULL;
+       struct sockaddr_in6 *kev_sin6 = NULL;
+       int ret = 0;
+
+       VERIFY(ifp != NULL);
+       VERIFY(sa != NULL && sdl != NULL);
+       VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);
+
+       kev_sin6 = &kev.sin6_node_address;
+       kev_sdl = &kev.sdl_node_address;
+
+       VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
+       bcopy(sdl, kev_sdl, sdl->sdl_len);
+       kev_sdl->sdl_type = ifp->if_type;
+       kev_sdl->sdl_index = ifp->if_index;
+
+       VERIFY(sa->sa_len <= sizeof(*kev_sin6));
+       bcopy(sa, kev_sin6, sa->sa_len);
+
+       kev.rssi = rssi;
+       kev.link_quality_metric = lqm;
+       kev.node_proximity_metric = npm;
+       bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
+
+       ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
+       if (ret == 0) {
+               int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
+                   &kev.link_data, sizeof(kev));
+               if (err != 0) {
+                       log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with "
+                           "error %d\n", __func__, err);
+               }
+       }
+       return ret;
+}
+
 const void *
 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
     kauth_cred_t *credp)
@@ -8484,8 +9256,11 @@ ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
        IFCQ_UNLOCK(ifq);
 
        if (err == 0) {
-               printf("%s: throttling level set to %d\n", if_name(ifp),
+               DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
                    level);
+#if NECP
+               necp_update_all_clients();
+#endif /* NECP */
                if (level == IFNET_THROTTLE_OFF) {
                        ifnet_start(ifp);
                }
@@ -9133,7 +9908,7 @@ dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
        if (frame_header == NULL ||
            frame_header < (char *)mbuf_datastart(m) ||
            frame_header > (char *)m->m_data) {
-               printf("%s: frame header pointer 0x%llx out of range "
+               DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
                    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
                    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
                    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
@@ -9193,7 +9968,7 @@ dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
                if (hlen > rxoff) {
                        hwcksum_dbg_bad_rxoff++;
                        if (dlil_verbose) {
-                               printf("%s: partial cksum start offset %d "
+                               DLIL_PRINTF("%s: partial cksum start offset %d "
                                    "is less than frame header length %d for "
                                    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
                                    (uint64_t)VM_KERNEL_ADDRPERM(m));
@@ -9214,7 +9989,7 @@ dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
                        if (sum != m->m_pkthdr.csum_rx_val) {
                                hwcksum_dbg_bad_cksum++;
                                if (dlil_verbose) {
-                                       printf("%s: bad partial cksum value "
+                                       DLIL_PRINTF("%s: bad partial cksum value "
                                            "0x%x (expected 0x%x) for mbuf "
                                            "0x%llx [rx_start %d]\n",
                                            if_name(ifp),
@@ -9653,7 +10428,7 @@ sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
        error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
            frames_array_count, frame_data_offset, &used_frames_count);
        if (error != 0) {
-               printf("%s: ifnet_get_keepalive_offload_frames error %d\n",
+               DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
                    __func__, error);
                goto done;
        }
@@ -9679,27 +10454,3 @@ ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
 {
        tcp_update_stats_per_flow(ifs, ifp);
 }
-
-static void
-dlil_mit_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
-{
-#pragma unused(arg1)
-       struct ifnet *ifp = (struct ifnet *)arg0;
-       struct dlil_threading_info *inp = ifp->if_inp;
-
-       ifnet_lock_shared(ifp);
-       if (!IF_FULLY_ATTACHED(ifp) || inp == NULL) {
-               ifnet_lock_done(ifp);
-               return;
-       }
-
-       lck_mtx_lock_spin(&inp->input_lck);
-       inp->input_waiting |= DLIL_INPUT_WAITING;
-       if (!(inp->input_waiting & DLIL_INPUT_RUNNING) ||
-           !qempty(&inp->rcvq_pkts)) {
-               inp->wtot++;
-               wakeup_one((caddr_t)&inp->input_waiting);
-       }
-       lck_mtx_unlock(&inp->input_lck);
-       ifnet_lock_done(ifp);
-}
index 881cf0505d23a265482b5db121120874bce988fc..7f2753cba4148d6c657dc27abd61ef313799c93e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -97,6 +97,12 @@ enum {
        }                                                               \
 } while (0)
 
+#define net_timerusec(tvp, usp) do {                                    \
+       *(usp) = (tvp)->tv_nsec / NSEC_PER_USEC;                        \
+       if ((tvp)->tv_sec > 0)                                          \
+               *(usp) += ((tvp)->tv_sec * USEC_PER_SEC);               \
+} while (0)
+
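A worked example of the new net_timerusec() macro (values illustrative): it converts a struct timespec to whole microseconds.

struct timespec ts = { .tv_sec = 2, .tv_nsec = 500000 };
u_int64_t usec;

net_timerusec(&ts, &usec);
/* usec == 2 * USEC_PER_SEC + 500000 / NSEC_PER_USEC == 2000500 */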
 #define net_timernsec(tvp, nsp) do {                                    \
        *(nsp) = (tvp)->tv_nsec;                                        \
        if ((tvp)->tv_sec > 0)                                          \
@@ -165,44 +171,12 @@ struct dlil_threading_info {
        struct thread   *wloop_thr;     /* workloop thread */
        struct thread   *poll_thr;      /* poll thread */
        u_int32_t       tag;            /* affinity tag */
-       /*
-        * Opportunistic polling.
-        */
-       ifnet_model_t   mode;           /* current mode */
-       struct pktcntr  tstats;         /* incremental polling statistics */
-       struct if_rxpoll_stats pstats;  /* polling statistics */
-#define rxpoll_offreq   pstats.ifi_poll_off_req
-#define rxpoll_offerr   pstats.ifi_poll_off_err
-#define rxpoll_onreq    pstats.ifi_poll_on_req
-#define rxpoll_onerr    pstats.ifi_poll_on_err
-#define rxpoll_wavg     pstats.ifi_poll_wakeups_avg
-#define rxpoll_wlowat   pstats.ifi_poll_wakeups_lowat
-#define rxpoll_whiwat   pstats.ifi_poll_wakeups_hiwat
-#define rxpoll_pavg     pstats.ifi_poll_packets_avg
-#define rxpoll_pmin     pstats.ifi_poll_packets_min
-#define rxpoll_pmax     pstats.ifi_poll_packets_max
-#define rxpoll_plowat   pstats.ifi_poll_packets_lowat
-#define rxpoll_phiwat   pstats.ifi_poll_packets_hiwat
-#define rxpoll_bavg     pstats.ifi_poll_bytes_avg
-#define rxpoll_bmin     pstats.ifi_poll_bytes_min
-#define rxpoll_bmax     pstats.ifi_poll_bytes_max
-#define rxpoll_blowat   pstats.ifi_poll_bytes_lowat
-#define rxpoll_bhiwat   pstats.ifi_poll_bytes_hiwat
-#define rxpoll_plim     pstats.ifi_poll_packets_limit
-#define rxpoll_ival     pstats.ifi_poll_interval_time
-       struct pktcntr  sstats;         /* packets and bytes per sampling */
-       struct timespec mode_holdtime;  /* mode holdtime in nsec */
-       struct timespec mode_lasttime;  /* last mode change time in nsec */
-       struct timespec sample_holdtime; /* sampling holdtime in nsec */
-       struct timespec sample_lasttime; /* last sampling time in nsec */
-       struct timespec dbg_lasttime;   /* last debug message time in nsec */
 #if IFNET_INPUT_SANITY_CHK
        /*
         * For debugging.
         */
        u_int64_t       input_mbuf_cnt; /* total # of packets processed */
 #endif
-       thread_call_t   input_mit_tcall; /* coalescing input processing */
 };
 
 /*
@@ -230,11 +204,20 @@ struct dlil_main_threading_info {
 #define DLIL_IFF_TSO            0x01    /* Interface filter supports TSO */
 #define DLIL_IFF_INTERNAL       0x02    /* Apple internal -- do not count towards stats */
 
+/* Input poll interval definitions */
+#define IF_RXPOLL_INTERVALTIME_MIN      (1ULL * 1000)           /* 1 us */
+#define IF_RXPOLL_INTERVALTIME          (1ULL * 1000 * 1000)    /* 1 ms */
+
 extern int dlil_verbose;
 extern uint32_t hwcksum_dbg;
 extern uint32_t hwcksum_tx;
 extern uint32_t hwcksum_rx;
 extern struct dlil_threading_info *dlil_main_input_thread;
+extern unsigned int net_rxpoll;
+extern uint32_t if_rxpoll;
+extern uint32_t if_rxpoll_decay;
+extern uint32_t if_rxpoll_interval_pkts;
+extern uint32_t if_rcvq_maxlen;
 
 extern void dlil_init(void);
 
@@ -323,7 +306,7 @@ extern void dlil_detach_filter(interface_filter_t);
 
 extern void dlil_proto_unplumb_all(ifnet_t);
 
-extern void dlil_post_msg(struct ifnet *, u_int32_t, u_int32_t,
+extern int dlil_post_msg(struct ifnet *, u_int32_t, u_int32_t,
     struct net_event_data *, u_int32_t);
 
 extern void dlil_post_sifflags_msg(struct ifnet *);
@@ -332,6 +315,14 @@ extern int dlil_post_complete_msg(struct ifnet *, struct kev_msg *);
 
 extern int dlil_alloc_local_stats(struct ifnet *);
 
+extern void ifnet_filter_update_tso(boolean_t filter_enable);
+extern errno_t dlil_rxpoll_validate_params(struct ifnet_poll_params *);
+extern void dlil_rxpoll_update_params(struct ifnet *,
+    struct ifnet_poll_params *);
+extern void ifnet_poll(struct ifnet *);
+extern errno_t ifnet_input_poll(struct ifnet *, struct mbuf *,
+    struct mbuf *, const struct ifnet_stat_increment_param *);
+
 
 /*
  * dlil_if_acquire is obsolete. Use ifnet_allocate.
@@ -346,9 +337,11 @@ extern void dlil_if_release(struct ifnet *ifp);
 extern errno_t dlil_if_ref(struct ifnet *);
 extern errno_t dlil_if_free(struct ifnet *);
 
-extern void dlil_node_present(struct ifnet *, struct sockaddr *, int32_t, int,
+extern int dlil_node_present(struct ifnet *, struct sockaddr *, int32_t, int,
     int, u_int8_t[48]);
 extern void dlil_node_absent(struct ifnet *, struct sockaddr *);
+extern int dlil_node_present_v2(struct ifnet *, struct sockaddr *, struct sockaddr_dl *, int32_t, int,
+    int, u_int8_t[48]);
 
 extern const void *dlil_ifaddr_bytes(const struct sockaddr_dl *, size_t *,
     kauth_cred_t *);
@@ -356,7 +349,7 @@ extern const void *dlil_ifaddr_bytes(const struct sockaddr_dl *, size_t *,
 extern void dlil_report_issues(struct ifnet *, u_int8_t[DLIL_MODIDLEN],
     u_int8_t[DLIL_MODARGLEN]);
 
-#define PROTO_HASH_SLOTS        4
+#define PROTO_HASH_SLOTS        5
 
 extern int proto_hash_value(u_int32_t);
 
index 35acf10648f3c8ac2b89e3d6f7db564146d6465d..f26aec76c489a6f40359183da05a37775027721b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -96,6 +96,7 @@
 #include <net/ether_if_module.h>
 #include <sys/socketvar.h>
 #include <net/if_vlan_var.h>
+#include <net/if_6lowpan_var.h>
 #if BOND
 #include <net/if_bond_internal.h>
 #endif /* BOND */
 #if IF_FAKE
 #include <net/if_fake_var.h>
 #endif /* IF_FAKE */
+#if IF_HEADLESS
+extern void if_headless_init(void);
+#endif /* IF_HEADLESS */
 
 #include <net/dlil.h>
 
@@ -377,12 +381,6 @@ ether_demux(ifnet_t ifp, mbuf_t m, char *frame_header,
                m->m_flags &= ~M_HASFCS;
        }
 
-       if (ifp->if_eflags & IFEF_BOND) {
-               /* if we're bonded, bond "protocol" gets all the packets */
-               *protocol_family = PF_BOND;
-               return 0;
-       }
-
        if ((eh->ether_dhost[0] & 1) == 0) {
                /*
                 * When the driver is put into promiscuous mode we may receive
@@ -396,6 +394,12 @@ ether_demux(ifnet_t ifp, mbuf_t m, char *frame_header,
                }
        }
 
+       /* check for IEEE 802.15.4 */
+       if (ether_type == htons(ETHERTYPE_IEEE802154)) {
+               *protocol_family = PF_802154;
+               return 0;
+       }
+
        /* check for VLAN */
        if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
                if (EVL_VLANOFTAG(m->m_pkthdr.vlan_tag) != 0) {
@@ -655,7 +659,12 @@ ether_family_init(void)
 #if IF_FAKE
        if_fake_init();
 #endif /* IF_FAKE */
-
+#if IF_HEADLESS
+       if_headless_init();
+#endif /* IF_HEADLESS */
+#if SIXLOWPAN
+       sixlowpan_family_init();
+#endif /* SIXLOWPAN */
 done:
 
        return error;
index 9dda79efb8323cb7fd6d5a42330347cd4d1dc4b7..c4df507205feb70e08609f53e0b03130e4b20361 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -108,6 +108,7 @@ typedef struct  ether_addr {
 #define ETHERTYPE_PTP           0x88f7  /* IEEE 1588 Precision Time Protocol */
 #define ETHERTYPE_LOOPBACK      0x9000  /* used to test interfaces */
 /* XXX - add more useful types here */
+#define ETHERTYPE_IEEE802154    0x0809  /* 802.15.4 */
 
 /*
  * The ETHERTYPE_NTRAILER packet types starting at ETHERTYPE_TRAIL have
index 020081305138572404a137bf535cfaaa38ff771e..2d6a41ecb930344dbd3da750cf5d6a1362e2441c 100644 (file)
@@ -246,6 +246,7 @@ flowadv_thread_cont(int err)
        }
 }
 
+__dead2
 static void
 flowadv_thread_func(void *v, wait_result_t w)
 {
diff --git a/bsd/net/frame802154.c b/bsd/net/frame802154.c
new file mode 100644 (file)
index 0000000..e5f2e93
--- /dev/null
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ *
+ *  Copyright (c) 2008, Swedish Institute of Computer Science
+ *  All rights reserved.
+ *
+ *  Additional fixes for AVR contributed by:
+ *
+ *      Colin O'Flynn coflynn@newae.com
+ *      Eric Gnoske egnoske@gmail.com
+ *      Blake Leverett bleverett@gmail.com
+ *      Mike Vidales mavida404@gmail.com
+ *      Kevin Brown kbrown3@uccs.edu
+ *      Nate Bohlmann nate@elfwerks.com
+ *
+ *  Additional fixes for MSP430 contributed by:
+ *        Joakim Eriksson
+ *        Niclas Finne
+ *        Nicolas Tsiftes
+ *
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of the copyright holders nor the names of
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/*
+ *  \brief This file is where the main functions that relate to frame
+ *  manipulation will reside.
+ */
+
+/**
+ *  \file
+ *  \brief 802.15.4 frame creation and parsing functions
+ *
+ *  This file converts to and from a structure to a packed 802.15.4
+ *  frame.
+ */
+
+/**
+ *   \addtogroup frame802154
+ *   @{
+ */
+
+#include "cc.h"
+#include "frame802154.h"
+//#include "net/llsec/llsec802154.h"
+#include "linkaddr.h"
+#include <string.h>
+
+/**
+ *  \brief Structure that contains the lengths of the various addressing and security fields
+ *  in the 802.15.4 header.  This structure is used in \ref frame802154_create()
+ */
+typedef struct {
+       uint8_t dest_pid_len;    /**<  Length (in bytes) of destination PAN ID field */
+       uint8_t dest_addr_len;   /**<  Length (in bytes) of destination address field */
+       uint8_t src_pid_len;     /**<  Length (in bytes) of source PAN ID field */
+       uint8_t src_addr_len;    /**<  Length (in bytes) of source address field */
+       uint8_t aux_sec_len;     /**<  Length (in bytes) of aux security header field */
+} field_length_t;
+
+/*----------------------------------------------------------------------------*/
+CC_INLINE static uint8_t
+addr_len(uint8_t mode)
+{
+       switch (mode) {
+       case FRAME802154_SHORTADDRMODE:          /* 16-bit address */
+               return 2;
+       case FRAME802154_LONGADDRMODE:           /* 64-bit address */
+               return 8;
+       default:
+               return 0;
+       }
+}
+/*----------------------------------------------------------------------------*/
+#if LLSEC802154_USES_EXPLICIT_KEYS
+static uint8_t
+get_key_id_len(uint8_t key_id_mode)
+{
+       switch (key_id_mode) {
+       case FRAME802154_1_BYTE_KEY_ID_MODE:
+               return 1;
+       case FRAME802154_5_BYTE_KEY_ID_MODE:
+               return 5;
+       case FRAME802154_9_BYTE_KEY_ID_MODE:
+               return 9;
+       default:
+               return 0;
+       }
+}
+#endif /* LLSEC802154_USES_EXPLICIT_KEYS */
+/*----------------------------------------------------------------------------*/
+static void
+field_len(frame802154_t *p, field_length_t *flen)
+{
+       /* init flen to zeros */
+       memset(flen, 0, sizeof(field_length_t));
+
+       /* Determine lengths of each field based on fcf and other args */
+       if (p->fcf.dest_addr_mode & 3) {
+               flen->dest_pid_len = 2;
+       }
+       if (p->fcf.src_addr_mode & 3) {
+               flen->src_pid_len = 2;
+       }
+
+       /* Set PAN ID compression bit if src pan id matches dest pan id. */
+       if (p->fcf.dest_addr_mode & 3 && p->fcf.src_addr_mode & 3 &&
+           p->src_pid == p->dest_pid) {
+               p->fcf.panid_compression = 1;
+
+               /* compressed header, only do dest pid */
+               flen->src_pid_len = 0;
+       } else {
+               p->fcf.panid_compression = 0;
+       }
+
+       /* determine address lengths */
+       flen->dest_addr_len = addr_len(p->fcf.dest_addr_mode & 3);
+       flen->src_addr_len = addr_len(p->fcf.src_addr_mode & 3);
+
+#if LLSEC802154_SECURITY_LEVEL
+       /* Aux security header */
+       if (p->fcf.security_enabled & 1) {
+               flen->aux_sec_len = 5
+#if LLSEC802154_USES_EXPLICIT_KEYS
+                   + get_key_id_len(p->aux_hdr.security_control.key_id_mode);
+#endif /* LLSEC802154_USES_EXPLICIT_KEYS */
+               ;
+       }
+#endif /* LLSEC802154_SECURITY_LEVEL */
+}
+/*----------------------------------------------------------------------------*/
+/**
+ *   \brief Calculates the length of the frame header.  This function is
+ *   meant to be called by a higher-level function that interfaces to a MAC.
+ *
+ *   \param p Pointer to a frame802154_t struct, which specifies the
+ *   frame to send.
+ *
+ *   \return The length of the frame header.
+ */
+int
+frame802154_hdrlen(frame802154_t *p)
+{
+       field_length_t flen;
+       field_len(p, &flen);
+       return 3 + flen.dest_pid_len + flen.dest_addr_len +
+              flen.src_pid_len + flen.src_addr_len + flen.aux_sec_len;
+}
+/*----------------------------------------------------------------------------*/
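A worked example of the length arithmetic above (a sketch; FRAME802154_DATAFRAME is assumed to be among the frame-type constants in frame802154.h, alongside the address-mode constants used in the parser below): a data frame with 16-bit addressing on both ends of the same PAN compresses away the source PAN ID, leaving a 9-byte header.

frame802154_t p;

memset(&p, 0, sizeof(p));
p.fcf.frame_type = FRAME802154_DATAFRAME;
p.fcf.dest_addr_mode = FRAME802154_SHORTADDRMODE;
p.fcf.src_addr_mode = FRAME802154_SHORTADDRMODE;
p.dest_pid = p.src_pid = 0xabcd;        /* same PAN -> panid_compression */

/* 3 (fcf + seq) + 2 (dest PAN) + 2 (dest addr) + 0 (src PAN) + 2 (src addr) */
int hlen = frame802154_hdrlen(&p);      /* == 9 */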
+/**
+ *   \brief Creates a frame for transmission over the air.  This function is
+ *   meant to be called by a higher-level function that interfaces to a MAC.
+ *
+ *   \param p Pointer to frame802154_t struct, which specifies the
+ *   frame to send.
+ *
+ *   \param buf Pointer to the buffer to use for the frame.
+ *
+ *   \return The length of the frame header
+ */
+int
+frame802154_create(frame802154_t *p, uint8_t *buf)
+{
+       int c;
+       field_length_t flen;
+       uint8_t pos;
+#if LLSEC802154_USES_EXPLICIT_KEYS
+       uint8_t key_id_mode;
+#endif /* LLSEC802154_USES_EXPLICIT_KEYS */
+
+       field_len(p, &flen);
+
+       /* OK, now we have field lengths.  Time to actually construct */
+       /* the outgoing frame, and store it in buf */
+       buf[0] = (p->fcf.frame_type & 7) |
+           ((p->fcf.security_enabled & 1) << 3) |
+           ((p->fcf.frame_pending & 1) << 4) |
+           ((p->fcf.ack_required & 1) << 5) |
+           ((p->fcf.panid_compression & 1) << 6);
+       buf[1] = ((p->fcf.dest_addr_mode & 3) << 2) |
+           ((p->fcf.frame_version & 3) << 4) |
+           ((p->fcf.src_addr_mode & 3) << 6);
+
+       /* sequence number */
+       buf[2] = p->seq;
+       pos = 3;
+
+       /* Destination PAN ID */
+       if (flen.dest_pid_len == 2) {
+               buf[pos++] = p->dest_pid & 0xff;
+               buf[pos++] = (p->dest_pid >> 8) & 0xff;
+       }
+
+       /* Destination address */
+       for (c = flen.dest_addr_len; c > 0; c--) {
+               buf[pos++] = p->dest_addr[c - 1];
+       }
+
+       /* Source PAN ID */
+       if (flen.src_pid_len == 2) {
+               buf[pos++] = p->src_pid & 0xff;
+               buf[pos++] = (p->src_pid >> 8) & 0xff;
+       }
+
+       /* Source address */
+       for (c = flen.src_addr_len; c > 0; c--) {
+               buf[pos++] = p->src_addr[c - 1];
+       }
+
+#if LLSEC802154_SECURITY_LEVEL
+       /* Aux header */
+       if (flen.aux_sec_len) {
+               buf[pos++] = p->aux_hdr.security_control.security_level
+#if LLSEC802154_USES_EXPLICIT_KEYS
+                   | (p->aux_hdr.security_control.key_id_mode << 3)
+#endif /* LLSEC802154_USES_EXPLICIT_KEYS */
+               ;
+               memcpy(buf + pos, p->aux_hdr.frame_counter.u8, 4);
+               pos += 4;
+
+#if LLSEC802154_USES_EXPLICIT_KEYS
+               key_id_mode = p->aux_hdr.security_control.key_id_mode;
+               if (key_id_mode) {
+                       c = (key_id_mode - 1) * 4;
+                       memcpy(buf + pos, p->aux_hdr.key_source.u8, c);
+                       pos += c;
+                       buf[pos++] = p->aux_hdr.key_index;
+               }
+#endif /* LLSEC802154_USES_EXPLICIT_KEYS */
+       }
+#endif /* LLSEC802154_SECURITY_LEVEL */
+
+       return (int)pos;
+}
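+/*
+ * Illustrative sketch (hypothetical caller; dst/src stand in for
+ * driver-supplied 8-byte long addresses): fill a frame802154_t, check the
+ * buffer with frame802154_hdrlen(), then let frame802154_create() pack the
+ * header in front of where the payload will go.
+ */
+#if 0   /* example only */
+static int
+build_data_frame_example(uint8_t *buf, int buflen,
+    const uint8_t dst[8], const uint8_t src[8], uint8_t seq)
+{
+       frame802154_t f;
+
+       memset(&f, 0, sizeof(f));
+       f.fcf.frame_type = FRAME802154_DATAFRAME;
+       f.fcf.ack_required = 1;
+       f.fcf.frame_version = FRAME802154_IEEE802154_2006;
+       f.fcf.dest_addr_mode = FRAME802154_LONGADDRMODE;
+       f.fcf.src_addr_mode = FRAME802154_LONGADDRMODE;
+       f.dest_pid = IEEE802154_PANID;
+       f.src_pid = IEEE802154_PANID;   /* equal PAN IDs get compressed */
+       memcpy(f.dest_addr, dst, 8);
+       memcpy(f.src_addr, src, 8);
+       f.seq = seq;
+
+       if (frame802154_hdrlen(&f) > buflen) {
+               return -1;              /* header would not fit */
+       }
+       /* returns the header length; the payload goes at buf + that offset */
+       return frame802154_create(&f, buf);
+}
+#endif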
+/*----------------------------------------------------------------------------*/
+/**
+ *   \brief Parses an input frame.  Scans the input frame to find each
+ *   section, and stores the information of each section in a
+ *   frame802154_t structure.
+ *
+ *   \param data The input data from the radio chip.
+ *   \param len The size of the input data.
+ *   \param pf The frame802154_t struct to store the parsed frame information.
+ *   \param payload Set to the first byte of the frame payload.
+ *
+ *   \return The header length, or 0 if the frame could not be parsed.
+ */
+int
+frame802154_parse(uint8_t *data, int len, frame802154_t *pf, uint8_t **payload)
+{
+       uint8_t *p;
+       frame802154_fcf_t fcf;
+       int c;
+#if LLSEC802154_USES_EXPLICIT_KEYS
+       uint8_t key_id_mode;
+#endif /* LLSEC802154_USES_EXPLICIT_KEYS */
+
+       if (len < 3) {
+               return 0;
+       }
+
+       p = data;
+
+       /* decode the FCF */
+       fcf.frame_type = p[0] & 7;
+       fcf.security_enabled = (p[0] >> 3) & 1;
+       fcf.frame_pending = (p[0] >> 4) & 1;
+       fcf.ack_required = (p[0] >> 5) & 1;
+       fcf.panid_compression = (p[0] >> 6) & 1;
+
+       fcf.dest_addr_mode = (p[1] >> 2) & 3;
+       fcf.frame_version = (p[1] >> 4) & 3;
+       fcf.src_addr_mode = (p[1] >> 6) & 3;
+
+       /* copy fcf and seqNum */
+       memcpy(&pf->fcf, &fcf, sizeof(frame802154_fcf_t));
+       pf->seq = p[2];
+       p += 3;                             /* Skip first three bytes */
+
+       /* Destination address, if any */
+       if (fcf.dest_addr_mode) {
+               /* Destination PAN */
+               pf->dest_pid = p[0] + (p[1] << 8);
+               p += 2;
+
+               /* Destination address */
+               /*     l = addr_len(fcf.dest_addr_mode); */
+               /*     for(c = 0; c < l; c++) { */
+               /*       pf->dest_addr.u8[c] = p[l - c - 1]; */
+               /*     } */
+               /*     p += l; */
+               if (fcf.dest_addr_mode == FRAME802154_SHORTADDRMODE) {
+                       linkaddr_copy((linkaddr_t *)(uintptr_t)&(pf->dest_addr), &linkaddr_null);
+                       pf->dest_addr[0] = p[1];
+                       pf->dest_addr[1] = p[0];
+                       p += 2;
+               } else if (fcf.dest_addr_mode == FRAME802154_LONGADDRMODE) {
+                       for (c = 0; c < 8; c++) {
+                               pf->dest_addr[c] = p[7 - c];
+                       }
+                       p += 8;
+               }
+       } else {
+               linkaddr_copy((linkaddr_t *)(uintptr_t)&(pf->dest_addr), &linkaddr_null);
+               pf->dest_pid = 0;
+       }
+
+       /* Source address, if any */
+       if (fcf.src_addr_mode) {
+               /* Source PAN */
+               if (!fcf.panid_compression) {
+                       pf->src_pid = p[0] + (p[1] << 8);
+                       p += 2;
+               } else {
+                       pf->src_pid = pf->dest_pid;
+               }
+
+               /* Source address */
+               /*     l = addr_len(fcf.src_addr_mode); */
+               /*     for(c = 0; c < l; c++) { */
+               /*       pf->src_addr.u8[c] = p[l - c - 1]; */
+               /*     } */
+               /*     p += l; */
+               if (fcf.src_addr_mode == FRAME802154_SHORTADDRMODE) {
+                       linkaddr_copy((linkaddr_t *)(uintptr_t)&(pf->src_addr), &linkaddr_null);
+                       pf->src_addr[0] = p[1];
+                       pf->src_addr[1] = p[0];
+                       p += 2;
+               } else if (fcf.src_addr_mode == FRAME802154_LONGADDRMODE) {
+                       for (c = 0; c < 8; c++) {
+                               pf->src_addr[c] = p[7 - c];
+                       }
+                       p += 8;
+               }
+       } else {
+               linkaddr_copy((linkaddr_t *)(uintptr_t)&(pf->src_addr), &linkaddr_null);
+               pf->src_pid = 0;
+       }
+
+#if LLSEC802154_SECURITY_LEVEL
+       if (fcf.security_enabled) {
+               pf->aux_hdr.security_control.security_level = p[0] & 7;
+#if LLSEC802154_USES_EXPLICIT_KEYS
+               pf->aux_hdr.security_control.key_id_mode = (p[0] >> 3) & 3;
+#endif /* LLSEC802154_USES_EXPLICIT_KEYS */
+               p += 1;
+
+               memcpy(pf->aux_hdr.frame_counter.u8, p, 4);
+               p += 4;
+
+#if LLSEC802154_USES_EXPLICIT_KEYS
+               key_id_mode = pf->aux_hdr.security_control.key_id_mode;
+               if (key_id_mode) {
+                       c = (key_id_mode - 1) * 4;
+                       memcpy(pf->aux_hdr.key_source.u8, p, c);
+                       p += c;
+                       pf->aux_hdr.key_index = p[0];
+                       p += 1;
+               }
+#endif /* LLSEC802154_USES_EXPLICIT_KEYS */
+       }
+#endif /* LLSEC802154_SECURITY_LEVEL */
+
+       /* header length */
+       c = p - data;
+       /* payload length */
+       pf->payload_len = (len - c);
+       /* payload */
+       *payload = p;
+
+       /* return header length if successful */
+       return c > len ? 0 : c;
+}
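+/*
+ * Illustrative sketch: recover the header fields and the payload pointer
+ * from a received buffer.  frame802154_parse() returns the header length,
+ * or 0 when the buffer is too short to hold the header it describes.
+ */
+#if 0   /* example only */
+static void
+parse_rx_example(uint8_t *rxbuf, int rxlen)
+{
+       frame802154_t f;
+       uint8_t *payload;
+       int hlen;
+
+       hlen = frame802154_parse(rxbuf, rxlen, &f, &payload);
+       if (hlen == 0 || f.fcf.frame_type != FRAME802154_DATAFRAME) {
+               return;         /* runt frame, or not a data frame */
+       }
+       /* f.payload_len bytes of payload start at `payload` */
+}
+#endif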
+/** \}   */
diff --git a/bsd/net/frame802154.h b/bsd/net/frame802154.h
new file mode 100644 (file)
index 0000000..fbdb29c
--- /dev/null
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ *  Copyright (c) 2008, Swedish Institute of Computer Science
+ *  All rights reserved.
+ *
+ *  Additional fixes for AVR contributed by:
+ *        Colin O'Flynn coflynn@newae.com
+ *        Eric Gnoske egnoske@gmail.com
+ *        Blake Leverett bleverett@gmail.com
+ *        Mike Vidales mavida404@gmail.com
+ *        Kevin Brown kbrown3@uccs.edu
+ *        Nate Bohlmann nate@elfwerks.com
+ *
+ *  Additional fixes for MSP430 contributed by:
+ *        Joakim Eriksson
+ *        Niclas Finne
+ *        Nicolas Tsiftes
+ *
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of the copyright holders nor the names of
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ *    \addtogroup net
+ *    @{
+ */
+
+/**
+ *    \defgroup frame802154 802.15.4 frame creation and parsing
+ *    @{
+ */
+/**
+ *  \file
+ *  \brief 802.15.4 frame creation and parsing functions
+ *
+ *  This file converts between a structure and a packed 802.15.4
+ *  frame.
+ *
+ */
+
+/* Includes */
+#ifndef FRAME_802154_H
+#define FRAME_802154_H
+
+#include "contiki-conf.h"
+
+#include <stdint.h>
+
+#ifdef IEEE802154_CONF_PANID
+#define IEEE802154_PANID        IEEE802154_CONF_PANID
+#else /* IEEE802154_CONF_PANID */
+#define IEEE802154_PANID        0xABCD
+#endif /* IEEE802154_CONF_PANID */
+
+/* Macros & Defines */
+
+/** \brief Definitions of values used in the FCF.  See the 802.15.4 spec for details.
+ *  \name FCF element values definitions
+ *  @{
+ */
+#define FRAME802154_BEACONFRAME         (0x00)
+#define FRAME802154_DATAFRAME           (0x01)
+#define FRAME802154_ACKFRAME            (0x02)
+#define FRAME802154_CMDFRAME            (0x03)
+
+#define FRAME802154_BEACONREQ           (0x07)
+
+#define FRAME802154_IEEERESERVED        (0x00)
+#define FRAME802154_NOADDR              (0x00)      /**< Only valid for ACK or Beacon frames. */
+#define FRAME802154_SHORTADDRMODE       (0x02)
+#define FRAME802154_LONGADDRMODE        (0x03)
+
+#define FRAME802154_NOBEACONS           (0x0F)
+
+#define FRAME802154_BROADCASTADDR       (0xFFFF)
+#define FRAME802154_BROADCASTPANDID     (0xFFFF)
+
+#define FRAME802154_IEEE802154_2003     (0x00)
+#define FRAME802154_IEEE802154_2006     (0x01)
+
+#define FRAME802154_SECURITY_LEVEL_NONE         (0)
+#define FRAME802154_SECURITY_LEVEL_MIC_32       (1)
+#define FRAME802154_SECURITY_LEVEL_MIC_64       (2)
+#define FRAME802154_SECURITY_LEVEL_MIC_128      (3)
+#define FRAME802154_SECURITY_LEVEL_ENC          (4)
+#define FRAME802154_SECURITY_LEVEL_ENC_MIC_32   (5)
+#define FRAME802154_SECURITY_LEVEL_ENC_MIC_64   (6)
+#define FRAME802154_SECURITY_LEVEL_ENC_MIC_128  (7)
+
+#define FRAME802154_IMPLICIT_KEY                (0)
+#define FRAME802154_1_BYTE_KEY_ID_MODE          (1)
+#define FRAME802154_5_BYTE_KEY_ID_MODE          (2)
+#define FRAME802154_9_BYTE_KEY_ID_MODE          (3)
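+
+/*
+ * The explicit key identifier modes carry a 0-, 4- or 8-byte Key Source
+ * plus a 1-byte Key Index, which is where the 1/5/9-byte names come from.
+ * A sketch of that mapping, matching the (key_id_mode - 1) * 4 arithmetic
+ * used by frame802154_create() and frame802154_parse():
+ */
+#if 0   /* example only */
+static uint8_t
+key_id_len_example(uint8_t key_id_mode)
+{
+       /* implicit mode (0) carries no key identifier at all */
+       return key_id_mode ? (uint8_t)((key_id_mode - 1) * 4 + 1) : 0;
+}
+#endif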
+
+/**
+ *    @brief  The IEEE 802.15.4 frame has a number of constant/fixed fields that
+ *            can be counted to make frame construction and max payload
+ *            calculations easier.
+ *
+ *            These include:
+ *            1. FCF                  - 2 bytes       - Fixed
+ *            2. Sequence number      - 1 byte        - Fixed
+ *            3. Addressing fields    - 4 - 20 bytes  - Variable
+ *            4. Aux security header  - 0 - 14 bytes  - Variable
+ *            5. CRC                  - 2 bytes       - Fixed
+ */
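+/*
+ * Worked example (assuming the usual 127-byte maximum PHY payload): with
+ * the addressing fields at their 20-byte maximum (two PAN IDs plus two
+ * long addresses) and a 14-byte aux security header, the overhead is
+ * 2 + 1 + 20 + 14 + 2 = 39 bytes, leaving 127 - 39 = 88 bytes of payload.
+ */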
+
+/**
+ * \brief Defines the bitfields of the frame control field (FCF).
+ */
+typedef struct {
+       uint8_t frame_type;        /**< 3 bits. Frame type field, see 802.15.4 */
+       uint8_t security_enabled;  /**< 1 bit. True if security is used in this frame */
+       uint8_t frame_pending;     /**< 1 bit. True if sender has more data to send */
+       uint8_t ack_required;      /**< 1 bit. Is an ack frame required? */
+       uint8_t panid_compression; /**< 1 bit. Is this a compressed header? */
+       /*   uint8_t reserved; */  /**< 3 bits. Unused bits */
+       uint8_t dest_addr_mode;    /**< 2 bits. Destination address mode, see 802.15.4 */
+       uint8_t frame_version;     /**< 2 bits. 802.15.4 frame version */
+       uint8_t src_addr_mode;     /**< 2 bits. Source address mode, see 802.15.4 */
+} frame802154_fcf_t;
+
+/** \brief 802.15.4 security control bitfield.  See section 7.6.2.2.1 in 802.15.4 specification */
+typedef struct {
+       uint8_t  security_level; /**< 3 bits. Security level      */
+       uint8_t  key_id_mode;    /**< 2 bits. Key identifier mode */
+       uint8_t  reserved;       /**< 3 bits. Reserved bits       */
+} frame802154_scf_t;
+
+typedef union {
+       uint32_t u32;
+       uint16_t u16[2];
+       uint8_t u8[4];
+} frame802154_frame_counter_t;
+
+typedef union {
+       uint16_t u16[4];
+       uint8_t u8[8];
+} frame802154_key_source_t;
+
+/** \brief 802.15.4 Aux security header */
+typedef struct {
+       frame802154_scf_t security_control;        /**< Security control bitfield */
+       frame802154_frame_counter_t frame_counter; /**< Frame counter, used for security */
+       frame802154_key_source_t key_source;       /**< Key Source subfield */
+       uint8_t key_index;                         /**< Key Index subfield */
+} frame802154_aux_hdr_t;
+
+/** \brief Parameters used by the frame802154_create() function.  These
+ *  parameters are used in the 802.15.4 frame header.  See the 802.15.4
+ *  specification for details.
+ */
+struct frame802154 {
+       /* The fields dest_addr and src_addr must come first to ensure they are aligned to the
+        * CPU word size. Needed as they are accessed directly as linkaddr_t*. Note we cannot use
+        * the type linkaddr_t directly here, as we always need 8 bytes, not LINKADDR_SIZE bytes. */
+       uint8_t dest_addr[8];           /**< Destination address */
+       uint8_t src_addr[8];            /**< Source address */
+       frame802154_fcf_t fcf;          /**< Frame control field  */
+       uint8_t seq;                    /**< Sequence number */
+       uint16_t dest_pid;              /**< Destination PAN ID */
+       uint16_t src_pid;               /**< Source PAN ID */
+       frame802154_aux_hdr_t aux_hdr;  /**< Aux security header */
+       //uint8_t *payload;               /**< Pointer to 802.15.4 payload */
+       int payload_len;                /**< Length of payload field */
+};
+typedef struct frame802154 frame802154_t;
+
+/* Prototypes */
+
+int frame802154_hdrlen(frame802154_t *p);
+int frame802154_create(frame802154_t *p, uint8_t *buf);
+int frame802154_parse(uint8_t *data, int len, frame802154_t *pf, uint8_t **payload);
+
+/** @} */
+#endif /* FRAME_802154_H */
+/** @} */
+/** @} */
index 79921cbdba6301ca2663557d02a3f03e7b888f0a..dcb728807e59aa1b923b92907331df85fd66059d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <net/if_ppp.h>
 #include <net/ethernet.h>
 #include <net/network_agent.h>
+#include <net/pktsched/pktsched_netem.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/dlil.h>
 #include <security/mac_framework.h>
 #endif
 
+
 #include <os/log.h>
 
 /*
@@ -247,6 +249,14 @@ static uint32_t if_verbose = 0;
 SYSCTL_INT(_net_link_generic_system, OID_AUTO, if_verbose,
     CTLFLAG_RW | CTLFLAG_LOCKED, &if_verbose, 0, "");
 
+#if (DEBUG || DEVELOPMENT)
+static uint32_t default_tcp_kao_max = 0;
+SYSCTL_INT(_net_link_generic_system, OID_AUTO, default_tcp_kao_max,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &default_tcp_kao_max, 0, "");
+#else
+static const uint32_t default_tcp_kao_max = 0;
+#endif /* (DEBUG || DEVELOPMENT) */
+
 boolean_t intcoproc_unrestricted;
 
 /* Eventhandler context for interface events */
@@ -393,7 +403,7 @@ if_detach_ifa_common(struct ifnet *ifp, struct ifaddr *ifa, int link)
                panic("%s: unexpected (missing) refcnt ifa=%p", __func__, ifa);
                /* NOTREACHED */
        }
-       ifa->ifa_debug &= ~(IFD_ATTACHED | IFD_DETACHING);
+       ifa->ifa_debug &= ~IFD_ATTACHED;
 
        if (ifa->ifa_detached != NULL) {
                (*ifa->ifa_detached)(ifa);
@@ -795,11 +805,15 @@ u_int32_t
 if_functional_type(struct ifnet *ifp, bool exclude_delegate)
 {
        u_int32_t ret = IFRTYPE_FUNCTIONAL_UNKNOWN;
+
        if (ifp != NULL) {
                if (ifp->if_flags & IFF_LOOPBACK) {
                        ret = IFRTYPE_FUNCTIONAL_LOOPBACK;
+               } else if (IFNET_IS_COMPANION_LINK(ifp)) {
+                       ret = IFRTYPE_FUNCTIONAL_COMPANIONLINK;
                } else if ((exclude_delegate &&
-                   (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI)) ||
+                   (ifp->if_family == IFNET_FAMILY_ETHERNET &&
+                   ifp->if_subfamily == IFNET_SUBFAMILY_WIFI)) ||
                    (!exclude_delegate && IFNET_IS_WIFI(ifp))) {
                        if (ifp->if_eflags & IFEF_AWDL) {
                                ret = IFRTYPE_FUNCTIONAL_WIFI_AWDL;
@@ -1806,13 +1820,23 @@ ifioctl_linkparams(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p)
 
        switch (cmd) {
        case SIOCSIFLINKPARAMS: {               /* struct if_linkparamsreq */
-               struct tb_profile tb = { 0, 0, 0 };
+               struct tb_profile tb = { .rate = 0, .percent = 0, .depth = 0 };
 
                if ((error = proc_suser(p)) != 0) {
                        break;
                }
 
 
+               char netem_name[32];
+               (void) snprintf(netem_name, sizeof(netem_name),
+                   "if_output_netem_%s", if_name(ifp));
+               error = netem_config(&ifp->if_output_netem, netem_name,
+                   &iflpr->iflpr_output_netem, (void *)ifp,
+                   ifnet_enqueue_netem, NETEM_MAX_BATCH_SIZE);
+               if (error != 0) {
+                       break;
+               }
+
                IFCQ_LOCK(ifq);
                if (!IFCQ_IS_READY(ifq)) {
                        error = ENXIO;
@@ -1864,6 +1888,12 @@ ifioctl_linkparams(struct ifnet *ifp, u_long cmd, caddr_t data, struct proc *p)
                    sizeof(iflpr->iflpr_output_lt));
                bcopy(&ifp->if_input_lt, &iflpr->iflpr_input_lt,
                    sizeof(iflpr->iflpr_input_lt));
+
+               if (ifp->if_output_netem != NULL) {
+                       netem_get_params(ifp->if_output_netem,
+                           &iflpr->iflpr_output_netem);
+               }
+
                break;
        }
 
@@ -1980,9 +2010,11 @@ ifioctl_getnetagents(struct ifnet *ifp, u_int32_t *count, user_addr_t uuid_p)
 
 #define IF_MAXAGENTS            64
 #define IF_AGENT_INCREMENT      8
-static int
+int
 if_add_netagent_locked(struct ifnet *ifp, uuid_t new_agent_uuid)
 {
+       VERIFY(ifp != NULL);
+
        uuid_t *first_empty_slot = NULL;
        u_int32_t index = 0;
        bool already_added = FALSE;
@@ -2290,14 +2322,10 @@ if_set_qosmarking_mode(struct ifnet *ifp, u_int32_t mode)
        switch (mode) {
        case IFRTYPE_QOSMARKING_MODE_NONE:
                ifp->if_qosmarking_mode = IFRTYPE_QOSMARKING_MODE_NONE;
-               ifp->if_eflags &= ~IFEF_QOSMARKING_CAPABLE;
                break;
        case IFRTYPE_QOSMARKING_FASTLANE:
-               ifp->if_qosmarking_mode = IFRTYPE_QOSMARKING_FASTLANE;
-               ifp->if_eflags |= IFEF_QOSMARKING_CAPABLE;
-               if (net_qos_policy_capable_enabled != 0) {
-                       ifp->if_eflags |= IFEF_QOSMARKING_ENABLED;
-               }
+       case IFRTYPE_QOSMARKING_RFC4594:
+               ifp->if_qosmarking_mode = mode;
                break;
        default:
                error = EINVAL;
@@ -2305,7 +2333,7 @@ if_set_qosmarking_mode(struct ifnet *ifp, u_int32_t mode)
        }
        if (error == 0 && old_mode != ifp->if_qosmarking_mode) {
                dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_QOS_MODE_CHANGED,
-                   NULL, sizeof(struct kev_dl_rrc_state));
+                   NULL, 0);
        }
        return error;
 }
@@ -2360,10 +2388,12 @@ ifioctl_iforder(u_long cmd, caddr_t data)
                        if (found_duplicate) {
                                break;
                        }
-               }
-
-               error = ifnet_reset_order(ordered_indices, ifo->ifo_count);
 
+                       error = ifnet_reset_order(ordered_indices, ifo->ifo_count);
+               } else {
+                       // Clear the list
+                       error = ifnet_reset_order(NULL, 0);
+               }
                break;
        }
 
@@ -2640,6 +2670,7 @@ ifioctl_restrict_intcoproc(unsigned long cmd, const char *ifname,
        case SIOCGIFNETMASK_IN6:
        case SIOCGIFPROTOLIST32:
        case SIOCGIFPROTOLIST64:
+       case SIOCGIFXFLAGS:
                return false;
        default:
 #if (DEBUG || DEVELOPMENT)
@@ -2900,6 +2931,15 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
        case SIOCGIFLOWINTERNET:                /* struct ifreq */
        case SIOCGIFLOWPOWER:                   /* struct ifreq */
        case SIOCSIFLOWPOWER:                   /* struct ifreq */
+       case SIOCSIF6LOWPAN:                    /* struct ifreq */
+       case SIOCGIF6LOWPAN:                    /* struct ifreq */
+       case SIOCGIFMPKLOG:                     /* struct ifreq */
+       case SIOCSIFMPKLOG:                     /* struct ifreq */
+       case SIOCGIFCONSTRAINED:                /* struct ifreq */
+       case SIOCSIFCONSTRAINED:                /* struct ifreq */
+       case SIOCGIFXFLAGS:                     /* struct ifreq */
+       case SIOCGIFNOACKPRIO:                  /* struct ifreq */
+       case SIOCSIFNOACKPRIO:                  /* struct ifreq */
        {                       /* struct ifreq */
                struct ifreq ifr;
                bcopy(data, &ifr, sizeof(ifr));
@@ -2924,20 +2964,20 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
        case SIOCSIFPHYADDR:                    /* struct {if,in_}aliasreq */
                bcopy(((struct in_aliasreq *)(void *)data)->ifra_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
 #if INET6
        case SIOCSIFPHYADDR_IN6_32:             /* struct in6_aliasreq_32 */
                bcopy(((struct in6_aliasreq_32 *)(void *)data)->ifra_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
        case SIOCSIFPHYADDR_IN6_64:             /* struct in6_aliasreq_64 */
                bcopy(((struct in6_aliasreq_64 *)(void *)data)->ifra_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 #endif /* INET6 */
 
@@ -2951,48 +2991,48 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
                bcopy(data, ifs, sizeof(*ifs));
                ifs->ifs_name[IFNAMSIZ - 1] = '\0';
                bcopy(ifs->ifs_name, ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
        case SIOCGIFMEDIA32:                    /* struct ifmediareq32 */
        case SIOCGIFXMEDIA32:                    /* struct ifmediareq32 */
                bcopy(((struct ifmediareq32 *)(void *)data)->ifm_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
        case SIOCGIFMEDIA64:                    /* struct ifmediareq64 */
        case SIOCGIFXMEDIA64:                    /* struct ifmediareq64 */
                bcopy(((struct ifmediareq64 *)(void *)data)->ifm_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
        case SIOCSIFDESC:                       /* struct if_descreq */
        case SIOCGIFDESC:                       /* struct if_descreq */
                bcopy(((struct if_descreq *)(void *)data)->ifdr_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
        case SIOCSIFLINKPARAMS:                 /* struct if_linkparamsreq */
        case SIOCGIFLINKPARAMS:                 /* struct if_linkparamsreq */
                bcopy(((struct if_linkparamsreq *)(void *)data)->iflpr_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
        case SIOCGIFQUEUESTATS:                 /* struct if_qstatsreq */
                bcopy(((struct if_qstatsreq *)(void *)data)->ifqr_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
        case SIOCSIFTHROTTLE:                   /* struct if_throttlereq */
        case SIOCGIFTHROTTLE:                   /* struct if_throttlereq */
                bcopy(((struct if_throttlereq *)(void *)data)->ifthr_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
        case SIOCAIFAGENTID:                    /* struct if_agentidreq */
@@ -3001,21 +3041,21 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
        case SIOCGIFAGENTIDS64:         /* struct if_agentidsreq64 */
                bcopy(((struct if_agentidreq *)(void *)data)->ifar_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
        case SIOCSIFNETSIGNATURE:               /* struct if_nsreq */
        case SIOCGIFNETSIGNATURE:               /* struct if_nsreq */
                bcopy(((struct if_nsreq *)(void *)data)->ifnsr_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
 
        case SIOCGIFPROTOLIST32:                /* struct if_protolistreq32 */
        case SIOCGIFPROTOLIST64:                /* struct if_protolistreq64 */
                bcopy(((struct if_protolistreq *)(void *)data)->ifpl_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
        default:
                /*
@@ -3024,7 +3064,7 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
                 */
                bcopy(((struct ifreq *)(void *)data)->ifr_name,
                    ifname, IFNAMSIZ);
-               ifp = ifunit(ifname);
+               ifp = ifunit_ref(ifname);
                break;
        }
        dlil_if_unlock();
@@ -3066,8 +3106,8 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
                bcopy(ifs, data, sizeof(*ifs));
                break;
 
-       case SIOCGIFMEDIA32:                   /* struct ifmediareq32 */
-       case SIOCGIFMEDIA64:                   /* struct ifmediareq64 */
+       case SIOCGIFMEDIA32:                    /* struct ifmediareq32 */
+       case SIOCGIFMEDIA64:                    /* struct ifmediareq64 */
        case SIOCGIFXMEDIA32:                    /* struct ifmediareq32 */
        case SIOCGIFXMEDIA64:                    /* struct ifmediareq64 */
                error = ifioctl_get_media(ifp, so, cmd, data);
@@ -3171,6 +3211,9 @@ done:
                }
        }
 
+       if (ifp != NULL) {
+               ifnet_decr_iorefcnt(ifp);
+       }
        return error;
 }
 
@@ -3228,6 +3271,12 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p)
                ifnet_lock_done(ifp);
                break;
 
+       case SIOCGIFXFLAGS:
+               ifnet_lock_shared(ifp);
+               ifr->ifr_xflags = ifp->if_xflags;
+               ifnet_lock_done(ifp);
+               break;
+
        case SIOCGIFCAP:
                ifnet_lock_shared(ifp);
                ifr->ifr_reqcap = ifp->if_capabilities;
@@ -3495,6 +3544,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p)
        case SIOCSIFALTMTU:
        case SIOCSIFVLAN:
        case SIOCSIFBOND:
+       case SIOCSIF6LOWPAN:
                error = proc_suser(p);
                if (error != 0) {
                        break;
@@ -3545,6 +3595,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p)
        case SIOCGIFDEVMTU:
        case SIOCGIFVLAN:
        case SIOCGIFBOND:
+       case SIOCGIF6LOWPAN:
                error = ifnet_ioctl(ifp, SOCK_DOM(so), cmd, (caddr_t)ifr);
                break;
 
@@ -3617,6 +3668,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p)
                } else {
                        ifp->if_eflags &= ~IFEF_EXPENSIVE;
                }
+               ifnet_increment_generation(ifp);
                ifnet_lock_done(ifp);
                /*
                 * Update the expensive bit in the delegated interface
@@ -3628,10 +3680,57 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p)
                        if (difp->if_delegated.ifp == ifp) {
                                difp->if_delegated.expensive =
                                    ifp->if_eflags & IFEF_EXPENSIVE ? 1 : 0;
+                               ifnet_increment_generation(difp);
                        }
                        ifnet_lock_done(difp);
                }
                ifnet_head_done();
+               necp_update_all_clients();
+               break;
+       }
+
+       case SIOCGIFCONSTRAINED:
+               ifnet_lock_shared(ifp);
+               if (ifp->if_xflags & IFXF_CONSTRAINED) {
+                       ifr->ifr_constrained = 1;
+               } else {
+                       ifr->ifr_constrained = 0;
+               }
+               ifnet_lock_done(ifp);
+               break;
+
+       case SIOCSIFCONSTRAINED:
+       {
+               struct ifnet *difp;
+
+               if ((error = priv_check_cred(kauth_cred_get(),
+                   PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
+                       return error;
+               }
+               ifnet_lock_exclusive(ifp);
+               if (ifr->ifr_constrained) {
+                       ifp->if_xflags |= IFXF_CONSTRAINED;
+               } else {
+                       ifp->if_xflags &= ~IFXF_CONSTRAINED;
+               }
+               ifnet_increment_generation(ifp);
+               ifnet_lock_done(ifp);
+               /*
+                * Update the constrained bit in the delegated interface
+                * structure.
+                */
+               ifnet_head_lock_shared();
+               TAILQ_FOREACH(difp, &ifnet_head, if_link) {
+                       ifnet_lock_exclusive(difp);
+                       if (difp->if_delegated.ifp == ifp) {
+                               difp->if_delegated.constrained =
+                                   ifp->if_xflags & IFXF_CONSTRAINED ? 1 : 0;
+                               ifnet_increment_generation(difp);
+                       }
+                       ifnet_lock_done(difp);
+               }
+               ifnet_head_done();
+               necp_update_all_clients();
                break;
        }
 
@@ -3794,6 +3893,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p)
                        error = EINVAL;
                }
                break;
+
        case SIOCSIFTIMESTAMPENABLE:
        case SIOCSIFTIMESTAMPDISABLE:
                error = proc_suser(p);
@@ -3875,6 +3975,15 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p)
                error = EINVAL;
 #endif /* (DEBUG || DEVELOPMENT) */
                break;
+
+       case SIOCSIFSUBFAMILY:
+               if ((error = priv_check_cred(kauth_cred_get(),
+                   PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
+                       return error;
+               }
+               error = ifnet_ioctl(ifp, SOCK_DOM(so), cmd, (caddr_t)ifr);
+               break;
+
        case SIOCSIFLOWINTERNET:
                if ((error = priv_check_cred(kauth_cred_get(),
                    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
@@ -3918,6 +4027,41 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p)
                error = EOPNOTSUPP;
 #endif /* DEVELOPMENT || DEBUG */
                break;
+
+       case SIOCGIFMPKLOG:
+               ifr->ifr_mpk_log = !!(ifp->if_xflags & IFXF_MPK_LOG);
+               break;
+       case SIOCSIFMPKLOG:
+               if (ifr->ifr_mpk_log) {
+                       ifp->if_xflags |= IFXF_MPK_LOG;
+               } else {
+                       ifp->if_xflags &= ~IFXF_MPK_LOG;
+               }
+               break;
+       case SIOCGIFNOACKPRIO:
+               ifnet_lock_shared(ifp);
+               if (ifp->if_eflags & IFEF_NOACKPRI) {
+                       ifr->ifr_noack_prio = 1;
+               } else {
+                       ifr->ifr_noack_prio = 0;
+               }
+               ifnet_lock_done(ifp);
+               break;
+
+       case SIOCSIFNOACKPRIO:
+               if ((error = priv_check_cred(kauth_cred_get(),
+                   PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
+                       return error;
+               }
+               ifnet_lock_exclusive(ifp);
+               if (ifr->ifr_noack_prio) {
+                       ifp->if_eflags |= IFEF_NOACKPRI;
+               } else {
+                       ifp->if_eflags &= ~IFEF_NOACKPRI;
+               }
+               ifnet_lock_done(ifp);
+               break;
+
        default:
                VERIFY(0);
                /* NOTREACHED */
@@ -5216,15 +5360,18 @@ if_copy_rxpoll_stats(struct ifnet *ifp, struct if_rxpoll_stats *if_rs)
        if (!(ifp->if_eflags & IFEF_RXPOLL) || !ifnet_is_attached(ifp, 1)) {
                return;
        }
-
-       /* by now, ifnet will stay attached so if_inp must be valid */
-       VERIFY(ifp->if_inp != NULL);
-       bcopy(&ifp->if_inp->pstats, if_rs, sizeof(*if_rs));
-
+       bcopy(&ifp->if_poll_pstats, if_rs, sizeof(*if_rs));
        /* Release the IO refcnt */
        ifnet_decr_iorefcnt(ifp);
 }
 
+void
+if_copy_netif_stats(struct ifnet *ifp, struct if_netif_stats *if_ns)
+{
+       bzero(if_ns, sizeof(*if_ns));
+#pragma unused(ifp)
+}
+
 struct ifaddr *
 ifa_remref(struct ifaddr *ifa, int locked)
 {
@@ -5544,6 +5691,8 @@ ifioctl_cassert(void)
 
        case SIOCSIFDISABLEOUTPUT:
 
+       case SIOCSIFSUBFAMILY:
+
        case SIOCGIFAGENTLIST32:
        case SIOCGIFAGENTLIST64:
 
@@ -5560,8 +5709,22 @@ ifioctl_cassert(void)
        case SIOCGIFPROTOLIST32:
        case SIOCGIFPROTOLIST64:
 
+       case SIOCSIF6LOWPAN:
+       case SIOCGIF6LOWPAN:
+
        case SIOCGIFLOWPOWER:
        case SIOCSIFLOWPOWER:
+
+       case SIOCGIFMPKLOG:
+       case SIOCSIFMPKLOG:
+
+       case SIOCGIFCONSTRAINED:
+       case SIOCSIFCONSTRAINED:
+
+       case SIOCGIFXFLAGS:
+
+       case SIOCGIFNOACKPRIO:
+       case SIOCSIFNOACKPRIO:
                ;
        }
 }
@@ -5617,3 +5780,25 @@ intf_event_enqueue_nwk_wq_entry(struct ifnet *ifp, struct sockaddr *addrp,
        p_intf_ev->nwk_wqe.arg = &p_intf_ev->intf_ev_arg;
        nwk_wq_enqueue((struct nwk_wq_entry*)p_intf_ev);
 }
+
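+/*
+ * Lazily cache the interface's TCP keep-alive offload maximum in
+ * if_tcp_kao_max: query the driver once via SIOCGIFTCPKAOMAX and fall back
+ * to default_tcp_kao_max when the driver does not support the ioctl.
+ * Returns 0 when a cached value is already present, otherwise the result
+ * of the driver ioctl.
+ */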
+int
+if_get_tcp_kao_max(struct ifnet *ifp)
+{
+       int error = 0;
+
+       if (ifp->if_tcp_kao_max == 0) {
+               struct ifreq ifr;
+
+               memset(&ifr, 0, sizeof(struct ifreq));
+               error = ifnet_ioctl(ifp, 0, SIOCGIFTCPKAOMAX, &ifr);
+
+               ifnet_lock_exclusive(ifp);
+               if (error == 0) {
+                       ifp->if_tcp_kao_max = ifr.ifr_tcp_kao_max;
+               } else if (error == EOPNOTSUPP) {
+                       ifp->if_tcp_kao_max = default_tcp_kao_max;
+               }
+               ifnet_lock_done(ifp);
+       }
+       return error;
+}
index 4189a82cc6d64226e89d753646177f52ce97ddd9..0e516a42e64e7717a602af82f7c21f7b03584eb7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -129,7 +129,7 @@ struct if_clonereq32 {
 #define IFEF_ENQUEUE_MULTI      0x00000002      /* enqueue multiple packets at once */
 #define IFEF_DELAY_START        0x00000004      /* delay start callback */
 #define IFEF_PROBE_CONNECTIVITY 0x00000008      /* Probe connections going over this interface */
-#define IFEF_QOSMARKING_CAPABLE 0x00000010      /* XXX Obsolete, to be removed */
+#define IFEF_ADV_REPORT         0x00000010      /* Supports interface advisory report */
 #define IFEF_IPV6_DISABLED      0x00000020      /* coupled to ND6_IFF_IFDISABLED */
 #define IFEF_ACCEPT_RTADV       0x00000040      /* accepts IPv6 RA on the interface */
 #define IFEF_TXSTART            0x00000080      /* has start callback */
@@ -178,12 +178,14 @@ struct if_clonereq32 {
 #define IFXF_WAKE_ON_MAGIC_PACKET       0x00000001 /* wake on magic packet */
 #define IFXF_TIMESTAMP_ENABLED          0x00000002 /* time stamping enabled */
 #define IFXF_NX_NOAUTO                  0x00000004 /* no auto config nexus */
-#define IFXF_MULTISTACK_BPF_TAP         0x00000008 /* multistack bpf tap */
+#define IFXF_LEGACY                     0x00000008 /* legacy (non-netif) mode */
 #define IFXF_LOW_INTERNET_UL            0x00000010 /* Uplink Low Internet is confirmed */
 #define IFXF_LOW_INTERNET_DL            0x00000020 /* Downlink Low Internet is confirmed */
 #define IFXF_ALLOC_KPI                  0x00000040 /* Allocated via the ifnet_alloc KPI */
 #define IFXF_LOW_POWER                  0x00000080 /* Low Power Mode */
-
+#define IFXF_MPK_LOG                    0x00000100 /* Multi-layer Packet Logging */
+#define IFXF_CONSTRAINED                0x00000200 /* Constrained - Save Data Mode */
+#define IFXF_LOW_LATENCY                0x00000400 /* Low latency interface */
 /*
  * Current requirements for an AWDL interface.  Setting/clearing IFEF_AWDL
  * will also trigger the setting/clearing of the rest of the flags.  Once
@@ -433,6 +435,7 @@ struct  ifreq {
                        uint32_t        ifo_inuse;
                } ifru_opportunistic;
                u_int64_t ifru_eflags;
+               u_int64_t ifru_xflags;
                struct {
                        int32_t         ifl_level;
                        uint32_t        ifl_flags;
@@ -466,6 +469,9 @@ struct  ifreq {
 #define IFRTYPE_FAMILY_FIREWIRE         13
 #define IFRTYPE_FAMILY_BOND             14
 #define IFRTYPE_FAMILY_CELLULAR         15
+#define IFRTYPE_FAMILY_6LOWPAN          16
+#define IFRTYPE_FAMILY_UTUN             17
+#define IFRTYPE_FAMILY_IPSEC            18
                        uint32_t        ift_subfamily;
 #define IFRTYPE_SUBFAMILY_ANY           0
 #define IFRTYPE_SUBFAMILY_USB           1
@@ -474,19 +480,23 @@ struct  ifreq {
 #define IFRTYPE_SUBFAMILY_THUNDERBOLT   4
 #define IFRTYPE_SUBFAMILY_RESERVED      5
 #define IFRTYPE_SUBFAMILY_INTCOPROC     6
+#define IFRTYPE_SUBFAMILY_QUICKRELAY    7
+#define IFRTYPE_SUBFAMILY_DEFAULT       8
                } ifru_type;
 #endif /* PRIVATE */
                u_int32_t ifru_functional_type;
-#define IFRTYPE_FUNCTIONAL_UNKNOWN      0
-#define IFRTYPE_FUNCTIONAL_LOOPBACK     1
-#define IFRTYPE_FUNCTIONAL_WIRED        2
-#define IFRTYPE_FUNCTIONAL_WIFI_INFRA   3
-#define IFRTYPE_FUNCTIONAL_WIFI_AWDL    4
-#define IFRTYPE_FUNCTIONAL_CELLULAR     5
-#define IFRTYPE_FUNCTIONAL_INTCOPROC    6
-#define IFRTYPE_FUNCTIONAL_LAST         6
+#define IFRTYPE_FUNCTIONAL_UNKNOWN              0
+#define IFRTYPE_FUNCTIONAL_LOOPBACK             1
+#define IFRTYPE_FUNCTIONAL_WIRED                2
+#define IFRTYPE_FUNCTIONAL_WIFI_INFRA           3
+#define IFRTYPE_FUNCTIONAL_WIFI_AWDL            4
+#define IFRTYPE_FUNCTIONAL_CELLULAR             5
+#define IFRTYPE_FUNCTIONAL_INTCOPROC            6
+#define IFRTYPE_FUNCTIONAL_COMPANIONLINK        7
+#define IFRTYPE_FUNCTIONAL_LAST                 7
 #ifdef PRIVATE
                u_int32_t ifru_expensive;
+               u_int32_t ifru_constrained;
                u_int32_t ifru_2kcl;
                struct {
                        u_int32_t qlen;
@@ -500,7 +510,8 @@ struct  ifreq {
 #define IFRTYPE_ECN_DISABLE             2
                u_int32_t ifru_qosmarking_mode;
 #define IFRTYPE_QOSMARKING_MODE_NONE            0
-#define IFRTYPE_QOSMARKING_FASTLANE     1
+#define IFRTYPE_QOSMARKING_FASTLANE     1       /* supported: socket/channel */
+#define IFRTYPE_QOSMARKING_RFC4594      2       /* supported: channel only */
                u_int32_t ifru_qosmarking_enabled;
                u_int32_t ifru_disable_output;
                u_int32_t ifru_low_internet;
@@ -508,6 +519,9 @@ struct  ifreq {
 #define IFRTYPE_LOW_INTERNET_ENABLE_UL          0x0001
 #define IFRTYPE_LOW_INTERNET_ENABLE_DL          0x0002
                int ifru_low_power_mode;
+               u_int32_t ifru_tcp_kao_max;
+               int ifru_mpk_log;        /* Multi Layer Packet Log */
+               u_int32_t ifru_noack_prio;
 #endif /* PRIVATE */
        } ifr_ifru;
 #define ifr_addr        ifr_ifru.ifru_addr      /* address */
@@ -540,9 +554,11 @@ struct  ifreq {
 #ifdef PRIVATE
 #define ifr_opportunistic       ifr_ifru.ifru_opportunistic
 #define ifr_eflags      ifr_ifru.ifru_eflags    /* extended flags  */
+#define ifr_xflags      ifr_ifru.ifru_xflags    /* extra flags  */
 #define ifr_log         ifr_ifru.ifru_log       /* logging level/flags */
 #define ifr_delegated   ifr_ifru.ifru_delegated /* delegated interface index */
 #define ifr_expensive   ifr_ifru.ifru_expensive
+#define ifr_constrained   ifr_ifru.ifru_constrained
 #define ifr_type        ifr_ifru.ifru_type      /* interface type */
 #define ifr_functional_type     ifr_ifru.ifru_functional_type
 #define ifr_2kcl        ifr_ifru.ifru_2kcl
@@ -558,6 +574,9 @@ struct  ifreq {
 #define ifr_disable_output      ifr_ifru.ifru_disable_output
 #define ifr_low_internet        ifr_ifru.ifru_low_internet
 #define ifr_low_power_mode      ifr_ifru.ifru_low_power_mode
+#define ifr_tcp_kao_max         ifr_ifru.ifru_tcp_kao_max
+#define ifr_mpk_log             ifr_ifru.ifru_mpk_log
+#define ifr_noack_prio          ifr_ifru.ifru_noack_prio
 
 #endif /* PRIVATE */
 };
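+/*
+ * Illustrative userspace sketch (assumes a build where these private
+ * ioctls and the PRIVATE ifreq fields are visible, and a caller with the
+ * required privilege): reading the constrained bit follows the usual
+ * ifreq pattern, as does SIOCGIFXFLAGS for the raw if_xflags word.
+ */
+#if 0   /* example only */
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <string.h>
+#include <unistd.h>
+
+static int
+if_is_constrained_example(const char *name)
+{
+       struct ifreq ifr;
+       int s = socket(AF_INET, SOCK_DGRAM, 0);
+
+       if (s < 0) {
+               return -1;
+       }
+       memset(&ifr, 0, sizeof(ifr));
+       strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
+       if (ioctl(s, SIOCGIFCONSTRAINED, &ifr) < 0) {
+               close(s);
+               return -1;
+       }
+       close(s);
+       return ifr.ifr_constrained != 0;
+}
+#endif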
@@ -794,6 +813,8 @@ struct if_linkparamsreq {
        struct if_bandwidths iflpr_input_bw;
        struct if_latencies iflpr_output_lt;
        struct if_latencies iflpr_input_lt;
+       struct if_netem_params iflpr_input_netem;
+       struct if_netem_params iflpr_output_netem;
 };
 
 /*
@@ -912,7 +933,7 @@ struct if_nexusreq {
        char            ifnr_name[IFNAMSIZ];    /* interface name */
        uint64_t        ifnr_flags;             /* unused, must be zero */
        uuid_t          ifnr_netif;             /* netif nexus instance UUID */
-       uuid_t          ifnr_multistack;        /* multistack nexus UUID */
+       uuid_t          ifnr_flowswitch;        /* flowswitch nexus UUID */
        uint64_t        ifnr_reserved[5];
 };
 
diff --git a/bsd/net/if_6lowpan.c b/bsd/net/if_6lowpan.c
new file mode 100644 (file)
index 0000000..92eebb0
--- /dev/null
@@ -0,0 +1,1095 @@
+/*
+ * Copyright (c) 2017-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright 1998 Massachusetts Institute of Technology
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation for any purpose and without fee is hereby
+ * granted, provided that both the above copyright notice and this
+ * permission notice appear in all copies, that both the above
+ * copyright notice and this permission notice appear in all
+ * supporting documentation, and that the name of M.I.T. not be used
+ * in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission.  M.I.T. makes
+ * no representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied
+ * warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
+ * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
+ * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * if_6lowpan.c - pseudo-device driver for IEEE 802.15.4.
+ */
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/kern_event.h>
+#include <sys/mcache.h>
+
+#include <net/bpf.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/if_dl.h>
+#include <net/if_ether.h>
+#include <net/if_types.h>
+#include <net/if_6lowpan_var.h>
+#include <net/frame802154.h>
+#include <net/sixxlowpan.h>
+#include <libkern/OSAtomic.h>
+
+#include <net/dlil.h>
+
+#include <net/kpi_interface.h>
+#include <net/kpi_protocol.h>
+
+#include <kern/locks.h>
+
+#ifdef INET
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+#endif
+
+#include <net/if_media.h>
+#include <net/multicast_list.h>
+#include <net/ether_if_module.h>
+
+#define SIXLOWPANNAME   "6lowpan"
+
+struct ifnet *p_6lowpan_ifnet = NULL;
+
+extern errno_t nd6_lookup_ipv6(ifnet_t interface,
+    const struct sockaddr_in6 *ip6_dest, struct sockaddr_dl *ll_dest,
+    size_t ll_dest_len, route_t hint, mbuf_t packet);
+
+
+typedef int (bpf_callback_func)(struct ifnet *, struct mbuf *);
+typedef int (if_set_bpf_tap_func)(struct ifnet *ifp, int mode, bpf_callback_func * func);
+
+static __inline__ lck_grp_t *
+my_lck_grp_alloc_init(const char * grp_name)
+{
+       lck_grp_t *             grp;
+       lck_grp_attr_t *        grp_attrs;
+
+       grp_attrs = lck_grp_attr_alloc_init();
+       grp = lck_grp_alloc_init(grp_name, grp_attrs);
+       lck_grp_attr_free(grp_attrs);
+       return grp;
+}
+
+static __inline__ lck_mtx_t *
+my_lck_mtx_alloc_init(lck_grp_t * lck_grp)
+{
+       lck_attr_t *    lck_attrs;
+       lck_mtx_t *             lck_mtx;
+
+       lck_attrs = lck_attr_alloc_init();
+       lck_mtx = lck_mtx_alloc_init(lck_grp, lck_attrs);
+       lck_attr_free(lck_attrs);
+       return lck_mtx;
+}
+
+static lck_mtx_t *sixlowpan_lck_mtx;
+
+static __inline__ void
+sixlowpan_lock_init(void)
+{
+       lck_grp_t *lck_grp;
+
+       lck_grp = my_lck_grp_alloc_init("if_6lowpan");
+       sixlowpan_lck_mtx = my_lck_mtx_alloc_init(lck_grp);
+}
+
+static __inline__ void
+sixlowpan_assert_lock_held(void)
+{
+       lck_mtx_assert(sixlowpan_lck_mtx, LCK_MTX_ASSERT_OWNED);
+       return;
+}
+
+#ifdef __UNUSED__
+static __inline__ void
+sixlowpan_assert_lock_not_held(void)
+{
+       lck_mtx_assert(sixlowpan_lck_mtx, LCK_MTX_ASSERT_NOTOWNED);
+       return;
+}
+#endif
+
+static __inline__ void
+sixlowpan_lock(void)
+{
+       lck_mtx_lock(sixlowpan_lck_mtx);
+       return;
+}
+
+static __inline__ void
+sixlowpan_unlock(void)
+{
+       lck_mtx_unlock(sixlowpan_lck_mtx);
+       return;
+}
+
+struct if6lpan;
+LIST_HEAD(if6lpan_list, if6lpan);
+
+typedef LIST_ENTRY(if6lpan) if6lpan_entry;
+
+#define IF6LPAN_SIGNATURE       0x6666face
+struct if6lpan {
+       if6lpan_entry           if6lpan_list;
+       char                    if6lpan_name[IFNAMSIZ]; /* our unique id */
+       char                    if6lpan_addr[IEEE802154_ADDR_LEN]; /* our LL address */
+       struct ifnet *          if6lpan_ifp;    /* our interface */
+       struct ifnet *          if6lpan_pifp;   /* parent interface */
+#define IF6LPANF_DETACHING      0x1             /* interface is detaching */
+#define IF6LPANF_READY          0x2             /* interface is ready */
+       u_int32_t               if6lpan_flags;
+       bpf_packet_func         if6lpan_bpf_input;
+       bpf_packet_func         if6lpan_bpf_output;
+       int32_t                 if6lpan_retain_count;
+       u_int32_t               if6lpan_signature;      /* IF6LPAN_SIGNATURE */
+       u_int8_t                if6lpan_ieee802154_seq;
+};
+
+typedef struct if6lpan * if6lpan_ref;
+
+static __inline__ int
+if6lpan_flags_ready(if6lpan_ref ifl)
+{
+       return (ifl->if6lpan_flags & IF6LPANF_READY) != 0;
+}
+
+static __inline__ void
+if6lpan_flags_set_ready(if6lpan_ref ifl)
+{
+       ifl->if6lpan_flags |= IF6LPANF_READY;
+       return;
+}
+
+static __inline__ void
+if6lpan_set_addr(if6lpan_ref ifl, caddr_t ether_addr)
+{
+       /* Form the 8-byte link-layer address by prefixing the parent's
+        * 6-byte Ethernet address with 0x66 0x66. */
+       ifl->if6lpan_addr[0] = 0x66;
+       ifl->if6lpan_addr[1] = 0x66;
+       bcopy(ether_addr, &ifl->if6lpan_addr[2], ETHER_ADDR_LEN);
+       return;
+}
+
+#ifdef __UNUSED__
+static __inline__ u_int8_t*
+if6lpan_get_addr(if6lpan_ref ifl)
+{
+       return (u_int8_t *)ifl->if6lpan_addr;
+}
+#endif
+
+static __inline__ int
+if6lpan_flags_detaching(if6lpan_ref ifl)
+{
+       return (ifl->if6lpan_flags & IF6LPANF_DETACHING) != 0;
+}
+
+static __inline__ void
+if6lpan_flags_set_detaching(if6lpan_ref ifl)
+{
+       ifl->if6lpan_flags |= IF6LPANF_DETACHING;
+       return;
+}
+
+static  int sixlowpan_clone_create(struct if_clone *, u_int32_t, void *);
+static  int sixlowpan_clone_destroy(struct ifnet *);
+static  int sixlowpan_input(ifnet_t ifp, protocol_family_t protocol,
+    mbuf_t m, char *frame_header);
+static  int sixlowpan_output(struct ifnet *ifp, struct mbuf *m);
+static  int sixlowpan_ioctl(ifnet_t ifp, u_long cmd, void *addr);
+static  int sixlowpan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode,
+    bpf_packet_func func);
+static  int sixlowpan_attach_protocol(struct ifnet *ifp);
+static  int sixlowpan_detach_protocol(struct ifnet *ifp);
+static  int sixlowpan_unconfig(if6lpan_ref ifl);
+static  int sixlowpan_config(struct ifnet *ifp, struct ifnet *p);
+static  void sixlowpan_if_free(struct ifnet *ifp);
+static  int sixlowpan_remove(if6lpan_ref ifl);
+static  int sixlowpan_framer_extended(struct ifnet *ifp, struct mbuf **m,
+    const struct sockaddr *ndest, const char *edst,
+    const char *ether_type, u_int32_t *prepend_len, u_int32_t *postpend_len);
+
+#define SIXLOWPAN_MAXUNIT       IF_MAXUNIT
+#define SIXLOWPAN_ZONE_MAX_ELEM MIN(IFNETS_MAX, SIXLOWPAN_MAXUNIT)
+
+static struct if_clone sixlowpan_cloner = IF_CLONE_INITIALIZER(SIXLOWPANNAME,
+    sixlowpan_clone_create,
+    sixlowpan_clone_destroy,
+    0,
+    SIXLOWPAN_MAXUNIT,
+    SIXLOWPAN_ZONE_MAX_ELEM,
+    sizeof(struct if6lpan));
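+
+/*
+ * Once if_clone_attach() registers this cloner, instances should be
+ * creatable through the standard interface-cloning path (presumably
+ * `ifconfig 6lowpan0 create`, as with other if_clone based drivers).
+ */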
+
+/**
+** if6lpan_ref routines
+**/
+static void
+if6lpan_retain(if6lpan_ref ifl)
+{
+       if (ifl->if6lpan_signature != IF6LPAN_SIGNATURE) {
+               panic("if6lpan_retain: bad signature\n");
+       }
+       if (ifl->if6lpan_retain_count == 0) {
+               panic("if6lpan_retain: retain count is 0\n");
+       }
+       OSIncrementAtomic(&ifl->if6lpan_retain_count);
+}
+
+static void
+if6lpan_release(if6lpan_ref ifl)
+{
+       u_int32_t old_retain_count;
+
+       if (ifl->if6lpan_signature != IF6LPAN_SIGNATURE) {
+               panic("if6lpan_release: bad signature\n");
+       }
+       old_retain_count = OSDecrementAtomic(&ifl->if6lpan_retain_count);
+       switch (old_retain_count) {
+       case 0:
+               panic("if6lpan_release: retain count is 0\n");
+               break;
+       case 1:
+               ifl->if6lpan_signature = 0;
+               if_clone_softc_deallocate(&sixlowpan_cloner, ifl);
+               break;
+       default:
+               break;
+       }
+       return;
+}
+
+static if6lpan_ref
+ifnet_get_if6lpan(struct ifnet * ifp)
+{
+       if6lpan_ref             ifl;
+
+       ifl = (if6lpan_ref)ifnet_softc(ifp);
+       return ifl;
+}
+
+static if6lpan_ref
+ifnet_get_if6lpan_retained(struct ifnet * ifp)
+{
+       if6lpan_ref             ifl;
+
+       ifl = ifnet_get_if6lpan(ifp);
+       if (ifl == NULL) {
+               return NULL;
+       }
+       if (if6lpan_flags_detaching(ifl)) {
+               return NULL;
+       }
+       if6lpan_retain(ifl);
+       return ifl;
+}
+
+static int
+sixlowpan_clone_attach(void)
+{
+       int error;
+
+       error = if_clone_attach(&sixlowpan_cloner);
+       if (error != 0) {
+               return error;
+       }
+       sixlowpan_lock_init();
+       return 0;
+}
+
+static int
+sixlowpan_demux(
+       __unused ifnet_t ifp,
+       __unused mbuf_t m,
+       __unused char *frame_header,
+       protocol_family_t *protocol_family)
+{
+       *protocol_family = PF_INET6;
+       return 0;
+}
+
+static errno_t
+sixlowpan_add_proto(__unused ifnet_t interface, protocol_family_t protocol,
+    __unused const struct ifnet_demux_desc *demux_array,
+    __unused u_int32_t demux_count)
+{
+       if (protocol == PF_INET6) {
+               return 0;
+       }
+       return ENOPROTOOPT;
+}
+
+static errno_t
+sixlowpan_del_proto(__unused ifnet_t interface, __unused protocol_family_t protocol)
+{
+       return 0;
+}
+
+static int
+sixlowpan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
+{
+       int                             error;
+       if6lpan_ref                     ifl;
+       ifnet_t                         ifp;
+       struct ifnet_init_eparams       if_eparams;
+
+       ifl = if_clone_softc_allocate(&sixlowpan_cloner);
+       if (ifl == NULL) {
+               return ENOBUFS;
+       }
+       ifl->if6lpan_retain_count = 1;
+       ifl->if6lpan_signature = IF6LPAN_SIGNATURE;
+
+       /* use the interface name as the unique id for ifp recycle */
+       if ((unsigned int)
+           snprintf(ifl->if6lpan_name, sizeof(ifl->if6lpan_name), "%s%d",
+           ifc->ifc_name, unit) >= sizeof(ifl->if6lpan_name)) {
+               if6lpan_release(ifl);
+               return EINVAL;
+       }
+
+       bzero(&if_eparams, sizeof(if_eparams));
+       if_eparams.ver = IFNET_INIT_CURRENT_VERSION;
+       if_eparams.len = sizeof(if_eparams);
+       if_eparams.flags = IFNET_INIT_LEGACY;
+       if_eparams.uniqueid = ifl->if6lpan_name;
+       if_eparams.uniqueid_len = strlen(ifl->if6lpan_name);
+       if_eparams.name = ifc->ifc_name;
+       if_eparams.unit = unit;
+       if_eparams.family = IFNET_FAMILY_6LOWPAN;
+       if_eparams.type = IFT_6LOWPAN;
+       if_eparams.output = sixlowpan_output;
+       if_eparams.demux = sixlowpan_demux;
+       if_eparams.add_proto = sixlowpan_add_proto;
+       if_eparams.del_proto = sixlowpan_del_proto;
+       if_eparams.framer_extended = sixlowpan_framer_extended;
+       if_eparams.softc = ifl;
+       if_eparams.ioctl = sixlowpan_ioctl;
+       if_eparams.set_bpf_tap = sixlowpan_set_bpf_tap;
+       if_eparams.detach = sixlowpan_if_free;
+       error = ifnet_allocate_extended(&if_eparams, &ifp);
+
+       if (error) {
+               if6lpan_release(ifl);
+               return error;
+       }
+
+       ifnet_set_offload(ifp, 0);
+       ifnet_set_addrlen(ifp, IEEE802154_ADDR_LEN);
+       ifnet_set_baudrate(ifp, 0);
+       // TODO: ifnet_set_hdrlen(ifp, IEEE802154_ENCAP_LEN);
+
+       error = ifnet_attach(ifp, NULL);
+       if (error) {
+               ifnet_release(ifp);
+               if6lpan_release(ifl);
+               return error;
+       }
+       ifl->if6lpan_ifp = ifp;
+
+       p_6lowpan_ifnet = ifp;
+       /* TODO:  attach as IEEE 802.15.4 with no FCS */
+       bpfattach(ifp, DLT_IEEE802_15_4_NOFCS, IEEE802154_ENCAP_LEN);
+       return 0;
+}
+
+static int
+sixlowpan_remove(if6lpan_ref ifl)
+{
+       sixlowpan_assert_lock_held();
+       if (if6lpan_flags_detaching(ifl)) {
+               return 0;
+       }
+       if6lpan_flags_set_detaching(ifl);
+       sixlowpan_unconfig(ifl);
+       return 1;
+}
+
+
+static int
+sixlowpan_clone_destroy(struct ifnet *ifp)
+{
+       if6lpan_ref ifl;
+
+       sixlowpan_lock();
+       ifl = ifnet_get_if6lpan_retained(ifp);
+       if (ifl == NULL) {
+               sixlowpan_unlock();
+               return 0;
+       }
+       if (sixlowpan_remove(ifl) == 0) {
+               sixlowpan_unlock();
+               if6lpan_release(ifl);
+               return 0;
+       }
+       sixlowpan_unlock();
+       if6lpan_release(ifl);
+       ifnet_detach(ifp);
+       p_6lowpan_ifnet = NULL;
+       return 0;
+}
+
+static int
+sixlowpan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func func)
+{
+       if6lpan_ref     ifl;
+
+       sixlowpan_lock();
+       ifl = ifnet_get_if6lpan_retained(ifp);
+       if (ifl == NULL) {
+               sixlowpan_unlock();
+               return ENODEV;
+       }
+       switch (mode) {
+       case BPF_TAP_DISABLE:
+               ifl->if6lpan_bpf_input = ifl->if6lpan_bpf_output = NULL;
+               break;
+
+       case BPF_TAP_INPUT:
+               ifl->if6lpan_bpf_input = func;
+               break;
+
+       case BPF_TAP_OUTPUT:
+               ifl->if6lpan_bpf_output = func;
+               break;
+
+       case BPF_TAP_INPUT_OUTPUT:
+               ifl->if6lpan_bpf_input = ifl->if6lpan_bpf_output = func;
+               break;
+       default:
+               break;
+       }
+       sixlowpan_unlock();
+       if6lpan_release(ifl);
+       return 0;
+}
+
+/*
+ * 6lowpan output routine.
+ * Compress the protocol payload header, frame the compressed payload
+ * in an 802.15.4 data frame, and encapsulate the 802.15.4 frame in an
+ * Ethernet frame.
+ */
+static int
+sixlowpan_output(struct ifnet * ifp, struct mbuf * m)
+{
+       struct ifnet            *p_intf = NULL;
+       if6lpan_ref             ifl = NULL;
+       struct flowadv          adv = { .code = FADV_SUCCESS };
+       int                     err = 0;
+       char                    link_layer_dest[ETHER_ADDR_LEN];
+       bpf_packet_func         bpf_func;
+
+       u_int16_t ethertype = htons(ETHERTYPE_IEEE802154);
+       memset(link_layer_dest, 0xff, ETHER_ADDR_LEN);
+
+       if (m == 0) {
+               return 0;
+       }
+       if ((m->m_flags & M_PKTHDR) == 0) {
+               m_freem_list(m);
+               return 0;
+       }
+
+       sixlowpan_lock();
+       ifl = ifnet_get_if6lpan_retained(ifp);
+
+       if (ifl == NULL || if6lpan_flags_ready(ifl) == 0) {
+               goto unlock_done;
+       }
+
+       /* XXX parent interface equivalent? */
+       p_intf = ifl->if6lpan_pifp;
+       bpf_func = ifl->if6lpan_bpf_output;
+
+       sixlowpan_unlock();
+       if6lpan_release(ifl);
+
+       (void)ifnet_stat_increment_out(ifp, 1, m->m_pkthdr.len, 0);
+
+       /*
+        * The framer prepended a 2-byte length field to the 802.15.4
+        * data frame.  Adjusting just the length of the first mbuf in
+        * the chain is sufficient because bpf_tap_imp() disregards the
+        * packet length in the mbuf packet header.
+        */
+       if (bpf_func && (mbuf_setdata(m, m->m_data + 2, m->m_len - 2) == 0)) {
+               bpf_func(ifp, m);
+               mbuf_setdata(m, m->m_data - 2, m->m_len + 2);
+       }
+
+       /* Append ethernet header */
+       if ((err = ether_frameout_extended(p_intf, &m, NULL,
+           link_layer_dest, (const char *)&ethertype,
+           NULL, NULL))) {
+               return err;
+       }
+
+       err = dlil_output(p_intf, PF_802154, m, NULL, NULL, 1, &adv);
+
+       if (err == 0) {
+               if (adv.code == FADV_FLOW_CONTROLLED) {
+                       err = EQFULL;
+               } else if (adv.code == FADV_SUSPENDED) {
+                       err = EQSUSPENDED;
+               }
+       }
+       return err;
+
+unlock_done:
+       sixlowpan_unlock();
+       if (ifl != NULL) {
+               if6lpan_release(ifl);
+       }
+       m_freem(m);
+       return err;
+}
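+
+/*
+ * Resulting layout on the parent interface, combining the framer
+ * output (see sixlowpan_framer_extended below) with the ethernet
+ * encapsulation above; the 802.15.4 header length varies with the
+ * addressing modes in use:
+ *
+ *  [ ethernet header | 2-byte total length | 802.15.4 data header | compressed IPv6 payload ]
+ */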
+
+/*
+ * 6lowpan input routine.
+ * Decapsulate the 802.15.4 data frame, decompress the payload header,
+ * and hand the mbuf to the IPv6 protocol stack via proto_input().
+ */
+static int
+sixlowpan_input(ifnet_t p, __unused protocol_family_t protocol,
+    mbuf_t m, __unused char *frame_header)
+{
+       frame802154_t      ieee02154hdr;
+       u_int8_t           *payload = NULL;
+       if6lpan_ref        ifl = NULL;
+       bpf_packet_func    bpf_func;
+       mbuf_t mc, m_temp;
+       int off, err = 0;
+       u_int16_t len;
+
+       /* Allocate an mbuf cluster for the 802.15.4 frame and uncompressed payload */
+       mc = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
+       if (mc == NULL) {
+               err = -1;
+               goto err_out;
+       }
+
+       memcpy(&len, mtod(m, u_int8_t *), sizeof(u_int16_t));
+       len = ntohs(len);
+       m_adj(m, sizeof(u_int16_t));
+       /* Copy the compressed 802.15.4 payload from source mbuf to allocated cluster mbuf */
+       for (m_temp = m, off = 0; m_temp != NULL; m_temp = m_temp->m_next) {
+               if (m_temp->m_len > 0) {
+                       m_copyback(mc, off, m_temp->m_len, mtod(m_temp, void *));
+                       off += m_temp->m_len;
+               }
+       }
+
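+       /*
+        * The mbuf arrived on the parent ethernet interface; re-point p
+        * (and the receive interface) at the 6lowpan interface.
+        */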
+       p = p_6lowpan_ifnet;
+       mc->m_pkthdr.rcvif = p;
+
+       sixlowpan_lock();
+       ifl = ifnet_get_if6lpan_retained(p);
+
+       if (ifl == NULL) {
+               sixlowpan_unlock();
+               err = -1;
+               goto err_out;
+       }
+
+       if (if6lpan_flags_ready(ifl) == 0) {
+               if6lpan_release(ifl);
+               sixlowpan_unlock();
+               err = -1;
+               goto err_out;
+       }
+
+       bpf_func = ifl->if6lpan_bpf_input;
+       sixlowpan_unlock();
+       if6lpan_release(ifl);
+
+       if (bpf_func) {
+               bpf_func(p, mc);
+       }
+
+       /* Parse the 802.15.4 frame header */
+       bzero(&ieee02154hdr, sizeof(ieee02154hdr));
+       frame802154_parse(mtod(mc, uint8_t *), len, &ieee02154hdr, &payload);
+
+       /* XXX Add check for your link layer address being dest */
+       sixxlowpan_input(&ieee02154hdr, payload);
+
+       if (mbuf_setdata(mc, payload, ieee02154hdr.payload_len)) {
+               err = -1;
+               goto err_out;
+       }
+       mbuf_pkthdr_setlen(mc, ieee02154hdr.payload_len);
+
+       /* Post decompression */
+       if (proto_input(PF_INET6, mc) != 0) {
+               ifnet_stat_increment_in(p, 0, 0, 1);
+               err = -1;
+               goto err_out;
+       } else {
+               ifnet_stat_increment_in(p, 1, mc->m_pkthdr.len, 0);
+       }
+
+err_out:
+       if (err && mc) {
+               m_freem(mc);
+       }
+       if (!err) {
+               m_freem(m);
+       }
+       return err;
+}
+
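+/* 1280 bytes is the IPv6 minimum link MTU (RFC 8200), the natural default here. */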
+#define SIXLOWPAN_IFMTU 1280
+
+static int
+sixlowpan_config(struct ifnet *ifp, struct ifnet *p)
+{
+       if6lpan_ref ifl;
+       u_int16_t parent_flags;
+       sixlowpan_lock();
+       ifl = ifnet_get_if6lpan_retained(ifp);
+       if (ifl == NULL || ifl->if6lpan_pifp != NULL) {
+               sixlowpan_unlock();
+               if (ifl != NULL) {
+                       if6lpan_release(ifl);
+               }
+               return EBUSY;
+       }
+       sixlowpan_attach_protocol(p);
+
+       /* set our LL address derived from that of the parent */
+       if6lpan_set_addr(ifl, IF_LLADDR(p));
+       ifnet_set_lladdr_and_type(ifp, ifl->if6lpan_addr, IEEE802154_ADDR_LEN, IFT_6LOWPAN);
+
+       ifl->if6lpan_pifp = p;
+       ifl->if6lpan_flags = 0;
+       ifnet_set_mtu(ifp, SIXLOWPAN_IFMTU);
+       parent_flags = ifnet_flags(p) & (IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX);
+       ifnet_set_flags(ifp, parent_flags, IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX);
+       ifnet_set_flags(ifp, IFF_RUNNING, IFF_RUNNING);
+       ifnet_set_eflags(ifp, IFEF_NOAUTOIPV6LL, IFEF_NOAUTOIPV6LL);
+       if6lpan_flags_set_ready(ifl);
+       if6lpan_release(ifl);
+       sixlowpan_unlock();
+       return 0;
+}
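+
+/*
+ * Note: setting IFEF_NOAUTOIPV6LL above keeps the IPv6 stack from
+ * auto-configuring a link-local address on the 6lowpan interface;
+ * link-local configuration is presumably left to the 6lowpan control
+ * path, which derives the interface's link-layer address from the
+ * parent's.
+ */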
+
+static int
+sixlowpan_unconfig(if6lpan_ref ifl)
+{
+       struct ifnet *ifp = ifl->if6lpan_ifp;
+
+       sixlowpan_assert_lock_held();
+       /* Clear our MAC address. */
+       ifnet_set_lladdr_and_type(ifp, NULL, 0, IFT_6LOWPAN);
+       sixlowpan_detach_protocol(ifl->if6lpan_pifp);
+       ifnet_set_mtu(ifp, 0);
+       ifnet_set_flags(ifp, 0,
+           IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX | IFF_RUNNING);
+       ifnet_set_eflags(ifp, 0, IFEF_NOAUTOIPV6LL);
+       ifl->if6lpan_flags = 0;
+
+       return 0;
+}
+
+static int
+sixlowpan_ioctl(ifnet_t ifp, u_long cmd, void * data)
+{
+       int             error = 0;
+       struct ifreq *  ifr = NULL;
+       struct ifnet *  p = NULL;
+       struct sixlowpanreq req = {};
+       user_addr_t             user_addr = 0;
+       if6lpan_ref             ifl = NULL;
+
+       if (ifnet_type(ifp) != IFT_6LOWPAN) {
+               return EOPNOTSUPP;
+       }
+       ifr = (struct ifreq *)data;
+
+       switch (cmd) {
+       case SIOCSIFADDR:
+               ifnet_set_flags(ifp, IFF_UP, IFF_UP);
+               break;
+
+       case SIOCSIF6LOWPAN:
+               user_addr = proc_is64bit(current_proc())
+                   ? ifr->ifr_data64 : CAST_USER_ADDR_T(ifr->ifr_data);
+               error = copyin(user_addr, &req, sizeof(req));
+               if (error) {
+                       break;
+               }
+               req.parent[IFNAMSIZ - 1] = '\0';
+               if (req.parent[0] != '\0') {
+                       p = ifunit(req.parent);
+                       if (p == NULL) {
+                               error = ENXIO;
+                               break;
+                       }
+                       if (ifnet_type(p) != IFT_ETHER
+                           && ifnet_type(p) != IFT_IEEE8023ADLAG) {
+                               error = EPROTONOSUPPORT;
+                               break;
+                       }
+                       error = sixlowpan_config(ifp, p);
+                       if (error) {
+                               break;
+                       }
+               }
+               break;
+
+       case SIOCGIF6LOWPAN:
+               bzero(&req, sizeof(req));
+               sixlowpan_lock();
+               ifl = (if6lpan_ref)ifnet_softc(ifp);
+               if (ifl == NULL || if6lpan_flags_detaching(ifl)) {
+                       sixlowpan_unlock();
+                       return ifl == NULL ? EOPNOTSUPP : EBUSY;
+               }
+               p = ifl->if6lpan_pifp;
+               sixlowpan_unlock();
+               if (p != NULL) {
+                       snprintf(req.parent, sizeof(req.parent),
+                           "%s%d", ifnet_name(p), ifnet_unit(p));
+               }
+               user_addr = proc_is64bit(current_proc())
+                   ? ifr->ifr_data64 : CAST_USER_ADDR_T(ifr->ifr_data);
+               error = copyout(&req, user_addr, sizeof(req));
+               break;
+
+#ifdef  SIOCSIFMTU /* xxx */
+       case SIOCGIFMTU:
+               break;
+
+       case SIOCSIFMTU:
+               ifnet_set_mtu(ifp, ifr->ifr_mtu);
+               break;
+#endif /* SIOCSIFMTU */
+
+       default:
+               error = EOPNOTSUPP;
+       }
+       return error;
+}
+
+static void
+sixlowpan_if_free(struct ifnet * ifp)
+{
+       if6lpan_ref     ifl;
+
+       if (ifp == NULL) {
+               return;
+       }
+       ifl = (if6lpan_ref)ifnet_softc(ifp);
+       if (ifl == NULL) {
+               return;
+       }
+       if6lpan_release(ifl);
+       ifnet_release(ifp);
+       return;
+}
+
+static errno_t
+sixlowpan_detached(ifnet_t p, __unused protocol_family_t protocol)
+{
+       if (ifnet_is_attached(p, 0) == 0) {
+               // TODO: Find ifp from the parent p
+               // sixlowpan_if_free(ifp);
+       }
+       return 0;
+}
+
+/*
+ * Function: sixlowpan_attach_protocol
+ * Purpose:
+ *   Attach a DLIL protocol to the interface.
+ *   The ethernet demux special-cases 802.15.4, so the demux supplied
+ *   here is not used: the ethernet demux returns PF_802154 for the
+ *   appropriate packets, which causes our sixlowpan_input function to
+ *   be called.
+ */
+static int
+sixlowpan_attach_protocol(struct ifnet *ifp)
+{
+       int     error;
+       struct ifnet_attach_proto_param reg;
+
+       bzero(&reg, sizeof(reg));
+       reg.input            = sixlowpan_input;
+       reg.detached         = sixlowpan_detached;
+       error = ifnet_attach_protocol(ifp, PF_802154, &reg);
+       if (error) {
+               printf("%s(%s%d) ifnet_attach_protocol failed, %d\n",
+                   __func__, ifnet_name(ifp), ifnet_unit(ifp), error);
+       }
+       return error;
+}
+
+/*
+ * Function: sixlowpan_detach_protocol
+ * Purpose:
+ *   Detach our DLIL protocol from an interface
+ */
+static int
+sixlowpan_detach_protocol(struct ifnet *ifp)
+{
+       int error;
+
+       error = ifnet_detach_protocol(ifp, PF_802154);
+       if (error) {
+               printf("(%s%d) ifnet_detach_protocol failed, %d\n",
+                   ifnet_name(ifp), ifnet_unit(ifp), error);
+       }
+
+       return error;
+}
+
+static errno_t
+sixlowpan_proto_pre_output(ifnet_t ifp,
+    __unused protocol_family_t protocol_family,
+    mbuf_t *m0,
+    const struct sockaddr *dest,
+    void *route,
+    char *type,
+    char *ll_dest)
+{
+#pragma unused(protocol_family)
+       errno_t result = 0;
+       struct sockaddr_dl sdl;
+       struct sockaddr_in6 *dest6 = (struct sockaddr_in6 *)(uintptr_t)(size_t)dest;
+
+       if (!IN6_IS_ADDR_MULTICAST(&dest6->sin6_addr)) {
+               result = nd6_lookup_ipv6(ifp, dest6, &sdl, sizeof(sdl), route, *m0);
+               if (result == 0) {
+                       bcopy(LLADDR(&sdl), ll_dest, sdl.sdl_alen);
+               }
+       } else {
+               /* map multicast address */
+               ll_dest[0] = (dest6->sin6_addr.s6_addr8[14] & 0x1f) | 0x80;
+               ll_dest[1] = dest6->sin6_addr.s6_addr8[15];
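+               /*
+                * This is the RFC 4944 multicast mapping: 0b100 followed
+                * by the low 13 bits of the group ID; e.g. ff02::1
+                * (all-nodes) becomes the short address 0x80:0x01.
+                */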
+       }
+
+       /*
+        * XXX This should be generic to the underlying hardware type
+        */
+       if (result == 0) {
+               u_int16_t ethertype = htons(ETHERTYPE_IEEE802154);
+               bcopy(&ethertype, type, sizeof(ethertype));
+       }
+
+       return result;
+}
+
+static int
+sixlowpan_framer_extended(struct ifnet *ifp, struct mbuf **m,
+    const struct sockaddr *ndest, const char *edst,
+    const char *ether_type, u_int32_t *prepend_len, u_int32_t *postpend_len)
+{
+#pragma unused(ndest)
+#pragma unused(ether_type)
+       char buf[IEEE802154_ENCAP_LEN] = {0};
+       int buflen = 0, err = 0;
+       frame802154_t ieee02154hdr;
+       if6lpan_ref ifl = NULL;
+       u_int8_t *payload = NULL;
+       struct mbuf *mc = NULL;
+       u_int16_t len;
+       struct sockaddr_in6 *dest6 = (struct sockaddr_in6 *)(uintptr_t)(size_t)ndest;
+
+       /* Initialize 802.15.4 frame header */
+       bzero(&ieee02154hdr, sizeof(ieee02154hdr));
+       if (!IN6_IS_ADDR_MULTICAST(&dest6->sin6_addr)) {
+               bcopy(edst, ieee02154hdr.dest_addr, sizeof(ieee02154hdr.dest_addr));
+               ieee02154hdr.fcf.dest_addr_mode = FRAME802154_LONGADDRMODE;
+       } else {
+               bcopy(edst, ieee02154hdr.dest_addr, 2);
+               ieee02154hdr.fcf.dest_addr_mode = FRAME802154_SHORTADDRMODE;
+       }
+
+       /* Allocate a contiguous buffer for IPv6 header & payload */
+       /*
+        * XXX For now we either compress or we don't compress at all,
+        * adding an extra dispatch byte to communicate that there is no
+        * compression.
+        *
+        * Allocate for the worst case.
+        */
+       payload = _MALLOC(m_pktlen(*m) + 1, M_TEMP, M_WAITOK | M_ZERO);
+       if (payload == NULL) {
+               err = -1;
+               goto err_out;
+       }
+
+       /* Copy the IPv6 header & payload */
+       if (mbuf_copydata(*m, 0, m_pktlen(*m), payload)) {
+               err = -1;
+               goto err_out;
+       }
+
+       /* Allocate an mbuf cluster for the 802.15.4 frame and compressed payload */
+       mc = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
+       if (mc == NULL) {
+               err = -1;
+               goto err_out;
+       }
+
+       sixlowpan_lock();
+       ifl = ifnet_get_if6lpan_retained(ifp);
+       if (ifl == NULL || if6lpan_flags_ready(ifl) == 0) {
+               if (ifl != NULL) {
+                       if6lpan_release(ifl);
+               }
+               sixlowpan_unlock();
+               err = -1;
+               goto err_out;
+       }
+       bcopy(ifl->if6lpan_addr, ieee02154hdr.src_addr, sizeof(ieee02154hdr.src_addr));
+       ieee02154hdr.seq = ifl->if6lpan_ieee802154_seq++;   /**< Sequence number */
+       if6lpan_release(ifl);
+       sixlowpan_unlock();
+
+       /* Initialize frame control field */
+       ieee02154hdr.fcf.frame_type = FRAME802154_DATAFRAME;  /**< 3 bit. Frame type field, see 802.15.4 */
+       ieee02154hdr.fcf.security_enabled = 0;  /**< 1 bit. True if security is used in this frame */
+       ieee02154hdr.fcf.frame_pending = 0;     /**< 1 bit. True if sender has more data to send */
+       ieee02154hdr.fcf.ack_required = 0;      /**< 1 bit. Is an ack frame required? */
+       ieee02154hdr.fcf.panid_compression = 0; /**< 1 bit. Is this a compressed header? */
+       ieee02154hdr.fcf.frame_version = FRAME802154_IEEE802154_2006; /**< 2 bit. 802.15.4 frame version */
+       ieee02154hdr.fcf.src_addr_mode = FRAME802154_LONGADDRMODE;    /**< 2 bit. Source address mode, see 802.15.4 */
+       ieee02154hdr.dest_pid = IEEE802154_PANID;   /**< Destination PAN ID */
+       ieee02154hdr.src_pid = IEEE802154_PANID;    /**< Source PAN ID */
+       ieee02154hdr.payload_len = m_pktlen(*m);    /**< Length of payload field */
+
+       /* Create an 802.15.4 Data header frame */
+       buflen = frame802154_create(&ieee02154hdr, (uint8_t *)buf);
+
+       /* Perform inline compression of the IPv6 hdr & payload */
+       sixxlowpan_output(&ieee02154hdr, payload);
+
+       /*
+        * Add 2 bytes at the front of the frame indicating the total payload
+        * length
+        */
+       len = htons(buflen + ieee02154hdr.payload_len);
+       m_copyback(mc, 0, sizeof(len), &len);
+       /* Copy back the 802.15.4 Data frame header into mbuf */
+       m_copyback(mc, sizeof(len), buflen, buf);
+       /* Copy back the compressed payload into mbuf */
+       m_copyback(mc, buflen + sizeof(len), ieee02154hdr.payload_len, payload);
+
+       if (prepend_len != NULL) {
+               *prepend_len = buflen;
+       }
+       if (postpend_len != NULL) {
+               *postpend_len = 0;
+       }
+
+err_out:
+       if (payload != NULL) {
+               _FREE(payload, M_TEMP);
+       }
+       m_freem(*m);
+       *m = mc;
+       return err;
+}
+
+
+static errno_t
+sixlowpan_attach_inet6(struct ifnet *ifp, protocol_family_t protocol_family)
+{
+       struct ifnet_attach_proto_param proto;
+       errno_t error;
+
+       bzero(&proto, sizeof(proto));
+       proto.pre_output = sixlowpan_proto_pre_output;
+
+       error = ifnet_attach_protocol(ifp, protocol_family, &proto);
+       if (error && error != EEXIST) {
+               printf("WARNING: %s can't attach ipv6 to %s\n", __func__,
+                   if_name(ifp));
+       }
+       return error;
+}
+
+static void
+sixlowpan_detach_inet6(struct ifnet *ifp, protocol_family_t protocol_family)
+{
+       (void) ifnet_detach_protocol(ifp, protocol_family);
+}
+
+#if INET6
+__private_extern__ int
+sixlowpan_family_init(void)
+{
+       int error = 0;
+
+       error = proto_register_plumber(PF_INET6, IFNET_FAMILY_6LOWPAN,
+           sixlowpan_attach_inet6, sixlowpan_detach_inet6);
+       if (error != 0) {
+               printf("6lowpan: proto_register_plumber failed for AF_INET6 error=%d\n",
+                   error);
+               goto done;
+       }
+
+       error = sixlowpan_clone_attach();
+       if (error != 0) {
+               printf("6lowpan: sixlowpan_clone_attach failed, error=%d\n",
+                   error);
+               goto done;
+       }
+
+
+done:
+       return error;
+}
+#endif
diff --git a/bsd/net/if_6lowpan_var.h b/bsd/net/if_6lowpan_var.h
new file mode 100644 (file)
index 0000000..494db05
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _NET_IF_6LOWPAN_VAR_H_
+#define _NET_IF_6LOWPAN_VAR_H_  1
+
+#define IEEE802154_ADDR_LEN     8
+#define IEEE802154_ENCAP_LEN    25 /* len of 802.15.4 Frame header */
+#define IEEE802154_FRAME_LEN    127
+
+/*
+ * Configuration structure for the SIOCSIF6LOWPAN and SIOCGIF6LOWPAN ioctls.
+ */
+struct sixlowpanreq {
+       char parent[IFNAMSIZ];
+};
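+
+/*
+ * Illustrative use from a client (the interface and parent names are
+ * examples only; SIOCSIF6LOWPAN takes a struct ifreq whose ifr_data
+ * points at a struct sixlowpanreq):
+ *
+ *     struct sixlowpanreq req;
+ *     struct ifreq ifr;
+ *     int s = socket(AF_INET6, SOCK_DGRAM, 0);
+ *
+ *     memset(&req, 0, sizeof(req));
+ *     strlcpy(req.parent, "en0", sizeof(req.parent));
+ *     memset(&ifr, 0, sizeof(ifr));
+ *     strlcpy(ifr.ifr_name, "6lowpan0", sizeof(ifr.ifr_name));
+ *     ifr.ifr_data = (caddr_t)&req;
+ *     ioctl(s, SIOCSIF6LOWPAN, &ifr);
+ */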
+
+#ifdef KERNEL_PRIVATE
+int sixlowpan_family_init(void);
+#endif /* KERNEL_PRIVATE */
+#endif /* _NET_IF_6LOWPAN_VAR_H_ */
index 1161489673a04b796f6d6c6681e34e07b669d4a2..75492392758d15c94629753b60b5629190721155 100644 (file)
@@ -53,6 +53,7 @@
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/kpi_interface.h>
+#include <net/kpi_interfacefilter.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_ether.h>
@@ -65,7 +66,7 @@
 #include <net/devtimer.h>
 #include <net/if_vlan_var.h>
 #include <net/kpi_protocol.h>
-
+#include <sys/protosw.h>
 #include <kern/locks.h>
 #include <kern/zalloc.h>
 #include <os/refcnt.h>
 #include <net/if_media.h>
 #include <net/multicast_list.h>
 
+SYSCTL_DECL(_net_link);
+SYSCTL_NODE(_net_link, OID_AUTO, bond, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+    "Bond interface");
+
+static int if_bond_debug = 0;
+SYSCTL_INT(_net_link_bond, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &if_bond_debug, 0, "Bond interface debug logs");
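+
+/*
+ * net.link.bond.debug replaces the old bond_globals verbose flag
+ * (IF_BOND_OP_SET_VERBOSE now writes if_bond_debug as well); e.g.
+ * "sysctl net.link.bond.debug=1" enables the debug logging below.
+ */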
+
 static struct ether_addr slow_proto_multicast = {
-       IEEE8023AD_SLOW_PROTO_MULTICAST
+       .octet = IEEE8023AD_SLOW_PROTO_MULTICAST
 };
 
 typedef struct ifbond_s ifbond, * ifbond_ref;
@@ -269,6 +278,13 @@ enum {
 
 typedef u_char MuxState;
 
+#define PORT_CONTROL_FLAGS_IN_LIST               0x01
+#define PORT_CONTROL_FLAGS_PROTO_ATTACHED        0x02
+#define PORT_CONTROL_FLAGS_FILTER_ATTACHED       0x04
+#define PORT_CONTROL_FLAGS_LLADDR_SET            0x08
+#define PORT_CONTROL_FLAGS_MTU_SET               0x10
+#define PORT_CONTROL_FLAGS_PROMISCUOUS_SET       0x20
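+
+/*
+ * po_control_flags records which of the per-port setup steps have
+ * completed, so bond_add_interface() can unwind exactly those steps on
+ * failure and bond_remove_interface() can undo them on removal.
+ */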
+
 struct bondport_s {
        TAILQ_ENTRY(bondport_s)     po_port_list;
        ifbond_ref                  po_bond;
@@ -278,6 +294,8 @@ struct bondport_s {
        int                         po_enabled;
        char                        po_name[IFNAMSIZ];
        struct ifdevmtu             po_devmtu;
+       uint32_t                    po_control_flags;
+       interface_filter_t          po_filter;
 
        /* LACP */
        TAILQ_ENTRY(bondport_s)     po_lag_port_list;
@@ -296,6 +314,7 @@ struct bondport_s {
        SelectedState               po_selected;
        int32_t                     po_last_transmit_secs;
        struct media_info           po_media_info;
+       uint64_t                    po_force_link_event_time;
        LAG_ref                     po_lag;
 };
 
@@ -462,7 +481,6 @@ typedef struct bond_globals_s {
        struct ifbond_list          ifbond_list;
        lacp_system                 system;
        lacp_system_priority        system_priority;
-       int                         verbose;
 } * bond_globals_ref;
 
 static bond_globals_ref g_bond;
@@ -566,6 +584,8 @@ LAG_get_aggregatable_port_count(LAG_ref lag, int * active_media);
 static int
 ifbond_selection(ifbond_ref bond);
 
+static void
+bond_handle_event(struct ifnet * port_ifp, int event_code);
 
 /**
 ** bondport
@@ -621,19 +641,20 @@ bondport_collecting(bondport_ref p)
 **/
 static int bond_clone_create(struct if_clone *, u_int32_t, void *);
 static int bond_clone_destroy(struct ifnet *);
-static int bond_input(ifnet_t ifp, protocol_family_t protocol, mbuf_t m,
-    char *frame_header);
 static int bond_output(struct ifnet *ifp, struct mbuf *m);
 static int bond_ioctl(struct ifnet *ifp, u_long cmd, void * addr);
 static int bond_set_bpf_tap(struct ifnet * ifp, bpf_tap_mode mode,
     bpf_packet_func func);
 static int bond_attach_protocol(struct ifnet *ifp);
 static int bond_detach_protocol(struct ifnet *ifp);
+static errno_t bond_iff_input(void *cookie, ifnet_t ifp,
+    protocol_family_t protocol, mbuf_t *data, char **frame_ptr);
+static int bond_attach_filter(struct ifnet *ifp, interface_filter_t * filter_p);
 static int bond_setmulti(struct ifnet *ifp);
 static int bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp);
 static int bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp);
 static void bond_if_free(struct ifnet * ifp);
-static  void interface_link_event(struct ifnet * ifp, u_int32_t event_code);
+static void interface_link_event(struct ifnet * ifp, u_int32_t event_code);
 
 static struct if_clone bond_cloner = IF_CLONE_INITIALIZER(BONDNAME,
     bond_clone_create,
@@ -687,11 +708,11 @@ ifbond_release(ifbond_ref ifb)
                return;
        }
 
-       if (g_bond->verbose) {
+       if (if_bond_debug) {
                printf("ifbond_release(%s)\n", ifb->ifb_name);
        }
        if (ifb->ifb_ifma_slow_proto != NULL) {
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        printf("ifbond_release(%s) removing multicast\n",
                            ifb->ifb_name);
                }
@@ -732,7 +753,7 @@ ifbond_wait(ifbond_ref ifb, const char * msg)
 
        /* other add/remove in progress */
        while (ifbond_flags_change_in_progress(ifb)) {
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        printf("%s: %s msleep\n", ifb->ifb_name, msg);
                }
                waited = 1;
@@ -740,7 +761,7 @@ ifbond_wait(ifbond_ref ifb, const char * msg)
        }
        /* prevent other bond list remove/add from taking place */
        ifbond_flags_set_change_in_progress(ifb);
-       if (g_bond->verbose && waited) {
+       if (if_bond_debug && waited) {
                printf("%s: %s woke up\n", ifb->ifb_name, msg);
        }
        return;
@@ -761,7 +782,7 @@ ifbond_signal(ifbond_ref ifb, const char * msg)
 {
        ifbond_flags_clear_change_in_progress(ifb);
        wakeup((caddr_t)ifb);
-       if (g_bond->verbose) {
+       if (if_bond_debug) {
                printf("%s: %s wakeup\n", ifb->ifb_name, msg);
        }
        return;
@@ -775,6 +796,10 @@ static int
 link_speed(int active)
 {
        switch (IFM_SUBTYPE(active)) {
+       case IFM_AUTO:
+       case IFM_MANUAL:
+       case IFM_NONE:
+               return 0;
        case IFM_10_T:
        case IFM_10_2:
        case IFM_10_5:
@@ -795,7 +820,7 @@ link_speed(int active)
        case IFM_1000_KX:
                return 1000;
        case IFM_HPNA_1:
-               return 0;
+               return 1;
        default:
        /* assume that new defined types are going to be at least 10GigE */
        case IFM_10G_SR:
@@ -851,6 +876,30 @@ media_full_duplex(const struct media_info * mi)
        return (mi->mi_active & IFM_FDX) != 0;
 }
 
+static __inline__ int
+media_type_unknown(const struct media_info * mi)
+{
+       int unknown;
+
+       switch (IFM_SUBTYPE(mi->mi_active)) {
+       case IFM_AUTO:
+       case IFM_MANUAL:
+       case IFM_NONE:
+               unknown = 1;
+               break;
+       default:
+               unknown = 0;
+               break;
+       }
+       return unknown;
+}
+
+static __inline__ int
+media_ok(const struct media_info * mi)
+{
+       return media_full_duplex(mi) || media_type_unknown(mi);
+}
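+
+/*
+ * IFM_AUTO/IFM_MANUAL/IFM_NONE mean the driver has not (yet) reported
+ * a negotiated medium, so media_ok() accepts "unknown" alongside full
+ * duplex rather than misclassifying such ports as half duplex; see
+ * also the LACPDU-driven link re-check below.
+ */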
+
 static __inline__ int
 media_speed(const struct media_info * mi)
 {
@@ -1067,7 +1116,7 @@ bond_setmulti(struct ifnet * ifp)
        }
        bond_lock();
 signal_done:
-       ifbond_signal(ifb, "bond_setmulti");
+       ifbond_signal(ifb, __func__);
        bond_unlock();
        ifbond_release(ifb);
        return result;
@@ -1171,7 +1220,6 @@ bond_clone_create(struct if_clone * ifc, u_int32_t unit, __unused void *params)
        ifnet_set_offload(ifp, 0);
        ifnet_set_addrlen(ifp, ETHER_ADDR_LEN); /* XXX ethernet specific */
        ifnet_set_flags(ifp, IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX, 0xffff);
-       ifnet_set_baudrate(ifp, 0);
        ifnet_set_mtu(ifp, ETHERMTU);
 
        error = ifnet_attach(ifp, NULL);
@@ -1473,7 +1521,7 @@ bond_output(struct ifnet * ifp, struct mbuf * m)
        ifbond_ref                  ifb;
        struct ifnet *              port_ifp = NULL;
        int                         err;
-       struct flowadv              adv = { FADV_SUCCESS };
+       struct flowadv              adv = { .code = FADV_SUCCESS };
 
        if (m == 0) {
                return 0;
@@ -1571,6 +1619,7 @@ bond_receive_lacpdu(struct mbuf * m, struct ifnet * port_ifp)
        struct ifnet *              bond_ifp = NULL;
        ifbond_ref                  ifb;
        int                         event_code = 0;
+       bool                        need_link_update = false;
        bondport_ref                p;
 
        bond_lock();
@@ -1588,6 +1637,24 @@ bond_receive_lacpdu(struct mbuf * m, struct ifnet * port_ifp)
        if (ifb->ifb_mode != IF_BOND_MODE_LACP) {
                goto done;
        }
+       /*
+        * Work-around for rdar://problem/51372042
+        * Sometimes, the link comes up but the driver doesn't report the
+        * negotiated medium at that time. When we receive an LACPDU packet,
+        * and the medium is unknown, force a link status check. Don't force
+        * the link status check more often than _FORCE_LINK_EVENT_INTERVAL
+        * seconds.
+        */
+#define _FORCE_LINK_EVENT_INTERVAL      1
+       if (media_type_unknown(&p->po_media_info)) {
+               uint64_t        now = net_uptime();
+
+               if ((now - p->po_force_link_event_time) >=
+                   _FORCE_LINK_EVENT_INTERVAL) {
+                       need_link_update = true;
+                       p->po_force_link_event_time = now;
+               }
+       }
        bondport_receive_lacpdu(p, (lacpdu_ref)m->m_data);
        if (ifbond_selection(ifb)) {
                event_code = (ifb->ifb_active_lag == NULL)
@@ -1601,7 +1668,7 @@ bond_receive_lacpdu(struct mbuf * m, struct ifnet * port_ifp)
                    ? KEV_DL_LINK_OFF
                    : KEV_DL_LINK_ON;
                if (event_code != ifb->ifb_last_link_event) {
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("%s: (receive) generating LINK event\n",
                                    ifb->ifb_name);
                        }
@@ -1616,6 +1683,12 @@ done:
                interface_link_event(bond_ifp, event_code);
        }
        m_freem(m);
+       if (need_link_update) {
+               if (if_bond_debug != 0) {
+                       printf("bond: simulating link status changed event\n");
+               }
+               bond_handle_event(port_ifp, KEV_DL_LINK_ON);
+       }
        return;
 }
 
@@ -1651,9 +1724,8 @@ failed:
        return;
 }
 
-static int
-bond_input(ifnet_t port_ifp, __unused protocol_family_t protocol, mbuf_t m,
-    char * frame_header)
+static void
+bond_input(ifnet_t port_ifp, mbuf_t m, char *frame_header)
 {
        bpf_packet_func             bpf_func;
        const struct ether_header * eh_p;
@@ -1671,17 +1743,17 @@ bond_input(ifnet_t port_ifp, __unused protocol_family_t protocol, mbuf_t m,
                if (subtype == IEEE8023AD_SLOW_PROTO_SUBTYPE_LACP) {
                        if (m->m_pkthdr.len < (int)offsetof(lacpdu, la_reserved)) {
                                m_freem(m);
-                               return 0;
+                               return;
                        }
                        /* send to lacp */
                        if (m->m_len < (int)offsetof(lacpdu, la_reserved)) {
                                m = m_pullup(m, offsetof(lacpdu, la_reserved));
                                if (m == NULL) {
-                                       return 0;
+                                       return;
                                }
                        }
                        bond_receive_lacpdu(m, port_ifp);
-                       return 0;
+                       return;
                } else if (subtype == IEEE8023AD_SLOW_PROTO_SUBTYPE_LA_MARKER_PROTOCOL) {
                        int         min_size;
 
@@ -1692,23 +1764,23 @@ bond_input(ifnet_t port_ifp, __unused protocol_family_t protocol, mbuf_t m,
                        min_size = ETHER_HDR_LEN + offsetof(la_marker_pdu, lm_reserved);
                        if (m->m_pkthdr.len < min_size) {
                                m_freem(m);
-                               return 0;
+                               return;
                        }
                        /* send to lacp */
                        if (m->m_len < min_size) {
                                m = m_pullup(m, min_size);
                                if (m == NULL) {
-                                       return 0;
+                                       return;
                                }
                        }
                        /* send to marker responder */
                        bond_receive_la_marker_pdu(m, port_ifp);
-                       return 0;
+                       return;
                } else if (subtype == 0
                    || subtype > IEEE8023AD_SLOW_PROTO_SUBTYPE_RESERVED_END) {
                        /* invalid subtype, discard the frame */
                        m_freem(m);
-                       return 0;
+                       return;
                }
        }
        bond_lock();
@@ -1720,12 +1792,19 @@ bond_input(ifnet_t port_ifp, __unused protocol_family_t protocol, mbuf_t m,
                goto done;
        }
 
-       /* make the packet appear as if it arrived on the bonded interface */
        ifb = p->po_bond;
        ifp = ifb->ifb_ifp;
        bpf_func = ifb->ifb_bpf_input;
        bond_unlock();
 
+       /*
+        * Need to clear the promiscuous flag, otherwise the packet
+        * will be dropped by DLIL after the filters have run.
+        */
+       if ((mbuf_flags(m) & MBUF_PROMISC)) {
+               mbuf_setflags_mask(m, 0, MBUF_PROMISC);
+       }
+
        if (m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) {
                (void)ifnet_stat_increment_in(ifp, 1,
                    (m->m_pkthdr.len + ETHER_HDR_LEN
@@ -1734,16 +1813,31 @@ bond_input(ifnet_t port_ifp, __unused protocol_family_t protocol, mbuf_t m,
                (void)ifnet_stat_increment_in(ifp, 1,
                    (m->m_pkthdr.len + ETHER_HDR_LEN), 0);
        }
+
+       /* make the packet appear as if it arrived on the bonded interface */
        m->m_pkthdr.rcvif = ifp;
        bond_bpf_input(ifp, m, eh_p, bpf_func);
        m->m_pkthdr.pkt_hdr = frame_header;
        dlil_input_packet_list(ifp, m);
-       return 0;
+       return;
 
 done:
        bond_unlock();
        m_freem(m);
-       return 0;
+       return;
+}
+
+static errno_t
+bond_iff_input(void *cookie, ifnet_t port_ifp, protocol_family_t protocol,
+    mbuf_t *data, char **frame_header_ptr)
+{
+#pragma unused(cookie)
+#pragma unused(protocol)
+       mbuf_t                      m = *data;
+       char *                      frame_header = *frame_header_ptr;
+
+       bond_input(port_ifp, m, frame_header);
+       return EJUSTRETURN;
 }
 
 static __inline__ const char *
@@ -1807,7 +1901,7 @@ bondport_timer_process_func(devtimer_ref timer,
                                    ? KEV_DL_LINK_OFF
                                    : KEV_DL_LINK_ON;
                                if (event_code != p->po_bond->ifb_last_link_event) {
-                                       if (g_bond->verbose) {
+                                       if (if_bond_debug) {
                                                timestamp_printf("%s: (timer) generating LINK event\n",
                                                    p->po_bond->ifb_name);
                                        }
@@ -1952,11 +2046,6 @@ bondport_free(bondport_ref p)
        return;
 }
 
-#define BOND_ADD_PROGRESS_IN_LIST               0x1
-#define BOND_ADD_PROGRESS_PROTO_ATTACHED        0x2
-#define BOND_ADD_PROGRESS_LLADDR_SET            0x4
-#define BOND_ADD_PROGRESS_MTU_SET               0x8
-
 static __inline__ int
 bond_device_mtu(struct ifnet * ifp, ifbond_ref ifb)
 {
@@ -1967,15 +2056,16 @@ bond_device_mtu(struct ifnet * ifp, ifbond_ref ifb)
 static int
 bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp)
 {
+       uint32_t                    control_flags = 0;
        int                         devmtu;
        int                         error = 0;
        int                         event_code = 0;
+       interface_filter_t          filter = NULL;
        int                         first = FALSE;
        ifbond_ref                  ifb;
        bondport_ref *              new_array = NULL;
        bondport_ref *              old_array = NULL;
        bondport_ref                p;
-       int                         progress = 0;
 
        if (IFNET_IS_INTCOPROC(port_ifp)) {
                return EINVAL;
@@ -2009,7 +2099,7 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp)
        ifbond_retain(ifb);
 
        /* wait for other add or remove to complete */
-       ifbond_wait(ifb, "bond_add_interface");
+       ifbond_wait(ifb, __func__);
 
        if (ifbond_flags_if_detaching(ifb)) {
                /* someone destroyed the bond while we were waiting */
@@ -2050,8 +2140,9 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp)
                        ifnet_offload_t     offload;
 
                        offload = ifp_offload & port_ifp_offload;
-                       printf("bond_add_interface(%s, %s)  "
+                       printf("%s(%s, %s)  "
                            "hwassist values don't match 0x%x != 0x%x, using 0x%x instead\n",
+                           __func__,
                            ifb->ifb_name, bondport_get_name(p),
                            ifp_offload, port_ifp_offload, offload);
                        /*
@@ -2080,7 +2171,7 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp)
                    IFT_ETHER);
        }
 
-       progress |= BOND_ADD_PROGRESS_IN_LIST;
+       control_flags |= PORT_CONTROL_FLAGS_IN_LIST;
 
        /* allocate a larger distributing array */
        new_array = (bondport_ref *)
@@ -2095,24 +2186,32 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp)
        if (error) {
                goto failed;
        }
-       progress |= BOND_ADD_PROGRESS_PROTO_ATTACHED;
+       control_flags |= PORT_CONTROL_FLAGS_PROTO_ATTACHED;
+
+       /* attach our BOND interface filter */
+       error = bond_attach_filter(port_ifp, &filter);
+       if (error != 0) {
+               goto failed;
+       }
+       control_flags |= PORT_CONTROL_FLAGS_FILTER_ATTACHED;
 
        /* set the interface MTU */
        devmtu = bond_device_mtu(ifp, ifb);
        error = siocsifmtu(port_ifp, devmtu);
        if (error != 0) {
-               printf("bond_add_interface(%s, %s):"
+               printf("%s(%s, %s):"
                    " SIOCSIFMTU %d failed %d\n",
+                   __func__,
                    ifb->ifb_name, bondport_get_name(p), devmtu, error);
                goto failed;
        }
-       progress |= BOND_ADD_PROGRESS_MTU_SET;
+       control_flags |= PORT_CONTROL_FLAGS_MTU_SET;
 
        /* program the port with our multicast addresses */
        error = multicast_list_program(&p->po_multicast, ifp, port_ifp);
        if (error) {
-               printf("bond_add_interface(%s, %s):"
-                   " multicast_list_program failed %d\n",
+               printf("%s(%s, %s): multicast_list_program failed %d\n",
+                   __func__,
                    ifb->ifb_name, bondport_get_name(p), error);
                goto failed;
        }
@@ -2122,7 +2221,8 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp)
 
        error = ifnet_ioctl(port_ifp, 0, SIOCSIFFLAGS, NULL);
        if (error != 0) {
-               printf("bond_add_interface(%s, %s): SIOCSIFFLAGS failed %d\n",
+               printf("%s(%s, %s): SIOCSIFFLAGS failed %d\n",
+                   __func__,
                    ifb->ifb_name, bondport_get_name(p), error);
                goto failed;
        }
@@ -2130,18 +2230,36 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp)
        /* re-program the port's ethernet address */
        error = if_siflladdr(port_ifp,
            (const struct ether_addr *)IF_LLADDR(ifp));
+       if (error == 0) {
+               if (memcmp(IF_LLADDR(ifp), IF_LLADDR(port_ifp), ETHER_ADDR_LEN)
+                   != 0) {
+                       /* it lied, it really doesn't support setting lladdr */
+                       error = EOPNOTSUPP;
+               }
+       }
        if (error != 0) {
                /* port doesn't support setting the link address */
-               printf("bond_add_interface(%s, %s): if_siflladdr failed %d\n",
+               printf("%s(%s, %s): if_siflladdr failed %d\n",
+                   __func__,
                    ifb->ifb_name, bondport_get_name(p), error);
-               goto failed;
+               error = ifnet_set_promiscuous(port_ifp, 1);
+               if (error != 0) {
+                       /* port doesn't support setting promiscuous mode */
+                       printf("%s(%s, %s): set promiscuous failed %d\n",
+                           __func__,
+                           ifb->ifb_name, bondport_get_name(p), error);
+                       goto failed;
+               }
+               control_flags |= PORT_CONTROL_FLAGS_PROMISCUOUS_SET;
+       } else {
+               control_flags |= PORT_CONTROL_FLAGS_LLADDR_SET;
        }
-       progress |= BOND_ADD_PROGRESS_LLADDR_SET;
 
        bond_lock();
 
        /* no failures past this point */
        p->po_enabled = 1;
+       p->po_control_flags = control_flags;
 
        /* copy the contents of the existing distributing array */
        if (ifb->ifb_distributing_count) {
@@ -2172,8 +2290,10 @@ bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp)
                        bondport_disable_distributing(p);
                }
        }
+       p->po_filter = filter;
+
        /* clear the busy state, and wakeup anyone waiting */
-       ifbond_signal(ifb, "bond_add_interface");
+       ifbond_signal(ifb, __func__);
        bond_unlock();
        if (event_code != 0) {
                interface_link_event(ifp, event_code);
@@ -2194,30 +2314,45 @@ failed:
        if (new_array != NULL) {
                FREE(new_array, M_BOND);
        }
-       if ((progress & BOND_ADD_PROGRESS_LLADDR_SET) != 0) {
+       if ((control_flags & PORT_CONTROL_FLAGS_LLADDR_SET) != 0) {
                int     error1;
 
                error1 = if_siflladdr(port_ifp, &p->po_saved_addr);
                if (error1 != 0) {
-                       printf("bond_add_interface(%s, %s): if_siflladdr failed %d\n",
+                       printf("%s(%s, %s): if_siflladdr restore failed %d\n",
+                           __func__,
+                           ifb->ifb_name, bondport_get_name(p), error1);
+               }
+       }
+       if ((control_flags & PORT_CONTROL_FLAGS_PROMISCUOUS_SET) != 0) {
+               int     error1;
+
+               error1 = ifnet_set_promiscuous(port_ifp, 0);
+               if (error1 != 0) {
+                       printf("%s(%s, %s): promiscuous mode disable failed %d\n",
+                           __func__,
                            ifb->ifb_name, bondport_get_name(p), error1);
                }
        }
-       if ((progress & BOND_ADD_PROGRESS_PROTO_ATTACHED) != 0) {
+       if ((control_flags & PORT_CONTROL_FLAGS_PROTO_ATTACHED) != 0) {
                (void)bond_detach_protocol(port_ifp);
        }
-       if ((progress & BOND_ADD_PROGRESS_MTU_SET) != 0) {
+       if ((control_flags & PORT_CONTROL_FLAGS_FILTER_ATTACHED) != 0) {
+               iflt_detach(filter);
+       }
+       if ((control_flags & PORT_CONTROL_FLAGS_MTU_SET) != 0) {
                int error1;
 
                error1 = siocsifmtu(port_ifp, p->po_devmtu.ifdm_current);
                if (error1 != 0) {
-                       printf("bond_add_interface(%s, %s): SIOCSIFMTU %d failed %d\n",
+                       printf("%s(%s, %s): SIOCSIFMTU %d failed %d\n",
+                           __func__,
                            ifb->ifb_name, bondport_get_name(p),
                            p->po_devmtu.ifdm_current, error1);
                }
        }
        bond_lock();
-       if ((progress & BOND_ADD_PROGRESS_IN_LIST) != 0) {
+       if ((control_flags & PORT_CONTROL_FLAGS_IN_LIST) != 0) {
                TAILQ_REMOVE(&ifb->ifb_port_list, p, po_port_list);
                ifb->ifb_port_count--;
        }
@@ -2229,7 +2364,7 @@ failed:
        }
 
 signal_done:
-       ifbond_signal(ifb, "bond_add_interface");
+       ifbond_signal(ifb, __func__);
        bond_unlock();
        ifbond_release(ifb);
        bondport_free(p);
@@ -2244,6 +2379,7 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp)
        int                         event_code = 0;
        bondport_ref                head_port;
        struct ifnet *              ifp;
+       interface_filter_t          filter;
        int                         last = FALSE;
        int                         new_link_address = FALSE;
        bondport_ref                p;
@@ -2315,7 +2451,7 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp)
                        ifb->ifb_last_link_event = event_code = KEV_DL_LINK_OFF;
                }
        }
-
+       filter = p->po_filter;
        bond_unlock();
 
        if (last) {
@@ -2335,11 +2471,17 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp)
                TAILQ_FOREACH(scan_port, &ifb->ifb_port_list, po_port_list) {
                        scan_ifp = scan_port->po_ifp;
 
+                       if ((scan_port->po_control_flags &
+                           PORT_CONTROL_FLAGS_LLADDR_SET) == 0) {
+                               /* port doesn't support setting lladdr */
+                               continue;
+                       }
                        error = if_siflladdr(scan_ifp,
                            (const struct ether_addr *) IF_LLADDR(ifp));
                        if (error != 0) {
-                               printf("bond_remove_interface(%s, %s): "
+                               printf("%s(%s, %s): "
                                    "if_siflladdr (%s) failed %d\n",
+                                   __func__,
                                    ifb->ifb_name, bondport_get_name(p),
                                    bondport_get_name(scan_port), error);
                        }
@@ -2347,16 +2489,30 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp)
        }
 
        /* restore the port's ethernet address */
-       error = if_siflladdr(port_ifp, &p->po_saved_addr);
-       if (error != 0) {
-               printf("bond_remove_interface(%s, %s): if_siflladdr failed %d\n",
-                   ifb->ifb_name, bondport_get_name(p), error);
+       if ((p->po_control_flags & PORT_CONTROL_FLAGS_LLADDR_SET) != 0) {
+               error = if_siflladdr(port_ifp, &p->po_saved_addr);
+               if (error != 0) {
+                       printf("%s(%s, %s): if_siflladdr failed %d\n",
+                           __func__,
+                           ifb->ifb_name, bondport_get_name(p), error);
+               }
+       }
+
+       /* disable promiscuous mode (if we enabled it) */
+       if ((p->po_control_flags & PORT_CONTROL_FLAGS_PROMISCUOUS_SET) != 0) {
+               error = ifnet_set_promiscuous(port_ifp, 0);
+               if (error != 0) {
+                       printf("%s(%s, %s): disable promiscuous failed %d\n",
+                           __func__,
+                           ifb->ifb_name, bondport_get_name(p), error);
+               }
        }
 
        /* restore the port's MTU */
        error = siocsifmtu(port_ifp, p->po_devmtu.ifdm_current);
        if (error != 0) {
-               printf("bond_remove_interface(%s, %s): SIOCSIFMTU %d failed %d\n",
+               printf("%s(%s, %s): SIOCSIFMTU %d failed %d\n",
+                   __func__,
                    ifb->ifb_name, bondport_get_name(p),
                    p->po_devmtu.ifdm_current, error);
        }
@@ -2364,6 +2520,11 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp)
        /* remove the bond "protocol" */
        bond_detach_protocol(port_ifp);
 
+       /* detach the filter */
+       if (filter != NULL) {
+               iflt_detach(filter);
+       }
+
        /* generate link event */
        if (event_code != 0) {
                interface_link_event(ifp, event_code);
@@ -2376,7 +2537,7 @@ bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp)
        ifbond_release(ifb);
 
 signal_done:
-       ifbond_signal(ifb, "bond_remove_interface");
+       ifbond_signal(ifb, __func__);
        ifbond_release(ifb);
        return error;
 }
@@ -2474,7 +2635,7 @@ bond_set_mode(struct ifnet * ifp, int mode)
        ifb->ifb_last_link_event = event_code;
 
 signal_done:
-       ifbond_signal(ifb, "bond_set_mode");
+       ifbond_signal(ifb, __func__);
        bond_unlock();
        ifbond_release(ifb);
 
@@ -2665,7 +2826,7 @@ bond_set_mtu(struct ifnet * ifp, int mtu, int isdevmtu)
        }
 
 signal_done:
-       ifbond_signal(ifb, "bond_set_mtu");
+       ifbond_signal(ifb, __func__);
        ifbond_release(ifb);
 
 done:
@@ -2816,12 +2977,7 @@ bond_ioctl(struct ifnet *ifp, u_long cmd, void * data)
                        break;
                case IF_BOND_OP_SET_VERBOSE:
                        bond_lock();
-                       if (g_bond == NULL) {
-                               bond_unlock();
-                               error = ENXIO;
-                               break;
-                       }
-                       g_bond->verbose = ibr.ibr_ibru.ibru_int_val;
+                       if_bond_debug = ibr.ibr_ibru.ibru_int_val;
                        bond_unlock();
                        break;
                case IF_BOND_OP_SET_MODE:
@@ -2920,10 +3076,11 @@ bond_handle_event(struct ifnet * port_ifp, int event_code)
        ifbond_ref          ifb;
        int                 old_distributing_count;
        bondport_ref        p;
-       struct media_info   media_info = { 0, 0};
+       struct media_info   media_info = { .mi_active = 0, .mi_status = 0 };
 
        switch (event_code) {
        case KEV_DL_IF_DETACHED:
+       case KEV_DL_IF_DETACHING:
                break;
        case KEV_DL_LINK_OFF:
        case KEV_DL_LINK_ON:
@@ -2942,6 +3099,7 @@ bond_handle_event(struct ifnet * port_ifp, int event_code)
        old_distributing_count = ifb->ifb_distributing_count;
        switch (event_code) {
        case KEV_DL_IF_DETACHED:
+       case KEV_DL_IF_DETACHING:
                bond_remove_interface(ifb, p->po_ifp);
                break;
        case KEV_DL_LINK_OFF:
@@ -2966,7 +3124,7 @@ bond_handle_event(struct ifnet * port_ifp, int event_code)
                            ? KEV_DL_LINK_OFF
                            : KEV_DL_LINK_ON;
                        if (event_code != ifb->ifb_last_link_event) {
-                               if (g_bond->verbose) {
+                               if (if_bond_debug) {
                                        timestamp_printf("%s: (event) generating LINK event\n",
                                            ifb->ifb_name);
                                }
@@ -3000,8 +3158,9 @@ bond_handle_event(struct ifnet * port_ifp, int event_code)
 }
 
 static void
-bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol,
-    const struct kev_msg * event)
+bond_iff_event(__unused void *cookie, ifnet_t port_ifp,
+    __unused protocol_family_t protocol,
+    const struct kev_msg *event)
 {
        int         event_code;
 
@@ -3014,7 +3173,8 @@ bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol,
        switch (event_code) {
        case KEV_DL_LINK_OFF:
        case KEV_DL_LINK_ON:
-               /* we only care about link status changes */
+       case KEV_DL_IF_DETACHING:
+       case KEV_DL_IF_DETACHED:
                bond_handle_event(port_ifp, event_code);
                break;
        default:
@@ -3023,11 +3183,11 @@ bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol,
        return;
 }
 
-static errno_t
-bond_detached(ifnet_t port_ifp, __unused protocol_family_t protocol)
+static void
+bond_iff_detached(__unused void *cookie, ifnet_t port_ifp)
 {
        bond_handle_event(port_ifp, KEV_DL_IF_DETACHED);
-       return 0;
+       return;
 }
 
 static void
@@ -3052,6 +3212,19 @@ interface_link_event(struct ifnet * ifp, u_int32_t event_code)
        return;
 }
 
+static errno_t
+bond_proto_input(ifnet_t ifp, protocol_family_t protocol, mbuf_t packet,
+    char *header)
+{
+#pragma unused(protocol, packet, header)
+       if (if_bond_debug != 0) {
+               printf("%s: unexpected packet from %s\n", __func__,
+                   ifp->if_xname);
+       }
+       return 0;
+}
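+
+/*
+ * With input now handled by the interface filter (bond_iff_input), the
+ * PF_BOND protocol input above should never see traffic and only logs
+ * unexpected packets; the protocol attach is kept, presumably so the
+ * port remains marked as a bond port for the ethernet demux.
+ */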
+
+
 /*
  * Function: bond_attach_protocol
  * Purpose:
@@ -3069,9 +3242,7 @@ bond_attach_protocol(struct ifnet *ifp)
        struct ifnet_attach_proto_param     reg;
 
        bzero(&reg, sizeof(reg));
-       reg.input = bond_input;
-       reg.event = bond_event;
-       reg.detached = bond_detached;
+       reg.input = bond_proto_input;
 
        error = ifnet_attach_protocol(ifp, PF_BOND, &reg);
        if (error) {
@@ -3099,6 +3270,33 @@ bond_detach_protocol(struct ifnet *ifp)
        return error;
 }
 
+/*
+ * Function: bond_attach_filter
+ * Purpose:
+ *   Attach our DLIL interface filter.
+ */
+static int
+bond_attach_filter(struct ifnet *ifp, interface_filter_t * filter_p)
+{
+       int                     error;
+       struct iff_filter       iff;
+
+       /*
+        * install an interface filter
+        */
+       memset(&iff, 0, sizeof(struct iff_filter));
+       iff.iff_name = "com.apple.kernel.bsd.net.if_bond";
+       iff.iff_input = bond_iff_input;
+       iff.iff_event = bond_iff_event;
+       iff.iff_detached = bond_iff_detached;
+       error = iflt_attach_internal(ifp, &iff, filter_p);
+       if (error != 0) {
+               printf("%s: iflt_attach_internal failed %d\n", __func__, error);
+       }
+       return error;
+}
+
+
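
[Editor's note] bond_attach_filter() goes through the private iflt_attach_internal() entry point; an out-of-tree filter would use the exported KPI instead. A minimal sketch under that assumption -- the com.example name and the empty callback bodies are placeholders, not code from this commit:

#include <sys/systm.h>
#include <net/kpi_interfacefilter.h>

static interface_filter_t example_filter_ref;

static errno_t
example_iff_input(void *cookie, ifnet_t ifp, protocol_family_t protocol,
    mbuf_t *data, char **frame_ptr)
{
#pragma unused(cookie, ifp, protocol, data, frame_ptr)
        return 0;                               /* 0: let the packet continue */
}

static void
example_iff_event(void *cookie, ifnet_t ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(cookie, ifp, protocol, event)
}

static void
example_iff_detached(void *cookie, ifnet_t ifp)
{
#pragma unused(cookie, ifp)
}

static errno_t
example_attach_filter(ifnet_t ifp)
{
        struct iff_filter       iff;

        memset(&iff, 0, sizeof(iff));
        iff.iff_name = "com.example.filter";    /* reverse-DNS by convention */
        iff.iff_input = example_iff_input;
        iff.iff_event = example_iff_event;
        iff.iff_detached = example_iff_detached;
        return iflt_attach(ifp, &iff, &example_filter_ref);
}
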
 /*
  * DLIL interface family functions
  */
@@ -3178,7 +3376,7 @@ ifbond_list_find_moved_port(bondport_ref rx_port,
                        if (ps->ps_port == lacp_actor_partner_tlv_get_port(atlv)
                            && bcmp(&ps_li->li_system, atlv->lap_system,
                            sizeof(ps_li->li_system)) == 0) {
-                               if (g_bond->verbose) {
+                               if (if_bond_debug) {
                                        timestamp_printf("System " EA_FORMAT
                                            " Port 0x%x moved from %s to %s\n",
                                            EA_LIST(&ps_li->li_system), ps->ps_port,
@@ -3219,7 +3417,7 @@ ifbond_selection(ifbond_ref bond)
                lag_changed = 1;
        } else if (lag != NULL) {
                if (lag->lag_active_media != active_media) {
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("LAG PORT SPEED CHANGED from %d to %d\n",
                                    link_speed(lag->lag_active_media),
                                    link_speed(active_media));
@@ -3484,15 +3682,24 @@ bondport_link_status_changed(bondport_ref p)
 {
        ifbond_ref  bond = p->po_bond;
 
-       if (g_bond->verbose) {
+       if (if_bond_debug) {
                if (media_active(&p->po_media_info)) {
+                       const char * duplex_string;
+
+                       if (media_full_duplex(&p->po_media_info)) {
+                               duplex_string = "full";
+                       } else if (media_type_unknown(&p->po_media_info)) {
+                               duplex_string = "unknown";
+                       } else {
+                               duplex_string = "half";
+                       }
                        timestamp_printf("[%s] Link UP %d Mbit/s %s duplex\n",
                            bondport_get_name(p),
                            media_speed(&p->po_media_info),
-                           media_full_duplex(&p->po_media_info)
-                           ? "full" : "half");
+                           duplex_string);
                } else {
-                       timestamp_printf("[%s] Link DOWN\n", bondport_get_name(p));
+                       timestamp_printf("[%s] Link DOWN\n",
+                           bondport_get_name(p));
                }
        }
        if (bond->ifb_mode == IF_BOND_MODE_LACP) {
@@ -3501,7 +3708,7 @@ bondport_link_status_changed(bondport_ref p)
                    && p->po_lag == bond->ifb_active_lag
                    && p->po_selected != SelectedState_UNSELECTED) {
                        if (media_speed(&p->po_media_info) != p->po_lag->lag_active_media) {
-                               if (g_bond->verbose) {
+                               if (if_bond_debug) {
                                        timestamp_printf("[%s] Port speed %d differs from LAG %d\n",
                                            bondport_get_name(p),
                                            media_speed(&p->po_media_info),
@@ -3538,7 +3745,7 @@ bondport_aggregatable(bondport_ref p)
        }
        switch (p->po_receive_state) {
        default:
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Port is not selectable\n",
                            bondport_get_name(p));
                }
@@ -3581,7 +3788,7 @@ bondport_remove_from_LAG(bondport_ref p)
                return 0;
        }
        TAILQ_REMOVE(&lag->lag_port_list, p, po_lag_port_list);
-       if (g_bond->verbose) {
+       if (if_bond_debug) {
                timestamp_printf("[%s] Removed from LAG (0x%04x," EA_FORMAT
                    ",0x%04x)\n",
                    bondport_get_name(p),
@@ -3594,7 +3801,7 @@ bondport_remove_from_LAG(bondport_ref p)
        if (lag->lag_port_count > 0) {
                return bond->ifb_active_lag == lag;
        }
-       if (g_bond->verbose) {
+       if (if_bond_debug) {
                timestamp_printf("Key 0x%04x: LAG Released (%04x," EA_FORMAT
                    ",0x%04x)\n",
                    bond->ifb_key,
@@ -3617,7 +3824,7 @@ bondport_add_to_LAG(bondport_ref p, LAG_ref lag)
        TAILQ_INSERT_TAIL(&lag->lag_port_list, p, po_lag_port_list);
        p->po_lag = lag;
        lag->lag_port_count++;
-       if (g_bond->verbose) {
+       if (if_bond_debug) {
                timestamp_printf("[%s] Added to LAG (0x%04x," EA_FORMAT "0x%04x)\n",
                    bondport_get_name(p),
                    lag->lag_info.li_system_priority,
@@ -3656,7 +3863,7 @@ bondport_assign_to_LAG(bondport_ref p)
        lag->lag_selected_port_count = 0;
        lag->lag_info = p->po_partner_state.ps_lag_info;
        TAILQ_INSERT_TAIL(&bond->ifb_lag_list, lag, lag_list);
-       if (g_bond->verbose) {
+       if (if_bond_debug) {
                timestamp_printf("Key 0x%04x: LAG Created (0x%04x," EA_FORMAT
                    ",0x%04x)\n",
                    bond->ifb_key,
@@ -3699,7 +3906,7 @@ bondport_set_selected(bondport_ref p, SelectedState s)
                        } else if (s == SelectedState_SELECTED) {
                                lag->lag_selected_port_count++;
                        }
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("[%s] SetSelected: %s (was %s)\n",
                                    bondport_get_name(p),
                                    SelectedStateString(s),
@@ -3753,7 +3960,7 @@ bondport_UpdateSelected(bondport_ref p, lacpdu_ref lacpdu_p)
            || (lacp_actor_partner_state_aggregatable(actor->lap_state)
            != lacp_actor_partner_state_aggregatable(ps->ps_state))) {
                bondport_set_selected(p, SelectedState_UNSELECTED);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] updateSelected UNSELECTED\n",
                            bondport_get_name(p));
                }
@@ -3791,7 +3998,7 @@ bondport_RecordPDU(bondport_ref p, lacpdu_ref lacpdu_p)
        if (lacp_actor_partner_state_active_lacp(ps->ps_state)
            || (lacp_actor_partner_state_active_lacp(p->po_actor_state)
            && lacp_actor_partner_state_active_lacp(partner->lap_state))) {
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] recordPDU: LACP will maintain\n",
                            bondport_get_name(p));
                }
@@ -3810,7 +4017,7 @@ bondport_RecordPDU(bondport_ref p, lacpdu_ref lacpdu_p)
            && lacp_actor_partner_state_in_sync(actor->lap_state)
            && lacp_maintain) {
                ps->ps_state = lacp_actor_partner_state_set_in_sync(ps->ps_state);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] recordPDU: LACP partner in sync\n",
                            bondport_get_name(p));
                }
@@ -3818,7 +4025,7 @@ bondport_RecordPDU(bondport_ref p, lacpdu_ref lacpdu_p)
            && lacp_actor_partner_state_in_sync(actor->lap_state)
            && lacp_maintain) {
                ps->ps_state = lacp_actor_partner_state_set_in_sync(ps->ps_state);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] recordPDU: LACP partner in sync (ind)\n",
                            bondport_get_name(p));
                }
@@ -3853,7 +4060,7 @@ bondport_UpdateNTT(bondport_ref p, lacpdu_ref lacpdu_p)
            || (updateNTTBits(partner->lap_state)
            != updateNTTBits(p->po_actor_state))) {
                bondport_flags_set_ntt(p);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] updateNTT: Need To Transmit\n",
                            bondport_get_name(p));
                }
@@ -3865,7 +4072,7 @@ static void
 bondport_AttachMuxToAggregator(bondport_ref p)
 {
        if (bondport_flags_mux_attached(p) == 0) {
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Attached Mux To Aggregator\n",
                            bondport_get_name(p));
                }
@@ -3878,7 +4085,7 @@ static void
 bondport_DetachMuxFromAggregator(bondport_ref p)
 {
        if (bondport_flags_mux_attached(p)) {
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Detached Mux From Aggregator\n",
                            bondport_get_name(p));
                }
@@ -3894,7 +4101,7 @@ bondport_enable_distributing(bondport_ref p)
                ifbond_ref      bond = p->po_bond;
 
                bond->ifb_distributing_array[bond->ifb_distributing_count++] = p;
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Distribution Enabled\n",
                            bondport_get_name(p));
                }
@@ -3926,7 +4133,7 @@ bondport_disable_distributing(bondport_ref p)
                        }
                }
                bond->ifb_distributing_count--;
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Distribution Disabled\n",
                            bondport_get_name(p));
                }
@@ -4027,7 +4234,7 @@ bondport_receive_machine_initialize(bondport_ref p, LAEvent event,
        switch (event) {
        case LAEventStart:
                devtimer_cancel(p->po_current_while_timer);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Receive INITIALIZE\n",
                            bondport_get_name(p));
                }
@@ -4053,7 +4260,7 @@ bondport_receive_machine_port_disabled(bondport_ref p, LAEvent event,
        switch (event) {
        case LAEventStart:
                devtimer_cancel(p->po_current_while_timer);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Receive PORT_DISABLED\n",
                            bondport_get_name(p));
                }
@@ -4063,7 +4270,7 @@ bondport_receive_machine_port_disabled(bondport_ref p, LAEvent event,
        /* FALL THROUGH */
        case LAEventMediaChange:
                if (media_active(&p->po_media_info)) {
-                       if (media_full_duplex(&p->po_media_info)) {
+                       if (media_ok(&p->po_media_info)) {
                                bondport_receive_machine_expired(p, LAEventStart, NULL);
                        } else {
                                bondport_receive_machine_lacp_disabled(p, LAEventStart, NULL);
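
[Editor's note] The receive machine (and the periodic-transmit machine below) now gates on media_ok() instead of media_full_duplex(), so a link whose duplex the driver cannot report is no longer forced down the LACP_DISABLED path. Consistent with the full/unknown/half logging added earlier in this diff, media_ok() plausibly reduces to the following; this is a hedged reconstruction, since the helper's definition falls outside the excerpted hunks:

/*
 * Hedged reconstruction of media_ok(): full duplex qualifies, unknown
 * duplex gets the benefit of the doubt, half duplex does not.
 */
static __inline__ int
media_ok(const struct media_info * mi)
{
        return media_full_duplex(mi) || media_type_unknown(mi);
}
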
@@ -4071,7 +4278,7 @@ bondport_receive_machine_port_disabled(bondport_ref p, LAEvent event,
                } else if (p->po_selected == SelectedState_SELECTED) {
                        struct timeval      tv;
 
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("[%s] Receive PORT_DISABLED: "
                                    "link timer started\n",
                                    bondport_get_name(p));
@@ -4088,7 +4295,7 @@ bondport_receive_machine_port_disabled(bondport_ref p, LAEvent event,
                break;
        case LAEventTimeout:
                if (p->po_selected == SelectedState_SELECTED) {
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("[%s] Receive PORT_DISABLED: "
                                    "link timer completed, marking UNSELECTED\n",
                                    bondport_get_name(p));
@@ -4115,7 +4322,7 @@ bondport_receive_machine_expired(bondport_ref p, LAEvent event,
        switch (event) {
        case LAEventStart:
                devtimer_cancel(p->po_current_while_timer);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Receive EXPIRED\n",
                            bondport_get_name(p));
                }
@@ -4152,7 +4359,7 @@ bondport_receive_machine_lacp_disabled(bondport_ref p, LAEvent event,
        switch (event) {
        case LAEventStart:
                devtimer_cancel(p->po_current_while_timer);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Receive LACP_DISABLED\n",
                            bondport_get_name(p));
                }
@@ -4177,7 +4384,7 @@ bondport_receive_machine_defaulted(bondport_ref p, LAEvent event,
        switch (event) {
        case LAEventStart:
                devtimer_cancel(p->po_current_while_timer);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Receive DEFAULTED\n",
                            bondport_get_name(p));
                }
@@ -4203,7 +4410,7 @@ bondport_receive_machine_current(bondport_ref p, LAEvent event,
        switch (event) {
        case LAEventPacket:
                devtimer_cancel(p->po_current_while_timer);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Receive CURRENT\n",
                            bondport_get_name(p));
                }
@@ -4250,7 +4457,7 @@ bondport_periodic_transmit_machine(bondport_ref p, LAEvent event,
 
        switch (event) {
        case LAEventStart:
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] periodic_transmit Start\n",
                            bondport_get_name(p));
                }
@@ -4259,7 +4466,7 @@ bondport_periodic_transmit_machine(bondport_ref p, LAEvent event,
                devtimer_cancel(p->po_periodic_timer);
                p->po_periodic_interval = 0;
                if (media_active(&p->po_media_info) == 0
-                   || media_full_duplex(&p->po_media_info) == 0) {
+                   || media_ok(&p->po_media_info) == 0) {
                        break;
                }
        case LAEventPacket:
@@ -4280,7 +4487,7 @@ bondport_periodic_transmit_machine(bondport_ref p, LAEvent event,
                if (p->po_periodic_interval != interval) {
                        if (interval == LACP_FAST_PERIODIC_TIME
                            && p->po_periodic_interval == LACP_SLOW_PERIODIC_TIME) {
-                               if (g_bond->verbose) {
+                               if (if_bond_debug) {
                                        timestamp_printf("[%s] periodic_transmit:"
                                            " Need To Transmit\n",
                                            bondport_get_name(p));
@@ -4294,7 +4501,7 @@ bondport_periodic_transmit_machine(bondport_ref p, LAEvent event,
                            (devtimer_timeout_func)
                            bondport_periodic_transmit_machine,
                            (void *)LAEventTimeout, NULL);
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("[%s] Periodic Transmission Timer: %d secs\n",
                                    bondport_get_name(p),
                                    p->po_periodic_interval);
@@ -4308,7 +4515,7 @@ bondport_periodic_transmit_machine(bondport_ref p, LAEvent event,
                devtimer_set_relative(p->po_periodic_timer, tv, (devtimer_timeout_func)
                    bondport_periodic_transmit_machine,
                    (void *)LAEventTimeout, NULL);
-               if (g_bond->verbose > 1) {
+               if (if_bond_debug > 1) {
                        timestamp_printf("[%s] Periodic Transmission Timer: %d secs\n",
                            bondport_get_name(p), p->po_periodic_interval);
                }
@@ -4346,7 +4553,7 @@ bondport_transmit_machine(bondport_ref p, LAEvent event,
 {
        lacp_actor_partner_tlv_ref  aptlv;
        lacp_collector_tlv_ref      ctlv;
-       struct timeval              next_tick_time = {0, 0};
+       struct timeval              next_tick_time = {.tv_sec = 0, .tv_usec = 0};
        lacpdu_ref          out_lacpdu_p;
        packet_buffer_ref           pkt;
        partner_state_ref           ps;
@@ -4363,7 +4570,7 @@ bondport_transmit_machine(bondport_ref p, LAEvent event,
                } else if (bondport_can_transmit(p, devtimer_current_secs(),
                    &next_tick_time.tv_sec) == 0) {
                        if (devtimer_enabled(p->po_transmit_timer)) {
-                               if (g_bond->verbose > 0) {
+                               if (if_bond_debug > 0) {
                                        timestamp_printf("[%s] Transmit Timer Already Set\n",
                                            bondport_get_name(p));
                                }
@@ -4372,7 +4579,7 @@ bondport_transmit_machine(bondport_ref p, LAEvent event,
                                    (devtimer_timeout_func)
                                    bondport_transmit_machine,
                                    (void *)LAEventTimeout, NULL);
-                               if (g_bond->verbose > 0) {
+                               if (if_bond_debug > 0) {
                                        timestamp_printf("[%s] Transmit Timer Deadline %d secs\n",
                                            bondport_get_name(p),
                                            (int)next_tick_time.tv_sec);
@@ -4380,7 +4587,7 @@ bondport_transmit_machine(bondport_ref p, LAEvent event,
                        }
                        break;
                }
-               if (g_bond->verbose > 0) {
+               if (if_bond_debug > 0) {
                        if (event == LAEventTimeout) {
                                timestamp_printf("[%s] Transmit Timer Complete\n",
                                    bondport_get_name(p));
@@ -4430,7 +4637,7 @@ bondport_transmit_machine(bondport_ref p, LAEvent event,
 
                bondport_slow_proto_transmit(p, pkt);
                bondport_flags_clear_ntt(p);
-               if (g_bond->verbose > 0) {
+               if (if_bond_debug > 0) {
                        timestamp_printf("[%s] Transmit Packet %d\n",
                            bondport_get_name(p), p->po_n_transmit);
                }
@@ -4493,7 +4700,7 @@ bondport_mux_machine_detached(bondport_ref p, LAEvent event,
        switch (event) {
        case LAEventStart:
                devtimer_cancel(p->po_wait_while_timer);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Mux DETACHED\n",
                            bondport_get_name(p));
                }
@@ -4531,7 +4738,7 @@ bondport_mux_machine_waiting(bondport_ref p, LAEvent event,
        switch (event) {
        case LAEventStart:
                devtimer_cancel(p->po_wait_while_timer);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Mux WAITING\n",
                            bondport_get_name(p));
                }
@@ -4546,21 +4753,21 @@ bondport_mux_machine_waiting(bondport_ref p, LAEvent event,
                if (p->po_selected == SelectedState_STANDBY) {
                        devtimer_cancel(p->po_wait_while_timer);
                        /* wait until state changes to SELECTED */
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("[%s] Mux WAITING: Standby\n",
                                    bondport_get_name(p));
                        }
                        break;
                }
                if (bondport_flags_ready(p)) {
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("[%s] Mux WAITING: Port is already ready\n",
                                    bondport_get_name(p));
                        }
                        break;
                }
                if (devtimer_enabled(p->po_wait_while_timer)) {
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("[%s] Mux WAITING: Timer already set\n",
                                    bondport_get_name(p));
                        }
@@ -4568,14 +4775,14 @@ bondport_mux_machine_waiting(bondport_ref p, LAEvent event,
                }
                if (ifbond_all_ports_attached(p->po_bond, p)) {
                        devtimer_cancel(p->po_wait_while_timer);
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("[%s] Mux WAITING: No waiting\n",
                                    bondport_get_name(p));
                        }
                        bondport_flags_set_ready(p);
                        goto no_waiting;
                }
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Mux WAITING: 2 seconds\n",
                            bondport_get_name(p));
                }
@@ -4587,7 +4794,7 @@ bondport_mux_machine_waiting(bondport_ref p, LAEvent event,
                    (void *)LAEventTimeout, NULL);
                break;
        case LAEventTimeout:
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Mux WAITING: Ready\n",
                            bondport_get_name(p));
                }
@@ -4596,7 +4803,7 @@ bondport_mux_machine_waiting(bondport_ref p, LAEvent event,
        case LAEventReady:
 no_waiting:
                if (bondport_flags_ready(p)) {
-                       if (g_bond->verbose) {
+                       if (if_bond_debug) {
                                timestamp_printf("[%s] Mux WAITING: All Ports Ready\n",
                                    bondport_get_name(p));
                        }
@@ -4617,7 +4824,7 @@ bondport_mux_machine_attached(bondport_ref p, LAEvent event,
        switch (event) {
        case LAEventStart:
                devtimer_cancel(p->po_wait_while_timer);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Mux ATTACHED\n",
                            bondport_get_name(p));
                }
@@ -4659,7 +4866,7 @@ bondport_mux_machine_collecting_distributing(bondport_ref p,
        switch (event) {
        case LAEventStart:
                devtimer_cancel(p->po_wait_while_timer);
-               if (g_bond->verbose) {
+               if (if_bond_debug) {
                        timestamp_printf("[%s] Mux COLLECTING_DISTRIBUTING\n",
                            bondport_get_name(p));
                }
index ca97c63dd6814ba2d2ce746d21c306cb7651c65f..a3c90194dad3364e0aab539b1ec7fceef12bddfc 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define BR_DBGF_MBUF            0x0040
 #define BR_DBGF_MCAST           0x0080
 #define BR_DBGF_HOSTFILTER      0x0100
+#define BR_DBGF_CHECKSUM        0x0200
 #endif /* BRIDGE_DEBUG */
 
 #define _BRIDGE_LOCK(_sc)               lck_mtx_lock(&(_sc)->sc_mtx)
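
[Editor's note] BR_DBGF_CHECKSUM joins the existing per-category debug bits. The flags form a bitmask, so categories combine with OR; a hedged example (the if_bridge_debug variable name is assumed from surrounding code, and the block is only meaningful under BRIDGE_DEBUG):

#if BRIDGE_DEBUG
static void
example_enable_bridge_debug(void)
{
        /* Illustrative: trace mbuf handling and the new checksum path. */
        if_bridge_debug = BR_DBGF_MBUF | BR_DBGF_CHECKSUM;      /* 0x0240 */
}
#endif /* BRIDGE_DEBUG */
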
@@ -323,6 +324,7 @@ struct bridge_iflist {
 #define BIFF_HOST_FILTER        0x10    /* host filter enabled */
 #define BIFF_HF_HWSRC           0x20    /* host filter source MAC is set */
 #define BIFF_HF_IPSRC           0x40    /* host filter source IP is set */
+#define BIFF_INPUT_BROADCAST    0x80    /* send broadcast packets in */
 
 /*
  * Bridge route node.
@@ -355,44 +357,26 @@ struct bridge_delayed_call {
 #define BDCF_OUTSTANDING        0x01    /* Delayed call has been scheduled */
 #define BDCF_CANCELLING         0x02    /* May be waiting for call completion */
 
-
 /*
  * Software state for each bridge.
  */
 LIST_HEAD(_bridge_rtnode_list, bridge_rtnode);
 
-typedef struct {
-       struct _bridge_rtnode_list *bb_rthash;  /* our forwarding table */
-       struct _bridge_rtnode_list bb_rtlist;   /* list version of above */
-       uint32_t                bb_rthash_key;  /* key for hash */
-       uint32_t                bb_rthash_size; /* size of the hash table */
-       struct bridge_delayed_call bb_aging_timer;
-       struct bridge_delayed_call bb_resize_call;
-       TAILQ_HEAD(, bridge_iflist) bb_spanlist;        /* span ports list */
-       struct bstp_state       bb_stp;         /* STP state */
-       bpf_packet_func         bb_bpf_input;
-       bpf_packet_func         bb_bpf_output;
-} bridge_bsd, *bridge_bsd_t;
-
-#define sc_rthash       sc_u.scu_bsd.bb_rthash
-#define sc_rtlist       sc_u.scu_bsd.bb_rtlist
-#define sc_rthash_key   sc_u.scu_bsd.bb_rthash_key
-#define sc_rthash_size  sc_u.scu_bsd.bb_rthash_size
-#define sc_aging_timer  sc_u.scu_bsd.bb_aging_timer
-#define sc_resize_call  sc_u.scu_bsd.bb_resize_call
-#define sc_spanlist     sc_u.scu_bsd.bb_spanlist
-#define sc_stp          sc_u.scu_bsd.bb_stp
-#define sc_bpf_input    sc_u.scu_bsd.bb_bpf_input
-#define sc_bpf_output   sc_u.scu_bsd.bb_bpf_output
-
 struct bridge_softc {
        struct ifnet            *sc_ifp;        /* make this an interface */
        u_int32_t               sc_flags;
-       union {
-               bridge_bsd      scu_bsd;
-       } sc_u;
        LIST_ENTRY(bridge_softc) sc_list;
        decl_lck_mtx_data(, sc_mtx);
+       struct _bridge_rtnode_list *sc_rthash;  /* our forwarding table */
+       struct _bridge_rtnode_list sc_rtlist;   /* list version of above */
+       uint32_t                sc_rthash_key;  /* key for hash */
+       uint32_t                sc_rthash_size; /* size of the hash table */
+       struct bridge_delayed_call sc_aging_timer;
+       struct bridge_delayed_call sc_resize_call;
+       TAILQ_HEAD(, bridge_iflist) sc_spanlist;        /* span ports list */
+       struct bstp_state       sc_stp;         /* STP state */
+       bpf_packet_func         sc_bpf_input;
+       bpf_packet_func         sc_bpf_output;
        void                    *sc_cv;
        uint32_t                sc_brtmax;      /* max # of addresses */
        uint32_t                sc_brtcnt;      /* cur. # of addresses */
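
[Editor's note] With the bsd-mode union gone, the forwarding-table, STP, and BPF fields become direct members of struct bridge_softc and the sc_* macro aliases above are deleted. A pared-down illustration of the before/after access pattern (struct names invented for the sketch):

/* Before: one level of union plus a #define alias per field. */
struct softc_old {
        union {
                struct { uint32_t bb_rthash_key; } scu_bsd;
        } sc_u;
};
#define old_rthash_key  sc_u.scu_bsd.bb_rthash_key  /* sc->old_rthash_key */

/* After: a plain member; no alias, and only one mode to reason about. */
struct softc_new {
        uint32_t        sc_rthash_key;
};
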
@@ -420,19 +404,13 @@ struct bridge_softc {
 #define SCF_DETACHING 0x01
 #define SCF_RESIZING 0x02
 #define SCF_MEDIA_ACTIVE 0x04
-#define SCF_BSD_MODE    0x08
-
-static inline void
-bridge_set_bsd_mode(struct bridge_softc * sc)
-{
-       sc->sc_flags |= SCF_BSD_MODE;
-}
 
-static inline boolean_t
-bridge_in_bsd_mode(const struct bridge_softc * sc)
-{
-       return (sc->sc_flags & SCF_BSD_MODE) != 0;
-}
+typedef enum {
+       kChecksumOperationNone = 0,
+       kChecksumOperationClear = 1,
+       kChecksumOperationFinalize = 2,
+       kChecksumOperationCompute = 3,
+} ChecksumOperation;
 
 struct bridge_hostfilter_stats bridge_hostfilter_stats;
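
[Editor's note] The ChecksumOperation enum parameterizes the reworked bridge_enqueue() (its new prototype appears below), letting callers say whether a forwarded packet's checksums should be left alone, cleared, finalized, or computed. A hedged sketch of how such a selector is typically consumed; the helper is illustrative, not the function this commit adds:

/*
 * Hedged sketch only: dispatch on a ChecksumOperation.  In this commit
 * the real work happens inside bridge_enqueue()/bridge_finalize_cksum().
 */
static void
example_apply_checksum_op(struct mbuf *m, ChecksumOperation op)
{
        switch (op) {
        case kChecksumOperationClear:
                /* drop stale offload state from the inbound interface */
                m->m_pkthdr.csum_flags = 0;
                break;
        case kChecksumOperationFinalize:
                /* fill in checksums that were deferred to hardware */
                break;
        case kChecksumOperationCompute:
                /* compute IP/TCP/UDP checksums in software */
                break;
        case kChecksumOperationNone:
        default:
                break;
        }
}
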
 
@@ -452,7 +430,8 @@ static void     bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *,
     int);
 #endif
 static errno_t bridge_set_tso(struct bridge_softc *);
-__private_extern__ void bridge_ifdetach(struct bridge_iflist *, struct ifnet *);
+static void     bridge_ifdetach(struct ifnet *);
+static void     bridge_proto_attach_changed(struct ifnet *);
 static int      bridge_init(struct ifnet *);
 #if HAS_BRIDGE_DUMMYNET
 static void     bridge_dummynet(struct mbuf *, struct ifnet *);
@@ -462,14 +441,13 @@ static int      bridge_output(struct ifnet *, struct mbuf *);
 static void     bridge_finalize_cksum(struct ifnet *, struct mbuf *);
 static void     bridge_start(struct ifnet *);
 __private_extern__ errno_t bridge_input(struct ifnet *, struct mbuf *, void *);
-#if BRIDGE_MEMBER_OUT_FILTER
-static errno_t bridge_iff_output(void *, ifnet_t, protocol_family_t,
+static errno_t  bridge_iff_output(void *, ifnet_t, protocol_family_t,
     mbuf_t *);
-static int      bridge_member_output(struct ifnet *, struct mbuf *,
-    struct sockaddr *, struct rtentry *);
-#endif
+static errno_t  bridge_member_output(struct bridge_softc *sc, ifnet_t ifp,
+    mbuf_t m);
+
 static int      bridge_enqueue(struct bridge_softc *, struct ifnet *,
-    struct mbuf *);
+    struct ifnet *, struct mbuf *, ChecksumOperation);
 static void     bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int);
 
 static void     bridge_forward(struct bridge_softc *, struct bridge_iflist *,
@@ -586,7 +564,6 @@ static void bridge_cancel_delayed_call(struct bridge_delayed_call *);
 static void bridge_cleanup_delayed_call(struct bridge_delayed_call *);
 static int bridge_host_filter(struct bridge_iflist *, struct mbuf *);
 
-
 #define m_copypacket(m, how) m_copym(m, 0, M_COPYALL, how)
 
 /* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */
@@ -634,14 +611,6 @@ SYSCTL_INT(_net_link_bridge, OID_AUTO, delayed_callback_delay,
     "Delay before calling delayed function");
 #endif
 
-static int bridge_bsd_mode = 1;
-#if (DEVELOPMENT || DEBUG)
-SYSCTL_INT(_net_link_bridge, OID_AUTO, bsd_mode,
-    CTLFLAG_RW | CTLFLAG_LOCKED,
-    &bridge_bsd_mode, 0,
-    "Bridge using bsd mode");
-#endif /* (DEVELOPMENT || DEBUG) */
-
 SYSCTL_STRUCT(_net_link_bridge, OID_AUTO,
     hostfilterstats, CTLFLAG_RD | CTLFLAG_LOCKED,
     &bridge_hostfilter_stats, bridge_hostfilter_stats, "");
@@ -684,199 +653,199 @@ struct bridge_control {
 #define BC_F_SUSER              0x04    /* do super-user check */
 
 static const struct bridge_control bridge_control_table32[] = {
-       { bridge_ioctl_add, sizeof(struct ifbreq),              /* 0 */
-         BC_F_COPYIN | BC_F_SUSER },
-       { bridge_ioctl_del, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_add, .bc_argsize = sizeof(struct ifbreq),             /* 0 */
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_del, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gifflags, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_COPYOUT },
-       { bridge_ioctl_sifflags, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gifflags, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sifflags, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_scache, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
-       { bridge_ioctl_gcache, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_scache, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gcache, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
 
-       { bridge_ioctl_gifs32, sizeof(struct ifbifconf32),
-         BC_F_COPYIN | BC_F_COPYOUT },
-       { bridge_ioctl_rts32, sizeof(struct ifbaconf32),
-         BC_F_COPYIN | BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_gifs32, .bc_argsize = sizeof(struct ifbifconf32),
+         .bc_flags = BC_F_COPYIN | BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_rts32, .bc_argsize = sizeof(struct ifbaconf32),
+         .bc_flags = BC_F_COPYIN | BC_F_COPYOUT },
 
-       { bridge_ioctl_saddr32, sizeof(struct ifbareq32),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_saddr32, .bc_argsize = sizeof(struct ifbareq32),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_sto, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
-       { bridge_ioctl_gto, sizeof(struct ifbrparam),              /* 10 */
-         BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sto, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gto, .bc_argsize = sizeof(struct ifbrparam),           /* 10 */
+         .bc_flags = BC_F_COPYOUT },
 
-       { bridge_ioctl_daddr32, sizeof(struct ifbareq32),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_daddr32, .bc_argsize = sizeof(struct ifbareq32),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_flush, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_flush, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gpri, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
-       { bridge_ioctl_spri, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gpri, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_spri, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_ght, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
-       { bridge_ioctl_sht, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_ght, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sht, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gfd, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
-       { bridge_ioctl_sfd, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gfd, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sfd, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gma, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
-       { bridge_ioctl_sma, sizeof(struct ifbrparam),              /* 20 */
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gma, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sma, .bc_argsize = sizeof(struct ifbrparam),           /* 20 */
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_sifprio, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_sifprio, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_sifcost, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_sifcost, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gfilt, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
-       { bridge_ioctl_sfilt, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gfilt, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sfilt, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_purge, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_purge, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_addspan, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
-       { bridge_ioctl_delspan, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_addspan, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_delspan, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gbparam32, sizeof(struct ifbropreq32),
-         BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_gbparam32, .bc_argsize = sizeof(struct ifbropreq32),
+         .bc_flags = BC_F_COPYOUT },
 
-       { bridge_ioctl_grte, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_grte, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
 
-       { bridge_ioctl_gifsstp32, sizeof(struct ifbpstpconf32),        /* 30 */
-         BC_F_COPYIN | BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_gifsstp32, .bc_argsize = sizeof(struct ifbpstpconf32),     /* 30 */
+         .bc_flags = BC_F_COPYIN | BC_F_COPYOUT },
 
-       { bridge_ioctl_sproto, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_sproto, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_stxhc, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_stxhc, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_sifmaxaddr, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_ghostfilter, sizeof(struct ifbrhostfilter),
-         BC_F_COPYIN | BC_F_COPYOUT },
-       { bridge_ioctl_shostfilter, sizeof(struct ifbrhostfilter),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_ghostfilter, .bc_argsize = sizeof(struct ifbrhostfilter),
+         .bc_flags = BC_F_COPYIN | BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_shostfilter, .bc_argsize = sizeof(struct ifbrhostfilter),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 };
 
 static const struct bridge_control bridge_control_table64[] = {
-       { bridge_ioctl_add, sizeof(struct ifbreq),              /* 0 */
-         BC_F_COPYIN | BC_F_SUSER },
-       { bridge_ioctl_del, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_add, .bc_argsize = sizeof(struct ifbreq),           /* 0 */
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_del, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gifflags, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_COPYOUT },
-       { bridge_ioctl_sifflags, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gifflags, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sifflags, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_scache, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
-       { bridge_ioctl_gcache, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_scache, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gcache, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
 
-       { bridge_ioctl_gifs64, sizeof(struct ifbifconf64),
-         BC_F_COPYIN | BC_F_COPYOUT },
-       { bridge_ioctl_rts64, sizeof(struct ifbaconf64),
-         BC_F_COPYIN | BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_gifs64, .bc_argsize = sizeof(struct ifbifconf64),
+         .bc_flags = BC_F_COPYIN | BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_rts64, .bc_argsize = sizeof(struct ifbaconf64),
+         .bc_flags = BC_F_COPYIN | BC_F_COPYOUT },
 
-       { bridge_ioctl_saddr64, sizeof(struct ifbareq64),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_saddr64, .bc_argsize = sizeof(struct ifbareq64),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_sto, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
-       { bridge_ioctl_gto, sizeof(struct ifbrparam),              /* 10 */
-         BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sto, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gto, .bc_argsize = sizeof(struct ifbrparam),           /* 10 */
+         .bc_flags = BC_F_COPYOUT },
 
-       { bridge_ioctl_daddr64, sizeof(struct ifbareq64),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_daddr64, .bc_argsize = sizeof(struct ifbareq64),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_flush, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_flush, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gpri, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
-       { bridge_ioctl_spri, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gpri, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_spri, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_ght, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
-       { bridge_ioctl_sht, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_ght, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sht, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gfd, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
-       { bridge_ioctl_sfd, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gfd, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sfd, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gma, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
-       { bridge_ioctl_sma, sizeof(struct ifbrparam),              /* 20 */
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gma, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sma, .bc_argsize = sizeof(struct ifbrparam),           /* 20 */
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_sifprio, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_sifprio, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_sifcost, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_sifcost, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gfilt, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
-       { bridge_ioctl_sfilt, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_gfilt, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_sfilt, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_purge, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_purge, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_addspan, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
-       { bridge_ioctl_delspan, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_addspan, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_delspan, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_gbparam64, sizeof(struct ifbropreq64),
-         BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_gbparam64, .bc_argsize = sizeof(struct ifbropreq64),
+         .bc_flags = BC_F_COPYOUT },
 
-       { bridge_ioctl_grte, sizeof(struct ifbrparam),
-         BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_grte, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYOUT },
 
-       { bridge_ioctl_gifsstp64, sizeof(struct ifbpstpconf64),        /* 30 */
-         BC_F_COPYIN | BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_gifsstp64, .bc_argsize = sizeof(struct ifbpstpconf64),     /* 30 */
+         .bc_flags = BC_F_COPYIN | BC_F_COPYOUT },
 
-       { bridge_ioctl_sproto, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_sproto, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_stxhc, sizeof(struct ifbrparam),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_stxhc, .bc_argsize = sizeof(struct ifbrparam),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_sifmaxaddr, .bc_argsize = sizeof(struct ifbreq),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 
-       { bridge_ioctl_ghostfilter, sizeof(struct ifbrhostfilter),
-         BC_F_COPYIN | BC_F_COPYOUT },
-       { bridge_ioctl_shostfilter, sizeof(struct ifbrhostfilter),
-         BC_F_COPYIN | BC_F_SUSER },
+       { .bc_func = bridge_ioctl_ghostfilter, .bc_argsize = sizeof(struct ifbrhostfilter),
+         .bc_flags = BC_F_COPYIN | BC_F_COPYOUT },
+       { .bc_func = bridge_ioctl_shostfilter, .bc_argsize = sizeof(struct ifbrhostfilter),
+         .bc_flags = BC_F_COPYIN | BC_F_SUSER },
 };
 
 static const unsigned int bridge_control_table_size =
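
[Editor's note] Both tables are indexed by ioctl ordinal (the /* 0 */, /* 10 */, ... markers), and each entry's bc_flags tells the dispatcher which marshalling and privilege checks to perform around bc_func. A hedged sketch of that dispatch pattern -- not the kernel's actual bridge_ioctl(), with locking, 32/64-bit table selection, and args sizing simplified:

/*
 * Hedged sketch of the dispatch these tables drive.
 */
static int
example_bridge_dispatch(struct bridge_softc *sc,
    const struct bridge_control *bc, user_addr_t user_data)
{
        char    args[128];      /* assumed >= bc->bc_argsize for the sketch */
        int     error;

        if ((bc->bc_flags & BC_F_SUSER) != 0) {
                error = proc_suser(current_proc());     /* super-user check */
                if (error != 0) {
                        return error;
                }
        }
        if ((bc->bc_flags & BC_F_COPYIN) != 0) {
                error = copyin(user_data, args, bc->bc_argsize);
                if (error != 0) {
                        return error;
                }
        }
        error = (*bc->bc_func)(sc, args);
        if (error == 0 && (bc->bc_flags & BC_F_COPYOUT) != 0) {
                error = copyout(args, user_data, bc->bc_argsize);
        }
        return error;
}
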
@@ -1279,10 +1248,6 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params)
        sc->sc_filter_flags &= ~IFBF_FILT_USEIPF;
 #endif
 
-       if (bridge_bsd_mode != 0) {
-               bridge_set_bsd_mode(sc);
-       }
-
        TAILQ_INIT(&sc->sc_iflist);
 
        /* use the interface name as the unique id for ifp recycle */
@@ -1291,23 +1256,21 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params)
        bzero(&init_params, sizeof(init_params));
        init_params.ver                 = IFNET_INIT_CURRENT_VERSION;
        init_params.len                 = sizeof(init_params);
-       if (bridge_in_bsd_mode(sc)) {
-               /* Initialize our routing table. */
-               error = bridge_rtable_init(sc);
-               if (error != 0) {
-                       printf("%s: bridge_rtable_init failed %d\n",
-                           __func__, error);
-                       goto done;
-               }
-               TAILQ_INIT(&sc->sc_spanlist);
-               if (if_bridge_txstart) {
-                       init_params.start = bridge_start;
-               } else {
-                       init_params.flags = IFNET_INIT_LEGACY;
-                       init_params.output = bridge_output;
-               }
-               init_params.set_bpf_tap = bridge_set_bpf_tap;
+       /* Initialize our routing table. */
+       error = bridge_rtable_init(sc);
+       if (error != 0) {
+               printf("%s: bridge_rtable_init failed %d\n",
+                   __func__, error);
+               goto done;
+       }
+       TAILQ_INIT(&sc->sc_spanlist);
+       if (if_bridge_txstart) {
+               init_params.start = bridge_start;
+       } else {
+               init_params.flags = IFNET_INIT_LEGACY;
+               init_params.output = bridge_output;
        }
+       init_params.set_bpf_tap = bridge_set_bpf_tap;
        init_params.uniqueid            = sc->sc_if_xname;
        init_params.uniqueid_len        = strlen(sc->sc_if_xname);
        init_params.sndq_maxlen         = IFQ_MAXLEN;
@@ -1326,22 +1289,19 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params)
        init_params.broadcast_addr      = etherbroadcastaddr;
        init_params.broadcast_len       = ETHER_ADDR_LEN;
 
-       if (bridge_in_bsd_mode(sc)) {
-               error = ifnet_allocate_extended(&init_params, &ifp);
-               if (error != 0) {
-                       printf("%s: ifnet_allocate failed %d\n",
-                           __func__, error);
-                       goto done;
-               }
-               sc->sc_ifp = ifp;
-               error = bridge_ifnet_set_attrs(ifp);
-               if (error != 0) {
-                       printf("%s: bridge_ifnet_set_attrs failed %d\n",
-                           __func__, error);
-                       goto done;
-               }
+       error = ifnet_allocate_extended(&init_params, &ifp);
+       if (error != 0) {
+               printf("%s: ifnet_allocate failed %d\n",
+                   __func__, error);
+               goto done;
+       }
+       sc->sc_ifp = ifp;
+       error = bridge_ifnet_set_attrs(ifp);
+       if (error != 0) {
+               printf("%s: bridge_ifnet_set_attrs failed %d\n",
+                   __func__, error);
+               goto done;
        }
-
        /*
         * Generate an ethernet address with a locally administered address.
         *
@@ -1397,12 +1357,10 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params)
                link_print(sc);
        }
 #endif
-       if (bridge_in_bsd_mode(sc)) {
-               error = ifnet_attach(ifp, NULL);
-               if (error != 0) {
-                       printf("%s: ifnet_attach failed %d\n", __func__, error);
-                       goto done;
-               }
+       error = ifnet_attach(ifp, NULL);
+       if (error != 0) {
+               printf("%s: ifnet_attach failed %d\n", __func__, error);
+               goto done;
        }
 
        error = ifnet_set_lladdr_and_type(ifp, sc->sc_defaddr, ETHER_ADDR_LEN,
@@ -1413,20 +1371,18 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params)
                goto done;
        }
 
-       if (bridge_in_bsd_mode(sc)) {
-               ifnet_set_offload(ifp,
-                   IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP |
-                   IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_MULTIPAGES);
-               error = bridge_set_tso(sc);
-               if (error != 0) {
-                       printf("%s: bridge_set_tso failed %d\n",
-                           __func__, error);
-                       goto done;
-               }
+       ifnet_set_offload(ifp,
+           IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP |
+           IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_MULTIPAGES);
+       error = bridge_set_tso(sc);
+       if (error != 0) {
+               printf("%s: bridge_set_tso failed %d\n",
+                   __func__, error);
+               goto done;
+       }
 #if BRIDGESTP
-               bstp_attach(&sc->sc_stp, &bridge_ops);
+       bstp_attach(&sc->sc_stp, &bridge_ops);
 #endif /* BRIDGESTP */
-       }
 
        lck_mtx_lock(&bridge_list_mtx);
        LIST_INSERT_HEAD(&bridge_list, sc, sc_list);
@@ -1466,12 +1422,10 @@ bridge_clone_destroy(struct ifnet *ifp)
 
        bridge_ifstop(ifp, 1);
 
-       if (bridge_in_bsd_mode(sc)) {
-               bridge_cancel_delayed_call(&sc->sc_resize_call);
+       bridge_cancel_delayed_call(&sc->sc_resize_call);
 
-               bridge_cleanup_delayed_call(&sc->sc_resize_call);
-               bridge_cleanup_delayed_call(&sc->sc_aging_timer);
-       }
+       bridge_cleanup_delayed_call(&sc->sc_resize_call);
+       bridge_cleanup_delayed_call(&sc->sc_aging_timer);
 
        error = ifnet_set_flags(ifp, 0, IFF_UP);
        if (error != 0) {
@@ -1482,12 +1436,10 @@ bridge_clone_destroy(struct ifnet *ifp)
                bridge_delete_member(sc, bif, 0);
        }
 
-       if (bridge_in_bsd_mode(sc)) {
-               while ((bif = TAILQ_FIRST(&sc->sc_spanlist)) != NULL) {
-                       bridge_delete_span(sc, bif);
-               }
-               BRIDGE_UNLOCK(sc);
+       while ((bif = TAILQ_FIRST(&sc->sc_spanlist)) != NULL) {
+               bridge_delete_span(sc, bif);
        }
+       BRIDGE_UNLOCK(sc);
 
        error = ifnet_detach(ifp);
        if (error != 0) {
@@ -1995,7 +1947,6 @@ out:
        return error;
 }
 
-#if BRIDGE_MEMBER_OUT_FILTER
 static errno_t
 bridge_iff_output(void *cookie, ifnet_t ifp, protocol_family_t protocol,
     mbuf_t *data)
@@ -2020,17 +1971,15 @@ bridge_iff_output(void *cookie, ifnet_t ifp, protocol_family_t protocol,
 #endif /* BRIDGE_DEBUG */
 
        error = bridge_member_output(sc, ifp, m);
-       if (error != 0) {
+       if (error != 0 && error != EJUSTRETURN) {
                printf("%s: bridge_member_output failed error %d\n", __func__,
                    error);
        }
-
 out:
        BRIDGE_LOCK_ASSERT_NOTHELD(sc);
 
        return error;
 }
-#endif /* BRIDGE_MEMBER_OUT_FILTER */
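
[Editor's note] The EJUSTRETURN exemption added above reflects the DLIL convention: a filter's output handler returns EJUSTRETURN when it has consumed the packet, so the caller must neither log it as a failure nor touch the mbuf again. A hedged sketch of a handler using that convention (the divert helpers are placeholders):

static boolean_t example_should_divert(mbuf_t m);       /* placeholder */
static void example_divert(mbuf_t m);                   /* placeholder */

static errno_t
example_iff_output(void *cookie, ifnet_t ifp, protocol_family_t protocol,
    mbuf_t *data)
{
#pragma unused(cookie, ifp, protocol)
        if (example_should_divert(*data)) {
                example_divert(*data);          /* filter now owns the mbuf */
                *data = NULL;
                return EJUSTRETURN;             /* not an error: stop output */
        }
        return 0;                               /* continue normal output */
}
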
 
 static void
 bridge_iff_event(void *cookie, ifnet_t ifp, protocol_family_t protocol,
@@ -2054,7 +2003,7 @@ bridge_iff_event(void *cookie, ifnet_t ifp, protocol_family_t protocol,
                switch (event_msg->event_code) {
                case KEV_DL_IF_DETACHING:
                case KEV_DL_IF_DETACHED: {
-                       bridge_ifdetach(bif, ifp);
+                       bridge_ifdetach(ifp);
                        break;
                }
                case KEV_DL_LINK_OFF:
@@ -2089,6 +2038,11 @@ bridge_iff_event(void *cookie, ifnet_t ifp, protocol_family_t protocol,
                        BRIDGE_UNLOCK(sc);
                        break;
                }
+               case KEV_DL_PROTO_DETACHED:
+               case KEV_DL_PROTO_ATTACHED: {
+                       bridge_proto_attach_changed(ifp);
+                       break;
+               }
                default:
                        break;
                }
@@ -2112,7 +2066,7 @@ bridge_iff_detached(void *cookie, ifnet_t ifp)
        }
 #endif /* BRIDGE_DEBUG */
 
-       bridge_ifdetach(bif, ifp);
+       bridge_ifdetach(ifp);
 
        _FREE(bif, M_DEVBUF);
 }
@@ -2185,13 +2139,10 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
        int lladdr_changed = 0, error, filt_attached;
        uint8_t eaddr[ETHER_ADDR_LEN];
        u_int32_t event_code = 0;
-       boolean_t bsd_mode;
 
        BRIDGE_LOCK_ASSERT_HELD(sc);
        VERIFY(ifs != NULL);
 
-       bsd_mode = bridge_in_bsd_mode(sc);
-
        /*
         * First, remove the member from the list so it cannot be found anymore
         * when we release the bridge lock below
@@ -2239,7 +2190,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
                BRIDGE_LOCK(sc);
        }
 #if BRIDGESTP
-       if (bsd_mode && (bif->bif_ifflags & IFBIF_STP) != 0) {
+       if ((bif->bif_ifflags & IFBIF_STP) != 0) {
                bstp_disable(&bif->bif_stp);
        }
 #endif /* BRIDGESTP */
@@ -2273,9 +2224,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
                printf("%s: bridge_set_tso failed %d\n", __func__, error);
        }
 
-       if (bsd_mode) {
-               bridge_rtdelete(sc, ifs, IFBF_FLUSHALL);
-       }
+       bridge_rtdelete(sc, ifs, IFBF_FLUSHALL);
 
        KASSERT(bif->bif_addrcnt == 0,
            ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt));
@@ -2287,9 +2236,8 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
         */
        event_code = bridge_updatelinkstatus(sc);
 
-       if (bsd_mode) {
-               BRIDGE_UNLOCK(sc);
-       }
+       BRIDGE_UNLOCK(sc);
 
        if (lladdr_changed &&
            (error = ifnet_set_lladdr(bifp, eaddr, ETHER_ADDR_LEN)) != 0) {
@@ -2301,9 +2249,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
        }
 
 #if BRIDGESTP
-       if (bsd_mode) {
-               bstp_destroy(&bif->bif_stp);    /* prepare to free */
-       }
+       bstp_destroy(&bif->bif_stp);    /* prepare to free */
 #endif /* BRIDGESTP */
 
        if (filt_attached) {
@@ -2347,7 +2293,6 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
        uint8_t eaddr[ETHER_ADDR_LEN];
        struct iff_filter iff;
        u_int32_t event_code = 0;
-       boolean_t bsd_mode = bridge_in_bsd_mode(sc);
 
        ifs = ifunit(req->ifbr_ifsname);
        if (ifs == NULL) {
@@ -2361,12 +2306,10 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
                return EINVAL;
        }
 
-       if (bsd_mode) {
-               /* If it's in the span list, it can't be a member. */
-               TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next)
-               if (ifs == bif->bif_ifp) {
-                       return EBUSY;
-               }
+       /* If it's in the span list, it can't be a member. */
+       TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next)
+       if (ifs == bif->bif_ifp) {
+               return EBUSY;
        }
 
        if (ifs->if_bridge == sc) {
@@ -2427,9 +2370,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
 
        ifs->if_bridge = sc;
 #if BRIDGESTP
-       if (bsd_mode) {
-               bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp);
-       }
+       bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp);
 #endif /* BRIDGESTP */
 
        /*
@@ -2481,9 +2422,8 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
        /*
         * Respect lock ordering with DLIL lock for the following operations
         */
-       if (bsd_mode) {
-               BRIDGE_UNLOCK(sc);
-       }
+       BRIDGE_UNLOCK(sc);
 
        /*
         * install an interface filter
@@ -2491,12 +2431,8 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
        memset(&iff, 0, sizeof(struct iff_filter));
        iff.iff_cookie = bif;
        iff.iff_name = "com.apple.kernel.bsd.net.if_bridge";
-       if (bsd_mode) {
-               iff.iff_input = bridge_iff_input;
-#if BRIDGE_MEMBER_OUT_FILTER
-               iff.iff_output = bridge_iff_output;
-#endif /* BRIDGE_MEMBER_OUT_FILTER */
-       }
+       iff.iff_input = bridge_iff_input;
+       iff.iff_output = bridge_iff_output;
        iff.iff_event = bridge_iff_event;
        iff.iff_detached = bridge_iff_detached;
        error = dlil_attach_filter(ifs, &iff, &bif->bif_iff_ref,
@@ -2506,10 +2442,12 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
                BRIDGE_LOCK(sc);
                goto out;
        }
+       BRIDGE_LOCK(sc);
        bif->bif_flags |= BIFF_FILTER_ATTACHED;
+       BRIDGE_UNLOCK(sc);
 
        /*
-        * install an dummy "bridge" protocol
+        * install a dummy "bridge" protocol
         */
        if ((error = bridge_attach_protocol(ifs)) != 0) {
                if (error != 0) {
@@ -2519,7 +2457,9 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
                        goto out;
                }
        }
+       BRIDGE_LOCK(sc);
        bif->bif_flags |= BIFF_PROTO_ATTACHED;
+       BRIDGE_UNLOCK(sc);
 
        if (lladdr_changed &&
            (error = ifnet_set_lladdr(bifp, eaddr, ETHER_ADDR_LEN)) != 0) {
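Note the brief lock/unlock pairs added around the BIFF_FILTER_ATTACHED and BIFF_PROTO_ATTACHED updates above: once the filter and protocol are attached, event callbacks (bridge_proto_attach_changed() in particular) can modify bif_flags concurrently, so presumably every bif_flags mutation is now required to happen under BRIDGE_LOCK.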
@@ -2574,36 +2514,35 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg)
                return ENOENT;
        }
 
-       if (bridge_in_bsd_mode(sc)) {
-               struct bstp_port *bp;
+       struct bstp_port *bp;
 
-               bp = &bif->bif_stp;
-               req->ifbr_state = bp->bp_state;
-               req->ifbr_priority = bp->bp_priority;
-               req->ifbr_path_cost = bp->bp_path_cost;
-               req->ifbr_proto = bp->bp_protover;
-               req->ifbr_role = bp->bp_role;
-               req->ifbr_stpflags = bp->bp_flags;
-               /* Copy STP state options as flags */
-               if (bp->bp_operedge) {
-                       req->ifbr_ifsflags |= IFBIF_BSTP_EDGE;
-               }
-               if (bp->bp_flags & BSTP_PORT_AUTOEDGE) {
-                       req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE;
-               }
-               if (bp->bp_ptp_link) {
-                       req->ifbr_ifsflags |= IFBIF_BSTP_PTP;
-               }
-               if (bp->bp_flags & BSTP_PORT_AUTOPTP) {
-                       req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP;
-               }
-               if (bp->bp_flags & BSTP_PORT_ADMEDGE) {
-                       req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE;
-               }
-               if (bp->bp_flags & BSTP_PORT_ADMCOST) {
-                       req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST;
-               }
+       bp = &bif->bif_stp;
+       req->ifbr_state = bp->bp_state;
+       req->ifbr_priority = bp->bp_priority;
+       req->ifbr_path_cost = bp->bp_path_cost;
+       req->ifbr_proto = bp->bp_protover;
+       req->ifbr_role = bp->bp_role;
+       req->ifbr_stpflags = bp->bp_flags;
+       /* Copy STP state options as flags */
+       if (bp->bp_operedge) {
+               req->ifbr_ifsflags |= IFBIF_BSTP_EDGE;
+       }
+       if (bp->bp_flags & BSTP_PORT_AUTOEDGE) {
+               req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE;
+       }
+       if (bp->bp_ptp_link) {
+               req->ifbr_ifsflags |= IFBIF_BSTP_PTP;
        }
+       if (bp->bp_flags & BSTP_PORT_AUTOPTP) {
+               req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP;
+       }
+       if (bp->bp_flags & BSTP_PORT_ADMEDGE) {
+               req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE;
+       }
+       if (bp->bp_flags & BSTP_PORT_ADMCOST) {
+               req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST;
+       }
+
        req->ifbr_ifsflags = bif->bif_ifflags;
        req->ifbr_portno = bif->bif_ifp->if_index & 0xfff;
        req->ifbr_addrcnt = bif->bif_addrcnt;
@@ -2623,10 +2562,6 @@ bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg)
        int error;
 #endif /* BRIDGESTP */
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EINVAL;
-       }
-
        bif = bridge_lookup_member(sc, req->ifbr_ifsname);
        if (bif == NULL) {
                return ENOENT;
@@ -2677,9 +2612,7 @@ bridge_ioctl_scache(struct bridge_softc *sc, void *arg)
        struct ifbrparam *param = arg;
 
        sc->sc_brtmax = param->ifbrp_csize;
-       if (bridge_in_bsd_mode(sc)) {
-               bridge_rttrim(sc);
-       }
+       bridge_rttrim(sc);
        return 0;
 }
 
@@ -2702,10 +2635,8 @@ bridge_ioctl_gcache(struct bridge_softc *sc, void *arg)
        count = 0;                                                      \
        TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next)                    \
                count++;                                                \
-       if (bridge_in_bsd_mode(sc)) {                                   \
-               TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next)          \
-                       count++;                                        \
-       }                                                               \
+       TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next)                  \
+               count++;                                                \
                                                                         \
        buflen = sizeof (breq) * count;                                 \
        if (bifc->ifbic_len == 0) {                                     \
@@ -2735,22 +2666,20 @@ bridge_ioctl_gcache(struct bridge_softc *sc, void *arg)
                buf += sizeof (breq);                                   \
                len -= sizeof (breq);                                   \
        }                                                               \
-       if (bridge_in_bsd_mode(sc)) {                                   \
-               TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) {        \
-                       if (len < sizeof (breq))                        \
-                               break;                                  \
+       TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) {                \
+               if (len < sizeof (breq))                                \
+                       break;                                          \
                                                                         \
-                       snprintf(breq.ifbr_ifsname,                     \
-                                sizeof (breq.ifbr_ifsname),            \
-                                "%s", bif->bif_ifp->if_xname);         \
-                       breq.ifbr_ifsflags = bif->bif_ifflags;          \
-                       breq.ifbr_portno                                \
-                               = bif->bif_ifp->if_index & 0xfff;       \
-                       memcpy(buf, &breq, sizeof (breq));              \
-                       count++;                                        \
-                       buf += sizeof (breq);                           \
-                       len -= sizeof (breq);                           \
-               }                                                       \
+               snprintf(breq.ifbr_ifsname,                             \
+                        sizeof (breq.ifbr_ifsname),                    \
+                        "%s", bif->bif_ifp->if_xname);                 \
+               breq.ifbr_ifsflags = bif->bif_ifflags;                  \
+               breq.ifbr_portno                                        \
+                       = bif->bif_ifp->if_index & 0xfff;               \
+               memcpy(buf, &breq, sizeof (breq));                      \
+               count++;                                                \
+               buf += sizeof (breq);                                   \
+               len -= sizeof (breq);                                   \
        }                                                               \
                                                                         \
        BRIDGE_UNLOCK(sc);                                              \
@@ -2794,9 +2723,6 @@ bridge_ioctl_gifs32(struct bridge_softc *sc, void *arg)
                                                                             \
        bzero(&bareq, sizeof (bareq));                                      \
        count = 0;                                                          \
-       if (!bridge_in_bsd_mode(sc)) {                                      \
-               goto out;                                                   \
-       }                                                                   \
        LIST_FOREACH(brt, &sc->sc_rtlist, brt_list)                         \
                count++;                                                    \
        buflen = sizeof (bareq) * count;                                    \
@@ -2869,10 +2795,6 @@ bridge_ioctl_saddr32(struct bridge_softc *sc, void *arg)
        struct bridge_iflist *bif;
        int error;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return 0;
-       }
-
        bif = bridge_lookup_member(sc, req->ifba_ifsname);
        if (bif == NULL) {
                return ENOENT;
@@ -2891,10 +2813,6 @@ bridge_ioctl_saddr64(struct bridge_softc *sc, void *arg)
        struct bridge_iflist *bif;
        int error;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return 0;
-       }
-
        bif = bridge_lookup_member(sc, req->ifba_ifsname);
        if (bif == NULL) {
                return ENOENT;
@@ -2929,9 +2847,6 @@ bridge_ioctl_daddr32(struct bridge_softc *sc, void *arg)
 {
        struct ifbareq32 *req = arg;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return 0;
-       }
        return bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan);
 }
 
@@ -2940,9 +2855,6 @@ bridge_ioctl_daddr64(struct bridge_softc *sc, void *arg)
 {
        struct ifbareq64 *req = arg;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return 0;
-       }
        return bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan);
 }
 
@@ -2951,9 +2863,6 @@ bridge_ioctl_flush(struct bridge_softc *sc, void *arg)
 {
        struct ifbreq *req = arg;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return 0;
-       }
        bridge_rtflush(sc, req->ifbr_ifsflags);
        return 0;
 }
@@ -2964,9 +2873,6 @@ bridge_ioctl_gpri(struct bridge_softc *sc, void *arg)
        struct ifbrparam *param = arg;
        struct bstp_state *bs = &sc->sc_stp;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return 0;
-       }
        param->ifbrp_prio = bs->bs_bridge_priority;
        return 0;
 }
@@ -2977,9 +2883,6 @@ bridge_ioctl_spri(struct bridge_softc *sc, void *arg)
 #if BRIDGESTP
        struct ifbrparam *param = arg;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        return bstp_set_priority(&sc->sc_stp, param->ifbrp_prio);
 #else /* !BRIDGESTP */
 #pragma unused(sc, arg)
@@ -2993,9 +2896,6 @@ bridge_ioctl_ght(struct bridge_softc *sc, void *arg)
        struct ifbrparam *param = arg;
        struct bstp_state *bs = &sc->sc_stp;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return 0;
-       }
        param->ifbrp_hellotime = bs->bs_bridge_htime >> 8;
        return 0;
 }
@@ -3006,9 +2906,6 @@ bridge_ioctl_sht(struct bridge_softc *sc, void *arg)
 #if BRIDGESTP
        struct ifbrparam *param = arg;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        return bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime);
 #else /* !BRIDGESTP */
 #pragma unused(sc, arg)
@@ -3022,9 +2919,6 @@ bridge_ioctl_gfd(struct bridge_softc *sc, void *arg)
        struct ifbrparam *param;
        struct bstp_state *bs;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return 0;
-       }
        param = arg;
        bs = &sc->sc_stp;
        param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8;
@@ -3037,9 +2931,6 @@ bridge_ioctl_sfd(struct bridge_softc *sc, void *arg)
 #if BRIDGESTP
        struct ifbrparam *param = arg;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        return bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay);
 #else /* !BRIDGESTP */
 #pragma unused(sc, arg)
@@ -3053,9 +2944,6 @@ bridge_ioctl_gma(struct bridge_softc *sc, void *arg)
        struct ifbrparam *param;
        struct bstp_state *bs;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        param = arg;
        bs = &sc->sc_stp;
        param->ifbrp_maxage = bs->bs_bridge_max_age >> 8;
@@ -3068,9 +2956,6 @@ bridge_ioctl_sma(struct bridge_softc *sc, void *arg)
 #if BRIDGESTP
        struct ifbrparam *param = arg;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        return bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage);
 #else /* !BRIDGESTP */
 #pragma unused(sc, arg)
@@ -3085,9 +2970,6 @@ bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg)
        struct ifbreq *req = arg;
        struct bridge_iflist *bif;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        bif = bridge_lookup_member(sc, req->ifbr_ifsname);
        if (bif == NULL) {
                return ENOENT;
@@ -3107,9 +2989,6 @@ bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg)
        struct ifbreq *req = arg;
        struct bridge_iflist *bif;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        bif = bridge_lookup_member(sc, req->ifbr_ifsname);
        if (bif == NULL) {
                return ENOENT;
@@ -3174,9 +3053,6 @@ bridge_ioctl_addspan(struct bridge_softc *sc, void *arg)
        struct bridge_iflist *bif = NULL;
        struct ifnet *ifs;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        ifs = ifunit(req->ifbr_ifsname);
        if (ifs == NULL) {
                return ENOENT;
@@ -3228,9 +3104,6 @@ bridge_ioctl_delspan(struct bridge_softc *sc, void *arg)
        struct bridge_iflist *bif;
        struct ifnet *ifs;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        ifs = ifunit(req->ifbr_ifsname);
        if (ifs == NULL) {
                return ENOENT;
@@ -3280,9 +3153,7 @@ bridge_ioctl_gbparam32(struct bridge_softc *sc, void *arg)
 {
        struct ifbropreq32 *req = arg;
 
-       if (bridge_in_bsd_mode(sc)) {
-               BRIDGE_IOCTL_GBPARAM;
-       }
+       BRIDGE_IOCTL_GBPARAM;
        return 0;
 }
 
@@ -3291,9 +3162,7 @@ bridge_ioctl_gbparam64(struct bridge_softc *sc, void *arg)
 {
        struct ifbropreq64 *req = arg;
 
-       if (bridge_in_bsd_mode(sc)) {
-               BRIDGE_IOCTL_GBPARAM;
-       }
+       BRIDGE_IOCTL_GBPARAM;
        return 0;
 }
 
@@ -3368,9 +3237,7 @@ bridge_ioctl_gifsstp32(struct bridge_softc *sc, void *arg)
        struct ifbpstpconf32 *bifstp = arg;
        int error = 0;
 
-       if (bridge_in_bsd_mode(sc)) {
-               BRIDGE_IOCTL_GIFSSTP;
-       }
+       BRIDGE_IOCTL_GIFSSTP;
        return error;
 }
 
@@ -3380,9 +3247,7 @@ bridge_ioctl_gifsstp64(struct bridge_softc *sc, void *arg)
        struct ifbpstpconf64 *bifstp = arg;
        int error = 0;
 
-       if (bridge_in_bsd_mode(sc)) {
-               BRIDGE_IOCTL_GIFSSTP;
-       }
+       BRIDGE_IOCTL_GIFSSTP;
        return error;
 }
 
@@ -3392,9 +3257,6 @@ bridge_ioctl_sproto(struct bridge_softc *sc, void *arg)
 #if BRIDGESTP
        struct ifbrparam *param = arg;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        return bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto);
 #else /* !BRIDGESTP */
 #pragma unused(sc, arg)
@@ -3408,9 +3270,6 @@ bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg)
 #if BRIDGESTP
        struct ifbrparam *param = arg;
 
-       if (!bridge_in_bsd_mode(sc)) {
-               return EOPNOTSUPP;
-       }
        return bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc);
 #else /* !BRIDGESTP */
 #pragma unused(sc, arg)
@@ -3491,9 +3350,10 @@ bridge_ioctl_shostfilter(struct bridge_softc *sc, void *arg)
  *     Detach an interface from a bridge.  Called when a member
  *     interface is detaching.
  */
-__private_extern__ void
-bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp)
+static void
+bridge_ifdetach(struct ifnet *ifp)
 {
+       struct bridge_iflist *bif;
        struct bridge_softc *sc = ifp->if_bridge;
 
 #if BRIDGE_DEBUG
@@ -3515,19 +3375,68 @@ bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp)
        /* Check if the interface is a span port */
        lck_mtx_lock(&bridge_list_mtx);
        LIST_FOREACH(sc, &bridge_list, sc_list) {
-               if (bridge_in_bsd_mode(sc)) {
-                       BRIDGE_LOCK(sc);
-                       TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next)
-                       if (ifp == bif->bif_ifp) {
-                               bridge_delete_span(sc, bif);
-                               break;
-                       }
-                       BRIDGE_UNLOCK(sc);
+               BRIDGE_LOCK(sc);
+               TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next)
+               if (ifp == bif->bif_ifp) {
+                       bridge_delete_span(sc, bif);
+                       break;
                }
+               BRIDGE_UNLOCK(sc);
        }
        lck_mtx_unlock(&bridge_list_mtx);
 }
 
+/*
+ * bridge_proto_attach_changed
+ *
+ *     Called when protocol attachment on the interface changes.
+ */
+static void
+bridge_proto_attach_changed(struct ifnet *ifp)
+{
+       boolean_t changed = FALSE;
+       struct bridge_iflist *bif;
+       boolean_t input_broadcast;
+       struct bridge_softc *sc = ifp->if_bridge;
+
+#if BRIDGE_DEBUG
+       if (if_bridge_debug & BR_DBGF_LIFECYCLE) {
+               printf("%s: %s\n", __func__, ifp->if_xname);
+       }
+#endif /* BRIDGE_DEBUG */
+       if (sc == NULL) {
+               return;
+       }
+       /*
+        * Selectively enable input broadcast only when necessary.
+        * The bridge interface itself attaches a fake protocol
+        * so checking for at least two protocols means that the
+        * interface is being used for something besides bridging.
+        */
+       input_broadcast = if_get_protolist(ifp, NULL, 0) >= 2;
+       BRIDGE_LOCK(sc);
+       bif = bridge_lookup_member_if(sc, ifp);
+       if (bif != NULL) {
+               if (input_broadcast) {
+                       if ((bif->bif_flags & BIFF_INPUT_BROADCAST) == 0) {
+                               bif->bif_flags |= BIFF_INPUT_BROADCAST;
+                               changed = TRUE;
+                       }
+               } else if ((bif->bif_flags & BIFF_INPUT_BROADCAST) != 0) {
+                       changed = TRUE;
+                       bif->bif_flags &= ~BIFF_INPUT_BROADCAST;
+               }
+       }
+       BRIDGE_UNLOCK(sc);
+#if BRIDGE_DEBUG
+       if ((if_bridge_debug & BR_DBGF_LIFECYCLE) != 0 && changed) {
+               printf("%s: input broadcast %s", ifp->if_xname,
+                   input_broadcast ? "ENABLED" : "DISABLED");
+       }
+#endif /* BRIDGE_DEBUG */
+       return;
+}
+
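The protocol-count heuristic reads naturally as a standalone predicate. A minimal sketch, assuming only the if_get_protolist() behavior shown above (example_member_wants_input_broadcast is illustrative, not in the source):

    static boolean_t
    example_member_wants_input_broadcast(ifnet_t ifp)
    {
            /* with a NULL list, if_get_protolist() just returns the count */
            u_int32_t count = if_get_protolist(ifp, NULL, 0);

            /* one of the attachments is the bridge's own dummy protocol */
            return count >= 2;
    }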
 /*
  * interface_media_active:
  *
@@ -3827,18 +3736,16 @@ bridge_init(struct ifnet *ifp)
 
        error = ifnet_set_flags(ifp, IFF_RUNNING, IFF_RUNNING);
 
-       if (bridge_in_bsd_mode(sc)) {
-               /*
-                * Calling bridge_aging_timer() is OK as there are no entries to
-                * age so we're just going to arm the timer
-                */
-               bridge_aging_timer(sc);
+       /*
+        * Calling bridge_aging_timer() is OK as there are no entries to
+        * age, so we're just going to arm the timer
+        */
+       bridge_aging_timer(sc);
 #if BRIDGESTP
-               if (error == 0) {
-                       bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */
-               }
-#endif /* BRIDGESTP */
+       if (error == 0) {
+               bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */
        }
+#endif /* BRIDGESTP */
        return error;
 }
 
@@ -3859,18 +3766,70 @@ bridge_ifstop(struct ifnet *ifp, int disable)
                return;
        }
 
-       if (bridge_in_bsd_mode(sc)) {
-               bridge_cancel_delayed_call(&sc->sc_aging_timer);
+       bridge_cancel_delayed_call(&sc->sc_aging_timer);
 
 #if BRIDGESTP
-               bstp_stop(&sc->sc_stp);
+       bstp_stop(&sc->sc_stp);
 #endif /* BRIDGESTP */
 
-               bridge_rtflush(sc, IFBF_FLUSHDYN);
-       }
+       bridge_rtflush(sc, IFBF_FLUSHDYN);
        (void) ifnet_set_flags(ifp, 0, IFF_RUNNING);
 }
 
+/*
+ * bridge_compute_cksum:
+ *
+ *     If the packet has checksum flags, compare the hardware checksum
+ *     capabilities of the source and destination interfaces. If they
+ *     are the same, there's nothing to do. If they are different,
+ *     finalize the checksum so that it can be sent on the destination
+ *     interface.
+ */
+static void
+bridge_compute_cksum(struct ifnet *src_if, struct ifnet *dst_if, struct mbuf *m)
+{
+       uint32_t csum_flags;
+       uint16_t dst_hw_csum;
+       uint32_t did_sw = 0;    /* stays 0 for non-IP ether types */
+       struct ether_header *eh;
+       uint16_t src_hw_csum;
+
+       csum_flags = m->m_pkthdr.csum_flags & IF_HWASSIST_CSUM_MASK;
+       if (csum_flags == 0) {
+               /* no checksum offload */
+               return;
+       }
+
+       /*
+        * if destination/source differ in checksum offload
+        * capabilities, finalize/compute the checksum
+        */
+       dst_hw_csum = IF_HWASSIST_CSUM_FLAGS(dst_if->if_hwassist);
+       src_hw_csum = IF_HWASSIST_CSUM_FLAGS(src_if->if_hwassist);
+       if (dst_hw_csum == src_hw_csum) {
+               return;
+       }
+       eh = mtod(m, struct ether_header *);
+       switch (ntohs(eh->ether_type)) {
+       case ETHERTYPE_IP:
+               did_sw = in_finalize_cksum(m, sizeof(*eh), csum_flags);
+               break;
+#if INET6
+       case ETHERTYPE_IPV6:
+               did_sw = in6_finalize_cksum(m, sizeof(*eh), -1, -1, csum_flags);
+               break;
+#endif /* INET6 */
+       }
+#if BRIDGE_DEBUG
+       if (if_bridge_debug & BR_DBGF_CHECKSUM) {
+               printf("%s: [%s -> %s] before 0x%x did 0x%x after 0x%x\n",
+                   __func__,
+                   src_if->if_xname, dst_if->if_xname, csum_flags, did_sw,
+                   m->m_pkthdr.csum_flags);
+       }
+#endif /* BRIDGE_DEBUG */
+}
+
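For concreteness: suppose a TCP/IPv4 packet crosses the bridge with CSUM_IP | CSUM_TCP still pending in csum_flags. If the source member advertises those offloads in if_hwassist but the destination member does not, the two IF_HWASSIST_CSUM_FLAGS() values differ, and in_finalize_cksum() computes both checksums in software before the frame is handed to the destination driver. When both members offer identical offloads, the pending flags travel with the packet and the destination hardware fills the checksums in as usual.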
 /*
  * bridge_enqueue:
  *
@@ -3878,11 +3837,11 @@ bridge_ifstop(struct ifnet *ifp, int disable)
  *
  */
 static int
-bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m)
+bridge_enqueue(struct bridge_softc *sc, struct ifnet *src_ifp,
+    struct ifnet *dst_ifp, struct mbuf *m, ChecksumOperation cksum_op)
 {
        int len, error = 0;
-       short mflags;
-       struct mbuf *m0;
+       struct mbuf *next_m;
 
        VERIFY(dst_ifp != NULL);
 
@@ -3891,19 +3850,30 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m)
         *
         * NOTE: bridge_fragment() is called only when PFIL_HOOKS is enabled.
         */
-       for (; m; m = m0) {
+       for (; m; m = next_m) {
                errno_t _error;
-               struct flowadv adv = { FADV_SUCCESS };
+               struct flowadv adv = { .code = FADV_SUCCESS };
 
-               m0 = m->m_nextpkt;
+               next_m = m->m_nextpkt;
                m->m_nextpkt = NULL;
 
                len = m->m_pkthdr.len;
-               mflags = m->m_flags;
                m->m_flags |= M_PROTO1; /* set to avoid loops */
 
-               bridge_finalize_cksum(dst_ifp, m);
-
+               switch (cksum_op) {
+               case kChecksumOperationClear:
+                       m->m_pkthdr.csum_flags = 0;
+                       break;
+               case kChecksumOperationFinalize:
+                       /* the checksum might not be correct, finalize now */
+                       bridge_finalize_cksum(dst_ifp, m);
+                       break;
+               case kChecksumOperationCompute:
+                       bridge_compute_cksum(src_ifp, dst_ifp, m);
+                       break;
+               default:
+                       break;
+               }
 #if HAS_IF_CAP
                /*
         * If underlying interface cannot do VLAN tag insertion itself
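The cksum_op switch gives bridge_enqueue() one checksum policy knob per call site. The ChecksumOperation enum is introduced earlier in this commit; its assumed shape, reconstructed here for reference:

    /* assumed definition; declared earlier in this commit's if_bridge.c */
    typedef enum {
            kChecksumOperationNone = 0,     /* leave csum_flags untouched (span ports) */
            kChecksumOperationClear,        /* inbound L2 forward: flags already satisfied */
            kChecksumOperationFinalize,     /* bridge-sourced traffic: compute if pending */
            kChecksumOperationCompute,      /* member output: reconcile src/dst hwassist */
    } ChecksumOperation;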
@@ -3963,7 +3933,7 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
        sc = ifp->if_bridge;
 
        /*
-        * The packet didnt originate from a member interface. This should only
+        * The packet didn't originate from a member interface. This should only
         * ever happen if a member interface is removed while packets are
         * queued for it.
         */
@@ -3981,11 +3951,10 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
                }
        }
 
-       (void) bridge_enqueue(sc, ifp, m);
+       (void) bridge_enqueue(sc, NULL, ifp, m, kChecksumOperationNone);
 }
 #endif /* HAS_BRIDGE_DUMMYNET */
 
-#if BRIDGE_MEMBER_OUT_FILTER
 /*
  * bridge_member_output:
  *
@@ -3993,17 +3962,13 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
  *     performs the bridging function for locally originated
  *     packets.
  *
- *     The mbuf has the Ethernet header already attached.  We must
- *     enqueue or free the mbuf before returning.
+ *     The mbuf has the Ethernet header already attached.
  */
-static int
-bridge_member_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
-    struct rtentry *rt)
+static errno_t
+bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m)
 {
-#pragma unused(sa, rt)
        struct ether_header *eh;
        struct ifnet *dst_if;
-       struct bridge_softc *sc;
        uint16_t vlan;
 
 #if BRIDGE_DEBUG
@@ -4015,12 +3980,11 @@ bridge_member_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
        if (m->m_len < ETHER_HDR_LEN) {
                m = m_pullup(m, ETHER_HDR_LEN);
                if (m == NULL) {
-                       return 0;
+                       return ENOBUFS;
                }
        }
 
        eh = mtod(m, struct ether_header *);
-       sc = ifp->if_bridge;
        vlan = VLANTAGOF(m);
 
        BRIDGE_LOCK(sc);
@@ -4057,17 +4021,23 @@ bridge_member_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
        if (dst_if == NULL) {
                struct bridge_iflist *bif;
                struct mbuf *mc;
-               int error = 0, used = 0;
+               int used = 0;
+               errno_t error;
 
                bridge_span(sc, m);
 
                BRIDGE_LOCK2REF(sc, error);
-               if (error) {
+               if (error != 0) {
                        m_freem(m);
-                       return 0;
+                       return error;
                }
 
                TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) {
+                       /* skip interface with inactive link status */
+                       if ((bif->bif_flags & BIFF_MEDIA_ACTIVE) == 0) {
+                               continue;
+                       }
                        dst_if = bif->bif_ifp;
 
                        if (dst_if->if_type == IFT_GIF) {
@@ -4087,26 +4057,25 @@ bridge_member_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
                            bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) {
                                continue;
                        }
-
-                       if (LIST_NEXT(bif, bif_next) == NULL) {
+                       if (TAILQ_NEXT(bif, bif_next) == NULL) {
                                used = 1;
                                mc = m;
                        } else {
-                               mc = m_copypacket(m, M_DONTWAIT);
+                               mc = m_dup(m, M_DONTWAIT);
                                if (mc == NULL) {
                                        (void) ifnet_stat_increment_out(
                                                sc->sc_ifp, 0, 0, 1);
                                        continue;
                                }
                        }
-
-                       (void) bridge_enqueue(sc, dst_if, mc);
+                       (void) bridge_enqueue(sc, ifp, dst_if, mc,
+                           kChecksumOperationCompute);
                }
                if (used == 0) {
                        m_freem(m);
                }
                BRIDGE_UNREF(sc);
-               return 0;
+               return EJUSTRETURN;
        }
 
 sendunicast:
@@ -4118,14 +4087,18 @@ sendunicast:
        if ((dst_if->if_flags & IFF_RUNNING) == 0) {
                m_freem(m);
                BRIDGE_UNLOCK(sc);
-               return 0;
+               return EJUSTRETURN;
        }
 
        BRIDGE_UNLOCK(sc);
-       (void) bridge_enqueue(sc, dst_if, m);
-       return 0;
+       if (dst_if == ifp) {
+               /* just let the packet continue on its way */
+               return 0;
+       }
+       (void) bridge_enqueue(sc, ifp, dst_if, m,
+           kChecksumOperationCompute);
+       return EJUSTRETURN;
 }
-#endif /* BRIDGE_MEMBER_OUT_FILTER */
 
 /*
  * Output callback.
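Two quiet fixes ride along in the bridge_member_output() hunk above: the walk over sc_iflist now uses TAILQ_NEXT, matching the list's actual TAILQ type (the old LIST_NEXT sat behind the disabled BRIDGE_MEMBER_OUT_FILTER conditional and was never compiled), and the per-member copy switches from m_copypacket() to m_dup(). The copy change matters for the new checksum handling, since kChecksumOperationCompute may rewrite packet data per destination:

    mbuf_t shallow = m_copypacket(m, M_DONTWAIT); /* shares clusters; effectively read-only */
    mbuf_t deep    = m_dup(m, M_DONTWAIT);        /* private, writable copy of the data */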
@@ -4145,7 +4118,6 @@ bridge_output(struct ifnet *ifp, struct mbuf *m)
        dst_if = NULL;
 
        BRIDGE_LOCK(sc);
-       ASSERT(bridge_in_bsd_mode(sc));
 
        if (!(m->m_flags & (M_BCAST | M_MCAST))) {
                dst_if = bridge_rtlookup(sc, eh->ether_dhost, 0);
@@ -4161,10 +4133,11 @@ bridge_output(struct ifnet *ifp, struct mbuf *m)
 
        if (dst_if == NULL) {
                /* callee will unlock */
-               bridge_broadcast(sc, ifp, m, 0);
+               bridge_broadcast(sc, NULL, m, 0);
        } else {
                BRIDGE_UNLOCK(sc);
-               error = bridge_enqueue(sc, dst_if, m);
+               error = bridge_enqueue(sc, NULL, dst_if, m,
+                   kChecksumOperationFinalize);
        }
 
        return error;
@@ -4176,6 +4149,7 @@ bridge_finalize_cksum(struct ifnet *ifp, struct mbuf *m)
        struct ether_header *eh = mtod(m, struct ether_header *);
        uint32_t sw_csum, hwcap;
 
+
        if (ifp != NULL) {
                hwcap = (ifp->if_hwassist | CSUM_DATA_VALID);
        } else {
@@ -4277,7 +4251,6 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
        int error;
 
        BRIDGE_LOCK_ASSERT_HELD(sc);
-       ASSERT(bridge_in_bsd_mode(sc));
 
 #if BRIDGE_DEBUG
        if (if_bridge_debug & BR_DBGF_OUTPUT) {
@@ -4329,11 +4302,14 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
         * "this" side of the bridge, drop it.
         */
        if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) {
+               /* unicast */
                dst_if = bridge_rtlookup(sc, dst, vlan);
                if (src_if == dst_if) {
                        goto drop;
                }
        } else {
+               /* broadcast/multicast */
+
                /*
                 * Check if its a reserved multicast address, any address
                 * listed in 802.1D section 7.12.6 may not be forwarded by the
@@ -4390,6 +4366,9 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
                return;
        }
 
+       /*
+        * Unicast.
+        */
        /*
         * At this point, we're dealing with a unicast frame
         * going to a different interface.
@@ -4438,7 +4417,14 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
        }
 #endif /* PFIL_HOOKS */
 
-       (void) bridge_enqueue(sc, dst_if, m);
+       /*
+        * This is an inbound packet where the checksum
+        * (if applicable) is already present/valid. Since
+        * we are just doing layer 2 forwarding (not IP
+        * forwarding), there's no need to validate the checksum.
+        * Clear the checksum offload flags and send it along.
+        */
+       (void) bridge_enqueue(sc, NULL, dst_if, m, kChecksumOperationClear);
        return;
 
 drop:
@@ -4478,7 +4464,6 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header)
        uint16_t vlan;
        int error;
 
-       ASSERT(bridge_in_bsd_mode(sc));
 #if BRIDGE_DEBUG
        if (if_bridge_debug & BR_DBGF_INPUT) {
                printf("%s: %s from %s m 0x%llx data 0x%llx\n", __func__,
@@ -4695,30 +4680,40 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header)
        if ((iface)->if_type == IFT_GIF)                                \
                continue;                                               \
        /* It is destined for us. */                                    \
-       if (memcmp(IF_LLADDR((iface)), eh->ether_dhost,         \
+       if (memcmp(IF_LLADDR((iface)), eh->ether_dhost,                 \
            ETHER_ADDR_LEN) == 0 || CARP_CHECK_WE_ARE_DST((iface))) {   \
                if ((iface)->if_type == IFT_BRIDGE) {                   \
                        BRIDGE_BPF_MTAP_INPUT(sc, m);                   \
        /* Filter on the physical interface. */         \
                        PFIL_PHYS(sc, iface, m);                        \
+               } else {                                                \
+                       bpf_tap_in(iface, DLT_EN10MB, m, NULL, 0);      \
                }                                                       \
                if (bif->bif_ifflags & IFBIF_LEARNING) {                \
                        error = bridge_rtupdate(sc, eh->ether_shost,    \
                            vlan, bif, 0, IFBAF_DYNAMIC);               \
                        if (error && bif->bif_addrmax) {                \
                                BRIDGE_UNLOCK(sc);                      \
+                               m_freem(m);                             \
                                return (EJUSTRETURN);                   \
                        }                                               \
                }                                                       \
-               m->m_pkthdr.rcvif = iface;                              \
                BRIDGE_UNLOCK(sc);                                      \
-               return (0);                                             \
+               mbuf_pkthdr_setrcvif(m, iface);                         \
+               mbuf_pkthdr_setheader(m, mbuf_data(m));                 \
+               mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN,   \
+                            mbuf_len(m) - ETHER_HDR_LEN);              \
+               mbuf_pkthdr_adjustlen(m, -ETHER_HDR_LEN);               \
+               m->m_flags |= M_PROTO1; /* set to avoid loops */        \
+               dlil_input_packet_list(iface, m);                       \
+               return (EJUSTRETURN);                                   \
        }                                                               \
                                                                         \
        /* We just received a packet that we sent out. */               \
-       if (memcmp(IF_LLADDR((iface)), eh->ether_shost,         \
+       if (memcmp(IF_LLADDR((iface)), eh->ether_shost,                 \
            ETHER_ADDR_LEN) == 0 || CARP_CHECK_WE_ARE_SRC((iface))) {   \
                BRIDGE_UNLOCK(sc);                                      \
+               m_freem(m);                                             \
                return (EJUSTRETURN);                                   \
        }
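The replacement macro body re-injects frames addressed to a member interface into that member's own input path instead of returning them to the caller, and it now frees the mbuf on both EJUSTRETURN exits, which previously returned without releasing the packet. The delivery pattern, consolidated into one sketch (example_deliver_local is not in the source):

    static void
    example_deliver_local(ifnet_t iface, mbuf_t m)
    {
            mbuf_pkthdr_setrcvif(m, iface);
            mbuf_pkthdr_setheader(m, mbuf_data(m));
            /* strip the Ethernet header before protocol-level input */
            mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN,
                mbuf_len(m) - ETHER_HDR_LEN);
            mbuf_pkthdr_adjustlen(m, -ETHER_HDR_LEN);
            m->m_flags |= M_PROTO1;         /* mark it so the bridge won't loop it */
            dlil_input_packet_list(iface, m);
    }

The same sequence reappears below for the mc_in copies in bridge_broadcast().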
 
@@ -4787,13 +4782,16 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header)
                printf("%s: not forwarding packet bound for member "
                    "interface\n", __func__);
 #endif
+
                BRIDGE_UNLOCK(sc);
                return 0;
        }
 
-       /* Now check the all bridge members. */
+       /* Now check the remaining bridge members. */
        TAILQ_FOREACH(bif2, &sc->sc_iflist, bif_next) {
-               GRAB_OUR_PACKETS(bif2->bif_ifp)
+               if (bif2->bif_ifp != ifp) {
+                       GRAB_OUR_PACKETS(bif2->bif_ifp);
+               }
        }
 
 #undef CARP_CHECK_WE_ARE_DST
@@ -4828,10 +4826,25 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
 #endif
        struct bridge_iflist *dbif, *sbif;
        struct mbuf *mc;
+       struct mbuf *mc_in;
        struct ifnet *dst_if;
        int error = 0, used = 0;
+       boolean_t is_output;
+       ChecksumOperation cksum_op;
 
-       sbif = bridge_lookup_member_if(sc, src_if);
+       if (src_if != NULL) {
+               is_output = FALSE;
+               cksum_op = kChecksumOperationClear;
+               sbif = bridge_lookup_member_if(sc, src_if);
+       } else {
+               /*
+                * src_if is NULL when the bridge interface calls
+                * bridge_broadcast().
+                */
+               is_output = TRUE;
+               cksum_op = kChecksumOperationFinalize;
+               sbif = NULL;
+       }
 
        BRIDGE_LOCK2REF(sc, error);
        if (error) {
@@ -4854,11 +4867,12 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
        TAILQ_FOREACH(dbif, &sc->sc_iflist, bif_next) {
                dst_if = dbif->bif_ifp;
                if (dst_if == src_if) {
+                       /* skip the interface that the packet came in on */
                        continue;
                }
 
                /* Private segments cannot talk to each other */
-               if (sbif &&
+               if (sbif != NULL &&
                    (sbif->bif_ifflags & dbif->bif_ifflags & IFBIF_PRIVATE)) {
                        continue;
                }
@@ -4893,6 +4907,18 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
                        }
                }
 
+               /*
+                * If broadcast input is enabled, do so only if this
+                * is an input packet.
+                */
+               if (!is_output &&
+                   (dbif->bif_flags & BIFF_INPUT_BROADCAST) != 0) {
+                       mc_in = m_dup(mc, M_DONTWAIT);
+                       /* this could fail, but we continue anyway */
+               } else {
+                       mc_in = NULL;
+               }
+
 #ifdef PFIL_HOOKS
                /*
                 * Filter on the output interface. Pass a NULL bridge interface
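Putting mc_in in context: for a broadcast received on member A of a bridge with members A, B and C, the loop sends one copy out B and one out C (cksum_op is kChecksumOperationClear on this input path), and for any destination member with BIFF_INPUT_BROADCAST set it additionally m_dup()s a copy and delivers it up that member's own stack with the header-strip sequence sketched earlier. A failed m_dup() is tolerated: mc_in stays NULL and only the outbound copy goes.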
@@ -4908,19 +4934,42 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
                                if (mc == NULL) {
                                        (void) ifnet_stat_increment_out(
                                                sc->sc_ifp, 0, 0, 1);
+                                       if (mc_in != NULL) {
+                                               m_freem(mc_in);
+                                       }
                                        continue;
                                }
                        }
                        if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0) {
+                               if (mc_in != NULL) {
+                                       m_freem(mc_in);
+                               }
                                continue;
                        }
                        if (mc == NULL) {
+                               if (mc_in != NULL) {
+                                       m_freem(mc_in);
+                               }
                                continue;
                        }
                }
 #endif /* PFIL_HOOKS */
 
-               (void) bridge_enqueue(sc, dst_if, mc);
+               /* out */
+               (void) bridge_enqueue(sc, NULL, dst_if, mc, cksum_op);
+
+               /* in */
+               if (mc_in == NULL) {
+                       continue;
+               }
+               bpf_tap_in(dst_if, DLT_EN10MB, mc_in, NULL, 0);
+               mbuf_pkthdr_setrcvif(mc_in, dst_if);
+               mbuf_pkthdr_setheader(mc_in, mbuf_data(mc_in));
+               mbuf_setdata(mc_in, (char *)mbuf_data(mc_in) + ETHER_HDR_LEN,
+                   mbuf_len(mc_in) - ETHER_HDR_LEN);
+               mbuf_pkthdr_adjustlen(mc_in, -ETHER_HDR_LEN);
+               mc_in->m_flags |= M_PROTO1; /* set to avoid loops */
+               dlil_input_packet_list(dst_if, mc_in);
        }
        if (used == 0) {
                m_freem(m);
@@ -4963,7 +5012,8 @@ bridge_span(struct bridge_softc *sc, struct mbuf *m)
                        continue;
                }
 
-               (void) bridge_enqueue(sc, dst_if, mc);
+               (void) bridge_enqueue(sc, NULL, dst_if, mc,
+                   kChecksumOperationNone);
        }
 }
 
@@ -4981,7 +5031,6 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan,
        int error;
 
        BRIDGE_LOCK_ASSERT_HELD(sc);
-       ASSERT(bridge_in_bsd_mode(sc));
 
        /* Check the source address is valid and not multicast. */
        if (ETHER_IS_MULTICAST(dst) ||
@@ -5243,8 +5292,6 @@ bridge_rtable_init(struct bridge_softc *sc)
 {
        u_int32_t i;
 
-       ASSERT(bridge_in_bsd_mode(sc));
-
        sc->sc_rthash = _MALLOC(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE,
            M_DEVBUF, M_WAITOK | M_ZERO);
        if (sc->sc_rthash == NULL) {
@@ -5465,7 +5512,6 @@ bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr,
        int dir;
 
        BRIDGE_LOCK_ASSERT_HELD(sc);
-       ASSERT(bridge_in_bsd_mode(sc));
 
        hash = bridge_rthash(sc, addr);
        LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) {
@@ -6197,7 +6243,6 @@ bridge_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func bpf_callback)
        if (sc == NULL || (sc->sc_flags & SCF_DETACHING)) {
                return ENODEV;
        }
-       ASSERT(bridge_in_bsd_mode(sc));
        switch (mode) {
        case BPF_TAP_DISABLE:
                sc->sc_bpf_input = sc->sc_bpf_output = NULL;
@@ -6236,10 +6281,8 @@ bridge_detach(ifnet_t ifp)
        bstp_detach(&sc->sc_stp);
 #endif /* BRIDGESTP */
 
-       if (bridge_in_bsd_mode(sc)) {
-               /* Tear down the routing table. */
-               bridge_rtable_fini(sc);
-       }
+       /* Tear down the routing table. */
+       bridge_rtable_fini(sc);
 
        lck_mtx_lock(&bridge_list_mtx);
        LIST_REMOVE(sc, sc_list);
@@ -6261,7 +6304,6 @@ bridge_bpf_input(ifnet_t ifp, struct mbuf *m)
 {
        struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp);
 
-       ASSERT(bridge_in_bsd_mode(sc));
        if (sc->sc_bpf_input) {
                if (mbuf_pkthdr_rcvif(m) != ifp) {
                        printf("%s: rcvif: 0x%llx != ifp 0x%llx\n", __func__,
@@ -6283,7 +6325,6 @@ bridge_bpf_output(ifnet_t ifp, struct mbuf *m)
 {
        struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp);
 
-       ASSERT(bridge_in_bsd_mode(sc));
        if (sc->sc_bpf_output) {
                (*sc->sc_bpf_output)(ifp, m);
        }
@@ -6603,5 +6644,3 @@ done:
        }
        return error;
 }
-
-
index 88bbab70726ab6b118ec9bdd8151df9230571cad..06baffe5eed3c7cfc6a46b9601c939912b358b58 100644 (file)
 #include <net/if_media.h>
 #include <net/ether_if_module.h>
 
+static boolean_t
+is_power_of_two(unsigned int val)
+{
+       return (val & (val - 1)) == 0;
+}
+
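is_power_of_two() relies on the classic bit trick: a power of two has exactly one bit set, and subtracting one flips that bit and every bit below it, so the AND comes out zero. A few worked cases:

    is_power_of_two(512);   /* 0x200 & 0x1ff == 0x000 -> TRUE  */
    is_power_of_two(768);   /* 0x300 & 0x2ff == 0x200 -> FALSE */
    is_power_of_two(0);     /* 0 & 0xffffffff == 0    -> TRUE, beware */

Zero slips through the test; the buflet-size handler below happens to exclude it anyway via its minimum-value check.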
 #define FAKE_ETHER_NAME         "feth"
 
 SYSCTL_DECL(_net_link);
@@ -111,6 +117,204 @@ static int if_fake_wmm_mode = 0;
 SYSCTL_INT(_net_link_fake, OID_AUTO, wmm_mode, CTLFLAG_RW | CTLFLAG_LOCKED,
     &if_fake_wmm_mode, 0, "Fake interface in 802.11 WMM mode");
 
+static int if_fake_multibuflet = 0;
+SYSCTL_INT(_net_link_fake, OID_AUTO, multibuflet, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &if_fake_multibuflet, 0, "Fake interface using multi-buflet packets");
+
+static int if_fake_copypkt_mode = 0;
+SYSCTL_INT(_net_link_fake, OID_AUTO, copypkt_mode, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &if_fake_copypkt_mode, 0, "Fake interface copying packet to peer");
+
+/* sysctl net.link.fake.tx_headroom */
+#define FETH_TX_HEADROOM_MAX      32
+static unsigned int if_fake_tx_headroom = 0;
+
+static int
+feth_tx_headroom_sysctl SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       unsigned int new_value;
+       int changed;
+       int error;
+
+       error = sysctl_io_number(req, if_fake_tx_headroom,
+           sizeof(if_fake_tx_headroom), &new_value, &changed);
+       if (error == 0 && changed != 0) {
+               if (new_value > FETH_TX_HEADROOM_MAX ||
+                   (new_value % 8) != 0) {
+                       return EINVAL;
+               }
+               if_fake_tx_headroom = new_value;
+       }
+       return 0;
+}
+
+SYSCTL_PROC(_net_link_fake, OID_AUTO, tx_headroom,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0, feth_tx_headroom_sysctl, "IU", "Fake ethernet Tx headroom");
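This handler shows the read-validate-commit shape used by all the writable tunables in this file: sysctl_io_number() copies in the proposed value and reports via changed whether this is a write, and only a multiple of 8 no larger than FETH_TX_HEADROOM_MAX (32) is committed. For example, sysctl net.link.fake.tx_headroom=16 succeeds, while a value of 20 is rejected with EINVAL.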
+
+
+/* sysctl net.link.fake.max_mtu */
+#define FETH_MAX_MTU_DEFAULT    2048
+#define FETH_MAX_MTU_MAX        ((16 * 1024) - ETHER_HDR_LEN)
+
+static unsigned int if_fake_max_mtu = FETH_MAX_MTU_DEFAULT;
+
+/* sysctl net.link.fake.buflet_size */
+#define FETH_BUFLET_SIZE_MIN            512
+#define FETH_BUFLET_SIZE_MAX            2048
+
+static unsigned int if_fake_buflet_size = FETH_BUFLET_SIZE_MIN;
+
+static int
+feth_max_mtu_sysctl SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       unsigned int new_value;
+       int changed;
+       int error;
+
+       error = sysctl_io_number(req, if_fake_max_mtu,
+           sizeof(if_fake_max_mtu), &new_value, &changed);
+       if (error == 0 && changed != 0) {
+               if (new_value > FETH_MAX_MTU_MAX ||
+                   new_value < ETHERMTU ||
+                   new_value <= if_fake_buflet_size) {
+                       return EINVAL;
+               }
+               if_fake_max_mtu = new_value;
+       }
+       return 0;
+}
+
+SYSCTL_PROC(_net_link_fake, OID_AUTO, max_mtu,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0, feth_max_mtu_sysctl, "IU", "Fake interface maximum MTU");
+
+static int
+feth_buflet_size_sysctl SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       unsigned int new_value;
+       int changed;
+       int error;
+
+       error = sysctl_io_number(req, if_fake_buflet_size,
+           sizeof(if_fake_buflet_size), &new_value, &changed);
+       if (error == 0 && changed != 0) {
+               /* must be a power of 2 between min and max */
+               if (new_value > FETH_BUFLET_SIZE_MAX ||
+                   new_value < FETH_BUFLET_SIZE_MIN ||
+                   !is_power_of_two(new_value) ||
+                   new_value >= if_fake_max_mtu) {
+                       return EINVAL;
+               }
+               if_fake_buflet_size = new_value;
+       }
+       return 0;
+}
+
+SYSCTL_PROC(_net_link_fake, OID_AUTO, buflet_size,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0, feth_buflet_size_sysctl, "IU", "Fake interface buflet size");
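buflet_size and max_mtu are deliberately interlocked: the buflet size must stay strictly below the maximum MTU, and the maximum MTU strictly above the buflet size. With the defaults (512 and 2048), raising buflet_size to its 2048 ceiling therefore requires first raising max_mtu above 2048; in the other order the handler returns EINVAL.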
+
+static unsigned int if_fake_user_access = 0;
+
+static int
+feth_user_access_sysctl SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       unsigned int new_value;
+       int changed;
+       int error;
+
+       error = sysctl_io_number(req, if_fake_user_access,
+           sizeof(if_fake_user_access), &new_value, &changed);
+       if (error == 0 && changed != 0) {
+               if (new_value != 0) {
+                       if (new_value != 1) {
+                               return EINVAL;
+                       }
+                       /*
+                        * copypkt mode requires a kernel-only buffer pool, so
+                        * it is incompatible with user access mode.
+                        */
+                       if (if_fake_copypkt_mode != 0) {
+                               return ENOTSUP;
+                       }
+               }
+               if_fake_user_access = new_value;
+       }
+       return 0;
+}
+
+SYSCTL_PROC(_net_link_fake, OID_AUTO, user_access,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0, feth_user_access_sysctl, "IU", "Fake interface user access");
+
+/* sysctl net.link.fake.if_adv_intvl (unit: millisecond) */
+#define FETH_IF_ADV_INTVL_MIN            10
+#define FETH_IF_ADV_INTVL_MAX            INT_MAX
+
+static int if_fake_if_adv_interval = 0; /* no interface advisory */
+static int
+feth_if_adv_interval_sysctl SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       unsigned int new_value;
+       int changed;
+       int error;
+
+       error = sysctl_io_number(req, if_fake_if_adv_interval,
+           sizeof(if_fake_if_adv_interval), &new_value, &changed);
+       if (error == 0 && changed != 0) {
+               if ((new_value != 0) && (new_value > FETH_IF_ADV_INTVL_MAX ||
+                   new_value < FETH_IF_ADV_INTVL_MIN)) {
+                       return EINVAL;
+               }
+               if_fake_if_adv_interval = new_value;
+       }
+       return 0;
+}
+
+SYSCTL_PROC(_net_link_fake, OID_AUTO, if_adv_intvl,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0,
+    feth_if_adv_interval_sysctl, "IU",
+    "Fake interface will generate interface advisories reports at the specified interval in ms");
+
+/* sysctl net.link.fake.tx_drops */
+/*
+ * Fake ethernet will drop a packet on the transmit path at the specified
+ * rate, i.e., drop one in every if_fake_tx_drops packets.
+ */
+#define FETH_TX_DROPS_MIN            0
+#define FETH_TX_DROPS_MAX            INT_MAX
+static int if_fake_tx_drops = 0; /* no packets are dropped */
+static int
+feth_fake_tx_drops_sysctl SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       unsigned int new_value;
+       int changed;
+       int error;
+
+       error = sysctl_io_number(req, if_fake_tx_drops,
+           sizeof(if_fake_tx_drops), &new_value, &changed);
+       if (error == 0 && changed != 0) {
+               if (new_value > FETH_TX_DROPS_MAX ||
+                   new_value < FETH_TX_DROPS_MIN) {
+                       return EINVAL;
+               }
+               if_fake_tx_drops = new_value;
+       }
+       return 0;
+}
+
+SYSCTL_PROC(_net_link_fake, OID_AUTO, tx_drops,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0,
+    feth_fake_tx_drops_sysctl, "IU",
+    "Fake interface will intermittently drop packets on Tx path");
+
 /**
 ** virtual ethernet structures, types
 **/
@@ -125,6 +329,8 @@ typedef uint16_t        iff_flags_t;
 #define IFF_FLAGS_BSD_MODE              0x0002
 #define IFF_FLAGS_DETACHING             0x0004
 #define IFF_FLAGS_WMM_MODE              0x0008
+#define IFF_FLAGS_MULTIBUFLETS          0x0010
+#define IFF_FLAGS_COPYPKT_MODE          0x0020
 
 
 struct if_fake {
@@ -139,6 +345,7 @@ struct if_fake {
        int                     iff_media_list[IF_FAKE_MEDIA_LIST_MAX];
        struct mbuf *           iff_pending_tx_packet;
        boolean_t               iff_start_busy;
+       unsigned int            iff_max_mtu;
 };
 
 typedef struct if_fake * if_fake_ref;
@@ -288,12 +495,35 @@ feth_unlock(void)
 }
 
 static inline int
-feth_max_mtu(void)
+get_max_mtu(int bsd_mode, unsigned int max_mtu)
 {
-       if (njcl > 0) {
-               return M16KCLBYTES - ETHER_HDR_LEN;
+       unsigned int    mtu;
+
+       if (bsd_mode != 0) {
+               mtu = (njcl > 0) ? (M16KCLBYTES - ETHER_HDR_LEN)
+                   : MBIGCLBYTES - ETHER_HDR_LEN;
+               if (mtu > max_mtu) {
+                       mtu = max_mtu;
+               }
+       } else {
+               mtu = max_mtu;
        }
-       return MBIGCLBYTES - ETHER_HDR_LEN;
+       return mtu;
+}
+
+static inline unsigned int
+feth_max_mtu(ifnet_t ifp)
+{
+       if_fake_ref     fakeif;
+       unsigned int    max_mtu = ETHERMTU;
+
+       feth_lock();
+       fakeif = ifnet_get_if_fake(ifp);
+       if (fakeif != NULL) {
+               max_mtu = fakeif->iff_max_mtu;
+       }
+       feth_unlock();
+       return max_mtu;
 }
 
 static void
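Working get_max_mtu() through with the mbuf constants (M16KCLBYTES = 16384, MBIGCLBYTES = 4096, ETHER_HDR_LEN = 14): in BSD mode the cluster-derived ceiling is 16370 bytes when 16 KB jumbo clusters are configured (njcl > 0) and 4082 otherwise, and the sysctl-supplied value clamps it further, so the default if_fake_max_mtu of 2048 produces an effective maximum of 2048 either way. Outside BSD mode the sysctl value is used as-is.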
@@ -406,6 +636,7 @@ feth_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
        if (if_fake_hwcsum != 0) {
                fakeif->iff_flags |= IFF_FLAGS_HWCSUM;
        }
+       fakeif->iff_max_mtu = get_max_mtu(if_fake_bsd_mode, if_fake_max_mtu);
 
        /* use the interface name as the unique id for ifp recycle */
        if ((unsigned int)
@@ -598,15 +829,19 @@ feth_start(ifnet_t ifp)
 
        feth_lock();
        fakeif = ifnet_get_if_fake(ifp);
+       if (fakeif == NULL) {
+               feth_unlock();
+               return;
+       }
+
        if (fakeif->iff_start_busy) {
                feth_unlock();
                printf("if_fake: start is busy\n");
                return;
        }
-       if (fakeif != NULL) {
-               peer = fakeif->iff_peer;
-               flags = fakeif->iff_flags;
-       }
+
+       peer = fakeif->iff_peer;
+       flags = fakeif->iff_flags;
 
        /* check for pending TX */
        m = fakeif->iff_pending_tx_packet;
@@ -888,7 +1123,7 @@ feth_get_drvspec(ifnet_t ifp, u_int32_t cmd, u_int32_t len,
                        break;
                }
                feth_lock();
-               fakeif = (if_fake_ref)ifnet_softc(ifp);
+               fakeif = ifnet_get_if_fake(ifp);
                if (fakeif == NULL) {
                        feth_unlock();
                        error = EOPNOTSUPP;
@@ -941,7 +1176,7 @@ feth_ioctl(ifnet_t ifp, u_long cmd, void * data)
        case SIOCGIFMEDIA32:
        case SIOCGIFMEDIA64:
                feth_lock();
-               fakeif = (if_fake_ref)ifnet_softc(ifp);
+               fakeif = ifnet_get_if_fake(ifp);
                if (fakeif == NULL) {
                        feth_unlock();
                        return EOPNOTSUPP;
@@ -973,12 +1208,13 @@ feth_ioctl(ifnet_t ifp, u_long cmd, void * data)
        case SIOCGIFDEVMTU:
                devmtu_p = &ifr->ifr_devmtu;
                devmtu_p->ifdm_current = ifnet_mtu(ifp);
-               devmtu_p->ifdm_max = feth_max_mtu();
+               devmtu_p->ifdm_max = feth_max_mtu(ifp);
                devmtu_p->ifdm_min = IF_MINMTU;
                break;
 
        case SIOCSIFMTU:
-               if (ifr->ifr_mtu > feth_max_mtu() || ifr->ifr_mtu < IF_MINMTU) {
+               if ((unsigned int)ifr->ifr_mtu > feth_max_mtu(ifp) ||
+                   ifr->ifr_mtu < IF_MINMTU) {
                        error = EINVAL;
                } else {
                        error = ifnet_set_mtu(ifp, ifr->ifr_mtu);
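
With feth_max_mtu() now per-interface, SIOCGIFDEVMTU reports each fake
interface's own ceiling and SIOCSIFMTU is validated against it. A
userspace sketch of the round trip (assumes a feth0 instance already
exists; error handling trimmed):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>

int
main(void)
{
	struct ifreq ifr;
	int max, s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0) {
		return 1;
	}
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "feth0", sizeof(ifr.ifr_name));
	if (ioctl(s, SIOCGIFDEVMTU, &ifr) == 0) {
		printf("mtu cur %d min %d max %d\n",
		    ifr.ifr_devmtu.ifdm_current,
		    ifr.ifr_devmtu.ifdm_min,
		    ifr.ifr_devmtu.ifdm_max);
		max = ifr.ifr_devmtu.ifdm_max; /* save before reusing the union */
		ifr.ifr_mtu = max;
		(void)ioctl(s, SIOCSIFMTU, &ifr); /* EINVAL if outside [min, max] */
	}
	close(s);
	return 0;
}
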
diff --git a/bsd/net/if_headless.c b/bsd/net/if_headless.c
new file mode 100644
index 0000000..f7ebb17
--- /dev/null
+++ b/bsd/net/if_headless.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+void
+if_headless_init(void)
+{
+       /* nothing here */
+}
diff --git a/bsd/net/if_ipsec.c b/bsd/net/if_ipsec.c
index eb32af709a0853e5050af750bf12eab943ad6f00..e967cad2cd73c54f5021a5fc66cfb72c80dec4a5 100644
--- a/bsd/net/if_ipsec.c
+++ b/bsd/net/if_ipsec.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -54,6 +54,7 @@
 #include <netkey/key.h>
 #include <net/pktap.h>
 #include <kern/zalloc.h>
+#include <os/log.h>
 
 #define IPSEC_NEXUS 0
 
@@ -95,7 +96,6 @@ static errno_t ipsec_proto_pre_output(ifnet_t interface, protocol_family_t proto
     char *frame_type, char *link_layer_dest);
 
 static kern_ctl_ref     ipsec_kctlref;
-static u_int32_t        ipsec_family;
 static lck_attr_t *ipsec_lck_attr;
 static lck_grp_attr_t *ipsec_lck_grp_attr;
 static lck_grp_t *ipsec_lck_grp;
@@ -116,12 +116,23 @@ SYSCTL_INT(_net_ipsec, OID_AUTO, verify_interface_creation, CTLFLAG_RW | CTLFLAG
 #define IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE 128
 #define IPSEC_IF_DEFAULT_BUF_SEG_SIZE   skmem_usr_buf_seg_size
 
-#define IPSEC_IF_MIN_RING_SIZE 16
+#define IPSEC_IF_WMM_RING_COUNT NEXUS_NUM_WMM_QUEUES
+#define IPSEC_IF_MAX_RING_COUNT IPSEC_IF_WMM_RING_COUNT
+#define IPSEC_NETIF_WMM_TX_RING_COUNT IPSEC_IF_WMM_RING_COUNT
+#define IPSEC_NETIF_WMM_RX_RING_COUNT 1
+#define IPSEC_NETIF_MAX_TX_RING_COUNT IPSEC_NETIF_WMM_TX_RING_COUNT
+#define IPSEC_NETIF_MAX_RX_RING_COUNT IPSEC_NETIF_WMM_RX_RING_COUNT
+
+#define IPSEC_IF_MIN_RING_SIZE 8
 #define IPSEC_IF_MAX_RING_SIZE 1024
 
 #define IPSEC_IF_MIN_SLOT_SIZE 1024
 #define IPSEC_IF_MAX_SLOT_SIZE 4096
 
+#define IPSEC_DEFAULT_MAX_PENDING_INPUT_COUNT 512
+
+static int if_ipsec_max_pending_input = IPSEC_DEFAULT_MAX_PENDING_INPUT_COUNT;
+
 static int sysctl_if_ipsec_ring_size SYSCTL_HANDLER_ARGS;
 static int sysctl_if_ipsec_tx_fsw_ring_size SYSCTL_HANDLER_ARGS;
 static int sysctl_if_ipsec_rx_fsw_ring_size SYSCTL_HANDLER_ARGS;
@@ -130,6 +141,7 @@ static int if_ipsec_ring_size = IPSEC_IF_DEFAULT_RING_SIZE;
 static int if_ipsec_tx_fsw_ring_size = IPSEC_IF_DEFAULT_TX_FSW_RING_SIZE;
 static int if_ipsec_rx_fsw_ring_size = IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE;
 
+SYSCTL_INT(_net_ipsec, OID_AUTO, max_pending_input, CTLFLAG_LOCKED | CTLFLAG_RW, &if_ipsec_max_pending_input, 0, "");
 SYSCTL_PROC(_net_ipsec, OID_AUTO, ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
     &if_ipsec_ring_size, IPSEC_IF_DEFAULT_RING_SIZE, &sysctl_if_ipsec_ring_size, "I", "");
 SYSCTL_PROC(_net_ipsec, OID_AUTO, tx_fsw_ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
@@ -137,17 +149,20 @@ SYSCTL_PROC(_net_ipsec, OID_AUTO, tx_fsw_ring_size, CTLTYPE_INT | CTLFLAG_LOCKED
 SYSCTL_PROC(_net_ipsec, OID_AUTO, rx_fsw_ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
     &if_ipsec_rx_fsw_ring_size, IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE, &sysctl_if_ipsec_rx_fsw_ring_size, "I", "");
 
+static int if_ipsec_debug = 0;
+SYSCTL_INT(_net_ipsec, OID_AUTO, debug, CTLFLAG_LOCKED | CTLFLAG_RW, &if_ipsec_debug, 0, "");
+
 static errno_t
 ipsec_register_nexus(void);
 
 typedef struct ipsec_nx {
        uuid_t if_provider;
        uuid_t if_instance;
-       uuid_t ms_provider;
-       uuid_t ms_instance;
-       uuid_t ms_device;
-       uuid_t ms_host;
-       uuid_t ms_agent;
+       uuid_t fsw_provider;
+       uuid_t fsw_instance;
+       uuid_t fsw_device;
+       uuid_t fsw_host;
+       uuid_t fsw_agent;
 } *ipsec_nx_t;
 
 static nexus_controller_t ipsec_ncd;
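
Both new knobs registered above are plain integers, so they are reachable
from userspace with sysctlbyname(3) (writing requires root). A sketch
that reads and raises the input backlog limit; the value 1024 is
arbitrary:

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int cur = 0, next = 1024;
	size_t len = sizeof(cur);

	if (sysctlbyname("net.ipsec.max_pending_input",
	    &cur, &len, &next, sizeof(next)) != 0) {
		perror("sysctlbyname");
		return 1;
	}
	printf("net.ipsec.max_pending_input: was %d, now %d\n", cur, next);
	return 0;
}
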
@@ -160,47 +175,97 @@ static uuid_t ipsec_kpipe_uuid;
 struct ipsec_pcb {
        TAILQ_ENTRY(ipsec_pcb)  ipsec_chain;
        kern_ctl_ref            ipsec_ctlref;
-       ifnet_t                         ipsec_ifp;
-       u_int32_t                       ipsec_unit;
-       u_int32_t                       ipsec_unique_id;
-       u_int32_t                       ipsec_flags;
-       u_int32_t                       ipsec_input_frag_size;
-       bool                            ipsec_frag_size_set;
-       int                                     ipsec_ext_ifdata_stats;
+       ifnet_t                 ipsec_ifp;
+       u_int32_t               ipsec_unit;
+       u_int32_t               ipsec_unique_id;
+       // These external flags can be set with IPSEC_OPT_FLAGS
+       u_int32_t               ipsec_external_flags;
+       // These internal flags are only used within this driver
+       u_int32_t               ipsec_internal_flags;
+       u_int32_t               ipsec_input_frag_size;
+       bool                    ipsec_frag_size_set;
+       int                     ipsec_ext_ifdata_stats;
        mbuf_svc_class_t        ipsec_output_service_class;
-       char                            ipsec_if_xname[IFXNAMSIZ];
-       char                            ipsec_unique_name[IFXNAMSIZ];
-       // PCB lock protects state fields, like ipsec_kpipe_enabled
+       char                    ipsec_if_xname[IFXNAMSIZ];
+       char                    ipsec_unique_name[IFXNAMSIZ];
+       // PCB lock protects state fields, like ipsec_kpipe_count
        decl_lck_rw_data(, ipsec_pcb_lock);
+       // lock to protect ipsec_pcb_data_move & ipsec_pcb_drainers
+       decl_lck_mtx_data(, ipsec_pcb_data_move_lock);
+       u_int32_t               ipsec_pcb_data_move; /* number of data moving contexts */
+       u_int32_t               ipsec_pcb_drainers; /* number of threads waiting to drain */
+       u_int32_t               ipsec_pcb_data_path_state; /* internal state of interface data path */
 
 #if IPSEC_NEXUS
-       lck_mtx_t                       ipsec_input_chain_lock;
+       lck_mtx_t               ipsec_input_chain_lock;
+       lck_mtx_t               ipsec_kpipe_encrypt_lock;
+       lck_mtx_t               ipsec_kpipe_decrypt_lock;
        struct mbuf *           ipsec_input_chain;
        struct mbuf *           ipsec_input_chain_last;
+       u_int32_t               ipsec_input_chain_count;
        // Input chain lock protects the list of input mbufs
        // The input chain lock must be taken AFTER the PCB lock if both are held
        struct ipsec_nx         ipsec_nx;
-       int                                     ipsec_kpipe_enabled;
-       uuid_t                          ipsec_kpipe_uuid;
-       void *                          ipsec_kpipe_rxring;
-       void *                          ipsec_kpipe_txring;
-       kern_pbufpool_t                 ipsec_kpipe_pp;
+       u_int32_t               ipsec_kpipe_count;
+       pid_t                   ipsec_kpipe_pid;
+       uuid_t                  ipsec_kpipe_uuid[IPSEC_IF_MAX_RING_COUNT];
+       void *                  ipsec_kpipe_rxring[IPSEC_IF_MAX_RING_COUNT];
+       void *                  ipsec_kpipe_txring[IPSEC_IF_MAX_RING_COUNT];
+       kern_pbufpool_t         ipsec_kpipe_pp;
+       u_int32_t               ipsec_kpipe_tx_ring_size;
+       u_int32_t               ipsec_kpipe_rx_ring_size;
 
        kern_nexus_t            ipsec_netif_nexus;
-       kern_pbufpool_t                 ipsec_netif_pp;
-       void *                          ipsec_netif_rxring;
-       void *                          ipsec_netif_txring;
-       uint64_t                        ipsec_netif_txring_size;
-
-       u_int32_t                       ipsec_slot_size;
-       u_int32_t                       ipsec_netif_ring_size;
-       u_int32_t                       ipsec_tx_fsw_ring_size;
-       u_int32_t                       ipsec_rx_fsw_ring_size;
-       bool                            ipsec_use_netif;
-       bool                            ipsec_needs_netagent;
+       kern_pbufpool_t         ipsec_netif_pp;
+       void *                  ipsec_netif_rxring[IPSEC_NETIF_MAX_RX_RING_COUNT];
+       void *                  ipsec_netif_txring[IPSEC_NETIF_MAX_TX_RING_COUNT];
+       uint64_t                ipsec_netif_txring_size;
+
+       u_int32_t               ipsec_slot_size;
+       u_int32_t               ipsec_netif_ring_size;
+       u_int32_t               ipsec_tx_fsw_ring_size;
+       u_int32_t               ipsec_rx_fsw_ring_size;
+       bool                    ipsec_use_netif;
+       bool                    ipsec_needs_netagent;
 #endif // IPSEC_NEXUS
 };
 
+/* These are internal flags not exposed outside this file */
+#define IPSEC_FLAGS_KPIPE_ALLOCATED 1
+
+/* data movement refcounting functions */
+static boolean_t ipsec_data_move_begin(struct ipsec_pcb *pcb);
+static void ipsec_data_move_end(struct ipsec_pcb *pcb);
+static void ipsec_wait_data_move_drain(struct ipsec_pcb *pcb);
+
+/* Data path states */
+#define IPSEC_PCB_DATA_PATH_READY    0x1
+
+/* Macros to set/clear/test data path states */
+#define IPSEC_SET_DATA_PATH_READY(_pcb) ((_pcb)->ipsec_pcb_data_path_state |= IPSEC_PCB_DATA_PATH_READY)
+#define IPSEC_CLR_DATA_PATH_READY(_pcb) ((_pcb)->ipsec_pcb_data_path_state &= ~IPSEC_PCB_DATA_PATH_READY)
+#define IPSEC_IS_DATA_PATH_READY(_pcb) (((_pcb)->ipsec_pcb_data_path_state & IPSEC_PCB_DATA_PATH_READY) != 0)
+
+#if IPSEC_NEXUS
+/* Macros to clear/set/test flags. */
+static inline void
+ipsec_flag_set(struct ipsec_pcb *pcb, uint32_t flag)
+{
+       pcb->ipsec_internal_flags |= flag;
+}
+static inline void
+ipsec_flag_clr(struct ipsec_pcb *pcb, uint32_t flag)
+{
+       pcb->ipsec_internal_flags &= ~flag;
+}
+
+static inline bool
+ipsec_flag_isset(struct ipsec_pcb *pcb, uint32_t flag)
+{
+       return !!(pcb->ipsec_internal_flags & flag);
+}
+#endif // IPSEC_NEXUS
+
 TAILQ_HEAD(ipsec_list, ipsec_pcb) ipsec_head;
 
 #define IPSEC_PCB_ZONE_MAX              32
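
ipsec_data_move_begin/end and ipsec_wait_data_move_drain are only
declared in this excerpt; their definitions come later in the file. A
minimal sketch of the intended semantics, built from the fields and
macros declared above — illustrative only, not the shipped code:

static boolean_t
ipsec_data_move_begin(struct ipsec_pcb *pcb)
{
	boolean_t ret = FALSE;

	lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
	if ((ret = IPSEC_IS_DATA_PATH_READY(pcb))) {
		pcb->ipsec_pcb_data_move++;     /* one more moving context */
	}
	lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
	return ret;
}

static void
ipsec_data_move_end(struct ipsec_pcb *pcb)
{
	lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
	VERIFY(pcb->ipsec_pcb_data_move > 0);
	/* wake any drainers once the last context leaves */
	if (--pcb->ipsec_pcb_data_move == 0 && pcb->ipsec_pcb_drainers > 0) {
		wakeup(&pcb->ipsec_pcb_data_move);
	}
	lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
}

static void
ipsec_wait_data_move_drain(struct ipsec_pcb *pcb)
{
	lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
	IPSEC_CLR_DATA_PATH_READY(pcb);         /* refuse new entries */
	pcb->ipsec_pcb_drainers++;
	while (pcb->ipsec_pcb_data_move != 0) {
		(void)msleep(&pcb->ipsec_pcb_data_move,
		    &pcb->ipsec_pcb_data_move_lock, PZERO, __func__, NULL);
	}
	pcb->ipsec_pcb_drainers--;
	lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
}
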
@@ -274,6 +339,14 @@ sysctl_if_ipsec_rx_fsw_ring_size SYSCTL_HANDLER_ARGS
 
        return 0;
 }
+
+
+static inline bool
+ipsec_in_wmm_mode(struct ipsec_pcb *pcb)
+{
+       return pcb->ipsec_kpipe_count == IPSEC_IF_WMM_RING_COUNT;
+}
+
 #endif // IPSEC_NEXUS
 
 errno_t
@@ -282,19 +355,12 @@ ipsec_register_control(void)
        struct kern_ctl_reg     kern_ctl;
        errno_t                         result = 0;
 
-       /* Find a unique value for our interface family */
-       result = mbuf_tag_id_find(IPSEC_CONTROL_NAME, &ipsec_family);
-       if (result != 0) {
-               printf("ipsec_register_control - mbuf_tag_id_find_internal failed: %d\n", result);
-               return result;
-       }
-
        ipsec_pcb_size = sizeof(struct ipsec_pcb);
        ipsec_pcb_zone = zinit(ipsec_pcb_size,
            IPSEC_PCB_ZONE_MAX * ipsec_pcb_size,
            0, IPSEC_PCB_ZONE_NAME);
        if (ipsec_pcb_zone == NULL) {
-               printf("ipsec_register_control - zinit(ipsec_pcb) failed");
+               os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - zinit(ipsec_pcb) failed");
                return ENOMEM;
        }
 
@@ -319,26 +385,26 @@ ipsec_register_control(void)
 
        result = ctl_register(&kern_ctl, &ipsec_kctlref);
        if (result != 0) {
-               printf("ipsec_register_control - ctl_register failed: %d\n", result);
+               os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - ctl_register failed: %d\n", result);
                return result;
        }
 
        /* Register the protocol plumbers */
-       if ((result = proto_register_plumber(PF_INET, ipsec_family,
+       if ((result = proto_register_plumber(PF_INET, IFNET_FAMILY_IPSEC,
            ipsec_attach_proto, NULL)) != 0) {
-               printf("ipsec_register_control - proto_register_plumber(PF_INET, %d) failed: %d\n",
-                   ipsec_family, result);
+               os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - proto_register_plumber(PF_INET, IFNET_FAMILY_IPSEC) failed: %d\n",
+                   result);
                ctl_deregister(ipsec_kctlref);
                return result;
        }
 
        /* Register the protocol plumbers */
-       if ((result = proto_register_plumber(PF_INET6, ipsec_family,
+       if ((result = proto_register_plumber(PF_INET6, IFNET_FAMILY_IPSEC,
            ipsec_attach_proto, NULL)) != 0) {
-               proto_unregister_plumber(PF_INET, ipsec_family);
+               proto_unregister_plumber(PF_INET, IFNET_FAMILY_IPSEC);
                ctl_deregister(ipsec_kctlref);
-               printf("ipsec_register_control - proto_register_plumber(PF_INET6, %d) failed: %d\n",
-                   ipsec_family, result);
+               os_log_error(OS_LOG_DEFAULT, "ipsec_register_control - proto_register_plumber(PF_INET6, IFNET_FAMILY_IPSEC) failed: %d\n",
+                   result);
                return result;
        }
 
@@ -449,7 +515,7 @@ ipsec_register_nexus(void)
            &dp_init, sizeof(dp_init),
            &ipsec_nx_dom_prov);
        if (err != 0) {
-               printf("%s: failed to register domain provider\n", __func__);
+               os_log_error(OS_LOG_DEFAULT, "%s: failed to register domain provider\n", __func__);
                return err;
        }
        return 0;
@@ -480,6 +546,12 @@ ipsec_nexus_connected(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 #pragma unused(nxprov, channel)
        struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
        boolean_t ok = ifnet_is_attached(pcb->ipsec_ifp, 1);
+       /* Mark the data path as ready */
+       if (ok) {
+               lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
+               IPSEC_SET_DATA_PATH_READY(pcb);
+               lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
+       }
        return ok ? 0 : ENXIO;
 }
 
@@ -487,14 +559,24 @@ static void
 ipsec_nexus_pre_disconnect(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
     kern_channel_t channel)
 {
-#pragma unused(nxprov, nexus, channel)
+#pragma unused(nxprov, channel)
+       struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
+
+       VERIFY(pcb->ipsec_kpipe_count != 0);
+
+       /* Wait until all threads in the data paths are done. */
+       ipsec_wait_data_move_drain(pcb);
 }
 
 static void
 ipsec_netif_pre_disconnect(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
     kern_channel_t channel)
 {
-#pragma unused(nxprov, nexus, channel)
+#pragma unused(nxprov, channel)
+       struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
+
+       /* Wait until all threads in the data paths are done. */
+       ipsec_wait_data_move_drain(pcb);
 }
 
 static void
@@ -516,14 +598,30 @@ ipsec_kpipe_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 {
 #pragma unused(nxprov)
 #pragma unused(channel)
-#pragma unused(ring_ctx)
        struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
+       uint8_t ring_idx;
+
+       for (ring_idx = 0; ring_idx < pcb->ipsec_kpipe_count; ring_idx++) {
+               if (!uuid_compare(channel->ch_info->cinfo_nx_uuid, pcb->ipsec_kpipe_uuid[ring_idx])) {
+                       break;
+               }
+       }
+
+       if (ring_idx == pcb->ipsec_kpipe_count) {
+               uuid_string_t uuidstr;
+               uuid_unparse(channel->ch_info->cinfo_nx_uuid, uuidstr);
+               os_log_error(OS_LOG_DEFAULT, "%s: %s cannot find channel %s\n", __func__, pcb->ipsec_if_xname, uuidstr);
+               return ENOENT;
+       }
+
+       *ring_ctx = (void *)(uintptr_t)ring_idx;
+
        if (!is_tx_ring) {
-               VERIFY(pcb->ipsec_kpipe_rxring == NULL);
-               pcb->ipsec_kpipe_rxring = ring;
+               VERIFY(pcb->ipsec_kpipe_rxring[ring_idx] == NULL);
+               pcb->ipsec_kpipe_rxring[ring_idx] = ring;
        } else {
-               VERIFY(pcb->ipsec_kpipe_txring == NULL);
-               pcb->ipsec_kpipe_txring = ring;
+               VERIFY(pcb->ipsec_kpipe_txring[ring_idx] == NULL);
+               pcb->ipsec_kpipe_txring[ring_idx] = ring;
        }
        return 0;
 }
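
The index written into *ring_ctx at init time is what the sync paths
later recover with kern_channel_ring_get_context(), so locating the
per-ring state on the hot path costs no lookup or lock. A hypothetical
helper making the round trip explicit:

static inline uint8_t
ipsec_ring_index(kern_channel_ring_t ring)
{
	/* inverse of the "*ring_ctx = (void *)(uintptr_t)ring_idx" above */
	return (uint8_t)(uintptr_t)kern_channel_ring_get_context(ring);
}
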
@@ -533,12 +631,19 @@ ipsec_kpipe_ring_fini(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
     kern_channel_ring_t ring)
 {
 #pragma unused(nxprov)
+       bool found = false;
        struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
-       if (pcb->ipsec_kpipe_rxring == ring) {
-               pcb->ipsec_kpipe_rxring = NULL;
-       } else if (pcb->ipsec_kpipe_txring == ring) {
-               pcb->ipsec_kpipe_txring = NULL;
+
+       for (unsigned int i = 0; i < pcb->ipsec_kpipe_count; i++) {
+               if (pcb->ipsec_kpipe_rxring[i] == ring) {
+                       pcb->ipsec_kpipe_rxring[i] = NULL;
+                       found = true;
+               } else if (pcb->ipsec_kpipe_txring[i] == ring) {
+                       pcb->ipsec_kpipe_txring[i] = NULL;
+                       found = true;
+               }
        }
+       VERIFY(found);
 }
 
 static errno_t
@@ -549,27 +654,38 @@ ipsec_kpipe_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 #pragma unused(flags)
        struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
 
+       if (!ipsec_data_move_begin(pcb)) {
+               os_log_info(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
+               return 0;
+       }
+
        lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
-       int channel_enabled = pcb->ipsec_kpipe_enabled;
-       if (!channel_enabled) {
+
+       if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+               ipsec_data_move_end(pcb);
                return 0;
        }
 
+       VERIFY(pcb->ipsec_kpipe_count);
+
        kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
        if (tx_slot == NULL) {
                // Nothing to write, bail
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+               ipsec_data_move_end(pcb);
                return 0;
        }
 
        // Signal the netif ring to read
-       kern_channel_ring_t rx_ring = pcb->ipsec_netif_rxring;
+       kern_channel_ring_t rx_ring = pcb->ipsec_netif_rxring[0];
        lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
 
        if (rx_ring != NULL) {
                kern_channel_notify(rx_ring, 0);
        }
+
+       ipsec_data_move_end(pcb);
        return 0;
 }
 
@@ -613,7 +729,7 @@ ipsec_encrypt_mbuf(ifnet_t interface,
                data = ipsec_state.m;
                if (error || data == NULL) {
                        if (error) {
-                               printf("ipsec_encrypt_mbuf: ipsec4_output error %d\n", error);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: ipsec4_output error %d\n", error);
                        }
                        goto ipsec_output_err;
                }
@@ -624,7 +740,7 @@ ipsec_encrypt_mbuf(ifnet_t interface,
 
                data = ipsec6_splithdr(data);
                if (data == NULL) {
-                       printf("ipsec_encrypt_mbuf: ipsec6_splithdr returned NULL\n");
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: ipsec6_splithdr returned NULL\n");
                        goto ipsec_output_err;
                }
 
@@ -645,14 +761,14 @@ ipsec_encrypt_mbuf(ifnet_t interface,
                data = ipsec_state.m;
                if (error || data == NULL) {
                        if (error) {
-                               printf("ipsec_encrypt_mbuf: ipsec6_output error %d\n", error);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: ipsec6_output error %d\n", error);
                        }
                        goto ipsec_output_err;
                }
                goto done;
        }
        default: {
-               printf("ipsec_encrypt_mbuf: Received unknown packet version %d\n", ip_version);
+               os_log_error(OS_LOG_DEFAULT, "ipsec_encrypt_mbuf: Received unknown packet version %d\n", ip_version);
                error = -1;
                goto ipsec_output_err;
        }
@@ -676,28 +792,43 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 #pragma unused(flags)
        struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
        struct kern_channel_ring_stat_increment rx_ring_stats;
+       uint8_t ring_idx = (uint8_t)(uintptr_t)kern_channel_ring_get_context(rx_ring);
+
+       if (!ipsec_data_move_begin(pcb)) {
+               os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
+               return 0;
+       }
 
        lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
 
-       int channel_enabled = pcb->ipsec_kpipe_enabled;
-       if (!channel_enabled) {
+       if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+               ipsec_data_move_end(pcb);
                return 0;
        }
 
+       VERIFY(pcb->ipsec_kpipe_count);
+       VERIFY(ring_idx <= pcb->ipsec_kpipe_count);
+
        // Reclaim user-released slots
        (void) kern_channel_reclaim(rx_ring);
 
        uint32_t avail = kern_channel_available_slot_count(rx_ring);
        if (avail == 0) {
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+               os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d no room in rx_ring\n", __func__,
+                   pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx);
+               ipsec_data_move_end(pcb);
                return 0;
        }
 
-       kern_channel_ring_t tx_ring = pcb->ipsec_netif_txring;
+       kern_channel_ring_t tx_ring = pcb->ipsec_netif_txring[ring_idx];
        if (tx_ring == NULL) {
                // Net-If TX ring not set up yet, nothing to read
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+               os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 1\n", __func__,
+                   pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx);
+               ipsec_data_move_end(pcb);
                return 0;
        }
 
@@ -710,15 +841,17 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 
        // Lock again after entering and validate
        lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
-       if (tx_ring != pcb->ipsec_netif_txring) {
+       if (tx_ring != pcb->ipsec_netif_txring[ring_idx]) {
                // Ring no longer valid
                // Unlock first, then exit ring
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
                kr_exit(tx_ring);
+               os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 2\n", __func__,
+                   pcb->ipsec_if_xname, rx_ring->ckr_name, ring_idx);
+               ipsec_data_move_end(pcb);
                return 0;
        }
 
-
        struct kern_channel_ring_stat_increment tx_ring_stats;
        bzero(&tx_ring_stats, sizeof(tx_ring_stats));
        kern_channel_slot_t tx_pslot = NULL;
@@ -728,6 +861,7 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                // Unlock first, then exit ring
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
                kr_exit(tx_ring);
+               ipsec_data_move_end(pcb);
                return 0;
        }
 
@@ -746,7 +880,7 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                kern_packet_t rx_ph = 0;
                error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
                if (__improbable(error != 0)) {
-                       printf("ipsec_kpipe_sync_rx %s: failed to allocate packet\n",
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: failed to allocate packet\n",
                            pcb->ipsec_ifp->if_xname);
                        break;
                }
@@ -783,27 +917,29 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                                error = mbuf_copyback(data, 0, length, tx_baddr, MBUF_DONTWAIT);
                                if (error == 0) {
                                        // Encrypt and send packet
+                                       lck_mtx_lock(&pcb->ipsec_kpipe_encrypt_lock);
                                        data = ipsec_encrypt_mbuf(pcb->ipsec_ifp, data);
+                                       lck_mtx_unlock(&pcb->ipsec_kpipe_encrypt_lock);
                                } else {
-                                       printf("ipsec_kpipe_sync_rx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error);
-                                       STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF);
-                                       STATS_INC(nifs, NETIF_STATS_DROPPED);
+                                       os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error);
+                                       STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
+                                       STATS_INC(nifs, NETIF_STATS_DROP);
                                        mbuf_freem(data);
                                        data = NULL;
                                }
                        } else {
-                               printf("ipsec_kpipe_sync_rx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error);
-                               STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF);
-                               STATS_INC(nifs, NETIF_STATS_DROPPED);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error);
+                               STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
+                               STATS_INC(nifs, NETIF_STATS_DROP);
                        }
                } else {
-                       printf("ipsec_kpipe_sync_rx %s - 0 length packet\n", pcb->ipsec_ifp->if_xname);
-                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s - 0 length packet\n", pcb->ipsec_ifp->if_xname);
+                       STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                       STATS_INC(nifs, NETIF_STATS_DROP);
                }
 
                if (data == NULL) {
-                       printf("ipsec_kpipe_sync_rx %s: no encrypted packet to send\n", pcb->ipsec_ifp->if_xname);
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: no encrypted packet to send\n", pcb->ipsec_ifp->if_xname);
                        kern_pbufpool_free(rx_pp, rx_ph);
                        break;
                }
@@ -813,7 +949,7 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        // Flush data
                        mbuf_freem(data);
                        kern_pbufpool_free(rx_pp, rx_ph);
-                       printf("ipsec_kpipe_sync_rx %s: encrypted packet length %zu > %u\n",
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx %s: encrypted packet length %zu > %u\n",
                            pcb->ipsec_ifp->if_xname, length, rx_pp->pp_buflet_size);
                        continue;
                }
@@ -838,8 +974,8 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
                VERIFY(error == 0);
 
-               STATS_INC(nifs, NETIF_STATS_TXPKTS);
-               STATS_INC(nifs, NETIF_STATS_TXCOPY_DIRECT);
+               STATS_INC(nifs, NETIF_STATS_TX_PACKETS);
+               STATS_INC(nifs, NETIF_STATS_TX_COPY_DIRECT);
 
                rx_ring_stats.kcrsi_slots_transferred++;
                rx_ring_stats.kcrsi_bytes_transferred += length;
@@ -868,7 +1004,7 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
        /* always reenable output */
        errno_t error = ifnet_enable_output(pcb->ipsec_ifp);
        if (error != 0) {
-               printf("ipsec_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error);
+               os_log_error(OS_LOG_DEFAULT, "ipsec_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error);
        }
 
        // Unlock first, then exit ring
@@ -879,9 +1015,33 @@ ipsec_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
        }
        kr_exit(tx_ring);
 
+       ipsec_data_move_end(pcb);
        return 0;
 }
 
+static uint8_t
+ipsec_find_tx_ring_by_svc(kern_packet_svc_class_t svc_class)
+{
+       switch (svc_class) {
+       case KPKT_SC_VO: {
+               return 0;
+       }
+       case KPKT_SC_VI: {
+               return 1;
+       }
+       case KPKT_SC_BE: {
+               return 2;
+       }
+       case KPKT_SC_BK: {
+               return 3;
+       }
+       default: {
+               VERIFY(0);
+               return 0;
+       }
+       }
+}
+
 static errno_t
 ipsec_netif_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
     kern_channel_t channel, kern_channel_ring_t ring, boolean_t is_tx_ring,
@@ -889,14 +1049,26 @@ ipsec_netif_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 {
 #pragma unused(nxprov)
 #pragma unused(channel)
-#pragma unused(ring_ctx)
        struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
+
        if (!is_tx_ring) {
-               VERIFY(pcb->ipsec_netif_rxring == NULL);
-               pcb->ipsec_netif_rxring = ring;
+               VERIFY(pcb->ipsec_netif_rxring[0] == NULL);
+               pcb->ipsec_netif_rxring[0] = ring;
        } else {
-               VERIFY(pcb->ipsec_netif_txring == NULL);
-               pcb->ipsec_netif_txring = ring;
+               uint8_t ring_idx = 0;
+               if (ipsec_in_wmm_mode(pcb)) {
+                       int err;
+                       kern_packet_svc_class_t svc_class;
+                       err = kern_channel_get_service_class(ring, &svc_class);
+                       VERIFY(err == 0);
+                       ring_idx = ipsec_find_tx_ring_by_svc(svc_class);
+                       VERIFY(ring_idx < IPSEC_IF_WMM_RING_COUNT);
+               }
+
+               *ring_ctx = (void *)(uintptr_t)ring_idx;
+
+               VERIFY(pcb->ipsec_netif_txring[ring_idx] == NULL);
+               pcb->ipsec_netif_txring[ring_idx] = ring;
        }
        return 0;
 }
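
In WMM mode each netif TX ring is pinned to one 802.11 access category,
highest priority first: ipsec_find_tx_ring_by_svc() fixes the contract
as VO=0, VI=1, BE=2, BK=3. A standalone mirror of that mapping (the enum
names here are stand-ins, not the kernel's KPKT_SC_* constants):

#include <assert.h>

enum wmm_svc { WMM_VO, WMM_VI, WMM_BE, WMM_BK };

static unsigned
tx_ring_for(enum wmm_svc svc)
{
	/* voice, video, best effort, background — in that order */
	static const unsigned ring[] = { 0, 1, 2, 3 };
	return ring[svc];
}

int
main(void)
{
	assert(tx_ring_for(WMM_VO) == 0);
	assert(tx_ring_for(WMM_BK) == 3);
	return 0;
}
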
@@ -907,11 +1079,23 @@ ipsec_netif_ring_fini(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 {
 #pragma unused(nxprov)
        struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
-       if (pcb->ipsec_netif_rxring == ring) {
-               pcb->ipsec_netif_rxring = NULL;
-       } else if (pcb->ipsec_netif_txring == ring) {
-               pcb->ipsec_netif_txring = NULL;
+       bool found = false;
+
+       for (int i = 0; i < IPSEC_NETIF_MAX_RX_RING_COUNT; i++) {
+               if (pcb->ipsec_netif_rxring[i] == ring) {
+                       pcb->ipsec_netif_rxring[i] = NULL;
+                       VERIFY(!found);
+                       found = true;
+               }
+       }
+       for (int i = 0; i < IPSEC_NETIF_MAX_TX_RING_COUNT; i++) {
+               if (pcb->ipsec_netif_txring[i] == ring) {
+                       pcb->ipsec_netif_txring[i] = NULL;
+                       VERIFY(!found);
+                       found = true;
+               }
        }
+       VERIFY(found);
 }
 
 static bool
@@ -935,12 +1119,12 @@ ipsec_netif_check_policy(mbuf_t data)
        u_int ip_version = ip->ip_v;
        switch (ip_version) {
        case 4: {
-               necp_matched_policy_id = necp_ip_output_find_policy_match(data, 0, NULL,
+               necp_matched_policy_id = necp_ip_output_find_policy_match(data, 0, NULL, NULL,
                    &necp_result, &necp_result_parameter);
                break;
        }
        case 6: {
-               necp_matched_policy_id = necp_ip6_output_find_policy_match(data, 0, NULL,
+               necp_matched_policy_id = necp_ip6_output_find_policy_match(data, 0, NULL, NULL,
                    &necp_result, &necp_result_parameter);
                break;
        }
@@ -969,6 +1153,11 @@ ipsec_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 
        struct netif_stats *nifs = &NX_NETIF_PRIVATE(nexus)->nif_stats;
 
+       if (!ipsec_data_move_begin(pcb)) {
+               os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
+               return 0;
+       }
+
        lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
 
        struct kern_channel_ring_stat_increment tx_ring_stats;
@@ -976,22 +1165,31 @@ ipsec_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
        kern_channel_slot_t tx_pslot = NULL;
        kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
 
-       STATS_INC(nifs, NETIF_STATS_TXSYNC);
+       STATS_INC(nifs, NETIF_STATS_TX_SYNC);
 
        if (tx_slot == NULL) {
                // Nothing to write, don't bother signalling
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+               ipsec_data_move_end(pcb);
                return 0;
        }
 
-       if (pcb->ipsec_kpipe_enabled) {
-               kern_channel_ring_t rx_ring = pcb->ipsec_kpipe_rxring;
+       if (pcb->ipsec_kpipe_count &&
+           ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
+               // Select the corresponding kpipe rx ring
+               uint8_t ring_idx = (uint8_t)(uintptr_t)kern_channel_ring_get_context(tx_ring);
+               VERIFY(ring_idx < IPSEC_IF_MAX_RING_COUNT);
+               kern_channel_ring_t rx_ring = pcb->ipsec_kpipe_rxring[ring_idx];
+
+               // Unlock while calling notify
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
 
                // Signal the kernel pipe ring to read
                if (rx_ring != NULL) {
                        kern_channel_notify(rx_ring, 0);
                }
+
+               ipsec_data_move_end(pcb);
                return 0;
        }
 
@@ -1032,42 +1230,42 @@ ipsec_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 
                                        // Check policy with NECP
                                        if (!ipsec_netif_check_policy(data)) {
-                                               printf("ipsec_netif_sync_tx %s - failed policy check\n", pcb->ipsec_ifp->if_xname);
-                                               STATS_INC(nifs, NETIF_STATS_DROPPED);
+                                               os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - failed policy check\n", pcb->ipsec_ifp->if_xname);
+                                               STATS_INC(nifs, NETIF_STATS_DROP);
                                                mbuf_freem(data);
                                                data = NULL;
                                        } else {
                                                // Send through encryption
                                                error = ipsec_output(pcb->ipsec_ifp, data);
                                                if (error != 0) {
-                                                       printf("ipsec_netif_sync_tx %s - ipsec_output error %d\n", pcb->ipsec_ifp->if_xname, error);
+                                                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - ipsec_output error %d\n", pcb->ipsec_ifp->if_xname, error);
                                                }
                                        }
                                } else {
-                                       printf("ipsec_netif_sync_tx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error);
-                                       STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF);
-                                       STATS_INC(nifs, NETIF_STATS_DROPPED);
+                                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error);
+                                       STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
+                                       STATS_INC(nifs, NETIF_STATS_DROP);
                                        mbuf_freem(data);
                                        data = NULL;
                                }
                        } else {
-                               printf("ipsec_netif_sync_tx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error);
-                               STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF);
-                               STATS_INC(nifs, NETIF_STATS_DROPPED);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error);
+                               STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
+                               STATS_INC(nifs, NETIF_STATS_DROP);
                        }
                } else {
-                       printf("ipsec_netif_sync_tx %s - 0 length packet\n", pcb->ipsec_ifp->if_xname);
-                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s - 0 length packet\n", pcb->ipsec_ifp->if_xname);
+                       STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                       STATS_INC(nifs, NETIF_STATS_DROP);
                }
 
                if (data == NULL) {
-                       printf("ipsec_netif_sync_tx %s: no encrypted packet to send\n", pcb->ipsec_ifp->if_xname);
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_tx %s: no encrypted packet to send\n", pcb->ipsec_ifp->if_xname);
                        break;
                }
 
-               STATS_INC(nifs, NETIF_STATS_TXPKTS);
-               STATS_INC(nifs, NETIF_STATS_TXCOPY_MBUF);
+               STATS_INC(nifs, NETIF_STATS_TX_PACKETS);
+               STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF);
 
                tx_ring_stats.kcrsi_slots_transferred++;
                tx_ring_stats.kcrsi_bytes_transferred += length;
@@ -1080,19 +1278,22 @@ ipsec_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
        }
 
        lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+       ipsec_data_move_end(pcb);
 
        return 0;
 }
 
 static errno_t
-ipsec_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
-    kern_channel_ring_t ring, __unused uint32_t flags)
+ipsec_netif_tx_doorbell_one(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
+    kern_channel_ring_t ring, uint32_t flags, uint8_t ring_idx)
 {
 #pragma unused(nxprov)
        struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
        boolean_t more = false;
        errno_t rc = 0;
 
+       VERIFY((flags & KERN_NEXUS_TXDOORBELLF_ASYNC_REFILL) == 0);
+
        /*
         * Refill and sync the ring; we may be racing against another thread doing
         * an RX sync that also wants to do kr_enter(), and so use the blocking
@@ -1100,26 +1301,35 @@ ipsec_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
         */
        rc = kern_channel_tx_refill_canblock(ring, UINT32_MAX, UINT32_MAX, true, &more);
        if (rc != 0 && rc != EAGAIN && rc != EBUSY) {
-               printf("%s, tx refill failed %d\n", __func__, rc);
+               os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s tx refill failed %d\n", __func__,
+                   pcb->ipsec_if_xname, ring->ckr_name, rc);
        }
 
        (void) kr_enter(ring, TRUE);
        lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
+       if (ring != pcb->ipsec_netif_txring[ring_idx]) {
+               // ring no longer valid
+               lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+               kr_exit(ring);
+               os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 3\n", __func__,
+                   pcb->ipsec_if_xname, ring->ckr_name, ring_idx);
+               return ENXIO;
+       }
 
-       if (pcb->ipsec_kpipe_enabled) {
+       if (pcb->ipsec_kpipe_count) {
                uint32_t tx_available = kern_channel_available_slot_count(ring);
                if (pcb->ipsec_netif_txring_size > 0 &&
                    tx_available >= pcb->ipsec_netif_txring_size - 1) {
                        // No room left in tx ring, disable output for now
                        errno_t error = ifnet_disable_output(pcb->ipsec_ifp);
                        if (error != 0) {
-                               printf("ipsec_netif_tx_doorbell: ifnet_disable_output returned error %d\n", error);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_netif_tx_doorbell: ifnet_disable_output returned error %d\n", error);
                        }
                }
        }
 
-       if (pcb->ipsec_kpipe_enabled) {
-               kern_channel_ring_t rx_ring = pcb->ipsec_kpipe_rxring;
+       if (pcb->ipsec_kpipe_count) {
+               kern_channel_ring_t rx_ring = pcb->ipsec_kpipe_rxring[ring_idx];
 
                // Unlock while calling notify
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
@@ -1136,6 +1346,34 @@ ipsec_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
        return 0;
 }
 
+static errno_t
+ipsec_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
+    kern_channel_ring_t ring, __unused uint32_t flags)
+{
+       errno_t ret = 0;
+       struct ipsec_pcb *pcb = kern_nexus_get_context(nexus);
+
+       if (!ipsec_data_move_begin(pcb)) {
+               os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
+               return 0;
+       }
+
+       if (ipsec_in_wmm_mode(pcb)) {
+               for (uint8_t i = 0; i < IPSEC_IF_WMM_RING_COUNT; i++) {
+                       kern_channel_ring_t nring = pcb->ipsec_netif_txring[i];
+                       ret = ipsec_netif_tx_doorbell_one(nxprov, nexus, nring, flags, i);
+                       if (ret) {
+                               break;
+                       }
+               }
+       } else {
+               ret = ipsec_netif_tx_doorbell_one(nxprov, nexus, ring, flags, 0);
+       }
+
+       ipsec_data_move_end(pcb);
+       return ret;
+}
+
 static errno_t
 ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
     kern_channel_ring_t rx_ring, uint32_t flags)
@@ -1147,16 +1385,22 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 
        struct netif_stats *nifs = &NX_NETIF_PRIVATE(nexus)->nif_stats;
 
+       if (!ipsec_data_move_begin(pcb)) {
+               os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->ipsec_ifp));
+               return 0;
+       }
+
        lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
 
        // Reclaim user-released slots
        (void) kern_channel_reclaim(rx_ring);
 
-       STATS_INC(nifs, NETIF_STATS_RXSYNC);
+       STATS_INC(nifs, NETIF_STATS_RX_SYNC);
 
        uint32_t avail = kern_channel_available_slot_count(rx_ring);
        if (avail == 0) {
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+               ipsec_data_move_end(pcb);
                return 0;
        }
 
@@ -1179,13 +1423,16 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                kern_packet_t rx_ph = 0;
                errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
                if (__improbable(error != 0)) {
-                       STATS_INC(nifs, NETIF_STATS_NOMEM_PKT);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
+                       STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT);
+                       STATS_INC(nifs, NETIF_STATS_DROP);
                        lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
                        break;
                }
 
                // Advance waiting packets
+               if (pcb->ipsec_input_chain_count > 0) {
+                       pcb->ipsec_input_chain_count--;
+               }
                pcb->ipsec_input_chain = data->m_nextpkt;
                data->m_nextpkt = NULL;
                if (pcb->ipsec_input_chain == NULL) {
@@ -1199,9 +1446,9 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        // Flush data
                        mbuf_freem(data);
                        kern_pbufpool_free(rx_pp, rx_ph);
-                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                       printf("ipsec_netif_sync_rx %s: legacy decrypted packet length cannot hold IP %zu < %zu\n",
+                       STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                       STATS_INC(nifs, NETIF_STATS_DROP);
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: legacy decrypted packet length cannot hold IP %zu < %zu\n",
                            pcb->ipsec_ifp->if_xname, length, sizeof(struct ip));
                        continue;
                }
@@ -1219,7 +1466,7 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        break;
                }
                default: {
-                       printf("ipsec_netif_sync_rx %s: legacy unknown ip version %u\n",
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: legacy unknown ip version %u\n",
                            pcb->ipsec_ifp->if_xname, ip_version);
                        break;
                }
@@ -1246,9 +1493,9 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                                if (fragment_error == 0 && data != NULL) {
                                        fragment_chain = data;
                                } else {
-                                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                                       printf("ipsec_netif_sync_rx %s: failed to fragment IPv4 packet of length %zu (%d)\n",
+                                       STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                                       STATS_INC(nifs, NETIF_STATS_DROP);
+                                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv4 packet of length %zu (%d)\n",
                                            pcb->ipsec_ifp->if_xname, length, fragment_error);
                                }
                                break;
@@ -1256,25 +1503,23 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        case AF_INET6: {
                                if (length < sizeof(struct ip6_hdr)) {
                                        mbuf_freem(data);
-                                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                                       printf("ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu < %zu\n",
+                                       STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                                       STATS_INC(nifs, NETIF_STATS_DROP);
+                                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu < %zu\n",
                                            pcb->ipsec_ifp->if_xname, length, sizeof(struct ip6_hdr));
                                } else {
                                        // ip6_do_fragmentation will free the original data on success only
                                        struct ip6_hdr *ip6 = mtod(data, struct ip6_hdr *);
-                                       struct ip6_exthdrs exthdrs;
-                                       memset(&exthdrs, 0, sizeof(exthdrs));
 
                                        int fragment_error = ip6_do_fragmentation(&data, 0, pcb->ipsec_ifp, sizeof(struct ip6_hdr),
-                                           ip6, &exthdrs, fragment_mtu, ip6->ip6_nxt);
+                                           ip6, NULL, fragment_mtu, ip6->ip6_nxt, htonl(ip6_randomid()));
                                        if (fragment_error == 0 && data != NULL) {
                                                fragment_chain = data;
                                        } else {
                                                mbuf_freem(data);
-                                               STATS_INC(nifs, NETIF_STATS_BADLEN);
-                                               STATS_INC(nifs, NETIF_STATS_DROPPED);
-                                               printf("ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu (%d)\n",
+                                               STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                                               STATS_INC(nifs, NETIF_STATS_DROP);
+                                               os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: failed to fragment IPv6 packet of length %zu (%d)\n",
                                                    pcb->ipsec_ifp->if_xname, length, fragment_error);
                                        }
                                }
@@ -1283,9 +1528,9 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        default: {
                                // Cannot fragment unknown families
                                mbuf_freem(data);
-                               STATS_INC(nifs, NETIF_STATS_BADLEN);
-                               STATS_INC(nifs, NETIF_STATS_DROPPED);
-                               printf("ipsec_netif_sync_rx %s: uknown legacy decrypted packet length %zu > %u\n",
+                               STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                               STATS_INC(nifs, NETIF_STATS_DROP);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: unknown legacy decrypted packet length %zu > %u\n",
                                    pcb->ipsec_ifp->if_xname, length, rx_pp->pp_buflet_size);
                                break;
                        }
@@ -1299,9 +1544,11 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                                } else {
                                        pcb->ipsec_input_chain = fragment_chain;
                                }
+                               pcb->ipsec_input_chain_count++;
                                while (fragment_chain->m_nextpkt) {
                                        VERIFY(fragment_chain != fragment_chain->m_nextpkt);
                                        fragment_chain = fragment_chain->m_nextpkt;
+                                       pcb->ipsec_input_chain_count++;
                                }
                                pcb->ipsec_input_chain_last = fragment_chain;
                                lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
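
ipsec_input_chain_count gives the input mbuf chain an explicit length,
which the enqueue side can bound against the net.ipsec.max_pending_input
sysctl added above. The enqueue path itself is outside this excerpt; a
hypothetical guard under those assumptions:

static errno_t
ipsec_enqueue_input_sketch(struct ipsec_pcb *pcb, mbuf_t data)
{
	lck_mtx_lock(&pcb->ipsec_input_chain_lock);
	if (pcb->ipsec_input_chain_count >=
	    (u_int32_t)if_ipsec_max_pending_input) {
		lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
		mbuf_freem(data);               /* drop rather than queue */
		return ENOSPC;
	}
	if (pcb->ipsec_input_chain_last != NULL) {
		pcb->ipsec_input_chain_last->m_nextpkt = data;
	} else {
		pcb->ipsec_input_chain = data;
	}
	pcb->ipsec_input_chain_last = data;
	pcb->ipsec_input_chain_count++;
	lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
	return 0;
}
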
@@ -1330,17 +1577,15 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                VERIFY(error == 0);
                error = kern_buflet_set_data_length(rx_buf, length);
                VERIFY(error == 0);
-               error = kern_packet_set_link_header_offset(rx_ph, 0);
-               VERIFY(error == 0);
-               error = kern_packet_set_network_header_offset(rx_ph, 0);
+               error = kern_packet_set_headroom(rx_ph, 0);
                VERIFY(error == 0);
                error = kern_packet_finalize(rx_ph);
                VERIFY(error == 0);
                error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
                VERIFY(error == 0);
 
-               STATS_INC(nifs, NETIF_STATS_RXPKTS);
-               STATS_INC(nifs, NETIF_STATS_RXCOPY_MBUF);
+               STATS_INC(nifs, NETIF_STATS_RX_PACKETS);
+               STATS_INC(nifs, NETIF_STATS_RX_COPY_MBUF);
                bpf_tap_packet_in(pcb->ipsec_ifp, DLT_RAW, rx_ph, NULL, 0);
 
                rx_ring_stats.kcrsi_slots_transferred++;
@@ -1357,234 +1602,244 @@ ipsec_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL);
        }
 
-       struct kern_channel_ring_stat_increment tx_ring_stats;
-       bzero(&tx_ring_stats, sizeof(tx_ring_stats));
-       kern_channel_ring_t tx_ring = pcb->ipsec_kpipe_txring;
-       kern_channel_slot_t tx_pslot = NULL;
-       kern_channel_slot_t tx_slot = NULL;
-       if (tx_ring == NULL) {
-               // Net-If TX ring not set up yet, nothing to read
-               goto done;
-       }
-
-
-       // Unlock ipsec before entering ring
-       lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+       for (uint8_t ring_idx = 0; ring_idx < pcb->ipsec_kpipe_count; ring_idx++) {
+               struct kern_channel_ring_stat_increment tx_ring_stats;
+               bzero(&tx_ring_stats, sizeof(tx_ring_stats));
+               kern_channel_ring_t tx_ring = pcb->ipsec_kpipe_txring[ring_idx];
+               kern_channel_slot_t tx_pslot = NULL;
+               kern_channel_slot_t tx_slot = NULL;
+               if (tx_ring == NULL) {
+                       // Net-If TX ring not set up yet, nothing to read
+                       goto done;
+               }
 
-       (void)kr_enter(tx_ring, TRUE);
 
-       // Lock again after entering and validate
-       lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
+               // Unlock ipsec before entering ring
+               lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
 
-       if (tx_ring != pcb->ipsec_kpipe_txring) {
-               goto done;
-       }
+               (void)kr_enter(tx_ring, TRUE);
 
-       tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
-       if (tx_slot == NULL) {
-               // Nothing to read, don't bother signalling
-               goto done;
-       }
+               // Lock again after entering and validate
+               lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
 
-       while (rx_slot != NULL && tx_slot != NULL) {
-               size_t length = 0;
-               mbuf_t data = NULL;
-               errno_t error = 0;
-               uint32_t af;
+               if (tx_ring != pcb->ipsec_kpipe_txring[ring_idx]) {
+                       goto done;
+               }
 
-               // Allocate rx packet
-               kern_packet_t rx_ph = 0;
-               error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
-               if (__improbable(error != 0)) {
-                       STATS_INC(nifs, NETIF_STATS_NOMEM_PKT);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                       break;
+               tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
+               if (tx_slot == NULL) {
+                       // Nothing to read, don't bother signalling
+                       goto done;
                }
 
-               kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot);
+               while (rx_slot != NULL && tx_slot != NULL) {
+                       size_t length = 0;
+                       mbuf_t data = NULL;
+                       errno_t error = 0;
+                       uint32_t af;
+
+                       // Allocate rx packet
+                       kern_packet_t rx_ph = 0;
+                       error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
+                       if (__improbable(error != 0)) {
+                               STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT);
+                               STATS_INC(nifs, NETIF_STATS_DROP);
+                               break;
+                       }
 
-               // Advance TX ring
-               tx_pslot = tx_slot;
-               tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL);
+                       kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot);
 
-               if (tx_ph == 0) {
-                       kern_pbufpool_free(rx_pp, rx_ph);
-                       continue;
-               }
+                       // Advance TX ring
+                       tx_pslot = tx_slot;
+                       tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL);
 
-               kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL);
-               VERIFY(tx_buf != NULL);
-               uint8_t *tx_baddr = kern_buflet_get_object_address(tx_buf);
-               VERIFY(tx_baddr != 0);
-               tx_baddr += kern_buflet_get_data_offset(tx_buf);
+                       if (tx_ph == 0) {
+                               kern_pbufpool_free(rx_pp, rx_ph);
+                               continue;
+                       }
 
-               length = MIN(kern_packet_get_data_length(tx_ph),
-                   pcb->ipsec_slot_size);
+                       kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL);
+                       VERIFY(tx_buf != NULL);
+                       uint8_t *tx_baddr = kern_buflet_get_object_address(tx_buf);
+                       VERIFY(tx_baddr != 0);
+                       tx_baddr += kern_buflet_get_data_offset(tx_buf);
 
-               // Increment TX stats
-               tx_ring_stats.kcrsi_slots_transferred++;
-               tx_ring_stats.kcrsi_bytes_transferred += length;
+                       length = MIN(kern_packet_get_data_length(tx_ph),
+                           pcb->ipsec_slot_size);
 
-               if (length >= sizeof(struct ip)) {
-                       error = mbuf_gethdr(MBUF_DONTWAIT, MBUF_TYPE_HEADER, &data);
-                       if (error == 0) {
-                               error = mbuf_copyback(data, 0, length, tx_baddr, MBUF_DONTWAIT);
+                       // Increment TX stats
+                       tx_ring_stats.kcrsi_slots_transferred++;
+                       tx_ring_stats.kcrsi_bytes_transferred += length;
+
+                       if (length >= sizeof(struct ip)) {
+                               error = mbuf_gethdr(MBUF_DONTWAIT, MBUF_TYPE_HEADER, &data);
                                if (error == 0) {
-                                       struct ip *ip = mtod(data, struct ip *);
-                                       u_int ip_version = ip->ip_v;
-                                       switch (ip_version) {
-                                       case 4: {
-                                               af = AF_INET;
-                                               ip->ip_len = ntohs(ip->ip_len) - sizeof(struct ip);
-                                               ip->ip_off = ntohs(ip->ip_off);
-
-                                               if (length < ip->ip_len) {
-                                                       printf("ipsec_netif_sync_rx %s: IPv4 packet length too short (%zu < %u)\n",
-                                                           pcb->ipsec_ifp->if_xname, length, ip->ip_len);
-                                                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                                                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                                                       mbuf_freem(data);
-                                                       data = NULL;
-                                               } else {
-                                                       data = esp4_input_extended(data, sizeof(struct ip), pcb->ipsec_ifp);
+                                       error = mbuf_copyback(data, 0, length, tx_baddr, MBUF_DONTWAIT);
+                                       if (error == 0) {
+                                               lck_mtx_lock(&pcb->ipsec_kpipe_decrypt_lock);
+                                               struct ip *ip = mtod(data, struct ip *);
+                                               u_int ip_version = ip->ip_v;
+                                               switch (ip_version) {
+                                               case 4: {
+                                                       af = AF_INET;
+                                                       ip->ip_len = ntohs(ip->ip_len) - sizeof(struct ip);
+                                                       ip->ip_off = ntohs(ip->ip_off);
+
+                                                       if (length < ip->ip_len) {
+                                                               os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: IPv4 packet length too short (%zu < %u)\n",
+                                                                   pcb->ipsec_ifp->if_xname, length, ip->ip_len);
+                                                               STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                                                               STATS_INC(nifs, NETIF_STATS_DROP);
+                                                               mbuf_freem(data);
+                                                               data = NULL;
+                                                       } else {
+                                                               data = esp4_input_extended(data, sizeof(struct ip), pcb->ipsec_ifp);
+                                                       }
+                                                       break;
                                                }
-                                               break;
-                                       }
-                                       case 6: {
-                                               if (length < sizeof(struct ip6_hdr)) {
-                                                       printf("ipsec_netif_sync_rx %s: IPv6 packet length too short for header %zu\n",
-                                                           pcb->ipsec_ifp->if_xname, length);
-                                                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                                                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                                                       mbuf_freem(data);
-                                                       data = NULL;
-                                               } else {
-                                                       af = AF_INET6;
-                                                       struct ip6_hdr *ip6 = mtod(data, struct ip6_hdr *);
-                                                       const size_t ip6_len = sizeof(*ip6) + ntohs(ip6->ip6_plen);
-                                                       if (length < ip6_len) {
-                                                               printf("ipsec_netif_sync_rx %s: IPv6 packet length too short (%zu < %zu)\n",
-                                                                   pcb->ipsec_ifp->if_xname, length, ip6_len);
-                                                               STATS_INC(nifs, NETIF_STATS_BADLEN);
-                                                               STATS_INC(nifs, NETIF_STATS_DROPPED);
+                                               case 6: {
+                                                       if (length < sizeof(struct ip6_hdr)) {
+                                                               os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: IPv6 packet length too short for header %zu\n",
+                                                                   pcb->ipsec_ifp->if_xname, length);
+                                                               STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                                                               STATS_INC(nifs, NETIF_STATS_DROP);
                                                                mbuf_freem(data);
                                                                data = NULL;
                                                        } else {
-                                                               int offset = sizeof(struct ip6_hdr);
-                                                               esp6_input_extended(&data, &offset, ip6->ip6_nxt, pcb->ipsec_ifp);
+                                                               af = AF_INET6;
+                                                               struct ip6_hdr *ip6 = mtod(data, struct ip6_hdr *);
+                                                               const size_t ip6_len = sizeof(*ip6) + ntohs(ip6->ip6_plen);
+                                                               if (length < ip6_len) {
+                                                                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: IPv6 packet length too short (%zu < %zu)\n",
+                                                                           pcb->ipsec_ifp->if_xname, length, ip6_len);
+                                                                       STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                                                                       STATS_INC(nifs, NETIF_STATS_DROP);
+                                                                       mbuf_freem(data);
+                                                                       data = NULL;
+                                                               } else {
+                                                                       int offset = sizeof(struct ip6_hdr);
+                                                                       esp6_input_extended(&data, &offset, ip6->ip6_nxt, pcb->ipsec_ifp);
+                                                               }
                                                        }
+                                                       break;
                                                }
-                                               break;
-                                       }
-                                       default: {
-                                               printf("ipsec_netif_sync_rx %s: unknown ip version %u\n",
-                                                   pcb->ipsec_ifp->if_xname, ip_version);
-                                               STATS_INC(nifs, NETIF_STATS_DROPPED);
+                                               default: {
+                                                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: unknown ip version %u\n",
+                                                           pcb->ipsec_ifp->if_xname, ip_version);
+                                                       STATS_INC(nifs, NETIF_STATS_DROP);
+                                                       mbuf_freem(data);
+                                                       data = NULL;
+                                                       break;
+                                               }
+                                               }
+                                               lck_mtx_unlock(&pcb->ipsec_kpipe_decrypt_lock);
+                                       } else {
+                                               os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error);
+                                               STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
+                                               STATS_INC(nifs, NETIF_STATS_DROP);
                                                mbuf_freem(data);
                                                data = NULL;
-                                               break;
-                                       }
                                        }
                                } else {
-                                       printf("ipsec_netif_sync_rx %s - mbuf_copyback(%zu) error %d\n", pcb->ipsec_ifp->if_xname, length, error);
-                                       STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF);
-                                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                                       mbuf_freem(data);
-                                       data = NULL;
+                                       os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error);
+                                       STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
+                                       STATS_INC(nifs, NETIF_STATS_DROP);
                                }
                        } else {
-                               printf("ipsec_netif_sync_rx %s - mbuf_gethdr error %d\n", pcb->ipsec_ifp->if_xname, error);
-                               STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF);
-                               STATS_INC(nifs, NETIF_STATS_DROPPED);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s - bad packet length %zu\n", pcb->ipsec_ifp->if_xname, length);
+                               STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                               STATS_INC(nifs, NETIF_STATS_DROP);
                        }
-               } else {
-                       printf("ipsec_netif_sync_rx %s - bad packet length %zu\n", pcb->ipsec_ifp->if_xname, length);
-                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-               }
-
-               if (data == NULL) {
-                       // Failed to get decrypted data data
-                       kern_pbufpool_free(rx_pp, rx_ph);
-                       continue;
-               }
 
-               length = mbuf_pkthdr_len(data);
-               if (length > rx_pp->pp_buflet_size) {
-                       // Flush data
-                       mbuf_freem(data);
-                       kern_pbufpool_free(rx_pp, rx_ph);
-                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                       printf("ipsec_netif_sync_rx %s: decrypted packet length %zu > %u\n",
-                           pcb->ipsec_ifp->if_xname, length, rx_pp->pp_buflet_size);
-                       continue;
-               }
-
-               mbuf_pkthdr_setrcvif(data, pcb->ipsec_ifp);
-
-               // Fillout rx packet
-               kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL);
-               VERIFY(rx_buf != NULL);
-               void *rx_baddr = kern_buflet_get_object_address(rx_buf);
-               VERIFY(rx_baddr != NULL);
+                       if (data == NULL) {
+                       // Failed to get decrypted data
+                               kern_pbufpool_free(rx_pp, rx_ph);
+                               continue;
+                       }
 
-               // Copy-in data from mbuf to buflet
-               mbuf_copydata(data, 0, length, (void *)rx_baddr);
-               kern_packet_clear_flow_uuid(rx_ph);     // Zero flow id
+                       length = mbuf_pkthdr_len(data);
+                       if (length > rx_pp->pp_buflet_size) {
+                               // Flush data
+                               mbuf_freem(data);
+                               kern_pbufpool_free(rx_pp, rx_ph);
+                               STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                               STATS_INC(nifs, NETIF_STATS_DROP);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_netif_sync_rx %s: decrypted packet length %zu > %u\n",
+                                   pcb->ipsec_ifp->if_xname, length, rx_pp->pp_buflet_size);
+                               continue;
+                       }
 
-               // Finalize and attach the packet
-               error = kern_buflet_set_data_offset(rx_buf, 0);
-               VERIFY(error == 0);
-               error = kern_buflet_set_data_length(rx_buf, length);
-               VERIFY(error == 0);
-               error = kern_packet_set_link_header_offset(rx_ph, 0);
-               VERIFY(error == 0);
-               error = kern_packet_set_network_header_offset(rx_ph, 0);
-               VERIFY(error == 0);
-               error = kern_packet_finalize(rx_ph);
-               VERIFY(error == 0);
-               error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
-               VERIFY(error == 0);
+                       mbuf_pkthdr_setrcvif(data, pcb->ipsec_ifp);
+
+                       // Fill out rx packet
+                       kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL);
+                       VERIFY(rx_buf != NULL);
+                       void *rx_baddr = kern_buflet_get_object_address(rx_buf);
+                       VERIFY(rx_baddr != NULL);
+
+                       // Copy-in data from mbuf to buflet
+                       mbuf_copydata(data, 0, length, (void *)rx_baddr);
+                       kern_packet_clear_flow_uuid(rx_ph);     // Zero flow id
+
+                       // Finalize and attach the packet
+                       error = kern_buflet_set_data_offset(rx_buf, 0);
+                       VERIFY(error == 0);
+                       error = kern_buflet_set_data_length(rx_buf, length);
+                       VERIFY(error == 0);
+                       error = kern_packet_set_link_header_offset(rx_ph, 0);
+                       VERIFY(error == 0);
+                       error = kern_packet_set_network_header_offset(rx_ph, 0);
+                       VERIFY(error == 0);
+                       error = kern_packet_finalize(rx_ph);
+                       VERIFY(error == 0);
+                       error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
+                       VERIFY(error == 0);
+
+                       STATS_INC(nifs, NETIF_STATS_RX_PACKETS);
+                       STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT);
+                       bpf_tap_packet_in(pcb->ipsec_ifp, DLT_RAW, rx_ph, NULL, 0);
+
+                       rx_ring_stats.kcrsi_slots_transferred++;
+                       rx_ring_stats.kcrsi_bytes_transferred += length;
+
+                       if (!pcb->ipsec_ext_ifdata_stats) {
+                               ifnet_stat_increment_in(pcb->ipsec_ifp, 1, length, 0);
+                       }
 
-               STATS_INC(nifs, NETIF_STATS_RXPKTS);
-               STATS_INC(nifs, NETIF_STATS_RXCOPY_DIRECT);
-               bpf_tap_packet_in(pcb->ipsec_ifp, DLT_RAW, rx_ph, NULL, 0);
+                       mbuf_freem(data);
 
-               rx_ring_stats.kcrsi_slots_transferred++;
-               rx_ring_stats.kcrsi_bytes_transferred += length;
+                       rx_pslot = rx_slot;
+                       rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL);
+               }
 
-               if (!pcb->ipsec_ext_ifdata_stats) {
-                       ifnet_stat_increment_in(pcb->ipsec_ifp, 1, length, 0);
+done:
+               if (tx_pslot) {
+                       kern_channel_advance_slot(tx_ring, tx_pslot);
+                       kern_channel_increment_ring_net_stats(tx_ring, pcb->ipsec_ifp, &tx_ring_stats);
+                       (void)kern_channel_reclaim(tx_ring);
                }
 
-               mbuf_freem(data);
+               // Unlock first, then exit ring
+               lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+               if (tx_ring != NULL) {
+                       if (tx_pslot != NULL) {
+                               kern_channel_notify(tx_ring, 0);
+                       }
+                       kr_exit(tx_ring);
+               }
 
-               rx_pslot = rx_slot;
-               rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL);
+               lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
        }
 
-done:
        if (rx_pslot) {
                kern_channel_advance_slot(rx_ring, rx_pslot);
                kern_channel_increment_ring_net_stats(rx_ring, pcb->ipsec_ifp, &rx_ring_stats);
        }
 
-       if (tx_pslot) {
-               kern_channel_advance_slot(tx_ring, tx_pslot);
-               kern_channel_increment_ring_net_stats(tx_ring, pcb->ipsec_ifp, &tx_ring_stats);
-               (void)kern_channel_reclaim(tx_ring);
-       }
 
-       // Unlock first, then exit ring
        lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
-       if (tx_ring != NULL) {
-               if (tx_pslot != NULL) {
-                       kern_channel_notify(tx_ring, 0);
-               }
-               kr_exit(tx_ring);
-       }
 
+       ipsec_data_move_end(pcb);
        return 0;
 }
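Editor's note on the locking pattern in the per-kpipe loop above: each pass drops the shared PCB lock before kr_enter() (which may block), re-takes it, and re-validates that the ring is still the one published in pcb->ipsec_kpipe_txring[ring_idx] before touching any slots; the new ipsec_kpipe_decrypt_lock additionally serializes esp4_input_extended()/esp6_input_extended() across rings. A minimal userland sketch of the revalidate-after-relock idiom, with pthread primitives standing in for lck_rw_*/kr_enter and all names illustrative:

#include <pthread.h>
#include <stdbool.h>

struct ring {
	pthread_mutex_t kr_lock;        /* models kr_enter()/kr_exit() */
};

struct pcb {
	pthread_rwlock_t lock;          /* models ipsec_pcb_lock */
	struct ring *txring;            /* may be torn down concurrently */
};

/* Returns false if the ring vanished while the PCB lock was dropped.
 * Caller holds p->lock shared on entry and on return. */
static bool
sync_one_ring(struct pcb *p, struct ring *tx)
{
	pthread_rwlock_unlock(&p->lock);     /* unlock before entering ring */
	pthread_mutex_lock(&tx->kr_lock);
	pthread_rwlock_rdlock(&p->lock);     /* lock again and validate */

	bool valid = (tx == p->txring);
	if (valid) {
		/* ... transfer slots with both locks held ... */
	}

	/* Unlock first, then exit the ring, as the driver does. */
	pthread_rwlock_unlock(&p->lock);
	pthread_mutex_unlock(&tx->kr_lock);
	pthread_rwlock_rdlock(&p->lock);
	return valid;
}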
 
@@ -1622,7 +1877,7 @@ ipsec_nexus_ifattach(struct ipsec_pcb *pcb,
        err = kern_nexus_attr_create(&nxa);
        IPSEC_IF_VERIFY(err == 0);
        if (err != 0) {
-               printf("%s: kern_nexus_attr_create failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n",
                    __func__, err);
                goto failed;
        }
@@ -1638,20 +1893,45 @@ ipsec_nexus_ifattach(struct ipsec_pcb *pcb,
        err = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_SLOTS, ring_size);
        VERIFY(err == 0);
 
+       assert(err == 0);
+
+       if (ipsec_in_wmm_mode(pcb)) {
+               os_log(OS_LOG_DEFAULT, "%s: %s enabling wmm mode\n",
+                   __func__, pcb->ipsec_if_xname);
+
+               init_params->output_sched_model = IFNET_SCHED_MODEL_DRIVER_MANAGED;
+
+               err = kern_nexus_attr_set(nxa, NEXUS_ATTR_TX_RINGS,
+                   IPSEC_NETIF_WMM_TX_RING_COUNT);
+               VERIFY(err == 0);
+               err = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_RINGS,
+                   IPSEC_NETIF_WMM_RX_RING_COUNT);
+               VERIFY(err == 0);
+
+               err = kern_nexus_attr_set(nxa, NEXUS_ATTR_QMAP, NEXUS_QMAP_TYPE_WMM);
+               VERIFY(err == 0);
+       }
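The WMM branch above gives the netif multiple TX/RX rings and a driver-managed output scheduler. Purely as an illustration of the idea (the constants and mapping below are assumptions for this sketch, not xnu's actual table), one ring per 802.11 access category might look like:

#include <stdint.h>

enum wmm_ac { WMM_AC_BE = 0, WMM_AC_BK, WMM_AC_VI, WMM_AC_VO, WMM_AC_COUNT };

/* Assumed for the sketch: one netif TX ring per access category. */
#define SKETCH_WMM_TX_RING_COUNT WMM_AC_COUNT

static inline uint8_t
wmm_ac_to_tx_ring(enum wmm_ac ac)
{
	/* With IFNET_SCHED_MODEL_DRIVER_MANAGED, the driver can service
	 * each ring (access category) at its own priority. */
	return (uint8_t)ac;
}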
+
        pcb->ipsec_netif_txring_size = ring_size;
 
        bzero(&pp_init, sizeof(pp_init));
        pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION;
-       pp_init.kbi_packets = pcb->ipsec_netif_ring_size * 2;
+       pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE;
+       // Note: we need more packets than can be held in the tx and rx rings because
+       // packets can also be in the AQM queue(s)
+       pp_init.kbi_packets = pcb->ipsec_netif_ring_size * (2 * pcb->ipsec_kpipe_count + 1);
        pp_init.kbi_bufsize = pcb->ipsec_slot_size;
        pp_init.kbi_buf_seg_size = IPSEC_IF_DEFAULT_BUF_SEG_SIZE;
        pp_init.kbi_max_frags = 1;
        (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name),
            "%s", provider_name);
+       pp_init.kbi_ctx = NULL;
+       pp_init.kbi_ctx_retain = NULL;
+       pp_init.kbi_ctx_release = NULL;
 
-       err = kern_pbufpool_create(&pp_init, &pp_init, &pcb->ipsec_netif_pp, NULL);
+       err = kern_pbufpool_create(&pp_init, &pcb->ipsec_netif_pp, NULL);
        if (err != 0) {
-               printf("%s pbufbool create failed, error %d\n", __func__, err);
+               os_log_error(OS_LOG_DEFAULT, "%s pbufpool create failed, error %d\n", __func__, err);
                goto failed;
        }
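Worked example of the pool sizing in this hunk: the netif pool now holds ring_size * (2 * kpipe_count + 1) packets, i.e. a TX and an RX ring's worth per kernel pipe plus one extra ring's worth for packets sitting in the AQM queue(s). A standalone restatement with illustrative numbers:

#include <assert.h>
#include <stdint.h>

static uint32_t
netif_pool_packets(uint32_t ring_size, uint32_t kpipe_count)
{
	return ring_size * (2 * kpipe_count + 1);
}

int
main(void)
{
	/* e.g. 128-slot rings with two kernel pipes -> a 640-packet pool */
	assert(netif_pool_packets(128, 2) == 640);
	return 0;
}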
 
@@ -1664,7 +1944,7 @@ ipsec_nexus_ifattach(struct ipsec_pcb *pcb,
            &pcb->ipsec_nx.if_provider);
        IPSEC_IF_VERIFY(err == 0);
        if (err != 0) {
-               printf("%s register provider failed, error %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s register provider failed, error %d\n",
                    __func__, err);
                goto failed;
        }
@@ -1684,7 +1964,7 @@ ipsec_nexus_ifattach(struct ipsec_pcb *pcb,
            ifp);
        IPSEC_IF_VERIFY(err == 0);
        if (err != 0) {
-               printf("%s alloc_net_provider_instance failed, %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s alloc_net_provider_instance failed, %d\n",
                    __func__, err);
                kern_nexus_controller_deregister_provider(controller,
                    pcb->ipsec_nx.if_provider);
@@ -1713,7 +1993,7 @@ ipsec_detach_provider_and_instance(uuid_t provider, uuid_t instance)
                err = kern_nexus_controller_free_provider_instance(controller,
                    instance);
                if (err != 0) {
-                       printf("%s free_provider_instance failed %d\n",
+                       os_log_error(OS_LOG_DEFAULT, "%s free_provider_instance failed %d\n",
                            __func__, err);
                }
                uuid_clear(instance);
@@ -1722,7 +2002,7 @@ ipsec_detach_provider_and_instance(uuid_t provider, uuid_t instance)
                err = kern_nexus_controller_deregister_provider(controller,
                    provider);
                if (err != 0) {
-                       printf("%s deregister_provider %d\n", __func__, err);
+                       os_log_error(OS_LOG_DEFAULT, "%s deregister_provider %d\n", __func__, err);
                }
                uuid_clear(provider);
        }
@@ -1736,30 +2016,30 @@ ipsec_nexus_detach(struct ipsec_pcb *pcb)
        nexus_controller_t controller = kern_nexus_shared_controller();
        errno_t err;
 
-       if (!uuid_is_null(nx->ms_host)) {
+       if (!uuid_is_null(nx->fsw_host)) {
                err = kern_nexus_ifdetach(controller,
-                   nx->ms_instance,
-                   nx->ms_host);
+                   nx->fsw_instance,
+                   nx->fsw_host);
                if (err != 0) {
-                       printf("%s: kern_nexus_ifdetach ms host failed %d\n",
+                       os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_ifdetach ms host failed %d\n",
                            __func__, err);
                }
        }
 
-       if (!uuid_is_null(nx->ms_device)) {
+       if (!uuid_is_null(nx->fsw_device)) {
                err = kern_nexus_ifdetach(controller,
-                   nx->ms_instance,
-                   nx->ms_device);
+                   nx->fsw_instance,
+                   nx->fsw_device);
                if (err != 0) {
-                       printf("%s: kern_nexus_ifdetach ms device failed %d\n",
+                       os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_ifdetach ms device failed %d\n",
                            __func__, err);
                }
        }
 
        ipsec_detach_provider_and_instance(nx->if_provider,
            nx->if_instance);
-       ipsec_detach_provider_and_instance(nx->ms_provider,
-           nx->ms_instance);
+       ipsec_detach_provider_and_instance(nx->fsw_provider,
+           nx->fsw_instance);
 
        if (pcb->ipsec_netif_pp != NULL) {
                kern_pbufpool_destroy(pcb->ipsec_netif_pp);
@@ -1770,7 +2050,7 @@ ipsec_nexus_detach(struct ipsec_pcb *pcb)
 
 static errno_t
 ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb,
-    uint32_t subtype, const char *type_name,
+    const char *type_name,
     const char *ifname,
     uuid_t *provider, uuid_t *instance)
 {
@@ -1781,11 +2061,11 @@ ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb,
        struct kern_nexus_init init;
        nexus_name_t    provider_name;
 
-       err = kern_nexus_get_builtin_domain_provider(NEXUS_TYPE_FLOW_SWITCH,
+       err = kern_nexus_get_default_domain_provider(NEXUS_TYPE_FLOW_SWITCH,
            &dom_prov);
        IPSEC_IF_VERIFY(err == 0);
        if (err != 0) {
-               printf("%s can't get %s provider, error %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s can't get %s provider, error %d\n",
                    __func__, type_name, err);
                goto failed;
        }
@@ -1793,14 +2073,11 @@ ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb,
        err = kern_nexus_attr_create(&attr);
        IPSEC_IF_VERIFY(err == 0);
        if (err != 0) {
-               printf("%s: kern_nexus_attr_create failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n",
                    __func__, err);
                goto failed;
        }
 
-       err = kern_nexus_attr_set(attr, NEXUS_ATTR_EXTENSIONS, subtype);
-       VERIFY(err == 0);
-
        uint64_t slot_buffer_size = pcb->ipsec_slot_size;
        err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, slot_buffer_size);
        VERIFY(err == 0);
@@ -1826,7 +2103,7 @@ ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb,
        attr = NULL;
        IPSEC_IF_VERIFY(err == 0);
        if (err != 0) {
-               printf("%s register %s provider failed, error %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s register %s provider failed, error %d\n",
                    __func__, type_name, err);
                goto failed;
        }
@@ -1838,7 +2115,7 @@ ipsec_create_fs_provider_and_instance(struct ipsec_pcb *pcb,
            instance, &init);
        IPSEC_IF_VERIFY(err == 0);
        if (err != 0) {
-               printf("%s alloc_provider_instance %s failed, %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s alloc_provider_instance %s failed, %d\n",
                    __func__, type_name, err);
                kern_nexus_controller_deregister_provider(controller,
                    *provider);
@@ -1849,62 +2126,56 @@ failed:
 }
 
 static errno_t
-ipsec_multistack_attach(struct ipsec_pcb *pcb)
+ipsec_flowswitch_attach(struct ipsec_pcb *pcb)
 {
        nexus_controller_t controller = kern_nexus_shared_controller();
        errno_t err = 0;
        ipsec_nx_t nx = &pcb->ipsec_nx;
 
-       // Allocate multistack flowswitch
+       // Allocate flowswitch
        err = ipsec_create_fs_provider_and_instance(pcb,
-           NEXUS_EXTENSION_FSW_TYPE_MULTISTACK,
-           "multistack",
+           "flowswitch",
            pcb->ipsec_ifp->if_xname,
-           &nx->ms_provider,
-           &nx->ms_instance);
+           &nx->fsw_provider,
+           &nx->fsw_instance);
        if (err != 0) {
-               printf("%s: failed to create bridge provider and instance\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: failed to create bridge provider and instance\n",
                    __func__);
                goto failed;
        }
 
-       // Attach multistack to device port
-       err = kern_nexus_ifattach(controller, nx->ms_instance,
+       // Attach flowswitch to device port
+       err = kern_nexus_ifattach(controller, nx->fsw_instance,
            NULL, nx->if_instance,
-           FALSE, &nx->ms_device);
+           FALSE, &nx->fsw_device);
        if (err != 0) {
-               printf("%s kern_nexus_ifattach ms device %d\n", __func__, err);
+               os_log_error(OS_LOG_DEFAULT, "%s kern_nexus_ifattach ms device %d\n", __func__, err);
                goto failed;
        }
 
-       // Attach multistack to host port
-       err = kern_nexus_ifattach(controller, nx->ms_instance,
+       // Attach flowswitch to host port
+       err = kern_nexus_ifattach(controller, nx->fsw_instance,
            NULL, nx->if_instance,
-           TRUE, &nx->ms_host);
+           TRUE, &nx->fsw_host);
        if (err != 0) {
-               printf("%s kern_nexus_ifattach ms host %d\n", __func__, err);
+               os_log_error(OS_LOG_DEFAULT, "%s kern_nexus_ifattach ms host %d\n", __func__, err);
                goto failed;
        }
 
        // Extract the agent UUID and save for later
-       struct kern_nexus *multistack_nx = nx_find(nx->ms_instance, false);
-       if (multistack_nx != NULL) {
-               struct nx_flowswitch *flowswitch = NX_FSW_PRIVATE(multistack_nx);
+       struct kern_nexus *flowswitch_nx = nx_find(nx->fsw_instance, false);
+       if (flowswitch_nx != NULL) {
+               struct nx_flowswitch *flowswitch = NX_FSW_PRIVATE(flowswitch_nx);
                if (flowswitch != NULL) {
                        FSW_RLOCK(flowswitch);
-                       struct fsw_ms_context *ms_context = (struct fsw_ms_context *)flowswitch->fsw_ops_private;
-                       if (ms_context != NULL) {
-                               uuid_copy(nx->ms_agent, ms_context->mc_agent_uuid);
-                       } else {
-                               printf("ipsec_multistack_attach - fsw_ms_context is NULL\n");
-                       }
+                       uuid_copy(nx->fsw_agent, flowswitch->fsw_agent_uuid);
                        FSW_UNLOCK(flowswitch);
                } else {
-                       printf("ipsec_multistack_attach - flowswitch is NULL\n");
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_flowswitch_attach - flowswitch is NULL\n");
                }
-               nx_release(multistack_nx);
+               nx_release(flowswitch_nx);
        } else {
-               printf("ipsec_multistack_attach - unable to find multistack nexus\n");
+               os_log_error(OS_LOG_DEFAULT, "ipsec_flowswitch_attach - unable to find flowswitch nexus\n");
        }
 
        return 0;
@@ -1914,7 +2185,7 @@ failed:
 
        errno_t detach_error = 0;
        if ((detach_error = ifnet_detach(pcb->ipsec_ifp)) != 0) {
-               panic("ipsec_multistack_attach - ifnet_detach failed: %d\n", detach_error);
+               panic("ipsec_flowswitch_attach - ifnet_detach failed: %d\n", detach_error);
                /* NOT REACHED */
        }
 
@@ -1924,7 +2195,7 @@ failed:
 #pragma mark Kernel Pipe Nexus
 
 static errno_t
-ipsec_register_kernel_pipe_nexus(void)
+ipsec_register_kernel_pipe_nexus(struct ipsec_pcb *pcb)
 {
        nexus_attr_t nxa = NULL;
        errno_t result;
@@ -1937,16 +2208,16 @@ ipsec_register_kernel_pipe_nexus(void)
 
        result = kern_nexus_controller_create(&ipsec_ncd);
        if (result) {
-               printf("%s: kern_nexus_controller_create failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_controller_create failed: %d\n",
                    __FUNCTION__, result);
                goto done;
        }
 
        uuid_t dom_prov;
-       result = kern_nexus_get_builtin_domain_provider(
+       result = kern_nexus_get_default_domain_provider(
                NEXUS_TYPE_KERNEL_PIPE, &dom_prov);
        if (result) {
-               printf("%s: kern_nexus_get_builtin_domain_provider failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_get_default_domain_provider failed: %d\n",
                    __FUNCTION__, result);
                goto done;
        }
@@ -1969,7 +2240,7 @@ ipsec_register_kernel_pipe_nexus(void)
 
        result = kern_nexus_attr_create(&nxa);
        if (result) {
-               printf("%s: kern_nexus_attr_create failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n",
                    __FUNCTION__, result);
                goto done;
        }
@@ -1979,9 +2250,19 @@ ipsec_register_kernel_pipe_nexus(void)
        VERIFY(result == 0);
 
        // Reset ring size for kernel pipe nexus to limit memory usage
-       uint64_t ring_size = if_ipsec_ring_size;
+       // Note: It's better to have fewer slots on the kpipe TX ring than the netif
+       // so back pressure is applied at the AQM layer
+       uint64_t ring_size =
+           pcb->ipsec_kpipe_tx_ring_size != 0 ? pcb->ipsec_kpipe_tx_ring_size :
+           pcb->ipsec_netif_ring_size != 0 ? pcb->ipsec_netif_ring_size :
+           if_ipsec_ring_size;
        result = kern_nexus_attr_set(nxa, NEXUS_ATTR_TX_SLOTS, ring_size);
        VERIFY(result == 0);
+
+       ring_size =
+           pcb->ipsec_kpipe_rx_ring_size != 0 ? pcb->ipsec_kpipe_rx_ring_size :
+           pcb->ipsec_netif_ring_size != 0 ? pcb->ipsec_netif_ring_size :
+           if_ipsec_ring_size;
        result = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_SLOTS, ring_size);
        VERIFY(result == 0);
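The two ring-size assignments above are three-level fallbacks: an explicit per-pcb kpipe ring size wins, then the pcb's netif ring size, then the global if_ipsec_ring_size default. Restated as a helper (a sketch; the driver inlines this as conditional expressions):

#include <stdint.h>

static uint64_t
choose_kpipe_ring_size(uint64_t kpipe_size, uint64_t netif_size,
    uint64_t global_default)
{
	if (kpipe_size != 0) {
		return kpipe_size;      /* per-pcb kpipe override */
	}
	if (netif_size != 0) {
		return netif_size;      /* inherit the netif sizing */
	}
	return global_default;          /* if_ipsec_ring_size sysctl */
}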
 
@@ -1993,7 +2274,7 @@ ipsec_register_kernel_pipe_nexus(void)
            nxa,
            &ipsec_kpipe_uuid);
        if (result) {
-               printf("%s: kern_nexus_controller_register_provider failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_controller_register_provider failed: %d\n",
                    __FUNCTION__, result);
                goto done;
        }
@@ -2031,41 +2312,74 @@ ipsec_unregister_kernel_pipe_nexus(void)
        lck_mtx_unlock(&ipsec_lock);
 }
 
-// For use by socket option, not internally
-static errno_t
-ipsec_disable_channel(struct ipsec_pcb *pcb)
-{
-       errno_t result;
-       int enabled;
-       uuid_t uuid;
+/* This structure only holds onto kpipe channels that need to be
+ * freed in the future, but are cleared from the pcb under lock
+ */
+struct ipsec_detached_channels {
+       int count;
+       kern_pbufpool_t pp;
+       uuid_t uuids[IPSEC_IF_MAX_RING_COUNT];
+};
 
-       lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);
+static void
+ipsec_detach_channels(struct ipsec_pcb *pcb, struct ipsec_detached_channels *dc)
+{
+       LCK_RW_ASSERT(&pcb->ipsec_pcb_lock, LCK_RW_TYPE_EXCLUSIVE);
 
-       enabled = pcb->ipsec_kpipe_enabled;
-       uuid_copy(uuid, pcb->ipsec_kpipe_uuid);
+       if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
+               for (int i = 0; i < IPSEC_IF_MAX_RING_COUNT; i++) {
+                       VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid[i]));
+               }
+               dc->count = 0;
+               return;
+       }
 
-       VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid) == !enabled);
+       dc->count = pcb->ipsec_kpipe_count;
 
-       pcb->ipsec_kpipe_enabled = 0;
-       uuid_clear(pcb->ipsec_kpipe_uuid);
+       VERIFY(dc->count >= 0);
+       VERIFY(dc->count <= IPSEC_IF_MAX_RING_COUNT);
 
-       lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
+       for (int i = 0; i < dc->count; i++) {
+               VERIFY(!uuid_is_null(pcb->ipsec_kpipe_uuid[i]));
+               uuid_copy(dc->uuids[i], pcb->ipsec_kpipe_uuid[i]);
+               uuid_clear(pcb->ipsec_kpipe_uuid[i]);
+       }
+       for (int i = dc->count; i < IPSEC_IF_MAX_RING_COUNT; i++) {
+               VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid[i]));
+       }
 
-       if (enabled) {
-               result = kern_nexus_controller_free_provider_instance(ipsec_ncd, uuid);
+       if (dc->count) {
+               VERIFY(pcb->ipsec_kpipe_pp);
        } else {
-               result = ENXIO;
+               VERIFY(!pcb->ipsec_kpipe_pp);
        }
 
-       if (!result) {
-               if (pcb->ipsec_kpipe_pp != NULL) {
-                       kern_pbufpool_destroy(pcb->ipsec_kpipe_pp);
-                       pcb->ipsec_kpipe_pp = NULL;
-               }
-               ipsec_unregister_kernel_pipe_nexus();
+       dc->pp = pcb->ipsec_kpipe_pp;
+
+       pcb->ipsec_kpipe_pp = NULL;
+
+       ipsec_flag_clr(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED);
+}
+
+static void
+ipsec_free_channels(struct ipsec_detached_channels *dc)
+{
+       if (!dc->count) {
+               return;
        }
 
-       return result;
+       for (int i = 0; i < dc->count; i++) {
+               errno_t result;
+               result = kern_nexus_controller_free_provider_instance(ipsec_ncd, dc->uuids[i]);
+               VERIFY(!result);
+       }
+
+       VERIFY(dc->pp);
+       kern_pbufpool_destroy(dc->pp);
+
+       ipsec_unregister_kernel_pipe_nexus();
+
+       memset(dc, 0, sizeof(*dc));
 }
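Usage note: ipsec_detach_channels() and ipsec_free_channels() are a two-phase teardown. The detach phase runs with the PCB lock held exclusive and only unhooks state into the ipsec_detached_channels holder; the free phase does the blocking work after the lock is dropped. Compressed from the disconnect path later in this change (a pattern excerpt, not standalone code):

	struct ipsec_detached_channels dc;

	lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);
	ipsec_detach_channels(pcb, &dc);   /* clear uuids/pp under lock */
	/* ... other teardown under the lock ... */
	lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);

	ipsec_free_channels(&dc);          /* free instances, destroy pool */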
 
 static errno_t
@@ -2081,65 +2395,76 @@ ipsec_enable_channel(struct ipsec_pcb *pcb, struct proc *proc)
                return result;
        }
 
-       result = ipsec_register_kernel_pipe_nexus();
-       if (result) {
-               return result;
-       }
+       VERIFY(pcb->ipsec_kpipe_count);
+       VERIFY(!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED));
 
-       VERIFY(ipsec_ncd);
+       result = ipsec_register_kernel_pipe_nexus(pcb);
 
        lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);
 
-       /* ipsec driver doesn't support channels without a netif */
-       if (!pcb->ipsec_use_netif) {
-               result = EOPNOTSUPP;
+       if (result) {
+               os_log_error(OS_LOG_DEFAULT, "%s: %s failed to register kernel pipe nexus\n",
+                   __func__, pcb->ipsec_if_xname);
                goto done;
        }
 
-       if (pcb->ipsec_kpipe_enabled) {
-               result = EEXIST; // return success instead?
-               goto done;
-       }
+       VERIFY(ipsec_ncd);
 
        bzero(&pp_init, sizeof(pp_init));
        pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION;
-       pp_init.kbi_packets = pcb->ipsec_netif_ring_size * 2;
+       pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE;
+       // Note: We only need as many packets as can be held in the tx and rx rings
+       pp_init.kbi_packets = pcb->ipsec_netif_ring_size * 2 * pcb->ipsec_kpipe_count;
        pp_init.kbi_bufsize = pcb->ipsec_slot_size;
        pp_init.kbi_buf_seg_size = IPSEC_IF_DEFAULT_BUF_SEG_SIZE;
        pp_init.kbi_max_frags = 1;
        pp_init.kbi_flags |= KBIF_QUANTUM;
        (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name),
            "com.apple.kpipe.%s", pcb->ipsec_if_xname);
+       pp_init.kbi_ctx = NULL;
+       pp_init.kbi_ctx_retain = NULL;
+       pp_init.kbi_ctx_release = NULL;
 
-       result = kern_pbufpool_create(&pp_init, &pp_init, &pcb->ipsec_kpipe_pp,
+       result = kern_pbufpool_create(&pp_init, &pcb->ipsec_kpipe_pp,
            NULL);
        if (result != 0) {
-               printf("%s pbufbool create failed, error %d\n", __func__, result);
+               os_log_error(OS_LOG_DEFAULT, "%s: %s pbufpool create failed, error %d\n",
+                   __func__, pcb->ipsec_if_xname, result);
                goto done;
        }
 
-       VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid));
        bzero(&init, sizeof(init));
        init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
        init.nxi_tx_pbufpool = pcb->ipsec_kpipe_pp;
-       result = kern_nexus_controller_alloc_provider_instance(ipsec_ncd,
-           ipsec_kpipe_uuid, pcb, &pcb->ipsec_kpipe_uuid, &init);
-       if (result) {
-               goto done;
-       }
 
-       nexus_port_t port = NEXUS_PORT_KERNEL_PIPE_CLIENT;
-       result = kern_nexus_controller_bind_provider_instance(ipsec_ncd,
-           pcb->ipsec_kpipe_uuid, &port,
-           proc_pid(proc), NULL, NULL, 0, NEXUS_BIND_PID);
-       if (result) {
-               kern_nexus_controller_free_provider_instance(ipsec_ncd,
-                   pcb->ipsec_kpipe_uuid);
-               uuid_clear(pcb->ipsec_kpipe_uuid);
-               goto done;
-       }
+       for (unsigned int i = 0; i < pcb->ipsec_kpipe_count; i++) {
+               VERIFY(uuid_is_null(pcb->ipsec_kpipe_uuid[i]));
+               result = kern_nexus_controller_alloc_provider_instance(ipsec_ncd,
+                   ipsec_kpipe_uuid, pcb, &pcb->ipsec_kpipe_uuid[i], &init);
+
+               if (result == 0) {
+                       nexus_port_t port = NEXUS_PORT_KERNEL_PIPE_CLIENT;
+                       pid_t pid = pcb->ipsec_kpipe_pid;
+                       if (!pid) {
+                               pid = proc_pid(proc);
+                       }
+                       result = kern_nexus_controller_bind_provider_instance(ipsec_ncd,
+                           pcb->ipsec_kpipe_uuid[i], &port,
+                           pid, NULL, NULL, 0, NEXUS_BIND_PID);
+               }
 
-       pcb->ipsec_kpipe_enabled = 1;
+               if (result) {
+                       /* Unwind all of them on error */
+                       for (int j = 0; j < IPSEC_IF_MAX_RING_COUNT; j++) {
+                               if (!uuid_is_null(pcb->ipsec_kpipe_uuid[j])) {
+                                       kern_nexus_controller_free_provider_instance(ipsec_ncd,
+                                           pcb->ipsec_kpipe_uuid[j]);
+                                       uuid_clear(pcb->ipsec_kpipe_uuid[j]);
+                               }
+                       }
+                       goto done;
+               }
+       }
 
 done:
        lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
@@ -2150,6 +2475,8 @@ done:
                        pcb->ipsec_kpipe_pp = NULL;
                }
                ipsec_unregister_kernel_pipe_nexus();
+       } else {
+               ipsec_flag_set(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED);
        }
 
        return result;
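The allocation loop in ipsec_enable_channel() above unwinds every previously bound kpipe instance if any allocation or bind fails, leaving the uuid array all-null. A minimal userland model of that unwind-on-error idiom (malloc stands in for the nexus calls; the caller passes a zeroed array):

#include <stdlib.h>

#define SKETCH_MAX_RINGS 4      /* stands in for IPSEC_IF_MAX_RING_COUNT */

static int
alloc_all(void *slots[SKETCH_MAX_RINGS], unsigned count)
{
	for (unsigned i = 0; i < count; i++) {
		slots[i] = malloc(64);
		if (slots[i] == NULL) {
			/* Unwind all of them on error. */
			for (unsigned j = 0; j < SKETCH_MAX_RINGS; j++) {
				free(slots[j]);
				slots[j] = NULL;
			}
			return -1;
		}
	}
	return 0;
}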
@@ -2165,8 +2492,12 @@ ipsec_free_pcb(struct ipsec_pcb *pcb, bool in_list)
 {
 #if IPSEC_NEXUS
        mbuf_freem_list(pcb->ipsec_input_chain);
+       pcb->ipsec_input_chain_count = 0;
        lck_mtx_destroy(&pcb->ipsec_input_chain_lock, ipsec_lck_grp);
+       lck_mtx_destroy(&pcb->ipsec_kpipe_encrypt_lock, ipsec_lck_grp);
+       lck_mtx_destroy(&pcb->ipsec_kpipe_decrypt_lock, ipsec_lck_grp);
 #endif // IPSEC_NEXUS
+       lck_mtx_destroy(&pcb->ipsec_pcb_data_move_lock, ipsec_lck_grp);
        lck_rw_destroy(&pcb->ipsec_pcb_lock, ipsec_lck_grp);
        if (in_list) {
                lck_mtx_lock(&ipsec_lock);
@@ -2193,14 +2524,18 @@ ipsec_ctl_bind(kern_ctl_ref kctlref,
 #if IPSEC_NEXUS
        pcb->ipsec_use_netif = false;
        pcb->ipsec_slot_size = IPSEC_IF_DEFAULT_SLOT_SIZE;
-       pcb->ipsec_netif_ring_size = IPSEC_IF_DEFAULT_RING_SIZE;
-       pcb->ipsec_tx_fsw_ring_size = IPSEC_IF_DEFAULT_TX_FSW_RING_SIZE;
-       pcb->ipsec_rx_fsw_ring_size = IPSEC_IF_DEFAULT_RX_FSW_RING_SIZE;
+       pcb->ipsec_netif_ring_size = if_ipsec_ring_size;
+       pcb->ipsec_tx_fsw_ring_size = if_ipsec_tx_fsw_ring_size;
+       pcb->ipsec_rx_fsw_ring_size = if_ipsec_rx_fsw_ring_size;
 #endif // IPSEC_NEXUS
 
        lck_rw_init(&pcb->ipsec_pcb_lock, ipsec_lck_grp, ipsec_lck_attr);
+       lck_mtx_init(&pcb->ipsec_pcb_data_move_lock, ipsec_lck_grp, ipsec_lck_attr);
 #if IPSEC_NEXUS
+       pcb->ipsec_input_chain_count = 0;
        lck_mtx_init(&pcb->ipsec_input_chain_lock, ipsec_lck_grp, ipsec_lck_attr);
+       lck_mtx_init(&pcb->ipsec_kpipe_encrypt_lock, ipsec_lck_grp, ipsec_lck_attr);
+       lck_mtx_init(&pcb->ipsec_kpipe_decrypt_lock, ipsec_lck_grp, ipsec_lck_attr);
 #endif // IPSEC_NEXUS
 
        return 0;
@@ -2259,7 +2594,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref,
 
        snprintf(pcb->ipsec_if_xname, sizeof(pcb->ipsec_if_xname), "ipsec%d", pcb->ipsec_unit - 1);
        snprintf(pcb->ipsec_unique_name, sizeof(pcb->ipsec_unique_name), "ipsecid%d", pcb->ipsec_unique_id - 1);
-       printf("ipsec_ctl_connect: creating interface %s (id %s)\n", pcb->ipsec_if_xname, pcb->ipsec_unique_name);
+       os_log(OS_LOG_DEFAULT, "ipsec_ctl_connect: creating interface %s (id %s)\n", pcb->ipsec_if_xname, pcb->ipsec_unique_name);
 
        /* Create the interface */
        bzero(&ipsec_init, sizeof(ipsec_init));
@@ -2279,8 +2614,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref,
        ipsec_init.unit = pcb->ipsec_unit - 1;
        ipsec_init.uniqueid = pcb->ipsec_unique_name;
        ipsec_init.uniqueid_len = strlen(pcb->ipsec_unique_name);
-       ipsec_init.family = ipsec_family;
-       ipsec_init.subfamily = IFNET_SUBFAMILY_IPSEC;
+       ipsec_init.family = IFNET_FAMILY_IPSEC;
        ipsec_init.type = IFT_OTHER;
        ipsec_init.demux = ipsec_demux;
        ipsec_init.add_proto = ipsec_add_proto;
@@ -2290,18 +2624,52 @@ ipsec_ctl_connect(kern_ctl_ref kctlref,
        ipsec_init.detach = ipsec_detached;
 
 #if IPSEC_NEXUS
+       /* We don't support kpipes without a netif */
+       if (pcb->ipsec_kpipe_count && !pcb->ipsec_use_netif) {
+               result = ENOTSUP;
+               os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - kpipe requires netif: failed %d\n", result);
+               ipsec_free_pcb(pcb, true);
+               *unitinfo = NULL;
+               return result;
+       }
+
+       if (if_ipsec_debug != 0) {
+               printf("%s: %s%d use_netif %d kpipe_count %d slot_size %u ring_size %u "
+                   "kpipe_tx_ring_size %u kpipe_rx_ring_size %u\n",
+                   __func__,
+                   ipsec_init.name, ipsec_init.unit,
+                   pcb->ipsec_use_netif,
+                   pcb->ipsec_kpipe_count,
+                   pcb->ipsec_slot_size,
+                   pcb->ipsec_netif_ring_size,
+                   pcb->ipsec_kpipe_tx_ring_size,
+                   pcb->ipsec_kpipe_rx_ring_size);
+       }
        if (pcb->ipsec_use_netif) {
+               if (pcb->ipsec_kpipe_count) {
+                       result = ipsec_enable_channel(pcb, current_proc());
+                       if (result) {
+                               os_log_error(OS_LOG_DEFAULT, "%s: %s failed to enable channels\n",
+                                   __func__, pcb->ipsec_if_xname);
+                               ipsec_free_pcb(pcb, true);
+                               *unitinfo = NULL;
+                               return result;
+                       }
+               }
+
                result = ipsec_nexus_ifattach(pcb, &ipsec_init, &pcb->ipsec_ifp);
                if (result != 0) {
-                       printf("ipsec_ctl_connect - ipsec_nexus_ifattach failed: %d\n", result);
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ipsec_nexus_ifattach failed: %d\n", result);
                        ipsec_free_pcb(pcb, true);
                        *unitinfo = NULL;
                        return result;
                }
 
-               result = ipsec_multistack_attach(pcb);
+               result = ipsec_flowswitch_attach(pcb);
                if (result != 0) {
-                       printf("ipsec_ctl_connect - ipsec_multistack_attach failed: %d\n", result);
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ipsec_flowswitch_attach failed: %d\n", result);
+                       // Do not call ipsec_free_pcb(). We are already attached, and will be freed later
+                       // in ipsec_detached().
                        *unitinfo = NULL;
                        return result;
                }
@@ -2313,7 +2681,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref,
        {
                result = ifnet_allocate_extended(&ipsec_init, &pcb->ipsec_ifp);
                if (result != 0) {
-                       printf("ipsec_ctl_connect - ifnet_allocate failed: %d\n", result);
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ifnet_allocate failed: %d\n", result);
                        ipsec_free_pcb(pcb, true);
                        *unitinfo = NULL;
                        return result;
@@ -2323,7 +2691,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref,
                /* Attach the interface */
                result = ifnet_attach(pcb->ipsec_ifp, NULL);
                if (result != 0) {
-                       printf("ipsec_ctl_connect - ifnet_attach failed: %d\n", result);
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_connect - ifnet_attach failed: %d\n", result);
                        ifnet_release(pcb->ipsec_ifp);
                        ipsec_free_pcb(pcb, true);
                        *unitinfo = NULL;
@@ -2334,6 +2702,16 @@ ipsec_ctl_connect(kern_ctl_ref kctlref,
                bpfattach(pcb->ipsec_ifp, DLT_NULL, 0);
        }
 
+       /*
+        * Mark the data path as ready.
+        * If kpipe nexus is being used then the data path is marked ready only when a kpipe channel is connected.
+        */
+       if (pcb->ipsec_kpipe_count == 0) {
+               lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
+               IPSEC_SET_DATA_PATH_READY(pcb);
+               lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
+       }
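On the data-path gating added here: entry points bracket their work with a begin/end pair (see ipsec_data_move_end() in the RX sync path above), begin fails until IPSEC_SET_DATA_PATH_READY marks the path ready, and ipsec_ctl_disconnect() drains in-flight threads via ipsec_wait_data_move_drain() before teardown. A pthread model of that gate (the struct and function bodies below are assumptions, not the kernel's actual implementation):

#include <pthread.h>
#include <stdbool.h>

struct dm_gate {
	pthread_mutex_t lock;   /* models ipsec_pcb_data_move_lock */
	pthread_cond_t  cv;
	bool            ready;
	int             refs;   /* threads currently in the data path */
};

static bool
data_move_begin(struct dm_gate *g)
{
	pthread_mutex_lock(&g->lock);
	bool ok = g->ready;
	if (ok) {
		g->refs++;
	}
	pthread_mutex_unlock(&g->lock);
	return ok;
}

static void
data_move_end(struct dm_gate *g)
{
	pthread_mutex_lock(&g->lock);
	if (--g->refs == 0) {
		pthread_cond_broadcast(&g->cv);
	}
	pthread_mutex_unlock(&g->lock);
}

static void
data_move_drain(struct dm_gate *g)
{
	pthread_mutex_lock(&g->lock);
	g->ready = false;                /* block new entries */
	while (g->refs > 0) {
		pthread_cond_wait(&g->cv, &g->lock);
	}
	pthread_mutex_unlock(&g->lock);
}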
+
        /* The interface's resources are allocated, mark it as running */
        ifnet_set_flags(pcb->ipsec_ifp, IFF_RUNNING, IFF_RUNNING);
 
@@ -2386,11 +2764,11 @@ ipsec_remove_address(ifnet_t                            interface,
                    ifnet_name(interface), ifnet_unit(interface));
                result = ifaddr_address(address, &ifr.ifr_addr, sizeof(ifr.ifr_addr));
                if (result != 0) {
-                       printf("ipsec_remove_address - ifaddr_address failed: %d", result);
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - ifaddr_address failed: %d", result);
                } else {
                        result = sock_ioctl(pf_socket, SIOCDIFADDR, &ifr);
                        if (result != 0) {
-                               printf("ipsec_remove_address - SIOCDIFADDR failed: %d", result);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - SIOCDIFADDR failed: %d", result);
                        }
                }
        } else if (protocol == PF_INET6) {
@@ -2402,12 +2780,12 @@ ipsec_remove_address(ifnet_t                            interface,
                result = ifaddr_address(address, (struct sockaddr*)&ifr6.ifr_addr,
                    sizeof(ifr6.ifr_addr));
                if (result != 0) {
-                       printf("ipsec_remove_address - ifaddr_address failed (v6): %d",
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - ifaddr_address failed (v6): %d",
                            result);
                } else {
                        result = sock_ioctl(pf_socket, SIOCDIFADDR_IN6, &ifr6);
                        if (result != 0) {
-                               printf("ipsec_remove_address - SIOCDIFADDR_IN6 failed: %d",
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_remove_address - SIOCDIFADDR_IN6 failed: %d",
                                    result);
                        }
                }
@@ -2424,7 +2802,7 @@ ipsec_cleanup_family(ifnet_t                            interface,
        int                     i;
 
        if (protocol != PF_INET && protocol != PF_INET6) {
-               printf("ipsec_cleanup_family - invalid protocol family %d\n", protocol);
+               os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - invalid protocol family %d\n", protocol);
                return;
        }
 
@@ -2432,7 +2810,7 @@ ipsec_cleanup_family(ifnet_t                            interface,
        result = sock_socket(protocol, SOCK_DGRAM, 0, NULL, NULL, &pf_socket);
        if (result != 0) {
                if (result != EAFNOSUPPORT) {
-                       printf("ipsec_cleanup_family - failed to create %s socket: %d\n",
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - failed to create %s socket: %d\n",
                            protocol == PF_INET ? "IP" : "IPv6", result);
                }
                goto cleanup;
@@ -2447,7 +2825,7 @@ ipsec_cleanup_family(ifnet_t                            interface,
                goto cleanup;
        } else if (result != EBUSY) {
                /* Uh, not really sure what happened here... */
-               printf("ipsec_cleanup_family - ipsec_detach_ip failed: %d\n", result);
+               os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - ipsec_detach_ip failed: %d\n", result);
                goto cleanup;
        }
 
@@ -2457,7 +2835,7 @@ ipsec_cleanup_family(ifnet_t                            interface,
         */
        result = ifnet_get_address_list_family(interface, &addresses, protocol);
        if (result != 0) {
-               printf("fnet_get_address_list_family(%s%d, 0xblah, %s) - failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "ifnet_get_address_list_family(%s%d, 0xblah, %s) - failed: %d\n",
                    ifnet_name(interface), ifnet_unit(interface),
                    protocol == PF_INET ? "PF_INET" : "PF_INET6", result);
                goto cleanup;
@@ -2474,7 +2852,7 @@ ipsec_cleanup_family(ifnet_t                            interface,
         */
        result = ipsec_detach_ip(interface, protocol, pf_socket);
        if (result != 0 && result != ENXIO) {
-               printf("ipsec_cleanup_family - ipsec_detach_ip failed: %d\n", result);
+               os_log_error(OS_LOG_DEFAULT, "ipsec_cleanup_family - ipsec_detach_ip failed: %d\n", result);
        }
 
 cleanup:
@@ -2500,6 +2878,9 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref      kctlref,
                return EINVAL;
        }
 
+       /* Wait until all threads in the data paths are done. */
+       ipsec_wait_data_move_drain(pcb);
+
 #if IPSEC_NEXUS
        // Tell the nexus to stop all rings
        if (pcb->ipsec_netif_nexus != NULL) {
@@ -2510,10 +2891,13 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref      kctlref,
        lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);
 
 #if IPSEC_NEXUS
-       uuid_t kpipe_uuid;
-       uuid_copy(kpipe_uuid, pcb->ipsec_kpipe_uuid);
-       uuid_clear(pcb->ipsec_kpipe_uuid);
-       pcb->ipsec_kpipe_enabled = FALSE;
+       if (if_ipsec_debug != 0) {
+               printf("ipsec_ctl_disconnect: detaching interface %s (id %s)\n",
+                   pcb->ipsec_if_xname, pcb->ipsec_unique_name);
+       }
+
+       struct ipsec_detached_channels dc;
+       ipsec_detach_channels(pcb, &dc);
 #endif // IPSEC_NEXUS
 
        pcb->ipsec_ctlref = NULL;
@@ -2547,15 +2931,8 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref      kctlref,
 
                        lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
 
-                       if (!uuid_is_null(kpipe_uuid)) {
-                               if (kern_nexus_controller_free_provider_instance(ipsec_ncd, kpipe_uuid) == 0) {
-                                       if (pcb->ipsec_kpipe_pp != NULL) {
-                                               kern_pbufpool_destroy(pcb->ipsec_kpipe_pp);
-                                               pcb->ipsec_kpipe_pp = NULL;
-                                       }
-                                       ipsec_unregister_kernel_pipe_nexus();
-                               }
-                       }
+                       ipsec_free_channels(&dc);
+
                        ipsec_nexus_detach(pcb);
 
                        /* Decrement refcnt to finish detaching and freeing */
@@ -2566,15 +2943,7 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref      kctlref,
                        lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
 
 #if IPSEC_NEXUS
-                       if (!uuid_is_null(kpipe_uuid)) {
-                               if (kern_nexus_controller_free_provider_instance(ipsec_ncd, kpipe_uuid) == 0) {
-                                       if (pcb->ipsec_kpipe_pp != NULL) {
-                                               kern_pbufpool_destroy(pcb->ipsec_kpipe_pp);
-                                               pcb->ipsec_kpipe_pp = NULL;
-                                       }
-                                       ipsec_unregister_kernel_pipe_nexus();
-                               }
-                       }
+                       ipsec_free_channels(&dc);
 #endif // IPSEC_NEXUS
 
                        /*
@@ -2594,7 +2963,7 @@ ipsec_ctl_disconnect(__unused kern_ctl_ref      kctlref,
                         * ifnet_release().
                         */
                        if ((result = ifnet_detach(ifp)) != 0) {
-                               printf("ipsec_ctl_disconnect - ifnet_detach failed: %d\n", result);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_ctl_disconnect - ifnet_detach failed: %d\n", result);
                        }
                }
        } else {
@@ -2642,15 +3011,16 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
        }
 
        switch (opt) {
-       case IPSEC_OPT_FLAGS:
+       case IPSEC_OPT_FLAGS: {
                if (len != sizeof(u_int32_t)) {
                        result = EMSGSIZE;
                } else {
-                       pcb->ipsec_flags = *(u_int32_t *)data;
+                       pcb->ipsec_external_flags = *(u_int32_t *)data;
                }
                break;
+       }
 
-       case IPSEC_OPT_EXT_IFDATA_STATS:
+       case IPSEC_OPT_EXT_IFDATA_STATS: {
                if (len != sizeof(int)) {
                        result = EMSGSIZE;
                        break;
@@ -2662,6 +3032,7 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
                }
                pcb->ipsec_ext_ifdata_stats = (*(int *)data) ? 1 : 0;
                break;
+       }
 
        case IPSEC_OPT_INC_IFDATA_STATS_IN:
        case IPSEC_OPT_INC_IFDATA_STATS_OUT: {
@@ -2691,8 +3062,8 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
        }
 
        case IPSEC_OPT_SET_DELEGATE_INTERFACE: {
-               ifnet_t         del_ifp = NULL;
-               char            name[IFNAMSIZ];
+               ifnet_t del_ifp = NULL;
+               char name[IFNAMSIZ];
 
                if (len > IFNAMSIZ - 1) {
                        result = EMSGSIZE;
@@ -2703,13 +3074,13 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
                        result = EINVAL;
                        break;
                }
-               if (len != 0) {           /* if len==0, del_ifp will be NULL causing the delegate to be removed */
+               if (len != 0) {                   /* if len==0, del_ifp will be NULL causing the delegate to be removed */
                        bcopy(data, name, len);
                        name[len] = 0;
                        result = ifnet_find_by_name(name, &del_ifp);
                }
                if (result == 0) {
-                       printf("%s IPSEC_OPT_SET_DELEGATE_INTERFACE %s to %s\n",
+                       os_log_error(OS_LOG_DEFAULT, "%s IPSEC_OPT_SET_DELEGATE_INTERFACE %s to %s\n",
                            __func__, pcb->ipsec_ifp->if_xname,
                            del_ifp ? del_ifp->if_xname : "NULL");
 
@@ -2737,7 +3108,7 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
                } else {
                        pcb->ipsec_output_service_class = output_service_class;
                }
-               printf("%s IPSEC_OPT_OUTPUT_TRAFFIC_CLASS %s svc %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s IPSEC_OPT_OUTPUT_TRAFFIC_CLASS %s svc %d\n",
                    __func__, pcb->ipsec_ifp->if_xname,
                    pcb->ipsec_output_service_class);
                break;
@@ -2749,16 +3120,36 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
                        result = EMSGSIZE;
                        break;
                }
-               if (pcb->ipsec_ifp == NULL) {
-                       // Only can set after connecting
+               if (pcb->ipsec_ifp != NULL) {
+                       // Only can set before connecting
                        result = EINVAL;
                        break;
                }
-               if (*(int *)data) {
-                       result = ipsec_enable_channel(pcb, current_proc());
-               } else {
-                       result = ipsec_disable_channel(pcb);
+               if ((*(int *)data) != 0 &&
+                   (*(int *)data) != 1 &&
+                   (*(int *)data) != IPSEC_IF_WMM_RING_COUNT) {
+                       result = EINVAL;
+                       break;
                }
+               lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);
+               pcb->ipsec_kpipe_count = *(int *)data;
+               lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
+               break;
+       }
+
+       case IPSEC_OPT_CHANNEL_BIND_PID: {
+               if (len != sizeof(pid_t)) {
+                       result = EMSGSIZE;
+                       break;
+               }
+               if (pcb->ipsec_ifp != NULL) {
+                       // Only can set before connecting
+                       result = EINVAL;
+                       break;
+               }
+               lck_rw_lock_exclusive(&pcb->ipsec_pcb_lock);
+               pcb->ipsec_kpipe_pid = *(pid_t *)data;
+               lck_rw_unlock_exclusive(&pcb->ipsec_pcb_lock);
                break;
        }
 
@@ -2772,21 +3163,27 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
                        result = EINVAL;
                        break;
                }
-               if (!if_is_netagent_enabled()) {
+               if (!if_is_fsw_transport_netagent_enabled()) {
                        result = ENOTSUP;
                        break;
                }
-               if (uuid_is_null(pcb->ipsec_nx.ms_agent)) {
+               if (uuid_is_null(pcb->ipsec_nx.fsw_agent)) {
                        result = ENOENT;
                        break;
                }
 
+               uint32_t flags = netagent_get_flags(pcb->ipsec_nx.fsw_agent);
+
                if (*(int *)data) {
-                       if_add_netagent(pcb->ipsec_ifp, pcb->ipsec_nx.ms_agent);
+                       flags |= (NETAGENT_FLAG_NEXUS_PROVIDER |
+                           NETAGENT_FLAG_NEXUS_LISTENER);
+                       result = netagent_set_flags(pcb->ipsec_nx.fsw_agent, flags);
                        pcb->ipsec_needs_netagent = true;
                } else {
                        pcb->ipsec_needs_netagent = false;
-                       if_delete_netagent(pcb->ipsec_ifp, pcb->ipsec_nx.ms_agent);
+                       flags &= ~(NETAGENT_FLAG_NEXUS_PROVIDER |
+                           NETAGENT_FLAG_NEXUS_LISTENER);
+                       result = netagent_set_flags(pcb->ipsec_nx.fsw_agent, flags);
                }
                break;
        }
@@ -2801,7 +3198,6 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
                        pcb->ipsec_frag_size_set = FALSE;
                        pcb->ipsec_input_frag_size = 0;
                } else {
-                       printf("SET FRAG SIZE TO %u\n", input_frag_size);
                        pcb->ipsec_frag_size_set = TRUE;
                        pcb->ipsec_input_frag_size = input_frag_size;
                }
@@ -2838,6 +3234,9 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
                        return EINVAL;
                }
                pcb->ipsec_slot_size = slot_size;
+               if (if_ipsec_debug != 0) {
+                       printf("%s: IPSEC_OPT_SLOT_SIZE %u\n", __func__, slot_size);
+               }
                break;
        }
        case IPSEC_OPT_NETIF_RING_SIZE: {
@@ -2856,6 +3255,9 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
                        return EINVAL;
                }
                pcb->ipsec_netif_ring_size = ring_size;
+               if (if_ipsec_debug != 0) {
+                       printf("%s: IPSEC_OPT_NETIF_RING_SIZE %u\n", __func__, ring_size);
+               }
                break;
        }
        case IPSEC_OPT_TX_FSW_RING_SIZE: {
@@ -2874,6 +3276,9 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
                        return EINVAL;
                }
                pcb->ipsec_tx_fsw_ring_size = ring_size;
+               if (if_ipsec_debug != 0) {
+                       printf("%s: IPSEC_OPT_TX_FSW_RING_SIZE %u\n", __func__, ring_size);
+               }
                break;
        }
        case IPSEC_OPT_RX_FSW_RING_SIZE: {
@@ -2892,15 +3297,61 @@ ipsec_ctl_setopt(__unused kern_ctl_ref  kctlref,
                        return EINVAL;
                }
                pcb->ipsec_rx_fsw_ring_size = ring_size;
+               if (if_ipsec_debug != 0) {
+                       printf("%s: IPSEC_OPT_RX_FSW_RING_SIZE %u\n", __func__, ring_size);
+               }
+               break;
+       }
+       case IPSEC_OPT_KPIPE_TX_RING_SIZE: {
+               if (len != sizeof(u_int32_t)) {
+                       result = EMSGSIZE;
+                       break;
+               }
+               if (pcb->ipsec_ifp != NULL) {
+                       // Only can set before connecting
+                       result = EINVAL;
+                       break;
+               }
+               u_int32_t ring_size = *(u_int32_t *)data;
+               if (ring_size < IPSEC_IF_MIN_RING_SIZE ||
+                   ring_size > IPSEC_IF_MAX_RING_SIZE) {
+                       return EINVAL;
+               }
+               pcb->ipsec_kpipe_tx_ring_size = ring_size;
+               if (if_ipsec_debug != 0) {
+                       printf("%s: IPSEC_OPT_KPIPE_TX_RING_SIZE %u\n", __func__, ring_size);
+               }
+               break;
+       }
+       case IPSEC_OPT_KPIPE_RX_RING_SIZE: {
+               if (len != sizeof(u_int32_t)) {
+                       result = EMSGSIZE;
+                       break;
+               }
+               if (pcb->ipsec_ifp != NULL) {
+                       // Only can set before connecting
+                       result = EINVAL;
+                       break;
+               }
+               u_int32_t ring_size = *(u_int32_t *)data;
+               if (ring_size < IPSEC_IF_MIN_RING_SIZE ||
+                   ring_size > IPSEC_IF_MAX_RING_SIZE) {
+                       return EINVAL;
+               }
+               pcb->ipsec_kpipe_rx_ring_size = ring_size;
+               if (if_ipsec_debug != 0) {
+                       printf("%s: IPSEC_OPT_KPIPE_RX_RING_SIZE %u\n", __func__, ring_size);
+               }
                break;
        }
 
 #endif // IPSEC_NEXUS
 
-       default:
+       default: {
                result = ENOPROTOOPT;
                break;
        }
+       }
 
        return result;
 }
@@ -2921,7 +3372,7 @@ ipsec_ctl_getopt(__unused kern_ctl_ref kctlref,
                if (*len != sizeof(u_int32_t)) {
                        result = EMSGSIZE;
                } else {
-                       *(u_int32_t *)data = pcb->ipsec_flags;
+                       *(u_int32_t *)data = pcb->ipsec_external_flags;
                }
                break;
        }
@@ -2965,7 +3416,18 @@ ipsec_ctl_getopt(__unused kern_ctl_ref kctlref,
                        result = EMSGSIZE;
                } else {
                        lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
-                       *(int *)data = pcb->ipsec_kpipe_enabled;
+                       *(int *)data = pcb->ipsec_kpipe_count;
+                       lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+               }
+               break;
+       }
+
+       case IPSEC_OPT_CHANNEL_BIND_PID: {
+               if (*len != sizeof(pid_t)) {
+                       result = EMSGSIZE;
+               } else {
+                       lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
+                       *(pid_t *)data = pcb->ipsec_kpipe_pid;
                        lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
                }
                break;
@@ -2975,7 +3437,7 @@ ipsec_ctl_getopt(__unused kern_ctl_ref kctlref,
                if (*len != sizeof(int)) {
                        result = EMSGSIZE;
                } else {
-                       *(int *)data = if_check_netagent(pcb->ipsec_ifp, pcb->ipsec_nx.ms_agent);
+                       *(int *)data = if_check_netagent(pcb->ipsec_ifp, pcb->ipsec_nx.fsw_agent);
                }
                break;
        }
@@ -2993,12 +3455,14 @@ ipsec_ctl_getopt(__unused kern_ctl_ref kctlref,
 
        case IPSEC_OPT_GET_CHANNEL_UUID: {
                lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
-               if (uuid_is_null(pcb->ipsec_kpipe_uuid)) {
+               if (!ipsec_flag_isset(pcb, IPSEC_FLAGS_KPIPE_ALLOCATED)) {
                        result = ENXIO;
-               } else if (*len != sizeof(uuid_t)) {
+               } else if (*len != sizeof(uuid_t) * pcb->ipsec_kpipe_count) {
                        result = EMSGSIZE;
                } else {
-                       uuid_copy(data, pcb->ipsec_kpipe_uuid);
+                       for (unsigned int i = 0; i < pcb->ipsec_kpipe_count; i++) {
+                               uuid_copy(((uuid_t *)data)[i], pcb->ipsec_kpipe_uuid[i]);
+                       }
                }
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
                break;
@@ -3044,6 +3508,22 @@ ipsec_ctl_getopt(__unused kern_ctl_ref kctlref,
                }
                break;
        }
+       case IPSEC_OPT_KPIPE_TX_RING_SIZE: {
+               if (*len != sizeof(u_int32_t)) {
+                       result = EMSGSIZE;
+               } else {
+                       *(u_int32_t *)data = pcb->ipsec_kpipe_tx_ring_size;
+               }
+               break;
+       }
+       case IPSEC_OPT_KPIPE_RX_RING_SIZE: {
+               if (*len != sizeof(u_int32_t)) {
+                       result = EMSGSIZE;
+               } else {
+                       *(u_int32_t *)data = pcb->ipsec_kpipe_rx_ring_size;
+               }
+               break;
+       }
 
 #endif // IPSEC_NEXUS
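
With multiple kernel pipes per interface, IPSEC_OPT_GET_CHANNEL_UUID now returns an array: the getsockopt length must equal sizeof(uuid_t) times the configured kpipe count, or the call fails with EMSGSIZE (ENXIO if no kpipe was ever allocated). A minimal userspace sketch of the two-step query, assuming fd is an already-connected ipsec kernel-control socket and that the private if_ipsec.h constants are visible to the caller; this is an illustration, not a canonical client:

    #include <sys/socket.h>
    #include <sys/sys_domain.h>     /* SYSPROTO_CONTROL */
    #include <uuid/uuid.h>
    #include <stdio.h>

    /* Sketch only: read back the kernel-pipe UUIDs. Assumes fd is a
     * connected com.apple.net.ipsec_control socket. */
    static int
    fetch_kpipe_uuids(int fd)
    {
        int count = 0;
        socklen_t len = sizeof(count);

        /* IPSEC_OPT_ENABLE_CHANNEL now reads back ipsec_kpipe_count. */
        if (getsockopt(fd, SYSPROTO_CONTROL, IPSEC_OPT_ENABLE_CHANNEL,
            &count, &len) != 0 || count <= 0 || count > 16) {
            return -1;
        }

        /* count is 0, 1, or IPSEC_IF_WMM_RING_COUNT (kernel-private). */
        uuid_t uuids[16];
        len = (socklen_t)(sizeof(uuid_t) * (unsigned int)count);    /* must match exactly */
        if (getsockopt(fd, SYSPROTO_CONTROL, IPSEC_OPT_GET_CHANNEL_UUID,
            uuids, &len) != 0) {
            return -1;
        }

        for (int i = 0; i < count; i++) {
            uuid_string_t s;
            uuid_unparse(uuids[i], s);
            printf("kpipe %d: %s\n", i, s);
        }
        return 0;
    }
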
 
@@ -3112,7 +3592,7 @@ ipsec_output(ifnet_t interface,
                data = ipsec_state.m;
                if (error || data == NULL) {
                        if (error) {
-                               printf("ipsec_output: ipsec4_output error %d.\n", error);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_output: ipsec4_output error %d.\n", error);
                        }
                        goto ipsec_output_err;
                }
@@ -3171,7 +3651,7 @@ ipsec_output(ifnet_t interface,
 
                data = ipsec6_splithdr(data);
                if (data == NULL) {
-                       printf("ipsec_output: ipsec6_splithdr returned NULL\n");
+                       os_log_error(OS_LOG_DEFAULT, "ipsec_output: ipsec6_splithdr returned NULL\n");
                        goto ipsec_output_err;
                }
 
@@ -3189,7 +3669,7 @@ ipsec_output(ifnet_t interface,
                data = ipsec_state.m;
                if (error || data == NULL) {
                        if (error) {
-                               printf("ipsec_output: ipsec6_output error %d\n", error);
+                               os_log_error(OS_LOG_DEFAULT, "ipsec_output: ipsec6_output error %d\n", error);
                        }
                        goto ipsec_output_err;
                }
@@ -3232,7 +3712,7 @@ ipsec_output(ifnet_t interface,
                goto done;
        }
        default: {
-               printf("ipsec_output: Received unknown packet version %d.\n", ip_version);
+               os_log_error(OS_LOG_DEFAULT, "ipsec_output: Received unknown packet version %d.\n", ip_version);
                error = EINVAL;
                goto ipsec_output_err;
        }
@@ -3357,6 +3837,30 @@ ipsec_ioctl(ifnet_t interface,
                /* ifioctl() takes care of it */
                break;
 
+       case SIOCSIFSUBFAMILY: {
+               uint32_t subfamily;
+
+               subfamily = ((struct ifreq*)data)->ifr_type.ift_subfamily;
+               switch (subfamily) {
+               case IFRTYPE_SUBFAMILY_BLUETOOTH:
+                       interface->if_subfamily = IFNET_SUBFAMILY_BLUETOOTH;
+                       break;
+               case IFRTYPE_SUBFAMILY_WIFI:
+                       interface->if_subfamily = IFNET_SUBFAMILY_WIFI;
+                       break;
+               case IFRTYPE_SUBFAMILY_QUICKRELAY:
+                       interface->if_subfamily = IFNET_SUBFAMILY_QUICKRELAY;
+                       break;
+               case IFRTYPE_SUBFAMILY_DEFAULT:
+                       interface->if_subfamily = IFNET_SUBFAMILY_DEFAULT;
+                       break;
+               default:
+                       result = EINVAL;
+                       break;
+               }
+               break;
+       }
+
        default:
                result = EOPNOTSUPP;
        }
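
The new SIOCSIFSUBFAMILY case accepts exactly the four IFRTYPE_SUBFAMILY_* values listed and returns EINVAL for anything else. A hedged sketch of a caller tagging an interface as Wi-Fi follows; ifr_type.ift_subfamily is the same private ifreq field the handler reads, and any privilege requirements are outside the scope of the sketch:

    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>

    /* Sketch: set an interface's subfamily, mirroring the handler above. */
    static int
    set_subfamily_wifi(const char *ifname)      /* e.g. "ipsec0" */
    {
        int s = socket(AF_INET, SOCK_DGRAM, 0);
        if (s < 0) {
            return -1;
        }

        struct ifreq ifr;
        memset(&ifr, 0, sizeof(ifr));
        strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
        ifr.ifr_type.ift_subfamily = IFRTYPE_SUBFAMILY_WIFI;    /* same field the kernel reads */

        int rc = ioctl(s, SIOCSIFSUBFAMILY, &ifr);
        close(s);
        return rc;
    }
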
@@ -3368,6 +3872,7 @@ static void
 ipsec_detached(ifnet_t interface)
 {
        struct ipsec_pcb *pcb = ifnet_softc(interface);
+
        (void)ifnet_release(interface);
        ipsec_free_pcb(pcb, true);
 }
@@ -3435,7 +3940,7 @@ ipsec_attach_proto(ifnet_t                              interface,
 
        result = ifnet_attach_protocol(interface, protocol, &proto);
        if (result != 0 && result != EEXIST) {
-               printf("ipsec_attach_inet - ifnet_attach_protocol %d failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "ipsec_attach_inet - ifnet_attach_protocol %d failed: %d\n",
                    protocol, result);
        }
 
@@ -3450,28 +3955,45 @@ ipsec_inject_inbound_packet(ifnet_t     interface,
        struct ipsec_pcb *pcb = ifnet_softc(interface);
 
        if (pcb->ipsec_use_netif) {
+               if (!ipsec_data_move_begin(pcb)) {
+                       os_log_info(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__,
+                           if_name(pcb->ipsec_ifp));
+                       return ENXIO;
+               }
+
                lck_rw_lock_shared(&pcb->ipsec_pcb_lock);
 
                lck_mtx_lock(&pcb->ipsec_input_chain_lock);
+
+               if (pcb->ipsec_input_chain_count > (u_int32_t)if_ipsec_max_pending_input) {
+                       lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
+                       lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
+                       ipsec_data_move_end(pcb);
+                       return ENOSPC;
+               }
+
                if (pcb->ipsec_input_chain != NULL) {
                        pcb->ipsec_input_chain_last->m_nextpkt = packet;
                } else {
                        pcb->ipsec_input_chain = packet;
                }
+               pcb->ipsec_input_chain_count++;
                while (packet->m_nextpkt) {
                        VERIFY(packet != packet->m_nextpkt);
                        packet = packet->m_nextpkt;
+                       pcb->ipsec_input_chain_count++;
                }
                pcb->ipsec_input_chain_last = packet;
                lck_mtx_unlock(&pcb->ipsec_input_chain_lock);
 
-               kern_channel_ring_t rx_ring = pcb->ipsec_netif_rxring;
+               kern_channel_ring_t rx_ring = pcb->ipsec_netif_rxring[0];
                lck_rw_unlock_shared(&pcb->ipsec_pcb_lock);
 
                if (rx_ring != NULL) {
                        kern_channel_notify(rx_ring, 0);
                }
 
+               ipsec_data_move_end(pcb);
                return 0;
        } else
 #endif // IPSEC_NEXUS
@@ -3551,3 +4073,63 @@ ipsec_set_ip6oa_for_interface(ifnet_t interface, struct ip6_out_args *ip6oa)
                ip6oa->ip6oa_sotc = SO_TC_VO;
        }
 }
+
+static boolean_t
+ipsec_data_move_begin(struct ipsec_pcb *pcb)
+{
+       boolean_t ret = 0;
+
+       lck_mtx_lock_spin(&pcb->ipsec_pcb_data_move_lock);
+       if ((ret = IPSEC_IS_DATA_PATH_READY(pcb))) {
+               pcb->ipsec_pcb_data_move++;
+       }
+       lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
+
+       return ret;
+}
+
+static void
+ipsec_data_move_end(struct ipsec_pcb *pcb)
+{
+       lck_mtx_lock_spin(&pcb->ipsec_pcb_data_move_lock);
+       VERIFY(pcb->ipsec_pcb_data_move > 0);
+       /*
+        * If there are no more threads moving data, wake up any
+        * drainers blocked waiting for this.
+        */
+       if (--pcb->ipsec_pcb_data_move == 0 && pcb->ipsec_pcb_drainers > 0) {
+               wakeup(&(pcb->ipsec_pcb_data_move));
+       }
+       lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
+}
+
+static void
+ipsec_data_move_drain(struct ipsec_pcb *pcb)
+{
+       lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
+       /* data path must already be marked as not ready */
+       VERIFY(!IPSEC_IS_DATA_PATH_READY(pcb));
+       pcb->ipsec_pcb_drainers++;
+       while (pcb->ipsec_pcb_data_move != 0) {
+               (void)msleep(&(pcb->ipsec_pcb_data_move), &pcb->ipsec_pcb_data_move_lock,
+                   (PZERO - 1), __func__, NULL);
+       }
+       VERIFY(!IPSEC_IS_DATA_PATH_READY(pcb));
+       VERIFY(pcb->ipsec_pcb_drainers > 0);
+       pcb->ipsec_pcb_drainers--;
+       lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
+}
+
+static void
+ipsec_wait_data_move_drain(struct ipsec_pcb *pcb)
+{
+       /*
+        * Mark the data path as not usable.
+        */
+       lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock);
+       IPSEC_CLR_DATA_PATH_READY(pcb);
+       lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
+
+       /* Wait until all threads in the data paths are done. */
+       ipsec_data_move_drain(pcb);
+}
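
Taken together, the four functions above form a reader-counter drain: every data-path entry point brackets its work with ipsec_data_move_begin() and ipsec_data_move_end(), while teardown clears the ready flag and sleeps in ipsec_data_move_drain() until the in-flight count reaches zero, which is what ipsec_ctl_disconnect now does through ipsec_wait_data_move_drain(). A reduced sketch of the protocol; the worker body is a placeholder, not code from the patch:

    /* Kernel-side sketch; uses the static helpers defined above. */

    static errno_t
    data_path_entry(struct ipsec_pcb *pcb)
    {
        if (!ipsec_data_move_begin(pcb)) {
            /* Data path already marked not-ready; refuse new work. */
            return ENXIO;
        }
        /* ... move packets; the pcb cannot be drained while we hold a count ... */
        ipsec_data_move_end(pcb);       /* last thread out wakes any drainer */
        return 0;
    }

    static void
    teardown(struct ipsec_pcb *pcb)
    {
        /* Clears IPSEC_IS_DATA_PATH_READY, then blocks until
         * ipsec_pcb_data_move drops to zero. Runs once per pcb. */
        ipsec_wait_data_move_drain(pcb);
        /* ... now safe to detach channels and free the pcb ... */
    }
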
index 3c0fcbd2b5c11d665b0f7e063d075585b4ce5959..39e4f35d118c8e523ad266374647d3eb1a0d306b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -40,7 +40,6 @@ errno_t ipsec_register_control(void);
 
 /* Helpers */
 int ipsec_interface_isvalid(ifnet_t interface);
-boolean_t ipsec_interface_needs_netagent(ifnet_t interface);
 
 errno_t ipsec_inject_inbound_packet(ifnet_t     interface, mbuf_t packet);
 
@@ -61,23 +60,27 @@ void ipsec_set_ip6oa_for_interface(ifnet_t interface, struct ip6_out_args *ip6oa
 /*
  * Socket option names to manage ipsec
  */
-#define IPSEC_OPT_FLAGS                                                 1
-#define IPSEC_OPT_IFNAME                                                2
-#define IPSEC_OPT_EXT_IFDATA_STATS                              3       /* get|set (type int) */
+#define IPSEC_OPT_FLAGS                                 1
+#define IPSEC_OPT_IFNAME                                2
+#define IPSEC_OPT_EXT_IFDATA_STATS                      3       /* get|set (type int) */
 #define IPSEC_OPT_INC_IFDATA_STATS_IN                   4       /* set to increment stat counters (type struct ipsec_stats_param) */
 #define IPSEC_OPT_INC_IFDATA_STATS_OUT                  5       /* set to increment stat counters (type struct ipsec_stats_param) */
 #define IPSEC_OPT_SET_DELEGATE_INTERFACE                6       /* set the delegate interface (char[]) */
 #define IPSEC_OPT_OUTPUT_TRAFFIC_CLASS                  7       /* set the traffic class for packets leaving the interface, see sys/socket.h */
-#define IPSEC_OPT_ENABLE_CHANNEL                                8       /* enable a kernel pipe nexus that allows the owner to open a channel to act as a driver */
-#define IPSEC_OPT_GET_CHANNEL_UUID                              9       /* get the uuid of the kernel pipe nexus instance */
-#define IPSEC_OPT_ENABLE_FLOWSWITCH                             10      /* enable a flowswitch nexus that clients can use */
-#define IPSEC_OPT_INPUT_FRAG_SIZE                               11      /* set the maximum size of input packets before fragmenting as a uint32_t */
-
-#define IPSEC_OPT_ENABLE_NETIF                                  12              /* Must be set before connecting */
-#define IPSEC_OPT_SLOT_SIZE                                             13              /* Must be set before connecting */
-#define IPSEC_OPT_NETIF_RING_SIZE                               14              /* Must be set before connecting */
-#define IPSEC_OPT_TX_FSW_RING_SIZE                              15              /* Must be set before connecting */
-#define IPSEC_OPT_RX_FSW_RING_SIZE                              16              /* Must be set before connecting */
+#define IPSEC_OPT_ENABLE_CHANNEL                        8       /* enable a kernel pipe nexus that allows the owner to open a channel to act as a driver,
+                                                                *  Must be set before connecting */
+#define IPSEC_OPT_GET_CHANNEL_UUID                      9       /* get the uuid of the kernel pipe nexus instance */
+#define IPSEC_OPT_ENABLE_FLOWSWITCH                     10      /* enable a flowswitch nexus that clients can use */
+#define IPSEC_OPT_INPUT_FRAG_SIZE                       11      /* set the maximum size of input packets before fragmenting as a uint32_t */
+
+#define IPSEC_OPT_ENABLE_NETIF                          12      /* Must be set before connecting */
+#define IPSEC_OPT_SLOT_SIZE                             13      /* Must be set before connecting */
+#define IPSEC_OPT_NETIF_RING_SIZE                       14      /* Must be set before connecting */
+#define IPSEC_OPT_TX_FSW_RING_SIZE                      15      /* Must be set before connecting */
+#define IPSEC_OPT_RX_FSW_RING_SIZE                      16      /* Must be set before connecting */
+#define IPSEC_OPT_CHANNEL_BIND_PID                      17      /* Must be set before connecting */
+#define IPSEC_OPT_KPIPE_TX_RING_SIZE                    18      /* Must be set before connecting */
+#define IPSEC_OPT_KPIPE_RX_RING_SIZE                    19      /* Must be set before connecting */
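
Options 17 through 19 join the existing write-once group: ipsec_ctl_setopt rejects them with EINVAL once ipsec_ifp exists. A hedged sketch of the intended ordering on a control socket, assuming fd targets the ipsec kernel control (IPSEC_CONTROL_NAME) and sc is its filled-in sockaddr_ctl; error handling is elided and the ring sizes are arbitrary values inside the IPSEC_IF_MIN/MAX_RING_SIZE bounds:

    #include <sys/socket.h>
    #include <sys/sys_domain.h>
    #include <sys/kern_control.h>
    #include <unistd.h>

    /* Sketch: configure the write-once kpipe options, then connect. */
    static int
    configure_and_connect(int fd, const struct sockaddr_ctl *sc)
    {
        int kpipe_count = 1;            /* 0, 1, or IPSEC_IF_WMM_RING_COUNT */
        pid_t pid = getpid();           /* bind the channel to this process */
        u_int32_t tx_ring = 64, rx_ring = 64;

        setsockopt(fd, SYSPROTO_CONTROL, IPSEC_OPT_ENABLE_CHANNEL, &kpipe_count, sizeof(kpipe_count));
        setsockopt(fd, SYSPROTO_CONTROL, IPSEC_OPT_CHANNEL_BIND_PID, &pid, sizeof(pid));
        setsockopt(fd, SYSPROTO_CONTROL, IPSEC_OPT_KPIPE_TX_RING_SIZE, &tx_ring, sizeof(tx_ring));
        setsockopt(fd, SYSPROTO_CONTROL, IPSEC_OPT_KPIPE_RX_RING_SIZE, &rx_ring, sizeof(rx_ring));

        /* Only after this point does ipsec_ifp exist; the options
         * above would now fail with EINVAL. */
        return connect(fd, (const struct sockaddr *)sc, sizeof(*sc));
    }
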
 
 /*
  * ipsec stats parameter structure
index 3f9e3e89bb44133ef2ec66197793e55c9018627e..b93387b156f8cb8860e52ab9452adb125753ad2f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2018-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -135,7 +135,7 @@ if_low_power_evhdlr_init(void)
 {
        eventhandler_lists_ctxt_init(&if_low_power_evhdlr_ctx);
 
-       (void) EVENTHANDLER_REGISTER(&if_low_power_evhdlr_ctx,
+       (void)EVENTHANDLER_REGISTER(&if_low_power_evhdlr_ctx,
            if_low_power_event,
            if_low_power_evhdlr_callback,
            eventhandler_entry_dummy_arg,
index 55fd50f64f395ab15c0dcf099a8a13248e24d3a1..f1ee7273ad4c8381f8b2b4c03f7621ad6e54a0db 100644 (file)
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
-/* $NetBSD: if_media.h,v 1.3 1997/03/26 01:19:27 thorpej Exp $ */
+/*     $NetBSD: if_media.h,v 1.3 1997/03/26 01:19:27 thorpej Exp $     */
 /* $FreeBSD: src/sys/net/if_media.h,v 1.9.2.1 2001/07/04 00:12:38 brooks Exp $ */
 
 /*
  * Copyright (c) 1997
- * Jonathan Stone and Jason R. Thorpe.  All rights reserved.
+ *     Jonathan Stone and Jason R. Thorpe.  All rights reserved.
  *
  * This software is derived from information provided by Matt Thomas.
  *
@@ -44,8 +44,8 @@
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
- *    This product includes software developed by Jonathan Stone
- *     and Jason R. Thorpe for the NetBSD Project.
+ *     This product includes software developed by Jonathan Stone
+ *     and Jason R. Thorpe for the NetBSD Project.
  * 4. The names of the authors may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
@@ -88,7 +88,7 @@
  *     Bits    Use
  *     ----    -------
  *     0-4     Media variant
- *     5-7     Media type
+ *     5-7     Media type
  *     8-15    Type specific options
  *     16-19   Extended media variant bits
  *     20-27   Shared (global) options
 /*
  * Masks
  */
-#define IFM_NMASK       0x000000e0              /* Network type */
+#define IFM_NMASK       0x000000e0      /* Network type */
 #define IFM_TMASK       (IFM_TMASK_COMPAT|IFM_TMASK_EXT)    /* Media sub-type */
-#define IFM_IMASK       0xf0000000               /* Instance */
-#define IFM_ISHIFT      28                      /* Instance shift */
-#define IFM_OMASK       0x0000ff00              /* Type specific options */
-#define IFM_GMASK       0x0ff00000              /* Global options */
+#define IFM_IMASK       0xf0000000      /* Instance */
+#define IFM_ISHIFT      28              /* Instance shift */
+#define IFM_OMASK       0x0000ff00      /* Type specific options */
+#define IFM_GMASK       0x0ff00000      /* Global options */
 
 /*
  * Status bits
 /*
  * Macros to extract various bits of information from the media word.
  */
-
 #define IFM_TYPE(x)         ((x) & IFM_NMASK)
 #define IFM_SUBTYPE(x)      ((x) & IFM_TMASK)
 #define IFM_TYPE_OPTIONS(x) ((x) & IFM_OMASK)
 #define IFM_INST(x)         (((x) & IFM_IMASK) >> IFM_ISHIFT)
-#define IFM_OPTIONS(x)      ((x) & (IFM_OMASK|IFM_GMASK))
+#define IFM_OPTIONS(x)  ((x) & (IFM_OMASK|IFM_GMASK))
 
 #define IFM_INST_MAX    IFM_INST(IFM_IMASK)
 
@@ -350,21 +349,21 @@ struct ifmedia_description {
     { 0, NULL },                                    \
 }
 
-#define IFM_SUBTYPE_ETHERNET_DESCRIPTIONS {                             \
-    { IFM_10_T,                 "10baseT/UTP" },                        \
-    { IFM_10_2,                 "10base2/BNC" },                        \
-    { IFM_10_5,                 "10base5/AUI" },                        \
-    { IFM_100_TX,               "100baseTX" },                          \
-    { IFM_100_FX,               "100baseFX" },                          \
-    { IFM_100_T4,               "100baseT4" },                          \
-    { IFM_100_VG,               "100baseVG" },                          \
-    { IFM_100_T2,               "100baseT2" },                          \
-    { IFM_10_STP,               "10baseSTP" },                          \
-    { IFM_10_FL,                "10baseFL" },                           \
+#define IFM_SUBTYPE_ETHERNET_DESCRIPTIONS {         \
+    { IFM_10_T,     "10baseT/UTP" },                \
+    { IFM_10_2,     "10base2/BNC" },                \
+    { IFM_10_5,     "10base5/AUI" },                \
+    { IFM_100_TX,   "100baseTX"   },                \
+    { IFM_100_FX,   "100baseFX"   },                \
+    { IFM_100_T4,   "100baseT4"   },                \
+    { IFM_100_VG,   "100baseVG"   },                \
+    { IFM_100_T2,   "100baseT2"   },                \
+    { IFM_10_STP,   "10baseSTP"   },                \
+    { IFM_10_FL,    "10baseFL"    },                \
     { IFM_1000_SX,              "1000baseSX" },                         \
-    { IFM_1000_LX,              "1000baseLX" },                         \
-    { IFM_1000_CX,              "1000baseCX" },                         \
-    { IFM_1000_T,               "1000baseT" },                          \
+    { IFM_1000_LX,  "1000baseLX"  },                \
+    { IFM_1000_CX,  "1000baseCX"  },                \
+    { IFM_1000_T,   "1000baseT"   },                \
     { IFM_HPNA_1,               "homePNA" },                            \
     { IFM_10G_LR,               "10Gbase-LR" },                         \
     { IFM_10G_SR,               "10Gbase-SR" },                         \
@@ -377,11 +376,11 @@ struct ifmedia_description {
     { IFM_40G_CR4,              "40Gbase-CR4" },                        \
     { IFM_40G_SR4,              "40Gbase-SR4" },                        \
     { IFM_40G_LR4,              "40Gbase-LR4" },                        \
-    { IFM_1000_KX,              "1000Base-KX" },                        \
+    { IFM_1000_KX,  "1000Base-KX" },                \
     { IFM_OTHER,                "Other" },                              \
-    { IFM_10G_KX4,              "10GBase-KX4" },                        \
-    { IFM_10G_KR,               "10GBase-KR" },                         \
-    { IFM_10G_CR1,              "10GBase-CR1" },                        \
+    { IFM_10G_KX4,  "10GBase-KX4" },                \
+    { IFM_10G_KR,   "10GBase-KR" },                 \
+    { IFM_10G_CR1,  "10GBase-CR1" },                \
     { IFM_20G_KR2,              "20GBase-KR2" },                        \
     { IFM_2500_KX,              "2500Base-KX" },                        \
     { IFM_2500_T,               "2500Base-T" },                         \
@@ -393,18 +392,18 @@ struct ifmedia_description {
     { IFM_40G_XLPPI,            "40GBase-XLPPI" },                      \
     { IFM_1000_CX_SGMII,        "1000Base-CX-SGMII" },                  \
     { IFM_40G_KR4,              "40GBase-KR4" },                        \
-    { IFM_10G_ER,               "10GBase-ER" },                         \
+    { IFM_10G_ER,   "10GBase-ER" },                 \
     { IFM_100G_CR4,             "100GBase-CR4" },                       \
     { IFM_100G_SR4,             "100GBase-SR4" },                       \
     { IFM_100G_KR4,             "100GBase-KR4" },                       \
     { IFM_100G_LR4,             "100GBase-LR4" },                       \
     { IFM_56G_R4,               "56GBase-R4" },                         \
     { IFM_100_T,                "100BaseT" },                           \
-    { IFM_25G_CR,               "25GBase-CR" },                         \
-    { IFM_25G_KR,               "25GBase-KR" },                         \
-    { IFM_25G_SR,               "25GBase-SR" },                         \
-    { IFM_50G_CR2,              "50GBase-CR2" },                        \
-    { IFM_50G_KR2,              "50GBase-KR2" },                        \
+    { IFM_25G_CR,   "25GBase-CR" },                 \
+    { IFM_25G_KR,   "25GBase-KR" },                 \
+    { IFM_25G_SR,   "25GBase-SR" },                 \
+    { IFM_50G_CR2,  "50GBase-CR2" },                \
+    { IFM_50G_KR2,  "50GBase-KR2" },                \
     { IFM_25G_LR,               "25GBase-LR" },                         \
     { IFM_10G_AOC,              "10GBase-AOC" },                        \
     { IFM_25G_ACC,              "25GBase-ACC" },                        \
@@ -422,8 +421,8 @@ struct ifmedia_description {
     { IFM_40G_XLAUI,            "40G-XLAUI" },                          \
     { IFM_40G_XLAUI_AC,         "40G-XLAUI-AC" },                       \
     { IFM_40G_ER4,              "40GBase-ER4" },                        \
-    { IFM_50G_SR2,              "50GBase-SR2" },                        \
-    { IFM_50G_LR2,              "50GBase-LR2" },                        \
+    { IFM_50G_SR2,  "50GBase-SR2" },                \
+    { IFM_50G_LR2,  "50GBase-LR2" },                \
     { IFM_50G_LAUI2_AC,         "50G-LAUI2-AC" },                       \
     { IFM_50G_LAUI2,            "50G-LAUI2" },                          \
     { IFM_50G_AUI2_AC,          "50G-AUI2-AC" },                        \
@@ -465,7 +464,7 @@ struct ifmedia_description {
     { IFM_400G_DR4,             "400GBase-DR4" },                       \
     { IFM_400G_AUI8_AC,         "400G-AUI8-AC" },                       \
     { IFM_400G_AUI8,            "400G-AUI8" },                          \
-    { 0, NULL },                                                        \
+    { 0, NULL },                                    \
 }
 
 #define IFM_SUBTYPE_ETHERNET_ALIASES {              \
@@ -574,7 +573,7 @@ struct ifmedia_description {
     { IFM_FDX,      "full-duplex" },                \
     { IFM_HDX,      "half-duplex" },                \
     { IFM_FLOW,     "flow-control" },               \
-    { IFM_EEE,      "energy-efficient-ethernet" },  \
+    { IFM_EEE,     "energy-efficient-ethernet" },  \
     { IFM_FLAG0,    "flag0" },                      \
     { IFM_FLAG1,    "flag1" },                      \
     { IFM_FLAG2,    "flag2" },                      \
index 6c0f94ef84db7fb66fa282b01ec18b9b185abb81..93bd59ce7e3036299b65c0e8848560909da47417 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -192,6 +192,7 @@ make_ifmibdata(struct ifnet *ifp, int *name, struct sysctl_req *req)
                if_copy_data_extended(ifp, &ifmd_supp->ifmd_data_extended);
                if_copy_packet_stats(ifp, &ifmd_supp->ifmd_packet_stats);
                if_copy_rxpoll_stats(ifp, &ifmd_supp->ifmd_rxpoll_stats);
+               if_copy_netif_stats(ifp, &ifmd_supp->ifmd_netif_stats);
 
                if (req->oldptr == USER_ADDR_NULL) {
                        req->oldlen = sizeof(*ifmd_supp);
index 0cec1231093f39c2039a63fa2925a1c2d6bfa84e..b0c74483aaf21c1aa937fb5d2aa926c8e4bf8f87 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -77,6 +77,7 @@ struct ifmibdata_supplemental {
        struct if_data_extended ifmd_data_extended;
        struct if_packet_stats  ifmd_packet_stats;
        struct if_rxpoll_stats  ifmd_rxpoll_stats;
+       struct if_netif_stats   ifmd_netif_stats;
 };
 #endif /* PRIVATE */
 
index 2d95e9eeb8dace63256594b27e2f9de1f2102493..e4d3bcf5f96db845cd3727537590fce199b6e157 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define IFT_PROPVIRTUAL 0x35            /* Proprietary Virtual/internal */
 #define IFT_PROPMUX     0x36            /* Proprietary Multiplexing */
 /*
- * IFT_GIF, IFT_FAITH and IFT_FAITH are not based on IANA assignments.
+ * IFT_GIF, IFT_FAITH and IFT_6LOWPAN are not based on IANA assignments.
  * Note: IFT_STF has a defined ifType: 0xd7 (215), but we use 0x39.
  */
 #define IFT_GIF         0x37            /*0xf0*/
 #define IFT_FAITH       0x38            /*0xf2*/
 #define IFT_STF         0x39            /*0xf3*/
+#define IFT_6LOWPAN     0x40            /* IETF RFC 6282 */
 
 #define IFT_L2VLAN      0x87            /* Layer 2 Virtual LAN using 802.1Q */
 #define IFT_IEEE8023ADLAG 0x88          /* IEEE802.3ad Link Aggregate */
index 416ef153757dd4023d51a45c08235f21a677aacd..d29785b8b71c3838c558906276aaa3de8cafe9ce 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -54,6 +54,7 @@
 #include <sys/kauth.h>
 #include <net/necp.h>
 #include <kern/zalloc.h>
+#include <os/log.h>
 
 #define UTUN_NEXUS 0
 
@@ -66,11 +67,11 @@ static uuid_t utun_nx_dom_prov;
 typedef struct utun_nx {
        uuid_t if_provider;
        uuid_t if_instance;
-       uuid_t ms_provider;
-       uuid_t ms_instance;
-       uuid_t ms_device;
-       uuid_t ms_host;
-       uuid_t ms_agent;
+       uuid_t fsw_provider;
+       uuid_t fsw_instance;
+       uuid_t fsw_device;
+       uuid_t fsw_host;
+       uuid_t fsw_agent;
 } *utun_nx_t;
 
 #endif // UTUN_NEXUS
@@ -91,6 +92,7 @@ struct utun_pcb {
        decl_lck_rw_data(, utun_pcb_lock);
        struct mbuf *   utun_input_chain;
        struct mbuf *   utun_input_chain_last;
+       u_int32_t               utun_input_chain_count;
        // Input chain lock protects the list of input mbufs
        // The input chain lock must be taken AFTER the PCB lock if both are held
        lck_mtx_t               utun_input_chain_lock;
@@ -102,6 +104,8 @@ struct utun_pcb {
        void *                  utun_kpipe_rxring;
        void *                  utun_kpipe_txring;
        kern_pbufpool_t         utun_kpipe_pp;
+       u_int32_t               utun_kpipe_tx_ring_size;
+       u_int32_t               utun_kpipe_rx_ring_size;
 
        kern_nexus_t    utun_netif_nexus;
        kern_pbufpool_t         utun_netif_pp;
@@ -113,6 +117,10 @@ struct utun_pcb {
        u_int32_t               utun_netif_ring_size;
        u_int32_t               utun_tx_fsw_ring_size;
        u_int32_t               utun_rx_fsw_ring_size;
+       // Auto-attach the flowswitch when the netif is enabled. When false,
+       // a userspace nexus controller may attach and own the flowswitch.
+       bool                    utun_attach_fsw;
+       bool                    utun_netif_connected;
        bool                    utun_use_netif;
        bool                    utun_needs_netagent;
 #endif // UTUN_NEXUS
@@ -167,12 +175,16 @@ static errno_t utun_pkt_input(struct utun_pcb *pcb, mbuf_t m);
 #define UTUN_IF_DEFAULT_BUF_SEG_SIZE    skmem_usr_buf_seg_size
 #define UTUN_IF_HEADROOM_SIZE 32
 
-#define UTUN_IF_MIN_RING_SIZE 16
+#define UTUN_IF_MIN_RING_SIZE 8
 #define UTUN_IF_MAX_RING_SIZE 1024
 
 #define UTUN_IF_MIN_SLOT_SIZE 1024
 #define UTUN_IF_MAX_SLOT_SIZE 4096
 
+#define UTUN_DEFAULT_MAX_PENDING_INPUT_COUNT 512
+
+static int if_utun_max_pending_input = UTUN_DEFAULT_MAX_PENDING_INPUT_COUNT;
+
 static int sysctl_if_utun_ring_size SYSCTL_HANDLER_ARGS;
 static int sysctl_if_utun_tx_fsw_ring_size SYSCTL_HANDLER_ARGS;
 static int sysctl_if_utun_rx_fsw_ring_size SYSCTL_HANDLER_ARGS;
@@ -184,6 +196,7 @@ static int if_utun_rx_fsw_ring_size = UTUN_IF_DEFAULT_RX_FSW_RING_SIZE;
 SYSCTL_DECL(_net_utun);
 SYSCTL_NODE(_net, OID_AUTO, utun, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "UTun");
 
+SYSCTL_INT(_net_utun, OID_AUTO, max_pending_input, CTLFLAG_LOCKED | CTLFLAG_RW, &if_utun_max_pending_input, 0, "");
 SYSCTL_PROC(_net_utun, OID_AUTO, ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
     &if_utun_ring_size, UTUN_IF_DEFAULT_RING_SIZE, &sysctl_if_utun_ring_size, "I", "");
 SYSCTL_PROC(_net_utun, OID_AUTO, tx_fsw_ring_size, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
@@ -231,7 +244,6 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 #define UTUN_HEADER_SIZE(_pcb) (sizeof(u_int32_t) + (((_pcb)->utun_flags & UTUN_FLAGS_ENABLE_PROC_UUID) ? sizeof(uuid_t) : 0))
 
 static kern_ctl_ref     utun_kctlref;
-static u_int32_t        utun_family;
 static lck_attr_t *utun_lck_attr;
 static lck_grp_attr_t *utun_lck_grp_attr;
 static lck_grp_t *utun_lck_grp;
@@ -359,7 +371,7 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
        kern_channel_slot_t tx_pslot = NULL;
        kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL);
 
-       STATS_INC(nifs, NETIF_STATS_TXSYNC);
+       STATS_INC(nifs, NETIF_STATS_TX_SYNC);
 
        if (tx_slot == NULL) {
                // Nothing to write, don't bother signalling
@@ -426,7 +438,7 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        break;
                }
                default: {
-                       printf("utun_netif_sync_tx %s: unknown ip version %u vhl %u tx_offset %u len %u header_size %zu\n",
+                       os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_tx %s: unknown ip version %u vhl %u tx_offset %u len %u header_size %zu\n",
                            pcb->utun_ifp->if_xname, ip_version, vhl, tx_offset, tx_length,
                            UTUN_HEADER_SIZE(pcb));
                        break;
@@ -452,24 +464,24 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                                if (error == 0) {
                                        error = utun_output(pcb->utun_ifp, data);
                                        if (error != 0) {
-                                               printf("utun_netif_sync_tx %s - utun_output error %d\n", pcb->utun_ifp->if_xname, error);
+                                               os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_tx %s - utun_output error %d\n", pcb->utun_ifp->if_xname, error);
                                        }
                                } else {
-                                       printf("utun_netif_sync_tx %s - mbuf_copyback(%zu) error %d\n", pcb->utun_ifp->if_xname, length, error);
-                                       STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF);
-                                       STATS_INC(nifs, NETIF_STATS_DROPPED);
+                                       os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_tx %s - mbuf_copyback(%zu) error %d\n", pcb->utun_ifp->if_xname, length, error);
+                                       STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
+                                       STATS_INC(nifs, NETIF_STATS_DROP);
                                        mbuf_freem(data);
                                        data = NULL;
                                }
                        } else {
-                               printf("utun_netif_sync_tx %s - mbuf_gethdr error %d\n", pcb->utun_ifp->if_xname, error);
-                               STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF);
-                               STATS_INC(nifs, NETIF_STATS_DROPPED);
+                               os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_tx %s - mbuf_gethdr error %d\n", pcb->utun_ifp->if_xname, error);
+                               STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
+                               STATS_INC(nifs, NETIF_STATS_DROP);
                        }
                } else {
-                       printf("utun_netif_sync_tx %s - 0 length packet\n", pcb->utun_ifp->if_xname);
-                       STATS_INC(nifs, NETIF_STATS_NOMEM_MBUF);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
+                       os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_tx %s - 0 length packet\n", pcb->utun_ifp->if_xname);
+                       STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
+                       STATS_INC(nifs, NETIF_STATS_DROP);
                }
 
                kern_pbufpool_free(tx_ring->ckr_pp, tx_ph);
@@ -478,8 +490,8 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        continue;
                }
 
-               STATS_INC(nifs, NETIF_STATS_TXPKTS);
-               STATS_INC(nifs, NETIF_STATS_TXCOPY_MBUF);
+               STATS_INC(nifs, NETIF_STATS_TX_PACKETS);
+               STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF);
 
                tx_ring_stats.kcrsi_slots_transferred++;
                tx_ring_stats.kcrsi_bytes_transferred += length;
@@ -512,7 +524,7 @@ utun_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
         */
        rc = kern_channel_tx_refill_canblock(ring, UINT32_MAX, UINT32_MAX, true, &more);
        if (rc != 0 && rc != EAGAIN && rc != EBUSY) {
-               printf("%s, tx refill failed %d\n", __func__, rc);
+               os_log_error(OS_LOG_DEFAULT, "%s, tx refill failed %d\n", __func__, rc);
        }
 
        (void) kr_enter(ring, TRUE);
@@ -525,7 +537,7 @@ utun_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        // No room left in tx ring, disable output for now
                        errno_t error = ifnet_disable_output(pcb->utun_ifp);
                        if (error != 0) {
-                               printf("utun_netif_tx_doorbell: ifnet_disable_output returned error %d\n", error);
+                               os_log_error(OS_LOG_DEFAULT, "utun_netif_tx_doorbell: ifnet_disable_output returned error %d\n", error);
                        }
                }
        }
@@ -564,7 +576,7 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
        // Reclaim user-released slots
        (void) kern_channel_reclaim(rx_ring);
 
-       STATS_INC(nifs, NETIF_STATS_RXSYNC);
+       STATS_INC(nifs, NETIF_STATS_RX_SYNC);
 
        uint32_t avail = kern_channel_available_slot_count(rx_ring);
        if (avail == 0) {
@@ -591,13 +603,16 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                kern_packet_t rx_ph = 0;
                errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
                if (__improbable(error != 0)) {
-                       STATS_INC(nifs, NETIF_STATS_NOMEM_PKT);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
+                       STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT);
+                       STATS_INC(nifs, NETIF_STATS_DROP);
                        lck_mtx_unlock(&pcb->utun_input_chain_lock);
                        break;
                }
 
                // Advance waiting packets
+               if (pcb->utun_input_chain_count > 0) {
+                       pcb->utun_input_chain_count--;
+               }
                pcb->utun_input_chain = data->m_nextpkt;
                data->m_nextpkt = NULL;
                if (pcb->utun_input_chain == NULL) {
@@ -612,9 +627,9 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        // mbuf is too small
                        mbuf_freem(data);
                        kern_pbufpool_free(rx_pp, rx_ph);
-                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                       printf("utun_netif_sync_rx %s: legacy packet length too short for header %zu < %zu\n",
+                       STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                       STATS_INC(nifs, NETIF_STATS_DROP);
+                       os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_rx %s: legacy packet length too short for header %zu < %zu\n",
                            pcb->utun_ifp->if_xname, length, header_offset);
                        continue;
                }
@@ -624,9 +639,9 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        // Flush data
                        mbuf_freem(data);
                        kern_pbufpool_free(rx_pp, rx_ph);
-                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                       printf("utun_netif_sync_rx %s: legacy packet length %zu > %u\n",
+                       STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                       STATS_INC(nifs, NETIF_STATS_DROP);
+                       os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_rx %s: legacy packet length %zu > %u\n",
                            pcb->utun_ifp->if_xname, length, rx_pp->pp_buflet_size);
                        continue;
                }
@@ -648,17 +663,15 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                VERIFY(error == 0);
                error = kern_buflet_set_data_length(rx_buf, length);
                VERIFY(error == 0);
-               error = kern_packet_set_link_header_offset(rx_ph, 0);
-               VERIFY(error == 0);
-               error = kern_packet_set_network_header_offset(rx_ph, 0);
+               error = kern_packet_set_headroom(rx_ph, 0);
                VERIFY(error == 0);
                error = kern_packet_finalize(rx_ph);
                VERIFY(error == 0);
                error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
                VERIFY(error == 0);
 
-               STATS_INC(nifs, NETIF_STATS_RXPKTS);
-               STATS_INC(nifs, NETIF_STATS_RXCOPY_MBUF);
+               STATS_INC(nifs, NETIF_STATS_RX_PACKETS);
+               STATS_INC(nifs, NETIF_STATS_RX_COPY_MBUF);
                bpf_tap_packet_in(pcb->utun_ifp, DLT_RAW, rx_ph, NULL, 0);
 
                rx_ring_stats.kcrsi_slots_transferred++;
@@ -717,8 +730,8 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                 */
                errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
                if (__improbable(error != 0)) {
-                       STATS_INC(nifs, NETIF_STATS_NOMEM_PKT);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
+                       STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT);
+                       STATS_INC(nifs, NETIF_STATS_DROP);
                        break;
                }
 
@@ -734,9 +747,9 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                if (tx_length < header_offset) {
                        // Packet is too small
                        kern_pbufpool_free(rx_pp, rx_ph);
-                       STATS_INC(nifs, NETIF_STATS_BADLEN);
-                       STATS_INC(nifs, NETIF_STATS_DROPPED);
-                       printf("utun_netif_sync_rx %s: packet length too short for header %u < %zu\n",
+                       STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                       STATS_INC(nifs, NETIF_STATS_DROP);
+                       os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_rx %s: packet length too short for header %u < %zu\n",
                            pcb->utun_ifp->if_xname, tx_length, header_offset);
                        continue;
                }
@@ -762,17 +775,15 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                VERIFY(error == 0);
                error = kern_buflet_set_data_length(rx_buf, length);
                VERIFY(error == 0);
-               error = kern_packet_set_link_header_offset(rx_ph, 0);
-               VERIFY(error == 0);
-               error = kern_packet_set_network_header_offset(rx_ph, 0);
+               error = kern_packet_set_headroom(rx_ph, 0);
                VERIFY(error == 0);
                error = kern_packet_finalize(rx_ph);
                VERIFY(error == 0);
                error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
                VERIFY(error == 0);
 
-               STATS_INC(nifs, NETIF_STATS_RXPKTS);
-               STATS_INC(nifs, NETIF_STATS_RXCOPY_DIRECT);
+               STATS_INC(nifs, NETIF_STATS_RX_PACKETS);
+               STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT);
                bpf_tap_packet_in(pcb->utun_ifp, DLT_RAW, rx_ph, NULL, 0);
 
                rx_ring_stats.kcrsi_slots_transferred++;
@@ -839,7 +850,7 @@ utun_nexus_ifattach(struct utun_pcb *pcb,
        nexus_attr_t nxa = NULL;
        err = kern_nexus_attr_create(&nxa);
        if (err != 0) {
-               printf("%s: kern_nexus_attr_create failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n",
                    __func__, err);
                goto failed;
        }
@@ -859,16 +870,20 @@ utun_nexus_ifattach(struct utun_pcb *pcb,
 
        bzero(&pp_init, sizeof(pp_init));
        pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION;
+       pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE;
        pp_init.kbi_packets = pcb->utun_netif_ring_size * 2;
        pp_init.kbi_bufsize = pcb->utun_slot_size;
        pp_init.kbi_buf_seg_size = UTUN_IF_DEFAULT_BUF_SEG_SIZE;
        pp_init.kbi_max_frags = 1;
        (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name),
            "%s", provider_name);
+       pp_init.kbi_ctx = NULL;
+       pp_init.kbi_ctx_retain = NULL;
+       pp_init.kbi_ctx_release = NULL;
 
-       err = kern_pbufpool_create(&pp_init, &pp_init, &pcb->utun_netif_pp, NULL);
+       err = kern_pbufpool_create(&pp_init, &pcb->utun_netif_pp, NULL);
        if (err != 0) {
-               printf("%s pbufbool create failed, error %d\n", __func__, err);
+               os_log_error(OS_LOG_DEFAULT, "%s pbufpool create failed, error %d\n", __func__, err);
                goto failed;
        }
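
kern_pbufpool_create() loses its duplicated init argument here, and utun's pools are now tagged KBIF_VIRTUAL_DEVICE with the new context fields explicitly zeroed. A minimal sketch of the updated call shape, assuming the kern_pbufpool_init structure behind pp_init and eliding error handling:

        struct kern_pbufpool_init pp_init;

        bzero(&pp_init, sizeof(pp_init));
        pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION;
        pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE;   /* pool backs a virtual interface */
        pp_init.kbi_ctx = NULL;                     /* no pool context or callbacks */
        pp_init.kbi_ctx_retain = NULL;
        pp_init.kbi_ctx_release = NULL;
        /* single init argument now, not two */
        err = kern_pbufpool_create(&pp_init, &pcb->utun_netif_pp, NULL);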
 
@@ -880,7 +895,7 @@ utun_nexus_ifattach(struct utun_pcb *pcb,
            nxa,
            &pcb->utun_nx.if_provider);
        if (err != 0) {
-               printf("%s register provider failed, error %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s register provider failed, error %d\n",
                    __func__, err);
                goto failed;
        }
@@ -899,7 +914,7 @@ utun_nexus_ifattach(struct utun_pcb *pcb,
            &net_init,
            ifp);
        if (err != 0) {
-               printf("%s alloc_net_provider_instance failed, %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s alloc_net_provider_instance failed, %d\n",
                    __func__, err);
                kern_nexus_controller_deregister_provider(controller,
                    pcb->utun_nx.if_provider);
@@ -928,7 +943,7 @@ utun_detach_provider_and_instance(uuid_t provider, uuid_t instance)
                err = kern_nexus_controller_free_provider_instance(controller,
                    instance);
                if (err != 0) {
-                       printf("%s free_provider_instance failed %d\n",
+                       os_log_error(OS_LOG_DEFAULT, "%s free_provider_instance failed %d\n",
                            __func__, err);
                }
                uuid_clear(instance);
@@ -937,7 +952,7 @@ utun_detach_provider_and_instance(uuid_t provider, uuid_t instance)
                err = kern_nexus_controller_deregister_provider(controller,
                    provider);
                if (err != 0) {
-                       printf("%s deregister_provider %d\n", __func__, err);
+                       os_log_error(OS_LOG_DEFAULT, "%s deregister_provider %d\n", __func__, err);
                }
                uuid_clear(provider);
        }
@@ -951,30 +966,30 @@ utun_nexus_detach(struct utun_pcb *pcb)
        nexus_controller_t controller = kern_nexus_shared_controller();
        errno_t err;
 
-       if (!uuid_is_null(nx->ms_host)) {
+       if (!uuid_is_null(nx->fsw_host)) {
                err = kern_nexus_ifdetach(controller,
-                   nx->ms_instance,
-                   nx->ms_host);
+                   nx->fsw_instance,
+                   nx->fsw_host);
                if (err != 0) {
-                       printf("%s: kern_nexus_ifdetach ms host failed %d\n",
+                       os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_ifdetach ms host failed %d\n",
                            __func__, err);
                }
        }
 
-       if (!uuid_is_null(nx->ms_device)) {
+       if (!uuid_is_null(nx->fsw_device)) {
                err = kern_nexus_ifdetach(controller,
-                   nx->ms_instance,
-                   nx->ms_device);
+                   nx->fsw_instance,
+                   nx->fsw_device);
                if (err != 0) {
-                       printf("%s: kern_nexus_ifdetach ms device failed %d\n",
+                       os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_ifdetach ms device failed %d\n",
                            __func__, err);
                }
        }
 
        utun_detach_provider_and_instance(nx->if_provider,
            nx->if_instance);
-       utun_detach_provider_and_instance(nx->ms_provider,
-           nx->ms_instance);
+       utun_detach_provider_and_instance(nx->fsw_provider,
+           nx->fsw_instance);
 
        if (pcb->utun_netif_pp != NULL) {
                kern_pbufpool_destroy(pcb->utun_netif_pp);
@@ -985,7 +1000,7 @@ utun_nexus_detach(struct utun_pcb *pcb)
 
 static errno_t
 utun_create_fs_provider_and_instance(struct utun_pcb *pcb,
-    uint32_t subtype, const char *type_name,
+    const char *type_name,
     const char *ifname,
     uuid_t *provider, uuid_t *instance)
 {
@@ -996,24 +1011,21 @@ utun_create_fs_provider_and_instance(struct utun_pcb *pcb,
        struct kern_nexus_init init;
        nexus_name_t    provider_name;
 
-       err = kern_nexus_get_builtin_domain_provider(NEXUS_TYPE_FLOW_SWITCH,
+       err = kern_nexus_get_default_domain_provider(NEXUS_TYPE_FLOW_SWITCH,
            &dom_prov);
        if (err != 0) {
-               printf("%s can't get %s provider, error %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s can't get %s provider, error %d\n",
                    __func__, type_name, err);
                goto failed;
        }
 
        err = kern_nexus_attr_create(&attr);
        if (err != 0) {
-               printf("%s: kern_nexus_attr_create failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n",
                    __func__, err);
                goto failed;
        }
 
-       err = kern_nexus_attr_set(attr, NEXUS_ATTR_EXTENSIONS, subtype);
-       VERIFY(err == 0);
-
        uint64_t slot_buffer_size = pcb->utun_slot_size;
        err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, slot_buffer_size);
        VERIFY(err == 0);
@@ -1038,7 +1050,7 @@ utun_create_fs_provider_and_instance(struct utun_pcb *pcb,
        kern_nexus_attr_destroy(attr);
        attr = NULL;
        if (err != 0) {
-               printf("%s register %s provider failed, error %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s register %s provider failed, error %d\n",
                    __func__, type_name, err);
                goto failed;
        }
@@ -1049,7 +1061,7 @@ utun_create_fs_provider_and_instance(struct utun_pcb *pcb,
            NULL,
            instance, &init);
        if (err != 0) {
-               printf("%s alloc_provider_instance %s failed, %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s alloc_provider_instance %s failed, %d\n",
                    __func__, type_name, err);
                kern_nexus_controller_deregister_provider(controller,
                    *provider);
@@ -1060,62 +1072,56 @@ failed:
 }
 
 static errno_t
-utun_multistack_attach(struct utun_pcb *pcb)
+utun_flowswitch_attach(struct utun_pcb *pcb)
 {
        nexus_controller_t controller = kern_nexus_shared_controller();
        errno_t err = 0;
        utun_nx_t nx = &pcb->utun_nx;
 
-       // Allocate multistack flowswitch
+       // Allocate flowswitch
        err = utun_create_fs_provider_and_instance(pcb,
-           NEXUS_EXTENSION_FSW_TYPE_MULTISTACK,
-           "multistack",
+           "flowswitch",
            pcb->utun_ifp->if_xname,
-           &nx->ms_provider,
-           &nx->ms_instance);
+           &nx->fsw_provider,
+           &nx->fsw_instance);
        if (err != 0) {
-               printf("%s: failed to create bridge provider and instance\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: failed to create bridge provider and instance\n",
                    __func__);
                goto failed;
        }
 
-       // Attach multistack to device port
-       err = kern_nexus_ifattach(controller, nx->ms_instance,
+       // Attach flowswitch to device port
+       err = kern_nexus_ifattach(controller, nx->fsw_instance,
            NULL, nx->if_instance,
-           FALSE, &nx->ms_device);
+           FALSE, &nx->fsw_device);
        if (err != 0) {
-               printf("%s kern_nexus_ifattach ms device %d\n", __func__, err);
+               os_log_error(OS_LOG_DEFAULT, "%s kern_nexus_ifattach ms device %d\n", __func__, err);
                goto failed;
        }
 
-       // Attach multistack to host port
-       err = kern_nexus_ifattach(controller, nx->ms_instance,
+       // Attach flowswitch to host port
+       err = kern_nexus_ifattach(controller, nx->fsw_instance,
            NULL, nx->if_instance,
-           TRUE, &nx->ms_host);
+           TRUE, &nx->fsw_host);
        if (err != 0) {
-               printf("%s kern_nexus_ifattach ms host %d\n", __func__, err);
+               os_log_error(OS_LOG_DEFAULT, "%s kern_nexus_ifattach ms host %d\n", __func__, err);
                goto failed;
        }
 
        // Extract the agent UUID and save for later
-       struct kern_nexus *multistack_nx = nx_find(nx->ms_instance, false);
-       if (multistack_nx != NULL) {
-               struct nx_flowswitch *flowswitch = NX_FSW_PRIVATE(multistack_nx);
+       struct kern_nexus *flowswitch_nx = nx_find(nx->fsw_instance, false);
+       if (flowswitch_nx != NULL) {
+               struct nx_flowswitch *flowswitch = NX_FSW_PRIVATE(flowswitch_nx);
                if (flowswitch != NULL) {
                        FSW_RLOCK(flowswitch);
-                       struct fsw_ms_context *ms_context = (struct fsw_ms_context *)flowswitch->fsw_ops_private;
-                       if (ms_context != NULL) {
-                               uuid_copy(nx->ms_agent, ms_context->mc_agent_uuid);
-                       } else {
-                               printf("utun_multistack_attach - fsw_ms_context is NULL\n");
-                       }
+                       uuid_copy(nx->fsw_agent, flowswitch->fsw_agent_uuid);
                        FSW_UNLOCK(flowswitch);
                } else {
-                       printf("utun_multistack_attach - flowswitch is NULL\n");
+                       os_log_error(OS_LOG_DEFAULT, "utun_flowswitch_attach - flowswitch is NULL\n");
                }
-               nx_release(multistack_nx);
+               nx_release(flowswitch_nx);
        } else {
-               printf("utun_multistack_attach - unable to find multistack nexus\n");
+               os_log_error(OS_LOG_DEFAULT, "utun_flowswitch_attach - unable to find flowswitch nexus\n");
        }
 
        return 0;
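
With the multistack shim gone, the netagent UUID lives directly on the flowswitch, so the attach path copies it under the flowswitch read lock instead of chasing a fsw_ms_context pointer. In miniature, with the locking exactly as above:

        FSW_RLOCK(flowswitch);
        uuid_copy(nx->fsw_agent, flowswitch->fsw_agent_uuid);  /* no ms_context indirection */
        FSW_UNLOCK(flowswitch);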
@@ -1125,7 +1131,7 @@ failed:
 
        errno_t detach_error = 0;
        if ((detach_error = ifnet_detach(pcb->utun_ifp)) != 0) {
-               panic("utun_multistack_attach - ifnet_detach failed: %d\n", detach_error);
+               panic("utun_flowswitch_attach - ifnet_detach failed: %d\n", detach_error);
                /* NOT REACHED */
        }
 
@@ -1133,7 +1139,7 @@ failed:
 }
 
 static errno_t
-utun_register_kernel_pipe_nexus(void)
+utun_register_kernel_pipe_nexus(struct utun_pcb *pcb)
 {
        nexus_attr_t nxa = NULL;
        errno_t result;
@@ -1146,16 +1152,16 @@ utun_register_kernel_pipe_nexus(void)
 
        result = kern_nexus_controller_create(&utun_ncd);
        if (result) {
-               printf("%s: kern_nexus_controller_create failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_controller_create failed: %d\n",
                    __FUNCTION__, result);
                goto done;
        }
 
        uuid_t dom_prov;
-       result = kern_nexus_get_builtin_domain_provider(
+       result = kern_nexus_get_default_domain_provider(
                NEXUS_TYPE_KERNEL_PIPE, &dom_prov);
        if (result) {
-               printf("%s: kern_nexus_get_builtin_domain_provider failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_get_default_domain_provider failed: %d\n",
                    __FUNCTION__, result);
                goto done;
        }
@@ -1178,7 +1184,7 @@ utun_register_kernel_pipe_nexus(void)
 
        result = kern_nexus_attr_create(&nxa);
        if (result) {
-               printf("%s: kern_nexus_attr_create failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_attr_create failed: %d\n",
                    __FUNCTION__, result);
                goto done;
        }
@@ -1188,9 +1194,15 @@ utun_register_kernel_pipe_nexus(void)
        VERIFY(result == 0);
 
        // Reset ring size for kernel pipe nexus to limit memory usage
-       uint64_t ring_size = if_utun_ring_size;
+       uint64_t ring_size =
+           pcb->utun_kpipe_tx_ring_size != 0 ? pcb->utun_kpipe_tx_ring_size :
+           if_utun_ring_size;
        result = kern_nexus_attr_set(nxa, NEXUS_ATTR_TX_SLOTS, ring_size);
        VERIFY(result == 0);
+
+       ring_size =
+           pcb->utun_kpipe_rx_ring_size != 0 ? pcb->utun_kpipe_rx_ring_size :
+           if_utun_ring_size;
        result = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_SLOTS, ring_size);
        VERIFY(result == 0);
 
@@ -1202,7 +1214,7 @@ utun_register_kernel_pipe_nexus(void)
            nxa,
            &utun_kpipe_uuid);
        if (result) {
-               printf("%s: kern_nexus_controller_register_provider failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "%s: kern_nexus_controller_register_provider failed: %d\n",
                    __FUNCTION__, result);
                goto done;
        }
@@ -1290,7 +1302,7 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc)
                return result;
        }
 
-       result = utun_register_kernel_pipe_nexus();
+       result = utun_register_kernel_pipe_nexus(pcb);
        if (result) {
                return result;
        }
@@ -1315,6 +1327,7 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc)
 
        bzero(&pp_init, sizeof(pp_init));
        pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION;
+       pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE;
        pp_init.kbi_packets = pcb->utun_netif_ring_size * 2;
        pp_init.kbi_bufsize = pcb->utun_slot_size;
        pp_init.kbi_buf_seg_size = UTUN_IF_DEFAULT_BUF_SEG_SIZE;
@@ -1322,11 +1335,14 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc)
        pp_init.kbi_flags |= KBIF_QUANTUM;
        (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name),
            "com.apple.kpipe.%s", pcb->utun_if_xname);
+       pp_init.kbi_ctx = NULL;
+       pp_init.kbi_ctx_retain = NULL;
+       pp_init.kbi_ctx_release = NULL;
 
-       result = kern_pbufpool_create(&pp_init, &pp_init, &pcb->utun_kpipe_pp,
+       result = kern_pbufpool_create(&pp_init, &pcb->utun_kpipe_pp,
            NULL);
        if (result != 0) {
-               printf("%s pbufbool create failed, error %d\n", __func__, result);
+               os_log_error(OS_LOG_DEFAULT, "%s pbufpool create failed, error %d\n", __func__, result);
                goto done;
        }
 
@@ -1375,19 +1391,12 @@ utun_register_control(void)
        struct kern_ctl_reg kern_ctl;
        errno_t result = 0;
 
-       /* Find a unique value for our interface family */
-       result = mbuf_tag_id_find(UTUN_CONTROL_NAME, &utun_family);
-       if (result != 0) {
-               printf("utun_register_control - mbuf_tag_id_find_internal failed: %d\n", result);
-               return result;
-       }
-
        utun_pcb_size = sizeof(struct utun_pcb);
        utun_pcb_zone = zinit(utun_pcb_size,
            UTUN_PCB_ZONE_MAX * utun_pcb_size,
            0, UTUN_PCB_ZONE_NAME);
        if (utun_pcb_zone == NULL) {
-               printf("utun_register_control - zinit(utun_pcb) failed");
+               os_log_error(OS_LOG_DEFAULT, "utun_register_control - zinit(utun_pcb) failed");
                return ENOMEM;
        }
 
@@ -1413,26 +1422,26 @@ utun_register_control(void)
 
        result = ctl_register(&kern_ctl, &utun_kctlref);
        if (result != 0) {
-               printf("utun_register_control - ctl_register failed: %d\n", result);
+               os_log_error(OS_LOG_DEFAULT, "utun_register_control - ctl_register failed: %d\n", result);
                return result;
        }
 
        /* Register the protocol plumbers */
-       if ((result = proto_register_plumber(PF_INET, utun_family,
+       if ((result = proto_register_plumber(PF_INET, IFNET_FAMILY_UTUN,
            utun_attach_proto, NULL)) != 0) {
-               printf("utun_register_control - proto_register_plumber(PF_INET, %d) failed: %d\n",
-                   utun_family, result);
+               os_log_error(OS_LOG_DEFAULT, "utun_register_control - proto_register_plumber(PF_INET, IFNET_FAMILY_UTUN) failed: %d\n",
+                   result);
                ctl_deregister(utun_kctlref);
                return result;
        }
 
        /* Register the protocol plumbers */
-       if ((result = proto_register_plumber(PF_INET6, utun_family,
+       if ((result = proto_register_plumber(PF_INET6, IFNET_FAMILY_UTUN,
            utun_attach_proto, NULL)) != 0) {
-               proto_unregister_plumber(PF_INET, utun_family);
+               proto_unregister_plumber(PF_INET, IFNET_FAMILY_UTUN);
                ctl_deregister(utun_kctlref);
-               printf("utun_register_control - proto_register_plumber(PF_INET6, %d) failed: %d\n",
-                   utun_family, result);
+               os_log_error(OS_LOG_DEFAULT, "utun_register_control - proto_register_plumber(PF_INET6, IFNET_FAMILY_UTUN) failed: %d\n",
+                   result);
                return result;
        }
 
@@ -1452,6 +1461,7 @@ utun_free_pcb(struct utun_pcb *pcb, bool in_list)
 {
 #ifdef UTUN_NEXUS
        mbuf_freem_list(pcb->utun_input_chain);
+       pcb->utun_input_chain_count = 0;
        lck_mtx_destroy(&pcb->utun_input_chain_lock, utun_lck_grp);
 #endif // UTUN_NEXUS
        lck_rw_destroy(&pcb->utun_pcb_lock, utun_lck_grp);
@@ -1478,13 +1488,16 @@ utun_ctl_bind(kern_ctl_ref kctlref,
 
 #if UTUN_NEXUS
        pcb->utun_use_netif = false;
+       pcb->utun_attach_fsw = true;
+       pcb->utun_netif_connected = false;
        pcb->utun_slot_size = UTUN_IF_DEFAULT_SLOT_SIZE;
-       pcb->utun_netif_ring_size = UTUN_IF_DEFAULT_RING_SIZE;
-       pcb->utun_tx_fsw_ring_size = UTUN_IF_DEFAULT_TX_FSW_RING_SIZE;
-       pcb->utun_rx_fsw_ring_size = UTUN_IF_DEFAULT_RX_FSW_RING_SIZE;
+       pcb->utun_netif_ring_size = if_utun_ring_size;
+       pcb->utun_tx_fsw_ring_size = if_utun_tx_fsw_ring_size;
+       pcb->utun_rx_fsw_ring_size = if_utun_rx_fsw_ring_size;
+       pcb->utun_input_chain_count = 0;
+       lck_mtx_init(&pcb->utun_input_chain_lock, utun_lck_grp, utun_lck_attr);
 #endif // UTUN_NEXUS
 
-       lck_mtx_init(&pcb->utun_input_chain_lock, utun_lck_grp, utun_lck_attr);
        lck_rw_init(&pcb->utun_pcb_lock, utun_lck_grp, utun_lck_attr);
 
        return 0;
@@ -1543,7 +1556,7 @@ utun_ctl_connect(kern_ctl_ref kctlref,
 
        snprintf(pcb->utun_if_xname, sizeof(pcb->utun_if_xname), "utun%d", pcb->utun_unit - 1);
        snprintf(pcb->utun_unique_name, sizeof(pcb->utun_unique_name), "utunid%d", pcb->utun_unique_id - 1);
-       printf("utun_ctl_connect: creating interface %s (id %s)\n", pcb->utun_if_xname, pcb->utun_unique_name);
+       os_log(OS_LOG_DEFAULT, "utun_ctl_connect: creating interface %s (id %s)\n", pcb->utun_if_xname, pcb->utun_unique_name);
 
        /* Create the interface */
        bzero(&utun_init, sizeof(utun_init));
@@ -1565,8 +1578,7 @@ utun_ctl_connect(kern_ctl_ref kctlref,
        utun_init.unit = pcb->utun_unit - 1;
        utun_init.uniqueid = pcb->utun_unique_name;
        utun_init.uniqueid_len = strlen(pcb->utun_unique_name);
-       utun_init.family = utun_family;
-       utun_init.subfamily = IFNET_SUBFAMILY_UTUN;
+       utun_init.family = IFNET_FAMILY_UTUN;
        utun_init.type = IFT_OTHER;
        utun_init.demux = utun_demux;
        utun_init.add_proto = utun_add_proto;
@@ -1579,17 +1591,19 @@ utun_ctl_connect(kern_ctl_ref kctlref,
        if (pcb->utun_use_netif) {
                result = utun_nexus_ifattach(pcb, &utun_init, &pcb->utun_ifp);
                if (result != 0) {
-                       printf("utun_ctl_connect - utun_nexus_ifattach failed: %d\n", result);
+                       os_log_error(OS_LOG_DEFAULT, "utun_ctl_connect - utun_nexus_ifattach failed: %d\n", result);
                        utun_free_pcb(pcb, true);
                        *unitinfo = NULL;
                        return result;
                }
 
-               result = utun_multistack_attach(pcb);
-               if (result != 0) {
-                       printf("utun_ctl_connect - utun_multistack_attach failed: %d\n", result);
-                       *unitinfo = NULL;
-                       return result;
+               if (pcb->utun_attach_fsw) {
+                       result = utun_flowswitch_attach(pcb);
+                       if (result != 0) {
+                               os_log_error(OS_LOG_DEFAULT, "utun_ctl_connect - utun_flowswitch_attach failed: %d\n", result);
+                               *unitinfo = NULL;
+                               return result;
+                       }
                }
 
                /* Attach to bpf */
@@ -1603,7 +1617,7 @@ utun_ctl_connect(kern_ctl_ref kctlref,
                 */
                result = ifnet_allocate_extended(&utun_init, &pcb->utun_ifp);
                if (result != 0) {
-                       printf("utun_ctl_connect - ifnet_allocate failed: %d\n", result);
+                       os_log_error(OS_LOG_DEFAULT, "utun_ctl_connect - ifnet_allocate failed: %d\n", result);
                        utun_free_pcb(pcb, true);
                        *unitinfo = NULL;
                        return result;
@@ -1626,7 +1640,7 @@ utun_ctl_connect(kern_ctl_ref kctlref,
                /* Attach the interface */
                result = ifnet_attach(pcb->utun_ifp, NULL);
                if (result != 0) {
-                       printf("utun_ctl_connect - ifnet_attach failed: %d\n", result);
+                       os_log_error(OS_LOG_DEFAULT, "utun_ctl_connect - ifnet_attach failed: %d\n", result);
                        /* Release reference now since attach failed */
                        ifnet_release(pcb->utun_ifp);
                        utun_free_pcb(pcb, true);
@@ -1690,11 +1704,11 @@ utun_remove_address(ifnet_t interface,
                    ifnet_name(interface), ifnet_unit(interface));
                result = ifaddr_address(address, &ifr.ifr_addr, sizeof(ifr.ifr_addr));
                if (result != 0) {
-                       printf("utun_remove_address - ifaddr_address failed: %d", result);
+                       os_log_error(OS_LOG_DEFAULT, "utun_remove_address - ifaddr_address failed: %d", result);
                } else {
                        result = sock_ioctl(pf_socket, SIOCDIFADDR, &ifr);
                        if (result != 0) {
-                               printf("utun_remove_address - SIOCDIFADDR failed: %d", result);
+                               os_log_error(OS_LOG_DEFAULT, "utun_remove_address - SIOCDIFADDR failed: %d", result);
                        }
                }
        } else if (protocol == PF_INET6) {
@@ -1706,12 +1720,12 @@ utun_remove_address(ifnet_t interface,
                result = ifaddr_address(address, (struct sockaddr*)&ifr6.ifr_addr,
                    sizeof(ifr6.ifr_addr));
                if (result != 0) {
-                       printf("utun_remove_address - ifaddr_address failed (v6): %d",
+                       os_log_error(OS_LOG_DEFAULT, "utun_remove_address - ifaddr_address failed (v6): %d",
                            result);
                } else {
                        result = sock_ioctl(pf_socket, SIOCDIFADDR_IN6, &ifr6);
                        if (result != 0) {
-                               printf("utun_remove_address - SIOCDIFADDR_IN6 failed: %d",
+                               os_log_error(OS_LOG_DEFAULT, "utun_remove_address - SIOCDIFADDR_IN6 failed: %d",
                                    result);
                        }
                }
@@ -1728,7 +1742,7 @@ utun_cleanup_family(ifnet_t interface,
        int i;
 
        if (protocol != PF_INET && protocol != PF_INET6) {
-               printf("utun_cleanup_family - invalid protocol family %d\n", protocol);
+               os_log_error(OS_LOG_DEFAULT, "utun_cleanup_family - invalid protocol family %d\n", protocol);
                return;
        }
 
@@ -1736,7 +1750,7 @@ utun_cleanup_family(ifnet_t interface,
        result = sock_socket(protocol, SOCK_DGRAM, 0, NULL, NULL, &pf_socket);
        if (result != 0) {
                if (result != EAFNOSUPPORT) {
-                       printf("utun_cleanup_family - failed to create %s socket: %d\n",
+                       os_log_error(OS_LOG_DEFAULT, "utun_cleanup_family - failed to create %s socket: %d\n",
                            protocol == PF_INET ? "IP" : "IPv6", result);
                }
                goto cleanup;
@@ -1751,7 +1765,7 @@ utun_cleanup_family(ifnet_t interface,
                goto cleanup;
        } else if (result != EBUSY) {
                /* Uh, not really sure what happened here... */
-               printf("utun_cleanup_family - utun_detach_ip failed: %d\n", result);
+               os_log_error(OS_LOG_DEFAULT, "utun_cleanup_family - utun_detach_ip failed: %d\n", result);
                goto cleanup;
        }
 
@@ -1761,7 +1775,7 @@ utun_cleanup_family(ifnet_t interface,
         */
        result = ifnet_get_address_list_family(interface, &addresses, protocol);
        if (result != 0) {
-               printf("fnet_get_address_list_family(%s%d, 0xblah, %s) - failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "ifnet_get_address_list_family(%s%d, 0xblah, %s) - failed: %d\n",
                    ifnet_name(interface), ifnet_unit(interface),
                    protocol == PF_INET ? "PF_INET" : "PF_INET6", result);
                goto cleanup;
@@ -1778,7 +1792,7 @@ utun_cleanup_family(ifnet_t interface,
         */
        result = utun_detach_ip(interface, protocol, pf_socket);
        if (result != 0 && result != ENXIO) {
-               printf("utun_cleanup_family - utun_detach_ip failed: %d\n", result);
+               os_log_error(OS_LOG_DEFAULT, "utun_cleanup_family - utun_detach_ip failed: %d\n", result);
        }
 
 cleanup:
@@ -1806,7 +1820,7 @@ utun_ctl_disconnect(__unused kern_ctl_ref kctlref,
 
 #if UTUN_NEXUS
        // Tell the nexus to stop all rings
-       if (pcb->utun_netif_nexus != NULL) {
+       if (pcb->utun_netif_nexus != NULL && pcb->utun_netif_connected) {
                kern_nexus_stop(pcb->utun_netif_nexus);
        }
 #endif // UTUN_NEXUS
@@ -1894,7 +1908,7 @@ utun_ctl_disconnect(__unused kern_ctl_ref kctlref,
                         * ifnet_release().
                         */
                        if ((result = ifnet_detach(ifp)) != 0) {
-                               printf("utun_ctl_disconnect - ifnet_detach failed: %d\n", result);
+                               os_log_error(OS_LOG_DEFAULT, "utun_ctl_disconnect - ifnet_detach failed: %d\n", result);
                        }
                }
        } else {
@@ -1920,7 +1934,7 @@ utun_ctl_send(__unused kern_ctl_ref kctlref,
        if (m_pktlen(m) >= (int32_t)UTUN_HEADER_SIZE((struct utun_pcb *)unitinfo)) {
                *(protocol_family_t *)mbuf_data(m) = ntohl(*(protocol_family_t *)mbuf_data(m));
        } else {
-               printf("%s - unexpected short mbuf pkt len %d\n", __func__, m_pktlen(m));
+               os_log_error(OS_LOG_DEFAULT, "%s - unexpected short mbuf pkt len %d\n", __func__, m_pktlen(m));
        }
 
        return utun_pkt_input((struct utun_pcb *)unitinfo, m);
@@ -2081,24 +2095,45 @@ utun_ctl_setopt(__unused kern_ctl_ref kctlref,
                        result = EINVAL;
                        break;
                }
-               if (!if_is_netagent_enabled()) {
+               if (!if_is_fsw_transport_netagent_enabled()) {
                        result = ENOTSUP;
                        break;
                }
-               if (uuid_is_null(pcb->utun_nx.ms_agent)) {
+               if (uuid_is_null(pcb->utun_nx.fsw_agent)) {
                        result = ENOENT;
                        break;
                }
 
+               uint32_t flags = netagent_get_flags(pcb->utun_nx.fsw_agent);
+
                if (*(int *)data) {
-                       if_add_netagent(pcb->utun_ifp, pcb->utun_nx.ms_agent);
                        pcb->utun_needs_netagent = true;
+                       flags |= (NETAGENT_FLAG_NEXUS_PROVIDER |
+                           NETAGENT_FLAG_NEXUS_LISTENER);
+                       result = netagent_set_flags(pcb->utun_nx.fsw_agent, flags);
                } else {
+                       flags &= ~(NETAGENT_FLAG_NEXUS_PROVIDER |
+                           NETAGENT_FLAG_NEXUS_LISTENER);
+                       result = netagent_set_flags(pcb->utun_nx.fsw_agent, flags);
                        pcb->utun_needs_netagent = false;
-                       if_delete_netagent(pcb->utun_ifp, pcb->utun_nx.ms_agent);
                }
                break;
        }
+       case UTUN_OPT_ATTACH_FLOWSWITCH: {
+               if (len != sizeof(int)) {
+                       result = EMSGSIZE;
+                       break;
+               }
+               if (pcb->utun_ifp != NULL) {
+                       // Only can set before connecting
+                       result = EINVAL;
+                       break;
+               }
+               lck_rw_lock_exclusive(&pcb->utun_pcb_lock);
+               pcb->utun_attach_fsw = !!(*(int *)data);
+               lck_rw_unlock_exclusive(&pcb->utun_pcb_lock);
+               break;
+       }
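
UTUN_OPT_ENABLE_FLOWSWITCH (rewritten above) no longer adds or removes the netagent on the interface; it flips the nexus provider/listener bits on the flowswitch's existing agent, and the new UTUN_OPT_ATTACH_FLOWSWITCH case lets a client opt out of the flowswitch entirely before connecting. The read-modify-write in sketch form, with `enable` standing in for `*(int *)data`:

        uint32_t flags = netagent_get_flags(pcb->utun_nx.fsw_agent);
        if (enable) {
                flags |= (NETAGENT_FLAG_NEXUS_PROVIDER | NETAGENT_FLAG_NEXUS_LISTENER);
        } else {
                flags &= ~(NETAGENT_FLAG_NEXUS_PROVIDER | NETAGENT_FLAG_NEXUS_LISTENER);
        }
        result = netagent_set_flags(pcb->utun_nx.fsw_agent, flags);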
        case UTUN_OPT_ENABLE_NETIF: {
                if (len != sizeof(int)) {
                        result = EMSGSIZE;
@@ -2186,6 +2221,42 @@ utun_ctl_setopt(__unused kern_ctl_ref kctlref,
                pcb->utun_rx_fsw_ring_size = ring_size;
                break;
        }
+       case UTUN_OPT_KPIPE_TX_RING_SIZE: {
+               if (len != sizeof(u_int32_t)) {
+                       result = EMSGSIZE;
+                       break;
+               }
+               if (pcb->utun_ifp != NULL) {
+                       // Only can set before connecting
+                       result = EINVAL;
+                       break;
+               }
+               u_int32_t ring_size = *(u_int32_t *)data;
+               if (ring_size < UTUN_IF_MIN_RING_SIZE ||
+                   ring_size > UTUN_IF_MAX_RING_SIZE) {
+                       return EINVAL;
+               }
+               pcb->utun_kpipe_tx_ring_size = ring_size;
+               break;
+       }
+       case UTUN_OPT_KPIPE_RX_RING_SIZE: {
+               if (len != sizeof(u_int32_t)) {
+                       result = EMSGSIZE;
+                       break;
+               }
+               if (pcb->utun_ifp != NULL) {
+                       // Only can set before connecting
+                       result = EINVAL;
+                       break;
+               }
+               u_int32_t ring_size = *(u_int32_t *)data;
+               if (ring_size < UTUN_IF_MIN_RING_SIZE ||
+                   ring_size > UTUN_IF_MAX_RING_SIZE) {
+                       return EINVAL;
+               }
+               pcb->utun_kpipe_rx_ring_size = ring_size;
+               break;
+       }
 #endif // UTUN_NEXUS
        default: {
                result = ENOPROTOOPT;
@@ -2262,7 +2333,7 @@ utun_ctl_getopt(__unused kern_ctl_ref kctlref,
                if (*len != sizeof(int)) {
                        result = EMSGSIZE;
                } else {
-                       *(int *)data = if_check_netagent(pcb->utun_ifp, pcb->utun_nx.ms_agent);
+                       *(int *)data = if_check_netagent(pcb->utun_ifp, pcb->utun_nx.fsw_agent);
                }
                break;
        }
@@ -2322,6 +2393,22 @@ utun_ctl_getopt(__unused kern_ctl_ref kctlref,
                }
                break;
        }
+       case UTUN_OPT_KPIPE_TX_RING_SIZE: {
+               if (*len != sizeof(u_int32_t)) {
+                       result = EMSGSIZE;
+               } else {
+                       *(u_int32_t *)data = pcb->utun_kpipe_tx_ring_size;
+               }
+               break;
+       }
+       case UTUN_OPT_KPIPE_RX_RING_SIZE: {
+               if (*len != sizeof(u_int32_t)) {
+                       result = EMSGSIZE;
+               } else {
+                       *(u_int32_t *)data = pcb->utun_kpipe_rx_ring_size;
+               }
+               break;
+       }
 #endif // UTUN_NEXUS
 
        default:
@@ -2346,7 +2433,7 @@ utun_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, int flags)
        u_int32_t utun_packet_cnt;
        errno_t error_pc = ctl_getenqueuepacketcount(kctlref, unit, &utun_packet_cnt);
        if (error_pc != 0) {
-               printf("utun_ctl_rcvd: ctl_getenqueuepacketcount returned error %d\n", error_pc);
+               os_log_error(OS_LOG_DEFAULT, "utun_ctl_rcvd: ctl_getenqueuepacketcount returned error %d\n", error_pc);
                utun_packet_cnt = 0;
        }
 
@@ -2357,7 +2444,7 @@ utun_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, int flags)
        if (reenable_output) {
                errno_t error = ifnet_enable_output(pcb->utun_ifp);
                if (error != 0) {
-                       printf("utun_ctl_rcvd: ifnet_enable_output returned error %d\n", error);
+                       os_log_error(OS_LOG_DEFAULT, "utun_ctl_rcvd: ifnet_enable_output returned error %d\n", error);
                }
        }
        ifnet_lock_done(pcb->utun_ifp);
@@ -2394,7 +2481,7 @@ utun_start(ifnet_t interface)
                u_int32_t utun_packet_cnt;
                errno_t error_pc = ctl_getenqueuepacketcount(pcb->utun_ctlref, pcb->utun_unit, &utun_packet_cnt);
                if (error_pc != 0) {
-                       printf("utun_start: ctl_getenqueuepacketcount returned error %d\n", error_pc);
+                       os_log_error(OS_LOG_DEFAULT, "utun_start: ctl_getenqueuepacketcount returned error %d\n", error_pc);
                        utun_packet_cnt = 0;
                }
 
@@ -2412,7 +2499,7 @@ utun_start(ifnet_t interface)
                if (!can_accept_packets) {
                        errno_t error = ifnet_disable_output(interface);
                        if (error != 0) {
-                               printf("utun_start: ifnet_disable_output returned error %d\n", error);
+                               os_log_error(OS_LOG_DEFAULT, "utun_start: ifnet_disable_output returned error %d\n", error);
                        }
                        ifnet_lock_done(pcb->utun_ifp);
                        break;
@@ -2466,7 +2553,7 @@ utun_output(ifnet_t     interface,
                result = ctl_enqueuembuf(pcb->utun_ctlref, pcb->utun_unit, data, CTL_DATA_EOR);
                if (result != 0) {
                        mbuf_freem(data);
-                       printf("utun_output - ctl_enqueuembuf failed: %d\n", result);
+                       os_log_error(OS_LOG_DEFAULT, "utun_output - ctl_enqueuembuf failed: %d\n", result);
 #if UTUN_NEXUS
                        if (!pcb->utun_use_netif)
 #endif // UTUN_NEXUS
@@ -2549,7 +2636,7 @@ utun_framer(ifnet_t interface,
 
        u_int32_t header_length = UTUN_HEADER_SIZE(pcb);
        if (mbuf_prepend(packet, header_length, MBUF_DONTWAIT) != 0) {
-               printf("utun_framer - ifnet_output prepend failed\n");
+               os_log_error(OS_LOG_DEFAULT, "utun_framer - ifnet_output prepend failed\n");
 
                ifnet_stat_increment_out(interface, 0, 0, 1);
 
@@ -2704,7 +2791,7 @@ utun_attach_proto(ifnet_t interface,
 
        errno_t result = ifnet_attach_protocol(interface, protocol, &proto);
        if (result != 0 && result != EEXIST) {
-               printf("utun_attach_inet - ifnet_attach_protocol %d failed: %d\n",
+               os_log_error(OS_LOG_DEFAULT, "utun_attach_inet - ifnet_attach_protocol %d failed: %d\n",
                    protocol, result);
        }
 
@@ -2719,14 +2806,23 @@ utun_pkt_input(struct utun_pcb *pcb, mbuf_t packet)
                lck_rw_lock_shared(&pcb->utun_pcb_lock);
 
                lck_mtx_lock(&pcb->utun_input_chain_lock);
+
+               if (pcb->utun_input_chain_count > (u_int32_t)if_utun_max_pending_input) {
+                       lck_mtx_unlock(&pcb->utun_input_chain_lock);
+                       lck_rw_unlock_shared(&pcb->utun_pcb_lock);
+                       return ENOSPC;
+               }
+
                if (pcb->utun_input_chain != NULL) {
                        pcb->utun_input_chain_last->m_nextpkt = packet;
                } else {
                        pcb->utun_input_chain = packet;
                }
+               pcb->utun_input_chain_count++;
                while (packet->m_nextpkt) {
                        VERIFY(packet != packet->m_nextpkt);
                        packet = packet->m_nextpkt;
+                       pcb->utun_input_chain_count++;
                }
                pcb->utun_input_chain_last = packet;
                lck_mtx_unlock(&pcb->utun_input_chain_lock);
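
utun_pkt_input() now bounds the pending input chain: once utun_input_chain_count exceeds if_utun_max_pending_input, new packets are refused with ENOSPC instead of being queued without limit, and every packet in an arriving chain is counted as it is appended. The bounded enqueue in sketch form (the matching unlock of the shared pcb lock is elided):

        lck_mtx_lock(&pcb->utun_input_chain_lock);
        if (pcb->utun_input_chain_count > (u_int32_t)if_utun_max_pending_input) {
                lck_mtx_unlock(&pcb->utun_input_chain_lock);
                return ENOSPC;          /* apply backpressure to the producer */
        }
        /* append `packet`, incrementing the count once per packet in the chain */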
@@ -2740,7 +2836,7 @@ utun_pkt_input(struct utun_pcb *pcb, mbuf_t packet)
 
                return 0;
        } else
-#endif // IPSEC_NEXUS
+#endif // UTUN_NEXUS
        {
                mbuf_pkthdr_setrcvif(packet, pcb->utun_ifp);
 
@@ -2765,7 +2861,7 @@ utun_pkt_input(struct utun_pcb *pcb, mbuf_t packet)
                if (result != 0) {
                        ifnet_stat_increment_in(pcb->utun_ifp, 0, 0, 1);
 
-                       printf("%s - ifnet_input failed: %d\n", __FUNCTION__, result);
+                       os_log_error(OS_LOG_DEFAULT, "%s - ifnet_input failed: %d\n", __FUNCTION__, result);
                        mbuf_freem(packet);
                }
 
@@ -2804,7 +2900,7 @@ utun_register_nexus(void)
            &dp_init, sizeof(dp_init),
            &utun_nx_dom_prov);
        if (err != 0) {
-               printf("%s: failed to register domain provider\n", __func__);
+               os_log_error(OS_LOG_DEFAULT, "%s: failed to register domain provider\n", __func__);
                return err;
        }
        return 0;
@@ -2867,6 +2963,9 @@ utun_nexus_connected(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 #pragma unused(nxprov, channel)
        struct utun_pcb *pcb = kern_nexus_get_context(nexus);
        boolean_t ok = ifnet_is_attached(pcb->utun_ifp, 1);
+       if (pcb->utun_netif_nexus == nexus) {
+               pcb->utun_netif_connected = true;
+       }
        return ok ? 0 : ENXIO;
 }
 
@@ -2891,7 +2990,11 @@ utun_nexus_disconnected(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
 #pragma unused(nxprov, channel)
        struct utun_pcb *pcb = kern_nexus_get_context(nexus);
        if (pcb->utun_netif_nexus == nexus) {
-               pcb->utun_netif_nexus = NULL;
+               pcb->utun_netif_connected = false;
+               if (pcb->utun_attach_fsw) {
+                       // disconnected by flowswitch that was attached by us
+                       pcb->utun_netif_nexus = NULL;
+               }
        }
        ifnet_decr_iorefcnt(pcb->utun_ifp);
 }
@@ -3111,7 +3214,7 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        kern_packet_t rx_ph = 0;
                        errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
                        if (__improbable(error != 0)) {
-                               printf("utun_kpipe_sync_rx %s: failed to allocate packet\n",
+                               os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: failed to allocate packet\n",
                                    pcb->utun_ifp->if_xname);
                                break;
                        }
@@ -3136,10 +3239,10 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                            (pcb->utun_flags & UTUN_FLAGS_NO_OUTPUT)) {
                                /* flush data */
                                kern_pbufpool_free(rx_pp, rx_ph);
-                               printf("utun_kpipe_sync_rx %s: invalid length %zu header_size %zu\n",
+                               os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: invalid length %zu header_size %zu\n",
                                    pcb->utun_ifp->if_xname, length, UTUN_HEADER_SIZE(pcb));
-                               STATS_INC(nifs, NETIF_STATS_BADLEN);
-                               STATS_INC(nifs, NETIF_STATS_DROPPED);
+                               STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
+                               STATS_INC(nifs, NETIF_STATS_DROP);
                                continue;
                        }
 
@@ -3163,7 +3266,7 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                                break;
                        }
                        default: {
-                               printf("utun_kpipe_sync_rx %s: unknown ip version %u vhl %u header_size %zu\n",
+                               os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: unknown ip version %u vhl %u header_size %zu\n",
                                    pcb->utun_ifp->if_xname, ip_version, vhl, UTUN_HEADER_SIZE(pcb));
                                break;
                        }
@@ -3190,8 +3293,8 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph);
                        VERIFY(error == 0);
 
-                       STATS_INC(nifs, NETIF_STATS_TXPKTS);
-                       STATS_INC(nifs, NETIF_STATS_TXCOPY_DIRECT);
+                       STATS_INC(nifs, NETIF_STATS_TX_PACKETS);
+                       STATS_INC(nifs, NETIF_STATS_TX_COPY_DIRECT);
 
                        rx_ring_stats.kcrsi_slots_transferred++;
                        rx_ring_stats.kcrsi_bytes_transferred += length;
@@ -3214,7 +3317,7 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                /* just like utun_ctl_rcvd(), always reenable output */
                errno_t error = ifnet_enable_output(pcb->utun_ifp);
                if (error != 0) {
-                       printf("utun_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error);
+                       os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error);
                }
 
                // Unlock first, then exit ring
@@ -3267,7 +3370,7 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus,
                        kern_packet_t rx_ph = 0;
                        errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph);
                        if (__improbable(error != 0)) {
-                               printf("utun_kpipe_sync_rx %s: failed to allocate packet\n",
+                               os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: failed to allocate packet\n",
                                    pcb->utun_ifp->if_xname);
                                break;
                        }
diff --git a/bsd/net/if_utun.h b/bsd/net/if_utun.h
index 0a8f9f967a78044e7e01e0e363da8fd56d47d606..22dc6926557969ab6dcbfb109719383218ebcbfa 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -37,7 +37,6 @@
 void* utun_alloc(size_t size);
 void utun_free(void *ptr);
 errno_t utun_register_control(void);
-boolean_t utun_interface_needs_netagent(ifnet_t interface);
 
 #endif
 
@@ -49,30 +48,33 @@ boolean_t utun_interface_needs_netagent(ifnet_t interface);
 /*
  * Socket option names to manage utun
  */
-#define UTUN_OPT_FLAGS                                                  1
-#define UTUN_OPT_IFNAME                                                 2
-#define UTUN_OPT_EXT_IFDATA_STATS                               3       /* get|set (type int) */
+#define UTUN_OPT_FLAGS                                  1
+#define UTUN_OPT_IFNAME                                 2
+#define UTUN_OPT_EXT_IFDATA_STATS                       3       /* get|set (type int) */
 #define UTUN_OPT_INC_IFDATA_STATS_IN                    4       /* set to increment stat counters (type struct utun_stats_param) */
 #define UTUN_OPT_INC_IFDATA_STATS_OUT                   5       /* set to increment stat counters (type struct utun_stats_param) */
 
 #define UTUN_OPT_SET_DELEGATE_INTERFACE                 15      /* set the delegate interface (char[]) */
 #define UTUN_OPT_MAX_PENDING_PACKETS                    16      /* the number of packets that can be waiting to be read
-                                                                *                                                       from the control socket at a time */
+                                                                * from the control socket at a time */
 #define UTUN_OPT_ENABLE_CHANNEL                         17
 #define UTUN_OPT_GET_CHANNEL_UUID                       18
 #define UTUN_OPT_ENABLE_FLOWSWITCH                      19
 
-#define UTUN_OPT_ENABLE_NETIF                           20              /* Must be set before connecting */
-#define UTUN_OPT_SLOT_SIZE                                      21              /* Must be set before connecting */
-#define UTUN_OPT_NETIF_RING_SIZE                        22              /* Must be set before connecting */
-#define UTUN_OPT_TX_FSW_RING_SIZE                       23              /* Must be set before connecting */
-#define UTUN_OPT_RX_FSW_RING_SIZE                       24              /* Must be set before connecting */
+#define UTUN_OPT_ENABLE_NETIF                           20      /* Must be set before connecting */
+#define UTUN_OPT_SLOT_SIZE                              21      /* Must be set before connecting */
+#define UTUN_OPT_NETIF_RING_SIZE                        22      /* Must be set before connecting */
+#define UTUN_OPT_TX_FSW_RING_SIZE                       23      /* Must be set before connecting */
+#define UTUN_OPT_RX_FSW_RING_SIZE                       24      /* Must be set before connecting */
+#define UTUN_OPT_KPIPE_TX_RING_SIZE                     25      /* Must be set before connecting */
+#define UTUN_OPT_KPIPE_RX_RING_SIZE                     26      /* Must be set before connecting */
+#define UTUN_OPT_ATTACH_FLOWSWITCH                      27      /* Must be set before connecting */
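
All of these are ordinary kernel-control socket options, and anything marked "Must be set before connecting" has to be applied before connect(2) creates the interface; the utun_ctl_bind path above exists so a pre-connect pcb can carry the settings. A hypothetical userspace sketch (UTUN_CONTROL_NAME and the option constants come from this private header; the bind-before-setsockopt ordering is an assumption drawn from those comments, and error handling is elided):

        #include <stdint.h>
        #include <string.h>
        #include <sys/ioctl.h>
        #include <sys/socket.h>
        #include <sys/kern_control.h>
        #include <sys/sys_domain.h>
        #include <net/if_utun.h>

        int
        utun_open_with_kpipe_rings(uint32_t tx_slots, uint32_t rx_slots)
        {
                struct ctl_info info;
                struct sockaddr_ctl addr;
                int fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);

                memset(&info, 0, sizeof(info));
                strlcpy(info.ctl_name, UTUN_CONTROL_NAME, sizeof(info.ctl_name));
                ioctl(fd, CTLIOCGINFO, &info);          /* resolve the control id */

                memset(&addr, 0, sizeof(addr));
                addr.sc_len = sizeof(addr);
                addr.sc_family = AF_SYSTEM;
                addr.ss_sysaddr = AF_SYS_CONTROL;
                addr.sc_id = info.ctl_id;
                addr.sc_unit = 0;                       /* next free utunN */
                bind(fd, (struct sockaddr *)&addr, sizeof(addr));

                /* "Must be set before connecting" options go here */
                setsockopt(fd, SYSPROTO_CONTROL, UTUN_OPT_KPIPE_TX_RING_SIZE,
                    &tx_slots, sizeof(tx_slots));
                setsockopt(fd, SYSPROTO_CONTROL, UTUN_OPT_KPIPE_RX_RING_SIZE,
                    &rx_slots, sizeof(rx_slots));

                connect(fd, (struct sockaddr *)&addr, sizeof(addr));
                return fd;
        }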
 
 /*
  * Flags for UTUN_OPT_FLAGS
  */
 #define UTUN_FLAGS_NO_OUTPUT            0x0001
-#define UTUN_FLAGS_NO_INPUT                     0x0002
+#define UTUN_FLAGS_NO_INPUT             0x0002
 #define UTUN_FLAGS_ENABLE_PROC_UUID     0x0004
 
 /*
diff --git a/bsd/net/if_var.h b/bsd/net/if_var.h
index c980cd7e3edd11f00ee7993ab00eefbe1536f7cb..cd7010f732dc24bd05bcfa93d027920ee7e6b979 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -75,7 +75,7 @@
 #ifdef PRIVATE
 #include <net/route.h>
 #endif
-#ifdef BSD_KERNEL_PRIVATE
+#ifdef BSD_KERN_PRIVATE
 #include <sys/eventhandler.h>
 #endif
 
 #define APPLE_IF_FAM_STF       12
 #define APPLE_IF_FAM_FIREWIRE  13
 #define APPLE_IF_FAM_BOND      14
+#define APPLE_IF_FAM_CELLULAR  15
+#define APPLE_IF_FAM_6LOWPAN   16
+#define APPLE_IF_FAM_UTUN      17
+#define APPLE_IF_FAM_IPSEC     18
 #endif /* __APPLE__ */
 
 /*
@@ -302,6 +306,37 @@ struct if_latencies {
        u_int64_t       max_lt;         /* maximum theoretical latency */
 };
 
+#define IF_NETEM_PARAMS_PSCALE  100000
+struct if_netem_params {
+       /* bandwidth limit */
+       uint64_t        ifnetem_bandwidth_bps;
+
+       /* latency (normal distribution with jitter as stdev) */
+       uint32_t        ifnetem_latency_ms;
+       uint32_t        ifnetem_jitter_ms;
+
+       /*
+        * NetEm probabilistic model parameters have a scaling factor of 100,000
+        * for 5 digits of precision. For instance, probability 12.345% is
+        * expressed as the uint32_t fixed-point value 12345 in the ifnetem_*_p
+        * variables below.
+        */
+       /* random packet corruption */
+       uint32_t        ifnetem_corruption_p;
+
+       /* random packet duplication */
+       uint32_t        ifnetem_duplication_p;
+
+       /* 4 state Markov loss model */
+       uint32_t        ifnetem_loss_p_gr_gl; /* P( gap_loss   | gap_rx     ) */
+       uint32_t        ifnetem_loss_p_gr_bl; /* P( burst_loss | gap_rx     ) */
+       uint32_t        ifnetem_loss_p_bl_br; /* P( burst_rx   | burst_loss ) */
+       uint32_t        ifnetem_loss_p_bl_gr; /* P( gap_rx     | burst_loss ) */
+       uint32_t        ifnetem_loss_p_br_bl; /* P( burst_loss | burst_rx   ) */
+
+       /* random packet reordering */
+       uint32_t        ifnetem_reordering_p;
+};
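
Concretely, a probability p in [0, 1] is encoded as round(p * IF_NETEM_PARAMS_PSCALE), so 12.345% is carried as 12345. A hypothetical initialization illustrating the units:

        /* 10 Mbps cap, 50 ms +/- 5 ms latency, 12.345% random corruption */
        struct if_netem_params params = {
                .ifnetem_bandwidth_bps = 10ULL * 1000 * 1000,
                .ifnetem_latency_ms    = 50,
                .ifnetem_jitter_ms     = 5,
                .ifnetem_corruption_p  = 12345,  /* 0.12345 * IF_NETEM_PARAMS_PSCALE */
        };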
+
 struct if_rxpoll_stats {
        u_int32_t       ifi_poll_off_req;       /* total # of POLL_OFF reqs */
        u_int32_t       ifi_poll_off_err;       /* total # of POLL_OFF errors */
@@ -330,6 +365,23 @@ struct if_rxpoll_stats {
        u_int64_t       ifi_poll_interval_time; /* poll interval (nsec) */
 };
 
+struct if_netif_stats {
+       u_int64_t       ifn_rx_mit_interval;    /* rx mitigation ival (nsec) */
+       u_int32_t       ifn_rx_mit_mode;        /* 0: static, 1: dynamic */
+       u_int32_t       ifn_rx_mit_packets_avg; /* average # of packets */
+       u_int32_t       ifn_rx_mit_packets_min; /* smallest # of packets */
+       u_int32_t       ifn_rx_mit_packets_max; /* largest # of packets */
+       u_int32_t       ifn_rx_mit_bytes_avg;   /* average # of bytes */
+       u_int32_t       ifn_rx_mit_bytes_min;   /* smallest # of bytes */
+       u_int32_t       ifn_rx_mit_bytes_max;   /* largest # of bytes */
+       u_int32_t       ifn_rx_mit_cfg_idx;     /* current config selector */
+       u_int32_t       ifn_rx_mit_cfg_packets_lowat; /* pkts low watermark */
+       u_int32_t       ifn_rx_mit_cfg_packets_hiwat; /* pkts high watermark */
+       u_int32_t       ifn_rx_mit_cfg_bytes_lowat; /* bytes low watermark */
+       u_int32_t       ifn_rx_mit_cfg_bytes_hiwat; /* bytes high watermark */
+       u_int32_t       ifn_rx_mit_cfg_interval; /* delay interval (nsec) */
+};
+
 struct if_tcp_ecn_perf_stat {
        u_int64_t total_txpkts;
        u_int64_t total_rxmitpkts;
@@ -633,6 +685,77 @@ struct chain_len_stats {
        uint64_t        cls_five_or_more;
 } __attribute__((__aligned__(sizeof(uint64_t))));
 
+/*
+ * This structure is used to define the parameters for advisory notifications
+ * on an interface.
+ */
+#pragma pack(push, 1)
+struct ifnet_interface_advisory {
+       /* The current structure version */
+       uint8_t     version;
+#define IF_INTERFACE_ADVISORY_VERSION_1    0x1
+#define IF_INTERFACE_ADVISORY_VERSION_CURRENT  IF_INTERFACE_ADVISORY_VERSION_1
+       /*  Specifies if the advisory is for transmit or receive path */
+       uint8_t     direction;
+#define IF_INTERFACE_ADVISORY_DIRECTION_TX    0x1
+#define IF_INTERFACE_ADVISORY_DIRECTION_RX    0x2
+       /* reserved for future use */
+       uint16_t    _reserved;
+       /*
+        * suggestion for data rate change to keep the latency low.
+        * unit: bits per second (bps)
+        * NOTE: if the interface cannot provide suggestions in terms of bps,
+        * it should use the following values:
+        * INT32_MAX : ramp up
+        * INT32_MIN : ramp down
+        * 0         : neutral
+        */
+#define IF_INTERFACE_ADVISORY_RATE_SUGGESTION_RAMP_UP         INT32_MAX
+#define IF_INTERFACE_ADVISORY_RATE_SUGGESTION_RAMP_DOWN       INT32_MIN
+#define IF_INTERFACE_ADVISORY_RATE_SUGGESTION_RAMP_NEUTRAL    0
+       int32_t     rate_trend_suggestion;
+       /*
+        * Time of the issue of advisory.
+        * Timestamp should be in the host domain.
+        * unit: mach absolute time
+        */
+       uint64_t    timestamp;
+       /*
+        * Maximum theoretical bandwidth of the interface.
+        * unit: bits per second (bps)
+        */
+       uint64_t    max_bandwidth;
+       /*
+        * Total bytes sent or received on the interface.
+        * Wrap-around is possible and the application should account for that.
+        * unit: byte
+        */
+       uint64_t    total_byte_count;
+       /*
+        * average throughput observed at the driver stack.
+        * unit: bits per second (bps)
+        */
+       uint64_t    average_throughput;
+       /*
+        * flushable queue size at the driver.
+        * should be set to UINT32_MAX if not available.
+        * unit: byte
+        */
+       uint32_t    flushable_queue_size;
+       /*
+        * non flushable queue size at the driver.
+        * should be set to UINT32_MAX if not available.
+        * unit: byte
+        */
+       uint32_t    non_flushable_queue_size;
+       /*
+        * average delay observed at the interface.
+        * unit: milliseconds (ms)
+        */
+       uint32_t    average_delay;
+} __attribute__((aligned(sizeof(uint64_t))));
+#pragma pack(pop)
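
A driver publishing a neutral transmit advisory would fill the structure along these lines (a hypothetical sketch: mach_absolute_time() supplies the host-domain timestamp the comments call for, and UINT32_MAX marks the queue sizes as unavailable):

        struct ifnet_interface_advisory adv;

        bzero(&adv, sizeof(adv));
        adv.version   = IF_INTERFACE_ADVISORY_VERSION_CURRENT;
        adv.direction = IF_INTERFACE_ADVISORY_DIRECTION_TX;
        adv.rate_trend_suggestion = IF_INTERFACE_ADVISORY_RATE_SUGGESTION_RAMP_NEUTRAL;
        adv.timestamp = mach_absolute_time();           /* host-domain mach time */
        adv.max_bandwidth = 100ULL * 1000 * 1000;       /* e.g. a 100 Mbps link */
        adv.flushable_queue_size     = UINT32_MAX;      /* not available */
        adv.non_flushable_queue_size = UINT32_MAX;      /* not available */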
+
 #endif /* PRIVATE */
 
 #pragma pack()
@@ -837,9 +960,13 @@ struct ifnet {
        TAILQ_ENTRY(ifnet) if_detaching_link; /* list of detaching ifnets */
        TAILQ_ENTRY(ifnet) if_ordered_link;     /* list of ordered ifnets */
 
-       decl_lck_mtx_data(, if_ref_lock)
+       decl_lck_mtx_data(, if_ref_lock);
        u_int32_t       if_refflags;    /* see IFRF flags below */
        u_int32_t       if_refio;       /* number of io ops to the underlying driver */
+       u_int32_t       if_threads_pending;    /* Threads created but waiting for first run */
+       u_int32_t       if_datamov;     /* number of threads moving data */
+       u_int32_t       if_drainers;    /* number of draining threads */
+       u_int32_t       if_suspend;     /* number of suspend requests */
 
 #define if_list         if_link
        struct ifaddrhead if_addrhead;  /* linked list of addresses per if */
@@ -913,7 +1040,7 @@ struct ifnet {
        struct if_latencies     if_output_lt;
        struct if_latencies     if_input_lt;
 
-       decl_lck_mtx_data(, if_flt_lock)
+       decl_lck_mtx_data(, if_flt_lock);
        u_int32_t               if_flt_busy;
        u_int32_t               if_flt_waiters;
        struct ifnet_filter_head if_flt_head;
@@ -924,12 +1051,64 @@ struct ifnet {
        decl_lck_mtx_data(, if_addrconfig_lock); /* for serializing addr config */
        struct in_multi         *if_allhostsinm; /* store all-hosts inm for this ifp */
 
+       /*
+        * Opportunistic polling parameters.
+        */
        decl_lck_mtx_data(, if_poll_lock);
-       u_int16_t               if_poll_req;
-       u_int16_t               if_poll_update; /* link update */
-       u_int32_t               if_poll_active; /* polling is active */
-       struct timespec         if_poll_cycle;  /* poll interval */
-       struct thread           *if_poll_thread;
+       struct if_poll_params {
+               u_int16_t       poll_req;
+               u_int16_t       poll_update; /* link update */
+               u_int32_t       poll_flags;
+#define IF_POLLF_READY          0x1     /* poll thread is ready */
+#define IF_POLLF_RUNNING        0x2     /* poll thread is running/active */
+               struct timespec poll_cycle;  /* poll interval */
+               struct thread   *poll_thread;
+
+               ifnet_model_t   poll_mode;   /* current mode */
+               struct pktcntr  poll_tstats; /* incremental polling statistics */
+               struct if_rxpoll_stats poll_pstats;  /* polling statistics */
+               struct pktcntr  poll_sstats; /* packets and bytes per sampling */
+               struct timespec poll_mode_holdtime; /* mode holdtime in nsec */
+               struct timespec poll_mode_lasttime; /* last mode change time in nsec */
+               struct timespec poll_sample_holdtime; /* sampling holdtime in nsec */
+               struct timespec poll_sample_lasttime; /* last sampling time in nsec */
+               struct timespec poll_dbg_lasttime; /* last debug message time in nsec */
+       } rxpoll_params;
+#define if_poll_req     rxpoll_params.poll_req
+#define if_poll_update  rxpoll_params.poll_update
+#define if_poll_flags   rxpoll_params.poll_flags
+#define if_poll_cycle   rxpoll_params.poll_cycle
+#define if_poll_thread  rxpoll_params.poll_thread
+#define if_poll_mode    rxpoll_params.poll_mode
+#define if_poll_tstats  rxpoll_params.poll_tstats
+#define if_poll_sstats  rxpoll_params.poll_sstats
+#define if_poll_pstats  rxpoll_params.poll_pstats
+
+#define if_poll_mode_holdtime    rxpoll_params.poll_mode_holdtime
+#define if_poll_mode_lasttime    rxpoll_params.poll_mode_lasttime
+#define if_poll_sample_holdtime  rxpoll_params.poll_sample_holdtime
+#define if_poll_sample_lasttime  rxpoll_params.poll_sample_lasttime
+#define if_poll_dbg_lasttime     rxpoll_params.poll_dbg_lasttime
+
+#define if_rxpoll_offreq   rxpoll_params.poll_pstats.ifi_poll_off_req
+#define if_rxpoll_offerr   rxpoll_params.poll_pstats.ifi_poll_off_err
+#define if_rxpoll_onreq    rxpoll_params.poll_pstats.ifi_poll_on_req
+#define if_rxpoll_onerr    rxpoll_params.poll_pstats.ifi_poll_on_err
+#define if_rxpoll_wavg     rxpoll_params.poll_pstats.ifi_poll_wakeups_avg
+#define if_rxpoll_wlowat   rxpoll_params.poll_pstats.ifi_poll_wakeups_lowat
+#define if_rxpoll_whiwat   rxpoll_params.poll_pstats.ifi_poll_wakeups_hiwat
+#define if_rxpoll_pavg     rxpoll_params.poll_pstats.ifi_poll_packets_avg
+#define if_rxpoll_pmin     rxpoll_params.poll_pstats.ifi_poll_packets_min
+#define if_rxpoll_pmax     rxpoll_params.poll_pstats.ifi_poll_packets_max
+#define if_rxpoll_plowat   rxpoll_params.poll_pstats.ifi_poll_packets_lowat
+#define if_rxpoll_phiwat   rxpoll_params.poll_pstats.ifi_poll_packets_hiwat
+#define if_rxpoll_bavg     rxpoll_params.poll_pstats.ifi_poll_bytes_avg
+#define if_rxpoll_bmin     rxpoll_params.poll_pstats.ifi_poll_bytes_min
+#define if_rxpoll_bmax     rxpoll_params.poll_pstats.ifi_poll_bytes_max
+#define if_rxpoll_blowat   rxpoll_params.poll_pstats.ifi_poll_bytes_lowat
+#define if_rxpoll_bhiwat   rxpoll_params.poll_pstats.ifi_poll_bytes_hiwat
+#define if_rxpoll_plim     rxpoll_params.poll_pstats.ifi_poll_packets_limit
+#define if_rxpoll_ival     rxpoll_params.poll_pstats.ifi_poll_interval_time
 
        struct dlil_threading_info *if_inp;
 
@@ -992,7 +1171,8 @@ struct ifnet {
                u_int32_t       type;           /* delegated i/f type */
                u_int32_t       family;         /* delegated i/f family */
                u_int32_t       subfamily;      /* delegated i/f sub-family */
-               uint32_t        expensive:1;    /* delegated i/f expensive? */
+               uint32_t        expensive:1,    /* delegated i/f expensive? */
+                   constrained:1;              /* delegated i/f constrained? */
        } if_delegated;
 
        uuid_t                  *if_agentids;   /* network agents attached to interface */
@@ -1027,6 +1207,11 @@ struct ifnet {
        struct if_tcp_ecn_stat *if_ipv6_stat;
 
        struct if_lim_perf_stat if_lim_stat;
+
+       uint32_t        if_tcp_kao_max;
+       uint32_t        if_tcp_kao_cnt;
+
+       struct netem    *if_output_netem;
 };
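
A minimal sketch (editorial illustration, not part of this commit; the helper name is hypothetical) of how a caller might test the new poll-state flags, which the shorthand #defines above expose under the legacy if_poll_* names:

        static boolean_t
        my_rxpoll_is_running(struct ifnet *ifp)
        {
                boolean_t running;

                lck_mtx_lock(&ifp->if_poll_lock);
                /* if_poll_flags expands to rxpoll_params.poll_flags */
                running = (ifp->if_poll_flags & IF_POLLF_RUNNING) != 0;
                lck_mtx_unlock(&ifp->if_poll_lock);
                return running;
        }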
 
 /* Interface event handling declarations */
@@ -1062,11 +1247,16 @@ EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn);
 #define IFRF_EMBRYONIC  0x1     /* ifnet is allocated; awaiting attach */
 #define IFRF_ATTACHED   0x2     /* ifnet attach is completely done */
 #define IFRF_DETACHING  0x4     /* detach has been requested */
+#define IFRF_READY      0x8     /* data path is ready */
+
 #define IFRF_ATTACH_MASK        \
        (IFRF_EMBRYONIC|IFRF_ATTACHED|IFRF_DETACHING)
 
 #define IF_FULLY_ATTACHED(_ifp) \
        (((_ifp)->if_refflags & IFRF_ATTACH_MASK) == IFRF_ATTACHED)
+
+#define IF_FULLY_ATTACHED_AND_READY(_ifp) \
+       (IF_FULLY_ATTACHED(_ifp) && ((_ifp)->if_refflags & IFRF_READY))
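
A hedged sketch (not from this commit; the helper name is hypothetical) of how the new readiness gate composes with attach state before touching the data path:

        static errno_t
        my_tx_if_ready(struct ifnet *ifp, struct mbuf *m)
        {
                /* IFRF_READY is set once the data path is usable, after attach */
                if (!IF_FULLY_ATTACHED_AND_READY(ifp)) {
                        m_freem(m);
                        return ENXIO;
                }
                return ifnet_enqueue(ifp, m);
        }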
 /*
  * Valid values for if_start_flags
  */
@@ -1203,6 +1393,8 @@ struct ifaddr {
        (struct ifaddr *, int);
        void (*ifa_attached)(struct ifaddr *); /* callback fn for attaching */
        void (*ifa_detached)(struct ifaddr *); /* callback fn for detaching */
+       void *ifa_del_wc;               /* Wait channel to avoid address deletion races */
+       int ifa_del_waiters;            /* Threads in wait to delete the address */
 #if __arm__ && (__BIGGEST_ALIGNMENT__ > 4)
 /* For the newer ARMv7k ABI where 64-bit types are 64-bit aligned, but pointers
  * are 32-bit:
@@ -1360,8 +1552,10 @@ struct ifmultiaddr {
  * IFNET_FAMILY_ETHERNET (as well as type to IFT_ETHER) which is too generic.
  */
 #define IFNET_IS_WIFI(_ifp)                                             \
-       ((_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI ||                \
-       (_ifp)->if_delegated.subfamily == IFNET_SUBFAMILY_WIFI)
+       (((_ifp)->if_family == IFNET_FAMILY_ETHERNET  &&                \
+       (_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI) ||                \
+       ((_ifp)->if_delegated.family == IFNET_FAMILY_ETHERNET &&        \
+       (_ifp)->if_delegated.subfamily == IFNET_SUBFAMILY_WIFI))
 
 /*
  * Indicate whether or not the immediate interface, or the interface delegated
@@ -1388,6 +1582,17 @@ struct ifmultiaddr {
        (_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI &&         \
        !((_ifp)->if_eflags & IFEF_AWDL))
 
+/*
+ * Indicate whether or not the immediate interface is a companion link
+ * interface.
+ */
+#define IFNET_IS_COMPANION_LINK(_ifp)                           \
+       ((_ifp)->if_family == IFNET_FAMILY_IPSEC &&             \
+       ((_ifp)->if_subfamily == IFNET_SUBFAMILY_BLUETOOTH ||   \
+       (_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI ||         \
+       (_ifp)->if_subfamily == IFNET_SUBFAMILY_QUICKRELAY ||   \
+       (_ifp)->if_subfamily == IFNET_SUBFAMILY_DEFAULT))
+
 /*
  * Indicate whether or not the immediate interface, or the interface delegated
  * by it, is marked as expensive.  The delegated interface is set/cleared
@@ -1397,7 +1602,7 @@ struct ifmultiaddr {
  * Note that this is meant to be used only for policy purposes.
  */
 #define IFNET_IS_EXPENSIVE(_ifp)                                        \
-       ((_ifp)->if_eflags & IFEF_EXPENSIVE ||                          \
+       ((_ifp)->if_eflags & IFEF_EXPENSIVE ||                              \
        (_ifp)->if_delegated.expensive)
 
 #define IFNET_IS_LOW_POWER(_ifp)                                        \
@@ -1406,6 +1611,10 @@ struct ifmultiaddr {
        ((_ifp)->if_delegated.ifp != NULL &&                            \
        ((_ifp)->if_delegated.ifp->if_xflags & IFXF_LOW_POWER)))
 
+#define IFNET_IS_CONSTRAINED(_ifp)                                      \
+       ((_ifp)->if_xflags & IFXF_CONSTRAINED ||                            \
+       (_ifp)->if_delegated.constrained)
+
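
A sketch (editorial illustration, helper name hypothetical) of a policy-layer check that treats constrained interfaces like expensive ones, which is how these macros are typically consumed:

        static boolean_t
        my_defer_background_traffic(struct ifnet *ifp)
        {
                /* either flag may come from the interface itself or its delegate */
                return IFNET_IS_EXPENSIVE(ifp) || IFNET_IS_CONSTRAINED(ifp);
        }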
 /*
  * We don't support AWDL interface delegation.
  */
@@ -1499,8 +1708,15 @@ __private_extern__ void ifnet_head_assert_exclusive(void);
 __private_extern__ errno_t ifnet_set_idle_flags_locked(ifnet_t, u_int32_t,
     u_int32_t);
 __private_extern__ int ifnet_is_attached(struct ifnet *, int refio);
+__private_extern__ void ifnet_incr_pending_thread_count(struct ifnet *);
+__private_extern__ void ifnet_decr_pending_thread_count(struct ifnet *);
 __private_extern__ void ifnet_incr_iorefcnt(struct ifnet *);
 __private_extern__ void ifnet_decr_iorefcnt(struct ifnet *);
+__private_extern__ boolean_t ifnet_datamov_begin(struct ifnet *);
+__private_extern__ void ifnet_datamov_end(struct ifnet *);
+__private_extern__ void ifnet_datamov_suspend(struct ifnet *);
+__private_extern__ void ifnet_datamov_drain(struct ifnet *);
+__private_extern__ void ifnet_datamov_resume(struct ifnet *);
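
The datamov KPIs appear to pair like a reader lock around the data path: begin/end bracket each data movement, while suspend/drain/resume let a control path quiesce it. A minimal sketch under that assumption (helper name hypothetical):

        static void
        my_input_packet(struct ifnet *ifp, struct mbuf *m)
        {
                if (!ifnet_datamov_begin(ifp)) {
                        /* data path is suspended or going away; drop */
                        m_freem(m);
                        return;
                }
                /* ... deliver m through the interface ... */
                ifnet_datamov_end(ifp);
        }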
 __private_extern__ void ifnet_set_start_cycle(struct ifnet *,
     struct timespec *);
 __private_extern__ void ifnet_set_poll_cycle(struct ifnet *,
@@ -1743,6 +1959,8 @@ __private_extern__ void if_copy_packet_stats(struct ifnet *ifp,
     struct if_packet_stats *if_ps);
 __private_extern__ void if_copy_rxpoll_stats(struct ifnet *ifp,
     struct if_rxpoll_stats *if_rs);
+__private_extern__ void if_copy_netif_stats(struct ifnet *ifp,
+    struct if_netif_stats *if_ns);
 
 __private_extern__ struct rtentry *ifnet_cached_rtlookup_inet(struct ifnet *,
     struct in_addr);
@@ -1801,6 +2019,7 @@ __private_extern__ u_int32_t ifnet_get_generation(struct ifnet *);
 
 /* Adding and deleting netagents will take ifnet lock */
 __private_extern__ int if_add_netagent(struct ifnet *, uuid_t);
+__private_extern__ int if_add_netagent_locked(struct ifnet *, uuid_t);
 __private_extern__ int if_delete_netagent(struct ifnet *, uuid_t);
 __private_extern__ boolean_t if_check_netagent(struct ifnet *, uuid_t);
 
@@ -1811,6 +2030,7 @@ __private_extern__ void intf_event_enqueue_nwk_wq_entry(struct ifnet *ifp,
     struct sockaddr *addrp, uint32_t intf_event_code);
 __private_extern__ void ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *,
     struct ifnet *);
+__private_extern__ int if_get_tcp_kao_max(struct ifnet *);
 #if !CONFIG_EMBEDDED
 __private_extern__ errno_t ifnet_framer_stub(struct ifnet *, struct mbuf **,
     const struct sockaddr *, const char *, const char *, u_int32_t *,
@@ -1820,6 +2040,8 @@ __private_extern__ void ifnet_enqueue_multi_setup(struct ifnet *, uint16_t,
     uint16_t);
 __private_extern__ errno_t ifnet_enqueue_mbuf(struct ifnet *, struct mbuf *,
     boolean_t, boolean_t *);
+__private_extern__ int ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts,
+    uint32_t n_pkts);
 
 extern int if_low_power_verbose;
 extern int if_low_power_restricted;
index b4a922301f3dc03ff90a82c0c2d137d42b87fc54..8509cca99a223da29239362326e08bbe69f23302 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -1053,7 +1053,7 @@ vlan_output(struct ifnet * ifp, struct mbuf * m)
        u_short                     tag;
        vlan_parent_ref             vlp = NULL;
        int                         err;
-       struct flowadv              adv = { FADV_SUCCESS };
+       struct flowadv              adv = { .code = FADV_SUCCESS };
 
        if (m == 0) {
                return 0;
@@ -1129,6 +1129,13 @@ vlan_output(struct ifnet * ifp, struct mbuf * m)
                evl->evl_proto = evl->evl_encap_proto;
                evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
                evl->evl_tag = htons(tag);
+
+               /* adjust partial checksum offload offsets */
+               if ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID |
+                   CSUM_PARTIAL)) == (CSUM_DATA_VALID | CSUM_PARTIAL)) {
+                       m->m_pkthdr.csum_tx_start += ETHER_VLAN_ENCAP_LEN;
+                       m->m_pkthdr.csum_tx_stuff += ETHER_VLAN_ENCAP_LEN;
+               }
        }
 
        err = dlil_output(p, PF_VLAN, m, NULL, NULL, 1, &adv);
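
For concreteness (an editorial illustration, not part of the commit): inserting the 4-byte 802.1Q tag (ETHER_VLAN_ENCAP_LEN) shifts everything after the Ethernet source/destination addresses, so a csum_tx_start of 14 (the IP header right after a 14-byte Ethernet header) becomes 18, and csum_tx_stuff moves by the same 4 bytes. Without this adjustment a driver finalizing a CSUM_PARTIAL checksum would read and write at stale offsets.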
@@ -1176,6 +1183,7 @@ vlan_input(ifnet_t p, __unused protocol_family_t protocol,
                soft_vlan = 1;
                switch (ifnet_type(p)) {
                case IFT_ETHER:
+               case IFT_IEEE8023ADLAG:
                        if (m->m_len < ETHER_VLAN_ENCAP_LEN) {
                                m_freem(m);
                                return 0;
index 22281c271f67363a4e0a8b0d65b9109485ed2152..4e849c3e55fab7dd51e7a7382b3efa7a224c9d20 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -206,7 +206,6 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0,
                    (einit.flags & IFNET_INIT_INPUT_POLL)) {
                        return EINVAL;
                }
-
                einit.pre_enqueue = NULL;
                einit.start = NULL;
                einit.output_ctl = NULL;
@@ -233,7 +232,6 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0,
                }
        }
 
-
        /* Initialize external name (name + unit) */
        (void) snprintf(if_xname, sizeof(if_xname), "%s%d",
            einit.name, einit.unit);
@@ -417,6 +415,8 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0,
                }
 
                ifp->if_xflags = 0;
+               /* legacy interface */
+               ifp->if_xflags |= IFXF_LEGACY;
 
                /*
                 * output target queue delay is specified in millisecond
@@ -635,6 +635,14 @@ ifnet_set_eflags(ifnet_t interface, u_int32_t new_flags, u_int32_t mask)
                ifnet_lock_done(interface);
                return EINVAL;
        }
+       /*
+        * Currently, interface advisory reporting is supported only for
+        * Skywalk native interfaces.
+        */
+       if ((((new_flags & mask) & IFEF_ADV_REPORT) != 0) &&
+           ((interface->if_eflags & IFEF_SKYWALK_NATIVE) == 0)) {
+               return EINVAL;
+       }
        oeflags = interface->if_eflags;
        interface->if_eflags =
            (new_flags & mask) | (interface->if_eflags & ~mask);
@@ -2229,7 +2237,24 @@ ifnet_add_multicast(ifnet_t interface, const struct sockaddr *maddr,
        }
 
        /* Don't let users screw up protocols' entries. */
-       if (maddr->sa_family != AF_UNSPEC && maddr->sa_family != AF_LINK) {
+       switch (maddr->sa_family) {
+       case AF_LINK: {
+               const struct sockaddr_dl *sdl =
+                   (const struct sockaddr_dl *)(uintptr_t)maddr;
+               if (sdl->sdl_len < sizeof(struct sockaddr_dl) ||
+                   (sdl->sdl_nlen + sdl->sdl_alen + sdl->sdl_slen +
+                   offsetof(struct sockaddr_dl, sdl_data) > sdl->sdl_len)) {
+                       return EINVAL;
+               }
+               break;
+       }
+       case AF_UNSPEC:
+               if (maddr->sa_len < ETHER_ADDR_LEN +
+                   offsetof(struct sockaddr, sa_data)) {
+                       return EINVAL;
+               }
+               break;
+       default:
                return EINVAL;
        }
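A sketch (editorial illustration; the group address and the commented call are hypothetical) of an AF_LINK address that satisfies the new length validation above:

        struct sockaddr_dl sdl;
        ifmultiaddr_t maddr;
        const u_char group[ETHER_ADDR_LEN] =
            { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };

        bzero(&sdl, sizeof(sdl));
        sdl.sdl_len = sizeof(sdl);        /* >= sizeof(struct sockaddr_dl) */
        sdl.sdl_family = AF_LINK;
        sdl.sdl_alen = ETHER_ADDR_LEN;    /* nlen + alen + slen fits in sdl_len */
        bcopy(group, LLADDR(&sdl), ETHER_ADDR_LEN);
        /* errno_t err = ifnet_add_multicast(ifp, (struct sockaddr *)&sdl, &maddr); */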
 
@@ -2870,8 +2895,34 @@ ifnet_notice_node_presence(ifnet_t ifp, struct sockaddr *sa, int32_t rssi,
                return EINVAL;
        }
 
-       dlil_node_present(ifp, sa, rssi, lqm, npm, srvinfo);
-       return 0;
+       return dlil_node_present(ifp, sa, rssi, lqm, npm, srvinfo);
+}
+
+errno_t
+ifnet_notice_node_presence_v2(ifnet_t ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
+    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
+{
+       /* Support older version if sdl is NULL */
+       if (sdl == NULL) {
+               return ifnet_notice_node_presence(ifp, sa, rssi, lqm, npm, srvinfo);
+       }
+
+       if (ifp == NULL || sa == NULL || srvinfo == NULL) {
+               return EINVAL;
+       }
+       if (sa->sa_len > sizeof(struct sockaddr_storage)) {
+               return EINVAL;
+       }
+
+       if (sa->sa_family != AF_INET6) {
+               return EINVAL;
+       }
+
+       if (sdl->sdl_family != AF_LINK) {
+               return EINVAL;
+       }
+
+       return dlil_node_present_v2(ifp, sa, sdl, rssi, lqm, npm, srvinfo);
 }
 
 errno_t
@@ -2970,6 +3021,8 @@ ifnet_set_delegate(ifnet_t ifp, ifnet_t delegated_ifp)
                ifp->if_delegated.subfamily = delegated_ifp->if_subfamily;
                ifp->if_delegated.expensive =
                    delegated_ifp->if_eflags & IFEF_EXPENSIVE ? 1 : 0;
+               ifp->if_delegated.constrained =
+                   delegated_ifp->if_xflags & IFXF_CONSTRAINED ? 1 : 0;
 
                /*
                 * Propagate flags related to ECN from delegated interface
@@ -3061,7 +3114,7 @@ ifnet_get_keepalive_offload_frames(ifnet_t ifp,
                bzero(frame, sizeof(struct ifnet_keepalive_offload_frame));
        }
 
-       /* First collect IPSec related keep-alive frames */
+       /* First collect IPsec related keep-alive frames */
        *used_frames_count = key_fill_offload_frames_for_savs(ifp,
            frames_array, frames_array_count, frame_data_offset);
 
@@ -3084,6 +3137,32 @@ ifnet_get_keepalive_offload_frames(ifnet_t ifp,
        return 0;
 }
 
+errno_t
+ifnet_notify_tcp_keepalive_offload_timeout(ifnet_t ifp,
+    struct ifnet_keepalive_offload_frame *frame)
+{
+       errno_t error = 0;
+
+       if (ifp == NULL || frame == NULL) {
+               return EINVAL;
+       }
+
+       if (frame->type != IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP) {
+               return EINVAL;
+       }
+       if (frame->ether_type != IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 &&
+           frame->ether_type != IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6) {
+               return EINVAL;
+       }
+       if (frame->local_port == 0 || frame->remote_port == 0) {
+               return EINVAL;
+       }
+
+       error = tcp_notify_kao_timeout(ifp, frame);
+
+       return error;
+}
+
 errno_t
 ifnet_link_status_report(ifnet_t ifp, const void *buffer,
     size_t buffer_len)
@@ -3161,7 +3240,7 @@ ifnet_link_status_report(ifnet_t ifp, const void *buffer,
                ifp->if_link_status->ifsr_len = ifsr->ifsr_len;
                if_cell_sr->valid_bitmask = 0;
                bcopy(new_cell_sr, if_cell_sr, sizeof(*if_cell_sr));
-       } else if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI) {
+       } else if (IFNET_IS_WIFI(ifp)) {
                struct if_wifi_status_v1 *if_wifi_sr, *new_wifi_sr;
 
                /* Check version */
@@ -3252,7 +3331,7 @@ ifnet_get_fastlane_capable(ifnet_t interface, boolean_t *capable)
        if (interface == NULL || capable == NULL) {
                return EINVAL;
        }
-       if (interface->if_eflags & IFEF_QOSMARKING_CAPABLE) {
+       if (interface->if_qosmarking_mode == IFRTYPE_QOSMARKING_FASTLANE) {
                *capable = true;
        } else {
                *capable = false;
@@ -3356,3 +3435,17 @@ ifnet_get_low_power_mode(ifnet_t ifp, boolean_t *on)
 
        return 0;
 }
+
+/*************************************************************************/
+/* Interface advisory notifications                                      */
+/*************************************************************************/
+errno_t
+ifnet_interface_advisory_report(ifnet_t ifp,
+    const struct ifnet_interface_advisory *advisory)
+{
+#pragma unused(ifp)
+#pragma unused(advisory)
+       return ENOTSUP;
+}
index 3d71fdeecbf32624362b7aed2641130115ac0309..f131c57462b8b8ec66c7031b7f68df9cedd5d063 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -43,6 +43,7 @@
 
 #ifdef KERNEL_PRIVATE
 struct if_interface_state;
+struct ifnet_interface_advisory;
 #include <sys/kpi_mbuf.h>
 #endif /* KERNEL_PRIVATE */
 
@@ -55,7 +56,7 @@ struct if_interface_state;
 #define KPI_INTERFACE_EMBEDDED 0
 #endif
 #else
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 #define KPI_INTERFACE_EMBEDDED 1
 #else
 #define KPI_INTERFACE_EMBEDDED 0
@@ -88,6 +89,9 @@ struct ifnet_demux_desc;
  *       @constant IFNET_FAMILY_FIREWIRE An IEEE 1394 [Firewire] interface.
  *       @constant IFNET_FAMILY_BOND A virtual bonded interface.
  *       @constant IFNET_FAMILY_CELLULAR A cellular interface.
+ *       @constant IFNET_FAMILY_6LOWPAN A 6LoWPAN interface.
+ *       @constant IFNET_FAMILY_UTUN A utun interface.
+ *       @constant IFNET_FAMILY_IPSEC An IPsec interface.
  */
 enum {
        IFNET_FAMILY_ANY                = 0,
@@ -105,7 +109,10 @@ enum {
        IFNET_FAMILY_STF                = 12,
        IFNET_FAMILY_FIREWIRE           = 13,
        IFNET_FAMILY_BOND               = 14,
-       IFNET_FAMILY_CELLULAR           = 15
+       IFNET_FAMILY_CELLULAR           = 15,
+       IFNET_FAMILY_6LOWPAN            = 16,
+       IFNET_FAMILY_UTUN               = 17,
+       IFNET_FAMILY_IPSEC              = 18
 };
 
 /*!
@@ -131,8 +138,8 @@ enum {
        IFNET_SUBFAMILY_THUNDERBOLT     = 4,
        IFNET_SUBFAMILY_RESERVED        = 5,
        IFNET_SUBFAMILY_INTCOPROC       = 6,
-       IFNET_SUBFAMILY_UTUN            = 7,
-       IFNET_SUBFAMILY_IPSEC           = 8,
+       IFNET_SUBFAMILY_QUICKRELAY      = 7,
+       IFNET_SUBFAMILY_DEFAULT         = 8,
 };
 
 /*
@@ -3326,12 +3333,36 @@ ifnet_notice_node_presence(ifnet_t ifp, struct sockaddr *sa, int32_t rssi,
  *               system that the absence of the specified node has been detected.
  *       @param ifp The interface attached to the link where the absence of the
  *               specified node has been detected.
- *       @param sa The AF_LINK family address of the node whose absence has been
- *               detected.
+ *       @param sa The AF_INET6 or AF_LINK family address of the node whose absence has been
+ *               detected. If AF_LINK is specified, the AF_INET6 address is derived from the
+ *               AF_LINK address.
  *       @result Returns 0 on success, or EINVAL if arguments are invalid.
  */
 extern errno_t ifnet_notice_node_absence(ifnet_t ifp, struct sockaddr *sa);
 
+/*
+ *       @function ifnet_notice_node_presence_v2
+ *       @discussion Provided for network interface drivers to notify the
+ *               system of a change detected in the presence of the specified
+ *               node.
+ *       @param ifp The interface attached to the link where the specified node
+ *               is present.
+ *       @param sa The AF_INET6 family address of the node whose presence is
+ *               changing.
+ *       @param sdl The AF_LINK family address of the node whose presence is
+ *               changing.
+ *       @param rssi The received signal strength indication as measured in
+ *               dBm by a radio receiver.
+ *       @param lqm A link quality metric associated with the specified node.
+ *       @param npm A node proximity metric associated with the specified node.
+ *       @param srvinfo A fixed-size array of octets containing opaque service
+ *               information data used by the mDNS responder subsystem.
+ *       @result Returns 0 on success, or EINVAL if arguments are invalid.
+ */
+extern errno_t
+ifnet_notice_node_presence_v2(ifnet_t ifp, struct sockaddr *sa, struct sockaddr_dl *sdl, int32_t rssi,
+    int lqm, int npm, u_int8_t srvinfo[48]);
+
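+
A hedged driver-side sketch (all values illustrative; ifp, rssi, lqm, npm and srvinfo are assumed to be in scope). Note that the implementation requires sa to be AF_INET6 and sdl to be AF_LINK when sdl is non-NULL:

        struct sockaddr_in6 sin6 = { .sin6_len = sizeof(sin6), .sin6_family = AF_INET6 };
        struct sockaddr_dl sdl = { .sdl_len = sizeof(sdl), .sdl_family = AF_LINK };

        /* ... fill sin6.sin6_addr and the link-layer bytes of sdl ... */
        (void) ifnet_notice_node_presence_v2(ifp, (struct sockaddr *)&sin6, &sdl,
            rssi, lqm, npm, srvinfo);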
 /*
  *       @function ifnet_notice_master_elected
  *       @discussion Provided for network interface drivers to notify the system
@@ -3392,7 +3423,7 @@ ifnet_get_delegate(ifnet_t ifp, ifnet_t *pdelegated_ifp);
  *       @struct ifnet_keepalive_offload_frame
  *       @discussion This structure is used to define various opportunistic
  *               polling parameters for an interface.
- *               For IPSec and AirPlay UDP keep alive only a subset of the
+ *               For IPsec and AirPlay UDP keep alive only a subset of the
  *               fields are relevant.
  *               An incoming TCP keep alive probe has the sequence number
  *               in the TCP header equal to "remote_seq" and the
@@ -3415,6 +3446,7 @@ ifnet_get_delegate(ifnet_t ifp, ifnet_t *pdelegated_ifp);
  *       @field keep_retry Interval before retrying if previous probe was not answered (TCP only)
  *       @field reply_length The length of the frame in the reply_data field (TCP only)
  *       @field addr_length Length in bytes of local_addr and remote_addr (TCP only)
+ *       @field flags Flags (TCP only)
  *       @field reply_data Keep alive reply to be sent to incoming probe (TCP only)
  *       @field local_addr Local address: 4 bytes IPv4 or 16 bytes IPv6 address (TCP only)
  *       @field remote_addr Remote address: 4 bytes IPv4 or 16 bytes IPv6 address (TCP only)
@@ -3442,7 +3474,9 @@ struct ifnet_keepalive_offload_frame {
        u_int16_t keep_retry; /* interval before retrying if previous probe was not answered */
        u_int8_t reply_length; /* Length of valid reply_data bytes including offset */
        u_int8_t addr_length; /* Length of valid bytes in local_addr and remote_addr */
-       u_int8_t reserved[2];
+#define  IFNET_KEEPALIVE_OFFLOAD_FLAG_NOWAKEFROMSLEEP   0x01
+       u_int8_t flags;
+       u_int8_t reserved[1];
        u_int8_t reply_data[IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE]; /* Response packet */
        u_int8_t local_addr[IFNET_KEEPALIVE_OFFLOAD_MAX_ADDR_SIZE]; /* in network byte order  */
        u_int8_t remote_addr[IFNET_KEEPALIVE_OFFLOAD_MAX_ADDR_SIZE]; /* in network byte order  */
@@ -3457,13 +3491,13 @@ struct ifnet_keepalive_offload_frame {
  *       @discussion Fills out frames_array with IP packets to send at
  *               periodic intervals as Keep-alive or heartbeat messages.
  *               This can be used to offload keep alives for UDP or TCP.
- *               Note: The frames are returned in this order: first the IPSec
+ *               Note: The frames are returned in this order: first the IPsec
  *               frames, then the AirPlay frames and finally the TCP frames.
 *               If a device does not support one kind of keep-alive frame,
 *               it should still provide a frames_array large enough to
 *               accommodate the other kinds of frames
  *       @param ifp The interface to send frames out on. This is used to
- *               select which sockets or IPSec SAs should generate the
+ *               select which sockets or IPsec SAs should generate the
  *               packets.
  *       @param frames_array An array of ifnet_keepalive_offload_frame
  *               structs. This is allocated by the caller, and has
@@ -3481,6 +3515,28 @@ extern errno_t ifnet_get_keepalive_offload_frames(ifnet_t ifp,
     u_int32_t frames_array_count, size_t frame_data_offset,
     u_int32_t *used_frames_count);
 
+
+/*
+ *       @function ifnet_notify_tcp_keepalive_offload_timeout
+ *       @discussion Used by an interface to notify the system that a TCP
+ *               connection whose keep alive was offloaded experienced a timeout.
+ *       @param ifp The interface for which the TCP keep alive offload timed out
+ *       @param frame The ifnet_keepalive_offload_frame structure that identifies
+ *               the TCP connection that experienced the timeout.
+ *               All the fields must be zeroed by the caller except for:
+ *               - type: must be IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP
+ *               and for the fields identifying the 5-tuple of the
+ *               TCP connection:
+ *               - ether_type
+ *               - local_addr
+ *               - remote_addr
+ *               - local_port
+ *               - remote_port
+ *       @result Returns 0 on success, error number otherwise.
+ */
+extern errno_t ifnet_notify_tcp_keepalive_offload_timeout(ifnet_t ifp,
+    struct ifnet_keepalive_offload_frame *frame);
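
A sketch of a driver reporting an unanswered offloaded probe (editorial illustration; the saved_* values are hypothetical state the driver recorded when the frame was handed out). Per the discussion above, only the identifying fields are populated:

        struct ifnet_keepalive_offload_frame frame;

        bzero(&frame, sizeof(frame));
        frame.type = IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP;
        frame.ether_type = IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4;
        bcopy(saved_laddr, frame.local_addr, 4);   /* IPv4 addresses */
        bcopy(saved_raddr, frame.remote_addr, 4);
        frame.local_port = saved_lport;
        frame.remote_port = saved_rport;
        (void) ifnet_notify_tcp_keepalive_offload_timeout(ifp, &frame);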
+
 /*************************************************************************/
 /* Link level notifications                                              */
 /*************************************************************************/
@@ -3594,6 +3650,21 @@ extern errno_t ifnet_touch_lastupdown(ifnet_t interface);
  */
 extern errno_t ifnet_updown_delta(ifnet_t interface, struct timeval *updown_delta);
 
+/*************************************************************************/
+/* Interface advisory notifications                                      */
+/*************************************************************************/
+/*!
+ *       @function ifnet_interface_advisory_report
+ *       @discussion KPI that lets the driver provide interface advisory
+ *       notifications to user space.
+ *       @param ifp The interface that is generating the advisory report.
+ *       @param advisory The structure containing the advisory notification
+ *              information.
+ *       @result Returns 0 on success, error number otherwise.
+ */
+extern errno_t ifnet_interface_advisory_report(ifnet_t ifp,
+    const struct ifnet_interface_advisory *advisory);
+
 #endif /* KERNEL_PRIVATE */
 
 __END_DECLS
diff --git a/bsd/net/linkaddr.c b/bsd/net/linkaddr.c
new file mode 100644 (file)
index 0000000..d8cc855
--- /dev/null
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright (c) 2007, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file is part of the Contiki operating system.
+ *
+ */
+
+/**
+ * \file
+ *         Functions for manipulating Rime addresses
+ * \author
+ *         Adam Dunkels <adam@sics.se>
+ */
+
+/**
+ * \addtogroup linkaddr
+ * @{
+ */
+
+#include "linkaddr.h"
+#include <string.h>
+
+linkaddr_t linkaddr_node_addr;
+#if LINKADDR_SIZE == 2
+const linkaddr_t linkaddr_null = { { 0, 0 } };
+#else /*LINKADDR_SIZE == 2*/
+#if LINKADDR_SIZE == 8
+const linkaddr_t linkaddr_null = { { 0, 0, 0, 0, 0, 0, 0, 0 } };
+#endif /*LINKADDR_SIZE == 8*/
+#endif /*LINKADDR_SIZE == 2*/
+
+
+/*---------------------------------------------------------------------------*/
+void
+linkaddr_copy(linkaddr_t *dest, const linkaddr_t *src)
+{
+       memcpy(dest, src, LINKADDR_SIZE);
+}
+/*---------------------------------------------------------------------------*/
+int
+linkaddr_cmp(const linkaddr_t *addr1, const linkaddr_t *addr2)
+{
+       return memcmp(addr1, addr2, LINKADDR_SIZE) == 0;
+}
+/*---------------------------------------------------------------------------*/
+void
+linkaddr_set_node_addr(linkaddr_t *t)
+{
+       linkaddr_copy(&linkaddr_node_addr, t);
+}
+/*---------------------------------------------------------------------------*/
+/** @} */
diff --git a/bsd/net/linkaddr.h b/bsd/net/linkaddr.h
new file mode 100644 (file)
index 0000000..aed3471
--- /dev/null
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright (c) 2007, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file is part of the Contiki operating system.
+ *
+ */
+
+/**
+ * \file
+ *         Header file for the Rime address representation
+ * \author
+ *         Adam Dunkels <adam@sics.se>
+ */
+
+/**
+ * \addtogroup rime
+ * @{
+ */
+
+/**
+ * \defgroup linkaddr Rime addresses
+ * @{
+ *
+ * The linkaddr module is an abstract representation of addresses in
+ * Rime.
+ *
+ */
+
+#ifndef LINKADDR_H_
+#define LINKADDR_H_
+
+#include "contiki-conf.h"
+
+#include <stdint.h>
+
+#ifdef LINKADDR_CONF_SIZE
+#define LINKADDR_SIZE LINKADDR_CONF_SIZE
+#else /* LINKADDR_SIZE */
+#define LINKADDR_SIZE 2
+#endif /* LINKADDR_SIZE */
+
+typedef union {
+       unsigned char u8[LINKADDR_SIZE];
+#if LINKADDR_SIZE == 2
+       uint16_t u16;
+#endif /* LINKADDR_SIZE == 2 */
+} linkaddr_t;
+
+typedef union {
+       uint8_t u8[8];
+       uint16_t u16[4];
+} linkaddr_extended_t;
+
+/**
+ * \brief      Copy a Rime address
+ * \param dest The destination
+ * \param from The source
+ *
+ *             This function copies a Rime address from one location
+ *             to another.
+ *
+ */
+void linkaddr_copy(linkaddr_t *dest, const linkaddr_t *from);
+
+/**
+ * \brief      Compare two Rime addresses
+ * \param addr1 The first address
+ * \param addr2 The second address
+ * \return     Non-zero if the addresses are the same, zero if they are different
+ *
+ *             This function compares two Rime addresses and returns
+ *             the result of the comparison. The function acts like
+ *             the '==' operator and returns non-zero if the addresses
+ *             are the same, and zero if the addresses are different.
+ *
+ */
+int linkaddr_cmp(const linkaddr_t *addr1, const linkaddr_t *addr2);
+
+
+/**
+ * \brief      Set the address of the current node
+ * \param addr The address
+ *
+ *             This function sets the Rime address of the node.
+ *
+ */
+void linkaddr_set_node_addr(linkaddr_t *addr);
+
+/**
+ * \brief      The Rime address of the node
+ *
+ *             This variable contains the Rime address of the
+ *             node. This variable should not be changed directly;
+ *             rather, the linkaddr_set_node_addr() function should be
+ *             used.
+ *
+ */
+extern linkaddr_t linkaddr_node_addr;
+
+/**
+ * \brief      The null Rime address
+ *
+ *             This variable contains the null Rime address. The null
+ *             address is used in route tables to indicate that the
+ *             table entry is unused. Nodes with no configured address
+ *             has the null address. Nodes with their node address set
+ *             to the null address will have problems communicating
+ *             with other nodes.
+ *
+ */
+extern const linkaddr_t linkaddr_null;
+
+#endif /* LINKADDR_H_ */
+/** @} */
+/** @} */
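A minimal usage sketch for the API above (editorial illustration), assuming the default LINKADDR_SIZE of 2:

        linkaddr_t addr = { { 0x12, 0x34 } };   /* illustrative address */

        if (linkaddr_cmp(&linkaddr_node_addr, &linkaddr_null)) {
                /* no node address configured yet; adopt one */
                linkaddr_set_node_addr(&addr);
        }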
diff --git a/bsd/net/multi_layer_pkt_log.c b/bsd/net/multi_layer_pkt_log.c
new file mode 100644 (file)
index 0000000..b6af63d
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <net/multi_layer_pkt_log.h>
+
+SYSCTL_NODE(_net, OID_AUTO, mpklog,
+    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Multi-layer packet logging");
+
+/*
+ * Note: net_mpklog_enabled allows overriding the per-interface flag IFXF_MPK_LOG
+ */
+int net_mpklog_enabled = 1;
+static int sysctl_net_mpklog_enabled SYSCTL_HANDLER_ARGS;
+SYSCTL_PROC(_net_mpklog, OID_AUTO, enabled, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
+    0, 0, &sysctl_net_mpklog_enabled, "I", "Multi-layer packet logging enabled");
+
+static int sysctl_net_mpklog_type SYSCTL_HANDLER_ARGS;
+int net_mpklog_type =  OS_LOG_TYPE_DEFAULT;
+SYSCTL_PROC(_net_mpklog, OID_AUTO, type, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
+    0, 0, &sysctl_net_mpklog_type, "I", "Multi-layer packet logging type");
+
+SYSCTL_INT(_net_mpklog, OID_AUTO, version, CTLFLAG_RD | CTLFLAG_LOCKED,
+    (int *)NULL, MPKL_VERSION, "Multi-layer packet logging version");
+
+static int
+sysctl_net_mpklog_enabled SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int value = net_mpklog_enabled;
+
+       int error = sysctl_handle_int(oidp, &value, 0, req);
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       net_mpklog_enabled = (value == 0) ? 0 : 1;
+
+       os_log(OS_LOG_DEFAULT, "%s:%d set net_mpklog_enabled to %d",
+           proc_best_name(current_proc()), proc_selfpid(), net_mpklog_enabled);
+
+       return 0;
+}
+
+static int
+sysctl_net_mpklog_type SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int value = net_mpklog_type;
+
+       int error = sysctl_handle_int(oidp, &value, 0, req);
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       if (value != OS_LOG_TYPE_DEFAULT &&
+           value != OS_LOG_TYPE_INFO) {
+               return EINVAL;
+       }
+
+       net_mpklog_type = value;
+
+       os_log(OS_LOG_DEFAULT, "%s:%d set net_mpklog_type to %d (%s)",
+           proc_best_name(current_proc()), proc_selfpid(), net_mpklog_type,
+           net_mpklog_type == OS_LOG_TYPE_DEFAULT ? "default" : "info");
+
+       return 0;
+}
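From user space these knobs can be driven with sysctlbyname(3); a minimal sketch (editorial illustration, not part of this commit):

        #include <stdio.h>
        #include <sys/sysctl.h>

        int
        main(void)
        {
                int on = 1;

                /* enable multi-layer packet logging regardless of IFXF_MPK_LOG */
                if (sysctlbyname("net.mpklog.enabled", NULL, NULL, &on, sizeof(on)) == -1) {
                        perror("net.mpklog.enabled");
                        return 1;
                }
                return 0;
        }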
diff --git a/bsd/net/multi_layer_pkt_log.h b/bsd/net/multi_layer_pkt_log.h
new file mode 100644 (file)
index 0000000..ef3a3ea
--- /dev/null
@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _NET_MULTI_LAYER_PKT_LOG_H_
+#define _NET_MULTI_LAYER_PKT_LOG_H_
+
+#include <os/log.h>
+
+/*
+ * Bump this version whenever the format of a log is modified
+ */
+#define MPKL_VERSION 1
+
+/*
+ * Protocol IDs, used to track inter-layer transitions and the direction of data flow.
+ * The Watch transport physical layer has the lowest numeric value; values increase toward the highest layer in the system.
+ * Direction is either toward the physical layer or away from it.
+ *
+ */
+
+#define MPKL_PROTOCOL_PHYSICAL               ((uint8_t)0)       /*  (OTA/serial-port/etc..) */
+
+#define MPKL_PROTOCOL_BT                     ((uint8_t)20)
+#define MPKL_PROTOCOL_WIFI                   ((uint8_t)30)
+#define MPKL_PROTOCOL_CELLULAR               ((uint8_t)40)
+#define MPKL_PROTOCOL_TERMINUS               ((uint8_t)60)
+#define MPKL_PROTOCOL_IPSEC                  ((uint8_t)80)
+#define MPKL_PROTOCOL_TCP                    ((uint8_t)100)
+#define MPKL_PROTOCOL_IDS                    ((uint8_t)120)
+#define MPKL_PROTOCOL_LIBNETCORE             ((uint8_t)140)
+#define MPKL_PROTOCOL_CFNETWORK              ((uint8_t)160)
+#define MPKL_PROTOCOL_REMOTE_CONNECTION      ((uint8_t)200)
+
+#define MPKL_TOPMOST_LAYER                   ((uint8_t)255)     /*  Top-most layer */
+
+
+/*!
+ *  @macro MPKL_CREATE_LOGOBJECT
+ *  @discussion    Creates a log object with the input category name for the packet logging subsystem
+ *
+ *  @param Name    string name of os_log_t category
+ *
+ *  @return        os_log_t object
+ *
+ */
+#define MPKL_CREATE_LOGOBJECT(Name)    os_log_create("com.apple.magnetpacketlog", Name)
+
+/*
+ * Cross-layer association APIs
+ *
+ */
+
+/*!
+ *  @macro MPKL_UUID_UUID_ASSOCIATE_PREV
+ *  @discussion    Associate the current layer's packet UUID with the previous layer's packet UUID; data is flowing into the current layer
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param PREV_PROTOCOL_ID     uint8_t     ID of previous layer being associated
+ *  @param CUR_UUID             uuid_t      Current layer 16-byte UUID of packet
+ *  @param PREV_UUID            uuid_t      Previous layer 16-byte UUID of packet
+ *  @param CUR_LEN              uint16_t    Current layer packet length
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_UUID_UUID_ASSOCIATE_PREV(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, CUR_UUID, PREV_UUID, CUR_LEN, LOG_SEQN) \
+os_log(LOGOBJECT, "1 {curProtocol: %hhu, prevProtocol: %hhu, curUUID: %{public,uuid_t}.16P, prevUUID: %{public,uuid_t}.16P, curPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, CUR_UUID, PREV_UUID, CUR_LEN, LOG_SEQN)
+
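+
A hedged sketch of the association macro in use (editorial illustration; the category name, protocol pairing, length, and helper are hypothetical, and uuid_generate() is assumed available as in a user-space client):

        #include <uuid/uuid.h>

        static void
        my_log_association(uint16_t pkt_len)
        {
                static uint8_t log_seqn;
                os_log_t mpkl_log = MPKL_CREATE_LOGOBJECT("MyTransport");
                uuid_t cur_uuid, prev_uuid;

                uuid_generate(cur_uuid);
                uuid_generate(prev_uuid);
                /* record that this TCP segment carries the IPsec packet below it */
                MPKL_UUID_UUID_ASSOCIATE_PREV(mpkl_log, MPKL_PROTOCOL_TCP,
                    MPKL_PROTOCOL_IPSEC, cur_uuid, prev_uuid, pkt_len, log_seqn++);
        }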
+/*!
+ *  @macro MPKL_UUID_UUID_ASSOCIATE_NEXT
+ *  @discussion    Associate the current layer's packet UUID with the next layer's packet UUID; data is flowing out of the current layer
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param NEXT_PROTOCOL_ID     uint8_t     ID of next layer being associated
+ *  @param CUR_UUID             uuid_t      Current layer 16-byte UUID of packet
+ *  @param NEXT_UUID            uuid_t      Next layer 16-byte UUID of packet
+ *  @param CUR_LEN              uint16_t    Current layer packet length
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_UUID_UUID_ASSOCIATE_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, NEXT_UUID, CUR_LEN, LOG_SEQN) \
+os_log(LOGOBJECT, "2 {curProtocol: %hhu, nextProtocol: %hhu, curUUID: %{public,uuid_t}.16P, nextUUID: %{public,uuid_t}.16P, curPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, NEXT_UUID, CUR_LEN, LOG_SEQN)
+
+/*!
+ *  @macro MPKL_SEQRANGE_UUID_ASSOCIATE
+ *  @discussion    Associate previous layer's byte sequence range (start/end) to current layer's packet UUID
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param PREV_PROTOCOL_ID     uint8_t     ID of previous layer being associated
+ *  @param PREV_RANGE_START     uint32_t    Previous layer byte sequence range start
+ *  @param PREV_RANGE_END       uint32_t    Previous layer byte sequence range end
+ *  @param CUR_UUID             uuid_t      Current layer 16-byte UUID of packet
+ *  @param CUR_LEN              uint16_t    Current layer packet length
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_SEQRANGE_UUID_ASSOCIATE(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_RANGE_START, PREV_RANGE_END, CUR_UUID, CUR_LEN, LOG_SEQN) \
+os_log(LOGOBJECT, "3 {curProtocol: %hhu, prevProtocol: %hhu, prevStart: %u, prevEnd: %u, curUUID: %{public,uuid_t}.16P, curPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_RANGE_START, PREV_RANGE_END, CUR_UUID, CUR_LEN, LOG_SEQN)
+
+/*!
+ *  @macro MPKL_UUID_SEQRANGE_ASSOCIATE
+ *  @discussion    Associate previous layer's packet UUID to current layer's byte sequence range (start/end)
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param PREV_PROTOCOL_ID     uint8_t     ID of previous layer being associated
+ *  @param PREV_UUID            uuid_t      Previous layer 16-byte UUID of packet
+ *  @param CUR_RANGE_START      uint16_t    Current layer byte sequence range start
+ *  @param CUR_RANGE_END        uint16_t    Current layer byte sequence range end
+ *  @param PREV_LEN             uint16_t    Previous layer message length
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_UUID_SEQRANGE_ASSOCIATE(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, CUR_RANGE_START, CUR_RANGE_END, PREV_LEN, LOG_SEQN) \
+os_log(LOGOBJECT, "4 {curProtocol: %hhu, prevProtocol: %hhu, prevUUID: %{public,uuid_t}.16P, curStart: %u, curEnd: %u, prevPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, CUR_RANGE_START, CUR_RANGE_END, PREV_LEN, LOG_SEQN)
+
+
+/*!
+ *  @macro MPKL_BUNDLEID_UUID_ASSOCIATE
+ *  @discussion    Associate the previous layer's bundle ID with the current layer's packet UUID
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param PREV_PROTOCOL_ID     uint8_t     ID of previous layer being associated
+ *  @param PREV_BUNDLE_ID       NSString    BundleID of previous layer
+ *  @param CUR_UUID             uuid_t      Current layer 16-byte UUID of packet
+ *  @param CUR_LEN              uint32_t    Current layer packet length
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_BUNDLEID_UUID_ASSOCIATE(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_BUNDLE_ID, CUR_UUID, CUR_LEN, LOG_SEQN) \
+os_log(LOGOBJECT, "5 {curProtocol: %hhu, prevProtocol: %hhu, prevBundleID: %@, curUUID: %{public,uuid_t}.16P, curPktLen: %u, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_BUNDLE_ID, CUR_UUID, CUR_LEN, LOG_SEQN)
+
+
+/*!
+ *  @macro MPKL_SEQRANGE_UUID_ASSOCIATE_W_BUNDLEID
+ *  @discussion    Associate the previous layer's packet byte sequence range with the current layer's UUID and the client's bundle ID
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param PREV_PROTOCOL_ID     uint8_t     ID of previous layer being associated
+ *  @param PREV_RANGE_START    uint32_t    Previous layer byte sequence range start
+ *  @param PREV_RANGE_END      uint32_t    Previous layer byte sequence range end
+ *  @param CUR_UUID             uuid_t      Current layer 16-byte UUID of packet
+ *  @param CLIENT_BUNDLE_ID     NSString    BundleID of the client
+ *  @param CUR_LEN              uint16_t    Current layer packet length
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_SEQRANGE_UUID_ASSOCIATE_W_BUNDLEID(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_RANGE_START, PREV_RANGE_END, CUR_UUID, CLIENT_BUNDLE_ID, CUR_LEN, LOG_SEQN) \
+os_log(LOGOBJECT, "6 {curProtocol: %hhu, prevProtocol: %hhu, prevStart: %u, prevEnd: %u, curUUID: %{public,uuid_t}.16P, curBundleID: %@, curPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_RANGE_START, PREV_RANGE_END, CUR_UUID, CLIENT_BUNDLE_ID, CUR_LEN, LOG_SEQN)
+
+
+/*!
+ *  @macro MPKL_SEQN_UUID_ASSOCIATE_PREV
+ *  @discussion    Associate the current layer's unique protocol sequence number with the previous layer's message UUID.
+ *                 Supports fragmentation and re-assembly (for layers like BT) by mapping the 2-byte byte-sequence ranges of the current and previous layers' data.
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param PREV_PROTOCOL_ID     uint8_t     ID of previous layer being associated
+ *  @param PREV_UUID            uuid_t      Previous layer 16-byte UUID of message
+ *  @param PREV_RANGE_START     uint16_t    Previous layer byte sequence range start
+ *  @param PREV_RANGE_END       uint16_t    Previous layer byte sequence range end
+ *  @param PREV_LEN             uint16_t    Previous layer message length
+ *  @param CUR_SEQ_N            uint16_t    Current layer protocol sequence number
+ *  @param CUR_RANGE_START      uint16_t    Current layer byte sequence range start
+ *  @param CUR_RANGE_END        uint16_t    Current layer byte sequence range end
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_SEQN_UUID_ASSOCIATE_PREV(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, PREV_RANGE_START, PREV_RANGE_END, PREV_LEN, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN) \
+os_log(LOGOBJECT, "7 {Send, curProtocol: %hhu, prevProtocol: %hhu, prevUUID: %{public,uuid_t}.16P, prevStart: %hu, prevEnd: %hu, prevPktLen %hu, curSeqN: %hu, curStart: %hu, curEnd: %hu,  logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, PREV_RANGE_START, PREV_RANGE_END, PREV_LEN, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN)
+
+/*!
+ *  @macro MPKL_SEQN_UUID_ASSOCIATE_NEXT
+ *  @discussion    Associate the current layer's unique protocol sequence number with the next layer's message UUID.
+ *                 Supports fragmentation and re-assembly (for layers like BT) by mapping the 2-byte byte-sequence ranges of the current and next layers' data.
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param NEXT_PROTOCOL_ID     uint8_t     ID of next layer being associated
+ *  @param NEXT_UUID            uuid_t      Next layer 16-byte UUID of message
+ *  @param NEXT_RANGE_START     uint16_t    Next layer byte sequence range start
+ *  @param NEXT_RANGE_END       uint16_t    Next layer byte sequence range end
+ *  @param NEXT_LEN             uint16_t    Next layer message length
+ *  @param CUR_SEQ_N            uint16_t    Current layer protocol sequence number
+ *  @param CUR_RANGE_START      uint16_t    Current layer byte sequence range start
+ *  @param CUR_RANGE_END        uint16_t    Current layer byte sequence range end
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_SEQN_UUID_ASSOCIATE_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, NEXT_UUID, NEXT_RANGE_START, NEXT_RANGE_END, NEXT_LEN, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN) \
+os_log(LOGOBJECT, "8 {Receive, curProtocol: %hhu, nextProtocol: %hhu, nextUUID: %{public,uuid_t}.16P, nextStart: %hu, nextEnd: %hu, nextPktLen %hu, curSeqN: %hu, curStart: %hu, curEnd: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, NEXT_UUID, NEXT_RANGE_START, NEXT_RANGE_END, NEXT_LEN, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN)
+
+/*
+ * APIs to indicate transitioning of messages; example in/out of a layer
+ */
+
+/*!
+ *  @macro MPKL_UUID_NEXT
+ *  @discussion    Log the transition of current layer's message with UUID to next layer
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param NEXT_PROTOCOL_ID     uint8_t     ID of next layer
+ *  @param CUR_UUID             uuid_t      Current layer 16-byte UUID of message
+ *  @param CUR_LEN              uint32_t    Current layer message length
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_UUID_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_LEN, LOG_SEQN) \
+os_log(LOGOBJECT, "9 {curProtocol: %hhu, nextProtocol: %hhu, curUUID: %{public,uuid_t}.16P, curPktLen: %u, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_LEN, LOG_SEQN)
+
+/*!
+ *  @macro MPKL_SEQRANGE_NEXT
+ *  @discussion    Log the transition of the current layer's message with UUID to the next layer (16-bit message length variant)
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param LABEL                string      optional layer-specific label for readability/debuggability, this is ignored by the parser. Cannot contain {}
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param NEXT_PROTOCOL_ID     uint8_t     ID of next layer
+ *  @param CUR_UUID             uuid_t      Current layer 16-byte UUID of message
+ *  @param CUR_LEN              uint16_t    Current layer message length
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_SEQRANGE_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_LEN, LOG_SEQN) \
+os_log(LOGOBJECT, "10 {curProtocol: %hhu, nextProtocol: %hhu, curUUID: %{public,uuid_t}.16P, curPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_LEN, LOG_SEQN)
+
+
+/*!
+ *  @macro MPKL_UUID_PREV
+ *  @discussion    Log the transition of the previous layer's message with UUID to the current layer
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param LABEL                string      optional layer-specific label for readability/debuggability, this is ignored by the parser. Cannot contain {}
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param PREV_PROTOCOL_ID     uint8_t     ID of the previous layer being associated
+ *  @param PREV_UUID            uuid_t      Previous layer 16-byte UUID of message
+ *  @param PREV_LEN             uint16_t    Previous layer message length
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_UUID_PREV(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, PREV_LEN, LOG_SEQN) \
+os_log(LOGOBJECT, "11 {curProtocol: %hhu, prevProtocol: %hhu, prevUUID: %{public,uuid_t}.16P, prevPktLen: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, PREV_UUID, PREV_LEN, LOG_SEQN)
+
+/*
+ *  APIs to indicate a Task Start/End
+ */
+
+/*!
+ *  @macro MPKL_TASK_START
+ *  @discussion    Log the start of a task
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param LABEL                string      optional layer-specific label for readability/debuggability, this is ignored by the parser. Cannot contain {}
+ *  @param CLIENT_BUNDLE_ID     NSString    bundleID of the client
+ *  @param TASK_UUID            uuid_t      16-byte UUID of NSURL task
+ *  @param CONN_UUID            uuid_t      16-byte UUID of associated libnetcore connection
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_TASK_START(LOGOBJECT, CLIENT_BUNDLE_ID, TASK_UUID, CONN_UUID, LOG_SEQN) \
+os_log(LOGOBJECT, "12 {startBundleID: %@, taskUUID: %{public,uuid_t}.16P, connUUID: %{public,uuid_t}.16P, logSeqn: %hhu}", CLIENT_BUNDLE_ID, TASK_UUID, CONN_UUID, LOG_SEQN)
+
+/*!
+ *  @macro MPKL_TASK_END
+ *  @discussion    Log the end of a task
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param LABEL                string      optional layer-specific label for readability/debuggability, this is ignored by the parser. Cannot contain {}
+ *  @param CLIENT_BUNDLE_ID     NSString    bundleID of the client
+ *  @param TASK_UUID            uuid_t      16-byte UUID of NSURL task
+ *  @param CONN_UUID            uuid_t      16-byte UUID of associated libnetcore connection
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+
+#define MPKL_TASK_END(LOGOBJECT, CLIENT_BUNDLE_ID, TASK_UUID, CONN_UUID, LOG_SEQN) \
+os_log(LOGOBJECT, "13 {endBundleID: %@, taskUUID: %{public,uuid_t}.16P, connUUID: %{public,uuid_t}.16P, logSeqn: %hhu}", CLIENT_BUNDLE_ID, TASK_UUID, CONN_UUID, LOG_SEQN)
+
+/*!
+ *  @macro MPKL_SEQN_INCOMPLETE_PREV
+ *  @discussion    An incomplete packet was sent with a given protocol sequence number and couldn't be associated to another protocol.
+ *                 The incomplete packet is saved, its byte sequence range is logged, and it is associated once more data arrives.
+ *
+ */
+
+#define MPKL_SEQN_INCOMPLETE_PREV(LOGOBJECT, CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, PREV_RANGE_START, PREV_RANGE_END, LOG_SEQN) \
+os_log(LOGOBJECT, "14 {Send Incomplete. curProtocol: %hhu, prevProtocol: %hhu, curSeqN: %hu, curStart: %hu, curEnd: %hu, prevStart: %hu, prevEnd: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, PREV_PROTOCOL_ID, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, PREV_RANGE_START, PREV_RANGE_END, LOG_SEQN)
+
+/*!
+ *  @macro MPKL_SEQN_INCOMPLETE_NEXT
+ *  @discussion    An incomplete packet was received with a given protocol sequence number and couldn't be associated to another protocol.
+ *                 The incomplete packet is saved, its byte sequence range is logged, and it is associated once more data arrives.
+ *
+ */
+
+#define MPKL_SEQN_INCOMPLETE_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, NEXT_RANGE_START, NEXT_RANGE_END, LOG_SEQN) \
+os_log(LOGOBJECT, "15 {Receive Incomplete. curProtocol: %hhu, nextProtocol: %hhu, curSeqN: %hu, curStart: %hu, curEnd: %hu, nextStart: %hu, nextEnd: %hu, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_SEQ_N, CUR_RANGE_START, CUR_RANGE_END, NEXT_RANGE_START, NEXT_RANGE_END, LOG_SEQN)
+
+#ifdef KERNEL
+/*!
+ *  @macro MPKL_TCP_SEND
+ *  @discussion    Associate data sent by a process with a TCP connection
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param PREV_PROTOCOL_ID     uint8_t     Protocol identifier passed by the process (may be 0)
+ *  @param PREV_UUID            uuid_t      UUID passed by the process (may be null UUID)
+ *  @param LOCAL_PORT           uint16_t    Local port of the TCP connection
+ *  @param REMOTE_PORT          uint16_t    Remote port of the TCP connection
+ *  @param TCP_SEQ              uint32_t    TCP sequence number of the first byte of the data being sent by the process
+ *  @param TCP_LEN              uint32_t    Length of the data
+ *  @param PID                  uint16_t    pid of the process using the TCP connection
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+#define MPKL_TCP_SEND(LOGOBJECT, PREV_PROTOCOL_ID, PREV_UUID, LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_LEN, PID, LOG_SEQN)           \
+       os_log_with_type(LOGOBJECT, net_mpklog_type,                                                                              \
+           "16 {curProtocol: 100, prevProtocol: %hhu, "                                                                          \
+           "prevUUID: "                                                                                                          \
+           "%02X%02X%02X%02X-%02X%02X-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X, "                                              \
+           "localPort: %hu, remotePort: %hu, tcpSeq: %u, length: %u, "                                                           \
+           "pid: %hu, logSeqn: %hhu}",                                                                                           \
+           PREV_PROTOCOL_ID,                                                                                                     \
+           PREV_UUID[0], PREV_UUID[1], PREV_UUID[2], PREV_UUID[3], PREV_UUID[4], PREV_UUID[5], PREV_UUID[6], PREV_UUID[7],       \
+           PREV_UUID[8], PREV_UUID[9], PREV_UUID[10], PREV_UUID[11], PREV_UUID[12], PREV_UUID[13], PREV_UUID[14], PREV_UUID[15], \
+           LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_LEN,                                                                            \
+           (uint16_t)PID, LOG_SEQN)
+
+/*!
+ *  @macro MPKL_TCP_INPUT
+ *  @discussion    Associate a received TCP segment with a TCP connection
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param LOCAL_PORT           uint16_t    Local port in the TCP header of the segment
+ *  @param REMOTE_PORT          uint16_t    Remote port in the TCP header of the segment
+ *  @param TCP_SEQ              uint32_t    Sequence number in the TCP header of the segment
+ *  @param TCP_ACK              uint32_t    Acknowledgement number in the TCP header of the segment
+ *  @param TCP_LEN              uint16_t    Length in the TCP header of the segment
+ *  @param TCP_FLAGS            uint8_t     Flags of the TCP header of the segment
+ *  @param PID                  uint16_t    pid of the process using the TCP connection
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+#define MPKL_TCP_INPUT(LOGOBJECT, LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_ACK, TCP_LEN, TCP_FLAGS, PID, LOG_SEQN) \
+       os_log_with_type(LOGOBJECT, net_mpklog_type,                                                            \
+           "17 {curProtocol: 100, prevProtocol: 80, "                                                          \
+           "localPort: %hu, remotePort: %hu, tcpSeq: %u, tcpAck: %u, tcpLen: %hu, tcpFlags: 0x%02x, "          \
+           "pid: %hu, logSeqn: %hhu}",                                                                         \
+           LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_ACK, (uint16_t)TCP_LEN, TCP_FLAGS,               \
+           (uint16_t)PID, LOG_SEQN)
+
+/*!
+ *  @macro MPKL_ESP_OUTPUT_TCP
+ *  @discussion    Associate a TCP segment being sent with the ESP packet that carries it
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param SPI                  uint32_t    SPI field in the ESP header
+ *  @param ESP_SEQ              uint32_t    Sequence number field in the ESP header
+ *  @param LOCAL_PORT           uint16_t    Local port of the TCP connection
+ *  @param REMOTE_PORT          uint16_t    Remote port of the TCP connection
+ *  @param TCP_SEQ              uint32_t    Sequence number in the TCP header of the segment
+ *  @param TCP_ACK              uint32_t    Acknowledgement number in the TCP header of the segment
+ *  @param TCP_LEN              uint16_t    Length in the TCP header of the segment
+ *  @param TCP_FLAGS            uint8_t     Flags of the TCP header of the segment
+ */
+#define MPKL_ESP_OUTPUT_TCP(LOGOBJECT, SPI, ESP_SEQ, LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_ACK, TCP_LEN, TCP_FLAGS)     \
+       os_log_with_type(LOGOBJECT, net_mpklog_type,                                                                    \
+           "18 {curProtocol: 80, spi: 0x%X, espSeq: %u, PayloadProtocol: 100, "                                        \
+           "localPort: %hu, remotePort: %hu, tcpSeq: %u, tcpAck: %u, tcpLen: %hu, tcpFlags: 0x%02x}",                  \
+           SPI, ESP_SEQ,                                                                                               \
+           LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_ACK, (uint16_t)TCP_LEN, TCP_FLAGS)
+
+/*!
+ *  @macro MPKL_ESP_INPUT_TCP
+ *  @discussion    Associate an incoming ESP packet with the TCP segment it carries
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param SPI                  uint32_t    SPI field in the ESP header
+ *  @param ESP_SEQ              uint32_t    Sequence number field in the ESP header
+ *  @param LOCAL_PORT           uint16_t    Local port of the TCP connection
+ *  @param REMOTE_PORT          uint16_t    Remote port of the TCP connection
+ *  @param TCP_SEQ              uint32_t    Sequence number in the TCP header of the segment
+ *  @param TCP_LEN              uint16_t    Length in the TCP header of the segment
+ */
+#define MPKL_ESP_INPUT_TCP(LOGOBJECT, SPI, ESP_SEQ, LOCAL_PORT, REMOTE_PORT, TCP_SEQ, TCP_LEN)  \
+       os_log_with_type(LOGOBJECT, net_mpklog_type,                                            \
+           "19 {curProtocol: 80 spi: 0x%X, espSeq: %u, PayloadProtocol: 100, "                 \
+           "localPort: %hu, remotePort: %hu, tcpSeq: %u, tcpLen: %hu}",                        \
+           SPI, ESP_SEQ,                                                                       \
+           LOCAL_PORT, REMOTE_PORT, TCP_SEQ, (uint16_t)TCP_LEN)
+#endif /* KERNEL */
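A kernel-side sketch of the TCP input event; in tcp_input the values would come from the TCP header and the socket's pcb, so everything below is illustrative:

    static void
    log_tcp_input_example(os_log_t mpkl_log, uint8_t *seqn)
    {
            MPKL_TCP_INPUT(mpkl_log,
                443, 52000,              /* local and remote ports */
                0x1000, 0x2000,          /* tcpSeq, tcpAck */
                1440, 0x10 /* ACK */,    /* segment length, TCP flags */
                1234 /* pid */, (*seqn)++);
    }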
+
+/*!
+ *  @macro MPKL_BYTERANGE_UUID_ASSOCIATE
+ *  @discussion    Associate the current layer's byte range (start/end) with the current layer's UUID
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param NEXT_PROTOCOL_ID     uint8_t     ID of the next layer being associated
+ *  @param CUR_UUID             uuid_t      Current layer 16-byte UUID of endpoint handler
+ *  @param CUR_RANGE_START      uint64_t    Current layer byte range start
+ *  @param CUR_RANGE_END        uint64_t    Current layer byte range end
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+#define MPKL_BYTERANGE_UUID_ASSOCIATE(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN) \
+os_log(LOGOBJECT, "32 {curProtocol: %hhu, nextProtocol: %hhu, curUUID: %{public}.16P, curStart: %llu, curEnd: %llu, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, CUR_RANGE_START, CUR_RANGE_END, LOG_SEQN)
+
+/*!
+ *  @macro MPKL_UUID_ONLY_ASSOCIATE_NEXT
+ *  @discussion    Associate current layer's UUID to next layer's UUID
+ *
+ *  @param LOGOBJECT            os_log_t    object to write data into
+ *  @param LABEL                string      optional layer-specific label for readability/debuggability, this is ignored by the parser. Cannot contain {}
+ *  @param CUR_PROTOCOL_ID      uint8_t     ID of current layer from MPKL_PROTOCOL_XXX defines above
+ *  @param NEXT_PROTOCOL_ID     uint8_t     ID of next layer being associated
+ *  @param CUR_UUID             uuid_t      Current layer 16-byte UUID
+ *  @param NEXT_UUID            uuid_t      Next layer 16-byte UUID
+ *  @param LOG_SEQN             uint8_t     Incrementing sequence number to detect logging system drop of messages
+ */
+#define MPKL_UUID_ONLY_ASSOCIATE_NEXT(LOGOBJECT, CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, NEXT_UUID, LOG_SEQN) \
+os_log(LOGOBJECT, "33 {curProtocol: %hhu, nextProtocol: %hhu, curUUID: %{public}.16P, nextUUID: %{public}.16P, logSeqn: %hhu}", CUR_PROTOCOL_ID, NEXT_PROTOCOL_ID, CUR_UUID, NEXT_UUID, LOG_SEQN)
+
+#ifdef KERNEL_PRIVATE
+extern int net_mpklog_enabled;
+extern int net_mpklog_type;
+#endif /* KERNEL_PRIVATE */
+
+#endif /* _NET_MULTI_LAYER_PKT_LOG_H_ */
index fd99d34b59581e4fb9527daa9e51993a5bf57e18..30370a84ef28732d4d542f73df78f2249b66438f 100644 (file)
@@ -808,9 +808,13 @@ nat464_translate_proto(pbuf_t *pbuf, struct nat464_addr *osrc,
                proto = &ip6h->ip6_nxt;
                break;
        }
+       default:
+               return NT_DROP; /* should never be reached */
        }
 
-       VERIFY(*proto == oproto);
+       if (*proto != oproto) {
+               return NT_DROP;
+       }
 
        /*
         * We may want to manipulate csum flags in some cases
index 513cea4aead8943e76fc98b846ba65f1ced59095..18c84be938f09e90659efe7c34f8882277c8587b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -43,6 +43,9 @@
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/coalition.h>
+#include <sys/ubc.h>
+#include <sys/codesign.h>
+#include <kern/cs_blobs.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <sys/kern_event.h>
 #include <sys/file_internal.h>
 #include <IOKit/IOBSD.h>
+#include <libkern/crypto/rand.h>
+#include <corecrypto/cchmac.h>
+#include <corecrypto/ccsha2.h>
+#include <os/refcnt.h>
 #include <net/network_agent.h>
 #include <net/necp.h>
 
@@ -136,6 +143,14 @@ u_int32_t necp_drop_all_level = 0;
 
 u_int32_t necp_pass_loopback = 1; // 0=Off, 1=On
 u_int32_t necp_pass_keepalives = 1; // 0=Off, 1=On
+u_int32_t necp_pass_interpose = 1; // 0=Off, 1=On
+
+u_int32_t necp_drop_unentitled_order = 0;
+#ifdef XNU_TARGET_OS_WATCH
+u_int32_t necp_drop_unentitled_level = NECP_SESSION_PRIORITY_CONTROL + 1; // Block all unentitled traffic from policies below control level
+#else // XNU_TARGET_OS_WATCH
+u_int32_t necp_drop_unentitled_level = 0;
+#endif // XNU_TARGET_OS_WATCH
 
 u_int32_t necp_debug = 0; // 0=None, 1=Basic, 2=EveryMatch
 
@@ -182,6 +197,9 @@ u_int32_t necp_session_count = 0;
 
 #define IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(x)     ((x) == NECP_ROUTE_RULE_DENY_INTERFACE || (x) == NECP_ROUTE_RULE_ALLOW_INTERFACE)
 
+#define IS_NECP_DEST_IN_LOCAL_NETWORKS(rt) \
+    ((rt) != NULL && !((rt)->rt_flags & RTF_GATEWAY) && ((rt)->rt_ifa && (rt)->rt_ifa->ifa_ifp && !((rt)->rt_ifa->ifa_ifp->if_flags & IFF_POINTOPOINT)))
+
 #define NECP_KERNEL_CONDITION_ALL_INTERFACES            0x000001
 #define NECP_KERNEL_CONDITION_BOUND_INTERFACE           0x000002
 #define NECP_KERNEL_CONDITION_PROTOCOL                          0x000004
@@ -203,6 +221,12 @@ u_int32_t necp_session_count = 0;
 #define NECP_KERNEL_CONDITION_ENTITLEMENT                       0x040000
 #define NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT        0x080000
 #define NECP_KERNEL_CONDITION_AGENT_TYPE                        0x100000
+#define NECP_KERNEL_CONDITION_HAS_CLIENT                        0x200000
+#define NECP_KERNEL_CONDITION_LOCAL_NETWORKS                    0x400000
+#define NECP_KERNEL_CONDITION_CLIENT_FLAGS                      0x800000
+#define NECP_KERNEL_CONDITION_LOCAL_EMPTY                       0x1000000
+#define NECP_KERNEL_CONDITION_REMOTE_EMPTY                      0x2000000
+#define NECP_KERNEL_CONDITION_PLATFORM_BINARY                   0x4000000
 
 #define NECP_MAX_POLICY_RESULT_SIZE                                     512
 #define NECP_MAX_ROUTE_RULES_ARRAY_SIZE                         1024
@@ -256,8 +280,13 @@ struct necp_socket_info {
        u_int32_t application_id;
        u_int32_t real_application_id;
        u_int32_t account_id;
+       u_int32_t drop_order;
+       u_int32_t client_flags;
        char *domain;
        errno_t cred_result;
+       unsigned has_client : 1;
+       unsigned is_platform_binary : 1;
+       unsigned __pad_bits : 6;
 };
 
 static kern_ctl_ref     necp_kctlref;
@@ -273,6 +302,8 @@ static  lck_attr_t              *necp_route_rule_mtx_attr       = NULL;
 static  lck_grp_t               *necp_route_rule_mtx_grp        = NULL;
 decl_lck_rw_data(static, necp_route_rule_lock);
 
+os_refgrp_decl(static, necp_refgrp, "NECPRefGroup", NULL);
+
 /*
  * On modification, invalidate cached lookups by bumping the generation count.
  * Other calls will need to take the slowpath of taking
@@ -285,6 +316,22 @@ static volatile int32_t necp_kernel_socket_policies_gencount;
        }                                                                                                                                                               \
 } while (0)
 
+/*
+ * Drop-all Bypass:
+ * Allow privileged processes to bypass the default drop-all
+ * via an entitlement check.  On macOS, where the entitlement check
+ * is not supported for configd, the configd signing identity is
+ * checked instead.
+ */
+#define SIGNING_ID_CONFIGD "com.apple.configd"
+#define SIGNING_ID_CONFIGD_LEN (sizeof(SIGNING_ID_CONFIGD) - 1)
+
+typedef enum {
+       NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE = 0,
+       NECP_DROP_ALL_BYPASS_CHECK_RESULT_TRUE = 1,
+       NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE = 2,
+} necp_drop_all_bypass_check_result_t;
+
 static u_int32_t necp_kernel_application_policies_condition_mask;
 static size_t necp_kernel_application_policies_count;
 static u_int32_t necp_kernel_socket_policies_condition_mask;
@@ -310,6 +357,11 @@ static LIST_HEAD(_necpkernelipoutputpolicies, necp_kernel_ip_output_policy) necp
 #define NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS 5
 #define NECP_IP_OUTPUT_MAP_ID_TO_BUCKET(id) (id ? (id%(NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS - 1) + 1) : 0)
 static struct necp_kernel_ip_output_policy **necp_kernel_ip_output_policies_map[NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS];
+static struct necp_kernel_socket_policy pass_policy =
+{
+       .id = NECP_KERNEL_POLICY_ID_NO_MATCH,
+       .result = NECP_KERNEL_POLICY_RESULT_PASS,
+};
 
 static struct necp_session *necp_create_session(void);
 static void necp_delete_session(struct necp_session *session);
@@ -338,11 +390,11 @@ static bool necp_policy_mark_all_for_deletion(struct necp_session *session);
 static bool necp_policy_delete(struct necp_session *session, struct necp_session_policy *policy);
 static void necp_policy_apply_all(struct necp_session *session);
 
-static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter);
+static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, u_int32_t cond_client_flags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter);
 static bool necp_kernel_socket_policy_delete(necp_kernel_policy_id policy_id);
 static bool necp_kernel_socket_policies_reprocess(void);
 static bool necp_kernel_socket_policies_update_uuid_table(void);
-static inline struct necp_kernel_socket_policy *necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, u_int32_t *return_netagent_use_flags_array, size_t netagent_array_count, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id);
+static inline struct necp_kernel_socket_policy *necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id_array, size_t *return_route_rule_id_array_count, size_t route_rule_id_array_count, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, u_int32_t *return_netagent_use_flags_array, size_t netagent_array_count, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id, struct rtentry *rt, necp_kernel_policy_result *return_drop_dest_policy_result, necp_drop_all_bypass_check_result_t *return_drop_all_bypass);
 
 static necp_kernel_policy_id necp_kernel_ip_output_policy_add(necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter);
 static bool necp_kernel_ip_output_policy_delete(necp_kernel_policy_id policy_id);
@@ -353,6 +405,7 @@ static bool necp_is_range_in_range(struct sockaddr *inner_range_start, struct so
 static bool necp_is_addr_in_subnet(struct sockaddr *addr, struct sockaddr *subnet_addr, u_int8_t subnet_prefix);
 static int necp_addr_compare(struct sockaddr *sa1, struct sockaddr *sa2, int check_port);
 static bool necp_buffer_compare_with_bit_prefix(u_int8_t *p1, u_int8_t *p2, u_int32_t bits);
+static bool necp_addr_is_empty(struct sockaddr *addr);
 static bool necp_is_loopback(struct sockaddr *local_addr, struct sockaddr *remote_addr, struct inpcb *inp, struct mbuf *packet);
 static bool necp_is_intcoproc(struct inpcb *inp, struct mbuf *packet);
 
@@ -360,8 +413,8 @@ struct necp_uuid_id_mapping {
        LIST_ENTRY(necp_uuid_id_mapping) chain;
        uuid_t          uuid;
        u_int32_t       id;
-       u_int32_t       refcount;
-       u_int32_t       table_refcount; // Add to UUID policy table count
+       os_refcnt_t     refcount;
+       u_int32_t       table_usecount; // Add to UUID policy table count
 };
 static size_t necp_num_uuid_app_id_mappings;
 static bool necp_uuid_app_id_mappings_dirty;
@@ -383,7 +436,7 @@ struct necp_string_id_mapping {
        LIST_ENTRY(necp_string_id_mapping) chain;
        char            *string;
        necp_app_id     id;
-       u_int32_t       refcount;
+       os_refcnt_t     refcount;
 };
 static LIST_HEAD(necp_string_id_mapping_list, necp_string_id_mapping) necp_account_id_list;
 static u_int32_t necp_create_string_to_id_mapping(struct necp_string_id_mapping_list *list, char *domain);
@@ -412,9 +465,10 @@ struct necp_route_rule {
        u_int8_t        wifi_action;
        u_int8_t        wired_action;
        u_int8_t        expensive_action;
+       u_int8_t        constrained_action;
        u_int           exception_if_indices[MAX_ROUTE_RULE_INTERFACES];
        u_int8_t        exception_if_actions[MAX_ROUTE_RULE_INTERFACES];
-       u_int32_t       refcount;
+       os_refcnt_t     refcount;
 };
 static LIST_HEAD(necp_route_rule_list, necp_route_rule) necp_route_rules;
 static u_int32_t necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_array, u_int32_t route_rules_array_size);
@@ -434,17 +488,30 @@ static u_int32_t necp_create_aggregate_route_rule(u_int32_t *rule_ids);
 
 // Sysctl definitions
 static int sysctl_handle_necp_level SYSCTL_HANDLER_ARGS;
+static int sysctl_handle_necp_unentitled_level SYSCTL_HANDLER_ARGS;
 
 SYSCTL_NODE(_net, OID_AUTO, necp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "NECP");
 SYSCTL_INT(_net_necp, NECPCTL_PASS_LOOPBACK, pass_loopback, CTLFLAG_LOCKED | CTLFLAG_RW, &necp_pass_loopback, 0, "");
 SYSCTL_INT(_net_necp, NECPCTL_PASS_KEEPALIVES, pass_keepalives, CTLFLAG_LOCKED | CTLFLAG_RW, &necp_pass_keepalives, 0, "");
+SYSCTL_INT(_net_necp, NECPCTL_PASS_INTERPOSE, pass_interpose, CTLFLAG_LOCKED | CTLFLAG_RW, &necp_pass_interpose, 0, "");
 SYSCTL_INT(_net_necp, NECPCTL_DEBUG, debug, CTLFLAG_LOCKED | CTLFLAG_RW, &necp_debug, 0, "");
+SYSCTL_PROC(_net_necp, NECPCTL_DROP_UNENTITLED_LEVEL, drop_unentitled_level, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, &necp_drop_unentitled_level, 0, &sysctl_handle_necp_unentitled_level, "IU", "");
 SYSCTL_PROC(_net_necp, NECPCTL_DROP_ALL_LEVEL, drop_all_level, CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, &necp_drop_all_level, 0, &sysctl_handle_necp_level, "IU", "");
 SYSCTL_LONG(_net_necp, NECPCTL_SOCKET_POLICY_COUNT, socket_policy_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_kernel_socket_policies_count, "");
 SYSCTL_LONG(_net_necp, NECPCTL_SOCKET_NON_APP_POLICY_COUNT, socket_non_app_policy_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_kernel_socket_policies_non_app_count, "");
 SYSCTL_LONG(_net_necp, NECPCTL_IP_POLICY_COUNT, ip_policy_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_kernel_ip_output_policies_count, "");
 SYSCTL_INT(_net_necp, NECPCTL_SESSION_COUNT, session_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_session_count, 0, "");
 
+static struct necp_drop_dest_policy necp_drop_dest_policy;
+static int necp_drop_dest_debug = 0;    // 0: off, 1: match, >1: every evaluation
+SYSCTL_INT(_net_necp, OID_AUTO, drop_dest_debug, CTLFLAG_LOCKED | CTLFLAG_RW, &necp_drop_dest_debug, 0, "");
+
+static int sysctl_handle_necp_drop_dest_level SYSCTL_HANDLER_ARGS;
+SYSCTL_PROC(_net_necp, OID_AUTO, drop_dest_level, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_RW,
+    0, 0, &sysctl_handle_necp_drop_dest_level, "S,necp_drop_dest_level", "");
+
+static bool necp_address_matches_drop_dest_policy(union necp_sockaddr_union *, u_int32_t);
+
 // Session order allocation
 static u_int32_t
 necp_allocate_new_session_order(u_int32_t priority, u_int32_t control_unit)
@@ -465,6 +532,9 @@ necp_allocate_new_session_order(u_int32_t priority, u_int32_t control_unit)
 static inline u_int32_t
 necp_get_first_order_for_priority(u_int32_t priority)
 {
+       if (priority == 0) {
+               return 0;
+       }
        return ((priority - 1) * 1000) + 1;
 }
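For reference, the mapping this produces, written as a hypothetical standalone check:

    assert(necp_get_first_order_for_priority(0) == 0);    /* disabled */
    assert(necp_get_first_order_for_priority(1) == 1);    /* highest-priority band */
    assert(necp_get_first_order_for_priority(2) == 1001); /* each band spans 1000 orders */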
 
@@ -474,72 +544,76 @@ sysctl_handle_necp_level SYSCTL_HANDLER_ARGS
 {
 #pragma unused(arg1, arg2)
        int error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
-       if (necp_drop_all_level == 0) {
-               necp_drop_all_order = 0;
+       necp_drop_all_order = necp_get_first_order_for_priority(necp_drop_all_level);
+       return error;
+}
+
+static int
+sysctl_handle_necp_unentitled_level SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+       necp_drop_unentitled_order = necp_get_first_order_for_priority(necp_drop_unentitled_level);
+       return error;
+}
+
+// Use a macro here to avoid computing the kauth_cred_t when necp_drop_unentitled_order is 0
+static inline u_int32_t
+_necp_process_drop_order_inner(kauth_cred_t cred)
+{
+       if (priv_check_cred(cred, PRIV_NET_PRIVILEGED_CLIENT_ACCESS, 0) != 0 &&
+           priv_check_cred(cred, PRIV_NET_PRIVILEGED_SERVER_ACCESS, 0) != 0) {
+               return necp_drop_unentitled_order;
        } else {
-               necp_drop_all_order = necp_get_first_order_for_priority(necp_drop_all_level);
+               return 0;
        }
-       return error;
 }
 
+#define necp_process_drop_order(_cred) (necp_drop_unentitled_order != 0 ? _necp_process_drop_order_inner(_cred) : necp_drop_unentitled_order)
+#pragma GCC poison _necp_process_drop_order_inner
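The gate-then-poison pattern in isolation, as a sketch with hypothetical names: a macro defined before the #pragma may still expand to the poisoned identifier, so the cheap global test always runs first while direct calls fail to compile.

    static int gate_enabled;
    static inline int expensive_check_inner(void) { /* costly work */ return 1; }
    #define expensive_check() (gate_enabled ? expensive_check_inner() : 0)
    #pragma GCC poison expensive_check_inner
    /* expensive_check() still compiles; a direct call to
     * expensive_check_inner() is now a compile-time error. */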
+
 // Session fd
 
-static int noop_read(struct fileproc *, struct uio *, int, vfs_context_t);
-static int noop_write(struct fileproc *, struct uio *, int, vfs_context_t);
-static int noop_ioctl(struct fileproc *, unsigned long, caddr_t,
-    vfs_context_t);
-static int noop_select(struct fileproc *, int, void *, vfs_context_t);
 static int necp_session_op_close(struct fileglob *, vfs_context_t);
-static int noop_kqfilter(struct fileproc *, struct knote *,
-    struct kevent_internal_s *, vfs_context_t);
 
 static const struct fileops necp_session_fd_ops = {
-       .fo_type = DTYPE_NETPOLICY,
-       .fo_read = noop_read,
-       .fo_write = noop_write,
-       .fo_ioctl = noop_ioctl,
-       .fo_select = noop_select,
-       .fo_close = necp_session_op_close,
-       .fo_kqfilter = noop_kqfilter,
-       .fo_drain = NULL,
+       .fo_type     = DTYPE_NETPOLICY,
+       .fo_read     = fo_no_read,
+       .fo_write    = fo_no_write,
+       .fo_ioctl    = fo_no_ioctl,
+       .fo_select   = fo_no_select,
+       .fo_close    = necp_session_op_close,
+       .fo_drain    = fo_no_drain,
+       .fo_kqfilter = fo_no_kqfilter,
 };
 
-static int
-noop_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
-{
-#pragma unused(fp, uio, flags, ctx)
-       return ENXIO;
-}
-
-static int
-noop_write(struct fileproc *fp, struct uio *uio, int flags,
-    vfs_context_t ctx)
-{
-#pragma unused(fp, uio, flags, ctx)
-       return ENXIO;
-}
-
-static int
-noop_ioctl(struct fileproc *fp, unsigned long com, caddr_t data,
-    vfs_context_t ctx)
+static inline necp_drop_all_bypass_check_result_t
+necp_check_drop_all_bypass_result(proc_t proc)
 {
-#pragma unused(fp, com, data, ctx)
-       return ENOTTY;
-}
+       if (proc == NULL) {
+               proc = current_proc();
+               if (proc == NULL) {
+                       return NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE;
+               }
+       }
 
-static int
-noop_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
-{
-#pragma unused(fp, which, wql, ctx)
-       return ENXIO;
-}
+#if defined(XNU_TARGET_OS_OSX)
+       const char *signing_id = NULL;
+       const bool isConfigd = (csproc_get_platform_binary(proc) &&
+           (signing_id = cs_identity_get(proc)) &&
+           (strlen(signing_id) == SIGNING_ID_CONFIGD_LEN) &&
+           (memcmp(signing_id, SIGNING_ID_CONFIGD, SIGNING_ID_CONFIGD_LEN) == 0));
+       if (isConfigd) {
+               return NECP_DROP_ALL_BYPASS_CHECK_RESULT_TRUE;
+       }
+#endif
 
-static int
-noop_kqfilter(struct fileproc *fp, struct knote *kn,
-    struct kevent_internal_s *kev, vfs_context_t ctx)
-{
-#pragma unused(fp, kn, kev, ctx)
-       return ENXIO;
+       const task_t task = proc_task(proc);
+       if (task == NULL || !IOTaskHasEntitlement(task, "com.apple.private.necp.drop_all_bypass")) {
+               return NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE;
+       } else {
+               return NECP_DROP_ALL_BYPASS_CHECK_RESULT_TRUE;
+       }
 }
 
 int
@@ -1124,6 +1198,117 @@ static errno_t necp_ctl_setopt(kern_ctl_ref kctlref, u_int32_t unit, void *uniti
 
 static bool necp_send_ctl_data(struct necp_session *session, u_int8_t *buffer, size_t buffer_size);
 
+struct necp_resolver_key_state {
+       const struct ccdigest_info *digest_info;
+       uint8_t key[CCSHA256_OUTPUT_SIZE];
+};
+static struct necp_resolver_key_state s_necp_resolver_key_state;
+
+static void
+necp_generate_resolver_key(void)
+{
+       s_necp_resolver_key_state.digest_info = ccsha256_di();
+       cc_rand_generate(s_necp_resolver_key_state.key, sizeof(s_necp_resolver_key_state.key));
+}
+
+static void
+necp_sign_update_context(const struct ccdigest_info *di,
+    cchmac_ctx_t ctx,
+    uuid_t client_id,
+    u_int8_t *query,
+    u_int32_t query_length,
+    u_int8_t *answer,
+    u_int32_t answer_length)
+{
+       const uint8_t context[32] = {[0 ... 31] = 0x20}; // 0x20 repeated 32 times
+       const char *context_string = "NECP Resolver Binder";
+       uint8_t separator = 0;
+       cchmac_update(di, ctx, sizeof(context), context);
+       cchmac_update(di, ctx, strlen(context_string), context_string);
+       cchmac_update(di, ctx, sizeof(separator), &separator);
+       cchmac_update(di, ctx, sizeof(uuid_t), client_id);
+       cchmac_update(di, ctx, sizeof(query_length), &query_length);
+       cchmac_update(di, ctx, query_length, query);
+       cchmac_update(di, ctx, sizeof(answer_length), &answer_length);
+       cchmac_update(di, ctx, answer_length, answer);
+}
+
+int
+necp_sign_resolver_answer(uuid_t client_id, u_int8_t *query, u_int32_t query_length,
+    u_int8_t *answer, u_int32_t answer_length,
+    u_int8_t *tag, u_int32_t *out_tag_length)
+{
+       if (s_necp_resolver_key_state.digest_info == NULL) {
+               return EINVAL;
+       }
+
+       if (query == NULL ||
+           query_length == 0 ||
+           answer == NULL ||
+           answer_length == 0 ||
+           tag == NULL ||
+           out_tag_length == NULL) {
+               return EINVAL;
+       }
+
+       size_t required_tag_length = s_necp_resolver_key_state.digest_info->output_size;
+       if (*out_tag_length < required_tag_length) {
+               return ERANGE;
+       }
+
+       *out_tag_length = required_tag_length;
+
+       cchmac_ctx_decl(s_necp_resolver_key_state.digest_info->state_size,
+           s_necp_resolver_key_state.digest_info->block_size, ctx);
+       cchmac_init(s_necp_resolver_key_state.digest_info, ctx,
+           sizeof(s_necp_resolver_key_state.key),
+           s_necp_resolver_key_state.key);
+       necp_sign_update_context(s_necp_resolver_key_state.digest_info,
+           ctx, client_id, query, query_length,
+           answer, answer_length);
+       cchmac_final(s_necp_resolver_key_state.digest_info, ctx, tag);
+
+       return 0;
+}
+
+bool
+necp_validate_resolver_answer(uuid_t client_id, u_int8_t *query, u_int32_t query_length,
+    u_int8_t *answer, u_int32_t answer_length,
+    u_int8_t *tag, u_int32_t tag_length)
+{
+       if (s_necp_resolver_key_state.digest_info == NULL) {
+               return false;
+       }
+
+       if (query == NULL ||
+           query_length == 0 ||
+           answer == NULL ||
+           answer_length == 0 ||
+           tag == NULL ||
+           tag_length == 0) {
+               return false;
+       }
+
+       size_t required_tag_length = s_necp_resolver_key_state.digest_info->output_size;
+       if (tag_length != required_tag_length) {
+               return false;
+       }
+
+       uint8_t actual_tag[required_tag_length];
+
+       cchmac_ctx_decl(s_necp_resolver_key_state.digest_info->state_size,
+           s_necp_resolver_key_state.digest_info->block_size, ctx);
+       cchmac_init(s_necp_resolver_key_state.digest_info, ctx,
+           sizeof(s_necp_resolver_key_state.key),
+           s_necp_resolver_key_state.key);
+       necp_sign_update_context(s_necp_resolver_key_state.digest_info,
+           ctx, client_id, query, query_length,
+           answer, answer_length);
+       cchmac_final(s_necp_resolver_key_state.digest_info, ctx, actual_tag);
+
+       return cc_cmp_safe(s_necp_resolver_key_state.digest_info->output_size, tag, actual_tag) == 0;
+}
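A sign-then-validate round trip, as a sketch; the buffers are illustrative and the resolver key must already have been generated by necp_init():

    uuid_t client_id;
    uuid_generate(client_id);
    uint8_t query[]  = "example.com";   /* opaque query bytes */
    uint8_t answer[] = "192.0.2.1";     /* opaque answer bytes */
    uint8_t tag[CCSHA256_OUTPUT_SIZE];
    u_int32_t tag_length = sizeof(tag);

    if (necp_sign_resolver_answer(client_id, query, sizeof(query),
        answer, sizeof(answer), tag, &tag_length) == 0) {
            /* true as long as the same key, client ID, query, and answer
             * are presented with the tag */
            bool valid = necp_validate_resolver_answer(client_id, query, sizeof(query),
                answer, sizeof(answer), tag, tag_length);
    }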
+
 errno_t
 necp_init(void)
 {
@@ -1196,6 +1381,8 @@ necp_init(void)
        LIST_INIT(&necp_route_rules);
        LIST_INIT(&necp_aggregate_route_rules);
 
+       necp_generate_resolver_key();
+
        necp_uuid_app_id_hashtbl = hashinit(NECP_UUID_APP_ID_HASH_SIZE, M_NECP, &necp_uuid_app_id_hash_mask);
        necp_uuid_app_id_hash_num_buckets = necp_uuid_app_id_hash_mask + 1;
        necp_num_uuid_app_id_mappings = 0;
@@ -1217,6 +1404,8 @@ necp_init(void)
        memset(&necp_kernel_ip_output_policies_map, 0, sizeof(necp_kernel_ip_output_policies_map));
        necp_kernel_socket_policies_app_layer_map = NULL;
 
+       necp_drop_unentitled_order = necp_get_first_order_for_priority(necp_drop_unentitled_level);
+
 done:
        if (result != 0) {
                if (necp_kernel_policy_mtx_attr != NULL) {
@@ -1442,7 +1631,8 @@ necp_buffer_write_tlv_if_different(u_int8_t *cursor, u_int8_t type,
     u_int8_t *buffer, u_int32_t buffer_length)
 {
        if (!necp_buffer_write_tlv_validate(cursor, type, length, buffer, buffer_length)) {
-               return NULL;
+               // If we can't fit this TLV, return the current cursor
+               return cursor;
        }
        u_int8_t *next_tlv = (u_int8_t *)(cursor + sizeof(type) + sizeof(length) + length);
        if (*updated || *(u_int8_t *)(cursor) != type) {
@@ -1524,9 +1714,15 @@ necp_buffer_get_tlv_value(u_int8_t *buffer, int tlv_offset, u_int32_t *value_siz
 }
 
 int
-necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_int8_t type, int next)
+necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_int8_t type, int *err, int next)
 {
+       if (err != NULL) {
+               *err = ENOENT;
+       }
        if (offset < 0) {
+               if (err != NULL) {
+                       *err = EINVAL;
+               }
                return -1;
        }
        int cursor = offset;
@@ -1553,6 +1749,9 @@ necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_in
                if (curr_type == type) {
                        // check if entire TLV fits inside buffer
                        if (((u_int32_t)next_cursor) <= buffer_length) {
+                               if (err != NULL) {
+                                       *err = 0;
+                               }
                                return cursor;
                        } else {
                                return -1;
@@ -1569,7 +1768,7 @@ necp_find_tlv(mbuf_t packet, u_int8_t *buffer, u_int32_t buffer_length, int offs
        if (packet != NULL) {
                cursor = necp_packet_find_tlv(packet, offset, type, err, next);
        } else if (buffer != NULL) {
-               cursor = necp_buffer_find_tlv(buffer, buffer_length, offset, type, next);
+               cursor = necp_buffer_find_tlv(buffer, buffer_length, offset, type, err, next);
        }
        return cursor;
 }
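With the new out-parameter, a caller can distinguish a clean end-of-buffer from a bad offset when walking TLVs; a sketch over an illustrative flat buffer:

    int err = 0;
    int cursor = necp_buffer_find_tlv(buffer, buffer_length, 0,
        NECP_TLV_POLICY_CONDITION, &err, 0);
    while (cursor >= 0) {
            /* ... read the TLV at cursor ... */
            cursor = necp_buffer_find_tlv(buffer, buffer_length, cursor,
                NECP_TLV_POLICY_CONDITION, &err, 1 /* find next */);
    }
    /* err ends up ENOENT when the walk simply ran out of matching TLVs,
     * and EINVAL if a negative offset was passed in. */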
@@ -1981,7 +2180,8 @@ necp_policy_result_is_valid(u_int8_t *buffer, u_int32_t length)
        case NECP_POLICY_RESULT_PASS:
        case NECP_POLICY_RESULT_DROP:
        case NECP_POLICY_RESULT_ROUTE_RULES:
-       case NECP_POLICY_RESULT_SCOPED_DIRECT: {
+       case NECP_POLICY_RESULT_SCOPED_DIRECT:
+       case NECP_POLICY_RESULT_ALLOW_UNENTITLED: {
                validated = TRUE;
                break;
        }
@@ -2099,7 +2299,8 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli
            policy_result_type == NECP_POLICY_RESULT_ROUTE_RULES ||
            policy_result_type == NECP_POLICY_RESULT_USE_NETAGENT ||
            policy_result_type == NECP_POLICY_RESULT_NETAGENT_SCOPED ||
-           policy_result_type == NECP_POLICY_RESULT_SCOPED_DIRECT) ? TRUE : FALSE;
+           policy_result_type == NECP_POLICY_RESULT_SCOPED_DIRECT ||
+           policy_result_type == NECP_POLICY_RESULT_ALLOW_UNENTITLED) ? TRUE : FALSE;
        u_int32_t condition_length = necp_policy_condition_get_value_length_from_buffer(buffer, length);
        u_int8_t *condition_value = necp_policy_condition_get_value_pointer_from_buffer(buffer, length);
        u_int8_t type = necp_policy_condition_get_type_from_buffer(buffer, length);
@@ -2131,7 +2332,10 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli
        }
        case NECP_POLICY_CONDITION_DEFAULT:
        case NECP_POLICY_CONDITION_ALL_INTERFACES:
-       case NECP_POLICY_CONDITION_ENTITLEMENT: {
+       case NECP_POLICY_CONDITION_ENTITLEMENT:
+       case NECP_POLICY_CONDITION_PLATFORM_BINARY:
+       case NECP_POLICY_CONDITION_HAS_CLIENT:
+       case NECP_POLICY_CONDITION_LOCAL_NETWORKS: {
                if (!(flags & NECP_POLICY_CONDITION_FLAGS_NEGATIVE)) {
                        validated = TRUE;
                }
@@ -2181,6 +2385,43 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli
                }
                break;
        }
+       case NECP_POLICY_CONDITION_FLOW_IP_PROTOCOL: {
+               if (condition_length >= sizeof(u_int16_t)) {
+                       validated = TRUE;
+               }
+               break;
+       }
+       case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR:
+       case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR: {
+               if (condition_length >= sizeof(struct necp_policy_condition_addr) &&
+                   necp_address_is_valid(&((struct necp_policy_condition_addr *)(void *)condition_value)->address.sa)) {
+                       validated = TRUE;
+               }
+               break;
+       }
+       case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_RANGE:
+       case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_RANGE: {
+               if (condition_length >= sizeof(struct necp_policy_condition_addr_range) &&
+                   necp_address_is_valid(&((struct necp_policy_condition_addr_range *)(void *)condition_value)->start_address.sa) &&
+                   necp_address_is_valid(&((struct necp_policy_condition_addr_range *)(void *)condition_value)->end_address.sa)) {
+                       validated = TRUE;
+               }
+               break;
+       }
+       case NECP_POLICY_CONDITION_CLIENT_FLAGS: {
+               if (condition_length == 0 || condition_length >= sizeof(u_int32_t)) {
+                       validated = TRUE;
+               }
+               break;
+       }
+       case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_EMPTY: {
+               validated = TRUE;
+               break;
+       }
+       case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_EMPTY: {
+               validated = TRUE;
+               break;
+       }
        default: {
                validated = FALSE;
                break;
@@ -2454,6 +2695,11 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_
 
        // Read policy result
        cursor = necp_find_tlv(packet, tlv_buffer, tlv_buffer_length, offset, NECP_TLV_POLICY_RESULT, &error, 0);
+       if (error || cursor < 0) {
+               NECPLOG(LOG_ERR, "Failed to find policy result TLV: %d", error);
+               response_error = NECP_ERROR_INVALID_TLV;
+               goto fail;
+       }
        error = necp_get_tlv_at_offset(packet, tlv_buffer, tlv_buffer_length, cursor, 0, NULL, &policy_result_size);
        if (error || policy_result_size == 0) {
                NECPLOG(LOG_ERR, "Failed to get policy result length: %d", error);
@@ -2490,8 +2736,12 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_
                    cursor = necp_find_tlv(packet, tlv_buffer, tlv_buffer_length, cursor, NECP_TLV_ROUTE_RULE, &error, 1)) {
                        u_int32_t route_rule_size = 0;
                        necp_get_tlv_at_offset(packet, tlv_buffer, tlv_buffer_length, cursor, 0, NULL, &route_rule_size);
-                       if (route_rule_size > 0) {
-                               route_rules_array_size += (sizeof(u_int8_t) + sizeof(u_int32_t) + route_rule_size);
+                       if (os_add_overflow(route_rules_array_size,
+                           (sizeof(u_int8_t) + sizeof(u_int32_t) + route_rule_size),
+                           &route_rules_array_size)) {
+                               NECPLOG0(LOG_ERR, "Route rules size overflowed, too large");
+                               response_error = NECP_ERROR_INVALID_TLV;
+                               goto fail;
                        }
                }
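The overflow-checked accumulation used above, in isolation (a sketch; os_add_overflow returns true when the sum would wrap the destination type):

    u_int32_t total = 0;
    u_int32_t item_size = 0xFFFFFFF0;   /* illustrative attacker-controlled length */
    if (os_add_overflow(total, sizeof(u_int8_t) + sizeof(u_int32_t) + item_size, &total)) {
            /* reject the input rather than allocating an undersized buffer */
    }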
 
@@ -2519,7 +2769,8 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_
                        u_int8_t route_rule_type = NECP_TLV_ROUTE_RULE;
                        u_int32_t route_rule_size = 0;
                        necp_get_tlv_at_offset(packet, tlv_buffer, tlv_buffer_length, cursor, 0, NULL, &route_rule_size);
-                       if (route_rule_size > 0 && route_rule_size <= (route_rules_array_size - route_rules_array_cursor)) {
+                       if (route_rule_size > 0 &&
+                           (sizeof(route_rule_type) + sizeof(route_rule_size) + route_rule_size) <= (route_rules_array_size - route_rules_array_cursor)) {
                                // Add type
                                memcpy((route_rules_array + route_rules_array_cursor), &route_rule_type, sizeof(route_rule_type));
                                route_rules_array_cursor += sizeof(route_rule_type);
@@ -2559,7 +2810,13 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_
                necp_get_tlv_at_offset(packet, tlv_buffer, tlv_buffer_length, cursor, 0, NULL, &condition_size);
 
                if (condition_size > 0) {
-                       conditions_array_size += (sizeof(u_int8_t) + sizeof(u_int32_t) + condition_size);
+                       if (os_add_overflow(conditions_array_size,
+                           (sizeof(u_int8_t) + sizeof(u_int32_t) + condition_size),
+                           &conditions_array_size)) {
+                               NECPLOG0(LOG_ERR, "Conditions size overflowed, too large");
+                               response_error = NECP_ERROR_INVALID_TLV;
+                               goto fail;
+                       }
                }
        }
 
@@ -2587,7 +2844,8 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_
                u_int8_t condition_type = NECP_TLV_POLICY_CONDITION;
                u_int32_t condition_size = 0;
                necp_get_tlv_at_offset(packet, tlv_buffer, tlv_buffer_length, cursor, 0, NULL, &condition_size);
-               if (condition_size > 0 && condition_size <= (conditions_array_size - conditions_array_cursor)) {
+               if (condition_size > 0 &&
+                   (sizeof(condition_type) + sizeof(condition_size) + condition_size) <= (conditions_array_size - conditions_array_cursor)) {
                        // Add type
                        memcpy((conditions_array + conditions_array_cursor), &condition_type, sizeof(condition_type));
                        conditions_array_cursor += sizeof(condition_type);
@@ -3035,6 +3293,9 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id,
                        if (condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES) {
                                num_conditions++;
                        }
+                       if (condition_mask & NECP_KERNEL_CONDITION_HAS_CLIENT) {
+                               num_conditions++;
+                       }
                        if (condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) {
                                snprintf(if_name, IFXNAMSIZ, "%s%d", ifnet_name(policy->cond_bound_interface), ifnet_unit(policy->cond_bound_interface));
                                condition_tlv_length += strlen(if_name) + 1;
@@ -3086,6 +3347,12 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id,
                                condition_tlv_length += entitlement_len;
                                num_conditions++;
                        }
+                       if (condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) {
+                               num_conditions++;
+                       }
+                       if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) {
+                               num_conditions++;
+                       }
                        if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) {
                                if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) {
                                        condition_tlv_length += sizeof(struct necp_policy_condition_addr_range);
@@ -3106,6 +3373,16 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id,
                                condition_tlv_length += sizeof(struct necp_policy_condition_agent_type);
                                num_conditions++;
                        }
+                       if (condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) {
+                               condition_tlv_length += sizeof(u_int32_t);
+                               num_conditions++;
+                       }
+                       if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_EMPTY) {
+                               num_conditions++;
+                       }
+                       if (condition_mask & NECP_KERNEL_CONDITION_REMOTE_EMPTY) {
+                               num_conditions++;
+                       }
                }
 
                condition_tlv_length += num_conditions * (sizeof(u_int8_t) + sizeof(u_int32_t)); // These are for the condition TLVs. The space for "value" is already accounted for above.
@@ -3148,6 +3425,12 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id,
                        if (condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES) {
                                cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_ALL_INTERFACES, 0, "", cond_buf, condition_tlv_length);
                        }
+                       if (condition_mask & NECP_KERNEL_CONDITION_HAS_CLIENT) {
+                               cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_HAS_CLIENT, 0, "", cond_buf, condition_tlv_length);
+                       }
+                       if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) {
+                               cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_LOCAL_NETWORKS, 0, "", cond_buf, condition_tlv_length);
+                       }
                        if (condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) {
                                cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_BOUND_INTERFACE, strlen(if_name) + 1,
                                    if_name, cond_buf, condition_tlv_length);
@@ -3200,6 +3483,9 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id,
                                cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_ENTITLEMENT, strlen(policy->cond_custom_entitlement) + 1, policy->cond_custom_entitlement,
                                    cond_buf, condition_tlv_length);
                        }
+                       if (condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) {
+                               cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_PLATFORM_BINARY, 0, "", cond_buf, condition_tlv_length);
+                       }
                        if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) {
                                if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) {
                                        struct necp_policy_condition_addr_range range;
@@ -3235,6 +3521,15 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id,
                                    sizeof(policy->cond_agent_type), &policy->cond_agent_type,
                                    cond_buf, condition_tlv_length);
                        }
+                       if (condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) {
+                               cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_CLIENT_FLAGS, sizeof(policy->cond_client_flags), &policy->cond_client_flags, cond_buf, condition_tlv_length);
+                       }
+                       if (condition_mask & NECP_KERNEL_CONDITION_LOCAL_EMPTY) {
+                               cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_EMPTY, 0, "", cond_buf, condition_tlv_length);
+                       }
+                       if (condition_mask & NECP_KERNEL_CONDITION_REMOTE_EMPTY) {
+                               cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_EMPTY, 0, "", cond_buf, condition_tlv_length);
+                       }
                }
 
                cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_CONDITION, cond_buf_cursor - cond_buf, cond_buf, tlv_buffer, total_allocated_bytes);
@@ -3606,6 +3901,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
        union necp_sockaddr_union cond_remote_start;
        union necp_sockaddr_union cond_remote_end;
        u_int8_t cond_remote_prefix = 0;
+       u_int32_t cond_client_flags = 0;
        u_int32_t offset = 0;
        u_int8_t ultimate_result = 0;
        u_int32_t secondary_result = 0;
@@ -3642,6 +3938,11 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                        socket_ip_conditions = TRUE;
                        break;
                }
+               case NECP_POLICY_CONDITION_HAS_CLIENT: {
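+                       // Whether a flow has an NECP client is only known at the socket layer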
+                       master_condition_mask |= NECP_KERNEL_CONDITION_HAS_CLIENT;
+                       socket_only_conditions = TRUE;
+                       break;
+               }
                case NECP_POLICY_CONDITION_ENTITLEMENT: {
                        if (condition_length > 0) {
                                if (cond_custom_entitlement == NULL) {
@@ -3657,6 +3958,11 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                        }
                        break;
                }
+               case NECP_POLICY_CONDITION_PLATFORM_BINARY: {
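+                       // Platform-binary status is derived from the process code signature; socket layer only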
+                       master_condition_mask |= NECP_KERNEL_CONDITION_PLATFORM_BINARY;
+                       socket_only_conditions = TRUE;
+                       break;
+               }
                case NECP_POLICY_CONDITION_DOMAIN: {
                        // Make sure there is only one such rule
                        if (condition_length > 0 && cond_domain == NULL) {
@@ -3781,18 +4087,29 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                        }
                        break;
                }
-               case NECP_POLICY_CONDITION_IP_PROTOCOL: {
+               case NECP_POLICY_CONDITION_IP_PROTOCOL:
+               case NECP_POLICY_CONDITION_FLOW_IP_PROTOCOL: {
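+                       // The FLOW_ variant of a condition restricts the match to the socket layer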
                        if (condition_length >= sizeof(u_int16_t)) {
                                master_condition_mask |= NECP_KERNEL_CONDITION_PROTOCOL;
                                if (condition_is_negative) {
                                        master_condition_negated_mask |= NECP_KERNEL_CONDITION_PROTOCOL;
                                }
                                memcpy(&cond_protocol, condition_value, sizeof(cond_protocol));
-                               socket_ip_conditions = TRUE;
+                               if (condition_type == NECP_POLICY_CONDITION_FLOW_IP_PROTOCOL) {
+                                       socket_only_conditions = TRUE;
+                               } else {
+                                       socket_ip_conditions = TRUE;
+                               }
                        }
                        break;
                }
-               case NECP_POLICY_CONDITION_LOCAL_ADDR: {
+               case NECP_POLICY_CONDITION_LOCAL_NETWORKS: {
+                       master_condition_mask |= NECP_KERNEL_CONDITION_LOCAL_NETWORKS;
+                       socket_ip_conditions = TRUE;
+                       break;
+               }
+               case NECP_POLICY_CONDITION_LOCAL_ADDR:
+               case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR: {
                        struct necp_policy_condition_addr *address_struct = (struct necp_policy_condition_addr *)(void *)condition_value;
                        if (!necp_address_is_valid(&address_struct->address.sa)) {
                                break;
@@ -3806,10 +4123,15 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                                master_condition_negated_mask |= NECP_KERNEL_CONDITION_LOCAL_START;
                                master_condition_negated_mask |= NECP_KERNEL_CONDITION_LOCAL_PREFIX;
                        }
-                       socket_ip_conditions = TRUE;
+                       if (condition_type == NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR) {
+                               socket_only_conditions = TRUE;
+                       } else {
+                               socket_ip_conditions = TRUE;
+                       }
                        break;
                }
-               case NECP_POLICY_CONDITION_REMOTE_ADDR: {
+               case NECP_POLICY_CONDITION_REMOTE_ADDR:
+               case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR: {
                        struct necp_policy_condition_addr *address_struct = (struct necp_policy_condition_addr *)(void *)condition_value;
                        if (!necp_address_is_valid(&address_struct->address.sa)) {
                                break;
@@ -3823,10 +4145,15 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                                master_condition_negated_mask |= NECP_KERNEL_CONDITION_REMOTE_START;
                                master_condition_negated_mask |= NECP_KERNEL_CONDITION_REMOTE_PREFIX;
                        }
-                       socket_ip_conditions = TRUE;
+                       if (condition_type == NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR) {
+                               socket_only_conditions = TRUE;
+                       } else {
+                               socket_ip_conditions = TRUE;
+                       }
                        break;
                }
-               case NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE: {
+               case NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE:
+               case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_RANGE: {
                        struct necp_policy_condition_addr_range *address_struct = (struct necp_policy_condition_addr_range *)(void *)condition_value;
                        if (!necp_address_is_valid(&address_struct->start_address.sa) ||
                            !necp_address_is_valid(&address_struct->end_address.sa)) {
@@ -3841,10 +4168,15 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                                master_condition_negated_mask |= NECP_KERNEL_CONDITION_LOCAL_START;
                                master_condition_negated_mask |= NECP_KERNEL_CONDITION_LOCAL_END;
                        }
-                       socket_ip_conditions = TRUE;
+                       if (condition_type == NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_RANGE) {
+                               socket_only_conditions = TRUE;
+                       } else {
+                               socket_ip_conditions = TRUE;
+                       }
                        break;
                }
-               case NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE: {
+               case NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE:
+               case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_RANGE: {
                        struct necp_policy_condition_addr_range *address_struct = (struct necp_policy_condition_addr_range *)(void *)condition_value;
                        if (!necp_address_is_valid(&address_struct->start_address.sa) ||
                            !necp_address_is_valid(&address_struct->end_address.sa)) {
@@ -3859,7 +4191,11 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                                master_condition_negated_mask |= NECP_KERNEL_CONDITION_REMOTE_START;
                                master_condition_negated_mask |= NECP_KERNEL_CONDITION_REMOTE_END;
                        }
-                       socket_ip_conditions = TRUE;
+                       if (condition_type == NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_RANGE) {
+                               socket_only_conditions = TRUE;
+                       } else {
+                               socket_ip_conditions = TRUE;
+                       }
                        break;
                }
                case NECP_POLICY_CONDITION_AGENT_TYPE: {
@@ -3870,6 +4206,36 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                        }
                        break;
                }
+               case NECP_POLICY_CONDITION_CLIENT_FLAGS: {
+                       if (condition_is_negative) {
+                               master_condition_negated_mask |= NECP_KERNEL_CONDITION_CLIENT_FLAGS;
+                       }
+                       master_condition_mask |= NECP_KERNEL_CONDITION_CLIENT_FLAGS;
+                       socket_only_conditions = TRUE;
+                       if (condition_length >= sizeof(u_int32_t)) {
+                               memcpy(&cond_client_flags, condition_value, sizeof(cond_client_flags));
+                       } else {
+                               // An empty condition value means: match fallback traffic
+                               cond_client_flags = NECP_CLIENT_PARAMETER_FLAG_FALLBACK_TRAFFIC;
+                       }
+                       break;
+               }
+               case NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_EMPTY: {
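+                       // Matches flows whose local address is still unspecified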
+                       master_condition_mask |= NECP_KERNEL_CONDITION_LOCAL_EMPTY;
+                       if (condition_is_negative) {
+                               master_condition_negated_mask |= NECP_KERNEL_CONDITION_LOCAL_EMPTY;
+                       }
+                       socket_only_conditions = TRUE;
+                       break;
+               }
+               case NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_EMPTY: {
+                       master_condition_mask |= NECP_KERNEL_CONDITION_REMOTE_EMPTY;
+                       if (condition_is_negative) {
+                               master_condition_negated_mask |= NECP_KERNEL_CONDITION_REMOTE_EMPTY;
+                       }
+                       socket_only_conditions = TRUE;
+                       break;
+               }
                default: {
                        break;
                }
@@ -4017,13 +4383,23 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                socket_layer_non_id_conditions = TRUE;
                break;
        }
+       case NECP_POLICY_RESULT_ALLOW_UNENTITLED: {
+               socket_layer_non_id_conditions = TRUE;
+               break;
+       }
        case NECP_POLICY_RESULT_ROUTE_RULES: {
                if (policy->route_rules != NULL && policy->route_rules_size > 0) {
                        u_int32_t route_rule_id = necp_create_route_rule(&necp_route_rules, policy->route_rules, policy->route_rules_size);
                        if (route_rule_id > 0) {
                                policy->applied_route_rules_id = route_rule_id;
                                ultimate_result_parameter.route_rule_id = route_rule_id;
-                               socket_layer_non_id_conditions = TRUE;
+                               if (socket_only_conditions) { // socket_ip_conditions can be TRUE or FALSE
+                                       socket_layer_non_id_conditions = TRUE;
+                               } else if (socket_ip_conditions) {
+                                       socket_layer_non_id_conditions = TRUE;
+                                       ip_output_layer_non_id_conditions = TRUE;
+                                       ip_output_layer_non_id_only = TRUE; // Only apply route rules to packets that did not go through the socket layer
+                               }
                        }
                }
                break;
@@ -4034,7 +4410,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
        }
 
        if (socket_layer_non_id_conditions) {
-               necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, &cond_agent_type, ultimate_result, ultimate_result_parameter);
+               necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, &cond_agent_type, cond_client_flags, ultimate_result, ultimate_result_parameter);
 
                if (policy_id == 0) {
                        NECPLOG0(LOG_DEBUG, "Error applying socket kernel policy");
@@ -4050,6 +4426,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                if (ip_output_layer_non_id_only) {
                        condition_mask |= NECP_KERNEL_CONDITION_POLICY_ID;
                }
+
                necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS, session->session_order, session->proc_pid, condition_mask, master_condition_negated_mask, NECP_KERNEL_POLICY_ID_NONE, cond_bound_interface, 0, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter);
 
                if (policy_id == 0) {
@@ -4202,9 +4579,10 @@ necp_kernel_policy_get_new_id(bool socket_level)
        return newid;
 }
 
-#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT | NECP_KERNEL_CONDITION_AGENT_TYPE)
+#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT | NECP_KERNEL_CONDITION_AGENT_TYPE | NECP_KERNEL_CONDITION_HAS_CLIENT | NECP_KERNEL_CONDITION_LOCAL_NETWORKS | NECP_KERNEL_CONDITION_CLIENT_FLAGS | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_PLATFORM_BINARY)
+
 static necp_kernel_policy_id
-necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter)
+necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, u_int32_t cond_client_flags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter)
 {
        struct necp_kernel_socket_policy *new_kernel_policy = NULL;
        struct necp_kernel_socket_policy *tmp_kernel_policy = NULL;
@@ -4237,6 +4615,12 @@ necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order,
        if ((new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REMOTE_END) && (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REMOTE_PREFIX)) {
                new_kernel_policy->condition_mask &= ~NECP_KERNEL_CONDITION_REMOTE_PREFIX;
        }
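+       // An empty-address condition supersedes prefix/range conditions on the same side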
+       if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_EMPTY) {
+               new_kernel_policy->condition_mask &= ~(NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_LOCAL_END);
+       }
+       if ((new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REMOTE_EMPTY)) {
+               new_kernel_policy->condition_mask &= ~(NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_REMOTE_END);
+       }
        new_kernel_policy->condition_negated_mask = condition_negated_mask & new_kernel_policy->condition_mask;
 
        // Set condition values
@@ -4296,6 +4680,9 @@ necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order,
        if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE) {
                memcpy(&new_kernel_policy->cond_agent_type, cond_agent_type, sizeof(*cond_agent_type));
        }
+       if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) {
+               new_kernel_policy->cond_client_flags = cond_client_flags;
+       }
 
        new_kernel_policy->result = result;
        memcpy(&new_kernel_policy->result_parameter, &result_parameter, sizeof(result_parameter));
@@ -4407,9 +4794,13 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul
                snprintf(result_string, MAX_RESULT_STRING_LEN, "ScopedDirect");
                break;
        }
+       case NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED: {
+               snprintf(result_string, MAX_RESULT_STRING_LEN, "AllowUnentitled");
+               break;
+       }
        case NECP_KERNEL_POLICY_RESULT_ROUTE_RULES: {
                int index = 0;
-               char interface_names[IFXNAMSIZ][MAX_ROUTE_RULE_INTERFACES];
+               char interface_names[MAX_ROUTE_RULE_INTERFACES][IFXNAMSIZ];
                struct necp_route_rule *route_rule = necp_lookup_route_rule_locked(&necp_route_rules, result_parameter.route_rule_id);
                if (route_rule != NULL) {
                        for (index = 0; index < MAX_ROUTE_RULE_INTERFACES; index++) {
@@ -4422,11 +4813,12 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul
                        }
                        switch (route_rule->default_action) {
                        case NECP_ROUTE_RULE_DENY_INTERFACE:
-                               snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (Only %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
+                               snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (Only %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
                                    (route_rule->cellular_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Cell " : "",
                                    (route_rule->wifi_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "WiFi " : "",
                                    (route_rule->wired_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Wired " : "",
                                    (route_rule->expensive_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Exp " : "",
+                                   (route_rule->constrained_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Constrained " : "",
                                    (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? interface_names[0] : "",
                                    (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? " " : "",
                                    (route_rule->exception_if_actions[1] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? interface_names[1] : "",
@@ -4448,11 +4840,12 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul
                                    (route_rule->exception_if_actions[9] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? interface_names[9] : "");
                                break;
                        case NECP_ROUTE_RULE_ALLOW_INTERFACE:
-                               snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
+                               snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
                                    (route_rule->cellular_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!Cell " : "",
                                    (route_rule->wifi_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!WiFi " : "",
                                    (route_rule->wired_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!Wired " : "",
                                    (route_rule->expensive_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!Exp " : "",
+                                   (route_rule->constrained_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!Constrained " : "",
                                    (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!" : "",
                                    (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_DENY_INTERFACE) ? interface_names[0] : "",
                                    (route_rule->exception_if_actions[1] == NECP_ROUTE_RULE_DENY_INTERFACE) ? "!" : "",
@@ -4475,11 +4868,12 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul
                                    (route_rule->exception_if_actions[9] == NECP_ROUTE_RULE_DENY_INTERFACE) ? interface_names[9] : "");
                                break;
                        case NECP_ROUTE_RULE_QOS_MARKING:
-                               snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (QoSMarking %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
+                               snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (QoSMarking %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
                                    (route_rule->cellular_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Cell " : "",
                                    (route_rule->wifi_action == NECP_ROUTE_RULE_QOS_MARKING) ? "WiFi " : "",
                                    (route_rule->wired_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Wired " : "",
                                    (route_rule->expensive_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Exp " : "",
+                                   (route_rule->constrained_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Constrained " : "",
                                    (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[0] : "",
                                    (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_QOS_MARKING) ? " " : "",
                                    (route_rule->exception_if_actions[1] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[1] : "",
@@ -4627,7 +5021,8 @@ necp_kernel_socket_policy_results_overlap(struct necp_kernel_socket_policy *uppe
        } else if (upper_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER ||
            upper_policy->result == NECP_KERNEL_POLICY_RESULT_ROUTE_RULES ||
            upper_policy->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT ||
-           upper_policy->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED) {
+           upper_policy->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED ||
+           upper_policy->result == NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED) {
                // Filters and route rules never cancel out lower policies
                return FALSE;
        } else if (necp_kernel_socket_result_is_trigger_service_type(upper_policy)) {
@@ -4699,6 +5094,11 @@ necp_kernel_socket_policy_is_unnecessary(struct necp_kernel_socket_policy *polic
                        continue;
                }
 
+               // If the new policy matches Local Networks, the compared policy must also match it
+               if ((policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) && !(compared_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS)) {
+                       continue;
+               }
+
                // A default policy (no conditions) always makes lower policies unnecessary
                if (compared_policy->condition_mask == 0) {
                        return TRUE;
@@ -4764,6 +5164,11 @@ necp_kernel_socket_policy_is_unnecessary(struct necp_kernel_socket_policy *polic
                        continue;
                }
 
+               if (compared_policy->condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS &&
+                   compared_policy->cond_client_flags != policy->cond_client_flags) {
+                       continue;
+               }
+
                if (compared_policy->condition_mask & NECP_KERNEL_CONDITION_TRAFFIC_CLASS &&
                    !(compared_policy->cond_traffic_class.start_tc <= policy->cond_traffic_class.start_tc &&
                    compared_policy->cond_traffic_class.end_tc >= policy->cond_traffic_class.end_tc)) {
@@ -5020,7 +5425,7 @@ necp_create_string_to_id_mapping(struct necp_string_id_mapping_list *list, char
        existing_mapping = necp_lookup_string_to_id_locked(list, string);
        if (existing_mapping != NULL) {
                string_id = existing_mapping->id;
-               existing_mapping->refcount++;
+               os_ref_retain_locked(&existing_mapping->refcount);
        } else {
                struct necp_string_id_mapping *new_mapping = NULL;
                MALLOC(new_mapping, struct necp_string_id_mapping *, sizeof(struct necp_string_id_mapping), M_NECP, M_WAITOK);
@@ -5032,7 +5437,7 @@ necp_create_string_to_id_mapping(struct necp_string_id_mapping_list *list, char
                        if (new_mapping->string != NULL) {
                                memcpy(new_mapping->string, string, length);
                                new_mapping->id = necp_get_new_string_id();
-                               new_mapping->refcount = 1;
+                               os_ref_init(&new_mapping->refcount, &necp_refgrp);
                                LIST_INSERT_HEAD(list, new_mapping, chain);
                                string_id = new_mapping->id;
                        } else {
@@ -5053,7 +5458,7 @@ necp_remove_string_to_id_mapping(struct necp_string_id_mapping_list *list, char
 
        existing_mapping = necp_lookup_string_to_id_locked(list, string);
        if (existing_mapping != NULL) {
-               if (--existing_mapping->refcount == 0) {
+               if (os_ref_release_locked(&existing_mapping->refcount) == 0) {
                        LIST_REMOVE(existing_mapping, chain);
                        FREE(existing_mapping->string, M_NECP);
                        FREE(existing_mapping, M_NECP);
@@ -5138,7 +5543,7 @@ necp_lookup_route_rule_locked(struct necp_route_rule_list *list, u_int32_t route
 }
 
 static struct necp_route_rule *
-necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_int32_t default_action, u_int8_t cellular_action, u_int8_t wifi_action, u_int8_t wired_action, u_int8_t expensive_action, u_int32_t *if_indices, u_int8_t *if_actions)
+necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_int32_t default_action, u_int8_t cellular_action, u_int8_t wifi_action, u_int8_t wired_action, u_int8_t expensive_action, u_int8_t constrained_action, u_int32_t *if_indices, u_int8_t *if_actions)
 {
        struct necp_route_rule *searchentry = NULL;
        struct necp_route_rule *foundentry = NULL;
@@ -5148,7 +5553,8 @@ necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_i
                    searchentry->cellular_action == cellular_action &&
                    searchentry->wifi_action == wifi_action &&
                    searchentry->wired_action == wired_action &&
-                   searchentry->expensive_action == expensive_action) {
+                   searchentry->expensive_action == expensive_action &&
+                   searchentry->constrained_action == constrained_action) {
                        bool match_failed = FALSE;
                        size_t index_a = 0;
                        size_t index_b = 0;
@@ -5199,6 +5605,7 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
        u_int8_t wifi_action = NECP_ROUTE_RULE_NONE;
        u_int8_t wired_action = NECP_ROUTE_RULE_NONE;
        u_int8_t expensive_action = NECP_ROUTE_RULE_NONE;
+       u_int8_t constrained_action = NECP_ROUTE_RULE_NONE;
        u_int32_t if_indices[MAX_ROUTE_RULE_INTERFACES];
        size_t num_valid_indices = 0;
        memset(&if_indices, 0, sizeof(if_indices));
@@ -5241,6 +5648,9 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
                        if (rule_flags & NECP_ROUTE_RULE_FLAG_EXPENSIVE) {
                                expensive_action = rule_type;
                        }
+                       if (rule_flags & NECP_ROUTE_RULE_FLAG_CONSTRAINED) {
+                               constrained_action = rule_type;
+                       }
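+                       // An entry with no interface-type flags supplies the default action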
                        if (rule_flags == 0) {
                                default_action = rule_type;
                        }
@@ -5265,10 +5675,10 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
                offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length;
        }
 
-       existing_rule = necp_lookup_route_rule_by_contents_locked(list, default_action, cellular_action, wifi_action, wired_action, expensive_action, if_indices, if_actions);
+       existing_rule = necp_lookup_route_rule_by_contents_locked(list, default_action, cellular_action, wifi_action, wired_action, expensive_action, constrained_action, if_indices, if_actions);
        if (existing_rule != NULL) {
                route_rule_id = existing_rule->id;
-               existing_rule->refcount++;
+               os_ref_retain_locked(&existing_rule->refcount);
        } else {
                struct necp_route_rule *new_rule = NULL;
                MALLOC(new_rule, struct necp_route_rule *, sizeof(struct necp_route_rule), M_NECP, M_WAITOK);
@@ -5280,9 +5690,10 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
                        new_rule->wifi_action = wifi_action;
                        new_rule->wired_action = wired_action;
                        new_rule->expensive_action = expensive_action;
+                       new_rule->constrained_action = constrained_action;
                        memcpy(&new_rule->exception_if_indices, &if_indices, sizeof(if_indices));
                        memcpy(&new_rule->exception_if_actions, &if_actions, sizeof(if_actions));
-                       new_rule->refcount = 1;
+                       os_ref_init(&new_rule->refcount, &necp_refgrp);
                        LIST_INSERT_HEAD(list, new_rule, chain);
                }
        }
@@ -5323,7 +5734,7 @@ necp_remove_route_rule(struct necp_route_rule_list *list, u_int32_t route_rule_i
 
        existing_rule = necp_lookup_route_rule_locked(list, route_rule_id);
        if (existing_rule != NULL) {
-               if (--existing_rule->refcount == 0) {
+               if (os_ref_release_locked(&existing_rule->refcount) == 0) {
                        necp_remove_aggregate_route_rule_for_id(existing_rule->id);
                        LIST_REMOVE(existing_rule, chain);
                        FREE(existing_rule, M_NECP);
@@ -5361,16 +5772,10 @@ necp_create_aggregate_route_rule(u_int32_t *rule_ids)
        struct necp_aggregate_route_rule *new_rule = NULL;
        struct necp_aggregate_route_rule *existing_rule = NULL;
 
-       LIST_FOREACH(existing_rule, &necp_aggregate_route_rules, chain) {
-               if (memcmp(existing_rule->rule_ids, rule_ids, (sizeof(u_int32_t) * MAX_AGGREGATE_ROUTE_RULES)) == 0) {
-                       return existing_rule->id;
-               }
-       }
-
        lck_rw_lock_exclusive(&necp_route_rule_lock);
 
+       // Check, under the lock, whether the rule already exists
        LIST_FOREACH(existing_rule, &necp_aggregate_route_rules, chain) {
-               // Re-check, in case something else created the rule while we are waiting to lock
                if (memcmp(existing_rule->rule_ids, rule_ids, (sizeof(u_int32_t) * MAX_AGGREGATE_ROUTE_RULES)) == 0) {
                        lck_rw_done(&necp_route_rule_lock);
                        return existing_rule->id;
@@ -5494,9 +5899,9 @@ necp_create_uuid_app_id_mapping(uuid_t uuid, bool *allocated_mapping, bool uuid_
        existing_mapping = necp_uuid_lookup_app_id_locked(uuid);
        if (existing_mapping != NULL) {
                local_id = existing_mapping->id;
-               existing_mapping->refcount++;
+               os_ref_retain_locked(&existing_mapping->refcount);
                if (uuid_policy_table) {
-                       existing_mapping->table_refcount++;
+                       existing_mapping->table_usecount++;
                }
        } else {
                struct necp_uuid_id_mapping *new_mapping = NULL;
@@ -5504,11 +5909,11 @@ necp_create_uuid_app_id_mapping(uuid_t uuid, bool *allocated_mapping, bool uuid_
                if (new_mapping != NULL) {
                        uuid_copy(new_mapping->uuid, uuid);
                        new_mapping->id = necp_get_new_uuid_id(false);
-                       new_mapping->refcount = 1;
+                       os_ref_init(&new_mapping->refcount, &necp_refgrp);
                        if (uuid_policy_table) {
-                               new_mapping->table_refcount = 1;
+                               new_mapping->table_usecount = 1;
                        } else {
-                               new_mapping->table_refcount = 0;
+                               new_mapping->table_usecount = 0;
                        }
 
                        LIST_INSERT_HEAD(APPUUIDHASH(uuid), new_mapping, chain);
@@ -5538,9 +5943,9 @@ necp_remove_uuid_app_id_mapping(uuid_t uuid, bool *removed_mapping, bool uuid_po
        existing_mapping = necp_uuid_lookup_app_id_locked(uuid);
        if (existing_mapping != NULL) {
                if (uuid_policy_table) {
-                       existing_mapping->table_refcount--;
+                       existing_mapping->table_usecount--;
                }
-               if (--existing_mapping->refcount == 0) {
+               if (os_ref_release_locked(&existing_mapping->refcount) == 0) {
                        LIST_REMOVE(existing_mapping, chain);
                        FREE(existing_mapping, M_NECP);
                        if (removed_mapping) {
@@ -5618,14 +6023,14 @@ necp_create_uuid_service_id_mapping(uuid_t uuid)
        existing_mapping = necp_uuid_lookup_service_id_locked(uuid);
        if (existing_mapping != NULL) {
                local_id = existing_mapping->id;
-               existing_mapping->refcount++;
+               os_ref_retain_locked(&existing_mapping->refcount);
        } else {
                struct necp_uuid_id_mapping *new_mapping = NULL;
                MALLOC(new_mapping, struct necp_uuid_id_mapping *, sizeof(*new_mapping), M_NECP, M_WAITOK);
                if (new_mapping != NULL) {
                        uuid_copy(new_mapping->uuid, uuid);
                        new_mapping->id = necp_get_new_uuid_id(true);
-                       new_mapping->refcount = 1;
+                       os_ref_init(&new_mapping->refcount, &necp_refgrp);
 
                        LIST_INSERT_HEAD(&necp_uuid_service_id_list, new_mapping, chain);
 
@@ -5647,9 +6052,9 @@ necp_remove_uuid_service_id_mapping(uuid_t uuid)
 
        LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE);
 
-       existing_mapping = necp_uuid_lookup_app_id_locked(uuid);
+       existing_mapping = necp_uuid_lookup_service_id_locked(uuid);
        if (existing_mapping != NULL) {
-               if (--existing_mapping->refcount == 0) {
+               if (os_ref_release_locked(&existing_mapping->refcount) == 0) {
                        LIST_REMOVE(existing_mapping, chain);
                        FREE(existing_mapping, M_NECP);
                }
@@ -5676,7 +6081,7 @@ necp_kernel_socket_policies_update_uuid_table(void)
                        for (uuid_list_head = &necp_uuid_app_id_hashtbl[necp_uuid_app_id_hash_num_buckets - 1]; uuid_list_head >= necp_uuid_app_id_hashtbl; uuid_list_head--) {
                                struct necp_uuid_id_mapping *mapping = NULL;
                                LIST_FOREACH(mapping, uuid_list_head, chain) {
-                                       if (mapping->table_refcount > 0 &&
+                                       if (mapping->table_usecount > 0 &&
                                            proc_uuid_policy_kernel(PROC_UUID_POLICY_OPERATION_ADD, mapping->uuid, PROC_UUID_NECP_APP_POLICY) < 0) {
                                                NECPLOG0(LOG_DEBUG, "Error adding uuid to policy table\n");
                                        }
@@ -5690,7 +6095,7 @@ necp_kernel_socket_policies_update_uuid_table(void)
        return TRUE;
 }
 
-#define NECP_KERNEL_VALID_IP_OUTPUT_CONDITIONS (NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE)
+#define NECP_KERNEL_VALID_IP_OUTPUT_CONDITIONS (NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE | NECP_KERNEL_CONDITION_LOCAL_NETWORKS)
 static necp_kernel_policy_id
 necp_kernel_ip_output_policy_add(necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter)
 {
@@ -5905,6 +6310,11 @@ necp_kernel_ip_output_policy_is_unnecessary(struct necp_kernel_ip_output_policy
                        continue;
                }
 
+               // If the new policy matches Local Networks, the compared policy must also match it
+               if ((policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) && !(compared_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS)) {
+                       continue;
+               }
+
                // A default policy (no conditions) always makes lower policies unnecessary
                if (compared_policy->condition_mask == 0) {
                        return TRUE;
@@ -5999,8 +6409,10 @@ necp_kernel_ip_output_policies_reprocess(void)
 
                /* Update bucket counts:
                 * Non-id and SKIP policies will be added to all buckets
+                * Local-networks policies are likewise added to all buckets, to cover incoming IP
                 */
                if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID) ||
+                   (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) ||
                    kernel_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP) {
                        for (i = 0; i < NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS; i++) {
                                bucket_allocation_counts[i]++;
@@ -6030,6 +6442,7 @@ necp_kernel_ip_output_policies_reprocess(void)
        LIST_FOREACH(kernel_policy, &necp_kernel_ip_output_policies, chain) {
                // Insert pointers into map
                if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID) ||
+                   (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) ||
                    kernel_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP) {
                        for (i = 0; i < NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS; i++) {
                                if (!necp_kernel_ip_output_policy_is_unnecessary(kernel_policy, necp_kernel_ip_output_policies_map[i], bucket_current_free_index[i])) {
@@ -6167,11 +6580,35 @@ necp_hostname_matches_domain(struct substring hostname_substring, u_int8_t hostn
        return FALSE;
 }
 
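+// Returns true if the given hostname matches, or is a subdomain of, the given domain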
+bool
+net_domain_contains_hostname(char *hostname_string, char *domain_string)
+{
+       if (hostname_string == NULL ||
+           domain_string == NULL) {
+               return false;
+       }
+
+       struct substring hostname_substring;
+       hostname_substring.string = hostname_string;
+       hostname_substring.length = strlen(hostname_string);
+
+       return necp_hostname_matches_domain(hostname_substring,
+                  necp_count_dots(hostname_string, hostname_substring.length),
+                  domain_string,
+                  necp_count_dots(domain_string, strlen(domain_string)));
+}
+
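+// Upper bound for strings copied from policy TLVs; necp_copy_string rejects longer input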
+#define NECP_MAX_STRING_LEN 1024
+
 static char *
 necp_copy_string(char *string, size_t length)
 {
        char *copied_string = NULL;
 
+       if (length > NECP_MAX_STRING_LEN) {
+               return NULL;
+       }
+
        MALLOC(copied_string, char *, length + 1, M_NECP, M_WAITOK);
        if (copied_string == NULL) {
                return NULL;
@@ -6208,34 +6645,31 @@ static inline void
 necp_get_parent_cred_result(proc_t proc, struct necp_socket_info *info)
 {
        task_t task = proc_task(proc ? proc : current_proc());
-       coalition_t coal = COALITION_NULL;
-       Boolean is_leader = coalition_is_leader(task, COALITION_TYPE_JETSAM, &coal);
+       coalition_t coal = task_get_coalition(task, COALITION_TYPE_JETSAM);
 
-       if (is_leader == TRUE) {
+       if (coal == COALITION_NULL || coalition_is_leader(task, coal)) {
                // No parent, nothing to do
                return;
        }
 
-       if (coal != NULL) {
-               task_t lead_task = coalition_get_leader(coal);
-               if (lead_task != NULL) {
-                       proc_t lead_proc = get_bsdtask_info(lead_task);
-                       if (lead_proc != NULL) {
-                               kauth_cred_t lead_cred = kauth_cred_proc_ref(lead_proc);
-                               if (lead_cred != NULL) {
-                                       errno_t cred_result = priv_check_cred(lead_cred, PRIV_NET_PRIVILEGED_NECP_MATCH, 0);
-                                       kauth_cred_unref(&lead_cred);
-                                       info->cred_result = cred_result;
-                               }
+       task_t lead_task = coalition_get_leader(coal);
+       if (lead_task != NULL) {
+               proc_t lead_proc = get_bsdtask_info(lead_task);
+               if (lead_proc != NULL) {
+                       kauth_cred_t lead_cred = kauth_cred_proc_ref(lead_proc);
+                       if (lead_cred != NULL) {
+                               errno_t cred_result = priv_check_cred(lead_cred, PRIV_NET_PRIVILEGED_NECP_MATCH, 0);
+                               kauth_cred_unref(&lead_cred);
+                               info->cred_result = cred_result;
                        }
-                       task_deallocate(lead_task);
                }
+               task_deallocate(lead_task);
        }
 }
 
-#define NECP_KERNEL_ADDRESS_TYPE_CONDITIONS (NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX)
+#define NECP_KERNEL_ADDRESS_TYPE_CONDITIONS (NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_LOCAL_NETWORKS)
 static void
-necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, char *account, char *domain, pid_t pid, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, proc_t proc, struct necp_socket_info *info)
+necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, char *account, char *domain, pid_t pid, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, u_int16_t local_port, u_int16_t remote_port, bool has_client, proc_t proc, u_int32_t drop_order, u_int32_t client_flags, struct necp_socket_info *info)
 {
        memset(info, 0, sizeof(struct necp_socket_info));
 
@@ -6244,6 +6678,9 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic
        info->protocol = protocol;
        info->bound_interface_index = bound_interface_index;
        info->traffic_class = traffic_class;
+       info->has_client = has_client;
+       info->drop_order = drop_order;
+       info->client_flags = client_flags;
 
        if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_ENTITLEMENT && proc != NULL) {
                info->cred_result = priv_check_cred(proc_ucred(proc), PRIV_NET_PRIVILEGED_NECP_MATCH, 0);
@@ -6253,6 +6690,10 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic
                }
        }
 
+       if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY && proc != NULL) {
+               info->is_platform_binary = csproc_get_platform_binary(proc) ? true : false;
+       }
+
        if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_APP_ID && !uuid_is_null(application_uuid)) {
                struct necp_uuid_id_mapping *existing_mapping = necp_uuid_lookup_app_id_locked(application_uuid);
                if (existing_mapping) {
@@ -6285,9 +6726,23 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic
        if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_ADDRESS_TYPE_CONDITIONS) {
                if (local_addr && local_addr->sa.sa_len > 0) {
                        memcpy(&info->local_addr, local_addr, local_addr->sa.sa_len);
+                       if (local_port != 0) {
+                               info->local_addr.sin6.sin6_port = local_port;
+                       }
+               } else if (local_port != 0) {
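+                               // No local address: synthesize an empty IPv6 sockaddr carrying only the port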
+                       info->local_addr.sin6.sin6_len = sizeof(struct sockaddr_in6);
+                       info->local_addr.sin6.sin6_family = AF_INET6;
+                       info->local_addr.sin6.sin6_port = local_port;
                }
                if (remote_addr && remote_addr->sa.sa_len > 0) {
                        memcpy(&info->remote_addr, remote_addr, remote_addr->sa.sa_len);
+                       if (remote_port != 0) {
+                               info->remote_addr.sin6.sin6_port = remote_port;
+                       }
+               } else if (remote_port != 0) {
+                       info->remote_addr.sin6.sin6_len = sizeof(struct sockaddr_in6);
+                       info->remote_addr.sin6.sin6_family = AF_INET6;
+                       info->remote_addr.sin6.sin6_port = remote_port;
                }
        }
 }
@@ -6325,10 +6780,14 @@ necp_application_find_policy_match_internal(proc_t proc,
     u_int32_t parameters_size,
     struct necp_aggregate_result *returned_result,
     u_int32_t *flags,
+    u_int32_t *reason,
     u_int required_interface_index,
     const union necp_sockaddr_union *override_local_addr,
     const union necp_sockaddr_union *override_remote_addr,
-    struct rtentry **returned_route, bool ignore_address)
+    struct necp_client_endpoint *returned_v4_gateway,
+    struct necp_client_endpoint *returned_v6_gateway,
+    struct rtentry **returned_route, bool ignore_address,
+    bool has_client)
 {
        int error = 0;
        size_t offset = 0;
@@ -6336,7 +6795,6 @@ necp_application_find_policy_match_internal(proc_t proc,
        struct necp_kernel_socket_policy *matched_policy = NULL;
        struct necp_socket_info info;
        necp_kernel_policy_filter filter_control_unit = 0;
-       u_int32_t route_rule_id = 0;
        necp_kernel_policy_result service_action = 0;
        necp_kernel_policy_service service = { 0, 0 };
 
@@ -6349,6 +6807,9 @@ necp_application_find_policy_match_internal(proc_t proc,
        bool no_remote_addr = FALSE;
        u_int8_t remote_family = 0;
        bool no_local_addr = FALSE;
+       u_int16_t local_port = 0;
+       u_int16_t remote_port = 0;
+       necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE;
 
        if (override_local_addr) {
                memcpy(&local_addr, override_local_addr, sizeof(local_addr));
@@ -6392,11 +6853,23 @@ necp_application_find_policy_match_internal(proc_t proc,
                return EINVAL;
        }
 
+       if (returned_v4_gateway != NULL) {
+               memset(returned_v4_gateway, 0, sizeof(struct necp_client_endpoint));
+       }
+
+       if (returned_v6_gateway != NULL) {
+               memset(returned_v6_gateway, 0, sizeof(struct necp_client_endpoint));
+       }
+
        memset(returned_result, 0, sizeof(struct necp_aggregate_result));
 
+       u_int32_t drop_order = necp_process_drop_order(proc_ucred(proc));
+
+       necp_kernel_policy_result drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE;
+
        lck_rw_lock_shared(&necp_kernel_policy_lock);
        if (necp_kernel_application_policies_count == 0) {
-               if (necp_drop_all_order > 0) {
+               if (necp_drop_all_order > 0 || drop_order > 0) {
                        returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP;
                        lck_rw_done(&necp_kernel_policy_lock);
                        return 0;
@@ -6489,6 +6962,8 @@ necp_application_find_policy_match_internal(proc_t proc,
                                case NECP_CLIENT_PARAMETER_IP_PROTOCOL: {
                                        if (length >= sizeof(u_int16_t)) {
                                                memcpy(&protocol, value, sizeof(u_int16_t));
+                                       } else if (length >= sizeof(u_int8_t)) {
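+                                               // Also accept a single-byte protocol value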
+                                               memcpy(&protocol, value, sizeof(u_int8_t));
                                        }
                                        break;
                                }
@@ -6506,7 +6981,7 @@ necp_application_find_policy_match_internal(proc_t proc,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_LOCAL_ADDRESS: {
-                                       if (ignore_address) {
+                                       if (ignore_address || override_local_addr) {
                                                break;
                                        }
 
@@ -6519,7 +6994,7 @@ necp_application_find_policy_match_internal(proc_t proc,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_REMOTE_ADDRESS: {
-                                       if (ignore_address) {
+                                       if (ignore_address || override_remote_addr) {
                                                break;
                                        }
 
@@ -6531,18 +7006,49 @@ necp_application_find_policy_match_internal(proc_t proc,
                                        }
                                        break;
                                }
-                               case NECP_CLIENT_PARAMETER_FLAGS: {
-                                       if (length >= sizeof(client_flags)) {
-                                               memcpy(&client_flags, value, sizeof(client_flags));
+                               case NECP_CLIENT_PARAMETER_LOCAL_ENDPOINT: {
+                                       if (ignore_address || override_local_addr) {
+                                               break;
+                                       }
+
+                                       if (length >= sizeof(struct necp_client_endpoint)) {
+                                               struct necp_client_endpoint *endpoint = (struct necp_client_endpoint *)(void *)value;
+                                               if (endpoint->u.endpoint.endpoint_family == AF_UNSPEC &&
+                                                   endpoint->u.endpoint.endpoint_port != 0) {
+                                                       // Save port
+                                                       local_port = endpoint->u.endpoint.endpoint_port;
+                                               }
                                        }
                                        break;
                                }
-                               case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE: {
-                                       if (num_required_agent_types >= NECP_MAX_REQUIRED_AGENTS) {
+                               case NECP_CLIENT_PARAMETER_REMOTE_ENDPOINT: {
+                                       if (ignore_address || override_remote_addr) {
                                                break;
                                        }
-                                       if (length >= sizeof(struct necp_client_parameter_netagent_type)) {
-                                               memcpy(&required_agent_types[num_required_agent_types], value, sizeof(struct necp_client_parameter_netagent_type));
+
+                                       if (length >= sizeof(struct necp_client_endpoint)) {
+                                               struct necp_client_endpoint *endpoint = (struct necp_client_endpoint *)(void *)value;
+                                               if (endpoint->u.endpoint.endpoint_family == AF_UNSPEC &&
+                                                   endpoint->u.endpoint.endpoint_port != 0) {
+                                                       // Save port
+                                                       remote_port = endpoint->u.endpoint.endpoint_port;
+                                               }
+                                       }
+                                       break;
+                               }
+                               case NECP_CLIENT_PARAMETER_FLAGS: {
+                                       if (length >= sizeof(client_flags)) {
+                                               memcpy(&client_flags, value, sizeof(client_flags));
+                                       }
+                                       break;
+                               }
+                               case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE:
+                               case NECP_CLIENT_PARAMETER_PREFER_AGENT_TYPE: {
+                                       if (num_required_agent_types >= NECP_MAX_REQUIRED_AGENTS) {
+                                               break;
+                                       }
+                                       if (length >= sizeof(struct necp_client_parameter_netagent_type)) {
+                                               memcpy(&required_agent_types[num_required_agent_types], value, sizeof(struct necp_client_parameter_netagent_type));
                                                num_required_agent_types++;
                                        }
                                        break;
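Each parameter case above copies a value only when the advertised TLV length covers the whole target structure, which is what keeps the memcpy from over-reading the buffer. A minimal userspace sketch of that length-guarded copy pattern; the names tlv_param and parse_flags_param are illustrative, not XNU API:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative TLV container; not an XNU structure. */
struct tlv_param {
	uint32_t type;
	uint32_t length;        /* bytes valid in value[] */
	uint8_t  value[64];
};

/* Mirror of the `length >= sizeof(...)` guard: copy only when the
 * value is large enough for the whole target, else ignore it. */
static int
parse_flags_param(const struct tlv_param *param, uint32_t *out_flags)
{
	if (param->length < sizeof(*out_flags)) {
		return -1;      /* too short: skip rather than over-read */
	}
	memcpy(out_flags, param->value, sizeof(*out_flags));
	return 0;
}

int
main(void)
{
	struct tlv_param p = { .type = 1, .length = sizeof(uint32_t) };
	uint32_t flags = 0x5;
	memcpy(p.value, &flags, sizeof(flags));

	uint32_t parsed = 0;
	if (parse_flags_param(&p, &parsed) == 0) {
		printf("client flags: 0x%x\n", parsed);
	}
	return 0;
}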
@@ -6560,21 +7066,36 @@ necp_application_find_policy_match_internal(proc_t proc,
        // Lock
        lck_rw_lock_shared(&necp_kernel_policy_lock);
 
-       necp_application_fillout_info_locked(application_uuid, real_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, proc, &info);
-       matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, netagent_use_flags, NECP_MAX_NETAGENTS, required_agent_types, num_required_agent_types, proc, NULL);
+       u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES];
+       size_t route_rule_id_array_count = 0;
+       necp_application_fillout_info_locked(application_uuid, real_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, local_port, remote_port, has_client, proc, drop_order, client_flags, &info);
+       matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, &service_action, &service, netagent_ids, netagent_use_flags, NECP_MAX_NETAGENTS, required_agent_types, num_required_agent_types, proc, NULL, NULL, &drop_dest_policy_result, &drop_all_bypass);
        if (matched_policy) {
                returned_result->policy_id = matched_policy->id;
                returned_result->routing_result = matched_policy->result;
                memcpy(&returned_result->routing_result_parameter, &matched_policy->result_parameter, sizeof(returned_result->routing_result_parameter));
-       } else if (necp_drop_all_order > 0) {
-               // Mark socket as a drop if drop_all is set
-               returned_result->policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
-               returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP;
        } else {
-               returned_result->policy_id = 0;
-               returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_NONE;
+               bool drop_all = false;
+               if (necp_drop_all_order > 0 || info.drop_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP) {
+                       // Mark socket as a drop if any drop condition is set
+                       drop_all = true;
+                       if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) {
+                               drop_all_bypass = necp_check_drop_all_bypass_result(proc);
+                       }
+               }
+               if (drop_all && drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) {
+                       returned_result->policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                       returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP;
+               } else {
+                       returned_result->policy_id = 0;
+                       returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_NONE;
+               }
+       }
+       if (filter_control_unit == NECP_FILTER_UNIT_NO_FILTER) {
+               returned_result->filter_control_unit = 0;
+       } else {
+               returned_result->filter_control_unit = filter_control_unit;
        }
-       returned_result->filter_control_unit = filter_control_unit;
        returned_result->service_action = service_action;
 
        // Handle trigger service
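The no-match branch above combines three drop triggers (the global drop-all order, the per-process drop order, and a drop-by-destination result) and then honors a lazily resolved per-process bypass. A compact sketch of that decision, assuming hypothetical names and enum values:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the NECP bypass states; values hypothetical. */
enum bypass_result { BYPASS_NONE, BYPASS_TRUE, BYPASS_FALSE };

/* True when the flow should be dropped: some drop trigger fires and the
 * process has no drop-all bypass. (In the kernel, BYPASS_NONE would be
 * resolved via necp_check_drop_all_bypass_result() first.) */
static bool
should_drop(unsigned drop_all_order, unsigned proc_drop_order,
    bool drop_dest_matched, enum bypass_result bypass)
{
	bool drop_all = (drop_all_order > 0 || proc_drop_order > 0 ||
	    drop_dest_matched);
	return drop_all && bypass == BYPASS_FALSE;
}

int
main(void)
{
	printf("%d\n", should_drop(1, 0, false, BYPASS_FALSE));  /* 1: dropped */
	printf("%d\n", should_drop(1, 0, false, BYPASS_TRUE));   /* 0: bypassed */
	return 0;
}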
@@ -6671,7 +7192,7 @@ necp_application_find_policy_match_internal(proc_t proc,
                        returned_result->routed_interface_index = 0;
                }
 
-               if (no_remote_addr && remote_family == 0 &&
+               if (no_remote_addr && remote_family == AF_UNSPEC &&
                    (rt == NULL || rt->rt_ifp == NULL)) {
                        // Route lookup for default IPv4 failed, try IPv6
 
@@ -6834,13 +7355,28 @@ necp_application_find_policy_match_internal(proc_t proc,
                                }
 
                                // Check QoS marking (fastlane)
-                               if (necp_update_qos_marking(rt->rt_ifp, route_rule_id)) {
-                                       *flags |= NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING;
+                               for (size_t route_rule_index = 0; route_rule_index < route_rule_id_array_count; route_rule_index++) {
+                                       if (necp_update_qos_marking(rt->rt_ifp, route_rule_id_array[route_rule_index])) {
+                                               *flags |= NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING;
+                                               // If the route can use QoS markings, stop iterating route rules
+                                               break;
+                                       }
                                }
 
                                if (IFNET_IS_LOW_POWER(rt->rt_ifp)) {
                                        *flags |= NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER;
                                }
+
+                               if (traffic_class == SO_TC_BK_SYS) {
+                                       // Block BK_SYS traffic if interface is throttled
+                                       u_int32_t throttle_level = 0;
+                                       if (ifnet_get_throttle(rt->rt_ifp, &throttle_level) == 0) {
+                                               if (throttle_level == IFNET_THROTTLE_OPPORTUNISTIC) {
+                                                       returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP;
+                                                       memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter));
+                                               }
+                                       }
+                               }
                        }
                }
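The QoS-marking loop above is first-match-wins across the aggregated route rules: the first rule that allows marking sets the flag and stops the scan. A minimal sketch with a stub predicate standing in for necp_update_qos_marking() (all names hypothetical):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stub predicate; in this illustration even rule IDs "allow" marking. */
static bool
rule_allows_qos_marking(unsigned rule_id)
{
	return (rule_id % 2) == 0;
}

int
main(void)
{
	unsigned rule_ids[] = { 3, 7, 8, 10 };
	size_t count = sizeof(rule_ids) / sizeof(rule_ids[0]);
	bool allow_qos = false;

	/* First rule that permits marking wins; stop iterating, as above. */
	for (size_t i = 0; i < count; i++) {
		if (rule_allows_qos_marking(rule_ids[i])) {
			allow_qos = true;
			break;
		}
	}
	printf("allow QoS marking: %d\n", allow_qos);
	return 0;
}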
 
@@ -6867,6 +7403,12 @@ necp_application_find_policy_match_internal(proc_t proc,
                                if (v4Route->rt_ifp != NULL && !IS_INTF_CLAT46(v4Route->rt_ifp)) {
                                        *flags |= NECP_CLIENT_RESULT_FLAG_HAS_IPV4;
                                }
+                               if (returned_v4_gateway != NULL &&
+                                   v4Route->rt_gateway != NULL &&
+                                   v4Route->rt_gateway->sa_len == sizeof(returned_v4_gateway->u.sin)) {
+                                       memcpy(&returned_v4_gateway->u.sin, v4Route->rt_gateway, sizeof(returned_v4_gateway->u.sin));
+                                       memset(&returned_v4_gateway->u.sin.sin_zero, 0, sizeof(returned_v4_gateway->u.sin.sin_zero));
+                               }
                                rtfree(v4Route);
                                v4Route = NULL;
                        }
@@ -6879,21 +7421,56 @@ necp_application_find_policy_match_internal(proc_t proc,
                                                *flags |= NECP_CLIENT_RESULT_FLAG_HAS_NAT64;
                                        }
                                }
+                               if (returned_v6_gateway != NULL &&
+                                   v6Route->rt_gateway != NULL &&
+                                   v6Route->rt_gateway->sa_len == sizeof(returned_v6_gateway->u.sin6)) {
+                                       memcpy(&returned_v6_gateway->u.sin6, v6Route->rt_gateway, sizeof(returned_v6_gateway->u.sin6));
+                               }
                                rtfree(v6Route);
                                v6Route = NULL;
                        }
                }
        }
 
-       u_int32_t interface_type_denied = IFRTYPE_FUNCTIONAL_UNKNOWN;
-       bool route_is_allowed = necp_route_is_allowed(rt, NULL, route_rule_id, &interface_type_denied);
-       if (!route_is_allowed) {
-               // If the route is blocked, treat the lookup as a drop
-               returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP;
-               memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter));
+       for (size_t route_rule_index = 0; route_rule_index < route_rule_id_array_count; route_rule_index++) {
+               u_int32_t interface_type_denied = IFRTYPE_FUNCTIONAL_UNKNOWN;
+               bool route_is_allowed = necp_route_is_allowed(rt, NULL, route_rule_id_array[route_rule_index], &interface_type_denied);
+               if (!route_is_allowed) {
+                       // If the route is blocked, treat the lookup as a drop
+                       returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP;
+                       memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter));
+
+                       if (interface_type_denied != IFRTYPE_FUNCTIONAL_UNKNOWN) {
+                               if (reason != NULL) {
+                                       if (interface_type_denied == IFRTYPE_FUNCTIONAL_CELLULAR) {
+                                               *reason = NECP_CLIENT_RESULT_REASON_CELLULAR_DENIED;
+                                       } else if (interface_type_denied == IFRTYPE_FUNCTIONAL_WIFI_INFRA) {
+                                               *reason = NECP_CLIENT_RESULT_REASON_WIFI_DENIED;
+                                       }
+                               }
+                               necp_send_application_interface_denied_event(pid, application_uuid, interface_type_denied);
+                       }
+                       // If the route is denied, stop matching route rules
+                       break;
+               }
+       }
 
-               if (interface_type_denied != IFRTYPE_FUNCTIONAL_UNKNOWN) {
-                       necp_send_application_interface_denied_event(pid, application_uuid, interface_type_denied);
+       if (rt != NULL && rt->rt_ifp != NULL) {
+               const bool expensive_prohibited = ((client_flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE) &&
+                   IFNET_IS_EXPENSIVE(rt->rt_ifp));
+               const bool constrained_prohibited = ((client_flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED) &&
+                   IFNET_IS_CONSTRAINED(rt->rt_ifp));
+               if (reason != NULL) {
+                       if (expensive_prohibited) {
+                               *reason = NECP_CLIENT_RESULT_REASON_EXPENSIVE_PROHIBITED;
+                       } else if (constrained_prohibited) {
+                               *reason = NECP_CLIENT_RESULT_REASON_CONSTRAINED_PROHIBITED;
+                       }
+               }
+               if (expensive_prohibited || constrained_prohibited) {
+                       // If the client flags prohibited a property of the interface, treat it as a drop
+                       returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP;
+                       memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter));
                }
        }
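The block above turns client-requested prohibitions into drops whenever the chosen interface actually has the prohibited property. A small sketch of the flag-versus-property test; the flag values and fake_ifnet type are illustrative, not the NECP constants:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative client flags; the real NECP constants differ. */
#define FLAG_PROHIBIT_EXPENSIVE   0x1
#define FLAG_PROHIBIT_CONSTRAINED 0x2

struct fake_ifnet {
	bool expensive;
	bool constrained;
};

/* Drop when the client prohibited a property the interface has. */
static bool
interface_prohibited(unsigned client_flags, const struct fake_ifnet *ifp)
{
	return ((client_flags & FLAG_PROHIBIT_EXPENSIVE) && ifp->expensive) ||
	    ((client_flags & FLAG_PROHIBIT_CONSTRAINED) && ifp->constrained);
}

int
main(void)
{
	struct fake_ifnet cell = { .expensive = true, .constrained = false };
	printf("%d\n", interface_prohibited(FLAG_PROHIBIT_EXPENSIVE, &cell)); /* 1 */
	return 0;
}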
 
@@ -6912,7 +7489,58 @@ necp_application_find_policy_match_internal(proc_t proc,
 }
 
 static bool
-necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, proc_t proc)
+necp_is_route_local(union necp_sockaddr_union *remote_addr)
+{
+       bool no_remote_addr = FALSE;
+       u_int8_t remote_family = 0;
+       struct rtentry *rt = NULL;
+       bool is_local = FALSE;
+
+       if (remote_addr == NULL) {
+               return FALSE;
+       }
+
+       if (remote_addr->sa.sa_len == 0 ||
+           (remote_addr->sa.sa_family == AF_INET && remote_addr->sin.sin_addr.s_addr == 0) ||
+           (remote_addr->sa.sa_family == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&remote_addr->sin6.sin6_addr))) {
+               no_remote_addr = TRUE;
+               remote_family = remote_addr->sa.sa_family;
+       }
+
+       if (no_remote_addr) {
+               memset(remote_addr, 0, sizeof(union necp_sockaddr_union));
+               if (remote_family == AF_INET6) {
+                       // Reset address to ::
+                       remote_addr->sa.sa_family = AF_INET6;
+                       remote_addr->sa.sa_len = sizeof(struct sockaddr_in6);
+               } else {
+                       // Reset address to 0.0.0.0
+                       remote_addr->sa.sa_family = AF_INET;
+                       remote_addr->sa.sa_len = sizeof(struct sockaddr_in);
+               }
+       }
+
+       // Look up the route regardless of the scoped interface to check
+       // whether the remote address is in a local network.
+       rt = rtalloc1_scoped((struct sockaddr *)remote_addr, 0, 0, 0);
+
+       if (rt == NULL) {
+               goto done;
+       }
+       if (remote_addr->sa.sa_family == AF_INET && IS_INTF_CLAT46(rt->rt_ifp)) {
+               goto free_rt;
+       }
+       is_local = IS_NECP_DEST_IN_LOCAL_NETWORKS(rt);
+
+free_rt:
+       rtfree(rt);
+
+done:
+       return is_local;
+}
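necp_is_route_local() first normalizes an unspecified remote address to 0.0.0.0 or :: before the scoped route lookup. A userspace sketch of the unspecified-address test, assuming BSD-style sockaddrs (with sa_len, as on XNU); addr_is_unspecified is a hypothetical helper:

#include <netinet/in.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* True when the sockaddr carries no usable remote address, mirroring the
 * AF_INET 0.0.0.0 / AF_INET6 :: checks above. */
static bool
addr_is_unspecified(const struct sockaddr *sa)
{
	if (sa == NULL || sa->sa_len == 0) {
		return true;
	}
	if (sa->sa_family == AF_INET) {
		const struct sockaddr_in *sin =
		    (const struct sockaddr_in *)(const void *)sa;
		return sin->sin_addr.s_addr == INADDR_ANY;
	}
	if (sa->sa_family == AF_INET6) {
		const struct sockaddr_in6 *sin6 =
		    (const struct sockaddr_in6 *)(const void *)sa;
		return IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr);
	}
	return false;
}

int
main(void)
{
	struct sockaddr_in sin;
	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	printf("unspecified: %d\n",
	    addr_is_unspecified((struct sockaddr *)&sin)); /* 1 */
	return 0;
}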
+
+static bool
+necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, bool has_client, uint32_t client_flags, int is_platform_binary, proc_t proc, struct rtentry *rt)
 {
        if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) {
                if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) {
@@ -6968,6 +7596,12 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a
                }
        }
 
+       if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_HAS_CLIENT) {
+               if (!has_client) {
+                       return FALSE;
+               }
+       }
+
        if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ENTITLEMENT) {
                if (cred_result != 0) {
                        // Process is missing entitlement
@@ -6975,6 +7609,13 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a
                }
        }
 
+       if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) {
+               if (is_platform_binary == 0) {
+                       // Process is not platform binary
+                       return FALSE;
+               }
+       }
+
        if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT) {
                if (kernel_policy->cond_custom_entitlement_matched == necp_boolean_state_false) {
                        // Process is missing entitlement based on previous check
@@ -7103,6 +7744,21 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a
                }
        }
 
+       if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) {
+               bool is_local = FALSE;
+
+               if (rt != NULL) {
+                       is_local = IS_NECP_DEST_IN_LOCAL_NETWORKS(rt);
+               } else {
+                       is_local = necp_is_route_local(remote);
+               }
+
+               if (!is_local) {
+                       // Either no route to validate or no match for local networks
+                       return FALSE;
+               }
+       }
+
        if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) {
                if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) {
                        bool inRange = necp_is_addr_in_range((struct sockaddr *)local, (struct sockaddr *)&kernel_policy->cond_local_start, (struct sockaddr *)&kernel_policy->cond_local_end);
@@ -7155,6 +7811,46 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a
                }
        }
 
+       if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) {
+               if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) {
+                       if ((client_flags & kernel_policy->cond_client_flags) == kernel_policy->cond_client_flags) {
+                                               // Flags match and the condition is negated; fail.
+                               return FALSE;
+                       }
+               } else {
+                       if ((client_flags & kernel_policy->cond_client_flags) != kernel_policy->cond_client_flags) {
+                               // Flags do not match, fail.
+                               return FALSE;
+                       }
+               }
+       }
+
+       if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_EMPTY) {
+               bool isEmpty = necp_addr_is_empty((struct sockaddr *)local);
+               if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_LOCAL_EMPTY) {
+                       if (isEmpty) {
+                               return FALSE;
+                       }
+               } else {
+                       if (!isEmpty) {
+                               return FALSE;
+                       }
+               }
+       }
+
+       if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REMOTE_EMPTY) {
+               bool isEmpty = necp_addr_is_empty((struct sockaddr *)remote);
+               if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_REMOTE_EMPTY) {
+                       if (isEmpty) {
+                               return FALSE;
+                       }
+               } else {
+                       if (!isEmpty) {
+                               return FALSE;
+                       }
+               }
+       }
+
        return TRUE;
 }
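necp_socket_check_policy() follows one pattern throughout, including the new CLIENT_FLAGS, LOCAL_EMPTY, and REMOTE_EMPTY conditions: evaluate a predicate only when its bit is set in condition_mask, and invert the verdict when the same bit is also set in condition_negated_mask. A distilled sketch of that maskable, negatable check (fake_policy and the bit value are hypothetical):

#include <stdbool.h>
#include <stdio.h>

#define COND_LOCAL_EMPTY  0x1   /* illustrative bit, not the XNU value */

struct fake_policy {
	unsigned condition_mask;
	unsigned condition_negated_mask;
};

/* Apply one maskable, negatable condition: pass when the bit is unset,
 * otherwise require the (possibly negated) predicate to hold. */
static bool
check_condition(const struct fake_policy *p, unsigned bit, bool predicate)
{
	if (!(p->condition_mask & bit)) {
		return true;            /* condition not in play */
	}
	if (p->condition_negated_mask & bit) {
		return !predicate;      /* negated: a match means failure */
	}
	return predicate;
}

int
main(void)
{
	struct fake_policy p = { COND_LOCAL_EMPTY, COND_LOCAL_EMPTY };
	printf("%d\n", check_condition(&p, COND_LOCAL_EMPTY, true));  /* 0: negated */
	return 0;
}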
 
@@ -7165,7 +7861,7 @@ necp_socket_calc_flowhash_locked(struct necp_socket_info *info)
 }
 
 static void
-necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int32_t override_bound_interface, struct necp_socket_info *info)
+necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int32_t override_bound_interface, u_int32_t drop_order, struct necp_socket_info *info)
 {
        struct socket *so = NULL;
 
@@ -7173,6 +7869,8 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc
 
        so = inp->inp_socket;
 
+       info->drop_order = drop_order;
+
        if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PID) {
                info->pid = ((so->so_flags & SOF_DELEGATED) ? so->e_pid : so->last_pid);
        }
@@ -7185,6 +7883,30 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc
                info->traffic_class = so->so_traffic_class;
        }
 
+       if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_HAS_CLIENT) {
+               info->has_client = !uuid_is_null(inp->necp_client_uuid);
+       }
+
+       if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_CLIENT_FLAGS) {
+               info->client_flags = 0;
+               if (INP_NO_CONSTRAINED(inp)) {
+                       info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED;
+               }
+               if (INP_NO_EXPENSIVE(inp)) {
+                       info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE;
+               }
+               if (inp->inp_socket->so_flags1 & SOF1_CELLFALLBACK) {
+                       info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_FALLBACK_TRAFFIC;
+               }
+               if (inp->inp_socket->so_flags1 & SOF1_INBOUND) {
+                       info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_INBOUND;
+               }
+               if (inp->inp_socket->so_options & SO_ACCEPTCONN ||
+                   inp->inp_flags2 & INP2_EXTERNAL_PORT) {
+                       info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_LISTENER;
+               }
+       }
+
        if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PROTOCOL) {
                if (inp->inp_ip_p) {
                        info->protocol = inp->inp_ip_p;
@@ -7215,6 +7937,10 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc
                                necp_get_parent_cred_result(NULL, info);
                        }
                }
+
+               if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) {
+                       info->is_platform_binary = csproc_get_platform_binary(current_proc()) ? true : false;
+               }
        }
 
        if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && inp->inp_necp_attributes.inp_account != NULL) {
@@ -7237,45 +7963,51 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc
        }
 
        if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_ADDRESS_TYPE_CONDITIONS) {
-               if (inp->inp_vflag & INP_IPV4) {
-                       if (override_local_addr) {
-                               if (override_local_addr->sa_len <= sizeof(struct sockaddr_in)) {
-                                       memcpy(&info->local_addr, override_local_addr, override_local_addr->sa_len);
-                               }
-                       } else {
+               if (override_local_addr != NULL) {
+                       if (override_local_addr->sa_family == AF_INET6 && override_local_addr->sa_len <= sizeof(struct sockaddr_in6)) {
+                               memcpy(&info->local_addr, override_local_addr, override_local_addr->sa_len);
+                               if (IN6_IS_ADDR_V4MAPPED(&(info->local_addr.sin6.sin6_addr))) {
+                                       struct sockaddr_in sin;
+                                       in6_sin6_2_sin(&sin, &(info->local_addr.sin6));
+                                       memset(&info->local_addr, 0, sizeof(union necp_sockaddr_union));
+                                       memcpy(&info->local_addr, &sin, sin.sin_len);
+                               }
+                       } else if (override_local_addr->sa_family == AF_INET && override_local_addr->sa_len <= sizeof(struct sockaddr_in)) {
+                               memcpy(&info->local_addr, override_local_addr, override_local_addr->sa_len);
+                       }
+               } else {
+                       if (inp->inp_vflag & INP_IPV4) {
                                ((struct sockaddr_in *)&info->local_addr)->sin_family = AF_INET;
                                ((struct sockaddr_in *)&info->local_addr)->sin_len = sizeof(struct sockaddr_in);
                                ((struct sockaddr_in *)&info->local_addr)->sin_port = inp->inp_lport;
                                memcpy(&((struct sockaddr_in *)&info->local_addr)->sin_addr, &inp->inp_laddr, sizeof(struct in_addr));
-                       }
-
-                       if (override_remote_addr) {
-                               if (override_remote_addr->sa_len <= sizeof(struct sockaddr_in)) {
-                                       memcpy(&info->remote_addr, override_remote_addr, override_remote_addr->sa_len);
-                               }
-                       } else {
-                               ((struct sockaddr_in *)&info->remote_addr)->sin_family = AF_INET;
-                               ((struct sockaddr_in *)&info->remote_addr)->sin_len = sizeof(struct sockaddr_in);
-                               ((struct sockaddr_in *)&info->remote_addr)->sin_port = inp->inp_fport;
-                               memcpy(&((struct sockaddr_in *)&info->remote_addr)->sin_addr, &inp->inp_faddr, sizeof(struct in_addr));
-                       }
-               } else if (inp->inp_vflag & INP_IPV6) {
-                       if (override_local_addr) {
-                               if (override_local_addr->sa_len <= sizeof(struct sockaddr_in6)) {
-                                       memcpy(&info->local_addr, override_local_addr, override_local_addr->sa_len);
-                               }
-                       } else {
+                       } else if (inp->inp_vflag & INP_IPV6) {
                                ((struct sockaddr_in6 *)&info->local_addr)->sin6_family = AF_INET6;
                                ((struct sockaddr_in6 *)&info->local_addr)->sin6_len = sizeof(struct sockaddr_in6);
                                ((struct sockaddr_in6 *)&info->local_addr)->sin6_port = inp->inp_lport;
                                memcpy(&((struct sockaddr_in6 *)&info->local_addr)->sin6_addr, &inp->in6p_laddr, sizeof(struct in6_addr));
                        }
+               }
 
-                       if (override_remote_addr) {
-                               if (override_remote_addr->sa_len <= sizeof(struct sockaddr_in6)) {
-                                       memcpy(&info->remote_addr, override_remote_addr, override_remote_addr->sa_len);
+               if (override_remote_addr != NULL) {
+                       if (override_remote_addr->sa_family == AF_INET6 && override_remote_addr->sa_len <= sizeof(struct sockaddr_in6)) {
+                               memcpy(&info->remote_addr, override_remote_addr, override_remote_addr->sa_len);
+                               if (IN6_IS_ADDR_V4MAPPED(&(info->remote_addr.sin6.sin6_addr))) {
+                                       struct sockaddr_in sin;
+                                       in6_sin6_2_sin(&sin, &(info->remote_addr.sin6));
+                                       memset(&info->remote_addr, 0, sizeof(union necp_sockaddr_union));
+                                       memcpy(&info->remote_addr, &sin, sin.sin_len);
                                }
-                       } else {
+                       } else if (override_remote_addr->sa_family == AF_INET && override_remote_addr->sa_len <= sizeof(struct sockaddr_in)) {
+                               memcpy(&info->remote_addr, override_remote_addr, override_remote_addr->sa_len);
+                       }
+               } else {
+                       if (inp->inp_vflag & INP_IPV4) {
+                               ((struct sockaddr_in *)&info->remote_addr)->sin_family = AF_INET;
+                               ((struct sockaddr_in *)&info->remote_addr)->sin_len = sizeof(struct sockaddr_in);
+                               ((struct sockaddr_in *)&info->remote_addr)->sin_port = inp->inp_fport;
+                               memcpy(&((struct sockaddr_in *)&info->remote_addr)->sin_addr, &inp->inp_faddr, sizeof(struct in_addr));
+                       } else if (inp->inp_vflag & INP_IPV6) {
                                ((struct sockaddr_in6 *)&info->remote_addr)->sin6_family = AF_INET6;
                                ((struct sockaddr_in6 *)&info->remote_addr)->sin6_len = sizeof(struct sockaddr_in6);
                                ((struct sockaddr_in6 *)&info->remote_addr)->sin6_port = inp->inp_fport;
@@ -7287,45 +8019,75 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc
 
 static inline struct necp_kernel_socket_policy *
 necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info,
-    necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id,
+    necp_kernel_policy_filter *return_filter,
+    u_int32_t *return_route_rule_id_array, size_t *return_route_rule_id_array_count, size_t route_rule_id_array_count,
     necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service,
     u_int32_t *return_netagent_array, u_int32_t *return_netagent_use_flags_array, size_t netagent_array_count,
     struct necp_client_parameter_netagent_type *required_agent_types,
-    u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id)
+    u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id, struct rtentry *rt,
+    necp_kernel_policy_result *return_drop_dest_policy_result, necp_drop_all_bypass_check_result_t *return_drop_all_bypass)
 {
        struct necp_kernel_socket_policy *matched_policy = NULL;
        u_int32_t skip_order = 0;
        u_int32_t skip_session_order = 0;
-       u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES];
        size_t route_rule_id_count = 0;
        int i;
        size_t netagent_cursor = 0;
+       necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE;
+       if (return_drop_all_bypass != NULL) {
+               *return_drop_all_bypass = drop_all_bypass;
+       }
 
        // Pre-process domain for quick matching
        struct substring domain_substring = necp_trim_dots_and_stars(info->domain, info->domain ? strlen(info->domain) : 0);
        u_int8_t domain_dot_count = necp_count_dots(domain_substring.string, domain_substring.length);
 
-       if (return_filter) {
+       if (return_filter != NULL) {
                *return_filter = 0;
        }
 
-       if (return_route_rule_id) {
-               *return_route_rule_id = 0;
+       if (return_route_rule_id_array_count != NULL) {
+               *return_route_rule_id_array_count = 0;
        }
 
-       if (return_service_action) {
+       if (return_service_action != NULL) {
                *return_service_action = 0;
        }
 
-       if (return_service) {
+       if (return_service != NULL) {
                return_service->identifier = 0;
                return_service->data = 0;
        }
 
+       // Do not subject the layer-2 filter to NECP policies; return a PASS policy
+       if (necp_pass_interpose > 0 && info->client_flags & NECP_CLIENT_PARAMETER_FLAG_INTERPOSE) {
+               return &pass_policy;
+       }
+
+       *return_drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE;
+
        if (policy_search_array != NULL) {
                for (i = 0; policy_search_array[i] != NULL; i++) {
                        if (necp_drop_all_order != 0 && policy_search_array[i]->session_order >= necp_drop_all_order) {
                                // We've hit a drop all rule
+                               if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) {
+                                       drop_all_bypass = necp_check_drop_all_bypass_result(proc);
+                                       if (return_drop_all_bypass != NULL) {
+                                               *return_drop_all_bypass = drop_all_bypass;
+                                       }
+                               }
+                               if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) {
+                                       break;
+                               }
+                       }
+                       if (necp_drop_dest_policy.entry_count != 0 &&
+                           necp_address_matches_drop_dest_policy(&info->remote_addr, policy_search_array[i]->session_order)) {
+                               // We've hit a drop by destination address rule
+                               *return_drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_DROP;
+                               break;
+                       }
+                       if (info->drop_order != 0 && policy_search_array[i]->session_order >= info->drop_order) {
+                               // We've hit a drop order for this socket
                                break;
                        }
                        if (skip_session_order && policy_search_array[i]->session_order >= skip_session_order) {
@@ -7346,18 +8108,24 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy
                                // Skip this policy
                                continue;
                        }
-                       if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, required_agent_types, num_required_agent_types, proc)) {
+
+                       if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, required_agent_types, num_required_agent_types, info->has_client, info->client_flags, info->is_platform_binary, proc, rt)) {
                                if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER) {
-                                       if (return_filter && *return_filter == 0) {
-                                               *return_filter = policy_search_array[i]->result_parameter.filter_control_unit;
+                                       if (return_filter && *return_filter != NECP_FILTER_UNIT_NO_FILTER) {
+                                               necp_kernel_policy_filter control_unit = policy_search_array[i]->result_parameter.filter_control_unit;
+                                               if (control_unit == NECP_FILTER_UNIT_NO_FILTER) {
+                                                       *return_filter = control_unit;
+                                               } else {
+                                                       *return_filter |= control_unit;
+                                               }
                                                if (necp_debug > 1) {
                                                        NECPLOG(LOG_DEBUG, "Socket Policy: (Application %d Real Application %d BoundInterface %d Proto %d) Filter %d", info->application_id, info->real_application_id, info->bound_interface_index, info->protocol, policy_search_array[i]->result_parameter.filter_control_unit);
                                                }
                                        }
                                        continue;
                                } else if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_ROUTE_RULES) {
-                                       if (return_route_rule_id && route_rule_id_count < MAX_AGGREGATE_ROUTE_RULES) {
-                                               route_rule_id_array[route_rule_id_count++] = policy_search_array[i]->result_parameter.route_rule_id;
+                                       if (return_route_rule_id_array && route_rule_id_count < route_rule_id_array_count) {
+                                               return_route_rule_id_array[route_rule_id_count++] = policy_search_array[i]->result_parameter.route_rule_id;
                                                if (necp_debug > 1) {
                                                        NECPLOG(LOG_DEBUG, "Socket Policy: (Application %d Real Application %d BoundInterface %d Proto %d) Route Rule %d", info->application_id, info->real_application_id, info->bound_interface_index, info->protocol, policy_search_array[i]->result_parameter.route_rule_id);
                                                }
@@ -7408,6 +8176,12 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy
                                        continue;
                                }
 
+                               // Matched an allow-unentitled policy, which clears any drop order
+                               if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED) {
+                                       info->drop_order = 0;
+                                       continue;
+                               }
+
                                // Passed all tests, found a match
                                matched_policy = policy_search_array[i];
                                break;
@@ -7415,10 +8189,8 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy
                }
        }
 
-       if (route_rule_id_count == 1) {
-               *return_route_rule_id = route_rule_id_array[0];
-       } else if (route_rule_id_count > 1) {
-               *return_route_rule_id = necp_create_aggregate_route_rule(route_rule_id_array);
+       if (return_route_rule_id_array_count != NULL) {
+               *return_route_rule_id_array_count = route_rule_id_count;
        }
        return matched_policy;
 }
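The signature change above moves route rule aggregation out of the matcher: instead of returning a single (possibly aggregate) rule ID, the matcher now fills a caller-supplied array plus a count, and callers collapse multiple IDs into one aggregate rule afterwards. A sketch of that out-array contract; collect_ids and its parameters are illustrative, not the XNU API:

#include <stddef.h>
#include <stdio.h>

/* Fill up to `capacity` matched IDs into out[], reporting how many were
 * written; extra matches are silently dropped, as in the matcher above. */
static void
collect_ids(const unsigned *matches, size_t nmatches,
    unsigned *out, size_t capacity, size_t *out_count)
{
	size_t count = 0;
	for (size_t i = 0; i < nmatches && count < capacity; i++) {
		out[count++] = matches[i];
	}
	*out_count = count;
}

int
main(void)
{
	unsigned matches[] = { 11, 12, 13 };
	unsigned out[2];
	size_t count = 0;
	collect_ids(matches, 3, out, 2, &count);
	/* A single ID would be used directly; several would be aggregated. */
	printf("collected %zu rule ids\n", count);
	return 0;
}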
@@ -7495,11 +8267,12 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
 {
        struct socket *so = NULL;
        necp_kernel_policy_filter filter_control_unit = 0;
-       u_int32_t route_rule_id = 0;
        struct necp_kernel_socket_policy *matched_policy = NULL;
        necp_kernel_policy_id matched_policy_id = NECP_KERNEL_POLICY_ID_NONE;
        necp_kernel_policy_result service_action = 0;
        necp_kernel_policy_service service = { 0, 0 };
+       u_int32_t drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE;
+       necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE;
 
        u_int32_t netagent_ids[NECP_MAX_NETAGENTS];
        memset(&netagent_ids, 0, sizeof(netagent_ids));
@@ -7523,10 +8296,12 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
 
        so = inp->inp_socket;
 
+       u_int32_t drop_order = necp_process_drop_order(so->so_cred);
+
        // Don't lock. Possible race condition, but we don't want the performance hit.
        if (necp_kernel_socket_policies_count == 0 ||
            (!(inp->inp_flags2 & INP2_WANT_APP_POLICY) && necp_kernel_socket_policies_non_app_count == 0)) {
-               if (necp_drop_all_order > 0) {
+               if (necp_drop_all_order > 0 || drop_order > 0) {
                        inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
                        inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
                        inp->inp_policyresult.policy_gencount = 0;
@@ -7560,7 +8335,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
        // Lock
        lck_rw_lock_shared(&necp_kernel_policy_lock);
 
-       necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, override_bound_interface, &info);
+       necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, override_bound_interface, drop_order, &info);
        inp->inp_policyresult.app_id = info.application_id;
 
        // Check info
@@ -7578,7 +8353,10 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
 
        // Match socket to policy
        necp_kernel_policy_id skip_policy_id;
-       matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), &skip_policy_id);
+       u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES];
+       size_t route_rule_id_array_count = 0;
+       matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), &skip_policy_id, inp->inp_route.ro_rt, &drop_dest_policy_result, &drop_all_bypass);
+
        // If the socket matched a scoped service policy, mark as Drop if not registered.
        // This covers the cases in which a service is required (on demand) but hasn't started yet.
        if ((service_action == NECP_KERNEL_POLICY_RESULT_TRIGGER_SCOPED ||
@@ -7655,6 +8433,15 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
                        }
                }
        }
+
+       u_int32_t route_rule_id = 0;
+       if (route_rule_id_array_count == 1) {
+               route_rule_id = route_rule_id_array[0];
+       } else if (route_rule_id_array_count > 1) {
+               route_rule_id = necp_create_aggregate_route_rule(route_rule_id_array);
+       }
+
+       bool reset_tcp_mss = false;
        if (matched_policy) {
                matched_policy_id = matched_policy->id;
                inp->inp_policyresult.policy_id = matched_policy->id;
@@ -7677,40 +8464,54 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
                    matched_policy->result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL &&
                    info.protocol == IPPROTO_TCP) {
                        // Reset MSS on TCP socket if tunnel policy changes
-                       tcp_mtudisc(inp, 0);
+                       reset_tcp_mss = true;
                }
 
                if (necp_debug > 1) {
                        NECPLOG(LOG_DEBUG, "Socket Policy: %p (BoundInterface %d Proto %d) Policy %d Result %d Parameter %d", inp->inp_socket, info.bound_interface_index, info.protocol, matched_policy->id, matched_policy->result, matched_policy->result_parameter.tunnel_interface_index);
                }
-       } else if (necp_drop_all_order > 0) {
-               // Mark socket as a drop if set
-               inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
-               inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
-               inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount;
-               inp->inp_policyresult.flowhash = flowhash;
-               inp->inp_policyresult.results.filter_control_unit = 0;
-               inp->inp_policyresult.results.route_rule_id = 0;
-               inp->inp_policyresult.results.result = NECP_KERNEL_POLICY_RESULT_DROP;
        } else {
-               // Mark non-matching socket so we don't re-check it
-               inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
-               inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
-               inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount;
-               inp->inp_policyresult.flowhash = flowhash;
-               inp->inp_policyresult.results.filter_control_unit = filter_control_unit; // We may have matched a filter, so mark it!
-               inp->inp_policyresult.results.route_rule_id = route_rule_id; // We may have matched a route rule, so mark it!
-               inp->inp_policyresult.results.result = NECP_KERNEL_POLICY_RESULT_NONE;
+               bool drop_all = false;
+               if (necp_drop_all_order > 0 || info.drop_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP) {
+                       // Mark socket as a drop if any drop condition is set
+                       drop_all = true;
+                       if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) {
+                               drop_all_bypass = necp_check_drop_all_bypass_result(NULL);
+                       }
+               }
+               if (drop_all && drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) {
+                       inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                       inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                       inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount;
+                       inp->inp_policyresult.flowhash = flowhash;
+                       inp->inp_policyresult.results.filter_control_unit = 0;
+                       inp->inp_policyresult.results.route_rule_id = 0;
+                       inp->inp_policyresult.results.result = NECP_KERNEL_POLICY_RESULT_DROP;
+               } else {
+                       // Mark non-matching socket so we don't re-check it
+                       inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                       inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                       inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount;
+                       inp->inp_policyresult.flowhash = flowhash;
+                       inp->inp_policyresult.results.filter_control_unit = filter_control_unit; // We may have matched a filter, so mark it!
+                       inp->inp_policyresult.results.route_rule_id = route_rule_id; // We may have matched a route rule, so mark it!
+                       inp->inp_policyresult.results.result = NECP_KERNEL_POLICY_RESULT_NONE;
+               }
        }
 
        // Unlock
        lck_rw_done(&necp_kernel_policy_lock);
 
+       if (reset_tcp_mss) {
+               // Update MSS when not holding the policy lock to avoid recursive locking
+               tcp_mtudisc(inp, 0);
+       }
+
        return matched_policy_id;
 }
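The reset_tcp_mss change is a classic lock-ordering fix: record the decision while holding the policy lock, then call tcp_mtudisc() only after lck_rw_done() to avoid recursive locking. A userspace sketch of the defer-until-unlocked pattern using a pthread rwlock (names hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t state_lock = PTHREAD_RWLOCK_INITIALIZER;

static void
expensive_side_effect(void)
{
	/* Stands in for tcp_mtudisc(): must not run with state_lock held. */
	printf("resetting MSS\n");
}

int
main(void)
{
	bool needs_reset = false;

	pthread_rwlock_rdlock(&state_lock);
	/* ... evaluate policies; only remember that work is needed ... */
	needs_reset = true;
	pthread_rwlock_unlock(&state_lock);

	/* Perform the deferred work outside the critical section. */
	if (needs_reset) {
		expensive_side_effect();
	}
	return 0;
}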
 
 static bool
-necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote)
+necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct rtentry *rt)
 {
        if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) {
                if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) {
@@ -7767,6 +8568,21 @@ necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy,
                }
        }
 
+       if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_NETWORKS) {
+               bool is_local = FALSE;
+
+               if (rt != NULL) {
+                       is_local = IS_NECP_DEST_IN_LOCAL_NETWORKS(rt);
+               } else {
+                       is_local = necp_is_route_local(remote);
+               }
+
+               if (!is_local) {
+                       // Either no route to validate or no match for local networks
+                       return FALSE;
+               }
+       }
+
        if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) {
                if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) {
                        bool inRange = necp_is_addr_in_range((struct sockaddr *)local, (struct sockaddr *)&kernel_policy->cond_local_start, (struct sockaddr *)&kernel_policy->cond_local_end);
@@ -7823,17 +8639,43 @@ necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy,
 }
 
 static inline struct necp_kernel_ip_output_policy *
-necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr)
+necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, struct rtentry *rt, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_drop_dest_policy_result, necp_drop_all_bypass_check_result_t *return_drop_all_bypass)
 {
        u_int32_t skip_order = 0;
        u_int32_t skip_session_order = 0;
-       int i;
        struct necp_kernel_ip_output_policy *matched_policy = NULL;
        struct necp_kernel_ip_output_policy **policy_search_array = necp_kernel_ip_output_policies_map[NECP_IP_OUTPUT_MAP_ID_TO_BUCKET(socket_policy_id)];
+       u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES];
+       size_t route_rule_id_count = 0;
+       necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE;
+       if (return_drop_all_bypass != NULL) {
+               *return_drop_all_bypass = drop_all_bypass;
+       }
+
+       if (return_route_rule_id != NULL) {
+               *return_route_rule_id = 0;
+       }
+
+       *return_drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE;
+
        if (policy_search_array != NULL) {
-               for (i = 0; policy_search_array[i] != NULL; i++) {
+               for (int i = 0; policy_search_array[i] != NULL; i++) {
                        if (necp_drop_all_order != 0 && policy_search_array[i]->session_order >= necp_drop_all_order) {
                                // We've hit a drop all rule
+                               if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) {
+                                       drop_all_bypass = necp_check_drop_all_bypass_result(NULL);
+                                       if (return_drop_all_bypass != NULL) {
+                                               *return_drop_all_bypass = drop_all_bypass;
+                                       }
+                               }
+                               if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) {
+                                       break;
+                               }
+                       }
+                       if (necp_drop_dest_policy.entry_count > 0 &&
+                           necp_address_matches_drop_dest_policy(remote_addr, policy_search_array[i]->session_order)) {
+                               // We've hit a drop by destination address rule
+                               *return_drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_DROP;
                                break;
                        }
                        if (skip_session_order && policy_search_array[i]->session_order >= skip_session_order) {
@@ -7854,21 +8696,32 @@ necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id,
                                // Skip this policy
                                continue;
                        }
-                       if (necp_ip_output_check_policy(policy_search_array[i], socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, local_addr, remote_addr)) {
-                               // Passed all tests, found a match
-                               matched_policy = policy_search_array[i];
 
-                               if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SKIP) {
+                       if (necp_ip_output_check_policy(policy_search_array[i], socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, local_addr, remote_addr, rt)) {
+                               if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_ROUTE_RULES) {
+                                       if (return_route_rule_id != NULL && route_rule_id_count < MAX_AGGREGATE_ROUTE_RULES) {
+                                               route_rule_id_array[route_rule_id_count++] = policy_search_array[i]->result_parameter.route_rule_id;
+                                       }
+                                       continue;
+                               } else if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SKIP) {
                                        skip_order = policy_search_array[i]->result_parameter.skip_policy_order;
                                        skip_session_order = policy_search_array[i]->session_order + 1;
                                        continue;
                                }
 
+                               // Passed all tests, found a match
+                               matched_policy = policy_search_array[i];
                                break;
                        }
                }
        }
 
+       if (route_rule_id_count == 1) {
+               *return_route_rule_id = route_rule_id_array[0];
+       } else if (route_rule_id_count > 1) {
+               *return_route_rule_id = necp_create_aggregate_route_rule(route_rule_id_array);
+       }
+
        return matched_policy;
 }
 
@@ -7888,7 +8741,8 @@ necp_output_bypass(struct mbuf *packet)
 }
 
 necp_kernel_policy_id
-necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_args *ipoa, necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter)
+necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_args *ipoa, struct rtentry *rt,
+    necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter)
 {
        struct ip *ip = NULL;
        int hlen = sizeof(struct ip);
@@ -7901,6 +8755,8 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a
        u_int32_t last_interface_index = 0;
        union necp_sockaddr_union local_addr;
        union necp_sockaddr_union remote_addr;
+       u_int32_t drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE;
+       necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE;
 
        if (result) {
                *result = 0;
@@ -7920,7 +8776,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a
        // Exit early for an empty list
        // Don't lock. Possible race condition, but we don't want the performance hit.
        if (necp_kernel_ip_output_policies_count == 0 ||
-           ((socket_policy_id == NECP_KERNEL_POLICY_ID_NONE) && necp_kernel_ip_output_policies_non_id_count == 0)) {
+           (socket_policy_id == NECP_KERNEL_POLICY_ID_NONE && necp_kernel_ip_output_policies_non_id_count == 0 && necp_drop_dest_policy.entry_count == 0)) {
                if (necp_drop_all_order > 0) {
                        matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
                        if (result) {
@@ -7998,7 +8854,8 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a
 
        // Match packet to policy
        lck_rw_lock_shared(&necp_kernel_policy_lock);
-       matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr);
+       u_int32_t route_rule_id = 0;
+       matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr, rt, &route_rule_id, &drop_dest_policy_result, &drop_all_bypass);
        if (matched_policy) {
                matched_policy_id = matched_policy->id;
                if (result) {
@@ -8009,13 +8866,36 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a
                        memcpy(result_parameter, &matched_policy->result_parameter, sizeof(matched_policy->result_parameter));
                }
 
+               if (route_rule_id != 0 &&
+                   packet->m_pkthdr.necp_mtag.necp_route_rule_id == 0) {
+                       packet->m_pkthdr.necp_mtag.necp_route_rule_id = route_rule_id;
+               }
+
                if (necp_debug > 1) {
-                       NECPLOG(LOG_DEBUG, "IP Output: (ID %d BoundInterface %d LastInterface %d Proto %d) Policy %d Result %d Parameter %d", socket_policy_id, bound_interface_index, last_interface_index, protocol, matched_policy->id, matched_policy->result, matched_policy->result_parameter.tunnel_interface_index);
+                       NECPLOG(LOG_DEBUG, "IP Output: (ID %d BoundInterface %d LastInterface %d Proto %d) Policy %d Result %d Parameter %d Route Rule %u", socket_policy_id, bound_interface_index, last_interface_index, protocol, matched_policy->id, matched_policy->result, matched_policy->result_parameter.tunnel_interface_index, route_rule_id);
                }
-       } else if (necp_drop_all_order > 0) {
-               matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
-               if (result) {
-                       *result = NECP_KERNEL_POLICY_RESULT_DROP;
+       } else {
+               bool drop_all = false;
+               /*
+                * Apply drop-all only to packets that have never matched a primary policy (i.e.
+                * the packet's saved policy ID is none or falls within the socket policy ID range).
+                */
+               if (socket_policy_id < NECP_KERNEL_POLICY_ID_FIRST_VALID_IP &&
+                   (necp_drop_all_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP)) {
+                       drop_all = true;
+                       if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) {
+                               drop_all_bypass = necp_check_drop_all_bypass_result(NULL);
+                       }
+               }
+               if (drop_all && drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) {
+                       matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                       if (result) {
+                               *result = NECP_KERNEL_POLICY_RESULT_DROP;
+                       }
+               } else if (route_rule_id != 0 &&
+                   packet->m_pkthdr.necp_mtag.necp_route_rule_id == 0) {
+                       // If we matched a route rule, mark it
+                       packet->m_pkthdr.necp_mtag.necp_route_rule_id = route_rule_id;
                }
        }
 
@@ -8025,7 +8905,8 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a
 }
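
The miss path added to both the IPv4 and IPv6 output functions follows the same shape. A hedged sketch of the decision, with a hypothetical helper name; constants and globals are the ones referenced in the diff:

    // Sketch of the drop-all decision when no IP-output policy matched.
    // Drop-all only applies to packets whose saved policy ID is still in
    // the socket range (they never matched an IP-level policy), and the
    // per-process bypass is resolved lazily, at most once.
    static bool
    necp_output_should_drop_all(necp_kernel_policy_id socket_policy_id,
        u_int32_t drop_dest_policy_result,
        necp_drop_all_bypass_check_result_t *drop_all_bypass)
    {
        if (socket_policy_id >= NECP_KERNEL_POLICY_ID_FIRST_VALID_IP) {
            return false; // already matched an IP-level policy earlier
        }
        if (necp_drop_all_order == 0 &&
            drop_dest_policy_result != NECP_KERNEL_POLICY_RESULT_DROP) {
            return false; // no drop-all order and no drop-dest hit
        }
        if (*drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) {
            *drop_all_bypass = necp_check_drop_all_bypass_result(NULL);
        }
        return *drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE;
    }
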
 
 necp_kernel_policy_id
-necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out_args *ip6oa, necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter)
+necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out_args *ip6oa, struct rtentry *rt,
+    necp_kernel_policy_result *result, necp_kernel_policy_result_parameter *result_parameter)
 {
        struct ip6_hdr *ip6 = NULL;
        int next = -1;
@@ -8039,6 +8920,8 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out
        u_int32_t last_interface_index = 0;
        union necp_sockaddr_union local_addr;
        union necp_sockaddr_union remote_addr;
+       u_int32_t drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE;
+       necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE;
 
        if (result) {
                *result = 0;
@@ -8058,7 +8941,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out
        // Exit early for an empty list
        // Don't lock. Possible race condition, but we don't want the performance hit.
        if (necp_kernel_ip_output_policies_count == 0 ||
-           ((socket_policy_id == NECP_KERNEL_POLICY_ID_NONE) && necp_kernel_ip_output_policies_non_id_count == 0)) {
+           (socket_policy_id == NECP_KERNEL_POLICY_ID_NONE && necp_kernel_ip_output_policies_non_id_count == 0 && necp_drop_dest_policy.entry_count == 0)) {
                if (necp_drop_all_order > 0) {
                        matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
                        if (result) {
@@ -8133,7 +9016,8 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out
 
        // Match packet to policy
        lck_rw_lock_shared(&necp_kernel_policy_lock);
-       matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr);
+       u_int32_t route_rule_id = 0;
+       matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr, rt, &route_rule_id, &drop_dest_policy_result, &drop_all_bypass);
        if (matched_policy) {
                matched_policy_id = matched_policy->id;
                if (result) {
@@ -8144,13 +9028,36 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out
                        memcpy(result_parameter, &matched_policy->result_parameter, sizeof(matched_policy->result_parameter));
                }
 
+               if (route_rule_id != 0 &&
+                   packet->m_pkthdr.necp_mtag.necp_route_rule_id == 0) {
+                       packet->m_pkthdr.necp_mtag.necp_route_rule_id = route_rule_id;
+               }
+
                if (necp_debug > 1) {
-                       NECPLOG(LOG_DEBUG, "IP6 Output: (ID %d BoundInterface %d LastInterface %d Proto %d) Policy %d Result %d Parameter %d", socket_policy_id, bound_interface_index, last_interface_index, protocol, matched_policy->id, matched_policy->result, matched_policy->result_parameter.tunnel_interface_index);
+                       NECPLOG(LOG_DEBUG, "IP6 Output: (ID %d BoundInterface %d LastInterface %d Proto %d) Policy %d Result %d Parameter %d Route Rule %u", socket_policy_id, bound_interface_index, last_interface_index, protocol, matched_policy->id, matched_policy->result, matched_policy->result_parameter.tunnel_interface_index, route_rule_id);
                }
-       } else if (necp_drop_all_order > 0) {
-               matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
-               if (result) {
-                       *result = NECP_KERNEL_POLICY_RESULT_DROP;
+       } else {
+               bool drop_all = false;
+               /*
+                * Apply drop-all only to packets that have never matched a primary policy (i.e.
+                * the packet's saved policy ID is none or falls within the socket policy ID range).
+                */
+               if (socket_policy_id < NECP_KERNEL_POLICY_ID_FIRST_VALID_IP &&
+                   (necp_drop_all_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP)) {
+                       drop_all = true;
+                       if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) {
+                               drop_all_bypass = necp_check_drop_all_bypass_result(NULL);
+                       }
+               }
+               if (drop_all && drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) {
+                       matched_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                       if (result) {
+                               *result = NECP_KERNEL_POLICY_RESULT_DROP;
+                       }
+               } else if (route_rule_id != 0 &&
+                   packet->m_pkthdr.necp_mtag.necp_route_rule_id == 0) {
+                       // If we matched a route rule, mark it
+                       packet->m_pkthdr.necp_mtag.necp_route_rule_id = route_rule_id;
                }
        }
 
@@ -8360,6 +9267,54 @@ necp_buffer_compare_with_bit_prefix(u_int8_t *p1, u_int8_t *p2, u_int32_t bits)
        return TRUE;
 }
 
+static bool
+necp_addr_is_empty(struct sockaddr *addr)
+{
+       if (addr == NULL) {
+               return TRUE;
+       }
+
+       if (addr->sa_len == 0) {
+               return TRUE;
+       }
+
+       switch (addr->sa_family) {
+       case AF_INET: {
+               static struct sockaddr_in ipv4_empty_address = {
+                       .sin_len = sizeof(struct sockaddr_in),
+                       .sin_family = AF_INET,
+                       .sin_port = 0,
+                       .sin_addr = { .s_addr = 0 }, // 0.0.0.0
+                       .sin_zero = {0},
+               };
+               if (necp_addr_compare(addr, (struct sockaddr *)&ipv4_empty_address, 0) == 0) {
+                       return TRUE;
+               } else {
+                       return FALSE;
+               }
+       }
+       case AF_INET6: {
+               static struct sockaddr_in6 ipv6_empty_address = {
+                       .sin6_len = sizeof(struct sockaddr_in6),
+                       .sin6_family = AF_INET6,
+                       .sin6_port = 0,
+                       .sin6_flowinfo = 0,
+                       .sin6_addr = IN6ADDR_ANY_INIT, // ::
+                       .sin6_scope_id = 0,
+               };
+               if (necp_addr_compare(addr, (struct sockaddr *)&ipv6_empty_address, 0) == 0) {
+                       return TRUE;
+               } else {
+                       return FALSE;
+               }
+       }
+       default:
+               return FALSE;
+       }
+
+       return FALSE;
+}
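+
+// For illustration, the inputs that necp_addr_is_empty() classifies as
+// empty, in a hypothetical caller using the same kernel types:
+//
+//     struct sockaddr_in any4 = {
+//         .sin_len = sizeof(struct sockaddr_in),
+//         .sin_family = AF_INET,
+//         .sin_addr = { .s_addr = INADDR_ANY },   // 0.0.0.0 -> TRUE
+//     };
+//     struct sockaddr_in6 any6 = {
+//         .sin6_len = sizeof(struct sockaddr_in6),
+//         .sin6_family = AF_INET6,
+//         .sin6_addr = IN6ADDR_ANY_INIT,          // :: -> TRUE
+//     };
+//
+// NULL pointers and sa_len == 0 also return TRUE; any address of another
+// family, or any non-empty address, returns FALSE.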
+
 static bool
 necp_update_qos_marking(struct ifnet *ifp, u_int32_t route_rule_id)
 {
@@ -8399,12 +9354,13 @@ necp_update_qos_marking(struct ifnet *ifp, u_int32_t route_rule_id)
        if ((route_rule->cellular_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_CELLULAR(ifp)) ||
            (route_rule->wifi_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_WIFI(ifp)) ||
            (route_rule->wired_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_WIRED(ifp)) ||
-           (route_rule->expensive_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_EXPENSIVE(ifp))) {
+           (route_rule->expensive_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_EXPENSIVE(ifp)) ||
+           (route_rule->constrained_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_CONSTRAINED(ifp))) {
                qos_marking = TRUE;
                if (necp_debug > 2) {
-                       NECPLOG(LOG_DEBUG, "QoS Marking: C:%d WF:%d W:%d E:%d for Rule %d Allowed %d",
+                       NECPLOG(LOG_DEBUG, "QoS Marking: C:%d WF:%d W:%d E:%d Cn:%d for Rule %d Allowed %d",
                            route_rule->cellular_action, route_rule->wifi_action, route_rule->wired_action,
-                           route_rule->expensive_action, route_rule_id, qos_marking);
+                           route_rule->expensive_action, route_rule->constrained_action, route_rule_id, qos_marking);
                }
                goto done;
        }
@@ -8630,6 +9586,22 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t
                }
        }
 
+       if (IFNET_IS_CONSTRAINED(ifp)) {
+               if (route_rule->constrained_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) {
+                       if (necp_route_is_lqm_abort(ifp, delegated_ifp)) {
+                               // Mark aggregate action as deny
+                               type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE;
+                       }
+               } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->constrained_action)) {
+                       if (type_aggregate_action == NECP_ROUTE_RULE_NONE ||
+                           (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE &&
+                           route_rule->constrained_action == NECP_ROUTE_RULE_DENY_INTERFACE)) {
+                               // Deny wins if there is a conflict
+                               type_aggregate_action = route_rule->constrained_action;
+                       }
+               }
+       }
+
        if (type_aggregate_action != NECP_ROUTE_RULE_NONE) {
                if (necp_debug > 1) {
                        NECPLOG(LOG_DEBUG, "Route Allowed: C:%d WF:%d W:%d E:%d for Rule %d Allowed %d", route_rule->cellular_action, route_rule->wifi_action, route_rule->wired_action, route_rule->expensive_action, route_rule_id, ((type_aggregate_action == NECP_ROUTE_RULE_DENY_INTERFACE) ? FALSE : TRUE));
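
The constrained clause reuses the deny-wins merge that the cellular, Wi-Fi, wired, and expensive clauses already apply. In isolation, as a hypothetical helper with `action` standing in for route_rule->constrained_action:

    // Sketch of the deny-wins merge for per-interface-type actions:
    // an ALLOW may be upgraded to DENY, but a DENY is never downgraded.
    static void
    necp_merge_interface_action(u_int8_t *aggregate, u_int8_t action)
    {
        if (*aggregate == NECP_ROUTE_RULE_NONE ||
            (*aggregate == NECP_ROUTE_RULE_ALLOW_INTERFACE &&
            action == NECP_ROUTE_RULE_DENY_INTERFACE)) {
            *aggregate = action; // deny wins on conflict
        }
    }
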
@@ -8726,7 +9698,8 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
        u_int32_t route_rule_id = 0;
        struct rtentry *route = NULL;
        u_int32_t interface_type_denied = IFRTYPE_FUNCTIONAL_UNKNOWN;
-
+       necp_kernel_policy_result drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE;
+       necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE;
        u_int32_t netagent_ids[NECP_MAX_NETAGENTS];
        memset(&netagent_ids, 0, sizeof(netagent_ids));
 
@@ -8746,10 +9719,14 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
 
        route = inp->inp_route.ro_rt;
 
+       struct socket *so = inp->inp_socket;
+
+       u_int32_t drop_order = necp_process_drop_order(so->so_cred);
+
        // Don't lock. Possible race condition, but we don't want the performance hit.
        if (necp_kernel_socket_policies_count == 0 ||
            (!(inp->inp_flags2 & INP2_WANT_APP_POLICY) && necp_kernel_socket_policies_non_app_count == 0)) {
-               if (necp_drop_all_order > 0) {
+               if (necp_drop_all_order > 0 || drop_order > 0) {
                        if (necp_socket_bypass(override_local_addr, override_remote_addr, inp)) {
                                allowed_to_receive = TRUE;
                        } else {
@@ -8806,7 +9783,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
 
        // Actually calculate policy result
        lck_rw_lock_shared(&necp_kernel_policy_lock);
-       necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, 0, &info);
+       necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, 0, drop_order, &info);
 
        flowhash = necp_socket_calc_flowhash_locked(&info);
        if (inp->inp_policyresult.policy_id != NECP_KERNEL_POLICY_ID_NONE &&
@@ -8834,7 +9811,16 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
                goto done;
        }
 
-       struct necp_kernel_socket_policy *matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, NULL, &route_rule_id, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), return_skip_policy_id);
+       u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES];
+       size_t route_rule_id_array_count = 0;
+       struct necp_kernel_socket_policy *matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, NULL, route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), return_skip_policy_id, inp->inp_route.ro_rt, &drop_dest_policy_result, &drop_all_bypass);
+
+       if (route_rule_id_array_count == 1) {
+               route_rule_id = route_rule_id_array[0];
+       } else if (route_rule_id_array_count > 1) {
+               route_rule_id = necp_create_aggregate_route_rule(route_rule_id_array);
+       }
+
        if (matched_policy != NULL) {
                if (matched_policy->result == NECP_KERNEL_POLICY_RESULT_DROP ||
                    matched_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT ||
@@ -8861,14 +9847,23 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
                        NECPLOG(LOG_DEBUG, "Socket Send/Recv Policy: Policy %d Allowed %d", return_policy_id ? *return_policy_id : 0, allowed_to_receive);
                }
                goto done;
-       } else if (necp_drop_all_order > 0) {
-               allowed_to_receive = FALSE;
        } else {
-               if (return_policy_id) {
-                       *return_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+               bool drop_all = false;
+               if (necp_drop_all_order > 0 || info.drop_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP) {
+                       drop_all = true;
+                       if (drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE) {
+                               drop_all_bypass = necp_check_drop_all_bypass_result(NULL);
+                       }
                }
-               if (return_route_rule_id) {
-                       *return_route_rule_id = route_rule_id;
+               if (drop_all && drop_all_bypass == NECP_DROP_ALL_BYPASS_CHECK_RESULT_FALSE) {
+                       allowed_to_receive = FALSE;
+               } else {
+                       if (return_policy_id) {
+                               *return_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                       }
+                       if (return_route_rule_id) {
+                               *return_route_rule_id = route_rule_id;
+                       }
                }
        }
 
@@ -8915,10 +9910,13 @@ necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port,
 }
 
 bool
-necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id,
+necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t interface, necp_kernel_policy_id *return_policy_id,
+    u_int32_t *return_route_rule_id,
     necp_kernel_policy_id *return_skip_policy_id)
 {
-       return necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, NULL, return_policy_id, return_route_rule_id, return_skip_policy_id);
+       return necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, interface,
+                  return_policy_id, return_route_rule_id,
+                  return_skip_policy_id);
 }
 
 int
@@ -8946,8 +9944,18 @@ necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel
        }
        packet->m_pkthdr.necp_mtag.necp_app_id = inp->inp_policyresult.app_id;
 
-       if (skip_policy_id != NECP_KERNEL_POLICY_ID_NONE) {
+       if (skip_policy_id != NECP_KERNEL_POLICY_ID_NONE &&
+           skip_policy_id != NECP_KERNEL_POLICY_ID_NO_MATCH) {
+               // Only mark the skip policy if it is a valid policy ID
                packet->m_pkthdr.necp_mtag.necp_skip_policy_id = skip_policy_id;
+       } else if (inp->inp_policyresult.results.filter_control_unit == NECP_FILTER_UNIT_NO_FILTER) {
+               // Overload the meaning of "NECP_KERNEL_POLICY_ID_NO_MATCH"
+               // to indicate that NECP_FILTER_UNIT_NO_FILTER was set
+               // See necp_get_skip_policy_id_from_packet() and
+               // necp_packet_should_skip_filters().
+               packet->m_pkthdr.necp_mtag.necp_skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+       } else {
+               packet->m_pkthdr.necp_mtag.necp_skip_policy_id = NECP_KERNEL_POLICY_ID_NONE;
        }
 
        return 0;
@@ -9018,9 +10026,25 @@ necp_get_skip_policy_id_from_packet(struct mbuf *packet)
                return NECP_KERNEL_POLICY_ID_NONE;
        }
 
+       // Check for overloaded value. See necp_mark_packet_from_socket().
+       if (packet->m_pkthdr.necp_mtag.necp_skip_policy_id == NECP_KERNEL_POLICY_ID_NO_MATCH) {
+               return NECP_KERNEL_POLICY_ID_NONE;
+       }
+
        return packet->m_pkthdr.necp_mtag.necp_skip_policy_id;
 }
 
+bool
+necp_packet_should_skip_filters(struct mbuf *packet)
+{
+       if (packet == NULL || !(packet->m_flags & M_PKTHDR)) {
+               return false;
+       }
+
+       // Check for overloaded value. See necp_mark_packet_from_socket().
+       return packet->m_pkthdr.necp_mtag.necp_skip_policy_id == NECP_KERNEL_POLICY_ID_NO_MATCH;
+}
+
 u_int32_t
 necp_get_last_interface_index_from_packet(struct mbuf *packet)
 {
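
Taken together with necp_mark_packet_from_socket() above, the mbuf tag now encodes three states. A short caller-side sketch, where `packet` is an assumed mbuf with M_PKTHDR set:

    // necp_skip_policy_id after necp_mark_packet_from_socket():
    //   valid policy ID -> returned by necp_get_skip_policy_id_from_packet()
    //   NO_MATCH        -> "skip all filters" sentinel: the getter returns
    //                      NONE, but necp_packet_should_skip_filters() is true
    //   NONE            -> no skip policy and no filter bypass
    bool skip_filters = necp_packet_should_skip_filters(packet);
    necp_kernel_policy_id skip_id = necp_get_skip_policy_id_from_packet(packet);
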
@@ -9332,7 +10356,17 @@ static bool
 necp_is_intcoproc(struct inpcb *inp, struct mbuf *packet)
 {
        if (inp != NULL) {
-               return sflt_permission_check(inp) ? true : false;
+               if (!(inp->inp_vflag & INP_IPV6)) {
+                       return false;
+               }
+               if (INP_INTCOPROC_ALLOWED(inp)) {
+                       return true;
+               }
+               if ((inp->inp_flags & INP_BOUND_IF) &&
+                   IFNET_IS_INTCOPROC(inp->inp_boundifp)) {
+                       return true;
+               }
+               return false;
        }
        if (packet != NULL) {
                struct ip6_hdr *ip6 = mtod(packet, struct ip6_hdr *);
@@ -9346,3 +10380,176 @@ necp_is_intcoproc(struct inpcb *inp, struct mbuf *packet)
 
        return false;
 }
+
+static bool
+necp_address_matches_drop_dest_policy(union necp_sockaddr_union *sau, u_int32_t session_order)
+{
+       char dest_str[MAX_IPv6_STR_LEN];
+
+       if (necp_drop_dest_debug > 0) {
+               if (sau->sa.sa_family == AF_INET) {
+                       (void) inet_ntop(AF_INET, &sau->sin.sin_addr, dest_str, sizeof(dest_str));
+               } else if (sau->sa.sa_family == AF_INET6) {
+                       (void) inet_ntop(AF_INET6, &sau->sin6.sin6_addr, dest_str, sizeof(dest_str));
+               } else {
+                       dest_str[0] = 0;
+               }
+       }
+       for (u_int32_t i = 0; i < necp_drop_dest_policy.entry_count; i++) {
+               struct necp_drop_dest_entry *necp_drop_dest_entry = &necp_drop_dest_policy.entries[i];
+               struct necp_policy_condition_addr *npca = &necp_drop_dest_entry->cond_addr;
+
+               if (session_order >= necp_drop_dest_entry->order && necp_is_addr_in_subnet(&sau->sa, &npca->address.sa, npca->prefix)) {
+                       if (necp_drop_dest_debug > 0) {
+                               char subnet_str[MAX_IPv6_STR_LEN];
+                               struct proc *p = current_proc();
+                               pid_t pid = proc_pid(p);
+
+                               if (sau->sa.sa_family == AF_INET) {
+                                       (void) inet_ntop(AF_INET, &npca->address.sin, subnet_str, sizeof(subnet_str));
+                                       os_log(OS_LOG_DEFAULT, "%s (process %s:%u) %s matches %s/%u", __func__, proc_best_name(p), pid, dest_str, subnet_str, npca->prefix);
+                               } else if (sau->sa.sa_family == AF_INET6) {
+                                       (void) inet_ntop(AF_INET6, &npca->address.sin6, subnet_str, sizeof(subnet_str));
+                                       os_log(OS_LOG_DEFAULT, "%s (process %s:%u) %s matches %s/%u", __func__, proc_best_name(p), pid, dest_str, subnet_str, npca->prefix);
+                               }
+                       }
+                       return true;
+               }
+       }
+       if (necp_drop_dest_debug > 1) {
+               struct proc *p = current_proc();
+               pid_t pid = proc_pid(p);
+
+               os_log(OS_LOG_DEFAULT, "%s (process %s:%u) %s no match", __func__, proc_best_name(p), pid, dest_str);
+       }
+       return false;
+}
+
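+// On the matching side, the result of this check feeds the
+// drop_dest_policy_result locals seen earlier. Roughly, in a hedged
+// caller sketch:
+//
+//     if (necp_address_matches_drop_dest_policy(&remote_addr, session_order)) {
+//         drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_DROP;
+//     }
+//
+// which the miss paths above then treat like a drop-all hit.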
+static int
+sysctl_handle_necp_drop_dest_level SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       int changed = 0;
+       int error = 0;
+       struct necp_drop_dest_policy tmp_drop_dest_policy;
+       struct proc *p = current_proc();
+       pid_t pid = proc_pid(p);
+
+       if (req->newptr != USER_ADDR_NULL && proc_suser(current_proc()) != 0 &&
+           priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NECP_POLICIES, 0) != 0) {
+               NECPLOG(LOG_ERR, "%s (process %s:%u) not permitted", __func__, proc_best_name(p), pid);
+               return EPERM;
+       }
+       if (req->newptr != USER_ADDR_NULL && req->newlen != sizeof(struct necp_drop_dest_policy)) {
+               NECPLOG(LOG_ERR, "%s (process %s:%u) bad newlen %lu", __func__, proc_best_name(p), pid, req->newlen);
+               return EINVAL;
+       }
+
+       memcpy(&tmp_drop_dest_policy, &necp_drop_dest_policy, sizeof(struct necp_drop_dest_policy));
+       error = sysctl_io_opaque(req, &tmp_drop_dest_policy, sizeof(struct necp_drop_dest_policy), &changed);
+       if (error != 0) {
+               NECPLOG(LOG_ERR, "%s (process %s:%u) sysctl_io_opaque() error %d", __func__, proc_best_name(p), pid, error);
+               return error;
+       }
+       if (changed == 0 || req->newptr == USER_ADDR_NULL) {
+               return error;
+       }
+
+       //
+       // Validate the passed parameters
+       //
+       if (tmp_drop_dest_policy.entry_count >= MAX_NECP_DROP_DEST_LEVEL_ADDRS) {
+               NECPLOG(LOG_ERR, "%s (process %s:%u) bad entry_count %u", __func__, proc_best_name(p), pid, tmp_drop_dest_policy.entry_count);
+               return EINVAL;
+       }
+       for (u_int32_t i = 0; i < tmp_drop_dest_policy.entry_count; i++) {
+               struct necp_drop_dest_entry *tmp_drop_dest_entry = &tmp_drop_dest_policy.entries[i];
+               struct necp_policy_condition_addr *npca = &tmp_drop_dest_entry->cond_addr;
+
+               switch (tmp_drop_dest_entry->level) {
+               case NECP_SESSION_PRIORITY_UNKNOWN:
+                       if (tmp_drop_dest_policy.entry_count != 0) {
+                               NECPLOG(LOG_ERR, "%s (process %s:%u) NECP_SESSION_PRIORITY_UNKNOWN bad entry_count %u", __func__, proc_best_name(p), pid, tmp_drop_dest_policy.entry_count);
+                               return EINVAL;
+                       }
+                       break;
+               case NECP_SESSION_PRIORITY_CONTROL:
+               case NECP_SESSION_PRIORITY_PRIVILEGED_TUNNEL:
+               case NECP_SESSION_PRIORITY_HIGH:
+               case NECP_SESSION_PRIORITY_DEFAULT:
+               case NECP_SESSION_PRIORITY_LOW:
+                       if (tmp_drop_dest_policy.entry_count == 0) {
+                               NECPLOG(LOG_ERR, "%s (process %s:%u) priority %u entry_count 0", __func__, proc_best_name(p), pid, tmp_drop_dest_entry->level);
+                               return EINVAL;
+                       }
+                       break;
+               default: {
+                       NECPLOG(LOG_ERR, "%s (process %s:%u) bad level %u", __func__, proc_best_name(p), pid, tmp_drop_dest_entry->level);
+                       return EINVAL;
+               }
+               }
+
+               switch (npca->address.sa.sa_family) {
+               case AF_INET: {
+                       if (npca->prefix > 32) {
+                               NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET bad prefix %u", __func__, proc_best_name(p), pid, npca->prefix);
+                               return EINVAL;
+                       }
+                       if (npca->address.sin.sin_len != sizeof(struct sockaddr_in)) {
+                               NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET bad sin_len %u", __func__, proc_best_name(p), pid, npca->address.sin.sin_len);
+                               return EINVAL;
+                       }
+                       if (npca->address.sin.sin_port != 0) {
+                               NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET bad sin_port %u, not zero", __func__, proc_best_name(p), pid, npca->address.sin.sin_port);
+                               return EINVAL;
+                       }
+                       break;
+               }
+               case AF_INET6: {
+                       if (npca->prefix > 128) {
+                               NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET6 bad prefix %u", __func__, proc_best_name(p), pid, npca->prefix);
+                               return EINVAL;
+                       }
+                       if (npca->address.sin6.sin6_len != sizeof(struct sockaddr_in6)) {
+                               NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET6 bad sin6_len %u", __func__, proc_best_name(p), pid, npca->address.sin6.sin6_len);
+                               return EINVAL;
+                       }
+                       if (npca->address.sin6.sin6_port != 0) {
+                               NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET6 bad sin6_port %u, not zero", __func__, proc_best_name(p), pid, npca->address.sin6.sin6_port);
+                               return EINVAL;
+                       }
+                       if (npca->address.sin6.sin6_flowinfo != 0) {
+                               NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET6 bad sin6_flowinfo %u, not zero", __func__, proc_best_name(p), pid, npca->address.sin6.sin6_flowinfo);
+                               return EINVAL;
+                       }
+                       if (npca->address.sin6.sin6_scope_id != 0) {
+                               NECPLOG(LOG_ERR, "%s (process %s:%u) AF_INET6 bad sin6_scope_id %u, not zero", __func__, proc_best_name(p), pid, npca->address.sin6.sin6_scope_id);
+                               return EINVAL;
+                       }
+                       break;
+               }
+               default: {
+                       return EINVAL;
+               }
+               }
+       }
+
+       //
+       // Commit the changed policy
+       //
+       lck_rw_lock_exclusive(&necp_kernel_policy_lock);
+       memset(&necp_drop_dest_policy, 0, sizeof(struct necp_drop_dest_policy));
+
+       necp_drop_dest_policy.entry_count = tmp_drop_dest_policy.entry_count;
+       for (u_int32_t i = 0; i < tmp_drop_dest_policy.entry_count; i++) {
+               struct necp_drop_dest_entry *tmp_drop_dest_entry = &tmp_drop_dest_policy.entries[i];
+               struct necp_drop_dest_entry *necp_drop_dest_entry = &necp_drop_dest_policy.entries[i];
+
+               memcpy(necp_drop_dest_entry, tmp_drop_dest_entry, sizeof(struct necp_drop_dest_entry));
+
+               necp_drop_dest_entry->order = necp_get_first_order_for_priority(necp_drop_dest_entry->level);
+       }
+       lck_rw_done(&necp_kernel_policy_lock);
+
+       return 0;
+}
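
Assuming the sysctl name given in the header comment later in this diff ("net.necp.necp_drop_dest_level"), the handler would be exercised from user space along these lines. A sketch, not part of the commit:

    // Illustrative user-space sketch: install one drop-dest entry.
    // Requires root or PRIV_NET_PRIVILEGED_NECP_POLICIES, per the handler.
    #include <sys/sysctl.h>
    #include <arpa/inet.h>
    #include <string.h>
    #include <stdio.h>

    static int
    install_drop_dest(void)
    {
        struct necp_drop_dest_policy policy;
        memset(&policy, 0, sizeof(policy));

        policy.entry_count = 1;
        policy.entries[0].level = NECP_SESSION_PRIORITY_DEFAULT;

        struct sockaddr_in *sin = &policy.entries[0].cond_addr.address.sin;
        sin->sin_len = sizeof(struct sockaddr_in);
        sin->sin_family = AF_INET;
        inet_pton(AF_INET, "192.0.2.0", &sin->sin_addr); // TEST-NET-1
        policy.entries[0].cond_addr.prefix = 24;         // drop 192.0.2.0/24

        if (sysctlbyname("net.necp.necp_drop_dest_level", NULL, NULL,
            &policy, sizeof(policy)) != 0) {
            perror("sysctlbyname");
            return -1;
        }
        return 0;
    }
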
index 5ae4af20d711adcf0d4132e3e4804623a8eaedd1..b6f9db0afe9862dbd55007845eb184b34b8a6498 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -130,12 +130,25 @@ struct necp_packet_header {
 #define NECP_POLICY_CONDITION_BOUND_INTERFACE   9       // String
 #define NECP_POLICY_CONDITION_TRAFFIC_CLASS             10      // necp_policy_condition_tc_range
 // Socket/IP conditions
-#define NECP_POLICY_CONDITION_IP_PROTOCOL               11      // u_int8_t
+#define NECP_POLICY_CONDITION_IP_PROTOCOL               11      // u_int16_t
 #define NECP_POLICY_CONDITION_LOCAL_ADDR                12      // necp_policy_condition_addr
 #define NECP_POLICY_CONDITION_REMOTE_ADDR               13      // necp_policy_condition_addr
 #define NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE  14      // necp_policy_condition_addr_range
 #define NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE 15      // necp_policy_condition_addr_range
 #define NECP_POLICY_CONDITION_AGENT_TYPE                16      // struct necp_policy_condition_agent_type
+#define NECP_POLICY_CONDITION_HAS_CLIENT                17      // N/A
+#define NECP_POLICY_CONDITION_LOCAL_NETWORKS            18      // Matches all local networks
+// Socket-only conditions
+#define NECP_POLICY_CONDITION_FLOW_IP_PROTOCOL          19      // u_int16_t
+#define NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR           20      // necp_policy_condition_addr
+#define NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR          21      // necp_policy_condition_addr
+#define NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_RANGE     22      // necp_policy_condition_addr_range
+#define NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_RANGE    23      // necp_policy_condition_addr_range
+// Socket/Application conditions, continued
+#define NECP_POLICY_CONDITION_CLIENT_FLAGS              24      // u_int32_t, values from NECP_CLIENT_PARAMETER_FLAG_*
+#define NECP_POLICY_CONDITION_FLOW_LOCAL_ADDR_EMPTY     25      // N/A
+#define NECP_POLICY_CONDITION_FLOW_REMOTE_ADDR_EMPTY    26      // N/A
+#define NECP_POLICY_CONDITION_PLATFORM_BINARY           27      // N/A
 
 /*
  * Results
@@ -156,8 +169,9 @@ struct necp_packet_header {
 #define NECP_POLICY_RESULT_USE_NETAGENT                 14      // netagent uuid_t
 #define NECP_POLICY_RESULT_NETAGENT_SCOPED              15      // netagent uuid_t
 #define NECP_POLICY_RESULT_SCOPED_DIRECT                16      // N/A, scopes to primary physical interface
+#define NECP_POLICY_RESULT_ALLOW_UNENTITLED             17      // N/A
 
-#define NECP_POLICY_RESULT_MAX                                  NECP_POLICY_RESULT_SCOPED_DIRECT
+#define NECP_POLICY_RESULT_MAX                                  NECP_POLICY_RESULT_ALLOW_UNENTITLED
 
 /*
  * Route Rules
@@ -170,9 +184,10 @@ struct necp_packet_header {
 #define NECP_ROUTE_RULE_DENY_LQM_ABORT                  4       // String, or empty to match all
 
 #define NECP_ROUTE_RULE_FLAG_CELLULAR                   0x01
-#define NECP_ROUTE_RULE_FLAG_WIFI                               0x02
-#define NECP_ROUTE_RULE_FLAG_WIRED                              0x04
+#define NECP_ROUTE_RULE_FLAG_WIFI                       0x02
+#define NECP_ROUTE_RULE_FLAG_WIRED                      0x04
 #define NECP_ROUTE_RULE_FLAG_EXPENSIVE                  0x08
+#define NECP_ROUTE_RULE_FLAG_CONSTRAINED                0x10
 
 /*
  * Error types
@@ -303,7 +318,7 @@ struct necp_basic_metadata {
        u_int32_t       rcvbufused;
 };
 
-struct necp_tcp_probe_status {
+struct necp_connection_probe_status {
        unsigned int    probe_activated : 1;
        unsigned int    write_probe_failed : 1;
        unsigned int    read_probe_failed : 1;
@@ -311,7 +326,7 @@ struct necp_tcp_probe_status {
 };
 
 struct necp_extra_tcp_metadata {
-       struct necp_tcp_probe_status probestatus;
+       struct necp_connection_probe_status probestatus;
 
        u_int32_t       sndbufsize;
        u_int32_t       sndbufused;
@@ -323,7 +338,6 @@ struct necp_extra_tcp_metadata {
        u_int32_t       traffic_mgt_flags;
        u_int32_t       cc_alg_index;
        u_int32_t       state;
-       activity_bitmap_t       activity_bitmap;
 };
 
 struct necp_stats_hdr {
@@ -334,11 +348,15 @@ struct necp_stats_hdr {
 
 #define NECP_CLIENT_STATISTICS_TYPE_TCP                         1       // Identifies use of necp_tcp_stats
 #define NECP_CLIENT_STATISTICS_TYPE_UDP                         2       // Identifies use of necp_udp_stats
+#define NECP_CLIENT_STATISTICS_TYPE_QUIC                        3       // Identifies use of necp_quic_stats
+
 #define NECP_CLIENT_STATISTICS_TYPE_TCP_VER_1           1       // Currently supported version for TCP
 #define NECP_CLIENT_STATISTICS_TYPE_UDP_VER_1           1       // Currently supported version for UDP
+#define NECP_CLIENT_STATISTICS_TYPE_QUIC_VER_1          1       // Currently supported version for QUIC
 
 #define NECP_CLIENT_STATISTICS_TYPE_TCP_CURRENT_VER             NECP_CLIENT_STATISTICS_TYPE_TCP_VER_1
 #define NECP_CLIENT_STATISTICS_TYPE_UDP_CURRENT_VER             NECP_CLIENT_STATISTICS_TYPE_UDP_VER_1
+#define NECP_CLIENT_STATISTICS_TYPE_QUIC_CURRENT_VER            NECP_CLIENT_STATISTICS_TYPE_QUIC_VER_1
 
 #define NECP_CLIENT_STATISTICS_EVENT_INIT                       0x00000000              // Register the flow
 #define NECP_CLIENT_STATISTICS_EVENT_TIME_WAIT          0x00000001              // The flow is effectively finished but waiting on timer
@@ -356,10 +374,42 @@ struct necp_udp_stats {
        struct necp_basic_metadata      necp_udp_basic;
 };
 
+
+/*
+ * The following reflects the special case of QUIC: a streaming
+ * protocol built on top of UDP. QUIC stats are therefore defined
+ * as basic UDP stats with some extra metadata.
+ * TODO: For now, the extra metadata is an exact replica of the
+ * TCP metadata. However, keeping it separate allows the structures
+ * to diverge later as new stats are added.
+ */
+#define QUIC_STATELESS_RESET_TOKEN_SIZE               16
+struct necp_extra_quic_metadata {
+       u_int32_t       sndbufsize;
+       u_int32_t       sndbufused;
+       u_int32_t       txunacked;
+       u_int32_t       txwindow;
+       u_int32_t       txcwindow;
+       u_int32_t       traffic_mgt_flags;
+       u_int32_t       cc_alg_index;
+       u_int32_t       state;
+       u_int8_t        ssr_token[QUIC_STATELESS_RESET_TOKEN_SIZE];
+};
+
+#define necp_quic_hdr           necp_quic_udp_stats.necp_udp_hdr
+#define necp_quic_counts        necp_quic_udp_stats.necp_udp_counts
+#define necp_quic_basic         necp_quic_udp_stats.necp_udp_basic
+struct necp_quic_stats {
+       struct necp_udp_stats           necp_quic_udp_stats;
+       struct necp_extra_quic_metadata necp_quic_extra;
+};
+
 typedef struct necp_all_stats {
        union {
                struct necp_tcp_stats   tcp_stats;
                struct necp_udp_stats   udp_stats;
+               struct necp_quic_stats  quic_stats;
        } all_stats_u;
 } necp_all_stats;
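
With the aliases above, QUIC stats are populated like UDP stats plus the extra block. A minimal sketch; the header field names are assumed from struct necp_stats_hdr, and `reset_token` is a hypothetical local:

    // Illustrative only: fill the shared UDP fields through the aliases,
    // then the QUIC-specific extras.
    struct necp_quic_stats qstats;
    memset(&qstats, 0, sizeof(qstats));
    qstats.necp_quic_hdr.necp_stats_type = NECP_CLIENT_STATISTICS_TYPE_QUIC;
    qstats.necp_quic_hdr.necp_stats_ver = NECP_CLIENT_STATISTICS_TYPE_QUIC_CURRENT_VER;
    memcpy(qstats.necp_quic_extra.ssr_token, reset_token,
        QUIC_STATELESS_RESET_TOKEN_SIZE);
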
 
@@ -448,7 +498,8 @@ typedef struct necp_cache_buffer {
 #define NECP_CLIENT_ACTION_COPY_UPDATED_RESULT                  16 // Copy client result only if changed. Input: client_id; Output: result in buffer
 #define NECP_CLIENT_ACTION_ADD_FLOW                                             17 // Add a flow. Input: client_id; Output: struct necp_client_add_flow
 #define NECP_CLIENT_ACTION_REMOVE_FLOW                                  18 // Remove a flow. Input: flow_id, optional struct ifnet_stats_per_flow
-
+#define NECP_CLIENT_ACTION_CLAIM                                        19 // Claim a client that has been added for this unique PID. Input: client_id
+#define NECP_CLIENT_ACTION_SIGN                                         20 // Sign a resolver answer. Input: struct necp_client_resolver_answer; Output: signed tag, expected to be 32 bytes
 
 #define NECP_CLIENT_PARAMETER_APPLICATION                               NECP_POLICY_CONDITION_APPLICATION               // Requires entitlement
 #define NECP_CLIENT_PARAMETER_REAL_APPLICATION                  NECP_POLICY_CONDITION_REAL_APPLICATION  // Requires entitlement
@@ -487,9 +538,24 @@ typedef struct necp_cache_buffer {
 #define NECP_CLIENT_PARAMETER_ASSERT_AGENT                              131             // uuid_t, network agent UUID
 #define NECP_CLIENT_PARAMETER_UNASSERT_AGENT                    132             // uuid_t, network agent UUID
 
+#define NECP_CLIENT_PARAMETER_PARENT_ID                                 150             // uuid_t, client UUID
+
 #define NECP_CLIENT_PARAMETER_LOCAL_ENDPOINT                    200             // struct necp_client_endpoint
 #define NECP_CLIENT_PARAMETER_REMOTE_ENDPOINT                   201             // struct necp_client_endpoint
-#define NECP_CLIENT_PARAMETER_BROWSE_CATEGORY                   202             // struct necp_client_endpoint
+#define NECP_CLIENT_PARAMETER_BROWSE_DESCRIPTOR                 202             // struct necp_client_endpoint
+#define NECP_CLIENT_PARAMETER_RESOLVER_TAG                      203             // Tag as bytes, expected to be 32 bytes
+#define NECP_CLIENT_PARAMETER_ADVERTISE_DESCRIPTOR              204             // struct necp_client_endpoint
+
+#define NECP_CLIENT_PARAMETER_DELEGATED_UPID                    210             // u_int64_t, requires entitlement
+
+#define NECP_CLIENT_PARAMETER_ETHERTYPE                         220             // u_int16_t, ethertype
+#define NECP_CLIENT_PARAMETER_TRANSPORT_PROTOCOL                221             // u_int8_t, IPPROTO_
+
+#define NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE          230             // u_int8_t, NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_
+
+#define NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_DEFAULT          0
+#define NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_TEMPORARY        1
+#define NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_STABLE           2
 
 #define NECP_CLIENT_PARAMETER_FLAGS                                             250             // u_int32_t, see NECP_CLIENT_PARAMETER_FLAG_* values
 
@@ -501,8 +567,13 @@ typedef struct necp_cache_buffer {
 #define NECP_CLIENT_PARAMETER_FLAG_ECN_ENABLE                   0x0020  // Client is requesting to enable ECN
 #define NECP_CLIENT_PARAMETER_FLAG_ECN_DISABLE                  0x0040  // Client is requesting to disable ECN
 #define NECP_CLIENT_PARAMETER_FLAG_TFO_ENABLE                   0x0080  // Client is requesting to enable TFO
-#define NECP_CLIENT_PARAMETER_FLAG_ONLY_PRIMARY_REQUIRES_TYPE 0x0100    // Interpret NECP_CLIENT_PARAMETER_REQUIRE_IF_TYPE only for primary
-// interface, and allow exceptions for multipath or listeners
+#define NECP_CLIENT_PARAMETER_FLAG_ONLY_PRIMARY_REQUIRES_TYPE   0x0100    // Interpret NECP_CLIENT_PARAMETER_REQUIRE_IF_TYPE only for primary interface, and allow exceptions for multipath or listeners
+#define NECP_CLIENT_PARAMETER_FLAG_CUSTOM_ETHER                 0x0200  // Client expects to open a custom ethernet channel
+#define NECP_CLIENT_PARAMETER_FLAG_CUSTOM_IP                    0x0400  // Client expects to open a custom IP protocol channel
+#define NECP_CLIENT_PARAMETER_FLAG_INTERPOSE                    0x0800  // Client expects to open an interpose filter channel
+#define NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED         0x1000  // Prohibit constrained interfaces
+#define NECP_CLIENT_PARAMETER_FLAG_FALLBACK_TRAFFIC             0x2000  // Fallback traffic
+#define NECP_CLIENT_PARAMETER_FLAG_INBOUND                      0x4000  // Flow is inbound (passive)
 
 #define NECP_CLIENT_RESULT_CLIENT_ID                                    1               // uuid_t
 #define NECP_CLIENT_RESULT_POLICY_RESULT                                2               // u_int32_t
@@ -521,6 +592,7 @@ typedef struct necp_cache_buffer {
 #define NECP_CLIENT_RESULT_RECOMMENDED_MSS                              15              // u_int8_t
 #define NECP_CLIENT_RESULT_FLOW_ID                                              16              // uuid_t
 #define NECP_CLIENT_RESULT_INTERFACE_TIME_DELTA                 17              // u_int32_t, seconds since interface up/down
+#define NECP_CLIENT_RESULT_REASON                                               18              // u_int32_t, see NECP_CLIENT_RESULT_REASON_* values
 
 #define NECP_CLIENT_RESULT_NEXUS_INSTANCE                               100             // uuid_t
 #define NECP_CLIENT_RESULT_NEXUS_PORT                                   101             // u_int16_t
@@ -531,8 +603,12 @@ typedef struct necp_cache_buffer {
 #define NECP_CLIENT_RESULT_LOCAL_ENDPOINT                               200             // struct necp_client_endpoint
 #define NECP_CLIENT_RESULT_REMOTE_ENDPOINT                              201             // struct necp_client_endpoint
 #define NECP_CLIENT_RESULT_DISCOVERED_ENDPOINT                  202             // struct necp_client_endpoint, result of browse
+#define NECP_CLIENT_RESULT_RESOLVED_ENDPOINT                    203             // struct necp_client_endpoint, result of resolve
+#define NECP_CLIENT_RESULT_LOCAL_ETHER_ADDR                     204             // struct ether_addr
+#define NECP_CLIENT_RESULT_REMOTE_ETHER_ADDR                    205             // struct ether_addr
 #define NECP_CLIENT_RESULT_EFFECTIVE_TRAFFIC_CLASS              210             // u_int32_t
 #define NECP_CLIENT_RESULT_TRAFFIC_MGMT_BG                              211             // u_int32_t, 1: background, 0: not background
+#define NECP_CLIENT_RESULT_GATEWAY                                      212             // struct necp_client_endpoint
 
 #define NECP_CLIENT_RESULT_FLAG_IS_LOCAL                                0x0001  // Routes to this device
 #define NECP_CLIENT_RESULT_FLAG_IS_DIRECT                               0x0002  // Routes to directly accessible peer
@@ -549,6 +625,7 @@ typedef struct necp_cache_buffer {
 #define NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING               0x1000  // QoS marking is allowed
 #define NECP_CLIENT_RESULT_FLAG_HAS_NAT64                       0x2000  // Has NAT64 prefix
 #define NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER             0x4000  // Interface is in low-power mode
+#define NECP_CLIENT_RESULT_FLAG_SPECIFIC_LISTENER               0x8000  // Listener should not listen on all interfaces
 
 #define NECP_CLIENT_RESULT_FLAG_FORCE_UPDATE (NECP_CLIENT_RESULT_FLAG_HAS_IPV4 | NECP_CLIENT_RESULT_FLAG_HAS_IPV6 | NECP_CLIENT_RESULT_FLAG_HAS_NAT64 | NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER)
 
@@ -559,6 +636,11 @@ typedef struct necp_cache_buffer {
 #define NECP_CLIENT_RESULT_RECOMMENDED_MSS_LOW                  0x02
 #define NECP_CLIENT_RESULT_RECOMMENDED_MSS_MEDIUM               0x04
 
+#define NECP_CLIENT_RESULT_REASON_EXPENSIVE_PROHIBITED          1       // Expensive networks were prohibited
+#define NECP_CLIENT_RESULT_REASON_CONSTRAINED_PROHIBITED        2       // Constrained networks were prohibited
+#define NECP_CLIENT_RESULT_REASON_CELLULAR_DENIED               3       // Denied by a cellular route rule
+#define NECP_CLIENT_RESULT_REASON_WIFI_DENIED                   4       // Denied by a wifi route rule
+
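+// A hypothetical helper (not in this commit) showing how the new reason
+// codes might be rendered for logging:
+//
+//     static const char *
+//     necp_reason_to_string(u_int32_t reason)
+//     {
+//         switch (reason) {
+//         case NECP_CLIENT_RESULT_REASON_EXPENSIVE_PROHIBITED:
+//             return "expensive prohibited";
+//         case NECP_CLIENT_RESULT_REASON_CONSTRAINED_PROHIBITED:
+//             return "constrained prohibited";
+//         case NECP_CLIENT_RESULT_REASON_CELLULAR_DENIED:
+//             return "cellular denied";
+//         case NECP_CLIENT_RESULT_REASON_WIFI_DENIED:
+//             return "wifi denied";
+//         default:
+//             return "unknown";
+//         }
+//     }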
 struct necp_interface_signature {
        u_int8_t signature[IFNET_SIGNATURELEN];
        u_int8_t signature_len;
@@ -574,6 +656,8 @@ struct necp_interface_details {
        u_int32_t mtu;
        struct necp_interface_signature ipv4_signature;
        struct necp_interface_signature ipv6_signature;
+       u_int32_t ipv4_netmask;
+       u_int32_t ipv4_broadcast;
 };
 
 #define NECP_INTERFACE_FLAG_EXPENSIVE                                   0x0001
@@ -581,6 +665,10 @@ struct necp_interface_details {
 #define NECP_INTERFACE_FLAG_NOACKPRI                                    0x0004
 #define NECP_INTERFACE_FLAG_3CARRIERAGG                                 0x0008
 #define NECP_INTERFACE_FLAG_IS_LOW_POWER                                0x0010
+#define NECP_INTERFACE_FLAG_MPK_LOG                                     0x0020 // Multi-layer Packet Logging
+#define NECP_INTERFACE_FLAG_CONSTRAINED                                 0x0040
+#define NECP_INTERFACE_FLAG_HAS_NETMASK                                 0x0080
+#define NECP_INTERFACE_FLAG_HAS_BROADCAST                               0x0100
 
 struct necp_client_parameter_netagent_type {
        char netagent_domain[32];
@@ -629,6 +717,8 @@ struct kev_necp_policies_changed_data {
 
 #define NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS                      0x01    // Request a nexus instance upon adding a flow
 #define NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID            0x02    // Register the client ID rather than the flow registration ID with network agents
+#define NECP_CLIENT_FLOW_FLAGS_BROWSE                      0x04    // Create request with a browse agent
+#define NECP_CLIENT_FLOW_FLAGS_RESOLVE                      0x08    // Create request with a resolution agent
 
 struct necp_client_flow_stats {
        u_int32_t stats_type; // NECP_CLIENT_STATISTICS_TYPE_*
@@ -666,6 +756,41 @@ struct necp_client_observer_update {
        u_int8_t tlv_buffer[0]; // Parameters or result as TLVs, based on type
 };
 
+#define NECP_CLIENT_SIGN_TYPE_RESOLVER_ANSWER           1
+
+struct necp_client_signable {
+       uuid_t client_id;
+       u_int32_t sign_type;
+} __attribute__((__packed__));
+
+struct necp_client_resolver_answer {
+       uuid_t client_id;
+       u_int32_t sign_type;
+       union sockaddr_in_4_6 address_answer;
+       u_int32_t hostname_length;
+       // hostname
+} __attribute__((__packed__));
+
+#define NECP_FILTER_UNIT_NO_FILTER              UINT32_MAX // Reserved filter unit value that prohibits all filters and socket filters
+
+/*
+ * The sysctl "net.necp.necp_drop_dest_level" controls the global drop rule policy for
+ * a set of destinations addresses at the given level -- the drop rule is the last one
+ * to be evaluated at this level.
+ */
+#define MAX_NECP_DROP_DEST_LEVEL_ADDRS 8
+
+struct necp_drop_dest_entry {
+       u_int32_t                           level;          // priority level
+       u_int32_t                           order;          // session order (read only via sysctl)
+       struct necp_policy_condition_addr   cond_addr;
+};
+
+struct necp_drop_dest_policy {
+       u_int32_t entry_count;
+       struct necp_drop_dest_entry entries[MAX_NECP_DROP_DEST_LEVEL_ADDRS];
+};
+
 #ifdef BSD_KERNEL_PRIVATE
 #include <stdbool.h>
 #include <sys/socketvar.h>
@@ -675,6 +800,8 @@ struct necp_client_observer_update {
 #include <net/if_var.h>
 #include <sys/syslog.h>
 #include <net/network_agent.h>
+#include <net/ethernet.h>
+
 
 SYSCTL_DECL(_net_necp);
 
@@ -713,10 +840,13 @@ struct necp_all_kstats {
 extern errno_t necp_client_init(void);
 extern int necp_application_find_policy_match_internal(proc_t proc, u_int8_t *parameters, u_int32_t parameters_size,
     struct necp_aggregate_result *returned_result,
-    u_int32_t *flags, u_int required_interface_index,
+    u_int32_t *flags, u_int32_t *reason, u_int required_interface_index,
     const union necp_sockaddr_union *override_local_addr,
     const union necp_sockaddr_union *override_remote_addr,
-    struct rtentry **returned_route, bool ignore_address);
+    struct necp_client_endpoint *returned_v4_gateway,
+    struct necp_client_endpoint *returned_v6_gateway,
+    struct rtentry **returned_route, bool ignore_address,
+    bool has_client);
 /*
  * TLV utilities
  *
@@ -736,7 +866,7 @@ extern u_int8_t *necp_buffer_write_tlv_if_different(u_int8_t *cursor, u_int8_t t
 extern u_int8_t necp_buffer_get_tlv_type(u_int8_t *buffer, int tlv_offset);
 extern u_int32_t necp_buffer_get_tlv_length(u_int8_t *buffer, int tlv_offset);
 extern u_int8_t *necp_buffer_get_tlv_value(u_int8_t *buffer, int tlv_offset, u_int32_t *value_size);
-extern int necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_int8_t type, int next);
+extern int necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int offset, u_int8_t type, int *err, int next);
 
 #define NECPCTL_DROP_ALL_LEVEL                          1       /* Drop all packets if no policy matches above this level */
 #define NECPCTL_DEBUG                                           2       /* Log all kernel policy matches */
@@ -755,13 +885,16 @@ extern int necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int o
 #define NECPCTL_OBSERVER_FD_COUNT                       15      /* Count of NECP observer fds */
 #define NECPCTL_OBSERVER_MESSAGE_LIMIT          16      /* Number of NECP observer messages allowed to be queued */
 #define NECPCTL_SYSCTL_ARENA_COUNT                      17      /* Count of sysctl arenas */
+#define NECPCTL_DROP_UNENTITLED_LEVEL                   18      /* Drop unentitled process traffic above this level */
+#define NECPCTL_PASS_INTERPOSE                          19      /* Pass interpose */
 
 #define NECPCTL_NAMES {                                 \
        { 0, 0 },                                                       \
        { "drop_all_level", CTLTYPE_INT },      \
        { "debug", CTLTYPE_INT },                       \
        { "pass_loopback", CTLTYPE_INT },       \
-       { "pass_keepalives", CTLTYPE_INT },     \
+    { "pass_keepalives", CTLTYPE_INT },     \
+    { "pass_interpose", CTLTYPE_INT },      \
 }
 
 typedef u_int32_t necp_kernel_policy_id;
@@ -789,6 +922,7 @@ typedef u_int32_t necp_app_id;
 #define NECP_KERNEL_POLICY_RESULT_USE_NETAGENT                  NECP_POLICY_RESULT_USE_NETAGENT
 #define NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED               NECP_POLICY_RESULT_NETAGENT_SCOPED
 #define NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT                 NECP_POLICY_RESULT_SCOPED_DIRECT
+#define NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED              NECP_POLICY_RESULT_ALLOW_UNENTITLED
 
 typedef struct {
        u_int32_t identifier;
@@ -821,6 +955,7 @@ struct necp_kernel_socket_policy {
 
        u_int32_t                                       condition_mask;
        u_int32_t                                       condition_negated_mask;
+       u_int32_t                                       cond_client_flags;
        necp_kernel_policy_id           cond_policy_id;
        u_int32_t                                       cond_app_id;                                    // Locally assigned ID value stored
        u_int32_t                                       cond_real_app_id;                               // Locally assigned ID value stored
@@ -932,7 +1067,10 @@ extern bool necp_socket_should_rescope(struct inpcb *inp);
 extern u_int necp_socket_get_rescope_if_index(struct inpcb *inp);
 extern u_int32_t necp_socket_get_effective_mtu(struct inpcb *inp, u_int32_t current_mtu);
 
-extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id,
+extern bool necp_socket_is_allowed_to_recv_on_interface(struct inpcb *inp, ifnet_t interface);
+
+extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t interface,
+    necp_kernel_policy_id *return_policy_id,
     u_int32_t *return_route_rule_id,
     necp_kernel_policy_id *return_skip_policy_id);
 extern bool necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port,
@@ -950,6 +1088,7 @@ extern int necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp,
     u_int32_t route_rule_id, necp_kernel_policy_id skip_policy_id);
 extern necp_kernel_policy_id necp_get_policy_id_from_packet(struct mbuf *packet);
 extern necp_kernel_policy_id necp_get_skip_policy_id_from_packet(struct mbuf *packet);
+extern bool necp_packet_should_skip_filters(struct mbuf *packet);
 extern u_int32_t necp_get_last_interface_index_from_packet(struct mbuf *packet);
 extern u_int32_t necp_get_route_rule_id_from_packet(struct mbuf *packet);
 extern int necp_get_app_uuid_from_packet(struct mbuf *packet,
@@ -958,9 +1097,11 @@ extern int necp_get_app_uuid_from_packet(struct mbuf *packet,
 extern necp_kernel_policy_id necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local_addr,
     struct sockaddr *override_remote_addr, u_int32_t override_bound_interface);
 extern necp_kernel_policy_id necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_args *ipoa,
+    struct rtentry *rt,
     necp_kernel_policy_result *result,
     necp_kernel_policy_result_parameter *result_parameter);
 extern necp_kernel_policy_id necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out_args *ip6oa,
+    struct rtentry *rt,
     necp_kernel_policy_result *result,
     necp_kernel_policy_result_parameter *result_parameter);
 
@@ -975,11 +1116,20 @@ extern bool necp_packet_is_allowed_over_interface(struct mbuf *packet, struct if
 extern int necp_mark_packet_as_keepalive(struct mbuf *packet, bool is_keepalive);
 extern bool necp_get_is_keepalive_from_packet(struct mbuf *packet);
 
+extern int necp_sign_resolver_answer(uuid_t client_id, u_int8_t *query, u_int32_t query_length,
+    u_int8_t *answer, u_int32_t answer_length,
+    u_int8_t *tag, u_int32_t *out_tag_length);
+
+extern bool necp_validate_resolver_answer(uuid_t client_id, u_int8_t *query, u_int32_t query_length,
+    u_int8_t *answer, u_int32_t answer_length,
+    u_int8_t *tag, u_int32_t tag_length);
+
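+// The two declarations above pair up as a sign/verify round trip. A hedged
+// sketch with assumed local buffers; the 32-byte tag size follows the
+// comments in this header:
+//
+//     u_int8_t tag[32];
+//     u_int32_t tag_length = sizeof(tag);
+//
+//     if (necp_sign_resolver_answer(client_id, query, query_length,
+//         answer, answer_length, tag, &tag_length) == 0) {
+//         bool valid = necp_validate_resolver_answer(client_id, query,
+//             query_length, answer, answer_length, tag, tag_length);
+//         // valid is true only if the tag binds this client/query/answer
+//     }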
 extern void necp_update_all_clients(void); // Handle general re-evaluate event
+extern void necp_update_all_clients_immediately_if_needed(bool should_update_immediately); // Handle general re-evaluate event
 
 extern void necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid, u_int32_t agent_generation); // Cause a single client to get an update event
 
-extern void necp_set_client_as_background(proc_t proc, struct fileproc *fp, bool background); // Set all clients for an fp as background or not
+extern bool necp_set_client_as_background(proc_t proc, struct fileproc *fp, bool background); // Set all clients for an fp as background or not
 
 struct necp_fd_data;
 extern void necp_fd_memstatus(proc_t proc, uint32_t status, struct necp_fd_data *client_fd); // Purge memory of clients for the process
@@ -987,6 +1137,9 @@ extern void necp_fd_defunct(proc_t proc, struct necp_fd_data *client_fd); // Set
 
 extern int necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp);
 
+extern int necp_client_register_socket_listener(pid_t pid, uuid_t client_id, struct inpcb *inp);
+
+
 extern int necp_client_assert_bb_radio_manager(uuid_t client_id, bool assert);
 
 extern int necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp);
@@ -1007,6 +1160,7 @@ necp_update_flow_protoctl_event(uuid_t netagent_uuid, uuid_t client_id,
 #define NECP_FLOWADV_IDX_INVALID        UINT32_MAX
 extern void *necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, void *key, uint32_t key_length,
     struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint,
+    struct ether_addr *local_ether_addr,
     u_int32_t flow_adv_index, void *flow_stats, size_t *message_length);
 
 struct necp_client_nexus_parameters {
@@ -1015,11 +1169,24 @@ struct necp_client_nexus_parameters {
        uuid_t euuid;
        union necp_sockaddr_union local_addr;
        union necp_sockaddr_union remote_addr;
-       u_int16_t ip_protocol;
+       u_int8_t ip_protocol;
+       u_int8_t transport_protocol;
+       u_int16_t ethertype;
        u_int32_t traffic_class;
        necp_policy_id policy_id;
        unsigned is_listener:1;
+       unsigned is_interpose:1;
+       unsigned is_custom_ether:1;
        unsigned allow_qos_marking:1;
+       unsigned override_address_selection:1;
+       unsigned use_stable_address:1; // Used if override_address_selection is set
+};
+
+struct necp_client_agent_parameters {
+       union {
+               struct necp_client_nexus_parameters nexus_request;
+               u_int8_t close_token[QUIC_STATELESS_RESET_TOKEN_SIZE];
+       } u;
 };
 
 #define NECP_CLIENT_CBACTION_NONVIABLE  1
@@ -1040,6 +1207,13 @@ extern void necp_client_reap_caches(boolean_t purge);
 
 
 #endif /* BSD_KERNEL_PRIVATE */
+
+#ifdef KERNEL
+#ifdef KERNEL_PRIVATE
+extern bool net_domain_contains_hostname(char *hostname_string, char *domain_string);
+#endif /* KERNEL_PRIVATE */
+#endif /* KERNEL */
+
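
net_domain_contains_hostname is only declared in this hunk; from the name, it presumably tests whether a hostname falls under a domain via a label-boundary suffix match. A hedged userspace sketch of that semantic (demo name hypothetical):

#include <stdbool.h>
#include <string.h>
#include <strings.h>

// Case-insensitive check that hostname equals domain, or ends with
// ".domain" on a label boundary. Illustrative only; the kernel's
// exact matching rules may differ.
static bool
demo_domain_contains_hostname(const char *hostname, const char *domain)
{
	size_t hlen = strlen(hostname);
	size_t dlen = strlen(domain);
	if (dlen == 0 || dlen > hlen) {
		return false;
	}
	const char *tail = hostname + (hlen - dlen);
	if (strcasecmp(tail, domain) != 0) {
		return false;
	}
	return hlen == dlen || tail[-1] == '.';
}
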
 #ifndef KERNEL
 
 extern int necp_match_policy(const uint8_t *parameters, size_t parameters_size, struct necp_aggregate_result *returned_result);
index 893ce06f04af39ff9deffd908895792a59e2917e..ec1fd72f0e8645b9b24d5ca1792c69308261a616 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -41,6 +41,7 @@
 #include <net/ntstat.h>
 
 #include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/mp_pcb.h>
@@ -66,6 +67,8 @@
 #include <sys/codesign.h>
 #include <libkern/section_keywords.h>
 
+#include <os/refcnt.h>
+
 
 /*
  * NECP Client Architecture
 
 extern u_int32_t necp_debug;
 
-static int noop_read(struct fileproc *, struct uio *, int, vfs_context_t);
-static int noop_write(struct fileproc *, struct uio *, int, vfs_context_t);
-static int noop_ioctl(struct fileproc *, unsigned long, caddr_t,
-    vfs_context_t);
 static int necpop_select(struct fileproc *, int, void *, vfs_context_t);
 static int necpop_close(struct fileglob *, vfs_context_t);
-static int necpop_kqfilter(struct fileproc *, struct knote *,
-    struct kevent_internal_s *kev, vfs_context_t);
+static int necpop_kqfilter(struct fileproc *, struct knote *, struct kevent_qos_s *);
 
 // Timer functions
 static int necp_timeout_microseconds = 1000 * 100; // 100ms
@@ -165,6 +163,8 @@ static int necp_socket_flow_count = 0;
 static int necp_if_flow_count = 0;
 static int necp_observer_message_limit = 256;
 
+os_refgrp_decl(static, necp_client_refgrp, "NECPClientRefGroup", NULL);
+
 SYSCTL_INT(_net_necp, NECPCTL_CLIENT_FD_COUNT, client_fd_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_client_fd_count, 0, "");
 SYSCTL_INT(_net_necp, NECPCTL_OBSERVER_FD_COUNT, observer_fd_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_observer_fd_count, 0, "");
 SYSCTL_INT(_net_necp, NECPCTL_CLIENT_COUNT, client_count, CTLFLAG_LOCKED | CTLFLAG_RD, &necp_client_count, 0, "");
@@ -199,26 +199,36 @@ extern unsigned int get_maxmtu(struct rtentry *);
 #define NECP_PARSED_PARAMETERS_FIELD_EFFECTIVE_UUID                     0x20000
 #define NECP_PARSED_PARAMETERS_FIELD_TRAFFIC_CLASS                      0x40000
 #define NECP_PARSED_PARAMETERS_FIELD_LOCAL_PORT                         0x80000
+#define NECP_PARSED_PARAMETERS_FIELD_DELEGATED_UPID                     0x100000
+#define NECP_PARSED_PARAMETERS_FIELD_ETHERTYPE                          0x200000
+#define NECP_PARSED_PARAMETERS_FIELD_TRANSPORT_PROTOCOL                 0x400000
+#define NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR_PREFERENCE              0x800000
 
-#define NECP_MAX_PARSED_PARAMETERS 16
+
+#define NECP_MAX_INTERFACE_PARAMETERS 16
+#define NECP_MAX_AGENT_PARAMETERS 4
 struct necp_client_parsed_parameters {
        u_int32_t valid_fields;
        u_int32_t flags;
+       u_int64_t delegated_upid;
        union necp_sockaddr_union local_addr;
        union necp_sockaddr_union remote_addr;
        u_int32_t required_interface_index;
-       char prohibited_interfaces[IFXNAMSIZ][NECP_MAX_PARSED_PARAMETERS];
+       char prohibited_interfaces[NECP_MAX_INTERFACE_PARAMETERS][IFXNAMSIZ];
        u_int8_t required_interface_type;
-       u_int8_t prohibited_interface_types[NECP_MAX_PARSED_PARAMETERS];
-       struct necp_client_parameter_netagent_type required_netagent_types[NECP_MAX_PARSED_PARAMETERS];
-       struct necp_client_parameter_netagent_type prohibited_netagent_types[NECP_MAX_PARSED_PARAMETERS];
-       struct necp_client_parameter_netagent_type preferred_netagent_types[NECP_MAX_PARSED_PARAMETERS];
-       struct necp_client_parameter_netagent_type avoided_netagent_types[NECP_MAX_PARSED_PARAMETERS];
-       uuid_t required_netagents[NECP_MAX_PARSED_PARAMETERS];
-       uuid_t prohibited_netagents[NECP_MAX_PARSED_PARAMETERS];
-       uuid_t preferred_netagents[NECP_MAX_PARSED_PARAMETERS];
-       uuid_t avoided_netagents[NECP_MAX_PARSED_PARAMETERS];
-       u_int16_t ip_protocol;
+       u_int8_t local_address_preference;
+       u_int8_t prohibited_interface_types[NECP_MAX_INTERFACE_PARAMETERS];
+       struct necp_client_parameter_netagent_type required_netagent_types[NECP_MAX_AGENT_PARAMETERS];
+       struct necp_client_parameter_netagent_type prohibited_netagent_types[NECP_MAX_AGENT_PARAMETERS];
+       struct necp_client_parameter_netagent_type preferred_netagent_types[NECP_MAX_AGENT_PARAMETERS];
+       struct necp_client_parameter_netagent_type avoided_netagent_types[NECP_MAX_AGENT_PARAMETERS];
+       uuid_t required_netagents[NECP_MAX_AGENT_PARAMETERS];
+       uuid_t prohibited_netagents[NECP_MAX_AGENT_PARAMETERS];
+       uuid_t preferred_netagents[NECP_MAX_AGENT_PARAMETERS];
+       uuid_t avoided_netagents[NECP_MAX_AGENT_PARAMETERS];
+       u_int8_t ip_protocol;
+       u_int8_t transport_protocol;
+       u_int16_t ethertype;
        pid_t effective_pid;
        uuid_t effective_uuid;
        u_int32_t traffic_class;
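
The prohibited_interfaces change in this hunk is a bugfix, not a rename: the old [IFXNAMSIZ][NECP_MAX_PARSED_PARAMETERS] declaration transposed the array dimensions, so entry i was a short slice rather than a contiguous IFXNAMSIZ-byte name buffer. A standalone illustration of why the dimension order matters (the constants here are stand-in values for the demo; strlcpy is available on BSD/macOS):

#include <stdio.h>
#include <string.h>

#define IFXNAMSIZ 32                    // illustrative value only
#define NECP_MAX_INTERFACE_PARAMETERS 16

int
main(void)
{
	// Correct: 16 entries, each a contiguous 32-byte name buffer.
	char good[NECP_MAX_INTERFACE_PARAMETERS][IFXNAMSIZ];
	// Transposed (the old bug): 32 entries of only 16 bytes each.
	char bad[IFXNAMSIZ][NECP_MAX_INTERFACE_PARAMETERS];

	// Same total size, but the per-entry stride differs: 32 vs 16.
	printf("entry size: good=%zu bad=%zu\n",
	    sizeof(good[0]), sizeof(bad[0]));

	// A bounded copy into good[0] has the full IFXNAMSIZ available;
	// the same copy into bad[0] would be limited to 16 bytes.
	strlcpy(good[0], "en0", sizeof(good[0]));
	return 0;
}
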
@@ -234,18 +244,20 @@ necp_ifnet_matches_local_address(struct ifnet *ifp, struct sockaddr *sa);
 static bool
 necp_ifnet_matches_parameters(struct ifnet *ifp,
     struct necp_client_parsed_parameters *parsed_parameters,
+    u_int32_t override_flags,
     u_int32_t *preferred_count,
-    bool secondary_interface);
+    bool secondary_interface,
+    bool require_scoped_field);
 
 static const struct fileops necp_fd_ops = {
-       .fo_type = DTYPE_NETPOLICY,
-       .fo_read = noop_read,
-       .fo_write = noop_write,
-       .fo_ioctl = noop_ioctl,
-       .fo_select = necpop_select,
-       .fo_close = necpop_close,
+       .fo_type     = DTYPE_NETPOLICY,
+       .fo_read     = fo_no_read,
+       .fo_write    = fo_no_write,
+       .fo_ioctl    = fo_no_ioctl,
+       .fo_select   = necpop_select,
+       .fo_close    = necpop_close,
+       .fo_drain    = fo_no_drain,
        .fo_kqfilter = necpop_kqfilter,
-       .fo_drain = NULL,
 };
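
This hunk drops the file-private noop handlers (deleted further down) in favor of the kernel-wide fo_no_* stubs and designated initializers. The general shape of an ops table with shared no-op defaults, in a self-contained form (demo_* names are hypothetical):

#include <errno.h>

struct demo_ops {
	int (*op_read)(void *cookie);
	int (*op_write)(void *cookie);
	int (*op_ioctl)(void *cookie);
};

// Shared no-op stubs: one definition serves every ops table that
// lacks the operation, replacing per-file noop_* copies.
static int demo_no_read(void *cookie)  { (void)cookie; return ENXIO; }
static int demo_no_write(void *cookie) { (void)cookie; return ENXIO; }
static int demo_no_ioctl(void *cookie) { (void)cookie; return ENOTTY; }

// Designated initializers keep the table readable and make the
// unimplemented entries explicit.
static const struct demo_ops demo_fd_ops = {
	.op_read  = demo_no_read,
	.op_write = demo_no_write,
	.op_ioctl = demo_no_ioctl,
};
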
 
 struct necp_client_assertion {
@@ -338,15 +350,13 @@ struct necp_client {
 
        decl_lck_mtx_data(, lock);
        decl_lck_mtx_data(, route_lock);
-       uint32_t reference_count;
+       os_refcnt_t reference_count;
 
        uuid_t client_id;
        unsigned result_read : 1;
        unsigned allow_multiple_flows : 1;
        unsigned legacy_client_is_flow : 1;
 
-       unsigned background : 1;
-       unsigned background_update : 1;
        unsigned platform_binary : 1;
 
        size_t result_length;
@@ -354,9 +364,11 @@ struct necp_client {
 
        necp_policy_id policy_id;
 
-       u_int16_t ip_protocol;
+       u_int8_t ip_protocol;
        int proc_pid;
 
+       u_int64_t delegated_upid;
+
        struct _necp_client_flow_tree flow_registrations;
        LIST_HEAD(_necp_client_assertion_list, necp_client_assertion) assertion_list;
 
@@ -370,6 +382,7 @@ struct necp_client {
 
        void *agent_handle;
 
+
        size_t parameters_length;
        u_int8_t parameters[0];
 };
@@ -383,8 +396,9 @@ struct necp_client {
 #define NECP_CLIENT_ROUTE_UNLOCK(_c) lck_mtx_unlock(&_c->route_lock)
 
 static void necp_client_retain_locked(struct necp_client *client);
-static void necp_client_retain(struct necp_client *client);
+
 static bool necp_client_release_locked(struct necp_client *client);
+static bool necp_client_release(struct necp_client *client);
 
 static void
 necp_client_add_assertion(struct necp_client *client, uuid_t netagent_uuid);
@@ -402,6 +416,9 @@ struct necp_flow_defunct {
        uuid_t nexus_agent;
        void *agent_handle;
        int proc_pid;
+       u_int32_t flags;
+       struct necp_client_agent_parameters close_parameters;
+       bool has_close_parameters;
 };
 
 LIST_HEAD(_necp_flow_defunct_list, necp_flow_defunct);
@@ -449,6 +466,9 @@ struct necp_fd_data {
        TAILQ_HEAD(_necp_client_update_list, necp_client_update) update_list;
        int update_count;
        int flags;
+
+       unsigned background : 1;
+
        int proc_pid;
        decl_lck_mtx_data(, fd_lock);
        struct selinfo si;
@@ -537,29 +557,6 @@ static thread_call_t necp_client_update_tcall;
 
 /// NECP file descriptor functions
 
-static int
-noop_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
-{
-#pragma unused(fp, uio, flags, ctx)
-       return ENXIO;
-}
-
-static int
-noop_write(struct fileproc *fp, struct uio *uio, int flags,
-    vfs_context_t ctx)
-{
-#pragma unused(fp, uio, flags, ctx)
-       return ENXIO;
-}
-
-static int
-noop_ioctl(struct fileproc *fp, unsigned long com, caddr_t data,
-    vfs_context_t ctx)
-{
-#pragma unused(fp, com, data, ctx)
-       return ENOTTY;
-}
-
 static void
 necp_fd_notify(struct necp_fd_data *fd_data, bool locked)
 {
@@ -793,9 +790,8 @@ necp_fd_knread(struct knote *kn, long hint)
 }
 
 static int
-necp_fd_knrprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+necp_fd_knrprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
        struct necp_fd_data *fd_data;
        int revents;
        int res;
@@ -806,14 +802,14 @@ necp_fd_knrprocess(struct knote *kn, struct filt_process_s *data, struct kevent_
        revents = necp_fd_poll(fd_data, POLLIN, NULL, current_proc(), 1);
        res = ((revents & POLLIN) != 0);
        if (res) {
-               *kev = kn->kn_kevent;
+               knote_fill_kevent(kn, kev, 0);
        }
        NECP_FD_UNLOCK(fd_data);
        return res;
 }
 
 static int
-necp_fd_knrtouch(struct knote *kn, struct kevent_internal_s *kev)
+necp_fd_knrtouch(struct knote *kn, struct kevent_qos_s *kev)
 {
 #pragma unused(kev)
        struct necp_fd_data *fd_data;
@@ -838,24 +834,21 @@ SECURITY_READ_ONLY_EARLY(struct filterops) necp_fd_rfiltops = {
 
 static int
 necpop_kqfilter(struct fileproc *fp, struct knote *kn,
-    __unused struct kevent_internal_s *kev, vfs_context_t ctx)
+    __unused struct kevent_qos_s *kev)
 {
-#pragma unused(fp, ctx)
        struct necp_fd_data *fd_data = NULL;
        int revents;
 
        if (kn->kn_filter != EVFILT_READ) {
                NECPLOG(LOG_ERR, "bad filter request %d", kn->kn_filter);
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = EINVAL;
+               knote_set_error(kn, EINVAL);
                return 0;
        }
 
-       fd_data = (struct necp_fd_data *)kn->kn_fp->f_fglob->fg_data;
+       fd_data = (struct necp_fd_data *)fp->f_fglob->fg_data;
        if (fd_data == NULL) {
                NECPLOG0(LOG_ERR, "No channel for kqfilter");
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = ENOENT;
+               knote_set_error(kn, ENOENT);
                return 0;
        }
 
@@ -908,7 +901,7 @@ necp_defunct_flow_registration(struct necp_client *client,
                                            flow_registration->registration_id));
                                        flow_defunct->proc_pid = client->proc_pid;
                                        flow_defunct->agent_handle = client->agent_handle;
-
+                                       flow_defunct->flags = flow_registration->flags;
                                        // Add to the list provided by caller
                                        LIST_INSERT_HEAD(defunct_list, flow_defunct, chain);
                                }
@@ -958,33 +951,35 @@ necp_client_retain_locked(struct necp_client *client)
 {
        NECP_CLIENT_ASSERT_LOCKED(client);
 
-       client->reference_count++;
-       ASSERT(client->reference_count != 0);
+       os_ref_retain_locked(&client->reference_count);
 }
 
-static void
-necp_client_retain(struct necp_client *client)
-{
-       NECP_CLIENT_LOCK(client);
-       necp_client_retain_locked(client);
-       NECP_CLIENT_UNLOCK(client);
-}
 
 static bool
 necp_client_release_locked(struct necp_client *client)
 {
        NECP_CLIENT_ASSERT_LOCKED(client);
 
-       uint32_t old_ref = client->reference_count;
-
-       ASSERT(client->reference_count != 0);
-       if (--client->reference_count == 0) {
+       os_ref_count_t count = os_ref_release_locked(&client->reference_count);
+       if (count == 0) {
                necp_client_free(client);
        }
 
-       return old_ref == 1;
+       return count == 0;
 }
 
+static bool
+necp_client_release(struct necp_client *client)
+{
+       bool last_ref;
+
+       NECP_CLIENT_LOCK(client);
+       if (!(last_ref = necp_client_release_locked(client))) {
+               NECP_CLIENT_UNLOCK(client);
+       }
+
+       return last_ref;
+}
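
This hunk replaces the hand-rolled reference_count++/-- and ASSERTs with the os_refcnt API (declared via os_refgrp_decl above), which centralizes overflow and underflow checks. A minimal userspace analogue of the retain/release pattern using C11 atomics (demo_* names are illustrative, not the kernel API):

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

typedef atomic_uint demo_refcnt_t;

static void
demo_ref_init(demo_refcnt_t *ref)
{
	atomic_init(ref, 1);
}

static void
demo_ref_retain(demo_refcnt_t *ref)
{
	unsigned old = atomic_fetch_add(ref, 1);
	assert(old != 0);       // retaining a dead object is use-after-free
}

// Returns true when the caller dropped the last reference and must
// free the object, mirroring necp_client_release_locked().
static bool
demo_ref_release(demo_refcnt_t *ref)
{
	unsigned old = atomic_fetch_sub(ref, 1);
	assert(old != 0);       // underflow check, as os_ref_release does
	return old == 1;
}
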
 
 static void
 necp_client_update_observer_add_internal(struct necp_fd_data *observer_fd, struct necp_client *client)
@@ -1127,6 +1122,9 @@ necp_destroy_client_flow_registration(struct necp_client *client,
 {
        NECP_CLIENT_ASSERT_LOCKED(client);
 
+       bool has_close_parameters = false;
+       struct necp_client_agent_parameters close_parameters = {};
+       memset(close_parameters.u.close_token, 0, sizeof(close_parameters.u.close_token));
 
        struct necp_client_flow *search_flow = NULL;
        struct necp_client_flow *temp_flow = NULL;
@@ -1134,15 +1132,23 @@ necp_destroy_client_flow_registration(struct necp_client *client,
                if (search_flow->nexus &&
                    !uuid_is_null(search_flow->u.nexus_agent)) {
                        // Note that if we had defuncted the client earlier, this would result in a harmless ENOENT
-                       int netagent_error = netagent_client_message(search_flow->u.nexus_agent,
+                       u_int8_t message_type = (abort ? NETAGENT_MESSAGE_TYPE_ABORT_NEXUS :
+                           NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS);
+                       if (((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_BROWSE) ||
+                           (flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_RESOLVE)) &&
+                           !(flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS)) {
+                               message_type = NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT;
+                       }
+                       int netagent_error = netagent_client_message_with_params(search_flow->u.nexus_agent,
                            ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ?
                            client->client_id :
                            flow_registration->registration_id),
                            pid, client->agent_handle,
-                           (abort ? NETAGENT_MESSAGE_TYPE_ABORT_NEXUS :
-                           NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS));
+                           message_type,
+                           has_close_parameters ? &close_parameters : NULL,
+                           NULL, 0);
                        if (netagent_error != 0 && netagent_error != ENOENT) {
-                               NECPLOG(LOG_ERR, "necp_client_remove close nexus error (%d)", netagent_error);
+                               NECPLOG(LOG_ERR, "necp_client_remove close nexus error (%d) MESSAGE TYPE %u", netagent_error, message_type);
                        }
                        uuid_clear(search_flow->u.nexus_agent);
                }
@@ -1189,6 +1195,7 @@ necp_destroy_client(struct necp_client *client, pid_t pid, bool abort)
                necp_destroy_client_flow_registration(client, flow_registration, pid, abort);
        }
 
+
        // Remove agent assertions
        struct necp_client_assertion *search_assertion = NULL;
        struct necp_client_assertion *temp_assertion = NULL;
@@ -1439,6 +1446,8 @@ necp_client_add_interface_option_if_needed(struct necp_client *client,
                option->interface_generation = interface_generation;
                if (nexus_agent != NULL) {
                        uuid_copy(option->nexus_agent, *nexus_agent);
+               } else {
+                       uuid_clear(option->nexus_agent);
                }
                client->interface_option_count++;
        } else {
@@ -1452,6 +1461,8 @@ necp_client_add_interface_option_if_needed(struct necp_client *client,
                        option->interface_generation = interface_generation;
                        if (nexus_agent != NULL) {
                                uuid_copy(option->nexus_agent, *nexus_agent);
+                       } else {
+                               uuid_clear(option->nexus_agent);
                        }
                        client->interface_option_count++;
                }
@@ -1468,9 +1479,10 @@ necp_client_flow_is_viable(proc_t proc, struct necp_client *client,
        flow->necp_flow_flags = 0;
        int error = necp_application_find_policy_match_internal(proc, client->parameters,
            (u_int32_t)client->parameters_length,
-           &result, &flow->necp_flow_flags,
+           &result, &flow->necp_flow_flags, NULL,
            flow->interface_index,
-           &flow->local_addr, &flow->remote_addr, NULL, ignore_address);
+           &flow->local_addr, &flow->remote_addr, NULL, NULL,
+           NULL, ignore_address, true);
 
        return error == 0 &&
               result.routed_interface_index != IFSCOPE_NONE &&
@@ -1547,14 +1559,14 @@ necp_client_update_flows(proc_t proc,
 
                        if (flow->viable && client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) {
                                bool flow_viable = flow->viable;
-                               flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_VIABLE, flow->interface_index, flow->necp_flow_flags, &viable);
+                               flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_VIABLE, flow->interface_index, flow->necp_flow_flags, &flow_viable);
                                flow->viable = flow_viable;
                        }
 
                        if (!flow->viable || flow->invalid) {
                                if (client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) {
                                        bool flow_viable = flow->viable;
-                                       flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_NONVIABLE, flow->interface_index, flow->necp_flow_flags, &viable);
+                                       flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_NONVIABLE, flow->interface_index, flow->necp_flow_flags, &flow_viable);
                                        flow->viable = flow_viable;
                                }
                                // The callback might change the viable-flag of the
@@ -1619,12 +1631,44 @@ necp_netagent_applies_to_client(struct necp_client *client,
                return applies;
        }
 
-       if (!allow_nexus &&
-           (flags & NETAGENT_FLAG_NEXUS_PROVIDER)) {
-               // Hide nexus providers unless allowed
-               // Direct interfaces and direct policies are allowed to use a nexus
-               // Delegate interfaces or re-scoped interfaces are not allowed
-               return applies;
+       const bool is_nexus_agent = ((flags & NETAGENT_FLAG_NEXUS_PROVIDER) ||
+           (flags & NETAGENT_FLAG_NEXUS_LISTENER) ||
+           (flags & NETAGENT_FLAG_CUSTOM_ETHER_NEXUS) ||
+           (flags & NETAGENT_FLAG_CUSTOM_IP_NEXUS) ||
+           (flags & NETAGENT_FLAG_INTERPOSE_NEXUS));
+       if (is_nexus_agent) {
+               if (!allow_nexus) {
+                       // Hide nexus providers unless allowed
+                       // Direct interfaces and direct policies are allowed to use a nexus
+                       // Delegate interfaces or re-scoped interfaces are not allowed
+                       return applies;
+               }
+
+               if ((parameters->flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_ETHER) &&
+                   !(flags & NETAGENT_FLAG_CUSTOM_ETHER_NEXUS)) {
+                       // Client requested a custom ether nexus, but this nexus isn't one
+                       return applies;
+               }
+
+               if ((parameters->flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_IP) &&
+                   !(flags & NETAGENT_FLAG_CUSTOM_IP_NEXUS)) {
+                       // Client requested a custom IP nexus, but this nexus isn't one
+                       return applies;
+               }
+
+               if ((parameters->flags & NECP_CLIENT_PARAMETER_FLAG_INTERPOSE) &&
+                   !(flags & NETAGENT_FLAG_INTERPOSE_NEXUS)) {
+                       // Client requested an interpose nexus, but this nexus isn't one
+                       return applies;
+               }
+
+               if (!(parameters->flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_ETHER) &&
+                   !(parameters->flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_IP) &&
+                   !(parameters->flags & NECP_CLIENT_PARAMETER_FLAG_INTERPOSE) &&
+                   !(flags & NETAGENT_FLAG_NEXUS_PROVIDER)) {
+                       // Client requested default parameters, but this nexus isn't generic
+                       return applies;
+               }
        }
 
        if (uuid_compare(client->failed_trigger_agent.netagent_uuid, *netagent_uuid) == 0) {
@@ -1643,7 +1687,7 @@ necp_netagent_applies_to_client(struct necp_client *client,
                bool required = FALSE;
                if (parameters != NULL) {
                        // Check required agent UUIDs
-                       for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                       for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                                if (uuid_is_null(parameters->required_netagents[i])) {
                                        break;
                                }
@@ -1661,7 +1705,7 @@ necp_netagent_applies_to_client(struct necp_client *client,
                                memset(&netagent_domain, 0, NETAGENT_DOMAINSIZE);
                                memset(&netagent_type, 0, NETAGENT_TYPESIZE);
 
-                               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                               for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                                        if (strlen(parameters->required_netagent_types[i].netagent_domain) == 0 ||
                                            strlen(parameters->required_netagent_types[i].netagent_type) == 0) {
                                                break;
@@ -1724,6 +1768,23 @@ necp_client_address_is_valid(struct sockaddr *address)
        }
 }
 
+static inline bool
+necp_client_endpoint_is_unspecified(struct necp_client_endpoint *endpoint)
+{
+       if (necp_client_address_is_valid(&endpoint->u.sa)) {
+               if (endpoint->u.sa.sa_family == AF_INET) {
+                       return endpoint->u.sin.sin_addr.s_addr == INADDR_ANY;
+               } else if (endpoint->u.sa.sa_family == AF_INET6) {
+                       return IN6_IS_ADDR_UNSPECIFIED(&endpoint->u.sin6.sin6_addr);
+               } else {
+                       return TRUE;
+               }
+       } else {
+               return TRUE;
+       }
+}
+
+
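
necp_client_endpoint_is_unspecified treats a valid endpoint as unspecified only when it carries the IPv4 or IPv6 wildcard address; unknown families default to unspecified. The same checks in a self-contained userspace form (demo name hypothetical):

#include <netinet/in.h>
#include <stdbool.h>
#include <sys/socket.h>

// Returns true when sa is a wildcard (unspecified) IPv4/IPv6 address;
// unknown families are treated as unspecified, as in the kernel code.
static bool
demo_endpoint_is_unspecified(const struct sockaddr *sa)
{
	switch (sa->sa_family) {
	case AF_INET: {
		const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
		return sin->sin_addr.s_addr == INADDR_ANY;
	}
	case AF_INET6: {
		const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa;
		return IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr);
	}
	default:
		return true;
	}
}
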
 static int
 necp_client_parse_parameters(u_int8_t *parameters,
     u_int32_t parameters_size,
@@ -1742,6 +1803,11 @@ necp_client_parse_parameters(u_int8_t *parameters,
        u_int32_t num_prohibited_agent_types = 0;
        u_int32_t num_preferred_agent_types = 0;
        u_int32_t num_avoided_agent_types = 0;
+       u_int8_t *resolver_tag = NULL;
+       u_int32_t resolver_tag_length = 0;
+       u_int8_t *client_hostname = NULL;
+       u_int32_t hostname_length = 0;
+       uuid_t parent_id = {};
 
        if (parsed_parameters == NULL) {
                return EINVAL;
@@ -1830,7 +1896,7 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_PROHIBIT_INTERFACE: {
-                                       if (num_prohibited_interfaces >= NECP_MAX_PARSED_PARAMETERS) {
+                                       if (num_prohibited_interfaces >= NECP_MAX_INTERFACE_PARAMETERS) {
                                                break;
                                        }
                                        if (length <= IFXNAMSIZ && length > 0) {
@@ -1854,7 +1920,7 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_PROHIBIT_IF_TYPE: {
-                                       if (num_prohibited_interface_types >= NECP_MAX_PARSED_PARAMETERS) {
+                                       if (num_prohibited_interface_types >= NECP_MAX_INTERFACE_PARAMETERS) {
                                                break;
                                        }
                                        if (length >= sizeof(u_int8_t)) {
@@ -1865,7 +1931,7 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_REQUIRE_AGENT: {
-                                       if (num_required_agents >= NECP_MAX_PARSED_PARAMETERS) {
+                                       if (num_required_agents >= NECP_MAX_AGENT_PARAMETERS) {
                                                break;
                                        }
                                        if (length >= sizeof(uuid_t)) {
@@ -1876,7 +1942,7 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_PROHIBIT_AGENT: {
-                                       if (num_prohibited_agents >= NECP_MAX_PARSED_PARAMETERS) {
+                                       if (num_prohibited_agents >= NECP_MAX_AGENT_PARAMETERS) {
                                                break;
                                        }
                                        if (length >= sizeof(uuid_t)) {
@@ -1887,7 +1953,7 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_PREFER_AGENT: {
-                                       if (num_preferred_agents >= NECP_MAX_PARSED_PARAMETERS) {
+                                       if (num_preferred_agents >= NECP_MAX_AGENT_PARAMETERS) {
                                                break;
                                        }
                                        if (length >= sizeof(uuid_t)) {
@@ -1898,7 +1964,7 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_AVOID_AGENT: {
-                                       if (num_avoided_agents >= NECP_MAX_PARSED_PARAMETERS) {
+                                       if (num_avoided_agents >= NECP_MAX_AGENT_PARAMETERS) {
                                                break;
                                        }
                                        if (length >= sizeof(uuid_t)) {
@@ -1909,7 +1975,7 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE: {
-                                       if (num_required_agent_types >= NECP_MAX_PARSED_PARAMETERS) {
+                                       if (num_required_agent_types >= NECP_MAX_AGENT_PARAMETERS) {
                                                break;
                                        }
                                        if (length >= sizeof(struct necp_client_parameter_netagent_type)) {
@@ -1920,7 +1986,7 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_PROHIBIT_AGENT_TYPE: {
-                                       if (num_prohibited_agent_types >= NECP_MAX_PARSED_PARAMETERS) {
+                                       if (num_prohibited_agent_types >= NECP_MAX_AGENT_PARAMETERS) {
                                                break;
                                        }
                                        if (length >= sizeof(struct necp_client_parameter_netagent_type)) {
@@ -1931,7 +1997,7 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_PREFER_AGENT_TYPE: {
-                                       if (num_preferred_agent_types >= NECP_MAX_PARSED_PARAMETERS) {
+                                       if (num_preferred_agent_types >= NECP_MAX_AGENT_PARAMETERS) {
                                                break;
                                        }
                                        if (length >= sizeof(struct necp_client_parameter_netagent_type)) {
@@ -1942,7 +2008,7 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_AVOID_AGENT_TYPE: {
-                                       if (num_avoided_agent_types >= NECP_MAX_PARSED_PARAMETERS) {
+                                       if (num_avoided_agent_types >= NECP_MAX_AGENT_PARAMETERS) {
                                                break;
                                        }
                                        if (length >= sizeof(struct necp_client_parameter_netagent_type)) {
@@ -1960,12 +2026,24 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        break;
                                }
                                case NECP_CLIENT_PARAMETER_IP_PROTOCOL: {
-                                       if (length >= sizeof(parsed_parameters->ip_protocol)) {
+                                       if (length == sizeof(u_int16_t)) {
+                                               u_int16_t large_ip_protocol = 0;
+                                               memcpy(&large_ip_protocol, value, sizeof(large_ip_protocol));
+                                               parsed_parameters->ip_protocol = (u_int8_t)large_ip_protocol;
+                                               parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_IP_PROTOCOL;
+                                       } else if (length >= sizeof(parsed_parameters->ip_protocol)) {
                                                memcpy(&parsed_parameters->ip_protocol, value, sizeof(parsed_parameters->ip_protocol));
                                                parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_IP_PROTOCOL;
                                        }
                                        break;
                                }
+                               case NECP_CLIENT_PARAMETER_TRANSPORT_PROTOCOL: {
+                                       if (length >= sizeof(parsed_parameters->transport_protocol)) {
+                                               memcpy(&parsed_parameters->transport_protocol, value, sizeof(parsed_parameters->transport_protocol));
+                                               parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_TRANSPORT_PROTOCOL;
+                                       }
+                                       break;
+                               }
                                case NECP_CLIENT_PARAMETER_PID: {
                                        if (length >= sizeof(parsed_parameters->effective_pid)) {
                                                memcpy(&parsed_parameters->effective_pid, value, sizeof(parsed_parameters->effective_pid));
@@ -1973,6 +2051,20 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        }
                                        break;
                                }
+                               case NECP_CLIENT_PARAMETER_DELEGATED_UPID: {
+                                       if (length >= sizeof(parsed_parameters->delegated_upid)) {
+                                               memcpy(&parsed_parameters->delegated_upid, value, sizeof(parsed_parameters->delegated_upid));
+                                               parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_DELEGATED_UPID;
+                                       }
+                                       break;
+                               }
+                               case NECP_CLIENT_PARAMETER_ETHERTYPE: {
+                                       if (length >= sizeof(parsed_parameters->ethertype)) {
+                                               memcpy(&parsed_parameters->ethertype, value, sizeof(parsed_parameters->ethertype));
+                                               parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_ETHERTYPE;
+                                       }
+                                       break;
+                               }
                                case NECP_CLIENT_PARAMETER_APPLICATION: {
                                        if (length >= sizeof(parsed_parameters->effective_uuid)) {
                                                memcpy(&parsed_parameters->effective_uuid, value, sizeof(parsed_parameters->effective_uuid));
@@ -1987,6 +2079,33 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                        }
                                        break;
                                }
+                               case NECP_CLIENT_PARAMETER_RESOLVER_TAG: {
+                                       if (length > 0) {
+                                               resolver_tag = (u_int8_t *)value;
+                                               resolver_tag_length = length;
+                                       }
+                                       break;
+                               }
+                               case NECP_CLIENT_PARAMETER_DOMAIN: {
+                                       if (length > 0) {
+                                               client_hostname = (u_int8_t *)value;
+                                               hostname_length = length;
+                                       }
+                                       break;
+                               }
+                               case NECP_CLIENT_PARAMETER_PARENT_ID: {
+                                       if (length == sizeof(parent_id)) {
+                                               uuid_copy(parent_id, value);
+                                       }
+                                       break;
+                               }
+                               case NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE: {
+                                       if (length >= sizeof(parsed_parameters->local_address_preference)) {
+                                               memcpy(&parsed_parameters->local_address_preference, value, sizeof(parsed_parameters->local_address_preference));
+                                               parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR_PREFERENCE;
+                                       }
+                                       break;
+                               }
                                default: {
                                        break;
                                }
@@ -1997,6 +2116,20 @@ necp_client_parse_parameters(u_int8_t *parameters,
                offset += sizeof(struct necp_tlv_header) + length;
        }
 
+       if (resolver_tag != NULL) {
+               union necp_sockaddr_union remote_addr;
+               memcpy(&remote_addr, &parsed_parameters->remote_addr, sizeof(remote_addr));
+               remote_addr.sin.sin_port = 0;
+               const bool validated = necp_validate_resolver_answer(parent_id,
+                   client_hostname, hostname_length,
+                   (u_int8_t *)&remote_addr, sizeof(remote_addr),
+                   resolver_tag, resolver_tag_length);
+               if (!validated) {
+                       error = EAUTH;
+                       NECPLOG(LOG_ERR, "Failed to validate answer for hostname %s", client_hostname);
+               }
+       }
+
        return error;
 }
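
The parse loop above is a defensive TLV walk: every case bounds-checks the received length before copying into the fixed-size parsed structure, unknown types are skipped, and the IP_PROTOCOL case additionally accepts a legacy 16-bit encoding and narrows it to the new u_int8_t field. A condensed sketch of that pattern with a hypothetical header layout (struct demo_tlv_header stands in for struct necp_tlv_header):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Hypothetical wire header; NECP's real one is struct necp_tlv_header.
struct demo_tlv_header {
	uint8_t  type;
	uint32_t length;        // bytes of value that follow
} __attribute__((packed));

#define DEMO_PARAM_IP_PROTOCOL  1
#define DEMO_FIELD_IP_PROTOCOL  0x1

struct demo_parsed {
	uint8_t  ip_protocol;
	uint32_t valid_fields;
};

static int
demo_parse(const uint8_t *buf, size_t size, struct demo_parsed *out)
{
	size_t offset = 0;
	while (offset + sizeof(struct demo_tlv_header) <= size) {
		struct demo_tlv_header hdr;
		memcpy(&hdr, buf + offset, sizeof(hdr));
		if (hdr.length > size - offset - sizeof(hdr)) {
			return -1;      // value would run past the buffer
		}
		const uint8_t *value = buf + offset + sizeof(hdr);
		switch (hdr.type) {
		case DEMO_PARAM_IP_PROTOCOL:
			if (hdr.length == sizeof(uint16_t)) {
				// Legacy 16-bit encoding: narrow to 8 bits.
				uint16_t wide = 0;
				memcpy(&wide, value, sizeof(wide));
				out->ip_protocol = (uint8_t)wide;
				out->valid_fields |= DEMO_FIELD_IP_PROTOCOL;
			} else if (hdr.length >= sizeof(out->ip_protocol)) {
				memcpy(&out->ip_protocol, value,
				    sizeof(out->ip_protocol));
				out->valid_fields |= DEMO_FIELD_IP_PROTOCOL;
			}
			break;
		default:
			break;          // unknown parameters are ignored
		}
		offset += sizeof(hdr) + hdr.length;
	}
	return 0;
}
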
 
@@ -2108,8 +2241,8 @@ necp_client_add_socket_flow(struct necp_client_flow_registration *flow_registrat
        LIST_INSERT_HEAD(&flow_registration->flow_list, new_flow, flow_chain);
 }
 
-int
-necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp)
+static int
+necp_client_register_socket_inner(pid_t pid, uuid_t client_id, struct inpcb *inp, bool is_listener)
 {
        int error = 0;
        struct necp_fd_data *client_fd = NULL;
@@ -2121,20 +2254,25 @@ necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp)
                struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id);
                if (client != NULL) {
                        if (!pid || client->proc_pid == pid) {
-                               struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id);
-                               if (flow_registration != NULL) {
-                                       // Found the right client and flow registration, add a new flow
+                               if (is_listener) {
                                        found_client = TRUE;
-                                       necp_client_add_socket_flow(flow_registration, inp);
-                               } else if (RB_EMPTY(&client->flow_registrations) && !necp_client_id_is_flow(client_id)) {
-                                       // No flows yet on this client, add a new registration
-                                       flow_registration = necp_client_create_flow_registration(client_fd, client);
-                                       if (flow_registration == NULL) {
-                                               error = ENOMEM;
-                                       } else {
-                                               // Add a new flow
+                               } else {
+                                       // Find client flow and assign from socket
+                                       struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id);
+                                       if (flow_registration != NULL) {
+                                               // Found the right client and flow registration, add a new flow
                                                found_client = TRUE;
                                                necp_client_add_socket_flow(flow_registration, inp);
+                                       } else if (RB_EMPTY(&client->flow_registrations) && !necp_client_id_is_flow(client_id)) {
+                                               // No flows yet on this client, add a new registration
+                                               flow_registration = necp_client_create_flow_registration(client_fd, client);
+                                               if (flow_registration == NULL) {
+                                                       error = ENOMEM;
+                                               } else {
+                                                       // Add a new flow
+                                                       found_client = TRUE;
+                                                       necp_client_add_socket_flow(flow_registration, inp);
+                                               }
                                        }
                                }
                        }
@@ -2163,6 +2301,19 @@ necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp)
        return error;
 }
 
+int
+necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp)
+{
+       return necp_client_register_socket_inner(pid, client_id, inp, false);
+}
+
+int
+necp_client_register_socket_listener(pid_t pid, uuid_t client_id, struct inpcb *inp)
+{
+       return necp_client_register_socket_inner(pid, client_id, inp, true);
+}
+
+
 static void
 necp_client_add_multipath_interface_flows(struct necp_client_flow_registration *flow_registration,
     struct necp_client *client,
@@ -2250,7 +2401,7 @@ necp_client_lookup_bb_radio_manager(struct necp_client *client,
        }
 
        error = necp_application_find_policy_match_internal(proc, client->parameters, (u_int32_t)client->parameters_length,
-           &result, NULL, 0, NULL, NULL, NULL, true);
+           &result, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, true, true);
 
        proc_rele(proc);
        proc = PROC_NULL;
@@ -2530,7 +2681,7 @@ necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp)
                                                flow->assigned_results = necp_create_nexus_assign_message(empty_uuid, 0, NULL, 0,
                                                    (struct necp_client_endpoint *)&flow->local_addr,
                                                    (struct necp_client_endpoint *)&flow->remote_addr,
-                                                   0, NULL, &flow->assigned_results_length);
+                                                   NULL, 0, NULL, &flow->assigned_results_length);
                                                flow_registration->flow_result_read = FALSE;
                                                client_updated = TRUE;
                                                break;
@@ -2565,6 +2716,57 @@ necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp)
        return error;
 }
 
+bool
+necp_socket_is_allowed_to_recv_on_interface(struct inpcb *inp, ifnet_t interface)
+{
+       if (interface == NULL ||
+           inp == NULL ||
+           !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
+           uuid_is_null(inp->necp_client_uuid)) {
+               // If there's no interface or client ID to check,
+               // or if this is not a listener, pass.
+               // Outbound connections will have already been
+               // validated for policy.
+               return TRUE;
+       }
+
+       // Only filter out listener sockets (no remote address specified)
+       if ((inp->inp_vflag & INP_IPV4) &&
+           inp->inp_faddr.s_addr != INADDR_ANY) {
+               return TRUE;
+       }
+       if ((inp->inp_vflag & INP_IPV6) &&
+           !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
+               return TRUE;
+       }
+
+       bool allowed = TRUE;
+
+       NECP_CLIENT_TREE_LOCK_SHARED();
+
+       struct necp_client *client = necp_find_client_and_lock(inp->necp_client_uuid);
+       if (client != NULL) {
+               struct necp_client_parsed_parameters *parsed_parameters = NULL;
+
+               MALLOC(parsed_parameters, struct necp_client_parsed_parameters *, sizeof(*parsed_parameters), M_NECP, (M_WAITOK | M_ZERO));
+               if (parsed_parameters != NULL) {
+                       int error = necp_client_parse_parameters(client->parameters, (u_int32_t)client->parameters_length, parsed_parameters);
+                       if (error == 0) {
+                               if (!necp_ifnet_matches_parameters(interface, parsed_parameters, 0, NULL, true, false)) {
+                                       allowed = FALSE;
+                               }
+                       }
+                       FREE(parsed_parameters, M_NECP);
+               }
+
+               NECP_CLIENT_UNLOCK(client);
+       }
+
+       NECP_CLIENT_TREE_UNLOCK();
+
+       return allowed;
+}
+
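
The filter above applies only to listeners: any socket with a non-wildcard remote address already had its outbound path evaluated against policy, so it passes. In userspace, roughly the same "is this a listener or unconnected socket?" question can be asked of a descriptor (an approximation, not the in-kernel inpcb test):

#include <errno.h>
#include <stdbool.h>
#include <sys/socket.h>

// Userspace approximation: a socket with no peer address is a
// listener/unconnected socket, mirroring the wildcard-faddr test.
static bool
demo_socket_is_listener(int fd)
{
	struct sockaddr_storage ss;
	socklen_t len = sizeof(ss);
	return getpeername(fd, (struct sockaddr *)&ss, &len) == -1 &&
	    errno == ENOTCONN;
}
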
 int
 necp_update_flow_protoctl_event(uuid_t netagent_uuid, uuid_t client_id,
     uint32_t protoctl_event_code, uint32_t protoctl_event_val,
@@ -2765,7 +2967,7 @@ necp_update_parsed_parameters(struct necp_client_parsed_parameters *parsed_param
                // This is a scoped agent. Add it to the required agents.
                if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) {
                        // Already some required agents, add this at the end
-                       for (int j = 0; j < NECP_MAX_PARSED_PARAMETERS; j++) {
+                       for (int j = 0; j < NECP_MAX_AGENT_PARAMETERS; j++) {
                                if (uuid_compare(parsed_parameters->required_netagents[j], result->netagents[i]) == 0) {
                                        // Already required, break
                                        break;
@@ -2789,7 +2991,7 @@ necp_update_parsed_parameters(struct necp_client_parsed_parameters *parsed_param
                        char remove_agent_domain[NETAGENT_DOMAINSIZE] = { 0 };
                        char remove_agent_type[NETAGENT_TYPESIZE] = { 0 };
                        if (netagent_get_agent_domain_and_type(result->netagents[i], remove_agent_domain, remove_agent_type)) {
-                               for (int j = 0; j < NECP_MAX_PARSED_PARAMETERS; j++) {
+                               for (int j = 0; j < NECP_MAX_AGENT_PARAMETERS; j++) {
                                        if (strlen(parsed_parameters->required_netagent_types[j].netagent_domain) == 0 &&
                                            strlen(parsed_parameters->required_netagent_types[j].netagent_type) == 0) {
                                                break;
@@ -2799,16 +3001,16 @@ necp_update_parsed_parameters(struct necp_client_parsed_parameters *parsed_param
                                            strncmp(parsed_parameters->required_netagent_types[j].netagent_type, remove_agent_type, NETAGENT_TYPESIZE) == 0) {
                                                updated = true;
 
-                                               if (j == NECP_MAX_PARSED_PARAMETERS - 1) {
+                                               if (j == NECP_MAX_AGENT_PARAMETERS - 1) {
                                                        // Last field, just clear and break
-                                                       memset(&parsed_parameters->required_netagent_types[NECP_MAX_PARSED_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type));
+                                                       memset(&parsed_parameters->required_netagent_types[NECP_MAX_AGENT_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type));
                                                        break;
                                                } else {
                                                        // Move the parameters down, clear the last entry
                                                        memmove(&parsed_parameters->required_netagent_types[j],
                                                            &parsed_parameters->required_netagent_types[j + 1],
-                                                           sizeof(struct necp_client_parameter_netagent_type) * (NECP_MAX_PARSED_PARAMETERS - (j + 1)));
-                                                       memset(&parsed_parameters->required_netagent_types[NECP_MAX_PARSED_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type));
+                                                           sizeof(struct necp_client_parameter_netagent_type) * (NECP_MAX_AGENT_PARAMETERS - (j + 1)));
+                                                       memset(&parsed_parameters->required_netagent_types[NECP_MAX_AGENT_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type));
                                                        // Continue, don't increment but look at the new shifted item instead
                                                        continue;
                                                }
@@ -2847,7 +3049,10 @@ necp_calculate_client_result(proc_t proc,
     struct necp_client *client,
     struct necp_client_parsed_parameters *parsed_parameters,
     struct necp_aggregate_result *result,
-    u_int32_t *flags)
+    u_int32_t *flags,
+    u_int32_t *reason,
+    struct necp_client_endpoint *v4_gateway,
+    struct necp_client_endpoint *v6_gateway)
 {
        struct rtentry *route = NULL;
 
@@ -2862,8 +3067,10 @@ necp_calculate_client_result(proc_t proc,
                memset(result, 0, sizeof(*result));
                int error = necp_application_find_policy_match_internal(proc, client->parameters,
                    (u_int32_t)client->parameters_length,
-                   result, flags, matching_if_index,
-                   NULL, NULL, &route, false);
+                   result, flags, reason, matching_if_index,
+                   NULL, NULL,
+                   v4_gateway, v6_gateway,
+                   &route, false, true);
                if (error != 0) {
                        if (route != NULL) {
                                rtfree(route);
@@ -2874,7 +3081,7 @@ necp_calculate_client_result(proc_t proc,
                if (validate_agents) {
                        bool requirement_failed = FALSE;
                        if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) {
-                               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                               for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                                        if (uuid_is_null(parsed_parameters->required_netagents[i])) {
                                                break;
                                        }
@@ -2899,7 +3106,7 @@ necp_calculate_client_result(proc_t proc,
                        }
 
                        if (!requirement_failed && parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE) {
-                               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                               for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                                        if (strlen(parsed_parameters->required_netagent_types[i].netagent_domain) == 0 &&
                                            strlen(parsed_parameters->required_netagent_types[i].netagent_type) == 0) {
                                                break;
@@ -2956,6 +3163,11 @@ necp_calculate_client_result(proc_t proc,
        return TRUE;
 }
 
+#define NECP_PARSED_PARAMETERS_REQUIRED_FIELDS (NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF |        \
+                                                NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE |    \
+                                                NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT |     \
+                                                NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE)
+
 static bool
 necp_update_client_result(proc_t proc,
     struct necp_fd_data *client_fd,
@@ -2966,6 +3178,7 @@ necp_update_client_result(proc_t proc,
        struct necp_aggregate_result result;
        struct necp_client_parsed_parameters *parsed_parameters = NULL;
        u_int32_t flags = 0;
+       u_int32_t reason = 0;
 
        NECP_CLIENT_ASSERT_LOCKED(client);
 
@@ -2988,19 +3201,28 @@ necp_update_client_result(proc_t proc,
        client->ip_protocol = parsed_parameters->ip_protocol;
 
        // Calculate the policy result
-       if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags)) {
+       struct necp_client_endpoint v4_gateway = {};
+       struct necp_client_endpoint v6_gateway = {};
+       if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags, &reason, &v4_gateway, &v6_gateway)) {
                FREE(parsed_parameters, M_NECP);
                return FALSE;
        }
 
        if (necp_update_parsed_parameters(parsed_parameters, &result)) {
                // Changed the parameters based on result, try again (only once)
-               if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags)) {
+               if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags, &reason, &v4_gateway, &v6_gateway)) {
                        FREE(parsed_parameters, M_NECP);
                        return FALSE;
                }
        }
 
+       if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) &&
+           parsed_parameters->required_interface_index != IFSCOPE_NONE &&
+           (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF) == 0) {
+               // Listener should not apply a required interface index that was not explicitly required
+               parsed_parameters->required_interface_index = IFSCOPE_NONE;
+       }
+
        // Save the last policy id on the client
        client->policy_id = result.policy_id;
 
@@ -3041,6 +3263,9 @@ necp_update_client_result(proc_t proc,
        bool updated = FALSE;
        u_int8_t *cursor = client->result;
        cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_FLAGS, sizeof(flags), &flags, &updated, client->result, sizeof(client->result));
+       if (reason != 0) {
+               cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_REASON, sizeof(reason), &reason, &updated, client->result, sizeof(client->result));
+       }
        cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_CLIENT_ID, sizeof(uuid_t), client->client_id, &updated,
            client->result, sizeof(client->result));
        cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_POLICY_RESULT, sizeof(result.routing_result), &result.routing_result, &updated,
@@ -3058,6 +3283,7 @@ necp_update_client_result(proc_t proc,
        if (result.routed_interface_index != 0) {
                u_int routed_interface_index = result.routed_interface_index;
                if (result.routing_result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL &&
+                   (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_REQUIRED_FIELDS) &&
                    parsed_parameters->required_interface_index != IFSCOPE_NONE &&
                    parsed_parameters->required_interface_index != result.routed_interface_index) {
                        routed_interface_index = parsed_parameters->required_interface_index;
@@ -3073,15 +3299,31 @@ necp_update_client_result(proc_t proc,
                    sizeof(effective_traffic_class), &effective_traffic_class, &updated,
                    client->result, sizeof(client->result));
        }
-       if (client->background_update) {
-               u_int32_t background = client->background;
-               cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_TRAFFIC_MGMT_BG,
-                   sizeof(background), &background, &updated,
-                   client->result, sizeof(client->result));
-               if (updated) {
-                       client->background_update = 0;
+
+       if (client_fd->background) {
+               bool has_assigned_flow = FALSE;
+               struct necp_client_flow_registration *flow_registration = NULL;
+               struct necp_client_flow *search_flow = NULL;
+               RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) {
+                       LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) {
+                               if (search_flow->assigned) {
+                                       has_assigned_flow = TRUE;
+                                       break;
+                               }
+                       }
+               }
+
+               if (has_assigned_flow) {
+                       u_int32_t background = client_fd->background;
+                       cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_TRAFFIC_MGMT_BG,
+                           sizeof(background), &background, &updated,
+                           client->result, sizeof(client->result));
                }
        }
+
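The hunk above moves the background marker from each client onto the owning fd and derives the result lazily: before the TRAFFIC_MGMT_BG TLV is written, the client is walked for at least one assigned flow. A minimal user-space sketch of that nested walk, with the RB-tree and list iterators mocked as arrays (all names here are illustrative stand-ins, not the kernel types):

    #include <stdbool.h>
    #include <stddef.h>

    struct flow { bool assigned; };
    struct flow_registration { struct flow *flows; size_t flow_count; };

    // Mirrors the nested RB_FOREACH/LIST_FOREACH scan: succeed as soon
    // as any registered flow has been assigned.
    static bool
    client_has_assigned_flow(const struct flow_registration *regs, size_t reg_count)
    {
            for (size_t r = 0; r < reg_count; r++) {
                    for (size_t f = 0; f < regs[r].flow_count; f++) {
                            if (regs[r].flows[f].assigned) {
                                    return true;
                            }
                    }
            }
            return false;
    }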
+       bool write_v4_gateway = !necp_client_endpoint_is_unspecified(&v4_gateway);
+       bool write_v6_gateway = !necp_client_endpoint_is_unspecified(&v6_gateway);
+
        NECP_CLIENT_ROUTE_LOCK(client);
        if (client->current_route != NULL) {
                const u_int32_t route_mtu = get_maxmtu(client->current_route);
@@ -3090,9 +3332,29 @@ necp_update_client_result(proc_t proc,
                            sizeof(route_mtu), &route_mtu, &updated,
                            client->result, sizeof(client->result));
                }
+               bool has_remote_addr = parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REMOTE_ADDR;
+               if (has_remote_addr && client->current_route->rt_gateway != NULL) {
+                       if (client->current_route->rt_gateway->sa_family == AF_INET) {
+                               write_v6_gateway = false;
+                       } else if (client->current_route->rt_gateway->sa_family == AF_INET6) {
+                               write_v4_gateway = false;
+                       }
+               }
        }
        NECP_CLIENT_ROUTE_UNLOCK(client);
 
+       if (write_v4_gateway) {
+               cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_GATEWAY,
+                   sizeof(struct necp_client_endpoint), &v4_gateway, &updated,
+                   client->result, sizeof(client->result));
+       }
+
+       if (write_v6_gateway) {
+               cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_GATEWAY,
+                   sizeof(struct necp_client_endpoint), &v6_gateway, &updated,
+                   client->result, sizeof(client->result));
+       }
+
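Both gateway endpoints computed by necp_calculate_client_result are emitted under the same NECP_CLIENT_RESULT_GATEWAY type unless the current route's gateway family rules one of them out. The writes go through the write-if-different helper, which only dirties the result (and sets the updated flag) when the bytes actually change. A hedged sketch of that helper's contract, assuming a packed one-byte-type/four-byte-length TLV header (stand-in names, not the kernel's):

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    struct tlv_header {
            uint8_t type;
            uint32_t length;
    } __attribute__((packed));

    // Append a TLV at cursor only if it differs from what is already
    // stored there; buf is assumed zero-initialized before first use.
    static uint8_t *
    write_tlv_if_different(uint8_t *cursor, uint8_t type, uint32_t length,
        const void *value, bool *updated, uint8_t *buf, size_t buf_size)
    {
            size_t needed = sizeof(struct tlv_header) + length;
            if ((size_t)(cursor - buf) + needed > buf_size) {
                    return cursor; // no room; leave the result untouched
            }
            struct tlv_header header = { .type = type, .length = length };
            if (memcmp(cursor, &header, sizeof(header)) != 0 ||
                memcmp(cursor + sizeof(header), value, length) != 0) {
                    memcpy(cursor, &header, sizeof(header));
                    memcpy(cursor + sizeof(header), value, length);
                    *updated = true;
            }
            return cursor + needed;
    }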
        if (result.mss_recommended != 0) {
                cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_RECOMMENDED_MSS,
                    sizeof(result.mss_recommended), &result.mss_recommended, &updated,
@@ -3131,6 +3393,7 @@ necp_update_client_result(proc_t proc,
                delegate_interface = direct_interface->if_delegated.ifp;
        }
        if (result.routing_result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL &&
+           (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_REQUIRED_FIELDS) &&
            parsed_parameters->required_interface_index != IFSCOPE_NONE &&
            parsed_parameters->required_interface_index != result.routing_result_parameter.tunnel_interface_index &&
            parsed_parameters->required_interface_index <= (u_int32_t)if_index) {
@@ -3174,7 +3437,7 @@ necp_update_client_result(proc_t proc,
                // Get multipath interface options from ordered list
                struct ifnet *multi_interface = NULL;
                TAILQ_FOREACH(multi_interface, &ifnet_ordered_head, if_ordered_link) {
-                       if (necp_ifnet_matches_parameters(multi_interface, parsed_parameters, NULL, true)) {
+                       if (necp_ifnet_matches_parameters(multi_interface, parsed_parameters, 0, NULL, true, false)) {
                                // Add multipath interface flows for kernel MPTCP
                                necp_client_add_interface_option_if_needed(client, multi_interface->if_index,
                                    ifnet_get_generation(multi_interface), NULL);
@@ -3183,14 +3446,26 @@ necp_update_client_result(proc_t proc,
                                necp_client_add_agent_interface_options(client, parsed_parameters, multi_interface);
                        }
                }
-       } else if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) &&
-           result.routing_result != NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED) {
-               // Get listener interface options from global list
-               struct ifnet *listen_interface = NULL;
-               TAILQ_FOREACH(listen_interface, &ifnet_head, if_link) {
-                       if (necp_ifnet_matches_parameters(listen_interface, parsed_parameters, NULL, true)) {
+       } else if (parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) {
+               if (result.routing_result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED) {
+                       if (direct_interface != NULL) {
+                               // If scoped, only listen on that interface
                                // Add nexus agents for listeners
-                               necp_client_add_agent_interface_options(client, parsed_parameters, listen_interface);
+                               necp_client_add_agent_interface_options(client, parsed_parameters, direct_interface);
+
+                               // Add interface option in case it is not a nexus
+                               necp_client_add_interface_option_if_needed(client, direct_interface->if_index,
+                                   ifnet_get_generation(direct_interface), NULL);
+                       }
+               } else {
+                       // Get listener interface options from global list
+                       struct ifnet *listen_interface = NULL;
+                       TAILQ_FOREACH(listen_interface, &ifnet_head, if_link) {
+                               if ((listen_interface->if_flags & (IFF_UP | IFF_RUNNING)) &&
+                                   necp_ifnet_matches_parameters(listen_interface, parsed_parameters, 0, NULL, true, false)) {
+                                       // Add nexus agents for listeners
+                                       necp_client_add_agent_interface_options(client, parsed_parameters, listen_interface);
+                               }
                        }
                }
        }
@@ -3305,17 +3580,15 @@ necp_defunct_client_fd_locked(struct necp_fd_data *client_fd, struct _necp_flow_
                        LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) {
                                if (search_flow->nexus &&
                                    !uuid_is_null(search_flow->u.nexus_agent)) {
-                                       struct necp_flow_defunct *flow_defunct;
-
                                        // Sleeping alloc won't fail; copy only what's necessary
-                                       flow_defunct = _MALLOC(sizeof(struct necp_flow_defunct), M_NECP, M_WAITOK | M_ZERO);
+                                       struct necp_flow_defunct *flow_defunct = _MALLOC(sizeof(struct necp_flow_defunct), M_NECP, M_WAITOK | M_ZERO);
                                        uuid_copy(flow_defunct->nexus_agent, search_flow->u.nexus_agent);
                                        uuid_copy(flow_defunct->flow_id, ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ?
                                            client->client_id :
                                            flow_registration->registration_id));
                                        flow_defunct->proc_pid = client->proc_pid;
                                        flow_defunct->agent_handle = client->agent_handle;
-
+                                       flow_defunct->flags = flow_registration->flags;
                                        // Add to the list provided by caller
                                        LIST_INSERT_HEAD(defunct_list, flow_defunct, chain);
 
@@ -3391,11 +3664,19 @@ necp_update_all_clients_callout(__unused thread_call_param_t dummy,
                // For each newly defunct client, send a message to the nexus to remove the flow
                LIST_FOREACH_SAFE(flow_defunct, &defunct_list, chain, temp_flow_defunct) {
                        if (!uuid_is_null(flow_defunct->nexus_agent)) {
-                               int netagent_error = netagent_client_message(flow_defunct->nexus_agent,
+                               u_int8_t message_type = NETAGENT_MESSAGE_TYPE_ABORT_NEXUS;
+                               if (((flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_BROWSE) ||
+                                   (flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_RESOLVE)) &&
+                                   !(flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS)) {
+                                       message_type = NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT;
+                               }
+                               int netagent_error = netagent_client_message_with_params(flow_defunct->nexus_agent,
                                    flow_defunct->flow_id,
                                    flow_defunct->proc_pid,
                                    flow_defunct->agent_handle,
-                                   NETAGENT_MESSAGE_TYPE_ABORT_NEXUS);
+                                   message_type,
+                                   flow_defunct->has_close_parameters ? &flow_defunct->close_parameters : NULL,
+                                   NULL, 0);
                                if (netagent_error != 0) {
                                        char namebuf[MAXCOMLEN + 1];
                                        (void) strlcpy(namebuf, "unknown", sizeof(namebuf));
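The defunct path now distinguishes registrations that only browse or resolve (and never asked for a nexus) from real nexus flows: the former are unasserted from the agent instead of having a nonexistent nexus channel aborted. The selection reduces to a flag test; a sketch below, with illustrative bit values rather than the real NECP_CLIENT_FLOW_FLAGS_* constants:

    #include <stdint.h>

    #define FLOW_FLAG_BROWSE      0x1 // illustrative values only
    #define FLOW_FLAG_RESOLVE     0x2
    #define FLOW_FLAG_ALLOW_NEXUS 0x4

    #define MSG_ABORT_NEXUS       1   // illustrative values only
    #define MSG_CLIENT_UNASSERT   2

    // Browse/resolve-only registrations never opened a nexus channel,
    // so on defunct they are unasserted rather than aborted.
    static uint8_t
    defunct_message_type(uint32_t flags)
    {
            if ((flags & (FLOW_FLAG_BROWSE | FLOW_FLAG_RESOLVE)) &&
                !(flags & FLOW_FLAG_ALLOW_NEXUS)) {
                    return MSG_CLIENT_UNASSERT;
            }
            return MSG_ABORT_NEXUS;
    }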
@@ -3412,6 +3693,12 @@ necp_update_all_clients_callout(__unused thread_call_param_t dummy,
 
 void
 necp_update_all_clients(void)
+{
+       necp_update_all_clients_immediately_if_needed(false);
+}
+
+void
+necp_update_all_clients_immediately_if_needed(bool should_update_immediately)
 {
        if (necp_client_update_tcall == NULL) {
                // Don't try to update clients if the module is not initialized
@@ -3420,72 +3707,51 @@ necp_update_all_clients(void)
 
        uint64_t deadline = 0;
        uint64_t leeway = 0;
-       clock_interval_to_deadline(necp_timeout_microseconds, NSEC_PER_USEC, &deadline);
-       clock_interval_to_absolutetime_interval(necp_timeout_leeway_microseconds, NSEC_PER_USEC, &leeway);
+
+       uint32_t timeout_to_use = necp_timeout_microseconds;
+       uint32_t leeway_to_use = necp_timeout_leeway_microseconds;
+       if (should_update_immediately) {
+               timeout_to_use = 1000 * 10; // 10ms
+               leeway_to_use = 1000 * 10; // 10ms
+       }
+
+       clock_interval_to_deadline(timeout_to_use, NSEC_PER_USEC, &deadline);
+       clock_interval_to_absolutetime_interval(leeway_to_use, NSEC_PER_USEC, &leeway);
 
        thread_call_enter_delayed_with_leeway(necp_client_update_tcall, NULL,
            deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
 }
 
-void
+bool
 necp_set_client_as_background(proc_t proc,
     struct fileproc *fp,
     bool background)
 {
-       bool updated_result = FALSE;
-       struct necp_client *client = NULL;
-
        if (proc == PROC_NULL) {
                NECPLOG0(LOG_ERR, "NULL proc");
-               return;
+               return FALSE;
        }
 
        if (fp == NULL) {
                NECPLOG0(LOG_ERR, "NULL fp");
-               return;
+               return FALSE;
        }
 
        struct necp_fd_data *client_fd = (struct necp_fd_data *)fp->f_fglob->fg_data;
        if (client_fd == NULL) {
                NECPLOG0(LOG_ERR, "Could not find client structure for backgrounded client");
-               return;
+               return FALSE;
        }
 
        if (client_fd->necp_fd_type != necp_fd_type_client) {
                // Not a client fd, ignore
                NECPLOG0(LOG_ERR, "Not a client fd, ignore");
-               return;
+               return FALSE;
        }
 
-       NECP_FD_LOCK(client_fd);
-
-       RB_FOREACH(client, _necp_client_tree, &client_fd->clients) {
-               NECP_CLIENT_LOCK(client);
+       client_fd->background = background;
 
-               bool has_assigned_flow = FALSE;
-               struct necp_client_flow_registration *flow_registration = NULL;
-               struct necp_client_flow *search_flow = NULL;
-               RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) {
-                       LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) {
-                               if (search_flow->assigned) {
-                                       has_assigned_flow = TRUE;
-                                       break;
-                               }
-                       }
-               }
-
-               if (has_assigned_flow) {
-                       client->background = background;
-                       client->background_update = TRUE;
-                       updated_result = TRUE;
-               }
-
-               NECP_CLIENT_UNLOCK(client);
-       }
-       if (updated_result) {
-               necp_update_client_fd_locked(client_fd, proc, NULL);
-       }
-       NECP_FD_UNLOCK(client_fd);
+       return TRUE;
 }
 
 void
@@ -3528,11 +3794,19 @@ necp_fd_defunct(proc_t proc, struct necp_fd_data *client_fd)
                // For each defunct client, remove flow from the nexus
                LIST_FOREACH_SAFE(flow_defunct, &defunct_list, chain, temp_flow_defunct) {
                        if (!uuid_is_null(flow_defunct->nexus_agent)) {
-                               int netagent_error = netagent_client_message(flow_defunct->nexus_agent,
+                               u_int8_t message_type = NETAGENT_MESSAGE_TYPE_ABORT_NEXUS;
+                               if (((flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_BROWSE) ||
+                                   (flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_RESOLVE)) &&
+                                   !(flow_defunct->flags & NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS)) {
+                                       message_type = NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT;
+                               }
+                               int netagent_error = netagent_client_message_with_params(flow_defunct->nexus_agent,
                                    flow_defunct->flow_id,
                                    flow_defunct->proc_pid,
                                    flow_defunct->agent_handle,
-                                   NETAGENT_MESSAGE_TYPE_ABORT_NEXUS);
+                                   message_type,
+                                   flow_defunct->has_close_parameters ? &flow_defunct->close_parameters : NULL,
+                                   NULL, 0);
                                if (netagent_error != 0) {
                                        NECPLOG((netagent_error == ENOENT ? LOG_DEBUG : LOG_ERR), "necp_defunct_client abort nexus error (%d)", netagent_error);
                                }
@@ -3632,10 +3906,8 @@ necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid, u_int32_
                                                                                          NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE |                \
                                                                                          NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT |         \
                                                                                          NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT |                \
-                                                                                         NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT |          \
                                                                                          NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE |    \
-                                                                                         NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE |   \
-                                                                                         NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE)
+                                                                                         NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE)
 
 #define NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR |           \
                                                                                                        NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE)
@@ -3785,23 +4057,51 @@ necp_interface_type_is_primary_eligible(u_int8_t interface_type)
 static bool
 necp_ifnet_matches_parameters(struct ifnet *ifp,
     struct necp_client_parsed_parameters *parsed_parameters,
+    u_int32_t override_flags,
     u_int32_t *preferred_count,
-    bool secondary_interface)
+    bool secondary_interface,
+    bool require_scoped_field)
 {
+       bool matched_some_scoped_field = FALSE;
+
        if (preferred_count) {
                *preferred_count = 0;
        }
 
+       if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF) {
+               if (parsed_parameters->required_interface_index != ifp->if_index) {
+                       return FALSE;
+               }
+       }
+
        if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR) {
                if (!necp_ifnet_matches_local_address(ifp, &parsed_parameters->local_addr.sa)) {
                        return FALSE;
                }
+               if (require_scoped_field) {
+                       matched_some_scoped_field = TRUE;
+               }
        }
 
        if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_FLAGS) {
-               if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE) &&
-                   IFNET_IS_EXPENSIVE(ifp)) {
-                       return FALSE;
+               if (override_flags != 0) {
+                       if ((override_flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE) &&
+                           IFNET_IS_EXPENSIVE(ifp)) {
+                               return FALSE;
+                       }
+                       if ((override_flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED) &&
+                           IFNET_IS_CONSTRAINED(ifp)) {
+                               return FALSE;
+                       }
+               } else {
+                       if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE) &&
+                           IFNET_IS_EXPENSIVE(ifp)) {
+                               return FALSE;
+                       }
+                       if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED) &&
+                           IFNET_IS_CONSTRAINED(ifp)) {
+                               return FALSE;
+                       }
                }
        }
 
@@ -3813,8 +4113,14 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
                return FALSE;
        }
 
+       if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE) {
+               if (require_scoped_field) {
+                       matched_some_scoped_field = TRUE;
+               }
+       }
+
        if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IFTYPE) {
-               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+               for (int i = 0; i < NECP_MAX_INTERFACE_PARAMETERS; i++) {
                        if (parsed_parameters->prohibited_interface_types[i] == 0) {
                                break;
                        }
@@ -3826,7 +4132,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
        }
 
        if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_IF) {
-               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+               for (int i = 0; i < NECP_MAX_INTERFACE_PARAMETERS; i++) {
                        if (strlen(parsed_parameters->prohibited_interfaces[i]) == 0) {
                                break;
                        }
@@ -3838,7 +4144,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
        }
 
        if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) {
-               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+               for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                        if (uuid_is_null(parsed_parameters->required_netagents[i])) {
                                break;
                        }
@@ -3846,11 +4152,15 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
                        if (!necp_ifnet_matches_agent(ifp, &parsed_parameters->required_netagents[i], FALSE)) {
                                return FALSE;
                        }
+
+                       if (require_scoped_field) {
+                               matched_some_scoped_field = TRUE;
+                       }
                }
        }
 
        if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT) {
-               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+               for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                        if (uuid_is_null(parsed_parameters->prohibited_netagents[i])) {
                                break;
                        }
@@ -3862,7 +4172,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
        }
 
        if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE) {
-               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+               for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                        if (strlen(parsed_parameters->required_netagent_types[i].netagent_domain) == 0 &&
                            strlen(parsed_parameters->required_netagent_types[i].netagent_type) == 0) {
                                break;
@@ -3871,11 +4181,15 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
                        if (!necp_ifnet_matches_agent_type(ifp, parsed_parameters->required_netagent_types[i].netagent_domain, parsed_parameters->required_netagent_types[i].netagent_type, FALSE)) {
                                return FALSE;
                        }
+
+                       if (require_scoped_field) {
+                               matched_some_scoped_field = TRUE;
+                       }
                }
        }
 
        if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE) {
-               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+               for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                        if (strlen(parsed_parameters->prohibited_netagent_types[i].netagent_domain) == 0 &&
                            strlen(parsed_parameters->prohibited_netagent_types[i].netagent_type) == 0) {
                                break;
@@ -3890,19 +4204,22 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
        // Checked preferred properties
        if (preferred_count) {
                if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT) {
-                       for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                       for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                                if (uuid_is_null(parsed_parameters->preferred_netagents[i])) {
                                        break;
                                }
 
                                if (necp_ifnet_matches_agent(ifp, &parsed_parameters->preferred_netagents[i], TRUE)) {
                                        (*preferred_count)++;
+                                       if (require_scoped_field) {
+                                               matched_some_scoped_field = TRUE;
+                                       }
                                }
                        }
                }
 
                if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE) {
-                       for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                       for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                                if (strlen(parsed_parameters->preferred_netagent_types[i].netagent_domain) == 0 &&
                                    strlen(parsed_parameters->preferred_netagent_types[i].netagent_type) == 0) {
                                        break;
@@ -3910,12 +4227,15 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
 
                                if (necp_ifnet_matches_agent_type(ifp, parsed_parameters->preferred_netagent_types[i].netagent_domain, parsed_parameters->preferred_netagent_types[i].netagent_type, TRUE)) {
                                        (*preferred_count)++;
+                                       if (require_scoped_field) {
+                                               matched_some_scoped_field = TRUE;
+                                       }
                                }
                        }
                }
 
                if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT) {
-                       for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                       for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                                if (uuid_is_null(parsed_parameters->avoided_netagents[i])) {
                                        break;
                                }
@@ -3927,7 +4247,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
                }
 
                if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE) {
-                       for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                       for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) {
                                if (strlen(parsed_parameters->avoided_netagent_types[i].netagent_domain) == 0 &&
                                    strlen(parsed_parameters->avoided_netagent_types[i].netagent_type) == 0) {
                                        break;
@@ -3941,6 +4261,10 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
                }
        }
 
+       if (require_scoped_field) {
+               return matched_some_scoped_field;
+       }
+
        return TRUE;
 }
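With require_scoped_field set (used for the global, non-ordered interface scan), necp_ifnet_matches_parameters changes from a pure veto filter into a positive match: the candidate must both survive every prohibition and match at least one scoping field (local address, required interface type, a required or preferred agent). A compact restatement of that return logic, using stand-in names:

    #include <stdbool.h>

    struct match_state {
            bool passes_prohibitions;  // no prohibited iftype/agent/flag hit
            bool matched_scoped_field; // local addr, required iftype, agent...
    };

    static bool
    ifnet_matches(const struct match_state *s, bool require_scoped_field)
    {
            if (!s->passes_prohibitions) {
                    return false;
            }
            // Merely avoiding prohibitions is not enough on the global scan.
            return require_scoped_field ? s->matched_scoped_field : true;
    }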
 
@@ -3958,7 +4282,18 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_
                return TRUE;
        }
 
-       if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_INTERESTING_IFNET_FIELDS)) {
+       // Check and save off flags
+       u_int32_t flags = 0;
+       bool has_prohibit_flags = FALSE;
+       if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_FLAGS) {
+               flags = parsed_parameters->flags;
+               has_prohibit_flags = (parsed_parameters->flags &
+                   (NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE |
+                   NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED));
+       }
+
+       if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_INTERESTING_IFNET_FIELDS) &&
+           !has_prohibit_flags) {
                return TRUE;
        }
 
@@ -3967,11 +4302,12 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_
        // We have interesting parameters to parse and find a matching interface
        ifnet_head_lock_shared();
 
-       if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_FIELDS)) {
+       if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_FIELDS) &&
+           !has_preferred_fields) {
                // We do have fields to match, but they are only prohibitory
                // If the first interface in the list matches, or there are no ordered interfaces, we don't need to scope
                ifp = TAILQ_FIRST(&ifnet_ordered_head);
-               if (ifp == NULL || necp_ifnet_matches_parameters(ifp, parsed_parameters, NULL, false)) {
+               if (ifp == NULL || necp_ifnet_matches_parameters(ifp, parsed_parameters, 0, NULL, false, false)) {
                        // Don't set return_ifindex, so the client doesn't need to scope
                        ifnet_head_done();
                        return TRUE;
@@ -3981,7 +4317,7 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_
        // First check the ordered interface list
        TAILQ_FOREACH(ifp, &ifnet_ordered_head, if_ordered_link) {
                u_int32_t preferred_count = 0;
-               if (necp_ifnet_matches_parameters(ifp, parsed_parameters, &preferred_count, false)) {
+               if (necp_ifnet_matches_parameters(ifp, parsed_parameters, flags, &preferred_count, false, false)) {
                        if (preferred_count > best_preferred_count ||
                            *return_ifindex == 0) {
                                // Everything matched, and is most preferred. Return this interface.
@@ -3993,20 +4329,34 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_
                                }
                        }
                }
+
+               if (has_prohibit_flags &&
+                   ifp == TAILQ_FIRST(&ifnet_ordered_head)) {
+                       // This was the first interface. From here on, if the
+                       // client prohibited either expensive or constrained,
+                       // don't allow either as a secondary interface option.
+                       flags |= (NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_EXPENSIVE |
+                           NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_CONSTRAINED);
+               }
        }
 
+       bool is_listener = ((parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_FLAGS) &&
+           (parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER));
+
        // Then check the remaining interfaces
        if ((parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_FIELDS) &&
            ((!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE)) ||
-           !necp_interface_type_is_primary_eligible(parsed_parameters->required_interface_type)) &&
-           *return_ifindex == 0) {
+           !necp_interface_type_is_primary_eligible(parsed_parameters->required_interface_type) ||
+           (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR) ||
+           is_listener) &&
+           (*return_ifindex == 0 || has_preferred_fields)) {
                TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
                        u_int32_t preferred_count = 0;
                        if (NECP_IFP_IS_ON_ORDERED_LIST(ifp)) {
                                // This interface was in the ordered list, skip
                                continue;
                        }
-                       if (necp_ifnet_matches_parameters(ifp, parsed_parameters, &preferred_count, false)) {
+                       if (necp_ifnet_matches_parameters(ifp, parsed_parameters, flags, &preferred_count, false, true)) {
                                if (preferred_count > best_preferred_count ||
                                    *return_ifindex == 0) {
                                        // Everything matched, and is most preferred. Return this interface.
@@ -4143,7 +4493,12 @@ done:
        return error;
 }
 
-static int
+// All functions called directly from necp_client_action() to handle one of the
+// types should be marked with NECP_CLIENT_ACTION_FUNCTION. This ensures that
+// necp_client_action() does not inline all the actions into a single function.
+#define NECP_CLIENT_ACTION_FUNCTION __attribute__((noinline))
+
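The attribute matters because every action handler keeps sizeable locals (parsed parameters, copy buffers) on its own stack; if the compiler inlined all of them into necp_client_action(), the merged frame would carry every handler's locals at once. A self-contained illustration of the pattern, with hypothetical handlers and 1 KiB of scratch each:

    #include <string.h>

    #define ACTION_FUNCTION __attribute__((noinline))

    // Each handler's scratch lives only in its own frame; without
    // noinline a single dispatcher frame could hold both buffers.
    static ACTION_FUNCTION int
    handle_action_a(void)
    {
            char scratch[1024];
            memset(scratch, 'a', sizeof(scratch));
            return scratch[0];
    }

    static ACTION_FUNCTION int
    handle_action_b(void)
    {
            char scratch[1024];
            memset(scratch, 'b', sizeof(scratch));
            return scratch[0];
    }

    int
    dispatch(int action)
    {
            switch (action) {
            case 0:  return handle_action_a();
            default: return handle_action_b();
            }
    }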
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_add(struct proc *p, struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
@@ -4173,7 +4528,8 @@ necp_client_add(struct proc *p, struct necp_fd_data *fd_data, struct necp_client
 
        lck_mtx_init(&client->lock, necp_fd_mtx_grp, necp_fd_mtx_attr);
        lck_mtx_init(&client->route_lock, necp_fd_mtx_grp, necp_fd_mtx_attr);
-       necp_client_retain(client); // Hold our reference until close
+
+       os_ref_init(&client->reference_count, &necp_client_refgrp); // Hold our reference until close
 
        client->parameters_length = uap->buffer_size;
        client->proc_pid = fd_data->proc_pid; // Save off proc pid in case the client will persist past fd
@@ -4190,6 +4546,7 @@ necp_client_add(struct proc *p, struct necp_fd_data *fd_data, struct necp_client
                goto done;
        }
 
+
        necp_client_update_observer_add(client);
 
        NECP_FD_LOCK(fd_data);
@@ -4216,7 +4573,74 @@ done:
        return error;
 }
 
-static int
+static NECP_CLIENT_ACTION_FUNCTION int
+necp_client_claim(struct proc *p, struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
+{
+       int error = 0;
+       uuid_t client_id = {};
+       struct necp_client *client = NULL;
+
+       if (uap->client_id == 0 || uap->client_id_len != sizeof(uuid_t)) {
+               error = EINVAL;
+               goto done;
+       }
+
+       error = copyin(uap->client_id, client_id, sizeof(uuid_t));
+       if (error) {
+               NECPLOG(LOG_ERR, "necp_client_claim copyin client_id error (%d)", error);
+               goto done;
+       }
+
+       u_int64_t upid = proc_uniqueid(p);
+
+       NECP_FD_LIST_LOCK_SHARED();
+
+       struct necp_fd_data *find_fd = NULL;
+       LIST_FOREACH(find_fd, &necp_fd_list, chain) {
+               NECP_FD_LOCK(find_fd);
+               struct necp_client *find_client = necp_client_fd_find_client_and_lock(find_fd, client_id);
+               if (find_client != NULL) {
+                       if (find_client->delegated_upid == upid) {
+                               // Matched the client to claim; remove from the old fd
+                               client = find_client;
+                               RB_REMOVE(_necp_client_tree, &find_fd->clients, client);
+                               necp_client_retain_locked(client);
+                       }
+                       NECP_CLIENT_UNLOCK(find_client);
+               }
+               NECP_FD_UNLOCK(find_fd);
+
+               if (client != NULL) {
+                       break;
+               }
+       }
+
+       NECP_FD_LIST_UNLOCK();
+
+       if (client == NULL) {
+               error = ENOENT;
+               goto done;
+       }
+
+       client->proc_pid = fd_data->proc_pid; // Transfer client to claiming pid
+
+       // Add matched client to our fd and re-run result
+       NECP_FD_LOCK(fd_data);
+       RB_INSERT(_necp_client_tree, &fd_data->clients, client);
+       NECP_CLIENT_LOCK(client);
+       (void)necp_update_client_result(current_proc(), fd_data, client, NULL);
+       NECP_CLIENT_UNLOCK(client);
+       NECP_FD_UNLOCK(fd_data);
+
+       necp_client_release(client);
+
+done:
+       *retval = error;
+
+       return error;
+}
+
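necp_client_claim lets a process adopt a client that another process created on its behalf: the kernel scans every necp fd for the UUID and transfers it only if the stored delegated_upid matches the caller's proc_uniqueid. A hedged user-space sketch; the SPI prototypes and the action constant's value are assumptions here, the real declarations live in private headers:

    #include <uuid/uuid.h>

    // Assumed private-SPI prototypes (libsystem_kernel):
    extern int necp_open(int flags);
    extern int necp_client_action(int necp_fd, unsigned int action,
        void *client_id, unsigned long client_id_len,
        void *buffer, unsigned long buffer_size);

    #define ACTION_CLAIM 1 // placeholder; use NECP_CLIENT_ACTION_CLAIM
                           // from the private net/necp.h

    // Adopt a delegated client UUID on our own necp fd.
    int
    claim_delegated_client(uuid_t client_id)
    {
            int fd = necp_open(0);
            if (fd < 0) {
                    return -1;
            }
            return necp_client_action(fd, ACTION_CLAIM,
                client_id, sizeof(uuid_t), NULL, 0);
    }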
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_remove(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
@@ -4286,7 +4710,8 @@ done:
 }
 
 
-static int
+// Don't inline the function since it includes necp_client_parsed_parameters on the stack
+static __attribute__((noinline)) int
 necp_client_check_tcp_heuristics(struct necp_client *client, struct necp_client_flow *flow, u_int32_t *flags, u_int8_t *tfo_cookie, u_int8_t *tfo_cookie_len)
 {
        struct necp_client_parsed_parameters parsed_parameters;
@@ -4656,7 +5081,7 @@ necp_client_copy_internal(struct necp_client *client, uuid_t client_id, bool cli
        return 0;
 }
 
-static int
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
@@ -4752,7 +5177,7 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u
        return error;
 }
 
-static int
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_copy_client_update(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
@@ -4828,9 +5253,17 @@ necp_client_copy_parameters_locked(struct necp_client *client,
        memcpy(&parameters->local_addr, &parsed_parameters.local_addr, sizeof(parameters->local_addr));
        memcpy(&parameters->remote_addr, &parsed_parameters.remote_addr, sizeof(parameters->remote_addr));
        parameters->ip_protocol = parsed_parameters.ip_protocol;
+       if (parsed_parameters.valid_fields & NECP_PARSED_PARAMETERS_FIELD_TRANSPORT_PROTOCOL) {
+               parameters->transport_protocol = parsed_parameters.transport_protocol;
+       } else {
+               parameters->transport_protocol = parsed_parameters.ip_protocol;
+       }
+       parameters->ethertype = parsed_parameters.ethertype;
        parameters->traffic_class = parsed_parameters.traffic_class;
        uuid_copy(parameters->euuid, parsed_parameters.effective_uuid);
        parameters->is_listener = (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) ? 1 : 0;
+       parameters->is_interpose = (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_INTERPOSE) ? 1 : 0;
+       parameters->is_custom_ether = (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_ETHER) ? 1 : 0;
        parameters->policy_id = client->policy_id;
 
        // parse client result flag
@@ -4843,10 +5276,24 @@ necp_client_copy_parameters_locked(struct necp_client *client,
        }
        parameters->allow_qos_marking = (client_result_flags & NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING) ? 1 : 0;
 
+       if (parsed_parameters.valid_fields & NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR_PREFERENCE) {
+               if (parsed_parameters.local_address_preference == NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_DEFAULT) {
+                       parameters->override_address_selection = false;
+               } else if (parsed_parameters.local_address_preference == NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_TEMPORARY) {
+                       parameters->override_address_selection = true;
+                       parameters->use_stable_address = false;
+               } else if (parsed_parameters.local_address_preference == NECP_CLIENT_PARAMETER_LOCAL_ADDRESS_PREFERENCE_STABLE) {
+                       parameters->override_address_selection = true;
+                       parameters->use_stable_address = true;
+               }
+       } else {
+               parameters->override_address_selection = false;
+       }
+
        return error;
 }
 
-static int
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_list(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
@@ -4975,7 +5422,7 @@ necp_client_remove_assertion(struct necp_client *client, uuid_t netagent_uuid)
        return true;
 }
 
-static int
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_agent_action(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
@@ -5064,7 +5511,7 @@ necp_client_agent_action(struct necp_fd_data *fd_data, struct necp_client_action
                                            fd_data->proc_pid,
                                            client->agent_handle,
                                            netagent_message_type,
-                                           &parsed_parameters,
+                                           (struct necp_client_agent_parameters *)&parsed_parameters,
                                            NULL, NULL);
                                        if (error == 0) {
                                                acted_on_agent = TRUE;
@@ -5100,7 +5547,7 @@ done:
        return error;
 }
 
-static int
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_copy_agent(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
@@ -5130,7 +5577,7 @@ done:
        return error;
 }
 
-static int
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_agent_use(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
@@ -5181,15 +5628,28 @@ done:
        return error;
 }
 
-static int
+struct necp_interface_details_legacy {
+       char name[IFXNAMSIZ];
+       u_int32_t index;
+       u_int32_t generation;
+       u_int32_t functional_type;
+       u_int32_t delegate_index;
+       u_int32_t flags; // see NECP_INTERFACE_FLAG_*
+       u_int32_t mtu;
+       struct necp_interface_signature ipv4_signature;
+       struct necp_interface_signature ipv6_signature;
+};
+
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
        u_int32_t interface_index = 0;
-       struct necp_interface_details interface_details;
+       struct necp_interface_details interface_details = {};
 
        if (uap->client_id == 0 || uap->client_id_len != sizeof(u_int32_t) ||
-           uap->buffer_size < sizeof(interface_details) || uap->buffer == 0) {
+           uap->buffer_size < sizeof(struct necp_interface_details_legacy) ||
+           uap->buffer == 0) {
                NECPLOG0(LOG_ERR, "necp_client_copy_interface bad input");
                error = EINVAL;
                goto done;
@@ -5207,8 +5667,6 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl
                goto done;
        }
 
-       memset(&interface_details, 0, sizeof(interface_details));
-
        ifnet_head_lock_shared();
        ifnet_t interface = NULL;
        if (interface_index != IFSCOPE_NONE && interface_index <= (u_int32_t)if_index) {
@@ -5228,6 +5686,9 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl
                if (IFNET_IS_EXPENSIVE(interface)) {
                        interface_details.flags |= NECP_INTERFACE_FLAG_EXPENSIVE;
                }
+               if (IFNET_IS_CONSTRAINED(interface)) {
+                       interface_details.flags |= NECP_INTERFACE_FLAG_CONSTRAINED;
+               }
                if ((interface->if_eflags & IFEF_TXSTART) == IFEF_TXSTART) {
                        interface_details.flags |= NECP_INTERFACE_FLAG_TXSTART;
                }
@@ -5240,6 +5701,9 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl
                if (IFNET_IS_LOW_POWER(interface)) {
                        interface_details.flags |= NECP_INTERFACE_FLAG_IS_LOW_POWER;
                }
+               if (interface->if_xflags & IFXF_MPK_LOG) {
+                       interface_details.flags |= NECP_INTERFACE_FLAG_MPK_LOG;
+               }
                interface_details.mtu = interface->if_mtu;
 
                u_int8_t ipv4_signature_len = sizeof(interface_details.ipv4_signature.signature);
@@ -5257,11 +5721,32 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl
                        ipv6_signature_len = 0;
                }
                interface_details.ipv6_signature.signature_len = ipv6_signature_len;
+
+               ifnet_lock_shared(interface);
+               struct ifaddr *ifa = NULL;
+               TAILQ_FOREACH(ifa, &interface->if_addrhead, ifa_link) {
+                       IFA_LOCK(ifa);
+                       if (ifa->ifa_addr->sa_family == AF_INET) {
+                               interface_details.flags |= NECP_INTERFACE_FLAG_HAS_NETMASK;
+                               interface_details.ipv4_netmask = ((struct in_ifaddr *)ifa)->ia_sockmask.sin_addr.s_addr;
+                               if (interface->if_flags & IFF_BROADCAST) {
+                                       interface_details.flags |= NECP_INTERFACE_FLAG_HAS_BROADCAST;
+                                       interface_details.ipv4_broadcast = ((struct in_ifaddr *)ifa)->ia_broadaddr.sin_addr.s_addr;
+                               }
+                       }
+                       IFA_UNLOCK(ifa);
+               }
+               ifnet_lock_done(interface);
        }
 
        ifnet_head_done();
 
-       error = copyout(&interface_details, uap->buffer, sizeof(interface_details));
+       // If the client is using an older version of the struct, copy that length
+       size_t copy_length = sizeof(interface_details);
+       if (uap->buffer_size < sizeof(interface_details)) {
+               copy_length = sizeof(struct necp_interface_details_legacy);
+       }
+       error = copyout(&interface_details, uap->buffer, copy_length);
        if (error) {
                NECPLOG(LOG_ERR, "necp_client_copy_interface copyout error (%d)", error);
                goto done;
@@ -5273,7 +5758,7 @@ done:
 }
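necp_client_copy_interface versions its output struct by size: the legacy layout is the required minimum, new fields (netmask, broadcast) are only ever appended, and callers with a smaller buffer get a truncated copyout. A minimal sketch of that convention, with hypothetical v1/v2 structs:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    struct details_v1 { uint32_t index; uint32_t mtu; };
    struct details_v2 { uint32_t index; uint32_t mtu; uint32_t v4_netmask; };

    // Accept any buffer at least as large as v1; copy only the v1
    // prefix to old callers. Appending fields keeps prefixes stable.
    static int
    copy_details(const struct details_v2 *full, void *ubuf, size_t ubuf_size)
    {
            if (ubuf_size < sizeof(struct details_v1)) {
                    return -1; // EINVAL in the kernel version
            }
            size_t copy_length = sizeof(*full);
            if (ubuf_size < sizeof(*full)) {
                    copy_length = sizeof(struct details_v1);
            }
            memcpy(ubuf, full, copy_length);
            return 0;
    }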
 
 
-static int
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_copy_route_statistics(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
@@ -5337,7 +5822,7 @@ done:
        return error;
 }
 
-static int
+static NECP_CLIENT_ACTION_FUNCTION int
 necp_client_update_cache(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
@@ -5460,6 +5945,119 @@ done:
        return error;
 }
 
+#define NECP_CLIENT_ACTION_SIGN_DEFAULT_HOSTNAME_LENGTH 64
+#define NECP_CLIENT_ACTION_SIGN_MAX_HOSTNAME_LENGTH 1024
+
+#define NECP_CLIENT_ACTION_SIGN_TAG_LENGTH 32
+
+static NECP_CLIENT_ACTION_FUNCTION int
+necp_client_sign(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
+{
+       int error = 0;
+       u_int32_t hostname_length = 0;
+       u_int8_t tag[NECP_CLIENT_ACTION_SIGN_TAG_LENGTH] = {};
+       struct necp_client_signable signable = {};
+       union necp_sockaddr_union address_answer = {};
+       u_int8_t *client_hostname = NULL;
+       u_int8_t *allocated_hostname = NULL;
+       u_int8_t default_hostname[NECP_CLIENT_ACTION_SIGN_DEFAULT_HOSTNAME_LENGTH] = "";
+       uint32_t tag_size = sizeof(tag);
+
+       *retval = 0;
+
+       const bool has_resolver_entitlement = (priv_check_cred(kauth_cred_get(), PRIV_NET_VALIDATED_RESOLVER, 0) == 0);
+       if (!has_resolver_entitlement) {
+               NECPLOG0(LOG_ERR, "Process does not hold the necessary entitlement to sign resolver answers");
+               error = EPERM;
+               goto done;
+       }
+
+       if (uap->client_id == 0 || uap->client_id_len < sizeof(struct necp_client_signable)) {
+               error = EINVAL;
+               goto done;
+       }
+
+       if (uap->buffer == 0 || uap->buffer_size != NECP_CLIENT_ACTION_SIGN_TAG_LENGTH) {
+               error = EINVAL;
+               goto done;
+       }
+
+       error = copyin(uap->client_id, &signable, sizeof(signable));
+       if (error) {
+               NECPLOG(LOG_ERR, "necp_client_sign copyin signable error (%d)", error);
+               goto done;
+       }
+
+       if (signable.sign_type != NECP_CLIENT_SIGN_TYPE_RESOLVER_ANSWER) {
+               NECPLOG(LOG_ERR, "necp_client_sign unknown signable type (%u)", signable.sign_type);
+               error = EINVAL;
+               goto done;
+       }
+
+       if (uap->client_id_len < sizeof(struct necp_client_resolver_answer)) {
+               error = EINVAL;
+               goto done;
+       }
+
+       error = copyin(uap->client_id + sizeof(signable), &address_answer, sizeof(address_answer));
+       if (error) {
+               NECPLOG(LOG_ERR, "necp_client_sign copyin address_answer error (%d)", error);
+               goto done;
+       }
+
+       error = copyin(uap->client_id + sizeof(signable) + sizeof(address_answer), &hostname_length, sizeof(hostname_length));
+       if (error) {
+               NECPLOG(LOG_ERR, "necp_client_sign copyin hostname_length error (%d)", error);
+               goto done;
+       }
+
+       if (hostname_length > NECP_CLIENT_ACTION_SIGN_MAX_HOSTNAME_LENGTH) {
+               error = EINVAL;
+               goto done;
+       }
+
+       if (hostname_length > NECP_CLIENT_ACTION_SIGN_DEFAULT_HOSTNAME_LENGTH) {
+               if ((allocated_hostname = _MALLOC(hostname_length, M_NECP, M_WAITOK | M_ZERO)) == NULL) {
+                       NECPLOG(LOG_ERR, "necp_client_sign malloc hostname %u failed", hostname_length);
+                       error = ENOMEM;
+                       goto done;
+               }
+
+               client_hostname = allocated_hostname;
+       } else {
+               client_hostname = default_hostname;
+       }
+
+       error = copyin(uap->client_id + sizeof(signable) + sizeof(address_answer) + sizeof(hostname_length), client_hostname, hostname_length);
+       if (error) {
+               NECPLOG(LOG_ERR, "necp_client_sign copyin hostname error (%d)", error);
+               goto done;
+       }
+
+       address_answer.sin.sin_port = 0;
+       error = necp_sign_resolver_answer(signable.client_id, client_hostname, hostname_length,
+           (u_int8_t *)&address_answer, sizeof(address_answer),
+           tag, &tag_size);
+       if (tag_size != sizeof(tag)) {
+               NECPLOG(LOG_ERR, "necp_client_sign unexpected tag size %u", tag_size);
+               error = EINVAL;
+               goto done;
+       }
+       error = copyout(tag, uap->buffer, tag_size);
+       if (error) {
+               NECPLOG(LOG_ERR, "necp_client_sign copyout error (%d)", error);
+               goto done;
+       }
+
+done:
+       if (allocated_hostname != NULL) {
+               FREE(allocated_hostname, M_NECP);
+               allocated_hostname = NULL;
+       }
+       *retval = error;
+       return error;
+}
+
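necp_client_sign consumes its input as one packed sequence read from the client_id pointer: the signable header, then the sockaddr union, then a four-byte hostname length, then the hostname bytes, with the port zeroed before signing (apparently so the port is not covered by the tag). A hedged sketch of how a caller would pack such a request; the struct layouts are simplified stand-ins, not the real kernel definitions:

    #include <stdint.h>
    #include <string.h>
    #include <uuid/uuid.h>

    struct signable_hdr {          // stand-in for struct necp_client_signable
            uuid_t   client_id;
            uint32_t sign_type;
    };
    struct addr_union { uint8_t bytes[28]; }; // stand-in for the sockaddr union

    // Pack [header][address][hostname_length][hostname], matching the
    // kernel's copyin order; returns bytes used, 0 on overflow.
    static size_t
    pack_sign_request(uint8_t *buf, size_t buf_size,
        const struct signable_hdr *hdr, const struct addr_union *addr,
        const char *hostname, uint32_t hostname_length)
    {
            size_t needed = sizeof(*hdr) + sizeof(*addr) +
                sizeof(hostname_length) + hostname_length;
            if (buf_size < needed) {
                    return 0;
            }
            uint8_t *p = buf;
            memcpy(p, hdr, sizeof(*hdr));
            p += sizeof(*hdr);
            memcpy(p, addr, sizeof(*addr));
            p += sizeof(*addr);
            memcpy(p, &hostname_length, sizeof(hostname_length));
            p += sizeof(hostname_length);
            memcpy(p, hostname, hostname_length);
            return needed;
    }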
 int
 necp_client_action(struct proc *p, struct necp_client_action_args *uap, int *retval)
 {
@@ -5479,6 +6077,10 @@ necp_client_action(struct proc *p, struct necp_client_action_args *uap, int *ret
                return_value = necp_client_add(p, fd_data, uap, retval);
                break;
        }
+       case NECP_CLIENT_ACTION_CLAIM: {
+               return_value = necp_client_claim(p, fd_data, uap, retval);
+               break;
+       }
        case NECP_CLIENT_ACTION_REMOVE: {
                return_value = necp_client_remove(fd_data, uap, retval);
                break;
@@ -5521,6 +6123,10 @@ necp_client_action(struct proc *p, struct necp_client_action_args *uap, int *ret
                return_value = necp_client_copy_client_update(fd_data, uap, retval);
                break;
        }
+       case NECP_CLIENT_ACTION_SIGN: {
+               return_value = necp_client_sign(fd_data, uap, retval);
+               break;
+       }
        default: {
                NECPLOG(LOG_ERR, "necp_client_action unknown action (%u)", action);
                return_value = EINVAL;
@@ -5565,7 +6171,7 @@ necp_match_policy(struct proc *p, struct necp_match_policy_args *uap, int32_t *r
        }
 
        error = necp_application_find_policy_match_internal(p, parameters, uap->parameters_size,
-           &returned_result, NULL, 0, NULL, NULL, NULL, false);
+           &returned_result, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, false, false);
        if (error) {
                goto done;
        }
@@ -5594,7 +6200,7 @@ necp_set_socket_attribute(u_int8_t *buffer, size_t buffer_length, u_int8_t type,
        char *local_string = NULL;
        u_int8_t *value = NULL;
 
-       cursor = necp_buffer_find_tlv(buffer, buffer_length, 0, type, 0);
+       cursor = necp_buffer_find_tlv(buffer, buffer_length, 0, type, NULL, 0);
        if (cursor < 0) {
                // This will clear out the parameter
                goto done;
@@ -5752,7 +6358,7 @@ done:
 
 void *
 necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, void *key, uint32_t key_length,
-    struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint,
+    struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint, struct ether_addr *local_ether_addr,
     u_int32_t flow_adv_index, void *flow_stats, size_t *message_length)
 {
        u_int8_t *buffer = NULL;
@@ -5760,7 +6366,6 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, vo
        size_t valsize = 0;
        bool has_nexus_assignment = FALSE;
 
-
        if (!uuid_is_null(nexus_instance)) {
                has_nexus_assignment = TRUE;
                valsize += sizeof(struct necp_tlv_header) + sizeof(uuid_t);
@@ -5778,6 +6383,9 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, vo
        if (remote_endpoint != NULL) {
                valsize += sizeof(struct necp_tlv_header) + sizeof(struct necp_client_endpoint);
        }
+       if (local_ether_addr != NULL) {
+               valsize += sizeof(struct necp_tlv_header) + sizeof(struct ether_addr);
+       }
        if (flow_stats != NULL) {
                valsize += sizeof(struct necp_tlv_header) + sizeof(void *);
        }
@@ -5807,6 +6415,9 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, vo
        if (remote_endpoint != NULL) {
                cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_REMOTE_ENDPOINT, sizeof(struct necp_client_endpoint), remote_endpoint, buffer, valsize);
        }
+       if (local_ether_addr != NULL) {
+               cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_LOCAL_ETHER_ADDR, sizeof(struct ether_addr), local_ether_addr, buffer, valsize);
+       }
        if (flow_stats != NULL) {
                cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_NEXUS_FLOW_STATS, sizeof(void *), &flow_stats, buffer, valsize);
        }
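The assign-message builder uses a two-pass pattern: first total up sizeof(header) plus payload for every optional TLV to get valsize, then allocate once and append in the same order. A sketch of that shape with two optional elements (stand-in names; the packed header layout is an assumption):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct tlv_header { uint8_t type; uint32_t length; } __attribute__((packed));

    static uint8_t *
    append_tlv(uint8_t *cursor, uint8_t type, uint32_t len, const void *val)
    {
            struct tlv_header h = { .type = type, .length = len };
            memcpy(cursor, &h, sizeof(h));
            memcpy(cursor + sizeof(h), val, len);
            return cursor + sizeof(h) + len;
    }

    // Pass 1 sizes the buffer, pass 2 writes it; mismatched passes are
    // the classic bug this pattern has to avoid. TLV types 1 and 2 are
    // illustrative values only.
    static uint8_t *
    build_message(const void *a, uint32_t a_len, const void *b, uint32_t b_len,
        size_t *out_len)
    {
            size_t valsize = 0;
            if (a != NULL) { valsize += sizeof(struct tlv_header) + a_len; }
            if (b != NULL) { valsize += sizeof(struct tlv_header) + b_len; }
            if (valsize == 0) { return NULL; }

            uint8_t *buffer = calloc(1, valsize);
            if (buffer == NULL) { return NULL; }

            uint8_t *cursor = buffer;
            if (a != NULL) { cursor = append_tlv(cursor, 1, a_len, a); }
            if (b != NULL) { cursor = append_tlv(cursor, 2, b_len, b); }
            *out_len = valsize;
            return buffer;
    }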
diff --git a/bsd/net/net_log_common.h b/bsd/net/net_log_common.h
new file mode 100644
index 0000000..d6b8a19
--- /dev/null
+++ b/bsd/net/net_log_common.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _NET_LOG_COMMON_H_
+#define _NET_LOG_COMMON_H_
+
+#include <os/log.h>
+
+#define NET_LOG_SUBSYSTEM_PREFIX "com.apple.xnu.net"
+
+#define NET_LOG_SUBSYSTEM_MPTCP         NET_LOG_SUBSYSTEM_PREFIX ".mptcp"
+#define NET_LOG_SUBSYSTEM_TCP           NET_LOG_SUBSYSTEM_PREFIX ".tcp"
+
+#endif /* _NET_LOG_COMMON_H_ */
diff --git a/bsd/net/net_str_id.c b/bsd/net/net_str_id.c
index 26f008ade75c8c00e6b30586b7af7b05db332d58..637006974d0f214cf36219d2a2406d444afb5048 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008,2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2008,2011,2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -42,7 +42,7 @@
 #include "net/net_str_id.h"
 
 #define NET_ID_STR_ENTRY_SIZE(__str) \
-       ((size_t)&(((struct net_str_id_entry*)0)->nsi_string[0]) + \
+       (__builtin_offsetof(struct net_str_id_entry, nsi_string[0]) + \
        strlen(__str) + 1)
 
 #define FIRST_NET_STR_ID                                1000
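The NET_ID_STR_ENTRY_SIZE change replaces the old idiom of deriving the member offset from a dereference of a null pointer with __builtin_offsetof, which computes the same value without the formally undefined null access. Both forms side by side, on a simplified stand-in struct:

    #include <assert.h>
    #include <stddef.h>
    #include <string.h>

    struct entry {
            int  nsi_id;
            char nsi_string[1]; // trailing string storage, as in net_str_id_entry
    };

    // Old idiom: offset via a faked dereference of address 0 (undefined
    // behavior, but historically tolerated by compilers).
    #define ENTRY_SIZE_OLD(s) \
            ((size_t)&(((struct entry *)0)->nsi_string[0]) + strlen(s) + 1)

    // New idiom: well-defined and equivalent.
    #define ENTRY_SIZE_NEW(s) \
            (__builtin_offsetof(struct entry, nsi_string[0]) + strlen(s) + 1)

    int
    main(void)
    {
            assert(ENTRY_SIZE_OLD("abc") == ENTRY_SIZE_NEW("abc"));
            return 0;
    }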
diff --git a/bsd/net/net_stubs.c b/bsd/net/net_stubs.c
index 27c2f1e036757d0373c97399debc98922623201f..169575c242b1885ce4efcfa28c6c6dddc9f95d11 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -258,6 +258,8 @@ STUB(mbuf_get_timestamp_requested);
 STUB(mbuf_set_timestamp_requested);
 STUB(mbuf_register_tx_compl_callback);
 STUB(mbuf_unregister_tx_compl_callback);
+STUB(mbuf_get_keepalive_flag);
+STUB(mbuf_set_keepalive_flag);
 STUB(net_init_add);
 STUB(proto_inject);
 STUB(proto_input);
@@ -326,6 +328,7 @@ STUB(ifnet_link_quality);
 STUB(ifnet_notice_master_elected);
 STUB(ifnet_notice_node_absence);
 STUB(ifnet_notice_node_presence);
+STUB(ifnet_notice_node_presence_v2);
 STUB(ifnet_poll_params);
 STUB(ifnet_purge);
 STUB(ifnet_report_issues);
@@ -351,6 +354,8 @@ STUB(ifnet_get_unsent_bytes);
 STUB(ifnet_get_buffer_status);
 STUB(ifnet_normalise_unsent_data);
 STUB(ifnet_set_low_power_mode);
+STUB(ifnet_notify_tcp_keepalive_offload_timeout);
+STUB(ifnet_interface_advisory_report);
 STUB(in6_localaddr);
 STUB(in_localaddr);
 STUB(in6addr_local);
@@ -388,6 +393,7 @@ STUB(net_del_domain);
 STUB(net_del_domain_old);
 STUB(net_del_proto);
 STUB(net_del_proto_old);
+STUB(net_domain_contains_hostname);
 STUB(pffinddomain);
 STUB(pffinddomain_old);
 STUB(pffindproto);
index 1b53940ae44ed5f20f00f528c898fefe4dd06914..4a10ea9659dd1f03752252961113b3981dc3b11e 100644
@@ -137,7 +137,7 @@ netsrc_common(struct rtentry *rt, struct netsrc_rep *reply)
        }
        reply->nrp_ifindex = rt->rt_ifp ? rt->rt_ifp->if_index : 0;
 
-       if (rt->rt_ifp->if_eflags & IFEF_AWDL) {
+       if (rt->rt_ifp != NULL && (rt->rt_ifp->if_eflags & IFEF_AWDL)) {
                reply->nrp_flags |= NETSRC_FLAG_AWDL;
        }
        if (rt->rt_flags & RTF_LOCAL) {
index d5401631197369f384cbd43d64c2de7914dd3255..a7d27aed899dd106b032e356c60ca1d761f50fd3 100644
@@ -44,6 +44,7 @@
 #include <net/network_agent.h>
 #include <net/if_var.h>
 #include <net/necp.h>
+#include <os/log.h>
 
 u_int32_t netagent_debug = LOG_NOTICE; // 0=None, 1=Basic
 
@@ -58,14 +59,24 @@ static int netagent_active_count = 0;
 SYSCTL_INT(_net_netagent, OID_AUTO, active_count, CTLFLAG_RD | CTLFLAG_LOCKED,
     &netagent_active_count, 0, "");
 
-#define NETAGENTLOG(level, format, ...) do {                                                                                    \
-       if (level <= netagent_debug)                                    \
-               log((level > LOG_NOTICE ? LOG_NOTICE : level), "%s: " format "\n", __FUNCTION__, __VA_ARGS__);  \
+#define NETAGENTLOG(level, format, ...) do {                                             \
+    if (level <= netagent_debug) {                                                       \
+       if (level == LOG_ERR) {                                                          \
+           os_log_error(OS_LOG_DEFAULT, "%s: " format "\n", __FUNCTION__, __VA_ARGS__); \
+       } else {                                                                         \
+           os_log(OS_LOG_DEFAULT, "%s: " format "\n", __FUNCTION__, __VA_ARGS__);       \
+       }                                                                                \
+    }                                                                                    \
 } while (0)
 
-#define NETAGENTLOG0(level, msg) do {                                                                                   \
-       if (level <= netagent_debug)                                    \
-               log((level > LOG_NOTICE ? LOG_NOTICE : level), "%s: %s\n", __FUNCTION__, msg);  \
+#define NETAGENTLOG0(level, msg) do {                                                    \
+    if (level <= netagent_debug) {                                                       \
+       if (level == LOG_ERR) {                                                          \
+           os_log_error(OS_LOG_DEFAULT, "%s: %s\n", __FUNCTION__, msg);                 \
+       } else {                                                                         \
+           os_log(OS_LOG_DEFAULT, "%s: %s\n", __FUNCTION__, msg);                       \
+       }                                                                                \
+    }                                                                                    \
 } while (0)
 
 struct netagent_client {
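
The reworked macros keep the level gate against netagent_debug but route output through os_log()/os_log_error() instead of log(), so LOG_ERR messages land in the error stream and the old clamp to LOG_NOTICE disappears. Call sites are unchanged; a usage sketch with an illustrative error value:

    NETAGENTLOG0(LOG_DEBUG, "Session created");
    NETAGENTLOG(LOG_ERR, "Failed to register agent (error %d)", error);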
@@ -285,10 +296,10 @@ netagent_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo)
 
 // Kernel events
 static void
-netagent_post_event(uuid_t agent_uuid, u_int32_t event_code, bool update_necp)
+netagent_post_event(uuid_t agent_uuid, u_int32_t event_code, bool update_necp, bool should_update_immediately)
 {
        if (update_necp) {
-               necp_update_all_clients();
+               necp_update_all_clients_immediately_if_needed(should_update_immediately);
        }
 
        struct kev_msg ev_msg;
@@ -678,7 +689,7 @@ netagent_unregister_session_wrapper(struct netagent_session *session)
 
        if (unregistered) {
                ifnet_clear_netagent(unregistered_uuid);
-               netagent_post_event(unregistered_uuid, KEV_NETAGENT_UNREGISTERED, TRUE);
+               netagent_post_event(unregistered_uuid, KEV_NETAGENT_UNREGISTERED, TRUE, false);
        }
 }
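
netagent_post_event() now threads a should_update_immediately flag into necp_update_all_clients_immediately_if_needed(); register and unregister paths pass false, and, as the later hunks show, update paths derive the flag from the agent's own opt-in bit. The test used at those call sites:

    bool should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY ==
        (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY));
    netagent_post_event(uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately);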
 
@@ -777,7 +788,7 @@ netagent_register(netagent_session_t _session, struct netagent *agent)
        }
 
        memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size);
-       memcpy(&new_wrapper->netagent, agent, sizeof(struct netagent) + data_size);
+       __nochk_memcpy(&new_wrapper->netagent, agent, sizeof(struct netagent) + data_size);
 
        int error = netagent_handle_register_inner(session, new_wrapper);
        if (error != 0) {
@@ -786,7 +797,7 @@ netagent_register(netagent_session_t _session, struct netagent *agent)
        }
 
        NETAGENTLOG0(LOG_DEBUG, "Registered new agent");
-       netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE);
+       netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE, false);
 
        return 0;
 }
@@ -846,7 +857,7 @@ netagent_handle_register_setopt(struct netagent_session *session, u_int8_t *payl
        }
 
        memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size);
-       memcpy(&new_wrapper->netagent, register_netagent, sizeof(struct netagent) + data_size);
+       __nochk_memcpy(&new_wrapper->netagent, register_netagent, sizeof(struct netagent) + data_size);
 
        response_error = netagent_handle_register_inner(session, new_wrapper);
        if (response_error != 0) {
@@ -855,7 +866,7 @@ netagent_handle_register_setopt(struct netagent_session *session, u_int8_t *payl
        }
 
        NETAGENTLOG0(LOG_DEBUG, "Registered new agent");
-       netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE);
+       netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE, false);
 
 done:
        return response_error;
@@ -921,7 +932,7 @@ netagent_handle_register_message(struct netagent_session *session, u_int32_t mes
 
        NETAGENTLOG0(LOG_DEBUG, "Registered new agent");
        netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_REGISTER, message_id);
-       netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE);
+       netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE, false);
        return;
 fail:
        netagent_send_error_response(session, NETAGENT_MESSAGE_TYPE_REGISTER, message_id, response_error);
@@ -1121,11 +1132,12 @@ netagent_update(netagent_session_t _session, struct netagent *agent)
        }
 
        memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size);
-       memcpy(&new_wrapper->netagent, agent, sizeof(struct netagent) + data_size);
+       __nochk_memcpy(&new_wrapper->netagent, agent, sizeof(struct netagent) + data_size);
 
        int error = netagent_handle_update_inner(session, new_wrapper, data_size, &agent_changed, kNetagentErrorDomainPOSIX);
        if (error == 0) {
-               netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed);
+               bool should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY));
+               netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately);
                if (agent_changed == FALSE) {
                        // The session wrapper does not need the "new_wrapper" as nothing changed
                        FREE(new_wrapper, M_NETAGENT);
@@ -1193,11 +1205,12 @@ netagent_handle_update_setopt(struct netagent_session *session, u_int8_t *payloa
        }
 
        memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size);
-       memcpy(&new_wrapper->netagent, update_netagent, sizeof(struct netagent) + data_size);
+       __nochk_memcpy(&new_wrapper->netagent, update_netagent, sizeof(struct netagent) + data_size);
 
        response_error = netagent_handle_update_inner(session, new_wrapper, data_size, &agent_changed, kNetagentErrorDomainPOSIX);
        if (response_error == 0) {
-               netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed);
+               bool should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY));
+               netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately);
                if (agent_changed == FALSE) {
                        // The session wrapper does not need the "new_wrapper" as nothing changed
                        FREE(new_wrapper, M_NETAGENT);
@@ -1271,7 +1284,8 @@ netagent_handle_update_message(struct netagent_session *session, u_int32_t messa
        }
 
        netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_UPDATE, message_id);
-       netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed);
+       bool should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY));
+       netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately);
 
        if (agent_changed == FALSE) {
                // The session wrapper does not need the "new_wrapper" as nothing changed
@@ -1624,7 +1638,7 @@ netagent_post_updated_interfaces(uuid_t uuid)
        lck_rw_done(&netagent_lock);
 
        if (wrapper != NULL) {
-               netagent_post_event(uuid, KEV_NETAGENT_UPDATED_INTERFACES, TRUE);
+               netagent_post_event(uuid, KEV_NETAGENT_UPDATED_INTERFACES, TRUE, false);
        } else {
                NETAGENTLOG0(LOG_DEBUG, "Interface event with no associated agent");
        }
@@ -1802,6 +1816,24 @@ netagent_get_flags(uuid_t uuid)
        return flags;
 }
 
+errno_t
+netagent_set_flags(uuid_t uuid, u_int32_t flags)
+{
+       errno_t error = 0;
+       lck_rw_lock_exclusive(&netagent_lock);
+       struct netagent_wrapper *wrapper = netagent_find_agent_with_uuid(uuid);
+       if (wrapper != NULL) {
+               wrapper->netagent.netagent_flags = flags;
+       } else {
+               NETAGENTLOG0(LOG_DEBUG,
+                   "Attempt to set flags for invalid netagent");
+               error = ENOENT;
+       }
+       lck_rw_done(&netagent_lock);
+
+       return error;
+}
+
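
netagent_set_flags() is a new accessor that takes the lock exclusively, overwrites the whole flag word, and returns ENOENT for an unknown UUID. A hedged usage sketch; the read-modify-write pattern is illustrative, and note that the get/set pair is not atomic across the two lock acquisitions:

    u_int32_t flags = netagent_get_flags(agent_uuid);
    flags |= NETAGENT_FLAG_UPDATE_IMMEDIATELY;
    errno_t err = netagent_set_flags(agent_uuid, flags);
    if (err == ENOENT) {
        /* No agent is registered under agent_uuid. */
    }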
 u_int32_t
 netagent_get_generation(uuid_t uuid)
 {
@@ -1881,7 +1913,7 @@ netagent_client_message_with_params(uuid_t agent_uuid,
     pid_t pid,
     void *handle,
     u_int8_t message_type,
-    struct necp_client_nexus_parameters *parameters,
+    struct necp_client_agent_parameters *parameters,
     void **assigned_results,
     size_t *assigned_results_length)
 {
@@ -1916,8 +1948,8 @@ netagent_client_message_with_params(uuid_t agent_uuid,
                        pid_t report_pid = 0;
                        uuid_t report_proc_uuid = {};
                        if (parameters != NULL) {
-                               report_pid = parameters->epid;
-                               uuid_copy(report_proc_uuid, parameters->euuid);
+                               report_pid = parameters->u.nexus_request.epid;
+                               uuid_copy(report_proc_uuid, parameters->u.nexus_request.euuid);
                        } else {
                                struct proc *p = current_proc();
                                if (p != NULL) {
@@ -1931,7 +1963,13 @@ netagent_client_message_with_params(uuid_t agent_uuid,
        } else if (message_type == NETAGENT_MESSAGE_TYPE_REQUEST_NEXUS ||
            message_type == NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS ||
            message_type == NETAGENT_MESSAGE_TYPE_ABORT_NEXUS) {
-               if ((wrapper->netagent.netagent_flags & NETAGENT_FLAG_NEXUS_PROVIDER) == 0) {
+               bool is_nexus_agent = ((wrapper->netagent.netagent_flags &
+                   (NETAGENT_FLAG_NEXUS_PROVIDER |
+                   NETAGENT_FLAG_NEXUS_LISTENER |
+                   NETAGENT_FLAG_CUSTOM_IP_NEXUS |
+                   NETAGENT_FLAG_CUSTOM_ETHER_NEXUS |
+                   NETAGENT_FLAG_INTERPOSE_NEXUS)) != 0);
+               if (!is_nexus_agent) {
                        NETAGENTLOG0(LOG_ERR, "Requested netagent for nexus instance is not a nexus provider");
                        // Agent is not a nexus provider
                        error = EINVAL;
@@ -1979,8 +2017,8 @@ netagent_client_message_with_params(uuid_t agent_uuid,
                                } else {
                                        uuid_copy(new_pending_client->client_id, necp_client_uuid);
                                        if (parameters != NULL) {
-                                               new_pending_client->client_pid = parameters->epid;
-                                               uuid_copy(new_pending_client->client_proc_uuid, parameters->euuid);
+                                               new_pending_client->client_pid = parameters->u.nexus_request.epid;
+                                               uuid_copy(new_pending_client->client_proc_uuid, parameters->u.nexus_request.euuid);
                                        } else {
                                                struct proc *p = current_proc();
                                                if (p != NULL) {
@@ -2082,8 +2120,8 @@ netagent_trigger(struct proc *p, struct netagent_trigger_args *uap, int32_t *ret
 
        if (uap->agent_uuid) {
                if (uap->agent_uuidlen != sizeof(uuid_t)) {
-                       NETAGENTLOG(LOG_ERR, "Incorrect length (got %llu, expected %lu)",
-                           uap->agent_uuidlen, sizeof(uuid_t));
+                       NETAGENTLOG(LOG_ERR, "Incorrect length (got %zu, expected %lu)",
+                           (size_t)uap->agent_uuidlen, sizeof(uuid_t));
                        return ERANGE;
                }
 
index d51352628d84e9fa7d7f232545c0c5e6f9660237..3afa6624ae508d1708ac4d51391628d436dacf29 100644
@@ -108,15 +108,22 @@ struct netagent_assign_nexus_message {
 
 #define NETAGENT_MAX_DATA_SIZE  4096
 
-#define NETAGENT_FLAG_REGISTERED                0x0001  // Agent is registered
-#define NETAGENT_FLAG_ACTIVE                    0x0002  // Agent is active
-#define NETAGENT_FLAG_KERNEL_ACTIVATED          0x0004  // Agent can be activated by kernel activity
-#define NETAGENT_FLAG_USER_ACTIVATED            0x0008  // Agent can be activated by system call (netagent_trigger)
-#define NETAGENT_FLAG_VOLUNTARY                 0x0010  // Use of agent is optional
-#define NETAGENT_FLAG_SPECIFIC_USE_ONLY         0x0020  // Agent should only be used and activated when specifically required
+#define NETAGENT_FLAG_REGISTERED                0x0001 // Agent is registered
+#define NETAGENT_FLAG_ACTIVE                    0x0002 // Agent is active
+#define NETAGENT_FLAG_KERNEL_ACTIVATED          0x0004 // Agent can be activated by kernel activity
+#define NETAGENT_FLAG_USER_ACTIVATED            0x0008 // Agent can be activated by system call (netagent_trigger)
+#define NETAGENT_FLAG_VOLUNTARY                 0x0010 // Use of agent is optional
+#define NETAGENT_FLAG_SPECIFIC_USE_ONLY         0x0020 // Agent should only be used and activated when specifically required
 #define NETAGENT_FLAG_NETWORK_PROVIDER          0x0040 // Agent provides network access
 #define NETAGENT_FLAG_NEXUS_PROVIDER            0x0080 // Agent provides a skywalk nexus
 #define NETAGENT_FLAG_SUPPORTS_BROWSE           0x0100 // Assertions will cause agent to fill in browse endpoints
+#define NETAGENT_FLAG_REQUIRES_ASSERT           0x0200 // Assertions are expected to be taken against this agent
+#define NETAGENT_FLAG_NEXUS_LISTENER            0x0400 // Nexus supports listeners
+#define NETAGENT_FLAG_UPDATE_IMMEDIATELY        0x0800 // Updates the clients without waiting for a leeway
+#define NETAGENT_FLAG_CUSTOM_ETHER_NEXUS        0x2000 // Agent provides a custom ethertype nexus
+#define NETAGENT_FLAG_CUSTOM_IP_NEXUS           0x4000 // Agent provides a custom IP nexus
+#define NETAGENT_FLAG_INTERPOSE_NEXUS           0x8000 // Agent provides an interpose nexus
+#define NETAGENT_FLAG_SUPPORTS_RESOLVE          0x10000 // Assertions will cause agent to fill in resolved endpoints
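
The nexus-capable flag set has grown beyond NETAGENT_FLAG_NEXUS_PROVIDER, and the network_agent.c hunk above now accepts nexus messages for any of the five nexus-type flags. A hedged restatement; the NEXUS_TYPE_FLAGS name is illustrative, not from this header:

    #define NEXUS_TYPE_FLAGS \
        (NETAGENT_FLAG_NEXUS_PROVIDER | NETAGENT_FLAG_NEXUS_LISTENER | \
        NETAGENT_FLAG_CUSTOM_IP_NEXUS | NETAGENT_FLAG_CUSTOM_ETHER_NEXUS | \
        NETAGENT_FLAG_INTERPOSE_NEXUS)

    bool is_nexus_agent = ((wrapper->netagent.netagent_flags & NEXUS_TYPE_FLAGS) != 0);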
 
 #define NETAGENT_NEXUS_MAX_REQUEST_TYPES                        16
 #define NETAGENT_NEXUS_MAX_RESOLUTION_TYPE_PAIRS        16
@@ -130,9 +137,11 @@ struct netagent_assign_nexus_message {
 #define NETAGENT_NEXUS_ENDPOINT_TYPE_ADDRESS    1
 #define NETAGENT_NEXUS_ENDPOINT_TYPE_HOST               2
 #define NETAGENT_NEXUS_ENDPOINT_TYPE_BONJOUR    3
+#define NETAGENT_NEXUS_ENDPOINT_TYPE_SRV        5
 
 #define NETAGENT_NEXUS_FLAG_SUPPORTS_USER_PACKET_POOL   0x1
 #define NETAGENT_NEXUS_FLAG_ASSERT_UNSUPPORTED                  0x2 // No calls to assert the agent are required
+#define NETAGENT_NEXUS_FLAG_SHOULD_USE_EVENT_RING       0x4 // Indicates that the nexus agent should use event rings
 
 struct netagent_nexus {
        u_int32_t       frame_type;
@@ -206,13 +215,15 @@ struct netagentlist_req64 {
        user64_addr_t   data __attribute__((aligned(8)));
 };
 
-struct necp_client_nexus_parameters;
+struct necp_client_agent_parameters;
 
 // Kernel accessors
 extern void netagent_post_updated_interfaces(uuid_t uuid); // To be called from interface ioctls
 
 extern u_int32_t netagent_get_flags(uuid_t uuid);
 
+extern errno_t netagent_set_flags(uuid_t uuid, u_int32_t flags);
+
 extern u_int32_t netagent_get_generation(uuid_t uuid);
 
 extern bool netagent_get_agent_domain_and_type(uuid_t uuid, char *domain, char *type);
@@ -226,7 +237,7 @@ extern int netagent_client_message_with_params(uuid_t agent_uuid,
     pid_t pid,
     void *handle,
     u_int8_t message_type,
-    struct necp_client_nexus_parameters *parameters,
+    struct necp_client_agent_parameters *parameters,
     void **assigned_results,
     size_t *assigned_results_length);
 
@@ -249,7 +260,7 @@ struct netagent_nexus_agent {
 #define NETAGENT_EVENT_NEXUS_FLOW_REMOVE                        NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS
 #define NETAGENT_EVENT_NEXUS_FLOW_ABORT                         NETAGENT_MESSAGE_TYPE_ABORT_NEXUS
 
-typedef errno_t (*netagent_event_f)(u_int8_t event, uuid_t necp_client_uuid, pid_t pid, void *necp_handle, void *context, struct necp_client_nexus_parameters *parameters, void **assigned_results, size_t *assigned_results_length);
+typedef errno_t (*netagent_event_f)(u_int8_t event, uuid_t necp_client_uuid, pid_t pid, void *necp_handle, void *context, struct necp_client_agent_parameters *parameters, void **assigned_results, size_t *assigned_results_length);
 
 extern netagent_session_t netagent_create(netagent_event_f event_handler, void *handle);
 
index 7a33d6832aef94db6b49ee6e18a3c084c6ce0ab8..cd6c3dacb9eca6472d3510660babbf3932c9a181 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2010-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -139,9 +139,9 @@ SYSCTL_UINT(_net_stats, OID_AUTO, api_report_interval,
 #endif /* DEBUG || DEVELOPMENT */
 
 enum{
-       NSTAT_FLAG_CLEANUP                              = (1 << 0),
-       NSTAT_FLAG_REQCOUNTS                    = (1 << 1),
-       NSTAT_FLAG_SUPPORTS_UPDATES             = (1 << 2),
+       NSTAT_FLAG_CLEANUP              = (1 << 0),
+       NSTAT_FLAG_REQCOUNTS            = (1 << 1),
+       NSTAT_FLAG_SUPPORTS_UPDATES     = (1 << 2),
        NSTAT_FLAG_SYSINFO_SUBSCRIBED   = (1 << 3),
 };
 
@@ -151,8 +151,8 @@ enum{
 #define QUERY_CONTINUATION_SRC_COUNT 100
 #endif
 
-typedef TAILQ_HEAD(, nstat_src)         tailq_head_nstat_src;
-typedef TAILQ_ENTRY(nstat_src)          tailq_entry_nstat_src;
+typedef TAILQ_HEAD(, nstat_src)     tailq_head_nstat_src;
+typedef TAILQ_ENTRY(nstat_src)      tailq_entry_nstat_src;
 
 typedef struct nstat_provider_filter {
        u_int64_t                       npf_flags;
@@ -164,36 +164,36 @@ typedef struct nstat_provider_filter {
 
 typedef struct nstat_control_state {
        struct nstat_control_state      *ncs_next;
-       u_int32_t                               ncs_watching;
+       u_int32_t               ncs_watching;
        decl_lck_mtx_data(, ncs_mtx);
-       kern_ctl_ref                    ncs_kctl;
-       u_int32_t                               ncs_unit;
-       nstat_src_ref_t                 ncs_next_srcref;
+       kern_ctl_ref            ncs_kctl;
+       u_int32_t               ncs_unit;
+       nstat_src_ref_t         ncs_next_srcref;
        tailq_head_nstat_src    ncs_src_queue;
-       mbuf_t                                  ncs_accumulated;
-       u_int32_t                               ncs_flags;
+       mbuf_t                  ncs_accumulated;
+       u_int32_t               ncs_flags;
        nstat_provider_filter   ncs_provider_filters[NSTAT_PROVIDER_COUNT];
        /* state maintained for partial query requests */
-       u_int64_t                               ncs_context;
-       u_int64_t                               ncs_seq;
+       u_int64_t               ncs_context;
+       u_int64_t               ncs_seq;
 } nstat_control_state;
 
 typedef struct nstat_provider {
        struct nstat_provider   *next;
-       nstat_provider_id_t             nstat_provider_id;
-       size_t                                  nstat_descriptor_length;
-       errno_t                                 (*nstat_lookup)(const void *data, u_int32_t length, nstat_provider_cookie_t *out_cookie);
-       int                                             (*nstat_gone)(nstat_provider_cookie_t cookie);
-       errno_t                                 (*nstat_counts)(nstat_provider_cookie_t cookie, struct nstat_counts *out_counts, int *out_gone);
-       errno_t                                 (*nstat_watcher_add)(nstat_control_state *state, nstat_msg_add_all_srcs *req);
-       void                                    (*nstat_watcher_remove)(nstat_control_state *state);
-       errno_t                                 (*nstat_copy_descriptor)(nstat_provider_cookie_t cookie, void *data, u_int32_t len);
-       void                                    (*nstat_release)(nstat_provider_cookie_t cookie, boolean_t locked);
-       bool                                    (*nstat_reporting_allowed)(nstat_provider_cookie_t cookie, nstat_provider_filter *filter);
+       nstat_provider_id_t     nstat_provider_id;
+       size_t                  nstat_descriptor_length;
+       errno_t                 (*nstat_lookup)(const void *data, u_int32_t length, nstat_provider_cookie_t *out_cookie);
+       int                     (*nstat_gone)(nstat_provider_cookie_t cookie);
+       errno_t                 (*nstat_counts)(nstat_provider_cookie_t cookie, struct nstat_counts *out_counts, int *out_gone);
+       errno_t                 (*nstat_watcher_add)(nstat_control_state *state, nstat_msg_add_all_srcs *req);
+       void                    (*nstat_watcher_remove)(nstat_control_state *state);
+       errno_t                 (*nstat_copy_descriptor)(nstat_provider_cookie_t cookie, void *data, u_int32_t len);
+       void                    (*nstat_release)(nstat_provider_cookie_t cookie, boolean_t locked);
+       bool                    (*nstat_reporting_allowed)(nstat_provider_cookie_t cookie, nstat_provider_filter *filter);
 } nstat_provider;
 
-typedef STAILQ_HEAD(, nstat_src)                stailq_head_nstat_src;
-typedef STAILQ_ENTRY(nstat_src)                 stailq_entry_nstat_src;
+typedef STAILQ_HEAD(, nstat_src)        stailq_head_nstat_src;
+typedef STAILQ_ENTRY(nstat_src)         stailq_entry_nstat_src;
 
 typedef TAILQ_HEAD(, nstat_tu_shadow)   tailq_head_tu_shadow;
 typedef TAILQ_ENTRY(nstat_tu_shadow)    tailq_entry_tu_shadow;
@@ -203,31 +203,31 @@ typedef TAILQ_ENTRY(nstat_procdetails)  tailq_entry_procdetails;
 
 typedef struct nstat_src {
        tailq_entry_nstat_src   ns_control_link;        // All sources for the nstat_control_state, for iterating over.
-       nstat_control_state             *ns_control;            // The nstat_control_state that this is a source for
-       nstat_src_ref_t                 srcref;
-       nstat_provider                  *provider;
-       nstat_provider_cookie_t         cookie;
-       uint32_t                        filter;
-       uint64_t                        seq;
+       nstat_control_state     *ns_control;            // The nstat_control_state that this is a source for
+       nstat_src_ref_t         srcref;
+       nstat_provider          *provider;
+       nstat_provider_cookie_t cookie;
+       uint32_t                filter;
+       uint64_t                seq;
 } nstat_src;
 
-static errno_t          nstat_control_send_counts(nstat_control_state *,
-    nstat_src *, unsigned long long, u_int16_t, int *);
-static int              nstat_control_send_description(nstat_control_state *state, nstat_src *src, u_int64_t context, u_int16_t hdr_flags);
-static int nstat_control_send_update(nstat_control_state *state, nstat_src *src, u_int64_t context, u_int16_t hdr_flags, int *gone);
-static errno_t          nstat_control_send_removed(nstat_control_state *, nstat_src *);
-static errno_t          nstat_control_send_goodbye(nstat_control_state  *state, nstat_src *src);
-static void             nstat_control_cleanup_source(nstat_control_state *state, nstat_src *src, boolean_t);
-static bool             nstat_control_reporting_allowed(nstat_control_state *state, nstat_src *src);
-static boolean_t        nstat_control_begin_query(nstat_control_state *state, const nstat_msg_hdr *hdrp);
-static u_int16_t        nstat_control_end_query(nstat_control_state *state, nstat_src *last_src, boolean_t partial);
-static void             nstat_ifnet_report_ecn_stats(void);
-static void             nstat_ifnet_report_lim_stats(void);
-static void             nstat_net_api_report_stats(void);
-static errno_t  nstat_set_provider_filter( nstat_control_state  *state, nstat_msg_add_all_srcs *req);
-
-static u_int32_t        nstat_udp_watchers = 0;
-static u_int32_t        nstat_tcp_watchers = 0;
+static errno_t      nstat_control_send_counts(nstat_control_state *, nstat_src *, unsigned long long, u_int16_t, int *);
+static int          nstat_control_send_description(nstat_control_state *state, nstat_src *src, u_int64_t context, u_int16_t hdr_flags);
+static int          nstat_control_send_update(nstat_control_state *state, nstat_src *src, u_int64_t context, u_int64_t event, u_int16_t hdr_flags, int *gone);
+static errno_t      nstat_control_send_removed(nstat_control_state *, nstat_src *);
+static errno_t      nstat_control_send_goodbye(nstat_control_state *state, nstat_src *src);
+static void         nstat_control_cleanup_source(nstat_control_state *state, nstat_src *src, boolean_t);
+static bool         nstat_control_reporting_allowed(nstat_control_state *state, nstat_src *src);
+static boolean_t    nstat_control_begin_query(nstat_control_state *state, const nstat_msg_hdr *hdrp);
+static u_int16_t    nstat_control_end_query(nstat_control_state *state, nstat_src *last_src, boolean_t partial);
+static void         nstat_ifnet_report_ecn_stats(void);
+static void         nstat_ifnet_report_lim_stats(void);
+static void         nstat_net_api_report_stats(void);
+static errno_t      nstat_set_provider_filter(nstat_control_state *state, nstat_msg_add_all_srcs *req);
+static errno_t      nstat_control_send_event(nstat_control_state *state, nstat_src *src, u_int64_t event);
+
+static u_int32_t    nstat_udp_watchers = 0;
+static u_int32_t    nstat_tcp_watchers = 0;
 
 static void nstat_control_register(void);
 
@@ -273,9 +273,9 @@ nstat_copy_sa_out(
 static void
 nstat_ip_to_sockaddr(
        const struct in_addr    *ip,
-       u_int16_t                               port,
-       struct sockaddr_in              *sin,
-       u_int32_t                               maxlen)
+       u_int16_t               port,
+       struct sockaddr_in      *sin,
+       u_int32_t               maxlen)
 {
        if (maxlen < sizeof(struct sockaddr_in)) {
                return;
@@ -318,11 +318,17 @@ nstat_ifnet_to_flags(
        case IFRTYPE_FUNCTIONAL_CELLULAR:
                flags |= NSTAT_IFNET_IS_CELLULAR;
                break;
+       case IFRTYPE_FUNCTIONAL_COMPANIONLINK:
+               flags |= NSTAT_IFNET_IS_COMPANIONLINK;
+               break;
        }
 
        if (IFNET_IS_EXPENSIVE(ifp)) {
                flags |= NSTAT_IFNET_IS_EXPENSIVE;
        }
+       if (IFNET_IS_CONSTRAINED(ifp)) {
+               flags |= NSTAT_IFNET_IS_CONSTRAINED;
+       }
 
        return flags;
 }
@@ -333,20 +339,27 @@ nstat_inpcb_to_flags(
 {
        u_int16_t flags = 0;
 
-       if ((inp != NULL) && (inp->inp_last_outifp != NULL)) {
-               struct ifnet *ifp = inp->inp_last_outifp;
-               flags = nstat_ifnet_to_flags(ifp);
+       if (inp != NULL) {
+               if (inp->inp_last_outifp != NULL) {
+                       struct ifnet *ifp = inp->inp_last_outifp;
+                       flags = nstat_ifnet_to_flags(ifp);
 
-               if (flags & NSTAT_IFNET_IS_CELLULAR) {
-                       if (inp->inp_socket != NULL &&
-                           (inp->inp_socket->so_flags1 & SOF1_CELLFALLBACK)) {
-                               flags |= NSTAT_IFNET_VIA_CELLFALLBACK;
+                       struct tcpcb  *tp = intotcpcb(inp);
+                       if (tp) {
+                               if (tp->t_flags & TF_LOCAL) {
+                                       flags |= NSTAT_IFNET_IS_LOCAL;
+                               } else {
+                                       flags |= NSTAT_IFNET_IS_NON_LOCAL;
+                               }
                        }
+               } else {
+                       flags = NSTAT_IFNET_IS_UNKNOWN_TYPE;
+               }
+               if (inp->inp_socket != NULL &&
+                   (inp->inp_socket->so_flags1 & SOF1_CELLFALLBACK)) {
+                       flags |= NSTAT_IFNET_VIA_CELLFALLBACK;
                }
-       } else {
-               flags = NSTAT_IFNET_IS_UNKNOWN_TYPE;
        }
-
        return flags;
 }
 
@@ -372,10 +385,10 @@ nstat_find_provider_by_id(
 
 static errno_t
 nstat_lookup_entry(
-       nstat_provider_id_t             id,
-       const void                              *data,
-       u_int32_t                               length,
-       nstat_provider                  **out_provider,
+       nstat_provider_id_t     id,
+       const void              *data,
+       u_int32_t               length,
+       nstat_provider          **out_provider,
        nstat_provider_cookie_t *out_cookie)
 {
        *out_provider = nstat_find_provider_by_id(id);
@@ -426,14 +439,14 @@ nstat_malloc_aligned(
        OSMallocTag     tag)
 {
        struct align_header     *hdr = NULL;
-       u_int32_t       size = length + sizeof(*hdr) + alignment - 1;
+       u_int32_t size = length + sizeof(*hdr) + alignment - 1;
 
-       u_int8_t        *buffer = OSMalloc(size, tag);
+       u_int8_t *buffer = OSMalloc(size, tag);
        if (buffer == NULL) {
                return NULL;
        }
 
-       u_int8_t        *aligned = buffer + sizeof(*hdr);
+       u_int8_t *aligned = buffer + sizeof(*hdr);
        aligned = (u_int8_t*)P2ROUNDUP(aligned, alignment);
 
        hdr = (struct align_header*)(void *)(aligned - sizeof(*hdr));
@@ -458,8 +471,8 @@ static nstat_provider   nstat_route_provider;
 
 static errno_t
 nstat_route_lookup(
-       const void                              *data,
-       u_int32_t                               length,
+       const void      *data,
+       u_int32_t       length,
        nstat_provider_cookie_t *out_cookie)
 {
        // rt_lookup doesn't take const params but it doesn't modify the parameters for
@@ -523,8 +536,8 @@ nstat_route_gone(
 static errno_t
 nstat_route_counts(
        nstat_provider_cookie_t cookie,
-       struct nstat_counts             *out_counts,
-       int                                             *out_gone)
+       struct nstat_counts     *out_counts,
+       int                     *out_gone)
 {
        struct rtentry          *rt = (struct rtentry*)cookie;
        struct nstat_counts     *rt_stats = rt->rt_stats;
@@ -566,7 +579,7 @@ nstat_route_release(
        rtfree((struct rtentry*)cookie);
 }
 
-static u_int32_t        nstat_route_watchers = 0;
+static u_int32_t    nstat_route_watchers = 0;
 
 static int
 nstat_route_walktree_add(
@@ -607,7 +620,7 @@ nstat_route_walktree_add(
 
 static errno_t
 nstat_route_add_watcher(
-       nstat_control_state     *state,
+       nstat_control_state *state,
        nstat_msg_add_all_srcs *req)
 {
        int i;
@@ -678,8 +691,8 @@ nstat_route_remove_watcher(
 static errno_t
 nstat_route_copy_descriptor(
        nstat_provider_cookie_t cookie,
-       void                                    *data,
-       u_int32_t                               len)
+       void                    *data,
+       u_int32_t               len)
 {
        nstat_route_descriptor  *desc = (nstat_route_descriptor*)data;
        if (len < sizeof(*desc)) {
@@ -828,9 +841,9 @@ nstat_route_connect_success(
 __private_extern__ void
 nstat_route_tx(
        struct rtentry  *rte,
-       u_int32_t               packets,
-       u_int32_t               bytes,
-       u_int32_t               flags)
+       u_int32_t       packets,
+       u_int32_t       bytes,
+       u_int32_t       flags)
 {
        while (rte) {
                struct nstat_counts*    stats = nstat_route_attach(rte);
@@ -850,9 +863,9 @@ nstat_route_tx(
 __private_extern__ void
 nstat_route_rx(
        struct rtentry  *rte,
-       u_int32_t               packets,
-       u_int32_t               bytes,
-       u_int32_t               flags)
+       u_int32_t       packets,
+       u_int32_t       bytes,
+       u_int32_t       flags)
 {
        while (rte) {
                struct nstat_counts*    stats = nstat_route_attach(rte);
@@ -1154,8 +1167,8 @@ nstat_tcpudp_lookup(
 
 static errno_t
 nstat_tcp_lookup(
-       const void                              *data,
-       u_int32_t                               length,
+       const void              *data,
+       u_int32_t               length,
        nstat_provider_cookie_t *out_cookie)
 {
        return nstat_tcpudp_lookup(&tcbinfo, data, length, out_cookie);
@@ -1178,8 +1191,8 @@ nstat_tcp_gone(
 static errno_t
 nstat_tcp_counts(
        nstat_provider_cookie_t cookie,
-       struct nstat_counts             *out_counts,
-       int                                             *out_gone)
+       struct nstat_counts     *out_counts,
+       int                     *out_gone)
 {
        struct nstat_tucookie *tucookie =
            (struct nstat_tucookie *)cookie;
@@ -1242,7 +1255,7 @@ nstat_tcp_release(
 static errno_t
 nstat_tcp_add_watcher(
        nstat_control_state     *state,
-       nstat_msg_add_all_srcs *req)
+       nstat_msg_add_all_srcs  *req)
 {
        // There is a tricky issue around getting all TCP sockets added once
        // and only once.  nstat_tcp_new_pcb() is called prior to the new item
@@ -1369,6 +1382,46 @@ nstat_pcb_detach(struct inpcb *inp)
        }
 }
 
+__private_extern__ void
+nstat_pcb_event(struct inpcb *inp, u_int64_t event)
+{
+       nstat_control_state *state;
+       nstat_src *src;
+       struct nstat_tucookie *tucookie;
+       errno_t result;
+       nstat_provider_id_t provider_id;
+
+       if (inp == NULL || (nstat_tcp_watchers == 0 && nstat_udp_watchers == 0)) {
+               return;
+       }
+
+       lck_mtx_lock(&nstat_mtx);
+       for (state = nstat_controls; state; state = state->ncs_next) {
+               if (((state->ncs_provider_filters[NSTAT_PROVIDER_TCP_KERNEL].npf_events & event) == 0) &&
+                   ((state->ncs_provider_filters[NSTAT_PROVIDER_UDP_KERNEL].npf_events & event) == 0)) {
+                       continue;
+               }
+               lck_mtx_lock(&state->ncs_mtx);
+               TAILQ_FOREACH(src, &state->ncs_src_queue, ns_control_link)
+               {
+                       provider_id = src->provider->nstat_provider_id;
+                       if (provider_id == NSTAT_PROVIDER_TCP_KERNEL || provider_id == NSTAT_PROVIDER_UDP_KERNEL) {
+                               tucookie = (struct nstat_tucookie *)src->cookie;
+                               if (tucookie->inp == inp) {
+                                       break;
+                               }
+                       }
+               }
+
+               if (src && ((state->ncs_provider_filters[provider_id].npf_events & event) != 0)) {
+                       result = nstat_control_send_event(state, src, event);
+               }
+               lck_mtx_unlock(&state->ncs_mtx);
+       }
+       lck_mtx_unlock(&nstat_mtx);
+}
+
+
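
nstat_pcb_event() is the new fan-out: for each control state whose TCP or UDP provider filter subscribes to the event, it finds the source backed by the given inpcb and forwards the event. A hedged sketch of a protocol-layer call site, not part of this hunk, with names assumed from the surrounding diff:

    /* Report a flow falling back to cellular, if any watcher subscribed. */
    if (inp->inp_socket != NULL &&
        (inp->inp_socket->so_flags1 & SOF1_CELLFALLBACK)) {
        nstat_pcb_event(inp, NSTAT_EVENT_SRC_ENTER_CELLFALLBACK);
    }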
 __private_extern__ void
 nstat_pcb_cache(struct inpcb *inp)
 {
@@ -1639,8 +1692,8 @@ static nstat_provider   nstat_udp_provider;
 
 static errno_t
 nstat_udp_lookup(
-       const void                              *data,
-       u_int32_t                               length,
+       const void              *data,
+       u_int32_t               length,
        nstat_provider_cookie_t *out_cookie)
 {
        return nstat_tcpudp_lookup(&udbinfo, data, length, out_cookie);
@@ -1710,7 +1763,7 @@ nstat_udp_release(
 static errno_t
 nstat_udp_add_watcher(
        nstat_control_state     *state,
-       nstat_msg_add_all_srcs *req)
+       nstat_msg_add_all_srcs  *req)
 {
        // There is a tricky issue around getting all UDP sockets added once
        // and only once.  nstat_udp_new_pcb() is called prior to the new item
@@ -1798,8 +1851,8 @@ nstat_udp_new_pcb(
 static errno_t
 nstat_udp_copy_descriptor(
        nstat_provider_cookie_t cookie,
-       void                                    *data,
-       u_int32_t                               len)
+       void                    *data,
+       u_int32_t               len)
 {
        if (len < sizeof(nstat_udp_descriptor)) {
                return EINVAL;
@@ -1811,8 +1864,8 @@ nstat_udp_copy_descriptor(
 
        struct nstat_tucookie   *tucookie =
            (struct nstat_tucookie *)cookie;
-       nstat_udp_descriptor            *desc = (nstat_udp_descriptor*)data;
-       struct inpcb                    *inp = tucookie->inp;
+       nstat_udp_descriptor    *desc = (nstat_udp_descriptor*)data;
+       struct inpcb            *inp = tucookie->inp;
 
        bzero(desc, sizeof(*desc));
 
@@ -2208,7 +2261,7 @@ nstat_ifnet_copy_link_status(
                        cell_status->valid_bitmask |= NSTAT_IFNET_DESC_CELL_MSS_RECOMMENDED_VALID;
                        cell_status->mss_recommended = if_cell_sr->mss_recommended;
                }
-       } else if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI) {
+       } else if (IFNET_IS_WIFI(ifp)) {
                nstat_ifnet_desc_wifi_status *wifi_status = &link_status->u.wifi;
                struct if_wifi_status_v1 *if_wifi_sr =
                    &ifsr->ifsr_u.ifsr_wifi.if_wifi_u.if_status_v1;
@@ -3550,6 +3603,33 @@ nstat_enqueue_success(
        return result;
 }
 
+static errno_t
+nstat_control_send_event(
+       nstat_control_state *state,
+       nstat_src           *src,
+       u_int64_t           event)
+{
+       errno_t result = 0;
+       int failed = 0;
+
+       if (nstat_control_reporting_allowed(state, src)) {
+               if ((state->ncs_flags & NSTAT_FLAG_SUPPORTS_UPDATES) != 0) {
+                       result = nstat_control_send_update(state, src, 0, event, 0, NULL);
+                       if (result != 0) {
+                               failed = 1;
+                               if (nstat_debug != 0) {
+                                       printf("%s - nstat_control_send_event() %d\n", __func__, result);
+                               }
+                       }
+               } else {
+                       if (nstat_debug != 0) {
+                               printf("%s - nstat_control_send_event() used when updates not supported\n", __func__);
+                       }
+               }
+       }
+       return result;
+}
+
 static errno_t
 nstat_control_send_goodbye(
        nstat_control_state     *state,
@@ -3560,7 +3640,7 @@ nstat_control_send_goodbye(
 
        if (nstat_control_reporting_allowed(state, src)) {
                if ((state->ncs_flags & NSTAT_FLAG_SUPPORTS_UPDATES) != 0) {
-                       result = nstat_control_send_update(state, src, 0, NSTAT_MSG_HDR_FLAG_CLOSING, NULL);
+                       result = nstat_control_send_update(state, src, 0, 0, NSTAT_MSG_HDR_FLAG_CLOSING, NULL);
                        if (result != 0) {
                                failed = 1;
                                if (nstat_debug != 0) {
@@ -3677,7 +3757,7 @@ nstat_idle_check(
        __unused thread_call_param_t p1)
 {
        nstat_control_state *control;
-       nstat_src       *src, *tmpsrc;
+       nstat_src  *src, *tmpsrc;
        tailq_head_nstat_src dead_list;
        TAILQ_INIT(&dead_list);
 
@@ -3785,20 +3865,18 @@ nstat_control_reporting_allowed(
                return TRUE;
        }
 
-       return
-               src->provider->nstat_reporting_allowed(src->cookie,
-                   &state->ncs_provider_filters[src->provider->nstat_provider_id])
-       ;
+       return src->provider->nstat_reporting_allowed(src->cookie,
+                  &state->ncs_provider_filters[src->provider->nstat_provider_id]);
 }
 
 
 static errno_t
 nstat_control_connect(
-       kern_ctl_ref            kctl,
-       struct sockaddr_ctl     *sac,
-       void                            **uinfo)
+       kern_ctl_ref        kctl,
+       struct sockaddr_ctl *sac,
+       void                **uinfo)
 {
-       nstat_control_state     *state = OSMalloc(sizeof(*state), nstat_malloc_tag);
+       nstat_control_state *state = OSMalloc(sizeof(*state), nstat_malloc_tag);
        if (state == NULL) {
                return ENOMEM;
        }
@@ -3827,11 +3905,11 @@ nstat_control_connect(
 static errno_t
 nstat_control_disconnect(
        __unused kern_ctl_ref   kctl,
-       __unused u_int32_t              unit,
-       void                                    *uinfo)
+       __unused u_int32_t      unit,
+       void                    *uinfo)
 {
-       u_int32_t       watching;
-       nstat_control_state     *state = (nstat_control_state*)uinfo;
+       u_int32_t   watching;
+       nstat_control_state *state = (nstat_control_state*)uinfo;
        tailq_head_nstat_src cleanup_list;
        nstat_src *src;
 
@@ -3892,11 +3970,11 @@ nstat_control_next_src_ref(
 
 static errno_t
 nstat_control_send_counts(
-       nstat_control_state     *state,
-       nstat_src               *src,
-       unsigned long long      context,
-       u_int16_t hdr_flags,
-       int *gone)
+       nstat_control_state *state,
+       nstat_src           *src,
+       unsigned long long  context,
+       u_int16_t           hdr_flags,
+       int                 *gone)
 {
        nstat_msg_src_counts counts;
        errno_t result = 0;
@@ -3933,9 +4011,9 @@ nstat_control_send_counts(
 
 static errno_t
 nstat_control_append_counts(
-       nstat_control_state     *state,
-       nstat_src                       *src,
-       int                                     *gone)
+       nstat_control_state *state,
+       nstat_src           *src,
+       int                 *gone)
 {
        /* Some providers may not have any counts to send */
        if (!src->provider->nstat_counts) {
@@ -3965,10 +4043,10 @@ nstat_control_append_counts(
 
 static int
 nstat_control_send_description(
-       nstat_control_state     *state,
-       nstat_src                       *src,
-       u_int64_t                       context,
-       u_int16_t                       hdr_flags)
+       nstat_control_state *state,
+       nstat_src           *src,
+       u_int64_t           context,
+       u_int16_t           hdr_flags)
 {
        // Provider doesn't support getting the descriptor? Done.
        if (src->provider->nstat_descriptor_length == 0 ||
@@ -3977,14 +4055,14 @@ nstat_control_send_description(
        }
 
        // Allocate storage for the descriptor message
-       mbuf_t                  msg;
+       mbuf_t          msg;
        unsigned int    one = 1;
-       u_int32_t               size = offsetof(nstat_msg_src_description, data) + src->provider->nstat_descriptor_length;
+       u_int32_t       size = offsetof(nstat_msg_src_description, data) + src->provider->nstat_descriptor_length;
        if (mbuf_allocpacket(MBUF_DONTWAIT, size, &one, &msg) != 0) {
                return ENOMEM;
        }
 
-       nstat_msg_src_description       *desc = (nstat_msg_src_description*)mbuf_data(msg);
+       nstat_msg_src_description *desc = (nstat_msg_src_description*)mbuf_data(msg);
        bzero(desc, size);
        mbuf_setlen(msg, size);
        mbuf_pkthdr_setlen(msg, mbuf_len(msg));
@@ -4016,8 +4094,8 @@ nstat_control_send_description(
 
 static errno_t
 nstat_control_append_description(
-       nstat_control_state     *state,
-       nstat_src                       *src)
+       nstat_control_state *state,
+       nstat_src           *src)
 {
        size_t  size = offsetof(nstat_msg_src_description, data) + src->provider->nstat_descriptor_length;
        if (size > 512 || src->provider->nstat_descriptor_length == 0 ||
@@ -4029,7 +4107,7 @@ nstat_control_append_description(
        u_int64_t buffer[size / sizeof(u_int64_t)  + 1]; // u_int64_t to ensure alignment
        bzero(buffer, size);
 
-       nstat_msg_src_description       *desc = (nstat_msg_src_description*)buffer;
+       nstat_msg_src_description *desc = (nstat_msg_src_description*)buffer;
        desc->hdr.type = NSTAT_MSG_TYPE_SRC_DESC;
        desc->hdr.length = size;
        desc->srcref = src->srcref;
@@ -4050,11 +4128,12 @@ nstat_control_append_description(
 
 static int
 nstat_control_send_update(
-       nstat_control_state     *state,
-       nstat_src                       *src,
-       u_int64_t                       context,
-       u_int16_t               hdr_flags,
-       int                                     *gone)
+       nstat_control_state *state,
+       nstat_src           *src,
+       u_int64_t           context,
+       u_int64_t           event,
+       u_int16_t           hdr_flags,
+       int                 *gone)
 {
        // Provider doesn't support getting the descriptor or counts? Done.
        if ((src->provider->nstat_descriptor_length == 0 ||
@@ -4064,22 +4143,22 @@ nstat_control_send_update(
        }
 
        // Allocate storage for the descriptor message
-       mbuf_t                  msg;
+       mbuf_t          msg;
        unsigned int    one = 1;
-       u_int32_t               size = offsetof(nstat_msg_src_update, data) +
+       u_int32_t       size = offsetof(nstat_msg_src_update, data) +
            src->provider->nstat_descriptor_length;
        if (mbuf_allocpacket(MBUF_DONTWAIT, size, &one, &msg) != 0) {
                return ENOMEM;
        }
 
-       nstat_msg_src_update    *desc = (nstat_msg_src_update*)mbuf_data(msg);
+       nstat_msg_src_update *desc = (nstat_msg_src_update*)mbuf_data(msg);
        bzero(desc, size);
        desc->hdr.context = context;
        desc->hdr.type = NSTAT_MSG_TYPE_SRC_UPDATE;
        desc->hdr.length = size;
        desc->hdr.flags = hdr_flags;
        desc->srcref = src->srcref;
-       desc->event_flags = 0;
+       desc->event_flags = event;
        desc->provider = src->provider->nstat_provider_id;
 
        mbuf_setlen(msg, size);
@@ -4118,9 +4197,9 @@ nstat_control_send_update(
 
 static errno_t
 nstat_control_append_update(
-       nstat_control_state     *state,
-       nstat_src                       *src,
-       int                                     *gone)
+       nstat_control_state *state,
+       nstat_src           *src,
+       int                 *gone)
 {
        size_t  size = offsetof(nstat_msg_src_update, data) + src->provider->nstat_descriptor_length;
        if (size > 512 || ((src->provider->nstat_descriptor_length == 0 ||
@@ -4176,8 +4255,8 @@ nstat_control_append_update(
 
 static errno_t
 nstat_control_send_removed(
-       nstat_control_state     *state,
-       nstat_src               *src)
+       nstat_control_state *state,
+       nstat_src           *src)
 {
        nstat_msg_src_removed removed;
        errno_t result;
@@ -4198,8 +4277,8 @@ nstat_control_send_removed(
 
 static errno_t
 nstat_control_handle_add_request(
-       nstat_control_state     *state,
-       mbuf_t                          m)
+       nstat_control_state *state,
+       mbuf_t              m)
 {
        errno_t result;
 
@@ -4214,7 +4293,7 @@ nstat_control_handle_add_request(
                return EINVAL;
        }
 
-       nstat_provider                  *provider = NULL;
+       nstat_provider          *provider = NULL;
        nstat_provider_cookie_t cookie = NULL;
        nstat_msg_add_src_req   *req = mbuf_data(m);
        if (mbuf_pkthdr_len(m) > mbuf_len(m)) {
@@ -4248,7 +4327,7 @@ nstat_control_handle_add_request(
 static errno_t
 nstat_set_provider_filter(
        nstat_control_state     *state,
-       nstat_msg_add_all_srcs *req)
+       nstat_msg_add_all_srcs  *req)
 {
        nstat_provider_id_t provider_id = req->provider;
 
@@ -4269,7 +4348,7 @@ nstat_set_provider_filter(
 static errno_t
 nstat_control_handle_add_all(
        nstat_control_state     *state,
-       mbuf_t                          m)
+       mbuf_t                  m)
 {
        errno_t result = 0;
 
@@ -4283,7 +4362,7 @@ nstat_control_handle_add_all(
                return ENOENT;
        }
 
-       nstat_provider                  *provider = nstat_find_provider_by_id(req->provider);
+       nstat_provider *provider = nstat_find_provider_by_id(req->provider);
 
        if (!provider) {
                return ENOENT;
@@ -4323,10 +4402,10 @@ nstat_control_handle_add_all(
 
 static errno_t
 nstat_control_source_add(
-       u_int64_t                       context,
-       nstat_control_state             *state,
-       nstat_provider                  *provider,
-       nstat_provider_cookie_t         cookie)
+       u_int64_t               context,
+       nstat_control_state     *state,
+       nstat_provider          *provider,
+       nstat_provider_cookie_t cookie)
 {
        // Fill out source added message if appropriate
        mbuf_t                  msg = NULL;
@@ -4416,8 +4495,8 @@ nstat_control_source_add(
 
 static errno_t
 nstat_control_handle_remove_request(
-       nstat_control_state     *state,
-       mbuf_t                          m)
+       nstat_control_state *state,
+       mbuf_t              m)
 {
        nstat_src_ref_t srcref = NSTAT_SRC_REF_INVALID;
        nstat_src *src;
@@ -4450,8 +4529,8 @@ nstat_control_handle_remove_request(
 
 static errno_t
 nstat_control_handle_query_request(
-       nstat_control_state     *state,
-       mbuf_t                          m)
+       nstat_control_state *state,
+       mbuf_t              m)
 {
        // TBD: handle this from another thread so we can enqueue a lot of data
        // As written, if a client requests query all, this function will be
@@ -4583,8 +4662,8 @@ nstat_control_handle_query_request(
 
 static errno_t
 nstat_control_handle_get_src_description(
-       nstat_control_state     *state,
-       mbuf_t                          m)
+       nstat_control_state *state,
+       mbuf_t              m)
 {
        nstat_msg_get_src_description   req;
        errno_t result = ENOENT;
@@ -4664,8 +4743,8 @@ nstat_control_handle_get_src_description(
 
 static errno_t
 nstat_control_handle_set_filter(
-       nstat_control_state         *state,
-       mbuf_t                      m)
+       nstat_control_state *state,
+       mbuf_t              m)
 {
        nstat_msg_set_filter req;
        nstat_src *src;
@@ -4798,7 +4877,7 @@ nstat_control_handle_get_update(
 
        TAILQ_FOREACH_SAFE(src, &state->ncs_src_queue, ns_control_link, tmpsrc)
        {
-               int                     gone;
+               int gone;
 
                gone = 0;
                if (nstat_control_reporting_allowed(state, src)) {
@@ -4824,7 +4903,7 @@ nstat_control_handle_get_update(
                                        src_count++;
                                }
                        } else if (src->srcref == req.srcref) {
-                               result = nstat_control_send_update(state, src, req.hdr.context, 0, &gone);
+                               result = nstat_control_send_update(state, src, req.hdr.context, 0, 0, &gone);
                        }
                }
 
@@ -4890,9 +4969,9 @@ nstat_control_handle_subscribe_sysinfo(
 static errno_t
 nstat_control_send(
        kern_ctl_ref    kctl,
-       u_int32_t               unit,
-       void    *uinfo,
-       mbuf_t                  m,
+       u_int32_t       unit,
+       void            *uinfo,
+       mbuf_t          m,
        __unused int    flags)
 {
        nstat_control_state     *state = (nstat_control_state*)uinfo;
@@ -5001,7 +5080,7 @@ nstat_control_send(
 
 
 static int
-tcp_progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_maxduration, struct xtcpprogress_indicators *indicators)
+tcp_progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_maxduration, uint16_t filter_flags, struct xtcpprogress_indicators *indicators)
 {
        int error = 0;
        struct inpcb *inp;
@@ -5020,7 +5099,9 @@ tcp_progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_
                if (tp && inp->inp_last_outifp &&
                    inp->inp_last_outifp->if_index == ifindex &&
                    inp->inp_state != INPCB_STATE_DEAD &&
-                   !(tp->t_flags & TF_LOCAL)) {
+                   ((filter_flags == 0) ||
+                   ((filter_flags & NSTAT_IFNET_IS_NON_LOCAL) && !(tp->t_flags & TF_LOCAL)) ||
+                   ((filter_flags & NSTAT_IFNET_IS_LOCAL) && (tp->t_flags & TF_LOCAL)))) {
                        struct tcp_conn_status connstatus;
                        indicators->xp_numflows++;
                        tcp_get_connectivity_status(tp, &connstatus);
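
tcp_progress_indicators_for_interface() previously hard-coded the exclusion of TF_LOCAL flows; the new filter_flags parameter makes that selectable. Zero now counts every flow on the interface, NSTAT_IFNET_IS_NON_LOCAL reproduces the old behavior, and NSTAT_IFNET_IS_LOCAL selects only local flows. A hedged caller sketch:

    /* Count only non-local flows on ifindex (the pre-change default). */
    error = tcp_progress_indicators_for_interface(ifindex,
        recentflow_maxduration, NSTAT_IFNET_IS_NON_LOCAL, &indicators);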
@@ -5077,7 +5158,7 @@ ntstat_tcp_progress_indicators(struct sysctl_req *req)
        if (error != 0) {
                return error;
        }
-       error = tcp_progress_indicators_for_interface(requested.ifindex, requested.recentflow_maxduration, &indicators);
+       error = tcp_progress_indicators_for_interface(requested.ifindex, requested.recentflow_maxduration, (uint16_t)requested.filter_flags, &indicators);
        if (error != 0) {
                return error;
        }
index 7f204fcf3d5662e8c7903bdcf0f3523d84bbc977..a5b976061350d2b1773b76042e532ef4ab0ce106 100644
@@ -45,16 +45,22 @@ typedef u_int64_t       nstat_event_flags_t;
 
 // The following event definitions are very provisional...
 enum{
-       NSTAT_EVENT_SRC_ADDED                                   = 0x00000001
-       , NSTAT_EVENT_SRC_REMOVED                                = 0x00000002
-       , NSTAT_EVENT_SRC_QUERIED                                = 0x00000004
-       , NSTAT_EVENT_SRC_QUERIED_ALL                    = 0x00000008
-       , NSTAT_EVENT_SRC_WILL_CHANGE_STATE              = 0x00000010
-       , NSTAT_EVENT_SRC_DID_CHANGE_STATE               = 0x00000020
-       , NSTAT_EVENT_SRC_WILL_CHANGE_OWNER              = 0x00000040
-       , NSTAT_EVENT_SRC_DID_CHANGE_OWNER               = 0x00000080
+       NSTAT_EVENT_SRC_ADDED                    = 0x00000001
+       , NSTAT_EVENT_SRC_REMOVED                = 0x00000002
+       , NSTAT_EVENT_SRC_QUERIED                = 0x00000004
+       , NSTAT_EVENT_SRC_QUERIED_ALL            = 0x00000008
+       , NSTAT_EVENT_SRC_WILL_CHANGE_STATE      = 0x00000010
+       , NSTAT_EVENT_SRC_DID_CHANGE_STATE       = 0x00000020
+       , NSTAT_EVENT_SRC_WILL_CHANGE_OWNER      = 0x00000040
+       , NSTAT_EVENT_SRC_DID_CHANGE_OWNER       = 0x00000080
        , NSTAT_EVENT_SRC_WILL_CHANGE_PROPERTY   = 0x00000100
        , NSTAT_EVENT_SRC_DID_CHANGE_PROPERTY    = 0x00000200
+       , NSTAT_EVENT_SRC_ENTER_CELLFALLBACK     = 0x00000400
+       , NSTAT_EVENT_SRC_EXIT_CELLFALLBACK      = 0x00000800
+#if (DEBUG || DEVELOPMENT)
+       , NSTAT_EVENT_SRC_RESERVED_1             = 0x00001000
+       , NSTAT_EVENT_SRC_RESERVED_2             = 0x00002000
+#endif /* (DEBUG || DEVELOPMENT) */
 };
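
These are bit flags, so one mask can watch several transitions at once; for
example (a sketch, not from the source):

    u_int64_t events = 0;   /* event mask as delivered in an nstat message */
    const u_int64_t fallback_events = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK |
        NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;

    if (events & fallback_events) {
            bool on_cell_fallback =
                (events & NSTAT_EVENT_SRC_ENTER_CELLFALLBACK) != 0;
            /* update per-flow accounting for the fallback state */
            (void)on_cell_fallback;
    }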
 
 typedef struct nstat_counts {
@@ -110,7 +116,7 @@ typedef struct nstat_sysinfo_counts {
 }  nstat_sysinfo_counts;
 
 enum{
-       NSTAT_SYSINFO_KEY_MBUF_256B_TOTAL       = 1
+       NSTAT_SYSINFO_KEY_MBUF_256B_TOTAL        = 1
        , NSTAT_SYSINFO_KEY_MBUF_2KB_TOTAL       = 2
        , NSTAT_SYSINFO_KEY_MBUF_4KB_TOTAL       = 3
        , NSTAT_SYSINFO_KEY_SOCK_MBCNT           = 4
@@ -325,31 +331,38 @@ enum{
 
 // Interface properties
 
-#define NSTAT_IFNET_IS_UNKNOWN_TYPE      0x01
-#define NSTAT_IFNET_IS_LOOPBACK          0x02
-#define NSTAT_IFNET_IS_CELLULAR          0x04
-#define NSTAT_IFNET_IS_WIFI              0x08
-#define NSTAT_IFNET_IS_WIRED             0x10
-#define NSTAT_IFNET_IS_AWDL              0x20
-#define NSTAT_IFNET_IS_EXPENSIVE         0x40
-#define NSTAT_IFNET_IS_VPN               0x80
-#define NSTAT_IFNET_VIA_CELLFALLBACK     0x100
+#define NSTAT_IFNET_IS_UNKNOWN_TYPE      0x0001
+#define NSTAT_IFNET_IS_LOOPBACK          0x0002
+#define NSTAT_IFNET_IS_CELLULAR          0x0004
+#define NSTAT_IFNET_IS_WIFI              0x0008
+#define NSTAT_IFNET_IS_WIRED             0x0010
+#define NSTAT_IFNET_IS_AWDL              0x0020
+#define NSTAT_IFNET_IS_EXPENSIVE         0x0040
+#define NSTAT_IFNET_IS_VPN               0x0080
+#define NSTAT_IFNET_VIA_CELLFALLBACK     0x0100
+#define NSTAT_IFNET_IS_COMPANIONLINK     0x0200
+#define NSTAT_IFNET_IS_CONSTRAINED       0x0400
+// The following local and non-local flags are set only when the locality is fully known
+// They are mutually exclusive, but there is no guarantee that either will be set
+#define NSTAT_IFNET_IS_LOCAL             0x0800
+#define NSTAT_IFNET_IS_NON_LOCAL         0x1000
 // Temporary properties used while bringing up userland providers
-#define NSTAT_IFNET_ROUTE_VALUE_UNOBTAINABLE      0x1000
-#define NSTAT_IFNET_FLOWSWITCH_VALUE_UNOBTAINABLE 0x2000
+#define NSTAT_IFNET_ROUTE_VALUE_UNOBTAINABLE      0x2000
+#define NSTAT_IFNET_FLOWSWITCH_VALUE_UNOBTAINABLE 0x4000
 
 
-enum{
-       NSTAT_PROVIDER_NONE     = 0
-       , NSTAT_PROVIDER_ROUTE   = 1
-       , NSTAT_PROVIDER_TCP_KERNEL      = 2
+typedef enum {
+       NSTAT_PROVIDER_NONE           = 0
+       , NSTAT_PROVIDER_ROUTE        = 1
+       , NSTAT_PROVIDER_TCP_KERNEL   = 2
        , NSTAT_PROVIDER_TCP_USERLAND = 3
-       , NSTAT_PROVIDER_UDP_KERNEL      = 4
+       , NSTAT_PROVIDER_UDP_KERNEL   = 4
        , NSTAT_PROVIDER_UDP_USERLAND = 5
-       , NSTAT_PROVIDER_IFNET   = 6
-       , NSTAT_PROVIDER_SYSINFO = 7
-};
-#define NSTAT_PROVIDER_LAST NSTAT_PROVIDER_SYSINFO
+       , NSTAT_PROVIDER_IFNET        = 6
+       , NSTAT_PROVIDER_SYSINFO      = 7
+       , NSTAT_PROVIDER_QUIC_USERLAND = 8
+} nstat_provider_type_t;
+#define NSTAT_PROVIDER_LAST NSTAT_PROVIDER_QUIC_USERLAND
 #define NSTAT_PROVIDER_COUNT (NSTAT_PROVIDER_LAST+1)
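
With the enum now a named type (nstat_provider_type_t) and NSTAT_PROVIDER_LAST
tracking the new QUIC entry, NSTAT_PROVIDER_COUNT keeps provider-indexed tables
sized automatically; a sketch (table contents hypothetical):

    static const char *nstat_provider_names[NSTAT_PROVIDER_COUNT] = {
            [NSTAT_PROVIDER_NONE]          = "none",
            [NSTAT_PROVIDER_ROUTE]         = "route",
            [NSTAT_PROVIDER_TCP_KERNEL]    = "tcp-kernel",
            [NSTAT_PROVIDER_TCP_USERLAND]  = "tcp-userland",
            [NSTAT_PROVIDER_UDP_KERNEL]    = "udp-kernel",
            [NSTAT_PROVIDER_UDP_USERLAND]  = "udp-userland",
            [NSTAT_PROVIDER_IFNET]         = "ifnet",
            [NSTAT_PROVIDER_SYSINFO]       = "sysinfo",
            [NSTAT_PROVIDER_QUIC_USERLAND] = "quic-userland",
    };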
 
 typedef struct nstat_route_add_param {
@@ -463,6 +476,15 @@ typedef struct nstat_udp_descriptor {
        u_int8_t        reserved[6];
 } nstat_udp_descriptor;
 
+/*
+ * XXX For now the TCP nstat descriptor is typedef'd to nstat_quic_descriptor,
+ * since the two currently report very similar data.
+ * Once the QUIC descriptor is extended, it can be given its own
+ * descriptor struct.
+ */
+typedef struct nstat_tcp_add_param      nstat_quic_add_param;
+typedef struct nstat_tcp_descriptor     nstat_quic_descriptor;
+
 typedef struct nstat_route_descriptor {
        u_int64_t       id __attribute__((aligned(sizeof(u_int64_t))));
        u_int64_t       parent_id __attribute__((aligned(sizeof(u_int64_t))));
@@ -666,26 +688,26 @@ typedef struct nstat_sysinfo_add_param {
 
 enum{
        // generic response messages
-       NSTAT_MSG_TYPE_SUCCESS                  = 0
-       , NSTAT_MSG_TYPE_ERROR                   = 1
+       NSTAT_MSG_TYPE_SUCCESS               = 0
+       , NSTAT_MSG_TYPE_ERROR               = 1
 
            // Requests
-       , NSTAT_MSG_TYPE_ADD_SRC                         = 1001
-       , NSTAT_MSG_TYPE_ADD_ALL_SRCS            = 1002
-       , NSTAT_MSG_TYPE_REM_SRC                         = 1003
-       , NSTAT_MSG_TYPE_QUERY_SRC                       = 1004
-       , NSTAT_MSG_TYPE_GET_SRC_DESC            = 1005
-       , NSTAT_MSG_TYPE_SET_FILTER                      = 1006
-       , NSTAT_MSG_TYPE_GET_UPDATE                      = 1007
-       , NSTAT_MSG_TYPE_SUBSCRIBE_SYSINFO       = 1008
+       , NSTAT_MSG_TYPE_ADD_SRC             = 1001
+       , NSTAT_MSG_TYPE_ADD_ALL_SRCS        = 1002
+       , NSTAT_MSG_TYPE_REM_SRC             = 1003
+       , NSTAT_MSG_TYPE_QUERY_SRC           = 1004
+       , NSTAT_MSG_TYPE_GET_SRC_DESC        = 1005
+       , NSTAT_MSG_TYPE_SET_FILTER          = 1006
+       , NSTAT_MSG_TYPE_GET_UPDATE          = 1007
+       , NSTAT_MSG_TYPE_SUBSCRIBE_SYSINFO   = 1008
 
            // Responses/Notifications
-       , NSTAT_MSG_TYPE_SRC_ADDED                               = 10001
-       , NSTAT_MSG_TYPE_SRC_REMOVED                             = 10002
-       , NSTAT_MSG_TYPE_SRC_DESC                                = 10003
-       , NSTAT_MSG_TYPE_SRC_COUNTS                              = 10004
-       , NSTAT_MSG_TYPE_SYSINFO_COUNTS                  = 10005
-       , NSTAT_MSG_TYPE_SRC_UPDATE                              = 10006
+       , NSTAT_MSG_TYPE_SRC_ADDED           = 10001
+       , NSTAT_MSG_TYPE_SRC_REMOVED         = 10002
+       , NSTAT_MSG_TYPE_SRC_DESC            = 10003
+       , NSTAT_MSG_TYPE_SRC_COUNTS          = 10004
+       , NSTAT_MSG_TYPE_SYSINFO_COUNTS      = 10005
+       , NSTAT_MSG_TYPE_SRC_UPDATE          = 10006
 };
 
 enum{
@@ -700,7 +722,7 @@ enum{
 
 /* Provider-level filters */
 enum{
-       NSTAT_FILTER_ACCEPT_UNKNOWN          = 0x00000001
+       NSTAT_FILTER_ACCEPT_UNKNOWN           = 0x00000001
        , NSTAT_FILTER_ACCEPT_LOOPBACK        = 0x00000002
        , NSTAT_FILTER_ACCEPT_CELLULAR        = 0x00000004
        , NSTAT_FILTER_ACCEPT_WIFI            = 0x00000008
@@ -708,13 +730,15 @@ enum{
        , NSTAT_FILTER_ACCEPT_AWDL            = 0x00000020
        , NSTAT_FILTER_ACCEPT_EXPENSIVE       = 0x00000040
        , NSTAT_FILTER_ACCEPT_CELLFALLBACK    = 0x00000100
-       , NSTAT_FILTER_IFNET_FLAGS            = 0x00000FFF
+       , NSTAT_FILTER_ACCEPT_COMPANIONLINK   = 0x00000200
+       , NSTAT_FILTER_ACCEPT_IS_CONSTRAINED  = 0x00000400
+       , NSTAT_FILTER_ACCEPT_IS_LOCAL        = 0x00000800
+       , NSTAT_FILTER_ACCEPT_IS_NON_LOCAL    = 0x00001000
+       , NSTAT_FILTER_IFNET_FLAGS            = 0x00001FFF
 
-       , NSTAT_FILTER_TCP_NO_LISTENER        = 0x00001000
-       , NSTAT_FILTER_TCP_ONLY_LISTENER      = 0x00002000
        , NSTAT_FILTER_TCP_INTERFACE_ATTACH   = 0x00004000
        , NSTAT_FILTER_TCP_NO_EARLY_CLOSE     = 0x00008000
-       , NSTAT_FILTER_TCP_FLAGS              = 0x0000F000
+       , NSTAT_FILTER_TCP_FLAGS              = 0x0000C000
 
        , NSTAT_FILTER_UDP_INTERFACE_ATTACH   = 0x00010000
        , NSTAT_FILTER_UDP_FLAGS              = 0x000F0000
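
Note the repacking: the ifnet accept mask widens to 0x00001FFF to cover the
four new accept bits, the TCP listener filters (formerly 0x1000 and 0x2000)
are retired, and NSTAT_FILTER_TCP_FLAGS shrinks to the two surviving bits
(0xC000). Composing a filter remains a simple OR, e.g. (sketch):

    u_int64_t filter = NSTAT_FILTER_ACCEPT_CELLULAR
        | NSTAT_FILTER_ACCEPT_CELLFALLBACK
        | NSTAT_FILTER_ACCEPT_IS_NON_LOCAL
        | NSTAT_FILTER_TCP_NO_EARLY_CLOSE;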
@@ -778,18 +802,18 @@ typedef struct nstat_msg_add_src_convenient {
 
 typedef struct nstat_msg_add_all_srcs {
        nstat_msg_hdr           hdr;
-       u_int64_t                       filter __attribute__((aligned(sizeof(u_int64_t))));
+       u_int64_t               filter __attribute__((aligned(sizeof(u_int64_t))));
        nstat_event_flags_t     events __attribute__((aligned(sizeof(u_int64_t))));
        nstat_provider_id_t     provider;
-       pid_t                           target_pid;
-       uuid_t                          target_uuid;
+       pid_t                   target_pid;
+       uuid_t                  target_uuid;
 } nstat_msg_add_all_srcs;
 
 typedef struct nstat_msg_src_added {
        nstat_msg_hdr           hdr;
        nstat_src_ref_t         srcref __attribute__((aligned(sizeof(u_int64_t))));
        nstat_provider_id_t     provider;
-       u_int8_t                        reserved[4];
+       u_int8_t                reserved[4];
 } nstat_msg_src_added;
 
 typedef struct nstat_msg_rem_src {
@@ -805,8 +829,8 @@ typedef struct nstat_msg_get_src_description {
 typedef struct nstat_msg_set_filter {
        nstat_msg_hdr           hdr;
        nstat_src_ref_t         srcref __attribute__((aligned(sizeof(u_int64_t))));
-       u_int32_t                       filter;
-       u_int8_t                        reserved[4];
+       u_int32_t               filter;
+       u_int8_t                reserved[4];
 } nstat_msg_set_filter;
 
 #define NSTAT_SRC_DESCRIPTION_FIELDS                                                                                            \
@@ -826,13 +850,14 @@ typedef struct nstat_msg_src_description_header {
 } nstat_msg_src_description_header;
 
 typedef struct nstat_msg_src_description_convenient {
-       nstat_msg_src_description_header        hdr;
+       nstat_msg_src_description_header    hdr;
        union {
-               nstat_tcp_descriptor                    tcp;
-               nstat_udp_descriptor                    udp;
-               nstat_route_descriptor                  route;
-               nstat_ifnet_descriptor                  ifnet;
-               nstat_sysinfo_descriptor                sysinfo;
+               nstat_tcp_descriptor            tcp;
+               nstat_udp_descriptor            udp;
+               nstat_route_descriptor          route;
+               nstat_ifnet_descriptor          ifnet;
+               nstat_sysinfo_descriptor        sysinfo;
+               nstat_quic_descriptor           quic;
        };
 } nstat_msg_src_description_convenient;
 
@@ -875,6 +900,7 @@ typedef struct nstat_msg_src_update_convenient {
                nstat_route_descriptor          route;
                nstat_ifnet_descriptor          ifnet;
                nstat_sysinfo_descriptor        sysinfo;
+               nstat_quic_descriptor           quic;
        };
 } nstat_msg_src_update_convenient;
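
As with the description message, the _convenient variant lets a client declare
one buffer large enough for any descriptor the kernel may send, now including
the QUIC member; a sketch:

    nstat_msg_src_update_convenient update;
    /* sizeof(update) bounds any single src-update payload; quic currently
     * aliases the TCP descriptor layout per the typedef above. */
    _Static_assert(sizeof(update.quic) == sizeof(nstat_tcp_descriptor),
        "quic descriptor aliases the TCP descriptor for now");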
 
@@ -920,91 +946,91 @@ struct nstat_stats {
 #pragma mark -- System Information Internal Support --
 
 typedef struct nstat_sysinfo_mbuf_stats {
-       u_int32_t               total_256b;     /* Peak usage, 256B pool */
-       u_int32_t               total_2kb;      /* Peak usage, 2KB pool */
-       u_int32_t               total_4kb;      /* Peak usage, 4KB pool */
-       u_int32_t               total_16kb;     /* Peak usage, 16KB pool */
-       u_int32_t               sbmb_total;     /* Total mbufs in sock buffer pool */
-       u_int32_t               sb_atmbuflimit; /* Memory limit reached for socket buffer autoscaling */
-       u_int32_t               draincnt;       /* Number of times mbuf pool has been drained under memory pressure */
-       u_int32_t               memreleased;    /* Memory (bytes) released from mbuf pool to VM */
-       u_int32_t               sbmb_floor;     /* Lowest mbufs in sock buffer pool */
+       u_int32_t       total_256b;     /* Peak usage, 256B pool */
+       u_int32_t       total_2kb;      /* Peak usage, 2KB pool */
+       u_int32_t       total_4kb;      /* Peak usage, 4KB pool */
+       u_int32_t       total_16kb;     /* Peak usage, 16KB pool */
+       u_int32_t       sbmb_total;     /* Total mbufs in sock buffer pool */
+       u_int32_t       sb_atmbuflimit; /* Memory limit reached for socket buffer autoscaling */
+       u_int32_t       draincnt;       /* Number of times mbuf pool has been drained under memory pressure */
+       u_int32_t       memreleased;    /* Memory (bytes) released from mbuf pool to VM */
+       u_int32_t       sbmb_floor;     /* Lowest mbufs in sock buffer pool */
 } nstat_sysinfo_mbuf_stats;
 
 typedef struct nstat_sysinfo_tcp_stats {
        /* When adding/removing here, also adjust NSTAT_SYSINFO_TCP_STATS_COUNT */
-       u_int32_t               ipv4_avgrtt;    /* Average RTT for IPv4 */
-       u_int32_t               ipv6_avgrtt;    /* Average RTT for IPv6 */
-       u_int32_t               send_plr;       /* Average uplink packet loss rate */
-       u_int32_t               recv_plr;       /* Average downlink packet loss rate */
-       u_int32_t               send_tlrto_rate; /* Average rxt timeout after tail loss */
-       u_int32_t               send_reorder_rate; /* Average packet reordering rate */
-       u_int32_t               connection_attempts; /* TCP client connection attempts */
-       u_int32_t               connection_accepts; /* TCP server connection accepts */
-       u_int32_t               ecn_client_enabled; /* Global setting for ECN client side */
-       u_int32_t               ecn_server_enabled; /* Global setting for ECN server side */
-       u_int32_t               ecn_client_setup; /* Attempts to setup TCP client connection with ECN */
-       u_int32_t               ecn_server_setup; /* Attempts to setup TCP server connection with ECN */
-       u_int32_t               ecn_client_success; /* Number of successful negotiations of ECN for a client connection */
-       u_int32_t               ecn_server_success; /* Number of successful negotiations of ECN for a server connection */
-       u_int32_t               ecn_not_supported; /* Number of falbacks to Non-ECN, no support from peer */
-       u_int32_t               ecn_lost_syn;   /* Number of SYNs lost with ECN bits */
-       u_int32_t               ecn_lost_synack; /* Number of SYN-ACKs lost with ECN bits */
-       u_int32_t               ecn_recv_ce;    /* Number of CEs received from network */
-       u_int32_t               ecn_recv_ece;   /* Number of ECEs received from receiver */
-       u_int32_t               ecn_sent_ece;   /* Number of ECEs sent in response to CE */
-       u_int32_t               ecn_conn_recv_ce; /* Number of connections using ECN received CE at least once */
-       u_int32_t               ecn_conn_recv_ece; /* Number of connections using ECN received ECE at least once */
-       u_int32_t               ecn_conn_plnoce; /* Number of connections using ECN seen packet loss but never received CE */
-       u_int32_t               ecn_conn_pl_ce; /* Number of connections using ECN seen packet loss and CE */
-       u_int32_t               ecn_conn_nopl_ce; /* Number of connections using ECN with no packet loss but received CE */
-       u_int32_t               ecn_fallback_synloss; /* Number of times we did fall back due to SYN-Loss */
-       u_int32_t               ecn_fallback_reorder; /* Number of times we fallback because we detected the PAWS-issue */
-       u_int32_t               ecn_fallback_ce; /* Number of times we fallback because we received too many CEs */
-       u_int32_t               tfo_syn_data_rcv;       /* Number of SYN+data received with valid cookie */
-       u_int32_t               tfo_cookie_req_rcv;/* Number of TFO cookie-requests received */
-       u_int32_t               tfo_cookie_sent;        /* Number of TFO-cookies offered to the client */
-       u_int32_t               tfo_cookie_invalid;/* Number of invalid TFO-cookies received */
-       u_int32_t               tfo_cookie_req; /* Number of SYNs with cookie request received*/
-       u_int32_t               tfo_cookie_rcv; /* Number of SYN/ACKs with Cookie received */
-       u_int32_t               tfo_syn_data_sent;      /* Number of SYNs+data+cookie sent */
-       u_int32_t               tfo_syn_data_acked;/* Number of times our SYN+data has been acknowledged */
-       u_int32_t               tfo_syn_loss;   /* Number of times SYN+TFO has been lost and we fallback */
-       u_int32_t               tfo_blackhole;  /* Number of times SYN+TFO has been lost and we fallback */
-       u_int32_t               tfo_cookie_wrong;       /* TFO-cookie we sent was wrong */
-       u_int32_t               tfo_no_cookie_rcv;      /* We asked for a cookie but didn't get one */
-       u_int32_t               tfo_heuristics_disable; /* TFO got disabled due to heuristics */
-       u_int32_t               tfo_sndblackhole;       /* TFO got blackholed in the sending direction */
-       u_int32_t               mptcp_handover_attempt; /* Total number of MPTCP-attempts using handover mode */
-       u_int32_t               mptcp_interactive_attempt;      /* Total number of MPTCP-attempts using interactive mode */
-       u_int32_t               mptcp_aggregate_attempt;        /* Total number of MPTCP-attempts using aggregate mode */
-       u_int32_t               mptcp_fp_handover_attempt; /* Same as previous three but only for first-party apps */
-       u_int32_t               mptcp_fp_interactive_attempt;
-       u_int32_t               mptcp_fp_aggregate_attempt;
-       u_int32_t               mptcp_heuristic_fallback;       /* Total number of MPTCP-connections that fell back due to heuristics */
-       u_int32_t               mptcp_fp_heuristic_fallback;    /* Same as previous but for first-party apps */
-       u_int32_t               mptcp_handover_success_wifi;    /* Total number of successfull handover-mode connections that *started* on WiFi */
-       u_int32_t               mptcp_handover_success_cell;    /* Total number of successfull handover-mode connections that *started* on Cell */
-       u_int32_t               mptcp_interactive_success;              /* Total number of interactive-mode connections that negotiated MPTCP */
-       u_int32_t               mptcp_aggregate_success;                /* Same as previous but for aggregate */
-       u_int32_t               mptcp_fp_handover_success_wifi; /* Same as previous four, but for first-party apps */
-       u_int32_t               mptcp_fp_handover_success_cell;
-       u_int32_t               mptcp_fp_interactive_success;
-       u_int32_t               mptcp_fp_aggregate_success;
-       u_int32_t               mptcp_handover_cell_from_wifi;  /* Total number of connections that use cell in handover-mode (coming from WiFi) */
-       u_int32_t               mptcp_handover_wifi_from_cell;  /* Total number of connections that use WiFi in handover-mode (coming from cell) */
-       u_int32_t               mptcp_interactive_cell_from_wifi;       /* Total number of connections that use cell in interactive mode (coming from WiFi) */
-       u_int32_t               mptcp_back_to_wifi;     /* Total number of connections that succeed to move traffic away from cell (when starting on cell) */
-       u_int64_t               mptcp_handover_cell_bytes;              /* Total number of bytes sent on cell in handover-mode (on new subflows, ignoring initial one) */
-       u_int64_t               mptcp_interactive_cell_bytes;   /* Same as previous but for interactive */
-       u_int64_t               mptcp_aggregate_cell_bytes;
-       u_int64_t               mptcp_handover_all_bytes;               /* Total number of bytes sent in handover */
-       u_int64_t               mptcp_interactive_all_bytes;
-       u_int64_t               mptcp_aggregate_all_bytes;
-       u_int32_t               mptcp_wifi_proxy;               /* Total number of new subflows that fell back to regular TCP on cell */
-       u_int32_t               mptcp_cell_proxy;               /* Total number of new subflows that fell back to regular TCP on WiFi */
-       u_int32_t               mptcp_triggered_cell;           /* Total number of times an MPTCP-connection triggered cell bringup */
-       u_int32_t               _padding;
+       u_int32_t       ipv4_avgrtt;    /* Average RTT for IPv4 */
+       u_int32_t       ipv6_avgrtt;    /* Average RTT for IPv6 */
+       u_int32_t       send_plr;       /* Average uplink packet loss rate */
+       u_int32_t       recv_plr;       /* Average downlink packet loss rate */
+       u_int32_t       send_tlrto_rate; /* Average rxt timeout after tail loss */
+       u_int32_t       send_reorder_rate; /* Average packet reordering rate */
+       u_int32_t       connection_attempts; /* TCP client connection attempts */
+       u_int32_t       connection_accepts; /* TCP server connection accepts */
+       u_int32_t       ecn_client_enabled; /* Global setting for ECN client side */
+       u_int32_t       ecn_server_enabled; /* Global setting for ECN server side */
+       u_int32_t       ecn_client_setup; /* Attempts to setup TCP client connection with ECN */
+       u_int32_t       ecn_server_setup; /* Attempts to setup TCP server connection with ECN */
+       u_int32_t       ecn_client_success; /* Number of successful negotiations of ECN for a client connection */
+       u_int32_t       ecn_server_success; /* Number of successful negotiations of ECN for a server connection */
+       u_int32_t       ecn_not_supported; /* Number of fallbacks to Non-ECN, no support from peer */
+       u_int32_t       ecn_lost_syn;   /* Number of SYNs lost with ECN bits */
+       u_int32_t       ecn_lost_synack; /* Number of SYN-ACKs lost with ECN bits */
+       u_int32_t       ecn_recv_ce;    /* Number of CEs received from network */
+       u_int32_t       ecn_recv_ece;   /* Number of ECEs received from receiver */
+       u_int32_t       ecn_sent_ece;   /* Number of ECEs sent in response to CE */
+       u_int32_t       ecn_conn_recv_ce; /* Number of connections using ECN received CE at least once */
+       u_int32_t       ecn_conn_recv_ece; /* Number of connections using ECN received ECE at least once */
+       u_int32_t       ecn_conn_plnoce; /* Number of connections using ECN seen packet loss but never received CE */
+       u_int32_t       ecn_conn_pl_ce; /* Number of connections using ECN seen packet loss and CE */
+       u_int32_t       ecn_conn_nopl_ce; /* Number of connections using ECN with no packet loss but received CE */
+       u_int32_t       ecn_fallback_synloss; /* Number of times we did fall back due to SYN-Loss */
+       u_int32_t       ecn_fallback_reorder; /* Number of times we fell back because we detected the PAWS issue */
+       u_int32_t       ecn_fallback_ce; /* Number of times we fell back because we received too many CEs */
+       u_int32_t       tfo_syn_data_rcv;       /* Number of SYN+data received with valid cookie */
+       u_int32_t       tfo_cookie_req_rcv;/* Number of TFO cookie-requests received */
+       u_int32_t       tfo_cookie_sent;        /* Number of TFO-cookies offered to the client */
+       u_int32_t       tfo_cookie_invalid;/* Number of invalid TFO-cookies received */
+       u_int32_t       tfo_cookie_req; /* Number of SYNs with cookie request received */
+       u_int32_t       tfo_cookie_rcv; /* Number of SYN/ACKs with Cookie received */
+       u_int32_t       tfo_syn_data_sent;      /* Number of SYNs+data+cookie sent */
+       u_int32_t       tfo_syn_data_acked;/* Number of times our SYN+data has been acknowledged */
+       u_int32_t       tfo_syn_loss;   /* Number of times SYN+TFO has been lost and we fallback */
+       u_int32_t       tfo_blackhole;  /* Number of times SYN+TFO has been lost and we fallback */
+       u_int32_t       tfo_cookie_wrong;       /* TFO-cookie we sent was wrong */
+       u_int32_t       tfo_no_cookie_rcv;      /* We asked for a cookie but didn't get one */
+       u_int32_t       tfo_heuristics_disable; /* TFO got disabled due to heuristics */
+       u_int32_t       tfo_sndblackhole;       /* TFO got blackholed in the sending direction */
+       u_int32_t       mptcp_handover_attempt; /* Total number of MPTCP-attempts using handover mode */
+       u_int32_t       mptcp_interactive_attempt;      /* Total number of MPTCP-attempts using interactive mode */
+       u_int32_t       mptcp_aggregate_attempt;        /* Total number of MPTCP-attempts using aggregate mode */
+       u_int32_t       mptcp_fp_handover_attempt; /* Same as previous three but only for first-party apps */
+       u_int32_t       mptcp_fp_interactive_attempt;
+       u_int32_t       mptcp_fp_aggregate_attempt;
+       u_int32_t       mptcp_heuristic_fallback;       /* Total number of MPTCP-connections that fell back due to heuristics */
+       u_int32_t       mptcp_fp_heuristic_fallback;    /* Same as previous but for first-party apps */
+       u_int32_t       mptcp_handover_success_wifi;    /* Total number of successful handover-mode connections that *started* on WiFi */
+       u_int32_t       mptcp_handover_success_cell;    /* Total number of successful handover-mode connections that *started* on Cell */
+       u_int32_t       mptcp_interactive_success;              /* Total number of interactive-mode connections that negotiated MPTCP */
+       u_int32_t       mptcp_aggregate_success;                /* Same as previous but for aggregate */
+       u_int32_t       mptcp_fp_handover_success_wifi; /* Same as previous four, but for first-party apps */
+       u_int32_t       mptcp_fp_handover_success_cell;
+       u_int32_t       mptcp_fp_interactive_success;
+       u_int32_t       mptcp_fp_aggregate_success;
+       u_int32_t       mptcp_handover_cell_from_wifi;  /* Total number of connections that use cell in handover-mode (coming from WiFi) */
+       u_int32_t       mptcp_handover_wifi_from_cell;  /* Total number of connections that use WiFi in handover-mode (coming from cell) */
+       u_int32_t       mptcp_interactive_cell_from_wifi;       /* Total number of connections that use cell in interactive mode (coming from WiFi) */
+       u_int32_t       mptcp_back_to_wifi;     /* Total number of connections that succeeded in moving traffic away from cell (when starting on cell) */
+       u_int64_t       mptcp_handover_cell_bytes;              /* Total number of bytes sent on cell in handover-mode (on new subflows, ignoring initial one) */
+       u_int64_t       mptcp_interactive_cell_bytes;   /* Same as previous but for interactive */
+       u_int64_t       mptcp_aggregate_cell_bytes;
+       u_int64_t       mptcp_handover_all_bytes;               /* Total number of bytes sent in handover */
+       u_int64_t       mptcp_interactive_all_bytes;
+       u_int64_t       mptcp_aggregate_all_bytes;
+       u_int32_t       mptcp_wifi_proxy;               /* Total number of new subflows that fell back to regular TCP on cell */
+       u_int32_t       mptcp_cell_proxy;               /* Total number of new subflows that fell back to regular TCP on WiFi */
+       u_int32_t       mptcp_triggered_cell;           /* Total number of times an MPTCP-connection triggered cell bringup */
+       u_int32_t       _padding;
        /* When adding/removing here, also adjust NSTAT_SYSINFO_TCP_STATS_COUNT */
 } nstat_sysinfo_tcp_stats;
 #define NSTAT_SYSINFO_TCP_STATS_COUNT   71
@@ -1021,25 +1047,25 @@ enum {
 };
 
 typedef struct nstat_sysinfo_ifnet_ecn_stats {
-       u_int32_t                       ifnet_proto;
-       u_int32_t                       ifnet_type;
-       struct if_tcp_ecn_stat          ecn_stat;
+       u_int32_t               ifnet_proto;
+       u_int32_t               ifnet_type;
+       struct if_tcp_ecn_stat  ecn_stat;
 } nstat_sysinfo_ifnet_ecn_stats;
 
 /* Total number of Low Internet stats that will be reported */
 #define NSTAT_LIM_STAT_KEYVAL_COUNT     12
 typedef struct nstat_sysinfo_lim_stats {
-       u_int8_t                        ifnet_signature[NSTAT_SYSINFO_KEYVAL_STRING_MAXSIZE];
-       u_int32_t                       ifnet_siglen;
-       u_int32_t                       ifnet_type;
-       struct if_lim_perf_stat         lim_stat;
+       u_int8_t                ifnet_signature[NSTAT_SYSINFO_KEYVAL_STRING_MAXSIZE];
+       u_int32_t               ifnet_siglen;
+       u_int32_t               ifnet_type;
+       struct if_lim_perf_stat lim_stat;
 } nstat_sysinfo_lim_stats;
 
 #define NSTAT_NET_API_STAT_KEYVAL_COUNT (NSTAT_SYSINFO_API_LAST - NSTAT_SYSINFO_API_FIRST + 1)
 typedef struct nstat_sysinfo_net_api_stats {
-       u_int32_t                       report_interval;
-       u_int32_t                       _padding;
-       struct net_api_stats            net_api_stats;
+       u_int32_t               report_interval;
+       u_int32_t               _padding;
+       struct net_api_stats    net_api_stats;
 } nstat_sysinfo_net_api_stats;
 
 typedef struct nstat_sysinfo_data {
@@ -1094,6 +1120,7 @@ void nstat_tcp_new_pcb(struct inpcb *inp);
 void nstat_udp_new_pcb(struct inpcb *inp);
 void nstat_route_new_entry(struct rtentry *rt);
 void nstat_pcb_detach(struct inpcb *inp);
+void nstat_pcb_event(struct inpcb *inp, u_int64_t event);
 void nstat_pcb_cache(struct inpcb *inp);
 void nstat_pcb_invalidate_cache(struct inpcb *inp);
 
index 9eaa778e5bd85d33945091bf1fdd59935fbe74b3..e0e97386263407464ec49725e0df8136e1af2c9b 100644 (file)
@@ -112,6 +112,7 @@ nwk_wq_thread_cont(int err)
        }
 }
 
+__dead2
 static void
 nwk_wq_thread_func(void *v, wait_result_t w)
 {
index db7b4e6436c8b375e579ccc5145ec9a357f62ae6..8e7f41be02a75853811d8e7531fcddb3b5aa6a31 100644 (file)
@@ -50,6 +50,7 @@
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/ip.h>
+#include <netinet/ip6.h>
 #include <netinet/kpi_ipfilter.h>
 #include <string.h>
 #include <libkern/libkern.h>
@@ -840,6 +841,7 @@ static errno_t
 pktmnglr_ipfilter_input(void *cookie, mbuf_t *data, int offset, u_int8_t protocol)
 {
        struct packet_mangler *p_pkt_mnglr = (struct packet_mangler *)cookie;
+       struct ip6_hdr ip6;
        struct ip ip;
        struct tcphdr tcp;
        int ip_pld_len;
@@ -871,6 +873,14 @@ pktmnglr_ipfilter_input(void *cookie, mbuf_t *data, int offset, u_int8_t protoco
                goto input_done;
        }
 
+       if (ip.ip_v == 6) {
+               error = mbuf_copydata(*data, 0, sizeof(ip6), &ip6);
+               if (error) {
+                       PKT_MNGLR_LOG(LOG_ERR, "Could not make local IPv6 header copy");
+                       goto input_done;
+               }
+       }
+
        if ((p_pkt_mnglr->lsaddr.ss_family == AF_INET6) && (ip.ip_v == 4)) {
                PKT_MNGLR_LOG(LOG_INFO, "Skipping filtering as address family of packet is IPv4 but local "
                    "address is set to IPv6");
@@ -888,6 +898,11 @@ pktmnglr_ipfilter_input(void *cookie, mbuf_t *data, int offset, u_int8_t protoco
                if (ip.ip_dst.s_addr != laddr.sin_addr.s_addr) {
                        goto input_done;
                }
+       } else if (p_pkt_mnglr->lsaddr.ss_family == AF_INET6) {
+               struct sockaddr_in6 laddr = *(struct sockaddr_in6 *)(&(p_pkt_mnglr->lsaddr));
+               if (!IN6_ARE_ADDR_EQUAL(&ip6.ip6_dst, &laddr.sin6_addr)) {
+                       goto input_done;
+               }
        }
 
        if (p_pkt_mnglr->rsaddr.ss_family == AF_INET) {
@@ -898,13 +913,25 @@ pktmnglr_ipfilter_input(void *cookie, mbuf_t *data, int offset, u_int8_t protoco
                PKT_MNGLR_LOG(LOG_INFO, "Remote IP: %x Source IP: %x in input path",
                    raddr.sin_addr.s_addr,
                    ip.ip_src.s_addr);
+       } else if (p_pkt_mnglr->rsaddr.ss_family == AF_INET6) {
+               struct sockaddr_in6 raddr = *(struct sockaddr_in6 *)(&(p_pkt_mnglr->rsaddr));
+               if (!IN6_ARE_ADDR_EQUAL(&ip6.ip6_src, &raddr.sin6_addr)) {
+                       goto input_done;
+               }
        }
 
-       if (ip.ip_v != 4) {
+       if (ip.ip_v == 4) {
+               ip_pld_len = ntohs(ip.ip_len) - (ip.ip_hl << 2);
+       } else if (ip.ip_v == 6) {
+               if (ip6.ip6_nxt != p_pkt_mnglr->proto) {
+                       /* Don't support IPv6 extension headers */
+                       goto input_done;
+               }
+               ip_pld_len = ntohs(ip6.ip6_plen) + sizeof(struct ip6_hdr);
+       } else {
                goto input_done;
        }
 
-       ip_pld_len = ntohs(ip.ip_len) - (ip.ip_hl << 2);
 
        if (protocol != p_pkt_mnglr->proto) {
                PKT_MNGLR_LOG(LOG_INFO, "Skip: Protocol mismatch");
index e9fa2a37dc90975645123fa673b6dc322d26b1d0..87a3925338bd3a07050527a2222260efb4cb0141 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -328,12 +328,12 @@ extern struct pool pfr_kentry_pl;
 extern int path_mtu_discovery;
 
 struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = {
-       { &pf_state_pl, PFSTATE_HIWAT },
-       { &pf_app_state_pl, PFAPPSTATE_HIWAT },
-       { &pf_src_tree_pl, PFSNODE_HIWAT },
-       { &pf_frent_pl, PFFRAG_FRENT_HIWAT },
-       { &pfr_ktable_pl, PFR_KTABLE_HIWAT },
-       { &pfr_kentry_pl, PFR_KENTRY_HIWAT },
+       { .pp = &pf_state_pl, .limit = PFSTATE_HIWAT },
+       { .pp = &pf_app_state_pl, .limit = PFAPPSTATE_HIWAT },
+       { .pp = &pf_src_tree_pl, .limit = PFSNODE_HIWAT },
+       { .pp = &pf_frent_pl, .limit = PFFRAG_FRENT_HIWAT },
+       { .pp = &pfr_ktable_pl, .limit = PFR_KTABLE_HIWAT },
+       { .pp = &pfr_kentry_pl, .limit = PFR_KENTRY_HIWAT },
 };
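
Switching to designated initializers binds each value to a field name, so the
table stays correct even if struct pf_pool_limit changes layout (the same
conversion is applied to the cdevsw table further down). In miniature, with
the field names taken from the initializers above:

    struct pf_pool_limit entry = {
            .pp = &pf_state_pl,         /* backing pool */
            .limit = PFSTATE_HIWAT,     /* high-water mark */
    };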
 
 void *
@@ -2381,8 +2381,10 @@ pf_change_addr(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u,
 {
        struct pf_addr  ao;
 
-       PF_ACPY(&ao, a, af);
-       PF_ACPY(a, an, afn);
+       if (af != afn) {
+               PF_ACPY(&ao, a, af);
+               PF_ACPY(a, an, afn);
+       }
 
        switch (af) {
        case AF_INET:
@@ -6081,24 +6083,6 @@ pf_is_dummynet_enabled(void)
 #endif /* DUMMYNET */
 }
 
-boolean_t
-pf_is_nlc_enabled(void)
-{
-#if DUMMYNET
-       if (__probable(!pf_is_dummynet_enabled())) {
-               return FALSE;
-       }
-
-       if (__probable(!is_nlc_enabled_glb)) {
-               return FALSE;
-       }
-
-       return TRUE;
-#else
-       return FALSE;
-#endif /* DUMMYNET */
-}
-
 #if DUMMYNET
 /*
  * When pf_test_dummynet() returns PF_PASS, the rule matching parameter "rm"
@@ -6347,7 +6331,6 @@ pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif,
                        dnflow.fwa_ro6_pmtu = fwa->fwa_ro6_pmtu;
                        dnflow.fwa_origifp = fwa->fwa_origifp;
                        dnflow.fwa_mtu = fwa->fwa_mtu;
-                       dnflow.fwa_alwaysfrag = fwa->fwa_alwaysfrag;
                        dnflow.fwa_unfragpartlen = fwa->fwa_unfragpartlen;
                        dnflow.fwa_exthdrs = fwa->fwa_exthdrs;
                }
@@ -7826,7 +7809,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
                                        pf_change_a6(saddr,
                                            &pd->hdr.icmp6->icmp6_cksum,
                                            &sk->gwy.addr, 0);
-                                       if (pf_lazy_makewritable(pd, NULL,
+                                       if (pf_lazy_makewritable(pd, pbuf,
                                            off + sizeof(struct icmp6_hdr)) ==
                                            NULL) {
                                                return PF_DROP;
@@ -9340,6 +9323,7 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp,
        struct pf_addr           naddr;
        struct pf_src_node      *sn = NULL;
        int                      error = 0;
+       struct pf_mtag          *pf_mtag;
 
        if (pbufp == NULL || !pbuf_is_valid(*pbufp) || r == NULL ||
            (dir != PF_IN && dir != PF_OUT) || oifp == NULL) {
@@ -9388,11 +9372,8 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp,
 
        /* Cheat. XXX why only in the v6addr case??? */
        if (r->rt == PF_FASTROUTE) {
-               struct pf_mtag *pf_mtag;
-
-               if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
-                       goto bad;
-               }
+               pf_mtag = pf_get_mtag(m0);
+               ASSERT(pf_mtag != NULL);
                pf_mtag->pftag_flags |= PF_TAG_GENERATED;
                ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
                return;
@@ -9433,6 +9414,24 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp,
                            "< sizeof (struct ip6_hdr)\n"));
                        goto bad;
                }
+               pf_mtag = pf_get_mtag(m0);
+               /*
+        * Send refragmented packets.
+                */
+               if ((pf_mtag->pftag_flags & PF_TAG_REFRAGMENTED) != 0) {
+                       pf_mtag->pftag_flags &= ~PF_TAG_REFRAGMENTED;
+                       /*
+                        * nd6_output() frees packet chain in both success and
+                        * failure cases.
+                        */
+                       error = nd6_output(ifp, ifp, m0, dst, NULL, NULL);
+                       m0 = NULL;
+                       if (error) {
+                               DPFPRINTF(PF_DEBUG_URGENT, ("pf_route6:"
+                                   "dropped refragmented packet\n"));
+                       }
+                       goto done;
+               }
                ip6 = mtod(m0, struct ip6_hdr *);
        }
 
@@ -9460,6 +9459,7 @@ done:
 bad:
        if (m0) {
                m_freem(m0);
+               m0 = NULL;
        }
        goto done;
 }
@@ -9648,6 +9648,11 @@ pf_test(int dir, struct ifnet *ifp, pbuf_t **pbufp,
                return PF_PASS;
        }
 
+       if (pbuf->pb_packet_len < (int)sizeof(*h)) {
+               REASON_SET(&reason, PFRES_SHORT);
+               return PF_DROP;
+       }
+
        /* initialize enough of pd for the done label */
        h = pbuf->pb_data;
        pd.mp = pbuf;
@@ -9666,13 +9671,6 @@ pf_test(int dir, struct ifnet *ifp, pbuf_t **pbufp,
        pd.tot_len = ntohs(h->ip_len);
        pd.eh = eh;
 
-       if (pbuf->pb_packet_len < (int)sizeof(*h)) {
-               action = PF_DROP;
-               REASON_SET(&reason, PFRES_SHORT);
-               log = 1;
-               goto done;
-       }
-
 #if DUMMYNET
        if (fwa != NULL && fwa->fwa_pf_rule != NULL) {
                goto nonormalize;
@@ -10209,9 +10207,15 @@ pf_test6(int dir, struct ifnet *ifp, pbuf_t **pbufp,
        struct pf_pdesc          pd;
        int                      off, terminal = 0, dirndx, rh_cnt = 0;
        u_int8_t                 nxt;
+       boolean_t                fwd = FALSE;
 
        LCK_MTX_ASSERT(pf_lock, LCK_MTX_ASSERT_OWNED);
 
+       ASSERT(ifp != NULL);
+       if ((dir == PF_OUT) && (pbuf->pb_ifp) && (ifp != pbuf->pb_ifp)) {
+               fwd = TRUE;
+       }
+
        if (!pf_status.running) {
                return PF_PASS;
        }
@@ -10239,8 +10243,12 @@ pf_test6(int dir, struct ifnet *ifp, pbuf_t **pbufp,
                return PF_PASS;
        }
 
-       h = pbuf->pb_data;
+       if (pbuf->pb_packet_len < (int)sizeof(*h)) {
+               REASON_SET(&reason, PFRES_SHORT);
+               return PF_DROP;
+       }
 
+       h = pbuf->pb_data;
        nxt = h->ip6_nxt;
        off = ((caddr_t)h - (caddr_t)pbuf->pb_data) + sizeof(struct ip6_hdr);
        pd.mp = pbuf;
@@ -10266,13 +10274,6 @@ pf_test6(int dir, struct ifnet *ifp, pbuf_t **pbufp,
                pd.pktflags = (*pbuf->pb_flags & PKTF_FLOW_MASK);
        }
 
-       if (pbuf->pb_packet_len < (int)sizeof(*h)) {
-               action = PF_DROP;
-               REASON_SET(&reason, PFRES_SHORT);
-               log = 1;
-               goto done;
-       }
-
 #if DUMMYNET
        if (fwa != NULL && fwa->fwa_pf_rule != NULL) {
                goto nonormalize;
@@ -10302,7 +10303,6 @@ nonormalize:
                goto done;
        }
 #endif
-
        pd.src = (struct pf_addr *)(uintptr_t)&h->ip6_src;
        pd.dst = (struct pf_addr *)(uintptr_t)&h->ip6_dst;
        PF_ACPY(&pd.baddr, pd.src, AF_INET6);
@@ -10322,7 +10322,7 @@ nonormalize:
        pd.pf_mtag = pf_get_mtag_pbuf(pbuf);
 
        do {
-               switch (nxt) {
+               switch (pd.proto) {
                case IPPROTO_FRAGMENT: {
                        struct ip6_frag ip6f;
 
@@ -10336,7 +10336,7 @@ nonormalize:
                                log = 1;
                                goto done;
                        }
-                       pd.proto = nxt = ip6f.ip6f_nxt;
+                       pd.proto = ip6f.ip6f_nxt;
 #if DUMMYNET
                        /* Traffic goes through dummynet first */
                        action = pf_test_dummynet(&r, dir, kif, &pbuf, &pd,
@@ -10377,7 +10377,7 @@ nonormalize:
                        } else {
                                off += (opt6.ip6e_len + 1) * 8;
                        }
-                       nxt = opt6.ip6e_nxt;
+                       pd.proto = opt6.ip6e_nxt;
                        /* goto the next header */
                        break;
                }
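
The walk now records the current header type in pd.proto instead of the local
nxt, so the packet descriptor always reflects the innermost header parsed so
far. In outline, this is the standard IPv6 extension-header walk (a sketch
omitting the fragment and AH special cases and all bounds checks):

    uint8_t proto = h->ip6_nxt;
    int off = sizeof(struct ip6_hdr);
    struct ip6_ext ext;

    while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING ||
        proto == IPPROTO_DSTOPTS) {
            /* pull 'ext' from the packet at 'off', then advance */
            off += (ext.ip6e_len + 1) * 8;  /* ip6e_len excludes the first 8 bytes */
            proto = ext.ip6e_nxt;
    }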
@@ -10800,11 +10800,17 @@ done:
                *pbufp = NULL;
                action = PF_PASS;
        } else if (r->rt) {
-               /* pf_route6 can free the mbuf causing *m0 to become NULL */
+               /* pf_route6 can free the mbuf causing *pbufp to become NULL */
                pf_route6(pbufp, r, dir, kif->pfik_ifp, s, &pd);
        }
 #endif /* 0 */
 
+       /* if reassembled packet passed, create new fragments */
+       struct pf_fragment_tag *ftag = NULL;
+       if ((action == PF_PASS) && (*pbufp != NULL) && (fwd) &&
+           ((ftag = pf_find_fragment_tag_pbuf(*pbufp)) != NULL)) {
+               action = pf_refragment6(ifp, pbufp, ftag);
+       }
        return action;
 }
 #endif /* INET6 */
@@ -10909,6 +10915,51 @@ pf_get_mtag_pbuf(pbuf_t *pbuf)
        return pf_find_mtag_pbuf(pbuf);
 }
 
+struct pf_fragment_tag *
+pf_copy_fragment_tag(struct mbuf *m, struct pf_fragment_tag *ftag, int how)
+{
+       struct m_tag *tag;
+       struct pf_mtag *pftag = pf_find_mtag(m);
+
+       tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF_REASS,
+           sizeof(*ftag), how, m);
+       if (tag == NULL) {
+               return NULL;
+       } else {
+               m_tag_prepend(m, tag);
+               tag = tag + 1;
+       }
+       bcopy(ftag, tag, sizeof(*ftag));
+       pftag->pftag_flags |= PF_TAG_REASSEMBLED;
+       return (struct pf_fragment_tag *)tag;
+}
+
+struct pf_fragment_tag *
+pf_find_fragment_tag(struct mbuf *m)
+{
+       struct m_tag *tag;
+       struct pf_fragment_tag *ftag;
+       struct pf_mtag *pftag = pf_find_mtag(m);
+
+       tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF_REASS,
+           NULL);
+       VERIFY((tag == NULL) || (pftag->pftag_flags & PF_TAG_REASSEMBLED));
+       if (tag != NULL) {
+               tag = tag + 1;
+       }
+       ftag = (struct pf_fragment_tag *)tag;
+       return ftag;
+}
+
+struct pf_fragment_tag *
+pf_find_fragment_tag_pbuf(pbuf_t *pbuf)
+{
+       struct pf_mtag *mtag = pf_find_mtag_pbuf(pbuf);
+
+       return (mtag->pftag_flags & PF_TAG_REASSEMBLED) ?
+              pbuf->pb_pf_fragtag : NULL;
+}
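
All three helpers rely on the m_tag convention that the payload sits
immediately after the struct m_tag header, hence the (tag + 1) pointer
arithmetic. Reading a tag back looks like this (sketch, mirroring
pf_find_fragment_tag() above):

    struct m_tag *tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
        KERNEL_TAG_TYPE_PF_REASS, NULL);
    if (tag != NULL) {
            struct pf_fragment_tag *ftag = (struct pf_fragment_tag *)(tag + 1);
            /* ftag->ft_maxlen, ftag->ft_id, ... are now readable */
    }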
+
 uint64_t
 pf_time_second(void)
 {
index fad147b5a988cd3397ffde325c4d775f88fcb6ce..4afbfed57817ba6d48d1633f7c6288ada1a439d0 100644 (file)
@@ -481,12 +481,19 @@ pfi_instance_add(struct ifnet *ifp, int net, int flags)
                        IFA_UNLOCK(ia);
                        continue;
                }
-               if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 &&
+               if ((af == AF_INET6) &&
                    IN6_IS_ADDR_LINKLOCAL(&((struct sockaddr_in6 *)
                    (void *)ia->ifa_addr)->sin6_addr)) {
                        IFA_UNLOCK(ia);
                        continue;
                }
+               if ((af == AF_INET6) &&
+                   (((struct in6_ifaddr *)ia)->ia6_flags &
+                   (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY | IN6_IFF_DETACHED |
+                   IN6_IFF_CLAT46 | IN6_IFF_TEMPORARY | IN6_IFF_DEPRECATED))) {
+                       IFA_UNLOCK(ia);
+                       continue;
+               }
                if (flags & PFI_AFLAG_NOALIAS) {
                        if (af == AF_INET && got4) {
                                IFA_UNLOCK(ia);
index 43a8e23c785741883364c4d17a9cd68e2af0d929..560dee4ab465117f3792fb93975f4537bcb22779 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -80,6 +80,7 @@
 #include <sys/conf.h>
 #include <sys/mcache.h>
 #include <sys/queue.h>
+#include <os/log.h>
 
 #include <mach/vm_param.h>
 
@@ -189,20 +190,20 @@ static void pf_deleterule_anchor_step_out(struct pf_ruleset **,
 #define PF_CDEV_MAJOR   (-1)
 
 static struct cdevsw pf_cdevsw = {
-       /* open */ pfopen,
-       /* close */ pfclose,
-       /* read */ eno_rdwrt,
-       /* write */ eno_rdwrt,
-       /* ioctl */ pfioctl,
-       /* stop */ eno_stop,
-       /* reset */ eno_reset,
-       /* tty */ NULL,
-       /* select */ eno_select,
-       /* mmap */ eno_mmap,
-       /* strategy */ eno_strat,
-       /* getc */ eno_getc,
-       /* putc */ eno_putc,
-       /* type */ 0
+       .d_open       = pfopen,
+       .d_close      = pfclose,
+       .d_read       = eno_rdwrt,
+       .d_write      = eno_rdwrt,
+       .d_ioctl      = pfioctl,
+       .d_stop       = eno_stop,
+       .d_reset      = eno_reset,
+       .d_ttys       = NULL,
+       .d_select     = eno_select,
+       .d_mmap       = eno_mmap,
+       .d_strategy   = eno_strat,
+       .d_reserved_1 = eno_getc,
+       .d_reserved_2 = eno_putc,
+       .d_type       = 0
 };
 
 static void pf_attach_hooks(void);
@@ -224,6 +225,8 @@ int16_t pf_nat64_configured = 0;
 /*
  * These are the pf enabled reference counting variables
  */
+#define NR_TOKENS_LIMIT (INT_MAX / sizeof(struct pfioc_token))
+
 static u_int64_t pf_enabled_ref_count;
 static u_int32_t nr_tokens = 0;
 static u_int64_t pffwrules;
@@ -344,6 +347,11 @@ generate_token(struct proc *p)
        u_int64_t token_value;
        struct pfioc_kernel_token *new_token;
 
+       if (nr_tokens + 1 > NR_TOKENS_LIMIT) {
+               os_log_error(OS_LOG_DEFAULT, "%s: NR_TOKENS_LIMIT reached", __func__);
+               return 0;
+       }
+
        new_token = _MALLOC(sizeof(struct pfioc_kernel_token), M_TEMP,
            M_WAITOK | M_ZERO);
 
@@ -351,7 +359,7 @@ generate_token(struct proc *p)
 
        if (new_token == NULL) {
                /* malloc failed! bail! */
-               printf("%s: unable to allocate pf token structure!", __func__);
+               os_log_error(OS_LOG_DEFAULT, "%s: unable to allocate pf token structure!", __func__);
                return 0;
        }
 
@@ -2292,6 +2300,11 @@ pfioctl_ioc_tokens(u_long cmd, struct pfioc_tokens_32 *tok32,
                }
 
                size = sizeof(struct pfioc_token) * nr_tokens;
+               if (size / nr_tokens != sizeof(struct pfioc_token)) {
+                       os_log_error(OS_LOG_DEFAULT, "%s: size overflows", __func__);
+                       error = ERANGE;
+                       break;
+               }
                ocnt = cnt = (p64 ? tok64->size : tok32->size);
                if (cnt == 0) {
                        if (p64) {
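
Two guards cooperate here: generate_token() caps nr_tokens at NR_TOKENS_LIMIT
(INT_MAX / sizeof(struct pfioc_token)), and this divide-back check catches a
multiplication overflow regardless. The pattern in isolation (sketch):

    size_t n = nr_tokens;   /* assumes nr_tokens != 0 was handled earlier */
    size_t size = sizeof(struct pfioc_token) * n;
    if (size / n != sizeof(struct pfioc_token)) {
            /* the product overflowed; reject the request with ERANGE */
    }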
index 9c28415dedc26a91ab246d8026b2a2e421ea8cd3..96b85f462ffb451b9c9d6b404aedf656bfcd7e9d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -84,6 +84,7 @@
 
 #if INET6
 #include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
 #endif /* INET6 */
 
 #include <net/pfvar.h>
@@ -98,7 +99,8 @@ struct pf_frent {
                struct ip6_hdr  *fru_ipv6;
        } fr_u;
        struct ip6_frag         fr_ip6f_opt;
-       int                     fr_ip6f_hlen;
+       uint16_t                fr_ip6f_hlen;   /* total header length */
+       uint16_t                fr_ip6f_extoff; /* last extension header offset or 0 */
 };
 
 struct pf_frcache {
@@ -136,6 +138,7 @@ struct pf_fragment {
        } fr_u;
        uint32_t        fr_csum_flags;  /* checksum flags */
        uint32_t        fr_csum;        /* partial checksum value */
+       uint16_t        fr_ip6_maxlen;  /* maximum length of a single fragment in IPv6 */
 };
 
 static TAILQ_HEAD(pf_fragqueue, pf_fragment)    pf_fragqueue;
@@ -159,19 +162,21 @@ static struct pf_fragment *pf_find_fragment_by_key(struct pf_fragment *,
     struct pf_frag_tree *);
 static __inline struct pf_fragment *
 pf_find_fragment_by_ipv4_header(struct ip *, struct pf_frag_tree *);
-static __inline struct pf_fragment *
-pf_find_fragment_by_ipv6_header(struct ip6_hdr *, struct ip6_frag *,
-    struct pf_frag_tree *);
 static struct mbuf *pf_reassemble(struct mbuf *, struct pf_fragment **,
     struct pf_frent *, int);
 static struct mbuf *pf_fragcache(struct mbuf **, struct ip *,
     struct pf_fragment **, int, int, int *);
+static int pf_normalize_tcpopt(struct pf_rule *, int, struct pfi_kif *,
+    struct pf_pdesc *, pbuf_t *, struct tcphdr *, int, int *);
+#if INET6
+static __inline struct pf_fragment *
+pf_find_fragment_by_ipv6_header(struct ip6_hdr *, struct ip6_frag *,
+    struct pf_frag_tree *);
 static struct mbuf *pf_reassemble6(struct mbuf **, struct pf_fragment **,
     struct pf_frent *, int);
 static struct mbuf *pf_frag6cache(struct mbuf **, struct ip6_hdr*,
     struct ip6_frag *, struct pf_fragment **, int, int, int, int *);
-static int pf_normalize_tcpopt(struct pf_rule *, int, struct pfi_kif *,
-    struct pf_pdesc *, pbuf_t *, struct tcphdr *, int, int *);
+#endif /* INET6 */
 
 #define DPFPRINTF(x) do {                               \
        if (pf_status.debug >= PF_DEBUG_MISC) {         \
@@ -483,17 +488,7 @@ pf_find_fragment_by_ipv4_header(struct ip *ip, struct pf_frag_tree *tree)
        return pf_find_fragment_by_key(&key, tree);
 }
 
-static __inline struct pf_fragment *
-pf_find_fragment_by_ipv6_header(struct ip6_hdr *ip6, struct ip6_frag *fh,
-    struct pf_frag_tree *tree)
-{
-       struct pf_fragment key;
-       pf_ip6hdr2key(&key, ip6, fh);
-       return pf_find_fragment_by_key(&key, tree);
-}
-
 /* Removes a fragment from the fragment queue and frees the fragment */
-
 static void
 pf_remove_fragment(struct pf_fragment *frag)
 {
@@ -773,7 +768,6 @@ insert:
            (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
                /* loopback checksums are always OK */
                m->m_pkthdr.csum_data = 0xffff;
-               m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
                m->m_pkthdr.csum_flags =
                    CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
                    CSUM_IP_CHECKED | CSUM_IP_VALID;
@@ -1120,6 +1114,7 @@ drop_fragment:
        return NULL;
 }
 
+#if INET6
 #define FR_IP6_OFF(fr) \
        (ntohs((fr)->fr_ip6f_opt.ip6f_offlg & IP6F_OFF_MASK))
 #define FR_IP6_PLEN(fr) (ntohs((fr)->fr_ip6->ip6_plen))
@@ -1131,7 +1126,7 @@ pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag,
        struct pf_frent *frea, *frep, *next;
        struct ip6_hdr *ip6;
        struct ip6_frag *ip6f;
-       int plen, off, fr_max;
+       int plen, off, fr_max, pktlen;
        uint32_t uoff, csum, csum_flags;
 
        VERIFY(*frag == NULL || BUFFER_FRAGMENTS(*frag));
@@ -1142,7 +1137,8 @@ pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag,
        off = FR_IP6_OFF(frent);
        uoff = frent->fr_ip6f_hlen;
        plen = FR_IP6_PLEN(frent);
-       fr_max = off + plen - (frent->fr_ip6f_hlen - sizeof *ip6);
+       fr_max = off + plen - (frent->fr_ip6f_hlen - sizeof(*ip6));
+       pktlen = plen + sizeof(*ip6);
 
        DPFPRINTF(("0x%llx IPv6 frag plen %u off %u fr_ip6f_hlen %u "
            "fr_max %u m_len %u\n", (uint64_t)VM_KERNEL_ADDRPERM(m), plen, off,
@@ -1230,6 +1226,7 @@ pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag,
 
                (*frag)->fr_flags = 0;
                (*frag)->fr_max = 0;
+               (*frag)->fr_ip6_maxlen = pktlen;
                (*frag)->fr_af = AF_INET6;
                (*frag)->fr_srcx.v6addr = frent->fr_ip6->ip6_src;
                (*frag)->fr_dstx.v6addr = frent->fr_ip6->ip6_dst;
@@ -1250,6 +1247,10 @@ pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag,
                goto insert;
        }
 
+       /* Remember maximum fragment len for refragmentation */
+       if (pktlen > (*frag)->fr_ip6_maxlen) {
+               (*frag)->fr_ip6_maxlen = pktlen;
+       }
        /*
         * If this fragment contains similar checksum offload info
         * as that of the existing ones, accumulate checksum.  Otherwise,
@@ -1369,6 +1370,31 @@ insert:
                return NULL;
        }
 
+       ASSERT(*frag != NULL);
+       ASSERT(frent != NULL);
+       next = LIST_NEXT(frent, fr_next);
+       if (next == NULL) {
+               DPFPRINTF(("drop: atomic fragment\n"));
+               pf_free_fragment(*frag);
+               *frag = NULL;
+               return NULL;
+       }
+
+       /* retrieve the values to be filled in to reassembled tag */
+       uint16_t hdrlen, unfragpartlen, extoff, maxlen;
+       uint32_t id;
+
+       /* Get total extension header length from the first fragment */
+       hdrlen = frent->fr_ip6f_hlen - sizeof(struct ip6_frag);
+       /*
+        * Get total extension header length of per-fragment headers from the
+        * subsequent fragment.
+        */
+       unfragpartlen = next->fr_ip6f_hlen - sizeof(struct ip6_frag);
+       extoff = frent->fr_ip6f_extoff;
+       maxlen = (*frag)->fr_ip6_maxlen;
+       id = (*frag)->fr_id6;
+
        ip6 = frent->fr_ip6;
        ip6->ip6_nxt = (*frag)->fr_p;
        ip6->ip6_plen = htons(off);
@@ -1387,7 +1413,6 @@ insert:
            (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
                /* loopback checksums are always OK */
                m->m_pkthdr.csum_data = 0xffff;
-               m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
                m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
        }
 
@@ -1414,17 +1439,38 @@ insert:
 
        /* XXX this should be done elsewhere */
        if (m->m_flags & M_PKTHDR) {
-               int pktlen = 0;
+               int len = 0;
                for (m2 = m; m2; m2 = m2->m_next) {
-                       pktlen += m2->m_len;
+                       len += m2->m_len;
                }
-               m->m_pkthdr.len = pktlen;
+               m->m_pkthdr.len = len;
        }
 
        DPFPRINTF(("complete: 0x%llx ip6_plen %d m_pkthdr.len %d\n",
            (uint64_t)VM_KERNEL_ADDRPERM(m), ntohs(ip6->ip6_plen),
            m->m_pkthdr.len));
 
+       /* Add the reassembled tag */
+       struct m_tag *mtag;
+       struct pf_fragment_tag *ftag;
+       mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF_REASS,
+           sizeof(*ftag), M_NOWAIT, m);
+       if (mtag == NULL) {
+               /* XXX: add stats */
+               m_freem(m);
+               return NULL;
+       }
+       ftag = (struct pf_fragment_tag *)(mtag + 1);
+       ftag->ft_hdrlen = hdrlen;
+       ftag->ft_unfragpartlen = unfragpartlen;
+       ftag->ft_extoff = extoff;
+       ftag->ft_maxlen = maxlen;
+       ftag->ft_id = id;
+       m_tag_prepend(m, mtag);
+
+       struct pf_mtag *pftag = pf_get_mtag(m);
+       ASSERT(pftag != NULL);
+       pftag->pftag_flags |= PF_TAG_REASSEMBLED;
        return m;
 
 drop_fragment:
@@ -1771,6 +1817,99 @@ drop_fragment:
        return NULL;
 }
 
+int
+pf_refragment6(struct ifnet *ifp, pbuf_t **pbufp, struct pf_fragment_tag *ftag)
+{
+       struct mbuf        *m;
+       uint32_t           frag_id;
+       uint16_t           hdrlen, extoff, maxlen, unfragpartlen;
+       uint8_t            proto;
+       int                error, action;
+       uint8_t            *lexthdrsp;
+       struct route_in6   ip6route;
+       struct route_in6   *ro;
+       struct sockaddr_in6     *dst;
+       struct ip6_hdr *hdr;
+       struct pf_mtag *mtag;
+       struct m_tag *tag;
+
+       if (pbufp == NULL || !pbuf_is_valid(*pbufp) || ftag == NULL) {
+               panic("pf_route6: invalid parameters");
+               /* NOT REACHED */
+       }
+       m = pbuf_to_mbuf(*pbufp, FALSE);
+       hdr = mtod(m, struct ip6_hdr *);
+       mtag = pf_find_mtag(m);
+       hdrlen = ftag->ft_hdrlen - sizeof(struct ip6_hdr);
+       extoff = ftag->ft_extoff;
+       maxlen = ftag->ft_maxlen;
+       frag_id = ftag->ft_id;
+       unfragpartlen = ftag->ft_unfragpartlen;
+       tag = (struct m_tag *)(void *)ftag;
+       tag = tag - 1;
+       m_tag_delete(m, tag);
+       ftag = NULL;
+       tag = NULL;
+       mtag->pftag_flags &= ~PF_TAG_REASSEMBLED;
+       ro = &ip6route;
+       bzero((caddr_t)ro, sizeof(*ro));
+       dst = (struct sockaddr_in6 *)&ro->ro_dst;
+       dst->sin6_family = AF_INET6;
+       dst->sin6_len = sizeof(*dst);
+       dst->sin6_addr = hdr->ip6_dst;
+
+       if (extoff) {
+               int off;
+               struct mbuf *mexthdr;
+
+               /* Use protocol from next field of last extension header */
+               mexthdr = m_getptr(m, extoff +
+                   offsetof(struct ip6_ext, ip6e_nxt), &off);
+               ASSERT(mexthdr != NULL);
+               lexthdrsp = (mtod(mexthdr, uint8_t *) + off);
+               proto = *lexthdrsp;
+               if (proto == IPPROTO_DSTOPTS) {
+                       struct ip6_ext ext;
+                       if (!pf_pull_hdr(*pbufp, off, &ext, sizeof(ext), NULL,
+                           NULL, AF_INET6)) {
+                               DPFPRINTF(("pkt too short"));
+                               action = PF_DROP;
+                               goto done;
+                       }
+                       proto = ext.ip6e_nxt;
+               }
+       } else {
+               lexthdrsp = NULL;
+               proto = hdr->ip6_nxt;
+       }
+
+       /*
+        * The MTU must be a multiple of 8 bytes, or we risk doing the
+        * fragmentation wrong.
+        */
+       maxlen = maxlen & ~7;
+
+       error = ip6_do_fragmentation(&m, hdrlen, NULL, unfragpartlen,
+           hdr, lexthdrsp, maxlen, proto, frag_id);
+
+       if (error == 0) {
+               /*
+                * PF_TAG_REFRAGMENTED flag set to indicate ip6_forward()
+                * and pf_route6() that the mbuf contains a chain of fragments.
+                */
+               mtag->pftag_flags |= PF_TAG_REFRAGMENTED;
+               action = PF_PASS;
+               pbuf_init_mbuf(*pbufp, m, ifp);
+       } else {
+               DPFPRINTF(("refragment error %d", error));
+               action = PF_DROP;
+               goto done;
+       }
+done:
+       return action;
+}
+#endif /* INET6 */
+
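One detail worth calling out in pf_refragment6(): the `maxlen & ~7` rounding is a protocol requirement, not a tuning choice. The IPv6 fragment header stores offsets in 8-octet units, so every fragment except the last must carry a payload that is a multiple of 8 bytes. A hedged sketch of the arithmetic (names are illustrative):

    uint16_t payload = maxlen & ~7;         /* e.g. 1437 -> 1432 */
    uint16_t off_units = payload >> 3;      /* offset delta in 8-octet units */
    /* the fragment header of fragment n carries n * off_units */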
 int
 pf_normalize_ip(pbuf_t *pbuf, int dir, struct pfi_kif *kif, u_short *reason,
     struct pf_pdesc *pd)
@@ -2093,22 +2232,29 @@ bad:
 }
 
 #if INET6
+static __inline struct pf_fragment *
+pf_find_fragment_by_ipv6_header(struct ip6_hdr *ip6, struct ip6_frag *fh,
+    struct pf_frag_tree *tree)
+{
+       struct pf_fragment key;
+       pf_ip6hdr2key(&key, ip6, fh);
+       return pf_find_fragment_by_key(&key, tree);
+}
+
 int
 pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif,
     u_short *reason, struct pf_pdesc *pd)
 {
-       struct mbuf             *m;
+       struct mbuf             *m = NULL;
        struct pf_rule          *r;
        struct ip6_hdr          *h = pbuf->pb_data;
+       int                      extoff;
        int                      off;
        struct ip6_ext           ext;
-/* adi XXX */
-#if 0
        struct ip6_opt           opt;
        struct ip6_opt_jumbo     jumbo;
        int                      optend;
        int                      ooff;
-#endif
        struct ip6_frag          frag;
        u_int32_t                jumbolen = 0, plen;
        u_int16_t                fragoff = 0;
@@ -2172,6 +2318,7 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif,
                goto drop;
        }
 
+       extoff = 0;
        off = sizeof(struct ip6_hdr);
        proto = h->ip6_nxt;
        terminal = 0;
@@ -2187,6 +2334,7 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif,
                            NULL, AF_INET6)) {
                                goto shortpkt;
                        }
+                       extoff = off;
                        /*
                         * <jhw@apple.com>
                         * Multiple routing headers not allowed.
@@ -2209,16 +2357,15 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif,
                        proto = ext.ip6e_nxt;
                        break;
                case IPPROTO_HOPOPTS:
-/* adi XXX */
-#if 0
-                       if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
+                       if (!pf_pull_hdr(pbuf, off, &ext, sizeof(ext), NULL,
                            NULL, AF_INET6)) {
                                goto shortpkt;
                        }
+                       extoff = off;
                        optend = off + (ext.ip6e_len + 1) * 8;
                        ooff = off + sizeof(ext);
                        do {
-                               if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
+                               if (!pf_pull_hdr(pbuf, ooff, &opt.ip6o_type,
                                    sizeof(opt.ip6o_type), NULL, NULL,
                                    AF_INET6)) {
                                        goto shortpkt;
@@ -2227,11 +2374,12 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif,
                                        ooff++;
                                        continue;
                                }
-                               if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
+                               if (!pf_pull_hdr(pbuf, ooff, &opt, sizeof(opt),
                                    NULL, NULL, AF_INET6)) {
                                        goto shortpkt;
                                }
-                               if (ooff + sizeof(opt) + opt.ip6o_len > optend) {
+                               if ((ooff + (int) sizeof(opt) + opt.ip6o_len) >
+                                   optend) {
                                        goto drop;
                                }
                                switch (opt.ip6o_type) {
@@ -2239,7 +2387,7 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif,
                                        if (h->ip6_plen != 0) {
                                                goto drop;
                                        }
-                                       if (!pf_pull_hdr(m, ooff, &jumbo,
+                                       if (!pf_pull_hdr(pbuf, ooff, &jumbo,
                                            sizeof(jumbo), NULL, NULL,
                                            AF_INET6)) {
                                                goto shortpkt;
@@ -2250,8 +2398,8 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif,
                                        if (jumbolen <= IPV6_MAXPACKET) {
                                                goto drop;
                                        }
-                                       if (sizeof(struct ip6_hdr) +
-                                           jumbolen != m->m_pkthdr.len) {
+                                       if ((sizeof(struct ip6_hdr) +
+                                           jumbolen) != pbuf->pb_packet_len) {
                                                goto drop;
                                        }
                                        break;
@@ -2264,7 +2412,6 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif,
                        off = optend;
                        proto = ext.ip6e_nxt;
                        break;
-#endif
                default:
                        terminal = 1;
                        break;
@@ -2292,10 +2439,11 @@ pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif,
        return PF_PASS;
 
 fragment:
-       if (ntohs(h->ip6_plen) == 0 || jumbolen) {
+       plen = ntohs(h->ip6_plen);
+       /* A jumbo payload or a zero payload length cannot be fragmented */
+       if (plen == 0 || jumbolen) {
                goto drop;
        }
-       plen = ntohs(h->ip6_plen);
 
        if (!pf_pull_hdr(pbuf, off, &frag, sizeof(frag), NULL, NULL, AF_INET6)) {
                goto shortpkt;
@@ -2303,7 +2451,7 @@ fragment:
        fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
        pd->proto = frag.ip6f_nxt;
        mff = ntohs(frag.ip6f_offlg & IP6F_MORE_FRAG);
-       off += sizeof frag;
+       off += sizeof(frag);
        if (fragoff + (plen - off) > IPV6_MAXPACKET) {
                goto badfrag;
        }
@@ -2346,7 +2494,16 @@ fragment:
                frent->fr_ip6 = h;
                frent->fr_m = m;
                frent->fr_ip6f_opt = frag;
+               frent->fr_ip6f_extoff = extoff;
                frent->fr_ip6f_hlen = off;
+               /* account for 2nd Destination Options header if present */
+               if (pd->proto == IPPROTO_DSTOPTS) {
+                       if (!pf_pull_hdr(pbuf, off, &ext, sizeof(ext), NULL,
+                           NULL, AF_INET6)) {
+                               goto shortpkt;
+                       }
+                       frent->fr_ip6f_hlen += (ext.ip6e_len + 1) * 8;
+               }
 
                /* Might return a completely reassembled mbuf, or NULL */
                DPFPRINTF(("reass IPv6 frag %d @ %d-%d\n",
@@ -2363,7 +2520,8 @@ fragment:
                if (pff != NULL && (pff->fr_flags & PFFRAG_DROP)) {
                        goto drop;
                }
-       } else if (dir == PF_IN || !(pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) {
+       } else if (dir == PF_IN ||
+           !(pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) {
                /* non-buffering fragment cache (overlaps: see RFC 5722) */
                int nomem = 0;
 
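As background for the pf_normalize_ip6() hunks above: hop-by-hop, routing and destination-options headers all begin with the same two bytes, `ip6e_nxt` (next protocol) and `ip6e_len` (header length in 8-octet units, excluding the first 8 octets), which is what lets the normalizer walk the chain generically and record `extoff` along the way. A reduced sketch of that walk, assuming `pbuf` and the outer header `h` are in scope (per-header validation elided):

    struct ip6_ext ext;
    int off = sizeof(struct ip6_hdr);
    uint8_t proto = h->ip6_nxt;

    while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING ||
        proto == IPPROTO_DSTOPTS) {
        if (!pf_pull_hdr(pbuf, off, &ext, sizeof(ext), NULL,
            NULL, AF_INET6)) {
            break;                          /* short packet */
        }
        off += (ext.ip6e_len + 1) * 8;      /* length in 8-octet units */
        proto = ext.ip6e_nxt;
    }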
index be08224fa16e7962f321be05c77ef7ee2758741f..5352a4922f357836b671be83adc5af6a23a9aa37 100644 (file)
@@ -28,6 +28,7 @@
 #include <sys/mcache.h>
 #include <kern/kern_types.h>
 #include <net/pf_pbuf.h>
+#include <net/pfvar.h>
 #include <netinet/in.h>
 
 void
@@ -90,6 +91,9 @@ pbuf_sync(pbuf_t *pbuf)
                pbuf->pb_flowid = &m->m_pkthdr.pkt_flowid;
                pbuf->pb_flags = &m->m_pkthdr.pkt_flags;
                pbuf->pb_pftag = m_pftag(m);
+               pbuf->pb_pf_fragtag = pf_find_fragment_tag(m);
+               ASSERT((pbuf->pb_pf_fragtag == NULL) ||
+                   (pbuf->pb_pftag->pftag_flags & PF_TAG_REASSEMBLED));
        } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) {
                struct pbuf_memory *nm = &pbuf->pb_memory;
 
@@ -109,6 +113,7 @@ pbuf_sync(pbuf_t *pbuf)
                pbuf->pb_flowid = &nm->pm_flowid;
                pbuf->pb_flags = &nm->pm_flags;
                pbuf->pb_pftag = &nm->pm_pftag;
+               pbuf->pb_pf_fragtag = &nm->pm_pf_fragtag;
        } else {
                panic("%s: bad pb_type: %d", __func__, pbuf->pb_type);
        }
@@ -125,9 +130,10 @@ pbuf_to_mbuf(pbuf_t *pbuf, boolean_t release_ptr)
                m = pbuf->pb_mbuf;
                if (release_ptr) {
                        pbuf->pb_mbuf = NULL;
-                       pbuf_destroy(pbuf);
                }
        } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) {
+               boolean_t fragtag = FALSE;
+
                if (pbuf->pb_packet_len > (u_int)MHLEN) {
                        if (pbuf->pb_packet_len > (u_int)MCLBYTES) {
                                printf("%s: packet too big for cluster (%u)\n",
@@ -139,7 +145,7 @@ pbuf_to_mbuf(pbuf_t *pbuf, boolean_t release_ptr)
                        m = m_gethdr(M_DONTWAIT, MT_DATA);
                }
                if (m == NULL) {
-                       return NULL;
+                       goto done;
                }
 
                m_copyback(m, 0, pbuf->pb_packet_len, pbuf->pb_data);
@@ -153,16 +159,26 @@ pbuf_to_mbuf(pbuf_t *pbuf, boolean_t release_ptr)
                if (pbuf->pb_pftag != NULL) {
                        struct pf_mtag *pftag = m_pftag(m);
 
-                       if (pftag != NULL) {
-                               *pftag = *pbuf->pb_pftag;
-                       }
+                       ASSERT(pftag != NULL);
+                       *pftag = *pbuf->pb_pftag;
+                       fragtag =
+                           ((pftag->pftag_flags & PF_TAG_REASSEMBLED) != 0);
                }
 
-               if (release_ptr) {
-                       pbuf_destroy(pbuf);
+               if (fragtag && pbuf->pb_pf_fragtag != NULL) {
+                       if (pf_copy_fragment_tag(m, pbuf->pb_pf_fragtag,
+                           M_NOWAIT) == NULL) {
+                               m_freem(m);
+                               m = NULL;
+                               goto done;
+                       }
                }
        }
 
+done:
+       if (release_ptr) {
+               pbuf_destroy(pbuf);
+       }
        return m;
 }
 
@@ -335,7 +351,7 @@ pbuf_copy_back(pbuf_t *pbuf, int off, int len, void *src)
 
        if (pbuf->pb_type == PBUF_TYPE_MBUF) {
                m_copyback(pbuf->pb_mbuf, off, len, src);
-       } else if (pbuf->pb_type == PBUF_TYPE_MBUF) {
+       } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) {
                if (len) {
                        memcpy(&((uint8_t *)pbuf->pb_data)[off], src, len);
                }
@@ -353,7 +369,7 @@ pbuf_copy_data(pbuf_t *pbuf, int off, int len, void *dst)
 
        if (pbuf->pb_type == PBUF_TYPE_MBUF) {
                m_copydata(pbuf->pb_mbuf, off, len, dst);
-       } else if (pbuf->pb_type == PBUF_TYPE_MBUF) {
+       } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) {
                if (len) {
                        memcpy(dst, &((uint8_t *)pbuf->pb_data)[off], len);
                }
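The two `else if` corrections above are more than cosmetic: a pbuf is a thin view over either a live mbuf chain (PBUF_TYPE_MBUF) or a flat memory buffer (PBUF_TYPE_MEMORY), and the copy-pasted predicate made the memory branch unreachable, silently dropping copies for memory-backed pbufs. Every accessor in this file follows the same dispatch shape; a sketch with a hypothetical helper name:

    static uint32_t
    pbuf_example_len(pbuf_t *pbuf)          /* illustrative helper only */
    {
        uint32_t len = 0;

        if (pbuf->pb_type == PBUF_TYPE_MBUF) {
            len = (uint32_t)pbuf->pb_mbuf->m_pkthdr.len;
        } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) {
            len = pbuf->pb_packet_len;
        } else {
            panic("%s: bad pb_type: %d", __func__, pbuf->pb_type);
            /* NOTREACHED */
        }
        return len;
    }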
index fd8f7dd57135dff997089f0f22607ef828732dfe..232b9b2a1cd543356689939e34dd8b40b0336589 100644 (file)
@@ -51,6 +51,7 @@ struct pbuf_memory {
        uint32_t pm_flowid;
        uint32_t pm_flags;
        struct pf_mtag pm_pftag;
+       struct pf_fragment_tag  pm_pf_fragtag;
        int (*pm_action)(struct pbuf_memory *, enum pbuf_action);
        void *pm_action_cookie;
 };
@@ -74,6 +75,7 @@ typedef struct pbuf {
        uint32_t        *pb_flowid;
        uint32_t        *pb_flags;
        struct pf_mtag  *pb_pftag;
+       struct pf_fragment_tag  *pb_pf_fragtag;
        struct ifnet    *pb_ifp;
        struct pbuf     *pb_next;
 } pbuf_t;
index c66741be2f74ee2588720148b141eb3b2c0a0980..2d172aacc57f7b5cf66c1fafda0c1974074aea21 100644 (file)
@@ -1199,13 +1199,10 @@ pfr_walktree(struct radix_node *rn, void *arg)
                if (w->pfrw_free-- > 0) {
                        struct pfr_astats as;
 
+                       bzero(&as, sizeof(as));
+
                        pfr_copyout_addr(&as.pfras_a, ke);
 
-#if !defined(__LP64__)
-                       /* Initialized to avoid potential info leak to
-                        * userspace */
-                       as._pad = 0;
-#endif
                        bcopy(ke->pfrke_packets, as.pfras_packets,
                            sizeof(as.pfras_packets));
                        bcopy(ke->pfrke_bytes, as.pfras_bytes,
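The bzero() added above is the standard fix for a padding-based info leak: the compiler may insert unnamed padding between struct fields, field-by-field initialization leaves those bytes holding stale kernel memory, and a subsequent copy to userspace exports them. It also removes the fragile `#if !defined(__LP64__)` special case. An illustrative (made-up) struct makes the hazard concrete:

    struct example_stats {          /* illustrative only */
        u_int8_t  es_flag;          /* compiler pads 7 bytes after this */
        u_int64_t es_count;
    };

    struct example_stats es;
    bzero(&es, sizeof(es));         /* clears padding as well as fields */
    es.es_flag = 1;
    es.es_count = 42;
    /* the whole struct is now safe to bcopy()/copyout() */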
index d10af5141a589a65474cb07452a0abb7fc81b4b9..7a354d17ce656c1c2a3fb3594ffab1bde361b01e 100644 (file)
@@ -145,10 +145,9 @@ struct sadb_sa_2 {
                u_int16_t               sadb_sa_natt_interval;
        };
 
-       union {
-               u_int32_t               sadb_reserved1;
-               u_int16_t               sadb_sa_natt_offload_interval;
-       };
+       u_int16_t               sadb_sa_natt_offload_interval;
+#define SADB_SA_NATT_SRC_PORT   1
+       u_int16_t               sadb_sa_natt_src_port;
 };
 #endif /* PRIVATE */
 
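Note that replacing the anonymous union keeps struct sadb_sa_2 binary-compatible: the union overlaid a u_int32_t with a u_int16_t in four bytes, and the two explicit u_int16_t fields occupy those same four bytes. A hedged compile-time check of that reasoning (not part of the header itself):

    _Static_assert(2 * sizeof(u_int16_t) == sizeof(u_int32_t),
        "offload interval + NAT-T source port span the old union's bytes");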
@@ -293,9 +292,9 @@ struct sadb_x_policy {
  *     = (sadb_x_policy_len * sizeof(uint64_t) - sizeof(struct sadb_x_policy))
  */
 #ifdef PRIVATE
-/* IPSec Interface Extension:
- * IPSec interface can be specified alone, or all three
- * of internal, outgoing, and IPSec interfaces must be
+/* IPsec Interface Extension:
+ * IPsec interface can be specified alone, or all three
+ * of internal, outgoing, and IPsec interfaces must be
  * specified.
  */
 struct sadb_x_ipsecif {
@@ -492,6 +491,7 @@ struct sadb_sastat {
 
 #ifdef PRIVATE
 #define SADB_X_EXT_SA2_DELETE_ON_DETACH   0x0001
+#define SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS  0x0002
 #endif
 
 /* SPI size for PF_KEYv2 */
index 2eed3f3c3f75f6d575fa77963693463c9bb242bd..c82e61d027394f652e7983079eae24fd15cab80f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -634,8 +634,8 @@ struct pf_os_fingerprint {
 #define PF_OSFP_TCPOPT_TS       0x4             /* TCP timestamp option */
 #define PF_OSFP_TCPOPT_BITS     3               /* bits used by each option */
 #define PF_OSFP_MAX_OPTS \
-    (sizeof(((struct pf_os_fingerprint *)0)->fp_tcpopts) * 8) \
-    / PF_OSFP_TCPOPT_BITS
+    ((sizeof(pf_tcpopts_t) * 8) \
+    / PF_OSFP_TCPOPT_BITS)
 
        SLIST_ENTRY(pf_os_fingerprint)  fp_next;
 };
@@ -1432,8 +1432,7 @@ struct pf_pdesc {
        struct pf_rule  *nat_rule;      /* nat/rdr rule applied to packet */
        struct pf_addr  *src;
        struct pf_addr  *dst;
-       struct ether_header
-       *eh;
+       struct ether_header     *eh;
        pbuf_t          *mp;
        int             lmw;            /* lazy writable offset */
        struct pf_mtag  *pf_mtag;
@@ -2186,7 +2185,7 @@ extern struct pool pf_app_state_pl;
 extern struct thread *pf_purge_thread;
 
 __private_extern__ void pfinit(void);
-__private_extern__ void pf_purge_thread_fn(void *, wait_result_t);
+__private_extern__ void pf_purge_thread_fn(void *, wait_result_t) __dead2;
 __private_extern__ void pf_purge_expired_src_nodes(void);
 __private_extern__ void pf_purge_expired_states(u_int32_t);
 __private_extern__ void pf_unlink_state(struct pf_state *);
@@ -2212,7 +2211,6 @@ __private_extern__ void pf_rm_rule(struct pf_rulequeue *, struct pf_rule *);
 struct ip_fw_args;
 
 extern boolean_t is_nlc_enabled_glb;
-extern boolean_t pf_is_nlc_enabled(void);
 
 #if INET
 __private_extern__ int pf_test(int, struct ifnet *, pbuf_t **,
@@ -2229,6 +2227,10 @@ __private_extern__ int pf_test6_mbuf(int, struct ifnet *, struct mbuf **,
 __private_extern__ void pf_poolmask(struct pf_addr *, struct pf_addr *,
     struct pf_addr *, struct pf_addr *, u_int8_t);
 __private_extern__ void pf_addr_inc(struct pf_addr *, sa_family_t);
+__private_extern__ int pf_normalize_ip6(pbuf_t *, int, struct pfi_kif *,
+    u_short *, struct pf_pdesc *);
+__private_extern__ int pf_refragment6(struct ifnet *, pbuf_t **,
+    struct pf_fragment_tag *);
 #endif /* INET6 */
 
 __private_extern__ void *pf_lazy_makewritable(struct pf_pdesc *,
@@ -2254,8 +2256,6 @@ __private_extern__ void pf_normalize_init(void);
 __private_extern__ int pf_normalize_isempty(void);
 __private_extern__ int pf_normalize_ip(pbuf_t *, int, struct pfi_kif *,
     u_short *, struct pf_pdesc *);
-__private_extern__ int pf_normalize_ip6(pbuf_t *, int, struct pfi_kif *,
-    u_short *, struct pf_pdesc *);
 __private_extern__ int pf_normalize_tcp(int, struct pfi_kif *, pbuf_t *,
     int, int, void *, struct pf_pdesc *);
 __private_extern__ void pf_normalize_tcp_cleanup(struct pf_state *);
@@ -2413,6 +2413,10 @@ __private_extern__ struct pf_mtag *pf_find_mtag(struct mbuf *);
 __private_extern__ struct pf_mtag *pf_find_mtag_pbuf(pbuf_t *);
 __private_extern__ struct pf_mtag *pf_get_mtag(struct mbuf *);
 __private_extern__ struct pf_mtag *pf_get_mtag_pbuf(pbuf_t *);
+__private_extern__ struct pf_fragment_tag *pf_find_fragment_tag_pbuf(pbuf_t *);
+__private_extern__ struct pf_fragment_tag *pf_find_fragment_tag(struct mbuf *);
+__private_extern__ struct pf_fragment_tag *pf_copy_fragment_tag(struct mbuf *,
+    struct pf_fragment_tag *, int);
 #else /* !KERNEL */
 extern struct pf_anchor_global pf_anchors;
 extern struct pf_anchor pf_main_anchor;
index da700024f7eb0b2c565543f8e96c1ea90675819d..02340a9779d2381de75f1500be7db5cedfb7ca9d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -252,11 +252,9 @@ pktap_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
        pktap->pktp_filters[1].filter_param_if_type = IFT_IEEE1394;
 #endif /* CONFIG_EMBEDDED */
 
-#if (DEVELOPMENT || DEBUG)
        pktap->pktp_filters[2].filter_op = PKTAP_FILTER_OP_PASS;
        pktap->pktp_filters[2].filter_param = PKTAP_FILTER_PARAM_IF_TYPE;
        pktap->pktp_filters[2].filter_param_if_type = IFT_OTHER;
-#endif /* DEVELOPMENT || DEBUG */
 
        /*
         * We do not use a set_bpf_tap() function as we rather rely on the more
@@ -786,6 +784,8 @@ pktap_set_procinfo(struct pktap_header *hdr, struct so_procinfo *soprocinfo)
        if (hdr->pth_comm[0] == 0) {
                proc_name(soprocinfo->spi_pid, hdr->pth_comm, MAXCOMLEN);
        }
+       strlcpy(&hdr->pth_comm[0], &soprocinfo->spi_proc_name[0], sizeof(hdr->pth_comm));
+
        if (soprocinfo->spi_pid != 0) {
                uuid_copy(hdr->pth_uuid, soprocinfo->spi_uuid);
        }
@@ -793,9 +793,7 @@ pktap_set_procinfo(struct pktap_header *hdr, struct so_procinfo *soprocinfo)
        if (soprocinfo->spi_delegated != 0) {
                hdr->pth_flags |= PTH_FLAG_PROC_DELEGATED;
                hdr->pth_epid = soprocinfo->spi_epid;
-               if (hdr->pth_ecomm[0] == 0) {
-                       proc_name(soprocinfo->spi_epid, hdr->pth_ecomm, MAXCOMLEN);
-               }
+               strlcpy(&hdr->pth_ecomm[0], &soprocinfo->spi_e_proc_name[0], sizeof(hdr->pth_ecomm));
                uuid_copy(hdr->pth_euuid, soprocinfo->spi_euuid);
        }
 }
@@ -837,8 +835,7 @@ pktap_v2_set_procinfo(struct pktap_v2_hdr *pktap_v2_hdr,
                        char *ptr = ((char *)pktap_v2_hdr) +
                            pktap_v2_hdr->pth_comm_offset;
 
-                       proc_name(soprocinfo->spi_pid,
-                           ptr, PKTAP_MAX_COMM_SIZE);
+                       strlcpy(ptr, &soprocinfo->spi_proc_name[0], PKTAP_MAX_COMM_SIZE);
                }
                if (pktap_v2_hdr->pth_uuid_offset != 0) {
                        uuid_t *ptr = (uuid_t *) (((char *)pktap_v2_hdr) +
@@ -864,8 +861,7 @@ pktap_v2_set_procinfo(struct pktap_v2_hdr *pktap_v2_hdr,
                        char *ptr = ((char *)pktap_v2_hdr) +
                            pktap_v2_hdr->pth_e_comm_offset;
 
-                       proc_name(soprocinfo->spi_epid,
-                           ptr, PKTAP_MAX_COMM_SIZE);
+                       strlcpy(ptr, &soprocinfo->spi_e_proc_name[0], PKTAP_MAX_COMM_SIZE);
                }
                if (pktap_v2_hdr->pth_e_uuid_offset != 0) {
                        uuid_t *ptr = (uuid_t *) (((char *)pktap_v2_hdr) +
@@ -1213,8 +1209,8 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m,
                                hdr->pth_dlt = DLT_APPLE_IP_OVER_IEEE1394;
                                break;
                        case IFT_OTHER:
-                               if (ifp->if_subfamily == IFNET_SUBFAMILY_IPSEC ||
-                                   ifp->if_subfamily == IFNET_SUBFAMILY_UTUN) {
+                               if (ifp->if_family == IFNET_FAMILY_IPSEC ||
+                                   ifp->if_family == IFNET_FAMILY_UTUN) {
                                        /*
                                         * For utun:
                                         * - incoming packets do not have the prefix set to four
index 4c0d3b7fe5cdc9bf94dcc50cf6f9de28a36bd2a9..f08febca33167347e61385f2d76fd43fb0801875 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2011-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -37,6 +37,7 @@
 #include <sys/mcache.h>
 #include <sys/sysctl.h>
 
+#include <dev/random/randomdev.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/pktsched/pktsched_tcq.h>
 #include <net/pktsched/pktsched_qfq.h>
 #include <net/pktsched/pktsched_fq_codel.h>
+#include <net/pktsched/pktsched_netem.h>
 
 #include <pexpert/pexpert.h>
 
 
 u_int32_t machclk_freq = 0;
 u_int64_t machclk_per_sec = 0;
-u_int32_t pktsched_verbose;     /* more noise if greater than 1 */
+u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */
 
 static void init_machclk(void);
 
@@ -72,6 +74,7 @@ pktsched_init(void)
 
        tcq_init();
        qfq_init();
+       netem_init();
 }
 
 static void
@@ -225,47 +228,102 @@ pktsched_getqstats(struct ifclassq *ifq, u_int32_t qid,
 }
 
 void
-pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_type_t ptype, void *pp)
+pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt)
 {
-       pkt->pktsched_ptype = ptype;
-       pkt->pktsched_pkt = pp;
+       pkt->pktsched_pkt = *cpkt;
 
-       switch (ptype) {
+       switch (cpkt->cp_ptype) {
        case QP_MBUF:
                pkt->pktsched_plen =
-                   (uint32_t)m_pktlen((struct mbuf *)pkt->pktsched_pkt);
+                   (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf);
                break;
 
 
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 }
 
+int
+pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2)
+{
+       struct mbuf *m1, *m2;
+
+       ASSERT(pkt1 != NULL);
+       ASSERT(pkt1->pktsched_pkt_mbuf != NULL);
+       /* allow in-place clone, but make sure pkt2->pktsched_pkt won't leak */
+       ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf ==
+           pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 &&
+           pkt2->pktsched_pkt_mbuf == NULL));
+
+       switch (pkt1->pktsched_ptype) {
+       case QP_MBUF:
+               m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf;
+               m2 = m_dup(m1, M_NOWAIT);
+               if (__improbable(m2 == NULL)) {
+                       return ENOBUFS;
+               }
+               pkt2->pktsched_pkt_mbuf = m2;
+               break;
+
+
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
+       }
+
+       pkt2->pktsched_plen = pkt1->pktsched_plen;
+       pkt2->pktsched_ptype = pkt1->pktsched_ptype;
+       return 0;
+}
+
+void
+pktsched_corrupt_packet(pktsched_pkt_t *pkt)
+{
+       struct mbuf *m = NULL;
+       uint8_t *data = NULL;
+       uint32_t data_len = 0;
+       uint32_t rand32, rand_off, rand_bit;
+
+       switch (pkt->pktsched_ptype) {
+       case QP_MBUF:
+               m = pkt->pktsched_pkt_mbuf;
+               data = mtod(m, uint8_t *);
+               data_len = m->m_pkthdr.len;
+               break;
+
+       default:
+               /* NOTREACHED */
+               VERIFY(0);
+               __builtin_unreachable();
+       }
+
+       read_frandom(&rand32, sizeof(rand32));
+       rand_bit = rand32 & 0x7;        /* low 3 bits pick the bit to flip */
+       rand_off = (rand32 >> 3) % data_len;
+       data[rand_off] ^= 1 << rand_bit;
+}
+
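pktsched_corrupt_packet() spends a single 32-bit random word on two draws: the low three bits choose which bit of a byte to flip, and the remaining bits choose the byte offset. A hedged sketch of how a netem-style impairment stage might gate the call on a scaled probability (`corruption_p` is an assumed parameter expressed out of NETEM_PSCALE; this caller is hypothetical):

    uint32_t r;

    read_frandom(&r, sizeof(r));
    if ((r % NETEM_PSCALE) < corruption_p) {    /* corruption_p assumed */
        pktsched_corrupt_packet(&pkt);          /* flip one random bit */
    }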
 void
 pktsched_free_pkt(pktsched_pkt_t *pkt)
 {
        switch (pkt->pktsched_ptype) {
        case QP_MBUF:
-               m_freem(pkt->pktsched_pkt);
+               m_freem(pkt->pktsched_pkt_mbuf);
                break;
 
 
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
-       pkt->pktsched_pkt = NULL;
+       pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt);
        pkt->pktsched_plen = 0;
-       pkt->pktsched_ptype = 0;
-}
-
-uint32_t
-pktsched_get_pkt_len(pktsched_pkt_t *pkt)
-{
-       return pkt->pktsched_plen;
 }
 
 mbuf_svc_class_t
@@ -275,27 +333,27 @@ pktsched_get_pkt_svc(pktsched_pkt_t *pkt)
 
        switch (pkt->pktsched_ptype) {
        case QP_MBUF:
-               svc = m_get_service_class((mbuf_t)pkt->pktsched_pkt);
+               svc = m_get_service_class(pkt->pktsched_pkt_mbuf);
                break;
 
 
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        return svc;
 }
 
 void
-pktsched_get_pkt_vars(pktsched_pkt_t *pkt, uint32_t **flags,
+pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags,
     uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto,
     uint32_t *tcp_start_seq)
 {
        switch (pkt->pktsched_ptype) {
        case QP_MBUF: {
-               struct mbuf *m = (struct mbuf *)pkt->pktsched_pkt;
-               struct pkthdr *pkth = &m->m_pkthdr;
+               struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
 
                if (flags != NULL) {
                        *flags = &pkth->pkt_flags;
@@ -327,6 +385,7 @@ pktsched_get_pkt_vars(pktsched_pkt_t *pkt, uint32_t **flags,
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 }
 
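With the flags word handed out as a pointer into the live pkthdr, the new `volatile uint32_t **` signature makes callers treat it as shared state. A typical caller, modeled on fq_if_drop_packet() later in this diff:

    volatile uint32_t *pkt_flags;
    uint64_t *pkt_timestamp;

    pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp,
        NULL, NULL, NULL, NULL);
    *pkt_timestamp = 0;
    if (pkt.pktsched_ptype == QP_MBUF) {
        *pkt_flags &= ~PKTF_PRIV_GUARDED;
    }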
@@ -338,7 +397,7 @@ pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how)
 
        switch (pkt->pktsched_ptype) {
        case QP_MBUF: {
-               struct mbuf *m = (struct mbuf *)pkt->pktsched_pkt;
+               struct mbuf *m = pkt->pktsched_pkt_mbuf;
 
                fce = flowadv_alloc_entry(how);
                if (fce == NULL) {
@@ -357,6 +416,7 @@ pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how)
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        return fce;
@@ -369,12 +429,10 @@ pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags)
 
        switch (pkt->pktsched_ptype) {
        case QP_MBUF: {
-               struct mbuf *m = (struct mbuf *)pkt->pktsched_pkt;
-               struct pkthdr *pkth = &m->m_pkthdr;
+               struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
 
                _CASSERT(sizeof(pkth->pkt_mpriv_hash) == sizeof(uint32_t));
                _CASSERT(sizeof(pkth->pkt_mpriv_flags) == sizeof(uint32_t));
-
                *sfb_flags = &pkth->pkt_mpriv_flags;
                hashp = &pkth->pkt_mpriv_hash;
                break;
@@ -384,6 +442,7 @@ pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags)
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        return hashp;
index b094eb623258fb6aa2b666ea72c7bdb4e0c9c9b0..624e2e58dbfe50b0868d16990f35e8edb0657e30 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2011-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -58,18 +58,18 @@ extern "C" {
 #define PKTSCHEDF_QALG_DRIVER_MANAGED   0x10    /* driver managed */
 
 typedef struct _pktsched_pkt_ {
-       classq_pkt_type_t       __ptype;
+       classq_pkt_t            __pkt;
        uint32_t                __plen;
-       void                    *__pkt;
-#define pktsched_ptype  __ptype
+#define pktsched_ptype  __pkt.cp_ptype
 #define pktsched_plen   __plen
 #define pktsched_pkt    __pkt
+#define pktsched_pkt_mbuf       __pkt.cp_mbuf
+#define pktsched_pkt_kpkt       __pkt.cp_kpkt
 } pktsched_pkt_t;
 
-#define _PKTSCHED_PKT_INIT(_p)  do {            \
-       (_p)->pktsched_ptype = QP_INVALID;      \
-       (_p)->pktsched_plen = 0;                \
-       (_p)->pktsched_pkt = NULL;              \
+#define _PKTSCHED_PKT_INIT(_p)  do {                                    \
+       (_p)->pktsched_pkt = CLASSQ_PKT_INITIALIZER((_p)->pktsched_pkt);\
+       (_p)->pktsched_plen = 0;                                        \
 } while (0)
 
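With classq_pkt_t embedded by value, a scheduler path now initializes and encapsulates as sketched below; CLASSQ_PKT_INIT_MBUF() is assumed from the classq_pkt_t conversion elsewhere in this commit, and `m` is an mbuf assumed in scope:

    pktsched_pkt_t pkt;
    classq_pkt_t cpkt = CLASSQ_PKT_INITIALIZER(cpkt);

    _PKTSCHED_PKT_INIT(&pkt);           /* pkt starts out invalid/empty */
    CLASSQ_PKT_INIT_MBUF(&cpkt, m);     /* wrap the mbuf */
    pktsched_pkt_encap(&pkt, &cpkt);    /* copies cpkt, records length */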
 /* macro for timeout/untimeout */
@@ -137,6 +137,12 @@ __fls(pktsched_bitmap_t word)
        return pktsched_fls(word) - 1;
 }
 
+static inline uint32_t
+pktsched_get_pkt_len(pktsched_pkt_t *pkt)
+{
+       return pkt->pktsched_plen;
+}
+
 /*
  * We can use mach_absolute_time which returns a 64-bit value with
  * granularity less than a microsecond even on the slowest processor.
@@ -164,11 +170,12 @@ extern int pktsched_getqstats(struct ifclassq *, u_int32_t,
 extern u_int64_t pktsched_abs_to_nsecs(u_int64_t);
 extern u_int64_t pktsched_nsecs_to_abstime(u_int64_t);
 extern void pktsched_free_pkt(pktsched_pkt_t *);
-extern uint32_t pktsched_get_pkt_len(pktsched_pkt_t *);
-extern void pktsched_get_pkt_vars(pktsched_pkt_t *, uint32_t **, uint64_t **,
-    uint32_t *, uint8_t *, uint8_t *, uint32_t *);
+extern int pktsched_clone_pkt(pktsched_pkt_t *, pktsched_pkt_t *);
+extern void pktsched_corrupt_packet(pktsched_pkt_t *pkt);
+extern void pktsched_get_pkt_vars(pktsched_pkt_t *, volatile uint32_t **,
+    uint64_t **, uint32_t *, uint8_t *, uint8_t *, uint32_t *);
 extern uint32_t *pktsched_get_pkt_sfb_vars(pktsched_pkt_t *, uint32_t **);
-extern void pktsched_pkt_encap(pktsched_pkt_t *, classq_pkt_type_t, void *);
+extern void pktsched_pkt_encap(pktsched_pkt_t *, classq_pkt_t *);
 extern mbuf_svc_class_t pktsched_get_pkt_svc(pktsched_pkt_t *);
 extern struct flowadv_fcentry *pktsched_alloc_fcentry(pktsched_pkt_t *,
     struct ifnet *, int);
index b6cd0c67ffd4db0cba91d9ab468639571266f407..e523e80968e701ea7da34607245785469357a2a7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -42,19 +42,18 @@ static fq_if_t *fq_if_alloc(struct ifnet *, classq_pkt_type_t);
 static void fq_if_destroy(fq_if_t *fqs);
 static void fq_if_classq_init(fq_if_t *fqs, u_int32_t priority,
     u_int32_t quantum, u_int32_t drr_max, u_int32_t svc_class);
-static int fq_if_enqueue_classq(struct ifclassq *ifq, void *p,
-    classq_pkt_type_t ptype, boolean_t *pdrop);
-static void *fq_if_dequeue_classq(struct ifclassq *, classq_pkt_type_t *);
+static int fq_if_enqueue_classq(struct ifclassq *, classq_pkt_t *, boolean_t *);
+static void fq_if_dequeue_classq(struct ifclassq *, classq_pkt_t *);
 static int fq_if_dequeue_classq_multi(struct ifclassq *, u_int32_t,
-    u_int32_t, void **, void **, u_int32_t *, u_int32_t *, classq_pkt_type_t *);
-static void *fq_if_dequeue_sc_classq(struct ifclassq *, mbuf_svc_class_t,
-    classq_pkt_type_t *);
+    u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, u_int32_t *);
+static void fq_if_dequeue_sc_classq(struct ifclassq *, mbuf_svc_class_t,
+    classq_pkt_t *);
 static int fq_if_dequeue_sc_classq_multi(struct ifclassq *,
-    mbuf_svc_class_t, u_int32_t, u_int32_t, void **,
-    void **, u_int32_t *, u_int32_t *, classq_pkt_type_t *);
+    mbuf_svc_class_t, u_int32_t, u_int32_t, classq_pkt_t *,
+    classq_pkt_t *, u_int32_t *, u_int32_t *);
 static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, u_int32_t,
-    u_int32_t, void **, void **, u_int32_t *, u_int32_t *,
-    boolean_t drvmgmt, classq_pkt_type_t *);
+    u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *,
+    u_int32_t *, boolean_t drvmgmt);
 static int fq_if_request_classq(struct ifclassq *ifq, cqrq_t op, void *arg);
 void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat);
 static void fq_if_purge(fq_if_t *);
@@ -75,26 +74,25 @@ static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl,
        (STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \
        STAILQ_EMPTY(&(_fcl_)->fcl_old_flows))
 
-typedef void (* fq_if_append_pkt_t)(void *, void *);
+typedef void (* fq_if_append_pkt_t)(classq_pkt_t *, classq_pkt_t *);
 typedef boolean_t (* fq_getq_flow_t)(fq_if_t *, fq_if_classq_t *, fq_t *,
-    u_int32_t, u_int32_t, void **, void **, u_int32_t *, u_int32_t *,
-    boolean_t *, u_int32_t);
+    u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *,
+    u_int32_t *, boolean_t *, u_int32_t);
 
 static void
-fq_if_append_mbuf(void *pkt, void *next_pkt)
+fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
 {
-       ((mbuf_t)pkt)->m_nextpkt = (mbuf_t)next_pkt;
+       pkt->cp_mbuf->m_nextpkt = next_pkt->cp_mbuf;
 }
 
 
 
 static boolean_t
 fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
-    u_int32_t byte_limit, u_int32_t pkt_limit, void **top, void **last,
-    u_int32_t *byte_cnt, u_int32_t *pkt_cnt, boolean_t *qempty,
-    u_int32_t pflags)
+    u_int32_t byte_limit, u_int32_t pkt_limit, classq_pkt_t *top,
+    classq_pkt_t *last, u_int32_t *byte_cnt, u_int32_t *pkt_cnt,
+    boolean_t *qempty, u_int32_t pflags)
 {
-       struct mbuf *m;
        u_int32_t plen;
        pktsched_pkt_t pkt;
        boolean_t limit_reached = FALSE;
@@ -104,28 +102,28 @@ fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
        while (fq->fq_deficit > 0 && limit_reached == FALSE &&
            !MBUFQ_EMPTY(&fq->fq_mbufq)) {
                _PKTSCHED_PKT_INIT(&pkt);
-               m = fq_getq_flow(fqs, fq, &pkt);
+               fq_getq_flow(fqs, fq, &pkt);
                ASSERT(pkt.pktsched_ptype == QP_MBUF);
 
                plen = pktsched_get_pkt_len(&pkt);
                fq->fq_deficit -= plen;
-               m->m_pkthdr.pkt_flags |= pflags;
+               pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= pflags;
 
-               if (*top == NULL) {
-                       *top = m;
+               if (top->cp_mbuf == NULL) {
+                       *top = pkt.pktsched_pkt;
                } else {
-                       ASSERT(*last != NULL);
-                       ASSERT((*(struct mbuf **)last)->m_nextpkt == NULL);
-                       (*(struct mbuf **)last)->m_nextpkt = m;
+                       ASSERT(last->cp_mbuf != NULL);
+                       ASSERT(last->cp_mbuf->m_nextpkt == NULL);
+                       last->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf;
                }
-               *last = m;
-               (*(mbuf_t *)last)->m_nextpkt = NULL;
+               *last = pkt.pktsched_pkt;
+               last->cp_mbuf->m_nextpkt = NULL;
                fq_cl->fcl_stat.fcl_dequeue++;
                fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
                *pkt_cnt += 1;
                *byte_cnt += plen;
 
-               ifclassq_set_packet_metadata(ifq, ifp, m, QP_MBUF);
+               ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);
 
                /* Check if the limit is reached */
                if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
@@ -267,11 +265,10 @@ fq_if_classq_init(fq_if_t *fqs, u_int32_t pri, u_int32_t quantum,
     u_int32_t drr_max, u_int32_t svc_class)
 {
        fq_if_classq_t *fq_cl;
-
+       VERIFY(pri < FQ_IF_MAX_CLASSES);
        fq_cl = &fqs->fqs_classq[pri];
 
-       VERIFY(pri >= 0 && pri < FQ_IF_MAX_CLASSES &&
-           fq_cl->fcl_quantum == 0);
+       VERIFY(fq_cl->fcl_quantum == 0);
        fq_cl->fcl_quantum = quantum;
        fq_cl->fcl_pri = pri;
        fq_cl->fcl_drr_max = drr_max;
@@ -281,8 +278,7 @@ fq_if_classq_init(fq_if_t *fqs, u_int32_t pri, u_int32_t quantum,
 }
 
 int
-fq_if_enqueue_classq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
-    boolean_t *pdrop)
+fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *p, boolean_t *pdrop)
 {
        u_int32_t pri;
        fq_if_t *fqs;
@@ -292,18 +288,19 @@ fq_if_enqueue_classq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
        pktsched_pkt_t pkt;
 
        IFCQ_LOCK_ASSERT_HELD(ifq);
-       if ((ptype == QP_MBUF) && !(((mbuf_t)p)->m_flags & M_PKTHDR)) {
+       if ((p->cp_ptype == QP_MBUF) && !(p->cp_mbuf->m_flags & M_PKTHDR)) {
                IFCQ_CONVERT_LOCK(ifq);
-               m_freem((mbuf_t)p);
+               m_freem(p->cp_mbuf);
+               *p = CLASSQ_PKT_INITIALIZER(*p);
                *pdrop = TRUE;
                return ENOBUFS;
        }
-       pktsched_pkt_encap(&pkt, ptype, p);
+       pktsched_pkt_encap(&pkt, p);
 
        fqs = (fq_if_t *)ifq->ifcq_disc;
        svc = pktsched_get_pkt_svc(&pkt);
        pri = fq_if_service_to_priority(fqs, svc);
-       VERIFY(pri >= 0 && pri < FQ_IF_MAX_CLASSES);
+       VERIFY(pri < FQ_IF_MAX_CLASSES);
        fq_cl = &fqs->fqs_classq[pri];
 
        if (svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1) {
@@ -357,21 +354,17 @@ fq_if_enqueue_classq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
        return ret;
 }
 
-static void *
-fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_type_t *ptype)
+static void
+fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_t *pkt)
 {
-       void *top;
-
        (void) fq_if_dequeue_classq_multi(ifq, 1,
-           CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &top, NULL, NULL, NULL, ptype);
-       return top;
+           CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL);
 }
 
-static void *
+static void
 fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc,
-    classq_pkt_type_t *ptype)
+    classq_pkt_t *pkt)
 {
-       void *top;
        fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
        fq_if_classq_t *fq_cl;
        u_int32_t pri;
@@ -380,22 +373,23 @@ fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc,
        fq_cl = &fqs->fqs_classq[pri];
 
        fq_if_dequeue(fqs, fq_cl, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
-           &top, NULL, NULL, NULL, TRUE, ptype);
-       return top;
+           pkt, NULL, NULL, NULL, TRUE);
 }
 
 int
 fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
-    u_int32_t maxbytecnt, void **first_packet,
-    void **last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
-    classq_pkt_type_t *ptype)
+    u_int32_t maxbytecnt, classq_pkt_t *first_packet,
+    classq_pkt_t *last_packet, u_int32_t *retpktcnt,
+    u_int32_t *retbytecnt)
 {
-       void *top = NULL, *tail = NULL, *first, *last;
-       u_int32_t pktcnt = 0, bytecnt = 0, total_pktcnt, total_bytecnt;
-       fq_if_t *fqs;
+       u_int32_t pktcnt = 0, bytecnt = 0, total_pktcnt = 0, total_bytecnt = 0;
+       classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
+       classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
+       classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp);
+       fq_if_append_pkt_t append_pkt;
        fq_if_classq_t *fq_cl;
+       fq_if_t *fqs;
        int pri;
-       fq_if_append_pkt_t append_pkt;
 
        IFCQ_LOCK_ASSERT_HELD(ifq);
 
@@ -410,14 +404,13 @@ fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
-       first = last = NULL;
-       total_pktcnt = total_bytecnt = 0;
-       *ptype = fqs->fqs_ptype;
-
        for (;;) {
-               classq_pkt_type_t tmp_ptype;
+               classq_pkt_t top = CLASSQ_PKT_INITIALIZER(top);
+               classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
+
                if (fqs->fqs_bitmaps[FQ_IF_ER] == 0 &&
                    fqs->fqs_bitmaps[FQ_IF_EB] == 0) {
                        fqs->fqs_bitmaps[FQ_IF_EB] = fqs->fqs_bitmaps[FQ_IF_IB];
@@ -454,22 +447,21 @@ fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
                }
                fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
                    (maxbytecnt - total_bytecnt), &top, &tail, &pktcnt,
-                   &bytecnt, FALSE, &tmp_ptype);
-               if (top != NULL) {
-                       ASSERT(tmp_ptype == *ptype);
+                   &bytecnt, FALSE);
+               if (top.cp_mbuf != NULL) {
                        ASSERT(pktcnt > 0 && bytecnt > 0);
-                       if (first == NULL) {
+                       if (first.cp_mbuf == NULL) {
                                first = top;
-                               last = tail;
                                total_pktcnt = pktcnt;
                                total_bytecnt = bytecnt;
                        } else {
-                               append_pkt(last, top);
-                               last = tail;
+                               ASSERT(last.cp_mbuf != NULL);
+                               append_pkt(&last, &top);
                                total_pktcnt += pktcnt;
                                total_bytecnt += bytecnt;
                        }
-                       append_pkt(last, NULL);
+                       last = tail;
+                       append_pkt(&last, &tmp);
                        fq_cl->fcl_budget -= bytecnt;
                        pktcnt = 0;
                        bytecnt = 0;
@@ -498,49 +490,35 @@ state_change:
                        break;
                }
        }
-       if (first != NULL) {
-               if (first_packet != NULL) {
-                       *first_packet = first;
-               }
-               if (last_packet != NULL) {
-                       *last_packet = last;
-               }
-               if (retpktcnt != NULL) {
-                       *retpktcnt = total_pktcnt;
-               }
-               if (retbytecnt != NULL) {
-                       *retbytecnt = total_bytecnt;
-               }
-               IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
-       } else {
-               if (first_packet != NULL) {
-                       *first_packet = NULL;
-               }
-               if (last_packet != NULL) {
-                       *last_packet = NULL;
-               }
-               if (retpktcnt != NULL) {
-                       *retpktcnt = 0;
-               }
-               if (retbytecnt != NULL) {
-                       *retbytecnt = 0;
-               }
+
+       if (__probable(first_packet != NULL)) {
+               *first_packet = first;
+       }
+       if (last_packet != NULL) {
+               *last_packet = last;
        }
+       if (retpktcnt != NULL) {
+               *retpktcnt = total_pktcnt;
+       }
+       if (retbytecnt != NULL) {
+               *retbytecnt = total_bytecnt;
+       }
+
+       IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
        return 0;
 }
 
 int
 fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
-    u_int32_t maxpktcnt, u_int32_t maxbytecnt, void **first_packet,
-    void **last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
-    classq_pkt_type_t *ptype)
+    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
+    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt)
 {
-#pragma unused(maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt)
        fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
        u_int32_t pri;
        u_int32_t total_pktcnt = 0, total_bytecnt = 0;
        fq_if_classq_t *fq_cl;
-       void *first = NULL, *last = NULL;
+       classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
+       classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
        fq_if_append_pkt_t append_pkt;
 
        switch (fqs->fqs_ptype) {
@@ -552,11 +530,11 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        pri = fq_if_service_to_priority(fqs, svc);
        fq_cl = &fqs->fqs_classq[pri];
-
        /*
         * Now we have the queue for a particular service class. We need
         * to dequeue as many packets as needed, first from the new flows
@@ -564,49 +542,41 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
         */
        while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt &&
            fq_cl->fcl_stat.fcl_pkt_cnt > 0) {
-               void *top, *tail;
+               classq_pkt_t top = CLASSQ_PKT_INITIALIZER(top);
+               classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
                u_int32_t pktcnt = 0, bytecnt = 0;
+
                fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
                    (maxbytecnt - total_bytecnt), &top, &tail, &pktcnt,
-                   &bytecnt, TRUE, ptype);
-               if (first == NULL) {
-                       first = top;
-                       total_pktcnt = pktcnt;
-                       total_bytecnt = bytecnt;
-               } else {
-                       append_pkt(last, top);
-                       total_pktcnt += pktcnt;
-                       total_bytecnt += bytecnt;
+                   &bytecnt, TRUE);
+               if (top.cp_mbuf != NULL) {
+                       if (first.cp_mbuf == NULL) {
+                               first = top;
+                               total_pktcnt = pktcnt;
+                               total_bytecnt = bytecnt;
+                       } else {
+                               ASSERT(last.cp_mbuf != NULL);
+                               append_pkt(&last, &top);
+                               total_pktcnt += pktcnt;
+                               total_bytecnt += bytecnt;
+                       }
+                       last = tail;
                }
-               last = tail;
        }
-       if (first != NULL) {
-               if (first_packet != NULL) {
-                       *first_packet = first;
-               }
-               if (last_packet != NULL) {
-                       *last_packet = last;
-               }
-               if (retpktcnt != NULL) {
-                       *retpktcnt = total_pktcnt;
-               }
-               if (retbytecnt != NULL) {
-                       *retbytecnt = total_bytecnt;
-               }
-       } else {
-               if (first_packet != NULL) {
-                       *first_packet = NULL;
-               }
-               if (last_packet != NULL) {
-                       *last_packet = NULL;
-               }
-               if (retpktcnt != NULL) {
-                       *retpktcnt = 0;
-               }
-               if (retbytecnt != NULL) {
-                       *retbytecnt = 0;
-               }
+
+       if (__probable(first_packet != NULL)) {
+               *first_packet = first;
+       }
+       if (last_packet != NULL) {
+               *last_packet = last;
        }
+       if (retpktcnt != NULL) {
+               *retpktcnt = total_pktcnt;
+       }
+       if (retbytecnt != NULL) {
+               *retbytecnt = total_bytecnt;
+       }
+
        return 0;
 }
 
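Both dequeue-multi loops above share one chain-building pattern: the first non-empty batch seeds `first`, each later batch is linked onto the running tail through the append callback (an m_nextpkt store for mbufs), and `last` always tracks the batch tail. Reduced to its core:

    /* per dequeued batch (top .. tail): */
    if (first.cp_mbuf == NULL) {
        first = top;                    /* chain starts with this batch */
    } else {
        ASSERT(last.cp_mbuf != NULL);
        append_pkt(&last, &top);        /* last->m_nextpkt = top */
    }
    last = tail;                        /* tail of the grown chain */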
@@ -621,7 +591,12 @@ fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, u_int32_t *pktsp,
        fq_cl = &fqs->fqs_classq[fq->fq_sc_index];
        pkts = bytes = 0;
        _PKTSCHED_PKT_INIT(&pkt);
-       while (fq_getq_flow(fqs, fq, &pkt) != NULL) {
+       for (;;) {
+               fq_getq_flow(fqs, fq, &pkt);
+               if (pkt.pktsched_pkt_mbuf == NULL) {
+                       VERIFY(pkt.pktsched_ptype == QP_INVALID);
+                       break;
+               }
                pkts++;
                bytes += pktsched_get_pkt_len(&pkt);
                pktsched_free_pkt(&pkt);
@@ -1007,7 +982,7 @@ fq_if_drop_packet(fq_if_t *fqs)
        fq_t *fq = fqs->fqs_large_flow;
        fq_if_classq_t *fq_cl;
        pktsched_pkt_t pkt;
-       uint32_t *pkt_flags;
+       volatile uint32_t *pkt_flags;
        uint64_t *pkt_timestamp;
 
        if (fq == NULL) {
@@ -1018,15 +993,22 @@ fq_if_drop_packet(fq_if_t *fqs)
 
        fq_cl = &fqs->fqs_classq[fq->fq_sc_index];
        _PKTSCHED_PKT_INIT(&pkt);
-       (void)fq_getq_flow_internal(fqs, fq, &pkt);
+       fq_getq_flow_internal(fqs, fq, &pkt);
+       ASSERT(pkt.pktsched_ptype != QP_INVALID);
 
        pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
            NULL, NULL);
 
        IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
        *pkt_timestamp = 0;
-       if (pkt.pktsched_ptype == QP_MBUF) {
+       switch (pkt.pktsched_ptype) {
+       case QP_MBUF:
                *pkt_flags &= ~PKTF_PRIV_GUARDED;
+               break;
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        if (fq_empty(fq)) {
@@ -1115,15 +1097,14 @@ fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl)
 
 void
 fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, u_int32_t pktlimit,
-    u_int32_t bytelimit, void **top, void **tail,
-    u_int32_t *retpktcnt, u_int32_t *retbytecnt, boolean_t drvmgmt,
-    classq_pkt_type_t *ptype)
+    u_int32_t bytelimit, classq_pkt_t *top, classq_pkt_t *tail,
+    u_int32_t *retpktcnt, u_int32_t *retbytecnt, boolean_t drvmgmt)
 {
        fq_t *fq = NULL, *tfq = NULL;
        flowq_stailq_t temp_stailq;
        u_int32_t pktcnt, bytecnt;
        boolean_t qempty, limit_reached = FALSE;
-       void *last = NULL;
+       classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
        fq_getq_flow_t fq_getq_flow_fn;
 
        switch (fqs->fqs_ptype) {
@@ -1135,6 +1116,7 @@ fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, u_int32_t pktlimit,
        default:
                VERIFY(0);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        /*
@@ -1146,9 +1128,6 @@ fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, u_int32_t pktlimit,
        }
 
        VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL);
-
-       *top = NULL;
-       *ptype = fqs->fqs_ptype;
        pktcnt = bytecnt = 0;
        STAILQ_INIT(&temp_stailq);
 
@@ -1201,8 +1180,8 @@ done:
                fq_cl->fcl_old_flows = temp_stailq;
        }
 
-       if (last != NULL) {
-               VERIFY(*top != NULL);
+       if (last.cp_mbuf != NULL) {
+               VERIFY(top->cp_mbuf != NULL);
                if (tail != NULL) {
                        *tail = last;
                }
diff --git a/bsd/net/pktsched/pktsched_netem.c b/bsd/net/pktsched/pktsched_netem.c
new file mode 100644 (file)
index 0000000..5344b01
--- /dev/null
@@ -0,0 +1,1523 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/thread.h>
+#include <kern/sched_prim.h>
+#include <dev/random/randomdev.h>
+
+#include <net/if.h>
+#include <net/classq/classq.h>
+#include <net/pktsched/pktsched.h>
+#include <net/pktsched/pktsched_netem.h>
+
+enum {
+       NETEM_LOG_ERROR = 0,
+       NETEM_LOG_INFO = 1,
+       NETEM_LOG_DEBUG = 2,
+       NETEM_LOG_HIDEBUG = 3,
+};
+
+#define NETEM_HEAP_SIZE 1024
+#define NETEM_PSCALE    IF_NETEM_PARAMS_PSCALE
+
+#define netem_log(_level, _fmt, ...)                            \
+       do {                                                    \
+               if (pktsched_verbose > _level) {                \
+                       log(LOG_DEBUG, "NETEM: %-30s "_fmt "\n",\
+                           __FUNCTION__, ##__VA_ARGS__);       \
+               }                                               \
+       } while (0)
+
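netem_log() keys off the global pktsched_verbose knob, so each level is progressively noisier and a call is a runtime no-op unless the knob exceeds the level. An illustrative call site (`pkt_len` is an assumed local):

    netem_log(NETEM_LOG_DEBUG, "enqueue len %u", pkt_len);
    /* printed only when pktsched_verbose > NETEM_LOG_DEBUG (2) */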
+extern kern_return_t thread_terminate(thread_t);
+
+static lck_attr_t       *netem_lock_attr;
+static lck_grp_t        *netem_lock_group;
+static lck_grp_attr_t   *netem_lock_group_attr;
+static int              __netem_inited = 0;
+
+static const int32_t NORM_DIST_SCALE = 8192;
+/* normal distribution lookup table */
+static int32_t norm_dist_table[] =
+{
+       -32768, -28307, -26871, -25967, -25298, -24765, -24320, -23937,
+       -23600, -23298, -23025, -22776, -22546, -22333, -22133, -21946,
+       -21770, -21604, -21445, -21295, -21151, -21013, -20882, -20755,
+       -20633, -20516, -20403, -20293, -20187, -20084, -19984, -19887,
+       -19793, -19702, -19612, -19526, -19441, -19358, -19277, -19198,
+       -19121, -19045, -18971, -18899, -18828, -18758, -18690, -18623,
+       -18557, -18492, -18429, -18366, -18305, -18245, -18185, -18127,
+       -18070, -18013, -17957, -17902, -17848, -17794, -17741, -17690,
+       -17638, -17588, -17538, -17489, -17440, -17392, -17345, -17298,
+       -17252, -17206, -17160, -17116, -17071, -17028, -16984, -16942,
+       -16899, -16857, -16816, -16775, -16735, -16694, -16654, -16615,
+       -16576, -16538, -16499, -16461, -16424, -16386, -16350, -16313,
+       -16277, -16241, -16205, -16170, -16135, -16100, -16066, -16031,
+       -15998, -15964, -15931, -15897, -15865, -15832, -15800, -15768,
+       -15736, -15704, -15673, -15642, -15611, -15580, -15550, -15519,
+       -15489, -15460, -15430, -15401, -15371, -15342, -15313, -15285,
+       -15256, -15228, -15200, -15172, -15144, -15116, -15089, -15062,
+       -15035, -15008, -14981, -14954, -14928, -14902, -14875, -14850,
+       -14823, -14798, -14772, -14747, -14722, -14696, -14671, -14647,
+       -14622, -14597, -14573, -14549, -14524, -14500, -14476, -14453,
+       -14429, -14405, -14382, -14359, -14335, -14312, -14289, -14266,
+       -14243, -14221, -14198, -14176, -14153, -14131, -14109, -14087,
+       -14065, -14043, -14021, -14000, -13978, -13957, -13935, -13914,
+       -13893, -13872, -13851, -13830, -13809, -13788, -13768, -13747,
+       -13727, -13706, -13686, -13666, -13646, -13626, -13606, -13586,
+       -13566, -13547, -13527, -13507, -13488, -13468, -13449, -13430,
+       -13411, -13392, -13373, -13354, -13335, -13316, -13297, -13278,
+       -13260, -13242, -13223, -13204, -13186, -13168, -13150, -13131,
+       -13113, -13095, -13077, -13060, -13042, -13024, -13006, -12988,
+       -12971, -12954, -12936, -12918, -12901, -12884, -12867, -12850,
+       -12832, -12815, -12798, -12781, -12764, -12748, -12731, -12714,
+       -12697, -12681, -12664, -12648, -12631, -12615, -12598, -12582,
+       -12566, -12549, -12533, -12517, -12501, -12485, -12469, -12453,
+       -12437, -12422, -12406, -12390, -12374, -12358, -12343, -12327,
+       -12312, -12296, -12281, -12265, -12250, -12235, -12220, -12204,
+       -12189, -12174, -12159, -12144, -12129, -12114, -12099, -12084,
+       -12069, -12054, -12039, -12025, -12010, -11995, -11981, -11966,
+       -11952, -11937, -11923, -11908, -11894, -11879, -11865, -11851,
+       -11837, -11822, -11808, -11794, -11780, -11766, -11752, -11737,
+       -11724, -11710, -11696, -11682, -11668, -11654, -11640, -11627,
+       -11613, -11599, -11586, -11572, -11559, -11545, -11531, -11518,
+       -11504, -11491, -11478, -11464, -11451, -11438, -11425, -11411,
+       -11398, -11385, -11372, -11359, -11346, -11332, -11319, -11306,
+       -11293, -11280, -11268, -11255, -11242, -11229, -11216, -11203,
+       -11191, -11178, -11165, -11153, -11140, -11127, -11114, -11102,
+       -11090, -11077, -11065, -11052, -11040, -11027, -11015, -11002,
+       -10990, -10978, -10965, -10953, -10941, -10929, -10917, -10904,
+       -10892, -10880, -10868, -10856, -10844, -10832, -10820, -10808,
+       -10796, -10784, -10772, -10760, -10748, -10736, -10725, -10713,
+       -10701, -10689, -10677, -10666, -10654, -10643, -10631, -10619,
+       -10607, -10596, -10584, -10573, -10562, -10550, -10539, -10527,
+       -10516, -10504, -10493, -10481, -10470, -10459, -10447, -10436,
+       -10425, -10414, -10402, -10391, -10380, -10369, -10358, -10346,
+       -10335, -10324, -10313, -10302, -10291, -10280, -10269, -10258,
+       -10247, -10236, -10225, -10214, -10203, -10192, -10181, -10171,
+       -10160, -10149, -10138, -10127, -10117, -10106, -10095, -10085,
+       -10074, -10063, -10052, -10042, -10031, -10021, -10010, -10000,
+       -9989, -9978, -9968, -9957, -9947, -9936, -9926, -9916,
+       -9905, -9895, -9884, -9874, -9864, -9853, -9843, -9833,
+       -9822, -9812, -9802, -9791, -9781, -9771, -9761, -9751,
+       -9741, -9730, -9720, -9710, -9700, -9690, -9680, -9670,
+       -9660, -9650, -9640, -9630, -9619, -9610, -9600, -9590,
+       -9580, -9570, -9560, -9550, -9540, -9530, -9520, -9511,
+       -9501, -9491, -9481, -9472, -9462, -9452, -9442, -9432,
+       -9423, -9413, -9403, -9394, -9384, -9374, -9365, -9355,
+       -9345, -9336, -9326, -9317, -9307, -9298, -9288, -9278,
+       -9269, -9259, -9250, -9241, -9231, -9221, -9212, -9202,
+       -9193, -9184, -9175, -9165, -9156, -9146, -9137, -9128,
+       -9119, -9109, -9100, -9090, -9081, -9072, -9063, -9053,
+       -9044, -9035, -9026, -9017, -9008, -8998, -8989, -8980,
+       -8971, -8962, -8953, -8944, -8934, -8925, -8916, -8907,
+       -8898, -8889, -8880, -8871, -8862, -8853, -8844, -8835,
+       -8826, -8817, -8808, -8799, -8790, -8781, -8772, -8764,
+       -8755, -8746, -8737, -8728, -8719, -8711, -8702, -8693,
+       -8684, -8675, -8667, -8658, -8649, -8640, -8632, -8623,
+       -8614, -8605, -8597, -8588, -8579, -8570, -8562, -8553,
+       -8545, -8536, -8527, -8519, -8510, -8502, -8493, -8484,
+       -8476, -8467, -8459, -8450, -8442, -8433, -8425, -8416,
+       -8408, -8399, -8391, -8382, -8374, -8365, -8357, -8348,
+       -8340, -8332, -8323, -8315, -8306, -8298, -8290, -8281,
+       -8273, -8264, -8256, -8248, -8240, -8231, -8223, -8215,
+       -8206, -8198, -8190, -8182, -8174, -8165, -8157, -8149,
+       -8140, -8132, -8124, -8116, -8108, -8099, -8091, -8083,
+       -8075, -8067, -8059, -8051, -8042, -8034, -8027, -8018,
+       -8010, -8002, -7994, -7986, -7978, -7970, -7962, -7954,
+       -7946, -7938, -7930, -7922, -7913, -7906, -7897, -7890,
+       -7882, -7874, -7866, -7858, -7850, -7842, -7834, -7826,
+       -7818, -7810, -7802, -7795, -7787, -7779, -7771, -7763,
+       -7755, -7748, -7739, -7732, -7724, -7716, -7708, -7700,
+       -7693, -7685, -7677, -7669, -7662, -7654, -7646, -7638,
+       -7630, -7623, -7615, -7608, -7600, -7592, -7584, -7577,
+       -7569, -7561, -7553, -7546, -7538, -7530, -7523, -7515,
+       -7508, -7500, -7492, -7485, -7477, -7469, -7462, -7454,
+       -7447, -7439, -7432, -7424, -7417, -7409, -7401, -7394,
+       -7386, -7379, -7372, -7364, -7356, -7349, -7341, -7334,
+       -7327, -7319, -7311, -7304, -7297, -7289, -7281, -7274,
+       -7267, -7259, -7252, -7245, -7237, -7230, -7222, -7215,
+       -7208, -7200, -7193, -7186, -7178, -7171, -7163, -7156,
+       -7149, -7141, -7134, -7127, -7119, -7112, -7105, -7098,
+       -7090, -7083, -7075, -7068, -7061, -7054, -7046, -7039,
+       -7032, -7025, -7018, -7010, -7003, -6996, -6989, -6981,
+       -6974, -6967, -6960, -6953, -6946, -6938, -6931, -6924,
+       -6917, -6910, -6903, -6895, -6888, -6881, -6874, -6867,
+       -6860, -6853, -6845, -6838, -6831, -6824, -6817, -6810,
+       -6803, -6796, -6789, -6782, -6775, -6767, -6760, -6753,
+       -6747, -6740, -6732, -6725, -6718, -6711, -6704, -6697,
+       -6690, -6683, -6676, -6669, -6662, -6655, -6648, -6641,
+       -6634, -6627, -6620, -6613, -6607, -6600, -6593, -6586,
+       -6579, -6572, -6565, -6558, -6551, -6544, -6538, -6531,
+       -6524, -6517, -6510, -6503, -6496, -6489, -6482, -6476,
+       -6469, -6462, -6455, -6448, -6441, -6434, -6428, -6421,
+       -6414, -6407, -6400, -6394, -6387, -6380, -6373, -6366,
+       -6360, -6353, -6346, -6339, -6333, -6326, -6319, -6312,
+       -6306, -6299, -6292, -6286, -6279, -6272, -6265, -6259,
+       -6252, -6245, -6239, -6232, -6225, -6219, -6212, -6205,
+       -6198, -6192, -6185, -6178, -6172, -6165, -6158, -6152,
+       -6145, -6139, -6132, -6125, -6119, -6112, -6105, -6099,
+       -6092, -6085, -6079, -6072, -6066, -6059, -6053, -6046,
+       -6040, -6033, -6026, -6019, -6013, -6006, -6000, -5993,
+       -5987, -5980, -5974, -5967, -5961, -5954, -5948, -5941,
+       -5935, -5928, -5922, -5915, -5908, -5902, -5895, -5889,
+       -5883, -5876, -5870, -5863, -5857, -5850, -5844, -5837,
+       -5831, -5825, -5818, -5811, -5805, -5799, -5792, -5786,
+       -5779, -5773, -5766, -5760, -5754, -5747, -5741, -5734,
+       -5728, -5722, -5715, -5709, -5702, -5696, -5690, -5683,
+       -5677, -5671, -5664, -5658, -5651, -5645, -5639, -5632,
+       -5626, -5620, -5613, -5607, -5600, -5594, -5588, -5582,
+       -5575, -5569, -5563, -5556, -5550, -5544, -5537, -5531,
+       -5525, -5519, -5512, -5506, -5500, -5494, -5487, -5481,
+       -5475, -5468, -5462, -5456, -5450, -5443, -5437, -5431,
+       -5425, -5418, -5412, -5406, -5400, -5393, -5387, -5381,
+       -5375, -5369, -5362, -5356, -5350, -5344, -5337, -5331,
+       -5325, -5319, -5313, -5306, -5300, -5294, -5288, -5282,
+       -5276, -5270, -5263, -5257, -5251, -5245, -5239, -5233,
+       -5226, -5220, -5214, -5208, -5202, -5196, -5190, -5183,
+       -5177, -5171, -5165, -5159, -5153, -5147, -5140, -5135,
+       -5129, -5122, -5116, -5110, -5104, -5098, -5092, -5086,
+       -5080, -5074, -5068, -5061, -5055, -5050, -5043, -5037,
+       -5031, -5025, -5019, -5013, -5007, -5001, -4995, -4989,
+       -4983, -4977, -4971, -4965, -4959, -4953, -4947, -4941,
+       -4935, -4929, -4923, -4917, -4911, -4905, -4899, -4893,
+       -4887, -4881, -4875, -4869, -4863, -4857, -4851, -4845,
+       -4839, -4833, -4827, -4821, -4815, -4809, -4803, -4797,
+       -4791, -4785, -4779, -4773, -4767, -4762, -4755, -4750,
+       -4744, -4738, -4732, -4726, -4720, -4714, -4708, -4702,
+       -4696, -4690, -4685, -4678, -4673, -4667, -4661, -4655,
+       -4649, -4643, -4637, -4631, -4626, -4620, -4614, -4608,
+       -4602, -4596, -4590, -4585, -4579, -4573, -4567, -4561,
+       -4555, -4549, -4544, -4538, -4532, -4526, -4520, -4514,
+       -4508, -4503, -4497, -4491, -4485, -4479, -4474, -4468,
+       -4462, -4456, -4450, -4445, -4439, -4433, -4427, -4421,
+       -4415, -4410, -4404, -4398, -4392, -4386, -4381, -4375,
+       -4369, -4363, -4358, -4352, -4346, -4340, -4334, -4329,
+       -4323, -4317, -4311, -4306, -4300, -4294, -4289, -4283,
+       -4277, -4271, -4266, -4260, -4254, -4248, -4243, -4237,
+       -4231, -4225, -4220, -4214, -4208, -4202, -4197, -4191,
+       -4185, -4180, -4174, -4168, -4162, -4157, -4151, -4146,
+       -4140, -4134, -4128, -4123, -4117, -4111, -4105, -4100,
+       -4094, -4089, -4083, -4077, -4071, -4066, -4060, -4055,
+       -4049, -4043, -4037, -4032, -4026, -4021, -4015, -4009,
+       -4003, -3998, -3992, -3987, -3981, -3975, -3970, -3964,
+       -3958, -3953, -3947, -3942, -3936, -3930, -3925, -3919,
+       -3913, -3908, -3902, -3897, -3891, -3885, -3880, -3874,
+       -3869, -3863, -3857, -3852, -3846, -3840, -3835, -3829,
+       -3824, -3818, -3813, -3807, -3801, -3796, -3790, -3785,
+       -3779, -3774, -3768, -3762, -3757, -3751, -3746, -3740,
+       -3734, -3729, -3723, -3718, -3712, -3707, -3701, -3696,
+       -3690, -3684, -3679, -3673, -3668, -3662, -3657, -3651,
+       -3646, -3640, -3635, -3629, -3624, -3618, -3613, -3607,
+       -3602, -3596, -3591, -3585, -3579, -3574, -3568, -3563,
+       -3557, -3552, -3546, -3541, -3535, -3530, -3524, -3519,
+       -3514, -3508, -3502, -3497, -3491, -3486, -3480, -3475,
+       -3469, -3464, -3459, -3453, -3448, -3442, -3437, -3431,
+       -3425, -3420, -3415, -3409, -3404, -3398, -3393, -3387,
+       -3382, -3376, -3371, -3366, -3360, -3355, -3349, -3344,
+       -3338, -3333, -3328, -3322, -3317, -3311, -3305, -3300,
+       -3295, -3289, -3284, -3278, -3273, -3268, -3262, -3257,
+       -3251, -3246, -3240, -3235, -3230, -3224, -3219, -3213,
+       -3208, -3203, -3197, -3192, -3186, -3181, -3176, -3170,
+       -3165, -3159, -3154, -3149, -3143, -3138, -3132, -3127,
+       -3122, -3116, -3111, -3105, -3100, -3095, -3089, -3084,
+       -3079, -3073, -3068, -3062, -3057, -3052, -3046, -3041,
+       -3036, -3030, -3025, -3019, -3014, -3009, -3003, -2998,
+       -2993, -2987, -2982, -2977, -2971, -2966, -2961, -2955,
+       -2950, -2944, -2939, -2934, -2928, -2923, -2918, -2912,
+       -2907, -2902, -2896, -2891, -2886, -2880, -2875, -2870,
+       -2864, -2859, -2854, -2848, -2843, -2838, -2832, -2827,
+       -2822, -2816, -2811, -2806, -2800, -2795, -2790, -2784,
+       -2779, -2774, -2768, -2763, -2758, -2753, -2747, -2742,
+       -2737, -2732, -2726, -2721, -2716, -2710, -2705, -2700,
+       -2694, -2689, -2684, -2678, -2673, -2668, -2663, -2657,
+       -2652, -2647, -2642, -2636, -2631, -2626, -2620, -2615,
+       -2610, -2605, -2599, -2594, -2589, -2583, -2578, -2573,
+       -2568, -2562, -2557, -2552, -2546, -2542, -2536, -2531,
+       -2526, -2520, -2515, -2510, -2505, -2499, -2494, -2489,
+       -2483, -2478, -2473, -2468, -2463, -2457, -2452, -2447,
+       -2442, -2436, -2431, -2426, -2421, -2415, -2410, -2405,
+       -2400, -2395, -2389, -2384, -2379, -2374, -2368, -2363,
+       -2358, -2353, -2347, -2342, -2337, -2332, -2327, -2321,
+       -2316, -2311, -2306, -2300, -2295, -2290, -2285, -2279,
+       -2275, -2269, -2264, -2259, -2254, -2248, -2243, -2238,
+       -2233, -2227, -2222, -2217, -2212, -2207, -2202, -2196,
+       -2191, -2186, -2181, -2175, -2170, -2165, -2160, -2155,
+       -2150, -2144, -2139, -2134, -2129, -2124, -2118, -2113,
+       -2108, -2103, -2098, -2093, -2087, -2082, -2077, -2072,
+       -2067, -2062, -2056, -2051, -2046, -2041, -2036, -2030,
+       -2025, -2020, -2015, -2010, -2005, -2000, -1994, -1989,
+       -1984, -1979, -1974, -1969, -1963, -1958, -1953, -1948,
+       -1943, -1937, -1932, -1927, -1922, -1917, -1912, -1907,
+       -1901, -1896, -1891, -1886, -1881, -1876, -1871, -1865,
+       -1860, -1855, -1850, -1845, -1840, -1835, -1829, -1824,
+       -1819, -1814, -1809, -1804, -1799, -1794, -1788, -1783,
+       -1778, -1773, -1768, -1763, -1758, -1752, -1747, -1742,
+       -1737, -1732, -1727, -1722, -1717, -1711, -1706, -1701,
+       -1696, -1691, -1686, -1681, -1676, -1670, -1665, -1660,
+       -1655, -1650, -1645, -1640, -1635, -1629, -1624, -1619,
+       -1614, -1609, -1604, -1599, -1594, -1589, -1584, -1579,
+       -1573, -1568, -1563, -1558, -1553, -1548, -1543, -1538,
+       -1532, -1527, -1522, -1517, -1512, -1507, -1502, -1497,
+       -1492, -1486, -1482, -1477, -1471, -1466, -1461, -1456,
+       -1451, -1446, -1441, -1436, -1431, -1425, -1420, -1415,
+       -1410, -1405, -1400, -1395, -1390, -1385, -1380, -1375,
+       -1370, -1364, -1359, -1354, -1349, -1344, -1339, -1334,
+       -1329, -1324, -1319, -1314, -1309, -1303, -1298, -1294,
+       -1288, -1283, -1278, -1273, -1268, -1263, -1258, -1253,
+       -1248, -1243, -1237, -1232, -1228, -1222, -1217, -1212,
+       -1207, -1202, -1197, -1192, -1187, -1182, -1177, -1171,
+       -1167, -1162, -1156, -1151, -1146, -1141, -1136, -1131,
+       -1126, -1121, -1116, -1111, -1106, -1101, -1096, -1091,
+       -1085, -1081, -1076, -1070, -1065, -1060, -1055, -1050,
+       -1045, -1040, -1035, -1030, -1025, -1020, -1015, -1010,
+       -1005, -1000, -995, -990, -985, -979, -974, -970,
+       -964, -959, -954, -949, -944, -939, -934, -929,
+       -924, -919, -914, -909, -904, -899, -894, -889,
+       -884, -879, -874, -868, -863, -859, -853, -848,
+       -843, -838, -833, -828, -823, -818, -813, -808,
+       -803, -798, -793, -788, -783, -778, -773, -768,
+       -763, -758, -752, -748, -743, -738, -732, -727,
+       -723, -717, -712, -707, -702, -697, -692, -687,
+       -682, -677, -672, -667, -662, -657, -652, -647,
+       -642, -637, -632, -627, -622, -617, -612, -607,
+       -602, -597, -591, -587, -582, -577, -571, -566,
+       -562, -557, -551, -546, -541, -537, -531, -526,
+       -521, -516, -511, -506, -501, -496, -491, -486,
+       -481, -476, -471, -466, -461, -456, -451, -446,
+       -441, -436, -431, -426, -421, -416, -411, -406,
+       -401, -396, -391, -386, -381, -376, -371, -366,
+       -360, -356, -351, -346, -340, -335, -331, -326,
+       -320, -315, -310, -306, -300, -295, -290, -285,
+       -281, -275, -270, -265, -261, -255, -250, -245,
+       -240, -235, -230, -225, -220, -215, -210, -205,
+       -200, -195, -190, -185, -180, -175, -170, -165,
+       -160, -155, -150, -145, -140, -135, -130, -125,
+       -120, -115, -110, -105, -100, -95, -90, -85,
+       -80, -75, -70, -65, -60, -55, -50, -45,
+       -40, -35, -29, -25, -20, -15, -9, -5,
+       0, 5, 11, 16, 20, 25, 30, 36,
+       41, 45, 50, 56, 61, 66, 70, 76,
+       81, 86, 91, 96, 101, 106, 111, 116,
+       121, 126, 131, 136, 141, 146, 151, 156,
+       161, 166, 171, 176, 181, 186, 191, 196,
+       201, 206, 211, 216, 221, 226, 231, 236,
+       241, 246, 251, 256, 261, 266, 271, 276,
+       281, 286, 291, 296, 301, 306, 311, 316,
+       322, 326, 331, 336, 342, 347, 351, 356,
+       362, 367, 372, 376, 382, 387, 392, 396,
+       402, 407, 412, 417, 422, 427, 432, 437,
+       442, 447, 452, 457, 462, 467, 472, 477,
+       482, 487, 492, 497, 502, 507, 512, 517,
+       522, 527, 532, 537, 542, 547, 552, 557,
+       562, 567, 572, 578, 582, 587, 593, 598,
+       603, 607, 613, 618, 623, 628, 633, 638,
+       643, 648, 653, 658, 663, 668, 673, 678,
+       683, 688, 693, 698, 703, 708, 713, 718,
+       723, 728, 733, 739, 743, 748, 754, 759,
+       763, 768, 774, 779, 784, 789, 794, 799,
+       804, 809, 814, 819, 824, 829, 834, 839,
+       844, 849, 854, 859, 864, 869, 874, 879,
+       884, 890, 895, 899, 905, 910, 915, 920,
+       925, 930, 935, 940, 945, 950, 955, 960,
+       965, 970, 975, 980, 985, 990, 995, 1001,
+       1006, 1010, 1016, 1021, 1026, 1031, 1036, 1041,
+       1046, 1051, 1056, 1061, 1066, 1071, 1076, 1081,
+       1086, 1092, 1096, 1102, 1107, 1112, 1117, 1122,
+       1127, 1132, 1137, 1142, 1147, 1152, 1157, 1162,
+       1167, 1173, 1178, 1183, 1188, 1193, 1198, 1203,
+       1208, 1213, 1218, 1223, 1228, 1233, 1238, 1244,
+       1248, 1254, 1259, 1264, 1269, 1274, 1279, 1284,
+       1289, 1294, 1299, 1304, 1309, 1314, 1320, 1325,
+       1330, 1335, 1340, 1345, 1350, 1355, 1360, 1365,
+       1371, 1375, 1381, 1386, 1391, 1396, 1401, 1406,
+       1411, 1416, 1421, 1426, 1432, 1436, 1442, 1447,
+       1452, 1457, 1462, 1467, 1472, 1477, 1482, 1488,
+       1493, 1497, 1503, 1508, 1513, 1518, 1523, 1528,
+       1534, 1538, 1543, 1549, 1554, 1559, 1564, 1569,
+       1574, 1579, 1584, 1590, 1595, 1600, 1605, 1610,
+       1615, 1620, 1625, 1630, 1636, 1640, 1646, 1651,
+       1656, 1661, 1666, 1671, 1676, 1681, 1687, 1692,
+       1697, 1702, 1707, 1712, 1717, 1722, 1728, 1733,
+       1738, 1743, 1748, 1753, 1758, 1764, 1769, 1774,
+       1779, 1784, 1789, 1794, 1799, 1805, 1810, 1815,
+       1820, 1825, 1831, 1835, 1841, 1846, 1851, 1856,
+       1861, 1866, 1871, 1877, 1882, 1887, 1892, 1897,
+       1902, 1908, 1913, 1918, 1923, 1928, 1933, 1939,
+       1944, 1949, 1954, 1959, 1964, 1969, 1975, 1980,
+       1985, 1990, 1995, 2000, 2005, 2011, 2016, 2021,
+       2026, 2031, 2037, 2042, 2047, 2052, 2057, 2062,
+       2068, 2073, 2078, 2083, 2088, 2093, 2099, 2104,
+       2109, 2114, 2119, 2125, 2130, 2135, 2140, 2145,
+       2150, 2156, 2161, 2166, 2171, 2177, 2182, 2187,
+       2192, 2197, 2202, 2208, 2213, 2218, 2223, 2229,
+       2234, 2239, 2244, 2249, 2254, 2260, 2265, 2270,
+       2275, 2281, 2286, 2291, 2296, 2302, 2306, 2312,
+       2317, 2322, 2327, 2333, 2338, 2343, 2348, 2354,
+       2359, 2364, 2369, 2374, 2380, 2385, 2390, 2395,
+       2401, 2406, 2411, 2416, 2422, 2427, 2432, 2437,
+       2442, 2448, 2453, 2458, 2463, 2469, 2474, 2479,
+       2485, 2490, 2495, 2500, 2506, 2511, 2516, 2521,
+       2526, 2532, 2537, 2542, 2548, 2553, 2558, 2563,
+       2569, 2574, 2579, 2585, 2589, 2595, 2600, 2605,
+       2611, 2616, 2621, 2627, 2632, 2637, 2642, 2648,
+       2653, 2658, 2664, 2669, 2674, 2680, 2685, 2690,
+       2695, 2700, 2706, 2711, 2716, 2722, 2727, 2732,
+       2738, 2743, 2748, 2754, 2759, 2764, 2769, 2775,
+       2780, 2785, 2791, 2796, 2801, 2807, 2812, 2817,
+       2823, 2828, 2833, 2839, 2844, 2849, 2855, 2860,
+       2865, 2870, 2876, 2881, 2886, 2892, 2897, 2902,
+       2908, 2913, 2918, 2924, 2929, 2935, 2940, 2945,
+       2951, 2956, 2961, 2967, 2972, 2977, 2983, 2988,
+       2993, 2999, 3004, 3010, 3015, 3020, 3026, 3031,
+       3036, 3042, 3047, 3052, 3058, 3063, 3069, 3074,
+       3079, 3085, 3090, 3095, 3101, 3106, 3112, 3117,
+       3122, 3128, 3133, 3139, 3144, 3149, 3155, 3160,
+       3166, 3171, 3176, 3182, 3187, 3193, 3198, 3203,
+       3209, 3214, 3220, 3225, 3231, 3236, 3242, 3247,
+       3252, 3258, 3263, 3269, 3274, 3279, 3285, 3290,
+       3296, 3301, 3307, 3312, 3317, 3323, 3328, 3334,
+       3339, 3345, 3350, 3355, 3361, 3367, 3372, 3378,
+       3383, 3388, 3394, 3399, 3405, 3410, 3416, 3421,
+       3427, 3432, 3437, 3443, 3448, 3454, 3459, 3465,
+       3471, 3476, 3481, 3487, 3492, 3498, 3503, 3509,
+       3514, 3520, 3525, 3531, 3536, 3542, 3548, 3553,
+       3558, 3564, 3569, 3575, 3580, 3586, 3591, 3597,
+       3602, 3608, 3613, 3619, 3625, 3630, 3636, 3641,
+       3647, 3652, 3658, 3663, 3669, 3675, 3680, 3686,
+       3691, 3697, 3702, 3708, 3713, 3719, 3724, 3730,
+       3736, 3741, 3747, 3752, 3758, 3763, 3769, 3774,
+       3780, 3786, 3791, 3797, 3802, 3808, 3813, 3819,
+       3825, 3830, 3836, 3842, 3847, 3853, 3858, 3864,
+       3869, 3875, 3881, 3886, 3892, 3898, 3903, 3909,
+       3915, 3920, 3926, 3931, 3937, 3942, 3948, 3954,
+       3960, 3965, 3971, 3976, 3982, 3987, 3993, 3999,
+       4005, 4010, 4016, 4021, 4027, 4033, 4039, 4044,
+       4050, 4055, 4061, 4067, 4073, 4078, 4084, 4089,
+       4095, 4101, 4107, 4112, 4118, 4123, 4129, 4135,
+       4141, 4146, 4152, 4158, 4164, 4169, 4175, 4181,
+       4187, 4192, 4198, 4203, 4209, 4215, 4221, 4226,
+       4232, 4238, 4243, 4249, 4255, 4261, 4266, 4272,
+       4278, 4284, 4289, 4295, 4301, 4307, 4313, 4318,
+       4324, 4330, 4336, 4341, 4347, 4353, 4359, 4364,
+       4370, 4376, 4382, 4388, 4393, 4399, 4405, 4411,
+       4417, 4422, 4428, 4434, 4440, 4445, 4452, 4457,
+       4463, 4469, 4474, 4481, 4486, 4492, 4498, 4504,
+       4510, 4515, 4521, 4527, 4533, 4539, 4545, 4551,
+       4556, 4562, 4568, 4574, 4580, 4585, 4592, 4597,
+       4603, 4609, 4615, 4621, 4627, 4633, 4638, 4644,
+       4650, 4656, 4662, 4668, 4674, 4680, 4686, 4692,
+       4697, 4703, 4709, 4715, 4721, 4727, 4733, 4739,
+       4745, 4751, 4757, 4762, 4769, 4774, 4780, 4786,
+       4792, 4798, 4804, 4810, 4816, 4822, 4828, 4834,
+       4840, 4846, 4852, 4858, 4864, 4870, 4876, 4882,
+       4888, 4894, 4900, 4906, 4912, 4918, 4924, 4930,
+       4936, 4942, 4948, 4954, 4960, 4966, 4972, 4978,
+       4984, 4990, 4996, 5002, 5008, 5014, 5020, 5026,
+       5032, 5038, 5045, 5050, 5057, 5063, 5069, 5075,
+       5081, 5087, 5093, 5099, 5105, 5111, 5118, 5123,
+       5129, 5136, 5142, 5148, 5154, 5160, 5166, 5172,
+       5179, 5185, 5191, 5197, 5203, 5209, 5215, 5221,
+       5227, 5233, 5240, 5246, 5252, 5258, 5265, 5271,
+       5277, 5283, 5289, 5295, 5301, 5308, 5314, 5320,
+       5326, 5333, 5339, 5345, 5351, 5357, 5363, 5369,
+       5376, 5382, 5388, 5394, 5401, 5407, 5413, 5419,
+       5426, 5432, 5438, 5444, 5451, 5457, 5463, 5469,
+       5476, 5482, 5488, 5494, 5501, 5507, 5513, 5520,
+       5526, 5532, 5539, 5545, 5551, 5557, 5564, 5570,
+       5576, 5583, 5589, 5596, 5602, 5608, 5614, 5621,
+       5627, 5634, 5640, 5646, 5652, 5659, 5665, 5672,
+       5678, 5684, 5691, 5697, 5704, 5710, 5716, 5723,
+       5729, 5736, 5742, 5748, 5755, 5761, 5768, 5774,
+       5780, 5787, 5793, 5800, 5806, 5813, 5819, 5826,
+       5832, 5838, 5845, 5852, 5858, 5864, 5871, 5877,
+       5884, 5890, 5897, 5903, 5910, 5916, 5923, 5929,
+       5936, 5942, 5949, 5956, 5962, 5968, 5975, 5981,
+       5988, 5994, 6001, 6008, 6014, 6021, 6027, 6034,
+       6041, 6047, 6054, 6060, 6067, 6074, 6080, 6087,
+       6093, 6100, 6107, 6113, 6120, 6126, 6133, 6140,
+       6146, 6153, 6160, 6167, 6173, 6180, 6186, 6193,
+       6200, 6206, 6213, 6220, 6226, 6233, 6240, 6246,
+       6253, 6260, 6266, 6273, 6280, 6287, 6294, 6300,
+       6307, 6314, 6321, 6327, 6334, 6341, 6348, 6354,
+       6361, 6368, 6375, 6382, 6388, 6395, 6402, 6409,
+       6416, 6422, 6429, 6436, 6443, 6450, 6457, 6463,
+       6470, 6477, 6484, 6491, 6497, 6504, 6511, 6518,
+       6525, 6532, 6539, 6546, 6553, 6559, 6566, 6573,
+       6580, 6587, 6594, 6601, 6608, 6615, 6622, 6629,
+       6636, 6643, 6650, 6657, 6664, 6671, 6678, 6685,
+       6692, 6699, 6706, 6713, 6719, 6727, 6734, 6741,
+       6748, 6755, 6762, 6769, 6776, 6783, 6790, 6797,
+       6804, 6811, 6818, 6826, 6833, 6840, 6847, 6854,
+       6861, 6868, 6875, 6883, 6889, 6897, 6904, 6911,
+       6918, 6925, 6932, 6939, 6947, 6954, 6961, 6969,
+       6975, 6983, 6990, 6997, 7005, 7012, 7019, 7026,
+       7033, 7041, 7048, 7055, 7062, 7070, 7077, 7084,
+       7091, 7099, 7106, 7114, 7121, 7128, 7135, 7143,
+       7150, 7157, 7165, 7172, 7179, 7187, 7194, 7202,
+       7209, 7216, 7224, 7231, 7238, 7246, 7253, 7261,
+       7268, 7276, 7283, 7290, 7298, 7306, 7313, 7320,
+       7328, 7336, 7343, 7350, 7358, 7365, 7373, 7381,
+       7388, 7395, 7403, 7410, 7418, 7426, 7433, 7441,
+       7448, 7456, 7463, 7471, 7479, 7486, 7494, 7501,
+       7509, 7517, 7524, 7532, 7540, 7547, 7555, 7563,
+       7571, 7578, 7586, 7594, 7601, 7609, 7617, 7624,
+       7632, 7640, 7648, 7655, 7663, 7671, 7679, 7687,
+       7694, 7702, 7710, 7718, 7725, 7733, 7741, 7749,
+       7757, 7765, 7773, 7780, 7788, 7796, 7804, 7812,
+       7820, 7828, 7836, 7843, 7852, 7859, 7868, 7875,
+       7883, 7891, 7899, 7907, 7915, 7923, 7931, 7939,
+       7947, 7955, 7963, 7971, 7979, 7988, 7995, 8004,
+       8012, 8020, 8028, 8036, 8044, 8052, 8061, 8069,
+       8076, 8085, 8093, 8101, 8109, 8117, 8126, 8134,
+       8142, 8150, 8158, 8167, 8175, 8183, 8192, 8200,
+       8208, 8217, 8225, 8233, 8241, 8250, 8258, 8266,
+       8275, 8283, 8292, 8300, 8308, 8317, 8325, 8333,
+       8342, 8350, 8359, 8367, 8376, 8384, 8392, 8401,
+       8409, 8418, 8426, 8435, 8443, 8452, 8461, 8469,
+       8477, 8486, 8495, 8503, 8512, 8520, 8529, 8538,
+       8546, 8555, 8564, 8573, 8581, 8590, 8598, 8607,
+       8616, 8625, 8633, 8642, 8651, 8659, 8668, 8677,
+       8686, 8695, 8704, 8712, 8721, 8730, 8739, 8748,
+       8756, 8765, 8774, 8783, 8792, 8801, 8810, 8819,
+       8828, 8837, 8846, 8855, 8864, 8873, 8882, 8891,
+       8900, 8909, 8918, 8927, 8936, 8945, 8954, 8964,
+       8973, 8982, 8991, 9000, 9009, 9019, 9028, 9037,
+       9046, 9055, 9064, 9074, 9083, 9092, 9102, 9111,
+       9120, 9130, 9139, 9148, 9157, 9167, 9176, 9186,
+       9195, 9205, 9214, 9223, 9233, 9242, 9252, 9261,
+       9271, 9280, 9290, 9300, 9309, 9318, 9328, 9338,
+       9347, 9357, 9367, 9376, 9386, 9395, 9405, 9415,
+       9424, 9434, 9444, 9454, 9464, 9473, 9483, 9493,
+       9503, 9513, 9522, 9532, 9542, 9552, 9562, 9572,
+       9582, 9592, 9602, 9612, 9622, 9632, 9642, 9652,
+       9662, 9672, 9682, 9692, 9702, 9712, 9722, 9733,
+       9743, 9753, 9763, 9773, 9783, 9794, 9804, 9814,
+       9825, 9835, 9845, 9855, 9866, 9876, 9887, 9897,
+       9907, 9918, 9928, 9939, 9949, 9960, 9970, 9981,
+       9991, 10002, 10012, 10023, 10034, 10044, 10055, 10066,
+       10076, 10087, 10097, 10108, 10119, 10130, 10140, 10152,
+       10162, 10173, 10184, 10195, 10206, 10217, 10227, 10238,
+       10249, 10260, 10271, 10282, 10293, 10304, 10315, 10326,
+       10337, 10349, 10360, 10371, 10382, 10394, 10405, 10416,
+       10427, 10438, 10450, 10461, 10472, 10484, 10495, 10507,
+       10518, 10530, 10541, 10553, 10564, 10575, 10587, 10598,
+       10610, 10622, 10633, 10645, 10657, 10668, 10680, 10692,
+       10704, 10715, 10727, 10739, 10751, 10763, 10775, 10786,
+       10798, 10811, 10822, 10834, 10847, 10858, 10870, 10883,
+       10895, 10907, 10919, 10931, 10944, 10956, 10968, 10981,
+       10993, 11005, 11017, 11030, 11042, 11055, 11067, 11080,
+       11092, 11105, 11117, 11130, 11142, 11155, 11168, 11180,
+       11193, 11206, 11219, 11232, 11245, 11257, 11270, 11283,
+       11296, 11309, 11322, 11335, 11348, 11361, 11375, 11388,
+       11401, 11414, 11427, 11441, 11454, 11467, 11481, 11494,
+       11508, 11521, 11534, 11548, 11561, 11575, 11589, 11602,
+       11616, 11630, 11644, 11657, 11671, 11685, 11699, 11713,
+       11727, 11741, 11755, 11769, 11783, 11797, 11811, 11826,
+       11839, 11854, 11868, 11882, 11897, 11911, 11926, 11940,
+       11955, 11969, 11984, 11998, 12013, 12028, 12043, 12057,
+       12072, 12087, 12102, 12117, 12132, 12147, 12162, 12177,
+       12193, 12208, 12223, 12238, 12254, 12269, 12284, 12299,
+       12315, 12331, 12346, 12362, 12378, 12393, 12409, 12425,
+       12441, 12457, 12473, 12489, 12505, 12521, 12537, 12553,
+       12569, 12586, 12602, 12619, 12635, 12651, 12668, 12684,
+       12701, 12718, 12734, 12751, 12768, 12785, 12802, 12819,
+       12836, 12853, 12870, 12888, 12905, 12922, 12940, 12957,
+       12975, 12993, 13010, 13028, 13046, 13064, 13081, 13099,
+       13117, 13135, 13154, 13172, 13190, 13209, 13227, 13246,
+       13264, 13283, 13301, 13320, 13339, 13358, 13377, 13396,
+       13415, 13434, 13454, 13473, 13492, 13512, 13532, 13551,
+       13571, 13591, 13611, 13631, 13651, 13671, 13691, 13711,
+       13732, 13752, 13773, 13793, 13814, 13835, 13856, 13877,
+       13898, 13919, 13940, 13962, 13983, 14005, 14026, 14048,
+       14070, 14092, 14114, 14136, 14159, 14181, 14203, 14226,
+       14249, 14272, 14294, 14318, 14341, 14364, 14387, 14411,
+       14434, 14458, 14482, 14506, 14530, 14554, 14578, 14603,
+       14628, 14653, 14677, 14703, 14728, 14753, 14778, 14804,
+       14830, 14855, 14882, 14908, 14934, 14961, 14987, 15014,
+       15041, 15068, 15095, 15123, 15151, 15179, 15206, 15235,
+       15263, 15291, 15320, 15349, 15378, 15408, 15437, 15466,
+       15496, 15527, 15557, 15587, 15618, 15649, 15680, 15712,
+       15743, 15775, 15808, 15840, 15872, 15906, 15939, 15972,
+       16006, 16040, 16074, 16108, 16143, 16178, 16214, 16249,
+       16285, 16322, 16358, 16395, 16433, 16470, 16508, 16547,
+       16586, 16624, 16664, 16704, 16744, 16785, 16826, 16867,
+       16910, 16952, 16995, 17038, 17082, 17126, 17171, 17217,
+       17263, 17309, 17356, 17403, 17452, 17501, 17550, 17600,
+       17651, 17702, 17754, 17807, 17861, 17915, 17970, 18026,
+       18083, 18141, 18200, 18259, 18320, 18382, 18444, 18508,
+       18573, 18639, 18706, 18775, 18845, 18917, 18989, 19064,
+       19140, 19217, 19297, 19378, 19461, 19547, 19634, 19724,
+       19816, 19911, 20009, 20109, 20213, 20319, 20430, 20544,
+       20663, 20786, 20914, 21047, 21186, 21331, 21484, 21644,
+       21813, 21991, 22181, 22384, 22601, 22836, 23091, 23370,
+       23679, 24027, 24424, 24888, 25450, 26164, 27159, 28858,
+};
+#define NORM_DIST_TABLE_SIZE \
+       (sizeof (norm_dist_table) / sizeof (norm_dist_table[0]))
+
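+/*
+ * Binary min-heap of packets keyed by absolute time-to-send; the element
+ * array is allocated inline with the header (see heap_create()), and the
+ * earliest deadline is always at p[0].
+ */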
+struct heap_elem {
+       uint64_t key;
+       pktsched_pkt_t pkt;
+};
+
+struct heap {
+       uint32_t        limit;  /* max size */
+       uint32_t        size;   /* current size */
+       struct heap_elem p[0];
+};
+
+static struct heap *heap_create(uint32_t size);
+static int heap_insert(struct heap *h, uint64_t k, pktsched_pkt_t *p);
+static int heap_peek(struct heap *h, uint64_t *k, pktsched_pkt_t *p);
+static int heap_extract(struct heap *h, uint64_t *k, pktsched_pkt_t *p);
+
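+/*
+ * A netem instance: all mutable state below is guarded by netem_lock.
+ * Packets enter through netem_enqueue(), sit in netem_heap until their
+ * time-to-send, and are drained either by netem_dequeue() or by the
+ * dedicated output thread invoking the netem_output callback.
+ */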
+struct netem {
+       decl_lck_mtx_data(, netem_lock);
+
+       /* Init Time Constants */
+       char            netem_name[MAXTHREADNAMESIZE];
+       uint32_t        netem_flags;
+       struct thread   *netem_output_thread;
+
+       void            *netem_output_handle;
+       int             (*netem_output)(void *handle, pktsched_pkt_t *pkts,
+           uint32_t n_pkts);
+       uint32_t        netem_output_max_batch_size;
+
+       struct heap     *netem_heap;
+
+       /* Parameters variables */
+       /* bandwidth token bucket limit */
+#define TOKEN_INVALID   UINT64_MAX
+       struct token_bucket {
+               uint64_t        depth;
+               uint64_t        token;
+               uint64_t        last;
+               uint64_t        rate;
+       } netem_bandwidth_model;
+
+       /* XXX (need correlated) naive corruption model */
+       struct corruption {
+               uint32_t        corruption_p;
+       } netem_corruption_model;
+
+       /* naive duplication model */
+       struct duplication {
+               uint32_t        duplication_p;
+       } netem_duplication_model;
+
+       /* latency (with jitter following random distribution) */
+       struct latency {
+               uint32_t        latency_ms;
+               uint32_t        jitter_ms;
+               uint64_t        last_time_to_send;
+       } netem_latency_model;
+
+       /* 4 state Markov packet loss model */
+       struct loss {
+               enum _4state_markov_packet_loss_state {
+                       __NO_LOSS = 0,
+                       GAP_RX = 1,
+                       GAP_LOSS,
+                       BURST_RX,
+                       BURST_LOSS,
+               } state;
+
+               uint32_t        p_gr_gl; /* P( gap_loss   | gap_rx     ) */
+               uint32_t        p_gr_bl; /* P( burst_loss | gap_rx     ) */
+               uint32_t        p_bl_br; /* P( burst_rx   | burst_loss ) */
+               uint32_t        p_bl_gr; /* P( gap_rx     | burst_loss ) */
+               uint32_t        p_br_bl; /* P( burst_loss | burst_rx   ) */
+       } netem_loss_model;
+
+       /*
+        * Reordering Model --
+        * randomly select packets and re-inject with additional delay
+        */
+       struct reordering {
+               uint32_t        reordering_p;
+       } netem_reordering_model;
+};
+
+#define NETEMF_INITIALIZED      0x00000001      /* has been initialized */
+#define NETEMF_RUNNING          0x00000002      /* thread is running */
+#define NETEMF_TERMINATEBLOCK   0x20000000      /* block waiting terminate */
+#define NETEMF_TERMINATING      0x40000000      /* thread is terminating */
+#define NETEMF_TERMINATED       0x80000000      /* thread is terminated */
+
+#define NETEM_MTX_LOCK(_sch)                    \
+       lck_mtx_lock(&(_sch)->netem_lock)
+#define NETEM_MTX_LOCK_ASSERT_HELD(_sch)                \
+       LCK_MTX_ASSERT(&(_sch)->netem_lock, LCK_ASSERT_OWNED)
+#define NETEM_MTX_LOCK_ASSERT_NOTHELD(_sch)     \
+       LCK_MTX_ASSERT(&(_sch)->netem_lock, LCK_ASSERT_NOTOWNED)
+#define NETEM_MTX_UNLOCK(_sch)                  \
+       lck_mtx_unlock(&(_sch)->netem_lock)
+
+static struct heap *
+heap_create(uint32_t limit)
+{
+       struct heap *h = NULL;
+
+       /* XXX limit is trusted (callers pass NETEM_HEAP_SIZE); not validated */
+       size_t size = sizeof(struct heap) + sizeof(struct heap_elem) * limit;
+
+       h = _MALLOC(size, M_DEVBUF, M_WAITOK | M_ZERO);
+       if (h == NULL) {
+               return NULL;
+       }
+
+       h->limit = limit;
+       h->size = 0;
+
+       return h;
+}
+
+static void
+heap_destroy(struct heap *h)
+{
+       ASSERT(h->size == 0);
+
+       _FREE(h, M_DEVBUF);
+}
+
+#define HEAP_FATHER(child) (((child) - 1) / 2)
+#define HEAP_SWAP(a, b, tmp) { tmp = a; a = b; b = tmp; }
+#define HEAP_LEFT(x) (2 * (x) + 1)
+
+static int
+heap_insert(struct heap *h, uint64_t key, pktsched_pkt_t *pkt)
+{
+       ASSERT(h != NULL);
+
+       if (h->size == h->limit) {
+               return ENOMEM;
+       }
+
+       uint32_t child, parent;
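+       /*
+        * Two modes: with a packet, append it at the end under the given
+        * key; with pkt == NULL (used by heap_extract()), "key" is instead
+        * the index of an existing element to bubble up after a refill.
+        */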
+       if (pkt == NULL) {
+               child = key;
+               ASSERT(child < h->size);
+       } else {
+               child = h->size;
+               h->p[child].key = key;
+               h->p[child].pkt = *pkt;
+               h->size++;
+       }
+
+       while (child > 0) {
+               struct heap_elem tmp;
+               parent = HEAP_FATHER(child);
+               if (h->p[parent].key < h->p[child].key) {
+                       break;
+               }
+               HEAP_SWAP(h->p[child], h->p[parent], tmp);
+               child = parent;
+       }
+
+       return 0;
+}
+
+static int
+heap_peek(struct heap *h, uint64_t *key, pktsched_pkt_t *pkt)
+{
+       if (h->size == 0) {
+               return ENOENT;
+       }
+
+       *key = h->p[0].key;
+       *pkt = h->p[0].pkt;
+       return 0;
+}
+
+static int
+heap_extract(struct heap *h, uint64_t *key, pktsched_pkt_t *pkt)
+{
+       uint32_t child, parent, max;
+
+       if (h->size == 0) {
+               netem_log(NETEM_LOG_ERROR, "warning: extract from empty heap");
+               return ENOENT;
+       }
+
+       *key = h->p[0].key;
+       *pkt = h->p[0].pkt;
+
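+       /*
+        * Percolate the hole at the root down along the smaller child,
+        * then move the last element into the final hole and bubble it
+        * back up via heap_insert(h, index, NULL).
+        */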
+       /* re-heapify */
+       parent = 0;
+       child = HEAP_LEFT(parent);      /* start from left child */
+       max = h->size - 1;
+       while (child <= max) {
+               if (child != max && h->p[child + 1].key < h->p[child].key) {
+                       child = child + 1;        /* right child */
+               }
+               h->p[parent] = h->p[child];
+               parent = child;
+               child = HEAP_LEFT(child);       /* left child for next loop */
+       }
+
+       h->size--;
+       if (parent != max) {
+               /* Fill hole with last entry, bubble up reusing insert code */
+               h->p[parent] = h->p[max];
+               _PKTSCHED_PKT_INIT(&h->p[max].pkt);
+               heap_insert(h, parent, NULL); /* this one cannot fail */
+       }
+
+       return 0;
+}
+
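+/*
+ * Refill the bucket: tokens are counted in bits (bandwidth_limited()
+ * charges pkt_len * 8), accrue at tb->rate bits per second of elapsed
+ * mach time, and are capped at tb->depth.
+ */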
+static void
+token_bucket_update(struct token_bucket *tb)
+{
+       uint64_t now, elapsed;
+       clock_sec_t sec;
+       clock_usec_t usec;
+
+       if (tb->rate == 0) {
+               return;
+       }
+
+       now = mach_absolute_time();
+       elapsed = now - tb->last;
+       absolutetime_to_microtime(elapsed, &sec, &usec);
+       tb->token += ((sec * USEC_PER_SEC + usec) * tb->rate / USEC_PER_SEC);
+       if (__improbable(tb->token > tb->depth)) {
+               tb->token = tb->depth;
+       }
+       tb->last = now;
+}
+
+static boolean_t
+bandwidth_limited(struct netem *ne, uint32_t pkt_len)
+{
+       struct token_bucket *tb = &ne->netem_bandwidth_model;
+
+       if (tb->rate == 0) {
+               return FALSE;
+       }
+
+       if (tb->token < pkt_len * 8) {
+               netem_log(NETEM_LOG_DEBUG, "limited");
+               return TRUE;
+       }
+       tb->token -= pkt_len * 8;
+
+       netem_log(NETEM_LOG_DEBUG, "token left %llu", tb->token);
+
+       return FALSE;
+}
+
+static void
+corruption_event(struct netem *ne, pktsched_pkt_t *pkt)
+{
+       struct corruption *corr = &ne->netem_corruption_model;
+       uint32_t rand;
+
+       if (corr->corruption_p == 0) {
+               return;
+       }
+
+       read_frandom(&rand, sizeof(rand));
+       rand %= NETEM_PSCALE;
+
+       if (rand < corr->corruption_p) {
+               netem_log(NETEM_LOG_ERROR, "\t corrupted");
+               pktsched_corrupt_packet(pkt);
+       }
+}
+
+static boolean_t
+duplication_event(struct netem *ne)
+{
+       struct duplication *dup = &ne->netem_duplication_model;
+       uint32_t rand;
+
+       if (dup->duplication_p == 0) {
+               return FALSE;
+       }
+
+       read_frandom(&rand, sizeof(rand));
+       rand %= NETEM_PSCALE;
+
+       return rand < dup->duplication_p;
+}
+
+static uint64_t
+latency_event(struct netem *ne, boolean_t reordering)
+{
+       struct latency *l = &ne->netem_latency_model;
+       int32_t delay_ms = 0, jitter_ms = 0;
+       uint64_t time_to_send = 0;
+
+       delay_ms = l->latency_ms;
+       if (l->jitter_ms != 0) {
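+               /*
+                * t is a standard-normal sample in units of
+                * 1/NORM_DIST_SCALE; jitter_ms = s * t / NORM_DIST_SCALE,
+                * computed in fixed point with rounding by splitting s
+                * into its quotient and remainder mod NORM_DIST_SCALE.
+                */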
+               int32_t rand, x, t, s = l->jitter_ms;
+               read_frandom(&rand, sizeof(rand));
+               t = norm_dist_table[rand % NORM_DIST_TABLE_SIZE];
+               x = (s % NORM_DIST_SCALE) * t;
+               if (x >= 0) {
+                       x += NORM_DIST_SCALE / 2;
+               } else {
+                       x -= NORM_DIST_SCALE / 2;
+               }
+               jitter_ms = x / NORM_DIST_SCALE + (s / NORM_DIST_SCALE) * t;
+       }
+
+       delay_ms += jitter_ms;
+       delay_ms = MAX(delay_ms, 0);
+
+       netem_log(NETEM_LOG_DEBUG, "\tdelay %dms", delay_ms);
+       clock_interval_to_deadline(delay_ms, NSEC_PER_MSEC, &time_to_send);
+
+       if (l->last_time_to_send != 0) {
+               if (reordering) {
+                       /* reorder with last packet */
+                       time_to_send = l->last_time_to_send - 1;
+               } else {
+                       /* make sure packet time to send is monotonic */
+                       if (time_to_send < l->last_time_to_send) {
+                               /* send this one immediately afterwards */
+                               time_to_send = l->last_time_to_send + 1;
+                       }
+               }
+       }
+
+       l->last_time_to_send = time_to_send;
+
+       return time_to_send;
+}
+
+static boolean_t
+loss_event(struct netem *ne)
+{
+       struct loss *loss = &ne->netem_loss_model;
+       uint32_t rand;
+
+       if (loss->state == __NO_LOSS) {
+               return FALSE;
+       }
+
+       read_frandom(&rand, sizeof(rand));
+       rand %= NETEM_PSCALE;
+
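+       /*
+        * Roll the 4-state Markov chain; return TRUE to drop this packet.
+        * A single loss from GAP_RX (the GAP_LOSS state) is instantaneous,
+        * so it is folded into the GAP_RX case and the chain never rests
+        * in GAP_LOSS.
+        */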
+       switch (loss->state) {
+       case GAP_RX:
+               if (rand < loss->p_gr_gl) {
+                       loss->state = GAP_RX;
+                       return TRUE;
+               } else if (loss->p_gr_gl < rand &&
+                   rand < loss->p_gr_gl + loss->p_gr_bl) {
+                       loss->state = BURST_LOSS;
+                       return TRUE;
+               } else {
+                       loss->state = GAP_RX;
+                       return FALSE;
+               }
+       case BURST_LOSS:
+               if (rand < loss->p_bl_br) {
+                       loss->state = BURST_RX;
+                       return FALSE;
+               } else if (loss->p_bl_br < rand &&
+                   rand < loss->p_bl_br + loss->p_bl_gr) {
+                       loss->state = GAP_RX;
+                       return FALSE;
+               } else {
+                       loss->state = BURST_LOSS;
+                       return TRUE;
+               }
+       case BURST_RX:
+               if (rand < loss->p_br_bl) {
+                       loss->state = BURST_LOSS;
+                       return TRUE;
+               } else {
+                       loss->state = BURST_RX;
+                       return FALSE;
+               }
+       case GAP_LOSS:
+       /* This is instantaneous (stateless), should not be reached */
+       default:
+               VERIFY(0);
+               break;
+       }
+
+       /* not reached */
+       VERIFY(0);
+       return FALSE;
+}
+
+static boolean_t
+reordering_event(struct netem *ne)
+{
+       struct reordering *reord = &ne->netem_reordering_model;
+       uint32_t rand;
+
+       if (reord->reordering_p == 0) {
+               return FALSE;
+       }
+
+       read_frandom(&rand, sizeof(rand));
+       rand %= NETEM_PSCALE;
+
+       return rand < reord->reordering_p;
+}
+
+static void
+netem_update_locked(struct netem *ne)
+{
+       ASSERT(ne != NULL);
+       NETEM_MTX_LOCK_ASSERT_HELD(ne);
+
+       token_bucket_update(&ne->netem_bandwidth_model);
+}
+
+int
+netem_enqueue(struct netem *ne, classq_pkt_t *p, boolean_t *pdrop)
+{
+       int ret = 0;
+       int pkt_count = 1;
+       uint64_t time_to_send;
+       pktsched_pkt_t pkt;
+
+       pktsched_pkt_encap(&pkt, p);
+
+       ASSERT(ne != NULL);
+       ASSERT(pdrop != NULL);
+       NETEM_MTX_LOCK(ne);
+
+       netem_log(NETEM_LOG_DEBUG, "+ %p begin", p->cp_mbuf);
+
+       if (loss_event(ne)) {
+               netem_log(NETEM_LOG_DEBUG, "\t lost");
+               pkt_count--;
+       }
+
+       if (duplication_event(ne)) {
+               netem_log(NETEM_LOG_DEBUG, "\t dup'ed");
+               pkt_count++;
+       }
+
+       if (pkt_count == 0) {
+               pktsched_free_pkt(&pkt);
+               *pdrop = TRUE;
+               goto done;
+       }
+
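+       /*
+        * Insert pkt_count instances: each gets an independent corruption
+        * roll and its own time-to-send.  A duplicate is cloned only after
+        * the original has been safely inserted into the heap.
+        */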
+       do {
+               corruption_event(ne, &pkt);
+
+               time_to_send = latency_event(ne, reordering_event(ne));
+
+               ret = heap_insert(ne->netem_heap, time_to_send, &pkt);
+               if (ret != 0) {
+                       netem_log(NETEM_LOG_DEBUG, "\t%p err heap_insert %d",
+                           p->cp_mbuf, ret);
+                       pktsched_free_pkt(&pkt);
+                       goto done;
+               }
+               netem_log(NETEM_LOG_DEBUG, "\t%p enqueued",
+                   pkt.pktsched_pkt_mbuf);
+       } while (--pkt_count > 0 &&
+           __probable((ret = pktsched_clone_pkt(&pkt, &pkt)) == 0));
+
+done:
+       if (__probable(ne->netem_output_thread != THREAD_NULL)) {
+               if (!(ne->netem_flags & (NETEMF_RUNNING |
+                   NETEMF_TERMINATING | NETEMF_TERMINATED))) {
+                       netem_log(NETEM_LOG_DEBUG, "wakeup output thread");
+                       (void) thread_wakeup((caddr_t)&ne->netem_flags);
+               }
+       }
+
+       NETEM_MTX_UNLOCK(ne);
+       netem_log(NETEM_LOG_DEBUG, "- %p end", p->cp_mbuf);
+
+       return ret;
+}
+
+static int
+netem_dequeue_internal_locked(struct netem *ne, pktsched_pkt_t *pp,
+    boolean_t *ppending)
+{
+       int ret = 0;
+       uint64_t time_to_send;
+       pktsched_pkt_t pkt;
+
+       ASSERT(ne != NULL);
+       NETEM_MTX_LOCK_ASSERT_HELD(ne);
+
+       netem_log(NETEM_LOG_HIDEBUG, "+ begin");
+
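+       /*
+        * Peek before extracting: the head packet leaves the heap only if
+        * its time-to-send has arrived and the token bucket can cover it;
+        * EAGAIN keeps it queued and marks it pending so the caller polls
+        * again shortly.
+        */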
+       ret = heap_peek(ne->netem_heap, &time_to_send, &pkt);
+       if (ret != 0) {
+               netem_log(NETEM_LOG_HIDEBUG, "\theap empty");
+               ret = ENOENT;
+               goto done;
+       }
+
+       /* latency limit */
+       if (time_to_send > mach_absolute_time()) {
+               netem_log(NETEM_LOG_DEBUG,
+                   "held back: time_to_send %llu now %llu",
+                   time_to_send, mach_absolute_time());
+               ret = EAGAIN;
+               goto done;
+       }
+
+       /* bandwidth limited */
+       if (bandwidth_limited(ne, pkt.pktsched_plen)) {
+               ret = EAGAIN;
+               goto done;
+       }
+
+       ret = heap_extract(ne->netem_heap, &time_to_send, &pkt);
+       ASSERT(ret == 0);
+       *pp = pkt;
+
+       netem_log(NETEM_LOG_HIDEBUG, "- %p end", pkt.pktsched_pkt_mbuf);
+
+done:
+       *ppending = (ret == EAGAIN) ? TRUE : FALSE;
+
+       return ret;
+}
+
+int
+netem_dequeue(struct netem *ne, pktsched_pkt_t *p,
+    boolean_t *ppending)
+{
+       int ret;
+
+       NETEM_MTX_LOCK(ne);
+       netem_update_locked(ne);
+       ret = netem_dequeue_internal_locked(ne, p, ppending);
+       NETEM_MTX_UNLOCK(ne);
+
+       return ret;
+}
+
+__attribute__((noreturn, optnone))
+static void
+netem_output_thread_cont(void *v, wait_result_t w)
+{
+       struct netem *ne = v;
+       boolean_t pending = FALSE;
+       pktsched_pkt_t pkts[NETEM_MAX_BATCH_SIZE];
+       uint32_t n_pkts = 0;
+       int ret;
+
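+       /*
+        * Continuation body: thread_block_parameter() at the bottom (and
+        * in netem_output_thread_func()) re-enters here on wakeup.  Dequeue
+        * up to netem_output_max_batch_size packets, drop the mutex while
+        * handing them to the output callback, and rearm a 1ms deadline if
+        * packets remain pending.
+        */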
+       NETEM_MTX_LOCK(ne);
+       ASSERT(!(ne->netem_flags & NETEMF_TERMINATED));
+       ne->netem_flags |= NETEMF_RUNNING;
+
+       if (__improbable(w == THREAD_INTERRUPTED ||
+           (ne->netem_flags & NETEMF_TERMINATING) != 0)) {
+               ASSERT(!(ne->netem_flags & NETEMF_TERMINATED));
+               ne->netem_flags &= ~(NETEMF_RUNNING | NETEMF_TERMINATING);
+               ne->netem_flags |= NETEMF_TERMINATED;
+
+               netem_log(NETEM_LOG_INFO, "%s output thread terminated",
+                   ne->netem_name);
+
+               if (ne->netem_flags & NETEMF_TERMINATEBLOCK) {
+                       thread_wakeup((caddr_t)&ne->netem_output_thread);
+               }
+
+               NETEM_MTX_UNLOCK(ne);
+
+               /* for the extra refcnt from kernel_thread_start() */
+               thread_deallocate(current_thread());
+               /* this is the end */
+               thread_terminate(current_thread());
+               /* NOTREACHED */
+               __builtin_unreachable();
+       }
+
+       ASSERT(ne->netem_output != NULL);
+       netem_update_locked(ne);
+       n_pkts = 0;
+       for (;;) {
+               ret = netem_dequeue_internal_locked(ne, &pkts[n_pkts],
+                   &pending);
+               if (__probable(ret == 0 &&
+                   ++n_pkts < ne->netem_output_max_batch_size)) {
+                       continue;
+               }
+
+               if (__probable(n_pkts != 0)) {
+                       NETEM_MTX_UNLOCK(ne);
+                       (void) ne->netem_output(ne->netem_output_handle,
+                           pkts, n_pkts);
+                       NETEM_MTX_LOCK(ne);
+                       n_pkts = 0;
+               }
+               if (ret != 0) {
+                       break;
+               }
+       }
+
+       uint64_t deadline = TIMEOUT_WAIT_FOREVER;
+       if (pending) {
+               clock_interval_to_deadline(1, NSEC_PER_MSEC, &deadline);
+       }
+       (void) assert_wait_deadline(&ne->netem_flags, THREAD_UNINT, deadline);
+       ne->netem_flags &= ~NETEMF_RUNNING;
+       NETEM_MTX_UNLOCK(ne);
+       (void) thread_block_parameter(netem_output_thread_cont, ne);
+       /* NOTREACHED */
+       __builtin_unreachable();
+}
+
+__attribute__((noreturn))
+static void
+netem_output_thread_func(void *v, wait_result_t w)
+{
+#pragma unused(w)
+       struct netem *ne = v;
+
+       ASSERT(ne->netem_output_thread == current_thread());
+       thread_set_thread_name(current_thread(), ne->netem_name);
+
+       NETEM_MTX_LOCK(ne);
+       VERIFY(!(ne->netem_flags & NETEMF_RUNNING));
+       (void) assert_wait(&ne->netem_flags, THREAD_UNINT);
+       NETEM_MTX_UNLOCK(ne);
+       thread_block_parameter(netem_output_thread_cont, ne);
+       /* NOTREACHED */
+       __builtin_unreachable();
+}
+
+int
+netem_init(void)
+{
+       ASSERT(!__netem_inited);
+       __netem_inited = 1;
+
+       netem_lock_attr = lck_attr_alloc_init();
+       netem_lock_group_attr = lck_grp_attr_alloc_init();
+       netem_lock_group = lck_grp_alloc_init("pktsched_netem_lock",
+           netem_lock_group_attr);
+
+       return 0;
+}
+
+static struct netem *
+netem_create(const char *name, void *output_handle,
+    int (*output)(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts),
+    uint32_t output_max_batch_size)
+{
+       struct netem *ne;
+
+       ne = _MALLOC(sizeof(struct netem), M_DEVBUF, M_WAITOK | M_ZERO);
+
+       lck_mtx_init(&ne->netem_lock, netem_lock_group, netem_lock_attr);
+
+       ne->netem_heap = heap_create(NETEM_HEAP_SIZE);
+       ne->netem_flags = NETEMF_INITIALIZED;
+       ne->netem_output_handle = output_handle;
+       ne->netem_output = output;
+       ne->netem_output_max_batch_size =
+           MIN(output_max_batch_size, NETEM_MAX_BATCH_SIZE);
+       ne->netem_output_thread = THREAD_NULL;
+       if (output != NULL) {
+               strlcpy(ne->netem_name, name, sizeof(ne->netem_name));
+               if (kernel_thread_start(netem_output_thread_func, ne,
+                   &ne->netem_output_thread) != KERN_SUCCESS) {
+                       panic_plain("%s can't create thread", ne->netem_name);
+               }
+       }
+
+       return ne;
+}
+
+void
+netem_destroy(struct netem *ne)
+{
+       uint64_t f = (1 * NSEC_PER_MSEC);       /* 1 ms */
+       uint64_t s = (1000 * NSEC_PER_MSEC);    /* 1 sec */
+       uint32_t i = 0;
+       int ret = 0;
+       uint64_t key = 0;
+       pktsched_pkt_t pkt;
+
+       ASSERT(ne != NULL);
+
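+       /*
+        * Two-phase shutdown: mark TERMINATING, wake the output thread,
+        * then wait (1ms first, then 1s retries) until it has marked
+        * itself TERMINATED.  TERMINATEBLOCK tells the thread that someone
+        * is blocked on &ne->netem_output_thread awaiting that wakeup.
+        */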
+       if (ne->netem_output_thread != THREAD_NULL) {
+               ASSERT(ne->netem_flags & NETEMF_INITIALIZED);
+               /* signal thread to begin self-termination */
+               NETEM_MTX_LOCK(ne);
+               ne->netem_flags |= NETEMF_TERMINATING;
+
+               /* and wait for thread to terminate */
+               while (!(ne->netem_flags & NETEMF_TERMINATED)) {
+                       uint64_t t = 0;
+                       nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
+                       clock_absolutetime_interval_to_deadline(t, &t);
+                       ASSERT(t != 0);
+
+                       ne->netem_flags |= NETEMF_TERMINATEBLOCK;
+                       if (!(ne->netem_flags & NETEMF_RUNNING)) {
+                               thread_wakeup((caddr_t)&ne->netem_flags);
+                       }
+                       (void) assert_wait_deadline(&ne->netem_output_thread,
+                           THREAD_UNINT, t);
+                       NETEM_MTX_UNLOCK(ne);
+                       (void) thread_block(THREAD_CONTINUE_NULL);
+                       NETEM_MTX_LOCK(ne);
+                       ne->netem_flags &= ~NETEMF_TERMINATEBLOCK;
+               }
+               ASSERT(ne->netem_flags & NETEMF_TERMINATED);
+               NETEM_MTX_UNLOCK(ne);
+               ne->netem_output_thread = THREAD_NULL;
+       }
+       ASSERT(ne->netem_output_thread == THREAD_NULL);
+
+       lck_mtx_destroy(&ne->netem_lock, netem_lock_group);
+
+       while ((ret = heap_extract(ne->netem_heap, &key, &pkt)) == 0) {
+               pktsched_free_pkt(&pkt);
+       }
+       heap_destroy(ne->netem_heap);
+
+       _FREE(ne, M_DEVBUF);
+}
+
+static int
+netem_check_params(const struct if_netem_params *p)
+{
+       if (p->ifnetem_corruption_p > NETEM_PSCALE) {
+               netem_log(NETEM_LOG_ERROR, "error: corruption_p %d > %d",
+                   p->ifnetem_corruption_p, NETEM_PSCALE);
+               return EINVAL;
+       }
+
+       if (p->ifnetem_duplication_p > NETEM_PSCALE) {
+               netem_log(NETEM_LOG_ERROR, "error: duplication_p %d > %d",
+                   p->ifnetem_duplication_p, NETEM_PSCALE);
+               return EINVAL;
+       }
+
+       if (p->ifnetem_duplication_p > 0 &&
+           p->ifnetem_latency_ms == 0) {
+               /* we need to insert dup'ed packet with latency */
+               netem_log(NETEM_LOG_ERROR,
+                   "error: duplication needs latency param");
+               return EINVAL;
+       }
+
+       if (p->ifnetem_latency_ms > 1000) {
+               netem_log(NETEM_LOG_ERROR,
+                   "error: latency %d too big (> 1 sec)",
+                   p->ifnetem_latency_ms);
+               return EINVAL;
+       }
+
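+       /*
+        * The jitter sample spans roughly +/- 3.5x the configured value,
+        * so jitter is capped at a third of the latency, presumably to
+        * keep the combined delay from going far negative (it is clamped
+        * to 0 in latency_event() regardless).
+        */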
+       if (p->ifnetem_jitter_ms * 3 > p->ifnetem_latency_ms) {
+               netem_log(NETEM_LOG_ERROR,
+                   "error: jitter %dms too big (latency %dms)",
+                   p->ifnetem_jitter_ms, p->ifnetem_latency_ms);
+               return EINVAL;
+       }
+
+       /* if gr_gl == 0 (no loss), other prob should all be zero */
+       if (p->ifnetem_loss_p_gr_gl == 0 &&
+           (p->ifnetem_loss_p_gr_bl != 0 ||
+           p->ifnetem_loss_p_bl_br != 0 ||
+           p->ifnetem_loss_p_bl_gr != 0 ||
+           p->ifnetem_loss_p_br_bl != 0)) {
+               netem_log(NETEM_LOG_ERROR,
+                   "error: loss params not all zero when p_gr_gl is zero");
+               return EINVAL;
+       }
+
+       /* check state machine transition prob integrity */
+       if (p->ifnetem_loss_p_gr_gl > NETEM_PSCALE ||
+           /* gr_gl = NETEM_PSCALE for total loss */
+           p->ifnetem_loss_p_gr_bl > NETEM_PSCALE ||
+           p->ifnetem_loss_p_bl_br > NETEM_PSCALE ||
+           p->ifnetem_loss_p_bl_gr > NETEM_PSCALE ||
+           p->ifnetem_loss_p_br_bl > NETEM_PSCALE ||
+           p->ifnetem_loss_p_gr_gl + p->ifnetem_loss_p_gr_bl > NETEM_PSCALE ||
+           p->ifnetem_loss_p_bl_br + p->ifnetem_loss_p_bl_gr > NETEM_PSCALE) {
+               netem_log(NETEM_LOG_ERROR, "error: loss params too big");
+               return EINVAL;
+       }
+
+       if (p->ifnetem_reordering_p > NETEM_PSCALE) {
+               netem_log(NETEM_LOG_ERROR, "error: reordering %d > %d",
+                   p->ifnetem_reordering_p, NETEM_PSCALE);
+               return EINVAL;
+       }
+
+       return 0;
+}
+
+static void
+netem_set_params(struct netem *ne, const struct if_netem_params *p)
+{
+       NETEM_MTX_LOCK(ne);
+
+       struct token_bucket *tb = &ne->netem_bandwidth_model;
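+       /* depth holds one second of tokens at the configured rate; start half full */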
+       if (p->ifnetem_bandwidth_bps == 0) {
+               tb->depth = 0;
+               tb->rate = 0;
+               tb->token = 0;
+               tb->last = 0;
+       } else {
+               tb->depth = p->ifnetem_bandwidth_bps;
+               tb->rate = p->ifnetem_bandwidth_bps;
+               tb->token = p->ifnetem_bandwidth_bps / 2;
+               tb->last = mach_absolute_time();
+       }
+
+       struct corruption *corr = &ne->netem_corruption_model;
+       corr->corruption_p = p->ifnetem_corruption_p;
+
+       struct duplication *dup = &ne->netem_duplication_model;
+       dup->duplication_p = p->ifnetem_duplication_p;
+
+       struct latency *late = &ne->netem_latency_model;
+       late->latency_ms = p->ifnetem_latency_ms;
+       late->jitter_ms = p->ifnetem_jitter_ms;
+
+       struct loss *loss = &ne->netem_loss_model;
+       loss->state = GAP_RX;
+       loss->p_gr_gl = p->ifnetem_loss_p_gr_gl;
+       loss->p_gr_bl = p->ifnetem_loss_p_gr_bl;
+       loss->p_bl_gr = p->ifnetem_loss_p_bl_gr;
+       loss->p_bl_br = p->ifnetem_loss_p_bl_br;
+       loss->p_br_bl = p->ifnetem_loss_p_br_bl;
+
+       struct reordering *r = &ne->netem_reordering_model;
+       r->reordering_p = p->ifnetem_reordering_p;
+
+       netem_log(NETEM_LOG_INFO, "success: bandwidth %d bps", tb->rate);
+       netem_log(NETEM_LOG_INFO, "success: corruption %d\%",
+           corr->corruption_p);
+       netem_log(NETEM_LOG_INFO, "success: duplication %d\%",
+           dup->duplication_p);
+       netem_log(NETEM_LOG_INFO, "success: latency_ms %d jitter_ms %d",
+           late->latency_ms, late->jitter_ms);
+       netem_log(NETEM_LOG_INFO, "changed loss p_gr_gl %d p_gr_bl %d "
+           "p_bl_gr %d p_bl_br %d p_br_bl %d", loss->p_gr_gl, loss->p_gr_bl,
+           loss->p_bl_gr, loss->p_bl_br, loss->p_br_bl);
+       netem_log(NETEM_LOG_DEBUG, "success: reordering %d\%",
+           r->reordering_p);
+
+       NETEM_MTX_UNLOCK(ne);
+}
+
+void
+netem_get_params(struct netem *ne, struct if_netem_params *p)
+{
+       ASSERT(ne != NULL);
+       NETEM_MTX_LOCK(ne);
+
+       struct token_bucket *tb = &ne->netem_bandwidth_model;
+       p->ifnetem_bandwidth_bps = tb->depth;
+
+       struct corruption *corr = &ne->netem_corruption_model;
+       p->ifnetem_corruption_p = corr->corruption_p;
+
+       struct duplication *dup = &ne->netem_duplication_model;
+       p->ifnetem_duplication_p = dup->duplication_p;
+
+       struct latency *late = &ne->netem_latency_model;
+       p->ifnetem_latency_ms = late->latency_ms;
+       p->ifnetem_jitter_ms = late->jitter_ms;
+
+       struct loss *loss = &ne->netem_loss_model;
+       p->ifnetem_loss_p_gr_gl = loss->p_gr_gl;
+       p->ifnetem_loss_p_gr_bl = loss->p_gr_bl;
+       p->ifnetem_loss_p_bl_gr = loss->p_bl_gr;
+       p->ifnetem_loss_p_bl_br = loss->p_bl_br;
+       p->ifnetem_loss_p_br_bl = loss->p_br_bl;
+
+       struct reordering *r = &ne->netem_reordering_model;
+       p->ifnetem_reordering_p = r->reordering_p;
+
+       NETEM_MTX_UNLOCK(ne);
+}
+
+int
+netem_config(struct netem **ne, const char *name,
+    const struct if_netem_params *p, void *output_handle,
+    int (*output_func)(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts),
+    uint32_t output_max_batch_size)
+{
+       struct netem *netem = NULL;
+       boolean_t enable = TRUE;
+       int ret = 0;
+
+       if (p == NULL || (
+                   p->ifnetem_bandwidth_bps == 0 &&
+                   p->ifnetem_corruption_p == 0 &&
+                   p->ifnetem_duplication_p == 0 &&
+                   p->ifnetem_latency_ms == 0 &&
+                   p->ifnetem_jitter_ms == 0 &&
+                   p->ifnetem_loss_p_gr_gl == 0 &&
+                   p->ifnetem_loss_p_gr_bl == 0 &&
+                   p->ifnetem_loss_p_bl_br == 0 &&
+                   p->ifnetem_loss_p_bl_gr == 0 &&
+                   p->ifnetem_loss_p_br_bl == 0 &&
+                   p->ifnetem_reordering_p == 0)) {
+               enable = FALSE;
+       }
+
+       if (p != NULL) {
+               ret = netem_check_params(p);
+               if (ret != 0) {
+                       goto done;
+               }
+       }
+
+       if (enable) {
+               if (*ne == NULL) {
+                       netem_log(NETEM_LOG_INFO, "netem create %s", name);
+                       netem = netem_create(name, output_handle, output_func,
+                           output_max_batch_size);
+                       if (netem == NULL) {
+                               return ENOMEM;
+                       }
+                       atomic_set_ptr(ne, netem);
+               }
+               netem_set_params(*ne, p);
+       } else {
+               netem_log(NETEM_LOG_INFO, "netem disable %s", name);
+               if (*ne != NULL) {
+                       netem = *ne;
+                       atomic_set_ptr(ne, NULL);
+                       netem_log(NETEM_LOG_INFO, "netem destroy %s", name);
+                       netem_destroy(netem);
+               }
+               ret = 0;
+       }
+
+done:
+       netem_log(NETEM_LOG_INFO, "netem config ret %d", ret);
+       return ret;
+}
diff --git a/bsd/net/pktsched/pktsched_netem.h b/bsd/net/pktsched/pktsched_netem.h
new file mode 100644 (file)
index 0000000..bcd8b5a
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#define NETEM_MAX_BATCH_SIZE    32
+
+__BEGIN_DECLS
+
+extern int netem_init(void);
+extern void netem_fini(void);
+
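+/*
+ * netem_config() enables, reconfigures, or (when p is NULL or every
+ * parameter is zero) disables the emulator attached to *ne; output is
+ * invoked with batches of at most output_max_batch_size dequeued packets.
+ */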
+extern int netem_config(struct netem **ne, const char *name,
+    const struct if_netem_params *p, void *output_handle,
+    int (*output)(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts),
+    uint32_t output_max_batch_size);
+extern void netem_get_params(struct netem *ne, struct if_netem_params *p);
+extern void netem_destroy(struct netem *ne);
+extern int netem_enqueue(struct netem *ne, classq_pkt_t *p, boolean_t *pdrop);
+extern int netem_dequeue(struct netem *ne, pktsched_pkt_t *p,
+    boolean_t *ppending);
+
+__END_DECLS
index e4c6c2d8c8ffd34b395b04ba8a57787bbc9b8133..7ed8559a2416e1d28f0a28c48332c85db20927fb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2011-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -82,9 +82,8 @@
 /*
  * function prototypes
  */
-static int qfq_enqueue_ifclassq(struct ifclassq *, void *, classq_pkt_type_t,
-    boolean_t *);
-static void *qfq_dequeue_ifclassq(struct ifclassq *, classq_pkt_type_t *);
+static int qfq_enqueue_ifclassq(struct ifclassq *, classq_pkt_t *, boolean_t *);
+static void qfq_dequeue_ifclassq(struct ifclassq *, classq_pkt_t *);
 static int qfq_request_ifclassq(struct ifclassq *, cqrq_t, void *);
 static int qfq_clear_interface(struct qfq_if *);
 static struct qfq_class *qfq_class_create(struct qfq_if *, u_int32_t,
@@ -852,7 +851,7 @@ qfq_dequeue(struct qfq_if *qif, pktsched_pkt_t *pkt)
 
        IFCQ_LOCK_ASSERT_HELD(ifq);
 
-       pkt->pktsched_pkt = NULL;
+       _PKTSCHED_PKT_INIT(pkt);
 
        for (;;) {
                if (er_bits == 0) {
@@ -880,7 +879,8 @@ qfq_dequeue(struct qfq_if *qif, pktsched_pkt_t *pkt)
        VERIFY(cl != NULL && !qempty(&cl->cl_q));
 
        qfq_getq(cl, pkt);
-       VERIFY(pkt->pktsched_pkt != NULL); /* qalg must be work conserving */
+       /* qalg must be work conserving */
+       VERIFY(pkt->pktsched_ptype != QP_INVALID);
        len = pktsched_get_pkt_len(pkt);
 
 #if QFQ_DEBUG
@@ -902,8 +902,8 @@ qfq_dequeue(struct qfq_if *qif, pktsched_pkt_t *pkt)
                log(LOG_DEBUG, "%s: %s qid=%d dequeue pkt=0x%llx F=0x%llx "
                    "V=0x%llx", if_name(QFQIF_IFP(qif)), qfq_style(qif),
                    cl->cl_handle,
-                   (uint64_t)VM_KERNEL_ADDRPERM(pkt->pktsched_pkt), cl->cl_F,
-                   qif->qif_V);
+                   (uint64_t)VM_KERNEL_ADDRPERM(pkt->pktsched_pkt_mbuf),
+                   cl->cl_F, qif->qif_V);
        }
 
        if (qfq_update_class(qif, grp, cl)) {
@@ -1071,7 +1071,7 @@ qfq_enqueue(struct qfq_if *qif, struct qfq_class *cl, pktsched_pkt_t *pkt,
                log(LOG_DEBUG, "%s: %s qid=%d enqueue m=0x%llx state=%s 0x%x "
                    "S=0x%llx F=0x%llx V=0x%llx\n", if_name(QFQIF_IFP(qif)),
                    qfq_style(qif), cl->cl_handle,
-                   (uint64_t)VM_KERNEL_ADDRPERM(pkt->pktsched_pkt),
+                   (uint64_t)VM_KERNEL_ADDRPERM(pkt->pktsched_pkt_mbuf),
                    qfq_state2str(s),
                    qif->qif_bitmaps[s], cl->cl_S, cl->cl_F, qif->qif_V);
        }
@@ -1273,20 +1273,23 @@ qfq_addq(struct qfq_class *cl, pktsched_pkt_t *pkt, struct pf_mtag *t)
 #endif /* PF_ECN */
 
        VERIFY(pkt->pktsched_ptype == qptype(&cl->cl_q));
-       _addq(&cl->cl_q, pkt->pktsched_pkt);
+       _addq(&cl->cl_q, &pkt->pktsched_pkt);
        return 0;
 }
 
 static inline void
 qfq_getq(struct qfq_class *cl, pktsched_pkt_t *pkt)
 {
+       classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);
+
        IFCQ_LOCK_ASSERT_HELD(cl->cl_qif->qif_ifq);
 
        if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) {
                return sfb_getq(cl->cl_sfb, &cl->cl_q, pkt);
        }
 
-       return pktsched_pkt_encap(pkt, qptype(&cl->cl_q), _getq(&cl->cl_q));
+       _getq(&cl->cl_q, &p);
+       return pktsched_pkt_encap(pkt, &p);
 }
 
 static void
@@ -1564,7 +1567,7 @@ qfq_dump_sched(struct qfq_if *qif, const char *msg)
        log(LOG_DEBUG, "%s: %s      IB 0x%08x\n",
            if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_bitmaps[IB]);
        qfq_dump_groups(qif, 0xffffffff);
-};
+}
 #endif /* QFQ_DEBUG */
 
 /*
@@ -1572,8 +1575,7 @@ qfq_dump_sched(struct qfq_if *qif, const char *msg)
  * (*ifcq_enqueue) in struct ifclassq.
  */
 static int
-qfq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
-    boolean_t *pdrop)
+qfq_enqueue_ifclassq(struct ifclassq *ifq, classq_pkt_t *p, boolean_t *pdrop)
 {
        u_int32_t i = 0;
        int ret;
@@ -1582,15 +1584,16 @@ qfq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
 
        IFCQ_LOCK_ASSERT_HELD(ifq);
 
-       switch (ptype) {
+       switch (p->cp_ptype) {
        case QP_MBUF: {
-               struct mbuf *m = p;
+               struct mbuf *m = p->cp_mbuf;
                if (!(m->m_flags & M_PKTHDR)) {
                        /* should not happen */
                        log(LOG_ERR, "%s: packet does not have pkthdr\n",
                            if_name(ifq->ifcq_ifp));
                        IFCQ_CONVERT_LOCK(ifq);
                        m_freem(m);
+                       *p = CLASSQ_PKT_INITIALIZER(*p);
                        *pdrop = TRUE;
                        return ENOBUFS;
                }
@@ -1602,12 +1605,13 @@ qfq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
 
        default:
                VERIFY(0);
+               __builtin_unreachable();
                /* NOTREACHED */
        }
 
        VERIFY((u_int32_t)i < IFCQ_SC_MAX);
 
-       pktsched_pkt_encap(&pkt, ptype, p);
+       pktsched_pkt_encap(&pkt, p);
 
        ret = qfq_enqueue(ifq->ifcq_disc,
            ifq->ifcq_disc_slots[i].cl, &pkt, t);
@@ -1650,14 +1654,13 @@ qfq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
  *     CLASSQDQ_REMOVE must return the same packet if called immediately
  *     after CLASSQDQ_POLL.
  */
-static void *
-qfq_dequeue_ifclassq(struct ifclassq *ifq, classq_pkt_type_t *ptype)
+static void
+qfq_dequeue_ifclassq(struct ifclassq *ifq, classq_pkt_t *cpkt)
 {
        pktsched_pkt_t pkt;
-       bzero(&pkt, sizeof(pkt));
+       _PKTSCHED_PKT_INIT(&pkt);
        qfq_dequeue(ifq->ifcq_disc, &pkt);
-       *ptype = pkt.pktsched_ptype;
-       return pkt.pktsched_pkt;
+       *cpkt = pkt.pktsched_pkt;
 }
 
 static int
index d59bf0d5e4016dd1ac3c18ab6031d2756a699269..aa4ccec537e5cb67d2efc67e5aaa2df028de5e4b 100644 (file)
 /*
  * function prototypes
  */
-static int tcq_enqueue_ifclassq(struct ifclassq *, void *, classq_pkt_type_t,
-    boolean_t *);
-static void *tcq_dequeue_tc_ifclassq(struct ifclassq *, mbuf_svc_class_t,
-    classq_pkt_type_t *);
+static int tcq_enqueue_ifclassq(struct ifclassq *, classq_pkt_t *, boolean_t *);
+static void tcq_dequeue_tc_ifclassq(struct ifclassq *, mbuf_svc_class_t,
+    classq_pkt_t *);
 static int tcq_request_ifclassq(struct ifclassq *, cqrq_t, void *);
 static int tcq_clear_interface(struct tcq_if *);
 static struct tcq_class *tcq_class_create(struct tcq_if *, int, u_int32_t,
@@ -489,24 +488,23 @@ tcq_dequeue_cl(struct tcq_if *tif, struct tcq_class *cl, mbuf_svc_class_t sc,
        uint32_t len;
 
        IFCQ_LOCK_ASSERT_HELD(ifq);
+       pkt->pktsched_pkt_mbuf = NULL;
 
        if (cl == NULL) {
                cl = tcq_clh_to_clp(tif, MBUF_SCIDX(sc));
                if (cl == NULL) {
-                       pkt->pktsched_pkt = NULL;
                        return;
                }
        }
 
        if (qempty(&cl->cl_q)) {
-               pkt->pktsched_pkt = NULL;
                return;
        }
 
        VERIFY(!IFCQ_IS_EMPTY(ifq));
 
        tcq_getq(cl, pkt);
-       if (pkt->pktsched_pkt != NULL) {
+       if (pkt->pktsched_pkt_mbuf != NULL) {
                len = pktsched_get_pkt_len(pkt);
                IFCQ_DEC_LEN(ifq);
                IFCQ_DEC_BYTES(ifq, len);
@@ -578,7 +576,7 @@ tcq_addq(struct tcq_class *cl, pktsched_pkt_t *pkt, struct pf_mtag *t)
 #endif /* PF_ECN */
 
        VERIFY(pkt->pktsched_ptype == qptype(&cl->cl_q));
-       _addq(&cl->cl_q, pkt->pktsched_pkt);
+       _addq(&cl->cl_q, &pkt->pktsched_pkt);
 
        return 0;
 }
@@ -586,13 +584,16 @@ tcq_addq(struct tcq_class *cl, pktsched_pkt_t *pkt, struct pf_mtag *t)
 static inline void
 tcq_getq(struct tcq_class *cl, pktsched_pkt_t *pkt)
 {
+       classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);
+
        IFCQ_LOCK_ASSERT_HELD(cl->cl_tif->tif_ifq);
 
        if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) {
                return sfb_getq(cl->cl_sfb, &cl->cl_q, pkt);
        }
 
-       return pktsched_pkt_encap(pkt, qptype(&cl->cl_q), _getq(&cl->cl_q));
+       _getq(&cl->cl_q, &p);
+       return pktsched_pkt_encap(pkt, &p);
 }
 
 static void
@@ -739,8 +740,7 @@ tcq_style(struct tcq_if *tif)
  * (*ifcq_enqueue) in struct ifclassq.
  */
 static int
-tcq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
-    boolean_t *pdrop)
+tcq_enqueue_ifclassq(struct ifclassq *ifq, classq_pkt_t *p, boolean_t *pdrop)
 {
        u_int32_t i = 0;
        int ret;
@@ -749,14 +749,15 @@ tcq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
 
        IFCQ_LOCK_ASSERT_HELD(ifq);
 
-       if (ptype == QP_MBUF) {
-               struct mbuf *m = p;
+       if (p->cp_ptype == QP_MBUF) {
+               struct mbuf *m = p->cp_mbuf;
                if (!(m->m_flags & M_PKTHDR)) {
                        /* should not happen */
                        log(LOG_ERR, "%s: packet does not have pkthdr\n",
                            if_name(ifq->ifcq_ifp));
                        IFCQ_CONVERT_LOCK(ifq);
                        m_freem(m);
+                       *p = CLASSQ_PKT_INITIALIZER(*p);
                        *pdrop = TRUE;
                        return ENOBUFS;
                }
@@ -765,7 +766,7 @@ tcq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
        }
        VERIFY((u_int32_t)i < IFCQ_SC_MAX);
 
-       pktsched_pkt_encap(&pkt, ptype, p);
+       pktsched_pkt_encap(&pkt, p);
 
        ret = tcq_enqueue(ifq->ifcq_disc,
            ifq->ifcq_disc_slots[i].cl, &pkt, t);
@@ -795,6 +796,7 @@ tcq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
                break;
        default:
                VERIFY(0);
+               __builtin_unreachable();
        }
        return ret;
 }
@@ -808,19 +810,18 @@ tcq_enqueue_ifclassq(struct ifclassq *ifq, void *p, classq_pkt_type_t ptype,
  *     CLASSQDQ_REMOVE must return the same packet if called immediately
  *     after CLASSQDQ_POLL.
  */
-static void *
+static void
 tcq_dequeue_tc_ifclassq(struct ifclassq *ifq, mbuf_svc_class_t sc,
-    classq_pkt_type_t *ptype)
+    classq_pkt_t *cpkt)
 {
        pktsched_pkt_t pkt;
        u_int32_t i = MBUF_SCIDX(sc);
 
        VERIFY((u_int32_t)i < IFCQ_SC_MAX);
 
-       bzero(&pkt, sizeof(pkt));
+       _PKTSCHED_PKT_INIT(&pkt);
        (tcq_dequeue_cl(ifq->ifcq_disc, ifq->ifcq_disc_slots[i].cl, sc, &pkt));
-       *ptype = pkt.pktsched_ptype;
-       return pkt.pktsched_pkt;
+       *cpkt = pkt.pktsched_pkt;
 }
 
 static int
diff --git a/bsd/net/restricted_in_port.c b/bsd/net/restricted_in_port.c
new file mode 100644 (file)
index 0000000..7e39a0d
--- /dev/null
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <kern/task.h>
+#include <IOKit/IOBSD.h>
+#include <net/restricted_in_port.h>
+#include <netinet/in.h>
+#include <os/log.h>
+
+/*
+ * Entitlement required for using the port of the test entry
+ */
+#define ENTITLEMENT_TEST_PORT "com.apple.private.network.restricted.port.test"
+
+/*
+ * Entitlement required for setting the test sysctl variables
+ */
+#define ENTITLEMENT_TEST_CONTROL "com.apple.private.network.restricted.port.control"
+
+/*
+ * Use a single bitmap for quickly checking if a TCP or UDP port is restricted
+ */
+bitmap_t *restricted_port_bitmap = NULL;
+
+struct restricted_port_entry {
+       const char      *rpe_entitlement;   // entitlement to check for this port
+       in_port_t       rpe_port;           // restricted port number (host byte order)
+       uint16_t        rpe_flags;          // RPE_FLAG_xxx
+};
+
+/*
+ * Possible values for the field rpe_flags
+ */
+#define RPE_FLAG_SUPERUSER     0x01    // superuser can use the port
+#define RPE_FLAG_ENTITLEMENT   0x02    // can use the port with the required entitlement
+#define RPE_FLAG_TCP           0x04    // require entitlement for TCP
+#define RPE_FLAG_UDP           0x08    // require entitlement for UDP
+#define RPE_FLAG_TEST          0x10    // entry for testing
+
+static struct restricted_port_entry restricted_port_list[] = {
+#if CONFIG_EMBEDDED
+       /*
+        * Network relay proxy
+        */
+       {
+               .rpe_port = 62742,
+               .rpe_flags = RPE_FLAG_ENTITLEMENT | RPE_FLAG_TCP | RPE_FLAG_UDP,
+               .rpe_entitlement = "com.apple.private.network.restricted.port.nr_proxy",
+       },
+
+       /*
+        * Network relay control
+        */
+       {
+               .rpe_port = 62743,
+               .rpe_flags = RPE_FLAG_ENTITLEMENT | RPE_FLAG_UDP,
+               .rpe_entitlement = "com.apple.private.network.restricted.port.nr_control",
+       },
+
+       /*
+        * Entries for identityservicesd
+        */
+       {
+               .rpe_port = 61314,
+               .rpe_flags = RPE_FLAG_ENTITLEMENT | RPE_FLAG_TCP | RPE_FLAG_UDP,
+               .rpe_entitlement = "com.apple.private.network.restricted.port.ids_service_connector",
+       },
+       {
+               .rpe_port = 61315,
+               .rpe_flags = RPE_FLAG_ENTITLEMENT | RPE_FLAG_TCP | RPE_FLAG_UDP,
+               .rpe_entitlement = "com.apple.private.network.restricted.port.ids_cloud_service_connector",
+       },
+#endif /* CONFIG_EMBEDDED */
+
+#if (DEBUG || DEVELOPMENT)
+       /*
+        * Entries reserved for unit testing
+        */
+       {
+               .rpe_port = 0,
+               .rpe_flags = RPE_FLAG_TCP | RPE_FLAG_TEST,
+               .rpe_entitlement = ENTITLEMENT_TEST_PORT,
+       },
+       {
+               .rpe_port = 0,
+               .rpe_flags = RPE_FLAG_UDP | RPE_FLAG_TEST,
+               .rpe_entitlement = ENTITLEMENT_TEST_PORT,
+       },
+#endif /* (DEBUG || DEVELOPMENT) */
+
+       /*
+        * Sentinel to mark the actual end of the list (rpe_entitlement == NULL)
+        */
+       {
+               .rpe_port = 0,
+               .rpe_flags = 0,
+               .rpe_entitlement = NULL,
+       }
+};
+
+#define RPE_ENTRY_COUNT (sizeof(restricted_port_list) / sizeof(restricted_port_list[0]))
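+/*
+ * RPE_ENTRY_COUNT is only an upper bound: every walk of
+ * restricted_port_list stops at the first entry whose rpe_entitlement
+ * is NULL, i.e. at the sentinel above.
+ */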
+
+SYSCTL_NODE(_net, OID_AUTO, restricted_port,
+    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "restricted port");
+
+static int sysctl_restricted_port_bitmap SYSCTL_HANDLER_ARGS;
+static int sysctl_restricted_port_enforced SYSCTL_HANDLER_ARGS;
+static int sysctl_restricted_port_verbose SYSCTL_HANDLER_ARGS;
+
+SYSCTL_PROC(_net_restricted_port, OID_AUTO, bitmap,
+    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
+    0, 0, &sysctl_restricted_port_bitmap, "", "");
+
+/*
+ * In order to set the following sysctl variables the process needs to run as superuser
+ * or have the entitlement ENTITLEMENT_TEST_CONTROL
+ */
+#if (DEBUG || DEVELOPMENT)
+static int restricted_port_enforced = 1;
+SYSCTL_PROC(_net_restricted_port, OID_AUTO, enforced,
+    CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW | CTLFLAG_ANYBODY,
+    0, 0, &sysctl_restricted_port_enforced, "I", "");
+#else /* (DEBUG || DEVELOPMENT) */
+const int restricted_port_enforced = 1;
+SYSCTL_PROC(_net_restricted_port, OID_AUTO, enforced,
+    CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RD,
+    0, 0, &sysctl_restricted_port_enforced, "I", "");
+#endif /* (DEBUG || DEVELOPMENT) */
+
+static int restricted_port_verbose = 0;
+SYSCTL_PROC(_net_restricted_port, OID_AUTO, verbose,
+    CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW | CTLFLAG_ANYBODY,
+    0, 0, &sysctl_restricted_port_verbose, "I", "");
+
+#if (DEBUG || DEVELOPMENT)
+
+/*
+ * Dynamically register a test port set by the unit test program, to avoid
+ * conflicts with a restricted port currently used by its legitimate process.
+ * The value must be passed in host byte order.
+ */
+static uint16_t restricted_port_test = 0;
+
+static int sysctl_restricted_port_test_entitlement SYSCTL_HANDLER_ARGS;
+static int sysctl_restricted_port_test_superuser SYSCTL_HANDLER_ARGS;
+
+SYSCTL_PROC(_net_restricted_port, OID_AUTO, test_entitlement,
+    CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW | CTLFLAG_ANYBODY,
+    0, 0, &sysctl_restricted_port_test_entitlement, "UI", "");
+
+SYSCTL_PROC(_net_restricted_port, OID_AUTO, test_superuser,
+    CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW | CTLFLAG_ANYBODY,
+    0, 0, &sysctl_restricted_port_test_superuser, "UI", "");
+#endif /* (DEBUG || DEVELOPMENT) */
+
+static int
+sysctl_restricted_port_bitmap SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+
+       if (req->newptr) {
+               return EPERM;
+       }
+       int error = SYSCTL_OUT(req, restricted_port_bitmap, BITMAP_SIZE(UINT16_MAX));
+
+       return error;
+}
+
+static int
+sysctl_restricted_port_enforced SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int old_value = restricted_port_enforced;
+       int value = old_value;
+
+       int error = sysctl_handle_int(oidp, &value, 0, req);
+       if (error != 0 || !req->newptr) {
+               return error;
+       }
+#if (DEBUG || DEVELOPMENT)
+       if (proc_suser(current_proc()) != 0 &&
+           !IOTaskHasEntitlement(current_task(), ENTITLEMENT_TEST_CONTROL)) {
+               return EPERM;
+       }
+       restricted_port_enforced = value;
+       os_log(OS_LOG_DEFAULT,
+           "%s:%u sysctl net.restricted_port.enforced: %d -> %d",
+           proc_best_name(current_proc()), proc_selfpid(),
+           old_value, restricted_port_enforced);
+       return error;
+#else
+       return EPERM;
+#endif /* (DEBUG || DEVELOPMENT) */
+}
+
+static int
+sysctl_restricted_port_verbose SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int old_value = restricted_port_verbose;
+       int value = old_value;
+
+       int error = sysctl_handle_int(oidp, &value, 0, req);
+       if (error != 0 || !req->newptr) {
+               return error;
+       }
+       if (proc_suser(current_proc()) != 0 &&
+           !IOTaskHasEntitlement(current_task(), ENTITLEMENT_TEST_CONTROL)) {
+               return EPERM;
+       }
+       restricted_port_verbose = value;
+       os_log(OS_LOG_DEFAULT,
+           "%s:%u sysctl net.restricted_port.verbose: %d -> %d)",
+           proc_best_name(current_proc()), proc_selfpid(),
+           old_value, restricted_port_verbose);
+
+       return error;
+}
+
+#if (DEBUG || DEVELOPMENT)
+
+static int
+sysctl_restricted_port_test_common(struct sysctl_oid *oidp,
+    struct sysctl_req *req, bool test_superuser)
+{
+       uint16_t old_value = restricted_port_test;
+       int value = old_value;
+       unsigned int i;
+
+       int error = sysctl_handle_int(oidp, &value, 0, req);
+       if (error != 0 || !req->newptr) {
+               return error;
+       }
+       if (proc_suser(current_proc()) != 0 &&
+           !IOTaskHasEntitlement(current_task(), ENTITLEMENT_TEST_CONTROL)) {
+               return EPERM;
+       }
+       if (value < 0 || value > UINT16_MAX) {
+               return EINVAL;
+       }
+       if (value == 0) {
+               /*
+                * Clear the current test port entries
+                */
+               if (restricted_port_test != 0) {
+                       for (i = 0; i < RPE_ENTRY_COUNT; i++) {
+                               struct restricted_port_entry *rpe = &restricted_port_list[i];
+
+                               if (rpe->rpe_entitlement == NULL) {
+                                       break;
+                               }
+                               if (!(rpe->rpe_flags & RPE_FLAG_TEST)) {
+                                       continue;
+                               }
+                               rpe->rpe_port = 0;
+                               rpe->rpe_flags &= ~(RPE_FLAG_ENTITLEMENT | RPE_FLAG_SUPERUSER);
+                       }
+                       bitmap_clear(restricted_port_bitmap, restricted_port_test);
+                       restricted_port_test = 0;
+               }
+       } else {
+               for (i = 0; i < RPE_ENTRY_COUNT; i++) {
+                       struct restricted_port_entry *rpe = &restricted_port_list[i];
+
+                       if (rpe->rpe_entitlement == NULL) {
+                               break;
+                       }
+                       if (!(rpe->rpe_flags & RPE_FLAG_TEST)) {
+                               continue;
+                       }
+                       rpe->rpe_port = value;
+                       if (test_superuser) {
+                               rpe->rpe_flags |= RPE_FLAG_SUPERUSER;
+                               rpe->rpe_flags &= ~RPE_FLAG_ENTITLEMENT;
+                       } else {
+                               rpe->rpe_flags |= RPE_FLAG_ENTITLEMENT;
+                               rpe->rpe_flags &= ~RPE_FLAG_SUPERUSER;
+                       }
+               }
+               restricted_port_test = (uint16_t)value;
+               bitmap_set(restricted_port_bitmap, restricted_port_test);
+       }
+
+       return 0;
+}
+
+static int
+sysctl_restricted_port_test_entitlement SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       uint16_t old_value = restricted_port_test;
+       int error;
+
+       error = sysctl_restricted_port_test_common(oidp, req, false);
+       if (error == 0) {
+               os_log(OS_LOG_DEFAULT,
+                   "%s:%u sysctl net.restricted_port.test_entitlement: %u -> %u)",
+                   proc_best_name(current_proc()), proc_selfpid(),
+                   old_value, restricted_port_test);
+       }
+       return error;
+}
+
+static int
+sysctl_restricted_port_test_superuser SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       uint16_t old_value = restricted_port_test;
+       int error;
+
+       error = sysctl_restricted_port_test_common(oidp, req, true);
+       if (error == 0) {
+               os_log(OS_LOG_DEFAULT,
+                   "%s:%u sysctl net.restricted_port.test_superuser: %u -> %u)",
+                   proc_best_name(current_proc()), proc_selfpid(),
+                   old_value, restricted_port_test);
+       }
+       return error;
+}
+
+#endif /* (DEBUG || DEVELOPMENT) */
+
+void
+restricted_in_port_init(void)
+{
+       unsigned int i;
+
+
+       restricted_port_bitmap = bitmap_alloc(UINT16_MAX);
+
+       if (restricted_port_bitmap == NULL) {
+               panic("restricted_port_init: bitmap allocation failed");
+       }
+
+       for (i = 0; i < RPE_ENTRY_COUNT; i++) {
+               struct restricted_port_entry *rpe = &restricted_port_list[i];
+
+               if (rpe->rpe_entitlement == NULL) {
+                       break;
+               }
+               if (rpe->rpe_port == 0) {
+                       continue;
+               }
+               bitmap_set(restricted_port_bitmap, rpe->rpe_port);
+       }
+}
+
+static const char *
+port_flag_str(uint32_t port_flags)
+{
+       switch (port_flags) {
+       case PORT_FLAGS_LISTENER:
+               return "listener";
+       case PORT_FLAGS_BSD:
+               return "bsd";
+       case PORT_FLAGS_PF:
+               return "pf";
+       default:
+               break;
+       }
+       return "?";
+}
+
+/*
+ * The port is passed in network byte order
+ */
+bool
+current_task_can_use_restricted_in_port(in_port_t port, uint8_t protocol, uint32_t port_flags)
+{
+       unsigned int i;
+       struct proc *p = current_proc();
+       pid_t pid = proc_pid(p);
+
+       /*
+        * Quick check that does not take the protocol into account
+        */
+       if (!IS_RESTRICTED_IN_PORT(port) || restricted_port_enforced == 0) {
+               if (restricted_port_verbose > 1) {
+                       os_log(OS_LOG_DEFAULT,
+                           "port %u for protocol %u via %s can be used by process %s:%u",
+                           ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid);
+               }
+               return true;
+       }
+
+       for (i = 0; i < RPE_ENTRY_COUNT; i++) {
+               struct restricted_port_entry *rpe = &restricted_port_list[i];
+
+               if (rpe->rpe_entitlement == NULL) {
+                       break;
+               }
+               if (rpe->rpe_port == 0) {
+                       continue;
+               }
+               if ((protocol == IPPROTO_TCP && !(rpe->rpe_flags & RPE_FLAG_TCP)) ||
+                   (protocol == IPPROTO_UDP && !(rpe->rpe_flags & RPE_FLAG_UDP))) {
+                       continue;
+               }
+               if (rpe->rpe_port != ntohs(port)) {
+                       continue;
+               }
+               /*
+                * Found an entry in the list of restricted ports
+                *
+                * A process can use a restricted port if it meets at least one of
+                * the following conditions:
+                * - The process has the required entitlement
+                * - The port is marked as usable by root
+                */
+               task_t task = current_task();
+               if (rpe->rpe_flags & RPE_FLAG_SUPERUSER) {
+                       if (task == kernel_task || proc_suser(current_proc()) == 0) {
+                               os_log(OS_LOG_DEFAULT,
+                                   "root restricted port %u for protocol %u via %s can be used by superuser process %s:%u",
+                                   ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid);
+                               return true;
+                       }
+               }
+               if (rpe->rpe_flags & RPE_FLAG_ENTITLEMENT) {
+                       /*
+                        * Do not let the kernel use the port because there is
+                        * no entitlement for kernel extensions
+                        */
+                       if (task == kernel_task) {
+                               os_log(OS_LOG_DEFAULT,
+                                   "entitlement restricted port %u for protocol %u via %s cannot be used by kernel",
+                                   ntohs(port), protocol, port_flag_str(port_flags));
+                               return false;
+                       }
+                       if (!IOTaskHasEntitlement(current_task(), rpe->rpe_entitlement)) {
+                               os_log(OS_LOG_DEFAULT,
+                                   "entitlement restricted port %u for protocol %u via %s cannot be used by process %s:%u -- IOTaskHasEntitlement(%s) failed",
+                                   ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid, rpe->rpe_entitlement);
+                               return false;
+                       }
+                       os_log(OS_LOG_DEFAULT,
+                           "entitlement restricted port %u for protocol %u via %s can be used by process %s:%u",
+                           ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid);
+                       return true;
+               }
+               os_log(OS_LOG_DEFAULT,
+                   "root restricted port %u for protocol %u via %s cannot be used by process %s:%u",
+                   ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid);
+               return false;
+       }
+       if (restricted_port_verbose > 1) {
+               os_log(OS_LOG_DEFAULT,
+                   "port %u for protocol %u via %s can be used by process %s:%u",
+                   ntohs(port), protocol, port_flag_str(port_flags), proc_best_name(p), pid);
+       }
+       return true;
+}
diff --git a/bsd/net/restricted_in_port.h b/bsd/net/restricted_in_port.h
new file mode 100644 (file)
index 0000000..2520d9b
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _NETINET_IN_RESTRICTED_PORT_H_
+#define _NETINET_IN_RESTRICTED_PORT_H_
+
+#ifdef BSD_KERNEL_PRIVATE
+
+#include <kern/bits.h>
+
+#define PORT_FLAGS_LISTENER 0x00
+#define PORT_FLAGS_BSD      0x02
+#define PORT_FLAGS_PF       0x03
+#define PORT_FLAGS_MAX      0x03
+
+/*
+ * the port in network byte order
+ */
+#define IS_RESTRICTED_IN_PORT(x) (bitmap_test(restricted_port_bitmap, ntohs((uint16_t)(x))))
+
+extern bitmap_t *restricted_port_bitmap;
+
+extern void restricted_in_port_init(void);
+
+/*
+ * The port must be in network byte order
+ */
+extern bool current_task_can_use_restricted_in_port(in_port_t port, uint8_t protocol, uint32_t port_flags);
+
+#endif /* BSD_KERNEL_PRIVATE */
+
+#endif /* _NETINET_IN_RESTRICTED_PORT_H_ */
index bba83ba464e19dc73ead68945d6f204249504185..b63ba0183744560e61d098c382ac4dca1c38f96f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -76,6 +76,7 @@
 #include <sys/mcache.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
+#include <sys/sdt.h>
 #include <sys/kernel.h>
 #include <kern/locks.h>
 #include <kern/zalloc.h>
 
 extern void kdp_set_gateway_mac(void *gatewaymac);
 
-__private_extern__ struct rtstat rtstat  = { 0, 0, 0, 0, 0, 0 };
+__private_extern__ struct rtstat rtstat  = {
+       .rts_badredirect = 0,
+       .rts_dynamic = 0,
+       .rts_newgateway = 0,
+       .rts_unreach = 0,
+       .rts_wildcard = 0,
+       .rts_badrtgwroute = 0
+};
 struct radix_node_head *rt_tables[AF_MAX+1];
 
 decl_lck_mtx_data(, rnh_lock_data);    /* global routing tables mutex */
@@ -231,6 +239,7 @@ static lck_grp_attr_t       *rte_mtx_grp_attr;
 
 int rttrash = 0;               /* routes not in table but not freed */
 
+boolean_t trigger_v6_defrtr_select = FALSE;
 unsigned int rte_debug = 0;
 
 /* Possible flags for rte_debug */
@@ -362,11 +371,18 @@ struct matchleaf_arg {
  * of sockaddr_in for convenience).
  */
 static struct sockaddr sin_def = {
-       sizeof (struct sockaddr_in), AF_INET, { 0, }
+       .sa_len = sizeof (struct sockaddr_in),
+       .sa_family = AF_INET,
+       .sa_data = { 0, }
 };
 
 static struct sockaddr_in6 sin6_def = {
-       sizeof (struct sockaddr_in6), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0
+       .sin6_len = sizeof (struct sockaddr_in6),
+       .sin6_family = AF_INET6,
+       .sin6_port = 0,
+       .sin6_flowinfo = 0,
+       .sin6_addr = IN6ADDR_ANY_INIT,
+       .sin6_scope_id = 0
 };
 
 /*
@@ -1765,6 +1781,10 @@ rtrequest_common_locked(int req, struct sockaddr *dst0,
 
 #define        senderr(x) { error = x; goto bad; }
 
+       DTRACE_ROUTE6(rtrequest, int, req, struct sockaddr *, dst0,
+           struct sockaddr *, gateway, struct sockaddr *, netmask,
+           int, flags, unsigned int, ifscope);
+
        LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED);
        /*
         * Find the correct routing tree to use for this Address Family
@@ -1930,6 +1950,10 @@ rtrequest_common_locked(int req, struct sockaddr *dst0,
                if (rt_primary_default(rt, rt_key(rt))) {
                        set_primary_ifscope(rt_key(rt)->sa_family,
                            IFSCOPE_NONE);
+                       if ((rt->rt_flags & RTF_STATIC) &&
+                           rt_key(rt)->sa_family == PF_INET6) {
+                               trigger_v6_defrtr_select = TRUE;
+                       }
                }
 
 #if NECP
@@ -2453,7 +2477,7 @@ delete_rt:
  * Round up sockaddr len to multiples of 32-bytes.  This will reduce
  * or even eliminate the need to re-allocate the chunk of memory used
  * for rt_key and rt_gateway in the event the gateway portion changes.
- * Certain code paths (e.g. IPSec) are notorious for caching the address
+ * Certain code paths (e.g. IPsec) are notorious for caching the address
  * of rt_gateway; this rounding-up would help ensure that the gateway
  * portion never gets deallocated (though it may change contents) and
  * thus greatly simplifies things.
@@ -2823,7 +2847,7 @@ node_lookup(struct sockaddr *dst, struct sockaddr *netmask,
        struct radix_node *rn;
        struct sockaddr_storage ss, mask;
        int af = dst->sa_family;
-       struct matchleaf_arg ma = { ifscope };
+       struct matchleaf_arg ma = { .ifscope = ifscope };
        rn_matchf_t *f = rn_match_ifscope;
        void *w = &ma;
 
@@ -4410,7 +4434,7 @@ route_op_entitlement_check(struct socket *so,
                         * allowed accesses.
                         */
                        if (soopt_cred_check(so, PRIV_NET_RESTRICTED_ROUTE_NC_READ,
-                           allow_root) == 0)
+                           allow_root, false) == 0)
                                return (0);
                        else
                                return (-1);
index 5b4ea82ed58ebfe2d5276ec5a3059d07e00a697a..d1406262f5122ec0c829af5159a330721757e423 100644 (file)
@@ -107,6 +107,7 @@ struct route_old {
 #include <sys/eventhandler.h>
 #include <net/if_dl.h>
 
+extern boolean_t trigger_v6_defrtr_select;
 /*
  * Kernel resident routing tables.
  *
index 23d7bf2010f15bcd6672d033e71e893a881b2736..a8b2601593c7397ac9d3339cecad3917be078234 100644 (file)
@@ -94,9 +94,9 @@ static struct domain *routedomain = NULL;
 
 MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
 
-static struct sockaddr route_dst = { 2, PF_ROUTE, { 0, } };
-static struct sockaddr route_src = { 2, PF_ROUTE, { 0, } };
-static struct sockaddr sa_zero   = { sizeof(sa_zero), AF_INET, { 0, } };
+static struct sockaddr route_dst = { .sa_len = 2, .sa_family = PF_ROUTE, .sa_data = { 0, } };
+static struct sockaddr route_src = { .sa_len = 2, .sa_family = PF_ROUTE, .sa_data = { 0, } };
+static struct sockaddr sa_zero   = { .sa_len = sizeof(sa_zero), .sa_family = AF_INET, .sa_data = { 0, } };
 
 struct route_cb {
        u_int32_t       ip_count;       /* attached w/ AF_INET */
@@ -160,6 +160,9 @@ SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "routing");
 #define ADVANCE32(x, n)                                                 \
        (x += ROUNDUP32((n)->sa_len))
 
+#define RT_HAS_IFADDR(rt)                                               \
+       ((rt)->rt_ifa != NULL && (rt)->rt_ifa->ifa_addr != NULL)
+
 /*
  * It really doesn't make any sense at all for this code to share much
  * with raw_usrreq.c, since its functionality is so restricted.  XXX
@@ -383,7 +386,7 @@ route_output(struct mbuf *m, struct socket *so)
        }
 
        if (info.rti_info[RTAX_DST]->sa_family == AF_INET &&
-           info.rti_info[RTAX_DST]->sa_len != sizeof(dst_in)) {
+           info.rti_info[RTAX_DST]->sa_len != sizeof(struct sockaddr_in)) {
                /* At minimum, we need up to sin_addr */
                if (info.rti_info[RTAX_DST]->sa_len <
                    offsetof(struct sockaddr_in, sin_zero)) {
@@ -396,22 +399,29 @@ route_output(struct mbuf *m, struct socket *so)
                dst_in.sin_addr = SIN(info.rti_info[RTAX_DST])->sin_addr;
                info.rti_info[RTAX_DST] = (struct sockaddr *)&dst_in;
                dst_sa_family = info.rti_info[RTAX_DST]->sa_family;
+       } else if (info.rti_info[RTAX_DST]->sa_family == AF_INET6 &&
+           info.rti_info[RTAX_DST]->sa_len < sizeof(struct sockaddr_in6)) {
+               senderr(EINVAL);
        }
 
-       if (info.rti_info[RTAX_GATEWAY] != NULL &&
-           info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET &&
-           info.rti_info[RTAX_GATEWAY]->sa_len != sizeof(gate_in)) {
-               /* At minimum, we need up to sin_addr */
-               if (info.rti_info[RTAX_GATEWAY]->sa_len <
-                   offsetof(struct sockaddr_in, sin_zero)) {
+       if (info.rti_info[RTAX_GATEWAY] != NULL) {
+               if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET &&
+                   info.rti_info[RTAX_GATEWAY]->sa_len != sizeof(struct sockaddr_in)) {
+                       /* At minimum, we need up to sin_addr */
+                       if (info.rti_info[RTAX_GATEWAY]->sa_len <
+                           offsetof(struct sockaddr_in, sin_zero)) {
+                               senderr(EINVAL);
+                       }
+                       bzero(&gate_in, sizeof(gate_in));
+                       gate_in.sin_len = sizeof(gate_in);
+                       gate_in.sin_family = AF_INET;
+                       gate_in.sin_port = SIN(info.rti_info[RTAX_GATEWAY])->sin_port;
+                       gate_in.sin_addr = SIN(info.rti_info[RTAX_GATEWAY])->sin_addr;
+                       info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate_in;
+               } else if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET6 &&
+                   info.rti_info[RTAX_GATEWAY]->sa_len < sizeof(struct sockaddr_in6)) {
                        senderr(EINVAL);
                }
-               bzero(&gate_in, sizeof(gate_in));
-               gate_in.sin_len = sizeof(gate_in);
-               gate_in.sin_family = AF_INET;
-               gate_in.sin_port = SIN(info.rti_info[RTAX_GATEWAY])->sin_port;
-               gate_in.sin_addr = SIN(info.rti_info[RTAX_GATEWAY])->sin_addr;
-               info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate_in;
        }
 
        if (info.rti_info[RTAX_GENMASK]) {
@@ -755,7 +765,7 @@ flush:
                        return error;
                }
        } else {
-               struct sockproto route_proto = { PF_ROUTE, 0 };
+               struct sockproto route_proto = { .sp_family = PF_ROUTE, .sp_protocol = 0 };
                if (rp != NULL) {
                        rp->rcb_proto.sp_family = 0; /* Avoid us */
                }
@@ -1315,7 +1325,7 @@ rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
        struct rt_msghdr *rtm;
        struct mbuf *m;
        struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
-       struct sockproto route_proto = { PF_ROUTE, 0 };
+       struct sockproto route_proto = { .sp_family = PF_ROUTE, .sp_protocol = 0 };
 
        if (route_cb.any_count == 0) {
                return;
@@ -1342,7 +1352,7 @@ rt_ifmsg(struct ifnet *ifp)
        struct if_msghdr *ifm;
        struct mbuf *m;
        struct rt_addrinfo info;
-       struct  sockproto route_proto = { PF_ROUTE, 0 };
+       struct  sockproto route_proto = { .sp_family = PF_ROUTE, .sp_protocol = 0 };
 
        if (route_cb.any_count == 0) {
                return;
@@ -1379,7 +1389,7 @@ rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt)
        int pass;
        struct mbuf *m = 0;
        struct ifnet *ifp = ifa->ifa_ifp;
-       struct sockproto route_proto = { PF_ROUTE, 0 };
+       struct sockproto route_proto = { .sp_family = PF_ROUTE, .sp_protocol = 0 };
 
        LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED);
        RT_LOCK_ASSERT_HELD(rt);
@@ -1461,7 +1471,7 @@ rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
        struct mbuf *m = 0;
        struct ifnet *ifp = ifma->ifma_ifp;
        struct ifma_msghdr *ifmam;
-       struct sockproto route_proto = { PF_ROUTE, 0 };
+       struct sockproto route_proto = { .sp_family = PF_ROUTE, .sp_protocol = 0 };
 
        if (route_cb.any_count == 0) {
                return;
@@ -1608,6 +1618,9 @@ sysctl_dumpentry(struct radix_node *rn, void *vw)
        info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
        info.rti_info[RTAX_NETMASK] = rt_mask(rt);
        info.rti_info[RTAX_GENMASK] = rt->rt_genmask;
+       if (RT_HAS_IFADDR(rt)) {
+               info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
+       }
 
        if (w->w_op != NET_RT_DUMP2) {
                size = rt_msg2(RTM_GET, &info, NULL, w, credp);
diff --git a/bsd/net/sixxlowpan.c b/bsd/net/sixxlowpan.c
new file mode 100644 (file)
index 0000000..8ccaab0
--- /dev/null
@@ -0,0 +1,897 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright (c) 2008, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file is part of the Contiki operating system.
+ *
+ */
+
+/**
+ * \file
+ *         Implementation of the 6lowpan protocol
+ *         (RFC4944 and draft-hui-6lowpan-hc-01)
+ * \author Adam Dunkels <adam@sics.se>
+ * \author Nicolas Tsiftes <nvt@sics.se>
+ * \author Niclas Finne <nfi@sics.se>
+ * \author Mathilde Durvy <mdurvy@cisco.com>
+ * \author Julien Abeille <jabeille@cisco.com>
+ */
+
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/domain.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet/icmp6.h>
+#include <sys/errno.h>
+#include <libkern/libkern.h>
+
+
+#include <net/sixxlowpan.h>
+#include <net/frame802154.h>
+
+errno_t
+compress_hdr_hc1(struct frame802154 *, u_int8_t *,
+    long *, size_t *, u_int8_t *);
+errno_t
+uncompress_hdr_hc1(struct frame802154 *, u_int8_t *,
+    uint16_t, long *, size_t *, u_int8_t *);
+
+
+
+/**
+ * \addtogroup sicslowpan
+ * @{
+ */
+
+/**
+ * \name General sicslowpan defines
+ * @{
+ */
+/* Min and Max compressible UDP ports - HC06 */
+#define SICSLOWPAN_UDP_PORT_MIN                     0xF0B0
+#define SICSLOWPAN_UDP_PORT_MAX                     0xF0BF   /* F0B0 + 15 */
+
+/** @} */
+
+/**
+ * \name 6lowpan compressions
+ * @{
+ */
+#define SICSLOWPAN_COMPRESSION_IPV6        0
+#define SICSLOWPAN_COMPRESSION_HC1         1
+#define SICSLOWPAN_COMPRESSION_HC06        2
+/** @} */
+
+/**
+ * \name 6lowpan dispatches
+ * @{
+ */
+#define SICSLOWPAN_DISPATCH_IPV6                    0x41 /* 01000001 = 65 */
+#define SICSLOWPAN_DISPATCH_HC1                     0x42 /* 01000010 = 66 */
+#define SICSLOWPAN_DISPATCH_IPHC                    0x60 /* 011xxxxx = ... */
+#define SICSLOWPAN_DISPATCH_FRAG1                   0xc0 /* 11000xxx */
+#define SICSLOWPAN_DISPATCH_FRAGN                   0xe0 /* 11100xxx */
+/** @} */
+
+/** \name HC1 encoding
+ * @{
+ */
+#define SICSLOWPAN_HC1_NH_UDP                       0x02
+#define SICSLOWPAN_HC1_NH_TCP                       0x06
+#define SICSLOWPAN_HC1_NH_ICMP6                     0x04
+/** @} */
+
+/** \name HC_UDP encoding (works together with HC1)
+ * @{
+ */
+#define SICSLOWPAN_HC_UDP_ALL_C                     0xE0
+/** @} */
+
+/**
+ * \name IPHC encoding
+ * @{
+ */
+/*
+ * Values of fields within the IPHC encoding first byte
+ * (C stands for compressed and I for inline)
+ */
+#define SICSLOWPAN_IPHC_FL_C                        0x10
+#define SICSLOWPAN_IPHC_TC_C                        0x08
+#define SICSLOWPAN_IPHC_NH_C                        0x04
+#define SICSLOWPAN_IPHC_TTL_1                       0x01
+#define SICSLOWPAN_IPHC_TTL_64                      0x02
+#define SICSLOWPAN_IPHC_TTL_255                     0x03
+#define SICSLOWPAN_IPHC_TTL_I                       0x00
+
+
+/* Values of fields within the IPHC encoding second byte */
+#define SICSLOWPAN_IPHC_CID                         0x80
+
+#define SICSLOWPAN_IPHC_SAC                         0x40
+#define SICSLOWPAN_IPHC_SAM_00                      0x00
+#define SICSLOWPAN_IPHC_SAM_01                      0x10
+#define SICSLOWPAN_IPHC_SAM_10                      0x20
+#define SICSLOWPAN_IPHC_SAM_11                      0x30
+
+#define SICSLOWPAN_IPHC_SAM_BIT                     4
+
+#define SICSLOWPAN_IPHC_M                           0x08
+#define SICSLOWPAN_IPHC_DAC                         0x04
+#define SICSLOWPAN_IPHC_DAM_00                      0x00
+#define SICSLOWPAN_IPHC_DAM_01                      0x01
+#define SICSLOWPAN_IPHC_DAM_10                      0x02
+#define SICSLOWPAN_IPHC_DAM_11                      0x03
+
+#define SICSLOWPAN_IPHC_DAM_BIT                     0
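+
+/*
+ * In the stateless unicast case, SAM/DAM value 00 carries the full
+ * 128-bit address inline, 01 carries 64 bits, 10 carries 16 bits, and
+ * 11 elides the address entirely (it is derived from the link-layer
+ * address).
+ */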
+
+/* Link local context number */
+#define SICSLOWPAN_IPHC_ADDR_CONTEXT_LL             0
+/* 16-bit multicast addresses compression */
+#define SICSLOWPAN_IPHC_MCAST_RANGE                 0xA0
+/** @} */
+
+/* NHC_EXT_HDR */
+#define SICSLOWPAN_NHC_MASK                         0xF0
+#define SICSLOWPAN_NHC_EXT_HDR                      0xE0
+
+/**
+ * \name LOWPAN_UDP encoding (works together with IPHC)
+ * @{
+ */
+#define SICSLOWPAN_NHC_UDP_MASK                     0xF8
+#define SICSLOWPAN_NHC_UDP_ID                       0xF0
+#define SICSLOWPAN_NHC_UDP_CHECKSUMC                0x04
+#define SICSLOWPAN_NHC_UDP_CHECKSUMI                0x00
+/* values for port compression, _with checksum_, i.e. bit 5 set to 0 */
+#define SICSLOWPAN_NHC_UDP_CS_P_00  0xF0 /* all inline */
+#define SICSLOWPAN_NHC_UDP_CS_P_01  0xF1 /* source 16bit inline, dest = 0xF0 + 8 bit inline */
+#define SICSLOWPAN_NHC_UDP_CS_P_10  0xF2 /* source = 0xF0 + 8bit inline, dest = 16 bit inline */
+#define SICSLOWPAN_NHC_UDP_CS_P_11  0xF3 /* source & dest = 0xF0B + 4bit inline */
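+/*
+ * For example, with SICSLOWPAN_NHC_UDP_CS_P_11, source port 0xF0B3 and
+ * destination port 0xF0BA compress to the single inline byte 0x3A,
+ * since both fall in the 0xF0B0-0xF0BF range.
+ */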
+/** @} */
+
+
+/**
+ * \name The 6lowpan "headers" length
+ * @{
+ */
+
+#define SICSLOWPAN_IPV6_HDR_LEN                     1    /*one byte*/
+#define SICSLOWPAN_HC1_HDR_LEN                      3
+#define SICSLOWPAN_HC1_HC_UDP_HDR_LEN               7
+#define SICSLOWPAN_FRAG1_HDR_LEN                    4
+#define SICSLOWPAN_FRAGN_HDR_LEN                    5
+
+/* Minimum length of the compressed 6LoWPAN header */
+#define SICSLOWPAN_MIN_COMP_HDR_LEN                 7
+
+/* Minimum length of the uncompressed header: a bare IPv6 header */
+#define SICSLOWPAN_MIN_UNCOMP_HDR_LEN               40
+
+
+#define UIP_IPH_LEN    40    /* Size of IPv6 header */
+#define UIP_UDPH_LEN    8    /* Size of UDP header */
+#define UIP_TCPH_LEN   20    /* Size of TCP header */
+#define UIP_ICMPH_LEN   4    /* Size of ICMP header */
+
+/** @} */
+
+/**
+ * \brief The header for fragments
+ * \note We do not define separate structures for the FRAG1
+ * and FRAGN headers, even though they differ: for FRAG1,
+ * the offset field is simply unused
+ */
+/* struct sicslowpan_frag_hdr { */
+/*   uint16_t dispatch_size; */
+/*   uint16_t tag; */
+/*   uint8_t offset; */
+/* }; */
+
+/**
+ * \brief The HC1 header when HC_UDP is not used
+ *
+ * When all fields are compressed and HC_UDP is not used,
+ * we use this structure. If HC_UDP is used, the TTL is
+ * located elsewhere, and we use the sicslowpan_hc1_hc_udp
+ * structure
+ */
+/* struct sicslowpan_hc1_hdr { */
+/*   uint8_t dispatch; */
+/*   uint8_t encoding; */
+/*   uint8_t ttl; */
+/* }; */
+
+/**
+ * \brief HC1 followed by HC_UDP
+ */
+/* struct sicslowpan_hc1_hc_udp_hdr { */
+/*   uint8_t dispatch; */
+/*   uint8_t hc1_encoding; */
+/*   uint8_t hc_udp_encoding; */
+/*   uint8_t ttl; */
+/*   uint8_t ports; */
+/*   uint16_t udpchksum; */
+/* }; */
+
+/**
+ * \brief An address context for IPHC address compression;
+ * each context can hold up to 8 prefix bytes
+ */
+struct sicslowpan_addr_context {
+       uint8_t used; /* possibly used as prefix length */
+       uint8_t number;
+       uint8_t prefix[8];
+};
+
+/**
+ * \name Address compressibility test functions
+ * @{
+ */
+
+/**
+ * \brief check whether we can compress the IID in
+ * address 'a' to 16 bits.
+ * This is used for unicast addresses only, and is true
+ * if the address is of the form \<PREFIX\>::0000:00ff:fe00:XXXX
+ * NOTE: we currently assume 64-bit prefixes
+ */
+#define sicslowpan_is_iid_16_bit_compressable(a) \
+((((a)->u16[4]) == 0) &&                       \
+(((a)->u8[10]) == 0)&&                      \
+(((a)->u8[11]) == 0xff)&&                           \
+(((a)->u8[12]) == 0xfe)&&                           \
+(((a)->u8[13]) == 0))
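+
+/*
+ * Example (editor's sketch): assuming a 64-bit link-local prefix, the
+ * address fe80::ff:fe00:1234 passes the test above -- bytes 8..13 are
+ * 00 00 00 ff fe 00 -- so only the trailing 16 bits (0x1234) need to
+ * be carried inline.
+ */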
+
+/**
+ * \brief check whether the 9-bit group-id of the
+ * compressed multicast address is known. It is true
+ * if the 9-bit group is the all nodes or all routers
+ * group.
+ * \param a is typed uint8_t *
+ */
+#define sicslowpan_is_mcast_addr_decompressable(a) \
+(((*a & 0x01) == 0) &&                           \
+((*(a + 1) == 0x01) || (*(a + 1) == 0x02)))
+
+/**
+ * \brief check whether the 112-bit group-id of the
+ * multicast address is mappable to a 9-bit group-id
+ * It is true if the group is the all nodes or all
+ * routers group.
+ */
+#define sicslowpan_is_mcast_addr_compressable(a) \
+((((a)->u16[1]) == 0) &&                       \
+(((a)->u16[2]) == 0) &&                       \
+(((a)->u16[3]) == 0) &&                       \
+(((a)->u16[4]) == 0) &&                       \
+(((a)->u16[5]) == 0) &&                       \
+(((a)->u16[6]) == 0) &&                       \
+(((a)->u8[14]) == 0) &&                       \
+((((a)->u8[15]) == 1) || (((a)->u8[15]) == 2)))
+
+/* FFXX::00XX:XXXX:XXXX */
+#define sicslowpan_is_mcast_addr_compressable48(a) \
+((((a)->u16[1]) == 0) &&                       \
+(((a)->u16[2]) == 0) &&                       \
+(((a)->u16[3]) == 0) &&                       \
+(((a)->u16[4]) == 0) &&                       \
+(((a)->u8[10]) == 0))
+
+/* FFXX::00XX:XXXX */
+#define sicslowpan_is_mcast_addr_compressable32(a) \
+((((a)->u16[1]) == 0) &&                       \
+(((a)->u16[2]) == 0) &&                       \
+(((a)->u16[3]) == 0) &&                       \
+(((a)->u16[4]) == 0) &&                       \
+(((a)->u16[5]) == 0) &&                       \
+(((a)->u8[12]) == 0))
+
+/* FF02::00XX */
+#define sicslowpan_is_mcast_addr_compressable8(a) \
+((((a)->u8[1]) == 2) &&                        \
+(((a)->u16[1]) == 0) &&                       \
+(((a)->u16[2]) == 0) &&                       \
+(((a)->u16[3]) == 0) &&                       \
+(((a)->u16[4]) == 0) &&                       \
+(((a)->u16[5]) == 0) &&                       \
+(((a)->u16[6]) == 0) &&                       \
+(((a)->u8[14]) == 0))
+
+#define uip_is_addr_mac_addr_based(a, m) \
+((((a)->s6_addr[8])  == (((m)[0]) ^ 0x02)) &&        \
+(((a)->s6_addr[9])  == (m)[1]) &&            \
+(((a)->s6_addr[10]) == (m)[2]) &&            \
+(((a)->s6_addr[11]) == (m)[3]) &&            \
+(((a)->s6_addr[12]) == (m)[4]) &&            \
+(((a)->s6_addr[13]) == (m)[5]) &&            \
+(((a)->s6_addr[14]) == (m)[6]) &&            \
+(((a)->s6_addr[15]) == (m)[7]))
+
+/**
+ * Construct an IPv6 address from eight 16-bit words.
+ *
+ * Each 16-bit word is stored into the byte-addressed s6_addr array
+ * in network byte order.
+ *
+ * \hideinitializer
+ */
+#define uip_ip6addr(addr, addr0, addr1, addr2, addr3, addr4, addr5, addr6, addr7) do {\
+(addr)->s6_addr[0] = (u_int8_t)((addr0) >> 8);                          \
+(addr)->s6_addr[1] = (u_int8_t)(addr0);                                 \
+(addr)->s6_addr[2] = (u_int8_t)((addr1) >> 8);                          \
+(addr)->s6_addr[3] = (u_int8_t)(addr1);                                 \
+(addr)->s6_addr[4] = (u_int8_t)((addr2) >> 8);                          \
+(addr)->s6_addr[5] = (u_int8_t)(addr2);                                 \
+(addr)->s6_addr[6] = (u_int8_t)((addr3) >> 8);                          \
+(addr)->s6_addr[7] = (u_int8_t)(addr3);                                 \
+(addr)->s6_addr[8] = (u_int8_t)((addr4) >> 8);                          \
+(addr)->s6_addr[9] = (u_int8_t)(addr4);                                 \
+(addr)->s6_addr[10] = (u_int8_t)((addr5) >> 8);                         \
+(addr)->s6_addr[11] = (u_int8_t)(addr5);                                \
+(addr)->s6_addr[12] = (u_int8_t)((addr6) >> 8);                         \
+(addr)->s6_addr[13] = (u_int8_t)(addr6);                                \
+(addr)->s6_addr[14] = (u_int8_t)((addr7) >> 8);                         \
+(addr)->s6_addr[15] = (u_int8_t)(addr7);                                \
+} while(0)
+
+/**
+ * Construct an IPv6 address from sixteen 8-bit words.
+ *
+ * The sixteen bytes are copied into s6_addr in order.
+ *
+ * \hideinitializer
+ */
+#define uip_ip6addr_u8(addr, addr0, addr1, addr2, addr3, addr4, addr5, addr6, addr7, addr8, addr9, addr10, addr11, addr12, addr13, addr14, addr15) do {\
+(addr)->s6_addr[0] = addr0;                                       \
+(addr)->s6_addr[1] = addr1;                                       \
+(addr)->s6_addr[2] = addr2;                                       \
+(addr)->s6_addr[3] = addr3;                                       \
+(addr)->s6_addr[4] = addr4;                                       \
+(addr)->s6_addr[5] = addr5;                                       \
+(addr)->s6_addr[6] = addr6;                                       \
+(addr)->s6_addr[7] = addr7;                                       \
+(addr)->s6_addr[8] = addr8;                                       \
+(addr)->s6_addr[9] = addr9;                                       \
+(addr)->s6_addr[10] = addr10;                                     \
+(addr)->s6_addr[11] = addr11;                                     \
+(addr)->s6_addr[12] = addr12;                                     \
+(addr)->s6_addr[13] = addr13;                                     \
+(addr)->s6_addr[14] = addr14;                                     \
+(addr)->s6_addr[15] = addr15;                                     \
+} while(0)
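+
+/*
+ * Usage sketch (illustrative only; `frame` is a hypothetical
+ * struct frame802154 *): build the link-local prefix the way
+ * uncompress_hdr_hc1() below does, then fill in the IID from the
+ * link-layer address:
+ *
+ *   struct in6_addr src;
+ *   uip_ip6addr_u8(&src, 0xfe, 0x80, 0, 0, 0, 0, 0, 0,
+ *       0, 0, 0, 0, 0, 0, 0, 0);
+ *   uip_ds6_set_addr_iid(&src, (uip_lladdr_t *)frame->src_addr);
+ */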
+
+
+
+/** \brief 16 bit 802.15.4 address */
+typedef struct uip_802154_shortaddr {
+       uint8_t addr[2];
+} uip_802154_shortaddr;
+/** \brief 64 bit 802.15.4 address */
+typedef struct uip_802154_longaddr {
+       uint8_t addr[8];
+} uip_802154_longaddr;
+
+/** \brief 802.11 address */
+typedef struct uip_80211_addr {
+       uint8_t addr[6];
+} uip_80211_addr;
+
+/** \brief 802.3 address */
+typedef struct uip_eth_addr {
+       uint8_t addr[6];
+} uip_eth_addr;
+typedef uip_802154_longaddr uip_lladdr_t;
+
+#define UIP_802154_SHORTADDR_LEN 2
+#define UIP_802154_LONGADDR_LEN  8
+#define UIP_LLADDR_LEN UIP_802154_LONGADDR_LEN
+
+
+#define GET16(ptr) (((uint16_t)(((u_int8_t *)ptr)[0] << 8)) | (((u_int8_t *)ptr)[1]))
+#define SET16(ptr, value) do {     \
+((u_int8_t *)ptr)[0] = ((value) >> 8) & 0xff; \
+((u_int8_t *)ptr)[1] = (value) & 0xff;    \
+} while(0)
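+
+/*
+ * Example (editor's sketch): GET16/SET16 access 16-bit fields in
+ * network byte order independently of host endianness:
+ *
+ *   u_int8_t hdr[2];
+ *   SET16(hdr, 0xF0B3);          // hdr[0] == 0xF0, hdr[1] == 0xB3
+ *   uint16_t v = GET16(hdr);     // v == 0xF0B3 on any host
+ */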
+
+/** \name Pointers in the packetbuf buffer
+ *  @{
+ */
+#define PACKETBUF_FRAG_DISPATCH_SIZE 0   /* 16 bit */
+#define PACKETBUF_FRAG_TAG           2   /* 16 bit */
+#define PACKETBUF_FRAG_OFFSET        4   /* 8 bit */
+
+#define PACKETBUF_HC1_DISPATCH       0 /* 8 bit */
+#define PACKETBUF_HC1_ENCODING       1 /* 8 bit */
+#define PACKETBUF_HC1_TTL            2 /* 8 bit */
+
+#define PACKETBUF_HC1_HC_UDP_DISPATCH      0 /* 8 bit */
+#define PACKETBUF_HC1_HC_UDP_HC1_ENCODING  1 /* 8 bit */
+#define PACKETBUF_HC1_HC_UDP_UDP_ENCODING  2 /* 8 bit */
+#define PACKETBUF_HC1_HC_UDP_TTL           3 /* 8 bit */
+#define PACKETBUF_HC1_HC_UDP_PORTS         4 /* 8 bit */
+#define PACKETBUF_HC1_HC_UDP_CHKSUM        5 /* 16 bit */
+
+
+#define LINKADDR_SIZE 8
+typedef union {
+       unsigned char u8[LINKADDR_SIZE];
+       uint16_t u16;
+} linkaddr_t;
+
+static void
+uip_ds6_set_addr_iid(struct in6_addr *ipaddr, uip_lladdr_t *lladdr)
+{
+       /* We consider only links with IEEE EUI-64 identifier or
+        * IEEE 48-bit MAC addresses */
+#if (UIP_LLADDR_LEN == 8)
+       memcpy(ipaddr->s6_addr + 8, lladdr, UIP_LLADDR_LEN);
+       ipaddr->s6_addr[8] ^= 0x02;
+#elif (UIP_LLADDR_LEN == 6)
+       memcpy(ipaddr->s6_addr + 8, lladdr, 3);
+       ipaddr->s6_addr[11] = 0xff;
+       ipaddr->s6_addr[12] = 0xfe;
+       memcpy(ipaddr->s6_addr + 13, (uint8_t *)lladdr + 3, 3);
+       ipaddr->s6_addr[8] ^= 0x02;
+#else
+#error uip-ds6.c cannot build interface address when UIP_LLADDR_LEN is not 6 or 8
+#endif
+}
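+
+/*
+ * Worked example (editor's sketch): for an 8-byte EUI-64 of
+ * 00:11:22:33:44:55:66:77, the code above copies it into bytes 8..15
+ * of the address and flips the universal/local bit, producing the IID
+ * 02:11:22:33:44:55:66:77 -- i.e. fe80::211:2233:4455:6677 once the
+ * link-local prefix is in place.
+ */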
+
+static errno_t
+compress_hdr_ipv6(__unused struct frame802154 *ieee02154hdr,
+    __unused u_int8_t *payload,
+    long *hdroffset, size_t *hdrlen, u_int8_t *hdrbuf)
+{
+       /*
+        * Negative offset: the 6LoWPAN header needs to be prepended to the data
+        */
+       *hdroffset = -SICSLOWPAN_IPV6_HDR_LEN;
+       *hdrlen = SICSLOWPAN_IPV6_HDR_LEN;
+       hdrbuf[0] = SICSLOWPAN_DISPATCH_IPV6;
+
+       return 0;
+}
+
+
+#if 0
+/*--------------------------------------------------------------------*/
+/** \name HC1 compression and uncompression functions
+ *  @{                                                                */
+/*--------------------------------------------------------------------*/
+/**
+ * \brief Compress IP/UDP header using HC1 and HC_UDP
+ *
+ * This function is called by the 6lowpan code to create a compressed
+ * 6lowpan packet in the packetbuf buffer from a full IPv6 packet in the
+ * uip_buf buffer.
+ *
+ *
+ * If we can compress everything, we use HC1 dispatch, if not we use
+ * IPv6 dispatch.\n
+ * We can compress everything if:
+ *   - IP version is 6
+ *   - Flow label and traffic class are 0
+ *   - Both src and dest ip addresses are link local
+ *   - Both src and dest interface ID are recoverable from lower layer
+ *     header
+ *   - Next header is either ICMP, UDP or TCP
+ * Moreover, if next header is UDP, we try to compress it using HC_UDP.
+ * This is feasible if both ports are between 0xF0B0 and 0xF0B0 + 15\n\n
+ *
+ * Resulting header structure:
+ * - For ICMP, TCP, non compressed UDP\n
+ *   HC1 encoding = 11111010 (UDP) 11111110 (TCP) 11111100 (ICMP)\n
+ * \verbatim
+ *                      1                   2                   3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | LoWPAN HC1 Dsp | HC1 encoding  | IPv6 Hop limit| L4 hdr + data|
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | ...
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * \endverbatim
+ *
+ * - For compressed UDP
+ *   HC1 encoding = 11111011, HC_UDP encoding = 11100000\n
+ * \verbatim
+ *                      1                   2                   3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | LoWPAN HC1 Dsp| HC1 encoding  |  HC_UDP encod.| IPv6 Hop limit|
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | src p.| dst p.| UDP checksum                  | L4 data...
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * \endverbatim
+ *
+ * \param link_destaddr L2 destination address, needed to compress the
+ * IP destination field
+ */
+#endif
+errno_t
+compress_hdr_hc1(struct frame802154 *ieee02154hdr, u_int8_t *payload,
+    long *hdroffset, size_t *hdrlen, u_int8_t *hdrbuf)
+{
+       struct ip6_hdr *ip6 = (struct ip6_hdr *)(payload);
+
+       if (*hdrlen < SICSLOWPAN_MIN_COMP_HDR_LEN) {
+               return EINVAL;
+       }
+
+       *hdroffset = 0;
+
+       /*
+        * Check if all the assumptions for full compression
+        * are valid :
+        */
+       if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION ||
+           !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src) ||
+           !uip_is_addr_mac_addr_based(&ip6->ip6_src, ieee02154hdr->src_addr) ||
+           !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst) ||
+           !uip_is_addr_mac_addr_based(&ip6->ip6_dst,
+           ieee02154hdr->dest_addr) ||
+           (ip6->ip6_nxt != IPPROTO_ICMPV6 &&
+           ip6->ip6_nxt != IPPROTO_UDP &&
+           ip6->ip6_nxt != IPPROTO_TCP)) {
+               /*
+                * IPV6 DISPATCH
+                * Something cannot be compressed, use IPV6 DISPATCH,
+                * compress nothing, copy IPv6 header in packetbuf buffer
+                */
+               return compress_hdr_ipv6(ieee02154hdr, payload, hdroffset, hdrlen, hdrbuf);
+       } else {
+               /*
+                * HC1 DISPATCH
+                * maximum compression:
+                * All fields in the IP header but Hop Limit are elided
+                * If next header is UDP, we compress UDP header using HC2
+                */
+               hdrbuf[PACKETBUF_HC1_DISPATCH] = SICSLOWPAN_DISPATCH_HC1;
+
+               switch (ip6->ip6_nxt) {
+               case IPPROTO_ICMPV6:
+                       /* HC1 encoding and ttl */
+                       hdrbuf[PACKETBUF_HC1_ENCODING] = 0xFC;
+                       hdrbuf[PACKETBUF_HC1_TTL] = ip6->ip6_hlim;
+                       *hdrlen = SICSLOWPAN_HC1_HDR_LEN;
+                       *hdroffset = sizeof(struct ip6_hdr);
+                       break;
+
+               case IPPROTO_TCP:
+                       /* HC1 encoding and ttl */
+                       hdrbuf[PACKETBUF_HC1_ENCODING] = 0xFE;
+                       hdrbuf[PACKETBUF_HC1_TTL] = ip6->ip6_hlim;
+                       *hdrlen = SICSLOWPAN_HC1_HDR_LEN;
+                       *hdroffset = sizeof(struct ip6_hdr);
+                       break;
+
+               case IPPROTO_UDP: {
+                       struct udphdr *udp = (struct udphdr *)(uintptr_t)(ip6 + 1);
+
+                       /*
+                        * try to compress UDP header (we do only full compression).
+                        * This is feasible if both src and dest ports are between
+                        * SICSLOWPAN_UDP_PORT_MIN and SICSLOWPAN_UDP_PORT_MIN + 15
+                        */
+                       printf("source/remote ports %u/%u\n", ntohs(udp->uh_sport), ntohs(udp->uh_dport));
+                       if (ntohs(udp->uh_sport) >= SICSLOWPAN_UDP_PORT_MIN &&
+                           ntohs(udp->uh_sport) < SICSLOWPAN_UDP_PORT_MAX &&
+                           ntohs(udp->uh_dport) >= SICSLOWPAN_UDP_PORT_MIN &&
+                           ntohs(udp->uh_dport) < SICSLOWPAN_UDP_PORT_MAX) {
+                               /* HC1 encoding */
+                               hdrbuf[PACKETBUF_HC1_HC_UDP_HC1_ENCODING] = 0xFB;
+
+                               /* HC_UDP encoding, ttl, src and dest ports, checksum */
+                               hdrbuf[PACKETBUF_HC1_HC_UDP_UDP_ENCODING] = 0xE0;
+                               hdrbuf[PACKETBUF_HC1_HC_UDP_TTL] = ip6->ip6_hlim;
+
+                               hdrbuf[PACKETBUF_HC1_HC_UDP_PORTS] =
+                                   (uint8_t)((ntohs(udp->uh_sport) - SICSLOWPAN_UDP_PORT_MIN) << 4) +
+                                   (uint8_t)((ntohs(udp->uh_dport) - SICSLOWPAN_UDP_PORT_MIN));
+
+                               memcpy(&hdrbuf[PACKETBUF_HC1_HC_UDP_CHKSUM], &udp->uh_sum, 2);
+                               *hdrlen = SICSLOWPAN_HC1_HC_UDP_HDR_LEN;
+                               *hdroffset = sizeof(struct ip6_hdr) + sizeof(struct udphdr);
+                       } else {
+                               /* HC1 encoding and ttl */
+                               hdrbuf[PACKETBUF_HC1_ENCODING] = 0xFA;
+                               hdrbuf[PACKETBUF_HC1_TTL] = ip6->ip6_hlim;
+                               *hdrlen = SICSLOWPAN_HC1_HDR_LEN;
+                               *hdroffset = sizeof(struct ip6_hdr);
+                       }
+                       break;
+               }
+               }
+       }
+       return 0;
+}
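+
+/*
+ * Illustrative result (editor's sketch): for a fully compressible
+ * ICMPv6 packet with hop limit 255, compress_hdr_hc1() emits the
+ * 3-byte header
+ *
+ *   hdrbuf[0] = SICSLOWPAN_DISPATCH_HC1  (0x42 per RFC 4944)
+ *   hdrbuf[1] = 0xFC   (HC1 encoding: all fields elided, NH = ICMPv6)
+ *   hdrbuf[2] = 0xFF   (hop limit, carried inline)
+ *
+ * and reports *hdroffset = 40 so the caller strips the uncompressed
+ * IPv6 header from the payload.
+ */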
+
+
+/*--------------------------------------------------------------------*/
+/**
+ * \brief Uncompress HC1 (and HC_UDP) headers and put them in
+ * sicslowpan_buf
+ *
+ * This function is called by the input function when the dispatch is
+ * HC1.
+ * We %process the packet in the packetbuf buffer, uncompress the header
+ * fields, and copy the result into the sicslowpan buffer.
+ * At the end of the decompression, packetbuf_hdr_len and uncompressed_hdr_len
+ * are set to the appropriate values.
+ *
+ * \param ip_len Equal to 0 if the packet is not a fragment (IP length
+ * is then inferred from the L2 length), non-zero if the packet is a 1st
+ * fragment.
+ */
+errno_t
+uncompress_hdr_hc1(struct frame802154 *frame, u_int8_t *payload,
+    uint16_t ip_len, long *hdroffset, size_t *hdrlen, u_int8_t *hdrbuf)
+{
+       struct ip6_hdr *ip6 = (struct ip6_hdr *)hdrbuf;
+
+       if (payload[PACKETBUF_HC1_DISPATCH] == SICSLOWPAN_DISPATCH_IPV6) {
+               *hdroffset = -SICSLOWPAN_IPV6_HDR_LEN;
+               *hdrlen = SICSLOWPAN_IPV6_HDR_LEN;
+               return 0;
+       }
+
+       *hdroffset = 0;
+
+       /* version, traffic class, flow label */
+       ip6->ip6_flow = 0;
+       ip6->ip6_vfc = IPV6_VERSION;
+
+       /* src and dest ip addresses */
+       uip_ip6addr_u8(&ip6->ip6_src, 0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+       uip_ds6_set_addr_iid(&ip6->ip6_src,
+           (uip_lladdr_t *)frame->src_addr);
+
+       uip_ip6addr_u8(&ip6->ip6_dst, 0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+       uip_ds6_set_addr_iid(&ip6->ip6_dst,
+           (uip_lladdr_t *)frame->dest_addr);
+
+       *hdrlen = UIP_IPH_LEN;
+
+       /* Next header field */
+       switch (payload[PACKETBUF_HC1_ENCODING] & 0x06) {
+       case SICSLOWPAN_HC1_NH_ICMP6:
+               ip6->ip6_nxt = IPPROTO_ICMPV6;
+               ip6->ip6_hlim = payload[PACKETBUF_HC1_TTL];
+               *hdroffset = SICSLOWPAN_HC1_HDR_LEN;
+               break;
+
+       case SICSLOWPAN_HC1_NH_TCP:
+               ip6->ip6_nxt = IPPROTO_TCP;
+               ip6->ip6_hlim = payload[PACKETBUF_HC1_TTL];
+               *hdroffset = SICSLOWPAN_HC1_HDR_LEN;
+               break;
+
+       case SICSLOWPAN_HC1_NH_UDP:
+               ip6->ip6_nxt = IPPROTO_UDP;
+               if (payload[PACKETBUF_HC1_HC_UDP_HC1_ENCODING] & 0x01) {
+                       struct udphdr *udp = (struct udphdr *)(uintptr_t)(ip6 + 1); /* UDP header goes right after the IPv6 header in hdrbuf */
+
+                       /* UDP header is compressed with HC_UDP */
+                       if (payload[PACKETBUF_HC1_HC_UDP_UDP_ENCODING] !=
+                           SICSLOWPAN_HC_UDP_ALL_C) {
+                               printf("sicslowpan (uncompress_hdr), packet not supported");
+                               return EINVAL;
+                       }
+                       /* IP TTL */
+
+                       ip6->ip6_hlim = payload[PACKETBUF_HC1_HC_UDP_TTL];
+                       /* UDP ports, len, checksum */
+                       udp->uh_sport =
+                           htons(SICSLOWPAN_UDP_PORT_MIN + (payload[PACKETBUF_HC1_HC_UDP_PORTS] >> 4));
+                       udp->uh_dport =
+                           htons(SICSLOWPAN_UDP_PORT_MIN + (payload[PACKETBUF_HC1_HC_UDP_PORTS] & 0x0F));
+
+                       memcpy(&udp->uh_sum, &payload[PACKETBUF_HC1_HC_UDP_CHKSUM], 2);
+                       *hdrlen += UIP_UDPH_LEN;
+                       *hdroffset = SICSLOWPAN_HC1_HC_UDP_HDR_LEN;
+               } else {
+                       ip6->ip6_hlim = payload[PACKETBUF_HC1_TTL];
+                       *hdroffset = SICSLOWPAN_HC1_HDR_LEN;
+               }
+               break;
+
+       default:
+               /* this shouldn't happen, drop */
+               return EINVAL;
+       }
+
+       /* IP length field. */
+       if (ip_len == 0) {
+               size_t len = frame->payload_len - *hdroffset + *hdrlen - sizeof(struct ip6_hdr);
+
+               /* This is not a fragmented packet */
+               SET16(&ip6->ip6_plen, len);
+       } else {
+               /* This is a 1st fragment */
+               SET16(&ip6->ip6_plen, ip_len - UIP_IPH_LEN);
+       }
+       /* length field in UDP header */
+       if (ip6->ip6_nxt == IPPROTO_UDP) {
+               struct udphdr *udp = (struct udphdr *)(uintptr_t)(ip6 + 1);
+
+               memcpy(&udp->uh_ulen, &ip6->ip6_plen, 2);
+       }
+       return 0;
+}
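+
+/*
+ * Worked example (editor's sketch): for a non-fragment HC1/HC_UDP
+ * frame with frame->payload_len = 57, *hdroffset = 7 (the compressed
+ * header) and *hdrlen = 48 (IPv6 + UDP), the reconstructed payload
+ * length is 57 - 7 + 48 - 40 = 58 -- the 8-byte UDP header plus 50
+ * bytes of data.
+ */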
+
+errno_t
+sixxlowpan_compress(struct frame802154 *ieee02154hdr, u_int8_t *payload)
+{
+       long hdroffset;
+       size_t hdrlen;
+       u_int8_t hdrbuf[128];
+       errno_t error;
+
+       bzero(hdrbuf, sizeof(hdrbuf));
+       hdrlen = sizeof(hdrbuf);
+
+       error = compress_hdr_hc1(ieee02154hdr, payload,
+           &hdroffset, &hdrlen, hdrbuf);
+       if (error != 0) {
+               return error;
+       }
+
+       if (hdroffset < 0) {
+               /*
+                * A negative hdroffset means that hdrlen bytes of
+                * header must be prepended to the payload
+                */
+               memmove(&payload[hdrlen],
+                   &payload[0],
+                   ieee02154hdr->payload_len);
+               memcpy(&payload[0], hdrbuf, hdrlen);
+
+               ieee02154hdr->payload_len += hdrlen;
+       } else if (hdroffset > 0) {
+               /*
+                * A positive hdroffset is the size of the original
+                * headers that were compressed away -- i.e. where the
+                * untouched data starts.
+                *
+                * hdrlen is the size of the compressed header that
+                * replaces them
+                */
+               memmove(&payload[hdrlen],
+                   &payload[hdroffset],
+                   ieee02154hdr->payload_len - hdroffset);
+               memcpy(&payload[0], hdrbuf, hdrlen);
+
+               ieee02154hdr->payload_len += hdrlen - hdroffset;
+       }
+
+       return 0;
+}
+
+errno_t
+sixxlowpan_uncompress(struct frame802154 *ieee02154hdr, u_int8_t *payload)
+{
+       long hdroffset;
+       size_t hdrlen;
+       u_int8_t hdrbuf[128];
+       errno_t error;
+
+       bzero(hdrbuf, sizeof(hdrbuf));
+       hdrlen = sizeof(hdrbuf);
+
+       error = uncompress_hdr_hc1(ieee02154hdr, (u_int8_t *)payload,
+           0, &hdroffset, &hdrlen, hdrbuf);
+
+       if (error != 0) {
+               return error;
+       }
+
+       if (hdroffset < 0) {
+               /*
+                * A negative hdroffset means that hdrlen bytes of
+                * 6LoWPAN header must be stripped from the payload
+                */
+               memmove(&payload[0],
+                   &payload[hdrlen],
+                   ieee02154hdr->payload_len - hdrlen);
+               ieee02154hdr->payload_len -= hdrlen;
+       } else {
+               /*
+                * hdroffset is the size of the compressed header
+                * -- i.e. where the untouched data starts
+                *
+                * hdrlen is the size of the decompressed header
+                * that takes the place of the compressed header of size hdroffset
+                */
+               memmove(payload + hdrlen,
+                   payload + hdroffset,
+                   ieee02154hdr->payload_len - hdroffset);
+               memcpy(payload, hdrbuf, hdrlen);
+               ieee02154hdr->payload_len += hdrlen - hdroffset;
+       }
+
+       return 0;
+}
+
+errno_t
+sixxlowpan_output(struct frame802154 *ieee02154hdr, u_int8_t *payload)
+{
+       errno_t error = 0;
+
+       error = sixxlowpan_compress(ieee02154hdr, payload);
+       if (error != 0) {
+               goto done;
+       }
+
+       /*
+        * TODO: fragmentation
+        */
+
+done:
+       return error;
+}
+
+errno_t
+sixxlowpan_input(struct frame802154 *ieee02154hdr, u_int8_t *payload)
+{
+       errno_t error = 0;
+
+       error = sixxlowpan_uncompress(ieee02154hdr, payload);
+       if (error != 0) {
+               goto done;
+       }
+
+       /*
+        * TODO: fragmentation
+        */
+
+done:
+       return error;
+}
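+
+/*
+ * Usage sketch (illustrative; `frame` and `buf` come from a
+ * hypothetical 802.15.4 driver): compress on the way out, decompress
+ * on the way in.  Note that decompression grows the payload in place,
+ * so `buf` must have headroom beyond the received frame:
+ *
+ *   struct frame802154 frame;    // filled in by the driver
+ *   u_int8_t buf[128];           // frame payload, with headroom
+ *
+ *   if (sixxlowpan_output(&frame, buf) == 0) {
+ *           // transmit: buf[0..frame.payload_len) now holds the
+ *           // compressed packet
+ *   }
+ *
+ *   if (sixxlowpan_input(&frame, buf) == 0) {
+ *           // hand buf[0..frame.payload_len) to the IPv6 stack
+ *   }
+ */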
diff --git a/bsd/net/sixxlowpan.h b/bsd/net/sixxlowpan.h
new file mode 100644 (file)
index 0000000..3c5528e
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef sixxlowpan_h
+#define sixxlowpan_h
+
+#include <sys/types.h>
+
+#include "frame802154.h"
+
+errno_t sixxlowpan_compress(struct frame802154 *, u_int8_t *);
+errno_t sixxlowpan_uncompress(struct frame802154 *, u_int8_t *);
+
+errno_t sixxlowpan_output(struct frame802154 *, u_int8_t *);
+errno_t sixxlowpan_input(struct frame802154 *, u_int8_t *);
+
+#endif /* sixxlowpan_h */
index a4425e74151f85e7f9a73b4f359513274a72e677..f8af8c65722d88f8ee7c19f259b996e33a16b24a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -49,6 +49,8 @@ STUB(kern_buflet_get_object_segment);
 STUB(kern_buflet_set_data_offset);
 STUB(kern_buflet_set_data_length);
 STUB(kern_buflet_get_data_limit);
+STUB(kern_buflet_attach_buffer);
+STUB(kern_buflet_attach_buffer_with_segment_info);
 STUB(kern_channel_advance_slot);
 STUB(kern_channel_available_slot_count);
 STUB(kern_channel_get_context);
@@ -83,7 +85,7 @@ STUB(kern_nexus_controller_read_provider_attr);
 STUB(kern_nexus_controller_register_provider);
 STUB(kern_nexus_controller_unbind_provider_instance);
 STUB(kern_nexus_deregister_domain_provider);
-STUB(kern_nexus_get_builtin_domain_provider);
+STUB(kern_nexus_get_default_domain_provider);
 STUB(kern_nexus_get_context);
 STUB(kern_nexus_get_pbufpool);
 STUB(kern_nexus_register_domain_provider);
@@ -91,12 +93,15 @@ STUB(kern_packet_clear_flow_uuid);
 STUB(kern_packet_get_euuid);
 STUB(kern_packet_finalize);
 STUB(kern_packet_get_buflet_count);
+STUB(kern_packet_set_buflet_count);
 STUB(kern_packet_get_data_length);
 STUB(kern_packet_get_flow_uuid);
 STUB(kern_packet_get_inet_checksum);
+STUB(kern_packet_get_headroom);
 STUB(kern_packet_get_link_broadcast);
 STUB(kern_packet_get_link_ethfcs);
 STUB(kern_packet_get_link_header_offset);
+STUB(kern_packet_get_link_header_length);
 STUB(kern_packet_get_link_multicast);
 STUB(kern_packet_get_network_header_offset);
 STUB(kern_packet_get_next_buflet);
@@ -114,8 +119,10 @@ STUB(kern_packet_get_transport_traffic_background)
 STUB(kern_packet_get_transport_traffic_realtime)
 STUB(kern_packet_set_flow_uuid);
 STUB(kern_packet_set_inet_checksum);
+STUB(kern_packet_set_headroom);
 STUB(kern_packet_set_link_broadcast);
 STUB(kern_packet_set_link_header_offset);
+STUB(kern_packet_set_link_header_length);
 STUB(kern_packet_set_link_multicast);
 STUB(kern_packet_set_link_ethfcs);
 STUB(kern_packet_set_network_header_offset);
@@ -128,16 +135,34 @@ STUB(kern_packet_get_timestamp_requested);
 STUB(kern_packet_get_tx_completion_status);
 STUB(kern_packet_set_tx_completion_status);
 STUB(kern_packet_tx_completion);
+STUB(kern_packet_set_group_start);
+STUB(kern_packet_get_group_start);
+STUB(kern_packet_set_group_end);
+STUB(kern_packet_get_group_end);
+STUB(kern_packet_set_expire_time);
+STUB(kern_packet_get_expire_time);
+STUB(kern_packet_set_token);
+STUB(kern_packet_get_token);
+STUB(kern_packet_get_packetid);
+STUB(kern_packet_set_vlan_tag);
+STUB(kern_packet_get_vlan_tag);
+STUB(kern_packet_get_vlan_id);
+STUB(kern_packet_get_vlan_priority);
 STUB(kern_pbufpool_alloc);
 STUB(kern_pbufpool_alloc_batch);
+STUB(kern_pbufpool_alloc_batch_callback);
 STUB(kern_pbufpool_alloc_nosleep);
 STUB(kern_pbufpool_alloc_batch_nosleep);
+STUB(kern_pbufpool_alloc_batch_nosleep_callback);
 STUB(kern_pbufpool_create);
 STUB(kern_pbufpool_destroy);
 STUB(kern_pbufpool_free);
 STUB(kern_pbufpool_free_batch);
 STUB(kern_pbufpool_get_context);
 STUB(kern_pbufpool_get_memory_info);
+STUB(kern_pbufpool_alloc_buffer);
+STUB(kern_pbufpool_alloc_buffer_nosleep);
+STUB(kern_pbufpool_free_buffer);
 STUB(kern_segment_get_index);
 #undef STUB
 #endif /* !SKYWALK */
index 6eddebfe3d3678ed7b97a4db3f0ed52fac6593e1..dc4f2c43b2e410d7349361140f9b75c994417d1d 100644 (file)
@@ -32,6 +32,7 @@ PRIVATE_DATAFILES = \
        ip_fw2.h \
        mptcp_var.h \
        tcp.h \
+       tcp_cc.h \
        tcp_debug.h \
        tcp_var.h \
        tcp_cache.h \
@@ -39,7 +40,7 @@ PRIVATE_DATAFILES = \
        in_stat.h
 
 PRIVATE_KERNELFILES = ${KERNELFILES} \
-       ip_ecn.h ip_encap.h
+       ip_ecn.h ip_encap.h tcp_log.h
 
 INSTALL_MI_LIST        = ${DATAFILES}
 
index f4f4bddf90e19f1c46d7d8c7b54759b1f2dd826d..ff68ee31d122ef9a410972262f6ff3d6d55cc065 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -42,89 +42,117 @@ struct cbrt_table_entry {
 
 static const struct cbrt_table_entry cbrt_table[] = {
        /* mantissa = 0x1.00... */
-       {0x1.0000000000000p+0, 0x1.0000000000000p+0,
-        0x1.0000000000000p+0, 0x1.0000000000000p+0}, /* exponent = 0 */
-       {0x1.037e200000000p+1, 0x1.4400000000000p+0,
-        0x1.948b0fcd6e9e0p-1, 0x1.f91bd1b62b9cfp-2}, /* exponent = 1 */
-       {0x1.0315800000000p+2, 0x1.9800000000000p+0,
-        0x1.4141414141414p-1, 0x1.f9e7cba5753afp-3}, /* exponent = 2 */
+       {
+               .x            = 0x1.0000000000000p+0,
+               .cbrt_x       = 0x1.0000000000000p+0,
+               .recip_cbrt_x = 0x1.0000000000000p+0,
+               .recip_x      = 0x1.0000000000000p+0
+       }, /* exponent = 0 */
+       {
+               .x            = 0x1.037e200000000p+1,
+               .cbrt_x       = 0x1.4400000000000p+0,
+               .recip_cbrt_x = 0x1.948b0fcd6e9e0p-1,
+               .recip_x      = 0x1.f91bd1b62b9cfp-2
+       }, /* exponent = 1 */
+       {
+               .x            = 0x1.0315800000000p+2,
+               .cbrt_x       = 0x1.9800000000000p+0,
+               .recip_cbrt_x = 0x1.4141414141414p-1,
+               .recip_x      = 0x1.f9e7cba5753afp-3
+       }, /* exponent = 2 */
 
        /* mantissa = 0x1.04... */
-       {0x1.060c080000000p+0, 0x1.0200000000000p+0,
-        0x1.fc07f01fc07f0p-1, 0x1.f42f61dacddc6p-1}, /* exponent = 0 */
-       {0x1.05ff4c356ff40p+1, 0x1.450a000000000p+0,
-        0x1.933fff9b30002p-1, 0x1.f447b132ca3acp-2}, /* exponent = 1 */
-       {0x1.06e9aa0000000p+2, 0x1.9a00000000000p+0,
-        0x1.3fb013fb013fbp-1, 0x1.f289bb31fd41cp-3}, /* exponent = 2 */
+       {
+               .x            = 0x1.060c080000000p+0,
+               .cbrt_x       = 0x1.0200000000000p+0,
+               .recip_cbrt_x = 0x1.fc07f01fc07f0p-1,
+               .recip_x      = 0x1.f42f61dacddc6p-1
+       }, /* exponent = 0 */
+       {
+               .x            = 0x1.05ff4c356ff40p+1,
+               .cbrt_x       = 0x1.450a000000000p+0,
+               .recip_cbrt_x = 0x1.933fff9b30002p-1,
+               .recip_x      = 0x1.f447b132ca3acp-2
+       }, /* exponent = 1 */
+       {
+               .x            = 0x1.06e9aa0000000p+2,
+               .cbrt_x       = 0x1.9a00000000000p+0,
+               .recip_cbrt_x = 0x1.3fb013fb013fbp-1,
+               .recip_x      = 0x1.f289bb31fd41cp-3
+       }, /* exponent = 2 */
 
        /* mantissa = 0x1.08...*/
-       {0x1.09fe97c0b2e80p+0, 0x1.034a000000000p+0,
+       {.x = 0x1.09fe97c0b2e80p+0, .cbrt_x = 0x1.034a000000000p+0,
         0x1.f9815c85b04a3p-1, 0x1.ecc3168ac46e4p-1}, // exponent = 0
-       {0x1.0853ec0000000p+1, 0x1.4600000000000p+0, 0x1.920fb49d0e229p-1, 0x1.efde7dcdacefdp-2}, // exponent = 1
-       {0x1.0ac7700000000p+2, 0x1.9c00000000000p+0, 0x1.3e22cbce4a902p-1, 0x1.eb501ca81bb3ep-3}, // exponent = 2
+       {.x = 0x1.0853ec0000000p+1, .cbrt_x = 0x1.4600000000000p+0,
+        0x1.920fb49d0e229p-1, 0x1.efde7dcdacefdp-2}, // exponent = 1
+       {.x = 0x1.0ac7700000000p+2, .cbrt_x = 0x1.9c00000000000p+0,
+        0x1.3e22cbce4a902p-1, 0x1.eb501ca81bb3ep-3}, // exponent = 2
 
        /* mantissa = 0x1.0c...*/
-       {0x1.0c30400000000p+0, 0x1.0400000000000p+0, 0x1.f81f81f81f820p-1, 0x1.e8bb1d5b6e585p-1}, // exponent = 0
-       {0x1.0d39000000000p+1, 0x1.4800000000000p+0, 0x1.8f9c18f9c18fap-1, 0x1.e6da80ced1523p-2}, // exponent = 1
-       {0x1.0eaede0000000p+2, 0x1.9e00000000000p+0, 0x1.3c995a47babe7p-1, 0x1.e43a0fc24fe4bp-3}, // exponent = 2
+       {.x = 0x1.0c30400000000p+0, 0x1.0400000000000p+0,
+        0x1.f81f81f81f820p-1, 0x1.e8bb1d5b6e585p-1}, // exponent = 0
+       {.x = 0x1.0d39000000000p+1, 0x1.4800000000000p+0,
+        0x1.8f9c18f9c18fap-1, 0x1.e6da80ced1523p-2}, // exponent = 1
+       {.x = 0x1.0eaede0000000p+2, 0x1.9e00000000000p+0, 0x1.3c995a47babe7p-1, 0x1.e43a0fc24fe4bp-3}, // exponent = 2
 
        /* mantissa = 0x1.10...*/
-       {0x1.126cd80000000p+0, 0x1.0600000000000p+0, 0x1.f44659e4a4271p-1, 0x1.dd9fb30af3365p-1}, // exponent = 0
-       {0x1.122d740000000p+1, 0x1.4a00000000000p+0, 0x1.8d3018d3018d3p-1, 0x1.de0e209af882ep-2}, // exponent = 1
-       {0x1.12a0000000000p+2, 0x1.a000000000000p+0, 0x1.3b13b13b13b14p-1, 0x1.dd46baab49c24p-3}, // exponent = 2
+       {.x = 0x1.126cd80000000p+0, 0x1.0600000000000p+0, 0x1.f44659e4a4271p-1, 0x1.dd9fb30af3365p-1}, // exponent = 0
+       {.x = 0x1.122d740000000p+1, 0x1.4a00000000000p+0, 0x1.8d3018d3018d3p-1, 0x1.de0e209af882ep-2}, // exponent = 1
+       {.x = 0x1.12a0000000000p+2, 0x1.a000000000000p+0, 0x1.3b13b13b13b14p-1, 0x1.dd46baab49c24p-3}, // exponent = 2
 
        /* mantissa = 0x1.14...*/
-       {0x1.15f9b5b480000p+0, 0x1.0720000000000p+0, 0x1.f222c82dba316p-1, 0x1.d786108fd7a9fp-1}, // exponent = 0
-       {0x1.1731600000000p+1, 0x1.4c00000000000p+0, 0x1.8acb90f6bf3aap-1, 0x1.d577b2f5c6f87p-2}, // exponent = 1
-       {0x1.169ae20000000p+2, 0x1.a200000000000p+0, 0x1.3991c2c187f63p-1, 0x1.d67549c6f9b67p-3}, // exponent = 2
+       {.x = 0x1.15f9b5b480000p+0, 0x1.0720000000000p+0, 0x1.f222c82dba316p-1, 0x1.d786108fd7a9fp-1}, // exponent = 0
+       {.x = 0x1.1731600000000p+1, 0x1.4c00000000000p+0, 0x1.8acb90f6bf3aap-1, 0x1.d577b2f5c6f87p-2}, // exponent = 1
+       {.x = 0x1.169ae20000000p+2, 0x1.a200000000000p+0, 0x1.3991c2c187f63p-1, 0x1.d67549c6f9b67p-3}, // exponent = 2
 
        /* mantissa = 0x1.18...*/
-       {0x1.18c2000000000p+0, 0x1.0800000000000p+0, 0x1.f07c1f07c1f08p-1, 0x1.d2d9cbd756afdp-1}, // exponent = 0
-       {0x1.19fb2ce620540p+1, 0x1.4d1a000000000p+0, 0x1.897d564f5cf98p-1, 0x1.d0d34ccd78141p-2}, // exponent = 1
-       {0x1.1a9f900000000p+2, 0x1.a400000000000p+0, 0x1.3813813813814p-1, 0x1.cfc4ef7db5bffp-3}, // exponent = 2
+       {.x = 0x1.18c2000000000p+0, 0x1.0800000000000p+0, 0x1.f07c1f07c1f08p-1, 0x1.d2d9cbd756afdp-1}, // exponent = 0
+       {.x = 0x1.19fb2ce620540p+1, 0x1.4d1a000000000p+0, 0x1.897d564f5cf98p-1, 0x1.d0d34ccd78141p-2}, // exponent = 1
+       {.x = 0x1.1a9f900000000p+2, 0x1.a400000000000p+0, 0x1.3813813813814p-1, 0x1.cfc4ef7db5bffp-3}, // exponent = 2
 
        /* mantissa = 0x1.1c...*/
-       {0x1.1f2fe80000000p+0, 0x1.0a00000000000p+0, 0x1.ecc07b301ecc0p-1, 0x1.c86636f753a66p-1}, // exponent = 0
-       {0x1.1c44dc0000000p+1, 0x1.4e00000000000p+0, 0x1.886e5f0abb04ap-1, 0x1.cd159cdbba714p-2}, // exponent = 1
-       {0x1.1eae160000000p+2, 0x1.a600000000000p+0, 0x1.3698df3de0748p-1, 0x1.c934e4095d202p-3}, // exponent = 2
+       {.x = 0x1.1f2fe80000000p+0, 0x1.0a00000000000p+0, 0x1.ecc07b301ecc0p-1, 0x1.c86636f753a66p-1}, // exponent = 0
+       {.x = 0x1.1c44dc0000000p+1, 0x1.4e00000000000p+0, 0x1.886e5f0abb04ap-1, 0x1.cd159cdbba714p-2}, // exponent = 1
+       {.x = 0x1.1eae160000000p+2, 0x1.a600000000000p+0, 0x1.3698df3de0748p-1, 0x1.c934e4095d202p-3}, // exponent = 2
 
        /* mantissa = 0x1.20...*/
-       {0x1.21fac7ca59c00p+0, 0x1.0adc000000000p+0, 0x1.eb2a412496abdp-1, 0x1.c40112c606d3ep-1}, // exponent = 0
-       {0x1.2168000000000p+1, 0x1.5000000000000p+0, 0x1.8618618618618p-1, 0x1.c4e651e0c37d7p-2}, // exponent = 1
-       {0x1.22c6800000000p+2, 0x1.a800000000000p+0, 0x1.3521cfb2b78c1p-1, 0x1.c2c46544650c1p-3}, // exponent = 2
+       {.x = 0x1.21fac7ca59c00p+0, 0x1.0adc000000000p+0, 0x1.eb2a412496abdp-1, 0x1.c40112c606d3ep-1}, // exponent = 0
+       {.x = 0x1.2168000000000p+1, 0x1.5000000000000p+0, 0x1.8618618618618p-1, 0x1.c4e651e0c37d7p-2}, // exponent = 1
+       {.x = 0x1.22c6800000000p+2, 0x1.a800000000000p+0, 0x1.3521cfb2b78c1p-1, 0x1.c2c46544650c1p-3}, // exponent = 2
 
        /* mantissa = 0x1.24...*/
-       {0x1.25b6c00000000p+0, 0x1.0c00000000000p+0, 0x1.e9131abf0b767p-1, 0x1.be41e7ee3f7edp-1}, // exponent = 0
-       {0x1.269ae40000000p+1, 0x1.5200000000000p+0, 0x1.83c977ab2beddp-1, 0x1.bce853967753cp-2}, // exponent = 1
-       {0x1.26e8da0000000p+2, 0x1.aa00000000000p+0, 0x1.33ae45b57bcb2p-1, 0x1.bc72b67ab9ce7p-3}, // exponent = 2
+       {.x = 0x1.25b6c00000000p+0, 0x1.0c00000000000p+0, 0x1.e9131abf0b767p-1, 0x1.be41e7ee3f7edp-1}, // exponent = 0
+       {.x = 0x1.269ae40000000p+1, 0x1.5200000000000p+0, 0x1.83c977ab2beddp-1, 0x1.bce853967753cp-2}, // exponent = 1
+       {.x = 0x1.26e8da0000000p+2, 0x1.aa00000000000p+0, 0x1.33ae45b57bcb2p-1, 0x1.bc72b67ab9ce7p-3}, // exponent = 2
 
        /* mantissa = 0x1.28...*/
-       {0x1.29ff9aaaa2c00p+0, 0x1.0d4c000000000p+0, 0x1.e6b8275501adbp-1, 0x1.b7d7596e80007p-1}, // exponent = 0
-       {0x1.2bdda00000000p+1, 0x1.5400000000000p+0, 0x1.8181818181818p-1, 0x1.b51a30f9739f8p-2}, // exponent = 1
-       {0x1.2b15300000000p+2, 0x1.ac00000000000p+0, 0x1.323e34a2b10bfp-1, 0x1.b63f203c60c07p-3}, // exponent = 2
+       {.x = 0x1.29ff9aaaa2c00p+0, 0x1.0d4c000000000p+0, 0x1.e6b8275501adbp-1, 0x1.b7d7596e80007p-1}, // exponent = 0
+       {.x = 0x1.2bdda00000000p+1, 0x1.5400000000000p+0, 0x1.8181818181818p-1, 0x1.b51a30f9739f8p-2}, // exponent = 1
+       {.x = 0x1.2b15300000000p+2, 0x1.ac00000000000p+0, 0x1.323e34a2b10bfp-1, 0x1.b63f203c60c07p-3}, // exponent = 2
 
        /* mantissa = 0x1.2c...*/
-       {0x1.2c56b80000000p+0, 0x1.0e00000000000p+0, 0x1.e573ac901e574p-1, 0x1.b469f4adc7794p-1}, // exponent = 0
-       {0x1.2dfff74f29dc0p+1, 0x1.54ce000000000p+0, 0x1.80987c755886ap-1, 0x1.b203708429799p-2}, // exponent = 1
-       {0x1.2f4b8e0000000p+2, 0x1.ae00000000000p+0, 0x1.30d190130d190p-1, 0x1.b028f031c8644p-3}, // exponent = 2
+       {.x = 0x1.2c56b80000000p+0, 0x1.0e00000000000p+0, 0x1.e573ac901e574p-1, 0x1.b469f4adc7794p-1}, // exponent = 0
+       {.x = 0x1.2dfff74f29dc0p+1, 0x1.54ce000000000p+0, 0x1.80987c755886ap-1, 0x1.b203708429799p-2}, // exponent = 1
+       {.x = 0x1.2f4b8e0000000p+2, 0x1.ae00000000000p+0, 0x1.30d190130d190p-1, 0x1.b028f031c8644p-3}, // exponent = 2
 
        /* mantissa = 0x1.30...*/
-       {0x1.3310000000000p+0, 0x1.1000000000000p+0, 0x1.e1e1e1e1e1e1ep-1, 0x1.aadb93d39ae9cp-1}, // exponent = 0
-       {0x1.31304c0000000p+1, 0x1.5600000000000p+0, 0x1.7f405fd017f40p-1, 0x1.ad7a85e593e54p-2}, // exponent = 1
-       {0x1.338c000000000p+2, 0x1.b000000000000p+0, 0x1.2f684bda12f68p-1, 0x1.aa2f78f1b4cc6p-3}, // exponent = 2
+       {.x = 0x1.3310000000000p+0, 0x1.1000000000000p+0, 0x1.e1e1e1e1e1e1ep-1, 0x1.aadb93d39ae9cp-1}, // exponent = 0
+       {.x = 0x1.31304c0000000p+1, 0x1.5600000000000p+0, 0x1.7f405fd017f40p-1, 0x1.ad7a85e593e54p-2}, // exponent = 1
+       {.x = 0x1.338c000000000p+2, 0x1.b000000000000p+0, 0x1.2f684bda12f68p-1, 0x1.aa2f78f1b4cc6p-3}, // exponent = 2
 
        /* mantissa = 0x1.34... */
-       {0x1.35fb6f4579c00p+0, 0x1.10dc000000000p+0, 0x1.e05d5a24448c5p-1, 0x1.a6d6548fa984dp-1}, // exponent = 0
-       {0x1.3693000000000p+1, 0x1.5800000000000p+0, 0x1.7d05f417d05f4p-1, 0x1.a607fa909db1fp-2}, // exponent = 1
-       {0x1.37d6920000000p+2, 0x1.b200000000000p+0, 0x1.2e025c04b8097p-1, 0x1.a45211d8b748ap-3}, // exponent = 2
+       {.x = 0x1.35fb6f4579c00p+0, 0x1.10dc000000000p+0, 0x1.e05d5a24448c5p-1, 0x1.a6d6548fa984dp-1}, // exponent = 0
+       {.x = 0x1.3693000000000p+1, 0x1.5800000000000p+0, 0x1.7d05f417d05f4p-1, 0x1.a607fa909db1fp-2}, // exponent = 1
+       {.x = 0x1.37d6920000000p+2, 0x1.b200000000000p+0, 0x1.2e025c04b8097p-1, 0x1.a45211d8b748ap-3}, // exponent = 2
 
 /* mantissa = 0x1.38... */
-       {0x1.39e2c80000000p+0, 0x1.1200000000000p+0, 0x1.de5d6e3f8868ap-1, 0x1.a1941b013022dp-1}, // exponent = 0
-       {0x1.39fe541ac7840p+1, 0x1.5942000000000p+0, 0x1.7ba298eae8947p-1, 0x1.a16f787114257p-2}, // exponent = 1
-       {0x1.39ffaac000000p+2, 0x1.b300000000000p+0, 0x1.2d50a012d50a0p-1, 0x1.a16db0ec408b2p-3}, // exponent = 2
+       {.x = 0x1.39e2c80000000p+0, 0x1.1200000000000p+0, 0x1.de5d6e3f8868ap-1, 0x1.a1941b013022dp-1}, // exponent = 0
+       {.x = 0x1.39fe541ac7840p+1, 0x1.5942000000000p+0, 0x1.7ba298eae8947p-1, 0x1.a16f787114257p-2}, // exponent = 1
+       {.x = 0x1.39ffaac000000p+2, 0x1.b300000000000p+0, 0x1.2d50a012d50a0p-1, 0x1.a16db0ec408b2p-3}, // exponent = 2
 
        /* mantissa = 0x1.3c... */
-       {0x1.3dfc1312b0000p+0, 0x1.1330000000000p+0, 0x1.dc4cfaf10eb5cp-1, 0x1.9c322b87f17e8p-1}, // exponent = 0
+       {.x = 0x1.3dfc1312b0000p+0, 0x1.1330000000000p+0, 0x1.dc4cfaf10eb5cp-1, 0x1.9c322b87f17e8p-1}, // exponent = 0
        {0x1.3c05d40000000p+1, 0x1.5a00000000000p+0, 0x1.7ad2208e0ecc3p-1, 0x1.9ec1430b0dfc7p-2}, // exponent = 1
        {0x1.3c2b500000000p+2, 0x1.b400000000000p+0, 0x1.2c9fb4d812ca0p-1, 0x1.9e9016e2211b6p-3}, // exponent = 2
 
@@ -349,24 +377,24 @@ static const struct cbrt_table_entry cbrt_table[] = {
        {0x1.ee35ca0000000p+2, 0x1.fa00000000000p+0, 0x1.03091b51f5e1ap-1, 0x1.093712d33ff42p-3}, // exponent = 2
 
        /* mantissa = 0x1.f0... */
-       {0x1.f1fd112ab0c80p+0, 0x1.3f92000000000p+0, 0x1.9a2696dd75ba1p-1, 0x1.0733ed7907e73p-1}, // exponent = 0
-       {0x1.f1fc8b255bc40p+1, 0x1.92a2000000000p+0, 0x1.45898cb57730cp-1, 0x1.0734344eaebefp-2}, // exponent = 1
-       {0x1.f1ff2ff2d4ba0p+2, 0x1.fb4a000000000p+0, 0x1.02609989a73cfp-1, 0x1.0732ce999c3d1p-3}, // exponent = 2
+       {.x = 0x1.f1fd112ab0c80p+0, 0x1.3f92000000000p+0, 0x1.9a2696dd75ba1p-1, 0x1.0733ed7907e73p-1}, // exponent = 0
+       {.x = 0x1.f1fc8b255bc40p+1, 0x1.92a2000000000p+0, 0x1.45898cb57730cp-1, 0x1.0734344eaebefp-2}, // exponent = 1
+       {.x = 0x1.f1ff2ff2d4ba0p+2, 0x1.fb4a000000000p+0, 0x1.02609989a73cfp-1, 0x1.0732ce999c3d1p-3}, // exponent = 2
 
        /* mantissa = 0x1.f4... */
-       {0x1.f400000000000p+0, 0x1.4000000000000p+0, 0x1.999999999999ap-1, 0x1.0624dd2f1a9fcp-1}, // exponent = 0
-       {0x1.f713a00000000p+1, 0x1.9400000000000p+0, 0x1.446f86562d9fbp-1, 0x1.048a727489527p-2}, // exponent = 1
-       {0x1.f417f00000000p+2, 0x1.fc00000000000p+0, 0x1.0204081020408p-1, 0x1.061850f2a7123p-3}, // exponent = 2
+       {.x = 0x1.f400000000000p+0, 0x1.4000000000000p+0, 0x1.999999999999ap-1, 0x1.0624dd2f1a9fcp-1}, // exponent = 0
+       {.x = 0x1.f713a00000000p+1, 0x1.9400000000000p+0, 0x1.446f86562d9fbp-1, 0x1.048a727489527p-2}, // exponent = 1
+       {.x = 0x1.f417f00000000p+2, 0x1.fc00000000000p+0, 0x1.0204081020408p-1, 0x1.061850f2a7123p-3}, // exponent = 2
 
        /* mantissa = 0x1.f8... */
-       {0x1.f9fe36d7a7d80p+0, 0x1.4146000000000p+0, 0x1.97f9f956c92fdp-1, 0x1.030a055aebeddp-1}, // exponent = 0
-       {0x1.f9f8b6ce70ec0p+1, 0x1.94c6000000000p+0, 0x1.43d0d2af8e146p-1, 0x1.030cd637fd65ep-2}, // exponent = 1
-       {0x1.fa05fe0000000p+2, 0x1.fe00000000000p+0, 0x1.0101010101010p-1, 0x1.03060a0f151c2p-3}, // exponent = 2
+       {.x = 0x1.f9fe36d7a7d80p+0, 0x1.4146000000000p+0, 0x1.97f9f956c92fdp-1, 0x1.030a055aebeddp-1}, // exponent = 0
+       {.x = 0x1.f9f8b6ce70ec0p+1, 0x1.94c6000000000p+0, 0x1.43d0d2af8e146p-1, 0x1.030cd637fd65ep-2}, // exponent = 1
+       {.x = 0x1.fa05fe0000000p+2, 0x1.fe00000000000p+0, 0x1.0101010101010p-1, 0x1.03060a0f151c2p-3}, // exponent = 2
 
        /* mantissa = 0x1.fc... */
-       {0x1.fd6f080000000p+0, 0x1.4200000000000p+0, 0x1.970e4f80cb872p-1, 0x1.014a239d8b1a9p-1}, // exponent = 0
-       {0x1.fe95cc0000000p+1, 0x1.9600000000000p+0, 0x1.42d6625d51f87p-1, 0x1.00b59a78a8ffcp-2}, // exponent = 1
-       {0x1.0000000000000p+3, 0x1.0000000000000p+1, 0x1.0000000000000p-1, 0x1.0000000000000p-3}, // exponent = 2
+       {.x = 0x1.fd6f080000000p+0, 0x1.4200000000000p+0, 0x1.970e4f80cb872p-1, 0x1.014a239d8b1a9p-1}, // exponent = 0
+       {.x = 0x1.fe95cc0000000p+1, 0x1.9600000000000p+0, 0x1.42d6625d51f87p-1, 0x1.00b59a78a8ffcp-2}, // exponent = 1
+       {.x = 0x1.0000000000000p+3, 0x1.0000000000000p+1, 0x1.0000000000000p-1, 0x1.0000000000000p-3}, // exponent = 2
 };
 
 union floatdata { float f; int32_t x; };
index c38c6a6fa852a151116e2ff203b1ad68fb063221..2e2d4581d4ac0647a3f23d2fe22941d565adc47f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2002-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -457,20 +457,20 @@ struct test {
 };
 
 struct test tests[] = {
-       { "empty", test_empty, sizeof(test_empty), TRUE },
-       { "simple", test_simple, sizeof(test_simple), TRUE },
-       { "vendor", test_vendor, sizeof(test_vendor), TRUE },
-       { "no_end", test_no_end, sizeof(test_no_end), TRUE },
-       { "no magic", test_no_magic, sizeof(test_no_magic), FALSE },
-       { "short", test_short, sizeof(test_short), FALSE },
-       { NULL, NULL, 0, FALSE },
+       { .name = "empty", .data = test_empty, .len = sizeof(test_empty), .result = TRUE },
+       { .name = "simple", .data = test_simple, .len = sizeof(test_simple), .result = TRUE },
+       { .name = "vendor", .data = test_vendor, .len = sizeof(test_vendor), .result = TRUE },
+       { .name = "no_end", .data = test_no_end, .len = sizeof(test_no_end), .result = TRUE },
+       { .name = "no magic", .data = test_no_magic, .len = sizeof(test_no_magic), .result = FALSE },
+       { .name = "short", .data = test_short, .len = sizeof(test_short), .result =  FALSE },
+       { .name = NULL, .data = NULL, .len = 0, .result = FALSE },
 };
 
 
 static char buf[2048];
 
 int
-main()
+main(void)
 {
        int         i;
        dhcpol_t    options;
index 13b9cad5d7ff1637fb7d40f064de93a33f40cc6f..ebd14a5b1e5e9ea9a08be57c970c3e98682cbc8e 100644 (file)
@@ -45,6 +45,8 @@
 #include <libkern/tree.h>
 #include <kern/locks.h>
 #include <kern/debug.h>
+#include <kern/task.h>
+#include <mach/task_info.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/flowhash.h>
 #include <libkern/crypto/sha1.h>
 #include <libkern/crypto/crypto_internal.h>
 #include <os/log.h>
+#include <corecrypto/cc.h>
+#if CONTENT_FILTER
+#include <net/content_filter.h>
+#endif /* CONTENT_FILTER */
 
 #define FLOW_DIVERT_CONNECT_STARTED             0x00000001
 #define FLOW_DIVERT_READ_CLOSED                 0x00000002
@@ -472,13 +478,21 @@ flow_divert_packet_get_tlv(mbuf_t packet, int offset, uint8_t type, size_t buff_
 
        length = ntohl(length);
 
+       uint32_t data_offset = tlv_offset + sizeof(type) + sizeof(length);
+
+       if (length > (mbuf_pkthdr_len(packet) - data_offset)) {
+               FDLOG(LOG_ERR, &nil_pcb, "Length of %u TLV (%u) is larger than remaining packet data (%lu)", type, length, (mbuf_pkthdr_len(packet) - data_offset));
+               return EINVAL;
+       }
+
        if (val_size != NULL) {
                *val_size = length;
        }
 
        if (buff != NULL && buff_len > 0) {
+               memset(buff, 0, buff_len);
                size_t to_copy = (length < buff_len) ? length : buff_len;
-               error = mbuf_copydata(packet, tlv_offset + sizeof(type) + sizeof(length), to_copy, buff);
+               error = mbuf_copydata(packet, data_offset, to_copy, buff);
                if (error) {
                        return error;
                }
@@ -560,7 +574,7 @@ flow_divert_packet_verify_hmac(mbuf_t packet, uint32_t ctl_unit)
                goto done;
        }
 
-       if (memcmp(packet_hmac, computed_hmac, sizeof(packet_hmac))) {
+       if (cc_cmp_safe(sizeof(packet_hmac), packet_hmac, computed_hmac)) {
                FDLOG0(LOG_WARNING, &nil_pcb, "HMAC in token does not match computed HMAC");
                error = EINVAL;
                goto done;
@@ -631,6 +645,20 @@ flow_divert_check_no_expensive(struct flow_divert_pcb *fd_cb)
        return 0;
 }
 
+static errno_t
+flow_divert_check_no_constrained(struct flow_divert_pcb *fd_cb)
+{
+       struct inpcb *inp = NULL;
+
+       inp = sotoinpcb(fd_cb->so);
+       if (inp && INP_NO_CONSTRAINED(inp) && inp->inp_last_outifp &&
+           IFNET_IS_CONSTRAINED(inp->inp_last_outifp)) {
+               return EHOSTUNREACH;
+       }
+
+       return 0;
+}
+
 static void
 flow_divert_update_closed_state(struct flow_divert_pcb *fd_cb, int how, Boolean tunnel)
 {
@@ -1022,10 +1050,10 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr
 
        socket_unlock(so, 0);
 
-       if (signing_id == NULL) {
-               release_proc = flow_divert_get_src_proc(so, &src_proc);
-               if (src_proc != PROC_NULL) {
-                       proc_lock(src_proc);
+       release_proc = flow_divert_get_src_proc(so, &src_proc);
+       if (src_proc != PROC_NULL) {
+               proc_lock(src_proc);
+               if (signing_id == NULL) {
                        if (src_proc->p_csflags & (CS_VALID | CS_DEBUGGED)) {
                                const char * cs_id;
                                cs_id = cs_identity_get(src_proc);
@@ -1033,11 +1061,9 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr
                        } else {
                                FDLOG0(LOG_WARNING, fd_cb, "Signature is invalid");
                        }
-               } else {
-                       FDLOG0(LOG_WARNING, fd_cb, "Failed to determine the current proc");
                }
        } else {
-               src_proc = PROC_NULL;
+               FDLOG0(LOG_WARNING, fd_cb, "Failed to determine the current proc");
        }
 
        if (signing_id != NULL) {
@@ -1080,6 +1106,27 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr
                }
        }
 
+       if (error == 0 && src_proc != PROC_NULL) {
+               task_t task = proc_task(src_proc);
+               if (task != TASK_NULL) {
+                       audit_token_t audit_token;
+                       mach_msg_type_number_t count = TASK_AUDIT_TOKEN_COUNT;
+                       kern_return_t rc = task_info(task, TASK_AUDIT_TOKEN, (task_info_t)&audit_token, &count);
+                       if (rc == KERN_SUCCESS) {
+                               error = flow_divert_packet_append_tlv(connect_packet,
+                                   FLOW_DIVERT_TLV_APP_AUDIT_TOKEN,
+                                   sizeof(audit_token_t),
+                                   &audit_token);
+                               if (error) {
+                                       FDLOG(LOG_ERR, fd_cb, "failed to append app audit token: %d", error);
+                                       error = 0; /* do not treat this as fatal error, proceed */
+                               }
+                       } else {
+                               FDLOG(LOG_ERR, fd_cb, "failed to retrieve app audit token: %d", rc);
+                       }
+               }
+       }
+
        if (src_proc != PROC_NULL) {
                proc_unlock(src_proc);
                if (release_proc) {
@@ -1768,12 +1815,38 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet,
                        }
                        fd_cb->local_address = dup_sockaddr((struct sockaddr *)&local_address, 1);
                }
+               if (flow_divert_is_sockaddr_valid((struct sockaddr *)&local_address)) {
+                       if (inp->inp_vflag & INP_IPV4 && local_address.ss_family == AF_INET) {
+                               struct sockaddr_in *local_in_address = (struct sockaddr_in *)&local_address;
+                               inp->inp_lport = local_in_address->sin_port;
+                               memcpy(&inp->inp_laddr, &local_in_address->sin_addr, sizeof(struct in_addr));
+                       } else if (inp->inp_vflag & INP_IPV6 && local_address.ss_family == AF_INET6) {
+                               struct sockaddr_in6 *local_in6_address = (struct sockaddr_in6 *)&local_address;
+                               inp->inp_lport = local_in6_address->sin6_port;
+                               memcpy(&inp->in6p_laddr, &local_in6_address->sin6_addr, sizeof(struct in6_addr));
+                       }
+               }
 
                if (remote_address.ss_family != 0) {
+                       if (fd_cb->remote_address != NULL) {
+                               FREE(fd_cb->remote_address, M_SONAME);
+                               fd_cb->remote_address = NULL;
+                       }
                        if (remote_address.ss_len > sizeof(remote_address)) {
                                remote_address.ss_len = sizeof(remote_address);
                        }
                        fd_cb->remote_address = dup_sockaddr((struct sockaddr *)&remote_address, 1);
+                       if (flow_divert_is_sockaddr_valid((struct sockaddr *)&remote_address)) {
+                               if (inp->inp_vflag & INP_IPV4 && remote_address.ss_family == AF_INET) {
+                                       struct sockaddr_in *remote_in_address = (struct sockaddr_in *)&remote_address;
+                                       inp->inp_fport = remote_in_address->sin_port;
+                                       memcpy(&inp->inp_faddr, &remote_in_address->sin_addr, sizeof(struct in_addr));
+                               } else if (inp->inp_vflag & INP_IPV6 && remote_address.ss_family == AF_INET6) {
+                                       struct sockaddr_in6 *remote_in6_address = (struct sockaddr_in6 *)&remote_address;
+                                       inp->inp_fport = remote_in6_address->sin6_port;
+                                       memcpy(&inp->in6p_faddr, &remote_in6_address->sin6_addr, sizeof(struct in6_addr));
+                               }
+                       }
                } else {
                        error = EINVAL;
                        goto set_socket_state;
@@ -1857,6 +1930,15 @@ set_socket_state:
                        }
                        flow_divert_disconnect_socket(fd_cb->so);
                } else {
+#if NECP
+                       /* Update NECP client with connected five-tuple */
+                       if (!uuid_is_null(inp->necp_client_uuid)) {
+                               socket_unlock(fd_cb->so, 0);
+                               necp_client_assign_from_socket(fd_cb->so->last_pid, inp->necp_client_uuid, inp);
+                               socket_lock(fd_cb->so, 0);
+                       }
+#endif /* NECP */
+
                        flow_divert_send_buffered_data(fd_cb, FALSE);
                        soisconnected(fd_cb->so);
                }
@@ -1917,26 +1999,27 @@ flow_divert_handle_close(struct flow_divert_pcb *fd_cb, mbuf_t packet, int offse
 static mbuf_t
 flow_divert_get_control_mbuf(struct flow_divert_pcb *fd_cb)
 {
-       if (fd_cb->local_address != NULL) {
-               struct inpcb *inp = sotoinpcb(fd_cb->so);
-               if ((inp->inp_vflag & INP_IPV4) &&
-                   (inp->inp_flags & INP_RECVDSTADDR) &&
-                   fd_cb->local_address->sa_family == AF_INET &&
-                   fd_cb->local_address->sa_len >= sizeof(struct sockaddr_in)) {
+       struct inpcb *inp = sotoinpcb(fd_cb->so);
+       if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags & INP_RECVDSTADDR)) {
+               struct in_addr ia = { };
+
+               if (fd_cb->local_address != NULL && fd_cb->local_address->sa_family == AF_INET && fd_cb->local_address->sa_len >= sizeof(struct sockaddr_in)) {
                        struct sockaddr_in *sin = (struct sockaddr_in *)(void *)fd_cb->local_address;
+                       bcopy(&sin->sin_addr, &ia, sizeof(struct in_addr));
+               }
 
-                       return sbcreatecontrol((caddr_t) &sin->sin_addr, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
-               } else if ((inp->inp_vflag & INP_IPV6) &&
-                   (inp->inp_flags & IN6P_PKTINFO) &&
-                   fd_cb->local_address->sa_family == AF_INET6 &&
-                   fd_cb->local_address->sa_len >= sizeof(struct sockaddr_in6)) {
-                       struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)fd_cb->local_address;
-                       struct in6_pktinfo pi6;
+               return sbcreatecontrol((caddr_t)&ia, sizeof(ia), IP_RECVDSTADDR, IPPROTO_IP);
+       } else if ((inp->inp_vflag & INP_IPV6) && (inp->inp_flags & IN6P_PKTINFO)) {
+               struct in6_pktinfo pi6;
+               memset(&pi6, 0, sizeof(pi6));
 
+               if (fd_cb->local_address != NULL && fd_cb->local_address->sa_family == AF_INET6 && fd_cb->local_address->sa_len >= sizeof(struct sockaddr_in6)) {
+                       struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)fd_cb->local_address;
                        bcopy(&sin6->sin6_addr, &pi6.ipi6_addr, sizeof(struct in6_addr));
                        pi6.ipi6_ifindex = 0;
-                       return sbcreatecontrol((caddr_t)&pi6, sizeof(struct in6_pktinfo), IPV6_PKTINFO, IPPROTO_IPV6);
                }
+
+               return sbcreatecontrol((caddr_t)&pi6, sizeof(pi6), IPV6_PKTINFO, IPPROTO_IPV6);
        }
        return NULL;
 }
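
flow_divert_get_control_mbuf() now always builds the control message when the socket asked for it, falling back to a zeroed address when no local address is known. On the receiving side this surfaces through the standard cmsg API; a minimal userspace sketch of consuming IP_RECVDSTADDR (not part of the commit):

    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <string.h>
    #include <stdio.h>

    static void
    recv_with_dstaddr(int fd)
    {
            char data[2048];
            char cbuf[CMSG_SPACE(sizeof(struct in_addr))];
            struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
            struct msghdr msg = {
                    .msg_iov = &iov, .msg_iovlen = 1,
                    .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
            };
            int on = 1;

            /* Ask the stack to deliver each datagram's destination address. */
            setsockopt(fd, IPPROTO_IP, IP_RECVDSTADDR, &on, sizeof(on));

            if (recvmsg(fd, &msg, 0) < 0)
                    return;
            for (struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm != NULL;
                cm = CMSG_NXTHDR(&msg, cm)) {
                    if (cm->cmsg_level == IPPROTO_IP &&
                        cm->cmsg_type == IP_RECVDSTADDR) {
                            struct in_addr ia;
                            memcpy(&ia, CMSG_DATA(cm), sizeof(ia));
                            printf("delivered to %s\n", inet_ntoa(ia));
                    }
            }
    }
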
@@ -1981,7 +2064,8 @@ flow_divert_handle_data(struct flow_divert_pcb *fd_cb, mbuf_t packet, size_t off
                        FDLOG(LOG_ERR, fd_cb, "mbuf_split failed: %d", error);
                } else {
                        if (flow_divert_check_no_cellular(fd_cb) ||
-                           flow_divert_check_no_expensive(fd_cb)) {
+                           flow_divert_check_no_expensive(fd_cb) ||
+                           flow_divert_check_no_constrained(fd_cb)) {
                                flow_divert_update_closed_state(fd_cb, SHUT_RDWR, TRUE);
                                flow_divert_send_close(fd_cb, SHUT_RDWR);
                                flow_divert_disconnect_socket(fd_cb->so);
@@ -2012,13 +2096,14 @@ flow_divert_handle_data(struct flow_divert_pcb *fd_cb, mbuf_t packet, size_t off
                                        }
 
                                        mctl = flow_divert_get_control_mbuf(fd_cb);
-                                       if (sbappendaddr(&fd_cb->so->so_rcv, append_sa, data, mctl, NULL)) {
+                                       int append_error = 0;
+                                       if (sbappendaddr(&fd_cb->so->so_rcv, append_sa, data, mctl, &append_error)) {
                                                fd_cb->bytes_received += data_size;
                                                flow_divert_add_data_statistics(fd_cb, data_size, FALSE);
                                                fd_cb->sb_size = fd_cb->so->so_rcv.sb_cc;
                                                sorwakeup(fd_cb->so);
                                                data = NULL;
-                                       } else {
+                                       } else if (append_error != EJUSTRETURN) {
                                                FDLOG0(LOG_ERR, fd_cb, "received data, but sbappendaddr failed");
                                        }
                                        if (!error) {
@@ -2082,6 +2167,11 @@ flow_divert_handle_group_init(struct flow_divert_group *group, mbuf_t packet, in
 
        lck_rw_lock_exclusive(&group->lck);
 
+       if (group->token_key != NULL) {
+               FREE(group->token_key, M_TEMP);
+               group->token_key = NULL;
+       }
+
        MALLOC(group->token_key, uint8_t *, key_size, M_TEMP, M_WAITOK);
        error = flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_TOKEN_KEY, key_size, group->token_key, NULL);
        if (error) {
@@ -2554,6 +2644,12 @@ flow_divert_append_target_endpoint_tlv(mbuf_t connect_packet, struct sockaddr *t
        int error = 0;
        int port  = 0;
 
+       if (!flow_divert_is_sockaddr_valid(toaddr)) {
+               FDLOG(LOG_ERR, &nil_pcb, "Invalid target address, family = %u, length = %u", toaddr->sa_family, toaddr->sa_len);
+               error = EINVAL;
+               goto done;
+       }
+
        error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_TARGET_ADDRESS, toaddr->sa_len, toaddr);
        if (error) {
                goto done;
@@ -2594,13 +2690,13 @@ flow_divert_is_sockaddr_valid(struct sockaddr *addr)
 {
        switch (addr->sa_family) {
        case AF_INET:
-               if (addr->sa_len != sizeof(struct sockaddr_in)) {
+               if (addr->sa_len < sizeof(struct sockaddr_in)) {
                        return FALSE;
                }
                break;
 #if INET6
        case AF_INET6:
-               if (addr->sa_len != sizeof(struct sockaddr_in6)) {
+               if (addr->sa_len < sizeof(struct sockaddr_in6)) {
                        return FALSE;
                }
                break;
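
The length checks are relaxed from an exact match to a lower bound, so an address carried in a larger buffer (for example a sockaddr_storage) still validates while truncated sockaddrs are rejected. The same shape as a standalone helper (a sketch assuming BSD-style sockaddrs that carry sa_len):

    #include <stdbool.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    /* Reject truncated sockaddrs but tolerate extra trailing bytes. */
    static bool
    sockaddr_len_ok(const struct sockaddr *sa)
    {
            switch (sa->sa_family) {
            case AF_INET:
                    return sa->sa_len >= sizeof(struct sockaddr_in);
            case AF_INET6:
                    return sa->sa_len >= sizeof(struct sockaddr_in6);
            default:
                    return false;
            }
    }
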
@@ -3095,7 +3191,8 @@ flow_divert_data_out(struct socket *so, int flags, mbuf_t data, struct sockaddr
        }
 
        error = flow_divert_check_no_cellular(fd_cb) ||
-           flow_divert_check_no_expensive(fd_cb);
+           flow_divert_check_no_expensive(fd_cb) ||
+           flow_divert_check_no_constrained(fd_cb);
        if (error) {
                goto done;
        }
@@ -3103,6 +3200,21 @@ flow_divert_data_out(struct socket *so, int flags, mbuf_t data, struct sockaddr
        /* Implicit connect */
        if (!(fd_cb->flags & FLOW_DIVERT_CONNECT_STARTED)) {
                FDLOG0(LOG_INFO, fd_cb, "implicit connect");
+
+#if CONTENT_FILTER
+               /*
+                * If the socket is subject to a UDP Content Filter and no remote address is passed in,
+                * retrieve the CFIL saved remote address from the mbuf and use it.
+                */
+               if (to == NULL && so->so_cfil_db) {
+                       struct sockaddr *cfil_faddr = NULL;
+                       struct m_tag *cfil_tag = cfil_udp_get_socket_state(data, NULL, NULL, &cfil_faddr);
+                       if (cfil_tag) {
+                               to = (struct sockaddr *)(void *)cfil_faddr;
+                       }
+                       FDLOG(LOG_INFO, fd_cb, "Using remote address from CFIL saved state: %p", to);
+               }
+#endif
                error = flow_divert_connect_out(so, to, p);
                if (error) {
                        goto done;
@@ -3658,8 +3770,21 @@ flow_divert_kctl_disconnect(kern_ctl_ref kctlref __unused, uint32_t unit, void *
                panic("group with unit %d (%p) != unit info (%p)", unit, group, unitinfo);
        }
 
+       g_flow_divert_groups[unit] = NULL;
+       g_active_group_count--;
+
+       if (g_active_group_count == 0) {
+               FREE(g_flow_divert_groups, M_TEMP);
+               g_flow_divert_groups = NULL;
+       }
+
+       lck_rw_done(&g_flow_divert_group_lck);
+
        if (group != NULL) {
                flow_divert_close_all(group);
+
+               lck_rw_lock_exclusive(&group->lck);
+
                if (group->token_key != NULL) {
                        memset(group->token_key, 0, group->token_key_size);
                        FREE(group->token_key, M_TEMP);
@@ -3674,20 +3799,13 @@ flow_divert_kctl_disconnect(kern_ctl_ref kctlref __unused, uint32_t unit, void *
                memset(&group->signing_id_trie, 0, sizeof(group->signing_id_trie));
                group->signing_id_trie.root = NULL_TRIE_IDX;
 
+               lck_rw_done(&group->lck);
+
                FREE_ZONE(group, sizeof(*group), M_FLOW_DIVERT_GROUP);
-               g_flow_divert_groups[unit] = NULL;
-               g_active_group_count--;
        } else {
                error = EINVAL;
        }
 
-       if (g_active_group_count == 0) {
-               FREE(g_flow_divert_groups, M_TEMP);
-               g_flow_divert_groups = NULL;
-       }
-
-       lck_rw_done(&g_flow_divert_group_lck);
-
        return error;
 }
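
The disconnect path now unlinks the group from the global table and releases g_flow_divert_group_lck before taking the per-group lock for teardown, so the two locks are never held together and other units are not blocked behind the teardown. A compact analog of the ordering (pthreads, names illustrative):

    #include <pthread.h>

    static pthread_rwlock_t registry_lock = PTHREAD_RWLOCK_INITIALIZER;
    static pthread_mutex_t  group_lock = PTHREAD_MUTEX_INITIALIZER;
    static void *registry_slot;     /* stand-in for g_flow_divert_groups[unit] */

    static void
    disconnect_group(void)
    {
            /* 1. Unlink under the global lock, then release it immediately. */
            pthread_rwlock_wrlock(&registry_lock);
            registry_slot = NULL;
            pthread_rwlock_unlock(&registry_lock);

            /* 2. Tear down under the group's own lock only; the global lock
             * is no longer held, so other units proceed unblocked. */
            pthread_mutex_lock(&group_lock);
            /* ... scrub keys, reset the signing-id trie ... */
            pthread_mutex_unlock(&group_lock);
    }
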
 
index 84c39eb663b3328cf46076a4f8bf6d2c4a775dfe..5961653b9ef7064be4cc9176a7761341b37a69a1 100644 (file)
@@ -69,6 +69,7 @@
 #define FLOW_DIVERT_TLV_FLAGS                   29
 #define FLOW_DIVERT_TLV_FLOW_TYPE               30
 #define FLOW_DIVERT_TLV_APP_DATA                31
+#define FLOW_DIVERT_TLV_APP_AUDIT_TOKEN         32
 
 #define FLOW_DIVERT_FLOW_TYPE_TCP               1
 #define FLOW_DIVERT_FLOW_TYPE_UDP               3
index 6f437293563a33b4d3ddf28a169157f8e02209ec..4e7daeaf13ff6268a97c56a3540da28bb92a2fa7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -202,7 +202,7 @@ static struct igmpstat_v3 igmpstat_v3 = {
        .igps_len = sizeof(struct igmpstat_v3),
 };
 static struct igmpstat igmpstat; /* old IGMPv2 stats structure */
-static struct timeval igmp_gsrdelay = {10, 0};
+static struct timeval igmp_gsrdelay = {.tv_sec = 10, .tv_usec = 0};
 
 static int igmp_recvifkludge = 1;
 static int igmp_sendra = 1;
@@ -847,7 +847,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
        struct igmp_ifinfo      *igi;
        struct in_multi         *inm;
        struct in_multistep     step;
-       struct igmp_tparams     itp = { 0, 0, 0, 0 };
+       struct igmp_tparams     itp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 };
 
        IGMP_LOCK_ASSERT_NOTHELD();
 
@@ -937,7 +937,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
        struct in_multi         *inm;
        int                      is_general_query;
        uint16_t                 timer;
-       struct igmp_tparams      itp = { 0, 0, 0, 0 };
+       struct igmp_tparams      itp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 };
 
        IGMP_LOCK_ASSERT_NOTHELD();
 
@@ -1104,7 +1104,7 @@ igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
        uint32_t                 maxresp, nsrc, qqi;
        uint16_t                 timer;
        uint8_t                  qrv;
-       struct igmp_tparams      itp = { 0, 0, 0, 0 };
+       struct igmp_tparams      itp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 };
 
        IGMP_LOCK_ASSERT_NOTHELD();
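
The initializer changes in this file swap positional aggregate initializers for C99 designated initializers; the result is identical, but no longer depends on member order and documents itself. For example:

    #include <sys/time.h>

    /* Positional: meaning depends on the member order of struct timeval. */
    struct timeval gsrdelay_positional = { 10, 0 };

    /* Designated (as adopted above): explicit and order-independent. */
    struct timeval gsrdelay_designated = { .tv_sec = 10, .tv_usec = 0 };
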
 
index 5f464c32581ca93a7415dbfde0f22901051a1ecf..f51b22b4e984cb4f19c5db0ddef0b021074619b9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -77,6 +77,7 @@
 
 #include <kern/zalloc.h>
 #include <pexpert/pexpert.h>
+#include <os/log.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
@@ -778,10 +779,6 @@ inctl_ifaddr(struct ifnet *ifp, struct in_ifaddr *ia, u_long cmd,
                        error = 0;
                }
                if (error != 0) {
-                       /* Reset the detaching flag */
-                       IFA_LOCK(&ia->ia_ifa);
-                       ia->ia_ifa.ifa_debug &= ~IFD_DETACHING;
-                       IFA_UNLOCK(&ia->ia_ifa);
                        break;
                }
 
@@ -1346,21 +1343,6 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
                        if (iap->ia_ifp == ifp &&
                            iap->ia_addr.sin_addr.s_addr ==
                            sa->sin_addr.s_addr) {
-                               /*
-                                * Avoid the race condition seen when two
-                                * threads process SIOCDIFADDR command
-                                * at the same time (radar 28942007)
-                                */
-                               if (cmd == SIOCDIFADDR) {
-                                       if (iap->ia_ifa.ifa_debug &
-                                           IFD_DETACHING) {
-                                               IFA_UNLOCK(&iap->ia_ifa);
-                                               continue;
-                                       } else {
-                                               iap->ia_ifa.ifa_debug |=
-                                                   IFD_DETACHING;
-                                       }
-                               }
                                ia = iap;
                                IFA_ADDREF_LOCKED(&iap->ia_ifa);
                                IFA_UNLOCK(&iap->ia_ifa);
@@ -1377,15 +1359,12 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
                                IFA_LOCK(&iap->ia_ifa);
                                if (iap->ia_addr.sin_family == AF_INET) {
                                        ia = iap;
+                                       IFA_ADDREF_LOCKED(&iap->ia_ifa);
                                        IFA_UNLOCK(&iap->ia_ifa);
                                        break;
                                }
                                IFA_UNLOCK(&iap->ia_ifa);
                        }
-                       /* take a reference on ia before releasing lock */
-                       if (ia != NULL) {
-                               IFA_ADDREF(&ia->ia_ifa);
-                       }
                        ifnet_lock_done(ifp);
                }
        }
@@ -1444,10 +1423,40 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
                        error = EINVAL;
                        goto done;
                }
-               if (cmd == SIOCDIFADDR && ia == NULL) {
-                       error = EADDRNOTAVAIL;
-                       goto done;
+               if (cmd == SIOCDIFADDR) {
+                       if (ia == NULL) {
+                               error = EADDRNOTAVAIL;
+                               goto done;
+                       }
+
+                       IFA_LOCK(&ia->ia_ifa);
+                       /*
+                        * Avoid the race condition seen when two
+                        * threads process the SIOCDIFADDR command
+                        * at the same time.
+                        */
+                       while (ia->ia_ifa.ifa_debug & IFD_DETACHING) {
+                               os_log(OS_LOG_DEFAULT,
+                                   "Another thread is already attempting to "
+                                   "delete IPv4 address: %s on interface %s. "
+                                   "Go to sleep and check again after the operation is done",
+                                   inet_ntoa(sa->sin_addr), ia->ia_ifp->if_xname);
+                               ia->ia_ifa.ifa_del_waiters++;
+                               (void) msleep(ia->ia_ifa.ifa_del_wc, &ia->ia_ifa.ifa_lock, (PZERO - 1),
+                                   __func__, NULL);
+                               IFA_LOCK_ASSERT_HELD(&ia->ia_ifa);
+                       }
+
+                       if ((ia->ia_ifa.ifa_debug & IFD_ATTACHED) == 0) {
+                               error = EADDRNOTAVAIL;
+                               IFA_UNLOCK(&ia->ia_ifa);
+                               goto done;
+                       }
+
+                       ia->ia_ifa.ifa_debug |= IFD_DETACHING;
+                       IFA_UNLOCK(&ia->ia_ifa);
                }
+
        /* FALLTHROUGH */
        case SIOCSIFADDR:               /* struct ifreq */
        case SIOCSIFDSTADDR:            /* struct ifreq */
@@ -1543,8 +1552,18 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
                error = EOPNOTSUPP;
                break;
        }
+
 done:
        if (ia != NULL) {
+               if (cmd == SIOCDIFADDR) {
+                       IFA_LOCK(&ia->ia_ifa);
+                       ia->ia_ifa.ifa_debug &= ~IFD_DETACHING;
+                       if (ia->ia_ifa.ifa_del_waiters > 0) {
+                               ia->ia_ifa.ifa_del_waiters = 0;
+                               wakeup(ia->ia_ifa.ifa_del_wc);
+                       }
+                       IFA_UNLOCK(&ia->ia_ifa);
+               }
                IFA_REMREF(&ia->ia_ifa);
        }
        if (so_unlocked) {
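
The SIOCDIFADDR race is now resolved by serializing deleters: a thread that finds IFD_DETACHING set sleeps on the ifa_del_wc wait channel and re-checks IFD_ATTACHED on wakeup, while the done: path clears the flag and wakes the waiters. A userspace analog using a condition variable (names illustrative):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t ifa_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  ifa_del_cv = PTHREAD_COND_INITIALIZER;
    static bool ifa_detaching;              /* analog of IFD_DETACHING */
    static bool ifa_attached = true;        /* analog of IFD_ATTACHED */

    /* Returns false if another thread already deleted the address. */
    static bool
    delete_address(void)
    {
            pthread_mutex_lock(&ifa_lock);
            while (ifa_detaching)           /* another deleter is running */
                    pthread_cond_wait(&ifa_del_cv, &ifa_lock);
            if (!ifa_attached) {            /* it finished before us */
                    pthread_mutex_unlock(&ifa_lock);
                    return false;           /* EADDRNOTAVAIL analog */
            }
            ifa_detaching = true;           /* claim the deletion */
            pthread_mutex_unlock(&ifa_lock);

            /* ... perform the teardown without the lock held ... */

            pthread_mutex_lock(&ifa_lock);
            ifa_attached = false;
            ifa_detaching = false;
            pthread_cond_broadcast(&ifa_del_cv);    /* wake the waiters */
            pthread_mutex_unlock(&ifa_lock);
            return true;
    }
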
@@ -2036,6 +2055,8 @@ in_ifaddr_alloc(int how)
                bzero(inifa, inifa_size);
                inifa->ia_ifa.ifa_free = in_ifaddr_free;
                inifa->ia_ifa.ifa_debug |= IFD_ALLOC;
+               inifa->ia_ifa.ifa_del_wc = &inifa->ia_ifa.ifa_debug;
+               inifa->ia_ifa.ifa_del_waiters = 0;
                ifa_lock_init(&inifa->ia_ifa);
                if (inifa_debug != 0) {
                        struct in_ifaddr_dbg *inifa_dbg =
index 6be1d82726975310d6339004b55411258c71e52e..66674c208aead5f92d4bdf7a0bb05c4d962a2e01 100644 (file)
 #define IPPROTO_ENCAP           98              /* encapsulation header */
 #define IPPROTO_APES            99              /* any private encr. scheme */
 #define IPPROTO_GMTP            100             /* GMTP*/
-/* 101-254: Partly Unassigned */
+/* 101-252: Partly Unassigned */
 #define IPPROTO_PIM             103             /* Protocol Independent Mcast */
 #define IPPROTO_IPCOMP          108             /* payload compression (IPComp) */
 #define IPPROTO_PGM             113             /* PGM */
 #define IPPROTO_SCTP            132             /* SCTP */
-/* 255: Reserved */
+/* 253-254: Experimentation and testing; 255: Reserved (RFC3692) */
 /* BSD Private, local use, namespace incursion */
+#ifdef PRIVATE
+#define IPPROTO_QUIC            253             /* QUIC protocol (Over UDP) */
+#endif /* PRIVATE */
 #define IPPROTO_DIVERT          254             /* divert pseudo-protocol */
 #endif  /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */
 #define IPPROTO_RAW             255             /* raw IP packet */
@@ -814,7 +817,6 @@ union sockaddr_in_4_6 {
        struct sockaddr_in      sin;
        struct sockaddr_in6     sin6;
 };
-
 #define CLAT46_HDR_EXPANSION_OVERHD     (sizeof(struct ip6_hdr) - sizeof(struct ip))
 
 /*
index 1aec999a6beb509a30cb6ddf329d99ab7ca3a169..d13e22e9aa01cf4d955e6096110bf364bb76563b 100644 (file)
@@ -247,6 +247,11 @@ static int arp_verbose;
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, verbose,
     CTLFLAG_RW | CTLFLAG_LOCKED, &arp_verbose, 0, "");
 
+static uint32_t arp_maxhold_total = 1024; /* max total packets in the holdq */
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold_total,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &arp_maxhold_total, 0, "");
+
+
 /*
  * Generally protected by rnh_lock; use atomic operations on fields
  * that are also modified outside of that lock (if needed).
@@ -324,15 +329,29 @@ arp_llinfo_free(void *arg)
        zfree(llinfo_arp_zone, la);
 }
 
-static void
+static bool
 arp_llinfo_addq(struct llinfo_arp *la, struct mbuf *m)
 {
+       classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
+
+       if (arpstat.held >= arp_maxhold_total) {
+               if (arp_verbose) {
+                       log(LOG_DEBUG,
+                           "%s: dropping packet due to maxhold_total\n",
+                           __func__);
+               }
+               atomic_add_32(&arpstat.dropped, 1);
+               return false;
+       }
+
        if (qlen(&la->la_holdq) >= qlimit(&la->la_holdq)) {
                struct mbuf *_m;
                /* prune less than CTL, else take what's at the head */
-               _m = _getq_scidx_lt(&la->la_holdq, SCIDX_CTL);
+               _getq_scidx_lt(&la->la_holdq, &pkt, SCIDX_CTL);
+               _m = pkt.cp_mbuf;
                if (_m == NULL) {
-                       _m = _getq(&la->la_holdq);
+                       _getq(&la->la_holdq, &pkt);
+                       _m = pkt.cp_mbuf;
                }
                VERIFY(_m != NULL);
                if (arp_verbose) {
@@ -343,13 +362,16 @@ arp_llinfo_addq(struct llinfo_arp *la, struct mbuf *m)
                atomic_add_32(&arpstat.dropped, 1);
                atomic_add_32(&arpstat.held, -1);
        }
-       _addq(&la->la_holdq, m);
+       CLASSQ_PKT_INIT_MBUF(&pkt, m);
+       _addq(&la->la_holdq, &pkt);
        atomic_add_32(&arpstat.held, 1);
        if (arp_verbose) {
                log(LOG_DEBUG, "%s: enqueued packet (scidx %u), qlen now %u\n",
                    __func__, MBUF_SCIDX(mbuf_get_service_class(m)),
                    qlen(&la->la_holdq));
        }
+
+       return true;
 }
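
arp_llinfo_addq() now enforces a global cap (arp_maxhold_total) and reports whether it consumed the mbuf, so arp_lookup_ip() can free a packet that was never enqueued. The ownership contract in miniature (illustrative, not kernel code):

    #include <stdbool.h>
    #include <stdlib.h>

    struct pkt { struct pkt *next; };

    static struct pkt *holdq_head;
    static unsigned int holdq_len;
    static const unsigned int holdq_total_cap = 1024;  /* arp_maxhold_total analog */

    /* Returns true if the queue took ownership of the packet; false if the
     * cap was hit, in which case the caller still owns (and must free) it. */
    static bool
    holdq_add(struct pkt *p)
    {
            if (holdq_len >= holdq_total_cap)
                    return false;
            p->next = holdq_head;
            holdq_head = p;
            holdq_len++;
            return true;
    }

    /* Caller-side pattern matching arp_lookup_ip() above. */
    static void
    hold_or_drop(struct pkt *p)
    {
            if (!holdq_add(p))
                    free(p);                /* analog of mbuf_free(packet) */
    }
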
 
 static uint32_t
@@ -1250,6 +1272,7 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest,
        uint32_t rtflags;
        struct sockaddr_dl sdl;
        boolean_t send_probe_notif = FALSE;
+       boolean_t enqueued = FALSE;
 
        if (ifp == NULL || net_dest == NULL) {
                return EINVAL;
@@ -1455,7 +1478,7 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest,
         * we still hold the route's rt_lock.
         */
        if (packet != NULL) {
-               arp_llinfo_addq(llinfo, packet);
+               enqueued = arp_llinfo_addq(llinfo, packet);
        } else {
                llinfo->la_prbreq_cnt++;
        }
@@ -1545,14 +1568,15 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest,
                                 * from the time of _addq() above, this packet
                                 * must be at the tail.
                                 */
-                               if (packet != NULL) {
-                                       struct mbuf *_m =
-                                           _getq_tail(&llinfo->la_holdq);
+                               if (packet != NULL && enqueued) {
+                                       classq_pkt_t pkt =
+                                           CLASSQ_PKT_INITIALIZER(pkt);
+
+                                       _getq_tail(&llinfo->la_holdq, &pkt);
                                        atomic_add_32(&arpstat.held, -1);
-                                       VERIFY(_m == packet);
+                                       VERIFY(pkt.cp_mbuf == packet);
                                }
                                result = EHOSTUNREACH;
-
                                /*
                                 * Enqueue work item to invoke callback for this route entry
                                 */
@@ -1563,8 +1587,12 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest,
                }
        }
 
-       /* The packet is now held inside la_holdq */
+       /* The packet is now held inside la_holdq or dropped */
        result = EJUSTRETURN;
+       if (packet != NULL && !enqueued) {
+               mbuf_free(packet);
+               packet = NULL;
+       }
 
 release:
        if (result == EHOSTUNREACH) {
@@ -1659,14 +1687,11 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop,
 
        /*
         * Determine if this ARP is for us
-        * For a bridge, we want to check the address irrespective
-        * of the receive interface.
         */
        lck_rw_lock_shared(in_ifaddr_rwlock);
        TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), ia_hash) {
                IFA_LOCK_SPIN(&ia->ia_ifa);
-               if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
-                   (ia->ia_ifp == ifp)) &&
+               if (ia->ia_ifp == ifp &&
                    ia->ia_addr.sin_addr.s_addr == target_ip->sin_addr.s_addr) {
                        best_ia = ia;
                        best_ia_sin = best_ia->ia_addr;
@@ -1680,8 +1705,7 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop,
 
        TAILQ_FOREACH(ia, INADDR_HASH(sender_ip->sin_addr.s_addr), ia_hash) {
                IFA_LOCK_SPIN(&ia->ia_ifa);
-               if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
-                   (ia->ia_ifp == ifp)) &&
+               if (ia->ia_ifp == ifp &&
                    ia->ia_addr.sin_addr.s_addr == sender_ip->sin_addr.s_addr) {
                        best_ia = ia;
                        best_ia_sin = best_ia->ia_addr;
@@ -2132,8 +2156,11 @@ match:
 
        if (!qempty(&llinfo->la_holdq)) {
                uint32_t held;
-               struct mbuf *m0 =
-                   _getq_all(&llinfo->la_holdq, NULL, &held, NULL);
+               struct mbuf *m0;
+               classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
+
+               _getq_all(&llinfo->la_holdq, &pkt, NULL, &held, NULL);
+               m0 = pkt.cp_mbuf;
                if (arp_verbose) {
                        log(LOG_DEBUG, "%s: sending %u held packets\n",
                            __func__, held);
index 03532eaa3314b06a22e03219c594a9d2f198a6b1..b4eed3bd8b87fc006b275af2fd955d73cf47998d 100644 (file)
@@ -2772,11 +2772,6 @@ inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
                                return EADDRNOTAVAIL;
                        }
                }
-               /* XXX remove? */
-#ifdef IGMP_DEBUG0
-               IGMP_PRINTF(("%s: ifp = 0x%llx, addr = %s\n", __func__,
-                   (uint64_t)VM_KERNEL_ADDRPERM(ifp), inet_ntoa(addr)));
-#endif
        }
 
        /* Reject interfaces which do not support multicast. */
index 6f275437348d9f065db96bef27a47e007e76625f..d1627fb3948f864c91a493dcc9e701b0d27c9626 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <net/flowadv.h>
 #include <net/nat464_utils.h>
 #include <net/ntstat.h>
+#include <net/restricted_in_port.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
+
 #if INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <sys/ubc.h>
 #include <sys/vnode.h>
 
+#include <os/log.h>
+
+extern const char *proc_name_address(struct proc *);
+
 static lck_grp_t        *inpcb_lock_grp;
 static lck_attr_t       *inpcb_lock_attr;
 static lck_grp_attr_t   *inpcb_lock_grp_attr;
@@ -173,6 +179,20 @@ sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
 {
 #pragma unused(arg1, arg2)
        int error;
+#if (DEBUG | DEVELOPMENT)
+       int old_value = *(int *)oidp->oid_arg1;
+       /*
+        * For unit testing allow a non-superuser process with the
+        * proper entitlement to modify the variables
+        */
+       if (req->newptr) {
+               if (proc_suser(current_proc()) != 0 &&
+                   (error = priv_check_cred(kauth_cred_get(),
+                   PRIV_NETINET_RESERVEDPORT, 0))) {
+                       return EPERM;
+               }
+       }
+#endif /* (DEBUG | DEVELOPMENT) */
 
        error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
        if (!error) {
@@ -183,6 +203,14 @@ sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
                RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
                RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
        }
+
+#if (DEBUG | DEVELOPMENT)
+       os_log(OS_LOG_DEFAULT,
+           "%s:%u sysctl net.inet.ip.portrange: %d -> %d",
+           proc_best_name(current_proc()), proc_selfpid(),
+           old_value, *(int *)oidp->oid_arg1);
+#endif /* (DEBUG | DEVELOPMENT) */
+
        return error;
 }
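
These handlers back the net.inet.ip.portrange.* sysctls, which are reachable from userspace with the standard sysctl API; a small sketch (the write path goes through the range check above):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            int first = 0;
            size_t len = sizeof(first);

            if (sysctlbyname("net.inet.ip.portrange.first", &first, &len,
                NULL, 0) == 0)
                    printf("ephemeral range starts at %d\n", first);

            /* A write is vetted by sysctl_net_ipport_check(): the value is
             * range-checked, and on DEBUG/DEVELOPMENT kernels an entitled
             * non-superuser may also set it. */
            int newfirst = 49152;
            if (sysctlbyname("net.inet.ip.portrange.first", NULL, NULL,
                &newfirst, sizeof(newfirst)) != 0)
                    perror("sysctlbyname");
            return 0;
    }
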
 
@@ -191,23 +219,29 @@ sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IP Ports");
 
+#if (DEBUG | DEVELOPMENT)
+#define CTLFLAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY)
+#else
+#define CTLFLAGS_IP_PORTRANGE (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED)
+#endif /* (DEBUG | DEVELOPMENT) */
+
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    CTLFLAGS_IP_PORTRANGE,
     &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    CTLFLAGS_IP_PORTRANGE,
     &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    CTLFLAGS_IP_PORTRANGE,
     &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    CTLFLAGS_IP_PORTRANGE,
     &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    CTLFLAGS_IP_PORTRANGE,
     &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    CTLFLAGS_IP_PORTRANGE,
     &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
 
 static uint32_t apn_fallbk_debug = 0;
@@ -652,7 +686,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, struct proc *p)
  * in_pcblookup_local_and_cleanup does everything
  * in_pcblookup_local does but it checks for a socket
  * that's going away. Since we know that the lock is
- * held read+write when this funciton is called, we
+ * held read+write when this function is called, we
  * can safely dispose of this socket like the slow
  * timer would usually do and return NULL. This is
  * great for bind.
@@ -816,13 +850,16 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                                IFA_REMREF(ifa);
                        }
                }
+
+
                if (lport != 0) {
                        struct inpcb *t;
                        uid_t u;
 
 #if !CONFIG_EMBEDDED
                        if (ntohs(lport) < IPPORT_RESERVED &&
-                           SIN(nam)->sin_addr.s_addr != 0) {
+                           SIN(nam)->sin_addr.s_addr != 0 &&
+                           !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
                                cred = kauth_cred_proc_ref(p);
                                error = priv_check_cred(cred,
                                    PRIV_NETINET_RESERVEDPORT, 0);
@@ -834,6 +871,16 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                                }
                        }
 #endif /* !CONFIG_EMBEDDED */
+                       /*
+                        * Check whether the process is allowed to bind to a restricted port
+                        */
+                       if (!current_task_can_use_restricted_in_port(lport,
+                           so->so_proto->pr_protocol, PORT_FLAGS_BSD)) {
+                               lck_rw_done(pcbinfo->ipi_lock);
+                               socket_lock(so, 0);
+                               return EADDRINUSE;
+                       }
+
                        if (!IN_MULTICAST(ntohl(SIN(nam)->sin_addr.s_addr)) &&
                            (u = kauth_cred_getuid(so->so_cred)) != 0 &&
                            (t = in_pcblookup_local_and_cleanup(
@@ -845,7 +892,10 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                            (u != kauth_cred_getuid(t->inp_socket->so_cred)) &&
                            !(t->inp_socket->so_flags & SOF_REUSESHAREUID) &&
                            (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
-                           t->inp_laddr.s_addr != INADDR_ANY)) {
+                           t->inp_laddr.s_addr != INADDR_ANY) &&
+                           (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                           !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                           uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
                                if ((t->inp_socket->so_flags &
                                    SOF_NOTIFYCONFLICT) &&
                                    !(so->so_flags & SOF_NOTIFYCONFLICT)) {
@@ -864,7 +914,10 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                        t = in_pcblookup_local_and_cleanup(pcbinfo,
                            SIN(nam)->sin_addr, lport, wild);
                        if (t != NULL &&
-                           (reuseport & t->inp_socket->so_options) == 0) {
+                           (reuseport & t->inp_socket->so_options) == 0 &&
+                           (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                           !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                           uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
 #if INET6
                                if (SIN(nam)->sin_addr.s_addr != INADDR_ANY ||
                                    t->inp_laddr.s_addr != INADDR_ANY ||
@@ -895,6 +948,13 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                int count;
                bool found;
 
+               /*
+                * Force wild to 1 for an implicit bind (mainly used by connect):
+                * for an implicit bind (lport == 0) we always pick an unused
+                * port, so SO_REUSEADDR/SO_REUSEPORT do not apply
+                */
+               wild = 1;
+
                randomport = (so->so_flags & SOF_BINDRANDOMPORT) ||
                    (so->so_type == SOCK_STREAM ? tcp_use_randomport :
                    udp_use_randomport);
@@ -967,6 +1027,14 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                                }
                                lport = htons(*lastport);
 
+                               /*
+                                * Skip if this is a restricted port, as we do
+                                * not want to hand out restricted ports as
+                                * ephemeral ports
+                                */
+                               if (IS_RESTRICTED_IN_PORT(lport)) {
+                                       continue;
+                               }
+
                                found = in_pcblookup_local_and_cleanup(pcbinfo,
                                    lookup_addr, lport, wild) == NULL;
                        } while (!found);
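
The shape of the ephemeral-port search, with the new restricted-port skip folded in, reduces to the following (the predicates are illustrative stand-ins for IS_RESTRICTED_IN_PORT and the pcb lookup):

    #include <stdbool.h>
    #include <stdint.h>

    static bool port_is_restricted(uint16_t p) { return p == 5060; }
    static bool port_is_in_use(uint16_t p) { (void)p; return false; }

    /* Walk the ephemeral range, never handing out a restricted port. */
    static uint16_t
    pick_ephemeral(uint16_t first, uint16_t last)
    {
            for (uint32_t p = first; p <= last; p++) {
                    if (port_is_restricted((uint16_t)p))
                            continue;       /* analog of the new skip */
                    if (!port_is_in_use((uint16_t)p))
                            return (uint16_t)p;
            }
            return 0;       /* no port available: EADDRNOTAVAIL analog */
    }
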
@@ -999,6 +1067,14 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                                }
                                lport = htons(*lastport);
 
+                               /*
+                                * Skip if this is a restricted port, as we do
+                                * not want to hand out restricted ports as
+                                * ephemeral ports
+                                */
+                               if (IS_RESTRICTED_IN_PORT(lport)) {
+                                       continue;
+                               }
+
                                found = in_pcblookup_local_and_cleanup(pcbinfo,
                                    lookup_addr, lport, wild) == NULL;
                        } while (!found);
@@ -1159,7 +1235,7 @@ apn_fallback_required(proc_t proc, struct socket *so, struct sockaddr_in *p_dstv
 
                bzero(&sb, sizeof(struct stat64));
                context = vfs_context_create(NULL);
-               vn_stat_error = vn_stat(proc->p_textvp, &sb, NULL, 1, context);
+               vn_stat_error = vn_stat(proc->p_textvp, &sb, NULL, 1, 0, context);
                (void)vfs_context_rele(context);
 
                if (vn_stat_error != 0 ||
@@ -2172,6 +2248,12 @@ in_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in_addr faddr,
                        continue;
                }
 
+#if NECP
+               if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
+                       continue;
+               }
+#endif /* NECP */
+
                if (inp->inp_faddr.s_addr == faddr.s_addr &&
                    inp->inp_laddr.s_addr == laddr.s_addr &&
                    inp->inp_fport == fport &&
@@ -2210,6 +2292,12 @@ in_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in_addr faddr,
                        continue;
                }
 
+#if NECP
+               if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
+                       continue;
+               }
+#endif /* NECP */
+
                if (inp->inp_faddr.s_addr == INADDR_ANY &&
                    inp->inp_lport == lport) {
                        if (inp->inp_laddr.s_addr == laddr.s_addr) {
@@ -2295,6 +2383,12 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
                        continue;
                }
 
+#if NECP
+               if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
+                       continue;
+               }
+#endif /* NECP */
+
                if (inp->inp_faddr.s_addr == faddr.s_addr &&
                    inp->inp_laddr.s_addr == laddr.s_addr &&
                    inp->inp_fport == fport &&
@@ -2334,6 +2428,12 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
                        continue;
                }
 
+#if NECP
+               if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
+                       continue;
+               }
+#endif /* NECP */
+
                if (inp->inp_faddr.s_addr == INADDR_ANY &&
                    inp->inp_lport == lport) {
                        if (inp->inp_laddr.s_addr == laddr.s_addr) {
@@ -2897,6 +2997,15 @@ inp_set_noexpensive(struct inpcb *inp)
        ROUTE_RELEASE(&inp->inp_route);
 }
 
+void
+inp_set_noconstrained(struct inpcb *inp)
+{
+       inp->inp_flags2 |= INP2_NO_IFF_CONSTRAINED;
+
+       /* Blow away any cached route in the PCB */
+       ROUTE_RELEASE(&inp->inp_route);
+}
+
 void
 inp_set_awdl_unrestricted(struct inpcb *inp)
 {
@@ -3233,6 +3342,8 @@ inp_get_soprocinfo(struct inpcb *inp, struct so_procinfo *soprocinfo)
        struct socket *so = inp->inp_socket;
 
        soprocinfo->spi_pid = so->last_pid;
+       strlcpy(&soprocinfo->spi_proc_name[0], &inp->inp_last_proc_name[0],
+           sizeof(soprocinfo->spi_proc_name));
        if (so->last_pid != 0) {
                uuid_copy(soprocinfo->spi_uuid, so->last_uuid);
        }
@@ -3247,6 +3358,8 @@ inp_get_soprocinfo(struct inpcb *inp, struct so_procinfo *soprocinfo)
                soprocinfo->spi_delegated = 0;
                soprocinfo->spi_epid = so->last_pid;
        }
+       strlcpy(&soprocinfo->spi_e_proc_name[0], &inp->inp_e_proc_name[0],
+           sizeof(soprocinfo->spi_e_proc_name));
 }
 
 int
@@ -3479,6 +3592,10 @@ _inp_restricted_recv(struct inpcb *inp, struct ifnet *ifp)
                return TRUE;
        }
 
+       if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) {
+               return TRUE;
+       }
+
        if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) {
                return TRUE;
        }
@@ -3545,6 +3662,10 @@ _inp_restricted_send(struct inpcb *inp, struct ifnet *ifp)
                return TRUE;
        }
 
+       if (IFNET_IS_CONSTRAINED(ifp) && INP_NO_CONSTRAINED(inp)) {
+               return TRUE;
+       }
+
        if (IFNET_IS_AWDL_RESTRICTED(ifp) && !INP_AWDL_UNRESTRICTED(inp)) {
                return TRUE;
        }
@@ -3576,8 +3697,7 @@ inp_count_sndbytes(struct inpcb *inp, u_int32_t th_ack)
        struct ifnet *ifp = inp->inp_last_outifp;
        struct socket *so = inp->inp_socket;
        if (ifp != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
-           (ifp->if_type == IFT_CELLULAR ||
-           ifp->if_subfamily == IFNET_SUBFAMILY_WIFI)) {
+           (ifp->if_type == IFT_CELLULAR || IFNET_IS_WIFI(ifp))) {
                int32_t unsent;
 
                so->so_snd.sb_flags |= SB_SNDBYTE_CNT;
@@ -3636,13 +3756,13 @@ inp_incr_sndbytes_unsent(struct socket *so, int32_t len)
 inline void
 inp_decr_sndbytes_unsent(struct socket *so, int32_t len)
 {
-       struct inpcb *inp = (struct inpcb *)so->so_pcb;
-       struct ifnet *ifp = inp->inp_last_outifp;
-
        if (so == NULL || !(so->so_snd.sb_flags & SB_SNDBYTE_CNT)) {
                return;
        }
 
+       struct inpcb *inp = (struct inpcb *)so->so_pcb;
+       struct ifnet *ifp = inp->inp_last_outifp;
+
        if (ifp != NULL) {
                if (ifp->if_sndbyte_unsent >= len) {
                        OSAddAtomic64(-len, &ifp->if_sndbyte_unsent);
@@ -3677,3 +3797,40 @@ inp_get_activity_bitmap(struct inpcb *inp, activity_bitmap_t *ab)
 {
        bcopy(&inp->inp_nw_activity, ab, sizeof(*ab));
 }
+
+void
+inp_update_last_owner(struct socket *so, struct proc *p, struct proc *ep)
+{
+       struct inpcb *inp = (struct inpcb *)so->so_pcb;
+
+       if (inp == NULL) {
+               return;
+       }
+
+       if (p != NULL) {
+               strlcpy(&inp->inp_last_proc_name[0], proc_name_address(p), sizeof(inp->inp_last_proc_name));
+       }
+       if (so->so_flags & SOF_DELEGATED) {
+               if (ep != NULL) {
+                       strlcpy(&inp->inp_e_proc_name[0], proc_name_address(ep), sizeof(inp->inp_e_proc_name));
+               } else {
+                       inp->inp_e_proc_name[0] = 0;
+               }
+       } else {
+               inp->inp_e_proc_name[0] = 0;
+       }
+}
+
+void
+inp_copy_last_owner(struct socket *so, struct socket *head)
+{
+       struct inpcb *inp = (struct inpcb *)so->so_pcb;
+       struct inpcb *head_inp = (struct inpcb *)head->so_pcb;
+
+       if (inp == NULL || head_inp == NULL) {
+               return;
+       }
+
+       strlcpy(&inp->inp_last_proc_name[0], &head_inp->inp_last_proc_name[0], sizeof(inp->inp_last_proc_name));
+       strlcpy(&inp->inp_e_proc_name[0], &head_inp->inp_e_proc_name[0], sizeof(inp->inp_e_proc_name));
+}
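
inp_update_last_owner() records the owning process name with strlcpy(), which bounds the copy to the destination size and always NUL-terminates; truncation is silent. A small demonstration (BSD-derived libc assumed for strlcpy):

    #include <string.h>
    #include <stdio.h>
    #include <sys/param.h>  /* MAXCOMLEN */

    int
    main(void)
    {
            char name[MAXCOMLEN + 1];

            /* The copy is bounded and NUL-terminated; anything past
             * MAXCOMLEN characters is silently dropped. */
            strlcpy(name, "a-rather-long-process-name", sizeof(name));
            printf("%s\n", name);
            return 0;
    }
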
index 90e0e07698e98ec2261d0123d21ce4b58518410e..a5ec42ab233526acff140e2b515a6e6bc1c1bd56 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -160,6 +160,7 @@ struct inpcb {
        u_int32_t inp_flow;             /* IPv6 flow information */
 
        u_char  inp_sndinprog_cnt;      /* outstanding send operations */
+       uint32_t inp_sndingprog_waiters; /* waiters for outstanding send */
        u_char  inp_vflag;              /* INP_IPV4 or INP_IPV6 */
 
        u_char inp_ip_ttl;              /* time to live proto */
@@ -212,7 +213,7 @@ struct inpcb {
        struct label *inp_label;        /* MAC label */
 #endif
 #if IPSEC
-       struct inpcbpolicy *inp_sp;     /* for IPSec */
+       struct inpcbpolicy *inp_sp;     /* for IPsec */
 #endif /* IPSEC */
 #if NECP
        struct {
@@ -238,6 +239,9 @@ struct inpcb {
        u_int8_t inp_Wstat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)];
        activity_bitmap_t inp_nw_activity;
        u_int64_t inp_start_timestamp;
+
+       char inp_last_proc_name[MAXCOMLEN + 1];
+       char inp_e_proc_name[MAXCOMLEN + 1];
 };
 
 #define INP_ADD_STAT(_inp, _cnt_cellular, _cnt_wifi, _cnt_wired, _a, _n) \
@@ -624,6 +628,8 @@ struct inpcbinfo {
        ((_inp)->inp_flags & INP_NO_IFT_CELLULAR)
 #define INP_NO_EXPENSIVE(_inp) \
        ((_inp)->inp_flags2 & INP2_NO_IFF_EXPENSIVE)
+#define INP_NO_CONSTRAINED(_inp) \
+       ((_inp)->inp_flags2 & INP2_NO_IFF_CONSTRAINED)
 #define INP_AWDL_UNRESTRICTED(_inp) \
        ((_inp)->inp_flags2 & INP2_AWDL_UNRESTRICTED)
 #define INP_INTCOPROC_ALLOWED(_inp) \
@@ -709,6 +715,8 @@ struct inpcbinfo {
 #define INP2_INTCOPROC_ALLOWED  0x00000080 /* Allow communication via internal co-processor interfaces */
 #define INP2_CONNECT_IN_PROGRESS        0x00000100 /* A connect call is in progress, so binds are intermediate steps */
 #define INP2_CLAT46_FLOW        0x00000200 /* The flow is going to use CLAT46 path */
+#define INP2_EXTERNAL_PORT      0x00000400 /* The port is registered externally, for NECP listeners */
+#define INP2_NO_IFF_CONSTRAINED 0x00000800 /* do not use constrained interface */
 
 /*
  * Flags passed to in_pcblookup*() functions.
@@ -807,6 +815,7 @@ extern int inp_bindif(struct inpcb *, unsigned int, struct ifnet **);
 extern void inp_set_nocellular(struct inpcb *);
 extern void inp_clear_nocellular(struct inpcb *);
 extern void inp_set_noexpensive(struct inpcb *);
+extern void inp_set_noconstrained(struct inpcb *);
 extern void inp_set_awdl_unrestricted(struct inpcb *);
 extern boolean_t inp_get_awdl_unrestricted(struct inpcb *);
 extern void inp_clear_awdl_unrestricted(struct inpcb *);
@@ -838,6 +847,8 @@ extern int32_t inp_get_sndbytes_allunsent(struct socket *, u_int32_t);
 extern void inp_decr_sndbytes_allunsent(struct socket *, u_int32_t);
 extern void inp_set_activity_bitmap(struct inpcb *inp);
 extern void inp_get_activity_bitmap(struct inpcb *inp, activity_bitmap_t *b);
+extern void inp_update_last_owner(struct socket *so, struct proc *p, struct proc *ep);
+extern void inp_copy_last_owner(struct socket *so, struct socket *head);
 #endif /* BSD_KERNEL_PRIVATE */
 #ifdef KERNEL_PRIVATE
 /* exported for PPP */
index 73b55db69036e1f25976770e9c54a328c49ff81d..dcd59d9c3300467d0830dd7bf0f13d1d5d1ba95e 100644 (file)
@@ -112,50 +112,54 @@ sotoxsocket_n(struct socket *so, struct xsocket_n *xso)
        xso->xso_len = sizeof(struct xsocket_n);
        xso->xso_kind = XSO_SOCKET;
 
-       if (so != NULL) {
-               xso->xso_so = (uint64_t)VM_KERNEL_ADDRPERM(so);
-               xso->so_type = so->so_type;
-               xso->so_options = so->so_options;
-               xso->so_linger = so->so_linger;
-               xso->so_state = so->so_state;
-               xso->so_pcb = (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb);
-               if (so->so_proto) {
-                       xso->xso_protocol = SOCK_PROTO(so);
-                       xso->xso_family = SOCK_DOM(so);
-               } else {
-                       xso->xso_protocol = xso->xso_family = 0;
-               }
-               xso->so_qlen = so->so_qlen;
-               xso->so_incqlen = so->so_incqlen;
-               xso->so_qlimit = so->so_qlimit;
-               xso->so_timeo = so->so_timeo;
-               xso->so_error = so->so_error;
-               xso->so_pgid = so->so_pgid;
-               xso->so_oobmark = so->so_oobmark;
-               xso->so_uid = kauth_cred_getuid(so->so_cred);
-               xso->so_last_pid = so->last_pid;
-               xso->so_e_pid = so->e_pid;
+       if (so == NULL) {
+               return;
+       }
+
+       xso->xso_so = (uint64_t)VM_KERNEL_ADDRPERM(so);
+       xso->so_type = so->so_type;
+       xso->so_options = so->so_options;
+       xso->so_linger = so->so_linger;
+       xso->so_state = so->so_state;
+       xso->so_pcb = (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb);
+       if (so->so_proto) {
+               xso->xso_protocol = SOCK_PROTO(so);
+               xso->xso_family = SOCK_DOM(so);
+       } else {
+               xso->xso_protocol = xso->xso_family = 0;
        }
+       xso->so_qlen = so->so_qlen;
+       xso->so_incqlen = so->so_incqlen;
+       xso->so_qlimit = so->so_qlimit;
+       xso->so_timeo = so->so_timeo;
+       xso->so_error = so->so_error;
+       xso->so_pgid = so->so_pgid;
+       xso->so_oobmark = so->so_oobmark;
+       xso->so_uid = kauth_cred_getuid(so->so_cred);
+       xso->so_last_pid = so->last_pid;
+       xso->so_e_pid = so->e_pid;
 }
 
 __private_extern__ void
 sbtoxsockbuf_n(struct sockbuf *sb, struct xsockbuf_n *xsb)
 {
        xsb->xsb_len = sizeof(struct xsockbuf_n);
-       xsb->xsb_kind = (sb->sb_flags & SB_RECV) ? XSO_RCVBUF : XSO_SNDBUF;
 
-       if (sb != NULL) {
-               xsb->sb_cc = sb->sb_cc;
-               xsb->sb_hiwat = sb->sb_hiwat;
-               xsb->sb_mbcnt = sb->sb_mbcnt;
-               xsb->sb_mbmax = sb->sb_mbmax;
-               xsb->sb_lowat = sb->sb_lowat;
-               xsb->sb_flags = sb->sb_flags;
-               xsb->sb_timeo = (short)(sb->sb_timeo.tv_sec * hz) +
-                   sb->sb_timeo.tv_usec / tick;
-               if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) {
-                       xsb->sb_timeo = 1;
-               }
+       if (sb == NULL) {
+               return;
+       }
+
+       xsb->xsb_kind = (sb->sb_flags & SB_RECV) ? XSO_RCVBUF : XSO_SNDBUF;
+       xsb->sb_cc = sb->sb_cc;
+       xsb->sb_hiwat = sb->sb_hiwat;
+       xsb->sb_mbcnt = sb->sb_mbcnt;
+       xsb->sb_mbmax = sb->sb_mbmax;
+       xsb->sb_lowat = sb->sb_lowat;
+       xsb->sb_flags = sb->sb_flags;
+       xsb->sb_timeo = (short)(sb->sb_timeo.tv_sec * hz) +
+           sb->sb_timeo.tv_usec / tick;
+       if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) {
+               xsb->sb_timeo = 1;
        }
 }
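
The rewrite above also fixes an ordering bug: the old code read sb->sb_flags to set xsb_kind before the sb NULL check ran. The guard-first shape, reduced to a standalone example (simplified types):

    #include <stddef.h>

    struct src { int flags; };
    struct dst { int kind; };

    /* The old shape: the dereference ran before the guard, so a NULL src
     * crashed even though the check looked present. */
    static void
    fill_buggy(struct dst *d, struct src *s)
    {
            d->kind = s->flags;             /* crashes when s == NULL */
            if (s == NULL)
                    return;
    }

    /* The shape adopted above: guard first, then dereference. */
    static void
    fill_guarded(struct dst *d, struct src *s)
    {
            if (s == NULL)
                    return;
            d->kind = s->flags;
    }
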
 
@@ -167,6 +171,10 @@ sbtoxsockstat_n(struct socket *so, struct xsockstat_n *xst)
        xst->xst_len = sizeof(struct xsockstat_n);
        xst->xst_kind = XSO_STATS;
 
+       if (so == NULL) {
+               return;
+       }
+
        for (i = 0; i < SO_TC_STATS_MAX; i++) {
                xst->xst_tc_stats[i].rxpackets = so->so_tc_stats[i].rxpackets;
                xst->xst_tc_stats[i].rxbytes = so->so_tc_stats[i].rxbytes;
index bc3ce2a5e4c76fbdbdf1b99fe629d75731c4ad83..340f31f65f02a23664105d9000ff48ac61822abf 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #if IPSEC_ESP
 #include <netinet6/esp.h>
 #endif
-#include <netinet6/ipcomp.h>
 #endif /* IPSEC */
 
 static void in_dinit(struct domain *);
@@ -139,6 +138,8 @@ static struct protosw inetsw[] = {
                .pr_lock =              udp_lock,
                .pr_unlock =            udp_unlock,
                .pr_getlock =           udp_getlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_STREAM,
@@ -155,6 +156,8 @@ static struct protosw inetsw[] = {
                .pr_lock =              tcp_lock,
                .pr_unlock =            tcp_unlock,
                .pr_getlock =           tcp_getlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_RAW,
@@ -165,6 +168,8 @@ static struct protosw inetsw[] = {
                .pr_ctloutput =         rip_ctloutput,
                .pr_usrreqs =           &rip_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_RAW,
@@ -174,6 +179,8 @@ static struct protosw inetsw[] = {
                .pr_ctloutput =         rip_ctloutput,
                .pr_usrreqs =           &rip_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_DGRAM,
@@ -183,6 +190,8 @@ static struct protosw inetsw[] = {
                .pr_ctloutput =         icmp_dgram_ctloutput,
                .pr_usrreqs =           &icmp_dgram_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_RAW,
@@ -193,6 +202,8 @@ static struct protosw inetsw[] = {
                .pr_init =              igmp_init,
                .pr_usrreqs =           &rip_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_RAW,
@@ -203,6 +214,8 @@ static struct protosw inetsw[] = {
                .pr_ctloutput =         rip_ctloutput,
                .pr_usrreqs =           &rip_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
 #if IPSEC
        {
@@ -221,14 +234,6 @@ static struct protosw inetsw[] = {
                .pr_usrreqs =           &nousrreqs,
        },
 #endif /* IPSEC_ESP */
-       {
-               .pr_type =              SOCK_RAW,
-               .pr_protocol =          IPPROTO_IPCOMP,
-               .pr_flags =             PR_ATOMIC | PR_ADDR | PR_PROTOLOCK,
-               .pr_input =             ipcomp4_input,
-               .pr_init =              ipcomp_init,
-               .pr_usrreqs =           &nousrreqs,
-       },
 #endif /* IPSEC */
        {
                .pr_type =              SOCK_RAW,
@@ -239,6 +244,8 @@ static struct protosw inetsw[] = {
                .pr_init =              encap4_init,
                .pr_usrreqs =           &rip_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
 #if INET6
        {
@@ -250,6 +257,8 @@ static struct protosw inetsw[] = {
                .pr_init =              encap4_init,
                .pr_usrreqs =           &rip_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
 #endif /* INET6 */
 #if IPDIVERT
@@ -275,6 +284,8 @@ static struct protosw inetsw[] = {
                .pr_init =              rip_init,
                .pr_usrreqs =           &rip_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
 };
 
index 522bf5184e15ddc84d6f3de40628224b5057ea29..c47255185672fee81014426fbf0b64882558e8a0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2017-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -41,7 +41,8 @@ in_stat_set_activity_bitmap(activity_bitmap_t *activity, uint64_t now)
        uint64_t elapsed_time, slot;
        uint64_t *bitmap;
        if (activity->start == 0) {
-               activity->start = now;
+               // Align all activity maps
+               activity->start = now - (now % IN_STAT_ACTIVITY_GRANULARITY);
        }
        elapsed_time = now - activity->start;
 
index 638f14bbe272a05e2a1b99e0d1f2c5d5f4aa3413..c191fed8d7145c3ff3b31523d6cf3207427c6dff 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2015 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -85,6 +85,8 @@ typedef __uint32_t n_long;              /* long as received from the net */
 typedef __uint32_t n_time;              /* ms since 00:00 GMT, byte rev */
 
 #ifdef BSD_KERNEL_PRIVATE
+#define ABS(v) (((v) > 0) ? (v) : -(v))
+
 u_int32_t iptime(void);
 #endif /* BSD_KERNEL_PRIVATE */
 #endif /* _NETINET_IN_SYSTM_H_ */
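
ABS() is a function-like macro, so the usual caveats apply: the argument is evaluated twice (once for the comparison, once for the result), and ABS(INT_MIN) overflows. A small demonstration:

    #include <limits.h>
    #include <stdio.h>

    #define ABS(v) (((v) > 0) ? (v) : -(v))

    int
    main(void)
    {
            int i = -3;
            int b = ABS(i++);       /* v is evaluated twice: i ends up at -1,
                                     * and b is 2, not 3 */
            printf("i=%d b=%d\n", i, b);
            /* ABS(INT_MIN) would negate INT_MIN: signed overflow,
             * i.e. undefined behavior. */
            return 0;
    }
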
index 7d8f336bde2ec3bc3c72a9559f9b8d289a3660cf..6c939c5aa5ccc6fdc539ddfd8a7eac17d231ef4b 100644 (file)
 #include <netinet/lro_ext.h>
 #include <netinet/in_tclass.h>
 
+struct net_qos_dscp_map {
+       uint8_t        sotc_to_dscp[SO_TC_MAX];
+       uint8_t        netsvctype_to_dscp[_NET_SERVICE_TYPE_COUNT];
+};
+
 struct dcsp_msc_map {
-       u_int8_t                dscp;
+       uint8_t                 dscp;
        mbuf_svc_class_t        msc;
 };
 static inline int so_throttle_best_effort(struct socket *, struct ifnet *);
@@ -117,10 +122,6 @@ int net_qos_policy_wifi_enabled = 0;
 SYSCTL_INT(_net_qos_policy, OID_AUTO, wifi_enabled,
     CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_wifi_enabled, 0, "");
 
-int net_qos_policy_none_wifi_enabled = 0;
-SYSCTL_INT(_net_qos_policy, OID_AUTO, none_wifi_enabled,
-    CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_none_wifi_enabled, 0, "");
-
 int net_qos_policy_capable_enabled = 0;
 SYSCTL_INT(_net_qos_policy, OID_AUTO, capable_enabled,
     CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_capable_enabled, 0, "");
@@ -145,18 +146,36 @@ const int sotc_by_netservicetype[_NET_SERVICE_TYPE_COUNT] = {
  */
 static const
 struct netsvctype_dscp_map fastlane_netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {
-       { NET_SERVICE_TYPE_BE, _DSCP_DF },
-       { NET_SERVICE_TYPE_BK, _DSCP_AF11 },
-       { NET_SERVICE_TYPE_SIG, _DSCP_CS3 },
-       { NET_SERVICE_TYPE_VI, _DSCP_AF41 },
-       { NET_SERVICE_TYPE_VO, _DSCP_EF },
-       { NET_SERVICE_TYPE_RV, _DSCP_CS4 },
-       { NET_SERVICE_TYPE_AV, _DSCP_AF31 },
-       { NET_SERVICE_TYPE_OAM, _DSCP_CS2 },
-       { NET_SERVICE_TYPE_RD, _DSCP_AF21 },
+       { .netsvctype = NET_SERVICE_TYPE_BE, .dscp = _DSCP_DF },
+       { .netsvctype = NET_SERVICE_TYPE_BK, .dscp = _DSCP_AF11 },
+       { .netsvctype = NET_SERVICE_TYPE_SIG, .dscp = _DSCP_CS3 },
+       { .netsvctype = NET_SERVICE_TYPE_VI, .dscp = _DSCP_AF41 },
+       { .netsvctype = NET_SERVICE_TYPE_VO, .dscp = _DSCP_EF },
+       { .netsvctype = NET_SERVICE_TYPE_RV, .dscp = _DSCP_CS4 },
+       { .netsvctype = NET_SERVICE_TYPE_AV, .dscp = _DSCP_AF31 },
+       { .netsvctype = NET_SERVICE_TYPE_OAM, .dscp = _DSCP_CS2 },
+       { .netsvctype = NET_SERVICE_TYPE_RD, .dscp = _DSCP_AF21 },
+};
+
+
+/*
+ * DSCP mappings for RFC 4594 QoS, based on network service types
+ */
+static const
+struct netsvctype_dscp_map rfc4594_netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {
+       { .netsvctype = NET_SERVICE_TYPE_BE, .dscp = _DSCP_DF },
+       { .netsvctype = NET_SERVICE_TYPE_BK, .dscp = _DSCP_CS1 },
+       { .netsvctype = NET_SERVICE_TYPE_SIG, .dscp = _DSCP_CS5 },
+       { .netsvctype = NET_SERVICE_TYPE_VI, .dscp = _DSCP_AF41 },
+       { .netsvctype = NET_SERVICE_TYPE_VO, .dscp = _DSCP_EF },
+       { .netsvctype = NET_SERVICE_TYPE_RV, .dscp = _DSCP_CS4 },
+       { .netsvctype = NET_SERVICE_TYPE_AV, .dscp = _DSCP_AF31 },
+       { .netsvctype = NET_SERVICE_TYPE_OAM, .dscp = _DSCP_CS2 },
+       { .netsvctype = NET_SERVICE_TYPE_RD, .dscp = _DSCP_AF21 },
 };
 
-static struct net_qos_dscp_map default_net_qos_dscp_map;
+static struct net_qos_dscp_map fastlane_net_qos_dscp_map;
+static struct net_qos_dscp_map rfc4594_net_qos_dscp_map;
 
 /*
 * The size is one more than the max because DSCP values start at zero
@@ -174,79 +193,79 @@ static struct net_qos_dscp_map default_net_qos_dscp_map;
  * option instead to select L2 QoS marking instead of IP_TOS or IPV6_TCLASS.
  */
 static const struct dcsp_msc_map default_dscp_to_wifi_ac_map[] = {
-       { _DSCP_DF, MBUF_SC_BE },               /* RFC 2474 Standard */
-       { 1, MBUF_SC_BE },                      /*  */
-       { 2, MBUF_SC_BE },                      /*  */
-       { 3, MBUF_SC_BE },                      /*  */
-       { 4, MBUF_SC_BE },                      /*  */
-       { 5, MBUF_SC_BE },                      /*  */
-       { 6, MBUF_SC_BE },                      /*  */
-       { 7, MBUF_SC_BE },                      /*  */
-
-       { _DSCP_CS1, MBUF_SC_BK },              /* RFC 3662 Low-Priority Data */
-       { 9, MBUF_SC_BK },                      /*  */
-       { _DSCP_AF11, MBUF_SC_BK },             /* RFC 2597 High-Throughput Data */
-       { 11, MBUF_SC_BK },                     /*  */
-       { _DSCP_AF12, MBUF_SC_BK },             /* RFC 2597 High-Throughput Data */
-       { 13, MBUF_SC_BK },                     /*  */
-       { _DSCP_AF13, MBUF_SC_BK },             /* RFC 2597 High-Throughput Data */
-       { 15, MBUF_SC_BK },                     /*  */
-
-       { _DSCP_CS2, MBUF_SC_BK },              /* RFC 4594 OAM */
-       { 17, MBUF_SC_BK },                     /*  */
-       { _DSCP_AF21, MBUF_SC_BK },             /* RFC 2597 Low-Latency Data */
-       { 19, MBUF_SC_BK },                     /*  */
-       { _DSCP_AF22, MBUF_SC_BK },             /* RFC 2597 Low-Latency Data */
-       { 21, MBUF_SC_BK },                     /*  */
-       { _DSCP_AF23, MBUF_SC_BK },             /* RFC 2597 Low-Latency Data */
-       { 23, MBUF_SC_BK },                     /*  */
-
-       { _DSCP_CS3, MBUF_SC_BE },              /* RFC 2474 Broadcast Video */
-       { 25, MBUF_SC_BE },                     /*  */
-       { _DSCP_AF31, MBUF_SC_BE },             /* RFC 2597 Multimedia Streaming */
-       { 27, MBUF_SC_BE },                     /*  */
-       { _DSCP_AF32, MBUF_SC_BE },             /* RFC 2597 Multimedia Streaming */
-       { 29, MBUF_SC_BE },                     /*  */
-       { _DSCP_AF33, MBUF_SC_BE },             /* RFC 2597 Multimedia Streaming */
-       { 31, MBUF_SC_BE },                     /*  */
-
-       { _DSCP_CS4, MBUF_SC_VI },              /* RFC 2474 Real-Time Interactive */
-       { 33, MBUF_SC_VI },                     /*  */
-       { _DSCP_AF41, MBUF_SC_VI },             /* RFC 2597 Multimedia Conferencing */
-       { 35, MBUF_SC_VI },                     /*  */
-       { _DSCP_AF42, MBUF_SC_VI },             /* RFC 2597 Multimedia Conferencing */
-       { 37, MBUF_SC_VI },                     /*  */
-       { _DSCP_AF43, MBUF_SC_VI },             /* RFC 2597 Multimedia Conferencing */
-       { 39, MBUF_SC_VI },                     /*  */
-
-       { _DSCP_CS5, MBUF_SC_VI },              /* RFC 2474 Signaling */
-       { 41, MBUF_SC_VI },                     /*  */
-       { 42, MBUF_SC_VI },                     /*  */
-       { 43, MBUF_SC_VI },                     /*  */
-       { _DSCP_VA, MBUF_SC_VI },               /* RFC 5865 VOICE-ADMIT */
-       { 45, MBUF_SC_VI },                     /*  */
-       { _DSCP_EF, MBUF_SC_VI },               /* RFC 3246 Telephony */
-       { 47, MBUF_SC_VI },                     /*  */
-
-       { _DSCP_CS6, MBUF_SC_VO },              /* Wi-Fi WMM Certification: Chariot */
-       { 49, MBUF_SC_VO },                     /*  */
-       { 50, MBUF_SC_VO },                     /*  */
-       { 51, MBUF_SC_VO },                     /*  */
-       { 52, MBUF_SC_VO },                     /* Wi-Fi WMM Certification: Sigma */
-       { 53, MBUF_SC_VO },                     /*  */
-       { 54, MBUF_SC_VO },                     /*  */
-       { 55, MBUF_SC_VO },                     /*  */
-
-       { _DSCP_CS7, MBUF_SC_VO },              /* Wi-Fi WMM Certification: Chariot */
-       { 57, MBUF_SC_VO },                     /*  */
-       { 58, MBUF_SC_VO },                     /*  */
-       { 59, MBUF_SC_VO },                     /*  */
-       { 60, MBUF_SC_VO },                     /*  */
-       { 61, MBUF_SC_VO },                     /*  */
-       { 62, MBUF_SC_VO },                     /*  */
-       { 63, MBUF_SC_VO },                     /*  */
-
-       { 255, MBUF_SC_UNSPEC }                  /* invalid DSCP to mark last entry */
+       { .dscp = _DSCP_DF, .msc = MBUF_SC_BE },        /* RFC 2474 Standard */
+       { .dscp = 1, .msc = MBUF_SC_BE },               /*  */
+       { .dscp = 2, .msc = MBUF_SC_BE },               /*  */
+       { .dscp = 3, .msc = MBUF_SC_BE },               /*  */
+       { .dscp = 4, .msc = MBUF_SC_BE },               /*  */
+       { .dscp = 5, .msc = MBUF_SC_BE },               /*  */
+       { .dscp = 6, .msc = MBUF_SC_BE },               /*  */
+       { .dscp = 7, .msc = MBUF_SC_BE },               /*  */
+
+       { .dscp = _DSCP_CS1, .msc = MBUF_SC_BK },       /* RFC 3662 Low-Priority Data */
+       { .dscp = 9, .msc = MBUF_SC_BK },               /*  */
+       { .dscp = _DSCP_AF11, .msc = MBUF_SC_BK },      /* RFC 2597 High-Throughput Data */
+       { .dscp = 11, .msc = MBUF_SC_BK },              /*  */
+       { .dscp = _DSCP_AF12, .msc = MBUF_SC_BK },      /* RFC 2597 High-Throughput Data */
+       { .dscp = 13, .msc = MBUF_SC_BK },              /*  */
+       { .dscp = _DSCP_AF13, .msc = MBUF_SC_BK },      /* RFC 2597 High-Throughput Data */
+       { .dscp = 15, .msc = MBUF_SC_BK },              /*  */
+
+       { .dscp = _DSCP_CS2, .msc = MBUF_SC_BK },       /* RFC 4594 OAM */
+       { .dscp = 17, .msc = MBUF_SC_BK },              /*  */
+       { .dscp = _DSCP_AF21, .msc = MBUF_SC_BK },      /* RFC 2597 Low-Latency Data */
+       { .dscp = 19, .msc = MBUF_SC_BK },              /*  */
+       { .dscp = _DSCP_AF22, .msc = MBUF_SC_BK },      /* RFC 2597 Low-Latency Data */
+       { .dscp = 21, .msc = MBUF_SC_BK },              /*  */
+       { .dscp = _DSCP_AF23, .msc = MBUF_SC_BK },      /* RFC 2597 Low-Latency Data */
+       { .dscp = 23, .msc = MBUF_SC_BK },              /*  */
+
+       { .dscp = _DSCP_CS3, .msc = MBUF_SC_BE },       /* RFC 2474 Broadcast Video */
+       { .dscp = 25, .msc = MBUF_SC_BE },              /*  */
+       { .dscp = _DSCP_AF31, .msc = MBUF_SC_BE },      /* RFC 2597 Multimedia Streaming */
+       { .dscp = 27, .msc = MBUF_SC_BE },              /*  */
+       { .dscp = _DSCP_AF32, .msc = MBUF_SC_BE },      /* RFC 2597 Multimedia Streaming */
+       { .dscp = 29, .msc = MBUF_SC_BE },              /*  */
+       { .dscp = _DSCP_AF33, .msc = MBUF_SC_BE },      /* RFC 2597 Multimedia Streaming */
+       { .dscp = 31, .msc = MBUF_SC_BE },              /*  */
+
+       { .dscp = _DSCP_CS4, .msc = MBUF_SC_VI },       /* RFC 2474 Real-Time Interactive */
+       { .dscp = 33, .msc = MBUF_SC_VI },              /*  */
+       { .dscp = _DSCP_AF41, .msc = MBUF_SC_VI },      /* RFC 2597 Multimedia Conferencing */
+       { .dscp = 35, .msc = MBUF_SC_VI },              /*  */
+       { .dscp = _DSCP_AF42, .msc = MBUF_SC_VI },      /* RFC 2597 Multimedia Conferencing */
+       { .dscp = 37, .msc = MBUF_SC_VI },              /*  */
+       { .dscp = _DSCP_AF43, .msc = MBUF_SC_VI },      /* RFC 2597 Multimedia Conferencing */
+       { .dscp = 39, .msc = MBUF_SC_VI },              /*  */
+
+       { .dscp = _DSCP_CS5, .msc = MBUF_SC_VI },       /* RFC 2474 Signaling */
+       { .dscp = 41, .msc = MBUF_SC_VI },              /*  */
+       { .dscp = 42, .msc = MBUF_SC_VI },              /*  */
+       { .dscp = 43, .msc = MBUF_SC_VI },              /*  */
+       { .dscp = _DSCP_VA, .msc = MBUF_SC_VI },        /* RFC 5865 VOICE-ADMIT */
+       { .dscp = 45, .msc = MBUF_SC_VI },              /*  */
+       { .dscp = _DSCP_EF, .msc = MBUF_SC_VI },        /* RFC 3246 Telephony */
+       { .dscp = 47, .msc = MBUF_SC_VI },              /*  */
+
+       { .dscp = _DSCP_CS6, .msc = MBUF_SC_VO },       /* Wi-Fi WMM Certification: Chariot */
+       { .dscp = 49, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 50, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 51, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 52, .msc = MBUF_SC_VO },              /* Wi-Fi WMM Certification: Sigma */
+       { .dscp = 53, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 54, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 55, .msc = MBUF_SC_VO },              /*  */
+
+       { .dscp = _DSCP_CS7, .msc = MBUF_SC_VO },       /* Wi-Fi WMM Certification: Chariot */
+       { .dscp = 57, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 58, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 59, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 60, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 61, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 62, .msc = MBUF_SC_VO },              /*  */
+       { .dscp = 63, .msc = MBUF_SC_VO },              /*  */
+
+       { .dscp = 255, .msc = MBUF_SC_UNSPEC }          /* invalid DSCP to mark last entry */
 };
 
 mbuf_svc_class_t wifi_dscp_to_msc_array[DSCP_ARRAY_SIZE];
@@ -270,7 +289,7 @@ struct tclass_for_proc {
        int             tfp_class;
        pid_t           tfp_pid;
        char            tfp_pname[(2 * MAXCOMLEN) + 1];
-       u_int32_t       tfp_qos_mode;
+       uint32_t        tfp_qos_mode;
 };
 
 static int get_pid_tclass(struct so_tcdbg *);
@@ -873,9 +892,7 @@ so_get_netsvc_marking_level(struct socket *so)
                break;
        }
        if (ifp != NULL) {
-               if ((ifp->if_eflags &
-                   (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) ==
-                   (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) {
+               if ((ifp->if_eflags & IFEF_QOSMARKING_ENABLED) != 0) {
                        if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
                                marking_level = NETSVC_MRKNG_LVL_L3L2_ALL;
                        } else {
@@ -1080,7 +1097,7 @@ so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes,
 static inline int
 so_throttle_best_effort(struct socket *so, struct ifnet *ifp)
 {
-       u_int32_t uptime = net_uptime();
+       uint32_t uptime = net_uptime();
        return soissrcbesteffort(so) &&
               net_io_policy_throttle_best_effort == 1 &&
               ifp->if_rt_sendts > 0 &&
@@ -1096,7 +1113,7 @@ set_tcp_stream_priority(struct socket *so)
        u_char old_cc = tp->tcp_cc_index;
        int recvbg = IS_TCP_RECV_BG(so);
        bool is_local = false, fg_active = false;
-       u_int32_t uptime;
+       uint32_t uptime;
 
        VERIFY((SOCK_CHECK_DOM(so, PF_INET) ||
            SOCK_CHECK_DOM(so, PF_INET6)) &&
@@ -1210,7 +1227,7 @@ set_tcp_stream_priority(struct socket *so)
  */
 __private_extern__ void
 set_packet_service_class(struct mbuf *m, struct socket *so,
-    int sotc, u_int32_t flags)
+    int sotc, uint32_t flags)
 {
        mbuf_svc_class_t msc = MBUF_SC_BE;         /* Best effort by default */
        struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */
@@ -1473,28 +1490,160 @@ sotc_index(int sotc)
        return SIZE_T_MAX;
 }
 
+uint8_t
+fastlane_sc_to_dscp(uint32_t svc_class)
+{
+       uint8_t dscp = _DSCP_DF;
+
+       switch (svc_class) {
+       case MBUF_SC_BK_SYS:
+       case MBUF_SC_BK:
+               dscp = _DSCP_AF11;
+               break;
+
+       case MBUF_SC_BE:
+               dscp = _DSCP_DF;
+               break;
+       case MBUF_SC_RD:
+               dscp = _DSCP_AF21;
+               break;
+       case MBUF_SC_OAM:
+               dscp = _DSCP_CS2;
+               break;
+
+       case MBUF_SC_AV:
+               dscp = _DSCP_AF31;
+               break;
+       case MBUF_SC_RV:
+               dscp = _DSCP_CS4;
+               break;
+       case MBUF_SC_VI:
+               dscp = _DSCP_AF41;
+               break;
+       case MBUF_SC_SIG:
+               dscp = _DSCP_CS3;
+               break;
+
+       case MBUF_SC_VO:
+               dscp = _DSCP_EF;
+               break;
+       case MBUF_SC_CTL:
+               dscp = _DSCP_DF;
+               break;
+       default:
+               dscp = _DSCP_DF;
+               break;
+       }
+
+       return dscp;
+}
+
+uint8_t
+rfc4594_sc_to_dscp(uint32_t svc_class)
+{
+       uint8_t dscp = _DSCP_DF;
+
+       switch (svc_class) {
+       case MBUF_SC_BK_SYS:            /* Low-Priority Data */
+       case MBUF_SC_BK:
+               dscp = _DSCP_CS1;
+               break;
+
+       case MBUF_SC_BE:                        /* Standard */
+               dscp = _DSCP_DF;
+               break;
+       case MBUF_SC_RD:                        /* Low-Latency Data */
+               dscp = _DSCP_AF21;
+               break;
+
+       /* SVC_CLASS Not Defined:  High-Throughput Data */
+
+       case MBUF_SC_OAM:               /* OAM */
+               dscp = _DSCP_CS2;
+               break;
+
+       /* SVC_CLASS Not Defined:  Broadcast Video */
+
+       case MBUF_SC_AV:                        /* Multimedia Streaming */
+               dscp = _DSCP_AF31;
+               break;
+       case MBUF_SC_RV:                        /* Real-Time Interactive */
+               dscp = _DSCP_CS4;
+               break;
+       case MBUF_SC_VI:                        /* Multimedia Conferencing */
+               dscp = _DSCP_AF41;
+               break;
+       case MBUF_SC_SIG:               /* Signaling */
+               dscp = _DSCP_CS5;
+               break;
+
+       case MBUF_SC_VO:                        /* Telephony */
+               dscp = _DSCP_EF;
+               break;
+       case MBUF_SC_CTL:               /* Network Control */
+               dscp = _DSCP_CS6;
+               break;
+       default:
+               dscp = _DSCP_DF;
+               break;
+       }
+
+       return dscp;
+}
+
+mbuf_traffic_class_t
+rfc4594_dscp_to_tc(uint8_t dscp)
+{
+       mbuf_traffic_class_t tc = MBUF_TC_BE;
+
+       switch (dscp) {
+       case _DSCP_CS1:
+               tc = MBUF_TC_BK;
+               break;
+       case _DSCP_DF:
+       case _DSCP_AF21:
+       case _DSCP_CS2:
+               tc = MBUF_TC_BE;
+               break;
+       case _DSCP_AF31:
+       case _DSCP_CS4:
+       case _DSCP_AF41:
+       case _DSCP_CS5:
+               tc = MBUF_TC_VI;
+               break;
+       case _DSCP_EF:
+       case _DSCP_CS6:
+               tc = MBUF_TC_VO;
+               break;
+       default:
+               tc = MBUF_TC_BE;
+               break;
+       }
+
+       return tc;
+}
+
 /*
 * Copy a netsvctype-to-DSCP table into the given net_qos_dscp_map
  */
 static errno_t
-set_netsvctype_dscp_map(size_t in_count,
+set_netsvctype_dscp_map(struct net_qos_dscp_map *net_qos_dscp_map,
     const struct netsvctype_dscp_map *netsvctype_dscp_map)
 {
        size_t i;
-       struct net_qos_dscp_map *net_qos_dscp_map = NULL;
        int netsvctype;
 
        /*
         * Do not accept more than the max number of distinct DSCPs
         */
-       if (in_count > _MAX_DSCP || netsvctype_dscp_map == NULL) {
+       if (net_qos_dscp_map == NULL || netsvctype_dscp_map == NULL) {
                return EINVAL;
        }
 
        /*
         * Validate input parameters
         */
-       for (i = 0; i < in_count; i++) {
+       for (i = 0; i < _NET_SERVICE_TYPE_COUNT; i++) {
                if (!IS_VALID_NET_SERVICE_TYPE(netsvctype_dscp_map[i].netsvctype)) {
                        return EINVAL;
                }
@@ -1503,9 +1652,7 @@ set_netsvctype_dscp_map(size_t in_count,
                }
        }
 
-       net_qos_dscp_map = &default_net_qos_dscp_map;
-
-       for (i = 0; i < in_count; i++) {
+       for (i = 0; i < _NET_SERVICE_TYPE_COUNT; i++) {
                netsvctype = netsvctype_dscp_map[i].netsvctype;
 
                net_qos_dscp_map->netsvctype_to_dscp[netsvctype] =
@@ -1568,7 +1715,7 @@ get_netsvctype_dscp_map(size_t *out_count,
                return EINVAL;
        }
 
-       net_qos_dscp_map = &default_net_qos_dscp_map;
+       net_qos_dscp_map = &fastlane_net_qos_dscp_map;
 
        for (i = 0; i < MIN(_NET_SERVICE_TYPE_COUNT, *out_count); i++) {
                netsvctype_dscp_map[i].netsvctype = i;
@@ -1584,17 +1731,13 @@ net_qos_map_init()
 {
        errno_t error;
 
-       /*
-        * By default use the Fastlane DSCP mappngs
-        */
-       error = set_netsvctype_dscp_map(_NET_SERVICE_TYPE_COUNT,
+       error = set_netsvctype_dscp_map(&fastlane_net_qos_dscp_map,
            fastlane_netsvctype_dscp_map);
        ASSERT(error == 0);
 
-       /*
-        * No DSCP mapping for network control
-        */
-       default_net_qos_dscp_map.sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF;
+       error = set_netsvctype_dscp_map(&rfc4594_net_qos_dscp_map,
+           rfc4594_netsvctype_dscp_map);
+       ASSERT(error == 0);
 
        set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1);
 }
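Editor's note: net_qos_map_init() now seeds both the Fastlane and the RFC 4594 map, and the helpers above expose the RFC 4594 mapping as pure table lookups. A standalone sketch of the round trip from service class to DSCP and back to a traffic class, using a trimmed copy of the tables (the enum values are placeholder stand-ins for the kernel's MBUF_SC and MBUF_TC constants; only the code points AF41 = 0x22 and EF = 0x2e are standard):

    #include <stdint.h>
    #include <stdio.h>

    enum { SC_BE, SC_VI, SC_VO };   /* placeholder service classes */
    enum { TC_BE, TC_VI, TC_VO };   /* placeholder traffic classes */
    #define DSCP_DF   0x00
    #define DSCP_AF41 0x22
    #define DSCP_EF   0x2e

    static uint8_t sc_to_dscp(int sc)        /* cf. rfc4594_sc_to_dscp() */
    {
        switch (sc) {
        case SC_VI: return DSCP_AF41;        /* Multimedia Conferencing */
        case SC_VO: return DSCP_EF;          /* Telephony */
        default:    return DSCP_DF;          /* Standard */
        }
    }

    static int dscp_to_tc(uint8_t dscp)      /* cf. rfc4594_dscp_to_tc() */
    {
        switch (dscp) {
        case DSCP_AF41: return TC_VI;
        case DSCP_EF:   return TC_VO;
        default:        return TC_BE;
        }
    }

    int main(void)
    {
        uint8_t dscp = sc_to_dscp(SC_VI);
        printf("SC_VI -> dscp 0x%02x -> tc %d\n", dscp, dscp_to_tc(dscp));
        return 0;
    }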
@@ -1604,8 +1747,6 @@ sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS
 {
 #pragma unused(oidp, arg1, arg2)
        int error = 0;
-       const size_t max_netsvctype_to_dscp_map_len =
-           _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map);
        size_t len;
        struct netsvctype_dscp_map netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {};
        size_t count;
@@ -1627,48 +1768,37 @@ sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS
                }
        }
 
-       if (req->newptr == USER_ADDR_NULL) {
-               goto done;
+       if (req->newptr != USER_ADDR_NULL) {
+               error = EPERM;
        }
-
-       error = proc_suser(current_proc());
-       if (error != 0) {
-               goto done;
-       }
-
-       /*
-        * Check input length
-        */
-       if (req->newlen > max_netsvctype_to_dscp_map_len) {
-               error = EINVAL;
-               goto done;
-       }
-       /*
-        * Cap the number of entries to copy from input buffer
-        */
-       error = SYSCTL_IN(req, netsvctype_dscp_map, req->newlen);
-       if (error != 0) {
-               goto done;
-       }
-
-       count = req->newlen / sizeof(struct netsvctype_dscp_map);
-       error = set_netsvctype_dscp_map(count, netsvctype_dscp_map);
 done:
        return error;
 }
 
 __private_extern__ errno_t
 set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed,
-    int sotc, int netsvctype, u_int8_t *dscp_inout)
+    int sotc, int netsvctype, uint8_t *dscp_inout)
 {
        if (ifp == NULL || dscp_inout == NULL) {
                return EINVAL;
        }
 
-       if ((ifp->if_eflags &
-           (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) ==
-           (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) {
-               u_int8_t dscp;
+       if ((ifp->if_eflags & IFEF_QOSMARKING_ENABLED) != 0 &&
+           ifp->if_qosmarking_mode != IFRTYPE_QOSMARKING_MODE_NONE) {
+               uint8_t dscp;
+               const struct net_qos_dscp_map *net_qos_dscp_map = NULL;
+
+               switch (ifp->if_qosmarking_mode) {
+               case IFRTYPE_QOSMARKING_FASTLANE:
+                       net_qos_dscp_map = &fastlane_net_qos_dscp_map;
+                       break;
+               case IFRTYPE_QOSMARKING_RFC4594:
+                       net_qos_dscp_map = &rfc4594_net_qos_dscp_map;
+                       break;
+               default:
+                       panic("invalid QoS marking type");
+                       /* NOTREACHED */
+               }
 
                /*
                 * When on a Fastlane network, IP_TOS/IPV6_TCLASS are no-ops
@@ -1688,7 +1818,7 @@ set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed,
                 */
                if (IS_VALID_NET_SERVICE_TYPE(netsvctype) &&
                    netsvctype != NET_SERVICE_TYPE_BE) {
-                       dscp = default_net_qos_dscp_map.netsvctype_to_dscp[netsvctype];
+                       dscp = net_qos_dscp_map->netsvctype_to_dscp[netsvctype];
 
                        if (qos_allowed == FALSE &&
                            netsvctype != NET_SERVICE_TYPE_BE &&
@@ -1701,7 +1831,7 @@ set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed,
                } else if (sotc != SO_TC_UNSPEC) {
                        size_t sotcix = sotc_index(sotc);
                        if (sotcix != SIZE_T_MAX) {
-                               dscp = default_net_qos_dscp_map.sotc_to_dscp[sotcix];
+                               dscp = net_qos_dscp_map->sotc_to_dscp[sotcix];
 
                                if (qos_allowed == FALSE && sotc != SO_TC_BE &&
                                    sotc != SO_TC_BK && sotc != SO_TC_BK_SYS &&
@@ -1790,7 +1920,7 @@ dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *netsvctype_dsc
     size_t count, struct dcsp_msc_map *dcsp_msc_map)
 {
        errno_t error = 0;
-       u_int32_t i;
+       uint32_t i;
 
        /*
         * Validate input parameters
@@ -1825,7 +1955,7 @@ sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS
        struct netsvctype_dscp_map netsvctype_dscp_map[DSCP_ARRAY_SIZE] = {};
        struct dcsp_msc_map dcsp_msc_map[DSCP_ARRAY_SIZE];
        size_t count;
-       u_int32_t i;
+       uint32_t i;
 
        if (req->oldptr == USER_ADDR_NULL) {
                req->oldidx = len;
@@ -1967,6 +2097,15 @@ net_qos_guideline(struct proc *p, struct net_qos_guideline_args *arg,
                        return 0;
                }
        }
+       if (ipv4_primary != NULL && IFNET_IS_CONSTRAINED(ipv4_primary) &&
+           ipv6_primary != NULL && IFNET_IS_CONSTRAINED(ipv6_primary)) {
+               if (qos_arg.nq_use_constrained) {
+                       return 0;
+               } else {
+                       *retval = RETURN_USE_BK;
+                       return 0;
+               }
+       }
        if (qos_arg.nq_transfer_size >= 5 * 1024 * 1024) {
                *retval = RETURN_USE_BK;
                return 0;
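Editor's note: set_packet_qos() above now dispatches on the interface's QoS-marking mode rather than on a single default map. A sketch of the mode dispatch with stand-in types (the IFRTYPE_QOSMARKING_* names mirror the kernel's; everything else is illustrative — in particular, the kernel panics on an unexpected mode, while this sketch returns NULL):

    #include <stddef.h>

    enum qosmarking_mode { MODE_NONE, MODE_FASTLANE, MODE_RFC4594 };

    /* Stand-in for struct net_qos_dscp_map. */
    struct qos_map { unsigned char netsvctype_to_dscp[9]; };
    static struct qos_map fastlane_map, rfc4594_map;

    static const struct qos_map *
    select_map(enum qosmarking_mode mode)
    {
        switch (mode) {
        case MODE_FASTLANE: return &fastlane_map;   /* cf. IFRTYPE_QOSMARKING_FASTLANE */
        case MODE_RFC4594:  return &rfc4594_map;    /* cf. IFRTYPE_QOSMARKING_RFC4594 */
        default:            return NULL;            /* kernel panics here instead */
        }
    }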
index 1d8493b575b111dd56be08d3ff948266d0e129ab..7f2c2600d6c7909df78f1e2334d580f76e0c9afe 100644 (file)
@@ -67,7 +67,8 @@ struct so_tcdbg {
 struct net_qos_param {
        u_int64_t nq_transfer_size;     /* transfer size in bytes */
        u_int32_t nq_use_expensive:1,   /* allowed = 1 otherwise 0 */
-           nq_uplink:1;                /* uplink = 1 otherwise 0 */
+           nq_uplink:1,                /* uplink = 1 otherwise 0 */
+           nq_use_constrained:1;       /* allowed = 1 otherwise 0 */
        u_int32_t nq_unused;            /* for future expansion */
 };
 
@@ -91,14 +92,18 @@ extern int net_qos_guideline(struct net_qos_param *param, size_t param_len);
 
 extern int net_qos_policy_restricted;
 extern int net_qos_policy_wifi_enabled;
-extern int net_qos_policy_none_wifi_enabled;
 extern int net_qos_policy_capable_enabled;
 
 extern void net_qos_map_init(void);
+extern void net_qos_map_change(uint32_t mode);
 extern errno_t set_packet_qos(struct mbuf *, struct ifnet *, boolean_t, int,
     int, u_int8_t *);
 extern int so_get_netsvc_marking_level(struct socket *);
 
+extern uint8_t fastlane_sc_to_dscp(uint32_t svc_class);
+extern uint8_t rfc4594_sc_to_dscp(uint32_t svc_class);
+extern mbuf_traffic_class_t rfc4594_dscp_to_tc(uint8_t dscp);
+
 #endif /* BSD_KERNEL_PRIVATE */
 
 #endif /* PRIVATE */
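Editor's note: a hedged sketch of how a caller might fill in the extended net_qos_param to allow constrained interfaces (the field names come from the struct above; the transfer size and return-code handling are assumptions for illustration):

    struct net_qos_param qos_arg = {
        .nq_transfer_size   = 10 * 1024 * 1024, /* e.g. a 10 MB transfer */
        .nq_uplink          = 1,
        .nq_use_constrained = 1,  /* willing to run over a constrained interface */
    };

    /* 0 means no downgrade requested; a nonzero guideline tells the
     * caller to fall back to the background service class. */
    int guideline = net_qos_guideline(&qos_arg, sizeof(qos_arg));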
index 18c55a17a2ddab53a3eb7508b56841b8fa082c8f..454a58288430771c804b4280de749999f13a63b4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -135,6 +135,20 @@ struct kev_in_arpalive {
        struct net_event_data link_data; /* link where ARP was received */
 };
 
+#ifdef PRIVATE
+/*
+ * Common structure for KEV_SOCKET_SUBCLASS events.
+ * It has to live here to avoid declaration dependencies.
+ */
+struct kev_socket_event_data {
+       union sockaddr_in_4_6 kev_sockname;
+       union sockaddr_in_4_6 kev_peername;
+};
+
+struct kev_socket_closed {
+       struct kev_socket_event_data ev_data;
+};
+#endif /* PRIVATE */
 
 #ifdef __APPLE_API_PRIVATE
 struct kev_in_portinuse {
@@ -145,6 +159,10 @@ struct kev_in_portinuse {
 #endif /* __APPLE_API_PRIVATE */
 
 #ifdef BSD_KERNEL_PRIVATE
+extern void socket_post_kev_msg(uint32_t, struct kev_socket_event_data *,
+    uint32_t);
+extern void socket_post_kev_msg_closed(struct socket *);
+
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
index 723b499617059587ea1e41f618f325269c2e25f8..aef9c2c11fa2a21a8de1d20b057002dbd5823518 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -136,7 +136,13 @@ static u_int32_t        div_sendspace = DIVSNDQ;        /* XXX sysctl ? */
 static u_int32_t        div_recvspace = DIVRCVQ;        /* XXX sysctl ? */
 
 /* Optimization: have this preinitialized */
-static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET, 0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } };
+static struct sockaddr_in divsrc = {
+       .sin_len = sizeof(divsrc),
+       .sin_family = AF_INET,
+       .sin_port = 0,
+       .sin_addr = { .s_addr = 0 },
+       .sin_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }
+};
 
 /* Internal functions */
 static int div_output(struct socket *so,
index c9f5668220d15bf0ec9e2dc6ab134aa1cd11b4e6..3a854db0500abc3dfa2d60d74939a04db6576c95 100644 (file)
@@ -1620,7 +1620,6 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa, int cl
                }
                pkt->dn_origifp = fwa->fwa_origifp;
                pkt->dn_mtu = fwa->fwa_mtu;
-               pkt->dn_alwaysfrag = fwa->fwa_alwaysfrag;
                pkt->dn_unfragpartlen = fwa->fwa_unfragpartlen;
                if (fwa->fwa_exthdrs) {
                        bcopy (fwa->fwa_exthdrs, &pkt->dn_exthdrs, sizeof(pkt->dn_exthdrs));
index 884ce05da08c0161690a09e1cb8bdc7e24ae2b96..fae71f8e05ad7da265def2f2143ebf8777a45afc 100644 (file)
@@ -109,12 +109,6 @@ typedef u_int64_t dn_key;       /* sorting key */
  * virtual time wraps every 15 days.
  */
 
-/*
- * The OFFSET_OF macro is used to return the offset of a field within
- * a structure. It is used by the heap management routines.
- */
-#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) )
-
 /*
  * The maximum hash table size for queues.  This value must be a power
  * of 2.
@@ -188,7 +182,6 @@ struct dn_pkt_tag {
        struct route_in6    dn_ro6_pmtu;        /* for ip6_output */
        struct ifnet        *dn_origifp;        /* for ip6_output */
        u_int32_t           dn_mtu;             /* for ip6_output */
-       int                 dn_alwaysfrag;      /* for ip6_output */
        u_int32_t           dn_unfragpartlen;   /* for ip6_output */
        struct ip6_exthdrs  dn_exthdrs;         /* for ip6_output */
        int                 dn_flags;           /* flags, for ip[6]_output */
index 02c4e8141d7b5b24056ab440c3ca777186749e93..4aaa6fa38823f08957e6fecaeaed5949b1f785a5 100644 (file)
@@ -76,7 +76,7 @@
  * Well, what can I say.  They impose different en/decapsulation mechanisms
  * from each other, so they need separate protocol handlers.  The only one
  * we can easily determine by protocol # is IPsec, which always has
- * AH/ESP/IPComp header right after outer IP header.
+ * AH/ESP header right after outer IP header.
  *
  * So, clearly good old protosw does not work for protocol #4 and #41.
  * The code will let you match protocol via src/dst address pair.
index 4c7f8f371a77548087a3c008515792a16e2cd75f..3c68a809f3be55a22f8636289625ee99943e8336 100644 (file)
@@ -113,7 +113,6 @@ struct ip_fw_args {
        struct route_in6        *fwa_ro6_pmtu;  /* for IPv6 output */
        struct ifnet            *fwa_origifp;   /* for IPv6 output */
        u_int32_t               fwa_mtu;        /* for IPv6 output */
-       int                     fwa_alwaysfrag; /* for IPv6 output */
        u_int32_t               fwa_unfragpartlen;  /* for IPv6 output */
        struct ip6_exthdrs      *fwa_exthdrs;   /* for IPv6 output */
        struct ip_flow_id       fwa_id;         /* grabbed from IP header       */
index acbb060cba5f7b0071100305fc1b6d60477ddb03..f6f9baf01ca3a8332de18c9c4d4c1306591cd7e9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -2136,7 +2136,7 @@ ipfw_chk(struct ip_fw_args *args)
         */
        u_int8_t proto;
        u_int16_t src_port = 0, dst_port = 0;   /* NOTE: host format    */
-       struct in_addr src_ip = { 0 }, dst_ip = { 0 };          /* NOTE: network format */
+       struct in_addr src_ip = { .s_addr = 0 }, dst_ip = { .s_addr = 0 };              /* NOTE: network format */
        u_int16_t ip_len = 0;
        int pktlen;
        int dyn_dir = MATCH_UNKNOWN;
index 2965c4adc3d3c519114a83c831e0d6822d825225..7360f96685d486f71e70024c92b004d11bb2c070 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -70,26 +70,26 @@ struct _s_x {
 #if FW2_DEBUG_VERBOSE
 
 static struct _s_x f_tcpflags[] = {
-       { "syn", TH_SYN },
-       { "fin", TH_FIN },
-       { "ack", TH_ACK },
-       { "psh", TH_PUSH },
-       { "rst", TH_RST },
-       { "urg", TH_URG },
-       { "tcp flag", 0 },
-       { NULL, 0 }
+       { .s = "syn", .x = TH_SYN },
+       { .s = "fin", .x = TH_FIN },
+       { .s = "ack", .x = TH_ACK },
+       { .s = "psh", .x = TH_PUSH },
+       { .s = "rst", .x = TH_RST },
+       { .s = "urg", .x = TH_URG },
+       { .s = "tcp flag", .x = 0 },
+       { .s = NULL, .x = 0 }
 };
 
 static struct _s_x f_tcpopts[] = {
-       { "mss", IP_FW_TCPOPT_MSS },
-       { "maxseg", IP_FW_TCPOPT_MSS },
-       { "window", IP_FW_TCPOPT_WINDOW },
-       { "sack", IP_FW_TCPOPT_SACK },
-       { "ts", IP_FW_TCPOPT_TS },
-       { "timestamp", IP_FW_TCPOPT_TS },
-       { "cc", IP_FW_TCPOPT_CC },
-       { "tcp option", 0 },
-       { NULL, 0 }
+       { .s = "mss", .x = IP_FW_TCPOPT_MSS },
+       { .s = "maxseg", .x = IP_FW_TCPOPT_MSS },
+       { .s = "window", .x = IP_FW_TCPOPT_WINDOW },
+       { .s = "sack", .x = IP_FW_TCPOPT_SACK },
+       { .s = "ts", .x = IP_FW_TCPOPT_TS },
+       { .s = "timestamp", .x = IP_FW_TCPOPT_TS },
+       { .s = "cc", .x = IP_FW_TCPOPT_CC },
+       { .s = "tcp option", .x = 0 },
+       { .s = NULL, .x = 0 }
 };
 
 
@@ -98,32 +98,32 @@ static struct _s_x f_tcpopts[] = {
  * (though in fact only the low 5 bits are significant).
  */
 static struct _s_x f_ipopts[] = {
-       { "ssrr", IP_FW_IPOPT_SSRR},
-       { "lsrr", IP_FW_IPOPT_LSRR},
-       { "rr", IP_FW_IPOPT_RR},
-       { "ts", IP_FW_IPOPT_TS},
-       { "ip option", 0 },
-       { NULL, 0 }
+       { .s = "ssrr", .x = IP_FW_IPOPT_SSRR},
+       { .s = "lsrr", .x = IP_FW_IPOPT_LSRR},
+       { .s = "rr", .x = IP_FW_IPOPT_RR},
+       { .s = "ts", .x = IP_FW_IPOPT_TS},
+       { .s = "ip option", .x = 0 },
+       { .s = NULL, .x = 0 }
 };
 
 static struct _s_x f_iptos[] = {
-       { "lowdelay", IPTOS_LOWDELAY},
-       { "throughput", IPTOS_THROUGHPUT},
-       { "reliability", IPTOS_RELIABILITY},
-       { "mincost", IPTOS_MINCOST},
-       { "congestion", IPTOS_CE},
-       { "ecntransport", IPTOS_ECT},
-       { "ip tos option", 0},
-       { NULL, 0 }
+       { .s = "lowdelay", .x = IPTOS_LOWDELAY},
+       { .s = "throughput", .x = IPTOS_THROUGHPUT},
+       { .s = "reliability", .x = IPTOS_RELIABILITY},
+       { .s = "mincost", .x = IPTOS_MINCOST},
+       { .s = "congestion", .x = IPTOS_CE},
+       { .s = "ecntransport", .x = IPTOS_ECT},
+       { .s = "ip tos option", .x = 0},
+       { .s = NULL, .x = 0 }
 };
 
 static struct _s_x limit_masks[] = {
-       {"all", DYN_SRC_ADDR | DYN_SRC_PORT | DYN_DST_ADDR | DYN_DST_PORT},
-       {"src-addr", DYN_SRC_ADDR},
-       {"src-port", DYN_SRC_PORT},
-       {"dst-addr", DYN_DST_ADDR},
-       {"dst-port", DYN_DST_PORT},
-       {NULL, 0}
+       { .s = "all", .x = DYN_SRC_ADDR | DYN_SRC_PORT | DYN_DST_ADDR | DYN_DST_PORT},
+       { .s = "src-addr", .x = DYN_SRC_ADDR},
+       { .s = "src-port", .x = DYN_SRC_PORT},
+       { .s = "dst-addr", .x = DYN_DST_ADDR},
+       { .s = "dst-port", .x = DYN_DST_PORT},
+       { .s = NULL, .x = 0}
 };
 
 #endif /* !FW2_DEBUG_VERBOSE */
index 74b051aa1b99af3c43a37fca5660da9e7f19a41c..44804c8a2e08d76dccad0cdecbf1d62dea00c9c9 100644 (file)
@@ -445,6 +445,7 @@ icmp_input(struct mbuf *m, int hlen)
        struct in_ifaddr *ia;
        void (*ctlfunc)(int, struct sockaddr *, void *, struct ifnet *);
        int code;
+       boolean_t should_log_redirect = false;
 
        /* Expect 32-bit aligned data pointer on strict-align platforms */
        MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
@@ -578,11 +579,15 @@ deliver:
                 */
                if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp)
                    || IP_VHL_HL(icp->icmp_ip.ip_vhl) <
-                   (sizeof(struct ip) >> 2)) {
+                   (sizeof(struct ip) >> 2) ||
+                   (m = m_pullup(m, hlen + ICMP_ADVLEN(icp))) == NULL) {
                        icmpstat.icps_badlen++;
                        goto freeit;
                }
 
+               ip = mtod(m, struct ip *);
+               icp = (struct icmp *)(void *)(mtod(m, uint8_t *) + hlen);
+
 #if BYTE_ORDER != BIG_ENDIAN
                NTOHS(icp->icmp_ip.ip_len);
 #endif
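Editor's note: the m_pullup() added above can reallocate the mbuf chain (and frees it entirely on failure), so the ip and icp pointers derived from mtod() before the call are stale afterwards and must be recomputed — which is exactly what the two reloads do. The general shape of the pattern (a sketch, not a drop-in):

    if ((m = m_pullup(m, needed_len)) == NULL) {
        /* m_pullup freed the chain on failure: do not touch m again. */
        goto dropped;
    }
    /* The data may now live in a different cluster: refresh every
     * pointer previously derived from mtod(m, ...). */
    ip  = mtod(m, struct ip *);
    icp = (struct icmp *)(void *)(mtod(m, uint8_t *) + hlen);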
@@ -735,21 +740,6 @@ reflect:
                return;
 
        case ICMP_REDIRECT:
-               if (log_redirect) {
-                       u_int32_t src, dst, gw;
-
-                       src = ntohl(ip->ip_src.s_addr);
-                       dst = ntohl(icp->icmp_ip.ip_dst.s_addr);
-                       gw = ntohl(icp->icmp_gwaddr.s_addr);
-                       printf("icmp redirect from %d.%d.%d.%d: "
-                           "%d.%d.%d.%d => %d.%d.%d.%d\n",
-                           (int)(src >> 24), (int)((src >> 16) & 0xff),
-                           (int)((src >> 8) & 0xff), (int)(src & 0xff),
-                           (int)(dst >> 24), (int)((dst >> 16) & 0xff),
-                           (int)((dst >> 8) & 0xff), (int)(dst & 0xff),
-                           (int)(gw >> 24), (int)((gw >> 16) & 0xff),
-                           (int)((gw >> 8) & 0xff), (int)(gw & 0xff));
-               }
                if (drop_redirect) {
                        break;
                }
@@ -761,6 +751,12 @@ reflect:
                        icmpstat.icps_badlen++;
                        break;
                }
+
+#if (DEBUG | DEVELOPMENT)
+               should_log_redirect = log_redirect || (icmpprintfs > 0);
+#else
+               should_log_redirect = log_redirect;
+#endif
                /*
                 * Short circuit routing redirects to force
                 * immediate change in the kernel's routing
@@ -770,16 +766,18 @@ reflect:
                 */
                icmpgw.sin_addr = ip->ip_src;
                icmpdst.sin_addr = icp->icmp_gwaddr;
-#if (DEBUG | DEVELOPMENT)
-               if (icmpprintfs > 0) {
+
+               if (should_log_redirect) {
+                       char src_str[MAX_IPv4_STR_LEN];
                        char dst_str[MAX_IPv4_STR_LEN];
                        char gw_str[MAX_IPv4_STR_LEN];
 
+                       inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str));
                        inet_ntop(AF_INET, &icp->icmp_ip.ip_dst, dst_str, sizeof(dst_str));
                        inet_ntop(AF_INET, &icp->icmp_gwaddr, gw_str, sizeof(gw_str));
-                       printf("%s: redirect dst %s to %s\n", __func__, dst_str, gw_str);
+                       printf("%s: redirect dst %s to %s from %s\n", __func__,
+                           dst_str, gw_str, src_str);
                }
-#endif
                icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
                rtredirect(m->m_pkthdr.rcvif, (struct sockaddr *)&icmpsrc,
                    (struct sockaddr *)&icmpdst, NULL, RTF_GATEWAY | RTF_HOST,
index 35ee3066632f9b877b62490eac5e2be7520eb3fd..1cdd96affd587207d1956599d0508083c17f4271 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -1225,9 +1225,6 @@ ipfw_tags_done:
                ip_input_adjust(m, ip, inifp);
        }
 
-       /* for consistency */
-       m->m_pkthdr.pkt_proto = ip->ip_p;
-
        /* for netstat route statistics */
        src_ip = ip->ip_src;
        len = m->m_pkthdr.len;
@@ -2121,9 +2118,6 @@ tooshort:
                ip_input_adjust(m, ip, inifp);
        }
 
-       /* for consistency */
-       m->m_pkthdr.pkt_proto = ip->ip_p;
-
 #if DUMMYNET
 check_with_pf:
 #endif
@@ -2732,6 +2726,8 @@ found:
 
                ASSERT(trailer >= 0);
                if ((start != 0 && start != hlen) || trailer != 0) {
+                       uint32_t datalen = ip->ip_len - hlen;
+
 #if BYTE_ORDER != BIG_ENDIAN
                        if (start < hlen) {
                                HTONS(ip->ip_len);
@@ -2739,8 +2735,7 @@ found:
                        }
 #endif /* BYTE_ORDER != BIG_ENDIAN */
                        /* callee folds in sum */
-                       csum = m_adj_sum16(m, start, hlen,
-                           (ip->ip_len - hlen), csum);
+                       csum = m_adj_sum16(m, start, hlen, datalen, csum);
                        if (hlen > start) {
                                swbytes += (hlen - start);
                        } else {
@@ -3053,7 +3048,6 @@ found:
            (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
                /* loopback checksums are always OK */
                m->m_pkthdr.csum_data = 0xffff;
-               m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
                m->m_pkthdr.csum_flags =
                    CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
                    CSUM_IP_CHECKED | CSUM_IP_VALID;
@@ -3308,7 +3302,11 @@ ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
        struct in_addr *sin, dst;
        u_int32_t ntime;
        struct sockaddr_in ipaddr = {
-               sizeof(ipaddr), AF_INET, 0, { 0 }, { 0, }
+               .sin_len = sizeof(ipaddr),
+               .sin_family = AF_INET,
+               .sin_port = 0,
+               .sin_addr = { .s_addr = 0 },
+               .sin_zero = { 0, }
        };
 
        /* Expect 32-bit aligned data pointer on strict-align platforms */
@@ -3822,6 +3820,24 @@ ip_stripoptions(struct mbuf *m)
 #endif /* BYTE_ORDER != BIG_ENDIAN */
 
        ip->ip_len -= sizeof(struct ip);
+
+       /*
+        * Given that we've just stripped IP options from the header,
+        * we need to adjust the start offset accordingly if this
+        * packet had gone thru partial checksum offload.
+        */
+       if ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
+           (CSUM_DATA_VALID | CSUM_PARTIAL)) {
+               if (m->m_pkthdr.csum_rx_start >= (sizeof(struct ip) + olen)) {
+                       /* most common case */
+                       m->m_pkthdr.csum_rx_start -= olen;
+               } else {
+                       /* compute checksum in software instead */
+                       m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
+                       m->m_pkthdr.csum_data = 0;
+                       ipstat.ips_adj_hwcsum_clr++;
+               }
+       }
 }
 
 u_char inetctlerrmap[PRC_NCMDS] = {
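Editor's note: the block added to ip_stripoptions() keeps partial hardware checksums coherent. If the NIC began summing at or past the stripped options, the recorded start offset just slides back by the option length; if it began inside the removed bytes, the partial sum is unusable and the flags are cleared so the checksum is redone in software. A toy model of that decision (IP_HDR_LEN stands in for sizeof(struct ip)):

    #include <stdbool.h>
    #include <stddef.h>

    #define IP_HDR_LEN 20   /* sizeof(struct ip) without options */

    /* Returns false when the partial checksum had to be invalidated. */
    static bool
    adjust_csum_start(size_t *csum_rx_start, size_t olen)
    {
        if (*csum_rx_start >= IP_HDR_LEN + olen) {
            *csum_rx_start -= olen;  /* common case: slide the offset back */
            return true;
        }
        /* Start fell inside the removed options: caller must clear
         * CSUM_DATA_VALID and recompute the checksum in software. */
        return false;
    }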
@@ -3829,7 +3845,7 @@ u_char inetctlerrmap[PRC_NCMDS] = {
        0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
        ENETUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
        EMSGSIZE, EHOSTUNREACH, 0, 0,
-       0, 0, 0, 0,
+       0, 0, EHOSTUNREACH, 0,
        ENOPROTOOPT, ECONNREFUSED
 };
 
index ee8eef60f39cb825288c651de31820d72f09b4a9..16782affc91168fdeaef56144bda893ee03a0242 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -345,6 +345,7 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
                        boolean_t isbroadcast : 1;
                        boolean_t didfilter : 1;
                        boolean_t noexpensive : 1;      /* set once */
+                       boolean_t noconstrained : 1;      /* set once */
                        boolean_t awdl_unrestricted : 1;        /* set once */
 #if IPFIREWALL_FORWARD
                        boolean_t fwd_rewrite_src : 1;
@@ -362,7 +363,8 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
 #define IP_CHECK_RESTRICTIONS(_ifp, _ipobf)                             \
        (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) ||                \
         ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) ||          \
-        (IFNET_IS_INTCOPROC(_ifp)) ||                                  \
+        ((_ipobf).noconstrained && IFNET_IS_CONSTRAINED(_ifp)) ||      \
+         (IFNET_IS_INTCOPROC(_ifp)) ||                                 \
         (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
 
        if (ip_output_measure) {
@@ -497,6 +499,10 @@ ipfw_tags_done:
                        ipobf.noexpensive = TRUE;
                        ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE;
                }
+               if (ipoa->ipoa_flags & IPOAF_NO_CONSTRAINED) {
+                       ipobf.noconstrained = TRUE;
+                       ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_CONSTRAINED;
+               }
                if (ipoa->ipoa_flags & IPOAF_AWDL_UNRESTRICTED) {
                        ipobf.awdl_unrestricted = TRUE;
                }
@@ -1007,7 +1013,11 @@ loopit:
                         * on the outgoing interface, and the caller did not
                         * forbid loopback, loop back a copy.
                         */
-                       if (!TAILQ_EMPTY(&ipv4_filters)) {
+                       if (!TAILQ_EMPTY(&ipv4_filters)
+#if NECP
+                           && !necp_packet_should_skip_filters(m)
+#endif // NECP
+                           ) {
                                struct ipfilter *filter;
                                int seen = (inject_filter_ref == NULL);
 
@@ -1186,7 +1196,12 @@ sendit:
                }
        }
 
-       if (!ipobf.didfilter && !TAILQ_EMPTY(&ipv4_filters)) {
+       if (!ipobf.didfilter &&
+           !TAILQ_EMPTY(&ipv4_filters)
+#if NECP
+           && !necp_packet_should_skip_filters(m)
+#endif // NECP
+           ) {
                struct ipfilter *filter;
                int seen = (inject_filter_ref == NULL);
                ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
@@ -1241,7 +1256,7 @@ sendit:
 #if NECP
        /* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */
        necp_matched_policy_id = necp_ip_output_find_policy_match(m,
-           flags, (flags & IP_OUTARGS) ? ipoa : NULL, &necp_result, &necp_result_parameter);
+           flags, (flags & IP_OUTARGS) ? ipoa : NULL, ro ? ro->ro_rt : NULL, &necp_result, &necp_result_parameter);
        if (necp_matched_policy_id) {
                necp_mark_packet_from_ip(m, necp_matched_policy_id);
                switch (necp_result) {
@@ -1512,7 +1527,11 @@ sendit:
            7, 0xff, 0xff, 0xff, 0xff);
 
        /* Pass to filters again */
-       if (!TAILQ_EMPTY(&ipv4_filters)) {
+       if (!TAILQ_EMPTY(&ipv4_filters)
+#if NECP
+           && !necp_packet_should_skip_filters(m)
+#endif // NECP
+           ) {
                struct ipfilter *filter;
 
                ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
@@ -1850,16 +1869,6 @@ pass:
                }
        }
 
-       /*
-        * Some Wi-Fi AP implementations do not correctly handle multicast IP
-        * packets with DSCP bits set -- see radr://9331522 -- so as a
-        * workaround we clear the DSCP bits and set the service class to BE
-        */
-       if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && IFNET_IS_WIFI_INFRA(ifp)) {
-               ip->ip_tos &= IPTOS_ECN_MASK;
-               mbuf_set_service_class(m, MBUF_SC_BE);
-       }
-
        ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
            ip->ip_len, &sw_csum);
 
@@ -2559,6 +2568,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
        struct  inpcb *inp = sotoinpcb(so);
        int     error, optval;
+       lck_mtx_t *mutex_held = NULL;
 
        error = optval = 0;
        if (sopt->sopt_level != IPPROTO_IP) {
@@ -2567,6 +2577,21 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
 
        switch (sopt->sopt_dir) {
        case SOPT_SET:
+               mutex_held = socket_getlock(so, PR_F_WILLUNLOCK);
+               /*
+                *  Wait if we are in the middle of ip_output
+                *  as we unlocked the socket there and don't
+                *  want to overwrite the IP options
+                */
+               if (inp->inp_sndinprog_cnt > 0) {
+                       inp->inp_sndingprog_waiters++;
+
+                       while (inp->inp_sndinprog_cnt > 0) {
+                               msleep(&inp->inp_sndinprog_cnt, mutex_held,
+                                   PSOCK | PCATCH, "inp_sndinprog_cnt", NULL);
+                       }
+                       inp->inp_sndingprog_waiters--;
+               }
                switch (sopt->sopt_name) {
 #ifdef notyet
                case IP_RETOPTS:
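Editor's note: the msleep() loop added to the SOPT_SET path above is one half of a classic sleep/wakeup handshake keyed on &inp->inp_sndinprog_cnt. The matching wakeup side is not part of this excerpt; presumably the send path wakes waiters once its in-progress count drops to zero, along these lines (a sketch of the assumed counterpart, not code from this diff):

    inp->inp_sndinprog_cnt--;
    if (inp->inp_sndinprog_cnt == 0 && inp->inp_sndingprog_waiters > 0) {
        wakeup(&inp->inp_sndinprog_cnt);   /* releases the msleep() above */
    }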
index 9c2f730354d82265b16beffe7901192b95186a84..a9ecaa856251927350a7c94c73b87dd84439de10 100644 (file)
@@ -248,7 +248,7 @@ struct ip_moptions;
 /* flags passed to ip_output as last parameter */
 #define IP_FORWARDING   0x1             /* most of ip header exists */
 #define IP_RAWOUTPUT    0x2             /* raw ip header exists */
-#define IP_NOIPSEC      0x4             /* No IPSec processing */
+#define IP_NOIPSEC      0x4             /* No IPsec processing */
 #define IP_ROUTETOIF    SO_DONTROUTE    /* bypass routing tables (0x0010) */
 #define IP_ALLOWBROADCAST SO_BROADCAST  /* can send broadcast pkts (0x0020) */
 #define IP_OUTARGS      0x100           /* has ancillary output info */
@@ -297,6 +297,7 @@ struct ip_out_args {
 #define IPOAF_AWDL_UNRESTRICTED 0x00000040      /* can send over
                                                 *  AWDL_RESTRICTED */
 #define IPOAF_QOSMARKING_ALLOWED        0x00000080      /* policy allows Fastlane DSCP marking */
+#define IPOAF_NO_CONSTRAINED    0x00000100      /* skip IFXF_CONSTRAINED */
        u_int32_t       ipoa_retflags;  /* IPOARF return flags (see below) */
 #define IPOARF_IFDENIED 0x00000001      /* denied access to interface */
        int             ipoa_sotc;      /* traffic class for Fastlane DSCP mapping */
index 07ac305729985a53b735917fe898b05d93d2684b..b783eb2141cef8f539c43ce64e40c11d8ba7e69a 100644 (file)
@@ -85,7 +85,9 @@ __private_extern__ void
 ipf_ref(void)
 {
        lck_mtx_lock(kipf_lock);
-       kipf_ref++;
+       if (os_inc_overflow(&kipf_ref)) {
+               panic("kipf_ref overflow");
+       }
        lck_mtx_unlock(kipf_lock);
 }
 
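Editor's note: os_inc_overflow()/os_dec_overflow() come from XNU's <os/overflow.h> and are built on the compiler's checked-arithmetic builtins: they perform the update and return true only when it wrapped, which is why both call sites can panic on a true return. A userspace analogue of the increment:

    #include <stdbool.h>
    #include <stdio.h>

    static unsigned int kipf_ref;

    /* Analogue of os_inc_overflow(&kipf_ref): increments in place and
     * reports whether the addition wrapped. */
    static bool inc_overflows(unsigned int *p)
    {
        return __builtin_add_overflow(*p, 1u, p);
    }

    int main(void)
    {
        kipf_ref = ~0u;                 /* saturated counter */
        if (inc_overflows(&kipf_ref)) {
            fprintf(stderr, "kipf_ref overflow\n");  /* the kernel panics here */
            return 1;
        }
        return 0;
    }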
@@ -94,11 +96,10 @@ ipf_unref(void)
 {
        lck_mtx_lock(kipf_lock);
 
-       if (kipf_ref == 0) {
-               panic("ipf_unref: kipf_ref == 0\n");
+       if (os_dec_overflow(&kipf_ref)) {
+               panic("kipf_ref underflow");
        }
 
-       kipf_ref--;
        if (kipf_ref == 0 && kipf_delayed_remove != 0) {
                struct ipfilter *filter;
 
@@ -434,6 +435,9 @@ ipf_injectv4_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options)
                if (options->ippo_flags & IPPOF_NO_IFF_EXPENSIVE) {
                        ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
                }
+               if (options->ippo_flags & IPPOF_NO_IFF_CONSTRAINED) {
+                       ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED;
+               }
        }
 
        bzero(&ro, sizeof(struct route));
@@ -521,6 +525,9 @@ ipf_injectv6_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options)
                if (options->ippo_flags & IPPOF_NO_IFF_EXPENSIVE) {
                        ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
                }
+               if (options->ippo_flags & IPPOF_NO_IFF_CONSTRAINED) {
+                       ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED;
+               }
        }
 
        bzero(&ro, sizeof(struct route_in6));
index 1739bc7089fa65ab1aa26fa69ac2ff35e508ac5f..0aafb5c296365f369804fdde0735f85e05e806e5 100644 (file)
 #ifndef __KPI_IPFILTER__
 #define __KPI_IPFILTER__
 
-#include <sys/kernel_types.h>
+#ifndef PRIVATE
+#include <Availability.h>
+#define __NKE_API_DEPRECATED __API_DEPRECATED("Network Kernel Extension KPI is deprecated", macos(10.4, 10.15))
+#else
+#define __NKE_API_DEPRECATED
+#endif /* PRIVATE */
 
 /*
  * ipf_pktopts
@@ -50,14 +55,15 @@ struct ipf_pktopts {
        int                             ippo_mcast_loop;
        u_int8_t                        ippo_mcast_ttl;
 };
-#define IPPOF_MCAST_OPTS        0x1
+#define IPPOF_MCAST_OPTS            0x1
 #ifdef PRIVATE
-#define IPPOF_BOUND_IF          0x2
-#define IPPOF_NO_IFT_CELLULAR   0x4
-#define IPPOF_SELECT_SRCIF      0x8
-#define IPPOF_BOUND_SRCADDR     0x10
-#define IPPOF_SHIFT_IFSCOPE     16
-#define IPPOF_NO_IFF_EXPENSIVE  0x20
+#define IPPOF_BOUND_IF              0x2
+#define IPPOF_NO_IFT_CELLULAR       0x4
+#define IPPOF_SELECT_SRCIF          0x8
+#define IPPOF_BOUND_SRCADDR         0x10
+#define IPPOF_SHIFT_IFSCOPE         16
+#define IPPOF_NO_IFF_EXPENSIVE      0x20
+#define IPPOF_NO_IFF_CONSTRAINED    0x40
 #endif /* PRIVATE */
 
 typedef struct ipf_pktopts *ipf_pktopts_t;
@@ -72,7 +78,7 @@ __BEGIN_DECLS
  *               filter is called between when the general IP processing is
  *               handled and when the packet is passed up to the next layer
  *               protocol such as udp or tcp. In the case of encapsulation, such
- *               as UDP in ESP (IPSec), your filter will be called once for ESP
+ *               as UDP in ESP (IPsec), your filter will be called once for ESP
  *               and then again for UDP. This will give your filter an
  *               opportunity to process the ESP header as well as the decrypted
  *               packet. Offset and protocol are used to determine where in the
@@ -101,7 +107,7 @@ typedef errno_t (*ipf_input_func)(void *cookie, mbuf_t *data, int offset,
  *
  *       @discussion ipf_output_func is used to filter outbound ip packets.
  *               The IP filter is called for packets to all interfaces. The
- *               filter is called before fragmentation and IPSec processing. If
+ *               filter is called before fragmentation and IPsec processing. If
  *               you need to change the destination IP address, call
  *               ipf_inject_output and return EJUSTRETURN.
  *       @param cookie The cookie specified when your filter was attached.
@@ -164,7 +170,8 @@ extern errno_t ipf_addv4_internal(const struct ipf_filter *filter,
     ipf_addv4_internal((filter), (filter_ref))
 #else
 extern errno_t ipf_addv4(const struct ipf_filter *filter,
-    ipfilter_t *filter_ref);
+    ipfilter_t *filter_ref)
+__NKE_API_DEPRECATED;
 #endif /* KERNEL_PRIVATE */
 
 /*!
@@ -182,7 +189,8 @@ extern errno_t ipf_addv6_internal(const struct ipf_filter *filter,
     ipf_addv6_internal((filter), (filter_ref))
 #else
 extern errno_t ipf_addv6(const struct ipf_filter *filter,
-    ipfilter_t *filter_ref);
+    ipfilter_t *filter_ref)
+__NKE_API_DEPRECATED;
 #endif /* KERNEL_PRIVATE */
 
 /*!
@@ -192,7 +200,8 @@ extern errno_t ipf_addv6(const struct ipf_filter *filter,
  *               ipf_addv6.
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t ipf_remove(ipfilter_t filter_ref);
+extern errno_t ipf_remove(ipfilter_t filter_ref)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function ipf_inject_input
@@ -212,7 +221,8 @@ extern errno_t ipf_remove(ipfilter_t filter_ref);
  *       @param filter_ref The reference to the filter injecting the data
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t ipf_inject_input(mbuf_t data, ipfilter_t filter_ref);
+extern errno_t ipf_inject_input(mbuf_t data, ipfilter_t filter_ref)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function ipf_inject_output
@@ -231,7 +241,8 @@ extern errno_t ipf_inject_input(mbuf_t data, ipfilter_t filter_ref);
  *               will always free the mbuf.
  */
 extern errno_t ipf_inject_output(mbuf_t data, ipfilter_t filter_ref,
-    ipf_pktopts_t options);
+    ipf_pktopts_t options)
+__NKE_API_DEPRECATED;
 
 __END_DECLS
 #endif /* __KPI_IPFILTER__ */
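Editor's note: for a client built outside PRIVATE, __NKE_API_DEPRECATED expands to the availability attribute defined at the top of this header, so each annotated declaration effectively becomes:

    extern errno_t ipf_remove(ipfilter_t filter_ref)
    __API_DEPRECATED("Network Kernel Extension KPI is deprecated",
        macos(10.4, 10.15));
    /* Clang then emits -Wdeprecated-declarations at every call site. */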
index 6aabe6b8dfba633e52144e5e744e3a92864db9cf..581be9c54ac1cccc07cda9ae67f323aa1a127b10 100644 (file)
@@ -59,6 +59,15 @@ static boolean_t mp_ticking;
 static void mp_sched_timeout(void);
 static void mp_timeout(void *);
 
+static void
+mpp_lock_assert_held(struct mppcb *mp)
+{
+#if !MACH_ASSERT
+#pragma unused(mp)
+#endif
+       LCK_MTX_ASSERT(&mp->mpp_lock, LCK_MTX_ASSERT_OWNED);
+}
+
 void
 mp_pcbinit(void)
 {
@@ -222,7 +231,7 @@ mp_pcballoc(struct socket *so, struct mppcbinfo *mppi)
        mpp->mpp_socket = so;
        so->so_pcb = mpp;
 
-       error = mptcp_sescreate(mpp);
+       error = mptcp_session_create(mpp);
        if (error) {
                lck_mtx_destroy(&mpp->mpp_lock, mppi->mppi_lock_grp);
                zfree(mppi->mppi_zone, mpp);
@@ -233,6 +242,7 @@ mp_pcballoc(struct socket *so, struct mppcbinfo *mppi)
        mpp->mpp_flags |= MPP_ATTACHED;
        TAILQ_INSERT_TAIL(&mppi->mppi_pcbs, mpp, mpp_entry);
        mppi->mppi_count++;
+
        lck_mtx_unlock(&mppi->mppi_lock);
 
        return 0;
@@ -244,9 +254,6 @@ mp_pcbdetach(struct socket *mp_so)
        struct mppcb *mpp = mpsotomppcb(mp_so);
 
        mpp->mpp_state = MPPCB_STATE_DEAD;
-       if (!(mp_so->so_flags & SOF_PCBCLEARING)) {
-               mp_so->so_flags |= SOF_PCBCLEARING;
-       }
 
        mp_gc_sched();
 }
@@ -269,6 +276,16 @@ mp_pcbdispose(struct mppcb *mpp)
        VERIFY(mppi->mppi_count != 0);
        mppi->mppi_count--;
 
+       if (mppi->mppi_count == 0) {
+               if (mptcp_cellicon_refcount) {
+                       os_log_error(mptcp_log_handle, "%s: No more MPTCP-flows, but cell icon counter is %u\n",
+                           __func__, mptcp_cellicon_refcount);
+                       mptcp_clear_cellicon();
+                       mptcp_cellicon_refcount = 0;
+               }
+       }
+
+       VERIFY(mpp->mpp_inside == 0);
        mpp_unlock(mpp);
 
 #if NECP
index 0fc2a103dcad5a7b29c86dce695bb03ad2c6e9d8..e2cce3f7b2762be7041b0b7a71777251cb5bd7af 100644 (file)
@@ -54,6 +54,7 @@ struct mppcb {
        struct socket           *mpp_socket;    /* back pointer to socket */
        uint32_t                mpp_flags;      /* PCB flags */
        mppcb_state_t           mpp_state;      /* PCB state */
+       int32_t                 mpp_inside;     /* Indicates whether or not a thread is processing MPTCP */
 
 #if NECP
        uuid_t necp_client_uuid;
@@ -72,19 +73,17 @@ mpsotomppcb(struct socket *mp_so)
 #define MPP_ATTACHED            0x001
 #define MPP_INSIDE_OUTPUT       0x002           /* MPTCP-stack is inside mptcp_subflow_output */
 #define MPP_INSIDE_INPUT        0x004           /* MPTCP-stack is inside mptcp_subflow_input */
-#define MPP_RUPCALL             0x008           /* MPTCP-stack is handling a read upcall */
+#define MPP_INPUT_HANDLE        0x008           /* MPTCP-stack is handling input */
 #define MPP_WUPCALL             0x010           /* MPTCP-stack is handling a write upcall */
 #define MPP_SHOULD_WORKLOOP     0x020           /* MPTCP-stack should call the workloop function */
 #define MPP_SHOULD_RWAKEUP      0x040           /* MPTCP-stack should call sorwakeup */
 #define MPP_SHOULD_WWAKEUP      0x080           /* MPTCP-stack should call sowwakeup */
 #define MPP_CREATE_SUBFLOWS     0x100           /* This connection needs to create subflows */
-#define MPP_SET_CELLICON        0x200           /* Set the cellicon (deferred) */
-#define MPP_UNSET_CELLICON      0x400           /* Unset the cellicon (deferred) */
 
 static inline boolean_t
 mptcp_should_defer_upcall(struct mppcb *mpp)
 {
-       return !!(mpp->mpp_flags & (MPP_INSIDE_OUTPUT | MPP_INSIDE_INPUT | MPP_RUPCALL | MPP_WUPCALL));
+       return !!(mpp->mpp_flags & (MPP_INSIDE_OUTPUT | MPP_INSIDE_INPUT | MPP_INPUT_HANDLE | MPP_WUPCALL));
 }
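
Callers use mptcp_should_defer_upcall() to decide between waking the MPTCP
socket directly and recording the wakeup in mpp_flags so the thread already
inside the stack replays it on exit (the MPP_SHOULD_WWAKEUP replay is
visible in mptcp_handle_deferred_upcalls() further down). A minimal sketch
of the caller side; the function name is hypothetical:

static void
example_rwakeup(struct mppcb *mpp)
{
        if (mptcp_should_defer_upcall(mpp)) {
                /* A thread is inside the stack; it replays this later. */
                mpp->mpp_flags |= MPP_SHOULD_RWAKEUP;
        } else {
                sorwakeup(mpp->mpp_socket);
        }
}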
 
 /*
index c40a144a57602b7b94dcffed5b148df5fe1753c5..a2883309e5a9c756d20a56b522cd6e58b1536059 100644 (file)
@@ -111,8 +111,12 @@ int mptcp_enable = 1;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
     &mptcp_enable, 0, "Enable Multipath TCP Support");
 
-/* Number of times to try negotiating MPTCP on SYN retransmissions */
-int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
+/*
+ * Number of times to try negotiating MPTCP on SYN retransmissions.
+ * We haven't seen reports of middleboxes that drop every SYN segment
+ * carrying an MPTCP option, so be generous and retransmit it 4 times.
+ */
+int mptcp_mpcap_retries = 4;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
     CTLFLAG_RW | CTLFLAG_LOCKED,
     &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
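
Since the knob is exported read-write, the retry count can be inspected or
tuned from userspace. A minimal sketch using sysctlbyname(3) and the
net.inet.mptcp.mptcp_cap_retr name declared above:

#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int retries = 0;
        size_t len = sizeof(retries);

        if (sysctlbyname("net.inet.mptcp.mptcp_cap_retr", &retries, &len,
            NULL, 0) == 0) {
                printf("MP_CAPABLE SYN retries: %d\n", retries);
        }
        return 0;
}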
@@ -178,7 +182,8 @@ static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t,
 static int
 mptcp_reass_present(struct socket *mp_so)
 {
-       struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
+       struct mptses *mpte = mpsotompte(mp_so);
+       struct mptcb *mp_tp = mpte->mpte_mptcb;
        struct tseg_qent *q;
        int dowakeup = 0;
        int flags = 0;
@@ -363,11 +368,11 @@ mptcp_input(struct mptses *mpte, struct mbuf *m)
 
        VERIFY(m->m_flags & M_PKTHDR);
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-
        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
 
+       socket_lock_assert_owned(mp_so);
+
        DTRACE_MPTCP(input);
 
        mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
@@ -437,9 +442,6 @@ fallback:
                        mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
                        socantrcvmore(mp_so);
                }
-
-               mptcplog((LOG_DEBUG, "%s: Fallback read %d bytes\n", __func__,
-                   count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
                return;
        }
 
@@ -449,6 +451,8 @@ fallback:
                int64_t todrop;
                int mb_dfin = 0;
 
+               VERIFY(m->m_flags & M_PKTHDR);
+
                /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
                if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
                        goto fallback;
@@ -482,6 +486,11 @@ fallback:
                if (todrop > 0) {
                        tcpstat.tcps_mptcp_rcvpackafterwin++;
 
+                       os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                           (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
+                           mp_tp->mpt_rcvwnd, todrop);
+
                        if (todrop >= mb_datalen) {
                                if (freelist == NULL) {
                                        freelist = m;
@@ -501,6 +510,7 @@ fallback:
                        } else {
                                m_adj(m, -todrop);
                                mb_datalen -= todrop;
+                               m->m_pkthdr.mp_rlen -= todrop;
                        }
 
                        /*
@@ -510,7 +520,6 @@ fallback:
                        m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
                }
 
-
                if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
                        if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
                            mp_tp->mpt_rcvnxt)) {
@@ -531,10 +540,11 @@ fallback:
                                continue;
                        } else {
                                m_adj(m, (mp_tp->mpt_rcvnxt - mb_dsn));
+                               mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
+                               mb_dsn = mp_tp->mpt_rcvnxt;
+                               m->m_pkthdr.mp_rlen = mb_datalen;
+                               m->m_pkthdr.mp_dsn = mb_dsn;
                        }
-                       mptcplog((LOG_INFO, "%s: Left Edge %llu\n", __func__,
-                           mp_tp->mpt_rcvnxt),
-                           MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
                }
 
                if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
@@ -559,8 +569,6 @@ fallback:
                count = mp_so->so_rcv.sb_cc - count;
                tcpstat.tcps_mp_rcvtotal++;
                tcpstat.tcps_mp_rcvbytes += count;
-               mptcplog((LOG_DEBUG, "%s: Read %d bytes\n", __func__, count),
-                   MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
 
                mp_tp->mpt_rcvnxt += count;
 
@@ -637,41 +645,29 @@ mptcp_output(struct mptses *mpte)
        uint64_t old_snd_nxt;
        int error = 0;
 
-       mpte_lock_assert_held(mpte);
        mp_so = mptetoso(mpte);
+       socket_lock_assert_owned(mp_so);
        mp_tp = mpte->mpte_mptcb;
 
        VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
        mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
 
-       mptcplog((LOG_DEBUG, "%s: snxt %u sndmax %u suna %u swnd %u reinjectq %u state %u\n",
-           __func__, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
-           (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_sndwnd,
-           mpte->mpte_reinjectq ? 1 : 0,
-           mp_tp->mpt_state),
-           MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
-
        old_snd_nxt = mp_tp->mpt_sndnxt;
        while (mptcp_can_send_more(mp_tp, FALSE)) {
                /* get the "best" subflow to be used for transmission */
-               mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts);
+               mpts = mptcp_get_subflow(mpte, &preferred_mpts);
                if (mpts == NULL) {
                        mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
                            MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
                        break;
                }
 
-               mptcplog((LOG_DEBUG, "%s: using id %u\n", __func__, mpts->mpts_connid),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
-
                /* In case there's just one flow, we reattempt later */
                if (mpts_tried != NULL &&
                    (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
                        mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
                        mpts_tried->mpts_flags |= MPTSF_ACTIVE;
                        mptcp_start_timer(mpte, MPTT_REXMT);
-                       mptcplog((LOG_DEBUG, "%s: retry later\n", __func__),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
                        break;
                }
 
@@ -691,11 +687,6 @@ mptcp_output(struct mptses *mpte)
                                    min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
                                    tcp_autosndbuf_max)) == 1) {
                                        mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
-
-                                       mptcplog((LOG_DEBUG, "%s: increased snd hiwat to %u lowat %u\n",
-                                           __func__, mp_so->so_snd.sb_hiwat,
-                                           mp_so->so_snd.sb_lowat),
-                                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
                                }
                        }
                }
@@ -709,9 +700,9 @@ mptcp_output(struct mptses *mpte)
                        mpts->mpts_flags &= ~MPTSF_ACTIVE;
                        mpts_tried = mpts;
                        if (error != ECANCELED) {
-                               mptcplog((LOG_ERR, "%s: Error = %d mpts_flags %#x\n", __func__,
-                                   error, mpts->mpts_flags),
-                                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+                               os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                                   error, mpts->mpts_flags);
                        }
                        break;
                }
@@ -738,14 +729,6 @@ mptcp_output(struct mptses *mpte)
                if (mpte->mpte_active_sub == NULL) {
                        mpte->mpte_active_sub = mpts;
                } else if (mpte->mpte_active_sub != mpts) {
-                       struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
-                       struct tcpcb *acttp = sototcpcb(mpte->mpte_active_sub->mpts_socket);
-
-                       mptcplog((LOG_DEBUG, "%s: switch [%u, srtt %d] to [%u, srtt %d]\n", __func__,
-                           mpte->mpte_active_sub->mpts_connid, acttp->t_srtt >> TCP_RTT_SHIFT,
-                           mpts->mpts_connid, tp->t_srtt >> TCP_RTT_SHIFT),
-                           (MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG), MPTCP_LOGLVL_LOG);
-
                        mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
                        mpte->mpte_active_sub = mpts;
 
@@ -807,11 +790,25 @@ mptcp_return_subflow(struct mptsub *mpts)
        return mpts;
 }
 
+static boolean_t
+mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
+{
+       struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
+       int fail_thresh = mptcp_fail_thresh;
+
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
+               fail_thresh *= 2;
+       }
+
+       return tp->t_rxtshift >= fail_thresh &&
+              (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
+}
+
 /*
  * Return the most eligible subflow to be used for sending data.
  */
 struct mptsub *
-mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred)
+mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
 {
        struct tcpcb *besttp, *secondtp;
        struct inpcb *bestinp, *secondinp;
@@ -830,8 +827,8 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
                struct tcpcb *tp = sototcpcb(so);
                struct inpcb *inp = sotoinpcb(so);
 
-               mptcplog((LOG_DEBUG, "%s mpts %u ignore %d, mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
-                   __func__, mpts->mpts_connid, ignore ? ignore->mpts_connid : -1, mpts->mpts_flags,
+               mptcplog((LOG_DEBUG, "%s mpts %u mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
+                   __func__, mpts->mpts_connid, mpts->mpts_flags,
                    INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
                    inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
                    tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
@@ -842,7 +839,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
                 * First, the hard conditions to reject subflows
                 * (e.g., not connected,...)
                 */
-               if (mpts == ignore || inp->inp_last_outifp == NULL) {
+               if (inp->inp_last_outifp == NULL) {
                        continue;
                }
 
@@ -920,7 +917,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
                 * Only handover if Symptoms tells us to do so.
                 */
                if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
-                   mptcp_is_wifi_unusable(mpte) != 0 && mptcp_subflow_is_bad(mpte, best)) {
+                   mptcp_is_wifi_unusable_for_session(mpte) != 0 && mptcp_subflow_is_slow(mpte, best)) {
                        return mptcp_return_subflow(second_best);
                }
 
@@ -931,7 +928,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
 
                /* Adjust with symptoms information */
                if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
-                   mptcp_is_wifi_unusable(mpte) != 0) {
+                   mptcp_is_wifi_unusable_for_session(mpte) != 0) {
                        rtt_thresh /= 2;
                        rto_thresh /= 2;
                }
@@ -948,7 +945,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
                        return mptcp_return_subflow(second_best);
                }
 
-               if (mptcp_subflow_is_bad(mpte, best) &&
+               if (mptcp_subflow_is_slow(mpte, best) &&
                    secondtp->t_rxtshift == 0) {
                        return mptcp_return_subflow(second_best);
                }
@@ -972,7 +969,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
                 * has some space in the congestion-window.
                 */
                return mptcp_return_subflow(best);
-       } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
+       } else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
                struct mptsub *tmp;
 
                /*
@@ -1062,7 +1059,10 @@ mptcp_state_to_str(mptcp_state_t state)
 void
 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
 {
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
+       struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
+
+       socket_lock_assert_owned(mp_so);
+
        mptcp_state_t old_state = mp_tp->mpt_state;
 
        DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
@@ -1161,20 +1161,16 @@ mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
     uint16_t csum)
 {
        if (mdss_data_len == 0) {
-               mptcplog((LOG_INFO, "%s: Infinite Mapping.\n", __func__),
-                   MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
+               os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));
 
                if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
-                       mptcplog((LOG_ERR, "%s: Bad checksum %x \n", __func__,
-                           csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
+                       os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
                }
                mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
                return;
        }
-       mptcplog((LOG_DEBUG,
-           "%s: seqn = %u len = %u full = %u rcvnxt = %u \n", __func__,
-           seqn, mdss_data_len, (uint32_t)full_dsn, (uint32_t)mp_tp->mpt_rcvnxt),
-           MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
 
        mptcp_notify_mpready(tp->t_inpcb->inp_socket);
 
@@ -1200,9 +1196,8 @@ mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
 
        /* unacceptable DSS option, fallback to TCP */
        if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
-               mptcplog((LOG_ERR, "%s: mbuf len %d, MPTCP expected %d",
-                   __func__, m->m_pkthdr.len, datalen),
-                   MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
+               os_log_error(mptcp_log_handle, "%s - %lx: mbuf len %d, MPTCP expected %d",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), m->m_pkthdr.len, datalen);
        } else {
                return 0;
        }
@@ -1369,18 +1364,6 @@ mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
 
                sowwakeup(mpp->mpp_socket);
        }
-
-       if (mpp->mpp_flags & MPP_SET_CELLICON) {
-               mpp->mpp_flags &= ~MPP_SET_CELLICON;
-
-               mptcp_set_cellicon(mpp->mpp_pcbe);
-       }
-
-       if (mpp->mpp_flags & MPP_UNSET_CELLICON) {
-               mpp->mpp_flags &= ~MPP_UNSET_CELLICON;
-
-               mptcp_unset_cellicon();
-       }
 }
 
 void
@@ -1396,10 +1379,7 @@ mptcp_ask_for_nat64(struct ifnet *ifp)
 static void
 mptcp_reset_itfinfo(struct mpt_itf_info *info)
 {
-       info->ifindex = 0;
-       info->has_v4_conn = 0;
-       info->has_v6_conn = 0;
-       info->has_nat64_conn = 0;
+       memset(info, 0, sizeof(*info));
 }
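
Collapsing the field-by-field wipe into a single memset() also keeps the
reset correct if struct mpt_itf_info ever gains new members.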
 
 void
@@ -1425,8 +1405,10 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
                return;
        }
 
+       mp_so = mptetoso(mpte);
+
        if (action != NECP_CLIENT_CBACTION_INITIAL) {
-               mpte_lock(mpte);
+               socket_lock(mp_so, 1);
                locked = 1;
 
                /* Check again, because it might have changed while waiting */
@@ -1435,13 +1417,13 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
                }
        }
 
-       mpte_lock_assert_held(mpte);
+       socket_lock_assert_owned(mp_so);
 
        mp_tp = mpte->mpte_mptcb;
-       mp_so = mptetoso(mpte);
 
-       os_log_info(mptcp_log_handle, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
-           __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
+       os_log_info(mptcp_log_handle, "%s - %lx: action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
+           mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
            has_v4, has_v6, has_nat64, low_power);
 
        /* No need on fallen back sockets */
@@ -1472,6 +1454,7 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
        } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
            action == NECP_CLIENT_CBACTION_INITIAL) {
                int found_slot = 0, slot_index = -1;
+               struct sockaddr *dst;
                struct ifnet *ifp;
 
                ifnet_head_lock_shared();
@@ -1487,6 +1470,11 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
                        goto out;
                }
 
+               if (IFNET_IS_CONSTRAINED(ifp) &&
+                   (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
+                       goto out;
+               }
+
                if (IFNET_IS_CELLULAR(ifp) &&
                    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
                        goto out;
@@ -1526,8 +1514,9 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
                        }
                }
 
-               if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) &&
-                   !has_nat64 && !has_v4) {
+               dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
+               if (dst && (dst->sa_family == AF_INET || dst->sa_family == 0) &&
+                   has_v6 && !has_nat64 && !has_v4) {
                        if (found_slot) {
                                mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
                                mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
@@ -1542,8 +1531,8 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
                        struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);
 
                        if (info == NULL) {
-                               os_log_error(mptcp_log_handle, "%s malloc failed for %u\n",
-                                   __func__, new_size);
+                               os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
                                goto out;
                        }
 
@@ -1571,7 +1560,7 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
 
 out:
        if (locked) {
-               mpte_unlock(mpte);
+               socket_unlock(mp_so, 1);
        }
 }
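
Note the locking shape of this callback: the INITIAL action arrives with
the socket lock already held, every other action takes it here, and the
locked flag keeps lock and unlock balanced on all exit paths. A minimal
sketch of the same pattern, with hypothetical names:

static void
example_necp_cb(struct example *ex, int action)
{
        int locked = 0;

        if (action != NECP_CLIENT_CBACTION_INITIAL) {
                socket_lock(ex->ex_so, 1);
                locked = 1;
                /* Re-validate: state may have changed before the lock. */
        }
        socket_lock_assert_owned(ex->ex_so);

        /* ... act on the action ... */

        if (locked) {
                socket_unlock(ex->ex_so, 1);
        }
}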
 
@@ -1581,7 +1570,7 @@ mptcp_set_restrictions(struct socket *mp_so)
        struct mptses *mpte = mpsotompte(mp_so);
        uint32_t i;
 
-       mpte_lock_assert_held(mpte);
+       socket_lock_assert_owned(mp_so);
 
        ifnet_head_lock_shared();
 
@@ -1604,6 +1593,11 @@ mptcp_set_restrictions(struct socket *mp_so)
                        info->ifindex = IFSCOPE_NONE;
                }
 
+               if (IFNET_IS_CONSTRAINED(ifp) &&
+                   (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
+                       info->ifindex = IFSCOPE_NONE;
+               }
+
                if (IFNET_IS_CELLULAR(ifp) &&
                    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
                        info->ifindex = IFSCOPE_NONE;
index d98f50e83d0391a76d9a58d2b36a654891ea3844..122476f652713bab5ce17bb448a9f9f1139159c8 100644 (file)
  * Used to establish an MPTCP connection and first subflow.
  */
 struct mptcp_mpcapable_opt_common {
-       u_int8_t        mmco_kind;
-       u_int8_t        mmco_len;
+       uint8_t        mmco_kind;
+       uint8_t        mmco_len;
 #if BYTE_ORDER == LITTLE_ENDIAN
-       u_int8_t        mmco_version:4,
+       uint8_t        mmco_version:4,
            mmco_subtype:4;
 #else /* BIG_ENDIAN */
-       u_int8_t        mmco_subtype:4,
+       uint8_t        mmco_subtype:4,
            mmco_version:4;
 #endif
 #define MPCAP_PROPOSAL_SBIT     0x01    /* SHA1 Algorithm */
@@ -79,11 +79,10 @@ struct mptcp_mpcapable_opt_common {
 #define MPCAP_FBIT              0x04    /* must be 0 */
 #define MPCAP_EBIT              0x08    /* must be 0 */
 #define MPCAP_DBIT              0x10    /* must be 0 */
-#define MPCAP_CBIT              0x20    /* must be 0 */
+#define MPCAP_UNICAST_IPBIT     0x20    /* Whether MPTCP should only use ADD_ADDR IPs for new subflows */
 #define MPCAP_BBIT              0x40    /* Extensibility bit, must be 0 */
-#define MPCAP_ABIT              0x80    /* alias of MPCAP_CHECKSUM_CBIT */
 #define MPCAP_CHECKSUM_CBIT     0x80    /* DSS Checksum bit */
-       u_int8_t        mmco_flags;
+       uint8_t        mmco_flags;
 } __attribute__((__packed__));
 
 struct mptcp_mpcapable_opt_rsp {
@@ -105,86 +104,53 @@ struct mptcp_mpcapable_opt_rsp1 {
 
 /* MP_JOIN Option for SYN */
 struct mptcp_mpjoin_opt_req {
-       u_int8_t        mmjo_kind;
-       u_int8_t        mmjo_len;
+       uint8_t        mmjo_kind;
+       uint8_t        mmjo_len;
 #define MPTCP_BACKUP    0x1
-       u_int8_t        mmjo_subtype_bkp;
-       u_int8_t        mmjo_addr_id;
-       u_int32_t       mmjo_peer_token;
-       u_int32_t       mmjo_rand;
+       uint8_t        mmjo_subtype_bkp;
+       uint8_t        mmjo_addr_id;
+       uint32_t       mmjo_peer_token;
+       uint32_t       mmjo_rand;
 } __attribute__((__packed__));
 
 /* MP_JOIN Option for SYN/ACK */
 struct mptcp_mpjoin_opt_rsp {
-       u_int8_t        mmjo_kind;
-       u_int8_t        mmjo_len;
+       uint8_t        mmjo_kind;
+       uint8_t        mmjo_len;
 #define MPTCP_BACKUP    0x1
-       u_int8_t        mmjo_subtype_bkp;
-       u_int8_t        mmjo_addr_id;
-       u_int64_t       mmjo_mac; /* Truncated message auth code */
-       u_int32_t       mmjo_rand;
+       uint8_t        mmjo_subtype_bkp;
+       uint8_t        mmjo_addr_id;
+       uint64_t       mmjo_mac; /* Truncated message auth code */
+       uint32_t       mmjo_rand;
 } __attribute__((__packed__));
 
 /* MP_Join Option for ACK */
 struct mptcp_mpjoin_opt_rsp2 {
-       u_int8_t        mmjo_kind;
-       u_int8_t        mmjo_len;
+       uint8_t        mmjo_kind;
+       uint8_t        mmjo_len;
 #if BYTE_ORDER == LITTLE_ENDIAN
-       u_int8_t        mmjo_reserved1:4,
+       uint8_t        mmjo_reserved1:4,
            mmjo_subtype:4;
 #else /* BIG_ENDIAN */
-       u_int8_t        mmjo_subtype:4,
+       uint8_t        mmjo_subtype:4,
            mmjo_reserved1:4;
 #endif
-       u_int8_t        mmjo_reserved2;
-       u_int8_t        mmjo_mac[SHA1_RESULTLEN]; /* This is 160 bits HMAC SHA-1 per RFC */
-} __attribute__((__packed__));
-
-
-/*
- * MPTCP ADD_ADDR and REMOVE_ADDR TCP Options
- *
- * ADD_ADDR option shall be ignored by this implementation
- * REMOVE_ADDR option shall be sent to help flush dead subflows
- */
-
-/* Add Address Option */
-struct mptcp_addaddr_opt {
-       u_int8_t        ma_kind;
-       u_int8_t        ma_len;
-#if BYTE_ORDER == LITTLE_ENDIAN
-       u_int8_t        ma_ipver:4,
-           ma_subtype:4;
-#else /* BIG_ENDIAN */
-       u_int8_t        ma_subtype:4,
-           ma_ipver:4;
-#endif
-#define MA_IPVer_V4 4   /* IPv4 Address tagged to the option */
-#define MA_IPVer_V6 6   /* IPv6 Address tagged to the option */
-       u_int8_t        ma_addr_id;
-} __attribute__((__packed__));
-
-/* Address sent in the ADD_ADDR option */
-struct mptcp_addr_family_val {
-       union {
-               struct in_addr  ma_v4_addr;
-               struct in6_addr ma_v6_addr;
-       } ma_addr;
-       /* u_int16_t ma_ports; */       /* optional field */
+       uint8_t        mmjo_reserved2;
+       uint8_t        mmjo_mac[SHA1_RESULTLEN]; /* The full 160-bit HMAC-SHA1, per the RFC */
 } __attribute__((__packed__));
 
 /* Remove Address Option */
 struct mptcp_remaddr_opt {
-       u_int8_t        mr_kind;
-       u_int8_t        mr_len;
+       uint8_t        mr_kind;
+       uint8_t        mr_len;
 #if BYTE_ORDER == LITTLE_ENDIAN
-       u_int8_t        mr_rest:4,
+       uint8_t        mr_rest:4,
            mr_subtype:4;
 #else /* BIG_ENDIAN */
-       u_int8_t        mr_subtype:4,
+       uint8_t        mr_subtype:4,
            mr_rest:4;
 #endif
-       u_int8_t        mr_addr_id;
+       uint8_t        mr_addr_id;
 } __attribute__((__packed__));
 
 /*
@@ -205,85 +171,85 @@ struct mptcp_remaddr_opt {
 
 /* DSS fields common to all DSS option variants */
 struct mptcp_dss_copt {
-       u_int8_t        mdss_kind;
-       u_int8_t        mdss_len;
+       uint8_t        mdss_kind;
+       uint8_t        mdss_len;
 #if BYTE_ORDER == LITTLE_ENDIAN
-       u_int8_t        mdss_reserved1:4,
+       uint8_t        mdss_reserved1:4,
            mdss_subtype:4;
 #else /* BIG_ENDIAN */
-       u_int8_t        mdss_subtype:4,
+       uint8_t        mdss_subtype:4,
            mdss_reserved1:4;
 #endif
-       u_int8_t        mdss_flags;
+       uint8_t        mdss_flags;
 }__attribute__((__packed__));
 
 /* 32-bit DSS option */
 struct mptcp_dsn_opt {
        struct mptcp_dss_copt   mdss_copt;
-       u_int32_t       mdss_dsn;               /* Data Sequence Number */
-       u_int32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
-       u_int16_t       mdss_data_len;          /* Data Length */
-       /* u_int16_t    mdss_xsum; */           /* Data checksum - optional */
+       uint32_t       mdss_dsn;               /* Data Sequence Number */
+       uint32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
+       uint16_t       mdss_data_len;          /* Data Length */
+       /* uint16_t     mdss_xsum; */           /* Data checksum - optional */
 }__attribute__((__packed__));
 
 /* 64-bit DSS option */
 struct mptcp_dsn64_opt {
        struct mptcp_dss_copt   mdss_copt;
-       u_int64_t       mdss_dsn;               /* Data Sequence Number */
-       u_int32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
-       u_int16_t       mdss_data_len;          /* Data Length */
-       /* u_int16_t    mdss_xsum; */           /* Data checksum - optional */
+       uint64_t       mdss_dsn;               /* Data Sequence Number */
+       uint32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
+       uint16_t       mdss_data_len;          /* Data Length */
+       /* uint16_t     mdss_xsum; */           /* Data checksum - optional */
 }__attribute__((__packed__));
 
 /* 32-bit DSS Data ACK option */
 struct mptcp_data_ack_opt {
        struct mptcp_dss_copt   mdss_copt;
-       u_int32_t               mdss_ack;
+       uint32_t               mdss_ack;
 }__attribute__((__packed__));
 
 /* 64-bit DSS Data ACK option */
 struct mptcp_data_ack64_opt {
        struct mptcp_dss_copt   mdss_copt;
-       u_int64_t               mdss_ack;
+       uint64_t               mdss_ack;
 }__attribute__((__packed__));
 
 /* 32-bit DSS+Data ACK option */
 struct mptcp_dss_ack_opt {
        struct mptcp_dss_copt   mdss_copt;
-       u_int32_t       mdss_ack;               /* Data ACK */
-       u_int32_t       mdss_dsn;               /* Data Sequence Number */
-       u_int32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
-       u_int16_t       mdss_data_len;          /* Data Length */
-       /* u_int16_t mdss_xsum; */              /* Data checksum - optional */
+       uint32_t       mdss_ack;               /* Data ACK */
+       uint32_t       mdss_dsn;               /* Data Sequence Number */
+       uint32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
+       uint16_t       mdss_data_len;          /* Data Length */
+       /* uint16_t mdss_xsum; */               /* Data checksum - optional */
 }__attribute__((__packed__));
 
 /* 64-bit DSS+Data ACK option */
 struct mptcp_dss64_ack64_opt {
        struct mptcp_dss_copt   mdss_copt;
-       u_int64_t       mdss_ack;               /* Data ACK */
-       u_int64_t       mdss_dsn;               /* Data Sequence Number */
-       u_int32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
-       u_int16_t       mdss_data_len;          /* Data Length */
-       /* u_int16_t mdss_xsum; */              /* Data checksum - optional */
+       uint64_t       mdss_ack;               /* Data ACK */
+       uint64_t       mdss_dsn;               /* Data Sequence Number */
+       uint32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
+       uint16_t       mdss_data_len;          /* Data Length */
+       /* uint16_t mdss_xsum; */               /* Data checksum - optional */
 }__attribute__((__packed__));
 
 /* DSS+Data ACK mixed option variants */
 struct mptcp_dss32_ack64_opt {
        struct mptcp_dss_copt   mdss_copt;
-       u_int64_t       mdss_ack;               /* Data ACK */
-       u_int32_t       mdss_dsn;               /* Data Sequence Number */
-       u_int32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
-       u_int16_t       mdss_data_len;          /* Data Length */
-       /* u_int16_t mdss_xsum; */              /* Data checksum - optional */
+       uint64_t       mdss_ack;               /* Data ACK */
+       uint32_t       mdss_dsn;               /* Data Sequence Number */
+       uint32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
+       uint16_t       mdss_data_len;          /* Data Length */
+       /* uint16_t mdss_xsum; */               /* Data checksum - optional */
 }__attribute__((__packed__));
 
 struct mptcp_dss64_ack32_opt {
        struct mptcp_dss_copt   mdss_copt;
-       u_int32_t       mdss_ack;               /* Data ACK */
-       u_int64_t       mdss_dsn;               /* Data Sequence Number */
-       u_int32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
-       u_int16_t       mdss_data_len;          /* Data Length */
-       /* u_int16_t mdss_xsum; */              /* Data checksum - optional */
+       uint32_t       mdss_ack;               /* Data ACK */
+       uint64_t       mdss_dsn;               /* Data Sequence Number */
+       uint32_t       mdss_subflow_seqn;      /* Relative Subflow Seq Num */
+       uint16_t       mdss_data_len;          /* Data Length */
+       /* uint16_t mdss_xsum; */               /* Data checksum - optional */
 }__attribute__((__packed__));
 
 
@@ -295,17 +261,17 @@ struct mptcp_dss64_ack32_opt {
  * API is supported.
  */
 struct mptcp_fastclose_opt {
-       u_int8_t        mfast_kind;
-       u_int8_t        mfast_len;
+       uint8_t        mfast_kind;
+       uint8_t        mfast_len;
 #if BYTE_ORDER == LITTLE_ENDIAN
-       u_int8_t        mfast_reserved:4,
+       uint8_t        mfast_reserved:4,
            mfast_subtype:4;
 #else /* BIG_ENDIAN */
-       u_int8_t        mfast_subtype:4,
+       uint8_t        mfast_subtype:4,
            mfast_reserved:4;
 #endif
-       u_int8_t        mfast_reserved1;
-       u_int64_t       mfast_key;              /* Option receiver's key */
+       uint8_t        mfast_reserved1;
+       uint64_t       mfast_key;              /* Option receiver's key */
 }__attribute__((__packed__));
 
 /*
@@ -316,19 +282,44 @@ struct mptcp_fastclose_opt {
  * option.
  */
 struct mptcp_mpfail_opt {
-       u_int8_t        mfail_kind;
-       u_int8_t        mfail_len;
+       uint8_t        mfail_kind;
+       uint8_t        mfail_len;
 #if BYTE_ORDER == LITTLE_ENDIAN
-       u_int8_t        mfail_reserved:4,
+       uint8_t        mfail_reserved:4,
            mfail_subtype:4;
 #else /* BIG_ENDIAN */
-       u_int8_t        mfail_subtype:4,
+       uint8_t        mfail_subtype:4,
            mfail_reserved:4;
 #endif
-       u_int8_t        mfail_reserved1:8;
-       u_int64_t       mfail_dsn;
+       uint8_t        mfail_reserved1:8;
+       uint64_t       mfail_dsn;
+}__attribute__((__packed__));
+
+struct mptcp_add_addr_opt {
+       uint8_t         maddr_kind;
+       uint8_t         maddr_len;
+#if BYTE_ORDER == LITTLE_ENDIAN
+       uint8_t         maddr_ipversion:4,
+           maddr_subtype:4;
+#else /* BIG_ENDIAN */
+       uint8_t         maddr_subtype:4,
+           maddr_ipversion:4;
+#endif
+       uint8_t         maddr_addrid;
+       union {
+               struct {
+                       struct in_addr maddr_addrv4;
+                       uint32_t maddr_pad[3];
+               };
+
+               struct {
+                       struct in6_addr maddr_addrv6;
+               };
+       } maddr_u;
 }__attribute__((__packed__));
 
+#define MPTCP_ADD_ADDR_OPT_LEN_V4       8
+#define MPTCP_ADD_ADDR_OPT_LEN_V6       20
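
The two lengths are the option's four fixed bytes (kind, length,
version/subtype, address ID) plus a bare address, with no port and no
truncated HMAC. A compile-time cross-check, as a sketch (assuming C11
_Static_assert and <stddef.h>'s offsetof; the packed struct has no padding):

#include <stddef.h>

_Static_assert(offsetof(struct mptcp_add_addr_opt, maddr_u) +
    sizeof(struct in_addr) == MPTCP_ADD_ADDR_OPT_LEN_V4,
    "ADD_ADDR v4: 4 fixed bytes plus an IPv4 address");
_Static_assert(offsetof(struct mptcp_add_addr_opt, maddr_u) +
    sizeof(struct in6_addr) == MPTCP_ADD_ADDR_OPT_LEN_V6,
    "ADD_ADDR v6: 4 fixed bytes plus an IPv6 address");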
 
 /*
  * MPTCP MP_PRIO Option
@@ -340,31 +331,31 @@ struct mptcp_mpfail_opt {
 
 /* Option to change priority of self */
 struct mptcp_mpprio_opt {
-       u_int8_t        mpprio_kind;
-       u_int8_t        mpprio_len;
+       uint8_t        mpprio_kind;
+       uint8_t        mpprio_len;
 #define MPTCP_MPPRIO_BKP        0x1
 #if BYTE_ORDER == LITTLE_ENDIAN
-       u_int8_t        mpprio_flags:4,
+       uint8_t        mpprio_flags:4,
            mpprio_subtype:4;
 #else /* BIG_ENDIAN */
-       u_int8_t        mpprio_subtype:4,
+       uint8_t        mpprio_subtype:4,
            mpprio_flags:4;
 #endif
 }__attribute__((__packed__));
 
 /* Option to change priority of some other subflow(s) using addr_id */
 struct mptcp_mpprio_addr_opt {
-       u_int8_t        mpprio_kind;
-       u_int8_t        mpprio_len;
+       uint8_t        mpprio_kind;
+       uint8_t        mpprio_len;
 #define MPTCP_MPPRIO_BKP        0x1
 #if BYTE_ORDER == LITTLE_ENDIAN
-       u_int8_t        mpprio_flags:4,
+       uint8_t        mpprio_flags:4,
            mpprio_subtype:4;
 #else /* BIG_ENDIAN */
-       u_int8_t        mpprio_subtype:4,
+       uint8_t        mpprio_subtype:4,
            mpprio_flags:4;
 #endif
-       u_int8_t        mpprio_addrid;
+       uint8_t        mpprio_addrid;
 }__attribute__((__packed__));
 
 /*
@@ -372,10 +363,10 @@ struct mptcp_mpprio_addr_opt {
  *
  */
 struct mptcp_pseudohdr {
-       u_int64_t       mphdr_dsn;      /* Data Sequence Number */
-       u_int32_t       mphdr_ssn;      /* Subflow Sequence Number */
-       u_int16_t       mphdr_len;      /* Data-Level Length */
-       u_int16_t       mphdr_xsum;     /* MPTCP Level Checksum */
+       uint64_t       mphdr_dsn;      /* Data Sequence Number */
+       uint32_t       mphdr_ssn;      /* Subflow Sequence Number */
+       uint16_t       mphdr_len;      /* Data-Level Length */
+       uint16_t       mphdr_xsum;     /* MPTCP Level Checksum */
 }__attribute__((__packed__));
 
 #endif /* BSD_KERNEL_PRIVATE */
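
The DSS checksum is a standard 16-bit ones'-complement sum taken over the
pseudo-header above (with mphdr_xsum zeroed) followed by the data-level
payload. A minimal sketch of that computation, assuming fields already in
network byte order and an even payload length:

static uint16_t
example_dss_csum(const struct mptcp_pseudohdr *ph,
    const uint16_t *payload, size_t len)
{
        const uint16_t *p = (const uint16_t *)ph;
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i < sizeof(*ph) / 2; i++) {         /* pseudo-header */
                sum += p[i];
        }
        for (i = 0; i < len / 2; i++) {                 /* payload */
                sum += payload[i];
        }
        while (sum >> 16) {                             /* fold carries */
                sum = (sum & 0xffff) + (sum >> 16);
        }
        return (uint16_t)~sum;                          /* ones' complement */
}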
index 377f0d567493db055ca5dc1b13c1cd8666df7f89..6b63ab6e00be4b760a342a01fd1c12aac4b3d4d8 100644 (file)
@@ -67,13 +67,22 @@ mptcp_setup_first_subflow_syn_opts(struct socket *so, u_char *opt, unsigned optl
        struct mptcp_mpcapable_opt_common mptcp_opt;
        struct tcpcb *tp = sototcpcb(so);
        struct mptcb *mp_tp = tptomptp(tp);
+       int ret;
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
+       ret = tcp_heuristic_do_mptcp(tp);
+       if (ret > 0) {
+               os_log_info(mptcp_log_handle, "%s - %lx: Not doing MPTCP due to heuristics",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));
+               mp_tp->mpt_flags |= MPTCPF_FALLBACK_HEURISTIC;
+               return optlen;
+       }
 
        /*
         * Avoid retransmitting the MP_CAPABLE option.
         */
-       if (tp->t_rxtshift > mptcp_mpcap_retries) {
+       if (ret == 0 &&
+           tp->t_rxtshift > mptcp_mpcap_retries &&
+           !(tptomptp(tp)->mpt_mpte->mpte_flags & MPTE_FORCE_ENABLE)) {
                if (!(mp_tp->mpt_flags & (MPTCPF_FALLBACK_HEURISTIC | MPTCPF_HEURISTIC_TRAC))) {
                        mp_tp->mpt_flags |= MPTCPF_HEURISTIC_TRAC;
                        tcp_heuristic_mptcp_loss(tp);
@@ -81,11 +90,6 @@ mptcp_setup_first_subflow_syn_opts(struct socket *so, u_char *opt, unsigned optl
                return optlen;
        }
 
-       if (!tcp_heuristic_do_mptcp(tp)) {
-               mp_tp->mpt_flags |= MPTCPF_FALLBACK_HEURISTIC;
-               return optlen;
-       }
-
        bzero(&mptcp_opt, sizeof(struct mptcp_mpcapable_opt_common));
 
        mptcp_opt.mmco_kind = TCPOPT_MULTIPATH;
@@ -125,9 +129,6 @@ mptcp_setup_join_subflow_syn_opts(struct socket *so, u_char *opt, unsigned optle
 
        mpts = tp->t_mpsub;
 
-       VERIFY(tptomptp(tp));
-       mpte_lock_assert_held(tptomptp(tp)->mpt_mpte);
-
        bzero(&mpjoin_req, sizeof(mpjoin_req));
        mpjoin_req.mmjo_kind = TCPOPT_MULTIPATH;
        mpjoin_req.mmjo_len = sizeof(mpjoin_req);
@@ -136,7 +137,7 @@ mptcp_setup_join_subflow_syn_opts(struct socket *so, u_char *opt, unsigned optle
        if (tp->t_mpflags & TMPF_BACKUP_PATH) {
                mpjoin_req.mmjo_subtype_bkp |= MPTCP_BACKUP;
        } else if (inp->inp_boundifp && IFNET_IS_CELLULAR(inp->inp_boundifp) &&
-           mpts->mpts_mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
+           mpts->mpts_mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
                mpjoin_req.mmjo_subtype_bkp |= MPTCP_BACKUP;
                tp->t_mpflags |= TMPF_BACKUP_PATH;
        } else {
@@ -209,8 +210,6 @@ mptcp_send_mpfail(struct tcpcb *tp, u_char *opt, unsigned int optlen)
                return optlen;
        }
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
-
        /* if option space low give up */
        if ((MAX_TCPOPTLEN - optlen) < sizeof(struct mptcp_mpfail_opt)) {
                tp->t_mpflags &= ~TMPF_SND_MPFAIL;
@@ -251,8 +250,6 @@ mptcp_send_infinite_mapping(struct tcpcb *tp, u_char *opt, unsigned int optlen)
                return optlen;
        }
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
-
        if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
                csum_len = 2;
        }
@@ -326,8 +323,6 @@ mptcp_ok_to_fin(struct tcpcb *tp, u_int64_t dsn, u_int32_t datalen)
 {
        struct mptcb *mp_tp = tptomptp(tp);
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
-
        dsn = (mp_tp->mpt_sndmax & MPTCP_DATASEQ_LOW32_MASK) | dsn;
        if ((dsn + datalen) == mp_tp->mpt_sndmax) {
                return 1;
@@ -354,7 +349,7 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt,
                goto ret_optlen;
        }
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
+       socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
 
        if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
                do_csum = TRUE;
@@ -873,12 +868,6 @@ mptcp_sanitize_option(struct tcpcb *tp, int mptcp_subtype)
        struct mptcb *mp_tp = tptomptp(tp);
        int ret = 1;
 
-       if (mp_tp == NULL) {
-               mptcplog((LOG_ERR, "%s: NULL mpsocket \n", __func__),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
-               return 0;
-       }
-
        switch (mptcp_subtype) {
        case MPO_CAPABLE:
                break;
@@ -895,9 +884,8 @@ mptcp_sanitize_option(struct tcpcb *tp, int mptcp_subtype)
                break;
        default:
                ret = 0;
-               mptcplog((LOG_ERR, "%s: type = %d \n", __func__,
-                   mptcp_subtype),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: type = %d \n", __func__,
+                   (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), mptcp_subtype);
                break;
        }
        return ret;
@@ -915,7 +903,7 @@ mptcp_valid_mpcapable_common_opt(u_char *cp)
                return 0;
        }
 
-       if (rsp->mmco_flags & (MPCAP_BBIT | MPCAP_CBIT | MPCAP_DBIT |
+       if (rsp->mmco_flags & (MPCAP_BBIT | MPCAP_DBIT |
            MPCAP_EBIT | MPCAP_FBIT | MPCAP_GBIT)) {
                return 0;
        }
@@ -930,8 +918,7 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th,
 {
        struct mptcp_mpcapable_opt_rsp *rsp = NULL;
        struct mptcb *mp_tp = tptomptp(tp);
-
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
+       struct mptses *mpte = mp_tp->mpt_mpte;
 
        /* Only valid on SYN/ACK */
        if ((th->th_flags & (TH_SYN | TH_ACK)) != (TH_SYN | TH_ACK)) {
@@ -952,10 +939,9 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th,
        /* A SYN/ACK contains peer's key and flags */
        if (optlen != sizeof(struct mptcp_mpcapable_opt_rsp)) {
                /* complain */
-               mptcplog((LOG_ERR, "%s: SYN_ACK optlen = %d, sizeof mp opt = %lu \n",
-                   __func__, optlen,
-                   sizeof(struct mptcp_mpcapable_opt_rsp)),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: SYN_ACK optlen = %d, sizeof mp opt = %lu \n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), optlen,
+                   sizeof(struct mptcp_mpcapable_opt_rsp));
                tcpstat.tcps_invalid_mpcap++;
                return;
        }
@@ -969,6 +955,11 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th,
                mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
        }
 
+       if (((struct mptcp_mpcapable_opt_common *)cp)->mmco_flags &
+           MPCAP_UNICAST_IPBIT) {
+               mpte->mpte_flags |= MPTE_UNICAST_IP;
+       }
+
        rsp = (struct mptcp_mpcapable_opt_rsp *)cp;
        mp_tp->mpt_remotekey = rsp->mmc_localkey;
        /* For now just downgrade to the peer's version */
@@ -990,7 +981,6 @@ static void
 mptcp_do_mpjoin_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, int optlen)
 {
 #define MPTCP_JOPT_ERROR_PATH(tp) {                                     \
-       tp->t_mpflags |= TMPF_RESET;                                    \
        tcpstat.tcps_invalid_joins++;                                   \
        if (tp->t_inpcb->inp_socket != NULL) {                          \
                soevent(tp->t_inpcb->inp_socket,                        \
@@ -1007,10 +997,9 @@ mptcp_do_mpjoin_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, int optlen)
        }
 
        if (optlen != sizeof(struct mptcp_mpjoin_opt_rsp)) {
-               mptcplog((LOG_ERR, "%s: SYN_ACK: unexpected optlen = %d mp "
-                   "option = %lu\n", __func__, optlen,
-                   sizeof(struct mptcp_mpjoin_opt_rsp)),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: SYN_ACK: unexpected optlen = %d mp option = %lu\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte),
+                   optlen, sizeof(struct mptcp_mpjoin_opt_rsp));
                tp->t_mpflags &= ~TMPF_PREESTABLISHED;
                /* send RST and close */
                MPTCP_JOPT_ERROR_PATH(tp);
@@ -1022,8 +1011,9 @@ mptcp_do_mpjoin_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, int optlen)
        error = mptcp_validate_join_hmac(tp,
            (u_char*)&join_rsp->mmjo_mac, SHA1_TRUNCATED);
        if (error) {
-               mptcplog((LOG_ERR, "%s: SYN_ACK error = %d \n", __func__, error),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: SYN_ACK error = %d \n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte),
+                   error);
                tp->t_mpflags &= ~TMPF_PREESTABLISHED;
                /* send RST and close */
                MPTCP_JOPT_ERROR_PATH(tp);
@@ -1039,8 +1029,6 @@ mptcp_validate_join_hmac(struct tcpcb *tp, u_char* hmac, int mac_len)
        struct mptcb *mp_tp = tptomptp(tp);
        u_int32_t rem_rand, loc_rand;
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
-
        rem_rand = loc_rand = 0;
 
        mptcp_get_rands(tp->t_local_aid, mp_tp, &loc_rand, &rem_rand);
@@ -1068,7 +1056,7 @@ mptcp_validate_join_hmac(struct tcpcb *tp, u_char* hmac, int mac_len)
 void
 mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack)
 {
-       u_int64_t acked = full_dack - mp_tp->mpt_snduna;
+       uint64_t acked = full_dack - mp_tp->mpt_snduna;
 
        if (acked) {
                struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
@@ -1076,11 +1064,11 @@ mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack)
                if (acked > mp_so->so_snd.sb_cc) {
                        if (acked > mp_so->so_snd.sb_cc + 1 ||
                            mp_tp->mpt_state < MPTCPS_FIN_WAIT_1) {
-                               mptcplog((LOG_ERR, "%s: acked %u, sb_cc %u full %u suna %u state %u\n",
-                                   __func__, (uint32_t)acked, mp_so->so_snd.sb_cc,
+                               os_log_error(mptcp_log_handle, "%s - %lx: acked %u, sb_cc %u full %u suna %u state %u\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
+                                   (uint32_t)acked, mp_so->so_snd.sb_cc,
                                    (uint32_t)full_dack, (uint32_t)mp_tp->mpt_snduna,
-                                   mp_tp->mpt_state),
-                                   MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
+                                   mp_tp->mpt_state);
                        }
 
                        sbdrop(&mp_so->so_snd, (int)mp_so->so_snd.sb_cc);
@@ -1116,7 +1104,7 @@ mptcp_update_window_wakeup(struct tcpcb *tp)
 {
        struct mptcb *mp_tp = tptomptp(tp);
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
+       socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
 
        if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
                mp_tp->mpt_sndwnd = tp->snd_wnd;
@@ -1130,9 +1118,9 @@ mptcp_update_window_wakeup(struct tcpcb *tp)
 static void
 mptcp_update_window(struct mptcb *mp_tp, u_int64_t ack, u_int64_t seq, u_int32_t tiwin)
 {
-       if (SEQ_LT(mp_tp->mpt_sndwl1, seq) ||
+       if (MPTCP_SEQ_LT(mp_tp->mpt_sndwl1, seq) ||
            (mp_tp->mpt_sndwl1 == seq &&
-           (SEQ_LT(mp_tp->mpt_sndwl2, ack) ||
+           (MPTCP_SEQ_LT(mp_tp->mpt_sndwl2, ack) ||
            (mp_tp->mpt_sndwl2 == ack && tiwin > mp_tp->mpt_sndwnd)))) {
                mp_tp->mpt_sndwnd = tiwin;
                mp_tp->mpt_sndwl1 = seq;
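
The change from SEQ_LT to MPTCP_SEQ_LT here is a real fix, not cosmetic:
the window left-edge fields hold 64-bit data-level sequence numbers, and
the 32-bit TCP macro would truncate them before comparing.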
@@ -1163,12 +1151,6 @@ mptcp_do_dss_opt_ack_meat(u_int64_t full_dack, u_int64_t full_dsn,
                if (close_notify) {
                        mptcp_notify_close(tp->t_inpcb->inp_socket);
                }
-       } else {
-               os_log_error(mptcp_log_handle,
-                   "%s: unexpected dack %u snduna %u sndmax %u\n",
-                   __func__, (u_int32_t)full_dack,
-                   (u_int32_t)mp_tp->mpt_snduna,
-                   (u_int32_t)mp_tp->mpt_sndmax);
        }
 
        mptcp_update_window(mp_tp, full_dack, full_dsn, tiwin);
@@ -1414,27 +1396,21 @@ mptcp_do_dss_opt_meat(u_char *cp, struct tcpcb *tp, struct tcphdr *th)
 }
 
 static void
-mptcp_do_dss_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, int optlen)
+mptcp_do_dss_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th)
 {
-#pragma unused(optlen)
+       struct mptcp_dss_copt *dss_rsp = (struct mptcp_dss_copt *)cp;
        struct mptcb *mp_tp = tptomptp(tp);
 
        if (!mp_tp) {
                return;
        }
 
-       /* We may get Data ACKs just during fallback, so don't ignore those */
-       if ((tp->t_mpflags & TMPF_MPTCP_TRUE) ||
-           (tp->t_mpflags & TMPF_TCP_FALLBACK)) {
-               struct mptcp_dss_copt *dss_rsp = (struct mptcp_dss_copt *)cp;
-
-               if (dss_rsp->mdss_subtype == MPO_DSS) {
-                       if (dss_rsp->mdss_flags & MDSS_F) {
-                               tp->t_rcv_map.mpt_dfin = 1;
-                       }
-
-                       mptcp_do_dss_opt_meat(cp, tp, th);
+       if (dss_rsp->mdss_subtype == MPO_DSS) {
+               if (dss_rsp->mdss_flags & MDSS_F) {
+                       tp->t_rcv_map.mpt_dfin = 1;
                }
+
+               mptcp_do_dss_opt_meat(cp, tp, th);
        }
 }
 
@@ -1473,7 +1449,7 @@ mptcp_do_fastclose_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th)
        }
 
        /* Reset this flow */
-       tp->t_mpflags |= (TMPF_RESET | TMPF_FASTCLOSERCV);
+       tp->t_mpflags |= TMPF_FASTCLOSERCV;
 
        if (tp->t_inpcb->inp_socket != NULL) {
                soevent(tp->t_inpcb->inp_socket,
@@ -1485,9 +1461,9 @@ mptcp_do_fastclose_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th)
 static void
 mptcp_do_mpfail_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th)
 {
-       struct mptcb *mp_tp = NULL;
        struct mptcp_mpfail_opt *fail_opt = (struct mptcp_mpfail_opt *)cp;
        u_int32_t mdss_subflow_seqn = 0;
+       struct mptcb *mp_tp;
        int error = 0;
 
        /*
@@ -1521,6 +1497,96 @@ mptcp_do_mpfail_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th)
        mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
 }
 
+static void
+mptcp_do_add_addr_opt(struct mptses *mpte, u_char *cp)
+{
+       struct mptcp_add_addr_opt *addr_opt = (struct mptcp_add_addr_opt *)cp;
+
+       if (addr_opt->maddr_len != MPTCP_ADD_ADDR_OPT_LEN_V4 &&
+           addr_opt->maddr_len != MPTCP_ADD_ADDR_OPT_LEN_V6) {
+               os_log_info(mptcp_log_handle, "%s - %lx: Wrong ADD_ADDR length %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                   addr_opt->maddr_len);
+
+               return;
+       }
+
+       if (addr_opt->maddr_len == MPTCP_ADD_ADDR_OPT_LEN_V4 &&
+           addr_opt->maddr_ipversion != 4) {
+               os_log_info(mptcp_log_handle, "%s - %lx: ADD_ADDR length for v4 but version is %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                   addr_opt->maddr_ipversion);
+
+               return;
+       }
+
+       if (addr_opt->maddr_len == MPTCP_ADD_ADDR_OPT_LEN_V6 &&
+           addr_opt->maddr_ipversion != 6) {
+               os_log_info(mptcp_log_handle, "%s - %lx: ADD_ADDR length for v6 but version is %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                   addr_opt->maddr_ipversion);
+
+               return;
+       }
+
+       if (addr_opt->maddr_len == MPTCP_ADD_ADDR_OPT_LEN_V4) {
+               struct sockaddr_in *dst = &mpte->mpte_dst_unicast_v4;
+               struct in_addr *addr = &addr_opt->maddr_u.maddr_addrv4;
+               in_addr_t haddr = ntohl(addr->s_addr);
+
+               if (IN_ZERONET(haddr) ||
+                   IN_LOOPBACK(haddr) ||
+                   IN_LINKLOCAL(haddr) ||
+                   IN_DS_LITE(haddr) ||
+                   IN_6TO4_RELAY_ANYCAST(haddr) ||
+                   IN_MULTICAST(haddr) ||
+                   INADDR_BROADCAST == haddr ||
+                   IN_PRIVATE(haddr) ||
+                   IN_SHARED_ADDRESS_SPACE(haddr)) {
+                       os_log_info(mptcp_log_handle, "%s - %lx: ADD_ADDR invalid addr: %x\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                           addr->s_addr);
+
+                       return;
+               }
+
+               dst->sin_len = sizeof(*dst);
+               dst->sin_family = AF_INET;
+               dst->sin_port = mpte->__mpte_dst_v4.sin_port;
+               dst->sin_addr.s_addr = addr->s_addr;
+       } else {
+               struct sockaddr_in6 *dst = &mpte->mpte_dst_unicast_v6;
+               struct in6_addr *addr = &addr_opt->maddr_u.maddr_addrv6;
+
+               if (IN6_IS_ADDR_LINKLOCAL(addr) ||
+                   IN6_IS_ADDR_MULTICAST(addr) ||
+                   IN6_IS_ADDR_UNSPECIFIED(addr) ||
+                   IN6_IS_ADDR_LOOPBACK(addr) ||
+                   IN6_IS_ADDR_V4COMPAT(addr) ||
+                   IN6_IS_ADDR_V4MAPPED(addr)) {
+                       char dbuf[MAX_IPv6_STR_LEN];
+
+                       inet_ntop(AF_INET6, addr, dbuf, sizeof(dbuf));
+                       os_log_info(mptcp_log_handle, "%s - %lx: ADD_ADDRv6 invalid addr: %s\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                           dbuf);
+
+                       return;
+               }
+
+               dst->sin6_len = sizeof(*dst);
+               dst->sin6_family = AF_INET6;
+               dst->sin6_port = mpte->__mpte_dst_v6.sin6_port;
+               memcpy(&dst->sin6_addr, addr, sizeof(*addr));
+       }
+
+       os_log_info(mptcp_log_handle, "%s - %lx: Received ADD_ADDRv%u\n",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+           addr_opt->maddr_ipversion);
+
+       mptcp_sched_create_subflows(mpte);
+}
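
For illustration, the transmit-side shape of a v4 ADD_ADDR built from the
same structures; a sketch only (the hunks above show just the receive side),
reusing the TCPOPT_MULTIPATH kind and MPO_ADD_ADDR subtype seen elsewhere
in this file:

static unsigned int
example_fill_add_addr_v4(u_char *opt, struct in_addr addr, uint8_t addr_id)
{
        struct mptcp_add_addr_opt aopt;

        bzero(&aopt, sizeof(aopt));
        aopt.maddr_kind = TCPOPT_MULTIPATH;
        aopt.maddr_len = MPTCP_ADD_ADDR_OPT_LEN_V4;
        aopt.maddr_subtype = MPO_ADD_ADDR;
        aopt.maddr_ipversion = 4;
        aopt.maddr_addrid = addr_id;
        aopt.maddr_u.maddr_addrv4 = addr;       /* already network order */

        memcpy(opt, &aopt, MPTCP_ADD_ADDR_OPT_LEN_V4);
        return MPTCP_ADD_ADDR_OPT_LEN_V4;
}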
+
 void
 tcp_do_mptcp_options(struct tcpcb *tp, u_char *cp, struct tcphdr *th,
     struct tcpopt *to, int optlen)
@@ -1532,7 +1598,7 @@ tcp_do_mptcp_options(struct tcpcb *tp, u_char *cp, struct tcphdr *th,
                return;
        }
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
+       socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
 
        /* All MPTCP options have at least 4 bytes */
        if (optlen < 4) {
@@ -1553,7 +1619,7 @@ tcp_do_mptcp_options(struct tcpcb *tp, u_char *cp, struct tcphdr *th,
                mptcp_do_mpjoin_opt(tp, cp, th, optlen);
                break;
        case MPO_DSS:
-               mptcp_do_dss_opt(tp, cp, th, optlen);
+               mptcp_do_dss_opt(tp, cp, th);
                break;
        case MPO_FASTCLOSE:
                mptcp_do_fastclose_opt(tp, cp, th);
@@ -1561,7 +1627,9 @@ tcp_do_mptcp_options(struct tcpcb *tp, u_char *cp, struct tcphdr *th,
        case MPO_FAIL:
                mptcp_do_mpfail_opt(tp, cp, th);
                break;
-       case MPO_ADD_ADDR:              /* fall through */
+       case MPO_ADD_ADDR:
+               mptcp_do_add_addr_opt(mp_tp->mpt_mpte, cp);
+               break;
        case MPO_REMOVE_ADDR:           /* fall through */
        case MPO_PRIO:
                to->to_flags |= TOF_MPTCP;
index 0a65d56519a8b77909a3de775fc8cd1a7cc72812..5ca0e32e8e547cad8f53cf6adc56d3f250746109 100644 (file)
 
 #ifdef BSD_KERNEL_PRIVATE
 
-/*
- * Try setting up an MPTCP connection by making atleast 3 attempts,
- * that is 2 retransmissions - needed for Weak WiFi and long delay cellular.
- * This number must be bumped higher when we are assured that middleboxes
- * are not the reason for retries. Generally, on weak wifi and cold start
- * cellular, more than 2 retries are necessary.
- */
-#define MPTCP_CAPABLE_RETRIES   (2)
-
 __BEGIN_DECLS
 extern void mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack);
 extern void mptcp_update_window_wakeup(struct tcpcb *tp);
index f21312da56f7c67de2549463dec9360980f109f2..f7980b76c26a2d4577ec1fb396b8ae724ba9d698 100644 (file)
@@ -117,8 +117,6 @@ static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
     struct uio *, struct mbuf **, struct mbuf **, int *);
 static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
     struct uio *, struct mbuf *, struct mbuf *, int);
-static void mptcp_subflow_rupcall(struct socket *, void *, int);
-static void mptcp_subflow_input(struct mptses *, struct mptsub *);
 static void mptcp_subflow_wupcall(struct socket *, void *, int);
 static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
 static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
@@ -127,6 +125,9 @@ static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
 static void mptcp_subflow_abort(struct mptsub *, int);
 
 static void mptcp_send_dfin(struct socket *so);
+static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
+static void mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, long val);
+static int mptcp_freeq(struct mptcb *mp_tp);
 
 /*
  * Possible return values for subflow event handlers.  Note that success
@@ -142,7 +143,6 @@ typedef enum {
        MPTS_EVRET_DISCONNECT_FALLBACK  = 4,    /* abort all but preferred */
 } ev_ret_t;
 
-static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
 static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
 static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
 static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
@@ -152,11 +152,10 @@ static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *,
 static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
 static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
 static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_mpsuberror_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
 static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
 static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
 
-static const char *mptcp_evret2str(ev_ret_t);
-
 static void mptcp_do_sha1(mptcp_key_t *, char *);
 static void mptcp_init_local_parms(struct mptses *);
 
@@ -171,9 +170,6 @@ static struct zone *mpt_subauth_zone;           /* zone of subf auth entry */
 
 struct mppcbinfo mtcbinfo;
 
-#define MPTCP_SUBFLOW_WRITELEN  (8 * 1024)      /* bytes to write each time */
-#define MPTCP_SUBFLOW_READLEN   (8 * 1024)      /* bytes to read each time */
-
 SYSCTL_DECL(_net_inet);
 
 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
@@ -212,8 +208,13 @@ typedef struct mptcp_subflow_event_entry {
                uint64_t event);
 } mptsub_ev_entry_t;
 
-static uint8_t mptcp_cellicon_is_set;
-static uint32_t mptcp_last_cellicon_set;
+/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
+static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
+static uint32_t mptcp_kern_skt_inuse = 0;
+static uint32_t mptcp_kern_skt_unit;
+static symptoms_advisory_t mptcp_advisory;
+
+uint32_t mptcp_cellicon_refcount = 0;
 #define MPTCP_CELLICON_TOGGLE_RATE      (5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */
 
 /*
@@ -221,6 +222,10 @@ static uint32_t mptcp_last_cellicon_set;
  * really important. Think twice before changing it.
  */
 static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
+       {
+               .sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
+               .sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
+       },
        {
                .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
                .sofilt_hint_ev_hdlr =  mptcp_subflow_mpcantrcvmore_ev,
@@ -298,6 +303,8 @@ mptcp_init(struct protosw *pp, struct domain *dp)
        }
        mptcp_initialized = 1;
 
+       mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;
+
        /*
         * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
         * we must be able to find IPPROTO_TCP entries for both.
@@ -399,40 +406,51 @@ mptcp_init(struct protosw *pp, struct domain *dp)
        zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
        zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
 
-       mptcp_last_cellicon_set = tcp_now;
-
        mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
 }
 
 int
-mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
+mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, int ifindex, boolean_t create)
 {
-       const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
-
        int i, index = -1;
 
-       if (ifp == NULL) {
-               mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
-               return -1;
-       }
-
        for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
-               if (stats[i].ifindex == IFSCOPE_NONE) {
+               if (create && stats[i].ifindex == IFSCOPE_NONE) {
                        if (index < 0) {
                                index = i;
                        }
                        continue;
                }
 
-               if (stats[i].ifindex == ifp->if_index) {
+               if (stats[i].ifindex == ifindex) {
                        index = i;
                        return index;
                }
        }
 
        if (index != -1) {
-               stats[index].ifindex = ifp->if_index;
+               stats[index].ifindex = ifindex;
+       }
+
+       return index;
+}
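
The create flag separates read-only lookups from slot allocation; a short usage sketch (a fragment, not standalone), assuming the stats array starts zeroed so free slots carry ifindex == IFSCOPE_NONE:

/* Read-only: returns -1 when no slot is recorded for this interface. */
int idx = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, false);

/* Allocating: also claims the first free slot when none matches. */
if (idx == -1) {
	idx = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
}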
+
+static int
+mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
+{
+       const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
+       int index;
+
+       if (ifp == NULL) {
+               os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
+                   sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
+               return -1;
+       }
+
+       index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
+
+       if (index != -1) {
                if (stats[index].is_expensive == 0) {
                        stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
                }
@@ -449,7 +467,7 @@ mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
        tcpstat.tcps_mp_switches++;
        mpte->mpte_subflow_switches++;
 
-       index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
+       index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
 
        if (index != -1) {
                mpte->mpte_itfstats[index].switches++;
@@ -475,7 +493,7 @@ mptcp_flush_sopts(struct mptses *mpte)
  * Create an MPTCP session, called as a result of opening a MPTCP socket.
  */
 int
-mptcp_sescreate(struct mppcb *mpp)
+mptcp_session_create(struct mppcb *mpp)
 {
        struct mppcbinfo *mppi;
        struct mptses *mpte;
@@ -500,6 +518,8 @@ mptcp_sescreate(struct mppcb *mpp)
        mpte->mpte_associd = SAE_ASSOCID_ANY;
        mpte->mpte_connid_last = SAE_CONNID_ANY;
 
+       mptcp_init_urgency_timer(mpte);
+
        mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
        mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
 
@@ -507,6 +527,8 @@ mptcp_sescreate(struct mppcb *mpp)
                mpte->mpte_alternate_port = htons(mptcp_alternate_port);
        }
 
+       mpte->mpte_last_cellicon_set = tcp_now;
+
        /* MPTCP Protocol Control Block */
        bzero(mp_tp, sizeof(*mp_tp));
        mp_tp->mpt_mpte = mpte;
@@ -517,6 +539,36 @@ mptcp_sescreate(struct mppcb *mpp)
        return 0;
 }
 
+struct sockaddr *
+mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
+{
+       if (!(mpte->mpte_flags & MPTE_UNICAST_IP)) {
+               return &mpte->mpte_dst;
+       }
+
+       if (ipv6 && mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
+               return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
+       }
+
+       if (ipv4 && mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
+               return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
+       }
+
+       /* The interface has neither IPv4 nor IPv6 routes. Return our best
+        * guess, preferring IPv6 over IPv4.
+        */
+       if (mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
+               return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
+       }
+
+       if (mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
+               return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
+       }
+
+       /* We don't yet have a unicast IP */
+       return NULL;
+}
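
Callers feed in the per-interface reachability hints, mirroring how the subflow-creation path later in this diff uses it (a fragment):

/* Pick a destination this interface can actually reach. */
struct sockaddr *dst = mptcp_get_session_dst(mpte,
    info->has_v6_conn, info->has_v4_conn);
if (dst == NULL) {
	return; /* no unicast destination learned from ADD_ADDR yet */
}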
+
 static void
 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
     uint64_t *cellbytes, uint64_t *allbytes)
@@ -537,12 +589,12 @@ mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
 
        if (initial_cell) {
                mycellbytes -= mpte->mpte_init_txbytes;
-               mycellbytes -= mpte->mpte_init_txbytes;
+               mycellbytes -= mpte->mpte_init_rxbytes;
        }
 
        if (mycellbytes < 0) {
-               mptcplog((LOG_ERR, "%s cellbytes is %d\n", __func__, mycellbytes),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
                *cellbytes = 0;
                *allbytes = 0;
        } else {
@@ -677,39 +729,27 @@ mptcpstats_session_wrapup(struct mptses *mpte)
 static void
 mptcp_session_destroy(struct mptses *mpte)
 {
-       struct mptcb *mp_tp;
-
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       struct mptcb *mp_tp = mpte->mpte_mptcb;
 
-       mp_tp = mpte->mpte_mptcb;
        VERIFY(mp_tp != NULL);
+       VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
 
        mptcpstats_session_wrapup(mpte);
-
-       mptcp_unset_cellicon();
-
-       /*
-        * MPTCP Multipath PCB Extension section
-        */
+       mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
        mptcp_flush_sopts(mpte);
-       VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
 
        if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
                _FREE(mpte->mpte_itfinfo, M_TEMP);
        }
-
        mpte->mpte_itfinfo = NULL;
 
        m_freem_list(mpte->mpte_reinjectq);
 
-       /*
-        * MPTCP Protocol Control Block section
-        */
-       DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
-           struct mptcb *, mp_tp);
+       os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
 }
 
-static boolean_t
+boolean_t
 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
 {
        return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
@@ -718,15 +758,16 @@ mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
 }
 
 static int
-mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4)
+mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
+    const struct in_addr *addrv4)
 {
        static const struct in6_addr well_known_prefix = {
                .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
                                         0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                                         0x00, 0x00, 0x00, 0x00},
        };
+       const char *ptrv4 = (const char *)addrv4;
        char buf[MAX_IPv6_STR_LEN];
-       char *ptrv4 = (char *)addrv4;
        char *ptr = (char *)addr;
 
        if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
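
For the well-known 64:ff9b::/96 prefix shown above, RFC 6052 synthesis reduces to appending the IPv4 address to the prefix. A self-contained sketch of just that /96 case; the kernel routine additionally handles the 32- to 64-bit prefix lengths:

#include <string.h>
#include <netinet/in.h>

/* RFC 6052 synthesis for a 96-bit NAT64 prefix only; a sketch. */
static void
nat64_synthesize_96(struct in6_addr *out, const struct in6_addr *prefix,
    const struct in_addr *addrv4)
{
	memcpy(out, prefix, sizeof(*out));
	/* The IPv4 address occupies the final 32 bits of the IPv6 address. */
	memcpy(&out->s6_addr[12], &addrv4->s_addr, sizeof(addrv4->s_addr));
}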
@@ -790,23 +831,44 @@ mptcp_trigger_cell_bringup(struct mptses *mpte)
                uuid_string_t uuidstr;
                int err;
 
-               mpte_unlock(mpte);
+               socket_unlock(mp_so, 0);
                err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
                    TRUE);
-               mpte_lock(mpte);
+               socket_lock(mp_so, 0);
 
                if (err == 0) {
                        mpte->mpte_triggered_cell = 1;
                }
 
                uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
-               os_log_info(mptcp_log_handle, "%s asked irat to bringup cell for uuid %s, err %d\n",
-                   __func__, uuidstr, err);
+               os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
        } else {
-               os_log_info(mptcp_log_handle, "%s UUID is already null\n", __func__);
+               os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
        }
 }
 
+static boolean_t
+mptcp_subflow_disconnecting(struct mptsub *mpts)
+{
+       /* Split into separate if-statements for readability. The compiler
+        * should optimize this.
+        */
+       if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
+               return true;
+       }
+
+       if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
+               return true;
+       }
+
+       if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
+               return true;
+       }
+
+       return false;
+}
 
 void
 mptcp_check_subflows_and_add(struct mptses *mpte)
@@ -817,24 +879,36 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
        uint32_t i;
 
        if (!mptcp_ok_to_create_subflows(mp_tp)) {
+               os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
+               return;
+       }
+
+       if (mptcp_get_session_dst(mpte, false, false) == NULL) {
                return;
        }
 
        for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
+               boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
                struct mpt_itf_info *info;
+               struct sockaddr_in6 nat64pre;
+               struct sockaddr *dst;
                struct mptsub *mpts;
                struct ifnet *ifp;
                uint32_t ifindex;
-               int found = 0;
 
                info = &mpte->mpte_itfinfo[i];
 
-               if (info->no_mptcp_support) {
+               ifindex = info->ifindex;
+               if (ifindex == IFSCOPE_NONE) {
                        continue;
                }
 
-               ifindex = info->ifindex;
-               if (ifindex == IFSCOPE_NONE) {
+               os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u hasv6 %u hasnat64 %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
+                   info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);
+
+               if (info->no_mptcp_support) {
                        continue;
                }
 
@@ -852,11 +926,24 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
 
                TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
                        const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
+                       struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
 
                        if (subifp == NULL) {
                                continue;
                        }
 
+                       /*
+                        * If there is at least one functioning subflow on WiFi
+                        * and we are checking for the cell interface, then
+                        * we always need to ask symptoms for permission as
+                        * cell is triggered even if WiFi is available.
+                        */
+                       if (!IFNET_IS_CELLULAR(subifp) &&
+                           !mptcp_subflow_disconnecting(mpts) &&
+                           IFNET_IS_CELLULAR(ifp)) {
+                               need_to_ask_symptoms = TRUE;
+                       }
+
                        /*
                         * In Handover mode, only create cell subflow if
                         * 1. Wi-Fi Assist is active
@@ -876,109 +963,140 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
                         */
                        if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
                            !IFNET_IS_CELLULAR(subifp) &&
-                           !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
-                           (mptcp_is_wifi_unusable(mpte) == 0 ||
-                           (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh * 2 &&
-                           ((mpte->mpte_flags & MPTE_FIRSTPARTY) || mptetoso(mpte)->so_snd.sb_cc)))) {
-                               os_log_debug(mptcp_log_handle, "%s handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u\n",
-                                   __func__, mptcp_is_wifi_unusable(mpte),
-                                   sototcpcb(mpts->mpts_socket)->t_rxtshift,
+                           !mptcp_subflow_disconnecting(mpts) &&
+                           (mptcp_is_wifi_unusable_for_session(mpte) == 0 ||
+                           (tp->t_rxtshift < mptcp_fail_thresh * 2 && mptetoso(mpte)->so_snd.sb_cc))) {
+                               os_log_debug(mptcp_log_handle,
+                                   "%s - %lx: handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                                   mptcp_is_wifi_unusable_for_session(mpte),
+                                   tp->t_rxtshift,
                                    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
                                    mptetoso(mpte)->so_snd.sb_cc,
-                                   ifindex, subifp->if_index);
-                               found = 1;
+                                   ifindex, subifp->if_index,
+                                   tp->t_srtt >> TCP_RTT_SHIFT,
+                                   tp->t_rttvar >> TCP_RTTVAR_SHIFT,
+                                   tp->t_rxtcur);
+                               found = TRUE;
 
                                /* We found a proper subflow on WiFi - no need for cell */
                                want_cellular = FALSE;
                                break;
+                       } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
+                               uint64_t time_now = mach_continuous_time();
+
+                               os_log(mptcp_log_handle,
+                                   "%s - %lx: target-based: %llu now %llu unusable? %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
+                                   time_now, mptcp_is_wifi_unusable_for_session(mpte),
+                                   IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
+                                   mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);
+
+                               if (!IFNET_IS_CELLULAR(subifp) &&
+                                   !mptcp_subflow_disconnecting(mpts) &&
+                                   (mpte->mpte_time_target == 0 ||
+                                   (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
+                                   !mptcp_is_wifi_unusable_for_session(mpte))) {
+                                       found = TRUE;
+
+                                       want_cellular = FALSE;
+                                       break;
+                               }
                        } else {
-                               os_log_debug(mptcp_log_handle, "%s svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u\n",
-                                   __func__, mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
-                                   mptcp_is_wifi_unusable(mpte), sototcpcb(mpts->mpts_socket)->t_rxtshift,
-                                   !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc);
+                               os_log_debug(mptcp_log_handle,
+                                   "%s - %lx: svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u rtt %u rttvar %u rto %u\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                                   mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
+                                   mptcp_is_wifi_unusable_for_session(mpte), tp->t_rxtshift,
+                                   !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc,
+                                   tp->t_srtt >> TCP_RTT_SHIFT,
+                                   tp->t_rttvar >> TCP_RTTVAR_SHIFT,
+                                   tp->t_rxtcur);
                        }
 
                        if (subifp->if_index == ifindex &&
-                           !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) &&
-                           sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) {
+                           !mptcp_subflow_disconnecting(mpts)) {
                                /*
                                 * We found a subflow on this interface.
                                 * No need to create a new one.
                                 */
-                               found = 1;
+                               found = TRUE;
                                break;
                        }
                }
 
-               if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
+               if (found) {
+                       continue;
+               }
+
+               if (need_to_ask_symptoms &&
+                   !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
                    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
                    mptcp_developer_mode == 0) {
                        mptcp_ask_symptoms(mpte);
                        return;
                }
 
-               if (!found) {
-                       struct sockaddr *dst = &mpte->mpte_dst;
-                       struct sockaddr_in6 nat64pre;
-
-                       if (mpte->mpte_dst.sa_family == AF_INET &&
-                           !info->has_v4_conn && info->has_nat64_conn) {
-                               struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
-                               int error, j;
-
-                               bzero(&nat64pre, sizeof(struct sockaddr_in6));
+               dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);
 
-                               error = ifnet_get_nat64prefix(ifp, nat64prefixes);
-                               if (error) {
-                                       os_log_error(mptcp_log_handle, "%s: no NAT64-prefix on itf %s, error %d\n",
-                                           __func__, ifp->if_name, error);
-                                       continue;
-                               }
+               if (dst->sa_family == AF_INET &&
+                   !info->has_v4_conn && info->has_nat64_conn) {
+                       struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
+                       int error, j;
 
-                               for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
-                                       if (nat64prefixes[j].prefix_len != 0) {
-                                               break;
-                                       }
-                               }
+                       bzero(&nat64pre, sizeof(struct sockaddr_in6));
 
-                               VERIFY(j < NAT64_MAX_NUM_PREFIXES);
+                       error = ifnet_get_nat64prefix(ifp, nat64prefixes);
+                       if (error) {
+                               os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
+                               continue;
+                       }
 
-                               error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
-                                   nat64prefixes[j].prefix_len,
-                                   &mpte->__mpte_dst_v4.sin_addr);
-                               if (error != 0) {
-                                       os_log_info(mptcp_log_handle, "%s: cannot synthesize this addr\n",
-                                           __func__);
-                                       continue;
+                       for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
+                               if (nat64prefixes[j].prefix_len != 0) {
+                                       break;
                                }
-
-                               memcpy(&nat64pre.sin6_addr,
-                                   &nat64prefixes[j].ipv6_prefix,
-                                   sizeof(nat64pre.sin6_addr));
-                               nat64pre.sin6_len = sizeof(struct sockaddr_in6);
-                               nat64pre.sin6_family = AF_INET6;
-                               nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port;
-                               nat64pre.sin6_flowinfo = 0;
-                               nat64pre.sin6_scope_id = 0;
-
-                               dst = (struct sockaddr *)&nat64pre;
                        }
 
-                       /* Initial subflow started on a NAT64'd address? */
-                       if (mpte->mpte_dst.sa_family == AF_INET6 &&
-                           mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
-                               dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
-                       }
+                       VERIFY(j < NAT64_MAX_NUM_PREFIXES);
 
-                       if (dst->sa_family == AF_INET && !info->has_v4_conn) {
-                               continue;
-                       }
-                       if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
+                       error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
+                           nat64prefixes[j].prefix_len,
+                           &((struct sockaddr_in *)(void *)dst)->sin_addr);
+                       if (error != 0) {
+                               os_log_info(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
                                continue;
                        }
 
-                       mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
+                       memcpy(&nat64pre.sin6_addr,
+                           &nat64prefixes[j].ipv6_prefix,
+                           sizeof(nat64pre.sin6_addr));
+                       nat64pre.sin6_len = sizeof(struct sockaddr_in6);
+                       nat64pre.sin6_family = AF_INET6;
+                       nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
+                       nat64pre.sin6_flowinfo = 0;
+                       nat64pre.sin6_scope_id = 0;
+
+                       dst = (struct sockaddr *)&nat64pre;
                }
+
+               /* Initial subflow started on a NAT64'd address? */
+               if (!(mpte->mpte_flags & MPTE_UNICAST_IP) &&
+                   mpte->mpte_dst.sa_family == AF_INET6 &&
+                   mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
+                       dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
+               }
+
+               if (dst->sa_family == AF_INET && !info->has_v4_conn) {
+                       continue;
+               }
+               if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
+                       continue;
+               }
+
+               mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
        }
 
        if (!cellular_viable && want_cellular) {
@@ -987,21 +1105,56 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
        }
 }
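
Condensed, the handover branch above keeps WiFi (and suppresses cell) when the subflow is not disconnecting and the following predicate holds; a hedged summary with illustrative parameter names:

#include <stdbool.h>

/* Sketch of the keep-WiFi test from the handover branch above. */
static bool
wifi_subflow_good_enough(int wifi_unusable, int rxtshift, int fail_thresh,
    unsigned int snd_queued_bytes)
{
	/* WiFi is reported fine, or it still retransmits gently with data queued. */
	return wifi_unusable == 0 ||
	       (rxtshift < fail_thresh * 2 && snd_queued_bytes > 0);
}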
 
-/*
- * Based on the MPTCP Service-type and the state of the subflows, we
- * will destroy subflows here.
- */
 static void
-mptcp_check_subflows_and_remove(struct mptses *mpte)
+mptcp_remove_cell_subflows(struct mptses *mpte)
 {
        struct mptsub *mpts, *tmpts;
-       int found_working_subflow = 0, removed_some = 0;
-       int wifi_unusable = mptcp_is_wifi_unusable(mpte);
+       boolean_t found = false;
+
+       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+               const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
 
-       if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER) {
+               if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
+                       continue;
+               }
+
+               /* We have a functioning subflow on WiFi. No need for cell! */
+               if (mpts->mpts_flags & MPTSF_CONNECTED &&
+                   !mptcp_subflow_disconnecting(mpts)) {
+                       found = true;
+               }
+       }
+
+       /* Didn't find a functional subflow on WiFi - stay on cell */
+       if (!found) {
                return;
        }
 
+       TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
+               const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
+
+               /* Only remove cellular subflows */
+               if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
+                       continue;
+               }
+
+               os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
+
+               soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
+       }
+
+       return;
+}
+
+/* In handover mode, drop cell subflows once a working WiFi subflow exists */
+static void
+mptcp_handover_subflows_remove(struct mptses *mpte)
+{
+       int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
+       boolean_t found_working_subflow = false;
+       struct mptsub *mpts;
+
        /*
         * Look for a subflow that is on a non-cellular interface
         * and actually works (aka, no retransmission timeout).
@@ -1023,14 +1176,17 @@ mptcp_check_subflows_and_remove(struct mptses *mpte)
                        continue;
                }
 
+               os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);
+
                /* Is this subflow in good condition? */
-               if (tp->t_rxtshift == 0) {
-                       found_working_subflow = 1;
+               if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc) {
+                       found_working_subflow = true;
                }
 
                /* Or WiFi is fine */
                if (!wifi_unusable) {
-                       found_working_subflow = 1;
+                       found_working_subflow = true;
                }
        }
 
@@ -1042,20 +1198,43 @@ mptcp_check_subflows_and_remove(struct mptses *mpte)
                return;
        }
 
-       TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
-               const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
+       mptcp_remove_cell_subflows(mpte);
+}
 
-               /* Only remove cellular subflows */
-               if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
-                       continue;
-               }
+static void
+mptcp_targetbased_subflows_remove(struct mptses *mpte)
+{
+       uint64_t time_now = mach_continuous_time();
 
-               soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
-               removed_some = 1;
+       if (mpte->mpte_time_target != 0 &&
+           (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
+           mptcp_is_wifi_unusable_for_session(mpte)) {
+               /* WiFi is bad and we are past the target time - don't remove any subflows */
+               return;
        }
 
-       if (removed_some) {
-               mptcp_unset_cellicon();
+       mptcp_remove_cell_subflows(mpte);
+}
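
The (int64_t)(mpte->mpte_time_target - time_now) <= 0 test above is the wrap-safe idiom for comparing deadlines on an unsigned monotonic clock such as mach_continuous_time(); a self-contained illustration:

#include <stdbool.h>
#include <stdint.h>

/* Wrap-safe "deadline passed?" test on a uint64_t monotonic counter. */
static bool
deadline_passed(uint64_t target, uint64_t now)
{
	/* The signed difference stays correct even across counter wrap. */
	return (int64_t)(target - now) <= 0;
}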
+
+/*
+ * Based on the MPTCP Service-type and the state of the subflows, we
+ * will destroy subflows here.
+ */
+void
+mptcp_check_subflows_and_remove(struct mptses *mpte)
+{
+       if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
+               return;
+       }
+
+       socket_lock_assert_owned(mptetoso(mpte));
+
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
+               mptcp_handover_subflows_remove(mpte);
+       }
+
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
+               mptcp_targetbased_subflows_remove(mpte);
        }
 }
 
@@ -1064,10 +1243,63 @@ mptcp_remove_subflows(struct mptses *mpte)
 {
        struct mptsub *mpts, *tmpts;
 
+       if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
+               return;
+       }
+
        TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
+               const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
+               boolean_t found = false;
+               uint32_t ifindex;
+               uint32_t i;
+
                if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
                        mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
 
+                       os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
+                           ifp ? ifp->if_index : -1);
+                       soevent(mpts->mpts_socket,
+                           SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
+
+                       continue;
+               }
+
+               if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
+                       continue;
+               }
+
+               if (ifp) {
+                       ifindex = ifp->if_index;
+               } else {
+                       ifindex = mpts->mpts_ifscope;
+               }
+
+               for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
+                       if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
+                               continue;
+                       }
+
+                       if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
+                               if (mpts->mpts_dst.sa_family == AF_INET6 &&
+                                   (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
+                                       found = true;
+                                       break;
+                               }
+
+                               if (mpts->mpts_dst.sa_family == AF_INET &&
+                                   mpte->mpte_itfinfo[i].has_v4_conn) {
+                                       found = true;
+                                       break;
+                               }
+                       }
+               }
+
+               if (!found) {
+                       os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                           ifindex, mpts->mpts_flags);
+
                        soevent(mpts->mpts_socket,
                            SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
                }
@@ -1084,8 +1316,7 @@ mptcp_create_subflows(__unused void *arg)
         * while a new event comes in.
         */
        if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
-               mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
        }
 
        /* Iterate over all MPTCP connections */
@@ -1093,27 +1324,23 @@ mptcp_create_subflows(__unused void *arg)
        lck_mtx_lock(&mtcbinfo.mppi_lock);
 
        TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
-               struct mptses *mpte;
-               struct socket *mp_so;
+               struct socket *mp_so = mpp->mpp_socket;
+               struct mptses *mpte = mpp->mpp_pcbe;
 
                if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
                        continue;
                }
 
-               mpp_lock(mpp);
+               socket_lock(mp_so, 1);
+               VERIFY(mp_so->so_usecount > 0);
 
                mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;
 
-               mpte = mpp->mpp_pcbe;
-               mp_so = mpp->mpp_socket;
-
-               VERIFY(mp_so->so_usecount > 0);
-
                mptcp_check_subflows_and_add(mpte);
                mptcp_remove_subflows(mpte);
 
                mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
-               mpp_unlock(mpp);
+               socket_unlock(mp_so, 1);
        }
 
        lck_mtx_unlock(&mtcbinfo.mppi_lock);
@@ -1136,9 +1363,8 @@ mptcp_sched_create_subflows(struct mptses *mpte)
        struct socket *mp_so = mpp->mpp_socket;
 
        if (!mptcp_ok_to_create_subflows(mp_tp)) {
-               mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x",
-                   __func__, mp_tp->mpt_state, mp_tp->mpt_flags),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+               os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
                return;
        }
 
@@ -1189,7 +1415,7 @@ mptcp_sopt_free(struct mptopt *mpo)
 void
 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
 {
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       socket_lock_assert_owned(mptetoso(mpte));
        mpo->mpo_flags |= MPOF_ATTACHED;
        TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
 }
@@ -1200,7 +1426,7 @@ mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
 void
 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
 {
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       socket_lock_assert_owned(mptetoso(mpte));
        VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
        mpo->mpo_flags &= ~MPOF_ATTACHED;
        TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
@@ -1214,7 +1440,7 @@ mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
 {
        struct mptopt *mpo;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       socket_lock_assert_owned(mptetoso(mpte));
 
        TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
                if (mpo->mpo_level == sopt->sopt_level &&
@@ -1338,28 +1564,30 @@ mptcp_subflow_necp_cb(void *handle, __unused int action,
         * The socket is being garbage-collected. There is nothing to be done
         * here.
         */
-       if (so->so_usecount == 0) {
+       if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
                return;
        }
 
        socket_lock(so, 1);
 
        /* Check again after we acquired the lock. */
-       if (so->so_usecount == 0) {
+       if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
                goto out;
        }
 
        mpte = tptomptp(sototcpcb(so))->mpt_mpte;
        mpts = sototcpcb(so)->t_mpsub;
 
-       os_log_debug(mptcp_log_handle, "%s Subflow on itf %u became non-viable, power %u",
-           __func__, mpts->mpts_ifscope, low_power);
+       os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);
 
        mpts->mpts_flags |= MPTSF_CLOSE_REQD;
 
        mptcp_sched_create_subflows(mpte);
 
-       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && viable != NULL) {
+       if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
+           mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
+           viable != NULL) {
                *viable = 1;
        }
 
@@ -1381,13 +1609,13 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
        int error;
 
        *so = NULL;
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+
        mp_so = mptetoso(mpte);
 
        p = proc_find(mp_so->last_pid);
        if (p == PROC_NULL) {
-               mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
 
                return ESRCH;
        }
@@ -1405,14 +1633,13 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
         * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
         * the ipi-lock. We cannot hold the socket-lock at that point.
         */
-       mpte_unlock(mpte);
+       socket_unlock(mp_so, 0);
        error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
-           SOCF_ASYNC, PROC_NULL);
-       mpte_lock(mpte);
+           SOCF_MPTCP, PROC_NULL);
+       socket_lock(mp_so, 0);
        if (error) {
-               mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
 
                proc_rele(p);
 
@@ -1469,25 +1696,52 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
                 * then socket-locks) is no more respected. So, we need to
                 * unlock here.
                 */
-               mpte_unlock(mpte);
+               socket_unlock(mp_so, 0);
                error = necp_client_register_socket_flow(mp_so->last_pid,
                    mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
-               mpte_lock(mpte);
+               socket_lock(mp_so, 0);
 
                if (error) {
+                       os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
+
                        goto out_err;
                }
 
                /* Possible state-change during the unlock above */
                if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
                    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
+                       os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                           mp_tp->mpt_state, mp_tp->mpt_flags);
+
+                       error = EINVAL;
                        goto out_err;
                }
 
                uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
-       } else {
-               mptcplog((LOG_NOTICE, "%s: uuid is not set!\n"),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
+       }
+
+       /* Needs to happen prior to the delegation! */
+       (*so)->last_pid = mp_so->last_pid;
+
+       if (mp_so->so_flags & SOF_DELEGATED) {
+               if (mpte->mpte_epid) {
+                       error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
+                       if (error) {
+                               os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
+                               goto out_err;
+                       }
+               }
+               if (!uuid_is_null(mpte->mpte_euuid)) {
+                       error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
+                       if (error) {
+                               os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
+                               goto out_err;
+                       }
+               }
        }
 
        /* inherit the other socket options */
@@ -1508,19 +1762,6 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
                goto out_err;
        }
 
-       /* enable keepalive */
-       smpo.mpo_name = SO_KEEPALIVE;
-       if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
-               goto out_err;
-       }
-
-       smpo.mpo_level = IPPROTO_TCP;
-       smpo.mpo_intval = mptcp_subflow_keeptime;
-       smpo.mpo_name = TCP_KEEPALIVE;
-       if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
-               goto out_err;
-       }
-
        if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
                /*
                 * On secondary subflows we might need to set the cell-fallback
@@ -1556,12 +1797,10 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
 
                interim = (mpo->mpo_flags & MPOF_INTERIM);
                if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
-                       mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx"
-                           " sopt %s val %d interim record removed\n", __func__,
-                           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
+                       os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
                            mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
-                           mpo->mpo_intval),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+                           mpo->mpo_intval);
                        mptcp_sopt_remove(mpte, mpo);
                        mptcp_sopt_free(mpo);
                        continue;
@@ -1599,9 +1838,6 @@ out_err:
 
        proc_rele(p);
 
-       mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n",
-           __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
-
        return error;
 }
 
@@ -1681,13 +1917,13 @@ mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
        }
 
        os_log_info(mptcp_log_handle,
-           "%s: ifindex %u dst %s:%d pended %u\n", __func__, mpts->mpts_ifscope,
-           dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
+           "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+           mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
 
        p = proc_find(mp_so->last_pid);
        if (p == PROC_NULL) {
-               mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
 
                return ESRCH;
        }
@@ -1720,14 +1956,65 @@ mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
        DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
            struct mptsub *, mpts, int, error);
        if (error) {
-               mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n",
-                   __func__, error, mpts->mpts_ifscope),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
        }
 
        return error;
 }
 
+static int
+mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
+    uint32_t rseq, uint16_t dlen)
+{
+       struct mptsub *mpts = sototcpcb(so)->t_mpsub;
+
+       if (m_pktlen(m) == 0) {
+               return 0;
+       }
+
+       if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
+               if (off && (dsn != m->m_pkthdr.mp_dsn ||
+                   rseq != m->m_pkthdr.mp_rseq ||
+                   dlen != m->m_pkthdr.mp_rlen)) {
+                       os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: %u - %u, %u - %u, %u - %u\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
+                           (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
+                           rseq, m->m_pkthdr.mp_rseq,
+                           dlen, m->m_pkthdr.mp_rlen);
+
+                       soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
+                       return -1;
+               }
+               m->m_pkthdr.mp_dsn += off;
+               m->m_pkthdr.mp_rseq += off;
+               m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
+       } else {
+               if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
+                       /* data arrived without a DSS option mapping */
+
+                       /* the initial subflow can fall back right after the SYN handshake */
+                       if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
+                               mptcp_notify_mpfail(so);
+                       } else {
+                               soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
+
+                               return -1;
+                       }
+               } else if (m->m_flags & M_PKTHDR) {
+                       /* We need to fake the DATA-mapping */
+                       m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
+                       m->m_pkthdr.mp_dsn = dsn + off;
+                       m->m_pkthdr.mp_rseq = rseq + off;
+                       m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
+               }
+       }
+
+       mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;
+
+       return 0;
+}
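
A worked example of the offset adjustment above, with made-up numbers: if a DSS mapping covered dsn 1000 / rseq 500 and 40 bytes of it were already consumed, the remaining mbuf must be re-anchored at dsn 1040 / rseq 540:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t dsn  = 1000; /* data-level sequence number of the mapping */
	uint32_t rseq = 500;  /* relative subflow sequence number */
	uint16_t off  = 40;   /* bytes of the mapping already delivered */

	/* Re-anchor the mapping, as mptcp_adj_rmap() does for mbuf headers. */
	dsn  += off;
	rseq += off;

	printf("remainder maps dsn %llu rseq %u\n",
	    (unsigned long long)dsn, rseq);
	return 0;
}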
+
 /*
  * MPTCP subflow socket receive routine, derived from soreceive().
  */
@@ -1742,7 +2029,6 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
        struct mbuf *m, **mp = mp0;
        boolean_t proc_held = FALSE;
 
-       mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte);
        VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
 
 #ifdef MORE_LOCKING_DEBUG
@@ -1892,7 +2178,11 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
                        csum = m->m_pkthdr.mp_csum;
                } else {
                        /* We did fallback */
-                       mptcp_adj_rmap(so, m, 0, 0, 0, 0);
+                       if (mptcp_adj_rmap(so, m, 0, 0, 0, 0)) {
+                               error = EIO;
+                               *mp0 = NULL;
+                               goto release;
+                       }
 
                        sbfree(&so->so_rcv, m);
 
@@ -1937,7 +2227,6 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
                                error = EIO;
                                dlen = 0;
                                *mp0 = NULL;
-                               mptcp_subflow_abort(sototcpcb(so)->t_mpsub, ECONNABORTED);
                                break;
                        }
 
@@ -2032,7 +2321,7 @@ mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
                        en_tracing = TRUE;
                        en_tracing_val = top->m_pkthdr.len;
                        KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
-                           VM_KERNEL_ADDRPERM(so),
+                           (unsigned long)VM_KERNEL_ADDRPERM(so),
                            ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
                            (int64_t)en_tracing_val);
                }
@@ -2076,7 +2365,7 @@ out:
 
        if (en_tracing) {
                KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
-                   VM_KERNEL_ADDRPERM(so),
+                   (unsigned long)VM_KERNEL_ADDRPERM(so),
                    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
                    (int64_t)en_tracing_val);
        }
@@ -2097,22 +2386,23 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
        struct mptsub *mpts = NULL;
        int af, error = 0;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
 
+       socket_lock_assert_owned(mp_so);
+
        if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
                /* If the remote end sends Data FIN, refuse subflow adds */
-               mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
                error = ENOTCONN;
                goto out_err;
        }
 
        mpts = mptcp_subflow_alloc();
        if (mpts == NULL) {
-               mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
                error = ENOMEM;
                goto out_err;
        }
@@ -2161,7 +2451,7 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
                goto out_err;
        }
 
-       memcpy(&mpts->mpts_dst, dst, dst->sa_len);
+       memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);
 
        af = mpts->mpts_dst.sa_family;
 
@@ -2214,7 +2504,7 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
        }
 
        /* register for subflow socket read/write events */
-       sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1);
+       sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);
 
        /* Register for subflow socket control events */
        sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
@@ -2224,7 +2514,7 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
            SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
            SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
            SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
-           SO_FILT_HINT_ADAPTIVE_WTIMO);
+           SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);
 
        /* sanity check */
        VERIFY(!(mpts->mpts_flags &
@@ -2256,25 +2546,6 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
 
        mpts->mpts_flags |= MPTSF_CONNECTING;
 
-       if (af == AF_INET || af == AF_INET6) {
-               char dbuf[MAX_IPv6_STR_LEN];
-
-               mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
-                   "mp_so 0x%llx dst %s[%d] cid %d "
-                   "[pending %s]\n", __func__,
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   inet_ntop(af, ((af == AF_INET) ?
-                   (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr :
-                   (void *)&SIN6(&mpts->mpts_dst)->sin6_addr),
-                   dbuf, sizeof(dbuf)), ((af == AF_INET) ?
-                   ntohs(SIN(&mpts->mpts_dst)->sin_port) :
-                   ntohs(SIN6(&mpts->mpts_dst)->sin6_port)),
-                   mpts->mpts_connid,
-                   ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
-                   "YES" : "NO")),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-       }
-
        /* connect right away if first attempt, or if join can be done now */
        if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
                error = mptcp_subflow_soconnectx(mpte, mpts);
@@ -2304,15 +2575,24 @@ out_err:
 }
 
 void
-mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts)
+mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
 {
-       int index = mptcp_get_statsindex(stats, mpts);
+       int index = mptcpstats_get_index(stats, mpts);
 
        if (index != -1) {
                struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
 
                stats[index].mpis_txbytes += inp->inp_stat->txbytes;
                stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
+
+               stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
+               stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
+
+               stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
+               stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
+
+               stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
+               stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
        }
 }
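mptcpstats_update() now also folds the subflow's per-link-class counters (inp_wstat for Wi-Fi, inp_Wstat for wired, inp_cstat for cell) into the session table alongside the existing totals. A trimmed sketch of that accounting, with stand-in types:

    #include <stdint.h>

    struct stat_pair { uint64_t tx, rx; };

    struct itf_stats {
        struct stat_pair total, wifi, wired, cell;
    };

    static void
    stats_fold(struct itf_stats *dst, const struct stat_pair *total,
        const struct stat_pair *wifi, const struct stat_pair *wired,
        const struct stat_pair *cell)
    {
        /* Accumulate each link class separately, plus the grand total. */
        dst->total.tx += total->tx; dst->total.rx += total->rx;
        dst->wifi.tx  += wifi->tx;  dst->wifi.rx  += wifi->rx;
        dst->wired.tx += wired->tx; dst->wired.rx += wired->rx;
        dst->cell.tx  += cell->tx;  dst->cell.rx  += cell->rx;
    }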
 
@@ -2328,19 +2608,16 @@ mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
        struct socket *so = mpts->mpts_socket;
        struct tcpcb *tp = sototcpcb(so);
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       socket_lock_assert_owned(mp_so);
        VERIFY(mpts->mpts_mpte == mpte);
        VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
        VERIFY(mpte->mpte_numflows != 0);
        VERIFY(mp_so->so_usecount > 0);
 
-       mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n",
-           __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-           mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid,
-           mpts->mpts_flags, mp_so->so_error),
-           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-
        mptcpstats_update(mpte->mpte_itfstats, mpts);
+
+       mptcp_unset_cellicon(mpte, mpts, 1);
+
        mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
        mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
 
@@ -2426,15 +2703,14 @@ mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
        struct mptcb *mp_tp;
        int send_dfin = 0;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-
-       VERIFY(mpts->mpts_mpte == mpte);
-       VERIFY(mpts->mpts_socket != NULL);
+       socket_lock_assert_owned(mptetoso(mpte));
 
        if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
                return;
        }
 
+       mptcp_unset_cellicon(mpte, mpts, 1);
+
        mpts->mpts_flags |= MPTSF_DISCONNECTING;
 
        so = mpts->mpts_socket;
@@ -2464,46 +2740,6 @@ mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
        mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
 }
 
-/*
- * Called when the associated subflow socket posted a read event.
- */
-static void
-mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
-{
-#pragma unused(so, waitf)
-       struct mptsub *mpts = arg, *tmpts;
-       struct mptses *mpte = mpts->mpts_mpte;
-
-       VERIFY(mpte != NULL);
-
-       if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
-               if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL)) {
-                       mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
-               }
-               return;
-       }
-
-       mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL;
-       TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
-               if (mpts->mpts_socket->so_usecount == 0) {
-                       /* Will be removed soon by tcp_garbage_collect */
-                       continue;
-               }
-
-               mptcp_subflow_addref(mpts);
-               mpts->mpts_socket->so_usecount++;
-
-               mptcp_subflow_input(mpte, mpts);
-
-               mptcp_subflow_remref(mpts);             /* ours */
-
-               VERIFY(mpts->mpts_socket->so_usecount != 0);
-               mpts->mpts_socket->so_usecount--;
-       }
-
-       mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL);
-}
-
 /*
  * Subflow socket input.
  */
@@ -2529,9 +2765,8 @@ mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
 
        error = sock_receive_internal(so, NULL, &m, 0, NULL);
        if (error != 0 && error != EWOULDBLOCK) {
-               mptcplog((LOG_ERR, "%s: cid %d error %d\n",
-                   __func__, mpts->mpts_connid, error),
-                   MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
                if (error == ENODATA) {
                        /*
                         * Don't ignore ENODATA so as to discover
@@ -2558,11 +2793,17 @@ mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
 
        if (m != NULL) {
                if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
-                       mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
+                       mptcp_set_cellicon(mpte, mpts);
 
                        mpte->mpte_used_cell = 1;
                } else {
-                       mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
+                       /*
+                        * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
+                        * explicitly set the cellicon, then we unset it again.
+                        */
+                       if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
+                               mptcp_unset_cellicon(mpte, NULL, 1);
+                       }
 
                        mpte->mpte_used_wifi = 1;
                }
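The unset path is now rate-limited: the icon is only cleared when nothing set it within the last MPTCP_CELLICON_TOGGLE_RATE ticks, using TCP's wrap-safe timestamp compare. A self-contained sketch, assuming the usual signed-difference definition of TSTMP_LT() and a made-up rate value:

    #include <stdint.h>
    #include <stdbool.h>

    #define TSTMP_LT(a, b)  ((int32_t)((a) - (b)) < 0)  /* wrap-safe compare */
    #define TOGGLE_RATE     1000u                       /* hypothetical ticks */

    static bool
    should_unset_cellicon(uint32_t last_set, uint32_t now)
    {
        /* Unset only if nobody set the icon during the last TOGGLE_RATE ticks;
         * the signed difference stays correct across 32-bit tick wraparound. */
        return TSTMP_LT(last_set + TOGGLE_RATE, now);
    }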
@@ -2570,18 +2811,55 @@ mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
                mptcp_input(mpte, m);
        }
 
-       /* notify protocol that we drained all the data */
-       if (error == 0 && m != NULL &&
-           (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
-               (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);
-       }
-
 out:
        if (wakeup) {
                mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
        }
 
-       mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
+       mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
+}
+
+void
+mptcp_handle_input(struct socket *so)
+{
+       struct mptsub *mpts, *tmpts;
+       struct mptses *mpte;
+
+       if (!(so->so_flags & SOF_MP_SUBFLOW)) {
+               return;
+       }
+
+       mpts = sototcpcb(so)->t_mpsub;
+       mpte = mpts->mpts_mpte;
+
+       socket_lock_assert_owned(mptetoso(mpte));
+
+       if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
+               if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
+                       mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
+               }
+               return;
+       }
+
+       mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
+       TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
+               if (mpts->mpts_socket->so_usecount == 0) {
+                       /* Will be removed soon by tcp_garbage_collect */
+                       continue;
+               }
+
+               mptcp_subflow_addref(mpts);
+               mpts->mpts_socket->so_usecount++;
+
+               mptcp_subflow_input(mpte, mpts);
+
+               mptcp_subflow_remref(mpts);             /* ours */
+
+               VERIFY(mpts->mpts_socket->so_usecount != 0);
+               mpts->mpts_socket->so_usecount--;
+       }
+
+       mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
 }
 
 /*
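mptcp_handle_input() above replaces the removed read-upcall with the same defer/guard idiom: postpone the wakeup if upcalls are currently deferred, otherwise mark the handler active, drain every subflow, then run whatever was deferred meanwhile. A minimal model with illustrative flag names:

    #include <stdbool.h>

    #define F_DEFERRING    0x1  /* upcalls are being deferred */
    #define F_IN_HANDLER   0x2  /* input handler already on the stack */
    #define F_WAKEUP_OWED  0x4  /* a reader wakeup was requested */

    struct session { unsigned flags; };

    static void drain_all_subflows(struct session *s) { (void)s; /* per-subflow input */ }

    static void run_deferred(struct session *s)
    {
        /* Issue the owed wakeup, if any, then clear it. */
        s->flags &= ~F_WAKEUP_OWED;
    }

    static void handle_input(struct session *s)
    {
        if (s->flags & F_DEFERRING) {
            if (!(s->flags & F_IN_HANDLER))
                s->flags |= F_WAKEUP_OWED;
            return;  /* the deferring context picks this up later */
        }
        s->flags |= F_IN_HANDLER;
        drain_all_subflows(s);
        s->flags &= ~F_IN_HANDLER;
        run_deferred(s);
    }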
@@ -2648,12 +2926,12 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
        uint16_t tot_sent = 0;
        boolean_t reinjected = FALSE;
 
-       mpte_lock_assert_held(mpte);
-
        mp_so = mptetoso(mpte);
        so = mpts->mpts_socket;
        tp = sototcpcb(so);
 
+       socket_lock_assert_owned(mp_so);
+
        VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
        mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
 
@@ -2698,10 +2976,10 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
        }
 
        if (sb_mb == NULL) {
-               mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
-                   __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
-                   (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                   (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
+                   (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
 
                /* Fix it to prevent looping */
                if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
@@ -2723,11 +3001,10 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
 
        /* First, drop acknowledged data */
        if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
-               mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier "
+               os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
                    "dsn %u suna %u reinject? %u\n",
-                   __func__, (uint32_t)mpt_dsn,
-                   (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
+                   (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
                if (mpte->mpte_reinjectq) {
                        mptcp_clean_reinjectq(mpte);
                } else {
@@ -2740,8 +3017,8 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
 
        /* Check again because of above sbdrop */
        if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
-               mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is empty\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
                goto out;
        }
 
@@ -2759,9 +3036,9 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
                        sbdrop(&mp_so->so_snd, (int)len);
                        wakeup = 1;
 
-                       mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
-                           __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+                       os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                           (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
                }
        }
 
@@ -2787,8 +3064,8 @@ dont_reinject:
                sb_mb = mp_so->so_snd.sb_mb;
        }
        if (sb_mb == NULL) {
-               mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
+                   (unsigned long)VM_KERNEL_ADDRPERM(mpte));
                goto out;
        }
 
@@ -2821,10 +3098,9 @@ dont_reinject:
                        off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
                        sb_cc -= off;
                } else {
-                       mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n",
-                           __func__, (uint32_t)mp_tp->mpt_sndnxt,
-                           (uint32_t)mp_tp->mpt_sndmax),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+                       os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
+                           (uint32_t)mp_tp->mpt_sndmax);
 
                        goto out;
                }
@@ -2832,11 +3108,10 @@ dont_reinject:
 
        sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
        if (sb_cc <= 0) {
-               mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
-                   __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
+               os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u, sndnxt %u sndmax %u cwnd %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
                    (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
-                   mptcp_subflow_cwnd_space(so)),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+                   mptcp_subflow_cwnd_space(so));
        }
 
        sb_cc = min(sb_cc, UINT16_MAX);
@@ -2877,10 +3152,9 @@ dont_reinject:
                mlen = min(mlen, sb_cc - tot_sent);
 
                if (mlen < 0) {
-                       mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
-                           __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
-                           (uint32_t)off, sb_cc, tot_sent),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+                       os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
+                           (uint32_t)off, sb_cc, tot_sent);
                        goto out;
                }
 
@@ -2891,8 +3165,8 @@ dont_reinject:
                m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
                    M_COPYM_MUST_COPY_HDR);
                if (m == NULL) {
-                       mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+                       os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
+                           (unsigned long)VM_KERNEL_ADDRPERM(mpte));
                        error = ENOBUFS;
                        break;
                }
@@ -3010,11 +3284,17 @@ done_sending:
                }
 
                if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
-                       mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
+                       mptcp_set_cellicon(mpte, mpts);
 
                        mpte->mpte_used_cell = 1;
                } else {
-                       mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
+                       /*
+                        * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
+                        * explicitly set the cellicon, then we unset it again.
+                        */
+                       if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
+                               mptcp_unset_cellicon(mpte, NULL, 1);
+                       }
 
                        mpte->mpte_used_wifi = 1;
                }
@@ -3025,9 +3305,8 @@ done_sending:
                 */
                error = 0;
        } else {
-               mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
-                   __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
        }
 out:
 
@@ -3155,7 +3434,7 @@ mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
 }
 
 static struct mbuf *
-mptcp_copy_mbuf_list(struct mbuf *m, int len)
+mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
 {
        struct mbuf *top = NULL, *tail = NULL;
        uint64_t dsn;
@@ -3172,8 +3451,8 @@ mptcp_copy_mbuf_list(struct mbuf *m, int len)
 
                n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
                if (n == NULL) {
-                       mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+                       os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
                        goto err;
                }
 
@@ -3251,7 +3530,7 @@ mptcp_reinject_mbufs(struct socket *so)
                }
 
                /* Copy the mbuf with headers (aka, DSN-numbers) */
-               m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen);
+               m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
                if (m == NULL) {
                        break;
                }
@@ -3287,7 +3566,7 @@ mptcp_clean_reinjectq(struct mptses *mpte)
 {
        struct mptcb *mp_tp = mpte->mpte_mptcb;
 
-       mpte_lock_assert_held(mpte);
+       socket_lock_assert_owned(mptetoso(mpte));
 
        while (mpte->mpte_reinjectq) {
                struct mbuf *m = mpte->mpte_reinjectq;
@@ -3313,8 +3592,7 @@ mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
        struct mptsub *mpts = arg;
        struct mptses *mpte = mpts->mpts_mpte;
 
-       VERIFY(mpte != NULL);
-       mpte_lock_assert_held(mpte);
+       socket_lock_assert_owned(mptetoso(mpte));
 
        if ((mpts->mpts_evctl & events) == events) {
                return;
@@ -3343,8 +3621,6 @@ mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
        int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
            sizeof(mpsub_ev_entry_tbl[0]);
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-
        /* bail if there's nothing to process */
        if (!mpts->mpts_evctl) {
                return ret;
@@ -3388,10 +3664,10 @@ mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
         * so loudly complain if we have any unprocessed one(s).
         */
        if (mpts->mpts_evctl || ret < MPTS_EVRET_OK) {
-               mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__,
+               mptcplog((LOG_WARNING, "%s%s: cid %d evret %d unhandled events=%b\n", __func__,
                    (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
                    mpts->mpts_connid,
-                   mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
+                   ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
                    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
        } else {
                mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
@@ -3409,8 +3685,6 @@ mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
        struct socket *mp_so, *so;
        struct mptcb *mp_tp;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-       VERIFY(mpte->mpte_mppcb != NULL);
        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;
@@ -3424,6 +3698,7 @@ mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
         * based on the state of the MPTCP connection.
         */
        if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
+           (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
            ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
                mp_so->so_error = so->so_error;
                *p_mpsofilt_hint |= event;
@@ -3443,9 +3718,6 @@ mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
        struct socket *mp_so;
        struct tcpcb *tp;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-
-       VERIFY(mpte->mpte_mppcb != NULL);
        mp_so = mptetoso(mpte);
        tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
 
@@ -3471,6 +3743,31 @@ mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
        return MPTS_EVRET_DELETE;
 }
 
+static ev_ret_t
+mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
+    uint64_t *p_mpsofilt_hint, uint64_t event)
+{
+#pragma unused(event, p_mpsofilt_hint)
+       struct socket *so, *mp_so;
+
+       so = mpts->mpts_socket;
+
+       if (so->so_error != ENODATA) {
+               return MPTS_EVRET_OK;
+       }
+
+
+       mp_so = mptetoso(mpte);
+
+       mp_so->so_error = ENODATA;
+
+       sorwakeup(mp_so);
+       sowwakeup(mp_so);
+
+       return MPTS_EVRET_OK;
+}
+
+
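The new SO_FILT_HINT_MP_SUB_ERROR handler reacts only to ENODATA (the error the stack uses for subflows that can no longer move data) and mirrors it onto the MPTCP socket before waking both directions, so blocked callers see it. A trimmed sketch with stand-in socket structs; sorwakeup()/sowwakeup() above are the real wakeups:

    #include <errno.h>

    struct sock { int so_error; };

    static void wake_readers(struct sock *s) { (void)s; }
    static void wake_writers(struct sock *s) { (void)s; }

    static int
    suberror_event(struct sock *subflow, struct sock *mp_so)
    {
        if (subflow->so_error != ENODATA)
            return 0;                   /* ignore other errors */

        mp_so->so_error = ENODATA;      /* surface to the application */
        wake_readers(mp_so);
        wake_writers(mp_so);
        return 0;
    }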
 /*
  * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
  * indicates that the remote side sent a Data FIN
@@ -3480,10 +3777,7 @@ mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
     uint64_t *p_mpsofilt_hint, uint64_t event)
 {
 #pragma unused(event)
-       struct mptcb *mp_tp;
-
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-       mp_tp = mpte->mpte_mptcb;
+       struct mptcb *mp_tp = mpte->mpte_mptcb;
 
        mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
@@ -3514,22 +3808,17 @@ mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
        struct socket *mp_so;
        int altpath_exists = 0;
 
-       mpte_lock_assert_held(mpte);
        mp_so = mptetoso(mpte);
-       mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
-           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
 
        mptcp_reinject_mbufs(mpts->mpts_socket);
 
-       mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
-       /*
-        * If there is no alternate eligible subflow, ignore the
-        * failover hint.
-        */
-       if (mpts_alt == NULL) {
-               mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       mpts_alt = mptcp_get_subflow(mpte, NULL);
+
+       /* If there is no alternate eligible subflow, ignore the failover hint. */
+       if (mpts_alt == NULL || mpts_alt == mpts) {
+               os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
+                   (unsigned long)VM_KERNEL_ADDRPERM(mpte));
 
                goto done;
        }
@@ -3553,9 +3842,8 @@ mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
                mpts->mpts_flags |= MPTSF_FAILINGOVER;
                mpts->mpts_flags &= ~MPTSF_ACTIVE;
 
-               mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n",
-                   __func__, mpts->mpts_connid, mpts_alt->mpts_connid),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+               os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
 
                mptcpstats_inc_switch(mpte, mpts);
 
@@ -3578,9 +3866,6 @@ static ev_ret_t
 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
     uint64_t *p_mpsofilt_hint, uint64_t event)
 {
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-       VERIFY(mpte->mpte_mppcb != NULL);
-
        mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
            mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
 
@@ -3653,6 +3938,13 @@ mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
        struct ifnet *ifp;
        int j;
 
+       /* Subflow IPs will be steered directly by the server - no need to
+        * desynthesize.
+        */
+       if (mpte->mpte_flags & MPTE_UNICAST_IP) {
+               return;
+       }
+
        ifp = sotoinpcb(so)->inp_last_outifp;
 
        if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
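The early return added at the top of this function skips NAT64 desynthesis entirely when the session runs in unicast-IP mode, since the server then steers subflow addresses itself. The guard's shape, with stand-in names:

    #include <stdbool.h>

    #define F_UNICAST_IP 0x1  /* stand-in for MPTE_UNICAST_IP */

    static bool
    should_desynthesize(unsigned session_flags)
    {
        if (session_flags & F_UNICAST_IP)
            return false;  /* server-steered addresses are used as-is */
        return true;       /* fall through to the NAT64 prefix lookup */
    }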
@@ -3695,9 +3987,6 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
        int af;
        boolean_t mpok = FALSE;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-       VERIFY(mpte->mpte_mppcb != NULL);
-
        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;
@@ -3797,7 +4086,7 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
                        mptcp_notify_mpfail(so);
                } else {
                        if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
-                           mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
+                           mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
                                tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
                        } else {
                                mpts->mpts_flags |= MPTSF_PREFERRED;
@@ -3822,10 +4111,6 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
                mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
                mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
                soisconnected(mp_so);
-
-               mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok),
-                   MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
        } else if (mpok) {
                /*
                 * case (b) above
@@ -3836,7 +4121,7 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
                 */
                if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
                    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
-                   mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
+                   mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
                        tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
                        mpts->mpts_flags &= ~MPTSF_PREFERRED;
                } else {
@@ -3888,7 +4173,7 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
        }
 
        /* This call, just to "book" an entry in the stats-table for this ifindex */
-       mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
+       mptcpstats_get_index(mpte->mpte_itfstats, mpts);
 
        mptcp_output(mpte);
 
@@ -3906,8 +4191,6 @@ mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
        struct socket *mp_so, *so;
        struct mptcb *mp_tp;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-       VERIFY(mpte->mpte_mppcb != NULL);
        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;
@@ -3941,10 +4224,6 @@ mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
                mptcp_drop(mpte, mp_tp, so->so_error);
        }
 
-       if (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV) {
-               mptcp_drop(mpte, mp_tp, mp_so->so_error);
-       }
-
        /*
         * Clear flags that are used by getconninfo to return state.
         * Retain like MPTSF_DELETEOK for internal purposes.
@@ -3964,12 +4243,10 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
     uint64_t *p_mpsofilt_hint, uint64_t event)
 {
 #pragma unused(event, p_mpsofilt_hint)
+       ev_ret_t ret = MPTS_EVRET_OK;
        struct socket *mp_so, *so;
        struct mptcb *mp_tp;
-       ev_ret_t ret = MPTS_EVRET_OK;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-       VERIFY(mpte->mpte_mppcb != NULL);
        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;
@@ -4001,7 +4278,6 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
        }
 
        if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
-               VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
                ret = MPTS_EVRET_DISCONNECT_FALLBACK;
 
                m_freem_list(mpte->mpte_reinjectq);
@@ -4011,12 +4287,6 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
                ret = MPTS_EVRET_CONNECT_PENDING;
        }
 
-       mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n",
-           __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-           mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
-           mpts->mpts_flags, MPTSF_BITS),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
-
 done:
        return ret;
 }
@@ -4033,8 +4303,6 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
        struct mptcb *mp_tp;
        boolean_t is_fastclose;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-       VERIFY(mpte->mpte_mppcb != NULL);
        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;
@@ -4049,6 +4317,8 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
 
        is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
 
+       tp->t_mpflags |= TMPF_RESET;
+
        t_template = tcp_maketemplate(tp);
        if (t_template) {
                struct tcp_respond_args tra;
@@ -4065,29 +4335,32 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
                    &t_template->tt_t, (struct mbuf *)NULL,
                    tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
                (void) m_free(dtom(t_template));
-               mptcplog((LOG_DEBUG, "MPTCP Events: "
-                   "%s: mp_so 0x%llx cid %d \n",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   so, mpts->mpts_connid),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
        }
-       mptcp_subflow_abort(mpts, ECONNABORTED);
 
        if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
+               struct mptsub *iter, *tmp;
+
                *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
 
-               if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
-                       mp_so->so_error = ECONNABORTED;
-               } else {
-                       mp_so->so_error = ECONNRESET;
+               mp_so->so_error = ECONNRESET;
+
+               TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
+                       if (iter == mpts) {
+                               continue;
+                       }
+                       mptcp_subflow_abort(iter, ECONNABORTED);
                }
 
                /*
                 * mptcp_drop is being called after processing the events, to fully
                 * close the MPTCP connection
                 */
+               mptcp_drop(mpte, mp_tp, mp_so->so_error);
        }
 
+       mptcp_subflow_abort(mpts, ECONNABORTED);
+
+
        if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
                mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
        }
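The reworked fastclose path no longer distinguishes the pre-established case: the MP socket takes ECONNRESET, every sibling subflow is aborted with ECONNABORTED, the session is dropped, and the subflow that saw the reset is aborted last. A simplified model of that teardown order (list types are stand-ins for the mpte_subflows TAILQ):

    #include <errno.h>
    #include <stddef.h>

    struct flow { struct flow *next; };
    struct sess { struct flow *flows; int so_error; };

    static void abort_flow(struct flow *f, int err) { (void)f; (void)err; }
    static void drop_session(struct sess *s, int err) { (void)s; (void)err; }

    static void
    fastclose(struct sess *s, struct flow *resetting)
    {
        struct flow *f, *tmp;

        s->so_error = ECONNRESET;
        for (f = s->flows; f != NULL; f = tmp) {
            tmp = f->next;              /* safe against unlink */
            if (f != resetting)
                abort_flow(f, ECONNABORTED);
        }
        drop_session(s, s->so_error);
        abort_flow(resetting, ECONNABORTED);  /* the flow that saw the RST */
    }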
@@ -4155,30 +4428,6 @@ mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
        return MPTS_EVRET_OK;
 }
 
-static const char *
-mptcp_evret2str(ev_ret_t ret)
-{
-       const char *c = "UNKNOWN";
-
-       switch (ret) {
-       case MPTS_EVRET_DELETE:
-               c = "MPTS_EVRET_DELETE";
-               break;
-       case MPTS_EVRET_CONNECT_PENDING:
-               c = "MPTS_EVRET_CONNECT_PENDING";
-               break;
-       case MPTS_EVRET_DISCONNECT_FALLBACK:
-               c = "MPTS_EVRET_DISCONNECT_FALLBACK";
-               break;
-       case MPTS_EVRET_OK:
-               c = "MPTS_EVRET_OK";
-               break;
-       default:
-               break;
-       }
-       return c;
-}
-
 /*
  * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
  * caller must ensure that the option can be issued on subflow sockets, via
@@ -4192,18 +4441,19 @@ mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *
        int error;
 
        VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
-       mpte_lock_assert_held(mpte);
 
        mp_so = mptetoso(mpte);
        so = mpts->mpts_socket;
 
+       socket_lock_assert_owned(mp_so);
+
        if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
            mpo->mpo_level == SOL_SOCKET &&
            mpo->mpo_name == SO_MARK_CELLFALLBACK) {
                struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
 
                mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
-                   __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(mpte),
+                   __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable_for_session(mpte),
                    sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
                    mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
                    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
@@ -4246,20 +4496,12 @@ mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *
        sopt.sopt_p = kernproc;
 
        error = sosetoptlock(so, &sopt, 0);
-       if (error == 0) {
-               mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s "
-                   "val %d set successful\n", __func__,
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
-                   mpo->mpo_intval),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-       } else {
-               mptcplog((LOG_ERR, "%s:mp_so 0x%llx sopt %s "
+       if (error) {
+               os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
                    "val %d set error %d\n", __func__,
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
+                   (unsigned long)VM_KERNEL_ADDRPERM(mpte),
                    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
-                   mpo->mpo_intval, error),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+                   mpo->mpo_intval, error);
        }
        return error;
 }
@@ -4278,9 +4520,10 @@ mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
        int error;
 
        VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        mp_so = mptetoso(mpte);
 
+       socket_lock_assert_owned(mp_so);
+
        bzero(&sopt, sizeof(sopt));
        sopt.sopt_dir = SOPT_GET;
        sopt.sopt_level = mpo->mpo_level;
@@ -4290,20 +4533,11 @@ mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
        sopt.sopt_p = kernproc;
 
        error = sogetoptlock(so, &sopt, 0);     /* already locked */
-       if (error == 0) {
-               mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                   "%s: mp_so 0x%llx sopt %s "
-                   "val %d get successful\n", __func__,
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
-                   mpo->mpo_intval),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-       } else {
-               mptcplog((LOG_ERR, "MPTCP Socket: "
-                   "%s: mp_so 0x%llx sopt %s get error %d\n",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+       if (error) {
+               os_log_error(mptcp_log_handle,
+                   "%s - %lx: sopt %s get error %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
        }
        return error;
 }
@@ -4329,46 +4563,22 @@ mptcp_gc(struct mppcbinfo *mppi)
                struct mptses *mpte;
                struct mptcb *mp_tp;
 
-               VERIFY(mpp->mpp_flags & MPP_ATTACHED);
                mp_so = mpp->mpp_socket;
-               VERIFY(mp_so != NULL);
                mpte = mptompte(mpp);
-               VERIFY(mpte != NULL);
                mp_tp = mpte->mpte_mptcb;
-               VERIFY(mp_tp != NULL);
-
-               mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                   "%s: mp_so 0x%llx found "
-                   "(u=%d,r=%d,s=%d)\n", __func__,
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
-                   mp_so->so_retaincnt, mpp->mpp_state),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 
-               if (!mpte_try_lock(mpte)) {
-                       mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                           "%s: mp_so 0x%llx skipped lock "
-                           "(u=%d,r=%d)\n", __func__,
-                           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                           mp_so->so_usecount, mp_so->so_retaincnt),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+               if (!mpp_try_lock(mpp)) {
                        active++;
                        continue;
                }
 
+               VERIFY(mpp->mpp_flags & MPP_ATTACHED);
+
                /* check again under the lock */
                if (mp_so->so_usecount > 0) {
                        boolean_t wakeup = FALSE;
                        struct mptsub *mpts, *tmpts;
 
-                       mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                           "%s: mp_so 0x%llx skipped usecount "
-                           "[u=%d,r=%d] %d %d\n", __func__,
-                           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                           mp_so->so_usecount, mp_so->so_retaincnt,
-                           mp_tp->mpt_gc_ticks,
-                           mp_tp->mpt_state),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-
                        if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
                                if (mp_tp->mpt_gc_ticks > 0) {
                                        mp_tp->mpt_gc_ticks--;
@@ -4384,15 +4594,15 @@ mptcp_gc(struct mppcbinfo *mppi)
                                            mpts, SO_FILT_HINT_DISCONNECTED);
                                }
                        }
-                       mpte_unlock(mpte);
+                       socket_unlock(mp_so, 0);
                        active++;
                        continue;
                }
 
                if (mpp->mpp_state != MPPCB_STATE_DEAD) {
-                       panic("MPTCP Socket: %s: mp_so 0x%llx skipped state "
+                       panic("%s - %lx: skipped state "
                            "[u=%d,r=%d,s=%d]\n", __func__,
-                           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
+                           (unsigned long)VM_KERNEL_ADDRPERM(mpte),
                            mp_so->so_usecount, mp_so->so_retaincnt,
                            mpp->mpp_state);
                }
@@ -4403,12 +4613,6 @@ mptcp_gc(struct mppcbinfo *mppi)
 
                mptcp_session_destroy(mpte);
 
-               mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                   "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mp_so->so_usecount, mp_so->so_retaincnt),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-
                DTRACE_MPTCP4(dispose, struct socket *, mp_so,
                    struct sockbuf *, &mp_so->so_rcv,
                    struct sockbuf *, &mp_so->so_snd,
@@ -4427,11 +4631,11 @@ mptcp_gc(struct mppcbinfo *mppi)
 struct mptses *
 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
 {
-       struct socket *mp_so;
+       struct socket *mp_so = mptetoso(mpte);
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        VERIFY(mpte->mpte_mptcb == mp_tp);
-       mp_so = mptetoso(mpte);
+
+       socket_lock_assert_owned(mp_so);
 
        DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
            uint32_t, 0 /* event */);
@@ -4450,12 +4654,11 @@ mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
 struct mptses *
 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
 {
-       struct socket *mp_so = NULL;
        struct mptsub *mpts = NULL, *tmpts = NULL;
+       struct socket *mp_so = mptetoso(mpte);
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       socket_lock_assert_owned(mp_so);
        VERIFY(mpte->mpte_mptcb == mp_tp);
-       mp_so = mptetoso(mpte);
 
        mp_tp->mpt_state = MPTCPS_TERMINATE;
 
@@ -4484,11 +4687,13 @@ void
 mptcp_subflow_workloop(struct mptses *mpte)
 {
        boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
-       uint64_t mpsofilt_hint_mask;
+       uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
        struct mptsub *mpts, *tmpts;
        struct socket *mp_so;
 
-       mpte_lock_assert_held(mpte);
+       mp_so = mptetoso(mpte);
+
+       socket_lock_assert_owned(mp_so);
 
        if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
                mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
@@ -4496,10 +4701,7 @@ mptcp_subflow_workloop(struct mptses *mpte)
        }
        mpte->mpte_flags |= MPTE_IN_WORKLOOP;
 
-       mp_so = mptetoso(mpte);
-
 relaunch:
-       mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
        mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
 
        TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
@@ -4555,6 +4757,11 @@ relaunch:
        if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
                VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
 
+               if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
+                       mp_so->so_state |= SS_CANTRCVMORE;
+                       sorwakeup(mp_so);
+               }
+
                soevent(mp_so, mpsofilt_hint_mask);
        }
 
@@ -4596,10 +4803,6 @@ relaunch:
                            ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
                        tp->t_mpflags |= TMPF_TCP_FALLBACK;
 
-                       if (mpts->mpts_flags & MPTSF_ACTIVE) {
-                               continue;
-                       }
-                       tp->t_mpflags |= TMPF_RESET;
                        soevent(so, SO_FILT_HINT_MUSTRST);
                } else if (connect_pending) {
                        /*
@@ -4656,6 +4859,7 @@ mptcp_lock(struct socket *mp_so, int refcount, void *lr)
        }
        if (refcount != 0) {
                mp_so->so_usecount++;
+               mpp->mpp_inside++;
        }
        mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
        mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
@@ -4684,10 +4888,11 @@ mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
                    solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
-       mpp_lock_assert_held(mpp);
+       socket_lock_assert_owned(mp_so);
 
        if (refcount != 0) {
                mp_so->so_usecount--;
+               mpp->mpp_inside--;
        }
 
        if (mp_so->so_usecount < 0) {
@@ -4695,6 +4900,11 @@ mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
                    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
+       if (mpp->mpp_inside < 0) {
+               panic("%s: mpp=%p inside=%x lrh= %s\n", __func__,
+                   mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
+               /* NOTREACHED */
+       }
        mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
        mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
        mpp_unlock(mpp);
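The new mpp_inside counter mirrors so_usecount for lock-depth accounting: incremented on every refcounted lock, decremented on unlock, with an underflow check matching the existing so_usecount panic. A userspace sketch using assert() in place of panic():

    #include <assert.h>

    struct pcb { int usecount; int inside; };

    static void
    pcb_lock_enter(struct pcb *p, int refcount)
    {
        if (refcount != 0) {
            p->usecount++;
            p->inside++;
        }
    }

    static void
    pcb_lock_exit(struct pcb *p, int refcount)
    {
        if (refcount != 0) {
            p->usecount--;
            p->inside--;
        }
        assert(p->usecount >= 0);
        assert(p->inside >= 0);  /* underflow means unbalanced lock use */
    }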
@@ -4728,12 +4938,10 @@ mptcp_getlock(struct socket *mp_so, int flags)
  */
 
 static void
-mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
-    uint8_t addr_id)
+mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
 {
        struct tcpcb *tp = sototcpcb(so);
        struct mptcp_subf_auth_entry *sauth_entry;
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
        /*
         * The address ID of the first flow is implicitly 0.
@@ -4789,7 +4997,6 @@ mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
     u_int32_t *rrand)
 {
        struct mptcp_subf_auth_entry *sauth_entry;
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
        LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
                if (sauth_entry->msae_laddr_id == addr_id) {
@@ -4809,26 +5016,23 @@ mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
     mptcp_addr_id raddr_id, u_int32_t raddr_rand)
 {
        struct mptcp_subf_auth_entry *sauth_entry;
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
        LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
                if (sauth_entry->msae_laddr_id == laddr_id) {
                        if ((sauth_entry->msae_raddr_id != 0) &&
                            (sauth_entry->msae_raddr_id != raddr_id)) {
-                               mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
-                                   " address ids %d %d \n", __func__, raddr_id,
-                                   sauth_entry->msae_raddr_id),
-                                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
+                               os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
+                                   " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
+                                   raddr_id, sauth_entry->msae_raddr_id);
                                return;
                        }
                        sauth_entry->msae_raddr_id = raddr_id;
                        if ((sauth_entry->msae_raddr_rand != 0) &&
                            (sauth_entry->msae_raddr_rand != raddr_rand)) {
-                               mptcplog((LOG_ERR, "MPTCP Socket: "
-                                   "%s: dup SYN_ACK %d %d \n",
-                                   __func__, raddr_rand,
-                                   sauth_entry->msae_raddr_rand),
-                                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
+                               os_log_error(mptcp_log_handle, "%s - %lx: "
+                                   "dup SYN_ACK %d %d \n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
+                                   raddr_rand, sauth_entry->msae_raddr_rand);
                                return;
                        }
                        sauth_entry->msae_raddr_rand = raddr_rand;
@@ -4908,8 +5112,6 @@ mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
 {
        uint32_t lrand, rrand;
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
-
        lrand = rrand = 0;
        mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
        mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
@@ -4996,7 +5198,6 @@ int
 mptcp_init_remote_parms(struct mptcb *mp_tp)
 {
        char remote_digest[SHA1_RESULTLEN];
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
        /* Only Version 0 is supported for auth purposes */
        if (mp_tp->mpt_version != MPTCP_STD_VERSION_0) {
@@ -5010,6 +5211,7 @@ mptcp_init_remote_parms(struct mptcb *mp_tp)
        mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
            (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t));
        mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
+       mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
 
        return 0;
 }
@@ -5048,7 +5250,6 @@ mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
        }
 
        __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
        while (m) {
                VERIFY(m->m_flags & M_PKTHDR);
@@ -5108,7 +5309,15 @@ mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
 
        mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-       mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
+
+       /* We can have data in the subflow's send-queue that is being acked,
+        * while the DATA_ACK has already advanced. Thus, we should check whether
+        * or not the DATA_ACK is actually new here.
+        */
+       if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
+           MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
+               mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
+       }
 }
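The added guard only consumes an inferred DATA_ACK that lies within [snduna, sndmax] in 64-bit modular sequence space. A sketch assuming the usual signed-difference comparison macros (the definitions below follow the common TCP idiom and are not copied from the tree):

    #include <stdint.h>
    #include <stdbool.h>

    #define SEQ_GEQ(a, b)  ((int64_t)((a) - (b)) >= 0)
    #define SEQ_LEQ(a, b)  ((int64_t)((a) - (b)) <= 0)

    static bool
    data_ack_in_window(uint64_t ack, uint64_t snduna, uint64_t sndmax)
    {
        /* Stale (below snduna) or impossible (above sndmax) ACKs are dropped. */
        return SEQ_GEQ(ack, snduna) && SEQ_LEQ(ack, sndmax);
    }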
 
 void
@@ -5259,44 +5468,6 @@ mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
        }
 }
 
-int
-mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
-    uint32_t rseq, uint16_t dlen)
-{
-       struct mptsub *mpts = sototcpcb(so)->t_mpsub;
-
-       if (m_pktlen(m) == 0) {
-               return 0;
-       }
-
-       if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
-               if (off && (dsn != m->m_pkthdr.mp_dsn ||
-                   rseq != m->m_pkthdr.mp_rseq ||
-                   dlen != m->m_pkthdr.mp_rlen)) {
-                       mptcplog((LOG_ERR, "%s: Received incorrect second mapping: %llu - %llu , %u - %u, %u - %u\n",
-                           __func__, dsn, m->m_pkthdr.mp_dsn,
-                           rseq, m->m_pkthdr.mp_rseq,
-                           dlen, m->m_pkthdr.mp_rlen),
-                           MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
-                       return -1;
-               }
-               m->m_pkthdr.mp_dsn += off;
-               m->m_pkthdr.mp_rseq += off;
-               m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
-       } else {
-               if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) {
-                       /* data arrived without an DSS option mapping */
-
-                       /* initial subflow can fallback right after SYN handshake */
-                       mptcp_notify_mpfail(so);
-               }
-       }
-
-       mpts->mpts_flags |= MPTSF_CONFIRMED;
-
-       return 0;
-}
-
 /*
  * Following routines help with failure detection and failover of data
  * transfer from one subflow to another.
@@ -5361,9 +5532,7 @@ mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
         * not much else to do.
         */
 
-       mptcplog((LOG_ERR, "MPTCP Sender: "
-           "%s: %llu not found \n", __func__, dsn_fail),
-           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
+       os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
        return -1;
 }
 
@@ -5576,7 +5745,7 @@ mptcp_sbspace(struct mptcb *mp_tp)
        int32_t space;
        int32_t pending = 0;
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
+       socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
 
        mptcp_sbrcv_grow_rwin(mp_tp, sb);
 
@@ -5674,7 +5843,8 @@ boolean_t
 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
 {
        boolean_t ret = 1;
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
+
+       socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
 
        if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
                ret = 0;
@@ -5703,7 +5873,7 @@ mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
                return 0;
        }
 
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
+       socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
 
        /*
         * For the first subflow and subsequent subflows, adjust mss for
@@ -5810,13 +5980,12 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS
        }
        TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
                flows = NULL;
-               mpp_lock(mpp);
+               socket_lock(mpp->mpp_socket, 1);
                VERIFY(mpp->mpp_flags & MPP_ATTACHED);
                mpte = mptompte(mpp);
-               VERIFY(mpte != NULL);
-               mpte_lock_assert_held(mpte);
+
+               socket_lock_assert_owned(mptetoso(mpte));
                mp_tp = mpte->mpte_mptcb;
-               VERIFY(mp_tp != NULL);
 
                bzero(&mptcpci, sizeof(mptcpci));
                mptcpci.mptcpci_state = mp_tp->mpt_state;
@@ -5844,7 +6013,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS
                if (mpte->mpte_numflows != 0) {
                        flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
                        if (flows == NULL) {
-                               mpp_unlock(mpp);
+                               socket_unlock(mpp->mpp_socket, 1);
                                break;
                        }
                        mptcpci.mptcpci_len = sizeof(mptcpci) +
@@ -5856,7 +6025,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS
                        error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
                }
                if (error) {
-                       mpp_unlock(mpp);
+                       socket_unlock(mpp->mpp_socket, 1);
                        FREE(flows, M_TEMP);
                        break;
                }
@@ -5866,7 +6035,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS
                        fill_mptcp_subflow(so, &flows[f], mpts);
                        f++;
                }
-               mpp_unlock(mpp);
+               socket_unlock(mpp->mpp_socket, 1);
                if (flows) {
                        error = SYSCTL_OUT(req, flows, len);
                        FREE(flows, M_TEMP);
@@ -5938,7 +6107,7 @@ mptcp_notsent_lowat_check(struct socket *so)
        }
 
        mpte = mptompte(mpp);
-       mpte_lock_assert_held(mpte);
+       socket_lock_assert_owned(mptetoso(mpte));
        mp_tp = mpte->mpte_mptcb;
 
        notsent = so->so_snd.sb_cc;
@@ -5981,12 +6150,6 @@ mptcp_notsent_lowat_check(struct socket *so)
        return 0;
 }
 
-/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
-static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
-static uint32_t mptcp_kern_skt_inuse = 0;
-static uint32_t mptcp_kern_skt_unit;
-symptoms_advisory_t mptcp_advisory;
-
 static errno_t
 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
     void **unitinfo)
@@ -5994,7 +6157,7 @@ mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
 #pragma unused(kctlref, sac, unitinfo)
 
        if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
-               os_log_error(mptcp_log_handle, "%s MPTCP kernel-control socket for Symptoms already open!", __func__);
+               os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
        }
 
        mptcp_kern_skt_unit = sac->sc_unit;
@@ -6003,7 +6166,7 @@ mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
 }
 
 static void
-mptcp_allow_uuid(uuid_t uuid)
+mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
 {
        struct mppcb *mpp;
 
@@ -6012,13 +6175,10 @@ mptcp_allow_uuid(uuid_t uuid)
        lck_mtx_lock(&mtcbinfo.mppi_lock);
 
        TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
-               struct mptses *mpte;
-               struct socket *mp_so;
+               struct socket *mp_so = mpp->mpp_socket;
+               struct mptses *mpte = mpp->mpp_pcbe;
 
-               mpp_lock(mpp);
-
-               mpte = mpp->mpp_pcbe;
-               mp_so = mpp->mpp_socket;
+               socket_lock(mp_so, 1);
 
                if (mp_so->so_flags & SOF_DELEGATED &&
                    uuid_compare(uuid, mp_so->e_uuid)) {
@@ -6028,18 +6188,22 @@ mptcp_allow_uuid(uuid_t uuid)
                        goto next;
                }
 
-               os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp\n",
-                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
+               os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);
 
                mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
 
+               if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
+                       mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
+               }
+
                mptcp_check_subflows_and_add(mpte);
                mptcp_remove_subflows(mpte);
 
-               mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED;
+               mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);
 
 next:
-               mpp_unlock(mpp);
+               socket_unlock(mp_so, 1);
        }
 
        lck_mtx_unlock(&mtcbinfo.mppi_lock);
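
The RSSI gate above is terse. Below is a minimal user-space sketch of the intended flag behavior, assuming a hypothetical threshold value in place of MPTCP_TARGET_BASED_RSSI_THRESHOLD (whose definition is not part of this hunk):

#include <stdint.h>
#include <stdio.h>

#define MPTE_ACCESS_GRANTED   0x01
#define MPTE_CELL_PROHIBITED  0x02
#define RSSI_THRESHOLD        (-55)  /* hypothetical stand-in */

/* Mirror of the flag dance in mptcp_allow_uuid(): access is granted for
 * the duration of the subflow check, and a good RSSI additionally
 * prohibits cell, so the session prefers to stay on Wi-Fi. */
static uint32_t
symptoms_gate(int32_t rssi)
{
	uint32_t flags = MPTE_ACCESS_GRANTED;

	if (rssi > RSSI_THRESHOLD) {
		flags |= MPTE_CELL_PROHIBITED;
	}

	return flags;
}

int
main(void)
{
	printf("rssi -40: flags %#x (cell prohibited)\n", (unsigned)symptoms_gate(-40));
	printf("rssi -80: flags %#x (cell allowed)\n", (unsigned)symptoms_gate(-80));
	return 0;
}
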
@@ -6055,16 +6219,14 @@ mptcp_wifi_status_changed(void)
        lck_mtx_lock(&mtcbinfo.mppi_lock);
 
        TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
-               struct mptses *mpte;
-               struct socket *mp_so;
-
-               mpp_lock(mpp);
+               struct socket *mp_so = mpp->mpp_socket;
+               struct mptses *mpte = mpp->mpp_pcbe;
 
-               mpte = mpp->mpp_pcbe;
-               mp_so = mpp->mpp_socket;
+               socket_lock(mp_so, 1);
 
-               /* Only handover-mode is purely driven by Symptom's Wi-Fi status */
-               if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER) {
+               /* Only handover- and urgency-mode are purely driven by Symptoms' Wi-Fi status */
+               if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
+                   mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
                        goto next;
                }
 
@@ -6072,7 +6234,7 @@ mptcp_wifi_status_changed(void)
                mptcp_check_subflows_and_remove(mpte);
 
 next:
-               mpp_unlock(mpp);
+               socket_unlock(mp_so, 1);
        }
 
        lck_mtx_unlock(&mtcbinfo.mppi_lock);
@@ -6087,7 +6249,8 @@ mptcp_ask_symptoms(struct mptses *mpte)
        int pid, prio, err;
 
        if (mptcp_kern_skt_unit == 0) {
-               os_log_error(mptcp_log_handle, "%s skt_unit is still 0\n", __func__);
+               os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
                return;
        }
 
@@ -6101,7 +6264,8 @@ mptcp_ask_symptoms(struct mptses *mpte)
 
        p = proc_find(pid);
        if (p == PROC_NULL) {
-               os_log_error(mptcp_log_handle, "%s Couldn't find proc for pid %u\n", __func__, pid);
+               os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
                return;
        }
 
@@ -6115,7 +6279,8 @@ mptcp_ask_symptoms(struct mptses *mpte)
 
        prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
 
-       if (prio == TASK_BACKGROUND_APPLICATION) {
+       if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
+           prio == TASK_DARWINBG_APPLICATION) {
                ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
        } else if (prio == TASK_FOREGROUND_APPLICATION) {
                ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
@@ -6126,8 +6291,8 @@ mptcp_ask_symptoms(struct mptses *mpte)
        err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
            &ask, sizeof(ask), CTL_DATA_EOR);
 
-       os_log_debug(mptcp_log_handle, "%s asked symptoms about pid %u, prio %u, err %d\n",
-           __func__, pid, ask.priority, err);
+       os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);
 
 
        proc_rele(p);
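
The widened background classification above is easy to misread at a glance. A tiny sketch of the mapping, using hypothetical enum values for the task-policy roles and an assumed "unknown" fallthrough (the final else is outside this hunk):

#include <stdio.h>

/* Hypothetical stand-ins for the task-policy roles and Symptoms levels. */
enum task_role { BG_APP, NONUI_APP, DARWINBG_APP, FG_APP, OTHER };
enum symptoms_prio {
	SYMPTOMS_BACKGROUND = 1,
	SYMPTOMS_FOREGROUND,
	SYMPTOMS_UNKNOWN
};

static enum symptoms_prio
classify(enum task_role prio)
{
	/* All three non-foreground roles now count as background. */
	if (prio == BG_APP || prio == NONUI_APP || prio == DARWINBG_APP) {
		return SYMPTOMS_BACKGROUND;
	} else if (prio == FG_APP) {
		return SYMPTOMS_FOREGROUND;
	}
	return SYMPTOMS_UNKNOWN; /* assumed fallthrough, not shown in the hunk */
}

int
main(void)
{
	printf("nonui -> %d\n", classify(NONUI_APP));
	printf("fg    -> %d\n", classify(FG_APP));
	return 0;
}
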
@@ -6152,7 +6317,7 @@ mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
        symptoms_advisory_t *sa = NULL;
 
        if (kcunit != mptcp_kern_skt_unit) {
-               os_log_error(mptcp_log_handle, "%s kcunit %u is different from expected one %u\n",
+               os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
                    __func__, kcunit, mptcp_kern_skt_unit);
        }
 
@@ -6170,46 +6335,39 @@ mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
 
        sa = mbuf_data(m);
 
-       if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
-           sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
-               uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status;
-
-               mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n",
-                   __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
+       if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
+               os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new,old: %d,%d\n", __func__,
+                   sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
+                   sa->sa_cell_status, mptcp_advisory.sa_cell_status);
 
-               if ((sa->sa_wifi_status &
-                   (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
-                   (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) {
+               if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
                        mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
-               }
-
-               if (old_wifi_status != mptcp_advisory.sa_wifi_status) {
                        mptcp_wifi_status_changed();
                }
-       } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) {
-               mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__,
-                   mptcp_advisory.sa_wifi_status),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
-       } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) {
-               uuid_t uuid;
+       } else {
+               struct mptcp_symptoms_answer answer;
                errno_t err;
 
-               if (mbuf_len(m) < sizeof(uuid_t) + sizeof(*sa)) {
-                       os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
-                           __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa));
+               /* We temporarily allow different sizes for ease of submission */
+               if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
+                   mbuf_len(m) != sizeof(answer)) {
+                       os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
+                           __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
+                           sizeof(answer));
                        mbuf_free(m);
                        return EINVAL;
                }
 
-               err = mbuf_copydata(m, sizeof(*sa), sizeof(uuid_t), uuid);
+               memset(&answer, 0, sizeof(answer));
+
+               err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
                if (err) {
                        os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
                        mbuf_free(m);
                        return err;
                }
 
-               mptcp_allow_uuid(uuid);
+               mptcp_allow_uuid(answer.uuid, answer.rssi);
        }
 
        mbuf_freem(m);
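
Since the handler now accepts two wire formats, here is a minimal sketch of the size-discriminated parse, assuming a flat buffer instead of an mbuf; the real struct mptcp_symptoms_answer layout is not visible in this hunk, so the sketch assumes an advisory header followed by the uuid and a 32-bit rssi:

#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

typedef uint8_t uuid_t[16];

/* Hypothetical layout standing in for struct mptcp_symptoms_answer. */
struct answer {
	uint8_t  sa[4];   /* stand-in for symptoms_advisory_t */
	uuid_t   uuid;
	int32_t  rssi;
};

static int
parse(const void *buf, size_t len, struct answer *out)
{
	/* Accept either the old uuid-only payload or the full answer. */
	if (len != sizeof(uuid_t) + sizeof(out->sa) && len != sizeof(*out)) {
		return EINVAL;
	}

	memset(out, 0, sizeof(*out));
	memcpy(out, buf, len);   /* a short copy leaves rssi at 0 */
	return 0;
}

int
main(void)
{
	struct answer a, in = { .rssi = -60 };

	printf("full:  err %d rssi %d\n", parse(&in, sizeof(in), &a), a.rssi);
	printf("short: err %d rssi %d\n",
	    parse(&in, sizeof(uuid_t) + sizeof(in.sa), &a), a.rssi);
	return 0;
}
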
@@ -6237,14 +6395,14 @@ mptcp_control_register(void)
  * Three return-values:
  * 1  : WiFi is bad
  * 0  : WiFi is good
- * -1 : WiFi-state is unknown, use subflow-only heuristics
+ * -1 : WiFi-state is unknown
  */
 int
-mptcp_is_wifi_unusable(struct mptses *mpte)
+mptcp_is_wifi_unusable_for_session(struct mptses *mpte)
 {
        if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
                if (mptcp_advisory.sa_wifi_status) {
-                       return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0;
+                       return symptoms_is_wifi_lossy() ? 1 : 0;
                }
 
                /*
@@ -6252,23 +6410,39 @@ mptcp_is_wifi_unusable(struct mptses *mpte)
                 * about the Wi-Fi state, let's be pessimistic.
                 */
                return -1;
-       }
+       } else {
+               if (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) {
+                       return 1;
+               }
 
-       return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0;
-}
+               /*
+                * If we are in target-based mode, we allow ourselves to be
+                * more lax about declaring Wi-Fi "unusable". We only *know*
+                * about the state once we got the allowance from Symptoms
+                * (MPTE_ACCESS_GRANTED).
+                *
+                * If the RSSI is good enough, MPTE_CELL_PROHIBITED will then
+                * be set as well.
+                *
+                * In any other case (while in target-mode), consider Wi-Fi
+                * bad and we are going to ask for allowance from Symptoms
+                * anyway.
+                */
+               if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
+                       if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
+                           mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
+                               return 0;
+                       }
 
-boolean_t
-mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts)
-{
-       struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
-       int fail_thresh = mptcp_fail_thresh;
+                       return 1;
+               }
 
-       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
-               fail_thresh *= 2;
+               return 0;
        }
+}
 
-       return tp->t_rxtshift >= fail_thresh &&
-              (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
+boolean_t
+symptoms_is_wifi_lossy(void)
+{
+       return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
 }
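
The reshaped function above now encodes a small decision table. A user-space sketch of it, with hypothetical flag constants standing in for the kernel's, may make the target-based branch easier to audit:

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-ins for the kernel flag bits. */
#define WIFI_OK          0x01
#define WIFI_BAD         0x02
#define FIRSTPARTY       0x01
#define ACCESS_GRANTED   0x02
#define CELL_PROHIBITED  0x04

/* 1: Wi-Fi bad, 0: Wi-Fi good, -1: unknown (first party, no advisory). */
static int
wifi_unusable(unsigned sess, unsigned advisory, bool target_based)
{
	if (sess & FIRSTPARTY) {
		if (advisory) {
			return (advisory & WIFI_OK) ? 0 : 1; /* lossy unless OK */
		}
		return -1;
	}

	if (advisory & WIFI_BAD) {
		return 1;
	}

	if (target_based) {
		/* Only trust Wi-Fi once Symptoms granted access and the
		 * RSSI was good enough to prohibit cell. */
		if ((sess & ACCESS_GRANTED) && (sess & CELL_PROHIBITED)) {
			return 0;
		}
		return 1; /* pessimistic: ask Symptoms for allowance */
	}

	return 0;
}

int
main(void)
{
	printf("%d\n", wifi_unusable(FIRSTPARTY, 0, false));           /* -1 */
	printf("%d\n", wifi_unusable(0, WIFI_BAD, false));             /*  1 */
	printf("%d\n", wifi_unusable(ACCESS_GRANTED | CELL_PROHIBITED,
	    0, true));                                                 /*  0 */
	return 0;
}
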
 
 /* If TFO data is successfully acked, it must be dropped from the mptcp so */
@@ -6308,10 +6482,6 @@ mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
                        VERIFY(mp_so->so_snd.sb_mb != NULL);
                        sbdrop(&mp_so->so_snd, (int)mp_droplen);
                }
-               mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %d mptcp len %d\n",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mpts->mpts_connid, tcp_droplen, mp_droplen),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
        }
 }
 
@@ -6352,8 +6522,8 @@ mptcp_post_event(u_int32_t event_code, int value)
        return kev_post_msg(&ev_msg);
 }
 
-void
-mptcp_set_cellicon(struct mptses *mpte)
+static void
+mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
 {
        int error;
 
@@ -6362,54 +6532,124 @@ mptcp_set_cellicon(struct mptses *mpte)
                return;
        }
 
-       /* Remember the last time we set the cellicon (see mptcp_unset_cellicon) */
-       mptcp_last_cellicon_set = tcp_now;
+       /* Subflow is disappearing - don't set the icon on this one */
+       if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
+               return;
+       }
+
+       /* Remember the last time we set the cellicon. Needed for debouncing */
+       mpte->mpte_last_cellicon_set = tcp_now;
+
+       if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
+           mpte->mpte_cellicon_increments != 0) {
+               if (mptcp_cellicon_refcount == 0) {
+                       os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
+
+                       /* Continue, so that the icon gets set... */
+               } else {
+                       /*
+                        * In this case, the cellicon is already set. No need to bump it
+                        * even higher
+                        */
+
+                       return;
+               }
+       }
+
+       /* When tearing down this subflow, we need to decrement the
+        * reference counter
+        */
+       mpts->mpts_flags |= MPTSF_CELLICON_SET;
+
+       /* Bump this counter, so that when a session gets destroyed we
+        * decrement the global reference counter by whatever is left
+        */
+       mpte->mpte_cellicon_increments++;
 
-       /* If cellicon is already set, get out of here! */
-       if (OSTestAndSet(7, &mptcp_cellicon_is_set)) {
+       if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
+               /* If cellicon is already set, get out of here! */
                return;
        }
 
        error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
 
        if (error) {
-               mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n",
-                   __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
        } else {
-               mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+               os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
        }
 }
 
 void
-mptcp_unset_cellicon(void)
+mptcp_clear_cellicon(void)
 {
-       int error;
+       int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
+
+       if (error) {
+               os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
+                   __func__, error);
+       } else {
+               os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
+                   __func__);
+       }
+}
+
+/*
+ * Returns true if the icon has been flipped to WiFi.
+ */
+static boolean_t
+__mptcp_unset_cellicon(long val)
+{
+       if (OSAddAtomic(-val, &mptcp_cellicon_refcount) != 1) {
+               return false;
+       }
+
+       mptcp_clear_cellicon();
+
+       return true;
+}
 
-       /* If cellicon is already unset, get out of here! */
-       if (OSTestAndClear(7, &mptcp_cellicon_is_set)) {
+static void
+mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, long val)
+{
+       /* First-party apps (Siri) don't flip the cellicon */
+       if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
                return;
        }
 
-       /*
-        * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
-        * explicitly set the cellicon (see mptcp_set_cellicon()), then we unset
-        * it again.
-        */
-       if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
-           tcp_now)) {
-               OSTestAndSet(7, &mptcp_cellicon_is_set);
+       if (mpte->mpte_cellicon_increments == 0) {
+               /* This flow never used cell - get out of here! */
                return;
        }
 
-       error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
+       if (mptcp_cellicon_refcount == 0) {
+               os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
 
-       if (error) {
-               mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n",
-                   __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
-       } else {
-               mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+               return;
+       }
+
+       if (mpts) {
+               if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
+                       return;
+               }
+
+               mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
+       }
+
+       mpte->mpte_cellicon_increments--;
+
+       if (__mptcp_unset_cellicon(val) == false) {
+               return;
+       }
+
+       /* All flows are gone - our counter should be at zero too! */
+       if (mpte->mpte_cellicon_increments != 0) {
+               os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
        }
 }
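
A minimal sketch of the two-level counting the cellicon now uses, with C11 atomics standing in for OSIncrementAtomic/OSAddAtomic (both return the previous value): the KEV event fires only on the 0 -> 1 and 1 -> 0 edges of the global count, while the per-session increments remember how much to give back when the session dies.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int cellicon_refcount;

static void
post_celluse(int on)
{
	printf("KEV_MPTCP_CELLUSE %d\n", on); /* stand-in for the kernel event */
}

static void
icon_set(unsigned *session_increments)
{
	(*session_increments)++;

	/* Only the 0 -> 1 transition flips the icon on. */
	if (atomic_fetch_add(&cellicon_refcount, 1) == 0) {
		post_celluse(1);
	}
}

static void
icon_unset(unsigned *session_increments, int val)
{
	*session_increments -= val;

	/* Mirrors __mptcp_unset_cellicon(): clear only when the count
	 * was exactly 1 before the subtraction. */
	if (atomic_fetch_sub(&cellicon_refcount, val) == 1) {
		post_celluse(0);
	}
}

int
main(void)
{
	unsigned a = 0, b = 0;

	icon_set(&a);       /* 0 -> 1: icon on */
	icon_set(&b);       /* 1 -> 2: no event */
	icon_unset(&b, 1);  /* 2 -> 1: no event */
	icon_unset(&a, 1);  /* 1 -> 0: icon off */
	return 0;
}
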
 
index b376cea471154c0dffb08677ac86a30db1b1ba00..ac7595aead653fda613e8d6cc77e10e7ab9bb646 100644 (file)
@@ -66,6 +66,8 @@ SYSCTL_INT(_net_inet_mptcp, OID_AUTO, tw, CTLFLAG_RW | CTLFLAG_LOCKED,
 
 #define TIMEVAL_TO_HZ(_tv_)     ((_tv_).tv_sec * hz + (_tv_).tv_usec / hz)
 
+static int mptcp_cancel_urgency_timer(struct mptses *mpte);
+
 static int
 mptcp_timer_demux(struct mptses *mpte, uint32_t now_msecs)
 {
@@ -75,7 +77,6 @@ mptcp_timer_demux(struct mptses *mpte, uint32_t now_msecs)
 
        DTRACE_MPTCP2(timer, struct mptses *, mpte, struct mptcb *, mp_tp);
 
-       mpte_lock_assert_held(mpte);
        switch (mp_tp->mpt_timer_vals) {
        case MPTT_REXMT:
                if (mp_tp->mpt_rxtstart == 0) {
@@ -144,16 +145,15 @@ mptcp_timer(struct mppcbinfo *mppi)
                struct mptses *mpte;
 
                mp_so = mpp->mpp_socket;
-               VERIFY(mp_so != NULL);
                mpte = mptompte(mpp);
-               VERIFY(mpte != NULL);
-               mpte_lock(mpte);
+               socket_lock(mp_so, 1);
+
                VERIFY(mpp->mpp_flags & MPP_ATTACHED);
 
                if (mptcp_timer_demux(mpte, now_msecs)) {
                        resched_timer = 1;
                }
-               mpte_unlock(mpte);
+               socket_unlock(mp_so, 1);
        }
 
        return resched_timer;
@@ -171,7 +171,7 @@ mptcp_start_timer(struct mptses *mpte, int timer_type)
        mptcplog((LOG_DEBUG, "MPTCP Socket: %s: %d\n", __func__, timer_type),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 
-       mpte_lock_assert_held(mpte);
+       socket_lock_assert_owned(mptetoso(mpte));
 
        switch (timer_type) {
        case MPTT_REXMT:
@@ -198,8 +198,7 @@ mptcp_start_timer(struct mptses *mpte, int timer_type)
 void
 mptcp_cancel_timer(struct mptcb *mp_tp, int timer_type)
 {
-       mpte_lock_assert_held(mp_tp->mpt_mpte);
-       DTRACE_MPTCP2(cancel__timer, struct mptcb *, mp_tp, int, timer_type);
+       socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
 
        switch (timer_type) {
        case MPTT_REXMT:
@@ -221,7 +220,113 @@ mptcp_cancel_timer(struct mptcb *mp_tp, int timer_type)
 void
 mptcp_cancel_all_timers(struct mptcb *mp_tp)
 {
+       struct mptses *mpte = mp_tp->mpt_mpte;
+
+       if (mpte->mpte_time_target) {
+               mptcp_cancel_urgency_timer(mpte);
+       }
+
        mptcp_cancel_timer(mp_tp, MPTT_REXMT);
        mptcp_cancel_timer(mp_tp, MPTT_TW);
        mptcp_cancel_timer(mp_tp, MPTT_FASTCLOSE);
 }
+
+static void
+mptcp_urgency_timer_locked(struct mptses *mpte)
+{
+       uint64_t time_now = mach_continuous_time();
+       struct socket *mp_so = mptetoso(mpte);
+
+       VERIFY(mp_so->so_usecount >= 0);
+
+       os_log(mptcp_log_handle, "%s - %lx: timer at %llu now %llu usecount %u\n",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target, time_now, mp_so->so_usecount);
+
+       mptcp_check_subflows_and_add(mpte);
+
+       mp_so->so_usecount--;
+}
+
+static void
+mptcp_urgency_timer(void *param0, __unused void *param1)
+{
+       struct mptses *mpte = (struct mptses *)param0;
+       struct socket *mp_so = mptetoso(mpte);
+
+       socket_lock(mp_so, 1);
+
+       mptcp_urgency_timer_locked(mpte);
+
+       socket_unlock(mp_so, 1);
+}
+
+void
+mptcp_init_urgency_timer(struct mptses *mpte)
+{
+       /* thread_call_allocate never fails */
+       mpte->mpte_time_thread = thread_call_allocate(mptcp_urgency_timer, mpte);
+}
+
+void
+mptcp_set_urgency_timer(struct mptses *mpte)
+{
+       struct socket *mp_so = mptetoso(mpte);
+       uint64_t time_now = 0;
+       boolean_t ret = FALSE;
+
+       socket_lock_assert_owned(mp_so);
+
+       VERIFY(mp_so->so_usecount >= 0);
+       if (mp_so->so_usecount == 0) {
+               goto exit_log;
+       }
+
+       if (mpte->mpte_time_target == 0) {
+               mptcp_cancel_urgency_timer(mpte);
+
+               goto exit_log;
+       }
+
+       time_now = mach_continuous_time();
+
+       if ((int64_t)(mpte->mpte_time_target - time_now) > 0) {
+               mptcp_check_subflows_and_remove(mpte);
+
+               ret = thread_call_enter_delayed_with_leeway(mpte->mpte_time_thread, NULL,
+                   mpte->mpte_time_target, 0, THREAD_CALL_CONTINUOUS);
+
+               if (!ret) {
+                       mp_so->so_usecount++;
+               }
+       } else if ((int64_t)(mpte->mpte_time_target - time_now) <= 0) {
+               mp_so->so_usecount++;
+
+               /* Already passed the deadline, trigger subflows now */
+               mptcp_urgency_timer_locked(mpte);
+       }
+
+exit_log:
+       os_log(mptcp_log_handle, "%s - %lx: timer at %llu now %llu usecount %u ret %u\n",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target, time_now,
+           mp_so->so_usecount, ret);
+}
+
+static int
+mptcp_cancel_urgency_timer(struct mptses *mpte)
+{
+       struct socket *mp_so = mptetoso(mpte);
+       boolean_t ret;
+
+       ret = thread_call_cancel(mpte->mpte_time_thread);
+
+       os_log(mptcp_log_handle, "%s - %lx: Canceled timer thread usecount %u ret %u\n",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->so_usecount, ret);
+
+       mptcp_check_subflows_and_remove(mpte);
+
+       if (ret) {
+               mp_so->so_usecount--;
+       }
+
+       return 0;
+}
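
The urgency timer arms against mach_continuous_time() (note THREAD_CALL_CONTINUOUS above), so the deadline test is done on a signed difference of unsigned timestamps. A small sketch of that wrap-safe comparison, with plain integers standing in for the mach time units:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Wrap-safe "deadline still in the future?" check, as used by
 * mptcp_set_urgency_timer(): cast the difference, not the operands. */
static bool
deadline_pending(uint64_t target, uint64_t now)
{
	return (int64_t)(target - now) > 0;
}

int
main(void)
{
	uint64_t now = 1000;

	printf("%d\n", deadline_pending(1500, now)); /* 1: arm the thread call */
	printf("%d\n", deadline_pending(900, now));  /* 0: fire immediately */
	return 0;
}
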
index cab9306d6a99b7efc1bf8e82c565b23c5599fee8..231cf201a43e3c8f455848e937f2e70da4e81513 100644 (file)
 #define MPT_TIMEWAIT    1       /* timewait timer */
 
 __BEGIN_DECLS
-extern uint32_t mptcp_timer(struct mppcbinfo *);
-extern void mptcp_start_timer(struct mptses *, int);
-extern void mptcp_cancel_timer(struct mptcb *, int);
-extern void mptcp_cancel_all_timers(struct mptcb *);
+extern uint32_t mptcp_timer(struct mppcbinfo *mppi);
+extern void mptcp_start_timer(struct mptses *mpte, int timer_type);
+extern void mptcp_cancel_timer(struct mptcb *mp_tp, int timer_type);
+extern void mptcp_cancel_all_timers(struct mptcb *mp_tp);
+extern void mptcp_init_urgency_timer(struct mptses *mpte);
+extern void mptcp_set_urgency_timer(struct mptses *mpte);
 __END_DECLS
 
 #endif /* BSD_KERNEL_PRIVATE */
index a73d3339f0aa490e6ab40414e3732574aab8f404..a47b8a51226b016d51f508222416350dccf3b04a 100644 (file)
@@ -78,7 +78,6 @@ static int mptcp_usr_shutdown(struct socket *);
 static int mptcp_usr_sosend(struct socket *, struct sockaddr *, struct uio *,
     struct mbuf *, struct mbuf *, int);
 static int mptcp_usr_socheckopt(struct socket *, struct sockopt *);
-static int mptcp_default_tcp_optval(struct mptses *, struct sockopt *, int *);
 static int mptcp_usr_preconnect(struct socket *so);
 
 struct pr_usrreqs mptcp_usrreqs = {
@@ -110,6 +109,10 @@ int mptcp_developer_mode = 0;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, allow_aggregate, CTLFLAG_RW | CTLFLAG_LOCKED,
     &mptcp_developer_mode, 0, "Allow the Multipath aggregation mode");
 
+static unsigned long mptcp_expected_progress_headstart = 5000;
+SYSCTL_ULONG(_net_inet_mptcp, OID_AUTO, expected_progress_headstart, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &mptcp_expected_progress_headstart, "Headstart to give MPTCP before meeting the progress deadline");
+
 
 /*
  * Attaches an MPTCP control block to a socket.
@@ -148,9 +151,9 @@ mptcp_usr_detach(struct socket *mp_so)
        struct mppcb *mpp = mpsotomppcb(mp_so);
 
        if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
-               mptcplog((LOG_ERR, "%s state: %d\n", __func__,
-                   mpp ? mpp->mpp_state : -1),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: state: %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                   mpp ? mpp->mpp_state : -1);
                return EINVAL;
        }
 
@@ -199,22 +202,20 @@ mptcp_attach(struct socket *mp_so, struct proc *p)
        }
 
        /*
-        * MPTCP socket buffers cannot be compressed, due to the
+        * MPTCP send-socket buffers cannot be compressed, due to the
         * fact that each mbuf chained via m_next is a M_PKTHDR
         * which carries some MPTCP metadata.
         */
        mp_so->so_snd.sb_flags |= SB_NOCOMPRESS;
-       mp_so->so_rcv.sb_flags |= SB_NOCOMPRESS;
 
        if ((error = mp_pcballoc(mp_so, &mtcbinfo)) != 0) {
                goto out;
        }
 
        mpp = mpsotomppcb(mp_so);
-       VERIFY(mpp != NULL);
        mpte = (struct mptses *)mpp->mpp_pcbe;
-       VERIFY(mpte != NULL);
        mp_tp = mpte->mpte_mptcb;
+
        VERIFY(mp_tp != NULL);
 out:
        return error;
@@ -225,39 +226,57 @@ mptcp_entitlement_check(struct socket *mp_so)
 {
        struct mptses *mpte = mpsotompte(mp_so);
 
-       if (soopt_cred_check(mp_so, PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED, TRUE) == 0) {
+       /* First, check for mptcp_extended without delegation */
+       if (soopt_cred_check(mp_so, PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED, TRUE, FALSE) == 0) {
+               /*
+                * This means the app has the extended entitlement. Thus,
+                * it's a first party app and can run without restrictions.
+                */
+               mpte->mpte_flags |= MPTE_FIRSTPARTY;
+               return 0;
+       }
+
+       /* Now with delegation */
+       if (mp_so->so_flags & SOF_DELEGATED &&
+           soopt_cred_check(mp_so, PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED, TRUE, TRUE) == 0) {
                /*
                 * This means the app has the extended entitlement. Thus,
                 * it's a first party app and can run without restrictions.
                 */
                mpte->mpte_flags |= MPTE_FIRSTPARTY;
-               goto grant;
+               return 0;
        }
 
+       /* Now, take a look at exceptions configured through sysctl */
 #if (DEVELOPMENT || DEBUG)
        if (mptcp_disable_entitlements) {
-               goto grant;
+               return 0;
        }
 #endif
 
-       if (soopt_cred_check(mp_so, PRIV_NET_PRIVILEGED_MULTIPATH, TRUE)) {
-               mptcplog((LOG_NOTICE, "%s Multipath Capability needed\n", __func__),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-               return -1;
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
+               if (mptcp_developer_mode) {
+                       return 0;
+               }
+
+               goto deny;
        }
 
-       if (mpte->mpte_svctype > MPTCP_SVCTYPE_INTERACTIVE &&
-           mptcp_developer_mode == 0) {
-               mptcplog((LOG_NOTICE, "%s need to set allow_aggregate sysctl\n",
-                   __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-               return -1;
+       /* Second, check for regular users that are within the data-limits */
+       if (soopt_cred_check(mp_so, PRIV_NET_PRIVILEGED_MULTIPATH, TRUE, FALSE) == 0) {
+               return 0;
        }
 
-grant:
-       mptcplog((LOG_NOTICE, "%s entitlement granted for %u\n", __func__, mpte->mpte_svctype),
-           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
+       if (mp_so->so_flags & SOF_DELEGATED &&
+           soopt_cred_check(mp_so, PRIV_NET_PRIVILEGED_MULTIPATH, TRUE, TRUE) == 0) {
+               return 0;
+       }
 
-       return 0;
+deny:
+       os_log_error(mptcp_log_handle, "%s - %lx: MPTCP prohibited on svc %u\n",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_svctype);
+
+       return -1;
 }
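
The reordered entitlement logic above checks in a fixed sequence: extended entitlement (directly, then via delegation), the developer sysctl, the aggregate-mode gate, and finally the regular multipath privilege (directly, then via delegation). A compact sketch of that ordering, with hypothetical predicates standing in for soopt_cred_check() results:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical predicates standing in for soopt_cred_check() outcomes. */
struct creds {
	bool extended, extended_delegated;
	bool multipath, multipath_delegated;
	bool delegated_socket;
};

/* 0 = allowed, -1 = denied; *firstparty set on the extended entitlement.
 * Mirrors the check order of mptcp_entitlement_check(). */
static int
entitlement_check(const struct creds *c, bool aggregate, bool developer_mode,
    bool *firstparty)
{
	*firstparty = false;

	if (c->extended ||
	    (c->delegated_socket && c->extended_delegated)) {
		*firstparty = true;
		return 0;
	}

	/* Aggregate mode is gated on the developer sysctl only. */
	if (aggregate) {
		return developer_mode ? 0 : -1;
	}

	if (c->multipath ||
	    (c->delegated_socket && c->multipath_delegated)) {
		return 0;
	}

	return -1;
}

int
main(void)
{
	struct creds c = { .multipath = true };
	bool fp;
	int ret = entitlement_check(&c, false, false, &fp);

	printf("ret %d firstparty %d\n", ret, fp);
	return 0;
}
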
 
 /*
@@ -270,17 +289,11 @@ static int
 mptcp_connectx(struct mptses *mpte, struct sockaddr *src,
     struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
 {
-       struct socket *mp_so = mptetoso(mpte);
        int error = 0;
 
        VERIFY(dst != NULL);
        VERIFY(pcid != NULL);
 
-       mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__,
-           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
-           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-       DTRACE_MPTCP2(connectx, struct mptses *, mpte, struct socket *, mp_so);
-
        error = mptcp_subflow_add(mpte, src, dst, ifscope, pcid);
 
        return error;
@@ -303,22 +316,18 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src,
        int error = 0;
 
        if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
-               mptcplog((LOG_ERR, "%s state %d\n", __func__,
-                   mpp ? mpp->mpp_state : -1),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: state %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                   mpp ? mpp->mpp_state : -1);
                error = EINVAL;
                goto out;
        }
        mpte = mptompte(mpp);
-       VERIFY(mpte != NULL);
-       mpte_lock_assert_held(mpte);
-
        mp_tp = mpte->mpte_mptcb;
-       VERIFY(mp_tp != NULL);
 
        if (mp_tp->mpt_flags &  MPTCPF_FALLBACK_TO_TCP) {
-               mptcplog((LOG_ERR, "%s fell back to TCP\n", __func__),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: fell back to TCP\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
                error = EINVAL;
                goto out;
        }
@@ -330,18 +339,16 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src,
 
        if (dst->sa_family == AF_INET &&
            dst->sa_len != sizeof(mpte->__mpte_dst_v4)) {
-               mptcplog((LOG_ERR, "%s IPv4 dst len %u\n", __func__,
-                   dst->sa_len),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: IPv4 dst len %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), dst->sa_len);
                error = EINVAL;
                goto out;
        }
 
        if (dst->sa_family == AF_INET6 &&
            dst->sa_len != sizeof(mpte->__mpte_dst_v6)) {
-               mptcplog((LOG_ERR, "%s IPv6 dst len %u\n", __func__,
-                   dst->sa_len),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: IPv6 dst len %u\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), dst->sa_len);
                error = EINVAL;
                goto out;
        }
@@ -356,7 +363,7 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src,
        }
 
        if ((mp_so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) {
-               memcpy(&mpte->mpte_dst, dst, dst->sa_len);
+               memcpy(&mpte->mpte_u_dst, dst, dst->sa_len);
        }
 
        if (src) {
@@ -367,24 +374,22 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src,
 
                if (src->sa_family == AF_INET &&
                    src->sa_len != sizeof(mpte->__mpte_src_v4)) {
-                       mptcplog((LOG_ERR, "%s IPv4 src len %u\n", __func__,
-                           src->sa_len),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+                       os_log_error(mptcp_log_handle, "%s - %lx: IPv4 src len %u\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), src->sa_len);
                        error = EINVAL;
                        goto out;
                }
 
                if (src->sa_family == AF_INET6 &&
                    src->sa_len != sizeof(mpte->__mpte_src_v6)) {
-                       mptcplog((LOG_ERR, "%s IPv6 src len %u\n", __func__,
-                           src->sa_len),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+                       os_log_error(mptcp_log_handle, "%s - %lx: IPv6 src len %u\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), src->sa_len);
                        error = EINVAL;
                        goto out;
                }
 
                if ((mp_so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) {
-                       memcpy(&mpte->mpte_src, src, src->sa_len);
+                       memcpy(&mpte->mpte_u_src, src, src->sa_len);
                }
        }
 
@@ -418,8 +423,6 @@ out:
 static int
 mptcp_getassocids(struct mptses *mpte, uint32_t *cnt, user_addr_t aidp)
 {
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-
        /* MPTCP has at most 1 association */
        *cnt = (mpte->mpte_associd != SAE_ASSOCID_ANY) ? 1 : 0;
 
@@ -442,8 +445,6 @@ mptcp_getconnids(struct mptses *mpte, sae_associd_t aid, uint32_t *cnt,
        struct mptsub *mpts;
        int error = 0;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-
        if (aid != SAE_ASSOCID_ANY && aid != SAE_ASSOCID_ALL &&
            aid != mpte->mpte_associd) {
                return EINVAL;
@@ -477,20 +478,17 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags,
     user_addr_t dst, socklen_t *dst_len, uint32_t *aux_type,
     user_addr_t aux_data, uint32_t *aux_len)
 {
-       struct socket *so;
-       struct inpcb *inp;
-       struct mptsub *mpts;
-       int error = 0;
-
        *flags = 0;
        *aux_type = 0;
        *ifindex = 0;
        *soerror = 0;
 
+       /* MPTCP-level global stats */
        if (*cid == SAE_CONNID_ALL) {
                struct socket *mp_so = mptetoso(mpte);
                struct mptcb *mp_tp = mpte->mpte_mptcb;
                struct conninfo_multipathtcp mptcp_ci;
+               int error = 0;
 
                if (*aux_len != 0 && *aux_len != sizeof(mptcp_ci)) {
                        return EINVAL;
@@ -522,8 +520,9 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags,
                *aux_len = sizeof(mptcp_ci);
 
                if (aux_data != USER_ADDR_NULL) {
-                       unsigned long i = 0;
+                       const struct mptsub *mpts;
                        int initial_info_set = 0;
+                       unsigned long i = 0;
 
                        bzero(&mptcp_ci, sizeof(mptcp_ci));
                        mptcp_ci.mptcpci_subflow_count = mpte->mpte_numflows;
@@ -539,6 +538,8 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags,
                                mptcp_ci.mptcpci_subflow_connids[i] = mpts->mpts_connid;
 
                                if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
+                                       const struct inpcb *inp;
+
                                        inp = sotoinpcb(mpts->mpts_socket);
 
                                        mptcp_ci.mptcpci_init_rxbytes = inp->inp_stat->rxbytes;
@@ -562,9 +563,8 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags,
 
                        error = copyout(&mptcp_ci, aux_data, sizeof(mptcp_ci));
                        if (error != 0) {
-                               mptcplog((LOG_ERR, "%s copyout failed: %d\n",
-                                   __func__, error),
-                                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+                               os_log_error(mptcp_log_handle, "%s - %lx: copyout failed: %d\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
                                return error;
                        }
                }
@@ -572,51 +572,221 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags,
                return 0;
        }
 
-       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
-               if (mpts->mpts_connid == *cid || *cid == SAE_CONNID_ANY) {
-                       break;
+       /* SAE_CONNID_ANY: stats of an arbitrary subflow */
+       if (*cid == SAE_CONNID_ANY) {
+               const struct mptsub *mpts;
+               struct socket *so;
+               const struct inpcb *inp;
+               int error = 0;
+
+               mpts = TAILQ_FIRST(&mpte->mpte_subflows);
+               if (mpts == NULL) {
+                       return ENXIO;
                }
-       }
-       if (mpts == NULL) {
-               return (*cid == SAE_CONNID_ANY) ? ENXIO : EINVAL;
-       }
 
-       so = mpts->mpts_socket;
-       inp = sotoinpcb(so);
+               so = mpts->mpts_socket;
+               inp = sotoinpcb(so);
+
+               if (inp->inp_vflag & INP_IPV4) {
+                       error = in_getconninfo(so, SAE_CONNID_ANY, flags, ifindex,
+                           soerror, src, src_len, dst, dst_len,
+                           aux_type, aux_data, aux_len);
+               } else {
+                       error = in6_getconninfo(so, SAE_CONNID_ANY, flags, ifindex,
+                           soerror, src, src_len, dst, dst_len,
+                           aux_type, aux_data, aux_len);
+               }
+
+               if (error != 0) {
+                       os_log_error(mptcp_log_handle, "%s - %lx: error from in_getconninfo %d\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
+                       return error;
+               }
+
+               if (mpts->mpts_flags & MPTSF_MP_CAPABLE) {
+                       *flags |= CIF_MP_CAPABLE;
+               }
+               if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
+                       *flags |= CIF_MP_DEGRADED;
+               }
+               if (mpts->mpts_flags & MPTSF_MP_READY) {
+                       *flags |= CIF_MP_READY;
+               }
+               if (mpts->mpts_flags & MPTSF_ACTIVE) {
+                       *flags |= CIF_MP_ACTIVE;
+               }
 
-       if (inp->inp_vflag & INP_IPV4) {
-               error = in_getconninfo(so, SAE_CONNID_ANY, flags, ifindex,
-                   soerror, src, src_len, dst, dst_len,
-                   aux_type, aux_data, aux_len);
+               return 0;
        } else {
-               error = in6_getconninfo(so, SAE_CONNID_ANY, flags, ifindex,
-                   soerror, src, src_len, dst, dst_len,
-                   aux_type, aux_data, aux_len);
-       }
+               /* Per-interface stats */
+               const struct mptsub *mpts, *orig_mpts;
+               struct conninfo_tcp tcp_ci;
+               const struct inpcb *inp;
+               struct socket *so;
+               int error = 0;
+               int index;
 
-       if (error != 0) {
-               mptcplog((LOG_ERR, "%s error from in_getconninfo %d\n",
-                   __func__, error),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
-               return error;
-       }
+               bzero(&tcp_ci, sizeof(tcp_ci));
 
-       if (mpts->mpts_flags & MPTSF_MP_CAPABLE) {
-               *flags |= CIF_MP_CAPABLE;
-       }
-       if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
-               *flags |= CIF_MP_DEGRADED;
-       }
-       if (mpts->mpts_flags & MPTSF_MP_READY) {
-               *flags |= CIF_MP_READY;
-       }
-       if (mpts->mpts_flags & MPTSF_ACTIVE) {
-               *flags |= CIF_MP_ACTIVE;
-       }
+               /* First, get a subflow to fill in the "regular" info. */
+               TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+                       const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
 
-       mptcplog((LOG_DEBUG, "%s: cid %d flags %x \n", __func__,
-           mpts->mpts_connid, mpts->mpts_flags),
-           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+                       if (ifp && ifp->if_index == *cid) {
+                               break;
+                       }
+               }
+
+               if (mpts == NULL) {
+                       /* No subflow there - well, let's just get the basic itf-info */
+                       goto interface_info;
+               }
+
+               so = mpts->mpts_socket;
+               inp = sotoinpcb(so);
+
+               /* Give it USER_ADDR_NULL, because we are doing this on our own */
+               if (inp->inp_vflag & INP_IPV4) {
+                       error = in_getconninfo(so, SAE_CONNID_ANY, flags, ifindex,
+                           soerror, src, src_len, dst, dst_len,
+                           aux_type, USER_ADDR_NULL, aux_len);
+               } else {
+                       error = in6_getconninfo(so, SAE_CONNID_ANY, flags, ifindex,
+                           soerror, src, src_len, dst, dst_len,
+                           aux_type, USER_ADDR_NULL, aux_len);
+               }
+
+               if (error != 0) {
+                       os_log_error(mptcp_log_handle, "%s - %lx: error from in_getconninfo %d\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
+                       return error;
+               }
+
+               /* ToDo: Nobody is reading these flags on subflows. Why bother? */
+               if (mpts->mpts_flags & MPTSF_MP_CAPABLE) {
+                       *flags |= CIF_MP_CAPABLE;
+               }
+               if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
+                       *flags |= CIF_MP_DEGRADED;
+               }
+               if (mpts->mpts_flags & MPTSF_MP_READY) {
+                       *flags |= CIF_MP_READY;
+               }
+               if (mpts->mpts_flags & MPTSF_ACTIVE) {
+                       *flags |= CIF_MP_ACTIVE;
+               }
+
+               /*
+                * Now, we gather the metrics (a.k.a. tcp_info) and roll them
+                * up across all subflows of this interface to build an
+                * aggregated view.
+                *
+                * We take the TCP_INFO from the first subflow as the "master",
+                * which feeds the fields that we do not roll up.
+                */
+               if (aux_data != USER_ADDR_NULL) {
+                       tcp_getconninfo(so, &tcp_ci);
+
+                       orig_mpts = mpts;
+                       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+                               const struct inpcb *mptsinp = sotoinpcb(mpts->mpts_socket);
+                               const struct ifnet *ifp;
+
+                               ifp = mptsinp->inp_last_outifp;
+
+                               if (ifp == NULL || ifp->if_index != *cid || mpts == orig_mpts) {
+                                       continue;
+                               }
+
+                               /* Roll the itf-stats into the tcp_info */
+                               tcp_ci.tcpci_tcp_info.tcpi_txbytes +=
+                                   mptsinp->inp_stat->txbytes;
+                               tcp_ci.tcpci_tcp_info.tcpi_rxbytes +=
+                                   mptsinp->inp_stat->rxbytes;
+
+                               tcp_ci.tcpci_tcp_info.tcpi_wifi_txbytes +=
+                                   mptsinp->inp_wstat->txbytes;
+                               tcp_ci.tcpci_tcp_info.tcpi_wifi_rxbytes +=
+                                   mptsinp->inp_wstat->rxbytes;
+
+                               tcp_ci.tcpci_tcp_info.tcpi_wired_txbytes +=
+                                   mptsinp->inp_Wstat->txbytes;
+                               tcp_ci.tcpci_tcp_info.tcpi_wired_rxbytes +=
+                                   mptsinp->inp_Wstat->rxbytes;
+
+                               tcp_ci.tcpci_tcp_info.tcpi_cell_txbytes +=
+                                   mptsinp->inp_cstat->txbytes;
+                               tcp_ci.tcpci_tcp_info.tcpi_cell_rxbytes +=
+                                   mptsinp->inp_cstat->rxbytes;
+                       }
+               }
+
+interface_info:
+               *aux_type = CIAUX_TCP;
+               if (*aux_len == 0) {
+                       *aux_len = sizeof(tcp_ci);
+               } else if (aux_data != USER_ADDR_NULL) {
+                       boolean_t create;
+
+                       /*
+                        * Finally, old subflows might have been closed - we
+                        * want this data as well, so grab it from the interface
+                        * stats.
+                        */
+                       create = orig_mpts != NULL;
+
+                       /*
+                        * When we found a subflow, we are willing to create a stats-index
+                        * because we have some data to return. If there is neither a
+                        * subflow nor anything in the stats, return EINVAL, because the
+                        * ifindex belongs to something that doesn't exist.
+                        */
+                       index = mptcpstats_get_index_by_ifindex(mpte->mpte_itfstats, *cid, false);
+                       if (index == -1) {
+                               os_log_error(mptcp_log_handle,
+                                   "%s - %lx: Asking for too many ifindex: %u subcount %u, mpts? %s\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                                   *cid, mpte->mpte_numflows,
+                                   orig_mpts ? "yes" : "no");
+
+                               if (orig_mpts == NULL) {
+                                       return EINVAL;
+                               }
+                       } else {
+                               struct mptcp_itf_stats *stats;
+
+                               stats = &mpte->mpte_itfstats[index];
+
+                               /* Roll the itf-stats into the tcp_info */
+                               tcp_ci.tcpci_tcp_info.tcpi_last_outif = *cid;
+                               tcp_ci.tcpci_tcp_info.tcpi_txbytes +=
+                                   stats->mpis_txbytes;
+                               tcp_ci.tcpci_tcp_info.tcpi_rxbytes +=
+                                   stats->mpis_rxbytes;
+
+                               tcp_ci.tcpci_tcp_info.tcpi_wifi_txbytes +=
+                                   stats->mpis_wifi_txbytes;
+                               tcp_ci.tcpci_tcp_info.tcpi_wifi_rxbytes +=
+                                   stats->mpis_wifi_rxbytes;
+
+                               tcp_ci.tcpci_tcp_info.tcpi_wired_txbytes +=
+                                   stats->mpis_wired_txbytes;
+                               tcp_ci.tcpci_tcp_info.tcpi_wired_rxbytes +=
+                                   stats->mpis_wired_rxbytes;
+
+                               tcp_ci.tcpci_tcp_info.tcpi_cell_txbytes +=
+                                   stats->mpis_cell_txbytes;
+                               tcp_ci.tcpci_tcp_info.tcpi_cell_rxbytes +=
+                                   stats->mpis_cell_rxbytes;
+                       }
+
+                       *aux_len = min(*aux_len, sizeof(tcp_ci));
+                       error = copyout(&tcp_ci, aux_data, *aux_len);
+                       if (error != 0) {
+                               return error;
+                       }
+               }
+       }
 
        return 0;
 }
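
The per-interface branch above aggregates live subflows first and then folds in the persisted interface stats for subflows that are already gone. A compact sketch of that two-pass roll-up, over hypothetical counter structs rather than the real tcp_info/mptcp_itf_stats fields:

#include <stdint.h>
#include <stdio.h>

struct counters { uint64_t tx, rx; };

/* Fold src into dst, the way the tcpi_*bytes fields are accumulated. */
static void
roll(struct counters *dst, const struct counters *src)
{
	dst->tx += src->tx;
	dst->rx += src->rx;
}

int
main(void)
{
	/* Live subflows on the queried ifindex, plus the persisted
	 * per-interface stats of subflows that were closed earlier. */
	struct counters live[2] = { { 100, 200 }, { 10, 20 } };
	struct counters closed = { 5, 5 };
	struct counters total = { 0, 0 };

	for (int i = 0; i < 2; i++) {
		roll(&total, &live[i]);
	}
	roll(&total, &closed);

	printf("tx %llu rx %llu\n",
	    (unsigned long long)total.tx, (unsigned long long)total.rx);
	return 0;
}
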
@@ -638,9 +808,6 @@ mptcp_usr_control(struct socket *mp_so, u_long cmd, caddr_t data,
                goto out;
        }
        mpte = mptompte(mpp);
-       VERIFY(mpte != NULL);
-
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
 
        switch (cmd) {
        case SIOCGASSOCIDS32: {         /* struct so_aidreq32 */
@@ -730,15 +897,9 @@ mptcp_disconnect(struct mptses *mpte)
        struct mptcb *mp_tp;
        int error = 0;
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-
        mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
 
-       mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx %d\n", __func__,
-           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_error),
-           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-
        DTRACE_MPTCP3(disconnectx, struct mptses *, mpte,
            struct socket *, mp_so, struct mptcb *, mp_tp);
 
@@ -856,14 +1017,23 @@ mptcp_usr_rcvd(struct socket *mp_so, int flags)
 #pragma unused(flags)
        struct mppcb *mpp = mpsotomppcb(mp_so);
        struct mptses *mpte;
+       struct mptsub *mpts;
        int error = 0;
 
        if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
                error = EINVAL;
                goto out;
        }
+
        mpte = mptompte(mpp);
-       VERIFY(mpte != NULL);
+
+       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+               struct socket *so = mpts->mpts_socket;
+
+               if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
+                       (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);
+               }
+       }
 
        error = mptcp_output(mpte);
 out:
@@ -1096,7 +1266,7 @@ mptcp_usr_sosend(struct socket *mp_so, struct sockaddr *addr, struct uio *uio,
        VERIFY(mp_so->so_type == SOCK_STREAM);
        VERIFY(!(mp_so->so_flags & SOF_MP_SUBFLOW));
 
-       if ((flags & (MSG_OOB | MSG_DONTROUTE | MSG_HOLD | MSG_SEND | MSG_FLUSH)) ||
+       if ((flags & (MSG_OOB | MSG_DONTROUTE)) ||
            (mp_so->so_flags & SOF_ENABLE_MSGS)) {
                error = EOPNOTSUPP;
                socket_unlock(mp_so, 1);
@@ -1253,6 +1423,7 @@ mptcp_usr_socheckopt(struct socket *mp_so, struct sockopt *sopt)
        case SO_NECP_ATTRIBUTES:
        case SO_NECP_CLIENTUUID:
 #endif /* NECP */
+       case SO_MPKL_SEND_INFO:
                /*
                 * Tell the caller that these options are to be processed.
                 */
@@ -1321,7 +1492,6 @@ mptcp_setopt_apply(struct mptses *mpte, struct mptopt *mpo)
                goto out;
        }
 
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        mp_so = mptetoso(mpte);
 
        /*
@@ -1441,18 +1611,34 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
                        rec = 0;
                        break;
 
-                       /* Next ones, record at MPTCP-level */
+               /* Next ones, record at MPTCP-level */
+               case SO_DELEGATED:
+                       error = sooptcopyin(sopt, &mpte->mpte_epid,
+                           sizeof(int), sizeof(int));
+                       if (error != 0) {
+                               goto err_out;
+                       }
+
+                       goto out;
+               case SO_DELEGATED_UUID:
+                       error = sooptcopyin(sopt, &mpte->mpte_euuid,
+                           sizeof(uuid_t), sizeof(uuid_t));
+                       if (error != 0) {
+                               goto err_out;
+                       }
+
+                       goto out;
 #if NECP
                case SO_NECP_CLIENTUUID:
                        if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
                                error = EINVAL;
-                               goto out;
+                               goto err_out;
                        }
 
                        error = sooptcopyin(sopt, &mpsotomppcb(mp_so)->necp_client_uuid,
                            sizeof(uuid_t), sizeof(uuid_t));
                        if (error != 0) {
-                               goto out;
+                               goto err_out;
                        }
 
                        mpsotomppcb(mp_so)->necp_cb = mptcp_session_necp_cb;
@@ -1460,12 +1646,12 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
                            mpsotomppcb(mp_so)->necp_client_uuid,
                            mpsotomppcb(mp_so));
                        if (error) {
-                               goto out;
+                               goto err_out;
                        }
 
                        if (uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
                                error = EINVAL;
-                               goto out;
+                               goto err_out;
                        }
 
                        goto out;
@@ -1494,11 +1680,11 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
                        error = sooptcopyin(sopt, &optval, sizeof(optval),
                            sizeof(optval));
                        if (error) {
-                               goto out;
+                               goto err_out;
                        }
                        if (optval < 0) {
                                error = EINVAL;
-                               goto out;
+                               goto err_out;
                        } else {
                                if (optval == 0) {
                                        mp_so->so_flags &= ~SOF_NOTSENT_LOWAT;
@@ -1508,6 +1694,10 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
                                        error = mptcp_set_notsent_lowat(mpte,
                                            optval);
                                }
+
+                               if (error) {
+                                       goto err_out;
+                               }
                        }
                        goto out;
                case MPTCP_SERVICE_TYPE:
@@ -1515,18 +1705,18 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
                        error = sooptcopyin(sopt, &optval, sizeof(optval),
                            sizeof(optval));
                        if (error) {
-                               goto out;
+                               goto err_out;
                        }
                        if (optval < 0 || optval >= MPTCP_SVCTYPE_MAX) {
                                error = EINVAL;
-                               goto out;
+                               goto err_out;
                        }
 
                        mpte->mpte_svctype = optval;
 
                        if (mptcp_entitlement_check(mp_so) < 0) {
                                error = EACCES;
-                               goto out;
+                               goto err_out;
                        }
 
                        mpte->mpte_flags |= MPTE_SVCTYPE_CHECKED;
@@ -1537,27 +1727,104 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
                        error = sooptcopyin(sopt, &optval, sizeof(optval),
                            sizeof(optval));
                        if (error) {
-                               goto out;
+                               goto err_out;
                        }
 
                        if (optval < 0 || optval > UINT16_MAX) {
                                error = EINVAL;
-                               goto out;
+                               goto err_out;
                        }
 
                        mpte->mpte_alternate_port = optval;
 
                        goto out;
+               case MPTCP_FORCE_ENABLE:
+                       /* record at MPTCP level */
+                       error = sooptcopyin(sopt, &optval, sizeof(optval),
+                           sizeof(optval));
+                       if (error) {
+                               goto err_out;
+                       }
+
+                       if (optval < 0 || optval > 1) {
+                               error = EINVAL;
+                               goto err_out;
+                       }
+
+                       if (optval) {
+                               mpte->mpte_flags |= MPTE_FORCE_ENABLE;
+                       } else {
+                               mpte->mpte_flags &= ~MPTE_FORCE_ENABLE;
+                       }
+
+                       goto out;
+               case MPTCP_EXPECTED_PROGRESS_TARGET:
+               {
+                       struct mptcb *mp_tp = mpte->mpte_mptcb;
+                       uint64_t mach_time_target;
+                       uint64_t nanoseconds;
+
+                       if (mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
+                               os_log(mptcp_log_handle, "%s - %lx: Can't set urgent activity when svctype is %u\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_svctype);
+                               error = EINVAL;
+                               goto err_out;
+                       }
+
+                       error = sooptcopyin(sopt, &mach_time_target, sizeof(mach_time_target), sizeof(mach_time_target));
+                       if (error) {
+                               goto err_out;
+                       }
+
+                       if (!mptcp_ok_to_create_subflows(mp_tp)) {
+                               os_log(mptcp_log_handle, "%s - %lx: Not ok to create subflows, state %u flags %#x\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
+                               error = EINVAL;
+                               goto err_out;
+                       }
+
+                       if (mach_time_target) {
+                               uint64_t time_now = 0;
+                               uint64_t time_now_nanoseconds;
+
+                               absolutetime_to_nanoseconds(mach_time_target, &nanoseconds);
+                               nanoseconds = nanoseconds - (mptcp_expected_progress_headstart * NSEC_PER_MSEC);
+
+                               time_now = mach_continuous_time();
+                               absolutetime_to_nanoseconds(time_now, &time_now_nanoseconds);
+
+                               nanoseconds_to_absolutetime(nanoseconds, &mach_time_target);
+                               /* If the timer is already running and it would
+                                * fire in less than mptcp_expected_progress_headstart
+                        * milliseconds, then it's not worth canceling it.
+                                */
+                               if (mpte->mpte_time_target &&
+                                   mpte->mpte_time_target < time_now &&
+                                   time_now_nanoseconds > nanoseconds - (mptcp_expected_progress_headstart * NSEC_PER_MSEC)) {
+                                       os_log(mptcp_log_handle, "%s - %lx: Not rescheduling timer %llu now %llu target %llu\n",
+                                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                                           mpte->mpte_time_target,
+                                           time_now,
+                                           mach_time_target);
+                                       goto out;
+                               }
+                       }
+
+                       mpte->mpte_time_target = mach_time_target;
+                       mptcp_set_urgency_timer(mpte);
+
+                       goto out;
+               }
                default:
                        /* not eligible */
                        error = ENOPROTOOPT;
-                       goto out;
+                       goto err_out;
                }
        }
 
        if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
            sizeof(optval))) != 0) {
-               goto out;
+               goto err_out;
        }
 
        if (rec) {
@@ -1568,14 +1835,8 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
 
                if (mpo == NULL) {
                        error = ENOBUFS;
+                       goto err_out;
                } else {
-                       mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s val %d %s\n",
-                           __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                           mptcp_sopt2str(level, optname), optval,
-                           (mpo->mpo_flags & MPOF_ATTACHED) ?
-                           "updated" : "recorded"),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-
                        /* initialize or update, as needed */
                        mpo->mpo_intval = optval;
                        if (!(mpo->mpo_flags & MPOF_ATTACHED)) {
@@ -1596,34 +1857,154 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
        }
 
        /* issue this socket option on existing subflows */
-       if (error == 0) {
-               error = mptcp_setopt_apply(mpte, mpo);
-               if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) {
-                       VERIFY(mpo != &smpo);
-                       mptcp_sopt_remove(mpte, mpo);
-                       mptcp_sopt_free(mpo);
-               }
-               if (mpo == &smpo) {
-                       mpo->mpo_flags &= ~MPOF_INTERIM;
-               }
+       error = mptcp_setopt_apply(mpte, mpo);
+       if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) {
+               VERIFY(mpo != &smpo);
+               mptcp_sopt_remove(mpte, mpo);
+               mptcp_sopt_free(mpo);
        }
-out:
-       if (error == 0 && mpo != NULL) {
-               mptcplog((LOG_INFO, "%s:  mp_so 0x%llx sopt %s val %d set %s\n",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mptcp_sopt2str(level, optname), optval,
-                   (mpo->mpo_flags & MPOF_INTERIM) ?
-                   "pending" : "successful"),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-       } else if (error != 0) {
-               mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s (%d, %d) val %d can't be issued error %d\n",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mptcp_sopt2str(level, optname), level, optname, optval, error),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+       if (mpo == &smpo) {
+               mpo->mpo_flags &= ~MPOF_INTERIM;
        }
+
+       if (error) {
+               goto err_out;
+       }
+
+out:
+
+       return 0;
+
+err_out:
+       os_log_error(mptcp_log_handle, "%s - %lx: sopt %s (%d, %d) val %d can't be issued error %d\n",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+           mptcp_sopt2str(level, optname), level, optname, optval, error);
        return error;
 }
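
The reworked mptcp_setopt() above funnels every failure through a single
err_out label that logs once with full context via os_log_error, instead of
sprinkling log calls at each check. A minimal standalone sketch of that
single-exit idiom, with hypothetical names and a plain fprintf standing in
for the kernel logger:

    #include <errno.h>
    #include <stdio.h>

    static int
    set_option(int level, int optname, int optval)
    {
        int error = 0;

        if (optval < 0) {
            error = EINVAL;
            goto err_out;
        }
        /* ... every later check also jumps to err_out on failure ... */
        return 0;

    err_out:
        /* one log site covers all failure paths */
        fprintf(stderr, "sopt (%d, %d) val %d can't be issued, error %d\n",
            level, optname, optval, error);
        return error;
    }

    int
    main(void)
    {
        return set_option(6 /* IPPROTO_TCP */, 0x217, -1) == EINVAL ? 0 : 1;
    }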
 
+static void
+mptcp_fill_info_bytestats(struct tcp_info *ti, struct mptses *mpte)
+{
+       struct mptsub *mpts;
+       int i;
+
+       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+               const struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
+
+               if (inp == NULL) {
+                       continue;
+               }
+
+               ti->tcpi_txbytes += inp->inp_stat->txbytes;
+               ti->tcpi_rxbytes += inp->inp_stat->rxbytes;
+               ti->tcpi_cell_txbytes += inp->inp_cstat->txbytes;
+               ti->tcpi_cell_rxbytes += inp->inp_cstat->rxbytes;
+               ti->tcpi_wifi_txbytes += inp->inp_wstat->txbytes;
+               ti->tcpi_wifi_rxbytes += inp->inp_wstat->rxbytes;
+               ti->tcpi_wired_txbytes += inp->inp_Wstat->txbytes;
+               ti->tcpi_wired_rxbytes += inp->inp_Wstat->rxbytes;
+       }
+
+       for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
+               struct mptcp_itf_stats *stats = &mpte->mpte_itfstats[i];
+
+               ti->tcpi_txbytes += stats->mpis_txbytes;
+               ti->tcpi_rxbytes += stats->mpis_rxbytes;
+
+               ti->tcpi_wifi_txbytes += stats->mpis_wifi_txbytes;
+               ti->tcpi_wifi_rxbytes += stats->mpis_wifi_rxbytes;
+
+               ti->tcpi_wired_txbytes += stats->mpis_wired_txbytes;
+               ti->tcpi_wired_rxbytes += stats->mpis_wired_rxbytes;
+
+               ti->tcpi_cell_txbytes += stats->mpis_cell_txbytes;
+               ti->tcpi_cell_rxbytes += stats->mpis_cell_rxbytes;
+       }
+}
+
+static void
+mptcp_fill_info(struct mptses *mpte, struct tcp_info *ti)
+{
+       struct mptsub *actsub = mpte->mpte_active_sub;
+       struct mptcb *mp_tp = mpte->mpte_mptcb;
+       struct tcpcb *acttp = NULL;
+
+       if (actsub) {
+               acttp = sototcpcb(actsub->mpts_socket);
+       }
+
+       bzero(ti, sizeof(*ti));
+
+       ti->tcpi_state = mp_tp->mpt_state;
+       /* tcpi_options */
+       /* tcpi_snd_wscale */
+       /* tcpi_rcv_wscale */
+       /* tcpi_flags */
+       if (acttp) {
+               ti->tcpi_rto = acttp->t_timer[TCPT_REXMT] ? acttp->t_rxtcur : 0;
+       }
+
+       /* tcpi_snd_mss */
+       /* tcpi_rcv_mss */
+       if (acttp) {
+               ti->tcpi_rttcur = acttp->t_rttcur;
+               ti->tcpi_srtt = acttp->t_srtt >> TCP_RTT_SHIFT;
+               ti->tcpi_rttvar = acttp->t_rttvar >> TCP_RTTVAR_SHIFT;
+               ti->tcpi_rttbest = acttp->t_rttbest >> TCP_RTT_SHIFT;
+       }
+       /* tcpi_snd_ssthresh */
+       /* tcpi_snd_cwnd */
+       /* tcpi_rcv_space */
+       ti->tcpi_snd_wnd = mp_tp->mpt_sndwnd;
+       ti->tcpi_snd_nxt = mp_tp->mpt_sndnxt;
+       ti->tcpi_rcv_nxt = mp_tp->mpt_rcvnxt;
+       if (acttp) {
+               ti->tcpi_last_outif = (acttp->t_inpcb->inp_last_outifp == NULL) ? 0 :
+                   acttp->t_inpcb->inp_last_outifp->if_index;
+       }
+
+       mptcp_fill_info_bytestats(ti, mpte);
+       /* tcpi_txpackets */
+
+       /* tcpi_txretransmitbytes */
+       /* tcpi_txunacked */
+       /* tcpi_rxpackets */
+
+       /* tcpi_rxduplicatebytes */
+       /* tcpi_rxoutoforderbytes */
+       /* tcpi_snd_bw */
+       /* tcpi_synrexmits */
+       /* tcpi_unused1 */
+       /* tcpi_unused2 */
+       /* tcpi_cell_rxpackets */
+
+       /* tcpi_cell_txpackets */
+
+       /* tcpi_wifi_rxpackets */
+
+       /* tcpi_wifi_txpackets */
+
+       /* tcpi_wired_rxpackets */
+       /* tcpi_wired_txpackets */
+       /* tcpi_connstatus */
+       /* TFO-stuff */
+       /* ECN stuff */
+       /* tcpi_ecn_recv_ce */
+       /* tcpi_ecn_recv_cwr */
+       if (acttp) {
+               ti->tcpi_rcvoopack = acttp->t_rcvoopack;
+       }
+       /* tcpi_pawsdrop */
+       /* tcpi_sack_recovery_episode */
+       /* tcpi_reordered_pkts */
+       /* tcpi_dsack_sent */
+       /* tcpi_dsack_recvd */
+       /* tcpi_flowhash */
+       if (acttp) {
+               ti->tcpi_txretransmitpackets = acttp->t_stat.rxmitpkts;
+       }
+}
+
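
mptcp_fill_info() above aggregates per-subflow and per-interface byte counts
into the tcp_info returned for TCP_INFO. A hedged userspace usage sketch,
assuming the private tcp_info definition from tcp.h is visible; per the
header later in this commit, TCP_INFO is a private API and subject to change:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>

    static void
    dump_mptcp_bytestats(int fd)    /* fd: a connected MPTCP socket */
    {
        struct tcp_info ti;
        socklen_t len = sizeof(ti);

        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) != 0)
            return;

        printf("tx %llu rx %llu cell-tx %llu cell-rx %llu\n",
            (unsigned long long)ti.tcpi_txbytes,
            (unsigned long long)ti.tcpi_rxbytes,
            (unsigned long long)ti.tcpi_cell_txbytes,
            (unsigned long long)ti.tcpi_cell_rxbytes);
    }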
 /*
  * Handle SOPT_GET for socket options issued on MP socket.
  */
@@ -1643,6 +2024,9 @@ mptcp_getopt(struct mptses *mpte, struct sockopt *sopt)
        }
 
        switch (sopt->sopt_name) {
+       case PERSIST_TIMEOUT:
+               /* Only case for which we have a non-zero default */
+               optval = tcp_max_persist_timeout;
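+               /* FALLTHROUGH: a recorded value below overrides this default */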
        case TCP_NODELAY:
        case TCP_RXT_FINDROP:
        case TCP_KEEPALIVE:
@@ -1650,100 +2034,58 @@ mptcp_getopt(struct mptses *mpte, struct sockopt *sopt)
        case TCP_KEEPCNT:
        case TCP_CONNECTIONTIMEOUT:
        case TCP_RXT_CONNDROPTIME:
-       case PERSIST_TIMEOUT:
        case TCP_ADAPTIVE_READ_TIMEOUT:
        case TCP_ADAPTIVE_WRITE_TIMEOUT:
-       case TCP_NOTSENT_LOWAT:
-       case MPTCP_SERVICE_TYPE:
-       case MPTCP_ALTERNATE_PORT:
-               /* eligible; get the default value just in case */
-               error = mptcp_default_tcp_optval(mpte, sopt, &optval);
-               break;
-       default:
-               /* not eligible */
-               error = ENOPROTOOPT;
+       {
+               struct mptopt *mpo = mptcp_sopt_find(mpte, sopt);
+
+               if (mpo != NULL) {
+                       optval = mpo->mpo_intval;
+               }
                break;
        }
 
-       switch (sopt->sopt_name) {
+       /* The next ones are stored at the MPTCP level */
        case TCP_NOTSENT_LOWAT:
                if (mptetoso(mpte)->so_flags & SOF_NOTSENT_LOWAT) {
                        optval = mptcp_get_notsent_lowat(mpte);
                } else {
                        optval = 0;
                }
-               goto out;
-       case MPTCP_SERVICE_TYPE:
-               optval = mpte->mpte_svctype;
-               goto out;
-       case MPTCP_ALTERNATE_PORT:
-               optval = mpte->mpte_alternate_port;
-               goto out;
-       }
+               break;
+       case TCP_INFO:
+       {
+               struct tcp_info ti;
 
-       /*
-        * Search for a previously-issued TCP level socket option and
-        * return the recorded option value.  This assumes that the
-        * value did not get modified by the lower layer after it was
-        * issued at setsockopt(2) time.  If not found, we'll return
-        * the default value obtained earlier.
-        */
-       if (error == 0) {
-               struct mptopt *mpo;
+               mptcp_fill_info(mpte, &ti);
+               error = sooptcopyout(sopt, &ti, sizeof(struct tcp_info));
 
-               if ((mpo = mptcp_sopt_find(mpte, sopt)) != NULL) {
-                       optval = mpo->mpo_intval;
-               }
-
-               error = sooptcopyout(sopt, &optval, sizeof(int));
+               goto out;
        }
-out:
-       return error;
-}
-
-/*
- * Return default values for TCP socket options.  Ideally we would query the
- * subflow TCP socket, but that requires creating a subflow socket before
- * connectx(2) time.  To simplify things, just return the default values
- * that we know of.
- */
-static int
-mptcp_default_tcp_optval(struct mptses *mpte, struct sockopt *sopt, int *optval)
-{
-       int error = 0;
-
-       VERIFY(sopt->sopt_level == IPPROTO_TCP);
-       VERIFY(sopt->sopt_dir == SOPT_GET);
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
-
-       /* try to do what tcp_newtcpcb() does */
-       switch (sopt->sopt_name) {
-       case TCP_NODELAY:
-       case TCP_RXT_FINDROP:
-       case TCP_KEEPINTVL:
-       case TCP_KEEPCNT:
-       case TCP_CONNECTIONTIMEOUT:
-       case TCP_RXT_CONNDROPTIME:
-       case TCP_NOTSENT_LOWAT:
-       case TCP_ADAPTIVE_READ_TIMEOUT:
-       case TCP_ADAPTIVE_WRITE_TIMEOUT:
        case MPTCP_SERVICE_TYPE:
-       case MPTCP_ALTERNATE_PORT:
-               *optval = 0;
+               optval = mpte->mpte_svctype;
                break;
-
-       case TCP_KEEPALIVE:
-               *optval = mptcp_subflow_keeptime;
+       case MPTCP_ALTERNATE_PORT:
+               optval = mpte->mpte_alternate_port;
                break;
-
-       case PERSIST_TIMEOUT:
-               *optval = tcp_max_persist_timeout;
+       case MPTCP_FORCE_ENABLE:
+               optval = !!(mpte->mpte_flags & MPTE_FORCE_ENABLE);
                break;
+       case MPTCP_EXPECTED_PROGRESS_TARGET:
+               error = sooptcopyout(sopt, &mpte->mpte_time_target, sizeof(mpte->mpte_time_target));
 
+               goto out;
        default:
+               /* not eligible */
                error = ENOPROTOOPT;
                break;
        }
+
+       if (error == 0) {
+               error = sooptcopyout(sopt, &optval, sizeof(int));
+       }
+
+out:
        return error;
 }
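
A hedged sketch of driving the new MPTCP_FORCE_ENABLE option from userspace;
the 0x217 value comes from the tcp.h hunk later in this commit, and the
descriptor is assumed to be an MPTCP socket:

    #include <sys/socket.h>
    #include <netinet/in.h>

    #ifndef MPTCP_FORCE_ENABLE
    #define MPTCP_FORCE_ENABLE 0x217    /* from this commit's tcp.h */
    #endif

    static int
    force_enable_mptcp(int fd, int on)
    {
        socklen_t len = sizeof(on);

        if (setsockopt(fd, IPPROTO_TCP, MPTCP_FORCE_ENABLE, &on, len) != 0)
            return -1;
        return getsockopt(fd, IPPROTO_TCP, MPTCP_FORCE_ENABLE, &on, &len);
    }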
 
@@ -1764,15 +2106,10 @@ mptcp_ctloutput(struct socket *mp_so, struct sockopt *sopt)
                goto out;
        }
        mpte = mptompte(mpp);
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       socket_lock_assert_owned(mp_so);
 
        /* we only handle socket and TCP-level socket options for MPTCP */
        if (sopt->sopt_level != SOL_SOCKET && sopt->sopt_level != IPPROTO_TCP) {
-               mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                   "%s: mp_so 0x%llx sopt %s level not "
-                   "handled\n", __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mptcp_sopt2str(sopt->sopt_level, sopt->sopt_name)),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
                error = EINVAL;
                goto out;
        }
@@ -1903,6 +2240,10 @@ mptcp_sopt2str(int level, int optname)
                        return "MPTCP_SERVICE_TYPE";
                case MPTCP_ALTERNATE_PORT:
                        return "MPTCP_ALTERNATE_PORT";
+               case MPTCP_FORCE_ENABLE:
+                       return "MPTCP_FORCE_ENABLE";
+               case MPTCP_EXPECTED_PROGRESS_TARGET:
+                       return "MPTCP_EXPECTED_PROGRESS_TARGET";
                }
 
                break;
@@ -1922,14 +2263,11 @@ mptcp_usr_preconnect(struct socket *mp_so)
        int error;
 
        mpte = mptompte(mpp);
-       VERIFY(mpte != NULL);
-       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
 
-       mpts = mptcp_get_subflow(mpte, NULL, NULL);
+       mpts = mptcp_get_subflow(mpte, NULL);
        if (mpts == NULL) {
-               mptcplog((LOG_ERR, "%s: mp_so 0x%llx invalid preconnect ",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s - %lx: invalid preconnect ",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
                return EINVAL;
        }
        mpts->mpts_flags &= ~MPTSF_TFO_REQD;
index 5ae998afc64496dd5fc15044ab95f88eab4081dd..4c9037db7d071d767fe6d5a5f8e9bd74f4339bd8 100644 (file)
@@ -67,32 +67,52 @@ struct mptses {
        sae_associd_t   mpte_associd;           /* MPTCP association ID */
        sae_connid_t    mpte_connid_last;       /* last used connection ID */
 
+       uint64_t        mpte_time_target;
+       thread_call_t   mpte_time_thread;
+
+       uint32_t        mpte_last_cellicon_set;
+       uint32_t        mpte_cellicon_increments;
+
        union {
                /* Source address of initial subflow */
-               struct sockaddr mpte_src;
-               struct sockaddr_in __mpte_src_v4;
-               struct sockaddr_in6 __mpte_src_v6;
-       };
-
+               struct sockaddr _mpte_src;
+               struct sockaddr_in _mpte_src_v4;
+               struct sockaddr_in6 _mpte_src_v6;
+       } mpte_u_src;
+#define mpte_src mpte_u_src._mpte_src
+#define __mpte_src_v4 mpte_u_src._mpte_src_v4
+#define __mpte_src_v6 mpte_u_src._mpte_src_v6
        union {
                /* Destination address of initial subflow */
-               struct sockaddr mpte_dst;
-               struct sockaddr_in __mpte_dst_v4;
-               struct sockaddr_in6 __mpte_dst_v6;
-       };
+               struct sockaddr _mpte_dst;
+               struct sockaddr_in _mpte_dst_v4;
+               struct sockaddr_in6 _mpte_dst_v6;
+       } mpte_u_dst;
+#define mpte_dst mpte_u_dst._mpte_dst
+#define __mpte_dst_v4 mpte_u_dst._mpte_dst_v4
+#define __mpte_dst_v6 mpte_u_dst._mpte_dst_v6
 
        struct sockaddr_in mpte_dst_v4_nat64;
 
+       struct sockaddr_in mpte_dst_unicast_v4;
+       struct sockaddr_in6 mpte_dst_unicast_v6;
+
        uint16_t        mpte_alternate_port;    /* Alternate port for subflow establishment (network-byte-order) */
 
+       int mpte_epid;
+       uuid_t mpte_euuid;
+
        struct mptsub   *mpte_active_sub;       /* ptr to last active subf */
-       uint8_t mpte_flags;                     /* per mptcp session flags */
+       uint16_t mpte_flags;                    /* per mptcp session flags */
 #define MPTE_SND_REM_ADDR       0x01            /* Send Remove_addr option */
 #define MPTE_SVCTYPE_CHECKED    0x02            /* Did entitlement-check for service-type */
 #define MPTE_FIRSTPARTY         0x04            /* First-party app used multipath_extended entitlement */
 #define MPTE_ACCESS_GRANTED     0x08            /* Access to cellular has been granted for this connection */
-#define MPTE_IN_WORKLOOP        0x10            /* Are we currently inside the workloop ? */
-#define MPTE_WORKLOOP_RELAUNCH  0x20            /* Another event got queued, we should restart the workloop */
+#define MPTE_FORCE_ENABLE       0x10            /* Force MPTCP regardless of heuristics to detect middleboxes */
+#define MPTE_IN_WORKLOOP        0x20            /* Are we currently inside the workloop? */
+#define MPTE_WORKLOOP_RELAUNCH  0x40            /* Another event got queued, we should restart the workloop */
+#define MPTE_UNICAST_IP         0x80            /* New subflows are only being established towards the unicast IP in the ADD_ADDR */
+#define MPTE_CELL_PROHIBITED    0x100           /* Cell access has been prohibited based on signal quality */
        uint8_t mpte_svctype;                   /* MPTCP Service type */
        uint8_t mpte_lost_aid;                  /* storing lost address id */
        uint8_t mpte_addrid_last;               /* storing address id parm */
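
mpte_flags grows from uint8_t to uint16_t here because the new
MPTE_CELL_PROHIBITED bit (0x100) no longer fits in 8 bits. A standalone
check of that arithmetic, mirroring the flag value above:

    #include <assert.h>
    #include <stdint.h>

    #define MPTE_CELL_PROHIBITED 0x100  /* mirrors the value above */

    int
    main(void)
    {
        uint16_t flags16 = 0;

        flags16 |= MPTE_CELL_PROHIBITED;
        assert(flags16 & MPTE_CELL_PROHIBITED);      /* fits in 16 bits */
        assert((uint8_t)MPTE_CELL_PROHIBITED == 0);  /* truncated in 8 bits */
        return 0;
    }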
@@ -135,24 +155,6 @@ mpsotompte(struct socket *so)
        return mptompte(mpsotomppcb(so));
 }
 
-static inline void
-mpp_lock_assert_held(struct mppcb *mp)
-{
-#if !MACH_ASSERT
-#pragma unused(mp)
-#endif
-       LCK_MTX_ASSERT(&mp->mpp_lock, LCK_MTX_ASSERT_OWNED);
-}
-
-static inline void
-mpp_lock_assert_notheld(struct mppcb *mp)
-{
-#if !MACH_ASSERT
-#pragma unused(mp)
-#endif
-       LCK_MTX_ASSERT(&mp->mpp_lock, LCK_MTX_ASSERT_NOTOWNED);
-}
-
 static inline boolean_t
 mpp_try_lock(struct mppcb *mp)
 {
@@ -193,42 +195,6 @@ mpp_getlock(struct mppcb *mp, int flags)
        return &mp->mpp_lock;
 }
 
-static inline void
-mpte_lock_assert_held(struct mptses *mpte)
-{
-       mpp_lock_assert_held(mpte->mpte_mppcb);
-}
-
-static inline void
-mpte_lock_assert_notheld(struct mptses *mpte)
-{
-       mpp_lock_assert_notheld(mpte->mpte_mppcb);
-}
-
-static inline boolean_t
-mpte_try_lock(struct mptses *mpte)
-{
-       return mpp_try_lock(mpte->mpte_mppcb);
-}
-
-static inline void
-mpte_lock(struct mptses *mpte)
-{
-       mpp_lock(mpte->mpte_mppcb);
-}
-
-static inline void
-mpte_unlock(struct mptses *mpte)
-{
-       mpp_unlock(mpte->mpte_mppcb);
-}
-
-static inline lck_mtx_t *
-mpte_getlock(struct mptses *mpte, int flags)
-{
-       return mpp_getlock(mpte->mpte_mppcb, flags);
-}
-
 static inline int
 mptcp_subflow_cwnd_space(struct socket *so)
 {
@@ -272,11 +238,13 @@ struct mptsub {
 
        union {
                /* destination address */
-               struct sockaddr         mpts_dst;
-               struct sockaddr_in      __mpts_dst_v4;
-               struct sockaddr_in6     __mpts_dst_v6;
-       };
-
+               struct sockaddr         _mpts_dst;
+               struct sockaddr_in      _mpts_dst_v4;
+               struct sockaddr_in6     _mpts_dst_v6;
+       } mpts_u_dst;
+#define mpts_dst mpts_u_dst._mpts_dst
+#define __mpts_dst_v4 mpts_u_dst._mpts_dst_v4
+#define __mpts_dst_v6 mpts_u_dst._mpts_dst_v6
        u_int32_t               mpts_rel_seq;   /* running count of subflow # */
        u_int32_t               mpts_iss;       /* Initial sequence number, taking TFO into account */
        u_int32_t               mpts_ifscope;   /* scoped to the interface */
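
In both structs the anonymous unions become named members (mpte_u_src,
mpte_u_dst, mpts_u_dst), with macros preserving the old field spellings so
existing call sites keep compiling unchanged. A minimal standalone sketch of
the pattern, using hypothetical field names:

    #include <stdio.h>

    struct endpoint {
        union {
            int  _addr_v4;  /* stand-ins for the sockaddr variants */
            long _addr_v6;
        } u_addr;
    #define addr_v4 u_addr._addr_v4
    #define addr_v6 u_addr._addr_v6
    };

    int
    main(void)
    {
        struct endpoint e = { .u_addr = { ._addr_v4 = 4 } };

        printf("%d\n", e.addr_v4);  /* old spelling still works */
        return 0;
    }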
@@ -335,7 +303,11 @@ struct mptsub {
 #define MPTSF_INITIAL_SUB       0x00040000      /* This is the initial subflow */
 #define MPTSF_READ_STALL        0x00080000      /* A read-stall has been detected */
 #define MPTSF_WRITE_STALL       0x00100000      /* A write-stall has been detected */
-#define MPTSF_CONFIRMED         0x00200000      /* Subflow confirmed to be MPTCP-capable */
+#define MPTSF_FULLY_ESTABLISHED 0x00200000      /* Subflow is fully established and it has been confirmed
+                                                * whether or not it supports MPTCP.
+                                                * No need for further middlebox-detection.
+                                                */
+#define MPTSF_CELLICON_SET      0x00400000      /* This subflow set the cellicon */
 
 #define MPTSF_BITS \
        "\020\1ATTACHED\2CONNECTING\3PENDING\4CONNECTED\5DISCONNECTING" \
@@ -385,8 +357,8 @@ struct mptcp_subf_auth_entry {
 struct mptcb {
        struct mptses   *mpt_mpte;              /* back ptr to MPTCP session */
        mptcp_state_t   mpt_state;              /* MPTCP state */
-       u_int32_t       mpt_flags;              /* see flags below */
-       u_int32_t       mpt_version;            /* MPTCP proto version */
+       uint32_t       mpt_flags;              /* see flags below */
+       uint32_t       mpt_version;            /* MPTCP proto version */
        int             mpt_softerror;          /* error not yet reported */
        /*
         * Authentication and metadata invariants
@@ -401,32 +373,33 @@ struct mptcb {
         * Data ACKs do not.
         */
        int             mpt_rxtshift;           /* num of consecutive retrans */
-       u_int32_t       mpt_rxtstart;           /* time at which rxt started */
-       u_int64_t       mpt_rtseq;              /* seq # being tracked */
-       u_int32_t       mpt_timer_vals;         /* timer related values */
-       u_int32_t       mpt_timewait;           /* timewait */
+       uint32_t        mpt_rxtstart;           /* time at which rxt started */
+       uint64_t        mpt_rtseq;              /* seq # being tracked */
+       uint32_t        mpt_timer_vals;         /* timer related values */
+       uint32_t        mpt_timewait;           /* timewait */
        /*
         * Sending side
         */
-       u_int64_t       mpt_snduna;             /* DSN of last unacked byte */
-       u_int64_t       mpt_sndnxt;             /* DSN of next byte to send */
-       u_int64_t       mpt_sndmax;             /* DSN of max byte sent */
-       u_int64_t       mpt_local_idsn;         /* First byte's DSN */
-       u_int32_t       mpt_sndwnd;
-       u_int64_t       mpt_sndwl1;
-       u_int64_t       mpt_sndwl2;
+       uint64_t        mpt_snduna;             /* DSN of last unacked byte */
+       uint64_t        mpt_sndnxt;             /* DSN of next byte to send */
+       uint64_t        mpt_sndmax;             /* DSN of max byte sent */
+       uint64_t        mpt_local_idsn;         /* First byte's DSN */
+       uint32_t        mpt_sndwnd;
+       uint64_t        mpt_sndwl1;
+       uint64_t        mpt_sndwl2;
        /*
         * Receiving side
         */
-       u_int64_t       mpt_rcvnxt;             /* Next expected DSN */
-       u_int64_t       mpt_remote_idsn;        /* Peer's IDSN */
-       u_int32_t       mpt_rcvwnd;
+       uint64_t        mpt_rcvnxt;             /* Next expected DSN */
+       uint64_t        mpt_remote_idsn;        /* Peer's IDSN */
+       uint32_t        mpt_rcvwnd;
+       uint32_t        mpt_rcvadv;
        LIST_HEAD(, mptcp_subf_auth_entry) mpt_subauth_list; /* address IDs */
        /*
         * Fastclose
         */
-       u_int64_t       mpt_dsn_at_csum_fail;   /* MPFail Opt DSN */
-       u_int32_t       mpt_ssn_at_csum_fail;   /* MPFail Subflow Seq */
+       uint64_t        mpt_dsn_at_csum_fail;   /* MPFail Opt DSN */
+       uint32_t        mpt_ssn_at_csum_fail;   /* MPFail Subflow Seq */
        /*
         * Zombie handling
         */
@@ -434,11 +407,11 @@ struct mptcb {
 #define MPT_GC_TICKS_FAST       (10)
        int32_t         mpt_gc_ticks;           /* Used for zombie deletion */
 
-       u_int32_t       mpt_notsent_lowat;      /* TCP_NOTSENT_LOWAT support */
-       u_int32_t       mpt_peer_version;       /* Version from peer */
+       uint32_t        mpt_notsent_lowat;      /* TCP_NOTSENT_LOWAT support */
+       uint32_t        mpt_peer_version;       /* Version from peer */
 
        struct tsegqe_head      mpt_segq;
-       u_int16_t       mpt_reassqlen;          /* length of reassembly queue */
+       uint16_t        mpt_reassqlen;          /* length of reassembly queue */
 };
 
 /* valid values for mpt_flags (see also notes on mpts_flags above) */
@@ -453,6 +426,7 @@ struct mptcb {
 #define MPTCPF_FALLBACK_HEURISTIC       0x100   /* Send SYN without MP_CAPABLE due to heuristic */
 #define MPTCPF_HEURISTIC_TRAC           0x200   /* Tracked this connection in the heuristics as a failure */
 #define MPTCPF_REASS_INPROG             0x400   /* Reassembly is in progress */
+#define MPTCPF_UNICAST_IP               0x800
 
 #define MPTCPF_BITS \
        "\020\1CHECKSUM\2FALLBACK_TO_TCP\3JOIN_READY\4RECVD_MPFAIL" \
@@ -544,7 +518,7 @@ extern os_log_t mptcp_log_handle;
 #define MPTCP_EXTEND_DSN(x, y, z) {                                     \
        if ((MPTCP_DATASEQ_LOW32(x) > y) &&                             \
            ((((u_int32_t)MPTCP_DATASEQ_LOW32(x)) - (u_int32_t)y) >=    \
-           (u_int32_t)(1 << 31))) {                                    \
+           (u_int32_t)(1U << 31))) {                                    \
        /* \
         * y wrapped around and x and y are 2**31 bytes  apart \
         */                                                             \
@@ -553,7 +527,7 @@ extern os_log_t mptcp_log_handle;
        } else if ((MPTCP_DATASEQ_LOW32(x) < y) &&                      \
            (((u_int32_t)y -                                            \
            ((u_int32_t)MPTCP_DATASEQ_LOW32(x))) >=                     \
-           (u_int32_t)(1 << 31))) {                                    \
+           (u_int32_t)(1U << 31))) {                                    \
        /* \
         * x wrapped around and x and y are 2**31 apart \
         */                                                             \
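
The macro's shift constant changes from 1 << 31 to 1U << 31: left-shifting a
signed 1 into the sign bit is undefined behavior in C, while the unsigned
shift is well defined. A standalone illustration:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* int bad = 1 << 31;      undefined: shifts into the sign bit */
        uint32_t ok = 1U << 31;    /* well defined: 0x80000000 */

        printf("0x%08x\n", (unsigned int)ok);
        return 0;
    }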
@@ -578,17 +552,20 @@ extern int mptcp_subflow_keeptime; /* Multipath subflow TCP_KEEPALIVE opt */
 extern uint32_t mptcp_dbg_level;        /* Multipath TCP debugging level */
 extern uint32_t mptcp_dbg_area; /* Multipath TCP debugging area */
 extern int mptcp_developer_mode;        /* Allow aggregation mode */
+extern uint32_t mptcp_cellicon_refcount;
 
 extern int tcp_jack_rxmt;       /* Join ACK retransmission value in msecs */
 
 __BEGIN_DECLS
 extern void mptcp_init(struct protosw *, struct domain *);
 extern int mptcp_ctloutput(struct socket *, struct sockopt *);
-extern int mptcp_sescreate(struct mppcb *);
-extern void mptcp_check_subflows_and_add(struct mptses *);
-extern int mptcp_get_statsindex(struct mptcp_itf_stats *stats,
-    const struct mptsub *mpts);
-extern void mptcpstats_inc_switch(struct mptses *, const struct mptsub *);
+extern int mptcp_session_create(struct mppcb *);
+extern boolean_t mptcp_ok_to_create_subflows(struct mptcb *mp_tp);
+extern void mptcp_check_subflows_and_add(struct mptses *mpte);
+extern void mptcp_check_subflows_and_remove(struct mptses *mpte);
+extern void mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts);
+extern void mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts);
+extern int mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, int ifindex, boolean_t create);
 extern struct mptses *mptcp_drop(struct mptses *, struct mptcb *, int);
 extern struct mptses *mptcp_close(struct mptses *, struct mptcb *);
 extern int mptcp_lock(struct socket *, int, void *);
@@ -608,9 +585,9 @@ extern struct mptopt *mptcp_sopt_find(struct mptses *, struct sockopt *);
 
 extern int mptcp_subflow_add(struct mptses *, struct sockaddr *,
     struct sockaddr *, uint32_t, sae_connid_t *);
-extern void mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts);
 extern void mptcp_subflow_del(struct mptses *, struct mptsub *);
 
+extern void mptcp_handle_input(struct socket *so);
 #define MPTCP_SUBOUT_PROBING    0x01
 extern int mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags);
 extern void mptcp_clean_reinjectq(struct mptses *mpte);
@@ -643,8 +620,7 @@ extern void mptcp_output_getm_dsnmap64(struct socket *so, int off,
     uint64_t *dsn, uint32_t *relseq,
     uint16_t *data_len, uint16_t *dss_csum);
 extern void mptcp_act_on_txfail(struct socket *);
-extern struct mptsub *mptcp_get_subflow(struct mptses *, struct mptsub *,
-    struct mptsub **);
+extern struct mptsub *mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred);
 extern int mptcp_get_map_for_dsn(struct socket *, u_int64_t, u_int32_t *);
 extern int32_t mptcp_adj_sendlen(struct socket *so, int32_t off);
 extern void mptcp_sbrcv_grow(struct mptcb *mp_tp);
@@ -658,14 +634,14 @@ extern u_int32_t mptcp_get_notsent_lowat(struct mptses *mpte);
 extern int mptcp_notsent_lowat_check(struct socket *so);
 extern void mptcp_ask_symptoms(struct mptses *mpte);
 extern void mptcp_control_register(void);
-extern int mptcp_is_wifi_unusable(struct mptses *mpte);
-extern boolean_t mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts);
+extern int mptcp_is_wifi_unusable_for_session(struct mptses *mpte);
+extern boolean_t symptoms_is_wifi_lossy(void);
 extern void mptcp_ask_for_nat64(struct ifnet *ifp);
 extern void mptcp_session_necp_cb(void *, int, uint32_t, uint32_t, bool *);
+extern struct sockaddr *mptcp_get_session_dst(struct mptses *mpte,
+    boolean_t has_v6, boolean_t has_v4);
 extern void mptcp_set_restrictions(struct socket *mp_so);
-extern int mptcp_freeq(struct mptcb *);
-extern void mptcp_set_cellicon(struct mptses *mpte);
-extern void mptcp_unset_cellicon(void);
+extern void mptcp_clear_cellicon(void);
 extern void mptcp_reset_rexmit_state(struct tcpcb *tp);
 extern void mptcp_reset_keepalive(struct tcpcb *tp);
 extern int mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
@@ -742,6 +718,13 @@ typedef struct symptoms_advisory {
        };
 } symptoms_advisory_t;
 
+#define MPTCP_TARGET_BASED_RSSI_THRESHOLD -75
+struct mptcp_symptoms_answer {
+       struct symptoms_advisory advisory;
+       uuid_t  uuid;
+       int32_t rssi;
+};
+
 struct mptcp_symptoms_ask_uuid {
        uint32_t        cmd;
 #define MPTCP_SYMPTOMS_ASK_UUID         1
index d88d42b992221e42f2b1b1b2a47d297bf17fd691..66b0102bbb7ecbcc3f1c0e1bcc386c07b273ed12 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -193,7 +193,14 @@ rip_init(struct protosw *pp, struct domain *dp)
        in_pcbinfo_attach(&ripcbinfo);
 }
 
-static struct   sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET, 0, {0}, {0, 0, 0, 0, 0, 0, 0, 0, } };
+static struct   sockaddr_in ripsrc = {
+       .sin_len = sizeof(ripsrc),
+       .sin_family = AF_INET,
+       .sin_port = 0,
+       .sin_addr = { .s_addr = 0 },
+       .sin_zero = {0, 0, 0, 0, 0, 0, 0, 0, }
+};
+
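
ripsrc moves from a positional initializer to designated initializers, which
name each field and so survive struct reordering. A tiny sketch of the same
style:

    #include <stdio.h>

    struct point { int x, y; };

    int
    main(void)
    {
        struct point p = { .y = 2, .x = 1 };    /* order-independent */

        printf("%d %d\n", p.x, p.y);            /* prints: 1 2 */
        return 0;
    }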
 /*
  * Setup generic address and protocol structures
  * for raw_input routine, then pass them along with
@@ -410,6 +417,9 @@ rip_output(
        if (INP_NO_EXPENSIVE(inp)) {
                ipoa.ipoa_flags |=  IPOAF_NO_EXPENSIVE;
        }
+       if (INP_NO_CONSTRAINED(inp)) {
+               ipoa.ipoa_flags |=  IPOAF_NO_CONSTRAINED;
+       }
        if (INP_AWDL_UNRESTRICTED(inp)) {
                ipoa.ipoa_flags |=  IPOAF_AWDL_UNRESTRICTED;
        }
@@ -609,11 +619,11 @@ rip_output(
        }
 
        /*
-        * If output interface was cellular/expensive, and this socket is
+        * If output interface was cellular/expensive/constrained, and this socket is
         * denied access to it, generate an event.
         */
        if (error != 0 && (ipoa.ipoa_retflags & IPOARF_IFDENIED) &&
-           (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp))) {
+           (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp) || INP_NO_CONSTRAINED(inp))) {
                soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED));
        }
 
index 369a709d7a2decc38d7fd0f66aef9cd1b135acce..06e25d6bb5c1586e46cc512b91f4c3d0eae3d3f2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -288,7 +288,8 @@ struct tcp_notify_ack_complete {
 #define MPTCP_SVCTYPE_HANDOVER          0 /* Default 0 */
 #define MPTCP_SVCTYPE_INTERACTIVE       1
 #define MPTCP_SVCTYPE_AGGREGATE         2
-#define MPTCP_SVCTYPE_MAX               3
+#define MPTCP_SVCTYPE_TARGET_BASED      3
+#define MPTCP_SVCTYPE_MAX               4
 /*
  * Specify minimum time in seconds before which an established
  * TCP connection will not be dropped when there is no response from the
@@ -299,6 +300,9 @@ struct tcp_notify_ack_complete {
 #define TCP_RXT_MINIMUM_TIMEOUT_LIMIT   (5 * 60) /* Limit is 5 minutes */
 
 #define MPTCP_ALTERNATE_PORT            0x216
+#define MPTCP_FORCE_ENABLE              0x217
+#define TCP_FASTOPEN_FORCE_ENABLE       0x218
+#define MPTCP_EXPECTED_PROGRESS_TARGET  0x219
 
 /*
  * The TCP_INFO socket option is a private API and is subject to change
@@ -479,6 +483,12 @@ struct mptcp_itf_stats {
        uint32_t        is_expensive:1;
        uint64_t        mpis_txbytes __attribute__((aligned(8)));
        uint64_t        mpis_rxbytes __attribute__((aligned(8)));
+       uint64_t        mpis_wifi_txbytes __attribute__((aligned(8)));
+       uint64_t        mpis_wifi_rxbytes __attribute__((aligned(8)));
+       uint64_t        mpis_wired_txbytes __attribute__((aligned(8)));
+       uint64_t        mpis_wired_rxbytes __attribute__((aligned(8)));
+       uint64_t        mpis_cell_txbytes __attribute__((aligned(8)));
+       uint64_t        mpis_cell_rxbytes __attribute__((aligned(8)));
 };
 
 /* Version solely used to let libnetcore survive */
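
A hedged userspace sketch of arming MPTCP_EXPECTED_PROGRESS_TARGET (0x219
per the hunk above) with a deadline expressed in mach continuous-time units,
which is what the kernel-side handler shown earlier compares against; the
tick/nanosecond conversion factors are queried from mach_timebase_info:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <mach/mach_time.h>
    #include <stdint.h>

    #ifndef MPTCP_EXPECTED_PROGRESS_TARGET
    #define MPTCP_EXPECTED_PROGRESS_TARGET 0x219 /* from the hunk above */
    #endif

    static int
    set_progress_target(int fd, uint64_t secs_from_now)
    {
        mach_timebase_info_data_t tb;
        uint64_t target_ticks;

        mach_timebase_info(&tb);
        /* seconds -> nanoseconds -> mach ticks (ticks = ns * denom / numer) */
        target_ticks = mach_continuous_time() +
            secs_from_now * 1000000000ULL * tb.denom / tb.numer;

        return setsockopt(fd, IPPROTO_TCP, MPTCP_EXPECTED_PROGRESS_TARGET,
            &target_ticks, sizeof(target_ticks));
    }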
index deaa3bcb2f2cc68094d8d25c38202d6ceae2dac9..aad8ce29ceb7615c7834aaeb5c5290d7217d6e9f 100644 (file)
@@ -67,6 +67,7 @@ struct tcp_heuristic {
        uint8_t         th_tfo_data_rst; /* The number of times a SYN+data has received a RST */
        uint8_t         th_tfo_req_rst; /* The number of times a SYN+cookie-req has received a RST */
        uint8_t         th_mptcp_loss; /* The number of times a SYN+MP_CAPABLE has been lost */
+       uint8_t         th_mptcp_success; /* The number of times MPTCP-negotiation has been successful */
        uint8_t         th_ecn_loss; /* The number of times a SYN+ecn has been lost */
        uint8_t         th_ecn_aggressive; /* The number of times we did an aggressive fallback */
        uint8_t         th_ecn_droprst; /* The number of times ECN connections received a RST after first data pkt */
@@ -79,7 +80,8 @@ struct tcp_heuristic {
        uint32_t        th_ecn_backoff; /* Time until when we should not try out ECN */
 
        uint8_t         th_tfo_in_backoff:1, /* Are we avoiding TFO due to the backoff timer? */
-           th_mptcp_in_backoff:1;             /* Are we avoiding MPTCP due to the backoff timer? */
+           th_mptcp_in_backoff:1,             /* Are we avoiding MPTCP due to the backoff timer? */
+           th_mptcp_heuristic_disabled:1;             /* Are heuristics disabled? */
 
        char            th_val_end[0]; /* Marker for memsetting to 0 */
 };
@@ -181,6 +183,7 @@ tcp_min_to_hz(uint32_t minutes)
 #define TFO_MAX_COOKIE_LOSS     2
 #define ECN_MAX_SYN_LOSS        2
 #define MPTCP_MAX_SYN_LOSS      2
+#define MPTCP_SUCCESS_TRIGGER   10
 #define ECN_MAX_DROPRST         1
 #define ECN_MAX_DROPRXMT        4
 #define ECN_MAX_SYNRST          4
@@ -634,38 +637,67 @@ tcp_heuristic_reset_counters(struct tcp_cache_key_src *tcks, u_int8_t flags)
        struct tcp_heuristic *tpheur;
 
        /*
-        * Don't attempt to create it! Keep the heuristics clean if the
-        * server does not support TFO. This reduces the lookup-cost on
-        * our side.
+        * Always create the heuristic entry here, because MPTCP needs to
+        * record success into it even when no entry exists yet.
         */
-       tpheur = tcp_getheuristic_with_lock(tcks, 0, &head);
+       tpheur = tcp_getheuristic_with_lock(tcks, 1, &head);
        if (tpheur == NULL) {
                return;
        }
 
        if (flags & TCPCACHE_F_TFO_DATA) {
+               if (tpheur->th_tfo_data_loss >= TFO_MAX_COOKIE_LOSS) {
+                       os_log(OS_LOG_DEFAULT, "%s: Resetting TFO-data loss to 0 from %u on heur %lx\n",
+                           __func__, tpheur->th_tfo_data_loss, (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
+               }
                tpheur->th_tfo_data_loss = 0;
        }
 
        if (flags & TCPCACHE_F_TFO_REQ) {
+               if (tpheur->th_tfo_req_loss >= TFO_MAX_COOKIE_LOSS) {
+                       os_log(OS_LOG_DEFAULT, "%s: Resetting TFO-req loss to 0 from %u on heur %lx\n",
+                           __func__, tpheur->th_tfo_req_loss, (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
+               }
                tpheur->th_tfo_req_loss = 0;
        }
 
        if (flags & TCPCACHE_F_TFO_DATA_RST) {
+               if (tpheur->th_tfo_data_rst >= TFO_MAX_COOKIE_LOSS) {
+                       os_log(OS_LOG_DEFAULT, "%s: Resetting TFO-data RST to 0 from %u on heur %lx\n",
+                           __func__, tpheur->th_tfo_data_rst, (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
+               }
                tpheur->th_tfo_data_rst = 0;
        }
 
        if (flags & TCPCACHE_F_TFO_REQ_RST) {
+               if (tpheur->th_tfo_req_rst >= TFO_MAX_COOKIE_LOSS) {
+                       os_log(OS_LOG_DEFAULT, "%s: Resetting TFO-req RST to 0 from %u on heur %lx\n",
+                           __func__, tpheur->th_tfo_req_rst, (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
+               }
                tpheur->th_tfo_req_rst = 0;
        }
 
        if (flags & TCPCACHE_F_ECN) {
+               if (tpheur->th_ecn_loss >= ECN_MAX_SYN_LOSS || tpheur->th_ecn_synrst >= ECN_MAX_SYNRST) {
+                       os_log(OS_LOG_DEFAULT, "%s: Resetting ECN-loss to 0 from %u and synrst from %u on heur %lx\n",
+                           __func__, tpheur->th_ecn_loss, tpheur->th_ecn_synrst, (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
+               }
                tpheur->th_ecn_loss = 0;
                tpheur->th_ecn_synrst = 0;
        }
 
        if (flags & TCPCACHE_F_MPTCP) {
                tpheur->th_mptcp_loss = 0;
+               if (tpheur->th_mptcp_success < MPTCP_SUCCESS_TRIGGER) {
+                       tpheur->th_mptcp_success++;
+
+                       if (tpheur->th_mptcp_success == MPTCP_SUCCESS_TRIGGER) {
+                               os_log(mptcp_log_handle, "%s disabling heuristics for 12 hours", __func__);
+                               tpheur->th_mptcp_heuristic_disabled = 1;
+                               /* Disable heuristics for 12 hours */
+                               tpheur->th_mptcp_backoff = tcp_now + tcp_min_to_hz(tcp_ecn_timeout * 12);
+                       }
+               }
        }
 
        tcp_heuristic_unlock(head);
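
All of these heuristics share one backoff scheme: once a loss counter
crosses its threshold, the blackout interval doubles with each further loss,
computed as base << (losses - max), exactly as in the increment paths below.
A standalone model with hypothetical values:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t
    backoff_until(uint32_t now, uint32_t base, int losses, int max_loss)
    {
        if (losses < max_loss)
            return now;                              /* no blackout yet */
        return now + (base << (losses - max_loss));  /* 1x, 2x, 4x, ... */
    }

    int
    main(void)
    {
        for (int l = 2; l <= 5; l++)
            printf("losses=%d -> until=%u\n", l,
                (unsigned)backoff_until(0, 600, l, 2));
        return 0;
    }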
@@ -734,6 +766,9 @@ __tcp_heuristic_tfo_middlebox_common(struct tcp_heuristic *tpheur)
        if (tpheur->th_tfo_backoff > tcp_min_to_hz(tcp_backoff_maximum)) {
                tpheur->th_tfo_backoff = tcp_min_to_hz(tcp_ecn_timeout);
        }
+
+       os_log(OS_LOG_DEFAULT, "%s disable TFO until %u now %u on %lx\n", __func__,
+           tpheur->th_tfo_backoff_until, tcp_now, (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
 }
 
 static void
@@ -797,7 +832,9 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks,
                }
        }
 
-       if ((flags & TCPCACHE_F_ECN) && tpheur->th_ecn_loss < TCP_CACHE_OVERFLOW_PROTECT) {
+       if ((flags & TCPCACHE_F_ECN) &&
+           tpheur->th_ecn_loss < TCP_CACHE_OVERFLOW_PROTECT &&
+           TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) {
                tpheur->th_ecn_loss++;
                if (tpheur->th_ecn_loss >= ECN_MAX_SYN_LOSS) {
                        tcpstat.tcps_ecn_fallback_synloss++;
@@ -805,11 +842,16 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks,
                        tpheur->th_ecn_backoff = tcp_now +
                            (tcp_min_to_hz(tcp_ecn_timeout) <<
                            (tpheur->th_ecn_loss - ECN_MAX_SYN_LOSS));
+
+                       os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for SYN-loss\n",
+                           __func__, tpheur->th_ecn_backoff, tcp_now,
+                           (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
                }
        }
 
        if ((flags & TCPCACHE_F_MPTCP) &&
-           tpheur->th_mptcp_loss < TCP_CACHE_OVERFLOW_PROTECT) {
+           tpheur->th_mptcp_loss < TCP_CACHE_OVERFLOW_PROTECT &&
+           tpheur->th_mptcp_heuristic_disabled == 0) {
                tpheur->th_mptcp_loss++;
                if (tpheur->th_mptcp_loss >= MPTCP_MAX_SYN_LOSS) {
                        /*
@@ -819,11 +861,17 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks,
                        tpheur->th_mptcp_backoff = tcp_now +
                            (tcp_min_to_hz(tcp_ecn_timeout) <<
                            (tpheur->th_mptcp_loss - MPTCP_MAX_SYN_LOSS));
+                       tpheur->th_mptcp_in_backoff = 1;
+
+                       os_log(OS_LOG_DEFAULT, "%s disable MPTCP until %u now %u on %lx\n",
+                           __func__, tpheur->th_mptcp_backoff, tcp_now,
+                           (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
                }
        }
 
        if ((flags & TCPCACHE_F_ECN_DROPRST) &&
-           tpheur->th_ecn_droprst < TCP_CACHE_OVERFLOW_PROTECT) {
+           tpheur->th_ecn_droprst < TCP_CACHE_OVERFLOW_PROTECT &&
+           TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) {
                tpheur->th_ecn_droprst++;
                if (tpheur->th_ecn_droprst >= ECN_MAX_DROPRST) {
                        tcpstat.tcps_ecn_fallback_droprst++;
@@ -832,11 +880,16 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks,
                        tpheur->th_ecn_backoff = tcp_now +
                            (tcp_min_to_hz(tcp_ecn_timeout) <<
                            (tpheur->th_ecn_droprst - ECN_MAX_DROPRST));
+
+                       os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for drop-RST\n",
+                           __func__, tpheur->th_ecn_backoff, tcp_now,
+                           (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
                }
        }
 
        if ((flags & TCPCACHE_F_ECN_DROPRXMT) &&
-           tpheur->th_ecn_droprxmt < TCP_CACHE_OVERFLOW_PROTECT) {
+           tpheur->th_ecn_droprxmt < TCP_CACHE_OVERFLOW_PROTECT &&
+           TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) {
                tpheur->th_ecn_droprxmt++;
                if (tpheur->th_ecn_droprxmt >= ECN_MAX_DROPRXMT) {
                        tcpstat.tcps_ecn_fallback_droprxmt++;
@@ -845,6 +898,10 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks,
                        tpheur->th_ecn_backoff = tcp_now +
                            (tcp_min_to_hz(tcp_ecn_timeout) <<
                            (tpheur->th_ecn_droprxmt - ECN_MAX_DROPRXMT));
+
+                       os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for drop-Rxmit\n",
+                           __func__, tpheur->th_ecn_backoff, tcp_now,
+                           (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
                }
        }
        if ((flags & TCPCACHE_F_ECN_SYNRST) &&
@@ -857,6 +914,10 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks,
                        tpheur->th_ecn_backoff = tcp_now +
                            (tcp_min_to_hz(tcp_ecn_timeout) <<
                            (tpheur->th_ecn_synrst - ECN_MAX_SYNRST));
+
+                       os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for SYN-RST\n",
+                           __func__, tpheur->th_ecn_backoff, tcp_now,
+                           (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
                }
        }
        tcp_heuristic_unlock(head);
@@ -868,6 +929,11 @@ tcp_heuristic_tfo_loss(struct tcpcb *tp)
        struct tcp_cache_key_src tcks;
        uint32_t flag = 0;
 
+       if (symptoms_is_wifi_lossy() &&
+           IFNET_IS_WIFI(tp->t_inpcb->inp_last_outifp)) {
+               return;
+       }
+
        tcp_cache_key_src_create(tp, &tcks);
 
        if (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) {
@@ -903,6 +969,11 @@ tcp_heuristic_mptcp_loss(struct tcpcb *tp)
 {
        struct tcp_cache_key_src tcks;
 
+       if (symptoms_is_wifi_lossy() &&
+           IFNET_IS_WIFI(tp->t_inpcb->inp_last_outifp)) {
+               return;
+       }
+
        tcp_cache_key_src_create(tp, &tcks);
 
        tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_MPTCP);
@@ -913,6 +984,11 @@ tcp_heuristic_ecn_loss(struct tcpcb *tp)
 {
        struct tcp_cache_key_src tcks;
 
+       if (symptoms_is_wifi_lossy() &&
+           IFNET_IS_WIFI(tp->t_inpcb->inp_last_outifp)) {
+               return;
+       }
+
        tcp_cache_key_src_create(tp, &tcks);
 
        tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN);
@@ -970,6 +1046,12 @@ tcp_heuristic_ecn_aggressive_common(struct tcp_cache_key_src *tcks)
                return;
        }
 
+       if (TSTMP_GT(tpheur->th_ecn_backoff, tcp_now)) {
+               /* We are already in aggressive mode */
+               tcp_heuristic_unlock(head);
+               return;
+       }
+
        /* Must be done before, otherwise we will start off with expo-backoff */
        tpheur->th_ecn_backoff = tcp_now +
            (tcp_min_to_hz(tcp_ecn_timeout) << (tpheur->th_ecn_aggressive));
@@ -983,6 +1065,9 @@ tcp_heuristic_ecn_aggressive_common(struct tcp_cache_key_src *tcks)
        }
 
        tcp_heuristic_unlock(head);
+
+       os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx\n", __func__,
+           tpheur->th_ecn_backoff, tcp_now, (unsigned long)VM_KERNEL_ADDRPERM(tpheur));
 }
 
 void
@@ -1041,16 +1126,23 @@ tcp_heuristic_do_tfo(struct tcpcb *tp)
 
        return FALSE;
 }
-
-boolean_t
+/*
+ * @return:
+ *         0   Enable MPTCP (we are still discovering middleboxes)
+ *         -1  Enable MPTCP (heuristics have been temporarily disabled)
+ *         1   Disable MPTCP
+ */
+int
 tcp_heuristic_do_mptcp(struct tcpcb *tp)
 {
        struct tcp_cache_key_src tcks;
        struct tcp_heuristics_head *head = NULL;
        struct tcp_heuristic *tpheur;
+       int ret = 0;
 
-       if (disable_tcp_heuristics) {
-               return TRUE;
+       if (disable_tcp_heuristics ||
+           (tptomptp(tp)->mpt_mpte->mpte_flags & MPTE_FORCE_ENABLE)) {
+               return 0;
        }
 
        tcp_cache_key_src_create(tp, &tcks);
@@ -1058,16 +1150,32 @@ tcp_heuristic_do_mptcp(struct tcpcb *tp)
        /* Get the tcp-heuristic. */
        tpheur = tcp_getheuristic_with_lock(&tcks, 0, &head);
        if (tpheur == NULL) {
-               return TRUE;
+               return 0;
+       }
+
+       if (tpheur->th_mptcp_in_backoff == 0 ||
+           tpheur->th_mptcp_heuristic_disabled == 1) {
+               goto mptcp_ok;
        }
 
        if (TSTMP_GT(tpheur->th_mptcp_backoff, tcp_now)) {
                goto fallback;
        }
 
-       tcp_heuristic_unlock(head);
+       tpheur->th_mptcp_in_backoff = 0;
 
-       return TRUE;
+mptcp_ok:
+       if (tpheur->th_mptcp_heuristic_disabled) {
+               ret = -1;
+
+               if (TSTMP_GT(tcp_now, tpheur->th_mptcp_backoff)) {
+                       tpheur->th_mptcp_heuristic_disabled = 0;
+                       tpheur->th_mptcp_success = 0;
+               }
+       }
+
+       tcp_heuristic_unlock(head);
+       return ret;
 
 fallback:
        if (head) {
@@ -1080,7 +1188,7 @@ fallback:
                tcpstat.tcps_mptcp_heuristic_fallback++;
        }
 
-       return FALSE;
+       return 1;
 }
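
A standalone sketch of the new tri-state contract, matching the @return
comment above (0 and -1 both allow MPTCP, 1 forces the TCP fallback):

    #include <stdio.h>

    /* hypothetical stand-in mirroring tcp_heuristic_do_mptcp()'s contract */
    static const char *
    interpret(int heur)
    {
        if (heur > 0)
            return "disable MPTCP, fall back to plain TCP";
        if (heur < 0)
            return "enable MPTCP, heuristics temporarily disabled";
        return "enable MPTCP, still probing for middleboxes";
    }

    int
    main(void)
    {
        for (int r = -1; r <= 1; r++)
            printf("%2d -> %s\n", r, interpret(r));
        return 0;
    }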
 
 static boolean_t
@@ -1113,6 +1221,9 @@ tcp_heuristic_do_ecn_common(struct tcp_cache_key_src *tcks)
                if (tpheur->th_ecn_synrst >= ECN_RETRY_LIMIT) {
                        tpheur->th_ecn_synrst = 0;
                }
+
+               /* Make sure it follows along */
+               tpheur->th_ecn_backoff = tcp_now;
        }
 
        tcp_heuristic_unlock(head);
index d9344c84ed0887a7cef0ff81565191b339845ef5..9259076ca7cdf08636248e990ceeba5c2bfbeb76 100644 (file)
@@ -51,7 +51,7 @@ extern void tcp_heuristic_tfo_success(struct tcpcb *tp);
 extern void tcp_heuristic_mptcp_success(struct tcpcb *tp);
 extern void tcp_heuristic_ecn_success(struct tcpcb *tp);
 extern boolean_t tcp_heuristic_do_tfo(struct tcpcb *tp);
-extern boolean_t tcp_heuristic_do_mptcp(struct tcpcb *tp);
+extern int tcp_heuristic_do_mptcp(struct tcpcb *tp);
 extern boolean_t tcp_heuristic_do_ecn(struct tcpcb *tp);
 extern void tcp_heuristic_ecn_droprst(struct tcpcb *tp);
 extern void tcp_heuristic_ecn_droprxmt(struct tcpcb *tp);
index 3512bc9a1e0163cf45af1fef8a1e57d64bc958f1..2eb6faf90c3b1edbb1f446dba32589ab1620cb63 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <mach/sdt.h>
 #include <libkern/OSAtomic.h>
 
-struct tcp_cc_debug_state {
-       u_int64_t ccd_tsns;
-       char ccd_srcaddr[INET6_ADDRSTRLEN];
-       uint16_t ccd_srcport;
-       char ccd_destaddr[INET6_ADDRSTRLEN];
-       uint16_t ccd_destport;
-       uint32_t ccd_snd_cwnd;
-       uint32_t ccd_snd_wnd;
-       uint32_t ccd_snd_ssthresh;
-       uint32_t ccd_pipeack;
-       uint32_t ccd_rttcur;
-       uint32_t ccd_rxtcur;
-       uint32_t ccd_srtt;
-       uint32_t ccd_event;
-       uint32_t ccd_sndcc;
-       uint32_t ccd_sndhiwat;
-       uint32_t ccd_bytes_acked;
-       u_int8_t ccd_cc_index;
-       u_int8_t ccd_unused_1__;
-       u_int16_t ccd_unused_2__;
-       union {
-               struct {
-                       uint32_t ccd_last_max;
-                       uint32_t ccd_tcp_win;
-                       uint32_t ccd_target_win;
-                       uint32_t ccd_avg_lastmax;
-                       uint32_t ccd_mean_deviation;
-               } cubic_state;
-               struct {
-                       u_int32_t led_base_rtt;
-               } ledbat_state;
-       } u;
-};
-
 SYSCTL_SKMEM_TCP_INT(OID_AUTO, cc_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
     int, tcp_cc_debug, 0, "Enable debug data collection");
 
@@ -113,8 +79,6 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, cwnd_nonvalidated,
 struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT];
 struct zone *tcp_cc_zone;
 
-/* Information for collecting TCP debug information using control socket */
-#define TCP_CCDEBUG_CONTROL_NAME "com.apple.network.tcp_ccdebug"
 #define TCP_CCDBG_NOUNIT 0xffffffff
 static kern_ctl_ref tcp_ccdbg_ctlref = NULL;
 volatile UInt32 tcp_ccdbg_unit = TCP_CCDBG_NOUNIT;
@@ -151,12 +115,13 @@ tcp_cc_control_register(void)
        errno_t err;
 
        bzero(&ccdbg_control, sizeof(ccdbg_control));
-       strlcpy(ccdbg_control.ctl_name, TCP_CCDEBUG_CONTROL_NAME,
+       strlcpy(ccdbg_control.ctl_name, TCP_CC_CONTROL_NAME,
            sizeof(ccdbg_control.ctl_name));
        ccdbg_control.ctl_connect = tcp_ccdbg_control_connect;
        ccdbg_control.ctl_disconnect = tcp_ccdbg_control_disconnect;
        ccdbg_control.ctl_flags |= CTL_FLAG_PRIVILEGED;
        ccdbg_control.ctl_flags |= CTL_FLAG_REG_SOCK_STREAM;
+       ccdbg_control.ctl_sendsize = 32 * 1024;
 
        err = ctl_register(&ccdbg_control, &tcp_ccdbg_ctlref);
        if (err != 0) {
@@ -340,7 +305,7 @@ tcp_cc_cwnd_init_or_reset(struct tcpcb *tp)
 /*
  * Indicate whether this ack should be delayed.
  * Here is the explanation for different settings of tcp_delack_enabled:
- *  - when set to 1, the bhavior is same as when set to 2. We kept this
+ *  - when set to 1, the behavior is same as when set to 2. We kept this
  *    for binary compatibility.
  *  - when set to 2, will "ack every other packet"
  *      - if our last ack wasn't a 0-sized window.
@@ -372,8 +337,8 @@ tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th)
                if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
                    (th->th_flags & TH_PUSH) == 0 &&
                    ((tp->t_unacksegs == 1) ||
-                   ((tp->t_flags & TF_STRETCHACK) != 0 &&
-                   tp->t_unacksegs < (maxseg_unacked)))) {
+                   ((tp->t_flags & TF_STRETCHACK) &&
+                   tp->t_unacksegs < maxseg_unacked))) {
                        return 1;
                }
                break;
index 8a1f584ad1589f60a4d38ad91dcd2bdc0836ec84..3f484dac04f19e39510490aa046edc8b56a18cac 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2010-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #ifndef _NETINET_CC_H_
 #define _NETINET_CC_H_
 
-#ifdef KERNEL
-#include <netinet/tcp.h>
-#include <netinet/tcp_var.h>
-#include <kern/zalloc.h>
+#ifdef PRIVATE
+
+#include <netinet/in.h>
+
+/*
+ * Data structure to collect and display congestion control debug information
+ */
+struct tcp_cc_debug_state {
+       u_int64_t ccd_tsns;
+       char ccd_srcaddr[INET6_ADDRSTRLEN];
+       uint16_t ccd_srcport;
+       char ccd_destaddr[INET6_ADDRSTRLEN];
+       uint16_t ccd_destport;
+       uint32_t ccd_snd_cwnd;
+       uint32_t ccd_snd_wnd;
+       uint32_t ccd_snd_ssthresh;
+       uint32_t ccd_pipeack;
+       uint32_t ccd_rttcur;
+       uint32_t ccd_rxtcur;
+       uint32_t ccd_srtt;
+       uint32_t ccd_event;
+       uint32_t ccd_sndcc;
+       uint32_t ccd_sndhiwat;
+       uint32_t ccd_bytes_acked;
+       u_int8_t ccd_cc_index;
+       u_int8_t ccd_unused_1__;
+       u_int16_t ccd_unused_2__;
+       union {
+               struct {
+                       uint32_t ccd_last_max;
+                       uint32_t ccd_tcp_win;
+                       uint32_t ccd_target_win;
+                       uint32_t ccd_avg_lastmax;
+                       uint32_t ccd_mean_deviation;
+               } cubic_state;
+               struct {
+                       u_int32_t led_base_rtt;
+               } ledbat_state;
+       } u;
+};
 
+/*
+ * Values of ccd_cc_index
+ */
 #define TCP_CC_ALGO_NONE                0
 #define TCP_CC_ALGO_NEWRENO_INDEX       1
 #define TCP_CC_ALGO_BACKGROUND_INDEX    2 /* CC for background transport */
 #define TCP_CC_ALGO_CUBIC_INDEX         3 /* default CC algorithm */
 #define TCP_CC_ALGO_COUNT               4 /* Count of CC algorithms */
 
-#define TCP_CA_NAME_MAX 16              /* Maximum characters in the name of a CC algorithm */
+/*
+ * Values of ccd_event
+ */
+#define TCP_CC_EVENT_LIST                       \
+       X(TCP_CC_CWND_INIT)                     \
+       X(TCP_CC_INSEQ_ACK_RCVD)                \
+       X(TCP_CC_ACK_RCVD)                      \
+       X(TCP_CC_ENTER_FASTRECOVERY)            \
+       X(TCP_CC_IN_FASTRECOVERY)               \
+       X(TCP_CC_EXIT_FASTRECOVERY)             \
+       X(TCP_CC_PARTIAL_ACK)                   \
+       X(TCP_CC_IDLE_TIMEOUT)                  \
+       X(TCP_CC_REXMT_TIMEOUT)                 \
+       X(TCP_CC_ECN_RCVD)                      \
+       X(TCP_CC_BAD_REXMT_RECOVERY)            \
+       X(TCP_CC_OUTPUT_ERROR)                  \
+       X(TCP_CC_CHANGE_ALGO)                   \
+       X(TCP_CC_FLOW_CONTROL)                  \
+       X(TCP_CC_SUSPEND)                       \
+       X(TCP_CC_LIMITED_TRANSMIT)              \
+       X(TCP_CC_EARLY_RETRANSMIT)              \
+       X(TCP_CC_TLP_RECOVERY)                  \
+       X(TCP_CC_TLP_RECOVER_LASTPACKET)        \
+       X(TCP_CC_DELAY_FASTRECOVERY)            \
+       X(TCP_CC_TLP_IN_FASTRECOVERY)           \
+       X(TCP_CC_DSACK_BAD_REXMT)               \
+       X(TCP_CC_FIRST_REXMT)                   \
+       X(MAX_TCP_CC_EVENTS)
+
+enum tcp_cc_event {
+#define X(name, ...) name,
+       TCP_CC_EVENT_LIST
+#undef X
+};
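
Note: because TCP_CC_EVENT_LIST is an X-macro, a second expansion of the same
list yields a matching table of event names, which is convenient when decoding
the ccd_event field of a tcp_cc_debug_state record. A sketch (the array and
helper below are illustrative, not part of this header):

/* Illustrative decoder built on TCP_CC_EVENT_LIST */
static const char *tcp_cc_event_names[] = {
#define X(name, ...) #name,
	TCP_CC_EVENT_LIST
#undef X
};

static const char *
tcp_cc_event_to_str(uint32_t event)
{
	if (event < MAX_TCP_CC_EVENTS) {
		return tcp_cc_event_names[event];
	}
	return "UNKNOWN";
}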
+
+/*
+ * Kernel control ID
+ */
+#define TCP_CC_CONTROL_NAME     "com.apple.network.tcp_ccdebug"
+
+#endif /* PRIVATE */
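
Note: with the control name exported as TCP_CC_CONTROL_NAME, a privileged
user-space client can attach to the debug stream through the standard
PF_SYSTEM kernel-control API. A rough sketch (assumes root, access to this
private header for struct tcp_cc_debug_state, and net.inet.tcp.cc_debug
enabled; record parsing is omitted):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sys_domain.h>
#include <sys/kern_control.h>
#include <netinet/cc.h>         /* private: struct tcp_cc_debug_state */

int
main(void)
{
	struct ctl_info info;
	struct sockaddr_ctl addr;
	struct tcp_cc_debug_state state;
	int fd = socket(PF_SYSTEM, SOCK_STREAM, SYSPROTO_CONTROL);

	if (fd == -1) {
		return 1;
	}
	memset(&info, 0, sizeof(info));
	strlcpy(info.ctl_name, TCP_CC_CONTROL_NAME, sizeof(info.ctl_name));
	if (ioctl(fd, CTLIOCGINFO, &info) == -1) {      /* name -> ctl_id */
		perror("CTLIOCGINFO");
		return 1;
	}
	memset(&addr, 0, sizeof(addr));
	addr.sc_len = sizeof(addr);
	addr.sc_family = AF_SYSTEM;
	addr.ss_sysaddr = AF_SYS_CONTROL;
	addr.sc_id = info.ctl_id;
	addr.sc_unit = 0;               /* single debug client at a time */
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
		perror("connect");      /* CTL_FLAG_PRIVILEGED: needs root */
		return 1;
	}
	while (recv(fd, &state, sizeof(state), 0) == sizeof(state)) {
		printf("event %u cwnd %u\n", state.ccd_event, state.ccd_snd_cwnd);
	}
	close(fd);
	return 0;
}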
+
+#ifdef KERNEL_PRIVATE
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <kern/zalloc.h>
+
+/*
+ * Maximum characters in the name of a CC algorithm
+ */
+#define TCP_CA_NAME_MAX 16
 
 extern int tcp_recv_bg;
 
@@ -153,5 +243,5 @@ extern void tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb *tp);
 extern u_int32_t tcp_get_max_pipeack(struct tcpcb *tp);
 extern void tcp_clear_pipeack_state(struct tcpcb *tp);
 
-#endif /* KERNEL */
+#endif /* KERNEL_PRIVATE */
 #endif /* _NETINET_CC_H_ */
index 2d1ad246bb8edae4fb28d515c2e50cb9c9123f81..a347a0dcbaba7a8923bbff3df49307a7b368f89e 100644 (file)
@@ -85,9 +85,9 @@ struct tcp_cc_algo tcp_cc_cubic = {
        .switch_to = tcp_cubic_switch_cc
 };
 
-const float tcp_cubic_backoff = 0.2; /* multiplicative decrease factor */
-const float tcp_cubic_coeff = 0.4;
-const float tcp_cubic_fast_convergence_factor = 0.875;
+const float tcp_cubic_backoff = 0.2f; /* multiplicative decrease factor */
+const float tcp_cubic_coeff = 0.4f;
+const float tcp_cubic_fast_convergence_factor = 0.875f;
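
Note: these constants plug into the standard CUBIC growth function of
RFC 8312: after a loss the window is cut by tcp_cubic_backoff (20%), then
grows as W(t) = C*(t - K)^3 + W_max with C = tcp_cubic_coeff, and W_max is
additionally scaled by tcp_cubic_fast_convergence_factor when successive loss
events keep shrinking it. A floating-point sketch of the relationship
(illustration only; the in-kernel computation in tcp_cubic.c differs in
detail):

#include <math.h>

/* CUBIC window as a function of time since the last loss (RFC 8312) */
static double
cubic_window(double w_max, double t_sec)
{
	const double backoff = 0.2;     /* tcp_cubic_backoff */
	const double c = 0.4;           /* tcp_cubic_coeff */
	/* K: seconds needed to grow back to w_max after the 20% cut */
	double k = cbrt(w_max * backoff / c);

	return c * pow(t_sec - k, 3.0) + w_max;
}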
 
 SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_tcp_friendliness, CTLFLAG_RW | CTLFLAG_LOCKED,
     static int, tcp_cubic_tcp_friendliness, 0, "Enable TCP friendliness");
index 8990100b8c237e55da3f5c6719244bd176933671..d7d04516aa2ea1c0d8d5e6dbd0456a103ed4cef0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -91,7 +91,9 @@
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/ntstat.h>
+#include <net/content_filter.h>
 #include <net/dlil.h>
+#include <net/multi_layer_pkt_log.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
 struct tcphdr tcp_savetcp;
 #endif /* TCPDEBUG */
+#include <netinet/tcp_log.h>
 
 #if IPSEC
 #include <netinet6/ipsec.h>
@@ -291,7 +294,6 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_rfc5961,
     CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_do_rfc5961, 1,
     "Enable/Disable full RFC 5961 compliance");
 
-extern int tcp_TCPTV_MIN;
 extern int tcp_acc_iaj_high;
 extern int tcp_acc_iaj_react_limit;
 
@@ -310,8 +312,6 @@ static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
 static void tcp_finalize_options(struct tcpcb *, struct tcpopt *, unsigned int);
 static void tcp_pulloutofband(struct socket *,
     struct tcphdr *, struct mbuf *, int);
-static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *,
-    struct ifnet *);
 static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
 static inline unsigned int tcp_maxmtu(struct rtentry *);
 static inline int tcp_stretch_ack_enable(struct tcpcb *tp, int thflags);
@@ -637,7 +637,7 @@ tcp_bwmeas_check(struct tcpcb *tp)
 
 static int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
-    struct ifnet *ifp)
+    struct ifnet *ifp, int *dowakeup)
 {
        struct tseg_qent *q;
        struct tseg_qent *p = NULL;
@@ -646,7 +646,6 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
        struct inpcb *inp = tp->t_inpcb;
        struct socket *so = inp->inp_socket;
        int flags = 0;
-       int dowakeup = 0;
        struct mbuf *oodata = NULL;
        int copy_oodata = 0;
        u_int16_t qlimit;
@@ -896,6 +895,13 @@ present:
                if (so->so_state & SS_CANTRCVMORE) {
                        m_freem(q->tqe_m);
                } else {
+                       /*
+                        * The mbuf may be freed after it has been added to the
+                        * receive socket buffer, so we reinitialize th to point
+                        * to a safe copy of the TCP header.
+                        */
+                       struct tcphdr saved_tcphdr = {};
+
                        so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
                        if (so->so_flags & SOF_ENABLE_MSGS) {
                                /*
@@ -911,10 +917,13 @@ present:
                                        copy_oodata = 0;
                                }
                        }
+                       memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
                        if (sbappendstream_rcvdemux(so, q->tqe_m,
                            q->tqe_th->th_seq - (tp->irs + 1), 0)) {
-                               dowakeup = 1;
+                               *dowakeup = 1;
                        }
+                       th = &saved_tcphdr;
+
                        if (tp->t_flagsext & TF_LRO_OFFLOADED) {
                                tcp_update_lro_seq(tp->rcv_nxt,
                                    inp->inp_laddr, inp->inp_faddr,
@@ -955,7 +964,7 @@ msg_unordered_delivery:
                if (oodata != NULL) {
                        if (sbappendmsgstream_rcv(&so->so_rcv, oodata,
                            te->tqe_th->th_seq - (tp->irs + 1), 1)) {
-                               dowakeup = 1;
+                               *dowakeup = 1;
                                tcpstat.tcps_msg_unopkts++;
                        } else {
                                tcpstat.tcps_msg_unoappendfail++;
@@ -963,9 +972,6 @@ msg_unordered_delivery:
                }
        }
 
-       if (dowakeup) {
-               sorwakeup(so); /* done with socket lock held */
-       }
        return flags;
 }
 
@@ -1186,7 +1192,7 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
                 */
                if (TSTMP_GEQ(tcp_now,
                    tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
-                       if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) {
+                       if (tp->rfbuf_cnt + pktlen >= TCP_RCVNOTS_BYTELEVEL) {
                                tcp_sbrcv_reserve(tp, sbrcv,
                                    tcp_autorcvbuf_max, 0,
                                    tcp_autorcvbuf_max);
@@ -1207,8 +1213,9 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
                 * on the link.
                 */
                if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
-                       if (tp->rfbuf_cnt > (sbrcv->sb_hiwat -
+                       if (tp->rfbuf_cnt + pktlen > (sbrcv->sb_hiwat -
                            (sbrcv->sb_hiwat >> 1))) {
+                               tp->rfbuf_cnt += pktlen;
                                int32_t rcvbuf_inc, min_incr;
                                /*
                                 * Increment the receive window by a
@@ -1238,7 +1245,7 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
                                    (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
                                tcp_sbrcv_reserve(tp, sbrcv,
                                    sbrcv->sb_hiwat + rcvbuf_inc,
-                                   (tp->rfbuf_cnt * 2), rcvbuf_max);
+                                   (tp->rfbuf_cnt << 1), rcvbuf_max);
                        }
                        /* Measure instantaneous receive bandwidth */
                        if (tp->t_bwmeas != NULL && tp->rfbuf_cnt > 0 &&
@@ -1429,6 +1436,7 @@ tcp_reset_stretch_ack(struct tcpcb *tp)
 {
        tp->t_flags &= ~(TF_STRETCHACK | TF_STREAMING_ON);
        tp->rcv_by_unackwin = 0;
+       tp->rcv_by_unackhalfwin = 0;
        tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
 
        /*
@@ -1802,7 +1810,8 @@ tcp_tfo_synack(struct tcpcb *tp, struct tcpopt *to)
                 * rexmit the SYN. If that's the case, it's better to start
                 * backing of TFO-cookie requests.
                 */
-               if (tp->t_tfo_flags & TFO_F_SYN_LOSS) {
+               if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
+                   tp->t_tfo_flags & TFO_F_SYN_LOSS) {
                        tp->t_tfo_stats |= TFO_S_SYN_LOSS;
                        tcpstat.tcps_tfo_syn_loss++;
 
@@ -1892,6 +1901,17 @@ tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th,
        return false;
 }
 
+static void
+tcp_handle_wakeup(struct socket *so, int read_wakeup, int write_wakeup)
+{
+       if (read_wakeup != 0) {
+               sorwakeup(so);
+       }
+       if (write_wakeup != 0) {
+               sowwakeup(so);
+       }
+}
+
 void
 tcp_input(struct mbuf *m, int off0)
 {
@@ -1906,6 +1926,8 @@ tcp_input(struct mbuf *m, int off0)
        int thflags;
        struct socket *so = 0;
        int todrop, acked, ourfinisacked, needoutput = 0;
+       int read_wakeup = 0;
+       int write_wakeup = 0;
        struct in_addr laddr;
 #if INET6
        struct in6_addr laddr6;
@@ -1936,7 +1958,22 @@ tcp_input(struct mbuf *m, int off0)
        boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
        boolean_t recvd_dsack = FALSE;
        struct tcp_respond_args tra;
+       int prev_t_state;
+       boolean_t check_cfil = cfil_filter_present();
        bool findpcb_iterated = false;
+       /*
+        * The mbuf may be freed after it has been added to the receive socket
+        * buffer or the reassembly queue, so we reinitialize th to point to a
+        * safe copy of the TCP header
+        */
+       struct tcphdr saved_tcphdr = {};
+       /*
+        * Save copy of the IPv4/IPv6 header.
+        * Note: use array of uint32_t to silence compiler warning when casting
+        * to a struct ip6_hdr pointer.
+        */
+#define MAX_IPWORDS ((sizeof(struct ip) + MAX_IPOPTLEN) / sizeof(uint32_t))
+       uint32_t saved_hdr[MAX_IPWORDS];
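
Note: saved_hdr must be able to hold either header type; sizeof(struct ip) +
MAX_IPOPTLEN is 60 bytes, comfortably above the 40-byte fixed IPv6 header. An
illustrative compile-time check (not in the source) would be:

_Static_assert(MAX_IPWORDS * sizeof(uint32_t) >= sizeof(struct ip6_hdr),
    "saved_hdr cannot hold a full IPv6 header");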
 
 #define TCP_INC_VAR(stat, npkts) do {                   \
                stat += npkts;                          \
@@ -1988,6 +2025,7 @@ tcp_input(struct mbuf *m, int off0)
                th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
 
                if (tcp_input_checksum(AF_INET6, m, th, off0, tlen)) {
+                       TCP_LOG_DROP_PKT(ip6, th, ifp, "IPv6 bad tcp checksum");
                        goto dropnosock;
                }
 
@@ -2005,6 +2043,7 @@ tcp_input(struct mbuf *m, int off0)
                if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
                        /* XXX stat */
                        IF_TCP_STATINC(ifp, unspecv6);
+                       TCP_LOG_DROP_PKT(ip6, th, ifp, "src IPv6 address unspecified");
                        goto dropnosock;
                }
                DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
@@ -2038,6 +2077,7 @@ tcp_input(struct mbuf *m, int off0)
                tlen = ip->ip_len;
 
                if (tcp_input_checksum(AF_INET, m, th, off0, tlen)) {
+                       TCP_LOG_DROP_PKT(ip, th, ifp, "IPv4 bad tcp checksum");
                        goto dropnosock;
                }
 
@@ -2055,6 +2095,8 @@ tcp_input(struct mbuf *m, int off0)
                    th->th_seq, th->th_ack, th->th_win);
        }
 
+#define TCP_LOG_HDR (isipv6 ? (void *)ip6 : (void *)ip)
+
        /*
         * Check that TCP offset makes sense,
         * pull out TCP options and adjust length.
@@ -2063,6 +2105,7 @@ tcp_input(struct mbuf *m, int off0)
        if (off < sizeof(struct tcphdr) || off > tlen) {
                tcpstat.tcps_rcvbadoff++;
                IF_TCP_STATINC(ifp, badformat);
+               TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "bad tcp offset");
                goto dropnosock;
        }
        tlen -= off;    /* tlen is used instead of ti->ti_len */
@@ -2116,6 +2159,7 @@ tcp_input(struct mbuf *m, int off0)
         */
        if (drop_synfin && (thflags & (TH_SYN | TH_FIN)) == (TH_SYN | TH_FIN)) {
                IF_TCP_STATINC(ifp, synfin);
+               TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "drop SYN FIN");
                goto dropnosock;
        }
 #endif
@@ -2286,18 +2330,22 @@ findpcb:
                                switch (blackhole) {
                                case 1:
                                        if (thflags & TH_SYN) {
+                                               TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 1 syn for closed port");
                                                goto dropnosock;
                                        }
                                        break;
                                case 2:
+                                       TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 2 closed port");
                                        goto dropnosock;
                                default:
+                                       TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole closed port");
                                        goto dropnosock;
                                }
                        }
                }
                rstreason = BANDLIM_RST_CLOSEDPORT;
                IF_TCP_STATINC(ifp, noconnnolist);
+               TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "closed port");
                goto dropwithresetnosock;
        }
        so = inp->inp_socket;
@@ -2311,6 +2359,7 @@ findpcb:
 #if TEMPDEBUG
                printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
 #endif
+               TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp_socket NULL");
                goto dropnosock;
        }
 
@@ -2318,6 +2367,7 @@ findpcb:
        if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
                socket_unlock(so, 1);
                inp = NULL;     // pretend we didn't find it
+               TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp state WNT_STOPUSING");
                goto dropnosock;
        }
 
@@ -2357,20 +2407,49 @@ findpcb:
                }
        }
 
+       tp = intotcpcb(inp);
+       if (tp == NULL) {
+               rstreason = BANDLIM_RST_CLOSEDPORT;
+               IF_TCP_STATINC(ifp, noconnlist);
+               TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "tp is NULL");
+               goto dropwithreset;
+       }
+
+       TCP_LOG_TH_FLAGS(TCP_LOG_HDR, th, tp, false, ifp);
+
+       if (tp->t_state == TCPS_CLOSED) {
+               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "tp state TCPS_CLOSED");
+               goto drop;
+       }
+
 #if NECP
        if (so->so_state & SS_ISCONNECTED) {
                // Connected TCP sockets have a fully-bound local and remote,
                // so the policy check doesn't need to override addresses
-               if (!necp_socket_is_allowed_to_send_recv(inp, NULL, NULL, NULL)) {
+               if (!necp_socket_is_allowed_to_send_recv(inp, ifp, NULL, NULL, NULL)) {
+                       TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
                        IF_TCP_STATINC(ifp, badformat);
                        goto drop;
                }
        } else {
+               /*
+                * If the proc_uuid_policy table has been updated since the last use
+                * of the listening socket (i.e., the proc_uuid_policy_table_gencount
+                * has been updated), the flags in the socket may be out of date.
+                * If INP2_WANT_APP_POLICY is stale, inbound packets may
+                * be dropped by NECP if the socket should now match a per-app
+                * exception policy.
+                * In order to avoid this, refresh the proc_uuid_policy state to
+                * potentially recalculate the socket's flags before checking
+                * with NECP.
+                */
+               (void) inp_update_policy(inp);
 #if INET6
                if (isipv6) {
                        if (!necp_socket_is_allowed_to_send_recv_v6(inp,
                            th->th_dport, th->th_sport, &ip6->ip6_dst,
                            &ip6->ip6_src, ifp, NULL, NULL, NULL)) {
+                               TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
                                IF_TCP_STATINC(ifp, badformat);
                                goto drop;
                        }
@@ -2380,6 +2459,7 @@ findpcb:
                        if (!necp_socket_is_allowed_to_send_recv_v4(inp,
                            th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src,
                            ifp, NULL, NULL, NULL)) {
+                               TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
                                IF_TCP_STATINC(ifp, badformat);
                                goto drop;
                        }
@@ -2387,18 +2467,11 @@ findpcb:
        }
 #endif /* NECP */
 
-       tp = intotcpcb(inp);
-       if (tp == 0) {
-               rstreason = BANDLIM_RST_CLOSEDPORT;
-               IF_TCP_STATINC(ifp, noconnlist);
-               goto dropwithreset;
-       }
-       if (tp->t_state == TCPS_CLOSED) {
-               goto drop;
-       }
+       prev_t_state = tp->t_state;
 
        /* If none of the FIN|SYN|RST|ACK flag is set, drop */
        if (tcp_do_rfc5961 && (thflags & TH_ACCEPT) == 0) {
+               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 TH_ACCEPT == 0");
                goto drop;
        }
 
@@ -2409,8 +2482,10 @@ findpcb:
                tiwin = th->th_win;
        }
 
+
 #if CONFIG_MACF_NET
        if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM)) {
+               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "mac_inpcb_check_deliver failed");
                goto drop;
        }
 #endif
@@ -2418,6 +2493,7 @@ findpcb:
        /* Avoid processing packets while closing a listen socket */
        if (tp->t_state == TCPS_LISTEN &&
            (so->so_options & SO_ACCEPTCONN) == 0) {
+               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "closing a listening socket");
                goto drop;
        }
 
@@ -2440,13 +2516,15 @@ findpcb:
                        struct socket *so2;
                        struct socket *oso;
                        struct sockaddr_storage from;
+                       struct sockaddr_storage to2;
 #if INET6
                        struct inpcb *oinp = sotoinpcb(so);
 #endif /* INET6 */
                        struct ifnet *head_ifscope;
                        unsigned int head_nocell, head_recvanyif,
                            head_noexpensive, head_awdl_unrestricted,
-                           head_intcoproc_allowed;
+                           head_intcoproc_allowed, head_external_port,
+                           head_noconstrained;
 
                        /* Get listener's bound-to-interface, if any */
                        head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
@@ -2457,8 +2535,10 @@ findpcb:
                        head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
                        /* Get listener's no-expensive information, if any */
                        head_noexpensive = INP_NO_EXPENSIVE(inp);
+                       head_noconstrained = INP_NO_CONSTRAINED(inp);
                        head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
                        head_intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp);
+                       head_external_port = (inp->inp_flags2 & INP2_EXTERNAL_PORT);
 
                        /*
                         * If the state is LISTEN then ignore segment if it contains an RST.
@@ -2470,9 +2550,11 @@ findpcb:
                                IF_TCP_STATINC(ifp, listbadsyn);
 
                                if (thflags & TH_RST) {
+                                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN with RST");
                                        goto drop;
                                }
                                if (thflags & TH_ACK) {
+                                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN with ACK");
                                        tp = NULL;
                                        tcpstat.tcps_badsyn++;
                                        rstreason = BANDLIM_RST_OPENPORT;
@@ -2481,6 +2563,7 @@ findpcb:
 
                                /* We come here if there is no SYN set */
                                tcpstat.tcps_badsyn++;
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad SYN");
                                goto drop;
                        }
                        KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START, 0, 0, 0, 0, 0);
@@ -2489,11 +2572,13 @@ findpcb:
                                if (isipv6) {
                                        if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
                                            &ip6->ip6_src)) {
+                                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same port");
                                                goto drop;
                                        }
                                } else
 #endif /* INET6 */
                                if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
+                                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same IPv4 address");
                                        goto drop;
                                }
                        }
@@ -2506,12 +2591,14 @@ findpcb:
                         * be discarded.
                         */
                        if (m->m_flags & (M_BCAST | M_MCAST)) {
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "mbuf M_BCAST | M_MCAST");
                                goto drop;
                        }
 #if INET6
                        if (isipv6) {
                                if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
                                    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
+                                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "IN6_IS_ADDR_MULTICAST");
                                        goto drop;
                                }
                        } else
@@ -2520,6 +2607,7 @@ findpcb:
                            IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
                            ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
                            in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "multicast or broadcast address");
                                goto drop;
                        }
 
@@ -2549,12 +2637,13 @@ findpcb:
                                                tp = NULL;
                                                rstreason = BANDLIM_RST_OPENPORT;
                                                IF_TCP_STATINC(ifp, deprecate6);
+                                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "deprecated IPv6 address");
                                                goto dropwithreset;
                                        }
                                }
                        }
 #endif
-                       if (so->so_filt) {
+                       if (so->so_filt || check_cfil) {
 #if INET6
                                if (isipv6) {
                                        struct sockaddr_in6     *sin6 = (struct sockaddr_in6*)&from;
@@ -2565,6 +2654,15 @@ findpcb:
                                        sin6->sin6_flowinfo = 0;
                                        sin6->sin6_addr = ip6->ip6_src;
                                        sin6->sin6_scope_id = 0;
+
+                                       sin6 = (struct sockaddr_in6*)&to2;
+
+                                       sin6->sin6_len = sizeof(struct sockaddr_in6);
+                                       sin6->sin6_family = AF_INET6;
+                                       sin6->sin6_port = th->th_dport;
+                                       sin6->sin6_flowinfo = 0;
+                                       sin6->sin6_addr = ip6->ip6_dst;
+                                       sin6->sin6_scope_id = 0;
                                } else
 #endif
                                {
@@ -2574,7 +2672,17 @@ findpcb:
                                        sin->sin_family = AF_INET;
                                        sin->sin_port = th->th_sport;
                                        sin->sin_addr = ip->ip_src;
+
+                                       sin = (struct sockaddr_in*)&to2;
+
+                                       sin->sin_len = sizeof(struct sockaddr_in);
+                                       sin->sin_family = AF_INET;
+                                       sin->sin_port = th->th_dport;
+                                       sin->sin_addr = ip->ip_dst;
                                }
+                       }
+
+                       if (so->so_filt) {
                                so2 = sonewconn(so, 0, (struct sockaddr*)&from);
                        } else {
                                so2 = sonewconn(so, 0, NULL);
@@ -2589,6 +2697,7 @@ findpcb:
                                        }
                                }
                                if (!so2) {
+                                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " listen drop");
                                        goto drop;
                                }
                        }
@@ -2635,6 +2744,9 @@ findpcb:
                        if (head_noexpensive) {
                                inp_set_noexpensive(inp);
                        }
+                       if (head_noconstrained) {
+                               inp_set_noconstrained(inp);
+                       }
                        if (head_awdl_unrestricted) {
                                inp_set_awdl_unrestricted(inp);
                        }
@@ -2649,6 +2761,10 @@ findpcb:
                        } else {
                                inp->inp_flags &= ~INP_RECV_ANYIF;
                        }
+
+                       if (head_external_port) {
+                               inp->inp_flags2 |= INP2_EXTERNAL_PORT;
+                       }
 #if INET6
                        if (isipv6) {
                                inp->in6p_laddr = ip6->ip6_dst;
@@ -2675,6 +2791,7 @@ findpcb:
                                inp->inp_lport = 0;
                                socket_lock(oso, 0);    /* release ref on parent */
                                socket_unlock(oso, 1);
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " in_pcbinshash failed");
                                goto drop;
                        }
 #if INET6
@@ -2742,11 +2859,29 @@ findpcb:
 
                        tcp_set_max_rwinscale(tp, so, ifp);
 
+#if CONTENT_FILTER
+                       if (check_cfil) {
+                               int error = cfil_sock_attach(so2, (struct sockaddr*)&to2, (struct sockaddr*)&from,
+                                   CFS_CONNECTION_DIR_IN);
+                               if (error != 0) {
+                                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " cfil_sock_attach failed");
+                                       goto drop;
+                               }
+                       }
+#endif /* CONTENT_FILTER */
+
                        KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END, 0, 0, 0, 0, 0);
                }
        }
        socket_lock_assert_owned(so);
 
+       if (net_mpklog_enabled && (m->m_pkthdr.rcvif->if_xflags & IFXF_MPK_LOG)) {
+               MPKL_TCP_INPUT(tcp_mpkl_log_object,
+                   ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
+                   th->th_seq, th->th_ack, tlen, thflags,
+                   so->last_pid, so->so_log_seqn++);
+       }
+
        if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
                /*
                 * Evaluate the rate of arrival of packets to see if the
@@ -2764,11 +2899,13 @@ findpcb:
                } else {
                        tp->t_flags &= ~(TF_STRETCHACK);
                }
-               if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) {
+               if (TSTMP_GT(tp->rcv_unackwin - (tcp_rcvunackwin >> 1), tcp_now)) {
+                       tp->rcv_by_unackhalfwin += (tlen + off);
                        tp->rcv_by_unackwin += (tlen + off);
                } else {
                        tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
-                       tp->rcv_by_unackwin = tlen + off;
+                       tp->rcv_by_unackwin = tp->rcv_by_unackhalfwin + tlen + off;
+                       tp->rcv_by_unackhalfwin = tlen + off;
                }
        }
 
@@ -2780,13 +2917,13 @@ findpcb:
        }
        /*
         * Explicit Congestion Notification - Flag that we need to send ECT if
-        *      + The IP Congestion experienced flag was set.
-        *      + Socket is in established state
-        *      + We negotiated ECN in the TCP setup
-        *      + This isn't a pure ack (tlen > 0)
-        *      + The data is in the valid window
+        *      + The IP Congestion experienced flag was set.
+        *      + Socket is in established state
+        *      + We negotiated ECN in the TCP setup
+        *      + This isn't a pure ack (tlen > 0)
+        *      + The data is in the valid window
         *
-        *      TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
+        *      TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
         */
        if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
            TCP_ECN_ENABLED(tp) && tlen > 0 &&
@@ -3100,7 +3237,7 @@ findpcb:
                                        tcp_bwmeas_check(tp);
                                }
 
-                               sowwakeup(so); /* has to be done with socket lock held */
+                               write_wakeup = 1;
                                if (!SLIST_EMPTY(&tp->t_notify_ack)) {
                                        tcp_notify_acknowledgement(tp, so);
                                }
@@ -3112,6 +3249,9 @@ findpcb:
                                tcp_tfo_rcv_ack(tp, th);
 
                                tcp_check_timer_state(tp);
+
+                               tcp_handle_wakeup(so, read_wakeup, write_wakeup);
+
                                socket_unlock(so, 1);
                                KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
                                return;
@@ -3203,10 +3343,21 @@ findpcb:
                         * this socket, deliver the packet received as an
                         * in-order message with sequence number attached to it.
                         */
+                       if (isipv6) {
+                               memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
+                               ip6 = (struct ip6_hdr *)&saved_hdr[0];
+                       } else {
+                               memcpy(&saved_hdr, ip, ip->ip_hl << 2);
+                               ip = (struct ip *)&saved_hdr[0];
+                       }
+                       memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
                        if (sbappendstream_rcvdemux(so, m,
                            th->th_seq - (tp->irs + 1), 0)) {
-                               sorwakeup(so);
+                               mptcp_handle_input(so);
+                               read_wakeup = 1;
                        }
+                       th = &saved_tcphdr;
+
 #if INET6
                        if (isipv6) {
                                KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
@@ -3237,6 +3388,9 @@ findpcb:
                        }
 
                        tcp_check_timer_state(tp);
+
+                       tcp_handle_wakeup(so, read_wakeup, write_wakeup);
+
                        socket_unlock(so, 1);
                        KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
                        return;
@@ -3266,9 +3420,10 @@ findpcb:
         */
        if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
            (mp_tp = tptomptp(tp))) {
-               mpte_lock_assert_held(mp_tp->mpt_mpte);
-               if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) {
-                       tp->rcv_wnd = imax(mp_tp->mpt_rcvwnd, (int)(tp->rcv_adv - tp->rcv_nxt));
+               socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
+
+               if (tp->rcv_wnd > (int)(mp_tp->mpt_rcvadv - (uint32_t)mp_tp->mpt_rcvnxt)) {
+                       tp->rcv_wnd = mp_tp->mpt_rcvadv - (uint32_t)mp_tp->mpt_rcvnxt;
                        tcpstat.tcps_mp_reducedwin++;
                }
        }
@@ -3291,11 +3446,17 @@ findpcb:
 #endif
 
                socket_lock_assert_owned(so);
+
+               /* Clear the logging flags inherited from the listening socket */
+               tp->t_log_flags = 0;
+               tp->t_flagsext &= ~TF_LOGGED_CONN_SUMMARY;
+
 #if INET6
                if (isipv6) {
                        MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
                            M_SONAME, M_NOWAIT);
                        if (sin6 == NULL) {
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN malloc M_SONAME failed");
                                goto drop;
                        }
                        bzero(sin6, sizeof(*sin6));
@@ -3311,6 +3472,7 @@ findpcb:
                            proc0)) {
                                inp->in6p_laddr = laddr6;
                                FREE(sin6, M_SONAME);
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in6_pcbconnect failed");
                                goto drop;
                        }
                        FREE(sin6, M_SONAME);
@@ -3321,6 +3483,7 @@ findpcb:
                        MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
                            M_NOWAIT);
                        if (sin == NULL) {
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN malloc M_SONAME failed");
                                goto drop;
                        }
                        sin->sin_family = AF_INET;
@@ -3336,6 +3499,7 @@ findpcb:
                            IFSCOPE_NONE, NULL)) {
                                inp->inp_laddr = laddr;
                                FREE(sin, M_SONAME);
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in_pcbconnect failed");
                                goto drop;
                        }
                        FREE(sin, M_SONAME);
@@ -3371,6 +3535,7 @@ findpcb:
                tp->t_state = TCPS_SYN_RECEIVED;
                tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
                    TCP_CONN_KEEPINIT(tp));
+               tp->t_connect_time = tcp_now;
                dropsocket = 0;         /* committed to socket */
 
                if (inp->inp_flowhash == 0) {
@@ -3394,6 +3559,11 @@ findpcb:
                        tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
                }
 
+               /*
+                * The address and connection state are finalized
+                */
+               TCP_LOG_CONNECT(tp, false, 0);
+
                goto trimthenstep6;
        }
 
@@ -3407,6 +3577,7 @@ findpcb:
                    SEQ_GT(th->th_ack, tp->snd_max))) {
                        rstreason = BANDLIM_RST_OPENPORT;
                        IF_TCP_STATINC(ifp, ooopacket);
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad ACK");
                        goto dropwithreset;
                }
 
@@ -3441,11 +3612,13 @@ findpcb:
                    SEQ_GT(th->th_ack, tp->snd_max))) {
                        rstreason = BANDLIM_UNLIMITED;
                        IF_TCP_STATINC(ifp, ooopacket);
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT bad ACK");
                        goto dropwithreset;
                }
                if (thflags & TH_RST) {
                        if ((thflags & TH_ACK) != 0) {
-                               if (tfo_enabled(tp)) {
+                               if (tfo_enabled(tp) &&
+                                   !(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE)) {
                                        tcp_heuristic_tfo_rst(tp);
                                }
                                if ((tp->ecn_flags & (TE_SETUPSENT | TE_RCVD_SYN_RST)) == TE_SETUPSENT) {
@@ -3467,9 +3640,11 @@ findpcb:
                                tp = tcp_drop(tp, ECONNREFUSED);
                                postevent(so, 0, EV_RESET);
                        }
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT got RST");
                        goto drop;
                }
                if ((thflags & TH_SYN) == 0) {
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT no SYN");
                        goto drop;
                }
                tp->snd_wnd = th->th_win;       /* initial send window */
@@ -3531,12 +3706,15 @@ findpcb:
                                 * There is a middlebox that acks all but one
                                 * byte and still drops the data.
                                 */
-                               if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
+                               if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
+                                   (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
                                    tp->snd_max == th->th_ack + 1 &&
                                    tp->snd_max > tp->snd_una + 1) {
                                        tcp_heuristic_tfo_middlebox(tp);
 
                                        so->so_error = ENODATA;
+                                       soevent(so,
+                                           (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR));
 
                                        tp->t_tfo_stats |= TFO_S_ONE_BYTE_PROXY;
                                }
@@ -3573,6 +3751,8 @@ findpcb:
                                tp->t_state = TCPS_FIN_WAIT_1;
                                tp->t_flags &= ~TF_NEEDFIN;
                                thflags &= ~TH_SYN;
+
+                               TCP_LOG_CONNECTION_SUMMARY(tp);
                        } else {
                                DTRACE_TCP4(state__change, void, NULL,
                                    struct inpcb *, inp, struct tcpcb *,
@@ -3692,10 +3872,12 @@ trimthenstep6:
                if (thflags & TH_SYN) {
                        /* Drop the packet silently if we have reached the limit */
                        if (tcp_do_rfc5961 && tcp_is_ack_ratelimited(tp)) {
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited");
                                goto drop;
                        } else {
                                /* Send challenge ACK */
                                tcpstat.tcps_synchallenge++;
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK");
                                goto dropafterack;
                        }
                }
@@ -3791,6 +3973,7 @@ trimthenstep6:
                                case TCPS_ESTABLISHED:
                                        if (tcp_do_rfc5961 == 0 && tp->last_ack_sent != th->th_seq) {
                                                tcpstat.tcps_badrst++;
+                                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 bad RST");
                                                goto drop;
                                        }
                                        if (TCP_ECN_ENABLED(tp) &&
@@ -3832,10 +4015,12 @@ close:
                                tcpstat.tcps_badrst++;
                                /* Drop if we have reached the ACK limit */
                                if (tcp_is_ack_ratelimited(tp)) {
+                                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited");
                                        goto drop;
                                } else {
                                        /* Send challenge ACK */
                                        tcpstat.tcps_rstchallenge++;
+                                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK");
                                        goto dropafterack;
                                }
                        }
@@ -3911,6 +4096,7 @@ close:
        if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
                rstreason = BANDLIM_RST_OPENPORT;
                IF_TCP_STATINC(ifp, dospacket);
+               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad SEQ");
                goto dropwithreset;
        }
 
@@ -4018,15 +4204,18 @@ close:
 
                if (!(so->so_flags & SOF_MP_SUBFLOW) && (so->so_state & SS_NOFDREF) &&
                    tp->t_state > TCPS_CLOSE_WAIT) {
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SS_NOFDREF");
                        close_it = TRUE;
                }
 
                if ((so->so_flags & SOF_MP_SUBFLOW) && (mptetoso(tptomptp(tp)->mpt_mpte)->so_state & SS_NOFDREF) &&
                    tp->t_state > TCPS_CLOSE_WAIT) {
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SOF_MP_SUBFLOW SS_NOFDREF");
                        close_it = TRUE;
                }
 
                if ((so->so_flags & SOF_DEFUNCT) && tp->t_state > TCPS_FIN_WAIT_1) {
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SOF_DEFUNCT");
                        close_it = TRUE;
                }
 
@@ -4122,10 +4311,12 @@ close:
                        tcpstat.tcps_badsyn++;
                        /* Drop if we have reached ACK limit */
                        if (tcp_is_ack_ratelimited(tp)) {
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN rate limited");
                                goto drop;
                        } else {
                                /* Send challenge ACK */
                                tcpstat.tcps_synchallenge++;
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN challenge ack");
                                goto dropafterack;
                        }
                } else {
@@ -4133,6 +4324,7 @@ close:
                        rstreason = BANDLIM_UNLIMITED;
                        postevent(so, 0, EV_RESET);
                        IF_TCP_STATINC(ifp, synwindow);
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad SYN");
                        goto dropwithreset;
                }
        }
@@ -4173,8 +4365,10 @@ close:
 
                        goto step6;
                } else if (tp->t_flags & TF_ACKNOW) {
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK");
                        goto dropafterack;
                } else {
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK");
                        goto drop;
                }
        }
@@ -4214,6 +4408,8 @@ close:
                            struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
                        tp->t_state = TCPS_FIN_WAIT_1;
                        tp->t_flags &= ~TF_NEEDFIN;
+
+                       TCP_LOG_CONNECTION_SUMMARY(tp);
                } else {
                        DTRACE_TCP4(state__change, void, NULL,
                            struct inpcb *, inp,
@@ -4237,8 +4433,17 @@ close:
                 * later; if not, do so now to pass queued data to user.
                 */
                if (tlen == 0 && (thflags & TH_FIN) == 0) {
+                       if (isipv6) {
+                               memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
+                               ip6 = (struct ip6_hdr *)&saved_hdr[0];
+                       } else {
+                               memcpy(&saved_hdr, ip, ip->ip_hl << 2);
+                               ip = (struct ip *)&saved_hdr[0];
+                       }
+                       memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
                        (void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
-                           NULL, ifp);
+                           NULL, ifp, &read_wakeup);
+                       th = &saved_tcphdr;
                }
                tp->snd_wl1 = th->th_seq - 1;
 
@@ -4323,6 +4528,7 @@ close:
                if (SEQ_GT(th->th_ack, tp->snd_max)) {
                        tcpstat.tcps_rcvacktoomuch++;
                        if (tcp_do_rfc5961 && tcp_is_ack_ratelimited(tp)) {
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 rcvacktoomuch");
                                goto drop;
                        } else {
                                goto dropafterack;
@@ -4330,6 +4536,7 @@ close:
                }
                if (tcp_do_rfc5961 && SEQ_LT(th->th_ack, tp->snd_una - tp->max_sndwnd)) {
                        if (tcp_is_ack_ratelimited(tp)) {
+                               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad ACK");
                                goto drop;
                        } else {
                                goto dropafterack;
@@ -4366,10 +4573,6 @@ close:
                                                    ~TMPF_PREESTABLISHED;
                                                tp->t_mpflags |=
                                                    TMPF_MPTCP_TRUE;
-                                               mptcplog((LOG_DEBUG, "MPTCP "
-                                                   "Sockets: %s \n", __func__),
-                                                   MPTCP_SOCKET_DBG,
-                                                   MPTCP_LOGLVL_LOG);
 
                                                tp->t_timer[TCPT_JACK_RXMT] = 0;
                                                tp->t_mprxtshift = 0;
@@ -4793,6 +4996,12 @@ process_ACK:
                            tp->t_rxtcur);
                }
 
+               if ((prev_t_state == TCPS_SYN_SENT ||
+                   prev_t_state == TCPS_SYN_RECEIVED) &&
+                   tp->t_state == TCPS_ESTABLISHED) {
+                       TCP_LOG_RTT_INFO(tp);
+               }
+
                /*
                 * If no data (only SYN) was ACK'd, skip rest of ACK
                 * processing.
@@ -4899,12 +5108,7 @@ process_ACK:
                        tcp_bwmeas_check(tp);
                }
 
-               /*
-                * sowwakeup must happen after snd_una, et al. are
-                * updated so that the sequence numbers are in sync with
-                * so_snd
-                */
-               sowwakeup(so);
+               write_wakeup = 1;
 
                if (!SLIST_EMPTY(&tp->t_notify_ack)) {
                        tcp_notify_acknowledgement(tp, so);
@@ -5100,6 +5304,7 @@ dodata:
         */
        if (inp->inp_state == INPCB_STATE_DEAD) {
                /* Just drop the packet that we are processing and return */
+               TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "INPCB_STATE_DEAD");
                goto drop;
        }
 
@@ -5175,12 +5380,30 @@ dodata:
                            TCP_AUTORCVBUF_MAX(ifp));
                        so_recv_data_stat(so, m, drop_hdrlen);
 
+                       if (isipv6) {
+                               memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
+                               ip6 = (struct ip6_hdr *)&saved_hdr[0];
+                       } else {
+                               memcpy(&saved_hdr, ip, ip->ip_hl << 2);
+                               ip = (struct ip *)&saved_hdr[0];
+                       }
+                       memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
                        if (sbappendstream_rcvdemux(so, m,
                            th->th_seq - (tp->irs + 1), 0)) {
-                               sorwakeup(so);
+                               read_wakeup = 1;
                        }
+                       th = &saved_tcphdr;
                } else {
-                       thflags = tcp_reass(tp, th, &tlen, m, ifp);
+                       if (isipv6) {
+                               memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
+                               ip6 = (struct ip6_hdr *)&saved_hdr[0];
+                       } else {
+                               memcpy(&saved_hdr, ip, ip->ip_hl << 2);
+                               ip = (struct ip *)&saved_hdr[0];
+                       }
+                       memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
+                       thflags = tcp_reass(tp, th, &tlen, m, ifp, &read_wakeup);
+                       th = &saved_tcphdr;
                        tp->t_flags |= TF_ACKNOW;
                }
 
@@ -5309,6 +5532,10 @@ dodata:
        }
 #endif
 
+       if (read_wakeup) {
+               mptcp_handle_input(so);
+       }
+
        /*
         * Return any desired output.
         */
@@ -5318,6 +5545,7 @@ dodata:
 
        tcp_check_timer_state(tp);
 
+       tcp_handle_wakeup(so, read_wakeup, write_wakeup);
 
        socket_unlock(so, 1);
        KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
@@ -5354,8 +5582,11 @@ dropafterack:
 #endif
        m_freem(m);
        tp->t_flags |= TF_ACKNOW;
+
        (void) tcp_output(tp);
 
+       tcp_handle_wakeup(so, read_wakeup, write_wakeup);
+
        /* Don't need to check timer state as we should have done it during tcp_output */
        socket_unlock(so, 1);
        KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
@@ -5423,6 +5654,8 @@ dropwithreset:
                (void) soabort(so);
                socket_unlock(so, 1);
        } else if ((inp != NULL) && (nosock == 0)) {
+               tcp_handle_wakeup(so, read_wakeup, write_wakeup);
+
                socket_unlock(so, 1);
        }
        KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
@@ -5445,6 +5678,8 @@ drop:
                (void) soabort(so);
                socket_unlock(so, 1);
        } else if (nosock == 0) {
+               tcp_handle_wakeup(so, read_wakeup, write_wakeup);
+
                socket_unlock(so, 1);
        }
        KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
@@ -5781,6 +6016,9 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt,
     u_int32_t tsecr, tcp_seq th_ack)
 {
        int delta;
+       int old_srtt = tp->t_srtt;
+       int old_rttvar = tp->t_rttvar;
+       bool log_rtt = false;
 
        /*
         * On AWDL interface, the initial RTT measurement on SYN
@@ -5908,6 +6146,12 @@ compute_rto:
         * and the return path might not be symmetrical).
         */
        tp->t_softerror = 0;
+
+       if (log_rtt) {
+               TCP_LOG_RTT_INFO(tp);
+       }
+
+       TCP_LOG_RTT_CHANGE(tp, old_srtt, old_rttvar);
 }
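
Note: old_srtt/old_rttvar snapshot the estimator state so that
TCP_LOG_RTT_CHANGE can report the delta. The smoothing itself is the classic
Jacobson/Karels estimator, which the kernel keeps in scaled fixed point; in
plain integer form the update is approximately:

/* Illustrative form of tcp_xmit_timer's smoothing (gains 1/8 and 1/4);
 * the kernel pre-scales srtt and rttvar to preserve precision. */
static void
rtt_update(int *srtt, int *rttvar, int rtt)
{
	int delta = rtt - *srtt;

	*srtt += delta / 8;                     /* srtt <- 7/8*srtt + 1/8*rtt */
	if (delta < 0) {
		delta = -delta;
	}
	*rttvar += (delta - *rttvar) / 4;       /* rttvar <- 3/4*rttvar + 1/4*|err| */
	/* RTO is then srtt + 4*rttvar, clamped to the [min, max] bounds */
}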
 
 static inline unsigned int
@@ -6026,6 +6270,15 @@ tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope)
 #endif
 
        inp = tp->t_inpcb;
+
+       so = inp->inp_socket;
+       /*
+        * Nothing left to send after the socket is defunct or TCP is in the closed state
+        */
+       if ((so->so_state & SS_DEFUNCT) || tp->t_state == TCPS_CLOSED) {
+               return;
+       }
+
 #if INET6
        isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
        min_protoh = isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr)
@@ -6064,7 +6317,6 @@ tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope)
            ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
                tp->t_flags |= TF_SLOWLINK;
        }
-       so = inp->inp_socket;
 
        taop = rmx_taop(rt->rt_rmx);
        /*
@@ -6663,41 +6915,8 @@ tcp_getstat SYSCTL_HANDLER_ARGS
        struct tcpstat *stat;
        stat = &tcpstat;
 #if !CONFIG_EMBEDDED
-       proc_t caller = PROC_NULL;
-       proc_t caller_parent = PROC_NULL;
-       char command_name[MAXCOMLEN + 1] = "";
-       char parent_name[MAXCOMLEN + 1] = "";
        struct tcpstat zero_stat;
-       if ((caller = proc_self()) != PROC_NULL) {
-               /* get process name */
-               strlcpy(command_name, caller->p_comm, sizeof(command_name));
-
-               /* get parent process name if possible */
-               if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
-                       strlcpy(parent_name, caller_parent->p_comm,
-                           sizeof(parent_name));
-                       proc_rele(caller_parent);
-               }
-
-               if ((escape_str(command_name, strlen(command_name) + 1,
-                   sizeof(command_name)) == 0) &&
-                   (escape_str(parent_name, strlen(parent_name) + 1,
-                   sizeof(parent_name)) == 0)) {
-                       kern_asl_msg(LOG_DEBUG, "messagetracer",
-                           5,
-                           "com.apple.message.domain",
-                           "com.apple.kernel.tcpstat", /* 1 */
-                           "com.apple.message.signature",
-                           "tcpstat", /* 2 */
-                           "com.apple.message.signature2", command_name, /* 3 */
-                           "com.apple.message.signature3", parent_name, /* 4 */
-                           "com.apple.message.summarize", "YES", /* 5 */
-                           NULL);
-               }
-       }
-       if (caller != PROC_NULL) {
-               proc_rele(caller);
-       }
+
        if (tcp_disable_access_to_stats &&
            !kauth_cred_issuser(kauth_cred_get())) {
                bzero(&zero_stat, sizeof(zero_stat));
diff --git a/bsd/netinet/tcp_log.c b/bsd/netinet/tcp_log.c
new file mode 100644
index 0000000..0fcdb48
--- /dev/null
+++ b/bsd/netinet/tcp_log.c
@@ -0,0 +1,938 @@
+/*
+ * Copyright (c) 2018-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/param.h>
+#include <sys/protosw.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+
+#include <netinet/ip.h>
+#if INET6
+#include <netinet/ip6.h>
+#endif /* INET6 */
+
+#if !TCPDEBUG
+#define TCPSTATES
+#endif /* !TCPDEBUG */
+#include <netinet/tcp_fsm.h>
+
+#include <netinet/tcp_log.h>
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+    "TCP logs");
+
+static int tcp_log_level_info = 0;
+SYSCTL_INT(_net_inet_tcp_log, OID_AUTO, level_info,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_level_info, 0, "");
+
+#if (DEVELOPMENT || DEBUG)
+#if defined(XNU_TARGET_OS_OSX)
+/*
+ * Log less on macOS as sockets are more prevalent than channels
+ */
+#define TCP_LOG_ENABLE_DEFAULT \
+    (TLEF_CONNECTION | TLEF_DST_LOCAL | TLEF_DST_GW | \
+    TLEF_DROP_NECP)
+#else /* XNU_TARGET_OS_OSX */
+#define TCP_LOG_ENABLE_DEFAULT \
+    (TLEF_CONNECTION | TLEF_DST_LOCAL | TLEF_DST_GW | \
+    TLEF_DROP_NECP | TLEF_DROP_PCB | TLEF_DROP_PKT | TLEF_THF_SYN)
+#endif /* XNU_TARGET_OS_OSX */
+#else /* (DEVELOPMENT || DEBUG) */
+#define TCP_LOG_ENABLE_DEFAULT 0
+#endif /* (DEVELOPMENT || DEBUG) */
+
+uint32_t tcp_log_enable_flags = TCP_LOG_ENABLE_DEFAULT;
+SYSCTL_UINT(_net_inet_tcp_log, OID_AUTO, enable,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_enable_flags, 0, "");
+
+/*
+ * The following string describes the meaning of each flag value
+ */
+#define X(name, value, description, ...) #description ":" #value " "
+SYSCTL_STRING(_net_inet_tcp_log, OID_AUTO, enable_usage, CTLFLAG_RD | CTLFLAG_LOCKED,
+    TCP_ENABLE_FLAG_LIST, 0, "");
+#undef X
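
With this X definition in scope, TCP_ENABLE_FLAG_LIST (declared in tcp_log.h later in this commit) stringizes each entry into "description:value ", and the adjacent literals concatenate into a single constant, so reading the read-only net.inet.tcp.log.enable_usage sysctl returns roughly:

    connection:0x1 rtt:0x2 ka:0x4 loop:0x10 local:0x20 gw:0x40 syn:0x100
    fin:0x200 rst:0x400 dropnecp:0x1000 droppcb:0x2000 droppkt:0x4000
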
+
+/*
+ * Values for tcp_log_port when TLEF_RTT is enabled:
+ *  0: log all TCP connections regardless of the port numbers
+ *  1 to 65535: log TCP connections with this local or foreign port
+ *  other: do not log (same effect as tcp_log_rtt == 0)
+ */
+uint32_t tcp_log_port = 0;
+SYSCTL_UINT(_net_inet_tcp_log, OID_AUTO, rtt_port, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &tcp_log_port, 0, "");
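
For example, to limit TLEF_RTT logging to connections on port 443, this sysctl can be set from userspace. A minimal sketch using the standard sysctlbyname(3) interface (illustrative only, not part of this commit; requires root):

    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t port = 443;    /* log only flows with local or foreign port 443 */

            if (sysctlbyname("net.inet.tcp.log.rtt_port", NULL, NULL,
                &port, sizeof(port)) != 0) {
                    perror("sysctlbyname net.inet.tcp.log.rtt_port");
                    return 1;
            }
            return 0;
    }
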
+
+/*
+ * Values for tcp_log_thflags_if_family when TLEF_THF_XXX is enabled:
+ *  0: all interfaces
+ *  other: only for interfaces with the corresponding interface functional type
+ */
+#if (DEVELOPMENT || DEBUG)
+#define TCP_LOG_THFLAGS_IF_FAMILY_DEFAULT IFNET_FAMILY_IPSEC
+#else /* (DEVELOPMENT || DEBUG) */
+#define TCP_LOG_THFLAGS_IF_FAMILY_DEFAULT 0
+#endif /* (DEVELOPMENT || DEBUG) */
+
+static uint32_t tcp_log_thflags_if_family = TCP_LOG_THFLAGS_IF_FAMILY_DEFAULT;
+SYSCTL_UINT(_net_inet_tcp_log, OID_AUTO, thflags_if_family,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_thflags_if_family, 0, "");
+
+#if (DEVELOPMENT || DEBUG)
+#define TCP_LOG_PRIVACY_DEFAULT 0
+#else
+#define TCP_LOG_PRIVACY_DEFAULT 1
+#endif /* (DEVELOPMENT || DEBUG) */
+
+int tcp_log_privacy = TCP_LOG_PRIVACY_DEFAULT;
+SYSCTL_INT(_net_inet_tcp_log, OID_AUTO, privacy,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_privacy, 0, "");
+
+#define TCP_LOG_RATE_LIMIT 600
+static unsigned int tcp_log_rate_limit = TCP_LOG_RATE_LIMIT;
+SYSCTL_UINT(_net_inet_tcp_log, OID_AUTO, rate_limit,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_rate_limit, 0, "");
+
+/* 1 minute by default */
+#define TCP_LOG_RATE_DURATION 60
+static unsigned int tcp_log_rate_duration = TCP_LOG_RATE_DURATION;
+SYSCTL_UINT(_net_inet_tcp_log, OID_AUTO, rate_duration,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_log_rate_duration, 0, "");
+
+static unsigned long tcp_log_rate_max = 0;
+SYSCTL_ULONG(_net_inet_tcp_log, OID_AUTO, rate_max,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_log_rate_max, "");
+
+static unsigned long tcp_log_rate_exceeded_total = 0;
+SYSCTL_ULONG(_net_inet_tcp_log, OID_AUTO, rate_exceeded_total,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_log_rate_exceeded_total, "");
+
+static unsigned long tcp_log_rate_current = 0;
+SYSCTL_ULONG(_net_inet_tcp_log, OID_AUTO, rate_current,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_log_rate_current, "");
+
+static bool tcp_log_rate_exceeded_logged = false;
+
+static uint64_t tcp_log_current_period = 0;
+
+#define ADDRESS_STR_LEN (MAX_IPv6_STR_LEN + 6)
+
+#define TCP_LOG_COMMON_FMT \
+           "[%s:%u<->%s:%u] " \
+           "interface: %s " \
+           "(skipped: %lu)\n"
+
+#define TCP_LOG_COMMON_ARGS \
+       laddr_buf, ntohs(local_port), faddr_buf, ntohs(foreign_port), \
+           ifp != NULL ? if_name(ifp) : "", \
+           tcp_log_rate_exceeded_total
+
+#define TCP_LOG_COMMON_PCB_FMT \
+       TCP_LOG_COMMON_FMT \
+       "t_state: %s " \
+       "process: %s:%u "
+
+#define TCP_LOG_COMMON_PCB_ARGS \
+       TCP_LOG_COMMON_ARGS, \
+       tcpstates[tp->t_state], \
+       inp->inp_last_proc_name, so->last_pid
+
+/*
+ * Returns true when above the rate limit
+ */
+static bool
+tcp_log_is_rate_limited(void)
+{
+       uint64_t current_net_period = net_uptime();
+
+       /* When set to zero it means to reset to default */
+       if (tcp_log_rate_duration == 0) {
+               tcp_log_rate_duration = TCP_LOG_RATE_DURATION;
+       }
+       if (tcp_log_rate_limit == 0) {
+               tcp_log_rate_limit = TCP_LOG_RATE_LIMIT;
+       }
+
+       if (current_net_period > tcp_log_current_period + tcp_log_rate_duration) {
+               if (tcp_log_rate_current > tcp_log_rate_max) {
+                       tcp_log_rate_max = tcp_log_rate_current;
+               }
+               tcp_log_current_period = current_net_period;
+               tcp_log_rate_current = 0;
+               tcp_log_rate_exceeded_logged = false;
+       }
+
+       tcp_log_rate_current += 1;
+
+       if (tcp_log_rate_current > (unsigned long) tcp_log_rate_limit) {
+               tcp_log_rate_exceeded_total += 1;
+               return true;
+       }
+
+       return false;
+}
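
This is a fixed-window counter: each call increments tcp_log_rate_current, the window rolls over once net_uptime() passes tcp_log_current_period + tcp_log_rate_duration, and any call beyond tcp_log_rate_limit inside the current window is refused and counted in tcp_log_rate_exceeded_total. A standalone sketch of the same scheme (hypothetical names, outside the kernel):

    #include <stdbool.h>
    #include <stdint.h>

    struct rate_window {
            uint64_t window_start;  /* seconds, as from net_uptime() */
            unsigned long count;    /* events seen in the current window */
    };

    /* Returns true when the caller should be refused (limit exceeded). */
    static bool
    rate_limited(struct rate_window *rw, uint64_t now,
        unsigned long limit, unsigned int duration)
    {
            if (now > rw->window_start + duration) {
                    rw->window_start = now; /* start a fresh window */
                    rw->count = 0;
            }
            return ++rw->count > limit;
    }
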
+
+static void
+tcp_log_inp_addresses(struct inpcb *inp, char *lbuf, size_t lbuflen, char *fbuf, size_t fbuflen)
+{
+       /*
+        * Ugly but %{private} does not work in the kernel version of os_log()
+        */
+       if (tcp_log_privacy != 0) {
+               if (inp->inp_vflag & INP_IPV6) {
+                       strlcpy(lbuf, "<IPv6-redacted>", lbuflen);
+                       strlcpy(fbuf, "<IPv6-redacted>", fbuflen);
+               } else {
+                       strlcpy(lbuf, "<IPv4-redacted>", lbuflen);
+                       strlcpy(fbuf, "<IPv4-redacted>", fbuflen);
+               }
+       } else if (inp->inp_vflag & INP_IPV6) {
+               inet_ntop(AF_INET6, (void *)&inp->in6p_laddr, lbuf, lbuflen);
+               inet_ntop(AF_INET6, (void *)&inp->in6p_faddr, fbuf, fbuflen);
+       } else {
+               inet_ntop(AF_INET, (void *)&inp->inp_laddr.s_addr, lbuf, lbuflen);
+               inet_ntop(AF_INET, (void *)&inp->inp_faddr.s_addr, fbuf, fbuflen);
+       }
+}
+
+void
+tcp_log_rtt_info(const char *func_name, int line_no, struct tcpcb *tp)
+{
+       struct inpcb *inp = tp->t_inpcb;
+       struct socket *so = inp->inp_socket;
+       struct ifnet *ifp;
+       char laddr_buf[ADDRESS_STR_LEN];
+       char faddr_buf[ADDRESS_STR_LEN];
+       in_port_t local_port = inp->inp_lport;
+       in_port_t foreign_port = inp->inp_fport;
+
+       /* Do not log too much */
+       if (tcp_log_is_rate_limited()) {
+               return;
+       }
+
+       ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
+           inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL;
+
+       tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf));
+
+       os_log(OS_LOG_DEFAULT,
+           "tcp_rtt_info (%s:%d) "
+           TCP_LOG_COMMON_PCB_FMT
+           "rttcur: %u ms srtt: %u ms rttvar: %u ms rttmin: %u ms rxtcur: %u rxtshift: %u",
+           func_name, line_no,
+           TCP_LOG_COMMON_PCB_ARGS,
+           tp->t_rttcur, tp->t_srtt >> TCP_RTT_SHIFT,
+           tp->t_rttvar >> TCP_RTTVAR_SHIFT,
+           tp->t_rttmin, tp->t_rxtcur, tp->t_rxtshift);
+}
+
+void
+tcp_log_rt_rtt(const char *func_name, int line_no, struct tcpcb *tp,
+    struct rtentry *rt)
+{
+       struct inpcb *inp = tp->t_inpcb;
+       struct socket *so = inp->inp_socket;
+       struct ifnet *ifp;
+       char laddr_buf[ADDRESS_STR_LEN];
+       char faddr_buf[ADDRESS_STR_LEN];
+       in_port_t local_port = inp->inp_lport;
+       in_port_t foreign_port = inp->inp_fport;
+
+       /* Do not log too much */
+       if (tcp_log_is_rate_limited()) {
+               return;
+       }
+
+       ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
+           inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL;
+
+       tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf));
+
+       /*
+        * Log RTT values in milliseconds
+        */
+       os_log(OS_LOG_DEFAULT,
+           "tcp_rt_rtt (%s:%d) "
+           TCP_LOG_COMMON_PCB_FMT
+           "rt_rmx: RTV_RTT: %d ms rtt: %u ms rttvar: %u ms",
+           func_name, line_no,
+           TCP_LOG_COMMON_PCB_ARGS,
+           (rt->rt_rmx.rmx_locks & RTV_RTT),
+           rt->rt_rmx.rmx_rtt / (RTM_RTTUNIT / TCP_RETRANSHZ),
+           rt->rt_rmx.rmx_rttvar / (RTM_RTTUNIT / TCP_RETRANSHZ));
+}
+
+void
+tcp_log_rtt_change(const char *func_name, int line_no, struct tcpcb *tp,
+    int old_srtt, int old_rttvar)
+{
+       int srtt_diff;
+       int rttvar_diff;
+
+       srtt_diff = ABS(tp->t_srtt - old_srtt) >> TCP_RTT_SHIFT;
+       rttvar_diff = ABS((tp->t_rttvar - old_rttvar) >> TCP_RTTVAR_SHIFT);
+       if (srtt_diff >= 1000 || rttvar_diff >= 500) {
+               struct inpcb *inp = tp->t_inpcb;
+               struct socket *so = inp->inp_socket;
+               struct ifnet *ifp;
+               char laddr_buf[ADDRESS_STR_LEN];
+               char faddr_buf[ADDRESS_STR_LEN];
+               in_port_t local_port = inp->inp_lport;
+               in_port_t foreign_port = inp->inp_fport;
+
+               /* Do not log too much */
+               if (tcp_log_is_rate_limited()) {
+                       return;
+               }
+
+               ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
+                   inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL;
+
+               tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf));
+
+               os_log(OS_LOG_DEFAULT,
+                   "tcp_rtt_change (%s:%d) "
+                   TCP_LOG_COMMON_PCB_FMT
+                   "srtt: %u ms old_rtt: %u ms "
+                   "rttvar: %u old_rttvar: %u ms ",
+                   func_name, line_no,
+                   TCP_LOG_COMMON_PCB_ARGS,
+                   tp->t_srtt >> TCP_RTT_SHIFT,
+                   old_srtt >> TCP_RTT_SHIFT,
+                   tp->t_rttvar >> TCP_RTTVAR_SHIFT,
+                   old_rttvar >> TCP_RTTVAR_SHIFT);
+       }
+}
+
+void
+tcp_log_keepalive(const char *func_name, int line_no, struct tcpcb *tp,
+    int32_t idle_time)
+{
+       struct inpcb *inp = tp->t_inpcb;
+       struct socket *so = inp->inp_socket;
+       struct ifnet *ifp;
+       char laddr_buf[ADDRESS_STR_LEN];
+       char faddr_buf[ADDRESS_STR_LEN];
+       in_port_t local_port = inp->inp_lport;
+       in_port_t foreign_port = inp->inp_fport;
+
+       /* Do not log too much */
+       if (tcp_log_is_rate_limited()) {
+               return;
+       }
+
+       ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
+           inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL;
+
+       tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf));
+
+       os_log(OS_LOG_DEFAULT,
+           "tcp_keepalive (%s:%d) "
+           TCP_LOG_COMMON_PCB_FMT
+           "snd_una: %u snd_max: %u "
+           "SO_KA: %d RSTALL: %d TFOPRB: %d idle_time: %u "
+           "KIDLE: %d KINTV: %d KCNT: %d",
+           func_name, line_no,
+           TCP_LOG_COMMON_PCB_ARGS,
+           tp->snd_una, tp->snd_max,
+           tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE,
+           tp->t_flagsext & TF_DETECT_READSTALL,
+           tp->t_tfo_probe_state == TFO_PROBE_PROBING,
+           idle_time,
+           TCP_CONN_KEEPIDLE(tp), TCP_CONN_KEEPINTVL(tp),
+           TCP_CONN_KEEPCNT(tp));
+}
+
+void
+tcp_log_connection(struct tcpcb *tp, const char *event, int error)
+{
+       struct inpcb *inp;
+       struct socket *so;
+       struct ifnet *ifp;
+       char laddr_buf[ADDRESS_STR_LEN];
+       char faddr_buf[ADDRESS_STR_LEN];
+       in_port_t local_port;
+       in_port_t foreign_port;
+
+       if (tp == NULL || tp->t_inpcb == NULL || tp->t_inpcb->inp_socket == NULL || event == NULL) {
+               return;
+       }
+
+       /* Do not log too much */
+       if (tcp_log_is_rate_limited()) {
+               return;
+       }
+       inp = tp->t_inpcb;
+       so = inp->inp_socket;
+
+       local_port = inp->inp_lport;
+       foreign_port = inp->inp_fport;
+
+       ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
+           inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL;
+
+       tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf));
+
+#define TCP_LOG_CONNECT_FMT \
+           "tcp %s: " \
+           TCP_LOG_COMMON_PCB_FMT \
+           "rtt: %u.%u ms " \
+           "rttvar: %u.%u ms " \
+           "error: %d " \
+           "so_error: %d " \
+           "svc/tc: %u"
+
+#define TCP_LOG_CONNECT_ARGS \
+           event, \
+           TCP_LOG_COMMON_PCB_ARGS, \
+           tp->t_srtt >> TCP_RTT_SHIFT, tp->t_srtt - ((tp->t_srtt >> TCP_RTT_SHIFT) << TCP_RTT_SHIFT), \
+           tp->t_rttvar >> TCP_RTTVAR_SHIFT, tp->t_rttvar - ((tp->t_rttvar >> TCP_RTTVAR_SHIFT) << TCP_RTTVAR_SHIFT), \
+           error, \
+           so->so_error, \
+           (so->so_flags1 & SOF1_TC_NET_SERV_TYPE) ? so->so_netsvctype : so->so_traffic_class
+
+       if (so->so_head == NULL) {
+               if (tcp_log_level_info == 0) {
+                       os_log(OS_LOG_DEFAULT, TCP_LOG_CONNECT_FMT,
+                           TCP_LOG_CONNECT_ARGS);
+               } else {
+                       os_log_info(OS_LOG_DEFAULT, TCP_LOG_CONNECT_FMT,
+                           TCP_LOG_CONNECT_ARGS);
+               }
+       } else {
+#define TCP_LOG_CONN_Q_FMT \
+       "so_qlimit: %d "\
+       "so_qlen: %d "\
+       "so_incqlen: %d "
+
+#define TCP_LOG_CONN_Q_ARGS \
+       so->so_head->so_qlimit, \
+       so->so_head->so_qlen, \
+       so->so_head->so_incqlen
+
+               if (tcp_log_level_info == 0) {
+                       os_log(OS_LOG_DEFAULT, TCP_LOG_CONNECT_FMT "\n" TCP_LOG_CONN_Q_FMT,
+                           TCP_LOG_CONNECT_ARGS, TCP_LOG_CONN_Q_ARGS);
+               } else {
+                       os_log_info(OS_LOG_DEFAULT, TCP_LOG_CONNECT_FMT "\n" TCP_LOG_CONN_Q_FMT,
+                           TCP_LOG_CONNECT_ARGS, TCP_LOG_CONN_Q_ARGS);
+               }
+#undef TCP_LOG_CONN_Q_FMT
+#undef TCP_LOG_CONN_Q_ARGS
+       }
+#undef TCP_LOG_CONNECT_FMT
+#undef TCP_LOG_CONNECT_ARGS
+}
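
The rtt and rttvar pairs in the format above are fixed point. Assuming the classic BSD scaling (TCP_RTT_SHIFT of 5 and TCP_RTTVAR_SHIFT of 4), t_srtt holds the smoothed RTT in 1/32 ms units, and the %u.%u pair prints the whole milliseconds followed by the raw remainder, not a decimal fraction:

    /* Illustrative split, assuming TCP_RTT_SHIFT == 5 (t_srtt in 1/32 ms). */
    static void
    srtt_parts(int t_srtt, unsigned int *whole_ms, unsigned int *frac32)
    {
            *whole_ms = t_srtt >> 5;              /* integer milliseconds */
            *frac32 = t_srtt - (*whole_ms << 5);  /* remainder, 1/32 ms units */
    }

    /* t_srtt == 401 prints as "12.17 ms", i.e. 12 + 17/32 ms (about 12.53 ms). */
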
+
+void
+tcp_log_listen(struct tcpcb *tp, int error)
+{
+       struct inpcb *inp;
+       struct socket *so;
+       struct ifnet *ifp;
+       char laddr_buf[ADDRESS_STR_LEN];
+       char faddr_buf[ADDRESS_STR_LEN];
+       in_port_t local_port;
+       in_port_t foreign_port;
+
+       if (tp == NULL || tp->t_inpcb == NULL || tp->t_inpcb->inp_socket == NULL) {
+               return;
+       }
+
+       /* Do not log too much */
+       if (tcp_log_is_rate_limited()) {
+               return;
+       }
+       inp = tp->t_inpcb;
+       so = inp->inp_socket;
+
+       local_port = inp->inp_lport;
+       foreign_port = inp->inp_fport;
+
+       ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
+           inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL;
+
+       tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf));
+
+#define TCP_LOG_LISTEN_FMT \
+           "tcp listen: " \
+           TCP_LOG_COMMON_PCB_FMT \
+           "so_qlimit: %d "\
+           "error: %d " \
+           "so_error: %d " \
+           "svc/tc: %u"
+
+#define TCP_LOG_LISTEN_ARGS \
+           TCP_LOG_COMMON_PCB_ARGS, \
+           so->so_qlimit, \
+           error, \
+           so->so_error, \
+           (so->so_flags1 & SOF1_TC_NET_SERV_TYPE) ? so->so_netsvctype : so->so_traffic_class
+
+       if (tcp_log_level_info == 0) {
+               os_log(OS_LOG_DEFAULT, TCP_LOG_LISTEN_FMT,
+                   TCP_LOG_LISTEN_ARGS);
+       } else {
+               os_log_info(OS_LOG_DEFAULT, TCP_LOG_LISTEN_FMT,
+                   TCP_LOG_LISTEN_ARGS);
+       }
+#undef TCP_LOG_LISTEN_FMT
+#undef TCP_LOG_LISTEN_ARGS
+}
+
+void
+tcp_log_connection_summary(struct tcpcb *tp)
+{
+       struct inpcb *inp;
+       struct socket *so;
+       struct ifnet *ifp;
+       uint32_t conntime = 0;
+       uint32_t duration = 0;
+       char laddr_buf[ADDRESS_STR_LEN];
+       char faddr_buf[ADDRESS_STR_LEN];
+       in_port_t local_port;
+       in_port_t foreign_port;
+
+       if (tp == NULL || tp->t_inpcb == NULL || tp->t_inpcb->inp_socket == NULL) {
+               return;
+       }
+
+       /* Do not log too much */
+       if (tcp_log_is_rate_limited()) {
+               return;
+       }
+       inp = tp->t_inpcb;
+       so = inp->inp_socket;
+
+       local_port = inp->inp_lport;
+       foreign_port = inp->inp_fport;
+
+       /* Make sure the summary is logged only once */
+       if (tp->t_flagsext & TF_LOGGED_CONN_SUMMARY) {
+               return;
+       }
+       tp->t_flagsext |= TF_LOGGED_CONN_SUMMARY;
+
+       /*
+        * t_connect_time is the time when the connection started on
+        * the first SYN.
+        *
+        * t_starttime is when the three way handshake was completed.
+        */
+       if (tp->t_connect_time > 0) {
+               duration = tcp_now - tp->t_connect_time;
+
+               if (tp->t_starttime > 0) {
+                       conntime = tp->t_starttime - tp->t_connect_time;
+               }
+       }
+
+       ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
+           inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL;
+
+       tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf));
+
+#define TCP_LOG_CONNECTION_SUMMARY_FMT \
+           "tcp_connection_summary " \
+           TCP_LOG_COMMON_PCB_FMT \
+           "Duration: %u.%u sec " \
+           "Conn_Time: %u.%u sec " \
+           "syn rxmit: %u\n" \
+           "bytes in/out: %llu/%llu " \
+           "pkts in/out: %llu/%llu " \
+           "rtt: %u.%u ms " \
+           "rttvar: %u.%u ms " \
+           "pkt rxmit: %u " \
+           "ooo pkts: %u dup bytes in: %u " \
+           "so_error: %d " \
+           "svc/tc: %u"
+
+#define TCP_LOG_CONNECTION_SUMMARY_ARGS \
+           TCP_LOG_COMMON_PCB_ARGS, \
+           duration / TCP_RETRANSHZ, duration % TCP_RETRANSHZ, \
+           conntime / TCP_RETRANSHZ, conntime % TCP_RETRANSHZ,  \
+           tp->t_stat.synrxtshift, \
+           inp->inp_stat->rxbytes, inp->inp_stat->txbytes, \
+           inp->inp_stat->rxpackets, inp->inp_stat->txpackets, \
+           tp->t_srtt >> TCP_RTT_SHIFT, tp->t_srtt - ((tp->t_srtt >> TCP_RTT_SHIFT) << TCP_RTT_SHIFT), \
+           tp->t_rttvar >> TCP_RTTVAR_SHIFT, tp->t_rttvar - ((tp->t_rttvar >> TCP_RTTVAR_SHIFT) << TCP_RTTVAR_SHIFT), \
+           tp->t_stat.rxmitpkts, \
+           tp->t_rcvoopack, tp->t_stat.rxduplicatebytes, \
+           so->so_error, \
+           (so->so_flags1 & SOF1_TC_NET_SERV_TYPE) ? so->so_netsvctype : so->so_traffic_class
+
+       if (tcp_log_level_info == 0) {
+               os_log(OS_LOG_DEFAULT, TCP_LOG_CONNECTION_SUMMARY_FMT,
+                   TCP_LOG_CONNECTION_SUMMARY_ARGS);
+       } else {
+               os_log_info(OS_LOG_DEFAULT, TCP_LOG_CONNECTION_SUMMARY_FMT,
+                   TCP_LOG_CONNECTION_SUMMARY_ARGS);
+       }
+#undef TCP_LOG_CONNECTION_SUMMARY_FMT
+#undef TCP_LOG_CONNECTION_SUMMARY_ARGS
+}
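
As a worked example, assuming TCP_RETRANSHZ is 1000 so that tcp_now ticks in milliseconds: with t_connect_time = 1000, t_starttime = 1150 and tcp_now = 51000, duration is 50000 ticks and conntime is 150, which the %u.%u pairs above render as "Duration: 50.0 sec" and "Conn_Time: 0.150 sec" (the digits after the dot are the raw millisecond remainder, not a decimal fraction).
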
+
+static bool
+tcp_log_pkt_addresses(void *hdr, struct tcphdr *th, bool outgoing,
+    char *lbuf, size_t lbuflen, char *fbuf, size_t fbuflen)
+{
+       bool isipv6;
+#pragma unused(th)
+
+       isipv6 = (((struct ip *)hdr)->ip_v == 6);
+
+       if (isipv6) {
+               struct ip6_hdr *ip6 = (struct ip6_hdr *)hdr;
+
+               if (memcmp(&ip6->ip6_src, &in6addr_loopback, sizeof(struct in6_addr)) == 0 ||
+                   memcmp(&ip6->ip6_dst, &in6addr_loopback, sizeof(struct in6_addr)) == 0) {
+                       if (!(tcp_log_enable_flags & TLEF_DST_LOOPBACK)) {
+                               return false;
+                       }
+               }
+
+               if (tcp_log_privacy != 0) {
+                       strlcpy(lbuf, "<IPv6-redacted>", lbuflen);
+                       strlcpy(fbuf, "<IPv6-redacted>", fbuflen);
+               } else if (outgoing) {
+                       inet_ntop(AF_INET6, &ip6->ip6_src, lbuf, lbuflen);
+                       inet_ntop(AF_INET6, &ip6->ip6_dst, fbuf, fbuflen);
+               } else {
+                       inet_ntop(AF_INET6, &ip6->ip6_dst, lbuf, lbuflen);
+                       inet_ntop(AF_INET6, &ip6->ip6_src, fbuf, fbuflen);
+               }
+       } else {
+               struct ip *ip = (struct ip *)hdr;
+
+               if (ntohl(ip->ip_src.s_addr) == INADDR_LOOPBACK ||
+                   ntohl(ip->ip_dst.s_addr) == INADDR_LOOPBACK) {
+                       if (!(tcp_log_enable_flags & TLEF_DST_LOOPBACK)) {
+                               return false;
+                       }
+               }
+
+               if (tcp_log_privacy != 0) {
+                       strlcpy(lbuf, "<IPv4-redacted>", lbuflen);
+                       strlcpy(fbuf, "<IPv4-redacted>", fbuflen);
+               } else if (outgoing) {
+                       inet_ntop(AF_INET, (void *)&ip->ip_src.s_addr, lbuf, lbuflen);
+                       inet_ntop(AF_INET, (void *)&ip->ip_dst.s_addr, fbuf, fbuflen);
+               } else {
+                       inet_ntop(AF_INET, (void *)&ip->ip_dst.s_addr, lbuf, lbuflen);
+                       inet_ntop(AF_INET, (void *)&ip->ip_src.s_addr, fbuf, fbuflen);
+               }
+       }
+       return true;
+}
+
+/*
+ * Note: currently only used in the input path
+ */
+void
+tcp_log_drop_pcb(void *hdr, struct tcphdr *th, struct tcpcb *tp, bool outgoing, const char *reason)
+{
+       struct inpcb *inp;
+       struct socket *so;
+       struct ifnet *ifp;
+       char laddr_buf[ADDRESS_STR_LEN];
+       char faddr_buf[ADDRESS_STR_LEN];
+       in_port_t local_port;
+       in_port_t foreign_port;
+
+       if (tp == NULL) {
+               return;
+       }
+       inp = tp->t_inpcb;
+       so = inp->inp_socket;
+
+       /* Do not log too much */
+       if (tcp_log_is_rate_limited()) {
+               return;
+       }
+
+       /* Use the packet addresses when in the data path */
+       if (hdr != NULL && th != NULL) {
+               if (outgoing) {
+                       local_port = th->th_sport;
+                       foreign_port = th->th_dport;
+               } else {
+                       local_port = th->th_dport;
+                       foreign_port = th->th_sport;
+               }
+               (void) tcp_log_pkt_addresses(hdr, th, outgoing, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf));
+       } else {
+               local_port = inp->inp_lport;
+               foreign_port = inp->inp_fport;
+               tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf));
+       }
+
+       ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
+           inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL;
+
+#define TCP_LOG_DROP_PCB_FMT \
+           "tcp drop %s " \
+           TCP_LOG_COMMON_PCB_FMT \
+           "so_error: %d " \
+           "reason: %s"
+
+#define TCP_LOG_DROP_PCB_ARGS \
+           outgoing ? "outgoing" : "incoming", \
+           TCP_LOG_COMMON_PCB_ARGS, \
+           so->so_error, \
+           reason
+
+       if (tcp_log_level_info == 0) {
+               os_log(OS_LOG_DEFAULT, TCP_LOG_DROP_PCB_FMT,
+                   TCP_LOG_DROP_PCB_ARGS);
+       } else {
+               os_log_info(OS_LOG_DEFAULT, TCP_LOG_DROP_PCB_FMT,
+                   TCP_LOG_DROP_PCB_ARGS);
+       }
+#undef TCP_LOG_DROP_PCB_FMT
+#undef TCP_LOG_DROP_PCB_ARGS
+}
+
+#define TCP_LOG_TH_FLAGS_COMMON_FMT \
+       "tcp control %s " \
+       "%s" \
+       "%s" \
+       "%s" \
+       "%s" \
+       TCP_LOG_COMMON_FMT
+
+#define TCP_LOG_TH_FLAGS_COMMON_ARGS \
+       outgoing ? "outgoing" : "incoming", \
+       thflags & TH_SYN ? "SYN " : "", \
+       thflags & TH_FIN ? "FIN " : "", \
+       thflags & TH_RST ? "RST " : "", \
+       thflags & TH_ACK ? "ACK " : "", \
+       TCP_LOG_COMMON_ARGS
+
+void
+tcp_log_th_flags(void *hdr, struct tcphdr *th, struct tcpcb *tp, bool outgoing, struct ifnet *ifp)
+{
+       struct socket *so = (tp != NULL && tp->t_inpcb != NULL) ? tp->t_inpcb->inp_socket : NULL;
+       char laddr_buf[ADDRESS_STR_LEN];
+       char faddr_buf[ADDRESS_STR_LEN];
+       in_port_t local_port;
+       in_port_t foreign_port;
+       uint8_t thflags;
+
+       if (hdr == NULL || th == NULL) {
+               return;
+       }
+
+       if (outgoing) {
+               local_port = th->th_sport;
+               foreign_port = th->th_dport;
+       } else {
+               local_port = th->th_dport;
+               foreign_port = th->th_sport;
+       }
+       thflags = th->th_flags;
+
+       if ((((thflags & TH_SYN) && (tcp_log_enable_flags & TLEF_THF_SYN)) ||
+           ((thflags & TH_FIN) && (tcp_log_enable_flags & TLEF_THF_FIN)) ||
+           ((thflags & TH_RST) && (tcp_log_enable_flags & TLEF_THF_RST))) == false) {
+               return;
+       }
+
+       if (ifp != NULL && tcp_log_thflags_if_family != 0 && ifp->if_family != tcp_log_thflags_if_family) {
+               return;
+       }
+
+       if (!tcp_log_pkt_addresses(hdr, th, outgoing, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf))) {
+               return;
+       }
+
+       /* Do not log too much */
+       if (tcp_log_is_rate_limited()) {
+               return;
+       }
+
+       /*
+        * When no PCB or socket just log the packet
+        */
+       if (tp == NULL || so == NULL) {
+               if (tcp_log_level_info == 0) {
+                       os_log(OS_LOG_DEFAULT, TCP_LOG_TH_FLAGS_COMMON_FMT " no pcb",
+                           TCP_LOG_TH_FLAGS_COMMON_ARGS);
+               } else {
+                       os_log_info(OS_LOG_DEFAULT, TCP_LOG_TH_FLAGS_COMMON_FMT " no pcb",
+                           TCP_LOG_TH_FLAGS_COMMON_ARGS);
+               }
+       } else {
+#define TCP_LOG_TH_FLAGS_PCB_FMT \
+               TCP_LOG_TH_FLAGS_COMMON_FMT \
+               "rtt: %u.%u ms " \
+               "rttvar: %u.%u ms " \
+               "syn rxmit: %u " \
+               "pkt rxmit: %u " \
+               "so_error: %d " \
+               "svc/tc: %u "
+
+#define TCP_LOG_TH_FLAGS_PCB_ARGS \
+           TCP_LOG_TH_FLAGS_COMMON_ARGS, \
+           tp->t_srtt >> TCP_RTT_SHIFT, tp->t_srtt - ((tp->t_srtt >> TCP_RTT_SHIFT) << TCP_RTT_SHIFT), \
+           tp->t_rttvar >> TCP_RTTVAR_SHIFT, tp->t_rttvar - ((tp->t_rttvar >> TCP_RTTVAR_SHIFT) << TCP_RTTVAR_SHIFT), \
+           tp->t_stat.synrxtshift, \
+           tp->t_stat.rxmitpkts, \
+           so->so_error, \
+           (so->so_flags1 & SOF1_TC_NET_SERV_TYPE) ? \
+           so->so_netsvctype : so->so_traffic_class
+
+               if (tcp_log_level_info == 0) {
+                       os_log(OS_LOG_DEFAULT, TCP_LOG_TH_FLAGS_PCB_FMT,
+                           TCP_LOG_TH_FLAGS_PCB_ARGS);
+               } else {
+                       os_log_info(OS_LOG_DEFAULT, TCP_LOG_TH_FLAGS_PCB_FMT,
+                           TCP_LOG_TH_FLAGS_PCB_ARGS);
+               }
+#undef TCP_LOG_TH_FLAGS_PCB_FMT
+#undef TCP_LOG_TH_FLAGS_PCB_ARGS
+       }
+}
+
+void
+tcp_log_drop_pkt(void *hdr, struct tcphdr *th, struct ifnet *ifp, const char *reason)
+{
+       char laddr_buf[ADDRESS_STR_LEN];
+       char faddr_buf[ADDRESS_STR_LEN];
+       in_port_t local_port;
+       in_port_t foreign_port;
+       uint8_t thflags;
+       bool outgoing = false;  /* This is only for incoming packets */
+
+       if (hdr == NULL || th == NULL) {
+               return;
+       }
+
+       local_port = th->th_dport;
+       foreign_port = th->th_sport;
+       thflags = th->th_flags;
+
+       if ((((thflags & TH_SYN) && (tcp_log_enable_flags & TLEF_THF_SYN)) ||
+           ((thflags & TH_FIN) && (tcp_log_enable_flags & TLEF_THF_FIN)) ||
+           ((thflags & TH_RST) && (tcp_log_enable_flags & TLEF_THF_RST))) == false) {
+               return;
+       }
+
+       if (ifp != NULL && tcp_log_thflags_if_family != 0 && ifp->if_family != tcp_log_thflags_if_family) {
+               return;
+       }
+
+       if (!tcp_log_pkt_addresses(hdr, th, outgoing, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf))) {
+               return;
+       }
+
+       /* Do not log too much */
+       if (tcp_log_is_rate_limited()) {
+               return;
+       }
+
+#define TCP_LOG_DROP_PKT_FMT \
+           "tcp drop incoming control packet " \
+               TCP_LOG_TH_FLAGS_COMMON_FMT \
+           "reason: %s"
+
+#define TCP_LOG_DROP_PKT_ARGS \
+           TCP_LOG_TH_FLAGS_COMMON_ARGS, \
+           reason != NULL ? reason : ""
+
+       if (tcp_log_level_info == 0) {
+               os_log(OS_LOG_DEFAULT, TCP_LOG_DROP_PKT_FMT,
+                   TCP_LOG_DROP_PKT_ARGS);
+       } else {
+               os_log_info(OS_LOG_DEFAULT, TCP_LOG_DROP_PKT_FMT,
+                   TCP_LOG_DROP_PKT_ARGS);
+       }
+#undef TCP_LOG_DROP_PKT_FMT
+#undef TCP_LOG_DROP_PKT_ARGS
+}
+
+void
+tcp_log_message(const char *func_name, int line_no, struct tcpcb *tp, const char *format, ...)
+{
+       struct inpcb *inp;
+       struct socket *so;
+       struct ifnet *ifp;
+       char laddr_buf[ADDRESS_STR_LEN];
+       char faddr_buf[ADDRESS_STR_LEN];
+       in_port_t local_port;
+       in_port_t foreign_port;
+       char message[256];
+
+       if (tp == NULL || tp->t_inpcb == NULL || tp->t_inpcb->inp_socket == NULL) {
+               return;
+       }
+
+       /* Do not log too much */
+       if (tcp_log_is_rate_limited()) {
+               return;
+       }
+       inp = tp->t_inpcb;
+       so = inp->inp_socket;
+
+       local_port = inp->inp_lport;
+       foreign_port = inp->inp_fport;
+
+       ifp = inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
+           inp->inp_boundifp != NULL ? inp->inp_boundifp : NULL;
+
+       tcp_log_inp_addresses(inp, laddr_buf, sizeof(laddr_buf), faddr_buf, sizeof(faddr_buf));
+
+       va_list ap;
+       va_start(ap, format);
+       vsnprintf(message, sizeof(message), format, ap);
+       va_end(ap);
+
+#define TCP_LOG_MESSAGE_FMT \
+       "tcp (%s:%d) " \
+       TCP_LOG_COMMON_PCB_FMT \
+       "%s"
+
+#define TCP_LOG_MESSAGE_ARGS \
+       func_name, line_no, \
+       TCP_LOG_COMMON_PCB_ARGS, \
+       message
+
+       if (tcp_log_level_info == 0) {
+               os_log(OS_LOG_DEFAULT, TCP_LOG_MESSAGE_FMT,
+                   TCP_LOG_MESSAGE_ARGS);
+       } else {
+               os_log_info(OS_LOG_DEFAULT, TCP_LOG_MESSAGE_FMT,
+                   TCP_LOG_MESSAGE_ARGS);
+       }
+#undef TCP_LOG_MESSAGE_FMT
+#undef TCP_LOG_MESSAGE_ARGS
+}
diff --git a/bsd/netinet/tcp_log.h b/bsd/netinet/tcp_log.h
new file mode 100644
index 0000000..040948f
--- /dev/null
+++ b/bsd/netinet/tcp_log.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _NETINET_TCP_LOG_H_
+#define _NETINET_TCP_LOG_H_
+
+#ifdef BSD_KERNEL_PRIVATE
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_pcb.h>
+#if INET6
+#include <netinet6/in6_pcb.h>
+#endif
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#if INET6
+#include <netinet6/tcp6_var.h>
+#endif
+
+#include <net/net_log_common.h>
+
+#include <os/log.h>
+
+#include <stdbool.h>
+
+extern os_log_t tcp_mpkl_log_object;
+extern uint32_t tcp_log_enable_flags;
+extern uint32_t tcp_log_port;
+extern int tcp_log_privacy;
+
+#define TCP_ENABLE_FLAG_LIST \
+       X(TLEF_CONNECTION,      0x1, connection)        \
+       X(TLEF_RTT,             0x2, rtt)               \
+       X(TLEF_KEEP_ALIVE,      0x4, ka)                \
+       X(TLEF_DST_LOOPBACK,    0x10, loop)             \
+       X(TLEF_DST_LOCAL,       0x20, local)            \
+       X(TLEF_DST_GW,          0x40, gw)               \
+       X(TLEF_THF_SYN,         0x100, syn)             \
+       X(TLEF_THF_FIN,         0x200, fin)             \
+       X(TLEF_THF_RST,         0x400, rst)             \
+       X(TLEF_DROP_NECP,       0x1000, dropnecp)       \
+       X(TLEF_DROP_PCB,        0x2000, droppcb)        \
+       X(TLEF_DROP_PKT,        0x4000, droppkt)        \
+
+/*
+ * Flag values for tcp_log_enabled
+ */
+enum {
+#define X(name, value, ...) name = value,
+       TCP_ENABLE_FLAG_LIST
+#undef X
+};
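
Expanded by hand, the X list above defines:

    enum {
            TLEF_CONNECTION   = 0x1,
            TLEF_RTT          = 0x2,
            TLEF_KEEP_ALIVE   = 0x4,
            TLEF_DST_LOOPBACK = 0x10,
            TLEF_DST_LOCAL    = 0x20,
            TLEF_DST_GW       = 0x40,
            TLEF_THF_SYN      = 0x100,
            TLEF_THF_FIN      = 0x200,
            TLEF_THF_RST      = 0x400,
            TLEF_DROP_NECP    = 0x1000,
            TLEF_DROP_PCB     = 0x2000,
            TLEF_DROP_PKT     = 0x4000,
    };
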
+
+#define TLEF_MASK_DST (TLEF_DST_LOOPBACK | TLEF_DST_LOCAL | TLEF_DST_GW)
+
+#define TLEF_MASK_THF (TLEF_THF_SYN | TLEF_THF_FIN | TLEF_THF_RST)
+
+extern void tcp_log_connection_summary(struct tcpcb *tp);
+extern void tcp_log_th_flags(void *hdr, struct tcphdr *th, struct tcpcb *tp, bool outgoing, struct ifnet *ifp);
+extern void tcp_log_connection(struct tcpcb *tp, const char *event, int error);
+extern void tcp_log_listen(struct tcpcb *tp, int error);
+extern void tcp_log_drop_pcb(void *hdr, struct tcphdr *th, struct tcpcb *tp, bool outgoing, const char *reason);
+extern void tcp_log_drop_pkt(void *hdr, struct tcphdr *th, struct ifnet *ifp, const char *reason);
+extern void tcp_log_rtt_info(const char *func_name, int line_no, struct tcpcb *tp);
+extern void tcp_log_rt_rtt(const char *func_name, int line_no, struct tcpcb *tp, struct rtentry *rt);
+extern void tcp_log_rtt_change(const char *func_name, int line_no, struct tcpcb *tp, int old_srtt, int old_rttvar);
+extern void tcp_log_keepalive(const char *func_name, int line_no, struct tcpcb *tp, int32_t idle_time);
+extern void tcp_log_message(const char *func_name, int line_no, struct tcpcb *tp, const char *format, ...);
+
+static inline bool
+tcp_is_log_enabled(struct tcpcb *tp, uint32_t req_flags)
+{
+       if (tp == NULL || tp->t_inpcb == NULL) {
+               return false;
+       }
+       if (tcp_log_port > 0 && tcp_log_port <= IPPORT_HILASTAUTO) {
+               if (ntohs(tp->t_inpcb->inp_lport) != tcp_log_port &&
+                   ntohs(tp->t_inpcb->inp_fport) != tcp_log_port) {
+                       return false;
+               }
+       }
+       /*
+        * First find out the kind of destination
+        */
+       if (tp->t_log_flags == 0) {
+               if (tp->t_inpcb->inp_vflag & INP_IPV6) {
+                       if (IN6_IS_ADDR_LOOPBACK(&tp->t_inpcb->in6p_laddr) ||
+                           IN6_IS_ADDR_LOOPBACK(&tp->t_inpcb->in6p_faddr)) {
+                               tp->t_log_flags |= TLEF_DST_LOOPBACK;
+                       }
+               } else {
+                       if (ntohl(tp->t_inpcb->inp_laddr.s_addr) == INADDR_LOOPBACK ||
+                           ntohl(tp->t_inpcb->inp_faddr.s_addr) == INADDR_LOOPBACK) {
+                               tp->t_log_flags |= TLEF_DST_LOOPBACK;
+                       }
+               }
+               if (tp->t_log_flags == 0) {
+                       if (tp->t_flags & TF_LOCAL) {
+                               tp->t_log_flags |= TLEF_DST_LOCAL;
+                       } else {
+                               tp->t_log_flags |= TLEF_DST_GW;
+                       }
+               }
+       }
+       /*
+        * Check separately the destination flags that are per TCP connection
+        * and the other functional flags that are global
+        */
+       return (tp->t_log_flags & tcp_log_enable_flags & TLEF_MASK_DST) &&
+              (tcp_log_enable_flags & (req_flags & ~TLEF_MASK_DST));
+}
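
Concretely, a connection is logged only when both its cached destination class (the TLEF_MASK_DST bit in t_log_flags) and the requested functional flag are present in tcp_log_enable_flags. With the macOS DEVELOPMENT/DEBUG default of TLEF_CONNECTION | TLEF_DST_LOCAL | TLEF_DST_GW | TLEF_DROP_NECP, a connection over loopback is classified TLEF_DST_LOOPBACK, so TCP_LOG_CONNECT stays silent even though TLEF_CONNECTION is set; adding loop (0x10) to net.inet.tcp.log.enable would enable it.
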
+
+#define TCP_LOG_RTT_INFO(tp) if (tcp_is_log_enabled(tp, TLEF_RTT)) \
+    tcp_log_rtt_info(__func__, __LINE__, (tp))
+
+#define TCP_LOG_RTM_RTT(tp, rt) if (tcp_is_log_enabled(tp, TLEF_RTT)) \
+    tcp_log_rt_rtt(__func__, __LINE__, (tp), (rt))
+
+#define TCP_LOG_RTT_CHANGE(tp, old_srtt, old_rttvar) if (tcp_is_log_enabled(tp, TLEF_RTT)) \
+    tcp_log_rtt_change(__func__, __LINE__, (tp), (old_srtt), (old_rttvar))
+
+#define TCP_LOG_KEEP_ALIVE(tp, idle_time) if (tcp_is_log_enabled(tp, TLEF_KEEP_ALIVE)) \
+    tcp_log_keepalive(__func__, __LINE__, (tp), (idle_time))
+
+#define TCP_LOG_CONNECT(tp, outgoing, error) if (tcp_is_log_enabled(tp, TLEF_CONNECTION)) \
+    tcp_log_connection((tp), (outgoing) ? "connect outgoing" : "connect incoming", (error))
+
+#define TCP_LOG_LISTEN(tp, error) if (tcp_is_log_enabled(tp, TLEF_CONNECTION)) \
+    tcp_log_listen((tp), (error))
+
+#define TCP_LOG_ACCEPT(tp, error) if (tcp_is_log_enabled(tp, TLEF_CONNECTION)) \
+    tcp_log_connection((tp), "accept", (error))
+
+#define TCP_LOG_CONNECTION_SUMMARY(tp) if (tcp_is_log_enabled(tp, TLEF_CONNECTION)) \
+    tcp_log_connection_summary((tp))
+
+#define TCP_LOG_DROP_NECP(hdr, th, tp, outgoing) if (tcp_is_log_enabled(tp, TLEF_DROP_NECP)) \
+    tcp_log_drop_pcb((hdr), (th), (tp), (outgoing), "NECP")
+
+#define TCP_LOG_DROP_PCB(hdr, th, tp, outgoing, reason) if (tcp_is_log_enabled(tp, TLEF_DROP_PCB)) \
+    tcp_log_drop_pcb((hdr), (th), (tp), (outgoing), reason)
+
+#define TCP_LOG_TH_FLAGS(hdr, th, tp, outgoing, ifp) \
+    if ((th) != NULL && ((th)->th_flags & (TH_SYN|TH_FIN|TH_RST))) \
+           tcp_log_th_flags((hdr), (th), (tp), (outgoing), (ifp))
+
+#define TCP_LOG_DROP_PKT(hdr, th, ifp, reason) \
+    if ((th) != NULL && ((th->th_flags) & (TH_SYN|TH_FIN|TH_RST)) && \
+       (tcp_log_enable_flags & TLEF_DROP_PKT)) \
+               tcp_log_drop_pkt((hdr), (th), (ifp), (reason))
+
+#define TCP_LOG(tp, format, ...) \
+    tcp_log_message(__func__, __LINE__, tp, format, ## __VA_ARGS__)
+
+#endif /* BSD_KERNEL_PRIVATE */
+
+#endif /* _NETINET_TCP_LOG_H_ */
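
These macros keep call sites to a single line and gate all of the work behind tcp_is_log_enabled(), as the tcp_xmit_timer() hunk earlier in this commit does. A hypothetical call site following the same pattern (not part of the commit):

    /* Hypothetical kernel call site sketch. */
    static void
    example_rtt_update(struct tcpcb *tp, int rtt)
    {
            int old_srtt = tp->t_srtt;
            int old_rttvar = tp->t_rttvar;

            /* ... fold the new sample into tp->t_srtt / tp->t_rttvar ... */

            TCP_LOG_RTT_INFO(tp);                         /* gated on TLEF_RTT */
            TCP_LOG_RTT_CHANGE(tp, old_srtt, old_rttvar); /* logs only large swings */
            TCP_LOG(tp, "rtt sample %d", rtt);
    }
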
diff --git a/bsd/netinet/tcp_lro.c b/bsd/netinet/tcp_lro.c
index baacc13f3cf83355499c47c669a4540fff361639..8aef977e3b53d0b40caeadae6cd76d15942cd4de 100644
@@ -586,7 +586,6 @@ tcp_lro_process_pkt(struct mbuf *lro_mb, int drop_hdrlen)
        default:
                lck_mtx_unlock(&tcp_lro_lock);
                panic_plain("%s: unrecognized type %d", __func__, retval);
-               break;
        }
 
        if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c
index 75e8634c0bf63d96395584e7f4e4022f0840fe08..6f63e40f7b226d39429bed8aca5512f5be845e36 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #if TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
+#include <netinet/tcp_log.h>
 #include <sys/kdebug.h>
 #include <mach/sdt.h>
 
@@ -166,14 +167,14 @@ sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS
        err = sysctl_io_number(req, tcp_ecn_outbound, sizeof(int32_t),
            &i, &changed);
        if (err != 0 || req->newptr == USER_ADDR_NULL)
-               return(err);
+               return err;
 
        if (changed) {
                if ((tcp_ecn_outbound == 0 || tcp_ecn_outbound == 1) &&
                    (i == 0 || i == 1)) {
                        tcp_ecn_outbound = i;
                        SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out, tcp_ecn_outbound);
-                       return(err);
+                       return err;
                }
                if (tcp_ecn_outbound == 2 && (i == 0 || i == 1)) {
                        /*
@@ -215,7 +216,7 @@ sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS
                tcp_ecn_inbound = i;
                SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_negotiate_in, tcp_ecn_inbound);
        }
-       return (err);
+       return err;
 }
 
 int     tcp_ecn_outbound = 2;
@@ -310,18 +311,15 @@ static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len)
        if (tp->t_flags & TF_NOOPT)
                goto fallback;
 
-       if ((so->so_flags1 & SOF1_DATA_AUTHENTICATED) &&
-           !(tp->t_flagsext & TF_FASTOPEN_HEUR))
-               return (len);
-
-       if (!tcp_heuristic_do_tfo(tp)) {
+       if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
+           !tcp_heuristic_do_tfo(tp)) {
                tp->t_tfo_stats |= TFO_S_HEURISTICS_DISABLE;
                tcpstat.tcps_tfo_heuristics_disable++;
                goto fallback;
        }
 
        if (so->so_flags1 & SOF1_DATA_AUTHENTICATED)
-               return (len);
+               return len;
 
        optlen += TCPOLEN_MAXSEG;
 
@@ -330,7 +328,8 @@ static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len)
 
 #if MPTCP
        if ((so->so_flags & SOF_MP_SUBFLOW) && mptcp_enable &&
-           tp->t_rxtshift <= mptcp_mpcap_retries)
+           (tp->t_rxtshift <= mptcp_mpcap_retries ||
+           (tptomptp(tp)->mpt_mpte->mpte_flags & MPTE_FORCE_ENABLE)))
                optlen += sizeof(struct mptcp_mpcapable_opt_common) + sizeof(mptcp_key_t);
 #endif /* MPTCP */
 
@@ -349,7 +348,7 @@ static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len)
        cookie_len = tcp_cache_get_cookie_len(tp);
        if (cookie_len == 0)
                /* No cookie, so we request one */
-               return (0);
+               return 0;
 
        /* There is not enough space for the cookie, so we cannot do TFO */
        if (MAX_TCPOPTLEN - optlen < cookie_len)
@@ -360,11 +359,11 @@ static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len)
                goto fallback;
 
        /* Ok, everything looks good. We can go on and do TFO */
-       return (len);
+       return len;
 
 fallback:
-       tp->t_flagsext &= ~TF_FASTOPEN;
-       return (0);
+       tcp_disable_tfo(tp);
+       return 0;
 }
 
 /* Returns the number of bytes written to the TCP option-space */
@@ -377,7 +376,7 @@ tcp_tfo_write_cookie_rep(struct tcpcb *tp, unsigned optlen, u_char *opt)
 
        if ((MAX_TCPOPTLEN - optlen) <
            (TCPOLEN_FASTOPEN_REQ + TFO_COOKIE_LEN_DEFAULT))
-               return (ret);
+               return ret;
 
        tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
 
@@ -391,7 +390,7 @@ tcp_tfo_write_cookie_rep(struct tcpcb *tp, unsigned optlen, u_char *opt)
        tp->t_tfo_stats |= TFO_S_COOKIE_SENT;
        tcpstat.tcps_tfo_cookie_sent++;
 
-       return (ret);
+       return ret;
 }
 
 static unsigned
@@ -411,7 +410,7 @@ tcp_tfo_write_cookie(struct tcpcb *tp, unsigned optlen, int32_t len,
                        tcpstat.tcps_tfo_syn_data_sent++;
                }
 
-               return (0);
+               return 0;
        }
 
        bp = opt + optlen;
@@ -446,15 +445,15 @@ tcp_tfo_write_cookie(struct tcpcb *tp, unsigned optlen, int32_t len,
                }
        }
 
-       return (ret);
+       return ret;
 }
 
 static inline bool
 tcp_send_ecn_flags_on_syn(struct tcpcb *tp, struct socket *so)
 {
-       return(!((tp->ecn_flags & TE_SETUPSENT) ||
+       return !((tp->ecn_flags & TE_SETUPSENT ||
            (so->so_flags & SOF_MP_SUBFLOW) ||
-           (tp->t_flagsext & TF_FASTOPEN)));
+           (tfo_enabled(tp))));
 }
 
 void
@@ -642,7 +641,7 @@ tcp_output(struct tcpcb *tp)
                 * the subflow socket stays around until deleted.
                 * No packets such as FINs must be sent after RST.
                 */
-               return (0);
+               return 0;
        }
 #endif /* MPTCP */
 
@@ -708,18 +707,18 @@ again:
 
                        if (tp->t_state >= TCPS_CLOSE_WAIT) {
                                tcp_drop(tp, EADDRNOTAVAIL);
-                               return(EADDRNOTAVAIL);
+                               return EADDRNOTAVAIL;
                        }
 
-                       /* Set retransmit  timer if it wasn't set,
+                       /*
+                        * Set retransmit  timer if it wasn't set,
                         * reset Persist timer and shift register as the
                         * advertised peer window may not be valid anymore
                         */
-
-                       if (!tp->t_timer[TCPT_REXMT]) {
+                       if (tp->t_timer[TCPT_REXMT] == 0) {
                                tp->t_timer[TCPT_REXMT] =
                                    OFFSET_FROM_START(tp, tp->t_rxtcur);
-                               if (tp->t_timer[TCPT_PERSIST]) {
+                               if (tp->t_timer[TCPT_PERSIST] != 0) {
                                        tp->t_timer[TCPT_PERSIST] = 0;
                                        tp->t_persist_stop = 0;
                                        TCP_RESET_REXMT_STATE(tp);
@@ -733,10 +732,10 @@ again:
                        /* drop connection if source address isn't available */
                        if (so->so_flags & SOF_NOADDRAVAIL) {
                                tcp_drop(tp, EADDRNOTAVAIL);
-                               return(EADDRNOTAVAIL);
+                               return EADDRNOTAVAIL;
                        } else {
                                tcp_check_timer_state(tp);
-                               return(0); /* silently ignore, keep data in socket: address may be back */
+                               return 0; /* silently ignore, keep data in socket: address may be back */
                        }
                }
                if (ia != NULL)
@@ -766,7 +765,7 @@ again:
                 *         has been disabled)
                 */
 
-               if (!path_mtu_discovery || ((rt != NULL) &&
+               if (!path_mtu_discovery || ((rt != NULL) &&
                    (!(rt->rt_flags & RTF_UP) ||
                    (rt->rt_rmx.rmx_locks & RTV_MTU))))
                        tp->t_flags &= ~TF_PMTUD;
@@ -1011,7 +1010,7 @@ after_sack_rexmit:
                        }
                        KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
                            0,0,0,0,0);
-                       return(0);
+                       return 0;
                }
        }
 
@@ -1029,6 +1028,12 @@ after_sack_rexmit:
                flags &= ~TH_FIN;
        }
 
+       /*
+        * Don't send a RST with data.
+        */
+       if (flags & TH_RST)
+               len = 0;
+
        if ((flags & TH_SYN) && tp->t_state <= TCPS_SYN_SENT && tfo_enabled(tp))
                len = tcp_tfo_check(tp, len);
 
@@ -1142,6 +1147,12 @@ after_sack_rexmit:
        }
 
 #if MPTCP
+       if (so->so_flags & SOF_MP_SUBFLOW && off < 0) {
+               os_log_error(mptcp_log_handle, "%s - %lx: offset is negative! len %d off %d\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(tp->t_mpsub->mpts_mpte),
+                   len, off);
+       }
+
        if ((so->so_flags & SOF_MP_SUBFLOW) &&
            !(tp->t_mpflags & TMPF_TCP_FALLBACK)) {
                int newlen = len;
@@ -1170,11 +1181,12 @@ after_sack_rexmit:
                 * option can be sent in one packet, reduce length to match
                 * the contiguous MPTCP level. Set sendalot to send remainder.
                 */
-               if (len > 0)
+               if (len > 0 && off >= 0) {
                        newlen = mptcp_adj_sendlen(so, off);
+               }
+
                if (newlen < len) {
                        len = newlen;
-                       sendalot = 1;
                }
        }
 #endif /* MPTCP */
@@ -1217,23 +1229,23 @@ after_sack_rexmit:
         * next expected input).  If the difference is at least two
         * max size segments, or at least 25% of the maximum possible
         * window, then want to send a window update to peer.
-        * Skip this if the connection is in T/TCP half-open state.
         */
        recwin = tcp_sbspace(tp);
-#if MPTCP
-       if (so->so_flags & SOF_MP_SUBFLOW) {
+
+       if (!(so->so_flags & SOF_MP_SUBFLOW)) {
+               if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) &&
+                   recwin < (int)tp->t_maxseg) {
+                       recwin = 0;
+               }
+       } else {
                struct mptcb *mp_tp = tptomptp(tp);
+               struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
 
-               if (mp_tp != NULL) {
-                       mpte_lock_assert_held(mp_tp->mpt_mpte);
-                       recwin = imin(recwin, mptcp_sbspace(mp_tp));
+               if (recwin < (int32_t)(mp_so->so_rcv.sb_hiwat / 4) &&
+                   recwin < (int)tp->t_maxseg) {
+                       recwin = 0;
                }
        }
-#endif
-
-       if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) &&
-           recwin < (int)tp->t_maxseg)
-               recwin = 0;
 
 #if TRAFFIC_MGT
        if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) {
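
The rewritten check suppresses a window update (forcing recwin to 0) only when the available space is below both a quarter of the receive buffer and one maximum segment. For example, with sb_hiwat = 131072 (hiwat/4 = 32768) and t_maxseg = 1448, recwin = 1200 is zeroed while recwin = 2000 is still advertised, since 2000 >= 1448. The MPTCP branch now applies the same test against the MPTCP-level socket buffer instead of clamping to mptcp_sbspace().
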
@@ -1253,15 +1265,18 @@ after_sack_rexmit:
        if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
                recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
 
-       /*
-        * MPTCP needs to be able to announce a smaller window than previously,
-        * because the other subflow may have filled up the available window-
-        * space. So we have to be able to go backwards and announce a smaller
-        * window.
-        */
-       if (!(so->so_flags & SOF_MP_SUBFLOW) &&
-           recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt))
-               recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
+       if (!(so->so_flags & SOF_MP_SUBFLOW)) {
+               if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt)) {
+                       recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
+               }
+       } else {
+               struct mptcb *mp_tp = tptomptp(tp);
+
+               /* Don't remove what we announced at the MPTCP-layer */
+               if (recwin < (int32_t)(mp_tp->mpt_rcvadv - (uint32_t)mp_tp->mpt_rcvnxt)) {
+                       recwin = (int32_t)(mp_tp->mpt_rcvadv - (uint32_t)mp_tp->mpt_rcvnxt);
+               }
+       }
 
        /*
         * Sender silly window avoidance.   We transmit under the following
@@ -1283,6 +1298,16 @@ after_sack_rexmit:
                if (sack_rxmit)
                        goto send;
 
+               /*
+                * If this here is the first segment after SYN/ACK and TFO
+                * is being used, then we always send it, regardless of Nagle,...
+                */
+               if (tp->t_state == TCPS_SYN_RECEIVED &&
+                   tfo_enabled(tp) &&
+                   (tp->t_tfo_flags & TFO_F_COOKIE_VALID) &&
+                   tp->snd_nxt == tp->iss + 1)
+                       goto send;
+
                /*
                 * Send new data on the connection only if it is
                 * not flow controlled
@@ -1449,7 +1474,7 @@ just_return:
                tcp_check_timer_state(tp);
        }
        KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
-       return (0);
+       return 0;
 
 send:
        /*
@@ -2024,12 +2049,12 @@ send:
                        goto out;
                }
                if (MHLEN < (hdrlen + max_linkhdr)) {
-                       MCLGET(m, M_DONTWAIT);
-                       if ((m->m_flags & M_EXT) == 0) {
-                               m_freem(m);
-                               error = ENOBUFS;
-                               goto out;
-                       }
+                       MCLGET(m, M_DONTWAIT);
+                       if ((m->m_flags & M_EXT) == 0) {
+                               m_freem(m);
+                               error = ENOBUFS;
+                               goto out;
+                       }
                }
                m->m_data += max_linkhdr;
                m->m_len = hdrlen;
@@ -2134,8 +2159,20 @@ send:
        }
        th->th_flags = flags;
        th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
-       if (recwin > 0 && SEQ_LT(tp->rcv_adv, tp->rcv_nxt + recwin))
-               tp->rcv_adv = tp->rcv_nxt + recwin;
+       if (!(so->so_flags & SOF_MP_SUBFLOW)) {
+               if (recwin > 0 && SEQ_LT(tp->rcv_adv, tp->rcv_nxt + recwin)) {
+                       tp->rcv_adv = tp->rcv_nxt + recwin;
+               }
+       } else {
+               struct mptcb *mp_tp = tptomptp(tp);
+               if (recwin > 0) {
+                       tp->rcv_adv = tp->rcv_nxt + recwin;
+               }
+
+               if (recwin > 0 && SEQ_LT(mp_tp->mpt_rcvadv, (uint32_t)mp_tp->mpt_rcvnxt + recwin)) {
+                       mp_tp->mpt_rcvadv = (uint32_t)mp_tp->mpt_rcvnxt + recwin;
+               }
+       }
 
        /*
         * Adjust the RXWIN0SENT flag - indicate that we have advertised
@@ -2149,6 +2186,7 @@ send:
                tp->t_flags |= TF_RXWIN0SENT;
        else
                tp->t_flags &= ~TF_RXWIN0SENT;
+
        if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
                th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
                th->th_flags |= TH_URG;
@@ -2402,7 +2440,8 @@ timer:
                necp_kernel_policy_id policy_id;
                necp_kernel_policy_id skip_policy_id;
                u_int32_t route_rule_id;
-               if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id, &route_rule_id, &skip_policy_id)) {
+               if (!necp_socket_is_allowed_to_send_recv(inp, NULL, &policy_id, &route_rule_id, &skip_policy_id)) {
+                       TCP_LOG_DROP_NECP(isipv6 ? (void *)ip6 : (void *)ip, th, tp, true);
                        m_freem(m);
                        error = EHOSTUNREACH;
                        goto out;
@@ -2470,6 +2509,10 @@ timer:
                (void) m_set_service_class(m, so_tc2msc(sotc));
        }
 
+       TCP_LOG_TH_FLAGS(isipv6 ? (void *)ip6 : (void *)ip, th, tp, true,
+           inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
+           inp->inp_boundifp);
+
        tp->t_pktlist_sentlen += len;
        tp->t_lastchain++;
 
@@ -2493,9 +2536,9 @@ timer:
                tp->t_pktlist_head = tp->t_pktlist_tail = m;
        }
 
-       if ((lro_ackmore) && (!sackoptlen) && (!tp->t_timer[TCPT_PERSIST]) &&
-                       ((th->th_flags & TH_ACK) == TH_ACK) && (!len) &&
-                       (tp->t_state == TCPS_ESTABLISHED)) {
+       if (lro_ackmore && !sackoptlen && tp->t_timer[TCPT_PERSIST] == 0 &&
+           (th->th_flags & TH_ACK) == TH_ACK && len == 0 &&
+           tp->t_state == TCPS_ESTABLISHED) {
                /* For a pure ACK, see if you need to send more of them */
                mnext = tcp_send_lroacks(tp, m, th);
                if (mnext) {
@@ -2553,7 +2596,7 @@ timer:
                        (tp->t_flags & TF_CLOSING)) {
                        tp->t_flags &= ~TF_CLOSING;
                        (void) tcp_close(tp);
-                       return (0);
+                       return 0;
                }
        } else {
                error = 0;
@@ -2606,8 +2649,8 @@ out:
                         * when we failed to send a segment that can be
                         * retransmitted (i.e. not pure ack or rst)
                         */
-                       if (!tp->t_timer[TCPT_REXMT] &&
-                           !tp->t_timer[TCPT_PERSIST] &&
+                       if (tp->t_timer[TCPT_REXMT] == 0 &&
+                           tp->t_timer[TCPT_PERSIST] == 0 &&
                            (len != 0 || (flags & (TH_SYN | TH_FIN)) != 0 ||
                            so->so_snd.sb_cc > 0))
                                tp->t_timer[TCPT_REXMT] =
@@ -2618,7 +2661,7 @@ out:
                        KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 
                        tcp_ccdbg_trace(tp, NULL, TCP_CC_OUTPUT_ERROR);
-                       return (0);
+                       return 0;
                }
                if (error == EMSGSIZE) {
                        /*
@@ -2654,7 +2697,7 @@ out:
                }
                tcp_check_timer_state(tp);
                KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
-               return (error);
+               return error;
        }
 
        tcpstat.tcps_sndtotal++;
@@ -2664,7 +2707,8 @@ out:
                goto again;
 
        tcp_check_timer_state(tp);
-       return (0);
+
+       return 0;
 }
 
 static int
@@ -2732,6 +2776,14 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
                        ipoa.ipoa_flags |=  IPOAF_NO_EXPENSIVE;
 
        }
+       if (INP_NO_CONSTRAINED(inp)) {
+#if INET6
+               if (isipv6)
+                       ip6oa.ip6oa_flags |=  IP6OAF_NO_CONSTRAINED;
+               else
+#endif /* INET6 */
+                       ipoa.ipoa_flags |=  IPOAF_NO_CONSTRAINED;
+       }
        if (INP_AWDL_UNRESTRICTED(inp)) {
 #if INET6
                if (isipv6)
@@ -2822,7 +2874,6 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
 #endif
                ; // I'm important, not extraneous
 
-
        while (pkt != NULL) {
                struct mbuf *npkt = pkt->m_nextpkt;
 
@@ -2831,7 +2882,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
                        /*
                         * If we are not chaining, make sure to set the packet
                         * list count to 0 so that IP takes the right path;
-                        * this is important for cases such as IPSec where a
+                        * this is important for cases such as IPsec where a
                         * single mbuf might result in multiple mbufs as part
                         * of the encapsulation.  If a non-zero count is passed
                         * down to IP, the head of the chain might change and
@@ -2901,22 +2952,33 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
                error = ENOBUFS;
 
        VERIFY(inp->inp_sndinprog_cnt > 0);
-       if ( --inp->inp_sndinprog_cnt == 0)
+       if ( --inp->inp_sndinprog_cnt == 0) {
                inp->inp_flags &= ~(INP_FC_FEEDBACK);
+               if (inp->inp_sndingprog_waiters > 0) {
+                       wakeup(&inp->inp_sndinprog_cnt);
+               }
+       }
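
The decrement above pairs wakeup() with waiters recorded in inp_sndingprog_waiters (spelled as in the source): the last in-progress send wakes anyone blocked until the count drains to zero. A userland analogue of that counted-waiter pattern, using pthreads rather than the kernel's wakeup()/msleep(); all names here are illustrative:

#include <pthread.h>
#include <stdio.h>

/* Userland analogue of the inp_sndinprog_cnt drain/wakeup above:
 * senders bump a counter; a closer blocks until it reaches zero. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int sndinprog_cnt;
static int waiters;

static void
send_start(void)
{
	pthread_mutex_lock(&lock);
	sndinprog_cnt++;
	pthread_mutex_unlock(&lock);
}

static void
send_done(void)
{
	pthread_mutex_lock(&lock);
	if (--sndinprog_cnt == 0 && waiters > 0) {
		pthread_cond_broadcast(&drained);   /* like wakeup() */
	}
	pthread_mutex_unlock(&lock);
}

static void
wait_for_drain(void)
{
	pthread_mutex_lock(&lock);
	waiters++;
	while (sndinprog_cnt != 0) {
		pthread_cond_wait(&drained, &lock); /* like msleep() */
	}
	waiters--;
	pthread_mutex_unlock(&lock);
}

int
main(void)
{
	send_start();
	send_done();
	wait_for_drain();
	puts("drained");
	return 0;
}
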
 
 #if INET6
        if (isipv6) {
-               if (ro6.ro_rt != NULL)
+               /*
+                * When an NECP IP tunnel policy forces the outbound interface,
+                * ip6_output_list() informs the transport layer of the
+                * actual outgoing interface.
+                */
+               if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) {
+                       outif = ifindex2ifnet[ip6oa.ip6oa_boundif];
+               } else if (ro6.ro_rt != NULL) {
                        outif = ro6.ro_rt->rt_ifp;
+               }
        } else
 #endif /* INET6 */
                if (ro.ro_rt != NULL)
                        outif = ro.ro_rt->rt_ifp;
 
-       if (outif != NULL && outif != inp->inp_last_outifp &&
-           so->so_snd.sb_cc > 0) {
+       if (outif != NULL && outif != inp->inp_last_outifp) {
                /* Update the send byte count */
-               if (so->so_snd.sb_flags & SB_SNDBYTE_CNT) {
+               if (so->so_snd.sb_cc > 0 && so->so_snd.sb_flags & SB_SNDBYTE_CNT) {
                        inp_decr_sndbytes_total(so, so->so_snd.sb_cc);
                        inp_decr_sndbytes_allunsent(so, tp->snd_una);
                        so->so_snd.sb_flags &= ~SB_SNDBYTE_CNT;
@@ -2926,7 +2988,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
        }
 
        if (error != 0 && ifdenied &&
-           (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp)))
+           (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp) || INP_NO_CONSTRAINED(inp)))
                soevent(so,
                    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_IFDENIED));
 
@@ -2946,7 +3008,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
                tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt);
                tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
        }
-       return (error);
+       return error;
 }
 
 int tcptv_persmin_val = TCPTV_PERSMIN;
@@ -2962,9 +3024,9 @@ tcp_setpersist(struct tcpcb *tp)
         * see rdar://5805356
         */
 
-       if ((tp->t_persist_timeout != 0) &&
-                   (tp->t_timer[TCPT_PERSIST] == 0) &&
-                   (tp->t_persist_stop == 0)) {
+       if (tp->t_persist_timeout != 0 &&
+           tp->t_timer[TCPT_PERSIST] == 0 &&
+           tp->t_persist_stop == 0) {
                tp->t_persist_stop = tcp_now + tp->t_persist_timeout;
        }
 
@@ -3097,7 +3159,7 @@ tcp_recv_throttle (struct tcpcb *tp)
                 * in that state until rtt comes closer to base rtt
                 */
                if (tp->t_flagsext & TF_RECV_THROTTLE)
-                       return (1);
+                       return 1;
 
                base_rtt = get_base_rtt(tp);
 
@@ -3123,9 +3185,9 @@ tcp_recv_throttle (struct tcpcb *tp)
                                            tcp_recv_throttle_minwin);
                                        sbrcv->sb_idealsize = newsize;
                                }
-                               return (1);
+                               return 1;
                        } else {
-                               return (0);
+                               return 0;
                        }
                }
        }
@@ -3135,7 +3197,7 @@ tcp_recv_throttle (struct tcpcb *tp)
         * measurement. Use IPDV in this case.
         */
        if (tp->acc_iaj > tcp_acc_iaj_react_limit)
-               return (1);
+               return 1;
 
-       return (0);
+       return 0;
 }
index 67381dffe655070599ece9c02bfd4fd3b5d6d034..20c8a7b610bad723f4c4b89a44d5a617671cd2b4 100644 (file)
@@ -91,6 +91,7 @@
 #include <net/if.h>
 #include <net/content_filter.h>
 #include <net/ntstat.h>
+#include <net/multi_layer_pkt_log.h>
 
 #define tcp_minmssoverload fring
 #define _IP_VHL
 #if TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
+#include <netinet/tcp_log.h>
+
 #include <netinet6/ip6protosw.h>
 
 #if IPSEC
 #include <libkern/crypto/md5.h>
 #include <sys/kdebug.h>
 #include <mach/sdt.h>
+#include <atm/atm_internal.h>
+#include <pexpert/pexpert.h>
 
 #include <netinet/lro_ext.h>
 
@@ -257,6 +262,16 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED,
 SYSCTL_SKMEM_TCP_INT(OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED,
     __private_extern__ int, tcp_win_scale, 3, "Window scaling factor");
 
+#if (DEVELOPMENT || DEBUG)
+SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
+    CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1,
+    "Initalize RTT from route cache");
+#else
+SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
+    CTLFLAG_RD | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1,
+    "Initalize RTT from route cache");
+#endif /* (DEVELOPMENT || DEBUG) */
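
The twin declarations differ only in CTLFLAG_RW versus CTLFLAG_RD, making the tunable writable on DEVELOPMENT/DEBUG kernels and read-only on RELEASE. A small sketch of the same compile-time gating idea, with illustrative names only:

#include <stdio.h>

/* Illustrative only: compile-time gating of a tunable's mutability,
 * mirroring the CTLFLAG_RW vs CTLFLAG_RD split above. */
#if defined(DEVELOPMENT) || defined(DEBUG)
#define TUNABLE_FLAGS "read-write"
#else
#define TUNABLE_FLAGS "read-only"
#endif

int
main(void)
{
	printf("init_rtt_from_cache is %s on this build\n", TUNABLE_FLAGS);
	return 0;
}
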
+
 static void     tcp_cleartaocache(void);
 static void     tcp_notify(struct inpcb *, int);
 
@@ -307,6 +322,8 @@ struct  inp_tp {
 int  get_inpcb_str_size(void);
 int  get_tcp_str_size(void);
 
+os_log_t tcp_mpkl_log_object = NULL;
+
 static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *);
 
 static lck_attr_t *tcp_uptime_mtx_attr = NULL;
@@ -462,6 +479,7 @@ tcp_init(struct protosw *pp, struct domain *dp)
        static int tcp_initialized = 0;
        vm_size_t str_size;
        struct inpcbinfo *pcbinfo;
+       uint32_t logging_config;
 
        VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
 
@@ -639,6 +657,18 @@ tcp_init(struct protosw *pp, struct domain *dp)
        /* Initialize TCP Cache */
        tcp_cache_init();
 
+       tcp_mpkl_log_object = MPKL_CREATE_LOGOBJECT("com.apple.xnu.tcp");
+       if (tcp_mpkl_log_object == NULL) {
+               panic("MPKL_CREATE_LOGOBJECT failed");
+       }
+
+       logging_config = atm_get_diagnostic_config();
+       if (logging_config & 0x80000000) {
+               tcp_log_privacy = 1;
+       }
+
+       PE_parse_boot_argn("tcp_log", &tcp_log_enable_flags, sizeof(tcp_log_enable_flags));
+
        /*
         * If more than 60 MB of mbuf pool is available, increase the
         * maximum allowed receive and send socket buffer size.
@@ -875,6 +905,9 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
        m->m_len = tlen;
        m->m_pkthdr.len = tlen;
        m->m_pkthdr.rcvif = 0;
+       if (tra->keep_alive) {
+               m->m_pkthdr.pkt_flags |= PKTF_KEEPALIVE;
+       }
 #if CONFIG_MACF_NET
        if (tp != NULL && tp->t_inpcb != NULL) {
                /*
@@ -973,6 +1006,9 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
                if (tra->noexpensive) {
                        ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
                }
+               if (tra->noconstrained) {
+                       ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED;
+               }
                if (tra->awdl_unrestricted) {
                        ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
                }
@@ -1017,6 +1053,9 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
                if (tra->noexpensive) {
                        ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
                }
+               if (tra->noconstrained) {
+                       ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED;
+               }
                if (tra->awdl_unrestricted) {
                        ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED;
                }
@@ -1123,6 +1162,7 @@ tcp_newtcpcb(struct inpcb *inp)
        tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
        tp->t_rcvtime = tcp_now;
        tp->tentry.timer_start = tcp_now;
+       tp->rcv_unackwin = tcp_now;
        tp->t_persist_timeout = tcp_max_persist_timeout;
        tp->t_persist_stop = 0;
        tp->t_flagsext |= TF_RCVUNACK_WAITSS;
@@ -1177,6 +1217,9 @@ tcp_drop(struct tcpcb *tp, int errno)
                errno = tp->t_softerror;
        }
        so->so_error = errno;
+
+       TCP_LOG_CONNECTION_SUMMARY(tp);
+
        return tcp_close(tp);
 }
 
@@ -1186,7 +1229,9 @@ tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt)
        u_int32_t rtt = rt->rt_rmx.rmx_rtt;
        int isnetlocal = (tp->t_flags & TF_LOCAL);
 
-       if (rtt != 0) {
+       TCP_LOG_RTM_RTT(tp, rt);
+
+       if (rtt != 0 && tcp_init_rtt_from_cache != 0) {
                /*
                 * XXX the lock bit for RTT indicates that the value
                 * is also a minimum value; this is subject to time.
@@ -1197,9 +1242,11 @@ tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt)
                        tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN :
                            TCPTV_REXMTMIN;
                }
+
                tp->t_srtt =
                    rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
                tcpstat.tcps_usedrtt++;
+
                if (rt->rt_rmx.rmx_rttvar) {
                        tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
                            (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
@@ -1209,11 +1256,19 @@ tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt)
                        tp->t_rttvar =
                            tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
                }
+
+               /*
+                * The RTO formula in the route metric case is based on:
+                *     4 * srtt + 8 * rttvar
+                * modulo the min, max and slop
+                */
                TCPT_RANGESET(tp->t_rxtcur,
                    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
                    tp->t_rttmin, TCPTV_REXMTMAX,
                    TCP_ADD_REXMTSLOP(tp));
        }
+
+       TCP_LOG_RTT_INFO(tp);
 }
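
The fixed-point expression inside TCPT_RANGESET reduces to the 4 * srtt + 8 * rttvar formula the comment cites, given the usual BSD scale factors (t_srtt carrying the RTT times 32 and t_rttvar the variance times 16 is an assumption here, not shown in the hunk). A short check of that arithmetic:

#include <assert.h>

/* Assumed scale factors in the style of netinet/tcp_var.h:
 * t_srtt holds srtt * 32, t_rttvar holds rttvar * 16. */
#define TCP_RTT_SCALE           32
#define TCP_RTTVAR_SCALE        16

int
main(void)
{
	int srtt_ms = 100, rttvar_ms = 20;
	int t_srtt = srtt_ms * TCP_RTT_SCALE;
	int t_rttvar = rttvar_ms * TCP_RTTVAR_SCALE;

	/* The value fed to TCPT_RANGESET above ... */
	int rto = ((t_srtt >> 2) + t_rttvar) >> 1;

	/* ... equals 4 * srtt + 8 * rttvar in timer units */
	assert(rto == 4 * srtt_ms + 8 * rttvar_ms);
	return 0;
}
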
 
 static inline void
@@ -1415,6 +1470,8 @@ tcp_close(struct tcpcb *tp)
                return NULL;
        }
 
+       TCP_LOG_CONNECTION_SUMMARY(tp);
+
        DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
            struct tcpcb *, tp, int32_t, TCPS_CLOSED);
 
@@ -1441,6 +1498,7 @@ tcp_close(struct tcpcb *tp)
         */
        if (tp->t_rttupdated >= 16) {
                u_int32_t i = 0;
+               bool log_rtt = false;
 
 #if INET6
                if (isipv6) {
@@ -1481,6 +1539,7 @@ tcp_close(struct tcpcb *tp)
                                rt->rt_rmx.rmx_rtt = i;
                        }
                        tcpstat.tcps_cachedrtt++;
+                       log_rtt = true;
                }
                if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
                        i = tp->t_rttvar *
@@ -1492,6 +1551,11 @@ tcp_close(struct tcpcb *tp)
                                rt->rt_rmx.rmx_rttvar = i;
                        }
                        tcpstat.tcps_cachedrttvar++;
+                       log_rtt = true;
+               }
+               if (log_rtt) {
+                       TCP_LOG_RTM_RTT(tp, rt);
+                       TCP_LOG_RTT_INFO(tp);
                }
                /*
                 * The old comment here said:
@@ -1597,6 +1661,11 @@ no_valid_rt:
                    inp->inp_lport, inp->inp_fport);
                tp->t_flagsext &= ~TF_LRO_OFFLOADED;
        }
+       /*
+        * Make sure to clear the TCP Keep Alive Offload as it is
+        * ref counted on the interface
+        */
+       tcp_clear_keep_alive_offload(so);
 
        /*
         * If this is a socket that does not want to wakeup the device
@@ -1742,11 +1811,6 @@ tcp_notify(struct inpcb *inp, int error)
        } else {
                tp->t_softerror = error;
        }
-#if 0
-       wakeup((caddr_t) &so->so_timeo);
-       sorwakeup(so);
-       sowwakeup(so);
-#endif
 }
 
 struct bwmeas *
@@ -2229,9 +2293,9 @@ tcp_handle_msgsize(struct ip *ip, struct inpcb *inp)
        u_short ifscope = IFSCOPE_NONE;
        int mtu;
        struct sockaddr_in icmpsrc = {
-               sizeof(struct sockaddr_in),
-               AF_INET, 0, { 0 },
-               { 0, 0, 0, 0, 0, 0, 0, 0 }
+               .sin_len = sizeof(struct sockaddr_in),
+               .sin_family = AF_INET, .sin_port = 0, .sin_addr = { .s_addr = 0 },
+               .sin_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }
        };
        struct icmp *icp = NULL;
 
@@ -2699,13 +2763,20 @@ tcp_mtudisc(
 #if INET6
        int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 
+       /*
+        * There is nothing left to send once the socket is defunct or TCP is in the closed state
+        */
+       if ((so->so_state & SS_DEFUNCT) || (tp != NULL && tp->t_state == TCPS_CLOSED)) {
+               return;
+       }
+
        if (isipv6) {
                protoHdrOverhead = sizeof(struct ip6_hdr) +
                    sizeof(struct tcphdr);
        }
 #endif /* INET6 */
 
-       if (tp) {
+       if (tp != NULL) {
 #if INET6
                if (isipv6) {
                        rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
@@ -3103,18 +3174,16 @@ retry:
        if (so->so_pcb != NULL) {
                if (so->so_flags & SOF_MP_SUBFLOW) {
                        struct mptcb *mp_tp = tptomptp(sototcpcb(so));
-                       VERIFY(mp_tp);
-
-                       mpte_lock_assert_notheld(mp_tp->mpt_mpte);
+                       struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
 
-                       mpte_lock(mp_tp->mpt_mpte);
+                       socket_lock(mp_so, refcount);
 
                        /*
                         * Check if we became non-MPTCP while waiting for the lock.
                         * If yes, we have to retry to grab the right lock.
                         */
                        if (!(so->so_flags & SOF_MP_SUBFLOW)) {
-                               mpte_unlock(mp_tp->mpt_mpte);
+                               socket_unlock(mp_so, refcount);
                                goto retry;
                        }
                } else {
@@ -3186,11 +3255,11 @@ tcp_unlock(struct socket *so, int refcount, void *lr)
 
                if (so->so_flags & SOF_MP_SUBFLOW) {
                        struct mptcb *mp_tp = tptomptp(sototcpcb(so));
+                       struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
 
-                       VERIFY(mp_tp);
-                       mpte_lock_assert_held(mp_tp->mpt_mpte);
+                       socket_lock_assert_owned(mp_so);
 
-                       mpte_unlock(mp_tp->mpt_mpte);
+                       socket_unlock(mp_so, refcount);
                } else {
                        LCK_MTX_ASSERT(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
                            LCK_MTX_ASSERT_OWNED);
@@ -3213,8 +3282,9 @@ tcp_getlock(struct socket *so, int flags)
 
                if (so->so_flags & SOF_MP_SUBFLOW) {
                        struct mptcb *mp_tp = tptomptp(sototcpcb(so));
+                       struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
 
-                       return mpte_getlock(mp_tp->mpt_mpte, flags);
+                       return mp_so->so_proto->pr_getlock(mp_so, flags);
                } else {
                        return &inp->inpcb_mtx;
                }
@@ -3272,6 +3342,13 @@ tcp_sbspace(struct tcpcb *tp)
        int32_t space;
        int32_t pending = 0;
 
+       if (so->so_flags & SOF_MP_SUBFLOW) {
+               /* We still need to grow TCP's buffer to have a BDP-estimate */
+               tcp_sbrcv_grow_rwin(tp, sb);
+
+               return mptcp_sbspace(tptomptp(tp));
+       }
+
        tcp_sbrcv_grow_rwin(tp, sb);
 
        /* hiwat might have changed */
@@ -3390,7 +3467,7 @@ void
 calculate_tcp_clock(void)
 {
        struct timeval tv = tcp_uptime;
-       struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC};
+       struct timeval interval = {.tv_sec = 0, .tv_usec = TCP_RETRANSHZ_TO_USEC};
        struct timeval now, hold_now;
        uint32_t incr = 0;
 
@@ -3929,6 +4006,10 @@ tcp_fill_keepalive_offload_frames(ifnet_t ifp,
                    tcp_keepidle;
                frame->keep_cnt = TCP_CONN_KEEPCNT(tp);
                frame->keep_retry = TCP_CONN_KEEPINTVL(tp);
+               if (so->so_options & SO_NOWAKEFROMSLEEP) {
+                       frame->flags |=
+                           IFNET_KEEPALIVE_OFFLOAD_FLAG_NOWAKEFROMSLEEP;
+               }
                frame->local_port = ntohs(inp->inp_lport);
                frame->remote_port = ntohs(inp->inp_fport);
                frame->local_seq = tp->snd_nxt;
@@ -3995,6 +4076,110 @@ tcp_fill_keepalive_offload_frames(ifnet_t ifp,
        *used_frames_count = frame_index;
 }
 
+static bool
+inp_matches_kao_frame(ifnet_t ifp, struct ifnet_keepalive_offload_frame *frame,
+    struct inpcb *inp)
+{
+       if (inp->inp_ppcb == NULL) {
+               return false;
+       }
+       /* Release the want count */
+       if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
+               return false;
+       }
+       if (inp->inp_last_outifp == NULL ||
+           inp->inp_last_outifp->if_index != ifp->if_index) {
+               return false;
+       }
+       if (frame->local_port != ntohs(inp->inp_lport) ||
+           frame->remote_port != ntohs(inp->inp_fport)) {
+               return false;
+       }
+       if (inp->inp_vflag & INP_IPV4) {
+               if (memcmp(&inp->inp_laddr, frame->local_addr,
+                   sizeof(struct in_addr)) != 0 ||
+                   memcmp(&inp->inp_faddr, frame->remote_addr,
+                   sizeof(struct in_addr)) != 0) {
+                       return false;
+               }
+       } else if (inp->inp_vflag & INP_IPV6) {
+               if (memcmp(&inp->inp_laddr, frame->local_addr,
+                   sizeof(struct in6_addr)) != 0 ||
+                   memcmp(&inp->inp_faddr, frame->remote_addr,
+                   sizeof(struct in6_addr)) != 0) {
+                       return false;
+               }
+       } else {
+               return false;
+       }
+       return true;
+}
+
+int
+tcp_notify_kao_timeout(ifnet_t ifp,
+    struct ifnet_keepalive_offload_frame *frame)
+{
+       struct inpcb *inp = NULL;
+       struct socket *so = NULL;
+       bool found = false;
+
+       /*
+        *  Unlock the list before posting the event on the matching socket
+        */
+       lck_rw_lock_shared(tcbinfo.ipi_lock);
+
+       LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
+               if ((so = inp->inp_socket) == NULL ||
+                   (so->so_state & SS_DEFUNCT)) {
+                       continue;
+               }
+               if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
+                       continue;
+               }
+               if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
+                       continue;
+               }
+               if (inp->inp_ppcb == NULL ||
+                   in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
+                       continue;
+               }
+               socket_lock(so, 1);
+               if (inp_matches_kao_frame(ifp, frame, inp)) {
+                       /*
+                        * Keep the matching socket locked
+                        */
+                       found = true;
+                       break;
+               }
+               socket_unlock(so, 1);
+       }
+       lck_rw_done(tcbinfo.ipi_lock);
+
+       if (found) {
+               ASSERT(inp != NULL);
+               ASSERT(so != NULL);
+               ASSERT(so == inp->inp_socket);
+               /*
+                * Drop the TCP connection like tcptimers() does
+                */
+               struct tcpcb *tp = inp->inp_ppcb;
+
+               tcpstat.tcps_keepdrops++;
+               postevent(so, 0, EV_TIMEOUT);
+               soevent(so,
+                   (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
+               tp = tcp_drop(tp, ETIMEDOUT);
+
+               tcpstat.tcps_ka_offload_drops++;
+               os_log_info(OS_LOG_DEFAULT, "%s: dropped lport %u fport %u\n",
+                   __func__, frame->local_port, frame->remote_port);
+
+               socket_unlock(so, 1);
+       }
+
+       return 0;
+}
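
tcp_notify_kao_timeout() uses a common kernel lookup shape: scan under the shared list lock, take each candidate's socket lock, and on a match break with that lock still held so the drop can proceed after the list lock is released. A userland sketch of the same find-and-return-locked pattern, with pthread primitives standing in for the kernel locks (illustrative names throughout):

#include <pthread.h>
#include <stddef.h>

/* Sketch of the lookup shape used above: scan under a shared list
 * lock, lock each candidate, and return the match still locked. */
struct elem {
	struct elem *next;
	pthread_mutex_t lock;
	int key;
};

static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;

static struct elem *
find_and_lock(struct elem *head, int key)
{
	struct elem *e, *found = NULL;

	pthread_rwlock_rdlock(&list_lock);
	for (e = head; e != NULL; e = e->next) {
		pthread_mutex_lock(&e->lock);
		if (e->key == key) {
			found = e;              /* keep element locked */
			break;
		}
		pthread_mutex_unlock(&e->lock);
	}
	pthread_rwlock_unlock(&list_lock);
	return found;                           /* caller unlocks */
}

int
main(void)
{
	struct elem a = {
		.next = NULL, .lock = PTHREAD_MUTEX_INITIALIZER, .key = 7
	};
	struct elem *m = find_and_lock(&a, 7);

	if (m != NULL) {
		pthread_mutex_unlock(&m->lock);
	}
	return m != NULL ? 0 : 1;
}
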
+
 errno_t
 tcp_notify_ack_id_valid(struct tcpcb *tp, struct socket *so,
     u_int32_t notify_id)
index fda0f86f674ea97494b32c12a4b9fd1272f4b846..784c0e879dee716621bbf3f90ed938bac9e8436d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -84,6 +84,7 @@
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
 #if INET6
 #include <netinet6/in6_pcb.h>
 #endif
 #if TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
+#include <netinet/tcp_log.h>
+
 #include <sys/kdebug.h>
 #include <mach/sdt.h>
 #include <netinet/mptcp_var.h>
@@ -128,7 +131,10 @@ sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
        int error, s, tt;
 
        tt = *(int *)arg1;
-       s = tt * 1000 / TCP_RETRANSHZ;;
+       if (tt < 0 || tt >= INT_MAX / 1000) {
+               return EINVAL;
+       }
+       s = tt * 1000 / TCP_RETRANSHZ;
 
        error = sysctl_handle_int(oidp, &s, 0, req);
        if (error || !req->newptr) {
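
The new bounds check rejects any tick count whose multiplication by 1000 would overflow a signed int, instead of silently computing garbage. A minimal standalone version of the guard; the function name and the sample hz value are illustrative:

#include <assert.h>
#include <limits.h>

/* Standalone version of the overflow guard above; the name and the
 * sample hz value are illustrative, not the kernel's. */
static int
ticks_to_msec(int tt, int hz, int *out)
{
	if (tt < 0 || tt >= INT_MAX / 1000) {
		return -1;              /* tt * 1000 would overflow */
	}
	*out = tt * 1000 / hz;
	return 0;
}

int
main(void)
{
	int ms;

	assert(ticks_to_msec(500, 1000, &ms) == 0 && ms == 500);
	assert(ticks_to_msec(INT_MAX / 1000, 1000, &ms) == -1);
	return 0;
}
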
@@ -266,6 +272,13 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, pmtud_blackhole_mss,
     CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_pmtud_black_hole_mss, 1200,
     "Path MTU Discovery Black Hole Detection lowered MSS");
 
+#if (DEBUG || DEVELOPMENT)
+int tcp_probe_if_fix_port = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, probe_if_fix_port,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    &tcp_probe_if_fix_port, 0, "");
+#endif /* (DEBUG || DEVELOPMENT) */
+
 static u_int32_t tcp_mss_rec_medium = 1200;
 static u_int32_t tcp_mss_rec_low = 512;
 
@@ -477,7 +490,7 @@ inline int32_t
 timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2)
 {
        return (int32_t)((t1 + toff1) - (t2 + toff2));
-};
+}
 
 /*
  * Add to tcp timewait list, delay is given in milliseconds.
@@ -565,7 +578,19 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait)
                        active = TRUE;
                        goto out;
                }
+               if (mpsotomppcb(mp_so)->mpp_inside > 0) {
+                       os_log(mptcp_log_handle, "%s - %lx: Still inside %d usecount %d\n", __func__,
+                           (unsigned long)VM_KERNEL_ADDRPERM(mpsotompte(mp_so)),
+                           mpsotomppcb(mp_so)->mpp_inside,
+                           mp_so->so_usecount);
+                       socket_unlock(mp_so, 0);
+                       mp_so = NULL;
+                       active = TRUE;
+                       goto out;
+               }
+               /* We call socket_unlock with refcount further below */
                mp_so->so_usecount++;
+               tptomptp(tp)->mpt_mpte->mpte_mppcb->mpp_inside++;
        }
 
        /*
@@ -1004,6 +1029,7 @@ retransmit_packet:
                         * is spurious.
                         */
                        tcp_rexmt_save_state(tp);
+                       tcp_ccdbg_trace(tp, NULL, TCP_CC_FIRST_REXMT);
                }
 #if MPTCP
                if ((tp->t_rxtshift >= mptcp_fail_thresh) &&
@@ -1012,10 +1038,13 @@ retransmit_packet:
                        mptcp_act_on_txfail(so);
                }
 
-               if (so->so_flags & SOF_MP_SUBFLOW) {
+               if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+                   (so->so_flags & SOF_MP_SUBFLOW)) {
                        struct mptses *mpte = tptomptp(tp)->mpt_mpte;
 
-                       mptcp_check_subflows_and_add(mpte);
+                       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
+                               mptcp_check_subflows_and_add(mpte);
+                       }
                }
 #endif /* MPTCP */
 
@@ -1049,11 +1078,13 @@ retransmit_packet:
                        tp->t_flagsext &= ~(TF_DELAY_RECOVERY);
                }
 
-               if (tp->t_state == TCPS_SYN_RECEIVED) {
+               if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
+                   tp->t_state == TCPS_SYN_RECEIVED) {
                        tcp_disable_tfo(tp);
                }
 
-               if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
+               if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
+                   !(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
                    (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
                    !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
                    ((tp->t_state != TCPS_SYN_SENT && tp->t_rxtshift > 1) ||
@@ -1070,6 +1101,8 @@ retransmit_packet:
                        tcp_heuristic_tfo_middlebox(tp);
 
                        so->so_error = ENODATA;
+                       soevent(so,
+                           (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR));
                        sorwakeup(so);
                        sowwakeup(so);
 
@@ -1077,13 +1110,16 @@ retransmit_packet:
                        tcpstat.tcps_tfo_sndblackhole++;
                }
 
-               if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
+               if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
+                   !(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
                    (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) &&
                    tp->t_rxtshift > 3) {
                        if (TSTMP_GT(tp->t_sndtime - 10 * TCP_RETRANSHZ, tp->t_rcvtime)) {
                                tcp_heuristic_tfo_middlebox(tp);
 
                                so->so_error = ENODATA;
+                               soevent(so,
+                                   (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR));
                                sorwakeup(so);
                                sowwakeup(so);
                        }
@@ -1092,12 +1128,12 @@ retransmit_packet:
                if (tp->t_state == TCPS_SYN_SENT) {
                        rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
                        tp->t_stat.synrxtshift = tp->t_rxtshift;
+                       tp->t_stat.rxmitsyns++;
 
                        /* When retransmitting, disable TFO */
                        if (tfo_enabled(tp) &&
-                           (!(so->so_flags1 & SOF1_DATA_AUTHENTICATED) ||
-                           (tp->t_flagsext & TF_FASTOPEN_HEUR))) {
-                               tp->t_flagsext &= ~TF_FASTOPEN;
+                           !(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE)) {
+                               tcp_disable_tfo(tp);
                                tp->t_tfo_flags |= TFO_F_SYN_LOSS;
                        }
                } else {
@@ -1108,6 +1144,8 @@ retransmit_packet:
                    TCP_ADD_REXMTSLOP(tp));
                tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
 
+               TCP_LOG_RTT_INFO(tp);
+
                if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb)) {
                        goto fc_output;
                }
@@ -1347,8 +1385,10 @@ fc_output:
                                bzero(&tra, sizeof(tra));
                                tra.nocell = INP_NO_CELLULAR(inp);
                                tra.noexpensive = INP_NO_EXPENSIVE(inp);
+                               tra.noconstrained = INP_NO_CONSTRAINED(inp);
                                tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
                                tra.intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp);
+                               tra.keep_alive = 1;
                                if (tp->t_inpcb->inp_flags & INP_BOUND_IF) {
                                        tra.ifscope = tp->t_inpcb->inp_boundifp->if_index;
                                } else {
@@ -1362,6 +1402,9 @@ fc_output:
                                        tp->t_rtimo_probes++;
                                }
                        }
+
+                       TCP_LOG_KEEP_ALIVE(tp, idle_time);
+
                        tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
                            TCP_CONN_KEEPINTVL(tp));
                } else {
@@ -1418,12 +1461,15 @@ fc_output:
                        tp->t_timer[TCPT_KEEP] = min(OFFSET_FROM_START(
                                    tp, tcp_backoff[ind] * TCP_REXMTVAL(tp)),
                            tp->t_timer[TCPT_KEEP]);
-               } else if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
+               } else if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
+                   !(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
                    tp->t_tfo_probe_state == TFO_PROBE_WAIT_DATA) {
                        /* Still no data! Let's assume a TFO-error and err out... */
                        tcp_heuristic_tfo_middlebox(tp);
 
                        so->so_error = ENODATA;
+                       soevent(so,
+                           (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR));
                        sorwakeup(so);
                        tp->t_tfo_stats |= TFO_S_RECV_BLACKHOLE;
                        tcpstat.tcps_tfo_blackhole++;
@@ -1508,51 +1554,101 @@ fc_output:
 
        case TCPT_PTO:
        {
-               int32_t snd_len;
-               tp->t_flagsext &= ~(TF_SENT_TLPROBE);
+               int32_t ret = 0;
 
+               if (!(tp->t_flagsext & TF_IF_PROBING)) {
+                       tp->t_flagsext &= ~(TF_SENT_TLPROBE);
+               }
                /*
                 * Check if the connection is in the right state to
                 * send a probe
                 */
-               if (tp->t_state != TCPS_ESTABLISHED ||
-                   (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING)) ||
+               if ((tp->t_state != TCPS_ESTABLISHED ||
+                   tp->t_rxtshift > 0 ||
                    tp->snd_max == tp->snd_una ||
                    !SACK_ENABLED(tp) ||
                    !TAILQ_EMPTY(&tp->snd_holes) ||
-                   IN_FASTRECOVERY(tp)) {
+                   IN_FASTRECOVERY(tp)) &&
+                   !(tp->t_flagsext & TF_IF_PROBING)) {
                        break;
                }
 
                /*
-                * If there is no new data to send or if the
-                * connection is limited by receive window then
-                * retransmit the last segment, otherwise send
-                * new data.
+                * When the interface state is changed, explicitly reset the
+                * retransmission timer state for both SYN and data packets,
+                * because we do not want to wait unnecessarily or time out too
+                * quickly if the link characteristics have changed drastically
                 */
-               snd_len = min(so->so_snd.sb_cc, tp->snd_wnd)
-                   - (tp->snd_max - tp->snd_una);
-               if (snd_len > 0) {
-                       tp->snd_nxt = tp->snd_max;
+               if (tp->t_flagsext & TF_IF_PROBING) {
+                       tp->t_rxtshift = 0;
+                       if (tp->t_state == TCPS_SYN_SENT) {
+                               tp->t_stat.synrxtshift = tp->t_rxtshift;
+                       }
+                       /*
+                        * Reset to the default RTO
+                        */
+                       tp->t_srtt = TCPTV_SRTTBASE;
+                       tp->t_rttvar =
+                           ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
+                       tp->t_rttmin = tp->t_flags & TF_LOCAL ? tcp_TCPTV_MIN :
+                           TCPTV_REXMTMIN;
+                       TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
+                           tp->t_rttmin, TCPTV_REXMTMAX, TCP_ADD_REXMTSLOP(tp));
+                       TCP_LOG_RTT_INFO(tp);
+               }
+
+               if (tp->t_state == TCPS_SYN_SENT) {
+                       /*
+                        * The PTO for SYN_SENT reinitializes TCP as if it were a
+                        * fresh connection attempt
+                        */
+                       tp->snd_nxt = tp->snd_una;
+                       /*
+                        * Note:  We overload snd_recover to function also as the
+                        * snd_last variable described in RFC 2582
+                        */
+                       tp->snd_recover = tp->snd_max;
+                       /*
+                        * Force a segment to be sent.
+                        */
+                       tp->t_flags |= TF_ACKNOW;
+
+                       /* If timing a segment in this window, stop the timer */
+                       tp->t_rtttime = 0;
                } else {
-                       snd_len = min((tp->snd_max - tp->snd_una),
-                           tp->t_maxseg);
-                       tp->snd_nxt = tp->snd_max - snd_len;
+                       int32_t snd_len;
+
+                       /*
+                        * If there is no new data to send or if the
+                        * connection is limited by receive window then
+                        * retransmit the last segment, otherwise send
+                        * new data.
+                        */
+                       snd_len = min(so->so_snd.sb_cc, tp->snd_wnd)
+                           - (tp->snd_max - tp->snd_una);
+                       if (snd_len > 0) {
+                               tp->snd_nxt = tp->snd_max;
+                       } else {
+                               snd_len = min((tp->snd_max - tp->snd_una),
+                                   tp->t_maxseg);
+                               tp->snd_nxt = tp->snd_max - snd_len;
+                       }
                }
 
                tcpstat.tcps_pto++;
-               if (tp->t_flagsext & TF_PROBING) {
+               if (tp->t_flagsext & TF_IF_PROBING) {
                        tcpstat.tcps_probe_if++;
                }
 
                /* If timing a segment in this window, stop the timer */
                tp->t_rtttime = 0;
-               /* Note that tail loss probe is being sent */
-               tp->t_flagsext |= TF_SENT_TLPROBE;
-               tp->t_tlpstart = tcp_now;
+               /* Note that tail loss probe is being sent. Exclude IF probe */
+               if (!(tp->t_flagsext & TF_IF_PROBING)) {
+                       tp->t_flagsext |= TF_SENT_TLPROBE;
+                       tp->t_tlpstart = tcp_now;
+               }
 
                tp->snd_cwnd += tp->t_maxseg;
-
                /*
                 * When tail-loss-probe fires, we reset the RTO timer, because
                 * a probe just got sent, so we are good to push out the timer.
@@ -1560,11 +1656,57 @@ fc_output:
                 * Set to 0 to ensure that tcp_output() will reschedule it
                 */
                tp->t_timer[TCPT_REXMT] = 0;
+               ret = tcp_output(tp);
+
+#if (DEBUG || DEVELOPMENT)
+               if ((tp->t_flagsext & TF_IF_PROBING) &&
+                   ((IFNET_IS_COMPANION_LINK(tp->t_inpcb->inp_last_outifp)) ||
+                   tp->t_state == TCPS_SYN_SENT)) {
+                       if (ret == 0 && tcp_probe_if_fix_port > 0 &&
+                           tcp_probe_if_fix_port <= IPPORT_HILASTAUTO) {
+                               tp->t_timer[TCPT_REXMT] = 0;
+                               tcp_set_lotimer_index(tp);
+                       }
 
-               (void)tcp_output(tp);
+                       os_log(OS_LOG_DEFAULT,
+                           "%s: sent %s probe for %u > %u on interface %s"
+                           " (%u) %s(%d)",
+                           __func__,
+                           tp->t_state == TCPS_SYN_SENT ? "SYN" : "data",
+                           ntohs(tp->t_inpcb->inp_lport),
+                           ntohs(tp->t_inpcb->inp_fport),
+                           if_name(tp->t_inpcb->inp_last_outifp),
+                           tp->t_inpcb->inp_last_outifp->if_index,
+                           ret == 0 ? "succeeded" : "failed", ret);
+               }
+#endif /* DEBUG || DEVELOPMENT */
+
+               /*
+                * When the connection is not idle, make sure the retransmission timer
+                * is armed because it was set to zero above
+                */
+               if ((tp->t_timer[TCPT_REXMT] == 0 || tp->t_timer[TCPT_PERSIST] == 0) &&
+                   (tp->t_inpcb->inp_socket->so_snd.sb_cc != 0 || tp->t_state == TCPS_SYN_SENT ||
+                   tp->t_state == TCPS_SYN_RECEIVED)) {
+                       tp->t_timer[TCPT_REXMT] =
+                           OFFSET_FROM_START(tp, tp->t_rxtcur);
+
+                       os_log(OS_LOG_DEFAULT,
+                           "%s: tcp_output() returned %u with retransmission timer disabled "
+                           "for %u > %u in state %d, reset timer to %d",
+                           __func__, ret,
+                           ntohs(tp->t_inpcb->inp_lport),
+                           ntohs(tp->t_inpcb->inp_fport),
+                           tp->t_state,
+                           tp->t_timer[TCPT_REXMT]);
+
+                       tcp_check_timer_state(tp);
+               }
                tp->snd_cwnd -= tp->t_maxseg;
 
-               tp->t_tlphighrxt = tp->snd_nxt;
+               if (!(tp->t_flagsext & TF_IF_PROBING)) {
+                       tp->t_tlphighrxt = tp->snd_nxt;
+               }
                break;
        }
        case TCPT_DELAYFR:
@@ -1762,12 +1904,11 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode,
         * If this connection is over an interface that needs to
         * be probed, send probe packets to reinitiate communication.
         */
-       if (probe_if_index > 0 && tp->t_inpcb->inp_last_outifp != NULL &&
-           tp->t_inpcb->inp_last_outifp->if_index == probe_if_index) {
-               tp->t_flagsext |= TF_PROBING;
+       if (TCP_IF_STATE_CHANGED(tp, probe_if_index)) {
+               tp->t_flagsext |= TF_IF_PROBING;
                tcp_timers(tp, TCPT_PTO);
                tp->t_timer[TCPT_PTO] = 0;
-               tp->t_flagsext &= ~TF_PROBING;
+               tp->t_flagsext &= ~TF_IF_PROBING;
        }
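
The open-coded interface test is now hidden behind TCP_IF_STATE_CHANGED(). A plausible reconstruction of that macro from the lines it replaces, with stub types so it compiles standalone; the actual definition lives in netinet/tcp_var.h and may differ in detail:

#include <assert.h>
#include <stdint.h>

/* Stub types so the reconstruction compiles standalone. */
struct ifnet { uint16_t if_index; };
struct inpcb { struct ifnet *inp_last_outifp; };
struct tcpcb { struct inpcb *t_inpcb; };

/* Reconstructed from the removed open-coded test; the real macro
 * may differ in detail. */
#define TCP_IF_STATE_CHANGED(tp, probe_if_index)                        \
	((probe_if_index) > 0 &&                                        \
	(tp)->t_inpcb->inp_last_outifp != NULL &&                       \
	(tp)->t_inpcb->inp_last_outifp->if_index == (probe_if_index))

int
main(void)
{
	struct ifnet ifp = { .if_index = 4 };
	struct inpcb inp = { .inp_last_outifp = &ifp };
	struct tcpcb tp = { .t_inpcb = &inp };

	assert(TCP_IF_STATE_CHANGED(&tp, 4));
	assert(!TCP_IF_STATE_CHANGED(&tp, 5));
	return 0;
}
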
 
        /*
@@ -1907,7 +2048,14 @@ tcp_run_timerlist(void * arg1, void * arg2)
        LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
                uint32_t offset = 0;
                uint32_t runtime = te->runtime;
-               if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
+
+               tp = TIMERENTRY_TO_TP(te);
+
+               /*
+                * An interface probe may need to happen before the previously scheduled runtime
+                */
+               if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now) &&
+                   !TCP_IF_STATE_CHANGED(tp, listp->probe_if_index)) {
                        offset = timer_diff(runtime, 0, tcp_now, 0);
                        if (next_timer == 0 || offset < next_timer) {
                                next_timer = offset;
@@ -1916,8 +2064,6 @@ tcp_run_timerlist(void * arg1, void * arg2)
                        continue;
                }
 
-               tp = TIMERENTRY_TO_TP(te);
-
                /*
                 * Acquire an inp wantcnt on the inpcb so that the socket
                 * won't get detached even if tcp_close is called
@@ -2473,13 +2619,19 @@ tcp_interface_send_probe(u_int16_t probe_if_index)
        calculate_tcp_clock();
 
        lck_mtx_lock(listp->mtx);
-       if (listp->probe_if_index > 0) {
+       if (listp->probe_if_index > 0 && listp->probe_if_index != probe_if_index) {
                tcpstat.tcps_probe_if_conflict++;
+               os_log(OS_LOG_DEFAULT,
+                   "%s: probe_if_index %u conflicts with %u, tcps_probe_if_conflict %u\n",
+                   __func__, probe_if_index, listp->probe_if_index,
+                   tcpstat.tcps_probe_if_conflict);
                goto done;
        }
 
        listp->probe_if_index = probe_if_index;
        if (listp->running) {
+               os_log(OS_LOG_DEFAULT, "%s: timer list already running for if_index %u\n",
+                   __func__, probe_if_index);
                goto done;
        }
 
@@ -2493,6 +2645,9 @@ tcp_interface_send_probe(u_int16_t probe_if_index)
                diff = timer_diff(listp->runtime, 0, tcp_now, offset);
                if (diff <= 0) {
                        /* The timer will fire sooner than what's needed */
+                       os_log(OS_LOG_DEFAULT,
+                           "%s: timer will fire sooner than needed for if_index %u\n",
+                           __func__, probe_if_index);
                        goto done;
                }
        }
index ef53f55b915d3aa7d735ae2f71a6788f7a6bf6bb..92c445448f665d47a5fd51752037d687a4b42ebb 100644 (file)
@@ -73,6 +73,7 @@
 #if !CONFIG_EMBEDDED
 #include <sys/kasl.h>
 #endif
+#include <sys/priv.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
@@ -82,6 +83,7 @@
 #include <net/route.h>
 #include <net/ntstat.h>
 #include <net/content_filter.h>
+#include <net/multi_layer_pkt_log.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_cc.h>
+#include <netinet/tcp_log.h>
 #include <mach/sdt.h>
 #if TCPDEBUG
 #include <netinet/tcp_debug.h>
@@ -125,12 +128,11 @@ errno_t tcp_fill_info_for_info_tuple(struct info_tuple *, struct tcp_info *);
 int tcp_sysctl_info(struct sysctl_oid *, void *, int, struct sysctl_req *);
 static void tcp_connection_fill_info(struct tcpcb *tp,
     struct tcp_connection_info *tci);
+static int tcp_get_mpkl_send_info(struct mbuf *, struct so_mpkl_send_info *);
 
 /*
  * TCP protocol interface to socket abstraction.
  */
-extern  char *tcpstates[];      /* XXX ??? */
-
 static int      tcp_attach(struct socket *, struct proc *);
 static int      tcp_connect(struct tcpcb *, struct sockaddr *, struct proc *);
 #if INET6
@@ -387,6 +389,7 @@ tcp_usr_listen(struct socket *so, struct proc *p)
        if (error == 0) {
                tp->t_state = TCPS_LISTEN;
        }
+       TCP_LOG_LISTEN(tp, error);
        COMMON_END(PRU_LISTEN);
 }
 
@@ -409,6 +412,7 @@ tcp6_usr_listen(struct socket *so, struct proc *p)
        if (error == 0) {
                tp->t_state = TCPS_LISTEN;
        }
+       TCP_LOG_LISTEN(tp, error);
        COMMON_END(PRU_LISTEN);
 }
 #endif /* INET6 */
@@ -422,7 +426,8 @@ tcp_connect_complete(struct socket *so)
 
        /* TFO delays the tcp_output until later, when the app calls write() */
        if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
-               if (!necp_socket_is_allowed_to_send_recv(sotoinpcb(so), NULL, NULL, NULL)) {
+               if (!necp_socket_is_allowed_to_send_recv(sotoinpcb(so), NULL, NULL, NULL, NULL)) {
+                       TCP_LOG_DROP_NECP(NULL, NULL, tp, true);
                        return EHOSTUNREACH;
                }
 
@@ -474,8 +479,14 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
                }
        }
 #if NECP
+#if CONTENT_FILTER
+       error = cfil_sock_attach(so, NULL, nam, CFS_CONNECTION_DIR_OUT);
+       if (error != 0) {
+               return error;
+       }
+#endif /* CONTENT_FILTER */
 #if FLOW_DIVERT
-       else if (necp_socket_should_use_flow_divert(inp)) {
+       if (necp_socket_should_use_flow_divert(inp)) {
                uint32_t fd_ctl_unit = necp_socket_get_flow_divert_control_unit(inp);
                if (fd_ctl_unit > 0) {
                        error = flow_divert_pcb_init(so, fd_ctl_unit);
@@ -489,12 +500,6 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
                return error;
        }
 #endif /* FLOW_DIVERT */
-#if CONTENT_FILTER
-       error = cfil_sock_attach(so);
-       if (error != 0) {
-               return error;
-       }
-#endif /* CONTENT_FILTER */
 #endif /* NECP */
        tp = intotcpcb(inp);
        TCPDEBUG1();
@@ -516,11 +521,14 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
        }
 
        if ((error = tcp_connect(tp, nam, p)) != 0) {
+               TCP_LOG_CONNECT(tp, true, error);
                goto out;
        }
 
        error = tcp_connect_complete(so);
 
+       TCP_LOG_CONNECT(tp, true, error);
+
        COMMON_END(PRU_CONNECT);
 }
 
@@ -658,8 +666,14 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
                }
        }
 #if NECP
+#if CONTENT_FILTER
+       error = cfil_sock_attach(so, NULL, nam, CFS_CONNECTION_DIR_OUT);
+       if (error != 0) {
+               return error;
+       }
+#endif /* CONTENT_FILTER */
 #if FLOW_DIVERT
-       else if (necp_socket_should_use_flow_divert(inp)) {
+       if (necp_socket_should_use_flow_divert(inp)) {
                uint32_t fd_ctl_unit = necp_socket_get_flow_divert_control_unit(inp);
                if (fd_ctl_unit > 0) {
                        error = flow_divert_pcb_init(so, fd_ctl_unit);
@@ -673,12 +687,6 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
                return error;
        }
 #endif /* FLOW_DIVERT */
-#if CONTENT_FILTER
-       error = cfil_sock_attach(so);
-       if (error != 0) {
-               return error;
-       }
-#endif /* CONTENT_FILTER */
 #endif /* NECP */
 
        tp = intotcpcb(inp);
@@ -712,6 +720,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
                inp->inp_vflag |= INP_IPV4;
                inp->inp_vflag &= ~INP_IPV6;
                if ((error = tcp_connect(tp, (struct sockaddr *)&sin, p)) != 0) {
+                       TCP_LOG_CONNECT(tp, true, error);
                        goto out;
                }
 
@@ -721,10 +730,14 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
        inp->inp_vflag &= ~INP_IPV4;
        inp->inp_vflag |= INP_IPV6;
        if ((error = tcp6_connect(tp, nam, p)) != 0) {
+               TCP_LOG_CONNECT(tp, true, error);
                goto out;
        }
 
        error = tcp_connect_complete(so);
+
+       TCP_LOG_CONNECT(tp, true, error);
+
        COMMON_END(PRU_CONNECT);
 }
 
@@ -807,17 +820,14 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam)
        else if (necp_socket_should_use_flow_divert(inp)) {
                return EPROTOTYPE;
        }
-#if CONTENT_FILTER
-       error = cfil_sock_attach(so);
-       if (error != 0) {
-               return error;
-       }
-#endif /* CONTENT_FILTER */
+
 #endif /* NECP */
 
        tp = intotcpcb(inp);
        TCPDEBUG1();
 
+       TCP_LOG_ACCEPT(tp, 0);
+
        calculate_tcp_clock();
 
        COMMON_END(PRU_ACCEPT);
@@ -843,17 +853,14 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
        else if (necp_socket_should_use_flow_divert(inp)) {
                return EPROTOTYPE;
        }
-#if CONTENT_FILTER
-       error = cfil_sock_attach(so);
-       if (error != 0) {
-               return error;
-       }
-#endif /* CONTENT_FILTER */
+
 #endif /* NECP */
 
        tp = intotcpcb(inp);
        TCPDEBUG1();
 
+       TCP_LOG_ACCEPT(tp, 0);
+
        calculate_tcp_clock();
 
        in6_mapped_peeraddr(so, nam);
@@ -1005,6 +1012,10 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
        struct inpcb *inp = sotoinpcb(so);
        struct tcpcb *tp;
        uint32_t msgpri = MSG_PRI_DEFAULT;
+       uint32_t mpkl_len = 0; /* length of mbuf chain */
+       uint32_t mpkl_seq; /* sequence number where new data is added */
+       struct so_mpkl_send_info mpkl_send_info = {};
+
 #if INET6
        int isipv6;
 #endif
@@ -1045,6 +1056,17 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
 
        calculate_tcp_clock();
 
+       if (net_mpklog_enabled) {
+               mpkl_seq = tp->snd_una + so->so_snd.sb_cc;
+               if (m) {
+                       mpkl_len = m_length(m);
+               }
+               if (so->so_flags1 & SOF1_MPKL_SEND_INFO) {
+                       uuid_copy(mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
+                       mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
+               }
+       }
+
        if (control != NULL) {
                if (so->so_flags & SOF_ENABLE_MSGS) {
                        /* Get the msg priority from control mbufs */
@@ -1058,22 +1080,30 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
                                m = NULL;
                                goto out;
                        }
-                       m_freem(control);
-                       control = NULL;
-               } else if (control->m_len) {
+               }
+               if (control->m_len > 0 && net_mpklog_enabled) {
+                       error = tcp_get_mpkl_send_info(control, &mpkl_send_info);
                        /*
-                        * if not unordered, TCP should not have
-                        * control mbufs
+                        * Interpretation of the returned code:
+                        *  0: client wants us to use the value passed in SCM_MPKL_SEND_INFO
+                        *  ENOMSG: SCM_MPKL_SEND_INFO was not present
+                        *  other: failure
                         */
-                       m_freem(control);
-                       if (m != NULL) {
-                               m_freem(m);
+                       if (error != 0 && error != ENOMSG) {
+                               m_freem(control);
+                               if (m != NULL) {
+                                       m_freem(m);
+                               }
+                               control = NULL;
+                               m = NULL;
+                               goto out;
                        }
-                       control = NULL;
-                       m = NULL;
-                       error = EINVAL;
-                       goto out;
                }
+               /*
+                * Silently drop unsupported ancillary data messages
+                */
+               m_freem(control);
+               control = NULL;
        }
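
tcp_get_mpkl_send_info() follows the 0 / ENOMSG / other-error contract spelled out in the comment above. A userland illustration of that contract using the standard cmsg(3) walking macros; this is not the kernel's SCM_MPKL_SEND_INFO parser, and the level/type arguments below are placeholders:

#include <errno.h>
#include <string.h>
#include <sys/socket.h>

/* Userland illustration of the 0 / ENOMSG / error contract. */
static int
find_cmsg(struct msghdr *mhdr, int level, int type, void *buf, size_t len)
{
	struct cmsghdr *cm;

	for (cm = CMSG_FIRSTHDR(mhdr); cm != NULL;
	    cm = CMSG_NXTHDR(mhdr, cm)) {
		if (cm->cmsg_level != level || cm->cmsg_type != type) {
			continue;
		}
		if (cm->cmsg_len != CMSG_LEN(len)) {
			return EINVAL;          /* wrong payload size */
		}
		memcpy(buf, CMSG_DATA(cm), len);
		return 0;                       /* found: use this value */
	}
	return ENOMSG;                          /* absent: use defaults */
}

int
main(void)
{
	struct msghdr mh;
	int v;

	memset(&mh, 0, sizeof(mh));             /* no ancillary data */
	return find_cmsg(&mh, SOL_SOCKET, 1, &v, sizeof(v)) == ENOMSG ? 0 : 1;
}
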
 
        if (so->so_flags & SOF_ENABLE_MSGS) {
@@ -1107,11 +1137,17 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
 #endif /* INET6 */
                        error = tcp_connect(tp, nam, p);
                        if (error) {
+                               TCP_LOG_CONNECT(tp, true, error);
                                goto out;
                        }
                        tp->snd_wnd = TTCP_CLIENT_SND_WND;
                        tp->max_sndwnd = tp->snd_wnd;
                        tcp_mss(tp, -1, IFSCOPE_NONE);
+
+                       TCP_LOG_CONNECT(tp, true, error);
+
+                       /* The sequence number of the data is past the SYN */
+                       mpkl_seq = tp->iss + 1;
                }
 
                if (flags & PRUS_EOF) {
@@ -1162,11 +1198,14 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
 #endif /* INET6 */
                        error = tcp_connect(tp, nam, p);
                        if (error) {
+                               TCP_LOG_CONNECT(tp, true, error);
                                goto out;
                        }
                        tp->snd_wnd = TTCP_CLIENT_SND_WND;
                        tp->max_sndwnd = tp->snd_wnd;
                        tcp_mss(tp, -1, IFSCOPE_NONE);
+
+                       TCP_LOG_CONNECT(tp, true, error);
                }
                tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
                tp->t_flagsext |= TF_FORCE;
@@ -1174,6 +1213,17 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
                tp->t_flagsext &= ~TF_FORCE;
        }
 
+       if (net_mpklog_enabled && (inp = tp->t_inpcb) != NULL &&
+           ((inp->inp_last_outifp != NULL &&
+           (inp->inp_last_outifp->if_xflags & IFXF_MPK_LOG)) ||
+           (inp->inp_boundifp != NULL &&
+           (inp->inp_boundifp->if_xflags & IFXF_MPK_LOG)))) {
+               MPKL_TCP_SEND(tcp_mpkl_log_object,
+                   mpkl_send_info.mpkl_proto, mpkl_send_info.mpkl_uuid,
+                   ntohs(inp->inp_lport), ntohs(inp->inp_fport),
+                   mpkl_seq, mpkl_len,
+                   so->last_pid, so->so_log_seqn++);
+       }
 
        /*
         * We wait for the socket to successfully connect before returning.
@@ -1445,6 +1495,7 @@ skip_oinp:
        tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_CONN_KEEPINIT(tp));
        tp->iss = tcp_new_isn(tp);
        tcp_sendseqinit(tp);
+       tp->t_connect_time = tcp_now;
        if (nstat_collect) {
                nstat_route_connect_attempt(inp->inp_route.ro_rt);
        }
@@ -1546,6 +1597,7 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p)
            TCP_CONN_KEEPINIT(tp));
        tp->iss = tcp_new_isn(tp);
        tcp_sendseqinit(tp);
+       tp->t_connect_time = tcp_now;
        if (nstat_collect) {
                nstat_route_connect_attempt(inp->inp_route.ro_rt);
        }
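Annotation: both tcp_connect and tcp6_connect now stamp t_connect_time with tcp_now when the SYN is queued, so connection age can later be derived by subtraction (the TCP_LOG_* summary macros presumably consume it). A sketch of that derivation, assuming tcp_now-style 32-bit tick counters; unsigned subtraction stays correct across a single wrap:

#include <stdint.h>

/* Elapsed ticks since the connection started. */
static uint32_t
tcp_connection_age(uint32_t now_ticks, uint32_t t_connect_time)
{
	return now_ticks - t_connect_time;
}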
@@ -1639,7 +1691,7 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
                ti->tcpi_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes;
 
                if (tp->t_state > TCPS_LISTEN) {
-                       ti->tcpi_synrexmits = tp->t_stat.synrxtshift;
+                       ti->tcpi_synrexmits = tp->t_stat.rxmitsyns;
                }
                ti->tcpi_cell_rxpackets = inp->inp_cstat->rxpackets;
                ti->tcpi_cell_rxbytes = inp->inp_cstat->rxbytes;
@@ -1856,44 +1908,6 @@ tcp_sysctl_info(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused
        int error;
        struct tcp_info ti = {};
        struct info_tuple itpl;
-#if !CONFIG_EMBEDDED
-       proc_t caller = PROC_NULL;
-       proc_t caller_parent = PROC_NULL;
-       char command_name[MAXCOMLEN + 1] = "";
-       char parent_name[MAXCOMLEN + 1] = "";
-
-       if ((caller = proc_self()) != PROC_NULL) {
-               /* get process name */
-               strlcpy(command_name, caller->p_comm, sizeof(command_name));
-
-               /* get parent process name if possible */
-               if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
-                       strlcpy(parent_name, caller_parent->p_comm,
-                           sizeof(parent_name));
-                       proc_rele(caller_parent);
-               }
-
-               if ((escape_str(command_name, strlen(command_name) + 1,
-                   sizeof(command_name)) == 0) &&
-                   (escape_str(parent_name, strlen(parent_name) + 1,
-                   sizeof(parent_name)) == 0)) {
-                       kern_asl_msg(LOG_DEBUG, "messagetracer",
-                           5,
-                           "com.apple.message.domain",
-                           "com.apple.kernel.tcpstat", /* 1 */
-                           "com.apple.message.signature",
-                           "tcpinfo", /* 2 */
-                           "com.apple.message.signature2", command_name, /* 3 */
-                           "com.apple.message.signature3", parent_name, /* 4 */
-                           "com.apple.message.summarize", "YES", /* 5 */
-                           NULL);
-               }
-       }
-
-       if (caller != PROC_NULL) {
-               proc_rele(caller);
-       }
-#endif /* !CONFIG_EMBEDDED */
 
        if (req->newptr == USER_ADDR_NULL) {
                return EINVAL;
@@ -1965,6 +1979,90 @@ tcp_getconninfo(struct socket *so, struct conninfo_tcp *tcp_ci)
        tcp_fill_info(sototcpcb(so), &tcp_ci->tcpci_tcp_info);
 }
 
+void
+tcp_clear_keep_alive_offload(struct socket *so)
+{
+       struct inpcb *inp;
+       struct ifnet *ifp;
+
+       inp = sotoinpcb(so);
+       if (inp == NULL) {
+               return;
+       }
+
+       if ((inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD) == 0) {
+               return;
+       }
+
+       ifp = inp->inp_boundifp != NULL ? inp->inp_boundifp :
+           inp->inp_last_outifp;
+       if (ifp == NULL) {
+               panic("%s: so %p inp %p ifp NULL",
+                   __func__, so, inp);
+       }
+
+       ifnet_lock_exclusive(ifp);
+
+       if (ifp->if_tcp_kao_cnt == 0) {
+               panic("%s: so %p inp %p ifp %p if_tcp_kao_cnt == 0",
+                   __func__, so, inp, ifp);
+       }
+       ifp->if_tcp_kao_cnt--;
+       inp->inp_flags2 &= ~INP2_KEEPALIVE_OFFLOAD;
+
+       ifnet_lock_done(ifp);
+}
+
+static int
+tcp_set_keep_alive_offload(struct socket *so, struct proc *proc)
+{
+       int error = 0;
+       struct inpcb *inp;
+       struct ifnet *ifp;
+
+       inp = sotoinpcb(so);
+       if (inp == NULL) {
+               return ECONNRESET;
+       }
+       if ((inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD) != 0) {
+               return 0;
+       }
+
+       ifp = inp->inp_boundifp != NULL ? inp->inp_boundifp :
+           inp->inp_last_outifp;
+       if (ifp == NULL) {
+               error = ENXIO;
+               os_log_info(OS_LOG_DEFAULT,
+                   "%s: error %d for proc %s[%u] out ifp is not set\n",
+                   __func__, error,
+                   proc != NULL ? proc->p_comm : "kernel",
+                   proc != NULL ? proc->p_pid : 0);
+               return ENXIO;
+       }
+
+       error = if_get_tcp_kao_max(ifp);
+       if (error != 0) {
+               return error;
+       }
+
+       ifnet_lock_exclusive(ifp);
+       if (ifp->if_tcp_kao_cnt < ifp->if_tcp_kao_max) {
+               ifp->if_tcp_kao_cnt++;
+               inp->inp_flags2 |= INP2_KEEPALIVE_OFFLOAD;
+       } else {
+               error = ETOOMANYREFS;
+               os_log_info(OS_LOG_DEFAULT,
+                   "%s: error %d for proc %s[%u] if_tcp_kao_max %u\n",
+                   __func__, error,
+                   proc != NULL ? proc->p_comm : "kernel",
+                   proc != NULL ? proc->p_pid : 0,
+                   ifp->if_tcp_kao_max);
+       }
+       ifnet_lock_done(ifp);
+
+       return error;
+}
+
 /*
  * The new sockopt interface makes it possible for us to block in the
  * copyin/out step (if we take a page fault).  Taking a page fault at
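Annotation: tcp_set_keep_alive_offload above enforces a privilege check at the sockopt layer, requires a resolved bound or last-output interface, and honors a per-interface cap, so callers should be prepared for EPERM, ECONNRESET, ENXIO, and ETOOMANYREFS. A hedged userspace sketch of enabling the option; TCP_KEEPALIVE_OFFLOAD is a private option, so the #ifdef guards against SDKs that do not expose it:

#include <errno.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Returns 0 on success, otherwise the errno value. */
static int
enable_ka_offload(int fd)
{
#ifdef TCP_KEEPALIVE_OFFLOAD
	int one = 1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE_OFFLOAD,
	    &one, sizeof(one)) == -1) {
		/*
		 * EPERM: missing PRIV_NETINET_TCP_KA_OFFLOAD;
		 * ENXIO: no bound/last-out interface yet;
		 * ETOOMANYREFS: interface offload slots exhausted.
		 */
		return errno;
	}
	return 0;
#else
	(void)fd;
	return ENOTSUP;                 /* option not exposed by this SDK */
#endif
}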
@@ -2203,6 +2301,10 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
                        break;
 
                case TCP_KEEPALIVE_OFFLOAD:
+                       if ((error = priv_check_cred(kauth_cred_get(),
+                           PRIV_NETINET_TCP_KA_OFFLOAD, 0)) != 0) {
+                               break;
+                       }
                        error = sooptcopyin(sopt, &optval, sizeof(optval),
                            sizeof(optval));
                        if (error) {
@@ -2213,9 +2315,10 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
                                break;
                        }
                        if (optval != 0) {
-                               inp->inp_flags2 |= INP2_KEEPALIVE_OFFLOAD;
+                               error = tcp_set_keep_alive_offload(so,
+                                   sopt->sopt_p);
                        } else {
-                               inp->inp_flags2 &= ~INP2_KEEPALIVE_OFFLOAD;
+                               tcp_clear_keep_alive_offload(so);
                        }
                        break;
 
@@ -2398,6 +2501,9 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
                        }
                        break;
                case TCP_FASTOPEN_FORCE_HEURISTICS:
+
+                       break;
+               case TCP_FASTOPEN_FORCE_ENABLE:
                        error = sooptcopyin(sopt, &optval, sizeof(optval),
                            sizeof(optval));
 
@@ -2414,9 +2520,9 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
                                break;
                        }
                        if (optval) {
-                               tp->t_flagsext |= TF_FASTOPEN_HEUR;
+                               tp->t_flagsext |= TF_FASTOPEN_FORCE_ENABLE;
                        } else {
-                               tp->t_flagsext &= ~TF_FASTOPEN_HEUR;
+                               tp->t_flagsext &= ~TF_FASTOPEN_FORCE_ENABLE;
                        }
 
                        break;
@@ -2600,7 +2706,10 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
                        optval = tfo_enabled(tp);
                        break;
                case TCP_FASTOPEN_FORCE_HEURISTICS:
-                       optval = (tp->t_flagsext & TF_FASTOPEN_HEUR) ? 1 : 0;
+                       optval = 0;
+                       break;
+               case TCP_FASTOPEN_FORCE_ENABLE:
+                       optval = (tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) ? 1 : 0;
                        break;
                case TCP_MEASURE_SND_BW:
                        optval = tp->t_flagsext & TF_MEASURESNDBW;
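Annotation: in the two hunks above, TCP_FASTOPEN_FORCE_HEURISTICS becomes a compatibility no-op (its getter now always reports 0) and the new TCP_FASTOPEN_FORCE_ENABLE takes over via TF_FASTOPEN_FORCE_ENABLE. A userspace sketch, again guarded because the option is private and is typically set before connect(2):

#include <errno.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int
force_enable_tfo(int fd)
{
#ifdef TCP_FASTOPEN_FORCE_ENABLE
	int one = 1;
	return setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_FORCE_ENABLE,
	           &one, sizeof(one)) == -1 ? errno : 0;
#else
	(void)fd;
	return ENOTSUP;
#endif
}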
@@ -2915,6 +3024,7 @@ tcp_usrclosed(struct tcpcb *tp)
                    struct tcpcb *, tp,
                    int32_t, TCPS_FIN_WAIT_1);
                tp->t_state = TCPS_FIN_WAIT_1;
+               TCP_LOG_CONNECTION_SUMMARY(tp);
                break;
 
        case TCPS_CLOSE_WAIT:
@@ -2923,6 +3033,7 @@ tcp_usrclosed(struct tcpcb *tp)
                    struct tcpcb *, tp,
                    int32_t, TCPS_LAST_ACK);
                tp->t_state = TCPS_LAST_ACK;
+               TCP_LOG_CONNECTION_SUMMARY(tp);
                break;
        }
        if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
@@ -2964,6 +3075,7 @@ tcp_out6_cksum_stats(u_int32_t len)
        tcpstat.tcps_snd6_swcsum++;
        tcpstat.tcps_snd6_swcsum_bytes += len;
 }
+#endif /* INET6 */
 
 /*
  * When messages are enabled on a TCP socket, the message priority
@@ -2973,6 +3085,7 @@ int
 tcp_get_msg_priority(struct mbuf *control, uint32_t *msgpri)
 {
        struct cmsghdr *cm;
+
        if (control == NULL) {
                return EINVAL;
        }
@@ -2994,4 +3107,33 @@ tcp_get_msg_priority(struct mbuf *control, uint32_t *msgpri)
        }
        return 0;
 }
-#endif /* INET6 */
+
+int
+tcp_get_mpkl_send_info(struct mbuf *control,
+    struct so_mpkl_send_info *mpkl_send_info)
+{
+       struct cmsghdr *cm;
+
+       if (control == NULL || mpkl_send_info == NULL) {
+               return EINVAL;
+       }
+
+       for (cm = M_FIRST_CMSGHDR(control); cm;
+           cm = M_NXT_CMSGHDR(control, cm)) {
+               if (cm->cmsg_len < sizeof(struct cmsghdr) ||
+                   cm->cmsg_len > control->m_len) {
+                       return EINVAL;
+               }
+               if (cm->cmsg_level != SOL_SOCKET ||
+                   cm->cmsg_type != SCM_MPKL_SEND_INFO) {
+                       continue;
+               }
+               if (cm->cmsg_len != CMSG_LEN(sizeof(struct so_mpkl_send_info))) {
+                       return EINVAL;
+               }
+               memcpy(mpkl_send_info, CMSG_DATA(cm),
+                   sizeof(struct so_mpkl_send_info));
+               return 0;
+       }
+       return ENOMSG;
+}
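Annotation: tcp_get_mpkl_send_info walks the control mbuf for a SOL_SOCKET/SCM_MPKL_SEND_INFO message whose length is exactly CMSG_LEN(sizeof(struct so_mpkl_send_info)), returning ENOMSG when none is present. A hedged userspace sketch of attaching that ancillary data with sendmsg(2); the constant and struct below are stand-ins carrying only the two fields the send path reads (mpkl_uuid, mpkl_proto), not the real private header:

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <uuid/uuid.h>

#ifndef SCM_MPKL_SEND_INFO
#define SCM_MPKL_SEND_INFO 0x08        /* placeholder; real value is private */
#endif

struct so_mpkl_send_info_stub {        /* illustrative layout only */
	uuid_t  mpkl_uuid;
	uint8_t mpkl_proto;
};

static ssize_t
send_with_mpkl_info(int fd, const void *buf, size_t len, uint8_t proto)
{
	struct so_mpkl_send_info_stub info;
	char cbuf[CMSG_SPACE(sizeof(info))];
	struct iovec iov = { .iov_base = (void *)(uintptr_t)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;

	memset(&info, 0, sizeof(info));
	uuid_generate(info.mpkl_uuid);
	info.mpkl_proto = proto;
	memset(cbuf, 0, sizeof(cbuf));

	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_MPKL_SEND_INFO;
	cm->cmsg_len = CMSG_LEN(sizeof(info));
	memcpy(CMSG_DATA(cm), &info, sizeof(info));

	return sendmsg(fd, &msg, 0);
}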
index e9fde2f3a051288343efcb681edf44a87bb12910..5358d21a094c768c234e1840a31e05da84e1dcfb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -380,6 +380,7 @@ struct tcpcb {
 /* Receiver state for stretch-ack algorithm */
        u_int32_t       rcv_unackwin;   /* to measure win for stretching acks */
        u_int32_t       rcv_by_unackwin; /* bytes seen during the last ack-stretching win */
+       u_int32_t       rcv_by_unackhalfwin;
        u_int32_t       rcv_nostrack_ts; /* timestamp when stretch ack was disabled automatically */
        u_int32_t       rcv_nostrack_pkts; /* pkts received since stretch ack was disabled */
        u_int16_t       rcv_waitforss;  /* wait for packets during slow-start */
@@ -449,7 +450,7 @@ struct tcpcb {
                u_int32_t       rxoutoforderbytes;
                u_int32_t       txretransmitbytes;
                u_int8_t        synrxtshift;
-               u_int8_t        unused;
+               u_int8_t        rxmitsyns;
                u_int16_t       unused_pad_to_8;
                u_int32_t       rxmitpkts;
        } t_stat;
@@ -479,10 +480,11 @@ struct tcpcb {
 #define TF_DISABLE_DSACK        0x40000         /* Ignore DSACK due to n/w duplication */
 #define TF_RESCUE_RXT           0x80000         /* SACK rescue retransmit */
 #define TF_CWND_NONVALIDATED    0x100000        /* cwnd non validated */
-#define TF_PROBING              0x200000        /* Trigger probe timeout */
+#define TF_IF_PROBING           0x200000        /* Trigger interface probe timeout */
 #define TF_FASTOPEN             0x400000        /* TCP Fastopen is enabled */
 #define TF_REASS_INPROG         0x800000        /* Reassembly is in progress */
-#define TF_FASTOPEN_HEUR        0x1000000       /* Make sure that heuristics get never skipped */
+#define TF_FASTOPEN_FORCE_ENABLE 0x1000000      /* Force-enable TCP Fastopen */
+#define TF_LOGGED_CONN_SUMMARY  0x2000000       /* Connection summary was logged */
 
 #if TRAFFIC_MGT
        /* Inter-arrival jitter related state */
@@ -621,6 +623,9 @@ struct tcpcb {
        u_int32_t       t_rxt_minimum_timeout;  /* minimum retransmit timeout in ms */
        uint32_t        t_challengeack_last;    /* last time challenge ACK was sent per sec */
        uint32_t        t_challengeack_count;   /* # of challenge ACKs already sent per sec */
+
+       u_int32_t       t_log_flags;            /* TCP logging flags */
+       u_int32_t       t_connect_time;         /* time when the connection started */
 };
 
 #define IN_FASTRECOVERY(tp)     (tp->t_flags & TF_FASTRECOVERY)
@@ -722,30 +727,9 @@ extern int tcprexmtthresh;
 #define TCP_AUTORCVBUF_MAX(_ifp_) (((_ifp_) != NULL && (IFNET_IS_CELLULAR((_ifp_))) && ((_ifp_)->if_eflags & IFEF_3CA)) ? \
                (tcp_autorcvbuf_max << 1) : tcp_autorcvbuf_max)
 
-enum tcp_cc_event {
-       TCP_CC_CWND_INIT,       /* 0 */
-       TCP_CC_INSEQ_ACK_RCVD,  /* 1 */
-       TCP_CC_ACK_RCVD,        /* 2 */
-       TCP_CC_ENTER_FASTRECOVERY, /* 3 */
-       TCP_CC_IN_FASTRECOVERY, /* 4 */
-       TCP_CC_EXIT_FASTRECOVERY,  /* 5 */
-       TCP_CC_PARTIAL_ACK,     /* 6 */
-       TCP_CC_IDLE_TIMEOUT,    /* 7 */
-       TCP_CC_REXMT_TIMEOUT,   /* 8 */
-       TCP_CC_ECN_RCVD,        /* 9 */
-       TCP_CC_BAD_REXMT_RECOVERY, /* 10 */
-       TCP_CC_OUTPUT_ERROR,    /* 11 */
-       TCP_CC_CHANGE_ALGO,     /* 12 */
-       TCP_CC_FLOW_CONTROL,    /* 13 */
-       TCP_CC_SUSPEND,         /* 14 */
-       TCP_CC_LIMITED_TRANSMIT, /* 15 */
-       TCP_CC_EARLY_RETRANSMIT, /* 16 */
-       TCP_CC_TLP_RECOVERY,    /* 17 */
-       TCP_CC_TLP_RECOVER_LASTPACKET, /* 18 */
-       TCP_CC_DELAY_FASTRECOVERY, /* 19 */
-       TCP_CC_TLP_IN_FASTRECOVERY, /* 20 */
-       TCP_CC_DSACK_BAD_REXMT  /* 21 */
-};
+#define TCP_IF_STATE_CHANGED(tp, probe_if_index)                        \
+       (probe_if_index > 0 && tp->t_inpcb->inp_last_outifp != NULL &&  \
+       probe_if_index == tp->t_inpcb->inp_last_outifp->if_index)
 
 /*
  * Structure to hold TCP options that are only used during segment
@@ -1205,6 +1189,10 @@ struct  tcpstat {
        u_int32_t       tcps_mptcp_back_to_wifi;        /* Total number of connections that succeed to move traffic away from cell (when starting on cell) */
        u_int32_t       tcps_mptcp_wifi_proxy;          /* Total number of new subflows that fell back to regular TCP on cell */
        u_int32_t       tcps_mptcp_cell_proxy;          /* Total number of new subflows that fell back to regular TCP on WiFi */
+
+       /* TCP offload statistics */
+       u_int32_t       tcps_ka_offload_drops;  /* Keep-alive offload drops due to timeouts reported by firmware */
+
        u_int32_t       tcps_mptcp_triggered_cell;      /* Total number of times an MPTCP-connection triggered cell bringup */
 };
 
@@ -1444,17 +1432,17 @@ struct  xtcpprogress_indicators {
        u_int64_t       xp_recentflows_rxooo;   /* Total of "recent" flows received out of order bytes */
        u_int64_t       xp_recentflows_rxdup;   /* Total of "recent" flows received duplicate bytes */
        u_int64_t       xp_recentflows_retx;    /* Total of "recent" flows retransmitted bytes */
-       u_int64_t       xp_reserved1;                   /* Expansion */
-       u_int64_t       xp_reserved2;                   /* Expansion */
-       u_int64_t       xp_reserved3;                   /* Expansion */
-       u_int64_t       xp_reserved4;                   /* Expansion */
+       u_int64_t       xp_reserved1;           /* Expansion */
+       u_int64_t       xp_reserved2;           /* Expansion */
+       u_int64_t       xp_reserved3;           /* Expansion */
+       u_int64_t       xp_reserved4;           /* Expansion */
 };
 
 struct tcpprogressreq {
-       u_int64_t       ifindex;                                /* Interface index for progress indicators */
+       u_int64_t       ifindex;                /* Interface index for progress indicators */
        u_int64_t       recentflow_maxduration; /* In mach_absolute_time, max duration for flow to be counted as "recent" */
-       u_int64_t       xp_reserved1;                   /* Expansion */
-       u_int64_t       xp_reserved2;                   /* Expansion */
+       u_int64_t       filter_flags;           /* Optional additional filtering, values are interface properties per ntstat.h */
+       u_int64_t       xp_reserved2;           /* Expansion */
 };
 
 #endif /* PRIVATE */
@@ -1504,6 +1492,8 @@ struct tcpprogressreq {
        { "v6mssdflt", CTLTYPE_INT }, \
 }
 
+extern int tcp_TCPTV_MIN;
+
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_net_inet_tcp);
 #endif /* SYSCTL_DECL */
@@ -1550,7 +1540,9 @@ struct tcp_respond_args {
        unsigned int nocell:1,
            noexpensive:1,
            awdl_unrestricted:1,
-           intcoproc_allowed:1;
+           intcoproc_allowed:1,
+           keep_alive:1,
+           noconstrained:1;
 };
 
 void     tcp_canceltimers(struct tcpcb *);
@@ -1660,8 +1652,11 @@ extern void tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable);
 extern void tcp_get_connectivity_status(struct tcpcb *,
     struct tcp_conn_status *);
 
+extern void tcp_clear_keep_alive_offload(struct socket *so);
 extern void tcp_fill_keepalive_offload_frames(struct ifnet *,
     struct ifnet_keepalive_offload_frame *, u_int32_t, size_t, u_int32_t *);
+extern int tcp_notify_kao_timeout(ifnet_t ifp,
+    struct ifnet_keepalive_offload_frame *frame);
 
 extern boolean_t tfo_enabled(const struct tcpcb *tp);
 extern void tcp_disable_tfo(struct tcpcb *tp);
@@ -1693,6 +1688,6 @@ extern void mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *t
 __private_extern__ void tcp_update_stats_per_flow(
        struct ifnet_stats_per_flow *, struct ifnet *);
 
-#endif /* BSD_KERNEL_RPIVATE */
+#endif /* BSD_KERNEL_PRIVATE */
 
 #endif /* _NETINET_TCP_VAR_H_ */
index 571afd2acd900f5f542a449259c3cdf804ad37b5..247e0180270b987f2756080ee088425a4fe09240 100644 (file)
@@ -596,7 +596,7 @@ udp_input(struct mbuf *m, int iphlen)
                        goto bad;
                }
 
-               /* free the extra copy of mbuf or skipped by IPSec */
+               /* free the extra copy of mbuf or skipped by IPsec */
                if (m != NULL) {
                        m_freem(m);
                }
@@ -607,13 +607,14 @@ udp_input(struct mbuf *m, int iphlen)
 #if IPSEC
        /*
         * UDP to port 4500 with a payload where the first four bytes are
-        * not zero is a UDP encapsulated IPSec packet. Packets where
+        * not zero is a UDP encapsulated IPsec packet. Packets where
         * the payload is one byte and that byte is 0xFF are NAT keepalive
-        * packets. Decapsulate the ESP packet and carry on with IPSec input
+        * packets. Decapsulate the ESP packet and carry on with IPsec input
         * or discard the NAT keep-alive.
         */
        if (ipsec_bypass == 0 && (esp_udp_encap_port & 0xFFFF) != 0 &&
-           uh->uh_dport == ntohs((u_short)esp_udp_encap_port)) {
+           (uh->uh_dport == ntohs((u_short)esp_udp_encap_port) ||
+           uh->uh_sport == ntohs((u_short)esp_udp_encap_port))) {
                int payload_len = len - sizeof(struct udphdr) > 4 ? 4 :
                    len - sizeof(struct udphdr);
 
@@ -643,7 +644,7 @@ udp_input(struct mbuf *m, int iphlen)
                        return;
                } else if (payload_len == 4 && *(u_int32_t *)(void *)
                    ((caddr_t)uh + sizeof(struct udphdr)) != 0) {
-                       /* UDP encapsulated IPSec packet to pass through NAT */
+                       /* UDP encapsulated IPsec packet to pass through NAT */
                        KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END,
                            0, 0, 0, 0, 0);
                        /* preserve the udp header */
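Annotation: the encapsulation check above now matches esp_udp_encap_port against either UDP port, since after NAT rewriting only one endpoint may still use the well-known port. A trivial standalone restatement of the predicate (ports from the header in network byte order, the configured port in host order):

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>

static bool
is_esp_udp_encap(uint16_t uh_sport, uint16_t uh_dport, uint16_t encap_port)
{
	if (encap_port == 0) {
		return false;           /* encapsulation disabled */
	}
	return uh_sport == htons(encap_port) || uh_dport == htons(encap_port);
}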
@@ -1571,6 +1572,9 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
        if (INP_NO_EXPENSIVE(inp)) {
                ipoa.ipoa_flags |=  IPOAF_NO_EXPENSIVE;
        }
+       if (INP_NO_CONSTRAINED(inp)) {
+               ipoa.ipoa_flags |=  IPOAF_NO_CONSTRAINED;
+       }
        if (INP_AWDL_UNRESTRICTED(inp)) {
                ipoa.ipoa_flags |=  IPOAF_AWDL_UNRESTRICTED;
        }
@@ -1948,6 +1952,9 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
        VERIFY(inp->inp_sndinprog_cnt > 0);
        if (--inp->inp_sndinprog_cnt == 0) {
                inp->inp_flags &= ~(INP_FC_FEEDBACK);
+               if (inp->inp_sndingprog_waiters > 0) {
+                       wakeup(&inp->inp_sndinprog_cnt);
+               }
        }
 
        /* Synchronize PCB cached route */
@@ -2008,7 +2015,7 @@ abort:
         * denied access to it, generate an event.
         */
        if (error != 0 && (ipoa.ipoa_retflags & IPOARF_IFDENIED) &&
-           (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp))) {
+           (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp) || INP_NO_CONSTRAINED(inp))) {
                soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED));
        }
 
index 9bccd060b222248702decc3247629c7fc5d49f47..c1b816806bd05aacf7b61483aa48eaa17c061991 100644 (file)
@@ -24,7 +24,7 @@ PRIVATE_DATAFILES = \
 PRIVATE_KERNELFILES = \
        ah6.h esp6.h esp_rijndael.h esp_chachapoly.h \
        in6_gif.h in6_ifattach.h ip6_ecn.h ip6protosw.h \
-       ipcomp6.h ipsec6.h tcp6_var.h udp6_var.h
+       ipsec6.h tcp6_var.h udp6_var.h
 
 INSTALL_MI_LIST        = ${DATAFILES}
 
index cd503bea625c646a26a6a55be98cab8790643cab..578e06257194d3cba98a24abe750bbdfb54f4a6c 100644 (file)
@@ -164,37 +164,37 @@ const struct ah_algorithm *
 ah_algorithm_lookup(int idx)
 {
        /* checksum algorithms */
-       static struct ah_algorithm hmac_md5 =
+       static const struct ah_algorithm hmac_md5 =
        { ah_sumsiz_1216, ah_hmac_md5_mature, 128, 128, "hmac-md5",
          ah_hmac_md5_init, ah_hmac_md5_loop,
          ah_hmac_md5_result, };
-       static struct ah_algorithm keyed_md5 =
+       static const struct ah_algorithm keyed_md5 =
        { ah_sumsiz_1216, ah_keyed_md5_mature, 128, 128, "keyed-md5",
          ah_keyed_md5_init, ah_keyed_md5_loop,
          ah_keyed_md5_result, };
-       static struct ah_algorithm hmac_sha1 =
+       static const struct ah_algorithm hmac_sha1 =
        { ah_sumsiz_1216, ah_hmac_sha1_mature, 160, 160, "hmac-sha1",
          ah_hmac_sha1_init, ah_hmac_sha1_loop,
          ah_hmac_sha1_result, };
-       static struct ah_algorithm keyed_sha1 =
+       static const struct ah_algorithm keyed_sha1 =
        { ah_sumsiz_1216, ah_keyed_sha1_mature, 160, 160, "keyed-sha1",
          ah_keyed_sha1_init, ah_keyed_sha1_loop,
          ah_keyed_sha1_result, };
-       static struct ah_algorithm ah_none =
+       static const struct ah_algorithm ah_none =
        { ah_sumsiz_zero, ah_none_mature, 0, 2048, "none",
          ah_none_init, ah_none_loop, ah_none_result, };
 #if AH_ALL_CRYPTO
-       static struct ah_algorithm hmac_sha2_256 =
+       static const struct ah_algorithm hmac_sha2_256 =
        { ah_sumsiz_sha2_256, ah_hmac_sha2_256_mature, 256, 256,
          "hmac-sha2-256",
          ah_hmac_sha2_256_init, ah_hmac_sha2_256_loop,
          ah_hmac_sha2_256_result, };
-       static struct ah_algorithm hmac_sha2_384 =
+       static const struct ah_algorithm hmac_sha2_384 =
        { ah_sumsiz_sha2_384, ah_hmac_sha2_384_mature, 384, 384,
          "hmac-sha2-384",
          ah_hmac_sha2_384_init, ah_hmac_sha2_384_loop,
          ah_hmac_sha2_384_result, };
-       static struct ah_algorithm hmac_sha2_512 =
+       static const struct ah_algorithm hmac_sha2_512 =
        { ah_sumsiz_sha2_512, ah_hmac_sha2_512_mature, 512, 512,
          "hmac-sha2-512",
          ah_hmac_sha2_512_init, ah_hmac_sha2_512_loop,
index 2a67501f6d7079c94af621210d4d20334bd92e31..104f5a9c68c446440bd59bc30981ce27b2b4f3ff 100644 (file)
 
 #define IPLEN_FLIPPED
 
+extern lck_mtx_t  *sadb_mutex;
+
 #if INET
 void
 ah4_input(struct mbuf *m, int off)
@@ -263,8 +265,8 @@ ah4_input(struct mbuf *m, int off)
        /*
         * check for sequence number.
         */
-       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) {
-               if (ipsec_chkreplay(ntohl(((struct newah *)ah)->ah_seq), sav)) {
+       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] != NULL) {
+               if (ipsec_chkreplay(ntohl(((struct newah *)ah)->ah_seq), sav, 0)) {
                        ; /*okey*/
                } else {
                        IPSEC_STAT_INCREMENT(ipsecstat.in_ahreplay);
@@ -386,8 +388,8 @@ ah4_input(struct mbuf *m, int off)
        /*
         * update sequence number.
         */
-       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) {
-               if (ipsec_updatereplay(ntohl(((struct newah *)ah)->ah_seq), sav)) {
+       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] != NULL) {
+               if (ipsec_updatereplay(ntohl(((struct newah *)ah)->ah_seq), sav, 0)) {
                        IPSEC_STAT_INCREMENT(ipsecstat.in_ahreplay);
                        goto fail;
                }
@@ -499,9 +501,18 @@ ah4_input(struct mbuf *m, int off)
                        IFA_REMREF(ifa);
                }
 
-               // Input via IPSec interface
-               if (sav->sah->ipsec_if != NULL) {
-                       if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) {
+               // Input via IPsec interface
+               lck_mtx_lock(sadb_mutex);
+               ifnet_t ipsec_if = sav->sah->ipsec_if;
+               if (ipsec_if != NULL) {
+                       // If an interface is found, add a reference count before dropping the lock
+                       ifnet_reference(ipsec_if);
+               }
+               lck_mtx_unlock(sadb_mutex);
+               if (ipsec_if != NULL) {
+                       errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m);
+                       ifnet_release(ipsec_if);
+                       if (inject_error == 0) {
                                m = NULL;
                                goto done;
                        } else {
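Annotation: this hunk introduces the pattern repeated throughout the rest of this file and in the ESP input paths below: copy sav->sah->ipsec_if and take an ifnet reference while sadb_mutex is held, drop the lock, then use and release the interface. That closes the window in which the SA could lose its interface pointer between the NULL check and the use. A self-contained analog of the lifetime pattern using pthreads and a plain refcount (all names illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct object { atomic_int refcnt; };

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
static struct object *registry_slot;   /* may be swapped concurrently */

static void
object_reference(struct object *o)
{
	atomic_fetch_add(&o->refcnt, 1);
}

static void
object_release(struct object *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1) {
		free(o);
	}
}

static void
use_current_object(void)
{
	/* Take the reference under the lock so the object cannot be
	 * freed between lookup and use. */
	pthread_mutex_lock(&registry_lock);
	struct object *o = registry_slot;
	if (o != NULL) {
		object_reference(o);
	}
	pthread_mutex_unlock(&registry_lock);

	if (o != NULL) {
		/* ... use o without holding registry_lock ... */
		object_release(o);
	}
}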
@@ -555,13 +566,22 @@ ah4_input(struct mbuf *m, int off)
                    struct ip *, ip, struct ip6_hdr *, NULL);
 
                if (nxt != IPPROTO_DONE) {
-                       // Input via IPSec interface
-                       if (sav->sah->ipsec_if != NULL) {
+                       // Input via IPsec interface
+                       lck_mtx_lock(sadb_mutex);
+                       ifnet_t ipsec_if = sav->sah->ipsec_if;
+                       if (ipsec_if != NULL) {
+                               // If an interface is found, add a reference count before dropping the lock
+                               ifnet_reference(ipsec_if);
+                       }
+                       lck_mtx_unlock(sadb_mutex);
+                       if (ipsec_if != NULL) {
                                ip->ip_len = htons(ip->ip_len + hlen);
                                ip->ip_off = htons(ip->ip_off);
                                ip->ip_sum = 0;
                                ip->ip_sum = ip_cksum_hdr_in(m, hlen);
-                               if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) {
+                               errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m);
+                               ifnet_release(ipsec_if);
+                               if (inject_error == 0) {
                                        m = NULL;
                                        goto done;
                                } else {
@@ -709,8 +729,8 @@ ah6_input(struct mbuf **mp, int *offp, int proto)
        /*
         * check for sequence number.
         */
-       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) {
-               if (ipsec_chkreplay(ntohl(((struct newah *)ah)->ah_seq), sav)) {
+       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] != NULL) {
+               if (ipsec_chkreplay(ntohl(((struct newah *)ah)->ah_seq), sav, 0)) {
                        ; /*okey*/
                } else {
                        IPSEC_STAT_INCREMENT(ipsec6stat.in_ahreplay);
@@ -815,8 +835,8 @@ ah6_input(struct mbuf **mp, int *offp, int proto)
        /*
         * update sequence number.
         */
-       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) {
-               if (ipsec_updatereplay(ntohl(((struct newah *)ah)->ah_seq), sav)) {
+       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] != NULL) {
+               if (ipsec_updatereplay(ntohl(((struct newah *)ah)->ah_seq), sav, 0)) {
                        IPSEC_STAT_INCREMENT(ipsec6stat.in_ahreplay);
                        goto fail;
                }
@@ -907,9 +927,18 @@ ah6_input(struct mbuf **mp, int *offp, int proto)
                        IFA_REMREF(ifa);
                }
 
-               // Input via IPSec interface
-               if (sav->sah->ipsec_if != NULL) {
-                       if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) {
+               // Input via IPsec interface
+               lck_mtx_lock(sadb_mutex);
+               ifnet_t ipsec_if = sav->sah->ipsec_if;
+               if (ipsec_if != NULL) {
+                       // If an interface is found, add a reference count before dropping the lock
+                       ifnet_reference(ipsec_if);
+               }
+               lck_mtx_unlock(sadb_mutex);
+               if (ipsec_if != NULL) {
+                       errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m);
+                       ifnet_release(ipsec_if);
+                       if (inject_error == 0) {
                                m = NULL;
                                nxt = IPPROTO_DONE;
                                goto done;
@@ -955,9 +984,18 @@ ah6_input(struct mbuf **mp, int *offp, int proto)
                        goto fail;
                }
 
-               // Input via IPSec interface
-               if (sav->sah->ipsec_if != NULL) {
-                       if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) {
+               // Input via IPsec interface
+               lck_mtx_lock(sadb_mutex);
+               ifnet_t ipsec_if = sav->sah->ipsec_if;
+               if (ipsec_if != NULL) {
+                       // If an interface is found, add a reference count before dropping the lock
+                       ifnet_reference(ipsec_if);
+               }
+               lck_mtx_unlock(sadb_mutex);
+               if (ipsec_if != NULL) {
+                       errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m);
+                       ifnet_release(ipsec_if);
+                       if (inject_error == 0) {
                                m = NULL;
                                nxt = IPPROTO_DONE;
                                goto done;
index d41e2f6796c5b3fff8754ee99f9dbca3ff6b630b..10f39472bcbcfacb1fc44f22a6fb1ea394b0a88c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -197,12 +197,12 @@ ah4_output(struct mbuf *m, struct secasvar *sav)
        size_t plen = 0;        /*AH payload size in bytes*/
        size_t ahlen = 0;       /*plen + sizeof(ah)*/
        struct ip *ip;
-       struct in_addr dst = { 0 };
+       struct in_addr dst = { .s_addr = 0 };
        struct in_addr *finaldst;
        int error;
 
        /* sanity checks */
-       if ((sav->flags & SADB_X_EXT_OLD) == 0 && !sav->replay) {
+       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] == NULL) {
                ip = mtod(m, struct ip *);
                ipseclog((LOG_DEBUG, "ah4_output: internal error: "
                    "sav->replay is null: %x->%x, SPI=%u\n",
@@ -295,7 +295,7 @@ ah4_output(struct mbuf *m, struct secasvar *sav)
                ahdr->ah_nxt = ip->ip_p;
                ahdr->ah_reserve = htons(0);
                ahdr->ah_spi = spi;
-               if (sav->replay->count == ~0) {
+               if (sav->replay[0]->count == ~0) {
                        if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) {
                                /* XXX Is it noisy ? */
                                ipseclog((LOG_WARNING,
@@ -307,13 +307,13 @@ ah4_output(struct mbuf *m, struct secasvar *sav)
                        }
                }
                lck_mtx_lock(sadb_mutex);
-               sav->replay->count++;
+               sav->replay[0]->count++;
                lck_mtx_unlock(sadb_mutex);
                /*
                 * XXX sequence number must not be cycled, if the SA is
                 * installed by IKE daemon.
                 */
-               ahdr->ah_seq = htonl(sav->replay->count);
+               ahdr->ah_seq = htonl(sav->replay[0]->count);
                bzero(ahdr + 1, plen);
        }
 
@@ -461,7 +461,7 @@ ah6_output(struct mbuf *m, u_char *nexthdrp, struct mbuf *md,
        ip6 = mtod(m, struct ip6_hdr *);
        ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
 
-       if ((sav->flags & SADB_X_EXT_OLD) == 0 && !sav->replay) {
+       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[0] == NULL) {
                ipseclog((LOG_DEBUG, "ah6_output: internal error: "
                    "sav->replay is null: SPI=%u\n",
                    (u_int32_t)ntohl(sav->spi)));
@@ -504,7 +504,7 @@ ah6_output(struct mbuf *m, u_char *nexthdrp, struct mbuf *md,
                ahdr->ah_len = (plen >> 2) + 1; /* plus one for seq# */
                ahdr->ah_reserve = htons(0);
                ahdr->ah_spi = spi;
-               if (sav->replay->count == ~0) {
+               if (sav->replay[0]->count == ~0) {
                        if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) {
                                /* XXX Is it noisy ? */
                                ipseclog((LOG_WARNING,
@@ -516,13 +516,13 @@ ah6_output(struct mbuf *m, u_char *nexthdrp, struct mbuf *md,
                        }
                }
                lck_mtx_lock(sadb_mutex);
-               sav->replay->count++;
+               sav->replay[0]->count++;
                lck_mtx_unlock(sadb_mutex);
                /*
                 * XXX sequence number must not be cycled, if the SA is
                 * installed by IKE daemon.
                 */
-               ahdr->ah_seq = htonl(sav->replay->count);
+               ahdr->ah_seq = htonl(sav->replay[0]->count);
                bzero(ahdr + 1, plen);
        }
 
index c49d5eff3b3aeabfea4ba33c9f8033d190171342..c72ac1c725b5685da0d81322f15c218c472b9d8f 100644 (file)
@@ -65,7 +65,7 @@
 #ifndef _NETINET6_ESP_H_
 #define _NETINET6_ESP_H_
 #include <sys/appleapiopts.h>
-
+#include <net/multi_layer_pkt_log.h>
 
 struct esp {
        u_int32_t       esp_spi;        /* ESP */
@@ -123,6 +123,8 @@ struct esp_algorithm {
        int (*finalizeencrypt)(struct secasvar *, u_int8_t *, uint);
 };
 
+extern os_log_t esp_mpkl_log_object;
+
 extern const struct esp_algorithm *esp_algorithm_lookup(int);
 extern int esp_max_ivlen(void);
 
@@ -135,6 +137,8 @@ extern size_t esp_hdrsiz(struct ipsecrequest *);
 extern int esp_schedule(const struct esp_algorithm *, struct secasvar *);
 extern int esp_auth(struct mbuf *, size_t, size_t,
     struct secasvar *, u_char *);
+
+extern void esp_init(void);
 #endif /* BSD_KERNEL_PRIVATE */
 
 #endif /* _NETINET6_ESP_H_ */
index c5448b8a5534f8f84aa6d74c26c9e42b64c8d415..17bd8e242007cd75c1a75ef10a61ecb2b8abdc66 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define MAX_SBUF_LEN            2000
 
 extern lck_mtx_t *sadb_mutex;
+os_log_t esp_mpkl_log_object = NULL;
 
 static int esp_null_mature(struct secasvar *);
 static int esp_null_decrypt(struct mbuf *, size_t,
@@ -151,47 +152,119 @@ static int esp_gcm_mature(struct secasvar *);
 #define ESP_AESGCM_KEYLEN192 224 // 24-bytes key + 4 bytes salt
 #define ESP_AESGCM_KEYLEN256 288 // 32-bytes key + 4 bytes salt
 
-static const struct esp_algorithm des_cbc =
-{ 8, -1, esp_descbc_mature, 64, 64, esp_des_schedlen,
-  "des-cbc",
-  esp_descbc_ivlen, esp_cbc_decrypt,
-  esp_cbc_encrypt, esp_des_schedule,
-  esp_des_blockdecrypt, esp_des_blockencrypt,
-  0, 0, 0 };
-static const struct esp_algorithm des3_cbc =
-{ 8, 8, esp_cbc_mature, 192, 192, esp_3des_schedlen,
-  "3des-cbc",
-  esp_common_ivlen, esp_cbc_decrypt,
-  esp_cbc_encrypt, esp_3des_schedule,
-  esp_3des_blockdecrypt, esp_3des_blockencrypt,
-  0, 0, 0 };
-static const struct esp_algorithm null_esp =
-{ 1, 0, esp_null_mature, 0, 2048, 0, "null",
-  esp_common_ivlen, esp_null_decrypt,
-  esp_null_encrypt, NULL, NULL, NULL,
-  0, 0, 0 };
-static const struct esp_algorithm aes_cbc =
-{ 16, 16, esp_cbc_mature, 128, 256, esp_aes_schedlen,
-  "aes-cbc",
-  esp_common_ivlen, esp_cbc_decrypt_aes,
-  esp_cbc_encrypt_aes, esp_aes_schedule,
-  0, 0,
-  0, 0, 0 };
-static const struct esp_algorithm aes_gcm =
-{ 4, 8, esp_gcm_mature, ESP_AESGCM_KEYLEN128, ESP_AESGCM_KEYLEN256, esp_gcm_schedlen,
-  "aes-gcm",
-  esp_common_ivlen, esp_gcm_decrypt_aes,
-  esp_gcm_encrypt_aes, esp_gcm_schedule,
-  0, 0,
-  16, esp_gcm_decrypt_finalize, esp_gcm_encrypt_finalize};
-static const struct esp_algorithm chacha_poly =
-{ ESP_CHACHAPOLY_PAD_BOUND, ESP_CHACHAPOLY_IV_LEN,
-  esp_chachapoly_mature, ESP_CHACHAPOLY_KEYBITS_WITH_SALT,
-  ESP_CHACHAPOLY_KEYBITS_WITH_SALT, esp_chachapoly_schedlen,
-  "chacha-poly", esp_chachapoly_ivlen, esp_chachapoly_decrypt,
-  esp_chachapoly_encrypt, esp_chachapoly_schedule,
-  NULL, NULL, ESP_CHACHAPOLY_ICV_LEN,
-  esp_chachapoly_decrypt_finalize, esp_chachapoly_encrypt_finalize};
+static const struct esp_algorithm des_cbc = {
+       .padbound = 8,
+       .ivlenval = -1,
+       .mature = esp_descbc_mature,
+       .keymin = 64,
+       .keymax = 64,
+       .schedlen = esp_des_schedlen,
+       .name = "des-cbc",
+       .ivlen = esp_descbc_ivlen,
+       .decrypt = esp_cbc_decrypt,
+       .encrypt = esp_cbc_encrypt,
+       .schedule = esp_des_schedule,
+       .blockdecrypt = esp_des_blockdecrypt,
+       .blockencrypt = esp_des_blockencrypt,
+       .icvlen = 0,
+       .finalizedecrypt = NULL,
+       .finalizeencrypt = NULL
+};
+
+static const struct esp_algorithm des3_cbc = {
+       .padbound = 8,
+       .ivlenval = 8,
+       .mature = esp_cbc_mature,
+       .keymin = 192,
+       .keymax = 192,
+       .schedlen = esp_3des_schedlen,
+       .name = "3des-cbc",
+       .ivlen = esp_common_ivlen,
+       .decrypt = esp_cbc_decrypt,
+       .encrypt = esp_cbc_encrypt,
+       .schedule = esp_3des_schedule,
+       .blockdecrypt = esp_3des_blockdecrypt,
+       .blockencrypt = esp_3des_blockencrypt,
+       .icvlen = 0,
+       .finalizedecrypt = NULL,
+       .finalizeencrypt = NULL
+};
+
+static const struct esp_algorithm null_esp = {
+       .padbound = 1,
+       .ivlenval = 0,
+       .mature = esp_null_mature,
+       .keymin = 0,
+       .keymax = 2048,
+       .schedlen = NULL,
+       .name = "null",
+       .ivlen = esp_common_ivlen,
+       .decrypt = esp_null_decrypt,
+       .encrypt = esp_null_encrypt,
+       .schedule = NULL,
+       .blockdecrypt = NULL,
+       .blockencrypt = NULL,
+       .icvlen = 0,
+       .finalizedecrypt = NULL,
+       .finalizeencrypt = NULL
+};
+
+static const struct esp_algorithm aes_cbc = {
+       .padbound = 16,
+       .ivlenval = 16,
+       .mature = esp_cbc_mature,
+       .keymin = 128,
+       .keymax = 256,
+       .schedlen = esp_aes_schedlen,
+       .name = "aes-cbc",
+       .ivlen = esp_common_ivlen,
+       .decrypt = esp_cbc_decrypt_aes,
+       .encrypt = esp_cbc_encrypt_aes,
+       .schedule = esp_aes_schedule,
+       .blockdecrypt = NULL,
+       .blockencrypt = NULL,
+       .icvlen = 0,
+       .finalizedecrypt = NULL,
+       .finalizeencrypt = NULL
+};
+
+static const struct esp_algorithm aes_gcm = {
+       .padbound = 4,
+       .ivlenval = 8,
+       .mature = esp_gcm_mature,
+       .keymin = ESP_AESGCM_KEYLEN128,
+       .keymax = ESP_AESGCM_KEYLEN256,
+       .schedlen = esp_gcm_schedlen,
+       .name = "aes-gcm",
+       .ivlen = esp_common_ivlen,
+       .decrypt = esp_gcm_decrypt_aes,
+       .encrypt = esp_gcm_encrypt_aes,
+       .schedule = esp_gcm_schedule,
+       .blockdecrypt = NULL,
+       .blockencrypt = NULL,
+       .icvlen = 16,
+       .finalizedecrypt = esp_gcm_decrypt_finalize,
+       .finalizeencrypt = esp_gcm_encrypt_finalize
+};
+
+static const struct esp_algorithm chacha_poly = {
+       .padbound = ESP_CHACHAPOLY_PAD_BOUND,
+       .ivlenval = ESP_CHACHAPOLY_IV_LEN,
+       .mature = esp_chachapoly_mature,
+       .keymin = ESP_CHACHAPOLY_KEYBITS_WITH_SALT,
+       .keymax = ESP_CHACHAPOLY_KEYBITS_WITH_SALT,
+       .schedlen = esp_chachapoly_schedlen,
+       .name = "chacha-poly",
+       .ivlen = esp_chachapoly_ivlen,
+       .decrypt = esp_chachapoly_decrypt,
+       .encrypt = esp_chachapoly_encrypt,
+       .schedule = esp_chachapoly_schedule,
+       .blockdecrypt = NULL,
+       .blockencrypt = NULL,
+       .icvlen = ESP_CHACHAPOLY_ICV_LEN,
+       .finalizedecrypt = esp_chachapoly_decrypt_finalize,
+       .finalizeencrypt = esp_chachapoly_encrypt_finalize
+};
 
 static const struct esp_algorithm *esp_algorithms[] = {
        &des_cbc,
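Annotation: the esp_algorithm tables above were converted from positional aggregate initializers to C99 designated initializers: every value stays attached to its field name, NULL members are explicit, and the initializers survive reordering or insertion of struct fields. A minimal illustration of the difference:

#include <stdio.h>

struct algo {
	int         padbound;
	int         icvlen;
	const char *name;
};

/* The positional form silently misassigns if fields are ever reordered... */
static const struct algo positional = { 8, 0, "des-cbc" };

/* ...while the designated form stays correct and self-documenting. */
static const struct algo designated = {
	.padbound = 8,
	.icvlen   = 0,
	.name     = "des-cbc",
};

int
main(void)
{
	printf("%s pad %d icv %d\n", designated.name,
	    designated.padbound, designated.icvlen);
	return positional.padbound == designated.padbound ? 0 : 1;
}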
@@ -425,9 +498,8 @@ esp_des_blockdecrypt(
 {
        /* assumption: d has a good alignment */
        bcopy(s, d, sizeof(DES_LONG) * 2);
-       des_ecb_encrypt((des_cblock *)d, (des_cblock *)d,
-           (des_ecb_key_schedule *)sav->sched, DES_DECRYPT);
-       return 0;
+       return des_ecb_encrypt((des_cblock *)d, (des_cblock *)d,
+                  (des_ecb_key_schedule *)sav->sched, DES_DECRYPT);
 }
 
 static int
@@ -439,9 +511,8 @@ esp_des_blockencrypt(
 {
        /* assumption: d has a good alignment */
        bcopy(s, d, sizeof(DES_LONG) * 2);
-       des_ecb_encrypt((des_cblock *)d, (des_cblock *)d,
-           (des_ecb_key_schedule *)sav->sched, DES_ENCRYPT);
-       return 0;
+       return des_ecb_encrypt((des_cblock *)d, (des_cblock *)d,
+                  (des_ecb_key_schedule *)sav->sched, DES_ENCRYPT);
 }
 
 static int
@@ -597,9 +668,8 @@ esp_3des_blockdecrypt(
 {
        /* assumption: d has a good alignment */
        bcopy(s, d, sizeof(DES_LONG) * 2);
-       des3_ecb_encrypt((des_cblock *)d, (des_cblock *)d,
-           (des3_ecb_key_schedule *)sav->sched, DES_DECRYPT);
-       return 0;
+       return des3_ecb_encrypt((des_cblock *)d, (des_cblock *)d,
+                  (des3_ecb_key_schedule *)sav->sched, DES_DECRYPT);
 }
 
 static int
@@ -611,9 +681,8 @@ esp_3des_blockencrypt(
 {
        /* assumption: d has a good alignment */
        bcopy(s, d, sizeof(DES_LONG) * 2);
-       des3_ecb_encrypt((des_cblock *)d, (des_cblock *)d,
-           (des3_ecb_key_schedule *)sav->sched, DES_ENCRYPT);
-       return 0;
+       return des3_ecb_encrypt((des_cblock *)d, (des_cblock *)d,
+                  (des3_ecb_key_schedule *)sav->sched, DES_ENCRYPT);
 }
 
 static int
@@ -1206,3 +1275,22 @@ esp_auth(
        KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 6, 0, 0, 0, 0);
        return 0;
 }
+
+void
+esp_init(void)
+{
+       static int esp_initialized = 0;
+
+       if (esp_initialized) {
+               return;
+       }
+
+       esp_initialized = 1;
+
+       esp_mpkl_log_object = MPKL_CREATE_LOGOBJECT("com.apple.xnu.esp");
+       if (esp_mpkl_log_object == NULL) {
+               panic("MPKL_CREATE_LOGOBJECT for ESP failed");
+       }
+
+       return;
+}
index 36311e312b6239f12d5b80c0a6362c659bf7df88..f53236153f7f09ecf9d1a74f7985777e1ecd2d94 100644 (file)
@@ -58,6 +58,8 @@
  * SUCH DAMAGE.
  */
 
+#define _IP_VHL
+
 /*
  * RFC1827/2406 Encapsulated Security Payload.
  */
@@ -89,6 +91,8 @@
 #include <netinet/ip_ecn.h>
 #include <netinet/in_pcb.h>
 #include <netinet/udp.h>
+#include <netinet/tcp.h>
+#include <netinet/in_tclass.h>
 #if INET6
 #include <netinet6/ip6_ecn.h>
 #endif
@@ -174,6 +178,40 @@ esp6_input_strip_udp_encap(struct mbuf *m, int ip6hlen)
        return ip6;
 }
 
+static void
+esp_input_log(struct mbuf *m, struct secasvar *sav, u_int32_t spi, u_int32_t seq)
+{
+       if (net_mpklog_enabled &&
+           (sav->sah->ipsec_if->if_xflags & IFXF_MPK_LOG) == IFXF_MPK_LOG) {
+               struct tcphdr th = {};
+               size_t iphlen = 0;
+               u_int32_t proto_len = 0;
+               u_int8_t proto = 0;
+
+               struct ip *inner_ip = mtod(m, struct ip *);
+               if (IP_VHL_V(inner_ip->ip_vhl) == 4) {
+                       iphlen = IP_VHL_HL(inner_ip->ip_vhl) << 2;
+                       proto = inner_ip->ip_p;
+               } else if (IP_VHL_V(inner_ip->ip_vhl) == 6) {
+                       struct ip6_hdr *inner_ip6 = mtod(m, struct ip6_hdr *);
+                       iphlen = sizeof(struct ip6_hdr);
+                       proto = inner_ip6->ip6_nxt;
+               }
+
+               if (proto == IPPROTO_TCP) {
+                       if ((int)(iphlen + sizeof(th)) <= m->m_pkthdr.len) {
+                               m_copydata(m, iphlen, sizeof(th), (u_int8_t *)&th);
+                       }
+
+                       proto_len = m->m_pkthdr.len - iphlen - (th.th_off << 2);
+                       MPKL_ESP_INPUT_TCP(esp_mpkl_log_object,
+                           ntohl(spi), seq,
+                           ntohs(th.th_sport), ntohs(th.th_dport),
+                           ntohl(th.th_seq), proto_len);
+               }
+       }
+}
+
 void
 esp4_input(struct mbuf *m, int off)
 {
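Annotation: esp_input_log distinguishes the decrypted inner packet by the version nibble that IP_VHL_V extracts from the first header byte (hence the new #define _IP_VHL at the top of the file), then copies the TCP header out with m_copydata before computing the payload length. A standalone sketch of the same version dispatch on a raw buffer; the macro bodies mirror the classic BSD definitions:

#include <stddef.h>
#include <stdint.h>

#define IP_VHL_V(vhl)   ((vhl) >> 4)    /* version nibble */
#define IP_VHL_HL(vhl)  ((vhl) & 0x0f)  /* header length, 32-bit words */

/* Inner IP header length in bytes, or 0 if unrecognized. */
static size_t
inner_ip_hdrlen(const uint8_t *pkt, size_t len)
{
	if (len < 1) {
		return 0;
	}
	switch (IP_VHL_V(pkt[0])) {
	case 4:
		return (size_t)IP_VHL_HL(pkt[0]) << 2;  /* includes options */
	case 6:
		return 40;                              /* fixed ip6_hdr size */
	default:
		return 0;
	}
}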
@@ -200,6 +238,7 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface)
        size_t esplen;
        sa_family_t     ifamily;
        struct mbuf *out_m = NULL;
+       mbuf_traffic_class_t traffic_class = 0;
 
        KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_START, 0, 0, 0, 0, 0);
        /* sanity check for alignment. */
@@ -248,8 +287,8 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface)
            (caddr_t)&ip->ip_src, (caddr_t)&ip->ip_dst,
            IPPROTO_ESP, spi, interface)) == 0) {
                ipseclog((LOG_WARNING,
-                   "IPv4 ESP input: no key association found for spi %u\n",
-                   (u_int32_t)ntohl(spi)));
+                   "IPv4 ESP input: no key association found for spi %u (0x%08x)\n",
+                   (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi)));
                IPSEC_STAT_INCREMENT(ipsecstat.in_nosa);
                goto bad;
        }
@@ -259,16 +298,16 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface)
        if (sav->state != SADB_SASTATE_MATURE
            && sav->state != SADB_SASTATE_DYING) {
                ipseclog((LOG_DEBUG,
-                   "IPv4 ESP input: non-mature/dying SA found for spi %u\n",
-                   (u_int32_t)ntohl(spi)));
+                   "IPv4 ESP input: non-mature/dying SA found for spi %u (0x%08x)\n",
+                   (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi)));
                IPSEC_STAT_INCREMENT(ipsecstat.in_badspi);
                goto bad;
        }
        algo = esp_algorithm_lookup(sav->alg_enc);
        if (!algo) {
                ipseclog((LOG_DEBUG, "IPv4 ESP input: "
-                   "unsupported encryption algorithm for spi %u\n",
-                   (u_int32_t)ntohl(spi)));
+                   "unsupported encryption algorithm for spi %u (0x%08x)\n",
+                   (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi)));
                IPSEC_STAT_INCREMENT(ipsecstat.in_badspi);
                goto bad;
        }
@@ -284,6 +323,12 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface)
 
        seq = ntohl(((struct newesp *)esp)->esp_seq);
 
+       if ((sav->flags2 & SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) ==
+           SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) {
+               u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT;
+               traffic_class = rfc4594_dscp_to_tc(dscp);
+       }
+
        /* Save ICV from packet for verification later */
        size_t siz = 0;
        unsigned char saved_icv[AH_MAXSUMSIZE];
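Annotation: when SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS is negotiated, the SA now keeps one anti-replay window per traffic class and the input path indexes sav->replay[] by the class derived from the packet's DSCP; traffic_class stays 0 otherwise, which is why every former sav->replay check in these hunks became sav->replay[0] or sav->replay[traffic_class]. A structural sketch of that indexing (window internals elided, table size assumed):

#include <stdint.h>

#define MAX_TRAFFIC_CLASSES 4           /* assumed table size */
#define SA2_SEQ_PER_TC      0x1         /* stand-in for the SADB flag */

struct replay_window;                   /* per-class anti-replay state */

struct sa_stub {
	uint32_t              flags2;
	struct replay_window *replay[MAX_TRAFFIC_CLASSES];
};

/* Pick the replay window for a packet, falling back to class 0
 * when per-class sequencing is not negotiated. */
static struct replay_window *
replay_for_packet(struct sa_stub *sav, uint8_t traffic_class)
{
	uint8_t tc = 0;

	if ((sav->flags2 & SA2_SEQ_PER_TC) != 0 &&
	    traffic_class < MAX_TRAFFIC_CLASSES) {
		tc = traffic_class;
	}
	return sav->replay[tc];
}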
@@ -293,8 +338,8 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface)
                goto delay_icv;
        }
 
-       if (!((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay
-           && (sav->alg_auth && sav->key_auth))) {
+       if (!((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[traffic_class] != NULL &&
+           (sav->alg_auth && sav->key_auth))) {
                goto noreplaycheck;
        }
 
@@ -306,7 +351,7 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface)
        /*
         * check for sequence number.
         */
-       if (ipsec_chkreplay(seq, sav)) {
+       if (ipsec_chkreplay(seq, sav, traffic_class)) {
                ; /*okey*/
        } else {
                IPSEC_STAT_INCREMENT(ipsecstat.in_espreplay);
@@ -372,8 +417,8 @@ delay_icv:
        /*
         * update sequence number.
         */
-       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) {
-               if (ipsec_updatereplay(seq, sav)) {
+       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[traffic_class] != NULL) {
+               if (ipsec_updatereplay(seq, sav, traffic_class)) {
                        IPSEC_STAT_INCREMENT(ipsecstat.in_espreplay);
                        goto bad;
                }
@@ -442,7 +487,7 @@ noreplaycheck:
 
        if (algo->finalizedecrypt) {
                if ((*algo->finalizedecrypt)(sav, saved_icv, algo->icvlen)) {
-                       ipseclog((LOG_ERR, "packet decryption ICV failure\n"));
+                       ipseclog((LOG_ERR, "esp4 packet decryption ICV failure\n"));
                        IPSEC_STAT_INCREMENT(ipsecstat.in_inval);
                        KERNEL_DEBUG(DBG_FNC_DECRYPT | DBG_FUNC_END, 1, 0, 0, 0, 0);
                        goto bad;
@@ -491,8 +536,8 @@ noreplaycheck:
                // if peer is behind nat and this is the latest esp packet
                if ((sav->flags & SADB_X_EXT_NATT_DETECTED_PEER) != 0 &&
                    (sav->flags & SADB_X_EXT_OLD) == 0 &&
-                   seq && sav->replay &&
-                   seq >= sav->replay->lastseq) {
+                   seq && sav->replay[traffic_class] &&
+                   seq >= sav->replay[traffic_class]->lastseq) {
                        struct udphdr *encap_uh = (__typeof__(encap_uh))(void *)((caddr_t)ip + off);
                        if (encap_uh->uh_sport &&
                            ntohs(encap_uh->uh_sport) != sav->remote_ike_port) {
@@ -629,16 +674,30 @@ noreplaycheck:
                /* Clear the csum flags, they can't be valid for the inner headers */
                m->m_pkthdr.csum_flags = 0;
 
-               // Input via IPSec interface
-               if (sav->sah->ipsec_if != NULL) {
+               // Input via IPsec interface
+               lck_mtx_lock(sadb_mutex);
+               ifnet_t ipsec_if = sav->sah->ipsec_if;
+               if (ipsec_if != NULL) {
+                       // If an interface is found, add a reference count before dropping the lock
+                       ifnet_reference(ipsec_if);
+               }
+               lck_mtx_unlock(sadb_mutex);
+               if (ipsec_if != NULL) {
+                       esp_input_log(m, sav, spi, seq);
+                       ipsec_save_wake_packet(m, ntohl(spi), seq);
+
                        // Return mbuf
                        if (interface != NULL &&
-                           interface == sav->sah->ipsec_if) {
+                           interface == ipsec_if) {
                                out_m = m;
+                               ifnet_release(ipsec_if);
                                goto done;
                        }
 
-                       if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) {
+                       errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m);
+                       ifnet_release(ipsec_if);
+
+                       if (inject_error == 0) {
                                m = NULL;
                                goto done;
                        } else {
@@ -741,13 +800,21 @@ noreplaycheck:
                            struct ip *, ip, struct ip6_hdr *, NULL);
 
                        // Input via IPsec interface legacy path
-                       if (sav->sah->ipsec_if != NULL) {
+                       lck_mtx_lock(sadb_mutex);
+                       ifnet_t ipsec_if = sav->sah->ipsec_if;
+                       if (ipsec_if != NULL) {
+                               // If an interface is found, add a reference count before dropping the lock
+                               ifnet_reference(ipsec_if);
+                       }
+                       lck_mtx_unlock(sadb_mutex);
+                       if (ipsec_if != NULL) {
                                int mlen;
                                if ((mlen = m_length2(m, NULL)) < hlen) {
                                        ipseclog((LOG_DEBUG,
                                            "IPv4 ESP input: decrypted packet too short %d < %d\n",
                                            mlen, hlen));
                                        IPSEC_STAT_INCREMENT(ipsecstat.in_inval);
+                                       ifnet_release(ipsec_if);
                                        goto bad;
                                }
                                ip->ip_len = htons(ip->ip_len + hlen);
@@ -755,14 +822,21 @@ noreplaycheck:
                                ip->ip_sum = 0;
                                ip->ip_sum = ip_cksum_hdr_in(m, hlen);
 
+                               esp_input_log(m, sav, spi, seq);
+                               ipsec_save_wake_packet(m, ntohl(spi), seq);
+
                                // Return mbuf
                                if (interface != NULL &&
-                                   interface == sav->sah->ipsec_if) {
+                                   interface == ipsec_if) {
                                        out_m = m;
+                                       ifnet_release(ipsec_if);
                                        goto done;
                                }
 
-                               if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) {
+                               errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m);
+                               ifnet_release(ipsec_if);
+
+                               if (inject_error == 0) {
                                        m = NULL;
                                        goto done;
                                } else {
@@ -829,6 +903,7 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface)
        int ivlen;
        size_t esplen;
        sa_family_t ifamily;
+       mbuf_traffic_class_t traffic_class = 0;
 
        /* sanity check for alignment. */
        if (off % 4 != 0 || m->m_pkthdr.len % 4 != 0) {
@@ -877,8 +952,19 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface)
            (caddr_t)&ip6->ip6_src, (caddr_t)&ip6->ip6_dst,
            IPPROTO_ESP, spi, interface)) == 0) {
                ipseclog((LOG_WARNING,
-                   "IPv6 ESP input: no key association found for spi %u\n",
-                   (u_int32_t)ntohl(spi)));
+                   "IPv6 ESP input: no key association found for spi %u (0x%08x) seq %u"
+                   " src %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x"
+                   " dst %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x if %s\n",
+                   (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi), ntohl(((struct newesp *)esp)->esp_seq),
+                   ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[0]), ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[1]),
+                   ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[2]), ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[3]),
+                   ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[4]), ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[5]),
+                   ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[6]), ntohs(ip6->ip6_src.__u6_addr.__u6_addr16[7]),
+                   ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[0]), ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[1]),
+                   ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[2]), ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[3]),
+                   ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[4]), ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[5]),
+                   ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[6]), ntohs(ip6->ip6_dst.__u6_addr.__u6_addr16[7]),
+                   ((interface != NULL) ? if_name(interface) : "NONE")));
                IPSEC_STAT_INCREMENT(ipsec6stat.in_nosa);
                goto bad;
        }
@@ -888,16 +974,16 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface)
        if (sav->state != SADB_SASTATE_MATURE
            && sav->state != SADB_SASTATE_DYING) {
                ipseclog((LOG_DEBUG,
-                   "IPv6 ESP input: non-mature/dying SA found for spi %u\n",
-                   (u_int32_t)ntohl(spi)));
+                   "IPv6 ESP input: non-mature/dying SA found for spi %u (0x%08x)\n",
+                   (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi)));
                IPSEC_STAT_INCREMENT(ipsec6stat.in_badspi);
                goto bad;
        }
        algo = esp_algorithm_lookup(sav->alg_enc);
        if (!algo) {
                ipseclog((LOG_DEBUG, "IPv6 ESP input: "
-                   "unsupported encryption algorithm for spi %u\n",
-                   (u_int32_t)ntohl(spi)));
+                   "unsupported encryption algorithm for spi %u (0x%08x)\n",
+                   (u_int32_t)ntohl(spi), (u_int32_t)ntohl(spi)));
                IPSEC_STAT_INCREMENT(ipsec6stat.in_badspi);
                goto bad;
        }
@@ -913,6 +999,12 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface)
 
        seq = ntohl(((struct newesp *)esp)->esp_seq);
 
+       if ((sav->flags2 & SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) ==
+           SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) {
+               u_int8_t dscp = (ntohl(ip6->ip6_flow) & IP6FLOW_DSCP_MASK) >> IP6FLOW_DSCP_SHIFT;
+               traffic_class = rfc4594_dscp_to_tc(dscp);
+       }
+
        /* Save ICV from packet for verification later */
        size_t siz = 0;
        unsigned char saved_icv[AH_MAXSUMSIZE];
@@ -922,8 +1014,9 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface)
                goto delay_icv;
        }
 
-       if (!((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay
-           && (sav->alg_auth && sav->key_auth))) {
+       if (!((sav->flags & SADB_X_EXT_OLD) == 0 &&
+           sav->replay[traffic_class] != NULL &&
+           (sav->alg_auth && sav->key_auth))) {
                goto noreplaycheck;
        }
 
@@ -935,7 +1028,7 @@ esp6_input_extended(struct mbuf **mp, int *offp, int proto, ifnet_t interface)
        /*
         * check for sequence number.
         */
-       if (ipsec_chkreplay(seq, sav)) {
+       if (ipsec_chkreplay(seq, sav, traffic_class)) {
                ; /*okey*/
        } else {
                IPSEC_STAT_INCREMENT(ipsec6stat.in_espreplay);
@@ -998,8 +1091,8 @@ delay_icv:
        /*
         * update sequence number.
         */
-       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) {
-               if (ipsec_updatereplay(seq, sav)) {
+       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[traffic_class] != NULL) {
+               if (ipsec_updatereplay(seq, sav, traffic_class)) {
                        IPSEC_STAT_INCREMENT(ipsec6stat.in_espreplay);
                        goto bad;
                }
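
The two hunks above thread a traffic_class index through ipsec_chkreplay() and ipsec_updatereplay(), so each traffic class keeps independent anti-replay state. For readers unfamiliar with ESP anti-replay, here is a minimal userspace sketch of the classic sliding-window check in the spirit of RFC 4303 Appendix A; the 32-bit window and the struct layout are illustrative, not the kernel's actual secreplay record.

    #include <stdint.h>
    #include <stdbool.h>

    #define REPLAY_WINDOW 32u

    struct replay_state {
        uint32_t lastseq;  /* highest sequence number accepted so far */
        uint32_t bitmap;   /* bit i set => (lastseq - i) already seen */
    };

    /* Returns true if seq is acceptable, and records it in the window. */
    static bool replay_check_and_update(struct replay_state *rs, uint32_t seq)
    {
        if (seq == 0) {
            return false;                /* ESP sequence numbers start at 1 */
        }
        if (seq > rs->lastseq) {         /* new highest: slide the window */
            uint32_t shift = seq - rs->lastseq;
            rs->bitmap = (shift >= REPLAY_WINDOW) ? 0 : rs->bitmap << shift;
            rs->bitmap |= 1u;            /* bit 0 now represents seq */
            rs->lastseq = seq;
            return true;
        }
        uint32_t diff = rs->lastseq - seq;
        if (diff >= REPLAY_WINDOW) {
            return false;                /* too old: fell off the window */
        }
        if (rs->bitmap & (1u << diff)) {
            return false;                /* duplicate */
        }
        rs->bitmap |= 1u << diff;        /* in-window, first sighting */
        return true;
    }

With per-class sequence spaces, this state simply becomes an array indexed by traffic class, so reordering between classes no longer registers as replay.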
@@ -1067,7 +1160,7 @@ noreplaycheck:
 
        if (algo->finalizedecrypt) {
                if ((*algo->finalizedecrypt)(sav, saved_icv, algo->icvlen)) {
-                       ipseclog((LOG_ERR, "packet decryption ICV failure\n"));
+                       ipseclog((LOG_ERR, "esp6 packet decryption ICV failure\n"));
                        IPSEC_STAT_INCREMENT(ipsecstat.in_inval);
                        KERNEL_DEBUG(DBG_FNC_DECRYPT | DBG_FUNC_END, 1, 0, 0, 0, 0);
                        goto bad;
@@ -1113,8 +1206,8 @@ noreplaycheck:
                // if peer is behind nat and this is the latest esp packet
                if ((sav->flags & SADB_X_EXT_NATT_DETECTED_PEER) != 0 &&
                    (sav->flags & SADB_X_EXT_OLD) == 0 &&
-                   seq && sav->replay &&
-                   seq >= sav->replay->lastseq) {
+                   seq && sav->replay[traffic_class] &&
+                   seq >= sav->replay[traffic_class]->lastseq) {
                        struct udphdr *encap_uh = (__typeof__(encap_uh))(void *)((caddr_t)ip6 + off);
                        if (encap_uh->uh_sport &&
                            ntohs(encap_uh->uh_sport) != sav->remote_ike_port) {
@@ -1240,15 +1333,29 @@ noreplaycheck:
                        IFA_REMREF(ifa);
                }
 
-               // Input via IPSec interface
-               if (sav->sah->ipsec_if != NULL) {
+               // Input via IPsec interface
+               lck_mtx_lock(sadb_mutex);
+               ifnet_t ipsec_if = sav->sah->ipsec_if;
+               if (ipsec_if != NULL) {
+                       // If an interface is found, add a reference count before dropping the lock
+                       ifnet_reference(ipsec_if);
+               }
+               lck_mtx_unlock(sadb_mutex);
+               if (ipsec_if != NULL) {
+                       esp_input_log(m, sav, spi, seq);
+                       ipsec_save_wake_packet(m, ntohl(spi), seq);
+
                        // Return mbuf
                        if (interface != NULL &&
-                           interface == sav->sah->ipsec_if) {
+                           interface == ipsec_if) {
+                               ifnet_release(ipsec_if);
                                goto done;
                        }
 
-                       if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) {
+                       errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m);
+                       ifnet_release(ipsec_if);
+
+                       if (inject_error == 0) {
                                m = NULL;
                                nxt = IPPROTO_DONE;
                                goto done;
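
The sequence used here — lock sadb_mutex, copy sav->sah->ipsec_if, ifnet_reference() it, unlock, then ifnet_release() on every exit path — is the standard way to keep using a pointer published under a lock after that lock is dropped. A minimal pthread sketch of the same discipline, with hypothetical obj_retain()/obj_release() helpers standing in for ifnet_reference()/ifnet_release():

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    struct obj { atomic_int refcnt; /* ... payload ... */ };

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct obj *shared;           /* published under table_lock */

    static void obj_retain(struct obj *o)
    {
        atomic_fetch_add(&o->refcnt, 1);
    }

    static void obj_release(struct obj *o)
    {
        if (atomic_fetch_sub(&o->refcnt, 1) == 1) {
            free(o);                     /* last reference gone */
        }
    }

    static void use_shared(void)
    {
        pthread_mutex_lock(&table_lock);
        struct obj *o = shared;
        if (o != NULL) {
            obj_retain(o);               /* pin it before dropping the lock */
        }
        pthread_mutex_unlock(&table_lock);

        if (o == NULL) {
            return;
        }
        /* ... long-running work that must not hold table_lock ... */
        obj_release(o);                  /* drop the pin on every exit path */
    }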
@@ -1348,7 +1455,6 @@ noreplaycheck:
                        m = n;
                }
 #endif
-
                ip6 = mtod(m, struct ip6_hdr *);
                ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - stripsiz);
 
@@ -1370,15 +1476,29 @@ noreplaycheck:
                        _CASSERT(offsetof(struct pkthdr, csum_data) == offsetof(struct pkthdr, csum_rx_val));
                }
 
-               // Input via IPSec interface
-               if (sav->sah->ipsec_if != NULL) {
+               // Input via IPsec interface
+               lck_mtx_lock(sadb_mutex);
+               ifnet_t ipsec_if = sav->sah->ipsec_if;
+               if (ipsec_if != NULL) {
+                       // If an interface is found, add a reference count before dropping the lock
+                       ifnet_reference(ipsec_if);
+               }
+               lck_mtx_unlock(sadb_mutex);
+               if (ipsec_if != NULL) {
+                       esp_input_log(m, sav, spi, seq);
+                       ipsec_save_wake_packet(m, ntohl(spi), seq);
+
                        // Return mbuf
                        if (interface != NULL &&
-                           interface == sav->sah->ipsec_if) {
+                           interface == ipsec_if) {
+                               ifnet_release(ipsec_if);
                                goto done;
                        }
 
-                       if (ipsec_inject_inbound_packet(sav->sah->ipsec_if, m) == 0) {
+                       errno_t inject_error = ipsec_inject_inbound_packet(ipsec_if, m);
+                       ifnet_release(ipsec_if);
+
+                       if (inject_error == 0) {
                                m = NULL;
                                nxt = IPPROTO_DONE;
                                goto done;
index 36c91b56fa85e594f2160a4a234a1fe3008a4127..9401200f33ba5dab7507ea9d25b8a5dc34266e63 100644 (file)
 
 #include <net/if.h>
 #include <net/route.h>
+#include <net/multi_layer_pkt_log.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_var.h>
 #include <netinet/udp.h> /* for nat traversal */
+#include <netinet/tcp.h>
+#include <netinet/in_tclass.h>
 
 #if INET6
 #include <netinet/ip6.h>
@@ -184,7 +187,7 @@ esp_hdrsiz(__unused struct ipsecrequest *isr)
                } else {
                        /* RFC 2406 */
                        aalgo = ah_algorithm_lookup(sav->alg_auth);
-                       if (aalgo && sav->replay && sav->key_auth) {
+                       if (aalgo && sav->replay[0] != NULL && sav->key_auth) {
                                authlen = (aalgo->sumsiz)(sav);
                        } else {
                                authlen = 0;
@@ -251,7 +254,11 @@ esp_output(
        struct esp *esp;
        struct esptail *esptail;
        const struct esp_algorithm *algo;
+       struct tcphdr th = {};
        u_int32_t spi;
+       u_int32_t seq;
+       u_int32_t inner_payload_len = 0;
+       u_int8_t inner_protocol = 0;
        u_int8_t nxt = 0;
        size_t plen;    /*payload length to be encrypted*/
        size_t espoff;
@@ -263,7 +270,7 @@ esp_output(
        struct ipsecstat *stat;
        struct udphdr *udp = NULL;
        int     udp_encapsulate = (sav->flags & SADB_X_EXT_NATT && (af == AF_INET || af == AF_INET6) &&
-           (esp_udp_encap_port & 0xFFFF) != 0);
+           ((esp_udp_encap_port & 0xFFFF) != 0 || sav->natt_encapsulated_src_port != 0));
 
        KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_START, sav->ivlen, 0, 0, 0, 0);
        switch (af) {
@@ -285,8 +292,35 @@ esp_output(
                return 0;       /* no change at all */
        }
 
+       mbuf_traffic_class_t traffic_class = 0;
+       if ((sav->flags2 & SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) ==
+           SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) {
+               u_int8_t dscp = 0;
+               switch (af) {
+#if INET
+               case AF_INET:
+               {
+                       struct ip *ip = mtod(m, struct ip *);
+                       dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT;
+                       break;
+               }
+#endif /*INET*/
+#if INET6
+               case AF_INET6:
+               {
+                       struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
+                       dscp = (ntohl(ip6->ip6_flow) & IP6FLOW_DSCP_MASK) >> IP6FLOW_DSCP_SHIFT;
+                       break;
+               }
+#endif /*INET6*/
+               default:
+                       panic("esp_output: should not reach here");
+               }
+               traffic_class = rfc4594_dscp_to_tc(dscp);
+       }
+
        /* some sanity check */
-       if ((sav->flags & SADB_X_EXT_OLD) == 0 && !sav->replay) {
+       if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay[traffic_class] == NULL) {
                switch (af) {
 #if INET
                case AF_INET:
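
Both the input and output paths derive the traffic class from the packet's DSCP before touching replay state. As a standalone sketch of the two extractions, assuming the usual field layouts (DSCP is the upper six bits of the IPv4 TOS octet; in IPv6 it sits in bits 22-27 of the version/class/flow word, matching IP6FLOW_DSCP_MASK and IP6FLOW_DSCP_SHIFT); rfc4594_dscp_to_tc() itself then reduces to a table lookup over these six bits:

    #include <stdint.h>
    #include <arpa/inet.h>

    #define DSCP_SHIFT_V4   2            /* TOS octet = DSCP:6 | ECN:2 */
    #define DSCP_MASK_V6    0x0fc00000u  /* bits 22-27 of ip6_flow (host order) */
    #define DSCP_SHIFT_V6   22

    static uint8_t dscp_from_tos(uint8_t tos)
    {
        return tos >> DSCP_SHIFT_V4;
    }

    static uint8_t dscp_from_flow(uint32_t ip6_flow_net)
    {
        /* the version/class/flow word is network byte order in the header */
        return (uint8_t)((ntohl(ip6_flow_net) & DSCP_MASK_V6) >> DSCP_SHIFT_V6);
    }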
@@ -398,6 +432,58 @@ esp_output(
 #endif
                }
 
+               /* grab info for packet logging */
+               struct secashead *sah = sav->sah;
+               if (net_mpklog_enabled &&
+                   sah != NULL && sah->ipsec_if != NULL) {
+                       ifnet_t ifp = sah->ipsec_if;
+
+                       if ((ifp->if_xflags & IFXF_MPK_LOG) == IFXF_MPK_LOG) {
+                               size_t iphlen = 0;
+
+                               if (sav->sah->saidx.mode == IPSEC_MODE_TUNNEL) {
+                                       struct ip *inner_ip = mtod(md, struct ip *);
+                                       if (IP_VHL_V(inner_ip->ip_vhl) == IPVERSION) {
+#ifdef _IP_VHL
+                                               iphlen = IP_VHL_HL(inner_ip->ip_vhl) << 2;
+#else
+                                               iphlen = inner_ip->ip_hl << 2;
+#endif
+                                               inner_protocol = inner_ip->ip_p;
+                                       } else if (IP_VHL_V(inner_ip->ip_vhl) == IPV6_VERSION) {
+                                               struct ip6_hdr *inner_ip6 = mtod(md, struct ip6_hdr *);
+                                               iphlen = sizeof(struct ip6_hdr);
+                                               inner_protocol = inner_ip6->ip6_nxt;
+                                       }
+
+                                       if (inner_protocol == IPPROTO_TCP) {
+                                               if ((int)(iphlen + sizeof(th)) <=
+                                                   (m->m_pkthdr.len - m->m_len)) {
+                                                       m_copydata(md, iphlen, sizeof(th), (u_int8_t *)&th);
+                                               }
+
+                                               inner_payload_len = m->m_pkthdr.len - m->m_len - iphlen - (th.th_off << 2);
+                                       }
+                               } else {
+                                       iphlen = hlen;
+                                       if (af == AF_INET) {
+                                               inner_protocol = ip->ip_p;
+                                       } else if (af == AF_INET6) {
+                                               inner_protocol = ip6->ip6_nxt;
+                                       }
+
+                                       if (inner_protocol == IPPROTO_TCP) {
+                                               if ((int)(iphlen + sizeof(th)) <=
+                                                   m->m_pkthdr.len) {
+                                                       m_copydata(m, iphlen, sizeof(th), (u_int8_t *)&th);
+                                               }
+
+                                               inner_payload_len = m->m_pkthdr.len - iphlen - (th.th_off << 2);
+                                       }
+                               }
+                       }
+               }
+
                /* make the packet over-writable */
                mprev->m_next = NULL;
                if ((md = ipsec_copypkt(md)) == NULL) {
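
The logging block copies the inner TCP header out of the chain with m_copydata() and derives the payload length by subtracting the IP header length and the TCP data offset from the packet total. The arithmetic in isolation (th_off counts 32-bit words, hence th_off << 2):

    #include <stdint.h>

    /* Returns the TCP payload length, or -1 if the headers do not fit. */
    static int tcp_payload_len(int pkt_len, int iphlen, uint8_t th_off)
    {
        int tcphlen = th_off << 2;       /* data offset is in 4-byte words */

        if (iphlen + tcphlen > pkt_len) {
            return -1;                   /* truncated or corrupt header */
        }
        return pkt_len - iphlen - tcphlen;
    }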
@@ -514,7 +600,7 @@ esp_output(
        if ((sav->flags & SADB_X_EXT_OLD) == 0) {
                struct newesp *nesp;
                nesp = (struct newesp *)esp;
-               if (sav->replay->count == ~0) {
+               if (sav->replay[traffic_class]->count == sav->replay[traffic_class]->lastseq) {
                        if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) {
                                /* XXX Is it noisy ? */
                                ipseclog((LOG_WARNING,
@@ -527,13 +613,14 @@ esp_output(
                        }
                }
                lck_mtx_lock(sadb_mutex);
-               sav->replay->count++;
+               sav->replay[traffic_class]->count++;
                lck_mtx_unlock(sadb_mutex);
                /*
                 * XXX sequence number must not be cycled, if the SA is
                 * installed by IKE daemon.
                 */
-               nesp->esp_seq = htonl(sav->replay->count);
+               nesp->esp_seq = htonl(sav->replay[traffic_class]->count);
+               seq = sav->replay[traffic_class]->count;
        }
 
        {
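
On output, a new sequence number is taken under sadb_mutex and the SA refuses to run past its boundary unless SADB_X_EXT_CYCSEQ permits cycling. The guard reduces to a small predicate; here lastseq is assumed to act as the per-SA upper bound, per the comparison in the hunk above:

    #include <stdint.h>
    #include <stdbool.h>

    /* Returns true if another sequence number may be issued for this SA. */
    static bool seq_may_advance(uint32_t count, uint32_t limit, bool allow_cycle)
    {
        if (count != limit) {
            return true;                 /* still room before the boundary */
        }
        return allow_cycle;              /* at the boundary: only if cycling is allowed */
    }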
@@ -665,9 +752,13 @@ esp_output(
                        *nexthdrp = IPPROTO_UDP;
 
                        /* Fill out the UDP header */
-                       udp->uh_sport = ntohs((u_short)esp_udp_encap_port);
-                       udp->uh_dport = ntohs(sav->remote_ike_port);
-//             udp->uh_len set later, after all length tweaks are complete
+                       if (sav->natt_encapsulated_src_port != 0) {
+                               udp->uh_sport = (u_short)sav->natt_encapsulated_src_port;
+                       } else {
+                               udp->uh_sport = htons((u_short)esp_udp_encap_port);
+                       }
+                       udp->uh_dport = htons(sav->remote_ike_port);
+                       // udp->uh_len set later, after all length tweaks are complete
                        udp->uh_sum = 0;
 
                        /* Update last sent so we know if we need to send keepalive */
@@ -753,7 +844,7 @@ esp_output(
                goto fill_icv;
        }
 
-       if (!sav->replay) {
+       if (!sav->replay[traffic_class]) {
                goto noantireplay;
        }
        if (!sav->key_auth) {
@@ -863,6 +954,17 @@ fill_icv:
        }
 
 noantireplay:
+       if (net_mpklog_enabled && sav->sah != NULL &&
+           sav->sah->ipsec_if != NULL &&
+           (sav->sah->ipsec_if->if_xflags & IFXF_MPK_LOG) &&
+           inner_protocol == IPPROTO_TCP) {
+               MPKL_ESP_OUTPUT_TCP(esp_mpkl_log_object,
+                   ntohl(spi), seq,
+                   ntohs(th.th_sport), ntohs(th.th_dport),
+                   ntohl(th.th_seq), ntohl(th.th_ack),
+                   th.th_flags, inner_payload_len);
+       }
+
        lck_mtx_lock(sadb_mutex);
        if (!m) {
                ipseclog((LOG_ERR,
index 1b5925265097737eabaf8e97b4c275fd341e46ed..9357d5e9960a4aad5e110fa729d0ffc2ab8d7853 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -364,20 +364,15 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
         */
        if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) {
                /*
+                * Mark packet as reassembled.
                 * In ICMPv6 processing, we drop certain
                 * NDP messages that are not expected to
                 * have fragment header based on recommendations
                 * against security vulnerability as described in
                 * RFC 6980.
-                * We set PKTF_REASSEMBLED flag to let ICMPv6 NDP
-                * drop such packets.
-                * However there are already devices running software
-                * that are creating interface with MTU < IPv6 Min
-                * MTU. We should not have allowed that but they are
-                * out, and sending atomic NDP fragments.
-                * For that reason, we do not set the same flag here
-                * and relax the check.
+                * Treat atomic fragments as re-assembled packets as well.
                 */
+               m->m_pkthdr.pkt_flags |= PKTF_REASSEMBLED;
                ip6stat.ip6s_atmfrag_rcvd++;
                in6_ifstat_inc(dstifp, ifs6_atmfrag_rcvd);
                *offp = offset;
@@ -785,7 +780,6 @@ insert:
            (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
                /* loopback checksums are always OK */
                m->m_pkthdr.csum_data = 0xffff;
-               m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
                m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
        }
 
index f1d66606e8947ba511fd5772d5e28a8d5a9d8c8e..a7376b6861c486ece41cddd955a173e3ec73fdfd 100644 (file)
@@ -410,7 +410,7 @@ icmp6_error_flag(struct mbuf *m, int type, int code, int param, int flags)
                m = m_pullup(m, preplen);
        }
        if (m == NULL) {
-               nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__));
+               nd6log(debug, "ENOBUFS in icmp6_error %d\n", __LINE__);
                return;
        }
 
@@ -551,9 +551,9 @@ icmp6_input(struct mbuf **mp, int *offp, int proto)
         * calculate the checksum
         */
        if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "ICMP6 checksum error(%d|%x) %s\n",
-                   icmp6->icmp6_type, sum, ip6_sprintf(&ip6->ip6_src)));
+                   icmp6->icmp6_type, sum, ip6_sprintf(&ip6->ip6_src));
                icmp6stat.icp6s_checksum++;
                goto freeit;
        }
@@ -909,11 +909,11 @@ icmp6_input(struct mbuf **mp, int *offp, int proto)
                break;
 
        default:
-               nd6log((LOG_DEBUG,
+               nd6log(debug,
                    "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n",
                    icmp6->icmp6_type, ip6_sprintf(&ip6->ip6_src),
                    ip6_sprintf(&ip6->ip6_dst),
-                   m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0));
+                   m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0);
                if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) {
                        /* ICMPv6 error: MUST deliver it by spec... */
                        code = PRC_NCMDS;
@@ -1213,14 +1213,9 @@ icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated)
                return;
        }
 
-       /*
-        * In case the suggested mtu is less than IPV6_MMTU, we
-        * only need to remember that it was for above mentioned
-        * "alwaysfrag" case.
-        * Try to be as close to the spec as possible.
-        */
+       /* Limit the MTU to the minimum IPv6 MTU */
        if (mtu < IPV6_MMTU) {
-               mtu = IPV6_MMTU - 8;
+               mtu = IPV6_MMTU;
        }
 
        bzero(&sin6, sizeof(sin6));
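
RFC 8201 forbids a node from recording a path MTU below the IPv6 minimum link MTU (IPV6_MMTU, 1280 bytes), which is exactly what the clamp above enforces. In isolation:

    #include <stdint.h>

    #define IPV6_MIN_MTU 1280u

    /* Never record a path MTU below the IPv6 minimum link MTU. */
    static uint32_t clamp_ipv6_mtu(uint32_t suggested)
    {
        return (suggested < IPV6_MIN_MTU) ? IPV6_MIN_MTU : suggested;
    }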
@@ -1336,9 +1331,8 @@ ni6_input(struct mbuf *m, int off)
                }
                if ((ia6_flags & IN6_IFF_TEMPORARY) &&
                    !(icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) {
-                       nd6log((LOG_DEBUG, "ni6_input: ignore node info to "
-                           "a temporary address in %s:%d",
-                           __func__, __LINE__));
+                       nd6log(debug, "ni6_input: ignore node info to a temporary address in %s:%d",
+                           __func__, __LINE__);
                        goto bad;
                }
        }
@@ -1438,7 +1432,9 @@ ni6_input(struct mbuf *m, int off)
                         *   wildcard match, if gethostname(3) side has
                         *   truncated hostname.
                         */
+                       lck_mtx_lock(&hostname_lock);
                        n = ni6_nametodns(hostname, hostnamelen, 0);
+                       lck_mtx_unlock(&hostname_lock);
                        if (!n || n->m_next || n->m_len == 0) {
                                goto bad;
                        }
@@ -1571,7 +1567,9 @@ ni6_input(struct mbuf *m, int off)
                /*
                 * XXX do we really have FQDN in variable "hostname"?
                 */
+               lck_mtx_lock(&hostname_lock);
                n->m_next = ni6_nametodns(hostname, hostnamelen, oldfqdn);
+               lck_mtx_unlock(&hostname_lock);
                if (n->m_next == NULL) {
                        goto bad;
                }
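
hostname and hostnamelen are mutable globals (sethostname(2) can change them at any time), so readers now bracket every access with hostname_lock. The generic shape of the pattern, sketched with pthreads and illustrative names — take the lock, work on a consistent snapshot, release:

    #include <pthread.h>
    #include <string.h>

    static pthread_mutex_t name_lock = PTHREAD_MUTEX_INITIALIZER;
    static char name[256];               /* shared; mutated elsewhere */
    static size_t name_len;              /* invariant: name_len <= sizeof(name) */

    static void use_name_snapshot(void)
    {
        char snap[256];
        size_t snap_len;

        pthread_mutex_lock(&name_lock);  /* short critical section: copy only */
        snap_len = name_len;
        memcpy(snap, name, snap_len);
        pthread_mutex_unlock(&name_lock);

        /* ... hash, format, or transmit snap/snap_len without the lock held ... */
        (void)snap;
        (void)snap_len;
    }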
@@ -2259,10 +2257,10 @@ icmp6_reflect(struct mbuf *m, size_t off)
 
        /* too short to reflect */
        if (off < sizeof(struct ip6_hdr)) {
-               nd6log((LOG_DEBUG,
-                   "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n",
+               nd6log(debug,
+                   "sanity fail: off=%x, sizeof(ip6)=%x in %s:%d\n",
                    (u_int32_t)off, (u_int32_t)sizeof(struct ip6_hdr),
-                   __func__, __LINE__));
+                   __func__, __LINE__);
                goto bad;
        }
 
@@ -2384,10 +2382,10 @@ icmp6_reflect(struct mbuf *m, size_t off)
                    &src_storage, ip6oa.ip6oa_boundif, &e);
                ROUTE_RELEASE(&ro);
                if (src == NULL) {
-                       nd6log((LOG_DEBUG,
+                       nd6log(debug,
                            "icmp6_reflect: source can't be determined: "
                            "dst=%s, error=%d\n",
-                           ip6_sprintf(&sa6_src.sin6_addr), e));
+                           ip6_sprintf(&sa6_src.sin6_addr), e);
                        goto bad;
                }
        }
@@ -2462,26 +2460,35 @@ icmp6_redirect_diag(struct in6_addr *src6,
 void
 icmp6_redirect_input(struct mbuf *m, int off)
 {
-       struct ifnet *ifp = m->m_pkthdr.rcvif;
-       struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
-       struct nd_redirect *nd_rd;
-       int icmp6len = ntohs(ip6->ip6_plen);
+       struct ifnet *ifp = NULL;
+       struct ip6_hdr *ip6 = NULL;
+       struct nd_redirect *nd_rd = NULL;
        char *lladdr = NULL;
+       int icmp6len = 0;
        int lladdrlen = 0;
        u_char *redirhdr = NULL;
        int redirhdrlen = 0;
        struct rtentry *rt = NULL;
        int is_router;
        int is_onlink;
-       struct in6_addr src6 = ip6->ip6_src;
+       struct in6_addr src6;
        struct in6_addr redtgt6;
        struct in6_addr reddst6;
        union nd_opts ndopts;
 
-       if (!m || !ifp) {
+       if (m == NULL) {
                return;
        }
 
+       ifp = m->m_pkthdr.rcvif;
+       if (ifp == NULL) {
+               goto freeit;
+       }
+
+       ip6 = mtod(m, struct ip6_hdr *);
+       icmp6len = ntohs(ip6->ip6_plen);
+       src6 = ip6->ip6_src;
+
        /*
         * If we are an advertising router on this interface,
         * don't update route by icmp6 redirect.
@@ -2500,7 +2507,7 @@ icmp6_redirect_input(struct mbuf *m, int off)
        IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len);
        if (nd_rd == NULL) {
                icmp6stat.icp6s_tooshort++;
-               return;
+               goto freeit;
        }
 #endif
        redtgt6 = nd_rd->nd_rd_target;
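
The rewritten prologue stops dereferencing the mbuf before the NULL checks and replaces bare return with goto freeit once the mbuf is owned, so every error path frees it exactly once. The single-exit cleanup idiom in miniature (the allocation and names are placeholders):

    #include <stdlib.h>

    struct buf { char *data; };

    static int handle(struct buf *b)
    {
        char *scratch = NULL;
        int ret = -1;

        if (b == NULL) {
            return -1;                   /* nothing owned yet: plain return is fine */
        }
        scratch = malloc(64);
        if (scratch == NULL) {
            goto out;                    /* resources owned => single exit */
        }
        /* ... validation steps, each failing with goto out ... */
        ret = 0;
    out:
        free(scratch);                   /* free(NULL) is a no-op */
        return ret;
    }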
@@ -2513,16 +2520,16 @@ icmp6_redirect_input(struct mbuf *m, int off)
 
        /* validation */
        if (!IN6_IS_ADDR_LINKLOCAL(&src6)) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "ICMP6 redirect sent from %s rejected; "
-                   "must be from linklocal\n", ip6_sprintf(&src6)));
+                   "must be from linklocal\n", ip6_sprintf(&src6));
                goto bad;
        }
-       if (ip6->ip6_hlim != 255) {
-               nd6log((LOG_ERR,
+       if (ip6->ip6_hlim != IPV6_MAXHLIM) {
+               nd6log(error,
                    "ICMP6 redirect sent from %s rejected; "
                    "hlim=%d (must be 255)\n",
-                   ip6_sprintf(&src6), ip6->ip6_hlim));
+                   ip6_sprintf(&src6), ip6->ip6_hlim);
                goto bad;
        }
        {
@@ -2539,10 +2546,10 @@ icmp6_redirect_input(struct mbuf *m, int off)
                        RT_LOCK(rt);
                        if (rt->rt_gateway == NULL ||
                            rt->rt_gateway->sa_family != AF_INET6) {
-                               nd6log((LOG_ERR,
+                               nd6log(error,
                                    "ICMP6 redirect rejected; no route "
                                    "with inet6 gateway found for redirect dst: %s\n",
-                                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
+                                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6));
                                RT_UNLOCK(rt);
                                rtfree(rt);
                                goto bad;
@@ -2551,21 +2558,21 @@ icmp6_redirect_input(struct mbuf *m, int off)
                        gw6 = &(((struct sockaddr_in6 *)(void *)
                            rt->rt_gateway)->sin6_addr);
                        if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) {
-                               nd6log((LOG_ERR,
+                               nd6log(error,
                                    "ICMP6 redirect rejected; "
                                    "not equal to gw-for-src=%s (must be same): "
                                    "%s\n",
                                    ip6_sprintf(gw6),
-                                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
+                                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6));
                                RT_UNLOCK(rt);
                                rtfree(rt);
                                goto bad;
                        }
                } else {
-                       nd6log((LOG_ERR,
+                       nd6log(error,
                            "ICMP6 redirect rejected; "
                            "no route found for redirect dst: %s\n",
-                           icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
+                           icmp6_redirect_diag(&src6, &reddst6, &redtgt6));
                        goto bad;
                }
                RT_UNLOCK(rt);
@@ -2573,10 +2580,10 @@ icmp6_redirect_input(struct mbuf *m, int off)
                rt = NULL;
        }
        if (IN6_IS_ADDR_MULTICAST(&reddst6)) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "ICMP6 redirect rejected; "
                    "redirect dst must be unicast: %s\n",
-                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
+                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6));
                goto bad;
        }
 
@@ -2588,10 +2595,10 @@ icmp6_redirect_input(struct mbuf *m, int off)
                is_onlink = 1;  /* on-link destination case */
        }
        if (!is_router && !is_onlink) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "ICMP6 redirect rejected; "
                    "neither router case nor onlink case: %s\n",
-                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
+                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6));
                goto bad;
        }
        /* validation passed */
@@ -2599,9 +2606,9 @@ icmp6_redirect_input(struct mbuf *m, int off)
        icmp6len -= sizeof(*nd_rd);
        nd6_option_init(nd_rd + 1, icmp6len, &ndopts);
        if (nd6_options(&ndopts) < 0) {
-               nd6log((LOG_INFO, "icmp6_redirect_input: "
+               nd6log(info, "icmp6_redirect_input: "
                    "invalid ND option, rejected: %s\n",
-                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
+                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6));
                /* nd6_options have incremented stats */
                goto freeit;
        }
@@ -2617,11 +2624,11 @@ icmp6_redirect_input(struct mbuf *m, int off)
        }
 
        if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
-               nd6log((LOG_INFO,
+               nd6log(info,
                    "icmp6_redirect_input: lladdrlen mismatch for %s "
                    "(if %d, icmp6 packet %d): %s\n",
                    ip6_sprintf(&redtgt6), ifp->if_addrlen, lladdrlen - 2,
-                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
+                   icmp6_redirect_diag(&src6, &reddst6, &redtgt6));
                goto bad;
        }
 
@@ -2799,7 +2806,7 @@ icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt)
        ip6->ip6_vfc |= IPV6_VERSION;
        /* ip6->ip6_plen will be set later */
        ip6->ip6_nxt = IPPROTO_ICMPV6;
-       ip6->ip6_hlim = 255;
+       ip6->ip6_hlim = IPV6_MAXHLIM;
        /* ip6->ip6_src must be linklocal addr for my outgoing if. */
        bcopy(&ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr));
        bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr));
index 950911de794a40ab3fb4bd218b5329ece67f5037..5ed97682799105354309706bf7af458feb963249 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -174,7 +174,12 @@ const struct in6_addr in6mask96 = IN6MASK96;
 const struct in6_addr in6mask128 = IN6MASK128;
 
 const struct sockaddr_in6 sa6_any = {
-       sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0
+       .sin6_len = sizeof(sa6_any),
+       .sin6_family = AF_INET6,
+       .sin6_port = 0,
+       .sin6_flowinfo = 0,
+       .sin6_addr = IN6ADDR_ANY_INIT,
+       .sin6_scope_id = 0
 };
 
 static int in6ctl_associd(struct socket *, u_long, caddr_t);
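
sa6_any moves from positional to designated initializers, which keep the initialization correct if struct sockaddr_in6 ever reorders or gains members and make each field's meaning explicit at the definition. Side by side on a reduced stand-in struct:

    #include <stdint.h>

    struct mini_sockaddr {
        uint8_t  len;
        uint8_t  family;
        uint16_t port;
    };

    /* Positional: silently misassigns if the member order ever changes. */
    static const struct mini_sockaddr a = { sizeof(a), 30 /* AF_INET6 */, 0 };

    /* Designated: each value is tied to a named member. */
    static const struct mini_sockaddr b = {
        .len    = sizeof(b),
        .family = 30 /* AF_INET6 */,
        .port   = 0,
    };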
@@ -816,7 +821,7 @@ in6ctl_llstop(struct ifnet *ifp)
        pr0.ndpr_plen = 64;
        pr0.ndpr_ifp = ifp;
        pr0.ndpr_prefix.sin6_addr.s6_addr16[0] = IPV6_ADDR_INT16_ULL;
-       in6_setscope(&pr0.ndpr_prefix.sin6_addr, ifp, NULL);
+       (void)in6_setscope(&pr0.ndpr_prefix.sin6_addr, ifp, NULL);
        pr = nd6_prefix_lookup(&pr0, ND6_PREFIX_EXPIRY_UNSPEC);
        if (pr) {
                lck_mtx_lock(nd6_mutex);
@@ -1007,7 +1012,7 @@ in6ctl_alifetime(struct in6_ifaddr *ia, u_long cmd, struct in6_ifreq *ifr,
                        lt.ia6t_preferred = ia6_lt.ia6t_preferred;
                        lt.ia6t_vltime = ia6_lt.ia6t_vltime;
                        lt.ia6t_pltime = ia6_lt.ia6t_pltime;
-                       bcopy(&lt, &ifr->ifr_ifru.ifru_lifetime, sizeof(lt));
+                       bcopy(&lt, &ifr->ifr_ifru.ifru_lifetime, sizeof(ifr->ifr_ifru.ifru_lifetime));
                } else {
                        struct in6_addrlifetime_32 lt;
 
@@ -1016,7 +1021,7 @@ in6ctl_alifetime(struct in6_ifaddr *ia, u_long cmd, struct in6_ifreq *ifr,
                        lt.ia6t_preferred = (uint32_t)ia6_lt.ia6t_preferred;
                        lt.ia6t_vltime = (uint32_t)ia6_lt.ia6t_vltime;
                        lt.ia6t_pltime = (uint32_t)ia6_lt.ia6t_pltime;
-                       bcopy(&lt, &ifr->ifr_ifru.ifru_lifetime, sizeof(lt));
+                       bcopy(&lt, &ifr->ifr_ifru.ifru_lifetime, sizeof(ifr->ifr_ifru.ifru_lifetime));
                }
                IFA_UNLOCK(&ia->ia_ifa);
                break;
@@ -1153,8 +1158,8 @@ in6ctl_clat46start(struct ifnet *ifp)
 
        if (pr != NULL) {
                if ((ia6 = in6_pfx_newpersistaddr(pr, FALSE, &error, TRUE)) == NULL) {
-                       nd6log0((LOG_ERR, "Could not configure CLAT46 address on interface "
-                           "%s.\n", ifp->if_xname));
+                       nd6log0(error, "Could not configure CLAT46 address on interface "
+                           "%s.\n", ifp->if_xname);
                } else {
                        IFA_LOCK(&ia6->ia_ifa);
                        NDPR_LOCK(pr);
@@ -1981,12 +1986,12 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags)
        ifa = &ia->ia_ifa;
        in6m_sol = NULL;
 
-       nd6log2((LOG_DEBUG, "%s - %s ifp %s ia6_flags 0x%x ifaupflags 0x%x\n",
+       nd6log2(debug, "%s - %s ifp %s ia6_flags 0x%x ifaupflags 0x%x\n",
            __func__,
            ip6_sprintf(&ia->ia_addr.sin6_addr),
            if_name(ia->ia_ifp),
            ia->ia6_flags,
-           ifaupflags));
+           ifaupflags);
 
        /*
         * Just to be safe, always clear certain flags when address
@@ -2045,10 +2050,10 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags)
                }
                imm = in6_joingroup(ifp, &llsol, &error, delay);
                if (imm == NULL) {
-                       nd6log((LOG_WARNING,
+                       nd6log(info,
                            "%s: addmulti failed for %s on %s (errno=%d)\n",
                            __func__, ip6_sprintf(&llsol), if_name(ifp),
-                           error));
+                           error);
                        VERIFY(error != 0);
                        goto unwind;
                }
@@ -2106,10 +2111,10 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags)
 
                imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0);
                if (!imm) {
-                       nd6log((LOG_WARNING,
+                       nd6log(info,
                            "%s: addmulti failed for %s on %s (errno=%d)\n",
                            __func__, ip6_sprintf(&mltaddr.sin6_addr),
-                           if_name(ifp), error));
+                           if_name(ifp), error);
                        VERIFY(error != 0);
                        goto unwind;
                }
@@ -2129,16 +2134,18 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags)
                         */
                        delay = random() % MAX_RTR_SOLICITATION_DELAY;
                }
-               if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr.sin6_addr)
-                   == 0) {
+               lck_mtx_lock(&hostname_lock);
+               int n = in6_nigroup(ifp, hostname, hostnamelen, &mltaddr.sin6_addr);
+               lck_mtx_unlock(&hostname_lock);
+               if (n == 0) {
                        imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error,
                            delay); /* XXX jinmei */
                        if (!imm) {
-                               nd6log((LOG_WARNING,
+                               nd6log(info,
                                    "%s: addmulti failed for %s on %s "
                                    "(errno=%d)\n",
                                    __func__, ip6_sprintf(&mltaddr.sin6_addr),
-                                   if_name(ifp), error));
+                                   if_name(ifp), error);
                                /* XXX not very fatal, go on... */
                                error = 0;
                        } else {
@@ -2183,10 +2190,10 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags)
 
                imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0);
                if (!imm) {
-                       nd6log((LOG_WARNING,
+                       nd6log(info,
                            "%s: addmulti failed for %s on %s (errno=%d)\n",
                            __func__, ip6_sprintf(&mltaddr.sin6_addr),
-                           if_name(ifp), error));
+                           if_name(ifp), error);
                        VERIFY(error != 0);
                        goto unwind;
                }
@@ -2200,13 +2207,14 @@ in6_ifaupdate_aux(struct in6_ifaddr *ia, struct ifnet *ifp, int ifaupflags)
        ++nd6_sched_timeout_want;
 
        /*
-        * Perform DAD, if needed.
-        * XXX It may be of use, if we can administratively
-        * disable DAD.
+        * Perform DAD, if:
+        * * Interface is marked to perform DAD, AND
+        * * Address is not marked to skip DAD, AND
+        * * Address is in a pre-DAD state (Tentative or Optimistic)
         */
        IFA_LOCK_SPIN(ifa);
-       if (in6if_do_dad(ifp) && ((ifa->ifa_flags & IN6_IFF_NODAD) == 0) &&
-           (ia->ia6_flags & IN6_IFF_DADPROGRESS)) {
+       if (in6if_do_dad(ifp) && (ia->ia6_flags & IN6_IFF_NODAD) == 0 &&
+           (ia->ia6_flags & IN6_IFF_DADPROGRESS) != 0) {
                int mindelay, maxdelay;
                int *delayptr, delayval;
 
@@ -3711,8 +3719,8 @@ in6if_do_dad(
                return 0;
        }
 
-       if (ifp->if_subfamily == IFNET_SUBFAMILY_IPSEC ||
-           ifp->if_subfamily == IFNET_SUBFAMILY_UTUN) {
+       if (ifp->if_family == IFNET_FAMILY_IPSEC ||
+           ifp->if_family == IFNET_FAMILY_UTUN) {
                /*
                 * Ignore DAD for tunneling virtual interfaces, which get
                 * their IPv6 address explicitly assigned.
@@ -3832,6 +3840,8 @@ in6_if2idlen(struct ifnet *ifp)
                return 64;    /* Packet Data over Cellular */
        case IFT_BRIDGE:
                return 64;    /* Transparent bridge interface */
+       case IFT_6LOWPAN:
+               return 64;    /* 6LoWPAN */
        default:
                /*
                 * Unknown link type:
@@ -4016,6 +4026,8 @@ in6_ifaddr_alloc(int how)
                bzero(in6ifa, in6ifa_size);
                in6ifa->ia_ifa.ifa_free = in6_ifaddr_free;
                in6ifa->ia_ifa.ifa_debug |= IFD_ALLOC;
+               in6ifa->ia_ifa.ifa_del_wc = &in6ifa->ia_ifa.ifa_debug;
+               in6ifa->ia_ifa.ifa_del_waiters = 0;
                ifa_lock_init(&in6ifa->ia_ifa);
                if (in6ifa_debug != 0) {
                        struct in6_ifaddr_dbg *in6ifa_dbg =
@@ -4804,9 +4816,9 @@ in6_eventhdlr_callback(struct eventhandler_entry_arg arg0 __unused,
        bzero(&ev_msg, sizeof(ev_msg));
        bzero(&nd6_event, sizeof(nd6_event));
 
-       nd6log0((LOG_INFO, "%s Event %s received for %s\n",
+       nd6log0(info, "%s Event %s received for %s\n",
            __func__, in6_event2kev_array[in6_ev_code].in6_event_str,
-           ip6_sprintf(p_addr6)));
+           ip6_sprintf(p_addr6));
 
        ev_msg.vendor_code      = KEV_VENDOR_APPLE;
        ev_msg.kev_class        = KEV_NETWORK_CLASS;
index 59967ec1b8e30389c315a7e5a51d3e76a5495482..fa67c43a4ea4f42fa58da87f1248eaacae73ec1c 100644 (file)
@@ -1004,7 +1004,6 @@ extern int inet6_rth_add(void *, const struct in6_addr *);
 extern int inet6_rth_reverse(const void *, void *);
 extern int inet6_rth_segments(const void *);
 extern struct in6_addr *inet6_rth_getaddr(const void *, int);
-extern void addrsel_policy_init(void);
 
 __END_DECLS
 #endif /* !KERNEL */
index 643e3e3636c4386436bb318510e74eddaefaa1cd..bd4ad95c04a049fb7007cdd0759b1ed2ebc622ed 100644 (file)
@@ -131,12 +131,15 @@ get_rand_iid(
 {
        SHA1_CTX ctxt;
        u_int8_t digest[SHA1_RESULTLEN];
-       int hostnlen    = strlen(hostname);
+       int hostnlen;
 
        /* generate 8 bytes of pseudo-random value. */
        bzero(&ctxt, sizeof(ctxt));
        SHA1Init(&ctxt);
+       lck_mtx_lock(&hostname_lock);
+       hostnlen = strlen(hostname);
        SHA1Update(&ctxt, hostname, hostnlen);
+       lck_mtx_unlock(&hostname_lock);
        SHA1Final(digest, &ctxt);
 
        /* assumes sizeof (digest) > sizeof (iid) */
@@ -212,8 +215,8 @@ in6_generate_tmp_iid(
         * use a random non-zero value as the last resort.
         */
        if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) {
-               nd6log((LOG_INFO,
-                   "%s: computed SHA1 value is zero.\n", __func__));
+               nd6log(info,
+                   "%s: computed SHA1 value is zero.\n", __func__);
 
                getmicrotime(&tv);
                val32 = random() ^ tv.tv_usec;
@@ -289,6 +292,7 @@ in6_iid_from_hw(struct ifnet *ifp, struct in6_addr *in6)
        case IFT_IEEE80211:
 #endif
        case IFT_BRIDGE:
+       case IFT_6LOWPAN:
                /* IEEE802/EUI64 cases - what others? */
                /* IEEE1394 uses 16byte length address starting with EUI64 */
                if (addrlen > 8) {
@@ -412,15 +416,15 @@ in6_select_iid_from_all_hw(
 
        /* first, try to get it from the interface itself */
        if (in6_iid_from_hw(ifp0, in6) == 0) {
-               nd6log((LOG_DEBUG, "%s: IID derived from HW interface.\n",
-                   if_name(ifp0)));
+               nd6log(debug, "%s: IID derived from HW interface.\n",
+                   if_name(ifp0));
                goto success;
        }
 
        /* try secondary EUI64 source. this basically is for ATM PVC */
        if (altifp && in6_iid_from_hw(altifp, in6) == 0) {
-               nd6log((LOG_DEBUG, "%s: IID from alterate HW interface %s.\n",
-                   if_name(ifp0), if_name(altifp)));
+               nd6log(debug, "%s: IID from alternate HW interface %s.\n",
+                   if_name(ifp0), if_name(altifp));
                goto success;
        }
 
@@ -439,8 +443,8 @@ in6_select_iid_from_all_hw(
                 * globally unique
                 */
                if (ND6_IFID_UNIVERSAL(in6)) {
-                       nd6log((LOG_DEBUG, "%s: borrowed IID from %s\n",
-                           if_name(ifp0), if_name(ifp)));
+                       nd6log(debug, "%s: borrowed IID from %s\n",
+                           if_name(ifp0), if_name(ifp));
                        ifnet_head_done();
                        goto success;
                }
@@ -449,7 +453,7 @@ in6_select_iid_from_all_hw(
 
        /* last resort: get from random number source */
        if (get_rand_iid(ifp, in6) == 0) {
-               nd6log((LOG_DEBUG, "%s: IID from PRNG.\n", if_name(ifp0)));
+               nd6log(debug, "%s: IID from PRNG.\n", if_name(ifp0));
                goto success;
        }
 
@@ -457,13 +461,13 @@ in6_select_iid_from_all_hw(
        return -1;
 
 success:
-       nd6log((LOG_INFO, "%s: IID: "
+       nd6log(info, "%s: IID: "
            "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
            if_name(ifp0),
            in6->s6_addr[8], in6->s6_addr[9],
            in6->s6_addr[10], in6->s6_addr[11],
            in6->s6_addr[12], in6->s6_addr[13],
-           in6->s6_addr[14], in6->s6_addr[15]));
+           in6->s6_addr[14], in6->s6_addr[15]);
        return 0;
 }
 
@@ -487,10 +491,10 @@ in6_ifattach_linklocal(struct ifnet *ifp, struct in6_aliasreq *ifra)
                 * suppress it.  (jinmei@kame.net 20010130)
                 */
                if (error != EAFNOSUPPORT) {
-                       nd6log((LOG_NOTICE, "%s: failed to "
+                       nd6log(info, "%s: failed to "
                            "configure a link-local address on %s "
                            "(errno=%d)\n",
-                           __func__, if_name(ifp), error));
+                           __func__, if_name(ifp), error);
                }
                return EADDRNOTAVAIL;
        }
@@ -593,9 +597,9 @@ in6_ifattach_loopback(
        /* add the new interface address */
        error = in6_update_ifa(ifp, &ifra, 0, &ia);
        if (error != 0) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "%s: failed to configure loopback address %s (error=%d)\n",
-                   __func__, if_name(ifp), error));
+                   __func__, if_name(ifp), error);
                VERIFY(ia == NULL);
                return EADDRNOTAVAIL;
        }
@@ -730,9 +734,8 @@ in6_ifattach_prelim(struct ifnet *ifp)
         *   (previously, this was a silent error.)
         */
        if ((ifp->if_flags & IFF_MULTICAST) == 0) {
-               nd6log0((LOG_INFO, "in6_ifattach: ",
-                   "%s is not multicast capable, IPv6 not enabled\n",
-                   if_name(ifp)));
+               nd6log0(info, "in6_ifattach: %s is not multicast capable, IPv6 not enabled\n",
+                   if_name(ifp));
                return EINVAL;
        }
 
@@ -902,8 +905,8 @@ in6_ifattach_aliasreq(struct ifnet *ifp, struct ifnet *altifp,
                } else {
                        if (in6_select_iid_from_all_hw(ifp, altifp,
                            &ifra.ifra_addr.sin6_addr) != 0) {
-                               nd6log((LOG_ERR, "%s: no IID available\n",
-                                   if_name(ifp)));
+                               nd6log(error, "%s: no IID available\n",
+                                   if_name(ifp));
                                return EADDRNOTAVAIL;
                        }
                }
@@ -924,9 +927,9 @@ in6_ifattach_aliasreq(struct ifnet *ifp, struct ifnet *altifp,
 
        /* Attach the link-local address */
        if (in6_ifattach_linklocal(ifp, &ifra) != 0) {
-               nd6log((LOG_INFO,
+               nd6log(info,
                    "%s: %s could not attach link-local address.\n",
-                   __func__, if_name(ifp)));
+                   __func__, if_name(ifp));
                /* NB: not an error */
        }
 
@@ -1014,9 +1017,9 @@ in6_ifattach_llcgareq(struct ifnet *ifp, struct in6_cgareq *llcgasr)
        /* Attach the link-local address */
        if (in6_ifattach_linklocal(ifp, &ifra) != 0) {
                /* NB: not an error */
-               nd6log((LOG_INFO,
+               nd6log(info,
                    "%s: %s could not attach link-local address.\n",
-                   __func__, if_name(ifp)));
+                   __func__, if_name(ifp));
        }
 
        VERIFY(error == 0);
@@ -1144,9 +1147,9 @@ in6_ifdetach(struct ifnet *ifp)
                        if (ia->ia_next) {
                                ia->ia_next = oia->ia_next;
                        } else {
-                               nd6log((LOG_ERR,
+                               nd6log(error,
                                    "%s: didn't unlink in6ifaddr from "
-                                   "list\n", if_name(ifp)));
+                                   "list\n", if_name(ifp));
                                unlinked = 0;
                        }
                }
index 65d4c090ad66af1a26431b165b68f673efd56597..4635a2b1844a7dd9d5ea1f873d6c05eaeda4ba58 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2010-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -3012,7 +3012,7 @@ ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
                if (error) {
                        break;
                }
-               if (hlim < -1 || hlim > 255) {
+               if (hlim < -1 || hlim > IPV6_MAXHLIM) {
                        error = EINVAL;
                        break;
                } else if (hlim == -1) {
index 674f64d68474d719a8c972e85fba24f4fea84b8a..c1d2ff08ebd155d220983a2f29c1fb6615494270 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/ntstat.h>
+#include <net/restricted_in_port.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
+
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/in_pcb.h>
 #include <netinet6/in6_pcb.h>
+
 #include <net/if_types.h>
 #include <net/if_var.h>
 
@@ -295,13 +298,16 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                                IFA_REMREF(ifa);
                        }
                }
+
+
                if (lport != 0) {
                        struct inpcb *t;
                        uid_t u;
 
 #if !CONFIG_EMBEDDED
                        if (ntohs(lport) < IPV6PORT_RESERVED &&
-                           !IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr)) {
+                           !IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) &&
+                           !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) {
                                cred = kauth_cred_proc_ref(p);
                                error = priv_check_cred(cred,
                                    PRIV_NETINET_RESERVEDPORT, 0);
@@ -313,19 +319,30 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                                }
                        }
 #endif /* !CONFIG_EMBEDDED */
+                       /*
+                        * Check whether the process is allowed to bind to a restricted port
+                        */
+                       if (!current_task_can_use_restricted_in_port(lport,
+                           so->so_proto->pr_protocol, PORT_FLAGS_BSD)) {
+                               lck_rw_done(pcbinfo->ipi_lock);
+                               socket_lock(so, 0);
+                               return EADDRINUSE;
+                       }
+
                        if (!IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr) &&
                            (u = kauth_cred_getuid(so->so_cred)) != 0) {
                                t = in6_pcblookup_local_and_cleanup(pcbinfo,
                                    &sin6.sin6_addr, lport,
                                    INPLOOKUP_WILDCARD);
-                               if (t != NULL && (!IN6_IS_ADDR_UNSPECIFIED(
-                                           &sin6.sin6_addr) ||
+                               if (t != NULL &&
+                                   (!IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) ||
                                    !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) ||
-                                   !(t->inp_socket->so_options &
-                                   SO_REUSEPORT)) && (u != kauth_cred_getuid(
-                                           t->inp_socket->so_cred)) &&
-                                   !(t->inp_socket->so_flags &
-                                   SOF_REUSESHAREUID)) {
+                                   !(t->inp_socket->so_options & SO_REUSEPORT)) &&
+                                   (u != kauth_cred_getuid(t->inp_socket->so_cred)) &&
+                                   !(t->inp_socket->so_flags & SOF_REUSESHAREUID) &&
+                                   (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                                   !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                                   uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
                                        lck_rw_done(pcbinfo->ipi_lock);
                                        socket_lock(so, 0);
                                        return EADDRINUSE;
@@ -339,23 +356,28 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                                                pcbinfo, sin.sin_addr, lport,
                                                INPLOOKUP_WILDCARD);
                                        if (t != NULL &&
-                                           !(t->inp_socket->so_options &
-                                           SO_REUSEPORT) &&
+                                           !(t->inp_socket->so_options & SO_REUSEPORT) &&
                                            (kauth_cred_getuid(so->so_cred) !=
-                                           kauth_cred_getuid(t->inp_socket->
-                                           so_cred)) && (t->inp_laddr.s_addr !=
-                                           INADDR_ANY || SOCK_DOM(so) ==
-                                           SOCK_DOM(t->inp_socket))) {
+                                           kauth_cred_getuid(t->inp_socket->so_cred)) &&
+                                           (t->inp_laddr.s_addr != INADDR_ANY ||
+                                           SOCK_DOM(so) == SOCK_DOM(t->inp_socket)) &&
+                                           (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                                           !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                                           uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
                                                lck_rw_done(pcbinfo->ipi_lock);
                                                socket_lock(so, 0);
                                                return EADDRINUSE;
                                        }
+
                                }
                        }
                        t = in6_pcblookup_local_and_cleanup(pcbinfo,
                            &sin6.sin6_addr, lport, wild);
                        if (t != NULL &&
-                           (reuseport & t->inp_socket->so_options) == 0) {
+                           (reuseport & t->inp_socket->so_options) == 0 &&
+                           (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                           !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                           uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
                                lck_rw_done(pcbinfo->ipi_lock);
                                socket_lock(so, 0);
                                return EADDRINUSE;
@@ -370,7 +392,10 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                                if (t != NULL && (reuseport &
                                    t->inp_socket->so_options) == 0 &&
                                    (t->inp_laddr.s_addr != INADDR_ANY ||
-                                   SOCK_DOM(so) == SOCK_DOM(t->inp_socket))) {
+                                   SOCK_DOM(so) == SOCK_DOM(t->inp_socket)) &&
+                                   (!(t->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                                   !(inp->inp_flags2 & INP2_EXTERNAL_PORT) ||
+                                   uuid_compare(t->necp_client_uuid, inp->necp_client_uuid) != 0)) {
                                        lck_rw_done(pcbinfo->ipi_lock);
                                        socket_lock(so, 0);
                                        return EADDRINUSE;
@@ -546,9 +571,7 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
        struct socket *so = inp->inp_socket;
 
 #if CONTENT_FILTER
-       if (so) {
-               so->so_state_change_cnt++;
-       }
+       so->so_state_change_cnt++;
 #endif
 
        if (so->so_proto->pr_protocol == IPPROTO_UDP &&
@@ -700,6 +723,7 @@ in6_pcbdetach(struct inpcb *inp)
                        inp->in6p_options = NULL;
                }
                ip6_freepcbopts(inp->in6p_outputopts);
+               inp->in6p_outputopts = NULL;
                ROUTE_RELEASE(&inp->in6p_route);
                /* free IPv4 related resources in case of mapped addr */
                if (inp->inp_options != NULL) {
@@ -1180,6 +1204,12 @@ in6_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
                        continue;
                }
 
+#if NECP
+               if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
+                       continue;
+               }
+#endif /* NECP */
+
                if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) &&
                    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
                    inp->inp_fport == fport &&
@@ -1211,6 +1241,12 @@ in6_pcblookup_hash_exists(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
                                continue;
                        }
 
+#if NECP
+                       if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
+                               continue;
+                       }
+#endif /* NECP */
+
                        if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
                            inp->inp_lport == lport) {
                                if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
@@ -1277,6 +1313,12 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
                        continue;
                }
 
+#if NECP
+               if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
+                       continue;
+               }
+#endif /* NECP */
+
                if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) &&
                    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
                    inp->inp_fport == fport &&
@@ -1309,6 +1351,12 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
                                continue;
                        }
 
+#if NECP
+                       if (!necp_socket_is_allowed_to_recv_on_interface(inp, ifp)) {
+                               continue;
+                       }
+#endif /* NECP */
+
                        if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
                            inp->inp_lport == lport) {
                                if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
index 78d084ee68c96673b667f00debb736ef6246bbca..7a1f23c44eb45b1f36b1f4ce704cd9481395667f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <netinet6/esp6.h>
 #endif
 #endif
-#include <netinet6/ipcomp.h>
-#if INET6
-#include <netinet6/ipcomp6.h>
-#endif
 #endif /*IPSEC*/
 
 #include <netinet6/ip6protosw.h>
@@ -190,6 +186,8 @@ struct ip6protosw inet6sw[] = {
                .pr_lock =              udp_lock,
                .pr_unlock =            udp_unlock,
                .pr_getlock =           udp_getlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_STREAM,
@@ -208,6 +206,8 @@ struct ip6protosw inet6sw[] = {
                .pr_lock =              tcp_lock,
                .pr_unlock =            tcp_unlock,
                .pr_getlock =           tcp_getlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_RAW,
@@ -222,6 +222,8 @@ struct ip6protosw inet6sw[] = {
 #endif /* !INET */
                .pr_usrreqs =           &rip6_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_RAW,
@@ -234,6 +236,8 @@ struct ip6protosw inet6sw[] = {
                .pr_init =              icmp6_init,
                .pr_usrreqs =           &rip6_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_DGRAM,
@@ -246,6 +250,8 @@ struct ip6protosw inet6sw[] = {
                .pr_init =              icmp6_init,
                .pr_usrreqs =           &icmp6_dgram_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
        {
                .pr_type =              SOCK_RAW,
@@ -286,16 +292,6 @@ struct ip6protosw inet6sw[] = {
                .pr_usrreqs =           &nousrreqs,
        },
 #endif /* IPSEC_ESP */
-       {
-               .pr_type =              SOCK_RAW,
-               .pr_protocol =          IPPROTO_IPCOMP,
-               .pr_flags =             PR_ATOMIC | PR_ADDR | PR_PROTOLOCK,
-               .pr_input =             ipcomp6_input,
-#if !INET       /* don't call initialization and timeout routines twice */
-               .pr_init =              ipcomp_init,
-#endif /* !INET */
-               .pr_usrreqs =           &nousrreqs,
-       },
 #endif /* IPSEC */
 #if INET
        {
@@ -308,6 +304,8 @@ struct ip6protosw inet6sw[] = {
                .pr_init =              encap6_init,
                .pr_usrreqs =           &rip6_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
 #endif /*INET*/
        {
@@ -320,6 +318,8 @@ struct ip6protosw inet6sw[] = {
                .pr_init =              encap6_init,
                .pr_usrreqs =           &rip6_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
 /* raw wildcard */
        {
@@ -331,6 +331,8 @@ struct ip6protosw inet6sw[] = {
                .pr_ctloutput =         rip6_ctloutput,
                .pr_usrreqs =           &rip6_usrreqs,
                .pr_unlock =            rip_unlock,
+               .pr_update_last_owner = inp_update_last_owner,
+               .pr_copy_last_owner =   inp_copy_last_owner,
        },
 };
 
@@ -398,6 +400,10 @@ in6_dinit(struct domain *dp)
            offsetof(struct protosw, pr_filter_head));
        _CASSERT(offsetof(struct ip6protosw, pr_old) ==
            offsetof(struct protosw, pr_old));
+       _CASSERT(offsetof(struct ip6protosw, pr_update_last_owner) ==
+           offsetof(struct protosw, pr_update_last_owner));
+       _CASSERT(offsetof(struct ip6protosw, pr_copy_last_owner) ==
+           offsetof(struct protosw, pr_copy_last_owner));
 
        /*
         * Attach first, then initialize.  ip6_init() needs raw IP6 handler.
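The two added _CASSERTs extend an existing pattern: struct ip6protosw is used interchangeably with struct protosw, so every shared member must sit at the same byte offset in both views. A compile-time sketch of the same invariant using C11 _Static_assert and stand-in structs (not the real protosw types):

#include <stddef.h>

struct sw_base  { void (*update_last_owner)(void); void (*copy_last_owner)(void); };
struct sw_inet6 { void (*update_last_owner)(void); void (*copy_last_owner)(void); };

_Static_assert(offsetof(struct sw_inet6, update_last_owner) ==
    offsetof(struct sw_base, update_last_owner),
    "pr_update_last_owner must line up in both views");
_Static_assert(offsetof(struct sw_inet6, copy_last_owner) ==
    offsetof(struct sw_base, copy_last_owner),
    "pr_copy_last_owner must line up in both views");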
index 85e2dc7feb7a48af5030eb010ff82ad351906e6f..f018e5111f0ecfe9aff7851ea87f37510f269bef 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/route.h>
+#include <net/restricted_in_port.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
+
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
@@ -257,10 +259,8 @@ in6_selectsrc_core_ifa(struct sockaddr_in6 *addr, struct ifnet *ifp, int srcsel_
        if ((ifa->ifa_debug & IFD_DETACHING) != 0) {
                err = EHOSTUNREACH;
                ifnet_lock_done(ifp);
-               if (ifa != NULL) {
-                       IFA_REMREF(ifa);
-                       ifa = NULL;
-               }
+               IFA_REMREF(ifa);
+               ifa = NULL;
                goto done;
        }
        ifnet_lock_done(ifp);
@@ -672,6 +672,9 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
                if (INP_NO_EXPENSIVE(inp)) {
                        ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
                }
+               if (INP_NO_CONSTRAINED(inp)) {
+                       ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED;
+               }
                if (INP_AWDL_UNRESTRICTED(inp)) {
                        ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
                }
@@ -843,6 +846,7 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock,
        boolean_t select_srcif, proxied_ifa = FALSE, local_dst = FALSE;
        unsigned int ifscope = ((ip6oa != NULL) ?
            ip6oa->ip6oa_boundif : IFSCOPE_NONE);
+       boolean_t is_direct = FALSE;
 
        if (retifp != NULL) {
                *retifp = NULL;
@@ -868,15 +872,49 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock,
        }
 
        /*
-        * Perform source interface selection only if Scoped Routing
+        * Perform source interface selection if Scoped Routing
         * is enabled and the source address isn't unspecified.
         */
        select_srcif = (srcsock != NULL &&
            !IN6_IS_ADDR_UNSPECIFIED(&srcsock->sin6_addr));
 
+       /*
+        * For scoped routing, if the interface scope is 0, the src/dst addr
+        * is link-local, or the dst addr is multicast, source interface
+        * selection must still be performed even if the destination is
+        * directly reachable.
+        */
+       if (ifscope != IFSCOPE_NONE &&
+           !(srcsock != NULL && IN6_IS_ADDR_LINKLOCAL(&srcsock->sin6_addr)) &&
+           !IN6_IS_ADDR_MULTICAST(dst) && !IN6_IS_ADDR_LINKLOCAL(dst)) {
+               struct rtentry *temp_rt = NULL;
+
+               lck_mtx_lock(rnh_lock);
+               temp_rt = rt_lookup(TRUE, (struct sockaddr *)dstsock,
+                   NULL, rt_tables[AF_INET6], ifscope);
+               lck_mtx_unlock(rnh_lock);
+
+               /*
+                * If the destination is directly reachable, relax
+                * the behavior around select_srcif, i.e. don't force
+                * the packet to go out from the interface that is hosting
+                * the source address.
+                * This matters when we share v6 connectivity via NAT66
+                * and want the external interface's v6 address to remain
+                * reachable from the clients we are sharing that
+                * connectivity with.
+                */
+               if (temp_rt != NULL) {
+                       if ((temp_rt->rt_flags & RTF_GATEWAY) == 0) {
+                               select_srcif = FALSE;
+                               is_direct = TRUE;
+                       }
+                       rtfree(temp_rt);
+               }
+       }
+
        if (ip6_select_srcif_debug) {
-               printf("%s src %s dst %s ifscope %d select_srcif %d\n",
-                   __func__, s_src, s_dst, ifscope, select_srcif);
+               printf("%s src %s dst %s ifscope %d is_direct %d select_srcif %d\n",
+                   __func__, s_src, s_dst, ifscope, is_direct, select_srcif);
        }
 
        /* If the caller specified the outgoing interface explicitly, use it */
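The new block performs a scoped route lookup purely to classify the destination: if the matched route carries no RTF_GATEWAY, the destination is on-link and source-interface steering is relaxed. A schematic model of that decision with stub types (not the kernel's rtentry):

#include <stdbool.h>

struct rt_model { bool has_gateway; };  /* stands in for rt_flags & RTF_GATEWAY */

static bool
keep_select_srcif(const struct rt_model *rt, bool scoped, bool special_addr)
{
        /* special_addr models the link-local/multicast exclusions above */
        if (!scoped || special_addr || rt == NULL)
                return true;        /* no usable scoped route: steer by source */
        return rt->has_gateway;     /* on-link destination: relax the steering */
}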
@@ -1292,6 +1330,8 @@ done:
            IFNET_IS_CELLULAR(_ifp)) ||                         \
        (((_ip6oa)->ip6oa_flags & IP6OAF_NO_EXPENSIVE) &&       \
            IFNET_IS_EXPENSIVE(_ifp)) ||                        \
+       (((_ip6oa)->ip6oa_flags & IP6OAF_NO_CONSTRAINED) &&     \
+           IFNET_IS_CONSTRAINED(_ifp)) ||                      \
        (!((_ip6oa)->ip6oa_flags & IP6OAF_INTCOPROC_ALLOWED) && \
            IFNET_IS_INTCOPROC(_ifp)) ||                        \
        (!((_ip6oa)->ip6oa_flags & IP6OAF_AWDL_UNRESTRICTED) && \
@@ -1594,6 +1634,15 @@ in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct proc *p,
                        }
                }
                lport = htons(*lastport);
+
+               /*
+                * Skip if this is a restricted port, as we do not want
+                * to hand out restricted ports as ephemeral ports.
+                */
+               if (IS_RESTRICTED_IN_PORT(lport)) {
+                       continue;
+               }
+
                found = (in6_pcblookup_local(pcbinfo, &inp->in6p_laddr,
                    lport, wild) == NULL);
        } while (!found);
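With the IS_RESTRICTED_IN_PORT check in place, the ephemeral allocator rejects administratively restricted ports the same way it rejects ports already in use. A user-space sketch of the loop's shape, with placeholder predicates standing in for the kernel lookups:

#include <stdbool.h>
#include <stdint.h>

static bool port_restricted(uint16_t p) { return p == 5060; }  /* placeholder rule */
static bool port_in_use(uint16_t p)     { return p < 49160;  } /* placeholder rule */

static uint16_t
pick_ephemeral(uint16_t first, uint16_t last)
{
        for (uint32_t p = first; p <= last; p++) {
                if (port_restricted((uint16_t)p))   /* ~ IS_RESTRICTED_IN_PORT(lport) */
                        continue;
                if (!port_in_use((uint16_t)p))      /* ~ in6_pcblookup_local() == NULL */
                        return (uint16_t)p;
        }
        return 0;                                   /* range exhausted */
}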
index baa9a541ba1117c123640f1e7f7a1efef234fbca..b7c08035205e54f76de0f2b2e13caa6e7f8ffff6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -784,23 +784,27 @@ extern u_char inet6ctlerrmap[];
 extern u_int32_t in6_maxmtu;
 
 /* N.B.: if_inet6data is never freed once set, so we don't need to lock */
-#define in6_ifstat_inc_common(_ifp, _tag, _atomic) do {                 \
+#define in6_ifstat_add_common(_ifp, _tag, _count, _atomic) do {         \
        if (_ifp != NULL && IN6_IFEXTRA(_ifp) != NULL) {                \
                if (_atomic)                                            \
                        atomic_add_64(                                  \
-                           &IN6_IFEXTRA(_ifp)->in6_ifstat._tag, 1);    \
+                           &IN6_IFEXTRA(_ifp)->in6_ifstat._tag, _count);\
                else                                                    \
-                       IN6_IFEXTRA(_ifp)->in6_ifstat._tag++;           \
+                       IN6_IFEXTRA(_ifp)->in6_ifstat._tag += _count;   \
        }                                                               \
 } while (0)
 
 /* atomic version */
 #define in6_ifstat_inc(_ifp, _tag) \
-       in6_ifstat_inc_common(_ifp, _tag, TRUE)
+       in6_ifstat_add_common(_ifp, _tag, 1, TRUE)
 
 /* non-atomic version (for fast paths) */
 #define in6_ifstat_inc_na(_ifp, _tag) \
-       in6_ifstat_inc_common(_ifp, _tag, FALSE)
+       in6_ifstat_add_common(_ifp, _tag, 1, FALSE)
+
+/* atomic add version */
+#define in6_ifstat_add(_ifp, _tag, _count) \
+       in6_ifstat_add_common(_ifp, _tag, _count, TRUE)
 
 /*
  * Macro for finding the internet address structure (in6_ifaddr) corresponding
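The rewrite turns the increment macro into a general add so callers can account a whole packet chain in one shot. A toy version of the same layering using C11 atomics in place of the kernel's atomic_add_64 (names are illustrative):

#include <stdatomic.h>
#include <stdint.h>

struct ifstat_model { _Atomic uint64_t out_forward; };

#define ifstat_add(_s, _field, _n)  atomic_fetch_add(&(_s)->_field, (_n))
#define ifstat_inc(_s, _field)      ifstat_add(_s, _field, 1)

With the general form, ip6_forward (below) can charge pktcnt packets with a single in6_ifstat_add call rather than bumping the counter once per packet.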
index 8498211c71ee3168bec6da27b85e2282e3984cc9..57c44b85c52975c2de337e13366f3d7fef5f917d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2009-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -132,7 +132,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt,
        boolean_t proxy = FALSE;
        struct mbuf *mcopy = NULL;
        struct ifnet *ifp, *rcvifp, *origifp;   /* maybe unnecessary */
-       u_int32_t inzone, outzone, len;
+       u_int32_t inzone, outzone, len = 0, pktcnt = 0;
        struct in6_addr src_in6, dst_in6;
        uint64_t curtime = net_uptime();
 #if IPSEC
@@ -141,7 +141,10 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt,
        unsigned int ifscope = IFSCOPE_NONE;
 #if PF
        struct pf_mtag *pf_mtag;
+       struct pf_fragment_tag *pf_ftagp, pf_ftag;
+       boolean_t pf_ftag_valid = FALSE;
 #endif /* PF */
+       uint32_t mpktlen = 0;
 
        /*
         * In the prefix proxying case, the route to the proxied node normally
@@ -164,10 +167,23 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt,
 
 #if PF
        pf_mtag = pf_find_mtag(m);
+       /*
+        * Save the PF fragmentation metadata, as m_copy() removes the
+        * mbuf tags from the original mbuf.
+        */
+       pf_ftagp = pf_find_fragment_tag(m);
+       if (pf_ftagp != NULL) {
+               ASSERT(pf_mtag->pftag_flags & PF_TAG_REASSEMBLED);
+               pf_ftag = *pf_ftagp;
+               pf_ftag_valid = TRUE;
+               mpktlen = pf_ftag.ft_maxlen;
+               ASSERT(mpktlen);
+       }
        if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) {
                ifscope = pf_mtag->pftag_rtableid;
        }
-
+       pf_mtag = NULL;
+       pf_ftagp = NULL;
        /*
         * If the caller provides a route which is on a different interface
         * than the one specified for scoped forwarding, discard the route
@@ -543,7 +559,11 @@ skip_ipsec:
                return NULL;
        }
 
-       if (m->m_pkthdr.len > rt->rt_ifp->if_mtu) {
+       if (mpktlen == 0) {
+               mpktlen = m->m_pkthdr.len;
+       }
+
+       if (mpktlen > rt->rt_ifp->if_mtu) {
                in6_ifstat_inc(rt->rt_ifp, ifs6_in_toobig);
                if (mcopy) {
                        uint32_t mtu;
@@ -704,6 +724,14 @@ skip_ipsec:
 
 #if PF
        if (PF_IS_ENABLED) {
+               /*
+                * PF refragments any packet that it reassembled due to scrub
+                * rules, in which case it will set the PF_TAG_REFRAGMENTED
+                * flag in the PF mbuf tag.
+                */
+               if (pf_ftag_valid) {
+                       pf_copy_fragment_tag(m, &pf_ftag, M_DONTWAIT);
+               }
 #if DUMMYNET
                struct ip_fw_args args;
                bzero(&args, sizeof(args));
@@ -729,6 +757,31 @@ skip_ipsec:
                        /* Already freed by callee */
                        goto senderr;
                }
+
+               pf_mtag = pf_find_mtag(m);
+               /*
+                * refragmented packets from PF.
+                */
+               if ((pf_mtag->pftag_flags & PF_TAG_REFRAGMENTED) != 0) {
+                       struct mbuf *t;
+
+                       pf_mtag->pftag_flags &= ~PF_TAG_REFRAGMENTED;
+                       /* for statistics */
+                       t = m;
+                       while (t != NULL) {
+                               pktcnt++;
+                               len += m_pktlen(t);
+                               t = t->m_nextpkt;
+                       }
+
+                       /*
+                        * nd6_output() frees the packet chain in both the
+                        * success and failure cases.
+                        */
+                       error = nd6_output(ifp, origifp, m, dst, rt, NULL);
+                       m = NULL;
+                       goto sent;
+               }
                /*
                 * We do not use ip6 header again in the code below,
                 * however still adding the bit here so that any new
@@ -740,21 +793,23 @@ skip_ipsec:
 #endif /* PF */
 
        len = m_pktlen(m);
+       pktcnt = 1;
        error = nd6_output(ifp, origifp, m, dst, rt, NULL);
+sent:
        if (error) {
-               in6_ifstat_inc(ifp, ifs6_out_discard);
-               ip6stat.ip6s_cantforward++;
+               in6_ifstat_add(ifp, ifs6_out_discard, pktcnt);
+               ip6stat.ip6s_cantforward += pktcnt;
        } else {
                /*
                 * Increment stats on the source interface; the ones
                 * for destination interface has been taken care of
                 * during output above by virtue of PKTF_FORWARDED.
                 */
-               rcvifp->if_fpackets++;
+               rcvifp->if_fpackets += pktcnt;
                rcvifp->if_fbytes += len;
 
-               ip6stat.ip6s_forward++;
-               in6_ifstat_inc(ifp, ifs6_out_forward);
+               ip6stat.ip6s_forward += pktcnt;
+               in6_ifstat_add(ifp, ifs6_out_forward, pktcnt);
                if (type) {
                        ip6stat.ip6s_redirectsent++;
                } else {
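For a chain that PF refragmented, the statistics walk above visits each packet via m_nextpkt before handing the whole chain to nd6_output(), so the forwarding counters can be charged per packet. A self-contained model of that walk (stub mbuf type):

#include <stddef.h>
#include <stdint.h>

struct mbuf_model {
        uint32_t           pkt_len;     /* ~ m_pktlen(t) */
        struct mbuf_model *m_nextpkt;   /* next packet in the chain */
};

static void
count_chain(const struct mbuf_model *m, uint32_t *pktcnt, uint32_t *len)
{
        for (const struct mbuf_model *t = m; t != NULL; t = t->m_nextpkt) {
                (*pktcnt)++;
                *len += t->pkt_len;
        }
}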
index b767c8edbbe20ac66e6bcf1e6769d6c561b349f2..508274bf4005ddf637897ccb1c73fb5f72611890 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2009-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -149,27 +149,43 @@ struct randomtab {
 };
 
 static struct randomtab randomtab_32 = {
-       32,                     /* resulting bits */
-       180,                    /* Time after wich will be reseeded */
-       1000000000,             /* Uniq cycle, avoid blackjack prediction */
-       2,                      /* Starting generator */
-       2147483629,             /* RU_N-1 = 2^2*3^2*59652323 */
-       7,                      /* determine ru_a as RU_AGEN^(2*rand) */
-       1836660096,             /* RU_M = 2^7*3^15 - don't change */
-       { 2, 3, 59652323, 0 },  /* factors of ru_n */
-       0, 0, 0, 0, 0, 0, 0, 0, 0
+       .ru_bits = 32,          /* resulting bits */
+       .ru_out = 180,          /* Time after which it will be reseeded */
+       .ru_max = 1000000000,   /* Unique cycle, avoid blackjack prediction */
+       .ru_gen = 2,            /* Starting generator */
+       .ru_n = 2147483629,     /* RU_N-1 = 2^2*3^2*59652323 */
+       .ru_agen = 7,           /* determine ru_a as RU_AGEN^(2*rand) */
+       .ru_m = 1836660096,     /* RU_M = 2^7*3^15 - don't change */
+       .pfacts = { 2, 3, 59652323, 0 },        /* factors of ru_n */
+       .ru_counter = 0,
+       .ru_msb = 0,
+       .ru_x = 0,
+       .ru_seed = 0,
+       .ru_seed2 = 0,
+       .ru_a = 0,
+       .ru_b = 0,
+       .ru_g = 0,
+       .ru_reseed = 0
 };
 
 static struct randomtab randomtab_20 = {
-       20,                     /* resulting bits */
-       180,                    /* Time after wich will be reseeded */
-       200000,                 /* Uniq cycle, avoid blackjack prediction */
-       2,                      /* Starting generator */
-       524269,                 /* RU_N-1 = 2^2*3^2*14563 */
-       7,                      /* determine ru_a as RU_AGEN^(2*rand) */
-       279936,                 /* RU_M = 2^7*3^7 - don't change */
-       { 2, 3, 14563, 0 },     /* factors of ru_n */
-       0, 0, 0, 0, 0, 0, 0, 0, 0
+       .ru_bits = 20,                  /* resulting bits */
+       .ru_out = 180,                  /* Time after which it will be reseeded */
+       .ru_max = 200000,               /* Unique cycle, avoid blackjack prediction */
+       .ru_gen = 2,                    /* Starting generator */
+       .ru_n = 524269,                 /* RU_N-1 = 2^2*3^2*14563 */
+       .ru_agen = 7,                   /* determine ru_a as RU_AGEN^(2*rand) */
+       .ru_m = 279936,                 /* RU_M = 2^7*3^7 - don't change */
+       .pfacts = { 2, 3, 14563, 0 },   /* factors of ru_n */
+       .ru_counter = 0,
+       .ru_msb = 0,
+       .ru_x = 0,
+       .ru_seed = 0,
+       .ru_seed2 = 0,
+       .ru_a = 0,
+       .ru_b = 0,
+       .ru_g = 0,
+       .ru_reseed = 0
 };
 
 static u_int32_t pmod(u_int32_t, u_int32_t, u_int32_t);
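The randomtab rewrite above is purely stylistic: positional initializers become designated ones, so each value is tied to its field name and survives member reordering, and trailing zero members can be spelled out or omitted (static storage zero-fills). The idiom in miniature:

struct tab { int bits; int out; int max; int counter; };

static struct tab positional = { 32, 180, 1000000000, 0 };  /* old style */
static struct tab designated = {                            /* new style */
        .bits = 32,
        .out  = 180,
        .max  = 1000000000,
        /* .counter is implicitly zero */
};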
index b35e7f5015278d51ec05a082a3c558df7eaf8b88..dad053c63c76617a42162830c856145c6f403007 100644 (file)
@@ -803,9 +803,6 @@ ip6_input(struct mbuf *m)
                }
        }
 
-       /* for consistency */
-       m->m_pkthdr.pkt_proto = ip6->ip6_nxt;
-
 #if DUMMYNET
 check_with_pf:
 #endif /* DUMMYNET */
@@ -928,9 +925,9 @@ check_with_pf:
                lck_rw_done(&in6_ifaddr_rwlock);
                ia6 = NULL;
                /* address is not ready, so discard the packet. */
-               nd6log((LOG_INFO, "%s: packet to an unready address %s->%s\n",
+               nd6log(info, "%s: packet to an unready address %s->%s\n",
                    __func__, ip6_sprintf(&ip6->ip6_src),
-                   ip6_sprintf(&ip6->ip6_dst)));
+                   ip6_sprintf(&ip6->ip6_dst));
                goto bad;
        }
        lck_rw_done(&in6_ifaddr_rwlock);
@@ -1000,9 +997,9 @@ check_with_pf:
                RT_UNLOCK(rin6.ro_rt);
                ia6 = NULL;
                /* address is not ready, so discard the packet. */
-               nd6log((LOG_INFO, "%s: packet to an unready address %s->%s\n",
+               nd6log(error, "%s: packet to an unready address %s->%s\n",
                    __func__, ip6_sprintf(&ip6->ip6_src),
-                   ip6_sprintf(&ip6->ip6_dst)));
+                   ip6_sprintf(&ip6->ip6_dst));
                goto bad;
        }
 
@@ -1679,9 +1676,9 @@ ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp,
                // Send ECN flags for v4-mapped addresses
                if ((inp->inp_flags & IN6P_TCLASS) != 0) {
                        struct ip *ip_header = mtod(m, struct ip *);
-                       u_int8_t tos = (ip_header->ip_tos & IPTOS_ECN_MASK);
 
-                       mp = sbcreatecontrol_mbuf((caddr_t)&tos, sizeof(tos),
+                       int tclass = (int)(ip_header->ip_tos);
+                       mp = sbcreatecontrol_mbuf((caddr_t)&tclass, sizeof(tclass),
                            IPV6_TCLASS, IPPROTO_IPV6, mp);
                        if (*mp == NULL) {
                                return NULL;
index a468a93f60837f2e7ef7c7768497e3d94534c77f..b87f6ba7bf5b9d9ac648ff4cc90af78b1beb610b 100644 (file)
@@ -170,7 +170,7 @@ static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
     struct ip6_frag **);
 static int ip6_getpmtu(struct route_in6 *, struct route_in6 *,
-    struct ifnet *, struct in6_addr *, u_int32_t *, boolean_t *);
+    struct ifnet *, struct in6_addr *, u_int32_t *);
 static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *,
     struct sockopt *sopt);
 static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, int);
@@ -185,7 +185,7 @@ static void ip6_output_checksum(struct ifnet *, uint32_t, struct mbuf *,
 extern int udp_ctloutput(struct socket *, struct sockopt *);
 static int ip6_fragment_packet(struct mbuf **m,
     struct ip6_pktopts *opt, struct ip6_exthdrs *exthdrsp, struct ifnet *ifp,
-    uint32_t mtu, boolean_t alwaysfrag, uint32_t unfragpartlen,
+    uint32_t mtu, uint32_t unfragpartlen,
     struct route_in6 *ro_pmtu, int nxt0, uint32_t optlen);
 
 SYSCTL_DECL(_net_inet6_ip6);
@@ -285,7 +285,6 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt,
        int error = 0;
        struct in6_ifaddr *ia = NULL, *src_ia = NULL;
        u_int32_t mtu = 0;
-       boolean_t alwaysfrag = FALSE;
        u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
        struct ip6_rthdr *rh;
        struct in6_addr finaldst;
@@ -295,6 +294,11 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt,
        uint32_t pktcnt = 0;
        uint32_t packets_processed = 0;
        struct timeval start_tv;
+#if PF
+       boolean_t skip_pf = (ip6oa != NULL) &&
+           (ip6oa->ip6oa_flags & IP6OAF_SKIP_PF);
+#endif
+
 #if DUMMYNET
        struct m_tag *tag;
        struct ip6_out_args saved_ip6oa;
@@ -399,7 +403,6 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt,
                        ifnet_reference(origifp);
                }
                mtu = dn_tag->dn_mtu;
-               alwaysfrag = (dn_tag->dn_alwaysfrag != 0);
                unfragpartlen = dn_tag->dn_unfragpartlen;
 
                bcopy(&dn_tag->dn_exthdrs, &exthdrs, sizeof(exthdrs));
@@ -473,6 +476,9 @@ tags_done:
                if (ip6oa->ip6oa_flags & IP6OAF_NO_EXPENSIVE) {
                        ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE;
                }
+               if (ip6oa->ip6oa_flags & IP6OAF_NO_CONSTRAINED) {
+                       ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_CONSTRAINED;
+               }
                adv = &ip6oa->ip6oa_flowadv;
                adv->code = FADV_SUCCESS;
                ip6oa->ip6oa_retflags = 0;
@@ -501,7 +507,7 @@ tags_done:
         * only needs to happen once per function entry.
         */
        necp_matched_policy_id = necp_ip6_output_find_policy_match(m, flags,
-           (flags & IPV6_OUTARGS) ? ip6oa : NULL, &necp_result,
+           (flags & IPV6_OUTARGS) ? ip6oa : NULL, ro ? ro->ro_rt : NULL, &necp_result,
            &necp_result_parameter);
 #endif /* NECP */
 
@@ -576,6 +582,9 @@ loopit:
                case NECP_KERNEL_POLICY_RESULT_PASS:
                        goto skip_ipsec;
                case NECP_KERNEL_POLICY_RESULT_DROP:
+                       error = EHOSTUNREACH;
+                       ip6stat.ip6s_necp_policy_drop++;
+                       goto freehdrs;
                case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT:
                        /*
                         * Flow divert packets should be blocked at the IP
@@ -824,7 +833,11 @@ skip_ipsec:
 
        if (!TAILQ_EMPTY(&ipv6_filters) &&
            !((flags & IPV6_OUTARGS) &&
-           (ip6oa->ip6oa_flags & IP6OAF_INTCOPROC_ALLOWED))) {
+           (ip6oa->ip6oa_flags & IP6OAF_INTCOPROC_ALLOWED)
+#if NECP
+           && !necp_packet_should_skip_filters(m)
+#endif // NECP
+           )) {
                struct ipfilter *filter;
                int seen = (inject_filter_ref == NULL);
                int fixscope = 0;
@@ -1354,8 +1367,7 @@ routefound:
        }
 
        /* Determine path MTU. */
-       if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
-           &alwaysfrag)) != 0) {
+       if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu)) != 0) {
                goto bad;
        }
 
@@ -1430,7 +1442,7 @@ routefound:
 check_with_pf:
 #endif /* DUMMYNET */
 #if PF
-       if (PF_IS_ENABLED) {
+       if (PF_IS_ENABLED && !skip_pf) {
 #if DUMMYNET
 
                /*
@@ -1448,7 +1460,6 @@ check_with_pf:
                args.fwa_ro6_pmtu = ro_pmtu;
                args.fwa_origifp = origifp;
                args.fwa_mtu = mtu;
-               args.fwa_alwaysfrag = alwaysfrag;
                args.fwa_unfragpartlen = unfragpartlen;
                args.fwa_exthdrs = &exthdrs;
                /* Invoke outbound packet filter */
@@ -1497,7 +1508,7 @@ check_with_pf:
         * is unchanged.
         */
        error = ip6_fragment_packet(&m, opt,
-           &exthdrs, ifp, mtu, alwaysfrag, unfragpartlen, ro_pmtu, nxt0,
+           &exthdrs, ifp, mtu, unfragpartlen, ro_pmtu, nxt0,
            optlen);
 
        if (error) {
@@ -1654,25 +1665,19 @@ bad:
 /* ip6_fragment_packet
  *
  * The fragmentation logic is rather complex:
- * 1: normal case (dontfrag == 0, alwaysfrag == 0)
+ * 1: normal case (dontfrag == 0)
  * 1-a:        send as is if tlen <= path mtu
  * 1-b:        fragment if tlen > path mtu
  *
  * 2: if user asks us not to fragment (dontfrag == 1)
  * 2-a:        send as is if tlen <= interface mtu
  * 2-b:        error if tlen > interface mtu
- *
- * 3: if we always need to attach fragment header (alwaysfrag == 1)
- *     always fragment
- *
- * 4: if dontfrag == 1 && alwaysfrag == 1
- *     error, as we cannot handle this conflicting request
  */
 
 static int
 ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt,
     struct ip6_exthdrs *exthdrsp, struct ifnet *ifp, uint32_t mtu,
-    boolean_t alwaysfrag, uint32_t unfragpartlen, struct route_in6 *ro_pmtu,
+    uint32_t unfragpartlen, struct route_in6 *ro_pmtu,
     int nxt0, uint32_t optlen)
 {
        VERIFY(NULL != mptr);
@@ -1695,11 +1700,6 @@ ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt,
                }
        }
 
-       if (dontfrag && alwaysfrag) {   /* case 4 */
-               /* conflicting request - can't transmit */
-               return EMSGSIZE;
-       }
-
        /* Access without acquiring nd_ifinfo lock for performance */
        if (dontfrag && tlen > IN6_LINKMTU(ifp)) {      /* case 2-b */
                /*
@@ -1723,9 +1723,9 @@ ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt,
        /*
         * transmit packet without fragmentation
         */
-       if (dontfrag || (!alwaysfrag &&         /* case 1-a and 2-a */
+       if (dontfrag ||
            (tlen <= mtu || TSO_IPV6_OK(ifp, m) ||
-           (ifp->if_hwassist & CSUM_FRAGMENT_IPV6)))) {
+           (ifp->if_hwassist & CSUM_FRAGMENT_IPV6))) {
                /*
                 * mppn not updated in this case because no new chain is formed
                 * and inserted
@@ -1733,12 +1733,24 @@ ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt,
                ip6_output_checksum(ifp, mtu, m, nxt0, tlen, optlen);
        } else {
                /*
-                * time to fragment - cases 1-b and 3 are handled inside
+                * time to fragment - case 1-b is handled inside
                 * ip6_do_fragmentation().
                 * mppn is passed down to be updated to point at fragment chain.
                 */
+               u_int8_t *lexthdrsp;
+
+               if (exthdrsp->ip6e_rthdr != NULL) {
+                       lexthdrsp = mtod(exthdrsp->ip6e_rthdr, uint8_t *);
+               } else if (exthdrsp->ip6e_dest1 != NULL) {
+                       lexthdrsp = mtod(exthdrsp->ip6e_dest1, uint8_t *);
+               } else if (exthdrsp->ip6e_hbh != NULL) {
+                       lexthdrsp = mtod(exthdrsp->ip6e_hbh, uint8_t *);
+               } else {
+                       lexthdrsp = NULL;
+               }
                error = ip6_do_fragmentation(mptr, optlen, ifp,
-                   unfragpartlen, mtod(m, struct ip6_hdr *), exthdrsp, mtu, nxt0);
+                   unfragpartlen, mtod(m, struct ip6_hdr *), lexthdrsp, mtu,
+                   nxt0, htonl(ip6_randomid()));
        }
 
        return error;
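ip6_fragment_packet now resolves the last header of the unfragmentable part itself (routing header first, then the first destination-options header, then hop-by-hop) and hands ip6_do_fragmentation a raw pointer plus a caller-chosen fragment ID. A schematic of that precedence with stub fields:

#include <stddef.h>
#include <stdint.h>

struct exthdrs_model { uint8_t *rthdr, *dest1, *hbh; };

static uint8_t *
last_unfrag_hdr(const struct exthdrs_model *e)
{
        if (e->rthdr != NULL)
                return e->rthdr;    /* routing header wins */
        if (e->dest1 != NULL)
                return e->dest1;    /* else first destination options */
        if (e->hbh != NULL)
                return e->hbh;      /* else hop-by-hop options */
        return NULL;                /* none: patch ip6_nxt directly */
}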
@@ -1749,11 +1761,19 @@ ip6_fragment_packet(struct mbuf **mptr, struct ip6_pktopts *opt,
  * the packet needs to be fragmented. on success, morig is freed and a chain
  * of fragments is linked into the packet chain where morig existed. Otherwise,
  * an errno is returned.
+ * optlen:        total length of all extension headers (excludes the IPv6 header).
+ * unfragpartlen: length of the per-fragment headers which consist of the IPv6
+ *                header plus any extension headers that must be processed by nodes
+ *                en route to the destination.
+ * lexthdrsp:     pointer to the last extension header in the unfragmentable part
+ *                or NULL.
+ * nxt0:          upper-layer protocol number.
+ * id:            identification value to be used in the fragment header.
  */
 int
 ip6_do_fragmentation(struct mbuf **mptr, uint32_t optlen, struct ifnet *ifp,
-    uint32_t unfragpartlen, struct ip6_hdr *ip6, struct ip6_exthdrs *exthdrsp,
-    uint32_t mtu, int nxt0)
+    uint32_t unfragpartlen, struct ip6_hdr *ip6, uint8_t *lexthdrsp,
+    uint32_t mtu, int nxt0, uint32_t id)
 {
        VERIFY(NULL != mptr);
        int error = 0;
@@ -1764,9 +1784,7 @@ ip6_do_fragmentation(struct mbuf **mptr, uint32_t optlen, struct ifnet *ifp,
 
        size_t tlen = morig->m_pkthdr.len;
 
-       /*
-        * try to fragment the packet.  case 1-b and 3
-        */
+       /* try to fragment the packet. case 1-b */
        if ((morig->m_pkthdr.csum_flags & CSUM_TSO_IPV6)) {
                /* TSO and fragment aren't compatible */
                in6_ifstat_inc(ifp, ifs6_out_fragfail);
@@ -1783,7 +1801,6 @@ ip6_do_fragmentation(struct mbuf **mptr, uint32_t optlen, struct ifnet *ifp,
                size_t hlen, len, off;
                struct mbuf **mnext = NULL;
                struct ip6_frag *ip6f;
-               u_int32_t id = htonl(ip6_randomid());
                u_char nextproto;
 
                /*
@@ -1806,15 +1823,9 @@ ip6_do_fragmentation(struct mbuf **mptr, uint32_t optlen, struct ifnet *ifp,
                 * Change the next header field of the last header in the
                 * unfragmentable part.
                 */
-               if (exthdrsp->ip6e_rthdr != NULL) {
-                       nextproto = *mtod(exthdrsp->ip6e_rthdr, u_char *);
-                       *mtod(exthdrsp->ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
-               } else if (exthdrsp->ip6e_dest1 != NULL) {
-                       nextproto = *mtod(exthdrsp->ip6e_dest1, u_char *);
-                       *mtod(exthdrsp->ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
-               } else if (exthdrsp->ip6e_hbh != NULL) {
-                       nextproto = *mtod(exthdrsp->ip6e_hbh, u_char *);
-                       *mtod(exthdrsp->ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
+               if (lexthdrsp != NULL) {
+                       nextproto = *lexthdrsp;
+                       *lexthdrsp = IPPROTO_FRAGMENT;
                } else {
                        nextproto = ip6->ip6_nxt;
                        ip6->ip6_nxt = IPPROTO_FRAGMENT;
@@ -2257,17 +2268,11 @@ ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
 
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro,
-    struct ifnet *ifp, struct in6_addr *dst, u_int32_t *mtup,
-    boolean_t *alwaysfragp)
+    struct ifnet *ifp, struct in6_addr *dst, u_int32_t *mtup)
 {
        u_int32_t mtu = 0;
-       boolean_t alwaysfrag = FALSE;
        int error = 0;
-       boolean_t is_local = FALSE;
 
-       if (IN6_IS_SCOPE_LINKLOCAL(dst)) {
-               is_local = TRUE;
-       }
 
        if (ro_pmtu != ro) {
                /* The first hop and the final destination may differ. */
@@ -2319,17 +2324,6 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro,
                        if (!(ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU)) {
                                ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; /* XXX */
                        }
-               } else if (mtu < IPV6_MMTU) {
-                       /*
-                        * RFC2460 section 5, last paragraph:
-                        * if we record ICMPv6 too big message with
-                        * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
-                        * or smaller, with framgent header attached.
-                        * (fragment header is needed regardless from the
-                        * packet size, for translators to identify packets)
-                        */
-                       alwaysfrag = TRUE;
-                       mtu = IPV6_MMTU;
                }
        } else {
                if (ifp) {
@@ -2341,9 +2335,6 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro,
        }
 
        *mtup = mtu;
-       if ((alwaysfragp != NULL) && !is_local) {
-               *alwaysfragp = alwaysfrag;
-       }
        return error;
 }
 
@@ -2361,6 +2352,7 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt)
        int level, op = -1, optname = 0;
        int optlen = 0;
        struct proc *p;
+       lck_mtx_t *mutex_held = NULL;
 
        VERIFY(sopt != NULL);
 
@@ -2377,6 +2369,22 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt)
                boolean_t capture_exthdrstat_in = FALSE;
                switch (op) {
                case SOPT_SET:
+                       mutex_held = socket_getlock(so, PR_F_WILLUNLOCK);
+                       /*
+                        * Wait if we are in the middle of ip6_output,
+                        * as we unlocked the socket there and do not
+                        * want to overwrite the IP options.
+                        */
+                       if (in6p->inp_sndinprog_cnt > 0) {
+                               in6p->inp_sndingprog_waiters++;
+
+                               while (in6p->inp_sndinprog_cnt > 0) {
+                                       msleep(&in6p->inp_sndinprog_cnt, mutex_held,
+                                           PSOCK | PCATCH, "inp_sndinprog_cnt",
+                                           NULL);
+                               }
+                               in6p->inp_sndingprog_waiters--;
+                       }
                        switch (optname) {
                        case IPV6_2292PKTOPTIONS: {
                                struct mbuf *m;
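The msleep loop added above makes option setters drain in-flight sends before touching the options; the ip6_output side is presumably responsible for waking sleepers as inp_sndinprog_cnt drops to zero (that half is not shown in this hunk). A portable model of the handshake using pthreads in place of msleep/wakeup (all names illustrative):

#include <pthread.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
static int snd_in_progress;     /* plays the role of inp_sndinprog_cnt */

static void
wait_for_sends(void)            /* setsockopt side */
{
        pthread_mutex_lock(&lk);
        while (snd_in_progress > 0)
                pthread_cond_wait(&cv, &lk);    /* ~ msleep(&inp_sndinprog_cnt, ...) */
        pthread_mutex_unlock(&lk);
}

static void
send_done(void)                 /* ip6_output side */
{
        pthread_mutex_lock(&lk);
        if (--snd_in_progress == 0)
                pthread_cond_broadcast(&cv);    /* ~ wakeup() on the counter */
        pthread_mutex_unlock(&lk);
}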
@@ -2923,7 +2931,7 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt)
                                 * the outgoing interface.
                                 */
                                error = ip6_getpmtu(&sro, NULL, NULL,
-                                   &in6p->in6p_faddr, &pmtu, NULL);
+                                   &in6p->in6p_faddr, &pmtu);
                                ROUTE_RELEASE(&sro);
                                if (error) {
                                        break;
@@ -3783,7 +3791,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
                        return EINVAL;
                }
                hlimp = (int *)(void *)buf;
-               if (*hlimp < -1 || *hlimp > 255) {
+               if (*hlimp < -1 || *hlimp > IPV6_MAXHLIM) {
                        return EINVAL;
                }
 
index b0c1b84cb790bd6096c2736f0aea1db684ed243d..3dca8411e4a9491f1f80f28712b02b15ef5ee773 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -448,6 +448,8 @@ struct ip6_out_args {
 #define IP6OAF_QOSMARKING_ALLOWED 0x00000080    /* policy allows Fastlane DSCP marking */
 #define IP6OAF_INTCOPROC_ALLOWED 0x00000100     /* access to internal coproc interfaces */
 #define IP6OAF_NO_LOW_POWER     0x00000200      /* skip low power */
+#define IP6OAF_NO_CONSTRAINED   0x00000400      /* skip IFXF_CONSTRAINED */
+#define IP6OAF_SKIP_PF          0x00000800      /* skip PF */
        u_int32_t       ip6oa_retflags; /* IP6OARF return flags (see below) */
 #define IP6OARF_IFDENIED        0x00000001      /* denied access to interface */
        int             ip6oa_sotc;             /* traffic class for Fastlane DSCP mapping */
@@ -556,7 +558,7 @@ extern struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int);
 extern int ip6_optlen(struct inpcb *);
 extern void ip6_drain(void);
 extern int ip6_do_fragmentation(struct mbuf **, uint32_t, struct ifnet *, uint32_t,
-    struct ip6_hdr *, struct ip6_exthdrs *, uint32_t, int);
+    struct ip6_hdr *, uint8_t *, uint32_t, int, uint32_t);
 
 extern int route6_input(struct mbuf **, int *, int);
 
index a9f8cefea234a6a42c750002582dd52d29b4b7a7..55d1bf799b252df88261e7ce4c81408bd27ef59c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -194,6 +194,12 @@ struct ip6protosw {
         */
        TAILQ_HEAD(, socket_filter) pr_filter_head;
        struct protosw_old *pr_old;
+
+       void    (*pr_update_last_owner) /* update last socket owner */
+       (struct socket *so, struct proc *p, struct proc *ep);
+
+       void    (*pr_copy_last_owner) /* copy last socket owner from listener */
+       (struct socket *so, struct socket *head);
 };
 #endif /* BSD_KERNEL_PRIVATE */
 #endif /* _NETINET6_IP6PROTOSW_H_ */
index 991764ea0859935cfcf719cdbcb3231ac767c81f..ad996afcd0bb31ed71c4d92215f5ac7ac8bef01d 100644 (file)
@@ -53,18 +53,4 @@ struct ipcomp {
 
 #define IPCOMP_CPI_NEGOTIATE_MIN        256
 
-#ifdef BSD_KERNEL_PRIVATE
-struct ipcomp_algorithm {
-       int (*compress)(struct mbuf *, struct mbuf *, size_t *);
-       int (*decompress)(struct mbuf *, struct mbuf *, size_t *);
-       size_t minplen;         /* minimum required length for compression */
-};
-
-struct ipsecrequest;
-extern void ipcomp_init(struct protosw *, struct domain *);
-extern const struct ipcomp_algorithm *ipcomp_algorithm_lookup(int);
-extern void ipcomp4_input(struct mbuf *, int);
-extern int ipcomp4_output(struct mbuf *, struct secasvar *);
-#endif /* BSD_KERNEL_PRIVATE */
-
 #endif /* _NETINET6_IPCOMP_H_ */
diff --git a/bsd/netinet6/ipcomp6.h b/bsd/netinet6/ipcomp6.h
deleted file mode 100644 (file)
index 6703070..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-/*     $FreeBSD: src/sys/netinet6/ipcomp6.h,v 1.1.2.2 2001/07/03 11:01:54 ume Exp $    */
-/*     $KAME: ipcomp.h,v 1.8 2000/09/26 07:55:14 itojun Exp $  */
-
-/*
- * Copyright (C) 1999 WIDE Project.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the project nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * RFC2393 IP payload compression protocol (IPComp).
- */
-
-#ifndef _NETINET6_IPCOMP6_H_
-#define _NETINET6_IPCOMP6_H_
-#include <sys/appleapiopts.h>
-#include <netinet6/ipsec.h>
-
-#ifdef BSD_KERNEL_PRIVATE
-extern int ipcomp6_input(struct mbuf **, int *, int);
-extern int ipcomp6_output(struct mbuf *, u_char *, struct mbuf *,
-    struct secasvar *);
-#endif /* BSD_KERNEL_PRIVATE */
-
-#endif /*_NETINET6_IPCOMP6_H_*/
diff --git a/bsd/netinet6/ipcomp_core.c b/bsd/netinet6/ipcomp_core.c
deleted file mode 100644 (file)
index ef4fc2d..0000000
+++ /dev/null
@@ -1,480 +0,0 @@
-/*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*     $FreeBSD: src/sys/netinet6/ipcomp_core.c,v 1.1.2.2 2001/07/03 11:01:54 ume Exp $        */
-/*     $KAME: ipcomp_core.c,v 1.24 2000/10/23 04:24:22 itojun Exp $    */
-
-/*
- * Copyright (C) 1999 WIDE Project.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the project nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * RFC2393 IP payload compression protocol (IPComp).
- */
-
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/domain.h>
-#include <sys/protosw.h>
-#include <sys/socket.h>
-#include <sys/errno.h>
-#include <sys/time.h>
-#include <sys/kernel.h>
-#include <sys/syslog.h>
-#include <sys/queue.h>
-
-#include <net/if.h>
-#include <net/route.h>
-#if IPCOMP_ZLIB
-#include <libkern/zlib.h>
-#endif
-#include <kern/cpu_number.h>
-
-#include <netinet6/ipcomp.h>
-#if INET6
-#include <netinet6/ipcomp6.h>
-#endif
-#include <netinet6/ipsec.h>
-#if INET6
-#include <netinet6/ipsec6.h>
-#endif
-
-#include <net/net_osdep.h>
-
-#if IPCOMP_ZLIB
-static void *deflate_alloc(void *, u_int, u_int);
-static void deflate_free(void *, void *);
-static int deflate_common(struct mbuf *, struct mbuf *, size_t *, int);
-static int deflate_compress(struct mbuf *, struct mbuf *, size_t *);
-static int deflate_decompress(struct mbuf *, struct mbuf *, size_t *);
-
-/*
- * We need to use default window size (2^15 = 32Kbytes as of writing) for
- * inbound case.  Otherwise we get interop problem.
- * Use negative value to avoid Adler32 checksum.  This is an undocumented
- * feature in zlib (see ipsec wg mailing list archive in January 2000).
- */
-static int deflate_policy = Z_DEFAULT_COMPRESSION;
-static int deflate_window_out = -12;
-static const int deflate_window_in = -1 * MAX_WBITS;    /* don't change it */
-static int deflate_memlevel = MAX_MEM_LEVEL;
-
-static z_stream deflate_stream;
-static z_stream inflate_stream;
-#endif /* IPCOMP_ZLIB */
-
-#if IPCOMP_ZLIB
-static const struct ipcomp_algorithm ipcomp_algorithms[] = {
-       { deflate_compress, deflate_decompress, 90 },
-};
-#else
-static const struct ipcomp_algorithm ipcomp_algorithms[] __unused = {};
-#endif
-
-decl_lck_mtx_data(static, ipcomp_mutex_data);
-static lck_mtx_t *ipcomp_mutex = &ipcomp_mutex_data;
-
-void
-ipcomp_init(struct protosw *pp, struct domain *dp)
-{
-#pragma unused(dp)
-       static int ipcomp_initialized = 0;
-       lck_grp_attr_t *ipcomp_mutex_grp_attr = NULL;
-       lck_attr_t *ipcomp_mutex_attr = NULL;
-       lck_grp_t *ipcomp_mutex_grp = NULL;
-
-       VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
-
-       if (ipcomp_initialized) {
-               return;
-       }
-
-       ipcomp_mutex_grp_attr = lck_grp_attr_alloc_init();
-       ipcomp_mutex_grp = lck_grp_alloc_init("ipcomp", ipcomp_mutex_grp_attr);
-       lck_grp_attr_free(ipcomp_mutex_grp_attr);
-
-       ipcomp_mutex_attr = lck_attr_alloc_init();
-       lck_mtx_init(ipcomp_mutex, ipcomp_mutex_grp, ipcomp_mutex_attr);
-       lck_grp_free(ipcomp_mutex_grp);
-       lck_attr_free(ipcomp_mutex_attr);
-
-       ipcomp_initialized = 1;
-}
-
-const struct ipcomp_algorithm *
-ipcomp_algorithm_lookup(
-#if IPCOMP_ZLIB
-       int idx
-#else
-       __unused int idx
-#endif
-       )
-{
-#if IPCOMP_ZLIB
-       if (idx == SADB_X_CALG_DEFLATE) {
-               /*
-                * Avert your gaze, ugly hack follows!
-                * We init here so our malloc can allocate using M_WAIT.
-                * We don't want to allocate if ipcomp isn't used, and we
-                * don't want to allocate on the input or output path.
-                * Allocation fails if we use M_NOWAIT because init allocates
-                * something like 256k (ouch).
-                */
-               if (deflate_stream.zalloc == NULL) {
-                       deflate_stream.zalloc = deflate_alloc;
-                       deflate_stream.zfree = deflate_free;
-                       if (deflateInit2(&deflate_stream, deflate_policy, Z_DEFLATED,
-                           deflate_window_out, deflate_memlevel, Z_DEFAULT_STRATEGY)) {
-                               /* Allocation failed */
-                               bzero(&deflate_stream, sizeof(deflate_stream));
-#if IPSEC_DEBUG
-                               printf("ipcomp_algorithm_lookup: deflateInit2 failed.\n");
-#endif
-                       }
-               }
-
-               if (inflate_stream.zalloc == NULL) {
-                       inflate_stream.zalloc = deflate_alloc;
-                       inflate_stream.zfree = deflate_free;
-                       if (inflateInit2(&inflate_stream, deflate_window_in)) {
-                               /* Allocation failed */
-                               bzero(&inflate_stream, sizeof(inflate_stream));
-#if IPSEC_DEBUG
-                               printf("ipcomp_algorithm_lookup: inflateInit2 failed.\n");
-#endif
-                       }
-               }
-
-               return &ipcomp_algorithms[0];
-       }
-#endif /* IPCOMP_ZLIB */
-       return NULL;
-}
-
-#if IPCOMP_ZLIB
-static void *
-deflate_alloc(
-       __unused void *aux,
-       u_int items,
-       u_int siz)
-{
-       void *ptr;
-       ptr = _MALLOC(items * siz, M_TEMP, M_NOWAIT);
-       return ptr;
-}
-
-static void
-deflate_free(
-       __unused void *aux,
-       void *ptr)
-{
-       FREE(ptr, M_TEMP);
-}
-
-/* @param mode 0: compress 1: decompress */
-static int
-deflate_common(struct mbuf *m, struct mbuf *md, size_t *lenp, int mode)
-{
-       struct mbuf *mprev;
-       struct mbuf *p;
-       struct mbuf *n = NULL, *n0 = NULL, **np;
-       z_stream *zs;
-       int error = 0;
-       int zerror;
-       size_t offset;
-
-#define MOREBLOCK() \
-do { \
-       /* keep the reply buffer into our chain */              \
-       if (n) {                                                \
-               n->m_len = zs->total_out - offset;              \
-               offset = zs->total_out;                         \
-               *np = n;                                        \
-               np = &n->m_next;                                \
-               n = NULL;                                       \
-       }                                                       \
-                                                                \
-       /* get a fresh reply buffer */                          \
-       MGET(n, M_DONTWAIT, MT_DATA);                           \
-       if (n) {                                                \
-               MCLGET(n, M_DONTWAIT);                          \
-       }                                                       \
-       if (!n) {                                               \
-               error = ENOBUFS;                                \
-               goto fail;                                      \
-       }                                                       \
-       n->m_len = 0;                                           \
-       n->m_len = M_TRAILINGSPACE(n);                          \
-       n->m_next = NULL;                                       \
-       /* \
-        * if this is the first reply buffer, reserve \
-        * region for ipcomp header. \
-        */                                                     \
-       if (*np == NULL) {                                      \
-               n->m_len -= sizeof(struct ipcomp);              \
-               n->m_data += sizeof(struct ipcomp);             \
-       }                                                       \
-                                                                \
-       zs->next_out = mtod(n, u_int8_t *);                     \
-       zs->avail_out = n->m_len;                               \
-} while (0)
-
-       for (mprev = m; mprev && mprev->m_next != md; mprev = mprev->m_next) {
-               ;
-       }
-       if (!mprev) {
-               panic("md is not in m in deflate_common");
-       }
-
-
-       lck_mtx_lock(ipcomp_mutex);
-       zs = mode ? &inflate_stream : &deflate_stream;
-       if (zs->zalloc == NULL) {
-               /*
-                * init is called in ipcomp_algorithm_lookup.
-                * if zs->zalloc is NULL, either init hasn't been called (unlikely)
-                * or init failed because of no memory.
-                */
-               error = ENOBUFS;
-               goto fail;
-       }
-
-       zs->next_in = 0;
-       zs->avail_in = 0;
-       zs->next_out = 0;
-       zs->avail_out = 0;
-
-       n0 = n = NULL;
-       np = &n0;
-       offset = 0;
-       zerror = 0;
-       p = md;
-       while (p && p->m_len == 0) {
-               p = p->m_next;
-       }
-
-       /* input stream and output stream are available */
-       while (p && zs->avail_in == 0) {
-               /* get input buffer */
-               if (p && zs->avail_in == 0) {
-                       zs->next_in = mtod(p, u_int8_t *);
-                       zs->avail_in = p->m_len;
-                       p = p->m_next;
-                       while (p && p->m_len == 0) {
-                               p = p->m_next;
-                       }
-               }
-
-               /* get output buffer */
-               if (zs->next_out == NULL || zs->avail_out == 0) {
-                       MOREBLOCK();
-               }
-
-               zerror = mode ? inflate(zs, Z_NO_FLUSH)
-                   : deflate(zs, Z_NO_FLUSH);
-
-               if (zerror == Z_STREAM_END) {
-                       ; /*once more.*/
-               } else if (zerror == Z_OK) {
-                       /* inflate: Z_OK can indicate the end of decode */
-                       if (mode && !p && zs->avail_out != 0) {
-                               goto terminate;
-                       }
-
-                       /* else once more.*/
-               } else {
-                       if (zs->msg) {
-                               ipseclog((LOG_ERR, "ipcomp_%scompress: "
-                                   "%sflate(Z_NO_FLUSH): %s\n",
-                                   mode ? "de" : "", mode ? "in" : "de",
-                                   zs->msg));
-                       } else {
-                               ipseclog((LOG_ERR, "ipcomp_%scompress: "
-                                   "%sflate(Z_NO_FLUSH): unknown error (%d)\n",
-                                   mode ? "de" : "", mode ? "in" : "de",
-                                   zerror));
-                       }
-                       mode ? inflateReset(zs) : deflateReset(zs);
-/*                     mode ? inflateEnd(zs) : deflateEnd(zs);*/
-                       error = EINVAL;
-                       goto fail;
-               }
-       }
-
-       if (zerror == Z_STREAM_END) {
-               goto terminate;
-       }
-
-       /* termination */
-       while (1) {
-               /* get output buffer */
-               if (zs->next_out == NULL || zs->avail_out == 0) {
-                       MOREBLOCK();
-               }
-
-               zerror = mode ? inflate(zs, Z_FINISH)
-                   : deflate(zs, Z_FINISH);
-
-               if (zerror == Z_STREAM_END) {
-                       break;
-               } else if (zerror == Z_OK) {
-                       ; /*once more.*/
-               } else {
-                       if (zs->msg) {
-                               ipseclog((LOG_ERR, "ipcomp_%scompress: "
-                                   "%sflate(Z_FINISH): %s\n",
-                                   mode ? "de" : "", mode ? "in" : "de",
-                                   zs->msg));
-                       } else {
-                               ipseclog((LOG_ERR, "ipcomp_%scompress: "
-                                   "%sflate(Z_FINISH): unknown error (%d)\n",
-                                   mode ? "de" : "", mode ? "in" : "de",
-                                   zerror));
-                       }
-                       mode ? inflateReset(zs) : deflateReset(zs);
-/*                     mode ? inflateEnd(zs) : deflateEnd(zs); */
-                       error = EINVAL;
-                       goto fail;
-               }
-       }
-
-terminate:
-       /* keep the final reply buffer in our chain */
-       if (n) {
-               n->m_len = zs->total_out - offset;
-               offset = zs->total_out;
-               *np = n;
-               np = &n->m_next;
-               n = NULL;
-       }
-
-       /* switch the mbuf to the new one */
-       mprev->m_next = n0;
-       m_freem(md);
-       *lenp = zs->total_out;
-
-       /* reset the inflate/deflate state */
-       zerror = mode ? inflateReset(zs) : deflateReset(zs);
-       if (zerror != Z_OK) {
-               /*
-                * A failure here is uncommon. If this does
-                * fail, the packet can still be used but
-                * the z_stream will be messed up so subsequent
-                * inflates/deflates will probably fail.
-                */
-               if (zs->msg) {
-                       ipseclog((LOG_ERR, "ipcomp_%scompress: "
-                           "%sflateEnd: %s\n",
-                           mode ? "de" : "", mode ? "in" : "de",
-                           zs->msg));
-               } else {
-                       ipseclog((LOG_ERR, "ipcomp_%scompress: "
-                           "%sflateEnd: unknown error (%d)\n",
-                           mode ? "de" : "", mode ? "in" : "de",
-                           zerror));
-               }
-       }
-
-       lck_mtx_unlock(ipcomp_mutex);
-       return 0;
-
-fail:
-       lck_mtx_unlock(ipcomp_mutex);
-       if (m) {
-               m_freem(m);
-       }
-       if (n) {
-               m_freem(n);
-       }
-       if (n0) {
-               m_freem(n0);
-       }
-       return error;
-#undef MOREBLOCK
-}
-
-static int
-deflate_compress(struct mbuf *m, struct mbuf *md, size_t *lenp)
-{
-       if (!m) {
-               panic("m == NULL in deflate_compress");
-       }
-       if (!md) {
-               panic("md == NULL in deflate_compress");
-       }
-       if (!lenp) {
-               panic("lenp == NULL in deflate_compress");
-       }
-
-       return deflate_common(m, md, lenp, 0);
-}
-
-static int
-deflate_decompress(struct mbuf *m, struct mbuf *md, size_t *lenp)
-{
-       if (!m) {
-               panic("m == NULL in deflate_decompress");
-       }
-       if (!md) {
-               panic("md == NULL in deflate_decompress");
-       }
-       if (!lenp) {
-               panic("lenp == NULL in deflate_decompress");
-       }
-
-       return deflate_common(m, md, lenp, 1);
-}
-#endif /* IPCOMP_ZLIB */
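
The deleted deflate_common() above drives zlib across an mbuf chain: input is fed one mbuf at a time, and MOREBLOCK() grows the output chain whenever avail_out runs dry, finishing with Z_FINISH. A minimal user-space sketch of the same streaming pattern, against flat buffers instead of mbufs (BLOCKSZ and all names here are invented for illustration):

/* Streaming-deflate sketch mirroring deflate_common(): feed input in
 * pieces, grow the output one block at a time, finish with Z_FINISH. */
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

#define BLOCKSZ 2048                    /* stand-in for an mbuf cluster */

static int
deflate_buffer(unsigned char *in, size_t inlen,
    unsigned char **outp, size_t *outlenp)
{
	z_stream zs;
	unsigned char *out = NULL, *tmp;
	size_t cap = 0;
	int zerror;

	memset(&zs, 0, sizeof(zs));     /* NULL zalloc/zfree: use defaults */
	if (deflateInit(&zs, Z_DEFAULT_COMPRESSION) != Z_OK)
		return -1;
	zs.next_in = in;
	zs.avail_in = (uInt)inlen;      /* assume inlen fits a uInt */

	do {
		if (zs.avail_out == 0) {        /* the MOREBLOCK() analogue */
			tmp = realloc(out, cap + BLOCKSZ);
			if (tmp == NULL) {
				deflateEnd(&zs);
				free(out);
				return -1;
			}
			out = tmp;
			cap += BLOCKSZ;
			zs.next_out = out + zs.total_out;
			zs.avail_out = BLOCKSZ;
		}
		/* Z_NO_FLUSH while input remains, then drain with Z_FINISH */
		zerror = deflate(&zs, zs.avail_in ? Z_NO_FLUSH : Z_FINISH);
	} while (zerror == Z_OK);

	if (zerror != Z_STREAM_END) {
		deflateEnd(&zs);
		free(out);
		return -1;
	}
	*outp = out;
	*outlenp = zs.total_out;
	deflateEnd(&zs);
	return 0;
}

The kernel version keeps one long-lived z_stream per direction under ipcomp_mutex and calls inflateReset()/deflateReset() between packets instead of tearing the stream down, which is why a failed Reset only degrades later packets rather than the current one.
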
diff --git a/bsd/netinet6/ipcomp_input.c b/bsd/netinet6/ipcomp_input.c
deleted file mode 100644 (file)
index a50e11d..0000000
+++ /dev/null
@@ -1,362 +0,0 @@
-/*     $FreeBSD: src/sys/netinet6/ipcomp_input.c,v 1.1.2.2 2001/07/03 11:01:54 ume Exp $       */
-/*     $KAME: ipcomp_input.c,v 1.25 2001/03/01 09:12:09 itojun Exp $   */
-
-/*
- * Copyright (C) 1999 WIDE Project.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the project nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * RFC2393 IP payload compression protocol (IPComp).
- */
-
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/mcache.h>
-#include <sys/domain.h>
-#include <sys/protosw.h>
-#include <sys/socket.h>
-#include <sys/errno.h>
-#include <sys/time.h>
-#include <sys/kernel.h>
-#include <sys/syslog.h>
-
-#include <net/if.h>
-#include <net/route.h>
-#include <libkern/zlib.h>
-#include <kern/cpu_number.h>
-#include <kern/locks.h>
-
-#include <netinet/in.h>
-#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
-#include <netinet/ip.h>
-#include <netinet/ip_var.h>
-#include <netinet/ip_ecn.h>
-#include <netinet/kpi_ipfilter_var.h>
-
-#if INET6
-#include <netinet/ip6.h>
-#include <netinet6/ip6_var.h>
-#endif
-#include <netinet6/ipcomp.h>
-#if INET6
-#include <netinet6/ipcomp6.h>
-#endif
-
-#include <netinet6/ipsec.h>
-#if INET6
-#include <netinet6/ipsec6.h>
-#endif
-#include <netkey/key.h>
-#include <netkey/keydb.h>
-
-#include <net/net_osdep.h>
-#include <mach/sdt.h>
-
-#define IPLEN_FLIPPED
-
-void
-ipcomp4_input(struct mbuf *m, int off)
-{
-       struct mbuf *md;
-       struct ip *ip;
-       struct ipcomp *ipcomp;
-       const struct ipcomp_algorithm *algo;
-       u_int16_t cpi;  /* host order */
-       u_int16_t nxt;
-       size_t hlen;
-       int error;
-       size_t newlen, olen;
-       struct secasvar *sav = NULL;
-
-       if (m->m_pkthdr.len < off + sizeof(struct ipcomp)) {
-               ipseclog((LOG_DEBUG, "IPv4 IPComp input: assumption failed "
-                   "(packet too short)\n"));
-               IPSEC_STAT_INCREMENT(ipsecstat.in_inval);
-               goto fail;
-       }
-
-       md = m_pulldown(m, off, sizeof(*ipcomp), NULL);
-       if (!md) {
-               m = NULL;       /*already freed*/
-               ipseclog((LOG_DEBUG, "IPv4 IPComp input: assumption failed "
-                   "(pulldown failure)\n"));
-               IPSEC_STAT_INCREMENT(ipsecstat.in_inval);
-               goto fail;
-       }
-       ipcomp = mtod(md, struct ipcomp *);
-
-       /* Expect 32-bit aligned data pointer on strict-align platforms */
-       MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
-
-       ip = mtod(m, struct ip *);
-       nxt = ipcomp->comp_nxt;
-#ifdef _IP_VHL
-       hlen = IP_VHL_HL(ip->ip_vhl) << 2;
-#else
-       hlen = ip->ip_hl << 2;
-#endif
-
-       cpi = ntohs(ipcomp->comp_cpi);
-
-       if (cpi >= IPCOMP_CPI_NEGOTIATE_MIN) {
-               sav = key_allocsa(AF_INET, (caddr_t)&ip->ip_src,
-                   (caddr_t)&ip->ip_dst, IPPROTO_IPCOMP, htonl(cpi));
-               if (sav != NULL
-                   && (sav->state == SADB_SASTATE_MATURE
-                   || sav->state == SADB_SASTATE_DYING)) {
-                       cpi = sav->alg_enc;     /*XXX*/
-                       /* other parameters to look at? */
-               }
-       }
-       algo = ipcomp_algorithm_lookup(cpi);
-       if (!algo) {
-               ipseclog((LOG_WARNING, "IPv4 IPComp input: unknown cpi %u\n",
-                   cpi));
-               IPSEC_STAT_INCREMENT(ipsecstat.in_nosa);
-               goto fail;
-       }
-
-       /* chop ipcomp header */
-       ipcomp = NULL;
-       md->m_data += sizeof(struct ipcomp);
-       md->m_len -= sizeof(struct ipcomp);
-       m->m_pkthdr.len -= sizeof(struct ipcomp);
-#ifdef IPLEN_FLIPPED
-       ip->ip_len -= sizeof(struct ipcomp);
-#else
-       ip->ip_len = htons(ntohs(ip->ip_len) - sizeof(struct ipcomp));
-#endif
-
-       olen = m->m_pkthdr.len;
-       newlen = m->m_pkthdr.len - off;
-       error = (*algo->decompress)(m, m->m_next, &newlen);
-       if (error != 0) {
-               if (error == EINVAL) {
-                       IPSEC_STAT_INCREMENT(ipsecstat.in_inval);
-               } else if (error == ENOBUFS) {
-                       IPSEC_STAT_INCREMENT(ipsecstat.in_nomem);
-               }
-               m = NULL;
-               goto fail;
-       }
-       IPSEC_STAT_INCREMENT(ipsecstat.in_comphist[cpi]);
-
-       /*
-        * returning decompressed packet onto icmp is meaningless.
-        * mark it decrypted to prevent icmp from attaching original packet.
-        */
-       m->m_flags |= M_DECRYPTED;
-
-       m->m_pkthdr.len = off + newlen;
-       ip = mtod(m, struct ip *);
-       {
-               size_t len;
-#ifdef IPLEN_FLIPPED
-               len = ip->ip_len;
-#else
-               len = ntohs(ip->ip_len);
-#endif
-               /*
-                * be careful about underflow.  also, do not assign exact value
-                * as ip_len is manipulated differently on *BSDs.
-                */
-               len += m->m_pkthdr.len;
-               len -= olen;
-               if (len & ~0xffff) {
-                       /* packet too big after decompress */
-                       IPSEC_STAT_INCREMENT(ipsecstat.in_inval);
-                       goto fail;
-               }
-#ifdef IPLEN_FLIPPED
-               ip->ip_len = len & 0xffff;
-#else
-               ip->ip_len = htons(len & 0xffff);
-#endif
-               ip->ip_p = nxt;
-       }
-
-       if (sav) {
-               key_sa_recordxfer(sav, m);
-               if (ipsec_addhist(m, IPPROTO_IPCOMP, (u_int32_t)cpi) != 0) {
-                       IPSEC_STAT_INCREMENT(ipsecstat.in_nomem);
-                       goto fail;
-               }
-               key_freesav(sav, KEY_SADB_UNLOCKED);
-               sav = NULL;
-       }
-
-       if (nxt != IPPROTO_DONE) {
-               if ((ip_protox[nxt]->pr_flags & PR_LASTHDR) != 0 &&
-                   ipsec4_in_reject(m, NULL)) {
-                       IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
-                       goto fail;
-               }
-
-               DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL,
-                   struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif,
-                   struct ip *, ip, struct ip6_hdr *, NULL);
-
-               ip_proto_dispatch_in(m, off, nxt, 0);
-       } else {
-               m_freem(m);
-       }
-       m = NULL;
-
-       IPSEC_STAT_INCREMENT(ipsecstat.in_success);
-       return;
-
-fail:
-       if (sav) {
-               key_freesav(sav, KEY_SADB_UNLOCKED);
-       }
-
-       if (m) {
-               m_freem(m);
-       }
-       return;
-}
-
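
The length fix-up above adjusts ip_len by the packet-size delta instead of assigning a freshly computed value, since ip_len's byte order and semantics differ across the BSDs at this layer (hence the IPLEN_FLIPPED dance), and it rejects any result that no longer fits IPv4's 16-bit total-length field. The arithmetic, as a standalone sketch with invented names:

#include <stddef.h>
#include <stdint.h>

/* Apply the decompression size delta to the IPv4 total length and
 * reject datagrams that outgrow the 16-bit field, as ipcomp4_input()
 * does after (*algo->decompress)() has rebuilt the payload. */
static int
fixup_ip_len(uint32_t ip_len, size_t old_pkt_len, size_t new_pkt_len,
    uint16_t *out_len)
{
	size_t len = (size_t)ip_len + new_pkt_len - old_pkt_len;

	if (len & ~(size_t)0xffff)
		return -1;      /* packet too big after decompress */
	*out_len = (uint16_t)len;
	return 0;
}
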
-#if INET6
-int
-ipcomp6_input(struct mbuf **mp, int *offp, int proto)
-{
-#pragma unused(proto)
-       struct mbuf *m, *md;
-       int off;
-       struct ip6_hdr *ip6;
-       struct ipcomp *ipcomp;
-       const struct ipcomp_algorithm *algo;
-       u_int16_t cpi;  /* host order */
-       u_int16_t nxt;
-       int error;
-       size_t newlen;
-       struct secasvar *sav = NULL;
-       char *prvnxtp;
-
-       m = *mp;
-       off = *offp;
-
-       md = m_pulldown(m, off, sizeof(*ipcomp), NULL);
-       if (!md) {
-               m = NULL;       /*already freed*/
-               ipseclog((LOG_DEBUG, "IPv6 IPComp input: assumption failed "
-                   "(pulldown failure)\n"));
-               IPSEC_STAT_INCREMENT(ipsec6stat.in_inval);
-               goto fail;
-       }
-       ipcomp = mtod(md, struct ipcomp *);
-
-       /* Expect 32-bit aligned data pointer on strict-align platforms */
-       MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
-
-       ip6 = mtod(m, struct ip6_hdr *);
-       nxt = ipcomp->comp_nxt;
-
-       cpi = ntohs(ipcomp->comp_cpi);
-
-       if (cpi >= IPCOMP_CPI_NEGOTIATE_MIN) {
-               sav = key_allocsa(AF_INET6, (caddr_t)&ip6->ip6_src,
-                   (caddr_t)&ip6->ip6_dst, IPPROTO_IPCOMP, htonl(cpi));
-               if (sav != NULL
-                   && (sav->state == SADB_SASTATE_MATURE
-                   || sav->state == SADB_SASTATE_DYING)) {
-                       cpi = sav->alg_enc;     /*XXX*/
-                       /* other parameters to look at? */
-               }
-       }
-       algo = ipcomp_algorithm_lookup(cpi);
-       if (!algo) {
-               ipseclog((LOG_WARNING, "IPv6 IPComp input: unknown cpi %u; "
-                   "dropping the packet for simplicity\n", cpi));
-               IPSEC_STAT_INCREMENT(ipsec6stat.in_nosa);
-               goto fail;
-       }
-
-       /* chop ipcomp header */
-       ipcomp = NULL;
-       md->m_data += sizeof(struct ipcomp);
-       md->m_len -= sizeof(struct ipcomp);
-       m->m_pkthdr.len -= sizeof(struct ipcomp);
-
-       newlen = m->m_pkthdr.len - off;
-       error = (*algo->decompress)(m, md, &newlen);
-       if (error != 0) {
-               if (error == EINVAL) {
-                       IPSEC_STAT_INCREMENT(ipsec6stat.in_inval);
-               } else if (error == ENOBUFS) {
-                       IPSEC_STAT_INCREMENT(ipsec6stat.in_nomem);
-               }
-               m = NULL;
-               goto fail;
-       }
-       IPSEC_STAT_INCREMENT(ipsec6stat.in_comphist[cpi]);
-       m->m_pkthdr.len = off + newlen;
-
-       /*
-        * returning decompressed packet onto icmp is meaningless.
-        * mark it decrypted to prevent icmp from attaching original packet.
-        */
-       m->m_flags |= M_DECRYPTED;
-
-       /* update next header field */
-       prvnxtp = ip6_get_prevhdr(m, off);
-       *prvnxtp = nxt;
-
-       /*
-        * no need to adjust payload length, as all the IPv6 protocols
-        * look at m->m_pkthdr.len
-        */
-
-       if (sav) {
-               key_sa_recordxfer(sav, m);
-               if (ipsec_addhist(m, IPPROTO_IPCOMP, (u_int32_t)cpi) != 0) {
-                       IPSEC_STAT_INCREMENT(ipsec6stat.in_nomem);
-                       goto fail;
-               }
-               key_freesav(sav, KEY_SADB_UNLOCKED);
-               sav = NULL;
-       }
-       *offp = off;
-       *mp = m;
-       IPSEC_STAT_INCREMENT(ipsec6stat.in_success);
-       return nxt;
-
-fail:
-       if (m) {
-               m_freem(m);
-       }
-       if (sav) {
-               key_freesav(sav, KEY_SADB_UNLOCKED);
-       }
-       return IPPROTO_DONE;
-}
-#endif /* INET6 */
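
Both input paths above hinge on the fixed four-byte IPComp header and on the CPI split from RFC 2393: values below IPCOMP_CPI_NEGOTIATE_MIN name well-known algorithms directly, while negotiated CPIs are mapped back through the SADB (the key_allocsa() lookup) to recover sav->alg_enc. For reference, a sketch of the layout and constants matching the KAME <netinet6/ipcomp.h> this code builds against:

#include <stdint.h>

/* RFC 2393 IPComp header, as consumed by ipcomp4_input()/ipcomp6_input() */
struct ipcomp {
	uint8_t  comp_nxt;      /* next header */
	uint8_t  comp_flags;    /* reserved; must be zero */
	uint16_t comp_cpi;      /* compression parameter index, network order */
};

/* Well-known CPIs (RFC 2393 section 3.3) */
#define IPCOMP_OUI               1      /* reserved */
#define IPCOMP_DEFLATE           2      /* the only one this code implements */
#define IPCOMP_LZS               3
#define IPCOMP_CPI_NEGOTIATE_MIN 256    /* smaller CPIs are well-known */
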
diff --git a/bsd/netinet6/ipcomp_output.c b/bsd/netinet6/ipcomp_output.c
deleted file mode 100644 (file)
index 6abaede..0000000
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*     $FreeBSD: src/sys/netinet6/ipcomp_output.c,v 1.1.2.2 2001/07/03 11:01:54 ume Exp $      */
-/*     $KAME: ipcomp_output.c,v 1.23 2001/01/23 08:59:37 itojun Exp $  */
-
-/*
- * Copyright (C) 1999 WIDE Project.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the project nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * RFC2393 IP payload compression protocol (IPComp).
- */
-
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/domain.h>
-#include <sys/protosw.h>
-#include <sys/socket.h>
-#include <sys/errno.h>
-#include <sys/time.h>
-#include <sys/kernel.h>
-#include <sys/syslog.h>
-
-#include <net/if.h>
-#include <net/route.h>
-#include <libkern/zlib.h>
-#include <kern/cpu_number.h>
-#include <kern/locks.h>
-
-#include <netinet/in.h>
-#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
-#include <netinet/ip.h>
-#include <netinet/ip_var.h>
-#include <netinet/ip_ecn.h>
-
-#if INET6
-#include <netinet/ip6.h>
-#include <netinet6/ip6_var.h>
-#endif
-#include <netinet6/ipcomp.h>
-#if INET6
-#include <netinet6/ipcomp6.h>
-#endif
-
-#include <netinet6/ipsec.h>
-#if INET6
-#include <netinet6/ipsec6.h>
-#endif
-#include <netkey/key.h>
-#include <netkey/keydb.h>
-
-#include <net/net_osdep.h>
-
-
-static int ipcomp_output(struct mbuf *, u_char *, struct mbuf *,
-    int, struct secasvar *sav);
-
-/*
- * Modify the packet so that the payload is compressed.
- * The mbuf (m) must start with IPv4 or IPv6 header.
- * On failure, free the given mbuf and return non-zero.
- *
- * on invocation:
- *     m   nexthdrp md
- *     v   v        v
- *     IP ......... payload
- * during the encryption:
- *     m   nexthdrp mprev md
- *     v   v        v     v
- *     IP ............... ipcomp payload
- *                        <-----><----->
- *                        complen  plen
- *     <-> hlen
- *     <-----------------> compoff
- */
-static int
-ipcomp_output(struct mbuf *m, u_char *nexthdrp, struct mbuf *md, int af, struct secasvar *sav)
-{
-       struct mbuf *n;
-       struct mbuf *md0;
-       struct mbuf *mcopy;
-       struct mbuf *mprev;
-       struct ipcomp *ipcomp;
-       const struct ipcomp_algorithm *algo;
-       u_int16_t cpi;          /* host order */
-       size_t plen0, plen;     /*payload length to be compressed*/
-       size_t compoff;
-       int afnumber;
-       int error = 0;
-       struct ipsecstat *stat;
-
-       switch (af) {
-#if INET
-       case AF_INET:
-               afnumber = 4;
-               stat = &ipsecstat;
-               break;
-#endif
-#if INET6
-       case AF_INET6:
-               afnumber = 6;
-               stat = &ipsec6stat;
-               break;
-#endif
-       default:
-               ipseclog((LOG_ERR, "ipcomp_output: unsupported af %d\n", af));
-               return 0;       /* no change at all */
-       }
-
-       /* grab parameters */
-       algo = ipcomp_algorithm_lookup(sav->alg_enc);
-       if ((ntohl(sav->spi) & ~0xffff) != 0 || !algo) {
-               IPSEC_STAT_INCREMENT(stat->out_inval);
-               m_freem(m);
-               return EINVAL;
-       }
-       if ((sav->flags & SADB_X_EXT_RAWCPI) == 0) {
-               cpi = sav->alg_enc;
-       } else {
-               cpi = ntohl(sav->spi) & 0xffff;
-       }
-
-       /* compute original payload length */
-       plen = 0;
-       for (n = md; n; n = n->m_next) {
-               plen += n->m_len;
-       }
-
-       /* if the payload is short enough, we don't need to compress */
-       if (plen < algo->minplen) {
-               return 0;
-       }
-
-       /*
-        * retain the original packet for two purposes:
-        * (1) we need to back out our changes when compression is not necessary.
-        * (2) byte lifetime computation should use the original packet.
-        *     see RFC2401 page 23.
-        * as a compromise, do two m_copym()s.  we will be going through every
-        * byte of the payload during the compression process anyway.
-        */
-       mcopy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
-       if (mcopy == NULL) {
-               error = ENOBUFS;
-               return 0;
-       }
-       md0 = m_copym(md, 0, M_COPYALL, M_NOWAIT);
-       if (md0 == NULL) {
-               m_freem(mcopy);
-               error = ENOBUFS;
-               return 0;
-       }
-       plen0 = plen;
-
-       /* make the packet over-writable */
-       for (mprev = m; mprev && mprev->m_next != md; mprev = mprev->m_next) {
-               ;
-       }
-       if (mprev == NULL || mprev->m_next != md) {
-               ipseclog((LOG_DEBUG, "ipcomp%d_output: md is not in chain\n",
-                   afnumber));
-               IPSEC_STAT_INCREMENT(stat->out_inval);
-               m_freem(m);
-               m_freem(md0);
-               m_freem(mcopy);
-               return EINVAL;
-       }
-       mprev->m_next = NULL;
-       if ((md = ipsec_copypkt(md)) == NULL) {
-               m_freem(m);
-               m_freem(md0);
-               m_freem(mcopy);
-               error = ENOBUFS;
-               goto fail;
-       }
-       mprev->m_next = md;
-
-       /* compress data part */
-       if ((*algo->compress)(m, md, &plen) || mprev->m_next == NULL) {
-               ipseclog((LOG_ERR, "packet compression failure\n"));
-               m = NULL;
-               m_freem(md0);
-               m_freem(mcopy);
-               IPSEC_STAT_INCREMENT(stat->out_inval);
-               error = EINVAL;
-               goto fail;
-       }
-       IPSEC_STAT_INCREMENT(stat->out_comphist[sav->alg_enc]);
-       md = mprev->m_next;
-
-       /*
-        * if the packet became bigger, it is meaningless to use IPComp.
-        * we've only wasted our cpu time.
-        */
-       if (plen0 < plen) {
-               m_freem(md);
-               m_freem(mcopy);
-               mprev->m_next = md0;
-               return 0;
-       }
-
-       /*
-        * no need to backout change beyond here.
-        */
-       m_freem(md0);
-       md0 = NULL;
-
-       m->m_pkthdr.len -= plen0;
-       m->m_pkthdr.len += plen;
-
-       {
-               /*
-                * insert IPComp header.
-                */
-#if INET
-               struct ip *ip = NULL;
-#endif
-#if INET6
-               struct ip6_hdr *ip6 = NULL;
-#endif
-               size_t hlen = 0; /*ip header len*/
-               size_t complen = sizeof(struct ipcomp);
-
-               switch (af) {
-#if INET
-               case AF_INET:
-                       ip = mtod(m, struct ip *);
-#ifdef _IP_VHL
-                       hlen = IP_VHL_HL(ip->ip_vhl) << 2;
-#else
-                       hlen = ip->ip_hl << 2;
-#endif
-                       break;
-#endif
-#if INET6
-               case AF_INET6:
-                       ip6 = mtod(m, struct ip6_hdr *);
-                       hlen = sizeof(*ip6);
-                       break;
-#endif
-               }
-
-               compoff = m->m_pkthdr.len - plen;
-
-               /*
-                * grow the mbuf to accommodate ipcomp header.
-                * before: IP ... payload
-                * after:  IP ... ipcomp payload
-                */
-               if (M_LEADINGSPACE(md) < complen) {
-                       MGET(n, M_DONTWAIT, MT_DATA);
-                       if (!n) {
-                               m_freem(m);
-                               error = ENOBUFS;
-                               goto fail;
-                       }
-                       n->m_len = complen;
-                       mprev->m_next = n;
-                       n->m_next = md;
-                       m->m_pkthdr.len += complen;
-                       ipcomp = mtod(n, struct ipcomp *);
-               } else {
-                       md->m_len += complen;
-                       md->m_data -= complen;
-                       m->m_pkthdr.len += complen;
-                       ipcomp = mtod(md, struct ipcomp *);
-               }
-
-               bzero(ipcomp, sizeof(*ipcomp));
-               ipcomp->comp_nxt = *nexthdrp;
-               *nexthdrp = IPPROTO_IPCOMP;
-               ipcomp->comp_cpi = htons(cpi);
-               switch (af) {
-#if INET
-               case AF_INET:
-                       if (compoff + complen + plen < IP_MAXPACKET) {
-                               ip->ip_len = htons(compoff + complen + plen);
-                       } else {
-                               ipseclog((LOG_ERR,
-                                   "IPv4 IPComp output: size exceeds limit\n"));
-                               IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
-                               m_freem(m);
-                               error = EMSGSIZE;
-                               goto fail;
-                       }
-                       break;
-#endif
-#if INET6
-               case AF_INET6:
-                       /* total packet length will be computed in ip6_output() */
-                       break;
-#endif
-               }
-       }
-
-       if (!m) {
-               ipseclog((LOG_DEBUG,
-                   "NULL mbuf after compression in ipcomp%d_output",
-                   afnumber));
-               IPSEC_STAT_INCREMENT(stat->out_inval);
-       }
-       IPSEC_STAT_INCREMENT(stat->out_success);
-
-       /* compute byte lifetime against original packet */
-       key_sa_recordxfer(sav, mcopy);
-       m_freem(mcopy);
-
-       return 0;
-
-fail:
-#if 1
-       return error;
-#else
-       panic("something bad in ipcomp_output");
-#endif
-}
-
-#if INET
-int
-ipcomp4_output(struct mbuf *m, struct secasvar *sav)
-{
-       struct ip *ip;
-       if (m->m_len < sizeof(struct ip)) {
-               ipseclog((LOG_DEBUG, "ipcomp4_output: first mbuf too short\n"));
-               IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
-               m_freem(m);
-               return 0;
-       }
-       ip = mtod(m, struct ip *);
-       /* XXX assumes that m->m_next points to payload */
-       return ipcomp_output(m, &ip->ip_p, m->m_next, AF_INET, sav);
-}
-#endif /*INET*/
-
-#if INET6
-int
-ipcomp6_output(
-       struct mbuf *m,
-       u_char *nexthdrp,
-       struct mbuf *md,
-       struct secasvar *sav)
-{
-       if (m->m_len < sizeof(struct ip6_hdr)) {
-               ipseclog((LOG_DEBUG, "ipcomp6_output: first mbuf too short\n"));
-               IPSEC_STAT_INCREMENT(ipsec6stat.out_inval);
-               m_freem(m);
-               return 0;
-       }
-       return ipcomp_output(m, nexthdrp, md, AF_INET6, sav);
-}
-#endif /*INET6*/
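
ipcomp_output() pays for compression twice (the md0 and mcopy copies) precisely so it can refuse to use the result: payloads under the algorithm's minplen are never compressed, and an output that grew is thrown away with the original spliced back in. The decision reduces to a comparison; a hedged sketch (note the deleted code compares payload sizes only and does not charge the four-byte IPComp header against the saving):

#include <stddef.h>

/* Keep-or-back-out decision from ipcomp_output().  Returns 1 when the
 * compressed payload should replace the original. */
static int
ipcomp_worthwhile(size_t plen_orig, size_t plen_comp, size_t minplen)
{
	if (plen_orig < minplen)
		return 0;       /* too short; RFC 2393 discourages compressing small packets */
	return plen_comp <= plen_orig;  /* back out only if the output grew */
}
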
index 32683adc31c1bd52c8a2b2a1d8c5b722ef7a864c..671a6a64f7f459ca3947ce1b4d3d3a617fd1d8d9 100644 (file)
@@ -76,6 +76,7 @@
 #include <sys/kernel.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
+#include <sys/priv.h>
 #include <kern/locks.h>
 #include <sys/kauth.h>
 #include <libkern/OSAtomic.h>
 #include <netinet6/esp6.h>
 #endif
 #endif
-#include <netinet6/ipcomp.h>
-#if INET6
-#include <netinet6/ipcomp6.h>
-#endif
 #include <netkey/key.h>
 #include <netkey/keydb.h>
 #include <netkey/key_debug.h>
 
 #include <net/net_osdep.h>
 
+#include <IOKit/pwr_mgt/IOPM.h>
+
 #if IPSEC_DEBUG
 int ipsec_debug = 1;
 #else
@@ -164,6 +163,9 @@ extern u_int64_t natt_now;
 
 struct ipsec_tag;
 
+void *sleep_wake_handle = NULL;
+bool ipsec_save_wake_pkt = false;
+
 SYSCTL_DECL(_net_inet_ipsec);
 #if INET6
 SYSCTL_DECL(_net_inet6_ipsec6);
@@ -238,6 +240,10 @@ SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ESP_RANDPAD,
     esp_randpad, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_esp_randpad, 0, "");
 #endif /* INET6 */
 
+SYSCTL_DECL(_net_link_generic_system);
+
+struct ipsec_wake_pkt_info ipsec_wake_pkt;
+
 static int ipsec_setspidx_interface(struct secpolicyindex *, u_int, struct mbuf *,
     int, int, int);
 static int ipsec_setspidx_mbuf(struct secpolicyindex *, u_int, u_int,
@@ -271,23 +277,27 @@ static void ipsec_optaux(struct mbuf *, struct ipsec_tag *);
 int ipsec_send_natt_keepalive(struct secasvar *sav);
 bool ipsec_fill_offload_frame(ifnet_t ifp, struct secasvar *sav, struct ifnet_keepalive_offload_frame *frame, size_t frame_data_offset);
 
+extern bool IOPMCopySleepWakeUUIDKey(char *, size_t);
+extern void *registerSleepWakeInterest(void *, void *, void *);
+
 static int
 sysctl_def_policy SYSCTL_HANDLER_ARGS
 {
-       int old_policy = ip4_def_policy.policy;
-       int error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+       int new_policy = ip4_def_policy.policy;
+       int error = sysctl_handle_int(oidp, &new_policy, 0, req);
 
 #pragma unused(arg1, arg2)
+       if (error == 0) {
+               if (new_policy != IPSEC_POLICY_NONE &&
+                   new_policy != IPSEC_POLICY_DISCARD) {
+                       return EINVAL;
+               }
+               ip4_def_policy.policy = new_policy;
 
-       if (ip4_def_policy.policy != IPSEC_POLICY_NONE &&
-           ip4_def_policy.policy != IPSEC_POLICY_DISCARD) {
-               ip4_def_policy.policy = old_policy;
-               return EINVAL;
-       }
-
-       /* Turn off the bypass if the default security policy changes */
-       if (ipsec_bypass != 0 && ip4_def_policy.policy != IPSEC_POLICY_NONE) {
-               ipsec_bypass = 0;
+               /* Turn off the bypass if the default security policy changes */
+               if (ipsec_bypass != 0 && ip4_def_policy.policy != IPSEC_POLICY_NONE) {
+                       ipsec_bypass = 0;
+               }
        }
 
        return error;
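
The rework above fixes an ordering problem: the old handler let sysctl_handle_int() store the proposed value straight into the live ip4_def_policy.policy through oid_arg1 and only patched it back afterwards, leaving a window where an invalid policy was globally visible; the new handler validates a local copy before committing. From user space the knob is exercised in the usual way; a hedged sketch (the MIB name net.inet.ipsec.def_policy is inferred from the IPSECCTL_DEF_POLICY declaration, which is not shown in this hunk):

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int policy = 0;
	size_t len = sizeof(policy);

	if (sysctlbyname("net.inet.ipsec.def_policy",
	    &policy, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return 1;
	}
	/* Writes are now accepted only for IPSEC_POLICY_NONE or
	 * IPSEC_POLICY_DISCARD; anything else fails with EINVAL. */
	printf("default IPsec policy: %d\n", policy);
	return 0;
}
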
@@ -627,7 +637,7 @@ ipsec4_getpolicybyinterface(struct mbuf *m,
                        /* Disabled policies go in the clear */
                        key_freesp(*sp, KEY_SADB_UNLOCKED);
                        *sp = NULL;
-                       *flags |= IP_NOIPSEC; /* Avoid later IPSec check */
+                       *flags |= IP_NOIPSEC; /* Avoid later IPsec check */
                } else {
                        /* If policy is enabled, redirect to ipsec interface */
                        ipoa->ipoa_boundif = (*sp)->ipsec_if->if_index;
@@ -939,7 +949,7 @@ ipsec6_getpolicybyinterface(struct mbuf *m,
                        /* Disabled policies go in the clear */
                        key_freesp(*sp, KEY_SADB_UNLOCKED);
                        *sp = NULL;
-                       *noipsec = 1; /* Avoid later IPSec check */
+                       *noipsec = 1; /* Avoid later IPsec check */
                } else {
                        /* If policy is enabled, redirect to ipsec interface */
                        ip6oap->ip6oa_boundif = (*sp)->ipsec_if->if_index;
@@ -1894,11 +1904,8 @@ ipsec_get_reqlevel(struct ipsecrequest *isr)
                        }
                        break;
                case IPPROTO_IPCOMP:
-                       /*
-                        * we don't really care, as IPcomp document says that
-                        * we shouldn't compress small packets
-                        */
-                       level = IPSEC_LEVEL_USE;
+                       ipseclog((LOG_ERR, "ipsec_get_reqlevel: "
+                           "still got IPCOMP - exiting\n"));
                        break;
                default:
                        panic("ipsec_get_reqlevel: "
@@ -2183,8 +2190,10 @@ ipsec_hdrsiz(struct secpolicy *sp)
                case IPPROTO_AH:
                        clen = ah_hdrsiz(isr);
                        break;
-               case IPPROTO_IPCOMP:
-                       clen = sizeof(struct ipcomp);
+               default:
+                       ipseclog((LOG_ERR, "ipsec_hdrsiz: "
+                           "unknown protocol %u\n",
+                           isr->saidx.proto));
                        break;
                }
 
@@ -2679,9 +2688,6 @@ ipsec6_update_routecache_and_output(
        case IPPROTO_AH:
                error = ah6_output(state->m, &ip6->ip6_nxt, state->m->m_next, sav);
                break;
-       case IPPROTO_IPCOMP:
-       /* XXX code should be here */
-       /*FALLTHROUGH*/
        default:
                ipseclog((LOG_ERR, "%s: unknown ipsec protocol %d\n", __FUNCTION__, sav->sah->saidx.proto));
                m_freem(state->m);
@@ -2875,7 +2881,7 @@ ipsec46_encapsulate(struct ipsec_output_state *state, struct secasvar *sav)
  * based on RFC 2401.
  */
 int
-ipsec_chkreplay(u_int32_t seq, struct secasvar *sav)
+ipsec_chkreplay(u_int32_t seq, struct secasvar *sav, u_int8_t replay_index)
 {
        const struct secreplay *replay;
        u_int32_t diff;
@@ -2890,7 +2896,7 @@ ipsec_chkreplay(u_int32_t seq, struct secasvar *sav)
        }
 
        lck_mtx_lock(sadb_mutex);
-       replay = sav->replay;
+       replay = sav->replay[replay_index];
 
        if (replay->wsize == 0) {
                lck_mtx_unlock(sadb_mutex);
@@ -2947,7 +2953,7 @@ ipsec_chkreplay(u_int32_t seq, struct secasvar *sav)
  *     1:      NG
  */
 int
-ipsec_updatereplay(u_int32_t seq, struct secasvar *sav)
+ipsec_updatereplay(u_int32_t seq, struct secasvar *sav, u_int8_t replay_index)
 {
        struct secreplay *replay;
        u_int32_t diff;
@@ -2961,7 +2967,7 @@ ipsec_updatereplay(u_int32_t seq, struct secasvar *sav)
        }
 
        lck_mtx_lock(sadb_mutex);
-       replay = sav->replay;
+       replay = sav->replay[replay_index];
 
        if (replay->wsize == 0) {
                goto ok;        /* no need to check replay. */
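
ipsec_chkreplay() and ipsec_updatereplay() grow a replay_index argument because sav->replay becomes an array of windows rather than a single pointer, evidently to support multiple sequence-number spaces per SA (only the extra index is visible in this diff). The algorithm itself is unchanged: the classic RFC 2401 Appendix C sliding bitmap. A compact single-window version with check and update fused, using a 64-bit window and invented names:

#include <stdint.h>

/* One anti-replay window: the highest sequence number seen plus a
 * bitmap covering the 64 sequence numbers at and below it. */
struct replay_win {
	uint32_t lastseq;
	uint64_t bitmap;        /* bit 0 == lastseq */
};

/* Returns 1 if seq is new and acceptable, 0 if replayed or too old. */
static int
replay_check_update(struct replay_win *w, uint32_t seq)
{
	if (seq == 0)
		return 0;                       /* first valid sequence is 1 */
	if (seq > w->lastseq) {                 /* window slides forward */
		uint32_t shift = seq - w->lastseq;
		w->bitmap = (shift < 64) ? ((w->bitmap << shift) | 1) : 1;
		w->lastseq = seq;
		return 1;
	}
	uint32_t diff = w->lastseq - seq;
	if (diff >= 64)
		return 0;                       /* left of the window */
	if (w->bitmap & (1ULL << diff))
		return 0;                       /* already seen */
	w->bitmap |= 1ULL << diff;
	return 1;
}

The kernel keeps the two halves separate on purpose: ipsec_chkreplay() runs before the packet is authenticated, and ipsec_updatereplay() only advances the window once authentication has succeeded.
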
@@ -3351,19 +3357,13 @@ ipsec4_output_internal(struct ipsec_output_state *state, struct secasvar *sav)
                        goto bad;
                }
                break;
-       case IPPROTO_IPCOMP:
-               if ((error = ipcomp4_output(state->m, sav)) != 0) {
-                       state->m = NULL;
-                       goto bad;
-               }
-               break;
        default:
                ipseclog((LOG_ERR,
                    "ipsec4_output: unknown ipsec protocol %d\n",
                    sav->sah->saidx.proto));
                m_freem(state->m);
                state->m = NULL;
-               error = EINVAL;
+               error = EPROTONOSUPPORT;
                goto bad;
        }
 
@@ -3607,15 +3607,12 @@ ipsec6_output_trans_internal(
        case IPPROTO_AH:
                error = ah6_output(state->m, nexthdrp, mprev->m_next, sav);
                break;
-       case IPPROTO_IPCOMP:
-               error = ipcomp6_output(state->m, nexthdrp, mprev->m_next, sav);
-               break;
        default:
                ipseclog((LOG_ERR, "ipsec6_output_trans: "
                    "unknown ipsec protocol %d\n", sav->sah->saidx.proto));
                m_freem(state->m);
                IPSEC_STAT_INCREMENT(ipsec6stat.out_inval);
-               error = EINVAL;
+               error = EPROTONOSUPPORT;
                break;
        }
        if (error) {
@@ -3907,20 +3904,13 @@ ipsec6_output_tunnel_internal(struct ipsec_output_state *state, struct secasvar
                                        goto bad;
                                }
                                break;
-                       case IPPROTO_IPCOMP:
-                               if ((error = ipcomp4_output(state->m, sav)) != 0) {
-                                       state->m = NULL;
-                                       ROUTE_RELEASE(&ro4_copy);
-                                       goto bad;
-                               }
-                               break;
                        default:
                                ipseclog((LOG_ERR,
                                    "ipsec4_output: unknown ipsec protocol %d\n",
                                    sav->sah->saidx.proto));
                                m_freem(state->m);
                                state->m = NULL;
-                               error = EINVAL;
+                               error = EPROTONOSUPPORT;
                                ROUTE_RELEASE(&ro4_copy);
                                goto bad;
                        }
@@ -4027,9 +4017,6 @@ ipsec6_output_tunnel_internal(struct ipsec_output_state *state, struct secasvar
        case IPPROTO_AH:
                error = ah6_output(state->m, &ip6->ip6_nxt, state->m->m_next, sav);
                break;
-       case IPPROTO_IPCOMP:
-       /* XXX code should be here */
-       /*FALLTHROUGH*/
        default:
                ipseclog((LOG_ERR, "ipsec6_output_tunnel: "
                    "unknown ipsec protocol %d\n", sav->sah->saidx.proto));
@@ -4892,7 +4879,7 @@ ipsec_send_natt_keepalive(
        LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED);
        lck_mtx_lock(sadb_mutex);
 
-       if ((esp_udp_encap_port & 0xFFFF) == 0 || sav->remote_ike_port == 0) {
+       if (((esp_udp_encap_port & 0xFFFF) == 0 && sav->natt_encapsulated_src_port == 0) || sav->remote_ike_port == 0) {
                lck_mtx_unlock(sadb_mutex);
                return FALSE;
        }
@@ -4953,6 +4940,10 @@ ipsec_send_natt_keepalive(
                        ip->ip_src = ((struct sockaddr_in*)&sav->sah->saidx.dst)->sin_addr;
                        ip->ip_dst = ((struct sockaddr_in*)&sav->sah->saidx.src)->sin_addr;
                }
+               if (sav->natt_encapsulated_src_port != 0) {
+                       uh->uh_sport = (u_short)sav->natt_encapsulated_src_port;
+               } else {
+                       uh->uh_sport = htons((u_short)esp_udp_encap_port);
+               }
-               uh->uh_sport = htons((u_short)esp_udp_encap_port);
                uh->uh_dport = htons(sav->remote_ike_port);
                uh->uh_ulen = htons(1 + sizeof(*uh));
@@ -5018,7 +5010,11 @@ ipsec_send_natt_keepalive(
                        ip6->ip6_dst.s6_addr16[1] = 0;
                }
 
-               uh->uh_sport = htons((u_short)esp_udp_encap_port);
+               if (sav->natt_encapsulated_src_port != 0) {
+                       uh->uh_sport = (u_short)sav->natt_encapsulated_src_port;
+               } else {
+                       uh->uh_sport = htons((u_short)esp_udp_encap_port);
+               }
                uh->uh_dport = htons(sav->remote_ike_port);
                uh->uh_ulen = htons(1 + sizeof(*uh));
                *(u_int8_t*)((char*)m_mtod(m) + sizeof(*ip6) + sizeof(*uh)) = 0xFF;
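
Both address families end by storing a single 0xFF byte behind the UDP header: that one-octet payload is the NAT-keepalive message of RFC 3948 (uh_ulen is set to 1 + sizeof(*uh) accordingly), and the new natt_encapsulated_src_port branch lets the keepalive originate from the negotiated encapsulation port instead of the global esp_udp_encap_port. A user-space equivalent, for illustration only (4500 is the conventional NAT-T port; names are invented):

#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Send an RFC 3948 NAT-keepalive: a UDP datagram whose whole payload is
 * the single octet 0xFF.  udp_sock is an ordinary AF_INET/SOCK_DGRAM
 * socket, typically bound to local port 4500. */
static ssize_t
send_natt_keepalive(int udp_sock, const char *peer_ip, uint16_t peer_port)
{
	struct sockaddr_in sin;
	const uint8_t payload = 0xFF;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(peer_port);        /* conventionally 4500 */
	if (inet_pton(AF_INET, peer_ip, &sin.sin_addr) != 1)
		return -1;
	return sendto(udp_sock, &payload, sizeof(payload), 0,
	    (const struct sockaddr *)&sin, sizeof(sin));
}
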
@@ -5073,7 +5069,7 @@ ipsec_fill_offload_frame(ifnet_t ifp,
            !(sav->flags & SADB_X_EXT_NATT_KEEPALIVE) ||
            !(sav->flags & SADB_X_EXT_NATT_KEEPALIVE_OFFLOAD) ||
            sav->flags & SADB_X_EXT_ESP_KEEPALIVE ||
-           (esp_udp_encap_port & 0xFFFF) == 0 ||
+           ((esp_udp_encap_port & 0xFFFF) == 0 && sav->natt_encapsulated_src_port == 0) ||
            sav->remote_ike_port == 0 ||
            (natt_keepalive_interval == 0 && sav->natt_interval == 0 && sav->natt_offload_interval == 0)) {
                /* SA is not eligible for keepalive offload on this interface */
@@ -5127,7 +5123,12 @@ ipsec_fill_offload_frame(ifnet_t ifp,
                ip->ip_dst = ((struct sockaddr_in*)&sav->sah->saidx.src)->sin_addr;
        }
        ip->ip_sum = in_cksum_hdr_opt(ip);
-       uh->uh_sport = htons((u_short)esp_udp_encap_port);
+       /* Fill out the UDP header */
+       if (sav->natt_encapsulated_src_port != 0) {
+               uh->uh_sport = (u_short)sav->natt_encapsulated_src_port;
+       } else {
+               uh->uh_sport = htons((u_short)esp_udp_encap_port);
+       }
        uh->uh_dport = htons(sav->remote_ike_port);
        uh->uh_ulen = htons(1 + sizeof(*uh));
        uh->uh_sum = 0;
@@ -5142,3 +5143,96 @@ ipsec_fill_offload_frame(ifnet_t ifp,
        }
        return TRUE;
 }
+
+static int
+sysctl_ipsec_wake_packet SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       if (req->newptr != USER_ADDR_NULL) {
+               ipseclog((LOG_ERR, "ipsec: invalid parameters"));
+               return EINVAL;
+       }
+
+       struct proc *p = current_proc();
+       if (p != NULL) {
+               uid_t uid = kauth_cred_getuid(proc_ucred(p));
+               if (uid != 0 && priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_IPSEC_WAKE_PACKET, 0) != 0) {
+                       ipseclog((LOG_ERR, "process does not hold necessary entitlement to get ipsec wake packet"));
+                       return EPERM;
+               }
+
+               int result = sysctl_io_opaque(req, &ipsec_wake_pkt, sizeof(ipsec_wake_pkt), NULL);
+               return result;
+       }
+
+       return EINVAL;
+}
+
+SYSCTL_PROC(_net_link_generic_system, OID_AUTO, ipsec_wake_pkt, CTLTYPE_STRUCT | CTLFLAG_RD |
+    CTLFLAG_LOCKED, 0, 0, &sysctl_ipsec_wake_packet, "S,ipsec wake packet", "");
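
The handler above gates reads on uid 0 or the PRIV_NET_PRIVILEGED_IPSEC_WAKE_PACKET privilege and then copies the whole ipsec_wake_pkt structure out as an opaque blob. A hedged user-space consumer sketch; the local struct must mirror the ipsec_wake_pkt_info definition added to the header further down (uuid_string_t is a 37-byte string buffer), and the program must run with sufficient privilege:

#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

#define IPSEC_MAX_WAKE_PKT_LEN 100

/* User-space mirror of the kernel's struct ipsec_wake_pkt_info. */
struct ipsec_wake_pkt_info {
	uint8_t  wake_pkt[IPSEC_MAX_WAKE_PKT_LEN];
	char     wake_uuid[37];         /* uuid_string_t */
	uint32_t wake_pkt_spi;
	uint32_t wake_pkt_seq;
	uint16_t wake_pkt_len;
};

int
main(void)
{
	struct ipsec_wake_pkt_info info;
	size_t len = sizeof(info);

	if (sysctlbyname("net.link.generic.system.ipsec_wake_pkt",
	    &info, &len, NULL, 0) == -1) {
		perror("sysctlbyname");         /* EPERM without privilege */
		return 1;
	}
	printf("wake uuid %s spi 0x%08x seq %u len %u\n",
	    info.wake_uuid, info.wake_pkt_spi, info.wake_pkt_seq,
	    info.wake_pkt_len);
	return 0;
}
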
+
+void
+ipsec_save_wake_packet(struct mbuf *wake_mbuf, u_int32_t spi, u_int32_t seq)
+{
+       if (wake_mbuf == NULL) {
+               ipseclog((LOG_ERR, "ipsec: bad wake packet"));
+               return;
+       }
+
+       lck_mtx_lock(sadb_mutex);
+       if (__probable(!ipsec_save_wake_pkt)) {
+               goto done;
+       }
+
+       u_int16_t max_len = (wake_mbuf->m_pkthdr.len > IPSEC_MAX_WAKE_PKT_LEN) ? IPSEC_MAX_WAKE_PKT_LEN : wake_mbuf->m_pkthdr.len;
+       m_copydata(wake_mbuf, 0, max_len, (void *)ipsec_wake_pkt.wake_pkt);
+       ipsec_wake_pkt.wake_pkt_len = max_len;
+
+       ipsec_wake_pkt.wake_pkt_spi = spi;
+       ipsec_wake_pkt.wake_pkt_seq = seq;
+
+       ipsec_save_wake_pkt = false;
+done:
+       lck_mtx_unlock(sadb_mutex);
+       return;
+}
+
+static IOReturn
+ipsec_sleep_wake_handler(void *target, void *refCon, UInt32 messageType,
+    void *provider, void *messageArgument, vm_size_t argSize)
+{
+#pragma unused(target, refCon, provider, messageArgument, argSize)
+       switch (messageType) {
+       case kIOMessageSystemWillSleep:
+               memset(&ipsec_wake_pkt, 0, sizeof(ipsec_wake_pkt));
+               IOPMCopySleepWakeUUIDKey(ipsec_wake_pkt.wake_uuid,
+                   sizeof(ipsec_wake_pkt.wake_uuid));
+               ipseclog((LOG_INFO,
+                   "ipsec: system will sleep"));
+               break;
+       case kIOMessageSystemHasPoweredOn:
+               ipsec_save_wake_pkt = true;
+               ipseclog((LOG_INFO,
+                   "ipsec: system has powered on"));
+               break;
+       default:
+               break;
+       }
+
+       return IOPMAckImplied;
+}
+
+void
+ipsec_monitor_sleep_wake(void)
+{
+       LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_OWNED);
+
+       if (sleep_wake_handle == NULL) {
+               sleep_wake_handle = registerSleepWakeInterest(ipsec_sleep_wake_handler,
+                   NULL, NULL);
+               if (sleep_wake_handle != NULL) {
+                       ipseclog((LOG_INFO,
+                           "ipsec: monitoring sleep wake"));
+               }
+       }
+}
index 0cfe8a0b3c97ed8706de920ceded253287f9dd95..eb094bfce7a038056dc7745290aaa8ce2063d346 100644 (file)
 #include <sys/appleapiopts.h>
 
 #include <net/pfkeyv2.h>
+#include <uuid/uuid.h>
 #ifdef BSD_KERNEL_PRIVATE
 #include <netkey/keydb.h>
 #include <netinet/ip_var.h>
 
-/* lock for IPSec stats */
+/* lock for IPsec stats */
 extern lck_grp_t         *sadb_stat_mutex_grp;
 extern lck_grp_attr_t    *sadb_stat_mutex_grp_attr;
 extern lck_attr_t        *sadb_stat_mutex_attr;
@@ -66,7 +67,7 @@ struct secpolicyaddrrange {
  * specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code.
  */
 struct secpolicyindex {
-       u_int8_t dir;                   /* direction of packet flow, see blow */
+       u_int8_t dir;                   /* direction of packet flow, see below */
        struct sockaddr_storage src;    /* IP src address for SP */
        struct sockaddr_storage dst;    /* IP dst address for SP */
        u_int8_t prefs;                 /* prefix length in bits for src */
@@ -99,7 +100,7 @@ struct secpolicy {
        /* pointer to the ipsec request tree, */
        /* if policy == IPSEC else this value == NULL.*/
 
-       ifnet_t ipsec_if; /* IPSec interface to use */
+       ifnet_t ipsec_if; /* IPsec interface to use */
        ifnet_t outgoing_if; /* Outgoing interface for encrypted traffic */
 
        char disabled; /* Set to ignore policy */
@@ -232,6 +233,15 @@ struct ipsecstat {
        u_quad_t out_comphist[256] __attribute__ ((aligned(8)));
 };
 
+#define IPSEC_MAX_WAKE_PKT_LEN  100
+struct ipsec_wake_pkt_info {
+       u_int8_t wake_pkt[IPSEC_MAX_WAKE_PKT_LEN];
+       uuid_string_t wake_uuid;
+       u_int32_t wake_pkt_spi;
+       u_int32_t wake_pkt_seq;
+       u_int16_t wake_pkt_len;
+};
+
 #ifdef BSD_KERNEL_PRIVATE
 /*
  * Definitions for IPsec & Key sysctl operations.
@@ -325,6 +335,8 @@ extern int ip4_ipsec_dfbit;
 extern int ip4_ipsec_ecn;
 extern int ip4_esp_randpad;
 
+extern bool ipsec_save_wake_pkt;
+
 #define ipseclog(x)     do { if (ipsec_debug) log x; } while (0)
 
 extern struct secpolicy *ipsec4_getpolicybysock(struct mbuf *, u_int,
@@ -349,8 +361,8 @@ extern int ipsec4_in_reject(struct mbuf *, struct inpcb *);
 
 struct secas;
 struct tcpcb;
-extern int ipsec_chkreplay(u_int32_t, struct secasvar *);
-extern int ipsec_updatereplay(u_int32_t, struct secasvar *);
+extern int ipsec_chkreplay(u_int32_t, struct secasvar *, u_int8_t);
+extern int ipsec_updatereplay(u_int32_t, struct secasvar *, u_int8_t);
 
 extern size_t ipsec4_hdrsiz(struct mbuf *, u_int, struct inpcb *);
 extern size_t ipsec_hdrsiz_tcp(struct tcpcb *);
@@ -380,6 +392,8 @@ extern struct socket *ipsec_getsocket(struct mbuf *);
 extern int ipsec_addhist(struct mbuf *, int, u_int32_t);
 extern struct ipsec_history *ipsec_gethist(struct mbuf *, int *);
 extern void ipsec_clearhist(struct mbuf *);
+extern void ipsec_monitor_sleep_wake(void);
+extern void ipsec_save_wake_packet(struct mbuf *, u_int32_t, u_int32_t);
 #endif /* BSD_KERNEL_PRIVATE */
 
 #ifndef KERNEL
index 60b7777cded0d469975979fb8cca3b803ef4d57a..ba2daacdc8af07b7401cb6dc919167a3ccf63729 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -220,7 +220,7 @@ static void mld_sched_timeout(void);
 /*
  * Normative references: RFC 2710, RFC 3590, RFC 3810.
  */
-static struct timeval mld_gsrdelay = {10, 0};
+static struct timeval mld_gsrdelay = {.tv_sec = 10, .tv_usec = 0};
 static LIST_HEAD(, mld_ifinfo) mli_head;
 
 static int querier_present_timers_running6;
@@ -304,8 +304,8 @@ struct mld_raopt {
  * Router Alert hop-by-hop option header.
  */
 static struct mld_raopt mld_ra = {
-       .hbh = { 0, 0 },
-       .pad = { .ip6o_type = IP6OPT_PADN, 0 },
+       .hbh = { .ip6h_nxt = 0, .ip6h_len = 0 },
+       .pad = { .ip6o_type = IP6OPT_PADN, .ip6o_len = 0 },
        .ra = {
                .ip6or_type = (u_int8_t)IP6OPT_ROUTER_ALERT,
                .ip6or_len = (u_int8_t)(IP6OPT_RTALERT_LEN - 2),
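
This and the surrounding hunks (mld_gsrdelay, the mld_tparams locals, nd_prefix further down) mechanically convert positional aggregate initializers to C99 designated initializers, which keep their meaning if struct members are ever reordered and make the zero-valued fields explicit. The shape of the change, on an invented struct:

#include <stdint.h>

struct opt_hdr {
	uint8_t type;
	uint8_t len;
};

static struct opt_hdr pad_positional = { 1, 0 };                /* before */
static struct opt_hdr pad_designated = { .type = 1, .len = 0 }; /* after */
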
@@ -449,7 +449,7 @@ sysctl_mld_v2enable SYSCTL_HANDLER_ARGS
        int error;
        int i;
        struct mld_ifinfo *mli;
-       struct mld_tparams mtp = { 0, 0, 0, 0 };
+       struct mld_tparams mtp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 };
 
        MLD_LOCK();
 
@@ -860,7 +860,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
        struct in6_multi        *inm;
        int                      err = 0, is_general_query;
        uint16_t                 timer;
-       struct mld_tparams       mtp = { 0, 0, 0, 0 };
+       struct mld_tparams       mtp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 };
 
        MLD_LOCK_ASSERT_NOTHELD();
 
@@ -907,7 +907,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
                 * Embed scope ID of receiving interface in MLD query for
                 * lookup whilst we don't hold other locks.
                 */
-               in6_setscope(&mld->mld_addr, ifp, NULL);
+               (void)in6_setscope(&mld->mld_addr, ifp, NULL);
        }
 
        /*
@@ -1049,7 +1049,7 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
        int                      err = 0, is_general_query;
        uint16_t                 timer;
        uint8_t                  qrv;
-       struct mld_tparams       mtp = { 0, 0, 0, 0 };
+       struct mld_tparams       mtp = { .qpt = 0, .it = 0, .cst = 0, .sct = 0 };
 
        MLD_LOCK_ASSERT_NOTHELD();
 
@@ -1132,7 +1132,7 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
                 * lookup whilst we don't hold other locks (due to KAME
                 * locking lameness). We own this mbuf chain just now.
                 */
-               in6_setscope(&mld->mld_addr, ifp, NULL);
+               (void)in6_setscope(&mld->mld_addr, ifp, NULL);
        }
 
        mli = MLD_IFINFO(ifp);
@@ -1432,7 +1432,7 @@ mld_v1_input_report(struct ifnet *ifp, struct mbuf *m,
         * whilst we don't hold other locks (due to KAME locking lameness).
         */
        if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
-               in6_setscope(&mld->mld_addr, ifp, NULL);
+               (void)in6_setscope(&mld->mld_addr, ifp, NULL);
        }
 
        /*
@@ -3651,7 +3651,7 @@ mld_dispatch_packet(struct mbuf *m)
        m0->m_pkthdr.rcvif = lo_ifp;
 
        ip6 = mtod(m0, struct ip6_hdr *);
-       (void) in6_setscope(&ip6->ip6_dst, ifp, NULL);
+       (void)in6_setscope(&ip6->ip6_dst, ifp, NULL);
 
        /*
         * Retrieve the ICMPv6 type before handoff to ip6_output(),
index 5b8a5d4777a6a3814fd25196dd1cb32aac18f5c1..0af74cc8da20f75e69f6446a4973a0af096d11e9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <netinet6/scope6_var.h>
 #include <netinet/icmp6.h>
 
+#include <os/log.h>
+
 #include "loop.h"
 
 #define ND6_SLOWTIMER_INTERVAL          (60 * 60)       /* 1 hour */
@@ -175,7 +177,7 @@ static lck_attr_t       *nd_if_lock_attr = NULL;
 
 /* Protected by nd6_mutex */
 struct nd_drhead nd_defrouter;
-struct nd_prhead nd_prefix = { 0 };
+struct nd_prhead nd_prefix = { .lh_first = 0 };
 
 /*
  * nd6_timeout() is scheduled on a demand basis.  nd6_timeout_run is used
@@ -536,9 +538,9 @@ nd6_ifattach(struct ifnet *ifp)
        lck_mtx_unlock(&ndi->lock);
        nd6_setmtu(ifp);
 
-       nd6log0((LOG_INFO, ": ",
-           "%s Reinit'd ND information for interface %s\n",
-           if_name(ifp)));
+       nd6log0(info,
+           "Reinit'd ND information for interface %s\n",
+           if_name(ifp));
        return;
 }
 
@@ -712,9 +714,9 @@ nd6_options(union nd_opts *ndopts)
                case ND_OPT_REDIRECTED_HEADER:
                case ND_OPT_NONCE:
                        if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
-                               nd6log((LOG_INFO,
+                               nd6log(error,
                                    "duplicated ND6 option found (type=%d)\n",
-                                   nd_opt->nd_opt_type));
+                                   nd_opt->nd_opt_type);
                                /* XXX bark? */
                        } else {
                                ndopts->nd_opt_array[nd_opt->nd_opt_type] =
@@ -738,16 +740,16 @@ nd6_options(union nd_opts *ndopts)
                         * Unknown options must be silently ignored,
                         * to accommodate future extension to the protocol.
                         */
-                       nd6log((LOG_DEBUG,
+                       nd6log(debug,
                            "nd6_options: unsupported option %d - "
-                           "option ignored\n", nd_opt->nd_opt_type));
+                           "option ignored\n", nd_opt->nd_opt_type);
                }
 
 skip1:
                i++;
                if (i > nd6_maxndopt) {
                        icmp6stat.icp6s_nd_toomanyopt++;
-                       nd6log((LOG_INFO, "too many loop in nd opt\n"));
+                       nd6log(info, "too many loops in nd opt\n");
                        break;
                }
 
@@ -792,9 +794,9 @@ nd6_service(void *arg)
         * to run this entire operation single threaded.
         */
        while (nd6_service_busy) {
-               nd6log2((LOG_DEBUG, "%s: %s is blocked by %d waiters\n",
+               nd6log2(debug, "%s: %s is blocked by %d waiters\n",
                    __func__, ap->draining ? "drainer" : "timer",
-                   nd6_service_waiters));
+                   nd6_service_waiters);
                nd6_service_waiters++;
                (void) msleep(nd6_service_wc, rnh_lock, (PZERO - 1),
                    __func__, NULL);
@@ -1201,10 +1203,10 @@ again:
                                 * learned on cellular interface. Ever.
                                 */
                                dr->expire += dr->rtlifetime;
-                               nd6log2((LOG_DEBUG,
+                               nd6log2(debug,
                                    "%s: Refreshing expired default router entry "
                                    "%s for interface %s\n", __func__,
-                                   ip6_sprintf(&dr->rtaddr), if_name(dr->ifp)));
+                                   ip6_sprintf(&dr->rtaddr), if_name(dr->ifp));
                        } else {
                                ap->killed++;
                                /*
@@ -1244,6 +1246,17 @@ again:
                defrtrlist_del(dr);
                NDDR_REMREF(dr);        /* remove list reference */
        }
+
+       /*
+        * Also check if default router selection needs to be triggered
+        * for default interface, to avoid an issue with co-existence of
+        * static un-scoped default route configuration and default router
+        * discovery/selection.
+        */
+       if (trigger_v6_defrtr_select) {
+               defrouter_select(NULL);
+               trigger_v6_defrtr_select = FALSE;
+       }
        lck_mtx_unlock(nd6_mutex);
 
        /*
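The comment in the hunk above explains the new trigger_v6_defrtr_select flag: default router selection is deferred to this point in the service loop, where nd6_mutex is held, then the one-shot flag is consumed. A hedged sketch of the deferred-trigger pattern; the names are stand-ins, and the real flag is presumably set elsewhere when a static un-scoped default route appears:

    #include <stdbool.h>

    static bool trigger_select;              /* one-shot flag set by the event source */

    static void defrouter_select_stub(void) { /* expensive selection step */ }

    static void service_pass(void)
    {
        /* periodic work runs here under the subsystem lock */
        if (trigger_select) {
            defrouter_select_stub();         /* run at a known-safe point */
            trigger_select = false;          /* consume the trigger */
        }
    }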
@@ -1460,7 +1473,7 @@ void
 nd6_drain(void *arg)
 {
 #pragma unused(arg)
-       nd6log2((LOG_DEBUG, "%s: draining ND6 entries\n", __func__));
+       nd6log2(debug, "%s: draining ND6 entries\n", __func__);
 
        lck_mtx_lock(rnh_lock);
        nd6_need_draining = 1;
@@ -1487,9 +1500,9 @@ nd6_timeout(void *arg)
                sarg.draining = 1;
        }
        nd6_service(&sarg);
-       nd6log2((LOG_DEBUG, "%s: found %u, aging_lazy %u, aging %u, "
+       nd6log2(debug, "%s: found %u, aging_lazy %u, aging %u, "
            "sticky %u, killed %u\n", __func__, sarg.found, sarg.aging_lazy,
-           sarg.aging, sarg.sticky, sarg.killed));
+           sarg.aging, sarg.sticky, sarg.killed);
        /* re-arm the timer if there's work to do */
        nd6_timeout_run--;
        VERIFY(nd6_timeout_run >= 0 && nd6_timeout_run < 2);
@@ -1515,7 +1528,7 @@ nd6_timeout(void *arg)
                }
                nd6_sched_timeout(&atv, leeway);
        } else if (nd6_debug) {
-               nd6log2((LOG_DEBUG, "%s: not rescheduling timer\n", __func__));
+               nd6log2(debug, "%s: not rescheduling timer\n", __func__);
        }
        lck_mtx_unlock(rnh_lock);
 }
@@ -1535,18 +1548,18 @@ nd6_sched_timeout(struct timeval *atv, struct timeval *ltv)
        /* see comments on top of this file */
        if (nd6_timeout_run == 0) {
                if (ltv == NULL) {
-                       nd6log2((LOG_DEBUG, "%s: timer scheduled in "
+                       nd6log2(debug, "%s: timer scheduled in "
                            "T+%llus.%lluu (demand %d)\n", __func__,
                            (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec,
-                           nd6_sched_timeout_want));
+                           nd6_sched_timeout_want);
                        nd6_fast_timer_on = TRUE;
                        timeout(nd6_timeout, &nd6_fast_timer_on, tvtohz(atv));
                } else {
-                       nd6log2((LOG_DEBUG, "%s: timer scheduled in "
+                       nd6log2(debug, "%s: timer scheduled in "
                            "T+%llus.%lluu with %llus.%lluu leeway "
                            "(demand %d)\n", __func__, (uint64_t)atv->tv_sec,
                            (uint64_t)atv->tv_usec, (uint64_t)ltv->tv_sec,
-                           (uint64_t)ltv->tv_usec, nd6_sched_timeout_want));
+                           (uint64_t)ltv->tv_usec, nd6_sched_timeout_want);
                        nd6_fast_timer_on = FALSE;
                        timeout_with_leeway(nd6_timeout, NULL,
                            tvtohz(atv), tvtohz(ltv));
@@ -1555,27 +1568,27 @@ nd6_sched_timeout(struct timeval *atv, struct timeval *ltv)
                nd6_sched_timeout_want = 0;
        } else if (nd6_timeout_run == 1 && ltv == NULL &&
            nd6_fast_timer_on == FALSE) {
-               nd6log2((LOG_DEBUG, "%s: fast timer scheduled in "
+               nd6log2(debug, "%s: fast timer scheduled in "
                    "T+%llus.%lluu (demand %d)\n", __func__,
                    (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec,
-                   nd6_sched_timeout_want));
+                   nd6_sched_timeout_want);
                nd6_fast_timer_on = TRUE;
                nd6_sched_timeout_want = 0;
                nd6_timeout_run++;
                timeout(nd6_timeout, &nd6_fast_timer_on, tvtohz(atv));
        } else {
                if (ltv == NULL) {
-                       nd6log2((LOG_DEBUG, "%s: not scheduling timer: "
+                       nd6log2(debug, "%s: not scheduling timer: "
                            "timers %d, fast_timer %d, T+%llus.%lluu\n",
                            __func__, nd6_timeout_run, nd6_fast_timer_on,
-                           (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec));
+                           (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec);
                } else {
-                       nd6log2((LOG_DEBUG, "%s: not scheduling timer: "
+                       nd6log2(debug, "%s: not scheduling timer: "
                            "timers %d, fast_timer %d, T+%llus.%lluu "
                            "with %llus.%lluu leeway\n", __func__,
                            nd6_timeout_run, nd6_fast_timer_on,
                            (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec,
-                           (uint64_t)ltv->tv_sec, (uint64_t)ltv->tv_usec));
+                           (uint64_t)ltv->tv_sec, (uint64_t)ltv->tv_usec);
                }
        }
 }
@@ -2162,9 +2175,9 @@ nd6_is_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp,
 void
 nd6_free(struct rtentry *rt)
 {
-       struct llinfo_nd6 *ln;
-       struct in6_addr in6;
-       struct nd_defrouter *dr;
+       struct llinfo_nd6 *ln = NULL;
+       struct in6_addr in6 = {};
+       struct nd_defrouter *dr = NULL;
 
        LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
        RT_LOCK_ASSERT_NOTHELD(rt);
@@ -2571,9 +2584,9 @@ nd6_rtrequest(int req, struct rtentry *rt, struct sockaddr *sa)
                                error = in6_mc_join(ifp, &llsol,
                                    NULL, &in6m, 0);
                                if (error) {
-                                       nd6log((LOG_ERR, "%s: failed to join "
+                                       nd6log(error, "%s: failed to join "
                                            "%s (errno=%d)\n", if_name(ifp),
-                                           ip6_sprintf(&llsol), error));
+                                           ip6_sprintf(&llsol), error);
                                } else {
                                        IN6M_REMREF(in6m);
                                }
@@ -3168,7 +3181,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
 
                if (cmd == SIOCGDEFIFACE_IN6_64) {
                        u_int64_t j = nd6_defifindex;
-                       bcopy(&j, &ndif_64->ifindex, sizeof(j));
+                       __nochk_bcopy(&j, &ndif_64->ifindex, sizeof(j));
                } else {
                        bcopy(&nd6_defifindex, &ndif_32->ifindex,
                            sizeof(u_int32_t));
@@ -3186,7 +3199,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
 
                if (cmd == SIOCSDEFIFACE_IN6_64) {
                        u_int64_t j;
-                       bcopy(&ndif_64->ifindex, &j, sizeof(j));
+                       __nochk_bcopy(&ndif_64->ifindex, &j, sizeof(j));
                        idx = (u_int32_t)j;
                } else {
                        bcopy(&ndif_32->ifindex, &idx, sizeof(idx));
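The two bcopy sites that stage a u_int64_t through the 64-bit ioctl structure now use __nochk_bcopy. A hedged reading: when bcopy maps to a compile-time object-size-checked builtin, copies whose destination the compiler cannot size correctly are rejected, and the _nochk variant opts out. A sketch of the two forms using standard compiler builtins; the macro names are illustrative, not XNU's definitions:

    #include <string.h>

    /* Checked form: enforces at build or run time that len does not exceed
     * what the compiler believes dst can hold. */
    #define checked_bcopy(src, dst, len) \
            __builtin___memmove_chk((dst), (src), (len), \
                __builtin_object_size((dst), 0))

    /* Unchecked form: plain memmove semantics, no object-size enforcement. */
    #define nochk_bcopy(src, dst, len)  memmove((dst), (src), (len))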
@@ -3287,9 +3300,6 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr,
                is_newentry = 0;
        }
 
-       if (rt == NULL) {
-               return;
-       }
        if ((rt->rt_flags & (RTF_GATEWAY | RTF_LLINFO)) != RTF_LLINFO) {
 fail:
                RT_UNLOCK(rt);
@@ -4132,6 +4142,7 @@ nd6_need_cache(struct ifnet *ifp)
 #endif
        case IFT_BRIDGE:
        case IFT_CELLULAR:
+       case IFT_6LOWPAN:
                return 1;
        default:
                return 0;
@@ -4329,7 +4340,7 @@ sysctl_nd6_lookup_ipv6 SYSCTL_HANDLER_ARGS
         */
        error = proc_suser(current_proc());
        if (error != 0) {
-               printf("%s: proc_suser() error %d\n",
+               nd6log0(error, "%s: proc_suser() error %d\n",
                    __func__, error);
                goto done;
        }
@@ -4342,23 +4353,31 @@ sysctl_nd6_lookup_ipv6 SYSCTL_HANDLER_ARGS
        if (req->oldlen != sizeof(struct nd6_lookup_ipv6_args) ||
            req->newlen != sizeof(struct nd6_lookup_ipv6_args)) {
                error = EINVAL;
-               printf("%s: bad req, error %d\n",
+               nd6log0(error, "%s: bad req, error %d\n",
                    __func__, error);
                goto done;
        }
        error = SYSCTL_IN(req, &nd6_lookup_ipv6_args,
            sizeof(struct nd6_lookup_ipv6_args));
        if (error != 0) {
-               printf("%s: SYSCTL_IN() error %d\n",
+               nd6log0(error, "%s: SYSCTL_IN() error %d\n",
                    __func__, error);
                goto done;
        }
+
+       if (nd6_lookup_ipv6_args.ll_dest_len > sizeof(nd6_lookup_ipv6_args.ll_dest_)) {
+               error = EINVAL;
+               nd6log0(error, "%s: bad ll_dest_len, error %d\n",
+                   __func__, error);
+               goto done;
+       }
+
        /* Make sure to terminate the string */
        nd6_lookup_ipv6_args.ifname[IFNAMSIZ - 1] = 0;
 
        error = ifnet_find_by_name(nd6_lookup_ipv6_args.ifname, &ifp);
        if (error != 0) {
-               printf("%s: ifnet_find_by_name() error %d\n",
+               nd6log0(error, "%s: ifnet_find_by_name() error %d\n",
                    __func__, error);
                goto done;
        }
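The added check above rejects a user-supplied ll_dest_len larger than the fixed ll_dest_ buffer before the lookup uses it; previously the length copied in by SYSCTL_IN was trusted as-is. A compilable sketch of the validation with a simplified stand-in struct:

    #include <errno.h>
    #include <stdint.h>

    struct lookup_args_stub {
        uint32_t ll_dest_len;        /* caller-supplied length */
        char     ll_dest_[56];       /* fixed-size link-layer address storage */
    };

    static int validate_args(const struct lookup_args_stub *a)
    {
        /* Bound the untrusted length by the real buffer size before any
         * code copies or parses ll_dest_len bytes out of ll_dest_. */
        if (a->ll_dest_len > sizeof(a->ll_dest_)) {
            return EINVAL;
        }
        return 0;
    }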
@@ -4367,7 +4386,7 @@ sysctl_nd6_lookup_ipv6 SYSCTL_HANDLER_ARGS
            &nd6_lookup_ipv6_args.ll_dest_._sdl,
            nd6_lookup_ipv6_args.ll_dest_len, NULL, NULL);
        if (error != 0) {
-               printf("%s: nd6_lookup_ipv6() error %d\n",
+               nd6log0(error, "%s: nd6_lookup_ipv6() error %d\n",
                    __func__, error);
                goto done;
        }
@@ -4375,7 +4394,7 @@ sysctl_nd6_lookup_ipv6 SYSCTL_HANDLER_ARGS
        error = SYSCTL_OUT(req, &nd6_lookup_ipv6_args,
            sizeof(struct nd6_lookup_ipv6_args));
        if (error != 0) {
-               printf("%s: SYSCTL_OUT() error %d\n",
+               nd6log0(error, "%s: SYSCTL_OUT() error %d\n",
                    __func__, error);
                goto done;
        }
@@ -4717,9 +4736,9 @@ in6_ifaddr_set_dadprogress(struct in6_ifaddr *ia)
        ia->ia6_flags &= ~(IN6_IFF_DUPLICATED | IN6_IFF_DADPROGRESS);
        ia->ia6_flags |= flags;
 
-       nd6log2((LOG_DEBUG, "%s - %s ifp %s ia6_flags 0x%x\n",
+       nd6log2(debug, "%s - %s ifp %s ia6_flags 0x%x\n",
            __func__,
            ip6_sprintf(&ia->ia_addr.sin6_addr),
            if_name(ia->ia_ifp),
-           ia->ia6_flags));
+           ia->ia6_flags);
 }
index d3bc920ee1a467e8434fcc39ce18b064654f848d..1ff88945f454a5e88f3862aa5d4e06c9d678ff5e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -118,22 +118,18 @@ struct  llinfo_nd6 {
 #ifdef BSD_KERNEL_PRIVATE
 
 #define ND6_CACHE_STATE_TRANSITION(ln, nstate) do {\
-       struct rtentry *ln_rt = (ln)->ln_rt; \
        if (nd6_debug >= 1) {\
-               nd6log((LOG_INFO,\
-                   "[%s:%d]: NDP cache entry changed from %s -> %s",\
+               struct rtentry *ln_rt = ln != NULL ? (ln)->ln_rt : NULL; \
+               nd6log(info,\
+                   "[%s:%d]: NDP cache entry changed from %s -> %s for address %s.\n",\
                    __func__,\
                    __LINE__,\
                    ndcache_state2str((ln)->ln_state),\
-                   ndcache_state2str(nstate)));\
-               if (ln_rt != NULL)\
-                       nd6log((LOG_INFO,\
-                           " for address: %s.\n",\
-                           ip6_sprintf(&SIN6(rt_key(ln_rt))->sin6_addr)));\
-               else\
-                       nd6log((LOG_INFO, "\n"));\
+                   ndcache_state2str(nstate),\
+                   ln_rt != NULL ? ip6_sprintf(&SIN6(rt_key(ln_rt))->sin6_addr) : "N/A");\
        }\
-       (ln)->ln_state = nstate;\
+       if (ln != NULL)\
+               (ln)->ln_state = nstate;\
 } while(0)
 
 #define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE)
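The old ND6_CACHE_STATE_TRANSITION emitted the transition across up to three log() calls, relying on the console concatenating them; with os_log each call produces a separate record, so the rewrite folds everything into one call and handles a NULL route with a ternary. A userspace sketch of the same consolidation, with printf standing in for os_log:

    #include <stdio.h>

    struct rt_stub { const char *key; };

    static void log_transition(const struct rt_stub *rt,
        const char *from, const char *to)
    {
        /* One call, one record: the optional address collapses to "N/A"
         * instead of being appended by a second log call. */
        printf("NDP cache entry changed from %s -> %s for address %s.\n",
            from, to, rt != NULL ? rt->key : "N/A");
    }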
@@ -444,7 +440,7 @@ struct  in6_ndifreq_32 {
 
 struct  in6_ndifreq_64 {
        char ifname[IFNAMSIZ];
-       u_long ifindex  __attribute__((aligned(8)));
+       u_int64_t ifindex  __attribute__((aligned(8)));
 };
 #endif /* BSD_KERNEL_PRIVATE */
 
@@ -758,9 +754,11 @@ extern int nd6_debug;
 extern int nd6_onlink_ns_rfc4861;
 extern int nd6_optimistic_dad;
 
-#define nd6log0(x)      do { log x; } while (0)
-#define nd6log(x)       do { if (nd6_debug >= 1) log x; } while (0)
-#define nd6log2(x)      do { if (nd6_debug >= 2) log x; } while (0)
+#include <os/log.h>
+
+#define nd6log0(type, ...)      do { os_log_##type(OS_LOG_DEFAULT, __VA_ARGS__); } while (0)
+#define nd6log(type, ...)       do { if (nd6_debug >= 1) os_log_##type(OS_LOG_DEFAULT, __VA_ARGS__); } while (0)
+#define nd6log2(type, ...)      do { if (nd6_debug >= 2) os_log_##type(OS_LOG_DEFAULT, __VA_ARGS__); } while (0)
 
 #define ND6_OPTIMISTIC_DAD_LINKLOCAL    (1 << 0)
 #define ND6_OPTIMISTIC_DAD_AUTOCONF     (1 << 1)
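nd6log's new first argument is pasted onto os_log_ to pick the level, so nd6log(error, ...) becomes os_log_error(OS_LOG_DEFAULT, ...); this is why every call site in this commit drops the old double parentheses and LOG_* constant. A self-contained userspace sketch of the token-pasting dispatch, with printf macros standing in for the os_log_* family:

    #include <stdio.h>

    static int nd6_debug = 1;

    #define stub_log_error(fmt, ...)  printf("[error] " fmt, ##__VA_ARGS__)
    #define stub_log_info(fmt, ...)   printf("[info]  " fmt, ##__VA_ARGS__)
    #define stub_log_debug(fmt, ...)  printf("[debug] " fmt, ##__VA_ARGS__)

    /* type is pasted onto the stub_log_ prefix, exactly as nd6log pastes
     * onto os_log_; the nd6_debug gate is preserved from the old macro. */
    #define nd6log(type, ...) \
            do { if (nd6_debug >= 1) stub_log_##type(__VA_ARGS__); } while (0)

    int main(void)
    {
        nd6log(info, "duplicated ND6 option found (type=%d)\n", 14);
        return 0;
    }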
@@ -867,9 +865,9 @@ extern void nd6_llreach_set_reachable(struct ifnet *, void *, unsigned int);
 extern void nd6_llreach_use(struct llinfo_nd6 *);
 extern void nd6_alt_node_addr_decompose(struct ifnet *, struct sockaddr *,
     struct sockaddr_dl *, struct sockaddr_in6 *);
-extern void nd6_alt_node_present(struct ifnet *, struct sockaddr_in6 *,
+extern int nd6_alt_node_present(struct ifnet *, struct sockaddr_in6 *,
     struct sockaddr_dl *, int32_t, int, int);
-extern void nd6_alt_node_absent(struct ifnet *, struct sockaddr_in6 *);
+extern void nd6_alt_node_absent(struct ifnet *, struct sockaddr_in6 *, struct sockaddr_dl *);
 
 /* nd6_rtr.c */
 extern struct in6_ifaddr *in6_pfx_newpersistaddr(struct nd_prefix *, int,
index 9adb4feae3a6ebc3e711838a534b2a7b196f2255..ae3e33e539e4e65aed41e4ef5b30d8593c98ed74 100644 (file)
@@ -230,9 +230,9 @@ nd6_llreach_alloc(struct rtentry *rt, struct ifnet *ifp, void *addr,
                if (nd6_debug && lr != NULL && why != NULL) {
                        char tmp[MAX_IPv6_STR_LEN];
 
-                       nd6log((LOG_DEBUG, "%s: %s%s for %s\n", if_name(ifp),
+                       nd6log(debug, "%s: %s%s for %s\n", if_name(ifp),
                            type, why, inet_ntop(AF_INET6,
-                           &SIN6(rt_key(rt))->sin6_addr, tmp, sizeof(tmp))));
+                           &SIN6(rt_key(rt))->sin6_addr, tmp, sizeof(tmp)));
                }
        }
 }
@@ -289,10 +289,10 @@ nd6_ns_input(
        }
 
        if (ip6->ip6_hlim != IPV6_MAXHLIM) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n",
                    ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src),
-                   ip6_sprintf(&ip6->ip6_dst), if_name(ifp)));
+                   ip6_sprintf(&ip6->ip6_dst), if_name(ifp));
                goto bad;
        }
 
@@ -306,8 +306,8 @@ nd6_ns_input(
                    daddr6.s6_addr8[12] == 0xff) {
                        ; /* good */
                } else {
-                       nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
-                           "(wrong ip6 dst)\n"));
+                       nd6log(info, "nd6_ns_input: bad DAD packet "
+                           "(wrong ip6 dst)\n");
                        goto bad;
                }
        } else if (!nd6_onlink_ns_rfc4861) {
@@ -324,22 +324,22 @@ nd6_ns_input(
                src_sa6.sin6_len = sizeof(src_sa6);
                src_sa6.sin6_addr = saddr6;
                if (!nd6_is_addr_neighbor(&src_sa6, ifp, 0)) {
-                       nd6log((LOG_INFO, "nd6_ns_input: "
-                           "NS packet from non-neighbor\n"));
+                       nd6log(info, "nd6_ns_input: "
+                           "NS packet from non-neighbor\n");
                        goto bad;
                }
        }
 
        if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
-               nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n"));
+               nd6log(info, "nd6_ns_input: bad NS target (multicast)\n");
                goto bad;
        }
 
        icmp6len -= sizeof(*nd_ns);
        nd6_option_init(nd_ns + 1, icmp6len, &ndopts);
        if (nd6_options(&ndopts) < 0) {
-               nd6log((LOG_INFO,
-                   "nd6_ns_input: invalid ND option, ignored\n"));
+               nd6log(info,
+                   "nd6_ns_input: invalid ND option, ignored\n");
                /* nd6_options have incremented stats */
                goto freeit;
        }
@@ -350,8 +350,8 @@ nd6_ns_input(
        }
 
        if (is_dad_probe && lladdr) {
-               nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
-                   "(link-layer address option)\n"));
+               nd6log(info, "nd6_ns_input: bad DAD packet "
+                   "(link-layer address option)\n");
                goto bad;
        }
 
@@ -446,17 +446,17 @@ nd6_ns_input(
        IFA_UNLOCK(ifa);
 
        if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
-               nd6log((LOG_INFO,
+               nd6log(info,
                    "nd6_ns_input: lladdrlen mismatch for %s "
                    "(if %d, NS packet %d)\n",
-                   ip6_sprintf(&taddr6), ifp->if_addrlen, lladdrlen - 2));
+                   ip6_sprintf(&taddr6), ifp->if_addrlen, lladdrlen - 2);
                goto bad;
        }
 
        if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) {
-               nd6log((LOG_INFO,
+               nd6log(info,
                    "nd6_ns_input: duplicate IP6 address %s\n",
-                   ip6_sprintf(&saddr6)));
+                   ip6_sprintf(&saddr6));
                goto freeit;
        }
 
@@ -539,9 +539,9 @@ freeit:
        return;
 
 bad:
-       nd6log((LOG_ERR, "nd6_ns_input: src=%s\n", ip6_sprintf(&saddr6)));
-       nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n", ip6_sprintf(&daddr6)));
-       nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n", ip6_sprintf(&taddr6)));
+       nd6log(error, "nd6_ns_input: src=%s\n", ip6_sprintf(&saddr6));
+       nd6log(error, "nd6_ns_input: dst=%s\n", ip6_sprintf(&daddr6));
+       nd6log(error, "nd6_ns_input: tgt=%s\n", ip6_sprintf(&taddr6));
        icmp6stat.icp6s_badns++;
        m_freem(m);
        if (ifa != NULL) {
@@ -722,11 +722,11 @@ nd6_ns_output(
                            NULL, &ro, NULL, &src_storage, ip6oa.ip6oa_boundif,
                            &error);
                        if (src == NULL) {
-                               nd6log((LOG_DEBUG,
+                               nd6log(debug,
                                    "nd6_ns_output: source can't be "
                                    "determined: dst=%s, error=%d\n",
                                    ip6_sprintf(&dst_sa.sin6_addr),
-                                   error));
+                                   error);
                                goto bad;
                        }
 
@@ -744,10 +744,10 @@ nd6_ns_output(
                         */
                        ia = in6ifa_ifpwithaddr(ifp, src);
                        if (!ia || (ia->ia6_flags & IN6_IFF_OPTIMISTIC)) {
-                               nd6log((LOG_DEBUG,
+                               nd6log(debug,
                                    "nd6_ns_output: no preferred source "
                                    "available: dst=%s\n",
-                                   ip6_sprintf(&dst_sa.sin6_addr)));
+                                   ip6_sprintf(&dst_sa.sin6_addr));
                                goto bad;
                        }
                }
@@ -848,6 +848,7 @@ nd6_ns_output(
                (void) m_set_service_class(m, MBUF_SC_CTL);
        }
 
+       ip6oa.ip6oa_flags |= IP6OAF_SKIP_PF;
        ip6_output(m, NULL, NULL, flags, im6o, &outif, &ip6oa);
        if (outif) {
                icmp6_ifstat_inc(outif, ifs6_out_msg);
@@ -906,7 +907,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len)
        bool send_nc_alive_kev = false;
 
        if ((ifp->if_eflags & IFEF_IPV6_ND6ALT) != 0) {
-               nd6log((LOG_INFO, "nd6_na_input: on ND6ALT interface!\n"));
+               nd6log(info, "nd6_na_input: on ND6ALT interface!\n");
                goto freeit;
        }
 
@@ -914,10 +915,10 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len)
        MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
 
        if (ip6->ip6_hlim != IPV6_MAXHLIM) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n",
                    ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src),
-                   ip6_sprintf(&ip6->ip6_dst), if_name(ifp)));
+                   ip6_sprintf(&ip6->ip6_dst), if_name(ifp));
                goto bad;
        }
 
@@ -935,15 +936,15 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len)
                goto bad;       /* XXX: impossible */
        }
        if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "nd6_na_input: invalid target address %s\n",
-                   ip6_sprintf(&taddr6)));
+                   ip6_sprintf(&taddr6));
                goto bad;
        }
        if (IN6_IS_ADDR_MULTICAST(&daddr6)) {
                if (is_solicited) {
-                       nd6log((LOG_ERR,
-                           "nd6_na_input: a solicited adv is multicasted\n"));
+                       nd6log(error,
+                           "nd6_na_input: a solicited adv is multicasted\n");
                        goto bad;
                }
        }
@@ -951,8 +952,8 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len)
        icmp6len -= sizeof(*nd_na);
        nd6_option_init(nd_na + 1, icmp6len, &ndopts);
        if (nd6_options(&ndopts) < 0) {
-               nd6log((LOG_INFO,
-                   "nd6_na_input: invalid ND option, ignored\n"));
+               nd6log(info,
+                   "nd6_na_input: invalid ND option, ignored\n");
                /* nd6_options have incremented stats */
                goto freeit;
        }
@@ -962,11 +963,11 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len)
                lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
 
                if (((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
-                       nd6log((LOG_INFO,
+                       nd6log(info,
                            "nd6_na_input: lladdrlen mismatch for %s "
                            "(if %d, NA packet %d)\n",
                            ip6_sprintf(&taddr6), ifp->if_addrlen,
-                           lladdrlen - 2));
+                           lladdrlen - 2);
                        goto bad;
                }
        }
@@ -1465,9 +1466,9 @@ nd6_na_output(
        src = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, &src_storage,
            ip6oa.ip6oa_boundif, &error);
        if (src == NULL) {
-               nd6log((LOG_DEBUG, "nd6_na_output: source can't be "
+               nd6log(debug, "nd6_na_output: source can't be "
                    "determined: dst=%s, error=%d\n",
-                   ip6_sprintf(&dst_sa.sin6_addr), error));
+                   ip6_sprintf(&dst_sa.sin6_addr), error);
                goto bad;
        }
        ip6->ip6_src = *src;
@@ -1545,6 +1546,7 @@ nd6_na_output(
                (void) m_set_service_class(m, MBUF_SC_CTL);
        }
 
+       ip6oa.ip6oa_flags |= IP6OAF_SKIP_PF;
        ip6_output(m, NULL, NULL, IPV6_OUTARGS, im6o, &outif, &ip6oa);
        if (outif) {
                icmp6_ifstat_inc(outif, ifs6_out_msg);
@@ -1587,6 +1589,7 @@ nd6_ifptomac(
 #endif
        case IFT_BRIDGE:
        case IFT_ISO88025:
+       case IFT_6LOWPAN:
                return (caddr_t)IF_LLADDR(ifp);
        default:
                return NULL;
@@ -1662,10 +1665,10 @@ nd6_dad_find(struct ifaddr *ifa, struct nd_opt_nonce *nonce)
                    nonce->nd_opt_nonce_len == (ND_OPT_NONCE_LEN + 2) / 8 &&
                    memcmp(&nonce->nd_opt_nonce[0], &dp->dad_nonce[0],
                    ND_OPT_NONCE_LEN) == 0) {
-                       nd6log((LOG_ERR, "%s: a looped back NS message is "
+                       nd6log(error, "%s: a looped back NS message is "
                            "detected during DAD for %s. Ignoring.\n",
                            if_name(ifa->ifa_ifp),
-                           ip6_sprintf(IFA_IN6(ifa))));
+                           ip6_sprintf(IFA_IN6(ifa)));
                        dp->dad_ns_lcount++;
                        ++ip6stat.ip6s_dad_loopcount;
                        DAD_UNLOCK(dp);
@@ -1698,11 +1701,11 @@ nd6_dad_start(
        struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
        struct dadq *dp;
 
-       nd6log2((LOG_DEBUG, "%s - %s ifp %s ia6_flags 0x%x\n",
+       nd6log2(debug, "%s - %s ifp %s ia6_flags 0x%x\n",
            __func__,
            ip6_sprintf(&ia->ia_addr.sin6_addr),
            if_name(ia->ia_ifp),
-           ia->ia6_flags));
+           ia->ia6_flags);
 
        /*
         * If we don't need DAD, don't do it.
@@ -1712,11 +1715,11 @@ nd6_dad_start(
         */
        IFA_LOCK(&ia->ia_ifa);
        if (!(ia->ia6_flags & IN6_IFF_DADPROGRESS)) {
-               nd6log0((LOG_DEBUG,
+               nd6log0(debug,
                    "nd6_dad_start: not a tentative or optimistic address "
                    "%s(%s)\n",
                    ip6_sprintf(&ia->ia_addr.sin6_addr),
-                   ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"));
+                   ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
                IFA_UNLOCK(&ia->ia_ifa);
                return;
        }
@@ -1741,10 +1744,9 @@ nd6_dad_start(
 
        dp = zalloc(dad_zone);
        if (dp == NULL) {
-               nd6log0((LOG_ERR, "nd6_dad_start: memory allocation failed for "
-                   "%s(%s)\n",
+               nd6log0(error, "nd6_dad_start: memory allocation failed for %s(%s)\n",
                    ip6_sprintf(&ia->ia_addr.sin6_addr),
-                   ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"));
+                   ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
                return;
        }
        bzero(dp, dad_size);
@@ -1753,11 +1755,11 @@ nd6_dad_start(
        /* Callee adds one reference for us */
        dp = nd6_dad_attach(dp, ifa);
 
-       nd6log0((LOG_DEBUG, "%s: starting %sDAD %sfor %s\n",
+       nd6log0(debug, "%s: starting %sDAD %sfor %s\n",
            if_name(ifa->ifa_ifp),
            (ia->ia6_flags & IN6_IFF_OPTIMISTIC) ? "optimistic " : "",
            (tick_delay == NULL) ? "immediately " : "",
-           ip6_sprintf(&ia->ia_addr.sin6_addr)));
+           ip6_sprintf(&ia->ia_addr.sin6_addr));
 
        /*
         * Send NS packet for DAD, ip6_dad_count times.
@@ -1880,8 +1882,8 @@ nd6_unsol_na_output(struct ifaddr *ifa)
                return;
        }
 
-       nd6log((LOG_INFO, "%s: sending unsolicited NA\n",
-           if_name(ifa->ifa_ifp)));
+       nd6log(info, "%s: sending unsolicited NA\n",
+           if_name(ifa->ifa_ifp));
 
        nd6_na_output(ifp, &saddr6, &taddr6, ND_NA_FLAG_OVERRIDE, 1, NULL);
 }
@@ -1896,35 +1898,35 @@ nd6_dad_timer(struct ifaddr *ifa)
 
        /* Sanity check */
        if (ia == NULL) {
-               nd6log0((LOG_ERR, "nd6_dad_timer: called with null parameter\n"));
+               nd6log0(error, "nd6_dad_timer: called with null parameter\n");
                goto done;
        }
 
-       nd6log2((LOG_DEBUG, "%s - %s ifp %s ia6_flags 0x%x\n",
+       nd6log2(debug, "%s - %s ifp %s ia6_flags 0x%x\n",
            __func__,
            ip6_sprintf(&ia->ia_addr.sin6_addr),
            if_name(ia->ia_ifp),
-           ia->ia6_flags));
+           ia->ia6_flags);
 
        dp = nd6_dad_find(ifa, NULL);
        if (dp == NULL) {
-               nd6log0((LOG_ERR, "nd6_dad_timer: DAD structure not found\n"));
+               nd6log0(error, "nd6_dad_timer: DAD structure not found\n");
                goto done;
        }
        IFA_LOCK(&ia->ia_ifa);
        if (ia->ia6_flags & IN6_IFF_DUPLICATED) {
-               nd6log0((LOG_ERR, "nd6_dad_timer: called with duplicated address "
+               nd6log0(error, "nd6_dad_timer: called with duplicated address "
                    "%s(%s)\n",
                    ip6_sprintf(&ia->ia_addr.sin6_addr),
-                   ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"));
+                   ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
                IFA_UNLOCK(&ia->ia_ifa);
                goto done;
        }
        if ((ia->ia6_flags & IN6_IFF_DADPROGRESS) == 0) {
-               nd6log0((LOG_ERR, "nd6_dad_timer: not a tentative or optimistic "
+               nd6log0(error, "nd6_dad_timer: not a tentative or optimistic "
                    "address %s(%s)\n",
                    ip6_sprintf(&ia->ia_addr.sin6_addr),
-                   ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"));
+                   ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
                IFA_UNLOCK(&ia->ia_ifa);
                goto done;
        }
@@ -1934,8 +1936,8 @@ nd6_dad_timer(struct ifaddr *ifa)
        DAD_LOCK(dp);
        if (dp->dad_ns_tcount > dad_maxtry) {
                DAD_UNLOCK(dp);
-               nd6log0((LOG_INFO, "%s: could not run DAD, driver problem?\n",
-                   if_name(ifa->ifa_ifp)));
+               nd6log0(info, "%s: could not run DAD, driver problem?\n",
+                   if_name(ifa->ifa_ifp));
 
                nd6_dad_detach(dp, ifa);
                goto done;
@@ -1962,10 +1964,10 @@ nd6_dad_timer(struct ifaddr *ifa)
                if (dp->dad_na_icount > 0 || dp->dad_ns_icount) {
                        /* We've seen NS or NA, means DAD has failed. */
                        DAD_UNLOCK(dp);
-                       nd6log0((LOG_INFO,
-                           "%s: duplicate IPv6 address %s [timer]\n",
+                       nd6log0(info,
+                           "%s: duplicate IPv6 address %s if:%s [timer]\n",
                            __func__, ip6_sprintf(&ia->ia_addr.sin6_addr),
-                           if_name(ia->ia_ifp)));
+                           if_name(ia->ia_ifp));
                        nd6_dad_duplicated(ifa);
                        /* (*dp) will be freed in nd6_dad_duplicated() */
                } else if (dad_enhanced != 0 &&
@@ -1986,12 +1988,10 @@ nd6_dad_timer(struct ifaddr *ifa)
                         * additional probes until the loopback condition
                         * becomes clear when a looped back probe is detected.
                         */
-                       nd6log0((LOG_INFO,
-                           "%s: a looped back NS message is "
-                           "detected during DAD for %s. "
-                           "Another DAD probe is being sent on interface.\n",
+                       nd6log0(info,
+                           "%s: a looped back NS message is detected during DAD for %s. Another DAD probe is being sent on interface %s.\n",
                            __func__, ip6_sprintf(&ia->ia_addr.sin6_addr),
-                           if_name(ia->ia_ifp)));
+                           if_name(ia->ia_ifp));
                        /*
                         * Send an NS immediately and increase dad_count by
                         * nd6_mmaxtries - 1.
@@ -2020,20 +2020,20 @@ nd6_dad_timer(struct ifaddr *ifa)
                                nd6_unsol_na_output(ifa);
                        }
 
-                       nd6log0((LOG_DEBUG,
-                           "%s: DAD complete for %s - no duplicates found%s\n",
+                       nd6log0(debug,
+                           "%s: DAD complete for %s - no duplicates found%s\n",
                            if_name(ifa->ifa_ifp),
                            ip6_sprintf(&ia->ia_addr.sin6_addr),
-                           txunsolna ? ", tx unsolicited NA with O=1" : "."));
+                           txunsolna ? ", tx unsolicited NA with O=1" : ".");
 
                        if (dp->dad_ns_lcount > 0) {
-                               nd6log0((LOG_DEBUG,
+                               nd6log0(debug,
                                    "%s: DAD completed while "
                                    "a looped back NS message is detected "
                                    "during DAD for %s on interface %s\n",
                                    __func__,
                                    ip6_sprintf(&ia->ia_addr.sin6_addr),
-                                   if_name(ia->ia_ifp)));
+                                   if_name(ia->ia_ifp));
                        }
 
                        in6_post_msg(ia->ia_ifp, KEV_INET6_NEW_USER_ADDR, ia,
@@ -2063,9 +2063,9 @@ nd6_dad_duplicated(struct ifaddr *ifa)
        }
        IFA_LOCK(&ia->ia_ifa);
        DAD_LOCK(dp);
-       nd6log((LOG_ERR, "%s: NS in/out/loopback=%d/%d, NA in=%d\n",
+       nd6log(error, "%s: NS in/out/loopback=%d/%d/%d, NA in=%d\n",
            __func__, dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_ns_lcount,
-           dp->dad_na_icount));
+           dp->dad_na_icount);
        candisable = FALSE;
 
        if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr) &&
@@ -2290,8 +2290,8 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr,
        }
 
        if (replicated) {
-               nd6log((LOG_INFO, "%s: ignoring duplicate NA on "
-                   "replicated interface %s\n", __func__, if_name(ifp)));
+               nd6log(info, "%s: ignoring duplicate NA on "
+                   "replicated interface %s\n", __func__, if_name(ifp));
                goto done;
        }
 
@@ -2301,9 +2301,9 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr,
 
        if (!(ia->ia6_flags & IN6_IFF_DADPROGRESS)) {
                IFA_UNLOCK(ifa);
-               nd6log((LOG_INFO, "%s: ignoring duplicate NA on "
+               nd6log(info, "%s: ignoring duplicate NA on "
                    "%s [DAD not in progress]\n", __func__,
-                   if_name(ifp)));
+                   if_name(ifp));
                goto done;
        }
 
@@ -2317,8 +2317,8 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr,
                if (ip6a && (ip6a->ip6a_flags & IP6A_HASEEN) != 0 &&
                    bcmp(ip6a->ip6a_ehsrc, lladdr, ETHER_ADDR_LEN) != 0) {
                        IFA_UNLOCK(ifa);
-                       nd6log((LOG_ERR, "%s: ignoring duplicate NA on %s "
-                           "[eh_src != tgtlladdr]\n", __func__, if_name(ifp)));
+                       nd6log(error, "%s: ignoring duplicate NA on %s "
+                           "[eh_src != tgtlladdr]\n", __func__, if_name(ifp));
                        goto done;
                }
        }
@@ -2327,8 +2327,8 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr,
 
        dp = nd6_dad_find(ifa, NULL);
        if (dp == NULL) {
-               nd6log((LOG_INFO, "%s: no DAD structure for %s on %s.\n",
-                   __func__, ip6_sprintf(taddr), if_name(ifp)));
+               nd6log(info, "%s: no DAD structure for %s on %s.\n",
+                   __func__, ip6_sprintf(taddr), if_name(ifp));
                goto done;
        }
 
@@ -2342,9 +2342,9 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr,
        DAD_REMREF(dp);
 
        /* remove the address. */
-       nd6log((LOG_INFO,
+       nd6log(info,
            "%s: duplicate IPv6 address %s [processing NA on %s]\n", __func__,
-           ip6_sprintf(taddr), if_name(ifp)));
+           ip6_sprintf(taddr), if_name(ifp));
 done:
        IFA_LOCK_ASSERT_NOTHELD(ifa);
        IFA_REMREF(ifa);
@@ -2423,11 +2423,11 @@ nd6_alt_node_addr_decompose(struct ifnet *ifp, struct sockaddr *sa,
        VERIFY(sdl && (void *)sa != (void *)sdl);
        VERIFY(sin6 && (void *)sa != (void *)sin6);
 
-       bzero(sin6, sizeof *sin6);
+       bzero(sin6, sizeof(*sin6));
        sin6->sin6_len = sizeof *sin6;
        sin6->sin6_family = AF_INET6;
 
-       bzero(sdl, sizeof *sdl);
+       bzero(sdl, sizeof(*sdl));
        sdl->sdl_len = sizeof *sdl;
        sdl->sdl_family = AF_LINK;
        sdl->sdl_type = ifp->if_type;
@@ -2463,7 +2463,7 @@ nd6_alt_node_addr_decompose(struct ifnet *ifp, struct sockaddr *sa,
                struct in6_addr *in6 = &sin6->sin6_addr;
                caddr_t lla = LLADDR(sdla);
 
-               VERIFY(sa->sa_len <= sizeof *sdl);
+               VERIFY(sa->sa_len <= sizeof(*sdl));
                bcopy(sa, sdl, sa->sa_len);
 
                sin6->sin6_scope_id = sdla->sdl_index;
@@ -2495,7 +2495,7 @@ nd6_alt_node_addr_decompose(struct ifnet *ifp, struct sockaddr *sa,
        }
 }
 
-void
+int
 nd6_alt_node_present(struct ifnet *ifp, struct sockaddr_in6 *sin6,
     struct sockaddr_dl *sdl, int32_t rssi, int lqm, int npm)
 {
@@ -2550,21 +2550,23 @@ nd6_alt_node_present(struct ifnet *ifp, struct sockaddr_in6 *sin6,
        if (rt == NULL) {
                log(LOG_ERR, "%s: failed to add/update host route to %s.\n",
                    __func__, ip6_sprintf(&sin6->sin6_addr));
+               return EHOSTUNREACH;
        } else {
-               nd6log((LOG_DEBUG, "%s: host route to %s [lr=0x%llx]\n",
+               nd6log(debug, "%s: host route to %s [lr=0x%llx]\n",
                    __func__, ip6_sprintf(&sin6->sin6_addr),
-                   (uint64_t)VM_KERNEL_ADDRPERM(lr)));
+                   (uint64_t)VM_KERNEL_ADDRPERM(lr));
+               return 0;
        }
 }
 
 void
-nd6_alt_node_absent(struct ifnet *ifp, struct sockaddr_in6 *sin6)
+nd6_alt_node_absent(struct ifnet *ifp, struct sockaddr_in6 *sin6, struct sockaddr_dl *sdl)
 {
        struct rtentry *rt;
        const uint16_t temp_embedded_id = sin6->sin6_addr.s6_addr16[1];
 
-       nd6log((LOG_DEBUG, "%s: host route to %s\n", __func__,
-           ip6_sprintf(&sin6->sin6_addr)));
+       nd6log(debug, "%s: host route to %s\n", __func__,
+           ip6_sprintf(&sin6->sin6_addr));
 
        if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) &&
            (temp_embedded_id == 0)) {
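With the void-to-int change above, a failure to add or update the host route now surfaces to the caller as EHOSTUNREACH instead of only being logged. A minimal sketch of the pattern, stubbed since the real function takes kernel types:

    #include <errno.h>
    #include <stdbool.h>

    /* Return 0 on success and an errno on failure, rather than logging the
     * failure and returning void, so callers can react to a missing route. */
    static int node_present_stub(bool route_installed)
    {
        return route_installed ? 0 : EHOSTUNREACH;
    }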
@@ -2588,6 +2590,17 @@ nd6_alt_node_absent(struct ifnet *ifp, struct sockaddr_in6 *sin6)
                if (!(rt->rt_flags & (RTF_CLONING | RTF_PRCLONING)) &&
                    (rt->rt_flags & (RTF_HOST | RTF_LLINFO | RTF_WASCLONED)) ==
                    (RTF_HOST | RTF_LLINFO | RTF_WASCLONED)) {
+                       /*
+                        * Copy the link layer information in SDL when present
+                        * as it later gets used to issue the kernel event for
+                        * node absence.
+                        */
+                       if (sdl != NULL && rt->rt_gateway != NULL &&
+                           rt->rt_gateway->sa_family == AF_LINK &&
+                           SDL(rt->rt_gateway)->sdl_len <= sizeof(*sdl)) {
+                               bcopy(rt->rt_gateway, sdl, SDL(rt->rt_gateway)->sdl_len);
+                       }
+
                        rt->rt_flags |= RTF_CONDEMNED;
                        RT_UNLOCK(rt);
 
index a3006a929d6acc6081c30ce1d87809c7b56234f4..c3cb5ecd4180c3af3661dd5e21ff90e240dbb6a8 100644 (file)
@@ -619,10 +619,10 @@ nd6_proxy_find_fwdroute(struct ifnet *ifp, struct route_in6 *ro6)
        if ((rt = ro6->ro_rt) != NULL) {
                RT_LOCK(rt);
                if (!(rt->rt_flags & RTF_PROXY) || rt->rt_ifp == ifp) {
-                       nd6log2((LOG_DEBUG, "%s: found incorrect prefix "
+                       nd6log2(debug, "%s: found incorrect prefix "
                            "proxy route for dst %s on %s\n", if_name(ifp),
                            ip6_sprintf(dst6),
-                           if_name(rt->rt_ifp)));
+                           if_name(rt->rt_ifp));
                        RT_UNLOCK(rt);
                        /* look it up below */
                } else {
@@ -701,9 +701,9 @@ nd6_proxy_find_fwdroute(struct ifnet *ifp, struct route_in6 *ro6)
                        rtfree_locked(rt);
                        rt = NULL;
                } else {
-                       nd6log2((LOG_DEBUG, "%s: found prefix proxy route "
+                       nd6log2(debug, "%s: found prefix proxy route "
                            "for dst %s\n", if_name(rt->rt_ifp),
-                           ip6_sprintf(dst6)));
+                           ip6_sprintf(dst6));
                        RT_UNLOCK(rt);
                        ro6->ro_rt = rt;        /* refcnt held by rtalloc1 */
                        lck_mtx_unlock(rnh_lock);
@@ -723,9 +723,9 @@ nd6_proxy_find_fwdroute(struct ifnet *ifp, struct route_in6 *ro6)
                        rtfree_locked(rt);
                        rt = NULL;
                } else {
-                       nd6log2((LOG_DEBUG, "%s: allocated prefix proxy "
+                       nd6log2(debug, "%s: allocated prefix proxy "
                            "route for dst %s\n", if_name(rt->rt_ifp),
-                           ip6_sprintf(dst6)));
+                           ip6_sprintf(dst6));
                        RT_UNLOCK(rt);
                        ro6->ro_rt = rt;        /* refcnt held by rtalloc1 */
                }
@@ -733,9 +733,9 @@ nd6_proxy_find_fwdroute(struct ifnet *ifp, struct route_in6 *ro6)
        VERIFY(rt != NULL || ro6->ro_rt == NULL);
 
        if (fwd_ifp == NULL || rt == NULL) {
-               nd6log2((LOG_ERR, "%s: failed to find forwarding prefix "
+               nd6log2(error, "%s: failed to find forwarding prefix "
                    "proxy entry for dst %s\n", if_name(ifp),
-                   ip6_sprintf(dst6)));
+                   ip6_sprintf(dst6));
        }
        lck_mtx_unlock(rnh_lock);
 }
@@ -929,12 +929,12 @@ nd6_prproxy_ns_output(struct ifnet *ifp, struct ifnet *exclifp,
        }
 
        if (exclifp == NULL) {
-               nd6log2((LOG_DEBUG, "%s: sending NS who has %s on ALL\n",
-                   if_name(ifp), ip6_sprintf(taddr)));
+               nd6log2(debug, "%s: sending NS who has %s on ALL\n",
+                   if_name(ifp), ip6_sprintf(taddr));
        } else {
-               nd6log2((LOG_DEBUG, "%s: sending NS who has %s on ALL "
+               nd6log2(debug, "%s: sending NS who has %s on ALL "
                    "(except %s)\n", if_name(ifp),
-                   ip6_sprintf(taddr), if_name(exclifp)));
+                   ip6_sprintf(taddr), if_name(exclifp));
        }
 
        SLIST_INIT(&ndprl_head);
@@ -1001,10 +1001,10 @@ nd6_prproxy_ns_output(struct ifnet *ifp, struct ifnet *exclifp,
                NDPR_LOCK(pr);
                if (pr->ndpr_stateflags & NDPRF_ONLINK) {
                        NDPR_UNLOCK(pr);
-                       nd6log2((LOG_DEBUG,
+                       nd6log2(debug,
                            "%s: Sending cloned NS who has %s, originally "
                            "on %s\n", if_name(fwd_ifp),
-                           ip6_sprintf(taddr), if_name(ifp)));
+                           ip6_sprintf(taddr), if_name(ifp));
 
                        nd6_ns_output(fwd_ifp, daddr, taddr, NULL, NULL);
                } else {
@@ -1133,12 +1133,12 @@ nd6_prproxy_ns_input(struct ifnet *ifp, struct in6_addr *saddr,
                NDPR_LOCK(pr);
                if (pr->ndpr_stateflags & NDPRF_ONLINK) {
                        NDPR_UNLOCK(pr);
-                       nd6log2((LOG_DEBUG,
+                       nd6log2(debug,
                            "%s: Forwarding NS (%s) from %s to %s who "
                            "has %s, originally on %s\n", if_name(fwd_ifp),
                            ndprl->ndprl_sol ? "NUD/AR" :
                            "DAD", ip6_sprintf(saddr), ip6_sprintf(daddr),
-                           ip6_sprintf(taddr), if_name(ifp)));
+                           ip6_sprintf(taddr), if_name(ifp));
 
                        nd6_ns_output(fwd_ifp, ndprl->ndprl_sol ? taddr : NULL,
                            taddr, NULL, nonce);
@@ -1278,20 +1278,20 @@ nd6_prproxy_na_input(struct ifnet *ifp, struct in6_addr *saddr,
 
                if (send_na) {
                        if (!ndprl->ndprl_sol) {
-                               nd6log2((LOG_DEBUG,
+                               nd6log2(debug,
                                    "%s: Forwarding NA (DAD) from %s to %s "
                                    "tgt is %s, originally on %s\n",
                                    if_name(fwd_ifp),
                                    ip6_sprintf(saddr), ip6_sprintf(&daddr),
-                                   ip6_sprintf(taddr), if_name(ifp)));
+                                   ip6_sprintf(taddr), if_name(ifp));
                        } else {
-                               nd6log2((LOG_DEBUG,
+                               nd6log2(debug,
                                    "%s: Forwarding NA (NUD/AR) from %s to "
                                    "%s (was %s) tgt is %s, originally on "
                                    "%s\n", if_name(fwd_ifp),
                                    ip6_sprintf(saddr),
                                    ip6_sprintf(&daddr), ip6_sprintf(daddr0),
-                                   ip6_sprintf(taddr), if_name(ifp)));
+                                   ip6_sprintf(taddr), if_name(ifp));
                        }
 
                        nd6_na_output(fwd_ifp, &daddr, taddr, flags, 1, NULL);
index 53259c5ea33eeb4c64b3bc01ea50cb82ce7c4e82..e0c8b48644f80e72db264c50e387acc4c008684c 100644 (file)
@@ -285,11 +285,11 @@ nd6_rs_input(
        }
 
        /* Sanity checks */
-       if (ip6->ip6_hlim != 255) {
-               nd6log((LOG_ERR,
+       if (ip6->ip6_hlim != IPV6_MAXHLIM) {
+               nd6log(error,
                    "nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n",
                    ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src),
-                   ip6_sprintf(&ip6->ip6_dst), if_name(ifp)));
+                   ip6_sprintf(&ip6->ip6_dst), if_name(ifp));
                goto bad;
        }
 
@@ -308,8 +308,8 @@ nd6_rs_input(
                src_sa6.sin6_len = sizeof(src_sa6);
                src_sa6.sin6_addr = ip6->ip6_src;
                if (!nd6_is_addr_neighbor(&src_sa6, ifp, 0)) {
-                       nd6log((LOG_INFO, "nd6_rs_input: "
-                           "RS packet from non-neighbor\n"));
+                       nd6log(info, "nd6_rs_input: "
+                           "RS packet from non-neighbor\n");
                        goto freeit;
                }
        }
@@ -319,8 +319,8 @@ nd6_rs_input(
        icmp6len -= sizeof(*nd_rs);
        nd6_option_init(nd_rs + 1, icmp6len, &ndopts);
        if (nd6_options(&ndopts) < 0) {
-               nd6log((LOG_INFO,
-                   "nd6_rs_input: invalid ND option, ignored\n"));
+               nd6log(info,
+                   "nd6_rs_input: invalid ND option, ignored\n");
                /* nd6_options have incremented stats */
                goto freeit;
        }
@@ -331,10 +331,10 @@ nd6_rs_input(
        }
 
        if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
-               nd6log((LOG_INFO,
+               nd6log(info,
                    "nd6_rs_input: lladdrlen mismatch for %s "
                    "(if %d, RS packet %d)\n",
-                   ip6_sprintf(&saddr6), ifp->if_addrlen, lladdrlen - 2));
+                   ip6_sprintf(&saddr6), ifp->if_addrlen, lladdrlen - 2);
                goto bad;
        }
 
@@ -409,18 +409,18 @@ nd6_ra_input(
                ia6 = NULL;
        }
 
-       if (ip6->ip6_hlim != 255) {
-               nd6log((LOG_ERR,
+       if (ip6->ip6_hlim != IPV6_MAXHLIM) {
+               nd6log(error,
                    "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n",
                    ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src),
-                   ip6_sprintf(&ip6->ip6_dst), if_name(ifp)));
+                   ip6_sprintf(&ip6->ip6_dst), if_name(ifp));
                goto bad;
        }
 
        if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "nd6_ra_input: src %s is not link-local\n",
-                   ip6_sprintf(&saddr6)));
+                   ip6_sprintf(&saddr6));
                goto bad;
        }
 
@@ -430,8 +430,8 @@ nd6_ra_input(
        icmp6len -= sizeof(*nd_ra);
        nd6_option_init(nd_ra + 1, icmp6len, &ndopts);
        if (nd6_options(&ndopts) < 0) {
-               nd6log((LOG_INFO,
-                   "nd6_ra_input: invalid ND option, ignored\n"));
+               nd6log(info,
+                   "nd6_ra_input: invalid ND option, ignored\n");
                /* nd6_options have incremented stats */
                goto freeit;
        }
@@ -469,12 +469,12 @@ nd6_ra_input(
                if (ndi->chlim < nd_ra->nd_ra_curhoplimit) {
                        ndi->chlim = nd_ra->nd_ra_curhoplimit;
                } else if (ndi->chlim != nd_ra->nd_ra_curhoplimit) {
-                       nd6log((LOG_ERR,
+                       nd6log(error,
                            "RA with a lower CurHopLimit sent from "
                            "%s on %s (current = %d, received = %d). "
                            "Ignored.\n", ip6_sprintf(&ip6->ip6_src),
                            if_name(ifp), ndi->chlim,
-                           nd_ra->nd_ra_curhoplimit));
+                           nd_ra->nd_ra_curhoplimit);
                }
        }
        lck_mtx_unlock(&ndi->lock);
@@ -503,18 +503,18 @@ nd6_ra_input(
                        pi = (struct nd_opt_prefix_info *)pt;
 
                        if (pi->nd_opt_pi_len != 4) {
-                               nd6log((LOG_INFO,
+                               nd6log(info,
                                    "nd6_ra_input: invalid option "
                                    "len %d for prefix information option, "
-                                   "ignored\n", pi->nd_opt_pi_len));
+                                   "ignored\n", pi->nd_opt_pi_len);
                                continue;
                        }
 
                        if (128 < pi->nd_opt_pi_prefix_len) {
-                               nd6log((LOG_INFO,
+                               nd6log(info,
                                    "nd6_ra_input: invalid prefix "
                                    "len %d for prefix information option, "
-                                   "ignored\n", pi->nd_opt_pi_prefix_len));
+                                   "ignored\n", pi->nd_opt_pi_prefix_len);
                                continue;
                        }
 
@@ -531,10 +531,10 @@ nd6_ra_input(
                        if (IN6_IS_ADDR_UNSPECIFIED(&pi->nd_opt_pi_prefix) ||
                            IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix) ||
                            IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) {
-                               nd6log((LOG_INFO,
+                               nd6log(info,
                                    "%s: invalid prefix %s, ignored\n",
                                    __func__,
-                                   ip6_sprintf(&pi->nd_opt_pi_prefix)));
+                                   ip6_sprintf(&pi->nd_opt_pi_prefix));
                                continue;
                        }
 
@@ -563,17 +563,17 @@ nd6_ra_input(
                         */
                        if (ip6_only_allow_rfc4193_prefix &&
                            !IN6_IS_ADDR_UNIQUE_LOCAL(&pi->nd_opt_pi_prefix)) {
-                               nd6log((LOG_INFO,
+                               nd6log(info,
                                    "nd6_ra_input: no SLAAC on prefix %s "
                                    "[not RFC 4193]\n",
-                                   ip6_sprintf(&pi->nd_opt_pi_prefix)));
+                                   ip6_sprintf(&pi->nd_opt_pi_prefix));
                                pr.ndpr_raf_auto = 0;
                        } else if (!nd6_accept_6to4 &&
                            IN6_IS_ADDR_6TO4(&pi->nd_opt_pi_prefix)) {
-                               nd6log((LOG_INFO,
+                               nd6log(info,
                                    "%s: no SLAAC on prefix %s "
                                    "[6to4]\n", __func__,
-                                   ip6_sprintf(&pi->nd_opt_pi_prefix)));
+                                   ip6_sprintf(&pi->nd_opt_pi_prefix));
                                pr.ndpr_raf_auto = 0;
                        }
 
@@ -632,9 +632,9 @@ nd6_ra_input(
 
                /* lower bound */
                if (mtu < IPV6_MMTU) {
-                       nd6log((LOG_INFO, "nd6_ra_input: bogus mtu option "
+                       nd6log(info, "nd6_ra_input: bogus mtu option "
                            "mtu=%d sent from %s, ignoring\n",
-                           mtu, ip6_sprintf(&ip6->ip6_src)));
+                           mtu, ip6_sprintf(&ip6->ip6_src));
                        goto skip;
                }
 
@@ -650,19 +650,19 @@ nd6_ra_input(
                                        in6_setmaxmtu();
                                }
                        } else {
-                               nd6log((LOG_INFO, "nd6_ra_input: bogus mtu "
+                               nd6log(info, "nd6_ra_input: bogus mtu "
                                    "mtu=%d sent from %s; "
                                    "exceeds maxmtu %d, ignoring\n",
                                    mtu, ip6_sprintf(&ip6->ip6_src),
-                                   ndi->maxmtu));
+                                   ndi->maxmtu);
                                lck_mtx_unlock(&ndi->lock);
                        }
                } else {
                        lck_mtx_unlock(&ndi->lock);
-                       nd6log((LOG_INFO, "nd6_ra_input: mtu option "
+                       nd6log(info, "nd6_ra_input: mtu option "
                            "mtu=%d sent from %s; maxmtu unknown, "
                            "ignoring\n",
-                           mtu, ip6_sprintf(&ip6->ip6_src)));
+                           mtu, ip6_sprintf(&ip6->ip6_src));
                }
        }
 
@@ -677,10 +677,10 @@ skip:
        }
 
        if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
-               nd6log((LOG_INFO,
+               nd6log(info,
                    "nd6_ra_input: lladdrlen mismatch for %s "
                    "(if %d, RA packet %d)\n",
-                   ip6_sprintf(&saddr6), ifp->if_addrlen, lladdrlen - 2));
+                   ip6_sprintf(&saddr6), ifp->if_addrlen, lladdrlen - 2);
                goto bad;
        }
 
@@ -773,16 +773,16 @@ defrouter_addreq(struct nd_defrouter *new, boolean_t scoped)
        }
 
        if (new->ifp->if_eflags & IFEF_IPV6_ROUTER) {
-               nd6log2((LOG_INFO, "%s: ignoring router %s, scoped=%d, "
+               nd6log2(info, "%s: ignoring router %s, scoped=%d, "
                    "static=%d on advertising interface\n", if_name(new->ifp),
                    ip6_sprintf(&new->rtaddr), scoped,
-                   (new->stateflags & NDDRF_STATIC) ? 1 : 0));
+                   (new->stateflags & NDDRF_STATIC) ? 1 : 0);
                goto out;
        }
 
-       nd6log2((LOG_INFO, "%s: adding default router %s, scoped=%d, "
+       nd6log2(info, "%s: adding default router %s, scoped=%d, "
            "static=%d\n", if_name(new->ifp), ip6_sprintf(&new->rtaddr),
-           scoped, (new->stateflags & NDDRF_STATIC) ? 1 : 0));
+           scoped, (new->stateflags & NDDRF_STATIC) ? 1 : 0);
 
        Bzero(&def, sizeof(def));
        Bzero(&mask, sizeof(mask));
@@ -826,9 +826,9 @@ defrouter_addreq(struct nd_defrouter *new, boolean_t scoped)
                        new->rtaddr_mapped = gate.sin6_addr;
                        new->stateflags |= NDDRF_MAPPED;
 
-                       nd6log((LOG_INFO, "%s: Default router %s mapped "
-                           "to ", if_name(new->ifp), ip6_sprintf(&new->rtaddr)));
-                       nd6log((LOG_INFO, "%s\n", ip6_sprintf(&new->rtaddr_mapped)));
+                       nd6log(info, "%s: Default router %s mapped "
+                           "to ", if_name(new->ifp), ip6_sprintf(&new->rtaddr));
+                       nd6log(info, "%s\n", ip6_sprintf(&new->rtaddr_mapped));
                }
        }
 
@@ -847,10 +847,10 @@ defrouter_addreq(struct nd_defrouter *new, boolean_t scoped)
                        new->stateflags |= NDDRF_IFSCOPE;
                }
        } else {
-               nd6log((LOG_ERR, "%s: failed to add default router "
+               nd6log(error, "%s: failed to add default router "
                    "%s on %s scoped %d (errno = %d)\n", __func__,
                    ip6_sprintf(&gate.sin6_addr), if_name(new->ifp),
-                   (ifscope != IFSCOPE_NONE), err));
+                   (ifscope != IFSCOPE_NONE), err);
                NDDR_LOCK(new);
        }
        new->err = err;
@@ -907,10 +907,10 @@ defrouter_delreq(struct nd_defrouter *dr)
                goto out;
        }
 
-       nd6log2((LOG_INFO, "%s: removing default router %s, scoped=%d, "
+       nd6log2(info, "%s: removing default router %s, scoped=%d, "
            "static=%d\n", dr->ifp != NULL ? if_name(dr->ifp) : "ANY",
            ip6_sprintf(&dr->rtaddr), (dr->stateflags & NDDRF_IFSCOPE) ? 1 : 0,
-           (dr->stateflags & NDDRF_STATIC) ? 1 : 0));
+           (dr->stateflags & NDDRF_STATIC) ? 1 : 0);
 
        Bzero(&def, sizeof(def));
        Bzero(&mask, sizeof(mask));
@@ -951,10 +951,10 @@ defrouter_delreq(struct nd_defrouter *dr)
                RT_UNLOCK(oldrt);
                rtfree(oldrt);
        } else if (err != ESRCH) {
-               nd6log((LOG_ERR, "%s: failed to delete default router "
+               nd6log(error, "%s: failed to delete default router "
                    "%s on %s scoped %d (errno = %d)\n", __func__,
                    ip6_sprintf(&gate.sin6_addr), dr->ifp != NULL ?
-                   if_name(dr->ifp) : "ANY", (ifscope != IFSCOPE_NONE), err));
+                   if_name(dr->ifp) : "ANY", (ifscope != IFSCOPE_NONE), err);
        }
        NDDR_LOCK(dr);
        /* ESRCH means it's no longer in the routing table; ignore it */
@@ -1140,8 +1140,8 @@ defrtrlist_del(struct nd_defrouter *dr)
 
        lck_mtx_lock(nd6_mutex);
        NDDR_REMREF(dr);
-       nd6log2((LOG_INFO, "%s: freeing defrouter %s\n", if_name(dr->ifp),
-           ip6_sprintf(&dr->rtaddr)));
+       nd6log2(info, "%s: freeing defrouter %s\n", if_name(dr->ifp),
+           ip6_sprintf(&dr->rtaddr));
        /*
         * Delete it from the routing table.
         */
@@ -1319,26 +1319,32 @@ defrouter_select(struct ifnet *ifp)
        LCK_MTX_ASSERT(nd6_mutex, LCK_MTX_ASSERT_OWNED);
 
        if (ifp == NULL) {
-               nd6log2((LOG_INFO,
-                   "%s:%d: Return early. NULL interface",
-                   __func__, __LINE__));
-               return;
+               ifp = nd6_defifp;
+               if (ifp == NULL) {
+                       nd6log2(info,
+                           "%s:%d: Return early. NULL interface",
+                           __func__, __LINE__);
+                       return;
+               }
+               nd6log2(info,
+                   "%s:%d: NULL interface passed. Setting to default interface %s.\n",
+                   __func__, __LINE__, if_name(ifp));
        }
 
        if (ifp == lo_ifp) {
-               nd6log2((LOG_INFO,
+               nd6log2(info,
                    "%s:%d: Return early. "
                    "Default router select called for loopback.\n",
-                   __func__, __LINE__));
+                   __func__, __LINE__);
                return;
        }
 
        if (ifp->if_eflags & IFEF_IPV6_ROUTER) {
-               nd6log2((LOG_INFO,
+               nd6log2(info,
                    "%s:%d: Return early. "
                    "Default router select called for interface"
                    " %s with IFEF_IPV6_ROUTER flag set\n",
-                   __func__, __LINE__, if_name(ifp)));
+                   __func__, __LINE__, if_name(ifp));
                return;
        }
 
@@ -1347,9 +1353,9 @@ defrouter_select(struct ifnet *ifp)
         * If default router list is empty, there's nothing to be done.
         */
        if (!TAILQ_FIRST(&nd_defrouter)) {
-               nd6log2((LOG_INFO,
+               nd6log2(info,
                    "%s:%d: Return early. "
-                   "Default router list is empty.\n", __func__, __LINE__));
+                   "Default router list is empty.\n", __func__, __LINE__);
                return;
        }
 
@@ -1359,18 +1365,18 @@ defrouter_select(struct ifnet *ifp)
         */
        ndi = ND_IFINFO(ifp);
        if (!ndi || !ndi->initialized) {
-               nd6log2((LOG_INFO,
+               nd6log2(info,
                    "%s:%d: Return early. "
                    "Interface %s's nd_ifinfo not initialized.\n",
-                   __func__, __LINE__, if_name(ifp)));
+                   __func__, __LINE__, if_name(ifp));
                return;
        }
 
        if (ndi->ndefrouters == 0) {
-               nd6log2((LOG_INFO,
+               nd6log2(info,
                    "%s:%d: Return early. "
                    "%s does not have any default routers.\n",
-                   __func__, __LINE__, if_name(ifp)));
+                   __func__, __LINE__, if_name(ifp));
                return;
        }
 
@@ -1418,11 +1424,11 @@ defrouter_select(struct ifnet *ifp)
                 * there's nothing else to choose from.
                 */
                if (ndi->ndefrouters == 1) {
-                       nd6log2((LOG_INFO,
+                       nd6log2(info,
                            "%s:%d: Fast forward default router selection "
                            "as interface %s has learned only one default "
                            "router and there's nothing else to choose from.\n",
-                           __func__, __LINE__, if_name(ifp)));
+                           __func__, __LINE__, if_name(ifp));
                        VERIFY(selected_dr == NULL && installed_dr == NULL);
                        selected_dr = dr;
                        if (dr->stateflags & NDDRF_INSTALLED) {
@@ -1511,9 +1517,9 @@ defrouter_select(struct ifnet *ifp)
                                lck_mtx_lock(nd6_mutex);
                        } else {
                                /* this should not happen; warn for diagnosis */
-                               nd6log((LOG_ERR, "defrouter_select: more than one "
+                               nd6log(error, "defrouter_select: more than one "
                                    "default router is installed for interface :%s.\n",
-                                   if_name(ifp)));
+                                   if_name(ifp));
                                NDDR_UNLOCK(dr);
                        }
                } else {
@@ -1541,18 +1547,18 @@ defrouter_select(struct ifnet *ifp)
                        }
 
                        if (ndi->ndefrouters == 0) {
-                               nd6log2((LOG_INFO,
+                               nd6log2(info,
                                    "%s:%d: Interface %s no longer "
                                    "has any default routers. Abort.\n",
-                                   __func__, __LINE__, if_name(ifp)));
+                                   __func__, __LINE__, if_name(ifp));
                                goto out;
                        }
-                       nd6log2((LOG_INFO,
+                       nd6log2(info,
                            "%s:%d: Iterate default router list again "
                            "for interface %s, as the list seems to have "
                            "changed during release-reaquire of global "
                            "nd6_mutex lock.\n",
-                           __func__, __LINE__, if_name(ifp)));
+                           __func__, __LINE__, if_name(ifp));
 
                        is_installed_reachable = FALSE;
                        genid = nd6_defrouter_genid;
@@ -1595,12 +1601,12 @@ defrouter_select(struct ifnet *ifp)
                }
 
                if ((selected_dr == NULL) && (installed_dr == NULL)) {
-                       nd6log2((LOG_INFO,
+                       nd6log2(info,
                            "%s:%d: Between release and reaquire of global "
                            "nd6_mutex lock, the list seems to have changed "
                            "and it does not have any default routers for "
                            "interface %s.\n",
-                           __func__, __LINE__, if_name(ifp)));
+                           __func__, __LINE__, if_name(ifp));
                        goto out;
                }
 
@@ -1639,10 +1645,10 @@ install_route:
         */
        lck_mtx_unlock(nd6_mutex);
        if (installed_dr != selected_dr) {
-               nd6log((LOG_INFO,
+               nd6log(info,
                    "%s:%d: Found a better router for interface "
                    "%s. Installing new default route.\n",
-                   __func__, __LINE__, if_name(ifp)));
+                   __func__, __LINE__, if_name(ifp));
                if (installed_dr != NULL) {
                        defrouter_delreq(installed_dr);
                }
@@ -1656,18 +1662,18 @@ install_route:
            (installed_dr->ifp == nd6_defifp)) ||
            (!(installed_dr->stateflags & NDDRF_IFSCOPE) &&
            (installed_dr->ifp != nd6_defifp))) {
-               nd6log((LOG_INFO,
+               nd6log(info,
                    "%s:%d: Need to reinstall default route for interface "
                    "%s as its scope has changed.\n",
-                   __func__, __LINE__, if_name(ifp)));
+                   __func__, __LINE__, if_name(ifp));
                defrouter_delreq(installed_dr);
                defrouter_addreq(installed_dr,
                    (installed_dr->ifp != nd6_defifp));
        } else {
-               nd6log2((LOG_INFO,
+               nd6log2(info,
                    "%s:%d: No need to change the default "
                    "route for interface %s.\n",
-                   __func__, __LINE__, if_name(ifp)));
+                   __func__, __LINE__, if_name(ifp));
        }
        lck_mtx_lock(nd6_mutex);
 out:
@@ -1795,8 +1801,8 @@ defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped)
        VERIFY(ndi->ndefrouters != 0);
        lck_mtx_unlock(&ndi->lock);
 
-       nd6log2((LOG_INFO, "%s: allocating defrouter %s\n", if_name(ifp),
-           ip6_sprintf(&new->rtaddr)));
+       nd6log2(info, "%s: allocating defrouter %s\n", if_name(ifp),
+           ip6_sprintf(&new->rtaddr));
 
        getmicrotime(&caltime);
        NDDR_LOCK(n);
@@ -2022,11 +2028,11 @@ nd6_prelist_add(struct nd_prefix *pr, struct nd_defrouter *dr,
 
                if ((e = nd6_prefix_onlink_common(new, force_scoped,
                    new->ndpr_ifp->if_index)) != 0) {
-                       nd6log((LOG_ERR, "nd6_prelist_add: failed to make "
+                       nd6log(error, "nd6_prelist_add: failed to make "
                            "the prefix %s/%d on-link %s on %s (errno=%d)\n",
                            ip6_sprintf(&new->ndpr_prefix.sin6_addr),
                            new->ndpr_plen, force_scoped ? "scoped" :
-                           "non-scoped", if_name(ifp), e));
+                           "non-scoped", if_name(ifp), e);
                        /* proceed anyway. XXX: is it correct? */
                }
        }
@@ -2082,10 +2088,10 @@ prelist_remove(struct nd_prefix *pr)
                NDPR_UNLOCK(pr);
                lck_mtx_unlock(nd6_mutex);
                if ((e = nd6_prefix_offlink(pr)) != 0) {
-                       nd6log((LOG_ERR, "prelist_remove: failed to make "
+                       nd6log(error, "prelist_remove: failed to make "
                            "%s/%d offlink on %s, errno=%d\n",
                            ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
-                           pr->ndpr_plen, if_name(ifp), e));
+                           pr->ndpr_plen, if_name(ifp), e);
                        /* what should we do? */
                }
                lck_mtx_lock(nd6_mutex);
@@ -2198,12 +2204,12 @@ prelist_update(
 
                        NDPR_UNLOCK(pr);
                        if ((e = nd6_prefix_onlink(pr)) != 0) {
-                               nd6log((LOG_ERR,
+                               nd6log(error,
                                    "prelist_update: failed to make "
                                    "the prefix %s/%d on-link on %s "
                                    "(errno=%d)\n",
                                    ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
-                                   pr->ndpr_plen, if_name(pr->ndpr_ifp), e));
+                                   pr->ndpr_plen, if_name(pr->ndpr_ifp), e);
                                /* proceed anyway. XXX: is it correct? */
                        }
                        NDPR_LOCK(pr);
@@ -2231,12 +2237,12 @@ prelist_update(
 
                error = nd6_prelist_add(new, dr, &pr, FALSE);
                if (error != 0 || pr == NULL) {
-                       nd6log((LOG_NOTICE, "prelist_update: "
+                       nd6log(info, "prelist_update: "
                            "nd6_prelist_add failed for %s/%d on %s "
                            "errno=%d, returnpr=0x%llx\n",
                            ip6_sprintf(&new->ndpr_prefix.sin6_addr),
                            new->ndpr_plen, if_name(new->ndpr_ifp),
-                           error, (uint64_t)VM_KERNEL_ADDRPERM(pr)));
+                           error, (uint64_t)VM_KERNEL_ADDRPERM(pr));
                        goto end; /* we should just give up in this case. */
                }
        }
@@ -2447,10 +2453,10 @@ prelist_update(
                        if (ip6_use_tempaddr) {
                                int e;
                                if ((e = in6_tmpifadd(ia6, 1)) != 0) {
-                                       nd6log((LOG_NOTICE, "prelist_update: "
+                                       nd6log(info, "prelist_update: "
                                            "failed to create a temporary "
                                            "address, errno=%d\n",
-                                           e));
+                                           e);
                                }
                        }
                        IFA_REMREF(&ia6->ia_ifa);
@@ -2485,8 +2491,7 @@ prelist_update(
                                                IN6_CLAT46_EVENT_V6_ADDR_CONFFAIL,
                                                0,
                                                tmp_uuid);
-                                       nd6log0((LOG_ERR, "Could not configure CLAT46 address on interface "
-                                           "%s.\n", ifp->if_xname));
+                                       nd6log0(error, "Could not configure CLAT46 address on interface %s.\n", ifp->if_xname);
                                }
                                /*
                                 * Reset the error as we do not want to
@@ -3054,11 +3059,11 @@ pfxlist_onlink_check(void)
                        NDPR_UNLOCK(pr);
                        lck_mtx_unlock(nd6_mutex);
                        if ((e = nd6_prefix_offlink(pr)) != 0) {
-                               nd6log((LOG_ERR,
+                               nd6log(error,
                                    "pfxlist_onlink_check: failed to "
                                    "make %s/%d offlink, errno=%d\n",
                                    ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
-                                   pr->ndpr_plen, e));
+                                   pr->ndpr_plen, e);
                        }
                        lck_mtx_lock(nd6_mutex);
                        NDPR_REMREF(pr);
@@ -3070,11 +3075,11 @@ pfxlist_onlink_check(void)
                    pr->ndpr_raf_onlink) {
                        NDPR_UNLOCK(pr);
                        if ((e = nd6_prefix_onlink(pr)) != 0) {
-                               nd6log((LOG_ERR,
+                               nd6log(error,
                                    "pfxlist_onlink_check: failed to "
                                    "make %s/%d offlink, errno=%d\n",
                                    ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
-                                   pr->ndpr_plen, e));
+                                   pr->ndpr_plen, e);
                        }
                        NDPR_REMREF(pr);
                        pr = nd_prefix.lh_first;
@@ -3112,8 +3117,8 @@ pfxlist_onlink_check(void)
        err = ifnet_get_address_list_family_internal(NULL, &ifap, AF_INET6, 0,
            M_NOWAIT, 0);
        if (err != 0 || ifap == NULL) {
-               nd6log((LOG_ERR, "%s: ifnet_get_address_list_family_internal "
-                   "failed", __func__));
+               nd6log(error, "%s: ifnet_get_address_list_family_internal "
+                   "failed", __func__);
                return;
        }
        for (i = 0; ifap[i]; i++) {
@@ -3286,61 +3291,61 @@ nd6_prefix_sync(struct ifnet *ifp)
                err = nd6_prefix_offlink(opr);
                lck_mtx_lock(nd6_mutex);
                if (err != 0) {
-                       nd6log((LOG_ERR,
+                       nd6log(error,
                            "%s: failed to make %s/%d offlink on %s, "
                            "errno=%d\n", __func__,
                            ip6_sprintf(&opr->ndpr_prefix.sin6_addr),
-                           opr->ndpr_plen, if_name(opr->ndpr_ifp), err));
+                           opr->ndpr_plen, if_name(opr->ndpr_ifp), err);
                }
        } else {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "%s: scoped %s/%d on %s has no matching unscoped prefix\n",
                    __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
-                   pr->ndpr_plen, if_name(pr->ndpr_ifp)));
+                   pr->ndpr_plen, if_name(pr->ndpr_ifp));
        }
 
        lck_mtx_unlock(nd6_mutex);
        err = nd6_prefix_offlink(pr);
        lck_mtx_lock(nd6_mutex);
        if (err != 0) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "%s: failed to make %s/%d offlink on %s, errno=%d\n",
                    __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
-                   pr->ndpr_plen, if_name(pr->ndpr_ifp), err));
+                   pr->ndpr_plen, if_name(pr->ndpr_ifp), err);
        }
 
        /* Add the entries back */
        if (opr != NULL) {
                err = nd6_prefix_onlink_scoped(opr, opr->ndpr_ifp->if_index);
                if (err != 0) {
-                       nd6log((LOG_ERR,
+                       nd6log(error,
                            "%s: failed to make %s/%d scoped onlink on %s, "
                            "errno=%d\n", __func__,
                            ip6_sprintf(&opr->ndpr_prefix.sin6_addr),
-                           opr->ndpr_plen, if_name(opr->ndpr_ifp), err));
+                           opr->ndpr_plen, if_name(opr->ndpr_ifp), err);
                }
        }
 
        err = nd6_prefix_onlink_scoped(pr, IFSCOPE_NONE);
        if (err != 0) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "%s: failed to make %s/%d onlink on %s, errno=%d\n",
                    __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
-                   pr->ndpr_plen, if_name(pr->ndpr_ifp), err));
+                   pr->ndpr_plen, if_name(pr->ndpr_ifp), err);
        }
 
        if (err != 0) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "%s: error promoting %s/%d to %s from %s\n",
                    __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
                    pr->ndpr_plen, if_name(pr->ndpr_ifp),
-                   (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE"));
+                   (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE");
        } else {
-               nd6log2((LOG_INFO,
+               nd6log2(info,
                    "%s: %s/%d promoted, previously on %s\n",
                    if_name(pr->ndpr_ifp),
                    ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen,
-                   (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE"));
+                   (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE");
        }
 
        if (opr != NULL) {
@@ -3365,13 +3370,13 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped,
        /* sanity check */
        NDPR_LOCK(pr);
        if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "%s: %s/%d on %s scoped=%d is already on-link\n",
                    __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
                    pr->ndpr_plen, if_name(pr->ndpr_ifp),
                    (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? 1 : 0);
-                   NDPR_UNLOCK(pr);
-                   return (EEXIST));
+               NDPR_UNLOCK(pr);
+               return EEXIST;
        }
        NDPR_UNLOCK(pr);
 
@@ -3424,11 +3429,11 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped,
                 * after removing all IPv6 addresses on the receiving
                 * interface.  This should, of course, be rare though.
                 */
-               nd6log((LOG_NOTICE,
+               nd6log(info,
                    "nd6_prefix_onlink: failed to find any ifaddr"
                    " to add route for a prefix(%s/%d) on %s\n",
                    ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
-                   pr->ndpr_plen, if_name(ifp)));
+                   pr->ndpr_plen, if_name(ifp));
                NDPR_UNLOCK(pr);
                return 0;
        }
@@ -3483,15 +3488,15 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped,
                NDPR_LOCK(pr);
        } else {
                NDPR_LOCK(pr);
-               nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add route for a"
-                   " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%lx,"
+               nd6log(error, "nd6_prefix_onlink: failed to add route for a"
+                   " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%x,"
                    " scoped=%d, errno = %d\n",
                    ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
                    pr->ndpr_plen, if_name(ifp),
                    ip6_sprintf(&((struct sockaddr_in6 *)
                    (void *)ifa->ifa_addr)->sin6_addr),
                    ip6_sprintf(&mask6.sin6_addr), rtflags,
-                   (ifscope != IFSCOPE_NONE), error));
+                   (ifscope != IFSCOPE_NONE), error);
        }
        NDPR_LOCK_ASSERT_HELD(pr);
 
@@ -3583,11 +3588,11 @@ nd6_prefix_offlink(struct nd_prefix *pr)
        /* sanity check */
        NDPR_LOCK(pr);
        if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "nd6_prefix_offlink: %s/%d on %s scoped=%d is already "
                    "off-link\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr),
                    pr->ndpr_plen, if_name(pr->ndpr_ifp),
-                   (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? 1 : 0));
+                   (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? 1 : 0);
                NDPR_UNLOCK(pr);
                return EEXIST;
        }
@@ -3622,11 +3627,11 @@ nd6_prefix_offlink(struct nd_prefix *pr)
                RT_UNLOCK(rt);
                rtfree(rt);
        } else {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "nd6_prefix_offlink: failed to delete route: "
                    "%s/%d on %s, scoped %d, (errno = %d)\n",
                    ip6_sprintf(&sa6.sin6_addr), plen, if_name(ifp),
-                   (ifscope != IFSCOPE_NONE), error));
+                   (ifscope != IFSCOPE_NONE), error);
        }
 
        if (ndpr_rt != NULL) {
@@ -3700,16 +3705,16 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, boolean_t i
                 * stateless autoconfiguration not yet well-defined for IID
                 * lengths other than 64 bits. Just give up for now.
                 */
-               nd6log((LOG_INFO, "%s: IID length not 64 bits (%s)\n",
-                   __func__, if_name(ifp)));
+               nd6log(info, "%s: IID length not 64 bits (%s)\n",
+                   __func__, if_name(ifp));
                goto unlock1;
        }
 
        if (iidlen + pr->ndpr_plen != 128) {
                error = EADDRNOTAVAIL;
-               nd6log((LOG_INFO,
+               nd6log(info,
                    "%s: invalid prefix length %d for %s, ignored\n",
-                   __func__, pr->ndpr_plen, if_name(ifp)));
+                   __func__, pr->ndpr_plen, if_name(ifp));
                goto unlock1;
        }
 
@@ -3741,8 +3746,8 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, boolean_t i
                ia6 = in6ifa_ifpforlinklocal(ifp, 0);
                if (ia6 == NULL) {
                        error = EADDRNOTAVAIL;
-                       nd6log((LOG_INFO, "%s: no link-local address (%s)\n",
-                           __func__, if_name(ifp)));
+                       nd6log(info, "%s: no link-local address (%s)\n",
+                           __func__, if_name(ifp));
                        goto done;
                }
 
@@ -3796,11 +3801,11 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, boolean_t i
                        }
                } else {
                        if (!is_clat46) {
-                               nd6log((LOG_ERR, "%s: no CGA available (%s)\n",
-                                   __func__, if_name(ifp)));
+                               nd6log(error, "%s: no CGA available (%s)\n",
+                                   __func__, if_name(ifp));
                        } else {
-                               nd6log((LOG_ERR, "%s: no CLAT46 available (%s)\n",
-                                   __func__, if_name(ifp)));
+                               nd6log(error, "%s: no CLAT46 available (%s)\n",
+                                   __func__, if_name(ifp));
                        }
                        goto done;
                }
@@ -3850,10 +3855,10 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, boolean_t i
        }
        error = in6_update_ifa(ifp, &ifra, ifaupdate, &ia6);
        if (error != 0) {
-               nd6log((LOG_ERR,
+               nd6log(error,
                    "%s: failed to make ifaddr %s on %s (errno=%d)\n",
                    __func__, ip6_sprintf(&ifra.ifra_addr.sin6_addr),
-                   if_name(ifp), error));
+                   if_name(ifp), error);
                error = EADDRNOTAVAIL;
                goto done;
        }
@@ -3918,8 +3923,8 @@ again:
        if ((ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr)) != NULL) {
                IFA_REMREF(&ia->ia_ifa);
                if (trylimit-- == 0) {
-                       nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find "
-                           "a unique random IFID\n"));
+                       nd6log(info, "in6_tmpifadd: failed to find "
+                           "a unique random IFID\n");
                        return EEXIST;
                }
                forcegen = 1;
@@ -3974,7 +3979,7 @@ again:
        ifaupdate = IN6_IFAUPDATE_NOWAIT | IN6_IFAUPDATE_DADDELAY;
        error = in6_update_ifa(ifp, &ifra, ifaupdate, &newia);
        if (error != 0) {
-               nd6log((LOG_ERR, "in6_tmpifadd: failed to add address.\n"));
+               nd6log(error, "in6_tmpifadd: failed to add address.\n");
                return error;
        }
        VERIFY(newia != NULL);
@@ -3986,7 +3991,7 @@ again:
                 * We lost the race with another thread that has purged
                 * ia0 address; in this case, purge the tmp addr as well.
                 */
-               nd6log((LOG_ERR, "in6_tmpifadd: no public address\n"));
+               nd6log(error, "in6_tmpifadd: no public address\n");
                VERIFY(!(ia0->ia6_flags & IN6_IFF_AUTOCONF));
                IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa);
                in6_purgeaddr(&newia->ia_ifa);
@@ -4044,9 +4049,9 @@ in6_init_prefix_ltimes(struct nd_prefix *ndpr)
 
        /* check if preferred lifetime > valid lifetime.  RFC 4862 5.5.3 (c) */
        if (ndpr->ndpr_pltime > ndpr->ndpr_vltime) {
-               nd6log((LOG_INFO, "in6_init_prefix_ltimes: preferred lifetime"
+               nd6log(info, "in6_init_prefix_ltimes: preferred lifetime"
                    "(%d) is greater than valid lifetime(%d)\n",
-                   (u_int)ndpr->ndpr_pltime, (u_int)ndpr->ndpr_vltime));
+                   (u_int)ndpr->ndpr_pltime, (u_int)ndpr->ndpr_vltime);
                return EINVAL;
        }
        if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) {
@@ -4188,11 +4193,11 @@ nd6_setdefaultiface(
                }
 
                if (nd6_defifp != NULL) {
-                       nd6log((LOG_INFO, "%s: is now the default "
+                       nd6log(info, "%s: is now the default "
                            "interface (was %s)\n", if_name(nd6_defifp),
-                           odef_ifp != NULL ? if_name(odef_ifp) : "NONE"));
+                           odef_ifp != NULL ? if_name(odef_ifp) : "NONE");
                } else {
-                       nd6log((LOG_INFO, "No default interface set\n"));
+                       nd6log(info, "No default interface set\n");
                }
 
                /*
index b1ce62b0cfece81d19fa8ca8e887f56688a96bf7..5b2b17517f61c6c8dd5d46b5585cb8f54af612c9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -397,6 +397,9 @@ rip6_output(
        if (INP_NO_EXPENSIVE(in6p)) {
                ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
        }
+       if (INP_NO_CONSTRAINED(in6p)) {
+               ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED;
+       }
        if (INP_AWDL_UNRESTRICTED(in6p)) {
                ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
        }
@@ -710,9 +713,20 @@ rip6_output(
                 * route is not multicast, update outif with that of
                 * the route interface index used by IP.
                 */
-               if (rt != NULL &&
-                   (outif = rt->rt_ifp) != in6p->in6p_last_outifp) {
-                       in6p->in6p_last_outifp = outif;
+               if (rt != NULL) {
+                       /*
+                        * When an NECP IP tunnel policy forces the outbound interface,
+                        * ip6_output_list() informs the transport layer what is the actual
+                        * ip6_output_list() informs the transport layer of the actual
+                        * outgoing interface.
+                       if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) {
+                               outif = ifindex2ifnet[ip6oa.ip6oa_boundif];
+                       } else {
+                               outif = rt->rt_ifp;
+                       }
+                       if (outif != NULL) {
+                               in6p->in6p_last_outifp = outif;
+                       }
                }
        } else {
                ROUTE_RELEASE(&in6p->in6p_route);
@@ -723,7 +737,7 @@ rip6_output(
         * denied access to it, generate an event.
         */
        if (error != 0 && (ip6oa.ip6oa_retflags & IP6OARF_IFDENIED) &&
-           (INP_NO_CELLULAR(in6p) || INP_NO_EXPENSIVE(in6p))) {
+           (INP_NO_CELLULAR(in6p) || INP_NO_EXPENSIVE(in6p) || INP_NO_CONSTRAINED(in6p))) {
                soevent(in6p->inp_socket, (SO_FILT_HINT_LOCKED |
                    SO_FILT_HINT_IFDENIED));
        }
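
Both rip6_output() above and udp6_output() below gain the same pattern: when NECP has forced the outbound interface (IP6OAF_BOUND_IF set in the output args), the cached last-output interface must reflect the forced interface rather than the route's. A minimal sketch of the shared logic, using a hypothetical helper name that is not part of the patch:

    /* hypothetical helper: pick the interface the packet actually left on */
    static struct ifnet *
    effective_outif(const struct ip6_out_args *ip6oa, const struct rtentry *rt)
    {
        if (ip6oa->ip6oa_flags & IP6OAF_BOUND_IF) {
            /* NECP pinned the interface; resolve its index */
            return ifindex2ifnet[ip6oa->ip6oa_boundif];
        }
        return rt->rt_ifp; /* otherwise the route's interface */
    }

Guarding the in6p_last_outifp assignment behind outif != NULL also avoids caching a stale pointer when the bound index does not resolve to an interface.
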
index 7898c179fb7d22236b49fc5596e0354e5ef49255..66025ca433e5c673ef665c5a7f1ae34f29e34764 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -200,6 +200,9 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
        if (INP_NO_EXPENSIVE(in6p)) {
                ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
        }
+       if (INP_NO_CONSTRAINED(in6p)) {
+               ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED;
+       }
        if (INP_AWDL_UNRESTRICTED(in6p)) {
                ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
        }
@@ -568,6 +571,9 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
                VERIFY(in6p->inp_sndinprog_cnt > 0);
                if (--in6p->inp_sndinprog_cnt == 0) {
                        in6p->inp_flags &= ~(INP_FC_FEEDBACK);
+                       if (in6p->inp_sndingprog_waiters > 0) {
+                               wakeup(&in6p->inp_sndinprog_cnt);
+                       }
                }
 
                if (ro.ro_rt != NULL) {
@@ -612,16 +618,27 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
                         * If the destination route is unicast, update outif
                         * with that of the route interface used by IP.
                         */
-                       if (rt != NULL &&
-                           (outif = rt->rt_ifp) != in6p->in6p_last_outifp) {
-                               in6p->in6p_last_outifp = outif;
-
-                               so->so_pktheadroom = P2ROUNDUP(
-                                       sizeof(struct udphdr) +
-                                       hlen +
-                                       ifnet_hdrlen(outif) +
-                                       ifnet_mbuf_packetpreamblelen(outif),
-                                       sizeof(u_int32_t));
+                       if (rt != NULL) {
+                               /*
+                                * When an NECP IP tunnel policy forces the outbound interface,
+                                * ip6_output_list() informs the transport layer of the actual
+                                * outgoing interface.
+                                */
+                               if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) {
+                                       outif = ifindex2ifnet[ip6oa.ip6oa_boundif];
+                               } else {
+                                       outif = rt->rt_ifp;
+                               }
+                               if (outif != NULL && outif != in6p->in6p_last_outifp) {
+                                       in6p->in6p_last_outifp = outif;
+
+                                       so->so_pktheadroom = P2ROUNDUP(
+                                               sizeof(struct udphdr) +
+                                               hlen +
+                                               ifnet_hdrlen(outif) +
+                                               ifnet_mbuf_packetpreamblelen(outif),
+                                               sizeof(u_int32_t));
+                               }
                        }
                } else {
                        ROUTE_RELEASE(&in6p->in6p_route);
@@ -632,7 +649,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
                 * socket is denied access to it, generate an event.
                 */
                if (error != 0 && (ip6oa.ip6oa_retflags & IP6OARF_IFDENIED) &&
-                   (INP_NO_CELLULAR(in6p) || INP_NO_EXPENSIVE(in6p))) {
+                   (INP_NO_CELLULAR(in6p) || INP_NO_EXPENSIVE(in6p) || INP_NO_CONSTRAINED(in6p))) {
                        soevent(in6p->inp_socket, (SO_FILT_HINT_LOCKED |
                            SO_FILT_HINT_IFDENIED));
                }
index 64b57f861fcf2e82aa4229cbf2e01c0335e7479d..9b4c3a16e174e29492874b166923296ba2e0ec67 100644 (file)
@@ -484,13 +484,14 @@ udp6_input(struct mbuf **mp, int *offp, int proto)
 #if IPSEC
        /*
         * UDP to port 4500 with a payload where the first four bytes are
-        * not zero is a UDP encapsulated IPSec packet. Packets where
+        * not zero is a UDP encapsulated IPsec packet. Packets where
         * the payload is one byte and that byte is 0xFF are NAT keepalive
-        * packets. Decapsulate the ESP packet and carry on with IPSec input
+        * packets. Decapsulate the ESP packet and carry on with IPsec input
         * or discard the NAT keep-alive.
         */
        if (ipsec_bypass == 0 && (esp_udp_encap_port & 0xFFFF) != 0 &&
-           uh->uh_dport == ntohs((u_short)esp_udp_encap_port)) {
+           (uh->uh_dport == ntohs((u_short)esp_udp_encap_port) ||
+           uh->uh_sport == ntohs((u_short)esp_udp_encap_port))) {
                int payload_len = ulen - sizeof(struct udphdr) > 4 ? 4 :
                    ulen - sizeof(struct udphdr);
 
@@ -515,7 +516,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto)
                        goto bad;
                } else if (payload_len == 4 && *(u_int32_t*)(void *)
                    ((caddr_t)uh + sizeof(struct udphdr)) != 0) {
-                       /* UDP encapsulated IPSec packet to pass through NAT */
+                       /* UDP encapsulated IPsec packet to pass through NAT */
                        /* preserve the udp header */
                        *offp = off + sizeof(struct udphdr);
                        return esp6_input(mp, offp, IPPROTO_UDP);
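
The comment above describes the RFC 3948 demultiplexing rule on the ESP UDP-encapsulation port, which this change now matches on either the source or the destination port: a one-octet 0xFF payload is a NAT-keepalive, a leading 32-bit zero word is the non-ESP marker carried by IKE traffic, and anything else is an encapsulated ESP packet. A standalone sketch of that rule, assuming only the RFC's wire format:

    #include <stddef.h>
    #include <stdint.h>

    enum encap_kind { ENCAP_IKE, ENCAP_NAT_KEEPALIVE, ENCAP_ESP };

    static enum encap_kind
    classify_encap_payload(const uint8_t *p, size_t len)
    {
        if (len == 1 && p[0] == 0xFF) {
            return ENCAP_NAT_KEEPALIVE;   /* RFC 3948 NAT-keepalive probe */
        }
        if (len >= 4 && (p[0] | p[1] | p[2] | p[3]) == 0) {
            return ENCAP_IKE;             /* four zero octets: non-ESP marker */
        }
        return ENCAP_ESP;                 /* non-zero SPI: decapsulate as ESP */
    }
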
index a340f34b781811cee941126e56910e3912448c32..e1230e472fa8f8fbebcce6eef4da779a333e8a10 100644 (file)
 #include <netinet6/esp6.h>
 #endif
 #endif
-#include <netinet6/ipcomp.h>
-#if INET6
-#include <netinet6/ipcomp6.h>
-#endif
 
 
 /* randomness */
@@ -546,7 +542,6 @@ static void key_getcomb_setlifetime(struct sadb_comb *);
 static struct mbuf *key_getcomb_esp(void);
 #endif
 static struct mbuf *key_getcomb_ah(void);
-static struct mbuf *key_getcomb_ipcomp(void);
 static struct mbuf *key_getprop(const struct secasindex *);
 
 static int key_acquire(struct secasindex *, struct secpolicy *);
@@ -612,6 +607,7 @@ key_init(struct protosw *pp, struct domain *dp)
        VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
 
        _CASSERT(PFKEY_ALIGN8(sizeof(struct sadb_msg)) <= _MHLEN);
+       _CASSERT(MAX_REPLAY_WINDOWS == MBUF_TC_MAX);
 
        if (key_initialized) {
                return;
@@ -670,6 +666,7 @@ key_init(struct protosw *pp, struct domain *dp)
        /* initialize key statistics */
        keystat.getspi_count = 1;
 
+       esp_init();
 #ifndef __APPLE__
        printf("IPsec: Initialized Security Association Processing.\n");
 #endif
@@ -898,7 +895,7 @@ key_alloc_outbound_sav_for_interface(ifnet_t interface, int family,
                                        }
                                }
 
-                               /* This SAH is linked to the IPSec interface, and the right family. We found it! */
+                               /* This SAH is linked to the IPsec interface, and the right family. We found it! */
                                if (key_preferred_oldsa) {
                                        saorder_state_valid = saorder_state_valid_prefer_old;
                                        arraysize = _ARRAYLEN(saorder_state_valid_prefer_old);
@@ -1915,7 +1912,6 @@ key_msg2sp(
                        switch (xisr->sadb_x_ipsecrequest_proto) {
                        case IPPROTO_ESP:
                        case IPPROTO_AH:
-                       case IPPROTO_IPCOMP:
                                break;
                        default:
                                ipseclog((LOG_DEBUG,
@@ -3943,6 +3939,7 @@ key_newsav(
        LIST_INSERT_TAIL(&sah->savtree[SADB_SASTATE_LARVAL], newsav,
            secasvar, chain);
        ipsec_sav_count++;
+       ipsec_monitor_sleep_wake();
 
        return newsav;
 }
@@ -4111,9 +4108,12 @@ key_delsav(
                KFREE(sav->sched);
                sav->sched = NULL;
        }
-       if (sav->replay != NULL) {
-               keydb_delsecreplay(sav->replay);
-               sav->replay = NULL;
+
+       for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) {
+               if (sav->replay[i] != NULL) {
+                       keydb_delsecreplay(sav->replay[i]);
+                       sav->replay[i] = NULL;
+               }
        }
        if (sav->lft_c != NULL) {
                KFREE(sav->lft_c);
@@ -4298,7 +4298,9 @@ key_setsaval(
        }
 
        /* initialization */
-       sav->replay = NULL;
+       for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) {
+               sav->replay[i] = NULL;
+       }
        sav->key_auth = NULL;
        sav->key_enc = NULL;
        sav->sched = NULL;
@@ -4337,6 +4339,7 @@ key_setsaval(
                                error = EINVAL;
                                goto fail;
                        }
+                       sav->natt_encapsulated_src_port = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_src_port;
                        sav->remote_ike_port = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_port;
                        sav->natt_interval = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_interval;
                        sav->natt_offload_interval = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_offload_interval;
@@ -4356,11 +4359,28 @@ key_setsaval(
 
                /* replay window */
                if ((sa0->sadb_sa_flags & SADB_X_EXT_OLD) == 0) {
-                       sav->replay = keydb_newsecreplay(sa0->sadb_sa_replay);
-                       if (sav->replay == NULL) {
-                               ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n"));
-                               error = ENOBUFS;
-                               goto fail;
+                       if ((sav->flags2 & SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) ==
+                           SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) {
+                               uint32_t range = (1ULL << (sizeof(((struct secreplay *)0)->count) * 8)) / MAX_REPLAY_WINDOWS;
+                               for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) {
+                                       sav->replay[i] = keydb_newsecreplay(sa0->sadb_sa_replay);
+                                       if (sav->replay[i] == NULL) {
+                                               ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n"));
+                                               error = ENOBUFS;
+                                               goto fail;
+                                       }
+                                       /* Allowed range for sequence per traffic class */
+                                       sav->replay[i]->count = i * range;
+                                       sav->replay[i]->lastseq = ((i + 1) * range) - 1;
+                               }
+                       } else {
+                               sav->replay[0] = keydb_newsecreplay(sa0->sadb_sa_replay);
+                               if (sav->replay[0] == NULL) {
+                                       ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n"));
+                                       error = ENOBUFS;
+                                       goto fail;
+                               }
+                               sav->replay[0]->lastseq = ~0;
                        }
                }
        }
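
The arithmetic above splits the sequence space evenly across the windows: secreplay's count is a u_int32_t, so range = 2^32 / MAX_REPLAY_WINDOWS = 0x40000000, and traffic class i numbers its packets from i * range up to (i + 1) * range - 1. A self-contained illustration of the resulting layout, assuming only that the counter is 32 bits wide and that there are four windows:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_REPLAY_WINDOWS 4

    int
    main(void)
    {
        uint32_t range = (uint32_t)((1ULL << 32) / MAX_REPLAY_WINDOWS);
        for (uint32_t i = 0; i < MAX_REPLAY_WINDOWS; i++) {
            /* class i owns [i*range, (i+1)*range - 1]; the last class wraps to 0xffffffff */
            printf("class %" PRIu32 ": count=0x%08" PRIx32 " lastseq=0x%08" PRIx32 "\n",
                i, i * range, (i + 1) * range - 1);
        }
        return 0;
    }

For the last class this prints lastseq=0xffffffff, matching the single-window branch where lastseq is simply set to ~0.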
@@ -4387,7 +4407,6 @@ key_setsaval(
                                error = EINVAL;
                        }
                        break;
-               case SADB_X_SATYPE_IPCOMP:
                default:
                        error = EINVAL;
                        break;
@@ -4434,12 +4453,6 @@ key_setsaval(
                                goto fail;
                        }
                        break;
-               case SADB_X_SATYPE_IPCOMP:
-                       if (len != PFKEY_ALIGN8(sizeof(struct sadb_key))) {
-                               error = EINVAL;
-                       }
-                       sav->key_enc = NULL;            /*just in case*/
-                       break;
                case SADB_SATYPE_AH:
                default:
                        error = EINVAL;
@@ -4485,7 +4498,6 @@ key_setsaval(
 #endif
                break;
        case SADB_SATYPE_AH:
-       case SADB_X_SATYPE_IPCOMP:
                break;
        default:
                ipseclog((LOG_DEBUG, "key_setsaval: invalid SA type.\n"));
@@ -4567,9 +4579,11 @@ key_setsaval(
 
 fail:
        /* initialization */
-       if (sav->replay != NULL) {
-               keydb_delsecreplay(sav->replay);
-               sav->replay = NULL;
+       for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) {
+               if (sav->replay[i] != NULL) {
+                       keydb_delsecreplay(sav->replay[i]);
+                       sav->replay[i] = NULL;
+               }
        }
        if (sav->key_auth != NULL) {
                bzero(_KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
@@ -4641,7 +4655,10 @@ key_setsaval2(struct secasvar      *sav,
        LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_OWNED);
 
        /* initialization */
-       sav->replay = NULL;
+       for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) {
+               sav->replay[i] = NULL;
+       }
+
        sav->key_auth = NULL;
        sav->key_enc = NULL;
        sav->sched = NULL;
@@ -4688,11 +4705,28 @@ key_setsaval2(struct secasvar      *sav,
 
        /* replay window */
        if ((flags & SADB_X_EXT_OLD) == 0) {
-               sav->replay = keydb_newsecreplay(replay);
-               if (sav->replay == NULL) {
-                       ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n"));
-                       error = ENOBUFS;
-                       goto fail;
+               if ((sav->flags2 & SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) ==
+                   SADB_X_EXT_SA2_SEQ_PER_TRAFFIC_CLASS) {
+                       uint32_t range = (1ULL << (sizeof(((struct secreplay *)0)->count) * 8)) / MAX_REPLAY_WINDOWS;
+                       for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) {
+                               sav->replay[i] = keydb_newsecreplay(replay);
+                               if (sav->replay[i] == NULL) {
+                                       ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n"));
+                                       error = ENOBUFS;
+                                       goto fail;
+                               }
+                               /* Allowed range for sequence per traffic class */
+                               sav->replay[i]->count = i * range;
+                               sav->replay[i]->lastseq = ((i + 1) * range) - 1;
+                       }
+               } else {
+                       sav->replay[0] = keydb_newsecreplay(replay);
+                       if (sav->replay[0] == NULL) {
+                               ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n"));
+                               error = ENOBUFS;
+                               goto fail;
+                       }
+                       sav->replay[0]->lastseq = ~0;
                }
        }
 
@@ -4792,9 +4826,11 @@ key_setsaval2(struct secasvar      *sav,
 
 fail:
        /* initialization */
-       if (sav->replay != NULL) {
-               keydb_delsecreplay(sav->replay);
-               sav->replay = NULL;
+       for (int i = 0; i < MAX_REPLAY_WINDOWS; i++) {
+               if (sav->replay[i] != NULL) {
+                       keydb_delsecreplay(sav->replay[i]);
+                       sav->replay[i] = NULL;
+               }
        }
        if (sav->key_auth != NULL) {
                bzero(_KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
@@ -4895,20 +4931,6 @@ key_mature(
                checkmask = 2;
                mustmask = 2;
                break;
-       case IPPROTO_IPCOMP:
-               if (sav->alg_auth != SADB_AALG_NONE) {
-                       ipseclog((LOG_DEBUG, "key_mature: "
-                           "protocol and algorithm mismated.\n"));
-                       return EINVAL;
-               }
-               if ((sav->flags & SADB_X_EXT_RAWCPI) == 0
-                   && ntohl(sav->spi) >= 0x10000) {
-                       ipseclog((LOG_DEBUG, "key_mature: invalid cpi for IPComp.\n"));
-                       return EINVAL;
-               }
-               checkmask = 4;
-               mustmask = 4;
-               break;
        default:
                ipseclog((LOG_DEBUG, "key_mature: Invalid satype.\n"));
                return EPROTONOSUPPORT;
@@ -5000,18 +5022,6 @@ key_mature(
 #endif
        }
 
-       /* check compression algorithm */
-       if ((checkmask & 4) != 0) {
-               const struct ipcomp_algorithm *algo;
-
-               /* algorithm-dependent check */
-               algo = ipcomp_algorithm_lookup(sav->alg_enc);
-               if (!algo) {
-                       ipseclog((LOG_DEBUG, "key_mature: unknown compression algorithm.\n"));
-                       return EINVAL;
-               }
-       }
-
        key_sa_chgstate(sav, SADB_SASTATE_MATURE);
 
        return 0;
@@ -5060,7 +5070,7 @@ key_setdumpsa(
 
                case SADB_X_EXT_SA2:
                        m = key_setsadbxsa2(sav->sah->saidx.mode,
-                           sav->replay ? sav->replay->count : 0,
+                           sav->replay[0] ? sav->replay[0]->count : 0,
                            sav->sah->saidx.reqid,
                            sav->flags2);
                        if (!m) {
@@ -5268,7 +5278,7 @@ key_setsadbsa(
        p->sadb_sa_len = PFKEY_UNIT64(len);
        p->sadb_sa_exttype = SADB_EXT_SA;
        p->sadb_sa_spi = sav->spi;
-       p->sadb_sa_replay = (sav->replay != NULL ? sav->replay->wsize : 0);
+       p->sadb_sa_replay = (sav->replay[0] != NULL ? sav->replay[0]->wsize : 0);
        p->sadb_sa_state = sav->state;
        p->sadb_sa_auth = sav->alg_auth;
        p->sadb_sa_encrypt = sav->alg_enc;
@@ -6684,8 +6694,6 @@ key_satype2proto(
                return IPPROTO_AH;
        case SADB_SATYPE_ESP:
                return IPPROTO_ESP;
-       case SADB_X_SATYPE_IPCOMP:
-               return IPPROTO_IPCOMP;
        default:
                return 0;
        }
@@ -6706,8 +6714,6 @@ key_proto2satype(
                return SADB_SATYPE_AH;
        case IPPROTO_ESP:
                return SADB_SATYPE_ESP;
-       case IPPROTO_IPCOMP:
-               return SADB_X_SATYPE_IPCOMP;
        default:
                return 0;
        }
@@ -7063,20 +7069,6 @@ key_do_getnewspi(
                keymin = key_spi_minval;
                keymax = key_spi_maxval;
        }
-       /* IPCOMP needs 2-byte SPI */
-       if (saidx->proto == IPPROTO_IPCOMP) {
-               u_int32_t t;
-               if (keymin >= 0x10000) {
-                       keymin = 0xffff;
-               }
-               if (keymax >= 0x10000) {
-                       keymax = 0xffff;
-               }
-               if (keymin > keymax) {
-                       t = keymin; keymin = keymax; keymax = t;
-               }
-       }
-
        if (keymin == keymax) {
                if (key_checkspidup(saidx, keymin) != NULL) {
                        ipseclog((LOG_DEBUG, "key_do_getnewspi: SPI %u exists already.\n", keymin));
@@ -7419,6 +7411,7 @@ key_migrate(struct socket *so,
 
        /* Reset NAT values */
        sav->flags = sa0->sadb_sa_flags;
+       sav->natt_encapsulated_src_port = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_src_port;
        sav->remote_ike_port = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_port;
        sav->natt_interval = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_interval;
        sav->natt_offload_interval = ((const struct sadb_sa_2*)(sa0))->sadb_sa_natt_offload_interval;
@@ -8380,55 +8373,6 @@ key_getcomb_ah(void)
        return m;
 }
 
-/*
- * not really an official behavior.  discussed in pf_key@inner.net in Sep2000.
- * XXX reorder combinations by preference
- */
-static struct mbuf *
-key_getcomb_ipcomp(void)
-{
-       struct sadb_comb *comb;
-       const struct ipcomp_algorithm *algo;
-       struct mbuf *m;
-       int i;
-       const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
-
-       m = NULL;
-       for (i = 1; i <= SADB_X_CALG_MAX; i++) {
-               algo = ipcomp_algorithm_lookup(i);
-               if (!algo) {
-                       continue;
-               }
-
-               if (!m) {
-#if DIAGNOSTIC
-                       if (l > MLEN) {
-                               panic("assumption failed in key_getcomb_ipcomp");
-                       }
-#endif
-                       MGET(m, M_WAITOK, MT_DATA);
-                       if (m) {
-                               M_ALIGN(m, l);
-                               m->m_len = l;
-                               m->m_next = NULL;
-                       }
-               } else {
-                       M_PREPEND(m, l, M_WAITOK, 1);
-               }
-               if (!m) {
-                       return NULL;
-               }
-
-               comb = mtod(m, struct sadb_comb *);
-               bzero(comb, sizeof(*comb));
-               key_getcomb_setlifetime(comb);
-               comb->sadb_comb_encrypt = i;
-               /* what should we set into sadb_comb_*_{min,max}bits? */
-       }
-
-       return m;
-}
-
 /*
  * XXX no way to pass mode (transport/tunnel) to userland
  * XXX replay checking?
@@ -8452,9 +8396,6 @@ key_getprop(
        case IPPROTO_AH:
                m = key_getcomb_ah();
                break;
-       case IPPROTO_IPCOMP:
-               m = key_getcomb_ipcomp();
-               break;
        default:
                return NULL;
        }
@@ -8494,8 +8435,6 @@ key_getprop(
  *
  * XXX x_policy is outside of RFC2367 (KAME extension).
  * XXX sensitivity is not supported.
- * XXX for ipcomp, RFC2367 does not define how to fill in proposal.
- * see comment for key_getcomb_ipcomp().
  *
  * OUT:
  *    0     : succeed
@@ -8644,25 +8583,12 @@ key_acquire(
 
        /* create proposal/combination extension */
        m = key_getprop(saidx);
-#if 0
-       /*
-        * spec conformant: always attach proposal/combination extension,
-        * the problem is that we have no way to attach it for ipcomp,
-        * due to the way sadb_comb is declared in RFC2367.
-        */
-       if (!m) {
-               error = ENOBUFS;
-               goto fail;
-       }
-       m_cat(result, m);
-#else
        /*
         * outside of spec; make proposal/combination extension optional.
         */
        if (m) {
                m_cat(result, m);
        }
-#endif
 
        if ((result->m_flags & M_PKTHDR) == 0) {
                error = EINVAL;
@@ -9123,7 +9049,7 @@ setmsg:
                }
 #endif
 
-#if DIGAGNOSTIC
+#if DIAGNOSTIC
                if (off != len) {
                        panic("length assumption failed in key_register");
                }
@@ -9248,7 +9174,7 @@ key_expire(
 
        /* create SA extension */
        m = key_setsadbxsa2(sav->sah->saidx.mode,
-           sav->replay ? sav->replay->count : 0,
+           sav->replay[0] ? sav->replay[0]->count : 0,
            sav->sah->saidx.reqid,
            sav->flags2);
        if (!m) {
@@ -9825,7 +9751,7 @@ key_parse(
        target = KEY_SENDUP_ONE;
 
        if ((m->m_flags & M_PKTHDR) == 0 ||
-           m->m_pkthdr.len != m->m_pkthdr.len) {
+           m->m_pkthdr.len != orglen) {
                ipseclog((LOG_DEBUG, "key_parse: invalid message length.\n"));
                PFKEY_STAT_INCREMENT(pfkeystat.out_invlen);
                error = EINVAL;
@@ -9913,7 +9839,6 @@ key_parse(
                break;
        case SADB_SATYPE_AH:
        case SADB_SATYPE_ESP:
-       case SADB_X_SATYPE_IPCOMP:
                switch (msg->sadb_msg_type) {
                case SADB_X_SPDADD:
                case SADB_X_SPDDELETE:
@@ -10755,7 +10680,7 @@ key_delsp_for_ipsec_if(ifnet_t ipsec_if)
 
        LIST_FOREACH(sah, &sahtree, chain) {
                if (sah->ipsec_if == ipsec_if) {
-                       /* This SAH is linked to the IPSec interface. It now needs to close. */
+                       /* This SAH is linked to the IPsec interface. It now needs to close. */
                        ifnet_release(sah->ipsec_if);
                        sah->ipsec_if = NULL;
 
index f1be1810444ff21bea28c378935b04ec33f9b6e2..ac3a0cb0c020bd6693f74ac1f4a71e3047a3152a 100644 (file)
@@ -619,8 +619,8 @@ kdebug_secasv(sav)
                printf("\n");
        }
 
-       if (sav->replay != NULL)
-               kdebug_secreplay(sav->replay);
+       if (sav->replay[0] != NULL)
+               kdebug_secreplay(sav->replay[0]);
        if (sav->lft_c != NULL)
                kdebug_sadb_lifetime((struct sadb_ext *)sav->lft_c);
        if (sav->lft_h != NULL)
index 19450e6bdeaaf685f8af19bd21594b438404a0a3..db7a04ef30eb3cc0ed24c795181662ad406f4ca6 100644 (file)
@@ -70,6 +70,8 @@ struct secashead {
        struct route_in6 sa_route;              /* route cache */
 };
 
+#define MAX_REPLAY_WINDOWS 4
+
 /* Security Association */
 struct secasvar {
        LIST_ENTRY(secasvar) chain;
@@ -90,7 +92,8 @@ struct secasvar {
        void *sched;                    /* intermediate encryption key */
        size_t schedlen;
 
-       struct secreplay *replay;       /* replay prevention */
+       struct secreplay *replay[MAX_REPLAY_WINDOWS]; /* replay prevention */
+
        long created;                   /* for lifetime */
 
        struct sadb_lifetime *lft_c;    /* CURRENT lifetime, it's constant. */
@@ -119,7 +122,7 @@ struct secreplay {
        u_int32_t count;
        u_int wsize;            /* window size, e.g. 4 bytes */
        u_int32_t seq;          /* used by sender */
-       u_int32_t lastseq;      /* used by receiver */
+       u_int32_t lastseq;      /* used by sender/receiver */
        caddr_t bitmap;         /* used by receiver */
        int overflow;           /* overflow flag */
 };
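
The two hunks above turn the single replay-prevention pointer into an array of MAX_REPLAY_WINDOWS windows (the sav->replay[0] accesses earlier in this diff follow suit), and note that lastseq is now maintained on both sides. For orientation, here is a minimal user-space sketch of the kind of sliding anti-replay window that struct secreplay describes; the field names mirror the struct, but this is a simplification, not the kernel's implementation (which keeps a variable-size bitmap of wsize bytes rather than one 32-bit word):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-in for struct secreplay: one 32-packet window. */
    struct replay_win {
        uint32_t lastseq;   /* highest sequence number seen */
        uint32_t bitmap;    /* bit i set => (lastseq - i) already seen */
    };

    /* Accept each sequence number once; drop replays and stale packets. */
    static bool replay_check(struct replay_win *w, uint32_t seq)
    {
        if (seq > w->lastseq) {                 /* new high-water mark */
            uint32_t shift = seq - w->lastseq;
            w->bitmap = (shift < 32) ? (w->bitmap << shift) | 1 : 1;
            w->lastseq = seq;
            return true;
        }
        uint32_t off = w->lastseq - seq;
        if (off >= 32) {
            return false;                       /* left of the window */
        }
        if (w->bitmap & (1u << off)) {
            return false;                       /* already seen: replay */
        }
        w->bitmap |= 1u << off;
        return true;
    }

    int main(void)
    {
        struct replay_win w = { 0, 0 };
        uint32_t probes[] = { 1, 3, 2, 3, 40, 9 };
        for (size_t i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) {
            printf("seq %2u -> %s\n", (unsigned)probes[i],
                replay_check(&w, probes[i]) ? "accept" : "drop");
        }
        return 0;
    }

The second probe of 3 is dropped as a replay; 40 slides the window forward so 9 (offset 31) is still acceptable.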
index 9b7f46424e995ae000c9340c74616e9a8e80d005..83dd1d8748f30749ece343d1742efcd1f15753b6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -87,8 +87,8 @@
 extern lck_mtx_t *raw_mtx;
 extern void key_init(struct protosw *, struct domain *);
 
-struct sockaddr key_dst = { 2, PF_KEY, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } };
-struct sockaddr key_src = { 2, PF_KEY, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } };
+struct sockaddr key_dst = { .sa_len = 2, .sa_family = PF_KEY, .sa_data = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } };
+struct sockaddr key_src = { .sa_len = 2, .sa_family = PF_KEY, .sa_data = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } };
 
 static void key_dinit(struct domain *);
 static int key_sendup0(struct rawcb *, struct mbuf *, int);
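
The key_dst/key_src rewrite above, like the many struct timespec ts = { .tv_sec = N, .tv_nsec = 0 } changes later in this diff, swaps positional aggregate initializers for C99 designated initializers: the initialization survives any field reordering, and the bare 2 becomes a labeled sa_len instead of a magic value. A tiny standalone illustration (the struct is a stand-in, not the real sockaddr):

    #include <stdio.h>

    /* Stand-in with the same shape as the start of struct sockaddr. */
    struct sa_like {
        unsigned char len;
        unsigned char family;
        char          data[14];
    };

    int main(void)
    {
        /* Positional: only correct while the field order never changes. */
        struct sa_like a = { 2, 29, {0} };      /* 29 == PF_KEY on BSD */

        /* Designated: self-documenting, order-independent, rest zeroed. */
        struct sa_like b = { .len = 2, .family = 29 };

        printf("a: len=%d family=%d\n", a.len, a.family);
        printf("b: len=%d family=%d\n", b.len, b.family);
        return 0;
    }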
index 70f4971013a7783e9f97563b71589d27741dd9c4..747b7fb6dae91a5fcd7bc6e9b0d8c0bb4e2017a0 100644 (file)
@@ -1333,6 +1333,9 @@ gss_krb5_cfx_verify_mic_mbuf(uint32_t *minor,   /* minor_status */
        header.value = mic->value;
 
        *minor = krb5_mic_mbuf(cctx, NULL, mbp, offset, len, &header, digest, &verified, 0, 0);
+       if (*minor) {
+               return GSS_S_FAILURE;
+       }
 
        //XXX errors and such? Sequencing and replay? Not supported by RPCSEC_GSS
        memcpy(&seq, token->SND_SEQ, sizeof(uint64_t));
@@ -2171,7 +2174,7 @@ gss_krb5_3des_unwrap_mbuf(uint32_t *minor,
                        break;
                }
                wrap.Seal_Alg[0] = 0xff;
-               wrap.Seal_Alg[0] = 0xff;
+               wrap.Seal_Alg[1] = 0xff;
        }
        if (*minor) {
                return GSS_S_FAILURE;
@@ -2204,12 +2207,12 @@ gss_krb5_3des_unwrap_mbuf(uint32_t *minor,
        header.value = &wrap;
 
        *minor = krb5_mic_mbuf(cctx, &header, smb, 0, length, NULL, hashval, &verified, 0, 0);
-       if (!verified) {
-               return GSS_S_BAD_SIG;
-       }
        if (*minor) {
                return GSS_S_FAILURE;
        }
+       if (!verified) {
+               return GSS_S_BAD_SIG;
+       }
 
        /* Get the pad bytes */
        *minor = mbuf_copydata(smb, length - 1, 1, &padlen);
index cc09ff6749f32911a6bcef5feb8a80582567877a..dcb9647e8e8d1fa9a24dd54fbdea38b35c6b4075 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -117,7 +117,7 @@ extern int nfs_ticks;
 #endif
 
 /* default values for unresponsive mount timeouts */
-#define NFS_TPRINTF_INITIAL_DELAY       12
+#define NFS_TPRINTF_INITIAL_DELAY       5
 #define NFS_TPRINTF_DELAY               30
 
 /*
@@ -187,6 +187,9 @@ extern int nfs_ticks;
 #define NFS_MATTR_SVCPRINCIPAL          26      /* GSS principal to authenticate to, the server principal */
 #define NFS_MATTR_NFS_VERSION_RANGE     27      /* Packed version range to try */
 #define NFS_MATTR_KERB_ETYPE            28      /* Enctype to use for kerberos mounts */
+#define NFS_MATTR_LOCAL_NFS_PORT        29      /* Unix domain socket for NFS protocol */
+#define NFS_MATTR_LOCAL_MOUNT_PORT      30      /* Unix domain socket for MOUNT protocol */
+#define NFS_MATTR_SET_MOUNT_OWNER       31      /* Set owner of mount point */
 
 /* NFS mount flags */
 #define NFS_MFLAG_SOFT                  0       /* soft mount (requests fail if unresponsive) */
@@ -207,6 +210,8 @@ extern int nfs_ticks;
 #define NFS_MFLAG_NOQUOTA               15      /* don't support QUOTA requests */
 #define NFS_MFLAG_MNTUDP                16      /* MOUNT protocol should use UDP */
 #define NFS_MFLAG_MNTQUICK              17      /* use short timeouts while mounting */
+/*                                      18         reserved */
+#define NFS_MFLAG_NOOPAQUE_AUTH         19      /* don't make the mount AUTH_OPAQUE. Used by V3 */
 
 /* Macros for packing and unpacking packed versions */
 #define PVER2MAJOR(M) ((uint32_t)(((M) >> 16) & 0xffff))
@@ -1139,22 +1144,25 @@ extern int nfs_request_timer_on;
 /* mutex for nfs client globals */
 extern lck_mtx_t *nfs_global_mutex;
 
+#if CONFIG_NFS4
 /* NFSv4 callback globals */
 extern int nfs4_callback_timer_on;
 extern in_port_t nfs4_cb_port, nfs4_cb_port6;
 
+/* nfs 4 default domain for user mapping */
+extern char nfs4_default_domain[MAXPATHLEN];
+/* nfs 4 timer call structure */
+extern thread_call_t    nfs4_callback_timer_call;
+#endif
+
 /* nfs timer call structures */
 extern thread_call_t    nfs_request_timer_call;
 extern thread_call_t    nfs_buf_timer_call;
-extern thread_call_t    nfs4_callback_timer_call;
 extern thread_call_t    nfsrv_idlesock_timer_call;
 #if CONFIG_FSE
 extern thread_call_t    nfsrv_fmod_timer_call;
 #endif
 
-/* nfs 4 default domain for user mapping */
-extern char nfs4_default_domain[MAXPATHLEN];
-
 __BEGIN_DECLS
 
 nfstype vtonfs_type(enum vtype, int);
@@ -1167,6 +1175,7 @@ void    nfs_nhinit(void);
 void    nfs_nhinit_finish(void);
 u_long  nfs_hash(u_char *, int);
 
+#if CONFIG_NFS4
 int     nfs4_init_clientid(struct nfsmount *);
 int     nfs4_setclientid(struct nfsmount *);
 int     nfs4_renew(struct nfsmount *, int);
@@ -1178,8 +1187,10 @@ void    nfs4_cb_rcv(socket_t, void *, int);
 void    nfs4_callback_timer(void *, void *);
 int     nfs4_secinfo_rpc(struct nfsmount *, struct nfsreq_secinfo_args *, kauth_cred_t, uint32_t *, int *);
 int     nfs4_get_fs_locations(struct nfsmount *, nfsnode_t, u_char *, int, const char *, vfs_context_t, struct nfs_fs_locations *);
-void    nfs_fs_locations_cleanup(struct nfs_fs_locations *);
 void    nfs4_default_attrs_for_referral_trigger(nfsnode_t, char *, int, struct nfs_vattr *, fhandle_t *);
+#endif
+
+void    nfs_fs_locations_cleanup(struct nfs_fs_locations *);
 
 int     nfs_sockaddr_cmp(struct sockaddr *, struct sockaddr *);
 int     nfs_connect(struct nfsmount *, int, int);
@@ -1257,6 +1268,7 @@ int     nfs_dir_buf_cache_lookup(nfsnode_t, nfsnode_t *, struct componentname *,
 int     nfs_dir_buf_search(struct nfsbuf *, struct componentname *, fhandle_t *, struct nfs_vattr *, uint64_t *, time_t *, daddr64_t *, int);
 void    nfs_name_cache_purge(nfsnode_t, nfsnode_t, struct componentname *, vfs_context_t);
 
+#if CONFIG_NFS4
 uint32_t nfs4_ace_nfstype_to_vfstype(uint32_t, int *);
 uint32_t nfs4_ace_vfstype_to_nfstype(uint32_t, int *);
 uint32_t nfs4_ace_nfsflags_to_vfsflags(uint32_t);
@@ -1266,8 +1278,11 @@ uint32_t nfs4_ace_vfsrights_to_nfsmask(uint32_t);
 int nfs4_id2guid(char *, guid_t *, int);
 int nfs4_guid2id(guid_t *, char *, size_t *, int);
 
-int     nfs_parsefattr(struct nfsm_chain *, int, struct nfs_vattr *);
 int     nfs4_parsefattr(struct nfsm_chain *, struct nfs_fsattr *, struct nfs_vattr *, fhandle_t *, struct dqblk *, struct nfs_fs_locations *);
+#endif
+
+int     nfs_parsefattr(struct nfsmount *nmp, struct nfsm_chain *, int,
+    struct nfs_vattr *);
 void    nfs_vattr_set_supported(uint32_t *, struct vnode_attr *);
 void    nfs_vattr_set_bitmap(struct nfsmount *, uint32_t *, struct vnode_attr *);
 void    nfs3_pathconf_cache(struct nfsmount *, struct nfs_fsattr *);
@@ -1277,7 +1292,6 @@ int     nfs_node_access_slot(nfsnode_t, uid_t, int);
 void    nfs_vnode_notify(nfsnode_t, uint32_t);
 
 void    nfs_avoid_needless_id_setting_on_create(nfsnode_t, struct vnode_attr *, vfs_context_t);
-int     nfs4_create_rpc(vfs_context_t, nfsnode_t, struct componentname *, struct vnode_attr *, int, char *, nfsnode_t *);
 int     nfs_open_state_set_busy(nfsnode_t, thread_t);
 void    nfs_open_state_clear_busy(nfsnode_t);
 struct nfs_open_owner *nfs_open_owner_find(struct nfsmount *, kauth_cred_t, int);
@@ -1296,24 +1310,9 @@ void    nfs_open_file_add_open(struct nfs_open_file *, uint32_t, uint32_t, int);
 void    nfs_open_file_remove_open_find(struct nfs_open_file *, uint32_t, uint32_t, uint32_t *, uint32_t *, int*);
 void    nfs_open_file_remove_open(struct nfs_open_file *, uint32_t, uint32_t);
 void    nfs_get_stateid(nfsnode_t, thread_t, kauth_cred_t, nfs_stateid *);
-int     nfs4_open(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t);
-int     nfs4_open_delegated(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t);
-int     nfs_close(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t);
 int     nfs_check_for_locks(struct nfs_open_owner *, struct nfs_open_file *);
-int     nfs4_reopen(struct nfs_open_file *, thread_t);
-int     nfs4_open_rpc(struct nfs_open_file *, vfs_context_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int);
-int     nfs4_open_rpc_internal(struct nfs_open_file *, vfs_context_t, thread_t, kauth_cred_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int);
-int     nfs4_open_confirm_rpc(struct nfsmount *, nfsnode_t, u_char *, int, struct nfs_open_owner *, nfs_stateid *, thread_t, kauth_cred_t, struct nfs_vattr *, uint64_t *);
-int     nfs4_open_reopen_rpc(struct nfs_open_file *, thread_t, kauth_cred_t, struct componentname *, vnode_t, vnode_t *, int, int);
-int     nfs4_open_reclaim_rpc(struct nfs_open_file *, int, int);
-int     nfs4_claim_delegated_open_rpc(struct nfs_open_file *, int, int, int);
-int     nfs4_claim_delegated_state_for_open_file(struct nfs_open_file *, int);
-int     nfs4_claim_delegated_state_for_node(nfsnode_t, int);
-int     nfs4_open_downgrade_rpc(nfsnode_t, struct nfs_open_file *, vfs_context_t);
-int     nfs4_close_rpc(nfsnode_t, struct nfs_open_file *, thread_t, kauth_cred_t, int);
-void    nfs4_delegation_return_enqueue(nfsnode_t);
-int     nfs4_delegation_return(nfsnode_t, int, thread_t, kauth_cred_t);
-int     nfs4_delegreturn_rpc(struct nfsmount *, u_char *, int, struct nfs_stateid *, int, thread_t, kauth_cred_t);
+int     nfs_close(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t);
+
 void    nfs_release_open_state_for_node(nfsnode_t, int);
 void    nfs_revoke_open_state_for_node(nfsnode_t);
 struct nfs_lock_owner *nfs_lock_owner_find(nfsnode_t, proc_t, int);
@@ -1326,15 +1325,35 @@ void    nfs_lock_owner_insert_held_lock(struct nfs_lock_owner *, struct nfs_file
 struct nfs_file_lock *nfs_file_lock_alloc(struct nfs_lock_owner *);
 void    nfs_file_lock_destroy(struct nfs_file_lock *);
 int     nfs_file_lock_conflict(struct nfs_file_lock *, struct nfs_file_lock *, int *);
-int     nfs4_lock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t);
 int     nfs_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, thread_t, kauth_cred_t, int);
 int     nfs_advlock_getlock(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t);
 int     nfs_advlock_setlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, short, vfs_context_t);
 int     nfs_advlock_unlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, uint64_t, uint64_t, int, vfs_context_t);
 
+#if CONFIG_NFS4
+int     nfs4_create_rpc(vfs_context_t, nfsnode_t, struct componentname *, struct vnode_attr *, int, char *, nfsnode_t *);
+int     nfs4_open(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t);
+int     nfs4_open_delegated(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t);
+int     nfs4_reopen(struct nfs_open_file *, thread_t);
+int     nfs4_open_rpc(struct nfs_open_file *, vfs_context_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int);
+int     nfs4_open_rpc_internal(struct nfs_open_file *, vfs_context_t, thread_t, kauth_cred_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int);
+int     nfs4_open_confirm_rpc(struct nfsmount *, nfsnode_t, u_char *, int, struct nfs_open_owner *, nfs_stateid *, thread_t, kauth_cred_t, struct nfs_vattr *, uint64_t *);
+int     nfs4_open_reopen_rpc(struct nfs_open_file *, thread_t, kauth_cred_t, struct componentname *, vnode_t, vnode_t *, int, int);
+int     nfs4_open_reclaim_rpc(struct nfs_open_file *, int, int);
+int     nfs4_claim_delegated_open_rpc(struct nfs_open_file *, int, int, int);
+int     nfs4_claim_delegated_state_for_open_file(struct nfs_open_file *, int);
+int     nfs4_claim_delegated_state_for_node(nfsnode_t, int);
+int     nfs4_open_downgrade_rpc(nfsnode_t, struct nfs_open_file *, vfs_context_t);
+int     nfs4_close_rpc(nfsnode_t, struct nfs_open_file *, thread_t, kauth_cred_t, int);
+void    nfs4_delegation_return_enqueue(nfsnode_t);
+int     nfs4_delegation_return(nfsnode_t, int, thread_t, kauth_cred_t);
+int     nfs4_lock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t);
+int     nfs4_delegreturn_rpc(struct nfsmount *, u_char *, int, struct nfs_stateid *, int, thread_t, kauth_cred_t);
+
 nfsnode_t nfs4_named_attr_dir_get(nfsnode_t, int, vfs_context_t);
 int     nfs4_named_attr_get(nfsnode_t, struct componentname *, uint32_t, int, vfs_context_t, nfsnode_t *, struct nfs_open_file **);
 int     nfs4_named_attr_remove(nfsnode_t, nfsnode_t, const char *, vfs_context_t);
+#endif
 
 int     nfs_mount_state_in_use_start(struct nfsmount *, thread_t);
 int     nfs_mount_state_in_use_end(struct nfsmount *, int);
@@ -1355,6 +1374,7 @@ int     nfs_vnop_advlock(struct vnop_advlock_args *);
 int     nfs_vnop_mmap(struct vnop_mmap_args *);
 int     nfs_vnop_mnomap(struct vnop_mnomap_args *);
 
+#if CONFIG_NFS4
 int     nfs4_vnop_create(struct vnop_create_args *);
 int     nfs4_vnop_mknod(struct vnop_mknod_args *);
 int     nfs4_vnop_close(struct vnop_close_args *);
@@ -1373,46 +1393,48 @@ int     nfs4_vnop_makenamedstream(struct vnop_makenamedstream_args *);
 int     nfs4_vnop_removenamedstream(struct vnop_removenamedstream_args *);
 #endif
 
+int     nfs4_access_rpc(nfsnode_t, u_int32_t *, int, vfs_context_t);
+int     nfs4_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *);
+int     nfs4_setattr_rpc(nfsnode_t, struct vnode_attr *, vfs_context_t);
+int     nfs4_read_rpc_async(nfsnode_t, off_t, size_t, thread_t, kauth_cred_t, struct nfsreq_cbinfo *, struct nfsreq **);
+int     nfs4_read_rpc_async_finish(nfsnode_t, struct nfsreq *, uio_t, size_t *, int *);
+int     nfs4_write_rpc_async(nfsnode_t, uio_t, size_t, thread_t, kauth_cred_t, int, struct nfsreq_cbinfo *, struct nfsreq **);
+int     nfs4_write_rpc_async_finish(nfsnode_t, struct nfsreq *, int *, size_t *, uint64_t *);
+int     nfs4_readdir_rpc(nfsnode_t, struct nfsbuf *, vfs_context_t);
+int     nfs4_readlink_rpc(nfsnode_t, char *, uint32_t *, vfs_context_t);
+int     nfs4_commit_rpc(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t);
+int     nfs4_lookup_rpc_async(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **);
+int     nfs4_lookup_rpc_async_finish(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *);
+int     nfs4_remove_rpc(nfsnode_t, char *, int, thread_t, kauth_cred_t);
+int     nfs4_rename_rpc(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t);
+int     nfs4_pathconf_rpc(nfsnode_t, struct nfs_fsattr *, vfs_context_t);
+int     nfs4_setlock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t);
+int     nfs4_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t);
+int     nfs4_getlock_rpc(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t);
+#endif
+
 int     nfs_read_rpc(nfsnode_t, uio_t, vfs_context_t);
 int     nfs_write_rpc(nfsnode_t, uio_t, vfs_context_t, int *, uint64_t *);
 int     nfs_write_rpc2(nfsnode_t, uio_t, thread_t, kauth_cred_t, int *, uint64_t *);
 
 int     nfs3_access_rpc(nfsnode_t, u_int32_t *, int, vfs_context_t);
-int     nfs4_access_rpc(nfsnode_t, u_int32_t *, int, vfs_context_t);
 int     nfs3_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *);
-int     nfs4_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *);
 int     nfs3_setattr_rpc(nfsnode_t, struct vnode_attr *, vfs_context_t);
-int     nfs4_setattr_rpc(nfsnode_t, struct vnode_attr *, vfs_context_t);
 int     nfs3_read_rpc_async(nfsnode_t, off_t, size_t, thread_t, kauth_cred_t, struct nfsreq_cbinfo *, struct nfsreq **);
-int     nfs4_read_rpc_async(nfsnode_t, off_t, size_t, thread_t, kauth_cred_t, struct nfsreq_cbinfo *, struct nfsreq **);
 int     nfs3_read_rpc_async_finish(nfsnode_t, struct nfsreq *, uio_t, size_t *, int *);
-int     nfs4_read_rpc_async_finish(nfsnode_t, struct nfsreq *, uio_t, size_t *, int *);
 int     nfs3_write_rpc_async(nfsnode_t, uio_t, size_t, thread_t, kauth_cred_t, int, struct nfsreq_cbinfo *, struct nfsreq **);
-int     nfs4_write_rpc_async(nfsnode_t, uio_t, size_t, thread_t, kauth_cred_t, int, struct nfsreq_cbinfo *, struct nfsreq **);
 int     nfs3_write_rpc_async_finish(nfsnode_t, struct nfsreq *, int *, size_t *, uint64_t *);
-int     nfs4_write_rpc_async_finish(nfsnode_t, struct nfsreq *, int *, size_t *, uint64_t *);
 int     nfs3_readdir_rpc(nfsnode_t, struct nfsbuf *, vfs_context_t);
-int     nfs4_readdir_rpc(nfsnode_t, struct nfsbuf *, vfs_context_t);
 int     nfs3_readlink_rpc(nfsnode_t, char *, uint32_t *, vfs_context_t);
-int     nfs4_readlink_rpc(nfsnode_t, char *, uint32_t *, vfs_context_t);
 int     nfs3_commit_rpc(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t);
-int     nfs4_commit_rpc(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t);
 int     nfs3_lookup_rpc_async(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **);
-int     nfs4_lookup_rpc_async(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **);
 int     nfs3_lookup_rpc_async_finish(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *);
-int     nfs4_lookup_rpc_async_finish(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *);
 int     nfs3_remove_rpc(nfsnode_t, char *, int, thread_t, kauth_cred_t);
-int     nfs4_remove_rpc(nfsnode_t, char *, int, thread_t, kauth_cred_t);
 int     nfs3_rename_rpc(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t);
-int     nfs4_rename_rpc(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t);
 int     nfs3_pathconf_rpc(nfsnode_t, struct nfs_fsattr *, vfs_context_t);
-int     nfs4_pathconf_rpc(nfsnode_t, struct nfs_fsattr *, vfs_context_t);
 int     nfs3_setlock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t);
-int     nfs4_setlock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t);
 int     nfs3_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t);
-int     nfs4_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t);
 int     nfs3_getlock_rpc(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t);
-int     nfs4_getlock_rpc(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t);
 
 void    nfsrv_active_user_list_reclaim(void);
 void    nfsrv_cleancache(void);
@@ -1503,21 +1525,24 @@ void nfsrv_uc_dequeue(struct nfsrv_sock *);
 
 /* Debug support */
 #define NFS_DEBUG_LEVEL   (nfs_debug_ctl & 0xf)
-#define NFS_DEBUG_FACILITY ((nfs_debug_ctl >> 4) & 0xff)
-#define NFS_DEBUG_FLAGS ((nfs_debug_ctl >> 12) & 0xff)
+#define NFS_DEBUG_FACILITY ((nfs_debug_ctl >> 4) & 0xfff)
+#define NFS_DEBUG_FLAGS ((nfs_debug_ctl >> 16) & 0xf)
 #define NFS_DEBUG_VALUE ((nfs_debug_ctl >> 20) & 0xfff)
-#define NFS_FAC_SOCK    0x01
-#define NFS_FAC_STATE   0x02
-#define NFS_FAC_NODE    0x04
-#define NFS_FAC_VNOP    0x08
-#define NFS_FAC_BIO     0x10
-#define NFS_FAC_GSS     0x20
-#define NFS_FAC_VFS     0x40
-
-#define NFS_DBG(fac, lev, fmt, ...) \
-       if (__builtin_expect(NFS_DEBUG_LEVEL, 0)) nfs_printf(fac, lev, "%s: %d: " fmt, __func__, __LINE__, ## __VA_ARGS__)
-
-void nfs_printf(int, int, const char *, ...) __printflike(3, 4);
+#define NFS_FAC_SOCK    0x001
+#define NFS_FAC_STATE   0x002
+#define NFS_FAC_NODE    0x004
+#define NFS_FAC_VNOP    0x008
+#define NFS_FAC_BIO     0x010
+#define NFS_FAC_GSS     0x020
+#define NFS_FAC_VFS     0x040
+#define NFS_FAC_SRV     0x080
+
+#define NFS_IS_DBG(fac, lev) \
+       (__builtin_expect((NFS_DEBUG_FACILITY & (fac)) && ((lev) <= NFS_DEBUG_LEVEL), 0))
+#define NFS_DBG(fac, lev, fmt, ...)  nfs_printf((fac), (lev), "%s: %d: " fmt, __func__, __LINE__, ## __VA_ARGS__)
+
+void nfs_printf(unsigned int, unsigned int, const char *, ...) __printflike(3, 4);
+void nfs_dump_mbuf(const char *, int, const char *, mbuf_t);
 int  nfs_mountopts(struct nfsmount *, char *, int);
 
 __END_DECLS
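
The rewritten debug macros repartition nfs_debug_ctl: level in bits 0-3, a facility mask widened to bits 4-15 (making room for the new NFS_FAC_SRV), flags in bits 16-19, and the value field in bits 20-31, with NFS_IS_DBG() split out as a cheap guard callers can test before formatting. A user-space sketch of the same packing (__builtin_expect dropped for brevity):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t nfs_debug_ctl;

    #define NFS_DEBUG_LEVEL     (nfs_debug_ctl & 0xf)
    #define NFS_DEBUG_FACILITY  ((nfs_debug_ctl >> 4) & 0xfff)
    #define NFS_FAC_BIO         0x010
    #define NFS_FAC_SRV         0x080

    #define NFS_IS_DBG(fac, lev) \
            ((NFS_DEBUG_FACILITY & (fac)) && ((lev) <= NFS_DEBUG_LEVEL))

    int main(void)
    {
        /* Enable the BIO facility at level 7: facility bits start at bit 4. */
        nfs_debug_ctl = (NFS_FAC_BIO << 4) | 7;

        printf("BIO lev 7: %d\n", NFS_IS_DBG(NFS_FAC_BIO, 7)); /* 1 */
        printf("BIO lev 9: %d\n", NFS_IS_DBG(NFS_FAC_BIO, 9)); /* 0: too verbose */
        printf("SRV lev 1: %d\n", NFS_IS_DBG(NFS_FAC_SRV, 1)); /* 0: facility off */
        return 0;
    }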
index b906597c5967136dbc342ce79ed4854d7e774f59..ffb82cb064d6b987052ee5d2069a825fee8499cb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -69,6 +69,7 @@
 #include <netinet/in.h>
 #include <net/kpi_interface.h>
 
+#if CONFIG_NFS4
 /*
  * NFS_MAX_WHO is the maximum length of a string representation used
  * as an ace who, owner, or group. There is no explicit limit in the
@@ -527,6 +528,7 @@ nfs4_secinfo_rpc(struct nfsmount *nmp, struct nfsreq_secinfo_args *siap, kauth_c
                dnp = nsp->nsr_dnp;
                dvp = NFSTOV(dnp);
                if ((error = vnode_get(dvp))) {
+                       dvp = NULLVP;
                        nfs_node_unlock(np);
                        goto nfsmout;
                }
@@ -602,6 +604,7 @@ nfsmout:
        }
        return error;
 }
+#endif /* CONFIG_NFS4 */
 
 /*
  * Parse an NFSv4 SECINFO array to an array of pseudo flavors.
@@ -611,8 +614,12 @@ int
 nfsm_chain_get_secinfo(struct nfsm_chain *nmc, uint32_t *sec, int *seccountp)
 {
        int error = 0, secmax, seccount, srvcount;
-       uint32_t flavor, val;
+       uint32_t flavor;
+
+#if CONFIG_NFS_GSS
+       uint32_t val;
        u_char oid[12];
+#endif
 
        seccount = srvcount = 0;
        secmax = *seccountp;
@@ -625,11 +632,14 @@ nfsm_chain_get_secinfo(struct nfsm_chain *nmc, uint32_t *sec, int *seccountp)
                switch (flavor) {
                case RPCAUTH_NONE:
                case RPCAUTH_SYS:
+#if CONFIG_NFS_GSS
                case RPCAUTH_KRB5:
                case RPCAUTH_KRB5I:
                case RPCAUTH_KRB5P:
+#endif /* CONFIG_NFS_GSS */
                        sec[seccount++] = flavor;
                        break;
+#if CONFIG_NFS_GSS
                case RPCSEC_GSS:
                        /* we only recognize KRB5, KRB5I, KRB5P */
                        nfsm_chain_get_32(error, nmc, val); /* OID length */
@@ -660,6 +670,7 @@ nfsm_chain_get_secinfo(struct nfsm_chain *nmc, uint32_t *sec, int *seccountp)
                                break;
                        }
                        break;
+#endif /* CONFIG_NFS_GSS */
                }
                srvcount--;
        }
@@ -670,7 +681,7 @@ nfsmout:
        return error;
 }
 
-
+#if CONFIG_NFS4
 /*
  * Fetch the FS_LOCATIONS attribute for the node found at directory/name.
  */
@@ -2634,6 +2645,7 @@ nfsmout:
        }
        return error;
 }
+#endif /* CONFIG_NFS4 */
 
 /*
  * Got the given error and need to start recovery (if not already started).
@@ -2655,6 +2667,7 @@ nfs_need_recover(struct nfsmount *nmp, int error)
        }
 }
 
+#if CONFIG_NFS4
 /*
  * After recovery due to state expiry, check each node and
  * drop any lingering delegation we thought we had.
@@ -2722,6 +2735,7 @@ nfs4_expired_check_delegation(nfsnode_t np, vfs_context_t ctx)
 
        lck_mtx_unlock(&np->n_openlock);
 }
+#endif /* CONFIG_NFS4 */
 
 /*
  * Recover state for an NFS mount.
@@ -2731,14 +2745,16 @@ nfs4_expired_check_delegation(nfsnode_t np, vfs_context_t ctx)
 void
 nfs_recover(struct nfsmount *nmp)
 {
-       struct timespec ts = { 1, 0 };
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
        int error, lost, reopen;
        struct nfs_open_owner *noop;
        struct nfs_open_file *nofp;
        struct nfs_file_lock *nflp, *nextnflp;
        struct nfs_lock_owner *nlop;
        thread_t thd = current_thread();
+#if CONFIG_NFS4
        nfsnode_t np, nextnp;
+#endif
        struct timeval now;
 
 restart:
@@ -2804,6 +2820,7 @@ restart:
                        if (nmp->nm_vers < NFS_VER4) {
                                goto reclaim_locks;
                        }
+#if CONFIG_NFS4
                        if (nofp->nof_rw_drw) {
                                error = nfs4_open_reclaim_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_BOTH);
                        }
@@ -2912,7 +2929,7 @@ restart:
                                nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN;
                                lck_mtx_unlock(&nofp->nof_lock);
                        }
-
+#endif /* CONFIG_NFS4 */
                        /*
                         * Scan this node's lock owner list for entries with this open owner,
                         * then walk the lock owner's held lock list recovering each lock.
@@ -2959,7 +2976,7 @@ reclaim_locks:
                                        break;
                                }
                        }
-
+#if CONFIG_NFS4
                        /*
                         * If we've determined that we need to reopen the file then we probably
                         * didn't receive any delegation we think we hold.  We should attempt to
@@ -2979,7 +2996,7 @@ reclaim_locks:
                                        goto restart;
                                }
                        }
-
+#endif
                        if (lost) {
                                /* revoke open file state */
                                NP(nofp->nof_np, "nfs_recover: state lost for %d %p 0x%x",
@@ -2992,6 +3009,7 @@ reclaim_locks:
        if (!error) {
                /* If state expired, make sure we're not holding onto any stale delegations */
                lck_mtx_lock(&nmp->nm_lock);
+#if CONFIG_NFS4
                if ((nmp->nm_vers >= NFS_VER4) && (nmp->nm_state & NFSSTA_RECOVER_EXPIRED)) {
 recheckdeleg:
                        TAILQ_FOREACH_SAFE(np, &nmp->nm_delegations, n_dlink, nextnp) {
@@ -3003,6 +3021,7 @@ recheckdeleg:
                                }
                        }
                }
+#endif
                nmp->nm_state &= ~(NFSSTA_RECOVER | NFSSTA_RECOVER_EXPIRED);
                wakeup(&nmp->nm_state);
                printf("nfs recovery completed for %s, 0x%x\n",
index 223ae28f7dc8db766d7fca14eaa25086a310f882..261da73e24f107fcd097a6154d0cde00362b2d31 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -79,6 +79,7 @@
 #include <kern/task.h>
 #include <kern/sched_prim.h>
 
+#if CONFIG_NFS4
 int
 nfs4_access_rpc(nfsnode_t np, u_int32_t *access, int rpcflags, vfs_context_t ctx)
 {
@@ -1752,6 +1753,7 @@ nfsmout:
        }
        return error;
 }
+#endif /* CONFIG_NFS4 */
 
 /*
  * Wait for any pending recovery to complete.
@@ -1759,7 +1761,7 @@ nfsmout:
 int
 nfs_mount_state_wait_for_recovery(struct nfsmount *nmp)
 {
-       struct timespec ts = { 1, 0 };
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
        int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
 
        lck_mtx_lock(&nmp->nm_lock);
@@ -1785,7 +1787,7 @@ nfs_mount_state_wait_for_recovery(struct nfsmount *nmp)
 int
 nfs_mount_state_in_use_start(struct nfsmount *nmp, thread_t thd)
 {
-       struct timespec ts = { 1, 0 };
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
        int error = 0, slpflag = (NMFLAG(nmp, INTR) && thd) ? PCATCH : 0;
 
        if (nfs_mount_gone(nmp)) {
@@ -1903,7 +1905,7 @@ int
 nfs_open_state_set_busy(nfsnode_t np, thread_t thd)
 {
        struct nfsmount *nmp;
-       struct timespec ts = {2, 0};
+       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
        int error = 0, slpflag;
 
        nmp = NFSTONMP(np);
@@ -2061,7 +2063,7 @@ int
 nfs_open_owner_set_busy(struct nfs_open_owner *noop, thread_t thd)
 {
        struct nfsmount *nmp;
-       struct timespec ts = {2, 0};
+       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
        int error = 0, slpflag;
 
        nmp = noop->noo_mount;
@@ -2256,7 +2258,7 @@ int
 nfs_open_file_set_busy(struct nfs_open_file *nofp, thread_t thd)
 {
        struct nfsmount *nmp;
-       struct timespec ts = {2, 0};
+       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
        int error = 0, slpflag;
 
        nmp = nofp->nof_owner->noo_mount;
@@ -2617,7 +2619,7 @@ nfs_open_file_remove_open(struct nfs_open_file *nofp, uint32_t accessMode, uint3
        lck_mtx_unlock(&nofp->nof_lock);
 }
 
-
+#if CONFIG_NFS4
 /*
  * Get the current (delegation, lock, open, default) stateid for this node.
  * If node has a delegation, use that stateid.
@@ -2882,6 +2884,7 @@ out:
        }
        return error;
 }
+#endif /* CONFIG_NFS4 */
 
 int
 nfs_vnop_mmap(
@@ -2946,6 +2949,7 @@ restart:
                NP(np, "nfs_vnop_mmap: no open file for owner, error %d, %d", error, kauth_cred_getuid(noop->noo_cred));
                error = EPERM;
        }
+#if CONFIG_NFS4
        if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) {
                nfs_mount_state_in_use_end(nmp, 0);
                error = nfs4_reopen(nofp, NULL);
@@ -2954,6 +2958,7 @@ restart:
                        goto restart;
                }
        }
+#endif
        if (!error) {
                error = nfs_open_file_set_busy(nofp, NULL);
        }
@@ -2996,9 +3001,12 @@ restart:
                        /* NFS v2/v3 opens are always allowed - so just add it. */
                        nfs_open_file_add_open(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, 0);
                        error = 0;
-               } else {
+               }
+#if CONFIG_NFS4
+               else {
                        error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx);
                }
+#endif
                if (!error) {
                        nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE;
                }
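
This hunk shows the brace-splitting idiom used throughout the commit: when only the else branch is NFSv4-specific, the else keyword itself moves inside the #if, so that building with CONFIG_NFS4 disabled leaves a plain if rather than a dangling else. A toy standalone version that compiles either way:

    #include <stdio.h>

    #define CONFIG_V4 0     /* flip to 1 to compile the v4 branch back in */

    int main(void)
    {
        int vers = 3;

        if (vers < 4) {
            printf("v2/v3 path\n");
        }
    #if CONFIG_V4
        else {
            printf("v4 path\n");    /* branch vanishes when CONFIG_V4 == 0 */
        }
    #endif
        return 0;
    }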
@@ -3201,6 +3209,7 @@ loop:
                        continue;
                }
                lck_mtx_unlock(&np->n_openlock);
+#if CONFIG_NFS4
                if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) {
                        nfs_mount_state_in_use_end(nmp, 0);
                        error = nfs4_reopen(nofp, NULL);
@@ -3208,6 +3217,7 @@ loop:
                                goto loop;
                        }
                }
+#endif
                if (!error) {
                        error = nfs_open_file_set_busy(nofp, NULL);
                }
@@ -3364,7 +3374,7 @@ int
 nfs_lock_owner_set_busy(struct nfs_lock_owner *nlop, thread_t thd)
 {
        struct nfsmount *nmp;
-       struct timespec ts = {2, 0};
+       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
        int error = 0, slpflag;
 
        nmp = nlop->nlo_open_owner->noo_mount;
@@ -3518,6 +3528,7 @@ nfs_file_lock_conflict(struct nfs_file_lock *nflp1, struct nfs_file_lock *nflp2,
        return 1;
 }
 
+#if CONFIG_NFS4
 /*
  * Send an NFSv4 LOCK RPC to the server.
  */
@@ -3816,7 +3827,7 @@ nfsmout:
        nfsm_chain_cleanup(&nmrep);
        return error;
 }
-
+#endif /* CONFIG_NFS4 */
 
 /*
  * Check for any conflicts with the given lock.
@@ -3917,7 +3928,7 @@ nfs_advlock_setlock(
        struct nfs_file_lock *newnflp, *nflp, *nflp2 = NULL, *nextnflp, *flocknflp = NULL;
        struct nfs_file_lock *coalnflp;
        int error = 0, error2, willsplit = 0, delay, slpflag, busy = 0, inuse = 0, restart, inqueue = 0;
-       struct timespec ts = {1, 0};
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 
        nmp = NFSTONMP(np);
        if (nfs_mount_gone(nmp)) {
@@ -3973,6 +3984,7 @@ restart:
                inuse = 0;
                goto error_out;
        }
+#if CONFIG_NFS4
        if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) {
                nfs_mount_state_in_use_end(nmp, 0);
                inuse = 0;
@@ -3982,6 +3994,7 @@ restart:
                }
                goto restart;
        }
+#endif
 
        lck_mtx_lock(&np->n_openlock);
        if (!inqueue) {
@@ -4085,6 +4098,7 @@ restart:
        busy = 1;
        delay = 0;
        do {
+#if CONFIG_NFS4
                /* do we have a delegation? (that we're not returning?) */
                if ((np->n_openflags & N_DELEG_MASK) && !(np->n_openflags & N_DELEG_RETURN)) {
                        if (np->n_openflags & N_DELEG_WRITE) {
@@ -4117,6 +4131,7 @@ restart:
                                }
                        }
                }
+#endif
                if (np->n_flag & NREVOKE) {
                        error = EIO;
                }
@@ -4358,7 +4373,11 @@ error_out:
 int
 nfs_advlock_unlock(
        nfsnode_t np,
-       struct nfs_open_file *nofp,
+       struct nfs_open_file *nofp
+#if !CONFIG_NFS4
+       __unused
+#endif
+       ,
        struct nfs_lock_owner *nlop,
        uint64_t start,
        uint64_t end,
@@ -4378,6 +4397,7 @@ restart:
        if ((error = nfs_mount_state_in_use_start(nmp, NULL))) {
                return error;
        }
+#if CONFIG_NFS4
        if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) {
                nfs_mount_state_in_use_end(nmp, 0);
                error = nfs4_reopen(nofp, NULL);
@@ -4386,6 +4406,7 @@ restart:
                }
                goto restart;
        }
+#endif
        if ((error = nfs_open_state_set_busy(np, NULL))) {
                nfs_mount_state_in_use_end(nmp, error);
                return error;
@@ -4752,7 +4773,9 @@ nfs_vnop_advlock(
                        goto out;
                }
                /* find the open file */
+#if CONFIG_NFS4
 restart:
+#endif
                error = nfs_open_file_find(np, noop, &nofp, 0, 0, 0);
                if (error) {
                        error = EBADF;
@@ -4761,6 +4784,7 @@ restart:
                        NP(np, "nfs_vnop_advlock: LOST %d", kauth_cred_getuid(nofp->nof_owner->noo_cred));
                        error = EIO;
                }
+#if CONFIG_NFS4
                if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) {
                        error = nfs4_reopen(nofp, ((op == F_UNLCK) ? NULL : vfs_context_thread(ctx)));
                        nofp = NULL;
@@ -4768,6 +4792,7 @@ restart:
                                goto restart;
                        }
                }
+#endif
                if (error) {
                        NP(np, "nfs_vnop_advlock: no open file %d, %d", error, kauth_cred_getuid(noop->noo_cred));
                        goto out;
@@ -4814,6 +4839,7 @@ nfs_check_for_locks(struct nfs_open_owner *noop, struct nfs_open_file *nofp)
        return nlop ? 1 : 0;
 }
 
+#if CONFIG_NFS4
 /*
  * Reopen simple (no deny, no locks) open state that was lost.
  */
@@ -4832,7 +4858,7 @@ nfs4_reopen(struct nfs_open_file *nofp, thread_t thd)
        char smallname[128];
        char *filename = NULL;
        int error = 0, done = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
-       struct timespec ts = { 1, 0 };
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 
        lck_mtx_lock(&nofp->nof_lock);
        while (nofp->nof_flags & NFS_OPEN_FILE_REOPENING) {
@@ -4858,6 +4884,7 @@ nfs4_reopen(struct nfs_open_file *nofp, thread_t thd)
                struct nfs_sillyrename *nsp = np->n_sillyrename;
                dvp = NFSTOV(nsp->nsr_dnp);
                if ((error = vnode_get(dvp))) {
+                       dvp = NULLVP;
                        nfs_node_unlock(np);
                        goto out;
                }
@@ -5473,6 +5500,7 @@ nfs4_claim_delegated_open_rpc(
                struct nfs_sillyrename *nsp = np->n_sillyrename;
                dvp = NFSTOV(nsp->nsr_dnp);
                if ((error = vnode_get(dvp))) {
+                       dvp = NULLVP;
                        nfs_node_unlock(np);
                        goto out;
                }
@@ -6266,6 +6294,7 @@ nfs4_claim_delegated_state_for_open_file(struct nfs_open_file *nofp, int flags)
 
        return error;
 }
+#endif /* CONFIG_NFS4 */
 
 /*
  * Release all open state for the given node.
@@ -6318,9 +6347,11 @@ nfs_release_open_state_for_node(nfsnode_t np, int force)
                nofp->nof_flags |= NFS_OPEN_FILE_LOST;
 
                lck_mtx_unlock(&nofp->nof_lock);
+#if CONFIG_NFS4
                if (!force && nmp && (nmp->nm_vers >= NFS_VER4)) {
                        nfs4_close_rpc(np, nofp, NULL, nofp->nof_owner->noo_cred, R_RECOVER);
                }
+#endif
        }
 
        lck_mtx_unlock(&np->n_openlock);
@@ -6358,6 +6389,7 @@ nfs_revoke_open_state_for_node(nfsnode_t np)
        }
 }
 
+#if CONFIG_NFS4
 /*
  * Claim the delegated open combinations that each of this node's open files hold.
  */
@@ -6537,7 +6569,7 @@ nfsmout:
        nfsm_chain_cleanup(&nmrep);
        return error;
 }
-
+#endif /* CONFIG_NFS4 */
 
 /*
  * NFS read call.
@@ -6587,6 +6619,7 @@ restart:
                NP(np, "nfs_vnop_read: LOST %d", kauth_cred_getuid(noop->noo_cred));
                error = EIO;
        }
+#if CONFIG_NFS4
        if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) {
                error = nfs4_reopen(nofp, vfs_context_thread(ctx));
                nofp = NULL;
@@ -6594,6 +6627,7 @@ restart:
                        goto restart;
                }
        }
+#endif
        if (error) {
                nfs_open_owner_rele(noop);
                return error;
@@ -6652,9 +6686,12 @@ restart:
                if (nmp->nm_vers < NFS_VER4) {
                        /* NFS v2/v3 opens are always allowed - so just add it. */
                        nfs_open_file_add_open(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, 0);
-               } else {
+               }
+#if CONFIG_NFS4
+               else {
                        error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx);
                }
+#endif
                if (!error) {
                        nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE;
                }
@@ -6674,6 +6711,7 @@ do_read:
        return nfs_bioread(VTONFS(ap->a_vp), ap->a_uio, ap->a_ioflag, ap->a_context);
 }
 
+#if CONFIG_NFS4
 /*
  * Note: the NFSv4 CREATE RPC is for everything EXCEPT regular files.
  * Files are created using the NFSv4 OPEN RPC.  So we must open the
@@ -8913,3 +8951,4 @@ nfs4_vnop_removenamedstream(
 }
 
 #endif
+#endif /* CONFIG_NFS4 */
index cb1f92939b45cba33d036d49f2611754618bce90..2e2dec099c1fcfddec673452083a96e9e4572682 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -96,6 +96,7 @@
 #include <nfs/nfsnode.h>
 #include <sys/buf_internal.h>
 #include <libkern/OSAtomic.h>
+#include <os/refcnt.h>
 
 #define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)
 
@@ -212,7 +213,7 @@ nfs_buf_freeup(int timer)
                if (!fbp) {
                        break;
                }
-               if (fbp->nb_refs) {
+               if (os_ref_get_count(&fbp->nb_refs) > 1) {
                        break;
                }
                if (NBUFSTAMPVALID(fbp) &&
@@ -239,7 +240,7 @@ nfs_buf_freeup(int timer)
                if (!fbp) {
                        break;
                }
-               if (fbp->nb_refs) {
+               if (os_ref_get_count(&fbp->nb_refs) > 1) {
                        break;
                }
                if (NBUFSTAMPVALID(fbp) &&
@@ -609,7 +610,7 @@ nfs_buf_delwri_service(void)
 void
 nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
 {
-       struct timespec ts = { 30, 0 };
+       struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 };
        int error = 0;
 
        lck_mtx_lock(nfs_buf_mutex);
@@ -907,6 +908,8 @@ loop:
                        NFSBUFCNTCHK();
                        /* init nfsbuf */
                        bzero(bp, sizeof(*bp));
+                       os_ref_init(&bp->nb_refs, NULL);
+
                        bp->nb_free.tqe_next = NFSNOLIST;
                        bp->nb_validoff = bp->nb_validend = -1;
                        FSDBG(545, np, blkno, bp, 0);
@@ -1387,7 +1390,7 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
 void
 nfs_buf_refget(struct nfsbuf *bp)
 {
-       bp->nb_refs++;
+       os_ref_retain_locked(&bp->nb_refs);
 }
 /*
  * release a reference on a buffer
@@ -1396,7 +1399,7 @@ nfs_buf_refget(struct nfsbuf *bp)
 void
 nfs_buf_refrele(struct nfsbuf *bp)
 {
-       bp->nb_refs--;
+       (void) os_ref_release_locked(&bp->nb_refs);
 }
 
 /*
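
nb_refs graduates from a bare counter to os_refcnt_t. os_ref_init() starts the count at 1 (the object's own reference), which is why the idle checks in nfs_buf_freeup() become os_ref_get_count(&fbp->nb_refs) > 1 rather than nb_refs != 0, and why release asserts on underflow instead of silently going negative. A rough user-space analogue using C11 atomics (os_refcnt itself is an XNU-private API):

    #include <assert.h>
    #include <stdatomic.h>
    #include <stdio.h>

    typedef struct { atomic_uint count; } ref_t;

    static void ref_init(ref_t *r)    { atomic_init(&r->count, 1); }
    static void ref_retain(ref_t *r)  { atomic_fetch_add(&r->count, 1); }
    static unsigned ref_get(ref_t *r) { return atomic_load(&r->count); }

    static unsigned ref_release(ref_t *r)
    {
        unsigned old = atomic_fetch_sub(&r->count, 1);
        assert(old > 0);              /* underflow is a bug, not a no-op */
        return old - 1;
    }

    int main(void)
    {
        ref_t refs;

        ref_init(&refs);              /* count == 1: the owner's reference */
        printf("idle: %d\n", ref_get(&refs) > 1 ? 0 : 1);   /* 1 */

        ref_retain(&refs);            /* a second holder appears */
        printf("idle: %d\n", ref_get(&refs) > 1 ? 0 : 1);   /* 0 */

        ref_release(&refs);
        printf("idle: %d\n", ref_get(&refs) > 1 ? 0 : 1);   /* 1 */
        return 0;
    }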
@@ -1609,7 +1612,7 @@ nfs_buf_read_finish(struct nfsbuf *bp)
                    ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) {
                        bp->nb_validend = 0x100000000LL - NBOFF(bp);
                }
-               bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
+               bp->nb_valid = (uint32_t)(1LLU << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
                if (bp->nb_validend & PAGE_MASK) {
                        /* zero-fill remainder of last page */
                        bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
@@ -1680,9 +1683,11 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
                len = (length > nmrsize) ? nmrsize : length;
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = len;
+#if CONFIG_NFS4
                if (nmp->nm_vers >= NFS_VER4) {
                        cb.rcb_args[2] = nmp->nm_stategenid;
                }
+#endif
                req = NULL;
                error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
                if (error) {
@@ -1794,6 +1799,7 @@ finish:
                }
                return;
        }
+#if CONFIG_NFS4
        if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
                lck_mtx_lock(&nmp->nm_lock);
                if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
@@ -1840,6 +1846,7 @@ finish:
                        }
                }
        }
+#endif
        if (error) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error;
@@ -1867,14 +1874,18 @@ finish:
                 * requested, so we need to issue another read for the rest.
                 * (Don't bother if the buffer already hit an error.)
                 */
+#if CONFIG_NFS4
 readagain:
+#endif
                offset += rlen;
                length -= rlen;
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = length;
+#if CONFIG_NFS4
                if (nmp->nm_vers >= NFS_VER4) {
                        cb.rcb_args[2] = nmp->nm_stategenid;
                }
+#endif
                error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
                if (!error) {
                        if (IS_VALID_CRED(cred)) {
@@ -2348,6 +2359,7 @@ buffer_ready:
                        error = uiomove(bp->nb_data + on, n, uio);
                }
 
+
                nfs_buf_release(bp, 1);
                nfs_data_unlock(np);
                nfs_node_lock_force(np);
@@ -2365,7 +2377,7 @@ int
 nfs_async_write_start(struct nfsmount *nmp)
 {
        int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
-       struct timespec ts = {1, 0};
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 
        if (nfs_max_async_writes <= 0) {
                return 0;
@@ -2910,9 +2922,11 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred
                len = (length > nmwsize) ? nmwsize : length;
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = len;
+#if CONFIG_NFS4
                if (nmp->nm_vers >= NFS_VER4) {
                        cb.rcb_args[2] = nmp->nm_stategenid;
                }
+#endif
                if (async && ((error = nfs_async_write_start(nmp)))) {
                        break;
                }
@@ -3029,6 +3043,7 @@ finish:
                }
                return;
        }
+#if CONFIG_NFS4
        if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
                lck_mtx_lock(&nmp->nm_lock);
                if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
@@ -3075,6 +3090,7 @@ finish:
                        }
                }
        }
+#endif
        if (error) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error;
@@ -3111,7 +3127,9 @@ finish:
         * (Don't bother if the buffer hit an error or stale wverf.)
         */
        if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) {
+#if CONFIG_NFS4
 writeagain:
+#endif
                offset += rlen;
                length -= rlen;
 
@@ -3121,10 +3139,11 @@ writeagain:
 
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = length;
+#if CONFIG_NFS4
                if (nmp->nm_vers >= NFS_VER4) {
                        cb.rcb_args[2] = nmp->nm_stategenid;
                }
-
+#endif
                // XXX iomode should really match the original request
                error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
                    NFS_WRITE_FILESYNC, &cb, &wreq);
@@ -3845,7 +3864,7 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
        struct nfsmount *nmp = VTONMP(vp);
        int error, slpflag, slptimeo, nflags, retry = 0;
        int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
-       struct timespec ts = { 2, 0 };
+       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
        off_t size;
 
        FSDBG_TOP(554, np, flags, intrflg, 0);
@@ -4085,7 +4104,9 @@ nfs_asyncio_resend(struct nfsreq *req)
                return;
        }
 
+#if CONFIG_NFS_GSS
        nfs_gss_clnt_rpcdone(req);
+#endif
        lck_mtx_lock(&nmp->nm_lock);
        if (!(req->r_flags & R_RESENDQ)) {
                TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
@@ -4119,10 +4140,12 @@ nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
 
        if (nmp->nm_vers < NFS_VER4) {
                error = nfs3_readdir_rpc(np, bp, ctx);
-       } else {
+       }
+#if CONFIG_NFS4
+       else {
                error = nfs4_readdir_rpc(np, bp, ctx);
        }
-
+#endif
        if (error && (error != NFSERR_DIRBUFDROPPED)) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error;
index 67d3d5ef4420158bcea5f6a24e62da75bcf7f55f..9f5ec1030e6a0c51c1fc85d819ef060c6a659f92 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -245,7 +245,6 @@ nfs_boot_init(struct nfs_diskless *nd)
                bp_sin.sin_len = sizeof(bp_sin);
                bp_sin.sin_family = AF_INET;
                bp_sin.sin_addr.s_addr = INADDR_BROADCAST;
-               hostnamelen = MAXHOSTNAMELEN;
                router.s_addr = 0;
                error = bp_whoami(&bp_sin, &my_ip, &router);
                if (error) {
@@ -254,7 +253,9 @@ nfs_boot_init(struct nfs_diskless *nd)
                }
                printf("nfs_boot: BOOTPARAMS server " IP_FORMAT "\n",
                    IP_LIST(&bp_sin.sin_addr));
+               lck_mtx_lock(&hostname_lock);
                printf("nfs_boot: hostname %s\n", hostname);
+               lck_mtx_unlock(&hostname_lock);
        }
        if (do_bpgetfile) {
                error = bp_getfile(&bp_sin, "root", &nd->nd_root.ndm_saddr,
@@ -537,9 +538,10 @@ bp_whoami(struct sockaddr_in *bpsin,
        if (cn_len >= MAXHOSTNAMELEN) {
                goto bad;
        }
+       lck_mtx_lock(&hostname_lock);
        bcopy(str->data, hostname, cn_len);
        hostname[cn_len] = '\0';
-       hostnamelen = cn_len;
+       lck_mtx_unlock(&hostname_lock);
        p += RPC_STR_SIZE(cn_len);
        msg_len -= RPC_STR_SIZE(cn_len);
 
@@ -555,9 +557,10 @@ bp_whoami(struct sockaddr_in *bpsin,
        if (dn_len >= MAXHOSTNAMELEN) {
                goto bad;
        }
+       lck_mtx_lock(&domainname_lock);
        bcopy(str->data, domainname, dn_len);
        domainname[dn_len] = '\0';
-       domainnamelen = dn_len;
+       lck_mtx_unlock(&domainname_lock);
        p += RPC_STR_SIZE(dn_len);
        msg_len -= RPC_STR_SIZE(dn_len);
 
@@ -611,7 +614,9 @@ bp_getfile(struct sockaddr_in *bpsin,
        /*
         * Get message buffer of sufficient size.
         */
-       cn_len = hostnamelen;
+       lck_mtx_lock(&hostname_lock);
+       cn_len = strlen(hostname);
+       lck_mtx_unlock(&hostname_lock);
        key_len = strlen(key);
        msg_len = 0;
        msg_len += RPC_STR_SIZE(cn_len);
@@ -629,7 +634,9 @@ bp_getfile(struct sockaddr_in *bpsin,
        /* client name (hostname) */
        str = (struct rpc_string *)p;
        str->len = htonl(cn_len);
+       lck_mtx_lock(&hostname_lock);
        bcopy(hostname, str->data, cn_len);
+       lck_mtx_unlock(&hostname_lock);
        p += RPC_STR_SIZE(cn_len);
        /* key name (root or swap) */
        str = (struct rpc_string *)p;
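
The bootparams changes above bracket every access to the global hostname/domainname buffers with their new locks and drop the cached hostnamelen/domainnamelen globals in favor of strlen() computed under the lock, so the length can no longer go stale against the bytes it describes. A minimal pthread analogue of the pattern:

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    static pthread_mutex_t hostname_lock = PTHREAD_MUTEX_INITIALIZER;
    static char hostname[256] = "old-name";

    /* Writer: mutate the shared buffer only while holding the lock. */
    static void set_hostname(const char *name)
    {
        pthread_mutex_lock(&hostname_lock);
        snprintf(hostname, sizeof(hostname), "%s", name);
        pthread_mutex_unlock(&hostname_lock);
    }

    /* Reader: length and bytes are taken under one lock hold, instead
     * of trusting a separately cached hostnamelen. */
    static size_t copy_hostname(char *out, size_t outlen)
    {
        size_t len;

        pthread_mutex_lock(&hostname_lock);
        len = strlen(hostname);
        snprintf(out, outlen, "%s", hostname);
        pthread_mutex_unlock(&hostname_lock);
        return len;
    }

    int main(void)
    {
        char buf[256];

        set_hostname("nfs-client-01");
        printf("%zu %s\n", copy_hostname(buf, sizeof(buf)), buf);
        return 0;
    }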
index c1d300d0f6c2ecd7d4ecf59df99eb0290fbfe2b0..95d21f6c6defdbc7edf7402dda8f93f50b25c6f9 100644 (file)
@@ -157,6 +157,9 @@ static void     nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *);
 static void     nfs_gss_svc_ctx_timer(void *, void *);
 static int      nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *);
 static int      nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *, uint32_t);
+
+/* This is only used by server code */
+static void     nfs_gss_nfsm_chain(struct nfsm_chain *, mbuf_t);
 #endif /* NFSSERVER */
 
 static void     host_release_special_port(mach_port_t);
@@ -166,7 +169,6 @@ static int      nfs_gss_mach_vmcopyout(vm_map_copy_t, uint32_t, u_char *);
 
 static int      nfs_gss_mchain_length(mbuf_t);
 static int      nfs_gss_append_chain(struct nfsm_chain *, mbuf_t);
-static void     nfs_gss_nfsm_chain(struct nfsm_chain *, mbuf_t);
 
 #if NFSSERVER
 thread_call_t nfs_gss_svc_ctx_timer_call;
@@ -3896,6 +3898,12 @@ nfs_gss_mach_alloc_buffer(u_char *buf, uint32_t buflen, vm_map_copy_t *addr)
 
        tbuflen = vm_map_round_page(buflen,
            vm_map_page_mask(ipc_kernel_map));
+
+       if (tbuflen < buflen) {
+               printf("nfs_gss_mach_alloc_buffer: vm_map_round_page failed\n");
+               return;
+       }
+
        kr = vm_allocate_kernel(ipc_kernel_map, &kmem_buf, tbuflen, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_FILE);
        if (kr != 0) {
                printf("nfs_gss_mach_alloc_buffer: vm_allocate failed\n");
@@ -4005,6 +4013,7 @@ nfs_gss_append_chain(struct nfsm_chain *nmc, mbuf_t mc)
        return 0;
 }
 
+#if NFSSERVER /* Only used by NFSSERVER */
 /*
  * Convert an mbuf chain to an NFS mbuf chain
  */
@@ -4025,7 +4034,7 @@ nfs_gss_nfsm_chain(struct nfsm_chain *nmc, mbuf_t mc)
        nmc->nmc_left = mbuf_trailingspace(tail);
        nmc->nmc_flags = 0;
 }
-
+#endif /* NFSSERVER */
 
 
 #if 0
index 6ab20e01a1b51270ff414e512f2055c42957a7fd..5f9b1fc2d68d90a919a8e35e1279a96bf7440e3d 100644 (file)
@@ -71,6 +71,8 @@ struct user_nfs_gss_principal {
 
 #define NFS_IOC_GET_CRED                _IOWR('n', 3, struct nfs_gss_principal)
 
+#define NFS_IOC_DISARM_TRIGGER          _IO('n', 4)
+
 #ifdef KERNEL
 
 #define NFS_IOC_SET_CRED64              _IOW('n', 2, struct user_nfs_gss_principal)
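
NFS_IOC_DISARM_TRIGGER is a new payload-less command (_IO('n', 4)); judging by the trigger-vnode changes elsewhere in this commit, it presumably lets user space disarm the auto-rearm trigger on a mirror-mount node. A hedged user-space usage sketch (the path is purely illustrative):

    #include <sys/ioctl.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    #define NFS_IOC_DISARM_TRIGGER _IO('n', 4)

    int main(void)
    {
        /* Illustrative path: any descriptor open on the NFS trigger node. */
        int fd = open("/net/example/export", O_RDONLY);
        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (ioctl(fd, NFS_IOC_DISARM_TRIGGER) == -1) {
            perror("ioctl(NFS_IOC_DISARM_TRIGGER)");
        }
        close(fd);
        return 0;
    }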
index c48c14954b55a16d0325c75690069103e9bc5cde..8f7da7ea03bff19f3b246f78a8c4c2abc55d01b7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -354,6 +354,9 @@ loop:
 
                                cmp = nfs_case_insensitive(mp) ? strncasecmp : strncmp;
 
+                               if (vp->v_name && (size_t)cnp->cn_namelen != strnlen(vp->v_name, MAXPATHLEN)) {
+                                       update_flags |= VNODE_UPDATE_NAME;
+                               }
                                if (vp->v_name && cnp->cn_namelen && (*cmp)(cnp->cn_nameptr, vp->v_name, cnp->cn_namelen)) {
                                        update_flags |= VNODE_UPDATE_NAME;
                                }
@@ -504,6 +507,7 @@ loop:
        vfsp.vnfs_str = "nfs";
        vfsp.vnfs_dvp = dnp ? NFSTOV(dnp) : NULL;
        vfsp.vnfs_fsnode = np;
+#if CONFIG_NFS4
        if (nfsvers == NFS_VER4) {
 #if FIFO
                if (nvap->nva_type == VFIFO) {
@@ -515,7 +519,9 @@ loop:
                } else {
                        vfsp.vnfs_vops = nfsv4_vnodeop_p;
                }
-       } else {
+       } else
+#endif /* CONFIG_NFS4 */
+       {
 #if FIFO
                if (nvap->nva_type == VFIFO) {
                        vfsp.vnfs_vops = fifo_nfsv2nodeop_p;
@@ -538,20 +544,24 @@ loop:
        }
 
 #if CONFIG_TRIGGERS
-       if ((nfsvers >= NFS_VER4) && (nvap->nva_type == VDIR) && (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) {
+       if ((nfsvers >= NFS_VER4)
+           && (nvap->nva_type == VDIR)
+           && (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)
+           && !(flags & NG_MARKROOT)) {
                struct vnode_trigger_param vtp;
                bzero(&vtp, sizeof(vtp));
                bcopy(&vfsp, &vtp.vnt_params, sizeof(vfsp));
                vtp.vnt_resolve_func = nfs_mirror_mount_trigger_resolve;
                vtp.vnt_unresolve_func = nfs_mirror_mount_trigger_unresolve;
                vtp.vnt_rearm_func = nfs_mirror_mount_trigger_rearm;
-               vtp.vnt_flags = VNT_AUTO_REARM;
+               vtp.vnt_flags = VNT_AUTO_REARM | VNT_KERN_RESOLVE;
                error = vnode_create(VNCREATE_TRIGGER, VNCREATE_TRIGGER_SIZE, &vtp, &np->n_vnode);
        } else
 #endif
        {
                error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &np->n_vnode);
        }
+notsup:
        if (error) {
                FSDBG(266, 0, np, np->n_flag, 0xb1eb1e);
                nfs_node_unlock(np);
@@ -677,6 +687,7 @@ restart:
                 * node has gone inactive without being open, we need to
                 * clean up (close) the open done in the create.
                 */
+#if CONFIG_NFS4
                if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && nofp->nof_creator && !force) {
                        if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) {
                                lck_mtx_unlock(&np->n_openlock);
@@ -705,6 +716,7 @@ restart:
                        }
                        goto restart;
                }
+#endif
                if (nofp->nof_flags & NFS_OPEN_FILE_NEEDCLOSE) {
                        /*
                         * If the file is marked as needing reopen, but this was the only
@@ -725,9 +737,11 @@ restart:
                                        if (inuse) {
                                                nfs_mount_state_in_use_end(nmp, 0);
                                        }
+#if CONFIG_NFS4
                                        if (!nfs4_reopen(nofp, NULL)) {
                                                goto restart;
                                        }
+#endif
                                }
                                error = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx);
                                if (error) {
@@ -910,9 +924,11 @@ nfs_vnop_reclaim(
        FSDBG_TOP(265, vp, np, np->n_flag, 0);
        force = (!mp || vfs_isforce(mp) || nfs_mount_gone(nmp));
 
+
        /* There shouldn't be any open or lock state at this point */
        lck_mtx_lock(&np->n_openlock);
 
+#if CONFIG_NFS4
        if (nmp && (nmp->nm_vers >= NFS_VER4)) {
                /* need to drop a delegation */
                if (np->n_dreturn.tqe_next != NFSNOLIST) {
@@ -944,6 +960,7 @@ nfs_vnop_reclaim(
                        np->n_attrdirfh = NULL;
                }
        }
+#endif
 
        /* clean up file locks */
        TAILQ_FOREACH_SAFE(nflp, &np->n_locks, nfl_link, nextnflp) {
@@ -1004,12 +1021,14 @@ nfs_vnop_reclaim(
                                    nofp->nof_r_drw, nofp->nof_d_r_drw,
                                    nofp->nof_w_drw, nofp->nof_d_w_drw,
                                    nofp->nof_rw_drw, nofp->nof_d_rw_drw);
+#if CONFIG_NFS4
                                /* try sending a close RPC if it wasn't delegated */
                                if (nofp->nof_r || nofp->nof_w || nofp->nof_rw ||
                                    nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw ||
                                    nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw) {
                                        nfs4_close_rpc(np, nofp, NULL, nofp->nof_owner->noo_cred, R_RECOVER);
                                }
+#endif
                        }
                }
                TAILQ_REMOVE(&np->n_opens, nofp, nof_link);
@@ -1022,7 +1041,7 @@ nfs_vnop_reclaim(
                /* then remove this node from the monitored node list. */
                lck_mtx_lock(&nmp->nm_lock);
                while (np->n_mflag & NMMONSCANINPROG) {
-                       struct timespec ts = { 1, 0 };
+                       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
                        np->n_mflag |= NMMONSCANWANT;
                        msleep(&np->n_mflag, &nmp->nm_lock, PZERO - 1, "nfswaitmonscan", &ts);
                }
@@ -1178,7 +1197,7 @@ nfs_node_unlock2(nfsnode_t np1, nfsnode_t np2)
 int
 nfs_node_set_busy(nfsnode_t np, thread_t thd)
 {
-       struct timespec ts = { 2, 0 };
+       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
        int error;
 
        if ((error = nfs_node_lock(np))) {
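
This and the similar timespec changes throughout the file replace positional initializers with designated ones; the behavior is identical, but each field is named where it is set:

    #include <time.h>

    struct timespec ts_old = { 2, 0 };                        /* positional */
    struct timespec ts_new = { .tv_sec = 2, .tv_nsec = 0 };   /* designated */
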
index b5cf7e407e5569b6372e9878e60f9d2f45acb9b8..2ebb8994bdc082f7ee17d8378458212d7935dc31 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc.  All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc.  All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -449,7 +449,7 @@ nfsrv_getattr(
        error = nfsrv_credcheck(nd, ctx, nx, nxo);
        nfsmerr_if(error);
 
-#if CONFIG_MAC
+#if CONFIG_MACF
        if (mac_vnode_check_open(ctx, vp, FREAD)) {
                error = ESTALE;
        }
@@ -459,7 +459,7 @@ nfsrv_getattr(
        nfsm_srv_vattr_init(&vattr, nd->nd_vers);
        error = vnode_getattr(vp, &vattr, ctx);
 
-#if CONFIG_MAC
+#if CONFIG_MACF
        /* XXXab: Comment in the VFS code makes it sound like
         *        some arguments can be filtered out, but not
         *        what it actually means. Hopefully not like
@@ -511,7 +511,7 @@ nfsrv_setattr(
        struct nfs_export_options *nxo;
        int error, preattrerr, postattrerr, gcheck;
        struct nfs_filehandle nfh;
-       struct timespec guard = { 0, 0 };
+       struct timespec guard = { .tv_sec = 0, .tv_nsec = 0 };
        kauth_action_t action;
        uid_t saved_uid;
 
index 55ba36619b3856754cd10cdeac79a782d24560d9..0adab689bc5fb2c0c258e5eed824b4dd8bbba145 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -82,6 +82,7 @@
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
+#include <sys/un.h>
 #include <sys/syslog.h>
 #include <sys/tprintf.h>
 #include <libkern/OSAtomic.h>
 #include <nfs/nfsnode.h>
 
 #define NFS_SOCK_DBG(...) NFS_DBG(NFS_FAC_SOCK, 7, ## __VA_ARGS__)
+#define NFS_SOCK_DUMP_MBUF(msg, mb) if (NFS_IS_DBG(NFS_FAC_SOCK, 15)) nfs_dump_mbuf(__func__, __LINE__, (msg), (mb))
 
 /* XXX */
 boolean_t       current_thread_aborted(void);
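
The NFS_SOCK_DUMP_MBUF macro above expands to a bare if statement. That is safe for the standalone uses in this file, but it would capture a following else if the macro ever appeared inside an if/else; a conventional hardening (hypothetical, not in this change) wraps the body:

    #define NFS_SOCK_DUMP_MBUF(msg, mb) \
            do { \
                    if (NFS_IS_DBG(NFS_FAC_SOCK, 15)) \
                            nfs_dump_mbuf(__func__, __LINE__, (msg), (mb)); \
            } while (0)
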
@@ -203,8 +205,30 @@ int     nfs_is_dead(int, struct nfsmount *);
  * 3 - read
  * 4 - write
  */
-static int proct[NFS_NPROCS] = {
-       0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
+static const int proct[] = {
+       [NFSPROC_NULL]                  =       0,
+       [NFSPROC_GETATTR]               =       1,
+       [NFSPROC_SETATTR]               =       0,
+       [NFSPROC_LOOKUP]                =       2,
+       [NFSPROC_ACCESS]                =       1,
+       [NFSPROC_READLINK]              =       3,
+       [NFSPROC_READ]                  =       3,
+       [NFSPROC_WRITE]                 =       4,
+       [NFSPROC_CREATE]                =       0,
+       [NFSPROC_MKDIR]                 =       0,
+       [NFSPROC_SYMLINK]               =       0,
+       [NFSPROC_MKNOD]                 =       0,
+       [NFSPROC_REMOVE]                =       0,
+       [NFSPROC_RMDIR]                 =       0,
+       [NFSPROC_RENAME]                =       0,
+       [NFSPROC_LINK]                  =       0,
+       [NFSPROC_READDIR]               =       3,
+       [NFSPROC_READDIRPLUS]           =       3,
+       [NFSPROC_FSSTAT]                =       0,
+       [NFSPROC_FSINFO]                =       0,
+       [NFSPROC_PATHCONF]              =       0,
+       [NFSPROC_COMMIT]                =       0,
+       [NFSPROC_NOOP]                  =       0,
 };
 
 /*
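
The rewritten proct table uses designated array initializers: slots are keyed by the NFSPROC_* constants, unlisted slots are implicitly zero, and the array length is now derived from the largest index instead of being pinned to NFS_NPROCS. A standalone sketch of those semantics:

    #include <stdio.h>

    enum { P_NULL, P_GETATTR, P_READ = 6, P_NOOP = 22 };

    /* unlisted indices are zero; the array length becomes P_NOOP + 1 == 23 */
    static const int timer_type[] = {
            [P_GETATTR] = 1,
            [P_READ]    = 3,
    };

    int main(void)
    {
            printf("entries=%zu null=%d read=%d\n",
                sizeof(timer_type) / sizeof(timer_type[0]),
                timer_type[P_NULL], timer_type[P_READ]);
            return 0;
    }
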
@@ -296,7 +320,18 @@ nfs_location_mntfromname(struct nfs_fs_locations *locs, struct nfs_location_inde
 
        p = s;
        if (!pathonly) {
-               cnt = snprintf(p, size, "%s:", fsl->nl_servers[idx.nli_serv]->ns_name);
+               char *name = fsl->nl_servers[idx.nli_serv]->ns_name;
+               if (name == NULL) {
+                       name = "";
+               }
+               if (*name == '\0') {
+                       if (*fsl->nl_servers[idx.nli_serv]->ns_addresses[idx.nli_addr]) {
+                               name = fsl->nl_servers[idx.nli_serv]->ns_addresses[idx.nli_addr];
+                       }
+                       cnt = snprintf(p, size, "<%s>:", name);
+               } else {
+                       cnt = snprintf(p, size, "%s:", name);
+               }
                p += cnt;
                size -= cnt;
        }
@@ -329,7 +364,7 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag)
        int error = 0, recv = 1;
 
        if (nso->nso_flags & NSO_CONNECTING) {
-               NFS_SOCK_DBG("nfs connect - socket %p upcall - connecting\n", nso);
+               NFS_SOCK_DBG("nfs connect - socket %p upcall - connecting flags = %8.8x\n", nso, nso->nso_flags);
                wakeup(nso->nso_wake);
                return;
        }
@@ -340,7 +375,7 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag)
                lck_mtx_unlock(&nso->nso_lock);
                return;
        }
-       NFS_SOCK_DBG("nfs connect - socket %p upcall\n", nso);
+       NFS_SOCK_DBG("nfs connect - socket %p upcall %8.8x\n", nso, nso->nso_flags);
        nso->nso_flags |= NSO_UPCALL;
 
        /* loop while we make error-free progress */
@@ -353,6 +388,7 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag)
                m = NULL;
                if (nso->nso_sotype == SOCK_STREAM) {
                        error = nfs_rpc_record_read(so, &nso->nso_rrs, MSG_DONTWAIT, &recv, &m);
+                       NFS_SOCK_DBG("nfs_rpc_record_read returned %d recv = %d\n", error, recv);
                } else {
                        rcvlen = 1000000;
                        error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen);
@@ -365,6 +401,7 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag)
                        uint32_t reply = 0, rxid = 0, verf_type, verf_len;
                        uint32_t reply_status, rejected_status, accepted_status;
 
+                       NFS_SOCK_DUMP_MBUF("Got mbuf from ping", m);
                        nfsm_chain_dissect_init(error, &nmrep, m);
                        nfsm_chain_get_32(error, &nmrep, rxid);
                        nfsm_chain_get_32(error, &nmrep, reply);
@@ -386,6 +423,7 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag)
                        }
                        nfsm_chain_get_32(error, &nmrep, accepted_status);
                        nfsmout_if(error);
+                       NFS_SOCK_DBG("Received accepted_status of %d nso_version = %d\n", accepted_status, nso->nso_version);
                        if ((accepted_status == RPC_PROGMISMATCH) && !nso->nso_version) {
                                uint32_t minvers, maxvers;
                                nfsm_chain_get_32(error, &nmrep, minvers);
@@ -454,6 +492,8 @@ nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag)
 nfsmout:
                        nso->nso_flags &= ~NSO_PINGING;
                        if (error) {
+                               NFS_SOCK_DBG("nfs upcall failed for program %d vers %d error = %d\n",
+                                   nso->nso_protocol, nso->nso_version, error);
                                nso->nso_error = error;
                                nso->nso_flags |= NSO_DEAD;
                        } else {
@@ -469,6 +509,7 @@ nfsmout:
        nso->nso_flags &= ~NSO_UPCALL;
        if ((error != EWOULDBLOCK) && (error || !recv)) {
                /* problems with the socket... */
+               NFS_SOCK_DBG("connect upcall failed %d\n", error);
                nso->nso_error = error ? error : EPIPE;
                nso->nso_flags |= NSO_DEAD;
                wakeup(nso->nso_wake);
@@ -496,17 +537,29 @@ nfs_socket_create(
        struct nfs_socket *nso;
        struct timeval now;
        int error;
+#define NFS_SOCKET_DEBUGGING
 #ifdef NFS_SOCKET_DEBUGGING
-       char naddr[MAX_IPv6_STR_LEN];
+       char naddr[sizeof((struct sockaddr_un *)0)->sun_path];
        void *sinaddr;
 
-       if (sa->sa_family == AF_INET) {
-               sinaddr = &((struct sockaddr_in*)sa)->sin_addr;
-       } else {
-               sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr;
-       }
-       if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr) {
-               strlcpy(naddr, "<unknown>", sizeof(naddr));
+       switch (sa->sa_family) {
+       case AF_INET:
+       case AF_INET6:
+               if (sa->sa_family == AF_INET) {
+                       sinaddr = &((struct sockaddr_in*)sa)->sin_addr;
+               } else {
+                       sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr;
+               }
+               if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr) {
+                       strlcpy(naddr, "<unknown>", sizeof(naddr));
+               }
+               break;
+       case AF_LOCAL:
+               strlcpy(naddr, ((struct sockaddr_un *)sa)->sun_path, sizeof(naddr));
+               break;
+       default:
+               strlcpy(naddr, "<unsupported address family>", sizeof(naddr));
+               break;
        }
 #else
        char naddr[1] =  { 0 };
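
inet_ntop() only understands AF_INET and AF_INET6, so the debug-name logic becomes a switch: local (UNIX-domain) addresses copy sun_path out directly, and anything else gets a placeholder. A user-space sketch of the same shape:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <sys/un.h>
    #include <string.h>

    static void sockaddr_name(const struct sockaddr *sa, char *buf, size_t len)
    {
            switch (sa->sa_family) {
            case AF_INET:
                    inet_ntop(AF_INET, &((const struct sockaddr_in *)sa)->sin_addr, buf, len);
                    break;
            case AF_INET6:
                    inet_ntop(AF_INET6, &((const struct sockaddr_in6 *)sa)->sin6_addr, buf, len);
                    break;
            case AF_LOCAL:
                    strlcpy(buf, ((const struct sockaddr_un *)sa)->sun_path, len);
                    break;
            default:
                    strlcpy(buf, "<unsupported address family>", len);
                    break;
            }
    }
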
@@ -533,10 +586,17 @@ nfs_socket_create(
        microuptime(&now);
        nso->nso_timestamp = now.tv_sec;
        bcopy(sa, nso->nso_saddr, sa->sa_len);
-       if (sa->sa_family == AF_INET) {
-               ((struct sockaddr_in*)nso->nso_saddr)->sin_port = htons(port);
-       } else if (sa->sa_family == AF_INET6) {
-               ((struct sockaddr_in6*)nso->nso_saddr)->sin6_port = htons(port);
+       switch (sa->sa_family) {
+       case AF_INET:
+       case AF_INET6:
+               if (sa->sa_family == AF_INET) {
+                       ((struct sockaddr_in*)nso->nso_saddr)->sin_port = htons(port);
+               } else if (sa->sa_family == AF_INET6) {
+                       ((struct sockaddr_in6*)nso->nso_saddr)->sin6_port = htons(port);
+               }
+               break;
+       case AF_LOCAL:
+               break;
        }
        nso->nso_protocol = protocol;
        nso->nso_version = vers;
@@ -577,7 +637,7 @@ nfs_socket_create(
                    resvport ? "r" : "", port, protocol, vers);
                nfs_socket_destroy(nso);
        } else {
-               NFS_SOCK_DBG("nfs connect %s created socket %p %s type %d%s port %d prot %d %d\n",
+               NFS_SOCK_DBG("nfs connect %s created socket %p <%s> type %d%s port %d prot %d %d\n",
                    vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, naddr,
                    sotype, resvport ? "r" : "", port, protocol, vers);
                *nsop = nso;
@@ -591,8 +651,9 @@ nfs_socket_create(
 void
 nfs_socket_destroy(struct nfs_socket *nso)
 {
-       struct timespec ts = { 4, 0 };
+       struct timespec ts = { .tv_sec = 4, .tv_nsec = 0 };
 
+       NFS_SOCK_DBG("Destroying socket %p flags = %8.8x error = %d\n", nso, nso->nso_flags, nso->nso_error);
        lck_mtx_lock(&nso->nso_lock);
        nso->nso_flags |= NSO_DISCONNECTING;
        if (nso->nso_flags & NSO_UPCALL) { /* give upcall a chance to complete */
@@ -644,8 +705,8 @@ nfs_socket_options(struct nfsmount *nmp, struct nfs_socket *nso)
                        sock_setsockopt(nso->nso_so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
                }
        }
-       if (nso->nso_sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */
-               int reserve = NFS_UDPSOCKBUF;
+       if (nso->nso_sotype == SOCK_DGRAM || nso->nso_saddr->sa_family == AF_LOCAL) { /* set socket buffer sizes for UDP */
+               int reserve = (nso->nso_sotype == SOCK_DGRAM) ? NFS_UDPSOCKBUF : (2 * 1024 * 1024);
                sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve));
                sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve));
        }
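
Buffer reservation now covers local transports as well: datagram sockets keep the fixed NFS_UDPSOCKBUF reservation, while AF_LOCAL stream sockets get 2 MiB each way. The user-space equivalent (64 KiB stands in for the kernel constant):

    #include <sys/socket.h>

    static int reserve_bufs(int fd, int sotype)
    {
            int reserve = (sotype == SOCK_DGRAM) ? 64 * 1024 : 2 * 1024 * 1024;

            if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve))) {
                    return -1;
            }
            return setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve));
    }
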
@@ -765,7 +826,24 @@ nfs_connect_search_new_socket(struct nfsmount *nmp, struct nfs_socket_search *ns
                fsl = nmp->nm_locations.nl_locations[nss->nss_nextloc.nli_loc];
                fss = fsl->nl_servers[nss->nss_nextloc.nli_serv];
                addrstr = fss->ns_addresses[nss->nss_nextloc.nli_addr];
+               NFS_SOCK_DBG("Trying address %s for program %d on port %d\n", addrstr, nss->nss_protocol, nss->nss_port);
+               if (*addrstr == '\0') {
+                       /*
+                        * We have an unspecified local domain address. We use the program to translate to
+                        * a well known local transport address. We only support PMAPROG and NFS for this.
+                        * a well-known local transport address. We only support PMAPPROG and NFS for this.
+                       if (nss->nss_protocol == PMAPPROG) {
+                               addrstr = (nss->nss_sotype == SOCK_DGRAM) ? RPCB_TICLTS_PATH : RPCB_TICOTSORD_PATH;
+                       } else if (nss->nss_protocol == NFS_PROG) {
+                               addrstr = nmp->nm_nfs_localport;
+                               if (!addrstr || *addrstr == '\0') {
+                                       addrstr = (nss->nss_sotype == SOCK_DGRAM) ? NFS_TICLTS_PATH : NFS_TICOTSORD_PATH;
+                               }
+                       }
+                       NFS_SOCK_DBG("Calling prog %d with <%s>\n", nss->nss_protocol, addrstr);
+               }
                if (!nfs_uaddr2sockaddr(addrstr, (struct sockaddr*)&ss)) {
+                       NFS_SOCK_DBG("Could not convert address %s to sockaddr\n", addrstr);
                        nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc);
                        nss->nss_addrcnt -= 1;
                        nss->nss_last = -2;
@@ -773,6 +851,7 @@ nfs_connect_search_new_socket(struct nfsmount *nmp, struct nfs_socket_search *ns
                }
                /* Check that socket family is acceptable. */
                if (nmp->nm_sofamily && (ss.ss_family != nmp->nm_sofamily)) {
+                       NFS_SOCK_DBG("Skipping socket family %d, want mount family %d\n", ss.ss_family, nmp->nm_sofamily);
                        nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc);
                        nss->nss_addrcnt -= 1;
                        nss->nss_last = -2;
@@ -791,6 +870,7 @@ nfs_connect_search_new_socket(struct nfsmount *nmp, struct nfs_socket_search *ns
                nso->nso_wake = nss;
                error = sock_setupcall(nso->nso_so, nfs_connect_upcall, nso);
                if (error) {
+                       NFS_SOCK_DBG("sock_setupcall failed for socket %p setting nfs_connect_upcall error = %d\n", nso, error);
                        lck_mtx_lock(&nso->nso_lock);
                        nso->nso_error = error;
                        nso->nso_flags |= NSO_DEAD;
@@ -834,9 +914,14 @@ nfs_connect_search_socket_connect(struct nfsmount *nmp, struct nfs_socket *nso,
                /* initiate the connection */
                nso->nso_flags |= NSO_CONNECTING;
                lck_mtx_unlock(&nso->nso_lock);
-               NFS_SOCK_DBG("nfs connect %s connecting socket %p\n",
-                   vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso);
+               NFS_SOCK_DBG("nfs connect %s connecting socket %p %s\n",
+                   vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso,
+                   nso->nso_saddr->sa_family == AF_LOCAL ? ((struct sockaddr_un*)nso->nso_saddr)->sun_path : "");
                error = sock_connect(nso->nso_so, nso->nso_saddr, MSG_DONTWAIT);
+               if (error) {
+                       NFS_SOCK_DBG("nfs connect %s connecting socket %p returned %d\n",
+                           vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error);
+               }
                lck_mtx_lock(&nso->nso_lock);
                if (error && (error != EINPROGRESS)) {
                        nso->nso_error = error;
@@ -896,6 +981,7 @@ nfs_connect_search_ping(struct nfsmount *nmp, struct nfs_socket *nso, struct tim
                }
        }
        lck_mtx_unlock(&nso->nso_lock);
+       NFS_SOCK_DBG("Pinging socket %p %d %d %d\n", nso, nso->nso_sotype, nso->nso_protocol, vers);
        error = nfsm_rpchead2(nmp, nso->nso_sotype, nso->nso_protocol, vers, 0, RPCAUTH_SYS,
            vfs_context_ucred(vfs_context_kernel()), NULL, NULL, &xid, &mreq);
        lck_mtx_lock(&nso->nso_lock);
@@ -912,6 +998,7 @@ nfs_connect_search_ping(struct nfsmount *nmp, struct nfs_socket *nso, struct tim
                        reqlen += mbuf_len(m);
                }
                lck_mtx_unlock(&nso->nso_lock);
+               NFS_SOCK_DUMP_MBUF("Sending ping packet", mreq);
                error = sock_sendmbuf(nso->nso_so, &msg, mreq, 0, &sentlen);
                NFS_SOCK_DBG("nfs connect %s verifying socket %p send rv %d\n",
                    vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error);
@@ -975,8 +1062,8 @@ nfs_connect_search_socket_reap(struct nfsmount *nmp __unused, struct nfs_socket_
                        continue;
                }
                lck_mtx_unlock(&nso->nso_lock);
-               NFS_SOCK_DBG("nfs connect %s reaping socket %p %d\n",
-                   vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, nso->nso_error);
+               NFS_SOCK_DBG("nfs connect %s reaping socket %p error = %d flags = %8.8x\n",
+                   vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, nso->nso_error, nso->nso_flags);
                nfs_socket_search_update_error(nss, nso->nso_error);
                TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link);
                nss->nss_sockcnt--;
@@ -1113,7 +1200,10 @@ nfs_connect(struct nfsmount *nmp, int verbose, int timeo)
        struct sockaddr_storage ss;
        struct sockaddr *saddr, *oldsaddr;
        sock_upcall upcall;
-       struct timeval now, start;
+#if CONFIG_NFS4
+       struct timeval now;
+#endif
+       struct timeval start;
        int error, savederror, nfsvers;
        int tryv4 = 1;
        uint8_t sotype = nmp->nm_sotype ? nmp->nm_sotype : SOCK_STREAM;
@@ -1167,26 +1257,34 @@ tryagain:
 
        /* First time connecting, we may need to negotiate some things */
        if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) {
+               NFS_SOCK_DBG("so_family = %d\n", nmp->nm_sofamily);
+               NFS_SOCK_DBG("nfs port = %d local: <%s>\n", nmp->nm_nfsport, nmp->nm_nfs_localport ? nmp->nm_nfs_localport : "");
+               NFS_SOCK_DBG("mount port = %d local: <%s>\n", nmp->nm_mountport, nmp->nm_mount_localport ? nmp->nm_mount_localport : "");
                if (!nmp->nm_vers) {
                        /* No NFS version specified... */
                        if (!nmp->nm_nfsport || (!NM_OMATTR_GIVEN(nmp, FH) && !nmp->nm_mountport)) {
+#if CONFIG_NFS4
                                if (PVER2MAJOR(nmp->nm_max_vers) >= NFS_VER4 && tryv4) {
                                        nss.nss_port = NFS_PORT;
                                        nss.nss_protocol = NFS_PROG;
                                        nss.nss_version = 4;
                                        nss.nss_flags |= NSS_FALLBACK2PMAP;
                                } else {
-                                       /* ...connect to portmapper first if we (may) need any ports. */
-                                       nss.nss_port = PMAPPORT;
-                                       nss.nss_protocol = PMAPPROG;
-                                       nss.nss_version = 0;
-                               }
+#endif
+                               /* ...connect to portmapper first if we (may) need any ports. */
+                               nss.nss_port = PMAPPORT;
+                               nss.nss_protocol = PMAPPROG;
+                               nss.nss_version = 0;
+#if CONFIG_NFS4
+                       }
+#endif
                        } else {
                                /* ...connect to NFS port first. */
                                nss.nss_port = nmp->nm_nfsport;
                                nss.nss_protocol = NFS_PROG;
                                nss.nss_version = 0;
                        }
+#if CONFIG_NFS4
                } else if (nmp->nm_vers >= NFS_VER4) {
                        if (tryv4) {
                                /* For NFSv4, we use the given (or default) port. */
@@ -1206,6 +1304,7 @@ tryagain:
                                nss.nss_protocol = PMAPPROG;
                                nss.nss_version = 0;
                        }
+#endif
                } else {
                        /* For NFSv3/v2... */
                        if (!nmp->nm_nfsport || (!NM_OMATTR_GIVEN(nmp, FH) && !nmp->nm_mountport)) {
@@ -1304,9 +1403,14 @@ keepsearching:
        /* We may be speaking to portmap first... to determine port(s). */
        if (nso->nso_saddr->sa_family == AF_INET) {
                port = ntohs(((struct sockaddr_in*)nso->nso_saddr)->sin_port);
-       } else {
+       } else if (nso->nso_saddr->sa_family == AF_INET6) {
                port = ntohs(((struct sockaddr_in6*)nso->nso_saddr)->sin6_port);
+       } else if (nso->nso_saddr->sa_family == AF_LOCAL) {
+               if (nso->nso_protocol == PMAPPROG) {
+                       port = PMAPPORT;
+               }
        }
+
        if (port == PMAPPORT) {
                /* Use this portmapper port to get the port #s we need. */
                NFS_SOCK_DBG("nfs connect %s got portmapper socket %p\n",
@@ -1325,29 +1429,46 @@ keepsearching:
                                ((struct sockaddr_in*)&ss)->sin_port = htons(0);
                        } else if (ss.ss_family == AF_INET6) {
                                ((struct sockaddr_in6*)&ss)->sin6_port = htons(0);
+                       } else if (ss.ss_family == AF_LOCAL) {
+                               if (((struct sockaddr_un*)&ss)->sun_path[0] == '/') {
+                                       NFS_SOCK_DBG("Looking up NFS socket over %s\n", ((struct sockaddr_un*)&ss)->sun_path);
+                               }
                        }
                        for (; nfsvers >= (int)PVER2MAJOR(nmp->nm_min_vers); nfsvers--) {
                                if (nmp->nm_vers && nmp->nm_vers != nfsvers) {
                                        continue; /* Wrong version */
                                }
+#if CONFIG_NFS4
                                if (nfsvers == NFS_VER4 && nso->nso_sotype == SOCK_DGRAM) {
                                        continue; /* NFSv4 does not do UDP */
                                }
-                               error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss,
-                                   nso->nso_so, NFS_PROG, nfsvers,
-                                   (nso->nso_sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP, timeo);
+#endif
+                               if (ss.ss_family == AF_LOCAL && nmp->nm_nfs_localport) {
+                                       struct sockaddr_un *sun = (struct sockaddr_un *)&ss;
+                                       NFS_SOCK_DBG("Using supplied local address %s for NFS_PROG\n", nmp->nm_nfs_localport);
+                                       strlcpy(sun->sun_path, nmp->nm_nfs_localport, sizeof(sun->sun_path));
+                                       error = 0;
+                               } else {
+                                       NFS_SOCK_DBG("Calling Portmap/Rpcbind for NFS_PROG\n");
+                                       error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss,
+                                           nso->nso_so, NFS_PROG, nfsvers, nso->nso_sotype, timeo);
+                               }
                                if (!error) {
                                        if (ss.ss_family == AF_INET) {
                                                port = ntohs(((struct sockaddr_in*)&ss)->sin_port);
                                        } else if (ss.ss_family == AF_INET6) {
                                                port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port);
+                                       } else if (ss.ss_family == AF_LOCAL) {
+                                               port = ((struct sockaddr_un *)&ss)->sun_path[0] ? NFS_PORT : 0;
                                        }
                                        if (!port) {
                                                error = EPROGUNAVAIL;
                                        }
+#if CONFIG_NFS4
                                        if (port == NFS_PORT && nfsvers == NFS_VER4 && tryv4 == 0) {
                                                continue; /* We already tried this */
                                        }
+#endif
                                }
                                if (!error) {
                                        break;
@@ -1359,16 +1480,25 @@ keepsearching:
                        if (error) {
                                nfs_socket_search_update_error(&nss, error);
                                nfs_socket_destroy(nso);
+                               NFS_SOCK_DBG("Could not lookup NFS socket address for version %d error = %d\n", nfsvers, error);
                                goto keepsearching;
                        }
+               } else if (nmp->nm_nfs_localport) {
+                       strlcpy(((struct sockaddr_un*)&ss)->sun_path, nmp->nm_nfs_localport, sizeof(((struct sockaddr_un*)&ss)->sun_path));
+                       NFS_SOCK_DBG("Using supplied nfs_local_port %s for NFS_PROG\n", nmp->nm_nfs_localport);
                }
+
                /* Create NFS protocol socket and add it to the list of sockets. */
                /* N.B. If nfsvers is NFS_VER4 at this point then we're on a non standard port */
+               if (ss.ss_family == AF_LOCAL) {
+                       NFS_SOCK_DBG("Creating NFS socket for %s port = %d\n", ((struct sockaddr_un*)&ss)->sun_path, port);
+               }
                error = nfs_socket_create(nmp, (struct sockaddr*)&ss, nso->nso_sotype, port,
                    NFS_PROG, nfsvers, NMFLAG(nmp, RESVPORT), &nsonfs);
                if (error) {
                        nfs_socket_search_update_error(&nss, error);
                        nfs_socket_destroy(nso);
+                       NFS_SOCK_DBG("Could not create NFS socket: %d\n", error);
                        goto keepsearching;
                }
                nsonfs->nso_location = nso->nso_location;
@@ -1378,6 +1508,7 @@ keepsearching:
                        nfs_socket_search_update_error(&nss, error);
                        nfs_socket_destroy(nsonfs);
                        nfs_socket_destroy(nso);
+                       NFS_SOCK_DBG("Could not set up nfs_connect_upcall: %d\n", error);
                        goto keepsearching;
                }
                TAILQ_INSERT_TAIL(&nss.nss_socklist, nsonfs, nso_link);
@@ -1387,24 +1518,31 @@ keepsearching:
                        error = 0;
                        bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len);
                        port = nmp->nm_mountport;
+                       NFS_SOCK_DBG("mount port = %d\n", port);
                        if (ss.ss_family == AF_INET) {
                                ((struct sockaddr_in*)&ss)->sin_port = htons(port);
                        } else if (ss.ss_family == AF_INET6) {
                                ((struct sockaddr_in6*)&ss)->sin6_port = htons(port);
+                       } else if (ss.ss_family == AF_LOCAL && nmp->nm_mount_localport) {
+                               NFS_SOCK_DBG("Setting mount address to %s port = %d\n", nmp->nm_mount_localport, nmp->nm_mountport);
+                               strlcpy(((struct sockaddr_un*)&ss)->sun_path, nmp->nm_mount_localport, sizeof(((struct sockaddr_un*)&ss)->sun_path));
                        }
                        if (!port) {
                                /* Get port/sockaddr for MOUNT version corresponding to NFS version. */
                                /* If NFS version is unknown, optimistically choose for NFSv3. */
                                int mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3;
                                int mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nso->nso_sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP;
+                               NFS_SOCK_DBG("Looking up mount port with socket %p\n", nso->nso_so);
                                error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss,
-                                   nso->nso_so, RPCPROG_MNT, mntvers, mntproto, timeo);
+                                   nso->nso_so, RPCPROG_MNT, mntvers, mntproto == IPPROTO_UDP ? SOCK_DGRAM : SOCK_STREAM, timeo);
                        }
                        if (!error) {
                                if (ss.ss_family == AF_INET) {
                                        port = ntohs(((struct sockaddr_in*)&ss)->sin_port);
                                } else if (ss.ss_family == AF_INET6) {
                                        port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port);
+                               } else if (ss.ss_family == AF_LOCAL) {
+                                       port = (((struct sockaddr_un*)&ss)->sun_path[0] != '\0');
                                }
                                if (!port) {
                                        error = EPROGUNAVAIL;
@@ -1421,12 +1559,14 @@ keepsearching:
                                bcopy(&ss, nsonfs->nso_saddr2, ss.ss_len);
                        }
                        if (error) {
+                               NFS_SOCK_DBG("Could not create mount socket address %d\n", error);
                                lck_mtx_lock(&nsonfs->nso_lock);
                                nsonfs->nso_error = error;
                                nsonfs->nso_flags |= NSO_DEAD;
                                lck_mtx_unlock(&nsonfs->nso_lock);
                        }
                }
+               NFS_SOCK_DBG("Destroying socket %p so %p\n", nso, nso->nso_so);
                nfs_socket_destroy(nso);
                goto keepsearching;
        }
@@ -1443,19 +1583,23 @@ keepsearching:
                saddr = nso->nso_saddr2;
                if (!saddr) {
                        /* Need sockaddr for MOUNT port */
+                       NFS_SOCK_DBG("Getting mount address mountport = %d, mount_localport = %s\n", nmp->nm_mountport, nmp->nm_mount_localport ? nmp->nm_mount_localport : "");
                        bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len);
                        port = nmp->nm_mountport;
                        if (ss.ss_family == AF_INET) {
                                ((struct sockaddr_in*)&ss)->sin_port = htons(port);
                        } else if (ss.ss_family == AF_INET6) {
                                ((struct sockaddr_in6*)&ss)->sin6_port = htons(port);
+                       } else if (ss.ss_family == AF_LOCAL && nmp->nm_mount_localport) {
+                               NFS_SOCK_DBG("Setting mount address to %s port = %d\n", nmp->nm_mount_localport, nmp->nm_mountport);
+                               strlcpy(((struct sockaddr_un*)&ss)->sun_path, nmp->nm_mount_localport, sizeof(((struct sockaddr_un*)&ss)->sun_path));
                        }
                        if (!port) {
                                /* Get port/sockaddr for MOUNT version corresponding to NFS version. */
                                int mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3;
-                               int mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nso->nso_sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP;
+                               int so_type = NM_OMFLAG(nmp, MNTUDP) ? SOCK_DGRAM : nso->nso_sotype;
                                error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss,
-                                   NULL, RPCPROG_MNT, mntvers, mntproto, timeo);
+                                   NULL, RPCPROG_MNT, mntvers, so_type, timeo);
                                if (ss.ss_family == AF_INET) {
                                        port = ntohs(((struct sockaddr_in*)&ss)->sin_port);
                                } else if (ss.ss_family == AF_INET6) {
@@ -1588,6 +1732,7 @@ keepsearching:
                }
                if (!nmp->nm_vers) {
                        nmp->nm_vers = nfsvers;
+#if CONFIG_NFS4
                        /* If we negotiated NFSv4, set nm_nfsport if we ended up on the standard NFS port */
                        if ((nfsvers >= NFS_VER4) && !NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_PORT)) {
                                if (nso->nso_saddr->sa_family == AF_INET) {
@@ -1601,7 +1746,9 @@ keepsearching:
                                        nmp->nm_nfsport = NFS_PORT;
                                }
                        }
+#endif
                }
+#if CONFIG_NFS4
                /* do some version-specific pre-mount set up */
                if (nmp->nm_vers >= NFS_VER4) {
                        microtime(&now);
@@ -1610,6 +1757,7 @@ keepsearching:
                                nfs4_mount_callback_setup(nmp);
                        }
                }
+#endif
        }
 
        /* Initialize NFS socket state variables */
@@ -1649,6 +1797,7 @@ keepsearching:
                                nmp->nm_sotype = 0;
                        }
                        if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_VERSION)) {
+#if CONFIG_NFS4
                                if (nmp->nm_vers >= NFS_VER4) {
                                        if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_PORT)) {
                                                nmp->nm_nfsport = 0;
@@ -1661,6 +1810,7 @@ keepsearching:
                                        }
                                        bzero(&nmp->nm_un, sizeof(nmp->nm_un));
                                }
+#endif
                                nmp->nm_vers = 0;
                        }
                }
@@ -1709,10 +1859,14 @@ keepsearching:
 
 /* setup & confirm socket connection is functional */
 int
-nfs_connect_setup(struct nfsmount *nmp)
+nfs_connect_setup(
+#if !CONFIG_NFS4
+       __unused
+#endif
+       struct nfsmount *nmp)
 {
        int error = 0;
-
+#if CONFIG_NFS4
        if (nmp->nm_vers >= NFS_VER4) {
                if (nmp->nm_state & NFSSTA_CLIENTID) {
                        /* first, try to renew our current state */
@@ -1729,6 +1883,7 @@ nfs_connect_setup(struct nfsmount *nmp)
                }
                error = nfs4_setclientid(nmp);
        }
+#endif
        return error;
 }
 
@@ -1840,7 +1995,7 @@ nfs_disconnect(struct nfsmount *nmp)
        lck_mtx_lock(&nmp->nm_lock);
 tryagain:
        if (nmp->nm_nso) {
-               struct timespec ts = { 1, 0 };
+               struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
                if (nmp->nm_state & NFSSTA_SENDING) { /* wait for sending to complete */
                        nmp->nm_state |= NFSSTA_WANTSND;
                        msleep(&nmp->nm_state, &nmp->nm_lock, PZERO - 1, "nfswaitsending", &ts);
@@ -1909,7 +2064,7 @@ void
 nfs_mount_sock_thread(void *arg, __unused wait_result_t wr)
 {
        struct nfsmount *nmp = arg;
-       struct timespec ts = { 30, 0 };
+       struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 };
        thread_t thd = current_thread();
        struct nfsreq *req;
        struct timeval now;
@@ -1949,7 +2104,7 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr)
                                if (error == EIO || error == EINTR) {
                                        lvl = (do_reconnect_sleep++ % 600) ? 7 : 0;
                                }
-                               nfs_printf(NFS_FAC_SOCK, lvl, "nfs reconnect %s: returned %d\n",
+                               NFS_DBG(NFS_FAC_SOCK, lvl, "nfs reconnect %s: returned %d\n",
                                    vfs_statfs(nmp->nm_mountp)->f_mntfromname, error);
                        } else {
                                nmp->nm_reconnect_start = 0;
@@ -1966,6 +2121,7 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr)
                        nfs_recover(nmp);
                        lck_mtx_lock(&nmp->nm_lock);
                }
+#if CONFIG_NFS4
                /* handle NFSv4 delegation returns */
                while ((nmp->nm_vers >= NFS_VER4) && !(nmp->nm_state & (NFSSTA_FORCE | NFSSTA_DEAD)) &&
                    (nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER) &&
@@ -1974,6 +2130,7 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr)
                        nfs4_delegation_return(np, R_RECOVER, thd, nmp->nm_mcred);
                        lck_mtx_lock(&nmp->nm_lock);
                }
+#endif
                /* do resends, if necessary/possible */
                while ((((nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER)) ||
                    (nmp->nm_state & (NFSSTA_FORCE | NFSSTA_DEAD))) &&
@@ -2010,6 +2167,7 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr)
                                lck_mtx_unlock(&req->r_mtx);
                                /* async RPCs on GSS mounts need to be rebuilt and resent. */
                                nfs_reqdequeue(req);
+#if CONFIG_NFS_GSS
                                if (nfs_request_using_gss(req)) {
                                        nfs_gss_clnt_rpcdone(req);
                                        error = nfs_gss_clnt_args_restore(req);
@@ -2017,6 +2175,7 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr)
                                                req->r_xid = 0;
                                        }
                                }
+#endif /* CONFIG_NFS_GSS */
                                NFS_SOCK_DBG("nfs async%s restart: p %d x 0x%llx f 0x%x rtt %d\n",
                                    nfs_request_using_gss(req) ? " gss" : "", req->r_procnum, req->r_xid,
                                    req->r_flags, req->r_rtt);
@@ -2227,6 +2386,7 @@ struct nfs_callback_socket {
 #define NCBSOCK_UPCALLWANT      0x0002
 #define NCBSOCK_DEAD            0x0004
 
+#if CONFIG_NFS4
 /*
  * NFS callback channel state
  *
@@ -2415,7 +2575,7 @@ nfs4_mount_callback_shutdown(struct nfsmount *nmp)
        struct nfs_callback_socket *ncbsp;
        socket_t so, so6;
        struct nfs4_cb_sock_list cb_socks;
-       struct timespec ts = {1, 0};
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 
        lck_mtx_lock(nfs_global_mutex);
        TAILQ_REMOVE(&nfs4_cb_mounts, nmp, nm_cblink);
@@ -2592,7 +2752,7 @@ void
 nfs4_cb_rcv(socket_t so, void *arg, __unused int waitflag)
 {
        struct nfs_callback_socket *ncbsp = arg;
-       struct timespec ts = {1, 0};
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
        struct timeval now;
        mbuf_t m;
        int error = 0, recv = 1;
@@ -3001,7 +3161,7 @@ out:
        }
        return error;
 }
-
+#endif /* CONFIG_NFS4 */
 
 /*
  * Initialize an nfs_rpc_record_state structure.
@@ -3155,7 +3315,7 @@ nfs_send(struct nfsreq *req, int wait)
        struct sockaddr *sendnam;
        mbuf_t mreqcopy;
        size_t sentlen = 0;
-       struct timespec ts = { 2, 0 };
+       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
 
 again:
        error = nfs_sndlock(req);
@@ -3345,6 +3505,7 @@ again:
                msg.msg_name = (caddr_t)sendnam;
                msg.msg_namelen = sendnam->sa_len;
        }
+       NFS_SOCK_DUMP_MBUF("Sending mbuf", mreqcopy);
        error = sock_sendmbuf(nso->nso_so, &msg, mreqcopy, 0, &sentlen);
        if (error || (sentlen != req->r_mreqlen)) {
                NFS_SOCK_DBG("nfs_send: 0x%llx sent %d/%d error %d\n",
@@ -3731,9 +3892,11 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep)
                /* signal anyone waiting on this request */
                wakeup(req);
                asyncioq = (req->r_callback.rcb_func != NULL);
+#if CONFIG_NFS_GSS
                if (nfs_request_using_gss(req)) {
                        nfs_gss_clnt_rpcdone(req);
                }
+#endif /* CONFIG_NFS_GSS */
                lck_mtx_unlock(&req->r_mtx);
                lck_mtx_unlock(nfs_request_mutex);
                /* if it's an async RPC with a callback, queue it up */
@@ -3758,7 +3921,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep)
 int
 nfs_wait_reply(struct nfsreq *req)
 {
-       struct timespec ts = { 2, 0 };
+       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
        int error = 0, slpflag, first = 1;
 
        if (req->r_nmp && NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) {
@@ -3949,9 +4112,12 @@ void
 nfs_request_destroy(struct nfsreq *req)
 {
        struct nfsmount *nmp;
-       struct gss_seq *gsp, *ngsp;
        int clearjbtimeo = 0;
 
+#if CONFIG_NFS_GSS
+       struct gss_seq *gsp, *ngsp;
+#endif
+
        if (!req || !(req->r_flags & R_INITTED)) {
                return;
        }
@@ -4025,6 +4191,7 @@ nfs_request_destroy(struct nfsreq *req)
        if (IS_VALID_CRED(req->r_cred)) {
                kauth_cred_unref(&req->r_cred);
        }
+#if CONFIG_NFS_GSS
        if (nfs_request_using_gss(req)) {
                nfs_gss_clnt_rpcdone(req);
        }
@@ -4033,6 +4200,7 @@ nfs_request_destroy(struct nfsreq *req)
        if (req->r_gss_ctx) {
                nfs_gss_clnt_ctx_unref(req);
        }
+#endif /* CONFIG_NFS_GSS */
        if (req->r_wrongsec) {
                FREE(req->r_wrongsec, M_TEMP);
        }
@@ -4233,6 +4401,7 @@ nfs_request_finish(
                lck_mtx_unlock(&nmp->nm_lock);
        }
 
+#if CONFIG_NFS_GSS
        if (nfs_request_using_gss(req)) {
                /*
                 * If the request used an RPCSEC_GSS credential
@@ -4261,6 +4430,7 @@ nfs_request_finish(
                        goto nfsmout;
                }
        }
+#endif /* CONFIG_NFS_GSS */
 
        /*
         * If there was a successful reply, make sure to mark the mount as up.
@@ -4297,6 +4467,7 @@ nfs_request_finish(
                nfsm_chain_get_32(error, &nmrep, auth_status);
                nfsmout_if(error);
                switch (auth_status) {
+#if CONFIG_NFS_GSS
                case RPCSEC_GSS_CREDPROBLEM:
                case RPCSEC_GSS_CTXPROBLEM:
                        /*
@@ -4321,6 +4492,7 @@ nfs_request_finish(
                        req->r_xid = 0;         // get a new XID
                        req->r_flags |= R_RESTART;
                        goto nfsmout;
+#endif /* CONFIG_NFS_GSS */
                default:
                        error = EACCES;
                        break;
@@ -4342,12 +4514,14 @@ nfs_request_finish(
                }
                nfsm_chain_get_32(error, &nmrep, accepted_status);
                break;
+#if CONFIG_NFS_GSS
        case RPCAUTH_KRB5:
        case RPCAUTH_KRB5I:
        case RPCAUTH_KRB5P:
                error = nfs_gss_clnt_verf_get(req, &nmrep,
                    verf_type, verf_len, &accepted_status);
                break;
+#endif /* CONFIG_NFS_GSS */
        }
        nfsmout_if(error);
 
@@ -4432,6 +4606,7 @@ nfs_request_finish(
                        nfs_up(nmp, req->r_thread, clearjbtimeo, "resource available again");
                }
 
+#if CONFIG_NFS4
                if ((nmp->nm_vers >= NFS_VER4) && (*status == NFSERR_WRONGSEC)) {
                        /*
                         * Hmmm... we need to try a different security flavor.
@@ -4524,7 +4699,7 @@ nfs_request_finish(
                                req->r_np->n_auth = req->r_auth;
                        }
                }
-
+#endif /* CONFIG_NFS4 */
                if (*status == NFS_OK) {
                        /*
                         * Successful NFS request
@@ -4676,6 +4851,7 @@ nfs_request2(
 }
 
 
+#if CONFIG_NFS_GSS
 /*
  * Set up a new null proc request to exchange GSS context tokens with the
  * server. Associate the context that we are setting up with the request that we
@@ -4744,6 +4920,7 @@ nfs_request_gss(
 
        return error;
 }
+#endif /* CONFIG_NFS_GSS */
 
 /*
  * Create and start an asynchronous NFS request.
@@ -4790,7 +4967,7 @@ nfs_request_async(
                if (!error && !(req->r_flags & R_SENT) && req->r_callback.rcb_func) {
                        /* make sure to wait until this async I/O request gets sent */
                        int slpflag = (req->r_nmp && NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) ? PCATCH : 0;
-                       struct timespec ts = { 2, 0 };
+                       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
                        while (!(req->r_flags & R_SENT)) {
                                nmp = req->r_nmp;
                                if ((req->r_flags & R_RESENDQ) && !nfs_mount_gone(nmp)) {
@@ -4859,7 +5036,7 @@ nfs_request_async_finish(
                req->r_flags |= R_ASYNCWAIT;
        }
        while (req->r_flags & R_RESENDQ) {  /* wait until the request is off the resend queue */
-               struct timespec ts = { 2, 0 };
+               struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
 
                if ((nmp = req->r_nmp)) {
                        lck_mtx_lock(&nmp->nm_lock);
@@ -5409,7 +5586,7 @@ nfs_sndlock(struct nfsreq *req)
        struct nfsmount *nmp = req->r_nmp;
        int *statep;
        int error = 0, slpflag = 0;
-       struct timespec ts = { 0, 0 };
+       struct timespec ts = { .tv_sec = 0, .tv_nsec = 0 };
 
        if (nfs_mount_gone(nmp)) {
                return ENXIO;
@@ -5486,7 +5663,7 @@ nfs_aux_request(
        int error = 0, on = 1, try, sendat = 2, soproto, recv, optlen, restoreto = 0;
        socket_t newso = NULL;
        struct sockaddr_storage ss;
-       struct timeval orig_rcvto, orig_sndto, tv = { 1, 0 };
+       struct timeval orig_rcvto, orig_sndto, tv = { .tv_sec = 1, .tv_usec = 0 };
        mbuf_t m, mrep = NULL;
        struct msghdr msg;
        uint32_t rxid = 0, reply = 0, reply_status, rejected_status;
@@ -5496,12 +5673,16 @@ nfs_aux_request(
 
        if (!so) {
                /* create socket and set options */
-               soproto = (sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP;
+               if (saddr->sa_family == AF_LOCAL) {
+                       soproto = 0;
+               } else {
+                       soproto = (sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP;
+               }
                if ((error = sock_socket(saddr->sa_family, sotype, soproto, NULL, NULL, &newso))) {
                        goto nfsmout;
                }
 
-               if (bindresv) {
+               if (bindresv && saddr->sa_family != AF_LOCAL) {
                        int level = (saddr->sa_family == AF_INET) ? IPPROTO_IP : IPPROTO_IPV6;
                        int optname = (saddr->sa_family == AF_INET) ? IP_PORTRANGE : IPV6_PORTRANGE;
                        int portrange = IP_PORTRANGE_LOW;
@@ -5673,13 +5854,23 @@ nfs_portmap_lookup(
        socket_t so,
        uint32_t protocol,
        uint32_t vers,
-       uint32_t ipproto,
+       uint32_t stype,
        int timeo)
 {
        thread_t thd = vfs_context_thread(ctx);
        kauth_cred_t cred = vfs_context_ucred(ctx);
        struct sockaddr_storage ss;
        struct sockaddr *saddr = (struct sockaddr*)&ss;
+       static struct sockaddr_un rpcbind_cots = {
+               sizeof(struct sockaddr_un),
+               AF_LOCAL,
+               RPCB_TICOTSORD_PATH
+       };
+       static struct sockaddr_un rpcbind_clts = {
+               sizeof(struct sockaddr_un),
+               AF_LOCAL,
+               RPCB_TICLTS_PATH
+       };
        struct nfsm_chain nmreq, nmrep;
        mbuf_t mreq;
        int error = 0, ip, pmprog, pmvers, pmproc;
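
The two static sockaddr_un templates above are positionally initialized, which leans on the BSD member order (sun_len, sun_family, sun_path). A designated-initializer equivalent makes that dependence explicit:

    #include <sys/un.h>

    static struct sockaddr_un rpcbind_cots = {
            .sun_len    = sizeof(struct sockaddr_un),
            .sun_family = AF_LOCAL,
            .sun_path   = RPCB_TICOTSORD_PATH,      /* constant from the source */
    };
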
@@ -5699,6 +5890,13 @@ nfs_portmap_lookup(
                pmprog = RPCBPROG;
                pmvers = RPCBVERS4;
                pmproc = RPCBPROC_GETVERSADDR;
+       } else if (saddr->sa_family == AF_LOCAL) {
+               ip = 0;
+               pmprog = RPCBPROG;
+               pmvers = RPCBVERS4;
+               pmproc = RPCBPROC_GETVERSADDR;
+               NFS_SOCK_DBG("%s\n", ((struct sockaddr_un*)sa)->sun_path);
+               saddr = (struct sockaddr*)((stype == SOCK_STREAM) ? &rpcbind_cots : &rpcbind_clts);
        } else {
                return EINVAL;
        }
@@ -5709,33 +5907,46 @@ tryagain:
        /* send portmapper request to get port/uaddr */
        if (ip == 4) {
                ((struct sockaddr_in*)saddr)->sin_port = htons(PMAPPORT);
-       } else {
+       } else if (ip == 6) {
                ((struct sockaddr_in6*)saddr)->sin6_port = htons(PMAPPORT);
        }
        nfsm_chain_build_alloc_init(error, &nmreq, 8 * NFSX_UNSIGNED);
        nfsm_chain_add_32(error, &nmreq, protocol);
        nfsm_chain_add_32(error, &nmreq, vers);
        if (ip == 4) {
-               nfsm_chain_add_32(error, &nmreq, ipproto);
+               nfsm_chain_add_32(error, &nmreq, stype == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP);
                nfsm_chain_add_32(error, &nmreq, 0);
        } else {
-               if (ipproto == IPPROTO_TCP) {
-                       nfsm_chain_add_string(error, &nmreq, "tcp6", 4);
+               if (stype == SOCK_STREAM) {
+                       if (ip == 6) {
+                               nfsm_chain_add_string(error, &nmreq, "tcp6", 4);
+                       } else {
+                               nfsm_chain_add_string(error, &nmreq, "ticotsord", 9);
+                       }
                } else {
-                       nfsm_chain_add_string(error, &nmreq, "udp6", 4);
+                       if (ip == 6) {
+                               nfsm_chain_add_string(error, &nmreq, "udp6", 4);
+                       } else {
+                               nfsm_chain_add_string(error, &nmreq, "ticlts", 6);
+                       }
                }
                nfsm_chain_add_string(error, &nmreq, "", 0); /* uaddr */
                nfsm_chain_add_string(error, &nmreq, "", 0); /* owner */
        }
        nfsm_chain_build_done(error, &nmreq);
        nfsmout_if(error);
-       error = nfsm_rpchead2(nmp, (ipproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM,
-           pmprog, pmvers, pmproc, RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead,
-           &xid, &mreq);
+       error = nfsm_rpchead2(nmp, stype, pmprog, pmvers, pmproc,
+           RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, &xid, &mreq);
        nfsmout_if(error);
        nmreq.nmc_mhead = NULL;
-       error = nfs_aux_request(nmp, thd, saddr, so, (ipproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM,
-           mreq, R_XID32(xid), 0, timeo, &nmrep);
+
+       NFS_SOCK_DUMP_MBUF("nfs_portmap_lookup request", mreq);
+       error = nfs_aux_request(nmp, thd, saddr, so,
+           stype, mreq, R_XID32(xid), 0, timeo, &nmrep);
+       NFS_SOCK_DUMP_MBUF("nfs_portmap_lookup reply", nmrep.nmc_mhead);
+       NFS_SOCK_DBG("rpcbind request returned %d for program %u vers %u: %s\n", error, protocol, vers,
+           (saddr->sa_family == AF_LOCAL) ? ((struct sockaddr_un *)saddr)->sun_path :
+           (saddr->sa_family == AF_INET6) ? "INET6 socket" : "INET socket");
 
        /* grab port from portmap response */
        if (ip == 4) {
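
Changing the helper's parameter from an IP protocol number to a socket type lets one routine speak both registries: portmap v2 (ip == 4) still sends IPPROTO_TCP or IPPROTO_UDP on the wire, while rpcbind v4 identifies the transport by netid string, including the local-domain netids used above. A sketch of the netid selection:

    #include <sys/socket.h>

    /* sketch: pick an rpcbind v4 netid for a family/socket-type pair */
    static const char *
    netid_for(int family, int sotype)
    {
            if (family == AF_LOCAL) {
                    return (sotype == SOCK_STREAM) ? "ticotsord" : "ticlts";
            }
            return (sotype == SOCK_STREAM) ? "tcp6" : "udp6";
    }
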
@@ -5753,9 +5964,15 @@ tryagain:
                        if (ualen < 1) {
                                /* program is not available, just return a zero port */
                                bcopy(sa, saddr, min(sizeof(ss), sa->sa_len));
-                               ((struct sockaddr_in6*)saddr)->sin6_port = htons(0);
+                               if (ip == 6) {
+                                       ((struct sockaddr_in6*)saddr)->sin6_port = htons(0);
+                               } else {
+                                       ((struct sockaddr_un*)saddr)->sun_path[0] = '\0';
+                               }
+                               NFS_SOCK_DBG("Program %u version %u unavailable\n", protocol, vers);
                        } else {
                                nfsm_chain_get_opaque(error, &nmrep, ualen, uaddr);
+                               NFS_SOCK_DBG("Got uaddr %s\n", uaddr);
                                if (!error) {
                                        uaddr[ualen] = '\0';
                                        if (!nfs_uaddr2sockaddr(uaddr, saddr)) {
@@ -5785,6 +6002,8 @@ tryagain:
 nfsmout:
        nfsm_chain_cleanup(&nmreq);
        nfsm_chain_cleanup(&nmrep);
+       NFS_SOCK_DBG("Returned %d\n", error);
+
        return error;
 }
 
@@ -6247,6 +6466,9 @@ nfsrv_send(struct nfsrv_sock *slp, mbuf_t nam, mbuf_t top)
                        msg.msg_namelen = sendnam->sa_len;
                }
        }
+       if (NFS_IS_DBG(NFS_FAC_SRV, 15)) {
+               nfs_dump_mbuf(__func__, __LINE__, "nfsrv_send", top);
+       }
        error = sock_sendmbuf(so, &msg, top, 0, NULL);
        if (!error) {
                return 0;
index 6b8cf9140f60e0ee148609d1a54b71234a4d5482..6a6878fc5f538526da109627b87f07dc05728dba 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -79,6 +79,7 @@
 #include <sys/vnode_internal.h>
 #include <sys/kpi_mbuf.h>
 #include <sys/socket.h>
+#include <sys/un.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/syscall.h>
@@ -88,6 +89,7 @@
 #include <sys/domain.h>
 #include <libkern/OSAtomic.h>
 #include <kern/thread_call.h>
+#include <kern/task.h>
 
 #include <sys/vm.h>
 #include <sys/vmparam.h>
@@ -889,7 +891,10 @@ nfsm_chain_add_v2sattr_f(struct nfsm_chain *nmc, struct vnode_attr *vap, uint32_
  * Add an NFSv3 "sattr" structure to an mbuf chain
  */
 int
-nfsm_chain_add_v3sattr_f(struct nfsm_chain *nmc, struct vnode_attr *vap)
+nfsm_chain_add_v3sattr_f(
+       struct nfsmount *nmp,
+       struct nfsm_chain *nmc,
+       struct vnode_attr *vap)
 {
        int error = 0;
 
@@ -937,6 +942,7 @@ nfsm_chain_add_v3sattr_f(struct nfsm_chain *nmc, struct vnode_attr *vap)
                }
        }
 
+
        return error;
 }
 
@@ -948,6 +954,7 @@ nfsm_chain_add_v3sattr_f(struct nfsm_chain *nmc, struct vnode_attr *vap)
  */
 int
 nfsm_chain_get_fh_attr(
+       struct nfsmount *nmp,
        struct nfsm_chain *nmc,
        nfsnode_t dnp,
        vfs_context_t ctx,
@@ -976,7 +983,7 @@ nfsm_chain_get_fh_attr(
                if (!gotfh) { /* skip attributes */
                        nfsm_chain_adv(error, nmc, NFSX_V3FATTR);
                } else { /* get attributes */
-                       error = nfs_parsefattr(nmc, nfsvers, nvap);
+                       error = nfs_parsefattr(nmp, nmc, nfsvers, nvap);
                }
        } else if (gotfh) {
                /* we need valid attributes in order to call nfs_nget() */
@@ -1146,6 +1153,7 @@ nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, in
                auth_len = ((uint32_t)groupcount + 5) * NFSX_UNSIGNED;
                break;
        }
+#if CONFIG_NFS_GSS
        case RPCAUTH_KRB5:
        case RPCAUTH_KRB5I:
        case RPCAUTH_KRB5P:
@@ -1154,6 +1162,7 @@ nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, in
                }
                auth_len = 5 * NFSX_UNSIGNED + 0;         // zero context handle for now
                break;
+#endif /* CONFIG_NFS_GSS */
        default:
                return EINVAL;
        }
@@ -1207,7 +1216,9 @@ nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, in
        nfsm_chain_add_32(error, &nmreq, vers);
        nfsm_chain_add_32(error, &nmreq, proc);
 
+#if CONFIG_NFS_GSS
 add_cred:
+#endif
        switch (auth_type) {
        case RPCAUTH_NONE:
                nfsm_chain_add_32(error, &nmreq, RPCAUTH_NONE); /* auth */
@@ -1223,7 +1234,9 @@ add_cred:
        case RPCAUTH_SYS: {
                nfsm_chain_add_32(error, &nmreq, RPCAUTH_SYS);
                nfsm_chain_add_32(error, &nmreq, authsiz);
-               nfsm_chain_add_32(error, &nmreq, 0);    /* stamp */
+               {
+                       nfsm_chain_add_32(error, &nmreq, 0);    /* stamp */
+               }
                nfsm_chain_add_32(error, &nmreq, 0);    /* zero-length hostname */
                nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(cred));      /* UID */
                nfsm_chain_add_32(error, &nmreq, kauth_cred_getgid(cred));      /* GID */
@@ -1243,6 +1256,7 @@ add_cred:
                }
                break;
        }
+#if CONFIG_NFS_GSS
        case RPCAUTH_KRB5:
        case RPCAUTH_KRB5I:
        case RPCAUTH_KRB5P:
@@ -1264,6 +1278,7 @@ add_cred:
                        goto add_cred;
                }
                break;
+#endif /* CONFIG_NFS_GSS */
        }
        ;
 
@@ -1304,7 +1319,11 @@ add_cred:
  * Parse an NFS file attribute structure out of an mbuf chain.
  */
 int
-nfs_parsefattr(struct nfsm_chain *nmc, int nfsvers, struct nfs_vattr *nvap)
+nfs_parsefattr(
+       struct nfsmount *nmp,
+       struct nfsm_chain *nmc,
+       int nfsvers,
+       struct nfs_vattr *nvap)
 {
        int error = 0;
        enum vtype vtype;
@@ -1407,10 +1426,12 @@ nfs_parsefattr(struct nfsm_chain *nmc, int nfsvers, struct nfs_vattr *nvap)
        nfsm_chain_get_time(error, nmc, nfsvers,
            nvap->nva_timesec[NFSTIME_CHANGE],
            nvap->nva_timensec[NFSTIME_CHANGE]);
+
 nfsmout:
        return error;
 }
 
+
 /*
  * Load the attribute cache (that lives in the nfsnode entry) with
  * the value pointed to by nvap, unless the file type in the attribute
@@ -1531,6 +1552,7 @@ nfs_loadattrcache(
                } else if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP) &&
                    (nvap->nva_gid != npnvap->nva_gid)) {
                        events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS;
+#if CONFIG_NFS4
                } else if (nmp->nm_vers >= NFS_VER4) {
                        if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER) &&
                            !kauth_guid_equal(&nvap->nva_uuuid, &npnvap->nva_uuuid)) {
@@ -1544,11 +1566,15 @@ nfs_loadattrcache(
                            bcmp(nvap->nva_acl, npnvap->nva_acl, KAUTH_ACL_COPYSIZE(nvap->nva_acl))))) {
                                events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS;
                        }
+#endif
                }
-               if (((nmp->nm_vers >= NFS_VER4) && (nvap->nva_change != npnvap->nva_change)) ||
-                   (NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_TIME_MODIFY) &&
-                   ((nvap->nva_timesec[NFSTIME_MODIFY] != npnvap->nva_timesec[NFSTIME_MODIFY]) ||
-                   (nvap->nva_timensec[NFSTIME_MODIFY] != npnvap->nva_timensec[NFSTIME_MODIFY])))) {
+               if (/* Oh, C... */
+#if CONFIG_NFS4
+                       ((nmp->nm_vers >= NFS_VER4) && (nvap->nva_change != npnvap->nva_change)) ||
+#endif
+                       (NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_TIME_MODIFY) &&
+                       ((nvap->nva_timesec[NFSTIME_MODIFY] != npnvap->nva_timesec[NFSTIME_MODIFY]) ||
+                       (nvap->nva_timensec[NFSTIME_MODIFY] != npnvap->nva_timensec[NFSTIME_MODIFY])))) {
                        events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_WRITE;
                }
                if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_RAWDEV) &&
@@ -1625,6 +1651,7 @@ nfs_loadattrcache(
        }
 
 #if CONFIG_TRIGGERS
+#if CONFIG_NFS4
        /*
         * For NFSv4, if the fsid doesn't match the fsid for the mount, then
         * this node is for a different file system on the server.  So we mark
@@ -1635,7 +1662,8 @@ nfs_loadattrcache(
            (np->n_vattr.nva_fsid.minor != nmp->nm_fsid.minor))) {
                np->n_vattr.nva_flags |= NFS_FFLAG_TRIGGER;
        }
-#endif
+#endif /* CONFIG_NFS4 */
+#endif /* CONFIG_TRIGGERS */
 
        if (!vp || (nvap->nva_type != VREG)) {
                np->n_size = nvap->nva_size;
@@ -1703,11 +1731,13 @@ nfs_attrcachetimeout(nfsnode_t np)
        }
 
        isdir = vnode_isdir(NFSTOV(np));
-
+#if CONFIG_NFS4
        if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK)) {
                /* If we have a delegation, we always use the max timeout. */
                timeo = isdir ? nmp->nm_acdirmax : nmp->nm_acregmax;
-       } else if ((np)->n_flag & NMODIFIED) {
+       } else
+#endif
+       if ((np)->n_flag & NMODIFIED) {
                /* If we have modifications, we always use the min timeout. */
                timeo = isdir ? nmp->nm_acdirmin : nmp->nm_acregmin;
        } else {
@@ -1914,8 +1944,19 @@ nfs_uaddr2sockaddr(const char *uaddr, struct sockaddr *addr)
        unsigned long val;      /* decoded value */
        int s;                  /* index used for sliding array to insert elided zeroes */
 
+       /* AF_LOCAL addresses are paths that start with '/' or are empty */
+       if (*uaddr == '/' || *uaddr == '\0') { /* AF_LOCAL address */
+               struct sockaddr_un *sun = (struct sockaddr_un *)addr;
+               sun->sun_family = AF_LOCAL;
+               sun->sun_len = sizeof(struct sockaddr_un);
+               strlcpy(sun->sun_path, uaddr, sizeof(sun->sun_path));
+
+               return 1;
+       }
+
 #define HEXVALUE        0
 #define DECIMALVALUE    1
+
 #define GET(TYPE) \
        do { \
                if ((dcount <= 0) || (dcount > (((TYPE) == DECIMALVALUE) ? 3 : 4))) \
@@ -2104,20 +2145,57 @@ uint32_t nfs_debug_ctl;
 #include <stdarg.h>
 
 void
-nfs_printf(int facility, int level, const char *fmt, ...)
+nfs_printf(unsigned int facility, unsigned int level, const char *fmt, ...)
 {
        va_list ap;
 
-       if ((uint32_t)level > NFS_DEBUG_LEVEL) {
-               return;
+       if (NFS_IS_DBG(facility, level)) {
+               va_start(ap, fmt);
+               vprintf(fmt, ap);
+               va_end(ap);
        }
-       if (NFS_DEBUG_FACILITY && !((uint32_t)facility & NFS_DEBUG_FACILITY)) {
-               return;
+}
+
+
+#define DISPLAYLEN 16
+
+static bool
+isprint(int ch)
+{
+       return ch >= 0x20 && ch <= 0x7e;
+}
+
+static void
+hexdump(void *data, size_t len)
+{
+       size_t i, j;
+       unsigned char *d = data;
+       char *p, disbuf[3 * DISPLAYLEN + 1];
+
+       for (i = 0; i < len; i += DISPLAYLEN) {
+               for (p = disbuf, j = 0; (j + i) < len && j < DISPLAYLEN; j++, p += 3) {
+                       snprintf(p, 4, "%2.2x ", d[i + j]);
+               }
+               for (; j < DISPLAYLEN; j++, p += 3) {
+                       snprintf(p, 4, "   ");
+               }
+               printf("%s    ", disbuf);
+               for (p = disbuf, j = 0; (j + i) < len && j < DISPLAYLEN; j++, p++) {
+                       snprintf(p, 2, "%c", isprint(d[i + j]) ? d[i + j] : '.');
+               }
+               printf("%s\n", disbuf);
        }
+}
 
-       va_start(ap, fmt);
-       vprintf(fmt, ap);
-       va_end(ap);
+void
+nfs_dump_mbuf(const char *func, int lineno, const char *msg, mbuf_t mb)
+{
+       mbuf_t m;
+
+       printf("%s:%d %s\n", func, lineno, msg);
+       for (m = mb; m; m = mbuf_next(m)) {
+               hexdump(mbuf_data(m), mbuf_len(m));
+       }
 }
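
nfs_printf() now defers entirely to the same NFS_IS_DBG() gate used by NFS_SOCK_DBG, NFS_VFS_DBG, and the mbuf dumps above (each 16-byte hexdump row is printed as hex followed by its ASCII rendering). A sketch of the gate, assuming the same nfs_debug_ctl split implied by the removed code — a low-order level field plus a facility mask; the exact bit layout here is an assumption:

    #include <stdint.h>

    extern uint32_t nfs_debug_ctl;

    /* assumed layout: low nibble = verbosity level, upper bits = facility mask */
    #define NFS_DEBUG_LEVEL     (nfs_debug_ctl & 0x0f)
    #define NFS_DEBUG_FACILITY  (nfs_debug_ctl >> 4)
    #define NFS_IS_DBG(fac, lev) \
            ((NFS_DEBUG_FACILITY & (fac)) && ((lev) <= NFS_DEBUG_LEVEL))
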
 
 /* Is a mount gone away? */
index a27683203ce2ea2a585dd55e09ebe5bd5f93aed4..78d83c9519360381ec63a719ea5bece6eee0a5d0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc.  All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc.  All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -170,8 +170,12 @@ SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, is_mobile, CTLFLAG_RW | CTLFLAG_LO
 SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, squishy_flags, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_squishy_flags, 0, "");
 SYSCTL_UINT(_vfs_generic_nfs_client, OID_AUTO, debug_ctl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_debug_ctl, 0, "");
 SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, readlink_nocache, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_readlink_nocache, 0, "");
+#if CONFIG_NFS_GSS
 SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, root_steals_gss_context, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_root_steals_ctx, 0, "");
+#endif
+#if CONFIG_NFS4
 SYSCTL_STRING(_vfs_generic_nfs_client, OID_AUTO, default_nfs4domain, CTLFLAG_RW | CTLFLAG_LOCKED, nfs4_default_domain, sizeof(nfs4_default_domain), "");
+#endif
 #endif /* NFSCLIENT */
 
 #if NFSSERVER
@@ -203,11 +207,11 @@ SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_count, CTLFLAG_RD | C
 
 #if NFSCLIENT
 
+#if CONFIG_NFS4
 static int
 mapname2id(struct nfs_testmapid *map)
 {
        int error;
-
        error = nfs4_id2guid(map->ntm_name, &map->ntm_guid, map->ntm_grpflag);
        if (error) {
                return error;
@@ -257,6 +261,8 @@ nfsclnt_testidmap(proc_t p, user_addr_t argp)
        }
 
        error = copyin(argp, &mapid, sizeof(mapid));
+       mapid.ntm_name[MAXIDNAMELEN - 1] = '\0';
+
        if (error) {
                return error;
        }
@@ -281,6 +287,7 @@ nfsclnt_testidmap(proc_t p, user_addr_t argp)
 
        return error ? error : coerror;
 }
+#endif
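
The ntm_name assignment added in nfsclnt_testidmap() above closes a classic copyin() hole: a fixed-size name copied from user space carries no guarantee of NUL termination. The pattern in isolation (names are illustrative, not xnu code):

    #include <string.h>

    /* always terminate a fixed-size, user-supplied buffer before
     * treating it as a C string */
    static void
    force_nul(char *buf, size_t buflen)
    {
            if (buflen > 0) {
                    buf[buflen - 1] = '\0';
            }
    }
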
 
 int
 nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval)
@@ -298,9 +305,11 @@ nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval)
        case NFSCLNT_LOCKDNOTIFY:
                error = nfslockdnotify(p, uap->argp);
                break;
+#if CONFIG_NFS4
        case NFSCLNT_TESTIDMAP:
                error = nfsclnt_testidmap(p, uap->argp);
                break;
+#endif
        default:
                error = EINVAL;
        }
@@ -907,6 +916,8 @@ nfssvc_addsock(socket_t so, mbuf_t mynam)
        if (sotype == SOCK_STREAM) {
                error = nfsrv_check_exports_allow_address(mynam);
                if (error) {
+                       log(LOG_INFO, "nfsvc_addsock:: nfsrv_check_exports_allow_address(myname) returned %d\n", error);
+                       mbuf_freem(mynam);
                        return error;
                }
                sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
@@ -914,8 +925,8 @@ nfssvc_addsock(socket_t so, mbuf_t mynam)
        if ((sodomain == AF_INET) && (soprotocol == IPPROTO_TCP)) {
                sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
        }
-       if (sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */
-               int reserve = NFS_UDPSOCKBUF;
+       if (sotype == SOCK_DGRAM || sodomain == AF_LOCAL) { /* set socket buffer sizes for UDP */
+               int reserve = (sotype == SOCK_DGRAM) ? NFS_UDPSOCKBUF : (2 * 1024 * 1024);
                error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve));
                error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve));
                if (error) {
@@ -977,7 +988,7 @@ nfssvc_addsock(socket_t so, mbuf_t mynam)
        /* add the socket to the list */
        first = TAILQ_EMPTY(&nfsrv_socklist);
        TAILQ_INSERT_TAIL(&nfsrv_socklist, slp, ns_chain);
-       if (soprotocol == IPPROTO_TCP) {
+       if (sotype == SOCK_STREAM) {
                nfsrv_sock_tcp_cnt++;
                if (nfsrv_sock_idle_timeout < 0) {
                        nfsrv_sock_idle_timeout = 0;
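
In nfssvc_addsock() above, AF_LOCAL sockets now get the same explicit buffer sizing as UDP, but with a 2 MiB reservation, and stream sockets of any family count toward the TCP socket limit. A user-space analogue of the two setsockopt() calls, with error handling simplified (NFS_UDPSOCKBUF's value is not shown in this diff, so a placeholder is used):

    #include <sys/socket.h>
    #include <stdio.h>

    static int
    reserve_bufs(int fd, int sotype)
    {
            int reserve = (sotype == SOCK_DGRAM) ? 64 * 1024   /* placeholder for NFS_UDPSOCKBUF */
                                                 : 2 * 1024 * 1024;

            if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve)) != 0 ||
                setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve)) != 0) {
                    perror("setsockopt");
                    return -1;
            }
            return 0;
    }
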
index b6dced9066e1c64e416c9ec2cf961b64baab1f56..9b83d3fc6cd34398dc1766aa121e35a530855033 100644 (file)
@@ -333,7 +333,7 @@ nfsrv_uc_proxy(socket_t so, void *arg, int waitflag)
        TAILQ_INSERT_TAIL(myqueue->ucq_queue, uap, nua_svcq);
 
        uap->nua_flags |= NFS_UC_QUEUED;
-       if (myqueue->ucq_flags | NFS_UC_QUEUE_SLEEPING) {
+       if (myqueue->ucq_flags & NFS_UC_QUEUE_SLEEPING) {
                wakeup(myqueue);
        }
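
The one-character fix above is worth spelling out: with a nonzero flag bit, `flags | NFS_UC_QUEUE_SLEEPING` is always true, so the old code issued a wakeup() for every queued upcall rather than only when the queue thread was actually sleeping. In miniature (the flag value is assumed for illustration):

    #include <assert.h>

    #define NFS_UC_QUEUE_SLEEPING 0x0001    /* value assumed for illustration */

    int
    main(void)
    {
            unsigned int flags = 0;         /* sleeper bit not set */

            assert((flags | NFS_UC_QUEUE_SLEEPING) != 0);   /* old test: true anyway */
            assert((flags & NFS_UC_QUEUE_SLEEPING) == 0);   /* fixed test: false */
            return 0;
    }
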
 
index 1ac2b3bd5123c517b0f048ea2140eb615b489c0c..67b409bae2d4a6ea720e93b60094ba9aee7b8aa2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -85,6 +85,7 @@
 #include <sys/mount_internal.h>
 #include <sys/kpi_mbuf.h>
 #include <sys/socket.h>
+#include <sys/un.h>
 #include <sys/socketvar.h>
 #include <sys/fcntl.h>
 #include <sys/quota.h>
@@ -178,10 +179,12 @@ int nfs_tprintf_delay = NFS_TPRINTF_DELAY;
 
 
 int             mountnfs(char *, mount_t, vfs_context_t, vnode_t *);
+#if CONFIG_NETBOOT
 static int      nfs_mount_diskless(struct nfs_dlmount *, const char *, int, vnode_t *, mount_t *, vfs_context_t);
 #if !defined(NO_MOUNT_PRIVATE)
 static int      nfs_mount_diskless_private(struct nfs_dlmount *, const char *, int, vnode_t *, mount_t *, vfs_context_t);
 #endif /* NO_MOUNT_PRIVATE */
+#endif
 int             nfs_mount_connect(struct nfsmount *);
 void            nfs_mount_drain_and_cleanup(struct nfsmount *);
 void            nfs_mount_cleanup(struct nfsmount *);
@@ -238,47 +241,49 @@ int nfs4_getquota(struct nfsmount *, vfs_context_t, uid_t, int, struct dqblk *);
 #endif
 
 const struct nfs_funcs nfs3_funcs = {
-       nfs3_mount,
-       nfs3_update_statfs,
-       nfs3_getquota,
-       nfs3_access_rpc,
-       nfs3_getattr_rpc,
-       nfs3_setattr_rpc,
-       nfs3_read_rpc_async,
-       nfs3_read_rpc_async_finish,
-       nfs3_readlink_rpc,
-       nfs3_write_rpc_async,
-       nfs3_write_rpc_async_finish,
-       nfs3_commit_rpc,
-       nfs3_lookup_rpc_async,
-       nfs3_lookup_rpc_async_finish,
-       nfs3_remove_rpc,
-       nfs3_rename_rpc,
-       nfs3_setlock_rpc,
-       nfs3_unlock_rpc,
-       nfs3_getlock_rpc
+       .nf_mount = nfs3_mount,
+       .nf_update_statfs = nfs3_update_statfs,
+       .nf_getquota = nfs3_getquota,
+       .nf_access_rpc = nfs3_access_rpc,
+       .nf_getattr_rpc = nfs3_getattr_rpc,
+       .nf_setattr_rpc = nfs3_setattr_rpc,
+       .nf_read_rpc_async = nfs3_read_rpc_async,
+       .nf_read_rpc_async_finish = nfs3_read_rpc_async_finish,
+       .nf_readlink_rpc = nfs3_readlink_rpc,
+       .nf_write_rpc_async = nfs3_write_rpc_async,
+       .nf_write_rpc_async_finish = nfs3_write_rpc_async_finish,
+       .nf_commit_rpc = nfs3_commit_rpc,
+       .nf_lookup_rpc_async = nfs3_lookup_rpc_async,
+       .nf_lookup_rpc_async_finish = nfs3_lookup_rpc_async_finish,
+       .nf_remove_rpc = nfs3_remove_rpc,
+       .nf_rename_rpc = nfs3_rename_rpc,
+       .nf_setlock_rpc = nfs3_setlock_rpc,
+       .nf_unlock_rpc = nfs3_unlock_rpc,
+       .nf_getlock_rpc = nfs3_getlock_rpc
 };
+#if CONFIG_NFS4
 const struct nfs_funcs nfs4_funcs = {
-       nfs4_mount,
-       nfs4_update_statfs,
-       nfs4_getquota,
-       nfs4_access_rpc,
-       nfs4_getattr_rpc,
-       nfs4_setattr_rpc,
-       nfs4_read_rpc_async,
-       nfs4_read_rpc_async_finish,
-       nfs4_readlink_rpc,
-       nfs4_write_rpc_async,
-       nfs4_write_rpc_async_finish,
-       nfs4_commit_rpc,
-       nfs4_lookup_rpc_async,
-       nfs4_lookup_rpc_async_finish,
-       nfs4_remove_rpc,
-       nfs4_rename_rpc,
-       nfs4_setlock_rpc,
-       nfs4_unlock_rpc,
-       nfs4_getlock_rpc
+       .nf_mount = nfs4_mount,
+       .nf_update_statfs = nfs4_update_statfs,
+       .nf_getquota = nfs4_getquota,
+       .nf_access_rpc = nfs4_access_rpc,
+       .nf_getattr_rpc = nfs4_getattr_rpc,
+       .nf_setattr_rpc = nfs4_setattr_rpc,
+       .nf_read_rpc_async = nfs4_read_rpc_async,
+       .nf_read_rpc_async_finish = nfs4_read_rpc_async_finish,
+       .nf_readlink_rpc = nfs4_readlink_rpc,
+       .nf_write_rpc_async = nfs4_write_rpc_async,
+       .nf_write_rpc_async_finish = nfs4_write_rpc_async_finish,
+       .nf_commit_rpc = nfs4_commit_rpc,
+       .nf_lookup_rpc_async = nfs4_lookup_rpc_async,
+       .nf_lookup_rpc_async_finish = nfs4_lookup_rpc_async_finish,
+       .nf_remove_rpc = nfs4_remove_rpc,
+       .nf_rename_rpc = nfs4_rename_rpc,
+       .nf_setlock_rpc = nfs4_setlock_rpc,
+       .nf_unlock_rpc = nfs4_unlock_rpc,
+       .nf_getlock_rpc = nfs4_getlock_rpc
 };
+#endif
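
Rewriting the two dispatch tables with C99 designated initializers (the style also adopted for the timespec and nfs_location_index literals later in this diff) makes each slot self-describing and robust against struct reordering; any member left out is zero-initialized. A generic illustration, not xnu code:

    struct ops {
            int (*op_open)(void);
            int (*op_close)(void);
    };

    static int
    example_open(void)
    {
            return 0;
    }

    /* order no longer matters, and .op_close is implicitly NULL */
    static const struct ops example_ops = {
            .op_open = example_open,
    };
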
 
 /*
  * Called once to initialize data structures...
@@ -286,8 +291,9 @@ const struct nfs_funcs nfs4_funcs = {
 int
 nfs_vfs_init(__unused struct vfsconf *vfsp)
 {
+#if CONFIG_NFS4
        int i;
-
+#endif
        /*
         * Check to see if major data structures haven't bloated.
         */
@@ -328,8 +334,11 @@ nfs_vfs_init(__unused struct vfsconf *vfsp)
        nfs_nbinit();                   /* Init the nfsbuf table */
        nfs_nhinit();                   /* Init the nfsnode table */
        nfs_lockinit();                 /* Init the nfs lock state */
+#if CONFIG_NFS_GSS
        nfs_gss_init();                 /* Init RPCSEC_GSS security */
+#endif
 
+#if CONFIG_NFS4
        /* NFSv4 stuff */
        NFS4_PER_FS_ATTRIBUTES(nfs_fs_attr_bitmap);
        NFS4_PER_OBJECT_ATTRIBUTES(nfs_object_attr_bitmap);
@@ -338,15 +347,18 @@ nfs_vfs_init(__unused struct vfsconf *vfsp)
                nfs_getattr_bitmap[i] &= nfs_object_attr_bitmap[i];
        }
        TAILQ_INIT(&nfsclientids);
+#endif
 
        /* initialize NFS timer callouts */
        nfs_request_timer_call = thread_call_allocate(nfs_request_timer, NULL);
        nfs_buf_timer_call = thread_call_allocate(nfs_buf_timer, NULL);
+#if CONFIG_NFS4
        nfs4_callback_timer_call = thread_call_allocate(nfs4_callback_timer, NULL);
-
+#endif
        return 0;
 }
 
+
 /*
  * nfs statfs call
  */
@@ -434,6 +446,7 @@ nfsmout:
        return error;
 }
 
+#if CONFIG_NFS4
 int
 nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx)
 {
@@ -506,16 +519,22 @@ nfsmout:
        vnode_put(NFSTOV(np));
        return error;
 }
+#endif /* CONFIG_NFS4 */
+
 
 /*
  * Return an NFS volume name from the mntfrom name.
  */
 static void
-nfs_get_volname(struct mount *mp, char *volname, size_t len)
+nfs_get_volname(struct mount *mp, char *volname, size_t len, vfs_context_t ctx)
 {
        const char *ptr, *cptr;
        const char *mntfrom = mp->mnt_vfsstat.f_mntfromname;
-       size_t mflen = strnlen(mntfrom, MAXPATHLEN + 1);
+       struct nfsmount *nmp = VFSTONFS(mp);
+       size_t mflen;
+
+
+       mflen = strnlen(mntfrom, MAXPATHLEN + 1);
 
        if (mflen > MAXPATHLEN || mflen == 0) {
                strlcpy(volname, "Bad volname", len);
@@ -557,6 +576,7 @@ nfs_get_volname(struct mount *mp, char *volname, size_t len)
        strlcpy(volname, ptr, len);
 }
 
+
 /*
  * The NFS VFS_GETATTR function: "statfs"-type information is retrieved
  * using the nf_update_statfs() function, and other attributes are cobbled
@@ -646,10 +666,11 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx)
 
        if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
                /*%%% IF fail over support is implemented we may need to take nm_lock */
-               nfs_get_volname(mp, fsap->f_vol_name, MAXPATHLEN);
+               nfs_get_volname(mp, fsap->f_vol_name, MAXPATHLEN, ctx);
                VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
        }
-       if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) {
+       if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)
+           ) {
                u_int32_t caps, valid;
                nfsnode_t np = nmp->nm_dnp;
 
@@ -663,10 +684,10 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx)
                 * The capabilities[] array defines what this volume supports.
                 *
                 * The valid[] array defines which bits this code understands
-                * the meaning of (whether the volume has that capability or not).
-                * Any zero bits here means "I don't know what you're asking about"
-                * and the caller cannot tell whether that capability is
-                * present or not.
+                * the meaning of (whether the volume has that capability or
+                * not).  Any zero bits here means "I don't know what you're
+                * asking about" and the caller cannot tell whether that
+                * capability is present or not.
                 */
                caps = valid = 0;
                if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_SYMLINK_SUPPORT)) {
@@ -706,6 +727,7 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx)
                         */
                        caps |= VOL_CAP_FMT_2TB_FILESIZE;
                }
+#if CONFIG_NFS4
                if (nfsvers >= NFS_VER4) {
                        caps |= VOL_CAP_FMT_HIDDEN_FILES;
                        valid |= VOL_CAP_FMT_HIDDEN_FILES;
@@ -713,6 +735,7 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx)
 //                     caps |= VOL_CAP_FMT_OPENDENYMODES;
 //                     valid |= VOL_CAP_FMT_OPENDENYMODES;
                }
+#endif
                // no version of nfs supports immutable files
                caps |= VOL_CAP_FMT_NO_IMMUTABLE_FILES;
                valid |= VOL_CAP_FMT_NO_IMMUTABLE_FILES;
@@ -753,16 +776,18 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx)
                /*
                 * We don't support most of the interfaces.
                 *
-                * We MAY support locking, but we don't have any easy way of probing.
-                * We can tell if there's no lockd running or if locks have been
-                * disabled for a mount, so we can definitely answer NO in that case.
-                * Any attempt to send a request to lockd to test for locking support
-                * may cause the lazily-launched locking daemons to be started
-                * unnecessarily.  So we avoid that.  However, we do record if we ever
-                * successfully perform a lock operation on a mount point, so if it
-                * looks like lock ops have worked, we do report that we support them.
+                * We MAY support locking, but we don't have any easy way of
+                * probing.  We can tell if there's no lockd running or if
+                * locks have been disabled for a mount, so we can definitely
+                * answer NO in that case.  Any attempt to send a request to
+                * lockd to test for locking support may cause the lazily-
+                * launched locking daemons to be started unnecessarily.  So
+                * we avoid that.  However, we do record if we ever successfully
+                * perform a lock operation on a mount point, so if it looks
+                * like lock ops have worked, we do report that we support them.
                 */
                caps = valid = 0;
+#if CONFIG_NFS4
                if (nfsvers >= NFS_VER4) {
                        caps = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK;
                        valid = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK;
@@ -780,7 +805,9 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx)
                        }
                        valid |= VOL_CAP_INT_NAMEDSTREAMS;
 #endif
-               } else if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) {
+               } else
+#endif
+               if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) {
                        /* locks disabled on this mount, so they definitely won't work */
                        valid = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK;
                } else if (nmp->nm_state & NFSSTA_LOCKSWORK) {
@@ -980,6 +1007,7 @@ nfsmout:
  *     if swdevt[0].sw_dev == NODEV
  * - build the rootfs mount point and call mountnfs() to do the rest.
  */
+#if CONFIG_NETBOOT
 int
 nfs_mountroot(void)
 {
@@ -1341,7 +1369,7 @@ nfs_mount_diskless_private(
        uint32_t argslength_offset, attrslength_offset, end_offset;
 
        procp = current_proc(); /* XXX */
-       xb_init(&xb, 0);
+       xb_init(&xb, XDRBUF_NONE);
 
        {
                /*
@@ -1592,6 +1620,8 @@ out:
 }
 #endif /* NO_MOUNT_PRIVATE */
 
+#endif
+
 /*
  * Convert old style NFS mount args to XDR.
  */
@@ -2158,6 +2188,7 @@ out:
        return error;
 }
 
+#if CONFIG_NFS4
 /*
  * Update an NFSv4 mount path with the contents of the symlink.
  *
@@ -2763,6 +2794,7 @@ nfsmout:
        nfsm_chain_cleanup(&nmrep);
        return error;
 }
+#endif /* CONFIG_NFS4 */
 
 /*
  * Thread to handle initial NFS mount connection.
@@ -2844,7 +2876,7 @@ nfs_mount_connect(struct nfsmount *nmp)
 {
        int error = 0, slpflag;
        thread_t thd;
-       struct timespec ts = { 2, 0 };
+       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
 
        /*
         * Set up the socket.  Perform initial search for a location/server/address to
@@ -2923,7 +2955,13 @@ mountnfs(
        uint32_t *mflags_mask;
        uint32_t *mflags;
        uint32_t argslength, attrslength;
-       struct nfs_location_index firstloc = { NLI_VALID, 0, 0, 0 };
+       uid_t set_owner;
+       struct nfs_location_index firstloc = {
+               .nli_flags = NLI_VALID,
+               .nli_loc = 0,
+               .nli_serv = 0,
+               .nli_addr = 0
+       };
        static const struct nfs_etype nfs_default_etypes = {
                .count = NFS_MAX_ETYPES,
                .selected = NFS_MAX_ETYPES,
@@ -2931,6 +2969,7 @@ mountnfs(
                            NFS_AES128_CTS_HMAC_SHA1_96,
                            NFS_DES3_CBC_SHA1_KD}
        };
+
        /* make sure mbuf constants are set up */
        if (!nfs_mbuf_mhlen) {
                nfs_mbuf_init();
@@ -3115,11 +3154,13 @@ mountnfs(
                switch (val) {
                case NFS_LOCK_MODE_DISABLED:
                case NFS_LOCK_MODE_LOCAL:
+#if CONFIG_NFS4
                        if (nmp->nm_vers >= NFS_VER4) {
                                /* disabled/local lock mode only allowed on v2/v3 */
                                error = EINVAL;
                                break;
                        }
+#endif
                /* FALLTHROUGH */
                case NFS_LOCK_MODE_ENABLED:
                        nmp->nm_lockmode = val;
@@ -3184,10 +3225,11 @@ mountnfs(
                xb_get_32(error, &xb, nmp->nm_numgrps);
        }
        if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOCKET_TYPE)) {
-               char sotype[6];
+               char sotype[16];
 
+               *sotype = '\0';
                xb_get_32(error, &xb, val);
-               if (!error && ((val < 3) || (val > 5))) {
+               if (!error && ((val < 3) || (val > sizeof(sotype)))) {
                        error = EINVAL;
                }
                nfsmerr_if(error);
@@ -3216,13 +3258,24 @@ mountnfs(
                        nmp->nm_sofamily = AF_INET6;
                } else if (!strcmp(sotype, "inet")) {
                        nmp->nm_sofamily = 0; /* ok */
+               } else if (!strcmp(sotype, "ticotsord")) {
+                       nmp->nm_sofamily = AF_LOCAL;
+                       nmp->nm_sotype = SOCK_STREAM;
+               } else if (!strcmp(sotype, "ticlts")) {
+                       nmp->nm_sofamily = AF_LOCAL;
+                       nmp->nm_sotype = SOCK_DGRAM;
                } else {
                        error = EINVAL;
                }
+#if CONFIG_NFS4
                if (!error && (nmp->nm_vers >= NFS_VER4) && nmp->nm_sotype &&
                    (nmp->nm_sotype != SOCK_STREAM)) {
                        error = EINVAL;         /* NFSv4 is only allowed over TCP. */
                }
+#endif
+               if (error) {
+                       NFS_VFS_DBG("EINVAL sotype = \"%s\"\n", sotype);
+               }
                nfsmerr_if(error);
        }
        if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_PORT)) {
@@ -3279,6 +3332,7 @@ mountnfs(
                xb_get_32(error, &xb, nmp->nm_locations.nl_numlocs); /* fs location count */
                /* sanity check location count */
                if (!error && ((nmp->nm_locations.nl_numlocs < 1) || (nmp->nm_locations.nl_numlocs > 256))) {
+                       NFS_VFS_DBG("Invalid number of fs_locations: %d", nmp->nm_locations.nl_numlocs);
                        error = EINVAL;
                }
                nfsmerr_if(error);
@@ -3296,12 +3350,14 @@ mountnfs(
                        xb_get_32(error, &xb, fsl->nl_servcount); /* server count */
                        /* sanity check server count */
                        if (!error && ((fsl->nl_servcount < 1) || (fsl->nl_servcount > 256))) {
+                               NFS_VFS_DBG("Invalid server count %d", fsl->nl_servcount);
                                error = EINVAL;
                        }
                        nfsmerr_if(error);
                        MALLOC(fsl->nl_servers, struct nfs_fs_server **, fsl->nl_servcount * sizeof(struct nfs_fs_server*), M_TEMP, M_WAITOK | M_ZERO);
                        if (!fsl->nl_servers) {
                                error = ENOMEM;
+                               NFS_VFS_DBG("Server count = %d, error = %d\n", fsl->nl_servcount, error);
                        }
                        for (serv = 0; serv < fsl->nl_servcount; serv++) {
                                nfsmerr_if(error);
@@ -3312,7 +3368,8 @@ mountnfs(
                                fsl->nl_servers[serv] = fss;
                                xb_get_32(error, &xb, val); /* server name length */
                                /* sanity check server name length */
-                               if (!error && ((val < 1) || (val > MAXPATHLEN))) {
+                               if (!error && (val > MAXPATHLEN)) {
+                                       NFS_VFS_DBG("Invalid server name length %d", val);
                                        error = EINVAL;
                                }
                                nfsmerr_if(error);
@@ -3325,6 +3382,7 @@ mountnfs(
                                xb_get_32(error, &xb, fss->ns_addrcount); /* address count */
                                /* sanity check address count (OK to be zero) */
                                if (!error && (fss->ns_addrcount > 256)) {
+                                       NFS_VFS_DBG("Invalid address count %d", fss->ns_addrcount);
                                        error = EINVAL;
                                }
                                nfsmerr_if(error);
@@ -3336,7 +3394,8 @@ mountnfs(
                                        for (addr = 0; addr < fss->ns_addrcount; addr++) {
                                                xb_get_32(error, &xb, val); /* address length */
                                                /* sanity check address length */
-                                               if (!error && ((val < 1) || (val > 128))) {
+                                               if (!error && val > 128) {
+                                                       NFS_VFS_DBG("Invalid address length %d", val);
                                                        error = EINVAL;
                                                }
                                                nfsmerr_if(error);
@@ -3356,6 +3415,7 @@ mountnfs(
                        xb_get_32(error, &xb, fsp->np_compcount); /* component count */
                        /* sanity check component count */
                        if (!error && (fsp->np_compcount > MAXPATHLEN)) {
+                               NFS_VFS_DBG("Invalid component count %d", fsp->np_compcount);
                                error = EINVAL;
                        }
                        nfsmerr_if(error);
@@ -3383,6 +3443,7 @@ mountnfs(
                                        continue;
                                }
                                if (!error && ((val < 1) || (val > MAXPATHLEN))) {
+                                       NFS_VFS_DBG("Invalid component path length %d", val);
                                        error = EINVAL;
                                }
                                nfsmerr_if(error);
@@ -3394,7 +3455,8 @@ mountnfs(
                                error = xb_get_bytes(&xb, fsp->np_components[comp], val, 0); /* component */
                        }
                        xb_get_32(error, &xb, val); /* fs location info length */
-                       xb_skip(error, &xb, val); /* skip fs location info */
+                       NFS_VFS_DBG("Skipping fs location info bytes %d", val);
+                       xb_skip(error, &xb, xdr_rndup(val)); /* skip fs location info */
                }
        }
        if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFLAGS)) {
@@ -3466,6 +3528,62 @@ mountnfs(
        }
        nfsmerr_if(error);
 
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_NFS_PORT)) {
+               if (nmp->nm_nfsport) {
+                       error = EINVAL;
+                       NFS_VFS_DBG("Can't have ports specified over incompatible socket families");
+               }
+               nfsmerr_if(error);
+               xb_get_32(error, &xb, len);
+               if (!error && ((len < 1) || (len > sizeof(((struct sockaddr_un *)0)->sun_path)))) {
+                       error = EINVAL;
+               }
+               nfsmerr_if(error);
+               MALLOC(nmp->nm_nfs_localport, char *, len + 1, M_TEMP, M_WAITOK | M_ZERO);
+               if (!nmp->nm_nfs_localport) {
+                       error = ENOMEM;
+               }
+               nfsmerr_if(error);
+               error = xb_get_bytes(&xb, nmp->nm_nfs_localport, len, 0);
+               nmp->nm_sofamily = AF_LOCAL;
+               nmp->nm_nfsport = 1; /* We use the now deprecated tcpmux port to indicate that we have an AF_LOCAL port */
+               NFS_VFS_DBG("Setting nfs local port %s (%d)\n", nmp->nm_nfs_localport, nmp->nm_nfsport);
+       }
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_MOUNT_PORT)) {
+               if (nmp->nm_mountport) {
+                       error = EINVAL;
+                       NFS_VFS_DBG("Can't have ports specified over mulitple socket families");
+               }
+               nfsmerr_if(error);
+               xb_get_32(error, &xb, len);
+               if (!error && ((len < 1) || (len > sizeof(((struct sockaddr_un *)0)->sun_path)))) {
+                       error = EINVAL;
+               }
+               nfsmerr_if(error);
+               MALLOC(nmp->nm_mount_localport, char *, len + 1, M_TEMP, M_WAITOK | M_ZERO);
+               if (!nmp->nm_mount_localport) {
+                       error = ENOMEM;
+               }
+               nfsmerr_if(error);
+               error = xb_get_bytes(&xb, nmp->nm_mount_localport, len, 0);
+               nmp->nm_sofamily = AF_LOCAL;
+               nmp->nm_mountport = 1; /* We use the now deprecated tcpmux port to indicate that we have an AF_LOCAL port */
+               NFS_VFS_DBG("Setting mount local port %s (%d)\n", nmp->nm_mount_localport, nmp->nm_mountport);
+       }
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SET_MOUNT_OWNER)) {
+               xb_get_32(error, &xb, set_owner);
+               nfsmerr_if(error);
+               error = vfs_context_suser(ctx);
+               /*
+                * root can set owner to whatever, user can set owner to self
+                */
+               if ((error) && (set_owner == kauth_cred_getuid(vfs_context_ucred(ctx)))) {
+                       /* ok: non-root may set the owner to itself */
+                       error = 0;
+               }
+               nfsmerr_if(error);
+       }
+
        /*
         * Sanity check/finalize settings.
         */
@@ -3498,10 +3616,11 @@ mountnfs(
        }
        nfsmerr_if(error);
 
-       /* init mount's mntfromname to first location */
        if (!NM_OMATTR_GIVEN(nmp, MNTFROM)) {
+               /* init mount's mntfromname to first location */
                nfs_location_mntfromname(&nmp->nm_locations, firstloc,
-                   vfs_statfs(mp)->f_mntfromname, sizeof(vfs_statfs(mp)->f_mntfromname), 0);
+                   vfs_statfs(mp)->f_mntfromname,
+                   sizeof(vfs_statfs(mp)->f_mntfromname), 0);
        }
 
        /* Need to save the mounting credential for v4. */
@@ -3520,21 +3639,29 @@ mountnfs(
        }
        nfsmerr_if(error);
 
-       /* do mount's initial socket connection */
-       error = nfs_mount_connect(nmp);
-       nfsmerr_if(error);
-
        /* set up the version-specific function tables */
        if (nmp->nm_vers < NFS_VER4) {
                nmp->nm_funcs = &nfs3_funcs;
        } else {
+#if CONFIG_NFS4
                nmp->nm_funcs = &nfs4_funcs;
+#else
+               /* don't go any further if we don't support NFS4 */
+               nmp->nm_funcs = NULL;
+               error = ENOTSUP;
+               nfsmerr_if(error);
+#endif
        }
 
+       /* do mount's initial socket connection */
+       error = nfs_mount_connect(nmp);
+       nfsmerr_if(error);
+
        /* sanity check settings now that version/connection is set */
        if (nmp->nm_vers == NFS_VER2) {         /* ignore RDIRPLUS on NFSv2 */
                NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_RDIRPLUS);
        }
+#if CONFIG_NFS4
        if (nmp->nm_vers >= NFS_VER4) {
                if (NFS_BITMAP_ISSET(nmp->nm_flags, NFS_MFLAG_ACLONLY)) { /* aclonly trumps noacl */
                        NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOACL);
@@ -3544,12 +3671,15 @@ mountnfs(
                        error = EINVAL; /* disabled/local lock mode only allowed on v2/v3 */
                }
        } else {
-               /* ignore these if not v4 */
-               NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOCALLBACK);
-               NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NAMEDATTR);
-               NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOACL);
-               NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_ACLONLY);
-       }
+#endif
+       /* ignore these if not v4 */
+       NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOCALLBACK);
+       NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NAMEDATTR);
+       NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOACL);
+       NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_ACLONLY);
+#if CONFIG_NFS4
+}
+#endif
        nfsmerr_if(error);
 
        if (nmp->nm_sotype == SOCK_DGRAM) {
@@ -3597,6 +3727,19 @@ mountnfs(
                TAILQ_INIT(&nmp->nm_cwndq);
        }
 
+       if (nmp->nm_saddr->sa_family == AF_LOCAL) {
+               struct sockaddr_un *un = (struct sockaddr_un *)nmp->nm_saddr;
+               size_t size;
+               int n = snprintf(vfs_statfs(mp)->f_mntfromname, sizeof(vfs_statfs(mp)->f_mntfromname), "<%s>:", un->sun_path);
+
+               if (n > 0 && (size_t)n < sizeof(vfs_statfs(mp)->f_mntfromname)) {
+                       size = sizeof(vfs_statfs(mp)->f_mntfromname) - n;
+                       nfs_location_mntfromname(&nmp->nm_locations, firstloc,
+                           &vfs_statfs(mp)->f_mntfromname[n], size, 1);
+               }
+       }
+
+
        /*
         * Get the root node/attributes from the NFS server and
         * do any basic, version-specific setup.
@@ -3612,6 +3755,8 @@ mountnfs(
         */
        nmp->nm_dnp = np;
        *vpp = NFSTOV(np);
+
+
        /* get usecount and drop iocount */
        error = vnode_ref(*vpp);
        vnode_put(*vpp);
@@ -3643,6 +3788,10 @@ mountnfs(
        sbp->f_ffree = nmp->nm_fsattr.nfsa_files_free;
        sbp->f_iosize = nfs_iosize;
 
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SET_MOUNT_OWNER)) {
+               sbp->f_owner = set_owner;
+       }
+
        /*
         * Calculate the size used for I/O buffers.  Use the larger
         * of the two sizes to minimise NFS requests but make sure
@@ -3652,18 +3801,15 @@ mountnfs(
         * buffers into multiple requests if the buffer size is
         * larger than the I/O size.
         */
-#ifndef CONFIG_EMBEDDED
        iosize = max(nmp->nm_rsize, nmp->nm_wsize);
        if (iosize < PAGE_SIZE) {
                iosize = PAGE_SIZE;
        }
-#else
-       iosize = PAGE_SIZE;
-#endif
        nmp->nm_biosize = trunc_page_32(iosize);
 
        /* For NFSv3 and greater, there is a (relatively) reliable ACCESS call. */
-       if (nmp->nm_vers > NFS_VER2) {
+       if (nmp->nm_vers > NFS_VER2 && !NMFLAG(nmp, NOOPAQUE_AUTH)
+           ) {
                vfs_setauthopaqueaccess(mp);
        }
 
@@ -3681,6 +3827,7 @@ mountnfs(
                break;
        }
 
+
        /* success! */
        lck_mtx_lock(&nmp->nm_lock);
        nmp->nm_state |= NFSSTA_MOUNTED;
@@ -3704,7 +3851,9 @@ int
 nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx)
 {
        nfsnode_t np = VTONFS(vp);
+#if CONFIG_NFS4
        nfsnode_t dnp = VTONFS(dvp);
+#endif
        struct nfsmount *nmp = NFSTONMP(np);
        char fstype[MFSTYPENAMELEN], *mntfromname = NULL, *path = NULL, *relpath, *p, *cp;
        int error = 0, pathbuflen = MAXPATHLEN, i, mntflags = 0, referral, skipcopy = 0;
@@ -3725,7 +3874,7 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx)
                bzero(&nfsls, sizeof(nfsls));
        }
 
-       xb_init(&xbnew, 0);
+       xb_init(&xbnew, XDRBUF_NONE);
 
        if (!nmp || (nmp->nm_state & (NFSSTA_FORCE | NFSSTA_DEAD))) {
                return ENXIO;
@@ -3793,13 +3942,16 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx)
                const char *vname = vnode_getname(NFSTOV(np));
                if (!vname) {
                        error = ENOENT;
-               } else {
+               }
+#if CONFIG_NFS4
+               else {
                        error = nfs4_get_fs_locations(nmp, dnp, NULL, 0, vname, ctx, &nfsls);
                        vnode_putname(vname);
                        if (!error && (nfsls.nl_numlocs < 1)) {
                                error = ENOENT;
                        }
                }
+#endif
                nfsmerr_if(error);
        }
 
@@ -3841,12 +3993,13 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx)
        }
        if (referral) {
                NFS_BITMAP_SET(newmattrs, NFS_MATTR_FS_LOCATIONS);
+               NFS_BITMAP_CLR(newmattrs, NFS_MATTR_MNTFROM);
        } else {
                NFS_BITMAP_SET(newmattrs, NFS_MATTR_FH);
        }
        NFS_BITMAP_SET(newmattrs, NFS_MATTR_FLAGS);
        NFS_BITMAP_SET(newmattrs, NFS_MATTR_MNTFLAGS);
-       NFS_BITMAP_CLR(newmattrs, NFS_MATTR_MNTFROM);
+       NFS_BITMAP_SET(newmattrs, NFS_MATTR_SET_MOUNT_OWNER);
        xb_add_bitmap(error, &xbnew, newmattrs, NFS_MATTR_BITMAP_LEN);
        attrslength_offset = xb_offset(&xbnew);
        xb_copy_32(error, &xb, &xbnew, val); /* attrs length */
@@ -3980,20 +4133,18 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx)
                                xb_copy_opaque(error, &xb, &xbnew); /* component */
                        }
                        /* add additional components */
-                       for (comp = 0; !skipcopy && !error && (comp < relpathcomps); comp++) {
-                               p = relpath;
-                               while (*p && (*p == '/')) {
+                       p = relpath;
+                       while (*p && (*p == '/')) {
+                               p++;
+                       }
+                       while (*p && !error) {
+                               cp = p;
+                               while (*p && (*p != '/')) {
                                        p++;
                                }
-                               while (*p && !error) {
-                                       cp = p;
-                                       while (*p && (*p != '/')) {
-                                               p++;
-                                       }
-                                       xb_add_string(error, &xbnew, cp, (p - cp)); /* component */
-                                       while (*p && (*p == '/')) {
-                                               p++;
-                                       }
+                               xb_add_string(error, &xbnew, cp, (p - cp)); /* component */
+                               while (*p && (*p == '/')) {
+                                       p++;
                                }
                        }
                        xb_copy_opaque(error, &xb, &xbnew); /* fs location info */
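
The rewritten loop above drops an outer `for (comp = 0; ...)` that reset `p = relpath` on every pass, so each component of the relative path was effectively appended relpathcomps times. The corrected walk as a standalone sketch, with printf standing in for xb_add_string():

    #include <stdio.h>

    static void
    add_components(const char *relpath)
    {
            const char *p = relpath, *cp;

            while (*p == '/') {             /* skip leading slashes */
                    p++;
            }
            while (*p) {
                    cp = p;
                    while (*p && *p != '/') {
                            p++;            /* scan one component */
                    }
                    printf("component: %.*s\n", (int)(p - cp), cp);
                    while (*p == '/') {     /* skip separator run */
                            p++;
                    }
            }
    }
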
@@ -4070,6 +4221,31 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx)
                        error = xb_add_bytes(&xbnew, buf, count, 1);
                }
        }
+       /*
+        * The following string copies rely on the fact that we already validated
+        * these data when creating the initial mount point.
+        */
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_REALM)) {
+               xb_add_string(error, &xbnew, nmp->nm_realm, strlen(nmp->nm_realm));
+       }
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_PRINCIPAL)) {
+               xb_add_string(error, &xbnew, nmp->nm_principal, strlen(nmp->nm_principal));
+       }
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SVCPRINCIPAL)) {
+               xb_add_string(error, &xbnew, nmp->nm_sprinc, strlen(nmp->nm_sprinc));
+       }
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_NFS_PORT)) {
+               xb_add_string(error, &xbnew, nmp->nm_nfs_localport, strlen(nmp->nm_nfs_localport));
+       }
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_MOUNT_PORT)) {
+               xb_add_string(error, &xbnew, nmp->nm_mount_localport, strlen(nmp->nm_mount_localport));
+       }
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SET_MOUNT_OWNER)) {
+               /* drop embedded owner value */
+               xb_get_32(error, &xb, count);
+       }
+       /* New mount always gets same owner as this mount */
+       xb_add_32(error, &xbnew, vnode_mount(vp)->mnt_vfsstat.f_owner);
        xb_build_done(error, &xbnew);
 
        /* update opaque counts */
@@ -4088,10 +4264,13 @@ nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx)
        /*
         * For kernel_mount() call, use the existing mount flags (instead of the
         * original flags) because flags like MNT_NOSUID and MNT_NODEV may have
-        * been silently enforced.
+        * been silently enforced. Also, in terms of MACF, the _kernel_ is
+        * performing the mount (and enforcing all of the mount options), so we
+        * use the kernel context for the mount call.
         */
        mntflags = vnode_vfsvisflags(vp);
        mntflags |= (MNT_AUTOMOUNTED | MNT_DONTBROWSE);
+       ctx = vfs_context_kernel();
 
        /* do the mount */
        error = kernel_mount(fstype, dvp, vp, path, xb_buffer_base(&xbnew), argslength,
@@ -4122,6 +4301,7 @@ nfsmerr:
 /*
  * trigger vnode functions
  */
+#define NFS_TRIGGER_DEBUG 1
 
 resolver_result_t
 nfs_mirror_mount_trigger_resolve(
@@ -4132,9 +4312,10 @@ nfs_mirror_mount_trigger_resolve(
        __unused void *data,
        vfs_context_t ctx)
 {
-       nfsnode_t np = VTONFS(vp);
-       vnode_t pvp = NULLVP;
-       int error = 0;
+       nfsnode_t         np = VTONFS(vp);
+       vnode_t           pvp = NULLVP;
+       int               error = 0;
+       int               didBusy = 0;
        resolver_result_t result;
 
        /*
@@ -4204,6 +4385,21 @@ nfs_mirror_mount_trigger_resolve(
 #endif
                return result;
        }
+       didBusy = 1;
+
+       /* Check again, in case the mount happened while we were setting busy */
+       if (vnode_mountedhere(vp) != NULL) {
+               /* Been there.  Done that.  Let's just say it succeeded.  */
+               error = 0;
+               goto skipmount;
+       }
+       nfs_node_lock_force(np);
+       if (np->n_flag & NDISARMTRIGGER) {
+               error = ECANCELED;
+               nfs_node_unlock(np);
+               goto skipmount;
+       }
+       nfs_node_unlock(np);
 
        pvp = vnode_getparent(vp);
        if (pvp == NULLVP) {
@@ -4226,7 +4422,9 @@ skipmount:
        if (pvp != NULLVP) {
                vnode_put(pvp);
        }
-       nfs_node_clear_busy(np);
+       if (didBusy) {
+               nfs_node_clear_busy(np);
+       }
        return result;
 }
 
@@ -4326,7 +4524,8 @@ nfs_ephemeral_mount_harvester_callback(mount_t mp, void *arg)
                return VFS_RETURNED;
        }
        nmp = VFSTONFS(mp);
-       if (!nmp || !NMFLAG(nmp, EPHEMERAL)) {
+       if (!nmp || !NMFLAG(nmp, EPHEMERAL)
+           ) {
                return VFS_RETURNED;
        }
        hinfo->mountcount++;
@@ -4438,6 +4637,7 @@ nfs3_mount_rpc(struct nfsmount *nmp, struct sockaddr *sa, int sotype, int nfsver
        uint32_t mntvers, mntport, val;
        struct sockaddr_storage ss;
        struct sockaddr *saddr = (struct sockaddr*)&ss;
+       struct sockaddr_un *sun = (struct sockaddr_un*)saddr;
 
        nfsm_chain_null(&nmreq);
        nfsm_chain_null(&nmrep);
@@ -4452,20 +4652,26 @@ nfs3_mount_rpc(struct nfsmount *nmp, struct sockaddr *sa, int sotype, int nfsver
                        ((struct sockaddr_in*)saddr)->sin_port = htons(nmp->nm_mountport);
                }
                mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port);
-       } else {
+       } else if (saddr->sa_family == AF_INET6) {
                if (nmp->nm_mountport) {
                        ((struct sockaddr_in6*)saddr)->sin6_port = htons(nmp->nm_mountport);
                }
                mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port);
+       } else {  /* Local domain socket */
+               mntport = ((struct sockaddr_un *)saddr)->sun_path[0]; /* Do we have an address? */
+               mntproto = IPPROTO_TCP;  /* XXX rpcbind only listens on stream sockets for now */
        }
 
        while (!mntport) {
-               error = nfs_portmap_lookup(nmp, ctx, saddr, NULL, RPCPROG_MNT, mntvers, mntproto, timeo);
+               error = nfs_portmap_lookup(nmp, ctx, saddr, NULL, RPCPROG_MNT, mntvers,
+                   mntproto == IPPROTO_UDP ? SOCK_DGRAM : SOCK_STREAM, timeo);
                nfsmout_if(error);
                if (saddr->sa_family == AF_INET) {
                        mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port);
-               } else {
+               } else if (saddr->sa_family == AF_INET6) {
                        mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port);
+               } else if (saddr->sa_family == AF_LOCAL) {
+                       mntport = ((struct sockaddr_un*)saddr)->sun_path[0];
                }
                if (!mntport) {
                        /* if not found and TCP, then retry with UDP */
@@ -4475,6 +4681,9 @@ nfs3_mount_rpc(struct nfsmount *nmp, struct sockaddr *sa, int sotype, int nfsver
                        }
                        mntproto = IPPROTO_UDP;
                        bcopy(sa, saddr, min(sizeof(ss), sa->sa_len));
+                       if (saddr->sa_family == AF_LOCAL) {
+                               strlcpy(sun->sun_path, RPCB_TICLTS_PATH, sizeof(sun->sun_path));
+                       }
                }
        }
        nfsmout_if(error || !mntport);
@@ -4541,8 +4750,10 @@ nfs3_umount_rpc(struct nfsmount *nmp, vfs_context_t ctx, int timeo)
        bcopy(nmp->nm_saddr, saddr, min(sizeof(ss), nmp->nm_saddr->sa_len));
        if (saddr->sa_family == AF_INET) {
                ((struct sockaddr_in*)saddr)->sin_port = htons(mntport);
-       } else {
+       } else if (saddr->sa_family == AF_INET6) {
                ((struct sockaddr_in6*)saddr)->sin6_port = htons(mntport);
+       } else { /* Local domain socket */
+               mntport = ((struct sockaddr_un *)saddr)->sun_path[0]; /* Do we have an address? */
        }
 
        while (!mntport) {
@@ -4550,8 +4761,10 @@ nfs3_umount_rpc(struct nfsmount *nmp, vfs_context_t ctx, int timeo)
                nfsmout_if(error);
                if (saddr->sa_family == AF_INET) {
                        mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port);
-               } else {
+               } else if (saddr->sa_family == AF_INET6) {
                        mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port);
+               } else { /* Local domain socket */
+                       mntport = ((struct sockaddr_un *)saddr)->sun_path[0]; /* Do we have an address? */
                }
                /* if not found and mntvers > VER1, then retry with VER1 */
                if (!mntport) {
@@ -4603,7 +4816,7 @@ nfs_vfs_unmount(
        struct nfsmount *nmp;
        vnode_t vp;
        int error, flags = 0;
-       struct timespec ts = { 1, 0 };
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 
        nmp = VFSTONFS(mp);
        lck_mtx_lock(&nmp->nm_lock);
@@ -4774,7 +4987,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
 {
        struct nfsreq *req, *treq;
        struct nfs_reqqhead iodq, resendq;
-       struct timespec ts = { 1, 0 };
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
        struct nfs_open_owner *noop, *nextnoop;
        nfsnode_t np;
        int docallback;
@@ -4783,14 +4996,16 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
        nmp->nm_state |= nm_state_flags;
        nmp->nm_ref++;
        lck_mtx_unlock(&nmp->nm_lock);
-
+#if CONFIG_NFS4
        /* stop callbacks */
        if ((nmp->nm_vers >= NFS_VER4) && !NMFLAG(nmp, NOCALLBACK) && nmp->nm_cbid) {
                nfs4_mount_callback_shutdown(nmp);
        }
-
+#endif
+#if CONFIG_NFS_GSS
        /* Destroy any RPCSEC_GSS contexts */
        nfs_gss_clnt_ctx_unmount(nmp);
+#endif
 
        /* mark the socket for termination */
        lck_mtx_lock(&nmp->nm_lock);
@@ -4814,6 +5029,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
 
        lck_mtx_lock(&nmp->nm_lock);
 
+#if CONFIG_NFS4
        if ((nmp->nm_vers >= NFS_VER4) && !NMFLAG(nmp, NOCALLBACK) && nmp->nm_cbid) {
                /* clear out any pending delegation return requests */
                while ((np = TAILQ_FIRST(&nmp->nm_dreturnq))) {
@@ -4828,7 +5044,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
                thread_call_free(nmp->nm_renew_timer);
                nmp->nm_renew_timer = NULL;
        }
-
+#endif
        lck_mtx_unlock(&nmp->nm_lock);
 
        if (nmp->nm_state & NFSSTA_MOUNTED) {
@@ -4846,6 +5062,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
                }
        }
 
+#if CONFIG_NFS4
        if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_longid) {
                /* remove/deallocate the client ID data */
                lck_mtx_lock(nfs_global_mutex);
@@ -4857,7 +5074,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
                nmp->nm_longid = NULL;
                lck_mtx_unlock(nfs_global_mutex);
        }
-
+#endif
        /*
         * Be sure all requests for this mount are completed
         * and removed from the resend queue.
@@ -4967,6 +5184,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
        }
        lck_mtx_unlock(&nmp->nm_lock);
 
+#if CONFIG_NFS4
        /* clean up NFSv4 state */
        if (nmp->nm_vers >= NFS_VER4) {
                lck_mtx_lock(&nmp->nm_lock);
@@ -4976,7 +5194,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
                }
                lck_mtx_unlock(&nmp->nm_lock);
        }
-
+#endif
        nfs_mount_rele(nmp);
 }
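
The new #if CONFIG_NFS4 / #if CONFIG_NFS_GSS guards compile the NFSv4 and RPCSEC_GSS teardown paths out of kernels built without those features, rather than leaving run-time nm_vers checks in dead code. A stand-alone sketch of the pattern, with a hypothetical knob standing in for the xnu config macros:

    #include <stdio.h>

    #ifndef CONFIG_NFS4
    #define CONFIG_NFS4 0           /* hypothetical build-time knob */
    #endif

    static void
    mount_teardown(int nfs_vers)
    {
    #if CONFIG_NFS4
            if (nfs_vers >= 4) {
                    printf("v4 teardown: callbacks, delegations, renew timer\n");
            }
    #else
            (void)nfs_vers;         /* all v4-only code compiled away */
    #endif
            printf("common teardown\n");
    }
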
 
@@ -5042,6 +5260,8 @@ nfs_mount_cleanup(struct nfsmount *nmp)
        if (nmp->nm_fh) {
                FREE(nmp->nm_fh, M_TEMP);
        }
+
+
        FREE_ZONE(nmp, sizeof(struct nfsmount), M_NFSMNT);
 }
 
@@ -5130,13 +5350,13 @@ nfs3_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc
        uint32_t val = 0, bsize = 0;
        struct sockaddr *rqsaddr;
        struct timeval now;
-       struct timespec ts = { 1, 0 };
+       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 
        if (!nmp->nm_saddr) {
                return ENXIO;
        }
 
-       if (NMFLAG(nmp, NOQUOTA)) {
+       if (NMFLAG(nmp, NOQUOTA) || nmp->nm_saddr->sa_family == AF_LOCAL /* XXX for now */) {
                return ENOTSUP;
        }
 
@@ -5291,7 +5511,7 @@ nfsmout:
        nfsm_chain_cleanup(&nmrep);
        return error;
 }
-
+#if CONFIG_NFS4
 int
 nfs4_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struct dqblk *dqb)
 {
@@ -5382,7 +5602,7 @@ nfsmout:
        kauth_cred_unref(&cred);
        return error;
 }
-
+#endif /* CONFIG_NFS4 */
 int
 nfs_vfs_quotactl(mount_t mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t ctx)
 {
@@ -5554,7 +5774,7 @@ int
 nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
 {
        struct xdrbuf xbinfo, xborig;
-       char sotype[6];
+       char sotype[16];
        uint32_t origargsvers, origargslength;
        uint32_t infolength_offset, curargsopaquelength_offset, curargslength_offset, attrslength_offset, curargs_end_offset, end_offset;
        uint32_t miattrs[NFS_MIATTR_BITMAP_LEN];
@@ -5598,9 +5818,11 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
        NFS_BITMAP_ZERO(mattrs, NFS_MATTR_BITMAP_LEN);
        NFS_BITMAP_SET(mattrs, NFS_MATTR_FLAGS);
        NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_VERSION);
+#if CONFIG_NFS4
        if (nmp->nm_vers >= NFS_VER4) {
                NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_MINOR_VERSION);
        }
+#endif
        NFS_BITMAP_SET(mattrs, NFS_MATTR_READ_SIZE);
        NFS_BITMAP_SET(mattrs, NFS_MATTR_WRITE_SIZE);
        NFS_BITMAP_SET(mattrs, NFS_MATTR_READDIR_SIZE);
@@ -5616,8 +5838,10 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
        }
        NFS_BITMAP_SET(mattrs, NFS_MATTR_MAX_GROUP_LIST);
        NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE);
-       NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT);
-       if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mountport) {
+       if (nmp->nm_saddr->sa_family != AF_LOCAL) {
+               NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT);
+       }
+       if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mountport && !nmp->nm_mount_localport) {
                NFS_BITMAP_SET(mattrs, NFS_MATTR_MOUNT_PORT);
        }
        NFS_BITMAP_SET(mattrs, NFS_MATTR_REQUEST_TIMEOUT);
@@ -5644,6 +5868,12 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
        if (nmp->nm_sprinc) {
                NFS_BITMAP_SET(mattrs, NFS_MATTR_SVCPRINCIPAL);
        }
+       if (nmp->nm_nfs_localport) {
+               NFS_BITMAP_SET(mattrs, NFS_MATTR_LOCAL_NFS_PORT);
+       }
+       if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mount_localport) {
+               NFS_BITMAP_SET(mattrs, NFS_MATTR_LOCAL_MOUNT_PORT);
+       }
 
        /* set up current mount flags bitmap */
        /* first set the flags that we will be setting - either on OR off */
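
The mount-attribute XDR is bitmap-driven: setting NFS_MATTR_LOCAL_NFS_PORT / NFS_MATTR_LOCAL_MOUNT_PORT here obliges the encoder further down to emit the corresponding values in bitmap order. A self-contained sketch of that convention (macro shapes and the attribute id are assumed, not copied from the NFS headers):

    #include <stdint.h>
    #include <stdio.h>

    #define NBITS 32
    #define BMAP_SET(b, i)   ((b)[(i) / NBITS] |= 1U << ((i) % NBITS))
    #define BMAP_ISSET(b, i) (((b)[(i) / NBITS] >> ((i) % NBITS)) & 1U)

    enum { ATTR_LOCAL_NFS_PORT = 60 };      /* hypothetical attribute id */

    int
    main(void)
    {
            uint32_t mattrs[3] = { 0 };
            BMAP_SET(mattrs, ATTR_LOCAL_NFS_PORT);          /* declare...   */
            if (BMAP_ISSET(mattrs, ATTR_LOCAL_NFS_PORT)) {  /* ...then emit */
                    printf("emit local NFS port value\n");
            }
            return 0;
    }
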
@@ -5663,6 +5893,7 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
        }
        NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NONEGNAMECACHE);
        NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_MUTEJUKEBOX);
+#if CONFIG_NFS4
        if (nmp->nm_vers >= NFS_VER4) {
                NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_EPHEMERAL);
                NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOCALLBACK);
@@ -5670,6 +5901,7 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
                NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOACL);
                NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_ACLONLY);
        }
+#endif
        NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NFC);
        NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOQUOTA);
        if (nmp->nm_vers < NFS_VER4) {
@@ -5705,6 +5937,7 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
        if (NMFLAG(nmp, MUTEJUKEBOX)) {
                NFS_BITMAP_SET(mflags, NFS_MFLAG_MUTEJUKEBOX);
        }
+#if CONFIG_NFS4
        if (nmp->nm_vers >= NFS_VER4) {
                if (NMFLAG(nmp, EPHEMERAL)) {
                        NFS_BITMAP_SET(mflags, NFS_MFLAG_EPHEMERAL);
@@ -5722,6 +5955,7 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
                        NFS_BITMAP_SET(mflags, NFS_MFLAG_ACLONLY);
                }
        }
+#endif
        if (NMFLAG(nmp, NFC)) {
                NFS_BITMAP_SET(mflags, NFS_MFLAG_NFC);
        }
@@ -5765,9 +5999,11 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
        xb_add_bitmap(error, &xbinfo, mflags_mask, NFS_MFLAG_BITMAP_LEN);
        xb_add_bitmap(error, &xbinfo, mflags, NFS_MFLAG_BITMAP_LEN);
        xb_add_32(error, &xbinfo, nmp->nm_vers);                /* NFS_VERSION */
+#if CONFIG_NFS4
        if (nmp->nm_vers >= NFS_VER4) {
                xb_add_32(error, &xbinfo, nmp->nm_minor_vers);  /* NFS_MINOR_VERSION */
        }
+#endif
        xb_add_32(error, &xbinfo, nmp->nm_rsize);               /* READ_SIZE */
        xb_add_32(error, &xbinfo, nmp->nm_wsize);               /* WRITE_SIZE */
        xb_add_32(error, &xbinfo, nmp->nm_readdirsize);         /* READDIR_SIZE */
@@ -5807,13 +6043,29 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
        }
        xb_add_32(error, &xbinfo, nmp->nm_numgrps);             /* MAX_GROUP_LIST */
        nfsmerr_if(error);
-       snprintf(sotype, sizeof(sotype), "%s%s", (nmp->nm_sotype == SOCK_DGRAM) ? "udp" : "tcp",
-           nmp->nm_sofamily ? (nmp->nm_sofamily == AF_INET) ? "4" : "6" : "");
-       xb_add_string(error, &xbinfo, sotype, strlen(sotype));  /* SOCKET_TYPE */
-       xb_add_32(error, &xbinfo, ntohs(((struct sockaddr_in*)nmp->nm_saddr)->sin_port)); /* NFS_PORT */
-       if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mountport) {
-               xb_add_32(error, &xbinfo, nmp->nm_mountport);   /* MOUNT_PORT */
+
+       switch (nmp->nm_saddr->sa_family) {
+       case AF_INET:
+       case AF_INET6:
+               snprintf(sotype, sizeof(sotype), "%s%s", (nmp->nm_sotype == SOCK_DGRAM) ? "udp" : "tcp",
+                   nmp->nm_sofamily ? (nmp->nm_sofamily == AF_INET) ? "4" : "6" : "");
+               xb_add_string(error, &xbinfo, sotype, strlen(sotype));  /* SOCKET_TYPE */
+               xb_add_32(error, &xbinfo, ntohs(((struct sockaddr_in*)nmp->nm_saddr)->sin_port)); /* NFS_PORT */
+               if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MOUNT_PORT)) {
+                       xb_add_32(error, &xbinfo, nmp->nm_mountport);   /* MOUNT_PORT */
+               }
+               break;
+       case AF_LOCAL:
+               strlcpy(sotype, (nmp->nm_sotype == SOCK_DGRAM) ? "ticlts" : "ticotsord", sizeof(sotype));
+               xb_add_string(error, &xbinfo, sotype, strlen(sotype));
+               break;
+       default:
+               NFS_VFS_DBG("Unsupported address family %d\n", nmp->nm_saddr->sa_family);
+               printf("Unsupported address family %d\n", nmp->nm_saddr->sa_family);
+               error = EINVAL;
+               break;
        }
+
        timeo = (nmp->nm_timeo * 10) / NFS_HZ;
        xb_add_32(error, &xbinfo, timeo / 10);                    /* REQUEST_TIMEOUT */
        xb_add_32(error, &xbinfo, (timeo % 10) * 100000000);        /* REQUEST_TIMEOUT */
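
The switch above is also why sotype grew from 6 to 16 bytes earlier in this function: "udp4"/"tcp6" fit in six, but the local-domain token "ticotsord" is nine characters plus the terminating NUL. A quick illustrative check:

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            const char *names[] = { "udp4", "tcp6", "ticlts", "ticotsord" };
            size_t need = 0;
            for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
                    if (strlen(names[i]) + 1 > need) {
                            need = strlen(names[i]) + 1;
                    }
            }
            printf("socket-type buffer needs %zu bytes\n", need);  /* 10 */
            return 0;
    }
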
@@ -5861,7 +6113,13 @@ nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
        if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SVCPRINCIPAL)) {
                xb_add_string(error, &xbinfo, nmp->nm_sprinc, strlen(nmp->nm_sprinc));
        }
-
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_NFS_PORT)) {
+               struct sockaddr_un *un = (struct sockaddr_un *)nmp->nm_saddr;
+               xb_add_string(error, &xbinfo, un->sun_path, strlen(un->sun_path));
+       }
+       if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCAL_MOUNT_PORT)) {
+               xb_add_string(error, &xbinfo, nmp->nm_mount_localport, strlen(nmp->nm_mount_localport));
+       }
        curargs_end_offset = xb_offset(&xbinfo);
 
        /* NFS_MIATTR_CUR_LOC_INDEX */
@@ -5924,8 +6182,9 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
        struct xdrbuf xb;
        struct netfs_status *nsp = NULL;
        int timeoutmask;
-       uint pos, totlen, count, numThreads;
+       uint totlen, count, numThreads;
 #if NFSSERVER
+       uint pos;
        struct nfs_exportfs *nxfs;
        struct nfs_export *nx;
        struct nfs_active_user_list *ulist;
@@ -6033,7 +6292,7 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
                if (((nmp = VFSTONFS(mp))) == NULL) {
                        return ENOENT;
                }
-               xb_init(&xb, 0);
+               xb_init(&xb, XDRBUF_NONE);
                if ((error = nfs_mountinfo_assemble(nmp, &xb))) {
                        return error;
                }
@@ -6311,9 +6570,11 @@ ustat_skip:
                        if (nmp->nm_lockmode == NFS_LOCK_MODE_LOCAL) {
                                /* can't toggle locks when using local locks */
                                error = EINVAL;
+#if CONFIG_NFS4
                        } else if ((nmp->nm_vers >= NFS_VER4) && val) {
                                /* can't disable locks for NFSv4 */
                                error = EINVAL;
+#endif
                        } else if (val) {
                                if ((nmp->nm_vers <= NFS_VER3) && (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED)) {
                                        nfs_lockd_mount_unregister(nmp);
index 0991a5373ff166f549c6def6f7a999034c6d21cc..b460a0411a3529c7b1fc2aa716d358a2be6447b1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -87,6 +87,7 @@
 #include <sys/attr.h>
 #include <sys/signalvar.h>
 #include <sys/uio_internal.h>
+#include <sys/xattr.h>
 
 #include <vfs/vfs_support.h>
 
@@ -157,53 +158,56 @@ int     nfs3_vnop_mkdir(struct vnop_mkdir_args *);
 int     nfs3_vnop_rmdir(struct vnop_rmdir_args *);
 int     nfs3_vnop_symlink(struct vnop_symlink_args *);
 
+
 vnop_t **nfsv2_vnodeop_p;
-static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = {
-       { &vnop_default_desc, (vnop_t *)vn_default_error },
-       { &vnop_lookup_desc, (vnop_t *)nfs_vnop_lookup },       /* lookup */
-       { &vnop_create_desc, (vnop_t *)nfs3_vnop_create },      /* create */
-       { &vnop_mknod_desc, (vnop_t *)nfs3_vnop_mknod },        /* mknod */
-       { &vnop_open_desc, (vnop_t *)nfs_vnop_open },           /* open */
-       { &vnop_close_desc, (vnop_t *)nfs_vnop_close },         /* close */
-       { &vnop_access_desc, (vnop_t *)nfs_vnop_access },       /* access */
-       { &vnop_getattr_desc, (vnop_t *)nfs3_vnop_getattr },    /* getattr */
-       { &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr },     /* setattr */
-       { &vnop_read_desc, (vnop_t *)nfs_vnop_read },           /* read */
-       { &vnop_write_desc, (vnop_t *)nfs_vnop_write },         /* write */
-       { &vnop_ioctl_desc, (vnop_t *)nfs_vnop_ioctl },         /* ioctl */
-       { &vnop_select_desc, (vnop_t *)nfs_vnop_select },       /* select */
-       { &vnop_revoke_desc, (vnop_t *)nfs_vnop_revoke },       /* revoke */
-       { &vnop_mmap_desc, (vnop_t *)nfs_vnop_mmap },           /* mmap */
-       { &vnop_mnomap_desc, (vnop_t *)nfs_vnop_mnomap },       /* mnomap */
-       { &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync },         /* fsync */
-       { &vnop_remove_desc, (vnop_t *)nfs_vnop_remove },       /* remove */
-       { &vnop_link_desc, (vnop_t *)nfs3_vnop_link },          /* link */
-       { &vnop_rename_desc, (vnop_t *)nfs_vnop_rename },       /* rename */
-       { &vnop_mkdir_desc, (vnop_t *)nfs3_vnop_mkdir },        /* mkdir */
-       { &vnop_rmdir_desc, (vnop_t *)nfs3_vnop_rmdir },        /* rmdir */
-       { &vnop_symlink_desc, (vnop_t *)nfs3_vnop_symlink },    /* symlink */
-       { &vnop_readdir_desc, (vnop_t *)nfs_vnop_readdir },     /* readdir */
-       { &vnop_readlink_desc, (vnop_t *)nfs_vnop_readlink },   /* readlink */
-       { &vnop_inactive_desc, (vnop_t *)nfs_vnop_inactive },   /* inactive */
-       { &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim },     /* reclaim */
-       { &vnop_strategy_desc, (vnop_t *)err_strategy },        /* strategy */
-       { &vnop_pathconf_desc, (vnop_t *)nfs_vnop_pathconf },   /* pathconf */
-       { &vnop_advlock_desc, (vnop_t *)nfs_vnop_advlock },     /* advlock */
-       { &vnop_bwrite_desc, (vnop_t *)err_bwrite },            /* bwrite */
-       { &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein },       /* Pagein */
-       { &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout },     /* Pageout */
-       { &vnop_copyfile_desc, (vnop_t *)err_copyfile },        /* Copyfile */
-       { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff },   /* blktooff */
-       { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk },   /* offtoblk */
-       { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap },   /* blockmap */
-       { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },     /* monitor */
-       { NULL, NULL }
+static const struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = {
+       { .opve_op = &vnop_default_desc, .opve_impl = (vnop_t *)vn_default_error },
+       { .opve_op = &vnop_lookup_desc, .opve_impl = (vnop_t *)nfs_vnop_lookup },       /* lookup */
+       { .opve_op = &vnop_create_desc, .opve_impl = (vnop_t *)nfs3_vnop_create },      /* create */
+       { .opve_op = &vnop_mknod_desc, .opve_impl = (vnop_t *)nfs3_vnop_mknod },        /* mknod */
+       { .opve_op = &vnop_open_desc, .opve_impl = (vnop_t *)nfs_vnop_open },           /* open */
+       { .opve_op = &vnop_close_desc, .opve_impl = (vnop_t *)nfs_vnop_close },         /* close */
+       { .opve_op = &vnop_access_desc, .opve_impl = (vnop_t *)nfs_vnop_access },       /* access */
+       { .opve_op = &vnop_getattr_desc, .opve_impl = (vnop_t *)nfs3_vnop_getattr },    /* getattr */
+       { .opve_op = &vnop_setattr_desc, .opve_impl = (vnop_t *)nfs_vnop_setattr },     /* setattr */
+       { .opve_op = &vnop_read_desc, .opve_impl = (vnop_t *)nfs_vnop_read },           /* read */
+       { .opve_op = &vnop_write_desc, .opve_impl = (vnop_t *)nfs_vnop_write },         /* write */
+       { .opve_op = &vnop_ioctl_desc, .opve_impl = (vnop_t *)nfs_vnop_ioctl },         /* ioctl */
+       { .opve_op = &vnop_select_desc, .opve_impl = (vnop_t *)nfs_vnop_select },       /* select */
+       { .opve_op = &vnop_revoke_desc, .opve_impl = (vnop_t *)nfs_vnop_revoke },       /* revoke */
+       { .opve_op = &vnop_mmap_desc, .opve_impl = (vnop_t *)nfs_vnop_mmap },           /* mmap */
+       { .opve_op = &vnop_mnomap_desc, .opve_impl = (vnop_t *)nfs_vnop_mnomap },       /* mnomap */
+       { .opve_op = &vnop_fsync_desc, .opve_impl = (vnop_t *)nfs_vnop_fsync },         /* fsync */
+       { .opve_op = &vnop_remove_desc, .opve_impl = (vnop_t *)nfs_vnop_remove },       /* remove */
+       { .opve_op = &vnop_link_desc, .opve_impl = (vnop_t *)nfs3_vnop_link },          /* link */
+       { .opve_op = &vnop_rename_desc, .opve_impl = (vnop_t *)nfs_vnop_rename },       /* rename */
+       { .opve_op = &vnop_mkdir_desc, .opve_impl = (vnop_t *)nfs3_vnop_mkdir },        /* mkdir */
+       { .opve_op = &vnop_rmdir_desc, .opve_impl = (vnop_t *)nfs3_vnop_rmdir },        /* rmdir */
+       { .opve_op = &vnop_symlink_desc, .opve_impl = (vnop_t *)nfs3_vnop_symlink },    /* symlink */
+       { .opve_op = &vnop_readdir_desc, .opve_impl = (vnop_t *)nfs_vnop_readdir },     /* readdir */
+       { .opve_op = &vnop_readlink_desc, .opve_impl = (vnop_t *)nfs_vnop_readlink },   /* readlink */
+       { .opve_op = &vnop_inactive_desc, .opve_impl = (vnop_t *)nfs_vnop_inactive },   /* inactive */
+       { .opve_op = &vnop_reclaim_desc, .opve_impl = (vnop_t *)nfs_vnop_reclaim },     /* reclaim */
+       { .opve_op = &vnop_strategy_desc, .opve_impl = (vnop_t *)err_strategy },        /* strategy */
+       { .opve_op = &vnop_pathconf_desc, .opve_impl = (vnop_t *)nfs_vnop_pathconf },   /* pathconf */
+       { .opve_op = &vnop_advlock_desc, .opve_impl = (vnop_t *)nfs_vnop_advlock },     /* advlock */
+       { .opve_op = &vnop_bwrite_desc, .opve_impl = (vnop_t *)err_bwrite },            /* bwrite */
+       { .opve_op = &vnop_pagein_desc, .opve_impl = (vnop_t *)nfs_vnop_pagein },       /* Pagein */
+       { .opve_op = &vnop_pageout_desc, .opve_impl = (vnop_t *)nfs_vnop_pageout },     /* Pageout */
+       { .opve_op = &vnop_copyfile_desc, .opve_impl = (vnop_t *)err_copyfile },        /* Copyfile */
+       { .opve_op = &vnop_blktooff_desc, .opve_impl = (vnop_t *)nfs_vnop_blktooff },   /* blktooff */
+       { .opve_op = &vnop_offtoblk_desc, .opve_impl = (vnop_t *)nfs_vnop_offtoblk },   /* offtoblk */
+       { .opve_op = &vnop_blockmap_desc, .opve_impl = (vnop_t *)nfs_vnop_blockmap },   /* blockmap */
+       { .opve_op = &vnop_monitor_desc, .opve_impl = (vnop_t *)nfs_vnop_monitor },     /* monitor */
+       { .opve_op = NULL, .opve_impl = NULL }
 };
-struct vnodeopv_desc nfsv2_vnodeop_opv_desc =
+const struct vnodeopv_desc nfsv2_vnodeop_opv_desc =
 { &nfsv2_vnodeop_p, nfsv2_vnodeop_entries };
 
+
+#if CONFIG_NFS4
 vnop_t **nfsv4_vnodeop_p;
-static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = {
+static const struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = {
        { &vnop_default_desc, (vnop_t *)vn_default_error },
        { &vnop_lookup_desc, (vnop_t *)nfs_vnop_lookup },       /* lookup */
        { &vnop_create_desc, (vnop_t *)nfs4_vnop_create },      /* create */
@@ -253,14 +257,15 @@ static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = {
        { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },     /* monitor */
        { NULL, NULL }
 };
-struct vnodeopv_desc nfsv4_vnodeop_opv_desc =
+const struct vnodeopv_desc nfsv4_vnodeop_opv_desc =
 { &nfsv4_vnodeop_p, nfsv4_vnodeop_entries };
+#endif
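
Beyond the CONFIG_NFS4 gating, these tables pick up const and designated initializers. A const table of function pointers can be placed in read-only memory, so the dispatch targets cannot be retargeted at run time, and the named fields keep entries correct if struct vnodeopv_entry_desc ever changes shape. A minimal stand-alone analog:

    #include <stdio.h>

    struct op_entry {
            const char *name;
            int       (*impl)(void);
    };

    static int do_open(void) { return 0; }

    static const struct op_entry ops[] = {
            { .name = "open", .impl = do_open },
            { .name = NULL,   .impl = NULL },       /* terminator */
    };

    int
    main(void)
    {
            printf("%s -> %d\n", ops[0].name, ops[0].impl());
            return 0;
    }
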
 
 /*
  * Special device vnode ops
  */
 vnop_t **spec_nfsv2nodeop_p;
-static struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = {
+static const struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = {
        { &vnop_default_desc, (vnop_t *)vn_default_error },
        { &vnop_lookup_desc, (vnop_t *)spec_lookup },           /* lookup */
        { &vnop_create_desc, (vnop_t *)spec_create },           /* create */
@@ -298,10 +303,11 @@ static struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = {
        { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },     /* monitor */
        { NULL, NULL }
 };
-struct vnodeopv_desc spec_nfsv2nodeop_opv_desc =
+const struct vnodeopv_desc spec_nfsv2nodeop_opv_desc =
 { &spec_nfsv2nodeop_p, spec_nfsv2nodeop_entries };
+#if CONFIG_NFS4
 vnop_t **spec_nfsv4nodeop_p;
-static struct vnodeopv_entry_desc spec_nfsv4nodeop_entries[] = {
+static const struct vnodeopv_entry_desc spec_nfsv4nodeop_entries[] = {
        { &vnop_default_desc, (vnop_t *)vn_default_error },
        { &vnop_lookup_desc, (vnop_t *)spec_lookup },           /* lookup */
        { &vnop_create_desc, (vnop_t *)spec_create },           /* create */
@@ -348,12 +354,13 @@ static struct vnodeopv_entry_desc spec_nfsv4nodeop_entries[] = {
        { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },     /* monitor */
        { NULL, NULL }
 };
-struct vnodeopv_desc spec_nfsv4nodeop_opv_desc =
+const struct vnodeopv_desc spec_nfsv4nodeop_opv_desc =
 { &spec_nfsv4nodeop_p, spec_nfsv4nodeop_entries };
+#endif /* CONFIG_NFS4 */
 
 #if FIFO
 vnop_t **fifo_nfsv2nodeop_p;
-static struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = {
+static const struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = {
        { &vnop_default_desc, (vnop_t *)vn_default_error },
        { &vnop_lookup_desc, (vnop_t *)fifo_lookup },           /* lookup */
        { &vnop_create_desc, (vnop_t *)fifo_create },           /* create */
@@ -391,11 +398,14 @@ static struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = {
        { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },     /* monitor */
        { NULL, NULL }
 };
-struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc =
+const struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc =
 { &fifo_nfsv2nodeop_p, fifo_nfsv2nodeop_entries };
+#endif
 
+#if CONFIG_NFS4
+#if FIFO
 vnop_t **fifo_nfsv4nodeop_p;
-static struct vnodeopv_entry_desc fifo_nfsv4nodeop_entries[] = {
+static const struct vnodeopv_entry_desc fifo_nfsv4nodeop_entries[] = {
        { &vnop_default_desc, (vnop_t *)vn_default_error },
        { &vnop_lookup_desc, (vnop_t *)fifo_lookup },           /* lookup */
        { &vnop_create_desc, (vnop_t *)fifo_create },           /* create */
@@ -442,14 +452,16 @@ static struct vnodeopv_entry_desc fifo_nfsv4nodeop_entries[] = {
        { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },     /* monitor */
        { NULL, NULL }
 };
-struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc =
+const struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc =
 { &fifo_nfsv4nodeop_p, fifo_nfsv4nodeop_entries };
 #endif /* FIFO */
+#endif /* CONFIG_NFS4 */
 
 int     nfs_sillyrename(nfsnode_t, nfsnode_t, struct componentname *, vfs_context_t);
 int     nfs_getattr_internal(nfsnode_t, struct nfs_vattr *, vfs_context_t, int);
 int     nfs_refresh_fh(nfsnode_t, vfs_context_t);
 
+
 /*
  * Find the slot in the access cache for this UID.
  * If adding and no existing slot is found, reuse slots in FIFO order.
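
A compact model of the cache that comment describes, with an assumed slot count (the real array lives in the nfsnode, and slot-validity tracking is elided here): hit on a matching uid, otherwise claim the next slot round-robin so stale entries age out in FIFO order.

    #include <stdio.h>

    #define NACCESS 8                       /* assumed slot count */

    struct access_cache {
            unsigned uids[NACCESS];
            int      next;                  /* FIFO reuse cursor */
    };

    static int
    access_slot(struct access_cache *c, unsigned uid, int add)
    {
            for (int i = 0; i < NACCESS; i++) {
                    if (c->uids[i] == uid) {
                            return i;       /* existing slot */
                    }
            }
            if (!add) {
                    return -1;
            }
            int slot = c->next;             /* reuse oldest slot */
            c->next = (c->next + 1) % NACCESS;
            c->uids[slot] = uid;
            return slot;
    }

    int
    main(void)
    {
            struct access_cache c = { { 0 }, 0 };
            printf("uid 501 -> slot %d\n", access_slot(&c, 501, 1));
            return 0;
    }
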
@@ -514,11 +526,15 @@ nfs3_access_rpc(nfsnode_t np, u_int32_t *access, int rpcflags, vfs_context_t ctx
        }
        nfsmout_if(error);
 
+#if CONFIG_NFS_GSS
        if (auth_is_kerberized(np->n_auth) || auth_is_kerberized(nmp->nm_auth)) {
                uid = nfs_cred_getasid2uid(vfs_context_ucred(ctx));
        } else {
                uid = kauth_cred_getuid(vfs_context_ucred(ctx));
        }
+#else
+       uid = kauth_cred_getuid(vfs_context_ucred(ctx));
+#endif /* CONFIG_NFS_GSS */
        slot = nfs_node_access_slot(np, uid, 1);
        np->n_accessuid[slot] = uid;
        microuptime(&now);
@@ -551,6 +567,7 @@ nfsmout:
        return error;
 }
 
+
 /*
  * NFS access vnode op.
  * For NFS version 2, just return ok. File accesses may fail later.
@@ -582,7 +599,8 @@ nfs_vnop_access(
        }
        nfsvers = nmp->nm_vers;
 
-       if (nfsvers == NFS_VER2) {
+
+       if (nfsvers == NFS_VER2 || NMFLAG(nmp, NOOPAQUE_AUTH)) {
                if ((ap->a_action & KAUTH_VNODE_WRITE_RIGHTS) &&
                    vfs_isrdonly(vnode_mount(vp))) {
                        return EROFS;
@@ -670,11 +688,15 @@ nfs_vnop_access(
         * Does our cached result allow us to give a definite yes to
         * this request?
         */
+#if CONFIG_NFS_GSS
        if (auth_is_kerberized(np->n_auth) || auth_is_kerberized(nmp->nm_auth)) {
                uid = nfs_cred_getasid2uid(vfs_context_ucred(ctx));
        } else {
                uid = kauth_cred_getuid(vfs_context_ucred(ctx));
        }
+#else
+       uid = kauth_cred_getuid(vfs_context_ucred(ctx));
+#endif /* CONFIG_NFS_GSS */
        slot = nfs_node_access_slot(np, uid, 0);
        dorpc = 1;
        if (access == 0) {
@@ -851,6 +873,7 @@ restart:
                NP(np, "nfs_vnop_open: LOST %d", kauth_cred_getuid(nofp->nof_owner->noo_cred));
                error = EIO;
        }
+#if CONFIG_NFS4
        if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) {
                nfs_mount_state_in_use_end(nmp, 0);
                error = nfs4_reopen(nofp, vfs_context_thread(ctx));
@@ -859,6 +882,7 @@ restart:
                        goto restart;
                }
        }
+#endif
        if (!error) {
                error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx));
        }
@@ -886,9 +910,11 @@ restart:
                nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE;
                nofp->nof_creator = NULL;
        } else {
+#if CONFIG_NFS4
                if (!opened) {
                        error = nfs4_open(np, nofp, accessMode, denyMode, ctx);
                }
+#endif
                if ((error == EACCES) && (nofp->nof_flags & NFS_OPEN_FILE_CREATE) &&
                    (nofp->nof_creator == current_thread())) {
                        /*
@@ -1154,6 +1180,7 @@ restart:
        }
 
        error = nfs_open_file_find(np, noop, &nofp, 0, 0, 0);
+#if CONFIG_NFS4
        if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) {
                nfs_mount_state_in_use_end(nmp, 0);
                error = nfs4_reopen(nofp, NULL);
@@ -1162,6 +1189,7 @@ restart:
                        goto restart;
                }
        }
+#endif
        if (error) {
                NP(np, "nfs_vnop_close: no open file for owner, error %d, %d", error, kauth_cred_getuid(noop->noo_cred));
                error = EBADF;
@@ -1212,7 +1240,9 @@ nfs_close(
        uint32_t denyMode,
        vfs_context_t ctx)
 {
+#if CONFIG_NFS4
        struct nfs_lock_owner *nlop;
+#endif
        int error = 0, changed = 0, delegated = 0, closed = 0, downgrade = 0;
        uint32_t newAccessMode, newDenyMode;
 
@@ -1254,10 +1284,11 @@ nfs_close(
                changed = 0;
        }
 
-       if (NFSTONMP(np)->nm_vers < NFS_VER4) { /* NFS v2/v3 closes simply need to remove the open. */
+       if (NFSTONMP(np)->nm_vers < NFS_VER4) {
+               /* NFS v2/v3 closes simply need to remove the open. */
                goto v3close;
        }
-
+#if CONFIG_NFS4
        if ((newAccessMode == 0) || (nofp->nof_opencnt == 1)) {
                /*
                 * No more access after this close, so clean up and close it.
@@ -1305,13 +1336,13 @@ nfs_close(
                        }
                }
        }
-
+#endif
+v3close:
        if (error) {
                NP(np, "nfs_close: error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred));
                return error;
        }
 
-v3close:
        if (!downgrade) {
                nfs_open_file_remove_open(nofp, accessMode, denyMode);
        }
@@ -1402,7 +1433,7 @@ nfs3_getattr_rpc(
                error = status;
        }
        nfsmout_if(error);
-       error = nfs_parsefattr(&nmrep, nfsvers, nvap);
+       error = nfs_parsefattr(nmp, &nmrep, nfsvers, nvap);
 nfsmout:
        nfsm_chain_cleanup(&nmreq);
        nfsm_chain_cleanup(&nmrep);
@@ -1426,7 +1457,7 @@ nfs_refresh_fh(nfsnode_t np, vfs_context_t ctx)
        int namelen, fhsize, refreshed;
        int error, wanted = 0;
        uint8_t *fhp;
-       struct timespec ts = {2, 0};
+       struct timespec ts = {.tv_sec = 2, .tv_nsec = 0};
 
        NFS_VNOP_DBG("vnode is %d\n", vnode_vtype(vp));
 
@@ -1574,7 +1605,7 @@ nfs_getattr_internal(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, in
        struct nfsmount *nmp;
        int error = 0, nfsvers, inprogset = 0, wanted = 0, avoidfloods;
        struct nfs_vattr nvattr;
-       struct timespec ts = { 2, 0 };
+       struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
        u_int64_t xid;
 
        FSDBG_TOP(513, np->n_size, np, np->n_vattr.nva_size, np->n_flag);
@@ -1796,6 +1827,21 @@ nfsmout:
        return error;
 }
 
+static int
+nfs_parse_user_access(
+       mount_t mp,
+       enum vtype type)
+{
+       int user_access = R_OK;
+       if ((vfs_flags(mp) & MNT_RDONLY) == 0) {
+               user_access |= W_OK;
+       }
+       if (type == VDIR) {
+               user_access |= X_OK;
+       }
+       return user_access;
+}
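
nfs_parse_user_access() folds mount writability and vnode type into one POSIX access mask. A stand-alone analog showing the intended results (rdonly stands in for the MNT_RDONLY test on the real mount; the enum models enum vtype):

    #include <stdio.h>
    #include <unistd.h>             /* R_OK, W_OK, X_OK */

    enum vtype_model { M_VREG, M_VDIR };

    static int
    parse_user_access(int rdonly, enum vtype_model type)
    {
            int access = R_OK;
            if (!rdonly) {
                    access |= W_OK;
            }
            if (type == M_VDIR) {
                    access |= X_OK; /* directories stay searchable */
            }
            return access;
    }

    int
    main(void)
    {
            printf("rw dir:  %#x\n", parse_user_access(0, M_VDIR));
            printf("ro file: %#x\n", parse_user_access(1, M_VREG));
            return 0;
    }
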
+
 /*
  * NFS getattr call from vfs.
  */
@@ -1821,6 +1867,7 @@ nfsmout:
         VNODE_ATTR_va_fileid |         \
         VNODE_ATTR_va_type)
 
+
 int
 nfs3_vnop_getattr(
        struct vnop_getattr_args /* {
@@ -1836,12 +1883,15 @@ nfs3_vnop_getattr(
        struct nfsmount *nmp;
        dev_t rdev;
 
+       nmp = VTONMP(ap->a_vp);
+
        /*
         * Let's not go over the wire if we don't support any of the attributes.
         * Just fall through at the VFS layer and let it cons up what it needs.
         */
        /* Return the io size no matter what, since we don't go over the wire for this */
        VATTR_RETURN(vap, va_iosize, nfs_iosize);
+
        if ((vap->va_active & NFS3_SUPPORTED_VATTRS) == 0) {
                return 0;
        }
@@ -1857,7 +1907,6 @@ nfs3_vnop_getattr(
        }
 
        /* copy nva to *a_vap */
-       nmp = VTONMP(ap->a_vp);
        VATTR_RETURN(vap, va_type, nva.nva_type);
        VATTR_RETURN(vap, va_mode, nva.nva_mode);
        rdev = makedev(nva.nva_rawdev.specdata1, nva.nva_rawdev.specdata2);
@@ -1878,6 +1927,7 @@ nfs3_vnop_getattr(
        vap->va_change_time.tv_nsec = nva.nva_timensec[NFSTIME_CHANGE];
        VATTR_SET_SUPPORTED(vap, va_change_time);
 
+
        // VATTR_RETURN(vap, va_encoding, 0xffff /* kTextEncodingUnknown */);
        return error;
 }
@@ -1907,9 +1957,10 @@ nfs_vnop_setattr(
        int dul_in_progress = 0;
        vnode_t dvp = NULL;
        const char *vname = NULL;
+#if CONFIG_NFS4
        struct nfs_open_owner *noop = NULL;
        struct nfs_open_file *nofp = NULL;
-
+#endif
        nmp = VTONMP(vp);
        if (nfs_mount_gone(nmp)) {
                return ENXIO;
@@ -1966,6 +2017,7 @@ nfs_vnop_setattr(
                                FSDBG_BOT(512, np->n_size, vap->va_data_size, np->n_vattr.nva_size, -1);
                                return error;
                        }
+#if CONFIG_NFS4
                        if (nfsvers >= NFS_VER4) {
                                /* setting file size requires having the file open for write access */
                                if (np->n_flag & NREVOKE) {
@@ -2018,6 +2070,7 @@ restart:
                                        }
                                }
                        }
+#endif
                        nfs_data_lock(np, NFS_DATA_LOCK_EXCLUSIVE);
                        if (np->n_size > vap->va_data_size) { /* shrinking? */
                                daddr64_t obn, bn;
@@ -2201,6 +2254,7 @@ restart:
                        nfs_node_unlock(np);
                }
                nfs_data_unlock(np);
+#if CONFIG_NFS4
                if (nfsvers >= NFS_VER4) {
                        if (nofp) {
                                /* don't close our setattr open if we'll be restarting... */
@@ -2220,6 +2274,7 @@ restart:
                        }
                        nfs_open_owner_rele(noop);
                }
+#endif
        }
        return error;
 }
@@ -2250,7 +2305,9 @@ nfs3_setattr_rpc(
        VATTR_SET_SUPPORTED(vap, va_access_time);
        VATTR_SET_SUPPORTED(vap, va_modify_time);
 
-       if (VATTR_IS_ACTIVE(vap, va_flags)) {
+
+       if (VATTR_IS_ACTIVE(vap, va_flags)) {
                if (vap->va_flags) {    /* we don't support setting flags */
                        if (vap->va_active & ~VNODE_ATTR_va_flags) {
                                return EINVAL;        /* return EINVAL if other attributes also set */
@@ -2348,7 +2405,7 @@ nfs3_setattr_rpc(
                error = lockerror;
        }
        if (nfsvers == NFS_VER3) {
-               struct timespec premtime = { 0, 0 };
+               struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 };
                nfsm_chain_get_wcc_data(error, &nmrep, np, &premtime, &wccpostattr, &xid);
                nfsmout_if(error);
                /* if file hadn't changed, update cached mtime */
@@ -2503,11 +2560,13 @@ nfs_vnop_lookup(
                fh.fh_len = 0;
                goto found;
        }
+#if CONFIG_NFS4
        if ((nfsvers >= NFS_VER4) && (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) {
                /* we should never be looking things up in a trigger directory, return nothing */
                error = ENOENT;
                goto error_return;
        }
+#endif
 
        /* do we know this name is too long? */
        nmp = VTONMP(dvp);
@@ -2788,8 +2847,9 @@ nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx)
        user_ssize_t tsiz;
        off_t txoffset;
        struct nfsreq rq, *req = &rq;
+#if CONFIG_NFS4
        uint32_t stategenid = 0, restart = 0;
-
+#endif
        FSDBG_TOP(536, np, uio_offset(uio), uio_resid(uio), 0);
        nmp = NFSTONMP(np);
        if (nfs_mount_gone(nmp)) {
@@ -2812,14 +2872,17 @@ nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx)
                        error = EIO;
                        break;
                }
+#if CONFIG_NFS4
                if (nmp->nm_vers >= NFS_VER4) {
                        stategenid = nmp->nm_stategenid;
                }
+#endif
                error = nmp->nm_funcs->nf_read_rpc_async(np, txoffset, len,
                    vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req);
                if (!error) {
                        error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, uio, &retlen, &eof);
                }
+#if CONFIG_NFS4
                if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) &&
                    (++restart <= nfs_mount_state_max_restarts(nmp))) { /* guard against no progress */
                        lck_mtx_lock(&nmp->nm_lock);
@@ -2839,6 +2902,7 @@ nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx)
                                }
                        }
                }
+#endif
                if (error) {
                        break;
                }
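
The v4-only block above implements a bounded restart protocol: snapshot nm_stategenid before each read RPC, and on a recoverable state error retry only while the generation is unchanged and a restart budget remains, so recovery cannot loop forever. A self-contained model with stub helpers (all names hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_RESTARTS 8

    static uint32_t generation;             /* bumped by recovery */
    static void begin_recovery(void) { generation++; }
    static int  issue_rpc(void) { return generation < 2 ? -1 : 0; }
    static int  is_state_error(int e) { return e == -1; }

    int
    main(void)
    {
            uint32_t restarts = 0;
            for (;;) {
                    uint32_t stategenid = generation;   /* snapshot */
                    int error = issue_rpc();
                    if (is_state_error(error) && ++restarts <= MAX_RESTARTS &&
                        stategenid == generation) {
                            begin_recovery();           /* then retry */
                            continue;
                    }
                    printf("done after %u restarts (error %d)\n",
                        restarts, error);
                    return 0;
            }
    }
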
@@ -3557,6 +3621,8 @@ skipread:
                        }
                        nfs_buf_write_delayed(bp);
                }
+
+
                if (np->n_needcommitcnt >= NFS_A_LOT_OF_NEEDCOMMITS) {
                        nfs_flushcommits(np, 1);
                }
@@ -3601,7 +3667,10 @@ nfs_write_rpc2(
        uint64_t wverf = 0, wverf2;
        size_t nmwsize, totalsize, tsiz, len, rlen;
        struct nfsreq rq, *req = &rq;
-       uint32_t stategenid = 0, vrestart = 0, restart = 0;
+#if CONFIG_NFS4
+       uint32_t stategenid = 0, restart = 0;
+#endif
+       uint32_t vrestart = 0;
        uio_t uio_save = NULL;
 
 #if DIAGNOSTIC
@@ -3639,9 +3708,11 @@ nfs_write_rpc2(
                        error = EIO;
                        break;
                }
+#if CONFIG_NFS4
                if (nmp->nm_vers >= NFS_VER4) {
                        stategenid = nmp->nm_stategenid;
                }
+#endif
                error = nmp->nm_funcs->nf_write_rpc_async(np, uio, len, thd, cred, *iomodep, NULL, &req);
                if (!error) {
                        error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &commit, &rlen, &wverf2);
@@ -3650,6 +3721,7 @@ nfs_write_rpc2(
                if (nfs_mount_gone(nmp)) {
                        error = ENXIO;
                }
+#if CONFIG_NFS4
                if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) &&
                    (++restart <= nfs_mount_state_max_restarts(nmp))) { /* guard against no progress */
                        lck_mtx_lock(&nmp->nm_lock);
@@ -3669,6 +3741,7 @@ nfs_write_rpc2(
                                }
                        }
                }
+#endif
                if (error) {
                        break;
                }
@@ -3811,7 +3884,7 @@ nfs3_write_rpc_async_finish(
                error = lockerror;
        }
        if (nfsvers == NFS_VER3) {
-               struct timespec premtime = { 0, 0 };
+               struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 };
                nfsm_chain_get_wcc_data(error, &nmrep, np, &premtime, &wccpostattr, &xid);
                if (nfstimespeccmp(&np->n_mtime, &premtime, ==)) {
                        updatemtime = 1;
@@ -3891,7 +3964,7 @@ nfs3_vnop_mknod(
        struct nfs_vattr nvattr;
        fhandle_t fh;
        int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0;
-       struct timespec premtime = { 0, 0 };
+       struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 };
        u_int32_t rdev;
        u_int64_t xid = 0, dxid;
        int nfsvers, gotuid, gotgid;
@@ -3942,7 +4015,7 @@ nfs3_vnop_mknod(
        nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp);
        if (nfsvers == NFS_VER3) {
                nfsm_chain_add_32(error, &nmreq, vtonfs_type(vap->va_type, nfsvers));
-               nfsm_chain_add_v3sattr(error, &nmreq, vap);
+               nfsm_chain_add_v3sattr(nmp, error, &nmreq, vap);
                if (vap->va_type == VCHR || vap->va_type == VBLK) {
                        nfsm_chain_add_32(error, &nmreq, major(vap->va_rdev));
                        nfsm_chain_add_32(error, &nmreq, minor(vap->va_rdev));
@@ -3972,7 +4045,7 @@ nfs3_vnop_mknod(
                        dnp->n_flag &= ~NNEGNCENTRIES;
                        cache_purge_negatives(dvp);
                }
-               error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
+               error = nfsm_chain_get_fh_attr(nmp, &nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
        }
        if (nfsvers == NFS_VER3) {
                nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &dxid);
@@ -4054,19 +4127,22 @@ nfs3_vnop_create(
        nfsnode_t dnp = VTONFS(dvp);
        vnode_t newvp = NULL;
        int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0, fmode = 0;
-       struct timespec premtime = { 0, 0 };
+       struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 };
        int nfsvers, gotuid, gotgid;
        u_int64_t xid, dxid;
        uint32_t val;
        struct nfsm_chain nmreq, nmrep;
        struct nfsreq rq, *req = &rq;
        struct nfs_dulookup dul;
+       int dul_in_progress = 0;
+       int namedattrs;
 
        nmp = VTONMP(dvp);
        if (nfs_mount_gone(nmp)) {
                return ENXIO;
        }
        nfsvers = nmp->nm_vers;
+       namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR);
 
        if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) {
                return ENAMETOOLONG;
@@ -4083,7 +4159,8 @@ nfs3_vnop_create(
        gotuid = VATTR_IS_ACTIVE(vap, va_uid);
        gotgid = VATTR_IS_ACTIVE(vap, va_gid);
 
-       if (vap->va_vaflags & VA_EXCLUSIVE) {
+       if (vap->va_vaflags & VA_EXCLUSIVE) {
                fmode |= O_EXCL;
                if (!VATTR_IS_ACTIVE(vap, va_access_time) || !VATTR_IS_ACTIVE(vap, va_modify_time)) {
                        vap->va_vaflags |= VA_UTIMES_NULL;
@@ -4092,7 +4169,9 @@ nfs3_vnop_create(
 
 again:
        error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx));
-       nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
+       if (!namedattrs) {
+               nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
+       }
 
        nfsm_chain_null(&nmreq);
        nfsm_chain_null(&nmrep);
@@ -4117,7 +4196,7 @@ again:
                        nfsm_chain_add_32(error, &nmreq, create_verf);
                } else {
                        nfsm_chain_add_32(error, &nmreq, NFS_CREATE_UNCHECKED);
-                       nfsm_chain_add_v3sattr(error, &nmreq, vap);
+                       nfsm_chain_add_v3sattr(nmp, error, &nmreq, vap);
                }
        } else {
                nfsm_chain_add_v2sattr(error, &nmreq, vap, 0);
@@ -4128,7 +4207,10 @@ again:
        error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_CREATE,
            vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req);
        if (!error) {
-               nfs_dulookup_start(&dul, dnp, ctx);
+               if (!namedattrs) {
+                       nfs_dulookup_start(&dul, dnp, ctx);
+                       dul_in_progress = 1;
+               }
                error = nfs_request_async_finish(req, &nmrep, &xid, &status);
        }
 
@@ -4141,7 +4223,7 @@ again:
                        dnp->n_flag &= ~NNEGNCENTRIES;
                        cache_purge_negatives(dvp);
                }
-               error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
+               error = nfsm_chain_get_fh_attr(nmp, &nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
        }
        if (nfsvers == NFS_VER3) {
                nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &dxid);
@@ -4174,7 +4256,9 @@ nfsmout:
                newvp = NFSTOV(np);
        }
 
-       nfs_dulookup_finish(&dul, dnp, ctx);
+       if (dul_in_progress) {
+               nfs_dulookup_finish(&dul, dnp, ctx);
+       }
        if (!busyerror) {
                nfs_node_clear_busy(dnp);
        }
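
This create path introduces the pairing repeated in the symlink, mkdir, and rmdir hunks below: skip the parallel dot-underscore lookup when the server supports named attributes, and let a dul_in_progress flag guarantee that finish() runs only if start() did. A minimal model:

    #include <stdbool.h>
    #include <stdio.h>

    static void dul_start(void)  { printf("start parallel ._ lookup\n"); }
    static void dul_finish(void) { printf("reap parallel ._ lookup\n"); }

    static void
    do_op(bool namedattrs)
    {
            bool dul_in_progress = false;
            if (!namedattrs) {
                    dul_start();
                    dul_in_progress = true;
            }
            /* ... issue the real RPC here ... */
            if (dul_in_progress) {
                    dul_finish();
            }
    }

    int
    main(void)
    {
            do_op(false);   /* no named attrs: ._ files may exist */
            do_op(true);    /* named attrs: skip the extra lookup */
            return 0;
    }
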
@@ -4320,11 +4404,11 @@ again:
                        }
                        goto again_relock;
                }
-
+#if CONFIG_NFS4
                if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK)) {
                        nfs4_delegation_return(np, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx));
                }
-
+#endif
                /*
                 * Purge the name cache so that the chance of a lookup for
                 * the name succeeding while the remove is in progress is
@@ -4440,7 +4524,7 @@ nfs3_remove_rpc(
        kauth_cred_t cred)
 {
        int error = 0, lockerror = ENOENT, status, wccpostattr = 0;
-       struct timespec premtime = { 0, 0 };
+       struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 };
        struct nfsmount *nmp;
        int nfsvers;
        u_int64_t xid;
@@ -4581,10 +4665,12 @@ nfs_vnop_rename(
                        /* sillyrename succeeded.*/
                        tvp = NULL;
                }
-       } else if (tvp && (nmp->nm_vers >= NFS_VER4) && (tnp->n_openflags & N_DELEG_MASK)) {
+       }
+#if CONFIG_NFS4
+       else if (tvp && (nmp->nm_vers >= NFS_VER4) && (tnp->n_openflags & N_DELEG_MASK)) {
                nfs4_delegation_return(tnp, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx));
        }
-
+#endif
        error = nmp->nm_funcs->nf_rename_rpc(fdnp, fcnp->cn_nameptr, fcnp->cn_namelen,
            tdnp, tcnp->cn_nameptr, tcnp->cn_namelen, ctx);
 
@@ -4685,7 +4771,7 @@ nfs3_rename_rpc(
        vfs_context_t ctx)
 {
        int error = 0, lockerror = ENOENT, status, fwccpostattr = 0, twccpostattr = 0;
-       struct timespec fpremtime = { 0, 0 }, tpremtime = { 0, 0 };
+       struct timespec fpremtime = { .tv_sec = 0, .tv_nsec = 0 }, tpremtime = { .tv_sec = 0, .tv_nsec = 0 };
        struct nfsmount *nmp;
        int nfsvers;
        u_int64_t xid, txid;
@@ -4770,7 +4856,7 @@ nfs3_vnop_link(
        vnode_t tdvp = ap->a_tdvp;
        struct componentname *cnp = ap->a_cnp;
        int error = 0, lockerror = ENOENT, status, wccpostattr = 0, attrflag = 0;
-       struct timespec premtime = { 0, 0 };
+       struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 };
        struct nfsmount *nmp;
        nfsnode_t np = VTONFS(vp);
        nfsnode_t tdnp = VTONFS(tdvp);
@@ -4880,7 +4966,7 @@ nfs3_vnop_symlink(
        struct nfs_vattr nvattr;
        fhandle_t fh;
        int slen, error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0;
-       struct timespec premtime = { 0, 0 };
+       struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 };
        vnode_t newvp = NULL;
        int nfsvers, gotuid, gotgid;
        u_int64_t xid = 0, dxid;
@@ -4890,12 +4976,15 @@ nfs3_vnop_symlink(
        struct nfsm_chain nmreq, nmrep;
        struct nfsreq rq, *req = &rq;
        struct nfs_dulookup dul;
+       int namedattrs;
+       int dul_in_progress = 0;
 
        nmp = VTONMP(dvp);
        if (nfs_mount_gone(nmp)) {
                return ENXIO;
        }
        nfsvers = nmp->nm_vers;
+       namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR);
 
        slen = strlen(ap->a_target);
        if ((nfsvers == NFS_VER2) &&
@@ -4915,7 +5004,9 @@ nfs3_vnop_symlink(
        gotgid = VATTR_IS_ACTIVE(vap, va_gid);
 
        error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx));
-       nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
+       if (!namedattrs) {
+               nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
+       }
 
        nfsm_chain_null(&nmreq);
        nfsm_chain_null(&nmrep);
@@ -4926,7 +5017,7 @@ nfs3_vnop_symlink(
        nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize);
        nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp);
        if (nfsvers == NFS_VER3) {
-               nfsm_chain_add_v3sattr(error, &nmreq, vap);
+               nfsm_chain_add_v3sattr(nmp, error, &nmreq, vap);
        }
        nfsm_chain_add_name(error, &nmreq, ap->a_target, slen, nmp);
        if (nfsvers == NFS_VER2) {
@@ -4938,7 +5029,10 @@ nfs3_vnop_symlink(
        error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_SYMLINK,
            vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req);
        if (!error) {
-               nfs_dulookup_start(&dul, dnp, ctx);
+               if (!namedattrs) {
+                       nfs_dulookup_start(&dul, dnp, ctx);
+                       dul_in_progress = 1;
+               }
                error = nfs_request_async_finish(req, &nmrep, &xid, &status);
        }
 
@@ -4952,7 +5046,7 @@ nfs3_vnop_symlink(
                        cache_purge_negatives(dvp);
                }
                if (nfsvers == NFS_VER3) {
-                       error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
+                       error = nfsm_chain_get_fh_attr(nmp, &nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
                } else {
                        fh.fh_len = 0;
                }
@@ -4985,7 +5079,9 @@ nfsmout:
                newvp = NFSTOV(np);
        }
 
-       nfs_dulookup_finish(&dul, dnp, ctx);
+       if (dul_in_progress) {
+               nfs_dulookup_finish(&dul, dnp, ctx);
+       }
 
        /*
         * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry
@@ -5052,19 +5148,23 @@ nfs3_vnop_mkdir(
        nfsnode_t dnp = VTONFS(dvp);
        vnode_t newvp = NULL;
        int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0;
-       struct timespec premtime = { 0, 0 };
+       struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 };
        int nfsvers, gotuid, gotgid;
        u_int64_t xid = 0, dxid;
        fhandle_t fh;
        struct nfsm_chain nmreq, nmrep;
        struct nfsreq rq, *req = &rq;
        struct nfs_dulookup dul;
+       int namedattrs;
+       int dul_in_progress = 0;
 
        nmp = VTONMP(dvp);
        if (nfs_mount_gone(nmp)) {
                return ENXIO;
        }
        nfsvers = nmp->nm_vers;
+       namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR);
+
        if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) {
                return ENAMETOOLONG;
        }
@@ -5081,7 +5181,9 @@ nfs3_vnop_mkdir(
        gotgid = VATTR_IS_ACTIVE(vap, va_gid);
 
        error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx));
-       nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
+       if (!namedattrs) {
+               nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
+       }
 
        nfsm_chain_null(&nmreq);
        nfsm_chain_null(&nmrep);
@@ -5092,7 +5194,7 @@ nfs3_vnop_mkdir(
        nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize);
        nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp);
        if (nfsvers == NFS_VER3) {
-               nfsm_chain_add_v3sattr(error, &nmreq, vap);
+               nfsm_chain_add_v3sattr(nmp, error, &nmreq, vap);
        } else {
                nfsm_chain_add_v2sattr(error, &nmreq, vap, -1);
        }
@@ -5102,7 +5204,10 @@ nfs3_vnop_mkdir(
        error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_MKDIR,
            vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req);
        if (!error) {
-               nfs_dulookup_start(&dul, dnp, ctx);
+               if (!namedattrs) {
+                       nfs_dulookup_start(&dul, dnp, ctx);
+                       dul_in_progress = 1;
+               }
                error = nfs_request_async_finish(req, &nmrep, &xid, &status);
        }
 
@@ -5115,7 +5220,7 @@ nfs3_vnop_mkdir(
                        dnp->n_flag &= ~NNEGNCENTRIES;
                        cache_purge_negatives(dvp);
                }
-               error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
+               error = nfsm_chain_get_fh_attr(nmp, &nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
        }
        if (nfsvers == NFS_VER3) {
                nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &dxid);
@@ -5145,7 +5250,9 @@ nfsmout:
                newvp = NFSTOV(np);
        }
 
-       nfs_dulookup_finish(&dul, dnp, ctx);
+       if (dul_in_progress) {
+               nfs_dulookup_finish(&dul, dnp, ctx);
+       }
 
        /*
         * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry
@@ -5206,7 +5313,7 @@ nfs3_vnop_rmdir(
        vnode_t dvp = ap->a_dvp;
        struct componentname *cnp = ap->a_cnp;
        int error = 0, lockerror = ENOENT, status, wccpostattr = 0;
-       struct timespec premtime = { 0, 0 };
+       struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 };
        struct nfsmount *nmp;
        nfsnode_t np = VTONFS(vp);
        nfsnode_t dnp = VTONFS(dvp);
@@ -5215,12 +5322,16 @@ nfs3_vnop_rmdir(
        struct nfsm_chain nmreq, nmrep;
        struct nfsreq rq, *req = &rq;
        struct nfs_dulookup dul;
+       int namedattrs;
+       int dul_in_progress = 0;
 
        nmp = VTONMP(vp);
        if (nfs_mount_gone(nmp)) {
                return ENXIO;
        }
        nfsvers = nmp->nm_vers;
+       namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR);
+
        if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) {
                return ENAMETOOLONG;
        }
@@ -5229,7 +5340,9 @@ nfs3_vnop_rmdir(
                return error;
        }
 
-       nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
+       if (!namedattrs) {
+               nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
+       }
 
        nfsm_chain_null(&nmreq);
        nfsm_chain_null(&nmrep);
@@ -5244,7 +5357,10 @@ nfs3_vnop_rmdir(
        error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_RMDIR,
            vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req);
        if (!error) {
-               nfs_dulookup_start(&dul, dnp, ctx);
+               if (!namedattrs) {
+                       nfs_dulookup_start(&dul, dnp, ctx);
+                       dul_in_progress = 1;
+               }
                error = nfs_request_async_finish(req, &nmrep, &xid, &status);
        }
 
@@ -5272,7 +5388,9 @@ nfsmout:
                /* nfs_getattr() will check changed and purge caches */
                nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED);
        }
-       nfs_dulookup_finish(&dul, dnp, ctx);
+       if (dul_in_progress) {
+               nfs_dulookup_finish(&dul, dnp, ctx);
+       }
        nfs_node_clear_busy2(dnp, np);
 
        /*
@@ -5366,12 +5484,12 @@ nfs_vnop_readdir(
        if (uio_resid(uio) == 0) {
                return 0;
        }
-
+#if CONFIG_NFS4
        if ((nfsvers >= NFS_VER4) && (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) {
                /* trigger directories should never be read, return nothing */
                return 0;
        }
-
+#endif
        thd = vfs_context_thread(ctx);
        numdirent = done = 0;
        nextcookie = uio_offset(uio);
@@ -6250,7 +6368,7 @@ nextbuffer:
                                nfsmout_if(error);
                                if (attrflag) {
                                        /* grab attributes */
-                                       error = nfs_parsefattr(&nmrep, NFS_VER3, nvattrp);
+                                       error = nfs_parsefattr(nmp, &nmrep, NFS_VER3, nvattrp);
                                        nfsmout_if(error);
                                        dp->d_type = IFTODT(VTTOIF(nvattrp->nva_type));
                                        /* fileid is already in d_fileno, so stash xid in attrs */
@@ -6521,13 +6639,13 @@ nfs3_lookup_rpc_async_finish(
 
        /* get the attributes */
        if (nfsvers == NFS_VER3) {
-               nfsm_chain_postop_attr_get(error, &nmrep, attrflag, nvap);
+               nfsm_chain_postop_attr_get(nmp, error, &nmrep, attrflag, nvap);
                nfsm_chain_postop_attr_update(error, &nmrep, dnp, &xid);
                if (!error && !attrflag) {
                        error = nfs3_getattr_rpc(NULL, NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, 0, ctx, nvap, xidp);
                }
        } else {
-               error = nfs_parsefattr(&nmrep, nfsvers, nvap);
+               error = nfs_parsefattr(nmp, &nmrep, nfsvers, nvap);
        }
 nfsmout:
        if (!lockerror) {
@@ -6771,7 +6889,7 @@ nfs3_commit_rpc(
 {
        struct nfsmount *nmp;
        int error = 0, lockerror, status, wccpostattr = 0, nfsvers;
-       struct timespec premtime = { 0, 0 };
+       struct timespec premtime = { .tv_sec = 0, .tv_nsec = 0 };
        u_int64_t xid, newwverf;
        uint32_t count32;
        struct nfsm_chain nmreq, nmrep;
@@ -7039,7 +7157,9 @@ nfs_vnop_pathconf(
                } else {
                        nfsap = &nmp->nm_fsattr;
                }
-       } else if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_HOMOGENEOUS)) {
+       }
+#if CONFIG_NFS4
+       else if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_HOMOGENEOUS)) {
                /* no pathconf info cached */
                lck_mtx_unlock(&nmp->nm_lock);
                NFS_CLEAR_ATTRIBUTES(nfsa.nfsa_bitmap);
@@ -7053,16 +7173,19 @@ nfs_vnop_pathconf(
                }
                lck_mtx_lock(&nmp->nm_lock);
                nfsap = &nfsa;
-       } else {
+       }
+#endif
+       else {
                nfsap = &nmp->nm_fsattr;
        }
-
        switch (ap->a_name) {
        case _PC_LINK_MAX:
                if (NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_MAXLINK)) {
                        *ap->a_retval = nfsap->nfsa_maxlink;
+#if CONFIG_NFS4
                } else if ((nmp->nm_vers == NFS_VER4) && NFS_BITMAP_ISSET(np->n_vattr.nva_bitmap, NFS_FATTR_MAXLINK)) {
                        *ap->a_retval = np->n_vattr.nva_maxlink;
+#endif
                } else {
                        error = EINVAL;
                }
@@ -7390,14 +7513,15 @@ nfs_vnop_ioctl(
        vfs_context_t ctx = ap->a_context;
        vnode_t vp = ap->a_vp;
        struct nfsmount *mp = VTONMP(vp);
+       int error = ENOTTY;
+#if CONFIG_NFS_GSS
        struct user_nfs_gss_principal gprinc = {};
        uint32_t len;
-       int error = ENOTTY;
+#endif
 
        if (mp == NULL) {
                return ENXIO;
        }
-
        switch (ap->a_command) {
        case F_FULLFSYNC:
                if (vnode_vfsisrdonly(vp)) {
@@ -7405,6 +7529,7 @@ nfs_vnop_ioctl(
                }
                error = nfs_flush(VTONFS(vp), MNT_WAIT, vfs_context_thread(ctx), 0);
                break;
+#if CONFIG_NFS_GSS
        case NFS_IOC_DESTROY_CRED:
                if (!auth_is_kerberized(mp->nm_auth)) {
                        return ENOTSUP;
@@ -7499,6 +7624,7 @@ nfs_vnop_ioctl(
                if (gprinc.principal) {
                        FREE(gprinc.principal, M_TEMP);
                }
+#endif /* CONFIG_NFS_GSS */
        }
 
        return error;
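
The ioctl hunks above also move the GSS-only locals (`gprinc`, `len`) under `#if CONFIG_NFS_GSS`, so non-GSS builds compile without unused-variable warnings. A minimal sketch of the same discipline (macro, commands, and names hypothetical):

#define HAVE_GSS 0              /* assumption: feature disabled */

int handle_ioctl(int cmd)
{
	int error = -1;         /* shared by every command */
#if HAVE_GSS
	const char *principal = 0;  /* used only by the GSS command */
#endif

	switch (cmd) {
	case 0:                 /* always-available command */
		error = 0;
		break;
#if HAVE_GSS
	case 1:                 /* GSS command, gated with its locals */
		principal = "user@REALM";
		error = principal[0] ? 0 : 1;
		break;
#endif
	}
	return error;
}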
@@ -7561,7 +7687,10 @@ nfs_vnop_pagein(
 #define MAXPAGINGREQS   16      /* max outstanding RPCs for pagein/pageout */
        struct nfsreq *req[MAXPAGINGREQS];
        int nextsend, nextwait;
-       uint32_t stategenid = 0, restart = 0;
+#if CONFIG_NFS4
+       uint32_t stategenid = 0;
+#endif
+       uint32_t restart = 0;
        kern_return_t kret;
 
        FSDBG(322, np, f_offset, size, flags);
@@ -7611,9 +7740,11 @@ nfs_vnop_pagein(
        ioaddr += pl_offset;
 
 tryagain:
+#if CONFIG_NFS4
        if (nmp->nm_vers >= NFS_VER4) {
                stategenid = nmp->nm_stategenid;
        }
+#endif
        txsize = rxsize = size;
        txoffset = f_offset;
        rxaddr = ioaddr;
@@ -7649,6 +7780,7 @@ tryagain:
                        error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req[nextwait], uio, &retsize, NULL);
                        req[nextwait] = NULL;
                        nextwait = (nextwait + 1) % MAXPAGINGREQS;
+#if CONFIG_NFS4
                        if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) {
                                lck_mtx_lock(&nmp->nm_lock);
                                if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) {
@@ -7659,6 +7791,7 @@ tryagain:
                                restart++;
                                goto cancel;
                        }
+#endif
                        if (error) {
                                FSDBG(322, uio_offset(uio), uio_resid(uio), error, -1);
                                break;
@@ -7681,7 +7814,9 @@ tryagain:
        restart = 0;
 
        if (error) {
+#if CONFIG_NFS4
 cancel:
+#endif
                /* cancel any outstanding requests */
                while (req[nextwait]) {
                        nfs_request_async_cancel(req[nextwait]);
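
Note how the `cancel:` label itself is wrapped in `#if CONFIG_NFS4` above: every `goto cancel` sits inside the same guard, so NFSv3-only builds would otherwise warn about an unused label. A compilable sketch of the pattern (names and error values hypothetical):

#define HAVE_V4 1               /* flip to 0 for a v3-only build */

int run_transfer(int error, int needs_restart)
{
#if HAVE_V4
	if (needs_restart) {
		error = 11;         /* hypothetical "restart" error */
		goto cancel;
	}
#else
	(void)needs_restart;
#endif
	if (error) {
#if HAVE_V4
cancel:
#endif
		/* cleanup shared by the plain error path and the gated goto */
		error = -error;
	}
	return error;
}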
@@ -7885,7 +8020,10 @@ nfs_vnop_pageout(
        struct nfsreq *req[MAXPAGINGREQS];
        int nextsend, nextwait, wverfset, commit;
        uint64_t wverf, wverf2;
-       uint32_t stategenid = 0, vrestart = 0, restart = 0, vrestarts = 0, restarts = 0;
+#if CONFIG_NFS4
+       uint32_t stategenid = 0;
+#endif
+       uint32_t vrestart = 0, restart = 0, vrestarts = 0, restarts = 0;
        kern_return_t kret;
 
        FSDBG(323, f_offset, size, pl, pl_offset);
@@ -8081,9 +8219,11 @@ nfs_vnop_pageout(
            &uio_buf, sizeof(uio_buf));
 
 tryagain:
+#if CONFIG_NFS4
        if (nmp->nm_vers >= NFS_VER4) {
                stategenid = nmp->nm_stategenid;
        }
+#endif
        wverf = wverf2 = wverfset = 0;
        txsize = rxsize = xsize;
        txoffset = rxoffset = f_offset;
@@ -8132,6 +8272,7 @@ tryagain:
                        nfs_node_lock_force(np);
                        np->n_numoutput--;
                        nfs_node_unlock(np);
+#if CONFIG_NFS4
                        if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) {
                                lck_mtx_lock(&nmp->nm_lock);
                                if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) {
@@ -8142,6 +8283,7 @@ tryagain:
                                restart = 1;
                                goto cancel;
                        }
+#endif
                        if (error) {
                                FSDBG(323, rxoffset, rxsize, error, -1);
                                break;
@@ -8169,6 +8311,7 @@ tryagain:
                                uio_addiov(auio, CAST_USER_ADDR_T(rxaddr), remsize);
                                iomode = NFS_WRITE_UNSTABLE;
                                error = nfs_write_rpc2(np, auio, thd, cred, &iomode, &wverf2);
+#if CONFIG_NFS4
                                if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) {
                                        NP(np, "nfs_vnop_pageout: restart: error %d", error);
                                        lck_mtx_lock(&nmp->nm_lock);
@@ -8180,6 +8323,7 @@ tryagain:
                                        restart = 1;
                                        goto cancel;
                                }
+#endif
                                if (error) {
                                        FSDBG(323, rxoffset, rxsize, error, -1);
                                        break;
@@ -8394,7 +8538,7 @@ nfs_vnop_monitor(
                /* This vnode is no longer being monitored, make sure we're not tracking it. */
                /* Wait for any in-progress getattr to complete first. */
                while (np->n_mflag & NMMONSCANINPROG) {
-                       struct timespec ts = { 1, 0 };
+                       struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
                        np->n_mflag |= NMMONSCANWANT;
                        msleep(&np->n_mflag, &nmp->nm_lock, PZERO - 1, "nfswaitmonscan", &ts);
                }
@@ -8443,3 +8587,4 @@ nfs_vnode_notify(nfsnode_t np, uint32_t events)
        }
        vnode_notify(NFSTOV(np), events, vap);
 }
+
index 6f348ac36e48e7d213aef891ff5edd33e745fd12..b16669fc5940f2c4c46b0e1db788ba81490789d8 100644 (file)
@@ -82,7 +82,7 @@ int nfsm_chain_add_opaque_nopad_f(struct nfsm_chain *, const u_char *, uint32_t)
 int nfsm_chain_add_uio(struct nfsm_chain *, uio_t, uint32_t);
 int nfsm_chain_add_fattr4_f(struct nfsm_chain *, struct vnode_attr *, struct nfsmount *);
 int nfsm_chain_add_v2sattr_f(struct nfsm_chain *, struct vnode_attr *, uint32_t);
-int nfsm_chain_add_v3sattr_f(struct nfsm_chain *, struct vnode_attr *);
+int nfsm_chain_add_v3sattr_f(struct nfsmount *, struct nfsm_chain *, struct vnode_attr *);
 int nfsm_chain_add_string_nfc(struct nfsm_chain *, const uint8_t *, uint32_t);
 
 int nfsm_chain_advance(struct nfsm_chain *, uint32_t);
@@ -91,7 +91,7 @@ int nfsm_chain_reverse(struct nfsm_chain *, uint32_t);
 int nfsm_chain_get_opaque_pointer_f(struct nfsm_chain *, uint32_t, u_char **);
 int nfsm_chain_get_opaque_f(struct nfsm_chain *, uint32_t, u_char *);
 int nfsm_chain_get_uio(struct nfsm_chain *, uint32_t, uio_t);
-int nfsm_chain_get_fh_attr(struct nfsm_chain *, nfsnode_t,
+int nfsm_chain_get_fh_attr(struct nfsmount *, struct nfsm_chain *, nfsnode_t,
     vfs_context_t, int, uint64_t *, fhandle_t *, struct nfs_vattr *);
 int nfsm_chain_get_wcc_data_f(struct nfsm_chain *, nfsnode_t, struct timespec *, int *, u_int64_t *);
 int nfsm_chain_get_secinfo(struct nfsm_chain *, uint32_t *, int *);
@@ -415,10 +415,10 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *);
        } while (0)
 
 /* Add an NFSv3 "sattr" structure to an mbuf chain */
-#define nfsm_chain_add_v3sattr(E, NMC, VAP) \
+#define nfsm_chain_add_v3sattr(NMP, E, NMC, VAP) \
        do { \
                if (E) break; \
-               (E) = nfsm_chain_add_v3sattr_f((NMC), (VAP)); \
+               (E) = nfsm_chain_add_v3sattr_f((NMP), (NMC), (VAP)); \
        } while (0)
 
 /* Add an NFSv4 "fattr" structure to an mbuf chain */
@@ -664,13 +664,13 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *);
        } while (0)
 
 /* get postop attributes from an mbuf chain */
-#define nfsm_chain_postop_attr_get(E, NMC, F, VAP) \
+#define nfsm_chain_postop_attr_get(NMP, E, NMC, F, VAP) \
        do { \
                (F) = 0; \
                if ((E) || !(NMC)->nmc_mhead) break; \
                nfsm_chain_get_32((E), (NMC), (F)); \
                if ((E) || !(F)) break; \
-               if (((E) = nfs_parsefattr((NMC), NFS_VER3, (VAP)))) \
+               if (((E) = nfs_parsefattr((NMP), (NMC), NFS_VER3, (VAP)))) \
                        (F) = 0; \
        } while (0)
 
@@ -679,7 +679,7 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *);
 #define nfsm_chain_postop_attr_update_flag(E, NMC, NP, F, X) \
        do { \
                struct nfs_vattr ttvattr; \
-               nfsm_chain_postop_attr_get((E), (NMC), (F), &ttvattr); \
+               nfsm_chain_postop_attr_get(NFSTONMP(NP), (E), (NMC), (F), &ttvattr); \
                if ((E) || !(F)) break; \
                if (((E) = nfs_loadattrcache((NP), &ttvattr, (X), 1))) { \
                        (F) = 0; \
@@ -703,15 +703,28 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *);
                (E) = nfsm_chain_get_wcc_data_f((NMC), (NP), (PREMTIME), (NEWPOSTATTR), (X)); \
        } while (0)
 
+#if CONFIG_NFS4
+/* NFSv4 variant for loading attrs; compiles to a no-op unless CONFIG_NFS4 is enabled */
+#define __nfsm_chain_loadattr_v4(E, NMC, VERS, X, VATTR) \
+       do { \
+               (E) = nfs4_parsefattr((NMC), NULL, (VATTR), NULL, NULL, NULL); \
+       } while (0)
+#else
+#define __nfsm_chain_loadattr_v4(E, NMC, VERS, X, VATTR) \
+       do { \
+               break; \
+       } while (0)
+#endif
+
 /* update a node's attribute cache with attributes from an mbuf chain */
 #define nfsm_chain_loadattr(E, NMC, NP, VERS, X) \
        do { \
                struct nfs_vattr ttvattr; \
                if (E) break; \
                if ((VERS) == NFS_VER4) { \
-                       (E) = nfs4_parsefattr((NMC), NULL, &ttvattr, NULL, NULL, NULL); \
+                       __nfsm_chain_loadattr_v4((E), (NMC), (VERS), (X), &ttvattr); \
                } else { \
-                       (E) = nfs_parsefattr((NMC), (VERS), &ttvattr); \
+                       (E) = nfs_parsefattr(NFSTONMP(NP), (NMC), (VERS), &ttvattr); \
                } \
                if (!(E) && (NP)) \
                        (E) = nfs_loadattrcache((NP), &ttvattr, (X), 0); \
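
The `__nfsm_chain_loadattr_v4` helper above lets `nfsm_chain_loadattr` keep a single body: under CONFIG_NFS4 it forwards to the v4 parser, otherwise it expands to a no-op and the v4 branch is dead code. A standalone sketch of that shape (all names hypothetical, plain C):

#include <stdio.h>

#define HAVE_V4 0   /* assumption: v4 support compiled out */

static int parse_v3(const char *src, int *dst) { *dst = (int)src[0]; return 0; }

#if HAVE_V4
static int parse_v4(const char *src, int *dst) { *dst = -(int)src[0]; return 0; }
#define load_v4(err, src, dst)  do { (err) = parse_v4((src), (dst)); } while (0)
#else
/* No-op fallback: callers compile unchanged and the v4 branch is dead. */
#define load_v4(err, src, dst)  do { (void)(src); (void)(dst); } while (0)
#endif

#define load_attr(err, vers, src, dst) \
	do { \
		if ((vers) == 4) { \
			load_v4((err), (src), (dst)); \
		} else { \
			(err) = parse_v3((src), (dst)); \
		} \
	} while (0)

int main(void)
{
	int err = 0, out = 0;
	load_attr(err, 3, "attr", &out);
	printf("err=%d out=%d\n", err, out);
	return 0;
}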
index c9dc924dea840220e441e25feedfa3760e00471a..0743b8383f6d3b859f24812127a376369c2abdeb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -177,7 +177,9 @@ struct nfs_socket {
        int                     nso_error;              /* saved error/status */
        struct nfs_rpc_record_state nso_rrs;            /* RPC record parsing state (TCP) */
 };
+
 TAILQ_HEAD(nfssocketlist, nfs_socket);
+
 /* nso_flags */
 #define NSO_UPCALL              0x0001                  /* socket upcall in progress */
 #define NSO_DEAD                0x0002                  /* socket is dead */
@@ -337,6 +339,8 @@ struct nfsmount {
        uint8_t nm_sotype;              /* (preferred) type of socket */
        in_port_t       nm_nfsport;     /* NFS protocol port */
        in_port_t       nm_mountport;   /* MOUNT protocol port (v2/v3) */
+       char    *nm_nfs_localport;      /* Unix domain address (port) for nfs */
+       char    *nm_mount_localport;    /* Unix domain address (port) for mountd */
        struct nfs_socket_search *nm_nss; /* current socket search structure */
        struct nfs_socket *nm_nso;      /* current socket */
        struct sockaddr *nm_saddr;      /* Address of server */
index 81341cc91bab1a66ac2596c984057325f2da0da2..9562d61448ea021874b5f71a7e13a0a3658972f8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -98,7 +98,7 @@ struct nfsbuf {
        TAILQ_ENTRY(nfsbuf)     nb_free;        /* free list position if not active. */
        volatile uint32_t       nb_flags;       /* NB_* flags. */
        volatile uint32_t       nb_lflags;      /* NBL_* flags. */
-       volatile uint32_t       nb_refs;        /* outstanding references. */
+       os_refcnt_t             nb_refs;        /* outstanding references. */
        uint32_t                nb_bufsize;     /* buffer size */
        daddr64_t               nb_lblkno;      /* logical block number. */
        uint64_t                nb_verf;        /* V3 write verifier */
@@ -300,7 +300,8 @@ struct nfsdmap {
 #define NFSTIME_CHANGE  2       /* time file changed */
 #define NFSTIME_CREATE  3       /* time file created */
 #define NFSTIME_BACKUP  4       /* time of last backup */
-#define NFSTIME_COUNT   5
+#define NFSTIME_ADDED   5       /* time added (FPnfs only) */
+#define NFSTIME_COUNT   6
 
 #define NFS_COMPARE_MTIME(TVP, NVAP, CMP) \
        (((TVP)->tv_sec == (NVAP)->nva_timesec[NFSTIME_MODIFY]) ?       \
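
The NFSTIME hunk above shows the index-before-COUNT convention: the new `NFSTIME_ADDED` slot is inserted ahead of `NFSTIME_COUNT`, so arrays such as `nva_timesec[NFSTIME_COUNT]` grow automatically. A tiny sketch (names loosely mirror the header but are not taken from it):

#include <stdio.h>

enum {
	T_ACCESS, T_MODIFY, T_CHANGE, T_CREATE, T_BACKUP,
	T_ADDED,        /* new kind: inserted before the sentinel */
	T_COUNT         /* must stay last: sizes every parallel array */
};

int main(void)
{
	long timesec[T_COUNT] = { 0 };  /* resizes with the enum */
	timesec[T_ADDED] = 1580000000L;
	printf("%zu slots, added=%ld\n",
	    sizeof timesec / sizeof timesec[0], timesec[T_ADDED]);
	return 0;
}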
@@ -332,6 +333,11 @@ struct nfs_vattr {
        int64_t         nva_timesec[NFSTIME_COUNT];
        int32_t         nva_timensec[NFSTIME_COUNT];
        uint32_t        nva_bitmap[NFS_ATTR_BITMAP_LEN]; /* attributes that are valid */
+
+       /* FPnfs only. */
+       uint32_t        nva_bsd_flags;  /* BSD flags */
+       uint64_t        nva_parentid;   /* parent file id */
+       uint64_t        nva_allocsize;  /* size allocated on disk */
 };
 
 /* nva_flags */
@@ -341,6 +347,10 @@ struct nfs_vattr {
 #define NFS_FFLAG_TRIGGER               0x0008  /* node is a trigger/mirror mount point */
 #define NFS_FFLAG_TRIGGER_REFERRAL      0x0010  /* trigger is a referral */
 #define NFS_FFLAG_IS_ATTR               0x8000  /* file is a named attribute file/directory */
+/* FPnfs only */
+#define NFS_FFLAG_FPNFS_BSD_FLAGS   0x01000000
+#define NFS_FFLAG_FPNFS_PARENTID    0x02000000
+#define NFS_FFLAG_FPNFS_ADDEDTIME   0x04000000
 
 /* flags for nfs_getattr() */
 #define NGA_CACHED      0x0001  /* use cached attributes (if still valid) */
@@ -692,6 +702,7 @@ struct nfsnode {
 #define NISMAPPED       0x10000 /* node is mmapped   */
 #define NREFRESH        0x20000 /* node's fh needs to be refreshed */
 #define NREFRESHWANT    0x40000 /* Waiting for fh to be refreshed */
+#define NDISARMTRIGGER  0x80000 /* Ignore node's mirror mount trigger */
 
 /*
  * Flags for n_hflag
@@ -793,11 +804,13 @@ extern lck_mtx_t *nfsiod_mutex;
 typedef int     vnop_t(void *);
 extern  vnop_t  **fifo_nfsv2nodeop_p;
 extern  vnop_t  **nfsv2_vnodeop_p;
+extern  vnop_t  **fpnfs_vnodeop_p;
 extern  vnop_t  **spec_nfsv2nodeop_p;
+#if CONFIG_NFS4
 extern  vnop_t  **fifo_nfsv4nodeop_p;
 extern  vnop_t  **nfsv4_vnodeop_p;
 extern  vnop_t  **spec_nfsv4nodeop_p;
-
+#endif
 /*
  * Prototypes for NFS vnode operations
  */
@@ -875,7 +888,7 @@ int nfs_flushcommits(nfsnode_t, int);
 int nfs_flush(nfsnode_t, int, thread_t, int);
 void nfs_buf_delwri_push(int);
 void nfs_buf_delwri_service(void);
-void nfs_buf_delwri_thread(void *, wait_result_t);;
+void nfs_buf_delwri_thread(void *, wait_result_t);
 
 int nfsiod_start(void);
 void nfsiod_terminate(struct nfsiod *);
index b45f35145fa454419fa41574bbf710ec3ff2f585..1ade820c33e0a4fa00e1ad7c86ab70d4d919686f 100644 (file)
 #define NFS_V2MAXDATA   8192
 #define NFS_MAXDGRAMDATA 16384
 #define NFS_PREFDGRAMDATA 8192
-#define NFS_MAXDATA     (64*1024) // XXX not ready for >64K
+
+#ifdef XNU_TARGET_OS_IOS
+#define NFS_MAXDATA     (32 * PAGE_SIZE) /* Same as NFS_MAXBSIZE from nfsnode.h */
+#else /* !XNU_TARGET_OS_IOS */
+#define NFS_MAXDATA     (64*1024)
+#endif /* XNU_TARGET_OS_IOS */
+
 #define NFSRV_MAXDATA   (64*1024) // XXX not ready for >64K
 #define NFS_MAXPATHLEN  1024
 #define NFS_MAXNAMLEN   255
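
For scale, assuming the usual 16 KiB arm64 page size on iOS devices, the new define works out to 32 * 16384 = 524288 bytes (512 KiB), versus the fixed 64 KiB kept for other targets. A quick check (the page size here is an illustrative assumption, not the kernel's `PAGE_SIZE`):

#include <stdio.h>

#define PAGE_SIZE_ASSUMED 16384                 /* illustration only */
#define MAXDATA_IOS (32 * PAGE_SIZE_ASSUMED)    /* 32 pages */
#define MAXDATA_OSX (64 * 1024)

int main(void)
{
	printf("ios: %d bytes (%d KiB)\n", MAXDATA_IOS, MAXDATA_IOS / 1024);
	printf("osx: %d bytes (%d KiB)\n", MAXDATA_OSX, MAXDATA_OSX / 1024);
	return 0;
}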
@@ -348,9 +354,9 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5,
  * NFS attribute management stuff
  */
 #define NFS_ATTR_BITMAP_LEN     2
-#define NFS_BITMAP_SET(B, I)    (((uint32_t *)(B))[(I)/32] |= 1<<((I)%32))
-#define NFS_BITMAP_CLR(B, I)    (((uint32_t *)(B))[(I)/32] &= ~(1<<((I)%32)))
-#define NFS_BITMAP_ISSET(B, I)  (((uint32_t *)(B))[(I)/32] & (1<<((I)%32)))
+#define NFS_BITMAP_SET(B, I)    (((uint32_t *)(B))[(I)/32] |= 1U<<((I)%32))
+#define NFS_BITMAP_CLR(B, I)    (((uint32_t *)(B))[(I)/32] &= ~(1U<<((I)%32)))
+#define NFS_BITMAP_ISSET(B, I)  (((uint32_t *)(B))[(I)/32] & (1U<<((I)%32)))
 #define NFS_BITMAP_ZERO(B, L) \
        do { \
                int __i; \
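
The `1` → `1U` change above matters at bit 31: `1 << 31` shifts into a signed int's sign bit, which is undefined behavior in C, while `1U << 31` is well defined. A runnable sketch using the same macro shape:

#include <stdint.h>
#include <stdio.h>

#define BMAP_SET(B, I)    (((uint32_t *)(B))[(I)/32] |= 1U << ((I)%32))
#define BMAP_ISSET(B, I)  (((uint32_t *)(B))[(I)/32] & (1U << ((I)%32)))

int main(void)
{
	uint32_t bm[2] = { 0, 0 };
	BMAP_SET(bm, 31);       /* bit 31: UB with a signed 1 << 31 */
	BMAP_SET(bm, 33);       /* lands in bm[1] */
	printf("%d %d %d\n", !!BMAP_ISSET(bm, 31), !!BMAP_ISSET(bm, 32),
	    !!BMAP_ISSET(bm, 33));   /* prints: 1 0 1 */
	return 0;
}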
index 2a5bc9c6f740d729acb1114b1e0043512ca9113e..f23f98572a6570f2d1f1fd2ed156677648dc5c31 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define RQUOTA_STAT_NOQUOTA     2
 #define RQUOTA_STAT_EPERM       3
 
+/* Local transports for rpcbind */
+#define RPCB_TICOTSORD_PATH "/var/run/rpcb.ticotsord"
+#define RPCB_TICLTS_PATH "/var/run/rpcb.ticlts"
+
+/* Local transport for nfs */
+#define NFS_TICOTSORD_PATH "/var/run/nfs.ticotsord"
+#define NFS_TICLTS_PATH "/var/run/nfs.ticlts"
+
 #endif /* __APPLE_API_PRIVATE */
 #endif /* _NFS_RPCV2_H_ */
index 24295f4875ddd28eb64549cbe528b150115e664c..36e4dc9292b941eae07c473ab6f67d508456aa10 100644 (file)
  *
  * generalized functionality for managing the building/dissecting of XDR data
  */
-typedef enum xdrbuf_type { XDRBUF_BUFFER=1 } xdrbuf_type;
+typedef enum xdrbuf_type {
+       XDRBUF_NONE   = 0,
+       XDRBUF_BUFFER = 1,
+} xdrbuf_type;
 
 struct xdrbuf {
        union {
@@ -192,6 +195,8 @@ xb_cleanup(struct xdrbuf *xbp)
                        xb_free(xbp->xb_u.xb_buffer.xbb_base);
                }
                break;
+       default:
+               break;
        }
        xbp->xb_flags &= ~XB_CLEANUP;
 }
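
Two related hardening moves in the xdrbuf hunks: the enum gains an explicit `XDRBUF_NONE = 0` so a zero-initialized xdrbuf no longer aliases a valid buffer type, and each `switch` over the tag gains a `default:` arm. A standalone sketch of the idea (simplified, hypothetical names):

#include <stdio.h>
#include <string.h>

typedef enum buf_type {
	BUF_NONE   = 0,   /* matches a zeroed struct: no valid tag implied */
	BUF_BUFFER = 1,
} buf_type;

struct buf { buf_type type; size_t len; };

static void cleanup(struct buf *b)
{
	switch (b->type) {
	case BUF_BUFFER:
		b->len = 0;   /* real teardown would free storage here */
		break;
	default:          /* BUF_NONE and future tags fall through safely */
		break;
	}
}

int main(void)
{
	struct buf b;
	memset(&b, 0, sizeof b);  /* type == BUF_NONE, cleanup is a no-op */
	cleanup(&b);
	printf("type=%d len=%zu\n", (int)b.type, b.len);
	return 0;
}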
@@ -207,6 +212,8 @@ xb_set_cur_buf_len(struct xdrbuf *xbp)
        case XDRBUF_BUFFER:
                xbp->xb_u.xb_buffer.xbb_len = xbp->xb_ptr - xbp->xb_u.xb_buffer.xbb_base;
                break;
+       default:
+               break;
        }
 }
 
@@ -244,6 +251,8 @@ xb_offset(struct xdrbuf *xbp)
        case XDRBUF_BUFFER:
                offset = xbp->xb_ptr - xbp->xb_u.xb_buffer.xbb_base;
                break;
+       default:
+               break;
        }
 
        return offset;
@@ -260,6 +269,8 @@ xb_seek(struct xdrbuf *xbp, uint32_t offset)
                xbp->xb_ptr = xbp->xb_u.xb_buffer.xbb_base + offset;
                xbp->xb_left = xbp->xb_u.xb_buffer.xbb_len - offset;
                break;
+       default:
+               break;
        }
 
        return 0;
@@ -323,6 +334,8 @@ xb_grow(struct xdrbuf *xbp)
                xbp->xb_ptr = newbuf + oldsize;
                xbp->xb_left = xbp->xb_growsize;
                break;
+       default:
+               break;
        }
 
        return 0;
index ef0643f8ac15a633e73df9bed7dd8d4d4452001c..4016262f6c8547994d19553f6f529a20f024eb94 100644 (file)
@@ -31,7 +31,7 @@ INSTALL_MI_DIR = pthread
 
 # /usr/local/include without PRIVATE stuff
 # /System/Library/Frameworks/System.framework/PrivateHeaders
-INCDIR = /usr/local/include
+INCDIR = $(SDKHEADERSROOT)/usr/local/include
 INSTALL_MI_LIST = ${DATAFILES}
 INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES}
 
index add1853ba24d24ec76a86eeef693a57d44becbf1..fa5f0fdc18f98e3206f3de5d2ac50a61727bf512 100644 (file)
@@ -56,6 +56,8 @@
 #define BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET          0x402
 /* bsdthread_ctl(BSDTHREAD_CTL_QOS_MAX_PARALLELISM, priority, flags, 0) */
 #define BSDTHREAD_CTL_QOS_MAX_PARALLELISM       0x800
+/* bsdthread_ctl(BSDTHREAD_CTL_WORKQ_ALLOW_KILL, enable, 0, 0) */
+#define BSDTHREAD_CTL_WORKQ_ALLOW_KILL 0x1000
 
 #define _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL 0x1
 #define _PTHREAD_QOS_PARALLELISM_REALTIME 0x2
index b73c0ad95bf4a6a57433a2ef7fb0273d02196074..dbfff7e5452f652eff34c45263e9566570a4e10e 100644 (file)
@@ -179,14 +179,21 @@ _pthread_default_priority(unsigned long flags)
        return _pthread_priority_make_from_thread_qos(THREAD_QOS_LEGACY, 0, flags);
 }
 
+__attribute__((always_inline, const))
+static inline thread_qos_t
+_pthread_priority_thread_qos_fast(pthread_priority_t pp)
+{
+       pp &= _PTHREAD_PRIORITY_QOS_CLASS_MASK;
+       pp >>= _PTHREAD_PRIORITY_QOS_CLASS_SHIFT;
+       return (thread_qos_t)__builtin_ffs((int)pp);
+}
+
 __attribute__((always_inline, const))
 static inline thread_qos_t
 _pthread_priority_thread_qos(pthread_priority_t pp)
 {
        if (_pthread_priority_has_qos(pp)) {
-               pp &= _PTHREAD_PRIORITY_QOS_CLASS_MASK;
-               pp >>= _PTHREAD_PRIORITY_QOS_CLASS_SHIFT;
-               return (thread_qos_t)__builtin_ffs((int)pp);
+               return _pthread_priority_thread_qos_fast(pp);
        }
        return THREAD_QOS_UNSPECIFIED;
 }
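
`_pthread_priority_thread_qos_fast` factors out the mask/shift/ffs decode so callers that already know the priority carries a QoS can skip the guard. A sketch of the decode with made-up mask values (the real constants live in the pthread priority headers; `__builtin_ffs` is the GCC/Clang builtin returning the 1-based index of the lowest set bit, 0 if none):

#include <stdio.h>

/* Hypothetical layout: one bit per QoS class in bits 8..13. */
#define PRIO_QOS_MASK   0x00003f00UL
#define PRIO_QOS_SHIFT  8

static int qos_fast(unsigned long pp)
{
	pp &= PRIO_QOS_MASK;            /* isolate the QoS bit */
	pp >>= PRIO_QOS_SHIFT;          /* move it to bit 0..5 */
	return __builtin_ffs((int)pp);  /* bit position == QoS class */
}

static int qos_checked(unsigned long pp)
{
	if (pp & PRIO_QOS_MASK) {       /* "has qos" guard */
		return qos_fast(pp);
	}
	return 0;                       /* QOS_UNSPECIFIED */
}

int main(void)
{
	printf("%d %d\n", qos_checked(0x0400UL), qos_checked(0));  /* 3 0 */
	return 0;
}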
index 2f3aadbf30abed02f035ac297a443213af004623..86e618e7dbc75b085ffd882ba535e4b657e67961 100644 (file)
 /* version number of the in-kernel shims given to pthread.kext */
 #define PTHREAD_SHIMS_VERSION 1
 
-/* on arm, the callbacks function has two #ifdef arm ponters */
+/* on arm, the callbacks function has two #ifdef arm pointers */
 #if defined(__arm__)
 #define PTHREAD_CALLBACK_MEMBER __unused_was_map_is_1gb
 #else
-#define PTHREAD_CALLBACK_MEMBER __unused_was_ml_get_max_cpus
+#define PTHREAD_CALLBACK_MEMBER kevent_workq_internal
 #endif
 
 /* compile time asserts to check the length of structures in pthread_shims.h */
@@ -255,7 +255,7 @@ static void
 psynch_wait_complete(uintptr_t kwq, struct turnstile **tstore)
 {
        assert(tstore);
-       turnstile_complete(kwq, tstore, NULL);
+       turnstile_complete(kwq, tstore, NULL, TURNSTILE_PTHREAD_MUTEX);
 }
 
 static void
@@ -270,7 +270,7 @@ psynch_wait_update_owner(uintptr_t kwq, thread_t owner,
        turnstile_update_inheritor(ts, owner,
            (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
        turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
-       turnstile_complete(kwq, tstore, NULL);
+       turnstile_complete(kwq, tstore, NULL, TURNSTILE_PTHREAD_MUTEX);
 }
 
 static void
@@ -300,7 +300,7 @@ psynch_wait_wakeup(uintptr_t kwq, struct ksyn_waitq_element *kwe,
                    uth->uu_thread, THREAD_AWAKENED);
 
                turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
-               turnstile_complete(kwq, tstore, NULL);
+               turnstile_complete(kwq, tstore, NULL, TURNSTILE_PTHREAD_MUTEX);
        } else {
                kr = thread_wakeup_thread((event_t)kwq, uth->uu_thread);
        }
@@ -481,11 +481,8 @@ kdp_pthread_get_thread_kwq(thread_t thread)
 }
 
 void
-thread_will_park_or_terminate(thread_t thread)
+thread_will_park_or_terminate(__unused thread_t thread)
 {
-       if (thread_owned_workloops_count(thread)) {
-               (void)kevent_exit_on_workloop_ownership_leak(thread);
-       }
 }
 
 /*
@@ -540,6 +537,8 @@ static const struct pthread_callbacks_s pthread_callbacks = {
        .thread_create = thread_create,
        .thread_resume = thread_resume,
 
+       .kevent_workq_internal = kevent_workq_internal,
+
        .convert_thread_to_port = convert_thread_to_port,
 
        .proc_get_stack_addr_hint = proc_get_stack_addr_hint,
index 0ad001488e90a97c7336fb5790688214a8d62cd1..c979f80eb7d1f89b4e286150a05dab0e63bd3114 100644 (file)
@@ -29,9 +29,6 @@
 
 #include <sys/cdefs.h>
 
-// <rdar://problem/26158937> panic() should be marked noreturn
-extern void panic(const char *string, ...) __printflike(1, 2) __dead2;
-
 #include <kern/assert.h>
 #include <kern/ast.h>
 #include <kern/clock.h>
@@ -82,10 +79,9 @@ extern void panic(const char *string, ...) __printflike(1, 2) __dead2;
 
 #include <os/log.h>
 
-extern thread_t port_name_to_thread(mach_port_name_t port_name); /* osfmk/kern/ipc_tt.h   */
-
 static void workq_unpark_continue(void *uth, wait_result_t wr) __dead2;
-static void workq_schedule_creator(proc_t p, struct workqueue *wq, int flags);
+static void workq_schedule_creator(proc_t p, struct workqueue *wq,
+    workq_kern_threadreq_flags_t flags);
 
 static bool workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
     workq_threadreq_t req);
@@ -116,6 +112,7 @@ static lck_attr_t     *workq_lck_attr;
 static lck_grp_attr_t *workq_lck_grp_attr;
 os_refgrp_decl(static, workq_refgrp, "workq", NULL);
 
+static struct mpsc_daemon_queue workq_deallocate_queue;
 static zone_t workq_zone_workqueue;
 static zone_t workq_zone_threadreq;
 
@@ -184,10 +181,10 @@ proc_init_wqptr_or_wait(struct proc *p)
        struct workqueue *wq;
 
        proc_lock(p);
-       wq = p->p_wqptr;
+       wq = os_atomic_load(&p->p_wqptr, relaxed);
 
        if (wq == NULL) {
-               p->p_wqptr = WQPTR_IS_INITING_VALUE;
+               os_atomic_store(&p->p_wqptr, WQPTR_IS_INITING_VALUE, relaxed);
                proc_unlock(p);
                return true;
        }
@@ -211,9 +208,7 @@ workq_parked_wait_event(struct uthread *uth)
 static inline void
 workq_thread_wakeup(struct uthread *uth)
 {
-       if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) {
-               thread_wakeup_thread(workq_parked_wait_event(uth), uth->uu_thread);
-       }
+       thread_wakeup_thread(workq_parked_wait_event(uth), uth->uu_thread);
 }
 
 #pragma mark wq_thactive
@@ -242,7 +237,7 @@ static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
 static inline wq_thactive_t
 _wq_thactive(struct workqueue *wq)
 {
-       return os_atomic_load(&wq->wq_thactive, relaxed);
+       return os_atomic_load_wide(&wq->wq_thactive, relaxed);
 }
 
 static inline int
@@ -323,7 +318,7 @@ _wq_thactive_move(struct workqueue *wq,
 {
        wq_thactive_t v = _wq_thactive_offset_for_qos(new_qos) -
            _wq_thactive_offset_for_qos(old_qos);
-       os_atomic_add_orig(&wq->wq_thactive, v, relaxed);
+       os_atomic_add(&wq->wq_thactive, v, relaxed);
        wq->wq_thscheduled_count[_wq_bucket(old_qos)]--;
        wq->wq_thscheduled_count[_wq_bucket(new_qos)]++;
 }
@@ -388,13 +383,6 @@ workq_is_exiting(struct proc *p)
        return !wq || _wq_exiting(wq);
 }
 
-struct turnstile *
-workq_turnstile(struct proc *p)
-{
-       struct workqueue *wq = proc_get_wqptr(p);
-       return wq ? wq->wq_turnstile : TURNSTILE_NULL;
-}
-
 #pragma mark workqueue lock
 
 static bool
@@ -450,7 +438,7 @@ workq_thread_needs_params_change(workq_threadreq_t req, struct uthread *uth)
        workq_threadreq_param_t cur_trp, req_trp = { };
 
        cur_trp.trp_value = uth->uu_save.uus_workq_park_data.workloop_params;
-       if (req->tr_flags & TR_FLAG_WL_PARAMS) {
+       if (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
                req_trp = kqueue_threadreq_workloop_param(req);
        }
 
@@ -537,7 +525,7 @@ workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth)
        assert(uth == current_uthread());
        workq_threadreq_param_t trp = { };
 
-       if (req && (req->tr_flags & TR_FLAG_WL_PARAMS)) {
+       if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
                trp = kqueue_threadreq_workloop_param(req);
        }
 
@@ -560,7 +548,7 @@ workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth)
 
 static void
 workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth,
-    workq_threadreq_t req)
+    workq_threadreq_t req, bool unpark)
 {
        thread_t th = uth->uu_thread;
        thread_qos_t qos = req ? req->tr_qos : WORKQ_THREAD_QOS_CLEANUP;
@@ -568,16 +556,18 @@ workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth,
        int priority = 31;
        int policy = POLICY_TIMESHARE;
 
-       if (req && (req->tr_flags & TR_FLAG_WL_PARAMS)) {
+       if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
                trp = kqueue_threadreq_workloop_param(req);
        }
 
        uth->uu_workq_pri = WORKQ_POLICY_INIT(qos);
        uth->uu_workq_flags &= ~UT_WORKQ_OUTSIDE_QOS;
-       uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
 
-       // qos sent out to userspace (may differ from uu_workq_pri on param threads)
-       uth->uu_save.uus_workq_park_data.qos = qos;
+       if (unpark) {
+               uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
+               // qos sent out to userspace (may differ from uu_workq_pri on param threads)
+               uth->uu_save.uus_workq_park_data.qos = qos;
+       }
 
        if (qos == WORKQ_THREAD_QOS_MANAGER) {
                uint32_t mgr_pri = wq->wq_event_manager_priority;
@@ -611,12 +601,12 @@ workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth,
  * every time a servicer is being told about a new max QoS.
  */
 void
-workq_thread_set_max_qos(struct proc *p, struct kqrequest *kqr)
+workq_thread_set_max_qos(struct proc *p, workq_threadreq_t kqr)
 {
        struct uu_workq_policy old_pri, new_pri;
-       struct uthread *uth = get_bsdthread_info(kqr->kqr_thread);
+       struct uthread *uth = current_uthread();
        struct workqueue *wq = proc_get_wqptr_fast(p);
-       thread_qos_t qos = kqr->kqr_qos_index;
+       thread_qos_t qos = kqr->tr_kq_qos_index;
 
        if (uth->uu_workq_pri.qos_max == qos) {
                return;
@@ -729,7 +719,9 @@ workq_death_policy_evaluate(struct workqueue *wq, uint16_t decrement)
                    wq, wq->wq_thidlecount, 0, 0, 0);
                wq->wq_thdying_count++;
                uth->uu_workq_flags |= UT_WORKQ_DYING;
-               workq_thread_wakeup(uth);
+               if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) {
+                       workq_thread_wakeup(uth);
+               }
                return;
        }
 
@@ -770,14 +762,15 @@ workq_kill_old_threads_call(void *param0, void *param1 __unused)
 
        workq_lock_spin(wq);
        WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_START, wq, 0, 0, 0, 0);
-       os_atomic_and(&wq->wq_flags, ~WQ_DEATH_CALL_SCHEDULED, relaxed);
+       os_atomic_andnot(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
        workq_death_policy_evaluate(wq, 0);
        WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_END, wq, 0, 0, 0, 0);
        workq_unlock(wq);
 }
 
 static struct uthread *
-workq_pop_idle_thread(struct workqueue *wq)
+workq_pop_idle_thread(struct workqueue *wq, uint8_t uu_flags,
+    bool *needs_wakeup)
 {
        struct uthread *uth;
 
@@ -790,13 +783,21 @@ workq_pop_idle_thread(struct workqueue *wq)
        TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
 
        assert((uth->uu_workq_flags & UT_WORKQ_RUNNING) == 0);
-       uth->uu_workq_flags |= UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT;
+       uth->uu_workq_flags |= UT_WORKQ_RUNNING | uu_flags;
+       if ((uu_flags & UT_WORKQ_OVERCOMMIT) == 0) {
+               wq->wq_constrained_threads_scheduled++;
+       }
        wq->wq_threads_scheduled++;
        wq->wq_thidlecount--;
 
        if (__improbable(uth->uu_workq_flags & UT_WORKQ_DYING)) {
                uth->uu_workq_flags ^= UT_WORKQ_DYING;
                workq_death_policy_evaluate(wq, 1);
+               *needs_wakeup = false;
+       } else if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) {
+               *needs_wakeup = false;
+       } else {
+               *needs_wakeup = true;
        }
        return uth;
 }
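
`workq_pop_idle_thread` now decides, while the workqueue lock is held, whether the popped thread still needs a wakeup, and hands that back through an out-parameter so the caller can finish setup before waking it. A reduced sketch of the contract (types hypothetical):

#include <stdbool.h>
#include <stdio.h>

struct worker { bool idle_cleanup; bool dying; };

static struct worker *pop_idle(struct worker *w, bool *needs_wakeup)
{
	/* dying or mid-cleanup threads are awake already, or woken elsewhere */
	*needs_wakeup = !(w->dying || w->idle_cleanup);
	return w;
}

int main(void)
{
	struct worker w = { .idle_cleanup = false, .dying = false };
	bool wake;
	struct worker *uth = pop_idle(&w, &wake);
	(void)uth;
	if (wake) {
		puts("wake the popped thread after setup");
	}
	return 0;
}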
@@ -814,6 +815,7 @@ workq_thread_init_and_wq_lock(task_t task, thread_t th)
        uth->uu_workq_pri = WORKQ_POLICY_INIT(THREAD_QOS_LEGACY);
        uth->uu_workq_thport = MACH_PORT_NULL;
        uth->uu_workq_stackaddr = 0;
+       uth->uu_workq_pthread_kill_allowed = 0;
 
        thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
        thread_reset_workq_qos(th, THREAD_QOS_LEGACY);
@@ -886,13 +888,13 @@ out:
 __attribute__((noreturn, noinline))
 static void
 workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq,
-    struct uthread *uth, uint32_t death_flags)
+    struct uthread *uth, uint32_t death_flags, uint32_t setup_flags)
 {
        thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
        bool first_use = uth->uu_workq_flags & UT_WORKQ_NEW;
 
        if (qos > WORKQ_THREAD_QOS_CLEANUP) {
-               workq_thread_reset_pri(wq, uth, NULL);
+               workq_thread_reset_pri(wq, uth, NULL, /*unpark*/ true);
                qos = WORKQ_THREAD_QOS_CLEANUP;
        }
 
@@ -910,8 +912,13 @@ workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq,
 
        workq_unlock(wq);
 
+       if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
+               __assert_only kern_return_t kr;
+               kr = thread_set_voucher_name(MACH_PORT_NULL);
+               assert(kr == KERN_SUCCESS);
+       }
+
        uint32_t flags = WQ_FLAG_THREAD_NEWSPI | qos | WQ_FLAG_THREAD_PRIO_QOS;
-       uint32_t setup_flags = WQ_SETUP_EXIT_THREAD;
        thread_t th = uth->uu_thread;
        vm_map_t vmap = get_task_map(p->task);
 
@@ -920,7 +927,7 @@ workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq,
        }
 
        pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
-           uth->uu_workq_thport, 0, setup_flags, flags);
+           uth->uu_workq_thport, 0, WQ_SETUP_EXIT_THREAD, flags);
        __builtin_unreachable();
 }
 
@@ -946,6 +953,10 @@ workq_turnstile_update_inheritor(struct workqueue *wq,
     turnstile_inheritor_t inheritor,
     turnstile_update_flags_t flags)
 {
+       if (wq->wq_inheritor == inheritor) {
+               return;
+       }
+       wq->wq_inheritor = inheritor;
        workq_perform_turnstile_operation_locked(wq, ^{
                turnstile_update_inheritor(wq->wq_turnstile, inheritor,
                flags | TURNSTILE_IMMEDIATE_UPDATE);
@@ -955,35 +966,44 @@ workq_turnstile_update_inheritor(struct workqueue *wq,
 }
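
The new `wq_inheritor` field caches the last inheritor handed to the turnstile, turning repeated identical updates into cheap early returns. The essence, reduced to plain C (types and the update call are stand-ins):

#include <stdio.h>

struct ts { void *inheritor_cache; };

static void expensive_update(void *inh) { printf("update -> %p\n", inh); }

static void set_inheritor(struct ts *t, void *inh)
{
	if (t->inheritor_cache == inh) {
		return;               /* unchanged: skip the turnstile roundtrip */
	}
	t->inheritor_cache = inh;
	expensive_update(inh);
}

int main(void)
{
	struct ts t = { 0 };
	int a;
	set_inheritor(&t, &a);    /* performs the update */
	set_inheritor(&t, &a);    /* cached: no-op */
	return 0;
}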
 
 static void
-workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth)
+workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth,
+    uint32_t setup_flags)
 {
        uint64_t now = mach_absolute_time();
+       bool is_creator = (uth == wq->wq_creator);
 
-       uth->uu_workq_flags &= ~UT_WORKQ_RUNNING;
        if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
                wq->wq_constrained_threads_scheduled--;
        }
+       uth->uu_workq_flags &= ~(UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT);
        TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
        wq->wq_threads_scheduled--;
 
-       if (wq->wq_creator == uth) {
+       if (is_creator) {
+               wq->wq_creator = NULL;
                WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 3, 0,
                    uth->uu_save.uus_workq_park_data.yields, 0);
-               wq->wq_creator = NULL;
+       }
+
+       if (wq->wq_inheritor == uth->uu_thread) {
+               assert(wq->wq_creator == NULL);
                if (wq->wq_reqcount) {
                        workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
                } else {
                        workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
                }
-               if (uth->uu_workq_flags & UT_WORKQ_NEW) {
-                       TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
-                       wq->wq_thidlecount++;
-                       return;
-               }
-       } else {
+       }
+
+       if (uth->uu_workq_flags & UT_WORKQ_NEW) {
+               assert(is_creator || (_wq_flags(wq) & WQ_EXITING));
+               TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
+               wq->wq_thidlecount++;
+               return;
+       }
+
+       if (!is_creator) {
                _wq_thactive_dec(wq, uth->uu_workq_pri.qos_bucket);
                wq->wq_thscheduled_count[_wq_bucket(uth->uu_workq_pri.qos_bucket)]--;
-               assert(!(uth->uu_workq_flags & UT_WORKQ_NEW));
                uth->uu_workq_flags |= UT_WORKQ_IDLE_CLEANUP;
        }
 
@@ -1014,7 +1034,7 @@ workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth)
                wq->wq_thdying_count++;
                uth->uu_workq_flags |= UT_WORKQ_DYING;
                uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
-               workq_unpark_for_death_and_unlock(p, wq, uth, 0);
+               workq_unpark_for_death_and_unlock(p, wq, uth, 0, setup_flags);
                __builtin_unreachable();
        }
 
@@ -1045,7 +1065,7 @@ workq_priority_for_req(workq_threadreq_t req)
 {
        thread_qos_t qos = req->tr_qos;
 
-       if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) {
+       if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
                workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
                assert(trp.trp_flags & TRP_PRIORITY);
                return trp.trp_pri;
@@ -1056,9 +1076,9 @@ workq_priority_for_req(workq_threadreq_t req)
 static inline struct priority_queue *
 workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req)
 {
-       if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) {
+       if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
                return &wq->wq_special_queue;
-       } else if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
+       } else if (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
                return &wq->wq_overcommit_queue;
        } else {
                return &wq->wq_constrained_queue;
@@ -1072,14 +1092,14 @@ workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req)
 static bool
 workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req)
 {
-       assert(req->tr_state == TR_STATE_NEW);
+       assert(req->tr_state == WORKQ_TR_STATE_NEW);
 
-       req->tr_state = TR_STATE_QUEUED;
+       req->tr_state = WORKQ_TR_STATE_QUEUED;
        wq->wq_reqcount += req->tr_count;
 
        if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
                assert(wq->wq_event_manager_threadreq == NULL);
-               assert(req->tr_flags & TR_FLAG_KEVENT);
+               assert(req->tr_flags & WORKQ_TR_FLAG_KEVENT);
                assert(req->tr_count == 1);
                wq->wq_event_manager_threadreq = req;
                return true;
@@ -1087,7 +1107,7 @@ workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req)
        if (priority_queue_insert(workq_priority_queue_for_req(wq, req),
            &req->tr_entry, workq_priority_for_req(req),
            PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
-               if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+               if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
                        _wq_thactive_refresh_best_constrained_req_qos(wq);
                }
                return true;
@@ -1113,7 +1133,7 @@ workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req)
                }
                if (priority_queue_remove(workq_priority_queue_for_req(wq, req),
                    &req->tr_entry, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
-                       if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+                       if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
                                _wq_thactive_refresh_best_constrained_req_qos(wq);
                        }
                        return true;
@@ -1125,113 +1145,14 @@ workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req)
 static void
 workq_threadreq_destroy(proc_t p, workq_threadreq_t req)
 {
-       req->tr_state = TR_STATE_IDLE;
-       if (req->tr_flags & (TR_FLAG_WORKLOOP | TR_FLAG_KEVENT)) {
+       req->tr_state = WORKQ_TR_STATE_CANCELED;
+       if (req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT)) {
                kqueue_threadreq_cancel(p, req);
        } else {
                zfree(workq_zone_threadreq, req);
        }
 }
 
-/*
- * Mark a thread request as complete.  At this point, it is treated as owned by
- * the submitting subsystem and you should assume it could be freed.
- *
- * Called with the workqueue lock held.
- */
-static void
-workq_threadreq_bind_and_unlock(proc_t p, struct workqueue *wq,
-    workq_threadreq_t req, struct uthread *uth)
-{
-       uint8_t tr_flags = req->tr_flags;
-       bool needs_commit = false;
-       int creator_flags = 0;
-
-       wq->wq_fulfilled++;
-
-       if (req->tr_state == TR_STATE_QUEUED) {
-               workq_threadreq_dequeue(wq, req);
-               creator_flags = WORKQ_THREADREQ_CAN_CREATE_THREADS;
-       }
-
-       if (wq->wq_creator == uth) {
-               WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0,
-                   uth->uu_save.uus_workq_park_data.yields, 0);
-               creator_flags = WORKQ_THREADREQ_CAN_CREATE_THREADS |
-                   WORKQ_THREADREQ_CREATOR_TRANSFER;
-               wq->wq_creator = NULL;
-               _wq_thactive_inc(wq, req->tr_qos);
-               wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++;
-       } else if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
-               _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
-       }
-       workq_thread_reset_pri(wq, uth, req);
-
-       if (tr_flags & TR_FLAG_OVERCOMMIT) {
-               if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
-                       uth->uu_workq_flags |= UT_WORKQ_OVERCOMMIT;
-                       wq->wq_constrained_threads_scheduled--;
-               }
-       } else {
-               if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0) {
-                       uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT;
-                       wq->wq_constrained_threads_scheduled++;
-               }
-       }
-
-       if (tr_flags & (TR_FLAG_KEVENT | TR_FLAG_WORKLOOP)) {
-               if (req->tr_state == TR_STATE_NEW) {
-                       /*
-                        * We're called from workq_kern_threadreq_initiate()
-                        * due to an unbind, with the kq req held.
-                        */
-                       assert(!creator_flags);
-                       req->tr_state = TR_STATE_IDLE;
-                       kqueue_threadreq_bind(p, req, uth->uu_thread, 0);
-               } else {
-                       assert(req->tr_count == 0);
-                       workq_perform_turnstile_operation_locked(wq, ^{
-                               kqueue_threadreq_bind_prepost(p, req, uth->uu_thread);
-                       });
-                       needs_commit = true;
-               }
-               req = NULL;
-       } else if (req->tr_count > 0) {
-               req = NULL;
-       }
-
-       if (creator_flags) {
-               /* This can drop the workqueue lock, and take it again */
-               workq_schedule_creator(p, wq, creator_flags);
-       }
-
-       workq_unlock(wq);
-
-       if (req) {
-               zfree(workq_zone_threadreq, req);
-       }
-       if (needs_commit) {
-               kqueue_threadreq_bind_commit(p, uth->uu_thread);
-       }
-
-       /*
-        * Run Thread, Run!
-        */
-       uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
-       if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
-               upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
-       } else if (tr_flags & TR_FLAG_OVERCOMMIT) {
-               upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
-       }
-       if (tr_flags & TR_FLAG_KEVENT) {
-               upcall_flags |= WQ_FLAG_THREAD_KEVENT;
-       }
-       if (tr_flags & TR_FLAG_WORKLOOP) {
-               upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
-       }
-       uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
-}
-
 #pragma mark workqueue thread creation thread calls
 
 static inline bool
@@ -1332,8 +1253,8 @@ workq_proc_resumed(struct proc *p)
                return;
        }
 
-       wq_flags = os_atomic_and_orig(&wq->wq_flags, ~(WQ_PROC_SUSPENDED |
-           WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED), relaxed);
+       wq_flags = os_atomic_andnot_orig(&wq->wq_flags, WQ_PROC_SUSPENDED |
+           WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED, relaxed);
        if ((wq_flags & WQ_EXITING) == 0) {
                disable_preemption();
                if (wq_flags & WQ_IMMEDIATE_CALL_PENDED) {
@@ -1352,7 +1273,7 @@ workq_proc_resumed(struct proc *p)
 static bool
 workq_thread_is_busy(uint64_t now, _Atomic uint64_t *lastblocked_tsp)
 {
-       uint64_t lastblocked_ts = os_atomic_load(lastblocked_tsp, relaxed);
+       uint64_t lastblocked_ts = os_atomic_load_wide(lastblocked_tsp, relaxed);
        if (now <= lastblocked_ts) {
                /*
                 * Because the update of the timestamp when a thread blocks
@@ -1392,7 +1313,7 @@ workq_add_new_threads_call(void *_p, void *flags)
        workq_lock_spin(wq);
 
        wq->wq_thread_call_last_run = mach_absolute_time();
-       os_atomic_and(&wq->wq_flags, ~my_flag, release);
+       os_atomic_andnot(&wq->wq_flags, my_flag, release);
 
        /* This can drop the workqueue lock, and take it again */
        workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
@@ -1434,7 +1355,7 @@ workq_sched_callback(int type, thread_t thread)
                 * get scheduled and then block after we start down this path), it's
                 * not a problem.  Either timestamp is adequate, so no need to retry
                 */
-               os_atomic_store(&wq->wq_lastblocked_ts[_wq_bucket(qos)],
+               os_atomic_store_wide(&wq->wq_lastblocked_ts[_wq_bucket(qos)],
                    thread_last_run_time(thread), relaxed);
 
                if (req_qos == THREAD_QOS_UNSPECIFIED) {
@@ -1506,12 +1427,17 @@ workq_reference(struct workqueue *wq)
        os_ref_retain(&wq->wq_refcnt);
 }
 
-void
-workq_destroy(struct workqueue *wq)
+static void
+workq_deallocate_queue_invoke(mpsc_queue_chain_t e,
+    __assert_only mpsc_daemon_queue_t dq)
 {
+       struct workqueue *wq;
        struct turnstile *ts;
 
-       turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts);
+       wq = mpsc_queue_element(e, struct workqueue, wq_destroy_link);
+       assert(dq == &workq_deallocate_queue);
+
+       turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts, TURNSTILE_WORKQS);
        assert(ts);
        turnstile_cleanup();
        turnstile_deallocate(ts);
@@ -1524,7 +1450,8 @@ static void
 workq_deallocate(struct workqueue *wq)
 {
        if (os_ref_release_relaxed(&wq->wq_refcnt) == 0) {
-               workq_destroy(wq);
+               workq_deallocate_queue_invoke(&wq->wq_destroy_link,
+                   &workq_deallocate_queue);
        }
 }
 
@@ -1532,7 +1459,8 @@ void
 workq_deallocate_safe(struct workqueue *wq)
 {
        if (__improbable(os_ref_release_relaxed(&wq->wq_refcnt) == 0)) {
-               workq_deallocate_enqueue(wq);
+               mpsc_daemon_enqueue(&workq_deallocate_queue, &wq->wq_destroy_link,
+                   MPSC_QUEUE_DISABLE_PREEMPTION);
        }
 }
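
The split above yields two release paths: `workq_deallocate` may destroy inline, while `workq_deallocate_safe` defers destruction to the `workq_deallocate_queue` daemon because its callers may not block. A minimal sketch of the same shape with a C11 atomic refcount (the enqueue below is a stand-in, not the mpsc API):

#include <stdatomic.h>
#include <stdio.h>

struct obj { atomic_int refcnt; };

static void destroy(struct obj *o) { printf("destroy %p\n", (void *)o); }
static void enqueue_for_destroy(struct obj *o) { destroy(o); /* stand-in */ }

static void release(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1) {
		destroy(o);               /* safe: caller may block */
	}
}

static void release_safe(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1) {
		enqueue_for_destroy(o);   /* defer: caller cannot block */
	}
}

int main(void)
{
	struct obj a = { 2 };
	release(&a);        /* 2 -> 1, no destroy */
	release_safe(&a);   /* 1 -> 0, deferred destroy */
	return 0;
}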
 
@@ -1677,7 +1605,8 @@ workq_mark_exiting(struct proc *p)
        mgr_req = wq->wq_event_manager_threadreq;
        wq->wq_event_manager_threadreq = NULL;
        wq->wq_reqcount = 0; /* workq_schedule_creator must not look at queues */
-       workq_turnstile_update_inheritor(wq, NULL, 0);
+       wq->wq_creator = NULL;
+       workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
 
        workq_unlock(wq);
 
@@ -1809,18 +1738,18 @@ bsdthread_set_self(proc_t p, thread_t th, pthread_priority_t priority,
                        goto qos;
                }
 
-               struct kqrequest *kqr = uth->uu_kqr_bound;
+               workq_threadreq_t kqr = uth->uu_kqr_bound;
                if (kqr == NULL) {
                        unbind_rv = EALREADY;
                        goto qos;
                }
 
-               if (kqr->kqr_state & KQR_WORKLOOP) {
+               if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
                        unbind_rv = EINVAL;
                        goto qos;
                }
 
-               kqueue_threadreq_unbind(p, uth->uu_kqr_bound);
+               kqueue_threadreq_unbind(p, kqr);
        }
 
 qos:
@@ -1840,9 +1769,10 @@ qos:
                                qos_rv = EPERM;
                                goto voucher;
                        }
-               } else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
+               } else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER ||
+                   uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_ABOVEUI) {
                        /*
-                        * Workqueue manager threads can't change QoS
+                        * Workqueue manager threads or threads above UI can't change QoS
                         */
                        qos_rv = EINVAL;
                        goto voucher;
@@ -1960,7 +1890,8 @@ bsdthread_add_explicit_override(proc_t p, mach_port_name_t kport,
                return EINVAL;
        }
 
-       thread_t th = port_name_to_thread(kport);
+       thread_t th = port_name_to_thread(kport,
+           PORT_TO_THREAD_IN_CURRENT_TASK);
        if (th == THREAD_NULL) {
                return ESRCH;
        }
@@ -1976,7 +1907,8 @@ static int
 bsdthread_remove_explicit_override(proc_t p, mach_port_name_t kport,
     user_addr_t resource)
 {
-       thread_t th = port_name_to_thread(kport);
+       thread_t th = port_name_to_thread(kport,
+           PORT_TO_THREAD_IN_CURRENT_TASK);
        if (th == THREAD_NULL) {
                return ESRCH;
        }
@@ -2000,7 +1932,8 @@ workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport,
                return EINVAL;
        }
 
-       thread_t thread = port_name_to_thread(kport);
+       thread_t thread = port_name_to_thread(kport,
+           PORT_TO_THREAD_IN_CURRENT_TASK);
        if (thread == THREAD_NULL) {
                return ESRCH;
        }
@@ -2017,16 +1950,16 @@ workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport,
        thread_mtx_lock(thread);
 
        if (ulock_addr) {
-               uint64_t val;
+               uint32_t val;
                int rc;
                /*
                 * Workaround lack of explicit support for 'no-fault copyin'
                 * <rdar://problem/24999882>, as disabling preemption prevents paging in
                 */
                disable_preemption();
-               rc = copyin_word(ulock_addr, &val, sizeof(kport));
+               rc = copyin_atomic32(ulock_addr, &val);
                enable_preemption();
-               if (rc == 0 && ulock_owner_value_to_port_name((uint32_t)val) != kport) {
+               if (rc == 0 && ulock_owner_value_to_port_name(val) != kport) {
                        goto out;
                }
        }
@@ -2076,6 +2009,23 @@ workq_thread_reset_dispatch_override(proc_t p, thread_t thread)
        return 0;
 }
 
+static int
+workq_thread_allow_kill(__unused proc_t p, thread_t thread, bool enable)
+{
+       if (!(thread_get_tag(thread) & THREAD_TAG_WORKQUEUE)) {
+               // If the thread isn't a workqueue thread, don't set the
+               // kill_allowed bit; however, we still need to return 0
+               // instead of an error code since this code is executed
+               // on the abort path which needs to not depend on the
+               // pthread_t (returning an error depends on pthread_t via
+               // cerror_nocancel)
+               return 0;
+       }
+       struct uthread *uth = get_bsdthread_info(thread);
+       uth->uu_workq_pthread_kill_allowed = enable;
+       return 0;
+}
+
 static int
 bsdthread_get_max_parallelism(thread_qos_t qos, unsigned long flags,
     int *retval)
@@ -2131,6 +2081,10 @@ bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval)
                ENSURE_UNUSED(uap->arg3);
                return bsdthread_get_max_parallelism((thread_qos_t)uap->arg1,
                           (unsigned long)uap->arg2, retval);
+       case BSDTHREAD_CTL_WORKQ_ALLOW_KILL:
+               ENSURE_UNUSED(uap->arg2);
+               ENSURE_UNUSED(uap->arg3);
+               return workq_thread_allow_kill(p, current_thread(), (bool)uap->arg1);
 
        case BSDTHREAD_CTL_SET_QOS:
        case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
@@ -2145,9 +2099,13 @@ bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval)
 
 #pragma mark workqueue thread manipulation
 
+static void __dead2
+workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
+    struct uthread *uth, uint32_t setup_flags);
+
 static void __dead2
 workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
-    struct uthread *uth);
+    struct uthread *uth, uint32_t setup_flags);
 
 static void workq_setup_and_run(proc_t p, struct uthread *uth, int flags) __dead2;
 
@@ -2156,8 +2114,8 @@ static inline uint64_t
 workq_trace_req_id(workq_threadreq_t req)
 {
        struct kqworkloop *kqwl;
-       if (req->tr_flags & TR_FLAG_WORKLOOP) {
-               kqwl = __container_of(req, struct kqworkloop, kqwl_request.kqr_req);
+       if (req->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
+               kqwl = __container_of(req, struct kqworkloop, kqwl_request);
                return kqwl->kqwl_dynamicid;
        }
 
@@ -2185,12 +2143,12 @@ workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp)
 
        workq_threadreq_t req = zalloc(workq_zone_threadreq);
        priority_queue_entry_init(&req->tr_entry);
-       req->tr_state = TR_STATE_NEW;
+       req->tr_state = WORKQ_TR_STATE_NEW;
        req->tr_flags = 0;
        req->tr_qos   = qos;
 
        if (pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) {
-               req->tr_flags |= TR_FLAG_OVERCOMMIT;
+               req->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
                upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
        }
 
@@ -2213,7 +2171,7 @@ workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp)
                 * If there aren't enough threads, add one, but re-evaluate everything
                 * as conditions may now have changed.
                 */
-               if (reqcount > 1 && (req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+               if (reqcount > 1 && (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
                        unpaced = workq_constrained_allowance(wq, qos, NULL, false);
                        if (unpaced >= reqcount - 1) {
                                unpaced = reqcount - 1;
@@ -2226,27 +2184,32 @@ workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp)
                 * This path does not currently handle custom workloop parameters
                 * when creating threads for parallelism.
                 */
-               assert(!(req->tr_flags & TR_FLAG_WL_PARAMS));
+               assert(!(req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS));
 
                /*
                 * This is a trimmed down version of workq_threadreq_bind_and_unlock()
                 */
                while (unpaced > 0 && wq->wq_thidlecount) {
-                       struct uthread *uth = workq_pop_idle_thread(wq);
+                       struct uthread *uth;
+                       bool needs_wakeup;
+                       uint8_t uu_flags = UT_WORKQ_EARLY_BOUND;
+
+                       if (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
+                               uu_flags |= UT_WORKQ_OVERCOMMIT;
+                       }
+
+                       uth = workq_pop_idle_thread(wq, uu_flags, &needs_wakeup);
 
                        _wq_thactive_inc(wq, qos);
                        wq->wq_thscheduled_count[_wq_bucket(qos)]++;
-                       workq_thread_reset_pri(wq, uth, req);
+                       workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
                        wq->wq_fulfilled++;
 
-                       uth->uu_workq_flags |= UT_WORKQ_EARLY_BOUND;
-                       if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
-                               uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT;
-                               wq->wq_constrained_threads_scheduled++;
-                       }
                        uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
                        uth->uu_save.uus_workq_park_data.thread_request = req;
-                       workq_thread_wakeup(uth);
+                       if (needs_wakeup) {
+                               workq_thread_wakeup(uth);
+                       }
                        unpaced--;
                        reqcount--;
                }
@@ -2272,41 +2235,27 @@ exiting:
 }
 
 bool
-workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr,
-    struct turnstile *workloop_ts, thread_qos_t qos, int flags)
+workq_kern_threadreq_initiate(struct proc *p, workq_threadreq_t req,
+    struct turnstile *workloop_ts, thread_qos_t qos,
+    workq_kern_threadreq_flags_t flags)
 {
        struct workqueue *wq = proc_get_wqptr_fast(p);
-       workq_threadreq_t req = &kqr->kqr_req;
        struct uthread *uth = NULL;
-       uint8_t tr_flags = 0;
 
-       if (kqr->kqr_state & KQR_WORKLOOP) {
-               tr_flags = TR_FLAG_WORKLOOP;
+       assert(req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT));
 
+       if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
                workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
-               if (trp.trp_flags & TRP_PRIORITY) {
-                       tr_flags |= TR_FLAG_WL_OUTSIDE_QOS;
-                       qos = thread_workq_qos_for_pri(trp.trp_pri);
-                       if (qos == THREAD_QOS_UNSPECIFIED) {
-                               qos = WORKQ_THREAD_QOS_ABOVEUI;
-                       }
-               }
-               if (trp.trp_flags) {
-                       tr_flags |= TR_FLAG_WL_PARAMS;
+               qos = thread_workq_qos_for_pri(trp.trp_pri);
+               if (qos == THREAD_QOS_UNSPECIFIED) {
+                       qos = WORKQ_THREAD_QOS_ABOVEUI;
                }
-       } else {
-               tr_flags = TR_FLAG_KEVENT;
-       }
-       if (qos != WORKQ_THREAD_QOS_MANAGER &&
-           (kqr->kqr_state & KQR_THOVERCOMMIT)) {
-               tr_flags |= TR_FLAG_OVERCOMMIT;
        }
 
-       assert(req->tr_state == TR_STATE_IDLE);
+       assert(req->tr_state == WORKQ_TR_STATE_IDLE);
        priority_queue_entry_init(&req->tr_entry);
        req->tr_count = 1;
-       req->tr_state = TR_STATE_NEW;
-       req->tr_flags = tr_flags;
+       req->tr_state = WORKQ_TR_STATE_NEW;
        req->tr_qos   = qos;
 
        WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE, wq,
@@ -2324,13 +2273,25 @@ workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr,
 
        workq_lock_spin(wq);
        if (_wq_exiting(wq)) {
+               req->tr_state = WORKQ_TR_STATE_IDLE;
                workq_unlock(wq);
                return false;
        }
 
        if (uth && workq_threadreq_admissible(wq, uth, req)) {
                assert(uth != wq->wq_creator);
-               workq_threadreq_bind_and_unlock(p, wq, req, uth);
+               if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
+                       _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
+                       workq_thread_reset_pri(wq, uth, req, /*unpark*/ false);
+               }
+               /*
+                * We're called from workq_kern_threadreq_initiate()
+                * due to an unbind, with the kq req held.
+                */
+               WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
+                   workq_trace_req_id(req), 0, 0, 0);
+               wq->wq_fulfilled++;
+               kqueue_threadreq_bind(p, req, uth->uu_thread, 0);
        } else {
                if (workloop_ts) {
                        workq_perform_turnstile_operation_locked(wq, ^{
@@ -2343,21 +2304,21 @@ workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr,
                if (workq_threadreq_enqueue(wq, req)) {
                        workq_schedule_creator(p, wq, flags);
                }
-               workq_unlock(wq);
        }
 
+       workq_unlock(wq);
+
        return true;
 }
 
 void
-workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr,
-    thread_qos_t qos, int flags)
+workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t req,
+    thread_qos_t qos, workq_kern_threadreq_flags_t flags)
 {
        struct workqueue *wq = proc_get_wqptr_fast(p);
-       workq_threadreq_t req = &kqr->kqr_req;
-       bool change_overcommit = false;
+       bool make_overcommit = false;
 
-       if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) {
+       if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
                /* Requests outside-of-QoS shouldn't accept modify operations */
                return;
        }
@@ -2365,24 +2326,25 @@ workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr,
        workq_lock_spin(wq);
 
        assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
-       assert(req->tr_flags & (TR_FLAG_KEVENT | TR_FLAG_WORKLOOP));
+       assert(req->tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP));
 
-       if (req->tr_state == TR_STATE_BINDING) {
-               kqueue_threadreq_bind(p, req, req->tr_binding_thread, 0);
+       if (req->tr_state == WORKQ_TR_STATE_BINDING) {
+               kqueue_threadreq_bind(p, req, req->tr_thread, 0);
                workq_unlock(wq);
                return;
        }
 
-       change_overcommit = (bool)(kqr->kqr_state & KQR_THOVERCOMMIT) !=
-           (bool)(req->tr_flags & TR_FLAG_OVERCOMMIT);
+       if (flags & WORKQ_THREADREQ_MAKE_OVERCOMMIT) {
+               make_overcommit = (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0;
+       }
 
-       if (_wq_exiting(wq) || (req->tr_qos == qos && !change_overcommit)) {
+       if (_wq_exiting(wq) || (req->tr_qos == qos && !make_overcommit)) {
                workq_unlock(wq);
                return;
        }
 
        assert(req->tr_count == 1);
-       if (req->tr_state != TR_STATE_QUEUED) {
+       if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
                panic("Invalid thread request (%p) state %d", req, req->tr_state);
        }
 
@@ -2400,7 +2362,7 @@ workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr,
         */
        if (priority_queue_remove(pq, &req->tr_entry,
            PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
-               if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+               if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
                        _wq_thactive_refresh_best_constrained_req_qos(wq);
                }
        }
@@ -2411,8 +2373,8 @@ workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr,
         * If the item will not become the root of the priority queue it belongs to,
         * then we need to wait in line, just enqueue and return quickly.
         */
-       if (__improbable(change_overcommit)) {
-               req->tr_flags ^= TR_FLAG_OVERCOMMIT;
+       if (__improbable(make_overcommit)) {
+               req->tr_flags ^= WORKQ_TR_FLAG_OVERCOMMIT;
                pq = workq_priority_queue_for_req(wq, req);
        }
        req->tr_qos = qos;
@@ -2430,11 +2392,11 @@ workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr,
         *
         * Pretend the thread request is new again:
         * - adjust wq_reqcount to not count it anymore.
-        * - make its state TR_STATE_NEW (so that workq_threadreq_bind_and_unlock
+        * - make its state WORKQ_TR_STATE_NEW (so that workq_threadreq_bind_and_unlock
         *   properly attempts a synchronous bind)
         */
        wq->wq_reqcount--;
-       req->tr_state = TR_STATE_NEW;
+       req->tr_state = WORKQ_TR_STATE_NEW;
        if (workq_threadreq_enqueue(wq, req)) {
                workq_schedule_creator(p, wq, flags);
        }
@@ -2454,20 +2416,19 @@ workq_kern_threadreq_unlock(struct proc *p)
 }
 
 void
-workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr,
+workq_kern_threadreq_update_inheritor(struct proc *p, workq_threadreq_t req,
     thread_t owner, struct turnstile *wl_ts,
     turnstile_update_flags_t flags)
 {
        struct workqueue *wq = proc_get_wqptr_fast(p);
-       workq_threadreq_t req = &kqr->kqr_req;
        turnstile_inheritor_t inheritor;
 
        assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
-       assert(req->tr_flags & TR_FLAG_WORKLOOP);
+       assert(req->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
        workq_lock_held(wq);
 
-       if (req->tr_state == TR_STATE_BINDING) {
-               kqueue_threadreq_bind(p, req, req->tr_binding_thread,
+       if (req->tr_state == WORKQ_TR_STATE_BINDING) {
+               kqueue_threadreq_bind(p, req, req->tr_thread,
                    KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE);
                return;
        }
@@ -2475,7 +2436,7 @@ workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr,
        if (_wq_exiting(wq)) {
                inheritor = TURNSTILE_INHERITOR_NULL;
        } else {
-               if (req->tr_state != TR_STATE_QUEUED) {
+               if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
                        panic("Invalid thread request (%p) state %d", req, req->tr_state);
                }
 
@@ -2494,7 +2455,7 @@ workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr,
 }
 
 void
-workq_kern_threadreq_redrive(struct proc *p, int flags)
+workq_kern_threadreq_redrive(struct proc *p, workq_kern_threadreq_flags_t flags)
 {
        struct workqueue *wq = proc_get_wqptr_fast(p);
 
@@ -2506,12 +2467,10 @@ workq_kern_threadreq_redrive(struct proc *p, int flags)
 void
 workq_schedule_creator_turnstile_redrive(struct workqueue *wq, bool locked)
 {
-       if (!locked) {
-               workq_lock_spin(wq);
-       }
-       workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_CREATOR_SYNC_UPDATE);
-       if (!locked) {
-               workq_unlock(wq);
+       if (locked) {
+               workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_NONE);
+       } else {
+               workq_schedule_immediate_thread_creation(wq);
        }
 }
 
@@ -2521,7 +2480,7 @@ workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap,
 {
        thread_t th = current_thread();
        struct uthread *uth = get_bsdthread_info(th);
-       struct kqrequest *kqr = uth->uu_kqr_bound;
+       workq_threadreq_t kqr = uth->uu_kqr_bound;
        workq_threadreq_param_t trp = { };
        int nevents = uap->affinity, error;
        user_addr_t eventlist = uap->item;
@@ -2542,17 +2501,26 @@ workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap,
                proc_unlock(p);
        }
 
-       if (kqr && kqr->kqr_req.tr_flags & TR_FLAG_WL_PARAMS) {
+       if (kqr && kqr->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
                /*
                 * Ensure we store the threadreq param before unbinding
                 * the kqr from this thread.
                 */
-               trp = kqueue_threadreq_workloop_param(&kqr->kqr_req);
+               trp = kqueue_threadreq_workloop_param(kqr);
        }
 
+       /*
+        * Freeze the base pri while we decide the fate of this thread.
+        *
+        * Either:
+        * - we return to user and kevent_cleanup will have unfrozen the base pri,
+        * - or we proceed to workq_select_threadreq_or_park_and_unlock() who will.
+        */
+       thread_freeze_base_pri(th);
+
        if (kqr) {
                uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI | WQ_FLAG_THREAD_REUSE;
-               if (kqr->kqr_state & KQR_WORKLOOP) {
+               if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
                        upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
                } else {
                        upcall_flags |= WQ_FLAG_THREAD_KEVENT;
@@ -2575,6 +2543,7 @@ workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap,
                    get_task_map(p->task), uth->uu_workq_stackaddr,
                    uth->uu_workq_thport, eventlist, nevents, upcall_flags);
                if (error) {
+                       assert(uth->uu_kqr_bound == kqr);
                        return error;
                }
 
@@ -2597,7 +2566,8 @@ workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap,
        workq_lock_spin(wq);
        WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
        uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
-       workq_select_threadreq_or_park_and_unlock(p, wq, uth);
+       workq_select_threadreq_or_park_and_unlock(p, wq, uth,
+           WQ_SETUP_CLEAR_VOUCHER);
        __builtin_unreachable();
 }
 
@@ -2714,6 +2684,35 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *ret
                *retval = should_narrow;
                break;
        }
+       case WQOPS_SETUP_DISPATCH: {
+               /*
+                * item = pointer to workq_dispatch_config structure
+                * arg2 = sizeof(item)
+                */
+               struct workq_dispatch_config cfg;
+               bzero(&cfg, sizeof(cfg));
+
+               error = copyin(uap->item, &cfg, MIN(sizeof(cfg), (unsigned long) arg2));
+               if (error) {
+                       break;
+               }
+
+               if (cfg.wdc_flags & ~WORKQ_DISPATCH_SUPPORTED_FLAGS ||
+                   cfg.wdc_version < WORKQ_DISPATCH_MIN_SUPPORTED_VERSION) {
+                       error = ENOTSUP;
+                       break;
+               }
+
+               /* Load fields from version 1 */
+               p->p_dispatchqueue_serialno_offset = cfg.wdc_queue_serialno_offs;
+
+               /* Load fields from version 2 */
+               if (cfg.wdc_version >= 2) {
+                       p->p_dispatchqueue_label_offset = cfg.wdc_queue_label_offs;
+               }
+
+               break;
+       }
        default:
                error = EINVAL;
                break;
@@ -2729,15 +2728,17 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *ret
  */
 __attribute__((noreturn, noinline))
 static void
-workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth)
+workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth,
+    uint32_t setup_flags)
 {
        assert(uth == current_uthread());
        assert(uth->uu_kqr_bound == NULL);
-       workq_push_idle_thread(p, wq, uth); // may not return
+       workq_push_idle_thread(p, wq, uth, setup_flags); // may not return
 
        workq_thread_reset_cpupercent(NULL, uth);
 
-       if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) {
+       if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) &&
+           !(uth->uu_workq_flags & UT_WORKQ_DYING)) {
                workq_unlock(wq);
 
                /*
@@ -2762,6 +2763,7 @@ workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth)
 
                workq_lock_spin(wq);
                uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
+               setup_flags &= ~WQ_SETUP_CLEAR_VOUCHER;
        }
 
        if (uth->uu_workq_flags & UT_WORKQ_RUNNING) {
@@ -2772,13 +2774,13 @@ workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth)
                 * we just run the continuation ourselves.
                 */
                WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
-               workq_select_threadreq_or_park_and_unlock(p, wq, uth);
+               workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
                __builtin_unreachable();
        }
 
        if (uth->uu_workq_flags & UT_WORKQ_DYING) {
                workq_unpark_for_death_and_unlock(p, wq, uth,
-                   WORKQ_UNPARK_FOR_DEATH_WAS_IDLE);
+                   WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, setup_flags);
                __builtin_unreachable();
        }
 
@@ -2883,7 +2885,7 @@ workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
        if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
                return workq_may_start_event_mgr_thread(wq, uth);
        }
-       if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+       if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
                return workq_constrained_allowance(wq, req->tr_qos, uth, true);
        }
        return true;
@@ -2990,8 +2992,8 @@ workq_threadreq_select(struct workqueue *wq, struct uthread *uth)
            &proprietor);
        if (pri) {
                struct kqworkloop *kqwl = (struct kqworkloop *)proprietor;
-               req_pri = &kqwl->kqwl_request.kqr_req;
-               if (req_pri->tr_state != TR_STATE_QUEUED) {
+               req_pri = &kqwl->kqwl_request;
+               if (req_pri->tr_state != WORKQ_TR_STATE_QUEUED) {
                        panic("Invalid thread request (%p) state %d",
                            req_pri, req_pri->tr_state);
                }
@@ -3063,10 +3065,12 @@ workq_threadreq_select(struct workqueue *wq, struct uthread *uth)
  * efficient scheduling and reduced context switches.
  */
 static void
-workq_schedule_creator(proc_t p, struct workqueue *wq, int flags)
+workq_schedule_creator(proc_t p, struct workqueue *wq,
+    workq_kern_threadreq_flags_t flags)
 {
        workq_threadreq_t req;
        struct uthread *uth;
+       bool needs_wakeup;
 
        workq_lock_held(wq);
        assert(p || (flags & WORKQ_THREADREQ_CAN_CREATE_THREADS) == 0);
@@ -3075,6 +3079,14 @@ again:
        uth = wq->wq_creator;
 
        if (!wq->wq_reqcount) {
+               /*
+                * There is no thread request left.
+                *
+                * If there is a creator, leave everything in place, so that it cleans
+                * up itself in workq_push_idle_thread().
+                *
+                * Else, make sure the turnstile state is reset to no inheritor.
+                */
                if (uth == NULL) {
                        workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
                }
@@ -3083,13 +3095,16 @@ again:
 
        req = workq_threadreq_select_for_creator(wq);
        if (req == NULL) {
-               if (flags & WORKQ_THREADREQ_CREATOR_SYNC_UPDATE) {
-                       assert((flags & WORKQ_THREADREQ_CREATOR_TRANSFER) == 0);
-                       /*
-                        * turnstile propagation code is reaching out to us,
-                        * and we still don't want to do anything, do not recurse.
-                        */
-               } else {
+               /*
+                * There isn't a thread request that passes the admission check.
+                *
+                * If there is a creator, do not touch anything, the creator will sort
+                * it out when it runs.
+                *
+                * Else, set the inheritor to "WORKQ" so that the turnstile propagation
+                * code calls us if anything changes.
+                */
+               if (uth == NULL) {
                        workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
                }
                return;
@@ -3102,15 +3117,17 @@ again:
                if (workq_thread_needs_priority_change(req, uth)) {
                        WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
                            wq, 1, thread_tid(uth->uu_thread), req->tr_qos, 0);
-                       workq_thread_reset_pri(wq, uth, req);
+                       workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
                }
+               assert(wq->wq_inheritor == uth->uu_thread);
        } else if (wq->wq_thidlecount) {
                /*
                 * We need to unpark a creator thread
                 */
-               wq->wq_creator = uth = workq_pop_idle_thread(wq);
+               wq->wq_creator = uth = workq_pop_idle_thread(wq, UT_WORKQ_OVERCOMMIT,
+                   &needs_wakeup);
                if (workq_thread_needs_priority_change(req, uth)) {
-                       workq_thread_reset_pri(wq, uth, req);
+                       workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
                }
                workq_turnstile_update_inheritor(wq, uth->uu_thread,
                    TURNSTILE_INHERITOR_THREAD);
@@ -3118,13 +3135,16 @@ again:
                    wq, 2, thread_tid(uth->uu_thread), req->tr_qos, 0);
                uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
                uth->uu_save.uus_workq_park_data.yields = 0;
-               workq_thread_wakeup(uth);
+               if (needs_wakeup) {
+                       workq_thread_wakeup(uth);
+               }
        } else {
                /*
                 * We need to allocate a thread...
                 */
                if (__improbable(wq->wq_nthreads >= wq_max_threads)) {
                        /* out of threads, just go away */
+                       flags = WORKQ_THREADREQ_NONE;
                } else if (flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) {
                        act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
                } else if (!(flags & WORKQ_THREADREQ_CAN_CREATE_THREADS)) {
@@ -3136,16 +3156,173 @@ again:
                        workq_schedule_delayed_thread_creation(wq, 0);
                }
 
-               if (flags & WORKQ_THREADREQ_CREATOR_TRANSFER) {
-                       /*
-                        * workq_schedule_creator() failed at creating a thread,
-                        * and the responsibility of redriving is now with a thread-call.
-                        *
-                        * We still need to tell the turnstile the previous creator is gone.
-                        */
-                       workq_turnstile_update_inheritor(wq, NULL, 0);
+               /*
+                * If the current thread is the inheritor:
+                *
+                * If we set the AST, then the thread will stay the inheritor until
+                * either the AST calls workq_kern_threadreq_redrive(), or it parks
+                * and calls workq_push_idle_thread().
+                *
+                * Else, the responsibility of the thread creation is with a thread-call
+                * and we need to clear the inheritor.
+                */
+               if ((flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) == 0 &&
+                   wq->wq_inheritor == current_thread()) {
+                       workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
+               }
+       }
+}
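The rewritten tail of workq_schedule_creator() reduces the turnstile bookkeeping to a small decision table. A hedged sketch of that table, condensing the main branches into one hypothetical function (the thread-creation-failure path, which can also clear the inheritor, is omitted, and the enum stands in for the real turnstile_inheritor_t values):

    enum inheritor_kind { INH_UNCHANGED, INH_NONE, INH_WORKQ, INH_CREATOR };

    /* Illustrative only: not kernel code. */
    static enum inheritor_kind
    pick_inheritor(int have_requests, int have_admissible_req, int have_creator)
    {
        if (!have_requests) {
            /* No work left: an existing creator cleans up after itself in
             * workq_push_idle_thread(); otherwise clear the inheritor. */
            return have_creator ? INH_UNCHANGED : INH_NONE;
        }
        if (!have_admissible_req) {
            /* Work exists but fails admission: point the inheritor at the
             * workqueue so turnstile propagation calls back on any change. */
            return have_creator ? INH_UNCHANGED : INH_WORKQ;
        }
        /* An admissible request: a (possibly newly unparked) creator
         * thread becomes, or remains, the inheritor. */
        return INH_CREATOR;
    }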
+
+/**
+ * Same as workq_unpark_select_threadreq_or_park_and_unlock,
+ * but do not allow early binds.
+ *
+ * Called with the base pri frozen, will unfreeze it.
+ */
+__attribute__((noreturn, noinline))
+static void
+workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
+    struct uthread *uth, uint32_t setup_flags)
+{
+       workq_threadreq_t req = NULL;
+       bool is_creator = (wq->wq_creator == uth);
+       bool schedule_creator = false;
+
+       if (__improbable(_wq_exiting(wq))) {
+               WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
+               goto park;
+       }
+
+       if (wq->wq_reqcount == 0) {
+               WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0, 0);
+               goto park;
+       }
+
+       req = workq_threadreq_select(wq, uth);
+       if (__improbable(req == NULL)) {
+               WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0, 0);
+               goto park;
+       }
+
+       uint8_t tr_flags = req->tr_flags;
+       struct turnstile *req_ts = kqueue_threadreq_get_turnstile(req);
+
+       /*
+        * Attempt to setup ourselves as the new thing to run, moving all priority
+        * pushes to ourselves.
+        *
+        * If the current thread is the creator, then the fact that we are presently
+        * running is proof that we'll do something useful, so keep going.
+        *
+        * For other cases, peek at the AST to know whether the scheduler wants
+        * to preempt us, if yes, park instead, and move the thread request
+        * turnstile back to the workqueue.
+        */
+       if (req_ts) {
+               workq_perform_turnstile_operation_locked(wq, ^{
+                       turnstile_update_inheritor(req_ts, uth->uu_thread,
+                       TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
+                       turnstile_update_inheritor_complete(req_ts,
+                       TURNSTILE_INTERLOCK_HELD);
+               });
+       }
+
+       if (is_creator) {
+               WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0,
+                   uth->uu_save.uus_workq_park_data.yields, 0);
+               wq->wq_creator = NULL;
+               _wq_thactive_inc(wq, req->tr_qos);
+               wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++;
+       } else if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
+               _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
+       }
+
+       workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
+
+       if (__improbable(thread_unfreeze_base_pri(uth->uu_thread) && !is_creator)) {
+               if (req_ts) {
+                       workq_perform_turnstile_operation_locked(wq, ^{
+                               turnstile_update_inheritor(req_ts, wq->wq_turnstile,
+                               TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
+                               turnstile_update_inheritor_complete(req_ts,
+                               TURNSTILE_INTERLOCK_HELD);
+                       });
                }
+               WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 3, 0, 0, 0);
+               goto park_thawed;
+       }
+
+       /*
+        * We passed all checks, dequeue the request, bind to it, and set it up
+        * to return to user.
+        */
+       WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
+           workq_trace_req_id(req), 0, 0, 0);
+       wq->wq_fulfilled++;
+       schedule_creator = workq_threadreq_dequeue(wq, req);
+
+       if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
+               kqueue_threadreq_bind_prepost(p, req, uth);
+               req = NULL;
+       } else if (req->tr_count > 0) {
+               req = NULL;
+       }
+
+       workq_thread_reset_cpupercent(req, uth);
+       if (uth->uu_workq_flags & UT_WORKQ_NEW) {
+               uth->uu_workq_flags ^= UT_WORKQ_NEW;
+               setup_flags |= WQ_SETUP_FIRST_USE;
+       }
+       if (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
+               if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
+                       uth->uu_workq_flags |= UT_WORKQ_OVERCOMMIT;
+                       wq->wq_constrained_threads_scheduled--;
+               }
+       } else {
+               if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0) {
+                       uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT;
+                       wq->wq_constrained_threads_scheduled++;
+               }
+       }
+
+       if (is_creator || schedule_creator) {
+               /* This can drop the workqueue lock, and take it again */
+               workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
+       }
+
+       workq_unlock(wq);
+
+       if (req) {
+               zfree(workq_zone_threadreq, req);
+       }
+
+       /*
+        * Run Thread, Run!
+        */
+       uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
+       if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
+               upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
+       } else if (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
+               upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
        }
+       if (tr_flags & WORKQ_TR_FLAG_KEVENT) {
+               upcall_flags |= WQ_FLAG_THREAD_KEVENT;
+       }
+       if (tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
+               upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
+       }
+       uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
+
+       if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
+               kqueue_threadreq_bind_commit(p, uth->uu_thread);
+       }
+       workq_setup_and_run(p, uth, setup_flags);
+       __builtin_unreachable();
+
+park:
+       thread_unfreeze_base_pri(uth->uu_thread);
+park_thawed:
+       workq_park_and_unlock(p, wq, uth, setup_flags);
 }
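workq_thread_return() freezes the base priority before the selection above runs, and every exit path must thaw it exactly once: the bind path defers the thaw to return-to-user cleanup, the park label thaws before parking, and park_thawed relies on an earlier thaw. A toy model of that single-thaw invariant; the freeze/unfreeze names echo the real calls, but the global flag and immediate thaw are illustrative simplifications:

    #include <assert.h>
    #include <stdbool.h>

    /* Toy model: one global flag stands in for per-thread frozen state. */
    static bool base_pri_frozen;

    static void
    model_freeze_base_pri(void)
    {
        assert(!base_pri_frozen); /* double-freeze would be a bug */
        base_pri_frozen = true;
    }

    /*
     * The real thread_unfreeze_base_pri() also reports whether the base pri
     * changed while frozen (the caller parks if so); the model always says no.
     */
    static bool
    model_unfreeze_base_pri(void)
    {
        assert(base_pri_frozen); /* double-thaw would be a bug */
        base_pri_frozen = false;
        return false;
    }

    static void
    model_select_or_park(bool have_request)
    {
        model_freeze_base_pri();
        if (have_request) {
            /* Bind path: in the kernel the thaw happens later, on the way
             * back to user space; the model thaws immediately to keep the
             * invariant visible. */
            (void)model_unfreeze_base_pri();
            return;
        }
        /* Park path: thaw before parking (the `park` label above). */
        (void)model_unfreeze_base_pri();
    }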
 
 /**
@@ -3161,16 +3338,14 @@ again:
  *   Either way, the thread request object serviced will be moved to state
  *   BINDING and attached to the uthread.
  *
- *   Should be called with the workqueue lock held.  Will drop it.
+ * Should be called with the workqueue lock held.  Will drop it.
+ * Should be called with the base pri not frozen.
  */
 __attribute__((noreturn, noinline))
 static void
-workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
-    struct uthread *uth)
+workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
+    struct uthread *uth, uint32_t setup_flags)
 {
-       uint32_t setup_flags = 0;
-       workq_threadreq_t req;
-
        if (uth->uu_workq_flags & UT_WORKQ_EARLY_BOUND) {
                if (uth->uu_workq_flags & UT_WORKQ_NEW) {
                        setup_flags |= WQ_SETUP_FIRST_USE;
@@ -3179,33 +3354,17 @@ workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
                /*
                 * This pointer is possibly freed and only used for tracing purposes.
                 */
-               req = uth->uu_save.uus_workq_park_data.thread_request;
+               workq_threadreq_t req = uth->uu_save.uus_workq_park_data.thread_request;
                workq_unlock(wq);
                WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
                    VM_KERNEL_ADDRHIDE(req), 0, 0, 0);
-               goto run;
-       } else if (_wq_exiting(wq)) {
-               WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
-       } else if (wq->wq_reqcount == 0) {
-               WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0, 0);
-       } else if ((req = workq_threadreq_select(wq, uth)) == NULL) {
-               WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0, 0);
-       } else {
-               WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
-                   workq_trace_req_id(req), 0, 0, 0);
-               if (uth->uu_workq_flags & UT_WORKQ_NEW) {
-                       uth->uu_workq_flags ^= UT_WORKQ_NEW;
-                       setup_flags |= WQ_SETUP_FIRST_USE;
-               }
-               workq_thread_reset_cpupercent(req, uth);
-               workq_threadreq_bind_and_unlock(p, wq, req, uth);
-run:
+               (void)req;
                workq_setup_and_run(p, uth, setup_flags);
                __builtin_unreachable();
        }
 
-       workq_park_and_unlock(p, wq, uth);
-       __builtin_unreachable();
+       thread_freeze_base_pri(uth->uu_thread);
+       workq_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
 }
 
 static bool
@@ -3250,7 +3409,8 @@ __attribute__((noreturn, noinline))
 static void
 workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused)
 {
-       struct uthread *uth = current_uthread();
+       thread_t th = current_thread();
+       struct uthread *uth = get_bsdthread_info(th);
        proc_t p = current_proc();
        struct workqueue *wq = proc_get_wqptr_fast(p);
 
@@ -3270,7 +3430,7 @@ workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused)
        }
 
        if (__probable(uth->uu_workq_flags & UT_WORKQ_RUNNING)) {
-               workq_select_threadreq_or_park_and_unlock(p, wq, uth);
+               workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, WQ_SETUP_NONE);
                __builtin_unreachable();
        }
 
@@ -3294,7 +3454,7 @@ workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused)
        }
 
        workq_unpark_for_death_and_unlock(p, wq, uth,
-           WORKQ_UNPARK_FOR_DEATH_WAS_IDLE);
+           WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, WQ_SETUP_NONE);
        __builtin_unreachable();
 }
 
@@ -3490,4 +3650,7 @@ workq_init(void)
            NSEC_PER_USEC, &wq_reduce_pool_window.abstime);
        clock_interval_to_absolutetime_interval(wq_max_timer_interval.usecs,
            NSEC_PER_USEC, &wq_max_timer_interval.abstime);
+
+       thread_deallocate_daemon_register_queue(&workq_deallocate_queue,
+           workq_deallocate_queue_invoke);
 }
index c2a67f5e74c26ec6bbfd8b79cc3bba57a5b7ee4d..f7ed3080cf87e61f04b076e7611386a88675a9b7 100644 (file)
@@ -67,6 +67,7 @@
 #define WORKQUEUE_CONSTRAINED_FACTOR 5
 
 #if BSD_KERNEL_PRIVATE
+#include <kern/mpsc_queue.h>
 #include <kern/priority_queue.h>
 #include <kern/thread_call.h>
 #include <kern/turnstile.h>
@@ -96,33 +97,96 @@ typedef union workq_threadreq_param_s {
 } workq_threadreq_param_t;
 
 #define TRP_PRIORITY            0x1
-#define TRP_POLICY                      0x2
+#define TRP_POLICY              0x2
 #define TRP_CPUPERCENT          0x4
 #define TRP_RELEASED            0x8000
 
+/*!
+ * @enum workq_tr_state_t
+ *
+ * @brief
+ * This enum represents the state of a workq thread request.
+ *
+ * @discussion
+ * The states are used and set by both kevent and the workq subsystem under very
+ * precise locking domains.
+ *
+ * For kevent requests, this structure is embedded in the kqueue itself;
+ * for non-kevent thread requests, it is allocated separately.
+ *
+ * Only the BINDING state isn't set under the kqlock, but then only QUEUED could
+ * be read by kqueue in its stead.
+ *
+ * @const WORKQ_TR_STATE_IDLE
+ * This thread request is idle.
+ * The state is only transient for non-kevent thread requests.
+ * Set under the kqlock (kevent) or after allocation (workq).
+ *
+ * tr_entry/tr_thread are unused.
+ *
+ * @const WORKQ_TR_STATE_NEW
+ * This thread request is being initialized. This state is transient.
+ * Set under the workq lock for all kinds, and under the kqlock too for kevent requests.
+ *
+ * tr_entry is initialized, tr_thread is unused.
+ *
+ * @const WORKQ_TR_STATE_QUEUED
+ * This thread request has been pended, waiting for a thread to be bound.
+ * Set under the workq lock for all kinds, and under the kqlock too for kevent requests.
+ *
+ * tr_entry is used as linkage in a workq priority queue, tr_thread is unused.
+ *
+ * @const WORKQ_TR_STATE_CANCELED
+ * When the process exits, queued thread requests are marked canceled.
+ * This happens under the workqueue lock.
+ *
+ * @const WORKQ_TR_STATE_BINDING (kevent only)
+ * A thread was found to bind to the thread request.
+ * The bind is preposted this way under the workq lock and will be
+ * acknowledged by the kevent subsystem.
+ *
+ * tr_entry is unused, tr_thread is the thread we're binding to.
+ *
+ * @const WORKQ_TR_STATE_BOUND (kevent only)
+ * A thread bind has been acknowledged by the kevent subsystem.
+ * This is always set under the kqlock, sometimes also under the workq lock.
+ *
+ * tr_entry is unused, tr_thread is the thread we're bound to.
+ */
+__enum_decl(workq_tr_state_t, uint8_t, {
+       WORKQ_TR_STATE_IDLE        = 0, /* request isn't in flight       */
+       WORKQ_TR_STATE_NEW         = 1, /* request is being initiated    */
+       WORKQ_TR_STATE_QUEUED      = 2, /* request is being queued       */
+       WORKQ_TR_STATE_CANCELED    = 3, /* request is canceled           */
+       WORKQ_TR_STATE_BINDING     = 4, /* request is preposted for bind */
+       WORKQ_TR_STATE_BOUND       = 5, /* request is bound to a thread  */
+});
+
+__options_decl(workq_tr_flags_t, uint8_t, {
+       WORKQ_TR_FLAG_KEVENT         = 0x01,
+       WORKQ_TR_FLAG_WORKLOOP       = 0x02,
+       WORKQ_TR_FLAG_OVERCOMMIT     = 0x04,
+       WORKQ_TR_FLAG_WL_PARAMS      = 0x08,
+       WORKQ_TR_FLAG_WL_OUTSIDE_QOS = 0x10,
+});
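One plausible reading of the transitions documented above, condensed into a toy validator. The state names mirror workq_tr_state_t, but the helper and the exact edge set are illustrative assumptions, not kernel code:

    #include <stdbool.h>

    enum tr_state { IDLE, NEW, QUEUED, CANCELED, BINDING, BOUND };

    static bool
    tr_transition_ok(enum tr_state from, enum tr_state to)
    {
        switch (from) {
        case IDLE:     return to == NEW;                        /* request initiated    */
        case NEW:      return to == QUEUED;                     /* enqueued on a pq     */
        case QUEUED:   return to == BINDING || to == CANCELED;  /* bind prepost / exit  */
        case BINDING:  return to == BOUND;                      /* kevent acknowledges  */
        case BOUND:    return to == IDLE;                       /* unbind, then reusable */
        case CANCELED: return false;                            /* terminal             */
        }
        return false;
    }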
+
 typedef struct workq_threadreq_s {
        union {
                struct priority_queue_entry tr_entry;
-               thread_t tr_binding_thread;
+               thread_t tr_thread;
        };
-       uint32_t     tr_flags;
-       uint8_t      tr_state;
-       thread_qos_t tr_qos;
-       uint16_t     tr_count;
-} *workq_threadreq_t;
-
-TAILQ_HEAD(threadreq_head, workq_threadreq_s);
+       uint16_t           tr_count;
+       workq_tr_flags_t   tr_flags;
+       workq_tr_state_t   tr_state;
+       thread_qos_t       tr_qos;                 /* qos for the thread request */
 
-#define TR_STATE_IDLE           0  /* request isn't in flight       */
-#define TR_STATE_NEW            1  /* request is being initiated    */
-#define TR_STATE_QUEUED         2  /* request is being queued       */
-#define TR_STATE_BINDING        4  /* request is preposted for bind */
+       /* kqueue states, modified under the kqlock */
+       kq_index_t         tr_kq_override_index;   /* highest wakeup override index */
+       kq_index_t         tr_kq_qos_index;        /* QoS for the servicer */
+       bool               tr_kq_wakeup;           /* an event has fired */
+} workq_threadreq_s, *workq_threadreq_t;
 
-#define TR_FLAG_KEVENT                  0x01
-#define TR_FLAG_WORKLOOP                0x02
-#define TR_FLAG_OVERCOMMIT              0x04
-#define TR_FLAG_WL_PARAMS               0x08
-#define TR_FLAG_WL_OUTSIDE_QOS  0x10
+TAILQ_HEAD(threadreq_head, workq_threadreq_s);
 
 #if defined(__LP64__)
 typedef unsigned __int128 wq_thactive_t;
@@ -130,7 +194,7 @@ typedef unsigned __int128 wq_thactive_t;
 typedef uint64_t wq_thactive_t;
 #endif
 
-typedef enum {
+__options_decl(workq_state_flags_t, uint32_t, {
        WQ_EXITING                  = 0x0001,
        WQ_PROC_SUSPENDED           = 0x0002,
        WQ_DEATH_CALL_SCHEDULED     = 0x0004,
@@ -139,7 +203,7 @@ typedef enum {
        WQ_DELAYED_CALL_PENDED      = 0x0020,
        WQ_IMMEDIATE_CALL_SCHEDULED = 0x0040,
        WQ_IMMEDIATE_CALL_PENDED    = 0x0080,
-} workq_state_flags_t;
+});
 
 TAILQ_HEAD(workq_uthread_head, uthread);
 
@@ -147,7 +211,11 @@ struct workqueue {
        thread_call_t   wq_delayed_call;
        thread_call_t   wq_immediate_call;
        thread_call_t   wq_death_call;
-       struct turnstile *wq_turnstile;
+
+       union {
+               struct turnstile *wq_turnstile;
+               struct mpsc_queue_chain wq_destroy_link;
+       };
 
        lck_spin_t      wq_lock;
 
@@ -171,6 +239,7 @@ struct workqueue {
 
        struct proc    *wq_proc;
        struct uthread *wq_creator;
+       turnstile_inheritor_t wq_inheritor;
        thread_t wq_turnstile_updater; // thread doing a turnstile_update_inheritor
        struct workq_uthread_head wq_thrunlist;
        struct workq_uthread_head wq_thnewlist;
@@ -182,9 +251,6 @@ struct workqueue {
        workq_threadreq_t wq_event_manager_threadreq;
 };
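The new union in struct workqueue makes an existing trick explicit: once a workqueue is queued for destruction its turnstile pointer is dead, so the same storage doubles as MPSC-queue linkage (which is also why the old static_assert about casting past wq_lock can be deleted below). The idea in miniature, with hypothetical types:

    struct mpsc_link { struct mpsc_link *next; };

    struct object {
        union {
            void            *live_resource; /* valid while the object is alive    */
            struct mpsc_link destroy_link;  /* valid only once queued to be freed */
        };
    };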
 
-static_assert(offsetof(struct workqueue, wq_lock) >= sizeof(struct queue_entry),
-    "Make sure workq_deallocate_enqueue can cast the workqueue");
-
 #define WORKQUEUE_MAXTHREADS            512
 #define WQ_STALLED_WINDOW_USECS         200
 #define WQ_REDUCE_POOL_WINDOW_USECS     5000000
@@ -192,7 +258,7 @@ static_assert(offsetof(struct workqueue, wq_lock) >= sizeof(struct queue_entry),
 
 #pragma mark definitions
 
-struct kqrequest;
+struct workq_threadreq_s;
 uint32_t _get_pwq_state_kdp(proc_t p);
 
 void workq_exit(struct proc *p);
@@ -200,34 +266,34 @@ void workq_mark_exiting(struct proc *p);
 
 bool workq_is_exiting(struct proc *p);
 
-struct turnstile *workq_turnstile(struct proc *p);
-
-void workq_thread_set_max_qos(struct proc *p, struct kqrequest *kqr);
+void workq_thread_set_max_qos(struct proc *p, struct workq_threadreq_s *kqr);
 
 void workq_thread_terminate(struct proc *p, struct uthread *uth);
 
-#define WORKQ_THREADREQ_SET_AST_ON_FAILURE  0x01
-#define WORKQ_THREADREQ_ATTEMPT_REBIND      0x02
-#define WORKQ_THREADREQ_CAN_CREATE_THREADS  0x04
-#define WORKQ_THREADREQ_CREATOR_TRANSFER    0x08
-#define WORKQ_THREADREQ_CREATOR_SYNC_UPDATE 0x10
+__options_decl(workq_kern_threadreq_flags_t, uint32_t, {
+       WORKQ_THREADREQ_NONE                = 0x00,
+       WORKQ_THREADREQ_SET_AST_ON_FAILURE  = 0x01,
+       WORKQ_THREADREQ_ATTEMPT_REBIND      = 0x02,
+       WORKQ_THREADREQ_CAN_CREATE_THREADS  = 0x04,
+       WORKQ_THREADREQ_MAKE_OVERCOMMIT     = 0x08,
+});
 
 // called with the kq req lock held
-bool workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr,
-    struct turnstile *ts, thread_qos_t qos, int flags);
+bool workq_kern_threadreq_initiate(struct proc *p, struct workq_threadreq_s *kqr,
+    struct turnstile *ts, thread_qos_t qos, workq_kern_threadreq_flags_t flags);
 
 // called with the kq req lock held
-void workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr,
-    thread_qos_t qos, int flags);
+void workq_kern_threadreq_modify(struct proc *p, struct workq_threadreq_s *kqr,
+    thread_qos_t qos, workq_kern_threadreq_flags_t flags);
 
 // called with the kq req lock held
-void workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr,
+void workq_kern_threadreq_update_inheritor(struct proc *p, struct workq_threadreq_s *kqr,
     thread_t owner, struct turnstile *ts, turnstile_update_flags_t flags);
 
 void workq_kern_threadreq_lock(struct proc *p);
 void workq_kern_threadreq_unlock(struct proc *p);
 
-void workq_kern_threadreq_redrive(struct proc *p, int flags);
+void workq_kern_threadreq_redrive(struct proc *p, workq_kern_threadreq_flags_t flags);
 
 enum workq_set_self_flags {
        WORKQ_SET_SELF_QOS_FLAG = 0x1,
index f12656aac4f7429b397b5c988facc31d3a2a5e46..e4ce73082d8eb7a0eb6fe27715e9259b3ae6447f 100644 (file)
 
 #ifdef __PTHREAD_EXPOSE_INTERNALS__
 /* workq_kernreturn commands */
-#define WQOPS_THREAD_RETURN        0x04 /* parks the thread back into the kernel */
-#define WQOPS_QUEUE_NEWSPISUPP     0x10 /* this is to check for newer SPI support */
-#define WQOPS_QUEUE_REQTHREADS     0x20 /* request number of threads of a prio */
-#define WQOPS_QUEUE_REQTHREADS2    0x30 /* request a number of threads in a given priority bucket */
-#define WQOPS_THREAD_KEVENT_RETURN 0x40 /* parks the thread after delivering the passed kevent array */
-#define WQOPS_SET_EVENT_MANAGER_PRIORITY 0x80   /* max() in the provided priority in the the priority of the event manager */
-#define WQOPS_THREAD_WORKLOOP_RETURN 0x100      /* parks the thread after delivering the passed kevent array */
-#define WQOPS_SHOULD_NARROW 0x200       /* checks whether we should narrow our concurrency */
+#define WQOPS_THREAD_RETURN              0x004 /* parks the thread back into the kernel */
+#define WQOPS_QUEUE_NEWSPISUPP           0x010 /* this is to check for newer SPI support */
+#define WQOPS_QUEUE_REQTHREADS           0x020 /* request number of threads of a prio */
+#define WQOPS_QUEUE_REQTHREADS2          0x030 /* request a number of threads in a given priority bucket */
+#define WQOPS_THREAD_KEVENT_RETURN       0x040 /* parks the thread after delivering the passed kevent array */
+#define WQOPS_SET_EVENT_MANAGER_PRIORITY 0x080 /* max() the provided priority into the priority of the event manager */
+#define WQOPS_THREAD_WORKLOOP_RETURN     0x100 /* parks the thread after delivering the passed kevent array */
+#define WQOPS_SHOULD_NARROW              0x200 /* checks whether we should narrow our concurrency */
+#define WQOPS_SETUP_DISPATCH             0x400 /* setup pthread workqueue-related operations */
 
 /* flag values for upcall flags field, only 8 bits per struct threadlist */
 #define WQ_FLAG_THREAD_PRIO_SCHED               0x00008000
@@ -53,7 +54,7 @@
 #define WQ_FLAG_THREAD_REUSE                    0x00020000  /* thread is being reused */
 #define WQ_FLAG_THREAD_NEWSPI                   0x00040000  /* the call is with new SPIs */
 #define WQ_FLAG_THREAD_KEVENT                   0x00080000  /* thread is response to kevent req */
-#define WQ_FLAG_THREAD_EVENT_MANAGER    0x00100000  /* event manager thread */
+#define WQ_FLAG_THREAD_EVENT_MANAGER            0x00100000  /* event manager thread */
 #define WQ_FLAG_THREAD_TSD_BASE_SET             0x00200000  /* tsd base has already been set */
 #define WQ_FLAG_THREAD_WORKLOOP                 0x00400000  /* workloop thread */
 #define WQ_FLAG_THREAD_OUTSIDEQOS               0x00800000  /* thread qos changes should not be sent to kernel */
@@ -93,10 +94,22 @@ int
 __kqueue_workloop_ctl(uintptr_t cmd, uint64_t options, void *addr, size_t sz);
 
 /* SPI flags between WQ and workq_setup_thread in pthread.kext */
+#define WQ_SETUP_NONE           0
 #define WQ_SETUP_FIRST_USE      1
 #define WQ_SETUP_CLEAR_VOUCHER  2
 // was  WQ_SETUP_SET_SCHED_CALL 4
 #define WQ_SETUP_EXIT_THREAD    8
 
 #endif // __PTHREAD_EXPOSE_INTERNALS__
+
+#define WORKQ_DISPATCH_CONFIG_VERSION        2
+#define WORKQ_DISPATCH_MIN_SUPPORTED_VERSION 1
+#define WORKQ_DISPATCH_SUPPORTED_FLAGS       0
+struct workq_dispatch_config {
+       uint32_t wdc_version;
+       uint32_t wdc_flags;
+       uint64_t wdc_queue_serialno_offs;
+       uint64_t wdc_queue_label_offs;
+} __attribute__((packed, aligned(4)));
+
 #endif // _PTHREAD_WORKQUEUE_PRIVATE_H_
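Taken together with the WQOPS_SETUP_DISPATCH handler earlier in this diff, a userspace caller would populate workq_dispatch_config and pass its size so the kernel can copy only what both sides understand. A hedged sketch; the __workq_kernreturn wrapper shown here is an assumed private-trap signature for illustration only (userspace normally reaches workq_kernreturn() through libpthread):

    #include <stdint.h>
    #include <string.h>

    /* Mirrors the struct and constants defined in the header above. */
    struct workq_dispatch_config {
        uint32_t wdc_version;
        uint32_t wdc_flags;
        uint64_t wdc_queue_serialno_offs;
        uint64_t wdc_queue_label_offs;
    } __attribute__((packed, aligned(4)));

    #define WQOPS_SETUP_DISPATCH 0x400

    /* Assumed signature, for illustration only. */
    extern int __workq_kernreturn(int op, void *item, int arg2, int arg3);

    static int
    setup_dispatch_offsets(uint64_t serialno_offs, uint64_t label_offs)
    {
        struct workq_dispatch_config cfg;

        memset(&cfg, 0, sizeof(cfg));
        cfg.wdc_version = 2;  /* WORKQ_DISPATCH_CONFIG_VERSION */
        cfg.wdc_flags = 0;    /* WORKQ_DISPATCH_SUPPORTED_FLAGS is 0 for now */
        cfg.wdc_queue_serialno_offs = serialno_offs;
        cfg.wdc_queue_label_offs = label_offs;

        /*
         * The kernel copies MIN(sizeof(cfg), arg2) bytes and rejects unknown
         * flags or versions below the minimum, so passing the struct size
         * keeps the call compatible in both directions.
         */
        return __workq_kernreturn(WQOPS_SETUP_DISPATCH, &cfg, (int)sizeof(cfg), 0);
    }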
index 9610b52dd4d24a55a201bb8544be968f82dad9ff..18e98c0f5c1f256e8864a0d1b4660bdbfce46859 100644 (file)
@@ -1846,6 +1846,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau)
                }
                break;
 
+       case AUE_FSGETPATH_EXTENDED:
        case AUE_FSGETPATH:
                if (ARG_IS_VALID(kar, ARG_VALUE32)) {
                        tok = au_to_arg32(3, "volfsid", ar->ar_arg_value32);
@@ -2068,7 +2069,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau)
  * record is good, 0 otherwise.
  */
 int
-bsm_rec_verify(void *rec, int length)
+bsm_rec_verify(void *rec, int length, boolean_t kern_events_allowed)
 {
        /* Used to partially deserialize the buffer */
        struct hdr_tok_partial *hdr;
@@ -2105,6 +2106,10 @@ bsm_rec_verify(void *rec, int length)
                return 0;
        }
 
+       if (!kern_events_allowed && AUE_IS_A_KEVENT(ntohs(hdr->e_type))) {
+               return 0;
+       }
+
        return 1;
 }
 #endif /* CONFIG_AUDIT */
index 88626e09a1098cc129540c32f7c7a8a9915de7c4..e49285e42bae1b54e8be4d6a19dcf17da679c4a6 100644 (file)
@@ -1,6 +1,5 @@
 /*-
- * Copyright (c) 2008-2009 Apple Inc.
- * All rights reserved.
+ * Copyright (c) 2008-2019 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -45,406 +44,405 @@ struct bsm_domain {
 #define PF_NO_LOCAL_MAPPING     -600
 
 static const struct bsm_domain bsm_domains[] = {
-       { BSM_PF_UNSPEC, PF_UNSPEC },
-       { BSM_PF_LOCAL, PF_LOCAL },
-       { BSM_PF_INET, PF_INET },
-       { BSM_PF_IMPLINK,
+       { .bd_bsm_domain = BSM_PF_UNSPEC, .bd_local_domain = PF_UNSPEC },
+       { .bd_bsm_domain = BSM_PF_LOCAL, .bd_local_domain = PF_LOCAL },
+       { .bd_bsm_domain = BSM_PF_INET, .bd_local_domain = PF_INET },
+       { .bd_bsm_domain = BSM_PF_IMPLINK,
 #ifdef PF_IMPLINK
-         PF_IMPLINK
+         .bd_local_domain = PF_IMPLINK
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_PUP,
+       { .bd_bsm_domain = BSM_PF_PUP,
 #ifdef PF_PUP
-         PF_PUP
+         .bd_local_domain = PF_PUP
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_CHAOS,
+       { .bd_bsm_domain = BSM_PF_CHAOS,
 #ifdef PF_CHAOS
-         PF_CHAOS
+         .bd_local_domain = PF_CHAOS
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_NS,
+       { .bd_bsm_domain = BSM_PF_NS,
 #ifdef PF_NS
-         PF_NS
+         .bd_local_domain = PF_NS
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_NBS,
+       { .bd_bsm_domain = BSM_PF_NBS,
 #ifdef PF_NBS
-         PF_NBS
+         .bd_local_domain = PF_NBS
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_ECMA,
+       { .bd_bsm_domain = BSM_PF_ECMA,
 #ifdef PF_ECMA
-         PF_ECMA
+         .bd_local_domain = PF_ECMA
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_DATAKIT,
+       { .bd_bsm_domain = BSM_PF_DATAKIT,
 #ifdef PF_DATAKIT
-         PF_DATAKIT
+         .bd_local_domain = PF_DATAKIT
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_CCITT,
+       { .bd_bsm_domain = BSM_PF_CCITT,
 #ifdef PF_CCITT
-         PF_CCITT
+         .bd_local_domain = PF_CCITT
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_SNA, PF_SNA },
-       { BSM_PF_DECnet, PF_DECnet },
-       { BSM_PF_DLI,
+       { .bd_bsm_domain = BSM_PF_SNA, .bd_local_domain = PF_SNA },
+       { .bd_bsm_domain = BSM_PF_DECnet, .bd_local_domain = PF_DECnet },
+       { .bd_bsm_domain = BSM_PF_DLI,
 #ifdef PF_DLI
-         PF_DLI
+         .bd_local_domain = PF_DLI
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_LAT,
+       { .bd_bsm_domain = BSM_PF_LAT,
 #ifdef PF_LAT
-         PF_LAT
+         .bd_local_domain = PF_LAT
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_HYLINK,
+       { .bd_bsm_domain = BSM_PF_HYLINK,
 #ifdef PF_HYLINK
-         PF_HYLINK
+         .bd_local_domain = PF_HYLINK
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_APPLETALK, PF_APPLETALK },
-       { BSM_PF_NIT,
+       { .bd_bsm_domain = BSM_PF_APPLETALK, .bd_local_domain = PF_APPLETALK },
+       { .bd_bsm_domain = BSM_PF_NIT,
 #ifdef PF_NIT
-         PF_NIT
+         .bd_local_domain = PF_NIT
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_802,
+       { .bd_bsm_domain = BSM_PF_802,
 #ifdef PF_802
-         PF_802
+         .bd_local_domain = PF_802
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_OSI,
+       { .bd_bsm_domain = BSM_PF_OSI,
 #ifdef PF_OSI
-         PF_OSI
+         .bd_local_domain = PF_OSI
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_X25,
+       { .bd_bsm_domain = BSM_PF_X25,
 #ifdef PF_X25
-         PF_X25
+         .bd_local_domain = PF_X25
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_OSINET,
+       { .bd_bsm_domain = BSM_PF_OSINET,
 #ifdef PF_OSINET
-         PF_OSINET
+         .bd_local_domain = PF_OSINET
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_GOSIP,
+       { .bd_bsm_domain = BSM_PF_GOSIP,
 #ifdef PF_GOSIP
-         PF_GOSIP
+         .bd_local_domain = PF_GOSIP
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_IPX, PF_IPX },
-       { BSM_PF_ROUTE, PF_ROUTE },
-       { BSM_PF_LINK,
+       { .bd_bsm_domain = BSM_PF_IPX, .bd_local_domain = PF_IPX },
+       { .bd_bsm_domain = BSM_PF_ROUTE, .bd_local_domain = PF_ROUTE },
+       { .bd_bsm_domain = BSM_PF_LINK,
 #ifdef PF_LINK
-         PF_LINK
+         .bd_local_domain = PF_LINK
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_INET6, PF_INET6 },
-       { BSM_PF_KEY, PF_KEY },
-       { BSM_PF_NCA,
+       { .bd_bsm_domain = BSM_PF_KEY, .bd_local_domain = PF_KEY },
+       { .bd_bsm_domain = BSM_PF_NCA,
 #ifdef PF_NCA
-         PF_NCA
+         .bd_local_domain = PF_NCA
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_POLICY,
+       { .bd_bsm_domain = BSM_PF_POLICY,
 #ifdef PF_POLICY
-         PF_POLICY
+         .bd_local_domain = PF_POLICY
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_INET_OFFLOAD,
+       { .bd_bsm_domain = BSM_PF_INET_OFFLOAD,
 #ifdef PF_INET_OFFLOAD
-         PF_INET_OFFLOAD
+         .bd_local_domain = PF_INET_OFFLOAD
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_NETBIOS,
+       { .bd_bsm_domain = BSM_PF_NETBIOS,
 #ifdef PF_NETBIOS
-         PF_NETBIOS
+         .bd_local_domain = PF_NETBIOS
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_ISO,
+       { .bd_bsm_domain = BSM_PF_ISO,
 #ifdef PF_ISO
-         PF_ISO
+         .bd_local_domain = PF_ISO
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_XTP,
+       { .bd_bsm_domain = BSM_PF_XTP,
 #ifdef PF_XTP
-         PF_XTP
+         .bd_local_domain = PF_XTP
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_COIP,
+       { .bd_bsm_domain = BSM_PF_COIP,
 #ifdef PF_COIP
-         PF_COIP
+         .bd_local_domain = PF_COIP
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_CNT,
+       { .bd_bsm_domain = BSM_PF_CNT,
 #ifdef PF_CNT
-         PF_CNT
+         .bd_local_domain = PF_CNT
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_RTIP,
+       { .bd_bsm_domain = BSM_PF_RTIP,
 #ifdef PF_RTIP
-         PF_RTIP
+         .bd_local_domain = PF_RTIP
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_SIP,
+       { .bd_bsm_domain = BSM_PF_SIP,
 #ifdef PF_SIP
-         PF_SIP
+         .bd_local_domain = PF_SIP
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_PIP,
+       { .bd_bsm_domain = BSM_PF_PIP,
 #ifdef PF_PIP
-         PF_PIP
+         .bd_local_domain = PF_PIP
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_ISDN,
+       { .bd_bsm_domain = BSM_PF_ISDN,
 #ifdef PF_ISDN
-         PF_ISDN
+         .bd_local_domain = PF_ISDN
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_E164,
+       { .bd_bsm_domain = BSM_PF_E164,
 #ifdef PF_E164
-         PF_E164
+         .bd_local_domain = PF_E164
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_NATM,
+       { .bd_bsm_domain = BSM_PF_NATM,
 #ifdef PF_NATM
-         PF_NATM
+         .bd_local_domain = PF_NATM
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_ATM,
+       { .bd_bsm_domain = BSM_PF_ATM,
 #ifdef PF_ATM
-         PF_ATM
+         .bd_local_domain = PF_ATM
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_NETGRAPH,
+       { .bd_bsm_domain = BSM_PF_NETGRAPH,
 #ifdef PF_NETGRAPH
-         PF_NETGRAPH
+         .bd_local_domain = PF_NETGRAPH
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_SLOW,
+       { .bd_bsm_domain = BSM_PF_SLOW,
 #ifdef PF_SLOW
-         PF_SLOW
+         .bd_local_domain = PF_SLOW
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_SCLUSTER,
+       { .bd_bsm_domain = BSM_PF_SCLUSTER,
 #ifdef PF_SCLUSTER
-         PF_SCLUSTER
+         .bd_local_domain = PF_SCLUSTER
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_ARP,
+       { .bd_bsm_domain = BSM_PF_ARP,
 #ifdef PF_ARP
-         PF_ARP
+         .bd_local_domain = PF_ARP
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_BLUETOOTH,
+       { .bd_bsm_domain = BSM_PF_BLUETOOTH,
 #ifdef PF_BLUETOOTH
-         PF_BLUETOOTH
+         .bd_local_domain = PF_BLUETOOTH
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_IEEE80211,
+       { .bd_bsm_domain = BSM_PF_IEEE80211,
 #ifdef PF_IEEE80211
-         PF_IEEE80211
+         .bd_local_domain = PF_IEEE80211
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_AX25,
+       { .bd_bsm_domain = BSM_PF_AX25,
 #ifdef PF_AX25
-         PF_AX25
+         .bd_local_domain = PF_AX25
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_ROSE,
+       { .bd_bsm_domain = BSM_PF_ROSE,
 #ifdef PF_ROSE
-         PF_ROSE
+         .bd_local_domain = PF_ROSE
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_NETBEUI,
+       { .bd_bsm_domain = BSM_PF_NETBEUI,
 #ifdef PF_NETBEUI
-         PF_NETBEUI
+         .bd_local_domain = PF_NETBEUI
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_SECURITY,
+       { .bd_bsm_domain = BSM_PF_SECURITY,
 #ifdef PF_SECURITY
-         PF_SECURITY
+         .bd_local_domain = PF_SECURITY
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_PACKET,
+       { .bd_bsm_domain = BSM_PF_PACKET,
 #ifdef PF_PACKET
-         PF_PACKET
+         .bd_local_domain = PF_PACKET
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_ASH,
+       { .bd_bsm_domain = BSM_PF_ASH,
 #ifdef PF_ASH
-         PF_ASH
+         .bd_local_domain = PF_ASH
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_ECONET,
+       { .bd_bsm_domain = BSM_PF_ECONET,
 #ifdef PF_ECONET
-         PF_ECONET
+         .bd_local_domain = PF_ECONET
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_ATMSVC,
+       { .bd_bsm_domain = BSM_PF_ATMSVC,
 #ifdef PF_ATMSVC
-         PF_ATMSVC
+         .bd_local_domain = PF_ATMSVC
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_IRDA,
+       { .bd_bsm_domain = BSM_PF_IRDA,
 #ifdef PF_IRDA
-         PF_IRDA
+         .bd_local_domain = PF_IRDA
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_PPPOX,
+       { .bd_bsm_domain = BSM_PF_PPPOX,
 #ifdef PF_PPPOX
-         PF_PPPOX
+         .bd_local_domain = PF_PPPOX
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_WANPIPE,
+       { .bd_bsm_domain = BSM_PF_WANPIPE,
 #ifdef PF_WANPIPE
-         PF_WANPIPE
+         .bd_local_domain = PF_WANPIPE
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_LLC,
+       { .bd_bsm_domain = BSM_PF_LLC,
 #ifdef PF_LLC
-         PF_LLC
+         .bd_local_domain = PF_LLC
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_CAN,
+       { .bd_bsm_domain = BSM_PF_CAN,
 #ifdef PF_CAN
-         PF_CAN
+         .bd_local_domain = PF_CAN
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_TIPC,
+       { .bd_bsm_domain = BSM_PF_TIPC,
 #ifdef PF_TIPC
-         PF_TIPC
+         .bd_local_domain = PF_TIPC
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_IUCV,
+       { .bd_bsm_domain = BSM_PF_IUCV,
 #ifdef PF_IUCV
-         PF_IUCV
+         .bd_local_domain = PF_IUCV
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_RXRPC,
+       { .bd_bsm_domain = BSM_PF_RXRPC,
 #ifdef PF_RXRPC
-         PF_RXRPC
+         .bd_local_domain = PF_RXRPC
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
-       { BSM_PF_PHONET,
+       { .bd_bsm_domain = BSM_PF_PHONET,
 #ifdef PF_PHONET
-         PF_PHONET
+         .bd_local_domain = PF_PHONET
 #else
-         PF_NO_LOCAL_MAPPING
+         .bd_local_domain = PF_NO_LOCAL_MAPPING
 #endif
        },
 };
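
The table above pairs each wire-format BSM domain code with the local PF_* constant, falling back to PF_NO_LOCAL_MAPPING on platforms that lack the protocol family; the designated initializers make that pairing explicit without changing any values. A minimal sketch of how such a table is typically walked (the function name is illustrative; the array and field names are the ones initialized above):

/* Illustrative consumer of the bsm_domains table: translate a BSM
 * domain code from an audit record into the local PF_* value, or
 * PF_NO_LOCAL_MAPPING when this platform has no such protocol family. */
static int
bsm_domain_to_local(u_short bsm_domain)
{
	size_t i;

	for (i = 0; i < sizeof(bsm_domains) / sizeof(bsm_domains[0]); i++) {
		if (bsm_domains[i].bd_bsm_domain == bsm_domain) {
			return (bsm_domains[i].bd_local_domain);
		}
	}
	return (PF_NO_LOCAL_MAPPING);
}
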
index ecdbc71dc3b3ea09f0e1320e082a9a3857584506..a1b1b77fb9d2c5b2c2a796373d250b7465162b91 100644 (file)
@@ -1,6 +1,5 @@
 /*-
- * Copyright (c) 2008-2011 Apple Inc.
- * All rights reserved.
+ * Copyright (c) 2008-2019 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -58,7 +57,7 @@ struct bsm_errno {
 #define ERRNO_NO_LOCAL_MAPPING  -600
 
 #if !defined(KERNEL) && !defined(_KERNEL)
-#define ES(x)   x
+#define ES(x)   .be_strerror = x
 #else
 #define ES(x)
 #endif
@@ -79,511 +78,511 @@ struct bsm_errno {
  * string using strerror(3).
  */
 static const struct bsm_errno bsm_errnos[] = {
-       { BSM_ERRNO_ESUCCESS, 0, ES("Success") },
-       { BSM_ERRNO_EPERM, EPERM, ES("Operation not permitted") },
-       { BSM_ERRNO_ENOENT, ENOENT, ES("No such file or directory") },
-       { BSM_ERRNO_ESRCH, ESRCH, ES("No such process") },
-       { BSM_ERRNO_EINTR, EINTR, ES("Interrupted system call") },
-       { BSM_ERRNO_EIO, EIO, ES("Input/output error") },
-       { BSM_ERRNO_ENXIO, ENXIO, ES("Device not configured") },
-       { BSM_ERRNO_E2BIG, E2BIG, ES("Argument list too long") },
-       { BSM_ERRNO_ENOEXEC, ENOEXEC, ES("Exec format error") },
-       { BSM_ERRNO_EBADF, EBADF, ES("Bad file descriptor") },
-       { BSM_ERRNO_ECHILD, ECHILD, ES("No child processes") },
-       { BSM_ERRNO_EAGAIN, EAGAIN, ES("Resource temporarily unavailable") },
-       { BSM_ERRNO_ENOMEM, ENOMEM, ES("Cannot allocate memory") },
-       { BSM_ERRNO_EACCES, EACCES, ES("Permission denied") },
-       { BSM_ERRNO_EFAULT, EFAULT, ES("Bad address") },
-       { BSM_ERRNO_ENOTBLK, ENOTBLK, ES("Block device required") },
-       { BSM_ERRNO_EBUSY, EBUSY, ES("Device busy") },
-       { BSM_ERRNO_EEXIST, EEXIST, ES("File exists") },
-       { BSM_ERRNO_EXDEV, EXDEV, ES("Cross-device link") },
-       { BSM_ERRNO_ENODEV, ENODEV, ES("Operation not supported by device") },
-       { BSM_ERRNO_ENOTDIR, ENOTDIR, ES("Not a directory") },
-       { BSM_ERRNO_EISDIR, EISDIR, ES("Is a directory") },
-       { BSM_ERRNO_EINVAL, EINVAL, ES("Invalid argument") },
-       { BSM_ERRNO_ENFILE, ENFILE, ES("Too many open files in system") },
-       { BSM_ERRNO_EMFILE, EMFILE, ES("Too many open files") },
-       { BSM_ERRNO_ENOTTY, ENOTTY, ES("Inappropriate ioctl for device") },
-       { BSM_ERRNO_ETXTBSY, ETXTBSY, ES("Text file busy") },
-       { BSM_ERRNO_EFBIG, EFBIG, ES("File too large") },
-       { BSM_ERRNO_ENOSPC, ENOSPC, ES("No space left on device") },
-       { BSM_ERRNO_ESPIPE, ESPIPE, ES("Illegal seek") },
-       { BSM_ERRNO_EROFS, EROFS, ES("Read-only file system") },
-       { BSM_ERRNO_EMLINK, EMLINK, ES("Too many links") },
-       { BSM_ERRNO_EPIPE, EPIPE, ES("Broken pipe") },
-       { BSM_ERRNO_EDOM, EDOM, ES("Numerical argument out of domain") },
-       { BSM_ERRNO_ERANGE, ERANGE, ES("Result too large") },
-       { BSM_ERRNO_ENOMSG, ENOMSG, ES("No message of desired type") },
-       { BSM_ERRNO_EIDRM, EIDRM, ES("Identifier removed") },
-       { BSM_ERRNO_ECHRNG,
+       { .be_bsm_errno = BSM_ERRNO_ESUCCESS, .be_local_errno = 0, ES("Success") },
+       { .be_bsm_errno = BSM_ERRNO_EPERM, .be_local_errno = EPERM, ES("Operation not permitted") },
+       { .be_bsm_errno = BSM_ERRNO_ENOENT, .be_local_errno = ENOENT, ES("No such file or directory") },
+       { .be_bsm_errno = BSM_ERRNO_ESRCH, .be_local_errno = ESRCH, ES("No such process") },
+       { .be_bsm_errno = BSM_ERRNO_EINTR, .be_local_errno = EINTR, ES("Interrupted system call") },
+       { .be_bsm_errno = BSM_ERRNO_EIO, .be_local_errno = EIO, ES("Input/output error") },
+       { .be_bsm_errno = BSM_ERRNO_ENXIO, .be_local_errno = ENXIO, ES("Device not configured") },
+       { .be_bsm_errno = BSM_ERRNO_E2BIG, .be_local_errno = E2BIG, ES("Argument list too long") },
+       { .be_bsm_errno = BSM_ERRNO_ENOEXEC, .be_local_errno = ENOEXEC, ES("Exec format error") },
+       { .be_bsm_errno = BSM_ERRNO_EBADF, .be_local_errno = EBADF, ES("Bad file descriptor") },
+       { .be_bsm_errno = BSM_ERRNO_ECHILD, .be_local_errno = ECHILD, ES("No child processes") },
+       { .be_bsm_errno = BSM_ERRNO_EAGAIN, .be_local_errno = EAGAIN, ES("Resource temporarily unavailable") },
+       { .be_bsm_errno = BSM_ERRNO_ENOMEM, .be_local_errno = ENOMEM, ES("Cannot allocate memory") },
+       { .be_bsm_errno = BSM_ERRNO_EACCES, .be_local_errno = EACCES, ES("Permission denied") },
+       { .be_bsm_errno = BSM_ERRNO_EFAULT, .be_local_errno = EFAULT, ES("Bad address") },
+       { .be_bsm_errno = BSM_ERRNO_ENOTBLK, .be_local_errno = ENOTBLK, ES("Block device required") },
+       { .be_bsm_errno = BSM_ERRNO_EBUSY, .be_local_errno = EBUSY, ES("Device busy") },
+       { .be_bsm_errno = BSM_ERRNO_EEXIST, .be_local_errno = EEXIST, ES("File exists") },
+       { .be_bsm_errno = BSM_ERRNO_EXDEV, .be_local_errno = EXDEV, ES("Cross-device link") },
+       { .be_bsm_errno = BSM_ERRNO_ENODEV, .be_local_errno = ENODEV, ES("Operation not supported by device") },
+       { .be_bsm_errno = BSM_ERRNO_ENOTDIR, .be_local_errno = ENOTDIR, ES("Not a directory") },
+       { .be_bsm_errno = BSM_ERRNO_EISDIR, .be_local_errno = EISDIR, ES("Is a directory") },
+       { .be_bsm_errno = BSM_ERRNO_EINVAL, .be_local_errno = EINVAL, ES("Invalid argument") },
+       { .be_bsm_errno = BSM_ERRNO_ENFILE, .be_local_errno = ENFILE, ES("Too many open files in system") },
+       { .be_bsm_errno = BSM_ERRNO_EMFILE, .be_local_errno = EMFILE, ES("Too many open files") },
+       { .be_bsm_errno = BSM_ERRNO_ENOTTY, .be_local_errno = ENOTTY, ES("Inappropriate ioctl for device") },
+       { .be_bsm_errno = BSM_ERRNO_ETXTBSY, .be_local_errno = ETXTBSY, ES("Text file busy") },
+       { .be_bsm_errno = BSM_ERRNO_EFBIG, .be_local_errno = EFBIG, ES("File too large") },
+       { .be_bsm_errno = BSM_ERRNO_ENOSPC, .be_local_errno = ENOSPC, ES("No space left on device") },
+       { .be_bsm_errno = BSM_ERRNO_ESPIPE, .be_local_errno = ESPIPE, ES("Illegal seek") },
+       { .be_bsm_errno = BSM_ERRNO_EROFS, .be_local_errno = EROFS, ES("Read-only file system") },
+       { .be_bsm_errno = BSM_ERRNO_EMLINK, .be_local_errno = EMLINK, ES("Too many links") },
+       { .be_bsm_errno = BSM_ERRNO_EPIPE, .be_local_errno = EPIPE, ES("Broken pipe") },
+       { .be_bsm_errno = BSM_ERRNO_EDOM, .be_local_errno = EDOM, ES("Numerical argument out of domain") },
+       { .be_bsm_errno = BSM_ERRNO_ERANGE, .be_local_errno = ERANGE, ES("Result too large") },
+       { .be_bsm_errno = BSM_ERRNO_ENOMSG, .be_local_errno = ENOMSG, ES("No message of desired type") },
+       { .be_bsm_errno = BSM_ERRNO_EIDRM, .be_local_errno = EIDRM, ES("Identifier removed") },
+       { .be_bsm_errno = BSM_ERRNO_ECHRNG,
 #ifdef ECHRNG
-         ECHRNG,
+         .be_local_errno = ECHRNG,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Channel number out of range") },
-       { BSM_ERRNO_EL2NSYNC,
+       { .be_bsm_errno = BSM_ERRNO_EL2NSYNC,
 #ifdef EL2NSYNC
-         EL2NSYNC,
+         .be_local_errno = EL2NSYNC,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Level 2 not synchronized") },
-       { BSM_ERRNO_EL3HLT,
+       { .be_bsm_errno = BSM_ERRNO_EL3HLT,
 #ifdef EL3HLT
-         EL3HLT,
+         .be_local_errno = EL3HLT,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Level 3 halted") },
-       { BSM_ERRNO_EL3RST,
+       { .be_bsm_errno = BSM_ERRNO_EL3RST,
 #ifdef EL3RST
-         EL3RST,
+         .be_local_errno = EL3RST,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Level 3 reset") },
-       { BSM_ERRNO_ELNRNG,
+       { .be_bsm_errno = BSM_ERRNO_ELNRNG,
 #ifdef ELNRNG
-         ELNRNG,
+         .be_local_errno = ELNRNG,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Link number out of range") },
-       { BSM_ERRNO_EUNATCH,
+       { .be_bsm_errno = BSM_ERRNO_EUNATCH,
 #ifdef EUNATCH
-         EUNATCH,
+         .be_local_errno = EUNATCH,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Protocol driver not attached") },
-       { BSM_ERRNO_ENOCSI,
+       { .be_bsm_errno = BSM_ERRNO_ENOCSI,
 #ifdef ENOCSI
-         ENOCSI,
+         .be_local_errno = ENOCSI,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("No CSI structure available") },
-       { BSM_ERRNO_EL2HLT,
+       { .be_bsm_errno = BSM_ERRNO_EL2HLT,
 #ifdef EL2HLT
-         EL2HLT,
+         .be_local_errno = EL2HLT,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Level 2 halted") },
-       { BSM_ERRNO_EDEADLK, EDEADLK, ES("Resource deadlock avoided") },
-       { BSM_ERRNO_ENOLCK, ENOLCK, ES("No locks available") },
-       { BSM_ERRNO_ECANCELED, ECANCELED, ES("Operation canceled") },
-       { BSM_ERRNO_ENOTSUP, ENOTSUP, ES("Operation not supported") },
-       { BSM_ERRNO_EDQUOT, EDQUOT, ES("Disc quota exceeded") },
-       { BSM_ERRNO_EBADE,
+       { .be_bsm_errno = BSM_ERRNO_EDEADLK, .be_local_errno = EDEADLK, ES("Resource deadlock avoided") },
+       { .be_bsm_errno = BSM_ERRNO_ENOLCK, .be_local_errno = ENOLCK, ES("No locks available") },
+       { .be_bsm_errno = BSM_ERRNO_ECANCELED, .be_local_errno = ECANCELED, ES("Operation canceled") },
+       { .be_bsm_errno = BSM_ERRNO_ENOTSUP, .be_local_errno = ENOTSUP, ES("Operation not supported") },
+       { .be_bsm_errno = BSM_ERRNO_EDQUOT, .be_local_errno = EDQUOT, ES("Disc quota exceeded") },
+       { .be_bsm_errno = BSM_ERRNO_EBADE,
 #ifdef EBADE
-         EBADE,
+         .be_local_errno = EBADE,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Invalid exchange") },
-       { BSM_ERRNO_EBADR,
+       { .be_bsm_errno = BSM_ERRNO_EBADR,
 #ifdef EBADR
-         EBADR,
+         .be_local_errno = EBADR,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Invalid request descriptor") },
-       { BSM_ERRNO_EXFULL,
+       { .be_bsm_errno = BSM_ERRNO_EXFULL,
 #ifdef EXFULL
-         EXFULL,
+         .be_local_errno = EXFULL,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Exchange full") },
-       { BSM_ERRNO_ENOANO,
+       { .be_bsm_errno = BSM_ERRNO_ENOANO,
 #ifdef ENOANO
-         ENOANO,
+         .be_local_errno = ENOANO,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("No anode") },
-       { BSM_ERRNO_EBADRQC,
+       { .be_bsm_errno = BSM_ERRNO_EBADRQC,
 #ifdef EBADRQC
-         EBADRQC,
+         .be_local_errno = EBADRQC,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Invalid request descriptor") },
-       { BSM_ERRNO_EBADSLT,
+       { .be_bsm_errno = BSM_ERRNO_EBADSLT,
 #ifdef EBADSLT
-         EBADSLT,
+         .be_local_errno = EBADSLT,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Invalid slot") },
-       { BSM_ERRNO_EDEADLOCK,
+       { .be_bsm_errno = BSM_ERRNO_EDEADLOCK,
 #ifdef EDEADLOCK
-         EDEADLOCK,
+         .be_local_errno = EDEADLOCK,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Resource deadlock avoided") },
-       { BSM_ERRNO_EBFONT,
+       { .be_bsm_errno = BSM_ERRNO_EBFONT,
 #ifdef EBFONT
-         EBFONT,
+         .be_local_errno = EBFONT,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Bad font file format") },
-       { BSM_ERRNO_EOWNERDEAD,
+       { .be_bsm_errno = BSM_ERRNO_EOWNERDEAD,
 #ifdef EOWNERDEAD
-         EOWNERDEAD,
+         .be_local_errno = EOWNERDEAD,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Process died with the lock") },
-       { BSM_ERRNO_ENOTRECOVERABLE,
+       { .be_bsm_errno = BSM_ERRNO_ENOTRECOVERABLE,
 #ifdef ENOTRECOVERABLE
-         ENOTRECOVERABLE,
+         .be_local_errno = ENOTRECOVERABLE,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Lock is not recoverable") },
-       { BSM_ERRNO_ENOSTR,
+       { .be_bsm_errno = BSM_ERRNO_ENOSTR,
 #ifdef ENOSTR
-         ENOSTR,
+         .be_local_errno = ENOSTR,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Device not a stream") },
-       { BSM_ERRNO_ENONET,
+       { .be_bsm_errno = BSM_ERRNO_ENONET,
 #ifdef ENONET
-         ENONET,
+         .be_local_errno = ENONET,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Machine is not on the network") },
-       { BSM_ERRNO_ENOPKG,
+       { .be_bsm_errno = BSM_ERRNO_ENOPKG,
 #ifdef ENOPKG
-         ENOPKG,
+         .be_local_errno = ENOPKG,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Package not installed") },
-       { BSM_ERRNO_EREMOTE, EREMOTE,
+       { .be_bsm_errno = BSM_ERRNO_EREMOTE, .be_local_errno = EREMOTE,
          ES("Too many levels of remote in path") },
-       { BSM_ERRNO_ENOLINK,
+       { .be_bsm_errno = BSM_ERRNO_ENOLINK,
 #ifdef ENOLINK
-         ENOLINK,
+         .be_local_errno = ENOLINK,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Link has been severed") },
-       { BSM_ERRNO_EADV,
+       { .be_bsm_errno = BSM_ERRNO_EADV,
 #ifdef EADV
-         EADV,
+         .be_local_errno = EADV,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Advertise error") },
-       { BSM_ERRNO_ESRMNT,
+       { .be_bsm_errno = BSM_ERRNO_ESRMNT,
 #ifdef ESRMNT
-         ESRMNT,
+         .be_local_errno = ESRMNT,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("srmount error") },
-       { BSM_ERRNO_ECOMM,
+       { .be_bsm_errno = BSM_ERRNO_ECOMM,
 #ifdef ECOMM
-         ECOMM,
+         .be_local_errno = ECOMM,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Communication error on send") },
-       { BSM_ERRNO_EPROTO,
+       { .be_bsm_errno = BSM_ERRNO_EPROTO,
 #ifdef EPROTO
-         EPROTO,
+         .be_local_errno = EPROTO,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Protocol error") },
-       { BSM_ERRNO_ELOCKUNMAPPED,
+       { .be_bsm_errno = BSM_ERRNO_ELOCKUNMAPPED,
 #ifdef ELOCKUNMAPPED
-         ELOCKUNMAPPED,
+         .be_local_errno = ELOCKUNMAPPED,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Locked lock was unmapped") },
-       { BSM_ERRNO_ENOTACTIVE,
+       { .be_bsm_errno = BSM_ERRNO_ENOTACTIVE,
 #ifdef ENOTACTIVE
-         ENOTACTIVE,
+         .be_local_errno = ENOTACTIVE,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Facility is not active") },
-       { BSM_ERRNO_EMULTIHOP,
+       { .be_bsm_errno = BSM_ERRNO_EMULTIHOP,
 #ifdef EMULTIHOP
-         EMULTIHOP,
+         .be_local_errno = EMULTIHOP,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Multihop attempted") },
-       { BSM_ERRNO_EBADMSG,
+       { .be_bsm_errno = BSM_ERRNO_EBADMSG,
 #ifdef EBADMSG
-         EBADMSG,
+         .be_local_errno = EBADMSG,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Bad message") },
-       { BSM_ERRNO_ENAMETOOLONG, ENAMETOOLONG, ES("File name too long") },
-       { BSM_ERRNO_EOVERFLOW, EOVERFLOW,
+       { .be_bsm_errno = BSM_ERRNO_ENAMETOOLONG, .be_local_errno = ENAMETOOLONG, ES("File name too long") },
+       { .be_bsm_errno = BSM_ERRNO_EOVERFLOW, .be_local_errno = EOVERFLOW,
          ES("Value too large to be stored in data type") },
-       { BSM_ERRNO_ENOTUNIQ,
+       { .be_bsm_errno = BSM_ERRNO_ENOTUNIQ,
 #ifdef ENOTUNIQ
-         ENOTUNIQ,
+         .be_local_errno = ENOTUNIQ,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Given log name not unique") },
-       { BSM_ERRNO_EBADFD,
+       { .be_bsm_errno = BSM_ERRNO_EBADFD,
 #ifdef EBADFD
-         EBADFD,
+         .be_local_errno = EBADFD,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Given f.d. invalid for this operation") },
-       { BSM_ERRNO_EREMCHG,
+       { .be_bsm_errno = BSM_ERRNO_EREMCHG,
 #ifdef EREMCHG
-         EREMCHG,
+         .be_local_errno = EREMCHG,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Remote address changed") },
-       { BSM_ERRNO_ELIBACC,
+       { .be_bsm_errno = BSM_ERRNO_ELIBACC,
 #ifdef ELIBACC
-         ELIBACC,
+         .be_local_errno = ELIBACC,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Can't access a needed shared lib") },
-       { BSM_ERRNO_ELIBBAD,
+       { .be_bsm_errno = BSM_ERRNO_ELIBBAD,
 #ifdef ELIBBAD
-         ELIBBAD,
+         .be_local_errno = ELIBBAD,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Accessing a corrupted shared lib") },
-       { BSM_ERRNO_ELIBSCN,
+       { .be_bsm_errno = BSM_ERRNO_ELIBSCN,
 #ifdef ELIBSCN
-         ELIBSCN,
+         .be_local_errno = ELIBSCN,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES(".lib section in a.out corrupted") },
-       { BSM_ERRNO_ELIBMAX,
+       { .be_bsm_errno = BSM_ERRNO_ELIBMAX,
 #ifdef ELIBMAX
-         ELIBMAX,
+         .be_local_errno = ELIBMAX,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Attempting to link in too many libs") },
-       { BSM_ERRNO_ELIBEXEC,
+       { .be_bsm_errno = BSM_ERRNO_ELIBEXEC,
 #ifdef ELIBEXEC
-         ELIBEXEC,
+         .be_local_errno = ELIBEXEC,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Attempting to exec a shared library") },
-       { BSM_ERRNO_EILSEQ, EILSEQ, ES("Illegal byte sequence") },
-       { BSM_ERRNO_ENOSYS, ENOSYS, ES("Function not implemented") },
-       { BSM_ERRNO_ELOOP, ELOOP, ES("Too many levels of symbolic links") },
-       { BSM_ERRNO_ERESTART,
+       { .be_bsm_errno = BSM_ERRNO_EILSEQ, .be_local_errno = EILSEQ, ES("Illegal byte sequence") },
+       { .be_bsm_errno = BSM_ERRNO_ENOSYS, .be_local_errno = ENOSYS, ES("Function not implemented") },
+       { .be_bsm_errno = BSM_ERRNO_ELOOP, .be_local_errno = ELOOP, ES("Too many levels of symbolic links") },
+       { .be_bsm_errno = BSM_ERRNO_ERESTART,
 #ifdef ERESTART
-         ERESTART,
+         .be_local_errno = ERESTART,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Restart syscall") },
-       { BSM_ERRNO_ESTRPIPE,
+       { .be_bsm_errno = BSM_ERRNO_ESTRPIPE,
 #ifdef ESTRPIPE
-         ESTRPIPE,
+         .be_local_errno = ESTRPIPE,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("If pipe/FIFO, don't sleep in stream head") },
-       { BSM_ERRNO_ENOTEMPTY, ENOTEMPTY, ES("Directory not empty") },
-       { BSM_ERRNO_EUSERS, EUSERS, ES("Too many users") },
-       { BSM_ERRNO_ENOTSOCK, ENOTSOCK,
+       { .be_bsm_errno = BSM_ERRNO_ENOTEMPTY, .be_local_errno = ENOTEMPTY, ES("Directory not empty") },
+       { .be_bsm_errno = BSM_ERRNO_EUSERS, .be_local_errno = EUSERS, ES("Too many users") },
+       { .be_bsm_errno = BSM_ERRNO_ENOTSOCK, .be_local_errno = ENOTSOCK,
          ES("Socket operation on non-socket") },
-       { BSM_ERRNO_EDESTADDRREQ, EDESTADDRREQ,
+       { .be_bsm_errno = BSM_ERRNO_EDESTADDRREQ, .be_local_errno = EDESTADDRREQ,
          ES("Destination address required") },
-       { BSM_ERRNO_EMSGSIZE, EMSGSIZE, ES("Message too long") },
-       { BSM_ERRNO_EPROTOTYPE, EPROTOTYPE,
+       { .be_bsm_errno = BSM_ERRNO_EMSGSIZE, .be_local_errno = EMSGSIZE, ES("Message too long") },
+       { .be_bsm_errno = BSM_ERRNO_EPROTOTYPE, .be_local_errno = EPROTOTYPE,
          ES("Protocol wrong type for socket") },
-       { BSM_ERRNO_ENOPROTOOPT, ENOPROTOOPT, ES("Protocol not available") },
-       { BSM_ERRNO_EPROTONOSUPPORT, EPROTONOSUPPORT,
+       { .be_bsm_errno = BSM_ERRNO_ENOPROTOOPT, .be_local_errno = ENOPROTOOPT, ES("Protocol not available") },
+       { .be_bsm_errno = BSM_ERRNO_EPROTONOSUPPORT, .be_local_errno = EPROTONOSUPPORT,
          ES("Protocol not supported") },
-       { BSM_ERRNO_ESOCKTNOSUPPORT, ESOCKTNOSUPPORT,
+       { .be_bsm_errno = BSM_ERRNO_ESOCKTNOSUPPORT, .be_local_errno = ESOCKTNOSUPPORT,
          ES("Socket type not supported") },
-       { BSM_ERRNO_EOPNOTSUPP, EOPNOTSUPP, ES("Operation not supported") },
-       { BSM_ERRNO_EPFNOSUPPORT, EPFNOSUPPORT,
+       { .be_bsm_errno = BSM_ERRNO_EOPNOTSUPP, .be_local_errno = EOPNOTSUPP, ES("Operation not supported") },
+       { .be_bsm_errno = BSM_ERRNO_EPFNOSUPPORT, .be_local_errno = EPFNOSUPPORT,
          ES("Protocol family not supported") },
-       { BSM_ERRNO_EAFNOSUPPORT, EAFNOSUPPORT,
+       { .be_bsm_errno = BSM_ERRNO_EAFNOSUPPORT, .be_local_errno = EAFNOSUPPORT,
          ES("Address family not supported by protocol family") },
-       { BSM_ERRNO_EADDRINUSE, EADDRINUSE, ES("Address already in use") },
-       { BSM_ERRNO_EADDRNOTAVAIL, EADDRNOTAVAIL,
+       { .be_bsm_errno = BSM_ERRNO_EADDRINUSE, .be_local_errno = EADDRINUSE, ES("Address already in use") },
+       { .be_bsm_errno = BSM_ERRNO_EADDRNOTAVAIL, .be_local_errno = EADDRNOTAVAIL,
          ES("Can't assign requested address") },
-       { BSM_ERRNO_ENETDOWN, ENETDOWN, ES("Network is down") },
-       { BSM_ERRNO_ENETRESET, ENETRESET,
+       { .be_bsm_errno = BSM_ERRNO_ENETDOWN, .be_local_errno = ENETDOWN, ES("Network is down") },
+       { .be_bsm_errno = BSM_ERRNO_ENETRESET, .be_local_errno = ENETRESET,
          ES("Network dropped connection on reset") },
-       { BSM_ERRNO_ECONNABORTED, ECONNABORTED,
+       { .be_bsm_errno = BSM_ERRNO_ECONNABORTED, .be_local_errno = ECONNABORTED,
          ES("Software caused connection abort") },
-       { BSM_ERRNO_ECONNRESET, ECONNRESET, ES("Connection reset by peer") },
-       { BSM_ERRNO_ENOBUFS, ENOBUFS, ES("No buffer space available") },
-       { BSM_ERRNO_EISCONN, EISCONN, ES("Socket is already connected") },
-       { BSM_ERRNO_ENOTCONN, ENOTCONN, ES("Socket is not connected") },
-       { BSM_ERRNO_ESHUTDOWN, ESHUTDOWN,
+       { .be_bsm_errno = BSM_ERRNO_ECONNRESET, .be_local_errno = ECONNRESET, ES("Connection reset by peer") },
+       { .be_bsm_errno = BSM_ERRNO_ENOBUFS, .be_local_errno = ENOBUFS, ES("No buffer space available") },
+       { .be_bsm_errno = BSM_ERRNO_EISCONN, .be_local_errno = EISCONN, ES("Socket is already connected") },
+       { .be_bsm_errno = BSM_ERRNO_ENOTCONN, .be_local_errno = ENOTCONN, ES("Socket is not connected") },
+       { .be_bsm_errno = BSM_ERRNO_ESHUTDOWN, .be_local_errno = ESHUTDOWN,
          ES("Can't send after socket shutdown") },
-       { BSM_ERRNO_ETOOMANYREFS, ETOOMANYREFS,
+       { .be_bsm_errno = BSM_ERRNO_ETOOMANYREFS, .be_local_errno = ETOOMANYREFS,
          ES("Too many references: can't splice") },
-       { BSM_ERRNO_ETIMEDOUT, ETIMEDOUT, ES("Operation timed out") },
-       { BSM_ERRNO_ECONNREFUSED, ECONNREFUSED, ES("Connection refused") },
-       { BSM_ERRNO_EHOSTDOWN, EHOSTDOWN, ES("Host is down") },
-       { BSM_ERRNO_EHOSTUNREACH, EHOSTUNREACH, ES("No route to host") },
-       { BSM_ERRNO_EALREADY, EALREADY, ES("Operation already in progress") },
-       { BSM_ERRNO_EINPROGRESS, EINPROGRESS,
+       { .be_bsm_errno = BSM_ERRNO_ETIMEDOUT, .be_local_errno = ETIMEDOUT, ES("Operation timed out") },
+       { .be_bsm_errno = BSM_ERRNO_ECONNREFUSED, .be_local_errno = ECONNREFUSED, ES("Connection refused") },
+       { .be_bsm_errno = BSM_ERRNO_EHOSTDOWN, .be_local_errno = EHOSTDOWN, ES("Host is down") },
+       { .be_bsm_errno = BSM_ERRNO_EHOSTUNREACH, .be_local_errno = EHOSTUNREACH, ES("No route to host") },
+       { .be_bsm_errno = BSM_ERRNO_EALREADY, .be_local_errno = EALREADY, ES("Operation already in progress") },
+       { .be_bsm_errno = BSM_ERRNO_EINPROGRESS, .be_local_errno = EINPROGRESS,
          ES("Operation now in progress") },
-       { BSM_ERRNO_ESTALE, ESTALE, ES("Stale NFS file handle") },
-       { BSM_ERRNO_EQFULL, EQFULL, ES("Interface output queue is full") },
-       { BSM_ERRNO_EPWROFF,
+       { .be_bsm_errno = BSM_ERRNO_ESTALE, .be_local_errno = ESTALE, ES("Stale NFS file handle") },
+       { .be_bsm_errno = BSM_ERRNO_EQFULL, .be_local_errno = EQFULL, ES("Interface output queue is full") },
+       { .be_bsm_errno = BSM_ERRNO_EPWROFF,
 #ifdef EPWROFF
-         EPWROFF,
+         .be_local_errno = EPWROFF,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Device power is off") },
-       { BSM_ERRNO_EDEVERR,
+       { .be_bsm_errno = BSM_ERRNO_EDEVERR,
 #ifdef EDEVERR
-         EDEVERR,
+         .be_local_errno = EDEVERR,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Device error") },
-       { BSM_ERRNO_EBADEXEC,
+       { .be_bsm_errno = BSM_ERRNO_EBADEXEC,
 #ifdef EBADEXEC
-         EBADEXEC,
+         .be_local_errno = EBADEXEC,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Bad executable") },
-       { BSM_ERRNO_EBADARCH,
+       { .be_bsm_errno = BSM_ERRNO_EBADARCH,
 #ifdef EBADARCH
-         EBADARCH,
+         .be_local_errno = EBADARCH,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Bad CPU type in executable") },
-       { BSM_ERRNO_ESHLIBVERS,
+       { .be_bsm_errno = BSM_ERRNO_ESHLIBVERS,
 #ifdef ESHLIBVERS
-         ESHLIBVERS,
+         .be_local_errno = ESHLIBVERS,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Shared library version mismatch") },
-       { BSM_ERRNO_EBADMACHO,
+       { .be_bsm_errno = BSM_ERRNO_EBADMACHO,
 #ifdef EBADMACHO
-         EBADMACHO,
+         .be_local_errno = EBADMACHO,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Malformed Macho file") },
-       { BSM_ERRNO_EPOLICY,
+       { .be_bsm_errno = BSM_ERRNO_EPOLICY,
 #ifdef EPOLICY
-         EPOLICY,
+         .be_local_errno = EPOLICY,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Operation failed by policy") },
-       { BSM_ERRNO_EDOTDOT,
+       { .be_bsm_errno = BSM_ERRNO_EDOTDOT,
 #ifdef EDOTDOT
-         EDOTDOT,
+         .be_local_errno = EDOTDOT,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("RFS specific error") },
-       { BSM_ERRNO_EUCLEAN,
+       { .be_bsm_errno = BSM_ERRNO_EUCLEAN,
 #ifdef EUCLEAN
-         EUCLEAN,
+         .be_local_errno = EUCLEAN,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Structure needs cleaning") },
-       { BSM_ERRNO_ENOTNAM,
+       { .be_bsm_errno = BSM_ERRNO_ENOTNAM,
 #ifdef ENOTNAM
-         ENOTNAM,
+         .be_local_errno = ENOTNAM,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Not a XENIX named type file") },
-       { BSM_ERRNO_ENAVAIL,
+       { .be_bsm_errno = BSM_ERRNO_ENAVAIL,
 #ifdef ENAVAIL
-         ENAVAIL,
+         .be_local_errno = ENAVAIL,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("No XENIX semaphores available") },
-       { BSM_ERRNO_EISNAM,
+       { .be_bsm_errno = BSM_ERRNO_EISNAM,
 #ifdef EISNAM
-         EISNAM,
+         .be_local_errno = EISNAM,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Is a named type file") },
-       { BSM_ERRNO_EREMOTEIO,
+       { .be_bsm_errno = BSM_ERRNO_EREMOTEIO,
 #ifdef EREMOTEIO
-         EREMOTEIO,
+         .be_local_errno = EREMOTEIO,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Remote I/O error") },
-       { BSM_ERRNO_ENOMEDIUM,
+       { .be_bsm_errno = BSM_ERRNO_ENOMEDIUM,
 #ifdef ENOMEDIUM
-         ENOMEDIUM,
+         .be_local_errno = ENOMEDIUM,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("No medium found") },
-       { BSM_ERRNO_EMEDIUMTYPE,
+       { .be_bsm_errno = BSM_ERRNO_EMEDIUMTYPE,
 #ifdef EMEDIUMTYPE
-         EMEDIUMTYPE,
+         .be_local_errno = EMEDIUMTYPE,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Wrong medium type") },
-       { BSM_ERRNO_ENOKEY,
+       { .be_bsm_errno = BSM_ERRNO_ENOKEY,
 #ifdef ENOKEY
-         ENOKEY,
+         .be_local_errno = ENOKEY,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Required key not available") },
-       { BSM_ERRNO_EKEYEXPIRED,
+       { .be_bsm_errno = BSM_ERRNO_EKEYEXPIRED,
 #ifdef EKEYEXPIRED
-         EKEYEXPIRED,
+         .be_local_errno = EKEYEXPIRED,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Key has expired") },
-       { BSM_ERRNO_EKEYREVOKED,
+       { .be_bsm_errno = BSM_ERRNO_EKEYREVOKED,
 #ifdef EKEYREVOKED
-         EKEYREVOKED,
+         .be_local_errno = EKEYREVOKED,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Key has been revoked") },
-       { BSM_ERRNO_EKEYREJECTED,
+       { .be_bsm_errno = BSM_ERRNO_EKEYREJECTED,
 #ifdef EKEYREJECTED
-         EKEYREJECTED,
+         .be_local_errno = EKEYREJECTED,
 #else
-         ERRNO_NO_LOCAL_MAPPING,
+         .be_local_errno = ERRNO_NO_LOCAL_MAPPING,
 #endif
          ES("Key was rejected by service") },
 };
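
Same pattern as the domain table: each entry carries the wire-format code, the local errno (ERRNO_NO_LOCAL_MAPPING when the platform lacks it), and, in userland builds only, the strerror-style text selected by ES(). A hedged sketch of the lookup side (the function name is illustrative; OpenBSM's real entry point for this job is au_bsm_to_errno()):

/* Illustrative translation from a BSM error code carried in an audit
 * record to the local errno value, mirroring how bsm_errnos is consumed.
 * Returns 0 on success, -1 if the code is unknown or unmapped here. */
static int
bsm_errno_to_local(int bsm_err, int *errorp)
{
	size_t i;

	for (i = 0; i < sizeof(bsm_errnos) / sizeof(bsm_errnos[0]); i++) {
		if (bsm_errnos[i].be_bsm_errno != bsm_err) {
			continue;
		}
		if (bsm_errnos[i].be_local_errno == ERRNO_NO_LOCAL_MAPPING) {
			return (-1);    /* known code, no errno on this OS */
		}
		*errorp = bsm_errnos[i].be_local_errno;
		return (0);
	}
	return (-1);                    /* unknown code */
}
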
index 11454fbf42e0071ea74417a797d7f85f4e1793c1..3236e918fae2e44413a54bc3a441e2fd78b66fe7 100644 (file)
@@ -226,6 +226,9 @@ static const bsm_fcntl_cmd_t bsm_fcntl_cmdtab[] = {
 #ifdef  F_TRIM_ACTIVE_FILE
        { BSM_F_TRIM_ACTIVE_FILE, F_TRIM_ACTIVE_FILE },
 #endif
+#ifdef  F_SPECULATIVE_READ
+       { BSM_F_SPECULATIVE_READ, F_SPECULATIVE_READ },
+#endif
 
 #ifdef  FCNTL_FS_SPECIFIC_BASE
        { BSM_F_FS_SPECIFIC_0, FCNTL_FS_SPECIFIC_BASE},
index 8b34ab598bf49b9fa697d6687b80586cca5f8581..6b096473f950160fb4327fa1b7997c595758d920 100644 (file)
@@ -1084,45 +1084,6 @@ audit_pipe_poll(dev_t dev, int events, void *wql, struct proc *p)
        return revents;
 }
 
-#ifndef __APPLE__
-/*
- * Return true if there are records available for reading on the pipe.
- */
-static int
-audit_pipe_kqread(struct knote *kn, long hint)
-{
-       struct audit_pipe *ap;
-
-       ap = (struct audit_pipe *)kn->kn_hook;
-       KASSERT(ap != NULL, ("audit_pipe_kqread: ap == NULL"));
-       AUDIT_PIPE_LOCK_ASSERT(ap);
-
-       if (ap->ap_qlen != 0) {
-               kn->kn_data = ap->ap_qbyteslen - ap->ap_qoffset;
-               return 1;
-       } else {
-               kn->kn_data = 0;
-               return 0;
-       }
-}
-
-/*
- * Detach kqueue state from audit pipe.
- */
-static void
-audit_pipe_kqdetach(struct knote *kn)
-{
-       struct audit_pipe *ap;
-
-       ap = (struct audit_pipe *)kn->kn_hook;
-       KASSERT(ap != NULL, ("audit_pipe_kqdetach: ap == NULL"));
-
-       AUDIT_PIPE_LOCK(ap);
-       knlist_remove(&ap->ap_selinfo.si_note, kn, 1);
-       AUDIT_PIPE_UNLOCK(ap);
-}
-#endif /* !__APPLE__ */
-
 static void *devnode;
 
 int
index 4da799daad349c48740d07b53d56913fb9e70771..e9e2e6dec38128906294d8c8eac3c35b6fa69e62 100644 (file)
@@ -348,7 +348,7 @@ struct kaudit_record    *audit_new(int event, proc_t p, struct uthread *td);
  */
 struct au_record;
 int      kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau);
-int      bsm_rec_verify(void *rec, int length);
+int      bsm_rec_verify(void *rec, int length, boolean_t kern_events_allowed);
 
 /*
  * Kernel versions of the libbsm audit record functions.
@@ -488,6 +488,11 @@ int     audit_session_lookup(au_asid_t asid, auditinfo_addr_t *ret_aia);
  */
 #define AU_AUDITCTL_RESERVED_ENTITLEMENT "com.apple.private.protected-audit-control"
 
+/*
+ * Entitlement that lets a non-root process submit user-space records via audit(2)
+ */
+#define AU_AUDIT_USER_ENTITLEMENT "com.apple.private.audit.user"
+
 /*
  * Max sizes used by the kernel for signing id and team id values of the
  * identity tokens. These lengths include space for the null terminator.
@@ -498,8 +503,10 @@ int     audit_session_lookup(au_asid_t asid, auditinfo_addr_t *ret_aia);
 struct __attribute__((__packed__)) hdr_tok_partial {
        u_char type;
        uint32_t len;
+       u_char ver;
+       uint16_t e_type;
 };
-static_assert(sizeof(struct hdr_tok_partial) == 5);
+static_assert(sizeof(struct hdr_tok_partial) == 8);
 
 struct __attribute__((__packed__)) trl_tok_partial {
        u_char type;
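
The partial header token above grows from 5 to 8 bytes: record verification now needs the version byte and the 16-bit event type in addition to the token ID and length, and the static_assert pins the packed layout. The same check, written as a standalone C11 snippet (field comments reflect the usual BSM header layout):

#include <stdint.h>

struct __attribute__((__packed__)) hdr_tok_partial_sketch {
	uint8_t  type;          /* token ID (AUT_HEADER32/64) */
	uint32_t len;           /* total record length in bytes */
	uint8_t  ver;           /* record version number */
	uint16_t e_type;        /* audit event type */
};

/* 1 + 4 + 1 + 2 bytes; __packed__ forbids padding between the fields. */
_Static_assert(sizeof(struct hdr_tok_partial_sketch) == 8,
    "partial BSM header token must be exactly 8 bytes");
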
index d99b186fa582423f709aebedbb266cb1c6ed5779..fed2634710bd84343f04d808522476a887a1eebc 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/kauth.h>
 #include <sys/conf.h>
 #include <sys/poll.h>
+#include <sys/priv.h>
 #include <sys/queue.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
@@ -100,8 +101,10 @@ static au_sentry_t audit_default_se = {
 
 struct auditinfo_addr *audit_default_aia_p = &audit_default_se.se_auinfo;
 
+/* Copied from <ipc/ipc_object.h> */
+#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1
 kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t,
-    mach_msg_type_name_t, ipc_port_t *);
+    mach_msg_type_name_t, ipc_port_t *, mach_port_context_t, mach_msg_guard_flags_t *, uint32_t);
 void ipc_port_release_send(ipc_port_t);
 
 #if CONFIG_AUDIT
@@ -556,7 +559,7 @@ audit_sysctl_session_debug(__unused struct sysctl_oid *oidp,
        AUDIT_SENTRY_RUNLOCK();
 
        /* Reconcile with the process table. */
-       (void) proc_iterate(PROC_ALLPROCLIST | PROC_ZOMBPROCLIST,
+       proc_iterate(PROC_ALLPROCLIST | PROC_ZOMBPROCLIST,
            audit_session_debug_callout, NULL,
            audit_session_debug_filterfn, (void *)&sed_tab[0]);
 
@@ -1350,10 +1353,15 @@ audit_session_port(proc_t p, struct audit_session_port_args *uap,
                 */
                se = AU_SENTRY_PTR(aia_p);
                audit_ref_session(se);
-       } else if (kauth_cred_issuser(cred)) {
-               /* The superuser may obtain a port for any existing
-                * session.
+       } else {
+               /*
+                * Only privileged processes may obtain a port for
+                * any existing session.
                 */
+               err = priv_check_cred(cred, PRIV_AUDIT_SESSION_PORT, 0);
+               if (err != 0) {
+                       goto done;
+               }
                AUDIT_SENTRY_RLOCK();
                se = audit_session_find(uap->asid);
                AUDIT_SENTRY_RUNLOCK();
@@ -1362,9 +1370,6 @@ audit_session_port(proc_t p, struct audit_session_port_args *uap,
                        goto done;
                }
                aia_p = &se->se_auinfo;
-       } else {
-               err = EPERM;
-               goto done;
        }
 
        /*
@@ -1513,7 +1518,7 @@ audit_session_join(proc_t p, struct audit_session_join_args *uap,
 
 
        if (ipc_object_copyin(get_task_ipcspace(p->task), send,
-           MACH_MSG_TYPE_COPY_SEND, &port) != KERN_SUCCESS) {
+           MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND) != KERN_SUCCESS) {
                *ret_asid = AU_DEFAUDITSID;
                err = EINVAL;
        } else {
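
The hunk above replaces the blanket superuser test with a priv_check_cred(PRIV_AUDIT_SESSION_PORT, ...) gate when a process asks for a session other than its own. The userland call site is unchanged; a hedged sketch, assuming the audit_session_port(3) wrapper declared in <bsm/audit_session.h>:

#include <bsm/audit.h>
#include <bsm/audit_session.h>
#include <mach/mach.h>
#include <stdio.h>

/* Request a send right for an arbitrary audit session. After this
 * change the kernel demands PRIV_AUDIT_SESSION_PORT for sessions other
 * than the caller's own, so expect EPERM without that privilege. */
static int
get_session_port(au_asid_t asid, mach_port_name_t *portp)
{
	if (audit_session_port(asid, portp) == -1) {
		perror("audit_session_port");
		return (-1);
	}
	return (0);
}
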
index 0df9209ce721dfaebae7bbe3474f70dde8ab1b3a..4db4b53f7ec97dace2d38257b808052327d07f39 100644 (file)
@@ -1,6 +1,5 @@
 /*-
- * Copyright (c) 1999-2010, Apple Inc.
- * All rights reserved.
+ * Copyright (c) 1999-2019 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -159,12 +158,30 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval)
        int max_record_length = MAX_AUDIT_RECORD_SIZE;
        void *udata = NULL;
        u_int ulen = 0;
-       struct au_identity_info id_info = {0, NULL, 0, NULL, 0, NULL, 0};
+       struct au_identity_info id_info = {
+               .signer_type = 0,
+               .signing_id = NULL,
+               .signing_id_trunc = 0,
+               .team_id = NULL,
+               .team_id_trunc = 0,
+               .cdhash = NULL,
+               .cdhash_len = 0
+       };
        token_t *id_tok = NULL;
+       boolean_t kern_events_allowed = FALSE;
 
        error = suser(kauth_cred_get(), &p->p_acflag);
        if (error) {
-               goto free_out;
+               /*
+                * If a process is not running as root but is properly
+                * entitled, allow it to audit non-kernel events only.
+                */
+               if (!IOTaskHasEntitlement(current_task(),
+                   AU_AUDIT_USER_ENTITLEMENT)) {
+                       goto free_out;
+               }
+       } else {
+               kern_events_allowed = TRUE;
        }
 
        mtx_lock(&audit_mtx);
@@ -234,7 +251,7 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval)
 #endif
 
        /* Verify the record. */
-       if (bsm_rec_verify(rec, uap->length) == 0) {
+       if (bsm_rec_verify(rec, uap->length, kern_events_allowed) == 0) {
                error = EINVAL;
                goto free_out;
        }
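
The verification hunk above threads kern_events_allowed down into bsm_rec_verify(): a non-root caller holding com.apple.private.audit.user may now commit records, but only for non-kernel event types. A hedged userland sketch of committing such a record through libbsm (AUE_openssh is just an example user event from <bsm/audit_uevents.h>):

#include <bsm/audit_uevents.h>
#include <bsm/libbsm.h>
#include <stdio.h>

/* Build and commit one user-space audit record; au_close() with
 * AU_TO_WRITE submits it via audit(2), where bsm_rec_verify() runs. */
static int
submit_user_record(void)
{
	int aufd = au_open();

	if (aufd == -1) {
		perror("au_open");
		return (-1);
	}
	(void) au_write(aufd, au_to_text("illustrative user event"));
	if (au_close(aufd, AU_TO_WRITE, AUE_openssh) == -1) {
		perror("au_close");     /* EINVAL if verification fails */
		return (-1);
	}
	return (0);
}
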
index e2ad05581027ae82b4e50fc0f400a2b3a9ec20ba..b813428ca42aaf644cc517ad236205efd9dc9e76 100644 (file)
@@ -33,7 +33,12 @@ DATAFILES = \
        user.h utfconv.h utsname.h vadvise.h vcmd.h \
        vm.h vmmeter.h vmparam.h vnioctl.h vnode.h vnode_if.h vstat.h wait.h xattr.h \
        _select.h _structs.h _types.h _endian.h domain.h protosw.h \
-       spawn.h timex.h commpage.h
+       spawn.h timex.h commpage.h log_data.h
+
+# Installs header file for DriverKit drivers -
+#        $(DSTROOT)/System/DriverKit/System/usr/include/
+DRIVERKIT_DATAFILES = \
+       cdefs.h _types.h
 
 # Installs header file for Apple internal use in user level -
 #        $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
@@ -64,7 +69,10 @@ PRIVATE_DATAFILES = \
        kern_control.h \
        kern_event.h \
        kern_memorystatus.h \
+       kern_memorystatus_freeze.h \
+       kern_memorystatus_notify.h \
        kern_overrides.h \
+       kern_sysctl.h \
        mbuf.h \
        mman.h \
        monotonic.h \
@@ -98,6 +106,11 @@ PRIVATE_DATAFILES = \
        memory_maintenance.h \
        commpage.h
 
+# Installs header file for Apple internal use by DriverKit drivers -
+#        $(DSTROOT)/System/DriverKit/System/usr/local/include/
+DRIVERKIT_PRIVATE_DATAFILES = \
+       appleapiopts.h kdebug.h
+
 # Installs header file for kernel extensions -
 #        $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers
 #        $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders
@@ -108,8 +121,8 @@ KERNELFILES = \
        errno.h ev.h event.h fcntl.h file.h filio.h \
        ioccom.h ioctl.h ipc.h \
        ioctl_compat.h  kernel.h kernel_types.h kern_event.h lock.h lockf.h \
-       kauth.h kdebug.h  md5.h kern_control.h imgact.h malloc.h namei.h \
-       mman.h mbuf.h mount.h netport.h param.h paths.h \
+       kauth.h kdebug.h kdebug_kernel.h md5.h kern_control.h imgact.h malloc.h \
+       namei.h mman.h mbuf.h mount.h netport.h param.h paths.h \
        proc.h  queue.h random.h resource.h \
        sbuf.h posix_sem.h posix_shm.h sem.h shm.h \
        select.h signal.h socket.h socketvar.h sockio.h stat.h stdio.h \
@@ -140,6 +153,8 @@ PRIVATE_KERNELFILES = \
        fslog.h \
        kasl.h \
        kern_memorystatus.h \
+       kern_memorystatus_freeze.h \
+       kern_memorystatus_notify.h \
        kpi_private.h \
        ktrace.h \
        mach_swapon.h \
@@ -168,6 +183,7 @@ PRIVATE_KERNELFILES = \
        doc_tombstone.h \
        fsevents.h \
        work_interval.h \
+       kern_sysctl.h \
 
 XNU_ONLY_EXPORTS = \
        bsdtask_info.h \
@@ -191,8 +207,12 @@ XNU_ONLY_EXPORTS = \
 # /usr/include
 INSTALL_MI_LIST        = ${DATAFILES}
 
+INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES}
+
 INSTALL_MI_GEN_LIST = syscall.h _posix_availability.h _symbol_aliasing.h
 
+INSTALL_DRIVERKIT_MI_GEN_LIST = _posix_availability.h _symbol_aliasing.h
+
 INSTALL_MI_DIR = sys
 
 EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} ${XNU_ONLY_EXPORTS}
@@ -204,6 +224,8 @@ EXPORT_MI_DIR = sys
 # /System/Library/Frameworks/System.framework/PrivateHeaders
 INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES}
 
+INSTALL_DRIVERKIT_MI_LCL_LIST = ${DRIVERKIT_PRIVATE_DATAFILES}
+
 # /System/Library/Frameworks/Kernel.framework/PrivateHeaders
 INSTALL_KF_MI_LCL_LIST =  ${KERNELFILES} ${PRIVATE_KERNELFILES}
 
@@ -221,43 +243,48 @@ $(OBJROOT)/cscope.genhdrs:
        $(_v)mkdir -p $(OBJROOT)/cscope.genhdrs
 
 $(OBJROOT)/syscall.codes: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS)
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)$@$(Color0) from $(ColorF)$<$(Color0))
        $(_v)$(MAKESYSCALLS) $< trace > $@
 
 $(OBJROOT)/trace.codes: $(SRCROOT)/bsd/kern/trace_codes $(OBJROOT)/syscall.codes
        $(_v)sort -g $(SRCROOT)/bsd/kern/trace_codes $(OBJROOT)/syscall.codes >$@
 
 syscall.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) $(OBJROOT)/cscope.genhdrs
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0))
        @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path
        $(_v)$(MAKESYSCALLS) $< header > /dev/null
 
 sysproto.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) $(OBJROOT)/cscope.genhdrs
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0))
        @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path
        $(_v)$(MAKESYSCALLS) $< proto > /dev/null
 
 kdebugevents.h:  $(OBJROOT)/trace.codes $(MAKEKDEBUGEVENTS) $(OBJROOT)/cscope.genhdrs
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0) from $(ColorF)$<$(Color0))
        @echo "$(OBJPATH)/bsd/sys/$@" > $(OBJROOT)/cscope.genhdrs/$@.path
        $(_v)$(MAKEKDEBUGEVENTS) $< > "$(OBJPATH)/bsd/sys/$@"
 
 MAKE_POSIX_AVAILABILITY = $(SRCROOT)/bsd/sys/make_posix_availability.sh
 _posix_availability.h: $(MAKE_POSIX_AVAILABILITY)
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0))
        $(_v)$(MAKE_POSIX_AVAILABILITY) "$@"
 
 MAKE_SYMBOL_ALIASING = $(SRCROOT)/bsd/sys/make_symbol_aliasing.sh
 _symbol_aliasing.h: $(MAKE_SYMBOL_ALIASING)
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)bsd/sys/$@$(Color0))
        $(_v)$(MAKE_SYMBOL_ALIASING) "$(SDKROOT)" "$@"
 
+# generated headers needed early (used by iig during installhdrs of iokit/DriverKit)
+SETUP_GEN_LIST = _posix_availability.h _symbol_aliasing.h
+
+do_build_setup:: $(SETUP_GEN_LIST)
+
 TRACE_CODES_DEST = \
        $(DSTROOT)/$(INSTALL_SHARE_MISC_DIR)/trace.codes
 
 $(TRACE_CODES_DEST): $(OBJROOT)/trace.codes
        $(_v)$(MKDIR) $(DSTROOT)/$(INSTALL_SHARE_MISC_DIR)
-       @echo INSTALL $(@F)
+       $(call makelog,INSTALL $(@F))
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $(OBJROOT)/trace.codes $@
 
 do_textfiles_install:: $(TRACE_CODES_DEST)
index 08691552c9b530bdcabd969c4f5117797bd07fef..1d63f5c85ab8fb2a3a27d0bfc804c181f7a976b1 100644 (file)
@@ -52,6 +52,7 @@
 #define __DARWIN_NULL ((void *)0)
 #endif /* __cplusplus */
 
+#if !defined(DRIVERKIT)
 typedef __int64_t       __darwin_blkcnt_t;      /* total blocks */
 typedef __int32_t       __darwin_blksize_t;     /* preferred block size */
 typedef __int32_t       __darwin_dev_t;         /* dev_t */
@@ -74,12 +75,13 @@ typedef __uint32_t      __darwin_sigset_t;      /* [???] signal set */
 typedef __int32_t       __darwin_suseconds_t;   /* [???] microseconds */
 typedef __uint32_t      __darwin_uid_t;         /* [???] user IDs */
 typedef __uint32_t      __darwin_useconds_t;    /* [???] microseconds */
+#endif /* !defined(DRIVERKIT) */
 typedef unsigned char   __darwin_uuid_t[16];
 typedef char    __darwin_uuid_string_t[37];
 
-#ifndef KERNEL
+#if !defined(KERNEL) && !defined(DRIVERKIT)
 #include <sys/_pthread/_pthread_types.h>
-#endif /* KERNEL */
+#endif /* !defined(KERNEL) && !defined(DRIVERKIT) */
 
 #if defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 5 || __GNUC__ > 3)
 #define __offsetof(type, field) __builtin_offsetof(type, field)
index c64ec4c8ab3e5aea9df1ad7433b8c53f8d5b514b..5b5373961d544b14a7d5a71b7da286c0f102943d 100644 (file)
@@ -98,6 +98,35 @@ DATAFILES = \
        _user32_ntptimeval.h \
        _user64_ntptimeval.h \
 
+# Installs header file for DriverKit drivers -
+#        $(DSTROOT)/System/DriverKit/System/usr/include/
+DRIVERKIT_DATAFILES = \
+       _ct_rune_t.h \
+       _errno_t.h \
+       _int16_t.h \
+       _int32_t.h \
+       _int64_t.h \
+       _int8_t.h \
+       _intptr_t.h \
+       _mbstate_t.h \
+       _null.h \
+       _offsetof.h \
+       _os_inline.h \
+       _ptrdiff_t.h \
+       _rsize_t.h \
+       _rune_t.h \
+       _size_t.h \
+       _ssize_t.h \
+       _u_int16_t.h \
+       _u_int32_t.h \
+       _u_int64_t.h \
+       _u_int8_t.h \
+       _uintptr_t.h \
+       _uuid_t.h \
+       _va_list.h \
+       _wchar_t.h \
+       _wint_t.h \
+
 # Installs header file for Apple internal use in user level -
 #        $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
 PRIVATE_DATAFILES = \
@@ -122,6 +151,8 @@ PRIVATE_KERNELFILES = \
 # /System/Library/Frameworks/System.framework/Headers and /usr/include
 INSTALL_MI_LIST        = ${DATAFILES}
 
+INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES}
+
 INSTALL_MI_GEN_LIST =
 
 INSTALL_MI_DIR = sys/_types
index ac9cd5c764289c4fbfe03f24559d45487b1cd734..df29f9ca3938db6e4c28f522105c0952021d1a70 100644 (file)
 #ifndef _KAUTH_GUID
 #define _KAUTH_GUID
 /* Apple-style globally unique identifier */
-typedef struct {
+typedef union {
 #define KAUTH_GUID_SIZE 16      /* 128-bit identifier */
        unsigned char g_guid[KAUTH_GUID_SIZE];
+       unsigned int g_guid_asint[KAUTH_GUID_SIZE / sizeof(unsigned int)];
 } guid_t;
 #define _GUID_T
 #endif /* _KAUTH_GUID */
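
Turning guid_t into a union overlays the 16 bytes with four unsigned ints, so callers can compare or hash GUIDs word-at-a-time without casts that violate strict aliasing. An illustrative comparator showing what the overlay buys (the kernel's real one is kauth_guid_equal(); this sketch only assumes the header above):

/* Word-wise GUID comparison: g_guid and g_guid_asint alias the same
 * 16 bytes, so four int compares replace sixteen byte compares. */
static int
guid_equal_sketch(const guid_t *a, const guid_t *b)
{
	unsigned int i;

	for (i = 0; i < KAUTH_GUID_SIZE / sizeof(unsigned int); i++) {
		if (a->g_guid_asint[i] != b->g_guid_asint[i]) {
			return (0);
		}
	}
	return (1);
}
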
index cdf7e13a7242ced92a78d72e2950f1337b769694..5b4f4c133d8f791b774393cd7b2283cd0e43a95f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define FSOPT_ATTR_CMN_EXTENDED 0x00000020
 #ifdef PRIVATE
 #define FSOPT_LIST_SNAPSHOT     0x00000040
+#ifndef FSOPT_NOFIRMLINKPATH /* a copy is in fsgetpath.h */
+#define FSOPT_NOFIRMLINKPATH     0x00000080
+#endif /* FSOPT_NOFIRMLINKPATH */
+#define FSOPT_FOLLOW_FIRMLINK    0x00000100
+#define FSOPT_RETURN_REALDEV     0x00000200
+#ifndef FSOPT_ISREALFSID  /* a copy is in fsgetpath.h */
+#define FSOPT_ISREALFSID         FSOPT_RETURN_REALDEV
+#endif
 #endif /* PRIVATE */
 
 /* we currently aren't anywhere near this amount for a valid
@@ -235,6 +243,16 @@ typedef struct vol_capabilities_attr {
  *
  * VOL_CAP_FMT_NO_PERMISSIONS: When set, the volume does not support setting
  * permissions.
+ *
+ * VOL_CAP_FMT_SHARED_SPACE: When set, the volume supports sharing space with
+ * other filesystems i.e. multiple logical filesystems can exist in the same
+ * "partition". An implication of this is that the filesystem which sets
+ * this capability treats waitfor arguments to VFS_SYNC as bit flags.
+ *
+ * VOL_CAP_FMT_VOL_GROUPS: When set, this volume is part of a volume-group
+ * that implies multiple volumes must be mounted in order to boot and root the
+ * operating system. Typically, this means a read-only system volume and a
+ * writable data volume.
  */
 #define VOL_CAP_FMT_PERSISTENTOBJECTIDS         0x00000001
 #define VOL_CAP_FMT_SYMBOLICLINKS               0x00000002
@@ -259,7 +277,8 @@ typedef struct vol_capabilities_attr {
 #define VOL_CAP_FMT_WRITE_GENERATION_COUNT      0x00100000
 #define VOL_CAP_FMT_NO_IMMUTABLE_FILES          0x00200000
 #define VOL_CAP_FMT_NO_PERMISSIONS              0x00400000
-
+#define VOL_CAP_FMT_SHARED_SPACE                0x00800000
+#define VOL_CAP_FMT_VOL_GROUPS                  0x01000000
 
 /*
  * VOL_CAP_INT_SEARCHFS: When set, the volume implements the
@@ -328,6 +347,8 @@ typedef struct vol_capabilities_attr {
  * VOL_CAP_INT_RENAME_EXCL: When set, the volume supports an
  * exclusive rename operation.
  *
+ * VOL_CAP_INT_RENAME_OPENFAIL: When set, the volume may fail rename
+ * operations on files that are open.
  */
 #define VOL_CAP_INT_SEARCHFS                    0x00000001
 #define VOL_CAP_INT_ATTRLIST                    0x00000002
@@ -352,6 +373,7 @@ typedef struct vol_capabilities_attr {
 #define VOL_CAP_INT_SNAPSHOT                    0x00020000
 #define VOL_CAP_INT_RENAME_SWAP                 0x00040000
 #define VOL_CAP_INT_RENAME_EXCL                 0x00080000
+#define VOL_CAP_INT_RENAME_OPENFAIL             0x00100000
 
 typedef struct vol_attributes_attr {
        attribute_set_t validattr;
@@ -506,8 +528,11 @@ typedef struct vol_attributes_attr {
 #define ATTR_CMNEXT_RELPATH     0x00000004
 #define ATTR_CMNEXT_PRIVATESIZE 0x00000008
 #define ATTR_CMNEXT_LINKID      0x00000010
+#define ATTR_CMNEXT_NOFIRMLINKPATH     0x00000020
+#define ATTR_CMNEXT_REALDEVID   0x00000040
+#define ATTR_CMNEXT_REALFSID    0x00000080
 
-#define ATTR_CMNEXT_VALIDMASK   0x0000001c
+#define ATTR_CMNEXT_VALIDMASK   0x000000fc
 #define ATTR_CMNEXT_SETMASK             0x00000000
 
 /* Deprecated fork attributes */
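
The VOL_CAP_FMT_SHARED_SPACE/VOL_CAP_FMT_VOL_GROUPS additions and the new ATTR_CMNEXT_* bits surface through the existing getattrlist(2) machinery. A sketch of probing a mount point for the new volume-group bit (standard getattrlist buffer layout: a u_int32_t length word followed by the requested attributes; remember to AND against the filesystem's valid mask):

#include <sys/attr.h>
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Returns 1 if the volume reports itself as part of a volume group,
 * 0 if not, -1 on error. mntpoint must be a mount point for volattr. */
static int
check_vol_groups(const char *mntpoint)
{
	struct attrlist al;
	struct {
		u_int32_t               length;
		vol_capabilities_attr_t caps;
	} __attribute__((aligned(4), packed)) buf;

	memset(&al, 0, sizeof(al));
	al.bitmapcount = ATTR_BIT_MAP_COUNT;
	al.volattr = ATTR_VOL_INFO | ATTR_VOL_CAPABILITIES;

	if (getattrlist(mntpoint, &al, &buf, sizeof(buf), 0) == -1) {
		perror("getattrlist");
		return (-1);
	}
	return ((buf.caps.capabilities[VOL_CAPABILITIES_FORMAT] &
	    buf.caps.valid[VOL_CAPABILITIES_FORMAT] &
	    VOL_CAP_FMT_VOL_GROUPS) != 0);
}
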
index e69067255407c22ed1e7e1d5063d5e5965eaef54..536fab617f5ffcf86bffa95bf0364f1e20ac4c79 100644 (file)
@@ -94,7 +94,7 @@ typedef uint8_t bitstr_t;
 
 /* set bit N of bitstring name (atomic) */
 #define bitstr_set_atomic(name, bit)                                    \
-       atomic_bitset_8(&((name)[_bitstr_byte(bit)]), _bitstr_mask(bit))
+       (void)os_atomic_or(&((name)[_bitstr_byte(bit)]), _bitstr_mask(bit), relaxed)
 
 /* clear bit N of bitstring name */
 #define bitstr_clear(name, bit)                                         \
@@ -102,7 +102,7 @@ typedef uint8_t bitstr_t;
 
 /* clear bit N of bitstring name (atomic) */
 #define bitstr_clear_atomic(name, bit)                                  \
-       atomic_bitclear_8(&((name)[_bitstr_byte(bit)]), _bitstr_mask(bit))
+       (void)os_atomic_andnot(&((name)[_bitstr_byte(bit)]), _bitstr_mask(bit), relaxed)
 
 /* clear bits start ... stop in bitstring */
 #define bitstr_nclear(name, start, stop) do {                           \
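
The two atomic bitstring macros above now expand to the generic os_atomic_or()/os_atomic_andnot() helpers rather than the retired 8-bit primitives; callers are unaffected. A minimal in-kernel sketch of the macros in use (SLOT_COUNT and the slot_* names are illustrative; os_atomic_* exists only inside xnu):

#include <sys/bitstring.h>

#define SLOT_COUNT 64

/* One bit per slot; bitstr_t is a uint8_t, so round up to whole bytes. */
static bitstr_t slot_busy[(SLOT_COUNT + 7) / 8];

/* Claim and release a slot from concurrent contexts: the _atomic
 * variants or/andnot the byte holding the bit with relaxed ordering,
 * so no external lock is needed for the bit itself. */
static void
slot_claim(int n)
{
	bitstr_set_atomic(slot_busy, n);
}

static void
slot_release(int n)
{
	bitstr_clear_atomic(slot_busy, n);
}
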
index a0f18249345cbcfac4073472aeefbb06fe59d95a..0c9a6901df1830f4b1e896d408288c37905d67b2 100644 (file)
@@ -100,15 +100,19 @@ struct proc_regioninfo_internal {
 #define PROC_REGION_SHARED      2
 
 extern uint32_t vnode_vid(void *vp);
+
 #if CONFIG_IOSCHED
 kern_return_t vnode_pager_get_object_devvp(memory_object_t mem_obj, uintptr_t *devvp);
 extern struct vnode *vnode_mountdevvp(struct vnode *);
 #endif
 
+extern boolean_t vnode_isonexternalstorage(void *vp);
+
 #endif /* MACH_KERNEL_PRIVATE */
 
 extern int fill_procregioninfo(task_t t, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vp, uint32_t *vid);
 extern int fill_procregioninfo_onlymappedvnodes(task_t t, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vp, uint32_t *vid);
+extern int find_region_details(task_t task, vm_map_offset_t offset, uintptr_t *vnodeaddr, uint32_t *vid, uint64_t *start, uint64_t *len);
 void fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo);
 int fill_taskthreadinfo(task_t task, uint64_t thaddr, bool thuniqueid, struct proc_threadinfo_internal * ptinfo, void *, int *);
 int fill_taskthreadlist(task_t task, void * buffer, int thcount, bool thuniqueid);
@@ -118,5 +122,6 @@ void bsd_getthreadname(void *uth, char* buffer);
 void bsd_setthreadname(void *uth, const char* buffer);
 void bsd_threadcdir(void * uth, void *vptr, int *vidp);
 extern void bsd_copythreadname(void *dst_uth, void *src_uth);
+int fill_taskipctableinfo(task_t task, uint32_t *table_size, uint32_t *table_free);
 
 #endif /*_SYS_BSDTASK_INFO_H */
index fa96b304c7c6ef82f16d418f72bd76cc0fa622c5..f46094803d7d5caa5013e09d9b112cd49f72678d 100644 (file)
@@ -1020,6 +1020,22 @@ void buf_markstatic(buf_t bp);
  */
 int     buf_static(buf_t bp);
 
+/*!
+ *  @function bufattr_markioscheduled
+ *  @abstract Mark a buffer as belonging to an I/O-scheduled mount point.
+ *  @param bap Buffer attributes to mark.
+ *  @discussion Marks the buffer so that spec_strategy() will know that it belongs to an I/O-scheduled mount point.
+ */
+void bufattr_markioscheduled(bufattr_t bap);
+
+/*!
+ *  @function bufattr_ioscheduled
+ *  @abstract Check if a buffer is marked as I/O scheduled.
+ *  @param bap Buffer attributes to test.
+ *  @return Nonzero if the buffer is marked I/O scheduled, 0 otherwise.
+ */
+int bufattr_ioscheduled(bufattr_t bap);
+
 #ifdef KERNEL_PRIVATE
 void    buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void(**)(buf_t, void *), void **);
 
index c7b206823b4ab93af01e731f2dd38e0c69689e94..2bd3511b2a6dcf667abe5f03a756f95c7474cec5 100644 (file)
@@ -271,8 +271,8 @@ extern vm_offset_t buf_kernel_addrperm;
 #define BA_ISOCHRONOUS          0x00001000 /* device specific isochronous throughput to media */
 
 #define BA_STRATEGY_TRACKED_IO  0x00002000 /* tracked by spec_strategy */
-#define BA_IO_TIER_UPGRADE  0x00004000 /* effective I/O tier is higher than BA_IO_TIER */
-
+#define BA_IO_TIER_UPGRADE      0x00004000 /* effective I/O tier is higher than BA_IO_TIER */
+#define BA_IO_SCHEDULED         0x00008000 /* buf is associated with a mount point that is io scheduled */
 
 #define GET_BUFATTR_IO_TIER(bap)        ((bap->ba_flags & BA_IO_TIER_MASK) >> BA_IO_TIER_SHIFT)
 #define SET_BUFATTR_IO_TIER(bap, tier)                                          \
index bb42543dfebcea41e3d48a6393ae5eb6381a450c..066b91859ce8872a23393c337cb10e3cf56db473 100644 (file)
 #endif /* !NO_ANSI_KEYWORDS */
 #endif /* !(__STDC__ || __cplusplus) */
 
-#define __dead2         __attribute__((noreturn))
-#define __pure2         __attribute__((const))
+#define __dead2         __attribute__((__noreturn__))
+#define __pure2         __attribute__((__const__))
 
 /* __unused denotes variables and functions that may not be used, preventing
 * the compiler from warning when they are not used.
  */
-#define __unused        __attribute__((unused))
+#define __unused        __attribute__((__unused__))
 
 /* __used forces variables and functions to be included even if it appears
 * to the compiler that they are not used (and would thus be discarded).
  */
-#define __used          __attribute__((used))
+#define __used          __attribute__((__used__))
+
+/* __cold marks code that is rarely executed, such as debugging helpers,
+ * and tells the compiler to optimize it for size and move it out of line.
+ */
+#if __has_attribute(cold)
+#define __cold          __attribute__((__cold__))
+#else
+#define __cold
+#endif
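
Editor's note: for example, marking a rarely-reached reporting helper (the function name is hypothetical) keeps it out of the hot path's code layout:

    /* hypothetical: only reached on failure, so optimized for size and outlined */
    extern void report_rare_failure(const char *what) __cold;
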
 
 /* __deprecated causes the compiler to produce a warning when encountering
  * code using the deprecated functionality.
  * This may require turning on such warning with the -Wdeprecated flag.
  * __deprecated_enum_msg() should be used on enums, and compilers that support
  * it will print the deprecation warning.
+ * __kpi_deprecated() specifically indicates deprecation of kernel programming
+ * interfaces in Kernel.framework used by KEXTs.
  */
-#define __deprecated    __attribute__((deprecated))
+#define __deprecated    __attribute__((__deprecated__))
 
 #if __has_extension(attribute_deprecated_with_message) || \
         (defined(__GNUC__) && ((__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5))))
-       #define __deprecated_msg(_msg) __attribute__((deprecated(_msg)))
+       #define __deprecated_msg(_msg) __attribute__((__deprecated__(_msg)))
 #else
-       #define __deprecated_msg(_msg) __attribute__((deprecated))
+       #define __deprecated_msg(_msg) __attribute__((__deprecated__))
 #endif
 
 #if __has_extension(enumerator_attributes)
        #define __deprecated_enum_msg(_msg)
 #endif
 
+#if defined(KERNEL) && !defined(KERNEL_PRIVATE)
+#define __kpi_deprecated(_msg) __deprecated_msg(_msg)
+#else /* !defined(KERNEL) || defined(KERNEL_PRIVATE) */
+#define __kpi_deprecated(_msg)
+#endif /* !defined(KERNEL) || defined(KERNEL_PRIVATE) */
+
 /* __unavailable causes the compiler to error out when encountering
 * code using the tagged function or variable.
  */
-#define __unavailable   __attribute__((unavailable))
+#define __unavailable   __attribute__((__unavailable__))
 
 /* Delete pseudo-keywords wherever they are not available or needed. */
 #ifndef __dead
 #define __swift_unavailable(_msg)
 #endif
 
+/*
+ * __abortlike is the attribute to put on functions like abort() that are
+ * typically used to mark assertions. These optimize the codegen
+ * for outlining while still maintaining debuggability.
+ */
+#ifndef __abortlike
+#define __abortlike __dead2 __cold __not_tail_called
+#endif
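
Editor's note: a sketch of the intended use. An assertion sink marked __abortlike never returns (__dead2), is kept out of line (__cold), and is never tail-called, so the failing caller's frame survives for debugging. The names here are hypothetical:

    extern void my_assert_fail(const char *expr, const char *file, int line) __abortlike;

    #define MY_ASSERT(ex) \
            ((ex) ? (void)0 : my_assert_fail(#ex, __FILE__, __LINE__))
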
+
 /* Declaring inline functions within headers is error-prone due to differences
  * across various versions of the C language and extensions.  __header_inline
  * can be used to declare inline functions within system headers.  In cases
 #define __DARWIN_ONLY_UNIX_CONFORMANCE  1
 #define __DARWIN_ONLY_VERS_1050         1
 #endif /* PLATFORM_BridgeOS */
+#ifdef PLATFORM_DriverKit
+/* Platform: DriverKit */
+#define __DARWIN_ONLY_64_BIT_INO_T      1
+#define __DARWIN_ONLY_UNIX_CONFORMANCE  1
+#define __DARWIN_ONLY_VERS_1050         1
+#endif /* PLATFORM_DriverKit */
 #ifdef PLATFORM_MacOSX
 /* Platform: MacOSX */
 #define __DARWIN_ONLY_64_BIT_INO_T      0
 
 #define __DARWIN_EXTSN(sym)             __asm("_" __STRING(sym) __DARWIN_SUF_EXTSN)
 #define __DARWIN_EXTSN_C(sym)           __asm("_" __STRING(sym) __DARWIN_SUF_EXTSN __DARWIN_SUF_NON_CANCELABLE)
+#if XNU_KERNEL_PRIVATE
+#define __XNU_INTERNAL(sym)             __asm("_" __STRING(sym) "$XNU_INTERNAL")
+#endif
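
Editor's note: __XNU_INTERNAL gives a symbol a "$XNU_INTERNAL"-suffixed linkage name, letting the kernel keep a private variant of a function while kexts continue binding the plain name. The domain.h hunk further down applies it to pffinddomain(); the shape is:

    #if XNU_KERNEL_PRIVATE
    /* inside XNU, callers link against _pffinddomain$XNU_INTERNAL */
    extern struct domain *pffinddomain(int) __XNU_INTERNAL(pffinddomain);
    #else
    /* kexts see only the public symbol _pffinddomain */
    extern struct domain *pffinddomain(int);
    #endif
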
 
 /*
  * symbol release macros
 #define _DARWIN_FEATURE_UNIX_CONFORMANCE        3
 #endif
 
+#if defined(DRIVERKIT) && !defined(KERNEL)
+/*
+ * __DRIVERKIT_LIBC__ indicates to the C++ standard library headers and
+ * similar components that only the restricted set of standard C library
+ * functionality and headers for the DriverKit userspace driver environment
+ * are available.
+ */
+#define __DRIVERKIT_LIBC__                      1
+#endif /* defined(DRIVERKIT) && !defined(KERNEL) */
+
 /*
  * This macro casts away the qualifier from the variable
  *
 #define __improbable(x) __builtin_expect(!!(x), 0)
 #endif /* !defined(__probable) && !defined(__improbable) */
 
-#define __container_of(ptr, type, field) ({ \
-               const typeof(((type *)0)->field) *__ptr = (ptr); \
+#if defined(__cplusplus)
+#define __container_of(ptr, type, field) __extension__({ \
+               const typeof(((type *)nullptr)->field) *__ptr = (ptr); \
+               (type *)((uintptr_t)__ptr - offsetof(type, field)); \
+       })
+#else
+#define __container_of(ptr, type, field) __extension__({ \
+               const typeof(((type *)NULL)->field) *__ptr = (ptr); \
                (type *)((uintptr_t)__ptr - offsetof(type, field)); \
        })
+#endif
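
Editor's note: the classic use of __container_of (available under KERNEL || PRIVATE) is recovering the enclosing structure from a pointer to an embedded member. A self-contained sketch with illustrative types:

    #include <stddef.h>
    #include <stdint.h>

    struct node {
            struct node *next;
    };

    struct widget {
            int         state;
            struct node link;   /* embedded linkage */
    };

    static struct widget *
    widget_from_link(struct node *n)
    {
            /* n points at w->link, so back up by offsetof(struct widget, link) */
            return __container_of(n, struct widget, link);
    }
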
 
 #endif /* KERNEL || PRIVATE */
 
 #define __compiler_barrier() __asm__ __volatile__("" ::: "memory")
 
+#if __has_attribute(enum_extensibility)
+#define __enum_open __attribute__((__enum_extensibility__(open)))
+#define __enum_closed __attribute__((__enum_extensibility__(closed)))
+#else
+#define __enum_open
+#define __enum_closed
+#endif // __has_attribute(enum_extensibility)
+
+#if __has_attribute(flag_enum)
+#define __enum_options __attribute__((__flag_enum__))
+#else
+#define __enum_options
+#endif
+
+/*
+ * Similar to OS_ENUM/OS_CLOSED_ENUM/OS_OPTIONS/OS_CLOSED_OPTIONS
+ *
+ * This provides more advanced type checking on compilers supporting
+ * the proper extensions, even in C.
+ */
+#if __has_feature(objc_fixed_enum) || __has_extension(cxx_fixed_enum) || \
+        __has_extension(cxx_strong_enums)
+#define __enum_decl(_name, _type, ...) \
+               typedef enum : _type __VA_ARGS__ __enum_open _name
+#define __enum_closed_decl(_name, _type, ...) \
+               typedef enum : _type __VA_ARGS__ __enum_closed _name
+#define __options_decl(_name, _type, ...) \
+               typedef enum : _type __VA_ARGS__ __enum_open __enum_options _name
+#define __options_closed_decl(_name, _type, ...) \
+               typedef enum : _type __VA_ARGS__ __enum_closed __enum_options _name
+#else
+#define __enum_decl(_name, _type, ...) \
+               typedef _type _name; enum __VA_ARGS__ __enum_open
+#define __enum_closed_decl(_name, _type, ...) \
+               typedef _type _name; enum __VA_ARGS__ __enum_closed
+#define __options_decl(_name, _type, ...) \
+               typedef _type _name; enum __VA_ARGS__ __enum_open __enum_options
+#define __options_closed_decl(_name, _type, ...) \
+               typedef _type _name; enum __VA_ARGS__ __enum_closed __enum_options
+#endif
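
Editor's note: dirent.h below declares getdirentries64_flags_t with exactly this machinery. The general shape, with a hypothetical option set:

    __options_decl(demo_flags_t, uint32_t, {
            DEMO_NONE    = 0x00000000,
            DEMO_VERBOSE = 0x00000001,
            DEMO_DRY_RUN = 0x00000002,
    });

    /* on compilers with fixed-underlying-type enums this yields a real enum
     * type of width uint32_t marked open+flags; elsewhere it degrades to a
     * plain typedef plus an anonymous enum */
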
+
 #endif /* !_CDEFS_H_ */
index 34a532a9da30d8790ada5e534753230f20d7a3e0..147959094cb69cf5022d200efb145b5dfd1594f8 100644 (file)
@@ -49,6 +49,7 @@ int coalition_reap(uint64_t cid, uint32_t flags);
 int coalition_info_resource_usage(uint64_t cid, struct coalition_resource_usage *cru, size_t sz);
 int coalition_info_set_name(uint64_t cid, const char *name, size_t size);
 int coalition_info_set_efficiency(uint64_t cid, uint64_t flags);
+int coalition_ledger_set_logical_writes_limit(uint64_t cid, int64_t limit);
 
 #else /* KERNEL */
 
@@ -86,25 +87,31 @@ extern int coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, in
 
 
 /*
- * coalition_is_leader:
- * Determine if a task is a coalition leader.
+ * task_get_coalition:
+ * Return the coalition of a task.
  *
  * Parameters:
  *      task      : The task to investigate
  *      coal_type : The COALITION_TYPE of the coalition to investigate.
  *                  Valid types can be found in <mach/coalition.h>
- *      coal      : If 'task' is a valid task, and is a member of a coalition
- *                  of type 'coal_type', then 'coal' will be filled in with
- *                  the corresponding coalition_t object.
- *                  NOTE: This will be filled in whether or not the 'task' is
- *                        a leader in the coalition. However, if 'task' is
- *                        not a member of a coalition of type 'coal_type' then
- *                        'coal' will be filled in with COALITION_NULL.
- *                  NOTE: This can be NULL
- *
- * Returns: TRUE if 'task' is a coalition leader, FALSE otherwise.
+ *
+ * Returns: valid coalition_t or COALITION_NULL
+ */
+extern coalition_t task_get_coalition(task_t task, int coal_type);
+
+
+/*
+ * coalition_is_leader:
+ * Determine if a task is a coalition leader.
+ *
+ * Parameters:
+ *      task      : The task to investigate
+ *      coal      : The coalition to test against.
+ *                  NOTE: This can be COALITION_NULL, in which case FALSE is returned.
+ *
+ * Returns: TRUE if 'task' is the coalition's leader, FALSE otherwise.
  */
-extern boolean_t coalition_is_leader(task_t task, int coal_type, coalition_t *coal);
+extern boolean_t coalition_is_leader(task_t task, coalition_t coal);
 
 /*
  * coalition_get_leader:
@@ -203,12 +210,17 @@ coalitions_get_list(__unused int type,
        return 0;
 }
 
+static inline coalition_t
+coalition_get_leader(__unused task_t task,
+    __unused int coal_type)
+{
+       return COALITION_NULL;
+}
+
 static inline boolean_t
 coalition_is_leader(__unused task_t task,
-    __unused int coal_type,
-    coalition_t *coal)
+    __unused coalition_t coal)
 {
-       *coal = COALITION_NULL;
        return FALSE;
 }
 
index 26e3e1f64635d326ead1448d9b8b6b8511ce0e4a..25e569fcb8c6e754c035dfd0737238071a3d12e9 100644 (file)
@@ -92,6 +92,9 @@ int csproc_forced_lv(struct proc* p);
 int     cs_system_require_lv(void);
 uint32_t cs_entitlement_flags(struct proc *p);
 int     cs_entitlements_blob_get(struct proc *, void **, size_t *);
+#ifdef KERNEL_PRIVATE
+int     cs_entitlements_dictionary_copy(struct proc *, void **);
+#endif
 int     cs_restricted(struct proc *);
 uint8_t * cs_get_cdhash(struct proc *);
 
index 83871f07f542f5171f5275f9c4e4a661b71ea3e5..ccdd509491143bbedaffc1e01f90521abe2b5bba 100644 (file)
 #define _COMMPAGE_H
 
 #ifdef  PRIVATE
+
+#define _COMM_PAGE32_SIGNATURE_STRING           "commpage 32-bit"
+#define _COMM_PAGE64_SIGNATURE_STRING           "commpage 64-bit"
+
 typedef volatile struct commpage_timeofday_data {
        uint64_t        TimeStamp_tick;
        uint64_t        TimeStamp_sec;
@@ -36,6 +40,7 @@ typedef volatile struct commpage_timeofday_data {
        uint64_t        Ticks_scale;
        uint64_t        Ticks_per_sec;
 } new_commpage_timeofday_data_t;
+
 #endif
 
 #endif
index e8f6f3a2701cd7234366d1d719c4a120dfeee2ae..51fbdfdd62e615be0d83cb5ec57178bc2d622886 100644 (file)
@@ -76,11 +76,47 @@ enum {
 
 #define DECMPFS_XATTR_NAME "com.apple.decmpfs" /* extended attribute to use for decmpfs */
 
+/*
+ * This single field is interpreted differently depending on the
+ * corresponding item type.
+ * For regular files: a 64-bit logical size
+ * For directories: a 64-bit number of children (i.e. st_nlink - 2)
+ * For packages: a 40-bit size and a 24-bit number of children at the root
+ */
+typedef struct __attribute__((packed)) {
+       uint64_t  value;
+} decmpfs_raw_item_size;
+
+#define DECMPFS_PKG_SIZE_MASK           0x000000ffffffffffULL
+#define DECMPFS_PKG_COUNT_MASK          0xffffff
+#define DECMPFS_PKG_CHLD_COUNT_SHIFT    40
+
+#define DECMPFS_PKG_SIZE(x)             ((x).value & DECMPFS_PKG_SIZE_MASK)
+#define DECMPFS_PKG_CHLD_COUNT(x)       ((uint32_t)(((x).value >> DECMPFS_PKG_CHLD_COUNT_SHIFT) & DECMPFS_PKG_COUNT_MASK))
+#define DECMPFS_PKG_VALUE_FROM_SIZE_COUNT(size, count) \
+       (((size) & DECMPFS_PKG_SIZE_MASK) | ((uint64_t)(count) << DECMPFS_PKG_CHLD_COUNT_SHIFT))
+
+/* Dataless file or directory */
+#define DATALESS_CMPFS_TYPE     0x80000001
+
+/* Dataless package, with number of root children and total size encoded on disk */
+#define DATALESS_PKG_CMPFS_TYPE 0x80000002
+
+
+static inline bool
+decmpfs_type_is_dataless(uint32_t cmp_type)
+{
+       return cmp_type == DATALESS_CMPFS_TYPE || cmp_type == DATALESS_PKG_CMPFS_TYPE;
+}
+
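Editor's note: a worked example of the 40/24-bit packing for a dataless package (DATALESS_PKG_CMPFS_TYPE) holding 3 root children and 0x2000 bytes in total:

    #include <assert.h>

    decmpfs_raw_item_size s;
    s.value = DECMPFS_PKG_VALUE_FROM_SIZE_COUNT(0x2000ULL, 3);

    assert(DECMPFS_PKG_SIZE(s) == 0x2000);   /* low 40 bits: total size  */
    assert(DECMPFS_PKG_CHLD_COUNT(s) == 3);  /* bits 40..63: child count */
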
 typedef struct __attribute__((packed)) {
        /* this structure represents the xattr on disk; the fields below are little-endian */
        uint32_t compression_magic;
-       uint32_t compression_type; /* see the enum below */
-       uint64_t uncompressed_size;
+       uint32_t compression_type;   /* see the enum below */
+       union {
+               uint64_t uncompressed_size;  /* compatibility accessor */
+               decmpfs_raw_item_size _size;
+       };
        unsigned char attr_bytes[0]; /* the bytes of the attribute after the header */
 } decmpfs_disk_header;
 
@@ -89,10 +125,38 @@ typedef struct __attribute__((packed)) {
        uint32_t attr_size;
        uint32_t compression_magic;
        uint32_t compression_type;
-       uint64_t uncompressed_size;
+       union {
+               /*
+                * although uncompressed_size remains available for backward-compatibility reasons
+                * although uncompressed_size remains available for backward-compatibility reasons,
+                * the uncompressed size and nchildren should be accessed using the inline helpers
+                */
+               uint64_t uncompressed_size;
+               decmpfs_raw_item_size _size;
+       };
        unsigned char attr_bytes[0]; /* the bytes of the attribute after the header */
 } decmpfs_header;
 
+static inline uint64_t
+decmpfs_get_uncompressed_size(const decmpfs_header *hdr)
+{
+       if (hdr->compression_magic == DECMPFS_MAGIC && hdr->compression_type == DATALESS_PKG_CMPFS_TYPE) {
+               return DECMPFS_PKG_SIZE(hdr->_size);
+       }
+
+       return hdr->uncompressed_size;
+}
+
+static inline uint32_t
+decmpfs_get_directory_entries(const decmpfs_header *hdr)
+{
+       if (hdr->compression_magic == DECMPFS_MAGIC && hdr->compression_type == DATALESS_PKG_CMPFS_TYPE) {
+               return DECMPFS_PKG_CHLD_COUNT(hdr->_size);
+       }
+
+       return (uint32_t)hdr->uncompressed_size;
+}
+
 /* compression_type values */
 enum {
        CMP_Type1       = 1,/* uncompressed data in xattr */
@@ -120,6 +184,8 @@ struct decmpfs_cnode {
        uint32_t lockcount;
        void    *lockowner;          /* cnode's lock owner (if a thread is currently holding an exclusive lock) */
        uint64_t uncompressed_size __attribute__((aligned(8)));
+       uint64_t nchildren __attribute__((aligned(8))); /* for dataless directories (incl. packages) */
+       uint64_t total_size __attribute__((aligned(8)));/* for dataless directories (incl. packages) */
        uint64_t decompression_flags;
        lck_rw_t compressed_data_lock;
 };
@@ -156,6 +222,11 @@ void decmpfs_unlock_compressed_data(decmpfs_cnode *cp, int exclusive);
 uint32_t decmpfs_cnode_get_vnode_state(decmpfs_cnode *cp);
 void decmpfs_cnode_set_vnode_state(decmpfs_cnode *cp, uint32_t state, int skiplock);
 uint64_t decmpfs_cnode_get_vnode_cached_size(decmpfs_cnode *cp);
+uint64_t decmpfs_cnode_get_vnode_cached_nchildren(decmpfs_cnode *cp);
+uint64_t decmpfs_cnode_get_vnode_cached_total_size(decmpfs_cnode *cp);
+void decmpfs_cnode_set_vnode_cached_size(decmpfs_cnode *cp, uint64_t size);
+void decmpfs_cnode_set_vnode_cached_nchildren(decmpfs_cnode *cp, uint64_t nchildren);
+void decmpfs_cnode_set_vnode_cached_total_size(decmpfs_cnode *cp, uint64_t total_sz);
 uint32_t decmpfs_cnode_cmp_type(decmpfs_cnode *cp);
 
 int decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp);
index c6e1d8868912bc733bcbd4dfae6cb3fcb05bace9..4970773018aa911cde4ffb80700f14791c7b6480 100644 (file)
@@ -141,4 +141,17 @@ struct direntry __DARWIN_STRUCT_DIRENTRY;
 #define DTTOIF(dirtype) ((dirtype) << 12)
 #endif
 
+#if PRIVATE
+/*
+ * If a buffer at least this size is passed to __getdirentries64,
+ * the the last 4 bytes will be the flags below.
+ */
+#define GETDIRENTRIES64_EXTENDED_BUFSIZE  1024
+
+__options_decl(getdirentries64_flags_t, unsigned, {
+       /* __getdirentries64 returned all entries */
+       GETDIRENTRIES64_EOF = 1U << 0,
+});
+#endif
+
 #endif /* _SYS_DIRENT_H  */
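
Editor's note: a hedged userspace sketch of the contract. __getdirentries64 is private SPI and its exact prototype is an assumption here; the point is only where the flags live in the buffer:

    char    buf[GETDIRENTRIES64_EXTENDED_BUFSIZE];
    off_t   pos = 0;
    ssize_t n = __getdirentries64(dirfd, buf, sizeof(buf), &pos);

    if (n >= 0) {
            getdirentries64_flags_t flags;
            /* with a buffer this large, the kernel writes the flags into
             * the final 4 bytes of the buffer */
            memcpy(&flags, buf + sizeof(buf) - sizeof(flags), sizeof(flags));
            if (flags & GETDIRENTRIES64_EOF) {
                    /* directory fully enumerated */
            }
    }
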
index f0a7a15da9fec3da85558b71629ded81051b2667..66d31790270f02779c0f777f0ca629bfd8bab562 100644 (file)
@@ -57,6 +57,8 @@
  * DKIOCREQUESTIDLE                      idle media
  * DKIOCUNMAP                            delete unused data
  *
+ * DKIOCGETLOCATION                      get device's physical location
+ *
  * DKIOCGETMAXBLOCKCOUNTREAD             get maximum block count for reads
  * DKIOCGETMAXBLOCKCOUNTWRITE            get maximum block count for writes
  * DKIOCGETMAXBYTECOUNTREAD              get maximum byte count for reads
@@ -135,7 +137,6 @@ typedef struct{
 #endif /* !__LP64__ */
 } dk_unmap_t;
 
-
 typedef struct{
        uint64_t           flags;
        uint64_t           hotfile_size;           /* in bytes */
@@ -176,6 +177,8 @@ typedef struct{
        char *                 description;
 } dk_error_description_t;
 
+#define DK_LOCATION_INTERNAL                   0x00000000
+#define DK_LOCATION_EXTERNAL                   0x00000001
 
 #ifdef KERNEL
 #ifdef PRIVATE
@@ -203,6 +206,8 @@ typedef struct{
 #define DKIOCUNMAP                            _IOW('d', 31, dk_unmap_t)
 #define DKIOCCORESTORAGE                      _IOR('d', 32, dk_corestorage_info_t)
 
+#define DKIOCGETLOCATION                      _IOR('d', 33, uint64_t)
+
 #define DKIOCGETMAXBLOCKCOUNTREAD             _IOR('d', 64, uint64_t)
 #define DKIOCGETMAXBLOCKCOUNTWRITE            _IOR('d', 65, uint64_t)
 #define DKIOCGETMAXBYTECOUNTREAD              _IOR('d', 70, uint64_t)
@@ -344,9 +349,9 @@ typedef struct dk_apfs_wbc_range {
 #endif /* KERNEL */
 
 #ifdef PRIVATE
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 #define _DKIOCSETSTATIC                       _IO('d', 84)
-#endif /* TARGET_OS_EMBEDDED */
+#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 #endif /* PRIVATE */
 
 #endif  /* _SYS_DISK_H_ */
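
Editor's note: querying the new location ioctl from user space (needs a readable disk node, typically as root):

    #include <sys/disk.h>
    #include <sys/ioctl.h>
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            int fd = open("/dev/disk0", O_RDONLY);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            uint64_t location = 0;
            if (ioctl(fd, DKIOCGETLOCATION, &location) == -1) {
                    perror("DKIOCGETLOCATION");
            } else {
                    printf("disk0 is %s\n",
                        (location & DK_LOCATION_EXTERNAL) ? "external" : "internal");
            }
            close(fd);
            return 0;
    }
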
index 10ada89caf51665efc2838ced4a5b8e15087d0a8..9d6ee92e5b8ada4d0dd505dda501818537b9b543 100644 (file)
@@ -186,11 +186,12 @@ typedef const struct domain_unguard *domain_unguard_t;
 extern domain_unguard_t domain_unguard_deploy(void);
 extern void domain_unguard_release(domain_unguard_t);
 extern struct domain_old *pffinddomain_old(int);
+extern struct domain *pffinddomain(int) __XNU_INTERNAL(pffinddomain);
 #else
 extern void net_add_domain(struct domain *dp);
 extern int net_del_domain(struct domain *);
-#endif /* XNU_KERNEL_PRIVATE */
 extern struct domain *pffinddomain(int);
+#endif /* XNU_KERNEL_PRIVATE */
 __END_DECLS
 #endif /* KERNEL_PRIVATE */
 #endif /* PRIVATE */
index 29540c2a3dd4a59a529f014c3ee6faa9ec1ccefc..98d3628dd305bf12c6a84960548776ef50ae99c2 100644 (file)
@@ -34,8 +34,6 @@
 #ifndef _SYS_DTRACE_H
 #define _SYS_DTRACE_H
 
-/* #pragma ident       "@(#)dtrace.h   1.37    07/06/05 SMI" */
-
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -73,12 +71,6 @@ extern "C" {
 #endif
 #endif
 
-#ifdef KERNEL
-#ifndef _KERNEL
-#define _KERNEL /* Solaris vs. Darwin */
-#endif
-#endif
-
 #if defined(__BIG_ENDIAN__)
 #if !defined(_BIG_ENDIAN)
 #define _BIG_ENDIAN /* Solaris vs. Darwin */
@@ -91,6 +83,12 @@ extern "C" {
 #error Unknown endian-ness
 #endif
 
+#ifdef KERNEL
+#ifndef _KERNEL
+#define _KERNEL /* Solaris vs. Darwin */
+#endif
+#endif
+
 #include <sys/types.h>
 #include <sys/param.h>
 #include <stdint.h>
@@ -286,6 +284,7 @@ typedef enum dtrace_probespec {
 #define        DIF_OP_RLDX     77              /* rldx  [r1], rd */
 #define        DIF_OP_XLATE    78              /* xlate xlrindex, rd */
 #define        DIF_OP_XLARG    79              /* xlarg xlrindex, rd */
+#define        DIF_OP_STRIP    80              /* strip r1, key, rd */
 
 #define        DIF_INTOFF_MAX          0xffff  /* highest integer table offset */
 #define        DIF_STROFF_MAX          0xffff  /* highest string table offset */
@@ -394,7 +393,10 @@ typedef enum dtrace_probespec {
 #define        DIF_SUBR_INET_NTOA6             43
 #define        DIF_SUBR_TOUPPER                44
 #define        DIF_SUBR_TOLOWER                45
-#define DIF_SUBR_MAX                   46      /* max subroutine value */
+#define DIF_SUBR_JSON                  46
+#define DIF_SUBR_STRTOLL               47
+#define DIF_SUBR_STRIP                 48
+#define DIF_SUBR_MAX                   48      /* max subroutine value */
 
 /* Apple-specific subroutines */
 #if defined(__APPLE__)
@@ -412,6 +414,7 @@ typedef uint32_t dif_instr_t;
 #define DIF_INSTR_R2(i)                 (((i) >>  8) & 0xff)
 #define DIF_INSTR_RD(i)                 ((i) & 0xff)
 #define DIF_INSTR_RS(i)                 ((i) & 0xff)
+#define DIF_INSTR_IMM2(i)               (((i) >>  8) & 0xff)
 #define DIF_INSTR_LABEL(i)              ((i) & 0xffffff)
 #define DIF_INSTR_VAR(i)                (((i) >>  8) & 0xffff)
 #define DIF_INSTR_INTEGER(i)            (((i) >>  8) & 0xffff)
@@ -2558,25 +2561,6 @@ extern void dtrace_sync(void);
 extern void dtrace_toxic_ranges(void (*)(uintptr_t, uintptr_t));
 extern void dtrace_xcall(processorid_t, dtrace_xcall_t, void *);
 
-extern int dtrace_safe_defer_signal(void);
-extern void dtrace_safe_synchronous_signal(void);
-
-extern int dtrace_mach_aframes(void);
-
-#if !defined(__APPLE__)
-#if defined(__i386) || defined(__amd64)
-extern int dtrace_instr_size(uchar_t *instr);
-extern int dtrace_instr_size_isa(uchar_t *, model_t, int *);
-extern void dtrace_invop_add(int (*)(uintptr_t, uintptr_t *, uintptr_t));
-extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t));
-extern void dtrace_invop_callsite(void);
-#endif
-
-#ifdef __sparc
-extern int dtrace_blksuword32(uintptr_t, uint32_t *, int);
-extern void dtrace_getfsr(uint64_t *);
-#endif
-#else
 #if defined(__i386__) || defined(__x86_64__)
 extern int dtrace_instr_size(uchar_t *instr);
 extern int dtrace_instr_size_isa(uchar_t *, model_t, int *);
@@ -2586,16 +2570,17 @@ extern void *dtrace_invop_callsite_pre;
 extern void *dtrace_invop_callsite_post;
 #endif
 
-#if defined(__arm__) || defined(__arm64__)
+#if defined(__arm__)
 extern int dtrace_instr_size(uint32_t instr, int thumb_mode);
+#endif
+#if defined(__arm__) || defined(__arm64__)
 extern void dtrace_invop_add(int (*)(uintptr_t, uintptr_t *, uintptr_t));    
 extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t));
 extern void *dtrace_invop_callsite_pre;
 extern void *dtrace_invop_callsite_post;
 #endif
-    
+
 #undef proc_t
-#endif /* __APPLE__ */
 
 #define DTRACE_CPUFLAG_ISSET(flag) \
         (cpu_core[CPU->cpu_id].cpuc_dtrace_flags & (flag))
@@ -2610,17 +2595,6 @@ extern void *dtrace_invop_callsite_post;
 
 #endif  /* _ASM */
 
-#if !defined(__APPLE__)
-#if defined(__i386) || defined(__amd64)
-
-#define        DTRACE_INVOP_PUSHL_EBP          1
-#define        DTRACE_INVOP_POPL_EBP           2
-#define        DTRACE_INVOP_LEAVE              3
-#define        DTRACE_INVOP_NOP                4
-#define        DTRACE_INVOP_RET                5
-
-#endif
-#else
 #if defined(__i386__) || defined(__x86_64__)
 
 #define DTRACE_INVOP_PUSHL_EBP          1
@@ -2639,8 +2613,6 @@ extern void *dtrace_invop_callsite_post;
 
 #endif
 
-#endif /* __APPLE__ */
-
 #ifdef  __cplusplus
 }
 #endif
index 51d9804c77badc693e3d7d19430b28386cbe655b..be144f4d47c64acb3108b433ce3247b345fc6a29 100644 (file)
 #include <kern/debug.h>
 #include <kern/thread_call.h>
 #include <kern/thread.h>
+#include <machine/atomic.h>
 #include <machine/machine_routines.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 #include <stdarg.h>
 #include <mach/kmod.h>
-#include <libkern/OSAtomic.h>
 
 #if defined(__i386__) || defined(__x86_64__)
 #include <i386/mp.h>
@@ -226,8 +226,6 @@ typedef struct modctl {
 #define MODCTL_SDT_PROBES_PROVIDED              0x10  // sdt probes have been provided
 #define MODCTL_SDT_INVALID                      0x20  // Module is invalid for sdt probes
 #define MODCTL_HAS_UUID                         0x40  // Module has UUID
-#define MODCTL_FBT_PRIVATE_PROBES_PROVIDED      0x80  // fbt private probes have been provided
-#define MODCTL_FBT_PROVIDE_PRIVATE_PROBES       0x100 // fbt provider must provide private probes
 #define MODCTL_FBT_PROVIDE_BLACKLISTED_PROBES   0x200 // fbt provider must provide blacklisted probes
 #define MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED  0x400 // fbt blacklisted probes have been provided
 #define MODCTL_IS_STATIC_KEXT                   0x800 // module is a static kext
@@ -241,16 +239,13 @@ typedef struct modctl {
 #define MOD_SDT_PROBES_PROVIDED(mod)            (mod->mod_flags & MODCTL_SDT_PROBES_PROVIDED)
 #define MOD_SDT_INVALID(mod)                    (mod->mod_flags & MODCTL_SDT_INVALID)
 #define MOD_HAS_UUID(mod)                       (mod->mod_flags & MODCTL_HAS_UUID)
-#define MOD_FBT_PRIVATE_PROBES_PROVIDED(mod)    (mod->mod_flags & MODCTL_FBT_PRIVATE_PROBES_PROVIDED)
-#define MOD_FBT_PROVIDE_PRIVATE_PROBES(mod)     (mod->mod_flags & MODCTL_FBT_PROVIDE_PRIVATE_PROBES)
 #define MOD_FBT_BLACKLISTED_PROBES_PROVIDED(mod) (mod->mod_flags & MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED)
 #define MOD_FBT_PROVIDE_BLACKLISTED_PROBES(mod) (mod->mod_flags & MODCTL_FBT_PROVIDE_BLACKLISTED_PROBES)
 #define MOD_IS_STATIC_KEXT(mod)                 (mod->mod_flags & MODCTL_IS_STATIC_KEXT)
 
 /* Compound accessors */
-#define MOD_FBT_PRIVATE_PROBES_DONE(mod)        (MOD_FBT_PRIVATE_PROBES_PROVIDED(mod) || !MOD_FBT_PROVIDE_PRIVATE_PROBES(mod))
 #define MOD_FBT_BLACKLISTED_PROBES_DONE(mod)    (MOD_FBT_BLACKLISTED_PROBES_PROVIDED(mod) || !MOD_FBT_PROVIDE_BLACKLISTED_PROBES(mod))
-#define MOD_FBT_DONE(mod)                       ((MOD_FBT_PROBES_PROVIDED(mod) && MOD_FBT_PRIVATE_PROBES_DONE(mod) && MOD_FBT_BLACKLISTED_PROBES_DONE(mod)) || MOD_FBT_INVALID(mod))
+#define MOD_FBT_DONE(mod)                       ((MOD_FBT_PROBES_PROVIDED(mod) && MOD_FBT_BLACKLISTED_PROBES_DONE(mod)) || MOD_FBT_INVALID(mod))
 #define MOD_SDT_DONE(mod)                       (MOD_SDT_PROBES_PROVIDED(mod) || MOD_SDT_INVALID(mod))
 #define MOD_SYMBOLS_DONE(mod)                   (MOD_FBT_DONE(mod) && MOD_SDT_DONE(mod))
 
@@ -449,60 +444,6 @@ extern vmem_t *vmem_create(const char *, void *, size_t, size_t, void *,
 extern void vmem_destroy(vmem_t *);
 extern void vmem_free(vmem_t *vmp, void *vaddr, size_t size);
 
-/*
- * Atomic
- */
-
-static inline uint8_t
-atomic_or_8(uint8_t *addr, uint8_t mask)
-{
-       return OSBitOrAtomic8(mask, addr);
-}
-
-static inline uint32_t
-atomic_and_32( uint32_t *addr, int32_t mask)
-{
-       return OSBitAndAtomic(mask, addr);
-}
-
-static inline uint32_t
-atomic_add_32( uint32_t *theAddress, int32_t theAmount )
-{
-       return OSAddAtomic( theAmount, theAddress );
-}
-
-#if defined(__i386__) || defined(__x86_64__)
-static inline void
-atomic_add_64( uint64_t *theAddress, int64_t theAmount )
-{
-       (void)OSAddAtomic64( theAmount, (SInt64 *)theAddress );
-}
-#elif defined(__arm__)
-static inline void
-atomic_add_64( uint64_t *theAddress, int64_t theAmount )
-{
-       // FIXME
-       // atomic_add_64() is at present only called from fasttrap.c to increment
-       // or decrement a 64bit counter. Narrow to 32bits since arm has
-       // no convenient 64bit atomic op.
-
-       (void)OSAddAtomic((int32_t)theAmount, &(((SInt32 *)theAddress)[0]));
-}
-#elif defined (__arm64__)
-static inline void
-atomic_add_64( uint64_t *theAddress, int64_t theAmount )
-{
-       (void)OSAddAtomic64( theAmount, (SInt64 *)theAddress );
-}
-#endif
-
-static inline uint32_t
-atomic_or_32(uint32_t *addr, uint32_t mask)
-{
-       return OSBitOrAtomic(mask, addr);
-}
-
-
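
Editor's note: the deleted Solaris-style wrappers are superseded by the os_atomic interface pulled in via <machine/atomic.h> above. Roughly, kernel-side:

    uint64_t counter;
    uint32_t flags;

    os_atomic_add(&counter, 1, relaxed);    /* was atomic_add_64(&counter, 1)  */
    os_atomic_or(&flags, 0x1, relaxed);     /* was atomic_or_32(&flags, 0x1)   */
    os_atomic_and(&flags, ~0x1u, relaxed);  /* was atomic_and_32(&flags, ~0x1) */
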
 /*
  * Miscellaneous
  */
@@ -514,7 +455,6 @@ typedef uintptr_t greg_t; /* For dtrace_impl.h prototype of dtrace_getfp() */
 #endif
 extern struct regs *find_user_regs( thread_t thread);
 extern vm_offset_t dtrace_get_cpu_int_stack_top(void);
-extern vm_offset_t max_valid_stack_address(void); /* kern/thread.h */
 
 #define panic_quiesce (panic_active())
 
@@ -542,13 +482,6 @@ int dtrace_buffer_copyout(const void*, user_addr_t, vm_size_t);
  */
 #define LIT_STRNEQL(s1, lit_s2) (0 == strncmp( (s1), (lit_s2), sizeof((lit_s2)) ))
 
-/*
- * Safe counted string compare of a literal against the beginning of a string. Here
- * the sizeof() is reduced by 1 so that the trailing null of the literal does not
- * participate in the comparison.
- */
-#define LIT_STRNSTART(s1, lit_s2) (0 == strncmp( (s1), (lit_s2), sizeof((lit_s2)) - 1 ))
-
 #define KERNELBASE VM_MIN_KERNEL_ADDRESS
 #endif /* KERNEL_BUILD */
 #endif /* _DTRACE_GLUE_H */
index f463b49e35e8c8886e75f5cdab9993ab33c870ff..cfc07b33fe813b8b77d2232857e8f8b5ec23fed2 100644 (file)
@@ -30,8 +30,6 @@
 #ifndef _SYS_DTRACE_IMPL_H
 #define        _SYS_DTRACE_IMPL_H
 
-/* #pragma ident       "@(#)dtrace_impl.h      1.23    07/02/16 SMI" */
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -1232,6 +1230,7 @@ struct dtrace_state {
        size_t dts_nretained;                   /* number of retained enabs */
        uint64_t dts_arg_error_illval;
        uint32_t dts_buf_over_limit;            /* number of bufs over dtb_limit */
+       uint64_t **dts_rstate;                  /* per-CPU random state */
 };
 
 struct dtrace_provider {
@@ -1394,6 +1393,9 @@ extern void dtrace_flush_caches(void);
 extern void dtrace_copy(uintptr_t, uintptr_t, size_t);
 extern void dtrace_copystr(uintptr_t, uintptr_t, size_t, volatile uint16_t *);
 
+extern void* dtrace_ptrauth_strip(void*, uint64_t);
+extern int dtrace_is_valid_ptrauth_key(uint64_t);
+
 /*
  * DTrace state handling
  */
index 74e47d3d562628e14354427664f14c795cf5f41e..502fabcd0cbb57ce09ee4aa501b92c96929bb70e 100644 (file)
@@ -276,6 +276,7 @@ __END_DECLS
 #ifdef BSD_KERNEL_PRIVATE
 #define EREDRIVEOPEN    (-6)
 #define EKEEPLOOKING    (-7)
+#define EDATALESS       (-8)
 /* used for cvwait error returns to Libc */
 #define ECVCERORR       256
 #define ECVPERORR       512
index e8c171fd94e6e53bdc5d0dd60f2bf64369f0c05c..5966311eb5c2d08060a18877a01c119a8aa32de7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define EVFILT_MACHPORT         (-8)    /* Mach portsets */
 #define EVFILT_FS               (-9)    /* Filesystem events */
 #define EVFILT_USER             (-10)   /* User events */
-                                        /* (-11) unused */
+#ifdef PRIVATE
+#define EVFILT_UNUSED_11        (-11)   /* (-11) unused */
+#endif
 #define EVFILT_VM               (-12)   /* Virtual memory events */
-
 #ifdef PRIVATE
 #define EVFILT_SOCK             (-13)   /* Socket events */
 #define EVFILT_MEMORYSTATUS     (-14)   /* Memorystatus events */
 #endif /* PRIVATE */
 #define EVFILT_EXCEPT           (-15)   /* Exception events */
 #ifdef PRIVATE
-#define EVFILT_WORKLOOP     (-17)   /* Workloop events */
+#define EVFILT_WORKLOOP         (-17)   /* Workloop events */
 #endif /* PRIVATE */
 
 #define EVFILT_SYSCOUNT         17
 #pragma pack(4)
 
 struct kevent {
-       uintptr_t       ident;          /* identifier for this event */
-       int16_t         filter;         /* filter for event */
-       uint16_t        flags;          /* general flags */
-       uint32_t        fflags;         /* filter-specific flags */
-       intptr_t        data;           /* filter-specific data */
-       void            *udata;         /* opaque user data identifier */
+       uintptr_t       ident;  /* identifier for this event */
+       int16_t         filter; /* filter for event */
+       uint16_t        flags;  /* general flags */
+       uint32_t        fflags; /* filter-specific flags */
+       intptr_t        data;   /* filter-specific data */
+       void            *udata; /* opaque user data identifier */
 };
 
 #ifdef KERNEL_PRIVATE
 
 struct user64_kevent {
-       uint64_t        ident;          /* identifier for this event */
-       int16_t         filter;         /* filter for event */
-       uint16_t        flags;          /* general flags */
-       uint32_t        fflags;         /* filter-specific flags */
-       int64_t         data;           /* filter-specific data */
-       user_addr_t     udata;          /* opaque user data identifier */
+       uint64_t        ident;  /* identifier for this event */
+       int16_t         filter; /* filter for event */
+       uint16_t        flags;  /* general flags */
+       uint32_t        fflags; /* filter-specific flags */
+       int64_t         data;   /* filter-specific data */
+       user_addr_t     udata;  /* opaque user data identifier */
 };
 
 struct user32_kevent {
-       uint32_t        ident;          /* identifier for this event */
-       int16_t         filter;         /* filter for event */
-       uint16_t        flags;          /* general flags */
-       uint32_t        fflags;         /* filter-specific flags */
-       int32_t         data;           /* filter-specific data */
+       uint32_t        ident;  /* identifier for this event */
+       int16_t         filter; /* filter for event */
+       uint16_t        flags;  /* general flags */
+       uint32_t        fflags; /* filter-specific flags */
+       int32_t         data;   /* filter-specific data */
        user32_addr_t   udata;  /* opaque user data identifier */
 };
 
-struct kevent_internal_s {
-       uint64_t    ident;      /* identifier for this event */
-       int16_t     filter;     /* filter for event */
-       uint16_t    flags;      /* general flags */
-       int32_t         qos;            /* quality of service */
-       uint32_t    fflags;     /* filter-specific flags */
-//     uint32_t        xflags;     /* extra filter-specific flags */
-       int64_t     data;       /* filter-specific data */
-       uint64_t    udata;      /* opaque user data identifier */
-       uint64_t    ext[4];     /* filter-specific extensions */
-};
-
 #endif /* KERNEL_PRIVATE */
 
 #pragma pack()
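
Editor's note: the reformatted public struct kevent is driven through EV_SET and kevent(2). A minimal userspace example waiting for stdin to become readable:

    #include <sys/types.h>
    #include <sys/event.h>
    #include <sys/time.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            int kq = kqueue();
            if (kq < 0) {
                    perror("kqueue");
                    return 1;
            }
            struct kevent change, event;
            EV_SET(&change, STDIN_FILENO, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);

            int n = kevent(kq, &change, 1, &event, 1, NULL);  /* NULL timeout: block */
            if (n > 0) {
                    printf("fd %lu readable, %ld byte(s) pending\n",
                        (unsigned long)event.ident, (long)event.data);
            }
            close(kq);
            return 0;
    }
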
@@ -162,7 +151,6 @@ struct kevent_qos_s {
  * Type definition for names/ids of dynamically allocated kqueues.
  */
 typedef uint64_t kqueue_id_t;
-
 #endif /* PRIVATE */
 
 #define EV_SET(kevp, a, b, c, d, e, f) do {     \
@@ -201,19 +189,19 @@ typedef uint64_t kqueue_id_t;
  * instead.
  */
 
-#define KEVENT_FLAG_STACK_EVENTS                 0x000004   /* output events treated as stack (grows down) */
+// was  KEVENT_FLAG_STACK_EVENTS                 0x000004
 #define KEVENT_FLAG_STACK_DATA                   0x000008   /* output data allocated as stack (grows down) */
-//                                               0x000010
+//      KEVENT_FLAG_POLL                         0x000010
 #define KEVENT_FLAG_WORKQ                        0x000020   /* interact with the default workq kq */
 //      KEVENT_FLAG_LEGACY32                     0x000040
 //      KEVENT_FLAG_LEGACY64                     0x000080
-//                                               0x000100
+//      KEVENT_FLAG_PROC64                       0x000100
 #define KEVENT_FLAG_WORKQ_MANAGER                0x000200   /* obsolete */
 #define KEVENT_FLAG_WORKLOOP                     0x000400   /* interact with the specified workloop kq */
 #define KEVENT_FLAG_PARKING                      0x000800   /* workq thread is parking */
 //      KEVENT_FLAG_KERNEL                       0x001000
 //      KEVENT_FLAG_DYNAMIC_KQUEUE               0x002000
-//                                               0x004000
+//      KEVENT_FLAG_NEEDS_END_PROCESSING         0x004000
 #define KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH     0x008000   /* obsolete */
 #define KEVENT_FLAG_WORKLOOP_SERVICER_DETACH     0x010000   /* obsolete */
 #define KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST        0x020000   /* kq lookup by id must exist */
@@ -222,14 +210,19 @@ typedef uint64_t kqueue_id_t;
 
 #ifdef XNU_KERNEL_PRIVATE
 
+#define KEVENT_FLAG_POLL                         0x0010  /* Call is for poll() */
 #define KEVENT_FLAG_LEGACY32                     0x0040  /* event data in legacy 32-bit format */
 #define KEVENT_FLAG_LEGACY64                     0x0080  /* event data in legacy 64-bit format */
+#define KEVENT_FLAG_PROC64                       0x0100  /* proc is 64bits */
 #define KEVENT_FLAG_KERNEL                       0x1000  /* caller is in-kernel */
 #define KEVENT_FLAG_DYNAMIC_KQUEUE               0x2000  /* kqueue is dynamically allocated */
+#define KEVENT_FLAG_NEEDS_END_PROCESSING         0x4000  /* end processing required before returning */
+
+#define KEVENT_ID_FLAG_USER (KEVENT_FLAG_WORKLOOP | \
+               KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)
 
 #define KEVENT_FLAG_USER (KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS | \
-               KEVENT_FLAG_STACK_EVENTS | KEVENT_FLAG_STACK_DATA | \
-               KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP | \
+               KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP | \
                KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)
 
 /*
@@ -238,22 +231,24 @@ typedef uint64_t kqueue_id_t;
  * let kn_fops() get the correct fops for all cases.
  */
 #define EVFILTID_KQREAD            (EVFILT_SYSCOUNT)
-#define EVFILTID_PIPE_R            (EVFILT_SYSCOUNT + 1)
-#define EVFILTID_PIPE_W            (EVFILT_SYSCOUNT + 2)
-#define EVFILTID_PTSD              (EVFILT_SYSCOUNT + 3)
-#define EVFILTID_SOREAD            (EVFILT_SYSCOUNT + 4)
-#define EVFILTID_SOWRITE           (EVFILT_SYSCOUNT + 5)
-#define EVFILTID_SCK               (EVFILT_SYSCOUNT + 6)
-#define EVFILTID_SOEXCEPT          (EVFILT_SYSCOUNT + 7)
-#define EVFILTID_SPEC              (EVFILT_SYSCOUNT + 8)
-#define EVFILTID_BPFREAD           (EVFILT_SYSCOUNT + 9)
-#define EVFILTID_NECP_FD           (EVFILT_SYSCOUNT + 10)
-#define EVFILTID_FSEVENT           (EVFILT_SYSCOUNT + 13)
-#define EVFILTID_VN                (EVFILT_SYSCOUNT + 14)
-#define EVFILTID_TTY               (EVFILT_SYSCOUNT + 16)
-#define EVFILTID_PTMX              (EVFILT_SYSCOUNT + 17)
-
-#define EVFILTID_MAX               (EVFILT_SYSCOUNT + 18)
+#define EVFILTID_PIPE_N            (EVFILT_SYSCOUNT + 1)
+#define EVFILTID_PIPE_R            (EVFILT_SYSCOUNT + 2)
+#define EVFILTID_PIPE_W            (EVFILT_SYSCOUNT + 3)
+#define EVFILTID_PTSD              (EVFILT_SYSCOUNT + 4)
+#define EVFILTID_SOREAD            (EVFILT_SYSCOUNT + 5)
+#define EVFILTID_SOWRITE           (EVFILT_SYSCOUNT + 6)
+#define EVFILTID_SCK               (EVFILT_SYSCOUNT + 7)
+#define EVFILTID_SOEXCEPT          (EVFILT_SYSCOUNT + 8)
+#define EVFILTID_SPEC              (EVFILT_SYSCOUNT + 9)
+#define EVFILTID_BPFREAD           (EVFILT_SYSCOUNT + 10)
+#define EVFILTID_NECP_FD           (EVFILT_SYSCOUNT + 11)
+#define EVFILTID_FSEVENT           (EVFILT_SYSCOUNT + 15)
+#define EVFILTID_VN                (EVFILT_SYSCOUNT + 16)
+#define EVFILTID_TTY               (EVFILT_SYSCOUNT + 17)
+#define EVFILTID_PTMX              (EVFILT_SYSCOUNT + 18)
+
+#define EVFILTID_DETACHED          (EVFILT_SYSCOUNT + 19)
+#define EVFILTID_MAX               (EVFILT_SYSCOUNT + 20)
 
 #endif /* defined(XNU_KERNEL_PRIVATE) */
 
@@ -371,6 +366,8 @@ typedef uint64_t kqueue_id_t;
  * Marks the waiter knote as being eligible to become an owner
  * This bit can only be set once, trying it again will fail with EALREADY.
  *
+ * @const NOTE_WL_SYNC_IPC [in/out]
+ * The knote is a sync IPC redirected turnstile push.
  *
  * Flags/Modifiers:
  *
@@ -402,24 +399,27 @@ typedef uint64_t kqueue_id_t;
 #define NOTE_WL_THREAD_REQUEST   0x00000001
 #define NOTE_WL_SYNC_WAIT        0x00000004
 #define NOTE_WL_SYNC_WAKE        0x00000008
-#define NOTE_WL_COMMANDS_MASK    0x0000000f /* Mask of all the [in] commands above */
+#define NOTE_WL_SYNC_IPC         0x80000000
+#define NOTE_WL_COMMANDS_MASK    0x8000000f /* Mask of all the [in] commands above */
 
 #define NOTE_WL_UPDATE_QOS       0x00000010
 #define NOTE_WL_END_OWNERSHIP    0x00000020
-#define NOTE_WL_UPDATE_OWNER     0 /* ... compatibility define ... */
 #define NOTE_WL_DISCOVER_OWNER   0x00000080
 #define NOTE_WL_IGNORE_ESTALE    0x00000100
 #define NOTE_WL_UPDATES_MASK     0x000001f0 /* Mask of all the [in] updates above */
 
+#define NOTE_WL_UPDATE_OWNER     0 /* ... compatibility define ... */
+
 /*
  * EVFILT_WORKLOOP ext[] array indexes/meanings.
  */
 #define EV_EXTIDX_WL_LANE        0         /* lane identifier  [in: sync waiter]
-                                           *                   [out: thread request]     */
+                                           *                  [out: thread request]     */
 #define EV_EXTIDX_WL_ADDR        1         /* debounce address [in: NULL==no debounce]   */
 #define EV_EXTIDX_WL_MASK        2         /* debounce mask    [in]                      */
 #define EV_EXTIDX_WL_VALUE       3         /* debounce value   [in: not current->ESTALE]
-                                           *                   [out: new/debounce value] */
+                                           *                  [out: new/debounce value] */
+
 #endif /* PRIVATE */
 
 /*
@@ -532,6 +532,7 @@ enum {
 #define NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE      0x00000080      /* Used to restrict sending a warn event only once, per inactive limit, soft limit only */
 #define NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE    0x00000100      /* Used to restrict sending a critical event only once per active limit, soft limit only */
 #define NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE  0x00000200      /* Used to restrict sending a critical event only once per inactive limit, soft limit only */
+#define NOTE_MEMORYSTATUS_JETSAM_FG_BAND                0x00000400      /* jetsam is approaching foreground band */
 
 /*
  * Use this mask to protect the kernel private flags.
@@ -547,6 +548,7 @@ typedef enum vm_pressure_level {
        kVMPressureWarning  = 1,
        kVMPressureUrgent   = 2,
        kVMPressureCritical = 3,
+       kVMPressureJetsam   = 4,  /* jetsam approaching FG bands */
 } vm_pressure_level_t;
 
 #endif /* PRIVATE */
@@ -677,35 +679,30 @@ SLIST_HEAD(klist, knote);
 MALLOC_DECLARE(M_KQUEUE);
 #endif
 
+LIST_HEAD(knote_list, knote);
 TAILQ_HEAD(kqtailq, knote);     /* a list of "queued" events */
 
 /* index into various kq queues */
 typedef uint8_t kq_index_t;
-typedef uint16_t kn_status_t;
-
-#define KN_ACTIVE          0x0001       /* event has been triggered */
-#define KN_QUEUED          0x0002       /* event is on queue */
-#define KN_DISABLED        0x0004       /* event is disabled */
-#define KN_DROPPING        0x0008       /* knote is being dropped */
-#define KN_LOCKED          0x0010       /* knote is locked (kq_knlocks) */
-#define KN_ATTACHING       0x0020       /* event is pending attach */
-#define KN_STAYACTIVE      0x0040       /* force event to stay active */
-#define KN_DEFERDELETE     0x0080       /* defer delete until re-enabled */
-#define KN_ATTACHED        0x0100       /* currently attached to source */
-#define KN_DISPATCH        0x0200       /* disables as part of deliver */
-#define KN_UDATA_SPECIFIC  0x0400       /* udata is part of matching */
-#define KN_SUPPRESSED      0x0800       /* event is suppressed during delivery */
-#define KN_MERGE_QOS       0x1000       /* f_event() / f_* ran concurrently and
-                                        *                                  overrides must merge */
-#define KN_REQVANISH       0x2000       /* requested EV_VANISH */
-#define KN_VANISHED        0x4000       /* has vanished */
-//                         0x8000
-
-/* combination defines deferred-delete mode enabled */
-#define KN_DISPATCH2            (KN_DISPATCH | KN_UDATA_SPECIFIC)
+
+/* lskq(1) knows about this type */
+__options_decl(kn_status_t, uint16_t /* 12 bits really */, {
+       KN_ACTIVE         = 0x001,  /* event has been triggered */
+       KN_QUEUED         = 0x002,  /* event is on queue */
+       KN_DISABLED       = 0x004,  /* event is disabled */
+       KN_DROPPING       = 0x008,  /* knote is being dropped */
+       KN_LOCKED         = 0x010,  /* knote is locked (kq_knlocks) */
+       KN_POSTING        = 0x020,  /* f_event() in flight */
+       KN_STAYACTIVE     = 0x040,  /* force event to stay active */
+       KN_DEFERDELETE    = 0x080,  /* defer delete until re-enabled */
+       KN_MERGE_QOS      = 0x100,  /* f_event() / f_* ran concurrently and overrides must merge */
+       KN_REQVANISH      = 0x200,  /* requested EV_VANISH */
+       KN_VANISHED       = 0x400,  /* has vanished */
+       KN_SUPPRESSED     = 0x800,  /* event is suppressed during delivery */
+});
 
 #define KNOTE_KQ_BITSIZE    42
-_Static_assert(KNOTE_KQ_BITSIZE >= VM_KERNEL_POINTER_SIGNIFICANT_BITS,
+_Static_assert(KNOTE_KQ_BITSIZE > VM_KERNEL_POINTER_SIGNIFICANT_BITS,
     "Make sure sign extending kn_kq_packed is legit");
 
 struct kqueue;
@@ -713,43 +710,82 @@ struct knote {
        TAILQ_ENTRY(knote)       kn_tqe;            /* linkage for tail queue */
        SLIST_ENTRY(knote)       kn_link;           /* linkage for search list */
        SLIST_ENTRY(knote)       kn_selnext;        /* klist element chain */
-       uintptr_t                kn_filtid:8,       /* filter id to index filter ops */
-           kn_req_index:4,                         /* requested qos index */
+
+       kn_status_t              kn_status : 12;
+       uintptr_t
            kn_qos_index:4,                         /* in-use qos index */
-           kn_qos_override:4,                      /* qos override index */
+           kn_qos_override:3,                      /* qos override index */
+           kn_is_fd:1,                             /* knote is an fd */
            kn_vnode_kqok:1,
            kn_vnode_use_ofst:1;
 #if __LP64__
-       intptr_t                 kn_kq_packed : KNOTE_KQ_BITSIZE;
+       intptr_t                    kn_kq_packed : KNOTE_KQ_BITSIZE;
 #else
-       intptr_t                 kn_kq_packed;
+       intptr_t                    kn_kq_packed;
 #endif
+
+       /* per filter stash of data (pointer, uint32_t or uint64_t) */
        union {
-               void                 *kn_hook;
-               uint64_t             kn_hook_data;
+               void               *kn_hook;
+               uint32_t            kn_hook32;
+               uint64_t            kn_hook64;
        };
-       int64_t                  kn_sdata;          /* saved data field */
+
+       /* per filter pointer to the resource being watched */
        union {
-               struct fileproc      *p_fp;             /* file data pointer */
-               struct proc          *p_proc;           /* proc pointer */
-               struct ipc_mqueue    *p_mqueue;         /* pset pointer */
-       } kn_ptr;
-       struct kevent_internal_s kn_kevent;
-       int                      kn_sfflags;        /* saved filter flags */
-       int                      kn_hookid;
-       uint16_t                 kn_inuse;          /* inuse count */
-       kn_status_t              kn_status;         /* status bits */
-
-#define kn_id           kn_kevent.ident
-#define kn_filter       kn_kevent.filter
-#define kn_flags        kn_kevent.flags
-#define kn_qos          kn_kevent.qos
-#define kn_udata        kn_kevent.udata
-#define kn_fflags       kn_kevent.fflags
-#define kn_xflags       kn_kevent.xflags
-#define kn_data         kn_kevent.data
-#define kn_ext          kn_kevent.ext
-#define kn_fp           kn_ptr.p_fp
+               struct fileproc    *kn_fp;          /* file data pointer */
+               struct proc        *kn_proc;        /* proc pointer */
+               struct ipc_mqueue  *kn_mqueue;      /* pset pointer */
+               struct thread_call *kn_thcall;
+               struct thread      *kn_thread;
+       };
+
+       /*
+        * Mimic kevent_qos so that knote_fill_kevent code is not horrid,
+        * but with subtleties:
+        *
+        * - kevent_qos_s::filter is 16bits where ours is 8, and we use the top
+        *   bits to store the real specialized filter.
+        *   knote_fill_kevent* will always force the top bits to 0xff.
+        *
+        * - kevent_qos_s::xflags is not kept, kn_sfflags takes its place,
+        *   knote_fill_kevent* will set xflags to 0.
+        *
+        * - kevent_qos_s::data is saved as kn_sdata and filters are encouraged
+        *   to use knote_fill_kevent, knote_fill_kevent_with_sdata will copy
+        *   kn_sdata as the output value.
+        *
+        * knote_fill_kevent_with_sdata() programmatically asserts
+        * that these aliasings are respected.
+        */
+       struct kevent_internal_s {
+               uint64_t    kei_ident;      /* identifier for this event */
+#ifdef __LITTLE_ENDIAN__
+               int8_t      kei_filter;     /* filter for event */
+               uint8_t     kei_filtid;     /* actual filter for event */
+#else
+               uint8_t     kei_filtid;     /* actual filter for event */
+               int8_t      kei_filter;     /* filter for event */
+#endif
+               uint16_t    kei_flags;      /* general flags */
+               int32_t     kei_qos;        /* quality of service */
+               uint64_t    kei_udata;      /* opaque user data identifier */
+               uint32_t    kei_fflags;     /* filter-specific flags */
+               uint32_t    kei_sfflags;    /* knote: saved fflags */
+               int64_t     kei_sdata;      /* knote: filter-specific saved data */
+               uint64_t    kei_ext[4];     /* filter-specific extensions */
+       } kn_kevent;
+
+#define kn_id           kn_kevent.kei_ident
+#define kn_filtid       kn_kevent.kei_filtid
+#define kn_filter       kn_kevent.kei_filter
+#define kn_flags        kn_kevent.kei_flags
+#define kn_qos          kn_kevent.kei_qos
+#define kn_udata        kn_kevent.kei_udata
+#define kn_fflags       kn_kevent.kei_fflags
+#define kn_sfflags      kn_kevent.kei_sfflags
+#define kn_sdata        kn_kevent.kei_sdata
+#define kn_ext          kn_kevent.kei_ext
 };
 
 static inline struct kqueue *
@@ -773,21 +809,25 @@ knote_get_seltype(struct knote *kn)
        }
 }
 
-static inline void
-knote_set_error(struct knote *kn, int error)
-{
-       kn->kn_flags |= EV_ERROR;
-       kn->kn_data = error;
-}
-
-struct filt_process_s {
-       int fp_fd;
-       unsigned int fp_flags;
-       user_addr_t fp_data_out;
-       user_size_t fp_data_size;
-       user_size_t fp_data_resid;
+struct kevent_ctx_s {
+       uint64_t         kec_data_avail;    /* address of remaining data size */
+       user_addr_t      kec_data_out;      /* extra data pointer */
+       user_size_t      kec_data_size;     /* total extra data size */
+       user_size_t      kec_data_resid;    /* residual extra data size */
+       uint64_t         kec_deadline;      /* wait deadline unless KEVENT_FLAG_IMMEDIATE */
+       struct fileproc *kec_fp;            /* fileproc to pass to fp_drop or NULL */
+       int              kec_fd;            /* fd to pass to fp_drop or -1 */
+
+       /* the fields below are only set during process / scan */
+       int              kec_process_nevents;       /* user-level event count */
+       int              kec_process_noutputs;      /* number of events output */
+       unsigned int     kec_process_flags;         /* kevent flags, only set for process  */
+       user_addr_t      kec_process_eventlist;     /* user-level event list address */
 };
-typedef struct filt_process_s *filt_process_data_t;
+typedef struct kevent_ctx_s *kevent_ctx_t;
+
+kevent_ctx_t
+kevent_get_context(thread_t thread);
 
 /*
  * Filter operators
@@ -955,16 +995,16 @@ struct filterops {
        bool    f_adjusts_qos;    /* true if the filter can override the knote */
        bool    f_extended_codes; /* hooks return extended codes */
 
-       int     (*f_attach)(struct knote *kn, struct kevent_internal_s *kev);
+       int     (*f_attach)(struct knote *kn, struct kevent_qos_s *kev);
        void    (*f_detach)(struct knote *kn);
        int     (*f_event)(struct knote *kn, long hint);
-       int     (*f_touch)(struct knote *kn, struct kevent_internal_s *kev);
-       int     (*f_process)(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+       int     (*f_touch)(struct knote *kn, struct kevent_qos_s *kev);
+       int     (*f_process)(struct knote *kn, struct kevent_qos_s *kev);
        int     (*f_peek)(struct knote *kn);
 
        /* optional & advanced */
-       bool    (*f_allow_drop)(struct knote *kn, struct kevent_internal_s *kev);
-       void    (*f_post_register_wait)(struct uthread *uth, struct knote_lock_ctx *ctx,
+       bool    (*f_allow_drop)(struct knote *kn, struct kevent_qos_s *kev);
+       void    (*f_post_register_wait)(struct uthread *uth, struct knote *kn,
            struct _kevent_register *ss_kr);
 };
 
@@ -1026,6 +1066,16 @@ struct filterops {
  *     Valid:    f_touch, f_attach, f_event, f_process
  *     Implicit: -
  *     Ignored:  f_peek
+ *
+ * FILTER_THREADREQ_NODEFEER
+ *     The filter has moved a turnstile priority push away from the current
+ *     thread, preemption has been disabled, and thread requests need to be
+ *     committed before preemption is re-enabled.
+ *
+ *     Valid:    f_attach, f_touch
+ *     Implicit: -
+ *     Invalid:  f_event, f_process, f_peek
  */
 #define FILTER_ACTIVE                       0x00000001
 #define FILTER_REGISTER_WAIT                0x00000002
@@ -1036,6 +1086,7 @@ struct filterops {
 #define FILTER_ADJUST_EVENT_QOS(qos) \
                (((qos) << FILTER_ADJUST_EVENT_QOS_SHIFT) | FILTER_ADJUST_EVENT_QOS_BIT)
 #define FILTER_RESET_EVENT_QOS              FILTER_ADJUST_EVENT_QOS_BIT
+#define FILTER_THREADREQ_NODEFEER           0x00000080
 
 #define filter_call(_ops, call)  \
                ((_ops)->f_extended_codes ? (_ops)->call : !!((_ops)->call))
@@ -1048,24 +1099,28 @@ extern void     klist_init(struct klist *list);
 #define KNOTE_ATTACH(list, kn)  knote_attach(list, kn)
 #define KNOTE_DETACH(list, kn)  knote_detach(list, kn)
 
-extern void     knote(struct klist *list, long hint);
-extern int      knote_attach(struct klist *list, struct knote *kn);
-extern int      knote_detach(struct klist *list, struct knote *kn);
-extern void     knote_vanish(struct klist *list, bool make_active);
-extern void     knote_link_waitqset_lazy_alloc(struct knote *kn);
+extern void knote(struct klist *list, long hint);
+extern int knote_attach(struct klist *list, struct knote *kn);
+extern int knote_detach(struct klist *list, struct knote *kn);
+extern void knote_vanish(struct klist *list, bool make_active);
+
+extern void knote_set_error(struct knote *kn, int error);
+extern int64_t knote_low_watermark(const struct knote *kn) __pure2;
+extern void knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev);
+extern void knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data);
+
+extern void knote_link_waitqset_lazy_alloc(struct knote *kn);
 extern boolean_t knote_link_waitqset_should_lazy_alloc(struct knote *kn);
-extern int      knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link);
-extern int      knote_unlink_waitq(struct knote *kn, struct waitq *wq);
-extern void     knote_fdclose(struct proc *p, int fd);
-extern void     knote_markstayactive(struct knote *kn);
-extern void     knote_clearstayactive(struct knote *kn);
+extern int knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link);
+extern int knote_unlink_waitq(struct knote *kn, struct waitq *wq);
+extern void knote_fdclose(struct proc *p, int fd);
+extern void knote_markstayactive(struct knote *kn);
+extern void knote_clearstayactive(struct knote *kn);
 extern const struct filterops *knote_fops(struct knote *kn);
-extern void knote_set_error(struct knote *kn, int error);
 
 extern struct turnstile *kqueue_turnstile(struct kqueue *);
 extern struct turnstile *kqueue_alloc_turnstile(struct kqueue *);
 
-int kevent_exit_on_workloop_ownership_leak(thread_t thread);
 int kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize);
 int kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf,
     uint32_t ubufsize, int32_t *nkqueues_out);
@@ -1074,6 +1129,15 @@ int kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
 int kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
     uint32_t ubufsize, int32_t *nknotes_out);
 
+extern int filt_wlattach_sync_ipc(struct knote *kn);
+extern void filt_wldetach_sync_ipc(struct knote *kn);
+
+extern int kevent_workq_internal(struct proc *p,
+    user_addr_t changelist, int nchanges,
+    user_addr_t eventlist, int nevents,
+    user_addr_t data_out, user_size_t *data_available,
+    unsigned int flags, int32_t *retval);
+
 #elif defined(KERNEL_PRIVATE) /* !XNU_KERNEL_PRIVATE: kexts still need a klist structure definition */
 
 #include <sys/queue.h>
@@ -1083,25 +1147,6 @@ SLIST_HEAD(klist, knote);
 
 #endif /* !XNU_KERNEL_PRIVATE && KERNEL_PRIVATE */
 
-#ifdef KERNEL_PRIVATE
-#ifdef PRIVATE
-
-/* make these private functions available to the pthread kext */
-extern int      kevent_qos_internal(struct proc *p, int fd,
-    user_addr_t changelist, int nchanges,
-    user_addr_t eventlist, int nevents,
-    user_addr_t data_out, user_size_t *data_available,
-    unsigned int flags, int32_t *retval);
-
-extern int      kevent_id_internal(struct proc *p, kqueue_id_t *id,
-    user_addr_t changelist, int nchanges,
-    user_addr_t eventlist, int nevents,
-    user_addr_t data_out, user_size_t *data_available,
-    unsigned int flags, int32_t *retval);
-
-#endif  /* PRIVATE */
-#endif  /* KERNEL_PRIVATE */
-
 #else   /* KERNEL */
 
 #include <sys/types.h>
index 82f2c84395fc092845de679eb203b40eed46c1a8..7934d169e44bafcf2c499a94ab1db76e14685722 100644 (file)
@@ -74,7 +74,7 @@ struct eventhandler_lists_ctxt {
 };
 
 struct eventhandler_entry_arg {
-       uuid_t ee_fmc_uuid;     /* Flow manager UUID */
+       uuid_t ee_fm_uuid;      /* Flow manager UUID */
        uuid_t ee_fr_uuid;      /* Flow route UUID */
 };
 
index e15a1a757a6176a6e3c751fe2fb6fe0e65b72ddd..04d31067ec9c353d2bb3018a2077f6156c25a8ab 100644 (file)
@@ -63,8 +63,7 @@
 
 #if defined(XNU_KERNEL_PRIVATE)
 
-typedef int (*kevent_callback_t)(struct kqueue *, struct kevent_internal_s *, void *);
-typedef void (*kqueue_continue_t)(struct kqueue *, void *, int);
+typedef int (*kevent_callback_t)(struct kevent_qos_s *, struct kevent_ctx_s *);
 
 #include <stdint.h>
 #include <kern/locks.h>
@@ -80,7 +79,7 @@ typedef void (*kqueue_continue_t)(struct kqueue *, void *, int);
  *     proc fd lock -> kq lock -> kq-waitq-set lock -> thread lock
  *
  * WorkQ/WorkLoop kqueues (from above):
- *     proc fd lock -> kq lock -> kq-request lock -> pthread kext locks -> thread lock
+ *     proc fd lock -> kq lock -> workq lock -> thread lock
  *
  * Whenever kqueues interact with source locks, it drops all of its own
  * locks in exchange for a use-reference on the knote used to synchronize
@@ -89,26 +88,18 @@ typedef void (*kqueue_continue_t)(struct kqueue *, void *, int);
  *
  * Standard file-based kqueues (from below):
  *     XXX lock -> kq lock -> kq-waitq-set lock -> thread lock
- * Standard file-based kqueues with non-kq-aware sources (from below):
- *     XXX lock -> kq-waitq-set lock -> thread lock
  *
  * WorkQ/WorkLoop kqueues (from below):
- *     XXX lock -> kq lock -> kq-request lock -> pthread kext locks -> thread lock
- * WorkQ/WorkLoop kqueues with non-kq-aware sources (from below):
- *     XXX -> kq-waitq-set lock -> kq-request lock -> pthread kext locks -> thread lock
+ *     XXX lock -> kq lock -> workq lock -> thread lock
  */
 
 #define KQEXTENT        256             /* linear growth by this amount */
 
 struct knote_lock_ctx {
-       struct knote                       *knlc_knote;
-       thread_t                            knlc_thread;
-       // TODO: knlc_turnstile
-       TAILQ_HEAD(, knote_lock_ctx)        knlc_head;
-       union {
-               LIST_ENTRY(knote_lock_ctx)      knlc_le;
-               TAILQ_ENTRY(knote_lock_ctx)     knlc_tqe;
-       };
+       struct knote               *knlc_knote;
+       thread_t                    knlc_thread;
+       uintptr_t                   knlc_waiters;
+       LIST_ENTRY(knote_lock_ctx)  knlc_link;
 #if DEBUG || DEVELOPMENT
 #define KNOTE_LOCK_CTX_UNLOCKED 0
 #define KNOTE_LOCK_CTX_LOCKED   1
@@ -124,8 +115,12 @@ LIST_HEAD(knote_locks, knote_lock_ctx);
  * the stack named `name`. In development kernels, it uses tricks to make sure
 * no locks are still held when exiting the C-scope that contains this context.
  */
-__attribute__((noinline, not_tail_called))
-void knote_lock_ctx_chk(struct knote_lock_ctx *ctx);
+static inline void
+knote_lock_ctx_chk(struct knote_lock_ctx *knlc)
+{
+       /* evil hackery to make sure no one forgets to unlock */
+       assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
+}
 #define KNOTE_LOCK_CTX(n) \
                struct knote_lock_ctx n __attribute__((cleanup(knote_lock_ctx_chk))); \
                n.knlc_state = KNOTE_LOCK_CTX_UNLOCKED
@@ -134,6 +129,24 @@ void knote_lock_ctx_chk(struct knote_lock_ctx *ctx);
                struct knote_lock_ctx n
 #endif
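
The KNOTE_LOCK_CTX macro leans on the compiler's cleanup attribute so the check runs automatically at scope exit. A standalone illustration of the pattern (not xnu code):

    #include <assert.h>

    static void chk_unlocked(int *state) { assert(*state == 0); }

    void
    scope_demo(void)
    {
        int lock_state __attribute__((cleanup(chk_unlocked))) = 0;
        lock_state = 1;     /* "locked" */
        lock_state = 0;     /* must be "unlocked" again before the scope ends */
    }                       /* chk_unlocked(&lock_state) fires here automatically */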
 
+
+__options_decl(kq_state_t, uint16_t, {
+       KQ_SEL            = 0x0001, /* select was recorded for kq */
+       KQ_SLEEP          = 0x0002, /* thread is waiting for events */
+       KQ_PROCWAIT       = 0x0004, /* thread waiting for processing */
+       KQ_KEV32          = 0x0008, /* kq is used with 32-bit events */
+       KQ_KEV64          = 0x0010, /* kq is used with 64-bit events */
+       KQ_KEV_QOS        = 0x0020, /* kq events carry QoS info */
+       KQ_WORKQ          = 0x0040, /* KQ is bound to process workq */
+       KQ_WORKLOOP       = 0x0080, /* KQ is part of a workloop */
+       KQ_PROCESSING     = 0x0100, /* KQ is being processed */
+       KQ_DRAIN          = 0x0200, /* kq is draining */
+       KQ_WAKEUP         = 0x0400, /* kq awakened while processing */
+       KQ_DYNAMIC        = 0x0800, /* kqueue is dynamically managed */
+       KQ_R2K_ARMED      = 0x1000, /* ast notification armed */
+       KQ_HAS_TURNSTILE  = 0x2000, /* this kqueue has a turnstile */
+});
+
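
Converting the raw KQ_* defines into an __options_decl gives the state word a distinct kq_state_t typedef while flag tests keep reading the same way; a small sketch:

    static inline bool
    kq_is_workloop(kq_state_t state)
    {
        return (state & KQ_WORKLOOP) != 0;
    }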
 /*
  * kqueue - common core definition of a kqueue
  *
@@ -145,28 +158,18 @@ struct kqueue {
        struct {
                struct waitq_set    kq_wqs;       /* private waitq set */
                lck_spin_t          kq_lock;      /* kqueue lock */
-               uint16_t            kq_state;     /* state of the kq */
-               uint16_t            kq_level;     /* nesting level of the kq */
+               kq_state_t          kq_state;     /* state of the kq */
+               union {
+                       uint16_t    kq_waitq_hook;/* prepost hook (kqwl/kqwq) */
+                       uint16_t    kq_level;     /* nesting level of the kq */
+               };
                uint32_t            kq_count;     /* number of queued events */
                struct proc        *kq_p;         /* process containing kqueue */
                struct knote_locks  kq_knlocks;   /* list of knote locks held */
-               lck_spin_t          kq_reqlock;   /* kqueue request lock */
        }; /* make sure struct padding is put before kq_queue */
        struct kqtailq      kq_queue[0];      /* variable array of queues */
 };
 
-#define KQ_SEL            0x001  /* select was recorded for kq */
-#define KQ_SLEEP          0x002  /* thread is waiting for events */
-#define KQ_PROCWAIT       0x004  /* thread waiting for processing */
-#define KQ_KEV32          0x008  /* kq is used with 32-bit events */
-#define KQ_KEV64          0x010  /* kq is used with 64-bit events */
-#define KQ_KEV_QOS        0x020  /* kq events carry QoS info */
-#define KQ_WORKQ          0x040  /* KQ is bound to process workq */
-#define KQ_WORKLOOP       0x080  /* KQ is part of a workloop */
-#define KQ_PROCESSING     0x100  /* KQ is being processed */
-#define KQ_DRAIN          0x200  /* kq is draining */
-#define KQ_WAKEUP         0x400  /* kq awakened while processing */
-#define KQ_DYNAMIC        0x800  /* kqueue is dynamically managed */
 /*
  * kqfile - definition of a typical kqueue opened as a file descriptor
  *          via the kqueue() system call.
@@ -179,40 +182,15 @@ struct kqfile {
        struct kqtailq      kqf_queue;      /* queue of woken up knotes */
        struct kqtailq      kqf_suppressed; /* suppression queue */
        struct selinfo      kqf_sel;        /* parent select/kqueue info */
-};
-
 #define kqf_wqs      kqf_kqueue.kq_wqs
 #define kqf_lock     kqf_kqueue.kq_lock
 #define kqf_state    kqf_kqueue.kq_state
 #define kqf_level    kqf_kqueue.kq_level
 #define kqf_count    kqf_kqueue.kq_count
 #define kqf_p        kqf_kqueue.kq_p
-
-#define QOS_INDEX_KQFILE   0          /* number of qos levels in a file kq */
-
-/*
- * kqrequest - per-QoS thread request status
- */
-struct kqrequest {
-       struct workq_threadreq_s kqr_req;      /* used when request oustanding */
-       struct kqtailq   kqr_suppressed;       /* Per-QoS suppression queues */
-       thread_t         kqr_thread;           /* thread to satisfy request */
-       uint8_t          kqr_state;            /* KQ/workq interaction state */
-#define KQWL_STAYACTIVE_FIRED_BIT     (1 << 0)
-       uint8_t          kqr_wakeup_indexes;   /* QoS/override levels that woke */
-       uint16_t         kqr_dsync_waiters;    /* number of dispatch sync waiters */
-       kq_index_t       kqr_stayactive_qos;   /* max QoS of statyactive knotes */
-       kq_index_t       kqr_override_index;   /* highest wakeup override index */
-       kq_index_t       kqr_qos_index;        /* QoS for the thread request */
 };
 
-
-#define KQR_WORKLOOP                 0x01   /* owner is a workloop */
-#define KQR_THREQUESTED              0x02       /* thread has been requested from workq */
-#define KQR_WAKEUP                   0x04       /* wakeup called during processing */
-#define KQR_THOVERCOMMIT             0x08   /* overcommit needed for thread requests */
-#define KQR_R2K_NOTIF_ARMED          0x10   /* ast notifications armed */
-#define KQR_ALLOCATED_TURNSTILE      0x20   /* kqwl_turnstile is allocated */
+#define QOS_INDEX_KQFILE   0          /* number of qos levels in a file kq */
 
 /*
  * WorkQ kqueues need to request threads to service the triggered
@@ -240,17 +218,18 @@ struct kqrequest {
  *           values.
  */
 struct kqworkq {
-       struct kqueue    kqwq_kqueue;
-       struct kqtailq   kqwq_queue[KQWQ_NBUCKETS];       /* array of queues */
-       struct kqrequest kqwq_request[KQWQ_NBUCKETS];     /* per-QoS request states */
+       struct kqueue       kqwq_kqueue;
+       struct kqtailq      kqwq_queue[KQWQ_NBUCKETS];       /* array of queues */
+       struct kqtailq      kqwq_suppressed[KQWQ_NBUCKETS];  /* Per-QoS suppression queues */
+       workq_threadreq_s   kqwq_request[KQWQ_NBUCKETS];     /* per-QoS request states */
 };
 
-#define kqwq_wqs     kqwq_kqueue.kq_wqs
-#define kqwq_lock    kqwq_kqueue.kq_lock
-#define kqwq_state   kqwq_kqueue.kq_state
-#define kqwq_level   kqwq_kqueue.kq_level
-#define kqwq_count   kqwq_kqueue.kq_count
-#define kqwq_p       kqwq_kqueue.kq_p
+#define kqwq_wqs         kqwq_kqueue.kq_wqs
+#define kqwq_lock        kqwq_kqueue.kq_lock
+#define kqwq_state       kqwq_kqueue.kq_state
+#define kqwq_waitq_hook  kqwq_kqueue.kq_waitq_hook
+#define kqwq_count       kqwq_kqueue.kq_count
+#define kqwq_p           kqwq_kqueue.kq_p
 
 /*
  * WorkLoop kqueues need to request a thread to service the triggered
@@ -292,16 +271,20 @@ struct kqworkq {
  *      NOTE:   "lane" support is TBD.
  */
 struct kqworkloop {
-       struct kqueue    kqwl_kqueue;                     /* queue of events */
-       struct kqtailq   kqwl_queue[KQWL_NBUCKETS];       /* array of queues */
-       struct kqrequest kqwl_request;                    /* thread request state */
-       lck_mtx_t        kqwl_statelock;                  /* state/debounce lock */
-       thread_t         kqwl_owner;                      /* current [sync] owner thread */
-       uint32_t         kqwl_retains;                    /* retain references */
-       kqueue_id_t      kqwl_dynamicid;                  /* dynamic identity */
-       uint64_t         kqwl_params;                     /* additional parameters */
-       struct turnstile *kqwl_turnstile;                 /* turnstile for sync IPC/waiters */
-       SLIST_ENTRY(kqworkloop) kqwl_hashlink;            /* linkage for search list */
+       struct kqueue       kqwl_kqueue;                  /* queue of events */
+       struct kqtailq      kqwl_queue[KQWL_NBUCKETS];    /* array of queues */
+       struct kqtailq      kqwl_suppressed;              /* Per-QoS suppression queues */
+       workq_threadreq_s   kqwl_request;                 /* thread request state */
+       lck_spin_t          kqwl_statelock;               /* state/debounce lock */
+       thread_t            kqwl_owner;                   /* current [sync] owner thread */
+       uint32_t            kqwl_retains;                 /* retain references */
+#define KQWL_STAYACTIVE_FIRED_BIT     (1 << 0)
+       uint8_t             kqwl_wakeup_indexes;          /* QoS/override levels that woke */
+       kq_index_t          kqwl_stayactive_qos;          /* max QoS of stayactive knotes */
+       kqueue_id_t         kqwl_dynamicid;               /* dynamic identity */
+       uint64_t            kqwl_params;                  /* additional parameters */
+       struct turnstile   *kqwl_turnstile;               /* turnstile for sync IPC/waiters */
+       LIST_ENTRY(kqworkloop) kqwl_hashlink;             /* linkage for search list */
 #if CONFIG_WORKLOOP_DEBUG
 #define KQWL_HISTORY_COUNT 32
 #define KQWL_HISTORY_WRITE_ENTRY(kqwl, ...) ({ \
@@ -328,6 +311,7 @@ struct kqworkloop {
        unsigned int kqwl_index;
 #endif // CONFIG_WORKLOOP_DEBUG
 };
+LIST_HEAD(kqwllist, kqworkloop);
 
 typedef union {
        struct kqueue       *kq;
@@ -336,26 +320,28 @@ typedef union {
        struct kqworkloop   *kqwl;
 } __attribute__((transparent_union)) kqueue_t;
 
-SLIST_HEAD(kqlist, kqworkloop);
 
-#define kqwl_wqs     kqwl_kqueue.kq_wqs
-#define kqwl_lock    kqwl_kqueue.kq_lock
-#define kqwl_state   kqwl_kqueue.kq_state
-#define kqwl_level   kqwl_kqueue.kq_level
-#define kqwl_count   kqwl_kqueue.kq_count
-#define kqwl_p       kqwl_kqueue.kq_p
+#define kqwl_wqs         kqwl_kqueue.kq_wqs
+#define kqwl_lock        kqwl_kqueue.kq_lock
+#define kqwl_state       kqwl_kqueue.kq_state
+#define kqwl_waitq_hook  kqwl_kqueue.kq_waitq_hook
+#define kqwl_count       kqwl_kqueue.kq_count
+#define kqwl_p           kqwl_kqueue.kq_p
 
 #define KQ_WORKLOOP_RETAINS_MAX UINT32_MAX
 
-extern void kqueue_threadreq_unbind(struct proc *p, struct kqrequest *kqr);
+extern void kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t);
 
 // called with the kq req held
 #define KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE 0x1
 extern void kqueue_threadreq_bind(struct proc *p, workq_threadreq_t req,
     thread_t thread, unsigned int flags);
 
+struct turnstile *kqueue_threadreq_get_turnstile(workq_threadreq_t kqr);
+
 // called with the wq lock held
-extern void kqueue_threadreq_bind_prepost(struct proc *p, workq_threadreq_t req, thread_t thread);
+extern void kqueue_threadreq_bind_prepost(struct proc *p, workq_threadreq_t req,
+    struct uthread *uth);
 
 // called with no lock held
 extern void kqueue_threadreq_bind_commit(struct proc *p, thread_t thread);
@@ -365,16 +351,17 @@ extern void kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req);
 // lock not held as kqwl_params is immutable after creation
 extern workq_threadreq_param_t kqueue_threadreq_workloop_param(workq_threadreq_t req);
 
-extern struct kqueue *kqueue_alloc(struct proc *, unsigned int);
+extern struct kqueue *kqueue_alloc(struct proc *);
 extern void kqueue_dealloc(struct kqueue *);
+extern void kqworkq_dealloc(struct kqworkq *kqwq);
 
 extern void knotes_dealloc(struct proc *);
 extern void kqworkloops_dealloc(struct proc *);
 
-extern int kevent_register(struct kqueue *, struct kevent_internal_s *,
-    struct knote_lock_ctx *);
-extern int kqueue_scan(struct kqueue *, kevent_callback_t, kqueue_continue_t,
-    void *, struct filt_process_s *, struct timeval *, struct proc *);
+extern int kevent_register(struct kqueue *, struct kevent_qos_s *,
+    struct knote **);
+extern int kqueue_scan(struct kqueue *, int flags,
+    struct kevent_ctx_s *, kevent_callback_t);
 extern int kqueue_stat(struct kqueue *, void *, int, proc_t);
 
 #endif /* XNU_KERNEL_PRIVATE */
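
The scan path collapses from (callback, continuation, filt_process_s, timeval) down to a context plus a single callback. A hedged sketch of a callback matching the new kevent_callback_t shape; this simplifies, but does not reproduce, the real copyout path:

    static int
    my_scan_cb(struct kevent_qos_s *kev, struct kevent_ctx_s *kectx)
    {
        int error = copyout(kev, kectx->kec_process_eventlist, sizeof(*kev));
        if (error == 0) {
            kectx->kec_process_eventlist += sizeof(*kev);
            kectx->kec_process_noutputs++;
        }
        return error;
    }

    /* driven by, e.g.: error = kqueue_scan(kq, flags, kectx, my_scan_cb); */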
index ec2ece46ce111736f1bf90cb474171c993fc635b..7fa9815453a066db7d4c8aa41c727f3e99b85f86 100644 (file)
@@ -27,8 +27,6 @@
 #ifndef        _SYS_FASTTRAP_H
 #define        _SYS_FASTTRAP_H
 
-/* #pragma ident       "@(#)fasttrap.h 1.5     06/03/30 SMI" */
-
 #include <sys/fasttrap_isa.h>
 #include <sys/dtrace.h>
 #include <sys/types.h>
index 863e6037eb2b591b45c0d2c9e79df934e2de206b..109118fb192c09e87fc4a1265480da90f591355f 100644 (file)
 #ifndef        _FASTTRAP_IMPL_H
 #define        _FASTTRAP_IMPL_H
 
-/*
- * #pragma ident       "@(#)fasttrap_impl.h    1.14    08/04/09 SMI"
- */
-
 #include <sys/types.h>
 #include <sys/dtrace.h>
 #include <sys/proc.h>
index a6411a57f727d9a908befefe30c8f4ee61c390c3..88b365d793daf29c13a61c5c794e4e8063de1078 100644 (file)
@@ -66,8 +66,9 @@ extern int fbt_invop(uintptr_t, uintptr_t *, uintptr_t);
 extern void fbt_provide_module(void *, struct modctl *);
 extern int fbt_enable (void *arg, dtrace_id_t id, void *parg);
 
-extern int fbt_module_excluded(struct modctl*);
-extern int fbt_excluded(const char *);
+extern bool fbt_module_excluded(struct modctl*);
+extern bool fbt_excluded(const char *);
 
+extern void fbt_blacklist_init(void);
 extern void fbt_provide_probe(struct modctl *ctl, const char *modname, const char *name, machine_inst_t *instr, machine_inst_t *limit);
 #endif /* _FBT_H */
index de413f34c85c721a8cdbba477d894cf1f57f1e85..f0f301865bef853472623c4c11ddb8567348be57 100644 (file)
 #define AT_SYMLINK_NOFOLLOW     0x0020  /* Act on the symlink itself not the target */
 #define AT_SYMLINK_FOLLOW       0x0040  /* Act on target of symlink */
 #define AT_REMOVEDIR            0x0080  /* Path refers to directory */
+#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL
+#ifdef PRIVATE
+#define AT_REMOVEDIR_DATALESS   0x0100  /* Remove a dataless directory without materializing first */
+#endif
+#define AT_REALDEV              0x0200  /* Return the real device the inode resides on for fstatat(2) */
+#define AT_FDONLY               0x0400  /* Use only the fd and ignore the path for fstatat(2) */
+#endif
 #endif
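
A hedged guess at how AT_FDONLY is meant to be used, assuming the path argument is ignored when the flag is set:

    static int
    stat_fd_only(int fd, struct stat *st)
    {
        /* assumption: an empty path is acceptable alongside AT_FDONLY */
        return fstatat(fd, "", st, AT_FDONLY);
    }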
 
 #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)
 
 #define F_PUNCHHOLE     99              /* Deallocate a range of the file */
 
-#define F_TRIM_ACTIVE_FILE      100 /* Trim an active file */
+#define F_TRIM_ACTIVE_FILE      100     /* Trim an active file */
+
+#define F_SPECULATIVE_READ      101     /* Synchronous advisory read fcntl for regular and compressed files */
+
+#define F_GETPATH_NOFIRMLINK    102     /* return the full path of the fd without firmlinks */
 
 // FS-specific fcntl()'s numbers begin at 0x00010000 and go up
 #define FCNTL_FS_SPECIFIC_BASE  0x00010000
@@ -618,6 +629,14 @@ typedef struct ftrimactivefile {
        off_t fta_length; /* IN: size of the region */
 } ftrimactivefile_t;
 
+/* fspecread_t used by F_SPECULATIVE_READ */
+typedef struct fspecread {
+       unsigned int fsr_flags;  /* IN: flags word */
+       unsigned int reserved;   /* to maintain 8-byte alignment */
+       off_t fsr_offset;        /* IN: start of the region */
+       off_t fsr_length;        /* IN: size of the region */
+} fspecread_t;
+
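
A userspace sketch of driving the new advisory read, using only the fields defined above; the helper name is hypothetical. F_GETPATH_NOFIRMLINK would be used like F_GETPATH, with a MAXPATHLEN buffer:

    #include <fcntl.h>
    #include <string.h>

    /* advisory synchronous read-ahead of [off, off + len) */
    static int
    spec_read(int fd, off_t off, off_t len)
    {
        fspecread_t fsr;
        memset(&fsr, 0, sizeof(fsr));   /* clears fsr_flags and the pad word */
        fsr.fsr_offset = off;
        fsr.fsr_length = len;
        return fcntl(fd, F_SPECULATIVE_READ, &fsr);
    }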
 /* fbootstraptransfer_t used by F_READBOOTSTRAP and F_WRITEBOOTSTRAP commands */
 
 typedef struct fbootstraptransfer {
index 1234072621d60c69fb3fbf9bdb85136ec029f895..d9f6b1a5c26053b48ce60fa81a5122b6a2ccffa4 100644 (file)
@@ -97,9 +97,11 @@ int file_drop(int);
 #ifdef KERNEL_PRIVATE
 int fd_rdwr(int fd, enum uio_rw, uint64_t base, int64_t len, enum uio_seg,
     off_t offset, int io_flg, int64_t *aresid);
+struct fileglob;
 struct fileproc;
 struct vnode;
 int fp_getfvp(struct proc *p, int fd, struct fileproc **resultfp, struct vnode  **resultvp);
+struct vnode *fg_get_vnode(struct fileglob *fg);
 #endif  /* KERNEL_PRIVATE */
 __END_DECLS
 #endif /* !_SYS_FILE_H_ */
index fbd615cbde0be546c38581c13b8a0f3132c5c17a..e5fde07608fa576cd65dcad348fc5df0efe55eb2 100644 (file)
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/guarded.h>
+#include <os/refcnt.h>
 
 struct proc;
 struct uio;
 struct knote;
-struct kevent_internal_s;
+struct kevent_qos_s;
 
 #ifdef __APPLE_API_UNSTABLE
 
 struct file;
 
+__options_decl(fileproc_vflags_t, unsigned int, {
+       FPV_NONE        = 0,
+       FPV_DRAIN       = 0x01,
+});
 
 /*
  * Kernel descriptor table.
@@ -93,7 +98,8 @@ struct file;
  */
 struct fileproc {
        unsigned int f_flags;
-       int32_t f_iocount;
+       _Atomic fileproc_vflags_t f_vflags;
+       os_refcnt_t f_iocount;
        struct fileglob * f_fglob;
        void *f_wset;
 };
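
f_iocount moves from a bare int32_t to an os_refcnt_t, so I/O references now go through <os/refcnt.h>. A sketch of the idiom, assuming a NULL refgrp is acceptable as elsewhere in xnu:

    os_ref_init(&fp->f_iocount, NULL);       /* initial I/O reference */
    os_ref_retain(&fp->f_iocount);           /* +1 for a new I/O in flight */
    if (os_ref_release(&fp->f_iocount) == 0) {
        /* last reference dropped; the fileproc may now be reaped */
    }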
@@ -164,36 +170,37 @@ typedef enum {
 #define FG_CONFINED     0x200   /* fileglob confined to process, immutably */
 #define FG_HAS_OFDLOCK  0x400   /* Has or has had an OFD lock */
 
+struct fileops {
+       file_type_t     fo_type;        /* descriptor type */
+       int (*fo_read)      (struct fileproc *fp, struct uio *uio,
+           int flags, vfs_context_t ctx);
+       int (*fo_write)     (struct fileproc *fp, struct uio *uio,
+           int flags, vfs_context_t ctx);
+#define FOF_OFFSET      0x00000001      /* offset supplied to vn_write */
+#define FOF_PCRED       0x00000002      /* cred from proc, not current thread */
+       int (*fo_ioctl)(struct fileproc *fp, u_long com,
+           caddr_t data, vfs_context_t ctx);
+       int (*fo_select)    (struct fileproc *fp, int which,
+           void *wql, vfs_context_t ctx);
+       int (*fo_close)     (struct fileglob *fg, vfs_context_t ctx);
+       int (*fo_kqfilter)  (struct fileproc *fp, struct knote *, struct kevent_qos_s *);
+       int (*fo_drain)     (struct fileproc *fp, vfs_context_t ctx);
+};
+
 struct fileglob {
        LIST_ENTRY(fileglob) f_msglist;/* list of active files */
-       int32_t fg_flag;                /* see fcntl.h */
+       int32_t fg_flag;        /* see fcntl.h */
        int32_t fg_count;       /* reference count */
        int32_t fg_msgcount;    /* references from message queue */
        int32_t fg_lflags;      /* file global flags */
        kauth_cred_t fg_cred;   /* credentials associated with descriptor */
-       const struct fileops {
-               file_type_t     fo_type;        /* descriptor type */
-               int     (*fo_read)      (struct fileproc *fp, struct uio *uio,
-                   int flags, vfs_context_t ctx);
-               int     (*fo_write)     (struct fileproc *fp, struct uio *uio,
-                   int flags, vfs_context_t ctx);
-#define FOF_OFFSET      0x00000001      /* offset supplied to vn_write */
-#define FOF_PCRED       0x00000002      /* cred from proc, not current thread */
-               int     (*fo_ioctl)(struct fileproc *fp, u_long com,
-                   caddr_t data, vfs_context_t ctx);
-               int     (*fo_select)    (struct fileproc *fp, int which,
-                   void *wql, vfs_context_t ctx);
-               int     (*fo_close)     (struct fileglob *fg, vfs_context_t ctx);
-               int     (*fo_kqfilter)  (struct fileproc *fp, struct knote *kn,
-                   struct kevent_internal_s *kev, vfs_context_t ctx);
-               int     (*fo_drain)     (struct fileproc *fp, vfs_context_t ctx);
-       } *fg_ops;
+       const struct fileops *fg_ops;
        off_t   fg_offset;
-       void    *fg_data;       /* vnode or socket or SHM or semaphore */
-       void    *fg_vn_data;    /* Per fd vnode data, used for directories */
+       void   *fg_data;        /* vnode or socket or SHM or semaphore */
+       void   *fg_vn_data;     /* Per fd vnode data, used for directories */
        lck_mtx_t fg_lock;
 #if CONFIG_MACF
-       struct label *fg_label;  /* JMM - use the one in the cred? */
+       struct label *fg_label; /* JMM - use the one in the cred? */
 #endif
 };
 
@@ -209,20 +216,32 @@ extern int maxfilesperproc;
 
 
 __BEGIN_DECLS
+
+/* wrappers for fp->f_ops->fo_... */
 int fo_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx);
 int fo_write(struct fileproc *fp, struct uio *uio, int flags,
     vfs_context_t ctx);
 int fo_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx);
 int fo_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx);
 int fo_close(struct fileglob *fg, vfs_context_t ctx);
-int fo_kqfilter(struct fileproc *fp, struct knote *kn,
-    struct kevent_internal_s *kev, vfs_context_t ctx);
+int fo_drain(struct fileproc *fp, vfs_context_t ctx);
+int fo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev);
+
+/* Functions to use for unsupported fileops */
+int fo_no_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx);
+int fo_no_write(struct fileproc *fp, struct uio *uio, int flags,
+    vfs_context_t ctx);
+int fo_no_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx);
+int fo_no_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx);
+int fo_no_drain(struct fileproc *fp, vfs_context_t ctx);
+int fo_no_kqfilter(struct fileproc *, struct knote *, struct kevent_qos_s *kev);
+
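
With fileops hoisted out of struct fileglob and the fo_no_* stubs available, a descriptor type can plug unsupported operations explicitly instead of leaving NULLs. A hypothetical type that only implements close:

    static int myport_close(struct fileglob *fg, vfs_context_t ctx);   /* hypothetical */

    static const struct fileops myport_ops = {
        .fo_type     = DTYPE_NETPOLICY,      /* placeholder file_type_t value */
        .fo_read     = fo_no_read,
        .fo_write    = fo_no_write,
        .fo_ioctl    = fo_no_ioctl,
        .fo_select   = fo_no_select,
        .fo_close    = myport_close,
        .fo_kqfilter = fo_no_kqfilter,
        .fo_drain    = fo_no_drain,
    };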
 void fileproc_drain(proc_t, struct fileproc *);
 int fp_tryswap(proc_t, int fd, struct fileproc *nfp);
 int fp_drop(struct proc *p, int fd, struct fileproc *fp, int locked);
 int fp_drop_written(proc_t p, int fd, struct fileproc *fp);
 int fp_drop_event(proc_t p, int fd, struct fileproc *fp);
-int fp_free(struct proc * p, int fd, struct fileproc * fp);
+void fp_free(struct proc * p, int fd, struct fileproc * fp);
 struct kqueue;
 int fp_getfkq(struct proc *p, int fd, struct fileproc **resultfp, struct kqueue  **resultkq);
 struct psemnode;
@@ -242,12 +261,14 @@ int fp_isguarded(struct fileproc *fp, u_int attribs);
 int fp_guard_exception(proc_t p, int fd, struct fileproc *fp, u_int attribs);
 int closef_locked(struct fileproc *fp, struct fileglob *fg, struct proc *p);
 int close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags);
+int fileport_makefd_internal(proc_t p, ipc_port_t port, int uf_flags, int *fd);
 struct nameidata;
 struct vnode_attr;
 int open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
     struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
     int32_t *retval);
-int kqueue_body(struct proc *p, fp_allocfn_t, void *cra, int32_t *retval);
+int chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread);
+int kqueue_internal(struct proc *p, fp_allocfn_t, void *cra, int32_t *retval);
 void fg_insertuipc(struct fileglob * fg);
 boolean_t fg_insertuipc_mark(struct fileglob * fg);
 void fg_removeuipc(struct fileglob * fg);
@@ -267,6 +288,8 @@ extern void fg_vn_data_free(void *fgvndata);
 extern int nameiat(struct nameidata *ndp, int dirfd);
 extern int falloc_guarded(struct proc *p, struct fileproc **fp, int *fd,
     vfs_context_t ctx, const guardid_t *guard, u_int attrs);
+extern void fileproc_modify_vflags(struct fileproc *fp, fileproc_vflags_t vflags, boolean_t clearflags);
+fileproc_vflags_t fileproc_get_vflags(struct fileproc *fp);
 __END_DECLS
 
 #endif /* __APPLE_API_UNSTABLE */
index 80d91f2e4ed6fe7e49e015418c44c71d55353a3c..aebf4b0548e6440e88a1024210186656d545fdad 100644 (file)
 #include <kern/locks.h>
 
 struct klist;
-struct kqlist;
+struct kqwllist;
 
 struct filedesc {
        struct  fileproc **fd_ofiles;   /* file structures for open files */
        lck_mtx_t fd_kqhashlock;        /* lock for dynamic kqueue hash */
        u_long  fd_kqhashmask;          /* size of dynamic kqueue hash */
-       struct  kqlist *fd_kqhash;      /* hash table for dynamic kqueues */
-       struct  kqueue *fd_wqkqueue;    /* the workq kqueue */
+       struct  kqwllist *fd_kqhash;    /* hash table for dynamic kqueues */
+       struct  kqworkq *fd_wqkqueue;   /* the workq kqueue */
        char    *fd_ofileflags;         /* per-process open file flags */
        struct  vnode *fd_cdir;         /* current directory */
        struct  vnode *fd_rdir;         /* root directory */
index 8c7ec89b07423c30c965078cbb00db997755bf11..3c2c3783cf9b58e0449c72547a38ef4b837c7edd 100644 (file)
@@ -159,7 +159,6 @@ typedef struct namespace_handler_data {
 
 
 extern int resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg);
-extern int get_nspace_item_status(struct vnode *vp, int32_t *status);
 
 #else
 
@@ -216,8 +215,6 @@ int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
 
 #define NAMESPACE_HANDLER_EVENT_TYPE_MASK (NAMESPACE_HANDLER_NSPACE_EVENT | NAMESPACE_HANDLER_SNAPSHOT_EVENT | NAMESPACE_HANDLER_TRACK_EVENT)
 
-#define DATALESS_CMPFS_TYPE     0x80000001
-
 
 typedef int32_t nspace_handler_info[2];
 typedef char fstypename_t[MFSTYPENAMELEN];
@@ -260,6 +257,27 @@ typedef struct disk_conditioner_info {
        uint32_t segwritecnt;
 } disk_conditioner_info;
 
+/*
+ * BSD flags manipulation arguments.
+ *
+ * This provides a safe way to update the BSD flags field of an inode,
+ * which has some user components as well as some system components.
+ * What it provides is a compare-and-swap operation, whereby the caller
+ * fetches what the expected flags are, computes the new set, and then
+ * provides the old along with the new.  If the old that's provided matches
+ * what's actually in the inode, the new value is set.  The actual inode
+ * value is returned to the caller, and expected == actual is how the
+ * caller can determine that the operation succeeded.
+ *
+ * Some BSD flags (e.g. UF_COMPRESSED) can only be manipulated via this
+ * safe mechanism.
+ */
+struct fsioc_cas_bsdflags {
+       uint32_t expected_flags;        /* [IN] expected flags */
+       uint32_t new_flags;             /* [IN] new value to set */
+       uint32_t actual_flags;          /* [OUT] the actual flags in inode */
+};
+
 #define FSCTL_SYNC_FULLSYNC     (1<<0)  /* Flush the data fully to disk, if supported by the filesystem */
 #define FSCTL_SYNC_WAIT         (1<<1)  /* Wait for the sync to complete */
 
@@ -273,35 +291,16 @@ typedef struct disk_conditioner_info {
 /* Unsupported - previously FSIOC_WAIT_FOR_SYNC */
 #define FSIOC_UNSUPPORTED                         _IOR('A', 3, int32_t)
 
-#define FSIOC_NAMESPACE_HANDLER_GET               _IOW('A', 4, struct namespace_handler_info)
-#define FSCTL_NAMESPACE_HANDLER_GET               IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET)
-
-#define FSIOC_NAMESPACE_HANDLER_UPDATE            _IOW('A', 5, nspace_handler_info)
-#define FSCTL_NAMESPACE_HANDLER_UPDATE            IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE)
-
-#define FSIOC_NAMESPACE_HANDLER_UNBLOCK           _IOW('A', 6, nspace_handler_info)
-#define FSCTL_NAMESPACE_HANDLER_UNBLOCK           IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK)
-
-#define FSIOC_NAMESPACE_HANDLER_CANCEL            _IOW('A', 7, nspace_handler_info)
-#define FSCTL_NAMESPACE_HANDLER_CANCEL            IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL)
-
-#define FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME _IOW('A', 8, int32_t)
-#define FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME)
-
-#define FSIOC_OLD_SNAPSHOT_HANDLER_GET            _IOW('A', 9, struct namespace_handler_info)
-#define FSCTL_OLD_SNAPSHOT_HANDLER_GET            IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET)
+/* 4 - 9 were used for NAMESPACE handler operations to support dataless file
+ * faults and are no longer used */
 
 #define FSIOC_SET_FSTYPENAME_OVERRIDE             _IOW('A', 10, fstypename_t)
 #define FSCTL_SET_FSTYPENAME_OVERRIDE             IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE)
 
-#define FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS _IOW('A', 11, int32_t)
-#define FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS)
-
 /* 12 was used for TRACKED_HANDLER_GET which has now been removed
  *  as it is no longer used. */
 
-#define FSIOC_SNAPSHOT_HANDLER_GET_EXT            _IOW('A', 13, struct namespace_handler_info_ext)
-#define FSCTL_SNAPSHOT_HANDLER_GET_EXT            IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT)
+/* 13 was used for FSIOC_SNAPSHOT_HANDLER_GET_EXT and has now been removed */
 
 /* 14 was used for NAMESPACE_HANDLER_GETDATA which has now been
  *  removed as it is no longer used. */
@@ -321,6 +320,9 @@ typedef struct disk_conditioner_info {
 #define DISK_CONDITIONER_IOC_SET                  _IOW('A', 19, disk_conditioner_info)
 #define DISK_CONDITIONER_FSCTL_SET                IOCBASECMD(DISK_CONDITIONER_IOC_SET)
 
+/* Set the value of a file's BSD flags in a safe way. */
+#define FSIOC_CAS_BSDFLAGS      _IOWR('A', 20, struct fsioc_cas_bsdflags)
+
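
A userspace sketch of the compare-and-swap protocol described above, retrying from the observed value until the swap lands; the helper name is hypothetical, and fsctl(2) is assumed available from <sys/fsctl.h>:

    #include <sys/fsctl.h>
    #include <sys/stat.h>

    static int
    set_hidden(const char *path)
    {
        struct stat st;
        if (stat(path, &st) != 0) {
            return -1;
        }
        struct fsioc_cas_bsdflags cas;
        do {
            cas.expected_flags = st.st_flags;           /* what we think is there */
            cas.new_flags      = st.st_flags | UF_HIDDEN;
            if (fsctl(path, FSIOC_CAS_BSDFLAGS, &cas, 0) != 0) {
                return -1;
            }
            st.st_flags = cas.actual_flags;             /* lost a race: retry from actual */
        } while (cas.actual_flags != cas.expected_flags);
        return 0;   /* expected == actual: the swap took effect */
    }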
 /* Check if a file is only open once (pass zero for the extra arg) */
 #define FSIOC_FD_ONLY_OPEN_ONCE _IOWR('A', 21, uint32_t)
 
@@ -346,6 +348,14 @@ typedef struct disk_conditioner_info {
 #define FSIOC_THAW_EXTENTS                              _IO('h', 21)
 #define FSCTL_THAW_EXTENTS                              IOCBASECMD(FSIOC_THAW_EXTENTS)
 
+/* this FSCTL selector is duplicated in XNU with the intent of making the VFS/generic one the only one eventually */
+#define FIRMLINK_STRUCT_LEN 1032
+typedef struct generic_firmlink {
+       uint8_t array[FIRMLINK_STRUCT_LEN];
+} generic_firmlink_t;
+
+#define FSIOC_FIRMLINK_CTL _IOWR ('J', 60, generic_firmlink_t)
+
 #ifndef KERNEL
 
 #include <sys/cdefs.h>
index 8779bc362c22fb2130677aad7f99271b411441ff..4ee5a460aa3c7b97db9c0b2dda7032f2d300cb51 100644 (file)
 // These are special bits that be set in the 32-bit mode
 // field that /dev/fsevents provides.
 //
-#define FSE_MODE_HLINK         (1 << 31)    // notification is for a hard-link
-#define FSE_MODE_LAST_HLINK    (1 << 30)    // link count == 0 on a hard-link delete
-#define FSE_REMOTE_DIR_EVENT   (1 << 29)    // this is a remotely generated directory-level granularity event
-#define FSE_TRUNCATED_PATH     (1 << 28)    // the path for this item had to be truncated
-#define FSE_MODE_CLONE         (1 << 27)    // notification is for a clone
+#define FSE_MODE_HLINK         (1U << 31)    // notification is for a hard-link
+#define FSE_MODE_LAST_HLINK    (1U << 30)    // link count == 0 on a hard-link delete
+#define FSE_REMOTE_DIR_EVENT   (1U << 29)    // this is a remotely generated directory-level granularity event
+#define FSE_TRUNCATED_PATH     (1U << 28)    // the path for this item had to be truncated
+#define FSE_MODE_CLONE         (1U << 27)    // notification is for a clone
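
The U suffixes matter: shifting 1 into bit 31 of a plain int is undefined behavior in C, and the resulting negative value promotes surprisingly in comparisons. With unsigned literals the masks are plain 32-bit constants:

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    is_hlink_event(uint32_t mode)
    {
        /* 1U << 31 is a well-defined 0x80000000; both operands are unsigned */
        return (mode & FSE_MODE_HLINK) != 0;
    }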
 
 // ioctl's on /dev/fsevents
 typedef struct fsevent_clone_args {
index bde5ce6e89e4a662a4c51d225bfe637c6c87736d..75a96e3f455031098feec2ab89fd6cf380bcfef5 100644 (file)
@@ -45,6 +45,7 @@
 #include <sys/types.h>
 #include <sys/mount.h>
 #ifdef __APPLE_API_PRIVATE
+#include <sys/attr.h>
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif  /* __APPLE_API_PRIVATE */
@@ -60,6 +61,18 @@ ssize_t fsgetpath(char *, size_t, fsid_t *, uint64_t) __OSX_AVAILABLE(10.13) __I
 #ifdef PRIVATE
 #include <sys/_types/_fsobj_id_t.h>
 
+#ifndef FSOPT_NOFIRMLINKPATH     /* also in attr.h */
+#define FSOPT_NOFIRMLINKPATH     0x00000080
+#endif
+
+#ifndef FSOPT_ISREALFSID     /* also in attr.h */
+#ifdef  FSOPT_RETURN_REALDEV
+#define FSOPT_ISREALFSID         FSOPT_RETURN_REALDEV
+#else
+#define FSOPT_ISREALFSID         0x00000200
+#endif
+#endif /* FSOPT_ISREALFSID */
+
 #ifdef __APPLE_API_PRIVATE
 
 
@@ -81,6 +94,8 @@ ssize_t fsgetpath(char *, size_t, fsid_t *, uint64_t) __OSX_AVAILABLE(10.13) __I
  */
 int openbyid_np(fsid_t* fsid, fsobj_id_t* objid, int flags);
 
+ssize_t fsgetpath_ext(char *, size_t, fsid_t *, uint64_t, uint32_t) __OSX_AVAILABLE(10.15) __IOS_AVAILABLE(13.0) __TVOS_AVAILABLE(13.0) __WATCHOS_AVAILABLE(6.0);
+
 #endif /* __APPLE_API_PRIVATE */
 #endif /* PRIVATE */
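
A sketch of the extended call; the fsid/objid pair would come from the caller (e.g. an fsevents record), and the buffer is assumed to hold MAXPATHLEN bytes:

    #include <sys/param.h>

    static ssize_t
    real_path(fsid_t *fsid, uint64_t objid, char buf[MAXPATHLEN])
    {
        /* resolve to a path without firmlink translation */
        return fsgetpath_ext(buf, MAXPATHLEN, fsid, objid, FSOPT_NOFIRMLINKPATH);
    }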
 
index c50bf146a252f3d3600f14824526556a1bb1391d..ef7b40d85b10a62fd83f715dd3ab77b21bdc8ab5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -221,31 +221,6 @@ extern struct gmonparam _gmonparam;
 #define GMON_PROF_ERROR 2
 #define GMON_PROF_OFF   3
 
-/*
- * Sysctl definitions for extracting profiling information from the kernel.
- */
-#define GPROF_STATE     0       /* int: profiling enabling variable */
-#define GPROF_COUNT     1       /* struct: profile tick count buffer */
-#define GPROF_FROMS     2       /* struct: from location hash bucket */
-#define GPROF_TOS       3       /* struct: destination/count structure */
-#define GPROF_GMONPARAM 4       /* struct: profiling parameters (see above) */
-
-
-/*
- * Declarations for various profiling related functions from
- * bsd/kern/subr_prof.c
- */
-#ifdef GPROF
-#ifdef XNU_KERNEL_PRIVATE
-
-void kmstartup(void);
-void cfreemem(caddr_t, int);  /* Currently only a stub function. */
-void mcount(uintptr_t, uintptr_t);
-
-#endif /* XNU_KERNEL_PRIVATE */
-#endif /* GPROF */
-
-
 /*
  * In order to support more information than in the original mon.out and
  * gmon.out files there is an alternate gmon.out file format.  The alternate
index 6bd3d8e62a10eb0b21e5c87b42f2d5a923ab9a30..8534410a83d6f8bd11686db67dfbc751e4f665f4 100644 (file)
@@ -131,8 +131,15 @@ struct vnguard_set {
        guardid_t vns_guard;
 };
 
+struct vnguard_getattr {
+       int vga_fd;             /* in */
+       unsigned vga_attrs;     /* out */
+       guardid_t vga_guard;    /* in */
+};
+
 #define VNG_SYSC_PING           0
 #define VNG_SYSC_SET_GUARD      1
+#define VNG_SYSC_GET_ATTR       2
 
 #define VNG_POLICY_NAME         "vnguard"
 
index 7b0f11d9eb4a335d216852da4b48d3ff2cd516f8..e42f1e39d55f2d725b462cf2dc54b8ac65f32f66 100644 (file)
 #ifndef _IMAGEBOOT_H_
 #define _IMAGEBOOT_H_
 
-int     imageboot_needed(void);
-void    imageboot_setup(void);
+typedef enum imageboot_type {
+       IMAGEBOOT_NONE,
+       IMAGEBOOT_DMG,
+       IMAGEBOOT_LOCKER,
+} imageboot_type_t;
+
+imageboot_type_t        imageboot_needed(void);
+void    imageboot_setup(imageboot_type_t type);
 int     imageboot_format_is_valid(const char *root_path);
-int     imageboot_mount_image(const char *root_path, int height);
+int     imageboot_mount_image(const char *root_path, int height, imageboot_type_t type);
 
 #define IMAGEBOOT_CONTAINER_ARG         "container-dmg"
 #define IMAGEBOOT_ROOT_ARG              "root-dmg"
 #define IMAGEBOOT_AUTHROOT_ARG          "auth-root-dmg"
+#if CONFIG_LOCKERBOOT
+#define IMAGEBOOT_LOCKER_ARG "locker"
+#define LOCKERFS_NAME "lockerfs"
+#endif
 
 #endif
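
The boot-time flow implied by the typed API, as a short sketch:

    static void
    imageboot_start_if_needed(void)
    {
        imageboot_type_t type = imageboot_needed();
        if (type != IMAGEBOOT_NONE) {
            imageboot_setup(type);   /* mounts the DMG or locker root */
        }
    }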
index 8d5da2872002e74ca0e0d25e0673837922f15a7a..a0138830e034ed41216cd94fc1f6e1289e23e2b7 100644 (file)
@@ -118,10 +118,12 @@ struct image_params {
        void            *ip_px_spa;
        void            *ip_px_smpx;            /* MAC-specific spawn attrs. */
        void            *ip_px_persona;         /* persona args */
+       void            *ip_px_pcred_info;      /* posix cred args */
        void            *ip_cs_error;           /* codesigning error reason */
 
        uint64_t ip_dyld_fsid;
        uint64_t ip_dyld_fsobjid;
+       unsigned int    ip_simulator_binary;    /* simulator binary flags */
 };
 
 /*
@@ -139,6 +141,14 @@ struct image_params {
 #define IMGPF_EXEC                              0x00000100      /* exec */
 #define IMGPF_HIGH_BITS_ASLR    0x00000200      /* randomize high bits of ASLR slide */
 #define IMGPF_IS_64BIT_DATA             0x00000400      /* exec to a 64Bit register state */
+#define IMGPF_DRIVER             0x00000800      /* exec of a driver binary (no LC_MAIN) */
+#define IMGPF_NOJOP             0x80000000
 
+/*
+ * Simulator binary flags
+ */
+#define IMGPF_SB_DEFAULT         0               /* Default value; not yet checked whether this is a simulator binary */
+#define IMGPF_SB_TRUE            1               /* Binary is a simulator binary */
+#define IMGPF_SB_FALSE           2               /* Binary is not a simulator binary */
 
 #endif  /* !_SYS_IMGACT */
index c3b9b415ff7b4978bd181ec894ca9d34427f293f..1de38642a516ee6b5539b267c6e29b19c5d4bc7a 100644 (file)
 
 #endif /* BSD_KERNEL_PRIVATE */
 
-extern int
-kern_asl_msg_va(int level, const char *facility, int num_pairs,
-    va_list vargs, ...);
-
 extern int
 kern_asl_msg(int level, const char *facility, int num_pairs, ...);
 
index 3a72e0b745d3bbd425c6fd2f12f2f6d19881d6c5..7013904086f0e084a358c8c5a43a034fa6baa916 100644 (file)
@@ -507,7 +507,9 @@ kauth_filesec_t kauth_filesec_alloc(int size);
 void            kauth_filesec_free(kauth_filesec_t fsp);
 extern kauth_scope_t kauth_register_scope(const char *_identifier, kauth_scope_callback_t _callback, void *_idata);
 extern void     kauth_deregister_scope(kauth_scope_t _scope);
+__kpi_deprecated("Use EndpointSecurity instead")
 extern kauth_listener_t kauth_listen_scope(const char *_identifier, kauth_scope_callback_t _callback, void *_idata);
+__kpi_deprecated("Use EndpointSecurity instead")
 extern void     kauth_unlisten_scope(kauth_listener_t _scope);
 extern int      kauth_authorize_action(kauth_scope_t _scope, kauth_cred_t _credential, kauth_action_t _action,
     uintptr_t _arg0, uintptr_t _arg1, uintptr_t _arg2, uintptr_t _arg3);
@@ -624,29 +626,29 @@ __END_DECLS
 /* Actions, also rights bits in an ACE */
 
 #if defined(KERNEL) || defined (_SYS_ACL_H)
-#define KAUTH_VNODE_READ_DATA                   (1<<1)
+#define KAUTH_VNODE_READ_DATA                   (1U<<1)
 #define KAUTH_VNODE_LIST_DIRECTORY              KAUTH_VNODE_READ_DATA
-#define KAUTH_VNODE_WRITE_DATA                  (1<<2)
+#define KAUTH_VNODE_WRITE_DATA                  (1U<<2)
 #define KAUTH_VNODE_ADD_FILE                    KAUTH_VNODE_WRITE_DATA
-#define KAUTH_VNODE_EXECUTE                     (1<<3)
+#define KAUTH_VNODE_EXECUTE                     (1U<<3)
 #define KAUTH_VNODE_SEARCH                      KAUTH_VNODE_EXECUTE
-#define KAUTH_VNODE_DELETE                      (1<<4)
-#define KAUTH_VNODE_APPEND_DATA                 (1<<5)
+#define KAUTH_VNODE_DELETE                      (1U<<4)
+#define KAUTH_VNODE_APPEND_DATA                 (1U<<5)
 #define KAUTH_VNODE_ADD_SUBDIRECTORY            KAUTH_VNODE_APPEND_DATA
-#define KAUTH_VNODE_DELETE_CHILD                (1<<6)
-#define KAUTH_VNODE_READ_ATTRIBUTES             (1<<7)
-#define KAUTH_VNODE_WRITE_ATTRIBUTES            (1<<8)
-#define KAUTH_VNODE_READ_EXTATTRIBUTES          (1<<9)
-#define KAUTH_VNODE_WRITE_EXTATTRIBUTES         (1<<10)
-#define KAUTH_VNODE_READ_SECURITY               (1<<11)
-#define KAUTH_VNODE_WRITE_SECURITY              (1<<12)
-#define KAUTH_VNODE_TAKE_OWNERSHIP              (1<<13)
+#define KAUTH_VNODE_DELETE_CHILD                (1U<<6)
+#define KAUTH_VNODE_READ_ATTRIBUTES             (1U<<7)
+#define KAUTH_VNODE_WRITE_ATTRIBUTES            (1U<<8)
+#define KAUTH_VNODE_READ_EXTATTRIBUTES          (1U<<9)
+#define KAUTH_VNODE_WRITE_EXTATTRIBUTES         (1U<<10)
+#define KAUTH_VNODE_READ_SECURITY               (1U<<11)
+#define KAUTH_VNODE_WRITE_SECURITY              (1U<<12)
+#define KAUTH_VNODE_TAKE_OWNERSHIP              (1U<<13)
 
 /* backwards compatibility only */
 #define KAUTH_VNODE_CHANGE_OWNER                KAUTH_VNODE_TAKE_OWNERSHIP
 
 /* For Windows interoperability only */
-#define KAUTH_VNODE_SYNCHRONIZE                 (1<<20)
+#define KAUTH_VNODE_SYNCHRONIZE                 (1U<<20)
 
 /* (1<<21) - (1<<24) are reserved for generic rights bits */
 
@@ -654,13 +656,13 @@ __END_DECLS
 /*
  * Authorizes the vnode as the target of a hard link.
  */
-#define KAUTH_VNODE_LINKTARGET                  (1<<25)
+#define KAUTH_VNODE_LINKTARGET                  (1U<<25)
 
 /*
  * Indicates that other steps have been taken to authorise the action,
  * but authorisation should be denied for immutable objects.
  */
-#define KAUTH_VNODE_CHECKIMMUTABLE              (1<<26)
+#define KAUTH_VNODE_CHECKIMMUTABLE              (1U<<26)
 
 /* Action modifiers */
 /*
@@ -671,7 +673,7 @@ __END_DECLS
  *
  * This bit will never be present in an ACE.
  */
-#define KAUTH_VNODE_ACCESS                      (1<<31)
+#define KAUTH_VNODE_ACCESS                      (1U<<31)
 
 /*
  * The KAUTH_VNODE_NOIMMUTABLE bit is passed to the callback along with the
@@ -681,7 +683,7 @@ __END_DECLS
  * The system immutable flags are only ignored when the system securelevel
  * is low enough to allow their removal.
  */
-#define KAUTH_VNODE_NOIMMUTABLE                 (1<<30)
+#define KAUTH_VNODE_NOIMMUTABLE                 (1U<<30)
 
 
 /*
@@ -692,7 +694,7 @@ __END_DECLS
  * for an exact match on the last credential to lookup
  * the component being acted on
  */
-#define KAUTH_VNODE_SEARCHBYANYONE              (1<<29)
+#define KAUTH_VNODE_SEARCHBYANYONE              (1U<<29)
 
 
 /*
@@ -758,7 +760,7 @@ void kprintf(const char *fmt, ...);
 # endif /* !_FN_KPRINTF */
 # define KAUTH_DEBUG_ENABLE
 # define K_UUID_FMT "%08x:%08x:%08x:%08x"
-# define K_UUID_ARG(_u) *(int *)&_u.g_guid[0],*(int *)&_u.g_guid[4],*(int *)&_u.g_guid[8],*(int *)&_u.g_guid[12]
+# define K_UUID_ARG(_u) &_u.g_guid_asint[0],&_u.g_guid_asint[1],&_u.g_guid_asint[2],&_u.g_guid_asint[3]
 # define KAUTH_DEBUG(fmt, args...)      do { kprintf("%s:%d: " fmt "\n", __PRETTY_FUNCTION__, __LINE__ , ##args); } while (0)
 # define KAUTH_DEBUG_CTX(_c)            KAUTH_DEBUG("p = %p c = %p", _c->vc_proc, _c->vc_ucred)
 # define VFS_DEBUG(_ctx, _vp, fmt, args...)                                             \
index 203b8dc57e29cc176402f68fbfa0dc424abc3d81..03c7af88f8286640a30aae222055742717fe2b53 100644 (file)
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-/*
- * kdebug.h - kernel_debug definitions
- */
-
 #ifndef BSD_SYS_KDEBUG_H
 #define BSD_SYS_KDEBUG_H
 
@@ -40,17 +36,12 @@ __BEGIN_DECLS
 
 #ifdef __APPLE_API_UNSTABLE
 
-#include <mach/clock_types.h>
-#include <stdint.h>
-
-#ifndef KERNEL
-#include <Availability.h>
-#endif
-
 /*
- * Kdebug is a facility for tracing events occurring on a system.
+ * Kdebug is a kernel facility for tracing events occurring on a system.  User
+ * space processes should prefer os_signpost instead.
  *
- * All events are tagged with a 32-bit debugid:
+ * This header defines reserved debugids, which are 32-bit values that describe
+ * each event:
  *
  * +----------------+----------------+----------------------------+----+
  * |   Class (8)    |  Subclass (8)  |          Code (14)         |Func|
@@ -91,9 +82,9 @@ __BEGIN_DECLS
 
 /* Generate an eventid corresponding to Class, SubClass, and Code. */
 #define KDBG_EVENTID(Class, SubClass, Code)                \
-       ((((Class)    &   0xff) << KDBG_CLASS_OFFSET)    | \
-        (((SubClass) &   0xff) << KDBG_SUBCLASS_OFFSET) | \
-        (((Code)     & 0x3fff) << KDBG_CODE_OFFSET))
+       (((unsigned)((Class)    &   0xff) << KDBG_CLASS_OFFSET)    | \
+        ((unsigned)((SubClass) &   0xff) << KDBG_SUBCLASS_OFFSET) | \
+        ((unsigned)((Code)     & 0x3fff) << KDBG_CODE_OFFSET))
 /* Deprecated macro using old naming convention. */
 #define KDBG_CODE(Class, SubClass, Code) \
        KDBG_EVENTID(Class, SubClass, Code)
@@ -107,77 +98,16 @@ __BEGIN_DECLS
        ((uint16_t)(((Debugid) & KDBG_CSC_MASK) >> KDBG_CSC_OFFSET))
 #define KDBG_EXTRACT_CODE(Debugid) \
        ((uint16_t)(((Debugid) & KDBG_CODE_MASK) >> KDBG_CODE_OFFSET))
+#define KDBG_CLASS_ENCODE(Class, SubClass) KDBG_EVENTID(Class, SubClass, 0)
+#define KDBG_CLASS_DECODE(Debugid) (Debugid & KDBG_CSC_MASK)
 
 /* function qualifiers  */
-#define DBG_FUNC_START 1
-#define DBG_FUNC_END   2
-#define DBG_FUNC_NONE  0
-
-/*
- * Definitions to support IOP tracing.
- */
-
-#ifdef KERNEL_PRIVATE
-
-typedef enum {
-       /* Trace is now enabled; no arguments.  */
-       KD_CALLBACK_KDEBUG_ENABLED,
-       /* Trace is now disabled; no arguments.  */
-       KD_CALLBACK_KDEBUG_DISABLED,
-       /*
-        * Request the latest entries from the IOP and block until complete; no
-        * arguments.
-        */
-       KD_CALLBACK_SYNC_FLUSH,
-       /*
-        * The typefilter is enabled; a read-only pointer to the typefilter is
-        * provided, valid only while in the callback.
-        */
-       KD_CALLBACK_TYPEFILTER_CHANGED,
-} kd_callback_type;
-typedef void (*kd_callback_fn) (void* context, kd_callback_type reason, void* arg);
-
-struct kd_callback {
-       kd_callback_fn func;
-       void *context;
-       /* name of IOP, NUL-terminated */
-       char iop_name[8];
-};
-
-typedef struct kd_callback kd_callback_t;
-
-/*
- * Registers an IOP for participation in tracing.
- *
- * The registered callback function will be called with the
- * supplied context as the first argument, followed by a
- * kd_callback_type and an associated void* argument.
- *
- * The return value is a nonzero coreid that shall be used in
- * kernel_debug_enter() to refer to your IOP. If the allocation
- * failed, then 0 will be returned.
- *
- * Caveats:
- * Note that not all callback calls will indicate a change in
- * state (e.g. disabling trace twice would send two disable
- * notifications).
- */
-extern int kernel_debug_register_callback(kd_callback_t callback);
-
-extern void kernel_debug_enter(
-       uint32_t coreid,
-       uint32_t debugid,
-       uint64_t timestamp,
-       uintptr_t arg1,
-       uintptr_t arg2,
-       uintptr_t arg3,
-       uintptr_t arg4,
-       uintptr_t threadid
-       );
-
-#endif /* KERNEL_PRIVATE */
+#define DBG_FUNC_START 1U
+#define DBG_FUNC_END   2U
+#define DBG_FUNC_NONE  0U
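
A quick check that the new encode/decode helpers invert each other, composing a debugid from class, subclass, code, and a function qualifier (constants taken from later in this header):

    static void
    debugid_demo(void)
    {
        uint32_t eventid = KDBG_EVENTID(DBG_MACH, DBG_MACH_SCHED_CLUTCH,
                                        MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_START;
        /* KDBG_CLASS_DECODE strips the code and function-qualifier bits */
        assert(KDBG_CLASS_DECODE(eventid) ==
               KDBG_CLASS_ENCODE(DBG_MACH, DBG_MACH_SCHED_CLUTCH));
    }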
 
 /* The Kernel Debug Classes  */
+
 #define DBG_MACH        1
 #define DBG_NETWORK     2
 #define DBG_FSYSTEM     3
@@ -196,6 +126,7 @@ extern void kernel_debug_enter(
 #define DBG_QT          32
 #define DBG_APPS        33
 #define DBG_LAUNCHD     34
+#define DBG_SILICON     35
 #define DBG_PERF        37
 #define DBG_IMPORTANCE  38
 #define DBG_BANK        40
@@ -209,149 +140,8 @@ extern void kernel_debug_enter(
 #define DBG_UMALLOC     51
 #define DBG_TURNSTILE   53
 
-
 #define DBG_MIG         255
 
-#ifdef PRIVATE
-
-/*
- * Private kdebug userspace API
- */
-#ifndef KERNEL
-#include <stdbool.h>
-
-/*
- * OS components can use the full precision of the "code" field
- * (Class, SubClass, Code) to inject events using kdebug_trace() by
- * using:
- *
- * kdebug_trace(KDBG_CODE(DBG_XPC, 15, 1) | DBG_FUNC_NONE, 1, 2, 3, 4);
- *
- * These trace points can be included in production code, since they
- * use reserved, non-overlapping ranges. The performance impact when
- * kernel tracing is not enabled is minimal. Classes can be reserved
- * by filing a Radar in xnu|all.
- *
- * 64-bit arguments may be truncated if the system is using a 32-bit
- * kernel.
- *
- * On error, -1 will be returned and errno will indicate the error.
- */
-extern int kdebug_trace(
-       uint32_t code,
-       uint64_t arg1,
-       uint64_t arg2,
-       uint64_t arg3,
-       uint64_t arg4)
-__OSX_AVAILABLE(10.10.2) __IOS_AVAILABLE(8.2);
-
-/*!
- * @function kdebug_trace_string
- *
- * @discussion
- * This function emits strings to kdebug trace along with an ID and allows
- * for previously-traced strings to be overwritten and invalidated.
- *
- * To start tracing a string and generate an ID to use to refer to it:
- *
- *      string_id = kdebug_trace_string(debugid, 0, "string");
- *
- * To replace a string previously traced:
- *
- *      string_id = kdebug_trace_string(debugid, string_id, "new string");
- *
- * To invalidate a string ID:
- *
- *      string_id = kdebug_trace_string(debugid, string_id, NULL);
- *
- * To check for errors:
- *
- *      if ((int64_t)string_id == -1) { perror("string error") }
- *
- * @param debugid
- * The `debugid` to check if its enabled before tracing and include as
- * an argument in the event containing the string.
- *
- * Some classes or subclasses are reserved for specific uses and are not
- * allowed to be used with this function.  No function qualifiers are
- * allowed on `debugid`.
- *
- * @param str_id
- * When 0, a new ID will be generated and returned if tracing is
- * enabled.
- *
- * Otherwise `str_id` must contain an ID that was previously generated
- * with this function.  Clents should pass NULL in `str` if `str_id`
- * is no longer in use.  Otherwise, the string previously mapped to
- * `str_id` will be overwritten with the contents of `str`.
- *
- * @param str
- * A NUL-terminated 'C' string containing the characters that should be
- * traced alongside `str_id`.
- *
- * If necessary, the string will be truncated at an
- * implementation-defined length.  The string must not be the empty
- * string, but can be NULL if a valid `str_id` is provided.
- *
- * @return
- * 0 if tracing is disabled or `debugid` is being filtered out of trace.
- * It can also return (int64_t)-1 if an error occured. Otherwise,
- * it returns the ID to use to refer to the string in future
- * kdebug_trace(2) calls.
- *
- * The errors that can occur are:
- *
- * EINVAL
- *      There are function qualifiers on `debugid`, `str` is empty, or
- *      `str_id` was not generated by this function.
- * EPERM
- *      The `debugid`'s class or subclass is reserved for internal use.
- * EFAULT
- *      `str` is an invalid address or NULL when `str_id` is 0.
- */
-extern uint64_t kdebug_trace_string(uint32_t debugid, uint64_t str_id,
-    const char *str)
-__OSX_AVAILABLE(10.11) __IOS_AVAILABLE(9.0);
-
-/*
- * Although the performance impact of kdebug_trace() when kernel
- * tracing is not enabled is minimal, it may require the caller to
- * perform an expensive calculation/summarization. This cost can be
- * skipped by checking the kdebug_is_enabled() predicate:
- *
- * if (kdebug_is_enabled(KDBG_CODE(DBG_XPC, 15, 1))) {
- *     uint64_t arg1 = ...;
- *     uint64_t arg2 = ...;
- *     kdebug_trace(KDBG_CODE(DBG_XPC, 15, 1) | DBG_FUNC_NONE, arg1, arg2, 0, 0);
- * }
- *
- * If tracing is enabled for the code at the time of the check, 1
- * will be returned. Otherwise, 0 will be returned.
- */
-extern bool kdebug_is_enabled(uint32_t code)
-__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0)
-__WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0);
-
-/*
- * Returns a pointer to the userspace typefilter, if one is available.
- * May return NULL.
- */
-extern void *kdebug_typefilter(void)
-__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0)
-__WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0);
-
-#endif /* !KERNEL (Private kdebug userspace API) */
-#endif /* PRIVATE */
-
-#ifdef XNU_KERNEL_PRIVATE
-/* Used in early boot to log strings spanning only a single tracepoint. */
-extern void kernel_debug_string_early(const char *message);
-/* Used to trace strings within kdebug tracepoints on arbitrary eventids. */
-extern void kernel_debug_string_simple(uint32_t eventid, const char *str);
-/* Only used by ktrace to reset kdebug.  ktrace_lock must be held. */
-extern void kdebug_reset(void);
-#endif /* XNU_KERNEL_PRIVATE */
-
 /* **** The Kernel Debug Sub Classes for Mach (DBG_MACH) **** */
 #define DBG_MACH_EXCP_KTRAP_x86 0x02 /* Kernel Traps on x86 */
 #define DBG_MACH_EXCP_DFLT      0x03 /* Data Translation Fault */
@@ -387,6 +177,7 @@ extern void kdebug_reset(void);
 #define DBG_MACH_THREAD_GROUP   0xA6 /* Thread groups */
 #define DBG_MACH_COALITION      0xA7 /* Coalitions */
 #define DBG_MACH_SHAREDREGION   0xA8 /* Shared region */
+#define DBG_MACH_SCHED_CLUTCH   0xA9 /* Clutch scheduler */
 #define DBG_MACH_IO             0xAA /* I/O */
 
 /* Codes for DBG_MACH_IO */
@@ -456,10 +247,20 @@ extern void kdebug_reset(void);
 #define MACH_AMP_SIGNAL_SPILL      0x32 /* AMP spill signal sent to cpuid */
 #define MACH_AMP_STEAL             0x33 /* AMP thread stolen or spilled */
 #define MACH_SCHED_LOAD_EFFECTIVE  0x34 /* Effective scheduler load */
-#define MACH_PROMOTED              0x35 /* thread promoted due to mutex priority promotion */
-#define MACH_UNPROMOTED            0x36 /* thread unpromoted due to mutex priority promotion */
-#define MACH_PROMOTED_UPDATE       0x37 /* thread already promoted, but promotion priority changed */
+/* unused  MACH_PROMOTED              0x35 was: thread promoted due to mutex priority promotion */
+/* unused  MACH_UNPROMOTED            0x36 was: thread unpromoted due to mutex priority promotion */
+/* unused  MACH_PROMOTED_UPDATE       0x37 was: thread already promoted, but promotion priority changed */
 #define MACH_QUIESCENT_COUNTER     0x38 /* quiescent counter tick */
+#define MACH_TURNSTILE_USER_CHANGE 0x39 /* base priority change because of turnstile */
+#define MACH_AMP_RECOMMENDATION_CHANGE 0x3a /* Thread group recommendation change */
+#define MACH_TURNSTILE_KERNEL_CHANGE 0x40 /* sched priority change because of turnstile */
+
+/* Codes for Clutch Scheduler (DBG_MACH_SCHED_CLUTCH) */
+#define MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE     0x0
+#define MACH_SCHED_CLUTCH_TG_BUCKET_STATE       0x1
+#define MACH_SCHED_CLUTCH_THREAD_SELECT         0x2
+#define MACH_SCHED_CLUTCH_THREAD_STATE          0x3
+#define MACH_SCHED_CLUTCH_TG_BUCKET_PRI         0x4
 
 /* Variants for MACH_MULTIQ_DEQUEUE */
 #define MACH_MULTIQ_BOUND     1
@@ -491,7 +292,8 @@ extern void kdebug_reset(void);
 #define MACH_IPC_VOUCHER_DESTROY                0x9     /* Voucher removed from global voucher hashtable */
 #define MACH_IPC_KMSG_INFO                      0xa     /* Send/Receive info for a kmsg */
 #define MACH_IPC_KMSG_LINK                      0xb     /* link a kernel kmsg pointer to user mach_msg_header_t */
-#define MACH_IPC_PORT_ENTRY_MODIFY      0xc     /* A port space gained or lost a port right (reference) */
+#define MACH_IPC_PORT_ENTRY_MODIFY              0xc     /* A port space gained or lost a port right (reference) */
+#define MACH_IPC_DESTROY_GUARDED_DESC           0xd     /* Unable to receive a guarded descriptor */
 
 /* Codes for thread groups (DBG_MACH_THREAD_GROUP) */
 #define MACH_THREAD_GROUP_NEW           0x0
@@ -530,6 +332,7 @@ extern void kdebug_reset(void);
 #define PMAP__SWITCH            0x12
 #define PMAP__TTE               0x13
 #define PMAP__SWITCH_USER_TTB   0x14
+#define PMAP__UPDATE_CACHING    0x15
 
 /* Codes for clock (DBG_MACH_CLOCK) */
 #define MACH_EPOCH_CHANGE       0x0     /* wake epoch change */
@@ -654,6 +457,12 @@ extern void kdebug_reset(void);
 #define DBG_HIBERNATE           51      /* hibernation related events */
 #define DBG_IOTHUNDERBOLT       52      /* Thunderbolt */
 #define DBG_BOOTER              53      /* booter related events */
+#define DBG_IOAUDIO2            54      /* Audio (extended) */
+
+#define DBG_IOSURFACEPA         64      /* IOSurface page mappings */
+#define DBG_IOMDPA              65      /* IOMemoryDescriptor page mappings */
+#define DBG_IODARTPA            66      /* DART page mappings */
+/* **** 67-79 reserved for physical address mapping information **** */
 
 /* Backwards compatibility */
 #define DBG_IOPOINTING          DBG_IOHID                       /* OBSOLETE: Use DBG_IOHID instead */
@@ -686,6 +495,9 @@ extern void kdebug_reset(void);
 #define DBG_DRVSMC           25 /* System Management Controller */
 #define DBG_DRVMACEFIMANAGER 26 /* Mac EFI Manager */
 #define DBG_DRVANE           27 /* ANE */
+#define DBG_DRVETHERNET      28 /* Ethernet */
+#define DBG_DRVMCC           29 /* Memory Cache Controller */
+#define DBG_DRVACCESSORY     30 /* Accessories */
 
 /* Backwards compatibility */
 #define DBG_DRVPOINTING         DBG_DRVHID      /* OBSOLETE: Use DBG_DRVHID instead */
@@ -698,12 +510,7 @@ extern void kdebug_reset(void);
 #define DBG_DLIL_PR_FLT 4       /* DLIL Protocol Filter */
 #define DBG_DLIL_IF_FLT 5       /* DLIL Interface Filter */
 
-
-/*
- * The Kernel Debug Sub Classes for File System (DBG_FSYSTEM)
- *
- * Please NOTE: sub class values 0xC and 0xD are currently unused.
- */
+/* The Kernel Debug Sub Classes for File System (DBG_FSYSTEM) */
 #define DBG_FSRW      0x1     /* reads and writes to the filesystem */
 #define DBG_DKRW      0x2     /* reads and writes to the disk */
 #define DBG_FSVN      0x3     /* vnode operations (inc. locking/unlocking) */
@@ -720,6 +527,8 @@ extern void kdebug_reset(void);
 #define DBG_ACFS      0x10    /* Xsan-specific events; see the XsanFS project */
 #define DBG_THROTTLE  0x11    /* I/O Throttling events */
 #define DBG_DECMP     0x12    /* Decmpfs-specific events */
+#define DBG_VFS       0x13    /* VFS layer events */
+#define DBG_LIVEFS    0x14    /* LiveFS events; see the UserFS project */
 #define DBG_CONTENT_PROT 0xCF /* Content Protection Events: see bsd/sys/cprotect.h */
 
 /*
@@ -756,7 +565,7 @@ extern void kdebug_reset(void);
 #define BSD_MEMSTAT_JETSAM           2  /* LRU jetsam */
 #define BSD_MEMSTAT_JETSAM_HIWAT     3  /* highwater jetsam */
 #define BSD_MEMSTAT_FREEZE           4  /* freeze process */
-#define BSD_MEMSTAT_LATENCY_COALESCE 5  /* delay imposed to coalesce jetsam reports */
+#define BSD_MEMSTAT_FREEZE_SCAN      5  /* select a process to freeze and freeze it */
 #define BSD_MEMSTAT_UPDATE           6  /* priority update */
 #define BSD_MEMSTAT_IDLE_DEMOTE      7  /* idle demotion fired */
 #define BSD_MEMSTAT_CLEAR_ERRORS     8  /* reset termination error state */
@@ -769,6 +578,9 @@ extern void kdebug_reset(void);
 #define BSD_MEMSTAT_CHANGE_PRIORITY 14  /* priority changed */
 #endif /* PRIVATE */
 #define BSD_MEMSTAT_FAST_JETSAM     15  /* Aggressive jetsam ("clear-the-deck") */
+#define BSD_MEMSTAT_COMPACTOR_RUN   16  /* run VM compactor after process kill */
+#define BSD_MEMSTAT_FREEZE_DISABLE  17  /* disable freeze and kill frozen processes */
+#define BSD_MEMSTAT_RELAUNCH_FLAGS  18  /* flags representing jetsam behavior; based on launchd data */
 
 /* Codes for BSD subcode class DBG_BSD_KEVENT */
 #define BSD_KEVENT_KQ_PROCESS_BEGIN   1
@@ -833,9 +645,11 @@ extern void kdebug_reset(void);
 #define DBG_MT_TMPCPU 0xff
 
 /* The Kernel Debug Sub Classes for DBG_MISC */
-#define DBG_EVENT       0x10
-#define DBG_MISC_LAYOUT 0x1a
-#define DBG_BUFFER      0x20
+#define DBG_EVENT              0x10
+#define DBG_MISC_INSTRUMENTS   0x11
+#define DBG_MISC_INSTRUMENTSBT 0x12
+#define DBG_MISC_LAYOUT        0x1a
+#define DBG_BUFFER             0x20
 
 /* The Kernel Debug Sub Classes for DBG_DYLD */
 #define DBG_DYLD_UUID (5)
@@ -890,21 +704,57 @@ extern void kdebug_reset(void);
 #define IO_THROTTLE_DISABLE     0x3
 #define IO_TIER_UPL_MISMATCH    0x4
 
-
 /* Subclasses for MACH Importance Policies (DBG_IMPORTANCE) */
 /* TODO: Split up boost and task policy? */
-#define IMP_ASSERTION           0x10    /* Task takes/drops a boost assertion */
-#define IMP_BOOST               0x11    /* Task boost level changed */
-#define IMP_MSG                 0x12    /* boosting message sent by donating task on donating port */
-#define IMP_WATCHPORT           0x13    /* port marked as watchport, and boost was transferred to the watched task */
-#define IMP_TASK_SUPPRESSION    0x17    /* Task changed suppression behaviors */
-#define IMP_TASK_APPTYPE        0x18    /* Task launched with apptype */
-#define IMP_UPDATE              0x19    /* Requested -> effective calculation */
-#define IMP_USYNCH_QOS_OVERRIDE 0x1A    /* Userspace synchronization applied QoS override to resource owning thread */
-#define IMP_DONOR_CHANGE        0x1B    /* The iit_donor bit changed */
-#define IMP_MAIN_THREAD_QOS     0x1C    /* The task's main thread QoS was set */
-#define IMP_SYNC_IPC_QOS        0x1D    /* Sync IPC QOS override */
-/* DBG_IMPORTANCE subclasses  0x20 - 0x3F reserved for task policy flavors */
+#define IMP_ASSERTION                       0x10    /* Task takes/drops a boost assertion */
+#define IMP_BOOST                           0x11    /* Task boost level changed */
+#define IMP_MSG                             0x12    /* boosting message sent by donating task on donating port */
+#define IMP_WATCHPORT                       0x13    /* port marked as watchport, and boost was transferred to the watched task */
+#define IMP_TASK_SUPPRESSION                0x17    /* Task changed suppression behaviors */
+#define IMP_TASK_APPTYPE                    0x18    /* Task launched with apptype */
+#define IMP_UPDATE                          0x19    /* Requested -> effective calculation */
+#define IMP_USYNCH_QOS_OVERRIDE             0x1A    /* Userspace synchronization applied QoS override to resource owning thread */
+#define IMP_DONOR_CHANGE                    0x1B    /* The iit_donor bit changed */
+#define IMP_MAIN_THREAD_QOS                 0x1C    /* The task's main thread QoS was set */
+#define IMP_SYNC_IPC_QOS                    0x1D    /* Sync IPC QOS override */
+/* DBG_IMPORTANCE subclasses  0x20 - 0x3F are reserved for task policy flavors */
+
+/* thread and task attributes */
+#define IMP_TASK_POLICY_DARWIN_BG           0x21
+#define IMP_TASK_POLICY_IOPOL               0x22
+#define IMP_TASK_POLICY_IO                  0x23
+#define IMP_TASK_POLICY_PASSIVE_IO          0x24
+
+/* task only attributes */
+#define IMP_TASK_POLICY_DARWIN_BG_IOPOL     0x27
+#define IMP_TASK_POLICY_TAL                 0x28
+#define IMP_TASK_POLICY_BOOST               0x29
+#define IMP_TASK_POLICY_ROLE                0x2A
+/* unused                                   0x2B */
+#define IMP_TASK_POLICY_TERMINATED          0x2C
+#define IMP_TASK_POLICY_NEW_SOCKETS_BG      0x2D
+#define IMP_TASK_POLICY_SUP_ACTIVE          0x2E
+#define IMP_TASK_POLICY_LATENCY_QOS         0x2F
+#define IMP_TASK_POLICY_THROUGH_QOS         0x30
+#define IMP_TASK_POLICY_WATCHERS_BG         0x31
+
+#define IMP_TASK_POLICY_SFI_MANAGED         0x34
+#define IMP_TASK_POLICY_ALL_SOCKETS_BG      0x37
+
+#define IMP_TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS  0x39 /* latency as value1, throughput as value2 */
+#define IMP_TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS  0x3A /* latency as value1, throughput as value2 */
+
+/* thread only attributes */
+#define IMP_TASK_POLICY_PIDBIND_BG          0x32
+/* unused                                   0x33 */
+/* reserved                                 0x35 */
+#define IMP_TASK_POLICY_QOS_OVERRIDE        0x36
+#define IMP_TASK_POLICY_QOS_AND_RELPRIO     0x38 /* QoS as value1, relative priority as value2 */
+#define IMP_TASK_POLICY_QOS_WORKQ_OVERRIDE  0x3B
+#define IMP_TASK_POLICY_QOS_PROMOTE         0x3C
+#define IMP_TASK_POLICY_QOS_KEVENT_OVERRIDE 0x3D
+#define IMP_TASK_POLICY_QOS_IPC_OVERRIDE    IMP_TASK_POLICY_QOS_KEVENT_OVERRIDE /* legacy name */
+#define IMP_TASK_POLICY_QOS_SERVICER_OVERRIDE 0x3E
 
 /* Codes for IMP_ASSERTION */
 #define IMP_HOLD                0x2     /* Task holds a boost assertion */
@@ -998,8 +848,8 @@ extern void kdebug_reset(void);
 
 /**********************************************************************/
 
-#define KDBG_MIGCODE(msgid) ((DBG_MIG << KDBG_CLASS_OFFSET) | \
-                            (((msgid) & 0x3fffff) << KDBG_CODE_OFFSET))
+#define KDBG_MIGCODE(msgid) (((unsigned)DBG_MIG << KDBG_CLASS_OFFSET) | \
+                            ((unsigned)((msgid) & 0x3fffff) << KDBG_CODE_OFFSET))
 
 #define MACHDBG_CODE(SubClass, code) KDBG_CODE(DBG_MACH, SubClass, code)
 #define NETDBG_CODE(SubClass, code) KDBG_CODE(DBG_NETWORK, SubClass, code)
@@ -1008,6 +858,7 @@ extern void kdebug_reset(void);
 #define IOKDBG_CODE(SubClass, code) KDBG_CODE(DBG_IOKIT, SubClass, code)
 #define DRVDBG_CODE(SubClass, code) KDBG_CODE(DBG_DRIVERS, SubClass, code)
 #define TRACEDBG_CODE(SubClass, code) KDBG_CODE(DBG_TRACE, SubClass, code)
+#define SILICONDBG_CODE(SubClass, code) KDBG_CODE(DBG_SILICON, SubClass, code)
 #define MISCDBG_CODE(SubClass, code) KDBG_CODE(DBG_MISC, SubClass, code)
 #define DLILDBG_CODE(SubClass, code) KDBG_CODE(DBG_DLIL, SubClass, code)
 #define SECURITYDBG_CODE(SubClass, code) KDBG_CODE(DBG_SECURITY, SubClass, code)
@@ -1029,7 +880,6 @@ extern void kdebug_reset(void);
 
 #define PMAP_CODE(code) MACHDBG_CODE(DBG_MACH_PMAP, code)
 
-
 #define IMPORTANCE_CODE(SubClass, code) KDBG_CODE(DBG_IMPORTANCE, (SubClass), (code))
 #define BANK_CODE(SubClass, code) KDBG_CODE(DBG_BANK, (SubClass), (code))
 #define ATM_CODE(SubClass, code) KDBG_CODE(DBG_ATM, (SubClass), (code))
@@ -1039,824 +889,20 @@ extern void kdebug_reset(void);
 #define COREDUETDBG_CODE(code) DAEMONDBG_CODE(DBG_DAEMON_COREDUET, code)
 #define POWERDDBG_CODE(code) DAEMONDBG_CODE(DBG_DAEMON_POWERD, code)
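
As a hedged illustration of how these wrappers compose a debugid from class,
subclass, and code (the event names below are hypothetical):

    /* hypothetical events built with the convenience wrappers above */
    #define MY_SCHED_EVENT MACHDBG_CODE(DBG_MACH_SCHED, 0x99)
    #define MY_VFS_EVENT   FSDBG_CODE(DBG_VFS, 0x1)

    /* equivalent to spelling out the generic form directly */
    #define MY_SCHED_EVENT_ALT KDBG_CODE(DBG_MACH, DBG_MACH_SCHED, 0x99)
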
 
-/*
- * To use kdebug in the kernel:
- *
- * #include <sys/kdebug.h>
- *
- * #define DBG_NETIPINIT NETDBG_CODE(DBG_NETIP, 1)
- *
- * void
- * ip_init(void)
- * {
- *     KDBG(DBG_NETIPINIT | DBG_FUNC_START, 1, 2, 3, 4);
- *     ...
- *     KDBG(DBG_NETIPINIT);
- *     ...
- *     KDBG(DBG_NETIPINIT | DBG_FUNC_END);
- * }
- */
-
-#ifdef KERNEL_PRIVATE
-
-/*
- * The KDBG{,_DEBUG,_RELEASE,_FILTERED} macros are the preferred method of
- * making tracepoints.
- *
- * Kernel pointers must be unslid or permuted using VM_KERNEL_UNSLIDE_OR_PERM.
- * Do not trace any sensitive data.
- */
-
-/*
- * Traced on debug and development (and release macOS) kernels.
- */
-#define KDBG(x, ...) KDBG_(, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
-
-/*
- * Traced on debug and development (and release macOS) kernels if explicitly
- * requested.  Omitted from tracing without a typefilter.
- */
-#define KDBG_FILTERED(x, ...) KDBG_(_FILTERED, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
-
-/*
- * Traced on debug and development (and release macOS) kernels, even if the
- * process filter would reject it.
- */
-#define KDBG_RELEASE_NOPROCFILT(x, ...) \
-               KDBG_(_RELEASE_NOPROCFILT, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
-
-/*
- * Traced on debug, development, and release kernels.
- *
- * Only use this tracepoint if the events are required for a shipping trace
- * tool.
- */
-#define KDBG_RELEASE(x, ...) KDBG_(_RELEASE, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
-
-/*
- * Traced only on debug kernels.
- */
-#define KDBG_DEBUG(x, ...) KDBG_(_DEBUG, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
-
-#define KDBG_(f, x, a, b, c, d, n, ...) KDBG##n(f, x, a, b, c, d)
-#define KDBG0(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, 0, 0, 0, 0, 0)
-#define KDBG1(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, 0, 0, 0, 0)
-#define KDBG2(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, 0, 0, 0)
-#define KDBG3(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, c, 0, 0)
-#define KDBG4(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, c, d, 0)
-
-#endif /* defined(KERNEL_PRIVATE) */
-
-extern unsigned int kdebug_enable;
-
-/*
- * Bits used by kdebug_enable.  These control which events are traced at
- * runtime.
- */
-#define KDEBUG_ENABLE_TRACE   (1U << 0)
-#define KDEBUG_ENABLE_ENTROPY (1U << 1) /* obsolete */
-#define KDEBUG_ENABLE_CHUD    (1U << 2) /* obsolete */
-#define KDEBUG_ENABLE_PPT     (1U << 3)
-#define KDEBUG_ENABLE_SERIAL  (1U << 4)
-
-#define KDEBUG_TRACE (KDEBUG_ENABLE_TRACE)
-
-/*
- * Specify KDEBUG_PPT to indicate that the event belongs to the limited PPT set.
- * PPT is deprecated -- use a typefilter and the PPTDBG class instead.
- */
-#define KDEBUG_PPT    (KDEBUG_ENABLE_PPT)
-#define KDEBUG_COMMON (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_PPT)
-
-/*
- * The kernel debug configuration level.  These values control which events are
- * compiled in under different build configurations.
- *
- * Infer the supported kernel debug event level from config option.  Use
- * (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) as a guard to protect unaudited debug
- * code.
- */
-#define KDEBUG_LEVEL_NONE     0
-#define KDEBUG_LEVEL_IST      1
-#define KDEBUG_LEVEL_STANDARD 2
-#define KDEBUG_LEVEL_FULL     3
-
-#if NO_KDEBUG
-#define KDEBUG_LEVEL KDEBUG_LEVEL_NONE
-#elif IST_KDEBUG
-#define KDEBUG_LEVEL KDEBUG_LEVEL_IST
-// currently configured for the iOS release kernel
-#elif KDEBUG
-#define KDEBUG_LEVEL KDEBUG_LEVEL_FULL
-#else
-#define KDEBUG_LEVEL KDEBUG_LEVEL_STANDARD
-/*
- * Currently, all other kernel configurations (development, etc) build with
- * KDEBUG_LEVEL_STANDARD.  As a result, KERNEL_DEBUG_CONSTANT*() are on by
- * default but KERNEL_DEBUG*() are not.
- */
-#endif
-
-#ifdef XNU_KERNEL_PRIVATE
-#define KDBG_IMPROBABLE __improbable
-#else
-#define KDBG_IMPROBABLE
-#endif
-
-/*
- * KERNEL_DEBUG_CONSTANT_FILTERED events are omitted from tracing unless they
- * are explicitly requested in the typefilter.  They are not emitted when
- * tracing without a typefilter.
- */
-#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
-#define KERNEL_DEBUG_CONSTANT_FILTERED(x, a, b, c, d, ...)           \
-       do {                                                             \
-               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {   \
-                       kernel_debug_filtered((x), (uintptr_t)(a), (uintptr_t)(b),  \
-                               (uintptr_t)(c), (uintptr_t)(d)); \
-               }                                                            \
-       } while (0)
-#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
-#define KERNEL_DEBUG_CONSTANT_FILTERED(type, x, a, b, c, d, ...) do {} while (0)
-#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
-
-#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
-#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...)   \
-       do {                                                               \
-               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {     \
-                       kernel_debug_flags((x), (uintptr_t)(a), (uintptr_t)(b),    \
-                               (uintptr_t)(c), (uintptr_t)(d), KDBG_FLAG_NOPROCFILT); \
-               }                                                              \
-       } while (0)
-#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
-#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...) \
-       do { } while (0)
-#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
-
-
-#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
-#define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e)                               \
-       do {                                                                      \
-               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {            \
-                       kernel_debug((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \
-                               (uintptr_t)(d),(uintptr_t)(e));                               \
-               }                                                                     \
-       } while (0)
-
-/*
- * DO NOT USE THIS MACRO -- it breaks fundamental assumptions about ktrace and
- * is only meant to be used by the pthread kext and other points in the kernel
- * where the thread ID must be provided explicitly.
- */
-#define KERNEL_DEBUG_CONSTANT1(x, a, b, c, d, e)                               \
-       do {                                                                       \
-               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {             \
-                       kernel_debug1((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \
-                       (uintptr_t)(d), (uintptr_t)(e));                                   \
-               }                                                                      \
-       } while (0)
-
-#define KERNEL_DEBUG_EARLY(x, a, b, c, d)                                 \
-       do {                                                                  \
-               kernel_debug_early((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \
-                       (uintptr_t)(c), (uintptr_t)(d));                              \
-       } while (0)
-#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
-#define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) do {} while (0)
-#define KERNEL_DEBUG_CONSTANT1(x, a, b, c, d, e) do {} while (0)
-#define KERNEL_DEBUG_EARLY(x, a, b, c, d) do {} while (0)
-#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
-
-/*
- * KERNEL_DEBUG_CONSTANT_IST (in-system trace) events provide an audited subset
- * of tracepoints for userland system tracing tools.  This tracing level was
- * created by 8857227 to protect fairplayd and other PT_DENY_ATTACH processes.
- * It has two effects: only KERNEL_DEBUG_CONSTANT_IST() traces are emitted and
- * any PT_DENY_ATTACH processes will only emit basic traces as defined by the
- * kernel_debug_filter() routine.
- */
-#define KERNEL_DEBUG_CONSTANT_RELEASE(x, a, b, c, d, e) \
-       KERNEL_DEBUG_CONSTANT_IST(~KDEBUG_ENABLE_PPT, x, a, b, c, d, 0)
-
-#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
-#define KERNEL_DEBUG_CONSTANT_IST(type, x, a, b, c, d, e)                     \
-       do {                                                                      \
-               if (KDBG_IMPROBABLE(kdebug_enable & (type))) {                        \
-                       kernel_debug((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \
-                               (uintptr_t)(d), 0);                                           \
-               }                                                                     \
-       } while (0)
-#define KERNEL_DEBUG_CONSTANT_IST1(x, a, b, c, d, e)                     \
-       do {                                                                       \
-               if (KDBG_IMPROBABLE(kdebug_enable)) {                         \
-                       kernel_debug1((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \
-                               (uintptr_t)(d), (uintptr_t)(e));                               \
-               }                                                                      \
-       } while (0)
-#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
-#define KERNEL_DEBUG_CONSTANT_IST(type, x, a, b, c, d, e) do {} while (0)
-#define KERNEL_DEBUG_CONSTANT_IST1(x, a, b, c, d, e) do {} while (0)
-#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
-
-#if NO_KDEBUG
-#define __kdebug_constant_only __unused
-#endif
-
-/*
- * KERNEL_DEBUG events are only traced for DEBUG kernels.
- */
-#define KERNEL_DEBUG_CONSTANT_DEBUG(x, a, b, c, d, e) \
-       KERNEL_DEBUG(x, a, b, c, d, e)
-
-#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL)
-#define __kdebug_only
-
-#define KERNEL_DEBUG(x, a, b, c, d, e)                                  \
-       do {                                                                \
-               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {      \
-                       kernel_debug((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \
-                               (uintptr_t)(c), (uintptr_t)(d), (uintptr_t)(e));        \
-               }                                                               \
-       } while (0)
-
-/*
- * DO NOT USE THIS MACRO -- see warning above for KERNEL_DEBUG_CONSTANT1.
- */
-#define KERNEL_DEBUG1(x, a, b, c, d, e)                                  \
-       do {                                                                 \
-               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {       \
-                       kernel_debug1((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \
-                               (uintptr_t)(c), (uintptr_t)(d), (uintptr_t)(e));         \
-               }                                                                \
-       } while (0)
-
-#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */
-#define __kdebug_only __unused
-
-#define KERNEL_DEBUG(x, a, b, c, d, e) do {} while (0)
-#define KERNEL_DEBUG1(x, a, b, c, d, e) do {} while (0)
-#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */
-
-
-extern void kernel_debug(
-       uint32_t  debugid,
-       uintptr_t arg1,
-       uintptr_t arg2,
-       uintptr_t arg3,
-       uintptr_t arg4,
-       uintptr_t arg5);
-
-extern void kernel_debug1(
-       uint32_t  debugid,
-       uintptr_t arg1,
-       uintptr_t arg2,
-       uintptr_t arg3,
-       uintptr_t arg4,
-       uintptr_t arg5);
-
-#define KDBG_FLAG_FILTERED 0x01
-#define KDBG_FLAG_NOPROCFILT 0x02
-
-extern void kernel_debug_flags(
-       uint32_t  debugid,
-       uintptr_t arg1,
-       uintptr_t arg2,
-       uintptr_t arg3,
-       uintptr_t arg4,
-       uint64_t flags);
-
-extern void kernel_debug_filtered(
-       uint32_t  debugid,
-       uintptr_t arg1,
-       uintptr_t arg2,
-       uintptr_t arg3,
-       uintptr_t arg4);
-
-extern void kernel_debug_early(
-       uint32_t  debugid,
-       uintptr_t arg1,
-       uintptr_t arg2,
-       uintptr_t arg3,
-       uintptr_t arg4);
-
-/*
- * EnergyTracing macros.
- */
-
-#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
-// whether to bother calculating EnergyTracing inputs
-// could change in future to see if DBG_ENERGYTRACE is active
-#define ENTR_SHOULDTRACE kdebug_enable
-// encode logical EnergyTracing into 32/64 KDebug trace
-#define ENTR_KDTRACE(component, opcode, lifespan, id, quality, value)   \
-do {                                                                    \
-    uint32_t kdcode__;                                                  \
-    uintptr_t highval__, lowval__, mask__ = 0xffffffff;                 \
-    kdcode__ = KDBG_CODE(DBG_ENERGYTRACE,component,opcode)|(lifespan);  \
-    highval__ = ((value) >> 32) & mask__;                               \
-    lowval__ = (value) & mask__;                                        \
-    ENTR_KDTRACEFUNC(kdcode__, id, quality, highval__, lowval__);       \
-} while(0)
-
-/*
- *   Trace the association of two existing activations.
- *
- *   An association is traced as a modification to the parent activation.
- *   In order to fit the sub-activation's component, activation code, and
- *   activation ID into a kdebug tracepoint, the arguments that would hold
- *   the value are left separate, and one stores the component and opcode
- *   of the sub-activation, while the other stores the pointer-sized
- *   activation ID.
- *
- *           arg2                   arg3               arg4
- +-----------------+  +~+----+----+--------+   +----------+
- |kEnTrModAssociate|  | |    |    |        |   |          |
- +-----------------+  +~+----+----+--------+   +----------+
- *                           8-bits unused       sub-activation ID
- *                                8-bit sub-component
- *                                     16-bit sub-opcode
- *
- */
-#define kEnTrModAssociate (1 << 28)
-#define ENTR_KDASSOCIATE(par_comp, par_opcode, par_act_id,              \
-           sub_comp, sub_opcode, sub_act_id)              \
-do {                                                                    \
-    unsigned sub_compcode = ((unsigned)sub_comp << 16) | sub_opcode;    \
-    ENTR_KDTRACEFUNC(KDBG_CODE(DBG_ENERGYTRACE,par_comp,par_opcode),    \
-                    par_act_id, kEnTrModAssociate, sub_compcode,       \
-                    sub_act_id);                                       \
-} while(0)
-
-#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
-
-#define ENTR_SHOULDTRACE FALSE
-#define ENTR_KDTRACE(component, opcode, lifespan, id, quality, value)   \
-                                   do {} while (0)
-#define ENTR_KDASSOCIATE(par_comp, par_opcode, par_act_id,              \
-           sub_comp, sub_opcode, sub_act_id)              \
-                                   do {} while (0)
-
-#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
-
-#ifdef KERNEL_PRIVATE
-/*
- * kernel_debug_string provides the same functionality as the
- * kdebug_trace_string syscall as a KPI.  str_id is an in/out
- * parameter that, if it's pointing to a string ID of 0, will
- * receive a generated ID.  If it provides a value in str_id,
- * then that will be used instead.
- *
- * Returns an errno indicating the type of failure.
- */
-extern int
-kernel_debug_string(uint32_t debugid, uint64_t *str_id, const char *str);
-
-/*
- * kernel_debug_disable disables event logging, but leaves any buffers
- * intact.
- */
-extern void kernel_debug_disable(void);
-#endif
-
-/*
- * Bits set in the comm page for kdebug.
- */
-#define KDEBUG_COMMPAGE_ENABLE_TRACE      0x1
-#define KDEBUG_COMMPAGE_ENABLE_TYPEFILTER 0x2 /* Forced to false if ENABLE_TRACE is 0 */
-
-// for EnergyTracing user space & clients
-#define kEnTrCompKernel     2
-
-/*
- *   EnergyTracing opcodes
- *
- *   Activations use DBG_FUNC_START/END.
- *   Events are DBG_FUNC_NONE.
- */
-
-/* Socket reads and writes are uniquely identified by the (sanitized)
- *  pointer to the socket struct in question.  To associate this address
- *  with the user space file descriptor, we have a socket activation with
- *  the FD as its identifier and the socket struct pointer as its value.
- */
-#define kEnTrActKernSocket      1
-#define kEnTrActKernSockRead    2
-#define kEnTrActKernSockWrite   3
-
-#define kEnTrActKernPoll        10
-#define kEnTrActKernSelect      11
-#define kEnTrActKernKQWait      12
-
-// events
-#define kEnTrEvUnblocked        256
-
-// EnergyTracing flags (the low-order 16 bits of 'quality')
-#define kEnTrFlagNonBlocking    (1 << 0)
-#define kEnTrFlagNoWork         (1 << 1)
-
-// and now the internal mechanism
-#ifdef KERNEL_PRIVATE
-
-// 20452597 requests that the trace macros not take an argument it throws away
-#define KERNEL_DBG_IST_SANE(x, a, b, c, d)                              \
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, x, a, b, c, d,          \
-                                 0 /*__unused in kernel_debug()*/ )
-#define ENTR_KDTRACEFUNC KERNEL_DBG_IST_SANE
-
-// value is int64_t, quality is uint32_t
-#define KERNEL_ENERGYTRACE(opcode, lifespan, id, quality, value)        \
-           ENTR_KDTRACE(kEnTrCompKernel, opcode, lifespan, id,         \
-                        quality, value)
-#define KERNEL_ENTR_ASSOCIATE(par_opcode, par_act_id, sub_opcode, sub_act_id) \
-           ENTR_KDASSOCIATE(kEnTrCompKernel, par_opcode, par_act_id,   \
-                            kEnTrCompKernel, sub_opcode, sub_act_id)
-
-// end EnergyTracing
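
A hedged sketch of emitting an EnergyTracing activation with the macros
above; `so`, `resid`, and `bytes_read` are hypothetical driver-side values:

    /* begin a socket-read activation keyed by the sanitized socket pointer */
    KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
        VM_KERNEL_UNSLIDE_OR_PERM(so), kEnTrFlagNonBlocking, (int64_t)resid);

    /* ... perform the read ... */

    /* end the activation; flags travel in the low-order bits of quality */
    KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
        VM_KERNEL_UNSLIDE_OR_PERM(so), 0, (int64_t)bytes_read);
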
-
-
-#include <mach/boolean.h>
-
-#define NUMPARMS 23
-
-struct proc;
-
-/*
- * Returns false if the debugid is disabled by filters, and true if the
- * debugid is allowed to be traced.  A debugid may not be traced if the
- * typefilter disables its class and subclass, it's outside a range
- * check, or if it's not an allowed debugid in a value check.  Trace
- * system events bypass this check.
- */
-boolean_t kdebug_debugid_enabled(uint32_t debugid);
-
-/*
- * Returns true only if the debugid is explicitly enabled by filters.  Returns
- * false otherwise, including when no filters are active.
- */
-boolean_t kdebug_debugid_explicitly_enabled(uint32_t debugid);
-
-uint32_t kdebug_commpage_state(void);
-
-#define KDBG_VFS_LOOKUP_FLAG_LOOKUP 0x01
-#define KDBG_VFS_LOOKUP_FLAG_NOPROCFILT 0x02
-void kdebug_vfs_lookup(long *dbg_parms, int dbg_namelen, void *dp,
-    uint32_t flags);
-
-void kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp,
-    boolean_t lookup);
-
-void kdbg_trace_data(struct proc *proc, long *arg_pid, long *arg_uniqueid);
-
-void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4);
-
-void kdbg_dump_trace_to_file(const char *);
-void kdebug_init(unsigned int n_events, char *filterdesc, boolean_t wrapping);
-void kdebug_trace_start(unsigned int n_events, const char *filterdesc,
-    boolean_t wrapping, boolean_t at_wake);
-void kdebug_free_early_buf(void);
-struct task;
-void release_storage_unit(int cpu, uint32_t storage_unit);
-int allocate_storage_unit(int cpu);
-
-#define KDBG_CLASS_ENCODE(Class, SubClass) KDBG_EVENTID(Class, SubClass, 0)
-#define KDBG_CLASS_DECODE(Debugid)         (Debugid & KDBG_CSC_MASK)
-
-#endif /* KERNEL_PRIVATE */
-#endif /* __APPLE_API_UNSTABLE */
-__END_DECLS
-
-#ifdef PRIVATE
-#ifdef __APPLE_API_PRIVATE
-/*
- * private kernel_debug definitions
- */
-
-/*
- * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
- * structure.
- */
-#if defined(__arm64__)
-typedef uint64_t kd_buf_argtype;
-#else
-typedef uintptr_t kd_buf_argtype;
-#endif
-
-typedef struct {
-       uint64_t timestamp;
-       kd_buf_argtype arg1;
-       kd_buf_argtype arg2;
-       kd_buf_argtype arg3;
-       kd_buf_argtype arg4;
-       kd_buf_argtype arg5; /* the thread ID */
-       uint32_t debugid;
-/*
- * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
- * structure.
- */
-#if defined(__LP64__) || defined(__arm64__)
-       uint32_t cpuid;
-       kd_buf_argtype unused;
-#endif
-} kd_buf;
-
-#if defined(__LP64__) || defined(__arm64__)
-#define KDBG_TIMESTAMP_MASK             0xffffffffffffffffULL
-static inline void
-kdbg_set_cpu(kd_buf *kp, int cpu)
-{
-       kp->cpuid = (unsigned int)cpu;
-}
-static inline int
-kdbg_get_cpu(kd_buf *kp)
-{
-       return (int)kp->cpuid;
-}
-static inline void
-kdbg_set_timestamp(kd_buf *kp, uint64_t thetime)
-{
-       kp->timestamp = thetime;
-}
-static inline uint64_t
-kdbg_get_timestamp(kd_buf *kp)
-{
-       return kp->timestamp;
-}
-static inline void
-kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu)
-{
-       kdbg_set_timestamp(kp, thetime);
-       kdbg_set_cpu(kp, cpu);
-}
-#else
-#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL
-#define KDBG_CPU_MASK       0xff00000000000000ULL
-#define KDBG_CPU_SHIFT      56
-static inline void
-kdbg_set_cpu(kd_buf *kp, int cpu)
-{
-       kp->timestamp = (kp->timestamp & KDBG_TIMESTAMP_MASK) |
-           (((uint64_t) cpu) << KDBG_CPU_SHIFT);
-}
-static inline int
-kdbg_get_cpu(kd_buf *kp)
-{
-       return (int) (((kp)->timestamp & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT);
-}
-static inline void
-kdbg_set_timestamp(kd_buf *kp, uint64_t thetime)
-{
-       kp->timestamp = thetime & KDBG_TIMESTAMP_MASK;
-}
-static inline uint64_t
-kdbg_get_timestamp(kd_buf *kp)
-{
-       return kp->timestamp & KDBG_TIMESTAMP_MASK;
-}
-static inline void
-kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu)
-{
-       kp->timestamp = (thetime & KDBG_TIMESTAMP_MASK) |
-           (((uint64_t) cpu) << KDBG_CPU_SHIFT);
-}
-#endif
-
-/*
- * 2^16 bits (8 kilobytes), one for each possible class/subclass combination
- */
-#define KDBG_TYPEFILTER_BITMAP_SIZE ((256 * 256) / 8)
-
-/*
- * Bits for kd_ctrl_page.flags, KERN_KD{D,E}FLAGS.
- */
-#define KDBG_INIT       (1U << 0) /* obsolete */
-/* disable tracing when buffers are full */
-#define KDBG_NOWRAP     (1U << 1)
-#define KDBG_FREERUN    (1U << 2) /* obsolete */
-/* buffer has wrapped */
-#define KDBG_WRAPPED    (1U << 3)
-/* flags that are allowed to be set by user space */
-#define KDBG_USERFLAGS  (KDBG_FREERUN | KDBG_NOWRAP | KDBG_INIT)
-/* only include processes with kdebug bit set in proc */
-#define KDBG_PIDCHECK   (1U << 4)
-/* thread map is initialized */
-#define KDBG_MAPINIT    (1U << 5)
-/* exclude processes based on kdebug bit in proc */
-#define KDBG_PIDEXCLUDE (1U << 6)
-/* whether the kdebug locks are initialized */
-#define KDBG_LOCKINIT   (1U << 7)
-/* word size of the kernel */
-#define KDBG_LP64       (1U << 8)
-
-/* bits for kd_ctrl_page.flags and kbufinfo_t.flags */
-
-/* only trace events within a range */
-#define KDBG_RANGECHECK       0x00100000U
-/* only trace at most 4 types of events, at the code granularity */
-#define KDBG_VALCHECK         0x00200000U
-/* check class and subclass against the typefilter */
-#define KDBG_TYPEFILTER_CHECK 0x00400000U
-/* kdebug trace buffers are initialized */
-#define KDBG_BUFINIT          0x80000000U
-
-/* bits for the type field of kd_regtype */
-#define KDBG_CLASSTYPE  0x10000
-#define KDBG_SUBCLSTYPE 0x20000
-#define KDBG_RANGETYPE  0x40000
-#define KDBG_TYPENONE   0x80000
-#define KDBG_CKTYPES    0xF0000
-
-typedef struct {
-       unsigned int type;
-       unsigned int value1;
-       unsigned int value2;
-       unsigned int value3;
-       unsigned int value4;
-} kd_regtype;
-
-typedef struct {
-       /* number of events that can fit in the buffers */
-       int nkdbufs;
-       /* set if trace is disabled */
-       int nolog;
-       /* kd_ctrl_page.flags */
-       unsigned int flags;
-       /* number of threads in thread map */
-       int nkdthreads;
-       /* the owning pid */
-       int bufid;
-} kbufinfo_t;
-
-typedef struct {
-       /* the thread ID */
-#if defined(__arm64__)
-       uint64_t thread;
-#else
-       uintptr_t thread;
-#endif
-       /* 0 for invalid, otherwise the PID (or 1 for kernel_task) */
-       int valid;
-       /* the name of the process owning the thread */
-       char command[20];
-} kd_threadmap;
-
-typedef struct {
-       uint32_t version_no;
-       uint32_t cpu_count;
-} kd_cpumap_header;
-
-/* cpumap flags */
-#define KDBG_CPUMAP_IS_IOP      0x1
-
-typedef struct {
-       uint32_t cpu_id;
-       uint32_t flags;
-       char name[8];
-} kd_cpumap;
-
-/*
- * TRACE file formats...
- *
- * RAW_VERSION0
- *
- * uint32_t #threadmaps
- * kd_threadmap[]
- * kd_buf[]
- *
- * RAW_VERSION1
- *
- * RAW_header, with version_no set to RAW_VERSION1
- * kd_threadmap[]
- * Empty space to pad alignment to the nearest page boundary.
- * kd_buf[]
- *
- * RAW_VERSION1+
- *
- * RAW_header, with version_no set to RAW_VERSION1
- * kd_threadmap[]
- * kd_cpumap_header, with version_no set to RAW_VERSION1
- * kd_cpumap[]
- * Empty space to pad alignment to the nearest page boundary.
- * kd_buf[]
- *
- * V1+ implementation details...
- *
- * It would have been nice to add the cpumap data "correctly", but there were
- * several obstacles. Existing code attempts to parse both V1 and V0 files.
- * Because V0 has no versioning or header, the test looks like this:
- *
- * // Read header
- * if (header.version_no != RAW_VERSION1) { /* assume V0 */ }
- *
- * If we add a VERSION2 file format, all existing code is going to treat that
- * as a VERSION0 file when reading it, and crash terribly when trying to read
- * RAW_VERSION2 threadmap entries.
- *
- * To differentiate between a V1 and V1+ file, read as V1 until you reach
- * the padding bytes. Then:
- *
- * boolean_t is_v1plus = FALSE;
- * if (padding_bytes >= sizeof(kd_cpumap_header)) {
- *     kd_cpumap_header header = // read header;
- *     if (header.version_no == RAW_VERSION1) {
- *         is_v1plus = TRUE;
- *     }
- * }
- *
- */
-
-typedef struct {
-       int             version_no;
-       int             thread_count;
-       uint64_t        TOD_secs;
-       uint32_t        TOD_usecs;
-} RAW_header;
-
-// Version 3 header
-// The header chunk has the tag 0x00001000 which also serves as a magic word
-// that identifies the file as a version 3 trace file. The header payload is
-// a set of fixed fields followed by a variable number of sub-chunks:
-/*
- *  ____________________________________________________________________________
- | Offset | Size | Field                                                    |
- |  ----------------------------------------------------------------------------
- |    0   |  4   | Tag (0x00001000)                                         |
- |    4   |  4   | Sub-tag. Represents the version of the header.           |
- |    8   |  8   | Length of header payload (40+8x)                         |
- |   16   |  8   | Time base info. Two 32-bit numbers, numer/denom,         |
- |        |      | for converting timestamps to nanoseconds.                |
- |   24   |  8   | Timestamp of trace start.                                |
- |   32   |  8   | Wall time seconds since Unix epoch.                      |
- |        |      | As returned by gettimeofday().                           |
- |   40   |  4   | Wall time microseconds. As returned by gettimeofday().   |
- |   44   |  4   | Local time zone offset in minutes. ( " )                 |
- |   48   |  4   | Type of daylight savings time correction to apply. ( " ) |
- |   52   |  4   | Flags. 1 = 64-bit. Remaining bits should be written      |
- |        |      | as 0 and ignored when reading.                           |
- |   56   |  8x  | Variable number of sub-chunks. None are required.        |
- |        |      | Ignore unknown chunks.                                   |
- |  ----------------------------------------------------------------------------
- */
-// NOTE: The header sub-chunks are considered part of the header chunk,
-// so they must be included in the header chunk’s length field.
-// The CPU map is an optional sub-chunk of the header chunk. It provides
-// information about the CPUs that are referenced from the trace events.
-typedef struct {
-       uint32_t tag;
-       uint32_t sub_tag;
-       uint64_t length;
-       uint32_t timebase_numer;
-       uint32_t timebase_denom;
-       uint64_t timestamp;
-       uint64_t walltime_secs;
-       uint32_t walltime_usecs;
-       uint32_t timezone_minuteswest;
-       uint32_t timezone_dst;
-       uint32_t flags;
-} __attribute__((packed)) kd_header_v3;
-
-typedef struct {
-       uint32_t tag;
-       uint32_t sub_tag;
-       uint64_t length;
-} __attribute__((packed)) kd_chunk_header_v3;
-
-#define RAW_VERSION0    0x55aa0000
-#define RAW_VERSION1    0x55aa0101
-#define RAW_VERSION2    0x55aa0200 /* Only used by kperf and Instruments */
-#define RAW_VERSION3    0x00001000
-
-#define V3_CONFIG       0x00001b00
-#define V3_CPU_MAP      0x00001c00
-#define V3_THREAD_MAP   0x00001d00
-#define V3_RAW_EVENTS   0x00001e00
-#define V3_NULL_CHUNK   0x00002000
-
-// The current version of all kernel managed chunks is 1. The
-// V3_CURRENT_CHUNK_VERSION is added to ease the simple case
-// when most/all the kernel managed chunks have the same version.
-
-#define V3_CURRENT_CHUNK_VERSION 1
-#define V3_HEADER_VERSION     V3_CURRENT_CHUNK_VERSION
-#define V3_CPUMAP_VERSION     V3_CURRENT_CHUNK_VERSION
-#define V3_THRMAP_VERSION     V3_CURRENT_CHUNK_VERSION
-#define V3_EVENT_DATA_VERSION V3_CURRENT_CHUNK_VERSION
-
-// Apis to support writing v3 chunks in the kernel
-int kdbg_write_v3_chunk_header_to_buffer(void *buffer, uint32_t tag, uint32_t sub_tag, uint64_t length);
-int kdbg_write_v3_chunk_to_fd(uint32_t tag, uint32_t sub_tag, uint64_t length, void *payload, uint64_t payload_size, int fd);
-
 /* VFS lookup events for serial traces */
 #define VFS_LOOKUP      (FSDBG_CODE(DBG_FSRW,36))
 #define VFS_LOOKUP_DONE (FSDBG_CODE(DBG_FSRW,39))
 
-#if !CONFIG_EMBEDDED
-#if defined(XNU_KERNEL_PRIVATE) && (DEVELOPMENT || DEBUG)
-#define KDEBUG_MOJO_TRACE 1
-#endif
-#endif
+#endif /* __APPLE_API_UNSTABLE */
 
-#endif /* __APPLE_API_PRIVATE */
-#endif /* PRIVATE */
+__END_DECLS
+
+#if defined(__has_include) && __has_include(<sys/kdebug_private.h>)
+#include <sys/kdebug_private.h>
+#endif /* __has_include(<sys/kdebug_private.h>) */
+
+#ifdef KERNEL
+#include <sys/kdebug_kernel.h>
+#endif /* defined(KERNEL) */
 
-#endif /* !BSD_SYS_KDEBUG_H */
+#endif /* !defined(BSD_SYS_KDEBUG_H) */
diff --git a/bsd/sys/kdebug_kernel.h b/bsd/sys/kdebug_kernel.h
new file mode 100644 (file)
index 0000000..c366472
--- /dev/null
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef BSD_SYS_KDEBUG_KERNEL_H
+#define BSD_SYS_KDEBUG_KERNEL_H
+
+#include <mach/boolean.h>
+#include <mach/clock_types.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+
+#ifdef KERNEL
+
+/*
+ * To use kdebug in the kernel:
+ *
+ * #include <sys/kdebug_kernel.h>
+ *
+ * #define DBG_NETIPINIT NETDBG_CODE(DBG_NETIP, 1)
+ *
+ * void
+ * ip_init(void)
+ * {
+ *     KDBG(DBG_NETIPINIT | DBG_FUNC_START, 1, 2, 3, 4);
+ *     ...
+ *     KDBG(DBG_NETIPINIT);
+ *     ...
+ *     KDBG(DBG_NETIPINIT | DBG_FUNC_END);
+ * }
+ */
+
+#pragma mark - kernel tracepoints
+
+/*
+ * The KDBG{,_DEBUG,_RELEASE,_FILTERED} macros are the preferred method of
+ * making tracepoints.
+ *
+ * Kernel pointers must be unslid or permuted using VM_KERNEL_UNSLIDE_OR_PERM.
+ * Do not trace any sensitive data.
+ */
+
+/*
+ * Traced on debug and development (and release macOS) kernels.
+ */
+#define KDBG(x, ...) KDBG_(, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
+
+/*
+ * Traced on debug and development (and release macOS) kernels if explicitly
+ * requested.  Omitted from tracing without a typefilter.
+ */
+#define KDBG_FILTERED(x, ...) KDBG_(_FILTERED, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
+
+#ifdef KERNEL_PRIVATE
+
+/*
+ * Traced on debug and development (and release macOS) kernels, even if the
+ * process filter would reject it.
+ */
+#define KDBG_RELEASE_NOPROCFILT(x, ...) \
+               KDBG_(_RELEASE_NOPROCFILT, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
+
+#endif /* KERNEL_PRIVATE */
+
+/*
+ * Traced on debug, development, and release kernels.
+ *
+ * Only use this tracepoint if the events are required for a shipping trace
+ * tool.
+ */
+#define KDBG_RELEASE(x, ...) KDBG_(_RELEASE, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
+
+/*
+ * Traced only on debug kernels.
+ */
+#define KDBG_DEBUG(x, ...) KDBG_(_DEBUG, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
+
+#pragma mark - kernel API
+
+#ifdef KERNEL_PRIVATE
+
+/*
+ * kernel_debug_string provides the same functionality as the
+ * kdebug_trace_string syscall as a KPI.  str_id is an in/out
+ * parameter that, if it's pointing to a string ID of 0, will
+ * receive a generated ID.  If it provides a value in str_id,
+ * then that will be used instead.
+ *
+ * Returns an errno indicating the type of failure.
+ */
+int kernel_debug_string(uint32_t debugid, uint64_t *str_id, const char *str);
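
A minimal in-kernel sketch of the in/out `str_id` contract just described
(`debugid` stands in for an illustrative event code):

    uint64_t str_id = 0;  /* 0 requests a newly generated ID */
    int err = kernel_debug_string(debugid, &str_id, "first string");
    if (err == 0) {
            /* str_id now holds the ID; reuse it to overwrite the string */
            err = kernel_debug_string(debugid, &str_id, "replacement");
            /* pass NULL to invalidate the ID once it is no longer in use */
            kernel_debug_string(debugid, &str_id, NULL);
    }
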
+
+/*
+ * kernel_debug_disable disables event logging, but leaves any buffers
+ * intact.
+ */
+void kernel_debug_disable(void);
+
+#endif /* KERNEL_PRIVATE */
+
+/*
+ * Returns true if kdebug is using continuous time for its events, and false
+ * otherwise.
+ */
+bool kdebug_using_continuous_time(void);
+
+/*
+ * Returns true if kdebug will log an event with the provided debugid, and
+ * false otherwise.
+ */
+bool kdebug_debugid_enabled(uint32_t debugid);
+
+/*
+ * Returns true only if the debugid is explicitly enabled by filters.  Returns
+ * false otherwise, including when no filters are active.
+ */
+bool kdebug_debugid_explicitly_enabled(uint32_t debugid);
+
+uint32_t kdebug_commpage_state(void);
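
A hedged sketch of the intended use of kdebug_debugid_enabled(): skip an
expensive summarization unless the event would actually be logged (the event
code and helper are hypothetical):

    uint32_t code = MACHDBG_CODE(DBG_MACH_SCHED, 0x99); /* illustrative */
    if (kdebug_debugid_enabled(code)) {
            uint64_t load = compute_expensive_load_summary(); /* hypothetical */
            KDBG(code, load);
    }
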
+
+#pragma mark - IOP tracing
+
+/*
+ * Definitions to support IOP tracing.
+ */
+
+typedef enum {
+       /* Trace is now enabled; no arguments.  */
+       KD_CALLBACK_KDEBUG_ENABLED,
+       /* Trace is now disabled; no arguments.  */
+       KD_CALLBACK_KDEBUG_DISABLED,
+       /*
+        * Request the latest entries from the IOP and block until complete; no
+        * arguments.
+        */
+       KD_CALLBACK_SYNC_FLUSH,
+       /*
+        * The typefilter is enabled; a read-only pointer to the typefilter is
+        * provided, valid only while in the callback.
+        */
+       KD_CALLBACK_TYPEFILTER_CHANGED,
+} kd_callback_type;
+
+typedef void (*kd_callback_fn) (void *context, kd_callback_type reason,
+    void *arg);
+
+struct kd_callback {
+       kd_callback_fn func;
+       void *context;
+       /* name of IOP, NUL-terminated */
+       char iop_name[8];
+};
+
+typedef struct kd_callback kd_callback_t;
+
+/*
+ * Registers an IOP for participation in tracing.
+ *
+ * The registered callback function will be called with the
+ * supplied context as the first argument, followed by a
+ * kd_callback_type and an associated void* argument.
+ *
+ * The return value is a nonzero coreid that shall be used in
+ * kernel_debug_enter() to refer to your IOP. If the allocation
+ * failed, then 0 will be returned.
+ *
+ * Caveats:
+ * Note that not all callback calls will indicate a change in
+ * state (e.g. disabling trace twice would send two disable
+ * notifications).
+ */
+int kernel_debug_register_callback(kd_callback_t callback);
+
+void kernel_debug_enter(uint32_t coreid, uint32_t debugid, uint64_t timestamp,
+    uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4,
+    uintptr_t threadid);
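
A hedged registration sketch under the contract above; every name here is
hypothetical:

    static void
    my_iop_callback(void *context, kd_callback_type reason, void *arg)
    {
            switch (reason) {
            case KD_CALLBACK_KDEBUG_ENABLED:
            case KD_CALLBACK_KDEBUG_DISABLED:
                    /* start or stop collecting events on the IOP */
                    break;
            case KD_CALLBACK_SYNC_FLUSH:
                    /* flush buffered IOP events before returning */
                    break;
            case KD_CALLBACK_TYPEFILTER_CHANGED:
                    /* arg points at the typefilter, valid only here */
                    break;
            }
    }

    static int my_coreid;

    static void
    my_iop_attach(void *driver_state)
    {
            kd_callback_t cb = {
                    .func = my_iop_callback,
                    .context = driver_state,
                    .iop_name = "MYIOP",
            };
            my_coreid = kernel_debug_register_callback(cb);
            /* later: kernel_debug_enter(my_coreid, debugid, timestamp,
               arg1, arg2, arg3, arg4, threadid); */
    }
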
+
+#pragma mark - internals
+
+#define KDBG_(f, x, a, b, c, d, n, ...) KDBG##n(f, x, a, b, c, d)
+#define KDBG0(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, 0, 0, 0, 0, 0)
+#define KDBG1(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, 0, 0, 0, 0)
+#define KDBG2(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, 0, 0, 0)
+#define KDBG3(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, c, 0, 0)
+#define KDBG4(f, x, a, b, c, d) KERNEL_DEBUG_CONSTANT##f(x, a, b, c, d, 0)
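
For clarity, the counting macro above pads missing arguments with zeros; for
example, KDBG(x, 1, 2) selects KDBG2 and (with the empty flavor) expands to:

    KERNEL_DEBUG_CONSTANT(x, 1, 2, 0, 0, 0);
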
+
+#ifdef XNU_KERNEL_PRIVATE
+#define KDBG_IMPROBABLE __improbable
+#else
+#define KDBG_IMPROBABLE
+#endif
+
+extern unsigned int kdebug_enable;
+
+/*
+ * The kernel debug configuration level.  These values control which events are
+ * compiled in under different build configurations.
+ *
+ * Infer the supported kernel debug event level from config option.  Use
+ * (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) as a guard to protect unaudited debug
+ * code.
+ */
+#define KDEBUG_LEVEL_NONE     0
+#define KDEBUG_LEVEL_IST      1
+#define KDEBUG_LEVEL_STANDARD 2
+#define KDEBUG_LEVEL_FULL     3
+
+#if NO_KDEBUG
+#define KDEBUG_LEVEL KDEBUG_LEVEL_NONE
+#elif IST_KDEBUG
+#define KDEBUG_LEVEL KDEBUG_LEVEL_IST
+#elif KDEBUG
+#define KDEBUG_LEVEL KDEBUG_LEVEL_FULL
+#else
+#define KDEBUG_LEVEL KDEBUG_LEVEL_STANDARD
+/*
+ * Currently, all other kernel configurations (development, etc) build with
+ * KDEBUG_LEVEL_STANDARD.
+ */
+#endif
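
As suggested above, a sketch of guarding unaudited debug-only code on the
configuration level (the helper is hypothetical):

    #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
    static void
    trace_extra_detail(uint32_t debugid, uintptr_t a)
    {
            /* compiled out entirely on IST-only and NO_KDEBUG kernels */
            KERNEL_DEBUG_CONSTANT(debugid, a, 0, 0, 0, 0);
    }
    #endif
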
+
+/*
+ * KERNEL_DEBUG_CONSTANT_FILTERED events are omitted from tracing unless they
+ * are explicitly requested in the typefilter.  They are not emitted when
+ * tracing without a typefilter.
+ */
+#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
+#define KERNEL_DEBUG_CONSTANT_FILTERED(x, a, b, c, d, ...)           \
+       do {                                                             \
+               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {   \
+                       kernel_debug_filtered((x), (uintptr_t)(a), (uintptr_t)(b),  \
+                               (uintptr_t)(c), (uintptr_t)(d)); \
+               }                                                            \
+       } while (0)
+#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
+#define KERNEL_DEBUG_CONSTANT_FILTERED(type, x, a, b, c, d, ...) do {} while (0)
+#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
+
+#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
+#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...)   \
+       do {                                                               \
+               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {     \
+                       kernel_debug_flags((x), (uintptr_t)(a), (uintptr_t)(b),    \
+                               (uintptr_t)(c), (uintptr_t)(d), KDBG_FLAG_NOPROCFILT); \
+               }                                                              \
+       } while (0)
+#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
+#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...) \
+       do { } while (0)
+#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
+
+
+#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
+#define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e)                               \
+       do {                                                                      \
+               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {            \
+                       kernel_debug((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \
+                               (uintptr_t)(d),(uintptr_t)(e));                               \
+               }                                                                     \
+       } while (0)
+
+/*
+ * DO NOT USE THIS MACRO -- it breaks fundamental assumptions about ktrace and
+ * is only meant to be used by the pthread kext and other points in the kernel
+ * where the thread ID must be provided explicitly.
+ */
+#define KERNEL_DEBUG_CONSTANT1(x, a, b, c, d, e)                               \
+       do {                                                                       \
+               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {             \
+                       kernel_debug1((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \
+                       (uintptr_t)(d), (uintptr_t)(e));                                   \
+               }                                                                      \
+       } while (0)
+
+#define KERNEL_DEBUG_EARLY(x, a, b, c, d)                                 \
+       do {                                                                  \
+               kernel_debug_early((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \
+                       (uintptr_t)(c), (uintptr_t)(d));                              \
+       } while (0)
+#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
+#define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) do {} while (0)
+#define KERNEL_DEBUG_CONSTANT1(x, a, b, c, d, e) do {} while (0)
+#define KERNEL_DEBUG_EARLY(x, a, b, c, d) do {} while (0)
+#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
+
+/*
+ * KERNEL_DEBUG_CONSTANT_IST (in-system trace) events provide an audited subset
+ * of tracepoints for userland system tracing tools.  This tracing level was
+ * created by 8857227 to protect fairplayd and other PT_DENY_ATTACH processes.
+ * It has two effects: only KERNEL_DEBUG_CONSTANT_IST() traces are emitted and
+ * any PT_DENY_ATTACH processes will only emit basic traces as defined by the
+ * kernel_debug_filter() routine.
+ */
+#define KERNEL_DEBUG_CONSTANT_RELEASE(x, a, b, c, d, e) \
+       KERNEL_DEBUG_CONSTANT_IST(~KDEBUG_ENABLE_PPT, x, a, b, c, d, 0)
+
+#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
+#define KERNEL_DEBUG_CONSTANT_IST(type, x, a, b, c, d, e)                     \
+       do {                                                                      \
+               if (KDBG_IMPROBABLE(kdebug_enable & (type))) {                        \
+                       kernel_debug((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \
+                               (uintptr_t)(d), 0);                                           \
+               }                                                                     \
+       } while (0)
+#define KERNEL_DEBUG_CONSTANT_IST1(x, a, b, c, d, e)                     \
+       do {                                                                       \
+               if (KDBG_IMPROBABLE(kdebug_enable)) {                         \
+                       kernel_debug1((x), (uintptr_t)(a), (uintptr_t)(b), (uintptr_t)(c), \
+                               (uintptr_t)(d), (uintptr_t)(e));                               \
+               }                                                                      \
+       } while (0)
+#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
+#define KERNEL_DEBUG_CONSTANT_IST(type, x, a, b, c, d, e) do {} while (0)
+#define KERNEL_DEBUG_CONSTANT_IST1(x, a, b, c, d, e) do {} while (0)
+#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
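A hedged sketch of the IST variant, assuming the KDEBUG_TRACE enable mask from <sys/kdebug.h> as the type argument; the event id is hypothetical. Note that KERNEL_DEBUG_CONSTANT_IST() always passes 0 as the fifth payload argument, so e is effectively ignored.

    static void
    traced_ist_event(uintptr_t a1, uintptr_t a2)
    {
            /* Emit only when the KDEBUG_TRACE bit of kdebug_enable is set. */
            KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                KDBG_EVENTID(DBG_MACH, 0x99, 2) | DBG_FUNC_NONE,
                a1, a2, 0, 0, 0);
    }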
+
+#if NO_KDEBUG
+#define __kdebug_constant_only __unused
+#endif
+
+/*
+ * KERNEL_DEBUG events are only traced for DEBUG kernels.
+ */
+#define KERNEL_DEBUG_CONSTANT_DEBUG(x, a, b, c, d, e) \
+       KERNEL_DEBUG(x, a, b, c, d, e)
+
+#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL)
+#define __kdebug_only
+
+#undef KERNEL_DEBUG
+#define KERNEL_DEBUG(x, a, b, c, d, e)                                  \
+       do {                                                                \
+               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {      \
+                       kernel_debug((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \
+                               (uintptr_t)(c), (uintptr_t)(d), (uintptr_t)(e));        \
+               }                                                               \
+       } while (0)
+
+/*
+ * DO NOT USE THIS MACRO -- see warning above for KERNEL_DEBUG_CONSTANT1.
+ */
+#define KERNEL_DEBUG1(x, a, b, c, d, e)                                  \
+       do {                                                                 \
+               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {       \
+                       kernel_debug1((uint32_t)(x), (uintptr_t)(a), (uintptr_t)(b), \
+                               (uintptr_t)(c), (uintptr_t)(d), (uintptr_t)(e));         \
+               }                                                                \
+       } while (0)
+
+#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */
+#define __kdebug_only __unused
+
+#undef KERNEL_DEBUG
+#define KERNEL_DEBUG(x, a, b, c, d, e) do {} while (0)
+#define KERNEL_DEBUG1(x, a, b, c, d, e) do {} while (0)
+#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */
+
+void kernel_debug(uint32_t debugid, uintptr_t arg1, uintptr_t arg2,
+    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5);
+
+void kernel_debug1(uint32_t debugid, uintptr_t arg1, uintptr_t arg2,
+    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5);
+
+#define KDBG_FLAG_FILTERED 0x01
+#define KDBG_FLAG_NOPROCFILT 0x02
+
+void kernel_debug_flags(uint32_t debugid, uintptr_t arg1, uintptr_t arg2,
+    uintptr_t arg3, uintptr_t arg4, uint64_t flags);
+
+void kernel_debug_filtered(uint32_t debugid, uintptr_t arg1, uintptr_t arg2,
+    uintptr_t arg3, uintptr_t arg4);
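kernel_debug_filtered() appears to be shorthand for kernel_debug_flags() with KDBG_FLAG_FILTERED, so the two calls in this sketch should be equivalent (event id hypothetical):

    static void
    emit_filtered(uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4)
    {
            kernel_debug_filtered(KDBG_EVENTID(DBG_MACH, 0x99, 3), a1, a2, a3, a4);
            /* Same event, spelled through the flags-taking entry point: */
            kernel_debug_flags(KDBG_EVENTID(DBG_MACH, 0x99, 3), a1, a2, a3, a4,
                KDBG_FLAG_FILTERED);
    }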
+
+#pragma mark - xnu API
+
+#ifdef XNU_KERNEL_PRIVATE
+/* Used in early boot to log events. */
+void kernel_debug_early(uint32_t  debugid, uintptr_t arg1, uintptr_t arg2,
+    uintptr_t arg3, uintptr_t arg4);
+/* Used in early boot to log strings spanning only a single tracepoint. */
+void kernel_debug_string_early(const char *message);
+/* Used to trace strings within kdebug tracepoints on arbitrary eventids. */
+void kernel_debug_string_simple(uint32_t eventid, const char *str);
+/* Only used by ktrace to reset kdebug.  ktrace_lock must be held. */
+extern void kdebug_reset(void);
+
+void kdbg_dump_trace_to_file(const char *);
+void kdebug_init(unsigned int n_events, char *filterdesc, bool wrapping);
+void kdebug_trace_start(unsigned int n_events, const char *filterdesc,
+    bool wrapping, bool at_wake);
+void kdebug_free_early_buf(void);
+void release_storage_unit(int cpu, uint32_t storage_unit);
+bool allocate_storage_unit(int cpu);
+
+struct proc;
+void kdbg_trace_data(struct proc *proc, long *arg_pid, long *arg_uniqueid);
+void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3,
+    long *arg4);
+
+#define KDBG_VFS_LOOKUP_FLAG_LOOKUP 0x01
+#define KDBG_VFS_LOOKUP_FLAG_NOPROCFILT 0x02
+void kdebug_vfs_lookup(long *dbg_parms, int dbg_namelen, void *dp,
+    uint32_t flags);
+
+#endif /* XNU_KERNEL_PRIVATE */
+
+#ifdef KERNEL_PRIVATE
+
+#define NUMPARMS 23
+void kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp,
+    bool lookup);
+
+#pragma mark - EnergyTracing
+
+#define KERNEL_DBG_IST_SANE KDBG_RELEASE
+#define ENTR_KDTRACEFUNC KDBG_RELEASE
+
+// value is int64_t, quality is uint32_t
+#define KERNEL_ENERGYTRACE(opcode, lifespan, id, quality, value)        \
+           ENTR_KDTRACE(kEnTrCompKernel, opcode, lifespan, id,         \
+                        quality, value)
+#define KERNEL_ENTR_ASSOCIATE(par_opcode, par_act_id, sub_opcode, sub_act_id) \
+           ENTR_KDASSOCIATE(kEnTrCompKernel, par_opcode, par_act_id,   \
+                            kEnTrCompKernel, sub_opcode, sub_act_id)
+
+#endif /* KERNEL_PRIVATE */
+
+#endif /* KERNEL */
+
+__END_DECLS
+
+#endif /* !defined(BSD_SYS_KDEBUG_KERNEL_H) */
index 7db2d075f346b9d6a5f47f62778fbf386485e0ca..250b60abf0ae6240484e5549ceb865f8cbdf25fc 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -38,35 +38,24 @@ __BEGIN_DECLS
 #ifndef KERNEL
 
 /*
- * In previous versions of the operating system, applications could use:
- *
- * syscall(SYS_kdebug_trace, APPSDBG_CODE(DBG_MACH_CHUD, <your event code>) | DBG_FUNC_<type>, arg1, arg2, arg3, arg4);
- *
- * to record events that would be displayed by Instruments.
- *
- * syscall(2) is now deprecated and this interface replaces the above call as follows:
- *
- * The code argument is <your event code>.  Only the low 14-bits of the code are
- * preserved.
+ * kdebug_signpost(2) is deprecated.  Use the os_signpost(3) family of tracing
+ * functions instead.
  */
 
-/*
- * When <type> is NONE, use kdebug_signpost.
- */
-int kdebug_signpost(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
-__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0);
+int kdebug_signpost(uint32_t code, uintptr_t arg1, uintptr_t arg2,
+    uintptr_t arg3, uintptr_t arg4)
+__API_DEPRECATED_WITH_REPLACEMENT("os_signpost_event_emit",
+    macos(10.12, 10.15), ios(10.0, 13.0), watchos(3.0, 6.0), tvos(10.0, 13.0));
 
-/*
- * When <type> is START, use kdebug_signpost_start.
- */
-int kdebug_signpost_start(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
-__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0);
+int kdebug_signpost_start(uint32_t code, uintptr_t arg1, uintptr_t arg2,
+    uintptr_t arg3, uintptr_t arg4)
+__API_DEPRECATED_WITH_REPLACEMENT("os_signpost_interval_begin",
+    macos(10.12, 10.15), ios(10.0, 13.0), watchos(3.0, 6.0), tvos(10.0, 13.0));
 
-/*
- * When <type> is END, use kdebug_signpost_end.
- */
-int kdebug_signpost_end(uint32_t code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
-__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0);
+int kdebug_signpost_end(uint32_t code, uintptr_t arg1, uintptr_t arg2,
+    uintptr_t arg3, uintptr_t arg4)
+__API_DEPRECATED_WITH_REPLACEMENT("os_signpost_interval_end",
+    macos(10.12, 10.15), ios(10.0, 13.0), watchos(3.0, 6.0), tvos(10.0, 13.0));
 
 #endif /* !KERNEL */
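A sketch of the userland migration these deprecation attributes point at, using the os_signpost(3) replacements they name; the subsystem string is hypothetical.

    #include <os/log.h>
    #include <os/signpost.h>

    static void
    signpost_example(void)
    {
            os_log_t log = os_log_create("com.example.myapp",
                OS_LOG_CATEGORY_POINTS_OF_INTEREST);
            os_signpost_id_t spid = os_signpost_id_generate(log);

            os_signpost_interval_begin(log, spid, "render");
            /* ... work formerly bracketed by kdebug_signpost_start/end ... */
            os_signpost_interval_end(log, spid, "render");

            /* Point event, replacing a bare kdebug_signpost() call. */
            os_signpost_event_emit(log, spid, "checkpoint");
    }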
 
index 0ef5a8132baec5bf59d5c9edd14a13649af53b42..8ea2bce397ec889b5b323b20f05b11ae3608be7e 100644 (file)
 #ifndef SYS_MEMORYSTATUS_H
 #define SYS_MEMORYSTATUS_H
 
-#include <stdint.h>
 #include <sys/time.h>
-#include <sys/proc.h>
-#include <sys/param.h>
 #include <mach_debug/zone_info.h>
+#include <sys/proc.h>
 
 #define MEMORYSTATUS_ENTITLEMENT "com.apple.private.memorystatus"
 
@@ -55,6 +53,7 @@
 #define JETSAM_PRIORITY_FOREGROUND               10
 #define JETSAM_PRIORITY_AUDIO_AND_ACCESSORY      12
 #define JETSAM_PRIORITY_CONDUCTOR                13
+#define JETSAM_PRIORITY_DRIVER_APPLE             15
 #define JETSAM_PRIORITY_HOME                     16
 #define JETSAM_PRIORITY_EXECUTIVE                17
 #define JETSAM_PRIORITY_IMPORTANT                18
 /* Compatibility */
 #define DEFAULT_JETSAM_PRIORITY                  18
 
+/*
+ * The deferral time used by default for apps and daemons in all aging
+ * policies except kJetsamAgingPolicySysProcsReclaimedFirst is
+ * DEFERRED_IDLE_EXIT_TIME_SECS.
+ *
+ * For kJetsamAgingPolicySysProcsReclaimedFirst,
+ *
+ * Daemons: The actual idle deferred time for the daemon is based on
+ * the relaunch behavior of the daemon. The relaunch behavior determines
+ * the scaling factor applied to DEFERRED_IDLE_EXIT_TIME_SECS. See
+ * kJetsamSysProcsIdleDelayTime* ratios defined in kern_memorystatus.c
+ *
+ * Apps: Apps are aged for DEFERRED_IDLE_EXIT_TIME_SECS scaled by
+ * kJetsamAppsIdleDelayTimeRatio.
+ */
 #define DEFERRED_IDLE_EXIT_TIME_SECS             10
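A sketch of the scaling just described, with purely hypothetical ratios (the kJetsamSysProcsIdleDelayTime* values actually used live in kern_memorystatus.c); the P_MEMSTAT_RELAUNCH_* values are defined further down in this header.

    static uint64_t
    example_sysproc_idle_delay_secs(int relaunch_flags)
    {
            uint64_t base = DEFERRED_IDLE_EXIT_TIME_SECS;

            switch (relaunch_flags) {
            case P_MEMSTAT_RELAUNCH_HIGH:
                    return base * 4;   /* hypothetical ratio: likely to relaunch, age slowly */
            case P_MEMSTAT_RELAUNCH_MED:
                    return base * 2;   /* hypothetical ratio */
            default:
                    return base;       /* unlikely to relaunch: reclaim sooner */
            }
    }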
 
 #define KEV_MEMORYSTATUS_SUBCLASS                 3
@@ -185,15 +199,28 @@ typedef struct jetsam_snapshot {
        memorystatus_jetsam_snapshot_entry_t entries[];
 } memorystatus_jetsam_snapshot_t;
 
-typedef struct memorystatus_freeze_entry {
-       int32_t pid;
-       uint32_t flags;
-       uint32_t pages;
-} memorystatus_freeze_entry_t;
-
 /* TODO - deprecate; see <rdar://problem/12969599> */
 #define kMaxSnapshotEntries 192
 
+/*
+ * default jetsam snapshot support
+ */
+extern memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
+extern memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_copy;
+extern unsigned int memorystatus_jetsam_snapshot_count;
+extern unsigned int memorystatus_jetsam_snapshot_copy_count;
+extern unsigned int memorystatus_jetsam_snapshot_max;
+extern unsigned int memorystatus_jetsam_snapshot_size;
+extern uint64_t memorystatus_jetsam_snapshot_last_timestamp;
+extern uint64_t memorystatus_jetsam_snapshot_timeout;
+#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries
+#define JETSAM_SNAPSHOT_TIMEOUT_SECS 30
+
+/* General memorystatus stuff */
+
+extern uint64_t memorystatus_sysprocs_idle_delay_time;
+extern uint64_t memorystatus_apps_idle_delay_time;
+
 /* State */
 #define kMemorystatusSuspended        0x01
 #define kMemorystatusFrozen           0x02
@@ -201,6 +228,7 @@ typedef struct memorystatus_freeze_entry {
 #define kMemorystatusTracked          0x08
 #define kMemorystatusSupportsIdleExit 0x10
 #define kMemorystatusDirty            0x20
+#define kMemorystatusAssertion        0x40
 
 /*
  * Jetsam exit reason definitions - related to memorystatus
@@ -223,8 +251,8 @@ typedef struct memorystatus_freeze_entry {
 #define JETSAM_REASON_ZONE_MAP_EXHAUSTION                                       10
 #define JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING                     11
 #define JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE        12
-
-#define JETSAM_REASON_MEMORYSTATUS_MAX  JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE
+#define JETSAM_REASON_LOWSWAP                                   13
+#define JETSAM_REASON_MEMORYSTATUS_MAX  JETSAM_REASON_LOWSWAP
 
 /*
  * Jetsam exit reason definitions - not related to memorystatus
@@ -246,9 +274,14 @@ enum {
        kMemorystatusKilledZoneMapExhaustion                    = JETSAM_REASON_ZONE_MAP_EXHAUSTION,
        kMemorystatusKilledVMCompressorThrashing                = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING,
        kMemorystatusKilledVMCompressorSpaceShortage    = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE,
+       kMemorystatusKilledLowSwap                      = JETSAM_REASON_LOWSWAP,
 };
 
-/* For backwards compatibility */
+/*
+ * For backwards compatibility
+ * Keeping these around for external users (e.g. ReportCrash, Ariadne).
+ * TODO: Remove once they stop using these.
+ */
 #define kMemorystatusKilledDiagnostic           kMemorystatusKilledDiskSpaceShortage
 #define kMemorystatusKilledVMThrashing          kMemorystatusKilledVMCompressorThrashing
 #define JETSAM_REASON_MEMORY_VMTHRASHING        JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING
@@ -289,6 +322,10 @@ int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags, void *bu
 #endif /* DEVELOPMENT || DEBUG */
 #endif /* CONFIG_FREEZE */
 
+#define MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE      21   /* Query if the lenient mode for aggressive jetsam is enabled. */
+
+#define MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT   22   /* Used by DYLD to increase the jetsam active and inactive limits, when using roots */
+
 /* Commands that act on a group of processes */
 #define MEMORYSTATUS_CMD_GRP_SET_PROPERTIES           100
 
@@ -362,6 +399,11 @@ typedef struct memorystatus_priority_properties {
        uint64_t user_data;
 } memorystatus_priority_properties_t;
 
+/*
+ * Inform the kernel that setting the priority property is driven by assertions.
+ */
+#define MEMORYSTATUS_SET_PRIORITY_ASSERTION     0x1
+
 /*
  * For use with memorystatus_control:
  * MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES
@@ -374,6 +416,12 @@ typedef struct memorystatus_memlimit_properties {
        uint32_t memlimit_inactive_attr;
 } memorystatus_memlimit_properties_t;
 
+typedef struct memorystatus_memlimit_properties2 {
+       memorystatus_memlimit_properties_t v1;
+       uint32_t memlimit_increase;             /* jetsam memory limit increase (in MB) for active and inactive states */
+       uint32_t memlimit_increase_bytes;       /* bytes used to determine the jetsam memory limit increase, for active and inactive states */
+} memorystatus_memlimit_properties2_t;
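A hedged userland sketch of reading the extended structure back through memorystatus_control(); it assumes MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES fills the v2 layout when handed the larger buffer, as the _increase fields suggest.

    #include <sys/kern_memorystatus.h>

    static int
    example_get_memlimits(pid_t pid, memorystatus_memlimit_properties2_t *out)
    {
            return memorystatus_control(MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES,
                pid, 0, out, sizeof(*out));
    }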
+
 #define MEMORYSTATUS_MEMLIMIT_ATTR_FATAL        0x1     /* if set, exceeding the memlimit is fatal */
 
 #ifdef XNU_KERNEL_PRIVATE
@@ -414,7 +462,6 @@ typedef struct memorystatus_memlimit_properties {
 #define P_MEMSTAT_FREEZE_IGNORE        0x00000040 /* Process was evaluated by freezer and will be ignored till the next time it goes active and does something */
 #define P_MEMSTAT_PRIORITYUPDATED      0x00000080 /* Process had its jetsam priority updated */
 #define P_MEMSTAT_FOREGROUND           0x00000100 /* Process is in the FG jetsam band...unused??? */
-#define P_MEMSTAT_DIAG_SUSPENDED       0x00000200 /* ...unused??? */
 #define P_MEMSTAT_REFREEZE_ELIGIBLE    0x00000400 /* Process was once thawed i.e. its state was brought back from disk. It is now refreeze eligible.*/
 #define P_MEMSTAT_MANAGED              0x00000800 /* Process is managed by assertiond i.e. is either application or extension */
 #define P_MEMSTAT_INTERNAL             0x00001000 /* Process is a system-critical-not-be-jetsammed process i.e. launchd */
@@ -423,24 +470,73 @@ typedef struct memorystatus_memlimit_properties {
 #define P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL         0x00008000   /* if set, exceeding limit is fatal when the process is inactive */
 #define P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND      0x00010000   /* if set, the process will go into this band & stay there when in the background instead
                                                                *  of the aging bands and/or the IDLE band. */
+#define P_MEMSTAT_PRIORITY_ASSERTION              0x00020000   /* jetsam priority is being driven by an assertion */
+
+
+/*
+ * p_memstat_relaunch_flags holds the expected relaunch behavior of the
+ * process after it is jetsammed.
+ */
+#define P_MEMSTAT_RELAUNCH_UNKNOWN      0x0
+#define P_MEMSTAT_RELAUNCH_LOW          0x1
+#define P_MEMSTAT_RELAUNCH_MED          0x2
+#define P_MEMSTAT_RELAUNCH_HIGH         0x4
+
+/*
+ * Checking the p_memstat_state almost always requires the proc_list_lock
+ * because the jetsam thread could be on the other core changing the state.
+ *
+ * App -- almost always managed by a system process; always has dirty tracking OFF; can include extensions too.
+ * System Process -- not managed by anybody; always has dirty tracking ON; can include extensions (here) too.
+ */
+#define isApp(p)     ((p->p_memstat_state & P_MEMSTAT_MANAGED) || !(p->p_memstat_dirty & P_DIRTY_TRACK))
+#define isSysProc(p) (!(p->p_memstat_state & P_MEMSTAT_MANAGED) || (p->p_memstat_dirty & P_DIRTY_TRACK))
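A short sketch of the locking discipline the comment above calls for (call site hypothetical):

    static void
    example_classify(proc_t p)
    {
            proc_list_lock();       /* guards p_memstat_state, per the comment */
            if (isApp(p)) {
                    /* ... app path: dirty tracking is off ... */
            } else {
                    /* ... isSysProc(p): dirty tracking is on ... */
            }
            proc_list_unlock();
    }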
+
+#define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1)
+
+typedef struct memstat_bucket {
+       TAILQ_HEAD(, proc) list;
+       int count;
+       int relaunch_high_count;
+} memstat_bucket_t;
+
+extern memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
+
+/*
+ * Table that expresses the probability of a process
+ * being used in the next hour.
+ */
+typedef struct memorystatus_internal_probabilities {
+       char proc_name[MAXCOMLEN + 1];
+       int use_probability;
+} memorystatus_internal_probabilities_t;
+
+extern memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table;
+extern size_t memorystatus_global_probabilities_size;
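A sketch of how the table might be consulted, assuming memorystatus_global_probabilities_size is a byte count; the lookup helper is hypothetical and the real consumers live in kern_memorystatus.c.

    static int
    example_use_probability(const char *name)
    {
            size_t n = memorystatus_global_probabilities_size /
                sizeof(memorystatus_internal_probabilities_t);

            for (size_t i = 0; i < n; i++) {
                    if (strncmp(memorystatus_global_probabilities_table[i].proc_name,
                        name, MAXCOMLEN) == 0) {
                            return memorystatus_global_probabilities_table[i].use_probability;
                    }
            }
            return -1;      /* no entry: probability unknown */
    }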
+
 
 extern void memorystatus_init(void) __attribute__((section("__TEXT, initcode")));
 
 extern void memorystatus_init_at_boot_snapshot(void);
 
 extern int memorystatus_add(proc_t p, boolean_t locked);
-extern int memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective,
+extern int memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t is_assertion, boolean_t effective,
     boolean_t update_memlimit, int32_t memlimit_active, boolean_t memlimit_active_is_fatal,
     int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal);
 
-extern int memorystatus_remove(proc_t p, boolean_t locked);
+/* Remove this process from jetsam bands for killing or freezing.
+ * The proc_list_lock is held by the caller.
+ * @param p: The process to remove.
+ * @return: 0 if successful; EAGAIN if the process can't be removed right now (because it's being frozen); ESRCH if the process is not in a jetsam band.
+ */
+extern int memorystatus_remove(proc_t p);
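Per the contract documented above, a hypothetical call site looks like:

    proc_list_lock();
    int error = memorystatus_remove(p);
    proc_list_unlock();
    if (error == EAGAIN) {
            /* p is mid-freeze; retry the removal later. */
    }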
 
 int memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t opflags, int priority, boolean_t effective_now);
-
+int memorystatus_relaunch_flags_update(proc_t p, int relaunch_flags);
 
 extern int memorystatus_dirty_track(proc_t p, uint32_t pcontrol);
 extern int memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol);
-extern int memorystatus_dirty_get(proc_t p);
+extern int memorystatus_dirty_get(proc_t p, boolean_t locked);
 extern int memorystatus_dirty_clear(proc_t p, uint32_t pcontrol);
 
 extern int memorystatus_on_terminate(proc_t p);
@@ -463,6 +559,11 @@ void memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_i
 void memorystatus_on_ledger_footprint_exceeded(int warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal);
 void proc_memstat_terminated(proc_t p, boolean_t set);
 void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit);
+
+#if __arm64__
+void memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase);
+#endif /* __arm64__ */
+
 #endif /* CONFIG_MEMORYSTATUS */
 
 int memorystatus_get_pressure_status_kdp(void);
@@ -472,9 +573,6 @@ int memorystatus_get_pressure_status_kdp(void);
 typedef enum memorystatus_policy {
        kPolicyDefault        = 0x0,
        kPolicyMoreFree       = 0x1,
-       kPolicyDiagnoseAll    = 0x2,
-       kPolicyDiagnoseFirst  = 0x4,
-       kPolicyDiagnoseActive = (kPolicyDiagnoseAll | kPolicyDiagnoseFirst),
 } memorystatus_policy_t;
 
 boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
@@ -487,48 +585,27 @@ void memorystatus_fast_jetsam_override(boolean_t enable_override);
 
 #endif /* CONFIG_JETSAM */
 
+/* These printf()s are very verbose; enable them by defining
+ * MEMORYSTATUS_DEBUG_LOG.
+ */
+#if MEMORYSTATUS_DEBUG_LOG
+#define MEMORYSTATUS_DEBUG(cond, format, ...)                \
+       do {                                                  \
+               if (cond) { printf(format, ##__VA_ARGS__); }  \
+       } while (0)
+#else
+#define MEMORYSTATUS_DEBUG(cond, format, ...)
+#endif
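For example, with MEMORYSTATUS_DEBUG_LOG defined the line below prints; without it the macro compiles away entirely (pid and band are hypothetical locals):

    MEMORYSTATUS_DEBUG(1, "memorystatus: moving pid %d to band %d\n", pid, band);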
+
 boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
 boolean_t memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async);
 void memorystatus_pages_update(unsigned int pages_avail);
-
 boolean_t memorystatus_idle_exit_from_VM(void);
-
-
-#ifdef CONFIG_FREEZE
-
-#define FREEZE_PAGES_MIN   ( 8 * 1024 * 1024 / PAGE_SIZE)
-#define FREEZE_PAGES_MAX   (32 * 1024 * 1024 / PAGE_SIZE)
-
-#define FREEZE_SUSPENDED_THRESHOLD_DEFAULT 4
-#define FREEZE_PROCESSES_MAX               20
-
-#define FREEZE_DAILY_MB_MAX_DEFAULT       1024
-#define FREEZE_DEGRADATION_BUDGET_THRESHOLD     25 //degraded perf. when the daily budget left falls below this threshold percentage
-
-#define MAX_FROZEN_SHARED_MB_PERCENT 10 /* max shared MB calculated as percent of system task limit. */
-#define MAX_FROZEN_PROCESS_DEMOTIONS 2  /* max demotions of frozen processes into IDLE band done daily. */
-#define MIN_THAW_DEMOTION_THRESHOLD  5  /* min # of thaws required for a process to be safe from demotion. */
-#define MIN_THAW_REFREEZE_THRESHOLD  3  /* min # of global thaws needed for us to consider refreezing these processes. */
-
-typedef struct throttle_interval_t {
-       uint32_t mins;
-       uint32_t burst_multiple;
-       uint32_t pageouts;
-       uint32_t max_pageouts;
-       mach_timespec_t ts;
-} throttle_interval_t;
-
-extern boolean_t memorystatus_freeze_enabled;
-extern int memorystatus_freeze_wakeup;
-
-extern void memorystatus_freeze_init(void) __attribute__((section("__TEXT, initcode")));
-extern int  memorystatus_freeze_process_sync(proc_t p);
-
-#if DEVELOPMENT || DEBUG
-#define FREEZER_CONTROL_GET_STATUS      (1)
-#endif /* DEVELOPMENT || DEBUG */
-
-#endif /* CONFIG_FREEZE */
+proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search);
+proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search);
+void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages);
+void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state);
+void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check);
 
 #if VM_PRESSURE_EVENTS
 
diff --git a/bsd/sys/kern_memorystatus_freeze.h b/bsd/sys/kern_memorystatus_freeze.h
new file mode 100644 (file)
index 0000000..6c5a8b6
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2006-2018 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef SYS_MEMORYSTATUS_FREEZE_H
+#define SYS_MEMORYSTATUS_FREEZE_H
+
+#include <stdint.h>
+#include <sys/time.h>
+#include <sys/proc.h>
+#include <sys/param.h>
+#include <sys/kern_memorystatus.h>
+
+typedef struct memorystatus_freeze_entry {
+       int32_t pid;
+       uint32_t flags;
+       uint32_t pages;
+} memorystatus_freeze_entry_t;
+
+#ifdef XNU_KERNEL_PRIVATE
+
+extern unsigned long freeze_threshold_percentage;
+extern unsigned int memorystatus_frozen_count;
+extern unsigned int memorystatus_frozen_processes_max;
+extern unsigned int memorystatus_frozen_shared_mb;
+extern unsigned int memorystatus_frozen_shared_mb_max;
+extern unsigned int memorystatus_freeze_shared_mb_per_process_max; /* Max. MB allowed per process to be freezer-eligible. */
+extern unsigned int memorystatus_freeze_private_shared_pages_ratio; /* Ratio of private:shared pages for a process to be freezer-eligible. */
+extern unsigned int memorystatus_suspended_count;
+extern unsigned int memorystatus_thaw_count;
+extern unsigned int memorystatus_refreeze_eligible_count; /* # of processes currently thawed i.e. have state on disk & in-memory */
+
+void memorystatus_freeze_init(void);
+extern int  memorystatus_freeze_process_sync(proc_t p);
+
+#ifdef CONFIG_FREEZE
+
+#define FREEZE_PAGES_MIN   ( 8 * 1024 * 1024 / PAGE_SIZE)
+#define FREEZE_PAGES_MAX   (max_task_footprint_mb == 0 ? INT_MAX : (max_task_footprint_mb << (20 - PAGE_SHIFT)))
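A worked instance of the new cap, assuming 16 KB pages (PAGE_SHIFT == 14):

    /*
     * max_task_footprint_mb = 2048
     * FREEZE_PAGES_MAX      = 2048 << (20 - 14) = 131072 pages == 2 GB,
     * i.e. the task footprint limit expressed in pages.  With
     * max_task_footprint_mb == 0 the cap degenerates to INT_MAX
     * (effectively unlimited), where the old definition pinned it
     * at 32 MB worth of pages.
     */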
+
+#define FREEZE_SUSPENDED_THRESHOLD_DEFAULT 4
+#define FREEZE_PROCESSES_MAX               20
+
+#define FREEZE_DAILY_MB_MAX_DEFAULT       1024
+#define FREEZE_DEGRADATION_BUDGET_THRESHOLD     25 /* degraded performance when the remaining daily budget falls below this threshold percentage */
+
+#define MAX_FROZEN_SHARED_MB_PERCENT 10
+#define MAX_FROZEN_PROCESS_DEMOTIONS 2
+#define MIN_THAW_DEMOTION_THRESHOLD  5
+#define MIN_THAW_REFREEZE_THRESHOLD  3  /* min # of global thaws needed for us to consider refreezing these processes. */
+
+typedef struct throttle_interval_t {
+       uint32_t mins;
+       uint32_t burst_multiple;
+       uint32_t pageouts;
+       uint32_t max_pageouts;
+       mach_timespec_t ts;
+} throttle_interval_t;
+
+extern boolean_t memorystatus_freeze_enabled;
+extern int memorystatus_freeze_wakeup;
+
+/* Thresholds */
+extern unsigned int memorystatus_freeze_threshold;
+extern unsigned int memorystatus_freeze_pages_min;
+extern unsigned int memorystatus_freeze_pages_max;
+extern unsigned int memorystatus_freeze_suspended_threshold;
+extern unsigned int memorystatus_freeze_daily_mb_max;
+extern uint64_t     memorystatus_freeze_budget_pages_remaining; /* remaining # of pages that can be frozen to disk */
+extern boolean_t memorystatus_freeze_degradation; /* protected by the freezer mutex; signals we are in a degraded freeze mode */
+
+extern unsigned int memorystatus_max_frozen_demotions_daily;
+extern unsigned int memorystatus_thaw_count_demotion_threshold;
+
+#if DEVELOPMENT || DEBUG
+#define FREEZER_CONTROL_GET_STATUS      (1)
+#endif /* DEVELOPMENT || DEBUG */
+
+extern int memorystatus_freeze_jetsam_band; /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */
+
+boolean_t memorystatus_freeze_thread_should_run(void);
+int memorystatus_set_process_is_freezable(pid_t pid, boolean_t is_freezable);
+int memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable);
+int memorystatus_freezer_control(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval);
+
+#endif /* CONFIG_FREEZE */
+
+#endif /* XNU_KERNEL_PRIVATE */
+
+#endif /* SYS_MEMORYSTATUS_FREEZE_H */
diff --git a/bsd/sys/kern_memorystatus_notify.h b/bsd/sys/kern_memorystatus_notify.h
new file mode 100644 (file)
index 0000000..ee6c5a0
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2006-2018 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef SYS_MEMORYSTATUS_NOTIFY_H
+#define SYS_MEMORYSTATUS_NOTIFY_H
+
+#include <stdint.h>
+#include <sys/proc.h>
+#include <sys/param.h>
+
+#if VM_PRESSURE_EVENTS
+
+extern vm_pressure_level_t memorystatus_vm_pressure_level;
+extern boolean_t memorystatus_hwm_candidates;
+
+boolean_t memorystatus_warn_process(pid_t pid, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t exceeded);
+int memorystatus_send_note(int event_code, void *data, size_t data_length);
+void memorystatus_send_low_swap_note(void);
+void consider_vm_pressure_events(void);
+
+#if CONFIG_MEMORYSTATUS
+
+int memorystatus_low_mem_privileged_listener(uint32_t op_flags);
+int memorystatus_send_pressure_note(int pid);
+boolean_t memorystatus_is_foreground_locked(proc_t p);
+boolean_t memorystatus_bg_pressure_eligible(proc_t p);
+void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit);
+extern void memorystatus_issue_fg_band_notify(void);
+
+#endif /* CONFIG_MEMORYSTATUS */
+
+#if DEBUG
+#define VM_PRESSURE_DEBUG(cond, format, ...)                 \
+       do {                                                  \
+               if (cond) { printf(format, ##__VA_ARGS__); }  \
+       } while (0)
+#else
+#define VM_PRESSURE_DEBUG(cond, format, ...)
+#endif
+
+#endif /* VM_PRESSURE_EVENTS */
+
+#endif /* SYS_MEMORYSTATUS_NOTIFY_H */
diff --git a/bsd/sys/kern_sysctl.h b/bsd/sys/kern_sysctl.h
new file mode 100644 (file)
index 0000000..72d7c82
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2019 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_SYSCTL_H_
+#define _KERN_SYSCTL_H_
+
+#include <mach/mach_types.h>
+
+typedef struct _vm_object_query_data_ vm_object_query_data_t;
+typedef struct _vm_object_query_data_ *vm_object_query_t;
+
+struct _vm_object_query_data_ {
+       vm_object_id_t object_id;
+       mach_vm_size_t virtual_size;
+       mach_vm_size_t resident_size;
+       mach_vm_size_t wired_size;
+       mach_vm_size_t reusable_size;
+       mach_vm_size_t compressed_size;
+       struct {
+               uint64_t vo_no_footprint : 1; /* object not included in footprint */
+               uint64_t vo_ledger_tag   : 3; /* object ledger tag */
+               uint64_t purgable        : 2; /* object "purgable" state #defines */
+       };
+};
+
+typedef struct _vmobject_list_output_ vmobject_list_output_data_t;
+typedef struct _vmobject_list_output_ *vmobject_list_output_t;
+
+struct _vmobject_list_output_ {
+       int64_t entries; /* int64_t for alignment reasons, instead of int32_t */
+       vm_object_query_data_t data[0];
+};
+#endif /* _KERN_SYSCTL_H_ */
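A sketch of walking the flexible-array result buffer; the sysctl that fills it is not shown here, and the caller and formatting are illustrative.

    static void
    example_dump(const vmobject_list_output_data_t *out)
    {
            for (int64_t i = 0; i < out->entries; i++) {
                    const vm_object_query_data_t *q = &out->data[i];
                    printf("object %llu: resident %llu, wired %llu, compressed %llu\n",
                        (unsigned long long)q->object_id,
                        (unsigned long long)q->resident_size,
                        (unsigned long long)q->wired_size,
                        (unsigned long long)q->compressed_size);
            }
    }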
index efb737aa702eeae36cf0a9c2b8c76f8245e8cc02..b6ce1fc87358c8550fc89e1221524124fe6d52ba 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 #ifdef BSD_KERNEL_PRIVATE
 #include <stdbool.h>
+#include <kern/locks.h>
 
 /* Global variables for the kernel. */
 
 /* 1.1 */
 extern long hostid;
 extern char hostname[MAXHOSTNAMELEN];
-extern int hostnamelen;
+extern lck_mtx_t hostname_lock;
 extern char domainname[MAXHOSTNAMELEN];
-extern int domainnamelen;
+extern lck_mtx_t domainname_lock;
 
 /* 1.2 */
 extern int stathz;              /* statistics clock's frequency */
index f43d1f0c2953114367acbf632a7ea83607530f53..f5fbe196d6fd13906142e8949391c162d4c43801 100644 (file)
@@ -55,11 +55,13 @@ struct ucred;
 typedef struct ucred * ucred_t;
 #endif
 
+#if defined(KERNEL) || !defined(_SYS_MOUNT_H_) /* also defined in mount.h */
 struct mount;
 typedef struct mount * mount_t;
 
 struct vnode;
 typedef struct vnode * vnode_t;
+#endif
 
 struct proc;
 typedef struct proc * proc_t;
@@ -104,8 +106,10 @@ typedef struct file * file_t;
 #ifndef __LP64__
 typedef struct ucred * ucred_t;
 #endif
+#if defined(KERNEL) || !defined(_SYS_MOUNT_H_) /* also defined in mount.h */
 typedef struct mount * mount_t;
 typedef struct vnode * vnode_t;
+#endif
 typedef struct proc * proc_t;
 typedef struct uio * uio_t;
 typedef struct user_iovec * user_iovec_t;
index 76d960422a584a2687e4c89fd44f55457f3c0ca4..3e142976198a52f0173b6280bf162f6f31f00356 100644 (file)
@@ -1053,7 +1053,7 @@ extern void mbuf_inbound_modified(mbuf_t mbuf);
  *               There are a number of operations that are performed in hardware,
  *               such as calculating checksums. This function will perform in
  *               software the various operations that were scheduled to be done
- *               in hardware. Future operations may include IPSec processing or
+ *               in hardware. Future operations may include IPsec processing or
  *               vlan support. If you are redirecting a packet to a new interface
  *               which may not have the same hardware support or encapsulating
  *               the packet, you should call this function to force the stack to
@@ -1911,6 +1911,27 @@ extern errno_t mbuf_get_flowid(mbuf_t mbuf, u_int16_t *flowid);
  */
 extern errno_t mbuf_set_flowid(mbuf_t mbuf, u_int16_t flowid);
 
+/*!
+ *       @function mbuf_get_keepalive_flag
+ *       @discussion Tell whether the packet is a keep-alive packet.
+ *       @param mbuf The mbuf representing the packet.
+ *       @param is_keepalive A pointer that returns the truth value.
+ *       @result 0 upon success, otherwise the errno error.  If the mbuf
+ *               packet header does not have valid data bytes, the error
+ *               code will be EINVAL.
+ */
+extern errno_t mbuf_get_keepalive_flag(mbuf_t mbuf, boolean_t *is_keepalive);
+
+/*!
+ *       @function mbuf_set_keepalive_flag
+ *       @discussion Set or clear the packet's keep-alive flag.
+ *       @param mbuf The mbuf representing the packet.
+ *       @param is_keepalive The boolean value to set.
+ *       @result 0 upon success, otherwise the errno error.  If the mbuf
+ *               packet header does not have valid data bytes, the error
+ *               code will be EINVAL.
+ */
+extern errno_t mbuf_set_keepalive_flag(mbuf_t mbuf, boolean_t is_keepalive);
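Usage sketch for the new accessors; m is assumed to carry a valid packet header, per the EINVAL note above.

    static void
    example_keepalive(mbuf_t m)
    {
            boolean_t is_ka = FALSE;

            if (mbuf_get_keepalive_flag(m, &is_ka) == 0 && is_ka) {
                    /* ... treat as a keep-alive probe ... */
            }
            /* Clear the flag before re-injecting the packet. */
            (void)mbuf_set_keepalive_flag(m, FALSE);
    }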
 
 #endif /* KERNEL_PRIVATE */
 
index aa5a89f262f92cfe721a9a2f7816ec2328257150..7eaa3367a5a703fd9f601a8db6fc787626effe8a 100644 (file)
 #include <sys/kernel_types.h>
 #include <sys/socket.h>
 
+#ifndef PRIVATE
+#include <Availability.h>
+#define __NKE_API_DEPRECATED __API_DEPRECATED("Network Kernel Extension KPI is deprecated", macos(10.4, 10.15))
+#else
+#define __NKE_API_DEPRECATED
+#endif /* PRIVATE */
+
 __BEGIN_DECLS
 
 struct timeval;
@@ -114,7 +121,8 @@ extern errno_t sock_accept_internal(socket_t so, struct sockaddr *from, int from
        (cookie), (new_so))
 #else
 extern errno_t sock_accept(socket_t so, struct sockaddr *from, int fromlen,
-    int flags, sock_upcall callback, void *cookie, socket_t *new_so);
+    int flags, sock_upcall callback, void *cookie, socket_t *new_so)
+__NKE_API_DEPRECATED;
 #endif /* KERNEL_PRIVATE */
 
 /*!
@@ -125,7 +133,8 @@ extern errno_t sock_accept(socket_t so, struct sockaddr *from, int fromlen,
  *       @param to The local address the socket should be bound to.
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t sock_bind(socket_t so, const struct sockaddr *to);
+extern errno_t sock_bind(socket_t so, const struct sockaddr *to)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_connect
@@ -141,7 +150,8 @@ extern errno_t sock_bind(socket_t so, const struct sockaddr *to);
  *       @result 0 on success, EINPROGRESS for a non-blocking connect that
  *               has not completed, otherwise the errno error.
  */
-extern errno_t sock_connect(socket_t so, const struct sockaddr *to, int flags);
+extern errno_t sock_connect(socket_t so, const struct sockaddr *to, int flags)
+__NKE_API_DEPRECATED;
 
 #ifdef KERNEL_PRIVATE
 /*
@@ -174,7 +184,8 @@ extern errno_t sock_connectwait(socket_t so, const struct timeval *tv);
  *       @result 0 on success otherwise the errno error.
  */
 extern errno_t sock_getpeername(socket_t so, struct sockaddr *peername,
-    int peernamelen);
+    int peernamelen)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_getsockname
@@ -186,7 +197,8 @@ extern errno_t sock_getpeername(socket_t so, struct sockaddr *peername,
  *       @result 0 on success otherwise the errno error.
  */
 extern errno_t sock_getsockname(socket_t so, struct sockaddr *sockname,
-    int socknamelen);
+    int socknamelen)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_getsockopt
@@ -199,7 +211,8 @@ extern errno_t sock_getsockname(socket_t so, struct sockaddr *sockname,
  *       @result 0 on success otherwise the errno error.
  */
 extern errno_t sock_getsockopt(socket_t so, int level, int optname,
-    void *optval, int *optlen);
+    void *optval, int *optlen)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_ioctl
@@ -209,7 +222,8 @@ extern errno_t sock_getsockopt(socket_t so, int level, int optname,
  *       @param argp The argument.
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t sock_ioctl(socket_t so, unsigned long request, void *argp);
+extern errno_t sock_ioctl(socket_t so, unsigned long request, void *argp)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_setsockopt
@@ -222,7 +236,8 @@ extern errno_t sock_ioctl(socket_t so, unsigned long request, void *argp);
  *       @result 0 on success otherwise the errno error.
  */
 extern errno_t sock_setsockopt(socket_t so, int level, int optname,
-    const void *optval, int optlen);
+    const void *optval, int optlen)
+__NKE_API_DEPRECATED;
 
 #ifdef KERNEL_PRIVATE
 /*
@@ -277,7 +292,8 @@ extern errno_t sock_receive_internal(socket_t, struct msghdr *, mbuf_t *,
  *       @param backlog The maximum length of the queue of pending connections.
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t sock_listen(socket_t so, int backlog);
+extern errno_t sock_listen(socket_t so, int backlog)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_receive
@@ -292,7 +308,8 @@ extern errno_t sock_listen(socket_t so, int backlog);
  *               would cause the thread to block, otherwise the errno error.
  */
 extern errno_t sock_receive(socket_t so, struct msghdr *msg, int flags,
-    size_t *recvdlen);
+    size_t *recvdlen)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_receivembuf
@@ -313,7 +330,8 @@ extern errno_t sock_receive(socket_t so, struct msghdr *msg, int flags,
  *               would cause the thread to block, otherwise the errno error.
  */
 extern errno_t sock_receivembuf(socket_t so, struct msghdr *msg, mbuf_t *data,
-    int flags, size_t *recvlen);
+    int flags, size_t *recvlen)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_send
@@ -328,7 +346,8 @@ extern errno_t sock_receivembuf(socket_t so, struct msghdr *msg, mbuf_t *data,
  *               would cause the thread to block, otherwise the errno error.
  */
 extern errno_t sock_send(socket_t so, const struct msghdr *msg, int flags,
-    size_t *sentlen);
+    size_t *sentlen)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_sendmbuf
@@ -345,7 +364,8 @@ extern errno_t sock_send(socket_t so, const struct msghdr *msg, int flags,
  *               Regardless of return value, the mbuf chain 'data' will be freed.
  */
 extern errno_t sock_sendmbuf(socket_t so, const struct msghdr *msg, mbuf_t data,
-    int flags, size_t *sentlen);
+    int flags, size_t *sentlen)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_shutdown
@@ -357,7 +377,8 @@ extern errno_t sock_sendmbuf(socket_t so, const struct msghdr *msg, mbuf_t data,
  *               SHUT_RDWR - shutdown both.
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t sock_shutdown(socket_t so, int how);
+extern errno_t sock_shutdown(socket_t so, int how)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_socket
@@ -382,7 +403,8 @@ extern errno_t sock_socket_internal(int domain, int type, int protocol,
        (callback), (cookie), (new_so))
 #else
 extern errno_t sock_socket(int domain, int type, int protocol,
-    sock_upcall callback, void *cookie, socket_t *new_so);
+    sock_upcall callback, void *cookie, socket_t *new_so)
+__NKE_API_DEPRECATED;
 #endif /* KERNEL_PRIVATE */
 
 /*!
@@ -393,7 +415,8 @@ extern errno_t sock_socket(int domain, int type, int protocol,
  *               using sock_close may leave a file descriptor pointing to the
  *               closed socket, resulting in undefined behavior.
  */
-extern void sock_close(socket_t so);
+extern void sock_close(socket_t so)
+__NKE_API_DEPRECATED;
 
 #ifdef KERNEL_PRIVATE
 /*
@@ -427,7 +450,8 @@ extern void sock_release(socket_t so);
  *       @param on Indicate whether or not the SS_PRIV flag should be set.
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t sock_setpriv(socket_t so, int on);
+extern errno_t sock_setpriv(socket_t so, int on)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_isconnected
@@ -435,7 +459,8 @@ extern errno_t sock_setpriv(socket_t so, int on);
  *       @param so The socket to check.
  *       @result 0 - socket is not connected. 1 - socket is connected.
  */
-extern int sock_isconnected(socket_t so);
+extern int sock_isconnected(socket_t so)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_isnonblocking
@@ -448,7 +473,8 @@ extern int sock_isconnected(socket_t so);
  *               If the parameter is non-zero, the socket will not block.
  *       @result 0 - socket will block. 1 - socket will not block.
  */
-extern int sock_isnonblocking(socket_t so);
+extern int sock_isnonblocking(socket_t so)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_gettype
@@ -462,7 +488,8 @@ extern int sock_isnonblocking(socket_t so);
  *       @param protocol The socket protocol. May be NULL.
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t sock_gettype(socket_t so, int *domain, int *type, int *protocol);
+extern errno_t sock_gettype(socket_t so, int *domain, int *type, int *protocol)
+__NKE_API_DEPRECATED;
 
 #ifdef KERNEL_PRIVATE
 /*
index e82a0f52f6e7f32b3683ab2f37fce08f28cda53e..5af14bec008e8a6114a1601211b0b4c4ea03afa7 100644 (file)
 #include <sys/kernel_types.h>
 #include <sys/kpi_socket.h>
 
+#ifndef PRIVATE
+#include <Availability.h>
+#define __NKE_API_DEPRECATED __API_DEPRECATED("Network Kernel Extension KPI is deprecated", macos(10.4, 10.15))
+#else
+#define __NKE_API_DEPRECATED
+#endif /* PRIVATE */
+
 struct sockaddr;
 
 /*!
@@ -577,7 +584,8 @@ extern errno_t sflt_register_internal(const struct sflt_filter *filter,
     sflt_register_internal((filter), (domain), (type), (protocol))
 #else
 extern errno_t sflt_register(const struct sflt_filter *filter, int domain,
-    int type, int protocol);
+    int type, int protocol)
+__NKE_API_DEPRECATED;
 #endif /* KERNEL_PRIVATE */
 
 /*!
@@ -589,7 +597,8 @@ extern errno_t sflt_register(const struct sflt_filter *filter, int domain,
  *       @param handle The sf_handle of the socket filter to unregister.
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t sflt_unregister(sflt_handle handle);
+extern errno_t sflt_unregister(sflt_handle handle)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sflt_attach
@@ -599,7 +608,8 @@ extern errno_t sflt_unregister(sflt_handle handle);
  *       @param handle The handle of the registered filter to be attached.
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t sflt_attach(socket_t socket, sflt_handle handle);
+extern errno_t sflt_attach(socket_t socket, sflt_handle handle)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sflt_detach
@@ -608,7 +618,8 @@ extern errno_t sflt_attach(socket_t socket, sflt_handle handle);
  *       @param handle The handle of the registered filter to be detached.
  *       @result 0 on success otherwise the errno error.
  */
-extern errno_t sflt_detach(socket_t socket, sflt_handle handle);
+extern errno_t sflt_detach(socket_t socket, sflt_handle handle)
+__NKE_API_DEPRECATED;
 
 /* Functions for manipulating sockets */
 /*
@@ -635,7 +646,8 @@ extern errno_t sflt_detach(socket_t socket, sflt_handle handle);
  *               mbuf.
  */
 extern errno_t sock_inject_data_in(socket_t so, const struct sockaddr *from,
-    mbuf_t data, mbuf_t control, sflt_data_flag_t flags);
+    mbuf_t data, mbuf_t control, sflt_data_flag_t flags)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sock_inject_data_out
@@ -652,7 +664,8 @@ extern errno_t sock_inject_data_in(socket_t so, const struct sockaddr *from,
  *               values are always freed regardless of return value.
  */
 extern errno_t sock_inject_data_out(socket_t so, const struct sockaddr *to,
-    mbuf_t data, mbuf_t control, sflt_data_flag_t flags);
+    mbuf_t data, mbuf_t control, sflt_data_flag_t flags)
+__NKE_API_DEPRECATED;
 
 
 /*
@@ -672,7 +685,8 @@ typedef u_int8_t sockopt_dir;
  *       @param sopt The socket option.
  *       @result sock_opt_get or sock_opt_set.
  */
-extern sockopt_dir sockopt_direction(sockopt_t sopt);
+extern sockopt_dir sockopt_direction(sockopt_t sopt)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sockopt_level
@@ -680,7 +694,8 @@ extern sockopt_dir sockopt_direction(sockopt_t sopt);
  *       @param sopt The socket option.
  *       @result The socket option level. See man 2 setsockopt
  */
-extern int sockopt_level(sockopt_t sopt);
+extern int sockopt_level(sockopt_t sopt)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sockopt_name
@@ -688,7 +703,8 @@ extern int sockopt_level(sockopt_t sopt);
  *       @param sopt The socket option.
  *       @result The socket option name. See man 2 setsockopt
  */
-extern int sockopt_name(sockopt_t sopt);
+extern int sockopt_name(sockopt_t sopt)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sockopt_valsize
@@ -696,7 +712,8 @@ extern int sockopt_name(sockopt_t sopt);
  *       @param sopt The socket option.
  *       @result The length, in bytes, of the data.
  */
-extern size_t sockopt_valsize(sockopt_t sopt);
+extern size_t sockopt_valsize(sockopt_t sopt)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sockopt_copyin
@@ -706,7 +723,8 @@ extern size_t sockopt_valsize(sockopt_t sopt);
  *       @param length The number of bytes to copy.
  *       @result An errno error or zero upon success.
  */
-extern errno_t sockopt_copyin(sockopt_t sopt, void *data, size_t length);
+extern errno_t sockopt_copyin(sockopt_t sopt, void *data, size_t length)
+__NKE_API_DEPRECATED;
 
 /*!
  *       @function sockopt_copyout
@@ -716,7 +734,8 @@ extern errno_t sockopt_copyin(sockopt_t sopt, void *data, size_t length);
  *       @param length The number of bytes to copy.
  *       @result An errno error or zero upon success.
  */
-extern errno_t sockopt_copyout(sockopt_t sopt, void *data, size_t length);
+extern errno_t sockopt_copyout(sockopt_t sopt, void *data, size_t length)
+__NKE_API_DEPRECATED;
 
 __END_DECLS
 #endif /* __KPI_SOCKETFILTER__ */
index da52967a0772f7a6348e822fb65d3522e0771800..ab81a003ef139e9e69b0c7ecd61fdec88d1fc19f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2004-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -87,30 +87,23 @@ MALLOC_DECLARE(M_LOCKF);
  */
 TAILQ_HEAD(locklist, lockf);
 
-#pragma pack(4)
-
 struct lockf {
        short   lf_flags;           /* Semantics: F_POSIX, F_FLOCK, F_WAIT */
-       short   lf_type;         /* Lock type: F_RDLCK, F_WRLCK */
+       short   lf_type;            /* Lock type: F_RDLCK, F_WRLCK */
+#if IMPORTANCE_INHERITANCE
+       int     lf_boosted;         /* Is the owner of the lock boosted */
+#endif
        off_t   lf_start;           /* Byte # of the start of the lock */
        off_t   lf_end;             /* Byte # of the end of the lock (-1=EOF) */
        caddr_t lf_id;              /* Id of the resource holding the lock */
        struct  lockf **lf_head;    /* Back pointer to the head of the lockf list */
-       struct vnode *lf_vnode;     /* Back pointer to the inode */
+       struct  vnode *lf_vnode;    /* Back pointer to the inode */
        struct  lockf *lf_next;     /* Pointer to the next lock on this inode */
        struct  locklist lf_blkhd;  /* List of requests blocked on this lock */
        TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */
-#if IMPORTANCE_INHERITANCE
-       int     lf_boosted;         /* Is the owner of the lock boosted */
-#endif
-       struct proc *lf_owner;      /* The proc that did the SETLK, if known */
+       struct  proc *lf_owner;     /* The proc that did the SETLK, if known */
 };
 
-#pragma pack()
-
-/* Maximum length of sleep chains to traverse to try and detect deadlock. */
-#define MAXDEPTH 50
-
 __BEGIN_DECLS
 
 #ifdef KERNEL_PRIVATE
index 327b033041c8e076b80d6ca7ffd15722f9120486..35c8e30b05fafad3840537f8c1b03243f0f0db3b 100644 (file)
@@ -27,8 +27,6 @@
 #ifndef _SYS_LOCKSTAT_H
 #define _SYS_LOCKSTAT_H
 
-/* #pragma ident       "@(#)lockstat.h 1.6     05/06/08 SMI" */
-
 #ifdef  __cplusplus
 extern "C" {
 #endif
diff --git a/bsd/sys/log_data.h b/bsd/sys/log_data.h
new file mode 100644 (file)
index 0000000..dde6185
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * @OSF_COPYRIGHT@
+ */
+
+#ifndef _DATA_LOG_H_
+#define _DATA_LOG_H_
+
+/*
+ * rdar://problem/48252465
+ * This header should be exported only to dexts.
+ */
+int log_data_as_kernel(unsigned int tag, unsigned int flags, void *buffer, unsigned int size);
+
+#endif /* _DATA_LOG_H_ */
index ef47e37c87e0c812739d345242429661efbc02d6..8e98dee4b4130144ffc59a75ae4033f5a6b5ffe8 100755 (executable)
@@ -34,8 +34,10 @@ fi
 SDKROOT="$1"
 OUTPUT="$2"
 
-if [ ! -x "${SDKROOT}/usr/local/libexec/availability.pl" ] ; then
-    echo "Unable to locate ${SDKROOT}/usr/local/libexec/availability.pl (or not executable)" >&2
+AVAILABILITY_PL="${SDKROOT}/${DRIVERKITROOT}/usr/local/libexec/availability.pl"
+
+if [ ! -x "${AVAILABILITY_PL}" ] ; then
+    echo "Unable to locate ${AVAILABILITY_PL} (or not executable)" >&2
     exit 1
 fi
            
@@ -74,7 +76,7 @@ cat <<EOF
 
 EOF
 
-for ver in $(${SDKROOT}/usr/local/libexec/availability.pl --ios) ; do
+for ver in $(${AVAILABILITY_PL} --ios) ; do
     ver_major=${ver%.*}
     ver_minor=${ver#*.}
     value=$(printf "%d%02d00" ${ver_major} ${ver_minor})
@@ -87,7 +89,7 @@ for ver in $(${SDKROOT}/usr/local/libexec/availability.pl --ios) ; do
     echo ""
 done
 
-for ver in $(${SDKROOT}/usr/local/libexec/availability.pl --macosx) ; do
+for ver in $(${AVAILABILITY_PL} --macosx) ; do
     set -- $(echo "$ver" | tr '.' ' ')
     ver_major=$1
     ver_minor=$2
index afa2424a8107885a81763e3246f3bf30078fb44d..ff2653d9641c2478a9acf679a501be764c15bd08 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -210,6 +210,8 @@ struct m_drvaux_tag {
 #define PF_TAG_HDR_INET                 0x8     /* hdr points to IPv4 */
 #define PF_TAG_HDR_INET6                0x10    /* hdr points to IPv6 */
 #endif /* PF_ECN */
+#define PF_TAG_REASSEMBLED              0x20    /* pkt reassembled by PF */
+#define PF_TAG_REFRAGMENTED             0x40    /* pkt refragmented by PF */
 /*
  * PF mbuf tag
  */
@@ -223,6 +225,17 @@ struct pf_mtag {
 #endif /* PF_ECN */
 };
 
+/*
+ * PF fragment tag
+ */
+struct pf_fragment_tag {
+       uint32_t ft_id;     /* fragment id */
+       uint16_t ft_hdrlen; /* header length of reassembled pkt */
+       uint16_t ft_unfragpartlen; /* length of the per-fragment headers */
+       uint16_t ft_extoff; /* last extension header offset or 0 */
+       uint16_t ft_maxlen; /* maximum fragment payload length */
+};
+
 /*
  * TCP mbuf tag
  */
@@ -311,7 +324,7 @@ struct driver_mtag_ {
  * Protocol specific mbuf tag (at most one protocol metadata per mbuf).
  *
  * Care must be taken to ensure that they are mutually exclusive, e.g.
- * IPSec policy ID implies no TCP segment offload (which is fine given
+ * IPsec policy ID implies no TCP segment offload (which is fine given
  * that the former is used on the virtual ipsec interface that does
  * not advertise the TSO capability.)
  */
@@ -433,7 +446,7 @@ struct pkthdr {
 #define bufstatus_if    _pkt_bsr.if_data
 #define bufstatus_sndbuf        _pkt_bsr.sndbuf_data
        };
-       u_int64_t pkt_timestamp;        /* enqueue time */
+       u_int64_t pkt_timestamp;        /* TX: enqueue time, RX: receive timestamp */
 
        /*
         * Tags (external and built-in)
@@ -481,6 +494,20 @@ struct pkthdr {
 #define FLOWSRC_PF              3       /* flow ID generated by PF */
 #define FLOWSRC_CHANNEL         4       /* flow ID generated by channel */
 
+/*
+ * FLOWSRC_MPKL_INPUT is not a true flow data source and is used for
+ * multi-layer packet logging.  We usurp the pkt_flowsrc field because the
+ * mbuf packet header ran out of space; since pkt_flowsrc is normally used
+ * only on output, we assume we can safely overwrite its usual semantics.
+ * This value is meant to be used on incoming packets from a lower-level
+ * protocol to pass information to some upper-level protocol.  When
+ * FLOWSRC_MPKL_INPUT is set, the following fields are used:
+ * - pkt_proto: the IP protocol ID of the lower level protocol
+ * - pkt_flowid: the identifier of the packet at the lower protocol.
+ * For example ESP would set pkt_proto to IPPROTO_ESP and pkt_flowid to the SPI.
+ */
+
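
The ESP case the comment names, as a hedged sketch (the helper is hypothetical; pkt_flowsrc/pkt_proto/pkt_flowid are the fields documented above):

static void
esp_mark_mpkl_input(struct mbuf *m, uint32_t spi)
{
	m->m_pkthdr.pkt_flowsrc = FLOWSRC_MPKL_INPUT;
	m->m_pkthdr.pkt_proto = IPPROTO_ESP;  /* lower level protocol */
	m->m_pkthdr.pkt_flowid = spi;         /* the SPI identifies the packet */
}
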
 /*
  * Packet flags.  Unlike m_flags, all packet flags are copied along when
  * copying m_pkthdr, i.e. no equivalent of M_COPYFLAGS here.  These flags
@@ -660,11 +687,14 @@ struct mbuf {
  */
 
 /* VLAN tag present */
-#define CSUM_VLAN_TAG_VALID     0x10000         /* vlan_tag field is valid */
+#define CSUM_VLAN_TAG_VALID     0x00010000      /* vlan_tag field is valid */
+
+/* checksum start adjustment has been done */
+#define CSUM_ADJUST_DONE        0x00020000
 
 /* TCP Segment Offloading requested on this mbuf */
-#define CSUM_TSO_IPV4           0x100000        /* This mbuf needs to be segmented by the NIC */
-#define CSUM_TSO_IPV6           0x200000        /* This mbuf needs to be segmented by the NIC */
+#define CSUM_TSO_IPV4           0x00100000      /* This mbuf needs to be segmented by the NIC */
+#define CSUM_TSO_IPV6           0x00200000      /* This mbuf needs to be segmented by the NIC */
 
 #define TSO_IPV4_OK(_ifp, _m)                                           \
     (((_ifp)->if_hwassist & IFNET_TSO_IPV4) &&                          \
@@ -1017,7 +1047,7 @@ struct name {                                                   \
 #define MBUFQ_LAST(head)                                        \
        (((head)->mq_last == &MBUFQ_FIRST(head)) ? NULL :       \
        ((struct mbuf *)(void *)((char *)(head)->mq_last -      \
-           (size_t)(&MBUFQ_NEXT((struct mbuf *)0)))))
+            __builtin_offsetof(struct mbuf, m_nextpkt))))
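
The new form swaps the classic null-pointer offsetof idiom, which is undefined behavior under modern compilers, for the builtin; a hedged equivalence check (offsetof per <stddef.h>):

#include <stddef.h>
_Static_assert(__builtin_offsetof(struct mbuf, m_nextpkt) ==
    offsetof(struct mbuf, m_nextpkt), "MBUFQ_LAST offset");
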
 
 #define max_linkhdr     P2ROUNDUP(_max_linkhdr, sizeof (u_int32_t))
 #define max_protohdr    P2ROUNDUP(_max_protohdr, sizeof (u_int32_t))
@@ -1228,7 +1258,7 @@ struct mbuf;
 #define M_COPYM_MUST_COPY_HDR   3       /* MUST copy pkthdr from old to new */
 #define M_COPYM_MUST_MOVE_HDR   4       /* MUST move pkthdr from old to new */
 
-extern void m_freem(struct mbuf *);
+extern void m_freem(struct mbuf *) __XNU_INTERNAL(m_freem);
 extern u_int64_t mcl_to_paddr(char *);
 extern void m_adj(struct mbuf *, int);
 extern void m_cat(struct mbuf *, struct mbuf *);
@@ -1379,7 +1409,7 @@ __private_extern__ caddr_t m_16kalloc(int);
 __private_extern__ void m_16kfree(caddr_t, u_int, caddr_t);
 __private_extern__ struct mbuf *m_m16kget(struct mbuf *, int);
 __private_extern__ int m_reinit(struct mbuf *, int);
-__private_extern__ struct mbuf *m_free(struct mbuf *);
+__private_extern__ struct mbuf *m_free(struct mbuf *) __XNU_INTERNAL(m_free);
 __private_extern__ struct mbuf *m_getclr(int, int);
 __private_extern__ struct mbuf *m_getptr(struct mbuf *, int, int *);
 __private_extern__ unsigned int m_length(struct mbuf *);
@@ -1478,6 +1508,7 @@ enum {
        KERNEL_TAG_TYPE_IPSEC                   = 10,
        KERNEL_TAG_TYPE_DRVAUX                  = 11,
        KERNEL_TAG_TYPE_CFIL_UDP                = 13,
+       KERNEL_TAG_TYPE_PF_REASS                = 14,
 };
 
 /* Packet tag routines */
index 0c5aa5255a7fad108bb31dda02a4e457541330fa..db058126b75229b6321d1309ab1fce65dd8d5f28 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -246,6 +246,7 @@ typedef struct mcache_obj {
 
 typedef struct mcache_bkt {
        void            *bkt_next;      /* next bucket in list */
+       struct mcache_bkttype *bkt_type; /* bucket type */
        void            *bkt_obj[1];    /* one or more objects */
 } mcache_bkt_t;
 
@@ -373,7 +374,7 @@ typedef struct mcache_audit {
        } mca_trns[MCA_TRN_MAX];
 } mcache_audit_t;
 
-__private_extern__ int assfail(const char *, const char *, int);
+__private_extern__ int assfail(const char *, const char *, int) __abortlike;
 __private_extern__ void mcache_init(void);
 __private_extern__ unsigned int mcache_getflags(void);
 __private_extern__ unsigned int mcache_cache_line_size(void);
@@ -407,7 +408,7 @@ __private_extern__ void mcache_audit_free_verify_set(mcache_audit_t *,
     void *, size_t, size_t);
 __private_extern__ char *mcache_dump_mca(mcache_audit_t *);
 __private_extern__ void mcache_audit_panic(mcache_audit_t *, void *, size_t,
-    int64_t, int64_t);
+    int64_t, int64_t) __abortlike;
 
 extern int32_t total_sbmb_cnt;
 extern int32_t total_sbmb_cnt_floor;
index bd0b0618f52bbe170de62d9fcdfbb43a74d3fea2..abe0b93c05dab6ffa1dbdd05f7fd107abd0451b1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define MAP_RESILIENT_CODESIGN  0x2000 /* no code-signing failures */
 #define MAP_RESILIENT_MEDIA     0x4000 /* no backing-store failures */
 
+#if !defined(CONFIG_EMBEDDED)
+#define MAP_32BIT       0x8000          /* Return virtual addresses <4G only: Requires entitlement */
+#endif  /* !defined(CONFIG_EMBEDDED) */
+
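
A hedged userspace sketch of the new flag; behavior without the entitlement the comment mentions is an assumption:

#include <sys/mman.h>

void *low = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
    MAP_ANON | MAP_PRIVATE | MAP_32BIT, -1, 0);
/* on success, (uintptr_t)low < 4GB; presumed to fail without the entitlement */
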
 #endif  /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */
 
 /*
index cfca2afeef3f524d9ec1de931c9081a69e1d650e..6ec648972d27b70dba500497c332bbae29b11bf5 100644 (file)
@@ -1,9 +1,49 @@
+/*
+ * Copyright (c) 2017-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
 #ifndef SYS_MONOTONIC_H
 #define SYS_MONOTONIC_H
 
 #include <stdbool.h>
 #include <stdint.h>
 #include <sys/cdefs.h>
+
+__BEGIN_DECLS
+
+struct mt_cpu_inscyc {
+       uint64_t mtci_instructions;
+       uint64_t mtci_cycles;
+};
+
+__END_DECLS
+
+#if !MACH_KERNEL_PRIVATE
+
 #include <sys/ioccom.h>
 
 __BEGIN_DECLS
@@ -13,10 +53,17 @@ __BEGIN_DECLS
  */
 
 #define MT_IOC(x) _IO('m', (x))
-
 #define MT_IOC_RESET MT_IOC(0)
-
 #define MT_IOC_ADD MT_IOC(1)
+#define MT_IOC_ENABLE MT_IOC(2)
+#define MT_IOC_COUNTS MT_IOC(3)
+#define MT_IOC_GET_INFO MT_IOC(4)
+
+__END_DECLS
+
+#endif /* !MACH_KERNEL_PRIVATE */
+
+__BEGIN_DECLS
 
 struct monotonic_config {
        uint64_t event;
@@ -34,19 +81,12 @@ union monotonic_ctl_add {
        } out;
 };
 
-/*
- * - Consider a separate IOC for disable -- to avoid the copyin to determine
- *   which way to set it.
- */
-#define MT_IOC_ENABLE MT_IOC(2)
-
 union monotonic_ctl_enable {
        struct {
                bool enable;
        } in;
 };
 
-#define MT_IOC_COUNTS MT_IOC(3)
 
 union monotonic_ctl_counts {
        struct {
@@ -58,7 +98,6 @@ union monotonic_ctl_counts {
        } out;
 };
 
-#define MT_IOC_GET_INFO MT_IOC(4)
 
 union monotonic_ctl_info {
        struct {
@@ -67,13 +106,19 @@ union monotonic_ctl_info {
        } out;
 };
 
+__END_DECLS
+
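
A hedged userspace sketch of driving these ioctls; the device path, the event value, and the layout of the unions' 'in' members are assumptions:

#include <fcntl.h>
#include <stdbool.h>
#include <sys/ioctl.h>

int fd = open("/dev/monotonic/core", O_RDWR);  /* path is an assumption */
union monotonic_ctl_add add = {
	.in = { .config = { .event = 0x2 } },  /* event encoding assumed */
};
(void)ioctl(fd, MT_IOC_ADD, &add);             /* add.out: counter slot */
union monotonic_ctl_enable en = { .in = { .enable = true } };
(void)ioctl(fd, MT_IOC_ENABLE, &en);
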
 #if XNU_KERNEL_PRIVATE
 
+#if MONOTONIC
+
 #include <kern/monotonic.h>
 #include <machine/monotonic.h>
 #include <sys/kdebug.h>
 #include <kern/locks.h>
 
+__BEGIN_DECLS
+
 #ifdef MT_CORE_INSTRS
 #define COUNTS_INSTRS __counts[MT_CORE_INSTRS]
 #else /* defined(MT_CORE_INSTRS) */
@@ -131,6 +176,10 @@ union monotonic_ctl_info {
 #define MT_KDBG_TMPTH_START(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_START)
 #define MT_KDBG_TMPTH_END(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_END)
 
+extern lck_grp_t * mt_lock_grp;
+
+int mt_dev_init(void);
+
 struct mt_device {
        const char *mtd_name;
        int(*const mtd_init)(struct mt_device *dev);
@@ -148,12 +197,10 @@ typedef struct mt_device *mt_device_t;
 
 extern struct mt_device mt_devices[];
 
-extern lck_grp_t *mt_lock_grp;
+__END_DECLS
 
-int mt_dev_init(void);
+#endif /* MONOTONIC */
 
 #endif /* XNU_KERNEL_PRIVATE */
 
-__END_DECLS
-
 #endif /* !defined(SYS_MONOTONIC_H) */
index bff53904b556a74baa57e46fb33c9a07ea565589..c3e884bada8e789255eaa5a6d1a46e310ca3f4e5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -74,6 +74,7 @@
 #include <sys/appleapiopts.h>
 #include <sys/cdefs.h>
 #include <sys/attr.h>           /* needed for vol_capabilities_attr_t */
+#include <os/base.h>
 
 #ifndef KERNEL
 #include <stdint.h>
 #define MNAMELEN        90              /* length of buffer for returned name */
 #endif /* __DARWIN_64_BIT_INO_T */
 
+#define MNT_EXT_ROOT_DATA_VOL      0x00000001      /* Data volume of root volume group */
+
 #define __DARWIN_STRUCT_STATFS64 { \
        uint32_t        f_bsize;        /* fundamental file system block size */ \
        int32_t         f_iosize;       /* optimal transfer block size */ \
        char            f_fstypename[MFSTYPENAMELEN];   /* fs type name */ \
        char            f_mntonname[MAXPATHLEN];        /* directory on which mounted */ \
        char            f_mntfromname[MAXPATHLEN];      /* mounted filesystem */ \
-       uint32_t        f_reserved[8];  /* For future use */ \
+       uint32_t        f_flags_ext;    /* extended flags */ \
+       uint32_t        f_reserved[7];  /* For future use */ \
 }
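
A hedged sketch of consuming the new extended-flags word from userspace; the volume path is illustrative and the 64-bit-inode statfs layout is assumed:

#include <sys/param.h>
#include <sys/mount.h>

struct statfs sfs;
if (statfs("/System/Volumes/Data", &sfs) == 0 &&
    (sfs.f_flags_ext & MNT_EXT_ROOT_DATA_VOL) != 0) {
	/* the data volume of the root volume group */
}
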
 
 #if !__DARWIN_ONLY_64_BIT_INO_T
@@ -299,6 +303,12 @@ struct vfs_attr {
  */
 #define MNT_EXPORTED    0x00000100      /* file system is exported */
 
+/*
+ * Denotes storage which can be removed from the system by the user.
+ */
+
+#define MNT_REMOVABLE   0x00000200
+
 /*
  * MAC labeled / "quarantined" flag
  */
@@ -322,6 +332,7 @@ struct vfs_attr {
 #define MNT_MULTILABEL  0x04000000      /* MAC support for individual labels */
 #define MNT_NOATIME             0x10000000      /* disable update of file access time */
 #define MNT_SNAPSHOT    0x40000000 /* The mount is a snapshot */
+#define MNT_STRICTATIME 0x80000000      /* enable strict update of file access time */
 #ifdef BSD_KERNEL_PRIVATE
 /* #define MNT_IMGSRC_BY_INDEX 0x20000000 see sys/imgsrc.h */
 #endif /* BSD_KERNEL_PRIVATE */
@@ -337,11 +348,11 @@ struct vfs_attr {
 #define MNT_VISFLAGMASK (MNT_RDONLY    | MNT_SYNCHRONOUS | MNT_NOEXEC  | \
                        MNT_NOSUID      | MNT_NODEV     | MNT_UNION     | \
                        MNT_ASYNC       | MNT_EXPORTED  | MNT_QUARANTINE | \
-                       MNT_LOCAL       | MNT_QUOTA | \
+                       MNT_LOCAL       | MNT_QUOTA | MNT_REMOVABLE | \
                        MNT_ROOTFS      | MNT_DOVOLFS   | MNT_DONTBROWSE | \
                        MNT_IGNORE_OWNERSHIP | MNT_AUTOMOUNTED | MNT_JOURNALED | \
                        MNT_NOUSERXATTR | MNT_DEFWRITE  | MNT_MULTILABEL | \
-                       MNT_NOATIME | MNT_SNAPSHOT | MNT_CPROTECT)
+                       MNT_NOATIME | MNT_STRICTATIME | MNT_SNAPSHOT | MNT_CPROTECT)
 /*
  * External filesystem command modifier flags.
  * Unmount can use the MNT_FORCE flag.
@@ -381,9 +392,13 @@ struct vfs_attr {
 #define MNT_WAIT        1       /* synchronized I/O file integrity completion */
 #define MNT_NOWAIT      2       /* start all I/O, but do not wait for it */
 #define MNT_DWAIT       4       /* synchronized I/O data integrity completion */
+#ifdef KERNEL
+/* only for VFS_SYNC */
+#define MNT_VOLUME      8       /* sync on a single mounted filesystem */
+#endif
 
 
-#ifndef KERNEL
+#if !defined(KERNEL) && !defined(_KERN_SYS_KERNELTYPES_H_) /* also defined in kernel_types.h */
 struct mount;
 typedef struct mount * mount_t;
 struct vnode;
@@ -489,7 +504,8 @@ struct netfs_status {
 #define VQ_QUOTA        0x1000  /* a user quota has been hit */
 #define VQ_NEARLOWDISK          0x2000  /* Above lowdisk and below desired disk space */
 #define VQ_DESIRED_DISK         0x4000  /* the desired disk space */
-#define VQ_FLAG8000     0x8000  /* placeholder */
+#define VQ_FREE_SPACE_CHANGE    0x8000  /* free disk space has significantly changed */
+#define VQ_FLAG10000    0x10000  /* placeholder */
 
 
 #ifdef KERNEL
@@ -772,6 +788,18 @@ struct fs_snapshot_root_args {
 };
 #define VFSIOC_ROOT_SNAPSHOT  _IOW('V', 3, struct fs_snapshot_root_args)
 
+typedef struct fs_role_mount_args {
+       mount_t root_mp;
+       uint32_t mount_role;
+} fs_role_mount_args_t;
+
+OS_ENUM(vfs_roles, uint32_t,
+    VFS_SYSTEM_ROLE = 1,
+    VFS_VM_ROLE = 8,
+    VFS_DATA_ROLE = 64);
+
+#define VFSIOC_MOUNT_BYROLE  _IOW('V', 4, fs_role_mount_args_t)
+
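
A hedged kernel-side sketch of the new ioctl; which mount_t receives it and the VFS_IOCTL dispatch path are assumptions:

fs_role_mount_args_t args = {
	.root_mp = rootmp,            /* hypothetical system-volume mount */
	.mount_role = VFS_DATA_ROLE,
};
int error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&args, 0, ctx);
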
 #endif /* KERNEL */
 
 /*
@@ -780,6 +808,9 @@ struct fs_snapshot_root_args {
 #ifdef PRIVATE
 #define VFS_ITERATE_TAIL_FIRST  (1 << 0)
 #define VFS_ITERATE_CB_DROPREF  (1 << 1)        // Callback will drop the iterref
+#define VFS_ITERATE_NOSKIP_UNMOUNT  (1 << 2)    /* Callback will be made on filesystems being unmounted.
+                                               * The callback cannot make any calls
+                                               * into the filesystem when this is set. */
 #endif /* PRIVATE */
 
 /*
@@ -1280,6 +1311,7 @@ void *  vfs_mntlabel(mount_t mp); /* Safe to cast to "struct label*"; returns "v
 void    vfs_setcompoundopen(mount_t mp);
 uint64_t vfs_throttle_mask(mount_t mp);
 int vfs_isswapmount(mount_t mp);
+boolean_t vfs_context_is_dataless_manipulator(vfs_context_t);
 
 struct vnode_trigger_info;
 
@@ -1353,6 +1385,13 @@ int     vfs_settriggercallback(fsid_t *fsid, vfs_trigger_callback_t vtc, void *d
 /* tags a volume as not supporting extended readdir for NFS exports */
 void mount_set_noreaddirext(mount_t);
 
+/*!
+ *  @function vfs_get_statfs64
+ *  @abstract Get the same information as vfs_statfs(), but in a format suitable
+ *  for copying to userland.
+ */
+void vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs);
+
 #endif  /* KERNEL_PRIVATE */
 __END_DECLS
 
index 2bb9d8d112c0da258149b82ef9c1ecbf7bf42c83..05e522c99b21b23db953cbf59024d7d137ce53c2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -105,60 +105,60 @@ typedef uint32_t  pending_io_t;
 TAILQ_HEAD(vnodelst, vnode);
 
 struct mount {
-       TAILQ_ENTRY(mount) mnt_list;            /* mount list */
-       int32_t         mnt_count;              /* reference on the mount */
-       lck_mtx_t       mnt_mlock;              /* mutex that protects mount point */
-       struct vfsops   *mnt_op;                /* operations on fs */
-       struct vfstable *mnt_vtable;            /* configuration info */
-       struct vnode    *mnt_vnodecovered;      /* vnode we mounted on */
-       struct vnodelst mnt_vnodelist;          /* list of vnodes this mount */
-       struct vnodelst mnt_workerqueue;                /* list of vnodes this mount */
-       struct vnodelst mnt_newvnodes;          /* list of vnodes this mount */
-       uint32_t                mnt_flag;               /* flags */
-       uint32_t                mnt_kern_flag;          /* kernel only flags */
-       uint32_t                mnt_compound_ops;       /* Available compound operations */
-       uint32_t                mnt_lflag;                      /* mount life cycle flags */
-       uint32_t                mnt_maxsymlinklen;      /* max size of short symlink */
-       struct vfsstatfs        mnt_vfsstat;            /* cache of filesystem stats */
-       qaddr_t         mnt_data;               /* private data */
+       TAILQ_ENTRY(mount)      mnt_list;                   /* mount list */
+       int32_t                 mnt_count;                  /* reference on the mount */
+       lck_mtx_t               mnt_mlock;                  /* mutex that protects mount point */
+       const struct vfsops     *mnt_op;                    /* operations on fs */
+       struct vfstable         *mnt_vtable;                /* configuration info */
+       struct vnode            *mnt_vnodecovered;          /* vnode we mounted on */
+       struct vnodelst         mnt_vnodelist;              /* list of vnodes this mount */
+       struct vnodelst         mnt_workerqueue;            /* list of vnodes this mount */
+       struct vnodelst         mnt_newvnodes;              /* list of vnodes this mount */
+       uint32_t                mnt_flag;                   /* flags */
+       uint32_t                mnt_kern_flag;              /* kernel only flags */
+       uint32_t                mnt_compound_ops;           /* Available compound operations */
+       uint32_t                mnt_lflag;                  /* mount life cycle flags */
+       uint32_t                mnt_maxsymlinklen;          /* max size of short symlink */
+       struct vfsstatfs        mnt_vfsstat;                /* cache of filesystem stats */
+       qaddr_t                 mnt_data;                   /* private data */
        /* Cached values of the IO constraints for the device */
-       uint32_t        mnt_maxreadcnt;         /* Max. byte count for read */
-       uint32_t        mnt_maxwritecnt;        /* Max. byte count for write */
-       uint32_t        mnt_segreadcnt;         /* Max. segment count for read */
-       uint32_t        mnt_segwritecnt;        /* Max. segment count for write */
-       uint32_t        mnt_maxsegreadsize;     /* Max. segment read size  */
-       uint32_t        mnt_maxsegwritesize;    /* Max. segment write size */
-       uint32_t        mnt_alignmentmask;      /* Mask of bits that aren't addressable via DMA */
-       uint32_t        mnt_devblocksize;       /* the underlying device block size */
-       uint32_t        mnt_ioqueue_depth;      /* the maxiumum number of commands a device can accept */
-       uint32_t        mnt_ioscale;            /* scale the various throttles/limits imposed on the amount of I/O in flight */
-       uint32_t        mnt_ioflags;            /* flags for  underlying device */
-       uint32_t        mnt_minsaturationbytecount;     /* if non-zero, mininum amount of writes (in bytes) needed to max out throughput */
-       pending_io_t    mnt_pending_write_size __attribute__((aligned(sizeof(pending_io_t))));  /* byte count of pending writes */
-       pending_io_t    mnt_pending_read_size  __attribute__((aligned(sizeof(pending_io_t))));  /* byte count of pending reads */
-       struct timeval  mnt_last_write_issued_timestamp;
-       struct timeval  mnt_last_write_completed_timestamp;
-       int64_t         mnt_max_swappin_available;
-
-       lck_rw_t        mnt_rwlock;             /* mutex readwrite lock */
-       lck_mtx_t       mnt_renamelock;         /* mutex that serializes renames that change shape of tree */
-       vnode_t         mnt_devvp;              /* the device mounted on for local file systems */
-       uint32_t        mnt_devbsdunit;         /* the BSD unit number of the device */
-       uint64_t        mnt_throttle_mask;      /* the throttle mask of what devices will be affected by I/O from this mnt */
-       void            *mnt_throttle_info;     /* used by the throttle code */
-       int32_t         mnt_crossref;           /* refernces to cover lookups  crossing into mp */
-       int32_t         mnt_iterref;            /* refernces to cover iterations; drained makes it -ve  */
+       uint32_t                mnt_maxreadcnt;             /* Max. byte count for read */
+       uint32_t                mnt_maxwritecnt;            /* Max. byte count for write */
+       uint32_t                mnt_segreadcnt;             /* Max. segment count for read */
+       uint32_t                mnt_segwritecnt;            /* Max. segment count for write */
+       uint32_t                mnt_maxsegreadsize;         /* Max. segment read size  */
+       uint32_t                mnt_maxsegwritesize;        /* Max. segment write size */
+       uint32_t                mnt_alignmentmask;          /* Mask of bits that aren't addressable via DMA */
+       uint32_t                mnt_devblocksize;           /* the underlying device block size */
+       uint32_t                mnt_ioqueue_depth;          /* the maximum number of commands a device can accept */
+       uint32_t                mnt_ioscale;                /* scale the various throttles/limits imposed on the amount of I/O in flight */
+       uint32_t                mnt_ioflags;                /* flags for underlying device */
+       uint32_t                mnt_minsaturationbytecount; /* if non-zero, minimum amount of writes (in bytes) needed to max out throughput */
+       pending_io_t            mnt_pending_write_size __attribute__((aligned(sizeof(pending_io_t))));  /* byte count of pending writes */
+       pending_io_t            mnt_pending_read_size  __attribute__((aligned(sizeof(pending_io_t))));  /* byte count of pending reads */
+       struct timeval          mnt_last_write_issued_timestamp;
+       struct timeval          mnt_last_write_completed_timestamp;
+       int64_t                 mnt_max_swappin_available;
+
+       lck_rw_t                mnt_rwlock;                 /* mutex readwrite lock */
+       lck_mtx_t               mnt_renamelock;             /* mutex that serializes renames that change shape of tree */
+       vnode_t                 mnt_devvp;                  /* the device mounted on for local file systems */
+       uint32_t                mnt_devbsdunit;             /* the BSD unit number of the device */
+       uint64_t                mnt_throttle_mask;          /* the throttle mask of what devices will be affected by I/O from this mnt */
+       void                    *mnt_throttle_info;         /* used by the throttle code */
+       int32_t                 mnt_crossref;               /* references to cover lookups crossing into mp */
+       int32_t                 mnt_iterref;                /* references to cover iterations; drained makes it -ve */
 #if CONFIG_TRIGGERS
-       int32_t         mnt_numtriggers;        /* num of trigger vnodes for this mount */
-       vfs_trigger_callback_t *mnt_triggercallback;
-       void            *mnt_triggerdata;
+       int32_t                 mnt_numtriggers;            /* num of trigger vnodes for this mount */
+       vfs_trigger_callback_t  *mnt_triggercallback;
+       void                    *mnt_triggerdata;
 #endif
        /* XXX 3762912 hack to support HFS filesystem 'owner' */
-       uid_t           mnt_fsowner;
-       gid_t           mnt_fsgroup;
+       uid_t                   mnt_fsowner;
+       gid_t                   mnt_fsgroup;
 
-       struct label    *mnt_mntlabel;          /* MAC mount label */
-       struct label    *mnt_fslabel;           /* MAC default fs label */
+       struct label            *mnt_mntlabel;              /* MAC mount label */
+       struct label            *mnt_fslabel;               /* MAC default fs label */
 
        /*
         * cache the rootvp of the last mount point
@@ -174,14 +174,14 @@ struct mount {
         * we don't take an explicit long term reference
         * on it when we mount it
         */
-       vnode_t         mnt_realrootvp;
-       uint32_t        mnt_realrootvp_vid;
+       vnode_t                 mnt_realrootvp;
+       uint32_t                mnt_realrootvp_vid;
        /*
         * bumped each time a mount or unmount
         * occurs... its used to invalidate
         * 'mnt_realrootvp' from the cache
         */
-       uint32_t             mnt_generation;
+       uint32_t                mnt_generation;
        /*
         * if 'MNTK_AUTH_CACHE_TIMEOUT' is
         * set, then 'mnt_authcache_ttl' is
@@ -191,14 +191,14 @@ struct mount {
         * time-to-live for the cached lookup right for
         * volumes marked 'MNTK_AUTH_OPAQUE'.
         */
-       int             mnt_authcache_ttl;
-       char            fstypename_override[MFSTYPENAMELEN];
+       int                     mnt_authcache_ttl;
+       char                    fstypename_override[MFSTYPENAMELEN];
 
-       uint32_t        mnt_iobufinuse;
+       uint32_t                mnt_iobufinuse;
 
-       void *mnt_disk_conditioner_info;
+       void                    *mnt_disk_conditioner_info;
 
-       lck_mtx_t       mnt_iter_lock;          /* mutex that protects iteration of vnodes */
+       lck_mtx_t               mnt_iter_lock;              /* mutex that protects iteration of vnodes */
 };
 
 /*
@@ -216,6 +216,7 @@ struct mount {
 #define MNT_IOFLAGS_CSUNMAP_SUPPORTED   0x00000008
 #define MNT_IOFLAGS_SWAPPIN_SUPPORTED   0x00000010
 #define MNT_IOFLAGS_FUSION_DRIVE        0x00000020
+#define MNT_IOFLAGS_PERIPHERAL_DRIVE    0x00000040 /* External: Attached directly to the system (USB, TBT, FW, etc.) */
 
 /*
  * ioqueue depth for devices that don't report one
@@ -241,6 +242,7 @@ extern struct mount * dead_mountp;
  *             because the bits here were broken out from the high bits
  *             of the mount flags.
  */
+#define MNTK_SYSTEM             0x00000040     /* Volume associated with system volume (do not allow unmount) */
 #define MNTK_NOSWAP             0x00000080  /* swap files cannot be used on this mount */
 #define MNTK_SWAP_MOUNT         0x00000100      /* we are swapping to this mount */
 #define MNTK_DENY_READDIREXT 0x00000200 /* Deny Extended-style readdir's for this volume */
@@ -306,7 +308,7 @@ typedef struct fhandle  fhandle_t;
  * mount time to identify the requested filesystem.
  */
 struct vfstable {
-       struct  vfsops *vfc_vfsops;     /* filesystem operations vector */
+       const struct vfsops *vfc_vfsops;/* filesystem operations vector */
        char    vfc_name[MFSNAMELEN];   /* filesystem type name */
        int     vfc_typenum;            /* historic filesystem type number */
        int     vfc_refcount;           /* number mounted of this type */
@@ -375,10 +377,10 @@ struct user64_statfs {
        user64_long_t   f_ffree;                /* free file nodes in fs */
        fsid_t          f_fsid;                 /* file system id */
        uid_t           f_owner;                /* user that mounted the filesystem */
-       short           f_reserved1;    /* spare for later */
+       short           f_reserved1;            /* spare for later */
        short           f_type;                 /* type of filesystem */
-       user64_long_t       f_flags;            /* copy of mount exported flags */
-       user64_long_t f_reserved2[2];   /* reserved for future use */
+       user64_long_t   f_flags;                /* copy of mount exported flags */
+       user64_long_t   f_reserved2[2];         /* reserved for future use */
        char            f_fstypename[MFSNAMELEN]; /* fs type name */
        char            f_mntonname[MNAMELEN];  /* directory on which mounted */
        char            f_mntfromname[MNAMELEN];/* mounted filesystem */
@@ -442,6 +444,9 @@ int  mount_refdrain(mount_t);
 /* vfs_rootmountalloc should be kept as a private api */
 errno_t vfs_rootmountalloc(const char *, const char *, mount_t *mpp);
 
+int vfs_mount_rosv_data(void);
+int vfs_mount_vm(void);
+
 int     vfs_mountroot(void);
 void    vfs_unmountall(void);
 int     safedounmount(struct mount *, int, vfs_context_t);
@@ -460,11 +465,16 @@ void mount_iterdrop(mount_t);
 void mount_iterdrain(mount_t);
 void mount_iterreset(mount_t);
 
+/* These flags are used as flag bits in the `internal_flags` argument to mount_common */
 /* Private NFS spi */
 #define KERNEL_MOUNT_NOAUTH             0x01 /* Don't check the UID of the directory we are mounting on */
 #define KERNEL_MOUNT_PERMIT_UNMOUNT     0x02 /* Allow (non-forced) unmounts by users other than the one who mounted the volume */
 /* used by snapshot mounting SPI */
 #define KERNEL_MOUNT_SNAPSHOT           0x04 /* Mounting a snapshot */
+#define KERNEL_MOUNT_DATAVOL            0x08 /* mount the data volume */
+#define KERNEL_MOUNT_VMVOL              0x10 /* mount the VM volume */
+
+
 #if NFSCLIENT || DEVFS || ROUTEFS
 /*
  * NOTE: kernel_mount() does not force MNT_NOSUID, MNT_NOEXEC, or MNT_NODEC for non-privileged
index ea81e399cbaf6cc8695e29139a92e8e72c8459d6..816e849c44e74d94ec7632c25d6b4c769197f731 100644 (file)
@@ -179,6 +179,7 @@ struct nameidata {
 #define AUDITVNPATH2    0x00200000 /* audit the path/vnode info */
 #define USEDVP          0x00400000 /* start the lookup at ndp.ni_dvp */
 #define CN_VOLFSPATH    0x00800000 /* user path was a volfs style path */
+#define CN_FIRMLINK_NOFOLLOW    0x01000000 /* Do not follow firm links */
 #define UNIONCREATED    0x02000000 /* union fs creation of vnode */
 #if NAMEDRSRCFORK
 #define CN_WANTSRSRCFORK 0x04000000
index c01074897928f3ee132c714a12a6644bc9b6c634..87907d17242875d16b927bc9c593f5950720b232 100644 (file)
 #include <sys/param.h>
 
 enum {
-       PERSONA_INVALID = 0,
-       PERSONA_GUEST   = 1,
-       PERSONA_MANAGED = 2,
-       PERSONA_PRIV    = 3,
-       PERSONA_SYSTEM  = 4,
-
-       PERSONA_TYPE_MAX = PERSONA_SYSTEM,
+       PERSONA_INVALID      = 0,
+       PERSONA_GUEST        = 1,
+       PERSONA_MANAGED      = 2,
+       PERSONA_PRIV         = 3,
+       PERSONA_SYSTEM       = 4,
+       PERSONA_DEFAULT      = 5,
+       PERSONA_SYSTEM_PROXY = 6,
+       PERSONA_SYS_EXT      = 7,
+       PERSONA_ENTERPRISE   = 8,
+
+       PERSONA_TYPE_MAX     = PERSONA_ENTERPRISE,
 };
 
 #define PERSONA_ID_NONE ((uid_t)-1)
@@ -62,11 +66,16 @@ struct kpersona_info {
 
 
 #define PERSONA_OP_ALLOC    1
-#define PERSONA_OP_DEALLOC  2
-#define PERSONA_OP_GET      3
-#define PERSONA_OP_INFO     4
-#define PERSONA_OP_PIDINFO  5
-#define PERSONA_OP_FIND     6
+#define PERSONA_OP_PALLOC   2
+#define PERSONA_OP_DEALLOC  3
+#define PERSONA_OP_GET      4
+#define PERSONA_OP_INFO     5
+#define PERSONA_OP_PIDINFO  6
+#define PERSONA_OP_FIND     7
+#define PERSONA_OP_GETPATH  8
+#define PERSONA_OP_FIND_BY_TYPE  9
+
+#define PERSONA_MGMT_ENTITLEMENT "com.apple.private.persona-mgmt"
 
 #ifndef KERNEL
 /*
@@ -91,6 +100,29 @@ struct kpersona_info {
  */
 int kpersona_alloc(struct kpersona_info *info, uid_t *id);
 
+/*
+ * kpersona_palloc: Allocate a new in-kernel persona with a directory
+ *                 pathname
+ *
+ * Parameters:
+ *       info: Pointer to persona info structure describing the
+ *             attributes of the persona to create / allocate.
+ *
+ *       path: Pointer to directory name that stores persona specific
+ *             data. Assumes path buffer length = MAXPATHLEN and is a
+ *             null-terminated string.
+ *
+ *         id: output: set to the ID of the created persona
+ *
+ * Note:
+ *      The 'persona_id' field of the 'info' parameter is ignored.
+ *
+ * Return:
+ *        != 0: ERROR
+ *        == 0: Success
+ */
+int kpersona_palloc(struct kpersona_info *info, uid_t *id, char path[MAXPATHLEN]);
+
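
A hedged userspace sketch of persona creation; kpersona_info member names other than those implied by this header are assumptions:

struct kpersona_info info = {
	.persona_info_version = PERSONA_INFO_V1,  /* version macro assumed */
	.persona_type = PERSONA_MANAGED,
};
uid_t id = PERSONA_ID_NONE;
if (kpersona_alloc(&info, &id) == 0) {
	/* 'id' now names the freshly created persona */
}
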
 /*
  * kpersona_dealloc: delete / destroy an in-kernel persona
  *
@@ -103,13 +135,15 @@ int kpersona_alloc(struct kpersona_info *info, uid_t *id);
  */
 int kpersona_dealloc(uid_t id);
 
-
 /*
  * kpersona_get: retrieve the persona with which the current thread is running
  *
+ * To find the proc's persona id use kpersona_pidinfo
+ *
  * Parameters:
- *         id: output: will be filled with current thread's persona
- *             (or current processes persona) on success.
+ *         id: output: will be filled with the persona id from the voucher adopted
+ *             on the current thread. If that voucher contains no persona information
+ *             or there is no such voucher, then it defaults to the proc's persona id.
  *
  * Return:
  *        < 0: Thread is not running under any persona
@@ -117,12 +151,29 @@ int kpersona_dealloc(uid_t id);
  */
 int kpersona_get(uid_t *id);
 
+/*
+ * kpersona_get_path: retrieve the given persona's path
+ *
+ * Parameters:
+ *         id: ID of the persona
+ *
+ *         path: output: filled in with path on success.
+ *               Assumes path buffer length = MAXPATHLEN
+ *
+ * Return:
+ *        < 0: Error
+ *          0: Success
+ */
+int kpersona_getpath(uid_t id, char path[MAXPATHLEN]);
 
 /*
  * kpersona_info: gather info about the given persona
  *
  * Parameters:
  *         id: ID of the persona to investigate
+ *             If set to 0, it uses the persona id from the voucher adopted on the current
+ *             thread. If that voucher contains no persona information or there is no
+ *             such voucher, then it defaults to the proc's persona id.
  *
  *       info: output: filled in with persona attributes on success.
  *
@@ -132,7 +183,6 @@ int kpersona_get(uid_t *id);
  */
 int kpersona_info(uid_t id, struct kpersona_info *info);
 
-
 /*
  * kpersona_pidinfo: gather persona info about the given PID
  *
@@ -147,7 +197,6 @@ int kpersona_info(uid_t id, struct kpersona_info *info);
  */
 int kpersona_pidinfo(pid_t pid, struct kpersona_info *info);
 
-
 /*
  * kpersona_find: lookup the kernel's UUID of a persona
  *
@@ -159,6 +208,8 @@ int kpersona_pidinfo(pid_t pid, struct kpersona_info *info);
  *             Set this to -1 to find personas by 'name'
  *
  *         id: output: the ID(s) matching the input parameters
+ *             This can be NULL
+ *
  *      idlen: input - size of 'id' buffer (in number of IDs)
  *             output - the total required size of the 'id' buffer
  *                      (in number of IDs) - may be larger than input size
@@ -170,6 +221,24 @@ int kpersona_pidinfo(pid_t pid, struct kpersona_info *info);
  *        >= 0: The number of IDs found to match the input parameters
  */
 int kpersona_find(const char *name, uid_t uid, uid_t *id, size_t *idlen);
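
The documented in/out sizing contract, sketched; the persona name is hypothetical:

#include <stdlib.h>

size_t len = 0;
(void)kpersona_find("TestPersona", (uid_t)-1, NULL, &len);  /* query count */
uid_t *ids = calloc(len, sizeof(uid_t));
int found = kpersona_find("TestPersona", (uid_t)-1, ids, &len);
/* found: number of matching IDs, or < 0 on error */
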
+
+/*
+ * kpersona_find_by_type: lookup the persona ids by type
+ *
+ * Parameters:
+ *  persona_type: Type of persona id (see enum)
+ *
+ *           id: output: the ID(s) matching the input parameters
+ *               This can be NULL
+ *
+ *        idlen: input - size of 'id' buffer (in number of IDs)
+ *               output - the total required size of the 'id' buffer
+ *                      (in number of IDs) - may be larger than input size
+ * Return:
+ *         < 0: ERROR
+ *        >= 0: The number of IDs found to match the input parameters
+ */
+int kpersona_find_by_type(int persona_type, uid_t *id, size_t *idlen);
 #endif /* !KERNEL */
 
 #ifdef KERNEL_PRIVATE
@@ -201,6 +270,7 @@ struct persona {
        uid_t        pna_id;
        int          pna_type;
        char         pna_login[MAXLOGNAME + 1];
+       char         *pna_path;
 
        kauth_cred_t pna_cred;
        uid_t        pna_pgid;
@@ -302,13 +372,32 @@ kauth_cred_t persona_get_cred(struct persona *persona);
 struct persona *persona_lookup(uid_t id);
 
 /*
- * returns non-zero on error, on success returns 0 and updates 'plen' to
- * total found (could be more than original value of 'plen')
+ * Search for personas based on name or uid
+ *
+ * Parameters:
+ *       name: Local login name of the persona.
+ *             Set this to NULL to find personas by 'uid'.
+ *
+ *        uid: UID of the persona.
+ *             Set this to -1 to find personas by 'name'
+ *
+ *    persona: output - array of persona pointers. Each non-NULL value
+ *             *must* be released with persona_put. This can be NULL.
+ *
+ *       plen: input - size of 'persona' buffer (in number of pointers)
+ *             output - the total required size of the 'persona' buffer (could be larger than input value)
+ *
+ * Return:
+ *           0: Success
+ *        != 0: failure (BSD errno value ESRCH or EINVAL)
  */
 int persona_find(const char *login, uid_t uid,
     struct persona **persona, size_t *plen);
 
-/* returns a reference to the persona tied to the current thread */
+/* returns a reference that must be released with persona_put() */
+struct persona *persona_proc_get(pid_t pid);
+
+/* returns a reference to the persona tied to the current thread (also uses adopted voucher) */
 struct persona *current_persona_get(void);
 
 /* get a reference to a persona structure */
@@ -317,6 +406,25 @@ struct persona *persona_get(struct persona *persona);
 /* release a reference to a persona structure */
 void persona_put(struct persona *persona);
 
+/*
+ * Search for personas of a given type, 'persona_type'.
+ *
+ * Parameters:
+ *   persona_type: Type of persona (see enum)
+ *
+ *        persona: output - array of persona pointers. Each non-NULL value
+ *        *must* be released with persona_put. This can be NULL.
+ *
+ *           plen: input - size of 'persona' buffer (in number of pointers)
+ *                 output - the total required size of the 'persona' buffer (could be larger than input value)
+ *
+ * Return:
+ *           0: Success
+ *        != 0: failure (BSD errno value ESRCH or EINVAL)
+ */
+int persona_find_by_type(int persona_type, struct persona **persona,
+    size_t *plen);
+
 #ifdef XNU_KERNEL_PRIVATE
 
 #if CONFIG_PERSONAS
@@ -326,17 +434,18 @@ void persona_put(struct persona *persona);
  * In-kernel persona API
  */
 extern uint32_t g_max_personas;
-extern struct persona *g_system_persona;
 
 void personas_bootstrap(void);
 
 struct persona *persona_alloc(uid_t id, const char *login,
-    int type, int *error);
+    int type, char *path, int *error);
 
 int persona_init_begin(struct persona *persona);
 void persona_init_end(struct persona *persona, int error);
 
 struct persona *persona_lookup_and_invalidate(uid_t id);
+int persona_verify_and_set_uniqueness(struct persona *persona);
+boolean_t persona_is_unique(struct persona *persona);
 
 static inline int
 proc_has_persona(proc_t p)
@@ -382,6 +491,9 @@ int persona_get_login(struct persona *persona, char login[MAXLOGNAME + 1]);
 /* returns a reference that must be released with persona_put() */
 struct persona *persona_proc_get(pid_t pid);
 
+int persona_find_all(const char *login, uid_t uid, int persona_type,
+    struct persona **persona, size_t *plen);
+
 #else /* !CONFIG_PERSONAS */
 
 static inline int
index 294be47beb1fd6bba9c3feb609371dfdd0e1485d..e53f05080a8ee21827aa07b7752579fbe3281927 100644 (file)
@@ -133,7 +133,7 @@ struct pipemapping {
 #define PIPE_LWANT      0x200   /* Process wants exclusive access to pointers/data. */
 #define PIPE_DIRECTW    0x400   /* Pipe direct write active. */
 #define PIPE_DIRECTOK   0x800   /* Direct mode ok. */
-#define PIPE_KNOTE      0x1000   /* Pipe has kernel events activated */
+// was  PIPE_KNOTE      0x1000
 #define PIPE_DRAIN      0x2000  /* Waiting for I/O to drop for a close.  Treated like EOF;
                                 *       only separate for easier debugging. */
 #define PIPE_WSELECT    0x4000  /* Some thread has done an FWRITE select on the pipe */
index cdeb994a4108abe69417a767bc87bd00fb2c2e88..940debadf69522d02deeaca73a06f10ad663d2bc 100644 (file)
@@ -93,6 +93,8 @@
 #define PRIV_PACKAGE_EXTENSIONS         1013    /* Push package extension list used by vn_path_package_check() */
 #define PRIV_TRIM_ACTIVE_FILE           1014    /* Allow freeing space out from under an active file  */
 #define PRIV_PROC_CPUMON_OVERRIDE       1015    /* Allow CPU usage monitor parameters less restrictive than default */
+#define PRIV_ENDPOINTSECURITY_CLIENT    1016    /* Allow EndpointSecurity clients to connect */
+#define PRIV_AUDIT_SESSION_PORT         1017    /* Obtain send-right for arbitrary audit session's port. */
 
 /*
  * Virtual memory privileges.
 #define PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED  10010   /* Extended multipath (more aggressive on cell) */
 #define PRIV_NET_RESTRICTED_ROUTE_NC_READ       10011   /* Enable route neighbor cache read operations */
 
+#define PRIV_NET_PRIVILEGED_CLIENT_ACCESS       10012   /* Allow client networking access on restricted platforms */
+#define PRIV_NET_PRIVILEGED_SERVER_ACCESS       10013   /* Allow server networking access on restricted platforms */
+
+#define PRIV_NET_VALIDATED_RESOLVER             10014   /* Privilege to sign DNS resolver results for validation */
+
+#define PRIV_NET_CUSTOM_PROTOCOL                10015   /* Privilege to use custom protocol APIs */
+#define PRIV_NET_PRIVILEGED_NECP_DROP_ALL_BYPASS 10016  /* Privilege to bypass NECP drop-all */
+#define PRIV_NET_PRIVILEGED_IPSEC_WAKE_PACKET   10017   /* Privilege to get IPsec wake packet */
+
 /*
  * IPv4 and IPv6 privileges.
  */
 #define PRIV_NETINET_RESERVEDPORT       11000   /* Bind low port number. */
+#define PRIV_NETINET_TCP_KA_OFFLOAD     11001   /* Can set TCP keep alive offload option */
 
 
 /*
 #define PRIV_VFS_MOVE_DATA_EXTENTS      14001   /* Allow F_MOVEDATAEXTENTS fcntl */
 #define PRIV_VFS_SNAPSHOT               14002   /* Allow create/rename/delete of snapshots */
 #define PRIV_VFS_SNAPSHOT_REVERT        14003   /* Allow reverting filesystem to a previous snapshot */
+#define PRIV_VFS_DATALESS_RESOLVER      14004   /* Allow registration as dataless file resolver */
+#define PRIV_VFS_DATALESS_MANIPULATION  14005   /* Allow process to inspect dataless directories / manipulate dataless objects */
 
 #define PRIV_APFS_EMBED_DRIVER          14100   /* Allow embedding an EFI driver into the APFS container */
 #define PRIV_APFS_FUSION_DEBUG      14101   /* Allow getting internal statistics and controlling the APFS Fusion container */
 #define PRIV_APFS_FUSION_ALLOW_PIN_FASTPROMOTE  14102   /* Allow changing pinned/fastPromote inode flags in APFS Fusion container */
+// #define PRIV_APFS_UNUSED              14103
+#define PRIV_APFS_SET_FREE_SPACE_CHANGE_THRESHOLD       14104   /* Allow setting the free space change notification threshold */
+#define PRIV_APFS_SET_FIRMLINK       14105   /* Allow setting the SF_FIRM_LINK bsd flag */
 
 #ifdef KERNEL
 /*
index ef8015554f832a82113dd43a18f0592ef829d227..87a39398b29a8d76f340e78da9e0ef135337d77c 100644 (file)
@@ -272,6 +272,12 @@ extern int proc_rele(proc_t p);
 extern int proc_pid(proc_t);
 /* returns the pid of the parent of a given process */
 extern int proc_ppid(proc_t);
+/* returns the original pid of the parent of a given process */
+extern int proc_original_ppid(proc_t);
+/* returns the platform (macos, ios, watchos, tvos, ...) of the given process */
+extern uint32_t proc_platform(proc_t);
+/* returns the sdk version used by the current process */
+extern uint32_t proc_sdk(proc_t);
 /* returns 1 if the process is marked for no remote hangs */
 extern int proc_noremotehang(proc_t);
 /* returns 1 if the process is marked for force quota */
@@ -313,6 +319,14 @@ pid_t proc_selfpgrpid(void);
  */
 pid_t proc_pgrpid(proc_t p);
 
+/*!
+ *  @function proc_sessionid
+ *  @abstract Get the process session id for the passed-in process.
+ *  @param p Process whose session id to grab.
+ *  @return session id for "p", or -1 on failure
+ */
+pid_t proc_sessionid(proc_t p);
+
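
A hedged kernel sketch of the new accessors; current_proc() is the usual way to name the caller:

static void
log_caller_identity(void)
{
	proc_t p = current_proc();
	pid_t sid = proc_sessionid(p);         /* -1 on failure */
	uint32_t platform = proc_platform(p);  /* macos, ios, watchos, ... */
	uint32_t sdk = proc_sdk(p);            /* SDK the binary was built against */
	(void)sid; (void)platform; (void)sdk;
}
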
 #ifdef KERNEL_PRIVATE
 // mark a process as being allowed to call vfs_markdependency()
 void bsd_set_dependency_capable(task_t task);
@@ -357,18 +371,29 @@ extern int proc_pidbackgrounded(pid_t pid, uint32_t* state);
  */
 extern uint64_t proc_uniqueid(proc_t);
 
+/* unique 64bit id for process's original parent */
+extern uint64_t proc_puniqueid(proc_t);
+
 extern void proc_set_responsible_pid(proc_t target_proc, pid_t responsible_pid);
 
 /* return 1 if process is forcing case-sensitive HFS+ access, 0 for default */
 extern int proc_is_forcing_hfs_case_sensitivity(proc_t);
 
+/*!
+ *  @function    proc_exitstatus
+ *  @abstract    KPI to determine a process's exit status.
+ *  @discussion  This function is not safe to call if the process could be
+ *               concurrently stopped or started, but it can be called from a
+ *               mpo_proc_notify_exit callback.
+ *  @param p     The process to be queried.
+ *  @return      Value in the same format as wait()'s output parameter.
+ */
+extern int proc_exitstatus(proc_t p);
+
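
Decoding the wait(2)-format value, as a hedged sketch (for instance from the mpo_proc_notify_exit callback the comment mentions):

#include <sys/wait.h>

int status = proc_exitstatus(p);
if (WIFEXITED(status)) {
	int code = WEXITSTATUS(status);  /* normal termination: exit code */
	(void)code;
} else if (WIFSIGNALED(status)) {
	int sig = WTERMSIG(status);      /* fatal signal number */
	(void)sig;
}
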
 #endif /* KERNEL_PRIVATE */
 
 #ifdef XNU_KERNEL_PRIVATE
 
-/* unique 64bit id for process's original parent */
-extern uint64_t proc_puniqueid(proc_t);
-
 extern void proc_getexecutableuuid(proc_t, unsigned char *, unsigned long);
 extern int proc_get_originatorbgstate(uint32_t *is_backgrounded);
 
@@ -387,9 +412,16 @@ extern uint64_t get_current_unique_pid(void);
 #endif /* XNU_KERNEL_PRIVATE*/
 
 #ifdef KERNEL_PRIVATE
+/* If buf argument is NULL, the necessary length to allocate will be set in buflen */
+extern int proc_selfexecutableargs(uint8_t *buf, size_t *buflen);
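
The NULL-buffer sizing contract above, sketched; the kalloc allocation path is an assumption:

size_t buflen = 0;
(void)proc_selfexecutableargs(NULL, &buflen);  /* learn the required size */
uint8_t *buf = kalloc(buflen);                 /* allocator choice assumed */
int err = proc_selfexecutableargs(buf, &buflen);
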
+extern off_t proc_getexecutableoffset(proc_t p);
 extern vnode_t proc_getexecutablevnode(proc_t); /* Returned with iocount, use vnode_put() to drop */
 extern int networking_memstatus_callout(proc_t p, uint32_t);
-#endif
+
+#define SYSCALL_MASK_UNIX 0
+extern size_t proc_get_syscall_filter_mask_size(int which);
+extern int proc_set_syscall_filter_mask(proc_t p, int which, unsigned char *maskptr, size_t masklen);
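
A hedged sketch of installing a UNIX-class filter; the bit sense (1 = allowed) and the allocation are assumptions:

size_t masklen = proc_get_syscall_filter_mask_size(SYSCALL_MASK_UNIX);
unsigned char *mask = kalloc(masklen);
memset(mask, 0xFF, masklen);   /* hypothetical allow-everything policy */
int err = proc_set_syscall_filter_mask(p, SYSCALL_MASK_UNIX, mask, masklen);
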
+#endif /* KERNEL_PRIVATE */
 
 __END_DECLS
 
@@ -422,6 +454,10 @@ int pid_shutdown_networking(int pid, int level);
 __END_DECLS
 
 #endif /* !KERNEL */
+
+/* Entitlement to allow non-root processes to suspend/resume any task */
+#define PROCESS_RESUME_SUSPEND_ENTITLEMENT "com.apple.private.process.suspend-resume.any"
+
 #endif /* PRIVATE */
 
 #endif  /* !_SYS_PROC_H_ */
index 15dc50f709312d84a94923eb922b3fca6ad03ba4..086ad784230bc9f7e9a181576ca3f6c0ff6f3dc7 100644 (file)
@@ -139,6 +139,11 @@ struct proc_originatorinfo {
        uint64_t                p_reserve4;
 };
 
+struct proc_ipctableinfo {
+       uint32_t               table_size;
+       uint32_t               table_free;
+};
+
 #endif
 
 
@@ -377,6 +382,12 @@ struct proc_regionwithpathinfo {
        struct vnode_info_path  prp_vip;
 };
 
+struct proc_regionpath {
+       uint64_t prpo_addr;
+       uint64_t prpo_regionlength;
+       char prpo_path[MAXPATHLEN];
+};
+
 struct proc_vnodepathinfo {
        struct vnode_info_path  pvi_cdir;
        struct vnode_info_path  pvi_rdir;
@@ -800,6 +811,16 @@ struct proc_fileportinfo {
 
 #define PROC_PIDVMRTFAULTINFO           29
 #define PROC_PIDVMRTFAULTINFO_SIZE (7 * sizeof(uint64_t))
+
+#define PROC_PIDPLATFORMINFO 30
+#define PROC_PIDPLATFORMINFO_SIZE (sizeof(uint32_t))
+
+#define PROC_PIDREGIONPATH              31
+#define PROC_PIDREGIONPATH_SIZE         (sizeof(struct proc_regionpath))
+
+#define PROC_PIDIPCTABLEINFO 32
+#define PROC_PIDIPCTABLEINFO_SIZE (sizeof(struct proc_ipctableinfo))
+
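
A hedged userspace sketch of the PROC_PIDREGIONPATH flavor above; passing the region address through proc_pidinfo's arg parameter is an assumption:

#include <libproc.h>

struct proc_regionpath rp;
int ret = proc_pidinfo(pid, PROC_PIDREGIONPATH, (uint64_t)addr,
    &rp, sizeof(rp));
if (ret == (int)sizeof(rp)) {
	/* rp.prpo_path backs [rp.prpo_addr, rp.prpo_addr + rp.prpo_regionlength) */
}
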
 #endif /* PRIVATE */
 /* Flavors for proc_pidfdinfo */
 
index adaef95ff468836f536ac49ec73c92c9cf149d77..763515e8f2a65883cb25d6e883e6463ac4dcb7c4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -79,7 +79,6 @@
 #include <sys/proc.h>
 #include <mach/resource_monitors.h>     // command/proc_name_t
 
-
 __BEGIN_DECLS
 #include <kern/locks.h>
 #if PSYNCH
@@ -121,7 +120,7 @@ struct  session {
        int                     s_listflags;
 };
 
-#define SESSION_NULL (struct session *)0
+#define SESSION_NULL (struct session *)NULL
 
 /*
  * accessor for s_ttyp which treats it as invalid if s_ttyvp is not valid;
@@ -167,10 +166,10 @@ struct  pgrp {
 #define PGRP_FLAG_ITERABEGIN    8
 #define PGRP_FLAG_ITERWAIT      0x10
 
-#define PGRP_NULL (struct pgrp *)0
+#define PGRP_NULL (struct pgrp *)NULL
 struct proc;
 
-#define PROC_NULL (struct proc *)0
+#define PROC_NULL (struct proc *)NULL
 
 #define PROC_UPDATE_CREDS_ONPROC(p) { \
        p->p_uid =  kauth_cred_getuid(p->p_ucred); \
@@ -197,6 +196,7 @@ struct  proc {
        void *          task;                   /* corresponding task (static)*/
        struct  proc *  p_pptr;                 /* Pointer to parent process.(LL) */
        pid_t           p_ppid;                 /* process's parent pid number */
+       pid_t           p_original_ppid;        /* process's original parent pid number, doesn't change if reparented */
        pid_t           p_pgrpid;               /* process group id of the process (LL)*/
        uid_t           p_uid;
        gid_t           p_gid;
@@ -331,6 +331,10 @@ struct  proc {
        cpu_type_t      p_cputype;
        cpu_subtype_t   p_cpusubtype;
 
+       uint8_t  *syscall_filter_mask;          /* syscall filter bitmask (length: nsysent bits) */
+       uint32_t        p_platform;
+       uint32_t        p_sdk;
+
 /* End area that is copied on creation. */
 /* XXXXXXXXXXXXX End of BCOPY'ed on fork (AIOLOCK)XXXXXXXXXXXXXXXX */
 #define p_endcopy       p_aio_total_count
@@ -374,6 +378,7 @@ struct  proc {
 #endif /* DIAGNOSTIC */
        uint64_t        p_dispatchqueue_offset;
        uint64_t        p_dispatchqueue_serialno_offset;
+       uint64_t        p_dispatchqueue_label_offset;
        uint64_t        p_return_to_kernel_offset;
        uint64_t        p_mach_thread_self_offset;
 #if VM_PRESSURE_EVENTS
@@ -383,9 +388,10 @@ struct  proc {
 #if CONFIG_MEMORYSTATUS
        /* Fields protected by proc list lock */
        TAILQ_ENTRY(proc) p_memstat_list;               /* priority bucket link */
-       uint32_t          p_memstat_state;              /* state */
+       uint32_t          p_memstat_state;              /* state. Also used as a wakeup channel when the memstat's LOCKED bit changes */
        int32_t           p_memstat_effectivepriority;  /* priority after transaction state accounted for */
        int32_t           p_memstat_requestedpriority;  /* active priority */
+       int32_t           p_memstat_assertionpriority;  /* assertion driven priority */
        uint32_t          p_memstat_dirty;              /* dirty state */
        uint64_t          p_memstat_userdata;           /* user state */
        uint64_t          p_memstat_idledeadline;       /* time at which process became clean */
@@ -394,6 +400,7 @@ struct  proc {
        int32_t           p_memstat_memlimit;           /* cached memory limit, toggles between active and inactive limits */
        int32_t           p_memstat_memlimit_active;    /* memory limit enforced when process is in active jetsam state */
        int32_t           p_memstat_memlimit_inactive;  /* memory limit enforced when process is in inactive jetsam state */
+       int32_t           p_memstat_relaunch_flags;     /* flags indicating relaunch behavior for the process */
 #if CONFIG_FREEZE
        uint32_t          p_memstat_freeze_sharedanon_pages; /* shared pages left behind after freeze */
        uint32_t          p_memstat_frozen_count;
@@ -405,6 +412,8 @@ struct  proc {
        pid_t             p_responsible_pid;    /* pid responsible for this process */
        _Atomic uint32_t  p_user_faults; /* count the number of user faults generated */
 
+       uint32_t          p_memlimit_increase; /* byte increase for memory limit for dyld SPI rdar://problem/49950264, structure packing 32-bit and 64-bit */
+
        struct os_reason     *p_exit_reason;
 
 #if !CONFIG_EMBEDDED
@@ -465,6 +474,7 @@ struct  proc {
 #define P_LVMRSRCOWNER  0x01000000      /* can handle the resource ownership of  */
 #define P_LTERM_DECRYPTFAIL     0x04000000      /* process terminating due to key failure to decrypt */
 #define P_LTERM_JETSAM          0x08000000      /* process is being jetsam'd */
+
 #define P_JETSAM_VMPAGESHORTAGE 0x00000000      /* jetsam: lowest jetsam priority proc, killed due to vm page shortage */
 #define P_JETSAM_VMTHRASHING    0x10000000      /* jetsam: lowest jetsam priority proc, killed due to vm thrashing */
 #define P_JETSAM_HIWAT          0x20000000      /* jetsam: high water mark */
@@ -473,6 +483,7 @@ struct  proc {
 #define P_JETSAM_VNODE          0x50000000      /* jetsam: vnode kill */
 #define P_JETSAM_FCTHRASHING    0x60000000      /* jetsam: lowest jetsam priority proc, killed due to filecache thrashing */
 #define P_JETSAM_MASK           0x70000000      /* jetsam type mask */
+#define P_LNSPACE_RESOLVER      0x80000000      /* process is the namespace resolver */
 
 /* Process control state for resource starvation */
 #define P_PCTHROTTLE    1
@@ -498,7 +509,9 @@ struct  proc {
 /* p_vfs_iopolicy flags */
 #define P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY       0x0001
 #define P_VFS_IOPOLICY_ATIME_UPDATES                    0x0002
-#define P_VFS_IOPOLICY_VALID_MASK                       (P_VFS_IOPOLICY_ATIME_UPDATES | P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY)
+#define P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES       0x0004
+#define P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME            0x0008
+#define P_VFS_IOPOLICY_VALID_MASK                       (P_VFS_IOPOLICY_ATIME_UPDATES | P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY | P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES | P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)
 
 /* process creation arguments */
 #define PROC_CREATE_FORK        0       /* independent child (running) */
@@ -701,6 +714,11 @@ __private_extern__ void proc_drop_zombref(struct proc * p);     /* Find zombie b
 
 extern int      chgproccnt(uid_t uid, int diff);
 extern void     pinsertchild(struct proc *parent, struct proc *child);
+extern int      setsid_internal(struct proc *p);
+#ifndef __cplusplus
+extern void     setlogin_internal(proc_t p, const char login[static MAXLOGNAME]);
+#endif // __cplusplus
+extern int      setgroups_internal(proc_t p, u_int gidsetsize, gid_t *gidset, uid_t gmuid);
 extern int      enterpgrp(struct proc *p, pid_t pgid, int mksess);
 extern void     fixjobc(struct proc *p, struct pgrp *pgrp, int entering);
 extern int      inferior(struct proc *p);
@@ -819,7 +837,7 @@ typedef int (*proc_iterate_fn_t)(proc_t, void *);
  */
 #define PGRP_DROPREF (1)
 
-extern int pgrp_iterate(struct pgrp *pgrp, unsigned int flags, proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg);
+extern void pgrp_iterate(struct pgrp *pgrp, unsigned int flags, proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg);
 
 /*
  * proc_iterate walks the `allproc` and/or `zombproc` lists, calling `filterfn`
@@ -834,7 +852,7 @@ extern int pgrp_iterate(struct pgrp *pgrp, unsigned int flags, proc_iterate_fn_t
 #define PROC_ZOMBPROCLIST (1U << 1) /* walk the zombie list */
 #define PROC_NOWAITTRANS  (1U << 2) /* do not wait for transitions (checkdirs only) */
 
-extern int proc_iterate(unsigned int flags, proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg);
+extern void proc_iterate(unsigned int flags, proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg);
 
 /*
  * proc_childrenwalk walks the children of process `p`, calling `callout` for
@@ -843,7 +861,7 @@ extern int proc_iterate(unsigned int flags, proc_iterate_fn_t callout, void *arg
  * `PCHILDREN_FOREACH` might also be used under the `proc_list_lock` to achieve
  * a similar effect.
  */
-extern int proc_childrenwalk(proc_t p, proc_iterate_fn_t callout, void *arg);
+extern void proc_childrenwalk(proc_t p, proc_iterate_fn_t callout, void *arg);
 
 /*
  * proc_rebootscan should only be used by kern_shutdown.c
index 4c2e3ce6b26ab544ec0d8b8d64a1eec9b0926230..8ba38fb1985f82a2fcfbfa19a428446f2d434366 100644 (file)
@@ -65,7 +65,7 @@ __BEGIN_DECLS
 #define PROC_POLICY_HARDWARE_ACCESS     2       /* access to various hardware */
 #define PROC_POLICY_RESOURCE_STARVATION 3       /* behavior on resource starvation */
 #define PROC_POLICY_RESOURCE_USAGE      4       /* behavior on resource consumption */
-#if CONFIG_EMBEDDED || TARGET_OS_EMBEDDED
+#if CONFIG_EMBEDDED || (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 #define PROC_POLICY_APP_LIFECYCLE       5       /* app life cycle management */
 #else /* CONFIG_EMBEDDED */
 #define PROC_POLICY_RESERVED            5       /* behavior on resource consumption */
@@ -79,7 +79,7 @@ __BEGIN_DECLS
 #define PROC_POLICY_BG_DISKTHROTTLE     2       /* disk accesses throttled */
 #define PROC_POLICY_BG_NETTHROTTLE      4       /* network accesses throttled */
 #define PROC_POLICY_BG_GPUDENY          8       /* no access to GPU */
-#if CONFIG_EMBEDDED || TARGET_OS_EMBEDDED
+#if CONFIG_EMBEDDED || (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 #define PROC_POLICY_BG_ALL            0x0F
 #else /* CONFIG_EMBEDDED */
 #define PROC_POLICY_BG_ALL            0x07
@@ -169,7 +169,7 @@ typedef struct proc_policy_cpuusage_attr {
        uint64_t        ppattr_cpu_attr_deadline;     /* 64bit deadline in nsecs */
 } proc_policy_cpuusage_attr_t;
 
-#if CONFIG_EMBEDDED || TARGET_OS_EMBEDDED
+#if CONFIG_EMBEDDED || (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 /* sub policies for app lifecycle management */
 #define PROC_POLICY_APPLIFE_NONE        0       /* does nothing.. */
 #define PROC_POLICY_APPLIFE_STATE       1       /* sets the app to various lifecycle states */
@@ -180,7 +180,7 @@ typedef struct proc_policy_cpuusage_attr {
 /* sub policies for PROC_POLICY_APPTYPE */
 #define PROC_POLICY_APPTYPE_NONE        0       /* does nothing.. */
 #define PROC_POLICY_APPTYPE_MODIFY      1       /* sets the app to various lifecycle states */
-#if CONFIG_EMBEDDED || TARGET_OS_EMBEDDED
+#if CONFIG_EMBEDDED || (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 #define PROC_POLICY_APPTYPE_THREADTHR   2       /* notes the device in inactive or short/long term */
 #endif /* CONFIG_EMBEDDED */
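
A short illustration of what the revised guard selects, assuming the usual TargetConditionals.h semantics: TARGET_OS_IPHONE is also 1 for simulator builds, so the added !TARGET_OS_SIMULATOR term limits the block to real device builds (plus kernels built with CONFIG_EMBEDDED):

    #include <TargetConditionals.h>

    #if CONFIG_EMBEDDED || (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
    /* embedded kernel configs and iOS-family device builds land here */
    #else
    /* macOS and simulator builds land here, even though a simulator
     * build may define TARGET_OS_IPHONE to 1 */
    #endif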
 
index 0beec9bc543a36ed8ebd252b8e445019f74db967..a9f80ff068da329cb68ea59e8fb02e58d6c3c0f5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -127,6 +127,7 @@ struct uio;
 struct ifnet;
 #ifdef XNU_KERNEL_PRIVATE
 struct domain_old;
+struct proc;
 #endif /* XNU_KERNEL_PRIVATE */
 
 #pragma pack(4)
@@ -269,6 +270,12 @@ struct protosw {
         */
        TAILQ_HEAD(, socket_filter) pr_filter_head;
        struct protosw_old *pr_old;
+
+       void    (*pr_update_last_owner) /* update last socket owner */
+       (struct socket *so, struct proc *p, struct proc *ep);
+
+       void    (*pr_copy_last_owner) /* copy last socket from listener */
+       (struct socket *so, struct socket *head);
 };
 
 /*
@@ -562,20 +569,25 @@ extern struct protosw *pffindproto_locked(int, int, int);
 extern struct protosw *pffindprotonotype(int, int);
 extern struct protosw *pffindtype(int, int);
 extern struct protosw_old *pffindproto_old(int, int, int);
-extern int net_add_proto(struct protosw *, struct domain *, int);
+extern int net_add_proto(struct protosw *, struct domain *, int)
+__XNU_INTERNAL(net_add_proto);
 extern void net_init_proto(struct protosw *, struct domain *);
-extern int net_del_proto(int, int, struct domain *);
+extern int net_del_proto(int, int, struct domain *)
+__XNU_INTERNAL(net_del_proto);
 extern int net_add_proto_old(struct protosw_old *, struct domain_old *);
 extern int net_del_proto_old(int, int, struct domain_old *);
 extern void net_update_uptime(void);
 extern void net_update_uptime_with_time(const struct timeval *);
 extern u_int64_t net_uptime(void);
+extern u_int64_t net_uptime_ms(void);
 extern void net_uptime2timeval(struct timeval *);
+extern struct protosw *pffindproto(int family, int protocol, int type)
+__XNU_INTERNAL(pffindproto);
 #else
 extern int net_add_proto(struct protosw *, struct domain *);
 extern int net_del_proto(int, int, struct domain *);
-#endif /* XNU_KERNEL_PRIVATE */
 extern struct protosw *pffindproto(int family, int protocol, int type);
+#endif /* XNU_KERNEL_PRIVATE */
 __END_DECLS
 #endif /* KERNEL_PRIVATE */
 #endif  /* !_SYS_PROTOSW_H_ */
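
A hedged sketch of how a protocol might fill in the new last-owner hooks; the myproto names are hypothetical, and the last_pid/last_upid/e_pid bookkeeping is the socket-owner state declared in sys/socketvar.h:

    /* Hypothetical pr_update_last_owner implementation for a protocol. */
    static void
    myproto_update_last_owner(struct socket *so, struct proc *p, struct proc *ep)
    {
            if (p != NULL) {
                    so->last_pid = proc_pid(p);
                    so->last_upid = proc_uniqueid(p);
            }
            if (ep != NULL) {
                    so->e_pid = proc_pid(ep);   /* delegated/effective owner */
            }
    }

    static struct protosw myproto_sw = {
            /* ... the usual pr_* fields ... */
            .pr_update_last_owner = myproto_update_last_owner,
    };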
index 7a2d607dd1a22a3822019fa91a60a947782bb40d..e956225b2e6350c11acea025595d2eaae722c9d8 100644 (file)
@@ -234,7 +234,7 @@ typedef const struct pthread_callbacks_s {
        void *__unused_was_zfree;
        void *__unused_was_zinit;
 
-       /* bsd/kerb/kern_sig.c */
+       /* bsd/kern/kern_sig.c */
        void (*__pthread_testcancel)(int);
 
        /* calls without portfolio */
@@ -251,7 +251,13 @@ typedef const struct pthread_callbacks_s {
        /* mach/thread_act.h */
        kern_return_t (*thread_resume)(thread_act_t target_act);
 
-       void *__unused_was_ml_get_max_cpus;
+       /* bsd/sys/event.h */
+       int (*kevent_workq_internal)(struct proc *p,
+           user_addr_t changelist, int nchanges,
+           user_addr_t eventlist, int nevents,
+           user_addr_t data_out, user_size_t *data_available,
+           unsigned int flags, int32_t *retval);
+
 #if defined(__arm__)
        void *__unused_was_map_is_1gb;
 #endif
index 8791385d7d4b31959fa21089c50c81f909bf5b23..23dc242c63061ab6219c66b8581a74628d737089 100644 (file)
@@ -207,14 +207,44 @@ struct qm_trace {
 #define __MISMATCH_TAGS_POP
 #endif
 
+/*!
+ * Ensures that these macros can safely be used in structs when compiling with
+ * clang. The macros do not allow for nullability attributes to be specified due
+ * to how they are expanded. For example:
+ *
+ *     SLIST_HEAD(, foo _Nullable) bar;
+ *
+ * expands to
+ *
+ *     struct {
+ *         struct foo _Nullable *slh_first;
+ *     }
+ *
+ * which is not valid because the nullability specifier has to apply to the
+ * pointer. So just ignore nullability completeness in all the places where this
+ * is an issue.
+ */
+#if defined(__clang__)
+#define __NULLABILITY_COMPLETENESS_PUSH \
+       _Pragma("clang diagnostic push") \
+       _Pragma("clang diagnostic ignored \"-Wnullability-completeness\"")
+#define __NULLABILITY_COMPLETENESS_POP \
+       _Pragma("clang diagnostic pop")
+#else
+#define __NULLABILITY_COMPLETENESS_PUSH
+#define __NULLABILITY_COMPLETENESS_POP
+#endif
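
Concretely, the push/pop pair lets list heads and entries appear in headers compiled with -Wnullability-completeness without warning about the unannotated pointers inside the expansion. A minimal sketch:

    /* Compiles cleanly even with -Wnullability-completeness enabled. */
    struct foo {
            int              value;
            SLIST_ENTRY(foo) link;      /* sle_next is unannotated, silenced */
    };
    SLIST_HEAD(foo_list, foo);

    struct bar {
            struct foo_list head;                /* embedded head */
            struct foo * _Nullable cursor;       /* annotations still usable */
    };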
+
 /*
  * Singly-linked List declarations.
  */
 #define SLIST_HEAD(name, type)                                          \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 struct name {                                                           \
        struct type *slh_first; /* first element */                     \
 }                                                                       \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 #define SLIST_HEAD_INITIALIZER(head)                                    \
@@ -222,9 +252,11 @@ __MISMATCH_TAGS_POP
 
 #define SLIST_ENTRY(type)                                               \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 struct {                                                                \
        struct type *sle_next;  /* next element */                      \
 }                                                                       \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 /*
@@ -267,6 +299,7 @@ __MISMATCH_TAGS_POP
 
 #define SLIST_REMOVE(head, elm, type, field)                            \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 do {                                                                    \
        if (SLIST_FIRST((head)) == (elm)) {                             \
                SLIST_REMOVE_HEAD((head), field);                       \
@@ -279,6 +312,7 @@ do {                                                                    \
        }                                                               \
        TRASHIT((elm)->field.sle_next);                                 \
 } while (0)                                                             \
+__NULLABILITY_COMPLETENESS_POP                                      \
 __MISMATCH_TAGS_POP
 
 #define SLIST_REMOVE_AFTER(elm, field) do {                             \
@@ -295,10 +329,12 @@ __MISMATCH_TAGS_POP
  */
 #define STAILQ_HEAD(name, type)                                         \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 struct name {                                                           \
        struct type *stqh_first;/* first element */                     \
        struct type **stqh_last;/* addr of last next element */         \
 }                                                                       \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 #define STAILQ_HEAD_INITIALIZER(head)                                   \
@@ -306,9 +342,11 @@ __MISMATCH_TAGS_POP
 
 #define STAILQ_ENTRY(type)                                              \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 struct {                                                                \
        struct type *stqe_next; /* next element */                      \
 }                                                                       \
+__NULLABILITY_COMPLETENESS_POP                                         \
 __MISMATCH_TAGS_POP
 
 /*
@@ -362,16 +400,19 @@ __MISMATCH_TAGS_POP
 
 #define STAILQ_LAST(head, type, field)                                  \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
        (STAILQ_EMPTY((head)) ?                                         \
                NULL :                                                  \
                ((struct type *)(void *)                                \
                ((char *)((head)->stqh_last) - __offsetof(struct type, field))))\
+__NULLABILITY_COMPLETENESS_POP                                         \
 __MISMATCH_TAGS_POP
 
 #define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
 
 #define STAILQ_REMOVE(head, elm, type, field)                           \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 do {                                                                    \
        if (STAILQ_FIRST((head)) == (elm)) {                            \
                STAILQ_REMOVE_HEAD((head), field);                      \
@@ -384,6 +425,7 @@ do {                                                                    \
        }                                                               \
        TRASHIT((elm)->field.stqe_next);                                \
 } while (0)                                                             \
+__NULLABILITY_COMPLETENESS_POP                                      \
 __MISMATCH_TAGS_POP
 
 #define STAILQ_REMOVE_HEAD(head, field) do {                            \
@@ -405,6 +447,7 @@ __MISMATCH_TAGS_POP
 
 #define STAILQ_SWAP(head1, head2, type)                                 \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 do {                                                                    \
        struct type *swap_first = STAILQ_FIRST(head1);                  \
        struct type **swap_last = (head1)->stqh_last;                   \
@@ -417,6 +460,7 @@ do {                                                                    \
        if (STAILQ_EMPTY(head2))                                        \
                (head2)->stqh_last = &STAILQ_FIRST(head2);              \
 } while (0)                                                             \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 
@@ -425,9 +469,11 @@ __MISMATCH_TAGS_POP
  */
 #define LIST_HEAD(name, type)                                           \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 struct name {                                                           \
        struct type *lh_first;  /* first element */                     \
 }                                                                       \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 #define LIST_HEAD_INITIALIZER(head)                                     \
@@ -435,10 +481,12 @@ __MISMATCH_TAGS_POP
 
 #define LIST_ENTRY(type)                                                \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 struct {                                                                \
        struct type *le_next;   /* next element */                      \
        struct type **le_prev;  /* address of previous next element */  \
 }                                                                       \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 /*
@@ -530,6 +578,7 @@ __MISMATCH_TAGS_POP
 
 #define LIST_SWAP(head1, head2, type, field)                            \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 do {                                                                    \
        struct type *swap_tmp = LIST_FIRST((head1));                    \
        LIST_FIRST((head1)) = LIST_FIRST((head2));                      \
@@ -539,6 +588,7 @@ do {                                                                    \
        if ((swap_tmp = LIST_FIRST((head2))) != NULL)                   \
                swap_tmp->field.le_prev = &LIST_FIRST((head2));         \
 } while (0)                                                             \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 /*
@@ -546,11 +596,13 @@ __MISMATCH_TAGS_POP
  */
 #define TAILQ_HEAD(name, type)                                          \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 struct name {                                                           \
        struct type *tqh_first; /* first element */                     \
        struct type **tqh_last; /* addr of last next element */         \
        TRACEBUF                                                        \
 }                                                                       \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 #define TAILQ_HEAD_INITIALIZER(head)                                    \
@@ -558,11 +610,13 @@ __MISMATCH_TAGS_POP
 
 #define TAILQ_ENTRY(type)                                               \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 struct {                                                                \
        struct type *tqe_next;  /* next element */                      \
        struct type **tqe_prev; /* address of previous next element */  \
        TRACEBUF                                                        \
 }                                                                       \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 /*
@@ -630,6 +684,17 @@ __MISMATCH_TAGS_POP
            (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1);  \
            (var) = (tvar))
 
+#if XNU_KERNEL_PRIVATE
+/*
+ * Can be used when the initialized HEAD was just bzeroed
+ * Works around deficiencies in clang analysis of initialization patterns.
+ * See: <rdar://problem/47939050>
+ */
+#define TAILQ_INIT_AFTER_BZERO(head) do {                               \
+       (head)->tqh_last = &TAILQ_FIRST((head));                        \
+} while (0)
+#endif /* XNU_KERNEL_PRIVATE */
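
A minimal sketch of the intended pattern, assuming a freshly zeroed structure (the widget names are hypothetical):

    struct widget {
            TAILQ_ENTRY(widget) link;
    };

    struct widget_registry {
            TAILQ_HEAD(, widget) widgets;
            /* ... other fields zeroed alongside the head ... */
    };

    static void
    widget_registry_init(struct widget_registry *reg)
    {
            bzero(reg, sizeof(*reg));
            /*
             * tqh_first is already NULL from the bzero; only tqh_last needs
             * fixing, and the analyzer recognizes this form as initialization.
             */
            TAILQ_INIT_AFTER_BZERO(&reg->widgets);
    }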
+
 #define TAILQ_INIT(head) do {                                           \
        TAILQ_FIRST((head)) = NULL;                                     \
        (head)->tqh_last = &TAILQ_FIRST((head));                        \
@@ -686,14 +751,18 @@ __MISMATCH_TAGS_POP
 
 #define TAILQ_LAST(head, headname)                                      \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
        (*(((struct headname *)((head)->tqh_last))->tqh_last))          \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 #define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
 
 #define TAILQ_PREV(elm, headname, field)                                \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
        (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))     \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 #define TAILQ_REMOVE(head, elm, field) do {                             \
@@ -717,6 +786,7 @@ __MISMATCH_TAGS_POP
  */
 #define TAILQ_SWAP(head1, head2, type, field)                           \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 do {                                                                    \
        struct type *swap_first = (head1)->tqh_first;                   \
        struct type **swap_last = (head1)->tqh_last;                    \
@@ -733,6 +803,7 @@ do {                                                                    \
        else                                                            \
                (head2)->tqh_last = &(head2)->tqh_first;                \
 } while (0)                                                             \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 /*
@@ -740,18 +811,22 @@ __MISMATCH_TAGS_POP
  */
 #define CIRCLEQ_HEAD(name, type)                                        \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 struct name {                                                           \
        struct type *cqh_first;         /* first element */             \
        struct type *cqh_last;          /* last element */              \
 }                                                                       \
+__NULLABILITY_COMPLETENESS_POP                                          \
 __MISMATCH_TAGS_POP
 
 #define CIRCLEQ_ENTRY(type)                                             \
 __MISMATCH_TAGS_PUSH                                                    \
+__NULLABILITY_COMPLETENESS_PUSH                                         \
 struct {                                                                \
        struct type *cqe_next;          /* next element */              \
        struct type *cqe_prev;          /* previous element */          \
 }                                                                       \
+__NULLABILITY_COMPLETENESS_POP                                         \
 __MISMATCH_TAGS_POP
 
 /*
index bc9da9345d7a42c7285a1dd1b0c0b02da49cb3f3..08fcfd7acd51d71545db407cfa6193183f9d228b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -196,7 +196,7 @@ struct user_dqblk {
 #define INITQMAGICS { \
        0xff31ff35,     /* USRQUOTA */ \
        0xff31ff27,     /* GRPQUOTA */ \
-};
+}
 
 #define QF_VERSION          1
 #define QF_STRING_TAG       "QUOTA HASH FILE"
index c695340910d43b79ce9dd433a4fb995d139005ad..355b59ae97c7c3a32ca91f42c87e19e598b5dea1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -36,13 +36,14 @@ __BEGIN_DECLS
 #ifdef KERNEL_PRIVATE
 
 #include <kern/kern_cdata.h>
+#include <os/refcnt.h>
 
 #ifdef XNU_KERNEL_PRIVATE
 #include <kern/locks.h>
 
 typedef struct os_reason {
-       decl_lck_mtx_data(, osr_lock)
-       unsigned int                    osr_refcount;
+       decl_lck_mtx_data(, osr_lock);
+       os_refcnt_t                     osr_refcount;
        uint32_t                        osr_namespace;
        uint64_t                        osr_code;
        uint64_t                        osr_flags;
@@ -76,7 +77,8 @@ int os_reason_alloc_buffer_noblock(os_reason_t cur_reason, uint32_t osr_bufsize)
 struct kcdata_descriptor * os_reason_get_kcdata_descriptor(os_reason_t cur_reason);
 void os_reason_ref(os_reason_t cur_reason);
 void os_reason_free(os_reason_t cur_reason);
-
+void os_reason_set_flags(os_reason_t cur_reason, uint64_t flags);
+void os_reason_set_description_data(os_reason_t cur_reason, uint32_t type, void *reason_data, uint32_t reason_data_len);
 #endif /* KERNEL_PRIVATE */
 
 /*
@@ -97,7 +99,8 @@ void os_reason_free(os_reason_t cur_reason);
 #define OS_REASON_REPORTCRASH   12
 #define OS_REASON_COREANIMATION 13
 #define OS_REASON_AGGREGATED    14
-#define OS_REASON_ASSERTIOND    15
+#define OS_REASON_RUNNINGBOARD  15
+#define OS_REASON_ASSERTIOND    OS_REASON_RUNNINGBOARD  /* old name */
 #define OS_REASON_SKYWALK       16
 #define OS_REASON_SETTINGS      17
 #define OS_REASON_LIBSYSTEM     18
@@ -107,11 +110,14 @@ void os_reason_free(os_reason_t cur_reason);
 #define OS_REASON_WATCHKIT      22
 #define OS_REASON_GUARD         23
 #define OS_REASON_ANALYTICS     24
+#define OS_REASON_SANDBOX       25
+#define OS_REASON_SECURITY      26
+#define OS_REASON_ENDPOINTSECURITY      27
 
 /*
  * Update whenever new OS_REASON namespaces are added.
  */
-#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_ANALYTICS
+#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_ENDPOINTSECURITY
 
 #define OS_REASON_BUFFER_MAX_SIZE 5120
 
@@ -153,7 +159,8 @@ void os_reason_free(os_reason_t cur_reason);
  *
  * Outputs:             Does not return.
  */
-void abort_with_reason(uint32_t reason_namespace, uint64_t reason_code, const char *reason_string, uint64_t reason_flags) __attribute__((noreturn));
+void abort_with_reason(uint32_t reason_namespace, uint64_t reason_code, const char *reason_string, uint64_t reason_flags)
+__attribute__((noreturn, cold));
 
 /*
  * abort_with_payload: Used to exit the current process and pass along
@@ -171,7 +178,7 @@ void abort_with_reason(uint32_t reason_namespace, uint64_t reason_code, const ch
  * Outputs:             Does not return.
  */
 void abort_with_payload(uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, const char *reason_string,
-    uint64_t reason_flags) __attribute__((noreturn));
+    uint64_t reason_flags) __attribute__((noreturn, cold));
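
For reference, a hedged userspace sketch of calling the (noreturn, cold) entry point; the namespace, code, and message below are purely illustrative:

    #include <sys/reason.h>
    #include <stdint.h>

    static void
    fatal_invariant_violation(uint64_t code)
    {
            /* Illustrative namespace/code; a flags value of 0 requests
             * default crash-report handling. */
            abort_with_reason(OS_REASON_LIBSYSTEM, code,
                "invariant violated", /* reason_flags */ 0);
            /* not reached: noreturn lets the compiler assume this */
    }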
 
 /*
  * terminate_with_reason: Used to terminate a specific process and pass along
index 38088aa9b7aa0ed2233970f5483c52a1a1f549e0..e0750b8db7fabbe70644b8062e5f81694310d1ec 100644 (file)
@@ -142,7 +142,6 @@ __END_DECLS
 #endif /* __APPLE_API_OBSOLETE */
 
 #ifdef BSD_KERNEL_PRIVATE
-#include <machine/reboot.h>
 
 __BEGIN_DECLS
 int     reboot_kernel(int, char *);
index 357768313cbb0bba7aa4ac9f21d2f9cbfb0c6ef2..0cc5a39837408923fcf2a10e3a4362981fd61d6f 100644 (file)
@@ -339,8 +339,7 @@ struct rusage_info_v4 {
        uint64_t ri_billed_energy;
        uint64_t ri_serviced_energy;
        uint64_t ri_interval_max_phys_footprint;
-       // 1 reserve counter(s) remaining for future extension
-       uint64_t ri_unused[1];
+       uint64_t ri_runnable_time;
 };
 
 typedef struct rusage_info_v4 rusage_info_current;
@@ -498,6 +497,8 @@ struct proc_rlimit_control_wakeupmon {
 #define IOPOL_TYPE_VFS_HFS_CASE_SENSITIVITY 1
 #endif
 #define IOPOL_TYPE_VFS_ATIME_UPDATES 2
+#define IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES 3
+#define IOPOL_TYPE_VFS_STATFS_NO_DATA_VOLUME 4
 
 /* scope */
 #define IOPOL_SCOPE_PROCESS   0
@@ -524,6 +525,13 @@ struct proc_rlimit_control_wakeupmon {
 #define IOPOL_ATIME_UPDATES_DEFAULT     0
 #define IOPOL_ATIME_UPDATES_OFF         1
 
+#define IOPOL_MATERIALIZE_DATALESS_FILES_DEFAULT 0
+#define IOPOL_MATERIALIZE_DATALESS_FILES_OFF     1
+#define IOPOL_MATERIALIZE_DATALESS_FILES_ON      2
+
+#define IOPOL_VFS_STATFS_NO_DATA_VOLUME_DEFAULT 0
+#define IOPOL_VFS_STATFS_FORCE_NO_DATA_VOLUME   1
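
A hedged sketch of selecting the new dataless-files policy from userspace through the existing setiopolicy_np(3) wrapper, assuming it routes these policy types through iopolicysys() like the others:

    #include <sys/resource.h>
    #include <stdio.h>

    /* Opt the whole process out of materializing dataless files on access. */
    static int
    disable_dataless_materialization(void)
    {
            if (setiopolicy_np(IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES,
                IOPOL_SCOPE_PROCESS,
                IOPOL_MATERIALIZE_DATALESS_FILES_OFF) == -1) {
                    perror("setiopolicy_np");
                    return -1;
            }
            return 0;
    }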
+
 #ifdef PRIVATE
 /*
  * Structures for use in communicating via iopolicysys() between Libc and the
index 9637ead44b31cce80fce1ed227be1bd17eeb88d0..6d9244314052977d70c6d49dacc4ccb768d079b3 100644 (file)
@@ -119,16 +119,6 @@ struct plimit {
 };
 
 #ifdef KERNEL
-/* add user profiling from AST */
-#define ADDUPROF(p)                                                     \
-    addupc_task(p,                                                      \
-               (proc_is64bit((p)) ? (p)->p_stats->user_p_prof.pr_addr \
-                                  : CAST_USER_ADDR_T((p)->p_stats->p_prof.pr_addr)), \
-               (proc_is64bit((p)) ? (p)->p_stats->user_p_prof.pr_ticks \
-                                  : (p)->p_stats->p_prof.pr_ticks))
-
-void     addupc_intr(struct proc *p, uint32_t pc, u_int ticks);
-void     addupc_task(struct proc *p, user_addr_t pc, u_int ticks);
 void     calcru(struct proc *p, struct timeval *up, struct timeval *sp,
     struct timeval *ip);
 void     ruadd(struct rusage *ru, struct rusage *ru2);
index f0d840c3876186372f2403900a189d831a99c8e9..837e38c7411d2ee8e5b8a4e89626a88a9171746a 100644 (file)
 #ifndef _SDT_IMPL_H
 #define _SDT_IMPL_H
 
-/*
- * This file has been created by splitting up the original DTrace sdt.h
- * header. Keep the pragma notice here to allow version tracking.
- */
-
-/* #pragma ident       "@(#)sdt.h      1.7     05/06/08 SMI" */
-
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -51,8 +44,6 @@ typedef struct sdt_probedesc {
 }
 #endif
 
-/* #pragma ident       "@(#)sdt_impl.h 1.3     05/06/08 SMI" */
-
 #ifdef  __cplusplus
 extern "C" {
 #endif
index 1ffa4c0b062f5d0d82b5b190da945e86be6bb3a2..2fb516833d8e02496ac847799a2cdf8a9bc8e9dc 100644 (file)
@@ -127,6 +127,7 @@ struct selinfo {
 #define SI_RECORDED     0x0004          /* select has been recorded */
 #define SI_INITED       0x0008          /* selinfo has been inited */
 #define SI_CLEAR        0x0010          /* selinfo has been cleared */
+#define SI_KNPOSTING    0x0020          /* posting to knotes */
 
 #else
 struct selinfo;
index e96bfab71cfd6e2addcbc2d99096ce1d1dcd96ca..a209acb0febe478c61be45a23c8ea70c443965fd 100644 (file)
@@ -224,7 +224,7 @@ struct os_reason;
  * Machine-dependent functions:
  */
 void    sendsig(struct proc *, /*sig_t*/ user_addr_t  action, int sig,
-    int returnmask, uint32_t code);
+    int returnmask, uint32_t code, sigset_t siginfo);
 
 void    psignal(struct proc *p, int sig);
 void    psignal_with_reason(struct proc *p, int sig, struct os_reason *signal_reason);
@@ -250,6 +250,13 @@ int sig_try_locked(struct proc *p);
 
 #endif  /* BSD_KERNEL_PRIVATE */
 
+#if defined(KERNEL_PRIVATE)
+/* Forward-declare these for consumers of the SDK that don't know about BSD types */
+struct proc;
+typedef struct proc * proc_t;
+struct os_reason;
+void    psignal_sigkill_with_reason(proc_t p, struct os_reason *signal_reason);
+#endif /* defined(KERNEL_PRIVATE) */
 
 #ifdef XNU_KERNEL_PRIVATE
 
index 9f68d473f07358030265dfac4adc7b39d8e44a29..e851212be4ad45b8db78411ff1c32098329c46ee 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define  SO_RESTRICT_DENY_OUT   0x2     /* deny outbound (trapdoor) */
 #define  SO_RESTRICT_DENY_CELLULAR 0x4  /* deny use of cellular (trapdoor) */
 #define  SO_RESTRICT_DENY_EXPENSIVE 0x8 /* deny use of expensive if (trapdoor) */
+#define  SO_RESTRICT_DENY_CONSTRAINED 0x10 /* deny use of constrained if (trapdoor) */
 #endif /* PRIVATE */
 #define SO_RANDOMPORT   0x1082  /* APPLE: request local port randomization */
 #define SO_NP_EXTENSIONS        0x1083  /* To turn off some POSIX behavior */
 #define SO_EXTENDED_BK_IDLE     0x1114  /* extended time to keep socket idle after app is suspended (int) */
 #define SO_MARK_CELLFALLBACK    0x1115  /* Mark as initiated by cell fallback */
 #endif /* PRIVATE */
+#define SO_NET_SERVICE_TYPE     0x1116  /* Network service type */
 
+#ifdef PRIVATE
+#define SO_QOSMARKING_POLICY_OVERRIDE   0x1117  /* int */
+#define SO_INTCOPROC_ALLOW              0x1118  /* Try to use internal co-processor interfaces. */
+#endif /* PRIVATE */
+
+#define SO_NETSVC_MARKING_LEVEL 0x1119  /* Get QoS marking in effect for socket */
+
+#ifdef PRIVATE
+#define SO_NECP_LISTENUUID      0x1120  /* NECP client UUID for listener */
+#define SO_MPKL_SEND_INFO       0x1122  /* (struct so_mpkl_send_info) */
+#define SO_STATISTICS_EVENT     0x1123  /* int64 argument, an event in statistics collection */
+#endif /* PRIVATE */
 /*
  * Network Service Type for option SO_NET_SERVICE_TYPE
  *
  *     inelastic flow, constant packet rate, somewhat fixed size.
  *     E.g. VoIP.
  */
-#define SO_NET_SERVICE_TYPE     0x1116  /* Network service type */
 
 #define NET_SERVICE_TYPE_BE     0 /* Best effort */
 #define NET_SERVICE_TYPE_BK     1 /* Background system initiated */
 #define NET_SERVICE_TYPE_RD     8 /* Responsive Data */
 
 #if PRIVATE
-#define SO_QOSMARKING_POLICY_OVERRIDE   0x1117  /* int */
-#define SO_INTCOPROC_ALLOW              0x1118  /* Try to use internal co-processor interfaces. */
-
 #define _NET_SERVICE_TYPE_COUNT 9
 #define _NET_SERVICE_TYPE_UNSPEC        ((int)-1)
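
With SO_NET_SERVICE_TYPE consolidated above, a minimal userspace sketch of tagging a socket's traffic (standard setsockopt usage; NET_SERVICE_TYPE_VO is one of the service types enumerated here):

    #include <sys/socket.h>

    /* Mark a socket as carrying interactive voice traffic. */
    static int
    mark_voice_socket(int fd)
    {
            int st = NET_SERVICE_TYPE_VO;
            return setsockopt(fd, SOL_SOCKET, SO_NET_SERVICE_TYPE,
                       &st, sizeof(st));
    }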
 
@@ -450,14 +460,14 @@ extern const int sotc_by_netservicetype[_NET_SERVICE_TYPE_COUNT];
 #define SO_TC_NETSVC_SIG        (SO_TC_NET_SERVICE_OFFSET + NET_SERVICE_TYPE_SIG)
 #endif /* PRIVATE */
 
-#define SO_NETSVC_MARKING_LEVEL 0x1119  /* Get QoS marking in effect for socket */
-
+/* These are supported values for SO_NETSVC_MARKING_LEVEL */
 #define NETSVC_MRKNG_UNKNOWN            0       /* The outgoing network interface is not known */
 #define NETSVC_MRKNG_LVL_L2             1       /* Default marking at layer 2 (for example Wi-Fi WMM) */
 #define NETSVC_MRKNG_LVL_L3L2_ALL       2       /* Layer 3 DSCP marking and layer 2 marking for all Network Service Types */
 #define NETSVC_MRKNG_LVL_L3L2_BK        3       /* The system policy limits layer 3 DSCP marking and layer 2 marking
                                                 * to background Network Service Types */
 
+
 typedef __uint32_t sae_associd_t;
 #define SAE_ASSOCID_ANY 0
 #define SAE_ASSOCID_ALL ((sae_associd_t)(-1ULL))
@@ -686,6 +696,7 @@ struct sockaddr_storage {
 #define PF_BOND         ((uint32_t)0x626f6e64)  /* 'bond' */
 #ifdef KERNEL_PRIVATE
 #define PF_BRIDGE       ((uint32_t)0x62726467)  /* 'brdg' */
+#define PF_802154       ((uint32_t)0x38313534)  /* '8154' */
 #endif /* KERNEL_PRIVATE */
 
 /*
@@ -769,6 +780,15 @@ struct sockaddr_storage {
 #define NET_RT_MAXID            11
 #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */
 
+#ifdef PRIVATE
+/* These are supported values for SO_STATISTICS_EVENT */
+#define SO_STATISTICS_EVENT_ENTER_CELLFALLBACK (1 << 0)
+#define SO_STATISTICS_EVENT_EXIT_CELLFALLBACK  (1 << 1)
+#define SO_STATISTICS_EVENT_RESERVED_1         (1 << 2)
+#define SO_STATISTICS_EVENT_RESERVED_2         (1 << 3)
+#endif /* PRIVATE */
+
+
 #ifdef KERNEL_PRIVATE
 #define CTL_NET_RT_NAMES { \
        { 0, 0 }, \
@@ -982,9 +1002,9 @@ struct user32_sa_endpoints {
 #else
 #define MSG_WAITSTREAM  0x200           /* wait up to full request.. may return partial */
 #endif
-#define MSG_FLUSH       0x400           /* Start of 'hold' seq; dump so_temp */
-#define MSG_HOLD        0x800           /* Hold frag in so_temp */
-#define MSG_SEND        0x1000          /* Send the packet in so_temp */
+#define MSG_FLUSH       0x400           /* Start of 'hold' seq; dump so_temp, deprecated */
+#define MSG_HOLD        0x800           /* Hold frag in so_temp, deprecated */
+#define MSG_SEND        0x1000          /* Send the packet in so_temp, deprecated */
 #define MSG_HAVEMORE    0x2000          /* Data ready to be read */
 #define MSG_RCVMORE     0x4000          /* Data remains in current pkt */
 #endif
@@ -1090,7 +1110,9 @@ struct cmsgcred {
 #ifdef PRIVATE
 #define SCM_SEQNUM                      0x05    /* TCP unordered recv seq no */
 #define SCM_MSG_PRIORITY                0x06    /* TCP unordered snd priority */
-#define SCM_TIMESTAMP_CONTINUOUS                0x07    /* timestamp (uint64_t) */
+#define SCM_TIMESTAMP_CONTINUOUS        0x07    /* timestamp (uint64_t) */
+#define SCM_MPKL_SEND_INFO              0x08    /* send info for multi-layer packet logging (struct so_mpkl_send_info) */
+#define SCM_MPKL_RECV_INFO              0x09    /* receive info for multi-layer packet logging (struct so_mpkl_recv_info) */
 #endif /* PRIVATE */
 
 #ifdef KERNEL_PRIVATE
@@ -1290,10 +1312,7 @@ struct so_cordreq {
  */
 struct netpolicy_event_data {
        __uint64_t      eupid;          /* effective unique PID */
-       pid_t           epid;           /* effective PID */
-#if !defined(__LP64__)
-       __uint32_t      pad;
-#endif /* __LP64__ */
+       __uint64_t      epid;           /* effective PID */
        uuid_t          euuid;          /* effective UUID */
 };
 
@@ -1305,18 +1324,6 @@ struct kev_netpolicy_ifdenied {
        __uint32_t ev_if_functional_type;
 };
 
-/*
- * Common structure for KEV_SOCKET_SUBCLASS
- */
-struct kev_socket_event_data {
-       struct sockaddr_storage kev_sockname;
-       struct sockaddr_storage kev_peername;
-};
-
-struct kev_socket_closed {
-       struct kev_socket_event_data ev_data;
-};
-
 /*
  * Network Service Type to DiffServ Code Point mapping
  */
@@ -1325,6 +1332,19 @@ struct netsvctype_dscp_map {
        u_int8_t        dscp; /* 6 bits diffserv code point */
 };
 
+/*
+ * Multi-layer packet logging requires SO_MPK_LOG to be set
+ */
+struct so_mpkl_send_info {
+       uuid_t          mpkl_uuid;
+       __uint8_t       mpkl_proto;     /* see net/multi_layer_pkt_log.h */
+};
+
+struct so_mpkl_recv_info {
+       __uint32_t      mpkl_seq;
+       __uint8_t       mpkl_proto;     /* see net/multi_layer_pkt_log.h */
+};
+
 #ifndef KERNEL
 __BEGIN_DECLS
 
index 250f8724f660d3ad406ddd17523ce7c4cd1c586b..f7e1e82ff41178ef7ba92cfa0e3dff4f95012906 100644 (file)
@@ -243,9 +243,6 @@ struct socket {
        pid_t           last_pid;       /* pid of most recent accessor */
        u_int64_t       last_upid;      /* upid of most recent accessor */
 
-       struct mbuf     *so_temp;       /* Holding area for outbound frags */
-       /* Plug-in support - make the socket interface overridable */
-       struct mbuf     *so_tail;
        struct socket_filter_entry *so_filt;    /* NKE hook */
        u_int32_t       so_flags;               /* Flags */
 #define SOF_NOSIGPIPE           0x00000001
@@ -284,7 +281,7 @@ struct socket {
 #define SOF_CONTENT_FILTER      0x20000000 /* Content filter enabled */
 
        uint32_t        so_upcallusecount; /* number of upcalls in progress */
-       int             so_usecount; /* refcounting of socket use */;
+       int             so_usecount;    /* refcounting of socket use */
        int             so_retaincnt;
        u_int32_t       so_filteruse;   /* usecount for the socket filters */
        u_int16_t       so_traffic_class;
@@ -355,8 +352,14 @@ struct socket {
 #define SOF1_IN_KERNEL_SOCKET           0x00100000 /* Socket created in kernel via KPI */
 #define SOF1_CONNECT_COUNTED            0x00200000 /* connect() call was counted */
 #define SOF1_DNS_COUNTED                0x00400000 /* socket counted to send DNS queries */
+#define SOF1_MPKL_SEND_INFO             0x00800000 /* SO_MPKL_SEND_INFO option is set */
+#define SOF1_INBOUND                    0x01000000 /* Created via a passive listener */
 
        u_int64_t       so_extended_bk_start;
+
+       u_int8_t        so_log_seqn;    /* Multi-layer Packet Logging rolling sequence number */
+       uuid_t          so_mpkl_send_uuid;
+       uint8_t         so_mpkl_send_proto;
 };
 
 /* Control message accessor in mbufs */
@@ -617,12 +620,14 @@ struct kextcb {
 #define SO_FILT_HINT_MUSTRST            0x00020000      /* must send RST and close */
 #define SO_FILT_HINT_MPCANTRCVMORE      0x00040000      /* MPTCP DFIN Received */
 #define SO_FILT_HINT_NOTIFY_ACK         0x00080000      /* Notify Acknowledgement */
+#define SO_FILT_HINT_MP_SUB_ERROR       0x00100000      /* Error happened on subflow */
 
 #define SO_FILT_HINT_BITS \
        "\020\1LOCKED\2CONNRESET\3CANTRCVMORE\4CANTSENDMORE\5TIMEOUT"   \
        "\6NOSRCADDR\7IFDENIED\10SUSPEND\11RESUME\12KEEPALIVE\13AWTIMO" \
        "\14ARTIMO\15CONNECTED\16DISCONNECTED\17CONNINFO_UPDATED"       \
-       "\20MPFAILOVER\21MPSTATUS\22MUSTRST\23MPCANTRCVMORE\24NOTIFYACK"
+       "\20MPFAILOVER\21MPSTATUS\22MUSTRST\23MPCANTRCVMORE\24NOTIFYACK"\
+       "\25MPSUBERROR"
 
 /* Mask for hints that have corresponding kqueue events */
 #define SO_FILT_HINT_EV                                                 \
@@ -703,6 +708,8 @@ struct so_procinfo {
        uuid_t          spi_uuid;
        uuid_t          spi_euuid;
        int             spi_delegated;
+       char            spi_proc_name[MAXCOMLEN + 1];
+       char            spi_e_proc_name[MAXCOMLEN + 1];
 };
 
 extern u_int32_t sb_max;
@@ -727,11 +734,6 @@ extern u_int32_t net_io_policy_uuid;
 
 extern struct soextbkidlestat soextbkidlestat;
 
-struct net_qos_dscp_map {
-       u_int8_t        sotc_to_dscp[SO_TC_MAX];
-       u_int8_t        netsvctype_to_dscp[_NET_SERVICE_TYPE_COUNT];
-};
-
 #endif /* BSD_KERNEL_PRIVATE */
 
 struct mbuf;
@@ -777,9 +779,12 @@ extern struct socket *sonewconn(struct socket *head, int connstatus,
     const struct sockaddr *from);
 extern int sopoll(struct socket *so, int events, struct ucred *cred, void *wql);
 extern int sooptcopyin(struct sockopt *sopt, void *data, size_t len,
-    size_t minlen);
-extern int sooptcopyout(struct sockopt *sopt, void *data, size_t len);
-extern int soopt_cred_check(struct socket *so, int priv, boolean_t allow_root);
+    size_t minlen)
+__attribute__ ((warn_unused_result));
+extern int sooptcopyout(struct sockopt *sopt, void *data, size_t len)
+__attribute__ ((warn_unused_result));
+extern int soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
+    boolean_t ignore_delegate);
 extern int soreceive(struct socket *so, struct sockaddr **paddr,
     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp);
 extern int soreserve(struct socket *so, u_int32_t sndcc, u_int32_t rcvcc);
@@ -838,7 +843,6 @@ extern struct mbuf **sbcreatecontrol_mbuf(caddr_t p, int size, int type,
     int level, struct mbuf **m);
 extern void sbdrop(struct sockbuf *sb, int len);
 extern void sbdroprecord(struct sockbuf *sb);
-extern int sbinsertoob(struct sockbuf *sb, struct mbuf *m0);
 extern void sbrelease(struct sockbuf *sb);
 extern int sbreserve(struct sockbuf *sb, u_int32_t cc);
 extern void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb);
@@ -878,7 +882,7 @@ extern int sodisconnectx(struct socket *so, sae_associd_t, sae_connid_t);
 extern int sodisconnectxlocked(struct socket *so, sae_associd_t, sae_connid_t);
 extern void soevupcall(struct socket *, u_int32_t);
 /* flags for socreate_internal */
-#define SOCF_ASYNC      0x1     /* non-blocking socket */
+#define SOCF_MPTCP      0x1     /* MPTCP-subflow */
 extern int socreate_internal(int dom, struct socket **aso, int type, int proto,
     struct proc *, uint32_t, struct proc *);
 extern int socreate(int dom, struct socket **aso, int type, int proto);
@@ -906,6 +910,7 @@ extern int soissrcbesteffort(struct socket *so);
 extern void soclearfastopen(struct socket *so);
 extern int solisten(struct socket *so, int backlog);
 extern struct socket *sodropablereq(struct socket *head);
+extern lck_mtx_t *socket_getlock(struct socket *so, int flags);
 extern void socket_lock(struct socket *so, int refcount);
 extern void socket_lock_assert_owned(struct socket *so);
 extern int socket_try_lock(struct socket *so);
@@ -915,7 +920,7 @@ extern const char *solockhistory_nr(struct socket *);
 extern void soevent(struct socket *so, long hint);
 extern void sorflush(struct socket *so);
 extern void sowflush(struct socket *so);
-extern void sowakeup(struct socket *so, struct sockbuf *sb);
+extern void sowakeup(struct socket *so, struct sockbuf *sb, struct socket *so2);
 extern int soioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p);
 extern int sogetoptlock(struct socket *so, struct sockopt *sopt, int);
 extern int sosetoptlock(struct socket *so, struct sockopt *sopt, int);
@@ -936,8 +941,7 @@ extern int sosendcheck(struct socket *, struct sockaddr *, user_ssize_t,
 extern int soo_ioctl(struct fileproc *, u_long, caddr_t, vfs_context_t);
 extern int soo_stat(struct socket *, void *, int);
 extern int soo_select(struct fileproc *, int, void *, vfs_context_t);
-extern int soo_kqfilter(struct fileproc *, struct knote *,
-    struct kevent_internal_s *kev, vfs_context_t);
+extern int soo_kqfilter(struct fileproc *, struct knote *, struct kevent_qos_s *);
 
 /* Service class flags used for setting service class on a packet */
 #define PKT_SCF_IPV6            0x00000001      /* IPv6 packet */
@@ -971,8 +975,8 @@ extern int so_set_opportunistic(struct socket *, int);
 extern int so_get_opportunistic(struct socket *);
 extern int so_set_recv_anyif(struct socket *, int);
 extern int so_get_recv_anyif(struct socket *);
-extern int so_set_effective_pid(struct socket *, int, struct proc *);
-extern int so_set_effective_uuid(struct socket *, uuid_t, struct proc *);
+extern int so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred);
+extern int so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred);
 extern int so_set_restrictions(struct socket *, uint32_t);
 extern uint32_t so_get_restrictions(struct socket *);
 extern void socket_tclass_init(void);
@@ -996,14 +1000,9 @@ extern void mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len);
 extern void mptcp_preproc_sbdrop(struct socket *, struct mbuf *, unsigned int);
 extern void mptcp_postproc_sbdrop(struct mbuf *, u_int64_t, u_int32_t,
     u_int32_t);
-extern int mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off,
-    uint64_t dsn, uint32_t rseq, uint16_t dlen);
 
 extern void netpolicy_post_msg(uint32_t, struct netpolicy_event_data *,
     uint32_t);
-extern void socket_post_kev_msg(uint32_t, struct kev_socket_event_data *,
-    uint32_t);
-extern void socket_post_kev_msg_closed(struct socket *);
 /*
  * Socket operation routines.
  * These routines are called by the routines in
index a973c4896b3f184fc01f2cfa0bdc2ae27db2b459..e0a96050c97a3ca0b878d66f966324b471d3eecb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 #define SIOCSIFDISABLEOUTPUT    _IOWR('i', 187, struct ifreq)
 
+#define SIOCSIFSUBFAMILY        _IOWR('i', 188, struct ifreq)
+
 #define SIOCGIFAGENTLIST        _IOWR('i', 190, struct netagentlist_req) /* Get netagent dump */
 
 #ifdef BSD_KERNEL_PRIVATE
 #endif /* BSD_KERNEL_PRIVATE */
 #endif /* PRIVATE */
 
+#define SIOCSIF6LOWPAN  _IOW('i', 196, struct ifreq)    /* set 6LOWPAN config */
+#define SIOCGIF6LOWPAN  _IOWR('i', 197, struct ifreq)   /* get 6LOWPAN config */
+
 #ifdef PRIVATE
+#define SIOCGIFTCPKAOMAX        _IOWR('i', 198, struct ifreq)   /* Max TCP keep alive offload slots */
 #define SIOCGIFLOWPOWER _IOWR('i', 199, struct ifreq)   /* Low Power Mode */
 #define SIOCSIFLOWPOWER _IOWR('i', 200, struct ifreq)   /* Low Power Mode */
 
 #if INET6
 #define SIOCGIFCLAT46ADDR       _IOWR('i', 201, struct if_clat46req)
 #endif /* INET6 */
+
+#define SIOCGIFMPKLOG _IOWR('i', 202, struct ifreq)     /* Multi-layer Packet Logging */
+#define SIOCSIFMPKLOG _IOWR('i', 203, struct ifreq)     /* Multi-layer Packet Logging */
+
+#define SIOCGIFCONSTRAINED _IOWR('i', 204, struct ifreq) /* get interface constrained flag */
+#define SIOCSIFCONSTRAINED _IOWR('i', 205, struct ifreq) /* mark interface constrained */
+
+#define SIOCGIFXFLAGS           _IOWR('i', 206, struct ifreq)   /* get extended ifnet flags */
+
+#define SIOCGIFNOACKPRIO _IOWR('i', 207, struct ifreq) /* get interface no ack prioritization flag */
+#define SIOCSIFNOACKPRIO _IOWR('i', 208, struct ifreq) /* mark interface no ack prioritization flag */
 #endif /* PRIVATE */
 
 #endif /* !_SYS_SOCKIO_H_ */
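
These remain PRIVATE ioctls, but mechanically they follow the usual struct ifreq pattern; a hedged sketch, assuming the ifr_intval accessor from net/if.h carries the integer result:

    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>
    #include <string.h>

    /* Query whether an interface is marked constrained (PRIVATE ioctl). */
    static int
    interface_is_constrained(int s, const char *name, int *result)
    {
            struct ifreq ifr;

            memset(&ifr, 0, sizeof(ifr));
            strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
            if (ioctl(s, SIOCGIFCONSTRAINED, &ifr) == -1) {
                    return -1;
            }
            *result = ifr.ifr_intval;
            return 0;
    }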
index 790d9c47a0c40de85cb44ebeb246a393da165754..4bafc11c2fe7eae0d1300a45197db27c2646ebc1 100644 (file)
@@ -61,7 +61,9 @@
 #ifdef  PRIVATE
 #define _POSIX_SPAWN_DISABLE_ASLR       0x0100
 #define _POSIX_SPAWN_NANO_ALLOCATOR     0x0200
-/* unused                               0x0400 */
+#endif  /* PRIVATE */
+#define POSIX_SPAWN_SETSID              0x0400
+#ifdef  PRIVATE
 /* unused                               0x0800 */
 /* unused                               0x1000 */
 #define _POSIX_SPAWN_ALLOW_DATA_EXEC    0x2000
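
POSIX_SPAWN_SETSID moves outside the PRIVATE block here, making it public API; a minimal sketch of spawning a child in its own session:

    #include <spawn.h>

    extern char **environ;

    /* Spawn /usr/bin/true detached into a new session. */
    static int
    spawn_in_new_session(pid_t *pid)
    {
            posix_spawnattr_t attr;
            char *argv[] = { "true", NULL };
            int err;

            posix_spawnattr_init(&attr);
            posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSID);
            err = posix_spawn(pid, "/usr/bin/true", NULL, &attr, argv, environ);
            posix_spawnattr_destroy(&attr);
            return err;
    }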
index 64877ea3dffed8117a7b8d52dd1f4f216514cad8..d963cfdb9726f4fe44d720fbd5670255261c372d 100644 (file)
@@ -76,6 +76,7 @@ typedef enum {
        PSPA_EXCEPTION = 1,
        PSPA_AU_SESSION = 2,
        PSPA_IMP_WATCHPORTS = 3,
+       PSPA_REGISTERED_PORTS = 4,
 } pspa_t;
 
 /*
@@ -150,6 +151,24 @@ struct _posix_spawn_coalition_info {
        } psci_info[COALITION_NUM_TYPES];
 };
 
+/*
+ * UID/GID attributes
+ */
+struct _posix_spawn_posix_cred_info {
+       uint32_t pspci_flags;    /* spawn persona flags */
+       uid_t    pspci_uid;      /* alternate posix/unix UID  */
+       gid_t    pspci_gid;      /* alternate posix/unix GID */
+       uint32_t pspci_ngroups;  /* alternate advisory groups */
+       gid_t    pspci_groups[NGROUPS];
+       uid_t    pspci_gmuid;    /* group membership UID */
+       char     pspci_login[MAXLOGNAME + 1];
+};
+
+#define POSIX_SPAWN_POSIX_CRED_UID          0x00010000
+#define POSIX_SPAWN_POSIX_CRED_GID          0x00020000
+#define POSIX_SPAWN_POSIX_CRED_GROUPS       0x00040000
+#define POSIX_SPAWN_POSIX_CRED_LOGIN        0x00080000
+
 /*
  * Persona attributes
  */
@@ -163,18 +182,18 @@ struct _posix_spawn_persona_info {
        uid_t    pspi_gmuid;    /* group membership UID */
 };
 
-#define POSIX_SPAWN_PERSONA_FLAGS_NONE     0x0
-#define POSIX_SPAWN_PERSONA_FLAGS_OVERRIDE 0x1
-#define POSIX_SPAWN_PERSONA_FLAGS_VERIFY   0x2
+#define POSIX_SPAWN_PERSONA_FLAGS_NONE      0x0
+#define POSIX_SPAWN_PERSONA_FLAGS_OVERRIDE  0x1
+#define POSIX_SPAWN_PERSONA_FLAGS_VERIFY    0x2
 
 #define POSIX_SPAWN_PERSONA_ALL_FLAGS \
        (POSIX_SPAWN_PERSONA_FLAGS_OVERRIDE \
         | POSIX_SPAWN_PERSONA_FLAGS_VERIFY \
        )
 
-#define POSIX_SPAWN_PERSONA_UID           0x00010000
-#define POSIX_SPAWN_PERSONA_GID           0x00020000
-#define POSIX_SPAWN_PERSONA_GROUPS        0x00040000
+#define POSIX_SPAWN_PERSONA_UID             POSIX_SPAWN_POSIX_CRED_UID
+#define POSIX_SPAWN_PERSONA_GID             POSIX_SPAWN_POSIX_CRED_GID
+#define POSIX_SPAWN_PERSONA_GROUPS          POSIX_SPAWN_POSIX_CRED_GROUPS
 
 
 /*
@@ -221,6 +240,7 @@ typedef struct _posix_spawnattr {
        _posix_spawn_mac_policy_extensions_t psa_mac_extensions; /* MAC policy-specific extensions. */
        struct _posix_spawn_coalition_info *psa_coalition_info;  /* coalition info */
        struct _posix_spawn_persona_info   *psa_persona_info;    /* spawn new process into given persona */
+       struct _posix_spawn_posix_cred_info *psa_posix_cred_info; /* posix creds: uid/gid/groups */
 } *_posix_spawnattr_t;
 
 /*
@@ -239,6 +259,20 @@ typedef struct _posix_spawnattr {
 #define POSIX_SPAWN_JETSAM_MEMLIMIT_ACTIVE_FATAL        0x04  /* if set, limit is fatal when the process is active   */
 #define POSIX_SPAWN_JETSAM_MEMLIMIT_INACTIVE_FATAL      0x08  /* if set, limit is fatal when the process is inactive */
 
+
+/*
+ * Flags set based on posix_spawnattr_set_jetsam_ttr_np().
+ * Indicate relaunch behavior of process when jetsammed
+ */
+/* Mask and bucket counts for relaunch behavior */
+#define POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_BUCKETS    (0x3)
+#define POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MASK       (0x30)
+
+/* Actual buckets based on behavior data */
+#define POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH       (0x30)
+#define POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED        (0x20)
+#define POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW        (0x10)
+
 /*
  * Deprecated posix_spawn psa_flags values
  *
@@ -271,7 +305,6 @@ typedef struct _posix_spawnattr {
  * posix_spawn psa_apptype process type settings.
  * when POSIX_SPAWN_PROC_TYPE is set, old psa_apptype bits are ignored
  */
-
 #define POSIX_SPAWN_PROCESS_TYPE_NORMAL             0x00000000
 #define POSIX_SPAWN_PROCESS_TYPE_DEFAULT            POSIX_SPAWN_PROCESS_TYPE_NORMAL
 
@@ -285,12 +318,15 @@ typedef struct _posix_spawnattr {
 #define POSIX_SPAWN_PROC_TYPE_DAEMON_BACKGROUND     0x00000500
 #define POSIX_SPAWN_PROC_TYPE_DAEMON_ADAPTIVE       0x00000600
 
+#define POSIX_SPAWN_PROC_TYPE_DRIVER                0x00000700
+
 #define POSIX_SPAWN_PROC_CLAMP_NONE                 0x00000000
 #define POSIX_SPAWN_PROC_CLAMP_UTILITY              0x00000001
 #define POSIX_SPAWN_PROC_CLAMP_BACKGROUND           0x00000002
 #define POSIX_SPAWN_PROC_CLAMP_MAINTENANCE          0x00000003
 #define POSIX_SPAWN_PROC_CLAMP_LAST                 0x00000004
 
+#define POSIX_SPAWN_ENTITLEMENT_DRIVER "com.apple.private.spawn-driver"
 /* Setting to indicate no change to darwin role */
 #define POSIX_SPAWN_DARWIN_ROLE_NONE                0x00000000
 /* Other possible values are specified by PRIO_DARWIN_ROLE in sys/resource.h */
@@ -302,7 +338,10 @@ typedef enum {
        PSFA_OPEN = 0,
        PSFA_CLOSE = 1,
        PSFA_DUP2 = 2,
-       PSFA_INHERIT = 3
+       PSFA_INHERIT = 3,
+       PSFA_FILEPORT_DUP2 = 4,
+       PSFA_CHDIR = 5,
+       PSFA_FCHDIR = 6
 } psfa_t;
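
The new PSFA_CHDIR/PSFA_FCHDIR actions appear to back the posix_spawn_file_actions_addchdir_np(3) family introduced in the same release; a hedged userspace sketch:

    #include <spawn.h>

    extern char **environ;

    /* Run /bin/ls with its working directory set atomically at spawn time. */
    static int
    spawn_ls_in(const char *dir, pid_t *pid)
    {
            posix_spawn_file_actions_t fa;
            char *argv[] = { "ls", NULL };
            int err;

            posix_spawn_file_actions_init(&fa);
            posix_spawn_file_actions_addchdir_np(&fa, dir); /* queues PSFA_CHDIR */
            err = posix_spawn(pid, "/bin/ls", &fa, NULL, argv, environ);
            posix_spawn_file_actions_destroy(&fa);
            return err;
    }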
 
 
@@ -317,17 +356,26 @@ typedef enum {
  *             a variable sized vector list to save space (i.e. a separate
  *             string area, allocation of least amount of path buffer per
  *             open action, etc.).
- *
- * XXX:                Currently overloading psfao_oflag for PSFA_DUP2
  */
 typedef struct _psfa_action {
-       psfa_t  psfaa_type;                     /* file action type */
-       int     psfaa_filedes;                  /* fd to operate on */
-       struct _psfaa_open {
-               int     psfao_oflag;            /* open flags to use */
-               mode_t  psfao_mode;             /* mode for open */
-               char    psfao_path[PATH_MAX];   /* path to open */
-       } psfaa_openargs;
+       psfa_t  psfaa_type;                         /* file action type */
+       union {
+               int psfaa_filedes;                  /* fd to operate on */
+               mach_port_name_t psfaa_fileport;    /* fileport to operate on */
+       };
+       union {
+               struct _psfaa_open {
+                       int     psfao_oflag;            /* open flags to use */
+                       mode_t  psfao_mode;             /* mode for open */
+                       char    psfao_path[PATH_MAX];   /* path to open */
+               } psfaa_openargs;
+               struct {
+                       int psfad_newfiledes;           /* new file descriptor to use */
+               } psfaa_dup2args;
+               struct {
+                       char    psfac_path[PATH_MAX];   /* path to chdir */
+               } psfaa_chdirargs;
+       };
 } _psfa_action_t;
 
 
@@ -393,6 +441,9 @@ struct _posix_spawn_args_desc {
 
        __darwin_size_t persona_info_size;
        struct _posix_spawn_persona_info   *persona_info;
+
+       __darwin_size_t posix_cred_info_size;
+       struct _posix_spawn_posix_cred_info *posix_cred_info;
 };
 
 #ifdef KERNEL
@@ -404,33 +455,37 @@ struct _posix_spawn_args_desc {
 #endif
 
 struct user32__posix_spawn_args_desc {
-       uint32_t                attr_size;      /* size of attributes block */
-       uint32_t                attrp;          /* pointer to block */
+       uint32_t        attr_size;              /* size of attributes block */
+       uint32_t        attrp;                  /* pointer to block */
        uint32_t        file_actions_size;      /* size of file actions block */
-       uint32_t                file_actions;   /* pointer to block */
+       uint32_t        file_actions;           /* pointer to block */
        uint32_t        port_actions_size;      /* size of port actions block */
-       uint32_t                port_actions;   /* pointer to block */
+       uint32_t        port_actions;           /* pointer to block */
        uint32_t        mac_extensions_size;
        uint32_t        mac_extensions;
        uint32_t        coal_info_size;
        uint32_t        coal_info;
        uint32_t        persona_info_size;
        uint32_t        persona_info;
+       uint32_t        posix_cred_info_size;
+       uint32_t        posix_cred_info;
 };
 
 struct user__posix_spawn_args_desc {
-       user_size_t             attr_size;      /* size of attributes block */
-       user_addr_t             attrp;          /* pointer to block */
+       user_size_t     attr_size;              /* size of attributes block */
+       user_addr_t     attrp;                  /* pointer to block */
        user_size_t     file_actions_size;      /* size of file actions block */
-       user_addr_t             file_actions;   /* pointer to block */
+       user_addr_t     file_actions;           /* pointer to block */
        user_size_t     port_actions_size;      /* size of port actions block */
-       user_addr_t             port_actions;   /* pointer to block */
+       user_addr_t     port_actions;           /* pointer to block */
        user_size_t     mac_extensions_size;    /* size of MAC-specific attrs. */
        user_addr_t     mac_extensions;         /* pointer to block */
        user_size_t     coal_info_size;
        user_addr_t     coal_info;
        user_size_t     persona_info_size;
        user_addr_t     persona_info;
+       user_size_t     posix_cred_info_size;
+       user_addr_t     posix_cred_info;
 };
 
 
index b5f73326a312961d6860196f83bfc58aedfd53f1..18c9ad95001b6a4364aaa941b91a312096f602e4 100644 (file)
@@ -493,8 +493,9 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp);
 /*
  * Super-user changeable flags.
  */
-#define SF_SUPPORTED    0x001f0000      /* mask of superuser supported flags */
-#define SF_SETTABLE     0xffff0000      /* mask of superuser changeable flags */
+#define SF_SUPPORTED    0x009f0000      /* mask of superuser supported flags */
+#define SF_SETTABLE     0x3fff0000      /* mask of superuser changeable flags */
+#define SF_SYNTHETIC    0xc0000000      /* mask of system read-only synthetic flags */
 #define SF_ARCHIVED     0x00010000      /* file is archived */
 #define SF_IMMUTABLE    0x00020000      /* file may not be changed */
 #define SF_APPEND       0x00040000      /* writes to file may only append */
@@ -508,6 +509,27 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp);
 /* #define SF_SNAPSHOT 0x00200000 */   /* snapshot inode */
 /* NOTE: There is no SF_HIDDEN bit. */
 
+#define SF_FIRMLINK     0x00800000      /* file is a firmlink */
+
+/*
+ * Synthetic flags.
+ *
+ * These are read-only.  We keep them out of SF_SUPPORTED so that
+ * attempts to set them will fail.
+ */
+#define SF_DATALESS     0x40000000     /* file is dataless object */
+
+#ifdef PRIVATE
+/*
+ * Protected flags.
+ *
+ * These flags are read-write, but can only be changed using the safe
+ * mechanism (FSIOC_CAS_BSDFLAGS).  The standard chflags(2) mechanism
+ * will simply preserve these bits as they are in the inode.
+ */
+#define UF_SF_PROTECTED (UF_COMPRESSED)
+#endif
+
 #ifdef KERNEL
 /*
  * Shorthand abbreviations of above.
index aea56f7003274115feec924037ffbd8d771e2fa3..f37e9a07d1f50efd633c929e85f9f950f69ad39e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -282,6 +282,8 @@ int sysctl_io_opaque(struct sysctl_req *req, void *pValue, size_t valueSize, int
 void sysctl_register_oid(struct sysctl_oid *oidp);
 void sysctl_unregister_oid(struct sysctl_oid *oidp);
 
+void sysctl_load_devicetree_entries(void);
+
 /* Deprecated */
 void sysctl_register_fixed(void) __deprecated;
 
@@ -327,7 +329,7 @@ __END_DECLS
 /* This constructs a "raw" MIB oid. */
 #define SYSCTL_STRUCT_INIT(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
        {                                                                                               \
-               &sysctl_##parent##_children, { 0 },                     \
+               &sysctl_##parent##_children, { NULL },                  \
                nbr, (int)(kind|CTLFLAG_OID2), a1, (int)(a2), #name, handler, fmt, descr, SYSCTL_OID_VERSION, 0 \
        }
 
@@ -340,7 +342,7 @@ __END_DECLS
        struct sysctl_oid_list sysctl_##parent##_##name##_children;         \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|access,                  \
                   (void*)&sysctl_##parent##_##name##_children, 0, handler, \
-                  "N", descr);
+                  "N", descr)
 
 /* Oid for a string.  len can be 0 to indicate '\0' termination. */
 #define SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) \
@@ -359,31 +361,31 @@ __END_DECLS
 #define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \
               ptr, val, sysctl_handle_int, "I", descr); \
-       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(int)) ? 0 : -1];
+       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(int)) ? 0 : -1]
 
 /* Oid for an unsigned int.  If ptr is NULL, val is returned. */
 #define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \
                ptr, val, sysctl_handle_int, "IU", descr); \
-       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned int)) ? 0 : -1];
+       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned int)) ? 0 : -1]
 
 /* Oid for a long.  The pointer must be non NULL. */
 #define SYSCTL_LONG(parent, nbr, name, access, ptr, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \
                ptr, 0, sysctl_handle_long, "L", descr); \
-       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long)) ? 0 : -1];
+       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long)) ? 0 : -1]
 
 /* Oid for a unsigned long.  The pointer must be non NULL. */
 #define SYSCTL_ULONG(parent, nbr, name, access, ptr, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \
                ptr, 0, sysctl_handle_long, "LU", descr); \
-       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned long)) ? 0 : -1];
+       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned long)) ? 0 : -1]
 
 /* Oid for a quad.  The pointer must be non NULL. */
 #define SYSCTL_QUAD(parent, nbr, name, access, ptr, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_QUAD|access, \
                ptr, 0, sysctl_handle_quad, "Q", descr); \
-       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long long)) ? 0 : -1];
+       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long long)) ? 0 : -1]
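These hunks also move the statement-terminating semicolon out of the size-check typedef so that each SYSCTL_* invocation supplies its own. The typedef itself is a compile-time assertion; a stand-alone sketch of the idiom (zero-length arrays are a GNU extension accepted by the compilers xnu builds with):

    /* When the condition holds, this declares a zero-length char array
     * type; when it fails, the array size is -1 and compilation stops
     * at the offending SYSCTL_* line. */
    static int ok_value;
    typedef char ok_check[(sizeof(ok_value) == sizeof(int)) ? 0 : -1];

    static long bad_value;
    /* typedef char bad_check[(sizeof(bad_value) == sizeof(int)) ? 0 : -1];
     *   ^ would fail to compile: array of negative size */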
 
 /* Oid for an opaque object.  Specified by a pointer and a length. */
 #define SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) \
@@ -522,7 +524,7 @@ SYSCTL_DECL(_hw_features);
 #define KERN_LOGSIGEXIT 36      /* int: do we log sigexit procs? */
 #define KERN_SYMFILE            37      /* string: kernel symbol filename */
 #define KERN_PROCARGS           38
-/* 39 was KERN_PCSAMPLES... now deprecated */
+/* 39 was KERN_PCSAMPLES... now obsolete */
 #define KERN_NETBOOT            40      /* int: are we netbooted? 1=yes,0=no */
 /* 41 was KERN_PANICINFO : panic UI information (deprecated) */
 #define KERN_SYSV               42      /* node: System V IPC information */
@@ -717,6 +719,12 @@ SYSCTL_DECL(_hw_features);
 #define KERN_PROC_RUID          6       /* by real uid */
 #define KERN_PROC_LCID          7       /* by login context id */
 
+/*
+ * KERN_VFSNSPACE subtypes
+ */
+#define KERN_VFSNSPACE_HANDLE_PROC              1
+#define KERN_VFSNSPACE_UNHANDLE_PROC    2
+
 #if defined(XNU_KERNEL_PRIVATE) || !defined(KERNEL)
 /*
  * KERN_PROC subtype ops return arrays of augmented proc structures:
index a309bb90bfb070c526e88aa6b15e0cde187241be..b68b3cdedc884217e5f5c6d7865153c2fcba1a2b 100644 (file)
@@ -58,7 +58,7 @@ struct sysent {         /* system call table */
 extern struct sysent sysent[];
 #endif  /* __INIT_SYSENT_C__ */
 
-extern unsigned int nsysent;
+extern const unsigned int nsysent;
 
 /*
  * Valid values for sy_cancel
index a06576c40be9c2aec12fb68fb5b262689ba8ddae..dfcd2e73197b4037f239690beff1ee12cb338cba 100644 (file)
@@ -228,6 +228,8 @@ void    throttle_info_mount_rel(mount_t mp);
 void    throttle_info_release(void *throttle_info);
 void    throttle_info_update(void *throttle_info, int flags);
 uint32_t throttle_lowpri_io(int sleep_amount);
+/* returns TRUE if throttle_lowpri_io() called with the same sleep_amount would have slept */
+int     throttle_lowpri_io_will_be_throttled(int sleep_amount);
 void    throttle_set_thread_io_policy(int policy);
 int             throttle_get_thread_effective_io_policy(void);
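A hedged sketch of how a kernel client might consult the new predicate before queueing optional work, assuming the semantics in the comment above (non-zero means an equivalent throttle_lowpri_io() call would have slept); the helper name is hypothetical:

    /* Skip deferrable background I/O while the low-priority window is
     * congested; otherwise issue it and pay the usual throttle toll. */
    static void
    maybe_issue_background_io(int sleep_amount)
    {
            if (throttle_lowpri_io_will_be_throttled(sleep_amount)) {
                    return; /* defer; we would have been throttled */
            }
            /* ... issue the low-priority I/O here ... */
            (void)throttle_lowpri_io(sleep_amount);
    }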
 
index 45708bf47295629f10b68ea06b13aa3a57777aa9..5c3609ccb89feb03cafa381ee7604513efca7230 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -146,7 +146,7 @@ struct tty {
        int     t_refcnt;               /* reference count */
 };
 
-#define TTY_NULL (struct tty *)0
+#define TTY_NULL (struct tty *)NULL
 
 #define t_cc            t_termios.c_cc
 #define t_cflag         t_termios.c_cflag
index c8eed31458bc49ddc066492648e5d361d2ca98aa..3fdc94b858ecaa24f173ef251c6ab4f0d59a8436 100644 (file)
@@ -180,6 +180,9 @@ struct winsize {
 #define TIOCPTYGRANT    _IO('t', 84)            /* grantpt(3) */
 #define TIOCPTYGNAME    _IOC(IOC_OUT, 't', 83, 128)     /* ptsname(3) */
 #define TIOCPTYUNLK     _IO('t', 82)            /* unlockpt(3) */
+#ifdef KERNEL
+#define TIOCREVOKE       _IO('t', 81)
+#endif
 
 #define TTYDISC         0               /* termios tty line discipline */
 #define TABLDISC        3               /* tablet discipline */
index bc91fde31e9d52969bccc1cf6961892d4a5ded67..e0a5cca0ff1758e2d865af6528c0822060638854 100644 (file)
@@ -157,7 +157,7 @@ int     ubc_create_upl_external(vnode_t, off_t, int, upl_t *, upl_page_info_t **
 int     ubc_create_upl_kernel(vnode_t, off_t, int, upl_t *, upl_page_info_t **, int, vm_tag_t);
 #endif  /* XNU_KERNEL_PRIVATE */
 
-__attribute__((pure)) boolean_t ubc_is_mapped(const struct vnode *, boolean_t *writable);
+boolean_t ubc_is_mapped(const struct vnode *, boolean_t *writable);
 __attribute__((pure)) boolean_t ubc_is_mapped_writable(const struct vnode *);
 
 uint32_t cluster_max_io_size(mount_t, int);
index febbf1aea151439de5a04949be507f5e1008905a..b013af853698b3e4d1d1bc194289cf818f5ab3ff 100644 (file)
@@ -78,6 +78,7 @@
 struct label;
 
 #ifdef __APPLE_API_UNSTABLE
+#ifdef KERNEL
 #include <sys/queue.h>
 
 /*
@@ -119,6 +120,11 @@ struct ucred {
         */
        struct au_session cr_audit;             /* user auditing data */
 };
+#else /* KERNEL */
+struct ucred;
+struct posix_cred;
+#endif /* KERNEL */
+
 #ifndef _KAUTH_CRED_T
 #define _KAUTH_CRED_T
 typedef struct ucred *kauth_cred_t;
index 91f00abb291ea21330f5cc9835d5a1772ee05b13..86b3eb2217772e50cc12692c078fffb8da4a7bae 100644 (file)
  * WARNING - make sure to check when adding flags!  Be sure new flags
  * don't overlap the definitions in uio.h
  */
-//     UIO_USERSPACE                           0       defined in uio.h
-#define UIO_USERISPACE                  1
-//     UIO_SYSSPACE                            2       defined in uio.h
-#define UIO_PHYS_USERSPACE              3
-#define UIO_PHYS_SYSSPACE               4
-//     UIO_USERSPACE32                         5       defined in uio.h
-#define UIO_USERISPACE32                6
-#define UIO_PHYS_USERSPACE32    7
-//     UIO_USERSPACE64                         8       defined in uio.h
-#define UIO_USERISPACE64                9
-#define UIO_PHYS_USERSPACE64    10
-//     UIO_SYSSPACE32                          11      defined in uio.h
-//  UIO_PHYS_SYSSPACE32                        12      reserved, never used. Use UIO_PHYS_SYSSPACE
-//  UIO_SYSSPACE64                             13      reserved, never used. Use UIO_SYSSPACE
-//  UIO_PHYS_SYSSPACE64                        14      reserved, never used. Use UIO_PHYS_SYSSPACE
+//      UIO_USERSPACE           0       defined in uio.h
+#define UIO_USERISPACE          ((enum uio_seg)1)
+//      UIO_SYSSPACE            2       defined in uio.h
+#define UIO_PHYS_USERSPACE      ((enum uio_seg)3)
+#define UIO_PHYS_SYSSPACE       ((enum uio_seg)4)
+//      UIO_USERSPACE32         5       defined in uio.h
+#define UIO_USERISPACE32        ((enum uio_seg)6)
+#define UIO_PHYS_USERSPACE32    ((enum uio_seg)7)
+//      UIO_USERSPACE64         8       defined in uio.h
+#define UIO_USERISPACE64        ((enum uio_seg)9)
+#define UIO_PHYS_USERSPACE64    ((enum uio_seg)10)
+//      UIO_SYSSPACE32          11      defined in uio.h
+//      UIO_PHYS_SYSSPACE32     12      reserved, never used. Use UIO_PHYS_SYSSPACE
+//      UIO_SYSSPACE64          13      reserved, never used. Use UIO_SYSSPACE
+//      UIO_PHYS_SYSSPACE64     14      reserved, never used. Use UIO_PHYS_SYSSPACE
 
 __BEGIN_DECLS
 struct user_iovec;
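For orientation, these segment-space constants feed the a_spacetype argument of uio_create(). A minimal kernel sketch, assuming a vnode held with an iocount and the standard uio KPIs from <sys/uio.h>:

    #include <sys/errno.h>
    #include <sys/uio.h>
    #include <sys/vnode_if.h>

    /* Read len bytes at off into a kernel buffer via a UIO_SYSSPACE uio. */
    static int
    read_into_kernel_buffer(vnode_t vp, void *buf, user_size_t len,
        off_t off, vfs_context_t ctx)
    {
            uio_t uio = uio_create(1, off, UIO_SYSSPACE, UIO_READ);
            if (uio == NULL) {
                    return ENOMEM;
            }
            uio_addiov(uio, CAST_USER_ADDR_T(buf), len);
            int error = VNOP_READ(vp, uio, 0, ctx);
            uio_free(uio);
            return error;
    }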
index bb48d3a723f4913e8979ec2d3a6c2ef44947e637..b86d10eeff2d2c6e4767b1c569106ef71bb56c8c 100644 (file)
 #ifndef _SYS_ULOCK_H
 #define _SYS_ULOCK_H
 
+#include <mach/mach_port.h>
+#include <sys/cdefs.h>
+#include <stdint.h>
+
 __BEGIN_DECLS
 
 #if PRIVATE
@@ -64,23 +68,30 @@ extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value);
 #endif /* !KERNEL */
 
 /*
- * operation bits [7, 0] contain the operation code
+ * operation bits [7, 0] contain the operation code.
+ *
+ * NOTE: make sure to add logic for handling any new
+ *       types to kdp_ulock_find_owner()
  */
-#define UL_COMPARE_AND_WAIT                             1
-#define UL_UNFAIR_LOCK                                  2
+#define UL_COMPARE_AND_WAIT             1
+#define UL_UNFAIR_LOCK                  2
+#define UL_COMPARE_AND_WAIT_SHARED      3
+#define UL_UNFAIR_LOCK64_SHARED         4
+#define UL_COMPARE_AND_WAIT64           5
+#define UL_COMPARE_AND_WAIT64_SHARED    6
 /* obsolete names */
-#define UL_OSSPINLOCK                                   UL_COMPARE_AND_WAIT
-#define UL_HANDOFFLOCK                                  UL_UNFAIR_LOCK
+#define UL_OSSPINLOCK                   UL_COMPARE_AND_WAIT
+#define UL_HANDOFFLOCK                  UL_UNFAIR_LOCK
 /* These operation codes are only implemented in (DEVELOPMENT || DEBUG) kernels */
 #define UL_DEBUG_SIMULATE_COPYIN_FAULT  253
-#define UL_DEBUG_HASH_DUMP_ALL                  254
-#define UL_DEBUG_HASH_DUMP_PID                  255
+#define UL_DEBUG_HASH_DUMP_ALL          254
+#define UL_DEBUG_HASH_DUMP_PID          255
 
 /*
  * operation bits [15, 8] contain the flags for __ulock_wake
  */
-#define ULF_WAKE_ALL                                    0x00000100
-#define ULF_WAKE_THREAD                                 0x00000200
+#define ULF_WAKE_ALL                    0x00000100
+#define ULF_WAKE_THREAD                 0x00000200
 
 /*
  * operation bits [23, 16] contain the flags for __ulock_wait
@@ -92,14 +103,19 @@ extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value);
  *
  * @const ULF_WAIT_CANCEL_POINT
  * This wait is a cancelation point
+ *
+ * @const ULF_WAIT_ADAPTIVE_SPIN
+ * Use adaptive spinning when the thread that currently holds the unfair lock
+ * is on core.
  */
 #define ULF_WAIT_WORKQ_DATA_CONTENTION  0x00010000
 #define ULF_WAIT_CANCEL_POINT           0x00020000
+#define ULF_WAIT_ADAPTIVE_SPIN          0x00040000
 
 /*
  * operation bits [31, 24] contain the generic flags
  */
-#define ULF_NO_ERRNO                                    0x01000000
+#define ULF_NO_ERRNO                    0x01000000
 
 /*
  * masks
@@ -109,12 +125,12 @@ extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value);
 #define ULF_GENERIC_MASK        0xFFFF0000
 
 #define ULF_WAIT_MASK           (ULF_NO_ERRNO | \
-                                                        ULF_WAIT_WORKQ_DATA_CONTENTION | \
-                                                        ULF_WAIT_CANCEL_POINT)
+                                ULF_WAIT_WORKQ_DATA_CONTENTION | \
+                                ULF_WAIT_CANCEL_POINT | ULF_WAIT_ADAPTIVE_SPIN)
 
-#define ULF_WAKE_MASK           (ULF_WAKE_ALL | \
-                                                        ULF_WAKE_THREAD | \
-                                                        ULF_NO_ERRNO)
+#define ULF_WAKE_MASK           (ULF_NO_ERRNO | \
+                                ULF_WAKE_ALL | \
+                                ULF_WAKE_THREAD)
 
 #endif /* PRIVATE */
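A user-space sketch composing an operation word from the bits above, assuming the private __ulock_wait() prototype declared earlier in this header (operation, address, expected value, microsecond timeout; 0 waits forever):

    /* Park on a 32-bit unfair-lock word, opting in to adaptive spinning
     * while the owner is on core and keeping raw error returns. */
    static int
    unfair_lock_wait(uint32_t *lock, uint32_t owner_value)
    {
            uint32_t op = UL_UNFAIR_LOCK | ULF_NO_ERRNO | ULF_WAIT_ADAPTIVE_SPIN;
            return __ulock_wait(op, lock, owner_value, 0);
    }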
 
index 4c79d0d8fddaa999ba0d8be9e91f49c50a0883e9..42734a4de5742c7dc14a8b6660ff79e43e32cdf3 100644 (file)
@@ -118,6 +118,11 @@ struct uthread {
        u_int64_t uu_arg[8]; /* arguments to current system call */
        int uu_rval[2];
        char uu_cursig; /* p_cursig for exc. */
+       /*
+        * uu_workq_pthread_kill_allowed is not modified under a lock and thus
+        * relies on single copy atomicity and cannot be changed to a bitfield.
+        */
+       bool uu_workq_pthread_kill_allowed;
        unsigned int syscall_code; /* current syscall code */
 
        /* thread exception handling */
@@ -135,37 +140,13 @@ struct uthread {
                        int32_t *retval;                    /* place to store return val */
                } uus_select_data;
 
-               struct _kqueue_scan {
-                       kevent_callback_t call;             /* per-event callback */
-                       kqueue_continue_t cont;             /* whole call continuation */
-                       filt_process_data_t process_data;   /* needed for filter processing */
-                       uint64_t deadline;                  /* computed deadline for operation */
-                       void *data;                         /* caller's private data */
-               } uus_kqueue_scan;                       /* saved state for kevent_scan() */
-
-               struct _kevent {
-                       struct _kqueue_scan scan;           /* space for the generic data */
-                       struct fileproc *fp;                /* fileproc we hold iocount on */
-                       int fd;                             /* fd for fileproc (if held) */
-                       int eventcount;                     /* user-level event count */
-                       int eventout;                       /* number of events output */
-                       struct filt_process_s process_data; /* space for process data fed thru */
-                       int32_t *retval;                    /* place to store return val */
-                       user_addr_t eventlist;              /* user-level event list address */
-                       uint64_t data_available;            /* [user/kernel] addr of in/out size */
-               } uus_kevent;                            /* saved state for kevent() */
+               struct kevent_ctx_s uus_kevent;
 
                struct _kevent_register {
-                       struct kevent_internal_s kev;       /* the kevent to maybe copy out */
-                       struct knote *knote;                /* the knote used for the wait */
-                       struct fileproc *fp;                /* fileproc we hold iocount on */
+                       struct kevent_qos_s kev;            /* the kevent to maybe copy out */
                        thread_t handoff_thread;            /* thread we handed off to, has +1 */
-                       struct kqueue *kq;
-                       int fd;                             /* fd for fileproc (if held) */
-                       int eventcount;                     /* user-level event count */
+                       struct kqworkloop *kqwl;
                        int eventout;                       /* number of events output */
-                       unsigned int flags;                 /* flags for kevent_copyout() */
-                       int32_t *retval;                    /* place to store return val */
                        user_addr_t ueventlist;             /* the user-address to copyout to */
                } uus_kevent_register;                   /* saved for EVFILT_WORKLOOP wait */
 
@@ -234,7 +215,10 @@ struct uthread {
        struct kaudit_record    *uu_ar;                 /* audit record */
        struct task*    uu_aio_task;                    /* target task for async io */
 
-       lck_mtx_t       *uu_mtx;
+       union {
+               lck_mtx_t  *uu_mtx;
+               struct knote_lock_ctx *uu_knlock;
+       };
 
        lck_spin_t      uu_rethrottle_lock;     /* locks was_rethrottled and is_throttled */
        TAILQ_ENTRY(uthread) uu_throttlelist;   /* List of uthreads currently throttled */
@@ -258,7 +242,7 @@ struct uthread {
         * Bound kqueue request. This field is only cleared by the current thread,
         * hence can be dereferenced safely by the current thread without locks.
         */
-       struct kqrequest *uu_kqr_bound;
+       struct workq_threadreq_s *uu_kqr_bound;
        TAILQ_ENTRY(uthread) uu_workq_entry;
        mach_vm_offset_t uu_workq_stackaddr;
        mach_port_name_t uu_workq_thport;
@@ -364,9 +348,10 @@ typedef struct uthread * uthread_t;
 #define UT_PASSIVE_IO   0x00000100      /* this thread issues passive I/O */
 #define UT_PROCEXIT     0x00000200      /* this thread completed the  proc exit */
 #define UT_RAGE_VNODES  0x00000400      /* rapid age any vnodes created by this thread */
-#define UT_KERN_RAGE_VNODES     0x00000800      /* rapid age any vnodes created by this thread (kernel set) */
-/* 0x00001000 unused, used to be UT_BACKGROUND_TRAFFIC_MGT */
+#define UT_KERN_RAGE_VNODES        0x00000800 /* rapid age any vnodes created by this thread (kernel set) */
+#define UT_NSPACE_NODATALESSFAULTS 0x00001000 /* thread does not materialize dataless files */
 #define UT_ATIME_UPDATE 0x00002000      /* don't update atime for files accessed by this thread */
+#define UT_NSPACE_FORCEDATALESSFAULTS  0x00004000 /* thread always materializes dataless files */
 #define UT_VFORK        0x02000000      /* thread has vfork children */
 #define UT_SETUID       0x04000000      /* thread is settugid() */
 #define UT_WASSETUID    0x08000000      /* thread was settugid() (in vfork) */
index a21365bfee2ebc8c6f9a31bf97f54e09b646349e..5ec22ac9334186ed90caf88e0f6627f215d41138 100644 (file)
@@ -109,9 +109,13 @@ enum vtagtype   {
        /* 16 - 20 */
        VT_HFS, VT_ZFS, VT_DEVFS, VT_WEBDAV, VT_UDF,
        /* 21 - 25 */
-       VT_AFP, VT_CDDA, VT_CIFS, VT_OTHER, VT_APFS
+       VT_AFP, VT_CDDA, VT_CIFS, VT_OTHER, VT_APFS,
+       /* 26 */
+       VT_LOCKERFS,
 };
 
+#define HAVE_VT_LOCKERFS 1
+
 /*
  * flags for VNOP_BLOCKMAP
  */
@@ -467,10 +471,16 @@ struct vnode_trigger_param {
  * VNT_NO_DIRECT_MOUNT:
  * A trigger vnode instance that doesn't directly trigger a mount,
  * instead it triggers the mounting of sub-trigger nodes.
+ *
+ * VNT_KERN_RESOLVE:
+ * A trigger vnode where all parameters have been set by the kernel,
+ * such as NFS mirror mounts.
  */
 #define VNT_AUTO_REARM          (1 << 0)
 #define VNT_NO_DIRECT_MOUNT     (1 << 1)
-#define VNT_VALID_MASK          (VNT_AUTO_REARM | VNT_NO_DIRECT_MOUNT)
+#define VNT_KERN_RESOLVE        (1 << 2)
+#define VNT_VALID_MASK          (VNT_AUTO_REARM | VNT_NO_DIRECT_MOUNT | \
+                                VNT_KERN_RESOLVE)
 
 #endif /* KERNEL_PRIVATE */
 
@@ -753,6 +763,8 @@ struct vnode_attr {
 #define VA_NOINHERIT            0x040000        /* Don't inherit ACLs from parent */
 #define VA_NOAUTH               0x080000
 #define VA_64BITOBJIDS          0x100000        /* fileid/linkid/parentid are 64 bit */
+#define VA_REALFSID             0x200000        /* Return real fsid */
+#define VA_USEFSID              0x400000        /* Use fsid from filesystem  */
 
 /*
  *  Modes.  Some values same as Ixxx entries from inode.h for now.
@@ -794,6 +806,7 @@ extern int              vttoif_tab[];
 #define VNODE_REMOVE_NODELETEBUSY                       0x0001 /* Don't delete busy files (Carbon) */
 #define VNODE_REMOVE_SKIP_NAMESPACE_EVENT       0x0002 /* Do not upcall to userland handlers */
 #define VNODE_REMOVE_NO_AUDIT_PATH              0x0004 /* Do not audit the path */
+#define VNODE_REMOVE_DATALESS_DIR               0x0008 /* Special handling for removing a dataless directory without materialization */
 
 /* VNOP_READDIR flags: */
 #define VNODE_READDIR_EXTENDED    0x0001   /* use extended directory entries */
@@ -825,7 +838,7 @@ struct vnodeopv_entry_desc {
 struct vnodeopv_desc {
        /* ptr to the ptr to the vector where op should go */
        int(***opv_desc_vector_p)(void *);
-       struct vnodeopv_entry_desc *opv_desc_ops;   /* null terminated list */
+       const struct vnodeopv_entry_desc *opv_desc_ops;   /* null terminated list */
 };
 
 /*!
@@ -973,6 +986,14 @@ enum vtype      vnode_vtype(vnode_t vp);
  */
 uint32_t        vnode_vid(vnode_t vp);
 
+/*!
+ *  @function vnode_isonexternalstorage
+ *  @abstract Return whether or not the storage device backing a vnode is external or not.
+ *  @param vp The vnode whose physical location is to be determined.
+ *  @return TRUE if storage device is external, FALSE if otherwise.
+ */
+boolean_t vnode_isonexternalstorage(vnode_t vp);
+
 /*!
  *  @function vnode_mountedhere
  *  @abstract Returns a pointer to a mount placed on top of a vnode, should it exist.
@@ -1111,7 +1132,26 @@ int     vnode_isnamedstream(vnode_t vp);
  *  @return 0 if the operation is successful, an error otherwise.
  */
 errno_t vnode_setasnamedstream(vnode_t vp, vnode_t svp);
-#endif
+
+/*!
+ *  @function vnode_setasfirmlink
+ *  @abstract Set a vnode to act as a firmlink, i.e. point to a target vnode.
+ *  @param vp The vnode which is to be acted on as a firmlink.
+ *  @param target_vp The vnode which will be the target of the firmlink.
+ *  @return 0 if the operation is successful, an error otherwise.
+ */
+errno_t vnode_setasfirmlink(vnode_t vp, vnode_t target_vp);
+
+/*!
+ *  @function vnode_getfirmlink
+ *  @abstract If a vnode is a firmlink, get its target vnode.
+ *  @param vp The firmlink vnode.
+ *  @param target_vp The firmlink target vnode. This vnode is returned with an iocount.
+ *  @return 0 if the operation is successful, an error otherwise.
+ */
+errno_t vnode_getfirmlink(vnode_t vp, vnode_t *target_vp);
+
+#endif /* KERNEL_PRIVATE */
 
 /*!
  *  @function vnode_ismountedon
@@ -1637,6 +1677,18 @@ int     vnode_ismonitored(vnode_t vp);
 int     vnode_isdyldsharedcache(vnode_t vp);
 
 
+/*!
+ *  @function vn_authorize_unlink
+ *  @abstract Authorize an unlink operation given the vfs_context_t
+ *  @discussion Check if the context associated with the vfs_context_t is allowed to unlink the vnode vp in directory dvp.
+ *  @param dvp Parent vnode of the file to be unlinked
+ *  @param vp The vnode to be unlinked
+ *  @param cnp A componentname containing the name of the file to be unlinked.  May be NULL.
+ *  @param ctx Context against which to authenticate the unlink request.
+ *  @param reserved Pass NULL
+ *  @return Zero if the operation is allowed; non-zero indicates the unlink is not authorized.
+ */
+int     vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved);
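A sketch of the intended call pattern: authorize first, then perform the remove. The wrapper below is hypothetical and elides richer error handling:

    /* Gate an in-kernel unlink on the policy check before VNOP_REMOVE. */
    static int
    remove_if_allowed(vnode_t dvp, vnode_t vp, struct componentname *cnp,
        vfs_context_t ctx)
    {
            int error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
            if (error != 0) {
                    return error; /* caller is not authorized */
            }
            return VNOP_REMOVE(dvp, vp, cnp, 0, ctx);
    }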
+
 /*!
  *  @function vn_getpath_fsenter
  *  @abstract Attempt to get a vnode's path, willing to enter the filesystem.
@@ -1651,6 +1703,19 @@ int     vnode_isdyldsharedcache(vnode_t vp);
  */
 int     vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len);
 
+/*!
+ *  @function vn_getpath_no_firmlink
+ *  @abstract Attempt to get a vnode's path without a firm-link translation.
+ *  @discussion Paths to vnodes are not always straightforward: a file with multiple hard-links will have multiple pathnames,
+ *  and it is sometimes impossible to determine a vnode's full path. Like vn_getpath, it will not reenter the filesystem.
+ *  @param vp Vnode whose path to get
+ *  @param pathbuf Buffer in which to store path.
+ *  @param len Destination for length of resulting path string.  Result will include NULL-terminator in count--that is, "len"
+ *  will be strlen(pathbuf) + 1.
+ *  @return 0 for success or an error.
+ */
+int     vn_getpath_no_firmlink(struct vnode *vp, char *pathbuf, int *len);
+
 /*!
  *  @function vn_getpath_fsenter_with_parent
  *  @abstract Attempt to get a vnode's path by entering the file system if needed given a vnode and it's directory vnode.
@@ -1666,6 +1731,27 @@ int     vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len);
  */
 int     vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbuf, int *len);
 
+/*!
+ *  @function vn_getpath_ext
+ *  @abstract Attempt to get a vnode's path without re-entering the filesystem (unless passed an option to allow it)
+ *  @discussion Paths to vnodes are not always straightforward: a file with multiple hard-links will have multiple pathnames,
+ *  and it is sometimes impossible to determine a vnode's full path.  vn_getpath_fsenter() may enter the filesystem
+ *  to try to construct a path, so filesystems should be wary of calling it.
+ *  @param vp Vnode whose path to get
+ *  @param dvp parent vnode of vnode whose path to get, can be NULL if not available.
+ *  @param pathbuf Buffer in which to store path.
+ *  @param len Destination for length of resulting path string.  Result will include NULL-terminator in count--that is, "len"
+ *  will be strlen(pathbuf) + 1.
+ *  @param flags flags for controlling behavior.
+ *  @return 0 for success or an error.
+ */
+int     vn_getpath_ext(struct vnode *vp, struct vnode *dvp, char *pathbuf, int *len, int flags);
+
+/* supported flags for vn_getpath_ext */
+#define VN_GETPATH_FSENTER              0x0001 /* Can re-enter filesystem */
+#define VN_GETPATH_NO_FIRMLINK          0x0002
+#define VN_GETPATH_VOLUME_RELATIVE      0x0004 /* also implies VN_GETPATH_NO_FIRMLINK */
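A kernel-side sketch of the new entry point with the flags just defined; the helper is hypothetical and assumes MALLOC/FREE from <sys/malloc.h>:

    #include <sys/malloc.h>

    /* Fetch a vnode's path with firmlink translation suppressed. */
    static int
    log_real_path(vnode_t vp)
    {
            char *path;
            int len = MAXPATHLEN;
            int error;

            MALLOC(path, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
            error = vn_getpath_ext(vp, NULLVP, path, &len, VN_GETPATH_NO_FIRMLINK);
            if (error == 0) {
                    printf("real path: %s\n", path);
            }
            FREE(path, M_TEMP);
            return error;
    }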
+
 #endif /* KERNEL_PRIVATE */
 
 #define VNODE_UPDATE_PARENT     0x01
@@ -1673,6 +1759,9 @@ int     vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char
 #define VNODE_UPDATE_NAME       0x02
 #define VNODE_UPDATE_CACHE      0x04
 #define VNODE_UPDATE_PURGE      0x08
+#ifdef BSD_KERNEL_PRIVATE
+#define VNODE_UPDATE_PURGEFIRMLINK      0x10
+#endif
 /*!
  *  @function vnode_update_identity
  *  @abstract Update vnode data associated with the vfs cache.
@@ -1833,12 +1922,26 @@ int     vfs_get_notify_attributes(struct vnode_attr *vap);
  */
 errno_t vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx);
 
+#ifdef KERNEL_PRIVATE
+/*!
+ *  @function vnode_lookupat
+ *  @abstract Convert a path into a vnode, starting from a directory vnode when the path is relative.
+ *  @discussion This routine is a thin wrapper around xnu-internal lookup routines; if successful,
+ *  it returns with an iocount held on the resulting vnode which must be dropped with vnode_put().
+ *  @param path Path to look up.
+ *  @param flags VNODE_LOOKUP_NOFOLLOW: do not follow symbolic links.  VNODE_LOOKUP_NOCROSSMOUNT: do not cross mount points.
+ *  @param start_dvp vnode of directory to start lookup from. This parameter is ignored if path is absolute. start_dvp should
+ *         have an iocount on it.
+ *  @return 0 for success or an error code.
+ */
+errno_t vnode_lookupat(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx, vnode_t start_dvp);
+#endif
+
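A sketch of vnode_lookupat() with a relative path, assuming the caller already holds an iocount on start_dvp; the path and helper name are illustrative:

    /* Resolve "logs/latest" relative to start_dvp; the result carries an
     * iocount that vnode_put() must drop. */
    static int
    lookup_relative(vnode_t start_dvp, vfs_context_t ctx)
    {
            vnode_t vp = NULLVP;
            int error = vnode_lookupat("logs/latest", VNODE_LOOKUP_NOFOLLOW,
                &vp, ctx, start_dvp);
            if (error == 0) {
                    /* ... use vp ... */
                    vnode_put(vp);
            }
            return error;
    }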
 /*!
  *  @function vnode_open
  *  @abstract Open a file identified by a path--roughly speaking an in-kernel open(2).
- *  @discussion If vnode_open() succeeds, it returns with both an iocount and a usecount on the returned vnode.  These must
- *  be released eventually; the iocount should be released with vnode_put() as soon as any initial operations
- *  on the vnode are over, whereas the usecount should be released via vnode_close().
+ *  @discussion If vnode_open() succeeds, it returns with both an iocount and a usecount on the
+ *  returned vnode. Both will be released once vnode_close() is called.
  *  @param path Path to look up.
  *  @param fmode e.g. O_NONBLOCK, O_APPEND; see bsd/sys/fcntl.h.
  *  @param cmode Permissions with which to create file if it does not exist.
@@ -2132,9 +2235,9 @@ int vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp);
 #ifdef BSD_KERNEL_PRIVATE
 /* Not in export list so can be private */
 struct stat;
-int     vn_stat(struct vnode *vp, void * sb, kauth_filesec_t *xsec, int isstat64,
+int     vn_stat(struct vnode *vp, void * sb, kauth_filesec_t *xsec, int isstat64, int needsrealdev,
     vfs_context_t ctx);
-int     vn_stat_noauth(struct vnode *vp, void * sb, kauth_filesec_t *xsec, int isstat64,
+int     vn_stat_noauth(struct vnode *vp, void * sb, kauth_filesec_t *xsec, int isstat64, int needsrealdev,
     vfs_context_t ctx, struct ucred *file_cred);
 int     vaccess(mode_t file_mode, uid_t uid, gid_t gid,
     mode_t acc_mode, kauth_cred_t cred);
@@ -2231,6 +2334,22 @@ errno_t vfs_setup_vattr_from_attrlist(struct attrlist *alp, struct vnode_attr *v
  */
 errno_t vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, struct vnode_attr *vap, void *fndesc, vfs_context_t ctx);
 
+/*!
+ *  @function vfs_attr_pack_ext
+ *  @abstract Pack a vnode_attr structure into a buffer in the same format as getattrlist(2).
+ *  @discussion Used by a VNOP_GETATTRLISTBULK implementation to pack the data provided in a vnode_attr structure into a buffer the way getattrlist(2) does.
+ *  @param mp The mount structure for the filesystem the packing operation is happening on.
+ *  @param vp If available, the vnode for which the attributes are being given; NULL if the vnode is not available (which will usually be the case for a VNOP_GETATTRLISTBULK implementation).
+ *  @param uio A uio_t initialized with one iovec.
+ *  @param alp Pointer to an attrlist structure.
+ *  @param options Options for the call (same as the options for getattrlistbulk(2)).
+ *  @param vap Pointer to a filled-in vnode_attr structure. Data from the vnode_attr structure will be used to copy and lay out the data in the required format for getattrlistbulk(2) by this function.
+ *  @param fndesc Currently unused
+ *  @param ctx vfs context of caller.
+ *  @return 0 for success or an error code.
+ */
+errno_t vfs_attr_pack_ext(mount_t mp, vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, struct vnode_attr *vap, void *fndesc, vfs_context_t ctx);
+
 #ifdef KERNEL_PRIVATE
 
 // Returns a value suitable, safe and consistent for tracing and logging
@@ -2258,6 +2377,7 @@ void vnode_clearnoflush(vnode_t);
 #define BUILDPATH_CHECKACCESS     0x2 /* Check if parents have search rights */
 #define BUILDPATH_CHECK_MOVED     0x4 /* Return EAGAIN if the parent hierarchy is modified */
 #define BUILDPATH_VOLUME_RELATIVE 0x8 /* Return path relative to the nearest mount point */
+#define BUILDPATH_NO_FIRMLINK     0x10 /* Return non-firmlinked path */
 
 int     build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs_context_t ctx);
 
index 1b8cc8af3cfbb2b284b4060cbcdd51f70e95ab0f..7959ff76452c05db4328e633780ff23d1851814c 100644 (file)
@@ -112,6 +112,7 @@ extern struct vnodeop_desc vnop_ioctl_desc;
 extern struct vnodeop_desc vnop_select_desc;
 extern struct vnodeop_desc vnop_exchange_desc;
 extern struct vnodeop_desc vnop_revoke_desc;
+extern struct vnodeop_desc vnop_mmap_check_desc;
 extern struct vnodeop_desc vnop_mmap_desc;
 extern struct vnodeop_desc vnop_mnomap_desc;
 extern struct vnodeop_desc vnop_fsync_desc;
@@ -593,6 +594,30 @@ struct vnop_revoke_args {
 extern errno_t VNOP_REVOKE(vnode_t, int, vfs_context_t);
 #endif /* XNU_KERNEL_PRIVATE */
 
+struct vnop_mmap_check_args {
+       struct vnodeop_desc *a_desc;
+       vnode_t a_vp;
+       int a_flags;
+       vfs_context_t a_context;
+};
+
+/*!
+ *  @function VNOP_MMAP_CHECK
+ *  @abstract Check with a filesystem if a file can be mmap-ed.
+ *  @discussion VNOP_MMAP_CHECK is used to check with the file system if a
+ *  file can be mmap-ed. It will be called before any call to VNOP_MMAP().
+ *  @param vp The vnode being mmapped.
+ *  @param flags Memory protection: PROT_READ, PROT_WRITE, PROT_EXEC.
+ *  @param ctx Context to authenticate for mmap request.
+ *  @return 0 for success; EPERM if the operation is not permitted; other
+ *  errors (except ENOTSUP) may be returned at the discretion of the file
+ *  system.  ENOTSUP will never be returned by VNOP_MMAP_CHECK().
+ */
+#ifdef XNU_KERNEL_PRIVATE
+extern errno_t VNOP_MMAP_CHECK(vnode_t, int, vfs_context_t);
+#endif /* XNU_KERNEL_PRIVATE */
+
+
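A hypothetical filesystem-side implementation honoring the contract above (return EPERM to veto, never ENOTSUP); myfs_is_dataless is an assumed helper:

    #include <sys/errno.h>
    #include <sys/mman.h>

    extern bool myfs_is_dataless(vnode_t vp); /* assumed helper */

    /* Veto writable mappings of files whose contents are not yet
     * materialized; allow everything else. */
    static int
    myfs_vnop_mmap_check(struct vnop_mmap_check_args *ap)
    {
            if ((ap->a_flags & PROT_WRITE) && myfs_is_dataless(ap->a_vp)) {
                    return EPERM;
            }
            return 0;
    }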
 struct vnop_mmap_args {
        struct vnodeop_desc *a_desc;
        vnode_t a_vp;
@@ -753,6 +778,12 @@ enum {
        VFS_RENAME_SWAP                 = 0x00000002,
        VFS_RENAME_EXCL                 = 0x00000004,
 
+       /*
+        * VFS_RENAME_DATALESS is kernel-only and is intentionally
+        * not included in VFS_RENAME_FLAGS_MASK.
+        */
+       VFS_RENAME_DATALESS             = 0x00000008,
+
        VFS_RENAME_FLAGS_MASK   = (VFS_RENAME_SECLUDE | VFS_RENAME_SWAP
            | VFS_RENAME_EXCL),
 };
index bde95e48c9317eb7b2b53f098d5d4ba8c7e2cf4d..4e271502f2ce83f77287962fcaebbff0c58e02fa 100644 (file)
@@ -179,6 +179,11 @@ struct vnode {
 #if CONFIG_TRIGGERS
        vnode_resolve_t v_resolve;              /* trigger vnode resolve info (VDIR only) */
 #endif /* CONFIG_TRIGGERS */
+#if CONFIG_FIRMLINKS
+       vnode_t v_fmlink;                       /* firmlink if set (VDIR only); points to the
+                                                *  source if VFLINKTARGET is set, otherwise
+                                                *  points to the target */
+#endif /* CONFIG_FIRMLINKS */
 };
 
 #define v_mountedhere   v_un.vu_mountedhere
@@ -260,8 +265,8 @@ struct vnode {
 #define VISDIRTY       0x4000000        /* vnode will need IO if reclaimed */
 #define VFASTDEVCANDIDATE  0x8000000        /* vnode is a candidate to store on a fast device */
 #define VAUTOCANDIDATE 0x10000000       /* vnode was automatically marked as a fast-dev candidate */
+#define VFMLINKTARGET  0x20000000       /* vnode is firmlink target */
 /*
- *  0x20000000 not used
  *  0x40000000 not used
  *  0x80000000 not used.
  */
@@ -435,7 +440,6 @@ int     vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fm
 int     vn_authorize_create(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*);
 int     vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx);
 void    vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields);
-int     vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved);
 int     vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
     struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
     vfs_context_t ctx, void *reserved);
@@ -585,8 +589,6 @@ int vfs_sysctl_node SYSCTL_HANDLER_ARGS;
 void vnode_setneedinactive(vnode_t);
 int     vnode_hasnamedstreams(vnode_t); /* Does this vnode have associated named streams? */
 
-void nspace_proc_exit(struct proc *p);
-
 errno_t
 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
     int *numdirent, vfs_context_t ctxp);
@@ -605,6 +607,11 @@ void vfs_nested_trigger_unmounts(mount_t, int, vfs_context_t);
 
 int     build_path_with_parent(vnode_t, vnode_t /* parent */, char *, int, int *, int, vfs_context_t);
 
+void    nspace_resolver_init(void);
+void    nspace_resolver_exited(struct proc *);
+
+int     vnode_materialize_dataless_file(vnode_t, uint64_t);
+
 #endif /* BSD_KERNEL_PRIVATE */
 
 #endif /* !_SYS_VNODE_INTERNAL_H_ */
index ab5d80fb2df6e5f6d30e9111c1e91dcfb34dc335..695a28ea1913b1c58bac6576dabb98bee86fe2ea 100644 (file)
@@ -118,6 +118,7 @@ __BEGIN_DECLS
 #define WORK_INTERVAL_TYPE_CA_RENDER_SERVER     (0x2 << 28)
 #define WORK_INTERVAL_TYPE_CA_CLIENT            (0x3 << 28)
 #define WORK_INTERVAL_TYPE_HID_DELIVERY         (0x4 << 28)
+#define WORK_INTERVAL_TYPE_COREMEDIA            (0x5 << 28)
 #define WORK_INTERVAL_TYPE_LAST                 (0xF << 28)
 
 #ifndef KERNEL
diff --git a/bsd/sys_private/Makefile b/bsd/sys_private/Makefile
new file mode 100644 (file)
index 0000000..1789c0c
--- /dev/null
@@ -0,0 +1,37 @@
+# This private directory is necessary for BSD headers bound for
+# `/usr/local/include/sys/` and the System framework.
+
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+INSTALL_MI_DIR = sys
+INCDIR = $(SDKHEADERSROOT)/usr/local/include
+DRIVERKITINCDIR = $(DRIVERKITSDKHEADERSROOT)/usr/local/include
+
+# Installs header files for Apple internal use in
+#     $(DSTROOT)/usr/local/include/sys
+
+INSTALL_MI_LIST = \
+       kdebug_private.h
+
+INSTALL_DRIVERKIT_MI_LIST = \
+       kdebug_private.h
+
+# Installs header files for Apple internal use in
+#     $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders/sys
+
+INSTALL_MI_LCL_LIST = \
+       kdebug_private.h
+
+EXPORT_MI_DIR = sys
+
+EXPORT_MI_LIST = \
+       kdebug_private.h
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
diff --git a/bsd/sys_private/kdebug_private.h b/bsd/sys_private/kdebug_private.h
new file mode 100644 (file)
index 0000000..6444ea6
--- /dev/null
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef BSD_KDEBUG_PRIVATE_H
+#define BSD_KDEBUG_PRIVATE_H
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <sys/cdefs.h>
+#include <sys/kdebug.h>
+
+__BEGIN_DECLS
+
+#if !KERNEL
+
+#include <Availability.h>
+
+#pragma mark - user space SPI
+
+/*
+ * OS components can use the full precision of the "code" field
+ * (Class, SubClass, Code) to inject events using kdebug_trace() by
+ * using:
+ *
+ * kdebug_trace(KDBG_CODE(DBG_XPC, 15, 1) | DBG_FUNC_NONE, 1, 2, 3, 4);
+ *
+ * These trace points can be included in production code, since they
+ * use reserved, non-overlapping ranges.  The performance impact when
+ * kernel tracing is not enabled is minimal.  However, when tracing is enabled,
+ * each tracepoint becomes a syscall.  For this reason, os_signpost(3) is
+ * recommended instead of kdebug_trace(2).
+ *
+ * Classes can be reserved by filing a Radar in xnu | ktrace.
+ *
+ * 64-bit arguments may be truncated if the system is using a 32-bit
+ * kernel.
+ *
+ * On error, -1 will be returned and errno will indicate the error.
+ */
+int kdebug_trace(uint32_t code, uint64_t arg1, uint64_t arg2, uint64_t arg3,
+    uint64_t arg4)
+__OSX_AVAILABLE(10.10) __IOS_AVAILABLE(8.2);
+
+/*!
+ * @function kdebug_trace_string
+ *
+ * @discussion
+ * This function emits strings to kdebug trace along with an ID and allows
+ * for previously-traced strings to be overwritten and invalidated.
+ *
+ * To start tracing a string and generate an ID to use to refer to it:
+ *
+ *      string_id = kdebug_trace_string(debugid, 0, "string");
+ *
+ * To replace a string previously traced:
+ *
+ *      string_id = kdebug_trace_string(debugid, string_id, "new string");
+ *
+ * To invalidate a string ID:
+ *
+ *      string_id = kdebug_trace_string(debugid, string_id, NULL);
+ *
+ * To check for errors:
+ *
+ *      if ((int64_t)string_id == -1) { perror("string error") }
+ *
+ * @param debugid
+ * The `debugid` to check if its enabled before tracing and include as
+ * an argument in the event containing the string.
+ *
+ * Some classes or subclasses are reserved for specific uses and are not
+ * allowed to be used with this function.  No function qualifiers are
+ * allowed on `debugid`.
+ *
+ * @param str_id
+ * When 0, a new ID will be generated and returned if tracing is
+ * enabled.
+ *
+ * Otherwise `str_id` must contain an ID that was previously generated
+ * with this function.  Clients should pass NULL in `str` if `str_id`
+ * is no longer in use.  Otherwise, the string previously mapped to
+ * `str_id` will be overwritten with the contents of `str`.
+ *
+ * @param str
+ * A NUL-terminated 'C' string containing the characters that should be
+ * traced alongside `str_id`.
+ *
+ * If necessary, the string will be truncated at an
+ * implementation-defined length.  The string must not be the empty
+ * string, but can be NULL if a valid `str_id` is provided.
+ *
+ * @return
+ * 0 if tracing is disabled or `debugid` is being filtered out of trace.
+ * It can also return (int64_t)-1 if an error occurred. Otherwise,
+ * it returns the ID to use to refer to the string in future
+ * kdebug_trace(2) calls.
+ *
+ * The errors that can occur are:
+ *
+ * EINVAL
+ *      There are function qualifiers on `debugid`, `str` is empty, or
+ *      `str_id` was not generated by this function.
+ * EPERM
+ *      The `debugid`'s class or subclass is reserved for internal use.
+ * EFAULT
+ *      `str` is an invalid address or NULL when `str_id` is 0.
+ */
+extern uint64_t kdebug_trace_string(uint32_t debugid, uint64_t str_id,
+    const char *str)
+__OSX_AVAILABLE(10.11) __IOS_AVAILABLE(9.0);
+
+/*
+ * Although the performance impact of kdebug_trace() when kernel
+ * tracing is not enabled is minimal, it may require the caller to
+ * perform an expensive calculation/summarization. This cost can be
+ * skipped by checking the kdebug_is_enabled() predicate:
+ *
+ * if (kdebug_is_enabled(KDBG_CODE(DBG_XPC, 15, 1))) {
+ *     uint64_t arg1 = ...;
+ *     uint64_t arg2 = ...;
+ *     kdebug_trace(KDBG_CODE(DBG_XPC, 15, 1) | DBG_FUNC_NONE, arg1, arg2, 0, 0);
+ * }
+ *
+ * If tracing is enabled for the code at the time of the check, 1
+ * will be returned. Otherwise, 0 will be returned.
+ */
+extern bool kdebug_is_enabled(uint32_t code)
+__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0)
+__WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0);
+
+/*
+ * Returns a pointer to the userspace typefilter, if one is available.
+ * May return NULL.
+ */
+extern void *kdebug_typefilter(void)
+__OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0)
+__WATCHOS_AVAILABLE(3.0) __TVOS_AVAILABLE(10.0);
+
+/*
+ * Returns true if kdebug is using continuous time for its events, and false
+ * otherwise.
+ */
+extern bool kdebug_using_continuous_time(void)
+__API_AVAILABLE(macos(10.15), ios(13), tvos(13), watchos(6));
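A small user-space sketch pairing the predicate with the matching Mach clock, so locally captured timestamps line up with kdebug's event stream:

    #include <mach/mach_time.h>

    /* Sample the same timebase kdebug is currently stamping events with. */
    static uint64_t
    trace_now(void)
    {
            return kdebug_using_continuous_time()
                ? mach_continuous_time()
                : mach_absolute_time();
    }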
+
+#endif /* !KERNEL */
+
+#pragma mark - private debugids
+
+#define DBG_PPT         36
+#define DBG_PERFCTRL    39
+#define DBG_CLPC        50
+#define DBG_MUSE        52
+
+/* **** 128 to 139 are reserved for IOP tracing **** */
+#define DBG_ANS         128
+#define DBG_SIO         129
+#define DBG_SEP         130
+#define DBG_ISP         131
+#define DBG_OSCAR       132
+#define DBG_EMBEDDEDGFX 133
+#define DBG_PMP         134
+#define DBG_RTKIT       135
+
+#define MACH_BRIDGE_RCV_TS      0x1     /* receive timestamp pair from interrupt handler */
+#define MACH_BRIDGE_REMOTE_TIME 0x2     /* calculate remote timestamp */
+#define MACH_BRIDGE_RESET_TS    0x3     /* reset timestamp conversion parameters */
+#define MACH_BRIDGE_TS_PARAMS   0x4     /* recompute timestamp conversion parameters */
+#define MACH_BRIDGE_SKIP_TS     0x5     /* skip timestamp */
+#define MACH_BRIDGE_TS_MISMATCH 0x6     /* mismatch between predicted and received remote timestamp */
+#define MACH_BRIDGE_OBSV_RATE   0x7     /* out of range observed rates */
+
+/* DBG_SKYWALK has same toplevel code as DBG_DLIL, so don't reuse subcodes */
+#define DBG_SKYWALK_FLOWSWITCH  0x11
+#define DBG_SKYWALK_NETIF       0x12
+#define DBG_SKYWALK_CHANNEL     0x13
+
+#define PPT_TEST            0x01
+#define PPT_JETSAM_HIWAT    0x02
+#define PPT_JETSAM_TOPPROC  0x03
+
+#define SKYWALKDBG_CODE(SubClass, code) KDBG_CODE(DBG_DLIL, SubClass, code)
+#define PPTDBG_CODE(SubClass, code) KDBG_CODE(DBG_PPT, SubClass, code)
+#define PERFCTRL_CODE(SubClass, code) KDBG_CODE(DBG_PERFCTRL, SubClass, code)
+
+#if !defined(DRIVERKIT)
+
+extern unsigned int kdebug_enable;
+
+/*
+ * Bits used by kdebug_enable.  These control which events are traced at
+ * runtime.
+ */
+#define KDEBUG_ENABLE_TRACE   (1U << 0)
+#define KDEBUG_ENABLE_ENTROPY (1U << 1) /* obsolete */
+#define KDEBUG_ENABLE_CHUD    (1U << 2) /* obsolete */
+#define KDEBUG_ENABLE_PPT     (1U << 3) /* obsolete */
+#define KDEBUG_ENABLE_SERIAL  (1U << 4) /* obsolete */
+
+/*
+ * If set, the timestamps in events are expected to be continuous times.
+ * Otherwise, the timestamps are absolute times.  IOPs should observe this bit
+ * in order to log events that can be merged cleanly with other event streams.
+ */
+#define KDEBUG_ENABLE_CONT_TIME 0x20
+
+#define KDEBUG_TRACE (KDEBUG_ENABLE_TRACE)
+
+/*
+ * Specify KDEBUG_PPT to indicate that the event belongs to the limited PPT set.
+ * PPT is deprecated -- use a typefilter and the PPTDBG class instead.
+ */
+#define KDEBUG_PPT    (KDEBUG_ENABLE_PPT)
+#define KDEBUG_COMMON (KDEBUG_ENABLE_TRACE | KDEBUG_ENABLE_PPT)
+
+/*
+ * The kernel debug configuration level.  These values control which events are
+ * compiled in under different build configurations.
+ *
+ * Infer the supported kernel debug event level from config option.  Use
+ * (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) as a guard to protect unaudited debug
+ * code.
+ */
+#define KDEBUG_LEVEL_NONE     0
+#define KDEBUG_LEVEL_IST      1
+#define KDEBUG_LEVEL_STANDARD 2
+#define KDEBUG_LEVEL_FULL     3
+
+#if NO_KDEBUG
+#define KDEBUG_LEVEL KDEBUG_LEVEL_NONE
+#elif IST_KDEBUG
+#define KDEBUG_LEVEL KDEBUG_LEVEL_IST
+#elif KDEBUG
+#define KDEBUG_LEVEL KDEBUG_LEVEL_FULL
+#else
+#define KDEBUG_LEVEL KDEBUG_LEVEL_STANDARD
+/*
+ * Currently, all other kernel configurations (development, etc) build with
+ * KDEBUG_LEVEL_STANDARD.
+ */
+#endif
+
+/*
+ * Some Apple internal clients try to use the kernel macros in user space.
+ */
+#ifndef KERNEL_DEBUG
+#define KERNEL_DEBUG(...) do { } while (0)
+#endif /* !defined(KERNEL_DEBUG) */
+
+#pragma mark - private definitions
+
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__arm64__)
+typedef uint64_t kd_buf_argtype;
+#else
+typedef uintptr_t kd_buf_argtype;
+#endif
+
+typedef struct {
+       uint64_t timestamp;
+       kd_buf_argtype arg1;
+       kd_buf_argtype arg2;
+       kd_buf_argtype arg3;
+       kd_buf_argtype arg4;
+       kd_buf_argtype arg5; /* the thread ID */
+       uint32_t debugid;
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__LP64__) || defined(__arm64__)
+       uint32_t cpuid;
+       kd_buf_argtype unused;
+#endif
+} kd_buf;
+
+#if defined(__LP64__) || defined(__arm64__)
+#define KDBG_TIMESTAMP_MASK             0xffffffffffffffffULL
+static inline void
+kdbg_set_cpu(kd_buf *kp, int cpu)
+{
+       kp->cpuid = (unsigned int)cpu;
+}
+static inline int
+kdbg_get_cpu(kd_buf *kp)
+{
+       return (int)kp->cpuid;
+}
+static inline void
+kdbg_set_timestamp(kd_buf *kp, uint64_t thetime)
+{
+       kp->timestamp = thetime;
+}
+static inline uint64_t
+kdbg_get_timestamp(kd_buf *kp)
+{
+       return kp->timestamp;
+}
+static inline void
+kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu)
+{
+       kdbg_set_timestamp(kp, thetime);
+       kdbg_set_cpu(kp, cpu);
+}
+#else
+#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL
+#define KDBG_CPU_MASK       0xff00000000000000ULL
+#define KDBG_CPU_SHIFT      56
+static inline void
+kdbg_set_cpu(kd_buf *kp, int cpu)
+{
+       kp->timestamp = (kp->timestamp & KDBG_TIMESTAMP_MASK) |
+           (((uint64_t) cpu) << KDBG_CPU_SHIFT);
+}
+static inline int
+kdbg_get_cpu(kd_buf *kp)
+{
+       return (int) (((kp)->timestamp & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT);
+}
+static inline void
+kdbg_set_timestamp(kd_buf *kp, uint64_t thetime)
+{
+       kp->timestamp = thetime & KDBG_TIMESTAMP_MASK;
+}
+static inline uint64_t
+kdbg_get_timestamp(kd_buf *kp)
+{
+       return kp->timestamp & KDBG_TIMESTAMP_MASK;
+}
+static inline void
+kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu)
+{
+       kp->timestamp = (thetime & KDBG_TIMESTAMP_MASK) |
+           (((uint64_t) cpu) << KDBG_CPU_SHIFT);
+}
+#endif
+
+/*
+ * 2^16 bits (8 kilobytes), one for each possible class/subclass combination
+ */
+#define KDBG_TYPEFILTER_BITMAP_SIZE ((256 * 256) / 8)
+
+/*
+ * Bits for kd_ctrl_page.flags, KERN_KD{D,E}FLAGS.
+ */
+#define KDBG_INIT       (1U << 0) /* obsolete */
+/* disable tracing when buffers are full */
+#define KDBG_NOWRAP     (1U << 1)
+#define KDBG_FREERUN    (1U << 2) /* obsolete */
+/* buffer has wrapped */
+#define KDBG_WRAPPED    (1U << 3)
+/* flags that are allowed to be set by user space */
+#define KDBG_USERFLAGS  (KDBG_FREERUN | KDBG_NOWRAP | KDBG_INIT)
+/* only include processes with kdebug bit set in proc */
+#define KDBG_PIDCHECK   (1U << 4)
+/* thread map is initialized */
+#define KDBG_MAPINIT    (1U << 5)
+/* exclude processes based on kdebug bit in proc */
+#define KDBG_PIDEXCLUDE (1U << 6)
+/* whether the kdebug locks are initialized */
+#define KDBG_LOCKINIT   (1U << 7)
+/* word size of the kernel */
+#define KDBG_LP64       (1U << 8)
+
+/* bits for kd_ctrl_page.flags and kbufinfo_t.flags */
+
+/* only trace events within a range */
+#define KDBG_RANGECHECK       0x00100000U
+/* only trace at most 4 types of events, at the code granularity */
+#define KDBG_VALCHECK         0x00200000U
+/* check class and subclass against the typefilter */
+#define KDBG_TYPEFILTER_CHECK 0x00400000U
+/* kdebug trace buffers are initialized */
+#define KDBG_BUFINIT          0x80000000U
+
+/* bits for the type field of kd_regtype */
+#define KDBG_CLASSTYPE  0x10000
+#define KDBG_SUBCLSTYPE 0x20000
+#define KDBG_RANGETYPE  0x40000
+#define KDBG_TYPENONE   0x80000
+#define KDBG_CKTYPES    0xF0000
+
+typedef struct {
+       unsigned int type;
+       unsigned int value1;
+       unsigned int value2;
+       unsigned int value3;
+       unsigned int value4;
+} kd_regtype;
+
+typedef struct {
+       /* number of events that can fit in the buffers */
+       int nkdbufs;
+       /* set if trace is disabled */
+       int nolog;
+       /* kd_ctrl_page.flags */
+       unsigned int flags;
+       /* number of threads in thread map */
+       int nkdthreads;
+       /* the owning pid */
+       int bufid;
+} kbufinfo_t;
+
+typedef struct {
+       /* the thread ID */
+#if defined(__arm64__)
+       uint64_t thread;
+#else
+       uintptr_t thread;
+#endif
+       /* 0 for invalid, otherwise the PID (or 1 for kernel_task) */
+       int valid;
+       /* the name of the process owning the thread */
+       char command[20];
+} kd_threadmap;
+
+typedef struct {
+       uint32_t version_no;
+       uint32_t cpu_count;
+} kd_cpumap_header;
+
+/* cpumap flags */
+#define KDBG_CPUMAP_IS_IOP      0x1
+
+typedef struct {
+       uint32_t cpu_id;
+       uint32_t flags;
+       char name[8];
+} kd_cpumap;
+
+typedef struct {
+       int             version_no;
+       int             thread_count;
+       uint64_t        TOD_secs;
+       uint32_t        TOD_usecs;
+} RAW_header;
+
+#define RAW_VERSION0    0x55aa0000
+#define RAW_VERSION1    0x55aa0101
+#define RAW_VERSION2    0x55aa0200 /* Only used by kperf and Instruments */
+
+/*
+ * Bits set in the comm page for kdebug.
+ */
+#define KDEBUG_COMMPAGE_ENABLE_TRACE      0x1
+#define KDEBUG_COMMPAGE_ENABLE_TYPEFILTER 0x2 /* Forced to false if ENABLE_TRACE is 0 */
+
+#pragma mark - EnergyTracing
+
+/* for EnergyTracing user space & clients */
+#define kEnTrCompKernel     2
+
+/*
+ *   EnergyTracing opcodes
+ *
+ *   Activations use DBG_FUNC_START/END.
+ *   Events are DBG_FUNC_NONE.
+ */
+
+/* Socket reads and writes are uniquely identified by the (sanitized)
+ *  pointer to the socket struct in question.  To associate this address
+ *  with the user space file descriptor, we have a socket activation with
+ *  the FD as its identifier and the socket struct pointer as its value.
+ */
+#define kEnTrActKernSocket      1
+#define kEnTrActKernSockRead    2
+#define kEnTrActKernSockWrite   3
+
+#define kEnTrActKernPoll        10
+#define kEnTrActKernSelect      11
+#define kEnTrActKernKQWait      12
+
+// events
+#define kEnTrEvUnblocked        256
+
+// EnergyTracing flags (the low-order 16 bits of 'quality')
+#define kEnTrFlagNonBlocking    (1 << 0)
+#define kEnTrFlagNoWork         (1 << 1)
+
+/*
+ * EnergyTracing macros.
+ */
+
+#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
+// whether to bother calculating EnergyTracing inputs
+// could change in future to see if DBG_ENERGYTRACE is active
+#define ENTR_SHOULDTRACE kdebug_enable
+// encode logical EnergyTracing into 32/64 KDebug trace
+#define ENTR_KDTRACE(component, opcode, lifespan, id, quality, value)   \
+do {                                                                    \
+    uint32_t kdcode__;                                                  \
+    uintptr_t highval__, lowval__, mask__ = 0xffffffff;                 \
+    kdcode__ = KDBG_CODE(DBG_ENERGYTRACE,component,opcode)|(lifespan);  \
+    highval__ = ((value) >> 32) & mask__;                               \
+    lowval__ = (value) & mask__;                                        \
+    ENTR_KDTRACEFUNC(kdcode__, id, quality, highval__, lowval__);       \
+} while(0)
+
+/*
+ *   Trace the association of two existing activations.
+ *
+ *   An association is traced as a modification to the parent activation.
+ *   In order to fit the sub-activation's component, activation code, and
+ *   activation ID into a kdebug tracepoint, the arguments that would hold
+ *   the value are left separate, and one stores the component and opcode
+ *   of the sub-activation, while the other stores the pointer-sized
+ *   activation ID.
+ *
+ *           arg2                   arg3               arg4
+ +-----------------+  +~+----+----+--------+   +----------+
+ |kEnTrModAssociate|  | |    |    |        |   |          |
+ +-----------------+  +~+----+----+--------+   +----------+
+ *                           8-bits unused       sub-activation ID
+ *                                8-bit sub-component
+ *                                     16-bit sub-opcode
+ *
+ */
+#define kEnTrModAssociate (1 << 28)
+#define ENTR_KDASSOCIATE(par_comp, par_opcode, par_act_id,              \
+           sub_comp, sub_opcode, sub_act_id)              \
+do {                                                                    \
+    unsigned sub_compcode = ((unsigned)sub_comp << 16) | sub_opcode;    \
+    ENTR_KDTRACEFUNC(KDBG_CODE(DBG_ENERGYTRACE,par_comp,par_opcode),    \
+                    par_act_id, kEnTrModAssociate, sub_compcode,       \
+                    sub_act_id);                                       \
+} while(0)
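(Continuing the sketch: associating that socket-read sub-activation with
its parent socket activation, which the earlier comment identifies by
file descriptor.  `fd' and `so' remain placeholders.)

    ENTR_KDASSOCIATE(kEnTrCompKernel, kEnTrActKernSocket, fd,
        kEnTrCompKernel, kEnTrActKernSockRead, VM_KERNEL_ADDRPERM(so));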
+
+#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
+
+#define ENTR_SHOULDTRACE FALSE
+#define ENTR_KDTRACE(component, opcode, lifespan, id, quality, value)   \
+                                   do {} while (0)
+#define ENTR_KDASSOCIATE(par_comp, par_opcode, par_act_id,              \
+           sub_comp, sub_opcode, sub_act_id)              \
+                                   do {} while (0)
+
+#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
+
+#endif /* !defined(DRIVERKIT) */
+
+__END_DECLS
+
+#endif /* !defined(BSD_KDEBUG_PRIVATE_H) */
index a53bbdc194c699d1c6cd815052f530f7da8e1ead..3a3622fc781d3d2a3ec20470a4869891537fc858 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -51,13 +51,27 @@ extern kern_return_t arm64_lock_test(void);
 #endif
 kern_return_t kalloc_test(void);
 kern_return_t ipi_test(void);
+#if __ARM_PAN_AVAILABLE__
+extern kern_return_t arm64_late_pan_test(void);
+#endif
+#if HAS_TWO_STAGE_SPR_LOCK
+extern kern_return_t arm64_spr_lock_test(void);
+#endif
+extern kern_return_t copyio_test(void);
 
 struct xnupost_test bsd_post_tests[] = {
 #ifdef __arm64__
        XNUPOST_TEST_CONFIG_BASIC(arm64_lock_test),
+#endif
+#if __ARM_PAN_AVAILABLE__
+       XNUPOST_TEST_CONFIG_BASIC(arm64_late_pan_test),
 #endif
        XNUPOST_TEST_CONFIG_BASIC(kalloc_test),
-       XNUPOST_TEST_CONFIG_BASIC(ipi_test)
+       XNUPOST_TEST_CONFIG_BASIC(ipi_test),
+#if HAS_TWO_STAGE_SPR_LOCK
+       XNUPOST_TEST_CONFIG_BASIC(arm64_spr_lock_test),
+#endif
+       XNUPOST_TEST_CONFIG_BASIC(copyio_test),
 };
 
 uint32_t bsd_post_tests_count = sizeof(bsd_post_tests) / sizeof(xnupost_test_data_t);
@@ -130,17 +144,17 @@ kalloc_test()
 #define XNUPOST_TNAME_MAXLEN 132
 
 struct kcdata_subtype_descriptor kc_xnupost_test_def[] = {
-       {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT16, 0, sizeof(uint16_t), "config"},
-       {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT16, 1 * sizeof(uint16_t), sizeof(uint16_t), "test_num"},
-       {KCS_SUBTYPE_FLAGS_NONE, KC_ST_INT32, 2 * sizeof(uint16_t), sizeof(int32_t), "retval"},
-       {KCS_SUBTYPE_FLAGS_NONE, KC_ST_INT32, 2 * sizeof(uint16_t) + sizeof(int32_t), sizeof(int32_t), "expected_retval"},
-       {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 2 * (sizeof(uint16_t) + sizeof(int32_t)), sizeof(uint64_t), "begin_time"},
-       {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 2 * (sizeof(uint16_t) + sizeof(int32_t)) + sizeof(uint64_t), sizeof(uint64_t), "end_time"},
-       {KCS_SUBTYPE_FLAGS_ARRAY,
-        KC_ST_CHAR,
-        2 * (sizeof(uint16_t) + sizeof(int32_t) + sizeof(uint64_t)),
-        KCS_SUBTYPE_PACK_SIZE(XNUPOST_TNAME_MAXLEN * sizeof(char), sizeof(char)),
-        "test_name"}
+       {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_UINT16, .kcs_elem_offset = 0, .kcs_elem_size = sizeof(uint16_t), .kcs_name = "config"},
+       {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_UINT16, .kcs_elem_offset = 1 * sizeof(uint16_t), .kcs_elem_size = sizeof(uint16_t), .kcs_name = "test_num"},
+       {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_INT32, .kcs_elem_offset = 2 * sizeof(uint16_t), .kcs_elem_size = sizeof(int32_t), .kcs_name = "retval"},
+       {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_INT32, .kcs_elem_offset = 2 * sizeof(uint16_t) + sizeof(int32_t), .kcs_elem_size = sizeof(int32_t), .kcs_name = "expected_retval"},
+       {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_UINT64, .kcs_elem_offset = 2 * (sizeof(uint16_t) + sizeof(int32_t)), .kcs_elem_size = sizeof(uint64_t), .kcs_name = "begin_time"},
+       {.kcs_flags = KCS_SUBTYPE_FLAGS_NONE, .kcs_elem_type = KC_ST_UINT64, .kcs_elem_offset = 2 * (sizeof(uint16_t) + sizeof(int32_t)) + sizeof(uint64_t), .kcs_elem_size = sizeof(uint64_t), .kcs_name = "end_time"},
+       {.kcs_flags = KCS_SUBTYPE_FLAGS_ARRAY,
+        .kcs_elem_type = KC_ST_CHAR,
+        .kcs_elem_offset = 2 * (sizeof(uint16_t) + sizeof(int32_t) + sizeof(uint64_t)),
+        .kcs_elem_size = KCS_SUBTYPE_PACK_SIZE(XNUPOST_TNAME_MAXLEN * sizeof(char), sizeof(char)),
+        .kcs_name = "test_name"}
 };
 
 const uint32_t kc_xnupost_test_def_count = sizeof(kc_xnupost_test_def) / sizeof(struct kcdata_subtype_descriptor);
diff --git a/bsd/tests/copyio_tests.c b/bsd/tests/copyio_tests.c
new file mode 100644 (file)
index 0000000..f3594be
--- /dev/null
@@ -0,0 +1,561 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/kalloc.h>
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <libkern/libkern.h>
+#include <mach/mach_vm.h>
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_protos.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/proc_internal.h>
+#include <sys/vm.h>
+#include <tests/ktest.h>
+
+kern_return_t copyio_test(void);
+
+#define copyio_test_buf_size (PAGE_SIZE * 16)
+static const char copyio_test_string[] = {'T', 'e', 's', 't', ' ', 'S', 't', 'r', 'i', 'n', 'g', '!', '\0', 'A', 'B', 'C'};
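(Editor's note: the bytes after the embedded '\0' are deliberate; several
tests below compare sizeof(copyio_test_string) bytes to verify that the
string routines stop at the NUL terminator instead of copying the whole
array.)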
+
+struct copyio_test_data {
+       /* VM map of the current userspace process. */
+       vm_map_t user_map;
+       /* The start of a `copyio_test_buf_size'-sized region mapped into userspace. */
+       mach_vm_offset_t user_addr;
+       /* The start of a page-sized region that is guaranteed to be unmapped in userspace. */
+       mach_vm_offset_t unmapped_addr;
+       /* The start of a page-sized region mapped at the largest possible userspace address. */
+       mach_vm_offset_t user_lastpage_addr;
+       /* Kernel mapping of the physical pages mapped at `user_addr'. */
+       void *kern_addr;
+
+       /* Scratch buffers of size `copyio_test_buf_size'. */
+       char *buf1, *buf2;
+       /* Scratch data to pass to helper threads */
+       union {
+               void *thread_ptr;
+               uint64_t thread_data;
+       };
+};
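(Editor's summary of the fixture built by copyio_test() at the bottom of
the file: [user_addr, user_addr + copyio_test_buf_size) is mapped
read/write in the user map and doubly mapped in the kernel at kern_addr;
unmapped_addr is the page immediately after that range, deliberately
deallocated; user_lastpage_addr is the final page of the user map, so any
multi-page access starting there runs off the end of the address space.)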
+
+typedef int (*copyio_thread_fn_t)(struct copyio_test_data *);
+
+struct copyio_test_thread_data {
+       copyio_thread_fn_t fn;
+       struct copyio_test_data *data;
+       int ret;
+       semaphore_t done;
+};
+
+static void
+copyio_thread_call_fn(void *arg, wait_result_t __unused res)
+{
+       struct copyio_test_thread_data *tdata = arg;
+       tdata->ret = tdata->fn(tdata->data);
+       semaphore_signal(tdata->done);
+}
+
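(Editor's gloss: the helper below runs `fn' on a fresh kernel_task thread
and blocks on a semaphore until it completes.  A kernel_task thread has no
user address space, which is what the `..._from_kernel'/`..._to_kernel'
cases rely on when they probe copyio against kernel-only addresses.)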
+static int
+copyio_test_run_in_thread(copyio_thread_fn_t fn, struct copyio_test_data *data)
+{
+       struct copyio_test_thread_data tdata = {
+               .fn = fn,
+               .data = data,
+       };
+       thread_t thread;
+
+       semaphore_create(current_task(), &tdata.done, SYNC_POLICY_FIFO, 0);
+       kernel_thread_start(copyio_thread_call_fn, &tdata, &thread);
+
+       semaphore_wait(tdata.done);
+
+       thread_deallocate(thread);
+       semaphore_destroy(current_task(), tdata.done);
+
+       return tdata.ret;
+}
+
+static void
+copyio_test_protect(struct copyio_test_data *data, vm_prot_t prot)
+{
+       kern_return_t ret = mach_vm_protect(data->user_map, data->user_addr, copyio_test_buf_size, false, prot);
+       assert(ret == KERN_SUCCESS);
+}
+
+static int
+copyin_from_kernel(struct copyio_test_data *data)
+{
+       char *in_buf = data->buf2;
+       return copyin((uintptr_t)data->kern_addr, in_buf, copyio_test_buf_size);
+}
+
+static void
+copyin_test(struct copyio_test_data *data)
+{
+       char *out_buf = data->buf1;
+       char *in_buf = data->buf2;
+
+       for (size_t i = 0; i < copyio_test_buf_size; i++) {
+               out_buf[i] = (char)i;
+       }
+       memcpy(data->kern_addr, out_buf, copyio_test_buf_size);
+
+       int err = copyin(data->user_addr, in_buf, copyio_test_buf_size);
+       T_EXPECT_EQ_INT(err, 0, "copyin() with valid parameters should succeed");
+       int cmp = memcmp(out_buf, in_buf, copyio_test_buf_size);
+       T_EXPECT_EQ_INT(cmp, 0, "copyin() should correctly copy in data");
+
+       err = copyin(data->unmapped_addr, NULL, 0);
+       T_EXPECT_EQ_INT(err, 0, "copyin() with 0 size should always succeed");
+
+       err = copyin(data->unmapped_addr, in_buf, copyio_test_buf_size);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyin() from unmapped userspace address should return EFAULT");
+       err = copyin(data->unmapped_addr - PAGE_SIZE, in_buf, PAGE_SIZE * 2);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyin() from partially valid userspace range should return EFAULT");
+       err = copyin(data->user_lastpage_addr, in_buf, PAGE_SIZE * 2);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyin() past end of userspace address space should return EFAULT");
+
+       bzero(in_buf, copyio_test_buf_size);
+       err = copyio_test_run_in_thread(copyin_from_kernel, data);
+       T_EXPECT_EQ_INT(err, 0, "copyin() from kernel address in kernel_task thread should succeed");
+       cmp = memcmp(data->kern_addr, in_buf, copyio_test_buf_size);
+       T_EXPECT_EQ_INT(cmp, 0, "copyin() from kernel address should correctly copy in data");
+       err = copyin_from_kernel(data);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyin() from kernel address in other threads should return EFAULT");
+
+       copyio_test_protect(data, VM_PROT_WRITE);
+       err = copyin(data->user_addr, in_buf, copyio_test_buf_size);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyin() from write-only address should return EFAULT");
+       copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE);
+}
+
+static int
+copyout_to_kernel(struct copyio_test_data *data)
+{
+       char *out_buf = data->buf1;
+       return copyout(out_buf, (uintptr_t)data->kern_addr, copyio_test_buf_size);
+}
+
+static void
+copyout_test(struct copyio_test_data *data)
+{
+       char *out_buf = data->buf1;
+
+       bzero(data->kern_addr, copyio_test_buf_size);
+
+       for (size_t i = 0; i < copyio_test_buf_size; i++) {
+               out_buf[i] = ~(char)i;
+       }
+       int err = copyout(out_buf, data->user_addr, copyio_test_buf_size);
+       T_EXPECT_EQ_INT(err, 0, "copyout() with valid parameters should succeed");
+
+       int cmp = memcmp(data->kern_addr, out_buf, copyio_test_buf_size);
+       T_EXPECT_EQ_INT(cmp, 0, "copyout() should correctly copy out data");
+
+       err = copyout(NULL, data->unmapped_addr, 0);
+       T_EXPECT_EQ_INT(err, 0, "copyout() with 0 size should always succeed");
+
+       err = copyout(out_buf, data->unmapped_addr, copyio_test_buf_size);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyout() to unmapped userspace address should return EFAULT");
+       err = copyout(out_buf, data->unmapped_addr - PAGE_SIZE, PAGE_SIZE * 2);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyout() to partially valid userspace range should return EFAULT");
+       err = copyout(out_buf, data->user_lastpage_addr, PAGE_SIZE * 2);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyout() past end of userspace address space should return EFAULT");
+
+       bzero(data->kern_addr, copyio_test_buf_size);
+
+       err = copyio_test_run_in_thread(copyout_to_kernel, data);
+       T_EXPECT_EQ_INT(err, 0, "copyout() to kernel address in kernel_task thread should succeed");
+       cmp = memcmp(out_buf, data->kern_addr, copyio_test_buf_size);
+       T_EXPECT_EQ_INT(cmp, 0, "copyout() to kernel address should correctly copy out data");
+       err = copyout_to_kernel(data);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyout() to kernel address in other threads should return EFAULT");
+
+       copyio_test_protect(data, VM_PROT_READ);
+       err = copyout(out_buf, data->user_addr, copyio_test_buf_size);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyout() to read-only address should return EFAULT");
+       copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE);
+}
+
+static int
+copyinstr_from_kernel(struct copyio_test_data *data)
+{
+       char *in_buf = data->buf1;
+       size_t *lencopied = data->thread_ptr;
+       return copyinstr((user_addr_t)data->kern_addr, in_buf, copyio_test_buf_size, lencopied);
+}
+
+static void
+copyinstr_test(struct copyio_test_data *data)
+{
+       char *in_buf = data->buf1;
+
+       memcpy(data->kern_addr, copyio_test_string, sizeof(copyio_test_string));
+
+       bzero(in_buf, copyio_test_buf_size);
+       size_t lencopied;
+       int err = copyinstr(data->user_addr, in_buf, copyio_test_buf_size, &lencopied);
+       T_EXPECT_EQ_INT(err, 0, "copyinstr() with valid parameters should succeed");
+       T_EXPECT_EQ_ULONG(lencopied, strlen(copyio_test_string) + 1, "copyinstr() with a large enough buffer should read entire string");
+
+       int cmp = strncmp(in_buf, copyio_test_string, lencopied);
+       T_EXPECT_EQ_INT(cmp, 0, "copyinstr() should correctly copy string up to NULL terminator");
+       cmp = memcmp(in_buf, copyio_test_string, sizeof(copyio_test_string));
+       T_EXPECT_NE_INT(cmp, 0, "copyinstr() should not read past NULL terminator");
+
+       bzero(in_buf, copyio_test_buf_size);
+       const vm_size_t trunc_size = strlen(copyio_test_string) - 4;
+       err = copyinstr(data->user_addr, in_buf, trunc_size, &lencopied);
+       T_EXPECT_EQ_INT(err, ENAMETOOLONG, "truncated copyinstr() should return ENAMETOOLONG");
+       T_EXPECT_EQ_ULONG(lencopied, trunc_size, "truncated copyinstr() should copy exactly `maxlen' bytes");
+       cmp = memcmp(in_buf, copyio_test_string, trunc_size);
+       T_EXPECT_EQ_INT(cmp, 0, "copyinstr() should correctly copy in truncated string");
+       cmp = memcmp(in_buf, copyio_test_string, strlen(copyio_test_string));
+       T_EXPECT_NE_INT(cmp, 0, "copyinstr() should stop copying at `maxlen' bytes");
+
+       err = copyinstr(data->unmapped_addr, in_buf, copyio_test_buf_size, &lencopied);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() from unmapped userspace address should return EFAULT");
+       err = copyinstr(data->user_lastpage_addr, in_buf, PAGE_SIZE * 2, &lencopied);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() past end of userspace address space should return EFAULT");
+
+       bzero(in_buf, copyio_test_buf_size);
+       data->thread_ptr = &lencopied;
+
+       err = copyio_test_run_in_thread(copyinstr_from_kernel, data);
+#if defined(CONFIG_EMBEDDED)
+       T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() from kernel address in kernel_task thread should return EFAULT");
+#else
+       T_EXPECT_EQ_INT(err, 0, "copyinstr() from kernel address in kernel_task thread should succeed");
+       T_EXPECT_EQ_ULONG(lencopied, strlen(copyio_test_string) + 1, "copyinstr() from kernel address should read entire string");
+       cmp = strncmp(in_buf, copyio_test_string, lencopied);
+       T_EXPECT_EQ_INT(cmp, 0, "copyinstr() from kernel address should correctly copy string up to NULL terminator");
+       cmp = memcmp(in_buf, copyio_test_string, sizeof(copyio_test_string));
+       T_EXPECT_NE_INT(cmp, 0, "copyinstr() from kernel address should not read past NULL terminator");
+#endif
+       err = copyinstr_from_kernel(data);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() from kernel address in other threads should return EFAULT");
+
+       copyio_test_protect(data, VM_PROT_WRITE);
+       err = copyinstr(data->user_addr, in_buf, copyio_test_buf_size, &lencopied);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() from write-only address should return EFAULT");
+       copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE);
+
+       /* Place an unterminated string at the end of the mapped region */
+       const size_t unterminated_size = 16;
+       char *kern_unterminated_addr = (char *)data->kern_addr + copyio_test_buf_size - unterminated_size;
+       memset(kern_unterminated_addr, 'A', unterminated_size);
+
+       mach_vm_offset_t user_unterminated_addr = data->user_addr + copyio_test_buf_size - unterminated_size;
+       err = copyinstr(user_unterminated_addr, in_buf, copyio_test_buf_size, &lencopied);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyinstr() from userspace region without NULL terminator should return EFAULT");
+}
+
+static int
+copyoutstr_to_kernel(struct copyio_test_data *data)
+{
+       size_t *lencopied = data->thread_ptr;
+       return copyoutstr(copyio_test_string, (user_addr_t)data->kern_addr, sizeof(copyio_test_string), lencopied);
+}
+
+static void
+copyoutstr_test(struct copyio_test_data *data)
+{
+       bzero(data->kern_addr, sizeof(copyio_test_string));
+
+       size_t lencopied;
+       int err = copyoutstr(copyio_test_string, data->user_addr, sizeof(copyio_test_string), &lencopied);
+       T_EXPECT_EQ_INT(err, 0, "copyoutstr() with valid parameters should succeed");
+       T_EXPECT_EQ_ULONG(lencopied, strlen(copyio_test_string) + 1, "copyoutstr() should copy string up to NULL terminator");
+
+       int cmp = strncmp(data->kern_addr, copyio_test_string, sizeof(copyio_test_string));
+       T_EXPECT_EQ_INT(cmp, 0, "copyoutstr() should correctly copy out string");
+       cmp = memcmp(data->kern_addr, copyio_test_string, sizeof(copyio_test_string));
+       T_EXPECT_NE_INT(cmp, 0, "copyoutstr() should stop copying at NULL terminator");
+
+       bzero(data->kern_addr, sizeof(copyio_test_string));
+
+       const vm_size_t trunc_size = strlen(copyio_test_string) - 4;
+       err = copyoutstr(copyio_test_string, data->user_addr, trunc_size, &lencopied);
+       T_EXPECT_EQ_INT(err, ENAMETOOLONG, "truncated copyoutstr() should return ENAMETOOLONG");
+       T_EXPECT_EQ_ULONG(lencopied, trunc_size, "truncated copyoutstr() should copy exactly `maxlen' bytes");
+       cmp = strncmp(data->kern_addr, copyio_test_string, trunc_size);
+       T_EXPECT_EQ_INT(cmp, 0, "copyoutstr() should correctly copy out truncated string");
+       cmp = memcmp(data->kern_addr, copyio_test_string, sizeof(copyio_test_string));
+       T_EXPECT_NE_INT(cmp, 0, "copyoutstr() should stop copying at `maxlen' bytes");
+
+       err = copyoutstr(copyio_test_string, data->unmapped_addr, strlen(copyio_test_string), &lencopied);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() to unmapped userspace address should return EFAULT");
+       err = copyoutstr(copyio_test_string, data->unmapped_addr - 1, strlen(copyio_test_string), &lencopied);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() to partially valid userspace range should return EFAULT");
+       err = copyoutstr(copyio_test_string, data->user_lastpage_addr + PAGE_SIZE - 1, strlen(copyio_test_string), &lencopied);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() past end of userspace address space should return EFAULT");
+
+       bzero(data->kern_addr, sizeof(copyio_test_string));
+       data->thread_ptr = &lencopied;
+
+       err = copyio_test_run_in_thread(copyoutstr_to_kernel, data);
+#if defined(CONFIG_EMBEDDED)
+       T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() to kernel address in kernel_task thread should return EFAULT");
+#else
+       T_EXPECT_EQ_INT(err, 0, "copyoutstr() to kernel address in kernel_task thread should succeed");
+       T_EXPECT_EQ_ULONG(lencopied, strlen(copyio_test_string) + 1, "copyoutstr() to kernel address should copy string up to NULL terminator");
+       cmp = strncmp(data->kern_addr, copyio_test_string, sizeof(copyio_test_string));
+       T_EXPECT_EQ_INT(cmp, 0, "copyoutstr() to kernel address should correctly copy out data");
+#endif
+       err = copyoutstr_to_kernel(data);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() to kernel address in other threads should return EFAULT");
+
+       copyio_test_protect(data, VM_PROT_READ);
+       err = copyoutstr(copyio_test_string, data->user_addr, strlen(copyio_test_string), &lencopied);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyoutstr() to read-only address should return EFAULT");
+       copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE);
+}
+
+static int
+copyin_atomic32_from_kernel(struct copyio_test_data *data)
+{
+       return copyin_atomic32((uintptr_t)data->kern_addr, data->thread_ptr);
+}
+
+static int
+copyin_atomic64_from_kernel(struct copyio_test_data *data)
+{
+       return copyin_atomic64((uintptr_t)data->kern_addr, data->thread_ptr);
+}
+
+static int
+copyout_atomic32_to_kernel(struct copyio_test_data *data)
+{
+       return copyout_atomic32(data->thread_data, (user_addr_t)data->kern_addr);
+}
+
+static int
+copyout_atomic64_to_kernel(struct copyio_test_data *data)
+{
+       return copyout_atomic64(data->thread_data, (user_addr_t)data->kern_addr);
+}
+
+/**
+ * Note: we can't test atomic copyio calls which go past the end of the
+ * userspace address space, since there's no way to provide a range
+ * that straddles the userspace address boundary while being suitably
+ * aligned for the copy.
+ */
+#define copyin_atomic_test(data, word_t, copyin_fn, copyin_from_kernel_fn)                                              \
+       do {                                                                                                            \
+               const word_t word_out = (word_t)0x123456789ABCDEF0UL;                                                   \
+               word_t word_in = 0;                                                                                     \
+               memcpy(data->kern_addr, &word_out, sizeof(word_out));                                                   \
+                                                                                                                        \
+               int err = copyin_fn(data->user_addr, &word_in);                                                         \
+               T_EXPECT_EQ_INT(err, 0, #copyin_fn "() with valid parameters should succeed");                          \
+                                                                                                                        \
+               int cmp = memcmp(&word_in, &word_out, sizeof(word_t));                                                  \
+               T_EXPECT_EQ_INT(cmp, 0, #copyin_fn "() should correctly copy word");                                    \
+                                                                                                                        \
+               for (unsigned int offset = 1; offset < sizeof(word_t); offset++) {                                      \
+                       err = copyin_fn(data->user_addr + offset, &word_in);                                            \
+                       T_EXPECT_EQ_INT(err, EINVAL,                                                                    \
+                           #copyin_fn "() from unaligned userspace address should return EINVAL (offset = %u)",        \
+                           offset);                                                                                    \
+               };                                                                                                      \
+               err = copyin_fn(data->unmapped_addr, &word_in);                                                         \
+               T_EXPECT_EQ_INT(err, EFAULT, #copyin_fn "() from unmapped userspace address should return EFAULT");     \
+                                                                                                                        \
+               data->thread_ptr = &word_in;                                                                            \
+                                                                                                                        \
+               err = copyio_test_run_in_thread(copyin_from_kernel_fn, data);                                           \
+               T_EXPECT_EQ_INT(err, EFAULT,                                                                            \
+                   #copyin_fn "() from kernel address in kernel_task threads should return EFAULT");                   \
+               err = copyin_from_kernel_fn(data);                                                                      \
+               T_EXPECT_EQ_INT(err, EFAULT,                                                                            \
+                   #copyin_fn "() from kernel address in other threads should return EFAULT");                         \
+                                                                                                                        \
+               copyio_test_protect(data, VM_PROT_WRITE);                                                               \
+               err = copyin_fn(data->user_addr, &word_in);                                                             \
+               T_EXPECT_EQ_INT(err, EFAULT, #copyin_fn "() from write-only address should return EFAULT");             \
+               copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE);                                                \
+       } while (0)
+
+#define copyout_atomic_test(data, word_t, copyout_fn, copyout_to_kernel_fn)                                             \
+       do {                                                                                                            \
+               const word_t word_out = (word_t)0x123456789ABCDEF0UL;                                                   \
+               bzero(data->kern_addr, sizeof(word_t));                                                                 \
+                                                                                                                        \
+               int err = copyout_fn(word_out, data->user_addr);                                                        \
+               T_EXPECT_EQ_INT(err, 0, #copyout_fn "() with valid parameters should succeed");                         \
+                                                                                                                        \
+               int cmp = memcmp(data->kern_addr, &word_out, sizeof(word_t));                                           \
+               T_EXPECT_EQ_INT(cmp, 0, #copyout_fn "() should correctly copy word");                                   \
+                                                                                                                        \
+               for (unsigned int offset = 1; offset < sizeof(word_t); offset++) {                                      \
+                       err = copyout_fn(word_out, data->user_addr + offset);                                           \
+                       T_EXPECT_EQ_INT(err, EINVAL,                                                                    \
+                           #copyout_fn "() to unaligned userspace address should return EINVAL (offset = %u)",         \
+                           offset);                                                                                    \
+               };                                                                                                      \
+               err = copyout_fn(word_out, data->unmapped_addr);                                                        \
+               T_EXPECT_EQ_INT(err, EFAULT, #copyout_fn "() to unmapped userspace address should return EFAULT");      \
+               err = copyout_fn(word_out, (uintptr_t)data->kern_addr);                                                 \
+               T_EXPECT_EQ_INT(err, EFAULT, #copyout_fn "() to kernel address should return EFAULT");                  \
+                                                                                                                        \
+               data->thread_data = word_out;                                                                           \
+                                                                                                                        \
+               err = copyio_test_run_in_thread(copyout_to_kernel_fn, data);                                            \
+               T_EXPECT_EQ_INT(err, EFAULT,                                                                            \
+                       #copyout_fn "() to kernel address in kernel_task thread should return EFAULT");                 \
+               err = copyout_to_kernel_fn(data);                                                                       \
+               T_EXPECT_EQ_INT(err, EFAULT, #copyout_fn "() to kernel address in other threads should return EFAULT"); \
+                                                                                                                        \
+               copyio_test_protect(data, VM_PROT_READ);                                                                \
+               err = copyout_fn(word_out, data->user_addr);                                                            \
+               T_EXPECT_EQ_INT(err, EFAULT, #copyout_fn "() to read-only address should return EFAULT");               \
+               copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE);                                                \
+       } while (0)
+
+#define copyio_atomic_test(data, size)                                                          \
+       do {                                                                                    \
+               copyin_atomic_test((data), uint ## size ## _t, copyin_atomic ## size,           \
+                   copyin_atomic ## size ## _from_kernel);                                     \
+               copyout_atomic_test((data), uint ## size ## _t, copyout_atomic ## size,         \
+                   copyout_atomic ## size ## _to_kernel);                                      \
+       } while (0)
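(To make the token pasting concrete: `copyio_atomic_test(&data, 32)', as
invoked in copyio_test() below, expands, modulo the do/while wrapper, to

    copyin_atomic_test(&data, uint32_t, copyin_atomic32,
        copyin_atomic32_from_kernel);
    copyout_atomic_test(&data, uint32_t, copyout_atomic32,
        copyout_atomic32_to_kernel);

and the 64-bit variant is produced the same way.)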
+
+static int
+copyin_atomic32_wait_if_equals_from_kernel(struct copyio_test_data *data)
+{
+       return copyin_atomic32_wait_if_equals((uintptr_t)data->kern_addr, data->thread_data);
+}
+
+static void
+copyin_atomic32_wait_if_equals_test(struct copyio_test_data *data)
+{
+       bzero(data->kern_addr, sizeof(uint32_t));
+       int err = copyin_atomic32_wait_if_equals(data->user_addr, 0);
+       T_EXPECT_EQ_INT(err, 0, "copyin_atomic32_wait_if_equals() should return 0 when the values are equal");
+       err = copyin_atomic32_wait_if_equals(data->user_addr, ~0U);
+       T_EXPECT_EQ_INT(err, ESTALE, "copyin_atomic32_wait_if_equals() should return ESTALE when the values differ");
+
+       for (unsigned int offset = 1; offset < sizeof(uint32_t); offset++) {
+               err = copyin_atomic32_wait_if_equals(data->user_addr + offset, 0);
+               T_EXPECT_EQ_INT(err, EINVAL,
+                   "copyin_atomic32_wait_if_equals() on unaligned userspace address should return EINVAL (offset = %u)",
+                   offset);
+       }
+       err = copyin_atomic32_wait_if_equals(data->unmapped_addr, 0);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyin_atomic32_wait_if_equals() on unmapped userspace address should return EFAULT");
+
+       data->thread_data = 0;
+
+       err = copyio_test_run_in_thread(copyin_atomic32_wait_if_equals_from_kernel, data);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyin_atomic32_wait_if_equals() from kernel address in kernel_task thread should return EFAULT");
+       err = copyin_atomic32_wait_if_equals_from_kernel(data);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyin_atomic32_wait_if_equals() from kernel address in other threads should return EFAULT");
+
+       copyio_test_protect(data, VM_PROT_WRITE);
+       err = copyin_atomic32_wait_if_equals(data->user_addr, 0);
+       T_EXPECT_EQ_INT(err, EFAULT, "copyin_atomic32_wait_if_equals() on write-only address should return EFAULT");
+       copyio_test_protect(data, VM_PROT_READ | VM_PROT_WRITE);
+}
+
+kern_return_t
+copyio_test(void)
+{
+       struct copyio_test_data data = {};
+       kern_return_t ret = KERN_SUCCESS;
+
+       data.buf1 = kalloc(copyio_test_buf_size);
+       data.buf2 = kalloc(copyio_test_buf_size);
+       if (!data.buf1 || !data.buf2) {
+               T_FAIL("failed to allocate scratch buffers");
+               ret = KERN_NO_SPACE;
+               goto err_kalloc;
+       }
+
+       /**
+        * This test needs to manipulate the current userspace process's
+        * address space.  This is okay to do at the specific point in time
+        * when bsd_do_post() runs: current_proc() points to the init process,
+        * which has been set up to the point of having a valid vm_map, but
+        * not to the point of actually execing yet.
+        */
+       proc_t proc = current_proc();
+       assert(proc->p_pid == 1);
+       data.user_map = get_task_map_reference(proc->task);
+
+       ret = mach_vm_allocate_kernel(data.user_map, &data.user_addr, copyio_test_buf_size + PAGE_SIZE, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_NONE);
+       if (ret) {
+               T_FAIL("mach_vm_allocate_kernel(user_addr) failed: %d", ret);
+               goto err_user_alloc;
+       }
+
+       data.user_lastpage_addr = get_map_max(data.user_map) - PAGE_SIZE;
+       ret = mach_vm_allocate_kernel(data.user_map, &data.user_lastpage_addr, PAGE_SIZE, VM_FLAGS_FIXED, VM_KERN_MEMORY_NONE);
+       if (ret) {
+               T_FAIL("mach_vm_allocate_kernel(user_lastpage_addr) failed: %d", ret);
+               goto err_user_lastpage_alloc;
+       }
+
+       data.unmapped_addr = data.user_addr + copyio_test_buf_size;
+       mach_vm_deallocate(data.user_map, data.unmapped_addr, PAGE_SIZE);
+
+       vm_prot_t cur_protection, max_protection;
+       mach_vm_offset_t kern_addr = 0;
+       ret = mach_vm_remap_kernel(kernel_map, &kern_addr, copyio_test_buf_size, VM_PROT_READ | VM_PROT_WRITE, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_NONE,
+           data.user_map, data.user_addr, false, &cur_protection, &max_protection, VM_INHERIT_NONE);
+       if (ret) {
+               T_FAIL("mach_vm_remap_kernel() failed: %d", ret);
+               goto err_kern_remap;
+       }
+       data.kern_addr = (void *)kern_addr;
+
+       copyin_test(&data);
+       copyout_test(&data);
+       copyinstr_test(&data);
+       copyoutstr_test(&data);
+       copyio_atomic_test(&data, 32);
+       copyio_atomic_test(&data, 64);
+       copyin_atomic32_wait_if_equals_test(&data);
+
+       mach_vm_deallocate(kernel_map, kern_addr, copyio_test_buf_size);
+err_kern_remap:
+       mach_vm_deallocate(data.user_map, data.user_lastpage_addr, PAGE_SIZE);
+err_user_lastpage_alloc:
+       mach_vm_deallocate(data.user_map, data.user_addr, copyio_test_buf_size);
+err_user_alloc:
+       vm_map_deallocate(data.user_map);
+err_kalloc:
+       kfree(data.buf2, copyio_test_buf_size);
+       kfree(data.buf1, copyio_test_buf_size);
+       return ret;
+}
index f94028df85d0703f761a22dde94cc768c3b26b3e..ad27ee5ed6f1ce5cd97767e34b8e4f378da9bd91 100644 (file)
@@ -30,6 +30,7 @@
 
 extern kern_return_t test_pmap_enter_disconnect(unsigned int);
 extern kern_return_t test_pmap_iommu_disconnect(void);
+extern kern_return_t test_pmap_extended(void);
 
 static int
 sysctl_test_pmap_enter_disconnect(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
@@ -62,3 +63,19 @@ sysctl_test_pmap_iommu_disconnect(__unused struct sysctl_oid *oidp, __unused voi
 SYSCTL_PROC(_kern, OID_AUTO, pmap_iommu_disconnect_test,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
     0, 0, sysctl_test_pmap_iommu_disconnect, "I", "");
+
+static int
+sysctl_test_pmap_extended(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       unsigned int run = 0;
+       int error, changed;
+       error = sysctl_io_number(req, 0, sizeof(run), &run, &changed);
+       if (error || !changed) {
+               return error;
+       }
+       return test_pmap_extended();
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, pmap_extended_test,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0, sysctl_test_pmap_extended, "I", "");
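(Presumably the new node is driven from user space like its siblings
above; a hedged sketch, DEVELOPMENT kernel assumed and error handling
omitted:)

    /* Writing a value through the sysctl runs test_pmap_extended(). */
    int run = 1;
    sysctlbyname("kern.pmap_extended_test", NULL, NULL, &run, sizeof(run));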
index a72dd4259bd29a842263c1e311ecb24af337e042..79a40a8173e3a5a5d7d5582cad63d12d8242ae60 100644 (file)
@@ -719,7 +719,13 @@ vfs_setattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx)
 {
        int error;
 
-       if (vfs_isrdonly(mp)) {
+       /*
+        * With a read-only system volume, we need to allow renaming the root
+        * volume even though it's read-only.  Don't return EROFS here if
+        * setattr changes only the volume name.
+        */
+       if (vfs_isrdonly(mp) &&
+           !((mp->mnt_flag & MNT_ROOTFS) && (vfa->f_active == VFSATTR_f_vol_name))) {
                return EROFS;
        }
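(A hedged sketch of the call shape the relaxed check now admits: renaming
the root volume on a read-only system volume by making f_vol_name the only
active attribute.  This assumes the usual mount.h helpers; `newname' and
`ctx' are placeholders.)

    struct vfs_attr va;
    VFSATTR_INIT(&va);
    VFSATTR_WANTED(&va, f_vol_name);
    va.f_vol_name = newname;
    error = vfs_setattr(mp, &va, ctx); /* no longer short-circuits to EROFS */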
 
@@ -868,7 +874,7 @@ vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t *handle)
        int     i, j;
        int(***opv_desc_vector_p)(void *);
        int(**opv_desc_vector)(void *);
-       struct vnodeopv_entry_desc      *opve_descp;
+       const struct vnodeopv_entry_desc        *opve_descp;
        int desccount;
        int descsize;
        PFI *descptr;
@@ -1541,6 +1547,19 @@ vnode_mountdevvp(vnode_t vp)
 }
 #endif
 
+boolean_t
+vnode_isonexternalstorage(vnode_t vp)
+{
+       if (vp && vp->v_mount &&
+           (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_PERIPHERAL_DRIVE)) {
+               return TRUE;
+       }
+       return FALSE;
+}
+
 mount_t
 vnode_mountedhere(vnode_t vp)
 {
@@ -2436,6 +2455,8 @@ vnode_getattr(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx)
                VATTR_SET_ACTIVE(vap, va_total_alloc);
        }
 
+       vap->va_vaflags &= ~VA_USEFSID;
+
        error = VNOP_GETATTR(vp, vap, ctx);
        if (error) {
                KAUTH_DEBUG("ERROR - returning %d", error);
@@ -2476,7 +2497,7 @@ vnode_getattr(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx)
                                                error = ENOMEM;
                                                goto out;
                                        }
-                                       bcopy(&fsec->fsec_acl, facl, KAUTH_ACL_COPYSIZE(&fsec->fsec_acl));
+                                       __nochk_bcopy(&fsec->fsec_acl, facl, KAUTH_ACL_COPYSIZE(&fsec->fsec_acl));
                                        VATTR_RETURN(vap, va_acl, facl);
                                }
                        }
@@ -2627,9 +2648,14 @@ vnode_getattr(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx)
        /*
         * The fsid can be obtained from the mountpoint directly.
         */
-       VATTR_RETURN(vap, va_fsid, vp->v_mount->mnt_vfsstat.f_fsid.val[0]);
+       if (VATTR_IS_ACTIVE(vap, va_fsid) &&
+           (!VATTR_IS_SUPPORTED(vap, va_fsid) ||
+           vap->va_vaflags & VA_REALFSID || !(vap->va_vaflags & VA_USEFSID))) {
+               VATTR_RETURN(vap, va_fsid, vp->v_mount->mnt_vfsstat.f_fsid.val[0]);
+       }
 
 out:
+       vap->va_vaflags &= ~VA_USEFSID;
 
        return error;
 }
@@ -3812,6 +3838,39 @@ VNOP_REVOKE(vnode_t vp, int flags, vfs_context_t ctx)
 }
 
 
+#if 0
+/*
+*#
+*# mmap_check - vp U U U
+*#
+*/
+struct vnop_mmap_check_args {
+       struct vnodeop_desc *a_desc;
+       vnode_t a_vp;
+       int a_flags;
+       vfs_context_t a_context;
+};
+#endif /* 0 */
+errno_t
+VNOP_MMAP_CHECK(vnode_t vp, int flags, vfs_context_t ctx)
+{
+       int _err;
+       struct vnop_mmap_check_args a;
+
+       a.a_desc = &vnop_mmap_check_desc;
+       a.a_vp = vp;
+       a.a_flags = flags;
+       a.a_context = ctx;
+
+       _err = (*vp->v_op[vnop_mmap_check_desc.vdesc_offset])(&a);
+       if (_err == ENOTSUP) {
+               _err = 0;
+       }
+       DTRACE_FSINFO(mmap_check, vnode_t, vp);
+
+       return _err;
+}
+
 #if 0
 /*
 *#
@@ -4109,9 +4168,8 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s
                } else {
                        xfromname = &smallname1[0];
                }
-               strlcpy(xfromname, "._", min(sizeof smallname1, len));
-               strncat(xfromname, fcnp->cn_nameptr, fcnp->cn_namelen);
-               xfromname[len - 1] = '\0';
+               strlcpy(xfromname, "._", len);
+               strlcat(xfromname, fcnp->cn_nameptr, len);
 
                /* Get destination attribute file name. */
                len = tcnp->cn_namelen + 3;
@@ -4120,9 +4178,8 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s
                } else {
                        xtoname = &smallname2[0];
                }
-               strlcpy(xtoname, "._", min(sizeof smallname2, len));
-               strncat(xtoname, tcnp->cn_nameptr, tcnp->cn_namelen);
-               xtoname[len - 1] = '\0';
+               strlcpy(xtoname, "._", len);
+               strlcat(xtoname, tcnp->cn_nameptr, len);
 
                /*
                 * Look up source attribute file, keep reference on it if exists.
@@ -4207,6 +4264,9 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s
 #if CONFIG_MACF
        if (_err == 0) {
                mac_vnode_notify_rename(ctx, *fvpp, tdvp, tcnp);
+               if (flags & VFS_RENAME_SWAP) {
+                       mac_vnode_notify_rename(ctx, *tvpp, fdvp, fcnp);
+               }
        }
 #endif
 
index 5453d20c787a0945eb486986a8e2e50c5cba49f0..c344bef005a290ef5134ea8536319252845cfcc6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1995-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 1995-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -285,10 +285,10 @@ attrlist_pack_string(struct _attrlist_buf *ab, const char *source, ssize_t count
 #define ATTR_PACK_TIME(b, v, is64)                                                      \
        do {                                                                            \
                if (is64) {                                                             \
-                       struct user64_timespec us = {v.tv_sec, v.tv_nsec};              \
+                       struct user64_timespec us = {.tv_sec = v.tv_sec, .tv_nsec = v.tv_nsec};         \
                        ATTR_PACK(&b, us);                                              \
                } else {                                                                \
-                       struct user32_timespec us = {v.tv_sec, v.tv_nsec};              \
+                       struct user32_timespec us = {.tv_sec = v.tv_sec, .tv_nsec = v.tv_nsec};         \
                        ATTR_PACK(&b, us);                                              \
                }                                                                       \
        } while(0)
@@ -304,64 +304,64 @@ struct getvolattrlist_attrtab {
        ssize_t         size;
 };
 static struct getvolattrlist_attrtab getvolattrlist_common_tab[] = {
-       {ATTR_CMN_NAME, 0, sizeof(struct attrreference)},
-       {ATTR_CMN_DEVID, 0, sizeof(dev_t)},
-       {ATTR_CMN_FSID, 0, sizeof(fsid_t)},
-       {ATTR_CMN_OBJTYPE, 0, sizeof(fsobj_type_t)},
-       {ATTR_CMN_OBJTAG, 0, sizeof(fsobj_tag_t)},
-       {ATTR_CMN_OBJID, 0, sizeof(fsobj_id_t)},
-       {ATTR_CMN_OBJPERMANENTID, 0, sizeof(fsobj_id_t)},
-       {ATTR_CMN_PAROBJID, 0, sizeof(fsobj_id_t)},
-       {ATTR_CMN_SCRIPT, 0, sizeof(text_encoding_t)},
-       {ATTR_CMN_CRTIME, VFSATTR_BIT(f_create_time), ATTR_TIME_SIZE},
-       {ATTR_CMN_MODTIME, VFSATTR_BIT(f_modify_time), ATTR_TIME_SIZE},
-       {ATTR_CMN_CHGTIME, VFSATTR_BIT(f_modify_time), ATTR_TIME_SIZE},
-       {ATTR_CMN_ACCTIME, VFSATTR_BIT(f_access_time), ATTR_TIME_SIZE},
-       {ATTR_CMN_BKUPTIME, VFSATTR_BIT(f_backup_time), ATTR_TIME_SIZE},
-       {ATTR_CMN_FNDRINFO, 0, 32},
-       {ATTR_CMN_OWNERID, 0, sizeof(uid_t)},
-       {ATTR_CMN_GRPID, 0, sizeof(gid_t)},
-       {ATTR_CMN_ACCESSMASK, 0, sizeof(uint32_t)},
-       {ATTR_CMN_FLAGS, 0, sizeof(uint32_t)},
-       {ATTR_CMN_USERACCESS, 0, sizeof(uint32_t)},
-       {ATTR_CMN_EXTENDED_SECURITY, 0, sizeof(struct attrreference)},
-       {ATTR_CMN_UUID, 0, sizeof(guid_t)},
-       {ATTR_CMN_GRPUUID, 0, sizeof(guid_t)},
-       {ATTR_CMN_FILEID, 0, sizeof(uint64_t)},
-       {ATTR_CMN_PARENTID, 0, sizeof(uint64_t)},
-       {ATTR_CMN_RETURNED_ATTRS, 0, sizeof(attribute_set_t)},
-       {ATTR_CMN_ERROR, 0, sizeof(uint32_t)},
-       {0, 0, 0}
+       {.attr = ATTR_CMN_NAME, .bits = 0, .size = sizeof(struct attrreference)},
+       {.attr = ATTR_CMN_DEVID, .bits = 0, .size = sizeof(dev_t)},
+       {.attr = ATTR_CMN_FSID, .bits = 0, .size = sizeof(fsid_t)},
+       {.attr = ATTR_CMN_OBJTYPE, .bits = 0, .size = sizeof(fsobj_type_t)},
+       {.attr = ATTR_CMN_OBJTAG, .bits = 0, .size = sizeof(fsobj_tag_t)},
+       {.attr = ATTR_CMN_OBJID, .bits = 0, .size = sizeof(fsobj_id_t)},
+       {.attr = ATTR_CMN_OBJPERMANENTID, .bits = 0, .size = sizeof(fsobj_id_t)},
+       {.attr = ATTR_CMN_PAROBJID, .bits = 0, .size = sizeof(fsobj_id_t)},
+       {.attr = ATTR_CMN_SCRIPT, .bits = 0, .size = sizeof(text_encoding_t)},
+       {.attr = ATTR_CMN_CRTIME, .bits = VFSATTR_BIT(f_create_time), .size = ATTR_TIME_SIZE},
+       {.attr = ATTR_CMN_MODTIME, .bits = VFSATTR_BIT(f_modify_time), .size = ATTR_TIME_SIZE},
+       {.attr = ATTR_CMN_CHGTIME, .bits = VFSATTR_BIT(f_modify_time), .size = ATTR_TIME_SIZE},
+       {.attr = ATTR_CMN_ACCTIME, .bits = VFSATTR_BIT(f_access_time), .size = ATTR_TIME_SIZE},
+       {.attr = ATTR_CMN_BKUPTIME, .bits = VFSATTR_BIT(f_backup_time), .size = ATTR_TIME_SIZE},
+       {.attr = ATTR_CMN_FNDRINFO, .bits = 0, .size = 32},
+       {.attr = ATTR_CMN_OWNERID, .bits = 0, .size = sizeof(uid_t)},
+       {.attr = ATTR_CMN_GRPID, .bits = 0, .size = sizeof(gid_t)},
+       {.attr = ATTR_CMN_ACCESSMASK, .bits = 0, .size = sizeof(uint32_t)},
+       {.attr = ATTR_CMN_FLAGS, .bits = 0, .size = sizeof(uint32_t)},
+       {.attr = ATTR_CMN_USERACCESS, .bits = 0, .size = sizeof(uint32_t)},
+       {.attr = ATTR_CMN_EXTENDED_SECURITY, .bits = 0, .size = sizeof(struct attrreference)},
+       {.attr = ATTR_CMN_UUID, .bits = 0, .size = sizeof(guid_t)},
+       {.attr = ATTR_CMN_GRPUUID, .bits = 0, .size = sizeof(guid_t)},
+       {.attr = ATTR_CMN_FILEID, .bits = 0, .size = sizeof(uint64_t)},
+       {.attr = ATTR_CMN_PARENTID, .bits = 0, .size = sizeof(uint64_t)},
+       {.attr = ATTR_CMN_RETURNED_ATTRS, .bits = 0, .size = sizeof(attribute_set_t)},
+       {.attr = ATTR_CMN_ERROR, .bits = 0, .size = sizeof(uint32_t)},
+       {.attr = 0, .bits = 0, .size = 0}
 };
 #define ATTR_CMN_VOL_INVALID \
        (ATTR_CMN_EXTENDED_SECURITY | ATTR_CMN_UUID | ATTR_CMN_GRPUUID | \
         ATTR_CMN_FILEID | ATTR_CMN_PARENTID)
 
 static struct getvolattrlist_attrtab getvolattrlist_vol_tab[] = {
-       {ATTR_VOL_FSTYPE, 0, sizeof(uint32_t)},
-       {ATTR_VOL_SIGNATURE, VFSATTR_BIT(f_signature), sizeof(uint32_t)},
-       {ATTR_VOL_SIZE, VFSATTR_BIT(f_blocks)  |  VFSATTR_BIT(f_bsize), sizeof(off_t)},
-       {ATTR_VOL_SPACEFREE, VFSATTR_BIT(f_bfree) | VFSATTR_BIT(f_bsize), sizeof(off_t)},
-       {ATTR_VOL_SPACEAVAIL, VFSATTR_BIT(f_bavail) | VFSATTR_BIT(f_bsize), sizeof(off_t)},
-       {ATTR_VOL_MINALLOCATION, VFSATTR_BIT(f_bsize), sizeof(off_t)},
-       {ATTR_VOL_ALLOCATIONCLUMP, VFSATTR_BIT(f_bsize), sizeof(off_t)},
-       {ATTR_VOL_IOBLOCKSIZE, VFSATTR_BIT(f_iosize), sizeof(uint32_t)},
-       {ATTR_VOL_OBJCOUNT, VFSATTR_BIT(f_objcount), sizeof(uint32_t)},
-       {ATTR_VOL_FILECOUNT, VFSATTR_BIT(f_filecount), sizeof(uint32_t)},
-       {ATTR_VOL_DIRCOUNT, VFSATTR_BIT(f_dircount), sizeof(uint32_t)},
-       {ATTR_VOL_MAXOBJCOUNT, VFSATTR_BIT(f_maxobjcount), sizeof(uint32_t)},
-       {ATTR_VOL_MOUNTPOINT, 0, sizeof(struct attrreference)},
-       {ATTR_VOL_NAME, VFSATTR_BIT(f_vol_name), sizeof(struct attrreference)},
-       {ATTR_VOL_MOUNTFLAGS, 0, sizeof(uint32_t)},
-       {ATTR_VOL_MOUNTEDDEVICE, 0, sizeof(struct attrreference)},
-       {ATTR_VOL_ENCODINGSUSED, 0, sizeof(uint64_t)},
-       {ATTR_VOL_CAPABILITIES, VFSATTR_BIT(f_capabilities), sizeof(vol_capabilities_attr_t)},
-       {ATTR_VOL_UUID, VFSATTR_BIT(f_uuid), sizeof(uuid_t)},
-       {ATTR_VOL_QUOTA_SIZE, VFSATTR_BIT(f_quota) | VFSATTR_BIT(f_bsize), sizeof(off_t)},
-       {ATTR_VOL_RESERVED_SIZE, VFSATTR_BIT(f_reserved) | VFSATTR_BIT(f_bsize), sizeof(off_t)},
-       {ATTR_VOL_ATTRIBUTES, VFSATTR_BIT(f_attributes), sizeof(vol_attributes_attr_t)},
-       {ATTR_VOL_INFO, 0, 0},
-       {0, 0, 0}
+       {.attr = ATTR_VOL_FSTYPE, .bits = 0, .size = sizeof(uint32_t)},
+       {.attr = ATTR_VOL_SIGNATURE, .bits = VFSATTR_BIT(f_signature), .size = sizeof(uint32_t)},
+       {.attr = ATTR_VOL_SIZE, .bits = VFSATTR_BIT(f_blocks)  |  VFSATTR_BIT(f_bsize), .size = sizeof(off_t)},
+       {.attr = ATTR_VOL_SPACEFREE, .bits = VFSATTR_BIT(f_bfree) | VFSATTR_BIT(f_bsize), .size = sizeof(off_t)},
+       {.attr = ATTR_VOL_SPACEAVAIL, .bits = VFSATTR_BIT(f_bavail) | VFSATTR_BIT(f_bsize), .size = sizeof(off_t)},
+       {.attr = ATTR_VOL_MINALLOCATION, .bits = VFSATTR_BIT(f_bsize), .size = sizeof(off_t)},
+       {.attr = ATTR_VOL_ALLOCATIONCLUMP, .bits = VFSATTR_BIT(f_bsize), .size = sizeof(off_t)},
+       {.attr = ATTR_VOL_IOBLOCKSIZE, .bits = VFSATTR_BIT(f_iosize), .size = sizeof(uint32_t)},
+       {.attr = ATTR_VOL_OBJCOUNT, .bits = VFSATTR_BIT(f_objcount), .size = sizeof(uint32_t)},
+       {.attr = ATTR_VOL_FILECOUNT, .bits = VFSATTR_BIT(f_filecount), .size = sizeof(uint32_t)},
+       {.attr = ATTR_VOL_DIRCOUNT, .bits = VFSATTR_BIT(f_dircount), .size = sizeof(uint32_t)},
+       {.attr = ATTR_VOL_MAXOBJCOUNT, .bits = VFSATTR_BIT(f_maxobjcount), .size = sizeof(uint32_t)},
+       {.attr = ATTR_VOL_MOUNTPOINT, .bits = 0, .size = sizeof(struct attrreference)},
+       {.attr = ATTR_VOL_NAME, .bits = VFSATTR_BIT(f_vol_name), .size = sizeof(struct attrreference)},
+       {.attr = ATTR_VOL_MOUNTFLAGS, .bits = 0, .size = sizeof(uint32_t)},
+       {.attr = ATTR_VOL_MOUNTEDDEVICE, .bits = 0, .size = sizeof(struct attrreference)},
+       {.attr = ATTR_VOL_ENCODINGSUSED, .bits = 0, .size = sizeof(uint64_t)},
+       {.attr = ATTR_VOL_CAPABILITIES, .bits = VFSATTR_BIT(f_capabilities), .size = sizeof(vol_capabilities_attr_t)},
+       {.attr = ATTR_VOL_UUID, .bits = VFSATTR_BIT(f_uuid), .size = sizeof(uuid_t)},
+       {.attr = ATTR_VOL_QUOTA_SIZE, .bits = VFSATTR_BIT(f_quota) | VFSATTR_BIT(f_bsize), .size = sizeof(off_t)},
+       {.attr = ATTR_VOL_RESERVED_SIZE, .bits = VFSATTR_BIT(f_reserved) | VFSATTR_BIT(f_bsize), .size = sizeof(off_t)},
+       {.attr = ATTR_VOL_ATTRIBUTES, .bits = VFSATTR_BIT(f_attributes), .size = sizeof(vol_attributes_attr_t)},
+       {.attr = ATTR_VOL_INFO, .bits = 0, .size = 0},
+       {.attr = 0, .bits = 0, .size = 0}
 };
 
 static int
@@ -479,69 +479,73 @@ struct getattrlist_attrtab {
  * information, and we will synthesize it at the VFS level.
  */
 static struct getattrlist_attrtab getattrlist_common_tab[] = {
-       {ATTR_CMN_NAME, VATTR_BIT(va_name), sizeof(struct attrreference), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_DEVID, 0, sizeof(dev_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_FSID, 0, sizeof(fsid_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_OBJTYPE, 0, sizeof(fsobj_type_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_OBJTAG, 0, sizeof(fsobj_tag_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_OBJID, VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), sizeof(fsobj_id_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_OBJPERMANENTID, VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), sizeof(fsobj_id_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_PAROBJID, VATTR_BIT(va_parentid), sizeof(fsobj_id_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_SCRIPT, VATTR_BIT(va_encoding), sizeof(text_encoding_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_CRTIME, VATTR_BIT(va_create_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_MODTIME, VATTR_BIT(va_modify_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_CHGTIME, VATTR_BIT(va_change_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_ACCTIME, VATTR_BIT(va_access_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_BKUPTIME, VATTR_BIT(va_backup_time), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_FNDRINFO, 0, 32, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_OWNERID, VATTR_BIT(va_uid), sizeof(uid_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_GRPID, VATTR_BIT(va_gid), sizeof(gid_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_ACCESSMASK, VATTR_BIT(va_mode), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_FLAGS, VATTR_BIT(va_flags), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_GEN_COUNT, VATTR_BIT(va_write_gencount), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_DOCUMENT_ID, VATTR_BIT(va_document_id), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_USERACCESS, 0, sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_EXTENDED_SECURITY, VATTR_BIT(va_acl), sizeof(struct attrreference), KAUTH_VNODE_READ_SECURITY},
-       {ATTR_CMN_UUID, VATTR_BIT(va_uuuid), sizeof(guid_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_GRPUUID, VATTR_BIT(va_guuid), sizeof(guid_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_FILEID, VATTR_BIT(va_fileid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_PARENTID, VATTR_BIT(va_parentid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_FULLPATH, 0, sizeof(struct attrreference), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_ADDEDTIME, VATTR_BIT(va_addedtime), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_RETURNED_ATTRS, 0, sizeof(attribute_set_t), 0},
-       {ATTR_CMN_ERROR, 0, sizeof(uint32_t), 0},
-       {ATTR_CMN_DATA_PROTECT_FLAGS, VATTR_BIT(va_dataprotect_class), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {0, 0, 0, 0}
+       {.attr = ATTR_CMN_NAME, .bits = VATTR_BIT(va_name), .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_DEVID, .bits = VATTR_BIT(va_fsid), .size = sizeof(dev_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_FSID, .bits = VATTR_BIT(va_fsid64), .size = sizeof(fsid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_OBJTYPE, .bits = 0, .size = sizeof(fsobj_type_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_OBJTAG, .bits = 0, .size = sizeof(fsobj_tag_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_OBJID, .bits = VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), .size = sizeof(fsobj_id_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_OBJPERMANENTID, .bits = VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), .size = sizeof(fsobj_id_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_PAROBJID, .bits = VATTR_BIT(va_parentid), .size = sizeof(fsobj_id_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_SCRIPT, .bits = VATTR_BIT(va_encoding), .size = sizeof(text_encoding_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_CRTIME, .bits = VATTR_BIT(va_create_time), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_MODTIME, .bits = VATTR_BIT(va_modify_time), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_CHGTIME, .bits = VATTR_BIT(va_change_time), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_ACCTIME, .bits = VATTR_BIT(va_access_time), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_BKUPTIME, .bits = VATTR_BIT(va_backup_time), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_FNDRINFO, .bits = 0, .size = 32, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_OWNERID, .bits = VATTR_BIT(va_uid), .size = sizeof(uid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_GRPID, .bits = VATTR_BIT(va_gid), .size = sizeof(gid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_ACCESSMASK, .bits = VATTR_BIT(va_mode), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_FLAGS, .bits = VATTR_BIT(va_flags), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_GEN_COUNT, .bits = VATTR_BIT(va_write_gencount), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_DOCUMENT_ID, .bits = VATTR_BIT(va_document_id), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_USERACCESS, .bits = 0, .size = sizeof(uint32_t), .action = 0},
+       {.attr = ATTR_CMN_EXTENDED_SECURITY, .bits = VATTR_BIT(va_acl), .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_SECURITY},
+       {.attr = ATTR_CMN_UUID, .bits = VATTR_BIT(va_uuuid), .size = sizeof(guid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_GRPUUID, .bits = VATTR_BIT(va_guuid), .size = sizeof(guid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_FILEID, .bits = VATTR_BIT(va_fileid), .size = sizeof(uint64_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_PARENTID, .bits = VATTR_BIT(va_parentid), .size = sizeof(uint64_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_FULLPATH, .bits = 0, .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_ADDEDTIME, .bits = VATTR_BIT(va_addedtime), .size = ATTR_TIME_SIZE, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_RETURNED_ATTRS, .bits = 0, .size = sizeof(attribute_set_t), .action = 0},
+       {.attr = ATTR_CMN_ERROR, .bits = 0, .size = sizeof(uint32_t), .action = 0},
+       {.attr = ATTR_CMN_DATA_PROTECT_FLAGS, .bits = VATTR_BIT(va_dataprotect_class), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = 0, .bits = 0, .size = 0, .action = 0}
 };
 
 static struct getattrlist_attrtab getattrlist_dir_tab[] = {
-       {ATTR_DIR_LINKCOUNT, VATTR_BIT(va_dirlinkcount), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_DIR_ENTRYCOUNT, VATTR_BIT(va_nchildren), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_DIR_MOUNTSTATUS, 0, sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_DIR_ALLOCSIZE, VATTR_BIT(va_total_alloc) | VATTR_BIT(va_total_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_DIR_IOBLOCKSIZE, VATTR_BIT(va_iosize), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_DIR_DATALENGTH, VATTR_BIT(va_total_size) | VATTR_BIT(va_data_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {0, 0, 0, 0}
+       {.attr = ATTR_DIR_LINKCOUNT, .bits = VATTR_BIT(va_dirlinkcount), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_DIR_ENTRYCOUNT, .bits = VATTR_BIT(va_nchildren), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_DIR_MOUNTSTATUS, .bits = 0, .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_DIR_ALLOCSIZE, .bits = VATTR_BIT(va_total_alloc) | VATTR_BIT(va_total_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_DIR_IOBLOCKSIZE, .bits = VATTR_BIT(va_iosize), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_DIR_DATALENGTH, .bits = VATTR_BIT(va_total_size) | VATTR_BIT(va_data_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = 0, .bits = 0, .size = 0, .action = 0}
 };
 static struct getattrlist_attrtab getattrlist_file_tab[] = {
-       {ATTR_FILE_LINKCOUNT, VATTR_BIT(va_nlink), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_FILE_TOTALSIZE, VATTR_BIT(va_total_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_FILE_ALLOCSIZE, VATTR_BIT(va_total_alloc) | VATTR_BIT(va_total_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_FILE_IOBLOCKSIZE, VATTR_BIT(va_iosize), sizeof(uint32_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_FILE_DEVTYPE, VATTR_BIT(va_rdev), sizeof(dev_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_FILE_DATALENGTH, VATTR_BIT(va_total_size) | VATTR_BIT(va_data_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_FILE_DATAALLOCSIZE, VATTR_BIT(va_total_alloc) | VATTR_BIT(va_data_alloc), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_FILE_RSRCLENGTH, 0, sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_FILE_RSRCALLOCSIZE, 0, sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {0, 0, 0, 0}
+       {.attr = ATTR_FILE_LINKCOUNT, .bits = VATTR_BIT(va_nlink), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_FILE_TOTALSIZE, .bits = VATTR_BIT(va_total_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_FILE_ALLOCSIZE, .bits = VATTR_BIT(va_total_alloc) | VATTR_BIT(va_total_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_FILE_IOBLOCKSIZE, .bits = VATTR_BIT(va_iosize), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_FILE_CLUMPSIZE, .bits = 0, .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_FILE_DEVTYPE, .bits = VATTR_BIT(va_rdev), .size = sizeof(dev_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_FILE_DATALENGTH, .bits = VATTR_BIT(va_total_size) | VATTR_BIT(va_data_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_FILE_DATAALLOCSIZE, .bits = VATTR_BIT(va_total_alloc) | VATTR_BIT(va_data_alloc), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_FILE_RSRCLENGTH, .bits = 0, .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_FILE_RSRCALLOCSIZE, .bits = 0, .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = 0, .bits = 0, .size = 0, .action = 0}
 };
 
 // for forkattr bits repurposed as new common attributes
 static struct getattrlist_attrtab getattrlist_common_extended_tab[] = {
-       {ATTR_CMNEXT_RELPATH, 0, sizeof(struct attrreference), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMNEXT_PRIVATESIZE, VATTR_BIT(va_private_size), sizeof(off_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMNEXT_LINKID, VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES},
-       {0, 0, 0, 0}
+       {.attr = ATTR_CMNEXT_RELPATH, .bits = 0, .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMNEXT_PRIVATESIZE, .bits = VATTR_BIT(va_private_size), .size = sizeof(off_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMNEXT_LINKID, .bits = VATTR_BIT(va_fileid) | VATTR_BIT(va_linkid), .size = sizeof(uint64_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMNEXT_NOFIRMLINKPATH, .bits = 0, .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMNEXT_REALDEVID, .bits = VATTR_BIT(va_devid), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMNEXT_REALFSID, .bits = VATTR_BIT(va_fsid64), .size = sizeof(fsid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = 0, .bits = 0, .size = 0, .action = 0}
 };
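
As a contextual sketch (not part of this change): a minimal userspace consumer of the new ATTR_CMNEXT_NOFIRMLINKPATH attribute via getattrlist(2). This assumes a macOS 10.15 SDK, which defines that constant and FSOPT_ATTR_CMN_EXTENDED; print_nofirmlink_path is an illustrative name.

#include <sys/attr.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
print_nofirmlink_path(const char *path)
{
        struct attrlist al;
        struct {
                uint32_t        len;            /* total returned length */
                attrreference_t ref;            /* offset/length of the string */
                char            buf[PATH_MAX];
        } ab;

        memset(&al, 0, sizeof(al));
        al.bitmapcount = ATTR_BIT_MAP_COUNT;
        al.forkattr = ATTR_CMNEXT_NOFIRMLINKPATH;

        /* FSOPT_ATTR_CMN_EXTENDED repurposes the fork-attribute word as
         * the "common extended" attributes listed in the table above. */
        if (getattrlist(path, &al, &ab, sizeof(ab), FSOPT_ATTR_CMN_EXTENDED) != 0) {
                perror("getattrlist");
                return -1;
        }
        /* attr_dataoffset is relative to the attrreference itself */
        printf("%s\n", (char *)&ab.ref + ab.ref.attr_dataoffset);
        return 0;
}
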
 
 /*
@@ -554,25 +558,25 @@ static struct getattrlist_attrtab getattrlist_common_extended_tab[] = {
  * accounted from the common, file and directory tables.
  */
 static struct getattrlist_attrtab getattrlistbulk_common_tab[] = {
-       {ATTR_CMN_DEVID, VATTR_BIT(va_devid), 0, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_FSID, VATTR_BIT(va_fsid64), 0, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_OBJTYPE, VATTR_BIT(va_objtype), 0, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_OBJTAG, VATTR_BIT(va_objtag), 0, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_USERACCESS, VATTR_BIT(va_user_access), 0, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_CMN_FNDRINFO, VATTR_BIT(va_finderinfo), 0, KAUTH_VNODE_READ_ATTRIBUTES},
-       {0, 0, 0, 0}
+       {.attr = ATTR_CMN_DEVID, .bits = VATTR_BIT(va_devid), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_FSID, .bits = VATTR_BIT(va_fsid64), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_OBJTYPE, .bits = VATTR_BIT(va_objtype), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_OBJTAG, .bits = VATTR_BIT(va_objtag), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_CMN_USERACCESS, .bits = VATTR_BIT(va_user_access), .size = 0, .action = 0},
+       {.attr = ATTR_CMN_FNDRINFO, .bits = VATTR_BIT(va_finderinfo), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = 0, .bits = 0, .size = 0, .action = 0}
 };
 
 static struct getattrlist_attrtab getattrlistbulk_file_tab[] = {
-       {ATTR_FILE_RSRCLENGTH, VATTR_BIT(va_rsrc_length), 0, KAUTH_VNODE_READ_ATTRIBUTES},
-       {ATTR_FILE_RSRCALLOCSIZE, VATTR_BIT(va_rsrc_alloc), 0, KAUTH_VNODE_READ_ATTRIBUTES},
-       {0, 0, 0, 0}
+       {.attr = ATTR_FILE_RSRCLENGTH, .bits = VATTR_BIT(va_rsrc_length), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = ATTR_FILE_RSRCALLOCSIZE, .bits = VATTR_BIT(va_rsrc_alloc), .size = 0, .action = KAUTH_VNODE_READ_ATTRIBUTES},
+       {.attr = 0, .bits = 0, .size = 0, .action = 0}
 };
 
 static struct getattrlist_attrtab getattrlistbulk_common_extended_tab[] = {
        /* getattrlist_parsetab() expects more than one entry */
-       {0, 0, 0, 0},
-       {0, 0, 0, 0}
+       {.attr = 0, .bits = 0, .size = 0, .action = 0},
+       {.attr = 0, .bits = 0, .size = 0, .action = 0}
 };
 
 /*
@@ -601,7 +605,9 @@ static struct getattrlist_attrtab getattrlistbulk_common_extended_tab[] = {
                                 ATTR_CMN_DOCUMENT_ID | ATTR_CMN_GEN_COUNT | \
                                 ATTR_CMN_DATA_PROTECT_FLAGS)
 
-#define VFS_DFLT_ATTR_CMN_EXT   (ATTR_CMNEXT_PRIVATESIZE | ATTR_CMNEXT_LINKID)
+#define VFS_DFLT_ATTR_CMN_EXT   (ATTR_CMNEXT_PRIVATESIZE | ATTR_CMNEXT_LINKID |  \
+                                ATTR_CMNEXT_NOFIRMLINKPATH | ATTR_CMNEXT_REALDEVID |  \
+                                ATTR_CMNEXT_REALFSID)
 
 #define VFS_DFLT_ATTR_DIR       (ATTR_DIR_LINKCOUNT | ATTR_DIR_MOUNTSTATUS)
 
@@ -718,11 +724,6 @@ getattrlist_setupvattr_all(struct attrlist *alp, struct vnode_attr *vap,
                        (void)getattrlist_parsetab(getattrlistbulk_common_tab,
                            alp->commonattr, vap, fixedsize, NULL, is_64bit,
                            sizeof(getattrlistbulk_common_tab) / sizeof(getattrlistbulk_common_tab[0]));
-                       /*
-                        * turn off va_fsid since we will be using only
-                        * va_fsid64 for ATTR_CMN_FSID.
-                        */
-                       VATTR_CLEAR_ACTIVE(vap, va_fsid);
                }
        }
 
@@ -765,6 +766,8 @@ int
 vfs_setup_vattr_from_attrlist(struct attrlist *alp, struct vnode_attr *vap,
     enum vtype obj_vtype, ssize_t *attrs_fixed_sizep, vfs_context_t ctx)
 {
+       VATTR_INIT(vap);
+
        // the caller passes us no options, we assume the caller wants the new fork
        // attr behavior, hence the hardcoded 1
        return getattrlist_setupvattr_all(alp, vap, obj_vtype,
@@ -925,6 +928,7 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp,
        mount_t         mnt;
        int             return_valid;
        int             pack_invalid;
+       vnode_t         root_vp = NULL;
 
        ab.base = NULL;
        VATTR_INIT(&va);
@@ -948,15 +952,20 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp,
                bcopy(&alp->commonattr, &ab.valid, sizeof(attribute_set_t));
        }
 
-       /*
-        * For now, the vnode must be the root of its filesystem.
-        * To relax this, we need to be able to find the root vnode of a filesystem
-        * from any vnode in the filesystem.
-        */
+       /* If we do not have the root vnode, look it up and substitute it in */
        if (!vnode_isvroot(vp)) {
-               error = EINVAL;
-               VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: volume attributes requested but not the root of a filesystem");
-               goto out;
+               if (mnt != NULL) {
+                       error = VFS_ROOT(mnt, &root_vp, ctx);
+                       if (error) {
+                               VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: volume attributes requested on non-root vnode, but got an error getting root.");
+                               goto out;
+                       }
+                       vp = root_vp;
+               } else {
+                       error = EINVAL;
+                       VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: volume attributes requested on non-root vnode, but no backpointer to mount.");
+                       goto out;
+               }
        }
 
        /*
@@ -1552,6 +1561,10 @@ out:
                FREE(ab.base, M_TEMP);
        }
        VFS_DEBUG(ctx, vp, "ATTRLIST - returning %d", error);
+
+       if (root_vp != NULL) {
+               vnode_put(root_vp);
+       }
        return error;
 }
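
As a contextual sketch (not part of this change): the hunks above lift the old requirement that volume attributes be requested on the volume's root vnode (formerly EINVAL otherwise) by substituting the root via VFS_ROOT(). From userspace, a request like the following now works through any path on the volume; print_volume_name is an illustrative name.

#include <sys/attr.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
print_volume_name(const char *any_path_on_volume)
{
        struct attrlist al;
        struct {
                uint32_t        len;
                attrreference_t name;
                char            buf[NAME_MAX + 1];
        } ab;

        memset(&al, 0, sizeof(al));
        al.bitmapcount = ATTR_BIT_MAP_COUNT;
        /* ATTR_VOL_INFO must accompany every other ATTR_VOL_* request */
        al.volattr = ATTR_VOL_INFO | ATTR_VOL_NAME;

        if (getattrlist(any_path_on_volume, &al, &ab, sizeof(ab), 0) != 0) {
                perror("getattrlist");
                return -1;
        }
        printf("volume: %s\n", (char *)&ab.name + ab.name.attr_dataoffset);
        return 0;
}
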
 
@@ -1563,7 +1576,7 @@ out:
  * are in ad.
  */
 static errno_t
-attr_pack_common(vfs_context_t ctx, struct vnode *vp, struct attrlist *alp,
+attr_pack_common(vfs_context_t ctx, mount_t mp, vnode_t vp, struct attrlist *alp,
     struct _attrlist_buf *abp, struct vnode_attr *vap, int proc_is64,
     const char *cnp, ssize_t cnl, const char *fullpathptr,
     ssize_t fullpathlen, int return_valid, int pack_invalid, int vtype,
@@ -1582,7 +1595,14 @@ attr_pack_common(vfs_context_t ctx, struct vnode *vp, struct attrlist *alp,
                abp->actual.commonattr |= ATTR_CMN_NAME;
        }
        if (alp->commonattr & ATTR_CMN_DEVID) {
-               if (vp) {
+               if (mp) { /* caller needs real devid */
+                       ATTR_PACK4((*abp),
+                           mp->mnt_vfsstat.f_fsid.val[0]);
+                       abp->actual.commonattr |= ATTR_CMN_DEVID;
+               } else if (VATTR_IS_ACTIVE(vap, va_fsid) && VATTR_IS_SUPPORTED(vap, va_fsid)) {
+                       ATTR_PACK4((*abp), vap->va_fsid);
+                       abp->actual.commonattr |= ATTR_CMN_DEVID;
+               } else if (vp) {
                        ATTR_PACK4((*abp),
                            vp->v_mount->mnt_vfsstat.f_fsid.val[0]);
                        abp->actual.commonattr |= ATTR_CMN_DEVID;
@@ -1594,16 +1614,19 @@ attr_pack_common(vfs_context_t ctx, struct vnode *vp, struct attrlist *alp,
                }
        }
        if (alp->commonattr & ATTR_CMN_FSID) {
-               if (vp) {
+               if (mp) { /* caller needs real fsid */
                        ATTR_PACK8((*abp),
-                           vp->v_mount->mnt_vfsstat.f_fsid);
+                           mp->mnt_vfsstat.f_fsid);
                        abp->actual.commonattr |= ATTR_CMN_FSID;
                } else if (VATTR_IS_SUPPORTED(vap, va_fsid64)) {
                        ATTR_PACK8((*abp), vap->va_fsid64);
                        abp->actual.commonattr |= ATTR_CMN_FSID;
+               } else if (vp) {
+                       ATTR_PACK8((*abp),
+                           vp->v_mount->mnt_vfsstat.f_fsid);
+                       abp->actual.commonattr |= ATTR_CMN_FSID;
                } else if (!return_valid || pack_invalid) {
                        fsid_t fsid = {{0}};
-
                        ATTR_PACK8((*abp), fsid);
                }
        }
@@ -1938,7 +1961,7 @@ attr_pack_common(vfs_context_t ctx, struct vnode *vp, struct attrlist *alp,
                        ATTR_PACK_TIME((*abp), vap->va_addedtime, proc_is64);
                        abp->actual.commonattr |= ATTR_CMN_ADDEDTIME;
                } else if (!return_valid || pack_invalid) {
-                       struct timespec zerotime = {0, 0};
+                       struct timespec zerotime = {.tv_sec = 0, .tv_nsec = 0};
 
                        ATTR_PACK_TIME((*abp), zerotime, proc_is64);
                }
@@ -2260,8 +2283,9 @@ out:
  * are in ad.
  */
 static errno_t
-attr_pack_common_extended(struct vnode *vp, struct attrlist *alp,
+attr_pack_common_extended(mount_t mp, struct vnode *vp, struct attrlist *alp,
     struct _attrlist_buf *abp, const char *relpathptr, ssize_t relpathlen,
+    const char *REALpathptr, ssize_t REALpathlen,
     struct vnode_attr *vap, int return_valid, int pack_invalid)
 {
        if (vp && (alp->forkattr & ATTR_CMNEXT_RELPATH)) {
@@ -2292,12 +2316,57 @@ attr_pack_common_extended(struct vnode *vp, struct attrlist *alp,
                abp->actual.forkattr |= ATTR_CMNEXT_LINKID;
        }
 
+       if (vp && (alp->forkattr & ATTR_CMNEXT_NOFIRMLINKPATH)) {
+               attrlist_pack_string(abp, REALpathptr, REALpathlen);
+               abp->actual.forkattr |= ATTR_CMNEXT_NOFIRMLINKPATH;
+       }
+
+       if (alp->forkattr & ATTR_CMNEXT_REALDEVID) {
+               if (mp) {
+                       ATTR_PACK4((*abp),
+                           mp->mnt_vfsstat.f_fsid.val[0]);
+                       abp->actual.forkattr |= ATTR_CMNEXT_REALDEVID;
+               } else if (vp) {
+                       ATTR_PACK4((*abp),
+                           vp->v_mount->mnt_vfsstat.f_fsid.val[0]);
+                       abp->actual.forkattr |= ATTR_CMNEXT_REALDEVID;
+               } else if (VATTR_IS_SUPPORTED(vap, va_fsid)) {
+                       ATTR_PACK4((*abp), vap->va_fsid);
+                       abp->actual.forkattr |= ATTR_CMNEXT_REALDEVID;
+               } else if (!return_valid || pack_invalid) {
+                       ATTR_PACK4((*abp), 0);
+               }
+       }
+
+       if (alp->forkattr & ATTR_CMNEXT_REALFSID) {
+               if (mp) {
+                       ATTR_PACK8((*abp),
+                           mp->mnt_vfsstat.f_fsid);
+                       abp->actual.forkattr |= ATTR_CMNEXT_REALFSID;
+               } else if (vp) {
+                       ATTR_PACK8((*abp),
+                           vp->v_mount->mnt_vfsstat.f_fsid);
+                       abp->actual.forkattr |= ATTR_CMNEXT_REALFSID;
+               } else if (VATTR_IS_SUPPORTED(vap, va_fsid64)) {
+                       ATTR_PACK8((*abp), vap->va_fsid64);
+                       abp->actual.forkattr |= ATTR_CMNEXT_REALFSID;
+               } else if (!return_valid || pack_invalid) {
+                       fsid_t fsid = {{0}};
+
+                       ATTR_PACK8((*abp), fsid);
+               }
+       }
+
        return 0;
 }
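
As a contextual sketch (not part of this change): reading one of the attributes packed above from userspace. This assumes ATTR_CMNEXT_REALDEVID and FSOPT_RETURN_REALDEV as exposed by the 10.15 SDK; print_real_devid is an illustrative name. With FSOPT_RETURN_REALDEV set, the kernel reports the device id of the mount actually backing the vnode rather than a synthesized one.

#include <sys/attr.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
print_real_devid(const char *path)
{
        struct attrlist al;
        struct {
                uint32_t len;           /* total returned length */
                uint32_t real_devid;    /* ATTR_CMNEXT_REALDEVID payload */
        } ab;

        memset(&al, 0, sizeof(al));
        al.bitmapcount = ATTR_BIT_MAP_COUNT;
        al.forkattr = ATTR_CMNEXT_REALDEVID;

        if (getattrlist(path, &al, &ab, sizeof(ab),
            FSOPT_ATTR_CMN_EXTENDED | FSOPT_RETURN_REALDEV) != 0) {
                perror("getattrlist");
                return -1;
        }
        printf("real devid: %#x\n", ab.real_devid);
        return 0;
}
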
 
 static void
 vattr_get_alt_data(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap,
-    int return_valid, int is_bulk, vfs_context_t ctx)
+    int return_valid, int is_bulk,
+#if !CONFIG_FIRMLINKS
+    __unused
+#endif
+    int is_realdev, vfs_context_t ctx)
 {
        /*
         * There are a couple of special cases.
@@ -2310,27 +2379,66 @@ vattr_get_alt_data(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap,
                VATTR_CLEAR_ACTIVE(vap, va_linkid);
        }
 
+       /*
+        * A filesystem may not support va_fsid64.  If it is not available, then we'll
+        * synthesize it from the mount.
+        */
+       if ((alp->commonattr & ATTR_CMN_FSID) && !VATTR_IS_SUPPORTED(vap, va_fsid64)) {
+               VATTR_CLEAR_ACTIVE(vap, va_fsid64);
+       }
+
+       /* Same for fsid */
+       if ((alp->commonattr & ATTR_CMN_FSID) && !VATTR_IS_SUPPORTED(vap, va_fsid)) {
+               VATTR_CLEAR_ACTIVE(vap, va_fsid);
+       }
+
+       /* And for the va_fsid used to satisfy ATTR_CMN_DEVID */
+       if ((alp->commonattr & ATTR_CMN_DEVID) && !VATTR_IS_SUPPORTED(vap, va_fsid)) {
+               VATTR_CLEAR_ACTIVE(vap, va_fsid);
+       }
+
        /*
         * Many filesystems don't know their parent object id.
         * If necessary, attempt to derive it from the vnode.
         */
-       if ((alp->commonattr & (ATTR_CMN_PAROBJID | ATTR_CMN_PARENTID)) &&
-           !VATTR_IS_SUPPORTED(vap, va_parentid) && vp && !is_bulk) {
+       if ((alp->commonattr & (ATTR_CMN_PAROBJID | ATTR_CMN_PARENTID)) && vp) {
                vnode_t dvp;
 
-               if ((dvp = vnode_getparent(vp)) != NULLVP) {
+#if CONFIG_FIRMLINKS
+               /* If this is a firmlink target, report the parent id of the firmlink vnode itself. */
+               if (!is_realdev && (vp->v_flag & VFMLINKTARGET) && ((dvp = vp->v_fmlink) != NULL) && (vnode_get(dvp) == 0)) {
                        struct vnode_attr lva;
 
                        VATTR_INIT(&lva);
-                       VATTR_WANTED(&lva, va_fileid);
+                       VATTR_WANTED(&lva, va_parentid);
+                       VATTR_WANTED(&lva, va_fsid);
                        if (vnode_getattr(dvp, &lva, ctx) == 0 &&
-                           VATTR_IS_SUPPORTED(vap, va_fileid)) {
-                               vap->va_parentid = lva.va_fileid;
+                           VATTR_IS_SUPPORTED(&lva, va_parentid) &&
+                           VATTR_IS_SUPPORTED(&lva, va_fsid) &&
+                           (lva.va_fsid == (uint32_t)vp->v_mount->mnt_vfsstat.f_fsid.val[0])) {
+                               vap->va_parentid = lva.va_parentid;
                                VATTR_SET_SUPPORTED(vap, va_parentid);
                        }
                        vnode_put(dvp);
+               } else
+#endif /* CONFIG_FIRMLINKS */
+               if (!VATTR_IS_SUPPORTED(vap, va_parentid) && !is_bulk) {
+                       if ((dvp = vnode_getparent(vp)) != NULLVP) {
+                               struct vnode_attr lva;
+
+                               VATTR_INIT(&lva);
+                               VATTR_WANTED(&lva, va_fileid);
+                               if (vnode_getattr(dvp, &lva, ctx) == 0 &&
+                                   VATTR_IS_SUPPORTED(&lva, va_fileid)) {
+                                       vap->va_parentid = lva.va_fileid;
+                                       VATTR_SET_SUPPORTED(vap, va_parentid);
+                               }
+                               vnode_put(dvp);
+                       }
                }
        }
+
        /*
         * And we can report datasize/alloc from total.
         */
@@ -2369,10 +2477,18 @@ vattr_get_alt_data(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap,
        }
 }
 
+struct _attrlist_paths {
+       char *fullpathptr;
+       ssize_t *fullpathlenp;
+       char *relpathptr;
+       ssize_t *relpathlenp;
+       char *REALpathptr;
+       ssize_t *REALpathlenp;
+};
+
 static errno_t
 calc_varsize(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap,
-    ssize_t *varsizep, char *fullpathptr, ssize_t *fullpathlenp,
-    char *relpathptr, ssize_t *relpathlenp, const char **vnamep,
+    ssize_t *varsizep, struct _attrlist_paths *pathsp, const char **vnamep,
     const char **cnpp, ssize_t *cnlp)
 {
        int error = 0;
@@ -2426,16 +2542,17 @@ calc_varsize(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap,
                int err;
 
                /* call build_path making sure NOT to use the cache-only behavior */
-               err = build_path(vp, fullpathptr, len, &len, 0, vfs_context_current());
+               err = build_path(vp, pathsp->fullpathptr, len, &len, 0, vfs_context_current());
                if (err) {
                        error = err;
                        goto out;
                }
-               *fullpathlenp = 0;
-               if (fullpathptr) {
-                       *fullpathlenp = strlen(fullpathptr);
+               if (pathsp->fullpathptr) {
+                       *(pathsp->fullpathlenp) = strlen(pathsp->fullpathptr);
+               } else {
+                       *(pathsp->fullpathlenp) = 0;
                }
-               *varsizep += roundup(((*fullpathlenp) + 1), 4);
+               *varsizep += roundup(((*(pathsp->fullpathlenp)) + 1), 4);
        }
 
        /*
@@ -2446,14 +2563,33 @@ calc_varsize(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap,
                int err;
 
                /* call build_path making sure NOT to use the cache-only behavior */
-               err = build_path(vp, relpathptr, MAXPATHLEN, &len, BUILDPATH_VOLUME_RELATIVE, vfs_context_current());
+               err = build_path(vp, pathsp->relpathptr, MAXPATHLEN, &len, BUILDPATH_VOLUME_RELATIVE, vfs_context_current());
+               if (err) {
+                       error = err;
+                       goto out;
+               }
+
+               //`len' includes trailing null
+               *(pathsp->relpathlenp) = len - 1;
+               *varsizep += roundup(len, 4);
+       }
+
+       /*
+        * Compute this vnode's real (firmlink-free) path.
+        */
+       if (vp && (alp->forkattr & ATTR_CMNEXT_NOFIRMLINKPATH)) {
+               int len;
+               int err;
+
+               /* call build_path making sure NOT to use the cache-only behavior */
+               err = build_path(vp, pathsp->REALpathptr, MAXPATHLEN, &len, BUILDPATH_NO_FIRMLINK, vfs_context_current());
                if (err) {
                        error = err;
                        goto out;
                }
 
                //`len' includes trailing null
-               *relpathlenp = len - 1;
+               *(pathsp->REALpathlenp) = len - 1;
                *varsizep += roundup(len, 4);
        }
 
@@ -2482,11 +2618,14 @@ out:
 }
 
 static errno_t
-vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp,
+vfs_attr_pack_internal(mount_t mp, vnode_t vp, uio_t auio, struct attrlist *alp,
     uint64_t options, struct vnode_attr *vap, __unused void *fndesc,
     vfs_context_t ctx, int is_bulk, enum vtype vtype, ssize_t fixedsize)
 {
        struct _attrlist_buf ab;
+       struct _attrlist_paths apaths = {.fullpathptr = NULL, .fullpathlenp = NULL,
+                                        .relpathptr = NULL, .relpathlenp = NULL,
+                                        .REALpathptr = NULL, .REALpathlenp = NULL};
        ssize_t buf_size;
        size_t copy_size;
        ssize_t varsize;
@@ -2497,10 +2636,13 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp,
        ssize_t fullpathlen;
        char *relpathptr;
        ssize_t relpathlen;
+       char *REALpathptr;
+       ssize_t REALpathlen;
        int error;
        int proc_is64;
        int return_valid;
        int pack_invalid;
+       int is_realdev;
        int alloc_local_buf;
        const int use_fork = options & FSOPT_ATTR_CMN_EXTENDED;
 
@@ -2512,6 +2654,8 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp,
        fullpathlen = 0;
        relpathptr = NULL;
        relpathlen = 0;
+       REALpathptr = NULL;
+       REALpathlen = 0;
        error = 0;
        alloc_local_buf = 0;
 
@@ -2524,6 +2668,7 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp,
        /* Check for special packing semantics */
        return_valid = (alp->commonattr & ATTR_CMN_RETURNED_ATTRS) ? 1 : 0;
        pack_invalid = (options & FSOPT_PACK_INVAL_ATTRS) ? 1 : 0;
+       is_realdev = options & FSOPT_RETURN_REALDEV ? 1 : 0;
 
        if (pack_invalid) {
                /* Generate a valid mask for post processing */
@@ -2531,8 +2676,17 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp,
        }
 
        /* did we ask for something the filesystem doesn't support? */
-       if (vap->va_active && !VATTR_ALL_SUPPORTED(vap)) {
-               vattr_get_alt_data(vp, alp, vap, return_valid, is_bulk,
+       if (vap->va_active &&
+           (!VATTR_ALL_SUPPORTED(vap)
+#if CONFIG_FIRMLINKS
+           /* For firmlink targets we have to override what the FS returned for parentid */
+           ||
+           (!is_realdev && vp && (vp->v_flag & VFMLINKTARGET) && vp->v_fmlink &&
+           (alp->commonattr & (ATTR_CMN_PAROBJID | ATTR_CMN_PARENTID)))
+#endif
+           )) {
+               // this disables the selectors that were not supported by the filesystem
+               vattr_get_alt_data(vp, alp, vap, return_valid, is_bulk, is_realdev,
                    ctx);
 
                /* check again */
@@ -2566,24 +2720,41 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp,
                        goto out;
                }
                bzero(fullpathptr, MAXPATHLEN);
+               apaths.fullpathptr = fullpathptr;
+               apaths.fullpathlenp = &fullpathlen;
        }
 
        // only interpret fork attributes if they're used as new common attributes
-       if (vp && use_fork && (alp->forkattr & (ATTR_CMNEXT_RELPATH))) {
-               relpathptr = (char*) kalloc(MAXPATHLEN);
-               if (relpathptr == NULL) {
-                       error = ENOMEM;
-                       VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: cannot allocate relpath buffer");
-                       goto out;
+       if (vp && use_fork) {
+               if (alp->forkattr & (ATTR_CMNEXT_RELPATH)) {
+                       relpathptr = (char*) kalloc(MAXPATHLEN);
+                       if (relpathptr == NULL) {
+                               error = ENOMEM;
+                               VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: cannot allocate relpath buffer");
+                               goto out;
+                       }
+                       bzero(relpathptr, MAXPATHLEN);
+                       apaths.relpathptr = relpathptr;
+                       apaths.relpathlenp = &relpathlen;
+               }
+
+               if (alp->forkattr & (ATTR_CMNEXT_NOFIRMLINKPATH)) {
+                       REALpathptr = (char*) kalloc(MAXPATHLEN);
+                       if (REALpathptr == NULL) {
+                               error = ENOMEM;
+                               VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: cannot allocate canonpath buffer");
+                               goto out;
+                       }
+                       bzero(REALpathptr, MAXPATHLEN);
+                       apaths.REALpathptr = REALpathptr;
+                       apaths.REALpathlenp = &REALpathlen;
                }
-               bzero(relpathptr, MAXPATHLEN);
        }
 
        /*
         * Compute variable-space requirements.
         */
-       error = calc_varsize(vp, alp, vap, &varsize, fullpathptr, &fullpathlen,
-           relpathptr, &relpathlen, &vname, &cnp, &cnl);
+       error = calc_varsize(vp, alp, vap, &varsize, &apaths, &vname, &cnp, &cnl);
        if (error) {
                goto out;
        }
@@ -2593,7 +2764,7 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp,
         *
         * Note that we won't ever copy out more than the caller requested, even though
         * we might have to allocate more than they offer so that the diagnostic checks
-        * don't result in a panic if the caller's buffer is too small..
+        * don't result in a panic if the caller's buffer is too small.
         */
        ab.allocated = fixedsize + varsize;
        /* Cast 'allocated' to an unsigned to verify allocation size */
@@ -2702,8 +2873,9 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp,
        ab.needed = ab.allocated;
 
        /* common attributes ************************************************/
-       error = attr_pack_common(ctx, vp, alp, &ab, vap, proc_is64, cnp, cnl,
-           fullpathptr, fullpathlen, return_valid, pack_invalid, vtype, is_bulk);
+       error = attr_pack_common(ctx, (options & FSOPT_RETURN_REALDEV ? mp : NULL),
+           vp, alp, &ab, vap, proc_is64, cnp, cnl, fullpathptr, fullpathlen,
+           return_valid, pack_invalid, vtype, is_bulk);
 
        /* directory attributes *********************************************/
        if (!error && alp->dirattr && (vtype == VDIR)) {
@@ -2718,8 +2890,8 @@ vfs_attr_pack_internal(vnode_t vp, uio_t auio, struct attrlist *alp,
 
        /* common extended attributes *****************************************/
        if (!error && use_fork) {
-               error = attr_pack_common_extended(vp, alp, &ab, relpathptr, relpathlen,
-                   vap, return_valid, pack_invalid);
+               error = attr_pack_common_extended(mp, vp, alp, &ab, relpathptr, relpathlen,
+                   REALpathptr, REALpathlen, vap, return_valid, pack_invalid);
        }
 
        if (error) {
@@ -2789,6 +2961,9 @@ out:
        if (relpathptr) {
                kfree(relpathptr, MAXPATHLEN);
        }
+       if (REALpathptr) {
+               kfree(REALpathptr, MAXPATHLEN);
+       }
        if (ab.base != NULL && alloc_local_buf) {
                FREE(ab.base, M_TEMP);
        }
@@ -2796,7 +2971,7 @@ out:
 }
 
 errno_t
-vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options,
+vfs_attr_pack_ext(mount_t mp, vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options,
     struct vnode_attr *vap, __unused void *fndesc, vfs_context_t ctx)
 {
        int error;
@@ -2824,7 +2999,7 @@ vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options,
                goto out;
        }
 
-       error = vfs_attr_pack_internal(vp, uio, alp,
+       error = vfs_attr_pack_internal(mp, vp, uio, alp,
            options | FSOPT_REPORT_FULLSIZE, vap, NULL, ctx, 1, v_type,
            fixedsize);
 
@@ -2835,6 +3010,13 @@ out:
        return error;
 }
 
+errno_t
+vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options,
+    struct vnode_attr *vap, __unused void *fndesc, vfs_context_t ctx)
+{
+       return vfs_attr_pack_ext(NULL, vp, uio, alp, options, vap, fndesc, ctx);
+}
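
As a contextual sketch (not part of this change): vfs_attr_pack() keeps its signature, so existing filesystem callers are unaffected; it now forwards to vfs_attr_pack_ext() with a NULL mount, meaning the FSOPT_RETURN_REALDEV mount substitution only happens when VFS supplies the mount_t itself. A kernel-side sketch of the usual caller; myfs_pack_bulk_entry is an illustrative name.

static int
myfs_pack_bulk_entry(vnode_t vp, uio_t uio, struct attrlist *alp,
    uint64_t options, struct vnode_attr *vap, vfs_context_t ctx)
{
        /* unchanged KPI; packs one VNOP_GETATTRLISTBULK entry */
        return vfs_attr_pack(vp, uio, alp, options, vap, NULL, ctx);
}
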
+
 /*
  * Obtain attribute information about a filesystem object.
  *
@@ -2889,7 +3071,7 @@ getattrlist_internal(vfs_context_t ctx, vnode_t vp, struct attrlist  *alp,
        }
 
        VFS_DEBUG(ctx, vp, "%p  ATTRLIST - %s request common %08x vol %08x file %08x dir %08x fork %08x %sfollow on '%s'",
-           vp, p->p_comm, alp->commonattr, alp->volattr, alp->fileattr, alp->dirattr, alp->forkattr,
+           vp, vfs_context_proc(ctx)->p_comm, alp->commonattr, alp->volattr, alp->fileattr, alp->dirattr, alp->forkattr,
            (options & FSOPT_NOFOLLOW) ? "no":"", vp->v_name);
 
 #if CONFIG_MACF
@@ -3002,6 +3184,10 @@ getattrlist_internal(vfs_context_t ctx, vnode_t vp, struct attrlist  *alp,
 
                va.va_name = authoritative_name ? NULL : va_name;
 
+               if (options & FSOPT_RETURN_REALDEV) {
+                       va.va_vaflags |= VA_REALFSID;
+               }
+
                /*
                 * Call the filesystem.
                 */
@@ -3047,7 +3233,7 @@ getattrlist_internal(vfs_context_t ctx, vnode_t vp, struct attrlist  *alp,
                va.va_name = va_name;
        }
 
-       error = vfs_attr_pack_internal(vp, auio, alp, options, &va, NULL, ctx,
+       error = vfs_attr_pack_internal(vp->v_mount, vp, auio, alp, options, &va, NULL, ctx,
            0, vtype, fixedsize);
 
 out:
diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c
index 8ba4e78d241fb65654fbc6ea27856527bdb714ca..5ce788691345c0c5e878e67bd800b0c50a85daa6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -607,6 +607,22 @@ bufattr_quickcomplete(bufattr_t bap)
        return 0;
 }
 
+void
+bufattr_markioscheduled(bufattr_t bap)
+{
+       SET(bap->ba_flags, BA_IO_SCHEDULED);
+}
+
+
+int
+bufattr_ioscheduled(bufattr_t bap)
+{
+       if ((bap->ba_flags & BA_IO_SCHEDULED)) {
+               return 1;
+       }
+       return 0;
+}
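
As a contextual sketch (not part of this change): how an IO path might use the two accessors added above to mark a buffer exactly once as handed to the IO scheduler. my_strategy and the enqueue step are illustrative; buf_attr() is the existing accessor from <sys/buf.h>.

static void
my_strategy(buf_t bp)
{
        bufattr_t bap = buf_attr(bp);

        if (!bufattr_ioscheduled(bap)) {
                bufattr_markioscheduled(bap);
                /* hand bp to the throttling / IO-scheduling layer */
        }
}
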
+
 errno_t
 buf_error(buf_t bp)
 {
@@ -2171,13 +2187,13 @@ struct meta_zone_entry {
 };
 
 struct meta_zone_entry meta_zones[] = {
-       {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
-       {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
-       {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
-       {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
-       {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
-       {NULL, (MINMETA * 32), 512 * (MINMETA * 32), "buf.16384" },
-       {NULL, 0, 0, "" } /* End */
+       {.mz_zone = NULL, .mz_size = (MINMETA * 1), .mz_max = 128 * (MINMETA * 1), .mz_name = "buf.512" },
+       {.mz_zone = NULL, .mz_size = (MINMETA * 2), .mz_max = 64 * (MINMETA * 2), .mz_name = "buf.1024" },
+       {.mz_zone = NULL, .mz_size = (MINMETA * 4), .mz_max = 16 * (MINMETA * 4), .mz_name = "buf.2048" },
+       {.mz_zone = NULL, .mz_size = (MINMETA * 8), .mz_max = 512 * (MINMETA * 8), .mz_name = "buf.4096" },
+       {.mz_zone = NULL, .mz_size = (MINMETA * 16), .mz_max = 512 * (MINMETA * 16), .mz_name = "buf.8192" },
+       {.mz_zone = NULL, .mz_size = (MINMETA * 32), .mz_max = 512 * (MINMETA * 32), .mz_name = "buf.16384" },
+       {.mz_zone = NULL, .mz_size = 0, .mz_max = 0, .mz_name = "" } /* End */
 };
 
 /*
diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c
index 9f3aaa548be901f7f51a5f17d5b54f1ec145aa29..18a0906b8b82e9119e23a1b04b64d16c1b33f188 100644 (file)
@@ -164,6 +164,7 @@ static const char *add_name_internal(const char *, uint32_t, u_int, boolean_t, u
 static void init_string_table(void);
 static void cache_delete(struct namecache *, int);
 static void cache_enter_locked(vnode_t dvp, vnode_t vp, struct componentname *cnp, const char *strname);
+static void cache_purge_locked(vnode_t vp, kauth_cred_t *credp);
 
 #ifdef DUMP_STRING_TABLE
 /*
@@ -479,6 +480,13 @@ again:
         */
        NAME_CACHE_LOCK_SHARED();
 
+#if CONFIG_FIRMLINKS
+       if (!(flags & BUILDPATH_NO_FIRMLINK) &&
+           (vp->v_flag & VFMLINKTARGET) && vp->v_fmlink) {
+               vp = vp->v_fmlink;
+       }
+#endif
+
        /*
         * Check if this is the root of a file system.
         */
@@ -501,6 +509,12 @@ again:
                         * want to cross mount points.  Therefore just return
                         * '/' as the relative path.
                         */
+#if CONFIG_FIRMLINKS
+                       if (!(flags & BUILDPATH_NO_FIRMLINK) &&
+                           (vp->v_flag & VFMLINKTARGET) && vp->v_fmlink) {
+                               vp = vp->v_fmlink;
+                       } else
+#endif
                        if (flags & BUILDPATH_VOLUME_RELATIVE) {
                                *--end = '/';
                                goto out_unlock;
@@ -730,6 +744,15 @@ bad_news:
                        if (tvp == proc_root_dir_vp) {
                                goto out_unlock;        /* encountered the root */
                        }
+
+#if CONFIG_FIRMLINKS
+                       if (!(flags & BUILDPATH_NO_FIRMLINK) &&
+                           (tvp->v_flag & VFMLINKTARGET) && tvp->v_fmlink) {
+                               tvp = tvp->v_fmlink;
+                               break;
+                       }
+#endif
+
                        if (!(tvp->v_flag & VROOT) || !tvp->v_mount) {
                                break;                  /* not the root of a mounted FS */
                        }
@@ -790,6 +813,9 @@ vnode_getparent(vnode_t vp)
        int     pvid;
 
        NAME_CACHE_LOCK_SHARED();
+
+       pvp = vp->v_parent;
+
        /*
         * v_parent is stable behind the name_cache lock
         * however, the only thing we can really guarantee
@@ -797,7 +823,7 @@ vnode_getparent(vnode_t vp)
         * parent of 'vp' at the time we took the name_cache lock...
         * once we drop the lock, vp could get re-parented
         */
-       if ((pvp = vp->v_parent) != NULLVP) {
+       if (pvp != NULLVP) {
                pvid = pvp->v_id;
 
                NAME_CACHE_UNLOCK();
@@ -930,9 +956,34 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u
                        flags &= ~VNODE_UPDATE_NAME;
                }
        }
-       if ((flags & (VNODE_UPDATE_PURGE | VNODE_UPDATE_PARENT | VNODE_UPDATE_CACHE | VNODE_UPDATE_NAME))) {
+       if ((flags & (VNODE_UPDATE_PURGE | VNODE_UPDATE_PARENT | VNODE_UPDATE_CACHE | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGEFIRMLINK))) {
                NAME_CACHE_LOCK();
 
+#if CONFIG_FIRMLINKS
+               if (flags & VNODE_UPDATE_PURGEFIRMLINK) {
+                       vnode_t old_fvp = vp->v_fmlink;
+                       if (old_fvp) {
+                               vnode_lock_spin(vp);
+                               vp->v_flag &= ~VFMLINKTARGET;
+                               vp->v_fmlink = NULLVP;
+                               vnode_unlock(vp);
+                               NAME_CACHE_UNLOCK();
+
+                               /*
+                                * vnode_rele can result in a cascading series of
+                                * usecount releases. The combination of calling
+                                * vnode_recycle and dont_reenter (3rd arg to
+                                * vnode_rele_internal) ensures we don't have
+                                * that issue.
+                                */
+                               vnode_recycle(old_fvp);
+                               vnode_rele_internal(old_fvp, O_EVTONLY, 1, 0);
+
+                               NAME_CACHE_LOCK();
+                       }
+               }
+#endif
+
                if ((flags & VNODE_UPDATE_PURGE)) {
                        if (vp->v_parent) {
                                vp->v_parent->v_nc_generation++;
@@ -1081,6 +1132,139 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u
        }
 }
 
+#if CONFIG_FIRMLINKS
+errno_t
+vnode_setasfirmlink(vnode_t vp, vnode_t target_vp)
+{
+       int error = 0;
+       vnode_t old_target_vp = NULLVP;
+       vnode_t old_target_vp_v_fmlink = NULLVP;
+       kauth_cred_t target_vp_cred = NULL;
+       kauth_cred_t old_target_vp_cred = NULL;
+
+       if (!vp) {
+               return EINVAL;
+       }
+
+       if (target_vp) {
+               if (vp->v_fmlink == target_vp) { /* Will be checked again under the name cache lock */
+                       return 0;
+               }
+
+               /*
+                * Firmlink source and target will take both a usecount
+                * and kusecount on each other.
+                */
+               if ((error = vnode_ref_ext(target_vp, O_EVTONLY, VNODE_REF_FORCE))) {
+                       return error;
+               }
+
+               if ((error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE))) {
+                       vnode_rele_ext(target_vp, O_EVTONLY, 1);
+                       return error;
+               }
+       }
+
+       NAME_CACHE_LOCK();
+
+       old_target_vp = vp->v_fmlink;
+       if (target_vp && (target_vp == old_target_vp)) {
+               NAME_CACHE_UNLOCK();
+               return 0;
+       }
+       vp->v_fmlink = target_vp;
+
+       vnode_lock_spin(vp);
+       vp->v_flag &= ~VFMLINKTARGET;
+       vnode_unlock(vp);
+
+       if (target_vp) {
+               target_vp->v_fmlink = vp;
+               vnode_lock_spin(target_vp);
+               target_vp->v_flag |= VFMLINKTARGET;
+               vnode_unlock(target_vp);
+               cache_purge_locked(vp, &target_vp_cred);
+       }
+
+       if (old_target_vp) {
+               old_target_vp_v_fmlink = old_target_vp->v_fmlink;
+               old_target_vp->v_fmlink = NULLVP;
+               vnode_lock_spin(old_target_vp);
+               old_target_vp->v_flag &= ~VFMLINKTARGET;
+               vnode_unlock(old_target_vp);
+               cache_purge_locked(vp, &old_target_vp_cred);
+       }
+
+       NAME_CACHE_UNLOCK();
+
+       if (target_vp_cred && IS_VALID_CRED(target_vp_cred)) {
+               kauth_cred_unref(&target_vp_cred);
+       }
+
+       if (old_target_vp) {
+               if (old_target_vp_cred && IS_VALID_CRED(old_target_vp_cred)) {
+                       kauth_cred_unref(&old_target_vp_cred);
+               }
+
+               vnode_rele_ext(old_target_vp, O_EVTONLY, 1);
+               if (old_target_vp_v_fmlink) {
+                       vnode_rele_ext(old_target_vp_v_fmlink, O_EVTONLY, 1);
+               }
+       }
+
+       return 0;
+}
+
+errno_t
+vnode_getfirmlink(vnode_t vp, vnode_t *target_vp)
+{
+       int error;
+
+       if (!vp->v_fmlink) {
+               return ENODEV;
+       }
+
+       NAME_CACHE_LOCK_SHARED();
+       if (vp->v_fmlink && !(vp->v_flag & VFMLINKTARGET) &&
+           (vnode_get(vp->v_fmlink) == 0)) {
+               vnode_t tvp = vp->v_fmlink;
+
+               vnode_lock_spin(tvp);
+               if (tvp->v_lflag & (VL_TERMINATE | VL_DEAD)) {
+                       vnode_unlock(tvp);
+                       NAME_CACHE_UNLOCK();
+                       vnode_put(tvp);
+                       return ENOENT;
+               }
+               if (!(tvp->v_flag & VFMLINKTARGET)) {
+                       panic("firmlink target for vnode %p does not have flag set", vp);
+               }
+               vnode_unlock(tvp);
+               *target_vp = tvp;
+               error = 0;
+       } else {
+               *target_vp = NULLVP;
+               error = ENODEV;
+       }
+       NAME_CACHE_UNLOCK();
+       return error;
+}
+
+#else /* CONFIG_FIRMLINKS */
+
+errno_t
+vnode_setasfirmlink(__unused vnode_t vp, __unused vnode_t target_vp)
+{
+       return ENOTSUP;
+}
+
+errno_t
+vnode_getfirmlink(__unused vnode_t vp, __unused vnode_t *target_vp)
+{
+       return ENOTSUP;
+}
+
+#endif /* CONFIG_FIRMLINKS */
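
As a contextual sketch (not part of this change): pairing and querying a firmlink through the new KPIs; pair_firmlink is an illustrative name and error handling is abbreviated. Note that vnode_getfirmlink() returns the target with an iocount the caller must drop.

static int
pair_firmlink(vnode_t source_vp, vnode_t target_vp)
{
        vnode_t tvp = NULLVP;
        int error;

        error = vnode_setasfirmlink(source_vp, target_vp);
        if (error) {
                return error;   /* ENOTSUP when CONFIG_FIRMLINKS is off */
        }

        error = vnode_getfirmlink(source_vp, &tvp);
        if (error == 0) {
                vnode_put(tvp); /* balance the iocount taken above */
        }
        return error;
}
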
 
 /*
  * Mark a vnode as having multiple hard links.  HFS makes use of this
@@ -1476,6 +1660,12 @@ skiprsrcfork:
                                break;
                        }
                        if (cnp->cn_flags & ISDOTDOT) {
+#if CONFIG_FIRMLINKS
+                               if (dp->v_fmlink && (dp->v_flag & VFMLINKTARGET)) {
+                                       dp = dp->v_fmlink;
+                               }
+#endif
+
                                /*
                                 * Force directory hardlinks to go to
                                 * file system for ".." requests.
@@ -2336,12 +2526,12 @@ cache_delete(struct namecache *ncp, int free_entry)
  * purge the entry associated with the
  * specified vnode from the name cache
  */
-void
-cache_purge(vnode_t vp)
+static void
+cache_purge_locked(vnode_t vp, kauth_cred_t *credp)
 {
        struct namecache *ncp;
-       kauth_cred_t tcred = NULL;
 
+       *credp = NULL;
        if ((LIST_FIRST(&vp->v_nclinks) == NULL) &&
            (TAILQ_FIRST(&vp->v_ncchildren) == NULL) &&
            (vp->v_cred == NOCRED) &&
@@ -2349,8 +2539,6 @@ cache_purge(vnode_t vp)
                return;
        }
 
-       NAME_CACHE_LOCK();
-
        if (vp->v_parent) {
                vp->v_parent->v_nc_generation++;
        }
@@ -2366,13 +2554,30 @@ cache_purge(vnode_t vp)
        /*
         * Use a temp variable to avoid kauth_cred_unref() while NAME_CACHE_LOCK is held
         */
-       tcred = vp->v_cred;
+       *credp = vp->v_cred;
        vp->v_cred = NOCRED;
        vp->v_authorized_actions = 0;
+}
+
+void
+cache_purge(vnode_t vp)
+{
+       kauth_cred_t tcred = NULL;
+
+       if ((LIST_FIRST(&vp->v_nclinks) == NULL) &&
+           (TAILQ_FIRST(&vp->v_ncchildren) == NULL) &&
+           (vp->v_cred == NOCRED) &&
+           (vp->v_parent == NULLVP)) {
+               return;
+       }
+
+       NAME_CACHE_LOCK();
+
+       cache_purge_locked(vp, &tcred);
 
        NAME_CACHE_UNLOCK();
 
-       if (IS_VALID_CRED(tcred)) {
+       if (tcred && IS_VALID_CRED(tcred)) {
                kauth_cred_unref(&tcred);
        }
 }
diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c
index 56d36978792efd16c57cb8265ab6b7f7cd33a792..181614fcbf156111948e4acfa2341bab8b4e8f2f 100644 (file)
@@ -5067,7 +5067,7 @@ wait_for_dreads:
                 * vm_pre_fault() will call vm_fault() to enter the page into
                 * the pmap if there isn't _a_ physical page for that VA already.
                 */
-               vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK));
+               vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK), VM_PROT_READ);
        }
 
        if (io_req_size && retval == 0) {
@@ -6897,8 +6897,13 @@ vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
                                modulus_size = DRT_HASH_XLARGE_MODULUS;
                                map_size = DRT_XLARGE_ALLOCATION;
                        } else {
-                               modulus_size = DRT_HASH_LARGE_MODULUS;
-                               map_size = DRT_LARGE_ALLOCATION;
+                               /*
+                                * If the ring is completely full and we can't
+                                * expand, there's nothing useful for us to do.
+                                * Behave as though we had compacted into the new
+                                * array and return.
+                                */
+                               return KERN_SUCCESS;
                        }
                } else {
                        /* already using the xlarge modulus */
diff --git a/bsd/vfs/vfs_conf.c b/bsd/vfs/vfs_conf.c
index d6595dc9629c873f9fa927c18adb07ff46ace839..1d61ed28446c538b29cb2d4d9d88b7c9bbaded40 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -94,7 +94,7 @@ extern  int nfs_mountroot(void);
 extern  struct vfsops afs_vfsops;
 extern  struct vfsops null_vfsops;
 extern  struct vfsops devfs_vfsops;
-extern  struct vfsops routefs_vfsops;
+extern  const struct vfsops routefs_vfsops;
 extern  struct vfsops nullfs_vfsops;
 
 #if MOCKFS
@@ -123,36 +123,149 @@ enum fs_type_num {
 static struct vfstable vfstbllist[] = {
        /* Sun-compatible Network Filesystem */
 #if NFSCLIENT
-       { &nfs_vfsops, "nfs", FT_NFS, 0, 0, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFC_VFSPREFLIGHT | VFC_VFS64BITREADY | VFC_VFSREADDIR_EXTENDED, NULL, 0, NULL},
-#endif
+       {
+               .vfc_vfsops = &nfs_vfsops,
+               .vfc_name = "nfs",
+               .vfc_typenum = FT_NFS,
+               .vfc_refcount = 0,
+               .vfc_flags = 0,
+               .vfc_mountroot = NULL,
+               .vfc_next = NULL,
+               .vfc_reserved1 = 0,
+               .vfc_reserved2 = 0,
+               .vfc_vfsflags = VFC_VFSGENERICARGS | VFC_VFSPREFLIGHT | VFC_VFS64BITREADY | VFC_VFSREADDIR_EXTENDED,
+               .vfc_descptr = NULL,
+               .vfc_descsize = 0,
+               .vfc_sysctl = NULL
+       },
+#endif /* NFSCLIENT */
 
        /* Device Filesystem */
 #if DEVFS
 #if CONFIG_MACF
-       { &devfs_vfsops, "devfs", FT_DEVFS, 0, MNT_MULTILABEL, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFC_VFS64BITREADY, NULL, 0, NULL},
-#else
-       { &devfs_vfsops, "devfs", FT_DEVFS, 0, 0, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFC_VFS64BITREADY, NULL, 0, NULL},
-#endif /* MAC */
-#endif
+       {
+               .vfc_vfsops = &devfs_vfsops,
+               .vfc_name = "devfs",
+               .vfc_typenum = FT_DEVFS,
+               .vfc_refcount = 0,
+               .vfc_flags = MNT_MULTILABEL,
+               .vfc_mountroot = NULL,
+               .vfc_next = NULL,
+               .vfc_reserved1 = 0,
+               .vfc_reserved2 = 0,
+               .vfc_vfsflags = VFC_VFSGENERICARGS | VFC_VFS64BITREADY,
+               .vfc_descptr = NULL,
+               .vfc_descsize = 0,
+               .vfc_sysctl = NULL
+       },
+#else /* !CONFIG_MACF */
+       {
+               .vfc_vfsops = &devfs_vfsops,
+               .vfc_name = "devfs",
+               .vfc_typenum = FT_DEVFS,
+               .vfc_refcount = 0,
+               .vfc_flags = 0,
+               .vfc_mountroot = NULL,
+               .vfc_next = NULL,
+               .vfc_reserved1 = 0,
+               .vfc_reserved2 = 0,
+               .vfc_vfsflags = VFC_VFSGENERICARGS | VFC_VFS64BITREADY,
+               .vfc_descptr = NULL,
+               .vfc_descsize = 0,
+               .vfc_sysctl = NULL
+       },
+#endif /* CONFIG_MACF */
+#endif /* DEVFS */
 
 #ifndef __LP64__
 #endif /* __LP64__ */
 
 #if NULLFS
-       { &nullfs_vfsops, "nullfs", FT_NULLFS, 0, (MNT_DONTBROWSE | MNT_RDONLY), NULL, NULL, 0, 0, VFC_VFS64BITREADY, NULL, 0, NULL},
+       {
+               .vfc_vfsops = &nullfs_vfsops,
+               .vfc_name = "nullfs",
+               .vfc_typenum = FT_NULLFS,
+               .vfc_refcount = 0,
+               .vfc_flags = MNT_DONTBROWSE | MNT_RDONLY,
+               .vfc_mountroot = NULL,
+               .vfc_next = NULL,
+               .vfc_reserved1 = 0,
+               .vfc_reserved2 = 0,
+               .vfc_vfsflags = VFC_VFS64BITREADY,
+               .vfc_descptr = NULL,
+               .vfc_descsize = 0,
+               .vfc_sysctl = NULL
+       },
 #endif /* NULLFS */
 
 #if MOCKFS
        /* If we are configured for it, mockfs should always be the last standard entry (and thus the last FS we attempt mountroot with) */
-       { &mockfs_vfsops, "mockfs", FT_MOCKFS, 0, MNT_LOCAL, mockfs_mountroot, NULL, 0, 0, VFC_VFSGENERICARGS, NULL, 0, NULL},
+       {
+               .vfc_vfsops = &mockfs_vfsops,
+               .vfc_name = "mockfs",
+               .vfc_typenum = FT_MOCKFS,
+               .vfc_refcount = 0,
+               .vfc_flags = MNT_LOCAL,
+               .vfc_mountroot = mockfs_mountroot,
+               .vfc_next = NULL,
+               .vfc_reserved1 = 0,
+               .vfc_reserved2 = 0,
+               .vfc_vfsflags = VFC_VFSGENERICARGS,
+               .vfc_descptr = NULL,
+               .vfc_descsize = 0,
+               .vfc_sysctl = NULL
+       },
 #endif /* MOCKFS */
 
 #if ROUTEFS
        /* If we are configured for it, routefs is appended after the standard entries (it is not a mountroot candidate) */
-       { &routefs_vfsops, "routefs", FT_ROUTEFS, 0, MNT_LOCAL, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFC_VFS64BITREADY, NULL, 0, NULL},
+       {
+               .vfc_vfsops = &routefs_vfsops,
+               .vfc_name = "routefs",
+               .vfc_typenum = FT_ROUTEFS,
+               .vfc_refcount = 0,
+               .vfc_flags = MNT_LOCAL,
+               .vfc_mountroot = NULL,
+               .vfc_next = NULL,
+               .vfc_reserved1 = 0,
+               .vfc_reserved2 = 0,
+               .vfc_vfsflags = VFC_VFSGENERICARGS | VFC_VFS64BITREADY,
+               .vfc_descptr = NULL,
+               .vfc_descsize = 0,
+               .vfc_sysctl = NULL
+       },
 #endif /* ROUTEFS */
-       {NULL, "<unassigned>", 0, 0, 0, NULL, NULL, 0, 0, 0, NULL, 0, NULL},
-       {NULL, "<unassigned>", 0, 0, 0, NULL, NULL, 0, 0, 0, NULL, 0, NULL},
+
+       {
+               .vfc_vfsops = NULL,
+               .vfc_name = "<unassigned>",
+               .vfc_typenum = 0,
+               .vfc_refcount = 0,
+               .vfc_flags = 0,
+               .vfc_mountroot = NULL,
+               .vfc_next = NULL,
+               .vfc_reserved1 = 0,
+               .vfc_reserved2 = 0,
+               .vfc_vfsflags = 0,
+               .vfc_descptr = NULL,
+               .vfc_descsize = 0,
+               .vfc_sysctl = NULL
+       },
+       {
+               .vfc_vfsops = NULL,
+               .vfc_name = "<unassigned>",
+               .vfc_typenum = 0,
+               .vfc_refcount = 0,
+               .vfc_flags = 0,
+               .vfc_mountroot = NULL,
+               .vfc_next = NULL,
+               .vfc_reserved1 = 0,
+               .vfc_reserved2 = 0,
+               .vfc_vfsflags = 0,
+               .vfc_descptr = NULL,
+               .vfc_descsize = 0,
+               .vfc_sysctl = NULL
+       },
 };
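
As a contextual sketch (not part of this change): this table rewrite, like the others in the commit, replaces positional aggregate initializers with C99 designated initializers, which bind by field name and zero-fill omitted members. A minimal illustration with a hypothetical struct:

struct example {
        const char *name;
        int         typenum;
        int         flags;
};

/* positional: silently misassigns values if members are ever reordered */
static struct example ex_pos = { "nfs", 2, 0 };

/* designated: bound by name; .flags is implicitly zero */
static struct example ex_des = { .name = "nfs", .typenum = 2 };
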
 
 /*
@@ -172,32 +285,34 @@ struct vfstable *vfsconf = vfstbllist;
  *
  */
 extern struct vnodeopv_desc mfs_vnodeop_opv_desc;
-extern struct vnodeopv_desc dead_vnodeop_opv_desc;
+extern const struct vnodeopv_desc dead_vnodeop_opv_desc;
 #if FIFO && SOCKETS
-extern struct vnodeopv_desc fifo_vnodeop_opv_desc;
+extern const struct vnodeopv_desc fifo_vnodeop_opv_desc;
 #endif /* SOCKETS */
-extern struct vnodeopv_desc spec_vnodeop_opv_desc;
-extern struct vnodeopv_desc nfsv2_vnodeop_opv_desc;
-extern struct vnodeopv_desc spec_nfsv2nodeop_opv_desc;
-extern struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc;
-extern struct vnodeopv_desc nfsv4_vnodeop_opv_desc;
-extern struct vnodeopv_desc spec_nfsv4nodeop_opv_desc;
-extern struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc;
+extern const struct vnodeopv_desc spec_vnodeop_opv_desc;
+extern const struct vnodeopv_desc nfsv2_vnodeop_opv_desc;
+extern const struct vnodeopv_desc spec_nfsv2nodeop_opv_desc;
+extern const struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc;
+#if CONFIG_NFS4
+extern const struct vnodeopv_desc nfsv4_vnodeop_opv_desc;
+extern const struct vnodeopv_desc spec_nfsv4nodeop_opv_desc;
+extern const struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc;
+#endif
 extern struct vnodeopv_desc null_vnodeop_opv_desc;
 extern struct vnodeopv_desc devfs_vnodeop_opv_desc;
 extern struct vnodeopv_desc devfs_spec_vnodeop_opv_desc;
 #if FDESC
 extern struct vnodeopv_desc devfs_devfd_vnodeop_opv_desc;
-extern struct vnodeopv_desc devfs_fdesc_vnodeop_opv_desc;
+extern const struct vnodeopv_desc devfs_fdesc_vnodeop_opv_desc;
 #endif /* FDESC */
 
 #if MOCKFS
-extern struct vnodeopv_desc mockfs_vnodeop_opv_desc;
+extern const struct vnodeopv_desc mockfs_vnodeop_opv_desc;
 #endif /* MOCKFS */
 
-extern struct vnodeopv_desc nullfs_vnodeop_opv_desc;
+extern const struct vnodeopv_desc nullfs_vnodeop_opv_desc;
 
-struct vnodeopv_desc *vfs_opv_descs[] = {
+const struct vnodeopv_desc *vfs_opv_descs[] = {
        &dead_vnodeop_opv_desc,
 #if FIFO && SOCKETS
        &fifo_vnodeop_opv_desc,
@@ -209,13 +324,17 @@ struct vnodeopv_desc *vfs_opv_descs[] = {
 #if NFSCLIENT
        &nfsv2_vnodeop_opv_desc,
        &spec_nfsv2nodeop_opv_desc,
+#if CONFIG_NFS4
        &nfsv4_vnodeop_opv_desc,
        &spec_nfsv4nodeop_opv_desc,
+#endif
 #if FIFO
        &fifo_nfsv2nodeop_opv_desc,
+#if CONFIG_NFS4
        &fifo_nfsv4nodeop_opv_desc,
-#endif
-#endif
+#endif /* CONFIG_NFS4 */
+#endif /* FIFO */
+#endif /* NFSCLIENT */
 #if DEVFS
        &devfs_vnodeop_opv_desc,
        &devfs_spec_vnodeop_opv_desc,
index 9bfbeab55cb2ec7bc615c14aba4bd60350c95b4a..7df2f287beb907fe2e2c376246f03103d22deee9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2016-2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -219,8 +219,9 @@ disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo)
 
        internal_info = mp->mnt_disk_conditioner_info;
        if (!internal_info) {
-               internal_info = mp->mnt_disk_conditioner_info = kalloc(sizeof(struct _disk_conditioner_info_t));
+               internal_info = kalloc(sizeof(struct _disk_conditioner_info_t));
                bzero(internal_info, sizeof(struct _disk_conditioner_info_t));
+               mp->mnt_disk_conditioner_info = internal_info;
                mnt_fields = &(internal_info->mnt_fields);
 
                /* save mount_t fields for restoration later */
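
The reordering above is subtle but deliberate: the old code published the fresh kalloc() result through mp->mnt_disk_conditioner_info before bzero() ran, leaving a window where the shared pointer referenced uninitialized memory. The new code zeroes first and publishes last. A userspace sketch of the same publish-after-init pattern, with malloc/memset standing in for kalloc/bzero (a truly concurrent publish would also want a release barrier):

    #include <stdlib.h>
    #include <string.h>

    struct info { int enabled; };
    struct mount_like { struct info *shared; };

    static int attach_info(struct mount_like *mp)
    {
        struct info *tmp = malloc(sizeof(*tmp));
        if (tmp == NULL) {
            return -1;
        }
        memset(tmp, 0, sizeof(*tmp)); /* initialize fully... */
        mp->shared = tmp;             /* ...then publish */
        return 0;
    }

    int main(void)
    {
        struct mount_like m = { NULL };
        int rc = attach_info(&m);
        free(m.shared);
        return rc;
    }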
@@ -300,7 +301,10 @@ disk_conditioner_mount_is_ssd(mount_t mp)
        struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info;
 
        if (!internal_info || !internal_info->dcinfo.enabled) {
-               return !!(mp->mnt_kern_flag & MNTK_SSD);
+               if (mp->mnt_kern_flag & MNTK_SSD) {
+                       return TRUE;
+               }
+               return FALSE;
        }
 
        return internal_info->dcinfo.is_ssd;
index 1a6fa384408af401d44664ce94824b4191e98acb..f7916db48c0379a19ebf10116e0e73b5a29f2f8c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -605,7 +605,7 @@ add_fsevent(int type, vfs_context_t ctx, ...)
                        val = 0xbadc0de2;
                }
                // overlay the dest inode number on the str/dest pointer fields
-               memcpy(&cur->str, &val, sizeof(ino64_t));
+               __nochk_memcpy(&cur->str, &val, sizeof(ino64_t));
 
 
                // and last the document-id
@@ -619,7 +619,10 @@ add_fsevent(int type, vfs_context_t ctx, ...)
                }
 
                // the docid is 64-bit and overlays the uid/gid fields
-               memcpy(&cur->uid, &val, sizeof(uint64_t));
+               static_assert(sizeof(cur->uid) + sizeof(cur->gid) == sizeof(val), "gid/uid size mismatch");
+               static_assert(offsetof(struct kfs_event, gid) - offsetof(struct kfs_event, uid) == sizeof(cur->uid), "unexpected struct kfs_event layout");
+               memcpy(&cur->uid, &val, sizeof(cur->uid));
+               memcpy(&cur->gid, (u_int8_t *)&val + sizeof(cur->uid), sizeof(cur->gid));
 
                goto done_with_args;
        }
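
Splitting the single 64-bit memcpy() into two 32-bit copies guarded by static_asserts turns a silent layout assumption into a compile-time contract: the docid overlay is only valid if uid and gid are adjacent and together span exactly 64 bits. A compilable sketch of the same guard, with struct rec standing in for the real kfs_event:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    struct rec {
        uint32_t uid;
        uint32_t gid;
    };

    int main(void)
    {
        uint64_t docid = 0x1122334455667788ULL;
        struct rec r;

        /* Fail the build, not the runtime, if the overlay is invalid. */
        static_assert(sizeof(r.uid) + sizeof(r.gid) == sizeof(docid),
            "uid/gid must together span 64 bits");
        static_assert(offsetof(struct rec, gid) - offsetof(struct rec, uid)
            == sizeof(r.uid), "uid/gid must be adjacent");

        memcpy(&r.uid, &docid, sizeof(r.uid));
        memcpy(&r.gid, (uint8_t *)&docid + sizeof(r.uid), sizeof(r.gid));
        return 0;
    }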
@@ -685,7 +688,7 @@ add_fsevent(int type, vfs_context_t ctx, ...)
                                pathbuff_len = MAXPATHLEN;
 
                                pathbuff[0] = '\0';
-                               if ((ret = vn_getpath(vp, pathbuff, &pathbuff_len)) != 0 || pathbuff[0] == '\0') {
+                               if ((ret = vn_getpath_no_firmlink(vp, pathbuff, &pathbuff_len)) != 0 || pathbuff[0] == '\0') {
                                        cur->flags |= KFSE_CONTAINS_DROPPED_EVENTS;
 
                                        do {
@@ -703,7 +706,7 @@ add_fsevent(int type, vfs_context_t ctx, ...)
                                                }
 
                                                pathbuff_len = MAXPATHLEN;
-                                               ret = vn_getpath(vp, pathbuff, &pathbuff_len);
+                                               ret = vn_getpath_no_firmlink(vp, pathbuff, &pathbuff_len);
                                        } while (ret == ENOSPC);
 
                                        if (ret != 0 || vp == NULL) {
@@ -1621,7 +1624,7 @@ fsevent_unmount(__unused struct mount *mp, __unused vfs_context_t ctx)
 #if CONFIG_EMBEDDED
        dev_t dev = mp->mnt_vfsstat.f_fsid.val[0];
        int error, waitcount = 0;
-       struct timespec ts = {1, 0};
+       struct timespec ts = {.tv_sec = 1, .tv_nsec = 0};
 
        // wait for any other pending unmounts to complete
        lock_watch_table();
@@ -1708,13 +1711,6 @@ fseventsf_read(struct fileproc *fp, struct uio *uio,
 }
 
 
-static int
-fseventsf_write(__unused struct fileproc *fp, __unused struct uio *uio,
-    __unused int flags, __unused vfs_context_t ctx)
-{
-       return EIO;
-}
-
 #pragma pack(push, 4)
 typedef struct fsevent_dev_filter_args32 {
        uint32_t            num_devices;
@@ -1939,11 +1935,12 @@ filt_fsevent_detach(struct knote *kn)
  *      --If hint is revoke, set special flags and activate
  */
 static int
-filt_fsevent(struct knote *kn, long hint)
+filt_fsevent_common(struct knote *kn, struct kevent_qos_s *kev, long hint)
 {
        fsevent_handle *fseh = (struct fsevent_handle *)kn->kn_hook;
        int activate = 0;
        int32_t rd, wr, amt;
+       int64_t data = 0;
 
        if (NOTE_REVOKE == hint) {
                kn->kn_flags |= (EV_EOF | EV_ONESHOT);
@@ -1960,11 +1957,8 @@ filt_fsevent(struct knote *kn, long hint)
 
        switch (kn->kn_filter) {
        case EVFILT_READ:
-               kn->kn_data = amt;
-
-               if (kn->kn_data != 0) {
-                       activate = 1;
-               }
+               data = amt;
+               activate = (data != 0);
                break;
        case EVFILT_VNODE:
                /* Check events this note matches against the hint */
@@ -1975,18 +1969,25 @@ filt_fsevent(struct knote *kn, long hint)
                        activate = 1;
                }
                break;
-       default: {
+       default:
                // nothing to do...
                break;
        }
-       }
 
+       if (activate && kev) {
+               knote_fill_kevent(kn, kev, data);
+       }
        return activate;
 }
 
+static int
+filt_fsevent(struct knote *kn, long hint)
+{
+       return filt_fsevent_common(kn, NULL, hint);
+}
 
 static int
-filt_fsevent_touch(struct knote *kn, struct kevent_internal_s *kev)
+filt_fsevent_touch(struct knote *kn, struct kevent_qos_s *kev)
 {
        int res;
 
@@ -2004,7 +2005,7 @@ filt_fsevent_touch(struct knote *kn, struct kevent_internal_s *kev)
        //kn->kn_fflags &= kev->fflags;
 
        /* determine if the filter is now fired */
-       res = filt_fsevent(kn, 0);
+       res = filt_fsevent_common(kn, NULL, 0);
 
        unlock_watch_table();
 
@@ -2012,23 +2013,16 @@ filt_fsevent_touch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_fsevent_process(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_fsevent_process(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
        int res;
 
        lock_watch_table();
 
-       res = filt_fsevent(kn, 0);
-       if (res) {
-               *kev = kn->kn_kevent;
-               if (kev->flags & EV_CLEAR) {
-                       kn->kn_data = 0;
-                       kn->kn_fflags = 0;
-               }
-       }
+       res = filt_fsevent_common(kn, kev, 0);
 
        unlock_watch_table();
+
        return res;
 }
 
@@ -2042,14 +2036,13 @@ SECURITY_READ_ONLY_EARLY(struct  filterops) fsevent_filtops = {
 };
 
 static int
-fseventsf_kqfilter(__unused struct fileproc *fp, __unused struct knote *kn,
-    __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx)
+fseventsf_kqfilter(struct fileproc *fp, struct knote *kn,
+    __unused struct kevent_qos_s *kev)
 {
        fsevent_handle *fseh = (struct fsevent_handle *)fp->f_fglob->fg_data;
        int res;
 
        kn->kn_hook = (void*)fseh;
-       kn->kn_hookid = 1;
        kn->kn_filtid = EVFILTID_FSEVENT;
 
        lock_watch_table();
@@ -2057,7 +2050,7 @@ fseventsf_kqfilter(__unused struct fileproc *fp, __unused struct knote *kn,
        KNOTE_ATTACH(&fseh->knotes, kn);
 
        /* check to see if it is fired already */
-       res = filt_fsevent(kn, 0);
+       res = filt_fsevent_common(kn, NULL, 0);
 
        unlock_watch_table();
 
@@ -2289,14 +2282,14 @@ fseventswrite(__unused dev_t dev, struct uio *uio, __unused int ioflag)
 
 
 static const struct fileops fsevents_fops = {
-       .fo_type = DTYPE_FSEVENTS,
-       .fo_read = fseventsf_read,
-       .fo_write = fseventsf_write,
-       .fo_ioctl = fseventsf_ioctl,
-       .fo_select = fseventsf_select,
-       .fo_close = fseventsf_close,
+       .fo_type     = DTYPE_FSEVENTS,
+       .fo_read     = fseventsf_read,
+       .fo_write    = fo_no_write,
+       .fo_ioctl    = fseventsf_ioctl,
+       .fo_select   = fseventsf_select,
+       .fo_close    = fseventsf_close,
        .fo_kqfilter = fseventsf_kqfilter,
-       .fo_drain = fseventsf_drain,
+       .fo_drain    = fseventsf_drain,
 };
 
 typedef struct fsevent_clone_args32 {
@@ -2380,12 +2373,26 @@ handle_clone:
                        return error;
                }
 
+               /*
+                * Lock down the user's "fd" result buffer so it's safe
+                * to hold locks while we copy it out.
+                */
+               error = vslock((user_addr_t)fse_clone_args->fd,
+                   sizeof(int32_t));
+               if (error) {
+                       FREE(event_list, M_TEMP);
+                       FREE(fseh, M_TEMP);
+                       return error;
+               }
+
                error = add_watcher(event_list,
                    fse_clone_args->num_events,
                    fse_clone_args->event_queue_depth,
                    &fseh->watcher,
                    fseh);
                if (error) {
+                       vsunlock((user_addr_t)fse_clone_args->fd,
+                           sizeof(int32_t), 0);
                        FREE(event_list, M_TEMP);
                        FREE(fseh, M_TEMP);
                        return error;
@@ -2396,6 +2403,8 @@ handle_clone:
                error = falloc(p, &f, &fd, vfs_context_current());
                if (error) {
                        remove_watcher(fseh->watcher);
+                       vsunlock((user_addr_t)fse_clone_args->fd,
+                           sizeof(int32_t), 0);
                        FREE(event_list, M_TEMP);
                        FREE(fseh, M_TEMP);
                        return error;
@@ -2404,16 +2413,21 @@ handle_clone:
                f->f_fglob->fg_flag = FREAD | FWRITE;
                f->f_fglob->fg_ops = &fsevents_fops;
                f->f_fglob->fg_data = (caddr_t) fseh;
-               proc_fdunlock(p);
+               /*
+                * We can safely hold the proc_fdlock across this copyout()
+                * because of the vslock() call above.  The vslock() call
+                * also ensures that we will never get an error, so assert
+                * this.
+                */
                error = copyout((void *)&fd, fse_clone_args->fd, sizeof(int32_t));
-               if (error != 0) {
-                       fp_free(p, fd, f);
-               } else {
-                       proc_fdlock(p);
-                       procfdtbl_releasefd(p, fd, NULL);
-                       fp_drop(p, fd, f, 1);
-                       proc_fdunlock(p);
-               }
+               assert(error == 0);
+
+               procfdtbl_releasefd(p, fd, NULL);
+               fp_drop(p, fd, f, 1);
+               proc_fdunlock(p);
+
+               vsunlock((user_addr_t)fse_clone_args->fd,
+                   sizeof(int32_t), 1);
                break;
 
        default:
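
The vslock()/copyout()/vsunlock() sequence added above exists so the copyout() can run while proc_fdlock is held: wiring the user's result buffer up front guarantees the copy cannot take a page fault, and therefore cannot sleep or fail, which is why the code asserts error == 0 instead of handling it. A rough userspace analog of the shape of that pattern, with mlock/memcpy standing in for the kernel's vslock/copyout (an illustration, not the kernel API):

    #include <string.h>
    #include <sys/mman.h>

    static int copy_while_locked(void *dst, const void *src, size_t len)
    {
        if (mlock(dst, len) != 0) {
            return -1;          /* could not wire; bail before locking */
        }
        /* ...take a lock that must not be held across a page fault... */
        memcpy(dst, src, len);  /* wired pages: no fault possible */
        /* ...drop the lock... */
        munlock(dst, len);
        return 0;
    }

    int main(void)
    {
        static char dst[64];
        const char msg[] = "wired copy";
        return copy_while_locked(dst, msg, sizeof(msg));
    }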
@@ -2510,6 +2524,7 @@ get_fse_info(struct vnode *vp, fse_info *fse, __unused vfs_context_t ctx)
 
        VATTR_INIT(&va);
        VATTR_WANTED(&va, va_fsid);
+       va.va_vaflags |= VA_REALFSID;
        VATTR_WANTED(&va, va_fileid);
        VATTR_WANTED(&va, va_mode);
        VATTR_WANTED(&va, va_uid);
@@ -2595,7 +2610,7 @@ create_fsevent_from_kevent(vnode_t vp, uint32_t kevents, struct vnode_attr *vap)
        fse.gid = vap->va_gid;
 
        len = sizeof(pathbuf);
-       if (vn_getpath(vp, pathbuf, &len) == 0) {
+       if (vn_getpath_no_firmlink(vp, pathbuf, &len) == 0) {
                add_fsevent(fsevent_type, vfs_context_current(), FSE_ARG_STRING, len, pathbuf, FSE_ARG_FINFO, &fse, FSE_ARG_DONE);
        }
        return;
index cc70d2c24f596470d9961ecfb8741578be1fd9a7..d17cb02bd4ddbcaa101d4596c3131a891794283f 100644 (file)
 
 __private_extern__ void vntblinit(void);
 
-extern struct vnodeopv_desc *vfs_opv_descs[];
+extern const struct vnodeopv_desc *vfs_opv_descs[];
 /* a list of lists of vnodeops defns */
 extern struct vnodeop_desc *vfs_op_descs[];
 /* and the operations they perform */
@@ -150,7 +150,7 @@ vfs_opv_init(void)
        int i, j, k;
        int(***opv_desc_vector_p)(void *);
        int(**opv_desc_vector)(void *);
-       struct vnodeopv_entry_desc *opve_descp;
+       const struct vnodeopv_entry_desc *opve_descp;
 
        /*
         * Allocate the dynamic vectors and fill them in.
@@ -319,8 +319,6 @@ lck_mtx_t *pkg_extensions_lck;
 
 struct mount * dead_mountp;
 
-extern void nspace_handler_init(void);
-
 /*
  * Initialize the vnode structures and initialize each file system type.
  */
@@ -415,8 +413,6 @@ vfsinit(void)
         */
        nchinit();
 
-       nspace_handler_init();
-
        /*
         * Build vnode operation vectors.
         */
@@ -516,6 +512,8 @@ vfsinit(void)
 #if FS_COMPRESSION
        decmpfs_init();
 #endif
+
+       nspace_resolver_init();
 }
 
 void
index 2764ecc730d7fbfefddf4aeaf3e13a7087b155e2..aaaf2fbb1270052804fdafbbc38b2b42d7b80044 100644 (file)
@@ -95,6 +95,8 @@
 #include <security/mac_framework.h>
 #endif
 
+#include <sys/paths.h>
+
 #if NAMEDRSRCFORK
 #include <sys/xattr.h>
 #endif
@@ -631,7 +633,21 @@ lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname
 
        /* Restore the truncated pathname buffer (for audits). */
        if (ndp->ni_pathlen == 1 && ndp->ni_next[0] == '\0') {
-               ndp->ni_next[0] = '/';
+               /*
+                * Although we replaced only '/' with '\0' and would ordinarily
+                * just need to switch that back, the buffer where we did so may
+                * no longer be the current pathname buffer once symlinks are
+                * involved. Restoring just the "/" would leave the string
+                * unterminated, so be safe and restore the entire suffix.
+                */
+               strncpy(ndp->ni_next, _PATH_RSRCFORKSPEC, sizeof(_PATH_RSRCFORKSPEC));
+               cnp->cn_nameptr = ndp->ni_next + 1;
+               cnp->cn_namelen = sizeof(_PATH_RSRCFORKSPEC) - 1;
+               ndp->ni_next += cnp->cn_namelen;
+               if (ndp->ni_next[0] != '\0') {
+                       panic("Incorrect termination of path in %s", __FUNCTION__);
+               }
        }
        cnp->cn_flags  &= ~MAKEENTRY;
 
@@ -1535,6 +1551,7 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx)
        struct componentname *cnp = &ndp->ni_cnd;
        vnode_t dp;
        char *tmppn;
+       u_int rsrclen = (cnp->cn_flags & CN_WANTSRSRCFORK) ? sizeof(_PATH_RSRCFORKSPEC) : 0;
 
        if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
                return ELOOP;
@@ -1577,7 +1594,7 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx)
         * is only 1024.
         */
        linklen = MAXPATHLEN - (u_int)uio_resid(auio);
-       if (linklen + ndp->ni_pathlen > MAXPATHLEN) {
+       if (linklen + ndp->ni_pathlen + rsrclen > MAXPATHLEN) {
                if (need_newpathbuf) {
                        FREE_ZONE(cp, MAXPATHLEN, M_NAMEI);
                }
@@ -1848,7 +1865,7 @@ kdebug_vfs_lookup(long *dbg_parms, int dbg_namelen, void *dp, uint32_t flags)
 
 void
 kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp,
-    boolean_t lookup)
+    bool lookup)
 {
        kdebug_vfs_lookup(dbg_parms, dbg_namelen, dp,
            lookup ? KDBG_VFS_LOOKUP_FLAG_LOOKUP : 0);
@@ -1972,7 +1989,24 @@ vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_
 
        /* Get the target vnode. */
        if (ino == 2) {
-               error = VFS_ROOT(mp, &vp, ctx);
+               struct vfs_attr vfsattr;
+               int use_vfs_root = TRUE;
+
+               VFSATTR_INIT(&vfsattr);
+               VFSATTR_WANTED(&vfsattr, f_capabilities);
+               if (vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
+                   VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
+                       if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
+                           (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
+                               use_vfs_root = FALSE;
+                       }
+               }
+
+               if (use_vfs_root) {
+                       error = VFS_ROOT(mp, &vp, ctx);
+               } else {
+                       error = VFS_VGET(mp, ino, &vp, ctx);
+               }
        } else {
                error = VFS_VGET(mp, ino, &vp, ctx);
        }
index e32310c8e302bae18a07955dcc43cef1b0da61bf..8a3cdcc476c6b4ce55a4f4dedf823f1dee275d6f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -254,9 +254,20 @@ int     ragevnodes = 0;
 #define RAGE_LIMIT_MIN  100
 #define RAGE_TIME_LIMIT 5
 
+/*
+ * ROSV definitions
+ * NOTE: These are shadowed from PlatformSupport definitions, but XNU
+ * builds standalone.
+ */
+#define PLATFORM_DATA_VOLUME_MOUNT_POINT "/System/Volumes/Data"
+#define PLATFORM_VM_VOLUME_MOUNT_POINT "/private/var/vm"
+
+
 struct mntlist mountlist;                       /* mounted filesystem list */
 static int nummounts = 0;
 
+static int print_busy_vnodes = 0;                               /* print out busy vnodes */
+
 #if DIAGNOSTIC
 #define VLISTCHECK(fun, vp, list)       \
        if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
@@ -477,6 +488,7 @@ int
 vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
 {
        vnode_t vp;
+       int ret = 0;
 
        TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
                if (vp->v_type == VDIR) {
@@ -497,18 +509,28 @@ vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
 
                /* Look for busy vnode */
                if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) {
-                       return 1;
+                       ret = 1;
+                       if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
+                               vprint("vnode_umount_preflight - busy vnode", vp);
+                       } else {
+                               return ret;
+                       }
                } else if (vp->v_iocount > 0) {
                        /* Busy if iocount is > 0 for more than 3 seconds */
                        tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", 3 * hz);
                        if (vp->v_iocount > 0) {
-                               return 1;
+                               ret = 1;
+                               if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
+                                       vprint("vnode_umount_preflight - busy vnode", vp);
+                               } else {
+                                       return ret;
+                               }
                        }
                        continue;
                }
        }
 
-       return 0;
+       return ret;
 }
 
 /*
@@ -1259,6 +1281,98 @@ fail:
        return ENODEV;
 }
 
+/*
+ * Mount the data volume of an ROSV volume group
+ */
+int
+vfs_mount_rosv_data(void)
+{
+#if CONFIG_ROSV_STARTUP
+       int error = 0;
+       int do_rosv_mounts = 0;
+
+       error = vnode_get(rootvnode);
+       if (error) {
+               /* root must be mounted first */
+               printf("vnode_get(rootvnode) failed with error %d\n", error);
+               return error;
+       }
+
+       printf("NOTE: Attempting ROSV mount\n");
+       struct vfs_attr vfsattr;
+       VFSATTR_INIT(&vfsattr);
+       VFSATTR_WANTED(&vfsattr, f_capabilities);
+       if (vfs_getattr(rootvnode->v_mount, &vfsattr, vfs_context_kernel()) == 0 &&
+           VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
+               if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
+                   (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
+                       printf("NOTE: DETECTED ROSV CONFIG\n");
+                       do_rosv_mounts = 1;
+               }
+       }
+
+       if (!do_rosv_mounts) {
+               vnode_put(rootvnode);
+               //bail out if config not supported
+               return 0;
+       }
+
+       char datapath[] = PLATFORM_DATA_VOLUME_MOUNT_POINT; /* !const because of internal casting */
+
+       /* Mount the data volume */
+       printf("attempting kernel mount for data volume... \n");
+       error = kernel_mount(rootvnode->v_mount->mnt_vfsstat.f_fstypename, NULLVP, NULLVP,
+           datapath, (rootvnode->v_mount), 0, 0, (KERNEL_MOUNT_DATAVOL), vfs_context_kernel());
+
+       if (error) {
+               printf("Failed to mount data volume (%d)\n", error);
+       }
+
+       vnode_put(rootvnode);
+
+       return error;
+
+#else
+       return 0;
+#endif
+}
+
+/*
+ * Mount the VM volume of a container
+ */
+int
+vfs_mount_vm(void)
+{
+#if CONFIG_MOUNT_VM
+       int error = 0;
+
+       error = vnode_get(rootvnode);
+       if (error) {
+               /* root must be mounted first */
+               printf("vnode_get(rootvnode) failed with error %d\n", error);
+               return error;
+       }
+
+       char vmpath[] = PLATFORM_VM_VOLUME_MOUNT_POINT; /* !const because of internal casting */
+
+       /* Mount the VM volume */
+       printf("attempting kernel mount for vm volume... \n");
+       error = kernel_mount(rootvnode->v_mount->mnt_vfsstat.f_fstypename, NULLVP, NULLVP,
+           vmpath, (rootvnode->v_mount), 0, 0, (KERNEL_MOUNT_VMVOL), vfs_context_kernel());
+
+       if (error) {
+               printf("Failed to mount vm volume (%d)\n", error);
+       } else {
+               printf("mounted VM volume\n");
+       }
+
+       vnode_put(rootvnode);
+       return error;
+#else
+       return 0;
+#endif
+}
+
 /*
  * Lookup a mount point by filesystem identifier.
  */
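
vfs_mount_rosv_data() above, like the earlier vfs_getrealpath() change, uses a two-part capability test: a bit in f_capabilities only counts when the matching bit in the valid mask is also set, because a clear valid bit means "unknown", not "absent". A minimal sketch of that test:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool cap_supported(uint32_t capabilities, uint32_t valid,
        uint32_t bit)
    {
        return (capabilities & bit) && (valid & bit);
    }

    int main(void)
    {
        /* Reported set but not marked valid: treat as unsupported. */
        printf("%d\n", cap_supported(0x1, 0x0, 0x1)); /* prints 0 */
        return 0;
    }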
@@ -2035,9 +2149,6 @@ done:
  * system error). If MNT_FORCE is specified, detach any active vnodes
  * that are found.
  */
-#if DIAGNOSTIC
-int busyprt = 0;        /* print out busy vnodes */
-#endif
 
 int
 vflush(struct mount *mp, struct vnode *skipvp, int flags)
@@ -2047,6 +2158,7 @@ vflush(struct mount *mp, struct vnode *skipvp, int flags)
        int reclaimed = 0;
        int retval;
        unsigned int vid;
+       bool first_try = true;
 
        /*
         * See comments in vnode_iterate() for the rationale for this lock
@@ -2191,11 +2303,12 @@ loop:
                        mount_lock(mp);
                        continue;
                }
-#if DIAGNOSTIC
-               if (busyprt) {
-                       vprint("vflush: busy vnode", vp);
+
+               /* log vnodes blocking unforced unmounts */
+               if (print_busy_vnodes && first_try && ((flags & FORCECLOSE) == 0)) {
+                       vprint("vflush - busy vnode", vp);
                }
-#endif
+
                vnode_unlock(vp);
                mount_lock(mp);
                busy++;
@@ -2206,6 +2319,7 @@ loop:
                busy = 0;
                reclaimed = 0;
                (void)vnode_iterate_reloadq(mp);
+               first_try = false;
                /* returned with mount lock held */
                goto loop;
        }
@@ -2213,6 +2327,7 @@ loop:
        /* if new vnodes were created in between retry the reclaim */
        if (vnode_iterate_reloadq(mp) != 0) {
                if (!(busy && ((flags & FORCECLOSE) == 0))) {
+                       first_try = false;
                        goto loop;
                }
        }
@@ -2367,7 +2482,7 @@ vclean(vnode_t vp, int flags)
        }
 
        // make sure the name & parent ptrs get cleaned out!
-       vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE);
+       vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE | VNODE_UPDATE_PURGEFIRMLINK);
 
        vnode_lock(vp);
 
@@ -2697,8 +2812,9 @@ vprint(const char *label, struct vnode *vp)
        if (label != NULL) {
                printf("%s: ", label);
        }
-       printf("type %s, usecount %d, writecount %d",
-           typename[vp->v_type], vp->v_usecount, vp->v_writecount);
+       printf("name %s type %s, usecount %d, writecount %d\n",
+           vp->v_name, typename[vp->v_type],
+           vp->v_usecount, vp->v_writecount);
        sbuf[0] = '\0';
        if (vp->v_flag & VROOT) {
                strlcat(sbuf, "|VROOT", sizeof(sbuf));
@@ -2719,7 +2835,7 @@ vprint(const char *label, struct vnode *vp)
                strlcat(sbuf, "|VALIASED", sizeof(sbuf));
        }
        if (sbuf[0] != '\0') {
-               printf(" flags (%s)", &sbuf[1]);
+               printf("vnode flags (%s)\n", &sbuf[1]);
        }
 }
 
@@ -2772,6 +2888,29 @@ vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbu
        return build_path_with_parent(vp, dvp, pathbuf, *len, len, 0, vfs_context_current());
 }
 
+int
+vn_getpath_ext(struct vnode *vp, struct vnode *dvp, char *pathbuf, int *len, int flags)
+{
+       int bpflags = (flags & VN_GETPATH_FSENTER) ? 0 : BUILDPATH_NO_FS_ENTER;
+
+       if (flags && (flags != VN_GETPATH_FSENTER)) {
+               if (flags & VN_GETPATH_NO_FIRMLINK) {
+                       bpflags |= BUILDPATH_NO_FIRMLINK;
+               }
+               if (flags & VN_GETPATH_VOLUME_RELATIVE) {
+                       bpflags |= (BUILDPATH_VOLUME_RELATIVE | BUILDPATH_NO_FIRMLINK);
+               }
+       }
+
+       return build_path_with_parent(vp, dvp, pathbuf, *len, len, bpflags, vfs_context_current());
+}
+
+int
+vn_getpath_no_firmlink(struct vnode *vp, char *pathbuf, int *len)
+{
+       return vn_getpath_ext(vp, NULLVP, pathbuf, len, VN_GETPATH_NO_FIRMLINK);
+}
+
 int
 vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash)
 {
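
vn_getpath_ext() folds its public flags into build_path() flags with two quirks worth noting: VN_GETPATH_FSENTER works by omission (its absence adds BUILDPATH_NO_FS_ENTER), and VN_GETPATH_VOLUME_RELATIVE implies BUILDPATH_NO_FIRMLINK. A self-contained sketch of that mapping; the flag values here are illustrative, not the kernel's:

    #include <stdio.h>

    /* Illustrative values only. */
    #define VN_GETPATH_FSENTER         0x1
    #define VN_GETPATH_NO_FIRMLINK     0x2
    #define VN_GETPATH_VOLUME_RELATIVE 0x4

    #define BUILDPATH_NO_FS_ENTER      0x10
    #define BUILDPATH_NO_FIRMLINK      0x20
    #define BUILDPATH_VOLUME_RELATIVE  0x40

    static int to_bpflags(int flags)
    {
        int bp = (flags & VN_GETPATH_FSENTER) ? 0 : BUILDPATH_NO_FS_ENTER;

        if (flags & VN_GETPATH_NO_FIRMLINK) {
            bp |= BUILDPATH_NO_FIRMLINK;
        }
        if (flags & VN_GETPATH_VOLUME_RELATIVE) {
            bp |= (BUILDPATH_VOLUME_RELATIVE | BUILDPATH_NO_FIRMLINK);
        }
        return bp;
    }

    int main(void)
    {
        printf("0x%x\n", to_bpflags(VN_GETPATH_VOLUME_RELATIVE));
        return 0;
    }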
@@ -3260,6 +3399,7 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp)
        u_int32_t blksize;
        u_int64_t temp;
        u_int32_t features;
+       u_int64_t location = 0;
        vfs_context_t ctx = vfs_context_current();
        dk_corestorage_info_t cs_info;
        boolean_t cs_present = FALSE;
@@ -3497,6 +3637,16 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp)
                }
        }
 
+       if (VNOP_IOCTL(devvp, DKIOCGETLOCATION, (caddr_t)&location, 0, ctx) == 0) {
+               if (location & DK_LOCATION_EXTERNAL) {
+                       mp->mnt_ioflags |= MNT_IOFLAGS_PERIPHERAL_DRIVE;
+                       /* This must be called after MNTK_VIRTUALDEV has been determined via DKIOCISVIRTUAL */
+                       if ((MNTK_VIRTUALDEV & mp->mnt_kern_flag)) {
+                               mp->mnt_flag |= MNT_REMOVABLE;
+                       }
+               }
+       }
+
 #if CONFIG_IOSCHED
        if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) {
                mp->mnt_ioflags |= MNT_IOFLAGS_IOSCHED_SUPPORTED;
@@ -3859,11 +4009,11 @@ out:
        return error;
 }
 
-static int      filt_fsattach(struct knote *kn, struct kevent_internal_s *kev);
+static int      filt_fsattach(struct knote *kn, struct kevent_qos_s *kev);
 static void     filt_fsdetach(struct knote *kn);
 static int      filt_fsevent(struct knote *kn, long hint);
-static int      filt_fstouch(struct knote *kn, struct kevent_internal_s *kev);
-static int      filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int      filt_fstouch(struct knote *kn, struct kevent_qos_s *kev);
+static int      filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev);
 SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = {
        .f_attach = filt_fsattach,
        .f_detach = filt_fsdetach,
@@ -3873,8 +4023,11 @@ SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = {
 };
 
 static int
-filt_fsattach(struct knote *kn, __unused struct kevent_internal_s *kev)
+filt_fsattach(struct knote *kn, __unused struct kevent_qos_s *kev)
 {
+       kn->kn_flags |= EV_CLEAR; /* automatic */
+       kn->kn_sdata = 0;         /* incoming data is ignored */
+
        lck_mtx_lock(fs_klist_lock);
        KNOTE_ATTACH(&fs_klist, kn);
        lck_mtx_unlock(fs_klist_lock);
@@ -3910,7 +4063,7 @@ filt_fsevent(struct knote *kn, long hint)
 }
 
 static int
-filt_fstouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_fstouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        int res;
 
@@ -3936,18 +4089,14 @@ filt_fstouch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
-       int res;
+       int res = 0;
 
        lck_mtx_lock(fs_klist_lock);
-       res = (kn->kn_fflags != 0);
-       if (res) {
-               *kev = kn->kn_kevent;
-               kn->kn_flags |= EV_CLEAR; /* automatic */
-               kn->kn_fflags = 0;
-               kn->kn_data = 0;
+       if (kn->kn_fflags) {
+               knote_fill_kevent(kn, kev, 0);
+               res = 1;
        }
        lck_mtx_unlock(fs_klist_lock);
        return res;
@@ -4062,6 +4211,12 @@ SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &s
 SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
     CTLFLAG_RD | CTLFLAG_LOCKED,
     sysctl_vfs_generic_conf, "");
+#if DEVELOPMENT || DEBUG
+SYSCTL_INT(_vfs_generic, OID_AUTO, print_busy_vnodes,
+    CTLTYPE_INT | CTLFLAG_RW,
+    &print_busy_vnodes, 0,
+    "VFS log busy vnodes blocking unmount");
+#endif
 
 /* Indicate that the root file system unmounted cleanly */
 static int vfs_root_unmounted_cleanly = 0;
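
Because the new OID hangs off _vfs_generic, the knob should surface as vfs.generic.print_busy_vnodes on DEVELOPMENT and DEBUG kernels; the string below is inferred from that declaration. A userspace sketch of enabling it:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/sysctl.h>

    int main(void)
    {
        int on = 1;
        /* Logs busy vnodes blocking unforced unmounts via vprint(). */
        if (sysctlbyname("vfs.generic.print_busy_vnodes",
                NULL, NULL, &on, sizeof(on)) != 0) {
            perror("sysctlbyname");
            return 1;
        }
        return 0;
    }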
@@ -4518,7 +4673,7 @@ steal_this_vp:
         */
        assert((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
        assert((vp->v_lflag & VL_LABEL) != VL_LABEL);
-       if (vp->v_lflag & VL_LABELED) {
+       if (vp->v_lflag & VL_LABELED || vp->v_label != NULL) {
                vnode_lock_convert(vp);
                mac_vnode_label_recycle(vp);
        } else if (mac_vnode_label_init_needed(vp)) {
@@ -4987,6 +5142,13 @@ vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
 
        vn_clearunionwait(vp, 1);
 
+       if (vnode_istty(vp) && (flags & REVOKEALL) && vp->v_usecount &&
+           (vp->v_iocount > 1)) {
+               vnode_unlock(vp);
+               VNOP_IOCTL(vp, TIOCREVOKE, (caddr_t)NULL, 0, vfs_context_kernel());
+               vnode_lock(vp);
+       }
+
        vnode_drain(vp);
 
        isfifo = (vp->v_type == VFIFO);
@@ -5179,6 +5341,11 @@ vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp,
        record_vp(vp, 1);
 #endif
 
+#if CONFIG_FIRMLINKS
+       vp->v_fmlink = NULLVP;
+#endif
+       vp->v_flag &= ~VFMLINKTARGET;
+
 #if CONFIG_TRIGGERS
        /*
         * For trigger vnodes, attach trigger info to vnode
@@ -5462,6 +5629,7 @@ vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg)
        void * allocmem;
        int indx_start, indx_stop, indx_incr;
        int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF);
+       int noskip_unmount = (flags & VFS_ITERATE_NOSKIP_UNMOUNT);
 
        count = mount_getvfscnt();
        count += 10;
@@ -5493,7 +5661,8 @@ vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg)
                        continue;
                }
                mount_lock(mp);
-               if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
+               if ((mp->mnt_lflag & MNT_LDEAD) ||
+                   (!noskip_unmount && (mp->mnt_lflag & MNT_LUNMOUNT))) {
                        mount_unlock(mp);
                        mount_iterdrop(mp);
                        continue;
@@ -5721,7 +5890,8 @@ out:
 }
 
 errno_t
-vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
+vnode_lookupat(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx,
+    vnode_t start_dvp)
 {
        struct nameidata nd;
        int error;
@@ -5749,15 +5919,29 @@ vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
        NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE,
            CAST_USER_ADDR_T(path), ctx);
 
+       if (start_dvp && (path[0] != '/')) {
+               nd.ni_dvp = start_dvp;
+               nd.ni_cnd.cn_flags |= USEDVP;
+       }
+
        if ((error = namei(&nd))) {
                return error;
        }
+
+       nd.ni_cnd.cn_flags &= ~USEDVP;
+
        *vpp = nd.ni_vp;
        nameidone(&nd);
 
        return 0;
 }
 
+errno_t
+vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
+{
+       return vnode_lookupat(path, flags, vpp, ctx, NULLVP);
+}
+
 errno_t
 vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx)
 {
@@ -7673,7 +7857,7 @@ vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, i
                        /* check for no-EA filesystems */
                        if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
                            (vfs_flags(mp) & MNT_NOUSERXATTR)) {
-                               KAUTH_DEBUG("%p    DENIED - filesystem disallowed extended attributes", vp);
+                               KAUTH_DEBUG("%p    DENIED - filesystem disallowed extended attributes", vap);
                                error = EACCES;  /* User attributes disabled */
                                goto out;
                        }
@@ -7694,7 +7878,7 @@ vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, i
                        }
                }
                if ((error = vnode_immutable(vap, append, ignore)) != 0) {
-                       KAUTH_DEBUG("%p    DENIED - file is immutable", vp);
+                       KAUTH_DEBUG("%p    DENIED - file is immutable", vap);
                        goto out;
                }
        }
@@ -7954,14 +8138,14 @@ vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp,
                    VATTR_IS_SUPPORTED(vcp->vap, va_mode) &&
                    !(vcp->vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
                        result = EPERM;
-                       KAUTH_DEBUG("%p    DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode);
+                       KAUTH_DEBUG("%p    DENIED - root execute requires at least one x bit in 0x%x", vcp, vcp->vap->va_mode);
                        goto out;
                }
 
                /* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */
                *found_deny = TRUE;
 
-               KAUTH_DEBUG("%p    ALLOWED - caller is superuser", vp);
+               KAUTH_DEBUG("%p    ALLOWED - caller is superuser", vcp);
        }
 out:
        return result;
@@ -8454,6 +8638,7 @@ vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uin
 
 
        if (VATTR_IS_ACTIVE(vap, va_flags)) {
+               vap->va_flags &= ~SF_SYNTHETIC;
                if (has_priv_suser) {
                        if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) {
                                error = EPERM;
@@ -8814,6 +8999,8 @@ vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_
         */
        if (VATTR_IS_ACTIVE(vap, va_flags)) {
                /* compute changing flags bits */
+               vap->va_flags &= ~SF_SYNTHETIC;
+               ova.va_flags &= ~SF_SYNTHETIC;
                if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
                        fdelta = vap->va_flags ^ ova.va_flags;
                } else {
@@ -9040,12 +9227,12 @@ no_guuid_change:
                }
 
                /* chown always clears setuid/gid bits. An exception is made for
-                * setattrlist executed by a root process to set <uid, gid, mode> on a file:
+                * setattrlist, which can set <uid, gid, mode> on a file at the same time:
                 * setattrlist is allowed to set the new mode on the file and change (chown)
                 * uid/gid.
                 */
                if (newmode & (S_ISUID | S_ISGID)) {
-                       if (!VATTR_IS_ACTIVE(vap, va_mode) || !has_priv_suser) {
+                       if (!VATTR_IS_ACTIVE(vap, va_mode)) {
                                KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o",
                                    newmode, newmode & ~(S_ISUID | S_ISGID));
                                newmode &= ~(S_ISUID | S_ISGID);
@@ -9195,6 +9382,59 @@ vn_clearunionwait(vnode_t vp, int locked)
        }
 }
 
+int
+vnode_materialize_dataless_file(vnode_t vp, uint64_t op_type)
+{
+       int error;
+
+       /* Swap files are special; ignore them */
+       if (vnode_isswap(vp)) {
+               return 0;
+       }
+
+       error = resolve_nspace_item(vp,
+           op_type | NAMESPACE_HANDLER_NSPACE_EVENT);
+
+       /*
+        * The file resolver owns the logic about what error to return
+        * to the caller.  We only need to handle a couple of special
+        * cases here:
+        */
+       if (error == EJUSTRETURN) {
+               /*
+                * The requesting process is allowed to interact with
+                * dataless objects.  Make a couple of sanity-checks
+                * here to ensure the action makes sense.
+                */
+               switch (op_type) {
+               case NAMESPACE_HANDLER_WRITE_OP:
+               case NAMESPACE_HANDLER_TRUNCATE_OP:
+               case NAMESPACE_HANDLER_RENAME_OP:
+                       /*
+                        * This handles the case of the resolver itself
+                        * writing data to the file (or throwing it
+                        * away).
+                        */
+                       error = 0;
+                       break;
+               case NAMESPACE_HANDLER_READ_OP:
+                       /*
+                        * This handles the case of the resolver needing
+                        * to look up inside of a dataless directory while
+                        * it's in the process of materializing it (for
+                        * example, creating files or directories).
+                        */
+                       error = (vnode_vtype(vp) == VDIR) ? 0 : EBADF;
+                       break;
+               default:
+                       error = EBADF;
+                       break;
+               }
+       }
+
+       return error;
+}
+
 /*
  * Removes orphaned apple double files during a rmdir
  * Works by:
@@ -9233,6 +9473,15 @@ rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int * restart_f
                return error;
        }
 
+       /*
+        * Prevent dataless fault materialization while we have
+        * a suspended vnode.
+        */
+       uthread_t ut = get_bsdthread_info(current_thread());
+       bool saved_nodatalessfaults =
+           (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? true : false;
+       ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
+
        /*
         * set up UIO
         */
@@ -9411,8 +9660,11 @@ outsc:
        }
        FREE(rbuf, M_TEMP);
 
-       vnode_resume(vp);
+       if (saved_nodatalessfaults == false) {
+               ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
+       }
 
+       vnode_resume(vp);
 
        return error;
 }
@@ -9883,9 +10135,16 @@ vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx)
        lck_mtx_unlock(&rp->vr_lock);
 
 #if CONFIG_MACF
-       int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
-       if (rv != 0) {
-               return rv;
+       if ((rp->vr_flags & VNT_KERN_RESOLVE) == 0) {
+               /*
+                * VNT_KERN_RESOLVE indicates this trigger takes no parameters
+                * from the accessing process other than the act of access
+                * itself. All other triggers must be checked.
+                */
+               int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
+               if (rv != 0) {
+                       return rv;
+               }
        }
 #endif
 
index c9dc444b94886c4aae49a1387f437f7e55a001a4..838ad8c12170b4c506e444675b85292670a831a5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1995-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1995-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -94,6 +94,7 @@
 #include <sys/fsevents.h>
 #include <sys/imgsrc.h>
 #include <sys/sysproto.h>
+#include <sys/sysctl.h>
 #include <sys/xattr.h>
 #include <sys/fcntl.h>
 #include <sys/fsctl.h>
 #include <sys/clonefile.h>
 #include <sys/snapshot.h>
 #include <sys/priv.h>
+#include <sys/fsgetpath.h>
 #include <machine/cons.h>
 #include <machine/limits.h>
 #include <miscfs/specfs/specdev.h>
 #include <pexpert/pexpert.h>
 #include <IOKit/IOBSD.h>
 
+// deps for MIG call
+#include <kern/host.h>
+#include <kern/ipc_misc.h>
+#include <mach/host_priv.h>
+#include <mach/vfs_nspace.h>
+#include <os/log.h>
+
 #if ROUTEFS
 #include <miscfs/routefs/routefs.h>
 #endif /* ROUTEFS */
@@ -177,8 +186,6 @@ static int sync_callback(mount_t, void *);
 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
     boolean_t partial_copy);
-static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
-    user_addr_t bufp);
 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
     struct componentname *cnp, user_addr_t fsmountargs,
@@ -202,9 +209,10 @@ struct fd_vn_data * fg_vn_data_alloc(void);
  */
 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
 
-static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
+static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
+    int unlink_flags);
 
-static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
+static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
 
 #ifdef CONFIG_IMGSRC_ACCESS
 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
@@ -215,6 +223,11 @@ static void mount_end_update(mount_t mp);
 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
 #endif /* CONFIG_IMGSRC_ACCESS */
 
+#if CONFIG_LOCKERBOOT
+int mount_locker_protoboot(const char *fsname, const char *mntpoint,
+    const char *pbdevpath);
+#endif
+
 //snapshot functions
 #if CONFIG_MNT_ROOTSNAP
 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
@@ -267,7 +280,7 @@ vfs_iskernelmount(mount_t mp)
 __private_extern__
 int
 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
-    void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
+    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
 {
        struct nameidata nd;
        boolean_t did_namei;
@@ -282,6 +295,9 @@ kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
        if (vp == NULLVP) {
                error = namei(&nd);
                if (error) {
+                       if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VMVOL | KERNEL_MOUNT_DATAVOL)) {
+                               printf("failed to locate mount-on path: %s\n", path);
+                       }
                        return error;
                }
                vp = nd.ni_vp;
@@ -615,6 +631,22 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
        boolean_t did_rele = FALSE;
        boolean_t have_usecount = FALSE;
 
+#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM
+       /* Check for mutually-exclusive flag bits */
+       uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL));
+       int bitcount = 0;
+       while (checkflags != 0) {
+               checkflags &= (checkflags - 1);
+               bitcount++;
+       }
+
+       if (bitcount > 1) {
+               //not allowed to request multiple mount-by-role flags
+               error = EINVAL;
+               goto out1;
+       }
+#endif
+
        /*
         * Process an update for an existing mount
         */
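
The checkflags loop above is Kernighan's population count: x &= (x - 1) clears the lowest set bit, so the loop body runs once per set bit, and bitcount > 1 means the caller requested more than one of the mutually-exclusive mount-by-role flags. The same idea, standalone:

    #include <stdint.h>
    #include <stdio.h>

    static int popcount32(uint32_t x)
    {
        int bits = 0;
        while (x != 0) {
            x &= (x - 1); /* clear the lowest set bit */
            bits++;
        }
        return bits;
    }

    int main(void)
    {
        /* Two role bits set at once: would be rejected with EINVAL. */
        printf("%d\n", popcount32(0x3)); /* prints 2 */
        return 0;
    }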
@@ -655,6 +687,16 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
                        goto out1;
                }
 
+               /*
+                * can't turn off MNT_REMOVABLE either but it may be an unexpected
+                * failure to return an error for this so we'll just silently
+                * add it if it is not passed in.
+                */
+               if ((mp->mnt_flag & MNT_REMOVABLE) &&
+                   ((flags & MNT_REMOVABLE) == 0)) {
+                       flags |= MNT_REMOVABLE;
+               }
+
 #ifdef CONFIG_IMGSRC_ACCESS
                /* Can't downgrade the backer of the root FS */
                if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
@@ -696,7 +738,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
 
                vfsp = mp->mnt_vtable;
                goto update;
-       }
+       } // MNT_UPDATE
 
        /*
         * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
@@ -726,9 +768,11 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
        }
 
        /*
-        * VFC_VFSLOCALARGS is not currently supported for kernel mounts
+        * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
+        * except in ROSV configs.
         */
-       if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
+       if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
+           ((internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL)) == 0)) {
                error = EINVAL;  /* unsupported request */
                goto out1;
        }
@@ -770,7 +814,13 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
        //mp->mnt_stat.f_type = vfsp->vfc_typenum;
        mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
        strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
-       strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
+       do {
+               int pathlen = MAXPATHLEN;
+
+               if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
+                       strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
+               }
+       } while (0);
        mp->mnt_vnodecovered = vp;
        mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
        mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
@@ -807,7 +857,7 @@ update:
        mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
            MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
            MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
-           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
+           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
            MNT_QUARANTINE | MNT_CPROTECT);
 
 #if SECURE_KERNEL
@@ -824,7 +874,7 @@ update:
        mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
            MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
            MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
-           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
+           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
            MNT_QUARANTINE | MNT_CPROTECT);
 
 #if CONFIG_MACF
@@ -840,7 +890,8 @@ update:
         * Process device path for local file systems if requested
         */
        if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
-           !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) {
+           !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL))) {
+               //snapshot, vm, datavolume mounts are special
                if (vfs_context_is64bit(ctx)) {
                        if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
                                goto out1;
@@ -969,7 +1020,8 @@ update:
                                goto out2;
                        }
                }
-       }
+       } // localargs && !(snapshot | data | vm)
+
 #if CONFIG_MACF
        if ((flags & MNT_UPDATE) == 0) {
                mac_mount_label_init(mp);
@@ -985,11 +1037,73 @@ update:
        }
 #endif
        /*
-        * Mount the filesystem.
+        * Mount the filesystem.  We already asserted that internal_flags
+        * cannot have more than one mount-by-role bit set.
         */
        if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
                error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
                    (caddr_t)fsmountargs, 0, ctx);
+       } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
+#if CONFIG_ROSV_STARTUP
+               struct mount *origin_mp = (struct mount*)fsmountargs;
+               fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
+               error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
+               if (error) {
+                       printf("MOUNT-BY-ROLE (%d) failed! (%d)\n", VFS_DATA_ROLE, error);
+               } else {
+                       /* Mark volume associated with system volume */
+                       mp->mnt_kern_flag |= MNTK_SYSTEM;
+
+                       /* Attempt to acquire the mnt_devvp and set it up */
+                       struct vnode *mp_devvp = NULL;
+                       if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
+                               errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
+                                   0, &mp_devvp, vfs_context_kernel());
+                               if (!lerr) {
+                                       mp->mnt_devvp = mp_devvp;
+                                       //vnode_lookup took an iocount, need to drop it.
+                                       vnode_put(mp_devvp);
+                                       // now set `device_vnode` to the devvp that was acquired.
+                                       // this is needed in order to ensure vfs_init_io_attributes is invoked.
+                                       // note that though the iocount above was dropped, the mount acquires
+                                       // an implicit reference against the device.
+                                       device_vnode = mp_devvp;
+                               }
+                       }
+               }
+#else
+               error = EINVAL;
+#endif
+       } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
+#if CONFIG_MOUNT_VM
+               struct mount *origin_mp = (struct mount*)fsmountargs;
+               fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
+               error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
+               if (error) {
+                       printf("MOUNT-BY-ROLE (%d) failed! (%d)\n", VFS_VM_ROLE, error);
+               } else {
+                       /* Mark volume associated with system volume and a swap mount */
+                       mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
+                       /* Attempt to acquire the mnt_devvp and set it up */
+                       struct vnode *mp_devvp = NULL;
+                       if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
+                               errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
+                                   0, &mp_devvp, vfs_context_kernel());
+                               if (!lerr) {
+                                       mp->mnt_devvp = mp_devvp;
+                                       //vnode_lookup took an iocount, need to drop it.
+                                       vnode_put(mp_devvp);
+
+                                       // now set `device_vnode` to the devvp that was acquired.
+                                       // note that though the iocount above was dropped, the mount acquires
+                                       // an implicit reference against the device.
+                                       device_vnode = mp_devvp;
+                               }
+                       }
+               }
+#else
+               error = EINVAL;
+#endif
        } else {
                error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
        }
@@ -1019,6 +1133,11 @@ update:
        if (error == 0) {
                struct vfs_attr vfsattr;
 #if CONFIG_MACF
+               error = mac_mount_check_mount_late(ctx, mp);
+               if (error != 0) {
+                       goto out3;
+               }
+
                if (vfs_flags(mp) & MNT_MULTILABEL) {
                        error = VFS_ROOT(mp, &rvp, ctx);
                        if (error) {
@@ -1310,8 +1429,10 @@ out:
 
 #if CONFIG_IMGSRC_ACCESS
 
-#if DEBUG
-#define IMGSRC_DEBUG(args...) printf(args)
+#define DEBUG_IMGSRC 0
+
+#if DEBUG_IMGSRC
+#define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
 #else
 #define IMGSRC_DEBUG(args...) do { } while(0)
 #endif
@@ -1323,8 +1444,13 @@ authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_
        vnode_t vp, realdevvp;
        mode_t accessmode;
        int error;
+       enum uio_seg uio = UIO_USERSPACE;
+
+       if (ctx == vfs_context_kernel()) {
+               uio = UIO_SYSSPACE;
+       }
 
-       NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
+       NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
        if ((error = namei(&nd))) {
                IMGSRC_DEBUG("namei() failed with %d\n", error);
                return error;
@@ -1378,8 +1504,10 @@ authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_
 
 out1:
        vnode_put(realdevvp);
+
 out:
        nameidone(&nd);
+
        if (error) {
                vnode_put(vp);
        }
@@ -1398,6 +1526,9 @@ place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
 
        mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
 
+       IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
+           mp->mnt_vtable->vfc_name, vnode_getname(vp));
+
        vnode_lock_spin(vp);
        CLR(vp->v_flag, VMOUNT);
        vp->v_mountedhere = mp;
@@ -1518,18 +1649,18 @@ get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
 }
 
 static int
-relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
-    const char *fsname, vfs_context_t ctx,
+relocate_imageboot_source(vnode_t pvp, vnode_t vp,
+    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
     boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
 {
        int error;
        mount_t mp;
        boolean_t placed = FALSE;
-       vnode_t devvp = NULLVP;
        struct vfstable *vfsp;
        user_addr_t devpath;
        char *old_mntonname;
        vnode_t rvp;
+       vnode_t devvp;
        uint32_t height;
        uint32_t flags;
 
@@ -1601,11 +1732,11 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
 
        error = get_imgsrc_rootvnode(height, &rvp);
        if (error != 0) {
-               IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
+               IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
                return error;
        }
 
-       IMGSRC_DEBUG("got root vnode.\n");
+       IMGSRC_DEBUG("got old root vnode\n");
 
        MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
 
@@ -1617,6 +1748,7 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
                goto out0;
        }
 
+       IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
        IMGSRC_DEBUG("Starting update.\n");
 
        /* Get exclusive rwlock on mount, authorize update on mp */
@@ -1635,7 +1767,6 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
                goto out1;
        }
 
-
        IMGSRC_DEBUG("Preparing coveredvp.\n");
 
        /* Mark covered vnode as mount in progress, authorize placing mount on top */
@@ -1650,7 +1781,8 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
        /* Sanity-check the name the caller has provided */
        vfsp = mp->mnt_vtable;
        if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
-               IMGSRC_DEBUG("Wrong fs name.\n");
+               IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
+                   vfsp->vfc_name, fsname);
                error = EINVAL;
                goto out2;
        }
@@ -1737,6 +1869,59 @@ out0:
        return error;
 }
 
+#if CONFIG_LOCKERBOOT
+__private_extern__
+int
+mount_locker_protoboot(const char *fsname, const char *mntpoint,
+    const char *pbdevpath)
+{
+       int error = -1;
+       struct nameidata nd;
+       boolean_t cleanup_nd = FALSE;
+       vfs_context_t ctx = vfs_context_kernel();
+       boolean_t is64 = TRUE;
+       boolean_t by_index = TRUE;
+       struct user64_mnt_imgsrc_args mia64 = {
+               .mi_height = 0,
+               .mi_flags = 0,
+               .mi_devpath = CAST_USER_ADDR_T(pbdevpath),
+       };
+       user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);
+
+       NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
+           UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
+       error = namei(&nd);
+       if (error) {
+               IMGSRC_DEBUG("namei: %d\n", error);
+               goto out;
+       }
+
+       cleanup_nd = TRUE;
+       error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
+           &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);
+
+out:
+       if (cleanup_nd) {
+               int stashed = error;
+
+               error = vnode_put(nd.ni_vp);
+               if (error) {
+                       panic("vnode_put() returned non-zero: %d", error);
+               }
+
+               if (nd.ni_dvp) {
+                       error = vnode_put(nd.ni_dvp);
+                       if (error) {
+                               panic("vnode_put() returned non-zero: %d", error);
+                       }
+               }
+               nameidone(&nd);
+
+               error = stashed;
+       }
+       return error;
+}
+#endif /* CONFIG_LOCKERBOOT */
 #endif /* CONFIG_IMGSRC_ACCESS */
 
 void
@@ -1966,10 +2151,10 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
                }
        }
        /*
-        * Don't allow unmounting the root file system.
+        * Don't allow unmounting the root file system (or the associated VM or DATA mounts).
         */
-       if (mp->mnt_flag & MNT_ROOTFS) {
-               error = EBUSY; /* the root is always busy */
+       if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
+               error = EBUSY; /* the root (or associated volumes) is always busy */
                goto out;
        }
 
@@ -2089,9 +2274,6 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
                }
        }
 
-       /* free disk_conditioner_info structure for this mount */
-       disk_conditioner_unmount(mp);
-
        IOBSDMountChange(mp, kIOMountChangeUnmount);
 
 #if CONFIG_TRIGGERS
@@ -2183,6 +2365,10 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
                wakeup((caddr_t)mp);
        }
        mount_refdrain(mp);
+
+       /* free disk_conditioner_info structure for this mount */
+       disk_conditioner_unmount(mp);
+
 out:
        if (mp->mnt_lflag & MNT_LWAIT) {
                mp->mnt_lflag &= ~MNT_LWAIT;
@@ -2376,14 +2562,44 @@ int syncprt = 0;
 
 int print_vmpage_stat = 0;
 
+/*
+ * sync_callback:      simple wrapper that calls VFS_SYNC() on volumes
+ *                     mounted read-write with the passed waitfor value.
+ *
+ * Parameters: mp      mount-point descriptor for a mounted file-system instance.
+ *             arg     user argument (see below)
+ *
+ * The user argument is a pointer to a 32-bit unsigned integer that specifies
+ * the waitfor value to pass to VFS_SYNC().  If the argument is NULL,
+ * VFS_SYNC() is called with MNT_NOWAIT as the default waitfor value.
+ *
+ * Returns:            VFS_RETURNED
+ */
 static int
-sync_callback(mount_t mp, __unused void *arg)
+sync_callback(mount_t mp, void *arg)
 {
        if ((mp->mnt_flag & MNT_RDONLY) == 0) {
                int asyncflag = mp->mnt_flag & MNT_ASYNC;
+               unsigned waitfor = MNT_NOWAIT;
+
+               if (arg) {
+                       waitfor = *(uint32_t*)arg;
+               }
+
+               /* Sanity-check the flags: these are the only valid combinations of the waitfor bits */
+               if (waitfor != MNT_WAIT &&
+                   waitfor != (MNT_WAIT | MNT_VOLUME) &&
+                   waitfor != MNT_NOWAIT &&
+                   waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
+                   waitfor != MNT_DWAIT &&
+                   waitfor != (MNT_DWAIT | MNT_VOLUME)) {
+                       panic("Passed inappropriate waitfor %u to "
+                           "sync_callback()", waitfor);
+               }
 
                mp->mnt_flag &= ~MNT_ASYNC;
-               VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
+               (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
                if (asyncflag) {
                        mp->mnt_flag |= MNT_ASYNC;
                }
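For context, a sketch of how a kernel caller could drive this callback via vfs_iterate(), in the way sync(2) does (the exact call sites appear elsewhere in this file):

    uint32_t waitfor = MNT_WAIT;
    vfs_iterate(0, sync_callback, &waitfor);   /* synchronous: wait for I/O */
    vfs_iterate(0, sync_callback, NULL);       /* NULL arg defaults to MNT_NOWAIT */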
@@ -2426,7 +2642,7 @@ sync_internal_callback(mount_t mp, void *arg)
 
                if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
                        return VFS_RETURNED;
-               } else if ((sync_type = SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
+               } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
                        return VFS_RETURNED;
                }
        }
@@ -2480,7 +2696,7 @@ sync_thread(__unused void *arg, __unused wait_result_t wr)
 #endif /* DIAGNOSTIC */
 }
 
-struct timeval sync_timeout_last_print = {0, 0};
+struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
 
 /*
  * An in-kernel sync for power management to call.
@@ -2492,7 +2708,7 @@ sync_internal(void)
        thread_t thd;
        int error;
        int thread_created = FALSE;
-       struct timespec ts = {sync_timeout_seconds, 0};
+       struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
 
        lck_mtx_lock(sync_mtx_lck);
        sync_thread_state |= SYNC_THREAD_RUN;
@@ -2670,6 +2886,7 @@ statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
 #if CONFIG_MACF
        error = mac_mount_check_stat(ctx, mp);
        if (error != 0) {
+               vnode_put(vp);
                return error;
        }
 #endif
@@ -2738,40 +2955,33 @@ out:
        return error;
 }
 
-/*
- * Common routine to handle copying of statfs64 data to user space
- */
-static int
-statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
-{
-       int error;
-       struct statfs64 sfs;
-
-       bzero(&sfs, sizeof(sfs));
-
-       sfs.f_bsize = sfsp->f_bsize;
-       sfs.f_iosize = (int32_t)sfsp->f_iosize;
-       sfs.f_blocks = sfsp->f_blocks;
-       sfs.f_bfree = sfsp->f_bfree;
-       sfs.f_bavail = sfsp->f_bavail;
-       sfs.f_files = sfsp->f_files;
-       sfs.f_ffree = sfsp->f_ffree;
-       sfs.f_fsid = sfsp->f_fsid;
-       sfs.f_owner = sfsp->f_owner;
-       sfs.f_type = mp->mnt_vtable->vfc_typenum;
-       sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
-       sfs.f_fssubtype = sfsp->f_fssubtype;
+void
+vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
+{
+       struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
+
+       bzero(sfs, sizeof(*sfs));
+
+       sfs->f_bsize = vsfs->f_bsize;
+       sfs->f_iosize = (int32_t)vsfs->f_iosize;
+       sfs->f_blocks = vsfs->f_blocks;
+       sfs->f_bfree = vsfs->f_bfree;
+       sfs->f_bavail = vsfs->f_bavail;
+       sfs->f_files = vsfs->f_files;
+       sfs->f_ffree = vsfs->f_ffree;
+       sfs->f_fsid = vsfs->f_fsid;
+       sfs->f_owner = vsfs->f_owner;
+       sfs->f_type = mp->mnt_vtable->vfc_typenum;
+       sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+       sfs->f_fssubtype = vsfs->f_fssubtype;
+       sfs->f_flags_ext = ((mp->mnt_kern_flag & MNTK_SYSTEM) &&
+           !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) &&
+           !(mp->mnt_flag & MNT_ROOTFS)) ? MNT_EXT_ROOT_DATA_VOL : 0;
        if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
-               strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
+               strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
        } else {
-               strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
+               strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
        }
-       strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
-       strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
-
-       error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
-
-       return error;
+       strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
+       strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
 }
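Userspace can observe the new bit through statfs(2). A hedged sketch, assuming the macOS 10.15 SDK's <sys/mount.h> exposes f_flags_ext and MNT_EXT_ROOT_DATA_VOL in the 64-bit-inode struct statfs (the mount point below is only an example):

    #include <stdio.h>
    #include <sys/param.h>
    #include <sys/mount.h>

    int
    main(void)
    {
            struct statfs sb;

            if (statfs("/System/Volumes/Data", &sb) == 0) {
                    /* set only for the root data volume, per vfs_get_statfs64() above */
                    printf("%s %s the root data volume\n", sb.f_mntonname,
                        (sb.f_flags_ext & MNT_EXT_ROOT_DATA_VOL) ? "is" : "is not");
            }
            return 0;
    }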
 
 /*
@@ -2781,9 +2991,9 @@ int
 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
 {
        struct mount *mp;
-       struct vfsstatfs *sp;
        int error;
        struct nameidata nd;
+       struct statfs64 sfs;
        vfs_context_t ctxp = vfs_context_current();
        vnode_t vp;
 
@@ -2795,12 +3005,12 @@ statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *r
        }
        vp = nd.ni_vp;
        mp = vp->v_mount;
-       sp = &mp->mnt_vfsstat;
        nameidone(&nd);
 
 #if CONFIG_MACF
        error = mac_mount_check_stat(ctxp, mp);
        if (error != 0) {
+               vnode_put(vp);
                return error;
        }
 #endif
@@ -2811,7 +3021,13 @@ statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *r
                return error;
        }
 
-       error = statfs64_common(mp, sp, uap->buf);
+       vfs_get_statfs64(mp, &sfs);
+       if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
+           (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
+               /* This process does not want to see a separate data volume mountpoint */
+               strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
+       }
+       error = copyout(&sfs, uap->buf, sizeof(sfs));
        vnode_put(vp);
 
        return error;
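The P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME bit is controlled from userspace with setiopolicy_np(3). A sketch, assuming the IOPOL_* constants this release adds to <sys/resource.h>:

    #include <stdio.h>
    #include <sys/resource.h>
    #include <sys/param.h>
    #include <sys/mount.h>

    int
    main(void)
    {
            struct statfs sb;

            /* hide the separate data-volume mount point from this process */
            if (setiopolicy_np(IOPOL_TYPE_VFS_STATFS_NO_DATA_VOLUME,
                IOPOL_SCOPE_PROCESS, IOPOL_VFS_STATFS_FORCE_NO_DATA_VOLUME) != 0) {
                    perror("setiopolicy_np");
                    return 1;
            }
            if (statfs("/System/Volumes/Data", &sb) == 0) {
                    printf("f_mntonname = %s\n", sb.f_mntonname); /* expect "/" */
            }
            return 0;
    }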
@@ -2825,7 +3041,7 @@ fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t
 {
        struct vnode *vp;
        struct mount *mp;
-       struct vfsstatfs *sp;
+       struct statfs64 sfs;
        int error;
 
        AUDIT_ARG(fd, uap->fd);
@@ -2855,12 +3071,17 @@ fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t
        }
 #endif
 
-       sp = &mp->mnt_vfsstat;
        if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
                goto out;
        }
 
-       error = statfs64_common(mp, sp, uap->buf);
+       vfs_get_statfs64(mp, &sfs);
+       if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
+           (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
+               /* This process does not want to see a separate data volume mountpoint */
+               strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
+       }
+       error = copyout(&sfs, uap->buf, sizeof(sfs));
 
 out:
        file_drop(uap->fd);
@@ -2900,9 +3121,10 @@ getfsstat_callback(mount_t mp, void * arg)
                 * If MNT_NOWAIT is specified, do not refresh the
                 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
                 */
-               if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
-                   (error = vfs_update_vfsstat(mp, ctx,
-                   VFS_USER_EVENT))) {
+               if ((mp->mnt_lflag & MNT_LDEAD) ||
+                   (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
+                   (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
+                   (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
                        KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
                        return VFS_RETURNED;
                }
@@ -2975,6 +3197,10 @@ __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval
        size_t count, maxcount, bufsize, macsize;
        struct getfsstat_struct fst;
 
+       if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
+               return EINVAL;
+       }
+
        bufsize = (size_t) uap->bufsize;
        macsize = (size_t) uap->macsize;
 
@@ -3038,7 +3264,7 @@ __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval
        fst.maxcount = maxcount;
 
 
-       vfs_iterate(0, getfsstat_callback, &fst);
+       vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
 
        if (mp) {
                FREE(mp, M_MACTEMP);
@@ -3062,6 +3288,7 @@ getfsstat64_callback(mount_t mp, void * arg)
 {
        struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
        struct vfsstatfs *sp;
+       struct statfs64 sfs;
        int error;
 
        if (fstp->sfsp && fstp->count < fstp->maxcount) {
@@ -3081,19 +3308,21 @@ getfsstat64_callback(mount_t mp, void * arg)
                 * getfsstat, since the constants are out of the same
                 * namespace.
                 */
-               if (((fstp->flags & MNT_NOWAIT) == 0 ||
-                   (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
-                   (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
+               if ((mp->mnt_lflag & MNT_LDEAD) ||
+                   ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
+                   (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
+                   (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
                        KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
                        return VFS_RETURNED;
                }
 
-               error = statfs64_common(mp, sp, fstp->sfsp);
+               vfs_get_statfs64(mp, &sfs);
+               error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
                if (error) {
                        fstp->error = error;
                        return VFS_RETURNED_DONE;
                }
-               fstp->sfsp += sizeof(struct statfs64);
+               fstp->sfsp += sizeof(sfs);
        }
        fstp->count++;
        return VFS_RETURNED;
@@ -3120,7 +3349,7 @@ getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
        fst.error = 0;
        fst.maxcount = maxcount;
 
-       vfs_iterate(0, getfsstat64_callback, &fst);
+       vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
 
        if (fst.error) {
                KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
@@ -3353,6 +3582,7 @@ __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *
        return common_fchdir(p, (void *)uap, 1);
 }
 
+
 /*
  * Change current working directory (".").
  *
@@ -3362,45 +3592,41 @@ __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *
  *     vnode_ref:ENOENT                No such file or directory
  */
 /* ARGSUSED */
-static int
-common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
+int
+chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
 {
        struct filedesc *fdp = p->p_fd;
        int error;
-       struct nameidata nd;
        vnode_t tvp;
-       vfs_context_t ctx = vfs_context_current();
 
-       NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
-           UIO_USERSPACE, uap->path, ctx);
-       error = change_dir(&nd, ctx);
+       error = change_dir(ndp, ctx);
        if (error) {
                return error;
        }
-       if ((error = vnode_ref(nd.ni_vp))) {
-               vnode_put(nd.ni_vp);
+       if ((error = vnode_ref(ndp->ni_vp))) {
+               vnode_put(ndp->ni_vp);
                return error;
        }
        /*
         * drop the iocount we picked up in change_dir
         */
-       vnode_put(nd.ni_vp);
+       vnode_put(ndp->ni_vp);
 
        if (per_thread) {
                thread_t th = vfs_context_thread(ctx);
                if (th) {
                        uthread_t uth = get_bsdthread_info(th);
                        tvp = uth->uu_cdir;
-                       uth->uu_cdir = nd.ni_vp;
+                       uth->uu_cdir = ndp->ni_vp;
                        OSBitOrAtomic(P_THCWD, &p->p_flag);
                } else {
-                       vnode_rele(nd.ni_vp);
+                       vnode_rele(ndp->ni_vp);
                        return ENOENT;
                }
        } else {
                proc_fdlock(p);
                tvp = fdp->fd_cdir;
-               fdp->fd_cdir = nd.ni_vp;
+               fdp->fd_cdir = ndp->ni_vp;
                proc_fdunlock(p);
        }
 
@@ -3412,6 +3638,28 @@ common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
 }
 
 
+/*
+ * Change current working directory (".").
+ *
+ * Returns:    0                       Success
+ *     chdir_internal:ENOTDIR
+ *     chdir_internal:ENOENT           No such file or directory
+ *     chdir_internal:???
+ */
+/* ARGSUSED */
+static int
+common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
+{
+       struct nameidata nd;
+       vfs_context_t ctx = vfs_context_current();
+
+       NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
+           UIO_USERSPACE, uap->path, ctx);
+
+       return chdir_internal(p, ctx, &nd, per_thread);
+}
+
+
 /*
  * chdir
  *
@@ -3680,20 +3928,6 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
                fp->f_fglob->fg_flag |= FHASLOCK;
        }
 
-#if DEVELOPMENT || DEBUG
-       /*
-        * XXX VSWAP: Check for entitlements or special flag here
-        * so we can restrict access appropriately.
-        */
-#else /* DEVELOPMENT || DEBUG */
-
-       if (vnode_isswap(vp) && (flags & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) {
-               /* block attempt to write/truncate swapfile */
-               error = EPERM;
-               goto bad;
-       }
-#endif /* DEVELOPMENT || DEBUG */
-
        /* try to truncate by setting the size attribute */
        if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
                goto bad;
@@ -3750,7 +3984,7 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
                        size_t copied;
                        /* XXX FBDP: better way to detect /Applications/ ? */
                        if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
-                               copyinstr(ndp->ni_dirp,
+                               (void)copyinstr(ndp->ni_dirp,
                                    pathname,
                                    sizeof(pathname),
                                    &copied);
@@ -3784,27 +4018,14 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
 #else
 /* not implemented... */
 #endif
-                       if (!strncmp(vp->v_name,
-                           DYLD_SHARED_CACHE_NAME,
-                           strlen(DYLD_SHARED_CACHE_NAME)) ||
-                           !strncmp(vp->v_name,
-                           "dyld",
-                           strlen(vp->v_name)) ||
-                           !strncmp(vp->v_name,
-                           "launchd",
-                           strlen(vp->v_name)) ||
-                           !strncmp(vp->v_name,
-                           "Camera",
-                           strlen(vp->v_name)) ||
-                           !strncmp(vp->v_name,
-                           "mediaserverd",
-                           strlen(vp->v_name)) ||
-                           !strncmp(vp->v_name,
-                           "SpringBoard",
-                           strlen(vp->v_name)) ||
-                           !strncmp(vp->v_name,
-                           "backboardd",
-                           strlen(vp->v_name))) {
+                       size_t len = strlen(vp->v_name);
+                       if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
+                           !strncmp(vp->v_name, "dyld", len) ||
+                           !strncmp(vp->v_name, "launchd", len) ||
+                           !strncmp(vp->v_name, "Camera", len) ||
+                           !strncmp(vp->v_name, "mediaserverd", len) ||
+                           !strncmp(vp->v_name, "SpringBoard", len) ||
+                           !strncmp(vp->v_name, "backboardd", len)) {
                                /*
                                 * This file matters when launching Camera:
                                 * do not store its contents in the secluded
@@ -4136,9 +4357,8 @@ openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
                        return ENOMEM;
                }
 
-               error = fsgetpath_internal(
-                       ctx, fsid.val[0], objid,
-                       buflen, buf, &pathlen);
+               error = fsgetpath_internal(ctx, fsid.val[0], objid, buflen,
+                   buf, FSOPT_ISREALFSID, &pathlen);
 
                if (error) {
                        FREE(buf, M_TEMP);
@@ -4408,15 +4628,22 @@ my_strrchr(char *p, int ch)
        /* NOTREACHED */
 }
 
+extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
+extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
 
 int
-safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
+safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
 {
        int ret, len = _len;
 
        *truncated_path = 0;
-       ret = vn_getpath(dvp, path, &len);
+
+       if (firmlink) {
+               ret = vn_getpath(dvp, path, &len);
+       } else {
+               ret = vn_getpath_no_firmlink(dvp, path, &len);
+       }
        if (ret == 0 && len < (MAXPATHLEN - 1)) {
                if (leafname) {
                        path[len - 1] = '/';
@@ -4462,13 +4689,28 @@ safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *trunc
                        }
 
                        len = _len;
-                       ret = vn_getpath(mydvp, path, &len);
+                       if (firmlink) {
+                               ret = vn_getpath(mydvp, path, &len);
+                       } else {
+                               ret = vn_getpath_no_firmlink(mydvp, path, &len);
+                       }
                } while (ret == ENOSPC);
        }
 
        return len;
 }
 
+int
+safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
+{
+       return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
+}
+
+int
+safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
+{
+       return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
+}
 
 /*
  * Make a hard file link.
@@ -4486,7 +4728,7 @@ static int
 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
     user_addr_t link, int flag, enum uio_seg segflg)
 {
-       vnode_t vp, dvp, lvp;
+       vnode_t vp, pvp, dvp, lvp;
        struct nameidata nd;
        int follow;
        int error;
@@ -4653,10 +4895,22 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
                                    FSE_ARG_FINFO, &finfo,
                                    FSE_ARG_DONE);
                        }
-                       if (vp->v_parent) {
+
+                       pvp = vp->v_parent;
+                       // need an iocount on pvp in this case
+                       if (pvp && pvp != dvp) {
+                               error = vnode_get(pvp);
+                               if (error) {
+                                       pvp = NULLVP;
+                                       error = 0;
+                               }
+                       }
+                       if (pvp) {
                                add_fsevent(FSE_STAT_CHANGED, ctx,
-                                   FSE_ARG_VNODE, vp->v_parent,
-                                   FSE_ARG_DONE);
+                                   FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
+                       }
+                       if (pvp && pvp != dvp) {
+                               vnode_put(pvp);
                        }
                }
 #endif
@@ -4899,7 +5153,9 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
        int error;
        struct componentname *cnp;
        char  *path = NULL;
-       int  len = 0;
+       char  *no_firmlink_path = NULL;
+       int  len_path = 0;
+       int  len_no_firmlink_path = 0;
 #if CONFIG_FSE
        fse_info  finfo;
        struct vnode_attr va;
@@ -4908,6 +5164,7 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
        int need_event;
        int has_listeners;
        int truncated_path;
+       int truncated_no_firmlink_path;
        int batched;
        struct vnode_attr *vap;
        int do_retry;
@@ -4934,6 +5191,7 @@ retry:
        need_event = 0;
        has_listeners = 0;
        truncated_path = 0;
+       truncated_no_firmlink_path = 0;
        vap = NULL;
 
        NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
@@ -4967,8 +5225,9 @@ continue_lookup:
                /*
                 * The root of a mounted filesystem cannot be deleted.
                 */
-               if (vp->v_flag & VROOT) {
+               if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
                        error = EBUSY;
+                       goto out;
                }
 
 #if DEVELOPMENT || DEBUG
@@ -4988,7 +5247,6 @@ continue_lookup:
                        error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
                        if (error) {
                                if (error == ENOENT) {
-                                       assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                                        if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                                do_retry = 1;
                                                retry_count++;
@@ -5032,7 +5290,15 @@ continue_lookup:
                                goto out;
                        }
                }
-               len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
+               len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
+               if (no_firmlink_path == NULL) {
+                       GET_PATH(no_firmlink_path);
+                       if (no_firmlink_path == NULL) {
+                               error = ENOMEM;
+                               goto out;
+                       }
+               }
+               len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
        }
 
 #if NAMEDRSRCFORK
@@ -5058,7 +5324,6 @@ continue_lookup:
                        }
                        goto continue_lookup;
                } else if (error == ENOENT && batched) {
-                       assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                        if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                /*
                                 * For compound VNOPs, the authorization callback may
@@ -5106,7 +5371,7 @@ continue_lookup:
                                finfo.mode |= FSE_TRUNCATED_PATH;
                        }
                        add_fsevent(FSE_DELETE, ctx,
-                           FSE_ARG_STRING, len, path,
+                           FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
                            FSE_ARG_FINFO, &finfo,
                            FSE_ARG_DONE);
                }
@@ -5116,8 +5381,13 @@ continue_lookup:
 out:
        if (path != NULL) {
                RELEASE_PATH(path);
+               path = NULL;
        }
 
+       if (no_firmlink_path != NULL) {
+               RELEASE_PATH(no_firmlink_path);
+               no_firmlink_path = NULL;
+       }
 #if NAMEDRSRCFORK
        /* recycle the deleted rsrc fork vnode to force a reclaim, which
         * will cause its shadow file to go away if necessary.
@@ -5176,13 +5446,18 @@ unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
 int
 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
 {
-       if (uap->flag & ~AT_REMOVEDIR) {
+       if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
                return EINVAL;
        }
 
-       if (uap->flag & AT_REMOVEDIR) {
+       if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
+               int unlink_flags = 0;
+
+               if (uap->flag & AT_REMOVEDIR_DATALESS) {
+                       unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
+               }
                return rmdirat_internal(vfs_context_current(), uap->fd,
-                          uap->path, UIO_USERSPACE);
+                          uap->path, UIO_USERSPACE, unlink_flags);
        } else {
                return unlinkat_internal(vfs_context_current(), uap->fd,
                           NULLVP, uap->path, UIO_USERSPACE, 0);
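A hedged userspace sketch of the new directory-removal flag; AT_REMOVEDIR_DATALESS is private in this release, so both its availability and the fallback value below are assumptions:

    #include <unistd.h>
    #include <fcntl.h>

    #ifndef AT_REMOVEDIR_DATALESS
    #define AT_REMOVEDIR_DATALESS 0x0100    /* assumed value; private in <sys/fcntl.h> */
    #endif

    /* Remove a dataless directory without first materializing its children. */
    static int
    remove_dataless_dir(int dirfd, const char *path)
    {
            return unlinkat(dirfd, path, AT_REMOVEDIR_DATALESS);
    }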
@@ -5674,7 +5949,7 @@ faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
        context.vc_thread = ctx->vc_thread;
 
 
-       niopts = FOLLOW | AUDITVNPATH1;
+       niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
        /* need parent for vnode_authorize for deletion test */
        if (amode & _DELETE_OK) {
                niopts |= WANTPARENT;
@@ -5738,7 +6013,7 @@ int
 faccessat(__unused proc_t p, struct faccessat_args *uap,
     __unused int32_t *retval)
 {
-       if (uap->flag & ~AT_EACCESS) {
+       if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
                return EINVAL;
        }
 
@@ -5775,6 +6050,8 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
        kauth_filesec_t fsec;
        size_t xsecurity_bufsize;
        void * statptr;
+       struct fileproc *fp = NULL;
+       int needsrealdev = 0;
 
        follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
        NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
@@ -5785,9 +6062,24 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
        /* stat calls are allowed for resource forks. */
        nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
 #endif
-       error = nameiat(&nd, fd);
-       if (error) {
-               return error;
+
+       if (flag & AT_FDONLY) {
+               vnode_t fvp;
+
+               error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
+               if (error) {
+                       return error;
+               }
+               if ((error = vnode_getwithref(fvp))) {
+                       file_drop(fd);
+                       return error;
+               }
+               nd.ni_vp = fvp;
+       } else {
+               error = nameiat(&nd, fd);
+               if (error) {
+                       return error;
+               }
        }
        fsec = KAUTH_FILESEC_NONE;
 
@@ -5806,7 +6098,19 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
        }
 #endif
 
-       error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
+       needsrealdev = flag & AT_REALDEV ? 1 : 0;
+       if (fp && (xsecurity == USER_ADDR_NULL)) {
+               /*
+                * If the caller has the file open, and is not
+                * requesting extended security information, we are
+                * going to let them get the basic stat information.
+                */
+               error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
+                   fp->f_fglob->fg_cred);
+       } else {
+               error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
+                   isstat64, needsrealdev, ctx);
+       }
 
 #if NAMEDRSRCFORK
        if (is_namedstream) {
@@ -5815,6 +6119,10 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
 #endif
        vnode_put(nd.ni_vp);
        nameidone(&nd);
+       if (fp) {
+               file_drop(fd);
+               fp = NULL;
+       }
 
        if (error) {
                return error;
@@ -6031,7 +6339,7 @@ lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused
 int
 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
 {
-       if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
+       if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
                return EINVAL;
        }
 
@@ -6043,7 +6351,7 @@ int
 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
     __unused int32_t *retval)
 {
-       if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
+       if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
                return EINVAL;
        }
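A hedged sketch of AT_FDONLY from userspace; the flag is private in this release, so the fallback value is an assumption. With AT_FDONLY the path argument is ignored and the open file itself is stat'ed:

    #include <sys/stat.h>
    #include <fcntl.h>

    #ifndef AT_FDONLY
    #define AT_FDONLY 0x0400        /* assumed value; private in <sys/fcntl.h> */
    #endif

    /* stat the open file directly; the empty path is ignored with AT_FDONLY */
    static int
    stat_by_fd(int fd, struct stat *sb)
    {
            return fstatat(fd, "", sb, AT_FDONLY);
    }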
 
@@ -6159,29 +6467,25 @@ readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
 }
 
 /*
- * Change file flags.
- *
- * NOTE: this will vnode_put() `vp'
+ * Change file flags, the deep inner layer.
  */
 static int
-chflags1(vnode_t vp, int flags, vfs_context_t ctx)
+chflags0(vnode_t vp, struct vnode_attr *va,
+    int (*setattr)(vnode_t, void *, vfs_context_t),
+    void *arg, vfs_context_t ctx)
 {
-       struct vnode_attr va;
-       kauth_action_t action;
+       kauth_action_t action = 0;
        int error;
 
-       VATTR_INIT(&va);
-       VATTR_SET(&va, va_flags, flags);
-
 #if CONFIG_MACF
-       error = mac_vnode_check_setflags(ctx, vp, flags);
+       error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
        if (error) {
                goto out;
        }
 #endif
 
        /* request authorisation, disregard immutability */
-       if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
+       if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
                goto out;
        }
        /*
@@ -6192,19 +6496,39 @@ chflags1(vnode_t vp, int flags, vfs_context_t ctx)
        if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
                goto out;
        }
-       error = vnode_setattr(vp, &va, ctx);
+       error = (*setattr)(vp, arg, ctx);
 
 #if CONFIG_MACF
        if (error == 0) {
-               mac_vnode_notify_setflags(ctx, vp, flags);
+               mac_vnode_notify_setflags(ctx, vp, va->va_flags);
        }
 #endif
 
+out:
+       return error;
+}
+
+/*
+ * Change file flags.
+ *
+ * NOTE: this will vnode_put() `vp'
+ */
+static int
+chflags1(vnode_t vp, int flags, vfs_context_t ctx)
+{
+       struct vnode_attr va;
+       int error;
+
+       VATTR_INIT(&va);
+       VATTR_SET(&va, va_flags, flags);
+
+       error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
+       vnode_put(vp);
+
        if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
                error = ENOTSUP;
        }
-out:
-       vnode_put(vp);
+
        return error;
 }
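The refactor leaves chflags0() generic over the setter, so another caller can reuse the MAC check and authorization while substituting its own setattr step. A hypothetical kernel-context sketch (my_setflags is invented for illustration):

    static int
    my_setflags(vnode_t vp, void *arg, vfs_context_t ctx)
    {
            /* e.g. a compare-and-swap variant would re-check the old flags here */
            return vnode_setattr(vp, (struct vnode_attr *)arg, ctx);
    }

    /* ... */
    error = chflags0(vp, &va, my_setflags, &va, ctx);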
 
@@ -7578,18 +7902,48 @@ out:
        return error;
 }
 
-/*
- * Rename files.  Source and destination must either both be directories,
- * or both not be directories.  If target is a directory, it must be empty.
- */
-/* ARGSUSED */
 static int
-renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
-    int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
+rename_submounts_callback(mount_t mp, void *arg)
 {
-       if (flags & ~VFS_RENAME_FLAGS_MASK) {
-               return EINVAL;
-       }
+       int error = 0;
+       mount_t pmp = (mount_t)arg;
+       int prefix_len = strlen(pmp->mnt_vfsstat.f_mntonname);
+
+       if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
+               return 0;
+       }
+
+       if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
+               return 0;
+       }
+
+       if ((error = vfs_busy(mp, LK_NOWAIT))) {
+               printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
+               return -1;
+       }
+
+       int pathlen = MAXPATHLEN;
+       if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
+               printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
+       }
+
+       vfs_unbusy(mp);
+
+       return error;
+}
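This callback is driven once per mount-point rename, as a later hunk in this change shows; sketched here for clarity, with mp being the mount whose f_mntonname was just rewritten:

    /* fix up f_mntonname for every mount sitting below the renamed one */
    vfs_iterate(0, rename_submounts_callback, (void *)mp);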
+
+/*
+ * Rename files.  Source and destination must either both be directories,
+ * or both not be directories.  If target is a directory, it must be empty.
+ */
+/* ARGSUSED */
+static int
+renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
+    int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
+{
+       if (flags & ~VFS_RENAME_FLAGS_MASK) {
+               return EINVAL;
+       }
 
        if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
                return EINVAL;
@@ -7607,14 +7961,17 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
        int has_listeners;
        const char *oname = NULL;
        char *from_name = NULL, *to_name = NULL;
+       char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
        int from_len = 0, to_len = 0;
+       int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
        int holding_mntlock;
        mount_t locked_mp = NULL;
        vnode_t oparent = NULLVP;
 #if CONFIG_FSE
        fse_info from_finfo, to_finfo;
 #endif
-       int from_truncated = 0, to_truncated;
+       int from_truncated = 0, to_truncated = 0;
+       int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
        int batched = 0;
        struct vnode_attr *fvap, *tvap;
        int continuing = 0;
@@ -7749,6 +8106,16 @@ continue_lookup:
                }
 
                from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
+
+               if (from_name_no_firmlink == NULL) {
+                       GET_PATH(from_name_no_firmlink);
+                       if (from_name_no_firmlink == NULL) {
+                               error = ENOMEM;
+                               goto out1;
+                       }
+               }
+
+               from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
        }
 
        if (need_event || need_kpath2 || has_listeners) {
@@ -7761,6 +8128,16 @@ continue_lookup:
                }
 
                to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
+
+               if (to_name_no_firmlink == NULL) {
+                       GET_PATH(to_name_no_firmlink);
+                       if (to_name_no_firmlink == NULL) {
+                               error = ENOMEM;
+                               goto out1;
+                       }
+               }
+
+               to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
                if (to_name && need_kpath2) {
                        AUDIT_ARG(kpath, to_name, ARG_KPATH2);
                }
@@ -7787,7 +8164,6 @@ continue_lookup:
                error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
                if (error) {
                        if (error == ENOENT) {
-                               assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                                if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                        /*
                                         * We encountered a race where after doing the namei, tvp stops
@@ -7844,6 +8220,7 @@ continue_lookup:
            (fvp->v_mountedhere == NULL) &&
            (fdvp == tdvp) &&
            ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
+           ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
            (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
                vnode_t coveredvp;
 
@@ -7990,6 +8367,41 @@ skipped_lookup:
                holding_mntlock = 0;
        }
        if (error) {
+               if (error == EDATALESS) {
+                       /*
+                        * If we've been here before, something has gone
+                        * horribly wrong and we should just get out lest
+                        * we spiral around the drain forever.
+                        */
+                       if (flags & VFS_RENAME_DATALESS) {
+                               error = EIO;
+                               goto out1;
+                       }
+
+                       /*
+                        * The object we're renaming is dataless (or has a
+                        * dataless descendent) and requires materialization
+                        * before the rename occurs.  But we're holding the
+                        * mount point's rename lock, so it's not safe to
+                        * make the upcall.
+                        *
+                        * In this case, we release the lock, perform the
+                        * materialization, and start the whole thing over.
+                        */
+                       error = vnode_materialize_dataless_file(fvp,
+                           NAMESPACE_HANDLER_RENAME_OP);
+
+                       if (error == 0) {
+                               /*
+                                * The next time around we need to tell the
+                                * file system that the materialization has
+                                * been performed.
+                                */
+                               flags |= VFS_RENAME_DATALESS;
+                               do_retry = 1;
+                       }
+                       goto out1;
+               }
                if (error == EKEEPLOOKING) {
                        if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
                                if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
@@ -8020,7 +8432,6 @@ skipped_lookup:
                 * cache, redrive the lookup.
                 */
                if (batched && error == ENOENT) {
-                       assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                        if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                do_retry = 1;
                                retry_count += 1;
@@ -8058,9 +8469,9 @@ skipped_lookup:
 
                if (tvp) {
                        add_fsevent(FSE_RENAME, ctx,
-                           FSE_ARG_STRING, from_len, from_name,
+                           FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
                            FSE_ARG_FINFO, &from_finfo,
-                           FSE_ARG_STRING, to_len, to_name,
+                           FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
                            FSE_ARG_FINFO, &to_finfo,
                            FSE_ARG_DONE);
                        if (flags & VFS_RENAME_SWAP) {
@@ -8071,17 +8482,17 @@ skipped_lookup:
                                 * two.
                                 */
                                add_fsevent(FSE_RENAME, ctx,
-                                   FSE_ARG_STRING, to_len, to_name,
+                                   FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
                                    FSE_ARG_FINFO, &to_finfo,
-                                   FSE_ARG_STRING, from_len, from_name,
+                                   FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
                                    FSE_ARG_FINFO, &from_finfo,
                                    FSE_ARG_DONE);
                        }
                } else {
                        add_fsevent(FSE_RENAME, ctx,
-                           FSE_ARG_STRING, from_len, from_name,
+                           FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
                            FSE_ARG_FINFO, &from_finfo,
-                           FSE_ARG_STRING, to_len, to_name,
+                           FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
                            FSE_ARG_DONE);
                }
        }
@@ -8124,14 +8535,21 @@ skipped_lookup:
                                        mpname = cp + 1;
                                }
                        }
+
+                       /* Update f_mntonname of sub mounts */
+                       vfs_iterate(0, rename_submounts_callback, (void *)mp);
+
                        /* append name to prefix */
                        maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
                        bzero(pathend, maxlen);
+
                        strlcpy(pathend, mpname, maxlen);
                }
                FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
 
                vfs_unbusy(mp);
+
+               vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
        }
        /*
         * fix up name & parent pointers.  note that we first
@@ -8157,10 +8575,18 @@ out1:
                RELEASE_PATH(to_name);
                to_name = NULL;
        }
+       if (to_name_no_firmlink != NULL) {
+               RELEASE_PATH(to_name_no_firmlink);
+               to_name_no_firmlink = NULL;
+       }
        if (from_name != NULL) {
                RELEASE_PATH(from_name);
                from_name = NULL;
        }
+       if (from_name_no_firmlink != NULL) {
+               RELEASE_PATH(from_name_no_firmlink);
+               from_name_no_firmlink = NULL;
+       }
        if (holding_mntlock) {
                mount_unlock_renames(locked_mp);
                mount_drop(locked_mp, 0);
@@ -8420,16 +8846,19 @@ mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
 
 static int
 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
-    enum uio_seg segflg)
+    enum uio_seg segflg, int unlink_flags)
 {
        vnode_t vp, dvp;
        int error;
        struct nameidata nd;
        char     *path = NULL;
-       int       len = 0;
+       char     *no_firmlink_path = NULL;
+       int       len_path = 0;
+       int       len_no_firmlink_path = 0;
        int has_listeners = 0;
        int need_event = 0;
-       int truncated = 0;
+       int truncated_path = 0;
+       int truncated_no_firmlink_path = 0;
 #if CONFIG_FSE
        struct vnode_attr va;
 #endif /* CONFIG_FSE */
@@ -8499,7 +8928,6 @@ continue_lookup:
                                error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
                                if (error) {
                                        if (error == ENOENT) {
-                                               assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                                                if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                                        restart_flag = 1;
                                                        restart_count += 1;
@@ -8543,9 +8971,19 @@ continue_lookup:
                                }
                        }
 
-                       len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
+                       len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
+
+                       if (no_firmlink_path == NULL) {
+                               GET_PATH(no_firmlink_path);
+                               if (no_firmlink_path == NULL) {
+                                       error = ENOMEM;
+                                       goto out;
+                               }
+                       }
+
+                       len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
 #if CONFIG_FSE
-                       if (truncated) {
+                       if (truncated_no_firmlink_path) {
                                finfo.mode |= FSE_TRUNCATED_PATH;
                        }
 #endif
@@ -8561,7 +8999,6 @@ continue_lookup:
                if (error == EKEEPLOOKING) {
                        goto continue_lookup;
                } else if (batched && error == ENOENT) {
-                       assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                        if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                /*
                                 * For compound VNOPs, the authorization callback
@@ -8573,6 +9010,27 @@ continue_lookup:
                                goto out;
                        }
                }
+
+               /*
+                * XXX There's no provision for passing flags
+                * to VNOP_RMDIR().  So, if vn_rmdir() fails
+                * because it's not empty, then we try again
+                * with VNOP_REMOVE(), passing in a special
+                * flag that clever file systems will know
+                * how to handle.
+                */
+               if (error == ENOTEMPTY &&
+                   (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
+                       /*
+                        * If this fails, we want to keep the original
+                        * error.
+                        */
+                       if (vn_remove(dvp, &vp, &nd,
+                           VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
+                               error = 0;
+                       }
+               }
+
 #if CONFIG_APPLEDOUBLE
                /*
                 * Special case to remove orphaned AppleDouble
@@ -8581,8 +9039,9 @@ continue_lookup:
                 * so here we are.
                 */
                if (error == ENOTEMPTY) {
-                       error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
-                       if (error == EBUSY) {
+                       int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
+                       if (ad_error == EBUSY) {
+                               error = ad_error;
                                goto out;
                        }
 
@@ -8590,7 +9049,7 @@ continue_lookup:
                        /*
                         * Assuming everything went well, we will try the RMDIR again
                         */
-                       if (!error) {
+                       if (!ad_error) {
                                error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
                        }
                }
@@ -8619,7 +9078,7 @@ continue_lookup:
                                        vnode_get_fse_info_from_vap(vp, &finfo, vap);
                                }
                                add_fsevent(FSE_DELETE, ctx,
-                                   FSE_ARG_STRING, len, path,
+                                   FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
                                    FSE_ARG_FINFO, &finfo,
                                    FSE_ARG_DONE);
                        }
@@ -8631,6 +9090,12 @@ out:
                        RELEASE_PATH(path);
                        path = NULL;
                }
+
+               if (no_firmlink_path != NULL) {
+                       RELEASE_PATH(no_firmlink_path);
+                       no_firmlink_path = NULL;
+               }
+
                /*
                 * nameidone has to happen before we vnode_put(dvp)
                 * since it may need to release the fs_nodelock on the dvp
@@ -8660,7 +9125,7 @@ int
 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
 {
        return rmdirat_internal(vfs_context_current(), AT_FDCWD,
-                  CAST_USER_ADDR_T(uap->path), UIO_USERSPACE);
+                  CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
 }
 
 /* Get direntry length padded to 8 byte alignment */
@@ -8775,7 +9240,7 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
  */
 static int
 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
-    off_t *offset, int flags)
+    off_t *offset, int *eofflag, int flags)
 {
        vnode_t vp;
        struct vfs_context context = *vfs_context_current();    /* local copy */
@@ -8783,7 +9248,7 @@ getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *byt
        uio_t auio;
        int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
        off_t loff;
-       int error, eofflag, numdirent;
+       int error, numdirent;
        char uio_buf[UIO_SIZEOF(1)];
 
        error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
@@ -8831,10 +9296,10 @@ unionread:
        uio_addiov(auio, bufp, bufsize);
 
        if (flags & VNODE_READDIR_EXTENDED) {
-               error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
+               error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
                fp->f_fglob->fg_offset = uio_offset(auio);
        } else {
-               error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
+               error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
                fp->f_fglob->fg_offset = uio_offset(auio);
        }
        if (error) {
@@ -8885,10 +9350,11 @@ getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *
 {
        off_t offset;
        ssize_t bytesread;
-       int error;
+       int error, eofflag;
 
        AUDIT_ARG(fd, uap->fd);
-       error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
+       error = getdirentries_common(uap->fd, uap->buf, uap->count,
+           &bytesread, &offset, &eofflag, 0);
 
        if (error == 0) {
                if (proc_is64bit(p)) {
@@ -8908,14 +9374,37 @@ getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_
 {
        off_t offset;
        ssize_t bytesread;
-       int error;
+       int error, eofflag;
+       user_size_t bufsize;
 
        AUDIT_ARG(fd, uap->fd);
-       error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
+
+       /*
+        * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
+        * then the kernel carves out the last 4 bytes to return extended
+        * information to userspace (namely whether we reached EOF with this call).
+        */
+       if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
+               bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
+       } else {
+               bufsize = uap->bufsize;
+       }
+
+       error = getdirentries_common(uap->fd, uap->buf, bufsize,
+           &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
 
        if (error == 0) {
                *retval = bytesread;
                error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
+
+               if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
+                       getdirentries64_flags_t flags = 0;
+                       if (eofflag) {
+                               flags |= GETDIRENTRIES64_EOF;
+                       }
+                       error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
+                           sizeof(flags));
+               }
        }
        return error;
 }
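
To make the carve-out above concrete, here is a hedged userland sketch of a caller opting in to the extended EOF reporting. It is not part of this commit; the __getdirentries64() prototype is an assumption about the private libsystem_kernel stub, and the GETDIRENTRIES64_* names come from xnu's private sys/dirent.h.

/*
 * Hedged sketch: reading a directory chunk with EOF reporting.
 * Assumes the private __getdirentries64() stub and xnu's private
 * sys/dirent.h definitions (GETDIRENTRIES64_EXTENDED_BUFSIZE,
 * GETDIRENTRIES64_EOF, getdirentries64_flags_t).
 */
#include <stdint.h>
#include <string.h>
#include <sys/types.h>

extern ssize_t __getdirentries64(int fd, void *buf, size_t bufsize,
    off_t *position);   /* assumed private libsystem_kernel stub */

static ssize_t
read_dir_chunk(int fd, char *buf, size_t bufsize, int *eofp)
{
        off_t pos;
        ssize_t n = __getdirentries64(fd, buf, bufsize, &pos);

        *eofp = 0;
        if (n >= 0 && bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
                getdirentries64_flags_t flags;

                /* The kernel stored the flags in the carved-out tail. */
                memcpy(&flags, buf + bufsize - sizeof(flags), sizeof(flags));
                *eofp = (flags & GETDIRENTRIES64_EOF) != 0;
        }
        return n;
}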
@@ -9666,822 +10155,744 @@ searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t
 #endif /* CONFIG_SEARCHFS */
 
 
-lck_grp_attr_t *  nspace_group_attr;
-lck_attr_t *      nspace_lock_attr;
-lck_grp_t *       nspace_mutex_group;
+#if CONFIG_DATALESS_FILES
 
-lck_mtx_t         nspace_handler_lock;
-lck_mtx_t         nspace_handler_exclusion_lock;
+/*
+ * === Namespace Resolver Up-call Mechanism ===
+ *
+ * When I/O is performed to a dataless file or directory (read, write,
+ * lookup-in, etc.), the file system performs an upcall to the namespace
+ * resolver (filecoordinationd) to materialize the object.
+ *
+ * We need multiple up-calls to be in flight at once, and we need these
+ * up-calls to be interruptible, thus the following implementation:
+ *
+ * => The nspace_resolver_request represents the in-kernel request state.
+ *    It contains a request ID, storage space for the errno code returned
+ *    by filecoordinationd, and flags.
+ *
+ * => The request ID is simply a global monotonically incrementing 32-bit
+ *    number.  Outstanding requests are stored in a hash table, and the
+ *    hash function is extremely simple.
+ *
+ * => When an upcall is to be made to filecoordinationd, a request structure
+ *    is allocated on the stack (it is small, and needs to live only for
+ *    the duration of the call to resolve_nspace_item_ext()).  It is
+ *    initialized and inserted into the table.  Some backpressure from
+ *    filecoordinationd is applied by limiting the numnber of entries that
+ *    can be inserted into the table (and thus limiting the number of
+ *    outstanding requests issued to filecoordinationd); waiting for an
+ *    available slot is interruptible.
+ *
+ * => Once the request has been inserted into the table, the up-call is made
+ *    to filecoordinationd via a MiG-generated stub.  The up-call returns
+ *    immediately and filecoordinationd processes the request asynchronously.
+ *
+ * => The caller now waits for the request to complete.  This is achieved by
+ *    sleeping on the address of the request structure and waiting for
+ *    filecoordinationd to mark the request structure as complete.  This
+ *    is an interruptible sleep call; if interrupted, the request structure
+ *    is removed from the table and EINTR is returned to the caller.  If
+ *    this occurs, an advisory up-call is made to filecoordinationd with
+ *    the request ID to indicate that the request can be aborted or
+ *    de-prioritized at the discretion of filecoordinationd.
+ *
+ * => When filecoordinationd has completed the request, it signals completion
+ *    by writing to the vfs.nspace.complete sysctl node.  Only a process
+ *    decorated as a namespace resolver can write to this sysctl node.  The
+ *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
+ *    The request ID is looked up in the table, and if the request is found,
+ *    the error code is stored in the request structure and a wakeup()
+ *    issued on the address of the request structure.  If the request is not
+ *    found, we simply drop the completion notification, assuming that the
+ *    caller was interrupted.
+ *
+ * => When the waiting thread wakes up, it extracts the error code from the
+ *    request structure, removes the request from the table, and returns the
+ *    error code to the calling function.  Fini!
+ */
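
For illustration, a minimal userland sketch (not part of this commit) of the completion write described in the last two bullets; it assumes only sysctlbyname(3) and that the calling process is already decorated as the resolver:

/*
 * Hedged sketch: how a resolver daemon might report completion of
 * request `req_id` with errno `err` via the vfs.nspace.complete
 * sysctl described above.  The tuple layout is { request ID, errno },
 * per the comment; only the decorated resolver may write it.
 */
#include <stdint.h>
#include <sys/sysctl.h>

static int
nspace_report_complete(uint32_t req_id, uint32_t err)
{
        uint32_t req_status[2] = { req_id, err };

        /* Write-only use: no old value requested. */
        return sysctlbyname("vfs.nspace.complete", NULL, NULL,
            req_status, sizeof(req_status));
}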
 
-time_t snapshot_timestamp = 0;
-int nspace_allow_virtual_devs = 0;
+struct nspace_resolver_request {
+       LIST_ENTRY(nspace_resolver_request) r_hashlink;
+       uint32_t        r_req_id;
+       int             r_resolver_error;
+       int             r_flags;
+};
 
-void nspace_handler_init(void);
+#define RRF_COMPLETE    0x0001
 
-typedef struct nspace_item_info {
-       struct vnode *vp;
-       void         *arg;
-       uint64_t      op;
-       uint32_t      vid;
-       uint32_t      flags;
-       uint32_t      token;
-       uint32_t      refcount;
-} nspace_item_info;
-
-#define MAX_NSPACE_ITEMS   128
-nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
-uint32_t      nspace_item_idx = 0;              // also used as the sleep/wakeup rendezvous address
-uint32_t      nspace_token_id = 0;
-uint32_t      nspace_handler_timeout = 15;    // seconds
-
-#define NSPACE_ITEM_NEW         0x0001
-#define NSPACE_ITEM_PROCESSING  0x0002
-#define NSPACE_ITEM_DEAD        0x0004
-#define NSPACE_ITEM_CANCELLED   0x0008
-#define NSPACE_ITEM_DONE        0x0010
-#define NSPACE_ITEM_RESET_TIMER 0x0020
-
-#define NSPACE_ITEM_NSPACE_EVENT   0x0040
-#define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080
-
-#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)
-
-//#pragma optimization_level 0
+static uint32_t
+next_nspace_req_id(void)
+{
+       static uint32_t next_req_id;
 
-typedef enum {
-       NSPACE_HANDLER_NSPACE = 0,
-       NSPACE_HANDLER_SNAPSHOT = 1,
-
-       NSPACE_HANDLER_COUNT,
-} nspace_type_t;
-
-typedef struct {
-       uint64_t handler_tid;
-       struct proc *handler_proc;
-       int handler_busy;
-} nspace_handler_t;
-
-nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
-
-/* namespace fsctl functions */
-static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
-static int nspace_item_flags_for_type(nspace_type_t nspace_type);
-static int nspace_open_flags_for_type(nspace_type_t nspace_type);
-static nspace_type_t nspace_type_for_op(uint64_t op);
-static int nspace_is_special_process(struct proc *proc);
-static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
-static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
-static int validate_namespace_args(int is64bit, int size);
-static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
-
-
-static inline int
-nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
-{
-       switch (nspace_type) {
-       case NSPACE_HANDLER_NSPACE:
-               return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
-       case NSPACE_HANDLER_SNAPSHOT:
-               return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
-       default:
-               printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
-               return 0;
-       }
+       return OSAddAtomic(1, &next_req_id);
 }
 
-static inline int
-nspace_item_flags_for_type(nspace_type_t nspace_type)
-{
-       switch (nspace_type) {
-       case NSPACE_HANDLER_NSPACE:
-               return NSPACE_ITEM_NSPACE_EVENT;
-       case NSPACE_HANDLER_SNAPSHOT:
-               return NSPACE_ITEM_SNAPSHOT_EVENT;
-       default:
-               printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
-               return 0;
-       }
-}
+#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
+#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */
 
-static inline int
-nspace_open_flags_for_type(nspace_type_t nspace_type)
-{
-       switch (nspace_type) {
-       case NSPACE_HANDLER_NSPACE:
-               return FREAD | FWRITE | O_EVTONLY;
-       case NSPACE_HANDLER_SNAPSHOT:
-               return FREAD | O_EVTONLY;
-       default:
-               printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
-               return 0;
-       }
-}
+static LIST_HEAD(nspace_resolver_requesthead,
+    nspace_resolver_request) * nspace_resolver_request_hashtbl;
+static u_long nspace_resolver_request_hashmask;
+static u_int nspace_resolver_request_count;
+static bool nspace_resolver_request_wait_slot;
+static lck_grp_t *nspace_resolver_request_lck_grp;
+static lck_mtx_t nspace_resolver_request_hash_mutex;
 
-static inline nspace_type_t
-nspace_type_for_op(uint64_t op)
-{
-       switch (op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
-       case NAMESPACE_HANDLER_NSPACE_EVENT:
-               return NSPACE_HANDLER_NSPACE;
-       case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
-               return NSPACE_HANDLER_SNAPSHOT;
-       default:
-               printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
-               return NSPACE_HANDLER_NSPACE;
-       }
-}
+#define NSPACE_REQ_LOCK() \
+       lck_mtx_lock(&nspace_resolver_request_hash_mutex)
+#define NSPACE_REQ_UNLOCK() \
+       lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
+
+#define NSPACE_RESOLVER_HASH(req_id)    \
+       (&nspace_resolver_request_hashtbl[(req_id) & \
+        nspace_resolver_request_hashmask])
 
-static inline int
-nspace_is_special_process(struct proc *proc)
+static struct nspace_resolver_request *
+nspace_resolver_req_lookup(uint32_t req_id)
 {
-       int i;
-       for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
-               if (proc == nspace_handlers[i].handler_proc) {
-                       return 1;
+       struct nspace_resolver_requesthead *bucket;
+       struct nspace_resolver_request *req;
+
+       bucket = NSPACE_RESOLVER_HASH(req_id);
+       LIST_FOREACH(req, bucket, r_hashlink) {
+               if (req->r_req_id == req_id) {
+                       return req;
                }
        }
-       return 0;
-}
 
-void
-nspace_handler_init(void)
-{
-       nspace_lock_attr    = lck_attr_alloc_init();
-       nspace_group_attr   = lck_grp_attr_alloc_init();
-       nspace_mutex_group  = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
-       lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
-       lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
-       memset(&nspace_items[0], 0, sizeof(nspace_items));
+       return NULL;
 }
 
-void
-nspace_proc_exit(struct proc *p)
+static int
+nspace_resolver_req_add(struct nspace_resolver_request *req)
 {
-       int i, event_mask = 0;
+       struct nspace_resolver_requesthead *bucket;
+       int error;
 
-       for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
-               if (p == nspace_handlers[i].handler_proc) {
-                       event_mask |= nspace_item_flags_for_type(i);
-                       nspace_handlers[i].handler_tid = 0;
-                       nspace_handlers[i].handler_proc = NULL;
+       while (nspace_resolver_request_count >=
+           NSPACE_RESOLVER_MAX_OUTSTANDING) {
+               nspace_resolver_request_wait_slot = true;
+               error = msleep(&nspace_resolver_request_count,
+                   &nspace_resolver_request_hash_mutex,
+                   PVFS | PCATCH, "nspacerq", NULL);
+               if (error) {
+                       return error;
                }
        }
 
-       if (event_mask == 0) {
-               return;
-       }
+       bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
+#if DIAGNOSTIC
+       assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
+#endif /* DIAGNOSTIC */
+       LIST_INSERT_HEAD(bucket, req, r_hashlink);
+       nspace_resolver_request_count++;
 
-       lck_mtx_lock(&nspace_handler_lock);
-       if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
-               // if this process was the snapshot handler, zero snapshot_timeout
-               snapshot_timestamp = 0;
-       }
+       return 0;
+}
 
-       //
-       // unblock anyone that's waiting for the handler that died
-       //
-       for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-               if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
-                       if (nspace_items[i].flags & event_mask) {
-                               if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
-                                       vnode_lock_spin(nspace_items[i].vp);
-                                       nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
-                                       vnode_unlock(nspace_items[i].vp);
-                               }
-                               nspace_items[i].vp = NULL;
-                               nspace_items[i].vid = 0;
-                               nspace_items[i].flags = NSPACE_ITEM_DONE;
-                               nspace_items[i].token = 0;
+static void
+nspace_resolver_req_remove(struct nspace_resolver_request *req)
+{
+       struct nspace_resolver_requesthead *bucket;
 
-                               wakeup((caddr_t)&(nspace_items[i].vp));
-                       }
-               }
-       }
+       bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
+#if DIAGNOSTIC
+       assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
+#endif /* DIAGNOSTIC */
+       LIST_REMOVE(req, r_hashlink);
+       nspace_resolver_request_count--;
 
-       wakeup((caddr_t)&nspace_item_idx);
-       lck_mtx_unlock(&nspace_handler_lock);
+       if (nspace_resolver_request_wait_slot) {
+               nspace_resolver_request_wait_slot = false;
+               wakeup(&nspace_resolver_request_count);
+       }
 }
 
-
-int
-resolve_nspace_item(struct vnode *vp, uint64_t op)
+static void
+nspace_resolver_req_cancel(uint32_t req_id)
 {
-       return resolve_nspace_item_ext(vp, op, NULL);
-}
+       kern_return_t kr;
+       mach_port_t mp;
 
-int
-resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
-{
-       int i, error, keep_waiting;
-       struct timespec ts;
-       nspace_type_t nspace_type = nspace_type_for_op(op);
+       // Failures here aren't fatal -- the cancellation message
+       // sent to the resolver is merely advisory.
 
-       // only allow namespace events on regular files, directories and symlinks.
-       if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
-               return 0;
+       kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
+       if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
+               return;
        }
 
-       //
-       // if this is a snapshot event and the vnode is on a
-       // disk image just pretend nothing happened since any
-       // change to the disk image will cause the disk image
-       // itself to get backed up and this avoids multi-way
-       // deadlocks between the snapshot handler and the ever
-       // popular diskimages-helper process.  the variable
-       // nspace_allow_virtual_devs allows this behavior to
-       // be overridden (for use by the Mobile TimeMachine
-       // testing infrastructure which uses disk images)
-       //
-       if ((op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
-           && (vp->v_mount != NULL)
-           && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
-           && !nspace_allow_virtual_devs) {
-               return 0;
+       kr = send_nspace_resolve_cancel(mp, req_id);
+       if (kr != KERN_SUCCESS) {
+               os_log_error(OS_LOG_DEFAULT,
+                   "NSPACE send_nspace_resolve_cancel failure: %d", kr);
        }
 
-       // if (thread_tid(current_thread()) == namespace_handler_tid) {
-       if (nspace_handlers[nspace_type].handler_proc == NULL) {
-               return 0;
-       }
+       ipc_port_release_send(mp);
+}
 
-       if (nspace_is_special_process(current_proc())) {
-               return EDEADLK;
-       }
+static int
+nspace_resolver_req_wait(struct nspace_resolver_request *req)
+{
+       bool send_cancel_message = false;
+       int error;
 
-       lck_mtx_lock(&nspace_handler_lock);
+       NSPACE_REQ_LOCK();
 
-retry:
-       for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-               if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
+       while ((req->r_flags & RRF_COMPLETE) == 0) {
+               error = msleep(req, &nspace_resolver_request_hash_mutex,
+                   PVFS | PCATCH, "nspace", NULL);
+               if (error && error != ERESTART) {
+                       req->r_resolver_error = (error == EINTR) ? EINTR :
+                           ETIMEDOUT;
+                       send_cancel_message = true;
                        break;
                }
        }
 
-       if (i >= MAX_NSPACE_ITEMS) {
-               for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                       if (nspace_items[i].flags == 0) {
-                               break;
-                       }
-               }
-       } else {
-               nspace_items[i].refcount++;
-       }
+       nspace_resolver_req_remove(req);
 
-       if (i >= MAX_NSPACE_ITEMS) {
-               ts.tv_sec = nspace_handler_timeout;
-               ts.tv_nsec = 0;
+       NSPACE_REQ_UNLOCK();
 
-               error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS | PCATCH, "nspace-no-space", &ts);
-               if (error == 0) {
-                       // an entry got free'd up, go see if we can get a slot
-                       goto retry;
-               } else {
-                       lck_mtx_unlock(&nspace_handler_lock);
-                       return error;
-               }
+       if (send_cancel_message) {
+               nspace_resolver_req_cancel(req->r_req_id);
        }
 
-       //
-       // if it didn't already exist, add it.  if it did exist
-       // we'll get woken up when someone does a wakeup() on
-       // the slot in the nspace_items table.
-       //
-       if (vp != nspace_items[i].vp) {
-               nspace_items[i].vp = vp;
-               nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg;  // arg is {NULL, true, uio *} - only pass uio thru to the user
-               nspace_items[i].op = op;
-               nspace_items[i].vid = vnode_vid(vp);
-               nspace_items[i].flags = NSPACE_ITEM_NEW;
-               nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
-               if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
-                       if (arg) {
-                               vnode_lock_spin(vp);
-                               vp->v_flag |= VNEEDSSNAPSHOT;
-                               vnode_unlock(vp);
-                       }
-               }
+       return req->r_resolver_error;
+}
 
-               nspace_items[i].token = 0;
-               nspace_items[i].refcount = 1;
+static void
+nspace_resolver_req_mark_complete(
+       struct nspace_resolver_request *req,
+       int resolver_error)
+{
+       req->r_resolver_error = resolver_error;
+       req->r_flags |= RRF_COMPLETE;
+       wakeup(req);
+}
 
-               wakeup((caddr_t)&nspace_item_idx);
-       }
+static void
+nspace_resolver_req_completed(uint32_t req_id, int resolver_error)
+{
+       struct nspace_resolver_request *req;
 
-       //
-       // Now go to sleep until the handler does a wakeup on this
-       // slot in the nspace_items table (or we timeout).
-       //
-       keep_waiting = 1;
-       while (keep_waiting) {
-               ts.tv_sec = nspace_handler_timeout;
-               ts.tv_nsec = 0;
-               error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS | PCATCH, "namespace-done", &ts);
+       NSPACE_REQ_LOCK();
 
-               if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
-                       error = 0;
-               } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
-                       error = nspace_items[i].token;
-               } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
-                       if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
-                               nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
-                               continue;
-                       } else {
-                               error = ETIMEDOUT;
-                       }
-               } else if (error == 0) {
-                       // hmmm, why did we get woken up?
-                       printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
-                           nspace_items[i].token);
-               }
+       // If we don't find the request corresponding to our req_id,
+       // just drop the completion signal on the floor; it's likely
+       // that the requester was interrupted by a signal.
 
-               if (--nspace_items[i].refcount == 0) {
-                       nspace_items[i].vp = NULL;     // clear this so that no one will match on it again
-                       nspace_items[i].arg = NULL;
-                       nspace_items[i].token = 0;     // clear this so that the handler will not find it anymore
-                       nspace_items[i].flags = 0;     // this clears it for re-use
-               }
-               wakeup(&nspace_token_id);
-               keep_waiting = 0;
+       req = nspace_resolver_req_lookup(req_id);
+       if (req) {
+               nspace_resolver_req_mark_complete(req, resolver_error);
        }
 
-       lck_mtx_unlock(&nspace_handler_lock);
+       NSPACE_REQ_UNLOCK();
+}
+
+static struct proc *nspace_resolver_proc;
 
-       return error;
+static int
+nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
+{
+       *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
+           p == nspace_resolver_proc) ? 1 : 0;
+       return 0;
 }
 
-int
-nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
+static int
+nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
 {
-       int snapshot_error = 0;
+       vfs_context_t ctx = vfs_context_current();
+       int error = 0;
 
-       if (vp == NULL) {
-               return 0;
+       //
+       // The system filecoordinationd runs as uid == 0.  This also
+       // has the nice side-effect of filtering out filecoordinationd
+       // running in the simulator.
+       //
+       if (!vfs_context_issuser(ctx)) {
+               return EPERM;
        }
 
-       /* Swap files are special; skip them */
-       if (vnode_isswap(vp)) {
-               return 0;
+       error = priv_check_cred(vfs_context_ucred(ctx),
+           PRIV_VFS_DATALESS_RESOLVER, 0);
+       if (error) {
+               return error;
        }
 
-       if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) {
-               // the change time is within this epoch
-               int error;
+       if (is_resolver) {
+               NSPACE_REQ_LOCK();
 
-               error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg);
-               if (error == EDEADLK) {
-                       snapshot_error = 0;
-               } else if (error) {
-                       if (error == EAGAIN) {
-                               printf("nspace_snapshot_event: timed out waiting for namespace handler...\n");
-                       } else if (error == EINTR) {
-                               // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n");
-                               snapshot_error = EINTR;
-                       }
+               if (nspace_resolver_proc == NULL) {
+                       proc_lock(p);
+                       p->p_lflag |= P_LNSPACE_RESOLVER;
+                       proc_unlock(p);
+                       nspace_resolver_proc = p;
+               } else {
+                       error = EBUSY;
                }
+
+               NSPACE_REQ_UNLOCK();
+       } else {
+               // This is basically just like the exit case.
+               // nspace_resolver_exited() will verify that the
+               // process is the resolver, and will clear the
+               // global.
+               nspace_resolver_exited(p);
        }
 
-       return snapshot_error;
+       return error;
 }
 
-int
-get_nspace_item_status(struct vnode *vp, int32_t *status)
+static int
+nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
 {
-       int i;
-
-       lck_mtx_lock(&nspace_handler_lock);
-       for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-               if (nspace_items[i].vp == vp) {
-                       break;
-               }
+       if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
+           (p->p_vfs_iopolicy &
+           P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
+               *is_prevented = 1;
+       } else {
+               *is_prevented = 0;
        }
+       return 0;
+}
 
-       if (i >= MAX_NSPACE_ITEMS) {
-               lck_mtx_unlock(&nspace_handler_lock);
-               return ENOENT;
+static int
+nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
+{
+       if (p->p_lflag & P_LNSPACE_RESOLVER) {
+               return is_prevented ? 0 : EBUSY;
        }
 
-       *status = nspace_items[i].flags;
-       lck_mtx_unlock(&nspace_handler_lock);
+       if (is_prevented) {
+               OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
+       } else {
+               OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
+       }
        return 0;
 }
 
-
-#if 0
 static int
-build_volfs_path(struct vnode *vp, char *path, int *len)
+nspace_materialization_get_thread_state(int *is_prevented)
 {
-       struct vnode_attr va;
-       int ret;
+       uthread_t ut = get_bsdthread_info(current_thread());
 
-       VATTR_INIT(&va);
-       VATTR_WANTED(&va, va_fsid);
-       VATTR_WANTED(&va, va_fileid);
+       *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
+       return 0;
+}
 
-       if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
-               *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
-               ret = -1;
+static int
+nspace_materialization_set_thread_state(int is_prevented)
+{
+       uthread_t ut = get_bsdthread_info(current_thread());
+
+       if (is_prevented) {
+               ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
        } else {
-               *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
-               ret = 0;
+               ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
        }
-
-       return ret;
+       return 0;
 }
-#endif
 
-//
-// Note: this function does NOT check permissions on all of the
-// parent directories leading to this vnode.  It should only be
-// called on behalf of a root process.  Otherwise a process may
-// get access to a file because the file itself is readable even
-// though its parent directories would prevent access.
-//
 static int
-vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
+nspace_materialization_is_prevented(void)
 {
-       int error, action;
+       proc_t p = current_proc();
+       uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
+       vfs_context_t ctx = vfs_context_current();
 
-       if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
-               return error;
+       /*
+        * Kernel context ==> return EDEADLK, as we would with any random
+        * process decorated as no-materialize.
+        */
+       if (ctx == vfs_context_kernel()) {
+               return EDEADLK;
        }
 
-#if CONFIG_MACF
-       error = mac_vnode_check_open(ctx, vp, fmode);
-       if (error) {
-               return error;
+       /*
+        * If the process has the dataless-manipulation entitlement,
+        * materialization is prevented, and depending on the kind
+        * of file system operation, things get to proceed as if the
+        * object is not dataless.
+        */
+       if (vfs_context_is_dataless_manipulator(ctx)) {
+               return EJUSTRETURN;
        }
-#endif
 
-       /* compute action to be authorized */
-       action = 0;
-       if (fmode & FREAD) {
-               action |= KAUTH_VNODE_READ_DATA;
-       }
-       if (fmode & (FWRITE | O_TRUNC)) {
-               /*
-                * If we are writing, appending, and not truncating,
-                * indicate that we are appending so that if the
-                * UF_APPEND or SF_APPEND bits are set, we do not deny
-                * the open.
-                */
-               if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
-                       action |= KAUTH_VNODE_APPEND_DATA;
-               } else {
-                       action |= KAUTH_VNODE_WRITE_DATA;
+       /*
+        * Per-thread decorations override any process-wide decorations.
+        * (Foundation uses this, and this overrides even the dataless-
+        * manipulation entitlement so as to make API contracts consistent.)
+        */
+       if (ut != NULL) {
+               if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
+                       return EDEADLK;
+               }
+               if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
+                       return 0;
                }
        }
 
-       if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) {
-               return error;
+       /*
+        * If the process's iopolicy specifies that dataless files
+        * can be materialized, then we let it go ahead.
+        */
+       if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
+               return 0;
        }
 
+       /*
+        * The default behavior is to not materialize dataless files;
+        * return to the caller that deadlock was detected.
+        */
+       return EDEADLK;
+}
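
The P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES bit checked above is normally flipped from userland via setiopolicy_np(3); the sketch below is an assumption about that interface (the IOPOL_* names are taken from sys/resource.h of this era, not from this diff):

/*
 * Hedged sketch (assumption, not from this diff): toggle the
 * process-wide materialize-dataless-files iopolicy that backs
 * P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES.
 */
#include <sys/resource.h>

static int
allow_dataless_faults(int on)
{
        return setiopolicy_np(IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES,
            IOPOL_SCOPE_PROCESS,
            on ? IOPOL_MATERIALIZE_DATALESS_FILES_ON
               : IOPOL_MATERIALIZE_DATALESS_FILES_OFF);
}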
 
-       //
-       // if the vnode is tagged VOPENEVT and the current process
-       // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
-       // flag to the open mode so that this open won't count against
-       // the vnode when carbon delete() does a vnode_isinuse() to see
-       // if a file is currently in use.  this allows spotlight
-       // importers to not interfere with carbon apps that depend on
-       // the no-delete-if-busy semantics of carbon delete().
-       //
-       if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
-               fmode |= O_EVTONLY;
-       }
+/* the vfs.nspace branch */
+SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
 
-       if ((error = VNOP_OPEN(vp, fmode, ctx))) {
+static int
+sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
+    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       struct proc *p = req->p;
+       int new_value, old_value, changed = 0;
+       int error;
+
+       error = nspace_resolver_get_proc_state(p, &old_value);
+       if (error) {
                return error;
        }
-       if ((error = vnode_ref_ext(vp, fmode, 0))) {
-               VNOP_CLOSE(vp, fmode, ctx);
-               return error;
+
+       error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
+           &changed);
+       if (error == 0 && changed) {
+               error = nspace_resolver_set_proc_state(p, new_value);
        }
+       return error;
+}
 
-       /* Call out to allow 3rd party notification of open.
-        * Ignore result of kauth_authorize_fileop call.
-        */
-#if CONFIG_MACF
-       mac_vnode_notify_open(ctx, vp, fmode);
-#endif
-       kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
-           (uintptr_t)vp, 0);
+/* decorate this process as the dataless file resolver */
+SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+    0, 0, sysctl_nspace_resolver, "I", "");
 
+static int
+sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
+    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       struct proc *p = req->p;
+       int new_value, old_value, changed = 0;
+       int error;
 
-       return 0;
+       error = nspace_materialization_get_proc_state(p, &old_value);
+       if (error) {
+               return error;
+       }
+
+       error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
+           &changed);
+       if (error == 0 && changed) {
+               error = nspace_materialization_set_proc_state(p, new_value);
+       }
+       return error;
 }
 
+/* decorate this process as not wanting to materialize dataless files */
+SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+    0, 0, sysctl_nspace_prevent_materialization, "I", "");
+
 static int
-wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
+sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
+    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
 {
-       int i;
-       int error = 0;
-       int unblock = 0;
-       task_t curtask;
+       int new_value, old_value, changed = 0;
+       int error;
 
-       lck_mtx_lock(&nspace_handler_exclusion_lock);
-       if (nspace_handlers[nspace_type].handler_busy) {
-               lck_mtx_unlock(&nspace_handler_exclusion_lock);
-               return EBUSY;
+       error = nspace_materialization_get_thread_state(&old_value);
+       if (error) {
+               return error;
+       }
+
+       error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
+           &changed);
+       if (error == 0 && changed) {
+               error = nspace_materialization_set_thread_state(new_value);
        }
+       return error;
+}
 
-       nspace_handlers[nspace_type].handler_busy = 1;
-       lck_mtx_unlock(&nspace_handler_exclusion_lock);
+/* decorate this thread as not wanting to materialize dataless files */
+SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
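
Taken together, the two prevent-materialization sysctls above can be driven from userland roughly as follows; a hedged sketch assuming only sysctlbyname(3), not part of this commit:

/*
 * Hedged sketch: decorate the calling process or the calling thread
 * as not wanting dataless materialization, via the sysctl nodes
 * declared above.  Pass prevent=0 to clear the decoration.
 */
#include <sys/sysctl.h>

static int
prevent_materialization(int process_wide, int prevent)
{
        const char *name = process_wide ?
            "vfs.nspace.prevent_materialization" :
            "vfs.nspace.thread_prevent_materialization";

        return sysctlbyname(name, NULL, NULL, &prevent, sizeof(prevent));
}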
 
-       /*
-        * Any process that gets here will be one of the namespace handlers.
-        * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
-        * as we can cause deadlocks to occur, because the namespace handler may prevent
-        * VNOP_INACTIVE from proceeding.  Mark the current task as a P_DEPENDENCY_CAPABLE
-        * process.
-        */
-       curtask = current_task();
-       bsd_set_dependency_capable(curtask);
+static int
+sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
+    __unused int arg2, struct sysctl_req *req)
+{
+       struct proc *p = req->p;
+       uint32_t req_status[2] = { 0, 0 };
+       int error, is_resolver, changed = 0;
 
-       lck_mtx_lock(&nspace_handler_lock);
-       if (nspace_handlers[nspace_type].handler_proc == NULL) {
-               nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
-               nspace_handlers[nspace_type].handler_proc = current_proc();
+       error = nspace_resolver_get_proc_state(p, &is_resolver);
+       if (error) {
+               return error;
        }
 
-       if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
-           (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
-               error = EINVAL;
+       if (!is_resolver) {
+               return EPERM;
        }
 
-       while (error == 0) {
-               /* Try to find matching namespace item */
-               for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                       if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
-                               if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
-                                       break;
-                               }
-                       }
-               }
-
-               if (i >= MAX_NSPACE_ITEMS) {
-                       /* Nothing is there yet. Wait for wake up and retry */
-                       error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS | PCATCH, "namespace-items", 0);
-                       if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
-                               /* Prevent infinite loop if snapshot handler exited */
-                               error = EINVAL;
-                               break;
-                       }
-                       continue;
-               }
+       error = sysctl_io_opaque(req, req_status, sizeof(req_status),
+           &changed);
+       if (error) {
+               return error;
+       }
 
-               nspace_items[i].flags  &= ~NSPACE_ITEM_NEW;
-               nspace_items[i].flags  |= NSPACE_ITEM_PROCESSING;
-               nspace_items[i].token  = ++nspace_token_id;
+       /*
+        * req_status[0] is the req_id
+        *
+        * req_status[1] is the errno
+        */
+       if (error == 0 && changed) {
+               nspace_resolver_req_completed(req_status[0],
+                   (int)req_status[1]);
+       }
+       return error;
+}
 
-               assert(nspace_items[i].vp);
-               struct fileproc *fp;
-               int32_t indx;
-               int32_t fmode;
-               struct proc *p = current_proc();
-               vfs_context_t ctx = vfs_context_current();
-               struct vnode_attr va;
-               bool vn_get_succsessful = false;
-               bool vn_open_successful = false;
-               bool fp_alloc_successful = false;
+/* Resolver reports completed reqs here. */
+SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
+    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+    0, 0, sysctl_nspace_complete, "-", "");
 
-               /*
-                * Use vnode pointer to acquire a file descriptor for
-                * hand-off to userland
-                */
-               fmode = nspace_open_flags_for_type(nspace_type);
-               error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
-               if (error) {
-                       goto cleanup;
-               }
-               vn_get_succsessful = true;
+#endif /* CONFIG_DATALESS_FILES */
 
-               error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
-               if (error) {
-                       goto cleanup;
-               }
-               vn_open_successful = true;
+#if CONFIG_DATALESS_FILES
+#define __no_dataless_unused    /* nothing */
+#else
+#define __no_dataless_unused    __unused
+#endif
 
-               error = falloc(p, &fp, &indx, ctx);
-               if (error) {
-                       goto cleanup;
-               }
-               fp_alloc_successful = true;
+void
+nspace_resolver_init(void)
+{
+#if CONFIG_DATALESS_FILES
+       nspace_resolver_request_lck_grp =
+           lck_grp_alloc_init("file namespace resolver", NULL);
 
-               fp->f_fglob->fg_flag = fmode;
-               fp->f_fglob->fg_ops = &vnops;
-               fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
+       lck_mtx_init(&nspace_resolver_request_hash_mutex,
+           nspace_resolver_request_lck_grp, NULL);
 
-               proc_fdlock(p);
-               procfdtbl_releasefd(p, indx, NULL);
-               fp_drop(p, indx, fp, 1);
-               proc_fdunlock(p);
+       nspace_resolver_request_hashtbl =
+           hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
+           M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
+#endif /* CONFIG_DATALESS_FILES */
+}
 
-               /*
-                * All variants of the namespace handler struct support these three fields:
-                * token, flags, and the FD pointer
-                */
-               error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
-               if (error) {
-                       goto cleanup;
-               }
-               error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
-               if (error) {
-                       goto cleanup;
-               }
-               error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
-               if (error) {
-                       goto cleanup;
-               }
+void
+nspace_resolver_exited(struct proc *p __no_dataless_unused)
+{
+#if CONFIG_DATALESS_FILES
+       struct nspace_resolver_requesthead *bucket;
+       struct nspace_resolver_request *req;
+       u_long idx;
 
-               /*
-                * Handle optional fields:
-                * extended version support an info ptr (offset, length), and the
-                *
-                * namedata version supports a unique per-link object ID
-                *
-                */
-               if (nhd->infoptr) {
-                       uio_t uio = (uio_t)nspace_items[i].arg;
-                       uint64_t u_offset, u_length;
+       NSPACE_REQ_LOCK();
 
-                       if (uio) {
-                               u_offset = uio_offset(uio);
-                               u_length = uio_resid(uio);
-                       } else {
-                               u_offset = 0;
-                               u_length = 0;
-                       }
-                       error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
-                       if (error) {
-                               goto cleanup;
-                       }
-                       error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t));
-                       if (error) {
-                               goto cleanup;
+       if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
+           p == nspace_resolver_proc) {
+               for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
+                       bucket = &nspace_resolver_request_hashtbl[idx];
+                       LIST_FOREACH(req, bucket, r_hashlink) {
+                               nspace_resolver_req_mark_complete(req,
+                                   ETIMEDOUT);
                        }
                }
+               nspace_resolver_proc = NULL;
+       }
 
-               if (nhd->objid) {
-                       VATTR_INIT(&va);
-                       VATTR_WANTED(&va, va_linkid);
-                       error = vnode_getattr(nspace_items[i].vp, &va, ctx);
-                       if (error) {
-                               goto cleanup;
-                       }
+       NSPACE_REQ_UNLOCK();
+#endif /* CONFIG_DATALESS_FILES */
+}
 
-                       uint64_t linkid = 0;
-                       if (VATTR_IS_SUPPORTED(&va, va_linkid)) {
-                               linkid = (uint64_t)va.va_linkid;
-                       }
-                       error = copyout(&linkid, nhd->objid, sizeof(uint64_t));
-               }
-cleanup:
-               if (error) {
-                       if (fp_alloc_successful) {
-                               fp_free(p, indx, fp);
-                       }
-                       if (vn_open_successful) {
-                               vn_close(nspace_items[i].vp, fmode, ctx);
-                       }
-                       unblock = 1;
-               }
+int
+resolve_nspace_item(struct vnode *vp, uint64_t op)
+{
+       return resolve_nspace_item_ext(vp, op, NULL);
+}
 
-               if (vn_get_succsessful) {
-                       vnode_put(nspace_items[i].vp);
-               }
+#define DATALESS_RESOLVER_ENTITLEMENT     \
+       "com.apple.private.vfs.dataless-resolver"
+#define DATALESS_MANIPULATION_ENTITLEMENT \
+       "com.apple.private.vfs.dataless-manipulation"
 
-               break;
-       }
+/*
+ * Return TRUE if the vfs context is associated with a process entitled
+ * for dataless manipulation.
+ *
+ * XXX Arguably belongs in vfs_subr.c, but is here because of the
+ * complication around CONFIG_DATALESS_FILES.
+ */
+boolean_t
+vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
+{
+#if CONFIG_DATALESS_FILES
+       assert(ctx->vc_thread == current_thread());
+       task_t const task = current_task();
+       return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
+              IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
+#else
+       return false;
+#endif /* CONFIG_DATALESS_FILES */
+}
 
-       if (unblock) {
-               if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
-                       vnode_lock_spin(nspace_items[i].vp);
-                       nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
-                       vnode_unlock(nspace_items[i].vp);
-               }
-               nspace_items[i].vp = NULL;
-               nspace_items[i].vid = 0;
-               nspace_items[i].flags = NSPACE_ITEM_DONE;
-               nspace_items[i].token = 0;
+int
+resolve_nspace_item_ext(
+       struct vnode *vp __no_dataless_unused,
+       uint64_t op __no_dataless_unused,
+       void *arg __unused)
+{
+#if CONFIG_DATALESS_FILES
+       int error;
+       mach_port_t mp;
+       char *path = NULL;
+       int path_len;
+       kern_return_t kr;
+       struct nspace_resolver_request req;
 
-               wakeup((caddr_t)&(nspace_items[i].vp));
+       // only allow namespace events on regular files, directories and symlinks.
+       if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
+               return EFTYPE;
        }
 
-       if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
-               // just go through every snapshot event and unblock it immediately.
-               if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
-                       for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                               if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
-                                       if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
-                                               nspace_items[i].vp = NULL;
-                                               nspace_items[i].vid = 0;
-                                               nspace_items[i].flags = NSPACE_ITEM_DONE;
-                                               nspace_items[i].token = 0;
-
-                                               wakeup((caddr_t)&(nspace_items[i].vp));
-                                       }
-                               }
-                       }
-               }
+       //
+       // if this is a snapshot event and the vnode is on a
+       // disk image just pretend nothing happened since any
+       // change to the disk image will cause the disk image
+       // itself to get backed up and this avoids multi-way
+       // deadlocks between the snapshot handler and the ever
+       // popular diskimages-helper process.  the variable
+       // nspace_allow_virtual_devs allows this behavior to
+       // be overridden (for use by the Mobile TimeMachine
+       // testing infrastructure which uses disk images)
+       //
+       if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
+               os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
+               return ENOTSUP;
        }
 
-       lck_mtx_unlock(&nspace_handler_lock);
+       error = nspace_materialization_is_prevented();
+       if (error) {
+               os_log_debug(OS_LOG_DEFAULT,
+                   "NSPACE process/thread is decorated as no-materialization");
+               return error;
+       }
 
-       lck_mtx_lock(&nspace_handler_exclusion_lock);
-       nspace_handlers[nspace_type].handler_busy = 0;
-       lck_mtx_unlock(&nspace_handler_exclusion_lock);
+       kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
+       if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
+               os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
+               // Treat this like being unable to access the backing
+               // store server.
+               return ETIMEDOUT;
+       }
 
-       return error;
-}
+       MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
+       if (path == NULL) {
+               error = ENOMEM;
+               goto out_release_port;
+       }
+       path_len = MAXPATHLEN;
 
-static inline int
-validate_namespace_args(int is64bit, int size)
-{
-       if (is64bit) {
-               /* Must be one of these */
-               if (size == sizeof(user64_namespace_handler_info)) {
-                       goto sizeok;
-               }
-               if (size == sizeof(user64_namespace_handler_info_ext)) {
-                       goto sizeok;
-               }
-               if (size == sizeof(user64_namespace_handler_data)) {
-                       goto sizeok;
-               }
-               return EINVAL;
-       } else {
-               /* 32 bit -- must be one of these */
-               if (size == sizeof(user32_namespace_handler_info)) {
-                       goto sizeok;
-               }
-               if (size == sizeof(user32_namespace_handler_info_ext)) {
-                       goto sizeok;
+       error = vn_getpath(vp, path, &path_len);
+       if (error == 0) {
+               int xxx_rdar44371223;   /* XXX Mig bug */
+               req.r_req_id = next_nspace_req_id();
+               req.r_resolver_error = 0;
+               req.r_flags = 0;
+
+               NSPACE_REQ_LOCK();
+               error = nspace_resolver_req_add(&req);
+               NSPACE_REQ_UNLOCK();
+               if (error) {
+                       goto out_release_port;
                }
-               if (size == sizeof(user32_namespace_handler_data)) {
-                       goto sizeok;
+
+               os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
+               kr = send_nspace_resolve_path(mp, req.r_req_id,
+                   current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
+                   path, &xxx_rdar44371223);
+               if (kr != KERN_SUCCESS) {
+                       // Also treat this like being unable to access
+                       // the backing store server.
+                       os_log_error(OS_LOG_DEFAULT,
+                           "NSPACE resolve_path failure: %d", kr);
+                       error = ETIMEDOUT;
+
+                       NSPACE_REQ_LOCK();
+                       nspace_resolver_req_remove(&req);
+                       NSPACE_REQ_UNLOCK();
+                       goto out_release_port;
                }
-               return EINVAL;
+
+               // Give back the memory we allocated earlier while
+               // we wait; we no longer need it.
+               FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
+               path = NULL;
+
+               // Request has been submitted to the resolver.
+               // Now (interruptibly) wait for completion.
+               // Upon return, the request will have been removed
+               // from the lookup table.
+               error = nspace_resolver_req_wait(&req);
        }
 
-sizeok:
+out_release_port:
+       if (path != NULL) {
+               FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
+       }
+       ipc_port_release_send(mp);
+
+       return error;
+#else
+       return ENOTSUP;
+#endif /* CONFIG_DATALESS_FILES */
+}
 
+int
+nspace_snapshot_event(__unused vnode_t vp, __unused  time_t ctime,
+    __unused uint64_t op_type, __unused void *arg)
+{
        return 0;
 }
 
+#if 0
 static int
-process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
+build_volfs_path(struct vnode *vp, char *path, int *len)
 {
-       int error = 0;
-       namespace_handler_data nhd;
-
-       bzero(&nhd, sizeof(namespace_handler_data));
-
-       if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
-               return error;
-       }
-
-       error = validate_namespace_args(is64bit, size);
-       if (error) {
-               return error;
-       }
-
-       /* Copy in the userland pointers into our kernel-only struct */
+       struct vnode_attr va;
+       int ret;
 
-       if (is64bit) {
-               /* 64 bit userland structures */
-               nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
-               nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
-               nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
+       VATTR_INIT(&va);
+       VATTR_WANTED(&va, va_fsid);
+       VATTR_WANTED(&va, va_fileid);
 
-               /* If the size is greater than the standard info struct, add in extra fields */
-               if (size > (sizeof(user64_namespace_handler_info))) {
-                       if (size >= (sizeof(user64_namespace_handler_info_ext))) {
-                               nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
-                       }
-                       if (size == (sizeof(user64_namespace_handler_data))) {
-                               nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
-                       }
-                       /* Otherwise the fields were pre-zeroed when we did the bzero above. */
-               }
+       if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
+               *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
+               ret = -1;
        } else {
-               /* 32 bit userland structures */
-               nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
-               nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
-               nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
-
-               if (size > (sizeof(user32_namespace_handler_info))) {
-                       if (size >= (sizeof(user32_namespace_handler_info_ext))) {
-                               nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
-                       }
-                       if (size == (sizeof(user32_namespace_handler_data))) {
-                               nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
-                       }
-                       /* Otherwise the fields were pre-zeroed when we did the bzero above. */
-               }
+               *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
+               ret = 0;
        }
 
-       return wait_for_namespace_event(&nhd, nspace_type);
+       return ret;
 }
+#endif
 
 static unsigned long
 fsctl_bogus_command_compat(unsigned long cmd)
@@ -10493,22 +10904,6 @@ fsctl_bogus_command_compat(unsigned long cmd)
                return FSIOC_ROUTEFS_SETROUTEID;
        case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
                return FSIOC_SET_PACKAGE_EXTS;
-       case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET):
-               return FSIOC_NAMESPACE_HANDLER_GET;
-       case IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET):
-               return FSIOC_OLD_SNAPSHOT_HANDLER_GET;
-       case IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT):
-               return FSIOC_SNAPSHOT_HANDLER_GET_EXT;
-       case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE):
-               return FSIOC_NAMESPACE_HANDLER_UPDATE;
-       case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK):
-               return FSIOC_NAMESPACE_HANDLER_UNBLOCK;
-       case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL):
-               return FSIOC_NAMESPACE_HANDLER_CANCEL;
-       case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME):
-               return FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME;
-       case IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS):
-               return FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS;
        case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
                return FSIOC_SET_FSTYPENAME_OVERRIDE;
        case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
@@ -10528,6 +10923,12 @@ fsctl_bogus_command_compat(unsigned long cmd)
        return cmd;
 }
 
+static int
+cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
+{
+       return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
+}
+
 /*
  * Make a filesystem-specific control call:
  */
@@ -10543,6 +10944,10 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
        caddr_t data, memp;
        vnode_t vp = *arg_vp;
 
+       if (vp->v_type == VCHR || vp->v_type == VBLK) {
+               return ENOTTY;
+       }
+
        cmd = fsctl_bogus_command_compat(cmd);
 
        size = IOCPARM_LEN(cmd);
@@ -10596,8 +11001,10 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
        /* Check to see if it's a generic command */
        switch (cmd) {
        case FSIOC_SYNC_VOLUME: {
+               struct vfs_attr vfa;
                mount_t mp = vp->v_mount;
-               int arg = *(uint32_t*)data;
+               unsigned arg;
+
 
                /* record vid of vp so we can drop it below. */
                uint32_t vvid = vp->v_id;
@@ -10613,8 +11020,27 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
                }
                vnode_put(vp);
 
+               arg = MNT_NOWAIT;
+               if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
+                       arg = MNT_WAIT;
+               }
+
+               /*
+                * If the filesystem supports multiple filesystems in a
+                * partition (e.g. APFS volumes in a container), it knows
+                * that the waitfor argument to VFS_SYNC is a set of flags.
+                */
+               VFSATTR_INIT(&vfa);
+               VFSATTR_WANTED(&vfa, f_capabilities);
+               if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
+                   VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
+                   ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
+                   ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
+                       arg |= MNT_VOLUME;
+               }
+
                /* issue the sync for this volume */
-               (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
+               (void)sync_callback(mp, &arg);
 
                /*
                 * Then release the mount_iterref once we're done syncing; it's not
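
With this hunk, FSIOC_SYNC_VOLUME always hands sync_callback() a flags word: MNT_NOWAIT or MNT_WAIT depending on FSCTL_SYNC_WAIT, plus MNT_VOLUME when the format advertises VOL_CAP_FMT_SHARED_SPACE, so a shared-space container (e.g. an APFS container) can sync just the addressed volume. A sketch of the calling side, assuming the FSIOC_SYNC_VOLUME/FSCTL_SYNC_WAIT definitions from <sys/fsctl.h>:

    #include <err.h>
    #include <sys/fsctl.h>

    /* Synchronously flush one mounted volume (sketch). */
    static void
    sync_volume(const char *mntpoint)
    {
            uint32_t flags = FSCTL_SYNC_WAIT;    /* 0 would request async */

            if (fsctl(mntpoint, FSIOC_SYNC_VOLUME, &flags, 0) == -1)
                    err(1, "FSIOC_SYNC_VOLUME on %s", mntpoint);
    }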
@@ -10687,191 +11113,6 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
        }
        break;
 
-       /* namespace handlers */
-       case FSIOC_NAMESPACE_HANDLER_GET: {
-               error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
-       }
-       break;
-
-       /* Snapshot handlers */
-       case FSIOC_OLD_SNAPSHOT_HANDLER_GET: {
-               error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
-       }
-       break;
-
-       case FSIOC_SNAPSHOT_HANDLER_GET_EXT: {
-               error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
-       }
-       break;
-
-       case FSIOC_NAMESPACE_HANDLER_UPDATE: {
-               uint32_t token, val;
-               int i;
-
-               if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
-                       break;
-               }
-
-               if (!nspace_is_special_process(p)) {
-                       error = EINVAL;
-                       break;
-               }
-
-               token = ((uint32_t *)data)[0];
-               val   = ((uint32_t *)data)[1];
-
-               lck_mtx_lock(&nspace_handler_lock);
-
-               for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                       if (nspace_items[i].token == token) {
-                               break;          /* exit for loop, not case stmt */
-                       }
-               }
-
-               if (i >= MAX_NSPACE_ITEMS) {
-                       error = ENOENT;
-               } else {
-                       //
-                       // if this bit is set, when resolve_nspace_item() times out
-                       // it will loop and go back to sleep.
-                       //
-                       nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
-               }
-
-               lck_mtx_unlock(&nspace_handler_lock);
-
-               if (error) {
-                       printf("nspace-handler-update: did not find token %u\n", token);
-               }
-       }
-       break;
-
-       case FSIOC_NAMESPACE_HANDLER_UNBLOCK: {
-               uint32_t token, val;
-               int i;
-
-               if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
-                       break;
-               }
-
-               if (!nspace_is_special_process(p)) {
-                       error = EINVAL;
-                       break;
-               }
-
-               token = ((uint32_t *)data)[0];
-               val   = ((uint32_t *)data)[1];
-
-               lck_mtx_lock(&nspace_handler_lock);
-
-               for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                       if (nspace_items[i].token == token) {
-                               break;         /* exit for loop, not case statement */
-                       }
-               }
-
-               if (i >= MAX_NSPACE_ITEMS) {
-                       printf("nspace-handler-unblock: did not find token %u\n", token);
-                       error = ENOENT;
-               } else {
-                       if (val == 0 && nspace_items[i].vp) {
-                               vnode_lock_spin(nspace_items[i].vp);
-                               nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
-                               vnode_unlock(nspace_items[i].vp);
-                       }
-
-                       nspace_items[i].vp = NULL;
-                       nspace_items[i].arg = NULL;
-                       nspace_items[i].op = 0;
-                       nspace_items[i].vid = 0;
-                       nspace_items[i].flags = NSPACE_ITEM_DONE;
-                       nspace_items[i].token = 0;
-
-                       wakeup((caddr_t)&(nspace_items[i].vp));
-               }
-
-               lck_mtx_unlock(&nspace_handler_lock);
-       }
-       break;
-
-       case FSIOC_NAMESPACE_HANDLER_CANCEL: {
-               uint32_t token, val;
-               int i;
-
-               if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
-                       break;
-               }
-
-               if (!nspace_is_special_process(p)) {
-                       error = EINVAL;
-                       break;
-               }
-
-               token = ((uint32_t *)data)[0];
-               val   = ((uint32_t *)data)[1];
-
-               lck_mtx_lock(&nspace_handler_lock);
-
-               for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                       if (nspace_items[i].token == token) {
-                               break;          /* exit for loop, not case stmt */
-                       }
-               }
-
-               if (i >= MAX_NSPACE_ITEMS) {
-                       printf("nspace-handler-cancel: did not find token %u\n", token);
-                       error = ENOENT;
-               } else {
-                       if (nspace_items[i].vp) {
-                               vnode_lock_spin(nspace_items[i].vp);
-                               nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
-                               vnode_unlock(nspace_items[i].vp);
-                       }
-
-                       nspace_items[i].vp = NULL;
-                       nspace_items[i].arg = NULL;
-                       nspace_items[i].vid = 0;
-                       nspace_items[i].token = val;
-                       nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
-                       nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
-
-                       wakeup((caddr_t)&(nspace_items[i].vp));
-               }
-
-               lck_mtx_unlock(&nspace_handler_lock);
-       }
-       break;
-
-       case FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
-               if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
-                       break;
-               }
-
-               // we explicitly do not do the namespace_handler_proc check here
-
-               lck_mtx_lock(&nspace_handler_lock);
-               snapshot_timestamp = ((uint32_t *)data)[0];
-               wakeup(&nspace_item_idx);
-               lck_mtx_unlock(&nspace_handler_lock);
-               printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
-       }
-       break;
-
-       case FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
-       {
-               if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
-                       break;
-               }
-
-               lck_mtx_lock(&nspace_handler_lock);
-               nspace_allow_virtual_devs = ((uint32_t *)data)[0];
-               lck_mtx_unlock(&nspace_handler_lock);
-               printf("nspace-snapshot-handler will%s allow events on disk-images\n",
-                   nspace_allow_virtual_devs ? "" : " NOT");
-               error = 0;
-       }
-       break;
-
        case FSIOC_SET_FSTYPENAME_OVERRIDE:
        {
                if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
@@ -10908,6 +11149,17 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
        }
        break;
 
+       case FSIOC_CAS_BSDFLAGS: {
+               struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
+               struct vnode_attr va;
+
+               VATTR_INIT(&va);
+               VATTR_SET(&va, va_flags, cas->new_flags);
+
+               error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
+       }
+       break;
+
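FSIOC_CAS_BSDFLAGS gives userspace a compare-and-swap update of a file's BSD flags: chflags0() revalidates authorization, and the filesystem swaps the flags only if they still match the expected value. A sketch of the calling side; the field names of struct fsioc_cas_bsdflags (expected_flags/new_flags/actual_flags) are taken from this release's <sys/fsctl.h> and should be treated as an assumption:

    #include <err.h>
    #include <sys/fsctl.h>
    #include <sys/stat.h>

    /* Atomically set UF_HIDDEN only if nobody changed the flags under us. */
    static void
    hide_file(int fd)
    {
            struct fsioc_cas_bsdflags cas = {
                    .expected_flags = 0,          /* flags we last observed */
                    .new_flags      = UF_HIDDEN,  /* desired replacement */
            };

            if (ffsctl(fd, FSIOC_CAS_BSDFLAGS, &cas, 0) == -1)
                    err(1, "FSIOC_CAS_BSDFLAGS");
            /* cas.actual_flags reports what the FS saw at swap time. */
    }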
        case FSIOC_FD_ONLY_OPEN_ONCE: {
                if (vnode_usecount(vp) > 1) {
                        error = EBUSY;
@@ -10993,6 +11245,9 @@ fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
        if ((uap->options & FSOPT_NOFOLLOW) == 0) {
                nameiflags |= FOLLOW;
        }
+       if (uap->cmd == FSIOC_FIRMLINK_CTL) {
+               nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
+       }
        NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
            UIO_USERSPACE, uap->path, ctx);
        if ((error = namei(&nd))) {
@@ -11509,9 +11764,8 @@ flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
 }
 
 static int
-fsgetpath_internal(
-       vfs_context_t ctx, int volfs_id, uint64_t objid,
-       vm_size_t bufsize, caddr_t buf, int *pathlen)
+fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
+    vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
 {
        int error;
        struct mount *mp = NULL;
@@ -11537,7 +11791,25 @@ retry:
 
 unionget:
        if (objid == 2) {
-               error = VFS_ROOT(mp, &vp, ctx);
+               struct vfs_attr vfsattr;
+               int use_vfs_root = TRUE;
+
+               VFSATTR_INIT(&vfsattr);
+               VFSATTR_WANTED(&vfsattr, f_capabilities);
+               if (!(options & FSOPT_ISREALFSID) &&
+                   vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
+                   VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
+                       if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
+                           (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
+                               use_vfs_root = FALSE;
+                       }
+               }
+
+               if (use_vfs_root) {
+                       error = VFS_ROOT(mp, &vp, ctx);
+               } else {
+                       error = VFS_VGET(mp, objid, &vp, ctx);
+               }
        } else {
                error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
        }
@@ -11572,6 +11844,9 @@ unionget:
 
        /* Obtain the absolute path to this vnode. */
        bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
+       if (options & FSOPT_NOFIRMLINKPATH) {
+               bpflags |= BUILDPATH_NO_FIRMLINK;
+       }
        bpflags |= BUILDPATH_CHECK_MOVED;
        error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
        vnode_put(vp);
@@ -11619,8 +11894,9 @@ out:
 /*
  * Obtain the full pathname of a file system object by id.
  */
-int
-fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
+static int
+fsgetpath_extended(user_addr_t buf, int bufsize, user_addr_t user_fsid, uint64_t objid,
+    uint32_t options, user_ssize_t *retval)
 {
        vfs_context_t ctx = vfs_context_current();
        fsid_t fsid;
@@ -11628,30 +11904,33 @@ fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
        int length;
        int error;
 
-       if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
+       if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
+               return EINVAL;
+       }
+
+       if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
                return error;
        }
        AUDIT_ARG(value32, fsid.val[0]);
-       AUDIT_ARG(value64, uap->objid);
+       AUDIT_ARG(value64, objid);
        /* Restrict output buffer size for now. */
 
-       if (uap->bufsize > PAGE_SIZE) {
+       if (bufsize > PAGE_SIZE || bufsize <= 0) {
                return EINVAL;
        }
-       MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK | M_ZERO);
+       MALLOC(realpath, char *, bufsize, M_TEMP, M_WAITOK | M_ZERO);
        if (realpath == NULL) {
                return ENOMEM;
        }
 
-       error = fsgetpath_internal(
-               ctx, fsid.val[0], uap->objid,
-               uap->bufsize, realpath, &length);
+       error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
+           options, &length);
 
        if (error) {
                goto out;
        }
 
-       error = copyout((caddr_t)realpath, uap->buf, length);
+       error = copyout((caddr_t)realpath, buf, length);
 
        *retval = (user_ssize_t)length; /* may be superseded by error */
 out:
@@ -11661,6 +11940,20 @@ out:
        return error;
 }
 
+int
+fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
+{
+       return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
+                  0, retval);
+}
+
+int
+fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
+{
+       return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
+                  uap->options, retval);
+}
+
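fsgetpath() keeps its old behavior while fsgetpath_ext() threads a new options word through: FSOPT_NOFIRMLINKPATH builds the path without firmlink traversal (BUILDPATH_NO_FIRMLINK), and FSOPT_ISREALFSID controls whether object id 2 resolves via VFS_ROOT or VFS_VGET on volume-group filesystems. The long-standing userspace wrapper (macOS 10.13+, <sys/fsgetpath.h>) is enough to show the shape of the call:

    #include <limits.h>
    #include <sys/fsgetpath.h>
    #include <sys/mount.h>
    #include <sys/stat.h>

    /* Recover an absolute path from an (fsid, inode) pair (sketch). */
    static ssize_t
    path_for_object(const char *sample, char *buf, size_t buflen)
    {
            struct statfs sfs;
            struct stat st;

            if (stat(sample, &st) != 0 || statfs(sample, &sfs) != 0)
                    return -1;
            return fsgetpath(buf, buflen, &sfs.f_fsid, (uint64_t)st.st_ino);
    }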
 /*
  * Common routine to handle various flavors of statfs data heading out
  *     to user space.
index e6ffc2c720e93e1f44330771de7a0cf90847bf8a..cadc0d36740283d0721ae392a825f0edcb98ab57 100644 (file)
@@ -123,30 +123,27 @@ static int vn_write(struct fileproc *fp, struct uio *uio, int flags,
     vfs_context_t ctx);
 static int vn_select( struct fileproc *fp, int which, void * wql,
     vfs_context_t ctx);
-static int vn_kqfilt_add(struct fileproc *fp, struct knote *kn,
-    struct kevent_internal_s *kev, vfs_context_t ctx);
+static int vn_kqfilter(struct fileproc *fp, struct knote *kn,
+    struct kevent_qos_s *kev);
 static void filt_vndetach(struct knote *kn);
 static int filt_vnode(struct knote *kn, long hint);
-static int filt_vnode_common(struct knote *kn, vnode_t vp, long hint);
+static int filt_vnode_common(struct knote *kn, struct kevent_qos_s *kev,
+    vnode_t vp, long hint);
 static int vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx);
-#if 0
-static int vn_kqfilt_remove(struct vnode *vp, uintptr_t ident,
-    vfs_context_t ctx);
-#endif
 
 const struct fileops vnops = {
-       .fo_type = DTYPE_VNODE,
-       .fo_read = vn_read,
-       .fo_write = vn_write,
-       .fo_ioctl = vn_ioctl,
-       .fo_select = vn_select,
-       .fo_close = vn_closefile,
-       .fo_kqfilter = vn_kqfilt_add,
-       .fo_drain = NULL,
+       .fo_type     = DTYPE_VNODE,
+       .fo_read     = vn_read,
+       .fo_write    = vn_write,
+       .fo_ioctl    = vn_ioctl,
+       .fo_select   = vn_select,
+       .fo_close    = vn_closefile,
+       .fo_drain    = fo_no_drain,
+       .fo_kqfilter = vn_kqfilter,
 };
 
-static int filt_vntouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_vnprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int filt_vntouch(struct knote *kn, struct kevent_qos_s *kev);
+static int filt_vnprocess(struct knote *kn, struct kevent_qos_s *kev);
 
 SECURITY_READ_ONLY_EARLY(struct  filterops) vnode_filtops = {
        .f_isfd = 1,
@@ -578,19 +575,6 @@ continue_create_lookup:
                panic("Haven't cleaned up adequately in vn_open_auth()");
        }
 
-#if DEVELOPMENT || DEBUG
-       /*
-        * XXX VSWAP: Check for entitlements or special flag here
-        * so we can restrict access appropriately.
-        */
-#else /* DEVELOPMENT || DEBUG */
-
-       if (vnode_isswap(vp) && (fmode & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) {
-               error = EPERM;
-               goto bad;
-       }
-#endif /* DEVELOPMENT || DEBUG */
-
        /*
         * Expect to use this code for filesystems without compound VNOPs, for the root
         * of a filesystem, which can't be "looked up" in the sense of VNOP_LOOKUP(),
@@ -761,8 +745,15 @@ vn_close(struct vnode *vp, int flags, vfs_context_t ctx)
                }
        }
 #endif
-
-       /* work around for foxhound */
+       /*
+        * If vnode @vp belongs to a chardev or a blkdev then it is handled
+        * specially.  We first drop its user reference count @vp->v_usecount
+        * before calling VNOP_CLOSE().  This was done historically to ensure
+        * that the last close of a special device vnode performed some
+        * conditional cleanups.  Now we still need to drop this reference here
+        * to ensure that devfsspec_close() can check if the vnode is still in
+        * use.
+        */
        if (vnode_isspec(vp)) {
                (void)vnode_rele_ext(vp, flags, 0);
        }
@@ -953,20 +944,7 @@ vn_rdwr_64(
                                error = VNOP_READ(vp, auio, ioflg, &context);
                        }
                } else {
-#if DEVELOPMENT || DEBUG
-                       /*
-                        * XXX VSWAP: Check for entitlements or special flag here
-                        * so we can restrict access appropriately.
-                        */
                        error = VNOP_WRITE(vp, auio, ioflg, &context);
-#else /* DEVELOPMENT || DEBUG */
-
-                       if (vnode_isswap(vp) && ((ioflg & (IO_SWAP_DISPATCH | IO_SKIP_ENCRYPTION)) == 0)) {
-                               error = EPERM;
-                       } else {
-                               error = VNOP_WRITE(vp, auio, ioflg, &context);
-                       }
-#endif /* DEVELOPMENT || DEBUG */
                }
        }
 
@@ -1104,21 +1082,6 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx)
                return error;
        }
 
-#if DEVELOPMENT || DEBUG
-       /*
-        * XXX VSWAP: Check for entitlements or special flag here
-        * so we can restrict access appropriately.
-        */
-#else /* DEVELOPMENT || DEBUG */
-
-       if (vnode_isswap(vp)) {
-               (void)vnode_put(vp);
-               error = EPERM;
-               return error;
-       }
-#endif /* DEVELOPMENT || DEBUG */
-
-
 #if CONFIG_MACF
        error = mac_vnode_check_write(ctx, vfs_context_ucred(ctx), vp);
        if (error) {
@@ -1274,7 +1237,7 @@ error_out:
  */
 int
 vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat64,
-    vfs_context_t ctx, struct ucred *file_cred)
+    int needsrealdev, vfs_context_t ctx, struct ucred *file_cred)
 {
        struct vnode_attr va;
        int error;
@@ -1313,6 +1276,9 @@ vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat6
                VATTR_WANTED(&va, va_guuid);
                VATTR_WANTED(&va, va_acl);
        }
+       if (needsrealdev) {
+               va.va_vaflags = VA_REALFSID;
+       }
        error = vnode_getattr(vp, &va, ctx);
        if (error) {
                goto out;
@@ -1430,7 +1396,7 @@ vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat6
                                fsec->fsec_group = kauth_null_guid;
                        }
                        if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) {
-                               bcopy(va.va_acl, &(fsec->fsec_acl), KAUTH_ACL_COPYSIZE(va.va_acl));
+                               __nochk_bcopy(va.va_acl, &(fsec->fsec_acl), KAUTH_ACL_COPYSIZE(va.va_acl));
                        } else {
                                fsec->fsec_acl.acl_entrycount = KAUTH_FILESEC_NOACL;
                        }
@@ -1462,7 +1428,7 @@ out:
 }
 
 int
-vn_stat(struct vnode *vp, void *sb, kauth_filesec_t *xsec, int isstat64, vfs_context_t ctx)
+vn_stat(struct vnode *vp, void *sb, kauth_filesec_t *xsec, int isstat64, int needsrealdev, vfs_context_t ctx)
 {
        int error;
 
@@ -1479,7 +1445,7 @@ vn_stat(struct vnode *vp, void *sb, kauth_filesec_t *xsec, int isstat64, vfs_con
        }
 
        /* actual stat */
-       return vn_stat_noauth(vp, sb, xsec, isstat64, ctx, NOCRED);
+       return vn_stat_noauth(vp, sb, xsec, isstat64, needsrealdev, ctx, NOCRED);
 }
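
vn_stat()/vn_stat_noauth() grow a needsrealdev argument: when set, VA_REALFSID in va_vaflags asks the filesystem for the real device/fsid rather than a volume-group-shared one. A hypothetical in-kernel call site (helper name is illustrative only):

    #include <sys/vnode_internal.h>

    /* Hypothetical helper: stat a vnode, insisting on the real
     * (non-volume-group) fsid via the new needsrealdev argument. */
    static int
    stat_real_fsid(vnode_t vp, struct stat64 *sb, vfs_context_t ctx)
    {
            return vn_stat(vp, sb, NULL /* xsec */, 1 /* isstat64 */,
                       1 /* needsrealdev */, ctx);
    }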
 
 
@@ -1529,6 +1495,11 @@ vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx)
        case VCHR:
        case VBLK:
 
+               if (com == TIOCREVOKE) {
+                       error = ENOTTY;
+                       goto out;
+               }
+
                /* Should not be able to set block size from user space */
                if (com == DKIOCSETBLOCKSIZE) {
                        error = EPERM;
@@ -1721,9 +1692,9 @@ vn_pathconf(vnode_t vp, int name, int32_t *retval, vfs_context_t ctx)
 }
 
 static int
-vn_kqfilt_add(struct fileproc *fp, struct knote *kn,
-    struct kevent_internal_s *kev, vfs_context_t ctx)
+vn_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
 {
+       vfs_context_t ctx = vfs_context_current();
        struct vnode *vp;
        int error = 0;
        int result = 0;
@@ -1770,12 +1741,11 @@ vn_kqfilt_add(struct fileproc *fp, struct knote *kn,
 #endif
 
                        kn->kn_hook = (void*)vp;
-                       kn->kn_hookid = vnode_vid(vp);
                        kn->kn_filtid = EVFILTID_VN;
 
                        vnode_lock(vp);
                        KNOTE_ATTACH(&vp->v_knotes, kn);
-                       result = filt_vnode_common(kn, vp, 0);
+                       result = filt_vnode_common(kn, NULL, vp, 0);
                        vnode_unlock(vp);
 
                        /*
@@ -1790,8 +1760,7 @@ vn_kqfilt_add(struct fileproc *fp, struct knote *kn,
 
 out:
        if (error) {
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = error;
+               knote_set_error(kn, error);
        }
 
        return result;
@@ -1801,9 +1770,9 @@ static void
 filt_vndetach(struct knote *kn)
 {
        vfs_context_t ctx = vfs_context_current();
-       struct vnode *vp;
-       vp = (struct vnode *)kn->kn_hook;
-       if (vnode_getwithvid(vp, kn->kn_hookid)) {
+       struct vnode *vp = (struct vnode *)kn->kn_hook;
+       uint32_t vid = vnode_vid(vp);
+       if (vnode_getwithvid(vp, vid)) {
                return;
        }
 
@@ -1900,9 +1869,10 @@ vnode_writable_space_count(vnode_t vp)
  *      --If hint is revoke, set special flags and activate
  */
 static int
-filt_vnode_common(struct knote *kn, vnode_t vp, long hint)
+filt_vnode_common(struct knote *kn, struct kevent_qos_s *kev, vnode_t vp, long hint)
 {
        int activate = 0;
+       int64_t data = 0;
 
        lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
 
@@ -1917,32 +1887,29 @@ filt_vnode_common(struct knote *kn, vnode_t vp, long hint)
        } else {
                switch (kn->kn_filter) {
                case EVFILT_READ:
-                       kn->kn_data = vnode_readable_data_count(vp, kn->kn_fp->f_fglob->fg_offset, (kn->kn_flags & EV_POLL));
-
-                       if (kn->kn_data != 0) {
-                               activate = 1;
-                       }
+                       data = vnode_readable_data_count(vp, kn->kn_fp->f_fglob->fg_offset, (kn->kn_flags & EV_POLL));
+                       activate = (data != 0);
                        break;
                case EVFILT_WRITE:
-                       kn->kn_data = vnode_writable_space_count(vp);
-
-                       if (kn->kn_data != 0) {
-                               activate = 1;
-                       }
+                       data = vnode_writable_space_count(vp);
+                       activate = (data != 0);
                        break;
                case EVFILT_VNODE:
                        /* Check events this note matches against the hint */
                        if (kn->kn_sfflags & hint) {
                                kn->kn_fflags |= hint;         /* Set which event occurred */
                        }
-                       if (kn->kn_fflags != 0) {
-                               activate = 1;
-                       }
+                       activate = (kn->kn_fflags != 0);
                        break;
                default:
                        panic("Invalid knote filter on a vnode!\n");
                }
        }
+
+       if (kev && activate) {
+               knote_fill_kevent(kn, kev, data);
+       }
+
        return activate;
 }
 
@@ -1951,18 +1918,19 @@ filt_vnode(struct knote *kn, long hint)
 {
        vnode_t vp = (struct vnode *)kn->kn_hook;
 
-       return filt_vnode_common(kn, vp, hint);
+       return filt_vnode_common(kn, NULL, vp, hint);
 }
 
 static int
-filt_vntouch(struct knote *kn, struct kevent_internal_s *kev)
+filt_vntouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        vnode_t vp = (struct vnode *)kn->kn_hook;
+       uint32_t vid = vnode_vid(vp);
        int activate;
        int hint = 0;
 
        vnode_lock(vp);
-       if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) {
+       if (vnode_getiocount(vp, vid, VNODE_NODEAD | VNODE_WITHID) != 0) {
                /* is recycled */
                hint = NOTE_REVOKE;
        }
@@ -1970,7 +1938,7 @@ filt_vntouch(struct knote *kn, struct kevent_internal_s *kev)
        /* accept new input fflags mask */
        kn->kn_sfflags = kev->fflags;
 
-       activate = filt_vnode_common(kn, vp, hint);
+       activate = filt_vnode_common(kn, NULL, vp, hint);
 
        if (hint == 0) {
                vnode_put_locked(vp);
@@ -1981,26 +1949,19 @@ filt_vntouch(struct knote *kn, struct kevent_internal_s *kev)
 }
 
 static int
-filt_vnprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_vnprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-#pragma unused(data)
        vnode_t vp = (struct vnode *)kn->kn_hook;
+       uint32_t vid = vnode_vid(vp);
        int activate;
        int hint = 0;
 
        vnode_lock(vp);
-       if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) {
+       if (vnode_getiocount(vp, vid, VNODE_NODEAD | VNODE_WITHID) != 0) {
                /* Is recycled */
                hint = NOTE_REVOKE;
        }
-       activate = filt_vnode_common(kn, vp, hint);
-       if (activate) {
-               *kev = kn->kn_kevent;
-               if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_data = 0;
-                       kn->kn_fflags = 0;
-               }
-       }
+       activate = filt_vnode_common(kn, kev, vp, hint);
 
        /* Definitely need to unlock, may need to put */
        if (hint == 0) {
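
All of this plumbing backs EVFILT_VNODE: the refactor routes event data through knote_fill_kevent() instead of having filt_vnprocess() copy kn_kevent and clear state by hand. For reference, the userspace contract these filters serve (standard kqueue usage; error handling omitted):

    #include <fcntl.h>
    #include <sys/event.h>

    /* Watch a file for writes, deletion, and revocation with kqueue. */
    static void
    watch(const char *path)
    {
            int kq = kqueue();
            int fd = open(path, O_EVTONLY);
            struct kevent ev;

            EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
                NOTE_WRITE | NOTE_DELETE | NOTE_REVOKE, 0, NULL);
            kevent(kq, &ev, 1, NULL, 0, NULL);      /* register */
            kevent(kq, NULL, 0, &ev, 1, NULL);      /* block for one event */
    }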
index ffd01323c1a6f22315d117a755d884c0bb9bf030..a29e14f2447680c3a7cbe841e3b7bc87fd273ac5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -430,6 +430,22 @@ struct vnodeop_desc vnop_revoke_desc = {
        NULL
 };
 
+int vnop_mmap_check_vp_offsets[] = {
+       VOPARG_OFFSETOF(struct vnop_mmap_check_args, a_vp),
+       VDESC_NO_OFFSET
+};
+struct vnodeop_desc vnop_mmap_check_desc = {
+       0,
+       "vnop_mmap_check",
+       0,
+       vnop_mmap_check_vp_offsets,
+       VDESC_NO_OFFSET,
+       VDESC_NO_OFFSET,
+       VDESC_NO_OFFSET,
+       VDESC_NO_OFFSET,
+       VDESC_NO_OFFSET,
+       NULL
+};
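vnop_mmap_check is a new VNOP that lets a filesystem veto an mmap() before any mapping state is created. A hypothetical implementation; the layout of struct vnop_mmap_check_args beyond a_vp (assumed here to carry the requested protections), and myfs_is_quarantined, are illustrative assumptions:

    #include <sys/errno.h>
    #include <sys/mman.h>
    #include <sys/vnode.h>

    /* Hypothetical per-FS predicate, stubbed for the sketch. */
    static int myfs_is_quarantined(vnode_t vp);

    static int
    myfs_vnop_mmap_check(struct vnop_mmap_check_args *ap)
    {
            /* a_flags carrying the requested protections is an assumption */
            if ((ap->a_flags & PROT_EXEC) && myfs_is_quarantined(ap->a_vp))
                    return EPERM;
            return 0;   /* no objection */
    }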
 
 int vnop_mmap_vp_offsets[] = {
        VOPARG_OFFSETOF(struct vnop_mmap_args, a_vp),
@@ -448,7 +464,6 @@ struct vnodeop_desc vnop_mmap_desc = {
        NULL
 };
 
-
 int vnop_mnomap_vp_offsets[] = {
        VOPARG_OFFSETOF(struct vnop_mnomap_args, a_vp),
        VDESC_NO_OFFSET
@@ -466,7 +481,6 @@ struct vnodeop_desc vnop_mnomap_desc = {
        NULL
 };
 
-
 int vnop_fsync_vp_offsets[] = {
        VOPARG_OFFSETOF(struct vnop_fsync_args, a_vp),
        VDESC_NO_OFFSET
@@ -895,16 +909,16 @@ int vnop_copyfile_vp_offsets[] = {
        VDESC_NO_OFFSET
 };
 struct vnodeop_desc vnop_copyfile_desc = {
-       0,
-       "vnop_copyfile",
-       0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VP2_WILLRELE,
-       vnop_copyfile_vp_offsets,
-       VDESC_NO_OFFSET,
-       VDESC_NO_OFFSET,
-       VDESC_NO_OFFSET,
-       VOPARG_OFFSETOF(struct vnop_copyfile_args, a_tcnp),
-       VDESC_NO_OFFSET,
-       NULL
+       .vdesc_offset = 0,
+       .vdesc_name = "vnop_copyfile",
+       .vdesc_flags = 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VP2_WILLRELE,
+       .vdesc_vp_offsets = vnop_copyfile_vp_offsets,
+       .vdesc_vpp_offset = VDESC_NO_OFFSET,
+       .vdesc_cred_offset = VDESC_NO_OFFSET,
+       .vdesc_proc_offset = VDESC_NO_OFFSET,
+       .vdesc_componentname_offset = VOPARG_OFFSETOF(struct vnop_copyfile_args, a_tcnp),
+       .vdesc_context_offset = VDESC_NO_OFFSET,
+       .vdesc_transports = NULL
 };
 
 int vnop_clonefile_vp_offsets[] = {
@@ -913,16 +927,16 @@ int vnop_clonefile_vp_offsets[] = {
        VDESC_NO_OFFSET
 };
 struct vnodeop_desc vnop_clonefile_desc = {
-       0,
-       "vnop_clonefile",
-       0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VPP_WILLRELE,
-       vnop_clonefile_vp_offsets,
-       VOPARG_OFFSETOF(struct vnop_clonefile_args, a_vpp),
-       VDESC_NO_OFFSET,
-       VDESC_NO_OFFSET,
-       VOPARG_OFFSETOF(struct vnop_clonefile_args, a_cnp),
-       VOPARG_OFFSETOF(struct vnop_clonefile_args, a_context),
-       NULL
+       .vdesc_offset = 0,
+       .vdesc_name = "vnop_clonefile",
+       .vdesc_flags = 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VPP_WILLRELE,
+       .vdesc_vp_offsets = vnop_clonefile_vp_offsets,
+       .vdesc_vpp_offset = VOPARG_OFFSETOF(struct vnop_clonefile_args, a_vpp),
+       .vdesc_cred_offset = VDESC_NO_OFFSET,
+       .vdesc_proc_offset = VDESC_NO_OFFSET,
+       .vdesc_componentname_offset = VOPARG_OFFSETOF(struct vnop_clonefile_args, a_cnp),
+       .vdesc_context_offset = VOPARG_OFFSETOF(struct vnop_clonefile_args, a_context),
+       .vdesc_transports = NULL
 };
 
 int vop_getxattr_vp_offsets[] = {
@@ -1205,6 +1219,7 @@ struct vnodeop_desc *vfs_op_descs[] = {
        &vnop_kqfilt_remove_desc,
        &vnop_setlabel_desc,
        &vnop_revoke_desc,
+       &vnop_mmap_check_desc,
        &vnop_mmap_desc,
        &vnop_mnomap_desc,
        &vnop_fsync_desc,
index ff699a78ec4bfe018c1cede93b99f5bd3b3573c2..c88ebe1dddb3b2f0ce11625525e4b1e99ff326b1 100755 (executable)
@@ -1,7 +1,7 @@
 #!/bin/sh -
 copyright='
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -266,15 +266,16 @@ echo '
 #include <sys/vnode.h>
 
 struct vnodeop_desc vop_default_desc = {
-       0,
-       "default",
-       0,
-       NULL,
-       VDESC_NO_OFFSET,
-       VDESC_NO_OFFSET,
-       VDESC_NO_OFFSET,
-       VDESC_NO_OFFSET,
-       NULL,
+       .vdesc_offset = 0,
+       .vdesc_name = "default",
+       .vdesc_flags = 0,
+       .vdesc_vp_offsets = NULL,
+       .vdesc_vpp_offset = VDESC_NO_OFFSET,
+       .vdesc_cred_offset = VDESC_NO_OFFSET,
+       .vdesc_proc_offset = VDESC_NO_OFFSET,
+       .vdesc_componentname_offset = VDESC_NO_OFFSET,
+       .vdesc_context_offset = VDESC_NO_OFFSET,
+       .vdesc_transports = NULL,
 };
 '
 
index 596835593bea93f3a8e498f1d6cb4365e5d8c9b1..b9626cc5ef259622374eff21c633b0c76e253318 100644 (file)
@@ -98,6 +98,7 @@
 #include <vm/vm_protos.h>
 
 #include <sys/kern_memorystatus.h>
+#include <sys/kern_memorystatus_freeze.h>
 
 #if CONFIG_MACF
 #include <security/mac_framework.h>
 #if CONFIG_CSR
 #include <sys/csr.h>
 #endif /* CONFIG_CSR */
+#include <IOKit/IOBSD.h>
 
 int _shared_region_map_and_slide(struct proc*, int, unsigned int, struct shared_file_mapping_np*, uint32_t, user_addr_t, user_addr_t);
 int shared_region_copyin_mappings(struct proc*, user_addr_t, unsigned int, struct shared_file_mapping_np *);
@@ -230,6 +232,10 @@ SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_u
 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
 SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
 SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
+#if DEVELOPMENT || DEBUG
+extern unsigned long pmap_asid_flushes;
+SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
+#endif
 #endif /* __arm__ || __arm64__ */
 
 #if __arm64__
@@ -1042,6 +1048,7 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret)
        proc_t  targetproc = PROC_NULL;
        int     pid = args->pid;
        int     error = 0;
+       mach_port_t tfpport = MACH_PORT_NULL;
 
 #if CONFIG_MACF
        error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_SUSPEND);
@@ -1062,7 +1069,8 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret)
                goto out;
        }
 
-       if (!task_for_pid_posix_check(targetproc)) {
+       if (!task_for_pid_posix_check(targetproc) &&
+           !IOTaskHasEntitlement(current_task(), PROCESS_RESUME_SUSPEND_ENTITLEMENT)) {
                error = EPERM;
                goto out;
        }
@@ -1070,8 +1078,6 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret)
        target = targetproc->task;
 #ifndef CONFIG_EMBEDDED
        if (target != TASK_NULL) {
-               mach_port_t tfpport;
-
                /* If we aren't root and target's task access port is set... */
                if (!kauth_cred_issuser(kauth_cred_get()) &&
                    targetproc != current_proc() &&
@@ -1115,6 +1121,10 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret)
        task_deallocate(target);
 
 out:
+       if (tfpport != IPC_PORT_NULL) {
+               ipc_port_release_send(tfpport);
+       }
+
        if (targetproc != PROC_NULL) {
                proc_rele(targetproc);
        }
@@ -1122,6 +1132,141 @@ out:
        return error;
 }
 
+kern_return_t
+debug_control_port_for_pid(struct debug_control_port_for_pid_args *args)
+{
+       mach_port_name_t        target_tport = args->target_tport;
+       int                     pid = args->pid;
+       user_addr_t             task_addr = args->t;
+       proc_t                  p = PROC_NULL;
+       task_t                  t1 = TASK_NULL;
+       task_t                  task = TASK_NULL;
+       mach_port_name_t        tret = MACH_PORT_NULL;
+       ipc_port_t              tfpport = MACH_PORT_NULL;
+       ipc_port_t              sright = NULL;
+       int                     error = 0;
+
+
+       AUDIT_MACH_SYSCALL_ENTER(AUE_DBGPORTFORPID);
+       AUDIT_ARG(pid, pid);
+       AUDIT_ARG(mach_port1, target_tport);
+
+       /* Always check if pid == 0 */
+       if (pid == 0) {
+               (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t));
+               AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
+               return KERN_FAILURE;
+       }
+
+       t1 = port_name_to_task(target_tport);
+       if (t1 == TASK_NULL) {
+               (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t));
+               AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
+               return KERN_FAILURE;
+       }
+
+
+       p = proc_find(pid);
+       if (p == PROC_NULL) {
+               error = KERN_FAILURE;
+               goto tfpout;
+       }
+
+#if CONFIG_AUDIT
+       AUDIT_ARG(process, p);
+#endif
+
+       if (!(task_for_pid_posix_check(p))) {
+               error = KERN_FAILURE;
+               goto tfpout;
+       }
+
+       if (p->task == TASK_NULL) {
+               error = KERN_SUCCESS;
+               goto tfpout;
+       }
+
+       /* Grab a task reference since the proc ref might be dropped if an upcall to task access server is made */
+       task = p->task;
+       task_reference(task);
+
+
+       if (!IOTaskHasEntitlement(current_task(), DEBUG_PORT_ENTITLEMENT)) {
+#if CONFIG_MACF
+               error = mac_proc_check_get_task(kauth_cred_get(), p);
+               if (error) {
+                       error = KERN_FAILURE;
+                       goto tfpout;
+               }
+#endif
+
+               /* If we aren't root and target's task access port is set... */
+               if (!kauth_cred_issuser(kauth_cred_get()) &&
+                   p != current_proc() &&
+                   (task_get_task_access_port(task, &tfpport) == 0) &&
+                   (tfpport != IPC_PORT_NULL)) {
+                       if (tfpport == IPC_PORT_DEAD) {
+                               error = KERN_PROTECTION_FAILURE;
+                               goto tfpout;
+                       }
+
+                       /*
+                        * Drop the proc_find proc ref before making an upcall
+                        * to taskgated, since holding a proc_find
+                        * ref while making an upcall can cause deadlock.
+                        */
+                       proc_rele(p);
+                       p = PROC_NULL;
+
+                       /* Call up to the task access server */
+                       error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid);
+
+                       if (error != MACH_MSG_SUCCESS) {
+                               if (error == MACH_RCV_INTERRUPTED) {
+                                       error = KERN_ABORTED;
+                               } else {
+                                       error = KERN_FAILURE;
+                               }
+                               goto tfpout;
+                       }
+               }
+       }
+
+       /* Check if the task has been corpsified */
+       if (is_corpsetask(task)) {
+               error = KERN_FAILURE;
+               goto tfpout;
+       }
+
+       error = task_get_debug_control_port(task, &sright);
+       if (error != KERN_SUCCESS) {
+               goto tfpout;
+       }
+
+       tret = ipc_port_copyout_send(
+               sright,
+               get_task_ipcspace(current_task()));
+
+       error = KERN_SUCCESS;
+
+tfpout:
+       task_deallocate(t1);
+       AUDIT_ARG(mach_port2, tret);
+       (void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t));
+
+       if (tfpport != IPC_PORT_NULL) {
+               ipc_port_release_send(tfpport);
+       }
+       if (task != TASK_NULL) {
+               task_deallocate(task);
+       }
+       if (p != PROC_NULL) {
+               proc_rele(p);
+       }
+       AUDIT_MACH_SYSCALL_EXIT(error);
+       return error;
+}
+
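debug_control_port_for_pid() is a new Mach trap that hands back a send right to a task's debug control port, gated by task_for_pid-style policy plus either a private debug-port entitlement (DEBUG_PORT_ENTITLEMENT) or a task-access-server upcall. Calling it from userspace looks roughly like this; the trap prototype is an assumption inferred from the args struct above:

    #include <mach/mach.h>

    /* Prototype assumed from debug_control_port_for_pid_args; the caller
     * needs the relevant private entitlement or taskgated approval. */
    extern kern_return_t debug_control_port_for_pid(
        mach_port_name_t target_tport, int pid, mach_port_name_t *t);

    static mach_port_name_t
    debug_port_for(int pid)
    {
            mach_port_name_t port = MACH_PORT_NULL;

            if (debug_control_port_for_pid(mach_task_self(), pid,
                &port) != KERN_SUCCESS)
                    return MACH_PORT_NULL;
            return port;
    }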
 kern_return_t
 pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret)
 {
@@ -1129,6 +1274,7 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret)
        proc_t  targetproc = PROC_NULL;
        int     pid = args->pid;
        int     error = 0;
+       mach_port_t tfpport = MACH_PORT_NULL;
 
 #if CONFIG_MACF
        error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_RESUME);
@@ -1149,7 +1295,8 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret)
                goto out;
        }
 
-       if (!task_for_pid_posix_check(targetproc)) {
+       if (!task_for_pid_posix_check(targetproc) &&
+           !IOTaskHasEntitlement(current_task(), PROCESS_RESUME_SUSPEND_ENTITLEMENT)) {
                error = EPERM;
                goto out;
        }
@@ -1157,8 +1304,6 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret)
        target = targetproc->task;
 #ifndef CONFIG_EMBEDDED
        if (target != TASK_NULL) {
-               mach_port_t tfpport;
-
                /* If we aren't root and target's task access port is set... */
                if (!kauth_cred_issuser(kauth_cred_get()) &&
                    targetproc != current_proc() &&
@@ -1213,6 +1358,10 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret)
        task_deallocate(target);
 
 out:
+       if (tfpport != IPC_PORT_NULL) {
+               ipc_port_release_send(tfpport);
+       }
+
        if (targetproc != PROC_NULL) {
                proc_rele(targetproc);
        }
@@ -1402,7 +1551,8 @@ pid_shutdown_sockets(struct proc *p __unused, struct pid_shutdown_sockets_args *
                goto out;
        }
 
-       if (!task_for_pid_posix_check(targetproc)) {
+       if (!task_for_pid_posix_check(targetproc) &&
+           !IOTaskHasEntitlement(current_task(), PROCESS_RESUME_SUSPEND_ENTITLEMENT)) {
                error = EPERM;
                goto out;
        }
@@ -1689,32 +1839,22 @@ _shared_region_map_and_slide(
        }
 #endif /* MAC */
 
-       /* make sure vnode is on the process's root volume */
+       /* The calling process cannot be chroot-ed. */
        root_vp = p->p_fd->fd_rdir;
        if (root_vp == NULL) {
                root_vp = rootvnode;
        } else {
-               /*
-                * Chroot-ed processes can't use the shared_region.
-                */
-               error = EINVAL;
-               goto done;
-       }
-
-       if (vp->v_mount != root_vp->v_mount) {
                SHARED_REGION_TRACE_ERROR(
-                       ("shared_region: %p [%d(%s)] map(%p:'%s'): "
-                       "not on process's root volume\n",
-                       (void *)VM_KERNEL_ADDRPERM(current_thread()),
-                       p->p_pid, p->p_comm,
-                       (void *)VM_KERNEL_ADDRPERM(vp), vp->v_name));
+                       ("calling process [%d(%s)] is chroot-ed, permission denied\n",
+                       p->p_pid, p->p_comm));
                error = EPERM;
                goto done;
        }
 
-       /* make sure vnode is owned by "root" */
+       /* The shared cache file must be owned by root */
        VATTR_INIT(&va);
        VATTR_WANTED(&va, va_uid);
+       VATTR_WANTED(&va, va_flags);
        error = vnode_getattr(vp, &va, vfs_context_current());
        if (error) {
                SHARED_REGION_TRACE_ERROR(
@@ -1738,6 +1878,37 @@ _shared_region_map_and_slide(
                goto done;
        }
 
+#if CONFIG_CSR
+       if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0 &&
+           !(va.va_flags & SF_RESTRICTED)) {
+               /*
+                * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
+                * the shared cache file is NOT SIP-protected, so reject the
+                * mapping request
+                */
+               SHARED_REGION_TRACE_ERROR(
+                       ("shared_region: %p [%d(%s)] map(%p:'%s'), "
+                       "vnode is not SIP-protected.\n",
+                       (void *)VM_KERNEL_ADDRPERM(current_thread()),
+                       p->p_pid, p->p_comm, (void *)VM_KERNEL_ADDRPERM(vp),
+                       vp->v_name));
+               error = EPERM;
+               goto done;
+       }
+#else
+       /* Devices without SIP/ROSP need to make sure that the shared cache is on the root volume. */
+       if (vp->v_mount != root_vp->v_mount) {
+               SHARED_REGION_TRACE_ERROR(
+                       ("shared_region: %p [%d(%s)] map(%p:'%s'): "
+                       "not on process's root volume\n",
+                       (void *)VM_KERNEL_ADDRPERM(current_thread()),
+                       p->p_pid, p->p_comm,
+                       (void *)VM_KERNEL_ADDRPERM(vp), vp->v_name));
+               error = EPERM;
+               goto done;
+       }
+#endif /* CONFIG_CSR */
+
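Condensed, the new acceptance test for a shared cache file is: on CONFIG_CSR systems the file must be SIP-protected (SF_RESTRICTED) unless SIP is configured with CSR_ALLOW_UNRESTRICTED_FS; on systems without CSR it must still live on the process's root volume. As a standalone predicate (a sketch, not kernel code; all inputs precomputed by the caller):

    #include <stdbool.h>

    static bool
    shared_cache_mapping_allowed(bool have_csr,
        bool sip_allows_unrestricted_fs, bool cache_is_sip_protected,
        bool cache_on_root_volume)
    {
            if (have_csr)
                    return sip_allows_unrestricted_fs ||
                           cache_is_sip_protected;
            return cache_on_root_volume;
    }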
        if (scdir_enforce) {
                /* get vnode for scdir_path */
                error = vnode_lookup(scdir_path, 0, &scdir_vp, vfs_context_current());
@@ -2032,6 +2203,10 @@ extern unsigned int     vm_page_purgeable_wired_count;
 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
     &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
 
+extern unsigned int vm_page_kern_lpage_count;
+SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vm_page_kern_lpage_count, 0, "kernel used large pages");
+
 #if DEVELOPMENT || DEBUG
 extern uint64_t get_pages_grabbed_count(void);
 
@@ -2171,10 +2346,12 @@ extern unsigned int vm_page_secluded_target;
 extern unsigned int vm_page_secluded_count;
 extern unsigned int vm_page_secluded_count_free;
 extern unsigned int vm_page_secluded_count_inuse;
+extern unsigned int vm_page_secluded_count_over_target;
 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");
 
 extern struct vm_page_secluded_data vm_page_secluded;
 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
@@ -2344,6 +2521,12 @@ SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0
 extern uint32_t vm_page_busy_absent_skipped;
 SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
 
+extern uint32_t vm_page_upl_tainted;
+SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");
+
+extern uint32_t vm_page_iopl_tainted;
+SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
+
 #if (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG)
 extern int vm_footprint_suspend_allowed;
 SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
@@ -2425,3 +2608,10 @@ SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
 extern int pmap_ledgers_panic_leeway;
 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
 #endif /* MACH_ASSERT */
+
+extern int vm_protect_privileged_from_untrusted;
+SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
+extern uint64_t vm_copied_on_read;
+SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
index 436268db2facd4b7433e5d47aa940595790020d6..2fae8525c33804ae590bf07b70ccef894196ecbb 100644 (file)
@@ -618,6 +618,7 @@ vnode_pagein(
                         */
                        if ((error = VNOP_PAGEIN(vp, NULL, upl_offset, (off_t)f_offset,
                            size, flags, vfs_context_current()))) {
+                               set_thread_pagein_error(current_thread(), error);
                                result = PAGER_ERROR;
                                error  = PAGER_ERROR;
                        }
@@ -761,6 +762,7 @@ vnode_pagein(
                                                ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
                                        }
                                }
+                               set_thread_pagein_error(current_thread(), error);
                                result = PAGER_ERROR;
                                error  = PAGER_ERROR;
                        }
index 934486bb8559b25b0db2f783ae7823a35c04171e..4e8d858f5a89adaf4a513cd453923d4920c73973 100644 (file)
@@ -93,6 +93,8 @@ _buf_uploffset
 _buf_valid
 _buf_vnode
 _buf_wcred
+_bufattr_markioscheduled
+_bufattr_ioscheduled
 _cache_enter
 _cache_lookup
 _cache_purge
@@ -343,6 +345,9 @@ _kauth_null_guid
 _kauth_register_scope
 _kauth_unlisten_scope
 _kdebug_enable
+_kdebug_debugid_enabled
+_kdebug_debugid_explicitly_enabled
+_kdebug_using_continuous_time
 _kernel_debug
 _kernel_debug1
 _kernel_debug_filtered
@@ -483,12 +488,14 @@ _proc_noremotehang
 _proc_pgrpid
 _proc_pid
 _proc_ppid
+_proc_original_ppid
 _proc_rele
 _proc_self
 _proc_selfname
 _proc_selfpid
 _proc_selfppid
 _proc_selfpgrpid
+_proc_sessionid
 _proc_signal
 _proc_suser
 _proto_inject
@@ -628,6 +635,7 @@ _vflush
 _vfs_64bitready
 _vfs_addname
 _vfs_attr_pack
+_vfs_attr_pack_ext
 _vfs_authcache_ttl
 _vfs_authopaque
 _vfs_authopaqueaccess
@@ -688,6 +696,7 @@ _vfs_sysctl
 _vfs_typenum
 _vfs_unbusy
 _vfs_unmountbyfsid
+_vn_authorize_unlink
 _vn_bwrite
 _vn_default_error
 _vn_getpath
@@ -758,6 +767,7 @@ _vnode_vfsmaxsymlen
 _vnode_vfsname
 _vnode_vfstypenum
 _vnode_vid
+_vnode_isonexternalstorage
 _vnode_vtype
 _vnode_waitforwrites
 _vnode_writedone
@@ -783,6 +793,7 @@ _vnop_listxattr_desc
 _vnop_lookup_desc
 _vnop_mkdir_desc
 _vnop_mknod_desc
+_vnop_mmap_check_desc
 _vnop_mmap_desc
 _vnop_mnomap_desc
 _vnop_offtoblk_desc
index ad89576cafab0a1526bfc8b938b10cee6de4e41b..5d3ed37cca558695fc07a6fae80850784afec13c 100644 (file)
@@ -109,6 +109,7 @@ __ZN18IOMemoryDescriptor11makeMappingEPS_P4taskjmmm
 __ZN18IOMemoryDescriptor11withAddressEPvm11IODirection
 __ZN18IOMemoryDescriptor11withAddressEjm11IODirectionP4task
 __ZN18IOMemoryDescriptor11withOptionsEPvmmP4taskmP8IOMapper
+__ZN18IOMemoryDescriptor12setOwnershipEP4taskim
 __ZN18IOMemoryDescriptor12setPurgeableEmPm
 __ZN18IOMemoryDescriptor12withSubRangeEPS_mm11IODirection
 __ZN18IOMemoryDescriptor13getPageCountsEPmS0_
@@ -169,11 +170,13 @@ __ZN21IONaturalMemoryCursor17withSpecificationEmmm
 __ZN21IONaturalMemoryCursor21initWithSpecificationEmmm
 __ZN21IOSubMemoryDescriptor11makeMappingEP18IOMemoryDescriptorP4taskjmmm
 __ZN21IOSubMemoryDescriptor12initSubRangeEP18IOMemoryDescriptormm11IODirection
+__ZN21IOSubMemoryDescriptor12setOwnershipEP4taskim
 __ZN21IOSubMemoryDescriptor12setPurgeableEmPm
 __ZN21IOSubMemoryDescriptor12withSubRangeEP18IOMemoryDescriptormmm
 __ZN21IOSubMemoryDescriptor18getPhysicalSegmentEmPmm
 __ZN21IOSubMemoryDescriptor7prepareE11IODirection
 __ZN21IOSubMemoryDescriptor8completeE11IODirection
+__ZN23IOMultiMemoryDescriptor12setOwnershipEP4taskim
 __ZN23IOMultiMemoryDescriptor15withDescriptorsEPP18IOMemoryDescriptorm11IODirectionb
 __ZN23IOMultiMemoryDescriptor18getPhysicalSegmentEmPmm
 __ZN23IOMultiMemoryDescriptor19initWithDescriptorsEPP18IOMemoryDescriptorm11IODirectionb
@@ -195,6 +198,7 @@ __ZN24IOBufferMemoryDescriptor9setLengthEj
 __ZN24IOBufferMemoryDescriptor9withBytesEPKvj11IODirectionb
 __ZN25IOGeneralMemoryDescriptor11setPositionEm
 __ZN25IOGeneralMemoryDescriptor11wireVirtualE11IODirection
+__ZN25IOGeneralMemoryDescriptor12setOwnershipEP4taskim
 __ZN25IOGeneralMemoryDescriptor12setPurgeableEmPm
 __ZN25IOGeneralMemoryDescriptor13mapIntoKernelEj
 __ZN25IOGeneralMemoryDescriptor14initWithRangesEP14IOVirtualRangem11IODirectionP4taskb
index 065a36f0f2df065db674f0831119e8ae05b91f6f..85e40f71132e5359eecd2e79272f4c4541fa8dcc 100644 (file)
@@ -102,6 +102,7 @@ __ZN18IOMemoryDescriptor10writeBytesEyPKvy
 __ZN18IOMemoryDescriptor11makeMappingEPS_P4taskyjyy
 __ZN18IOMemoryDescriptor11withAddressEPvyj
 __ZN18IOMemoryDescriptor11withOptionsEPvjjP4taskjP8IOMapper
+__ZN18IOMemoryDescriptor12setOwnershipEP4taskij
 __ZN18IOMemoryDescriptor12setPurgeableEjPj
 __ZN18IOMemoryDescriptor13getPageCountsEPyS0_
 __ZN18IOMemoryDescriptor15initWithOptionsEPvjjP4taskjP8IOMapper
@@ -147,11 +148,13 @@ __ZN21IONaturalMemoryCursor17withSpecificationEyyy
 __ZN21IONaturalMemoryCursor21initWithSpecificationEyyy
 __ZN21IOSubMemoryDescriptor11makeMappingEP18IOMemoryDescriptorP4taskyjyy
 __ZN21IOSubMemoryDescriptor12initSubRangeEP18IOMemoryDescriptoryyj
+__ZN21IOSubMemoryDescriptor12setOwnershipEP4taskij
 __ZN21IOSubMemoryDescriptor12setPurgeableEjPj
 __ZN21IOSubMemoryDescriptor12withSubRangeEP18IOMemoryDescriptoryyj
 __ZN21IOSubMemoryDescriptor18getPhysicalSegmentEyPyj
 __ZN21IOSubMemoryDescriptor7prepareEj
 __ZN21IOSubMemoryDescriptor8completeEj
+__ZN23IOMultiMemoryDescriptor12setOwnershipEP4taskij
 __ZN23IOMultiMemoryDescriptor15withDescriptorsEPP18IOMemoryDescriptorjjb
 __ZN23IOMultiMemoryDescriptor19initWithDescriptorsEPP18IOMemoryDescriptorjjb
 __ZN23IOMultiMemoryDescriptor7prepareEj
@@ -167,6 +170,7 @@ __ZN24IOBufferMemoryDescriptor22inTaskWithPhysicalMaskEP4taskjyy
 __ZN24IOBufferMemoryDescriptor9setLengthEm
 __ZN24IOBufferMemoryDescriptor9withBytesEPKvmjb
 __ZN25IOGeneralMemoryDescriptor11wireVirtualEj
+__ZN25IOGeneralMemoryDescriptor12setOwnershipEP4taskij
 __ZN25IOGeneralMemoryDescriptor12setPurgeableEjPj
 __ZN25IOGeneralMemoryDescriptor15initWithOptionsEPvjjP4taskjP8IOMapper
 __ZN25IOGeneralMemoryDescriptor18getPhysicalSegmentEyPyj
@@ -230,3 +234,4 @@ __ZNK18IOMemoryDescriptor19dmaCommandOperationEjPvj
 __ZNK25IOGeneralMemoryDescriptor19dmaCommandOperationEjPvj
 
 __ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryiU13block_pointerFbPS_P10IONotifierE
+
index c5589237724883e490c37f7e26f2985b4c8637e4..0010db9f1c07d1852ac882bc743711e4fe2f1eba 100644 (file)
@@ -1,3 +1,13 @@
+_IORPCMessageFromMach
+__ZN12IOUserClient8DispatchE5IORPC
+
+__ZN16IODispatchSource23SetEnableWithCompletionEbU13block_pointerFvvEPFiP15OSMetaClassBase5IORPCE
+__ZN16IODispatchSource9SetEnableEbPFiP15OSMetaClassBase5IORPCE
+
+__ZN22IOInterruptEventSource27getPimaryInterruptTimestampEv
+__ZN22IOInterruptEventSource31enablePrimaryInterruptTimestampEb
+
+__ZN14IOPMrootDomain11setWakeTimeEy
 _IOAlignmentToSize
 _IOBSDNameMatching
 _IOBSDRegistryEntryForDeviceTree
@@ -36,6 +46,7 @@ _IOMalloc
 _IOMallocAligned
 _IOMallocContiguous
 _IOMallocPageable
+_IOMallocZero
 _IOMappedRead16
 _IOMappedRead32
 _IOMappedRead64
@@ -71,6 +82,7 @@ _IOSimpleLockAlloc
 _IOSimpleLockFree
 _IOSimpleLockGetMachLock
 _IOSimpleLockInit
+_IOSimpleLockDestroy
 _IOSimpleLockLock:_lck_spin_lock
 _IOSimpleLockTryLock:_lck_spin_try_lock
 _IOSimpleLockUnlock:_lck_spin_unlock
@@ -97,7 +109,6 @@ _PE_cpu_start
 _PE_enter_debugger
 _PE_halt_restart
 _PE_parse_boot_argn
-_PE_poll_input
 _StartIOKit
 __Z17IODTMapInterruptsP15IORegistryEntry
 __Z17IODeviceTreeAllocPv
@@ -107,24 +118,6 @@ __Z19printDictionaryKeysP12OSDictionaryPc
 __Z20IODTMatchNubWithKeysP15IORegistryEntryPKc
 __Z21IODTResolveAddressingP15IORegistryEntryPKcP14IODeviceMemory
 __Z27IODTInterruptControllerNameP15IORegistryEntry
-__ZN10IOMachPort10gMetaClassE
-__ZN10IOMachPort10superClassE
-__ZN10IOMachPort11dictForTypeEj
-__ZN10IOMachPort13portForObjectEP8OSObjectj
-__ZN10IOMachPort14setHoldDestroyEP8OSObjectj
-__ZN10IOMachPort20makeSendRightForTaskEP4taskP8OSObjectj
-__ZN10IOMachPort20releasePortForObjectEP8OSObjectj
-__ZN10IOMachPort22noMoreSendersForObjectEP8OSObjectjPj
-__ZN10IOMachPort4freeEv
-__ZN10IOMachPort9MetaClassC1Ev
-__ZN10IOMachPort9MetaClassC2Ev
-__ZN10IOMachPort9metaClassE
-__ZN10IOMachPortC1EPK11OSMetaClass
-__ZN10IOMachPortC1Ev
-__ZN10IOMachPortC2EPK11OSMetaClass
-__ZN10IOMachPortC2Ev
-__ZN10IOMachPortD0Ev
-__ZN10IOMachPortD2Ev
 __ZN10IONotifier10gMetaClassE
 __ZN10IONotifier10superClassE
 __ZN10IONotifier9MetaClassC1Ev
@@ -158,29 +151,6 @@ __ZN10IOWorkLoopC2EPK11OSMetaClass
 __ZN10IOWorkLoopC2Ev
 __ZN10IOWorkLoopD0Ev
 __ZN10IOWorkLoopD2Ev
-__ZN11IOCatalogue10addDriversEP7OSArrayb
-__ZN11IOCatalogue10gMetaClassE
-__ZN11IOCatalogue10initializeEv
-__ZN11IOCatalogue10superClassE
-__ZN11IOCatalogue13removeDriversEP12OSDictionaryb
-__ZN11IOCatalogue13startMatchingEP12OSDictionary
-__ZN11IOCatalogue15moduleHasLoadedEP8OSString
-__ZN11IOCatalogue15moduleHasLoadedEPKc
-__ZN11IOCatalogue16terminateDriversEP12OSDictionary
-__ZN11IOCatalogue25terminateDriversForModuleEP8OSStringb
-__ZN11IOCatalogue25terminateDriversForModuleEPKcb
-__ZN11IOCatalogue4freeEv
-__ZN11IOCatalogue4initEP7OSArray
-__ZN11IOCatalogue5resetEv
-__ZN11IOCatalogue9MetaClassC1Ev
-__ZN11IOCatalogue9MetaClassC2Ev
-__ZN11IOCatalogue9metaClassE
-__ZN11IOCatalogueC1EPK11OSMetaClass
-__ZN11IOCatalogueC1Ev
-__ZN11IOCatalogueC2EPK11OSMetaClass
-__ZN11IOCatalogueC2Ev
-__ZN11IOCatalogueD0Ev
-__ZN11IOCatalogueD2Ev
 __ZN11IODataQueue10gMetaClassE
 __ZN11IODataQueue10superClassE
 __ZN11IODataQueue19getMemoryDescriptorEv
@@ -701,6 +671,8 @@ __ZN18IOMemoryDescriptor9MetaClassC2Ev
 __ZN18IOMemoryDescriptor9metaClassE
 __ZN18IOMemoryDescriptorC2EPK11OSMetaClass
 __ZN18IOMemoryDescriptorD2Ev
+__ZN18IOMemoryDescriptor8getVMTagEP7_vm_map
+__ZN18IOMemoryDescriptor9setVMTagsEjj
 __ZN18IORegistryIterator10enterEntryEPK15IORegistryPlane
 __ZN18IORegistryIterator10enterEntryEv
 __ZN18IORegistryIterator10gMetaClassE
@@ -1134,8 +1106,6 @@ __ZN9IOServiceC2EPK11OSMetaClass
 __ZN9IOServiceC2Ev
 __ZN9IOServiceD0Ev
 __ZN9IOServiceD2Ev
-__ZNK10IOMachPort12getMetaClassEv
-__ZNK10IOMachPort9MetaClass5allocEv
 __ZNK10IONotifier12getMetaClassEv
 __ZNK10IONotifier9MetaClass5allocEv
 __ZNK10IOWorkLoop12getMetaClassEv
@@ -1149,9 +1119,6 @@ __ZNK10IOWorkLoop9MetaClass5allocEv
 __ZNK10IOWorkLoop9getThreadEv
 __ZNK11IOCatalogue12getMetaClassEv
 __ZNK11IOCatalogue12unloadModuleEP8OSString
-__ZNK11IOCatalogue14isModuleLoadedEP12OSDictionary
-__ZNK11IOCatalogue14isModuleLoadedEP8OSString
-__ZNK11IOCatalogue14isModuleLoadedEPKc
 __ZNK11IOCatalogue18getGenerationCountEv
 __ZNK11IOCatalogue9MetaClass5allocEv
 __ZNK11IOCatalogue9serializeEP11OSSerialize
@@ -1314,7 +1281,6 @@ __ZNK9IOService6isOpenEPKS_
 __ZNK9IOService8getStateEv
 __ZNK9IOService9MetaClass5allocEv
 __ZNK9IOService9getClientEv
-__ZTV10IOMachPort
 __ZTV10IONotifier
 __ZTV10IOWorkLoop
 __ZTV11IOCatalogue
@@ -1366,7 +1332,6 @@ __ZTV29IOInterleavedMemoryDescriptor
 __ZTV8IOMapper
 __ZTV9IOCommand
 __ZTV9IOService
-__ZTVN10IOMachPort9MetaClassE
 __ZTVN10IONotifier9MetaClassE
 __ZTVN10IOWorkLoop9MetaClassE
 __ZTVN11IOCatalogue9MetaClassE
@@ -1667,3 +1632,94 @@ __ZN9IOService22registerInterruptBlockEiP8OSObjectU13block_pointerFvPS_iE
 __ZNK13IOEventSource14getActionBlockEU13block_pointerFivE
 __ZN13IOEventSource9setRefconEPv
 __ZNK13IOEventSource9getRefconEv
+
+__ZN8OSAction17SetAbortedHandlerEU13block_pointerFvvE
+
+__ZN15IODispatchQueue9metaClassE
+__ZN16IODispatchSource9metaClassE
+__ZN25IOInterruptDispatchSource9metaClassE
+__ZN9IOService5StartEPS_PFiP15OSMetaClassBase5IORPCE
+
+__ZN25IODataQueueDispatchSource10CopyMemoryEPP18IOMemoryDescriptorPFiP15OSMetaClassBase5IORPCE
+__ZN25IODataQueueDispatchSource10gMetaClassE
+__ZN25IODataQueueDispatchSource10superClassE
+__ZN25IODataQueueDispatchSource12DataServicedEP8OSActionPFiP15OSMetaClassBase5IORPCE
+__ZN25IODataQueueDispatchSource13DataAvailableEP8OSActionPFiP15OSMetaClassBase5IORPCE
+__ZN25IODataQueueDispatchSource15IsDataAvailableEv
+__ZN25IODataQueueDispatchSource16SendDataServicedEv
+__ZN25IODataQueueDispatchSource17SendDataAvailableEv
+__ZN25IODataQueueDispatchSource22SetDataServicedHandlerEP8OSActionPFiP15OSMetaClassBase5IORPCE
+__ZN25IODataQueueDispatchSource23CopyDataServicedHandlerEPP8OSActionPFiP15OSMetaClassBase5IORPCE
+__ZN25IODataQueueDispatchSource23SetDataAvailableHandlerEP8OSActionPFiP15OSMetaClassBase5IORPCE
+__ZN25IODataQueueDispatchSource24CopyDataAvailableHandlerEPP8OSActionPFiP15OSMetaClassBase5IORPCE
+__ZN25IODataQueueDispatchSource4PeekEU13block_pointerFvPKvmE
+__ZN25IODataQueueDispatchSource4freeEv
+__ZN25IODataQueueDispatchSource4initEv
+__ZN25IODataQueueDispatchSource6CreateEyP15IODispatchQueuePPS_
+__ZN25IODataQueueDispatchSource7DequeueEU13block_pointerFvPKvmE
+__ZN25IODataQueueDispatchSource7EnqueueEjU13block_pointerFvPvmE
+__ZN25IODataQueueDispatchSource8DispatchE5IORPC
+__ZN25IODataQueueDispatchSource9MetaClass8DispatchE5IORPC
+__ZN25IODataQueueDispatchSource9MetaClassC1Ev
+__ZN25IODataQueueDispatchSource9MetaClassC2Ev
+__ZN25IODataQueueDispatchSource9_DispatchEPS_5IORPC
+__ZN25IODataQueueDispatchSource9metaClassE
+__ZN25IODataQueueDispatchSourceC1EPK11OSMetaClass
+__ZN25IODataQueueDispatchSourceC1Ev
+__ZN25IODataQueueDispatchSourceC2EPK11OSMetaClass
+__ZN25IODataQueueDispatchSourceC2Ev
+__ZN25IODataQueueDispatchSourceD0Ev
+__ZN25IODataQueueDispatchSourceD1Ev
+__ZN25IODataQueueDispatchSourceD2Ev
+__ZNK25IODataQueueDispatchSource12getMetaClassEv
+__ZNK25IODataQueueDispatchSource9MetaClass5allocEv
+__ZTV25IODataQueueDispatchSource
+__ZTVN25IODataQueueDispatchSource9MetaClassE
+__ZN25IODataQueueDispatchSource19DequeueWithCoalesceEPbU13block_pointerFvPKvmE
+__ZN25IODataQueueDispatchSource19EnqueueWithCoalesceEjPbU13block_pointerFvPvmE
+
+__ZN11IOMemoryMap17_CopyState_InvokeE5IORPCP15OSMetaClassBasePFiS2_P23IOMemoryMapPrivateStateE
+__ZN12IOUserClient22AsyncCompletion_InvokeE5IORPCP15OSMetaClassBasePFvS2_P8OSActioniPKyjE
+__ZN12IOUserClient22_ExternalMethod_InvokeE5IORPCP15OSMetaClassBasePFiS2_yPKyjP6OSDataP18IOMemoryDescriptorPyPjyPS6_S8_P8OSActionE
+__ZN12IOUserClient30CopyClientMemoryForType_InvokeE5IORPCP15OSMetaClassBasePFiS2_yPyPP18IOMemoryDescriptorE
+__ZN12IOUserServer11Exit_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcE
+__ZN12IOUserServer13Create_InvokeE5IORPCPFiPKcyyPPS_E
+__ZN12IOUserServer17LoadModule_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcE
+__ZN15IODispatchQueue13Create_InvokeE5IORPCPFiPKcyyPPS_E
+__ZN15IODispatchQueue14SetPort_InvokeE5IORPCP15OSMetaClassBasePFiS2_P8ipc_portE
+__ZN16IODispatchSource13Cancel_InvokeE5IORPCP15OSMetaClassBasePFiS2_U13block_pointerFvvEE
+__ZN16IODispatchSource16SetEnable_InvokeE5IORPCP15OSMetaClassBasePFiS2_bE
+__ZN16IODispatchSource19CheckForWork_InvokeE5IORPCP15OSMetaClassBasePFiS2_S0_bE
+__ZN16IODispatchSource30SetEnableWithCompletion_InvokeE5IORPCP15OSMetaClassBasePFiS2_bU13block_pointerFvvEE
+__ZN18IOMemoryDescriptor17_CopyState_InvokeE5IORPCP15OSMetaClassBasePFiS2_P16IOMDPrivateStateE
+__ZN18IOMemoryDescriptor20PrepareForDMA_InvokeE5IORPCP15OSMetaClassBasePFiS2_yP9IOServiceyyPyS5_PjP16IOAddressSegmentE
+__ZN24IOBufferMemoryDescriptor13Create_InvokeE5IORPCPFiyyyPPS_E
+__ZN24IOBufferMemoryDescriptor16SetLength_InvokeE5IORPCP15OSMetaClassBasePFiS2_yE
+__ZN25IODataQueueDispatchSource13Create_InvokeE5IORPCPFiyP15IODispatchQueuePPS_E
+__ZN25IODataQueueDispatchSource17CopyMemory_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP18IOMemoryDescriptorE
+__ZN25IODataQueueDispatchSource19DataServiced_InvokeE5IORPCP15OSMetaClassBasePFvS2_P8OSActionE
+__ZN25IODataQueueDispatchSource20DataAvailable_InvokeE5IORPCP15OSMetaClassBasePFvS2_P8OSActionE
+__ZN25IODataQueueDispatchSource29SetDataServicedHandler_InvokeE5IORPCP15OSMetaClassBasePFiS2_P8OSActionE
+__ZN25IODataQueueDispatchSource30CopyDataServicedHandler_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP8OSActionE
+__ZN25IODataQueueDispatchSource30SetDataAvailableHandler_InvokeE5IORPCP15OSMetaClassBasePFiS2_P8OSActionE
+__ZN25IODataQueueDispatchSource31CopyDataAvailableHandler_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP8OSActionE
+__ZN25IOInterruptDispatchSource13Create_InvokeE5IORPCPFiP9IOServicejP15IODispatchQueuePPS_E
+__ZN25IOInterruptDispatchSource17SetHandler_InvokeE5IORPCP15OSMetaClassBasePFiS2_P8OSActionE
+__ZN25IOInterruptDispatchSource24InterruptOccurred_InvokeE5IORPCP15OSMetaClassBasePFvS2_P8OSActionyyE
+__ZN8OSAction13Create_InvokeE5IORPCPFiP8OSObjectyymPPS_E
+__ZN8OSObject23SetDispatchQueue_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcP15IODispatchQueueE
+__ZN8OSObject24CopyDispatchQueue_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcPP15IODispatchQueueE
+__ZN9IOService11Stop_InvokeE5IORPCP15OSMetaClassBasePFiS2_PS_E
+__ZN9IOService12Start_InvokeE5IORPCP15OSMetaClassBasePFiS2_PS_E
+__ZN9IOService13Create_InvokeE5IORPCP15OSMetaClassBasePFiS2_PS_PKcPS3_E
+__ZN9IOService14SetName_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcE
+__ZN9IOService20NewUserClient_InvokeE5IORPCP15OSMetaClassBasePFiS2_jPP12IOUserClientE
+__ZN9IOService20SetPowerState_InvokeE5IORPCP15OSMetaClassBasePFiS2_jE
+__ZN9IOService20SetProperties_InvokeE5IORPCP15OSMetaClassBasePFiS2_P12OSDictionaryE
+__ZN9IOService21CopyProperties_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP12OSDictionaryE
+__ZN9IOService21SearchProperty_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcS4_yPP8OSObjectE
+__ZN9IOService22RegisterService_InvokeE5IORPCP15OSMetaClassBasePFiS2_E
+__ZN9IOService23ChangePowerState_InvokeE5IORPCP15OSMetaClassBasePFiS2_jE
+__ZN9IOService25GetRegistryEntryID_InvokeE5IORPCP15OSMetaClassBasePFiS2_PyE
+
+__ZN18IOMemoryDescriptor20CreateMapping_InvokeE5IORPCP15OSMetaClassBasePFiS2_yyyyyPP11IOMemoryMapE
index d53a169a51a23489fda2a53ea52fbb5c57be255d..721f17eb72e50fc82431165305aa4eda986db05f 100644 (file)
@@ -1,3 +1,7 @@
+__ZN11IOCatalogue10addDriversEP7OSArrayb
+__ZN11IOCatalogue13removeDriversEP12OSDictionaryb
+__ZN11IOCatalogue13startMatchingEP12OSDictionary
+
 _IOLockSleep_darwin14
 _IOLockSleepDeadline_darwin14
 _IOLockWakeup_darwin14
@@ -225,6 +229,7 @@ __ZN18IOMemoryDescriptor10writeBytesEyPKvy
 __ZN18IOMemoryDescriptor11makeMappingEPS_P4taskyjyy
 __ZN18IOMemoryDescriptor11withAddressEPvyj
 __ZN18IOMemoryDescriptor11withOptionsEPvjjP4taskjP8IOMapper
+__ZN18IOMemoryDescriptor12setOwnershipEP4taskij
 __ZN18IOMemoryDescriptor12setPurgeableEjPj
 __ZN18IOMemoryDescriptor13getPageCountsEPyS0_
 __ZN18IOMemoryDescriptor15initWithOptionsEPvjjP4taskjP8IOMapper
@@ -296,6 +301,7 @@ __ZN21IONaturalMemoryCursor17withSpecificationEyyy
 __ZN21IONaturalMemoryCursor21initWithSpecificationEyyy
 __ZN21IOSubMemoryDescriptor11makeMappingEP18IOMemoryDescriptorP4taskyjyy
 __ZN21IOSubMemoryDescriptor12initSubRangeEP18IOMemoryDescriptoryyj
+__ZN21IOSubMemoryDescriptor12setOwnershipEP4taskij
 __ZN21IOSubMemoryDescriptor12setPurgeableEjPj
 __ZN21IOSubMemoryDescriptor12withSubRangeEP18IOMemoryDescriptoryyj
 __ZN21IOSubMemoryDescriptor18getPhysicalSegmentEyPyj
@@ -309,6 +315,7 @@ __ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource4Ev
 __ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource5Ev
 __ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource6Ev
 __ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource7Ev
+__ZN23IOMultiMemoryDescriptor12setOwnershipEP4taskij
 __ZN23IOMultiMemoryDescriptor15withDescriptorsEPP18IOMemoryDescriptorjjb
 __ZN23IOMultiMemoryDescriptor19initWithDescriptorsEPP18IOMemoryDescriptorjjb
 __ZN23IOMultiMemoryDescriptor7prepareEj
@@ -340,6 +347,7 @@ __ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor15Ev
 __ZN24IOBufferMemoryDescriptor9setLengthEm
 __ZN24IOBufferMemoryDescriptor9withBytesEPKvmjb
 __ZN25IOGeneralMemoryDescriptor11wireVirtualEj
+__ZN25IOGeneralMemoryDescriptor12setOwnershipEP4taskij
 __ZN25IOGeneralMemoryDescriptor12setPurgeableEjPj
 __ZN25IOGeneralMemoryDescriptor15initWithOptionsEPvjjP4taskjP8IOMapper
 __ZN25IOGeneralMemoryDescriptor18getPhysicalSegmentEyPyj
index ab47a9396799cf5d2ab83def98d81a030ab16e5b..0be20457d7e478263c9cef5e11c5c62dba7f90f2 100644 (file)
@@ -1,5 +1,5 @@
 _OSAddAtomic64
 _OSCompareAndSwap64
-__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvE
+__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvEm
 __ZN12OSOrderedSet12withCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_
 __ZN12OSOrderedSet16initWithCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_
index 40f33219b2843ba86052d0241c67a3304d99d8c9..d2575ff18d0eda3194bbea28f980cb9f635c2c2a 100644 (file)
@@ -1,6 +1,6 @@
 _OSAddAtomic64
 _OSCompareAndSwap64
 _PAGE_SHIFT_CONST
-__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvE
+__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvEm
 __ZN12OSOrderedSet12withCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_
 __ZN12OSOrderedSet16initWithCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_
index e5f0479288ac15248b6224cec6487044f73f4b3a..735ea69c3877bb701b547f12cd6e18cd44566f9f 100644 (file)
@@ -56,6 +56,8 @@ _SHA512_Init
 _SHA512_Update
 _STRDUP
 __Z13OSUnserializePKcPP8OSString
+__Z13OSUnserializePKcPN2os9smart_ptrI8OSString15osobject_policyEE
+__Z16OSUnserializeXMLPKcPN2os9smart_ptrI8OSString15osobject_policyEE
 __Z16OSUnserializeXMLPKcPP8OSString
 __Z16OSUnserializeXMLPKcmPP8OSString
 __ZN10OSIterator10gMetaClassE
@@ -88,6 +90,7 @@ __ZN11OSMetaClassD2Ev
 __ZN11OSMetaClassdlEPvm
 __ZN11OSMetaClassnwEm
 __ZN11OSSerialize10gMetaClassE
+__ZN11OSSerialize10setIndexedEb
 __ZN11OSSerialize10superClassE
 __ZN11OSSerialize12addXMLEndTagEPKc
 __ZN11OSSerialize12withCapacityEj
@@ -697,12 +700,14 @@ _os_log_debug_enabled
 _os_log_info_enabled
 _os_release
 _os_retain
-_os_ref_init_count
-_os_ref_retain
-_os_ref_release_explicit
-_os_ref_retain_try
-_os_ref_retain_locked
-_os_ref_release_locked
+_os_ref_init_count_external:_os_ref_init_count_internal
+_os_ref_release_barrier_external:_os_ref_release_barrier_internal
+_os_ref_release_external:_os_ref_release_internal
+_os_ref_release_locked_external:_os_ref_release_locked_internal
+_os_ref_release_relaxed_external:_os_ref_release_relaxed_internal
+_os_ref_retain_external:_os_ref_retain_internal
+_os_ref_retain_locked_external:_os_ref_retain_locked_internal
+_os_ref_retain_try_external:_os_ref_retain_try_internal
 _osrelease
 _ostype
 _page_mask
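
Two conventions in these .exports hunks deserve a note. First, a line of the form _A:_B exports the left-hand name as an alias resolved to the right-hand implementation symbol (compare _IOSimpleLockLock:_lck_spin_lock earlier in this diff). Second, the os_ref block just above uses that aliasing to split the refcounting KPI into _external entry points backed by _internal implementations, while the os/refcnt.h programming interface keeps its unsuffixed names. A minimal caller-side sketch, assuming os/refcnt.h is available to the calling code; my_obj and my_obj_free are hypothetical:

    /* Sketch: typical os_ref usage; the exports aliases above decide
     * which implementation the unsuffixed API ultimately links against. */
    #include <os/refcnt.h>

    struct my_obj {
            struct os_refcnt ref;
            /* ... payload ... */
    };

    extern void my_obj_free(struct my_obj *o);  /* hypothetical destructor */

    static void
    my_obj_init(struct my_obj *o)
    {
            os_ref_init(&o->ref, NULL);     /* count starts at 1, no refgrp */
    }

    static void
    my_obj_retain(struct my_obj *o)
    {
            os_ref_retain(&o->ref);
    }

    static void
    my_obj_release(struct my_obj *o)
    {
            /* os_ref_release returns the new count; 0 means last reference */
            if (os_ref_release(&o->ref) == 0) {
                    my_obj_free(o);
            }
    }
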
@@ -765,6 +770,34 @@ __NSConcreteGlobalBlock
 __NSConcreteMallocBlock
 __NSConcreteStackBlock
 __NSConcreteWeakBlockVariable
+__ZN12OSCollection14iterateObjectsEPvPFbS0_P8OSObjectE
 __ZN12OSCollection14iterateObjectsEU13block_pointerFbP8OSObjectE
+__ZN12OSDictionary14iterateObjectsEPvPFbS0_PK8OSSymbolP8OSObjectE
 __ZN12OSDictionary14iterateObjectsEU13block_pointerFbPK8OSSymbolP8OSObjectE
 __ZN12OSSerializer9withBlockEU13block_pointerFbP11OSSerializeE
+
+__ZN15IODispatchQueue8DispatchE5IORPC
+__ZN15IODispatchQueue9MetaClass8DispatchE5IORPC
+__ZN15OSMetaClassBase8DispatchE5IORPC
+__ZN15OSUserMetaClass8DispatchE5IORPC
+__ZN16IODispatchSource8DispatchE5IORPC
+__ZN16IODispatchSource9MetaClass8DispatchE5IORPC
+__ZN18IOMemoryDescriptor8DispatchE5IORPC
+__ZN18IOMemoryDescriptor9MetaClass8DispatchE5IORPC
+__ZN24IOBufferMemoryDescriptor8DispatchE5IORPC
+__ZN24IOBufferMemoryDescriptor9MetaClass8DispatchE5IORPC
+__ZN25IOInterruptDispatchSource8DispatchE5IORPC
+__ZN25IOInterruptDispatchSource9MetaClass8DispatchE5IORPC
+__ZN8OSAction8DispatchE5IORPC
+__ZN8OSAction9MetaClass8DispatchE5IORPC
+__ZN8OSObject8DispatchE5IORPC
+__ZN8OSObject9MetaClass8DispatchE5IORPC
+__ZN9IOService8DispatchE5IORPC
+__ZN9IOService9MetaClass8DispatchE5IORPC
+__ZN8OSAction9metaClassE
+__ZN15OSMetaClassBase6InvokeE5IORPC
+__ZN8OSObject9_DispatchEPS_5IORPC
+__ZN9IOService9_DispatchEPS_5IORPC
+
+__ZN8OSAction12GetReferenceEv
+__ZN8OSAction6CreateEP8OSObjectyymPPS_
index 9ea8e005a624d7ec95ec0e0914e4a07edc4658a9..48690ba94f8c380341a1acd08ef785a0e7e3c358 100644 (file)
@@ -44,7 +44,6 @@ __ZN12OSOrderedSet22_RESERVEDOSOrderedSet4Ev
 __ZN12OSOrderedSet22_RESERVEDOSOrderedSet5Ev
 __ZN12OSOrderedSet22_RESERVEDOSOrderedSet6Ev
 __ZN12OSOrderedSet22_RESERVEDOSOrderedSet7Ev
-__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase3Ev
 __ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase4Ev
 __ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase5Ev
 __ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase6Ev
index e594b265f3865253529339cd5003441264d528ae..e274ed3e14d7ced158ad61f345caf27c87c8428c 100644 (file)
@@ -18,10 +18,16 @@ _sbuf_cat
 _sbuf_data
 _sbuf_delete
 _sbuf_finish
+_sbuf_len
 _sbuf_new
+_sbuf_overflowed
 _sbuf_printf
 _sbuf_putc
+_sbuf_vprintf
 _strsep
 _sysctl__security_mac_children
 _VNOP_SETXATTR
 _VNOP_GETXATTR
+_mac_vnode_label_allocate
+_mac_vnode_label_get
+_mac_vnode_label_set
index a4b109d1139d7824ec2934661134b18fbafedaa0..f0900b345134faa0d5a5c07a07c90b63c66c8e15 100644 (file)
@@ -5,7 +5,7 @@
 #
 # All rights reserved.  The CMU software License Agreement
 # specifies the terms and conditions for use and redistribution.
-#  
+#
 #######################################################################
 #
 #      Master machine independent configuration file.
@@ -48,7 +48,7 @@
 #      medium = medium scale system configuration
 #      small  = small scale system configuration
 #      xsmall = extra small scale system configuration
-#      bsmall = special extra small scale system configuration 
+#      bsmall = special extra small scale system configuration
 #
 #######################################################################
 #
@@ -68,13 +68,14 @@ options     MACH_NP         # Mach IPC support              # <np>
 options                MACH_NBC        # No buffer cache               # <nbc>
 options                MACH_NET        # Fast network access           # <mach_net>
 options                MACH_XP         # external pager support        # <xp>
-options                NO_DIRECT_RPC   # for untyped mig servers       # 
+options                NO_DIRECT_RPC   # for untyped mig servers       #
 options                LOOP            # loopback support              # <loop>
 options                VLAN            #                               # <vlan>
+options                SIXLOWPAN       # 6LoWPAN support               # <sixlowpan>
 options                BOND            #                               # <bond>
 options                IF_FAKE         #                               # <if_fake>
+options                IF_HEADLESS     #                               # <if_headless>
 options                AH_ALL_CRYPTO   # AH all crypto algs            # <ah_all_crypto>
-options                IPCOMP_ZLIB     # IP compression using zlib     # <ipcomp_zlib>
 options                PF              # Packet Filter                 # <pf>
 options                PF_ECN          # PF use ECN marking            # <pf_ecn>
 options                PFLOG           # PF log interface              # <pflog>
@@ -96,30 +97,32 @@ options             FLOW_DIVERT                                                             # <flow_divert>
 options                NECP                                                                    # <necp>
 options                CONTENT_FILTER  #                                               # <content_filter>
 options        PACKET_MANGLER  #                                               # <packet_mangler>
-
+options        SIXLOWPAN       #               # <sixlowpan>
 # secure_kernel - secure kernel from user programs
-options     SECURE_KERNEL       # <secure_kernel> 
+options     SECURE_KERNEL       # <secure_kernel>
 
 options     OLD_SEMWAIT_SIGNAL  # old semwait_signal handler
 
 #
-#      4.4 general kernel 
+#      4.4 general kernel
 #
 options                SOCKETS         # socket support                # <inet, inet6>
 options        DIAGNOSTIC      # diagnostics                   # <diagnostic>
-options                GPROF           # build profiling               # <profile>
 options                PROFILE         # kernel profiling              # <profile>
 options                SENDFILE        # sendfile                                      # <sendfile>
 options                NETWORKING      # networking layer              # <inet, inet6>
 options                CONFIG_FSE      # file system events            # <config_fse>
 options                CONFIG_IMAGEBOOT        # local image boot      # <config_imageboot>
+options                CONFIG_LOCKERBOOT       # locker boot   # <config_lockerboot>
 options                CONFIG_MBUF_JUMBO       # jumbo cluster pool    # <config_mbuf_jumbo>
+options                CONFIG_IMAGEBOOT_IMG4   # authenticate image with AppleImage4   # <config_imageboot_img4>
+options                CONFIG_IMAGEBOOT_CHUNKLIST      # authenticate image with a chunk list  # <config_imageboot_chunklist>
 
 options                CONFIG_WORKQUEUE        # <config_workqueue>
 options                CONFIG_WORKLOOP_DEBUG   # <config_workloop_debug>
 
 #
-#      4.4 filesystems 
+#      4.4 filesystems
 #
 options                MOCKFS          # Boot from an executable       # <mockfs>
 options                FIFO            # fifo support                  # <fifo>
@@ -143,21 +146,28 @@ options           CONFIG_EXT_RESOLVER # e.g. memberd              # <config_ext_resolver>
 options                CONFIG_SEARCHFS # searchfs syscall support      # <config_searchfs>
 options                CONFIG_MNT_SUID # allow suid binaries  # <config_mnt_suid>
 options                CONFIG_MNT_ROOTSNAP # allow rooting from snapshot # <config_mnt_rootsnap>
+options        CONFIG_ROSV_STARTUP # allow read-only system volume startup # <config_rosv_startup>
+options                CONFIG_FIRMLINKS # support "firmlinks" # <config_firmlinks>
+options        CONFIG_MOUNT_VM # mount VM volume on startup # <config_mount_vm>
+options                CONFIG_DATALESS_FILES # support dataless file materialization # <config_dataless_files>
 
 #
 # NFS support
 #
 options                NFSCLIENT       # Be an NFS client              # <nfsclient>
 options                NFSSERVER       # Be an NFS server              # <nfsserver>
+options                CONFIG_NFS_GSS  # Support NFS GSSAPI            # <config_nfs_gss>
+options                CONFIG_NFS4     # Use NFSv4                     # <config_nfs4>
+options                CONFIG_NETBOOT  # network booting (requires NFSCLIENT) # <config_netboot>
 
 #
 # Machine Independent Apple Features
 #
 profile                                # build a profiling kernel      # <profile>
 
-#       
+#
 # IPv6 Support
-#       
+#
 options         "INET6"         # kernel IPv6 Support           # <inet6>
 options         IPV6SEND       # Secure Neighbor Discovery     # <ipv6send>
 options         IPSEC           # IP security                  # <ipsec>
@@ -177,26 +187,27 @@ options                   ENCRYPTED_SWAP                  # <encrypted_swap>
 options                        CONFIG_IMG4                     # <config_img4>
 
 options                ZLIB    # inflate/deflate support       # <zlib>
+options                ZLIBC   # inflate/deflate support       # <zlibc>
 
 options                IF_BRIDGE                               # <if_bridge>
 
 #
-#  configurable kernel event related resources 
+#  configurable kernel event related resources
 #
 options   CONFIG_KN_HASHSIZE=64                # <medium,large,xlarge>
 options   CONFIG_KN_HASHSIZE=48                # <small,xsmall>
 options   CONFIG_KN_HASHSIZE=20                # <bsmall>
 
 #
-#  configurable vfs related resources 
-#  CONFIG_VNODES - used to pre allocate vnode related resources 
+#  configurable vfs related resources
+#  CONFIG_VNODES - used to preallocate vnode related resources
 #  CONFIG_NC_HASH - name cache hash table allocation
 #  CONFIG_VFS_NAMES - name strings
 #
-#  263168 magic number for medium CONFIG_VNODES is based on memory 
-#  Number vnodes  is (memsize/64k) + 1024 
+#  263168 magic number for medium CONFIG_VNODES is based on memory
+#  Number of vnodes is (memsize/64k) + 1024
 #  This is the calculation that is used by launchd in Tiger
-#  we are clipping the max based on 16G 
+#  we are clipping the max based on 16G
 #  i.e. ((16*1024*1024*1024)/(64*1024)) + 1024 = 263168
 
 options   CONFIG_VNODES=263168         # <large,xlarge>
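
The sizing comment above can be sanity-checked with a few lines of C; an illustrative sketch (the helper name and cap constant are ours, not xnu's):

    /* CONFIG_VNODES sizing per the comment above:
     * vnodes = memsize/64K + 1024, clipped at the value computed for 16GB. */
    #include <stdint.h>

    #define VNODE_CAP  ((16ULL << 30) / (64 * 1024) + 1024)    /* = 263168 */

    static uint64_t
    vnode_count_for_memsize(uint64_t memsize_bytes)
    {
            uint64_t n = memsize_bytes / (64 * 1024) + 1024;
            return n > VNODE_CAP ? VNODE_CAP : n;
    }
    /* 8GB  -> 131072 + 1024 = 132096
     * 16GB -> 262144 + 1024 = 263168, the magic number above */
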
@@ -218,7 +229,7 @@ options   CONFIG_MAX_CLUSTERS=8             # <xlarge,large,medium>
 options   CONFIG_MAX_CLUSTERS=4                # <small,xsmall,bsmall>
 
 #
-#  configurable options for minumum number of buffers for kernel memory 
+#  configurable options for minimum number of buffers for kernel memory
 #
 options   CONFIG_MIN_NBUF=256          # <medium,large,xlarge>
 options   CONFIG_MIN_NBUF=128          # <small>
@@ -249,7 +260,7 @@ options CONFIG_ICMP_BANDLIM=250             # <medium,large,xlarge>
 options CONFIG_ICMP_BANDLIM=50         # <xsmall,small,bsmall>
 
 #
-#  configurable async IO options 
+#  configurable async IO options
 #  CONFIG_AIO_MAX - system wide limit of async IO requests.
 #  CONFIG_AIO_PROCESS_MAX - process limit of async IO requests.
 #  CONFIG_AIO_THREAD_COUNT - number of async IO worker threads created.
@@ -317,7 +328,7 @@ options   CONFIG_EMBEDDED                   # <config_embedded>
 #
 options                CONFIG_DYNAMIC_CODE_SIGNING     # <dynamic_codesigning>
 
-# enforce library validation on all processes. 
+# enforce library validation on all processes.
 #
 options                CONFIG_ENFORCE_LIBRARY_VALIDATION  # <config_library_validation>
 
@@ -334,6 +345,11 @@ options            CONFIG_PROTECT                  # <config_protect>
 #allow write-protection of key page
 options                CONFIG_KEYPAGE_WP               # <config_keypage_wp>
 
+#
+# allow vm_pageout_scan to dynamically adjust its priority based on priorities of waiters
+#
+options                CONFIG_VPS_DYNAMIC_PRIO         # <vps_dynamic_prio>
+
 #
 # enable per-process memory priority tracking
 #
@@ -408,7 +424,7 @@ options             CONFIG_PROC_UUID_POLICY         # <config_proc_uuid_policy>
 
 #
 # ECC data logging
-# 
+#
 options                CONFIG_ECC_LOGGING              # <config_ecc_logging>
 
 #
@@ -526,6 +542,7 @@ options             CONFIG_MACF_SOCKET_SUBSET       # MAC socket subset (no labels) # <config_mac
 
 options                CONFIG_AUDIT        # Kernel auditing       # <config_audit>
 
+options                CONFIG_ARCADE           # Arcade validation support     # <config_arcade>
 
 #
 # MACH configuration options.
@@ -582,20 +599,12 @@ options           CONFIG_TASK_ZONE_INFO           # <task_zone_info>
 # available when the kernel is being debugged.
 #
 options                CONFIG_DEBUGGER_FOR_ZONE_INFO   # <debugger_for_zone_info>
-# 
-# XPR_DEBUG enables the gathering of data through the XPR macros inserted
-#      into various subsystems. This option is normally only enabled for
-#      specific performance or behavior studies, as the overhead in both
-#      code and data space is large. The data is normally retrieved through
-#      the kernel debugger (kdb) or by reading /dev/kmem.
-#      
-options                XPR_DEBUG       #               # <debug>
-# 
+#
 # MACH_LDEBUG controls the internal consistency checks and
 #      data gathering in the locking package. This also enables a debug-only
 #      version of simple-locks on uniprocessor machines. The code size and
 #      performance impact of this option is significant.
-# 
+#
 options                MACH_LDEBUG     #               # <debug>
 
 #
@@ -621,7 +630,7 @@ options     KPC                    # <kpc>
 options     PGO                    # <pgo>
 
 # MACH_COUNTERS enables code that handles various counters in the system.
-# 
+#
 options                MACH_COUNTERS   #                           # <debug>
 
 # DEVELOPMENT define for development builds
@@ -629,6 +638,7 @@ options             DEVELOPMENT     # dev kernel                # <development>
 
 # DEBUG kernel
 options                DEBUG           # general debugging code    # <debug>
+options                CONFIG_NONFATAL_ASSERTS # non-fatal asserts     # <softasserts>
 
 ##########################################################
 #
@@ -653,7 +663,7 @@ options             MACH_BSD        # BSD subsystem on top of Mach  # <mach_bsd>
 options         IOKIT          #                               # <iokit>
 
 #
-#  configurable kernel related resources (CONFIG_THREAD_MAX needs to stay in 
+#  configurable kernel related resources (CONFIG_THREAD_MAX needs to stay in
 #  sync with bsd/conf/MASTER until we fix the config system... todo XXX
 #
 options   CONFIG_THREAD_MAX=2560               # <medium,large,xlarge>
@@ -669,8 +679,8 @@ options   CONFIG_TASK_MAX=512                       # <xsmall,bsmall>
 #
 options   CONFIG_ZONE_MAP_MIN=120586240        # <xsmall,bsmall,small,medium,large,xlarge>
 
-# Sizes must be a power of two for the zhash to 
-# be able to just mask off bits instead of mod 
+# Sizes must be a power of two for the zhash to
+# be able to just mask off bits instead of mod
 options          CONFIG_ZLEAK_ALLOCATION_MAP_NUM=16384 #<medium,large,xlarge>
 options          CONFIG_ZLEAK_ALLOCATION_MAP_NUM=8192  #<small,xsmall,bsmall>
 options   CONFIG_ZLEAK_TRACE_MAP_NUM=8192 #<medium,large,xlarge>
@@ -688,6 +698,7 @@ options             CONFIG_SCHED_GRRR               # <config_sched_grrr>
 options                CONFIG_SCHED_GRRR_CORE          # <config_sched_grrr>
 options                CONFIG_SCHED_MULTIQ             # <config_sched_multiq>
 options                CONFIG_SCHED_TIMESHARE_CORE     # <config_sched_traditional,config_sched_multiq>
+options                CONFIG_CLUTCH                   # <config_clutch>
 
 options                CONFIG_SCHED_IDLE_IN_PLACE      # <config_sched_idle_in_place>
 options                CONFIG_SCHED_SFI                # <config_sched_sfi>
@@ -754,7 +765,7 @@ options             CONFIG_REQUIRES_U32_MUNGING     # incoming U32 argument structures must be
 options                COPYOUT_SHIM                    # Shim for copyout memory analysis via kext #<copyout_shim>
 
 #
-# Enable hardware correlation of mach absolute time 
+# Enable hardware correlation of mach absolute time
 # across intel/arm boundary
 options                CONFIG_MACH_BRIDGE_SEND_TIME #  # <config_mach_bridge_send_time>
 options                CONFIG_MACH_BRIDGE_RECV_TIME #  # <config_mach_bridge_recv_time>
@@ -766,3 +777,11 @@ options            CONFIG_32BIT_TELEMETRY # # <config_32bit_telemetry>
 
 options                CONFIG_QUIESCE_COUNTER # Support for _COMM_PAGE_CPU_QUIESCENT_COUNTER # <config_quiesce_counter>
 
+#
+# Sanitizers
+#
+options                CONFIG_KASAN            # <config_kasan>
+options                CONFIG_UBSAN            # <config_ubsan>
+options                CONFIG_KSANCOV          # <config_ksancov>
+
+pseudo-device ksancov 1 init ksancov_init_dev # <config_ksancov>
index d463ad18982e88ec981b6ba48a06659d0f3c9a82..6cc8a1b522ea3dac4dffa318a65bf1aec5cb1e54 100644 (file)
@@ -5,14 +5,14 @@
 #
 # All rights reserved.  The CMU software License Agreement
 # specifies the terms and conditions for use and redistribution.
-#  
+#
 ######################################################################
 #
 #  Master Apple configuration file (see the master machine independent
 #  configuration file for a description of the file format).
 #
 ######################################################################
-#  
+#
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
 #  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ]
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ]
-#  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy ]
+#  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_imageboot config_imageboot_img4 ]
 #  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
-#  BSD_DEV =        [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
-#  BSD_DEBUG =      [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
-#  FILESYS_BASE =   [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ]
+#  BSD_DEV =        [ BSD_BASE config_imgsrc_access config_lockerboot config_coredump pgo config_vnguard ]
+#  BSD_DEBUG =      [ BSD_BASE config_imgsrc_access config_lockerboot config_coredump pgo config_vnguard ]
+#  FILESYS_BASE =   [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs namedstreams ]
 #  FILESYS_RELEASE= [ FILESYS_BASE ]
 #  FILESYS_DEV =    [ FILESYS_BASE fdesc ]
 #  FILESYS_DEBUG =  [ FILESYS_BASE fdesc ]
-#  NFS =            [ nfsclient nfsserver ]
-#  NETWORKING =     [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto if_fake ]
+#  NFS_DEV =        [ nfsclient nfsserver config_nfs_gss ]
+#  NETWORKING =     [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto if_fake sixlowpan ]
+#  NETWORKING_RELEASE = [ NETWORKING ]
+#  NETWORKING_DEV = [ NETWORKING_RELEASE if_headless ]
+#  NETWORKING_DEBUG = [ NETWORKING_DEV ]
 #  VPN =            [ ipsec flow_divert necp content_filter ]
-#  PF =             [ pf ]
+#  PF_RELEASE =     [ pf ]
+#  PF_DEV =         [ PF_RELEASE pflog ]
+#  PF_DEBUG =       [ PF_DEV ]
 #  MULTIPATH =      [ multipath mptcp ]
 #  IOKIT_BASE =     [ iokit iokitcpp no_kextd no_kernel_hid config_sleep ]
 #  IOKIT_RELEASE =  [ IOKIT_BASE ]
@@ -49,7 +54,7 @@
 #  MACH_RELEASE =   [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ]
 #  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ]
 #  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ]
-#  SCHED_BASE =     [ config_sched_traditional config_sched_multiq ]
+#  SCHED_BASE =     [ config_sched_traditional config_sched_multiq config_clutch ]
 #  SCHED_RELEASE =  [ SCHED_BASE ]
 #  SCHED_DEV =      [ SCHED_BASE ]
 #  SCHED_DEBUG =    [ SCHED_BASE config_sched_grrr config_sched_proto ]
@@ -58,9 +63,9 @@
 #  VM_DEV =         [ VM_BASE dynamic_codesigning ]
 #  VM_DEBUG =       [ VM_BASE dynamic_codesigning ]
 #  SECURITY =       [ config_macf ]
-#  RELEASE =        [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING PF MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ]
-#  DEVELOPMENT =    [ KERNEL_DEV     BSD_DEV     FILESYS_DEV NFS SKYWALK_DEV     NETWORKING PF MULTIPATH VPN IOKIT_DEV     LIBKERN_DEV     PERF_DBG_DEV     MACH_DEV     SCHED_DEV     VM_DEV     SECURITY ]
-#  DEBUG =          [ KERNEL_DEBUG   BSD_DEBUG   FILESYS_DEBUG   SKYWALK_DEBUG   NETWORKING PF MULTIPATH VPN IOKIT_DEBUG   LIBKERN_DEBUG   PERF_DBG_DEBUG   MACH_DEBUG   SCHED_DEBUG   VM_DEBUG   SECURITY ]
+#  RELEASE =        [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ]
+#  DEVELOPMENT =    [ KERNEL_DEV     BSD_DEV     FILESYS_DEV NFS_DEV SKYWALK_DEV     NETWORKING_DEV PF_DEV MULTIPATH VPN IOKIT_DEV     LIBKERN_DEV     PERF_DBG_DEV     MACH_DEV     SCHED_DEV     VM_DEV     SECURITY ]
+#  DEBUG =          [ KERNEL_DEBUG   BSD_DEBUG   FILESYS_DEBUG   SKYWALK_DEBUG   NETWORKING_DEBUG PF_DEBUG MULTIPATH VPN IOKIT_DEBUG   LIBKERN_DEBUG   PERF_DBG_DEBUG   MACH_DEBUG   SCHED_DEBUG   VM_DEBUG   SECURITY ]
 #
 ######################################################################
 #
index eadc388d6f22f39df390b4172a04414492a465ad..110f6a6d62421bba51e5f25adce05d6da7f98344 100644 (file)
@@ -5,14 +5,14 @@
 #
 # All rights reserved.  The CMU software License Agreement
 # specifies the terms and conditions for use and redistribution.
-#  
+#
 ######################################################################
 #
 #  Master Apple configuration file (see the master machine independent
 #  configuration file for a description of the file format).
 #
 ######################################################################
-#  
+#
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
 #  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ]
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ]
-#  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas ]
+#  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ]
 #  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
-#  BSD_DEV =        [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
-#  BSD_DEBUG =      [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
-#  FILESYS_BASE =   [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ]
+#  BSD_DEV =        [ BSD_BASE config_netboot config_imgsrc_access config_lockerboot config_coredump pgo config_vnguard ]
+#  BSD_DEBUG =      [ BSD_BASE config_netboot config_imgsrc_access config_lockerboot config_coredump pgo config_vnguard ]
+#  FILESYS_BASE =   [ devfs fifo fs_compression config_protect config_mnt_rootsnap config_triggers config_fse routefs namedstreams config_dataless_files ]
 #  FILESYS_RELEASE= [ FILESYS_BASE ]
 #  FILESYS_DEV =    [ FILESYS_BASE fdesc ]
 #  FILESYS_DEBUG =  [ FILESYS_BASE fdesc ]
-#  NFS =            [ nfsclient nfsserver ]
-#  NETWORKING =     [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto if_fake ]
+#  NFS_DEV =        [ nfsclient nfsserver config_nfs_gss ]
+#  NFS_RELEASE =    [ nfsclient ]
+#  NFS_DEBUG =      [ nfsclient config_nfs_gss ]
+#  NETWORKING =     [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto if_fake sixlowpan ]
 #  NETWORKING_RELEASE = [ NETWORKING ]
-#  NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler ]
+#  NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler if_headless ]
 #  NETWORKING_DEBUG = [ NETWORKING_DEV ]
 #  VPN =            [ ipsec flow_divert necp content_filter ]
-#  PF =             [ pf ]
+#  PF_RELEASE =     [ pf ]
+#  PF_DEV =         [ PF_RELEASE pflog ]
+#  PF_DEBUG =       [ PF_DEV ]
 #  MULTIPATH =      [ multipath mptcp ]
 #  IOKIT_BASE =     [ iokit iokitcpp no_kextd no_kernel_hid config_sleep ]
 #  IOKIT_RELEASE =  [ IOKIT_BASE ]
 #  MACH_RELEASE =   [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ]
 #  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ]
 #  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ]
-#  SCHED_BASE =     [ config_sched_traditional config_sched_multiq config_sched_deferred_ast ]
+#  SCHED_BASE =     [ config_sched_traditional config_sched_multiq config_sched_deferred_ast config_clutch ]
 #  SCHED_RELEASE =  [ SCHED_BASE ]
 #  SCHED_DEV =      [ SCHED_BASE ]
 #  SCHED_DEBUG =    [ SCHED_BASE config_sched_grrr config_sched_proto ]
-#  VM_BASE =        [ vm_pressure_events jetsam freeze memorystatus config_code_decryption phantom_cache config_secluded_memory config_background_queue config_cs_validation_bitmap]
+#  VM_BASE =        [ vps_dynamic_prio vm_pressure_events jetsam freeze memorystatus config_code_decryption phantom_cache config_secluded_memory config_background_queue config_cs_validation_bitmap ]
 #  VM_RELEASE =     [ VM_BASE ]
 #  VM_DEV =         [ VM_BASE dynamic_codesigning ]
 #  VM_DEBUG =       [ VM_BASE dynamic_codesigning ]
 #  SECURITY =       [ config_macf kernel_integrity ]
-#  RELEASE =        [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ]
-#  DEVELOPMENT =    [ KERNEL_DEV     BSD_DEV     FILESYS_DEV NFS SKYWALK_DEV     NETWORKING_DEV PF MULTIPATH VPN IOKIT_DEV     LIBKERN_DEV     PERF_DBG_DEV     MACH_DEV     SCHED_DEV     VM_DEV     SECURITY ]
-#  DEBUG =          [ KERNEL_DEBUG   BSD_DEBUG   FILESYS_DEBUG   SKYWALK_DEBUG   NETWORKING_DEBUG PF MULTIPATH VPN IOKIT_DEBUG   LIBKERN_DEBUG   PERF_DBG_DEBUG   MACH_DEBUG   SCHED_DEBUG   VM_DEBUG   SECURITY ]
-#  KASAN =          [ DEVELOPMENT ]
+#  RELEASE =        [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE NFS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ]
+#  DEVELOPMENT =    [ KERNEL_DEV     BSD_DEV     FILESYS_DEV NFS_DEV SKYWALK_DEV     NETWORKING_DEV PF_DEV MULTIPATH VPN IOKIT_DEV     LIBKERN_DEV     PERF_DBG_DEV     MACH_DEV     SCHED_DEV     VM_DEV     SECURITY ]
+#  DEBUG =          [ KERNEL_DEBUG   BSD_DEBUG   FILESYS_DEBUG NFS_DEBUG  SKYWALK_DEBUG   NETWORKING_DEBUG PF_DEBUG MULTIPATH VPN IOKIT_DEBUG   LIBKERN_DEBUG   PERF_DBG_DEBUG   MACH_DEBUG   SCHED_DEBUG   VM_DEBUG   SECURITY ]
+#  KASAN =          [ DEVELOPMENT config_kasan config_ubsan config_ksancov ]
 #
 ######################################################################
 #
index f6c35b27b975fb04fd51755ec71dbf8869e62fe7..73670d3d344370c6083bbb400dcb546b9c6ace99 100644 (file)
@@ -1,18 +1,18 @@
 #
 # Mach Operating System
 # Copyright (c) 1986 Carnegie-Mellon University
-# Copyright 2001-2016 Apple Inc.
+# Copyright 2001-2018 Apple Inc.
 #
 # All rights reserved.  The CMU software License Agreement
 # specifies the terms and conditions for use and redistribution.
-#  
+#
 ######################################################################
 #
 #  Master Apple configuration file (see the master machine independent
 #  configuration file for a description of the file format).
 #
 ######################################################################
-#  
+#
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ]
 #  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas ]
 #  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
-#  BSD_DEV =        [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
-#  BSD_DEBUG =      [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
-#  FILESYS_BASE =   [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ]
+#  BSD_DEV =        [ BSD_BASE config_netboot config_imageboot config_coredump pgo config_vnguard ]
+#  BSD_DEBUG =      [ BSD_BASE config_netboot config_imageboot config_coredump pgo config_vnguard ]
+#  FILESYS_BASE =   [ devfs fifo fs_compression config_protect config_mnt_rootsnap config_fse routefs namedstreams ]
 #  FILESYS_RELEASE= [ FILESYS_BASE ]
 #  FILESYS_DEV =    [ FILESYS_BASE fdesc ]
 #  FILESYS_DEBUG =  [ FILESYS_BASE fdesc ]
-#  NFS =            [ nfsclient nfsserver ]
+#  NFS_DEV =        [ nfsclient nfsserver config_nfs_gss ]
+#  NFS_RELEASE =    [ nfsclient ]
+#  NFS_DEBUG =      [ nfsclient config_nfs_gss ]
 #  NETWORKING =     [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto if_fake ]
 #  NETWORKING_RELEASE = [ NETWORKING ]
-#  NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler ]
+#  NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler if_headless ]
 #  NETWORKING_DEBUG = [ NETWORKING_DEV ]
 #  VPN =            [ ipsec flow_divert necp content_filter ]
 #  PF =             [ pf ]
@@ -40,7 +42,7 @@
 #  IOKIT_RELEASE =  [ IOKIT_BASE ]
 #  IOKIT_DEV =      [ IOKIT_BASE iokitstats iotracking ]
 #  IOKIT_DEBUG =    [ IOKIT_BASE iokitstats iotracking]
-#  LIBKERN_BASE =   [ libkerncpp config_kec_fips zlib crypto_sha2 ]
+#  LIBKERN_BASE =   [ libkerncpp config_blocks config_kec_fips zlib crypto_sha2 ]
 #  LIBKERN_RELEASE =[ LIBKERN_BASE ]
 #  LIBKERN_DEV =    [ LIBKERN_BASE iotracking ]
 #  LIBKERN_DEBUG =  [ LIBKERN_BASE iotracking ]
 #  VM_DEV =         [ VM_BASE dynamic_codesigning ]
 #  VM_DEBUG =       [ VM_BASE dynamic_codesigning ]
 #  SECURITY =       [ config_macf kernel_integrity ]
-#  RELEASE =        [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ]
-#  DEVELOPMENT =    [ KERNEL_DEV     BSD_DEV     FILESYS_DEV NFS SKYWALK_DEV     NETWORKING_DEV PF MULTIPATH VPN IOKIT_DEV     LIBKERN_DEV     PERF_DBG_DEV     MACH_DEV     SCHED_DEV     VM_DEV     SECURITY ]
-#  DEBUG =          [ KERNEL_DEBUG   BSD_DEBUG   FILESYS_DEBUG   SKYWALK_DEBUG   NETWORKING_DEBUG PF MULTIPATH VPN IOKIT_DEBUG   LIBKERN_DEBUG   PERF_DBG_DEBUG   MACH_DEBUG   SCHED_DEBUG   VM_DEBUG   SECURITY ]
-#  KASAN =          [ DEVELOPMENT ]
+#  RELEASE =        [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE NFS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ]
+#  DEVELOPMENT =    [ KERNEL_DEV     BSD_DEV     FILESYS_DEV NFS_DEV SKYWALK_DEV     NETWORKING_DEV PF MULTIPATH VPN IOKIT_DEV     LIBKERN_DEV     PERF_DBG_DEV     MACH_DEV     SCHED_DEV     VM_DEV     SECURITY ]
+#  DEBUG =          [ KERNEL_DEBUG   BSD_DEBUG   FILESYS_DEBUG NFS_DEBUG  SKYWALK_DEBUG   NETWORKING_DEBUG PF MULTIPATH VPN IOKIT_DEBUG   LIBKERN_DEBUG   PERF_DBG_DEBUG   MACH_DEBUG   SCHED_DEBUG   VM_DEBUG   SECURITY ]
+#  KASAN =          [ DEVELOPMENT config_kasan config_ubsan config_ksancov ]
 #
 ######################################################################
 #
index 66e7f98dee5f30b29cf8771df0b2ec36dc6b0cab..2e72d1d457ba6c02d2ff3226526cbe5fbf7ee46d 100644 (file)
@@ -5,14 +5,14 @@
 #
 # All rights reserved.  The CMU software License Agreement
 # specifies the terms and conditions for use and redistribution.
-#  
+#
 ######################################################################
 #
 #  Master Apple configuration file (see the master machine independent
 #  configuration file for a description of the file format).
 #
 ######################################################################
-#  
+#
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
 #  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ]
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ]
-#  BSD_BASE =       [ mach_bsd sysv_sem sysv_msg sysv_shm config_imageboot config_workqueue psynch config_proc_uuid_policy config_coredump pgo config_32bit_telemetry ]
+#  BSD_BASE =       [ mach_bsd sysv_sem sysv_msg sysv_shm config_netboot config_imageboot config_imageboot_chunklist config_workqueue psynch config_proc_uuid_policy config_coredump pgo config_32bit_telemetry config_personas ]
 #  BSD_RELEASE =    [ BSD_BASE ]
 #  BSD_DEV =        [ BSD_BASE config_vnguard ]
 #  BSD_DEBUG =      [ BSD_BASE config_vnguard ]
-#  FILESYS_BASE =   [ devfs fdesc config_dev_kmem config_fse quota namedstreams config_mnt_rootsnap config_keypage_wp config_protect fifo config_volfs fs_compression config_imgsrc_access config_triggers config_ext_resolver config_searchfs config_appledouble nullfs config_mnt_suid ]
+#  FILESYS_BASE =   [ devfs fdesc config_dev_kmem config_fse quota namedstreams config_mnt_rootsnap config_rosv_startup config_mount_vm config_keypage_wp config_protect fifo config_volfs fs_compression config_imgsrc_access config_triggers config_ext_resolver config_searchfs config_appledouble nullfs config_mnt_suid config_firmlinks config_dataless_files ]
 #  FILESYS_RELEASE= [ FILESYS_BASE ]
 #  FILESYS_DEV =    [ FILESYS_BASE ]
 #  FILESYS_DEBUG =  [ FILESYS_BASE ]
-#  NFS =            [ nfsclient nfsserver ]
-#  NETWORKING =     [ inet inet6 ipv6send tcpdrop_synfin bpfilter dummynet traffic_mgt sendfile ah_all_crypto bond vlan gif stf ifnet_input_chk config_mbuf_jumbo if_bridge ipcomp_zlib MULTIPATH if_fake ]
+#  NFS =            [ nfsclient nfsserver config_nfs4 config_nfs_gss ]
+#  NETWORKING =     [ inet inet6 ipv6send tcpdrop_synfin bpfilter dummynet traffic_mgt sendfile ah_all_crypto bond vlan gif stf ifnet_input_chk config_mbuf_jumbo if_bridge MULTIPATH if_fake sixlowpan ]
 #  NETWORKING_RELEASE = [ NETWORKING ]
-#  NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler ]
+#  NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler if_headless ]
 #  NETWORKING_DEBUG = [ NETWORKING_DEV ]
 #  VPN =            [ ipsec flow_divert necp content_filter ]
 #  PF =             [ pf pflog ]
 #  IOKIT_RELEASE =  [ IOKIT_BASE ]
 #  IOKIT_DEV =      [ IOKIT_BASE iotracking ]
 #  IOKIT_DEBUG =    [ IOKIT_BASE iotracking ]
-#  LIBKERN_BASE =   [ libkerncpp config_blocks config_kxld config_kec_fips zlib crypto_sha2 config_img4 ]
-#  LIBKERN_RELEASE =[ LIBKERN_BASE ]
-#  LIBKERN_DEV =    [ LIBKERN_BASE iotracking ]
-#  LIBKERN_DEBUG =  [ LIBKERN_BASE iotracking ]
+#  LIBKERN_BASE =   [ libkerncpp config_blocks config_kxld config_kec_fips crypto_sha2 config_img4 ]
+#  LIBKERN_RELEASE =[ LIBKERN_BASE zlib ]
+#  LIBKERN_DEV =    [ LIBKERN_BASE zlib iotracking ]
+#  LIBKERN_DEBUG =  [ LIBKERN_BASE zlib iotracking ]
 #  PERF_DBG_BASE =  [ config_dtrace mach_kdp config_serial_kdp kdp_interactive_debugging kperf kpc zleaks config_gzalloc MONOTONIC_BASE ]
 #  PERF_DBG_RELEASE=[ PERF_DBG_BASE ]
 #  PERF_DBG_DEV    =[ PERF_DBG_BASE lock_stats ]
 #  SCHED_DEV =      [ SCHED_BASE ]
 #  SCHED_DEBUG =    [ SCHED_BASE config_sched_grrr config_sched_proto ]
 #  VM =             [ vm_pressure_events memorystatus dynamic_codesigning config_code_decryption encrypted_swap config_background_queue]
-#  SECURITY =       [ config_macf config_audit config_csr ]
+#  SECURITY =       [ config_macf config_audit config_csr config_arcade ]
 #  RELEASE =        [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE NFS SKYWALK_RELEASE NETWORKING_RELEASE PF VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM SECURITY ]
 #  DEVELOPMENT =    [ KERNEL_DEV     BSD_DEV     FILESYS_DEV     NFS SKYWALK_DEV     NETWORKING_DEV PF VPN IOKIT_DEV     LIBKERN_DEV     PERF_DBG_DEV MACH_DEV     SCHED_DEV     VM SECURITY ]
 #  DEBUG =          [ KERNEL_DEBUG   BSD_DEBUG   FILESYS_DEBUG   NFS SKYWALK_DEBUG   NETWORKING_DEBUG PF VPN IOKIT_DEBUG   LIBKERN_DEBUG   PERF_DBG_DEBUG MACH_DEBUG   SCHED_DEBUG   VM SECURITY ]
-#  KASAN =          [ DEVELOPMENT ]
+#  KASAN =          [ DEVELOPMENT config_kasan config_ubsan config_ksancov ]
 #
 ######################################################################
 #
@@ -74,6 +74,7 @@ options               PAL_I386
 options            CONFIG_MCA                      # Machine Check Architecture        # <config_mca>
 options            CONFIG_VMX                      # Virtual Machine Extensions        # <config_vmx>
 options            CONFIG_MTRR                     # Memory Type Range Registers       # <config_mtrr>
+options     CONFIG_MACF_LAZY_VNODE_LABELS   # Turn on labels, don't preallocate
 
 options     NO_NESTED_PMAP                  # <no_nested_pmap>
 
index 439e238c67fddc8d6f3bd4c8ecb0f4b7c1c75ff6..025f579733aa59d5c1ed559e8090512121189a53 100644 (file)
@@ -56,8 +56,10 @@ _thread_call_enter1_delayed
 _thread_call_enter_delayed
 _thread_call_free
 _thread_deallocate
+_thread_has_thread_name
 _thread_policy_set
 _thread_reference
+_thread_set_thread_name
 _thread_terminate
 _thread_tid
 _thread_wakeup_prim
index da46458ffa7dc3563c7438264e0a371b2f98cf42..0f5f3ab63f0d078e931f034127e48dfeed353f98 100644 (file)
@@ -57,14 +57,19 @@ endif
 $(OBJPATH)/allsymbols: $(OBJPATH)/$(KERNEL_FILE_NAME)
        $(_v)$(NM) -gj $< > $@
 
-$(SYMBOL_SET_BUILD): $(OBJPATH)/%.symbolset :  %.exports %.$(EXPORT_SOURCE_ARCH_CONFIG_LC).exports $(OBJPATH)/allsymbols $(KEXT_CREATE_SYMBOL_SET)
-       @echo "$(ColorH)SYMBOLSET$(Color0)  $(ColorF)$*$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
-       $(_v)$(KEXT_CREATE_SYMBOL_SET)                                  \
-               $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG))                    \
-               -import $(OBJPATH)/allsymbols                           \
-               -export $(SOURCE)/$*.exports                            \
-               -export $(SOURCE)/$*.$(EXPORT_SOURCE_ARCH_CONFIG_LC).exports    \
-               -output $@ $(_vstdout)
+define symbol_set_rule
+$(OBJPATH)/$(1).symbolset: MY_EXPORTS := $(filter $(1)%,$(EXPORTS_FILES))
+$(OBJPATH)/$(1).symbolset: MY_EXPORTS_ARGS := $$(foreach file,$$(MY_EXPORTS),-export $(SOURCE)/$$(file))
+$(OBJPATH)/$(1).symbolset: $$(MY_EXPORTS) $(OBJPATH)/allsymbols $(KEXT_CREATE_SYMBOL_SET)
+       $$(call makelog,$(ColorH)SYMBOLSET$(Color0)     $(ColorF)$(1)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
+       $(_v)$(KEXT_CREATE_SYMBOL_SET)                  \
+               $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG))    \
+               -import $(OBJPATH)/allsymbols           \
+               $$(MY_EXPORTS_ARGS)                     \
+               -output $$@ $(_vstdout)
+endef
+
+$(foreach symbolset,$(SYMBOL_COMPONENT_LIST),$(eval $(call symbol_set_rule,$(symbolset))))
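
For readers less familiar with this Make idiom: symbol_set_rule generates one rule per symbol set and is instantiated by the $(foreach ...,$(eval $(call ...))) line above. MY_EXPORTS and MY_EXPORTS_ARGS are target-specific variables, the doubled $$ defers expansion until $(eval) parses the generated rule text, and $(filter $(1)%,$(EXPORTS_FILES)) lets each component's .symbolset depend on every matching .exports file rather than the fixed %.exports/%.$(EXPORT_SOURCE_ARCH_CONFIG_LC).exports pair the old static pattern rule hardcoded.
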
 
 .PHONY: check_all_exports
 
@@ -85,11 +90,11 @@ check_all_exports: $(OBJPATH)/allsymbols $(KEXT_CREATE_SYMBOL_SET)
                -output /dev/null $(_vstdout)
 
 $(OBJPATH)/$(MD_SUPPORTED_KPI_FILENAME): $(EXPORTS_FILES)
-       @echo "$(ColorH)SUPPORTED_KPI$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
+       $(call makelog,$(ColorH)SUPPORTED_KPI$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)$(SRCROOT)/config/list_supported.sh $(SOURCE) $(EXPORT_SOURCE_ARCH_CONFIG_LC) $@
 
 $(OBJPATH)/$(MI_SUPPORTED_KPI_FILENAME): $(EXPORTS_FILES)
-       @echo "$(ColorH)SUPPORTED_KPI$(Color0) \"($(ColorLF)all$(Color0))\""
+       $(call makelog,$(ColorH)SUPPORTED_KPI$(Color0) "($(ColorLF)all$(Color0))")
        $(_v)$(SRCROOT)/config/list_supported.sh $(SOURCE) all $@
 
 build_symbol_sets: check_all_exports $(SYMBOL_SET_BUILD) $(OBJPATH)/allsymbols \
@@ -101,12 +106,11 @@ do_config_all::   build_symbol_sets
 # There's no simple static pattern rule for these paths, so hardcode dependencies in the command list
 $(SYMROOT_INSTALL_KEXT_MACHO_FILES): ALWAYS
        $(_v)$(MKDIR) $(dir $@)
+       $(call makelog,$(ColorH)INSTALLSYM$(Color0)    $(ColorF)symbolset $(notdir $@)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)if [ $(OBJROOT)/.symbolset.timestamp -nt $@ ]; then                \
-               echo INSTALLSYM symbolset $(notdir $@) "($(CURRENT_ARCH_CONFIG_LC))";   \
                $(INSTALL) $(EXEC_INSTALL_FLAGS) $(OBJPATH)/$(@F).symbolset $@; \
                cmdstatus=$$?;                                                  \
        else                                                                    \
-               echo INSTALLSYM symbolset $(notdir $@) "($(CURRENT_ARCH_CONFIG_LC))";   \
                $(LIPO) -create $@ $(OBJPATH)/$(@F).symbolset -output $@ 2>/dev/null || true;   \
                cmdstatus=$$?;                                                  \
        fi;                                                                     \
@@ -114,23 +118,23 @@ $(SYMROOT_INSTALL_KEXT_MACHO_FILES): ALWAYS
 
 $(SYMROOT_INSTALL_KEXT_PLISTS): $(SYMROOT)/% : $(SOURCE)/%
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALLSYM$(ColorH) $(ColorLF)kextplist$(Color0) $(ColorF)$*$(Color0)"
+       $(call makelog,$(ColorH)INSTALLSYM$(Color0)    $(ColorLF)kextplist$(Color0) $(ColorF)$*$(Color0))
        $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@
        $(_v)$(NEWVERS) $@ $(_vstdout)
 
 $(DSTROOT_INSTALL_KEXT_PLISTS): $(INSTALL_KEXT_DIR)/% : $(SYMROOT)/%
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALL$(ColorH)    $(ColorLF)kextplist$(Color0) $(ColorF)$*$(Color0)"
+       $(call makelog,$(ColorH)INSTALL$(Color0)       $(ColorLF)kextplist$(Color0) $(ColorF)$*$(Color0))
        $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@
 
 $(DSTROOT_INSTALL_KEXT_MACHO_FILES): $(INSTALL_KEXT_DIR)/% : $(SYMROOT)/% ALWAYS
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorF)INSTALL$(Color0)    $(ColorF)$(notdir $@)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
+       $(call makelog,$(ColorF)INSTALL$(Color0)       $(ColorF)$(notdir $@)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@
 
 $(DSTROOT)/$(KRESDIR)/$(MD_SUPPORTED_KPI_FILENAME) $(DSTROOT)/$(KRESDIR)/$(MI_SUPPORTED_KPI_FILENAME): $(DSTROOT)/$(KRESDIR)/% : $(OBJPATH)/%
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALL$(Color0)    $(ColorF)$*$(Color0)"
+       $(call makelog,$(ColorH)INSTALL$(Color0)       $(ColorF)$*$(Color0))
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
 
 ifneq ($(INSTALL_KASAN_ONLY),1)
index c9bd63daff66784c2aa43c2a48635dcbf1d60d59..f72373b1c86395efd49fa741843d6f524953e110 100644 (file)
@@ -1,4 +1,4 @@
-18.7.0
+19.0.0
 
 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
index 683c7e68f16c4b7936e8b1ec64eba81deab4bfdd..8091f7218faaf938c6e0ca2428cc82559f3cc781 100644 (file)
@@ -5,13 +5,18 @@ _IOCPURunPlatformQuiesceActions
 _PE_get_default
 _PE_reboot_on_panic
 _PE_mark_hwaccess
+_mach_vm_map:_mach_vm_map_external
+_mach_vm_remap:_mach_vm_remap_external
 _ml_arm_sleep
 _ml_get_abstime_offset
 _ml_get_conttime_offset
 _ml_get_wake_timebase
+_ml_set_reset_time
 _proc_getcdhash
 _cpu_broadcast_xcall
 _cpu_xcall
+_cpu_broadcast_immediate_xcall
+_cpu_immediate_xcall
 _cpu_number
 _enable_kernel_vfp_context
 _get_preemption_level
index 4b43941bc64d1ae37006c36fdf6e0b9880e5c23d..e3bc84a7d9a0da7244a394bb0a4e0522eff1897a 100644 (file)
@@ -14,15 +14,20 @@ __ZN17IONVRAMController*
 __ZTV17IONVRAMController
 _cpu_broadcast_xcall
 _cpu_xcall
+_cpu_broadcast_immediate_xcall
+_cpu_immediate_xcall
 _cpu_cluster_id
 _cpu_number
 _cpu_qos_update_register
 _ecc_log_record_event
 _get_preemption_level
+_mach_vm_map:_mach_vm_map_external
+_mach_vm_remap:_mach_vm_remap_external
 _ml_arm_sleep
 _ml_get_abstime_offset
 _ml_get_conttime_offset
 _ml_get_wake_timebase
+_ml_set_reset_time
 _ml_thread_is64bit
 _pe_shmcon_set_child
 _proc_getcdhash
@@ -42,5 +47,6 @@ _pgtrace_add_probe
 _pgtrace_clear_probe
 _mach_bridge_recv_timestamps
 _mach_bridge_init_timestamp
+_mach_bridge_set_params
 _PE_panic_debugging_enabled
 _register_additional_panic_data_buffer
index 9cf4a78f317206069b6b8471a6d679832d39eb0c..5447e64e68faeb919845248bab18fea1d8729d9d 100644 (file)
@@ -186,6 +186,7 @@ _ifnet_link_status_report
 _ifnet_notice_master_elected
 _ifnet_notice_node_absence
 _ifnet_notice_node_presence
+_ifnet_notice_node_presence_v2
 _ifnet_poll_params
 _ifnet_purge
 _ifnet_report_issues
@@ -206,6 +207,8 @@ _ifnet_get_unsent_bytes
 _ifnet_get_buffer_status
 _ifnet_normalise_unsent_data
 _ifnet_set_low_power_mode
+_ifnet_notify_tcp_keepalive_offload_timeout
+_ifnet_interface_advisory_report
 _in6_localaddr
 _in6addr_local
 _in_localaddr
@@ -239,14 +242,11 @@ _kern_allocation_name_allocate
 _kern_allocation_name_release
 _thread_set_allocation_name
 _kern_asl_msg
-_kern_asl_msg_va
 _kern_coredump_log
 _kern_register_coredump_helper
 _kern_config_is_development
 _kern_stack_snapshot_with_reason
 _kernel_debug_string
-_kevent_id_internal
-_kevent_qos_internal
 _kmem_alloc_kobject:_kmem_alloc_kobject_external
 _kmem_alloc_pageable:_kmem_alloc_pageable_external
 _kx_qsort
@@ -267,9 +267,7 @@ _m_trailingspace:_mbuf_trailingspace
 _mach_vm_allocate:_mach_vm_allocate_external
 _mach_vm_behavior_set
 _mach_vm_deallocate
-_mach_vm_map:_mach_vm_map_external
 _mach_vm_protect
-_mach_vm_remap:_mach_vm_remap_external
 _mbuf_add_drvaux
 _mbuf_del_drvaux
 _mbuf_find_drvaux
@@ -298,6 +296,8 @@ _mbuf_get_flowid
 _mbuf_set_flowid
 _mbuf_pkt_new_flow
 _mbuf_last_pkt
+_mbuf_get_keepalive_flag
+_mbuf_set_keepalive_flag
 _mcl_to_paddr
 _ml_io_read
 _ml_io_read16
@@ -323,19 +323,24 @@ _net_add_domain:_net_add_domain_old
 _net_add_proto:_net_add_proto_old
 _net_del_domain:_net_del_domain_old
 _net_del_proto:_net_del_proto_old
+_net_domain_contains_hostname
 _netboot_root
 _os_reason_create
 _os_reason_alloc_buffer_noblock
 _os_reason_get_kcdata_descriptor
 _os_reason_ref
 _os_reason_free
+_os_reason_set_flags
+_os_reason_set_description_data
 _panic_with_options
 _persona_find
+_persona_find_by_type
 _persona_get
 _persona_get_id
 _persona_get_type
 _persona_get_cred
 _persona_lookup
+_persona_proc_get
 _current_persona_get
 _persona_put
 _pffinddomain:_pffinddomain_old
@@ -343,13 +348,19 @@ _pffindproto:_pffindproto_old
 _port_name_to_task
 _port_name_to_thread
 _post_sys_powersource
+_proc_get_syscall_filter_mask_size
+_proc_getexecutableoffset
 _proc_getexecutablevnode
+_proc_selfexecutableargs
 _proc_issetugid
 _proc_pidbackgrounded
 _proc_pidversion
 _proc_set_responsible_pid
+_proc_set_syscall_filter_mask
 _proc_task
 _proc_uniqueid
+_proc_puniqueid
+_proc_exitstatus
 _priv_check_cred
 _pru_abort_notsupp
 _pru_accept_notsupp
@@ -366,6 +377,7 @@ _pru_sense_null
 _pru_shutdown_notsupp
 _pru_sockaddr_notsupp
 _pru_sopoll_notsupp
+_psignal_sigkill_with_reason
 _pthread_kext_register
 _q_to_b
 _register_and_init_prng
@@ -409,6 +421,7 @@ _sorwakeup
 _sosend
 _strnstr
 _sysdiagnose_notify_user
+_task_is_driver
 _termioschars
 _thread_call_allocate_with_priority
 _thread_call_allocate_with_qos
@@ -431,6 +444,7 @@ _throttle_info_reset_window
 _throttle_info_update
 _throttle_info_update_by_mask
 _throttle_lowpri_io
+_throttle_lowpri_io_will_be_throttled
 _throttle_lowpri_window
 _throttle_set_thread_io_policy
 _throttle_get_thread_effective_io_policy
@@ -464,6 +478,7 @@ _utun_pkt_dtls_input
 _vfs_context_bind
 _vfs_context_get_special_port
 _vfs_context_set_special_port
+_vfs_context_is_dataless_manipulator
 _vfs_devvp
 _vfs_getattr
 _vfs_getbyid
@@ -487,27 +502,33 @@ _vm_map_round_page_mask
 _vm_map_trunc_page_mask
 _vm_map_wire_and_extract:_vm_map_wire_and_extract_external
 _vm_page_wire_count
+_vn_getpath_ext
 _vn_getpath_fsenter
 _vn_getpath_fsenter_with_parent
+_vn_getpath_no_firmlink
+_vnode_getfirmlink
 _vn_searchfs_inappropriate_name
 _vnode_create_empty
 _vnode_initialize
 _vnode_isdyldsharedcache
 _vnode_ismonitored
 _vnode_istty
+_vnode_lookupat
 _vnode_lookup_continue_needed
 _vnode_clearnoflush
 _vnode_isnoflush
 _vnode_getbackingvnode
 _vnode_setasnamedstream
+_vnode_setasfirmlink
 _vnop_compound_mkdir_desc
 _vnop_compound_open_desc
 _vnop_compound_remove_desc
 _vnop_compound_rename_desc
 _vnop_compound_rmdir_desc
 _vnop_monitor_desc
+_write_random
 
-# HFS Kext Requirements
+# HFS/APFS Kext Requirements
 _IOBSDMountChange
 _OSKextUnloadKextWithLoadTag
 _bdwrite_internal
@@ -515,10 +536,15 @@ _buf_markstatic
 _count_lock_queue
 _decmpfs_cnode_destroy
 _decmpfs_cnode_get_vnode_cached_size
+_decmpfs_cnode_get_vnode_cached_nchildren
+_decmpfs_cnode_get_vnode_cached_total_size
 _decmpfs_cnode_get_vnode_state
 _decmpfs_cnode_init
 _decmpfs_cnode_alloc
 _decmpfs_cnode_free
+_decmpfs_cnode_set_vnode_cached_size
+_decmpfs_cnode_set_vnode_cached_nchildren
+_decmpfs_cnode_set_vnode_cached_total_size
 _decmpfs_cnode_set_vnode_state
 _decmpfs_cnode_cmp_type
 _decmpfs_ctx
@@ -533,6 +559,7 @@ _decmpfs_read_compressed
 _decmpfs_unlock_compressed_data
 _decmpfs_update_attributes
 _decmpfs_validate_compressed_file
+_fg_get_vnode
 _fp_getfvp
 _kauth_cred_issuser
 _kdebug_lookup_gen_events
@@ -578,6 +605,7 @@ _vnode_should_flush_after_write
 _vfs_setowner
 _vfs_idle_time
 _mount_set_noreaddirext
+_vfs_get_statfs64
 _cluster_max_io_size
 _vfs_context_cwd
 _resolve_nspace_item
@@ -592,21 +620,6 @@ _proc_is_forcing_hfs_case_sensitivity
 _is_package_name
 _sysctl__hw_features_children
 _task_update_logical_writes
-_dqfileclose
-_dqfileopen
-_dqflush
-_dqget
-_dqhashinit
-_dqisinitialized
-_dqlock
-_dqrele
-_dqsync
-_dqsync_orphans
-_dqunlock
-_qf_get
-_qf_put
-_dqfileinit
-_dqreclaim
 _zalloc
 _zalloc_noblock
 _zdestroy
@@ -617,7 +630,11 @@ _fs_buffer_cache_gc_register
 _fs_buffer_cache_gc_unregister
 _cp_key_store_action_for_volume
 _mach_bridge_remote_time
-
+_lck_mtx_sleep_with_inheritor
+_lck_rw_sleep_with_inheritor
+_wakeup_one_with_inheritor
+_wakeup_all_with_inheritor
+_change_sleep_inheritor
 _Block_size
 __Block_extended_layout
 __Block_has_signature
@@ -629,3 +646,4 @@ __Block_signature
 __Block_tryRetain
 __Block_use_RR2
 __Block_use_stret
+_IOPMRootDomainGetWillShutdown
index 92da71aa12502a4a599e54e9a3d8a6d2667ef684..a24003941b8b64ad27ad7a806376d24cc1d4da82 100644 (file)
@@ -17,6 +17,7 @@ _cpuid_leaf7_features
 _cpuid_info
 _csr_check
 _csr_get_active_config
+_hv_ast_pending
 _hv_ept_pmap_create
 _hv_get*
 _hv_release*
@@ -55,6 +56,21 @@ _xts_encrypt
 _xts_start
 _aes_decrypt
 _PE_reboot_on_panic
+_dqfileclose
+_dqfileopen
+_dqflush
+_dqget
+_dqhashinit
+_dqisinitialized
+_dqlock
+_dqrele
+_dqsync
+_dqsync_orphans
+_dqunlock
+_qf_get
+_qf_put
+_dqfileinit
+_dqreclaim
 
 # HFS Kext Requirements
 _file_vnode
@@ -77,3 +93,7 @@ _csproc_mark_invalid_allowed
 _csproc_check_invalid_allowed
 _csproc_hardened_runtime
 _csproc_forced_lv
+
+#exports for vmware/, virtualbox, ...
+_mach_vm_map
+_mach_vm_remap
index 8853251cad2d9387a154221ba65a998349fcdc84..07f1387d6ea5aa9966f1c5772369429795055cdd 100644 (file)
@@ -50,7 +50,6 @@ __ZTV9IODTNVRAM
 __ZTVN15IOWatchDogTimer9MetaClassE
 __doprnt
 __doprnt_log
-__dtrace_register_anon_DOF
 _aes_decrypt_cbc
 _aes_decrypt_key
 _aes_decrypt_key128
@@ -165,6 +164,7 @@ _sock_accept_internal
 _sock_socket_internal
 _stack_privilege
 _task_get_special_port
+_task_is_app_suspended
 _task_resume
 _task_resume2
 _task_suspend
diff --git a/config/Unused.arm.exports b/config/Unused.arm.exports
new file mode 100644 (file)
index 0000000..58abc12
--- /dev/null
@@ -0,0 +1,4 @@
+# Symbols that are unused as KPI, but must be globally exported
+_arm64_root_pgtable_level
+_arm64_root_pgtable_num_ttes
+_arm_hardware_page_size
diff --git a/config/Unused.arm64.exports b/config/Unused.arm64.exports
new file mode 100644 (file)
index 0000000..58abc12
--- /dev/null
@@ -0,0 +1,4 @@
+# Symbols that are unused as KPI, but must be globally exported
+_arm64_root_pgtable_level
+_arm64_root_pgtable_num_ttes
+_arm_hardware_page_size
index 976fb68debc262baff9d037ae06e7baeb98368ef..c877ff291101b445c6ed18948355c02bf9bb8936 100644 (file)
@@ -1,7 +1,4 @@
 # Symbols that are unused as KPI, but must be globally exported
-_arm64_root_pgtable_level
-_arm64_root_pgtable_num_ttes
-_arm_hardware_page_size
 _atm_mana*
 _bank_mana*
 _dtrace_zero*
index 4af69e9b087bbf6ac7c141c04bd615f74d0473f6..3308705e4e9bed7a646402e67ef100465f7de1fb 100755 (executable)
@@ -10,6 +10,9 @@ fi
 OUTPUT="$1"
 shift
 
-( grep -h -v ":" "$@"; grep -h ":" "$@" | awk -F: '{print $2}' ) | sort -u > "$OUTPUT"
+# Note: we used to export both sides of the alias since forever
+# for now keep doing this
+
+( grep -h -v ":" "$@"; grep -h ":" "$@" | awk -F: '{print $1; print $2}' ) | sort -u > "$OUTPUT"
 
 exit 0
diff --git a/doc/atomics.md b/doc/atomics.md
new file mode 100644 (file)
index 0000000..eda4cc2
--- /dev/null
@@ -0,0 +1,423 @@
+XNU use of Atomics and Memory Barriers
+======================================
+
+Goal
+----
+
+This document discusses the use of atomics and memory barriers in XNU. It is
+meant as a guide to best practices, and warns against a variety of possible
+pitfalls in the handling of atomics in C.
+
+It is assumed that the reader has a decent understanding of
+the [C11 memory model](https://en.cppreference.com/w/c/atomic/memory_order)
+as this document builds on it, and explains the liberties XNU takes with said
+model.
+
+All the interfaces discussed in this document are available through
+the `<machine/atomic.h>` header.
+
+Note: Linux has thorough documentation around memory barriers
+(Documentation/memory-barriers.txt); some of it is Linux specific,
+but most is not, and it is a valuable read.
+
+
+Vocabulary
+----------
+
+In the rest of this document we'll refer to the various memory orderings defined
+by C11 as relaxed, consume, acquire, release, acq\_rel and seq\_cst.
+
+`os_atomic` also tries to make the distinction between compiler **barriers**
+(which limit how much the compiler can reorder code), and memory **fences**.
+
+
+The dangers and pitfalls of C11's `<stdatomic.h>`
+-------------------------------------------------
+
+While the C11 memory model has likely been one of the most important additions
+to modern C, in the purest C tradition, it is a sharp tool.
+
+By default, C11 comes with two variants of each atomic "operation":
+
+- an *explicit* variant where memory orderings can be specified,
+- a regular variant which is equivalent to the former with the *seq_cst*
+  memory ordering.
+
+When an `_Atomic` qualified variable is accessed directly without using
+any `atomic_*_explicit()` operation, then the compiler will generate the
+matching *seq_cst* atomic operations on your behalf.
+
+The sequentially consistent world is extremely safe from a lot of compiler
+and hardware reorderings and optimizations, which is great, but comes with
+a huge cost in terms of memory barriers. It is also completely wasted when
+building for a non SMP configuration.
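+
+For instance, this innocuous-looking increment of a directly accessed
+`_Atomic` variable (an illustrative sketch, `refcnt` being a made-up name)
+silently carries the full sequentially consistent cost:
+
+```c
+    _Atomic int refcnt;
+
+    void
+    retain(void)
+    {
+        refcnt++; // equivalent to atomic_fetch_add(&refcnt, 1): a seq_cst RMW
+    }
+```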
+
+
+It seems very tempting to use `atomic_*_explicit()` functions with explicit
+memory orderings, however, the compiler is entitled to perform a number of
+optimizations with relaxed atomics, that most developers will not expect.
+Indeed, the compiler is perfectly allowed to perform various optimizations it
+does with other plain memory accesses, such as coalescing, reordering, hoisting
+out of loops, ...
+
+For example, when the compiler can know what `doit` is doing (which due to LTO
+is almost always the case for XNU), it is allowed to transform this code:
+
+```c
+    void
+    perform_with_progress(int steps, long _Atomic *progress)
+    {
+        for (int i = 0; i < steps; i++) {
+            doit(i);
+            atomic_store_explicit(progress, i, memory_order_relaxed);
+        }
+    }
+```
+
+Into this, which obviously defeats the entire purpose of `progress`:
+
+```c
+    void
+    perform_with_progress(int steps, long _Atomic *progress)
+    {
+        for (int i = 0; i < steps; i++) {
+            doit(i);
+        }
+        atomic_store_explicit(progress, steps, memory_order_relaxed);
+    }
+```
+
+
+How `os_atomic_*` tries to address `<stdatomic.h>` pitfalls
+-----------------------------------------------------------
+
+1. the memory locations passed to the various `os_atomic_*`
+   functions do not need to be marked `_Atomic` or `volatile`
+   (or `_Atomic volatile`), which allows for the use of atomic
+   operations in code written before C11 was even a thing.
+
+   It is however recommended that new code use the `_Atomic`
+   specifier.
+
+2. `os_atomic_*` cannot be coalesced by the compiler:
+   all accesses are performed on the specified locations
+   as if their type was `_Atomic volatile` qualified.
+
+3. `os_atomic_*` only comes with the explicit variants:
+   orderings must be provided and can express either memory orders
+   where the name is the same as in C11 without the `memory_order_` prefix,
+   or a compiler barrier ordering `compiler_acquire`, `compiler_release`,
+   `compiler_acq_rel`.
+
+4. `os_atomic_*` elides barriers for non SMP configurations
+   by default, however, it emits the proper compiler barriers
+   that correspond to the requested memory ordering (using
+   `atomic_signal_fence()`), even on UP configurations, so that
+   the compiler cannot possibly reorder code on UP systems.
+
+
+Best practices for the use of atomics in XNU
+--------------------------------------------
+
+For most generic code, the `os_atomic_*` functions from
+`<machine/atomic.h>` are the preferred interfaces.
+
+`__sync_*`, `__c11_*` and `__atomic_*` compiler builtins should not be used.
+
+`<stdatomic.h>` functions may be used if:
+
+- compiler coalescing / reordering is desired (refcounting
+  implementations may desire this for example).
+
+- defaulting to relaxed atomics for non SMP platforms doesn't make sense
+  (such as device access which may require memory fences even on UP systems).
+
+
+Qualifying atomic variables with `_Atomic` or even
+`_Atomic volatile` is encouraged, however authors must
+be aware that a direct access to this variable will
+result in quite heavy memory barriers.
+
+The *consume* memory ordering should not be used
+(See *dependency* memory order later in this documentation).
+
+**Note**: `<libkern/OSAtomic.h>` provides a bunch of legacy
+atomic interfaces, but this header is considered obsolete
+and these functions should not be used in new code.
+
+
+High level overview of `os_atomic_*` interfaces
+-----------------------------------------------
+
+### Compiler barriers and memory fences
+
+`os_compiler_barrier(mem_order?)` provides a compiler barrier,
+with an optional barrier ordering. It is implemented with C11's
+`atomic_signal_fence()`. The barrier ordering argument is optional
+and defaults to the `acq_rel` compiler barrier (which prevents the
+compiler from reordering code in any direction around this barrier).
+
+`os_atomic_thread_fence(mem_order)` provides a memory barrier
+according to the semantics of `atomic_thread_fence()`. It always
+implies the equivalent `os_compiler_barrier()` even on UP systems.
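+
+As a minimal sketch (with made-up `data` and `ready` variables), the classic
+message-passing pattern built on a standalone release fence looks like this:
+
+```c
+    static int data;
+    static int _Atomic ready;
+
+    void
+    produce(void)
+    {
+        data = 42;
+        os_atomic_thread_fence(release);     // orders the store to `data`...
+        os_atomic_store(&ready, 1, relaxed); // ...before the store to `ready`
+    }
+```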
+
+### Init, load and store
+
+`os_atomic_init`, `os_atomic_load` and `os_atomic_store` provide
+facilities equivalent to `atomic_init`, `atomic_load_explicit`
+and `atomic_store_explicit` respectively.
+
+Note that `os_atomic_load` and `os_atomic_store` promise that they will
+compile to a plain load or store. `os_atomic_load_wide` and
+`os_atomic_store_wide` can be used to get atomic loads and stores
+that involve more costly codegen (such as compare exchange loops).
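+
+A minimal sketch of these three interfaces (with a made-up `generation`
+variable):
+
+```c
+    static long _Atomic generation;
+
+    void
+    example(void)
+    {
+        os_atomic_init(&generation, 0);                  // initialization, not itself atomic
+        long gen = os_atomic_load(&generation, acquire); // a plain load, acquire semantics
+        os_atomic_store(&generation, gen + 1, release);  // a plain store, release semantics
+    }
+```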
+
+### Basic RMW (read/modify/write) atomic operations
+
+The following basic atomic RMW operations exist:
+
+- `inc`: atomic increment (equivalent to an atomic add of `1`),
+- `dec`: atomic decrement (equivalent to an atomic sub of `1`),
+- `add`: atomic add,
+- `sub`: atomic sub,
+- `or`: atomic bitwise or,
+- `xor`: atomic bitwise xor,
+- `and`: atomic bitwise and,
+- `andnot`: atomic bitwise andnot (equivalent to atomic and of ~value),
+- `min`: atomic min,
+- `max`: atomic max.
+
+For any such operation, two variants exist:
+
+- `os_atomic_${op}_orig` (for example `os_atomic_add_orig`)
+  which returns the value stored at the specified location
+  *before* the atomic operation took place
+- `os_atomic_${op}` (for example `os_atomic_add`) which
+  returns the value stored at the specified location
+  *after* the atomic operation took place
+
+This convention is picked for two reasons:
+
+1. `os_atomic_add(p, value, ...)` is essentially equivalent to the C
+   in-place addition `(*p += value)`, which returns the result of the
+   operation and not the original value of `*p`.
+
+2. Most subtle atomic algorithms do actually require the original value
+   stored at the location, especially for bit manipulations:
+   `(os_atomic_or_orig(p, bit, relaxed) & bit)` will atomically perform
+   `*p |= bit` but also tell you whether `bit` was set in the original value.
+
+   Making it more explicit that the original value is used is hence
+   important for readers and worth the extra five keystrokes.
+
+Typically:
+
+```c
+    static int _Atomic i = 0;
+
+    printf("%d\n", os_atomic_inc_orig(&i)); // prints 0
+    printf("%d\n", os_atomic_inc(&i)); // prints 2
+```
+
+### Atomic swap / compare and swap
+
+`os_atomic_xchg` is a simple wrapper around `atomic_exchange_explicit`.
+
+There are two variants of `os_atomic_cmpxchg` which are wrappers around
+`atomic_compare_exchange_strong_explicit`. Both of these variants will
+return false/0 if the compare exchange failed, and true/1 if the expected
+value was found at the specified location and the new value was stored.
+
+1. `os_atomic_cmpxchg(address, expected, new_value, mem_order)` which
+   will atomically store `new_value` at `address` if the current value
+   is equal to `expected`.
+
+2. `os_atomic_cmpxchgv(address, expected, new_value, orig_value, mem_order)`
+   which has an extra `orig_value` argument which must be a pointer to a local
+   variable and will be filled with the current value at `address` whether the
+   compare exchange was successful or not. In case of success, the loaded value
+   will always be `expected`, however in case of failure it will be filled with
+   the current value, which is helpful to redrive compare exchange loops.
+
+Unlike `atomic_compare_exchange_strong_explicit`, a single ordering is
+specified, which only takes effect in case of a successful compare exchange.
+In C11 speak, `os_atomic_cmpxchg*` always specifies `memory_order_relaxed`
+for the failure case ordering, as it is what is used most of the time.
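+
+As an illustrative sketch (with a made-up `state` variable where `0` means
+idle and `1` means busy):
+
+```c
+    static int _Atomic state;
+
+    bool
+    try_acquire(void)
+    {
+        // Atomically transition idle -> busy; the acquire ordering only
+        // takes effect if the compare exchange succeeds.
+        return os_atomic_cmpxchg(&state, 0, 1, acquire);
+    }
+
+    void
+    release_state(void)
+    {
+        // os_atomic_xchg swaps unconditionally and returns the previous value.
+        int prev = os_atomic_xchg(&state, 0, release);
+        (void)prev; // a caller could check prev == 1 here
+    }
+```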
+
+There is no wrapper around `atomic_compare_exchange_weak_explicit`,
+as `os_atomic_rmw_loop` offers a much better alternative for CAS-loops.
+
+### `os_atomic_rmw_loop`
+
+This expressive and versatile construct allows for really terse and
+way more readable compare exchange loops. It also uses LL/SC constructs more
+efficiently than a compare exchange loop would allow.
+
+Instead of a typical CAS-loop in C11:
+
+```c
+    int _Atomic *address;
+    int old_value, new_value;
+    bool success = false;
+
+    old_value = atomic_load_explicit(address, memory_order_relaxed);
+    do {
+        if (!validate(old_value)) {
+            break;
+        }
+        new_value = compute_new_value(old_value);
+        success = atomic_compare_exchange_weak_explicit(address, &old_value,
+                new_value, memory_order_acquire, memory_order_relaxed);
+    } while (__improbable(!success));
+```
+
+`os_atomic_rmw_loop` allows this form:
+
+```c
+    int _Atomic *address;
+    int old_value, new_value;
+    bool success;
+
+    success = os_atomic_rmw_loop(address, old_value, new_value, acquire, {
+        if (!validate(old_value)) {
+            os_atomic_rmw_loop_give_up(break);
+        }
+        new_value = compute_new_value(old_value);
+    });
+```
+
+Unlike the C11 variant, it lets the reader know in program order that this will
+be a CAS loop, and exposes the ordering upfront, while for traditional CAS loops
+one has to jump to the end of the code to understand what it does.
+
+Any control flow that attempts to exit the scope of the loop needs to be
+wrapped with `os_atomic_rmw_loop_give_up` (so that LL/SC architectures can
+abort their opened LL/SC transaction).
+
+Because these loops are LL/SC transactions, it is undefined to perform
+any store to memory (register operations are fine) within these loops,
+as these may cause the store-conditional to always fail.
+In particular, nesting of `os_atomic_rmw_loop` is invalid.
+
+Use of `continue` within an `os_atomic_rmw_loop` is also invalid; instead, an
+`os_atomic_rmw_loop_give_up(goto again)` jumping to an `again:` label placed
+before the loop should be used, in this way:
+
+```c
+    int _Atomic *address;
+    int old_value, new_value;
+    bool success;
+
+again:
+    success = os_atomic_rmw_loop(address, old_value, new_value, acquire, {
+        if (needs_some_store_that_can_thwart_the_transaction(old_value)) {
+            os_atomic_rmw_loop_give_up({
+                // Do whatever you need to do/store to central memory
+                // that would cause the loop to always fail
+                do_my_rmw_loop_breaking_store();
+
+                // And only then redrive.
+                goto again;
+            });
+        }
+        if (!validate(old_value)) {
+            os_atomic_rmw_loop_give_up(break);
+        }
+        new_value = compute_new_value(old_value);
+    });
+```
+
+### the *dependency* memory order
+
+Because the C11 *consume* memory order is broken in various ways,
+most compilers, clang included, implement it as an alias
+for `memory_order_acquire`. However, the concept behind it is useful
+for certain algorithms.
+
+As an attempt to provide a replacement for this, `<machine/atomic.h>`
+implements an entirely new *dependency* memory ordering.
+
+The purpose of this ordering is to provide a relaxed load followed by an
+implicit compiler barrier, that can be used as a root for a chain of hardware
+dependencies that would otherwise pair with store-releases done at this address,
+very much like the *consume* memory order is intended to provide.
+
+However, unlike the *consume* memory ordering where the compiler had to follow
+the dependencies, the *dependency* memory ordering relies on explicit
+annotations of when the dependencies are expected:
+
+- loads through a pointer loaded with a *dependency* memory ordering
+  will provide a hardware dependency,
+
+- dependencies may be injected into other loads not performed through this
+  particular pointer with the `os_atomic_load_with_dependency_on` and
+  `os_atomic_inject_dependency` interfaces.
+
+Here is an example of how it is meant to be used:
+
+```c
+    struct foo {
+        long value;
+        long _Atomic flag;
+    };
+
+    void
+    publish(struct foo *p, long value)
+    {
+        p->value = value;
+        os_atomic_store(&p->flag, 1, release);
+    }
+
+
+    bool
+    broken_read(struct foo *p, long *value)
+    {
+        /*
+         * This isn't safe, as there's absolutely no hardware dependency involved.
+         * Using an acquire barrier would of course fix it but is quite expensive...
+         */
+        if (os_atomic_load(&p->flag, relaxed)) {
+            *value = p->value;
+            return true;
+        }
+        return false;
+    }
+
+    bool
+    valid_read(struct foo *p, long *value)
+    {
+        long flag = os_atomic_load(&p->flag, dependency);
+        if (flag) {
+            /*
+             * Further the chain of dependency to any loads through `p`
+             * which properly pair with the release barrier in `publish`.
+             */
+            *value = os_atomic_load_with_dependency_on(&p->value, flag);
+            return true;
+        }
+        return false;
+    }
+```
+
+There are 4 interfaces involved with hardware dependencies:
+
+1. `os_atomic_load(..., dependency)` to initiate roots of hardware dependencies,
+   that should pair with a store or rmw with release semantics or stronger
+   (release, acq\_rel or seq\_cst),
+
+2. `os_atomic_inject_dependency` can be used to inject the dependency provided
+   by a *dependency* load, or any other value that has had a dependency
+   injected,
+
+3. `os_atomic_load_with_dependency_on` to do an otherwise related relaxed load
+   that still prolongs a dependency chain,
+
+4. `os_atomic_make_dependency` to create an opaque token out of a given
+   dependency root to inject into multiple loads.
+
+
+**Note**: this technique is NOT safe when the compiler can reason about the
+pointers that you are manipulating, for example if the compiler can know that
+the pointer can only take a couple of values and ditch all these manually
+crafted dependency chains. Hopefully there will be a future C2Y standard that
+provides a similar construct as a language feature instead.
diff --git a/iokit/DriverKit/IOBufferMemoryDescriptor.iig b/iokit/DriverKit/IOBufferMemoryDescriptor.iig
new file mode 100644 (file)
index 0000000..449d66e
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#if !__IIG
+#if KERNEL
+#include <IOKit/IOBufferMemoryDescriptor.h>
+#endif
+#endif
+
+
+#ifndef _IOKIT_UIOBUFFERMEMORYDESCRIPTOR_H
+#define _IOKIT_UIOBUFFERMEMORYDESCRIPTOR_H
+
+#include <DriverKit/IOMemoryDescriptor.iig>
+
+/*!
+ * @class IOBufferMemoryDescriptor
+ *
+ * @abstract
+ * IOBufferMemoryDescriptor describes a memory buffer allocated in the callers address space.
+ *
+ * @discussion
+ * To allocate memory for I/O or sharing, use IOBufferMemoryDescriptor::Create().
+ * Methods in this class are used for memory that was supplied as a parameter.
+ * IOBufferMemoryDescriptor can be handed to any API that expects an IOMemoryDescriptor.
+ */
+
+class KERNEL IOBufferMemoryDescriptor : public IOMemoryDescriptor
+{
+public:
+
+    /*!
+     * @brief       Create an IOBufferMemoryDescriptor.
+     * @param       options Pass the flags kIOMemoryDirectionIn, kIOMemoryDirectionOut or kIOMemoryDirectionOutIn
+     *              to set the direction of the I/O.
+     * @param       capacity Maximum length of the memory buffer. The descriptor has no valid data
+     *              and zero length until set with SetLength().
+     * @param       memory Created descriptor with +1 retain count to be released by the caller.
+     * @param       alignment For small less-than-page-size buffers, control the alignment of the memory buffer.
+     *              Pass zero for no guaranteed alignment.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       static kern_return_t
+       Create(
+               uint64_t options,
+               uint64_t capacity,
+               uint64_t alignment,
+               IOBufferMemoryDescriptor ** memory);
+
+       virtual bool
+       init() override;
+
+       virtual void
+       free() override;
+
+    /*!
+     * @brief       Obtain the address and length of the memory buffer.
+     * @param       range An IOAddressSegment structure filled out with the address and length of the memory buffer.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       kern_return_t
+       GetAddressRange(IOAddressSegment * range) LOCALONLY;
+
+    /*!
+     * @brief       Set the valid length of the memory buffer.
+     * @discussion  IOBufferMemoryDescriptor have capacity allocated at Create() but no valid data until set
+     *              with this method.
+     * @param       length New valid length of the memory described.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       SetLength(uint64_t length);
+};
+
+#endif /* ! _IOKIT_UIOBUFFERMEMORYDESCRIPTOR_H */
diff --git a/iokit/DriverKit/IODataQueueDispatchSource.iig b/iokit/DriverKit/IODataQueueDispatchSource.iig
new file mode 100644 (file)
index 0000000..1b4a0df
--- /dev/null
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _IOKIT_UIODATAQUEUEDISPATCHSOURCE_H
+#define _IOKIT_UIODATAQUEUEDISPATCHSOURCE_H
+
+#include <DriverKit/IODispatchQueue.iig>
+#include <DriverKit/IOMemoryDescriptor.iig>
+
+typedef void (^IODataQueueClientEnqueueEntryBlock)(void *data, size_t dataSize);
+typedef void (^IODataQueueClientDequeueEntryBlock)(const void *data, size_t dataSize);
+
+class NATIVE KERNEL IODataQueueDispatchSource : public IODispatchSource
+{
+public:
+
+    /*!
+     * @brief       Create an IODataQueueDispatchSource for a shared memory data queue.
+     * @param       queueByteCount The size of the queue in bytes.
+     * @param       queue IODispatchQueue the source is attached to. Note that the DataAvailable
+     *              and DataServiced handlers are invoked on the queue set for the target method
+     *              of the OSAction, not this queue.
+     * @param       source Created source with +1 retain count to be released by the caller.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       static kern_return_t
+       Create(
+               uint64_t queueByteCount,
+           IODispatchQueue * queue,
+           IODataQueueDispatchSource ** source);
+
+       virtual bool
+       init() override;
+
+       virtual void
+       free() override;
+
+    /*!
+     * @brief       As a consumer, set the handler block to run when the queue becomes non-empty.
+     * @param       action OSAction instance specifying the callback method. The OSAction object will be retained
+     *              until SetHandler is called again or the event source is cancelled.
+     *              The DataAvailable handler is invoked on the queue set for the target method of the OSAction.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       SetDataAvailableHandler(
+       OSAction * action TYPE(DataAvailable));
+
+    /*!
+     * @brief       As a producer, set the handler block to run when the queue becomes non-full, after an attempt
+     *              to enqueue data failed.
+     * @param       action OSAction instance specifying the callback method. The OSAction object will be retained
+     *              until SetHandler is called again or the event source is cancelled.
+     *              The DataServiced handler is invoked on the queue set for the target method of the OSAction.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       SetDataServicedHandler(
+       OSAction * action TYPE(DataServiced));
+
+    /*!
+     * @brief       Control the enable state of the interrupt source.
+     * @param       enable Pass true to enable the source or false to disable.
+     * @param       handler Optional block to be executed after the interrupt has been disabled and any pending
+     *              interrupt handlers completed.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       SetEnableWithCompletion(
+               bool enable,
+               IODispatchSourceCancelHandler handler) override LOCAL;
+
+    /*!
+     * @brief       Cancel all callbacks from the event source.
+     * @discussion  After cancellation, the source can only be freed. It cannot be reactivated.
+     * @param       handler Handler block to be invoked after any callbacks have completed.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       Cancel(IODispatchSourceCancelHandler handler) override LOCAL;
+
+
+    /*!
+     * @brief       As a consumer, check if the data queue is non-empty.
+     * @return      True if the queue is non-empty.
+     */
+       bool
+       IsDataAvailable(void) LOCALONLY;
+
+    /*!
+     * @brief       As a consumer, get access to the next queue entry without dequeuing it.
+     * @param       callback Invoked if the queue is non-empty with the next entry to be dequeued.
+     * @return      kIOReturnSuccess if the callback was invoked.
+     *              kIOReturnUnderrun if the queue was empty.
+     *              kIOReturnError if the queue was corrupt.
+     */
+       kern_return_t
+       Peek(IODataQueueClientDequeueEntryBlock callback) LOCALONLY;
+
+    /*!
+     * @brief       As a consumer, dequeue the next queue entry.
+     * @param       callback invoked if the queue was non-empty with the entry that was dequeued.
+     * @return      kIOReturnSuccess if the callback was invoked.
+     *              kIOReturnUnderrun if the queue was empty.
+     *              kIOReturnError if the queue was corrupt.
+     */
+       kern_return_t
+       Dequeue(IODataQueueClientDequeueEntryBlock callback) LOCALONLY;
+
+    /*!
+     * @brief       As a producer, enqueue a queue entry.
+     * @param       dataSize size of the data to enqueue.
+     * @param       callback invoked if the queue has enough space to enqueue the data.
+     * @return      kIOReturnSuccess if the callback was invoked.
+     *              kIOReturnOverrun if the queue was full.
+     *              kIOReturnError if the queue was corrupt.
+     */
+       kern_return_t
+       Enqueue(uint32_t dataSize, IODataQueueClientEnqueueEntryBlock callback) LOCALONLY;
+
+    /*!
+     * @brief       As a consumer, dequeue the next queue entry, but don't send any DataServiced notification.
+     * @param       sendDataServiced Flag that indicates a DataServiced notification would have been sent.
+     *              It should be initialized to false before a series of calls to this method;
+     *              if it is true after those calls, the notification should be sent with SendDataServiced().
+     * @param       callback invoked if the queue was non-empty with the entry that was dequeued.
+     * @return      kIOReturnSuccess if the callback was invoked.
+     *              kIOReturnUnderrun if the queue was empty.
+     *              kIOReturnError if the queue was corrupt.
+     */
+       kern_return_t
+       DequeueWithCoalesce(bool * sendDataServiced, IODataQueueClientDequeueEntryBlock callback) LOCALONLY;
+
+    /*!
+     * @brief       As a producer, enqueue a queue entry, but don't send any DataAvailable notification.
+     * @param       dataSize size of the data to enqueue
+     * @param       sendDataAvailable Flag that indicates a DataAvailable notification would have been sent.
+     *              It should be initialized to false before a series of calls to this method;
+     *              if it is true after those calls, the notification should be sent with SendDataAvailable().
+     * @param       callback invoked if the queue has enough space to enqueue the data.
+     * @return      kIOReturnSuccess if the callback was invoked.
+     *              kIOReturnOverrun if the queue was full.
+     *              kIOReturnError if the queue was corrupt.
+     */
+       kern_return_t
+       EnqueueWithCoalesce(uint32_t dataSize,  bool * sendDataAvailable, IODataQueueClientEnqueueEntryBlock callback) LOCALONLY;
+
+    /*!
+     * @brief       As a consumer, send the DataServiced notification indicated by DequeueWithCoalesce.
+        */
+       void
+       SendDataServiced(void) LOCALONLY;
+
+    /*!
+     * @brief       As a producer, send the DataAvailable notification indicated by EnqueueWithCoalesce.
+        */
+       void
+       SendDataAvailable(void) LOCALONLY;
+
+private:
+       virtual kern_return_t
+       CopyMemory(
+       IOMemoryDescriptor ** memory);
+
+       virtual kern_return_t
+       CopyDataAvailableHandler(
+       OSAction ** action);
+
+       virtual kern_return_t
+       CopyDataServicedHandler(
+       OSAction ** action);
+
+       virtual kern_return_t
+       CheckForWork(bool synchronous) override LOCAL;
+
+       virtual void
+       DataAvailable(
+               OSAction * action TARGET) LOCAL = 0;
+
+       virtual void
+       DataServiced(
+               OSAction * action TARGET) LOCAL = 0;
+};
+
+#endif /* ! _IOKIT_UIODATAQUEUEDISPATCHSOURCE_H */
diff --git a/iokit/DriverKit/IODispatchQueue.iig b/iokit/DriverKit/IODispatchQueue.iig
new file mode 100644 (file)
index 0000000..b9b501f
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _IOKIT_UIODISPATCHQUEUE_H
+#define _IOKIT_UIODISPATCHQUEUE_H
+
+#include <DriverKit/OSObject.iig>
+#include <DriverKit/OSAction.iig>
+#include <DriverKit/IODispatchSource.iig>
+
+typedef int (*IODispatchLogFunction)(const char *format, ...);
+typedef void (^IODispatchBlock)(void);
+typedef void (*IODispatchFunction)(void * context);
+typedef void (^IODispatchQueueCancelHandler)(void);
+
+
+/*!
+ * @class IODispatchQueue
+ *
+ * @abstract
+ * IODispatchQueue provides a queue for ordered execution of blocks.
+ *
+ * @discussion
+ * All blocks submitted to dispatch queues are dequeued in FIFO order.
+ * By default the queue is serial and will execute one block at a time.
+ */
+
+class NATIVE KERNEL IODispatchQueue : public OSObject
+{
+public:
+    /*!
+     * @brief       Creates a new dispatch queue object.
+     * @discussion  Creates a new dispatch queue object. All queues are currently serial, executing one block at a
+     *              time in FIFO order. The new object has retain count 1 and should be released by the caller.
+     * @param       options No options are currently defined, pass zero.
+     * @param       priority No priorities are currently defined, pass zero.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       static kern_return_t
+       Create(
+               const IODispatchQueueName name,
+               uint64_t                  options,
+               uint64_t                  priority,
+               IODispatchQueue        ** queue) LOCAL;
+
+       virtual bool
+       init() override;
+
+       virtual void
+       free() override;
+
+    /*!
+     * @brief       Determines if the current thread is running on the queue.
+     * @discussion  Determines if the current thread is running on the queue, including if the queue invoked a
+     *              second queue (i.e. OnQueue can return true for more than one queue in a given context).
+     * @return      bool true if current thread is running on this queue.
+     */
+       bool
+       OnQueue() LOCALONLY;
+
+    /*!
+     * @brief       Return the name the queue was created with.
+     * @discussion  Returns a pointer to the queue's name. Only valid while the queue is retained.
+     * @return      C-string pointer in the queue's internal storage.
+     */
+       const char *
+       GetName() LOCALONLY;
+
+    /*!
+     * @brief       Stop the queue from executing further work.
+     * @discussion  Stops the queue from dequeuing work, and on completion of any block currently being executed,
+     *              invokes a callback block. Canceling is asynchronous.
+     * @param       handler Block that will be executed when the queue has completed any inflight work
+     *              and will not execute further work.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       kern_return_t
+       Cancel(IODispatchQueueCancelHandler handler) LOCALONLY;
+
+    /*!
+     * @brief       Schedule a block to be executed on the queue asynchronously.
+     * @discussion  Schedules work to be done on the queue without waiting for it to complete. The queue will be
+     *              retained until the block completes.
+     * @param       block Block that will be executed on the queue, not in the context of the caller.
+     */
+       void
+       DispatchAsync(IODispatchBlock block) LOCALONLY;
+
+    /*!
+     * @brief       C-function callback version of DispatchAsync.
+        */
+       void
+       DispatchAsync_f(void * context, IODispatchFunction function) LOCALONLY;
+
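+    /*!
+     * @brief       Schedule a block to be executed on the queue and wait for it to complete.
+     * @param       block Block that will be executed on the queue, not in the context of the caller.
+        */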
+       void
+       DispatchSync(IODispatchBlock block) LOCALONLY;
+
+    /*!
+     * @brief       C-function callback version of DispatchSync.
+        */
+       void
+       DispatchSync_f(void * context, IODispatchFunction function) LOCALONLY;
+
+    /*!
+     * @brief       Log the current execution context with respect to any queues the current thread holds.
+     * @param       output A printf-like output function. The address of IOLog is suitable to be used.
+        */
+       static void
+       Log(const char * message, IODispatchLogFunction output) LOCALONLY;
+};
+
+#if DRIVERKIT_PRIVATE
+class EXTENDS (IODispatchQueue) IODispatchQueuePrivate
+{
+       virtual kern_return_t
+       SetPort(
+               mach_port_t port PORTMAKESEND);
+};
+#endif
+
+#endif /* ! _IOKIT_UIODISPATCH_H */
diff --git a/iokit/DriverKit/IODispatchSource.iig b/iokit/DriverKit/IODispatchSource.iig
new file mode 100644 (file)
index 0000000..46cea74
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _IOKIT_UIODISPATCHSOURCE_H
+#define _IOKIT_UIODISPATCHSOURCE_H
+
+#include <DriverKit/OSObject.iig>
+
+
+typedef void (^IODispatchSourceCancelHandler)(void);
+
+/*!
+ * @class IODispatchSource
+
+ * @abstract
+ * IODispatchSource common base class for dispatch sources.
+ */
+
+class NATIVE KERNEL IODispatchSource : public OSObject
+{
+public:
+
+       virtual bool
+       init() override;
+
+       virtual void
+       free() override;
+
+       virtual kern_return_t
+       Cancel(IODispatchSourceCancelHandler handler) = 0;
+
+       virtual kern_return_t
+       SetEnableWithCompletion(
+               bool enable,
+               IODispatchSourceCancelHandler handler) = 0;
+
+       virtual kern_return_t
+       CheckForWork(bool synchronous) INVOKEREPLY = 0;
+
+       virtual kern_return_t
+       SetEnable(bool enable) LOCAL;
+};
+
+#endif /* ! _IOKIT_UIODISPATCHSOURCE_H */
diff --git a/iokit/DriverKit/IOInterruptDispatchSource.iig b/iokit/DriverKit/IOInterruptDispatchSource.iig
new file mode 100644 (file)
index 0000000..ecd4b5c
--- /dev/null
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _IOKIT_UIOINTERRUPTDISPATCHSOURCE_H
+#define _IOKIT_UIOINTERRUPTDISPATCHSOURCE_H
+
+#include <DriverKit/IODispatchQueue.iig>
+#include <DriverKit/IOService.iig>
+
+struct IOInterruptDispatchSourcePayload {
+       uint64_t time;
+       uint64_t count;
+};
+
+/*!
+ * @class IOInterruptDispatchSource
+ *
+ * @abstract
+ * IOInterruptDispatchSource delivers interrupts to a handler block on a dispatch queue.
+ *
+ * @discussion
+ * A driver can run code in response to an interrupt from a device, specified as an IOService
+ * and index. The code runs at normal thread level, but is notified with the mach_absolute_time
+ * the primary interrupt fired. For IOPCIDevices, only MSI interrupt sources are supported.
+ */
+class NATIVE KERNEL IOInterruptDispatchSource : public IODispatchSource
+{
+public:
+
+    /*!
+     * @brief       Create an IOInterruptDispatchSource for an interrupt by index from an IOService provider.
+     * @param       provider The IOService object representing the HW device producing the interrupt.
+     * @param       index Index for the interrupt.
+     * @param       queue Target queue to run the handler block.
+     * @param       source Created source with +1 retain count to be released by the caller.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       static kern_return_t
+       Create(IOService * provider,
+           uint32_t index,
+           IODispatchQueue * queue,
+           IOInterruptDispatchSource ** source) LOCAL;
+
+       virtual bool
+       init() override;
+
+       virtual void
+       free() override;
+
+    /*!
+     * @brief       Set the handler block to run when the interrupt fires.
+     * @param       action OSAction instance specifying the callback method. The OSAction object will be retained
+     *              until SetHandler is called again or the event source is cancelled.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+           SetHandler(
+               OSAction * action TYPE(InterruptOccurred)) LOCAL;
+
+    /*!
+     * @brief       Control the enable state of the interrupt source.
+     * @param       enable Pass true to enable the source or false to disable.
+     * @param       handler Optional block to be executed after the interrupt has been disabled and any pending
+     *              interrupt handlers completed.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       SetEnableWithCompletion(
+               bool enable,
+               IODispatchSourceCancelHandler handler) override LOCAL;
+
+    /*!
+     * @brief       Cancel all callbacks from the event source.
+     * @discussion  After cancellation, the source can only be freed. It cannot be reactivated.
+     * @param       handler Handler block to be invoked after any callbacks have completed.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       Cancel(IODispatchSourceCancelHandler handler) override LOCAL;
+
+private:
+       virtual kern_return_t
+       CheckForWork(bool synchronous) override LOCAL;
+
+       virtual void
+       InterruptOccurred(
+               OSAction * action TARGET,
+               uint64_t   count,
+               uint64_t   time) REPLY LOCAL;
+};
+
+#endif /* ! _IOKIT_UIOINTERRUPTDISPATCHSOURCE_H */
diff --git a/iokit/DriverKit/IOMemoryDescriptor.iig b/iokit/DriverKit/IOMemoryDescriptor.iig
new file mode 100644 (file)
index 0000000..760d48c
--- /dev/null
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#if !__IIG
+#if KERNEL
+#include <IOKit/IOMemoryDescriptor.h>
+#endif
+#endif
+
+#ifndef _IOKIT_UIOMEMORYDESCRIPTOR_H
+#define _IOKIT_UIOMEMORYDESCRIPTOR_H
+
+#include <DriverKit/OSObject.iig>
+
+class IOService;
+class IOMemoryMap;
+
+
+// IOMemoryDescriptor Create options
+enum {
+       kIOMemoryDirectionIn    = 0x00000001,
+       kIOMemoryDirectionOut   = 0x00000002,
+       kIOMemoryDirectionOutIn = kIOMemoryDirectionIn | kIOMemoryDirectionOut,
+       kIOMemoryDirectionInOut = kIOMemoryDirectionOutIn,
+};
+
+// IOMemoryDescriptor CreateMapping options
+enum {
+       kIOMemoryMapFixedAddress          = 0x00000001,
+       kIOMemoryMapReadOnly              = 0x00000002,
+       kIOMemoryMapCacheModeDefault      = 0x00000000,
+       kIOMemoryMapCacheModeInhibit      = 0x00000100,
+       kIOMemoryMapCacheModeCopyback     = 0x00000200,
+       kIOMemoryMapCacheModeWriteThrough = 0x00000400,
+};
+
+struct IOAddressSegment {
+       uint64_t address;
+       uint64_t length;
+};
+
+struct IOMDPrivateState {
+       uint64_t length;
+       uint64_t options;
+};
+
+/*!
+ * @class IOMemoryDescriptor
+ *
+ * @abstract
+ * IOMemoryDescriptor describes a memory buffer. 
+ *
+ * @discussion
+ * To allocate memory for I/O or sharing, use IOBufferMemoryDescriptor::Create().
+ * Methods in this class are used for memory that was supplied as a parameter.
+ *
+
+@iig implementation
+#include <DriverKit/IOService.h>
+@iig end
+*/
+
+class KERNEL IOMemoryDescriptor : public OSObject
+{
+public:
+
+
+       virtual bool
+       init() override;
+
+       virtual void
+       free() override;
+
+    /*!
+     * @brief       Obtain the length of the memory described.
+     * @param       returnLength Returned length.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       kern_return_t
+       GetLength(
+               uint64_t * returnLength) LOCALONLY;
+
+    /*!
+     * @brief       Create a mapping of the memory in the caller's address space.
+     * @param       options
+     *              kIOMemoryMapFixedAddress map at the address requested
+     *              kIOMemoryMapReadOnly create a read only mapping
+     *              kIOMemoryMapCacheModeDefault default cache mode
+     *              kIOMemoryMapCacheModeInhibit inhibited cache mode
+     *              kIOMemoryMapCacheModeCopyback copyback cache mode
+     *              kIOMemoryMapCacheModeWriteThrough write through cache mode
+     * @param       address Requested address if kIOMemoryMapFixedAddress was passed.
+     * @param       offset Start offset of the mapping in the descriptor.
+     * @param       length Pass zero to map the entire memory, or a value <= the length of the descriptor.
+     * @param       alignment Alignment of the virtual mapping. Only zero, for no required alignment, is supported.
+     * @param       map Returned IOMemoryMap object with +1 retain count.
+     *              It should be retained until the map is no longer required.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       CreateMapping(
+               uint64_t options,
+               uint64_t address,
+               uint64_t offset,
+               uint64_t length,
+               uint64_t alignment,
+               IOMemoryMap ** map);
+
+private:
+       virtual kern_return_t
+       PrepareForDMA(
+               uint64_t options,
+               IOService * device,
+               uint64_t offset,
+               uint64_t length,
+
+               uint64_t * flags,
+               uint64_t * returnLength,
+               uint32_t * segmentsCount,
+               IOAddressSegment segments[32]);
+
+       kern_return_t
+       Map(
+               uint64_t options,
+               uint64_t address,
+               uint64_t length,
+               uint64_t alignment,
+
+               uint64_t * returnAddress,
+               uint64_t * returnLength) LOCALONLY;
+};
+
+class EXTENDS (IOMemoryDescriptor) IOMemoryDescriptorPrivate
+{
+       virtual kern_return_t
+       _CopyState(
+               IOMDPrivateState * state);
+};
+
+
+
+#endif /* ! _IOKIT_UIOMEMORYDESCRIPTOR_H */
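[Editor's example] A short sketch of how a driver might consume the interface above. MapDescriptorReadOnly is hypothetical; only GetLength(), CreateMapping(), and the option constants come from this header:

    // Map a descriptor received as a parameter, read-only with default
    // caching, at no fixed address and with no alignment constraint.
    kern_return_t
    MapDescriptorReadOnly(IOMemoryDescriptor * desc, IOMemoryMap ** outMap)
    {
        uint64_t length = 0;
        kern_return_t ret = desc->GetLength(&length);
        if (ret != kIOReturnSuccess) {
            return ret;
        }
        return desc->CreateMapping(
            kIOMemoryMapReadOnly | kIOMemoryMapCacheModeDefault,
            0 /* address */, 0 /* offset */, length, 0 /* alignment */,
            outMap);
    }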
diff --git a/iokit/DriverKit/IOMemoryMap.iig b/iokit/DriverKit/IOMemoryMap.iig
new file mode 100644 (file)
index 0000000..716c87f
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#if !__IIG
+#if KERNEL
+#include <IOKit/IOMemoryDescriptor.h>
+#endif
+#endif
+
+#ifndef _IOKIT_UIOMEMORYMAP_H
+#define _IOKIT_UIOMEMORYMAP_H
+
+#include <DriverKit/OSObject.iig>
+
+struct IOMemoryMapPrivateState {
+       uint64_t length;
+       uint64_t offset;
+       uint64_t options;
+       uint64_t address;
+};
+
+/*!
+ * @class IOMemoryMap
+ *
+ * @abstract
+ * IOMemoryMap describes a memory mapping created with IOMemoryDescriptor::CreateMapping().
+ *
+ * @discussion
+ * To allocate memory for I/O or sharing, use IOBufferMemoryDescriptor::Create().
+ * Methods in this class are used for memory that was supplied as a parameter.
+ */
+
+class KERNEL IOMemoryMap : public OSObject
+{
+public:
+
+       virtual bool
+       init() override;
+
+       virtual void
+       free() override;
+       
+    /*!
+     * @brief       Obtain the address of the memory mapping.
+     * @return      Address.
+     */
+       uint64_t
+       GetAddress() LOCALONLY;
+
+    /*!
+     * @brief       Obtain the length of the memory mapping.
+     * @return      Length.
+     */
+       uint64_t
+       GetLength() LOCALONLY;
+
+    /*!
+     * @brief       Obtain the offset of the memory mapping.
+     * @return      Offset.
+     */
+       uint64_t
+       GetOffset() LOCALONLY;
+};
+
+class EXTENDS (IOMemoryMap) IOMemoryMapPrivate
+{
+       virtual kern_return_t
+       _CopyState(
+               IOMemoryMapPrivateState * state);
+};
+
+#endif /* ! _IOKIT_UIOMEMORYMAP_H */
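[Editor's example] Continuing the sketch above, the accessors expose the mapped virtual range. Everything except GetAddress() and GetLength() is illustrative:

    // Inspect the first byte of a mapping returned by CreateMapping().
    void
    InspectMapping(IOMemoryMap * map)
    {
        const uint8_t * bytes = (const uint8_t *)(uintptr_t) map->GetAddress();
        uint64_t        len   = map->GetLength();
        if (len > 0) {
            uint8_t first = bytes[0];   // e.g. a device-defined status byte
            (void) first;
        }
    }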
diff --git a/iokit/DriverKit/IORPC.h b/iokit/DriverKit/IORPC.h
new file mode 100644 (file)
index 0000000..0ae1415
--- /dev/null
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+
+#ifndef _IORPC_H
+#define _IORPC_H
+
+#include <stdint.h>
+
+#ifndef PLATFORM_DriverKit
+
+#include <mach/message.h>
+
+#else /* !PLATFORM_DriverKit */
+
+#ifndef _MACH_MESSAGE_H_
+#define _MACH_MESSAGE_H_
+
+#define MACH_MSG_TYPE_MOVE_RECEIVE      16      /* Must hold receive right */
+#define MACH_MSG_TYPE_MOVE_SEND         17      /* Must hold send right(s) */
+#define MACH_MSG_TYPE_MOVE_SEND_ONCE    18      /* Must hold sendonce right */
+#define MACH_MSG_TYPE_COPY_SEND         19      /* Must hold send right(s) */
+#define MACH_MSG_TYPE_MAKE_SEND         20      /* Must hold receive right */
+#define MACH_MSG_TYPE_MAKE_SEND_ONCE    21      /* Must hold receive right */
+#define MACH_MSG_TYPE_COPY_RECEIVE      22      /* NOT VALID */
+#define MACH_MSG_TYPE_DISPOSE_RECEIVE   24      /* must hold receive right */
+#define MACH_MSG_TYPE_DISPOSE_SEND      25      /* must hold send right(s) */
+#define MACH_MSG_TYPE_DISPOSE_SEND_ONCE 26      /* must hold sendonce right */
+
+#define MACH_MSG_TYPE_PORT_NONE         0
+
+#define MACH_MSG_PORT_DESCRIPTOR                0
+#define MACH_MSG_OOL_DESCRIPTOR                 1
+
+typedef unsigned int mach_msg_copy_options_t;
+
+#define MACH_MSG_PHYSICAL_COPY          0
+#define MACH_MSG_VIRTUAL_COPY           1
+#define MACH_MSG_ALLOCATE               2
+
+typedef uint32_t natural_t;
+typedef int32_t integer_t;
+
+typedef unsigned int mach_msg_type_name_t;
+typedef unsigned int mach_msg_descriptor_type_t;
+
+#if KERNEL
+typedef void * mach_port_t;
+#define MACH_PORT_NULL  NULL
+#else /* !KERNEL */
+typedef natural_t mach_port_t;
+#define MACH_PORT_NULL  0
+#endif /* !KERNEL */
+
+typedef natural_t mach_port_name_t;
+
+typedef unsigned int mach_msg_bits_t;
+typedef natural_t mach_msg_size_t;
+typedef integer_t mach_msg_id_t;
+
+#pragma pack(push, 4)
+
+typedef struct{
+       mach_msg_bits_t       msgh_bits;
+       mach_msg_size_t       msgh_size;
+       mach_port_t           msgh_remote_port;
+       mach_port_t           msgh_local_port;
+       mach_port_name_t      msgh_voucher_port;
+       mach_msg_id_t         msgh_id;
+} mach_msg_header_t;
+
+typedef struct{
+       mach_msg_size_t msgh_descriptor_count;
+} mach_msg_body_t;
+
+typedef struct{
+       mach_port_t                   name;
+#if !(defined(KERNEL) && defined(__LP64__))
+// Pad to 8 bytes everywhere except the K64 kernel where mach_port_t is 8 bytes
+       mach_msg_size_t               pad1;
+#endif
+       unsigned int                  pad2 : 16;
+       mach_msg_type_name_t          disposition : 8;
+       mach_msg_descriptor_type_t    type : 8;
+#if defined(KERNEL)
+       uint32_t          pad_end;
+#endif
+} mach_msg_port_descriptor_t;
+
+typedef struct{
+       void *                        address;
+#if !defined(__LP64__)
+       mach_msg_size_t               size;
+#endif
+       int                           deallocate: 8;
+       mach_msg_copy_options_t       copy: 8;
+       unsigned int                  pad1: 8;
+       mach_msg_descriptor_type_t    type: 8;
+#if defined(__LP64__)
+       mach_msg_size_t               size;
+#endif
+#if defined(KERNEL) && !defined(__LP64__)
+       uint32_t          pad_end;
+#endif
+} mach_msg_ool_descriptor_t;
+
+typedef struct{
+       unsigned int                  val[80 / sizeof(int)];
+} mach_msg_max_trailer_t;
+
+#pragma pack(pop)
+
+#endif  /* _MACH_MESSAGE_H_ */
+
+#endif /* PLATFORM_DriverKit */
+
+#if KERNEL
+class IOUserServer;
+#endif /* KERNEL */
+
+typedef uint64_t OSObjectRef;
+
+enum {
+       kIORPCVersion190615       = (mach_msg_id_t) 0x4da2b68c,
+       kIORPCVersion190615Reply  = (mach_msg_id_t) 0x4da2b68d,
+
+#if DRIVERKIT_PRIVATE
+       kIORPCVersion190501       = (mach_msg_id_t) 0xfe316a7a,
+       kIORPCVersion190501Reply  = (mach_msg_id_t) 0xfe316a7b,
+
+       kIORPCVersionCurrent      = kIORPCVersion190615,
+       kIORPCVersionCurrentReply = kIORPCVersion190615Reply
+#endif /* DRIVERKIT_PRIVATE */
+};
+
+enum{
+       kIORPCMessageRemote     = 0x00000001,
+       kIORPCMessageLocalHost  = 0x00000002,
+       kIORPCMessageKernel     = 0x00000004,
+       kIORPCMessageOneway     = 0x00000008,
+       kIORPCMessageObjectRefs = 0x00000010,
+       kIORPCMessageOnqueue    = 0x00000020,
+       kIORPCMessageError      = 0x00000040,
+       kIORPCMessageSimpleReply = 0x00000080,
+};
+
+enum{
+       kIORPCMessageIDKernel   = (1ULL << 63),
+};
+
+struct IORPCMessageMach {
+       mach_msg_header_t          msgh;
+       mach_msg_body_t            msgh_body;
+       mach_msg_port_descriptor_t objects[0];
+};
+typedef struct IORPCMessageMach IORPCMessageMach;
+
+struct IORPCMessage {
+       uint64_t         msgid;
+       uint64_t         flags;
+       uint64_t         objectRefs;
+       OSObjectRef      objects[0];
+};
+typedef struct IORPCMessage IORPCMessage;
+
+extern "C" IORPCMessage *
+IORPCMessageFromMach(IORPCMessageMach * msg, bool reply);
+
+struct IORPCMessageErrorReturnContent {
+       IORPCMessage  hdr;
+       kern_return_t result;
+       uint32_t      pad;
+};
+
+#pragma pack(4)
+struct IORPCMessageErrorReturn {
+       IORPCMessageMach mach;
+       IORPCMessageErrorReturnContent content;
+};
+#pragma pack()
+
+
+class OSMetaClassBase;
+struct IORPC;
+typedef kern_return_t (*OSDispatchMethod)(OSMetaClassBase * self, const IORPC rpc);
+
+struct IORPC {
+       IORPCMessageMach * message;
+       IORPCMessageMach * reply;
+       uint32_t           sendSize;
+       uint32_t           replySize;
+};
+typedef struct IORPC IORPC;
+
+enum {
+       kOSClassCanRemote   = 0x00000001,
+};
+
+struct OSClassDescription {
+       uint32_t    descriptionSize;
+
+       char        name[96];
+       char        superName[96];
+
+       uint32_t    methodOptionsSize;
+       uint32_t    methodOptionsOffset;
+       uint32_t    metaMethodOptionsSize;
+       uint32_t    metaMethodOptionsOffset;
+       uint32_t    queueNamesSize;
+       uint32_t    queueNamesOffset;
+       uint32_t    methodNamesSize;
+       uint32_t    methodNamesOffset;
+       uint32_t    metaMethodNamesSize;
+       uint32_t    metaMethodNamesOffset;
+
+       uint64_t    flags;
+
+       uint64_t    resv1[8];
+
+       uint64_t    methodOptions[0];
+       uint64_t    metaMethodOptions[0];
+
+       char        dispatchNames[0];
+       char        methodNames[0];
+       char        metaMethodNames[0];
+};
+
+#endif /* _IORPC_H */
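[Editor's example] The structures above fix the wire layout: a Mach header and body, then msgh_descriptor_count port descriptors, then the inline IORPCMessage carrying the object references. The following is only an illustrative reading of that layout, not the kernel's actual IORPCMessageFromMach, which must also account for OOL descriptors and validate sizes:

    // Locate the inline IORPCMessage that follows the descriptor array.
    static IORPCMessage *
    RPCContentOf(IORPCMessageMach * msg)
    {
        // objects[] is a variable-length array whose count is carried
        // in the Mach body.
        return (IORPCMessage *) &msg->objects[msg->msgh_body.msgh_descriptor_count];
    }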
diff --git a/iokit/DriverKit/IOReturn.h b/iokit/DriverKit/IOReturn.h
new file mode 100644 (file)
index 0000000..5175ee7
--- /dev/null
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 1998-2002 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * HISTORY
+ */
+
+/*
+ * Core IOReturn values. Others may be family defined.
+ */
+
+#ifndef __IOKIT_IORETURN_H
+#define __IOKIT_IORETURN_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef PLATFORM_DriverKit
+
+#include <mach/error.h>
+
+#else  /* PLATFORM_DriverKit */
+
+#ifndef _MACH_ERROR_H_
+#define _MACH_ERROR_H_
+
+typedef int             kern_return_t;
+
+#define KERN_SUCCESS                    0
+
+/*
+ *     error number layout as follows:
+ *
+ *     hi                                     lo
+ *     | system(6) | subsystem(12) | code(14) |
+ */
+
+#define err_none                (kern_return_t)0
+#define ERR_SUCCESS             (kern_return_t)0
+
+#define err_system(x)           ((signed)((((unsigned)(x))&0x3f)<<26))
+#define err_sub(x)              (((x)&0xfff)<<14)
+
+#define err_get_system(err)     (((err)>>26)&0x3f)
+#define err_get_sub(err)        (((err)>>14)&0xfff)
+#define err_get_code(err)       ((err)&0x3fff)
+
+#define err_max_system          0x3f
+
+#define system_emask            (err_system(err_max_system))
+#define sub_emask               (err_sub(0xfff))
+#define code_emask              (0x3fff)
+
+#endif  /* _MACH_ERROR_H_ */
+
+#endif /* PLATFORM_DriverKit */
+
+typedef kern_return_t           IOReturn;
+
+#ifndef sys_iokit
+#define sys_iokit                         err_system(0x38)
+#endif /* sys_iokit */
+#define sub_iokit_common                  err_sub(0)
+#define sub_iokit_usb                     err_sub(1)
+#define sub_iokit_firewire                err_sub(2)
+#define sub_iokit_block_storage           err_sub(4)
+#define sub_iokit_graphics                err_sub(5)
+#define sub_iokit_networking              err_sub(6)
+#define sub_iokit_bluetooth               err_sub(8)
+#define sub_iokit_pmu                     err_sub(9)
+#define sub_iokit_acpi                    err_sub(10)
+#define sub_iokit_smbus                   err_sub(11)
+#define sub_iokit_ahci                    err_sub(12)
+#define sub_iokit_powermanagement         err_sub(13)
+#define sub_iokit_hidsystem               err_sub(14)
+#define sub_iokit_scsi                    err_sub(16)
+#define sub_iokit_usbaudio                err_sub(17)
+#define sub_iokit_wirelesscharging        err_sub(18)
+//#define sub_iokit_pccard                err_sub(21)
+#ifdef PRIVATE
+#define sub_iokit_nvme                    err_sub(28)
+#endif
+#define sub_iokit_thunderbolt             err_sub(29)
+#define sub_iokit_graphics_acceleration   err_sub(30)
+#define sub_iokit_keystore                err_sub(31)
+#ifdef PRIVATE
+#define sub_iokit_smc                     err_sub(32)
+#endif
+#define sub_iokit_apfs                    err_sub(33)
+#define sub_iokit_platform                err_sub(0x2A)
+#define sub_iokit_audio_video             err_sub(0x45)
+#define sub_iokit_cec                     err_sub(0x46)
+#define sub_iokit_baseband                err_sub(0x80)
+#define sub_iokit_HDA                     err_sub(0xFE)
+#define sub_iokit_hsic                    err_sub(0x147)
+#define sub_iokit_sdio                    err_sub(0x174)
+#define sub_iokit_wlan                    err_sub(0x208)
+#define sub_iokit_appleembeddedsleepwakehandler  err_sub(0x209)
+#define sub_iokit_appleppm                err_sub(0x20A)
+
+#define sub_iokit_vendor_specific         err_sub(-2)
+#define sub_iokit_reserved                err_sub(-1)
+
+#define iokit_common_err(return )          (sys_iokit|sub_iokit_common|return)
+#define iokit_family_err(sub, return )      (sys_iokit|sub|return)
+#define iokit_vendor_specific_err(return ) (sys_iokit|sub_iokit_vendor_specific|return)
+
+#define kIOReturnSuccess         KERN_SUCCESS            // OK
+#define kIOReturnError           iokit_common_err(0x2bc) // general error
+#define kIOReturnNoMemory        iokit_common_err(0x2bd) // can't allocate memory
+#define kIOReturnNoResources     iokit_common_err(0x2be) // resource shortage
+#define kIOReturnIPCError        iokit_common_err(0x2bf) // error during IPC
+#define kIOReturnNoDevice        iokit_common_err(0x2c0) // no such device
+#define kIOReturnNotPrivileged   iokit_common_err(0x2c1) // privilege violation
+#define kIOReturnBadArgument     iokit_common_err(0x2c2) // invalid argument
+#define kIOReturnLockedRead      iokit_common_err(0x2c3) // device read locked
+#define kIOReturnLockedWrite     iokit_common_err(0x2c4) // device write locked
+#define kIOReturnExclusiveAccess iokit_common_err(0x2c5) // exclusive access and
+                                                         //   device already open
+#define kIOReturnBadMessageID    iokit_common_err(0x2c6) // sent/received messages
+                                                         //   had different msg_id
+#define kIOReturnUnsupported     iokit_common_err(0x2c7) // unsupported function
+#define kIOReturnVMError         iokit_common_err(0x2c8) // misc. VM failure
+#define kIOReturnInternalError   iokit_common_err(0x2c9) // internal error
+#define kIOReturnIOError         iokit_common_err(0x2ca) // General I/O error
+//#define kIOReturn???Error      iokit_common_err(0x2cb) // ???
+#define kIOReturnCannotLock      iokit_common_err(0x2cc) // can't acquire lock
+#define kIOReturnNotOpen         iokit_common_err(0x2cd) // device not open
+#define kIOReturnNotReadable     iokit_common_err(0x2ce) // read not supported
+#define kIOReturnNotWritable     iokit_common_err(0x2cf) // write not supported
+#define kIOReturnNotAligned      iokit_common_err(0x2d0) // alignment error
+#define kIOReturnBadMedia        iokit_common_err(0x2d1) // Media Error
+#define kIOReturnStillOpen       iokit_common_err(0x2d2) // device(s) still open
+#define kIOReturnRLDError        iokit_common_err(0x2d3) // rld failure
+#define kIOReturnDMAError        iokit_common_err(0x2d4) // DMA failure
+#define kIOReturnBusy            iokit_common_err(0x2d5) // Device Busy
+#define kIOReturnTimeout         iokit_common_err(0x2d6) // I/O Timeout
+#define kIOReturnOffline         iokit_common_err(0x2d7) // device offline
+#define kIOReturnNotReady        iokit_common_err(0x2d8) // not ready
+#define kIOReturnNotAttached     iokit_common_err(0x2d9) // device not attached
+#define kIOReturnNoChannels      iokit_common_err(0x2da) // no DMA channels left
+#define kIOReturnNoSpace         iokit_common_err(0x2db) // no space for data
+//#define kIOReturn???Error      iokit_common_err(0x2dc) // ???
+#define kIOReturnPortExists      iokit_common_err(0x2dd) // port already exists
+#define kIOReturnCannotWire      iokit_common_err(0x2de) // can't wire down
+                                                         //   physical memory
+#define kIOReturnNoInterrupt     iokit_common_err(0x2df) // no interrupt attached
+#define kIOReturnNoFrames        iokit_common_err(0x2e0) // no DMA frames enqueued
+#define kIOReturnMessageTooLarge iokit_common_err(0x2e1) // oversized msg received
+                                                         //   on interrupt port
+#define kIOReturnNotPermitted    iokit_common_err(0x2e2) // not permitted
+#define kIOReturnNoPower         iokit_common_err(0x2e3) // no power to device
+#define kIOReturnNoMedia         iokit_common_err(0x2e4) // media not present
+#define kIOReturnUnformattedMedia iokit_common_err(0x2e5)// media not formatted
+#define kIOReturnUnsupportedMode iokit_common_err(0x2e6) // no such mode
+#define kIOReturnUnderrun        iokit_common_err(0x2e7) // data underrun
+#define kIOReturnOverrun         iokit_common_err(0x2e8) // data overrun
+#define kIOReturnDeviceError     iokit_common_err(0x2e9) // the device is not working properly!
+#define kIOReturnNoCompletion    iokit_common_err(0x2ea) // a completion routine is required
+#define kIOReturnAborted         iokit_common_err(0x2eb) // operation aborted
+#define kIOReturnNoBandwidth     iokit_common_err(0x2ec) // bus bandwidth would be exceeded
+#define kIOReturnNotResponding   iokit_common_err(0x2ed) // device not responding
+#define kIOReturnIsoTooOld       iokit_common_err(0x2ee) // isochronous I/O request for distant past!
+#define kIOReturnIsoTooNew       iokit_common_err(0x2ef) // isochronous I/O request for distant future
+#define kIOReturnNotFound        iokit_common_err(0x2f0) // data was not found
+#define kIOReturnInvalid         iokit_common_err(0x1)   // should never be seen
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ! __IOKIT_IORETURN_H */
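[Editor's example] Because every IOReturn packs system(6)/subsystem(12)/code(14) bits, the macros above can decompose any value; kIOReturnNoMemory, for instance, is sys_iokit | sub_iokit_common | 0x2bd. A small sketch using only definitions from this header:

    // Decompose an IOReturn into its subsystem and code fields.
    static void
    DescribeIOReturn(IOReturn err)
    {
        if ((err & system_emask) == sys_iokit) {
            int sub  = err_get_sub(err);    // 0 == sub_iokit_common
            int code = err_get_code(err);   // e.g. 0x2bd for kIOReturnNoMemory
            (void) sub; (void) code;
        }
    }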
diff --git a/iokit/DriverKit/IOService.iig b/iokit/DriverKit/IOService.iig
new file mode 100644 (file)
index 0000000..5885850
--- /dev/null
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#if !__IIG
+#if KERNEL
+#include <IOKit/IOService.h>
+#endif
+#endif
+
+#ifndef _IOKIT_UIOSERVICE_H
+#define _IOKIT_UIOSERVICE_H
+
+#include <DriverKit/OSObject.iig>
+
+class IOMemoryDescriptor;
+class IOBufferMemoryDescriptor;
+class IOUserClient;
+
+typedef char IOServiceName[128];
+typedef char IOPropertyName[128];
+typedef char IORegistryPlaneName[128];
+
+enum {
+       kIOServiceSearchPropertyParents = 0x00000001,
+};
+
+#define kIOServiceDefaultQueueName     "Default"
+
+enum {
+       kIOServicePowerCapabilityOff = 0x00000000,
+       kIOServicePowerCapabilityOn  = 0x00000002,
+       kIOServicePowerCapabilityLow = 0x00010000,
+};
+
+/*!
+ * @class IOService
+ *
+ * @abstract
+ * IOService represents a device or OS service in IOKit and DriverKit.
+ *
+ * @discussion
+ * IOKit provides driver lifecycle management through the IOService APIs. 
+ * Drivers and devices are represented as subclasses of IOService.
+ *
+
+@iig implementation
+#include <DriverKit/IOUserClient.h>
+@iig end
+*/
+
+class KERNEL IOService : public OSObject
+{
+public:
+       virtual bool
+       init() override;
+
+       virtual void
+       free() override;
+
+    /*!
+     * @brief       First call made to a matched IOService.
+     * @discussion  During matching IOKit will create an IOService object for successful matches.
+     *              Start is the first call made to the new object.
+     * @param       provider The IOService provider for the match. This should be OSRequiredCast to the expected class.
+     *              The provider is retained by DriverKit for the duration of Start() and on successful Start() until
+     *              IOService::Stop() is called.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       Start(IOService * provider) LOCAL;
+
+    /*!
+     * @brief       Terminate access to provider.
+     * @discussion  During termination IOKit will teardown any IOService objects attached to a terminated provider.
+     *              Stop should quiesce all activity and when complete, pass the call to super. After calling super, the
+     *              provider is no longer valid and this object will likely be freed.
+     * @param       provider The IOService provider being terminated, as previously passed to Start().
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       Stop(IOService * provider) LOCAL;
+
+    /*!
+     * @brief       Obtain IOKit IORegistryEntryID.
+     * @param       registryEntryID IORegistryEntryID for the IOKit object.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       GetRegistryEntryID(uint64_t * registryEntryID) LOCAL;
+
+    /*!
+     * @brief       Set the IORegistryEntry name.
+     * @param       name Name for the IOKit object. The c-string will be copied.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       SetName(
+       const IOServiceName name);
+
+    /*!
+     * @brief       Start the matching process on the IOService object.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       RegisterService();
+
+    /*!
+     * @brief       Set the IODispatchQueue for a given name on the IOService.
+     * @param       name Name for the queue. The name may be referenced by methods in the .iig class definition
+     *              with the QUEUENAME() attribute to indicate the method must be invoked on that queue. If a method
+     *              is invoked before the queue is set for the name, the default queue is used. A default queue is
+     *              created by DriverKit for every new IOService object with the name kIOServiceDefaultQueueName.
+     * @param       queue Queue to be associated with the name on this IOService.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       SetDispatchQueue(
+               const IODispatchQueueName name,
+               IODispatchQueue         * queue) override LOCAL;
+
+    /*!
+     * @brief       Obtain the IODispatchQueue for a given name on the IOService.
+     * @param       name Name for the queue.
+     * @param       queue Returned, retained queue or NULL. The caller should release this queue.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       CopyDispatchQueue(
+               const IODispatchQueueName name,
+               IODispatchQueue        ** queue) override;
+
+    /*!
+     * @brief       Obtain the IOKit registry properties for the IOService.
+     * @param       properties Returned, retained dictionary of properties or NULL. The caller should release this dictionary.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       CopyProperties(
+               OSDictionary ** properties);
+
+    /*!
+     * @brief       Obtain an IOKit registry property from the service or one of its parents.
+     * @param       name Name of the property as a c-string.
+     * @param       plane Name of the registry plane to be searched, if the option kIOServiceSearchPropertyParents
+     *              is used.
+     * @param       options Pass kIOServiceSearchPropertyParents to search for the property in the IOService and all
+     *              its parents in the IOKit registry.
+     * @param       property Returned, retained property object or NULL. The caller should release this property.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       SearchProperty(
+               const IOPropertyName name,
+               const IORegistryPlaneName plane,
+               uint64_t options,
+               OSContainer ** property);
+
+    /*!
+     * @brief       Send a dictionary of properties to an IOService.
+     * @discussion  By default the method will fail. A DriverKit subclass or kernel class may implement this method.
+     * @param       properties Dictionary of properties.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       SetProperties(
+               OSDictionary * properties);
+
+    /*!
+     * @brief       Notification of change in power state of a provider.
+     * @discussion  DriverKit notifies of changes in power of a provider. The driver should make itself safe for
+     *              the new state before passing the call to super. 
+     * @param       powerFlags The power capabilities of the new state. The values possible are:
+     *              kIOServicePowerCapabilityOff the system will be entering sleep state
+     *              kIOServicePowerCapabilityOn  the device and system are fully powered
+     *              kIOServicePowerCapabilityLow the device is in a reduced power state while the system is running
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       SetPowerState(
+               uint32_t powerFlags) LOCAL;
+
+    /*!
+     * @brief       Allow provider to enter a low power state.
+     * @discussion  A driver may allow a device to enter a lower power state. 
+     * @param       powerFlags The power capabilities of the new state. The values possible are:
+     *              kIOServicePowerCapabilityLow the device is in a reduced power state while the system is running
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       ChangePowerState(
+               uint32_t powerFlags);
+
+    /*!
+     * @brief       Request to create a new user client for a client process.
+     * @discussion  An application may request an IOUserClient be opened with the IOKit framework
+     *              IOServiceOpen() call. The type parameter of that call is passed here. The driver should respond to
+     *              the call by calling IOService::Create() with a plist entry describing the new user client object.
+     * @param       type The type passed to IOServiceOpen().
+     * @param       userClient The object created by IOService::Create()
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       NewUserClient(
+               uint32_t type,
+               IOUserClient ** userClient);
+
+    /*!
+     * @brief       Request to create an IOService object from a plist property.
+     * @discussion  An IOService interface or IOUserClient subclass may be created from a plist property of the driver.
+     *              The plist should contain the following IOKit matching keys:
+     *              IOClass - kernel class of IOUserUserClient
+     *              IOUserClass - DriverKit class to be instantiated
+     *              IOServiceDEXTEntitlements - Array of entitlements to be checked against a user client owning task
+     * @param       provider The provider of the new object.
+     * @param       propertiesKey The name of the properties dictionary in this IOService
+     * @param       result The created object retained, to be released by the caller.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       Create(
+               IOService          * provider,
+               const IOPropertyName propertiesKey,
+               IOService         ** result);
+};
+
+#endif /* ! _IOKIT_UIOSERVICE_H */
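[Editor's example] A minimal sketch of the Start()/Stop() contract documented above, assuming the IMPL/SUPERDISPATCH conventions of iig-generated glue code; MyDriver is hypothetical, and only Start(), Stop(), and RegisterService() come from this header:

    kern_return_t
    IMPL(MyDriver, Start)
    {
        // Pass the call to super first, then publish for matching.
        kern_return_t ret = Start(provider, SUPERDISPATCH);
        if (ret != kIOReturnSuccess) {
            return ret;
        }
        return RegisterService();
    }

    kern_return_t
    IMPL(MyDriver, Stop)
    {
        // Quiesce all activity, then pass the call to super; afterwards
        // the provider is no longer valid and this object may be freed.
        return Stop(provider, SUPERDISPATCH);
    }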
diff --git a/iokit/DriverKit/IOTypes.h b/iokit/DriverKit/IOTypes.h
new file mode 100644 (file)
index 0000000..de2d357
--- /dev/null
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 1998-2012 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef __IOKIT_IOTYPES_H
+#define __IOKIT_IOTYPES_H
+
+#ifndef PLATFORM_DriverKit
+
+#ifndef IOKIT
+#define IOKIT 1
+#endif /* !IOKIT */
+
+#if KERNEL
+#include <IOKit/system.h>
+#else
+#include <mach/message.h>
+#include <mach/vm_types.h>
+#endif
+
+#include <IOKit/IOReturn.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef NULL
+#if defined (__cplusplus)
+#ifdef XNU_KERNEL_PRIVATE
+#define NULL nullptr
+#else
+#if __cplusplus >= 201103L && (defined(__arm__) || defined(__arm64__))
+#define NULL nullptr
+#else
+#define NULL    0
+#endif
+#endif
+#else
+#define NULL ((void *)0)
+#endif
+#endif
+
+/*
+ * Simple data types.
+ */
+#include <stdbool.h>
+#include <libkern/OSTypes.h>
+
+#if KERNEL
+#include <libkern/OSBase.h>
+#endif
+
+typedef UInt32          IOOptionBits;
+typedef SInt32          IOFixed;
+typedef UInt32          IOVersion;
+typedef UInt32          IOItemCount;
+typedef UInt32          IOCacheMode;
+
+typedef UInt32          IOByteCount32;
+typedef UInt64          IOByteCount64;
+
+typedef UInt32  IOPhysicalAddress32;
+typedef UInt64  IOPhysicalAddress64;
+typedef UInt32  IOPhysicalLength32;
+typedef UInt64  IOPhysicalLength64;
+
+#if !defined(__arm__) && !defined(__i386__)
+typedef mach_vm_address_t       IOVirtualAddress;
+#else
+typedef vm_address_t            IOVirtualAddress;
+#endif
+
+#if !defined(__arm__) && !defined(__i386__) && !(defined(__x86_64__) && !defined(KERNEL)) && !(defined(__arm64__) && !defined(__LP64__))
+typedef IOByteCount64           IOByteCount;
+#else
+typedef IOByteCount32           IOByteCount;
+#endif
+
+typedef IOVirtualAddress    IOLogicalAddress;
+
+#if !defined(__arm__) && !defined(__i386__) && !(defined(__x86_64__) && !defined(KERNEL))
+
+typedef IOPhysicalAddress64      IOPhysicalAddress;
+typedef IOPhysicalLength64       IOPhysicalLength;
+#define IOPhysical32( hi, lo )          ((UInt64) lo + ((UInt64)(hi) << 32))
+#define IOPhysSize      64
+
+#else
+
+typedef IOPhysicalAddress32      IOPhysicalAddress;
+typedef IOPhysicalLength32       IOPhysicalLength;
+#define IOPhysical32( hi, lo )          (lo)
+#define IOPhysSize      32
+
+#endif
+
+
+typedef struct{
+       IOPhysicalAddress   address;
+       IOByteCount         length;
+} IOPhysicalRange;
+
+typedef struct{
+       IOVirtualAddress    address;
+       IOByteCount         length;
+} IOVirtualRange;
+
+#if !defined(__arm__) && !defined(__i386__)
+typedef IOVirtualRange  IOAddressRange;
+#else
+typedef struct{
+       mach_vm_address_t   address;
+       mach_vm_size_t      length;
+} IOAddressRange;
+#endif
+
+/*
+ * Map between #defined or enum'd constants and text description.
+ */
+typedef struct {
+       int value;
+       const char *name;
+} IONamedValue;
+
+
+/*
+ * Memory alignment -- specified as a power of two.
+ */
+typedef unsigned int    IOAlignment;
+
+#define IO_NULL_VM_TASK         ((vm_task_t)0)
+
+
+/*
+ * Pull in machine specific stuff.
+ */
+
+//#include <IOKit/machine/IOTypes.h>
+
+#ifndef MACH_KERNEL
+
+#ifndef __IOKIT_PORTS_DEFINED__
+#define __IOKIT_PORTS_DEFINED__
+#ifdef KERNEL
+#ifdef __cplusplus
+class OSObject;
+typedef OSObject * io_object_t;
+#else
+typedef struct OSObject * io_object_t;
+#endif
+#else /* KERNEL */
+typedef mach_port_t     io_object_t;
+#endif /* KERNEL */
+#endif /* __IOKIT_PORTS_DEFINED__ */
+
+#include <device/device_types.h>
+
+typedef io_object_t     io_connect_t;
+typedef io_object_t     io_enumerator_t;
+typedef io_object_t     io_iterator_t;
+typedef io_object_t     io_registry_entry_t;
+typedef io_object_t     io_service_t;
+typedef io_object_t     uext_object_t;
+
+#define IO_OBJECT_NULL  ((io_object_t) 0)
+
+#endif /* MACH_KERNEL */
+
+// IOConnectMapMemory memoryTypes
+enum {
+       kIODefaultMemoryType        = 0
+};
+
+enum {
+       kIODefaultCache             = 0,
+       kIOInhibitCache             = 1,
+       kIOWriteThruCache           = 2,
+       kIOCopybackCache            = 3,
+       kIOWriteCombineCache        = 4,
+       kIOCopybackInnerCache       = 5,
+       kIOPostedWrite              = 6,
+       kIORealTimeCache            = 7,
+       kIOPostedReordered          = 8,
+};
+
+// IOMemory mapping options
+enum {
+       kIOMapAnywhere              = 0x00000001,
+
+       kIOMapCacheMask             = 0x00000f00,
+       kIOMapCacheShift            = 8,
+       kIOMapDefaultCache          = kIODefaultCache       << kIOMapCacheShift,
+       kIOMapInhibitCache          = kIOInhibitCache       << kIOMapCacheShift,
+       kIOMapWriteThruCache        = kIOWriteThruCache     << kIOMapCacheShift,
+       kIOMapCopybackCache         = kIOCopybackCache      << kIOMapCacheShift,
+       kIOMapWriteCombineCache     = kIOWriteCombineCache  << kIOMapCacheShift,
+       kIOMapCopybackInnerCache    = kIOCopybackInnerCache << kIOMapCacheShift,
+       kIOMapPostedWrite           = kIOPostedWrite        << kIOMapCacheShift,
+       kIOMapRealTimeCache         = kIORealTimeCache      << kIOMapCacheShift,
+       kIOMapPostedReordered       = kIOPostedReordered    << kIOMapCacheShift,
+
+       kIOMapUserOptionsMask       = 0x00000fff,
+
+       kIOMapReadOnly              = 0x00001000,
+
+       kIOMapStatic                = 0x01000000,
+       kIOMapReference             = 0x02000000,
+       kIOMapUnique                = 0x04000000,
+#ifdef XNU_KERNEL_PRIVATE
+       kIOMap64Bit                 = 0x08000000,
+#endif
+       kIOMapPrefault              = 0x10000000,
+       kIOMapOverwrite     = 0x20000000
+};
+
+/*! @enum Scale Factors
+ *   @discussion Used when a scale_factor parameter is required to define a unit of time.
+ *   @constant kNanosecondScale Scale factor for nanosecond based times.
+ *   @constant kMicrosecondScale Scale factor for microsecond based times.
+ *   @constant kMillisecondScale Scale factor for millisecond based times.
+ *   @constant kTickScale Scale factor for the standard (100Hz) tick.
+ *   @constant kSecondScale Scale factor for second based times. */
+
+enum {
+       kNanosecondScale  = 1,
+       kMicrosecondScale = 1000,
+       kMillisecondScale = 1000 * 1000,
+       kSecondScale      = 1000 * 1000 * 1000,
+       kTickScale        = (kSecondScale / 100)
+};
+
+enum {
+       kIOConnectMethodVarOutputSize = -3
+};
+
+/* compatibility types */
+
+#ifndef KERNEL
+
+typedef unsigned int IODeviceNumber;
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#else /* !PLATFORM_DriverKit */
+
+#include <stdint.h>
+
+typedef uint32_t          IOOptionBits;
+typedef int32_t           IOFixed;
+typedef uint32_t          IOVersion;
+typedef uint32_t          IOItemCount;
+typedef uint32_t          IOCacheMode;
+
+typedef uint32_t          IOByteCount32;
+typedef uint64_t          IOByteCount64;
+typedef IOByteCount64     IOByteCount;
+
+typedef uint32_t  IOPhysicalAddress32;
+typedef uint64_t  IOPhysicalAddress64;
+typedef uint32_t  IOPhysicalLength32;
+typedef uint64_t  IOPhysicalLength64;
+
+typedef IOPhysicalAddress64      IOPhysicalAddress;
+typedef IOPhysicalLength64       IOPhysicalLength;
+
+typedef uint64_t       IOVirtualAddress;
+
+#endif /* PLATFORM_DriverKit */
+
+#endif /* ! __IOKIT_IOTYPES_H */
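[Editor's example] Two direct consequences of the definitions above, as a sketch: cache modes occupy a four-bit field at kIOMapCacheShift, and a (value, scale) time pair multiplies out to nanoseconds:

    static void
    OptionsAndScalesExample(void)
    {
        // Compose mapping options, then extract the cache mode field again.
        IOOptionBits options = kIOMapReadOnly | kIOMapWriteThruCache;
        IOCacheMode  mode    = (options & kIOMapCacheMask) >> kIOMapCacheShift;
        // mode == kIOWriteThruCache

        // 5 milliseconds expressed in nanoseconds via the scale factors.
        uint64_t nanoseconds = 5ULL * kMillisecondScale;  // 5,000,000 ns
        (void) mode; (void) nanoseconds;
    }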
diff --git a/iokit/DriverKit/IOUserClient.iig b/iokit/DriverKit/IOUserClient.iig
new file mode 100644 (file)
index 0000000..5523bb2
--- /dev/null
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#if !__IIG
+#if KERNEL
+#include <IOKit/IOUserClient.h>
+#endif
+#endif
+
+#ifndef _IOKIT_UIOUSERCLIENT_H
+#define _IOKIT_UIOUSERCLIENT_H
+
+#include <DriverKit/OSAction.iig>
+#include <DriverKit/IOService.iig>
+
+class IOMemoryDescriptor;
+class IOBufferMemoryDescriptor;
+
+enum {
+       kIOUserClientScalarArrayCountMax  = 16,
+};
+typedef uint64_t IOUserClientScalarArray[kIOUserClientScalarArrayCountMax];
+
+enum {
+       kIOUserClientAsyncReferenceCountMax  = 16,
+};
+typedef uint64_t IOUserClientAsyncReferenceArray[kIOUserClientAsyncReferenceCountMax];
+
+enum {
+       kIOUserClientAsyncArgumentsCountMax  = 16,
+};
+typedef uint64_t IOUserClientAsyncArgumentsArray[kIOUserClientAsyncArgumentsCountMax];
+
+// CopyClientMemoryForType options
+enum {
+       kIOUserClientMemoryReadOnly  = 0x00000001,
+};
+
+
+/*! @enum
+ *   @abstract Constant to denote a variable length structure argument to IOUserClient.
+ *   @constant kIOUserClientVariableStructureSize Use in the IOUserClientMethodDispatch structure to specify that the size of a structure argument is variable.
+ */
+enum {
+       kIOUserClientVariableStructureSize = 0xffffffff
+};
+
+
+enum {
+#define IO_USER_CLIENT_METHOD_ARGUMENTS_CURRENT_VERSION    2
+       kIOUserClientMethodArgumentsCurrentVersion = IO_USER_CLIENT_METHOD_ARGUMENTS_CURRENT_VERSION
+};
+
+/*!
+ * @struct      IOUserClientMethodArguments
+ * @brief       Holds arguments from IOKit.framework IOConnectMethod calls.
+ * @discussion  Any argument may be passed as NULL if not passed by the caller.
+ * @field       selector Selector argument to IOConnectMethod.
+ * @field       scalarInput Array of scalars from caller.
+ * @field       scalarInputCount Count of valid scalars in scalarInput.
+ * @field       structureInput OSData object containing structure input from IOConnectMethod.
+ * @field       structureInputDescriptor IOMemoryDescriptor containing structure input from IOConnectMethod.
+ *                             This parameter is only set for large structures, and if set structureInput will be NULL.
+ * @field       scalarOutput Array of scalars to return to the caller.
+ * @field       scalarOutputCount Count of scalars to return to the caller in scalarOutput.
+ * @field       structureOutput An OSData to be returned to the caller as structure output.
+ *                             A reference will be consumed by the caller. It is an error to set this field if
+ *              structureOutputDescriptor was passed in.
+ * @field       structureOutputDescriptor A IOMemoryDescriptor specified by the caller for structure output.
+ * @field       structureOutputMaximumSize Maximum size of structure output specified by caller
+ *              or kIOUserClientVariableStructureSize.
+ * @field       completion For IOConnectAsyncMethod, an OSAction used to deliver async data to the caller.
+ *              It is only retained during the invocation of ExternalMethod and should be retained if
+ *              used beyond then.
+ */
+
+struct IOUserClientMethodArguments {
+       uint64_t                        version;
+       uint64_t                        selector;
+       OSAction                      * completion;
+       const uint64_t                * scalarInput;
+       uint32_t                        scalarInputCount;
+       OSData                        * structureInput;
+       IOMemoryDescriptor            * structureInputDescriptor;
+       uint64_t                      * scalarOutput;
+       uint32_t                        scalarOutputCount;
+       OSData                        * structureOutput;
+       IOMemoryDescriptor            * structureOutputDescriptor;
+       uint64_t                        structureOutputMaximumSize;
+       uint64_t                        __reserved[30];
+};
+
+typedef kern_return_t (*IOUserClientMethodFunction)(
+       OSObject * target,
+       void * reference,
+       IOUserClientMethodArguments * arguments);
+
+/*!
+ * @struct      IOUserClientMethodDispatch
+ * @brief       Used to check fields in IOUserClientMethodArguments
+ * @field       function Function to invoke after making the checks specified below. If NULL and all checks pass,
+ *              kIOReturnNoCompletion will be returned for the caller to implement the method.
+ * @field       checkCompletionExists
+ *              if true completion field must be set,
+ *              if false must be zero,
+ *              if -1U don't care
+ * @field       checkScalarInputCount
+ *              if has value kIOUserClientVariableStructureSize don't care,
+ *              otherwise must equal args->scalarInputCount
+ * @field       checkStructureInputSize
+ *              if has value kIOUserClientVariableStructureSize don't care,
+ *              otherwise must equal length of structureInput or structureInputDescriptor
+ * @field       checkScalarOutputCount
+ *              if has value kIOUserClientVariableStructureSize don't care,
+ *              otherwise must equal args->scalarOutputCount
+ * @field       checkStructureOutputSize
+ *              if has value kIOUserClientVariableStructureSize don't care,
+ *              otherwise must equal length of structureOutputMaximumSize
+ */
+
+struct IOUserClientMethodDispatch {
+       IOUserClientMethodFunction function;
+       uint32_t                   checkCompletionExists;
+       uint32_t                   checkScalarInputCount;
+       uint32_t                   checkStructureInputSize;
+       uint32_t                   checkScalarOutputCount;
+       uint32_t                   checkStructureOutputSize;
+};
+
+/*!
+ * @class IOUserClient
+ *
+ * @abstract
+ * IOUserClient represents a connection opened by IOServiceOpen in the IOKit.framework.
+ *
+ * @discussion
+ * An application may open an IOUserClient by calling IOServiceOpen(). This results in a call
+ * to the IOService::NewUserClient API to create an instance representing the connection,
+ * and to receive untyped data via IOConnectMethod/IOConnectAsyncMethod.
+ * As an IOService subclass, IOUserClient receives the normal Start()/Stop() lifecycle calls.
+ *
+
+@iig implementation
+#include <DriverKit/IOBufferMemoryDescriptor.h>
+@iig end
+*/
+
+class KERNEL IOUserClient : public IOService
+{
+public:
+       virtual bool
+       init() override;
+
+       virtual void
+       free() override;
+
+       /*!
+        * @brief       Receive arguments from IOKit.framework IOConnectMethod calls.
+        * @discussion  IOConnectMethod calls from the owner of the connection come here.
+        *              Any argument may be passed as NULL if not passed by the caller.
+        * @param       selector Selector argument to IOConnectMethod.
+        * @param       scalarInput Array of scalars from caller.
+        * @param       scalarInputCount Count of valid scalars in scalarInput.
+        * @param       structureInput OSData object containing structure input from IOConnectMethod.
+        * @param       structureInputDescriptor IOMemoryDescriptor containing structure input from IOConnectMethod.
+        *                              This parameter is only set for large structures, and if set structureInput will be NULL.
+        * @param       scalarOutput Array of scalars to return to the caller.
+        * @param       scalarOutputCount Count of scalars to return to the caller in scalarOutput.
+        * @param       structureOutput An OSData to be returned to the caller as structureOutput.
+        *                              A reference will be consumed by the caller.
+        * @param       structureOutputDescriptor An IOMemoryDescriptor to be returned to the caller as structureOutput.
+        *                              A reference will be consumed by the caller.
+        *                              Only one of structureOutput and structureOutputDescriptor may be set.
+        * @param       completion For IOConnectAsyncMethod, an OSAction used to deliver async data to the caller.
+        *              It should be passed to the AsyncCompletion() method and released.
+        * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+        */
+
+       virtual kern_return_t
+       ExternalMethod(
+               uint64_t                            selector,
+               IOUserClientMethodArguments       * arguments,
+               const IOUserClientMethodDispatch  * dispatch,
+               OSObject                          * target,
+               void                              * reference) LOCALONLY;
+
+
+    /*!
+     * @brief       Send asynchronous arguments to a completion supplied by ExternalMethod().
+     * @discussion  IOConnectAsyncMethod calls from the owner of the connection will pass an OSAction instance.
+     *              To deliver the asynchronous results the driver calls AsyncCompletion().
+     * @param       action OSAction passed to ExternalMethod().
+     * @param       status An IOReturn status value to be sent.
+     * @param       asyncData An array of scalar data to be sent.
+     * @param       asyncDataCount Count of valid data in asyncData.
+     */
+       virtual void
+       AsyncCompletion(
+               OSAction                            * action TARGET,
+               IOReturn                              status,
+               const IOUserClientAsyncArgumentsArray asyncData,
+               uint32_t                              asyncDataCount) = 0;
+
+    /*!
+     * @brief       Return an IOMemoryDescriptor to be mapped into the client task.
+     * @discussion  IOConnectMapMemory()/UnmapMemory() will result in a call to this method to obtain
+     *              an IOMemoryDescriptor instance for shared memory. For a given IOUserClient instance, calling
+     *              CopyClientMemoryForType() with a given type should return the same IOMemoryDescriptor instance.
+     * @param       type Type parameter of IOConnectMapMemory()/UnmapMemory().
+     * @param       options Set kIOUserClientMemoryReadOnly for memory to be mapped read only in the client.
+     * @param       memory An instance of IOMemoryDescriptor on success. One reference will be consumed by the caller
+     *              of this method.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       virtual kern_return_t
+       CopyClientMemoryForType(
+               uint64_t                              type,
+               uint64_t                            * options,
+               IOMemoryDescriptor                 ** memory) = 0;
+
+private:
+       virtual kern_return_t
+       _ExternalMethod(
+               uint64_t                              selector,
+               const IOUserClientScalarArray         scalarInput,
+               uint32_t                              scalarInputCount,
+               OSData                              * structureInput,
+               IOMemoryDescriptor                  * structureInputDescriptor,
+               IOUserClientScalarArray               scalarOutput,
+               uint32_t                            * scalarOutputCount,
+               uint64_t                              structureOutputMaximumSize,
+               OSData                             ** structureOutput,
+               IOMemoryDescriptor                  * structureOutputDescriptor,
+               OSAction                            * completion TYPE(IOUserClient::AsyncCompletion)) LOCAL;
+
+       virtual void
+       KernelCompletion(
+               OSAction                            * action TARGET,
+               IOReturn                              status,
+               const IOUserClientAsyncArgumentsArray asyncData,
+               uint32_t                              asyncDataCount)
+       KERNEL
+       TYPE(IOUserClient::AsyncCompletion);
+};
+
+#endif /* ! _IOKIT_UIOUSERCLIENT_H */
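[Editor's example] A sketch of the dispatch pattern those check fields enable: a subclass keeps a selector-indexed table of IOUserClientMethodDispatch entries and forwards to the inherited ExternalMethod(), which validates the counts and sizes before invoking the function. MyUserClient and MyMethod0 are hypothetical:

    // Selector 0: two scalar inputs, one scalar output, no structures.
    static kern_return_t MyMethod0(OSObject * target, void * reference,
        IOUserClientMethodArguments * arguments);

    static const IOUserClientMethodDispatch sMethods[1] = {
        { &MyMethod0,
          0,    // checkCompletionExists: completion must be zero
          2,    // checkScalarInputCount
          0,    // checkStructureInputSize
          1,    // checkScalarOutputCount
          0 },  // checkStructureOutputSize
    };

    kern_return_t
    MyUserClient::ExternalMethod(uint64_t selector,
        IOUserClientMethodArguments * arguments,
        const IOUserClientMethodDispatch * dispatch,
        OSObject * target, void * reference)
    {
        if (selector < 1) {
            dispatch = &sMethods[selector];
            target   = this;
        }
        return IOUserClient::ExternalMethod(selector, arguments,
            dispatch, target, reference);
    }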
diff --git a/iokit/DriverKit/IOUserServer.iig b/iokit/DriverKit/IOUserServer.iig
new file mode 100644 (file)
index 0000000..ca946c4
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _IOKIT_UIOUSERSERVER_H
+#define _IOKIT_UIOUSERSERVER_H
+
+#include <DriverKit/OSObject.iig>
+#include <DriverKit/OSAction.iig>
+#include <DriverKit/IOService.iig>
+
+
+/*!
+@iig implementation
+#include <IOKit/IOUserServer.h>
+@iig end
+*/
+
+class KERNEL IOUserServer : public IOService
+{
+public:
+       static kern_return_t
+       Create(
+               const char      name[64],
+               uint64_t        tag,
+               uint64_t        options,
+               IOUserServer ** server);
+
+       virtual bool
+       init() override;
+
+       virtual void
+       free() override;
+
+       virtual kern_return_t
+       Exit(const char reason[1024]) LOCAL;
+
+       virtual kern_return_t
+       LoadModule(const char path[1024]) LOCAL;
+};
+
+#endif /* ! _IOKIT_UIOUSERSERVER_H */
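
A hedged kernel-side sketch of the lifecycle these methods imply; the bundle name, tag, and shutdown reason are illustrative only.

    // Hypothetical kernel-side usage
    IOUserServer * server = NULL;
    kern_return_t  kr = IOUserServer::Create("com.example.MyDext",
        /* tag */ 0, /* options */ 0, &server);
    if (kr == kIOReturnSuccess) {
        // ... host the dext; on teardown ask the user process to exit
        server->Exit("shutdown requested");
        server->release();
    }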
diff --git a/iokit/DriverKit/Makefile b/iokit/DriverKit/Makefile
new file mode 100644 (file)
index 0000000..62ab74b
--- /dev/null
@@ -0,0 +1,50 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+IIG_INCFLAGS = -I$(SRCROOT)/iokit -I$(SRCROOT)/osfmk -I$(SRCROOT)/bsd -I$(OBJROOT)/bsd $(INCFLAGS_EXTERN)
+OTHER_IIG_CFLAGS = $(IIG_INCFLAGS) -isysroot $(SDKROOT) -x c++ -std=gnu++1z -D__IIG=1 -DDRIVERKIT_PRIVATE=1 $(DEPLOYMENT_TARGET_DEFINES) $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_))
+
+INCDIR = $(FRAMEDIR)/$(DKIT_INCDIR)
+DRIVERKITINCDIR = $(DRIVERKITFRAMEDIR)/$(DRIVERKIT_DKIT_INCDIR)
+LCLDIR = $(FRAMEDIR)/$(DKIT_PINCDIR)
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+ALL_DEFS = $(notdir $(wildcard $(SOURCE)*.iig))
+ALL_HDRS = $(notdir $(wildcard $(SOURCE)*.h))
+
+EXPINC_SUBDIRS = ${INSTINC_SUBDIRS}
+
+INSTALL_MI_DIR = .
+
+EXPORT_MI_DIR = DriverKit
+
+GENERATED_HEADERS = $(patsubst %.iig,%.h,$(ALL_DEFS))
+
+GENERATED_IMPL = $(patsubst %.iig,%.iig.cpp,$(ALL_DEFS))
+
+INSTALL_MI_LIST = $(ALL_DEFS)
+INSTALL_DRIVERKIT_MI_LIST = $(ALL_DEFS)
+
+OTHER_HEADERS = IOTypes.h IOReturn.h IORPC.h
+
+EXPORT_MI_GEN_LIST = $(GENERATED_HEADERS)  $(OTHER_HEADERS)
+INSTALL_MI_GEN_LIST = $(GENERATED_HEADERS)  $(OTHER_HEADERS)
+INSTALL_DRIVERKIT_MI_GEN_LIST = $(GENERATED_HEADERS)  $(OTHER_HEADERS)
+INSTALL_KF_MI_GEN_LIST = $(GENERATED_HEADERS)  $(OTHER_HEADERS)
+
+COMP_FILES = ${GENERATED_HEADERS} $(GENERATED_IMPL)
+
+$(GENERATED_HEADERS) : \
+       %.h : %.iig
+       $(IIG) --def $< --header $@ --impl $(patsubst %.h,%.iig.cpp,$@) --framework-name DriverKit ${OTHER_IIG_FLAGS} -- ${OTHER_IIG_CFLAGS}
+
+$(GENERATED_IMPL) : $(GENERATED_HEADERS)
+
+do_build_all:: $(COMP_FILES)
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
diff --git a/iokit/DriverKit/OSAction.iig b/iokit/DriverKit/OSAction.iig
new file mode 100644 (file)
index 0000000..999205c
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _IOKIT_OSACTION_H
+#define _IOKIT_OSACTION_H
+
+#include <DriverKit/OSObject.iig>
+
+typedef void (^OSActionCancelHandler)(void);
+typedef void (^OSActionAbortedHandler)(void);
+
+/*!
+ * @class OSAction
+ *
+ * @abstract
+ * OSAction is an object that represents a callback to be invoked.
+ *
+ * @discussion
+ * The callback is specified as a method and object pair.
+ * State associated with the callback may be allocated and stored for the creator of the object.
+ * Methods to allocate an OSAction instance are generated for each method defined in a class with
+ * a TYPE attribute, so there should not be any need to directly call OSAction::Create().
+ */
+
+class NATIVE KERNEL OSAction : public OSObject
+{
+public:
+
+    /*!
+     * @brief       Create an instance of OSAction.
+     * @discussion  Methods to allocate an OSAction instance are generated for each method defined in a class with
+     *              a TYPE attribute, so there should not be any need to directly call OSAction::Create().
+     * @param       target OSObject to receive the callback. This object will be retained until the OSAction is
+     *              canceled or freed.
+     * @param       targetmsgid Generated message ID for the target method.
+     * @param       msgid Generated message ID for the method invoked by the receiver of the OSAction
+     *              to generate the callback.
+     * @param       referenceSize Size of additional state structure available to the creator of the OSAction
+     *              with GetReference.
+     * @param       action Created OSAction with +1 retain count to be released by the caller.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       static kern_return_t
+       Create(
+               OSObject      * target,
+               uint64_t        targetmsgid,
+               uint64_t        msgid,
+               size_t          referenceSize,
+               OSAction     ** action) LOCAL;
+
+       virtual void
+       free() override;
+
+    /*!
+     * @brief       Return a pointer to any state allocated by the OSAction creator.
+     * @discussion  Reference data is allocated with zero initialized content. It may be set and retrieved later
+     *              with this method.
+     * @return      A pointer to storage for the owner. It will be NULL if referenceSize was zero, and NULL
+     *              in any process other than the owner, such as a process receiving the OSAction as a parameter.
+     */
+       void *
+       GetReference() LOCALONLY;
+
+    /*!
+     * @brief       Cancel all callbacks from the action.
+     * @discussion  After cancellation, the action can only be freed. It cannot be reactivated.
+     * @param       handler Handler block to be invoked after any callbacks have completed.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       kern_return_t
+       Cancel(OSActionCancelHandler handler) LOCALONLY;
+
+    /*!
+     * @brief       Install a handler to be invoked when no other processes reference the action.
+     * @discussion  When all tasks other than the creator release their references to the action,
+     *              invoke the handler in the owner. A task exiting will always remove its references.
+     * @param       handler Handler block to be invoked on no more references.
+     * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+     */
+       kern_return_t
+       SetAbortedHandler(OSActionAbortedHandler handler) LOCALONLY;
+
+       virtual void
+       Aborted(void) LOCAL;
+};
+
+#endif /* ! _IOKIT_OSACTION_H */
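
A sketch of the reference-storage and cancellation protocol described above. MyDriver and MyRef are hypothetical; in practice the iig-generated CreateAction helpers call OSAction::Create() on the caller's behalf.

    // Hypothetical dext-side use of an OSAction
    struct MyRef {
        uint64_t requestID;     // lives in the referenceSize bytes
    };

    void
    MyDriver::Arm(OSAction * action)
    {
        // GetReference() returns zero-initialized storage of referenceSize
        // bytes; it is non-NULL only in the owning process.
        MyRef * ref = (MyRef *) action->GetReference();
        if (ref != NULL) {
            ref->requestID = 42;
        }
    }

    void
    MyDriver::Teardown(OSAction * action)
    {
        // After Cancel() completes, the action may only be freed.
        action->Cancel(^{
            action->release();
        });
    }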
diff --git a/iokit/DriverKit/OSObject.iig b/iokit/DriverKit/OSObject.iig
new file mode 100644 (file)
index 0000000..38b55fa
--- /dev/null
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2019-2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#if !__IIG
+#if KERNEL
+#include <libkern/c++/OSObject.h>
+#endif
+#endif
+
+#ifndef _IOKIT_UOSOBJECT_H
+#define _IOKIT_UOSOBJECT_H
+
+#if !KERNEL
+#include <stddef.h>
+#include <stdint.h>
+#include <DriverKit/IOReturn.h>
+#if DRIVERKIT_PRIVATE
+#include <mach/port.h>
+#endif
+#if !__IIG
+#include <string.h>
+#include <DriverKit/OSMetaClass.h>
+#endif
+class OSObject;
+typedef OSObject * OSObjectPtr;
+#endif
+
+#if __IIG && !__IIG_ATTRIBUTES_DEFINED__
+
+#define __IIG_ATTRIBUTES_DEFINED__     1
+
+#define KERNEL       __attribute__((annotate("kernel")))
+#define NATIVE       __attribute__((annotate("native")))
+#define LOCAL        __attribute__((annotate("local")))
+#define LOCALONLY    __attribute__((annotate("localonly")))
+#define REMOTE       __attribute__((annotate("remote")))
+
+#define LOCALHOST    __attribute__((annotate("localhost")))
+
+#define INVOKEREPLY  __attribute__((annotate("invokereply")))
+#define REPLY        __attribute__((annotate("reply")))
+
+#define PORTMAKESEND __attribute__((annotate("MACH_MSG_TYPE_MAKE_SEND")))
+#define PORTCOPYSEND __attribute__((annotate("MACH_MSG_TYPE_COPY_SEND")))
+
+#define TARGET       __attribute__((annotate("target")))
+#define TYPE(p)      __attribute__((annotate("type=" # p)))
+
+//#define ARRAY(maxcount) __attribute__((annotate(# maxcount), annotate("array")))
+#define EXTENDS(cls) __attribute__((annotate("extends=" # cls)))
+
+//#define INTERFACE    __attribute__((annotate("interface")))
+//#define IMPLEMENTS(i)   void implements(i *);
+
+#define QUEUENAME(name) __attribute__((annotate("queuename=" # name)))
+
+#define IIG_SERIALIZABLE __attribute__((annotate("serializable")))
+
+#else
+
+#define IIG_SERIALIZABLE
+
+#endif /* __IIG */
+
+
+#if !__IIG
+#if KERNEL
+typedef OSObject OSContainer;
+#else
+class IIG_SERIALIZABLE OSContainer;
+#endif
+#else
+class IIG_SERIALIZABLE OSContainer;
+#endif
+
+class IIG_SERIALIZABLE OSData;
+class IIG_SERIALIZABLE OSNumber;
+class IIG_SERIALIZABLE OSString;
+class IIG_SERIALIZABLE OSBoolean;
+class IIG_SERIALIZABLE OSDictionary;
+class IIG_SERIALIZABLE OSArray;
+
+class OSMetaClass;
+class IODispatchQueue;
+typedef char IODispatchQueueName[256];
+
+#if __IIG
+class OSMetaClassBase
+{
+       virtual const OSMetaClass *
+       getMetaClass() const LOCALONLY;
+
+    virtual void
+    retain() const LOCALONLY;
+
+    virtual void
+    release() const LOCALONLY;
+
+    virtual bool
+    isEqualTo(const OSMetaClassBase * anObject) const LOCALONLY;
+};
+#endif /* __IIG */
+
+
+/*!
+@iig implementation
+#include <DriverKit/IODispatchQueue.h>
+@iig end
+*/
+
+class OSObject : public OSMetaClassBase
+{
+public:
+
+       virtual bool
+       init() LOCALONLY;
+
+       virtual void
+       free() LOCALONLY;
+
+    virtual void
+    retain() const override;
+
+    virtual void
+    release() const override;
+
+       virtual kern_return_t
+       SetDispatchQueue(
+               const IODispatchQueueName name,
+               IODispatchQueue         * queue) KERNEL = 0;
+
+       virtual kern_return_t
+       CopyDispatchQueue(
+               const IODispatchQueueName name,
+               IODispatchQueue        ** queue) KERNEL = 0;
+};
+
+#define DEFN(classname, name)                                       \
+name ## _Impl(classname ## _ ## name ## _Args)
+
+#define IMPL(classname, name)                                       \
+classname :: DEFN(classname, name)
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#endif /* ! _IOKIT_UOSOBJECT_H */
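
The DEFN/IMPL helpers above are what the dispatch glue expects. For a method Start declared in a hypothetical MyDriver.iig, iig generates a MyDriver_Start_Args parameter-list macro, so an implementation written as

    kern_return_t
    IMPL(MyDriver, Start)
    {
        // method body
    }

expands to

    kern_return_t
    MyDriver :: Start_Impl(MyDriver_Start_Args)

and user code never spells the generated _Impl name or argument list directly.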
index 5fd5f6a8ce96cc3b262a9d4fb5ac7b84d464ab98..a900642c5cd774e5783fd3ff63441f103164c4fe 100644 (file)
@@ -59,7 +59,7 @@ IOWatchDogTimer::start(IOService *provider)
        }
 
        notifier = registerSleepWakeInterest(IOWatchDogTimerSleepHandler, this);
-       if (notifier == 0) {
+       if (notifier == NULL) {
                return false;
        }
 
@@ -92,7 +92,7 @@ IOWatchDogTimer::setProperties(OSObject *properties)
        }
 
        theNumber = OSDynamicCast(OSNumber, properties);
-       if (theNumber == 0) {
+       if (theNumber == NULL) {
                return kIOReturnBadArgument;
        }
 
index 5135d166af900cf6df3a11e60a3d29f506e25fba..0df8690ef1fa99c8d6e25cca1ae493835dab4624 100644 (file)
@@ -61,8 +61,17 @@ enum{
 extern void IOBSDMountChange(struct mount * mp, uint32_t op);
 extern boolean_t IOTaskHasEntitlement(task_t task, const char * entitlement);
 
+typedef enum {
+       kIOPolledCoreFileModeNotInitialized,
+       kIOPolledCoreFileModeDisabled,
+       kIOPolledCoreFileModeClosed,
+       kIOPolledCoreFileModeStackshot,
+       kIOPolledCoreFileModeCoredump,
+} IOPolledCoreFileMode_t;
+
 extern struct IOPolledFileIOVars * gIOPolledCoreFileVars;
 extern kern_return_t gIOPolledCoreFileOpenRet;
+extern IOPolledCoreFileMode_t gIOPolledCoreFileMode;
 
 #ifdef __cplusplus
 }
index e025e01a366b8d7f27cd1b5a85695ae28d32c5d2..112bf79fd3b7ba46911bafca92081beca3aa08a8 100644 (file)
@@ -29,6 +29,7 @@
 #define _IOBUFFERMEMORYDESCRIPTOR_H
 
 #include <IOKit/IOMemoryDescriptor.h>
+#include <DriverKit/IOBufferMemoryDescriptor.h>
 
 enum {
        kIOMemoryPhysicallyContiguous       = 0x00000010,
@@ -58,7 +59,7 @@ enum {
 
 class IOBufferMemoryDescriptor : public IOGeneralMemoryDescriptor
 {
-       OSDeclareDefaultStructors(IOBufferMemoryDescriptor);
+       OSDeclareDefaultStructorsWithDispatch(IOBufferMemoryDescriptor);
 
 private:
 /*! @struct ExpansionData
@@ -140,6 +141,13 @@ public:
            vm_offset_t  alignment) APPLE_KEXT_DEPRECATED;                         /* use withOptions() instead */
 #endif /* !__LP64__ */
 
+       static IOBufferMemoryDescriptor * withCopy(
+               task_t            inTask,
+               IOOptionBits      options,
+               vm_map_t          sourceMap,
+               mach_vm_address_t source,
+               mach_vm_size_t    size);
+
        static IOBufferMemoryDescriptor * withOptions(  IOOptionBits options,
            vm_size_t    capacity,
            vm_offset_t  alignment = 1);
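
A hedged sketch of the new withCopy() factory; userAddress and userLength are assumed inputs, not part of this change.

    // Hypothetical usage: snapshot a range of the calling task's map into
    // a new buffer descriptor.
    IOBufferMemoryDescriptor * snap = IOBufferMemoryDescriptor::withCopy(
        kernel_task,            // inTask: task owning the new buffer
        kIODirectionOutIn,      // options
        current_map(),          // sourceMap: map to copy from
        userAddress,            // source: address within sourceMap
        userLength);            // size in bytes
    if (snap != NULL) {
        // ... use the copy ...
        snap->release();
    }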
index b1e6ca63b2479f0fce26addc1c351179e6cc8066..a52591e4f7345acf636ed4de9c15e9027411183b 100644 (file)
@@ -73,6 +73,8 @@ protected:
 
 public:
        virtual bool           start(IOService *provider) APPLE_KEXT_OVERRIDE;
+       virtual void           detach(IOService *provider) APPLE_KEXT_OVERRIDE;
+
        virtual OSObject       *getProperty(const OSSymbol *aKey) const APPLE_KEXT_OVERRIDE;
        virtual bool           setProperty(const OSSymbol *aKey, OSObject *anObject) APPLE_KEXT_OVERRIDE;
        virtual bool           serializeProperties(OSSerialize *serialize) const APPLE_KEXT_OVERRIDE;
index a9f7e3b44a960b521f3aaa29ea3463247ccb2008..682625f4302bc35d48da5aaf215b1e4b3fcce366 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 class IOService;
 
+extern const OSSymbol * gIOModuleIdentifierKey;
+extern const OSSymbol * gIOModuleIdentifierKernelKey;
+
+
 /*!
  *   @class IOCatalogue
  *   @abstract In-kernel database for IOKit driver personalities.
@@ -54,7 +58,7 @@ class IOService;
  */
 class IOCatalogue : public OSObject
 {
-       OSDeclareDefaultStructors(IOCatalogue)
+       OSDeclareDefaultStructors(IOCatalogue);
 
 private:
        IORWLock *               lock;
@@ -125,36 +129,21 @@ public:
  */
        SInt32 getGenerationCount( void ) const;
 
-/*!
- *   @function isModuleLoaded
- *   @abstract Reports if a kernel module has been loaded.
- *   @param moduleName  Name of the module.
- *   @result Returns true if the associated kernel module has been loaded into the kernel.
- */
-       bool isModuleLoaded( OSString * moduleName ) const;
-
-/*!
- *   @function isModuleLoaded
- *   @abstract Reports if a kernel module has been loaded.
- *   @param moduleName  Name of the module.
- *   @result Returns true if the associated kernel module has been loaded into the kernel.
- */
-       bool isModuleLoaded( const char * moduleName ) const;
-
 /*!
  *   @function isModuleLoaded
  *   @abstract Reports if a kernel module has been loaded for a particular personality.
  *   @param driver  A driver personality's property list.
+ *   @param kextRef A reference to the kext getting loaded.
  *   @result Returns true if the associated kernel module has been loaded into the kernel for a particular driver personality on which it depends.
  */
-       bool isModuleLoaded( OSDictionary * driver ) const;
+       bool isModuleLoaded( OSDictionary * driver, OSObject ** kextRef ) const;
 
 /*!
  *   @function moduleHasLoaded
  *   @abstract Callback function called after a IOKit dependent kernel module is loaded.
  *   @param name  Name of the kernel module.
  */
-       void moduleHasLoaded( OSString * name );
+       void moduleHasLoaded( const OSSymbol * name );
 
 /*!
  *   @function moduleHasLoaded
@@ -188,10 +177,15 @@ public:
 
 /*!
  *   @function startMatching
- *   @abstract Starts an IOService matching thread where matching keys and values are provided by the matching dictionary.
- *   @param matching  A dictionary whose keys and values are used for matching personalities in the database.  For example, a matching dictionary containing a 'IOProviderClass' key with the value 'IOPCIDevice' will start matching for all personalities which have the key 'IOProviderClass' equal to 'IOPCIDevice'.
+ *   @abstract Restarts IOService matching.
+ *   @param identifier  All IOService objects with this bundle identifier are rematched.
  */
+       bool startMatching( const OSSymbol * identifier );
+
+       // deprecated, for bin compat
+#if defined(__i386__) || defined(__x86_64__)
        bool startMatching( OSDictionary * matching );
+#endif
 
 /*!
  *   @function reset
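
A hedged sketch of the new identifier-based rematch; the bundle identifier is illustrative.

    // Rematch every IOService published from one bundle:
    const OSSymbol * ident = OSSymbol::withCString("com.example.MyDriver");
    if (ident != NULL) {
        gIOCatalogue->startMatching(ident);
        ident->release();
    }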
index 9c3e6c06bd133a03ad1452fad1f329218a02d170..6cfd848c99dca887593c49facf9a813b4ab9cd1d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -68,7 +68,7 @@
 
 class IOCommand : public OSObject
 {
-       OSDeclareDefaultStructors(IOCommand)
+       OSDeclareDefaultStructors(IOCommand);
 
 public:
        virtual bool init(void) APPLE_KEXT_OVERRIDE;
index 2fa36e137f383596614f24ec92d1d135e33a6c6a..2a1d2f287816dd64c73ba8f0bd7dad247a553b1b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -59,7 +59,7 @@
  */
 class IOCommandGate : public IOEventSource
 {
-       OSDeclareDefaultStructors(IOCommandGate)
+       OSDeclareDefaultStructors(IOCommandGate);
 
 public:
 /*!
@@ -99,7 +99,7 @@ public:
 /*! @function commandGate
 *   @abstract Factory method to create and initialise an IOCommandGate. See $link init.
 *   @result Returns a pointer to the new command gate if successful, 0 otherwise. */
-       static IOCommandGate *commandGate(OSObject *owner, Action action = 0);
+       static IOCommandGate *commandGate(OSObject *owner, Action action = NULL);
 
 /*! @function init
  *   @abstract Class initialiser.
@@ -112,7 +112,7 @@ public:
 *  must cast the member function to $link IOCommandGate::Action, though this will produce a
  *  compiler warning.  Defaults to zero, see $link IOEventSource::setAction.
  *   @result True if inherited classes initialise successfully. */
-       virtual bool init(OSObject *owner, Action action = 0);
+       virtual bool init(OSObject *owner, Action action = NULL);
 
 // Superclass overrides
        virtual void free() APPLE_KEXT_OVERRIDE;
@@ -132,8 +132,8 @@ public:
  *   @param arg3 Parameter for action of command gate, defaults to 0.
  *   @result kIOReturnSuccess if successful. kIOReturnAborted if a disabled command gate is free()ed before being reenabled, kIOReturnNoResources if no action available.
  */
-       virtual IOReturn runCommand(void *arg0 = 0, void *arg1 = 0,
-           void *arg2 = 0, void *arg3 = 0);
+       virtual IOReturn runCommand(void *arg0 = NULL, void *arg1 = NULL,
+           void *arg2 = NULL, void *arg3 = NULL);
 
 /*! @function runAction
  *   @abstract Single thread a call to an action with the target work loop.
@@ -151,8 +151,8 @@ public:
  *   @result The return value of action if it was called, kIOReturnBadArgument if action is not defined, kIOReturnAborted if a disabled command gate is free()ed before being reenabled.
  */
        virtual IOReturn runAction(Action action,
-           void *arg0 = 0, void *arg1 = 0,
-           void *arg2 = 0, void *arg3 = 0);
+           void *arg0 = NULL, void *arg1 = NULL,
+           void *arg2 = NULL, void *arg3 = NULL);
 
 #ifdef __BLOCKS__
 /*! @function runActionBlock
@@ -179,8 +179,8 @@ public:
  *   @param arg3 Parameter for action of command gate, defaults to 0.
  *   @result kIOReturnSuccess if successful. kIOReturnNotPermitted if this event source is currently disabled, kIOReturnNoResources if no action available, kIOReturnCannotLock if lock attempt fails.
  */
-       virtual IOReturn attemptCommand(void *arg0 = 0, void *arg1 = 0,
-           void *arg2 = 0, void *arg3 = 0);
+       virtual IOReturn attemptCommand(void *arg0 = NULL, void *arg1 = NULL,
+           void *arg2 = NULL, void *arg3 = NULL);
 
 /*! @function attemptAction
  *   @abstract Single thread a call to an action with the target work loop.
@@ -197,8 +197,8 @@ public:
  *
  */
        virtual IOReturn attemptAction(Action action,
-           void *arg0 = 0, void *arg1 = 0,
-           void *arg2 = 0, void *arg3 = 0);
+           void *arg0 = NULL, void *arg1 = NULL,
+           void *arg2 = NULL, void *arg3 = NULL);
 
 /*! @function commandSleep
  *   @abstract Put a thread that is currently holding the command gate to sleep.
index 356c04acedc1646ad1339423db67821698736f41..ee30bb44ea7d634a6fd979f577a29074fe0254a5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -74,7 +74,7 @@
 
 class IOCommandPool : public OSObject
 {
-       OSDeclareDefaultStructors(IOCommandPool)
+       OSDeclareDefaultStructors(IOCommandPool);
 
 
 protected:
@@ -199,7 +199,8 @@ protected:
  * doesn't wish to block until one does become available.
  * kIOReturnSuccess if the vCommand argument is valid.
  */
-       virtual IOReturn gatedGetCommand(IOCommand **command, bool blockForCommand);
+       virtual IOReturn gatedGetCommand(
+               LIBKERN_RETURNS_NOT_RETAINED IOCommand **command, bool blockForCommand);
 
 /*!
  * @function gatedReturnCommand
index 2193062b2bbe77740d7903715c720e14d4adc0ad..5ad86ec5b933179a4813b8ed6148396cfd74186c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -48,7 +48,7 @@ typedef void (*IOCommandQueueAction)
 
 class IOCommandQueue : public IOEventSource
 {
-       OSDeclareDefaultStructors(IOCommandQueue)
+       OSDeclareDefaultStructors(IOCommandQueue);
 
 protected:
        static const int kIOCQDefaultSize = 128;
@@ -65,17 +65,17 @@ protected:
 
 public:
        static IOCommandQueue *commandQueue(OSObject *inOwner,
-           IOCommandQueueAction inAction = 0,
+           IOCommandQueueAction inAction = NULL,
            int inSize = kIOCQDefaultSize)
        APPLE_KEXT_DEPRECATED;
        virtual bool init(OSObject *inOwner,
-           IOCommandQueueAction inAction = 0,
+           IOCommandQueueAction inAction = NULL,
            int inSize = kIOCQDefaultSize)
        APPLE_KEXT_DEPRECATED;
 
        virtual kern_return_t enqueueCommand(bool gotoSleep = true,
-           void *field0 = 0, void *field1 = 0,
-           void *field2 = 0, void *field3 = 0)
+           void *field0 = NULL, void *field1 = NULL,
+           void *field2 = NULL, void *field3 = NULL)
        APPLE_KEXT_DEPRECATED;
 
 // WARNING:  This function can only be safely called from the appropriate
@@ -84,8 +84,8 @@ public:
 // For each entry in the commandQueue call the target/action.
 // Lockout all new entries to the queue while iterating.
 // If the input fields are zero then the queue's owner/action will be used.
-       virtual int performAndFlush(OSObject *target = 0,
-           IOCommandQueueAction inAction = 0)
+       virtual int performAndFlush(OSObject *target = NULL,
+           IOCommandQueueAction inAction = NULL)
        APPLE_KEXT_DEPRECATED;
 };
 
index 408a78515e493fc71d12ea5abe1b79f2841008b8..52dd502d037690fe9a9d406c9d8c3f09570779b6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -40,7 +40,7 @@
 
 class IOConditionLock : public OSObject
 {
-       OSDeclareDefaultStructors(IOConditionLock)
+       OSDeclareDefaultStructors(IOConditionLock);
 
 private:
        IOLock *            cond_interlock;     // condition var Simple lock
index 5628986e2438f47567cccaaf25ef5e3ea04e9744..adaaf4cd0e70627470b85e2ebeeab830e0018a7e 100644 (file)
@@ -221,8 +221,8 @@ public:
            MappingOptions   mappingOptions = kMapped,
            UInt64           maxTransferSize = 0,
            UInt32           alignment = 1,
-           IOMapper        *mapper = 0,
-           void            *refCon = 0);
+           IOMapper        *mapper = NULL,
+           void            *refCon = NULL);
 
 /*! @function weakWithSpecification
  *   @abstract Creates and initialises an IODMACommand in one operation if this version of the operating system supports it.
@@ -251,8 +251,8 @@ public:
            MappingOptions  mapType = kMapped,
            UInt64          maxTransferSize = 0,
            UInt32          alignment = 1,
-           IOMapper       *mapper = 0,
-           void           *refCon = 0) __attribute__((always_inline));
+           IOMapper       *mapper = NULL,
+           void           *refCon = NULL) __attribute__((always_inline));
 
        static IODMACommand *
        withSpecification(SegmentFunction        outSegFunc,
@@ -276,7 +276,7 @@ public:
 *   @discussion Factory function to create and initialise an IODMACommand in one operation.  The current command's specification will be duplicated in the new object; however, none of its state will be duplicated.  This means that it is safe to clone a command even if it is currently active and running, but you must be certain that the command to be duplicated has a valid reference for the duration.
  *   @result Returns a new IODMACommand if successfully created and initialised, 0 otherwise.
  */
-       virtual IODMACommand *cloneCommand(void *refCon = 0);
+       virtual IODMACommand *cloneCommand(void *refCon = NULL);
 
 /*! @function initWithSpecification
  *   @abstract Primary initializer for the IODMACommand class.
@@ -296,8 +296,8 @@ public:
            MappingOptions mappingOptions = kMapped,
            UInt64    maxTransferSize = 0,
            UInt32    alignment = 1,
-           IOMapper *mapper = 0,
-           void     *refCon = 0);
+           IOMapper *mapper = NULL,
+           void     *refCon = NULL);
 
 /*! @function setMemoryDescriptor
  *   @abstract Sets and resets the DMACommand's current memory descriptor
@@ -481,7 +481,7 @@ public:
            MappingOptions    mappingOptions = kMapped,
            UInt64            maxTransferSize = 0,
            UInt32            alignment = 1,
-           IOMapper          *mapper = 0,
+           IOMapper          *mapper = NULL,
            UInt64            offset = 0,
            UInt64            length = 0,
            bool              flushCache = true,
@@ -515,7 +515,7 @@ public:
  */
 
        virtual
-       bool initWithRefCon(void * refCon = 0);
+       bool initWithRefCon(void * refCon = NULL);
 
        virtual
        bool initWithSpecification(SegmentFunction        outSegFunc,
@@ -638,7 +638,7 @@ weakWithSpecification(IODMACommand **newCommand,
                ret =  kIOReturnSuccess;
        } else {
                self->release();
-               self = 0;
+               self = NULL;
                ret = kIOReturnError;
        }
 
index 88ffeed97da5f4d9c9bfb5587dfb7fd9caca0a72..5b26e08b70baf30509a5dc524af4c5ecc54175a8 100644 (file)
@@ -53,8 +53,8 @@ protected:
 public:
        static IODMAEventSource *dmaEventSource(OSObject *owner,
            IOService *provider,
-           Action completion = 0,
-           Action notification = 0,
+           Action completion = NULL,
+           Action notification = NULL,
            UInt32 dmaIndex = 0);
 
        virtual IOReturn startDMACommand(IODMACommand *dmaCommand, IODirection direction, IOByteCount byteCount = 0, IOByteCount byteOffset = 0);
@@ -83,8 +83,8 @@ private:
 
        virtual bool init(OSObject *owner,
            IOService *provider,
-           Action completion = 0,
-           Action notification = 0,
+           Action completion = NULL,
+           Action notification = NULL,
            UInt32 dmaIndex = 0);
        virtual bool checkForWork(void) APPLE_KEXT_OVERRIDE;
        virtual void free(void) APPLE_KEXT_OVERRIDE;
index c7de3c5febf3ea46c38cee4fdaaaa8b111a468b2..c16d03fa2f089ba282312c9f1ea1805b6ddf5ce7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -46,6 +46,14 @@ struct _notifyMsg {
        mach_msg_header_t h;
 };
 
+
+#ifdef dequeue
+#undef dequeue
+#endif
+#ifdef enqueue
+#undef enqueue
+#endif
+
 /*!
  * @class IODataQueue : public OSObject
  * @abstract A generic queue designed to pass data from the kernel to a user process.
@@ -65,7 +73,7 @@ class __attribute__((deprecated)) IODataQueue: public OSObject
 class IODataQueue : public OSObject
 #endif
 {
-       OSDeclareDefaultStructors(IODataQueue)
+       OSDeclareDefaultStructors(IODataQueue);
 
 protected:
        IODataQueueMemory * dataQueue;
index 49d0324b87d0f8b1f334d5aad358cc3fdcb09a8d..dadc043b03dea684469d04e9d79a0ef5fba2fe35 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -44,7 +44,7 @@
 
 class IODeviceMemory : public IOMemoryDescriptor
 {
-       OSDeclareDefaultStructors(IODeviceMemory)
+       OSDeclareDefaultStructors(IODeviceMemory);
 
 public:
 
index 24c79221a6d8ea48f22fc32b2edda43dfefd03b7..51d146aab18c4c6b5e7762295281e77aedeba5f1 100644 (file)
@@ -61,7 +61,8 @@ bool IODTMatchNubWithKeys( IORegistryEntry * nub,
     const char * keys );
 
 bool IODTCompareNubName( const IORegistryEntry * regEntry,
-    OSString * name, OSString ** matchingName );
+    OSString * name,
+    LIBKERN_RETURNS_RETAINED_ON_NONZERO OSString ** matchingName );
 
 enum {
        kIODTRecursive      = 0x00000001,
index 4b48cb4754cbd7546833874de44a7efdb170d200..db000cf38e671c448dfc64c2429528adfe0d5112 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000, 2009 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -83,7 +83,7 @@ __END_DECLS
  */
 class IOEventSource : public OSObject
 {
-       OSDeclareAbstractStructors(IOEventSource)
+       OSDeclareAbstractStructors(IOEventSource);
        friend class IOWorkLoop;
 #if IOKITSTATS
        friend class IOStatistics;
@@ -181,7 +181,7 @@ protected:
  *   @result true if the inherited classes and this instance initialise
  *  successfully.
  */
-       virtual bool init(OSObject *owner, IOEventSource::Action action = 0);
+       virtual bool init(OSObject *owner, IOEventSource::Action action = NULL);
 
        virtual void free( void ) APPLE_KEXT_OVERRIDE;
 
index db887e746b5cf2421a77f213db6cd4663fe01593..263f8ac4418c36a7b1a6d3a9db64c4800503fac9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -51,7 +51,7 @@ class IOService;
  */
 class IOFilterInterruptEventSource : public IOInterruptEventSource
 {
-       OSDeclareDefaultStructors(IOFilterInterruptEventSource)
+       OSDeclareDefaultStructors(IOFilterInterruptEventSource);
 
 public:
 /*!
@@ -73,14 +73,14 @@ public:
 private:
 // Hide the superclass initializers
        virtual bool init(OSObject *inOwner,
-           IOInterruptEventSource::Action inAction = 0,
-           IOService *inProvider = 0,
+           IOInterruptEventSource::Action inAction = NULL,
+           IOService *inProvider = NULL,
            int inIntIndex = 0) APPLE_KEXT_OVERRIDE;
 
        static IOInterruptEventSource *
        interruptEventSource(OSObject *inOwner,
-           IOInterruptEventSource::Action inAction = 0,
-           IOService *inProvider = 0,
+           IOInterruptEventSource::Action inAction = NULL,
+           IOService *inProvider = NULL,
            int inIntIndex = 0);
 
 protected:
index b1dd7e369229b8f0c9d7cb4b7a6692ab5ebfdfd8..ca42aa7f29de47f68cb72b4fd99a2241c2c5acff 100644 (file)
@@ -134,13 +134,13 @@ static const char * const kInterruptAccountingStatisticNameArray[IA_NUM_INTERRUP
  * two processors at once (and the interrupt should serve to force out stores), and the second level
 * handler should be synchronized by the work loop it runs on.
  */
-#if __x86_64__ || __arm64
+#if __x86_64__ || __arm64__
 #define IA_ADD_VALUE(target, value) \
     (*(target) += (value))
-#else
+#else /* !(__x86_64__ || __arm64__) */
 #define IA_ADD_VALUE(target, value) \
     (OSAddAtomic64((value), (target)))
-#endif
+#endif /* !(__x86_64__ || __arm64__) */
 
 /*
  * TODO: Should this be an OSObject?  Or properly pull in its methods as member functions?
@@ -160,6 +160,9 @@ struct IOInterruptAccountingData {
         */
        int interruptIndex;
 
+       bool enablePrimaryTimestamp;
+       volatile uint64_t primaryTimestamp __attribute__((aligned(8)));
+
        /*
         * As long as we are based on the simple reporter, all our channels will be 64 bits.  Align the data
         * to allow for safe atomic updates (we don't want to cross a cache line on any platform, but for some
index 40e5bc1dc839ea4d26e279771df5c83f2f7e8427..6313a87b4f5437164495002fc8397139433edecf 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -60,7 +60,7 @@ struct IOInterruptAccountingData;
  */
 class IOInterruptEventSource : public IOEventSource
 {
-       OSDeclareDefaultStructors(IOInterruptEventSource)
+       OSDeclareDefaultStructors(IOInterruptEventSource);
 
 public:
 /*! @typedef Action
@@ -138,7 +138,7 @@ public:
        static IOInterruptEventSource *
        interruptEventSource(OSObject *owner,
            Action action,
-           IOService *provider = 0,
+           IOService *provider = NULL,
            int intIndex = 0);
 
 
@@ -171,7 +171,7 @@ public:
  *  successfully.  */
        virtual bool init(OSObject *owner,
            Action action,
-           IOService *provider = 0,
+           IOService *provider = NULL,
            int intIndex = 0);
 
 /*! @function enable
@@ -231,6 +231,20 @@ public:
  *   @param abstime Time at which interrupt is expected. */
        IOReturn warmCPU(uint64_t abstime);
 
+/*! @function enablePrimaryInterruptTimestamp
+ *   @abstract Enables collection of mach_absolute_time at primary interrupt.
+ *   @discussion Enables collection of mach_absolute_time at primary interrupt.
+ *   @param enable True to enable timestamp. */
+
+       void enablePrimaryInterruptTimestamp(bool enable);
+
+/*! @function getPimaryInterruptTimestamp
+ *   @abstract Returns mach_absolute_time timestamp of primary interrupt.
+ *   @discussion Returns mach_absolute_time timestamp of primary interrupt.
+ *   @result Value of the timestamp. Zero if never interrupted, or -1ULL if timestamp collection has not been enabled. */
+
+       uint64_t getPimaryInterruptTimestamp();
+
 private:
        IOReturn registerInterruptHandler(IOService *inProvider, int inIntIndex);
        void unregisterInterruptHandler(IOService *inProvider, int inIntIndex);
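
A hedged sketch pairing the two new timestamp calls; MyDriver and its handler are hypothetical (the getPimaryInterruptTimestamp spelling follows the header above).

    // Second-level handler measuring primary-to-handler latency.
    void
    MyDriver::interruptOccurred(IOInterruptEventSource * src, int count)
    {
        uint64_t primary = src->getPimaryInterruptTimestamp();
        if (primary != 0 && primary != ~0ULL) {
            uint64_t latency = mach_absolute_time() - primary;
            // ... accumulate or report latency (mach_absolute_time units)
        }
    }

    // Enabled once at setup, e.g. in start():
    //     src->enablePrimaryInterruptTimestamp(true);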
index 6f114a34703dd02419f1a84ba29576b7a3ea1c5e..50a811610c49dede138ec5cd032f435297401915 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -40,7 +40,7 @@
 
 class IOKitDiagnostics : public OSObject
 {
-       OSDeclareDefaultStructors(IOKitDiagnostics)
+       OSDeclareDefaultStructors(IOKitDiagnostics);
 
 public:
        static OSObject * diagnostics( void );
@@ -128,8 +128,31 @@ enum {
        kIOTraceCompatBootArgs  =               kIOTraceIOService | kIOTracePowerMgmt
 };
 
+enum {
+       kIODKEnable                     = 0x00000001ULL,
+       kIODKLogSetup                   = 0x00000002ULL,
+       kIODKLogIPC                     = 0x00000004ULL,
+       kIODKLogPM                      = 0x00000008ULL,
+       kIODKLogMessages                = 0x00000010ULL,
+
+       kIODKDisablePM                  = 0x00000100ULL,
+       kIODKDisableDextLaunch          = 0x00001000ULL,
+       kIODKDisableDextTag             = 0x00002000ULL,
+       kIODKDisableCDHashChecking      = 0x00004000ULL,
+       kIODKDisableEntitlementChecking = 0x00008000ULL,
+};
+
+#if XNU_KERNEL_PRIVATE
+
+#define DKLOG(fmt, args...) { IOLog("DK: " fmt, ## args); }
+#define DKS                "%s-0x%qx"
+#define DKN(s)              s->getName(), s->getRegistryEntryID()
+
+#endif /* XNU_KERNEL_PRIVATE */
+
 extern SInt64    gIOKitDebug;
 extern SInt64    gIOKitTrace;
+extern SInt64    gIODKDebug;
 
 #ifdef __cplusplus
 extern "C" {
index 12976053b8bfad453504b3977fe977b341e10302..f78c45cc1c6a7bc3f3d8ab6aee46e460a3d095eb 100644 (file)
@@ -1,10 +1,12 @@
+/* Copyright (c) 2019 Apple Inc. All rights reserved. */
+
 #include <IOKit/IOService.h>
 #include <IOKit/IOUserClient.h>
 
 
 class IOKitDiagnosticsClient : public IOUserClient
 {
-       OSDeclareDefaultStructors(IOKitDiagnosticsClient)
+       OSDeclareDefaultStructors(IOKitDiagnosticsClient);
 
 public:
        static  IOUserClient * withTask(task_t owningTask);
index 84e5a0afd459f2c3c0c9f04bb08c673bae7a382b..e8db76ced32d47bea16e33487a0c395d9a263d3f 100644 (file)
 #define kIOMatchCategoryKey             "IOMatchCategory"
 #define kIODefaultMatchCategoryKey      "IODefaultMatchCategory"
 
+#define kIOMatchedPersonalityKey        "IOMatchedPersonality"
+#define kIORematchPersonalityKey        "IORematchPersonality"
+#define kIORematchCountKey              "IORematchCount"
+#define kIODEXTMatchCountKey            "IODEXTMatchCount"
+
+// Entitlements to check against a dext process.
+// The property is an array of elements, one or more of which may match.
+// Each element is an array of entitlement strings, all of which must be present.
+// Either array can be collapsed to a single string.
+#define kIOServiceDEXTEntitlementsKey   "IOServiceDEXTEntitlements"
+
+// Entitlement required to open dext connection
+#define kIODriverKitEntitlementKey      "com.apple.developer.driverkit"
+
+// Entitlements required to open a dext IOUserClient
+// Property is an array of strings containing CFBundleIdentifiers of the service being opened
+#define kIODriverKitUserClientEntitlementsKey "com.apple.developer.driverkit.userclient-access"
+
+// Other DriverKit entitlements
+#define kIODriverKitUSBTransportEntitlementKey "com.apple.developer.driverkit.transport.usb"
+#define kIODriverKitHIDTransportEntitlementKey "com.apple.developer.driverkit.transport.hid"
+#define kIODriverKitHIDFamilyDeviceEntitlementKey "com.apple.developer.driverkit.family.hid.device"
+#define kIODriverKitHIDFamilyEventServiceEntitlementKey "com.apple.developer.driverkit.family.hid.eventservice"
+#define kIODriverKitTransportBuiltinEntitlementKey "com.apple.developer.driverkit.builtin"
+
+
+// When possible, defer matching of this driver until kextd has started.
+#define kIOMatchDeferKey                                "IOMatchDefer"
+
 // IOService default user client class, for loadable user clients
 #define kIOUserClientClassKey           "IOUserClientClass"
 
 #define kIOUserClientCrossEndianKey             "IOUserClientCrossEndian"
 #define kIOUserClientCrossEndianCompatibleKey   "IOUserClientCrossEndianCompatible"
 #define kIOUserClientSharedInstanceKey          "IOUserClientSharedInstance"
+#if KERNEL_PRIVATE
+#define kIOUserClientMessageAppSuspendedKey     "IOUserClientMessageAppSuspended"
+#endif
 // diagnostic string describing the creating task
 #define kIOUserClientCreatorKey         "IOUserClientCreator"
+// the expected cdhash value of the userspace driver executable
+#define kIOUserServerCDHashKey          "IOUserServerCDHash"
+
+#define kIOUserUserClientKey                    "IOUserUserClient"
+
 
 // IOService notification types
 #define kIOPublishNotification          "IOServicePublish"
 #define kIOPlatformUUIDKey      "IOPlatformUUID"        // (OSString)
 
 // IODTNVRAM property keys
+#define kIONVRAMBootArgsKey             "boot-args"
 #define kIONVRAMDeletePropertyKey       "IONVRAM-DELETE-PROPERTY"
 #define kIONVRAMSyncNowPropertyKey      "IONVRAM-SYNCNOW-PROPERTY"
 #define kIONVRAMActivateCSRConfigPropertyKey    "IONVRAM-ARMCSR-PROPERTY"
index 099970fbc73a05ab03345d52fed026edc425e529..f7cc9eae6094bc416bc52155234b1c9b4373a239 100644 (file)
@@ -71,8 +71,8 @@ enum {
        kIOCatalogAddDriversNoMatch,
        kIOCatalogRemoveDrivers,
        kIOCatalogRemoveDriversNoMatch,
-       kIOCatalogStartMatching,
-       kIOCatalogRemoveKernelLinker,
+       kIOCatalogStartMatching__Removed,
+       kIOCatalogRemoveKernelLinker__Removed,
        kIOCatalogKextdActive,
        kIOCatalogKextdFinishedLaunching,
        kIOCatalogResetDrivers,
@@ -154,11 +154,17 @@ extern kern_return_t iokit_destroy_object_port( ipc_port_t port );
 extern mach_port_name_t iokit_make_send_right( task_t task,
     io_object_t obj, ipc_kobject_type_t type );
 
+extern mach_port_t ipc_port_make_send(mach_port_t);
+extern void ipc_port_release_send(ipc_port_t port);
+
+extern io_object_t iokit_lookup_io_object(ipc_port_t port, ipc_kobject_type_t type);
+
 extern kern_return_t iokit_mod_send_right( task_t task, mach_port_name_t name, mach_port_delta_t delta );
 
 extern io_object_t iokit_lookup_object_with_port_name(mach_port_name_t name, ipc_kobject_type_t type, task_t task);
 
 extern io_object_t iokit_lookup_connect_ref_current_task(mach_port_name_t name);
+extern io_object_t iokit_lookup_uext_ref_current_task(mach_port_name_t name);
 
 extern void iokit_retain_port( ipc_port_t port );
 extern void iokit_release_port( ipc_port_t port );
@@ -169,6 +175,17 @@ extern void iokit_unlock_port(ipc_port_t port);
 
 extern kern_return_t iokit_switch_object_port( ipc_port_t port, io_object_t obj, ipc_kobject_type_t type );
 
+#ifndef MACH_KERNEL_PRIVATE
+typedef struct ipc_kmsg * ipc_kmsg_t;
+extern ipc_kmsg_t ipc_kmsg_alloc(size_t);
+extern void ipc_kmsg_destroy(ipc_kmsg_t);
+extern mach_msg_header_t * ipc_kmsg_msg_header(ipc_kmsg_t);
+#endif /* MACH_KERNEL_PRIVATE */
+
+extern kern_return_t
+uext_server(ipc_kmsg_t request, ipc_kmsg_t * preply);
+
+
 /*
  * Functions imported by iokit:IOMemoryDescriptor.cpp
  */
index 989dc1de11ee411f8854abd91f6c2cc52cf394ec..df503612372f9746ebde285f2f5118d43bf76cd2 100644 (file)
@@ -41,6 +41,7 @@
 
 #include <stdarg.h>
 #include <sys/cdefs.h>
+#include <os/overflow.h>
 
 #include <sys/appleapiopts.h>
 
@@ -80,7 +81,8 @@ typedef void (*IOThreadFunc)(void *argument);
  *   @param size Size of the memory requested.
  *   @result Pointer to the allocated memory, or zero on failure. */
 
-void * IOMalloc(vm_size_t size)  __attribute__((alloc_size(1)));
+void * IOMalloc(vm_size_t size)      __attribute__((alloc_size(1)));
+void * IOMallocZero(vm_size_t size)  __attribute__((alloc_size(1)));
 
 /*! @function IOFree
  *   @abstract Frees memory allocated with IOMalloc.
@@ -147,14 +149,40 @@ void * IOMallocPageable(vm_size_t size, vm_size_t alignment) __attribute__((allo
 void IOFreePageable(void * address, vm_size_t size);
 
 /*
- * Typed memory allocation macros. Both may block.
+ * Typed memory allocation macros. All may block.
  */
-#define IONew(type, number) \
-( ((number) != 0 && ((vm_size_t) ((sizeof(type) * (number) / (number))) != sizeof(type)) /* overflow check 20847256 */ \
-  ? 0 \
-  : ((type*)IOMalloc(sizeof(type) * (number)))) )
 
-#define IODelete(ptr, type, number) IOFree( (ptr) , sizeof(type) * (number) )
+#define IONew(type, count)                              \
+({                                                      \
+    size_t __size;                                      \
+    (os_mul_overflow(sizeof(type), (count), &__size)    \
+    ? ((type *) NULL)                                   \
+    : ((type *) IOMalloc(__size)));                     \
+})
+
+#define IONewZero(type, count)                          \
+({                                                      \
+    size_t __size;                                      \
+    (os_mul_overflow(sizeof(type), (count), &__size)    \
+    ? ((type *) NULL)                                   \
+    : ((type *) IOMallocZero(__size)));                 \
+})
+
+#define IODelete(ptr, type, count)                          \
+({                                                          \
+    size_t __size;                                          \
+    if (!os_mul_overflow(sizeof(type), (count), &__size)) { \
+       IOFree(ptr, __size);                                \
+    }                                                       \
+})
+
+#define IOSafeDeleteNULL(ptr, type, count)              \
+    do {                                                \
+       if (NULL != (ptr)) {                            \
+           IODelete((ptr), type, count);               \
+           (ptr) = NULL;                               \
+       }                                               \
+    } while (0)
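
A usage sketch for the overflow-checked helpers above; Record and count are illustrative.

    struct Record { uint64_t key; uint64_t value; };

    Record * table = IONewZero(Record, count);   // NULL on overflow or failure
    if (table != NULL) {
        table[0].key = 1;
        IOSafeDeleteNULL(table, Record, count);  // frees and clears the pointer
    }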
 
 /////////////////////////////////////////////////////////////////////////////
 //
@@ -344,7 +372,7 @@ void Debugger(const char * reason);
 #if __LP64__
 #define IOPanic(reason) panic("%s", reason)
 #else
-void IOPanic(const char *reason) __attribute__((deprecated));
+void IOPanic(const char *reason) __attribute__((deprecated)) __abortlike;
 #endif
 
 #ifdef __cplusplus
index 2c89a177aec40c15939b24316dc35e30c949fef4..88242c940413a5f3cf7a2abe109e0c5987e49885 100644 (file)
@@ -401,11 +401,18 @@ lck_spin_t * IOSimpleLockGetMachLock( IOSimpleLock * lock);
 
 /*! @function IOSimpleLockInit
  *   @abstract Initialize a spin lock.
- *   @discussion Initialize an embedded spin lock, to the unlocked state.
+ *   @discussion Initialize a non-heap-allocated spin lock to the unlocked state. Use this function when your lock is, for example, a member variable. You will need to call IOSimpleLockDestroy() when you are finished with the lock to avoid lock-group refcount leaks.
  *   @param lock Pointer to the lock. */
 
 void IOSimpleLockInit( IOSimpleLock * lock );
 
+/*! @function IOSimpleLockDestroy
+ *   @abstract De-initializes (destroys) a spin lock initialized with IOSimpleLockInit.
+ *   @discussion De-initialize a non-heap-allocated spin lock, releasing any system resources such as lock-group refcounts.
+ *   @param lock Pointer to the lock. */
+
+void IOSimpleLockDestroy( IOSimpleLock * lock );
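
A sketch of the embedded-lock lifecycle these two calls bracket; MyDriverState is hypothetical.

    struct MyDriverState {
        IOSimpleLock lock;      // embedded, not from IOSimpleLockAlloc()
        uint32_t     count;
    };

    void
    myStateInit(MyDriverState * s)
    {
        IOSimpleLockInit(&s->lock);
    }

    void
    myStateFree(MyDriverState * s)
    {
        IOSimpleLockDestroy(&s->lock);   // drop lock-group refcounts
    }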
+
 /*! @function IOSimpleLockLock
  *   @abstract Lock a spin lock.
  *  @discussion Lock the spin lock. If the lock is held, spin waiting for its unlock. Spin locks disable preemption, cannot be held across any blocking operation, and should be held for very short periods. When used to synchronize between interrupt context and thread context they should be locked with interrupts disabled - IOSimpleLockLockDisableInterrupt() will do both. Locking the lock recursively from one thread will result in deadlock.
index 99f9dc8140146bfec28e3187403fbd9e4b1523eb..3b81e6d3de7d00c7b468d68ea78936ae0061a533 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -54,7 +54,7 @@ class IOMemoryDescriptor;
  */
 class IOMemoryCursor : public OSObject
 {
-       OSDeclareDefaultStructors(IOMemoryCursor)
+       OSDeclareDefaultStructors(IOMemoryCursor);
 
 public:
 /*!
@@ -148,7 +148,7 @@ public:
                void *              segments,
                UInt32              maxSegments,
                UInt32              maxTransferSize = 0,
-               IOByteCount        *transferSize = 0);
+               IOByteCount        *transferSize = NULL);
 };
 
 /************************ class IONaturalMemoryCursor ************************/
@@ -161,7 +161,7 @@ public:
  */
 class IONaturalMemoryCursor : public IOMemoryCursor
 {
-       OSDeclareDefaultStructors(IONaturalMemoryCursor)
+       OSDeclareDefaultStructors(IONaturalMemoryCursor);
 
 public:
 /*! @function outputSegment
@@ -221,7 +221,7 @@ public:
            PhysicalSegment    *segments,
            UInt32              maxSegments,
            UInt32              inMaxTransferSize = 0,
-           IOByteCount        *transferSize = 0)
+           IOByteCount        *transferSize = NULL)
        {
                return genPhysicalSegments(descriptor, fromPosition, segments,
                           maxSegments, inMaxTransferSize, transferSize);
@@ -237,7 +237,7 @@ public:
  */
 class IOBigMemoryCursor : public IOMemoryCursor
 {
-       OSDeclareDefaultStructors(IOBigMemoryCursor)
+       OSDeclareDefaultStructors(IOBigMemoryCursor);
 
 public:
 /*! @function outputSegment
@@ -298,7 +298,7 @@ public:
            PhysicalSegment *    segments,
            UInt32               maxSegments,
            UInt32               inMaxTransferSize = 0,
-           IOByteCount       *  transferSize = 0)
+           IOByteCount       *  transferSize = NULL)
        {
                return genPhysicalSegments(descriptor, fromPosition, segments,
                           maxSegments, inMaxTransferSize, transferSize);
@@ -314,7 +314,7 @@ public:
  */
 class IOLittleMemoryCursor : public IOMemoryCursor
 {
-       OSDeclareDefaultStructors(IOLittleMemoryCursor)
+       OSDeclareDefaultStructors(IOLittleMemoryCursor);
 
 public:
 /*! @function outputSegment
@@ -373,7 +373,7 @@ public:
            PhysicalSegment *    segments,
            UInt32               maxSegments,
            UInt32               inMaxTransferSize = 0,
-           IOByteCount       *  transferSize = 0)
+           IOByteCount       *  transferSize = NULL)
        {
                return genPhysicalSegments(descriptor, fromPosition, segments,
                           maxSegments, inMaxTransferSize, transferSize);
index b9deeaa4ad656435887ba3f1778782546b3f02bb..0c19f496484f0343fc823b4411ca5d9056752d76 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -33,6 +33,8 @@
 #include <IOKit/IOTypes.h>
 #include <IOKit/IOLocks.h>
 #include <libkern/c++/OSContainers.h>
+#include <DriverKit/IOMemoryDescriptor.h>
+#include <DriverKit/IOMemoryMap.h>
 #ifdef XNU_KERNEL_PRIVATE
 #include <IOKit/IOKitDebug.h>
 #endif
@@ -123,7 +125,18 @@ enum {
 #endif
 };
 
-#define kIOMapperSystem ((IOMapper *) 0)
+#define kIOMapperSystem ((IOMapper *) NULL)
+
+enum{
+       kIOMemoryLedgerTagDefault       = VM_LEDGER_TAG_DEFAULT,
+       kIOmemoryLedgerTagNetwork       = VM_LEDGER_TAG_NETWORK,
+       kIOMemoryLedgerTagMedia         = VM_LEDGER_TAG_MEDIA,
+       kIOMemoryLedgerTagGraphics      = VM_LEDGER_TAG_GRAPHICS,
+       kIOMemoryLedgerTagNeural        = VM_LEDGER_TAG_NEURAL,
+};
+enum{
+       kIOMemoryLedgerFlagNoFootprint  = VM_LEDGER_FLAG_NO_FOOTPRINT,
+};
 
 enum{
        kIOMemoryPurgeableKeepCurrent = 1,
@@ -257,7 +270,7 @@ class IOMemoryDescriptor : public OSObject
        friend class IOMemoryMap;
        friend class IOMultiMemoryDescriptor;
 
-       OSDeclareDefaultStructors(IOMemoryDescriptor);
+       OSDeclareDefaultStructorsWithDispatch(IOMemoryDescriptor);
 
 protected:
 
@@ -334,6 +347,17 @@ public:
        virtual IOReturn setPurgeable( IOOptionBits newState,
            IOOptionBits * oldState );
 
+/*! @function setOwnership
+ *   @abstract Control the ownership of a memory descriptor's memory.
+ *   @discussion An IOBufferMemoryDescriptor is owned by a specific task. The ownership of such a buffer may be controlled with setOwnership().
+ *   @param newOwner The task to be the new owner of the memory.
+ *   @param newLedgerTag The ledger this memory should be accounted in.
+ *   @param newLedgerOptions Accounting options.
+ *   @result An IOReturn code. */
+
+       IOReturn setOwnership( task_t newOwner,
+           int newLedgerTag,
+           IOOptionBits newLedgerOptions );
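
A hedged sketch of ledger reassignment using the ledger tag and flag enums added earlier in this header; buffer and ownerTask are assumed to exist.

    IOReturn ret = buffer->setOwnership(
        ownerTask,                        // task charged for the memory
        kIOMemoryLedgerTagGraphics,       // ledger tag from the enum above
        kIOMemoryLedgerFlagNoFootprint);  // accounting option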
 
 /*! @function getPageCounts
  *   @abstract Retrieve the number of resident and/or dirty pages encompassed by an IOMemoryDescriptor.
@@ -381,8 +405,12 @@ public:
        virtual uint64_t getPreparationID( void );
        void             setPreparationID( void );
 
+       void     setVMTags(uint32_t kernelTag, uint32_t userTag);
+       uint32_t getVMTag(vm_map_t map);
+
 #ifdef XNU_KERNEL_PRIVATE
        IOMemoryDescriptorReserved * getKernelReserved( void );
+       void                         cleanKernelReserved(IOMemoryDescriptorReserved * reserved);
        IOReturn dmaMap(
                IOMapper                    * mapper,
                IODMACommand                * command,
@@ -401,9 +429,6 @@ public:
                IOMapper                    * mapper,
                IODMACommand                * command,
                uint64_t                      mapLength);
-
-       void     setVMTags(vm_tag_t kernelTag, vm_tag_t userTag);
-       vm_tag_t getVMTag(vm_map_t map);
 #endif
 
 private:
@@ -435,7 +460,7 @@ private:
        OSMetaClassDeclareReservedUnused(IOMemoryDescriptor, 15);
 
 protected:
-       virtual void free() APPLE_KEXT_OVERRIDE;
+       virtual void free(void) APPLE_KEXT_OVERRIDE;
 public:
        static void initialize( void );
 
@@ -796,7 +821,7 @@ protected:
 
 class IOMemoryMap : public OSObject
 {
-       OSDeclareDefaultStructors(IOMemoryMap)
+       OSDeclareDefaultStructorsWithDispatch(IOMemoryMap);
 #ifdef XNU_KERNEL_PRIVATE
 public:
        IOMemoryDescriptor * fMemory;
@@ -817,8 +842,8 @@ public:
 #endif /* XNU_KERNEL_PRIVATE */
 
 protected:
-       virtual void taggedRelease(const void *tag = 0) const APPLE_KEXT_OVERRIDE;
-       virtual void free() APPLE_KEXT_OVERRIDE;
+       virtual void taggedRelease(const void *tag = NULL) const APPLE_KEXT_OVERRIDE;
+       virtual void free(void) APPLE_KEXT_OVERRIDE;
 
 public:
 /*! @function getVirtualAddress
@@ -826,7 +851,7 @@ public:
 *   @discussion This method returns the virtual address of the first byte in the mapping. Since IOVirtualAddress is only 32 bits wide in 32-bit kernels, the getAddress() method should be used for compatibility with 64-bit task mappings.
  *   @result A virtual address. */
 
-       virtual IOVirtualAddress    getVirtualAddress();
+       virtual IOVirtualAddress    getVirtualAddress(void);
 
 /*! @function getPhysicalSegment
  *   @abstract Break a mapping into its physically contiguous segments.
@@ -849,14 +874,14 @@ public:
 *   @discussion This method returns the physical address of the first byte in the mapping. It is most useful on mappings known to be physically contiguous.
  *   @result A physical address. */
 
-       IOPhysicalAddress getPhysicalAddress();
+       IOPhysicalAddress getPhysicalAddress(void);
 
 /*! @function getLength
  *   @abstract Accessor to the length of the mapping.
  *   @discussion This method returns the length of the mapping.
  *   @result A byte count. */
 
-       virtual IOByteCount         getLength();
+       virtual IOByteCount         getLength(void);
 
 /*! @function getAddressTask
  *   @abstract Accessor to the task of the mapping.
@@ -1055,6 +1080,11 @@ public:
                IOMemoryReference * ref,
                IOOptionBits newState,
                IOOptionBits * oldState);
+       static IOReturn memoryReferenceSetOwnership(
+               IOMemoryReference * ref,
+               task_t newOwner,
+               int newLedgerTag,
+               IOOptionBits newLedgerOptions);
        static IOReturn memoryReferenceGetPageCounts(
                IOMemoryReference * ref,
                IOByteCount       * residentPageCount,
@@ -1134,6 +1164,10 @@ public:
        virtual IOReturn setPurgeable( IOOptionBits newState,
            IOOptionBits * oldState ) APPLE_KEXT_OVERRIDE;
 
+       IOReturn setOwnership( task_t newOwner,
+           int newLedgerTag,
+           IOOptionBits newLedgerOptions );
+
        virtual addr64_t getPhysicalSegment( IOByteCount   offset,
            IOByteCount * length,
 #ifdef __LP64__
@@ -1185,4 +1219,6 @@ IOMemoryMap::getSize()
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
+extern boolean_t iokit_iomd_setownership_enabled;
+
 #endif /* !_IOMEMORYDESCRIPTOR_H */
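
A hedged usage sketch of the new ledger accounting surface: ChargeBufferToTask, the buffer size, and the choice of tag are illustrative assumptions, not part of the commit.

#include <IOKit/IOBufferMemoryDescriptor.h>

static void
ChargeBufferToTask(task_t clientTask)
{
    // Hypothetical sketch: charge a shared buffer to clientTask's media
    // ledger without adding it to that task's memory footprint.
    IOBufferMemoryDescriptor * buf = IOBufferMemoryDescriptor::inTaskWithOptions(
        kernel_task, kIODirectionInOut | kIOMemoryKernelUserShared, 4096);
    if (!buf) {
        return;
    }
    IOReturn ret = buf->setOwnership(clientTask, kIOMemoryLedgerTagMedia,
        kIOMemoryLedgerFlagNoFootprint);
    if (ret != kIOReturnSuccess) {
        IOLog("setOwnership failed: 0x%x\n", ret);
    }
    buf->release();
}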
index cdd53930aad71171c41e894f681e6b318a1010ff..1560b3305687ab2aa17f1c22617b105980f7fb09 100644 (file)
@@ -71,6 +71,12 @@ typedef UInt32 IOMessage;
 #define kIOMessageSystemCapabilityChange   iokit_common_msg(0x340)
 #define kIOMessageDeviceSignaledWakeup     iokit_common_msg(0x350)
 
+#ifdef KERNEL_PRIVATE
+// sent to IOUserClients with the property kIOUserClientMessageAppSuspendedKey
+// when their task's app suspend state changes;
+// use task_is_app_suspended() to retrieve the owning task's current state
+#define kIOMessageTaskAppSuspendedChange   iokit_common_msg(0x800)
+#endif
 
 /*!
  * @defined         kIOMessageDeviceWillPowerOff
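
The new message invites a small handler sketch; MyUserClient and its fOwningTask member are hypothetical, and the opt-in property and task_is_app_suspended() call come from the comment above.

// Hypothetical IOUserClient subclass that set kIOUserClientMessageAppSuspendedKey.
IOReturn
MyUserClient::message(UInt32 type, IOService * provider, void * argument)
{
    if (type == kIOMessageTaskAppSuspendedChange) {
        // fOwningTask is assumed to hold the task_t captured in initWithTask().
        bool suspended = task_is_app_suspended(fOwningTask);
        IOLog("owning app %s\n", suspended ? "suspended" : "resumed");
        return kIOReturnSuccess;
    }
    return IOUserClient::message(type, provider, argument);
}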
index 995207421054b337606ce8798afc3d5614e0c0c6..f7ede4a1b6f5e7136685c17521b308a035b5d87c 100644 (file)
@@ -108,6 +108,8 @@ public:
 
        virtual IOReturn setPurgeable(IOOptionBits newState, IOOptionBits * oldState) APPLE_KEXT_OVERRIDE;
 
+       IOReturn setOwnership(task_t newOwner, int newLedgerTag, IOOptionBits newOptions);
+
 /*! @function getPageCounts
  *   @abstract Retrieve the number of resident and/or dirty pages encompassed by an IOMemoryDescriptor.
  *   @discussion This method returns the number of resident and/or dirty pages encompassed by an IOMemoryDescriptor.
index a4da3d4fa91cf6368f894944a6e8e17357d7d277..ef2e533e8ba6c9a02beba7a3c2e17da9b5c1c0eb 100644 (file)
@@ -107,8 +107,8 @@ private:
            UInt32 *propType, UInt32 *propOffset);
        virtual bool convertPropToObject(UInt8 *propName, UInt32 propNameLength,
            UInt8 *propData, UInt32 propDataLength,
-           const OSSymbol **propSymbol,
-           OSObject **propObject);
+           LIBKERN_RETURNS_RETAINED const OSSymbol **propSymbol,
+           LIBKERN_RETURNS_RETAINED OSObject **propObject);
        virtual bool convertObjectToProp(UInt8 *buffer, UInt32 *length,
            const OSSymbol *propSymbol, OSObject *propObject);
        virtual UInt16 generateOWChecksum(UInt8 *buffer);
@@ -137,6 +137,8 @@ private:
        void initNVRAMImage(void);
        void initProxyData(void);
        IOReturn syncVariables(void);
+       IOReturn setPropertyInternal(const OSSymbol *aKey, OSObject *anObject);
+
 
 public:
        virtual bool init(IORegistryEntry *old, const IORegistryPlane *plane) APPLE_KEXT_OVERRIDE;
index 0ee138617ab798c0fc11c4d24d7659aeca7f56e4..3324fc7f555d654ba5439e210325358e11925b01 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -43,7 +43,7 @@
 
 class IONotifier : public OSObject
 {
-       OSDeclareAbstractStructors(IONotifier)
+       OSDeclareAbstractStructors(IONotifier);
 
 public:
 
index 7f47bc62c3e0180c6c1beec5a5c2b87f935b9112..8e4d78a941b802ea786f65b8ac1fa5825f9d51e7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -68,7 +68,9 @@ enum {
        kPEPagingOff,
        kPEPanicBegin,
        kPEPanicEnd,
-       kPEPanicDiskShutdown
+       kPEPanicDiskShutdown,
+       kPEPanicRestartCPUNoPanicEndCallouts,
+       kPEPanicRestartCPUNoCallouts
 };
 extern int (*PE_halt_restart)(unsigned int type);
 extern int PEHaltRestart(unsigned int type);
@@ -79,8 +81,14 @@ enum {
        kIOSystemShutdownNotificationStageRootUnmount = 1,
 };
 extern void IOSystemShutdownNotification(int stage);
+
+extern uint32_t gEnforceQuiesceSafety;
 #endif /* XNU_KERNEL_PRIVATE */
 
+#ifdef KERNEL_PRIVATE
+extern boolean_t IOPMRootDomainGetWillShutdown(void);
+#endif /* KERNEL_PRIVATE */
+
 // Save the Panic Info.  Returns the number of bytes saved.
 extern UInt32 PESavePanicInfo(UInt8 *buffer, UInt32 length);
 extern void PESavePanicInfoAction(void *buffer, UInt32 offset, UInt32 length);
@@ -118,12 +126,70 @@ extern coprocessor_type_t PEGetCoprocessorVersion( void );
 
 extern OSSymbol *               gPlatformInterruptControllerName;
 
-extern const OSSymbol *         gIOPlatformSleepActionKey;
-extern const OSSymbol *         gIOPlatformWakeActionKey;
-extern const OSSymbol *         gIOPlatformQuiesceActionKey;
-extern const OSSymbol *         gIOPlatformActiveActionKey;
-extern const OSSymbol *         gIOPlatformHaltRestartActionKey;
-extern const OSSymbol *         gIOPlatformPanicActionKey;
+/*
+ * IOPlatformSleepAction
+ *
+ * Sleep is called after power management has finished all power plane
+ * driver notifications and state transitions and has committed to sleep,
+ * but before the other CPUs are powered off.
+ * The scheduler is still active.
+ */
+extern const OSSymbol *gIOPlatformSleepActionKey;
+
+/*
+ * IOPlatformWakeAction
+ *
+ * Wake is called with the scheduler enabled, but before
+ * powering on other CPUs, so try to minimize work done in this path to speed
+ * up wake time.
+ */
+extern const OSSymbol *gIOPlatformWakeActionKey;
+
+/*
+ * IOPlatformQuiesceAction
+ *
+ * Quiesce is called after all CPUs are off, scheduling is disabled,
+ * and the boot CPU is about to pull the plug.
+ * Mutexes and blocking are disallowed in this context and will panic.
+ * Do not pass this action to super() (incl. IOService, IOPlatformExpert).
+ */
+extern const OSSymbol *gIOPlatformQuiesceActionKey;
+
+/*
+ * IOPlatformActiveAction
+ *
+ * Active is called very early in the wake path before enabling the scheduler
+ * on the boot CPU.
+ * Mutexes and blocking are disallowed in this context and will panic.
+ * Do not pass this action to super() (incl. IOService, IOPlatformExpert).
+ */
+extern const OSSymbol *gIOPlatformActiveActionKey;
+
+/*
+ * IOPlatformHaltRestartAction
+ *
+ * Halt/Restart is called after the kernel finishes shutting down the
+ * system and is ready to power off or reboot.
+ *
+ * It is not guaranteed to be called in non-graceful shutdown scenarios.
+ */
+extern const OSSymbol *gIOPlatformHaltRestartActionKey;
+
+/*
+ * IOPlatformPanicAction
+ *
+ * Panic is called when the system is panicking, before it records a core
+ * file (if it is configured to do so).
+ *
+ * It can be called at any time, in any context, in any state.  Don't depend
+ * on anything being powered on in a useful state.
+ *
+ * Mutexes and blocking are disallowed in this context and will fail.
+ *
+ * If you hang or panic again in this callout, the panic log may not be recorded,
+ * leading to the loss of field reports about customer issues.
+ */
+extern const OSSymbol *gIOPlatformPanicActionKey;
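
A sketch of consuming one of these callouts. The opt-in convention shown here (publishing the key as a numeric priority property and receiving the callout through callPlatformFunction()) is an assumption, and MyDriver is hypothetical.

bool
MyDriver::start(IOService * provider)
{
    if (!IOService::start(provider)) {
        return false;
    }
    // Assumed opt-in: publish the action key with a callout priority.
    if (OSNumber * prio = OSNumber::withNumber(0ULL, 32)) {
        setProperty(gIOPlatformQuiesceActionKey, prio);
        prio->release();
    }
    return true;
}

IOReturn
MyDriver::callPlatformFunction(const OSSymbol * functionName, bool waitForFunction,
    void * param1, void * param2, void * param3, void * param4)
{
    if (functionName == gIOPlatformQuiesceActionKey) {
        // Boot CPU only, scheduler off: simple register writes only here;
        // taking a mutex or blocking in this context will panic.
        return kIOReturnSuccess;
    }
    return IOService::callPlatformFunction(functionName, waitForFunction,
               param1, param2, param3, param4);
}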
 
 class IORangeAllocator;
 class IONVRAMController;
@@ -168,7 +234,7 @@ public:
        virtual IOService * createNub( OSDictionary * from );
 
        virtual bool compareNubName( const IOService * nub, OSString * name,
-           OSString ** matched = 0 ) const;
+           OSString ** matched = NULL ) const;
        virtual IOReturn getNubResources( IOService * nub );
 
        virtual long getBootROMType(void);
@@ -261,7 +327,7 @@ public:
        virtual bool createNubs( IOService * parent, LIBKERN_CONSUMED OSIterator * iter );
 
        virtual bool compareNubName( const IOService * nub, OSString * name,
-           OSString ** matched = 0 ) const APPLE_KEXT_OVERRIDE;
+           OSString ** matched = NULL ) const APPLE_KEXT_OVERRIDE;
 
        virtual IOReturn getNubResources( IOService * nub ) APPLE_KEXT_OVERRIDE;
 
@@ -318,7 +384,7 @@ public:
 
 class IOPlatformExpertDevice : public IOService
 {
-       OSDeclareDefaultStructors(IOPlatformExpertDevice)
+       OSDeclareDefaultStructors(IOPlatformExpertDevice);
 
 private:
        IOWorkLoop *workLoop;
@@ -329,7 +395,7 @@ private:
 public:
        virtual bool initWithArgs( void * p1, void * p2,
            void * p3, void *p4 );
-       virtual bool compareName( OSString * name, OSString ** matched = 0 ) const APPLE_KEXT_OVERRIDE;
+       virtual bool compareName( OSString * name, OSString ** matched = NULL ) const APPLE_KEXT_OVERRIDE;
 
        virtual IOWorkLoop *getWorkLoop() const APPLE_KEXT_OVERRIDE;
        virtual IOReturn setProperties( OSObject * properties ) APPLE_KEXT_OVERRIDE;
@@ -353,13 +419,13 @@ public:
 
 class IOPlatformDevice : public IOService
 {
-       OSDeclareDefaultStructors(IOPlatformDevice)
+       OSDeclareDefaultStructors(IOPlatformDevice);
 
        struct ExpansionData { };
        ExpansionData *iopd_reserved;
 
 public:
-       virtual bool compareName( OSString * name, OSString ** matched = 0 ) const APPLE_KEXT_OVERRIDE;
+       virtual bool compareName( OSString * name, OSString ** matched = NULL ) const APPLE_KEXT_OVERRIDE;
        virtual IOService * matchLocation( IOService * client ) APPLE_KEXT_OVERRIDE;
        virtual IOReturn getResources( void ) APPLE_KEXT_OVERRIDE;
 
index 498814c826752431179f1a560d19e2fb4c752044..94e41c2c68ff788f5c458aa79c918c88c03fb77b 100644 (file)
@@ -180,7 +180,7 @@ IOReturn IOPolledFileOpen(const char * filename,
     uint64_t setFileSize, uint64_t fsFreeSize,
     void * write_file_addr, size_t write_file_len,
     IOPolledFileIOVars ** fileVars,
-    OSData ** imagePath,
+    LIBKERN_RETURNS_RETAINED OSData ** imagePath,
     uint8_t * volumeCryptKey, size_t * keySize);
 
 IOReturn IOPolledFileClose(IOPolledFileIOVars ** pVars,
diff --git a/iokit/IOKit/IORPC.h b/iokit/IOKit/IORPC.h
new file mode 100644 (file)
index 0000000..0ae1415
--- /dev/null
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+
+#ifndef _IORPC_H
+#define _IORPC_H
+
+#include <stdint.h>
+
+#ifndef PLATFORM_DriverKit
+
+#include <mach/message.h>
+
+#else /* !PLATFORM_DriverKit */
+
+#ifndef _MACH_MESSAGE_H_
+#define _MACH_MESSAGE_H_
+
+#define MACH_MSG_TYPE_MOVE_RECEIVE      16      /* Must hold receive right */
+#define MACH_MSG_TYPE_MOVE_SEND         17      /* Must hold send right(s) */
+#define MACH_MSG_TYPE_MOVE_SEND_ONCE    18      /* Must hold sendonce right */
+#define MACH_MSG_TYPE_COPY_SEND         19      /* Must hold send right(s) */
+#define MACH_MSG_TYPE_MAKE_SEND         20      /* Must hold receive right */
+#define MACH_MSG_TYPE_MAKE_SEND_ONCE    21      /* Must hold receive right */
+#define MACH_MSG_TYPE_COPY_RECEIVE      22      /* NOT VALID */
+#define MACH_MSG_TYPE_DISPOSE_RECEIVE   24      /* must hold receive right */
+#define MACH_MSG_TYPE_DISPOSE_SEND      25      /* must hold send right(s) */
+#define MACH_MSG_TYPE_DISPOSE_SEND_ONCE 26      /* must hold sendonce right */
+
+#define MACH_MSG_TYPE_PORT_NONE         0
+
+#define MACH_MSG_PORT_DESCRIPTOR                0
+#define MACH_MSG_OOL_DESCRIPTOR                 1
+
+typedef unsigned int mach_msg_copy_options_t;
+
+#define MACH_MSG_PHYSICAL_COPY          0
+#define MACH_MSG_VIRTUAL_COPY           1
+#define MACH_MSG_ALLOCATE               2
+
+typedef uint32_t natural_t;
+typedef int32_t integer_t;
+
+typedef unsigned int mach_msg_type_name_t;
+typedef unsigned int mach_msg_descriptor_type_t;
+
+#if KERNEL
+typedef void * mach_port_t;
+#define MACH_PORT_NULL  NULL
+#else /* !KERNEL */
+typedef natural_t mach_port_t;
+#define MACH_PORT_NULL  0
+#endif /* !KERNEL */
+
+typedef natural_t mach_port_name_t;
+
+typedef unsigned int mach_msg_bits_t;
+typedef natural_t mach_msg_size_t;
+typedef integer_t mach_msg_id_t;
+
+#pragma pack(push, 4)
+
+typedef struct{
+       mach_msg_bits_t       msgh_bits;
+       mach_msg_size_t       msgh_size;
+       mach_port_t           msgh_remote_port;
+       mach_port_t           msgh_local_port;
+       mach_port_name_t      msgh_voucher_port;
+       mach_msg_id_t         msgh_id;
+} mach_msg_header_t;
+
+typedef struct{
+       mach_msg_size_t msgh_descriptor_count;
+} mach_msg_body_t;
+
+typedef struct{
+       mach_port_t                   name;
+#if !(defined(KERNEL) && defined(__LP64__))
+// Pad to 8 bytes everywhere except the K64 kernel where mach_port_t is 8 bytes
+       mach_msg_size_t               pad1;
+#endif
+       unsigned int                  pad2 : 16;
+       mach_msg_type_name_t          disposition : 8;
+       mach_msg_descriptor_type_t    type : 8;
+#if defined(KERNEL)
+       uint32_t          pad_end;
+#endif
+} mach_msg_port_descriptor_t;
+
+typedef struct{
+       void *                        address;
+#if !defined(__LP64__)
+       mach_msg_size_t               size;
+#endif
+       int                           deallocate: 8;
+       mach_msg_copy_options_t       copy: 8;
+       unsigned int                  pad1: 8;
+       mach_msg_descriptor_type_t    type: 8;
+#if defined(__LP64__)
+       mach_msg_size_t               size;
+#endif
+#if defined(KERNEL) && !defined(__LP64__)
+       uint32_t          pad_end;
+#endif
+} mach_msg_ool_descriptor_t;
+
+typedef struct{
+       unsigned int                  val[80 / sizeof(int)];
+} mach_msg_max_trailer_t;
+
+#pragma pack(pop)
+
+#endif  /* _MACH_MESSAGE_H_ */
+
+#endif /* PLATFORM_DriverKit */
+
+#if KERNEL
+class IOUserServer;
+#endif /* KERNEL */
+
+typedef uint64_t OSObjectRef;
+
+enum {
+       kIORPCVersion190615       = (mach_msg_id_t) 0x4da2b68c,
+       kIORPCVersion190615Reply  = (mach_msg_id_t) 0x4da2b68d,
+
+#if DRIVERKIT_PRIVATE
+       kIORPCVersion190501       = (mach_msg_id_t) 0xfe316a7a,
+       kIORPCVersion190501Reply  = (mach_msg_id_t) 0xfe316a7b,
+
+       kIORPCVersionCurrent      = kIORPCVersion190615,
+       kIORPCVersionCurrentReply = kIORPCVersion190615Reply
+#endif /* DRIVERKIT_PRIVATE */
+};
+
+enum{
+       kIORPCMessageRemote     = 0x00000001,
+       kIORPCMessageLocalHost  = 0x00000002,
+       kIORPCMessageKernel     = 0x00000004,
+       kIORPCMessageOneway     = 0x00000008,
+       kIORPCMessageObjectRefs = 0x00000010,
+       kIORPCMessageOnqueue    = 0x00000020,
+       kIORPCMessageError      = 0x00000040,
+       kIORPCMessageSimpleReply = 0x00000080,
+};
+
+enum{
+       kIORPCMessageIDKernel   = (1ULL << 63),
+};
+
+struct IORPCMessageMach {
+       mach_msg_header_t          msgh;
+       mach_msg_body_t            msgh_body;
+       mach_msg_port_descriptor_t objects[0];
+};
+typedef struct IORPCMessageMach IORPCMessageMach;
+
+struct IORPCMessage {
+       uint64_t         msgid;
+       uint64_t         flags;
+       uint64_t         objectRefs;
+       OSObjectRef      objects[0];
+};
+typedef struct IORPCMessage IORPCMessage;
+
+extern "C" IORPCMessage *
+IORPCMessageFromMach(IORPCMessageMach * msg, bool reply);
+
+struct IORPCMessageErrorReturnContent {
+       IORPCMessage  hdr;
+       kern_return_t result;
+       uint32_t      pad;
+};
+
+#pragma pack(4)
+struct IORPCMessageErrorReturn {
+       IORPCMessageMach mach;
+       IORPCMessageErrorReturnContent content;
+};
+#pragma pack()
+
+
+class OSMetaClassBase;
+struct IORPC;
+typedef kern_return_t (*OSDispatchMethod)(OSMetaClassBase * self, const IORPC rpc);
+
+struct IORPC {
+       IORPCMessageMach * message;
+       IORPCMessageMach * reply;
+       uint32_t           sendSize;
+       uint32_t           replySize;
+};
+typedef struct IORPC IORPC;
+
+enum {
+       kOSClassCanRemote   = 0x00000001,
+};
+
+struct OSClassDescription {
+       uint32_t    descriptionSize;
+
+       char        name[96];
+       char        superName[96];
+
+       uint32_t    methodOptionsSize;
+       uint32_t    methodOptionsOffset;
+       uint32_t    metaMethodOptionsSize;
+       uint32_t    metaMethodOptionsOffset;
+       uint32_t    queueNamesSize;
+       uint32_t    queueNamesOffset;
+       uint32_t    methodNamesSize;
+       uint32_t    methodNamesOffset;
+       uint32_t    metaMethodNamesSize;
+       uint32_t    metaMethodNamesOffset;
+
+       uint64_t    flags;
+
+       uint64_t    resv1[8];
+
+       uint64_t    methodOptions[0];
+       uint64_t    metaMethodOptions[0];
+
+       char        dispatchNames[0];
+       char        methodNames[0];
+       char        metaMethodNames[0];
+};
+
+#endif /* _IORPC_H */
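
Since IORPCMessage carries its object table as a flexible array member, a small sketch (LogRPCObjectRefs is a hypothetical helper) shows how a consumer might walk it after unwrapping the Mach framing:

// Illustrative only: enumerate the OSObjectRef table behind an IORPCMessage.
static void
LogRPCObjectRefs(IORPCMessageMach * mach, bool reply)
{
    IORPCMessage * msg = IORPCMessageFromMach(mach, reply);
    if (!msg) {
        return;
    }
    // objectRefs counts the OSObjectRef slots in the trailing objects[] table;
    // the mach port descriptors for them live in mach->objects[].
    for (uint64_t idx = 0; idx < msg->objectRefs; idx++) {
        kprintf("objects[%llu] = 0x%llx\n", (unsigned long long) idx,
            (unsigned long long) msg->objects[idx]);
    }
}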
index 2520c5bd89e632edb4db98cf2b50c211a1f68fe4..e7b0472dc2235c504e11ea431d1193fa9fd2637a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -48,7 +48,7 @@ typedef IOByteCount IORangeScalar;
  */
 
 class IORangeAllocator : public OSObject {
-       OSDeclareDefaultStructors(IORangeAllocator)
+       OSDeclareDefaultStructors(IORangeAllocator);
 
 protected:
        UInt32              numElements;
index 0812c9579475097b97d343dc1744c2b5d4eeff05..c9e059654e354384b4e82092f5f8c8e533b63751 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -73,7 +73,7 @@ class IORegistryEntry : public OSObject
 {
        friend class IORegistryIterator;
 
-       OSDeclareDefaultStructors(IORegistryEntry)
+       OSDeclareDefaultStructors(IORegistryEntry);
 
 protected:
 /*! @struct ExpansionData
@@ -181,8 +181,8 @@ public:
  *   @result Returns the value of the Action callout.
  */
        virtual IOReturn runPropertyAction(Action action, OSObject *target,
-           void *arg0 = 0, void *arg1 = 0,
-           void *arg2 = 0, void *arg3 = 0);
+           void *arg0 = NULL, void *arg1 = NULL,
+           void *arg2 = NULL, void *arg3 = NULL);
 
 private:
 #if __LP64__
@@ -260,7 +260,7 @@ public:
 *   @param dictionary A dictionary that will become the registry entry's property table (retaining it), or zero, which will cause an empty property table to be created.
  *   @result true on success, or false on a resource failure. */
 
-       virtual bool init( OSDictionary * dictionary = 0 );
+       virtual bool init( OSDictionary * dictionary = NULL );
 
 /*! @function free
  *   @abstract Standard free method for all IORegistryEntry subclasses.
@@ -576,7 +576,7 @@ public:
  *   @param plane The plane object, 0 indicates any plane.
 *   @result True if the entry has a parent in the given plane, or any parent when plane = 0; otherwise false. */
 
-       virtual bool inPlane( const IORegistryPlane * plane = 0) const;
+       virtual bool inPlane( const IORegistryPlane * plane = NULL) const;
 
 /*! @function getDepth
  *   @abstract Counts the maximum number of entries between an entry and the registry root, in a plane.
@@ -648,7 +648,7 @@ public:
  *   @param plane The plane object, or zero for the global name.
  *   @result A C-string name, valid while the entry is retained. */
 
-       virtual const char * getName( const IORegistryPlane * plane = 0 ) const;
+       virtual const char * getName( const IORegistryPlane * plane = NULL ) const;
 
 /*! @function copyName
  *   @abstract Returns the name assigned to the registry entry as an OSSymbol.
@@ -657,7 +657,7 @@ public:
  *   @result A reference to an OSSymbol for the name, which should be released by the caller. */
 
        virtual const OSSymbol * copyName(
-               const IORegistryPlane * plane = 0 ) const;
+               const IORegistryPlane * plane = NULL ) const;
 
 /*! @function compareNames
  *   @abstract Compares the name of the entry with one or more names, and optionally returns the matching name.
@@ -666,7 +666,7 @@ public:
  *   @param matched If the caller wants the successfully matched name returned, pass a non-zero pointer for the matched parameter and an OSString will be returned here. It should be released by the caller.
  *   @result True if one of the names compared true with the entry's global name. */
 
-       virtual bool compareNames( OSObject * name, OSString ** matched = 0 ) const;
+       virtual bool compareNames( OSObject * name, OSString ** matched = NULL ) const;
 
 /*! @function compareName
  *   @abstract Compares the name of the entry with one name, and optionally returns the matching name.
@@ -675,7 +675,7 @@ public:
  *   @param matched If the caller wants the successfully matched name returned, pass a non-zero pointer for the matched parameter and an OSString will be returned here. It should be released by the caller. Generally, this will be the same as the name parameter, but may not be if wildcards are used.
  *   @result True if the name compared true with the entry's global name. */
 
-       virtual bool compareName( OSString * name, OSString ** matched = 0 ) const;
+       virtual bool compareName( OSString * name, OSString ** matched = NULL ) const;
 
 /*! @function setName
  *   @abstract Sets a name for the registry entry, in a particular plane, or globally.
@@ -684,7 +684,7 @@ public:
  *   @param plane The plane object, or zero to set the global name. */
 
        virtual void setName( const OSSymbol * name,
-           const IORegistryPlane * plane = 0 );
+           const IORegistryPlane * plane = NULL );
 
 /*! @function setName
  *   @abstract Sets a name for the registry entry, in a particular plane, or globally.
@@ -693,7 +693,7 @@ public:
  *   @param plane The plane object, or zero to set the global name. */
 
        virtual void setName( const char * name,
-           const IORegistryPlane * plane = 0 );
+           const IORegistryPlane * plane = NULL );
 
 /*! @function getLocation
  *   @abstract Returns the location string assigned to the registry entry as a C-string.
@@ -701,7 +701,7 @@ public:
  *   @param plane The plane object, or zero for the global name.
  *   @result A C-string location string, valid while the entry is retained, or zero. */
 
-       virtual const char * getLocation( const IORegistryPlane * plane = 0 ) const;
+       virtual const char * getLocation( const IORegistryPlane * plane = NULL ) const;
 
 /*! @function copyLocation
  *   @abstract Returns the location string assigned to the registry entry as an OSSymbol.
@@ -710,7 +710,7 @@ public:
  *   @result A reference to an OSSymbol for the location if one exists, which should be released by the caller, or zero. */
 
        virtual const OSSymbol * copyLocation(
-               const IORegistryPlane * plane = 0 ) const;
+               const IORegistryPlane * plane = NULL ) const;
 
 /*! @function setLocation
  *   @abstract Sets a location string for the registry entry, in a particular plane, or globally.
@@ -719,9 +719,9 @@ public:
  *   @param plane The plane object, or zero to set the global location string. */
 
        virtual void setLocation( const OSSymbol * location,
-           const IORegistryPlane * plane = 0 );
+           const IORegistryPlane * plane = NULL );
        virtual void setLocation( const char * location,
-           const IORegistryPlane * plane = 0 );
+           const IORegistryPlane * plane = NULL );
 
 /*! @function getPath
  *   @abstract Create a path for a registry entry.
@@ -756,10 +756,10 @@ public:
  *   @result A retained registry entry is returned on success, or zero on failure. The caller should release the entry. */
 
        static IORegistryEntry * fromPath(  const char * path,
-           const IORegistryPlane * plane = 0,
-           char * residualPath = 0,
-           int * residualLength = 0,
-           IORegistryEntry * fromEntry = 0 );
+           const IORegistryPlane * plane = NULL,
+           char * residualPath = NULL,
+           int * residualLength = NULL,
+           IORegistryEntry * fromEntry = NULL );
 
 /*! @function fromPath
  *   @abstract Looks up a registry entry by relative path.
@@ -771,9 +771,9 @@ public:
  *   @result See IORegistryEntry::fromPath. */
 
        virtual IORegistryEntry * childFromPath( const char * path,
-           const IORegistryPlane * plane = 0,
-           char * residualPath = 0,
-           int * residualLength = 0 );
+           const IORegistryPlane * plane = NULL,
+           char * residualPath = NULL,
+           int * residualLength = NULL );
 
 /*! @function dealiasPath
  *   @abstract Strips any aliases from the head of path and returns the full path.
@@ -815,12 +815,14 @@ private:
 
 #ifdef XNU_KERNEL_PRIVATE
        SInt32 getRegistryEntryGenerationCount( void ) const;
+       void setName(const OSString * name,
+           const IORegistryPlane * plane = NULL);
 #endif
 
 private:
        inline bool arrayMember( OSArray * set,
            const IORegistryEntry * member,
-           unsigned int * index = 0 ) const;
+           unsigned int * index = NULL ) const;
 
        bool makeLink( IORegistryEntry * to,
            unsigned int relation,
@@ -842,9 +844,9 @@ private:
            const IORegistryPlane * plane );
 
        APPLE_KEXT_COMPATIBILITY_VIRTUAL
-       LIBKERN_RETURNS_NOT_RETAINED
-       const OSSymbol * hasAlias(  const IORegistryPlane * plane,
-           char * opath = 0, int * length = 0 ) const;
+       LIBKERN_RETURNS_NOT_RETAINED const OSSymbol * hasAlias(
+               const IORegistryPlane * plane,
+               char * opath = NULL, int * length = NULL ) const;
 
        APPLE_KEXT_COMPATIBILITY_VIRTUAL
        const char * matchPathLocation( const char * cmp,
@@ -859,7 +861,7 @@ private:
 
 class IORegistryIterator : public OSIterator
 {
-       OSDeclareAbstractStructors(IORegistryIterator)
+       OSDeclareAbstractStructors(IORegistryIterator);
 
 private:
        struct IORegCursor {
index 94347a72d8f1d28fde0cab062cf23cdeb93bb402..d932032226f820a972d0b931767509ba1e6460a4 100644 (file)
 extern "C" {
 #endif
 
+#ifndef PLATFORM_DriverKit
+
 #include <mach/error.h>
 
+#else  /* PLATFORM_DriverKit */
+
+typedef int             kern_return_t;
+
+#define KERN_SUCCESS                    0
+
+/*
+ *     error number layout as follows:
+ *
+ *     hi                                     lo
+ *     | system(6) | subsystem(12) | code(14) |
+ */
+
+#define err_none                (kern_return_t)0
+#define ERR_SUCCESS             (kern_return_t)0
+
+#define err_system(x)           ((signed)((((unsigned)(x))&0x3f)<<26))
+#define err_sub(x)              (((x)&0xfff)<<14)
+
+#define err_get_system(err)     (((err)>>26)&0x3f)
+#define err_get_sub(err)        (((err)>>14)&0xfff)
+#define err_get_code(err)       ((err)&0x3fff)
+
+#define err_max_system          0x3f
+
+#define system_emask            (err_system(err_max_system))
+#define sub_emask               (err_sub(0xfff))
+#define code_emask              (0x3fff)
+
+#endif /* PLATFORM_DriverKit */
+
 typedef kern_return_t           IOReturn;
 
 #ifndef sys_iokit
@@ -73,6 +106,10 @@ typedef kern_return_t           IOReturn;
 #ifdef PRIVATE
 #define sub_iokit_smc                     err_sub(32)
 #endif
+#define sub_iokit_apfs                    err_sub(33)
+#define sub_iokit_acpiec                  err_sub(34)
+#define sub_iokit_timesync_avb            err_sub(35)
+
 #define sub_iokit_platform                err_sub(0x2A)
 #define sub_iokit_audio_video             err_sub(0x45)
 #define sub_iokit_cec                     err_sub(0x46)
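
The system/subsystem/code packing is easy to verify with a worked decode; this sketch assumes the well-known sys_iokit value, err_system(0x38):

static void
DecodeIOReturn(void)
{
    IOReturn err  = kIOReturnNoMemory;     // 0xE00002BD
    int      sys  = err_get_system(err);   // 0x38  (sys_iokit)
    int      sub  = err_get_sub(err);      // 0x000 (sub_iokit_common)
    int      code = err_get_code(err);     // 0x2bd
    IOLog("system 0x%x sub 0x%x code 0x%x\n", sys, sub, code);
    // The new subsystems slot into the middle field the same way:
    // sub_iokit_apfs == err_sub(33) == (33 << 14) == 0x84000.
}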
index 4a6da1574432fc3207a05a617faf43a3eee77370..fdfded6611fa98184166d550e31df3b1057b6e1b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <IOKit/pwr_mgt/IOPMpowerState.h>
 #include <IOKit/IOServicePM.h>
 #include <IOKit/IOReportTypes.h>
+#include <DriverKit/IOService.h>
 
 extern "C" {
 #include <kern/thread_call.h>
 }
 
+
 #ifndef UINT64_MAX
 #define UINT64_MAX        18446744073709551615ULL
 #endif
@@ -83,6 +85,7 @@ enum {
        // options for terminate()
        kIOServiceRequired      = 0x00000001,
        kIOServiceTerminate     = 0x00000004,
+       kIOServiceTerminateWithRematch = 0x00000010,
 
        // options for registerService() & terminate()
        kIOServiceSynchronous   = 0x00000002,
@@ -109,18 +112,33 @@ extern const IORegistryPlane *  gIOPowerPlane;
 extern const OSSymbol *     gIOResourcesKey;
 extern const OSSymbol *     gIOResourceMatchKey;
 extern const OSSymbol *     gIOResourceMatchedKey;
+extern const OSSymbol *     gIOResourceIOKitKey;
+
 extern const OSSymbol *     gIOProviderClassKey;
 extern const OSSymbol *     gIONameMatchKey;
 extern const OSSymbol *     gIONameMatchedKey;
 extern const OSSymbol *     gIOPropertyMatchKey;
+extern const OSSymbol *     gIOPropertyExistsMatchKey;
 extern const OSSymbol *     gIOLocationMatchKey;
 extern const OSSymbol *     gIOParentMatchKey;
 extern const OSSymbol *     gIOPathMatchKey;
 extern const OSSymbol *     gIOMatchCategoryKey;
 extern const OSSymbol *     gIODefaultMatchCategoryKey;
 extern const OSSymbol *     gIOMatchedServiceCountKey;
+extern const OSSymbol *     gIOMatchedPersonalityKey;
+extern const OSSymbol *     gIORematchPersonalityKey;
+extern const OSSymbol *     gIORematchCountKey;
+extern const OSSymbol *     gIODEXTMatchCountKey;
 
 extern const OSSymbol *     gIOUserClientClassKey;
+
+extern const OSSymbol *     gIOUserClassKey;
+extern const OSSymbol *     gIOUserServerClassKey;
+extern const OSSymbol *     gIOUserServerNameKey;
+extern const OSSymbol *     gIOUserServerTagKey;
+extern const OSSymbol *     gIOUserServerCDHashKey;
+extern const OSSymbol *     gIOUserUserClientKey;
+
 extern const OSSymbol *     gIOKitDebugKey;
 extern const OSSymbol *     gIOServiceKey;
 
@@ -150,6 +168,11 @@ extern const OSSymbol *     gIOBSDMajorKey;
 extern const OSSymbol *     gIOBSDMinorKey;
 extern const OSSymbol *     gIOBSDUnitKey;
 
+extern const OSSymbol *     gIODriverKitEntitlementKey;
+extern const OSSymbol *     gIOServiceDEXTEntitlementsKey;
+extern const OSSymbol *     gIODriverKitUserClientEntitlementsKey;
+extern const OSSymbol *     gIOMatchDeferKey;
+
 extern SInt32 IOServiceOrdering( const OSMetaClassBase * inObj1, const OSMetaClassBase * inObj2, void * ref );
 
 typedef void (*IOInterruptAction)( OSObject * target, void * refCon,
@@ -309,12 +332,17 @@ class IOPlatformExpert;
 
 struct IOInterruptAccountingData;
 struct IOInterruptAccountingReporter;
+struct OSObjectUserVars;
 
 class IOService : public IORegistryEntry
 {
-       OSDeclareDefaultStructors(IOService)
+       OSDeclareDefaultStructorsWithDispatch(IOService);
 
+#if XNU_KERNEL_PRIVATE
+public:
+#else
 protected:
+#endif  /* XNU_KERNEL_PRIVATE */
 /*! @struct ExpansionData
 *   @discussion This structure will be used to expand the capabilities of this class in the future.
  */
@@ -330,6 +358,8 @@ protected:
                IOLock * interruptStatisticsLock;
                IOInterruptAccountingReporter * interruptStatisticsArray;
                int interruptStatisticsArrayCount;
+
+               OSObjectUserVars * uvars;
        };
 
 /*! @var reserved
@@ -566,7 +596,7 @@ public:
 
        virtual bool open(   IOService *       forClient,
            IOOptionBits      options = 0,
-           void *        arg = 0 );
+           void *        arg = NULL );
 
 /*! @function close
  *   @abstract Releases active access to a provider.
@@ -583,7 +613,7 @@ public:
  *   @param forClient If non-zero, <code>isOpen</code> returns the open state for that client. If zero is passed, <code>isOpen</code> returns the open state for all clients.
  *   @result <code>true</code> if the specific, or any, client has the IOService object open. */
 
-       virtual bool isOpen( const IOService * forClient = 0 ) const;
+       virtual bool isOpen( const IOService * forClient = NULL ) const;
 
 /*! @function handleOpen
  *   @abstract Controls the open / close behavior of an IOService object (overrideable by subclasses).
@@ -632,7 +662,7 @@ public:
 
 /*! @function init
  *   @abstract Initializes generic IOService data structures (expansion data, etc). */
-       virtual bool init( OSDictionary * dictionary = 0 ) APPLE_KEXT_OVERRIDE;
+       virtual bool init( OSDictionary * dictionary = NULL ) APPLE_KEXT_OVERRIDE;
 
 /*! @function init
  *   @abstract Initializes generic IOService data structures (expansion data, etc). */
@@ -729,7 +759,8 @@ public:
  *   @param key An OSSymbol key that globally identifies the object.
  *   @param value The object to be published. */
 
-       static void publishResource( const OSSymbol * key, OSObject * value = 0 );
+       static void publishResource( const OSSymbol * key, OSObject * value = NULL );
+       static void publishUserResource( const OSSymbol * key, OSObject * value = NULL );
 
 /*! @function publishResource
  *   @abstract Uses the resource service to publish a property.
@@ -737,7 +768,7 @@ public:
  *   @param key A C string key that globally identifies the object.
  *   @param value The object to be published. */
 
-       static void publishResource( const char * key, OSObject * value = 0 );
+       static void publishResource( const char * key, OSObject * value = NULL );
        virtual bool addNeededResource( const char * key );
 
 /* Notifications */
@@ -762,7 +793,7 @@ public:
        static IONotifier * addNotification(
                const OSSymbol * type, OSDictionary * matching,
                IOServiceNotificationHandler handler,
-               void * target, void * ref = 0,
+               void * target, void * ref = NULL,
                SInt32 priority = 0 )
        APPLE_KEXT_DEPRECATED;
 
@@ -786,7 +817,7 @@ public:
        static IONotifier * addMatchingNotification(
                const OSSymbol * type, OSDictionary * matching,
                IOServiceMatchingNotificationHandler handler,
-               void * target, void * ref = 0,
+               void * target, void * ref = NULL,
                SInt32 priority = 0 );
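
As a usage sketch for addMatchingNotification() (the handler and names here are hypothetical):

// Get a callout for each IOMedia object as it is matched.
static bool
HandleNewMedia(void * target, void * refCon, IOService * newService,
    IONotifier * notifier)
{
    IOLog("matched: %s\n", newService->getName());
    return true;   // returning true keeps the notification installed
}

static void
InstallMediaNotification(void)
{
    OSDictionary * match = IOService::serviceMatching("IOMedia");
    IONotifier * note = IOService::addMatchingNotification(
        gIOFirstMatchNotification, match, &HandleNewMedia, /* target */ NULL);
    if (match) {
        match->release();   // the notification retains what it needs
    }
    (void) note;            // keep `note` around to remove() it later
}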
 
 
@@ -804,10 +835,9 @@ public:
  *   @param timeout The maximum time to wait.
  *   @result A published IOService object matching the supplied dictionary. */
 
-       LIBKERN_RETURNS_NOT_RETAINED
-       static IOService * waitForService(
+       static LIBKERN_RETURNS_NOT_RETAINED IOService * waitForService(
                LIBKERN_CONSUMED OSDictionary * matching,
-               mach_timespec_t * timeout = 0);
+               mach_timespec_t * timeout = NULL);
 
 /*! @function waitForMatchingService
 *   @abstract Waits for a matching service to be published.
@@ -847,7 +877,7 @@ public:
  *   @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */
 
        static OSDictionary * serviceMatching( const char * className,
-           OSDictionary * table = 0 );
+           OSDictionary * table = NULL );
 
 /*! @function serviceMatching
  *   @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify an IOService class match.
@@ -857,7 +887,7 @@ public:
  *   @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */
 
        static OSDictionary * serviceMatching( const OSString * className,
-           OSDictionary * table = 0 );
+           OSDictionary * table = NULL );
 
 /*! @function nameMatching
  *   @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify an IOService name match.
@@ -867,7 +897,7 @@ public:
  *   @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */
 
        static OSDictionary * nameMatching( const char * name,
-           OSDictionary * table = 0 );
+           OSDictionary * table = NULL );
 
 /*! @function nameMatching
  *   @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify an IOService name match.
@@ -877,7 +907,7 @@ public:
  *   @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */
 
        static OSDictionary * nameMatching( const OSString* name,
-           OSDictionary * table = 0 );
+           OSDictionary * table = NULL );
 
 /*! @function resourceMatching
  *   @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify a resource service match.
@@ -887,7 +917,7 @@ public:
  *   @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */
 
        static OSDictionary * resourceMatching( const char * name,
-           OSDictionary * table = 0 );
+           OSDictionary * table = NULL );
 
 /*! @function resourceMatching
  *   @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify a resource service match.
@@ -897,7 +927,7 @@ public:
  *   @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */
 
        static OSDictionary * resourceMatching( const OSString * name,
-           OSDictionary * table = 0 );
+           OSDictionary * table = NULL );
 
 
 /*! @function propertyMatching
@@ -909,7 +939,7 @@ public:
  *   @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */
 
        static OSDictionary * propertyMatching( const OSSymbol * key, const OSObject * value,
-           OSDictionary * table = 0 );
+           OSDictionary * table = NULL );
 
 /*! @function registryEntryIDMatching
 *   @abstract Creates a matching dictionary, or adds matching properties to an existing dictionary, that specify an IORegistryEntryID match.
@@ -919,7 +949,7 @@ public:
  *   @result The matching dictionary created, or passed in, is returned on success, or zero on failure. */
 
        static OSDictionary * registryEntryIDMatching( uint64_t entryID,
-           OSDictionary * table = 0 );
+           OSDictionary * table = NULL );
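
A hedged sketch of building a matching dictionary and waiting on it; the "IOBSD" resource name follows a common convention, and the ten-second timeout is arbitrary:

static void
WaitForBSDStack(void)
{
    OSDictionary * match = IOService::resourceMatching("IOBSD");
    IOService * svc = IOService::waitForMatchingService(match,
        10ULL * NSEC_PER_SEC);
    if (match) {
        match->release();   // unlike waitForService(), the dictionary is not consumed
    }
    if (svc) {
        svc->release();     // the service comes back with an extra retain
    }
}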
 
 
 /*! @function addLocation
@@ -1137,7 +1167,7 @@ public:
 
        virtual IOReturn registerInterrupt(int source, OSObject *target,
            IOInterruptAction handler,
-           void *refCon = 0);
+           void *refCon = NULL);
 
 #ifdef __BLOCKS__
 /*! @function registerInterrupt
@@ -1224,7 +1254,7 @@ public:
  *   @result An IOReturn code defined by the message type. */
 
        virtual IOReturn message( UInt32 type, IOService * provider,
-           void * argument = 0 );
+           void * argument = NULL );
 
 /*! @function messageClient
  *   @abstract Sends a generic message to an attached client.
@@ -1236,7 +1266,7 @@ public:
  *   @result The return code from the client message call. */
 
        virtual IOReturn messageClient( UInt32 messageType, OSObject * client,
-           void * messageArgument = 0, vm_size_t argSize = 0 );
+           void * messageArgument = NULL, vm_size_t argSize = 0 );
 
 /*! @function messageClients
  *   @abstract Sends a generic message to all attached clients.
@@ -1247,11 +1277,11 @@ public:
  *   @result Any non-<code>kIOReturnSuccess</code> return codes returned by the clients, or <code>kIOReturnSuccess</code> if all return <code>kIOReturnSuccess</code>. */
 
        virtual IOReturn messageClients( UInt32 type,
-           void * argument = 0, vm_size_t argSize = 0 );
+           void * argument = NULL, vm_size_t argSize = 0 );
 
        virtual IONotifier * registerInterest( const OSSymbol * typeOfInterest,
            IOServiceInterestHandler handler,
-           void * target, void * ref = 0 );
+           void * target, void * ref = NULL );
 
 #ifdef __BLOCKS__
        IONotifier * registerInterest(const OSSymbol * typeOfInterest,
@@ -1285,10 +1315,11 @@ public:
 
        virtual IOReturn newUserClient( task_t owningTask, void * securityID,
            UInt32 type, OSDictionary * properties,
-           IOUserClient ** handler );
+           LIBKERN_RETURNS_RETAINED IOUserClient ** handler );
 
        virtual IOReturn newUserClient( task_t owningTask, void * securityID,
-           UInt32 type, IOUserClient ** handler );
+           UInt32 type,
+           LIBKERN_RETURNS_RETAINED IOUserClient ** handler );
 
 /* Return code utilities */
 
@@ -1347,6 +1378,9 @@ public:
        IOReturn setAuthorizationID( uint64_t authorizationID );
        void cpusRunning(void);
        void scheduleFinalize(bool now);
+       static void willShutdown();
+       static void startDeferredMatches();
+       static void kextdLaunched();
 
 private:
        static IOReturn waitMatchIdle( UInt32 ms );
@@ -1354,13 +1388,15 @@ private:
                const OSSymbol * type, OSDictionary * matching,
                IOServiceMatchingNotificationHandler handler,
                void * target, void * ref,
-               SInt32 priority, OSIterator ** existing );
+               SInt32 priority,
+               LIBKERN_RETURNS_RETAINED OSIterator ** existing );
 #if !defined(__LP64__)
        static IONotifier * installNotification(
                const OSSymbol * type, OSDictionary * matching,
                IOServiceNotificationHandler handler,
                void * target, void * ref,
-               SInt32 priority, OSIterator ** existing);
+               SInt32 priority,
+               LIBKERN_RETURNS_RETAINED OSIterator ** existing);
 #endif /* !defined(__LP64__) */
 #endif
 
@@ -1427,7 +1463,7 @@ private:
        OSArray * copyNotifiers(const OSSymbol * type,
            IOOptionBits orNewState, IOOptionBits andNewState);
 
-       bool invokeNotifiers(OSArray ** willSend);
+       bool invokeNotifiers(OSArray * willSend[]);
        bool invokeNotifier( class _IOServiceNotifier * notify );
 
        APPLE_KEXT_COMPATIBILITY_VIRTUAL
@@ -1435,7 +1471,7 @@ private:
 
        APPLE_KEXT_COMPATIBILITY_VIRTUAL
        IOReturn waitForState( UInt32 mask, UInt32 value,
-           mach_timespec_t * timeout = 0 );
+           mach_timespec_t * timeout = NULL );
 
        IOReturn waitForState( UInt32 mask, UInt32 value, uint64_t timeout );
 
@@ -1449,7 +1485,7 @@ private:
        static void __attribute__((__noreturn__)) terminateThread( void * arg, wait_result_t unused );
        static void terminateWorker( IOOptionBits options );
        static void actionWillTerminate( IOService * victim, IOOptionBits options,
-           OSArray * doPhase2List, void*, void * );
+           OSArray * doPhase2List, bool, void * );
        static void actionDidTerminate( IOService * victim, IOOptionBits options,
            void *, void *, void *);
 
@@ -1466,7 +1502,10 @@ private:
        APPLE_KEXT_COMPATIBILITY_VIRTUAL
        IOReturn resolveInterrupt(IOService *nub, int source);
        APPLE_KEXT_COMPATIBILITY_VIRTUAL
-       IOReturn lookupInterrupt(int source, bool resolve, IOInterruptController **interruptController);
+       IOReturn lookupInterrupt(
+               int source, bool resolve,
+               LIBKERN_RETURNS_NOT_RETAINED IOInterruptController *
+               *interruptController);
 
 #ifdef XNU_KERNEL_PRIVATE
 /* end xnu internals */
@@ -1846,7 +1885,7 @@ protected:
  *   Drivers may eliminate the influence of the <code>changePowerStateTo</code> method on power state one of two ways. See @link powerOverrideOnPriv powerOverrideOnPriv@/link to ignore the method's influence, or call <code>changePowerStateTo(0)</code> in the driver's <code>start</code> routine to remove the <code>changePowerStateTo</code> method's power request.
  *   @param ordinal The number of the desired power state in the power state array.
  *   @result A return code that can be ignored by the caller. */
-
+public:
        IOReturn changePowerStateToPriv( unsigned long ordinal );
 
 /*! @function powerOverrideOnPriv
@@ -1874,8 +1913,8 @@ protected:
 public:
        void idleTimerExpired( void );
        void settleTimerExpired( void );
-       IOReturn synchronizePowerTree( IOOptionBits options = 0, IOService * notifyRoot = 0 );
-       bool assertPMDriverCall( IOPMDriverCallEntry * callEntry, IOOptionBits options = 0, IOPMinformee * inform = 0 );
+       IOReturn synchronizePowerTree( IOOptionBits options = 0, IOService * notifyRoot = NULL );
+       bool assertPMDriverCall( IOPMDriverCallEntry * callEntry, IOOptionBits method, const IOPMinformee * inform = NULL, IOOptionBits options = 0 );
        void deassertPMDriverCall( IOPMDriverCallEntry * callEntry );
        IOReturn changePowerStateWithOverrideTo( IOPMPowerStateIndex ordinal, IOPMRequestTag tag );
        IOReturn changePowerStateForRootDomain( IOPMPowerStateIndex ordinal );
@@ -1893,6 +1932,7 @@ public:
 
        static IOWorkLoop * getIOPMWorkloop( void );
        bool getBlockingDriverCall(thread_t *thread, const void **callMethod);
+       void cancelIdlePowerDown(IOService * service);
 
 protected:
        bool tellClientsWithResponse( int messageType );
@@ -1963,7 +2003,7 @@ private:
        static IOReturn actionSpinDumpTimerExpired(OSObject *, void *, void *, void *, void * );
 
        static IOReturn actionDriverCalloutDone(OSObject *, void *, void *, void *, void * );
-       static IOPMRequest * acquirePMRequest( IOService * target, IOOptionBits type, IOPMRequest * active = 0 );
+       static IOPMRequest * acquirePMRequest( IOService * target, IOOptionBits type, IOPMRequest * active = NULL );
        static void releasePMRequest( IOPMRequest * request );
        static void pmDriverCallout( IOService * from );
        static void pmTellAppWithResponse( OSObject * object, void * context );
@@ -1971,7 +2011,7 @@ private:
        static void pmTellCapabilityAppWithResponse( OSObject * object, void * arg );
        static void pmTellCapabilityClientWithResponse( OSObject * object, void * arg );
        static void submitPMRequest(LIBKERN_CONSUMED IOPMRequest * request );
-       static void submitPMRequests( IOPMRequest ** request, IOItemCount count );
+       static void submitPMRequests( IOPMRequest * requests[], IOItemCount count );
        bool ackTimerTick( void );
        void addPowerChild1( IOPMRequest * request );
        void addPowerChild2( IOPMRequest * request );
@@ -2014,7 +2054,7 @@ private:
        IOReturn updatePowerStatesReport( IOReportConfigureAction action, void *result, void *destination );
        IOReturn configureSimplePowerReport(IOReportConfigureAction action, void *result );
        IOReturn updateSimplePowerReport( IOReportConfigureAction action, void *result, void *destination );
-       void waitForPMDriverCall( IOService * target = 0 );
+       void waitForPMDriverCall( IOService * target = NULL );
 #endif /* XNU_KERNEL_PRIVATE */
 };
 
index e20f0f0869518b0ec1d5f12bfe3d20a9be9ac2d0..d226255bcf04cf15cfdf7e3cf694fe9d885f90c6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -95,7 +95,7 @@ class IOPMprot : public OSObject
 {
        friend class IOService;
 
-       OSDeclareDefaultStructors(IOPMprot)
+       OSDeclareDefaultStructors(IOPMprot);
 
 public:
        const char *            ourName;
index f0347c8c67f16db245552b22789fd51746da07f9..c956ce7740d9eeed6c4e02aae3fea5ae58797c57 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #ifndef _IOKIT_IOSHAREDDATAQUEUE_H
 #define _IOKIT_IOSHAREDDATAQUEUE_H
 
-#ifdef dequeue
-#undef dequeue
-#endif
-#ifdef enqueue
-#undef enqueue
-#endif
-
 #define DISABLE_DATAQUEUE_WARNING /* IODataQueue is deprecated, please use IOSharedDataQueue instead */
 
 #include <IOKit/IODataQueue.h>
@@ -57,7 +50,7 @@ typedef struct _IODataQueueEntry IODataQueueEntry;
  */
 class IOSharedDataQueue : public IODataQueue
 {
-       OSDeclareDefaultStructors(IOSharedDataQueue)
+       OSDeclareDefaultStructors(IOSharedDataQueue);
 
        struct ExpansionData {
                UInt32 queueSize;
index a228cb4ffaec58494d1a7c2b5bfad787f2bc75cb..42b10d913c413c1651a78837e4ab4c69df1b5d96 100644 (file)
@@ -94,6 +94,10 @@ public:
        virtual IOReturn setPurgeable( IOOptionBits newState,
            IOOptionBits * oldState ) APPLE_KEXT_OVERRIDE;
 
+       IOReturn setOwnership( task_t newOwner,
+           int newLedgerTag,
+           IOOptionBits newLedgerOptions );
+
 // support map() on kIOMemoryTypeVirtual without prepare()
        virtual IOMemoryMap *       makeMapping(
                IOMemoryDescriptor *    owner,
index f72dcc37fa95dd05ae04347de9f0b83999fde4e4..299bb5e932b965c85bea4bada6a3a154d75b893d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -34,7 +34,7 @@
 
 class IOSyncer : public OSObject
 {
-       OSDeclareDefaultStructors(IOSyncer)
+       OSDeclareDefaultStructors(IOSyncer);
 
 private:
 // The spin lock that is used to guard the 'threadMustStop' variable.
index ed54a6a60f374acf1c35d42be8b1decad428aeb0..f4987b69b05b2c33d3660e2492866c2085ebd74b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -107,7 +107,7 @@ enum{
 
 class IOTimerEventSource : public IOEventSource
 {
-       OSDeclareDefaultStructors(IOTimerEventSource)
+       OSDeclareDefaultStructors(IOTimerEventSource);
 
 protected:
 /*! @var calloutEntry thread_call entry for preregistered thread callouts */
@@ -161,7 +161,7 @@ public:
 #endif /* __BLOCKS__ */
 
        static IOTimerEventSource *
-       timerEventSource(OSObject *owner, Action action = 0);
+       timerEventSource(OSObject *owner, Action action = NULL);
 
 /*! @function timerEventSource
  *   @abstract Allocates and returns an initialized timer instance.
@@ -170,7 +170,7 @@ public:
  *   @param action 'C' Function pointer for the callout routine of this event source.
  */
        static IOTimerEventSource *
-       timerEventSource(uint32_t options, OSObject *owner, Action action = 0);
+       timerEventSource(uint32_t options, OSObject *owner, Action action = NULL);
 
 #ifdef __BLOCKS__
 /*! @function timerEventSource
@@ -191,7 +191,7 @@ public:
 /*! @function init
  *   @abstract Initializes the timer with an owner, and a handler to call when the timeout expires.
  */
-       virtual bool init(OSObject *owner, Action action = 0);
+       virtual bool init(OSObject *owner, Action action = NULL);
 
 /*! @function enable
  *   @abstract Enables a call to the action.
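
A short usage sketch for the timer factory methods above; MyDriver and timeoutOccurred are hypothetical, and this would run inside a member function such as start():

// Arm a one-shot 500 ms timeout on the driver's work loop.
IOTimerEventSource * timer = IOTimerEventSource::timerEventSource(this,
    OSMemberFunctionCast(IOTimerEventSource::Action, this,
    &MyDriver::timeoutOccurred));
IOWorkLoop * wl = getWorkLoop();
if (timer && wl && (wl->addEventSource(timer) == kIOReturnSuccess)) {
    timer->setTimeoutMS(500);   // later calls MyDriver::timeoutOccurred(owner, sender)
}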
index be2137e18c962298c8391b146bbc036176a61d6f..d07f14f1b17ea846f6475778728bb9b6259db4b9 100644 (file)
@@ -28,6 +28,8 @@
 #ifndef __IOKIT_IOTYPES_H
 #define __IOKIT_IOTYPES_H
 
+#ifndef PLATFORM_DriverKit
+
 #ifndef IOKIT
 #define IOKIT 1
 #endif /* !IOKIT */
@@ -47,7 +49,11 @@ extern "C" {
 
 #ifndef NULL
 #if defined (__cplusplus)
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#else
 #define NULL    0
+#endif
 #else
 #define NULL ((void *)0)
 #endif
@@ -173,6 +179,7 @@ typedef io_object_t     io_enumerator_t;
 typedef io_object_t     io_iterator_t;
 typedef io_object_t     io_registry_entry_t;
 typedef io_object_t     io_service_t;
+typedef io_object_t     uext_object_t;
 
 #define IO_OBJECT_NULL  ((io_object_t) 0)
 
@@ -190,35 +197,41 @@ enum {
        kIOCopybackCache            = 3,
        kIOWriteCombineCache        = 4,
        kIOCopybackInnerCache       = 5,
-       kIOPostedWrite              = 6
+       kIOPostedWrite              = 6,
+       kIORealTimeCache            = 7,
+       kIOPostedReordered          = 8,
+       kIOPostedCombinedReordered  = 9,
 };
 
 // IOMemory mapping options
 enum {
-       kIOMapAnywhere              = 0x00000001,
-
-       kIOMapCacheMask             = 0x00000700,
-       kIOMapCacheShift            = 8,
-       kIOMapDefaultCache          = kIODefaultCache       << kIOMapCacheShift,
-       kIOMapInhibitCache          = kIOInhibitCache       << kIOMapCacheShift,
-       kIOMapWriteThruCache        = kIOWriteThruCache     << kIOMapCacheShift,
-       kIOMapCopybackCache         = kIOCopybackCache      << kIOMapCacheShift,
-       kIOMapWriteCombineCache     = kIOWriteCombineCache  << kIOMapCacheShift,
-       kIOMapCopybackInnerCache    = kIOCopybackInnerCache << kIOMapCacheShift,
-       kIOMapPostedWrite           = kIOPostedWrite        << kIOMapCacheShift,
-
-       kIOMapUserOptionsMask       = 0x00000fff,
-
-       kIOMapReadOnly              = 0x00001000,
-
-       kIOMapStatic                = 0x01000000,
-       kIOMapReference             = 0x02000000,
-       kIOMapUnique                = 0x04000000,
+       kIOMapAnywhere                = 0x00000001,
+
+       kIOMapCacheMask               = 0x00000f00,
+       kIOMapCacheShift              = 8,
+       kIOMapDefaultCache            = kIODefaultCache            << kIOMapCacheShift,
+       kIOMapInhibitCache            = kIOInhibitCache            << kIOMapCacheShift,
+       kIOMapWriteThruCache          = kIOWriteThruCache          << kIOMapCacheShift,
+       kIOMapCopybackCache           = kIOCopybackCache           << kIOMapCacheShift,
+       kIOMapWriteCombineCache       = kIOWriteCombineCache       << kIOMapCacheShift,
+       kIOMapCopybackInnerCache      = kIOCopybackInnerCache      << kIOMapCacheShift,
+       kIOMapPostedWrite             = kIOPostedWrite             << kIOMapCacheShift,
+       kIOMapRealTimeCache           = kIORealTimeCache           << kIOMapCacheShift,
+       kIOMapPostedReordered         = kIOPostedReordered         << kIOMapCacheShift,
+       kIOMapPostedCombinedReordered = kIOPostedCombinedReordered << kIOMapCacheShift,
+
+       kIOMapUserOptionsMask         = 0x00000fff,
+
+       kIOMapReadOnly                = 0x00001000,
+
+       kIOMapStatic                  = 0x01000000,
+       kIOMapReference               = 0x02000000,
+       kIOMapUnique                  = 0x04000000,
 #ifdef XNU_KERNEL_PRIVATE
-       kIOMap64Bit                 = 0x08000000,
+       kIOMap64Bit                   = 0x08000000,
 #endif
-       kIOMapPrefault              = 0x10000000,
-       kIOMapOverwrite     = 0x20000000
+       kIOMapPrefault                = 0x10000000,
+       kIOMapOverwrite               = 0x20000000
 };
 
 /*! @enum Scale Factors
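[Note: the map options encode an IOCacheMode in bits 8..11; widening kIOMapCacheMask from 0x00000700 to 0x00000f00 is what makes room for the new modes, since values 8 and 9 no longer fit in three bits. Worked example from the enums above:

    // kIOMapPostedCombinedReordered == kIOPostedCombinedReordered << kIOMapCacheShift
    //                               == 9 << 8 == 0x00000900
    IOOptionBits cacheMode = (options & kIOMapCacheMask) >> kIOMapCacheShift;
]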
@@ -253,4 +266,30 @@ typedef unsigned int IODeviceNumber;
 }
 #endif
 
+#else /* !PLATFORM_DriverKit */
+
+#include <stdint.h>
+
+typedef uint32_t          IOOptionBits;
+typedef int32_t           IOFixed;
+typedef uint32_t          IOVersion;
+typedef uint32_t          IOItemCount;
+typedef uint32_t          IOCacheMode;
+
+typedef uint32_t          IOByteCount32;
+typedef uint64_t          IOByteCount64;
+typedef IOByteCount64     IOByteCount;
+
+typedef uint32_t  IOPhysicalAddress32;
+typedef uint64_t  IOPhysicalAddress64;
+typedef uint32_t  IOPhysicalLength32;
+typedef uint64_t  IOPhysicalLength64;
+
+typedef IOPhysicalAddress64      IOPhysicalAddress;
+typedef IOPhysicalLength64       IOPhysicalLength;
+
+typedef uint64_t       IOVirtualAddress;
+
+#endif /* PLATFORM_DriverKit */
+
 #endif /* ! __IOKIT_IOTYPES_H */
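[Note: under PLATFORM_DriverKit the header reduces to fixed-width typedefs, with IOByteCount and the physical address/length types unconditionally 64-bit. A compile-time check a DriverKit-side consumer could write against these definitions (sketch):

    #include <stdint.h>

    static_assert(sizeof(IOByteCount) == sizeof(uint64_t),
        "IOByteCount aliases IOByteCount64 under PLATFORM_DriverKit");
    static_assert(sizeof(IOPhysicalAddress) == sizeof(uint64_t),
        "IOPhysicalAddress aliases IOPhysicalAddress64");
]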
index 1c17dda61cd6cd6b8a099d38d72d663fefbcc3c6..eba181d0736094b379b7076645de3503fc888e65 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -36,6 +36,7 @@
 #include <IOKit/IOTypes.h>
 #include <IOKit/IOService.h>
 #include <IOKit/OSMessageNotification.h>
+#include <DriverKit/IOUserClient.h>
 
 #if IOKITSTATS
 #include <IOKit/IOStatisticsPrivate.h>
@@ -173,7 +174,7 @@ enum {
 
 class IOUserClient : public IOService
 {
-       OSDeclareAbstractStructors(IOUserClient)
+       OSDeclareAbstractStructorsWithDispatch(IOUserClient);
 #if IOKITSTATS
        friend class IOStatistics;
 #endif
@@ -206,7 +207,7 @@ public:
        UInt8   sharedInstance;
        UInt8   closed;
        UInt8   __ipcFinal;
-       UInt8   __reservedA[1];
+       UInt8   messageAppSuspended;
        volatile SInt32 __ipc;
        queue_head_t owners;
        IOLock * lock;
@@ -222,10 +223,12 @@ private:
 #endif /* XNU_KERNEL_PRIVATE */
 
 public:
-       virtual IOReturn externalMethod( uint32_t selector, IOExternalMethodArguments * arguments,
-           IOExternalMethodDispatch * dispatch = 0, OSObject * target = 0, void * reference = 0 );
+       MIG_SERVER_ROUTINE virtual IOReturn
+       externalMethod(uint32_t selector, IOExternalMethodArguments *arguments,
+           IOExternalMethodDispatch *dispatch = NULL,
+           OSObject *target = NULL, void *reference = NULL);
 
-       virtual IOReturn registerNotificationPort(
+       MIG_SERVER_ROUTINE virtual IOReturn registerNotificationPort(
                mach_port_t port, UInt32 type, io_user_reference_t refCon);
 
 private:
@@ -308,6 +311,8 @@ public:
        static OSObject * copyClientEntitlement( task_t task,
            const char * entitlement );
 
+       static OSDictionary * copyClientEntitlements(task_t task);
+
 /*!
  *   @function releaseAsyncReference64
  *   @abstract Release the mach_port_t reference held within the OSAsyncReference64 structure.
@@ -342,10 +347,10 @@ public:
 
        virtual IOService * getService( void );
 
-       virtual IOReturn registerNotificationPort(
+       MIG_SERVER_ROUTINE virtual IOReturn registerNotificationPort(
                mach_port_t port, UInt32 type, UInt32 refCon );
 
-       virtual IOReturn getNotificationSemaphore( UInt32 notification_type,
+       MIG_SERVER_ROUTINE virtual IOReturn getNotificationSemaphore( UInt32 notification_type,
            semaphore_t * semaphore );
 
        virtual IOReturn connectClient( IOUserClient * client );
@@ -436,9 +441,11 @@ public:
 
 // Methods for accessing method vector.
        virtual IOExternalMethod *
-       getTargetAndMethodForIndex( IOService ** targetP, UInt32 index );
+       getTargetAndMethodForIndex(
+               LIBKERN_RETURNS_NOT_RETAINED IOService ** targetP, UInt32 index );
        virtual IOExternalAsyncMethod *
-       getAsyncTargetAndMethodForIndex( IOService ** targetP, UInt32 index );
+       getAsyncTargetAndMethodForIndex(
+               LIBKERN_RETURNS_NOT_RETAINED IOService ** targetP, UInt32 index );
 
 // Methods for accessing trap vector - old and new style
        virtual IOExternalTrap *
@@ -446,7 +453,12 @@ public:
        APPLE_KEXT_DEPRECATED;
 
        virtual IOExternalTrap *
-       getTargetAndTrapForIndex( IOService **targetP, UInt32 index );
+       getTargetAndTrapForIndex(
+               LIBKERN_RETURNS_NOT_RETAINED IOService **targetP, UInt32 index );
 };
 
+#ifdef XNU_KERNEL_PRIVATE
+extern "C" void IOMachPortDestroyUserReferences(OSObject * obj, natural_t type);
+#endif /* XNU_KERNEL_PRIVATE */
+
 #endif /* ! _IOKIT_IOUSERCLIENT_H */
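[Note: copyClientEntitlements() complements the existing per-key copyClientEntitlement() by returning the whole entitlement dictionary for a task. A hedged sketch of gating a connection on a single entitlement — the key string and owningTask are illustrative:

    OSObject * value = IOUserClient::copyClientEntitlement(
            owningTask, "com.example.allow-driver-access");   // hypothetical key
    bool allowed = (value == kOSBooleanTrue);
    OSSafeReleaseNULL(value);
    if (!allowed) {
        return kIOReturnNotPermitted;
    }
]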
diff --git a/iokit/IOKit/IOUserServer.h b/iokit/IOKit/IOUserServer.h
new file mode 100644 (file)
index 0000000..0741ed4
--- /dev/null
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+
+#ifndef _IOUSERSERVER_H
+#define _IOUSERSERVER_H
+
+#include <IOKit/IORPC.h>
+
+#define kIOUserClassKey        "IOUserClass"
+#define kIOUserServerClassKey  "IOUserServer"
+#define kIOUserServerNameKey   "IOUserServerName"
+#define kIOUserServerTagKey    "IOUserServerTag"
+// the expected cdhash value of the userspace driver executable
+#define kIOUserServerCDHashKey "IOUserServerCDHash"
+
+#if DRIVERKIT_PRIVATE
+
+enum{
+       kIOKitUserServerClientType  = 0x99000003,
+};
+
+enum{
+       kIOUserServerMethodRegisterClass = 0x0001000,
+       kIOUserServerMethodStart         = 0x0001001,
+       kIOUserServerMethodRegister      = 0x0001002,
+};
+
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+class OSObject;
+
+#define OSObject_Instantiate_ID       0x0000000100000001ULL
+
+enum {
+       kOSObjectRPCRemote = 0x00000001,
+       kOSObjectRPCKernel = 0x00000002,
+};
+
+struct OSObject_Instantiate_Msg_Content {
+       IORPCMessage __hdr;
+       OSObjectRef  __object;
+};
+
+struct OSObject_Instantiate_Rpl_Content {
+       IORPCMessage  __hdr;
+       kern_return_t __result;
+       uint32_t      __pad;
+       uint64_t      flags;
+       char          classname[64];
+       uint64_t      methods[0];
+};
+
+#pragma pack(4)
+struct OSObject_Instantiate_Msg {
+       IORPCMessageMach mach;
+       mach_msg_port_descriptor_t __object__descriptor;
+       OSObject_Instantiate_Msg_Content content;
+};
+struct OSObject_Instantiate_Rpl {
+       IORPCMessageMach mach;
+       OSObject_Instantiate_Rpl_Content content;
+};
+#pragma pack()
+
+typedef uint64_t IOTrapMessageBuffer[256];
+
+#endif /* DRIVERKIT_PRIVATE */
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#ifdef XNU_KERNEL_PRIVATE
+
+#include <IOKit/IOService.h>
+#include <IOKit/IOUserClient.h>
+#include <DriverKit/IOUserServer.h>
+#include <libkern/c++/OSKext.h>
+
+class IOUserServer;
+class OSUserMetaClass;
+class IODispatchQueue;
+class IODispatchSource;
+class IOInterruptDispatchSource;
+class IOTimerDispatchSource;
+struct IOPStrings;
+
+struct OSObjectUserVars {
+       IOUserServer     * userServer;
+       IODispatchQueue ** queueArray;
+       OSUserMetaClass  * userMeta;
+       OSArray          * openProviders;
+       bool               willTerminate;
+       bool               didTerminate;
+       bool               serverDied;
+       bool               started;
+       bool               stopped;
+       bool               userServerPM;
+       bool               willPower;
+       uint32_t           powerOverride;
+};
+
+extern IOLock *        gIOUserServerLock;
+
+typedef struct ipc_kmsg * ipc_kmsg_t;
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+namespace IOServicePH
+{
+void serverAdd(IOUserServer * server);
+void serverRemove(IOUserServer * server);
+void serverAck(IOUserServer * server);
+};
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+class IOUserServer : public IOUserClient
+{
+       OSDeclareDefaultStructorsWithDispatch(IOUserServer);
+
+       IOLock       *        fLock;
+       IOSimpleLock *        fInterruptLock;
+       task_t                fOwningTask;
+       OSDictionary  *       fEntitlements;
+       OSDictionary  *       fClasses;
+       IODispatchQueue     * fRootQueue;
+       OSArray             * fServices;
+
+       uint64_t              fPowerStates;
+       uint8_t               fRootNotifier;
+       uint8_t               fSystemPowerAck;
+       uint8_t               fSystemOff;
+
+public:
+
+       static  IOUserClient * withTask(task_t owningTask);
+       virtual IOReturn       clientClose(void) APPLE_KEXT_OVERRIDE;
+       virtual bool           finalize(IOOptionBits options) APPLE_KEXT_OVERRIDE;
+       virtual void           stop(IOService * provider) APPLE_KEXT_OVERRIDE;
+       virtual void           free() APPLE_KEXT_OVERRIDE;
+
+       virtual IOReturn       setProperties(OSObject * properties) APPLE_KEXT_OVERRIDE;
+       virtual IOReturn       externalMethod(uint32_t selector, IOExternalMethodArguments * args,
+           IOExternalMethodDispatch * dispatch,
+           OSObject * target, void * reference) APPLE_KEXT_OVERRIDE;
+
+       virtual IOExternalTrap * getTargetAndTrapForIndex(IOService ** targetP, UInt32 index) APPLE_KEXT_OVERRIDE;
+
+       IOReturn               serviceAttach(IOService * service, IOService * provider);
+       IOReturn               serviceStop(IOService * service, IOService * provider);
+       void                   serviceFree(IOService * service);
+       IOReturn               serviceStarted(IOService * service, IOService * provider, bool result);
+       static void            serviceWillTerminate(IOService * client, IOService * provider, IOOptionBits options);
+       static void            serviceDidTerminate(IOService * client, IOService * provider, IOOptionBits options, bool * defer);
+       static void            serviceDidStop(IOService * client, IOService * provider);
+       IOReturn               serviceOpen(IOService * provider, IOService * client);
+       IOReturn               serviceClose(IOService * provider, IOService * client);
+       IOReturn               serviceNewUserClient(IOService * service, task_t owningTask, void * securityID,
+           uint32_t type, OSDictionary * properties, IOUserClient ** handler);
+       IOReturn               exit(const char * reason);
+
+       bool                   serviceMatchesCDHash(IOService *service);
+       bool                   checkEntitlements(IOService * provider, IOService * dext);
+       bool                   checkEntitlements(OSDictionary * entitlements, OSObject * prop,
+           IOService * provider, IOService * dext);
+
+       void                   setTaskLoadTag(OSKext *kext);
+       void                   setDriverKitUUID(OSKext *kext);
+       void                   systemPower(bool powerOff);
+       IOReturn                                setPowerState(unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE;
+       IOReturn                                powerStateWillChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE;
+       IOReturn                                powerStateDidChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE;
+
+       IOPStrings *           copyInStringArray(const char * string, uint32_t userSize);
+       uint32_t               stringArrayIndex(IOPStrings * array, const char * look);
+       IOReturn               registerClass(OSClassDescription * desc, uint32_t size, OSUserMetaClass ** cls);
+       IOReturn               setRootQueue(IODispatchQueue * queue);
+
+       OSObjectUserVars     * varsForObject(OSObject * obj);
+       LIBKERN_RETURNS_NOT_RETAINED IODispatchQueue      * queueForObject(OSObject * obj, uint64_t msgid);
+
+       static ipc_port_t      copySendRightForObject(OSObject * object, natural_t /* ipc_kobject_type_t */ type);
+       static OSObject      * copyObjectForSendRight(ipc_port_t port, natural_t /* ipc_kobject_type_t */ type);
+
+       IOReturn               copyOutObjects(IORPCMessageMach * mach, IORPCMessage * message,
+           size_t size, bool consume);
+       IOReturn               copyInObjects(IORPCMessageMach * mach, IORPCMessage * message,
+           size_t size, bool copyObjects, bool consumePorts);
+
+       IOReturn               consumeObjects(IORPCMessage * message, size_t messageSize);
+
+       IOReturn               objectInstantiate(OSObject * obj, IORPC rpc, IORPCMessage * message);
+       IOReturn               kernelDispatch(OSObject * obj, IORPC rpc);
+       static OSObject      * target(OSAction * action, IORPCMessage * message);
+
+       IOReturn               rpc(IORPC rpc);
+       IOReturn               server(ipc_kmsg_t requestkmsg, ipc_kmsg_t * preply);
+       kern_return_t          waitInterruptTrap(void * p1, void * p2, void * p3, void * p4, void * p5, void * p6);
+};
+
+extern "C" kern_return_t
+IOUserServerUEXTTrap(OSObject * object, void * p1, void * p2, void * p3, void * p4, void * p5, void * p6);
+
+#endif /* XNU_KERNEL_PRIVATE */
+#endif /* _IOUSERSERVER_H */
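[Note: in the reply structs above, methods[] is a trailing variable-length array — a reply carries the fixed header, result, flags, and class name, followed by as many packed method pointers as the mach message length allows. A small accessor sketch against these definitions; the caller is assumed to have validated contentSize first:

    static inline uint32_t
    instantiateReplyMethodCount(size_t contentSize)
    {
        // everything after the fixed fields is the method list
        return (uint32_t)((contentSize
                - offsetof(OSObject_Instantiate_Rpl_Content, methods))
                / sizeof(uint64_t));
    }
]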
index 2c1fd64f55ab490c36976a5632583317cb418b5d..1d5fa916baf9733c30a8fad2947073eecc0cc7cf 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -53,7 +53,7 @@ class IOCommandGate;
  */
 class IOWorkLoop : public OSObject
 {
-       OSDeclareDefaultStructors(IOWorkLoop)
+       OSDeclareDefaultStructors(IOWorkLoop);
 
 public:
 /*!
@@ -294,8 +294,8 @@ public:
  *   @result Returns the value of the Action callout.
  */
        virtual IOReturn runAction(Action action, OSObject *target,
-           void *arg0 = 0, void *arg1 = 0,
-           void *arg2 = 0, void *arg3 = 0);
+           void *arg0 = NULL, void *arg1 = NULL,
+           void *arg2 = NULL, void *arg3 = NULL);
 
 #ifdef __BLOCKS__
 /*! @function runAction
index 327365f21df89ba296d25dd0109895c3b34fabf9..e898d7e4b8d2a0a28ec20d85d4c4d04e05652734 100644 (file)
@@ -29,7 +29,7 @@ EXPINC_SUBDIRS = ${INSTINC_SUBDIRS}
 # Kernel.framework/Headers/IOKit AND Kernel.framework/PrivateHeaders/IOKit.
 # This is so the files with #ifdef ...PRIVATE portions can be processed.
 # xnu/README documents the INSTALL* and EXPORT_MI_DIR lists.
-ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h))
 
 # Do not install these headers (anywhere).
 NOT_EXPORT_HEADERS = IOInterruptAccountingPrivate.h
@@ -44,10 +44,11 @@ NOT_KF_MI_HEADERS  = $(NOT_EXPORT_HEADERS)                  \
                     IOKernelReporters.h                        \
                     IOInterruptAccounting.h
 
+
 # These should be additionally installed in IOKit.framework's public Headers
 INSTALL_MI_LIST        = IOBSD.h IOKitKeys.h IOKitServer.h IOReturn.h      \
                  IOSharedLock.h IOTypes.h OSMessageNotification.h  \
-                 IODataQueueShared.h IOMessage.h
+                 IODataQueueShared.h IOMessage.h IORPC.h IOUserServer.h
 
 # These should be additionally installed in IOKit.framework's PrivateHeaders
 INSTALL_MI_LCL_LIST = IOKitKeysPrivate.h IOHibernatePrivate.h   \
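[Note: the recurring ALL_HEADERS change — it appears in every header directory touched by this commit — replaces a shell callout with pure make. $(wildcard $(SOURCE)*.h) is expanded by make itself and yields an empty list when no headers match, whereas the old "cd $(SOURCE); echo *.h" spawned a subshell and would hand back the literal pattern *.h for an empty directory; $(notdir ...) then strips the $(SOURCE) prefix to preserve the old bare-filename output.]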
index d7a224db7f2db840005d042b2b547bd2e76c42e0..11cf1ace2e9533149830bf43357afbad9501ef82 100644 (file)
@@ -13,7 +13,7 @@ include $(MakeInc_def)
 MI_DIR = machine
 EXCLUDE_HEADERS =
 
-ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h))
 HEADER_LIST = $(filter-out $(EXCLUDE_HEADERS), $(ALL_HEADERS))
 
 INSTALL_MI_LIST        = ${HEADER_LIST}
index 393486e2ad693dc5604324b6c8343c3cdf3fc41a..b3946e7fff450776b85a2e832bd435a692a20194 100644 (file)
@@ -13,7 +13,7 @@ include $(MakeInc_def)
 MI_DIR = nvram
 NOT_EXPORT_HEADERS =
 
-ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h))
 
 INSTALL_MI_LIST        =
 INSTALL_MI_LCL_LIST =
index b1501cd526e6f4c80ee5f6cf963d3556d22b9576..27e776ee32b913b933c5dd8e6c39b5611cfe85ce 100644 (file)
@@ -8,6 +8,8 @@
 #ifdef __cplusplus
 
 #include <IOKit/IOService.h>
+#include <stdatomic.h>
+#include <kern/bits.h>
 
 struct thread_group;
 
@@ -175,6 +177,13 @@ public:
                WorkEndFunction workEnd;
        };
 
+       struct IOPerfControlClientShared {
+               atomic_uint_fast8_t maxDriverIndex;
+               PerfControllerInterface interface;
+               IOLock *interfaceLock;
+               OSSet *deviceRegistrationList;
+       };
+
 /*!
  * @function registerPerformanceController
  * @abstract Register a performance controller to receive callbacks. Not for general driver use.
@@ -190,20 +199,22 @@ private:
                uint8_t perfcontrol_data[32];
        };
 
-// TODO: size of table should match sum(maxWorkCapacity) of all users
-       static constexpr size_t kWorkTableNumEntries = 1024;
+       static constexpr size_t kMaxWorkTableNumEntries = 1024;
+       static constexpr size_t kWorkTableIndexBits = 24;
+       // - 1 since kIOPerfControlClientWorkUntracked takes number 0
+       static constexpr size_t kWorkTableMaxSize = (1 << kWorkTableIndexBits) - 1;
+       static constexpr size_t kWorkTableIndexMask = mask(kWorkTableIndexBits);
 
        uint64_t allocateToken(thread_group *thread_group);
        void deallocateToken(uint64_t token);
        bool getEntryForToken(uint64_t token, WorkTableEntry &entry);
        void markEntryStarted(uint64_t token, bool started);
+       inline uint64_t tokenToGlobalUniqueToken(uint64_t token);
 
-       PerfControllerInterface interface;
-       IOLock *interfaceLock;
-       OSSet *deviceRegistrationList;
-
-// TODO: replace with ltable or pool of objects
-       WorkTableEntry workTable[kWorkTableNumEntries];
+       uint8_t driverIndex;
+       IOPerfControlClientShared *shared;
+       WorkTableEntry *workTable;
+       size_t workTableLength;
        size_t workTableNextIndex;
        IOSimpleLock *workTableLock;
 };
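[Note: the per-client work table is now heap-allocated (workTable/workTableLength), and state shared across clients of one controller moves into IOPerfControlClientShared. The constants imply a token layout: the low kWorkTableIndexBits bits index the table, with slot 0 reserved for kIOPerfControlClientWorkUntracked. A hedged sketch of that packing — the actual tokenToGlobalUniqueToken body is not part of this header:

    // illustrative only: driver index in the high bits, table slot in the low 24
    uint64_t globalToken = ((uint64_t)driverIndex << kWorkTableIndexBits)
                         | (token & kWorkTableIndexMask);
]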
index 3f8cad1d5d8474cc9e0407c3fd84dd7ef0fe770f..017c31610fcf99e61c325db03385c6383689453a 100644 (file)
@@ -13,7 +13,7 @@ include $(MakeInc_def)
 MI_DIR = perfcontrol
 NOT_EXPORT_HEADERS =
 
-ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h))
 
 # Install these only in Kernel.framework's PrivateHeaders (not Headers).
 NOT_KF_MI_HEADERS  = $(NOT_EXPORT_HEADERS)                     \
index ebb1f416e92fa26474a5ef92db3b6b1528349c8a..35ec20f4c8cda65c0c98d340c3635852b5ad2db2 100644 (file)
@@ -14,7 +14,7 @@ MI_DIR = platform
 NOT_EXPORT_HEADERS =
 NOT_KF_MI_HEADERS  =
 
-ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h))
 
 INSTALL_MI_LIST        =
 INSTALL_MI_LCL_LIST =
index 6fcd0d8babb150a987a3f7b7701f4f113221fd86..f38707f8d8791b8c2b9fc1f96c424bb779c6c12d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -33,7 +33,7 @@
 
 class IOPwrController : public IOService
 {
-       OSDeclareAbstractStructors(IOPwrController)
+       OSDeclareAbstractStructors(IOPwrController);
 
 public:
 };
index 01cc4bd0927f2e7396d5e5e26158ea9fa67cc6cd..8407f300603bc1b593ad95b247c853e263b4918e 100644 (file)
@@ -13,7 +13,7 @@ include $(MakeInc_def)
 MI_DIR = power
 NOT_EXPORT_HEADERS =
 
-ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h))
 
 INSTALL_MI_LIST        =
 INSTALL_MI_LCL_LIST =
index bc1b883971e6714851b023a6ae1077a4869b7b50..a7b0417b2f3b1cff83c9ce5d962e25e9d3b659bd 100644 (file)
@@ -102,6 +102,12 @@ enum {
        kIOPMPowerOn                    = 0x00000002,
        kIOPMDeviceUsable               = 0x00008000,
        kIOPMLowPower                   = 0x00010000,
+#if PRIVATE
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       kIOPMAOTPower                   = 0x00020000,
+       kIOPMAOTCapability              = kIOPMAOTPower,
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+#endif /* PRIVATE */
        kIOPMPreventIdleSleep           = 0x00000040,
        kIOPMSleepCapability            = 0x00000004,
        kIOPMRestartCapability          = 0x00000080,
@@ -321,6 +327,12 @@ enum {
         */
        kIOPMDriverAssertionCPUBit                      = 0x01,
 
+       /*! kIOPMDriverAssertionPreventSystemIdleSleepBit
+        * When set, the system should not idle sleep. This does not prevent
+        * demand sleep.
+        */
+       kIOPMDriverAssertionPreventSystemIdleSleepBit   = 0x02,
+
        /*! kIOPMDriverAssertionUSBExternalDeviceBit
         * When set, driver is informing PM that an external USB device is attached.
         */
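[Note: assertion bits OR together into a driver's assertion mask. The sketch below assumes the IOPMDriverAssertionType mask typedef that accompanies these constants in this header:

    IOPMDriverAssertionType mask = kIOPMDriverAssertionCPUBit
                                 | kIOPMDriverAssertionPreventSystemIdleSleepBit;
]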
@@ -473,7 +485,7 @@ enum {
  * Argument accompanying the kIOPMMessageSleepWakeUUIDChange notification when
  * the current UUID has been removed.
  */
-#define kIOPMMessageSleepWakeUUIDCleared                ((void *)0)
+#define kIOPMMessageSleepWakeUUIDCleared                ((void *)NULL)
 
 /*! kIOPMMessageDriverAssertionsChanged
  *  Sent when kernel PM driver assertions have changed.
@@ -510,7 +522,8 @@ enum {
        kIOPMProcessorSpeedChange     = (1 << 8),// change the processor speed
        kIOPMOverTemp                 = (1 << 9),// system dangerously hot
        kIOPMClamshellOpened          = (1 << 10),// clamshell was opened
-       kIOPMDWOverTemp               = (1 << 11)// DarkWake thermal limits exceeded.
+       kIOPMDWOverTemp               = (1 << 11),// DarkWake thermal limits exceeded.
+       kIOPMPowerButtonUp            = (1 << 12) // Power button up
 };
 
 
@@ -589,7 +602,7 @@ enum {
 #define kIOPMPSLegacyBatteryInfoKey                 "LegacyBatteryInfo"
 #define kIOPMPSBatteryHealthKey                     "BatteryHealth"
 #define kIOPMPSHealthConfidenceKey                  "HealthConfidence"
-#define kIOPMPSCapacityEstimatedKey                     "CapacityEstimated"
+#define kIOPMPSCapacityEstimatedKey                 "CapacityEstimated"
 #define kIOPMPSBatteryChargeStatusKey               "ChargeStatus"
 #define kIOPMPSBatteryTemperatureKey                "Temperature"
 #define kIOPMPSAdapterDetailsKey                    "AdapterDetails"
@@ -627,13 +640,13 @@ enum {
 #define kIOPMPSAdapterDetailsRevisionKey            "AdapterRevision"
 #define kIOPMPSAdapterDetailsSerialNumberKey        "SerialNumber"
 #define kIOPMPSAdapterDetailsFamilyKey              "FamilyCode"
-#define kIOPMPSAdapterDetailsAmperageKey            "Amperage"
+#define kIOPMPSAdapterDetailsAmperageKey            "Current"
 #define kIOPMPSAdapterDetailsDescriptionKey         "Description"
 #define kIOPMPSAdapterDetailsPMUConfigurationKey    "PMUConfiguration"
-#define kIOPMPSAdapterDetailsVoltage            "AdapterVoltage"
-#define kIOPMPSAdapterDetailsSourceIDKey                    "SourceID"
-#define kIOPMPSAdapterDetailsErrorFlagsKey                  "ErrorFlags"
-#define kIOPMPSAdapterDetailsSharedSourceKey            "SharedSource"
+#define kIOPMPSAdapterDetailsVoltage                "Voltage"
+#define kIOPMPSAdapterDetailsSourceIDKey            "Source"
+#define kIOPMPSAdapterDetailsErrorFlagsKey          "ErrorFlags"
+#define kIOPMPSAdapterDetailsSharedSourceKey        "SharedSource"
 #define kIOPMPSAdapterDetailsCloakedKey             "CloakedSource"
 
 // values for kIOPSPowerAdapterFamilyKey
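[Note: several adapter-details keys keep their constant names but change the backing strings ("Amperage" becomes "Current", "AdapterVoltage" becomes "Voltage", "SourceID" becomes "Source"), so code matching the raw strings rather than the constants must follow suit. Hedged sketch of publishing a value through a renamed key:

    OSDictionary * details = OSDictionary::withCapacity(1);
    OSNumber *     amps    = OSNumber::withNumber(3000ULL, 32);   // mA, illustrative
    details->setObject(kIOPMPSAdapterDetailsAmperageKey, amps);   // string is now "Current"
    amps->release();
]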
index 7f199e6b89befda0aa883aeaed34a9d9c9705ba9..c248941a5d61873a7bc7ce16b2b0fe68042d7a09 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2005 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -170,7 +170,7 @@ enum {
 
 class IOPMPowerSource : public IOService
 {
-       OSDeclareDefaultStructors(IOPMPowerSource)
+       OSDeclareDefaultStructors(IOPMPowerSource);
 
        friend class IOPMPowerSourceList;
 
index f78ca2d54e19e98baf44eac2899267aa870b9236..e69663a3e842acbb213a9e4c9b1751741733cd8c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2005 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -32,7 +32,7 @@ class IOPMPowerSource;
 
 class IOPMPowerSourceList : public OSObject
 {
-       OSDeclareDefaultStructors(IOPMPowerSourceList)
+       OSDeclareDefaultStructors(IOPMPowerSourceList);
 private:
 // pointer to first power source in list
        IOPMPowerSource         *firstItem;
index 798be5d88d8f4240304b8bc9ad0fb7623306d072..1a549c0c40f917ce816d27e8a4a5d75fe5f459e9 100644 (file)
@@ -183,6 +183,7 @@ enum {
  * These are valid values for IOPM.h:IOPMCalendarStruct->selector
  */
 enum {
+    kPMCalendarTypeInvalid = 0,
     kPMCalendarTypeMaintenance = 1,
     kPMCalendarTypeSleepService = 2
 };
@@ -269,6 +270,8 @@ enum {
 #define kIOPMSleepStatisticsAppsKey             "AppStatistics"
 #define kIOPMIdleSleepPreventersKey             "IdleSleepPreventers"
 #define kIOPMSystemSleepPreventersKey           "SystemSleepPreventers"
+#define kIOPMIdleSleepPreventersWithIDKey       "IdleSleepPreventersWithID"
+#define kIOPMSystemSleepPreventersWithIDKey     "SystemSleepPreventersWithID"
 
 // Application response statistics
 #define kIOPMStatsNameKey                       "Name"
@@ -682,6 +685,84 @@ enum {
 
 #define kIOPMWakeEventSource                0x00000001
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+/*****************************************************************************
+ *
+ * AOT defs
+ *
+ *****************************************************************************/
+
+// signals the device should wake up to user space running
+#define kIOPMWakeEventAOTExit                   0x00000002
+
+// will start a 400 ms timer before sleeping
+#define kIOPMWakeEventAOTPossibleExit           0x00000004
+
+// signals the device should wake up to user space running
+#define kIOPMWakeEventAOTConfirmedPossibleExit  0x00000008
+
+// signals the device should go back to AOT
+#define kIOPMWakeEventAOTRejectedPossibleExit   0x00000010
+
+// signals the device should go back to AOT
+#define kIOPMWakeEventAOTExpiredPossibleExit    0x00000020
+
+#define kIOPMWakeEventAOTFlags \
+                                 (kIOPMWakeEventAOTExit \
+                                | kIOPMWakeEventAOTPossibleExit \
+                                | kIOPMWakeEventAOTConfirmedPossibleExit \
+                                | kIOPMWakeEventAOTRejectedPossibleExit \
+                                | kIOPMWakeEventAOTExpiredPossibleExit)
+
+#define kIOPMWakeEventAOTPossibleFlags \
+                                 (kIOPMWakeEventAOTPossibleExit \
+                                | kIOPMWakeEventAOTConfirmedPossibleExit \
+                                | kIOPMWakeEventAOTRejectedPossibleExit \
+                                | kIOPMWakeEventAOTExpiredPossibleExit)
+
+#define kIOPMWakeEventAOTPerCycleFlags \
+                                 (kIOPMWakeEventAOTPossibleExit \
+                                | kIOPMWakeEventAOTRejectedPossibleExit \
+                                | kIOPMWakeEventAOTExpiredPossibleExit)
+
+#define kIOPMWakeEventAOTExitFlags \
+                                 (kIOPMWakeEventAOTExit \
+                                | kIOPMWakeEventAOTConfirmedPossibleExit)
+
+enum {
+    kIOPMAOTModeEnable        = 0x00000001,
+    kIOPMAOTModeCycle         = 0x00000002,
+    kIOPMAOTModeAddEventFlags = 0x00000004,
+    kIOPMAOTModeRespectTimers = 0x00000008,
+    kIOPMAOTModeDefault       = (kIOPMAOTModeEnable | kIOPMAOTModeAddEventFlags | kIOPMAOTModeRespectTimers)
+};
+
+enum {
+    kIOPMAOTMetricsKernelWakeCountMax = 24
+};
+
+struct IOPMAOTMetrics
+{
+    uint32_t sleepCount;
+    uint32_t possibleCount;
+    uint32_t confirmedPossibleCount;
+    uint32_t rejectedPossibleCount;
+    uint32_t expiredPossibleCount;
+    uint32_t noTimeSetCount;
+    uint32_t rtcAlarmsCount;
+    uint32_t softwareRequestCount;
+    uint64_t totalTime;
+
+    char     kernelWakeReason[kIOPMAOTMetricsKernelWakeCountMax][64];
+    // calendar time, packed into each uint64_t as 54 bits of seconds : 10 bits of milliseconds
+    uint64_t kernelSleepTime[kIOPMAOTMetricsKernelWakeCountMax];
+    uint64_t kernelWakeTime[kIOPMAOTMetricsKernelWakeCountMax];
+};
+
+#define kIOPMAOTPowerKey    "aot-power"
+
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
 /*****************************************************************************
  *
  * System Sleep Policy
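[Note: the grouped masks above make classifying an AOT wake event a pair of bit tests. Sketch over these definitions — the flags variable is illustrative:

    bool fullWake  = (flags & kIOPMWakeEventAOTExitFlags) != 0;        // exit AOT to full wake
    bool backToAOT = (flags & (kIOPMWakeEventAOTRejectedPossibleExit
                             | kIOPMWakeEventAOTExpiredPossibleExit)) != 0;
]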
@@ -873,6 +954,7 @@ typedef struct {
 #define SWD_VALID_LOGS          0x08
 #define SWD_LOGS_IN_FILE        0x10
 #define SWD_LOGS_IN_MEM         0x20
+#define SWD_PWR_BTN_STACKSHOT   0x30
 
 #define SWD_DATA_CRC_ERROR      0x010000
 #define SWD_BUF_SIZE_ERROR      0x020000
index 6280e2ba43486fc701e0a3ff9f9ca802ef05790f..40111c0cc22b9554dcd2d2554471648ffc5d42f2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -33,7 +33,7 @@
 
 class IOPMinformee : public OSObject
 {
-       OSDeclareDefaultStructors(IOPMinformee)
+       OSDeclareDefaultStructors(IOPMinformee);
        friend class IOPMinformeeList;
 
 public:
index f941a0ad17df01456fab800f846a93c3cd9eb5d8..ae4b1211015236269cbf344ec7b7c2de3f4c7960 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -35,7 +35,7 @@ extern uint32_t gCanSleepTimeout;
 
 class IOPMinformeeList : public OSObject
 {
-       OSDeclareDefaultStructors(IOPMinformeeList)
+       OSDeclareDefaultStructors(IOPMinformeeList);
        friend class IOPMinformee;
 
 private:
index 98ebe50b5f73d374cd5d26b67ca76fdaf39af1bc..f6dfb48d07611282f80463dbe5a92ced072eac92 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -48,7 +48,7 @@
 
 class IOPowerConnection : public IOService
 {
-       OSDeclareDefaultStructors(IOPowerConnection)
+       OSDeclareDefaultStructors(IOPowerConnection);
 
 protected:
 /*! @field parentKnowsState    true: parent knows state of its domain
index ad7dcbbdf06d59a7f0620a20b0da4f38b2f70c70..c4cf72b314e1ef688daf90725de27c7d6e07216e 100644 (file)
@@ -17,7 +17,7 @@ NOT_EXPORT_HEADERS = \
        IOPMlog.h               \
        IOPMPrivate.h
 
-ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h))
 
 INSTALL_MI_LIST        = IOPMLibDefs.h IOPM.h
 INSTALL_MI_LCL_LIST = IOPMPrivate.h
index 504d8d0f2a9581f95019757bff2e16f1cdd1a75f..61334b1cda8aab3db9611f2a1505fa6f831e6575 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -34,6 +34,9 @@
 #include <sys/vnode.h>
 
 #ifdef XNU_KERNEL_PRIVATE
+
+#include <IOKit/pwr_mgt/IOPMPrivate.h>
+
 struct AggressivesRecord;
 struct IOPMMessageFilterContext;
 struct IOPMActions;
@@ -43,6 +46,7 @@ class PMTraceWorker;
 class IOPMPowerStateQueue;
 class RootDomainUserClient;
 class PMAssertionsTracker;
+class IOTimerEventSource;
 
 #define OBFUSCATE(x) (void *)VM_KERNEL_UNSLIDE_OR_PERM(x)
 
@@ -130,11 +134,11 @@ typedef IOReturn (*IOPMSettingControllerCallback)
 
 __BEGIN_DECLS
 IONotifier *    registerSleepWakeInterest(
-       IOServiceInterestHandler, void *, void * = 0);
+       IOServiceInterestHandler, void *, void * = NULL);
 
 IONotifier *    registerPrioritySleepWakeInterest(
        IOServiceInterestHandler handler,
-       void * self, void * ref = 0);
+       void * self, void * ref = NULL);
 
 IOReturn        acknowledgeSleepWakeNotification(void * );
 
@@ -145,7 +149,7 @@ __END_DECLS
 
 class IOPMrootDomain : public IOService
 {
-       OSDeclareFinalStructors(IOPMrootDomain)
+       OSDeclareFinalStructors(IOPMrootDomain);
 
 public:
        static IOPMrootDomain * construct( void );
@@ -243,7 +247,7 @@ public:
        void                                claimSystemWakeEvent( IOService     *device,
            IOOptionBits  flags,
            const char    *reason,
-           OSObject      *details = 0 );
+           OSObject      *details = NULL );
 
        virtual IOReturn    receivePowerNotification( UInt32 msg );
 
@@ -324,7 +328,7 @@ public:
        virtual IONotifier * registerInterest(
                const OSSymbol * typeOfInterest,
                IOServiceInterestHandler handler,
-               void * target, void * ref = 0 ) APPLE_KEXT_OVERRIDE;
+               void * target, void * ref = NULL ) APPLE_KEXT_OVERRIDE;
 
        virtual IOReturn    callPlatformFunction(
                const OSSymbol *functionName,
@@ -386,7 +390,11 @@ public:
  */
        IOReturn restartWithStackshot();
 
+       IOReturn    setWakeTime(uint64_t wakeContinuousTime);
+
 private:
+       unsigned long getRUN_STATE(void);
+
        virtual IOReturn    changePowerStateTo( unsigned long ordinal ) APPLE_KEXT_COMPATIBILITY_OVERRIDE;
        virtual IOReturn    changePowerStateToPriv( unsigned long ordinal );
        virtual IOReturn    requestPowerDomainState( IOPMPowerFlags, IOPowerConnection *, unsigned long ) APPLE_KEXT_OVERRIDE;
@@ -524,6 +532,10 @@ public:
        void        updatePreventSystemSleepList(
                IOService * service, bool addNotRemove );
 
+       bool        updatePreventIdleSleepListInternal(
+               IOService * service, bool addNotRemove, unsigned int oldCount);
+       unsigned int idleSleepPreventersCount();
+
        void        publishPMSetting(
                const OSSymbol * feature, uint32_t where, uint32_t * featureID );
 
@@ -549,14 +561,15 @@ public:
                uint32_t *  hibernateFreeTime );
        bool        mustHibernate( void );
 #endif
-       void        takeStackshot(bool restart, bool isOSXWatchdog, bool isSpinDump);
+       void        takeStackshot(bool restart);
        void        sleepWakeDebugTrig(bool restart);
        void        sleepWakeDebugEnableWdog();
        bool        sleepWakeDebugIsWdogEnabled();
        void        sleepWakeDebugSaveSpinDumpFile();
        bool        checkShutdownTimeout();
-       void        panicWithShutdownLog(uint32_t timeoutInMs);
+       void        panicWithShutdownLog(uint32_t timeoutInMs) __abortlike;
        uint32_t    getWatchdogTimeout();
+       void        deleteStackshot();
 
 private:
        friend class PMSettingObject;
@@ -646,6 +659,8 @@ private:
 
 // Used to wait between say display idle and system idle
        thread_call_t           extraSleepTimer;
+       thread_call_t           powerButtonDown;
+       thread_call_t           powerButtonUp;
        thread_call_t           diskSyncCalloutEntry;
        thread_call_t           fullWakeThreadCall;
        thread_call_t           updateConsoleUsersEntry;
@@ -693,6 +708,7 @@ private:
        unsigned int            wranglerTickled         :1;
        unsigned int            _preventUserActive      :1;
        unsigned int            graphicsSuppressed      :1;
+       unsigned int            isRTCAlarmWake          :1;
 
        unsigned int            capabilityLoss          :1;
        unsigned int            pciCantSleepFlag        :1;
@@ -719,6 +735,7 @@ private:
        unsigned int            displayPowerOnRequested:1;
 
        uint8_t                 tasksSuspended;
+       uint8_t                 tasksSuspendState;
        uint32_t                hibernateMode;
        AbsoluteTime            userActivityTime;
        AbsoluteTime            userActivityTime_prev;
@@ -772,6 +789,7 @@ private:
 
        UInt32                  _scheduledAlarms;
        UInt32                  _userScheduledAlarm;
+       clock_sec_t             _scheduledAlarmUTC;
 
 #if HIBERNATION
        clock_sec_t             _standbyTimerResetSeconds;
@@ -790,6 +808,39 @@ private:
        OSArray *               _systemWakeEventsArray;
        bool                    _acceptSystemWakeEvents;
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       // AOT --
+       IOPMCalendarStruct   _aotWakeTimeCalendar;
+       IOTimerEventSource * _aotTimerES;
+       clock_sec_t          _aotWakeTimeUTC;
+       uint64_t             _aotTestTime;
+       uint64_t             _aotTestInterval;
+       uint32_t             _aotPendingFlags;
+public:
+       IOPMAOTMetrics     * _aotMetrics;
+       uint8_t              _aotMode;
+private:
+       uint8_t              _aotNow;
+       uint8_t              _aotTasksSuspended;
+       uint8_t              _aotExit;
+       uint8_t              _aotTimerScheduled;
+       uint8_t              _aotReadyToFullWake;
+       uint64_t             _aotLastWakeTime;
+       uint64_t             _aotWakeTimeContinuous;
+       uint64_t             _aotWakePreWindow;
+       uint64_t             _aotWakePostWindow;
+       uint64_t             _aotLingerTime;
+
+       bool        aotShouldExit(bool checkTimeSet, bool software);
+       void        aotExit(bool cps);
+       void        aotEvaluate(IOTimerEventSource * timer);
+public:
+       bool        isAOTMode(void);
+private:
+       // -- AOT
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
+       void        updateTasksSuspend(void);
        int         findSuspendedPID(uint32_t pid, uint32_t *outRefCount);
 
 // IOPMrootDomain internal sleep call
@@ -807,6 +858,7 @@ private:
        void        restoreUserSpinDownTimeout( void );
 
        bool        shouldSleepOnClamshellClosed(void );
+       bool        shouldSleepOnRTCAlarmWake(void );
        void        sendClientClamshellNotification( void );
 
 // Inform PMCPU of changes to state like lid, AC vs. battery
@@ -874,13 +926,14 @@ private:
        void        preventTransitionToUserActive( bool prevent );
        void        setThermalState(OSObject *value);
        void        copySleepPreventersList(OSArray  **idleSleepList, OSArray  **systemSleepList);
+       void        copySleepPreventersListWithID(OSArray  **idleSleepList, OSArray  **systemSleepList);
 #endif /* XNU_KERNEL_PRIVATE */
 };
 
 #ifdef XNU_KERNEL_PRIVATE
 class IORootParent : public IOService
 {
-       OSDeclareFinalStructors(IORootParent)
+       OSDeclareFinalStructors(IORootParent);
 
 public:
        static void initialize( void );
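[Note: of the AOT state added to IOPMrootDomain, only isAOTMode(), setWakeTime(), _aotMode and _aotMetrics are public. A hedged sketch of scheduling an AOT wake — the interval is illustrative; the parameter name indicates the mach_continuous_time() timebase:

    if (rootDomain->isAOTMode()) {
        rootDomain->setWakeTime(mach_continuous_time() + intervalAbs);
    }
]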
index 5a73917ec554e6c66ffe94a5c160dd9bfacb835b..876210ee1832885055513738098a2fb61c45acfb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2017 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -38,7 +38,7 @@ typedef void (*RTC_tick_handler)( IOService * );
 
 class IORTCController : public IOService
 {
-       OSDeclareAbstractStructors(IORTCController)
+       OSDeclareAbstractStructors(IORTCController);
 
 public:
 
index 587476354bb25404a7a4c05064d615c260154c36..19d48faea90b41e848fba21862102a86d24fc4c5 100644 (file)
@@ -13,7 +13,7 @@ include $(MakeInc_def)
 MI_DIR = rtc
 NOT_EXPORT_HEADERS =
 
-ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h))
 
 INSTALL_MI_LIST        =
 INSTALL_MI_LCL_LIST =
index 715cf2d98734350442a835bd7e9720a2a32db378..fc81916b411b1f5838af12318f00bc1948bc2d84 100644 (file)
@@ -13,7 +13,7 @@ include $(MakeInc_def)
 MI_DIR = system_management
 NOT_EXPORT_HEADERS =
 
-ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h))
 
 INSTALL_MI_LIST        =
 INSTALL_MI_LCL_LIST =
diff --git a/iokit/IOKitUser/IOBlockStorageDevice.h b/iokit/IOKitUser/IOBlockStorageDevice.h
new file mode 100644 (file)
index 0000000..0be0d68
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IOBlockStorageDevice.h>
diff --git a/iokit/IOKitUser/IOBufferMemoryDescriptor.h b/iokit/IOKitUser/IOBufferMemoryDescriptor.h
new file mode 100644 (file)
index 0000000..3573356
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IOBufferMemoryDescriptor.h>
diff --git a/iokit/IOKitUser/IODataQueueDispatchSource.h b/iokit/IOKitUser/IODataQueueDispatchSource.h
new file mode 100644 (file)
index 0000000..4c7d36f
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IODataQueueDispatchSource.h>
diff --git a/iokit/IOKitUser/IODispatchQueue.h b/iokit/IOKitUser/IODispatchQueue.h
new file mode 100644 (file)
index 0000000..05d4ff6
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IODispatchQueue.h>
diff --git a/iokit/IOKitUser/IODispatchSource.h b/iokit/IOKitUser/IODispatchSource.h
new file mode 100644 (file)
index 0000000..9a9d06c
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IODispatchSource.h>
diff --git a/iokit/IOKitUser/IOInterruptDispatchSource.h b/iokit/IOKitUser/IOInterruptDispatchSource.h
new file mode 100644 (file)
index 0000000..b27409e
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IOInterruptDispatchSource.h>
diff --git a/iokit/IOKitUser/IOMemoryDescriptor.h b/iokit/IOKitUser/IOMemoryDescriptor.h
new file mode 100644 (file)
index 0000000..62cddfb
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IOMemoryDescriptor.h>
diff --git a/iokit/IOKitUser/IOMemoryMap.h b/iokit/IOKitUser/IOMemoryMap.h
new file mode 100644 (file)
index 0000000..56fe092
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IOMemoryMap.h>
diff --git a/iokit/IOKitUser/IOService.h b/iokit/IOKitUser/IOService.h
new file mode 100644 (file)
index 0000000..ec9ad1b
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IOService.h>
diff --git a/iokit/IOKitUser/IOTimerDispatchSource.h b/iokit/IOKitUser/IOTimerDispatchSource.h
new file mode 100644 (file)
index 0000000..7b0634d
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IOTimerDispatchSource.h>
diff --git a/iokit/IOKitUser/IOUserServer.h b/iokit/IOKitUser/IOUserServer.h
new file mode 100644 (file)
index 0000000..7c184bb
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/IOUserServer.h>
diff --git a/iokit/IOKitUser/Makefile b/iokit/IOKitUser/Makefile
new file mode 100644 (file)
index 0000000..afafc2e
--- /dev/null
@@ -0,0 +1,17 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+ALL_HEADERS = $(notdir $(wildcard $(SOURCE)*.h))
+
+EXPORT_MI_DIR = IOKitUser
+INSTALL_MI_DIR = IOKitUser
+
+INSTALL_KF_MI_LIST = $(ALL_HEADERS)
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
diff --git a/iokit/IOKitUser/OSAction.h b/iokit/IOKitUser/OSAction.h
new file mode 100644 (file)
index 0000000..9568834
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSAction.h>
diff --git a/iokit/IOKitUser/OSArray.h b/iokit/IOKitUser/OSArray.h
new file mode 100644 (file)
index 0000000..9bcf3f9
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSArray.h>
diff --git a/iokit/IOKitUser/OSBoolean.h b/iokit/IOKitUser/OSBoolean.h
new file mode 100644 (file)
index 0000000..885b393
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSBoolean.h>
diff --git a/iokit/IOKitUser/OSCollection.h b/iokit/IOKitUser/OSCollection.h
new file mode 100644 (file)
index 0000000..9e842c6
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSCollection.h>
diff --git a/iokit/IOKitUser/OSContainer.h b/iokit/IOKitUser/OSContainer.h
new file mode 100644 (file)
index 0000000..b36a0ce
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSContainer.h>
diff --git a/iokit/IOKitUser/OSData.h b/iokit/IOKitUser/OSData.h
new file mode 100644 (file)
index 0000000..bebb52d
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSData.h>
diff --git a/iokit/IOKitUser/OSDictionary.h b/iokit/IOKitUser/OSDictionary.h
new file mode 100644 (file)
index 0000000..53cc653
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSDictionary.h>
diff --git a/iokit/IOKitUser/OSNumber.h b/iokit/IOKitUser/OSNumber.h
new file mode 100644 (file)
index 0000000..377405c
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSNumber.h>
diff --git a/iokit/IOKitUser/OSObject.h b/iokit/IOKitUser/OSObject.h
new file mode 100644 (file)
index 0000000..4e58154
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSObject.h>
diff --git a/iokit/IOKitUser/OSSerialization.h b/iokit/IOKitUser/OSSerialization.h
new file mode 100644 (file)
index 0000000..6635145
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSSerialization.h>
diff --git a/iokit/IOKitUser/OSString.h b/iokit/IOKitUser/OSString.h
new file mode 100644 (file)
index 0000000..0944070
--- /dev/null
@@ -0,0 +1 @@
+#include <DriverKit/OSString.h>
index 91aeca2a87e2245ccfe7609248107d2056290344..dfe4b08ba5674b3ce8ea7786f163c73a05ffad05 100644 (file)
@@ -111,6 +111,63 @@ IOBufferMemoryDescriptor::initWithOptions(
 }
 #endif /* !__LP64__ */
 
+IOBufferMemoryDescriptor *
+IOBufferMemoryDescriptor::withCopy(
+       task_t                inTask,
+       IOOptionBits      options,
+       vm_map_t              sourceMap,
+       mach_vm_address_t source,
+       mach_vm_size_t    size)
+{
+       IOBufferMemoryDescriptor * inst;
+       kern_return_t              err;
+       vm_map_copy_t              copy;
+       vm_map_address_t           address;
+
+       copy = NULL;
+       do {
+               err = kIOReturnNoMemory;
+               inst = new IOBufferMemoryDescriptor;
+               if (!inst) {
+                       break;
+               }
+               inst->_ranges.v64 = IONew(IOAddressRange, 1);
+               if (!inst->_ranges.v64) {
+                       break;
+               }
+
+               err = vm_map_copyin(sourceMap, source, size,
+                   false /* src_destroy */, &copy);
+               if (KERN_SUCCESS != err) {
+                       break;
+               }
+
+               err = vm_map_copyout(get_task_map(inTask), &address, copy);
+               if (KERN_SUCCESS != err) {
+                       break;
+               }
+               copy = NULL;
+
+               inst->_ranges.v64->address = address;
+               inst->_ranges.v64->length  = size;
+
+               if (!inst->initWithPhysicalMask(inTask, options, size, page_size, 0)) {
+                       err = kIOReturnError;
+               }
+       } while (false);
+
+       if (KERN_SUCCESS == err) {
+               return inst;
+       }
+
+       if (copy) {
+               vm_map_copy_discard(copy);
+       }
+       OSSafeReleaseNULL(inst);
+       return NULL;
+}
+
+
 bool
 IOBufferMemoryDescriptor::initWithPhysicalMask(
        task_t            inTask,
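[Note: withCopy() snapshots a range out of sourceMap into inTask's map via vm_map_copyin/vm_map_copyout, then pre-seeds _ranges.v64 so that initWithPhysicalMask (reworked below) adopts the copied pages instead of allocating; per the checks added below, that path requires kIOMemoryPageable and a non-NULL task, and the error path discards the vm copy and releases the half-built descriptor. Hedged usage sketch, with task and range variables illustrative:

    IOBufferMemoryDescriptor * bmd = IOBufferMemoryDescriptor::withCopy(
            targetTask,
            kIOMemoryPageable | kIODirectionOutIn,
            get_task_map(sourceTask), sourceAddr, sourceLen);
    if (bmd == NULL) {
        return kIOReturnNoMemory;
    }
]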
@@ -125,6 +182,7 @@ IOBufferMemoryDescriptor::initWithPhysicalMask(
        IOOptionBits          iomdOptions = kIOMemoryTypeVirtual64 | kIOMemoryAsReference;
        IODMAMapSpecification mapSpec;
        bool                  mapped = false;
+       bool                  withCopy = false;
        bool                  needZero;
 
        if (!capacity) {
@@ -135,14 +193,28 @@ IOBufferMemoryDescriptor::initWithPhysicalMask(
        _capacity         = capacity;
        _internalFlags    = 0;
        _internalReserved = 0;
-       _buffer           = 0;
+       _buffer           = NULL;
 
-       _ranges.v64 = IONew(IOAddressRange, 1);
        if (!_ranges.v64) {
-               return false;
+               _ranges.v64 = IONew(IOAddressRange, 1);
+               if (!_ranges.v64) {
+                       return false;
+               }
+               _ranges.v64->address = 0;
+               _ranges.v64->length  = 0;
+       } else {
+               if (!_ranges.v64->address) {
+                       return false;
+               }
+               if (!(kIOMemoryPageable & options)) {
+                       return false;
+               }
+               if (!inTask) {
+                       return false;
+               }
+               _buffer = (void *) _ranges.v64->address;
+               withCopy = true;
        }
-       _ranges.v64->address = 0;
-       _ranges.v64->length  = 0;
        //  make sure super::free doesn't dealloc _ranges before super::init
        _flags = kIOMemoryAsReference;
 
@@ -151,7 +223,7 @@ IOBufferMemoryDescriptor::initWithPhysicalMask(
 
        if (!(kIOMemoryMapperNone & options)) {
                IOMapper::checkForSystemMapper();
-               mapped = (0 != IOMapper::gSystem);
+               mapped = (NULL != IOMapper::gSystem);
        }
        needZero = (mapped || (0 != (kIOMemorySharingTypeMask & options)));
 
@@ -261,13 +333,17 @@ IOBufferMemoryDescriptor::initWithPhysicalMask(
                vm_size_t       size = round_page(capacity);
 
                // initWithOptions will create memory entry
-               iomdOptions |= kIOMemoryPersistent;
+               if (!withCopy) {
+                       iomdOptions |= kIOMemoryPersistent;
+               }
 
                if (options & kIOMemoryPageable) {
 #if IOALLOCDEBUG
                        OSAddAtomicLong(size, &debug_iomallocpageable_size);
 #endif
-                       mapTask = inTask;
+                       if (!withCopy) {
+                               mapTask = inTask;
+                       }
                        if (NULL == inTask) {
                                inTask = kernel_task;
                        }
@@ -284,11 +360,11 @@ IOBufferMemoryDescriptor::initWithPhysicalMask(
                }
        }
 
-       _ranges.v64->address = (mach_vm_address_t) _buffer;;
+       _ranges.v64->address = (mach_vm_address_t) _buffer;
        _ranges.v64->length  = _capacity;
 
        if (!super::initWithOptions(_ranges.v64, 1, 0,
-           inTask, iomdOptions, /* System mapper */ 0)) {
+           inTask, iomdOptions, /* System mapper */ NULL)) {
                return false;
        }
 
@@ -315,7 +391,7 @@ IOBufferMemoryDescriptor::initWithPhysicalMask(
                reserved->map = createMappingInTask(mapTask, 0,
                    kIOMapAnywhere | (options & kIOMapPrefault) | (options & kIOMapCacheMask), 0, 0);
                if (!reserved->map) {
-                       _buffer = 0;
+                       _buffer = NULL;
                        return false;
                }
                release();  // map took a retain on this
@@ -344,7 +420,7 @@ IOBufferMemoryDescriptor::inTaskWithOptions(
 
        if (me && !me->initWithPhysicalMask(inTask, options, capacity, alignment, 0)) {
                me->release();
-               me = 0;
+               me = NULL;
        }
        return me;
 }
@@ -360,7 +436,7 @@ IOBufferMemoryDescriptor::inTaskWithPhysicalMask(
 
        if (me && !me->initWithPhysicalMask(inTask, options, capacity, 1, physicalMask)) {
                me->release();
-               me = 0;
+               me = NULL;
        }
        return me;
 }
@@ -386,7 +462,7 @@ IOBufferMemoryDescriptor::withOptions(
 
        if (me && !me->initWithPhysicalMask(kernel_task, options, capacity, alignment, 0)) {
                me->release();
-               me = 0;
+               me = NULL;
        }
        return me;
 }
@@ -458,7 +534,7 @@ IOBufferMemoryDescriptor::withBytes(const void * inBytes,
                    | (inContiguous ? kIOMemoryPhysicallyContiguous : 0),
                    inLength, inLength, 0 )) {
                me->release();
-               me = 0;
+               me = NULL;
        }
 
        if (me) {
@@ -467,7 +543,7 @@ IOBufferMemoryDescriptor::withBytes(const void * inBytes,
 
                if (!me->appendBytes(inBytes, inLength)) {
                        me->release();
-                       me = 0;
+                       me = NULL;
                }
        }
        return me;
@@ -488,7 +564,7 @@ IOBufferMemoryDescriptor::free()
        IOOptionBits     options   = _options;
        vm_size_t        size      = _capacity;
        void *           buffer    = _buffer;
-       IOMemoryMap *    map       = 0;
+       IOMemoryMap *    map       = NULL;
        IOAddressRange * range     = _ranges.v64;
        vm_offset_t      alignment = _alignment;
 
@@ -653,7 +729,7 @@ IOBufferMemoryDescriptor::getBytesNoCopy(vm_size_t start, vm_size_t withLength)
        IOVirtualAddress address;
 
        if ((start + withLength) < start) {
-               return 0;
+               return NULL;
        }
 
        if (kIOMemoryTypePhysical64 == (_flags & kIOMemoryTypeMask)) {
@@ -665,7 +741,7 @@ IOBufferMemoryDescriptor::getBytesNoCopy(vm_size_t start, vm_size_t withLength)
        if (start < _length && (start + withLength) <= _length) {
                return (void *)(address + start);
        }
-       return 0;
+       return NULL;
 }
 
 #ifndef __LP64__
index 88ac5d1ff974197f0c4d9edcabc0ed2a363fa7c7..84b9cedec6e08488bcebe1eb2d735f655b1bd1ec 100644 (file)
@@ -44,6 +44,7 @@ extern void kperf_kernel_configure(char *);
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 #include <kern/queue.h>
+#include <kern/sched_prim.h>
 
 extern "C" void console_suspend();
 extern "C" void console_resume();
@@ -149,6 +150,7 @@ iocpu_run_platform_actions(queue_head_t * queue, uint32_t first_priority, uint32
 extern "C" kern_return_t
 IOCPURunPlatformQuiesceActions(void)
 {
+       assert(preemption_enabled() == false);
        return iocpu_run_platform_actions(&gActionQueues[kQueueQuiesce], 0, 0U - 1,
                   NULL, NULL, NULL, TRUE);
 }
@@ -156,6 +158,7 @@ IOCPURunPlatformQuiesceActions(void)
 extern "C" kern_return_t
 IOCPURunPlatformActiveActions(void)
 {
+       assert(preemption_enabled() == false);
        return iocpu_run_platform_actions(&gActionQueues[kQueueActive], 0, 0U - 1,
                   NULL, NULL, NULL, TRUE);
 }
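
The two asserts added above encode a scheduling contract: the quiesce and active platform-action queues may only be run with preemption disabled (hence the new <kern/sched_prim.h> include for preemption_enabled()). A minimal caller-side sketch, assuming only standard xnu helpers; the real call sites sit deep in the sleep/wake path:

    boolean_t istate = ml_set_interrupts_enabled(FALSE); // interrupts masked => preemption_enabled() is false
    (void) IOCPURunPlatformQuiesceActions();             // runs the kQueueQuiesce actions un-preemptibly
    ml_set_interrupts_enabled(istate);                   // restore the previous interrupt state
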
@@ -426,7 +429,7 @@ PE_cpu_machine_quiesce(cpu_id_t target)
 }
 
 #if defined(__arm__) || defined(__arm64__)
-static perfmon_interrupt_handler_func pmi_handler = 0;
+static perfmon_interrupt_handler_func pmi_handler = NULL;
 
 kern_return_t
 PE_cpu_perfmon_interrupt_install_handler(perfmon_interrupt_handler_func handler)
@@ -446,7 +449,7 @@ PE_cpu_perfmon_interrupt_enable(cpu_id_t target, boolean_t enable)
        }
 
        if (enable) {
-               targetCPU->getProvider()->registerInterrupt(1, targetCPU, (IOInterruptAction)pmi_handler, 0);
+               targetCPU->getProvider()->registerInterrupt(1, targetCPU, (IOInterruptAction)pmi_handler, NULL);
                targetCPU->getProvider()->enableInterrupt(1);
        } else {
                targetCPU->getProvider()->disableInterrupt(1);
@@ -495,7 +498,7 @@ IOCPUSleepKernel(void)
        iter = IORegistryIterator::iterateOver( gIOServicePlane,
            kIORegistryIterateRecursively );
        if (iter) {
-               all = 0;
+               all = NULL;
                do{
                        if (all) {
                                all->release();
@@ -525,6 +528,18 @@ IOCPUSleepKernel(void)
        currentShutdownTarget = NULL;
 #endif
 
+       integer_t old_pri;
+       thread_t self = current_thread();
+
+       /*
+        * We need to boost this thread's priority to the maximum kernel priority to
+        * ensure we can urgently preempt ANY thread currently executing on the
+        * target CPU.  Note that realtime threads have their own mechanism to eventually
+        * demote their priority below MAXPRI_KERNEL if they hog the CPU for too long.
+        */
+       old_pri = thread_kern_get_pri(self);
+       thread_kern_set_pri(self, thread_kern_get_kernel_maxpri());
+
        // Sleep the CPUs.
        cnt = numCPUs;
        while (cnt--) {
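
The boost described in the comment above is paired with a restore at the end of the function (the thread_kern_set_pri(self, old_pri) hunk further down). As a standalone sketch of the pattern, using the same kernel-private accessors this change introduces:

    thread_t self = current_thread();
    integer_t old_pri = thread_kern_get_pri(self);               // remember the current priority
    thread_kern_set_pri(self, thread_kern_get_kernel_maxpri());  // boost to the maximum kernel priority
    /* ... urgent, non-preemptible work: halting each secondary CPU ... */
    thread_kern_set_pri(self, old_pri);                          // restore on the way out
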
@@ -551,9 +566,18 @@ IOCPUSleepKernel(void)
        rootDomain->tracePoint( kIOPMTracePointSleepPlatformDriver );
        rootDomain->stop_watchdog_timer();
 
-       // Now sleep the boot CPU.
+       /*
+        * Now sleep the boot CPU, including calling the kQueueQuiesce actions.
+        * The system sleeps here.
+        */
+
        bootCPU->haltCPU();
 
+       /*
+        * The system is now coming back from sleep on the boot CPU.
+        * The kQueueActive actions have already been called.
+        */
+
        rootDomain->start_watchdog_timer();
        rootDomain->tracePoint( kIOPMTracePointWakePlatformActions );
 
@@ -592,6 +616,8 @@ IOCPUSleepKernel(void)
 #if defined(__arm64__)
        sched_restore_recommended_cores_after_sleep();
 #endif
+
+       thread_kern_set_pri(self, old_pri);
 }
 
 bool
@@ -639,6 +665,18 @@ IOCPU::start(IOService *provider)
        return true;
 }
 
+void
+IOCPU::detach(IOService *provider)
+{
+       super::detach(provider);
+       IOLockLock(gIOCPUsLock);
+       unsigned int index = gIOCPUs->getNextIndexOfObject(this, 0);
+       if (index != (unsigned int)-1) {
+               gIOCPUs->removeObject(index);
+       }
+       IOLockUnlock(gIOCPUsLock);
+}
+
 OSObject *
 IOCPU::getProperty(const OSSymbol *aKey) const
 {
@@ -680,12 +718,12 @@ IOCPU::setProperties(OSObject *properties)
        OSString     *stateStr;
        IOReturn     result;
 
-       if (dict == 0) {
+       if (dict == NULL) {
                return kIOReturnUnsupported;
        }
 
        stateStr = OSDynamicCast(OSString, dict->getObject(gIOCPUStateKey));
-       if (stateStr != 0) {
+       if (stateStr != NULL) {
                result = IOUserClient::clientHasPrivilege(current_task(), kIOClientPrivilegeAdministrator);
                if (result != kIOReturnSuccess) {
                        return result;
@@ -809,7 +847,7 @@ IOCPUInterruptController::initCPUInterruptController(int sources, int cpus)
        numCPUs = cpus;
 
        vectors = (IOInterruptVector *)IOMalloc(numSources * sizeof(IOInterruptVector));
-       if (vectors == 0) {
+       if (vectors == NULL) {
                return kIOReturnNoMemory;
        }
        bzero(vectors, numSources * sizeof(IOInterruptVector));
@@ -863,8 +901,8 @@ IOCPUInterruptController::setCPUInterruptProperties(IOService *service)
        OSData       *tmpData;
        long         tmpLong;
 
-       if ((service->getProperty(gIOInterruptControllersKey) != 0) &&
-           (service->getProperty(gIOInterruptSpecifiersKey) != 0)) {
+       if ((service->getProperty(gIOInterruptControllersKey) != NULL) &&
+           (service->getProperty(gIOInterruptSpecifiersKey) != NULL)) {
                return;
        }
 
@@ -899,7 +937,7 @@ IOCPUInterruptController::enableCPUInterrupt(IOCPU *cpu)
 
        assert(numCPUs > 0);
 
-       ml_install_interrupt_handler(cpu, cpu->getCPUNumber(), this, handler, 0);
+       ml_install_interrupt_handler(cpu, cpu->getCPUNumber(), this, handler, NULL);
 
        IOTakeLock(vectors[0].interruptLock);
        ++enabledCPUs;
@@ -920,6 +958,9 @@ IOCPUInterruptController::registerInterrupt(IOService *nub,
 {
        IOInterruptVector *vector;
 
+       // Interrupts must be enabled, as this can allocate memory.
+       assert(ml_get_interrupts_enabled() == TRUE);
+
        if (source >= numSources) {
                return kIOReturnNoResources;
        }
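
The assert documents why registration has to happen in a schedulable context: the shared-controller path below may IOMalloc and block. A caller-side sketch; nub, target and handler are assumptions standing in for a real driver's objects:

    // Register from a thread context, never from a primary interrupt handler.
    IOReturn ret = nub->registerInterrupt(0 /* source */, target, handler, NULL /* refCon */);
    if (ret != kIOReturnSuccess) {
        // registration failed; no vector state was left behind
    }
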
@@ -966,7 +1007,7 @@ IOCPUInterruptController::getInterruptType(IOService */*nub*/,
     int /*source*/,
     int *interruptType)
 {
-       if (interruptType == 0) {
+       if (interruptType == NULL) {
                return kIOReturnBadArgument;
        }
 
index 814494af2b824c759798c1c1bd92133ed6a1a281..7c0201e4df7c2381f379c5e62bde771231810414 100644
@@ -68,7 +68,8 @@ IOCatalogue    * gIOCatalogue;
 const OSSymbol * gIOClassKey;
 const OSSymbol * gIOProbeScoreKey;
 const OSSymbol * gIOModuleIdentifierKey;
-IORWLock         * gIOCatalogLock;
+const OSSymbol * gIOModuleIdentifierKernelKey;
+IORWLock       * gIOCatalogLock;
 
 #if PRAGMA_MARK
 #pragma mark Utility functions
@@ -105,9 +106,11 @@ IOCatalogue::initialize(void)
                errorString->release();
        }
 
-       gIOClassKey              = OSSymbol::withCStringNoCopy( kIOClassKey );
-       gIOProbeScoreKey         = OSSymbol::withCStringNoCopy( kIOProbeScoreKey );
-       gIOModuleIdentifierKey   = OSSymbol::withCStringNoCopy( kCFBundleIdentifierKey );
+       gIOClassKey                  = OSSymbol::withCStringNoCopy( kIOClassKey );
+       gIOProbeScoreKey             = OSSymbol::withCStringNoCopy( kIOProbeScoreKey );
+       gIOModuleIdentifierKey       = OSSymbol::withCStringNoCopy( kCFBundleIdentifierKey );
+       gIOModuleIdentifierKernelKey = OSSymbol::withCStringNoCopy( kCFBundleIdentifierKernelKey );
+
 
        assert( array && gIOClassKey && gIOProbeScoreKey
            && gIOModuleIdentifierKey);
@@ -129,7 +132,7 @@ IOCatalogue::arrayForPersonality(OSDictionary * dict)
 
        sym = OSDynamicCast(OSSymbol, dict->getObject(gIOProviderClassKey));
        if (!sym) {
-               return 0;
+               return NULL;
        }
 
        return (OSArray *) personalities->getObject(sym);
@@ -178,7 +181,7 @@ IOCatalogue::init(OSArray * initArray)
                        continue;
                }
                OSKext::uniquePersonalityProperties(dict);
-               if (0 == dict->getObject( gIOClassKey )) {
+               if (NULL == dict->getObject( gIOClassKey )) {
                        IOLog("Missing or bad \"%s\" key\n",
                            gIOClassKey->getCStringNoCopy());
                        continue;
@@ -219,7 +222,7 @@ IOCatalogue::findDrivers(
        set = OSOrderedSet::withCapacity( 1, IOServiceOrdering,
            (void *)gIOProbeScoreKey );
        if (!set) {
-               return 0;
+               return NULL;
        }
 
        IORWLockRead(lock);
@@ -265,12 +268,12 @@ IOCatalogue::findDrivers(
        set = OSOrderedSet::withCapacity( 1, IOServiceOrdering,
            (void *)gIOProbeScoreKey );
        if (!set) {
-               return 0;
+               return NULL;
        }
        iter = OSCollectionIterator::withCollection(personalities);
        if (!iter) {
                set->release();
-               return 0;
+               return NULL;
        }
 
        IORWLockRead(lock);
@@ -474,33 +477,17 @@ IOCatalogue::getGenerationCount(void) const
        return generation;
 }
 
+// Check to see if the kernel module has been loaded already, and request its load.
 bool
-IOCatalogue::isModuleLoaded(OSString * moduleName) const
+IOCatalogue::isModuleLoaded(OSDictionary * driver, OSObject ** kextRef) const
 {
-       return isModuleLoaded(moduleName->getCStringNoCopy());
-}
+       OSString * moduleName = NULL;
+       OSString * publisherName = NULL;
+       OSReturn   ret;
 
-bool
-IOCatalogue::isModuleLoaded(const char * moduleName) const
-{
-       OSReturn ret;
-       ret = OSKext::loadKextWithIdentifier(moduleName);
-       if (kOSKextReturnDeferred == ret) {
-               // a request has been queued but the module isn't necessarily
-               // loaded yet, so stall.
-               return false;
+       if (kextRef) {
+               *kextRef = NULL;
        }
-       // module is present or never will be
-       return true;
-}
-
-// Check to see if module has been loaded already.
-bool
-IOCatalogue::isModuleLoaded(OSDictionary * driver) const
-{
-       OSString             * moduleName = NULL;
-       OSString             * publisherName = NULL;
-
        if (!driver) {
                return false;
        }
@@ -515,12 +502,25 @@ IOCatalogue::isModuleLoaded(OSDictionary * driver) const
            driver->getObject(kIOPersonalityPublisherKey));
        OSKext::recordIdentifierRequest(publisherName);
 
-       moduleName = OSDynamicCast(OSString, driver->getObject(gIOModuleIdentifierKey));
+       moduleName = OSDynamicCast(OSString, driver->getObject(gIOModuleIdentifierKernelKey));
        if (moduleName) {
-               return isModuleLoaded(moduleName);
+               ret = OSKext::loadKextWithIdentifier(moduleName, kextRef);
+               if (kOSKextReturnDeferred == ret) {
+                       // a request has been queued but the module isn't necessarily
+                       // loaded yet, so stall.
+                       return false;
+               }
+               OSString *moduleDextName = OSDynamicCast(OSString, driver->getObject(gIOModuleIdentifierKey));
+               if (moduleDextName && !(moduleName->isEqualTo(moduleDextName))) {
+                       OSObject *dextRef = NULL;
+                       ret = OSKext::loadKextWithIdentifier(moduleDextName, &dextRef);
+                       OSSafeReleaseNULL(dextRef);
+               }
+               // module is present or never will be
+               return true;
        }
 
-       /* If a personality doesn't hold the "CFBundleIdentifier" key
+       /* If a personality doesn't hold the "CFBundleIdentifier" or "CFBundleIdentifierKernel" key
         * it is assumed to be an "in-kernel" driver.
         */
        return true;
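
The rewritten isModuleLoaded() implements a dual-identifier rule: CFBundleIdentifierKernel names the kext that must actually be loaded (or stalled on while its load is deferred), while a differing CFBundleIdentifier names a companion dext whose load is requested without waiting. A sketch of the discrimination step, against a hypothetical personality dictionary driver:

    OSString * kernelID = OSDynamicCast(OSString, driver->getObject(gIOModuleIdentifierKernelKey));
    OSString * dextID   = OSDynamicCast(OSString, driver->getObject(gIOModuleIdentifierKey));
    // Same identifier (or no dext identifier): plain kext. Different: dext with a kernel-side shim.
    bool hasCompanionDext = (kernelID && dextID && !dextID->isEqualTo(kernelID));
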
@@ -531,14 +531,9 @@ IOCatalogue::isModuleLoaded(OSDictionary * driver) const
  * IOCatalogueModuleLoaded(). Sent from kextd.
  */
 void
-IOCatalogue::moduleHasLoaded(OSString * moduleName)
+IOCatalogue::moduleHasLoaded(const OSSymbol * moduleName)
 {
-       OSDictionary * dict;
-
-       dict = OSDictionary::withCapacity(2);
-       dict->setObject(gIOModuleIdentifierKey, moduleName);
-       startMatching(dict);
-       dict->release();
+       startMatching(moduleName);
 
        (void) OSKext::setDeferredLoadSucceeded();
        (void) OSKext::considerRebuildOfPrelinkedKernel();
@@ -547,9 +542,9 @@ IOCatalogue::moduleHasLoaded(OSString * moduleName)
 void
 IOCatalogue::moduleHasLoaded(const char * moduleName)
 {
-       OSString * name;
+       const OSSymbol * name;
 
-       name = OSString::withCString(moduleName);
+       name = OSSymbol::withCString(moduleName);
        moduleHasLoaded(name);
        name->release();
 }
@@ -574,7 +569,7 @@ IOCatalogue::_terminateDrivers(OSDictionary * matching)
        }
 
        ret = kIOReturnSuccess;
-       dict = 0;
+       dict = NULL;
        iter = IORegistryIterator::iterateOver(gIOServicePlane,
            kIORegistryIterateRecursively);
        if (!iter) {
@@ -741,15 +736,11 @@ IOCatalogue::terminateDriversForModule(
        return ret;
 }
 
+#if defined(__i386__) || defined(__x86_64__)
 bool
 IOCatalogue::startMatching( OSDictionary * matching )
 {
-       OSCollectionIterator * iter;
-       OSDictionary         * dict;
        OSOrderedSet         * set;
-       OSArray              * array;
-       const OSSymbol *       key;
-       unsigned int           idx;
 
        if (!matching) {
                return false;
@@ -761,27 +752,71 @@ IOCatalogue::startMatching( OSDictionary * matching )
                return false;
        }
 
-       iter = OSCollectionIterator::withCollection(personalities);
-       if (!iter) {
-               set->release();
+       IORWLockRead(lock);
+
+       personalities->iterateObjects(^bool (const OSSymbol * key, OSObject * value) {
+               OSArray      * array;
+               OSDictionary * dict;
+               unsigned int   idx;
+
+               array = (OSArray *) value;
+               for (idx = 0; (dict = (OSDictionary *) array->getObject(idx)); idx++) {
+                       /* This comparison must be done with only the keys in the
+                        * "matching" dict to enable general matching.
+                        */
+                       if (dict->isEqualTo(matching, matching)) {
+                               set->setObject(dict);
+                       }
+               }
+               return false;
+       });
+
+       // Start device matching.
+       if (set->getCount() > 0) {
+               IOService::catalogNewDrivers(set);
+               generation++;
+       }
+
+       IORWLockUnlock(lock);
+
+       set->release();
+
+       return true;
+}
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
+bool
+IOCatalogue::startMatching( const OSSymbol * moduleName )
+{
+       OSOrderedSet         * set;
+
+       if (!moduleName) {
+               return false;
+       }
+
+       set = OSOrderedSet::withCapacity(10, IOServiceOrdering,
+           (void *)gIOProbeScoreKey);
+       if (!set) {
                return false;
        }
 
        IORWLockRead(lock);
 
-       while ((key = (const OSSymbol *) iter->getNextObject())) {
-               array = (OSArray *) personalities->getObject(key);
-               if (array) {
-                       for (idx = 0; (dict = (OSDictionary *) array->getObject(idx)); idx++) {
-                               /* This comparison must be done with only the keys in the
-                                * "matching" dict to enable general matching.
-                                */
-                               if (dict->isEqualTo(matching, matching)) {
-                                       set->setObject(dict);
-                               }
+       personalities->iterateObjects(^bool (const OSSymbol * key, OSObject * value) {
+               OSArray      * array;
+               OSDictionary * dict;
+               OSObject     * obj;
+               unsigned int   idx;
+
+               array = (OSArray *) value;
+               for (idx = 0; (dict = (OSDictionary *) array->getObject(idx)); idx++) {
+                       obj = dict->getObject(gIOModuleIdentifierKernelKey);
+                       if (obj && moduleName->isEqualTo(obj)) {
+                               set->setObject(dict);
                        }
                }
-       }
+               return false;
+       });
 
        // Start device matching.
        if (set->getCount() > 0) {
@@ -792,7 +827,6 @@ IOCatalogue::startMatching( OSDictionary * matching )
        IORWLockUnlock(lock);
 
        set->release();
-       iter->release();
 
        return true;
 }
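
Both startMatching() variants now walk the personalities dictionary with the block-based iterateObjects() instead of allocating an OSCollectionIterator; as used here, returning false from the block means keep scanning, so every personality array is visited. A usage sketch of the notification path that feeds the OSSymbol overload, with a hypothetical bundle identifier:

    // kextd-style load notification; the const char * wrapper above
    // interns the name as an OSSymbol and calls startMatching() with it.
    gIOCatalogue->moduleHasLoaded("com.example.MyDriver");
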
index 265b2f7860725434aa52bc57b0b58e176486e637..1ae9bcf0880b1b9610128e2a300943147e57de8a 100644
@@ -86,7 +86,7 @@ IOCommandGate::commandGate(OSObject *inOwner, Action inAction)
 
        if (me && !me->init(inOwner, inAction)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -117,7 +117,7 @@ IOCommandGate::enable()
 IOCommandGate::free()
 {
        if (workLoop) {
-               setWorkLoop(0);
+               setWorkLoop(NULL);
        }
        super::free();
 }
@@ -146,7 +146,7 @@ IOCommandGate::setWorkLoop(IOWorkLoop *inWorkLoop)
                *sleepersP &= ~kSleepersWaitEnabled;
                defer = (0 != (kSleepersActionsMask & *sleepersP));
                if (!defer) {
-                       super::setWorkLoop(0);
+                       super::setWorkLoop(NULL);
                        *sleepersP &= ~kSleepersRemoved;
                }
                wl->openGate();
@@ -180,9 +180,9 @@ IOCommandGateActionToBlock(OSObject *owner,
 }
 
 IOReturn
-IOCommandGate::runActionBlock(ActionBlock action)
+IOCommandGate::runActionBlock(ActionBlock _action)
 {
-       return runAction(&IOCommandGateActionToBlock, action);
+       return runAction(&IOCommandGateActionToBlock, _action);
 }
 
 IOReturn
@@ -250,7 +250,7 @@ IOCommandGate::runAction(Action inAction,
        if (kSleepersRemoved == ((kSleepersActionsMask | kSleepersRemoved) & *sleepersP)) {
                // no actions outstanding
                *sleepersP &= ~kSleepersRemoved;
-               super::setWorkLoop(0);
+               super::setWorkLoop(NULL);
        }
 
        wl->openGate();
index d61f37fc3a7171fd7116b5383dbdbfa68eeadfc0..550d9aac526d8375c7b08e15a6ae256b3dff7dd2 100644
@@ -62,7 +62,7 @@ withWorkLoop(IOWorkLoop *inWorkLoop)
 
        if (me && !me->initWithWorkLoop(inWorkLoop)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -106,7 +106,7 @@ commandPool(IOService * inOwner, IOWorkLoop *inWorkLoop, UInt32 inSize)
 
        if (me && !me->init(inOwner, inWorkLoop, inSize)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -135,7 +135,7 @@ IOCommandPool::free(void)
                }
 
                fSerializer->release();
-               fSerializer = 0;
+               fSerializer = NULL;
        }
 
        // Tell our superclass to cleanup too
@@ -153,7 +153,7 @@ IOCommand *
 IOCommandPool::getCommand(bool blockForCommand)
 {
        IOReturn     result  = kIOReturnSuccess;
-       IOCommand *command = 0;
+       IOCommand *command = NULL;
 
        IOCommandGate::Action func = OSMemberFunctionCast(
                IOCommandGate::Action, this, &IOCommandPool::gatedGetCommand);
@@ -162,7 +162,7 @@ IOCommandPool::getCommand(bool blockForCommand)
        if (kIOReturnSuccess == result) {
                return command;
        } else {
-               return 0;
+               return NULL;
        }
 }
 
index 08ba843c9f5477c57eb5bf4707416504da217d2b..2623d063d8546aa2305119d73e3f5e242f770971 100644
@@ -120,7 +120,7 @@ IOCommandQueue::commandQueue(OSObject *inOwner,
 
        if (me && !me->init(inOwner, inAction, inSize)) {
                me->free();
-               return 0;
+               return NULL;
        }
 
        return me;
index 655a150a4cacebec06088c9ca2820c501fef7b19..c24d8ce352267e05b0230485c103cc50aebf8ad0 100644
@@ -67,7 +67,7 @@ IOConditionLock::withCondition(int condition, bool intr)
 
        if (me && !me->initWithCondition(condition, intr)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
index b5383d481cc1609df6287d8cb437efb441db4b52..24047b5425cec507db8df7df6e795d0ad0a278e8 100644
@@ -109,7 +109,7 @@ IODMACommand::withRefCon(void * refCon)
 
        if (me && !me->initWithRefCon(refCon)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -127,7 +127,7 @@ IODMACommand::withSpecification(SegmentFunction  outSegFunc,
        if (me && !me->initWithSpecification(outSegFunc, segmentOptions, mappingOptions,
            mapper, refCon)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -150,7 +150,7 @@ IODMACommand::withSpecification(SegmentFunction outSegFunc,
            mappingOptions, maxTransferSize,
            alignment, mapper, refCon)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -244,7 +244,7 @@ IODMACommand::setSpecification(SegmentFunction        outSegFunc,
     uint32_t               mappingOptions,
     IOMapper             * mapper)
 {
-       IOService * device = 0;
+       IOService * device = NULL;
        UInt8       numAddressBits;
        UInt64      maxSegmentSize;
        UInt64      maxTransferSize;
@@ -284,7 +284,7 @@ IODMACommand::setSpecification(SegmentFunction        outSegFunc,
        }
        if (mapper && !OSDynamicCast(IOMapper, mapper)) {
                device = mapper;
-               mapper = 0;
+               mapper = NULL;
        }
        if (!mapper && (kUnmapped != MAPTYPE(mappingOptions))) {
                IOMapper::checkForSystemMapper();
@@ -434,7 +434,7 @@ IODMACommand::clearMemoryDescriptor(bool autoComplete)
                        fMemory->dmaCommandOperation(kIOMDSetDMAInactive, this, 0);
                }
                fMemory->release();
-               fMemory = 0;
+               fMemory = NULL;
        }
 
        return kIOReturnSuccess;
@@ -603,10 +603,10 @@ IODMACommand::walkAll(UInt8 op)
                state->fDoubleBuffer   = false;
                state->fPrepared       = false;
                state->fCopyNext       = NULL;
-               state->fCopyPageAlloc  = 0;
+               state->fCopyPageAlloc  = NULL;
                state->fCopyPageCount  = 0;
                state->fNextRemapPage  = NULL;
-               state->fCopyMD         = 0;
+               state->fCopyMD         = NULL;
 
                if (!(kWalkDoubleBuffer & op)) {
                        offset = 0;
@@ -703,12 +703,12 @@ IODMACommand::walkAll(UInt8 op)
        if (kWalkComplete & op) {
                if (state->fCopyPageAlloc) {
                        vm_page_free_list(state->fCopyPageAlloc, FALSE);
-                       state->fCopyPageAlloc = 0;
+                       state->fCopyPageAlloc = NULL;
                        state->fCopyPageCount = 0;
                }
                if (state->fCopyMD) {
                        state->fCopyMD->release();
-                       state->fCopyMD = 0;
+                       state->fCopyMD = NULL;
                }
 
                state->fPrepared = false;
@@ -833,10 +833,10 @@ IODMACommand::prepare(UInt64 offset, UInt64 length, bool flushCache, bool synchr
                state->fDoubleBuffer   = false;
                state->fPrepared       = false;
                state->fCopyNext       = NULL;
-               state->fCopyPageAlloc  = 0;
+               state->fCopyPageAlloc  = NULL;
                state->fCopyPageCount  = 0;
                state->fNextRemapPage  = NULL;
-               state->fCopyMD         = 0;
+               state->fCopyMD         = NULL;
                state->fLocalMapperAlloc       = 0;
                state->fLocalMapperAllocValid  = false;
                state->fLocalMapperAllocLength = 0;
index 4ce1edea3fa6b9dc8ed417631d5e387597f4e24b..8650d618996f2fa6fc30608b665610a49238937d 100644
@@ -52,7 +52,7 @@ IODMAController::getController(IOService *provider, UInt32 dmaIndex)
 
        // Find the name of the parent dma controller
        dmaParentData = OSDynamicCast(OSData, provider->getProperty("dma-parent"));
-       if (dmaParentData == 0) {
+       if (dmaParentData == NULL) {
                return NULL;
        }
 
@@ -64,7 +64,7 @@ IODMAController::getController(IOService *provider, UInt32 dmaIndex)
                }
                dmaParentName = createControllerName(*(UInt32 *)dmaParentData->getBytesNoCopy(dmaIndex * sizeof(UInt32), sizeof(UInt32)));
        }
-       if (dmaParentName == 0) {
+       if (dmaParentName == NULL) {
                return NULL;
        }
 
index af624aeaa2dc8b51d4b57ff8ab6392ce93dff90e..dd4d186f0917827538b87f1ee39dc79b84fd8a90 100644
@@ -48,7 +48,7 @@ IODMAEventSource::init(OSObject *inOwner,
                return false;
        }
 
-       if (inProvider == 0) {
+       if (inProvider == NULL) {
                return false;
        }
 
@@ -58,7 +58,7 @@ IODMAEventSource::init(OSObject *inOwner,
        dmaNotificationAction = inNotification;
 
        dmaController = IODMAController::getController(dmaProvider, inDMAIndex);
-       if (dmaController == 0) {
+       if (dmaController == NULL) {
                return false;
        }
        dmaController->retain();
@@ -94,7 +94,7 @@ IODMAEventSource::dmaEventSource(OSObject *inOwner,
 
        if (dmaES && !dmaES->init(inOwner, inProvider, inCompletion, inNotification, inDMAIndex)) {
                dmaES->release();
-               return 0;
+               return NULL;
        }
 
        return dmaES;
@@ -105,7 +105,7 @@ IODMAEventSource::startDMACommand(IODMACommand *dmaCommand, IODirection directio
 {
        IOReturn result;
 
-       if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) {
+       if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) {
                return kIOReturnError;
        }
 
@@ -113,7 +113,7 @@ IODMAEventSource::startDMACommand(IODMACommand *dmaCommand, IODirection directio
                return kIOReturnBusy;
        }
 
-       if (dmaCompletionAction == 0) {
+       if (dmaCompletionAction == NULL) {
                dmaSynchBusy = true;
        }
 
@@ -134,7 +134,7 @@ IODMAEventSource::startDMACommand(IODMACommand *dmaCommand, IODirection directio
 IOReturn
 IODMAEventSource::stopDMACommand(bool flush, uint64_t timeout)
 {
-       if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) {
+       if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) {
                return kIOReturnError;
        }
 
@@ -145,7 +145,7 @@ IODMAEventSource::stopDMACommand(bool flush, uint64_t timeout)
 IOReturn
 IODMAEventSource::queryDMACommand(IODMACommand **dmaCommand, IOByteCount *transferCount, bool waitForIdle)
 {
-       if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) {
+       if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) {
                return kIOReturnError;
        }
 
@@ -156,7 +156,7 @@ IODMAEventSource::queryDMACommand(IODMACommand **dmaCommand, IOByteCount *transf
 IOByteCount
 IODMAEventSource::getFIFODepth(IODirection direction)
 {
-       if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) {
+       if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) {
                return 0;
        }
 
@@ -167,7 +167,7 @@ IODMAEventSource::getFIFODepth(IODirection direction)
 IOReturn
 IODMAEventSource::setFIFODepth(IOByteCount depth)
 {
-       if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) {
+       if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) {
                return kIOReturnError;
        }
 
@@ -178,7 +178,7 @@ IODMAEventSource::setFIFODepth(IOByteCount depth)
 IOByteCount
 IODMAEventSource::validFIFODepth(IOByteCount depth, IODirection direction)
 {
-       if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) {
+       if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) {
                return kIOReturnError;
        }
 
@@ -189,7 +189,7 @@ IODMAEventSource::validFIFODepth(IOByteCount depth, IODirection direction)
 IOReturn
 IODMAEventSource::setFrameSize(UInt8 byteCount)
 {
-       if ((dmaController == 0) || (dmaIndex == 0xFFFFFFFF)) {
+       if ((dmaController == NULL) || (dmaIndex == 0xFFFFFFFF)) {
                return kIOReturnError;
        }
 
@@ -224,7 +224,7 @@ IODMAEventSource::checkForWork(void)
 void
 IODMAEventSource::completeDMACommand(IODMACommand *dmaCommand)
 {
-       if (dmaCompletionAction != 0) {
+       if (dmaCompletionAction != NULL) {
                IOSimpleLockLock(dmaCommandsCompletedLock);
                queue_enter(&dmaCommandsCompleted, dmaCommand, IODMACommand *, fCommandChain);
                IOSimpleLockUnlock(dmaCommandsCompletedLock);
@@ -243,7 +243,7 @@ IODMAEventSource::notifyDMACommand(IODMACommand *dmaCommand, IOReturn status, IO
        dmaCommand->reserved->fActualByteCount = actualByteCount;
        dmaCommand->reserved->fTimeStamp = timeStamp;
 
-       if (dmaNotificationAction != 0) {
+       if (dmaNotificationAction != NULL) {
                (*dmaNotificationAction)(owner, this, dmaCommand, status, actualByteCount, timeStamp);
        }
 }
index 93bd0c2684aadfe361edafb97f5881b0d66aa6f9..dde414b6ac41c8029de65f0e0d1d1624d1255138 100644
@@ -61,7 +61,7 @@ IODataQueue *IODataQueue::withCapacity(UInt32 size)
        if (dataQueue) {
                if (!dataQueue->initWithCapacity(size)) {
                        dataQueue->release();
-                       dataQueue = 0;
+                       dataQueue = NULL;
                }
        }
 
@@ -76,7 +76,7 @@ IODataQueue::withEntries(UInt32 numEntries, UInt32 entrySize)
        if (dataQueue) {
                if (!dataQueue->initWithEntries(numEntries, entrySize)) {
                        dataQueue->release();
-                       dataQueue = 0;
+                       dataQueue = NULL;
                }
        }
 
@@ -111,7 +111,7 @@ IODataQueue::initWithCapacity(UInt32 size)
        ((IODataQueueInternal *)notifyMsg)->queueSize = size;
 
        dataQueue = (IODataQueueMemory *)IOMallocAligned(allocSize, PAGE_SIZE);
-       if (dataQueue == 0) {
+       if (dataQueue == NULL) {
                return false;
        }
        bzero(dataQueue, allocSize);
@@ -190,7 +190,7 @@ IODataQueue::enqueue(void * data, UInt32 dataSize)
                        entry = (IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail);
 
                        entry->size = dataSize;
-                       memcpy(&entry->data, data, dataSize);
+                       __nochk_memcpy(&entry->data, data, dataSize);
 
                        // The tail can be out of bound when the size of the new entry
                        // exactly matches the available space at the end of the queue.
@@ -211,7 +211,7 @@ IODataQueue::enqueue(void * data, UInt32 dataSize)
                                ((IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail))->size = dataSize;
                        }
 
-                       memcpy(&dataQueue->queue->data, data, dataSize);
+                       __nochk_memcpy(&dataQueue->queue->data, data, dataSize);
                        newTail = entrySize;
                } else {
                        return false; // queue is full
@@ -224,7 +224,7 @@ IODataQueue::enqueue(void * data, UInt32 dataSize)
                        entry = (IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail);
 
                        entry->size = dataSize;
-                       memcpy(&entry->data, data, dataSize);
+                       __nochk_memcpy(&entry->data, data, dataSize);
                        newTail = tail + entrySize;
                } else {
                        return false; // queue is full
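
In the three enqueue paths above, memcpy() becomes __nochk_memcpy(): the copy targets the flexible entry->data member with a caller-supplied length, which the fortified memcpy cannot size-prove, and that length has already been validated against the queue's free space. Sketch of the pairing, with the guard shown only schematically:

    // length checked against the available queue space first (see enqueue() above) ...
    __nochk_memcpy(&entry->data, data, dataSize); // ... then the unchecked copy is safe
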
@@ -291,11 +291,11 @@ IODataQueue::sendDataAvailableNotification()
 IOMemoryDescriptor *
 IODataQueue::getMemoryDescriptor()
 {
-       IOMemoryDescriptor *descriptor = 0;
+       IOMemoryDescriptor *descriptor = NULL;
        UInt32              queueSize;
 
        queueSize = ((IODataQueueInternal *) notifyMsg)->queueSize;
-       if (dataQueue != 0) {
+       if (dataQueue != NULL) {
                descriptor = IOMemoryDescriptor::withAddress(dataQueue, queueSize + DATA_QUEUE_MEMORY_HEADER_SIZE, kIODirectionOutIn);
        }
 
index ee178c845b69e0067333c2d197b79d56474cc08b..ecc175f08322b207df02657ecec7919fc218b8d9 100644
@@ -62,8 +62,8 @@ IODeviceMemory::arrayFromList(
        IOItemCount         i;
 
        array = OSArray::withCapacity( count );
-       if (0 == array) {
-               return 0;
+       if (NULL == array) {
+               return NULL;
        }
 
        for (i = 0; i < count; i++) {
@@ -74,7 +74,7 @@ IODeviceMemory::arrayFromList(
                        range->release();
                } else {
                        array->release();
-                       array = 0;
+                       array = NULL;
                        break;
                }
        }
index 6d49c4feefcf5f4b606669121a2e53099433f51f..f219e2f1d96edad8f98ee035408e30777031684d 100644
@@ -154,7 +154,7 @@ IODeviceTreeAlloc( void * dtTop )
            && gIODTInterruptCellKey
            );
 
-       freeDT = (kSuccess == DTLookupEntry( 0, "/chosen/memory-map", &mapEntry ))
+       freeDT = (kSuccess == DTLookupEntry( NULL, "/chosen/memory-map", &mapEntry ))
            && (kSuccess == DTGetProperty( mapEntry,
            "DeviceTree", (void **) &dtMap, &propSize ))
            && ((2 * sizeof(uint32_t)) == propSize);
@@ -202,7 +202,7 @@ IODeviceTreeAlloc( void * dtTop )
 
        if (freeDT) {
                // free original device tree
-               DTInit(0);
+               DTInit(NULL);
                IODTFreeLoaderInfo( "DeviceTree",
                    (void *)dtMap[0], (int) round_page(dtMap[1]));
        }
@@ -221,6 +221,14 @@ IODeviceTreeAlloc( void * dtTop )
                        if (!intMap && child->getProperty( gIODTInterruptParentKey)) {
                                intMap = true;
                        }
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+                       if (!strcmp("sep", child->getName())
+                           || !strcmp("aop", child->getName())
+                           || !strcmp("disp0", child->getName())) {
+                               uint32_t aotFlags = 1;
+                               child->setProperty("aot-power", &aotFlags, sizeof(aotFlags));
+                       }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
                }
                regIter->release();
        }
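
The new block tags the sep, aop and disp0 device-tree nubs with a four-byte aot-power property (compiled out under the RC_HIDE_* flags). A read-back sketch for a consumer; this commit only sets the property, so the consumer and the flag's semantics are assumptions:

    OSData * aot = OSDynamicCast(OSData, child->getProperty("aot-power"));
    if (aot && (aot->getLength() == sizeof(uint32_t))) {
        uint32_t aotFlags = *(const uint32_t *) aot->getBytesNoCopy();
        (void) aotFlags; // nonzero value written by IODeviceTreeAlloc() above
    }
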
@@ -268,12 +276,12 @@ IODTGetLoaderInfo( const char *key, void **infoAddr, int *infoSize )
        int ret = -1;
 
        chosen = IORegistryEntry::fromPath( "/chosen/memory-map", gIODTPlane );
-       if (chosen == 0) {
+       if (chosen == NULL) {
                return -1;
        }
 
        propObj = OSDynamicCast( OSData, chosen->getProperty(key));
-       if (propObj == 0) {
+       if (propObj == NULL) {
                goto cleanup;
        }
 
@@ -283,7 +291,7 @@ IODTGetLoaderInfo( const char *key, void **infoAddr, int *infoSize )
        }
 
        propPtr = (dtptr_t *)propObj->getBytesNoCopy();
-       if (propPtr == 0) {
+       if (propPtr == NULL) {
                goto cleanup;
        }
 
@@ -308,9 +316,9 @@ IODTFreeLoaderInfo( const char *key, void *infoAddr, int infoSize )
        range[1] = (vm_offset_t)infoSize;
        FreePhysicalMemory( range );
 
-       if (key != 0) {
+       if (key != NULL) {
                chosen = IORegistryEntry::fromPath( "/chosen/memory-map", gIODTPlane );
-               if (chosen != 0) {
+               if (chosen != NULL) {
                        chosen->removeProperty(key);
                        chosen->release();
                }
@@ -325,12 +333,12 @@ IODTGetDefault(const char *key, void *infoAddr, unsigned int infoSize )
        unsigned int                defaultSize;
 
        defaults = IORegistryEntry::fromPath( "/defaults", gIODTPlane );
-       if (defaults == 0) {
+       if (defaults == NULL) {
                return -1;
        }
 
        defaultObj = OSDynamicCast( OSData, defaults->getProperty(key));
-       if (defaultObj == 0) {
+       if (defaultObj == NULL) {
                return -1;
        }
 
@@ -375,7 +383,7 @@ MakeReferenceTable( DTEntry dtEntry, bool copy )
 
        if (regEntry && (false == regEntry->init())) {
                regEntry->release();
-               regEntry = 0;
+               regEntry = NULL;
        }
 
        if (regEntry &&
@@ -451,7 +459,7 @@ static IORegistryEntry *
 FindPHandle( UInt32 phandle )
 {
        OSData                      *data;
-       IORegistryEntry *regEntry = 0;
+       IORegistryEntry *regEntry = NULL;
        int                         i;
 
        for (i = 0; (data = (OSData *)gIODTPHandles->getObject( i )); i++) {
@@ -501,10 +509,10 @@ IODTFindInterruptParent( IORegistryEntry * regEntry, IOItemCount index )
                }
                phandle = ((UInt32 *) data->getBytesNoCopy())[index];
                parent = FindPHandle( phandle );
-       } else if (0 == regEntry->getProperty( "interrupt-controller")) {
+       } else if (NULL == regEntry->getProperty( "interrupt-controller")) {
                parent = regEntry->getParentEntry( gIODTPlane);
        } else {
-               parent = 0;
+               parent = NULL;
        }
 
        return parent;
@@ -525,7 +533,7 @@ IODTInterruptControllerName( IORegistryEntry * regEntry )
                snprintf(buf, sizeof(buf), "IOInterruptController%08X", (uint32_t)phandle);
                sym = OSSymbol::withCString( buf );
        } else {
-               sym = 0;
+               sym = NULL;
        }
 
        return sym;
@@ -547,9 +555,10 @@ IODTGetICellCounts( IORegistryEntry * regEntry,
 
 static UInt32
 IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index,
-    OSData ** spec, const OSSymbol ** controller )
+    LIBKERN_RETURNS_RETAINED OSData ** spec,
+    LIBKERN_RETURNS_RETAINED const OSSymbol ** controller )
 {
-       IORegistryEntry *parent = 0;
+       IORegistryEntry *parent = NULL;
        OSData                      *data;
        UInt32                      *addrCmp;
        UInt32                      *maskCmp;
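
The LIBKERN_RETURNS_RETAINED annotations tell the libkern static analyzer, and readers, that *spec and *controller come back holding a +1 reference the caller owns. A caller-side sketch (the function is file-static, so this is illustrative; regEntry and intSpec are assumed in scope):

    OSData *         spec       = NULL;
    const OSSymbol * controller = NULL;
    (void) IODTMapOneInterrupt(regEntry, intSpec, 0, &spec, &controller);
    /* ... consume spec/controller ... */
    OSSafeReleaseNULL(spec);        // caller owns the retained out-params
    OSSafeReleaseNULL(controller);
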
@@ -561,7 +570,7 @@ IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index,
 
        parent = IODTFindInterruptParent( regEntry, index );
        IODTGetICellCounts( parent, &icells, &acells );
-       addrCmp = 0;
+       addrCmp = NULL;
        if (acells) {
                data = OSDynamicCast( OSData, regEntry->getProperty( "reg" ));
                if (data && (data->getLength() >= (acells * sizeof(UInt32)))) {
@@ -588,7 +597,7 @@ IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index,
                if (parent && (data = OSDynamicCast( OSData,
                    regEntry->getProperty( "interrupt-controller")))) {
                        // found a controller - don't want to follow cascaded controllers
-                       parent = 0;
+                       parent = NULL;
                        *spec = OSData::withBytesNoCopy((void *) intSpec,
                            icells * sizeof(UInt32));
                        *controller = IODTInterruptControllerName( regEntry );
@@ -602,7 +611,7 @@ IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index,
                        if (data && (data->getLength() >= ((acells + icells) * sizeof(UInt32)))) {
                                maskCmp = (UInt32 *) data->getBytesNoCopy();
                        } else {
-                               maskCmp = 0;
+                               maskCmp = NULL;
                        }
 
 #if IODTSUPPORTDEBUG
@@ -647,7 +656,7 @@ IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index,
                                }
 
                                map += acells + icells;
-                               if (0 == (parent = FindPHandle( *(map++)))) {
+                               if (NULL == (parent = FindPHandle( *(map++)))) {
                                        unexpected(break);
                                }
 
@@ -661,7 +670,7 @@ IODTMapOneInterrupt( IORegistryEntry * regEntry, UInt32 * intSpec, UInt32 index,
                                }
                        } while (!cmp && (map < endMap));
                        if (!cmp) {
-                               parent = 0;
+                               parent = NULL;
                        }
                }
 
@@ -729,14 +738,14 @@ IODTMapInterruptsSharing( IORegistryEntry * regEntry, OSDictionary * allInts )
        OSObject *          oneMap;
        OSArray *           mapped;
        OSArray *           controllerInts;
-       const OSSymbol *    controller = 0;
+       const OSSymbol *    controller = NULL;
        OSArray *           controllers;
        UInt32              skip = 1;
        bool                ok, nw;
 
-       nw = (0 == (local = OSDynamicCast( OSData,
+       nw = (NULL == (local = OSDynamicCast( OSData,
            regEntry->getProperty( gIODTAAPLInterruptsKey))));
-       if (nw && (0 == (local = OSDynamicCast( OSData,
+       if (nw && (NULL == (local = OSDynamicCast( OSData,
            regEntry->getProperty( "interrupts"))))) {
                return true;  // nothing to see here
        }
@@ -834,7 +843,7 @@ IODTMapInterruptsSharing( IORegistryEntry * regEntry, OSDictionary * allInts )
 bool
 IODTMapInterrupts( IORegistryEntry * regEntry )
 {
-       return IODTMapInterruptsSharing( regEntry, 0 );
+       return IODTMapInterruptsSharing( regEntry, NULL );
 }
 
 /*
@@ -843,7 +852,7 @@ IODTMapInterrupts( IORegistryEntry * regEntry )
 static bool
 CompareKey( OSString * key,
     const IORegistryEntry * table, const OSSymbol * propName,
-    OSString ** matchingName )
+    LIBKERN_RETURNS_RETAINED OSString ** matchingName )
 {
        OSObject            *prop;
        OSData                      *data;
@@ -855,9 +864,9 @@ CompareKey( OSString * key,
        const char          *lastName;
        bool                        wild;
        bool                        matched;
-       const char          *result = 0;
+       const char          *result = NULL;
 
-       if (0 == (prop = table->copyProperty( propName ))) {
+       if (NULL == (prop = table->copyProperty( propName ))) {
                return 0;
        }
 
@@ -868,7 +877,7 @@ CompareKey( OSString * key,
                names = string->getCStringNoCopy();
                lastName = names + string->getLength() + 1;
        } else {
-               names = 0;
+               names = NULL;
        }
 
        if (names) {
@@ -901,7 +910,7 @@ CompareKey( OSString * key,
                prop->release();
        }
 
-       return result != 0;
+       return result != NULL;
 }
 
 
@@ -926,7 +935,7 @@ IODTMatchNubWithKeys( IORegistryEntry * regEntry,
        OSObject    *obj;
        bool                result = false;
 
-       obj = OSUnserialize( keys, 0 );
+       obj = OSUnserialize( keys, NULL );
 
        if (obj) {
                result = regEntry->compareNames( obj );
@@ -945,7 +954,7 @@ OSCollectionIterator *
 IODTFindMatchingEntries( IORegistryEntry * from,
     IOOptionBits options, const char * keys )
 {
-       OSSet                                       *result = 0;
+       OSSet                                       *result = NULL;
        IORegistryEntry                     *next;
        IORegistryIterator          *iter;
        OSCollectionIterator        *cIter;
@@ -1160,7 +1169,7 @@ IODTResolveAddressCell( IORegistryEntry * startEntry,
 
        do{
                prop = OSDynamicCast( OSData, regEntry->getProperty( gIODTRangeKey ));
-               if (0 == prop) {
+               if (NULL == prop) {
                        /* end of the road */
                        *phys = CellsValue( childAddressCells, cell );
                        *phys += offset;
@@ -1291,11 +1300,11 @@ IODTResolveAddressing( IORegistryEntry * regEntry,
        OSArray                             *array;
        IODeviceMemory              *range;
 
-       array = 0;
+       array = NULL;
        do{
                parentEntry = regEntry->copyParentEntry( gIODTPlane );
                addressProperty = (OSData *) regEntry->getProperty( addressPropertyName );
-               if ((0 == addressProperty) || (0 == parentEntry)) {
+               if ((NULL == addressProperty) || (NULL == parentEntry)) {
                        break;
                }
 
@@ -1309,18 +1318,18 @@ IODTResolveAddressing( IORegistryEntry * regEntry,
                num = addressProperty->getLength() / (4 * cells);
 
                array = OSArray::withCapacity( 1 );
-               if (0 == array) {
+               if (NULL == array) {
                        break;
                }
 
                for (i = 0; i < num; i++) {
                        if (IODTResolveAddressCell( parentEntry, reg, &phys, &len )) {
-                               range = 0;
+                               range = NULL;
                                if (parent) {
                                        range = IODeviceMemory::withSubRange( parent,
-                                           phys - parent->getPhysicalSegment(0, 0, kIOMemoryMapperNone), len );
+                                           phys - parent->getPhysicalSegment(0, NULL, kIOMemoryMapperNone), len );
                                }
-                               if (0 == range) {
+                               if (NULL == range) {
                                        range = IODeviceMemory::withRange( phys, len );
                                }
                                if (range) {
@@ -1344,7 +1353,7 @@ IODTFindSlotName( IORegistryEntry * regEntry, UInt32 deviceNumber )
 {
        IORegistryEntry             *parent;
        OSData                              *data;
-       OSData                              *ret = 0;
+       OSData                              *ret = NULL;
        UInt32                              *bits;
        UInt32                              i;
        size_t              nlen;
index 33306ae6c73eef760e025329a3e14df3bbf6546d..3415fd34a3e18e208d62decc3d9fdce1ec5cd372 100644
@@ -293,5 +293,5 @@ IOEventSource::getWorkLoop() const
 bool
 IOEventSource::onThread() const
 {
-       return (workLoop != 0) && workLoop->onThread();
+       return (workLoop != NULL) && workLoop->onThread();
 }
index e3b9803cf3b813343fadfae0401b7e032a681a2c..5e3371a10445859e58d5e38b5a1cbc81dfb53abb 100644
@@ -84,7 +84,7 @@ IOFilterInterruptEventSource::interruptEventSource(OSObject *inOwner,
     IOService *inProvider,
     int inIntIndex)
 {
-       return 0;
+       return NULL;
 }
 
 bool
@@ -122,7 +122,7 @@ IOFilterInterruptEventSource
        if (me
            && !me->init(inOwner, inAction, inFilterAction, inProvider, inIntIndex)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -141,14 +141,15 @@ IOFilterInterruptEventSource
 
        FilterBlock filter = Block_copy(inFilterAction);
        if (!filter) {
-               return 0;
+               OSSafeReleaseNULL(me);
+               return NULL;
        }
 
        if (me
            && !me->init(inOwner, (Action) NULL, (Filter) filter, inProvider, inIntIndex)) {
                me->release();
                Block_release(filter);
-               return 0;
+               return NULL;
        }
        me->flags |= kFilterBlock;
        me->setActionBlock((IOEventSource::ActionBlock) inAction);
@@ -220,9 +221,13 @@ IOFilterInterruptEventSource::normalInterruptOccurred
        }
 
        if (IOInterruptEventSource::reserved->statistics) {
-               if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelTimeIndex)) {
+               if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelTimeIndex)
+                   || IOInterruptEventSource::reserved->statistics->enablePrimaryTimestamp) {
                        startTime = mach_absolute_time();
                }
+               if (IOInterruptEventSource::reserved->statistics->enablePrimaryTimestamp) {
+                       IOInterruptEventSource::reserved->statistics->primaryTimestamp = startTime;
+               }
        }
 
        // Call the filter.
@@ -269,9 +274,13 @@ IOFilterInterruptEventSource::disableInterruptOccurred
        }
 
        if (IOInterruptEventSource::reserved->statistics) {
-               if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelTimeIndex)) {
+               if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelTimeIndex)
+                   || IOInterruptEventSource::reserved->statistics->enablePrimaryTimestamp) {
                        startTime = mach_absolute_time();
                }
+               if (IOInterruptEventSource::reserved->statistics->enablePrimaryTimestamp) {
+                       IOInterruptEventSource::reserved->statistics->primaryTimestamp = startTime;
+               }
        }
 
        // Call the filter.
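
Both the normal and the disable filter paths now take a timestamp when either first-level time accounting or the new enablePrimaryTimestamp flag is set, and publish it as statistics->primaryTimestamp. A consumer-side sketch of what the recorded value enables; the field is internal to the event source, so this is illustrative only:

    uint64_t now = mach_absolute_time();
    uint64_t firstLevelDelay = now - reserved->statistics->primaryTimestamp; // mach ticks since the primary interrupt fired
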
index be2483dbfc2309be0084c94ef16d665bd4804f62..3b696609a311b7cbc26f6efe96a33a40f2eb6e9b 100644
@@ -494,11 +494,13 @@ IOHibernateSystemSleep(void)
                    &vars->page_list_wired,
                    &vars->page_list_pal);
                if (KERN_SUCCESS != err) {
+                       HIBLOG("%s err, hibernate_alloc_page_lists return 0x%x\n", __FUNCTION__, err);
                        break;
                }
 
                err = hibernate_pin_swap(TRUE);
                if (KERN_SUCCESS != err) {
+                       HIBLOG("%s error, hibernate_pin_swap return 0x%x\n", __FUNCTION__, err);
                        break;
                }
                swapPinned = true;
@@ -702,7 +704,7 @@ IOHibernateSystemSleep(void)
                                }
                                // set BootNext
                                if (!gIOHibernateBoot0082Data) {
-                                       OSData * fileData = 0;
+                                       OSData * fileData = NULL;
                                        data = OSDynamicCast(OSData, gIOChosenEntry->getProperty("boot-device-path"));
                                        if (data && data->getLength() >= 4) {
                                                fileData = OSDynamicCast(OSData, gIOChosenEntry->getProperty("boot-file-path"));
@@ -1018,7 +1020,7 @@ IOHibernateSystemHasSlept(void)
 {
        IOReturn          ret = kIOReturnSuccess;
        IOHibernateVars * vars  = &gIOHibernateVars;
-       OSObject        * obj = 0;
+       OSObject        * obj = NULL;
        OSData          * data;
 
        IOLockLock(gFSLock);
@@ -1041,7 +1043,7 @@ IOHibernateSystemHasSlept(void)
        vars->consoleMapping = NULL;
        if (vars->previewBuffer && (kIOReturnSuccess != vars->previewBuffer->prepare())) {
                vars->previewBuffer->release();
-               vars->previewBuffer = 0;
+               vars->previewBuffer = NULL;
        }
 
        if ((kIOHibernateOptionProgress & gIOHibernateCurrentHeader->options)
@@ -1148,7 +1150,7 @@ IOHibernateDone(IOHibernateVars * vars)
 
        if (vars->previewBuffer) {
                vars->previewBuffer->release();
-               vars->previewBuffer = 0;
+               vars->previewBuffer = NULL;
        }
 
        if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) {
@@ -1300,7 +1302,7 @@ IOReturn
 IOHibernateSystemPostWake(bool now)
 {
        gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature;
-       IOSetBootImageNVRAM(0);
+       IOSetBootImageNVRAM(NULL);
 
        IOLockLock(gFSLock);
        if (kFSTrimDelay == gFSState) {
@@ -1486,7 +1488,7 @@ hibernate_write_image(void)
        uint32_t     zvPageCount;
 
        IOPolledFileCryptVars _cryptvars;
-       IOPolledFileCryptVars * cryptvars = 0;
+       IOPolledFileCryptVars * cryptvars = NULL;
 
        wiredPagesEncrypted = 0;
        dirtyPagesEncrypted = 0;
@@ -1658,7 +1660,7 @@ hibernate_write_image(void)
                        }
                }
                err = IOHibernatePolledFileWrite(vars->fileVars,
-                   (uint8_t *) 0,
+                   (uint8_t *) NULL,
                    &gIOHibernateRestoreStackEnd[0] - &gIOHibernateRestoreStack[0],
                    cryptvars);
                if (kIOReturnSuccess != err) {
@@ -1967,7 +1969,7 @@ hibernate_write_image(void)
 
                        if (kWiredEncrypt != pageType) {
                                // end of image1/2 - fill to next block
-                               err = IOHibernatePolledFileWrite(vars->fileVars, 0, 0, cryptvars);
+                               err = IOHibernatePolledFileWrite(vars->fileVars, NULL, 0, cryptvars);
                                if (kIOReturnSuccess != err) {
                                        break;
                                }
@@ -2029,7 +2031,7 @@ hibernate_write_image(void)
                if (kIOReturnSuccess != err) {
                        break;
                }
-               err = IOHibernatePolledFileWrite(vars->fileVars, 0, 0, cryptvars);
+               err = IOHibernatePolledFileWrite(vars->fileVars, NULL, 0, cryptvars);
        }while (false);
 
        clock_get_uptime(&endTime);
@@ -2112,7 +2114,7 @@ hibernate_machine_init(void)
        uint64_t     compBytes;
        uint32_t     lastProgressStamp = 0;
        uint32_t     progressStamp;
-       IOPolledFileCryptVars * cryptvars = 0;
+       IOPolledFileCryptVars * cryptvars = NULL;
 
        IOHibernateVars * vars  = &gIOHibernateVars;
        bzero(gIOHibernateStats, sizeof(hibernate_statistics_t));
@@ -2174,7 +2176,7 @@ hibernate_machine_init(void)
                hibernate_page_list_discard(vars->page_list);
        }
 
-       cryptvars = (kIOHibernateModeEncrypt & gIOHibernateMode) ? &gIOHibernateCryptWakeContext : 0;
+       cryptvars = (kIOHibernateModeEncrypt & gIOHibernateMode) ? &gIOHibernateCryptWakeContext : NULL;
 
        if (gIOHibernateCurrentHeader->handoffPageCount > gIOHibernateHandoffPageCount) {
                panic("handoff overflow");
@@ -2300,7 +2302,7 @@ hibernate_machine_init(void)
                if (kIOReturnSuccess != err) {
                        panic("IOPolledFilePollersSetEncryptionKey(0x%x)", err);
                }
-               cryptvars = 0;
+               cryptvars = NULL;
        }
 
        IOPolledFileSeek(vars->fileVars, gIOHibernateCurrentHeader->image1Size);
@@ -2314,7 +2316,7 @@ hibernate_machine_init(void)
        vars->fileVars->cryptBytes   = 0;
        AbsoluteTime_to_scalar(&vars->fileVars->cryptTime) = 0;
 
-       err = IOPolledFileRead(vars->fileVars, 0, 0, cryptvars);
+       err = IOPolledFileRead(vars->fileVars, NULL, 0, cryptvars);
        vars->fileVars->bufferOffset = vars->fileVars->bufferLimit;
        // --
 
index efbd6de16e9643e4fe4b9500c2427bada5163e13..b5e9176b43e9dc674ba11e3ef552d8b2de086ef7 100644
@@ -65,7 +65,7 @@ IOHistogramReporter::with(IOService *reportingService,
        OSSafeReleaseNULL(reporter);
        OSSafeReleaseNULL(tmpChannelName);
 
-       return 0;
+       return NULL;
 }
 
 
index d4ad771ff90826cbbe77281887f09927eca5d516..7a9f15093e9589ba9f1c5d3e6e929c0ea968c7c5 100644
@@ -48,7 +48,7 @@ IOInterleavedMemoryDescriptor * IOInterleavedMemoryDescriptor::withCapacity(
                    /* capacity  */ capacity,
                    /* direction */ direction )) {
                me->release();
-               me = 0;
+               me = NULL;
        }
 
        return me;
@@ -79,14 +79,14 @@ IOInterleavedMemoryDescriptor::initWithCapacity(
        _direction              = (IODirection) (_flags & kIOMemoryDirectionMask);
 #endif /* !__LP64__ */
        _length                 = 0;
-       _mappings               = 0;
+       _mappings               = NULL;
        _tag                    = 0;
        _descriptorCount        = 0;
        _descriptors            = IONew(IOMemoryDescriptor *, capacity);
        _descriptorOffsets      = IONew(IOByteCount, capacity);
        _descriptorLengths      = IONew(IOByteCount, capacity);
 
-       if ((_descriptors == 0) || (_descriptorOffsets == 0) || (_descriptorLengths == 0)) {
+       if ((_descriptors == NULL) || (_descriptorOffsets == NULL) || (_descriptorLengths == NULL)) {
                return false;
        }
 
@@ -106,7 +106,7 @@ IOInterleavedMemoryDescriptor::clearMemoryDescriptors( IODirection direction )
                }
 
                _descriptors[index]->release();
-               _descriptors[index] = 0;
+               _descriptors[index] = NULL;
 
                _descriptorOffsets[index] = 0;
                _descriptorLengths[index] = 0;
@@ -121,7 +121,7 @@ IOInterleavedMemoryDescriptor::clearMemoryDescriptors( IODirection direction )
 
        _descriptorCount = 0;
        _length = 0;
-       _mappings = 0;
+       _mappings = NULL;
        _tag = 0;
 };
 
@@ -166,15 +166,15 @@ IOInterleavedMemoryDescriptor::free()
                        _descriptors[index]->release();
                }
 
-               if (_descriptors != 0) {
+               if (_descriptors != NULL) {
                        IODelete(_descriptors, IOMemoryDescriptor *, _descriptorCapacity);
                }
 
-               if (_descriptorOffsets != 0) {
+               if (_descriptorOffsets != NULL) {
                        IODelete(_descriptorOffsets, IOMemoryDescriptor *, _descriptorCapacity);
                }
 
-               if (_descriptorLengths != 0) {
+               if (_descriptorLengths != NULL) {
                        IODelete(_descriptorLengths, IOMemoryDescriptor *, _descriptorCapacity);
                }
        }
index f84357e38d0bbcc2f5f93db8ce68d68d71be13c3..18441e5ce4f4ccbb73947d198704ebb45bd902e3 100644 (file)
@@ -104,10 +104,10 @@ IOInterruptController::registerInterrupt(IOService *nub, int source,
        // register as a shared interrupt.
        if (wasAlreadyRegisterd || shouldBeShared) {
                // If this vector is not already shared, break it out.
-               if (vector->sharedController == 0) {
+               if (vector->sharedController == NULL) {
                        // Make the IOShareInterruptController instance
                        vector->sharedController = new IOSharedInterruptController;
-                       if (vector->sharedController == 0) {
+                       if (vector->sharedController == NULL) {
                                IOLockUnlock(vector->interruptLock);
                                return kIOReturnNoMemory;
                        }
@@ -133,7 +133,7 @@ IOInterruptController::registerInterrupt(IOService *nub, int source,
                                        enableInterrupt(originalNub, originalSource);
                                }
                                vector->sharedController->release();
-                               vector->sharedController = 0;
+                               vector->sharedController = NULL;
                                IOLockUnlock(vector->interruptLock);
                                return error;
                        }
@@ -163,7 +163,7 @@ IOInterruptController::registerInterrupt(IOService *nub, int source,
                                        enableInterrupt(originalNub, originalSource);
 
                                        vector->sharedController->release();
-                                       vector->sharedController = 0;
+                                       vector->sharedController = NULL;
                                        IOLockUnlock(vector->interruptLock);
                                        return error;
                                }
@@ -174,7 +174,7 @@ IOInterruptController::registerInterrupt(IOService *nub, int source,
                        vector->nub     = vector->sharedController;
                        vector->source  = 0;
                        vector->target  = vector->sharedController;
-                       vector->refCon  = 0;
+                       vector->refCon  = NULL;
 
                        // If the interrupt was already registered,
                        // save the driver's interrupt enablement state.
@@ -259,11 +259,11 @@ IOInterruptController::unregisterInterrupt(IOService *nub, int source)
        vector->interruptDisabledSoft = 0;
        vector->interruptDisabledHard = 0;
        vector->interruptRegistered = 0;
-       vector->nub = 0;
+       vector->nub = NULL;
        vector->source = 0;
-       vector->handler = 0;
-       vector->target = 0;
-       vector->refCon = 0;
+       vector->handler = NULL;
+       vector->target = NULL;
+       vector->refCon = NULL;
 
        IOLockUnlock(vector->interruptLock);
        return kIOReturnSuccess;
@@ -278,7 +278,7 @@ IOInterruptController::getInterruptType(IOService *nub, int source,
        IOInterruptVector *vector;
        OSData            *vectorData;
 
-       if (interruptType == 0) {
+       if (interruptType == NULL) {
                return kIOReturnBadArgument;
        }
 
@@ -372,7 +372,7 @@ IOInterruptController::causeInterrupt(IOService *nub, int source)
 IOInterruptAction
 IOInterruptController::getInterruptHandlerAddress(void)
 {
-       return 0;
+       return NULL;
 }
 
 IOReturn
@@ -507,7 +507,7 @@ IOSharedInterruptController::initInterruptController(IOInterruptController *pare
 
        // Allocate the IOInterruptSource so this can act like a nub.
        _interruptSources = (IOInterruptSource *)IOMalloc(sizeof(IOInterruptSource));
-       if (_interruptSources == 0) {
+       if (_interruptSources == NULL) {
                return kIOReturnNoMemory;
        }
        _numInterruptSources = 1;
@@ -537,7 +537,7 @@ IOSharedInterruptController::initInterruptController(IOInterruptController *pare
 
        // Allocate the lock for the controller.
        controllerLock = IOSimpleLockAlloc();
-       if (controllerLock == 0) {
+       if (controllerLock == NULL) {
                return kIOReturnNoResources;
        }
 
@@ -571,7 +571,7 @@ IOSharedInterruptController::registerInterrupt(IOService *nub,
 {
        IOInterruptSource *interruptSources;
        IOInterruptVectorNumber vectorNumber;
-       IOInterruptVector *vector = 0;
+       IOInterruptVector *vector = NULL;
        OSData            *vectorData;
        IOInterruptState  interruptState;
 
@@ -607,7 +607,7 @@ IOSharedInterruptController::registerInterrupt(IOService *nub,
 
        // Create the vectorData for the IOInterruptSource.
        vectorData = OSData::withBytes(&vectorNumber, sizeof(vectorNumber));
-       if (vectorData == 0) {
+       if (vectorData == NULL) {
                IOLockUnlock(vector->interruptLock);
                return kIOReturnNoMemory;
        }
@@ -667,11 +667,11 @@ IOSharedInterruptController::unregisterInterrupt(IOService *nub,
                vector->interruptDisabledSoft = 0;
                vector->interruptDisabledHard = 0;
                vector->interruptRegistered = 0;
-               vector->nub = 0;
+               vector->nub = NULL;
                vector->source = 0;
-               vector->handler = 0;
-               vector->target = 0;
-               vector->refCon = 0;
+               vector->handler = NULL;
+               vector->target = NULL;
+               vector->refCon = NULL;
 
                interruptState = IOSimpleLockLockDisableInterrupt(controllerLock);
                vectorsRegistered--;
index 19d5d597d754de689c67150a5506f32eea344ab8..5decae5c62d9d3286e349539c80d97afcac7d50f 100644 (file)
@@ -218,7 +218,7 @@ IOInterruptEventSource::interruptEventSource(OSObject *inOwner,
 
        if (me && !me->init(inOwner, inAction, inProvider, inIntIndex)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -456,6 +456,9 @@ IOInterruptEventSource::normalInterruptOccurred
        }
 
        if (reserved->statistics) {
+               if (reserved->statistics->enablePrimaryTimestamp) {
+                       reserved->statistics->primaryTimestamp = mach_absolute_time();
+               }
                if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelCountIndex)) {
                        IA_ADD_VALUE(&reserved->statistics->interruptStatistics[kInterruptAccountingFirstLevelCountIndex], 1);
                }
@@ -484,6 +487,9 @@ IOInterruptEventSource::disableInterruptOccurred
        }
 
        if (reserved->statistics) {
+               if (reserved->statistics->enablePrimaryTimestamp) {
+                       reserved->statistics->primaryTimestamp = mach_absolute_time();
+               }
                if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelCountIndex)) {
                        IA_ADD_VALUE(&reserved->statistics->interruptStatistics[kInterruptAccountingFirstLevelCountIndex], 1);
                }
@@ -498,12 +504,12 @@ IOInterruptEventSource::disableInterruptOccurred
 
 void
 IOInterruptEventSource::interruptOccurred
-(void *refcon, IOService *prov, int source)
+(void *_refcon, IOService *prov, int source)
 {
        if (autoDisable && prov) {
-               disableInterruptOccurred(refcon, prov, source);
+               disableInterruptOccurred(_refcon, prov, source);
        } else {
-               normalInterruptOccurred(refcon, prov, source);
+               normalInterruptOccurred(_refcon, prov, source);
        }
 }
 
@@ -513,3 +519,20 @@ IOInterruptEventSource::warmCPU
 {
        return ml_interrupt_prewarm(abstime);
 }
+
+void
+IOInterruptEventSource::enablePrimaryInterruptTimestamp(bool enable)
+{
+       if (reserved->statistics) {
+               reserved->statistics->enablePrimaryTimestamp = enable;
+       }
+}
+
+uint64_t
+IOInterruptEventSource::getPimaryInterruptTimestamp()
+{
+       if (reserved->statistics && reserved->statistics->enablePrimaryTimestamp) {
+               return reserved->statistics->primaryTimestamp;
+       }
+       return -1ULL;
+}
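
The hunks above add an opt-in first-level ("primary") interrupt timestamp: when enabled, both interrupt paths record mach_absolute_time() into the shared statistics block, and the getter returns -1ULL while disabled. A hedged driver-side sketch (note the shipped spelling getPimaryInterruptTimestamp; the surrounding driver code is hypothetical):

    // Hypothetical usage; only the two methods come from the hunk above.
    src->enablePrimaryInterruptTimestamp(true);
    // ... after an interrupt has been delivered ...
    uint64_t ts = src->getPimaryInterruptTimestamp();
    if (ts != -1ULL) {
        // ts is the mach_absolute_time() captured at first-level dispatch
    }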
index bfbac5edf9387775e46503d9371ebbf0b3bafdda..69e82fbec2cda217b0e02ff63c98cfea3c982ba3 100644 (file)
@@ -174,7 +174,7 @@ OSObject * IOKitDiagnostics::diagnostics( void )
        diags = new IOKitDiagnostics;
        if (diags && !diags->init()) {
                diags->release();
-               diags = 0;
+               diags = NULL;
        }
 
        return diags;
@@ -304,7 +304,7 @@ IOTRecursiveLockLock(IOTRecursiveLock * lock)
                lock->count++;
        } else {
                lck_mtx_lock(lock->mutex);
-               assert(lock->thread == 0);
+               assert(lock->thread == NULL);
                assert(lock->count == 0);
                lock->thread = current_thread();
                lock->count = 1;
@@ -316,7 +316,7 @@ IOTRecursiveLockUnlock(IOTRecursiveLock * lock)
 {
        assert(lock->thread == current_thread());
        if (0 == (--lock->count)) {
-               lock->thread = 0;
+               lock->thread = NULL;
                lck_mtx_unlock(lock->mutex);
        }
 }
@@ -488,13 +488,13 @@ IOTrackingAddUser(IOTrackingQueue * queue, IOTrackingUser * mem, vm_size_t size)
 
        assert(!mem->link.next);
 
-       num = backtrace(&mem->bt[0], kIOTrackingCallSiteBTs);
+       num = backtrace(&mem->bt[0], kIOTrackingCallSiteBTs, NULL);
        num = 0;
        if ((kernel_task != current_task()) && (self = proc_self())) {
-               bool user_64;
+               bool user_64 = false;
                mem->btPID  = proc_pid(self);
                (void)backtrace_user(&mem->btUser[0], kIOTrackingCallSiteBTs - 1, &num,
-                   &user_64);
+                   &user_64, NULL);
                mem->user32 = !user_64;
                proc_rele(self);
        }
@@ -545,7 +545,7 @@ IOTrackingAdd(IOTrackingQueue * queue, IOTracking * mem, size_t size, bool addre
 
        assert(!mem->link.next);
 
-       num  = backtrace(&bt[0], kIOTrackingCallSiteBTs + 1);
+       num  = backtrace(&bt[0], kIOTrackingCallSiteBTs + 1, NULL);
        if (!num) {
                return;
        }
@@ -1083,9 +1083,9 @@ IOTrackingDebug(uint32_t selector, uint32_t options, uint64_t value,
        OSData                 * data;
 
        if (result) {
-               *result = 0;
+               *result = NULL;
        }
-       data = 0;
+       data = NULL;
        ret = kIOReturnNotReady;
 
 #if IOTRACKING
@@ -1426,7 +1426,7 @@ IOUserClient * IOKitDiagnosticsClient::withTask(task_t owningTask)
        inst = new IOKitDiagnosticsClient;
        if (inst && !inst->init()) {
                inst->release();
-               inst = 0;
+               inst = NULL;
        }
 
        return inst;
@@ -1464,7 +1464,7 @@ IOKitDiagnosticsClient::externalMethod(uint32_t selector, IOExternalMethodArgume
                return kIOReturnBadArgument;
        }
 
-       names = 0;
+       names = NULL;
        namesLen = args->structureInputSize - sizeof(IOKitDiagnosticsParameters);
        if (namesLen) {
                names = (typeof(names))(params + 1);
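
In the IOTracking hunks above, backtrace() and backtrace_user() gain a trailing out-parameter (passed NULL when the caller has no use for it), and user_64 is now initialized to false so it cannot be read uninitialized if backtrace_user() returns early. A hedged sketch of the updated kernel-side call:

    // Sketch only; the trailing NULL discards the new optional out-parameter.
    uintptr_t bt[kIOTrackingCallSiteBTs];
    unsigned int frames = backtrace(&bt[0], kIOTrackingCallSiteBTs, NULL);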
index 436b19793d7d347fc6895417a64dd1a7ddb0eae8..f1a0d882dcc13d23ffac9ccab6321f0f46799d46 100644 (file)
@@ -167,6 +167,7 @@ struct IOMemoryDescriptorReserved {
        uint64_t                      kernReserved[4];
        vm_tag_t                      kernelTag;
        vm_tag_t                      userTag;
+       task_t                        creator;
 };
 
 struct iopa_t {
@@ -206,6 +207,7 @@ extern bool gCPUsRunning;
 extern OSSet * gIORemoveOnReadProperties;
 
 extern "C" void IOKitInitializeTime( void );
+extern void IOMachPortInitialize(void);
 
 extern "C" OSString * IOCopyLogNameForPID(int pid);
 
index 0dedff70fefd55cb0371c3524f6799623f5f9b0c..16459d58520459faa4d18769f5e705e010938bac 100644 (file)
@@ -256,6 +256,19 @@ IOExitThread(void)
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
+void *
+IOMallocZero(vm_size_t size)
+{
+       void * result;
+       result = IOMalloc(size);
+       if (result) {
+               bzero(result, size);
+       }
+       return result;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
 #if IOTRACKING
 struct IOLibMallocHeader {
        IOTrackingAddress tracking;
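
The new IOMallocZero() pairs IOMalloc() with bzero() so callers get zero-filled memory in one step. A hedged usage sketch (the struct is illustrative):

    // Illustrative caller: all fields start zeroed, no separate bzero() needed.
    struct dev_state { uint32_t flags; void * buffer; };
    struct dev_state * s = (struct dev_state *) IOMallocZero(sizeof(*s));
    if (s != NULL) {
        // ... use s ...
        IOFree(s, sizeof(*s));
    }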
@@ -390,7 +403,7 @@ IOMallocAligned(vm_size_t size, vm_size_t alignment)
        IOLibPageMallocHeader * hdr;
 
        if (size == 0) {
-               return 0;
+               return NULL;
        }
 
        alignment = (1UL << log2up(alignment));
@@ -672,7 +685,7 @@ IOMallocContiguous(vm_size_t size, vm_size_t alignment,
        mach_vm_address_t   address = 0;
 
        if (size == 0) {
-               return 0;
+               return NULL;
        }
        if (alignment == 0) {
                alignment = 1;
@@ -852,10 +865,10 @@ IOMallocPageablePages(vm_size_t size, vm_size_t alignment, vm_tag_t tag)
        struct IOMallocPageableRef ref;
 
        if (alignment > page_size) {
-               return 0;
+               return NULL;
        }
        if (size > kIOPageableMaxMapSize) {
-               return 0;
+               return NULL;
        }
 
        ref.size = size;
@@ -871,7 +884,7 @@ IOMallocPageablePages(vm_size_t size, vm_size_t alignment, vm_tag_t tag)
 vm_map_t
 IOPageableMapForAddress( uintptr_t address )
 {
-       vm_map_t    map = 0;
+       vm_map_t    map = NULL;
        UInt32      index;
 
        for (index = 0; index < gIOKitPageableSpace.count; index++) {
@@ -974,7 +987,7 @@ iopa_allocinpage(iopa_page_t * pa, uint32_t count, uint64_t align)
                pa->avail &= ~((-1ULL << (64 - count)) >> n);
                if (!pa->avail && pa->link.next) {
                        remque(&pa->link);
-                       pa->link.next = 0;
+                       pa->link.next = NULL;
                }
                return n * gIOPageAllocChunkBytes + trunc_page((uintptr_t) pa);
        }
@@ -1068,10 +1081,10 @@ iopa_free(iopa_t * a, uintptr_t addr, vm_size_t bytes)
        }
        pa->avail |= ((-1ULL << (64 - count)) >> chunk);
        if (pa->avail != -2ULL) {
-               pa = 0;
+               pa = NULL;
        } else {
                remque(&pa->link);
-               pa->link.next = 0;
+               pa->link.next = NULL;
                pa->signature = 0;
                a->pagecount--;
                // page to free
@@ -1239,38 +1252,28 @@ void
 IOKitKernelLogBuffer(const char * title, const void * buffer, size_t size,
     void (*output)(const char *format, ...))
 {
+       size_t idx, linestart;
+       enum { bytelen = (sizeof("0xZZ, ") - 1) };
+       char hex[(bytelen * 16) + 1];
        uint8_t c, chars[17];
-       size_t idx;
 
-       output("%s(0x%x):\n", title, size);
+       output("%s(0x%lx):\n", title, size);
+       output("              0     1     2     3     4     5     6     7     8     9     A     B     C     D     E     F\n");
        if (size > 4096) {
                size = 4096;
        }
-       chars[16] = idx = 0;
-       while (true) {
-               if (!(idx & 15)) {
-                       if (idx) {
-                               output(" |%s|\n", chars);
-                       }
-                       if (idx >= size) {
-                               break;
-                       }
-                       output("%04x:  ", idx);
-               } else if (!(idx & 7)) {
-                       output(" ");
-               }
-
-               c =  ((char *)buffer)[idx];
-               output("%02x ", c);
+       chars[16] = 0;
+       for (idx = 0, linestart = 0; idx < size;) {
+               c = ((char *)buffer)[idx];
+               snprintf(&hex[bytelen * (idx & 15)], bytelen + 1, "0x%02x, ", c);
                chars[idx & 15] = ((c >= 0x20) && (c <= 0x7f)) ? c : ' ';
-
                idx++;
-               if ((idx == size) && (idx & 15)) {
-                       chars[idx & 15] = 0;
-                       while (idx & 15) {
-                               idx++;
-                               output("   ");
+               if ((idx == size) || !(idx & 15)) {
+                       if (idx & 15) {
+                               chars[idx & 15] = 0;
                        }
+                       output("/* %04lx: */ %-96s /* |%-16s| */\n", linestart, hex, chars);
+                       linestart += 16;
                }
        }
 }
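
The IOKitKernelLogBuffer() rewrite buffers each 16-byte row into hex[] via snprintf() and emits it as a C-array-style line with an ASCII gutter, so a dump can be pasted straight back into source. Illustratively (column padding approximate), a row for the two bytes 'h', 'i' would print as:

    /* 0000: */ 0x68, 0x69,                                /* |hi              | */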
index 8871fb684155a2afa918001729062bd1ead54973..3b8a953592170cce4a28370f9744cfce036fc368 100644 (file)
@@ -144,18 +144,18 @@ IORecursiveLockAllocWithLockGroup( lck_grp_t * lockGroup )
 {
        _IORecursiveLock * lock;
 
-       if (lockGroup == 0) {
-               return 0;
+       if (lockGroup == NULL) {
+               return NULL;
        }
 
        lock = IONew( _IORecursiveLock, 1 );
        if (!lock) {
-               return 0;
+               return NULL;
        }
 
        lck_mtx_init( &lock->mutex, lockGroup, LCK_ATTR_NULL );
        lock->group = lockGroup;
-       lock->thread = 0;
+       lock->thread = NULL;
        lock->count  = 0;
 
        return (IORecursiveLock *) lock;
@@ -192,7 +192,7 @@ IORecursiveLockLock( IORecursiveLock * _lock)
                lock->count++;
        } else {
                lck_mtx_lock( &lock->mutex );
-               assert( lock->thread == 0 );
+               assert( lock->thread == NULL );
                assert( lock->count == 0 );
                lock->thread = IOThreadSelf();
                lock->count = 1;
@@ -209,7 +209,7 @@ IORecursiveLockTryLock( IORecursiveLock * _lock)
                return true;
        } else {
                if (lck_mtx_try_lock( &lock->mutex )) {
-                       assert( lock->thread == 0 );
+                       assert( lock->thread == NULL );
                        assert( lock->count == 0 );
                        lock->thread = IOThreadSelf();
                        lock->count = 1;
@@ -227,7 +227,7 @@ IORecursiveLockUnlock( IORecursiveLock * _lock)
        assert( lock->thread == IOThreadSelf());
 
        if (0 == (--lock->count)) {
-               lock->thread = 0;
+               lock->thread = NULL;
                lck_mtx_unlock( &lock->mutex );
        }
 }
@@ -250,12 +250,12 @@ IORecursiveLockSleep(IORecursiveLock *_lock, void *event, UInt32 interType)
        assert(lock->thread == IOThreadSelf());
 
        lock->count = 0;
-       lock->thread = 0;
+       lock->thread = NULL;
        res = lck_mtx_sleep(&lock->mutex, LCK_SLEEP_PROMOTED_PRI, (event_t) event, (wait_interrupt_t) interType);
 
        // Must re-establish the recursive lock no matter why we woke up
        // otherwise we would potentially leave the return path corrupted.
-       assert(lock->thread == 0);
+       assert(lock->thread == NULL);
        assert(lock->count == 0);
        lock->thread = IOThreadSelf();
        lock->count = count;
@@ -273,13 +273,13 @@ IORecursiveLockSleepDeadline( IORecursiveLock * _lock, void *event,
        assert(lock->thread == IOThreadSelf());
 
        lock->count = 0;
-       lock->thread = 0;
+       lock->thread = NULL;
        res = lck_mtx_sleep_deadline(&lock->mutex, LCK_SLEEP_PROMOTED_PRI, (event_t) event,
            (wait_interrupt_t) interType, __OSAbsoluteTime(deadline));
 
        // Must re-establish the recursive lock no matter why we woke up
        // otherwise we would potentially leave the return path corrupted.
-       assert(lock->thread == 0);
+       assert(lock->thread == NULL);
        assert(lock->count == 0);
        lock->thread = IOThreadSelf();
        lock->count = count;
@@ -331,6 +331,12 @@ IOSimpleLockInit( IOSimpleLock * lock)
        lck_spin_init( lock, IOLockGroup, LCK_ATTR_NULL);
 }
 
+void
+IOSimpleLockDestroy( IOSimpleLock * lock )
+{
+       lck_spin_destroy(lock, IOLockGroup);
+}
+
 void
 IOSimpleLockFree( IOSimpleLock * lock )
 {
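
The new IOSimpleLockDestroy() covers locks embedded in caller-owned storage: it pairs with IOSimpleLockInit(), while IOSimpleLockAlloc()/IOSimpleLockFree() continue to manage both the lock and its allocation. A hedged sketch:

    // Illustrative: the lock lives inside the caller's struct, so destroy only.
    struct counter { IOSimpleLock lock; int value; };
    static void counter_init(struct counter * c) { IOSimpleLockInit(&c->lock); }
    static void counter_fini(struct counter * c) { IOSimpleLockDestroy(&c->lock); }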
index 1e4e948407256bdf7de6b7ab533f69f41755dc0c..82b1392678cc92a7ee4dd2bd97085669081fd451 100644 (file)
@@ -52,7 +52,7 @@ IOMemoryCursor::withSpecification(SegmentFunction  inSegFunc,
            inMaxTransferSize,
            inAlignment)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -215,7 +215,7 @@ IONaturalMemoryCursor::withSpecification(IOPhysicalLength inMaxSegmentSize,
            inMaxTransferSize,
            inAlignment)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -266,7 +266,7 @@ IOBigMemoryCursor::withSpecification(IOPhysicalLength inMaxSegmentSize,
            inMaxTransferSize,
            inAlignment)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -317,7 +317,7 @@ IOLittleMemoryCursor::withSpecification(IOPhysicalLength inMaxSegmentSize,
            inMaxTransferSize,
            inAlignment)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
index 3ff1f79ca0d4ecaa1076a129ca7456c18c7baf6e..d73a4343b5961696c3ef3dfda5bd4d0f53bd570a 100644 (file)
@@ -63,6 +63,7 @@ __BEGIN_DECLS
 
 #include <mach/vm_prot.h>
 #include <mach/mach_vm.h>
+#include <mach/memory_entry.h>
 #include <vm/vm_fault.h>
 #include <vm/vm_protos.h>
 
@@ -151,7 +152,7 @@ struct ioGMDData {
        __attribute__((aligned(sizeof(upl_t))))
 #endif
        ;
-       ioPLBlock fBlocks[1];
+       //ioPLBlock fBlocks[1];
 };
 
 #define getDataP(osd)   ((ioGMDData *) (osd)->getBytesNoCopy())
@@ -301,80 +302,55 @@ purgeableStateBits(int * state)
        return err;
 }
 
+typedef struct {
+       unsigned int wimg;
+       unsigned int object_type;
+} iokit_memtype_entry;
+
+static const iokit_memtype_entry iomd_mem_types[] = {
+       [kIODefaultCache] = {VM_WIMG_DEFAULT, MAP_MEM_NOOP},
+       [kIOInhibitCache] = {VM_WIMG_IO, MAP_MEM_IO},
+       [kIOWriteThruCache] = {VM_WIMG_WTHRU, MAP_MEM_WTHRU},
+       [kIOWriteCombineCache] = {VM_WIMG_WCOMB, MAP_MEM_WCOMB},
+       [kIOCopybackCache] = {VM_WIMG_COPYBACK, MAP_MEM_COPYBACK},
+       [kIOCopybackInnerCache] = {VM_WIMG_INNERWBACK, MAP_MEM_INNERWBACK},
+       [kIOPostedWrite] = {VM_WIMG_POSTED, MAP_MEM_POSTED},
+       [kIORealTimeCache] = {VM_WIMG_RT, MAP_MEM_RT},
+       [kIOPostedReordered] = {VM_WIMG_POSTED_REORDERED, MAP_MEM_POSTED_REORDERED},
+       [kIOPostedCombinedReordered] = {VM_WIMG_POSTED_COMBINED_REORDERED, MAP_MEM_POSTED_COMBINED_REORDERED},
+};
 
 static vm_prot_t
 vmProtForCacheMode(IOOptionBits cacheMode)
 {
+       assert(cacheMode < (sizeof(iomd_mem_types) / sizeof(iomd_mem_types[0])));
        vm_prot_t prot = 0;
-       switch (cacheMode) {
-       case kIOInhibitCache:
-               SET_MAP_MEM(MAP_MEM_IO, prot);
-               break;
-
-       case kIOWriteThruCache:
-               SET_MAP_MEM(MAP_MEM_WTHRU, prot);
-               break;
-
-       case kIOWriteCombineCache:
-               SET_MAP_MEM(MAP_MEM_WCOMB, prot);
-               break;
-
-       case kIOCopybackCache:
-               SET_MAP_MEM(MAP_MEM_COPYBACK, prot);
-               break;
-
-       case kIOCopybackInnerCache:
-               SET_MAP_MEM(MAP_MEM_INNERWBACK, prot);
-               break;
-
-       case kIOPostedWrite:
-               SET_MAP_MEM(MAP_MEM_POSTED, prot);
-               break;
-
-       case kIODefaultCache:
-       default:
-               SET_MAP_MEM(MAP_MEM_NOOP, prot);
-               break;
-       }
-
+       SET_MAP_MEM(iomd_mem_types[cacheMode].object_type, prot);
        return prot;
 }
 
 static unsigned int
 pagerFlagsForCacheMode(IOOptionBits cacheMode)
 {
-       unsigned int pagerFlags = 0;
-       switch (cacheMode) {
-       case kIOInhibitCache:
-               pagerFlags = DEVICE_PAGER_CACHE_INHIB |  DEVICE_PAGER_COHERENT | DEVICE_PAGER_GUARDED;
-               break;
-
-       case kIOWriteThruCache:
-               pagerFlags = DEVICE_PAGER_WRITE_THROUGH | DEVICE_PAGER_COHERENT | DEVICE_PAGER_GUARDED;
-               break;
-
-       case kIOWriteCombineCache:
-               pagerFlags = DEVICE_PAGER_CACHE_INHIB | DEVICE_PAGER_COHERENT;
-               break;
-
-       case kIOCopybackCache:
-               pagerFlags = DEVICE_PAGER_COHERENT;
-               break;
-
-       case kIOCopybackInnerCache:
-               pagerFlags = DEVICE_PAGER_COHERENT;
-               break;
-
-       case kIOPostedWrite:
-               pagerFlags = DEVICE_PAGER_CACHE_INHIB |  DEVICE_PAGER_COHERENT | DEVICE_PAGER_GUARDED | DEVICE_PAGER_EARLY_ACK;
-               break;
+       assert(cacheMode < (sizeof(iomd_mem_types) / sizeof(iomd_mem_types[0])));
+       if (cacheMode == kIODefaultCache) {
+               return -1U;
+       }
+       return iomd_mem_types[cacheMode].wimg;
+}
 
-       case kIODefaultCache:
-       default:
-               pagerFlags = -1U;
-               break;
+static IOOptionBits
+cacheModeForPagerFlags(unsigned int pagerFlags)
+{
+       pagerFlags &= VM_WIMG_MASK;
+       IOOptionBits cacheMode = kIODefaultCache;
+       for (IOOptionBits i = 0; i < (sizeof(iomd_mem_types) / sizeof(iomd_mem_types[0])); ++i) {
+               if (iomd_mem_types[i].wimg == pagerFlags) {
+                       cacheMode = i;
+                       break;
+               }
        }
-       return pagerFlags;
+       return (cacheMode == kIODefaultCache) ? kIOCopybackCache : cacheMode;
 }
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
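
The three cache-mode switch statements collapse into iomd_mem_types[], a designated-initializer table indexed by cache mode, with cacheModeForPagerFlags() as the reverse lookup; the reverse path deliberately reports kIOCopybackCache rather than kIODefaultCache so callers always receive a concrete mode. Illustrative lookups against the table above:

    unsigned int wimg = iomd_mem_types[kIOWriteThruCache].wimg;        // VM_WIMG_WTHRU
    unsigned int objt = iomd_mem_types[kIOInhibitCache].object_type;   // MAP_MEM_IO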
@@ -431,7 +407,7 @@ IOGeneralMemoryDescriptor::memoryReferenceAlloc(uint32_t capacity, IOMemoryRefer
                OSIncrementAtomic(&gIOMemoryReferenceCount);
        }
        if (!ref) {
-               return 0;
+               return NULL;
        }
        ref->capacity = capacity;
        return ref;
@@ -445,7 +421,7 @@ IOGeneralMemoryDescriptor::memoryReferenceFree(IOMemoryReference * ref)
 
        if (ref->mapRef) {
                memoryReferenceFree(ref->mapRef);
-               ref->mapRef = 0;
+               ref->mapRef = NULL;
        }
 
        entries = ref->entries + ref->count;
@@ -496,6 +472,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate(
        IOOptionBits         cacheMode;
        unsigned int         pagerFlags;
        vm_tag_t             tag;
+       vm_named_entry_kernel_flags_t vmne_kflags;
 
        ref = memoryReferenceAlloc(kCapacity, NULL);
        if (!ref) {
@@ -503,6 +480,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate(
        }
 
        tag = getVMTag(kernel_map);
+       vmne_kflags = VM_NAMED_ENTRY_KERNEL_FLAGS_NONE;
        entries = &ref->entries[0];
        count = 0;
        err = KERN_SUCCESS;
@@ -517,21 +495,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate(
 
                // default cache mode for physical
                if (kIODefaultCache == ((_flags & kIOMemoryBufferCacheMask) >> kIOMemoryBufferCacheShift)) {
-                       IOOptionBits mode;
-                       pagerFlags = IODefaultCacheBits(nextAddr);
-                       if (DEVICE_PAGER_CACHE_INHIB & pagerFlags) {
-                               if (DEVICE_PAGER_EARLY_ACK & pagerFlags) {
-                                       mode = kIOPostedWrite;
-                               } else if (DEVICE_PAGER_GUARDED & pagerFlags) {
-                                       mode = kIOInhibitCache;
-                               } else {
-                                       mode = kIOWriteCombineCache;
-                               }
-                       } else if (DEVICE_PAGER_WRITE_THROUGH & pagerFlags) {
-                               mode = kIOWriteThruCache;
-                       } else {
-                               mode = kIOCopybackCache;
-                       }
+                       IOOptionBits mode = cacheModeForPagerFlags(IODefaultCacheBits(nextAddr));
                        _flags |= (mode << kIOMemoryBufferCacheShift);
                }
        }
@@ -554,6 +518,10 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate(
                prot |= MAP_MEM_VM_COPY;
        }
 
+       if (kIOMemoryUseReserve & _flags) {
+               prot |= MAP_MEM_GRAB_SECLUDED;
+       }
+
        if ((kIOMemoryReferenceReuse & options) && _memRef) {
                cloneEntries = &_memRef->entries[0];
                prot |= MAP_MEM_NAMED_REUSE;
@@ -563,14 +531,36 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate(
                // virtual ranges
 
                if (kIOMemoryBufferPageable & _flags) {
+                       int ledger_tag, ledger_no_footprint;
+
                        // IOBufferMemoryDescriptor alloc - set flags for entry + object create
                        prot |= MAP_MEM_NAMED_CREATE;
+
+                       // default accounting settings:
+                       //   + "none" ledger tag
+                       //   + include in footprint
+                       // can be changed later with ::setOwnership()
+                       ledger_tag = VM_LEDGER_TAG_NONE;
+                       ledger_no_footprint = 0;
+
                        if (kIOMemoryBufferPurgeable & _flags) {
                                prot |= (MAP_MEM_PURGABLE | MAP_MEM_PURGABLE_KERNEL_ONLY);
                                if (VM_KERN_MEMORY_SKYWALK == tag) {
-                                       prot |= MAP_MEM_LEDGER_TAG_NETWORK;
+                                       // Skywalk purgeable memory accounting:
+                                       //    + "network" ledger tag
+                                       //    + not included in footprint
+                                       ledger_tag = VM_LEDGER_TAG_NETWORK;
+                                       ledger_no_footprint = 1;
+                               } else {
+                                       // regular purgeable memory accounting:
+                                       //    + no ledger tag
+                                       //    + included in footprint
+                                       ledger_tag = VM_LEDGER_TAG_NONE;
+                                       ledger_no_footprint = 0;
                                }
                        }
+                       vmne_kflags.vmnekf_ledger_tag = ledger_tag;
+                       vmne_kflags.vmnekf_ledger_no_footprint = ledger_no_footprint;
                        if (kIOMemoryUseReserve & _flags) {
                                prot |= MAP_MEM_GRAB_SECLUDED;
                        }
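
Purgeable pageable buffers now carry explicit ledger accounting through the new vmne_kflags argument: Skywalk (VM_KERN_MEMORY_SKYWALK) allocations are tagged "network" and excluded from the owner's phys_footprint, while everything else keeps the default tag and stays in the footprint. The flag wiring in miniature, using only names from the hunk above:

    vm_named_entry_kernel_flags_t f = VM_NAMED_ENTRY_KERNEL_FLAGS_NONE;
    f.vmnekf_ledger_tag = VM_LEDGER_TAG_NETWORK;   // charge to the network ledger
    f.vmnekf_ledger_no_footprint = 1;              // exclude from phys_footprint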
@@ -614,7 +604,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate(
                                }
 
                                err = mach_make_memory_entry_internal(map,
-                                   &actualSize, entryAddr, prot, &entry, cloneEntry);
+                                   &actualSize, entryAddr, prot, vmne_kflags, &entry, cloneEntry);
 
                                if (KERN_SUCCESS != err) {
                                        break;
@@ -649,7 +639,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate(
        } else {
                // _task == 0, physical or kIOMemoryTypeUPL
                memory_object_t pager;
-               vm_size_t       size = ptoa_32(_pages);
+               vm_size_t       size = ptoa_64(_pages);
 
                if (!getKernelReserved()) {
                        panic("getKernelReserved");
@@ -666,7 +656,7 @@ IOGeneralMemoryDescriptor::memoryReferenceCreate(
                        pagerFlags |= DEVICE_PAGER_CONTIGUOUS;
                }
 
-               pager = device_pager_setup((memory_object_t) 0, (uintptr_t) reserved,
+               pager = device_pager_setup((memory_object_t) NULL, (uintptr_t) reserved,
                    size, pagerFlags);
                assert(pager);
                if (!pager) {
@@ -1115,6 +1105,30 @@ IOGeneralMemoryDescriptor::memoryReferenceSetPurgeable(
        return err;
 }
 
+IOReturn
+IOGeneralMemoryDescriptor::memoryReferenceSetOwnership(
+       IOMemoryReference * ref,
+       task_t              newOwner,
+       int                 newLedgerTag,
+       IOOptionBits        newLedgerOptions)
+{
+       IOReturn        err, totalErr;
+       IOMemoryEntry * entries;
+
+       totalErr = kIOReturnSuccess;
+       entries = ref->entries + ref->count;
+       while (entries > &ref->entries[0]) {
+               entries--;
+
+               err = mach_memory_entry_ownership(entries->entry, newOwner, newLedgerTag, newLedgerOptions);
+               if (KERN_SUCCESS != err) {
+                       totalErr = err;
+               }
+       }
+
+       return totalErr;
+}
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 IOMemoryDescriptor *
@@ -1141,7 +1155,7 @@ IOMemoryDescriptor::withAddress(IOVirtualAddress address,
 
                that->release();
        }
-       return 0;
+       return NULL;
 }
 #endif /* !__LP64__ */
 
@@ -1170,7 +1184,7 @@ IOMemoryDescriptor::withRanges( IOVirtualRange * ranges,
 
                that->release();
        }
-       return 0;
+       return NULL;
 }
 #endif /* !__LP64__ */
 
@@ -1198,14 +1212,14 @@ IOMemoryDescriptor::withAddressRanges(IOAddressRange *   ranges,
                        options |= kIOMemoryTypePhysical64;
                }
 
-               if (that->initWithOptions(ranges, rangeCount, 0, task, options, /* mapper */ 0)) {
+               if (that->initWithOptions(ranges, rangeCount, 0, task, options, /* mapper */ NULL)) {
                        return that;
                }
 
                that->release();
        }
 
-       return 0;
+       return NULL;
 }
 
 
@@ -1230,7 +1244,7 @@ IOMemoryDescriptor::withOptions(void *          buffers,
        if (self
            && !self->initWithOptions(buffers, count, offset, task, opts, mapper)) {
                self->release();
-               return 0;
+               return NULL;
        }
 
        return self;
@@ -1262,7 +1276,7 @@ IOMemoryDescriptor::withPhysicalRanges( IOPhysicalRange * ranges,
 
                that->release();
        }
-       return 0;
+       return NULL;
 }
 
 IOMemoryDescriptor *
@@ -1285,7 +1299,7 @@ IOMemoryDescriptor::withPersistentMemoryDescriptor(IOMemoryDescriptor *originalM
                return IOGeneralMemoryDescriptor::
                       withPersistentMemoryDescriptor(origGenMD);
        } else {
-               return 0;
+               return NULL;
        }
 }
 
@@ -1295,7 +1309,7 @@ IOGeneralMemoryDescriptor::withPersistentMemoryDescriptor(IOGeneralMemoryDescrip
        IOMemoryReference * memRef;
 
        if (kIOReturnSuccess != originalMD->memoryReferenceCreate(kIOMemoryReferenceReuse, &memRef)) {
-               return 0;
+               return NULL;
        }
 
        if (memRef == originalMD->_memRef) {
@@ -1308,9 +1322,9 @@ IOGeneralMemoryDescriptor::withPersistentMemoryDescriptor(IOGeneralMemoryDescrip
        IOMDPersistentInitData initData = { originalMD, memRef };
 
        if (self
-           && !self->initWithOptions(&initData, 1, 0, 0, kIOMemoryTypePersistentMD, 0)) {
+           && !self->initWithOptions(&initData, 1, 0, NULL, kIOMemoryTypePersistentMD, NULL)) {
                self->release();
-               self = 0;
+               self = NULL;
        }
        return self;
 }
@@ -1364,7 +1378,7 @@ IOGeneralMemoryDescriptor::initWithPhysicalRanges(
                mdOpts |= kIOMemoryAsReference;
        }
 
-       return initWithOptions(ranges, count, 0, 0, mdOpts, /* mapper */ 0);
+       return initWithOptions(ranges, count, 0, NULL, mdOpts, /* mapper */ NULL);
 }
 
 bool
@@ -1394,7 +1408,7 @@ IOGeneralMemoryDescriptor::initWithRanges(
                mdOpts |= kIOMemoryTypePhysical;
        }
 
-       return initWithOptions(ranges, count, 0, task, mdOpts, /* mapper */ 0);
+       return initWithOptions(ranges, count, 0, task, mdOpts, /* mapper */ NULL);
 }
 #endif /* !__LP64__ */
 
@@ -1519,7 +1533,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void *       buffers,
                if (!(kIOMemoryRedirected & options)) {
                        if (_memRef) {
                                memoryReferenceRelease(_memRef);
-                               _memRef = 0;
+                               _memRef = NULL;
                        }
                        if (_mappings) {
                                _mappings->flushCollection();
@@ -1537,7 +1551,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void *       buffers,
                options |= kIOMemoryMapperNone;
        }
        if (kIOMemoryMapperNone & options) {
-               mapper = 0; // No Mapper
+               mapper = NULL; // No Mapper
        } else if (mapper == kIOMapperSystem) {
                IOMapper::checkForSystemMapper();
                gIOSystemMapper = mapper = IOMapper::gSystem;
@@ -1780,7 +1794,7 @@ IOGeneralMemoryDescriptor::free()
 
        if (reserved) {
                LOCK;
-               reserved->dp.memory = 0;
+               reserved->dp.memory = NULL;
                UNLOCK;
        }
        if ((kIOMemoryTypePhysical == type) || (kIOMemoryTypePhysical64 == type)) {
@@ -1816,6 +1830,7 @@ IOGeneralMemoryDescriptor::free()
        }
 
        if (reserved) {
+               cleanKernelReserved(reserved);
                if (reserved->dp.devicePager) {
                        // memEntry holds a ref on the device pager which owns reserved
                        // (IOMemoryDescriptorReserved) so no reserved access after this point
@@ -2075,19 +2090,26 @@ IOGeneralMemoryDescriptor::getPreparationID( void )
        }
 
        if (kIOPreparationIDUnprepared == dataP->fPreparationID) {
-               dataP->fPreparationID = OSIncrementAtomic64(&gIOMDPreparationID);
+               SInt64 newID = OSIncrementAtomic64(&gIOMDPreparationID);
+               OSCompareAndSwap64(kIOPreparationIDUnprepared, newID, &dataP->fPreparationID);
        }
        return dataP->fPreparationID;
 }
 
+void
+IOMemoryDescriptor::cleanKernelReserved( IOMemoryDescriptorReserved * reserved )
+{
+       if (reserved->creator) {
+               task_deallocate(reserved->creator);
+               reserved->creator = NULL;
+       }
+}
+
 IOMemoryDescriptorReserved *
 IOMemoryDescriptor::getKernelReserved( void )
 {
        if (!reserved) {
-               reserved = IONew(IOMemoryDescriptorReserved, 1);
-               if (reserved) {
-                       bzero(reserved, sizeof(IOMemoryDescriptorReserved));
-               }
+               reserved = IONewZero(IOMemoryDescriptorReserved, 1);
        }
        return reserved;
 }
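
Both preparation-ID paths switch from a blind store of OSIncrementAtomic64() to a compare-and-swap: if two callers race, only the first store succeeds and both then read the same ID, instead of the second silently overwriting the first. The pattern in miniature (the helper is illustrative):

    static SInt64 gIDCounter;
    static uint64_t
    lazyID(volatile UInt64 * slot)
    {
        if (kIOPreparationIDUnprepared == *slot) {
            SInt64 newID = OSIncrementAtomic64(&gIDCounter);
            // Succeeds only while *slot is still Unprepared; losers keep the winner's ID.
            OSCompareAndSwap64(kIOPreparationIDUnprepared, newID, slot);
        }
        return *slot;
    }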
@@ -2096,7 +2118,8 @@ void
 IOMemoryDescriptor::setPreparationID( void )
 {
        if (getKernelReserved() && (kIOPreparationIDUnprepared == reserved->preparationID)) {
-               reserved->preparationID = OSIncrementAtomic64(&gIOMDPreparationID);
+               SInt64 newID = OSIncrementAtomic64(&gIOMDPreparationID);
+               OSCompareAndSwap64(kIOPreparationIDUnprepared, newID, &reserved->preparationID);
        }
 }
 
@@ -2111,22 +2134,22 @@ IOMemoryDescriptor::getPreparationID( void )
 }
 
 void
-IOMemoryDescriptor::setVMTags(vm_tag_t kernelTag, vm_tag_t userTag)
+IOMemoryDescriptor::setVMTags(uint32_t kernelTag, uint32_t userTag)
 {
-       _kernelTag = kernelTag;
-       _userTag   = userTag;
+       _kernelTag = (vm_tag_t) kernelTag;
+       _userTag   = (vm_tag_t) userTag;
 }
 
-vm_tag_t
+uint32_t
 IOMemoryDescriptor::getVMTag(vm_map_t map)
 {
        if (vm_kernel_map_is_kernel(map)) {
                if (VM_KERN_MEMORY_NONE != _kernelTag) {
-                       return _kernelTag;
+                       return (uint32_t) _kernelTag;
                }
        } else {
                if (VM_KERN_MEMORY_NONE != _userTag) {
-                       return _userTag;
+                       return (uint32_t) _userTag;
                }
        }
        return IOMemoryTag(map);
@@ -2282,9 +2305,9 @@ IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void *vData, UI
        // Get the next segment
        struct InternalState {
                IOMDDMAWalkSegmentArgs fIO;
-               UInt fOffset2Index;
+               mach_vm_size_t fOffset2Index;
+               mach_vm_size_t fNextOffset;
                UInt fIndex;
-               UInt fNextOffset;
        } *isP;
 
        // Find the next segment
@@ -2293,7 +2316,7 @@ IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void *vData, UI
        }
 
        isP = (InternalState *) vData;
-       UInt offset = isP->fIO.fOffset;
+       mach_vm_size_t offset = isP->fIO.fOffset;
        uint8_t mapped = isP->fIO.fMapped;
        uint64_t mappedBase;
 
@@ -2343,7 +2366,8 @@ IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void *vData, UI
        }
 
        // Validate the previous offset
-       UInt ind, off2Ind = isP->fOffset2Index;
+       UInt ind;
+       mach_vm_size_t off2Ind = isP->fOffset2Index;
        if (!params
            && offset
            && (offset == isP->fNextOffset || off2Ind <= offset)) {
@@ -2351,7 +2375,7 @@ IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void *vData, UI
        } else {
                ind = off2Ind = 0; // Start from beginning
        }
-       UInt length;
+       mach_vm_size_t length;
        UInt64 address;
 
        if ((_flags & kIOMemoryTypeMask) == kIOMemoryTypePhysical) {
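
The DMA walker's InternalState offsets widen from UInt to mach_vm_size_t: a 32-bit UInt wraps once a descriptor's offset reaches 4 GiB, breaking segment walks over very large memory. The failure mode in miniature:

    UInt           narrow = (UInt) 0x100000000ULL;   // wraps to 0
    mach_vm_size_t wide   = 0x100000000ULL;          // keeps the full 4 GiB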
@@ -2678,7 +2702,7 @@ IOMemoryDescriptor::getPhysicalSegment64(IOByteCount offset, IOByteCount *length
        IOPhysicalAddress phys32;
        IOByteCount       length;
        addr64_t          phys64;
-       IOMapper *        mapper = 0;
+       IOMapper *        mapper = NULL;
 
        phys32 = getPhysicalSegment(offset, lengthOfSegment);
        if (!phys32) {
@@ -2736,7 +2760,7 @@ IOGeneralMemoryDescriptor::getVirtualSegment(IOByteCount offset,
                panic("IOGMD::getVirtualSegment deprecated");
        }
 
-       return 0;
+       return NULL;
 }
 #pragma clang diagnostic pop
 #endif /* !__LP64__ */
@@ -2891,6 +2915,68 @@ IOMemoryDescriptor::setPurgeable( IOOptionBits newState,
        return err;
 }
 
+IOReturn
+IOGeneralMemoryDescriptor::setOwnership( task_t newOwner,
+    int newLedgerTag,
+    IOOptionBits newLedgerOptions )
+{
+       IOReturn      err = kIOReturnSuccess;
+
+       assert(!(kIOMemoryRemote & _flags));
+       if (kIOMemoryRemote & _flags) {
+               return kIOReturnNotAttached;
+       }
+
+       if (iokit_iomd_setownership_enabled == FALSE) {
+               return kIOReturnUnsupported;
+       }
+
+       if (_memRef) {
+               err = super::setOwnership(newOwner, newLedgerTag, newLedgerOptions);
+       } else {
+               err = kIOReturnUnsupported;
+       }
+
+       return err;
+}
+
+IOReturn
+IOMemoryDescriptor::setOwnership( task_t newOwner,
+    int newLedgerTag,
+    IOOptionBits newLedgerOptions )
+{
+       IOReturn err = kIOReturnNotReady;
+
+       assert(!(kIOMemoryRemote & _flags));
+       if (kIOMemoryRemote & _flags) {
+               return kIOReturnNotAttached;
+       }
+
+       if (iokit_iomd_setownership_enabled == FALSE) {
+               return kIOReturnUnsupported;
+       }
+
+       if (kIOMemoryThreadSafe & _flags) {
+               LOCK;
+       }
+       if (_memRef) {
+               err = IOGeneralMemoryDescriptor::memoryReferenceSetOwnership(_memRef, newOwner, newLedgerTag, newLedgerOptions);
+       } else {
+               IOMultiMemoryDescriptor * mmd;
+               IOSubMemoryDescriptor   * smd;
+               if ((smd = OSDynamicCast(IOSubMemoryDescriptor, this))) {
+                       err = smd->setOwnership(newOwner, newLedgerTag, newLedgerOptions);
+               } else if ((mmd = OSDynamicCast(IOMultiMemoryDescriptor, this))) {
+                       err = mmd->setOwnership(newOwner, newLedgerTag, newLedgerOptions);
+               }
+       }
+       if (kIOMemoryThreadSafe & _flags) {
+               UNLOCK;
+       }
+
+       return err;
+}
+
 IOReturn
 IOMemoryDescriptor::getPageCounts( IOByteCount * residentPageCount,
     IOByteCount * dirtyPageCount )
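
The new setOwnership() path lets a memory descriptor's pages be billed to another task: IOMemoryDescriptor::setOwnership() forwards to memoryReferenceSetOwnership(), which calls mach_memory_entry_ownership() on each named entry and reports the last failure, with sub- and multi-descriptors delegating to their backing descriptors and the whole feature gated by iokit_iomd_setownership_enabled. A hedged caller sketch (task handle and option value illustrative):

    // Illustrative: charge md's pages to ownerTask under the "network" ledger tag;
    // 0 is assumed here to mean default ledger options.
    IOReturn ret = md->setOwnership(ownerTask, VM_LEDGER_TAG_NETWORK, 0);
    if (kIOReturnUnsupported == ret) {
        // feature disabled, or this descriptor type doesn't support ownership transfer
    }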
@@ -2962,9 +3048,9 @@ IOMemoryDescriptor::performOperation( IOOptionBits options,
 {
        IOByteCount remaining;
        unsigned int res;
-       void (*func)(addr64_t pa, unsigned int count) = 0;
+       void (*func)(addr64_t pa, unsigned int count) = NULL;
 #if defined(__arm__) || defined(__arm64__)
-       void (*func_ext)(addr64_t pa, unsigned int count, unsigned int remaining, unsigned int *result) = 0;
+       void (*func_ext)(addr64_t pa, unsigned int count, unsigned int remaining, unsigned int *result) = NULL;
 #endif
 
        assert(!(kIOMemoryRemote & _flags));
@@ -3009,7 +3095,7 @@ IOMemoryDescriptor::performOperation( IOOptionBits options,
        }
 
 #if defined(__arm__) || defined(__arm64__)
-       if ((func == 0) && (func_ext == 0)) {
+       if ((func == NULL) && (func_ext == NULL)) {
                return kIOReturnUnsupported;
        }
 #else /* defined(__arm__) || defined(__arm64__) */
@@ -3211,15 +3297,15 @@ IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection)
                if (uplPageSize > ((unsigned int)uplPageSize)) {
                        return kIOReturnNoMemory;
                }
-               if (!_memoryEntries->appendBytes(0, uplPageSize)) {
+               if (!_memoryEntries->appendBytes(NULL, uplPageSize)) {
                        return kIOReturnNoMemory;
                }
-               dataP = 0;
+               dataP = NULL;
 
                // Find the appropriate vm_map for the given task
                vm_map_t curMap;
-               if (_task == kernel_task && (kIOMemoryBufferPageable & _flags)) {
-                       curMap = 0;
+               if ((NULL != _memRef) || ((_task == kernel_task && (kIOMemoryBufferPageable & _flags)))) {
+                       curMap = NULL;
                } else {
                        curMap = get_task_map(_task);
                }
@@ -3230,7 +3316,7 @@ IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection)
                IOByteCount mdOffset    = 0;
                ppnum_t highestPage     = 0;
 
-               IOMemoryEntry * memRefEntry = 0;
+               IOMemoryEntry * memRefEntry = NULL;
                if (_memRef) {
                        memRefEntry = &_memRef->entries[0];
                }
@@ -3356,7 +3442,7 @@ IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection)
                                        }
                                        goto abortExit;
                                }
-                               dataP = 0;
+                               dataP = NULL;
 
                                // Check for a multiple iopl's in one virtual range
                                pageIndex += numPageInfo;
@@ -3433,7 +3519,7 @@ IOGeneralMemoryDescriptor::initMemoryEntries(size_t size, IOMapper * mapper)
                return false;
        }
 
-       _memoryEntries->appendBytes(0, computeDataSize(0, 0));
+       _memoryEntries->appendBytes(NULL, computeDataSize(0, 0));
        dataP = getDataP(_memoryEntries);
 
        if (mapper == kIOMapperWaitSystem) {
@@ -3523,7 +3609,7 @@ IOMemoryDescriptor::dmaUnmap(
        kern_allocation_name_t mapName;
        int16_t prior;
 
-       mapName = 0;
+       mapName = NULL;
        prior = 0;
        if (command) {
                mapName = _mapName;
@@ -3754,7 +3840,7 @@ IOGeneralMemoryDescriptor::complete(IODirection forDirection)
                                                        if (dataP->fCompletionError) {
                                                                upl_abort(ioplList[ind].fIOPL, 0 /*!UPL_ABORT_DUMP_PAGES*/);
                                                        } else {
-                                                               upl_commit(ioplList[ind].fIOPL, 0, 0);
+                                                               upl_commit(ioplList[ind].fIOPL, NULL, 0);
                                                        }
                                                        upl_deallocate(ioplList[ind].fIOPL);
                                                }
@@ -3838,7 +3924,8 @@ IOGeneralMemoryDescriptor::doMap(
                if (!(kIOMapReadOnly & options)) {
                        createOptions |= kIOMemoryReferenceWrite;
 #if DEVELOPMENT || DEBUG
-                       if (kIODirectionOut == (kIODirectionOutIn & _flags)) {
+                       if ((kIODirectionOut == (kIODirectionOutIn & _flags))
+                           && (!reserved || (reserved->creator != mapping->fAddressTask))) {
                                OSReportWithBacktrace("warning: creating writable mapping from IOMemoryDescriptor(kIODirectionOut) - use kIOMapReadOnly or change direction");
                        }
 #endif
@@ -3850,7 +3937,7 @@ IOGeneralMemoryDescriptor::doMap(
        }
 
        memory_object_t pager;
-       pager = (memory_object_t) (reserved ? reserved->dp.devicePager : 0);
+       pager = (memory_object_t) (reserved ? reserved->dp.devicePager : NULL);
 
        // <upl_transpose //
        if ((kIOMapReference | kIOMapUnique) == ((kIOMapReference | kIOMapUnique) & options)) {
@@ -3895,7 +3982,7 @@ IOGeneralMemoryDescriptor::doMap(
                        if (redirUPL2) {
                                upl_commit(redirUPL2, NULL, 0);
                                upl_deallocate(redirUPL2);
-                               redirUPL2 = 0;
+                               redirUPL2 = NULL;
                        }
                        {
                                // swap the memEntries since they now refer to different vm_objects
@@ -3936,7 +4023,7 @@ IOReturn
 IOMemoryMapTracking(IOTrackingUser * tracking, task_t * task,
     mach_vm_address_t * address, mach_vm_size_t * size)
 {
-#define iomap_offsetof(type, field) ((size_t)(&((type *)0)->field))
+#define iomap_offsetof(type, field) ((size_t)(&((type *)NULL)->field))
 
        IOMemoryMap * map = (typeof(map))(((uintptr_t) tracking) - iomap_offsetof(IOMemoryMap, fTracking));
 
@@ -3981,7 +4068,7 @@ OSMetaClassDefineReservedUnused(IOMemoryMap, 7);
 IOPhysicalAddress
 IOMemoryMap::getPhysicalAddress()
 {
-       return getPhysicalSegment( 0, 0 );
+       return getPhysicalSegment( 0, NULL );
 }
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
@@ -4112,7 +4199,7 @@ IOMemoryDescriptor::populateDevicePager(
                // in the middle of the loop only map whole pages
                if (segLen >= bytes) {
                        segLen = bytes;
-               } else if (segLen != trunc_page(segLen)) {
+               } else if (segLen != trunc_page_64(segLen)) {
                        err = kIOReturnVMError;
                }
                if (physAddr != trunc_page_64(physAddr)) {
@@ -4219,7 +4306,7 @@ IOReturn
 IOMemoryDescriptor::redirect( task_t safeTask, bool doRedirect )
 {
        IOReturn            err = kIOReturnSuccess;
-       IOMemoryMap *       mapping = 0;
+       IOMemoryMap *       mapping = NULL;
        OSIterator *        iter;
 
        LOCK;
@@ -4321,7 +4408,7 @@ IOMemoryMap::unmap( void )
 
        LOCK;
 
-       if (fAddress && fAddressMap && (0 == fSuperMap) && fMemory
+       if (fAddress && fAddressMap && (NULL == fSuperMap) && fMemory
            && (0 == (kIOMapStatic & fOptions))) {
                err = fMemory->doUnmap(fAddressMap, (IOVirtualAddress) this, 0);
        } else {
@@ -4330,7 +4417,7 @@ IOMemoryMap::unmap( void )
 
        if (fAddressMap) {
                vm_map_deallocate(fAddressMap);
-               fAddressMap = 0;
+               fAddressMap = NULL;
        }
 
        fAddress = 0;
@@ -4355,9 +4442,9 @@ IOMemoryMap::taskDied( void )
 
        if (fAddressMap) {
                vm_map_deallocate(fAddressMap);
-               fAddressMap = 0;
+               fAddressMap = NULL;
        }
-       fAddressTask = 0;
+       fAddressTask = NULL;
        fAddress     = 0;
        UNLOCK;
 }
@@ -4481,28 +4568,28 @@ IOMemoryMap::copyCompatible(
        mach_vm_size_t      _length   = newMapping->fLength;
 
        if ((!task) || (!fAddressMap) || (fAddressMap != get_task_map(task))) {
-               return 0;
+               return NULL;
        }
        if ((fOptions ^ _options) & kIOMapReadOnly) {
-               return 0;
+               return NULL;
        }
        if ((kIOMapDefaultCache != (_options & kIOMapCacheMask))
            && ((fOptions ^ _options) & kIOMapCacheMask)) {
-               return 0;
+               return NULL;
        }
 
        if ((0 == (_options & kIOMapAnywhere)) && (fAddress != toAddress)) {
-               return 0;
+               return NULL;
        }
 
        if (_offset < fOffset) {
-               return 0;
+               return NULL;
        }
 
        _offset -= fOffset;
 
        if ((_offset + _length) > fLength) {
-               return 0;
+               return NULL;
        }
 
        retain();
@@ -4569,7 +4656,7 @@ IOMemoryMap::getPhysicalSegment( IOByteCount _offset, IOPhysicalLength * _length
 void
 IOMemoryDescriptor::initialize( void )
 {
-       if (0 == gIOMemoryLock) {
+       if (NULL == gIOMemoryLock) {
                gIOMemoryLock = IORecursiveLockAlloc();
        }
 
@@ -4584,6 +4671,7 @@ IOMemoryDescriptor::free( void )
        }
 
        if (reserved) {
+               cleanKernelReserved(reserved);
                IODelete(reserved, IOMemoryDescriptorReserved, 1);
                reserved = NULL;
        }
@@ -4621,7 +4709,7 @@ IOMemoryDescriptor::map(
 {
        if ((!(kIOMapAnywhere & options)) && vm_map_is_64bit(get_task_map(intoTask))) {
                OSReportWithBacktrace("IOMemoryDescriptor::map() in 64b task, use ::createMappingInTask()");
-               return 0;
+               return NULL;
        }
 
        return createMappingInTask(intoTask, atAddress,
@@ -4650,13 +4738,13 @@ IOMemoryDescriptor::createMappingInTask(
            && !mapping->init( intoTask, atAddress,
            options, offset, length )) {
                mapping->release();
-               mapping = 0;
+               mapping = NULL;
        }
 
        if (mapping) {
                result = makeMapping(this, intoTask, (IOVirtualAddress) mapping, options | kIOMap64Bit, 0, 0);
        } else {
-               result = 0;
+               result = NULL;
        }
 
 #if DEBUG
@@ -4685,7 +4773,7 @@ IOMemoryMap::redirect(IOMemoryDescriptor * newBackingMemory,
     mach_vm_size_t       offset)
 {
        IOReturn err = kIOReturnSuccess;
-       IOMemoryDescriptor * physMem = 0;
+       IOMemoryDescriptor * physMem = NULL;
 
        LOCK;
 
@@ -4704,13 +4792,13 @@ IOMemoryMap::redirect(IOMemoryDescriptor * newBackingMemory,
                                if (KERN_SUCCESS != memory_object_iopl_request(fMemory->_memRef->entries[0].entry, 0, &size, &fRedirUPL,
                                    NULL, NULL,
                                    &flags, fMemory->getVMTag(kernel_map))) {
-                                       fRedirUPL = 0;
+                                       fRedirUPL = NULL;
                                }
 
                                if (physMem) {
                                        IOUnmapPages( fAddressMap, fAddress, fLength );
                                        if ((false)) {
-                                               physMem->redirect(0, true);
+                                               physMem->redirect(NULL, true);
                                        }
                                }
                        }
@@ -4727,10 +4815,10 @@ IOMemoryMap::redirect(IOMemoryDescriptor * newBackingMemory,
                                if (fRedirUPL) {
                                        upl_commit(fRedirUPL, NULL, 0);
                                        upl_deallocate(fRedirUPL);
-                                       fRedirUPL = 0;
+                                       fRedirUPL = NULL;
                                }
                                if ((false) && physMem) {
-                                       physMem->redirect(0, false);
+                                       physMem->redirect(NULL, false);
                                }
                        }
                }while (false);
@@ -4760,8 +4848,8 @@ IOMemoryDescriptor::makeMapping(
        }
 #endif /* !__LP64__ */
 
-       IOMemoryDescriptor *  mapDesc = 0;
-       __block IOMemoryMap * result  = 0;
+       IOMemoryDescriptor *  mapDesc = NULL;
+       __block IOMemoryMap * result  = NULL;
 
        IOMemoryMap *  mapping = (IOMemoryMap *) __address;
        mach_vm_size_t offset  = mapping->fOffset + __offset;
@@ -4828,7 +4916,7 @@ IOMemoryDescriptor::makeMapping(
                        mapDesc->retain();
                }
                IOReturn
-                   kr = mapDesc->doMap( 0, (IOVirtualAddress *) &mapping, options, 0, 0 );
+                   kr = mapDesc->doMap( NULL, (IOVirtualAddress *) &mapping, options, 0, 0 );
                if (kIOReturnSuccess == kr) {
                        result = mapping;
                        mapDesc->addMapping(result);
@@ -4853,7 +4941,7 @@ IOMemoryDescriptor::addMapping(
        IOMemoryMap * mapping )
 {
        if (mapping) {
-               if (0 == _mappings) {
+               if (NULL == _mappings) {
                        _mappings = OSSet::withCapacity(1);
                }
                if (_mappings) {
@@ -4924,7 +5012,7 @@ void *
 IOMemoryDescriptor::getVirtualSegment(IOByteCount offset,
     IOByteCount * lengthOfSegment)
 {
-       return 0;
+       return NULL;
 }
 #endif /* !__LP64__ */
 
@@ -4933,8 +5021,8 @@ IOMemoryDescriptor::getVirtualSegment(IOByteCount offset,
 bool
 IOGeneralMemoryDescriptor::serialize(OSSerialize * s) const
 {
-       OSSymbol const *keys[2] = {0};
-       OSObject *values[2] = {0};
+       OSSymbol const *keys[2] = {NULL};
+       OSObject *values[2] = {NULL};
        OSArray * array;
        vm_size_t vcopy_size;
 
@@ -4962,7 +5050,7 @@ IOGeneralMemoryDescriptor::serialize(OSSerialize * s) const
                goto bail;
        }
        vcopy = (SerData *) IOMalloc(vcopy_size);
-       if (vcopy == 0) {
+       if (vcopy == NULL) {
                result = false;
                goto bail;
        }
@@ -4993,17 +5081,17 @@ IOGeneralMemoryDescriptor::serialize(OSSerialize * s) const
                user_addr_t addr = vcopy[index].address;
                IOByteCount len = (IOByteCount) vcopy[index].length;
                values[0] = OSNumber::withNumber(addr, sizeof(addr) * 8);
-               if (values[0] == 0) {
+               if (values[0] == NULL) {
                        result = false;
                        goto bail;
                }
                values[1] = OSNumber::withNumber(len, sizeof(len) * 8);
-               if (values[1] == 0) {
+               if (values[1] == NULL) {
                        result = false;
                        goto bail;
                }
                OSDictionary *dict = OSDictionary::withObjects((const OSObject **)values, (const OSSymbol **)keys, 2);
-               if (dict == 0) {
+               if (dict == NULL) {
                        result = false;
                        goto bail;
                }
@@ -5011,7 +5099,7 @@ IOGeneralMemoryDescriptor::serialize(OSSerialize * s) const
                dict->release();
                values[0]->release();
                values[1]->release();
-               values[0] = values[1] = 0;
+               values[0] = values[1] = NULL;
        }
 
        result = array->serialize(s);
@@ -5072,5 +5160,5 @@ OSMetaClassDefineReservedUnused(IOMemoryDescriptor, 15);
 IOPhysicalAddress
 IOMemoryDescriptor::getPhysicalAddress()
 {
-       return getPhysicalSegment( 0, 0 );
+       return getPhysicalSegment( 0, NULL );
 }
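The hunks above (evidently IOMemoryDescriptor.cpp, judging by the class and method names) are dominated by a mechanical sweep that replaces the integer literal 0 with NULL wherever the value is a pointer: returns, comparisons, and initializers. A minimal sketch of the pattern, with hypothetical names, not xnu code:

    #include <cstddef>

    struct IOMemoryMapSketch { int refs; };

    // Returning NULL rather than 0 from a pointer-returning function is
    // semantically identical, but it states the pointer-ness of the
    // sentinel and keeps the code one step from a later nullptr switch.
    static IOMemoryMapSketch *
    copyCompatibleSketch(IOMemoryMapSketch *existing, bool compatible)
    {
        if (!compatible) {
            return NULL;            // was "return 0;" before this commit
        }
        existing->refs++;           // caller receives one extra reference
        return existing;
    }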
index 3418e41a82878b9278fa09ae9cd898d148e80c1a..d70531265e01d5bf42f2ca02ad0498f421b16d24 100644
@@ -54,7 +54,7 @@ IOMultiMemoryDescriptor * IOMultiMemoryDescriptor::withDescriptors(
                    /* withDirection */ withDirection,
                    /* asReference   */ asReference ) == false) {
                me->release();
-               me = 0;
+               me = NULL;
        }
 
        return me;
@@ -97,7 +97,7 @@ IOMultiMemoryDescriptor::initWithDescriptors(
 
        // Initialize our minimal state.
 
-       _descriptors            = 0;
+       _descriptors            = NULL;
        _descriptorsCount       = withCount;
        _descriptorsIsAllocated = asReference ? false : true;
        _flags                  = withDirection;
@@ -105,14 +105,14 @@ IOMultiMemoryDescriptor::initWithDescriptors(
        _direction              = (IODirection) (_flags & kIOMemoryDirectionMask);
 #endif /* !__LP64__ */
        _length                 = 0;
-       _mappings               = 0;
+       _mappings               = NULL;
        _tag                    = 0;
 
        if (asReference) {
                _descriptors = descriptors;
        } else {
                _descriptors = IONew(IOMemoryDescriptor *, withCount);
-               if (_descriptors == 0) {
+               if (_descriptors == NULL) {
                        return false;
                }
 
@@ -396,6 +396,28 @@ IOMultiMemoryDescriptor::setPurgeable( IOOptionBits newState,
        return err;
 }
 
+IOReturn
+IOMultiMemoryDescriptor::setOwnership( task_t newOwner,
+    int newLedgerTag,
+    IOOptionBits newLedgerOptions )
+{
+       IOReturn     err;
+
+       if (iokit_iomd_setownership_enabled == FALSE) {
+               return kIOReturnUnsupported;
+       }
+
+       err = kIOReturnSuccess;
+       for (unsigned index = 0; index < _descriptorsCount; index++) {
+               err = _descriptors[index]->setOwnership(newOwner, newLedgerTag, newLedgerOptions);
+               if (kIOReturnSuccess != err) {
+                       break;
+               }
+       }
+
+       return err;
+}
+
 IOReturn
 IOMultiMemoryDescriptor::getPageCounts(IOByteCount * pResidentPageCount,
     IOByteCount * pDirtyPageCount)
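The one substantive addition in this file (evidently IOMultiMemoryDescriptor.cpp) is setOwnership(), which checks a global enable switch and then fans the ownership change out to every sub-descriptor, stopping at the first failure. A stand-alone sketch of that shape, with stand-in types, not xnu code:

    typedef int IOReturn;
    enum { kIOReturnSuccess = 0, kIOReturnUnsupported = 1 };

    // Stands in for the iokit_iomd_setownership_enabled switch.
    static bool gSetOwnershipEnabled = true;

    static IOReturn
    setOwnershipSketch(IOReturn (*setOne)(unsigned), unsigned count)
    {
        if (!gSetOwnershipEnabled) {
            return kIOReturnUnsupported;   // feature globally disabled
        }
        IOReturn err = kIOReturnSuccess;
        for (unsigned index = 0; index < count; index++) {
            err = setOne(index);           // forward to each child descriptor
            if (kIOReturnSuccess != err) {
                break;                     // surface the first failure
            }
        }
        return err;
    }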
index 88d58595fcfb6ae30ab58234fa3779864c199309..4b0c315a99a9524e832daae44e99fb08507973d6 100644
 #include <IOKit/IOKitKeys.h>
 #include <IOKit/IOKitKeysPrivate.h>
 #include <kern/debug.h>
+#include <pexpert/boot.h>
 #include <pexpert/pexpert.h>
 
+
 #define super IOService
 
 #define kIONVRAMPrivilege       kIOClientPrivilegeAdministrator
@@ -53,28 +55,29 @@ IODTNVRAM::init(IORegistryEntry *old, const IORegistryPlane *plane)
        }
 
        dict =  OSDictionary::withCapacity(1);
-       if (dict == 0) {
+       if (dict == NULL) {
                return false;
        }
        setPropertyTable(dict);
+       dict->release();
 
        _nvramImage = IONew(UInt8, kIODTNVRAMImageSize);
-       if (_nvramImage == 0) {
+       if (_nvramImage == NULL) {
                return false;
        }
 
        _nvramPartitionOffsets = OSDictionary::withCapacity(1);
-       if (_nvramPartitionOffsets == 0) {
+       if (_nvramPartitionOffsets == NULL) {
                return false;
        }
 
        _nvramPartitionLengths = OSDictionary::withCapacity(1);
-       if (_nvramPartitionLengths == 0) {
+       if (_nvramPartitionLengths == NULL) {
                return false;
        }
 
        _registryPropertiesKey = OSSymbol::withCStringNoCopy("aapl,pci");
-       if (_registryPropertiesKey == 0) {
+       if (_registryPropertiesKey == NULL) {
                return false;
        }
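Beyond the NULL sweep, the init() hunk above fixes a reference leak: OSDictionary::withCapacity() hands back an object holding one reference, and setPropertyTable() takes a reference of its own, so the creator must drop its original one; the added dict->release() does exactly that. A toy model of the ownership flow (assuming standard libkern retain/release semantics), not xnu code:

    struct RefdSketch {
        int refs = 1;                      // born with the creator's reference
        void retain()  { ++refs; }
        void release() { if (--refs == 0) delete this; }
    };

    struct ServiceSketch {
        RefdSketch *table = nullptr;
        void setPropertyTable(RefdSketch *t) {
            t->retain();                   // the service holds its own reference
            table = t;
        }
    };

    static void
    initSketch(ServiceSketch &s)
    {
        RefdSketch *dict = new RefdSketch; // one reference: ours
        s.setPropertyTable(dict);          // now two
        dict->release();                   // drop ours -- the leak fix above
    }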
 
@@ -95,13 +98,13 @@ IODTNVRAM::initProxyData(void)
        const void *bytes;
 
        entry = IORegistryEntry::fromPath("/chosen", gIODTPlane);
-       if (entry != 0) {
+       if (entry != NULL) {
                prop = entry->getProperty(key);
-               if (prop != 0) {
+               if (prop != NULL) {
                        data = OSDynamicCast(OSData, prop);
-                       if (data != 0) {
+                       if (data != NULL) {
                                bytes = data->getBytesNoCopy();
-                               if ((bytes != 0) && (data->getLength() <= kIODTNVRAMImageSize)) {
+                               if ((bytes != NULL) && (data->getLength() <= kIODTNVRAMImageSize)) {
                                        bcopy(bytes, _nvramImage, data->getLength());
                                        initNVRAMImage();
                                        _isProxied = true;
@@ -116,7 +119,7 @@ IODTNVRAM::initProxyData(void)
 void
 IODTNVRAM::registerNVRAMController(IONVRAMController *nvram)
 {
-       if (_nvramController != 0) {
+       if (_nvramController != NULL) {
                return;
        }
 
@@ -127,7 +130,7 @@ IODTNVRAM::registerNVRAMController(IONVRAMController *nvram)
        if (!_isProxied) {
                _nvramController->read(0, _nvramImage, kIODTNVRAMImageSize);
                initNVRAMImage();
-       } else {
+       } else if (_ofLock) {
                IOLockLock(_ofLock);
                (void) syncVariables();
                IOLockUnlock(_ofLock);
@@ -249,7 +252,7 @@ IODTNVRAM::initNVRAMImage(void)
                        _nvramImage[freePartitionOffset + 1] =
                            calculatePartitionChecksum(_nvramImage + freePartitionOffset);
 
-                       if (_nvramController != 0) {
+                       if (_nvramController != NULL) {
                                _nvramController->write(0, _nvramImage, kIODTNVRAMImageSize);
                        }
                }
@@ -267,7 +270,7 @@ void
 IODTNVRAM::syncInternal(bool rateLimit)
 {
        // Don't try to perform controller operations if none has been registered.
-       if (_nvramController == 0) {
+       if (_nvramController == NULL) {
                return;
        }
 
@@ -293,34 +296,34 @@ IODTNVRAM::serializeProperties(OSSerialize *s) const
        UInt32               variablePerm;
        const OSSymbol       *key;
        OSDictionary         *dict;
-       OSCollectionIterator *iter = 0;
+       OSCollectionIterator *iter = NULL;
 
        // Verify permissions.
        hasPrivilege = (kIOReturnSuccess == IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege));
 
-       if (_ofDict == 0) {
+       if (_ofDict == NULL) {
                /* No nvram. Return an empty dictionary. */
                dict = OSDictionary::withCapacity(1);
-               if (dict == 0) {
+               if (dict == NULL) {
                        return false;
                }
        } else {
                IOLockLock(_ofLock);
                dict = OSDictionary::withDictionary(_ofDict);
                IOLockUnlock(_ofLock);
-               if (dict == 0) {
+               if (dict == NULL) {
                        return false;
                }
 
                /* Copy properties with client privilege. */
                iter = OSCollectionIterator::withCollection(dict);
-               if (iter == 0) {
+               if (iter == NULL) {
                        dict->release();
                        return false;
                }
                while (1) {
                        key = OSDynamicCast(OSSymbol, iter->getNextObject());
-                       if (key == 0) {
+                       if (key == NULL) {
                                break;
                        }
 
@@ -337,7 +340,7 @@ IODTNVRAM::serializeProperties(OSSerialize *s) const
        result = dict->serialize(s);
 
        dict->release();
-       if (iter != 0) {
+       if (iter != NULL) {
                iter->release();
        }
 
@@ -351,8 +354,8 @@ IODTNVRAM::copyProperty(const OSSymbol *aKey) const
        UInt32   variablePerm;
        OSObject *theObject;
 
-       if (_ofDict == 0) {
-               return 0;
+       if (_ofDict == NULL) {
+               return NULL;
        }
 
        // Verify permissions.
@@ -360,11 +363,11 @@ IODTNVRAM::copyProperty(const OSSymbol *aKey) const
        result = IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege);
        if (result != kIOReturnSuccess) {
                if (variablePerm == kOFVariablePermRootOnly) {
-                       return 0;
+                       return NULL;
                }
        }
        if (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) {
-               return 0;
+               return NULL;
        }
 
        IOLockLock(_ofLock);
@@ -381,10 +384,10 @@ OSObject *
 IODTNVRAM::copyProperty(const char *aKey) const
 {
        const OSSymbol *keySymbol;
-       OSObject *theObject = 0;
+       OSObject *theObject = NULL;
 
        keySymbol = OSSymbol::withCString(aKey);
-       if (keySymbol != 0) {
+       if (keySymbol != NULL) {
                theObject = copyProperty(keySymbol);
                keySymbol->release();
        }
@@ -418,15 +421,15 @@ IODTNVRAM::getProperty(const char *aKey) const
        return theObject;
 }
 
-bool
-IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
+IOReturn
+IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject)
 {
-       bool     result;
+       IOReturn     result = kIOReturnSuccess;
        UInt32   propType, propPerm;
-       OSString *tmpString = 0;
-       OSObject *propObject = 0, *oldObject;
+       OSString *tmpString = NULL;
+       OSObject *propObject = NULL, *oldObject;
 
-       if (_ofDict == 0) {
+       if (_ofDict == NULL) {
                return false;
        }
 
@@ -434,16 +437,16 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
        propPerm = getOFVariablePerm(aKey);
        if (IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege) != kIOReturnSuccess) {
                if (propPerm != kOFVariablePermUserWrite) {
-                       return false;
+                       return kIOReturnNotPrivileged;
                }
        }
        if (propPerm == kOFVariablePermKernelOnly && current_task() != kernel_task) {
-               return 0;
+               return kIOReturnNotPrivileged;
        }
 
        // Don't allow change of 'aapl,panic-info'.
        if (aKey->isEqualTo(kIODTNVRAMPanicInfoKey)) {
-               return false;
+               return kIOReturnUnsupported;
        }
 
        // Make sure the object is of the correct type.
@@ -459,13 +462,16 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
 
        case kOFVariableTypeString:
                propObject = OSDynamicCast(OSString, anObject);
+               if (propObject != NULL && aKey->isEqualTo(kIONVRAMBootArgsKey) && ((OSString*)propObject)->getLength() >= BOOT_LINE_LENGTH) {
+                       return kIOReturnNoSpace;
+               }
                break;
 
        case kOFVariableTypeData:
                propObject = OSDynamicCast(OSData, anObject);
-               if (propObject == 0) {
+               if (propObject == NULL) {
                        tmpString = OSDynamicCast(OSString, anObject);
-                       if (tmpString != 0) {
+                       if (tmpString != NULL) {
                                propObject = OSData::withBytes(tmpString->getCStringNoCopy(),
                                    tmpString->getLength());
                        }
@@ -473,8 +479,8 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
                break;
        }
 
-       if (propObject == 0) {
-               return false;
+       if (propObject == NULL) {
+               return kIOReturnBadArgument;
        }
 
        IOLockLock(_ofLock);
@@ -483,9 +489,11 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
        if (oldObject) {
                oldObject->retain();
        }
-       result = _ofDict->setObject(aKey, propObject);
+       if (!_ofDict->setObject(aKey, propObject)) {
+               result = kIOReturnBadArgument;
+       }
 
-       if (result) {
+       if (result == kIOReturnSuccess) {
                if (syncVariables() != kIOReturnSuccess) {
                        if (oldObject) {
                                _ofDict->setObject(aKey, oldObject);
@@ -493,7 +501,7 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
                                _ofDict->removeObject(aKey);
                        }
                        (void) syncVariables();
-                       result = false;
+                       result = kIOReturnNoMemory;
                }
        }
 
@@ -509,13 +517,19 @@ IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
        return result;
 }
 
+bool
+IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
+{
+       return setPropertyInternal(aKey, anObject) == kIOReturnSuccess;
+}
+
 void
 IODTNVRAM::removeProperty(const OSSymbol *aKey)
 {
        bool     result;
        UInt32   propPerm;
 
-       if (_ofDict == 0) {
+       if (_ofDict == NULL) {
                return;
        }
 
@@ -539,7 +553,7 @@ IODTNVRAM::removeProperty(const OSSymbol *aKey)
        // If the object exists, remove it from the dictionary.
 
        IOLockLock(_ofLock);
-       result = _ofDict->getObject(aKey) != 0;
+       result = _ofDict->getObject(aKey) != NULL;
        if (result) {
                _ofDict->removeObject(aKey);
        }
@@ -554,7 +568,7 @@ IODTNVRAM::removeProperty(const OSSymbol *aKey)
 IOReturn
 IODTNVRAM::setProperties(OSObject *properties)
 {
-       bool                 result = true;
+       IOReturn             res = kIOReturnSuccess;
        OSObject             *object;
        const OSSymbol       *key;
        const OSString       *tmpStr;
@@ -562,59 +576,53 @@ IODTNVRAM::setProperties(OSObject *properties)
        OSCollectionIterator *iter;
 
        dict = OSDynamicCast(OSDictionary, properties);
-       if (dict == 0) {
+       if (dict == NULL) {
                return kIOReturnBadArgument;
        }
 
        iter = OSCollectionIterator::withCollection(dict);
-       if (iter == 0) {
+       if (iter == NULL) {
                return kIOReturnBadArgument;
        }
 
-       while (result) {
+       while (res == kIOReturnSuccess) {
                key = OSDynamicCast(OSSymbol, iter->getNextObject());
-               if (key == 0) {
+               if (key == NULL) {
                        break;
                }
 
                object = dict->getObject(key);
-               if (object == 0) {
+               if (object == NULL) {
                        continue;
                }
 
                if (key->isEqualTo(kIONVRAMDeletePropertyKey)) {
                        tmpStr = OSDynamicCast(OSString, object);
-                       if (tmpStr != 0) {
+                       if (tmpStr != NULL) {
                                key = OSSymbol::withString(tmpStr);
                                removeProperty(key);
                                key->release();
-                               result = true;
                        } else {
-                               result = false;
+                               res = kIOReturnError;
                        }
                } else if (key->isEqualTo(kIONVRAMSyncNowPropertyKey) || key->isEqualTo(kIONVRAMForceSyncNowPropertyKey)) {
                        tmpStr = OSDynamicCast(OSString, object);
-                       if (tmpStr != 0) {
-                               result = true;
-
+                       if (tmpStr != NULL) {
                                // We still want to throttle NVRAM commit rate for SyncNow. ForceSyncNow is provided as a really big hammer.
-
                                syncInternal(key->isEqualTo(kIONVRAMSyncNowPropertyKey));
                        } else {
-                               result = false;
+                               res = kIOReturnError;
                        }
                } else {
-                       result = setProperty(key, object);
+                       if (!setProperty(key, object)) {
+                               res = kIOReturnNoSpace;
+                       }
                }
        }
 
        iter->release();
 
-       if (result) {
-               return kIOReturnSuccess;
-       } else {
-               return kIOReturnError;
-       }
+       return res;
 }
 
 IOReturn
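The set-property rework above threads IOReturn through end to end: setProperty() becomes a thin shim over the new setPropertyInternal(), and setProperties() now reports the specific failure (kIOReturnNotPrivileged, kIOReturnNoSpace, and so on) where it previously collapsed everything to kIOReturnError. A compressed sketch of the wrapper shape, with placeholder checks, not xnu code:

    typedef int IOReturn;
    enum { kIOReturnSuccess = 0, kIOReturnNotPrivileged, kIOReturnNoSpace };

    // Rich internal entry point: callers that care get a precise code.
    static IOReturn
    setPropertyInternalSketch(bool privileged, bool fits)
    {
        if (!privileged) {
            return kIOReturnNotPrivileged;
        }
        if (!fits) {
            return kIOReturnNoSpace;       // e.g. boot-args >= BOOT_LINE_LENGTH
        }
        return kIOReturnSuccess;
    }

    // The legacy bool API survives as a one-line shim, as in the commit.
    static bool
    setPropertySketch(bool privileged, bool fits)
    {
        return setPropertyInternalSketch(privileged, fits) == kIOReturnSuccess;
    }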
@@ -674,7 +682,7 @@ IODTNVRAM::readNVRAMPartition(const OSSymbol *partitionID,
        partitionLengthNumber =
            (OSNumber *)_nvramPartitionLengths->getObject(partitionID);
 
-       if ((partitionOffsetNumber == 0) || (partitionLengthNumber == 0)) {
+       if ((partitionOffsetNumber == NULL) || (partitionLengthNumber == NULL)) {
                return kIOReturnNotFound;
        }
 
@@ -684,7 +692,7 @@ IODTNVRAM::readNVRAMPartition(const OSSymbol *partitionID,
        if (os_add_overflow(offset, length, &end)) {
                return kIOReturnBadArgument;
        }
-       if ((buffer == 0) || (length == 0) || (end > partitionLength)) {
+       if ((buffer == NULL) || (length == 0) || (end > partitionLength)) {
                return kIOReturnBadArgument;
        }
 
@@ -706,7 +714,7 @@ IODTNVRAM::writeNVRAMPartition(const OSSymbol *partitionID,
        partitionLengthNumber =
            (OSNumber *)_nvramPartitionLengths->getObject(partitionID);
 
-       if ((partitionOffsetNumber == 0) || (partitionLengthNumber == 0)) {
+       if ((partitionOffsetNumber == NULL) || (partitionLengthNumber == NULL)) {
                return kIOReturnNotFound;
        }
 
@@ -716,13 +724,13 @@ IODTNVRAM::writeNVRAMPartition(const OSSymbol *partitionID,
        if (os_add_overflow(offset, length, &end)) {
                return kIOReturnBadArgument;
        }
-       if ((buffer == 0) || (length == 0) || (end > partitionLength)) {
+       if ((buffer == NULL) || (length == 0) || (end > partitionLength)) {
                return kIOReturnBadArgument;
        }
 
        bcopy(buffer, _nvramImage + partitionOffset + offset, length);
 
-       if (_nvramController != 0) {
+       if (_nvramController != NULL) {
                _nvramController->write(0, _nvramImage, kIODTNVRAMImageSize);
        }
 
@@ -732,7 +740,7 @@ IODTNVRAM::writeNVRAMPartition(const OSSymbol *partitionID,
 IOByteCount
 IODTNVRAM::savePanicInfo(UInt8 *buffer, IOByteCount length)
 {
-       if ((_piImage == 0) || (length <= 0)) {
+       if ((_piImage == NULL) || (length <= 0)) {
                return 0;
        }
 
@@ -746,7 +754,7 @@ IODTNVRAM::savePanicInfo(UInt8 *buffer, IOByteCount length)
        // Save the Panic Info length.
        *(UInt32 *)_piImage = length;
 
-       if (_nvramController != 0) {
+       if (_nvramController != NULL) {
                _nvramController->write(0, _nvramImage, kIODTNVRAMImageSize);
        }
        /*
@@ -788,7 +796,7 @@ IODTNVRAM::initOFVariables(void)
        const OSSymbol    *propSymbol;
        OSObject          *propObject;
 
-       if (_ofImage == 0) {
+       if (_ofImage == NULL) {
                return kIOReturnNotReady;
        }
 
@@ -844,15 +852,15 @@ IODTNVRAM::initOFVariables(void)
        }
 
        // Create the boot-args property if it is not in the dictionary.
-       if (_ofDict->getObject("boot-args") == 0) {
+       if (_ofDict->getObject(kIONVRAMBootArgsKey) == NULL) {
                propObject = OSString::withCStringNoCopy("");
-               if (propObject != 0) {
-                       _ofDict->setObject("boot-args", propObject);
+               if (propObject != NULL) {
+                       _ofDict->setObject(kIONVRAMBootArgsKey, propObject);
                        propObject->release();
                }
        }
 
-       if (_piImage != 0) {
+       if (_piImage != NULL) {
                propDataLength = *(UInt32 *)_piImage;
                if ((propDataLength != 0) && (propDataLength <= (_piPartitionSize - 4))) {
                        propObject = OSData::withBytes(_piImage + 4, propDataLength);
@@ -861,7 +869,7 @@ IODTNVRAM::initOFVariables(void)
 
                        // Clear the length from _piImage and mark dirty.
                        *(UInt32 *)_piImage = 0;
-                       if (_nvramController != 0) {
+                       if (_nvramController != NULL) {
                                _nvramController->write(0, _nvramImage, kIODTNVRAMImageSize);
                        }
                }
@@ -888,12 +896,12 @@ IODTNVRAM::syncVariables(void)
 
        IOLockAssert(_ofLock, kIOLockAssertOwned);
 
-       if ((_ofImage == 0) || (_ofDict == 0) || _systemPaniced) {
+       if ((_ofImage == NULL) || (_ofDict == NULL) || _systemPaniced) {
                return kIOReturnNotReady;
        }
 
        buffer = tmpBuffer = IONew(UInt8, _ofPartitionSize);
-       if (buffer == 0) {
+       if (buffer == NULL) {
                return kIOReturnNoMemory;
        }
        bzero(buffer, _ofPartitionSize);
@@ -902,13 +910,13 @@ IODTNVRAM::syncVariables(void)
        maxLength = _ofPartitionSize;
 
        iter = OSCollectionIterator::withCollection(_ofDict);
-       if (iter == 0) {
+       if (iter == NULL) {
                ok = false;
        }
 
        while (ok) {
                tmpSymbol = OSDynamicCast(OSSymbol, iter->getNextObject());
-               if (tmpSymbol == 0) {
+               if (tmpSymbol == NULL) {
                        break;
                }
 
@@ -938,7 +946,7 @@ IODTNVRAM::syncVariables(void)
                return kIOReturnBadArgument;
        }
 
-       if (_nvramController != 0) {
+       if (_nvramController != NULL) {
                return _nvramController->write(0, _nvramImage, kIODTNVRAMImageSize);
        }
 
@@ -1018,7 +1026,7 @@ OFVariable gOFVariables[] = {
        {"enter-tdm-mode", kOFVariableTypeBoolean, kOFVariablePermUserWrite, -1},
        {"nonce-seeds", kOFVariableTypeData, kOFVariablePermKernelOnly, -1},
 #endif
-       {0, kOFVariableTypeData, kOFVariablePermUserRead, -1}
+       {NULL, kOFVariableTypeData, kOFVariablePermUserRead, -1}
 };
 
 UInt32
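gOFVariables is a sentinel-terminated table: the lookup loops in the next two hunks walk it until either the name matches or the terminator entry (now spelled NULL rather than 0) is reached, at which point the terminator's own fields supply the defaults. A minimal sketch of the idiom, not xnu code:

    #include <cstddef>
    #include <cstring>

    struct OFVariableSketch { const char *name; int type; };

    static const OFVariableSketch gVarsSketch[] = {
        { "boot-args", 1 },
        { "boot-command", 1 },
        { NULL, 0 },                       // terminator doubles as the default
    };

    static int
    typeForNameSketch(const char *name)
    {
        const OFVariableSketch *v = gVarsSketch;
        while ((v->name != NULL) && (strcmp(v->name, name) != 0)) {
            v++;
        }
        return v->type;                    // matched entry, or the default
    }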
@@ -1028,7 +1036,7 @@ IODTNVRAM::getOFVariableType(const OSSymbol *propSymbol) const
 
        ofVar = gOFVariables;
        while (1) {
-               if ((ofVar->variableName == 0) ||
+               if ((ofVar->variableName == NULL) ||
                    propSymbol->isEqualTo(ofVar->variableName)) {
                        break;
                }
@@ -1045,7 +1053,7 @@ IODTNVRAM::getOFVariablePerm(const OSSymbol *propSymbol) const
 
        ofVar = gOFVariables;
        while (1) {
-               if ((ofVar->variableName == 0) ||
+               if ((ofVar->variableName == NULL) ||
                    propSymbol->isEqualTo(ofVar->variableName)) {
                        break;
                }
@@ -1059,39 +1067,8 @@ bool
 IODTNVRAM::getOWVariableInfo(UInt32 variableNumber, const OSSymbol **propSymbol,
     UInt32 *propType, UInt32 *propOffset)
 {
-       const OFVariable *ofVar;
-
-       ofVar = gOFVariables;
-       while (1) {
-               if (ofVar->variableName == 0) {
-                       return false;
-               }
-
-               if (ofVar->variableOffset == (SInt32) variableNumber) {
-                       break;
-               }
-
-               ofVar++;
-       }
-
-       *propSymbol = OSSymbol::withCStringNoCopy(ofVar->variableName);
-       *propType = ofVar->variableType;
-
-       switch (*propType) {
-       case kOFVariableTypeBoolean:
-               *propOffset = 1 << (31 - variableNumber);
-               break;
-
-       case kOFVariableTypeNumber:
-               *propOffset = variableNumber - kOWVariableOffsetNumber;
-               break;
-
-       case kOFVariableTypeString:
-               *propOffset = variableNumber - kOWVariableOffsetString;
-               break;
-       }
-
-       return true;
+       /* UNSUPPORTED */
+       return false;
 }
 
 bool
@@ -1110,14 +1087,14 @@ IODTNVRAM::convertPropToObject(UInt8 *propName, UInt32 propNameLength,
        propName[propNameLength] = '\0';
        tmpSymbol = OSSymbol::withCString((const char *)propName);
        propName[propNameLength] = '=';
-       if (tmpSymbol == 0) {
+       if (tmpSymbol == NULL) {
                return false;
        }
 
        propType = getOFVariableType(tmpSymbol);
 
        // Create the object.
-       tmpObject = 0;
+       tmpObject = NULL;
        switch (propType) {
        case kOFVariableTypeBoolean:
                if (!strncmp("true", (const char *)propData, propDataLength)) {
@@ -1128,15 +1105,15 @@ IODTNVRAM::convertPropToObject(UInt8 *propName, UInt32 propNameLength,
                break;
 
        case kOFVariableTypeNumber:
-               tmpNumber = OSNumber::withNumber(strtol((const char *)propData, 0, 0), 32);
-               if (tmpNumber != 0) {
+               tmpNumber = OSNumber::withNumber(strtol((const char *)propData, NULL, 0), 32);
+               if (tmpNumber != NULL) {
                        tmpObject = tmpNumber;
                }
                break;
 
        case kOFVariableTypeString:
                tmpString = OSString::withCString((const char *)propData);
-               if (tmpString != 0) {
+               if (tmpString != NULL) {
                        tmpObject = tmpString;
                }
                break;
@@ -1146,7 +1123,7 @@ IODTNVRAM::convertPropToObject(UInt8 *propName, UInt32 propNameLength,
                break;
        }
 
-       if (tmpObject == 0) {
+       if (tmpObject == NULL) {
                tmpSymbol->release();
                return false;
        }
@@ -1164,10 +1141,10 @@ IODTNVRAM::convertObjectToProp(UInt8 *buffer, UInt32 *length,
        const UInt8    *propName;
        UInt32         propNameLength, propDataLength, remaining;
        UInt32         propType, tmpValue;
-       OSBoolean      *tmpBoolean = 0;
-       OSNumber       *tmpNumber = 0;
-       OSString       *tmpString = 0;
-       OSData         *tmpData = 0;
+       OSBoolean      *tmpBoolean = NULL;
+       OSNumber       *tmpNumber = NULL;
+       OSString       *tmpString = NULL;
+       OSData         *tmpData = NULL;
 
        propName = (const UInt8 *)propSymbol->getCStringNoCopy();
        propNameLength = propSymbol->getLength();
@@ -1178,28 +1155,28 @@ IODTNVRAM::convertObjectToProp(UInt8 *buffer, UInt32 *length,
        switch (propType) {
        case kOFVariableTypeBoolean:
                tmpBoolean = OSDynamicCast(OSBoolean, propObject);
-               if (tmpBoolean != 0) {
+               if (tmpBoolean != NULL) {
                        propDataLength = 5;
                }
                break;
 
        case kOFVariableTypeNumber:
                tmpNumber = OSDynamicCast(OSNumber, propObject);
-               if (tmpNumber != 0) {
+               if (tmpNumber != NULL) {
                        propDataLength = 10;
                }
                break;
 
        case kOFVariableTypeString:
                tmpString = OSDynamicCast(OSString, propObject);
-               if (tmpString != 0) {
+               if (tmpString != NULL) {
                        propDataLength = tmpString->getLength();
                }
                break;
 
        case kOFVariableTypeData:
                tmpData = OSDynamicCast(OSData, propObject);
-               if (tmpData != 0) {
+               if (tmpData != NULL) {
                        tmpData = escapeDataToData(tmpData);
                        propDataLength = tmpData->getLength();
                }
@@ -1291,85 +1268,7 @@ IODTNVRAM::validateOWChecksum(UInt8 *buffer)
 void
 IODTNVRAM::updateOWBootArgs(const OSSymbol *key, OSObject *value)
 {
-       bool        wasBootArgs, bootr = false;
-       UInt32      cnt;
-       OSString    *tmpString, *bootCommand, *bootArgs = 0;
-       const UInt8 *bootCommandData, *bootArgsData;
-       UInt8       *tmpData;
-       UInt32      bootCommandDataLength, bootArgsDataLength, tmpDataLength;
-
-       tmpString = OSDynamicCast(OSString, value);
-       if (tmpString == 0) {
-               return;
-       }
-
-       if (key->isEqualTo("boot-command")) {
-               wasBootArgs = false;
-               bootCommand = tmpString;
-       } else if (key->isEqualTo("boot-args")) {
-               wasBootArgs = true;
-               bootArgs = tmpString;
-               bootCommand = OSDynamicCast(OSString, _ofDict->getObject("boot-command"));
-               if (bootCommand == 0) {
-                       return;
-               }
-       } else {
-               return;
-       }
-
-       bootCommandData = (const UInt8 *)bootCommand->getCStringNoCopy();
-       bootCommandDataLength = bootCommand->getLength();
-
-       if (bootCommandData == 0) {
-               return;
-       }
-
-       for (cnt = 0; cnt < bootCommandDataLength; cnt++) {
-               if ((bootCommandData[cnt] == 'b') &&
-                   !strncmp("bootr", (const char *)bootCommandData + cnt, 5)) {
-                       cnt += 5;
-                       while (bootCommandData[cnt] == ' ') {
-                               cnt++;
-                       }
-                       bootr = true;
-                       break;
-               }
-       }
-       if (!bootr) {
-               _ofDict->removeObject("boot-args");
-               return;
-       }
-
-       if (wasBootArgs) {
-               bootArgsData = (const UInt8 *)bootArgs->getCStringNoCopy();
-               bootArgsDataLength = bootArgs->getLength();
-               if (bootArgsData == 0) {
-                       return;
-               }
-
-               tmpDataLength = cnt + bootArgsDataLength;
-               tmpData = IONew(UInt8, tmpDataLength + 1);
-               if (tmpData == 0) {
-                       return;
-               }
-
-               cnt -= strlcpy((char *)tmpData, (const char *)bootCommandData, cnt);
-               strlcat((char *)tmpData, (const char *)bootArgsData, cnt);
-
-               bootCommand = OSString::withCString((const char *)tmpData);
-               if (bootCommand != 0) {
-                       _ofDict->setObject("boot-command", bootCommand);
-                       bootCommand->release();
-               }
-
-               IODelete(tmpData, UInt8, tmpDataLength + 1);
-       } else {
-               bootArgs = OSString::withCString((const char *)(bootCommandData + cnt));
-               if (bootArgs != 0) {
-                       _ofDict->setObject("boot-args", bootArgs);
-                       bootArgs->release();
-               }
-       }
+       /* UNSUPPORTED */
 }
 
 bool
@@ -1399,7 +1298,7 @@ IODTNVRAM::writeNVRAMPropertyType0(IORegistryEntry *entry,
 OSData *
 IODTNVRAM::unescapeBytesToData(const UInt8 *bytes, UInt32 length)
 {
-       OSData *data = 0;
+       OSData *data = NULL;
        UInt32 totalLength = 0;
        UInt32 cnt, cnt2;
        UInt8  byte;
@@ -1426,7 +1325,7 @@ IODTNVRAM::unescapeBytesToData(const UInt8 *bytes, UInt32 length)
        if (ok) {
                // Create an empty OSData of the correct size.
                data = OSData::withCapacity(totalLength);
-               if (data != 0) {
+               if (data != NULL) {
                        for (cnt = 0; cnt < length;) {
                                byte = bytes[cnt++];
                                if (byte == 0xFF) {
@@ -1479,7 +1378,7 @@ IODTNVRAM::escapeDataToData(OSData * value)
 
        if (!ok) {
                result->release();
-               result = 0;
+               result = NULL;
        }
 
        return result;
@@ -1508,14 +1407,14 @@ IODTNVRAM::readNVRAMPropertyType1(IORegistryEntry *entry,
        const UInt8 *startPtr;
        const UInt8 *endPtr;
        const UInt8 *wherePtr;
-       const UInt8 *nvPath = 0;
-       const char  *nvName = 0;
-       const char  *resultName = 0;
-       const UInt8 *resultValue = 0;
+       const UInt8 *nvPath = NULL;
+       const char  *nvName = NULL;
+       const char  *resultName = NULL;
+       const UInt8 *resultValue = NULL;
        UInt32       resultValueLen = 0;
        UInt8       byte;
 
-       if (_ofDict == 0) {
+       if (_ofDict == NULL) {
                return err;
        }
 
@@ -1523,7 +1422,7 @@ IODTNVRAM::readNVRAMPropertyType1(IORegistryEntry *entry,
        data = OSDynamicCast(OSData, _ofDict->getObject(_registryPropertiesKey));
        IOLockUnlock(_ofLock);
 
-       if (data == 0) {
+       if (data == NULL) {
                return err;
        }
 
@@ -1537,9 +1436,9 @@ IODTNVRAM::readNVRAMPropertyType1(IORegistryEntry *entry,
                        continue;
                }
 
-               if (nvPath == 0) {
+               if (nvPath == NULL) {
                        nvPath = startPtr;
-               } else if (nvName == 0) {
+               } else if (nvName == NULL) {
                        nvName = (const char *) startPtr;
                } else {
                        IORegistryEntry * compareEntry = IORegistryEntry::fromPath((const char *) nvPath, gIODTPlane);
@@ -1557,15 +1456,15 @@ IODTNVRAM::readNVRAMPropertyType1(IORegistryEntry *entry,
                                        break;
                                }
                        }
-                       nvPath = 0;
-                       nvName = 0;
+                       nvPath = NULL;
+                       nvName = NULL;
                }
                startPtr = wherePtr;
        }
        if (resultName) {
                *name = OSSymbol::withCString(resultName);
                *value = unescapeBytesToData(resultValue, resultValueLen);
-               if ((*name != 0) && (*value != 0)) {
+               if ((*name != NULL) && (*value != NULL)) {
                        err = kIOReturnSuccess;
                } else {
                        err = kIOReturnNoMemory;
@@ -1580,20 +1479,20 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry,
     OSData *value)
 {
        OSData       *oldData, *escapedData;
-       OSData       *data = 0;
+       OSData       *data = NULL;
        const UInt8  *startPtr;
        const UInt8  *propStart;
        const UInt8  *endPtr;
        const UInt8  *wherePtr;
-       const UInt8  *nvPath = 0;
-       const char   *nvName = 0;
+       const UInt8  *nvPath = NULL;
+       const char   *nvName = NULL;
        const char * comp;
        const char * name;
        UInt8        byte;
        bool         ok = true;
        bool         settingAppleProp;
 
-       if (_ofDict == 0) {
+       if (_ofDict == NULL) {
                return kIOReturnNoResources;
        }
 
@@ -1615,9 +1514,9 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry,
                        if (byte) {
                                continue;
                        }
-                       if (nvPath == 0) {
+                       if (nvPath == NULL) {
                                nvPath = startPtr;
-                       } else if (nvName == 0) {
+                       } else if (nvName == NULL) {
                                nvName = (const char *) startPtr;
                        } else {
                                IORegistryEntry * compareEntry = IORegistryEntry::fromPath((const char *) nvPath, gIODTPlane);
@@ -1635,8 +1534,8 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry,
                                                break;
                                        }
                                }
-                               nvPath = 0;
-                               nvName = 0;
+                               nvPath = NULL;
+                               nvName = NULL;
                        }
 
                        startPtr = wherePtr;
@@ -1693,7 +1592,7 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry,
 
                        // append escaped data
                        escapedData = escapeDataToData(value);
-                       ok &= (escapedData != 0);
+                       ok &= (escapedData != NULL);
                        if (ok) {
                                ok &= data->appendBytes(escapedData);
                        }
index e89680c08377e0c5e5fb9658b458a19022bc493c..f211a94de6bb1dfbcbee15decd772c607be0ed45 100644
@@ -162,6 +162,8 @@ IOPMPowerSource::free(void)
        if (batteryInfoKey) {
                batteryInfoKey->release();
        }
+
+       super::free();
 }
 
 // *****************************************************************************
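The single-line fix above (evidently IOPMPowerSource.cpp) adds the mandatory tail call: a libkern ::free() override must end with super::free(), which is what ultimately deallocates the instance itself; releasing only the members, as the old code did, leaked the object. A toy model, not xnu code:

    struct BaseSketch {
        virtual void free() { delete this; }   // superclass reclaims the object
        virtual ~BaseSketch() { }
    };

    struct PowerSourceSketch : BaseSketch {
        char *settings = nullptr;
        void free() override {
            delete [] settings;                // release members first...
            BaseSketch::free();                // ...then the added super call
        }
    };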
index db2e5741231c9c827604d890c6562a5a53f7f824..1e737ed17dc462899559b1694da96b4bf4995d1e 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2001-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -37,7 +37,7 @@ typedef void (*IOPMPowerStateQueueAction)(OSObject *, uint32_t event, void *, ui
 
 class IOPMPowerStateQueue : public IOEventSource
 {
-       OSDeclareDefaultStructors(IOPMPowerStateQueue)
+       OSDeclareDefaultStructors(IOPMPowerStateQueue);
 
 private:
        struct PowerEventEntry {
@@ -57,7 +57,7 @@ protected:
 public:
        static IOPMPowerStateQueue * PMPowerStateQueue( OSObject * owner, Action action );
 
-       bool submitPowerEvent( uint32_t eventType, void * arg0 = 0, uint64_t arg1 = 0 );
+       bool submitPowerEvent( uint32_t eventType, void * arg0 = NULL, uint64_t arg1 = 0 );
 };
 
 #endif /* _IOPMPOWERSTATEQUEUE_H_ */
index 320e8f3f27c928ff441926255e61391afe6b21d9..0920486dad527481418f06e142304d5c6361440d 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -31,6 +31,7 @@
 #include <libkern/OSDebug.h>
 #include <IOKit/IOWorkLoop.h>
 #include <IOKit/IOCommandGate.h>
+#include <IOKit/IOTimerEventSource.h>
 #include <IOKit/IOPlatformExpert.h>
 #include <IOKit/IOCPU.h>
 #include <IOKit/IOKitDebug.h>
@@ -47,6 +48,8 @@
 #include "IOPMPowerStateQueue.h"
 #include <IOKit/IOCatalogue.h>
 #include <IOKit/IOReportMacros.h>
+#include <IOKit/IOLib.h>
+#include <IOKit/IOKitKeys.h>
 #include "IOKitKernelInternal.h"
 #if HIBERNATION
 #include <IOKit/IOHibernatePrivate.h>
@@ -89,19 +92,22 @@ __END_DECLS
 #define LOG(x...)    \
     do { kprintf(LOG_PREFIX x); } while (false)
 
-#if DEVELOPMENT
-#define DLOG(x...)  do { \
+#if DEVELOPMENT || DEBUG
+#define DEBUG_LOG(x...) do { \
     if (kIOLogPMRootDomain & gIOKitDebug) \
-       kprintf(LOG_PREFIX x); \
-    else \
-       os_log(OS_LOG_DEFAULT, LOG_PREFIX x); \
+    kprintf(LOG_PREFIX x); \
+    os_log_debug(OS_LOG_DEFAULT, LOG_PREFIX x); \
 } while (false)
 #else
+#define DEBUG_LOG(x...)
+#endif
+
 #define DLOG(x...)  do { \
     if (kIOLogPMRootDomain & gIOKitDebug) \
        kprintf(LOG_PREFIX x); \
+    else \
+       os_log(OS_LOG_DEFAULT, LOG_PREFIX x); \
 } while (false)
-#endif
 
 #define DMSG(x...)  do { \
     if (kIOLogPMRootDomain & gIOKitDebug) { \
@@ -114,7 +120,7 @@ __END_DECLS
 
 #define CHECK_THREAD_CONTEXT
 #ifdef  CHECK_THREAD_CONTEXT
-static IOWorkLoop * gIOPMWorkLoop = 0;
+static IOWorkLoop * gIOPMWorkLoop = NULL;
 #define ASSERT_GATED()                                      \
 do {                                                        \
     if (gIOPMWorkLoop && gIOPMWorkLoop->inGate() != true) { \
@@ -192,6 +198,13 @@ static void idleSleepTimerExpired( thread_call_param_t, thread_call_param_t );
 static void notifySystemShutdown( IOService * root, uint32_t messageType );
 static void handleAggressivesFunction( thread_call_param_t, thread_call_param_t );
 static void pmEventTimeStamp(uint64_t *recordTS);
+static void powerButtonUpCallout( thread_call_param_t, thread_call_param_t );
+static void powerButtonDownCallout( thread_call_param_t, thread_call_param_t );
+
+static int  IOPMConvertSecondsToCalendar(long secs, IOPMCalendarStruct * dt);
+static long IOPMConvertCalendarToSeconds(const IOPMCalendarStruct * dt);
+#define YMDTF       "%04d/%02d/%d %02d:%02d:%02d"
+#define YMDT(cal)   ((int)(cal)->year), (cal)->month, (cal)->day, (cal)->hour, (cal)->minute, (cal)->second
 
 // "IOPMSetSleepSupported"  callPlatformFunction name
 static const OSSymbol *sleepSupportedPEFunction = NULL;
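The logging rework near the top of this file splits two concerns: DLOG now has a single definition on every build (kprintf when the kIOLogPMRootDomain flag is set in gIOKitDebug, os_log otherwise), while the new DEBUG_LOG emits os_log_debug on DEVELOPMENT/DEBUG kernels and compiles away entirely on release kernels. A user-space sketch of the same shape, not xnu code:

    #include <cstdio>

    #define LOG_PREFIX "PMRD: "
    static unsigned gDebugFlags = 0;       // stands in for gIOKitDebug

    #if DEVELOPMENT || DEBUG
    #define DEBUG_LOG(x...) do { \
        if (gDebugFlags) \
                printf(LOG_PREFIX x);      /* mirrors the kprintf path */ \
        fprintf(stderr, LOG_PREFIX x);     /* mirrors os_log_debug */ \
    } while (false)
    #else
    #define DEBUG_LOG(x...)                /* vanishes on release builds */
    #endif

    #define DLOG(x...) do { \
        if (gDebugFlags) \
                printf(LOG_PREFIX x); \
        else \
                fprintf(stderr, LOG_PREFIX x); \
    } while (false)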
@@ -249,8 +262,8 @@ static const OSSymbol *         gIOPMPSPostDishargeWaitSecondsKey;
 #define kDefaultWranglerIdlePeriod  1000 // in milliseconds
 
 #define kIOSleepWakeFailureString   "SleepWakeFailureString"
-#define kIOOSWatchdogFailureString  "OSWatchdogFailureString"
 #define kIOEFIBootRomFailureKey     "wake-failure"
+#define kIOSleepWakeFailurePanic    "SleepWakeFailurePanic"
 
 #define kRD_AllPowerSources (kIOPMSupportedOnAC \
                           | kIOPMSupportedOnBatt \
@@ -270,20 +283,60 @@ enum {
        OFF_STATE           = 0,
        RESTART_STATE       = 1,
        SLEEP_STATE         = 2,
-       ON_STATE            = 3,
+       AOT_STATE           = 3,
+       ON_STATE            = 4,
        NUM_POWER_STATES
 };
 
+const char *
+getPowerStateString( uint32_t state )
+{
+#define POWER_STATE(x) {(uint32_t) x, #x}
+
+       static const IONamedValue powerStates[] = {
+               POWER_STATE( OFF_STATE ),
+               POWER_STATE( RESTART_STATE ),
+               POWER_STATE( SLEEP_STATE ),
+               POWER_STATE( AOT_STATE ),
+               POWER_STATE( ON_STATE ),
+               { 0, NULL }
+       };
+       return IOFindNameForValue(state, powerStates);
+}
+
 #define ON_POWER        kIOPMPowerOn
 #define RESTART_POWER   kIOPMRestart
 #define SLEEP_POWER     kIOPMAuxPowerOn
 
-static IOPMPowerState ourPowerStates[NUM_POWER_STATES] =
-{
-       {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-       {1, kIOPMRestartCapability, kIOPMRestart, RESTART_POWER, 0, 0, 0, 0, 0, 0, 0, 0},
-       {1, kIOPMSleepCapability, kIOPMSleep, SLEEP_POWER, 0, 0, 0, 0, 0, 0, 0, 0},
-       {1, kIOPMPowerOn, kIOPMPowerOn, ON_POWER, 0, 0, 0, 0, 0, 0, 0, 0}
+static IOPMPowerState
+    ourPowerStates[NUM_POWER_STATES] =
+{
+       {   .version                = 1,
+           .capabilityFlags        = 0,
+           .outputPowerCharacter   = 0,
+           .inputPowerRequirement  = 0 },
+       {   .version                = 1,
+           .capabilityFlags        = kIOPMRestartCapability,
+           .outputPowerCharacter   = kIOPMRestart,
+           .inputPowerRequirement  = RESTART_POWER },
+       {   .version                = 1,
+           .capabilityFlags        = kIOPMSleepCapability,
+           .outputPowerCharacter   = kIOPMSleep,
+           .inputPowerRequirement  = SLEEP_POWER },
+       {   .version                = 1,
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+           .capabilityFlags        = kIOPMAOTCapability,
+           .outputPowerCharacter   = kIOPMAOTPower,
+           .inputPowerRequirement  = ON_POWER },
+#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+           .capabilityFlags        = 0,
+           .outputPowerCharacter   = 0,
+           .inputPowerRequirement  = 0xFFFFFFFF },
+#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+       {   .version                = 1,
+           .capabilityFlags        = kIOPMPowerOn,
+           .outputPowerCharacter   = kIOPMPowerOn,
+           .inputPowerRequirement  = ON_POWER },
 };
 
 #define kIOPMRootDomainWakeTypeSleepService     "SleepService"
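ourPowerStates above gains a fifth AOT_STATE slot and switches from positional aggregate initialization to designated initializers (a C99 feature that xnu's C++ builds accept as a compiler extension; standard C++ only adopted it in C++20). Any IOPMPowerState field left unnamed is zero-initialized, which is what the old rows of trailing ", 0, 0, ..." spelled out by hand. A minimal sketch with placeholder values, not xnu code:

    struct PowerStateSketch {
        unsigned version;
        unsigned capabilityFlags;
        unsigned outputPowerCharacter;
        unsigned inputPowerRequirement;
        unsigned staticPower;              // never named below, so it is 0
    };

    static PowerStateSketch statesSketch[] = {
        {   .version               = 1,    // named fields read as a table
            .capabilityFlags       = 0x4,
            .outputPowerCharacter  = 0x4,
            .inputPowerRequirement = 0x20 },
        {   .version               = 1 },  // every other field zeroed
    };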
@@ -302,6 +355,9 @@ static IOPMPowerState ourPowerStates[NUM_POWER_STATES] =
 //
 #define kIOPMSystemCapabilityInterest       "IOPMSystemCapabilityInterest"
 
+// Entitlement required for root domain clients
+#define kRootDomainEntitlementSetProperty   "com.apple.private.iokit.rootdomain-set-property"
+
 #define WAKEEVENT_LOCK()        IOLockLock(wakeEventLock)
 #define WAKEEVENT_UNLOCK()      IOLockUnlock(wakeEventLock)
 
@@ -313,6 +369,26 @@ static IOPMPowerState ourPowerStates[NUM_POWER_STATES] =
 
 #define kAggressivesMinValue    1
 
+const char *
+getAggressivenessTypeString( uint32_t type )
+{
+#define AGGRESSIVENESS_TYPE(x) {(uint32_t) x, #x}
+
+       static const IONamedValue aggressivenessTypes[] = {
+               AGGRESSIVENESS_TYPE( kPMGeneralAggressiveness ),
+               AGGRESSIVENESS_TYPE( kPMMinutesToDim ),
+               AGGRESSIVENESS_TYPE( kPMMinutesToSpinDown ),
+               AGGRESSIVENESS_TYPE( kPMMinutesToSleep ),
+               AGGRESSIVENESS_TYPE( kPMEthernetWakeOnLANSettings ),
+               AGGRESSIVENESS_TYPE( kPMSetProcessorSpeed ),
+               AGGRESSIVENESS_TYPE( kPMPowerSource),
+               AGGRESSIVENESS_TYPE( kPMMotionSensor ),
+               AGGRESSIVENESS_TYPE( kPMLastAggressivenessType ),
+               { 0, NULL }
+       };
+       return IOFindNameForValue(type, aggressivenessTypes);
+}
+
 enum {
        kAggressivesStateBusy           = 0x01,
        kAggressivesStateQuickSpindown  = 0x02
@@ -351,6 +427,33 @@ enum {
        kAggressivesRecordFlagMinValue         = 0x00000002
 };
 
+// System Sleep Preventers
+
+enum {
+       kPMUserDisabledAllSleep = 1,
+       kPMSystemRestartBootingInProgress,
+       kPMConfigPreventSystemSleep,
+       kPMChildPreventSystemSleep,
+       kPMCPUAssertion,
+       kPMPCIUnsupported,
+};
+
+const char *
+getSystemSleepPreventerString( uint32_t preventer )
+{
+#define SYSTEM_SLEEP_PREVENTER(x) {(int) x, #x}
+       static const IONamedValue systemSleepPreventers[] = {
+               SYSTEM_SLEEP_PREVENTER( kPMUserDisabledAllSleep ),
+               SYSTEM_SLEEP_PREVENTER( kPMSystemRestartBootingInProgress ),
+               SYSTEM_SLEEP_PREVENTER( kPMConfigPreventSystemSleep ),
+               SYSTEM_SLEEP_PREVENTER( kPMChildPreventSystemSleep ),
+               SYSTEM_SLEEP_PREVENTER( kPMCPUAssertion ),
+               SYSTEM_SLEEP_PREVENTER( kPMPCIUnsupported ),
+               { 0, NULL }
+       };
+       return IOFindNameForValue(preventer, systemSleepPreventers);
+}
+
 // gDarkWakeFlags
 enum {
        kDarkWakeFlagHIDTickleEarly      = 0x01,// hid tickle before gfx suppression
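This commit adds three of these value-to-name tables (getPowerStateString, getAggressivenessTypeString, getSystemSleepPreventerString), all built on IONamedValue plus a stringizing macro so the printed name can never drift from the enumerator it describes. A stand-alone sketch of the idiom, with a stand-in for IOFindNameForValue(), not xnu code:

    #include <cstddef>

    struct NamedValueSketch { unsigned value; const char *name; };

    #define NAMED(x) { (unsigned) x, #x }  // value and its spelling, together

    enum { kPMUserDisabledAllSleep = 1, kPMCPUAssertion = 5 };

    static const NamedValueSketch gPreventersSketch[] = {
        NAMED( kPMUserDisabledAllSleep ),
        NAMED( kPMCPUAssertion ),
        { 0, NULL }                        // terminator, as in the commit
    };

    static const char *
    findNameSketch(unsigned v, const NamedValueSketch *t)
    {
        for (; t->name != NULL; t++) {
            if (t->value == v) {
                return t->name;
            }
        }
        return "unknown";
    }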
@@ -363,7 +466,7 @@ enum {
 };
 
 static IOPMrootDomain * gRootDomain;
-static IONotifier *     gSysPowerDownNotifier = 0;
+static IONotifier *     gSysPowerDownNotifier = NULL;
 static UInt32           gSleepOrShutdownPending = 0;
 static UInt32           gWillShutdown = 0;
 static UInt32           gPagingOff = 0;
@@ -382,22 +485,27 @@ uuid_string_t bootsessionuuid_string;
 
 static uint32_t         gDarkWakeFlags = kDarkWakeFlagHIDTickleNone;
 static uint32_t         gNoIdleFlag = 0;
-static uint32_t         gSwdPanic = 0;
+static uint32_t         gSwdPanic = 1;
 static uint32_t         gSwdSleepTimeout = 0;
 static uint32_t         gSwdWakeTimeout = 0;
 static uint32_t         gSwdSleepWakeTimeout = 0;
 static PMStatsStruct    gPMStats;
+#if DEVELOPMENT || DEBUG
+static uint32_t swd_panic_phase;
+#endif
 
 
 #if HIBERNATION
-static IOPMSystemSleepPolicyHandler     gSleepPolicyHandler = 0;
-static IOPMSystemSleepPolicyVariables * gSleepPolicyVars = 0;
+static IOPMSystemSleepPolicyHandler     gSleepPolicyHandler = NULL;
+static IOPMSystemSleepPolicyVariables * gSleepPolicyVars = NULL;
 static void *                           gSleepPolicyTarget;
 #endif
 
 struct timeval gIOLastSleepTime;
 struct timeval gIOLastWakeTime;
 
+struct timeval gIOLastUserSleepTime;
+
 static char gWakeReasonString[128];
 static bool gWakeReasonSysctlRegistered = false;
 static AbsoluteTime gIOLastWakeAbsTime;
@@ -421,9 +529,9 @@ static unsigned int     gPMHaltBusyCount;
 static unsigned int     gPMHaltIdleCount;
 static int              gPMHaltDepth;
 static uint32_t         gPMHaltMessageType;
-static IOLock *         gPMHaltLock  = 0;
-static OSArray *        gPMHaltArray = 0;
-static const OSSymbol * gPMHaltClientAcknowledgeKey = 0;
+static IOLock *         gPMHaltLock  = NULL;
+static OSArray *        gPMHaltArray = NULL;
+static const OSSymbol * gPMHaltClientAcknowledgeKey = NULL;
 static bool             gPMQuiesced;
 
 // Constants used as arguments to IOPMrootDomain::informCPUStateChange
@@ -448,7 +556,7 @@ const OSSymbol *gIOPMStatsDriverPSChangeSlow;
  */
 class PMSettingHandle : public OSObject
 {
-       OSDeclareFinalStructors( PMSettingHandle )
+       OSDeclareFinalStructors( PMSettingHandle );
        friend class PMSettingObject;
 
 private:
@@ -462,7 +570,7 @@ private:
  */
 class PMSettingObject : public OSObject
 {
-       OSDeclareFinalStructors( PMSettingObject )
+       OSDeclareFinalStructors( PMSettingObject );
        friend class IOPMrootDomain;
 
 private:
@@ -515,7 +623,7 @@ typedef void (*IOPMTracePointHandler)(
 
 class PMTraceWorker : public OSObject
 {
-       OSDeclareDefaultStructors(PMTraceWorker)
+       OSDeclareDefaultStructors(PMTraceWorker);
 public:
        typedef enum { kPowerChangeStart, kPowerChangeCompleted } change_t;
 
@@ -552,7 +660,7 @@ private:
  */
 class PMAssertionsTracker : public OSObject
 {
-       OSDeclareFinalStructors(PMAssertionsTracker)
+       OSDeclareFinalStructors(PMAssertionsTracker);
 public:
        static PMAssertionsTracker  *pmAssertionsTracker( IOPMrootDomain * );
 
@@ -609,7 +717,7 @@ OSDefineMetaClassAndFinalStructors(PMAssertionsTracker, OSObject);
 
 class PMHaltWorker : public OSObject
 {
-       OSDeclareFinalStructors( PMHaltWorker )
+       OSDeclareFinalStructors( PMHaltWorker );
 
 public:
        IOService *  service;// service being worked on
@@ -632,11 +740,17 @@ OSDefineMetaClassAndFinalStructors( PMHaltWorker, OSObject )
 #define super IOService
 OSDefineMetaClassAndFinalStructors(IOPMrootDomain, IOService)
 
+boolean_t
+IOPMRootDomainGetWillShutdown(void)
+{
+       return gWillShutdown != 0;
+}
+
 static void
 IOPMRootDomainWillShutdown(void)
 {
        if (OSCompareAndSwap(0, 1, &gWillShutdown)) {
-               OSKext::willShutdown();
+               IOService::willShutdown();
                for (int i = 0; i < 100; i++) {
                        if (OSCompareAndSwap(0, 1, &gSleepOrShutdownPending)) {
                                break;
@@ -771,16 +885,14 @@ IOSystemShutdownNotification(int stage)
 
        startTime = mach_absolute_time();
        IOPMRootDomainWillShutdown();
-       halt_log_enter("IOPMRootDomainWillShutdown", 0, mach_absolute_time() - startTime);
+       halt_log_enter("IOPMRootDomainWillShutdown", NULL, mach_absolute_time() - startTime);
 #if HIBERNATION
        startTime = mach_absolute_time();
        IOHibernateSystemPostWake(true);
-       halt_log_enter("IOHibernateSystemPostWake", 0, mach_absolute_time() - startTime);
+       halt_log_enter("IOHibernateSystemPostWake", NULL, mach_absolute_time() - startTime);
 #endif
        if (OSCompareAndSwap(0, 1, &gPagingOff)) {
-#if !CONFIG_EMBEDDED
                gRootDomain->handlePlatformHaltRestart(kPEPagingOff);
-#endif
        }
 }
 
@@ -862,10 +974,27 @@ IOPMrootDomain::updateConsoleUsers(void)
        IOService::updateConsoleUsers(NULL, kIOMessageSystemHasPoweredOn);
        if (tasksSuspended) {
                tasksSuspended = FALSE;
-               tasks_system_suspend(tasksSuspended);
+               updateTasksSuspend();
        }
 }
 
+void
+IOPMrootDomain::updateTasksSuspend(void)
+{
+       bool newSuspend;
+
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       newSuspend = (tasksSuspended || _aotTasksSuspended);
+#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+       newSuspend = tasksSuspended;
+#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+       if (newSuspend == tasksSuspendState) {
+               return;
+       }
+       tasksSuspendState = newSuspend;
+       tasks_system_suspend(newSuspend);
+}
+
 //******************************************************************************
 
 static void
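updateTasksSuspend() above folds two inputs, the pre-existing tasksSuspended flag and the new AOT-driven _aotTasksSuspended, into one desired state, and only calls the expensive tasks_system_suspend() when that derived state actually changes. A minimal sketch of the level-triggered pattern, not xnu code:

    static bool gTasksSuspendedSketch;     // classic sleep/wake input
    static bool gAotTasksSuspendedSketch;  // new AOT input
    static bool gAppliedStateSketch;       // what was last applied

    static void
    tasksSystemSuspendSketch(bool s)
    {
        (void) s;                          // expensive kernel-wide toggle
    }

    static void
    updateTasksSuspendSketch(void)
    {
        bool newSuspend = gTasksSuspendedSketch || gAotTasksSuspendedSketch;
        if (newSuspend == gAppliedStateSketch) {
            return;                        // no change: nothing to apply
        }
        gAppliedStateSketch = newSuspend;
        tasksSystemSuspendSketch(newSuspend);  // fire only on a state edge
    }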
@@ -943,7 +1072,7 @@ sysctl_sleepwaketime SYSCTL_HANDLER_ARGS
 
 static SYSCTL_PROC(_kern, OID_AUTO, sleeptime,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    &gIOLastSleepTime, 0, sysctl_sleepwaketime, "S,timeval", "");
+    &gIOLastUserSleepTime, 0, sysctl_sleepwaketime, "S,timeval", "");
 
 static SYSCTL_PROC(_kern, OID_AUTO, waketime,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
@@ -972,7 +1101,7 @@ sysctl_willshutdown
 
 static SYSCTL_PROC(_kern, OID_AUTO, willshutdown,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    0, 0, sysctl_willshutdown, "I", "");
+    NULL, 0, sysctl_willshutdown, "I", "");
 
 extern struct sysctl_oid sysctl__kern_iokittest;
 extern struct sysctl_oid sysctl__debug_iokit;
@@ -1013,11 +1142,11 @@ sysctl_progressmeter
 
 static SYSCTL_PROC(_kern, OID_AUTO, progressmeterenable,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    0, 0, sysctl_progressmeterenable, "I", "");
+    NULL, 0, sysctl_progressmeterenable, "I", "");
 
 static SYSCTL_PROC(_kern, OID_AUTO, progressmeter,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    0, 0, sysctl_progressmeter, "I", "");
+    NULL, 0, sysctl_progressmeter, "I", "");
 
 #endif /* !CONFIG_EMBEDDED */
 
@@ -1041,7 +1170,7 @@ sysctl_consoleoptions
 
 static SYSCTL_PROC(_kern, OID_AUTO, consoleoptions,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    0, 0, sysctl_consoleoptions, "I", "");
+    NULL, 0, sysctl_consoleoptions, "I", "");
 
 
 static int
@@ -1101,11 +1230,114 @@ static SYSCTL_INT(_debug, OID_AUTO, swd_sleep_timeout, CTLFLAG_RW, &gSwdSleepTim
 static SYSCTL_INT(_debug, OID_AUTO, swd_wake_timeout, CTLFLAG_RW, &gSwdWakeTimeout, 0, "");
 static SYSCTL_INT(_debug, OID_AUTO, swd_timeout, CTLFLAG_RW, &gSwdSleepWakeTimeout, 0, "");
 static SYSCTL_INT(_debug, OID_AUTO, swd_panic, CTLFLAG_RW, &gSwdPanic, 0, "");
+#if DEVELOPMENT || DEBUG
+static SYSCTL_INT(_debug, OID_AUTO, swd_panic_phase, CTLFLAG_RW, &swd_panic_phase, 0, "");
+#endif
+
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+//******************************************************************************
+// AOT
+
+static int
+sysctl_aotmetrics SYSCTL_HANDLER_ARGS
+{
+       if (NULL == gRootDomain) {
+               return ENOENT;
+       }
+       if (NULL == gRootDomain->_aotMetrics) {
+               return ENOENT;
+       }
+       return sysctl_io_opaque(req, gRootDomain->_aotMetrics, sizeof(IOPMAOTMetrics), NULL);
+}
+
+static SYSCTL_PROC(_kern, OID_AUTO, aotmetrics,
+    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
+    NULL, 0, sysctl_aotmetrics, "S,IOPMAOTMetrics", "");
+
+
+static int
+update_aotmode(uint32_t mode)
+{
+       int result;
+
+       if (!gIOPMWorkLoop) {
+               return ENOENT;
+       }
+       result = gIOPMWorkLoop->runActionBlock(^IOReturn (void) {
+               unsigned int oldCount;
+
+               if (mode && !gRootDomain->_aotMetrics) {
+                       gRootDomain->_aotMetrics = IONewZero(IOPMAOTMetrics, 1);
+                       if (!gRootDomain->_aotMetrics) {
+                               return ENOMEM;
+                       }
+               }
+
+               oldCount = gRootDomain->idleSleepPreventersCount();
+               gRootDomain->_aotMode = mode;
+               gRootDomain->updatePreventIdleSleepListInternal(NULL, false, oldCount);
+               return 0;
+       });
+       return result;
+}
+
+static int
+sysctl_aotmodebits
+(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int error, changed;
+       uint32_t new_value;
+
+       if (NULL == gRootDomain) {
+               return ENOENT;
+       }
+       error = sysctl_io_number(req, gRootDomain->_aotMode, sizeof(uint32_t), &new_value, &changed);
+       if (changed && gIOPMWorkLoop) {
+               error = update_aotmode(new_value);
+       }
+
+       return error;
+}
+
+static SYSCTL_PROC(_kern, OID_AUTO, aotmodebits,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    NULL, 0, sysctl_aotmodebits, "I", "");
+
+static int
+sysctl_aotmode
+(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int error, changed;
+       uint32_t new_value;
+
+       if (NULL == gRootDomain) {
+               return ENOENT;
+       }
+       error = sysctl_io_number(req, gRootDomain->_aotMode, sizeof(uint32_t), &new_value, &changed);
+       if (changed && gIOPMWorkLoop) {
+               if (new_value) {
+                       new_value = kIOPMAOTModeDefault; // & ~kIOPMAOTModeRespectTimers;
+               }
+               error = update_aotmode(new_value);
+       }
+
+       return error;
+}
+
+static SYSCTL_PROC(_kern, OID_AUTO, aotmode,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
+    NULL, 0, sysctl_aotmode, "I", "");
+
+//******************************************************************************
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
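For reference, a userspace sketch exercising the new AOT OIDs (assumptions:
the OIDs have been registered as in IOPMrootDomain::start() below, and the
write requires privilege; illustrative only, not part of the commit):

    #include <stdio.h>
    #include <sys/sysctl.h>

    int
    main(void)
    {
            uint32_t mode = 0;
            size_t len = sizeof(mode);

            // kern.aotmode is CTLFLAG_ANYBODY, so any process can read it.
            if (sysctlbyname("kern.aotmode", &mode, &len, NULL, 0) == 0) {
                    printf("kern.aotmode = %u\n", mode);
            }

            // Writing normalizes any nonzero value to kIOPMAOTModeDefault,
            // per sysctl_aotmode above; the write needs privilege.
            uint32_t on = 1;
            (void) sysctlbyname("kern.aotmode", NULL, NULL, &on, sizeof(on));
            return 0;
    }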
 
 
 static const OSSymbol * gIOPMSettingAutoWakeCalendarKey;
 static const OSSymbol * gIOPMSettingAutoWakeSecondsKey;
+static const OSSymbol * gIOPMSettingAutoPowerCalendarKey;
+static const OSSymbol * gIOPMSettingAutoPowerSecondsKey;
 static const OSSymbol * gIOPMSettingDebugWakeRelativeKey;
+static const OSSymbol * gIOPMSettingDebugPowerRelativeKey;
 static const OSSymbol * gIOPMSettingMaintenanceWakeCalendarKey;
 static const OSSymbol * gIOPMSettingSleepServiceWakeCalendarKey;
 static const OSSymbol * gIOPMSettingSilentRunningKey;
@@ -1131,7 +1363,10 @@ IOPMrootDomain::start( IOService * nub )
        gRootDomain = this;
        gIOPMSettingAutoWakeCalendarKey = OSSymbol::withCString(kIOPMSettingAutoWakeCalendarKey);
        gIOPMSettingAutoWakeSecondsKey = OSSymbol::withCString(kIOPMSettingAutoWakeSecondsKey);
+       gIOPMSettingAutoPowerCalendarKey = OSSymbol::withCString(kIOPMSettingAutoPowerCalendarKey);
+       gIOPMSettingAutoPowerSecondsKey = OSSymbol::withCString(kIOPMSettingAutoPowerSecondsKey);
        gIOPMSettingDebugWakeRelativeKey = OSSymbol::withCString(kIOPMSettingDebugWakeRelativeKey);
+       gIOPMSettingDebugPowerRelativeKey = OSSymbol::withCString(kIOPMSettingDebugPowerRelativeKey);
        gIOPMSettingMaintenanceWakeCalendarKey = OSSymbol::withCString(kIOPMSettingMaintenanceWakeCalendarKey);
        gIOPMSettingSleepServiceWakeCalendarKey = OSSymbol::withCString(kIOPMSettingSleepServiceWakeCalendarKey);
        gIOPMSettingSilentRunningKey = OSSymbol::withCStringNoCopy(kIOPMSettingSilentRunningKey);
@@ -1151,11 +1386,11 @@ IOPMrootDomain::start( IOService * nub )
        {
                OSSymbol::withCString(kIOPMSettingSleepOnPowerButtonKey),
                gIOPMSettingAutoWakeSecondsKey,
-               OSSymbol::withCString(kIOPMSettingAutoPowerSecondsKey),
+               gIOPMSettingAutoPowerSecondsKey,
                gIOPMSettingAutoWakeCalendarKey,
-               OSSymbol::withCString(kIOPMSettingAutoPowerCalendarKey),
+               gIOPMSettingAutoPowerCalendarKey,
                gIOPMSettingDebugWakeRelativeKey,
-               OSSymbol::withCString(kIOPMSettingDebugPowerRelativeKey),
+               gIOPMSettingDebugPowerRelativeKey,
                OSSymbol::withCString(kIOPMSettingWakeOnRingKey),
                OSSymbol::withCString(kIOPMSettingRestartOnPowerLossKey),
                OSSymbol::withCString(kIOPMSettingWakeOnClamshellKey),
@@ -1191,6 +1426,14 @@ IOPMrootDomain::start( IOService * nub )
                idleSleepTimerExpired,
                (thread_call_param_t) this);
 
+       powerButtonDown = thread_call_allocate(
+               powerButtonDownCallout,
+               (thread_call_param_t) this);
+
+       powerButtonUp = thread_call_allocate(
+               powerButtonUpCallout,
+               (thread_call_param_t) this);
+
        diskSyncCalloutEntry = thread_call_allocate(
                &disk_sync_callout,
                (thread_call_param_t) this);
@@ -1291,6 +1534,14 @@ IOPMrootDomain::start( IOService * nub )
                &IOPMrootDomain::dispatchPowerEvent));
        gIOPMWorkLoop->addEventSource(pmPowerStateQueue);
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       _aotMode = 0;
+       _aotTimerES = IOTimerEventSource::timerEventSource(this,
+           OSMemberFunctionCast(IOTimerEventSource::Action,
+           this, &IOPMrootDomain::aotEvaluate));
+       gIOPMWorkLoop->addEventSource(_aotTimerES);
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
        // create our power parent
        patriarch = new IORootParent;
        patriarch->init();
@@ -1302,7 +1553,7 @@ IOPMrootDomain::start( IOService * nub )
        changePowerStateToPriv(ON_STATE);
 
        // install power change handler
-       gSysPowerDownNotifier = registerPrioritySleepWakeInterest( &sysPowerDownHandler, this, 0);
+       gSysPowerDownNotifier = registerPrioritySleepWakeInterest( &sysPowerDownHandler, this, NULL);
 
 #if !NO_KERNEL_HID
        // Register for a notification when IODisplayWrangler is published
@@ -1310,7 +1561,7 @@ IOPMrootDomain::start( IOService * nub )
                _displayWranglerNotifier = addMatchingNotification(
                        gIOPublishNotification, tmpDict,
                        (IOServiceMatchingNotificationHandler) & displayWranglerMatchPublished,
-                       this, 0);
+                       this, NULL);
                tmpDict->release();
        }
 #endif
@@ -1354,6 +1605,8 @@ IOPMrootDomain::start( IOService * nub )
                psIterator->release();
        }
 
+       // read swd_panic boot-arg
+       PE_parse_boot_argn("swd_panic", &gSwdPanic, sizeof(gSwdPanic));
        sysctl_register_oid(&sysctl__kern_sleeptime);
        sysctl_register_oid(&sysctl__kern_waketime);
        sysctl_register_oid(&sysctl__kern_willshutdown);
@@ -1369,6 +1622,12 @@ IOPMrootDomain::start( IOService * nub )
        sysctl_register_oid(&sysctl__kern_consoleoptions);
        sysctl_register_oid(&sysctl__kern_progressoptions);
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       sysctl_register_oid(&sysctl__kern_aotmode);
+       sysctl_register_oid(&sysctl__kern_aotmodebits);
+       sysctl_register_oid(&sysctl__kern_aotmetrics);
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
 #if HIBERNATION
        IOHibernateSystemInit(this);
 #endif
@@ -1394,7 +1653,46 @@ IOPMrootDomain::setProperties( OSObject * props_obj )
        OSNumber        *n;
        const OSSymbol  *key;
        OSObject        *obj;
-       OSCollectionIterator * iter = 0;
+       OSCollectionIterator * iter = NULL;
+
+       if (!dict) {
+               return kIOReturnBadArgument;
+       }
+
+       bool clientEntitled = false;
+       obj = IOUserClient::copyClientEntitlement(current_task(), kRootDomainEntitlementSetProperty);
+       clientEntitled = (obj == kOSBooleanTrue);
+       OSSafeReleaseNULL(obj);
+
+       if (!clientEntitled) {
+               const char * errorSuffix = NULL;
+
+               // IOPMSchedulePowerEvent() clients may not be entitled, but must be root.
+               // That API can set 6 possible keys that are checked below.
+               if ((dict->getCount() == 1) &&
+                   (dict->getObject(gIOPMSettingAutoWakeSecondsKey) ||
+                   dict->getObject(gIOPMSettingAutoPowerSecondsKey) ||
+                   dict->getObject(gIOPMSettingAutoWakeCalendarKey) ||
+                   dict->getObject(gIOPMSettingAutoPowerCalendarKey) ||
+                   dict->getObject(gIOPMSettingDebugWakeRelativeKey) ||
+                   dict->getObject(gIOPMSettingDebugPowerRelativeKey))) {
+                       return_value = IOUserClient::clientHasPrivilege(current_task(), kIOClientPrivilegeAdministrator);
+                       if (return_value != kIOReturnSuccess) {
+                               errorSuffix = "privileged";
+                       }
+               } else {
+                       return_value = kIOReturnNotPermitted;
+                       errorSuffix = "entitled";
+               }
+
+               if (return_value != kIOReturnSuccess) {
+                       OSString * procName = IOCopyLogNameForPID(proc_selfpid());
+                       DLOG("%s failed, process %s is not %s\n", __func__,
+                           procName ? procName->getCStringNoCopy() : "", errorSuffix);
+                       OSSafeReleaseNULL(procName);
+                       return return_value;
+               }
+       }
 
        const OSSymbol *publish_simulated_battery_string    = OSSymbol::withCString("SoftwareSimulatedBatteries");
        const OSSymbol *boot_complete_string                = OSSymbol::withCString("System Boot Complete");
@@ -1416,11 +1714,6 @@ IOPMrootDomain::setProperties( OSObject * props_obj )
        const OSSymbol *hibernatefreetime_string            = OSSymbol::withCString(kIOHibernateFreeTimeKey);
 #endif
 
-       if (!dict) {
-               return_value = kIOReturnBadArgument;
-               goto exit;
-       }
-
        iter = OSCollectionIterator::withCollection(dict);
        if (!iter) {
                return_value = kIOReturnNoMemory;
@@ -1533,10 +1826,12 @@ IOPMrootDomain::setProperties( OSObject * props_obj )
                                    (data->getLength() == sizeof(IOPMCalendarStruct))) {
                                        const IOPMCalendarStruct * cs =
                                            (const IOPMCalendarStruct *) data->getBytesNoCopy();
-
+                                       IOLog("gIOPMSettingAutoWakeCalendarKey " YMDTF "\n", YMDT(cs));
                                        if (cs->year) {
+                                               _scheduledAlarmUTC = IOPMConvertCalendarToSeconds(cs);
                                                OSBitOrAtomic(kIOPMAlarmBitCalendarWake, &_scheduledAlarms);
                                        } else {
+                                               _scheduledAlarmUTC = 0;
                                                OSBitAndAtomic(~kIOPMAlarmBitCalendarWake, &_scheduledAlarms);
                                        }
                                        DLOG("_scheduledAlarms = 0x%x\n", (uint32_t) _scheduledAlarms);
@@ -1631,8 +1926,13 @@ IOPMrootDomain::setAggressiveness(
        AggressivesRequest *    request;
        bool                    found = false;
 
-       DLOG("setAggressiveness(%x) 0x%x = %u\n",
-           (uint32_t) options, (uint32_t) type, (uint32_t) value);
+       if (type == kPMMinutesToDim || type == kPMMinutesToSleep) {
+               DLOG("setAggressiveness(%x) %s = %u\n",
+                   (uint32_t) options, getAggressivenessTypeString((uint32_t) type), (uint32_t) value);
+       } else {
+               DEBUG_LOG("setAggressiveness(%x) %s = %u\n",
+                   (uint32_t) options, getAggressivenessTypeString((uint32_t) type), (uint32_t) value);
+       }
 
        request = IONew(AggressivesRequest, 1);
        if (!request) {
@@ -1756,8 +2056,6 @@ IOPMrootDomain::getAggressiveness(
        AGGRESSIVES_UNLOCK();
 
        if (source) {
-               DLOG("getAggressiveness(%d) 0x%x = %u\n",
-                   source, (uint32_t) type, value);
                *outLevel = (unsigned long) value;
                return kIOReturnSuccess;
        } else {
@@ -1783,7 +2081,7 @@ IOPMrootDomain::joinAggressiveness(
                return kIOReturnBadArgument;
        }
 
-       DLOG("joinAggressiveness %s %p\n", service->getName(), OBFUSCATE(service));
+       DEBUG_LOG("joinAggressiveness %s %p\n", service->getName(), OBFUSCATE(service));
 
        request = IONew(AggressivesRequest, 1);
        if (!request) {
@@ -1988,14 +2286,14 @@ IOPMrootDomain::synchronizeAggressives(
                if (request->dataType == kAggressivesRequestTypeService) {
                        service = request->data.service;
                } else {
-                       service = 0;
+                       service = NULL;
                }
 
                IODelete(request, AggressivesRequest, 1);
-               request = 0;
+               request = NULL;
 
                if (service) {
-                       if (service->assertPMDriverCall(&callEntry)) {
+                       if (service->assertPMDriverCall(&callEntry, kIOPMDriverCallMethodSetAggressive)) {
                                for (i = 0, record = array; i < count; i++, record++) {
                                        value = record->value;
                                        if (record->flags & kAggressivesRecordFlagMinValue) {
@@ -2045,7 +2343,7 @@ IOPMrootDomain::broadcastAggressives(
                                }
 
                                if ((service = OSDynamicCast(IOService, connect->copyChildEntry(gIOPowerPlane)))) {
-                                       if (service->assertPMDriverCall(&callEntry)) {
+                                       if (service->assertPMDriverCall(&callEntry, kIOPMDriverCallMethodSetAggressive)) {
                                                for (i = 0, record = array; i < count; i++, record++) {
                                                        if (record->flags & kAggressivesRecordFlagModified) {
                                                                value = record->value;
@@ -2067,6 +2365,31 @@ IOPMrootDomain::broadcastAggressives(
        }
 }
 
+//******************************************************************************
+// Stackshot on power button press
+//******************************************************************************
+static void
+powerButtonDownCallout(thread_call_param_t us, thread_call_param_t)
+{
+       /* Power button pressed during wake
+        * Take a stackshot
+        */
+       DEBUG_LOG("PowerButton: down. Taking stackshot\n");
+       ((IOPMrootDomain *)us)->takeStackshot(false);
+}
+
+static void
+powerButtonUpCallout(thread_call_param_t us, thread_call_param_t)
+{
+       /* Power button released.
+        * Delete any stackshot data
+        */
+       DEBUG_LOG("PowerButton: up. Deleting stackshot\n");
+       ((IOPMrootDomain *)us)->deleteStackshot();
+}
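A sketch of how these callouts might be driven (assumption: some event handler
enters the thread calls allocated in start(); the dispatcher name is
illustrative, not from this commit):

    // Hypothetical dispatcher for power-button events:
    void
    IOPMrootDomain::handlePowerButtonEvent(bool buttonDown)
    {
            if (buttonDown) {
                    thread_call_enter(powerButtonDown); // takeStackshot(false)
            } else {
                    thread_call_enter(powerButtonUp);   // deleteStackshot()
            }
    }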
+//******************************************************************************
+
 // MARK: -
 // MARK: System Sleep
 
@@ -2302,7 +2625,6 @@ IOPMrootDomain::privateSleepSystem( uint32_t sleepReason )
 //
 // This overrides powerChangeDone in IOService.
 //******************************************************************************
-
 void
 IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
 {
@@ -2310,34 +2632,85 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
        uint64_t    timeSinceReset = 0;
 #endif
        uint64_t    now;
+       unsigned long newState;
+       clock_sec_t        secs;
+       clock_usec_t       microsecs;
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       clock_sec_t        adjWakeTime;
+       IOPMCalendarStruct nowCalendar;
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
        ASSERT_GATED();
-       DLOG("PowerChangeDone: %u->%u\n",
-           (uint32_t) previousPowerState, (uint32_t) getPowerState());
+       newState = getPowerState();
+       DLOG("PowerChangeDone: %s->%s\n",
+           getPowerStateString((uint32_t) previousPowerState), getPowerStateString((uint32_t) getPowerState()));
+
+       if (previousPowerState == newState) {
+               return;
+       }
 
        notifierThread = current_thread();
        switch (getPowerState()) {
        case SLEEP_STATE: {
-               if (previousPowerState != ON_STATE) {
-                       break;
-               }
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+               if (kPMCalendarTypeInvalid != _aotWakeTimeCalendar.selector) {
+                       secs = 0;
+                       microsecs = 0;
+                       PEGetUTCTimeOfDay(&secs, &microsecs);
+
+                       adjWakeTime = 0;
+                       if ((kIOPMAOTModeRespectTimers & _aotMode) && (_scheduledAlarmUTC < _aotWakeTimeUTC)) {
+                               IOLog("use _scheduledAlarmUTC\n");
+                               adjWakeTime = _scheduledAlarmUTC;
+                       } else if (_aotExit || (kIOPMWakeEventAOTExitFlags & _aotPendingFlags)) {
+                               IOLog("accelerate _aotWakeTime for exit\n");
+                               adjWakeTime = secs;
+                       } else if (kIOPMDriverAssertionLevelOn == getPMAssertionLevel(kIOPMDriverAssertionCPUBit)) {
+                               IOLog("accelerate _aotWakeTime for assertion\n");
+                               adjWakeTime = secs;
+                       }
+                       if (adjWakeTime) {
+                               IOPMConvertSecondsToCalendar(adjWakeTime, &_aotWakeTimeCalendar);
+                       }
+
+                       IOPMConvertSecondsToCalendar(secs, &nowCalendar);
+                       IOLog("aotSleep at " YMDTF " sched: " YMDTF "\n", YMDT(&nowCalendar), YMDT(&_aotWakeTimeCalendar));
 
+                       IOReturn __unused ret = setMaintenanceWakeCalendar(&_aotWakeTimeCalendar);
+                       assert(kIOReturnSuccess == ret);
+               }
+               if (_aotLastWakeTime) {
+                       _aotMetrics->totalTime += mach_absolute_time() - _aotLastWakeTime;
+                       if (_aotMetrics->sleepCount && (_aotMetrics->sleepCount <= kIOPMAOTMetricsKernelWakeCountMax)) {
+                               strlcpy(&_aotMetrics->kernelWakeReason[_aotMetrics->sleepCount - 1][0],
+                                   gWakeReasonString,
+                                   sizeof(_aotMetrics->kernelWakeReason[_aotMetrics->sleepCount - 1]));
+                       }
+               }
+               _aotPendingFlags &= ~kIOPMWakeEventAOTPerCycleFlags;
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
                acceptSystemWakeEvents(true);
 
                // re-enable this timer for next sleep
                cancelIdleSleepTimer();
 
-               clock_sec_t     secs;
-               clock_usec_t    microsecs;
                clock_get_calendar_absolute_and_microtime(&secs, &microsecs, &now);
                logtime(secs);
                gIOLastSleepTime.tv_sec  = secs;
                gIOLastSleepTime.tv_usec = microsecs;
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+               if (!_aotLastWakeTime) {
+                       gIOLastUserSleepTime = gIOLastSleepTime;
+               }
+#else
+               gIOLastUserSleepTime = gIOLastSleepTime;
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
                gIOLastWakeTime.tv_sec = 0;
                gIOLastWakeTime.tv_usec = 0;
                gIOLastSleepAbsTime = now;
 
                if (wake2DarkwakeDelay && sleepDelaysReport) {
-                       clock_usec_t    microsecs;
                        clock_sec_t     wake2DarkwakeSecs, darkwake2SleepSecs;
                        // Update 'wake2DarkwakeDelay' histogram if this is a fullwake->sleep transition
 
@@ -2376,6 +2749,8 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
                        clock_usec_t microsecs = 0;
                        uint64_t now_b = mach_absolute_time();
 
+                       secs = 0;
+                       microsecs = 0;
                        PEGetUTCTimeOfDay(&secs, &microsecs);
 
                        uint64_t now_a = mach_absolute_time();
@@ -2401,16 +2776,37 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
                gSleepOrShutdownPending = 0;
 
                // trip the reset of the calendar clock
-               {
-                       clock_sec_t  wakeSecs;
-                       clock_usec_t wakeMicrosecs;
-
-                       clock_wakeup_calendar();
-
-                       clock_get_calendar_microtime(&wakeSecs, &wakeMicrosecs);
-                       gIOLastWakeTime.tv_sec  = wakeSecs;
-                       gIOLastWakeTime.tv_usec = wakeMicrosecs;
+               clock_wakeup_calendar();
+               clock_get_calendar_microtime(&secs, &microsecs);
+               gIOLastWakeTime.tv_sec  = secs;
+               gIOLastWakeTime.tv_usec = microsecs;
+
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+               // aot
+               if (_aotWakeTimeCalendar.selector != kPMCalendarTypeInvalid) {
+                       _aotWakeTimeCalendar.selector = kPMCalendarTypeInvalid;
+                       secs = 0;
+                       microsecs = 0;
+                       PEGetUTCTimeOfDay(&secs, &microsecs);
+                       IOPMConvertSecondsToCalendar(secs, &nowCalendar);
+                       IOLog("aotWake at " YMDTF " sched: " YMDTF "\n", YMDT(&nowCalendar), YMDT(&_aotWakeTimeCalendar));
+                       _aotMetrics->sleepCount++;
+                       _aotLastWakeTime = gIOLastWakeAbsTime;
+                       if (_aotMetrics->sleepCount <= kIOPMAOTMetricsKernelWakeCountMax) {
+                               _aotMetrics->kernelSleepTime[_aotMetrics->sleepCount - 1]
+                                       = (((uint64_t) gIOLastSleepTime.tv_sec) << 10) + (gIOLastSleepTime.tv_usec / 1000);
+                               _aotMetrics->kernelWakeTime[_aotMetrics->sleepCount - 1]
+                                       = (((uint64_t) gIOLastWakeTime.tv_sec) << 10) + (gIOLastWakeTime.tv_usec / 1000);
+                       }
+
+                       if (_aotTestTime) {
+                               if (_aotWakeTimeUTC <= secs) {
+                                       _aotTestTime = _aotTestTime + _aotTestInterval;
+                               }
+                               setWakeTime(_aotTestTime);
+                       }
                }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
 
 #if HIBERNATION
                LOG("System %sWake\n", gIOHibernateState ? "SafeSleep " : "");
@@ -2434,6 +2830,7 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
                sleepToStandby          = false;
                wranglerTickleLatched   = false;
                userWasActive           = false;
+               isRTCAlarmWake          = false;
                fullWakeReason = kFullWakeReasonNone;
 
                OSString * wakeType = OSDynamicCast(
@@ -2459,9 +2856,11 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
                            !(hibOptions->unsigned32BitValue() & kIOHibernateOptionDarkWake)))) {
                                // Hibernate aborted, or EFI brought up graphics
                                wranglerTickled = true;
-                               DLOG("hibernation aborted %d, options 0x%x\n",
-                                   hibernateAborted,
-                                   hibOptions ? hibOptions->unsigned32BitValue() : 0);
+                               if (hibernateAborted) {
+                                       DLOG("Hibernation aborted\n");
+                               } else {
+                                       DLOG("EFI brought up graphics. Going to full wake. HibOptions: 0x%x\n", hibOptions->unsigned32BitValue());
+                               }
                        } else
 #endif
                        if (wakeType && (
@@ -2469,6 +2868,9 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
                                    wakeType->isEqualTo(kIOPMRootDomainWakeTypeAlarm))) {
                                // User wake or RTC alarm
                                wranglerTickled = true;
+                               if (wakeType->isEqualTo(kIOPMRootDomainWakeTypeAlarm)) {
+                                       isRTCAlarmWake = true;
+                               }
                        } else if (wakeType &&
                            wakeType->isEqualTo(kIOPMRootDomainWakeTypeSleepTimer)) {
                                // SMC standby timer trumps SleepX
@@ -2546,21 +2948,30 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
 
                thread_call_enter(updateConsoleUsersEntry);
 
-               changePowerStateToPriv(ON_STATE);
-       }   break;
+               changePowerStateToPriv(getRUN_STATE());
+               break;
+       }
 #if !__i386__ && !__x86_64__
-       case ON_STATE: {
-               if (previousPowerState != ON_STATE) {
-                       DLOG("Force re-evaluating aggressiveness\n");
-                       /* Force re-evaluate the aggressiveness values to set appropriate idle sleep timer */
-                       pmPowerStateQueue->submitPowerEvent(
-                               kPowerEventPolicyStimulus,
-                               (void *) kStimulusNoIdleSleepPreventers );
+       case ON_STATE:
+       case AOT_STATE:
+       {
+               DLOG("Force re-evaluating aggressiveness\n");
+               /* Force re-evaluate the aggressiveness values to set appropriate idle sleep timer */
+               pmPowerStateQueue->submitPowerEvent(
+                       kPowerEventPolicyStimulus,
+                       (void *) kStimulusNoIdleSleepPreventers );
+
+               // After changing to ON_STATE, invalidate any previously queued
+               // request to change to a state less than ON_STATE. This isn't
+               // necessary for AOT_STATE or if the device has only one running
+               // state since the changePowerStateToPriv() issued at the tail
+               // end of SLEEP_STATE case should take care of that.
+               if (getPowerState() == ON_STATE) {
+                       changePowerStateToPriv(ON_STATE);
                }
                break;
        }
-
-#endif
+#endif /* !__i386__ && !__x86_64__ */
        }
        notifierThread = NULL;
 }
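getRUN_STATE() replaces several hardcoded ON_STATE references in this commit.
Its definition is not in this excerpt; inferred from its uses here, it likely
reduces to:

    // Sketch inferred from this diff's uses (definition not shown here):
    unsigned long
    IOPMrootDomain::getRUN_STATE(void)
    {
            return _aotNow ? AOT_STATE : ON_STATE;
    }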
@@ -2594,38 +3005,49 @@ IOPMrootDomain::requestPowerDomainState(
 
 bool
 IOPMrootDomain::updatePreventIdleSleepList(
-       IOService * service, bool addNotRemove )
+       IOService * service, bool addNotRemove)
 {
-       unsigned int oldCount, newCount;
+       unsigned int oldCount;
+
+       oldCount = idleSleepPreventersCount();
+       return updatePreventIdleSleepListInternal(service, addNotRemove, oldCount);
+}
+
+bool
+IOPMrootDomain::updatePreventIdleSleepListInternal(
+       IOService * service, bool addNotRemove, unsigned int oldCount)
+{
+       unsigned int newCount;
 
        ASSERT_GATED();
 
 #if defined(__i386__) || defined(__x86_64__)
        // Disregard disk I/O (besides the display wrangler) as a factor preventing
        // idle sleep, except in the case of legacy disk I/O
-       if ((service != wrangler) && (service != this)) {
+       if (service && (service != wrangler) && (service != this)) {
                return false;
        }
 #endif
 
-       oldCount = preventIdleSleepList->getCount();
-       if (addNotRemove) {
-               preventIdleSleepList->setObject(service);
-               DLOG("prevent idle sleep list: %s+ (%u)\n",
-                   service->getName(), preventIdleSleepList->getCount());
-       } else if (preventIdleSleepList->member(service)) {
-               preventIdleSleepList->removeObject(service);
-               DLOG("prevent idle sleep list: %s- (%u)\n",
-                   service->getName(), preventIdleSleepList->getCount());
+       if (service) {
+               if (addNotRemove) {
+                       preventIdleSleepList->setObject(service);
+                       DLOG("prevent idle sleep list: %s+ (%u)\n",
+                           service->getName(), preventIdleSleepList->getCount());
+               } else if (preventIdleSleepList->member(service)) {
+                       preventIdleSleepList->removeObject(service);
+                       DLOG("prevent idle sleep list: %s- (%u)\n",
+                           service->getName(), preventIdleSleepList->getCount());
+               }
        }
-       newCount = preventIdleSleepList->getCount();
+       newCount = idleSleepPreventersCount();
 
        if ((oldCount == 0) && (newCount != 0)) {
                // Driver added to empty prevent list.
                // Update the driver desire to prevent idle sleep.
                // Driver desire does not prevent demand sleep.
 
-               changePowerStateTo(ON_STATE);
+               changePowerStateTo(getRUN_STATE());
        } else if ((oldCount != 0) && (newCount == 0)) {
                // Last driver removed from prevent list.
                // Drop the driver clamp to allow idle sleep.
@@ -2751,6 +3173,68 @@ IOPMrootDomain::copySleepPreventersList(OSArray **idleSleepList, OSArray **syste
        }
 }
 
+void
+IOPMrootDomain::copySleepPreventersListWithID(OSArray **idleSleepList, OSArray **systemSleepList)
+{
+       OSCollectionIterator *iterator = NULL;
+       OSObject    *object = NULL;
+       OSArray     *array = NULL;
+
+       if (!gIOPMWorkLoop->inGate()) {
+               gIOPMWorkLoop->runAction(
+                       OSMemberFunctionCast(IOWorkLoop::Action, this,
+                       &IOPMrootDomain::copySleepPreventersListWithID),
+                       this, (void *)idleSleepList, (void *)systemSleepList);
+               return;
+       }
+
+       if (idleSleepList && preventIdleSleepList && (preventIdleSleepList->getCount() != 0)) {
+               iterator = OSCollectionIterator::withCollection(preventIdleSleepList);
+               array = OSArray::withCapacity(5);
+
+               while ((object = iterator->getNextObject())) {
+                       IOService *service = OSDynamicCast(IOService, object);
+                       if (service) {
+                               OSDictionary *dict = OSDictionary::withCapacity(2);
+                               if (dict) {
+                                       OSNumber *id = OSNumber::withNumber(service->getRegistryEntryID(), 64);
+                                       dict->setObject(kIOPMDriverAssertionRegistryEntryIDKey, id);
+                                       dict->setObject(kIOPMDriverAssertionOwnerStringKey, OSSymbol::withCString(service->getName()));
+                                       array->setObject(dict);
+                                       id->release();
+                                       dict->release();
+                               }
+                       }
+               }
+
+               iterator->release();
+               *idleSleepList = array;
+       }
+
+       if (systemSleepList && preventSystemSleepList && (preventSystemSleepList->getCount() != 0)) {
+               iterator = OSCollectionIterator::withCollection(preventSystemSleepList);
+               array = OSArray::withCapacity(5);
+
+               while ((object = iterator->getNextObject())) {
+                       IOService *service = OSDynamicCast(IOService, object);
+                       if (service) {
+                               OSDictionary *dict = OSDictionary::withCapacity(2);
+                               if (dict) {
+                                       OSNumber *id = OSNumber::withNumber(service->getRegistryEntryID(), 64);
+                                       dict->setObject(kIOPMDriverAssertionRegistryEntryIDKey, id);
+                                       dict->setObject(kIOPMDriverAssertionOwnerStringKey, OSSymbol::withCString(service->getName()));
+                                       array->setObject(dict);
+                                       id->release();
+                                       dict->release();
+                               }
+                       }
+               }
+
+               iterator->release();
+               *systemSleepList = array;
+       }
+}
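A hypothetical in-kernel consumer of the new copySleepPreventersListWithID()
(local names are illustrative):

    OSArray *idleList = NULL;
    OSArray *systemList = NULL;

    gRootDomain->copySleepPreventersListWithID(&idleList, &systemList);
    if (idleList) {
            // Each element is a dictionary carrying the preventer's
            // registry entry ID and owner name.
            IOLog("%u idle sleep preventers\n", idleList->getCount());
            idleList->release();
    }
    if (systemList) {
            systemList->release();
    }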
+
 //******************************************************************************
 // tellChangeDown
 //
@@ -2760,8 +3244,8 @@ IOPMrootDomain::copySleepPreventersList(OSArray **idleSleepList, OSArray **syste
 bool
 IOPMrootDomain::tellChangeDown( unsigned long stateNum )
 {
-       DLOG("tellChangeDown %u->%u\n",
-           (uint32_t) getPowerState(), (uint32_t) stateNum);
+       DLOG("tellChangeDown %s->%s\n",
+           getPowerStateString((uint32_t) getPowerState()), getPowerStateString((uint32_t) stateNum));
 
        if (SLEEP_STATE == stateNum) {
                // Legacy apps were already told in the full->dark transition
@@ -2804,8 +3288,8 @@ IOPMrootDomain::tellChangeDown( unsigned long stateNum )
 bool
 IOPMrootDomain::askChangeDown( unsigned long stateNum )
 {
-       DLOG("askChangeDown %u->%u\n",
-           (uint32_t) getPowerState(), (uint32_t) stateNum);
+       DLOG("askChangeDown %s->%s\n",
+           getPowerStateString((uint32_t) getPowerState()), getPowerStateString((uint32_t) stateNum));
 
        // Don't log for dark wake entry
        if (kSystemTransitionSleep == _systemTransitionType) {
@@ -2860,6 +3344,16 @@ IOPMrootDomain::askChangeDownDone(
                        *cancel = true;
                        DLOG("cancel dark->sleep\n");
                }
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+               if (_aotMode && (kPMCalendarTypeInvalid != _aotWakeTimeCalendar.selector)) {
+                       uint64_t now = mach_continuous_time();
+                       if (((now + _aotWakePreWindow) >= _aotWakeTimeContinuous)
+                           && (now < (_aotWakeTimeContinuous + _aotWakePostWindow))) {
+                               *cancel = true;
+                               IOLog("AOT wake window cancel: %qd, %qd\n", now, _aotWakeTimeContinuous);
+                       }
+               }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
        }
 }
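Note: with the window values programmed in willNotifyPowerChildren() below
(_aotWakePreWindow = 2000 ms, _aotWakePostWindow = 1100 ms), this veto fires
exactly when now + 2.0 s >= _aotWakeTimeContinuous and
now < _aotWakeTimeContinuous + 1.1 s; that is, a dark-to-sleep attempt inside
the window from 2 s before the scheduled AOT wake until 1.1 s after it is
cancelled.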
 
@@ -2930,8 +3424,8 @@ IOPMrootDomain::systemDidNotSleep( void )
 void
 IOPMrootDomain::tellNoChangeDown( unsigned long stateNum )
 {
-       DLOG("tellNoChangeDown %u->%u\n",
-           (uint32_t) getPowerState(), (uint32_t) stateNum);
+       DLOG("tellNoChangeDown %s->%s\n",
+           getPowerStateString((uint32_t) getPowerState()), getPowerStateString((uint32_t) stateNum));
 
        // Sleep canceled, clear the sleep trace point.
        tracePoint(kIOPMTracePointSystemUp);
@@ -2952,8 +3446,8 @@ IOPMrootDomain::tellNoChangeDown( unsigned long stateNum )
 void
 IOPMrootDomain::tellChangeUp( unsigned long stateNum )
 {
-       DLOG("tellChangeUp %u->%u\n",
-           (uint32_t) getPowerState(), (uint32_t) stateNum);
+       DLOG("tellChangeUp %s->%s\n",
+           getPowerStateString((uint32_t) getPowerState()), getPowerStateString((uint32_t) stateNum));
 
        ignoreTellChangeDown = false;
 
@@ -2969,7 +3463,21 @@ IOPMrootDomain::tellChangeUp( unsigned long stateNum )
                        NULL, NULL, NULL);
 
                if (getPowerState() == ON_STATE) {
-                       // this is a quick wake from aborted sleep
+                       // Sleep was cancelled by idle cancel or revert
+                       if (!CAP_CURRENT(kIOPMSystemCapabilityGraphics)) {
+                               // rdar://problem/50363791
+                               // If system is in dark wake and sleep is cancelled, do not
+                               // send SystemWillPowerOn/HasPoweredOn messages to kernel
+                               // priority clients. They haven't yet seen a SystemWillSleep
+                               // message before the cancellation. So make sure the kernel
+                               // client bit is cleared in _systemMessageClientMask before
+                               // invoking the tellClients() below. This bit may have been
+                               // set by handleOurPowerChangeStart() anticipating a successful
+                               // sleep and setting the filter mask ahead of time allows the
+                               // SystemWillSleep message to go through.
+                               _systemMessageClientMask &= ~kSystemMessageClientKernel;
+                       }
+
                        systemDidNotSleep();
                        tellClients( kIOMessageSystemWillPowerOn );
                }
@@ -3011,10 +3519,29 @@ IOPMrootDomain::sysPowerDownHandler(
        UInt32 messageType, IOService * service,
        void * messageArgs, vm_size_t argSize )
 {
+       static UInt32 lastSystemMessageType = 0;
        IOReturn    ret = 0;
 
        DLOG("sysPowerDownHandler message %s\n", getIOMessageString(messageType));
 
+       // rdar://problem/50363791
+       // Sanity check to make sure the SystemWill/Has message types are
+       // received in the expected order for all kernel priority clients.
+       if (messageType == kIOMessageSystemWillSleep ||
+           messageType == kIOMessageSystemWillPowerOn ||
+           messageType == kIOMessageSystemHasPoweredOn) {
+               switch (messageType) {
+               case kIOMessageSystemWillPowerOn:
+                       assert(lastSystemMessageType == kIOMessageSystemWillSleep);
+                       break;
+               case kIOMessageSystemHasPoweredOn:
+                       assert(lastSystemMessageType == kIOMessageSystemWillPowerOn);
+                       break;
+               }
+
+               lastSystemMessageType = messageType;
+       }
+
        if (!gRootDomain) {
                return kIOReturnUnsupported;
        }
@@ -3235,9 +3762,9 @@ IOPMrootDomain::initializeBootSessionUUID(void)
 IOReturn
 IOPMrootDomain::changePowerStateTo( unsigned long ordinal )
 {
-       DLOG("changePowerStateTo(%lu)\n", ordinal);
+       DLOG("changePowerStateTo(%u)\n", (uint32_t) ordinal);
 
-       if ((ordinal != ON_STATE) && (ordinal != SLEEP_STATE)) {
+       if ((ordinal != ON_STATE) && (ordinal != AOT_STATE) && (ordinal != SLEEP_STATE)) {
                return kIOReturnUnsupported;
        }
 
@@ -3247,9 +3774,9 @@ IOPMrootDomain::changePowerStateTo( unsigned long ordinal )
 IOReturn
 IOPMrootDomain::changePowerStateToPriv( unsigned long ordinal )
 {
-       DLOG("changePowerStateToPriv(%lu)\n", ordinal);
+       DLOG("changePowerStateToPriv(%u)\n", (uint32_t) ordinal);
 
-       if ((ordinal != ON_STATE) && (ordinal != SLEEP_STATE)) {
+       if ((ordinal != ON_STATE) && (ordinal != AOT_STATE) && (ordinal != SLEEP_STATE)) {
                return kIOReturnUnsupported;
        }
 
@@ -3309,7 +3836,7 @@ IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState )
                if (!tasksSuspended) {
                        AbsoluteTime deadline;
                        tasksSuspended = TRUE;
-                       tasks_system_suspend(tasksSuspended);
+                       updateTasksSuspend();
 
                        clock_interval_to_deadline(10, kSecondScale, &deadline);
 #if !CONFIG_EMBEDDED
@@ -3317,6 +3844,44 @@ IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState )
 #endif /* !CONFIG_EMBEDDED */
                }
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+               _aotReadyToFullWake = false;
+#if 0
+               if (_aotLingerTime) {
+                       uint64_t deadline;
+                       IOLog("aot linger no return\n");
+                       clock_absolutetime_interval_to_deadline(_aotLingerTime, &deadline);
+                       clock_delay_until(deadline);
+               }
+#endif
+               if (!_aotMode) {
+                       _aotTestTime = 0;
+                       _aotWakeTimeCalendar.selector = kPMCalendarTypeInvalid;
+                       if (_aotMetrics) {
+                               bzero(_aotMetrics, sizeof(IOPMAOTMetrics));
+                       }
+               } else if (!_aotNow && !_debugWakeSeconds) {
+                       _aotNow            = true;
+                       _aotExit           = false;
+                       _aotPendingFlags   = 0;
+                       _aotTasksSuspended = true;
+                       _aotLastWakeTime   = 0;
+                       bzero(_aotMetrics, sizeof(IOPMAOTMetrics));
+                       if (kIOPMAOTModeCycle & _aotMode) {
+                               clock_interval_to_absolutetime_interval(60, kSecondScale, &_aotTestInterval);
+                               _aotTestTime = mach_continuous_time() + _aotTestInterval;
+                               setWakeTime(_aotTestTime);
+                       }
+                       uint32_t lingerSecs;
+                       if (!PE_parse_boot_argn("aotlinger", &lingerSecs, sizeof(lingerSecs))) {
+                               lingerSecs = 0;
+                       }
+                       clock_interval_to_absolutetime_interval(lingerSecs, kSecondScale, &_aotLingerTime);
+                       clock_interval_to_absolutetime_interval(2000, kMillisecondScale, &_aotWakePreWindow);
+                       clock_interval_to_absolutetime_interval(1100, kMillisecondScale, &_aotWakePostWindow);
+               }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
 #if HIBERNATION
                IOHibernateSystemSleep();
                IOHibernateIOKitSleep();
@@ -3362,6 +3927,21 @@ IOPMrootDomain::shouldSleepOnClamshellClosed( void )
        return !clamshellDisabled && !(desktopMode && acAdaptorConnected) && !clamshellSleepDisabled;
 }
 
+bool
+IOPMrootDomain::shouldSleepOnRTCAlarmWake( void )
+{
+       // Called once on every RTC/alarm wake. The device should go back to sleep
+       // if the clamshell is closed and it is running on battery power.
+       if (!clamshellExists) {
+               return false;
+       }
+
+       DLOG("shouldSleepOnRTCAlarmWake: clamshell closed %d, disabled %d, desktopMode %d, ac %d sleepDisabled %d\n",
+           clamshellClosed, clamshellDisabled, desktopMode, acAdaptorConnected, clamshellSleepDisabled);
+
+       return !acAdaptorConnected && !clamshellSleepDisabled;
+}
+
 void
 IOPMrootDomain::sendClientClamshellNotification( void )
 {
@@ -3544,7 +4124,7 @@ IOPMrootDomain::publishFeature(
                        existing_feature_arr->setObject(new_feature_data);
                        features->setObject(feature, existing_feature_arr);
                        existing_feature_arr->release();
-                       existing_feature_arr = 0;
+                       existing_feature_arr = NULL;
                }
        } else {
                // The easy case: no previously existing features listed. We simply
@@ -3733,8 +4313,8 @@ IOPMrootDomain::setPMSetting(
        const OSSymbol  *type,
        OSObject        *object )
 {
-       PMSettingCallEntry  *entries = 0;
-       OSArray             *chosen  = 0;
+       PMSettingCallEntry  *entries = NULL;
+       OSArray             *chosen  = NULL;
        const OSArray       *array;
        PMSettingObject     *pmso;
        thread_t            thisThread;
@@ -3961,10 +4541,10 @@ IOPMrootDomain::deregisterPMSettingObject( PMSettingObject * pmso )
                        }
                }
                if (wait) {
-                       assert(0 == pmso->waitThread);
+                       assert(NULL == pmso->waitThread);
                        pmso->waitThread = thisThread;
                        PMSETTING_WAIT(pmso);
-                       pmso->waitThread = 0;
+                       pmso->waitThread = NULL;
                }
        } while (wait);
 
@@ -4113,9 +4693,10 @@ IOPMrootDomain::evaluateSystemSleepPolicy(
        IOPMSystemSleepParameters * params, int sleepPhase, uint32_t * hibMode )
 {
        const IOPMSystemSleepPolicyTable * pt;
-       OSObject *  prop = 0;
+       OSObject *  prop = NULL;
        OSData *    policyData;
        uint64_t    currentFactors = 0;
+       char        currentFactorsBuf[512];
        uint32_t    standbyDelay   = 0;
        uint32_t    powerOffDelay  = 0;
        uint32_t    powerOffTimer  = 0;
@@ -4149,6 +4730,7 @@ IOPMrootDomain::evaluateSystemSleepPolicy(
            sleepPhase, standbyEnabled, standbyDelay, standbyTimer,
            powerOffEnabled, powerOffDelay, powerOffTimer, *hibMode);
 
+       currentFactorsBuf[0] = 0;
        // pmset level overrides
        if ((*hibMode & kIOHibernateModeOn) == 0) {
                if (!gSleepPolicyHandler) {
@@ -4161,86 +4743,109 @@ IOPMrootDomain::evaluateSystemSleepPolicy(
                // If poweroff is enabled, force poweroff.
                if (standbyEnabled) {
                        currentFactors |= kIOPMSleepFactorStandbyForced;
+                       strlcat(currentFactorsBuf, ", StandbyForced", sizeof(currentFactorsBuf));
                } else if (powerOffEnabled) {
                        currentFactors |= kIOPMSleepFactorAutoPowerOffForced;
+                       strlcat(currentFactorsBuf, ", AutoPowerOffForced", sizeof(currentFactorsBuf));
                } else {
                        currentFactors |= kIOPMSleepFactorHibernateForced;
+                       strlcat(currentFactorsBuf, ", HibernateForced", sizeof(currentFactorsBuf));
                }
        }
 
        // Current factors based on environment and assertions
        if (sleepTimerMaintenance) {
                currentFactors |= kIOPMSleepFactorSleepTimerWake;
+               strlcat(currentFactorsBuf, ", SleepTimerWake", sizeof(currentFactorsBuf));
        }
        if (standbyEnabled && sleepToStandby && !gSleepPolicyHandler) {
                currentFactors |= kIOPMSleepFactorSleepTimerWake;
+               strlcat(currentFactorsBuf, ", SleepTimerWake", sizeof(currentFactorsBuf));
        }
        if (!clamshellClosed) {
                currentFactors |= kIOPMSleepFactorLidOpen;
+               strlcat(currentFactorsBuf, ", LidOpen", sizeof(currentFactorsBuf));
        }
        if (acAdaptorConnected) {
                currentFactors |= kIOPMSleepFactorACPower;
+               strlcat(currentFactorsBuf, ", ACPower", sizeof(currentFactorsBuf));
        }
        if (lowBatteryCondition) {
                currentFactors |= kIOPMSleepFactorBatteryLow;
+               strlcat(currentFactorsBuf, ", BatteryLow", sizeof(currentFactorsBuf));
        }
        if (!standbyDelay || !standbyTimer) {
                currentFactors |= kIOPMSleepFactorStandbyNoDelay;
+               strlcat(currentFactorsBuf, ", StandbyNoDelay", sizeof(currentFactorsBuf));
        }
        if (standbyNixed || !standbyEnabled) {
                currentFactors |= kIOPMSleepFactorStandbyDisabled;
+               strlcat(currentFactorsBuf, ", StandbyDisabled", sizeof(currentFactorsBuf));
        }
        if (resetTimers) {
                currentFactors |= kIOPMSleepFactorLocalUserActivity;
                currentFactors &= ~kIOPMSleepFactorSleepTimerWake;
+               strlcat(currentFactorsBuf, ", LocalUserActivity, !SleepTimerWake", sizeof(currentFactorsBuf));
        }
        if (getPMAssertionLevel(kIOPMDriverAssertionUSBExternalDeviceBit) !=
            kIOPMDriverAssertionLevelOff) {
                currentFactors |= kIOPMSleepFactorUSBExternalDevice;
+               strlcat(currentFactorsBuf, ", USBExternalDevice", sizeof(currentFactorsBuf));
        }
        if (getPMAssertionLevel(kIOPMDriverAssertionBluetoothHIDDevicePairedBit) !=
            kIOPMDriverAssertionLevelOff) {
                currentFactors |= kIOPMSleepFactorBluetoothHIDDevice;
+               strlcat(currentFactorsBuf, ", BluetoothHIDDevice", sizeof(currentFactorsBuf));
        }
        if (getPMAssertionLevel(kIOPMDriverAssertionExternalMediaMountedBit) !=
            kIOPMDriverAssertionLevelOff) {
                currentFactors |= kIOPMSleepFactorExternalMediaMounted;
+               strlcat(currentFactorsBuf, ", ExternalMediaMounted", sizeof(currentFactorsBuf));
        }
        if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit5) !=
            kIOPMDriverAssertionLevelOff) {
                currentFactors |= kIOPMSleepFactorThunderboltDevice;
+               strlcat(currentFactorsBuf, ", ThunderboltDevice", sizeof(currentFactorsBuf));
        }
        if (_scheduledAlarms != 0) {
                currentFactors |= kIOPMSleepFactorRTCAlarmScheduled;
+               strlcat(currentFactorsBuf, ", RTCAlarmScheduled", sizeof(currentFactorsBuf));
        }
        if (getPMAssertionLevel(kIOPMDriverAssertionMagicPacketWakeEnabledBit) !=
            kIOPMDriverAssertionLevelOff) {
                currentFactors |= kIOPMSleepFactorMagicPacketWakeEnabled;
+               strlcat(currentFactorsBuf, ", MagicPacketWakeEnabled", sizeof(currentFactorsBuf));
        }
 #define TCPKEEPALIVE 1
 #if TCPKEEPALIVE
        if (getPMAssertionLevel(kIOPMDriverAssertionNetworkKeepAliveActiveBit) !=
            kIOPMDriverAssertionLevelOff) {
                currentFactors |= kIOPMSleepFactorNetworkKeepAliveActive;
+               strlcat(currentFactorsBuf, ", NetworkKeepAliveActive", sizeof(currentFactorsBuf));
        }
 #endif
        if (!powerOffEnabled) {
                currentFactors |= kIOPMSleepFactorAutoPowerOffDisabled;
+               strlcat(currentFactorsBuf, ", AutoPowerOffDisabled", sizeof(currentFactorsBuf));
        }
        if (desktopMode) {
                currentFactors |= kIOPMSleepFactorExternalDisplay;
+               strlcat(currentFactorsBuf, ", ExternalDisplay", sizeof(currentFactorsBuf));
        }
        if (userWasActive) {
                currentFactors |= kIOPMSleepFactorLocalUserActivity;
+               strlcat(currentFactorsBuf, ", LocalUserActivity", sizeof(currentFactorsBuf));
        }
        if (darkWakeHibernateError && !CAP_HIGHEST(kIOPMSystemCapabilityGraphics)) {
                currentFactors |= kIOPMSleepFactorHibernateFailed;
+               strlcat(currentFactorsBuf, ", HibernateFailed", sizeof(currentFactorsBuf));
        }
        if (thermalWarningState) {
                currentFactors |= kIOPMSleepFactorThermalWarning;
+               strlcat(currentFactorsBuf, ", ThermalWarning", sizeof(currentFactorsBuf));
        }
 
-       DLOG("sleep factors 0x%llx\n", currentFactors);
+       DLOG("sleep factors 0x%llx %s\n", currentFactors, currentFactorsBuf);
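The repeated append-a-name pattern above could be wrapped in a small local
helper; a sketch (hypothetical, not part of the commit):

    #define ADD_SLEEP_FACTOR(name) \
            strlcat(currentFactorsBuf, ", " name, sizeof(currentFactorsBuf))

    // e.g.
    if (lowBatteryCondition) {
            currentFactors |= kIOPMSleepFactorBatteryLow;
            ADD_SLEEP_FACTOR("BatteryLow");
    }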
 
        if (gSleepPolicyHandler) {
                uint32_t    savedHibernateMode;
@@ -4524,7 +5129,7 @@ IOPMrootDomain::getSleepOption( const char * key, uint32_t * option )
 {
        OSObject *      optionsProp;
        OSDictionary *  optionsDict;
-       OSObject *      obj = 0;
+       OSObject *      obj = NULL;
        OSNumber *      num;
        bool            ok = false;
 
@@ -4677,7 +5282,7 @@ platformHaltRestartApplier( OSObject * object, void * context )
                halt_log_enter("PowerOff/Restart message to priority client", (const void *) notifier->handler, elapsedTime);
        }
 
-       ctx->handler = 0;
+       ctx->handler = NULL;
        ctx->Counter++;
 }
 
@@ -4751,7 +5356,13 @@ IOPMrootDomain::handlePlatformHaltRestart( UInt32 pe_type )
        }
 
        gHaltRestartCtx.phase = kNotifyHaltRestartAction;
+#if !CONFIG_EMBEDDED
        IOCPURunPlatformHaltRestartActions(pe_type);
+#else
+       if (kPEPagingOff != pe_type) {
+               IOCPURunPlatformHaltRestartActions(pe_type);
+       }
+#endif
 
        // Wait for PM to quiesce
        if ((kPEPagingOff != pe_type) && gPMHaltLock) {
@@ -4876,7 +5487,7 @@ IOPMrootDomain::tagPowerPlaneService(
        }
 
 #if !NO_KERNEL_HID
-       isDisplayWrangler = (0 != service->metaCast("IODisplayWrangler"));
+       isDisplayWrangler = (NULL != service->metaCast("IODisplayWrangler"));
        if (isDisplayWrangler) {
                wrangler = service;
                // found the display wrangler, check for any display assertions already created
@@ -5006,6 +5617,14 @@ IOPMrootDomain::overrideOurPowerChange(
        uint32_t changeFlags = *inOutChangeFlags;
        uint32_t currentPowerState = (uint32_t) getPowerState();
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       if ((AOT_STATE == powerState) && (ON_STATE == currentPowerState)) {
+               // Assertion may have been taken in AOT leading to changePowerStateTo(AOT)
+               *inOutChangeFlags |= kIOPMNotDone;
+               return;
+       }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
        if (changeFlags & kIOPMParentInitiated) {
                // Root parent is permanently pegged at max power,
                // a parent initiated power change is unexpected.
@@ -5028,11 +5647,11 @@ IOPMrootDomain::overrideOurPowerChange(
                                kIOPMSystemCapabilityAudio);
 
                        // Convert to capability change (ON->ON)
-                       *inOutPowerState = ON_STATE;
+                       *inOutPowerState = getRUN_STATE();
                        *inOutChangeFlags |= kIOPMSynchronize;
 
                        // Revert device desire from SLEEP to ON
-                       changePowerStateToPriv(ON_STATE);
+                       changePowerStateToPriv(getRUN_STATE());
                } else {
                        // System is in dark wake, ok to drop power state.
                        // Broadcast root powering down to entire tree.
@@ -5193,6 +5812,9 @@ IOPMrootDomain::handleOurPowerChangeStart(
                        _systemMessageClientMask &= ~kSystemMessageClientLegacyApp;
                }
                if ((_highestCapability & kIOPMSystemCapabilityGraphics) == 0) {
+                       // Kernel priority clients are only notified on the initial
+                       // transition to full wake, so don't notify them unless the
+                       // system has gained graphics capability since the last wake.
                        _systemMessageClientMask &= ~kSystemMessageClientKernel;
                }
 #if HIBERNATION
@@ -5214,6 +5836,10 @@ IOPMrootDomain::handleOurPowerChangeStart(
                tracePoint( kIOPMTracePointWakeWillPowerOnClients );
                // Clear stats about sleep
 
+               if (AOT_STATE == powerState) {
+                       _pendingCapability = 0;
+               }
+
                if (_pendingCapability & kIOPMSystemCapabilityGraphics) {
                        willEnterFullWake();
                } else {
@@ -5255,13 +5881,23 @@ IOPMrootDomain::handleOurPowerChangeStart(
                _systemStateGeneration++;
                systemDarkWake = false;
 
-               DLOG("=== START (%u->%u, 0x%x) type %u, gen %u, msg %x, "
+               DLOG("=== START (%s->%s, 0x%x) type %u, gen %u, msg %x, "
                    "dcp %x:%x:%x\n",
-                   currentPowerState, (uint32_t) powerState, *inOutChangeFlags,
+                   getPowerStateString(currentPowerState), getPowerStateString((uint32_t) powerState), *inOutChangeFlags,
                    _systemTransitionType, _systemStateGeneration,
                    _systemMessageClientMask,
                    _desiredCapability, _currentCapability, _pendingCapability);
        }
+
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       if ((AOT_STATE == powerState) && (SLEEP_STATE != currentPowerState)) {
+               panic("illegal AOT entry from %s", getPowerStateString(currentPowerState));
+       }
+       if (_aotNow && (ON_STATE == powerState)) {
+               aotShouldExit(false, true);
+               aotExit(false);
+       }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
 }
 
 void
@@ -5304,7 +5940,7 @@ IOPMrootDomain::handleOurPowerChangeDone(
                        }
 
                        // Revert device desire to max.
-                       changePowerStateToPriv(ON_STATE);
+                       changePowerStateToPriv(getRUN_STATE());
                } else {
                        // Send message on dark wake to full wake promotion.
                        // tellChangeUp() handles the normal SLEEP->ON case.
@@ -5363,9 +5999,9 @@ IOPMrootDomain::handleOurPowerChangeDone(
                        }
                }
 
-               DLOG("=== FINISH (%u->%u, 0x%x) type %u, gen %u, msg %x, "
+               DLOG("=== FINISH (%s->%s, 0x%x) type %u, gen %u, msg %x, "
                    "dcp %x:%x:%x, dbgtimer %u\n",
-                   currentPowerState, (uint32_t) powerState, changeFlags,
+                   getPowerStateString(currentPowerState), getPowerStateString((uint32_t) powerState), changeFlags,
                    _systemTransitionType, _systemStateGeneration,
                    _systemMessageClientMask,
                    _desiredCapability, _currentCapability, _pendingCapability,
@@ -5432,6 +6068,11 @@ IOPMrootDomain::handleOurPowerChangeDone(
                        DLOG("DisplayOn fullwake request is removed\n");
                        handleDisplayPowerOn();
                }
+
+               if (isRTCAlarmWake) {
+                       pmPowerStateQueue->submitPowerEvent(
+                               kPowerEventReceivedPowerNotification, (void *)(uintptr_t) kLocalEvalClamshellCommand );
+               }
        }
 }
 
@@ -5723,7 +6364,7 @@ IOPMrootDomain::handlePowerChangeDoneForPCIDevice(
 class IOPMServiceInterestNotifier : public _IOServiceInterestNotifier
 {
        friend class IOPMrootDomain;
-       OSDeclareDefaultStructors(IOPMServiceInterestNotifier)
+       OSDeclareDefaultStructors(IOPMServiceInterestNotifier);
 
 protected:
        uint32_t    ackTimeoutCnt;
@@ -5741,7 +6382,7 @@ IONotifier * IOPMrootDomain::registerInterest(
        IOServiceInterestHandler handler,
        void * target, void * ref )
 {
-       IOPMServiceInterestNotifier *notifier = 0;
+       IOPMServiceInterestNotifier *notifier = NULL;
        bool            isSystemCapabilityClient;
        bool            isKernelCapabilityClient;
        IOReturn        rc = kIOReturnError;
@@ -5768,7 +6409,7 @@ IONotifier * IOPMrootDomain::registerInterest(
        }
        if (rc != kIOReturnSuccess) {
                notifier->release();
-               notifier = 0;
+               notifier = NULL;
 
                return NULL;
        }
@@ -5835,6 +6476,7 @@ IOPMrootDomain::systemMessageFilter(
        IOPMServiceInterestNotifier *notifier;
 
        notifier = OSDynamicCast(IOPMServiceInterestNotifier, (OSObject *)object);
+
        do {
                if ((kSystemTransitionNewCapClient == _systemTransitionType) &&
                    (!isCapMsg || !_joinedCapabilityClients ||
@@ -5956,7 +6598,7 @@ IOPMrootDomain::systemMessageFilter(
                        DLOG("destroyed capability client set %p\n",
                            OBFUSCATE(_joinedCapabilityClients));
                        _joinedCapabilityClients->release();
-                       _joinedCapabilityClients = 0;
+                       _joinedCapabilityClients = NULL;
                }
        }
        if (notifier) {
@@ -6090,7 +6732,7 @@ IOPMrootDomain::displayWranglerMatchPublished(
 #if !NO_KERNEL_HID
        // install a handler
        if (!newService->registerInterest( gIOGeneralInterest,
-           &displayWranglerNotification, target, 0)) {
+           &displayWranglerNotification, target, NULL)) {
                return false;
        }
 #endif
@@ -6170,7 +6812,7 @@ void
 IOPMrootDomain::setDisplayPowerOn( uint32_t options )
 {
        pmPowerStateQueue->submitPowerEvent( kPowerEventSetDisplayPowerOn,
-           (void *) 0, options );
+           (void *) NULL, options );
 }
 
 // MARK: -
@@ -6211,18 +6853,18 @@ bool
 IOPMrootDomain::checkSystemSleepAllowed( IOOptionBits options,
     uint32_t     sleepReason )
 {
-       int err = 0;
+       uint32_t err = 0;
 
        // Conditions that prevent idle and demand system sleep.
 
        do {
                if (userDisabledAllSleep) {
-                       err = 1; // 1. user-space sleep kill switch
+                       err = kPMUserDisabledAllSleep; // 1. user-space sleep kill switch
                        break;
                }
 
                if (systemBooting || systemShutdown || gWillShutdown) {
-                       err = 2; // 2. restart or shutdown in progress
+                       err = kPMSystemRestartBootingInProgress; // 2. restart or shutdown in progress
                        break;
                }
 
@@ -6235,7 +6877,7 @@ IOPMrootDomain::checkSystemSleepAllowed( IOOptionBits options,
                // dark wake, and must be called from gated context.
 
 #if !CONFIG_SLEEP
-               err = 3;    // 3. config does not support sleep
+               err = kPMConfigPreventSystemSleep;    // 3. config does not support sleep
                break;
 #endif
 
@@ -6248,19 +6890,19 @@ IOPMrootDomain::checkSystemSleepAllowed( IOOptionBits options,
                }
 
                if (preventSystemSleepList->getCount() != 0) {
-                       err = 4; // 4. child prevent system sleep clamp
+                       err = kPMChildPreventSystemSleep; // 4. child prevent system sleep clamp
                        break;
                }
 
                if (getPMAssertionLevel( kIOPMDriverAssertionCPUBit ) ==
                    kIOPMDriverAssertionLevelOn) {
-                       err = 5; // 5. CPU assertion
+                       err = kPMCPUAssertion; // 5. CPU assertion
                        break;
                }
 
                if (pciCantSleepValid) {
                        if (pciCantSleepFlag) {
-                               err = 6; // 6. PCI card does not support PM (cached)
+                               err = kPMPCIUnsupported; // 6. PCI card does not support PM (cached)
                        }
                        break;
                } else if (sleepSupportedPEFunction &&
@@ -6282,7 +6924,7 @@ IOPMrootDomain::checkSystemSleepAllowed( IOOptionBits options,
        }while (false);
 
        if (err) {
-               DLOG("System sleep prevented by %d\n", err);
+               DLOG("System sleep prevented by %s\n", getSystemSleepPreventerString(err));
                return false;
        }
        return true;
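The numeric codes above now map to named preventer constants, and getSystemSleepPreventerString is introduced elsewhere in this commit. A minimal sketch of the code-to-string mapping it implies; the constant values and strings below are assumptions mirroring the comments above, not the commit's actual definitions:

    // Hypothetical sketch of the preventer-code mapping.
    enum {
        kPMUserDisabledAllSleep = 1,
        kPMSystemRestartBootingInProgress,
        kPMConfigPreventSystemSleep,
        kPMChildPreventSystemSleep,
        kPMCPUAssertion,
        kPMPCIUnsupported,
    };
    static const char *
    getSystemSleepPreventerString(uint32_t code)
    {
        switch (code) {
        case kPMUserDisabledAllSleep:           return "user disabled all sleep";
        case kPMSystemRestartBootingInProgress: return "restart or shutdown in progress";
        case kPMConfigPreventSystemSleep:       return "config does not support sleep";
        case kPMChildPreventSystemSleep:        return "child prevents system sleep";
        case kPMCPUAssertion:                   return "CPU assertion";
        case kPMPCIUnsupported:                 return "PCI card does not support PM";
        default:                                return "unknown";
        }
    }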
@@ -6344,6 +6986,295 @@ IOPMrootDomain::mustHibernate( void )
 
 #endif /* HIBERNATION */
 
+//******************************************************************************
+// AOT
+//******************************************************************************
+
+// Tables for accumulated days in year by month, latter used for leap years
+
+static const int daysbymonth[] =
+{ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 };
+
+static const int lydaysbymonth[] =
+{ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 };
+
+static int __unused
+IOPMConvertSecondsToCalendar(long secs, IOPMCalendarStruct * dt)
+{
+       const int *             dbm = daysbymonth;
+       long                    n, x, y, z;
+
+       if (secs < 0) {
+               return 0;
+       }
+
+       // Calculate seconds, minutes and hours
+
+       n = secs % (24 * 3600);
+       dt->second = n % 60;
+       n /= 60;
+       dt->minute = n % 60;
+       dt->hour = n / 60;
+
+       // Calculate whole days since the epoch (day-of-week calculation unused)
+
+       n = secs / (24 * 3600);
+//     dt->dayWeek = (n + 4) % 7;
+
+       // Calculate year
+       // Rebase from days since the Unix epoch (1/1/1970), stored in 'n',
+       // to days since 1/1/1968 to start on a 4-year cycle, beginning
+       // on a leap year.
+
+       n += (366 + 365);
+
+       // Every 4 year cycle will be exactly (366 + 365 * 3) = 1461 days.
+       // Valid before 2100, since 2100 is not a leap year.
+
+       x = n / 1461;       // number of 4 year cycles
+       y = n % 1461;       // days into current 4 year cycle
+       z = 1968 + (4 * x);
+
+       // Add in years in the current 4 year cycle
+
+       if (y >= 366) {
+               y -= 366;   // days after the leap year
+               n = y % 365; // days into the current year
+               z += (1 + y / 365); // years after the past 4-yr cycle
+       } else {
+               n = y;
+               dbm = lydaysbymonth;
+       }
+       if (z > 2099) {
+               return 0;
+       }
+
+       dt->year = z;
+
+       // Adjust remaining days value to start at 1
+
+       n += 1;
+
+       // Calculate month
+
+       for (x = 1; n > dbm[x]; x++) {
+               continue;
+       }
+       dt->month = x;
+
+       // Calculate day of month
+
+       dt->day = n - dbm[x - 1];
+
+       return 1;
+}
+
+static long
+IOPMConvertCalendarToSeconds(const IOPMCalendarStruct * dt)
+{
+       const int *             dbm = daysbymonth;
+       long                    y, secs, days;
+
+       if (dt->year < 1970) {
+               return 0;
+       }
+
+       // Seconds elapsed in the current day
+
+       secs = dt->second + 60 * dt->minute + 3600 * dt->hour;
+
+       // Number of days from 1/1/70 to beginning of current year
+       // Account for extra day every 4 years starting at 1973
+
+       y = dt->year - 1970;
+       days = (y * 365) + ((y + 1) / 4);
+
+       // Change table if current year is a leap year
+
+       if ((dt->year % 4) == 0) {
+               dbm = lydaysbymonth;
+       }
+
+       // Add in days elapsed in the current year
+
+       days += (dt->day - 1) + dbm[dt->month - 1];
+
+       // Add accumulated days to accumulated seconds
+
+       secs += 24 * 3600 * days;
+
+       return secs;
+}
+
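A quick sanity check of the two conversions above, written as a hypothetical user-space harness: 2020-02-05 00:00:00 UTC lies 18297 days past the epoch (50 x 365 days, plus 12 leap days through 2016, plus 31 days of January and 4 complete days of February in the leap year 2020), i.e. 1580860800 seconds.

    // Hypothetical round-trip test of the two helpers above (user space).
    #include <assert.h>
    IOPMCalendarStruct dt = {};
    dt.year = 2020; dt.month = 2; dt.day = 5;        // 2020-02-05 00:00:00 UTC
    long secs = IOPMConvertCalendarToSeconds(&dt);   // 18297 * 86400
    assert(secs == 1580860800L);
    IOPMCalendarStruct out = {};
    assert(IOPMConvertSecondsToCalendar(secs, &out));
    assert(out.year == 2020 && out.month == 2 && out.day == 5);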
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+
+unsigned long
+IOPMrootDomain::getRUN_STATE(void)
+{
+       return _aotNow ? AOT_STATE : ON_STATE;
+}
+
+bool
+IOPMrootDomain::isAOTMode()
+{
+       return _aotNow;
+}
+
+IOReturn
+IOPMrootDomain::setWakeTime(uint64_t wakeContinuousTime)
+{
+       clock_sec_t     nowsecs, wakesecs;
+       clock_usec_t    nowmicrosecs, wakemicrosecs;
+       uint64_t        nowAbs, wakeAbs;
+
+       clock_gettimeofday_and_absolute_time(&nowsecs, &nowmicrosecs, &nowAbs);
+       wakeAbs = continuoustime_to_absolutetime(wakeContinuousTime);
+       if (wakeAbs < nowAbs) {
+               printf(LOG_PREFIX "wakeAbs %qd < nowAbs %qd\n", wakeAbs, nowAbs);
+               wakeAbs = nowAbs;
+       }
+       wakeAbs -= nowAbs;
+       absolutetime_to_microtime(wakeAbs, &wakesecs, &wakemicrosecs);
+
+       wakesecs += nowsecs;
+       wakemicrosecs += nowmicrosecs;
+       if (wakemicrosecs >= USEC_PER_SEC) {
+               wakesecs++;
+               wakemicrosecs -= USEC_PER_SEC;
+       }
+       if (wakemicrosecs >= (USEC_PER_SEC / 10)) {
+               wakesecs++;
+       }
+
+       IOPMConvertSecondsToCalendar(wakesecs, &_aotWakeTimeCalendar);
+
+       if (_aotWakeTimeContinuous != wakeContinuousTime) {
+               _aotWakeTimeContinuous = wakeContinuousTime;
+               IOLog(LOG_PREFIX "setWakeTime: " YMDTF "\n", YMDT(&_aotWakeTimeCalendar));
+       }
+       _aotWakeTimeCalendar.selector = kPMCalendarTypeMaintenance;
+       _aotWakeTimeUTC               = wakesecs;
+
+       return kIOReturnSuccess;
+}
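The tail of setWakeTime() rounds the continuous-time deadline to the RTC's one-second calendar resolution, rounding up whenever a full tenth of a second would otherwise be cut off: a deadline landing at now + 2.35 s becomes now + 3 s (350 ms >= 100 ms), while now + 2.05 s stays at now + 2 s, tolerating an early fire of less than 100 ms.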
+
+// assumes WAKEEVENT_LOCK
+bool
+IOPMrootDomain::aotShouldExit(bool checkTimeSet, bool software)
+{
+       bool exitNow;
+       const char * reason = "";
+
+       if (software) {
+               _aotExit = true;
+               _aotMetrics->softwareRequestCount++;
+               reason = "software request";
+       } else if (kIOPMWakeEventAOTExitFlags & _aotPendingFlags) {
+               _aotExit = true;
+               reason = gWakeReasonString;
+       } else if (checkTimeSet && (kPMCalendarTypeInvalid == _aotWakeTimeCalendar.selector)) {
+               _aotExit = true;
+               _aotMetrics->noTimeSetCount++;
+               reason = "flipbook expired";
+       } else if ((kIOPMAOTModeRespectTimers & _aotMode) && _scheduledAlarmUTC) {
+               clock_sec_t     sec;
+               clock_usec_t    usec;
+               clock_get_calendar_microtime(&sec, &usec);
+               if (_scheduledAlarmUTC <= sec) {
+                       _aotExit = true;
+                       _aotMetrics->rtcAlarmsCount++;
+                       reason = "user alarm";
+               }
+       }
+       exitNow = (_aotNow && _aotExit);
+       if (exitNow) {
+               _aotNow = false;
+               IOLog(LOG_PREFIX "AOT exit for %s, sc %d po %d, cp %d, rj %d, ex %d, nt %d, rt %d\n",
+                   reason,
+                   _aotMetrics->sleepCount,
+                   _aotMetrics->possibleCount,
+                   _aotMetrics->confirmedPossibleCount,
+                   _aotMetrics->rejectedPossibleCount,
+                   _aotMetrics->expiredPossibleCount,
+                   _aotMetrics->noTimeSetCount,
+                   _aotMetrics->rtcAlarmsCount);
+       }
+       return exitNow;
+}
+
+void
+IOPMrootDomain::aotExit(bool cps)
+{
+       _aotTasksSuspended  = false;
+       _aotReadyToFullWake = false;
+       if (_aotTimerScheduled) {
+               _aotTimerES->cancelTimeout();
+               _aotTimerScheduled = false;
+       }
+       updateTasksSuspend();
+
+       _aotMetrics->totalTime += mach_absolute_time() - _aotLastWakeTime;
+       _aotLastWakeTime = 0;
+       if (_aotMetrics->sleepCount && (_aotMetrics->sleepCount <= kIOPMAOTMetricsKernelWakeCountMax)) {
+               strlcpy(&_aotMetrics->kernelWakeReason[_aotMetrics->sleepCount - 1][0],
+                   gWakeReasonString,
+                   sizeof(_aotMetrics->kernelWakeReason[_aotMetrics->sleepCount - 1]));
+       }
+
+       _aotWakeTimeCalendar.selector = kPMCalendarTypeInvalid;
+
+       _systemMessageClientMask = kSystemMessageClientLegacyApp;
+       tellClients(kIOMessageSystemWillPowerOn);
+
+       if (cps) {
+               changePowerStateToPriv(getRUN_STATE());
+       }
+}
+
+void
+IOPMrootDomain::aotEvaluate(IOTimerEventSource * timer)
+{
+       bool exitNow;
+
+       IOLog("aotEvaluate(%d) 0x%x\n", (timer != NULL), _aotPendingFlags);
+
+       WAKEEVENT_LOCK();
+       exitNow = aotShouldExit(false, false);
+       if (timer != NULL) {
+               _aotTimerScheduled = false;
+       }
+       WAKEEVENT_UNLOCK();
+       if (exitNow) {
+               aotExit(true);
+       } else {
+#if 0
+               if (_aotLingerTime) {
+                       uint64_t deadline;
+                       IOLog("aot linger before sleep\n");
+                       clock_absolutetime_interval_to_deadline(_aotLingerTime, &deadline);
+                       clock_delay_until(deadline);
+               }
+#endif
+               privateSleepSystem(kIOPMSleepReasonSoftware);
+       }
+}
+
+#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
+unsigned long
+IOPMrootDomain::getRUN_STATE(void)
+{
+       return ON_STATE;
+}
+
+IOReturn
+IOPMrootDomain::setWakeTime(uint64_t wakeContinuousTime)
+{
+       return kIOReturnUnsupported;
+}
+
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
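The AOT exit logic above condenses to a single decision evaluated under WAKEEVENT_LOCK. A sketch, where softwareRequest, userAlarmUTC and nowUTC are stand-ins for values the real code reads in aotShouldExit():

    // Condensed restatement of aotShouldExit()/aotExit(); not a drop-in.
    bool exitNow = softwareRequest                                 // explicit request
        || (_aotPendingFlags & kIOPMWakeEventAOTExitFlags)         // wake event demands exit
        || (checkTimeSet && (kPMCalendarTypeInvalid == _aotWakeTimeCalendar.selector))
        || ((_aotMode & kIOPMAOTModeRespectTimers) && (userAlarmUTC <= nowUTC));
    if (_aotNow && exitNow) {
        _aotNow = false;
        // aotExit(): resume suspended tasks, cancel the linger timer,
        // invalidate the calendar wake time, tell legacy clients
        // kIOMessageSystemWillPowerOn, then raise power to getRUN_STATE().
    }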
 //******************************************************************************
 // adjustPowerState
 //
@@ -6357,13 +7288,45 @@ IOPMrootDomain::mustHibernate( void )
 void
 IOPMrootDomain::adjustPowerState( bool sleepASAP )
 {
-       DLOG("adjustPowerState ps %u, asap %d, idleSleepEnabled %d\n",
-           (uint32_t) getPowerState(), sleepASAP, idleSleepEnabled);
+       DEBUG_LOG("adjustPowerState ps %s, asap %d, idleSleepEnabled %d\n",
+           getPowerStateString((uint32_t) getPowerState()), sleepASAP, idleSleepEnabled);
 
        ASSERT_GATED();
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       if (_aotNow) {
+               bool exitNow;
+
+               if (AOT_STATE != getPowerState()) {
+                       return;
+               }
+               WAKEEVENT_LOCK();
+               exitNow = aotShouldExit(true, false);
+               if (!exitNow
+                   && !_aotTimerScheduled
+                   && (kIOPMWakeEventAOTPossibleExit == (kIOPMWakeEventAOTPossibleFlags & _aotPendingFlags))) {
+                       _aotTimerScheduled = true;
+                       if (_aotLingerTime) {
+                               _aotTimerES->setTimeout(_aotLingerTime);
+                       } else {
+                               _aotTimerES->setTimeout(800, kMillisecondScale);
+                       }
+               }
+               WAKEEVENT_UNLOCK();
+               if (exitNow) {
+                       aotExit(true);
+               } else {
+                       _aotReadyToFullWake = true;
+                       if (!_aotTimerScheduled) {
+                               privateSleepSystem(kIOPMSleepReasonSoftware);
+                       }
+               }
+               return;
+       }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
        if ((!idleSleepEnabled) || !checkSystemSleepEnabled()) {
-               changePowerStateToPriv(ON_STATE);
+               changePowerStateToPriv(getRUN_STATE());
        } else if (sleepASAP) {
                changePowerStateToPriv(SLEEP_STATE);
        }
@@ -6493,7 +7456,7 @@ IOPMrootDomain::dispatchPowerEvent(
                DLOG("power event %u args %p 0x%llx\n", event, OBFUSCATE(arg0), arg1);
                if (systemCapabilityNotifier) {
                        systemCapabilityNotifier->release();
-                       systemCapabilityNotifier = 0;
+                       systemCapabilityNotifier = NULL;
                }
                if (arg0) {
                        systemCapabilityNotifier = (IONotifier *) arg0;
@@ -6692,8 +7655,24 @@ exit:
 IOReturn
 IOPMrootDomain::receivePowerNotification( UInt32 msg )
 {
-       pmPowerStateQueue->submitPowerEvent(
-               kPowerEventReceivedPowerNotification, (void *)(uintptr_t) msg );
+       if (msg & kIOPMPowerButton) {
+               uint32_t currentPhase = pmTracer->getTracePhase();
+               if (currentPhase != kIOPMTracePointSystemUp && currentPhase > kIOPMTracePointSystemSleep) {
+                       DEBUG_LOG("power button pressed during wake. phase = %u\n", currentPhase);
+                       swd_flags |= SWD_PWR_BTN_STACKSHOT;
+                       thread_call_enter(powerButtonDown);
+               } else {
+                       DEBUG_LOG("power button pressed when system is up\n");
+               }
+       } else if (msg & kIOPMPowerButtonUp) {
+               if (swd_flags & SWD_PWR_BTN_STACKSHOT) {
+                       swd_flags &= ~SWD_PWR_BTN_STACKSHOT;
+                       thread_call_enter(powerButtonUp);
+               }
+       } else {
+               pmPowerStateQueue->submitPowerEvent(
+                       kPowerEventReceivedPowerNotification, (void *)(uintptr_t) msg );
+       }
        return kIOReturnSuccess;
 }
 
@@ -6701,6 +7680,7 @@ void
 IOPMrootDomain::handlePowerNotification( UInt32 msg )
 {
        bool        eval_clamshell = false;
+       bool        eval_clamshell_alarm = false;
 
        ASSERT_GATED();
 
@@ -6708,7 +7688,16 @@ IOPMrootDomain::handlePowerNotification( UInt32 msg )
         * Local (IOPMrootDomain only) eval clamshell command
         */
        if (msg & kLocalEvalClamshellCommand) {
-               eval_clamshell = true;
+               if (isRTCAlarmWake) {
+                       eval_clamshell_alarm = true;
+
+                       // Reset isRTCAlarmWake. This evaluation should happen only once
+                       // per RTC/alarm wake; any clamshell events after wake follow
+                       // the regular evaluation path.
+                       isRTCAlarmWake = false;
+               } else {
+                       eval_clamshell = true;
+               }
        }
 
        /*
@@ -6883,31 +7872,17 @@ IOPMrootDomain::handlePowerNotification( UInt32 msg )
        /*
         * Evaluate clamshell and SLEEP if appropriate
         */
-       if (eval_clamshell && clamshellClosed) {
+       if (eval_clamshell_alarm && clamshellClosed) {
+               if (shouldSleepOnRTCAlarmWake()) {
+                       privateSleepSystem(kIOPMSleepReasonClamshell);
+               }
+       } else if (eval_clamshell && clamshellClosed) {
                if (shouldSleepOnClamshellClosed()) {
                        privateSleepSystem(kIOPMSleepReasonClamshell);
                } else {
                        evaluatePolicy( kStimulusDarkWakeEvaluate );
                }
        }
-
-       /*
-        * Power Button
-        */
-       if (msg & kIOPMPowerButton) {
-               DLOG("Powerbutton press\n");
-               if (!wranglerAsleep) {
-                       OSString *pbs = OSString::withCString("DisablePowerButtonSleep");
-                       // Check that power button sleep is enabled
-                       if (pbs) {
-                               if (kOSBooleanTrue != getProperty(pbs)) {
-                                       privateSleepSystem(kIOPMSleepReasonPowerButton);
-                               }
-                       }
-               } else {
-                       reportUserInput();
-               }
-       }
 }
 
 //******************************************************************************
@@ -6966,7 +7941,7 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
                        if (kFullWakeReasonDisplayOn == fullWakeReason) {
                                fullWakeReason = fFullWakeReasonDisplayOnAndLocalUser;
                                DLOG("User activity while in notification wake\n");
-                               changePowerStateWithOverrideTo( ON_STATE, 0);
+                               changePowerStateWithOverrideTo( getRUN_STATE(), 0);
                        }
 
                        kdebugTrace(kPMLogUserActiveState, 0, 1, 0);
@@ -7166,7 +8141,7 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
                                // Release power clamp, and wait for children idle.
                                adjustPowerState(true);
                        } else {
-                               changePowerStateToPriv(ON_STATE);
+                               changePowerStateToPriv(getRUN_STATE());
                        }
                }
        }
@@ -7210,7 +8185,7 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
        if (flags.bit.idleSleepEnabled) {
                DLOG("idle sleep timer enabled\n");
                if (!wrangler) {
-                       changePowerStateToPriv(ON_STATE);
+                       changePowerStateToPriv(getRUN_STATE());
                        startIdleSleepTimer( idleSeconds );
                } else {
                        // Start idle timer if prefs now allow system sleep
@@ -7233,9 +8208,9 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
        if (flags.bit.adjustPowerState) {
                bool sleepASAP = false;
 
-               if (!systemBooting && (preventIdleSleepList->getCount() == 0)) {
+               if (!systemBooting && (0 == idleSleepPreventersCount())) {
                        if (!wrangler) {
-                               changePowerStateToPriv(ON_STATE);
+                               changePowerStateToPriv(getRUN_STATE());
                                if (idleSleepEnabled) {
                                        // stay awake for at least idleSeconds
                                        startIdleSleepTimer(idleSeconds);
@@ -7249,6 +8224,28 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
        }
 }
 
+//******************************************************************************
+
+unsigned int
+IOPMrootDomain::idleSleepPreventersCount()
+{
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       if (_aotMode) {
+               unsigned int count __block;
+               count = 0;
+               preventIdleSleepList->iterateObjects(^bool (OSObject * obj)
+               {
+                       count += (NULL == obj->metaCast("AppleARMBacklight"));
+                       return false;
+               });
+               return count;
+       }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
+       return preventIdleSleepList->getCount();
+}
+
+
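In idleSleepPreventersCount() above, the block passed to iterateObjects returns false for every element, which means "keep iterating"; the __block counter therefore ends up counting every idle-sleep preventer that is not an AppleARMBacklight instance. The same idiom in isolation, with `array` as a stand-in OSArray:

    // Count members of an OSArray that are not of a given class (sketch).
    unsigned int count __block = 0;
    array->iterateObjects(^bool (OSObject * obj) {
        count += (NULL == obj->metaCast("AppleARMBacklight"));
        return false;       // false == continue iteration
    });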
 //******************************************************************************
 // requestFullWake
 //
@@ -7259,7 +8256,7 @@ void
 IOPMrootDomain::requestFullWake( FullWakeReason reason )
 {
        uint32_t        options = 0;
-       IOService *     pciRoot = 0;
+       IOService *     pciRoot = NULL;
        bool            promotion = false;
 
        // System must be in dark wake and a valid reason for entering full wake
@@ -7393,6 +8390,13 @@ IOPMrootDomain::fullWakeDelayedWork( void )
 // evaluateAssertions
 //
 //******************************************************************************
+
+// Bitmask of all kernel assertions that prevent system idle sleep.
+// kIOPMDriverAssertionReservedBit7 is reserved for IOMediaBSDClient.
+#define NO_IDLE_SLEEP_ASSERTIONS_MASK \
+       (kIOPMDriverAssertionReservedBit7 | \
+        kIOPMDriverAssertionPreventSystemIdleSleepBit)
+
 void
 IOPMrootDomain::evaluateAssertions(IOPMDriverAssertionType newAssertions, IOPMDriverAssertionType oldAssertions)
 {
@@ -7410,7 +8414,14 @@ IOPMrootDomain::evaluateAssertions(IOPMDriverAssertionType newAssertions, IOPMDr
        }
 
        if (changedBits & kIOPMDriverAssertionCPUBit) {
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+               if (_aotNow) {
+                       IOLog("CPU assertions %d\n", (0 != (kIOPMDriverAssertionCPUBit & newAssertions)));
+               }
+               evaluatePolicy(_aotNow ? kStimulusNoIdleSleepPreventers : kStimulusDarkWakeEvaluate);
+#else
                evaluatePolicy(kStimulusDarkWakeEvaluate);
+#endif
                if (!assertOnWakeSecs && gIOLastWakeAbsTime) {
                        AbsoluteTime    now;
                        clock_usec_t    microsecs;
@@ -7424,13 +8435,18 @@ IOPMrootDomain::evaluateAssertions(IOPMDriverAssertionType newAssertions, IOPMDr
                }
        }
 
-       if (changedBits & kIOPMDriverAssertionReservedBit7) {
-               bool value = (newAssertions & kIOPMDriverAssertionReservedBit7) ? true : false;
-               if (value) {
-                       DLOG("Driver assertion ReservedBit7 raised. Legacy IO preventing sleep\n");
-                       updatePreventIdleSleepList(this, true);
+       if (changedBits & NO_IDLE_SLEEP_ASSERTIONS_MASK) {
+               if ((newAssertions & NO_IDLE_SLEEP_ASSERTIONS_MASK) != 0) {
+                       if ((oldAssertions & NO_IDLE_SLEEP_ASSERTIONS_MASK) == 0) {
+                               DLOG("PreventIdleSleep driver assertion raised\n");
+                               bool ok = updatePreventIdleSleepList(this, true);
+                               if (ok && (changedBits & kIOPMDriverAssertionPreventSystemIdleSleepBit)) {
+                                       // Cancel idle sleep if there is one in progress
+                                       cancelIdlePowerDown(this);
+                               }
+                       }
                } else {
-                       DLOG("Driver assertion ReservedBit7 dropped\n");
+                       DLOG("PreventIdleSleep driver assertion dropped\n");
                        updatePreventIdleSleepList(this, false);
                }
        }
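The rewritten branch above is edge-triggered on the combined mask rather than on ReservedBit7 alone: assuming changedBits is computed as oldAssertions ^ newAssertions earlier in evaluateAssertions(), the prevent-idle list is touched only when the first bit of the mask is raised and when the last bit is dropped:

    // Edge detection over the combined assertion mask (sketch).
    IOPMDriverAssertionType changed = oldAssertions ^ newAssertions;
    if (changed & NO_IDLE_SLEEP_ASSERTIONS_MASK) {
        bool wasHeld = (oldAssertions & NO_IDLE_SLEEP_ASSERTIONS_MASK) != 0;
        bool isHeld  = (newAssertions & NO_IDLE_SLEEP_ASSERTIONS_MASK) != 0;
        if (isHeld && !wasHeld) { /* first holder: add self to prevent list */ }
        if (!isHeld && wasHeld) { /* last holder gone: remove from the list */ }
    }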
@@ -7513,7 +8529,7 @@ IOPMrootDomain::pmStatsRecordApplicationResponse(
        OSNumber        *msgNum                 = NULL;
        const OSSymbol  *appname;
        const OSSymbol  *sleep = NULL, *wake = NULL;
-       IOPMServiceInterestNotifier *notify = 0;
+       IOPMServiceInterestNotifier *notify = NULL;
 
        if (object && (notify = OSDynamicCast(IOPMServiceInterestNotifier, object))) {
                if (response->isEqualTo(gIOPMStatsResponseTimedOut)) {
@@ -8032,6 +9048,14 @@ PMTraceWorker::RTC_TRACE(void)
                tracePointHandler( tracePointTarget, traceData32, wordA );
                _LOG("RTC_TRACE wrote 0x%08x 0x%08x\n", traceData32, wordA);
        }
+#if DEVELOPMENT || DEBUG
+       if ((swd_panic_phase != 0) && (swd_panic_phase == tracePhase)) {
+               DEBUG_LOG("Causing sleep wake failure in phase 0x%08x\n", tracePhase);
+               IOLock *l = IOLockAlloc();
+               IOLockLock(l);
+               IOLockLock(l);
+       }
+#endif
 }
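The DEVELOPMENT/DEBUG block above hangs on purpose: the second IOLockLock() on a freshly allocated, already-held lock deadlocks the calling thread at the trace phase selected by swd_panic_phase, simulating a stuck sleep/wake transition so the watchdog and stackshot paths can be exercised on demand.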
 
 int
@@ -8229,7 +9253,7 @@ PMHaltWorker::worker( void )
        if (me) {
                me->release();
        }
-       return 0;
+       return NULL;
 }
 
 void
@@ -8238,7 +9262,7 @@ PMHaltWorker::free( void )
        DLOG("PMHaltWorker free %p\n", OBFUSCATE(this));
        if (lock) {
                IOLockFree(lock);
-               lock = 0;
+               lock = NULL;
        }
        return OSObject::free();
 }
@@ -8291,7 +9315,7 @@ PMHaltWorker::work( PMHaltWorker * me )
        bool            timeout;
 
        while (true) {
-               service = 0;
+               service = NULL;
                timeout = false;
 
                // Claim a unit of work from the shared pool
@@ -8325,7 +9349,7 @@ PMHaltWorker::work( PMHaltWorker * me )
                        while (service->getProperty(gPMHaltClientAcknowledgeKey)) {
                                IOLockSleep(me->lock, me, THREAD_UNINT);
                        }
-                       me->service = 0;
+                       me->service = NULL;
                        timeout = me->timeout;
                        IOLockUnlock(me->lock);
                }
@@ -8754,7 +9778,7 @@ IOPMrootDomain::copyProperty( const char * aKey) const
        }
 
        if (!strcmp(aKey, kIOPMDriverWakeEventsKey)) {
-               OSArray * array = 0;
+               OSArray * array = NULL;
                WAKEEVENT_LOCK();
                if (_systemWakeEventsArray && _systemWakeEventsArray->getCount()) {
                        OSCollection *collection = _systemWakeEventsArray->copyCollection();
@@ -8767,7 +9791,7 @@ IOPMrootDomain::copyProperty( const char * aKey) const
        }
 
        if (!strcmp(aKey, kIOPMSleepStatisticsAppsKey)) {
-               OSArray * array = 0;
+               OSArray * array = NULL;
                IOLockLock(pmStatsLock);
                if (pmStatsAppResponses && pmStatsAppResponses->getCount()) {
                        OSCollection *collection = pmStatsAppResponses->copyCollection();
@@ -8792,6 +9816,17 @@ IOPMrootDomain::copyProperty( const char * aKey) const
                return systemSleepList;
        }
 
+       if (!strcmp(aKey, kIOPMIdleSleepPreventersWithIDKey)) {
+               OSArray *idleSleepList = NULL;
+               gRootDomain->copySleepPreventersListWithID(&idleSleepList, NULL);
+               return idleSleepList;
+       }
+
+       if (!strcmp(aKey, kIOPMSystemSleepPreventersWithIDKey)) {
+               OSArray *systemSleepList = NULL;
+               gRootDomain->copySleepPreventersListWithID(NULL, &systemSleepList);
+               return systemSleepList;
+       }
        return NULL;
 }
 
@@ -8819,12 +9854,18 @@ IOPMrootDomain::acceptSystemWakeEvents( bool accept )
 
        WAKEEVENT_LOCK();
        if (accept) {
-               gWakeReasonString[0] = '\0';
                if (!_systemWakeEventsArray) {
                        _systemWakeEventsArray = OSArray::withCapacity(4);
                }
-               if ((_acceptSystemWakeEvents = (_systemWakeEventsArray != 0))) {
-                       _systemWakeEventsArray->flushCollection();
+               _acceptSystemWakeEvents = (_systemWakeEventsArray != NULL);
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+               if (!(kIOPMWakeEventAOTExitFlags & _aotPendingFlags))
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+               {
+                       gWakeReasonString[0] = '\0';
+                       if (_systemWakeEventsArray) {
+                               _systemWakeEventsArray->flushCollection();
+                       }
                }
        } else {
                _acceptSystemWakeEvents = false;
@@ -8835,7 +9876,7 @@ IOPMrootDomain::acceptSystemWakeEvents( bool accept )
 
                if ((panic_allowed == -1) &&
                    (PE_parse_boot_argn("swd_wakereason_panic", &panic_allowed, sizeof(panic_allowed)) == false)) {
-                       panic_allowed = 1;
+                       panic_allowed = 0;
                }
 
                if (panic_allowed) {
@@ -8873,14 +9914,15 @@ IOPMrootDomain::claimSystemWakeEvent(
        const char *    reason,
        OSObject *      details )
 {
-       const OSSymbol *    deviceName   = 0;
-       OSNumber *          deviceRegId  = 0;
-       OSNumber *          claimTime    = 0;
-       OSData *            flagsData    = 0;
-       OSString *          reasonString = 0;
-       OSDictionary *      d = 0;
+       const OSSymbol *    deviceName   = NULL;
+       OSNumber *          deviceRegId  = NULL;
+       OSNumber *          claimTime    = NULL;
+       OSData *            flagsData    = NULL;
+       OSString *          reasonString = NULL;
+       OSDictionary *      d = NULL;
        uint64_t            timestamp;
        bool                ok = false;
+       bool                addWakeReason;
 
        pmEventTimeStamp(&timestamp);
 
@@ -8888,6 +9930,32 @@ IOPMrootDomain::claimSystemWakeEvent(
                return;
        }
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       IOOptionBits        aotFlags = 0;
+       bool                needAOTEvaluate = FALSE;
+
+       if (kIOPMAOTModeAddEventFlags & _aotMode) {
+               if (!strcmp("hold", reason)
+                   || !strcmp("help", reason)
+                   || !strcmp("menu", reason)
+                   || !strcmp("stockholm", reason)
+                   || !strcmp("ringer", reason)
+                   || !strcmp("ringerab", reason)
+                   || !strcmp("smc0", reason)
+                   || !strcmp("AOP.RTPWakeupAP", reason)
+                   || !strcmp("BT.OutboxNotEmpty", reason)
+                   || !strcmp("WL.OutboxNotEmpty", reason)) {
+                       flags |= kIOPMWakeEventAOTExit;
+               }
+       }
+
+#if DEVELOPMENT || DEBUG
+       if (_aotLingerTime && !strcmp("rtc", reason)) {
+               flags |= kIOPMWakeEventAOTPossibleExit;
+       }
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
        deviceName   = device->copyName(gIOServicePlane);
        deviceRegId  = OSNumber::withNumber(device->getRegistryEntryID(), 64);
        claimTime    = OSNumber::withNumber(timestamp, 64);
@@ -8908,6 +9976,34 @@ IOPMrootDomain::claimSystemWakeEvent(
        }
 
        WAKEEVENT_LOCK();
+       addWakeReason = _acceptSystemWakeEvents;
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       if (_aotMode) {
+               IOLog("claimSystemWakeEvent(%s, %s, 0x%x) 0x%x %d\n", reason, deviceName->getCStringNoCopy(), (int)flags, _aotPendingFlags, _aotReadyToFullWake);
+       }
+       aotFlags        = (kIOPMWakeEventAOTFlags & flags);
+       aotFlags        = (aotFlags & ~_aotPendingFlags);
+       needAOTEvaluate = false;
+       if (_aotNow && aotFlags) {
+               if (kIOPMWakeEventAOTPossibleExit & flags) {
+                       _aotMetrics->possibleCount++;
+               }
+               if (kIOPMWakeEventAOTConfirmedPossibleExit & flags) {
+                       _aotMetrics->confirmedPossibleCount++;
+               }
+               if (kIOPMWakeEventAOTRejectedPossibleExit & flags) {
+                       _aotMetrics->rejectedPossibleCount++;
+               }
+               if (kIOPMWakeEventAOTExpiredPossibleExit & flags) {
+                       _aotMetrics->expiredPossibleCount++;
+               }
+
+               _aotPendingFlags |= aotFlags;
+               addWakeReason     = _aotNow && _systemWakeEventsArray && ((kIOPMWakeEventAOTExitFlags & aotFlags));
+               needAOTEvaluate   = _aotReadyToFullWake;
+       }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
        if (!gWakeReasonSysctlRegistered) {
                // Lazy registration until the platform driver stops registering
                // the same name.
@@ -8916,14 +10012,20 @@ IOPMrootDomain::claimSystemWakeEvent(
                sysctl_register_oid(&sysctl__kern_wakereason);
 #endif
        }
-       if (_acceptSystemWakeEvents) {
+       if (addWakeReason) {
                ok = _systemWakeEventsArray->setObject(d);
                if (gWakeReasonString[0] != '\0') {
                        strlcat(gWakeReasonString, " ", sizeof(gWakeReasonString));
                }
                strlcat(gWakeReasonString, reason, sizeof(gWakeReasonString));
        }
+
        WAKEEVENT_UNLOCK();
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       if (needAOTEvaluate) {
+               aotEvaluate(NULL);
+       }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
 
 done:
        if (deviceName) {
@@ -8959,7 +10061,7 @@ PMSettingHandle::free( void )
        if (pmso) {
                pmso->clientHandleFreed();
                pmso->release();
-               pmso = 0;
+               pmso = NULL;
        }
 
        OSObject::free();
@@ -8985,8 +10087,8 @@ PMSettingObject *PMSettingObject::pmSettingObject(
        OSObject                            * *handle_obj)
 {
        uint32_t                            settingCount = 0;
-       PMSettingObject                     *pmso = 0;
-       PMSettingHandle                     *pmsh = 0;
+       PMSettingObject                     *pmso = NULL;
+       PMSettingHandle                     *pmsh = NULL;
 
        if (!parent_arg || !handler_arg || !settings || !handle_obj) {
                return NULL;
@@ -9253,7 +10355,7 @@ PMAssertionsTracker::createAssertion(
        track.id = OSIncrementAtomic64((SInt64*) &issuingUniqueID);
        track.level = level;
        track.assertionBits = which;
-       track.ownerString = whoItIs ? OSSymbol::withCString(whoItIs):0;
+       track.ownerString = whoItIs ? OSSymbol::withCString(whoItIs):NULL;
        track.ownerService = serviceID;
        track.registryEntryID = serviceID ? serviceID->getRegistryEntryID():0;
        track.modifiedTime = 0;
@@ -9313,7 +10415,7 @@ PMAssertionsTracker::releaseAssertion(
        IOPMDriverAssertionID _id)
 {
        if (owner && owner->pmPowerStateQueue) {
-               owner->pmPowerStateQueue->submitPowerEvent(kPowerEventAssertionRelease, 0, _id);
+               owner->pmPowerStateQueue->submitPowerEvent(kPowerEventAssertionRelease, NULL, _id);
        }
        return kIOReturnSuccess;
 }
@@ -9385,7 +10487,7 @@ PMAssertionsTracker::setUserAssertionLevels(
                                this,
                                &PMAssertionsTracker::handleSetUserAssertionLevels),
                        this,
-                       (void *) &new_user_levels, 0, 0, 0);
+                       (void *) &new_user_levels, NULL, NULL, NULL);
        }
 
        return kIOReturnSuccess;
@@ -9645,7 +10747,7 @@ IOPMrootDomain::getWatchdogTimeout()
 IOReturn
 IOPMrootDomain::restartWithStackshot()
 {
-       takeStackshot(true, true, false);
+       takeStackshot(true);
 
        return kIOReturnSuccess;
 }
@@ -9653,7 +10755,7 @@ IOPMrootDomain::restartWithStackshot()
 void
 IOPMrootDomain::sleepWakeDebugTrig(bool wdogTrigger)
 {
-       takeStackshot(wdogTrigger, false, false);
+       takeStackshot(wdogTrigger);
 }
 
 void
@@ -9810,39 +10912,72 @@ IOPMrootDomain::saveFailureData2File()
        char  failureStr[512];
        errno_t error;
        char *outbuf;
-       bool oswatchdog = false;
+       OSNumber *statusCode;
+       uint64_t pmStatusCode = 0;
+       uint32_t phaseData = 0;
+       uint32_t phaseDetail = 0;
+       bool efiFailure = false;
+
+       statusCode = OSDynamicCast(OSNumber, getProperty(kIOPMSleepWakeFailureCodeKey));
+       if (statusCode) {
+               pmStatusCode = statusCode->unsigned64BitValue();
+               phaseData = pmStatusCode & 0xFFFFFFFF;
+               phaseDetail = (pmStatusCode >> 32) & 0xFFFFFFFF;
+               if ((phaseData & 0xFF) == kIOPMTracePointSystemSleep) {
+                       LOG("Sleep Wake failure in EFI\n");
+                       efiFailure = true;
+                       failureStr[0] = 0;
+                       snprintf(failureStr, sizeof(failureStr), "Sleep Wake failure in EFI\n\nFailure code:: 0x%08x 0x%08x\n\nPlease IGNORE the below stackshot\n", phaseDetail, phaseData);
+                       len = strlen(failureStr);
+               }
+       }
+
+       if (!efiFailure) {
+               if (PEReadNVRAMProperty(kIOSleepWakeFailurePanic, NULL, &len)) {
+                       swd_flags |= SWD_BOOT_BY_SW_WDOG;
+                       PERemoveNVRAMProperty(kIOSleepWakeFailurePanic);
+                       // dump panic will handle saving nvram data
+                       return;
+               }
 
-       if (!PEReadNVRAMProperty(kIOSleepWakeFailureString, NULL, &len) &&
-           !PEReadNVRAMProperty(kIOOSWatchdogFailureString, NULL, &len)) {
-               DLOG("No SleepWake failure or OSWatchdog failure string to read\n");
-               return;
-       }
+               /* Keeping this around for capturing data during power
+                * button press */
 
-       if (len == 0) {
-               DLOG("Ignoring zero byte SleepWake failure string\n");
-               goto exit;
-       }
+               if (!PEReadNVRAMProperty(kIOSleepWakeFailureString, NULL, &len)) {
+                       DLOG("No sleep wake failure string\n");
+                       return;
+               }
+               if (len == 0) {
+                       DLOG("Ignoring zero byte SleepWake failure string\n");
+                       goto exit;
+               }
 
-       if (len > sizeof(failureStr)) {
-               len = sizeof(failureStr);
-       }
-       failureStr[0] = 0;
-       if (PEReadNVRAMProperty(kIOSleepWakeFailureString, failureStr, &len) == false) {
-               if (PEReadNVRAMProperty(kIOOSWatchdogFailureString, failureStr, &len)) {
-                       oswatchdog = true;
+               // if PMStatus code is zero, delete stackshot and return
+               if (statusCode) {
+                       if (((pmStatusCode & 0xFFFFFFFF) & 0xFF) == 0) {
+                               // There was no sleep wake failure. This can happen if
+                               // deleteStackshot was called before takeStackshot
+                               // completed, so delete any sleep wake failure data
+                               // left in NVRAM.
+                               DLOG("Deleting stackshot on successful wake\n");
+                               deleteStackshot();
+                               return;
+                       }
+               }
+
+               if (len > sizeof(failureStr)) {
+                       len = sizeof(failureStr);
                }
+               failureStr[0] = 0;
+               PEReadNVRAMProperty(kIOSleepWakeFailureString, failureStr, &len);
        }
        if (failureStr[0] != 0) {
-               error = sleepWakeDebugSaveFile(oswatchdog ? kOSWatchdogFailureStringFile : kSleepWakeFailureStringFile,
-                   failureStr, len);
+               error = sleepWakeDebugSaveFile(kSleepWakeFailureStringFile, failureStr, len);
                if (error) {
                        DLOG("Failed to save SleepWake failure string to file. error:%d\n", error);
                } else {
                        DLOG("Saved SleepWake failure string to file.\n");
                }
-               if (!oswatchdog) {
-                       swd_flags |= SWD_BOOT_BY_SW_WDOG;
-               }
        }
 
        if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) {
@@ -9877,8 +11012,17 @@ IOPMrootDomain::saveFailureData2File()
                LOG("Concatenated length for the SWD blob %d\n", concat_len);
 
                if (concat_len) {
-                       error = sleepWakeDebugSaveFile(oswatchdog ? kOSWatchdogStacksFilename : kSleepWakeStacksFilename,
-                           outbuf, concat_len);
+                       error = sleepWakeDebugSaveFile(kSleepWakeStacksFilename, outbuf, concat_len);
+                       if (error) {
+                               LOG("Failed to save SleepWake zipped data to file. error:%d\n", error);
+                       } else {
+                               LOG("Saved SleepWake zipped data to file.\n");
+                       }
+               } else {
+                       // There is a sleep wake failure string but no stackshot
+                       // Write a placeholder stacks file so that swd runs
+                       snprintf(outbuf, 20, "%s", "No stackshot data\n");
+                       error = sleepWakeDebugSaveFile(kSleepWakeStacksFilename, outbuf, strlen(outbuf));
                        if (error) {
                                LOG("Failed to save SleepWake zipped data to file. error:%d\n", error);
                        } else {
@@ -9892,7 +11036,7 @@ IOPMrootDomain::saveFailureData2File()
 
        gRootDomain->swd_lock = 0;
 exit:
-       PERemoveNVRAMProperty(oswatchdog ? kIOOSWatchdogFailureString : kIOSleepWakeFailureString);
+       PERemoveNVRAMProperty(kIOSleepWakeFailureString);
        return;
 }
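The EFI check near the top of saveFailureData2File() relies on how kIOPMSleepWakeFailureCodeKey packs two 32-bit words into one 64-bit value: the low word carries the phase data, whose low byte is the trace phase, and the high word carries the phase detail. The unpacking in isolation:

    // Unpacking the sleep/wake failure code (as used above).
    uint64_t pmStatusCode = statusCode->unsigned64BitValue();
    uint32_t phaseData    = (uint32_t)(pmStatusCode & 0xFFFFFFFF);   // low word
    uint32_t phaseDetail  = (uint32_t)(pmStatusCode >> 32);          // high word
    bool     failedInEFI  = ((phaseData & 0xFF) == kIOPMTracePointSystemSleep);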
 
@@ -9952,6 +11096,7 @@ IOPMrootDomain::getFailureData(thread_t *thread, char *failureStr, size_t strLen
                        OSKext *kext = OSKext::lookupKextWithAddress((vm_address_t)callMethod);
                        if (kext) {
                                objectName = kext->getIdentifierCString();
+                               kext->release();
                        }
                }
        }
@@ -10136,238 +11281,222 @@ swd_compress(char *inPtr, char *outPtr, size_t numBytes)
 }
 
 void
-IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool isSpinDump)
+IOPMrootDomain::deleteStackshot()
 {
-       swd_hdr *         hdr = NULL;
-       int               wdog_panic = -1;
-       int               cnt = 0;
-       pid_t             pid = 0;
-       kern_return_t     kr = KERN_SUCCESS;
-       uint32_t          flags;
+       if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) {
+               // takeStackshot hasn't completed
+               return;
+       }
+       LOG("Deleting any sleepwake failure data in nvram\n");
 
-       char *            dstAddr;
-       uint32_t          size;
-       uint32_t          bytesRemaining;
-       unsigned          bytesWritten = 0;
-       unsigned          totalBytes = 0;
-       OSString *        UUIDstring = NULL;
+       PERemoveNVRAMProperty(kIOSleepWakeFailureString);
+       char nvram_var_name_buf[20];
+       for (int i = 0; i < 8; i++) {
+               snprintf(nvram_var_name_buf, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, i + 1);
+               if (PERemoveNVRAMProperty(nvram_var_name_buf) == false) {
+                       LOG("Removing %s returned false\n", nvram_var_name_buf);
+               }
+       }
+       // force NVRAM sync
+       if (PEWriteNVRAMProperty(kIONVRAMSyncNowPropertyKey, kIONVRAMSyncNowPropertyKey, strlen(kIONVRAMSyncNowPropertyKey)) == false) {
+               DLOG("Failed to force nvram sync\n");
+       }
+       gRootDomain->swd_lock = 0;
+}
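deleteStackshot() mirrors the writer in takeStackshot() below: the compressed stackshot is stored as up to eight NVRAM variables named SWD_STACKSHOT_VAR_PREFIX plus a two-digit index, each carrying 2096 - 200 = 1896 payload bytes (reading the 200 bytes as headroom for variable overhead is an inference, not stated in the source). A sketch of the shared naming scheme:

    // Chunk naming shared by the writer and deleteStackshot() (sketch).
    enum { kChunkPayload = 2096 - 200 };        // 1896 bytes per NVRAM variable
    char name[20];
    for (int i = 0; i < 8; i++) {
        snprintf(name, sizeof(name), "%s%02d", SWD_STACKSHOT_VAR_PREFIX, i + 1);
        // yields <prefix>01 ... <prefix>08
    }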
+void
+IOPMrootDomain::takeStackshot(bool wdogTrigger)
+{
+       swd_hdr *                hdr = NULL;
+       int                      cnt = 0;
+       int                      max_cnt = 2;
+       pid_t                    pid = 0;
+       kern_return_t            kr = KERN_SUCCESS;
+       uint32_t                 flags;
 
-       char              failureStr[512];
-       thread_t          thread = NULL;
-       const char *      uuid;
+       char *                   dstAddr;
+       uint32_t                 size;
+       uint32_t                 bytesRemaining;
+       unsigned                 bytesWritten = 0;
 
+       char                     failureStr[512];
+       thread_t                 thread = NULL;
+       const char *             swfPanic = "swfPanic";
 
-       uint32_t          bufSize;
-       uint32_t          initialStackSize;
 
+       uint32_t                  bufSize;
+       int                       success = 0;
 
+       if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) {
+               return;
+       }
 
        failureStr[0] = 0;
-       if (isSpinDump) {
-               if (_systemTransitionType != kSystemTransitionSleep &&
-                   _systemTransitionType != kSystemTransitionWake) {
-                       return;
-               }
-
-               if (gSpinDumpBufferFull) {
-                       return;
-               }
-               if (swd_spindump_buffer == NULL) {
-                       sleepWakeDebugSpinDumpMemAlloc();
-                       if (swd_spindump_buffer == NULL) {
-                               return;
-                       }
-               }
-
-               bufSize = SWD_SPINDUMP_SIZE;
-               initialStackSize = SWD_INITIAL_SPINDUMP_SIZE;
-               hdr = (swd_hdr *)swd_spindump_buffer;
-       } else {
-               if ((kIOSleepWakeWdogOff & gIOKitDebug) || systemBooting || systemShutdown || gWillShutdown) {
-                       return;
-               }
+       if ((kIOSleepWakeWdogOff & gIOKitDebug) || systemBooting || systemShutdown || gWillShutdown) {
+               return;
+       }
 
-               if (isOSXWatchdog) {
-                       snprintf(failureStr, sizeof(failureStr), "Stackshot Reason: ");
-                       snprintf(failureStr, sizeof(failureStr), "%smacOS watchdog triggered failure\n", failureStr);
-               } else if (wdogTrigger) {
-                       if ((UUIDstring = OSDynamicCast(OSString, getProperty(kIOPMSleepWakeUUIDKey))) != NULL) {
-                               uuid = UUIDstring->getCStringNoCopy();
-                               snprintf(failureStr, sizeof(failureStr), "UUID: %s\n", uuid);
-                       }
+       if (wdogTrigger) {
+               getFailureData(&thread, failureStr, sizeof(failureStr));
 
-                       snprintf(failureStr, sizeof(failureStr), "%sStackshot Reason: ", failureStr);
-                       getFailureData(&thread, failureStr, sizeof(failureStr));
-                       if (PEGetCoprocessorVersion() >= kCoprocessorVersion2) {
-                               goto skip_stackshot;
-                       }
-               } else {
-                       snprintf(failureStr, sizeof(failureStr), "%sStackshot triggered for debugging stackshot collection.\n", failureStr);
+               if (PEGetCoprocessorVersion() >= kCoprocessorVersion2) {
+                       goto skip_stackshot;
                }
-               // Take only one stackshot in this case.
-               cnt = SWD_MAX_STACKSHOTS - 1;
+       } else {
+               AbsoluteTime now;
+               uint64_t nsec;
+               clock_get_uptime(&now);
+               SUB_ABSOLUTETIME(&now, &gIOLastWakeAbsTime);
+               absolutetime_to_nanoseconds(now, &nsec);
+               snprintf(failureStr, sizeof(failureStr), "%sPower button pressed during wake transition after %u ms.\n", failureStr, ((int)((nsec) / NSEC_PER_MSEC)));
+       }
 
+       if (swd_buffer == NULL) {
+               sleepWakeDebugMemAlloc();
                if (swd_buffer == NULL) {
-                       sleepWakeDebugMemAlloc();
-                       if (swd_buffer == NULL) {
-                               return;
-                       }
+                       return;
                }
-               hdr = (swd_hdr *)swd_buffer;
-
-               bufSize = hdr->alloc_size;;
-               initialStackSize = bufSize;
        }
+       hdr = (swd_hdr *)swd_buffer;
+       bufSize = hdr->alloc_size;
 
 
-       if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) {
-               return;
-       }
 
 
        dstAddr = (char*)hdr + hdr->spindump_offset;
-       bytesRemaining = bufSize - hdr->spindump_offset;
-
-       DLOG("Taking snapshot. bytesRemaining: %d\n", bytesRemaining);
-
        flags = STACKSHOT_KCDATA_FORMAT | STACKSHOT_NO_IO_STATS | STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY | STACKSHOT_THREAD_WAITINFO;
-       while (kr == KERN_SUCCESS) {
-               if (cnt == 0) {
-                       /*
-                        * Take stackshot of all process on first sample. Size is restricted
-                        * to SWD_INITIAL_STACK_SIZE
-                        */
-                       pid = -1;
-                       size = (bytesRemaining > initialStackSize) ? initialStackSize : bytesRemaining;
-               } else {
-                       /* Take sample of kernel threads only */
-                       pid = 0;
-                       size = bytesRemaining;
-               }
+       /* If not wdogTrigger, only take a stackshot of kernel tasks.
+        */
+       if (wdogTrigger) {
+               pid = -1;
+       } else {
+               pid = 0;
+       }
+
+       /* Attempt to take a stackshot with all ACTIVE_KERNEL_THREADS.
+        * If we run out of space, retry with only the kernel task.
+        */
+       while (success == 0 && cnt < max_cnt) {
+               bytesRemaining = bufSize - hdr->spindump_offset;
+               cnt++;
+               DLOG("Taking snapshot. bytesRemaining: %d\n", bytesRemaining);
 
+               size = bytesRemaining;
                kr = stack_snapshot_from_kernel(pid, dstAddr, size, flags, 0, &bytesWritten);
                DLOG("stack_snapshot_from_kernel returned 0x%x. pid: %d bufsize:0x%x flags:0x%x bytesWritten: %d\n",
                    kr, pid, size, flags, bytesWritten);
                if (kr == KERN_INSUFFICIENT_BUFFER_SIZE) {
                        if (pid == -1) {
-                               // Insufficient buffer when trying to take stackshot of user & kernel space threads.
-                               // Continue to take stackshot of just kernel threads
-                               ++cnt;
-                               kr = KERN_SUCCESS;
-                               continue;
-                       } else if (totalBytes == 0) {
-                               MSG("Failed to get stackshot(0x%x) bufsize:0x%x flags:0x%x\n", kr, size, flags);
+                               pid = 0;
+                       } else {
+                               LOG("Insufficient buffer size for only kernel task\n");
+                               break;
                        }
                }
+               if (kr == KERN_SUCCESS) {
+                       if (bytesWritten == 0) {
+                               MSG("Failed to get stackshot(0x%x) bufsize:0x%x flags:0x%x\n", kr, size, flags);
+                               continue;
+                       }
+                       bytesRemaining -= bytesWritten;
+                       hdr->spindump_size = (bufSize - bytesRemaining - hdr->spindump_offset);
 
-               dstAddr += bytesWritten;
-               totalBytes += bytesWritten;
-               bytesRemaining -= bytesWritten;
+                       memset(hdr->reason, 0x20, sizeof(hdr->reason));
 
-               if (++cnt == SWD_MAX_STACKSHOTS) {
-                       break;
-               }
-               IOSleep(10); // 10 ms
-       }
+                       // Compress stackshot and save to NVRAM
+                       {
+                               char *outbuf = (char *)swd_compressed_buffer;
+                               int outlen = 0;
+                               int num_chunks = 0;
+                               int max_chunks = 0;
+                               int leftover = 0;
+                               char nvram_var_name_buffer[20];
 
-       hdr->spindump_size = (bufSize - bytesRemaining - hdr->spindump_offset);
+                               outlen = swd_compress((char*)hdr + hdr->spindump_offset, outbuf, bytesWritten);
 
-       memset(hdr->reason, 0x20, sizeof(hdr->reason));
-       if (isSpinDump) {
-               snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Power State Change Delay\n\n");
-               gRootDomain->swd_lock = 0;
-               gSpinDumpBufferFull = true;
-               return;
-       }
+                               if (outlen) {
+                                       max_chunks = outlen / (2096 - 200);
+                                       leftover = outlen % (2096 - 200);
 
-       // Compress stackshot and save to NVRAM
-       {
-               char *outbuf = (char *)swd_compressed_buffer;
-               int outlen = 0;
-               int num_chunks = 0;
-               int max_chunks = 0;
-               int leftover = 0;
-               char nvram_var_name_buffer[20];
-
-               outlen = swd_compress((char*)hdr + hdr->spindump_offset, outbuf, bytesWritten);
-
-               if (outlen) {
-                       max_chunks = outlen / (2096 - 200);
-                       leftover = outlen % (2096 - 200);
-
-                       if (max_chunks < 8) {
-                               for (num_chunks = 0; num_chunks < max_chunks; num_chunks++) {
-                                       snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks + 1);
-                                       if (PEWriteNVRAMPropertyWithCopy(nvram_var_name_buffer, (outbuf + (num_chunks * (2096 - 200))), (2096 - 200)) == FALSE) {
-                                               LOG("Failed to update NVRAM %d\n", num_chunks);
-                                               break;
-                                       }
-                               }
-                               if (leftover) {
-                                       snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks + 1);
-                                       if (PEWriteNVRAMPropertyWithCopy(nvram_var_name_buffer, (outbuf + (num_chunks * (2096 - 200))), leftover) == FALSE) {
-                                               LOG("Failed to update NVRAM with leftovers\n");
+                                       if (max_chunks < 8) {
+                                               for (num_chunks = 0; num_chunks < max_chunks; num_chunks++) {
+                                                       snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks + 1);
+                                                       if (PEWriteNVRAMPropertyWithCopy(nvram_var_name_buffer, (outbuf + (num_chunks * (2096 - 200))), (2096 - 200)) == FALSE) {
+                                                               LOG("Failed to update NVRAM %d\n", num_chunks);
+                                                               break;
+                                                       }
+                                               }
+                                               if (leftover) {
+                                                       snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks + 1);
+                                                       if (PEWriteNVRAMPropertyWithCopy(nvram_var_name_buffer, (outbuf + (num_chunks * (2096 - 200))), leftover) == FALSE) {
+                                                               LOG("Failed to update NVRAM with leftovers\n");
+                                                       }
+                                               }
+                                               success = 1;
+                                               LOG("Successfully saved stackshot to NVRAM\n");
+                                       } else {
+                                               LOG("Compressed failure stackshot is too large. size=%d bytes\n", outlen);
+                                               if (pid == -1) {
+                                                       pid = 0;
+                                               } else {
+                                                       LOG("Compressed failure stackshot of only the kernel is too large. size=%d bytes\n", outlen);
+                                                       break;
+                                               }
                                        }
                                }
-                       } else {
-                               LOG("Compressed failure stackshot is too large. size=%d bytes\n", outlen);
                        }
                }
        }
 
        if (failureStr[0]) {
-               if (!isOSXWatchdog) {
-                       // append sleep-wake failure code
-                       snprintf(failureStr, sizeof(failureStr), "%s\nFailure code:: 0x%08x %08x\n",
-                           failureStr, pmTracer->getTraceData(), pmTracer->getTracePhase());
-                       if (PEWriteNVRAMProperty(kIOSleepWakeFailureString, failureStr, strlen(failureStr)) == false) {
-                               DLOG("Failed to write SleepWake failure string\n");
-                       }
-               } else {
-                       if (PEWriteNVRAMProperty(kIOOSWatchdogFailureString, failureStr, strlen(failureStr)) == false) {
-                               DLOG("Failed to write OSWatchdog failure string\n");
-                       }
+               // append sleep-wake failure code
+               snprintf(failureStr, sizeof(failureStr), "%s\nFailure code:: 0x%08x %08x\n",
+                   failureStr, pmTracer->getTraceData(), pmTracer->getTracePhase());
+               if (PEWriteNVRAMProperty(kIOSleepWakeFailureString, failureStr, strlen(failureStr)) == false) {
+                       DLOG("Failed to write SleepWake failure string\n");
                }
        }
-       gRootDomain->swd_lock = 0;
+
+       // force NVRAM sync
+       if (PEWriteNVRAMProperty(kIONVRAMSyncNowPropertyKey, kIONVRAMSyncNowPropertyKey, strlen(kIONVRAMSyncNowPropertyKey)) == false) {
+               DLOG("Failed to force nvram sync\n");
+       }
 
 skip_stackshot:
        if (wdogTrigger) {
-               PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic));
-
-               if ((wdog_panic == 1) || (PEGetCoprocessorVersion() >= kCoprocessorVersion2)) {
-                       if (thread) {
-                               panic_with_thread_context(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, thread, "%s", failureStr);
-                       } else {
-                               panic_with_options(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, "%s", failureStr);
+               if (PEGetCoprocessorVersion() < kCoprocessorVersion2) {
+                       if (swd_flags & SWD_BOOT_BY_SW_WDOG) {
+                               // If the current boot was caused by a restart that this watchdog
+                               // triggered in the previous boot, don't trigger again until there
+                               // has been at least one successful sleep & wake.
+                               if (!(sleepCnt && (displayWakeCnt || darkWakeCnt))) {
+                                       LOG("Shutting down due to repeated Sleep/Wake failures\n");
+                                       if (!tasksSuspended) {
+                                               tasksSuspended = TRUE;
+                                               updateTasksSuspend();
+                                       }
+                                       PEHaltRestart(kPEHaltCPU);
+                                       return;
+                               }
                        }
-                       return;
-               } else if (swd_flags & SWD_BOOT_BY_SW_WDOG) {
-                       // If current boot is due to this watch dog trigger restart in previous boot,
-                       // then don't trigger again until at least 1 successful sleep & wake.
-                       if (!(sleepCnt && (displayWakeCnt || darkWakeCnt))) {
-                               LOG("Shutting down due to repeated Sleep/Wake failures\n");
+                       if (gSwdPanic == 0) {
+                               LOG("Panic prevented by the swd_panic boot-arg. Calling restart\n");
                                if (!tasksSuspended) {
                                        tasksSuspended = TRUE;
-                                       tasks_system_suspend(true);
+                                       updateTasksSuspend();
                                }
-                               PEHaltRestart(kPEHaltCPU);
-                               return;
+                               PEHaltRestart(kPERestartCPU);
                        }
                }
-       }
-
-
-       if (wdogTrigger) {
-               LOG("Restarting to collect Sleep wake debug logs\n");
-               if (!tasksSuspended) {
-                       tasksSuspended = TRUE;
-                       tasks_system_suspend(true);
+               if (PEWriteNVRAMProperty(kIOSleepWakeFailurePanic, swfPanic, strlen(swfPanic)) == false) {
+                       DLOG("Failed to write SleepWake failure panic key\n");
+               }
+               if (thread) {
+                       panic_with_thread_context(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, thread, "%s", failureStr);
+               } else {
+                       panic_with_options(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, "%s", failureStr);
                }
-
-               PEHaltRestart(kPERestartCPU);
        } else {
-               saveFailureData2File();
+               gRootDomain->swd_lock = 0;
+               return;
        }
 }
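
The NVRAM chunking above is easier to follow with concrete numbers. A minimal
worked sketch of the same arithmetic, using a hypothetical compressed size
(the 2096 - 200 chunk size and the 8-chunk cap come from the hunk above;
kChunkSize and outlen are names/values assumed for illustration):

	// Illustrative only; not part of the commit.
	const int kChunkSize = 2096 - 200;      // 1896 bytes per NVRAM variable
	int outlen = 5000;                      // assumed compressed stackshot size
	int max_chunks = outlen / kChunkSize;   // 2 full chunks
	int leftover = outlen % kChunkSize;     // 1208 bytes for the final chunk
	// Chunks land in variables named SWD_STACKSHOT_VAR_PREFIX "01", "02", ...
	// The write path only proceeds while max_chunks < 8, so the compressed
	// stackshot must fit in roughly 15 KB of NVRAM; otherwise the loop above
	// retries with a kernel-only stackshot (pid = 0) or gives up.
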
 
@@ -10386,10 +11515,6 @@ IOPMrootDomain::sleepWakeDebugMemAlloc()
                return;
        }
 
-       if (PEGetCoprocessorVersion() >= kCoprocessorVersion2) {
-               return;
-       }
-
        if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) {
                return;
        }
@@ -10556,11 +11681,8 @@ exit:
 void
 IOPMrootDomain::sleepWakeDebugTrig(bool restart)
 {
-       uint32_t wdog_panic = 1;
-
        if (restart) {
-               if (PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic)) &&
-                   (wdog_panic == 0)) {
+               if (gSwdPanic == 0) {
                        return;
                }
                panic("Sleep/Wake hang detected");
@@ -10569,12 +11691,14 @@ IOPMrootDomain::sleepWakeDebugTrig(bool restart)
 }
 
 void
-IOPMrootDomain::takeStackshot(bool restart, bool isOSXWatchdog, bool isSpinDump)
+IOPMrootDomain::takeStackshot(bool restart)
 {
 #pragma unused(restart)
-#pragma unused(isOSXWatchdog)
 }
-
+void
+IOPMrootDomain::deleteStackshot()
+{
+}
 void
 IOPMrootDomain::sleepWakeDebugMemAlloc()
 {
index f90699c34774610f7d08093d1272caab54909b36..b3d1a5aac617b9be8c766f8a641a76e07865edc9 100644 (file)
 #define super OSObject
 OSDefineMetaClassAndStructors(IOPerfControlClient, OSObject);
 
+static IOPerfControlClient::IOPerfControlClientShared *_Atomic gIOPerfControlClientShared;
+
 bool
 IOPerfControlClient::init(IOService *driver, uint64_t maxWorkCapacity)
 {
+       // TODO: Remove this limit and implement dynamic table growth if workloads are found that exceed this
+       if (maxWorkCapacity > kMaxWorkTableNumEntries) {
+               maxWorkCapacity = kMaxWorkTableNumEntries;
+       }
+
        if (!super::init()) {
                return false;
        }
 
-       interface = PerfControllerInterface{
-               .version = 0,
-               .registerDevice =
-                   [](IOService *device) {
-                           return kIOReturnSuccess;
-                   },
-               .unregisterDevice =
+       shared = atomic_load_explicit(&gIOPerfControlClientShared, memory_order_acquire);
+       if (shared == nullptr) {
+               IOPerfControlClient::IOPerfControlClientShared *expected = shared;
+               shared = reinterpret_cast<IOPerfControlClient::IOPerfControlClientShared*>(kalloc(sizeof(IOPerfControlClientShared)));
+               if (!shared) {
+                       return false;
+               }
+
+               atomic_init(&shared->maxDriverIndex, 0);
+
+               shared->interface = PerfControllerInterface{
+                       .version = 0,
+                       .registerDevice =
                    [](IOService *device) {
                            return kIOReturnSuccess;
                    },
-               .workCanSubmit =
-                   [](IOService *device, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) {
-                           return false;
-                   },
-               .workSubmit =
-                   [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) {
-                   },
-               .workBegin =
-                   [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkBeginArgs *args) {
-                   },
-               .workEnd =
-                   [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkEndArgs *args, bool done) {
-                   },
-       };
+                       .unregisterDevice =
+                           [](IOService *device) {
+                                   return kIOReturnSuccess;
+                           },
+                       .workCanSubmit =
+                           [](IOService *device, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) {
+                                   return false;
+                           },
+                       .workSubmit =
+                           [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) {
+                           },
+                       .workBegin =
+                           [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkBeginArgs *args) {
+                           },
+                       .workEnd =
+                           [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkEndArgs *args, bool done) {
+                           },
+               };
+
+               shared->interfaceLock = IOLockAlloc();
+               if (!shared->interfaceLock) {
+                       goto shared_init_error;
+               }
 
-       interfaceLock = IOLockAlloc();
-       if (!interfaceLock) {
-               goto error;
-       }
+               shared->deviceRegistrationList = OSSet::withCapacity(4);
+               if (!shared->deviceRegistrationList) {
+                       goto shared_init_error;
+               }
 
-       deviceRegistrationList = OSSet::withCapacity(4);
-       if (!deviceRegistrationList) {
-               goto error;
+               if (!atomic_compare_exchange_strong_explicit(&gIOPerfControlClientShared, &expected, shared, memory_order_acq_rel,
+                   memory_order_acquire)) {
+                       IOLockFree(shared->interfaceLock);
+                       shared->deviceRegistrationList->release();
+                       kfree(shared, sizeof(*shared));
+                       shared = expected;
+               }
        }
 
-       bzero(workTable, sizeof(workTable));
-       memset(&workTable[kIOPerfControlClientWorkUntracked], ~0, sizeof(WorkTableEntry));
-       workTableNextIndex = kIOPerfControlClientWorkUntracked + 1;
+       driverIndex = atomic_fetch_add_explicit(&shared->maxDriverIndex, 1, memory_order_relaxed) + 1;
+       assertf(driverIndex != 0, "Overflow in driverIndex. Too many IOPerfControlClients created.\n");
 
-       workTableLock = IOSimpleLockAlloc();
-       if (!workTableLock) {
-               goto error;
-       }
+       // + 1 since index 0 is unused for kIOPerfControlClientWorkUntracked
+       workTableLength = maxWorkCapacity + 1;
+       assertf(workTableLength <= kWorkTableMaxSize, "%zu exceeds max allowed capacity of %zu", workTableLength, kWorkTableMaxSize);
+       if (maxWorkCapacity > 0) {
+               workTable = reinterpret_cast<WorkTableEntry*>(kalloc(workTableLength * sizeof(WorkTableEntry)));
+               if (!workTable) {
+                       goto error;
+               }
+               bzero(workTable, workTableLength * sizeof(WorkTableEntry));
+               workTableNextIndex = 1;
 
-       // TODO: check sum(maxWorkCapacities) < table size
+               workTableLock = IOSimpleLockAlloc();
+               if (!workTableLock) {
+                       goto error;
+               }
+       }
 
        return true;
 
 error:
-       if (interfaceLock) {
-               IOLockFree(interfaceLock);
-       }
-       if (deviceRegistrationList) {
-               deviceRegistrationList->release();
+       if (workTable) {
+               kfree(workTable, workTableLength * sizeof(WorkTableEntry));
        }
        if (workTableLock) {
                IOSimpleLockFree(workTableLock);
        }
        return false;
+shared_init_error:
+       if (shared) {
+               if (shared->interfaceLock) {
+                       IOLockFree(shared->interfaceLock);
+               }
+               if (shared->deviceRegistrationList) {
+                       shared->deviceRegistrationList->release();
+               }
+               kfree(shared, sizeof(*shared));
+               shared = nullptr;
+       }
+       return false;
 }
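
The publication of gIOPerfControlClientShared above uses a standard lock-free
once-only initialization: load-acquire the global, allocate speculatively,
then compare-and-swap and discard the local copy on a lost race. A minimal
sketch of just that pattern, with a hypothetical payload type Foo and helper
acquireShared (stdatomic-style calls, as in the hunk above):

	static Foo *_Atomic gShared;

	Foo *
	acquireShared(void)
	{
		Foo *shared = atomic_load_explicit(&gShared, memory_order_acquire);
		if (shared == nullptr) {
			Foo *expected = nullptr;
			Foo *mine = new Foo;        // speculative allocation
			if (!atomic_compare_exchange_strong_explicit(&gShared, &expected,
			    mine, memory_order_acq_rel, memory_order_acquire)) {
				delete mine;            // another thread won the race
				shared = expected;      // adopt the winner's instance
			} else {
				shared = mine;
			}
		}
		return shared;
	}
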
 
-IOPerfControlClient *_Atomic gSharedClient = nullptr;
-
 IOPerfControlClient *
 IOPerfControlClient::copyClient(IOService *driver, uint64_t maxWorkCapacity)
 {
-       IOPerfControlClient *client = atomic_load_explicit(&gSharedClient, memory_order_acquire);
-       if (client == nullptr) {
-               IOPerfControlClient *expected = client;
-               client = new IOPerfControlClient;
-               if (!client || !client->init(driver, maxWorkCapacity)) {
-                       panic("could not create IOPerfControlClient");
-               }
-               if (!atomic_compare_exchange_strong_explicit(&gSharedClient, &expected, client, memory_order_acq_rel,
-                   memory_order_acquire)) {
-                       client->release();
-                       client = expected;
-               }
+       IOPerfControlClient *client = new IOPerfControlClient;
+       if (!client || !client->init(driver, maxWorkCapacity)) {
+               panic("could not create IOPerfControlClient");
        }
-       // TODO: add maxWorkCapacity to existing client
-       client->retain();
        return client;
 }
 
+/* Convert the per driver token into a globally unique token for the performance
+ * controller's consumption. This is achieved by setting the driver's unique
+ * index onto the high order bits. The performance controller is shared between
+ * all drivers and must track all instances separately, while each driver has
+ * its own token table, so this step is needed to avoid token collisions between
+ * drivers.
+ */
+inline uint64_t
+IOPerfControlClient::tokenToGlobalUniqueToken(uint64_t token)
+{
+       return token | (static_cast<uint64_t>(driverIndex) << kWorkTableIndexBits);
+}
+
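
A worked example of the token mapping may help; kWorkTableIndexBits is defined
in the header rather than in this diff, so the 24-bit width below is
illustrative only:

	// Illustrative only: suppose kWorkTableIndexBits == 24 and this client
	// has driverIndex == 3. Per-driver token 7 becomes, at the controller:
	uint64_t globalToken = UINT64_C(7) | (UINT64_C(3) << 24);  // 0x03000007
	// The low bits still index this driver's work table; the high bits keep
	// tokens from different drivers distinct at the shared controller.
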
+/* With this implementation, tokens returned to the driver differ from tokens
+ * passed to the performance controller. This implementation has the nice
+ * property that tokens returned to the driver will always be between 1 and
+ * the value of maxWorkCapacity passed by the driver to copyClient. The tokens
+ * the performance controller sees will match on the lower order bits and have
+ * the driver index set on the high order bits.
+ */
 uint64_t
 IOPerfControlClient::allocateToken(thread_group *thread_group)
 {
@@ -124,7 +175,7 @@ IOPerfControlClient::getEntryForToken(uint64_t token, IOPerfControlClient::WorkT
                return false;
        }
 
-       if (token >= kWorkTableNumEntries) {
+       if (token >= workTableLength) {
                panic("Invalid work token (%llu): index out of bounds.", token);
        }
 
@@ -141,7 +192,7 @@ IOPerfControlClient::markEntryStarted(uint64_t token, bool started)
                return;
        }
 
-       if (token >= kWorkTableNumEntries) {
+       if (token >= workTableLength) {
                panic("Invalid work token (%llu): index out of bounds.", token);
        }
 
@@ -153,15 +204,15 @@ IOPerfControlClient::registerDevice(__unused IOService *driver, IOService *devic
 {
        IOReturn ret = kIOReturnSuccess;
 
-       IOLockLock(interfaceLock);
+       IOLockLock(shared->interfaceLock);
 
-       if (interface.version > 0) {
-               ret = interface.registerDevice(device);
+       if (shared->interface.version > 0) {
+               ret = shared->interface.registerDevice(device);
        } else {
-               deviceRegistrationList->setObject(device);
+               shared->deviceRegistrationList->setObject(device);
        }
 
-       IOLockUnlock(interfaceLock);
+       IOLockUnlock(shared->interfaceLock);
 
        return ret;
 }
@@ -169,15 +220,15 @@ IOPerfControlClient::registerDevice(__unused IOService *driver, IOService *devic
 void
 IOPerfControlClient::unregisterDevice(__unused IOService *driver, IOService *device)
 {
-       IOLockLock(interfaceLock);
+       IOLockLock(shared->interfaceLock);
 
-       if (interface.version > 0) {
-               interface.unregisterDevice(device);
+       if (shared->interface.version > 0) {
+               shared->interface.unregisterDevice(device);
        } else {
-               deviceRegistrationList->removeObject(device);
+               shared->deviceRegistrationList->removeObject(device);
        }
 
-       IOLockUnlock(interfaceLock);
+       IOLockUnlock(shared->interfaceLock);
 }
 
 uint64_t
@@ -207,25 +258,25 @@ IOPerfControlClient::registerPerformanceController(PerfControllerInterface pci)
 {
        IOReturn result = kIOReturnError;
 
-       IOLockLock(interfaceLock);
+       IOLockLock(shared->interfaceLock);
 
-       if (interface.version == 0 && pci.version > 0) {
+       if (shared->interface.version == 0 && pci.version > 0) {
                assert(pci.registerDevice && pci.unregisterDevice && pci.workCanSubmit && pci.workSubmit && pci.workBegin && pci.workEnd);
                result = kIOReturnSuccess;
 
                OSObject *obj;
-               while ((obj = deviceRegistrationList->getAnyObject())) {
+               while ((obj = shared->deviceRegistrationList->getAnyObject())) {
                        IOService *device = OSDynamicCast(IOService, obj);
                        if (device) {
                                pci.registerDevice(device);
                        }
-                       deviceRegistrationList->removeObject(obj);
+                       shared->deviceRegistrationList->removeObject(obj);
                }
 
-               interface = pci;
+               shared->interface = pci;
        }
 
-       IOLockUnlock(interfaceLock);
+       IOLockUnlock(shared->interfaceLock);
 
        return result;
 }
index d7087fbb250ecbf665090ba38f0a1570bda6cca2..1fb74c64288dfdadcd17eeac6ed90a2e9c4019f2 100644 (file)
@@ -40,6 +40,7 @@
 #include <IOKit/IOTimeStamp.h>
 #include <IOKit/IOUserClient.h>
 #include <IOKit/IOKitDiagnosticsUserClient.h>
+#include <IOKit/IOUserServer.h>
 
 #include <IOKit/system.h>
 #include <sys/csr.h>
@@ -56,15 +57,26 @@ extern "C" {
 
 #define kShutdownTimeout    30 //in secs
 
-#if !CONFIG_EMBEDDED
+#if defined(XNU_TARGET_OS_OSX)
 
 boolean_t coprocessor_cross_panic_enabled = TRUE;
-#define APPLE_SECURE_BOOT_VARIABLE_GUID "94b73556-2197-4702-82a8-3e1337dafbfb"
-#endif /* !CONFIG_EMBEDDED */
+#define APPLE_VENDOR_VARIABLE_GUID "4d1ede05-38c7-4a6a-9cc6-4bcca8b38c14"
+#endif /* defined(XNU_TARGET_OS_OSX) */
 
 void printDictionaryKeys(OSDictionary * inDictionary, char * inMsg);
 static void getCStringForObject(OSObject *inObj, char *outStr, size_t outStrLen);
 
+/*
+ * There are drivers which take mutexes in the quiesce callout or pass
+ * the quiesce/active action to super.  Even though it sometimes panics,
+ * because it doesn't *always* panic, they get away with it.
+ * We need a chicken bit to diagnose and fix them all before this
+ * can be enabled by default.
+ *
+ * <rdar://problem/33831837> tracks turning this on by default.
+ */
+uint32_t gEnforceQuiesceSafety = 0;
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 #define super IOService
@@ -133,7 +145,7 @@ IOPlatformExpert::start( IOService * provider )
 
        // Register the presence or lack thereof a system
        // PCI address mapper with the IOMapper class
-       IOMapper::setMapperRequired(0 != getProperty(kIOPlatformMapperPresentKey));
+       IOMapper::setMapperRequired(NULL != getProperty(kIOPlatformMapperPresentKey));
 
        gIOInterruptControllers = OSDictionary::withCapacity(1);
        gIOInterruptControllersLock = IOLockAlloc();
@@ -172,6 +184,9 @@ IOPlatformExpert::start( IOService * provider )
        }
 #endif
 
+       PE_parse_boot_argn("enforce_quiesce_safety", &gEnforceQuiesceSafety,
+           sizeof(gEnforceQuiesceSafety));
+
        return configure(provider);
 }
 
@@ -190,7 +205,7 @@ IOPlatformExpert::configure( IOService * provider )
                        dict->retain();
                        topLevel->removeObject( dict );
                        nub = createNub( dict );
-                       if (0 == nub) {
+                       if (NULL == nub) {
                                continue;
                        }
                        dict->release();
@@ -211,7 +226,7 @@ IOPlatformExpert::createNub( OSDictionary * from )
        if (nub) {
                if (!nub->init( from )) {
                        nub->release();
-                       nub = 0;
+                       nub = NULL;
                }
        }
        return nub;
@@ -291,7 +306,7 @@ IOPlatformExpert::getPhysicalRangeAllocator(void)
                   getProperty("Platform Memory Ranges"));
 }
 
-int (*PE_halt_restart)(unsigned int type) = 0;
+int (*PE_halt_restart)(unsigned int type) = NULL;
 
 int
 IOPlatformExpert::haltRestart(unsigned int type)
@@ -408,7 +423,7 @@ IOPlatformExpert::lookUpInterruptController(OSSymbol *name)
        while (1) {
                object = gIOInterruptControllers->getObject(name);
 
-               if (object != 0) {
+               if (object != NULL) {
                        break;
                }
 
@@ -825,6 +840,9 @@ getCStringForObject(OSObject *inObj, char *outStr, size_t outStrLen)
 /* IOShutdownNotificationsTimedOut
  * - Called from a timer installed by PEHaltRestart
  */
+#ifdef CONFIG_EMBEDDED
+__abortlike
+#endif
 static void
 IOShutdownNotificationsTimedOut(
        thread_call_param_t p0,
@@ -900,6 +918,13 @@ PEHaltRestart(unsigned int type)
        static boolean_t  panic_begin_called = FALSE;
 
        if (type == kPEHaltCPU || type == kPERestartCPU || type == kPEUPSDelayHaltCPU) {
+               /* If we're in the panic path, the locks and memory allocations required below
+                *  could fail. So just try to reboot instead of risking a nested panic.
+                */
+               if (panic_begin_called) {
+                       goto skip_to_haltRestart;
+               }
+
                pmRootDomain = IOService::getPMRootDomain();
                /* Notify IOKit PM clients of shutdown/restart
                 *  Clients subscribe to this message with a call to
@@ -924,10 +949,20 @@ PEHaltRestart(unsigned int type)
                        }
                }
 
-               shutdown_hang = thread_call_allocate( &IOShutdownNotificationsTimedOut,
-                   (thread_call_param_t)(uintptr_t) type);
-               clock_interval_to_deadline( timeout, kSecondScale, &deadline );
-               thread_call_enter1_delayed( shutdown_hang, (thread_call_param_t)(uintptr_t)timeout, deadline );
+#if (DEVELOPMENT || DEBUG)
+               /* Override the default timeout via a boot-arg */
+               uint32_t boot_arg_val;
+               if (PE_parse_boot_argn("halt_restart_timeout", &boot_arg_val, sizeof(boot_arg_val))) {
+                       timeout = boot_arg_val;
+               }
+#endif
+
+               if (timeout) {
+                       shutdown_hang = thread_call_allocate( &IOShutdownNotificationsTimedOut,
+                           (thread_call_param_t)(uintptr_t) type);
+                       clock_interval_to_deadline( timeout, kSecondScale, &deadline );
+                       thread_call_enter1_delayed( shutdown_hang, (thread_call_param_t)(uintptr_t)timeout, deadline );
+               }
 
                pmRootDomain->handlePlatformHaltRestart(type);
                /* This notification should have few clients who all do
@@ -938,7 +973,8 @@ PEHaltRestart(unsigned int type)
                 *  later. PM internals make it very hard to wait for asynchronous
                 *  replies.
                 */
-       } else if (type == kPEPanicRestartCPU || type == kPEPanicSync) {
+       } else if (type == kPEPanicRestartCPU || type == kPEPanicSync || type == kPEPanicRestartCPUNoPanicEndCallouts ||
+           type == kPEPanicRestartCPUNoCallouts) {
                if (type == kPEPanicRestartCPU) {
                        // Notify any listeners that we're done collecting
                        // panic data before we call through to do the restart
@@ -946,13 +982,20 @@ PEHaltRestart(unsigned int type)
                        if (coprocessor_cross_panic_enabled)
 #endif
                        IOCPURunPlatformPanicActions(kPEPanicEnd);
+               }
 
+               if ((type == kPEPanicRestartCPU) || (type == kPEPanicRestartCPUNoPanicEndCallouts)) {
                        // Callout to shutdown the disk driver once we've returned from the
-                       // kPEPanicEnd callback (and we know all core dumps on this system
-                       // are complete).
+                       // kPEPanicEnd callbacks (if appropriate) and we know all coredumps
+                       // on this system are complete.
                        IOCPURunPlatformPanicActions(kPEPanicDiskShutdown);
                }
 
+               if (type == kPEPanicRestartCPUNoPanicEndCallouts || type == kPEPanicRestartCPUNoCallouts) {
+                       // Replace the wrapper type with the type drivers handle
+                       type = kPEPanicRestartCPU;
+               }
+
                // Do an initial sync to flush as much panic data as possible,
                // in case we have a problem in one of the platform panic handlers.
                // After running the platform handlers, do a final sync w/
@@ -978,6 +1021,7 @@ PEHaltRestart(unsigned int type)
                }
        }
 
+skip_to_haltRestart:
        if (gIOPlatform) {
                return gIOPlatform->haltRestart(type);
        } else {
@@ -988,7 +1032,7 @@ PEHaltRestart(unsigned int type)
 UInt32
 PESavePanicInfo(UInt8 *buffer, UInt32 length)
 {
-       if (gIOPlatform != 0) {
+       if (gIOPlatform != NULL) {
                return gIOPlatform->savePanicInfo(buffer, length);
        } else {
                return 0;
@@ -1268,7 +1312,7 @@ IOPlatformExpert::registerNVRAMController(IONVRAMController * caller)
 {
        OSData *          data;
        IORegistryEntry * entry;
-       OSString *        string = 0;
+       OSString *        string = NULL;
        uuid_string_t     uuid;
 
 #if CONFIG_EMBEDDED
@@ -1302,20 +1346,22 @@ IOPlatformExpert::registerNVRAMController(IONVRAMController * caller)
 
                entry->release();
        }
-#else /* !CONFIG_EMBEDDED */
+#endif /* CONFIG_EMBEDDED */
+
+#if defined(XNU_TARGET_OS_OSX)
        /*
-        * If we have panic debugging enabled and a prod-fused coprocessor,
+        * If we have panic debugging enabled and the bridgeOS panic SoC watchdog is enabled,
         * disable cross panics so that the co-processor doesn't cause the system
         * to reset when we enter the debugger or hit a panic on the x86 side.
         */
        if (panicDebugging) {
                entry = IORegistryEntry::fromPath( "/options", gIODTPlane );
                if (entry) {
-                       data = OSDynamicCast( OSData, entry->getProperty( APPLE_SECURE_BOOT_VARIABLE_GUID":EffectiveProductionStatus" ));
+                       data = OSDynamicCast( OSData, entry->getProperty( APPLE_VENDOR_VARIABLE_GUID":BridgeOSPanicWatchdogEnabled" ));
                        if (data && (data->getLength() == sizeof(UInt8))) {
-                               UInt8 *isProdFused = (UInt8 *) data->getBytesNoCopy();
+                               UInt8 *panicWatchdogEnabled = (UInt8 *) data->getBytesNoCopy();
                                UInt32 debug_flags = 0;
-                               if (*isProdFused || (PE_i_can_has_debugger(&debug_flags) &&
+                               if (*panicWatchdogEnabled || (PE_i_can_has_debugger(&debug_flags) &&
                                    (debug_flags & DB_DISABLE_CROSS_PANIC))) {
                                        coprocessor_cross_panic_enabled = FALSE;
                                }
@@ -1346,9 +1392,9 @@ IOPlatformExpert::registerNVRAMController(IONVRAMController * caller)
 
                entry->release();
        }
-#endif /* !CONFIG_EMBEDDED */
+#endif /* defined(XNU_TARGET_OS_OSX) */
 
-       if (string == 0) {
+       if (string == NULL) {
                entry = IORegistryEntry::fromPath( "/options", gIODTPlane );
                if (entry) {
                        data = OSDynamicCast( OSData, entry->getProperty( "platform-uuid" ));
@@ -1379,17 +1425,29 @@ IOPlatformExpert::callPlatformFunction(const OSSymbol *functionName,
 {
        IOService *service, *_resources;
 
+       if (functionName == gIOPlatformQuiesceActionKey ||
+           functionName == gIOPlatformActiveActionKey) {
+               /*
+                * Services which register for IOPlatformQuiesceAction / IOPlatformActiveAction
+                * must consume that event themselves, without passing it up to super/IOPlatformExpert.
+                */
+               if (gEnforceQuiesceSafety) {
+                       panic("Class %s passed the quiesce/active action to IOPlatformExpert",
+                           getMetaClass()->getClassName());
+               }
+       }
+
        if (waitForFunction) {
                _resources = waitForService(resourceMatching(functionName));
        } else {
                _resources = getResourceService();
        }
-       if (_resources == 0) {
+       if (_resources == NULL) {
                return kIOReturnUnsupported;
        }
 
        service = OSDynamicCast(IOService, _resources->getProperty(functionName));
-       if (service == 0) {
+       if (service == NULL) {
                return kIOReturnUnsupported;
        }
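
For driver authors hit by the new enforcement above, a hedged sketch of a
compliant override (MyDriver and its super are hypothetical): a service that
registered for IOPlatformQuiesceAction / IOPlatformActiveAction consumes the
event itself rather than delegating to IOPlatformExpert:

	IOReturn
	MyDriver::callPlatformFunction(const OSSymbol *functionName,
	    bool waitForFunction, void *p1, void *p2, void *p3, void *p4)
	{
		if (functionName == gIOPlatformQuiesceActionKey ||
		    functionName == gIOPlatformActiveActionKey) {
			// Consume the action here; don't take mutexes and don't pass it
			// to super, which panics when enforce_quiesce_safety is set.
			return kIOReturnSuccess;
		}
		return super::callPlatformFunction(functionName, waitForFunction,
		           p1, p2, p3, p4);
	}
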
 
@@ -1426,12 +1484,12 @@ IODTPlatformExpert::probe( IOService * provider,
     SInt32 * score )
 {
        if (!super::probe( provider, score)) {
-               return 0;
+               return NULL;
        }
 
        // check machine types
        if (!provider->compareNames( getProperty( gIONameMatchKey ))) {
-               return 0;
+               return NULL;
        }
 
        return this;
@@ -1458,7 +1516,7 @@ IODTPlatformExpert::createNub( IORegistryEntry * from )
        if (nub) {
                if (!nub->init( from, gIODTPlane )) {
                        nub->free();
-                       nub = 0;
+                       nub = NULL;
                }
        }
        return nub;
@@ -1473,7 +1531,7 @@ IODTPlatformExpert::createNubs( IOService * parent, OSIterator * iter )
 
        if (iter) {
                while ((next = (IORegistryEntry *) iter->getNextObject())) {
-                       if (0 == (nub = createNub( next ))) {
+                       if (NULL == (nub = createNub( next ))) {
                                continue;
                        }
 
@@ -1510,7 +1568,7 @@ IODTPlatformExpert::processTopLevel( IORegistryEntry * rootEntry )
                if (dtNVRAM) {
                        if (!dtNVRAM->init(options, gIODTPlane)) {
                                dtNVRAM->release();
-                               dtNVRAM = 0;
+                               dtNVRAM = NULL;
                        } else {
                                dtNVRAM->attach(this);
                                dtNVRAM->registerService();
@@ -1522,7 +1580,7 @@ IODTPlatformExpert::processTopLevel( IORegistryEntry * rootEntry )
        // Publish the cpus.
        cpus = rootEntry->childFromPath( "cpus", gIODTPlane);
        if (cpus) {
-               createNubs( this, IODTFindMatchingEntries( cpus, kIODTExclusive, 0));
+               createNubs( this, IODTFindMatchingEntries( cpus, kIODTExclusive, NULL));
                cpus->release();
        }
 
@@ -1537,7 +1595,7 @@ IODTPlatformExpert::getNubResources( IOService * nub )
                return kIOReturnSuccess;
        }
 
-       IODTResolveAddressing( nub, "reg", 0);
+       IODTResolveAddressing( nub, "reg", NULL);
 
        return kIOReturnSuccess;
 }
@@ -1595,7 +1653,7 @@ IODTPlatformExpert::getMachineName( char * name, int maxLength )
 
        maxLength--;
        prop = (OSData *) getProvider()->getProperty( gIODTModelKey );
-       ok = (0 != prop);
+       ok = (NULL != prop);
 
        if (ok) {
                strlcpy( name, (const char *) prop->getBytesNoCopy(), maxLength );
@@ -1678,7 +1736,7 @@ IODTPlatformExpert::getNVRAMPartitions(void)
        if (dtNVRAM) {
                return dtNVRAM->getNVRAMPartitions();
        } else {
-               return 0;
+               return NULL;
        }
 }
 
@@ -1786,7 +1844,7 @@ bool
 IOPlatformExpertDevice::initWithArgs(
        void * dtTop, void * p2, void * p3, void * p4 )
 {
-       IORegistryEntry *   dt = 0;
+       IORegistryEntry *   dt = NULL;
        bool                ok;
 
        // dtTop may be zero on non- device tree systems
@@ -1826,8 +1884,8 @@ IOPlatformExpertDevice::newUserClient( task_t owningTask, void * securityID,
     IOUserClient ** handler )
 {
        IOReturn            err = kIOReturnSuccess;
-       IOUserClient *      newConnect = 0;
-       IOUserClient *      theConnect = 0;
+       IOUserClient *      newConnect = NULL;
+       IOUserClient *      theConnect = NULL;
 
        switch (type) {
        case kIOKitDiagnosticsClientType:
@@ -1836,6 +1894,12 @@ IOPlatformExpertDevice::newUserClient( task_t owningTask, void * securityID,
                        err = kIOReturnNotPermitted;
                }
                break;
+       case kIOKitUserServerClientType:
+               newConnect = IOUserServer::withTask(owningTask);
+               if (!newConnect) {
+                       err = kIOReturnNotPermitted;
+               }
+               break;
        default:
                err = kIOReturnBadArgument;
        }
index cb982b0d6b26e9918746eddbc7ad675c78adb75b..d36c0c6db75fdafc64a378cb11b499d1c13d159c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -73,7 +73,7 @@ enum { kDefaultIOSize = 128 * 1024 };
 
 class IOPolledFilePollers : public OSObject
 {
-       OSDeclareDefaultStructors(IOPolledFilePollers)
+       OSDeclareDefaultStructors(IOPolledFilePollers);
 
 public:
        IOService                * media;
@@ -188,7 +188,7 @@ IOPolledFilePollersOpen(IOPolledFileIOVars * filevars, uint32_t state, bool abor
        int32_t                    idx;
 
        vars->abortable = abortable;
-       ioBuffer = 0;
+       ioBuffer = NULL;
 
        if (kIOPolledAfterSleepState == state) {
                vars->ioStatus = 0;
@@ -270,7 +270,7 @@ IOPolledFilePollersClose(IOPolledFileIOVars * filevars, uint32_t state)
                        }
                        if (vars->ioBuffer) {
                                vars->ioBuffer->release();
-                               vars->ioBuffer = 0;
+                               vars->ioBuffer = NULL;
                        }
                }while (false);
        }
@@ -348,7 +348,7 @@ IOStartPolledIO(IOPolledFilePollers * vars,
                return err;
        }
 
-       completion.target    = 0;
+       completion.target    = NULL;
        completion.action    = &IOPolledIOComplete;
        completion.parameter = vars;
 
@@ -452,11 +452,11 @@ IOCopyMediaForDev(dev_t device)
        OSDictionary * matching;
        OSNumber *     num;
        OSIterator *   iter;
-       IOService *    result = 0;
+       IOService *    result = NULL;
 
        matching = IOService::serviceMatching("IOMedia");
        if (!matching) {
-               return 0;
+               return NULL;
        }
        do{
                num = OSNumber::withNumber(major(device), 32);
@@ -489,13 +489,15 @@ IOCopyMediaForDev(dev_t device)
 #define APFSMEDIA_GETHIBERKEY         "getHiberKey"
 
 static IOReturn
-IOGetVolumeCryptKey(dev_t block_dev, OSString ** pKeyUUID,
-    uint8_t * volumeCryptKey, size_t * keySize)
+IOGetVolumeCryptKey(dev_t block_dev,
+    LIBKERN_RETURNS_RETAINED OSString ** pKeyUUID,
+    uint8_t * volumeCryptKey,
+    size_t * keySize)
 {
        IOReturn         err;
        IOService *      part;
-       OSString *       keyUUID = 0;
-       OSString *       keyStoreUUID = 0;
+       OSString *       keyUUID = NULL;
+       OSString *       keyStoreUUID = NULL;
        uuid_t           volumeKeyUUID;
        aks_volume_key_t vek;
        size_t           callerKeySize;
@@ -585,7 +587,7 @@ IOPolledFileOpen(const char * filename,
        _OpenFileContext     ctx;
        OSData *             extentsData = NULL;
        OSNumber *           num;
-       IOService *          part = 0;
+       IOService *          part = NULL;
        dev_t                block_dev;
        dev_t                image_dev;
        AbsoluteTime         startTime, endTime;
@@ -694,7 +696,7 @@ IOPolledFileOpen(const char * filename,
                                (void *) part, (void *) str2,
                                (void *) (uintptr_t) true, (void *) &data);
 #else
-                       data = 0;
+                       data = NULL;
                        err = kIOReturnSuccess;
 #endif
                        if (kIOReturnSuccess != err) {
@@ -713,7 +715,7 @@ IOPolledFileOpen(const char * filename,
 
        if (kIOReturnSuccess != err) {
                HIBLOG("error 0x%x opening polled file\n", err);
-               IOPolledFileClose(&vars, 0, 0, 0, 0, 0);
+               IOPolledFileClose(&vars, 0, NULL, 0, 0, 0);
                if (extentsData) {
                        extentsData->release();
                }
@@ -747,11 +749,11 @@ IOPolledFileClose(IOPolledFileIOVars ** pVars,
        }
        if (vars->fileExtents) {
                vars->fileExtents->release();
-               vars->fileExtents = 0;
+               vars->fileExtents = NULL;
        }
        if (vars->pollers) {
                vars->pollers->release();
-               vars->pollers = 0;
+               vars->pollers = NULL;
        }
 
        if (vars->allocated) {
@@ -1032,7 +1034,7 @@ IOPolledFileRead(IOPolledFileIOVars * vars,
 
                if ((vars->bufferOffset == vars->bufferLimit) && (vars->position < vars->readEnd)) {
                        if (!vars->pollers->io) {
-                               cryptvars = 0;
+                               cryptvars = NULL;
                        }
                        err = IOPolledFilePollersIODone(vars->pollers, true);
                        if (kIOReturnSuccess != err) {
index aca4f6790e259cabb37400d92ab90fac9a2f8bf9..6e114869755ca824d2f94f5b1d0080aca8e690ae 100644 (file)
@@ -82,7 +82,7 @@ IORangeAllocator::init( IORangeScalar endOfRange,
        capacity            = 0;
        capacityIncrement   = _capacity;
        numElements         = 0;
-       elements            = 0;
+       elements            = NULL;
        defaultAlignmentMask = _defaultAlignment - 1;
        options             = _options;
 
@@ -110,7 +110,7 @@ IORangeAllocator::withRange(
        if (thingy && !thingy->init( endOfRange, defaultAlignment,
            capacity, options )) {
                thingy->release();
-               thingy = 0;
+               thingy = NULL;
        }
 
        return thingy;
index 31e8ca1a9309208eabc033fa29e90caba580b8e0..45b3f42e6c25ba6892f7372f9a432577fb5c3d80 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -90,7 +90,7 @@ static uint64_t gIORegistryLastID = kIORegistryIDReserved;
 class IORegistryPlane : public OSObject {
        friend class IORegistryEntry;
 
-       OSDeclareAbstractStructors(IORegistryPlane)
+       OSDeclareAbstractStructors(IORegistryPlane);
 
        const OSSymbol *    nameKey;
        const OSSymbol *    keys[kNumSetIndex];
@@ -255,7 +255,7 @@ IORegistryEntry::makePlane( const char * name )
                if (nameKey) {
                        nameKey->release();
                }
-               plane = 0;
+               plane = NULL;
        }
 
        return plane;
@@ -450,6 +450,7 @@ IORegistryEntry::free( void )
 void
 IORegistryEntry::setPropertyTable( OSDictionary * dict )
 {
+       PLOCK;
        if (dict) {
                dict->retain();
        }
@@ -458,6 +459,7 @@ IORegistryEntry::setPropertyTable( OSDictionary * dict )
        }
 
        fPropertyTable = dict;
+       PUNLOCK;
 }
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
@@ -487,13 +489,13 @@ IORegistryEntry::getProperty( type *                  aKey, \
 { \
     OSObject * obj = getProperty( aKey ); \
     \
-    if ( (0 == obj) && plane && (options & kIORegistryIterateRecursively) ) { \
+    if ( (NULL == obj) && plane && (options & kIORegistryIterateRecursively) ) { \
        IORegistryEntry * entry = (IORegistryEntry *) this; \
        IORegistryIterator * iter; \
        iter = IORegistryIterator::iterateOver( entry, plane, options ); \
         \
        if(iter) { \
-           while ( (0 == obj) && (entry = iter->getNextObject()) ) { \
+           while ( (NULL == obj) && (entry = iter->getNextObject()) ) { \
                obj = entry->getProperty( aKey ); \
            } \
            iter->release(); \
@@ -511,13 +513,13 @@ IORegistryEntry::copyProperty( type *                  aKey, \
 { \
     OSObject * obj = copyProperty( aKey ); \
     \
-    if ( (0 == obj) && plane && (options & kIORegistryIterateRecursively) ) { \
+    if ( (NULL == obj) && plane && (options & kIORegistryIterateRecursively) ) { \
        IORegistryEntry * entry = (IORegistryEntry *) this; \
        IORegistryIterator * iter; \
        iter = IORegistryIterator::iterateOver( entry, plane, options ); \
         \
        if(iter) { \
-           while ( (0 == obj) && (entry = iter->getNextObject()) ) { \
+           while ( (NULL == obj) && (entry = iter->getNextObject()) ) { \
                obj = entry->copyProperty( aKey ); \
            } \
            iter->release(); \
@@ -796,14 +798,14 @@ IORegistryEntry::setIndexedProperty(uint32_t index, OSObject * anObject)
        OSObject *  prior;
 
        if (index >= kIORegistryEntryIndexedPropertyCount) {
-               return 0;
+               return NULL;
        }
 
        array = atomic_load_explicit(&reserved->fIndexedProperties, memory_order_acquire);
        if (!array) {
                array = IONew(OSObject *, kIORegistryEntryIndexedPropertyCount);
                if (!array) {
-                       return 0;
+                       return NULL;
                }
                bzero(array, kIORegistryEntryIndexedPropertyCount * sizeof(array[0]));
                if (!OSCompareAndSwapPtr(NULL, array, &reserved->fIndexedProperties)) {
@@ -811,7 +813,7 @@ IORegistryEntry::setIndexedProperty(uint32_t index, OSObject * anObject)
                }
        }
        if (!reserved->fIndexedProperties) {
-               return 0;
+               return NULL;
        }
 
        prior = reserved->fIndexedProperties[index];
@@ -827,10 +829,10 @@ OSObject *
 IORegistryEntry::getIndexedProperty(uint32_t index) const
 {
        if (index >= kIORegistryEntryIndexedPropertyCount) {
-               return 0;
+               return NULL;
        }
        if (!reserved->fIndexedProperties) {
-               return 0;
+               return NULL;
        }
 
        return reserved->fIndexedProperties[index];
@@ -843,7 +845,7 @@ IORegistryEntry::getIndexedProperty(uint32_t index) const
 const char *
 IORegistryEntry::getName( const IORegistryPlane * plane ) const
 {
-       OSSymbol *          sym = 0;
+       OSSymbol *          sym = NULL;
 
        RLOCK;
        if (plane) {
@@ -865,7 +867,7 @@ const OSSymbol *
 IORegistryEntry::copyName(
        const IORegistryPlane * plane ) const
 {
-       OSSymbol *          sym = 0;
+       OSSymbol *          sym = NULL;
 
        RLOCK;
        if (plane) {
@@ -890,7 +892,7 @@ const OSSymbol *
 IORegistryEntry::copyLocation(
        const IORegistryPlane * plane ) const
 {
-       OSSymbol *          sym = 0;
+       OSSymbol *          sym = NULL;
 
        RLOCK;
        if (plane) {
@@ -911,7 +913,7 @@ const char *
 IORegistryEntry::getLocation( const IORegistryPlane * plane ) const
 {
        const OSSymbol *    sym = copyLocation( plane );
-       const char *        result = 0;
+       const char *        result = NULL;
 
        if (sym) {
                result = sym->getCStringNoCopy();
@@ -963,6 +965,17 @@ IORegistryEntry::setName( const char * name,
        }
 }
 
+void
+IORegistryEntry::setName( const OSString * name,
+    const IORegistryPlane * plane )
+{
+       const OSSymbol * sym = OSSymbol::withString( name );
+       if (sym) {
+               setName( sym, plane );
+               sym->release();
+       }
+}
+
 void
 IORegistryEntry::setLocation( const OSSymbol * location,
     const IORegistryPlane * plane )
@@ -1018,12 +1031,12 @@ IORegistryEntry::compareNames( OSObject * names, OSString ** matched ) const
 {
        OSString *          string;
        OSCollection *      collection;
-       OSIterator *        iter = 0;
+       OSIterator *        iter = NULL;
        bool                result = false;
 
        if ((collection = OSDynamicCast( OSCollection, names))) {
                iter = OSCollectionIterator::withCollection( collection );
-               string = 0;
+               string = NULL;
        } else {
                string = OSDynamicCast( OSString, names);
        }
@@ -1100,7 +1113,7 @@ IORegistryEntry::getPath(  char * path, int * length,
                stack->setObject((OSObject *) entry );
        }
 
-       ok = (0 != parent);
+       ok = (NULL != parent);
        if (ok) {
                index = stack->getCount();
                if (0 == index) {
@@ -1184,7 +1197,7 @@ IORegistryEntry::matchPathLocation( const char * cmp,
     const IORegistryPlane * plane )
 {
        const char  *       str;
-       const char  *       result = 0;
+       const char  *       result = NULL;
        u_quad_t            num1, num2;
        char                lastPathChar, lastLocationChar;
 
@@ -1233,11 +1246,11 @@ IORegistryEntry *
 IORegistryEntry::getChildFromComponent( const char ** opath,
     const IORegistryPlane * plane )
 {
-       IORegistryEntry *   entry = 0;
+       IORegistryEntry *   entry = NULL;
        OSArray *           set;
        unsigned int        index;
        const char *        path;
-       const char *        cmp = 0;
+       const char *        cmp = NULL;
        char                c;
        size_t              len;
        const char *        str;
@@ -1287,7 +1300,7 @@ IORegistryEntry::hasAlias( const IORegistryPlane * plane,
        IORegistryEntry *   entry;
        IORegistryEntry *   entry2;
        const OSSymbol *    key;
-       const OSSymbol *    bestKey = 0;
+       const OSSymbol *    bestKey = NULL;
        OSIterator *        iter;
        OSData *            data;
        const char *        path = "/aliases";
@@ -1328,7 +1341,7 @@ IORegistryEntry::dealiasPath(
        IORegistryEntry *   entry;
        OSData *            data;
        const char *        path = *opath;
-       const char *        rpath = 0;
+       const char *        rpath = NULL;
        const char *        end;
        char                c;
        char                temp[kIOMaxPlaneName + 1];
@@ -1371,8 +1384,8 @@ IORegistryEntry::fromPath(
        int *                   length,
        IORegistryEntry *       fromEntry )
 {
-       IORegistryEntry *   where = 0;
-       IORegistryEntry *   aliasEntry = 0;
+       IORegistryEntry *   where = NULL;
+       IORegistryEntry *   aliasEntry = NULL;
        IORegistryEntry *   next;
        const char *        alias;
        const char *        end;
@@ -1381,11 +1394,11 @@ IORegistryEntry::fromPath(
        char                c;
        char                temp[kIOMaxPlaneName + 1];
 
-       if (0 == path) {
-               return 0;
+       if (NULL == path) {
+               return NULL;
        }
 
-       if (0 == plane) {
+       if (NULL == plane) {
                // get plane name
                end = strchr( path, ':' );
                if (end && ((end - path) < kIOMaxPlaneName)) {
@@ -1394,8 +1407,8 @@ IORegistryEntry::fromPath(
                        path = end + 1;
                }
        }
-       if (0 == plane) {
-               return 0;
+       if (NULL == plane) {
+               return NULL;
        }
 
        // check for alias
@@ -1417,19 +1430,19 @@ IORegistryEntry::fromPath(
        RLOCK;
 
        do {
-               if (0 == where) {
-                       if ((0 == fromEntry) && (*path++ == '/')) {
+               if (NULL == where) {
+                       if ((NULL == fromEntry) && (*path++ == '/')) {
                                fromEntry = gRegistryRoot->getChildEntry( plane );
                        }
                        where = fromEntry;
-                       if (0 == where) {
+                       if (NULL == where) {
                                break;
                        }
                } else {
                        c = *path++;
                        if (c != '/') {
                                if (c && (c != ':')) { // check valid terminator
-                                       where = 0;
+                                       where = NULL;
                                }
                                break;
                        }
@@ -1455,7 +1468,7 @@ IORegistryEntry::fromPath(
                        *length = (len + len2);
                } else if (path[0]) {
                        // no residual path => must be no tail for success
-                       where = 0;
+                       where = NULL;
                }
        }
 
@@ -1523,7 +1536,7 @@ IORegistryEntry::makeLink( IORegistryEntry * to,
                }
        } else {
                links = OSArray::withObjects((const OSObject **) &to, 1, 1 );
-               result = (links != 0);
+               result = (links != NULL);
                if (result) {
                        result = registryTable()->setObject( plane->keys[relation],
                            links );
@@ -1564,7 +1577,7 @@ IORegistryEntry::getParentSetReference(
                return (OSArray *) registryTable()->getObject(
                        plane->keys[kParentSetIndex]);
        } else {
-               return 0;
+               return NULL;
        }
 }
 
@@ -1576,12 +1589,12 @@ IORegistryEntry::getParentIterator(
        OSIterator *        iter;
 
        if (!plane) {
-               return 0;
+               return NULL;
        }
 
        RLOCK;
        links = getParentSetReference( plane );
-       if (0 == links) {
+       if (NULL == links) {
                links = OSArray::withCapacity( 1 );
        } else {
                links = OSArray::withArray( links, links->getCount());
@@ -1600,7 +1613,7 @@ IORegistryEntry::getParentIterator(
 IORegistryEntry *
 IORegistryEntry::copyParentEntry( const IORegistryPlane * plane ) const
 {
-       IORegistryEntry *   entry = 0;
+       IORegistryEntry *   entry = NULL;
        OSArray *           links;
 
        RLOCK;
@@ -1635,7 +1648,7 @@ IORegistryEntry::getChildSetReference( const IORegistryPlane * plane ) const
                return (OSArray *) registryTable()->getObject(
                        plane->keys[kChildSetIndex]);
        } else {
-               return 0;
+               return NULL;
        }
 }
 
@@ -1646,12 +1659,12 @@ IORegistryEntry::getChildIterator( const IORegistryPlane * plane ) const
        OSIterator *        iter;
 
        if (!plane) {
-               return 0;
+               return NULL;
        }
 
        RLOCK;
        links = getChildSetReference( plane );
-       if (0 == links) {
+       if (NULL == links) {
                links = OSArray::withCapacity( 1 );
        } else {
                links = OSArray::withArray( links, links->getCount());
@@ -1687,7 +1700,7 @@ IORegistryEntry *
 IORegistryEntry::copyChildEntry(
        const IORegistryPlane * plane ) const
 {
-       IORegistryEntry *   entry = 0;
+       IORegistryEntry *   entry = NULL;
        OSArray *           links;
 
        RLOCK;
@@ -1824,7 +1837,7 @@ IORegistryEntry::inPlane( const IORegistryPlane * plane ) const
        RLOCK;
 
        if (plane) {
-               ret = (0 != getParentSetReference( plane ));
+               ret = (NULL != getParentSetReference( plane ));
        } else {
                // Check to see if this is in any plane.  If it is in a plane
                // then the registryTable will contain a key with the ParentLinks
@@ -2055,7 +2068,7 @@ IORegistryEntry::detachAll( const IORegistryPlane * plane )
        IORegistryIterator *        regIter;
 
        regIter = IORegistryIterator::iterateOver( this, plane, true );
-       if (0 == regIter) {
+       if (NULL == regIter) {
                return;
        }
        all = regIter->iterateAll();
@@ -2134,11 +2147,11 @@ IORegistryIterator::iterateOver( IORegistryEntry * root,
 {
        IORegistryIterator *        create;
 
-       if (0 == root) {
-               return 0;
+       if (NULL == root) {
+               return NULL;
        }
-       if (0 == plane) {
-               return 0;
+       if (NULL == plane) {
+               return NULL;
        }
 
        create = new IORegistryIterator;
@@ -2152,7 +2165,7 @@ IORegistryIterator::iterateOver( IORegistryEntry * root,
                        create->options = options & ~kIORegistryIteratorInvalidFlag;
                } else {
                        create->release();
-                       create = 0;
+                       create = NULL;
                }
        }
        return create;
@@ -2198,7 +2211,7 @@ IORegistryIterator::enterEntry( const IORegistryPlane * enterPlane )
        assert( where);
 
        if (where) {
-               where->iter = 0;
+               where->iter = NULL;
                where->next = prev;
                where->current = prev->current;
                plane = enterPlane;
@@ -2218,7 +2231,7 @@ IORegistryIterator::exitEntry( void )
 
        if (where->iter) {
                where->iter->release();
-               where->iter = 0;
+               where->iter = NULL;
                if (where->current) {// && (where != &start))
                        where->current->release();
                }
@@ -2242,7 +2255,7 @@ IORegistryIterator::reset( void )
 
        if (done) {
                done->release();
-               done = 0;
+               done = NULL;
        }
 
        where->current = root;
@@ -2265,12 +2278,12 @@ IORegistryIterator::free( void )
 IORegistryEntry *
 IORegistryIterator::getNextObjectFlat( void )
 {
-       IORegistryEntry *   next = 0;
-       OSArray *           links = 0;
+       IORegistryEntry *   next = NULL;
+       OSArray *           links = NULL;
 
        RLOCK;
 
-       if ((0 == where->iter)) {
+       if ((NULL == where->iter)) {
                // just entered - create new iter
                if (isValid()
                    && where->current
@@ -2309,10 +2322,10 @@ IORegistryIterator::getNextObjectRecursive( void )
 
        do{
                next = getNextObjectFlat();
-       } while ((0 == next) && exitEntry());
+       } while ((NULL == next) && exitEntry());
 
        if (next) {
-               if (0 == done) {
+               if (NULL == done) {
                        done = OSOrderedSet::withCapacity( 10 );
                }
                if (done->setObject((OSObject *) next)) {
@@ -2339,7 +2352,7 @@ IORegistryIterator::getCurrentEntry( void )
        if (isValid()) {
                return where->current;
        } else {
-               return 0;
+               return NULL;
        }
 }
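Nearly every hunk above is a mechanical substitution of the literal 0 with NULL in pointer contexts (initializers, comparisons, returns), a pattern that continues through the rest of this diff; behavior is unchanged, but each expression's intent becomes explicit. A minimal user-space sketch of the convention, including the constant-first `NULL == ptr` operand order these files favor (the identifiers below are illustrative, not from xnu):

    #include <cstddef>

    struct Node { Node * next; };

    int main()
    {
        Node * head = NULL;    // reads as "no object", unlike a bare 0
        if (NULL == head) {    // constant-first order guards against '=' typos
            head = nullptr;    // C++11's type-safe successor, where available
        }
        return (head == nullptr) ? 0 : 1;
    }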
 
index 5ef9f43253e1e91ecb66899a659831abde8c32de..7e5abfb411192ba1c22386ef40e7223d301af932 100644
  */
 
 #include <IOKit/system.h>
-
 #include <IOKit/IOService.h>
 #include <libkern/OSDebug.h>
 #include <libkern/c++/OSContainers.h>
 #include <libkern/c++/OSKext.h>
 #include <libkern/c++/OSUnserialize.h>
+#include <libkern/c++/OSKext.h>
 #include <libkern/Block.h>
 #include <IOKit/IOCatalogue.h>
 #include <IOKit/IOCommand.h>
@@ -46,6 +46,7 @@
 #include <IOKit/IOKitKeysPrivate.h>
 #include <IOKit/IOBSD.h>
 #include <IOKit/IOUserClient.h>
+#include <IOKit/IOUserServer.h>
 #include <IOKit/IOWorkLoop.h>
 #include <IOKit/IOTimeStamp.h>
 #include <IOKit/IOHibernatePrivate.h>
@@ -55,6 +56,7 @@
 #include <IOKit/pwr_mgt/RootDomain.h>
 #include <IOKit/IOCPU.h>
 #include <mach/sync_policy.h>
+#include <mach/thread_info.h>
 #include <IOKit/assert.h>
 #include <sys/errno.h>
 #include <sys/kdebug.h>
 // disabled since lockForArbitration() can be held externally
 #define DEBUG_NOTIFIER_LOCKED   0
 
+enum{
+       kIOUserServerCheckInTimeoutSecs = 120ULL
+};
+
 #include "IOServicePrivate.h"
 #include "IOKitKernelInternal.h"
 
@@ -91,6 +97,7 @@ OSDefineMetaClassAndStructors(_IOConfigThread, OSObject)
 OSDefineMetaClassAndStructors(_IOServiceJob, OSObject)
 
 OSDefineMetaClassAndStructors(IOResources, IOService)
+OSDefineMetaClassAndStructors(IOUserResources, IOService)
 
 OSDefineMetaClassAndStructors(_IOOpenServiceIterator, OSIterator)
 
@@ -107,8 +114,11 @@ const OSSymbol *                gIOInterruptControllersKey;
 const OSSymbol *                gIOInterruptSpecifiersKey;
 
 const OSSymbol *                gIOResourcesKey;
+const OSSymbol *                gIOUserResourcesKey;
 const OSSymbol *                gIOResourceMatchKey;
 const OSSymbol *                gIOResourceMatchedKey;
+const OSSymbol *                gIOResourceIOKitKey;
+
 const OSSymbol *                gIOProviderClassKey;
 const OSSymbol *                gIONameMatchKey;
 const OSSymbol *                gIONameMatchedKey;
@@ -120,12 +130,24 @@ const OSSymbol *                gIOPathMatchKey;
 const OSSymbol *                gIOMatchCategoryKey;
 const OSSymbol *                gIODefaultMatchCategoryKey;
 const OSSymbol *                gIOMatchedServiceCountKey;
+const OSSymbol *                gIOMatchedPersonalityKey;
+const OSSymbol *                gIORematchPersonalityKey;
+const OSSymbol *                gIORematchCountKey;
+const OSSymbol *                gIODEXTMatchCountKey;
 #if !CONFIG_EMBEDDED
 const OSSymbol *                gIOServiceLegacyMatchingRegistryIDKey;
 #endif
 
 const OSSymbol *                gIOMapperIDKey;
 const OSSymbol *                gIOUserClientClassKey;
+
+const OSSymbol *                gIOUserClassKey;
+const OSSymbol *                gIOUserServerClassKey;
+const OSSymbol *                gIOUserServerNameKey;
+const OSSymbol *                gIOUserServerTagKey;
+const OSSymbol *                gIOUserServerCDHashKey;
+const OSSymbol *                gIOUserUserClientKey;
+
 const OSSymbol *                gIOKitDebugKey;
 
 const OSSymbol *                gIOCommandPoolSizeKey;
@@ -158,6 +180,11 @@ const OSSymbol *                gIOFirstMatchNotification;
 const OSSymbol *                gIOTerminatedNotification;
 const OSSymbol *                gIOWillTerminateNotification;
 
+const OSSymbol *                gIOServiceDEXTEntitlementsKey;
+const OSSymbol *                gIODriverKitEntitlementKey;
+const OSSymbol *                gIODriverKitUserClientEntitlementsKey;
+const OSSymbol *                gIOMatchDeferKey;
+
 const OSSymbol *                gIOGeneralInterest;
 const OSSymbol *                gIOBusyInterest;
 const OSSymbol *                gIOAppPowerStateInterest;
@@ -179,6 +206,7 @@ static OSDictionary *           gNotifications;
 static IORecursiveLock *        gNotificationLock;
 
 static IOService *              gIOResources;
+static IOService *              gIOUserResources;
 static IOService *              gIOServiceRoot;
 
 static OSOrderedSet *           gJobs;
@@ -189,6 +217,7 @@ static int                      gNumConfigThreads;
 static int                      gNumWaitingThreads;
 static IOLock *                 gIOServiceBusyLock;
 bool                            gCPUsRunning;
+bool                            gKextdWillTerminate;
 
 static thread_t                 gIOTerminateThread;
 static thread_t                 gIOTerminateWorkerThread;
@@ -198,6 +227,10 @@ static OSArray *                gIOStopList;
 static OSArray *                gIOStopProviderList;
 static OSArray *                gIOFinalizeList;
 
+#if !NO_KEXTD
+static OSArray *                gIOMatchDeferList;
+#endif
+
 static SInt32                   gIOConsoleUsersSeed;
 static OSData *                 gIOConsoleUsersSeedValue;
 
@@ -205,10 +238,13 @@ extern const OSSymbol *         gIODTPHandleKey;
 
 const OSSymbol *                gIOPlatformFunctionHandlerSet;
 
+
 static IOLock *                 gIOConsoleUsersLock;
 static thread_call_t            gIOConsoleLockCallout;
 static IONotifier *             gIOServiceNullNotifier;
 
+static uint32_t                 gIODextRelaunchMax = 1000;
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 #define LOCKREADNOTIFY()        \
@@ -305,6 +341,35 @@ setLatencyHandler(UInt32 delayType, IOService * target, bool enable);
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
+namespace IOServicePH
+{
+IONotifier          * fRootNotifier;
+OSArray             * fUserServers;
+OSArray             * fUserServersWait;
+OSArray             * fMatchingWork;
+OSArray             * fMatchingDelayed;
+IOService           * fSystemPowerAckTo;
+uint32_t              fSystemPowerAckRef;
+uint8_t               fSystemOff;
+uint8_t               fUserServerOff;
+
+void lock();
+void unlock();
+
+void init(IOPMrootDomain * root);
+
+IOReturn systemPowerChange(
+       void * target,
+       void * refCon,
+       UInt32 messageType, IOService * service,
+       void * messageArgument, vm_size_t argSize);
+
+bool matchingStart(IOService * service);
+void matchingEnd(IOService * service);
+};
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
 void
 IOService::initialize( void )
 {
@@ -327,16 +392,36 @@ IOService::initialize( void )
                kIODefaultMatchCategoryKey );
        gIOMatchedServiceCountKey   = OSSymbol::withCStringNoCopy(
                kIOMatchedServiceCountKey );
+       gIOMatchedPersonalityKey = OSSymbol::withCStringNoCopy(
+               kIOMatchedPersonalityKey );
+       gIORematchPersonalityKey = OSSymbol::withCStringNoCopy(
+               kIORematchPersonalityKey );
+       gIORematchCountKey = OSSymbol::withCStringNoCopy(
+               kIORematchCountKey );
+       gIODEXTMatchCountKey = OSSymbol::withCStringNoCopy(
+               kIODEXTMatchCountKey );
+
 #if !CONFIG_EMBEDDED
        gIOServiceLegacyMatchingRegistryIDKey = OSSymbol::withCStringNoCopy(
                kIOServiceLegacyMatchingRegistryIDKey );
 #endif
 
+       PE_parse_boot_argn("dextrelaunch", &gIODextRelaunchMax, sizeof(gIODextRelaunchMax));
+
        gIOUserClientClassKey = OSSymbol::withCStringNoCopy( kIOUserClientClassKey );
 
+       gIOUserClassKey       = OSSymbol::withCStringNoCopy(kIOUserClassKey);
+
+       gIOUserServerClassKey  = OSSymbol::withCStringNoCopy(kIOUserServerClassKey);
+       gIOUserServerNameKey   = OSSymbol::withCStringNoCopy(kIOUserServerNameKey);
+       gIOUserServerTagKey    = OSSymbol::withCStringNoCopy(kIOUserServerTagKey);
+       gIOUserServerCDHashKey = OSSymbol::withCStringNoCopy(kIOUserServerCDHashKey);
+       gIOUserUserClientKey   = OSSymbol::withCStringNoCopy(kIOUserUserClientKey);
+
        gIOResourcesKey       = OSSymbol::withCStringNoCopy( kIOResourcesClass );
        gIOResourceMatchKey   = OSSymbol::withCStringNoCopy( kIOResourceMatchKey );
        gIOResourceMatchedKey = OSSymbol::withCStringNoCopy( kIOResourceMatchedKey );
+       gIOResourceIOKitKey   = OSSymbol::withCStringNoCopy("IOKit");
 
        gIODeviceMemoryKey  = OSSymbol::withCStringNoCopy( "IODeviceMemory" );
        gIOInterruptControllersKey
@@ -377,6 +462,7 @@ IOService::initialize( void )
                kIOWillTerminateNotification );
        gIOServiceKey               = OSSymbol::withCStringNoCopy( kIOServiceClass);
 
+
        gIOConsoleLockedKey         = OSSymbol::withCStringNoCopy( kIOConsoleLockedKey);
        gIOConsoleUsersKey          = OSSymbol::withCStringNoCopy( kIOConsoleUsersKey);
        gIOConsoleSessionUIDKey     = OSSymbol::withCStringNoCopy( kIOConsoleSessionUIDKey);
@@ -391,6 +477,11 @@ IOService::initialize( void )
 
        gIOConsoleUsersSeedValue           = OSData::withBytesNoCopy(&gIOConsoleUsersSeed, sizeof(gIOConsoleUsersSeed));
 
+       gIOServiceDEXTEntitlementsKey           = OSSymbol::withCStringNoCopy( kIOServiceDEXTEntitlementsKey );
+       gIODriverKitEntitlementKey             = OSSymbol::withCStringNoCopy( kIODriverKitEntitlementKey );
+       gIODriverKitUserClientEntitlementsKey   = OSSymbol::withCStringNoCopy( kIODriverKitUserClientEntitlementsKey );
+       gIOMatchDeferKey                        = OSSymbol::withCStringNoCopy( kIOMatchDeferKey );
+
        gIOPlatformFunctionHandlerSet               = OSSymbol::withCStringNoCopy(kIOPlatformFunctionHandlerSet);
 #if defined(__i386__) || defined(__x86_64__)
        sCPULatencyFunctionName[kCpuDelayBusStall]  = OSSymbol::withCStringNoCopy(kIOPlatformFunctionHandlerMaxBusDelay);
@@ -435,7 +526,8 @@ IOService::initialize( void )
            && gIOConsoleLockCallout && (err == KERN_SUCCESS));
 
        gIOResources = IOResources::resources();
-       assert( gIOResources );
+       gIOUserResources = IOUserResources::resources();
+       assert( gIOResources && gIOUserResources );
 
        gIOServiceNullNotifier = OSTypeAlloc(_IOServiceNullNotifier);
        assert(gIOServiceNullNotifier);
@@ -451,6 +543,9 @@ IOService::initialize( void )
        gIOStopList            = OSArray::withCapacity( 16 );
        gIOStopProviderList    = OSArray::withCapacity( 16 );
        gIOFinalizeList        = OSArray::withCapacity( 16 );
+#if !NO_KEXTD
+       gIOMatchDeferList      = OSArray::withCapacity( 16 );
+#endif
        assert( gIOTerminatePhase2List && gIOStopList && gIOStopProviderList && gIOFinalizeList );
 
        // worker thread that is responsible for terminating / cleaning up threads
@@ -540,6 +635,9 @@ IOService::start( IOService * provider )
 void
 IOService::stop( IOService * provider )
 {
+       if (reserved->uvars && reserved->uvars->started && reserved->uvars->userServer) {
+               reserved->uvars->userServer->serviceStop(this, provider);
+       }
 }
 
 bool
@@ -642,6 +740,9 @@ IOService::free( void )
                if (reserved->interruptStatisticsLock) {
                        IOLockFree(reserved->interruptStatisticsLock);
                }
+               if (reserved->uvars && reserved->uvars->userServer) {
+                       reserved->uvars->userServer->serviceFree(this);
+               }
                IODelete(reserved, ExpansionData, 1);
        }
 
@@ -654,7 +755,7 @@ IOService::free( void )
                }
                IOFree(_interruptSources,
                    _numInterruptSources * sizeofAllIOInterruptSource);
-               _interruptSources = 0;
+               _interruptSources = NULL;
        }
 
        super::free();
@@ -728,7 +829,7 @@ IOService::getServiceRoot( void )
 void
 IOService::detach( IOService * provider )
 {
-       IOService * newProvider = 0;
+       IOService * newProvider = NULL;
        SInt32      busy;
        bool        adjParent;
 
@@ -736,6 +837,23 @@ IOService::detach( IOService * provider )
                LOG("%s::detach(%s)\n", getName(), provider->getName());
        }
 
+#if !NO_KEXTD
+       IOLockLock(gJobsLock);
+       if (gIOMatchDeferList) {
+               auto idx = gIOMatchDeferList->getNextIndexOfObject(this, 0);
+               if (-1U != idx) {
+                       gIOMatchDeferList->removeObject(idx);
+               }
+       }
+       if (IOServicePH::fMatchingDelayed) {
+               auto idx = IOServicePH::fMatchingDelayed->getNextIndexOfObject(this, 0);
+               if (-1U != idx) {
+                       IOServicePH::fMatchingDelayed->removeObject(idx);
+               }
+       }
+       IOLockUnlock(gJobsLock);
+#endif /* !NO_KEXTD */
+
        lockForArbitration();
 
        uint64_t regID1 = provider->getRegistryEntryID();
@@ -754,7 +872,7 @@ IOService::detach( IOService * provider )
 
        if (busy) {
                newProvider = getProvider();
-               if (busy && (__state[1] & kIOServiceTermPhase3State) && (0 == newProvider)) {
+               if (busy && (__state[1] & kIOServiceTermPhase3State) && (NULL == newProvider)) {
                        _adjustBusy( -busy );
                }
        }
@@ -781,7 +899,7 @@ IOService::detach( IOService * provider )
                        provider->_adjustBusy( -1 );
                }
                if ((provider->__state[1] & kIOServiceTermPhase3State)
-                   && (0 == provider->getClient())) {
+                   && (NULL == provider->getClient())) {
                        provider->scheduleFinalize(false);
                }
 
@@ -912,7 +1030,7 @@ IOService::startMatching( IOOptionBits options )
                        thread_wakeup((event_t) this /*&__state[1]*/ );
                        IOLockUnlock( gIOServiceBusyLock );
                } else if (!sync || (kIOServiceAsynchronous & options)) {
-                       ok = (0 != _IOServiceJob::startJob( this, kMatchNubJob, options ));
+                       ok = (NULL != _IOServiceJob::startJob( this, kMatchNubJob, options ));
                } else {
                        do {
                                if ((__state[1] & kIOServiceNeedConfigState)) {
@@ -946,12 +1064,47 @@ IOService::startMatching( IOOptionBits options )
        }
 }
 
+
+void
+IOService::startDeferredMatches(void)
+{
+#if !NO_KEXTD
+       OSArray * array;
+
+       IOLockLock(gJobsLock);
+       array = gIOMatchDeferList;
+       gIOMatchDeferList = NULL;
+       IOLockUnlock(gJobsLock);
+
+       if (array) {
+               IOLog("deferred rematching count %d\n", array->getCount());
+               array->iterateObjects(^bool (OSObject * obj)
+               {
+                       ((IOService *)obj)->startMatching(kIOServiceAsynchronous);
+                       return false;
+               });
+               array->release();
+       }
+#endif /* !NO_KEXTD */
+}
+
+void
+IOService::kextdLaunched(void)
+{
+#if !NO_KEXTD
+       IOServiceTrace(IOSERVICE_KEXTD_READY, 0, 0, 0, 0);
+       startDeferredMatches();
+       getServiceRoot()->adjustBusy(-1);
+       IOService::publishUserResource(gIOResourceIOKitKey);
+#endif /* !NO_KEXTD */
+}
+
 IOReturn
 IOService::catalogNewDrivers( OSOrderedSet * newTables )
 {
        OSDictionary *      table;
        OSSet *             set;
-       OSSet *             allSet = 0;
+       OSSet *             allSet = NULL;
        IOService *         service;
 #if IOMATCHDEBUG
        SInt32              count = 0;
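startDeferredMatches() above uses a detach-then-drain pattern: the pending list is swapped out under gJobsLock and set to NULL so later registrations are no longer deferred, then each service is rematched with no lock held; kextdLaunched() drives it once kextd is up, and detach() earlier in this diff unlinks a dying service from the same lists. A user-space analogue of the pattern (drainDeferred and the int ids are illustrative):

    #include <mutex>
    #include <vector>

    static std::mutex         gLock;
    static std::vector<int> * gDeferred = new std::vector<int>;

    void drainDeferred(void (*rematch)(int))
    {
        std::vector<int> * list;
        {
            std::lock_guard<std::mutex> hold(gLock);
            list = gDeferred;        // take the whole list in one step
            gDeferred = nullptr;     // later arrivals run immediately
        }
        if (list) {
            for (int id : *list) {
                rematch(id);         // callbacks run with no lock held
            }
            delete list;
        }
    }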
@@ -1007,7 +1160,7 @@ _IOServiceJob::startJob( IOService * nub, int type,
        job = new _IOServiceJob;
        if (job && !job->init()) {
                job->release();
-               job = 0;
+               job = NULL;
        }
 
        if (job) {
@@ -1068,7 +1221,7 @@ IOService::getProvider( void ) const
        parent = (IOService *) getParentEntry( gIOServicePlane);
        if (parent == IORegistryEntry::getRegistryRoot()) {
                /* root is not an IOService */
-               parent = 0;
+               parent = NULL;
        }
 
        self->__provider = parent;
@@ -1087,7 +1240,7 @@ IOService::getWorkLoop() const
        if (provider) {
                return provider->getWorkLoop();
        } else {
-               return 0;
+               return NULL;
        }
 }
 
@@ -1117,14 +1270,14 @@ _IOOpenServiceIterator::iterator( OSIterator * _iter,
        _IOOpenServiceIterator * inst;
 
        if (!_iter) {
-               return 0;
+               return NULL;
        }
 
        inst = new _IOOpenServiceIterator;
 
        if (inst && !inst->init()) {
                inst->release();
-               inst = 0;
+               inst = NULL;
        }
        if (inst) {
                inst->iter = _iter;
@@ -1179,7 +1332,7 @@ _IOOpenServiceIterator::reset()
 {
        if (last) {
                last->unlockForArbitration();
-               last = 0;
+               last = NULL;
        }
        iter->reset();
 }
@@ -1187,13 +1340,13 @@ _IOOpenServiceIterator::reset()
 OSIterator *
 IOService::getOpenProviderIterator( void ) const
 {
-       return _IOOpenServiceIterator::iterator( getProviderIterator(), this, 0 );
+       return _IOOpenServiceIterator::iterator( getProviderIterator(), this, NULL );
 }
 
 OSIterator *
 IOService::getOpenClientIterator( void ) const
 {
-       return _IOOpenServiceIterator::iterator( getClientIterator(), 0, this );
+       return _IOOpenServiceIterator::iterator( getClientIterator(), NULL, this );
 }
 
 
@@ -1206,11 +1359,23 @@ IOService::callPlatformFunction( const OSSymbol * functionName,
        IOReturn  result = kIOReturnUnsupported;
        IOService *provider;
 
+       if (functionName == gIOPlatformQuiesceActionKey ||
+           functionName == gIOPlatformActiveActionKey) {
+               /*
+                * Services which register for IOPlatformQuiesceAction / IOPlatformActiveAction
+                * must consume that event themselves, without passing it up to super/IOService.
+                */
+               if (gEnforceQuiesceSafety) {
+                       panic("Class %s passed the quiesce/active action to IOService",
+                           getMetaClass()->getClassName());
+               }
+       }
+
        if (gIOPlatformFunctionHandlerSet == functionName) {
 #if defined(__i386__) || defined(__x86_64__)
                const OSSymbol * functionHandlerName = (const OSSymbol *) param1;
                IOService *      target              = (IOService *) param2;
-               bool             enable              = (param3 != 0);
+               bool             enable              = (param3 != NULL);
 
                if (sCPULatencyFunctionName[kCpuDelayBusStall] == functionHandlerName) {
                        result = setLatencyHandler(kCpuDelayBusStall, target, enable);
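The new guard at the top of callPlatformFunction makes delegation of the quiesce/active platform actions a hard error: a driver that registers for them must consume the call itself, and letting it reach IOService panics when gEnforceQuiesceSafety (defined elsewhere in this commit) is set. A minimal sketch of the same fail-loudly contract, with hypothetical names:

    #include <cstdio>
    #include <cstdlib>

    struct Service {
        virtual ~Service() = default;
        // Subclasses that register for the quiesce action must override
        // and consume it; reaching the base implementation is fatal.
        virtual void platformQuiesce()
        {
            std::fprintf(stderr, "quiesce action passed to base class\n");
            std::abort();            // user-space stand-in for panic()
        }
    };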
@@ -1237,7 +1402,7 @@ IOService::callPlatformFunction( const char * functionName,
        IOReturn result = kIOReturnNoMemory;
        const OSSymbol *functionSymbol = OSSymbol::withCString(functionName);
 
-       if (functionSymbol != 0) {
+       if (functionSymbol != NULL) {
                result = callPlatformFunction(functionSymbol, waitForFunction,
                    param1, param2, param3, param4);
                functionSymbol->release();
@@ -1274,6 +1439,7 @@ IOService::setPlatform( IOPlatformExpert * platform)
 {
        gIOPlatform = platform;
        gIOResources->attachToParent( gIOServiceRoot, gIOServicePlane );
+       gIOUserResources->attachToParent( gIOServiceRoot, gIOServicePlane );
 
 #if defined(__i386__) || defined(__x86_64__)
 
@@ -1300,7 +1466,8 @@ void
 IOService::setPMRootDomain( class IOPMrootDomain * rootDomain)
 {
        gIOPMRootDomain = rootDomain;
-       publishResource("IOKit");
+       publishResource(gIOResourceIOKitKey);
+       IOServicePH::init(rootDomain);
 }
 
 /*
@@ -1355,7 +1522,7 @@ IOService::lockForArbitration( bool isSuccessRequired )
        if (found) { // this object is already locked
                // determine whether it is the same or a different thread trying to lock
                if (active->thread != element->thread) { // it is a different thread
-                       ArbitrationLockQueueElement * victim = 0;
+                       ArbitrationLockQueueElement * victim = NULL;
 
                        // before placing this new thread on the waiting queue, we look for
                        // a deadlock cycle...
@@ -1766,7 +1933,7 @@ applyToInterestNotifiers(const IORegistryEntry *target,
     OSObjectApplierFunction applier,
     void * context )
 {
-       OSArray *  copyArray = 0;
+       OSArray *  copyArray = NULL;
        OSObject * prop;
 
        LOCKREADNOTIFY();
@@ -1862,7 +2029,7 @@ IONotifier *
 IOService::registerInterest( const OSSymbol * typeOfInterest,
     IOServiceInterestHandler handler, void * target, void * ref )
 {
-       _IOServiceInterestNotifier * notify = 0;
+       _IOServiceInterestNotifier * notify = NULL;
        IOReturn rc = kIOReturnError;
 
        notify = new _IOServiceInterestNotifier;
@@ -1877,7 +2044,7 @@ IOService::registerInterest( const OSSymbol * typeOfInterest,
 
        if (rc != kIOReturnSuccess) {
                notify->release();
-               notify = 0;
+               notify = NULL;
        }
 
        return notify;
@@ -1919,7 +2086,7 @@ IOService::registerInterestForNotifier( IONotifier *svcNotify, const OSSymbol *
     IOServiceInterestHandler handler, void * target, void * ref )
 {
        IOReturn rc = kIOReturnSuccess;
-       _IOServiceInterestNotifier  *notify = 0;
+       _IOServiceInterestNotifier  *notify = NULL;
 
        if (!svcNotify || !(notify = OSDynamicCast(_IOServiceInterestNotifier, svcNotify))) {
                return kIOReturnBadArgument;
@@ -1955,7 +2122,7 @@ IOService::registerInterestForNotifier( IONotifier *svcNotify, const OSSymbol *
                                bool ok = setProperty( typeOfInterest, notifyList);
                                notifyList->release();
                                if (!ok) {
-                                       notifyList = 0;
+                                       notifyList = NULL;
                                }
                        }
                }
@@ -1987,7 +2154,7 @@ cleanInterestList( OSObject * head )
 
        LOCKWRITENOTIFY();
        while (queue_entry_t entry = dequeue(&notifyHead->fCommandChain)) {
-               queue_next(entry) = queue_prev(entry) = 0;
+               queue_next(entry) = queue_prev(entry) = NULL;
 
                _IOServiceInterestNotifier * notify;
 
@@ -2071,7 +2238,7 @@ _IOServiceInterestNotifier::remove()
 
        if (queue_next( &chain )) {
                remqueue(&chain);
-               queue_next( &chain) = queue_prev( &chain) = 0;
+               queue_next( &chain) = queue_prev( &chain) = NULL;
                release();
        }
 
@@ -2133,8 +2300,8 @@ _IOServiceInterestNotifier::init()
 static void
 _workLoopAction( IOWorkLoop::Action action,
     IOService * service,
-    void * p0 = 0, void * p1 = 0,
-    void * p2 = 0, void * p3 = 0 )
+    void * p0 = NULL, void * p1 = NULL,
+    void * p2 = NULL, void * p3 = NULL )
 {
        IOWorkLoop * wl;
 
@@ -2170,9 +2337,11 @@ IOService::terminatePhase1( IOOptionBits options )
 {
        IOService *  victim;
        IOService *  client;
+       IOService *  rematchProvider;
        OSIterator * iter;
        OSArray *    makeInactive;
        OSArray *    waitingInactive;
+       IOOptionBits callerOptions;
        int          waitResult = THREAD_AWAKENED;
        bool         wait;
        bool                 ok;
@@ -2181,6 +2350,8 @@ IOService::terminatePhase1( IOOptionBits options )
 
        TLOG("%s[0x%qx]::terminatePhase1(%08llx)\n", getName(), getRegistryEntryID(), (long long)options);
 
+       callerOptions = options;
+       rematchProvider = NULL;
        uint64_t regID = getRegistryEntryID();
        IOServiceTrace(
                IOSERVICE_TERMINATE_PHASE1,
@@ -2260,10 +2431,36 @@ IOService::terminatePhase1( IOOptionBits options )
                                        }
                                }
                                victim->_adjustBusy( 1 );
+
+                               if ((options & kIOServiceTerminateWithRematch) && (victim == this)) {
+                                       OSObject     * obj;
+                                       OSObject     * rematchProps;
+                                       OSNumber     * num;
+                                       uint32_t       count;
+
+                                       rematchProvider = getProvider();
+                                       if (rematchProvider) {
+                                               obj = rematchProvider->copyProperty(gIORematchCountKey);
+                                               num = OSDynamicCast(OSNumber, obj);
+                                               count = 0;
+                                               if (num) {
+                                                       count = num->unsigned32BitValue();
+                                                       count++;
+                                               }
+                                               num = OSNumber::withNumber(count, 32);
+                                               rematchProvider->setProperty(gIORematchCountKey, num);
+                                               rematchProps = copyProperty(gIOMatchedPersonalityKey);
+                                               rematchProvider->setProperty(gIORematchPersonalityKey, rematchProps);
+                                               OSSafeReleaseNULL(num);
+                                               OSSafeReleaseNULL(rematchProps);
+                                               OSSafeReleaseNULL(obj);
+                                       }
+                               }
                        }
                        victim->unlockForArbitration();
                }
                if (victim == this) {
+                       options &= ~kIOServiceTerminateWithRematch;
                        startPhase2 = didInactive;
                }
                if (didInactive) {
@@ -2336,6 +2533,11 @@ IOService::terminatePhase1( IOOptionBits options )
                release();
        }
 
+       if (rematchProvider) {
+               DKLOG(DKS " rematching after dext crash\n", DKN(rematchProvider));
+               rematchProvider->registerService();
+       }
+
        return true;
 }
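This is the dext crash-recovery path: terminating with kIOServiceTerminateWithRematch stamps the provider with an incremented gIORematchCountKey plus the personality that had matched, then re-registers the provider so matching runs again. probeCandidates() (later in this diff) compares the count against gIODextRelaunchMax, settable via the dextrelaunch boot-arg, so a crash-looping dext is relaunched only a bounded number of times before the preference inverts. A sketch mirroring that personalityMatch decision (shouldUsePersonality is an illustrative name):

    #include <cstdint>

    static const uint32_t kRelaunchMax = 1000;    // cf. gIODextRelaunchMax

    bool shouldUsePersonality(bool samePersonality, uint32_t relaunchCount)
    {
        bool keep = samePersonality;
        if (relaunchCount > kRelaunchMax) {
            keep = !keep;            // budget exhausted: try anything else
        }
        return keep;
    }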
 
@@ -2420,18 +2622,19 @@ IOService::scheduleTerminatePhase2( IOOptionBits options )
                                        haveDeadline = true;
                                }
                                /* let others do work while we wait */
-                               gIOTerminateThread = 0;
+                               gIOTerminateThread = NULL;
                                IOLockWakeup( gJobsLock, (event_t) &gIOTerminateThread, /* one-thread */ false);
                                waitResult = IOLockSleepDeadline( gJobsLock, &gIOTerminateWork,
                                    deadline, THREAD_UNINT );
                                if (__improbable(waitResult == THREAD_TIMED_OUT)) {
-                                       panic("%s[0x%qx]::terminate(kIOServiceSynchronous) timeout\n", getName(), getRegistryEntryID());
+                                       IOLog("%s[0x%qx]::terminate(kIOServiceSynchronous): THREAD_TIMED_OUT. "
+                                           "Attempting to auto-resolve your deadlock. PLEASE FIX!\n", getName(), getRegistryEntryID());
                                }
                                waitToBecomeTerminateThread();
                        }
                } while (gIOTerminateWork || (wait && (waitResult != THREAD_TIMED_OUT)));
 
-               gIOTerminateThread = 0;
+               gIOTerminateThread = NULL;
                IOLockWakeup( gJobsLock, (event_t) &gIOTerminateThread, /* one-thread */ false);
        } else {
                // ! kIOServiceSynchronous
@@ -2463,7 +2666,7 @@ IOService::terminateThread( void * arg, wait_result_t waitResult )
                        terminateWorker((uintptr_t)arg );
                }
 
-               gIOTerminateThread = 0;
+               gIOTerminateThread = NULL;
                IOLockWakeup( gJobsLock, (event_t) &gIOTerminateThread, /* one-thread */ false);
                IOLockSleep(gJobsLock, &gIOTerminateWork, THREAD_UNINT);
        }
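A deliberate behavior change: a synchronous terminate that times out used to panic; it now logs and lets the surrounding loop keep waiting, trading a crash for a chance that the deadlock resolves. A user-space analogue of log-and-retry on a timed wait (all names illustrative):

    #include <chrono>
    #include <condition_variable>
    #include <cstdio>
    #include <mutex>

    static std::mutex              m;
    static std::condition_variable cv;
    static bool                    workDone = false;

    void waitLoudly()
    {
        std::unique_lock<std::mutex> lk(m);
        while (!workDone) {
            if (cv.wait_for(lk, std::chrono::seconds(30)) ==
                std::cv_status::timeout) {
                std::puts("terminate wait timed out; retrying");  // was fatal
            }
        }
    }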
@@ -2521,12 +2724,19 @@ IOService::scheduleFinalize(bool now)
 bool
 IOService::willTerminate( IOService * provider, IOOptionBits options )
 {
+       if (reserved->uvars) {
+               IOUserServer::serviceWillTerminate(this, provider, options);
+       }
        return true;
 }
 
 bool
 IOService::didTerminate( IOService * provider, IOOptionBits options, bool * defer )
 {
+       if (reserved->uvars) {
+               IOUserServer::serviceDidTerminate(this, provider, options, defer);
+       }
+
        if (false == *defer) {
                if (lockForArbitration( true )) {
                        if (false == provider->handleIsOpen( this )) {
@@ -2550,8 +2760,8 @@ IOService::didTerminate( IOService * provider, IOOptionBits options, bool * defe
 void
 IOService::actionWillTerminate( IOService * victim, IOOptionBits options,
     OSArray * doPhase2List,
-    void *unused2 __unused,
-    void *unused3 __unused  )
+    bool user,
+    void *unused3 __unused)
 {
        OSIterator * iter;
        IOService *  client;
@@ -2561,6 +2771,9 @@ IOService::actionWillTerminate( IOService * victim, IOOptionBits options,
        iter = victim->getClientIterator();
        if (iter) {
                while ((client = (IOService *) iter->getNextObject())) {
+                       if (user != (NULL != client->reserved->uvars)) {
+                               continue;
+                       }
                        regID1 = client->getRegistryEntryID();
                        TLOG("%s[0x%qx]::willTerminate(%s[0x%qx], %08llx)\n",
                            client->getName(), regID1,
@@ -2746,7 +2959,7 @@ IOService::terminateWorker( IOOptionBits options )
        doPhase2List  = OSArray::withCapacity( 16 );
        didPhase2List = OSArray::withCapacity( 16 );
        freeList      = OSSet::withCapacity( 16 );
-       if ((0 == doPhase2List) || (0 == didPhase2List) || (0 == freeList)) {
+       if ((NULL == doPhase2List) || (NULL == didPhase2List) || (NULL == freeList)) {
                return;
        }
 
@@ -2819,7 +3032,13 @@ IOService::terminateWorker( IOOptionBits options )
                                        victim->invokeNotifiers(&notifiers);
 
                                        _workLoopAction((IOWorkLoop::Action) &actionWillTerminate,
-                                           victim, (void *)(uintptr_t) options, (void *)(uintptr_t) doPhase2List );
+                                           victim,
+                                           (void *)(uintptr_t) options,
+                                           (void *)(uintptr_t) doPhase2List,
+                                           (void *)(uintptr_t) false);
+
+                                       actionWillTerminate(
+                                               victim, options, doPhase2List, true, NULL);
 
                                        didPhase2List->headQ( victim );
                                }
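One of actionWillTerminate's unused parameters becomes a `user` flag, and the worker now makes two passes: one on the victim's work loop for in-kernel clients, then an inline call for DriverKit-backed clients (those with reserved->uvars), each pass skipping the other domain via the `user != (NULL != client->reserved->uvars)` test above. A compact analogue of the partitioned double pass (Client and notifyPass are illustrative):

    #include <vector>

    struct Client { bool isUserClient; };

    static void notifyPass(const std::vector<Client> & clients, bool user)
    {
        for (const Client & c : clients) {
            if (user != c.isUserClient) {
                continue;             // belongs to the other pass
            }
            // ... deliver willTerminate to c ...
        }
    }

    void notifyAll(const std::vector<Client> & clients)
    {
        notifyPass(clients, false);   // in-kernel clients (via work loop)
        notifyPass(clients, true);    // DriverKit clients, called inline
    }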
@@ -2835,7 +3054,7 @@ IOService::terminateWorker( IOOptionBits options )
                                bool scheduleFinalize = false;
                                if (victim->lockForArbitration( true )) {
                                        victim->__state[1] |= kIOServiceTermPhase3State;
-                                       scheduleFinalize = (0 == victim->getClient());
+                                       scheduleFinalize = (NULL == victim->getClient());
                                        victim->unlockForArbitration();
                                }
                                _workLoopAction((IOWorkLoop::Action) &actionDidTerminate,
@@ -3084,6 +3303,10 @@ IOService::open(   IOService *     forClient,
                ok = handleOpen( forClient, options, arg );
        }
 
+       if (ok && forClient && forClient->reserved->uvars && forClient->reserved->uvars->userServer) {
+               forClient->reserved->uvars->userServer->serviceOpen(this, forClient);
+       }
+
        unlockForArbitration();
 
        return ok;
@@ -3102,6 +3325,10 @@ IOService::close(  IOService *     forClient,
        if (wasClosed) {
                handleClose( forClient, options );
                last = (__state[1] & kIOServiceTermPhase3State);
+
+               if (forClient && forClient->reserved->uvars && forClient->reserved->uvars->userServer) {
+                       forClient->reserved->uvars->userServer->serviceClose(this, forClient);
+               }
        }
 
        unlockForArbitration();
@@ -3143,13 +3370,13 @@ IOService::handleOpen(     IOService *     forClient,
 {
        bool        ok;
 
-       ok = (0 == __owner);
+       ok = (NULL == __owner);
        if (ok) {
                __owner = forClient;
        } else if (options & kIOServiceSeize) {
                ok = (kIOReturnSuccess == messageClient( kIOMessageServiceIsRequestingClose,
                    __owner, (void *)(uintptr_t) options ));
-               if (ok && (0 == __owner)) {
+               if (ok && (NULL == __owner)) {
                        __owner = forClient;
                } else {
                        ok = false;
@@ -3163,7 +3390,7 @@ IOService::handleClose(    IOService *     forClient,
     IOOptionBits    options )
 {
        if (__owner == forClient) {
-               __owner = 0;
+               __owner = NULL;
        }
 }
 
@@ -3213,7 +3440,7 @@ IOServiceObjectOrder( const OSObject * entry, void * ref)
        OSObject *          prop;
        SInt32              result;
 
-       prop = 0;
+       prop = NULL;
        result = kIODefaultProbeScore;
        if ((dict = OSDynamicCast( OSDictionary, entry))) {
                offset = OSDynamicCast(OSNumber, dict->getObject( key ));
@@ -3224,7 +3451,7 @@ IOServiceObjectOrder( const OSObject * entry, void * ref)
                offset = OSDynamicCast(OSNumber, prop);
        } else {
                assert( false );
-               offset = 0;
+               offset = NULL;
        }
 
        if (offset) {
@@ -3261,7 +3488,7 @@ IOServiceOrdering( const OSMetaClassBase * inObj1, const OSMetaClassBase * inObj
 IOService *
 IOService::copyClientWithCategory( const OSSymbol * category )
 {
-       IOService *         service = 0;
+       IOService *         service = NULL;
        OSIterator *        iter;
        const OSSymbol *    nextCat;
 
@@ -3336,7 +3563,7 @@ IOService::invokeNotifier( _IOServiceNotifier * notify )
 }
 
 bool
-IOService::invokeNotifiers(OSArray ** willSend)
+IOService::invokeNotifiers(OSArray * willSend[])
 {
        OSArray *            array;
        _IOServiceNotifier * notify;
@@ -3346,7 +3573,7 @@ IOService::invokeNotifiers(OSArray ** willSend)
        if (!array) {
                return true;
        }
-       *willSend = 0;
+       *willSend = NULL;
 
        for (unsigned int idx = 0;
            (notify = (_IOServiceNotifier *) array->getObject(idx));
@@ -3367,30 +3594,56 @@ IOService::invokeNotifiers(OSArray ** willSend)
 void
 IOService::probeCandidates( OSOrderedSet * matches )
 {
-       OSDictionary        *       match = 0;
+       OSDictionary        *       match = NULL;
        OSSymbol            *       symbol;
        IOService           *       inst;
        IOService           *       newInst;
        OSDictionary        *       props;
        SInt32                      score;
        OSNumber            *       newPri;
-       OSOrderedSet        *       familyMatches = 0;
+       OSOrderedSet        *       familyMatches = NULL;
        OSOrderedSet        *       startList;
-       OSDictionary        *       startDict = 0;
+       OSSet               *       kexts = NULL;
+       OSObject            *       kextRef;
+
+       OSDictionary        *       startDict = NULL;
        const OSSymbol      *       category;
        OSIterator          *       iter;
-       _IOServiceNotifier  *               notify;
-       OSObject            *       nextMatch = 0;
+       _IOServiceNotifier  *       notify;
+       OSObject            *       nextMatch = NULL;
        bool                        started;
        bool                        needReloc = false;
+       bool                        matchDeferred = false;
 #if IOMATCHDEBUG
        SInt64                      debugFlags;
 #endif
-       IOService * client = NULL;
-
+       IOService           *       client = NULL;
+       OSObject            *       prop1;
+       OSObject            *       prop2;
+       OSDictionary        *       rematchPersonality;
+       OSNumber            *       num;
+       uint32_t                    count;
+       uint32_t                    dextCount;
+       bool                        isDext;
+       bool                        categoryConsumed;
+
+       prop2 = NULL;
+       count = 0;
+       prop1 = copyProperty(gIORematchPersonalityKey);
+       rematchPersonality = OSDynamicCast(OSDictionary, prop1);
+       if (rematchPersonality) {
+               prop2 = copyProperty(gIORematchCountKey);
+               num   = OSDynamicCast(OSNumber, prop2);
+               if (num) {
+                       count = num->unsigned32BitValue();
+               }
+               OSSafeReleaseNULL(prop2);
+       }
+       dextCount = 0;
 
        assert( matches );
-       while (!needReloc && (nextMatch = matches->getFirstObject())) {
+       while (!needReloc
+           && (nextMatch = matches->getFirstObject())) {
                nextMatch->retain();
                matches->removeObject(nextMatch);
 
@@ -3399,56 +3652,76 @@ IOService::probeCandidates( OSOrderedSet * matches )
                                invokeNotifier( notify );
                        }
                        nextMatch->release();
-                       nextMatch = 0;
+                       nextMatch = NULL;
                        continue;
                } else if (!(match = OSDynamicCast( OSDictionary, nextMatch ))) {
                        nextMatch->release();
-                       nextMatch = 0;
+                       nextMatch = NULL;
                        continue;
                }
 
-               props = 0;
+               props = NULL;
 #if IOMATCHDEBUG
                debugFlags = getDebugFlags( match );
 #endif
 
                do {
+                       isDext = (NULL != match->getObject(gIOUserServerNameKey));
+                       if (isDext && !(kIODKEnable & gIODKDebug)) {
+                               continue;
+                       }
+
                        category = OSDynamicCast( OSSymbol,
                            match->getObject( gIOMatchCategoryKey ));
-                       if (0 == category) {
+                       if (NULL == category) {
                                category = gIODefaultMatchCategoryKey;
                        }
+                       client = copyClientWithCategory(category);
 
-                       if ((client = copyClientWithCategory(category))) {
+                       categoryConsumed = (client != NULL);
+                       if (categoryConsumed) {
 #if IOMATCHDEBUG
                                if ((debugFlags & kIOLogMatch) && (this != gIOResources)) {
                                        LOG("%s: match category %s exists\n", getName(),
                                            category->getCStringNoCopy());
                                }
 #endif
-                               nextMatch->release();
-                               nextMatch = 0;
-
-                               client->release();
-                               client = NULL;
-
-                               continue;
+                               OSSafeReleaseNULL(client);
+                               if (!isDext) {
+                                       break;
+                               }
                        }
 
                        // create a copy now in case it's modified during matching
-                       props = OSDictionary::withDictionary( match, match->getCount());
-                       if (0 == props) {
-                               continue;
+                       props = OSDictionary::withDictionary(match, match->getCount());
+                       if (NULL == props) {
+                               break;
                        }
                        props->setCapacityIncrement(1);
 
                        // check the nub matches
                        if (false == matchPassive(props, kIOServiceChangesOK | kIOServiceClassDone)) {
-                               continue;
+                               break;
+                       }
+                       if (isDext) {
+                               dextCount++;
+                               if (categoryConsumed) {
+                                       break;
+                               }
+                       }
+
+                       if (rematchPersonality) {
+                               bool personalityMatch = match->isEqualTo(rematchPersonality);
+                               if (count > gIODextRelaunchMax) {
+                                       personalityMatch = !personalityMatch;
+                               }
+                               if (!personalityMatch) {
+                                       break;
+                               }
                        }
 
                        // Check to see if driver reloc has been loaded.
-                       needReloc = (false == gIOCatalogue->isModuleLoaded( match ));
+                       needReloc = (false == gIOCatalogue->isModuleLoaded( match, &kextRef ));
                        if (needReloc) {
 #if IOMATCHDEBUG
                                if (debugFlags & kIOLogCatalogue) {
@@ -3457,11 +3730,23 @@ IOService::probeCandidates( OSOrderedSet * matches )
 #endif
                                // If reloc hasn't been loaded, exit;
                                // reprobing will occur after reloc has been loaded.
-                               continue;
+                               break;
+                       }
+                       if (kextRef) {
+                               if (NULL == kexts) {
+                                       kexts = OSSet::withCapacity(1);
+                               }
+                               if (kexts) {
+                                       kexts->setObject(kextRef);
+                                       kextRef->release();
+                               }
+                       }
+                       if (isDext) {
+                               // copy saved for rematching
+                               props->setObject(gIOMatchedPersonalityKey, match);
                        }
-
                        // reorder on family matchPropertyTable score.
-                       if (0 == familyMatches) {
+                       if (NULL == familyMatches) {
                                familyMatches = OSOrderedSet::withCapacity( 1,
                                    IOServiceOrdering, (void *) gIOProbeScoreKey );
                        }
@@ -3470,16 +3755,11 @@ IOService::probeCandidates( OSOrderedSet * matches )
                        }
                } while (false);
 
-               if (nextMatch) {
-                       nextMatch->release();
-                       nextMatch = 0;
-               }
-               if (props) {
-                       props->release();
-               }
+               OSSafeReleaseNULL(nextMatch);
+               OSSafeReleaseNULL(props);
        }
        matches->release();
-       matches = 0;
+       matches = NULL;
 
        if (familyMatches) {
                while (!needReloc
@@ -3487,8 +3767,8 @@ IOService::probeCandidates( OSOrderedSet * matches )
                        props->retain();
                        familyMatches->removeObject( props );
 
-                       inst = 0;
-                       newInst = 0;
+                       inst = NULL;
+                       newInst = NULL;
 #if IOMATCHDEBUG
                        debugFlags = getDebugFlags( props );
 #endif
@@ -3526,7 +3806,7 @@ IOService::probeCandidates( OSOrderedSet * matches )
                                // give the driver the default match category if not specified
                                category = OSDynamicCast( OSSymbol,
                                    props->getObject( gIOMatchCategoryKey ));
-                               if (0 == category) {
+                               if (NULL == category) {
                                        category = gIODefaultMatchCategoryKey;
                                }
                                inst->setProperty( gIOMatchCategoryKey, (OSObject *) category );
@@ -3548,7 +3828,7 @@ IOService::probeCandidates( OSOrderedSet * matches )
 
                                newInst = inst->probe( this, &score );
                                inst->detach( this );
-                               if (0 == newInst) {
+                               if (NULL == newInst) {
 #if IOMATCHDEBUG
                                        if (debugFlags & kIOLogProbe) {
                                                IOLog("%s::probe fails\n", symbol->getCStringNoCopy());
@@ -3565,13 +3845,13 @@ IOService::probeCandidates( OSOrderedSet * matches )
                                }
 
                                // add to start list for the match category
-                               if (0 == startDict) {
+                               if (NULL == startDict) {
                                        startDict = OSDictionary::withCapacity( 1 );
                                }
                                assert( startDict );
                                startList = (OSOrderedSet *)
                                    startDict->getObject( category );
-                               if (0 == startList) {
+                               if (NULL == startList) {
                                        startList = OSOrderedSet::withCapacity( 1,
                                            IOServiceOrdering, (void *) gIOProbeScoreKey );
                                        if (startDict && startList) {
@@ -3591,7 +3871,7 @@ IOService::probeCandidates( OSOrderedSet * matches )
                        }
                }
                familyMatches->release();
-               familyMatches = 0;
+               familyMatches = NULL;
        }
 
        // start the best (until success) of each category
@@ -3607,6 +3887,7 @@ IOService::probeCandidates( OSOrderedSet * matches )
 
                        started = false;
                        while (true // (!started)
+                           && !matchDeferred
                            && (inst = (IOService *)startList->getFirstObject())) {
                                inst->retain();
                                startList->removeObject(inst);
@@ -3623,20 +3904,53 @@ IOService::probeCandidates( OSOrderedSet * matches )
                                }
 #endif
                                if (false == started) {
-                                       started = startCandidate( inst );
-                               }
+#if !NO_KEXTD
+                                       IOLockLock(gJobsLock);
+                                       matchDeferred = (gIOMatchDeferList
+                                           && (kOSBooleanTrue == inst->getProperty(gIOMatchDeferKey)));
+                                       if (matchDeferred && (-1U == gIOMatchDeferList->getNextIndexOfObject(this, 0))) {
+                                               gIOMatchDeferList->setObject(this);
+                                       }
+                                       IOLockUnlock(gJobsLock);
+                                       if (matchDeferred) {
+                                               symbol = OSDynamicCast(OSSymbol, inst->getProperty(gIOClassKey));
+                                               IOLog("%s(0x%qx): matching deferred by %s\n",
+                                                   getName(), getRegistryEntryID(),
+                                                   symbol ? symbol->getCStringNoCopy() : "");
+                                               // rematching will occur after kextd loads all plists
+                                       }
+#endif
+                                       if (!matchDeferred) {
+                                               started = startCandidate( inst );
 #if IOMATCHDEBUG
-                               if ((debugFlags & kIOLogStart) && (false == started)) {
-                                       LOG( "%s::start(%s) <%d> failed\n", inst->getName(), getName(),
-                                           inst->getRetainCount());
-                               }
+                                               if ((debugFlags & kIOLogStart) && (false == started)) {
+                                                       LOG( "%s::start(%s) <%d> failed\n", inst->getName(), getName(),
+                                                           inst->getRetainCount());
+                                               }
 #endif
+                                       }
+                               }
                                inst->release();
                        }
                }
                iter->release();
        }
 
+       OSSafeReleaseNULL(prop1);
+
+       if (dextCount) {
+               num = OSNumber::withNumber(dextCount, 32);
+               setProperty(gIODEXTMatchCountKey, num);
+               OSSafeReleaseNULL(num);
+       } else if (rematchPersonality) {
+               removeProperty(gIODEXTMatchCountKey);
+       }
+
+       // now that instances are created, drop the refs on any kexts allowing unload
+       if (kexts) {
+               OSKext::dropMatchingReferences(kexts);
+               OSSafeReleaseNULL(kexts);
+       }
 
        // adjust the busy count by +1 if matching is stalled for a module,
        // or -1 if a previously stalled matching is complete.
@@ -3677,6 +3991,272 @@ IOService::probeCandidates( OSOrderedSet * matches )
        }
 }
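probeCandidates() now also skips dext personalities unless kIODKEnable is set in gIODKDebug, records their count in gIODEXTMatchCountKey, and asks isModuleLoaded() for a reference to each backing kext, collecting the references in an OSSet and dropping them in bulk via OSKext::dropMatchingReferences once instances exist, so each kext stays pinned during instantiation. A rough user-space analogue of that pin-then-release bookkeeping (ModuleRef and matchAll are illustrative):

    #include <memory>
    #include <set>
    #include <vector>

    using ModuleRef = std::shared_ptr<int>;   // stand-in for an OSKext ref

    void matchAll(const std::vector<ModuleRef> & candidates)
    {
        std::set<ModuleRef> held;
        for (const ModuleRef & m : candidates) {
            held.insert(m);          // pin the module while instantiating
            // ... create and start driver instances ...
        }
        held.clear();                // instances now hold their own refs
    }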
 
+/*
+ * Wait for a IOUserServer to check in
+ */
+
+static
+__attribute__((noinline, not_tail_called))
+IOService *
+__WAITING_FOR_USER_SERVER__(OSDictionary * matching)
+{
+       IOService * server;
+       server = IOService::waitForMatchingService(matching, kIOUserServerCheckInTimeoutSecs * NSEC_PER_SEC);
+       return server;
+}
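The deliberately loud symbol name, combined with noinline and not_tail_called, pins a dedicated frame on the waiting thread's stack, so a dext that never checks in is attributable by name in a stackshot or panic backtrace. A minimal sketch of the same frame-naming pattern applied to a hypothetical wait (the function name and timeout below are illustrative, not part of this change):

    #include <IOKit/IOService.h>

    // Illustrative only: the symbol name itself is the diagnostic payload.
    static __attribute__((noinline, not_tail_called))
    IOService *
    __WAITING_FOR_EXAMPLE_RESOURCE__(OSDictionary * matching)
    {
        return IOService::waitForMatchingService(matching, 30 * NSEC_PER_SEC);
    }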
+
+void
+IOService::willShutdown()
+{
+       gKextdWillTerminate = true;
+#if !NO_KEXTD
+       getPlatform()->waitQuiet(30 * NSEC_PER_SEC);
+#endif
+       OSKext::willShutdown();
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+void
+IOServicePH::init(IOPMrootDomain * root)
+{
+       fUserServers     = OSArray::withCapacity(4);
+       fMatchingWork    = OSArray::withCapacity(4);
+
+       assert(fUserServers && fMatchingWork);
+
+       fRootNotifier = root->registerInterest(
+               gIOPriorityPowerStateInterest, &IOServicePH::systemPowerChange, NULL, NULL);
+
+       assert(fRootNotifier);
+}
+
+void
+IOServicePH::lock()
+{
+       IOLockLock(gJobsLock);
+}
+
+void
+IOServicePH::unlock()
+{
+       IOLockUnlock(gJobsLock);
+}
+
+void
+IOServicePH::serverAdd(IOUserServer * server)
+{
+       uint32_t idx;
+
+       lock();
+       idx = fUserServers->getNextIndexOfObject(server, 0);
+       if (idx == -1U) {
+               fUserServers->setObject(server);
+       }
+       unlock();
+}
+
+void
+IOServicePH::serverRemove(IOUserServer * server)
+{
+       uint32_t idx;
+
+       lock();
+       idx = fUserServers->getNextIndexOfObject(server, 0);
+       if (idx != -1U) {
+               fUserServers->removeObject(idx);
+       }
+       unlock();
+}
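serverAdd() and serverRemove() use an OSArray as a set: OSArray does not enforce uniqueness, so membership is probed first with getNextIndexOfObject(), which returns -1U when the object is absent. The same idiom as a hypothetical standalone helper:

    // Set-like insert into an OSArray (sketch, not part of the change).
    static void
    arraySetInsert(OSArray * array, OSObject * obj)
    {
        if (-1U == array->getNextIndexOfObject(obj, 0)) {
            array->setObject(obj);   // setObject() retains obj
        }
    }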
+
+void
+IOServicePH::serverAck(IOUserServer * server)
+{
+       uint32_t    idx;
+       IOService * ackTo;
+       uint32_t    ackToRef;
+
+       ackTo = NULL;
+       lock();
+       if (server && fUserServersWait) {
+               idx = fUserServersWait->getNextIndexOfObject(server, 0);
+               if (idx != -1U) {
+                       fUserServersWait->removeObject(idx);
+                       if (0 == fUserServersWait->getCount()) {
+                               OSSafeReleaseNULL(fUserServersWait);
+                       }
+               }
+       }
+       if (!fUserServersWait && !fMatchingWork->getCount()) {
+               ackTo             = fSystemPowerAckTo;
+               ackToRef          = fSystemPowerAckRef;
+               fSystemPowerAckTo = NULL;
+       }
+       unlock();
+
+       if (ackTo) {
+               DKLOG("allowPowerChange\n");
+               ackTo->allowPowerChange((uintptr_t) ackToRef);
+       }
+}
+
+bool
+IOServicePH::matchingStart(IOService * service)
+{
+       uint32_t idx;
+       bool ok;
+
+       lock();
+       ok = !fSystemOff;
+       if (ok) {
+               idx = fMatchingWork->getNextIndexOfObject(service, 0);
+               if (idx == -1U) {
+                       fMatchingWork->setObject(service);
+               }
+       } else {
+               if (!fMatchingDelayed) {
+                       fMatchingDelayed = OSArray::withObjects((const OSObject **) &service, 1, 1);
+               } else {
+                       idx = fMatchingDelayed->getNextIndexOfObject(service, 0);
+                       if (idx == -1U) {
+                               fMatchingDelayed->setObject(service);
+                       }
+               }
+       }
+       unlock();
+
+       return ok;
+}
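matchingStart() is an admit-or-defer gate: while the system still has CPU power the service is recorded as in-flight matching work; once fSystemOff is set it is parked on fMatchingDelayed instead, to be replayed by matchingEnd() on wake. A self-contained userspace analogue of the gate, with every name illustrative:

    #include <mutex>
    #include <vector>

    struct MatchGate {
        std::mutex       lock;
        bool             systemOff = false;
        std::vector<int> work;      // stand-in for fMatchingWork
        std::vector<int> delayed;   // stand-in for fMatchingDelayed

        // Returns true if the caller may proceed with matching now.
        bool start(int service) {
            std::lock_guard<std::mutex> g(lock);
            if (systemOff) {
                delayed.push_back(service);   // replayed on wake
                return false;
            }
            work.push_back(service);
            return true;
        }
    };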
+
+void
+IOServicePH::matchingEnd(IOService * service)
+{
+       uint32_t idx;
+       OSArray   * notifyServers;
+       OSArray   * deferredMatches;
+
+       notifyServers   = NULL;
+       deferredMatches = NULL;
+
+       lock();
+
+       if (service) {
+               idx = fMatchingWork->getNextIndexOfObject(service, 0);
+               if (idx != -1U) {
+                       fMatchingWork->removeObject(idx);
+               }
+       }
+
+
+       if ((fUserServerOff != fSystemOff) && fUserServers->getCount()) {
+               if (fSystemOff) {
+                       if (0 == fMatchingWork->getCount()) {
+                               fUserServersWait = OSArray::withArray(fUserServers);
+                               notifyServers = OSArray::withArray(fUserServers);
+                               fUserServerOff = fSystemOff;
+                       }
+               } else {
+                       notifyServers = OSArray::withArray(fUserServers);
+                       fUserServerOff = fSystemOff;
+               }
+       }
+
+       if (!fSystemOff && fMatchingDelayed) {
+               deferredMatches = fMatchingDelayed;
+               fMatchingDelayed = NULL;
+       }
+
+       unlock();
+
+       if (notifyServers) {
+               notifyServers->iterateObjects(^bool (OSObject * obj) {
+                       IOUserServer * us;
+                       us = (typeof(us))obj;
+                       us->systemPower(fSystemOff);
+                       return false;
+               });
+               OSSafeReleaseNULL(notifyServers);
+       }
+
+       if (deferredMatches) {
+               DKLOG("sleep deferred rematching count %d\n", deferredMatches->getCount());
+               deferredMatches->iterateObjects(^bool (OSObject * obj)
+               {
+                       ((IOService *)obj)->startMatching(kIOServiceAsynchronous);
+                       return false;
+               });
+               deferredMatches->release();
+       }
+
+       serverAck(NULL);
+}
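A subtlety in both block iterations above: OSCollection::iterateObjects() stops as soon as the block returns true, so the "return false" statements mean "keep going", not failure. A tiny illustration of that protocol:

    // Counts every element; returning true from the block would stop early.
    static unsigned
    countMembers(OSArray * array)
    {
        __block unsigned n = 0;
        array->iterateObjects(^bool (OSObject *) {
            n++;
            return false;   // false == continue iterating
        });
        return n;
    }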
+
+IOReturn
+IOServicePH::systemPowerChange(
+       void * target,
+       void * refCon,
+       UInt32 messageType, IOService * service,
+       void * messageArgument, vm_size_t argSize)
+{
+       IOReturn                               ret;
+       IOUserServer                         * us;
+       IOPMSystemCapabilityChangeParameters * params;
+
+       us = NULL;
+
+       switch (messageType) {
+       case kIOMessageSystemCapabilityChange:
+
+               params = (typeof params)messageArgument;
+
+               if (kIODKLogPM & gIODKDebug) {
+                       IOLog("IOServicePH::kIOMessageSystemCapabilityChange: %s%s 0x%x->0x%x\n",
+                           params->changeFlags & kIOPMSystemCapabilityWillChange ? "will" : "",
+                           params->changeFlags & kIOPMSystemCapabilityDidChange ? "did" : "",
+                           params->fromCapabilities,
+                           params->toCapabilities);
+               }
+
+               if ((params->changeFlags & kIOPMSystemCapabilityWillChange) &&
+                   (params->fromCapabilities & kIOPMSystemCapabilityCPU) &&
+                   ((params->toCapabilities & kIOPMSystemCapabilityCPU) == 0)) {
+                       lock();
+                       fSystemOff         = true;
+                       fSystemPowerAckRef = params->notifyRef;
+                       fSystemPowerAckTo  = service;
+                       unlock();
+
+                       matchingEnd(NULL);
+
+                       params->maxWaitForReply = 60 * 1000 * 1000;
+                       ret = kIOReturnSuccess;
+               } else if ((params->changeFlags & kIOPMSystemCapabilityWillChange) &&
+                   ((params->fromCapabilities & kIOPMSystemCapabilityCPU) == 0) &&
+                   (params->toCapabilities & kIOPMSystemCapabilityCPU)) {
+                       lock();
+                       fSystemOff = false;
+                       unlock();
+
+                       matchingEnd(NULL);
+
+                       params->maxWaitForReply = 0;
+                       ret                 = kIOReturnSuccess;
+               } else {
+                       params->maxWaitForReply = 0;
+                       ret                 = kIOReturnSuccess;
+               }
+               break;
+
+       default:
+               ret = kIOReturnUnsupported;
+               break;
+       }
+
+       return ret;
+}
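The handler only acts when the CPU capability toggles: a will-change that drops it marks the system off and defers the power-change ack to serverAck(); a will-change that restores it clears fSystemOff so matchingEnd() can replay deferred work. The two predicates, factored out as a sketch (the capability constants are real IOKit flags; the helper functions are illustrative):

    static bool
    cpuGoingDown(uint32_t changeFlags, uint32_t from, uint32_t to)
    {
        return (changeFlags & kIOPMSystemCapabilityWillChange)
               && (from & kIOPMSystemCapabilityCPU)
               && !(to & kIOPMSystemCapabilityCPU);
    }

    static bool
    cpuComingUp(uint32_t changeFlags, uint32_t from, uint32_t to)
    {
        return (changeFlags & kIOPMSystemCapabilityWillChange)
               && !(from & kIOPMSystemCapabilityCPU)
               && (to & kIOPMSystemCapabilityCPU);
    }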
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
 /*
  * Start a previously attached & probed instance,
  * called on exporting object instance
@@ -3686,42 +4266,164 @@ bool
 IOService::startCandidate( IOService * service )
 {
        bool                ok;
+       OSObject          * obj;
+       OSObject          * prop;
+       IOUserServer      * userServer;
+       bool                ph;
 
-       ok = service->attach( this );
+       userServer = NULL;
+       obj = service->copyProperty(gIOUserServerNameKey);
 
-       if (ok) {
-               if (this != gIOResources) {
-                       // stall for any nub resources
-                       checkResources();
-                       // stall for any driver resources
-                       service->checkResources();
-               }
+       if (obj && (this == gIOResources)) {
+               ok = false;
+       } else {
+               ok = service->attach( this );
+       }
+       if (!ok) {
+               return false;
+       }
 
-               AbsoluteTime startTime;
-               AbsoluteTime endTime;
-               UInt64       nano;
+       if ((this != gIOResources) && (this != gIOUserResources)) {
+               // stall for any nub resources
+               checkResources();
+               // stall for any driver resources
+               service->checkResources();
+       }
+       ph = false;
+       {
+               OSString       * bundleID;
+               OSString       * serverName;
+               OSString       * str;
+               const OSSymbol * sym;
+               OSDictionary   * matching;
+               IOService      * server;
+               OSNumber       * serverTag;
+               uint64_t         entryID;
+
+               if ((serverName = OSDynamicCast(OSString, obj))) {
+                       obj       = service->copyProperty(gIOModuleIdentifierKey);
+                       bundleID  = OSDynamicCast(OSString, obj);
+                       entryID   = service->getRegistryEntryID();
+                       serverTag = OSNumber::withNumber(entryID, 64);
+
+                       if (gKextdWillTerminate) {
+                               DKLOG("%s disabled in shutdown\n", serverName->getCStringNoCopy());
+                               service->detach(this);
+                               OSSafeReleaseNULL(obj);
+                               return false;
+                       }
 
-               if (kIOLogStart & gIOKitDebug) {
-                       clock_get_uptime(&startTime);
-               }
+                       ph = IOServicePH::matchingStart(this);
+                       if (!ph) {
+                               DKLOG("%s deferred in sleep\n", serverName->getCStringNoCopy());
+                               service->detach(this);
+                               OSSafeReleaseNULL(obj);
+                               return false;
+                       }
+
+                       prop = service->copyProperty(gIOUserClassKey);
+                       str = OSDynamicCast(OSString, prop);
+                       if (str) {
+                               service->setName(str);
+                       }
+                       OSSafeReleaseNULL(prop);
 
-               ok = service->start(this);
+                       if (!(kIODKDisableDextLaunch & gIODKDebug)) {
+                               OSKext::requestDaemonLaunch(bundleID, serverName, serverTag);
+                       }
+                       sym = OSSymbol::withString(serverName);
+                       matching = serviceMatching(gIOUserServerClassKey);
+                       propertyMatching(gIOUserServerNameKey, sym, matching);
+                       if (!(kIODKDisableDextTag & gIODKDebug)) {
+                               propertyMatching(gIOUserServerTagKey, serverTag, matching);
+                       }
 
-               if (kIOLogStart & gIOKitDebug) {
-                       clock_get_uptime(&endTime);
+                       server = __WAITING_FOR_USER_SERVER__(matching);
+                       matching->release();
+                       OSSafeReleaseNULL(serverTag);
+                       OSSafeReleaseNULL(serverName);
+
+                       userServer = OSDynamicCast(IOUserServer, server);
+                       if (!userServer) {
+                               service->detach(this);
+                               IOServicePH::matchingEnd(this);
+                               DKLOG(DKS " user server timeout\n", DKN(service));
+                               return false;
+                       }
 
-                       if (CMP_ABSOLUTETIME(&endTime, &startTime) > 0) {
-                               SUB_ABSOLUTETIME(&endTime, &startTime);
-                               absolutetime_to_nanoseconds(endTime, &nano);
-                               if (nano > 500000000ULL) {
-                                       IOLog("%s::start took %ld ms\n", service->getName(), (long)(UInt32)(nano / 1000000ULL));
+                       if (!(kIODKDisableCDHashChecking & gIODKDebug)) {
+                               if (!userServer->serviceMatchesCDHash(service)) {
+                                       service->detach(this);
+                                       IOServicePH::matchingEnd(this);
+                                       userServer->exit("CDHash check failed");
+                                       userServer->release();
+                                       return false;
                                }
                        }
+                       OSKext *kext = OSKext::lookupKextWithIdentifier(bundleID);
+                       if (!kext) {
+                               const char *name = bundleID->getCStringNoCopy();
+                               IOLog("%s Could not find OSKext for %s\n", __func__, name);
+                               goto skip_log;
+                       }
+
+                       /*
+                        * Used for logging
+                        */
+                       userServer->setTaskLoadTag(kext);
+                       userServer->setDriverKitUUID(kext);
+                       OSKext::OSKextLogDriverKitInfoLoad(kext);
+skip_log:
+                       OSSafeReleaseNULL(bundleID);
+                       OSSafeReleaseNULL(kext);
+
+                       if (!(kIODKDisableEntitlementChecking & gIODKDebug)) {
+                               if (!userServer->checkEntitlements(this, service)) {
+                                       service->detach(this);
+                                       IOServicePH::matchingEnd(this);
+                                       userServer->exit("Entitlements check failed");
+                                       userServer->release();
+                                       return false;
+                               }
+                       }
+
+                       userServer->serviceAttach(service, this);
                }
-               if (!ok) {
-                       service->detach( this );
+       }
+
+       AbsoluteTime startTime;
+       AbsoluteTime endTime;
+       UInt64       nano;
+
+       if (kIOLogStart & gIOKitDebug) {
+               clock_get_uptime(&startTime);
+       }
+
+       ok = service->start(this);
+
+       if (kIOLogStart & gIOKitDebug) {
+               clock_get_uptime(&endTime);
+
+               if (CMP_ABSOLUTETIME(&endTime, &startTime) > 0) {
+                       SUB_ABSOLUTETIME(&endTime, &startTime);
+                       absolutetime_to_nanoseconds(endTime, &nano);
+                       if (nano > 500000000ULL) {
+                               IOLog("%s::start took %ld ms\n", service->getName(), (long)(UInt32)(nano / 1000000ULL));
+                       }
                }
        }
+       if (userServer) {
+               userServer->serviceStarted(service, this, ok);
+               userServer->release();
+       }
+       if (!ok) {
+               service->detach( this );
+       }
+
+       if (ph) {
+               IOServicePH::matchingEnd(this);
+       }
+
        return ok;
 }
 
@@ -3739,7 +4441,7 @@ IOService::publishResource( const char * key, OSObject * value )
 void
 IOService::publishResource( const OSSymbol * key, OSObject * value )
 {
-       if (0 == value) {
+       if (NULL == value) {
                value = (OSObject *) gIOServiceKey;
        }
 
@@ -3753,6 +4455,23 @@ IOService::publishResource( const OSSymbol * key, OSObject * value )
        gIOResources->registerService();
 }
 
+void
+IOService::publishUserResource( const OSSymbol * key, OSObject * value )
+{
+       if (NULL == value) {
+               value = (OSObject *) gIOServiceKey;
+       }
+
+       gIOUserResources->setProperty( key, value);
+
+       if (IORecursiveLockHaveLock( gNotificationLock)) {
+               return;
+       }
+
+       gIOResourceGenerationCount++;
+       gIOUserResources->registerService();
+}
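publishUserResource() mirrors publishResource() but stores the value on gIOUserResources, the nub that user (dext) resource matching runs against, and bumps the same generation count so pending waiters re-evaluate. A hedged usage sketch, assuming the same static linkage as publishResource(); the key string is invented:

    const OSSymbol * key = OSSymbol::withCString("com.example.dext-resource");
    if (key) {
        // kOSBooleanTrue is a global singleton; setProperty() retains it harmlessly.
        IOService::publishUserResource(key, kOSBooleanTrue);
        key->release();
    }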
+
 bool
 IOService::addNeededResource( const char * key )
 {
@@ -3836,13 +4555,13 @@ IOService::checkResources( void )
        bool                ok;
 
        resourcesProp = copyProperty( gIOResourceMatchKey );
-       if (0 == resourcesProp) {
+       if (NULL == resourcesProp) {
                return true;
        }
 
        if ((set = OSDynamicCast( OSSet, resourcesProp ))) {
                iter = OSCollectionIterator::withCollection( set );
-               ok = (0 != iter);
+               ok = (NULL != iter);
                while (ok && (resourcesProp = iter->getNextObject())) {
                        ok = checkResource( resourcesProp );
                }
@@ -3860,7 +4579,7 @@ IOService::checkResources( void )
 
 
 void
-_IOConfigThread::configThread( void )
+_IOConfigThread::configThread( int configThreadId )
 {
        _IOConfigThread *   inst;
 
@@ -3871,11 +4590,16 @@ _IOConfigThread::configThread( void )
                if (!inst->init()) {
                        continue;
                }
-               thread_t unused;
-               if (KERN_SUCCESS != kernel_thread_start(&_IOConfigThread::main, inst, &unused)) {
+               thread_t thread;
+               if (KERN_SUCCESS != kernel_thread_start(&_IOConfigThread::main, inst, &thread)) {
                        continue;
                }
 
+               char threadName[MAXTHREADNAMESIZE];
+               snprintf(threadName, sizeof(threadName), "IOConfigThread_%d", configThreadId);
+               thread_set_thread_name(thread, threadName);
+               thread_deallocate(thread);
+
                return;
        } while (false);
 
@@ -3886,25 +4610,18 @@ _IOConfigThread::configThread( void )
        return;
 }
 
-void
-_IOConfigThread::free( void )
-{
-       thread_deallocate(current_thread());
-       OSObject::free();
-}
-
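With the creating thread now naming the new thread and dropping its reference immediately, the free() override that deallocated the reference from inside the config thread itself becomes dead code, hence its removal. The ownership rule, sketched with a hypothetical entry point:

    // kernel_thread_start() hands back a +1 reference on the new thread;
    // thread_set_thread_name() takes no reference of its own, so the
    // creator can (and must) drop that reference exactly once.
    static void example_main(void * param, wait_result_t wr);   // hypothetical

    thread_t t;
    if (KERN_SUCCESS == kernel_thread_start(example_main, NULL, &t)) {
        thread_set_thread_name(t, "ExampleThread");
        thread_deallocate(t);
    }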
 void
 IOService::doServiceMatch( IOOptionBits options )
 {
        _IOServiceNotifier * notify;
        OSIterator *        iter;
        OSOrderedSet *      matches;
-       OSArray *           resourceKeys = 0;
+       OSArray *           resourceKeys = NULL;
        SInt32              catalogGeneration;
        bool                keepGuessing = true;
        bool                reRegistered = true;
        bool                didRegister;
-       OSArray *           notifiers[2] = {0};
+       OSArray *           notifiers[2] = {NULL};
 
 //    job->nub->deliverNotification( gIOPublishNotification,
 //                              kIOServiceRegisteredState, 0xffffffff );
@@ -3946,7 +4663,7 @@ IOService::doServiceMatch( IOOptionBits options )
                        invokeNotifiers(&notifiers[0]);
 
                        if (keepGuessing && matches->getCount() && (kIOReturnSuccess == getResources())) {
-                               if (this == gIOResources) {
+                               if ((this == gIOResources) || (this == gIOUserResources)) {
                                        if (resourceKeys) {
                                                resourceKeys->release();
                                        }
@@ -4179,7 +4896,17 @@ IOService::waitQuiet( uint64_t timeout )
        bool     kextdWait;
        bool     dopanic;
 
+#if KASAN
+       /*
+        * On kasan kernels, everything takes longer, so double the number of
+        * timeout extensions. This should help with issues like 41259215
+        * where WindowServer was timing out waiting for kextd to get all the
+        * kasan kexts loaded and started.
+        */
+       enum { kTimeoutExtensions = 8 };
+#else
        enum { kTimeoutExtensions = 4 };
+#endif
 
        time = mach_absolute_time();
        kextdWait = false;
@@ -4194,7 +4921,7 @@ IOService::waitQuiet( uint64_t timeout )
                        break;
                } else if (kIOReturnTimeout != ret) {
                        break;
-               } else if (timeout < 41000000000) {
+               } else if (timeout < (4100ull * NSEC_PER_SEC)) {
                        break;
                }
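Two related tuning changes above: KASAN kernels get eight timeout extensions instead of four, and the early-out threshold is rewritten as (4100ull * NSEC_PER_SEC), which at 4100 s is a hundredfold larger than the old 41 s literal, so only callers with very long timeouts stay in the extension loop at all. Assuming each extension re-arms the caller's full timeout, a back-of-envelope bound on the stall before the panic decision:

    static uint64_t
    worstCaseWaitNs(uint64_t timeoutNs, uint32_t extensions)
    {
        // e.g. a 30 s timeout: ~150 s with 4 extensions, ~270 s with KASAN's 8.
        return timeoutNs * (1 + extensions);
    }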
 
@@ -4459,7 +5186,7 @@ _IOServiceJob::pingConfig( _IOServiceJob * job )
                if (gIOKitDebug & kIOLogConfig) {
                        LOG("config(%d): creating\n", gNumConfigThreads - 1);
                }
-               _IOConfigThread::configThread();
+               _IOConfigThread::configThread(gNumConfigThreads - 1);
        }
 
        semaphore_signal( gJobsSemaphore );
@@ -4517,14 +5244,14 @@ OSObject *
 IOService::copyExistingServices( OSDictionary * matching,
     IOOptionBits inState, IOOptionBits options )
 {
-       OSObject *   current = 0;
+       OSObject *   current = NULL;
        OSIterator * iter;
        IOService *  service;
        OSObject *   obj;
        OSString *   str;
 
        if (!matching) {
-               return 0;
+               return NULL;
        }
 
 #if MATCH_DEBUG
@@ -4553,7 +5280,7 @@ IOService::copyExistingServices( OSDictionary * matching,
                ctx.count   = 0;
                ctx.done    = 0;
                ctx.options = options;
-               ctx.result  = 0;
+               ctx.result  = NULL;
 
                if ((str = OSDynamicCast(OSString, obj))) {
                        const OSSymbol * sym = OSSymbol::withString(str);
@@ -4570,7 +5297,7 @@ IOService::copyExistingServices( OSDictionary * matching,
                if (current && (ctx.done != ctx.count)) {
                        OSSet *
                            source = OSDynamicCast(OSSet, current);
-                       current = 0;
+                       current = NULL;
                        while ((service = (IOService *) source->getAnyObject())) {
                                if (service->matchPassive(matching, options)) {
                                        if (options & kIONotifyOnce) {
@@ -4621,7 +5348,6 @@ IOService::copyExistingServices( OSDictionary * matching,
                        iter->release();
                }
 
-
                if (((current != 0) != (_current != 0))
                    || (current && _current && !current->isEqualTo(_current))) {
                        OSSerialize * s1 = OSSerialize::withCapacity(128);
@@ -4704,17 +5430,17 @@ IOService::setNotification(
        IOServiceMatchingNotificationHandler handler, void * target, void * ref,
        SInt32 priority )
 {
-       _IOServiceNotifier * notify = 0;
+       _IOServiceNotifier * notify = NULL;
        OSOrderedSet *      set;
 
        if (!matching) {
-               return 0;
+               return NULL;
        }
 
        notify = new _IOServiceNotifier;
        if (notify && !notify->init()) {
                notify->release();
-               notify = 0;
+               notify = NULL;
        }
 
        if (notify) {
@@ -4735,9 +5461,9 @@ IOService::setNotification(
 
                ////// queue
 
-               if (0 == (set = (OSOrderedSet *) gNotifications->getObject( type ))) {
+               if (NULL == (set = (OSOrderedSet *) gNotifications->getObject( type ))) {
                        set = OSOrderedSet::withCapacity( 1,
-                           IONotifyOrdering, 0 );
+                           IONotifyOrdering, NULL );
                        if (set) {
                                gNotifications->setObject( type, set );
                                set->release();
@@ -4765,7 +5491,7 @@ IOService::doInstallNotification(
        IOOptionBits        inState;
 
        if (!matching) {
-               return 0;
+               return NULL;
        }
 
        if (type == gIOPublishNotification) {
@@ -4779,7 +5505,7 @@ IOService::doInstallNotification(
        } else if ((type == gIOTerminatedNotification) || (type == gIOWillTerminateNotification)) {
                inState = 0;
        } else {
-               return 0;
+               return NULL;
        }
 
        notify = setNotification( type, matching, handler, target, ref, priority );
@@ -4788,7 +5514,7 @@ IOService::doInstallNotification(
                // get the current set
                exist = (OSIterator *) copyExistingServices( matching, inState );
        } else {
-               exist = 0;
+               exist = NULL;
        }
 
        *existing = exist;
@@ -4882,7 +5608,7 @@ IOService::addMatchingNotification(
        ret = notify = (_IOServiceNotifier *) installNotification( type, matching,
            handler, target, ref, priority, &existing );
        if (!ret) {
-               return 0;
+               return NULL;
        }
 
        // send notifications for existing set
@@ -4896,7 +5622,7 @@ IOService::addMatchingNotification(
        }
 
        LOCKWRITENOTIFY();
-       bool removed = (0 == notify->whence);
+       bool removed = (NULL == notify->whence);
        notify->release();
        if (removed) {
                ret = gIOServiceNullNotifier;
@@ -4960,13 +5686,13 @@ IOService *
 IOService::waitForMatchingService( OSDictionary * matching,
     uint64_t timeout)
 {
-       IONotifier *        notify = 0;
+       IONotifier *        notify = NULL;
        // priority doesn't help us much since we need a thread wakeup
        SInt32              priority = 0;
        IOService *         result;
 
        if (!matching) {
-               return 0;
+               return NULL;
        }
 
        result = NULL;
@@ -4979,7 +5705,7 @@ IOService::waitForMatchingService( OSDictionary * matching,
                        break;
                }
                notify = IOService::setNotification( gIOMatchedNotification, matching,
-                   &IOService::syncNotificationHandler, (void *) 0,
+                   &IOService::syncNotificationHandler, (void *) NULL,
                    &result, priority );
                if (!notify) {
                        break;
@@ -5027,6 +5753,7 @@ IOService::waitForService( OSDictionary * matching,
        return result;
 }
 
+__dead2
 void
 IOService::deliverNotification( const OSSymbol * type,
     IOOptionBits orNewState, IOOptionBits andNewState )
@@ -5040,7 +5767,7 @@ IOService::copyNotifiers(const OSSymbol * type,
 {
        _IOServiceNotifier * notify;
        OSIterator *         iter;
-       OSArray *            willSend = 0;
+       OSArray *            willSend = NULL;
 
        lockForArbitration();
 
@@ -5056,7 +5783,7 @@ IOService::copyNotifiers(const OSSymbol * type,
                        while ((notify = (_IOServiceNotifier *) iter->getNextObject())) {
                                if (matchPassive(notify->matching, 0)
                                    && (kIOServiceNotifyEnable & notify->state)) {
-                                       if (0 == willSend) {
+                                       if (NULL == willSend) {
                                                willSend = OSArray::withCapacity(8);
                                        }
                                        if (willSend) {
@@ -5093,7 +5820,7 @@ IOService::serviceMatching( const OSString * name,
 
        str = OSSymbol::withString(name);
        if (!str) {
-               return 0;
+               return NULL;
        }
 
        if (!table) {
@@ -5115,7 +5842,7 @@ IOService::serviceMatching( const char * name,
 
        str = OSSymbol::withCString( name );
        if (!str) {
-               return 0;
+               return NULL;
        }
 
        table = serviceMatching( str, table );
@@ -5145,7 +5872,7 @@ IOService::nameMatching( const char * name,
 
        str = OSSymbol::withCString( name );
        if (!str) {
-               return 0;
+               return NULL;
        }
 
        table = nameMatching( str, table );
@@ -5173,7 +5900,7 @@ IOService::resourceMatching( const char * name,
 
        str = OSSymbol::withCString( name );
        if (!str) {
-               return 0;
+               return NULL;
        }
 
        table = resourceMatching( str, table );
@@ -5190,7 +5917,7 @@ IOService::propertyMatching( const OSSymbol * key, const OSObject * value,
 
        properties = OSDictionary::withCapacity( 2 );
        if (!properties) {
-               return 0;
+               return NULL;
        }
        properties->setObject( key, value );
 
@@ -5214,7 +5941,7 @@ IOService::registryEntryIDMatching( uint64_t entryID,
 
        num = OSNumber::withNumber( entryID, 64 );
        if (!num) {
-               return 0;
+               return NULL;
        }
 
        if (!table) {
@@ -5280,11 +6007,11 @@ _IOServiceNotifier::remove()
 
        if (whence) {
                whence->removeObject((OSObject *) this );
-               whence = 0;
+               whence = NULL;
        }
        if (matching) {
                matching->release();
-               matching = 0;
+               matching = NULL;
        }
 
        state &= ~kIOServiceNotifyEnable;
@@ -5373,7 +6100,7 @@ IOResources::resources( void )
        inst = new IOResources;
        if (inst && !inst->init()) {
                inst->release();
-               inst = 0;
+               inst = NULL;
        }
 
        return inst;
@@ -5397,8 +6124,8 @@ IOResources::init( OSDictionary * dictionary )
                OSNumber *num;
                const OSSymbol *        sym;
 
-               if ((num = OSNumber::withNumber(property_value, 32)) != 0) {
-                       if ((sym = OSSymbol::withCString( property_name)) != 0) {
+               if ((num = OSNumber::withNumber(property_value, 32)) != NULL) {
+                       if ((sym = OSSymbol::withCString( property_name)) != NULL) {
                                this->setProperty( sym, num );
                                sym->release();
                        }
@@ -5429,8 +6156,8 @@ IOResources::getWorkLoop() const
        }
 }
 
-bool
-IOResources::matchPropertyTable( OSDictionary * table )
+static bool
+IOResourcesMatchPropertyTable(IOService * resources, OSDictionary * table)
 {
        OSObject *          prop;
        OSString *          str;
@@ -5443,19 +6170,19 @@ IOResources::matchPropertyTable( OSDictionary * table )
        prop = table->getObject( gIOResourceMatchKey );
        str = OSDynamicCast( OSString, prop );
        if (str) {
-               ok = (0 != getProperty( str ));
+               ok = (NULL != resources->getProperty( str ));
        } else if ((set = OSDynamicCast( OSSet, prop))) {
                iter = OSCollectionIterator::withCollection( set );
-               ok = (iter != 0);
+               ok = (iter != NULL);
                while (ok && (str = OSDynamicCast( OSString, iter->getNextObject()))) {
-                       ok = (0 != getProperty( str ));
+                       ok = (NULL != resources->getProperty( str ));
                }
 
                if (iter) {
                        iter->release();
                }
        } else if ((prop = table->getObject(gIOResourceMatchedKey))) {
-               obj = copyProperty(gIOResourceMatchedKey);
+               obj = resources->copyProperty(gIOResourceMatchedKey);
                keys = OSDynamicCast(OSArray, obj);
                ok = false;
                if (keys) {
@@ -5468,6 +6195,62 @@ IOResources::matchPropertyTable( OSDictionary * table )
        return ok;
 }
 
+bool
+IOResources::matchPropertyTable( OSDictionary * table )
+{
+       return IOResourcesMatchPropertyTable(this, table);
+}
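Hoisting the body into the file-static IOResourcesMatchPropertyTable() lets IOUserResources below reuse the identical string / OSSet / matched-key logic against its own property table. For reference, a hedged sketch that exercises this matching path through the long-standing "IOBSD" resource:

    OSDictionary * matching = IOService::resourceMatching("IOBSD");
    if (matching) {
        IOService * res = IOService::waitForMatchingService(matching, 30 * NSEC_PER_SEC);
        OSSafeReleaseNULL(res);
        OSSafeReleaseNULL(matching);
    }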
+
+/*
+ * IOUserResources
+ */
+
+IOService *
+IOUserResources::resources( void )
+{
+       IOUserResources *       inst;
+
+       inst = OSTypeAlloc(IOUserResources);
+       if (inst && !inst->init()) {
+               inst->release();
+               inst = NULL;
+       }
+
+       return inst;
+}
+
+bool
+IOUserResources::init( OSDictionary * dictionary )
+{
+       // Do super init first
+       if (!IOService::init()) {
+               return false;
+       }
+       return true;
+}
+
+IOReturn
+IOUserResources::newUserClient(task_t owningTask, void * securityID,
+    UInt32 type, OSDictionary * properties,
+    IOUserClient ** handler)
+{
+       return kIOReturnUnsupported;
+}
+
+IOWorkLoop *
+IOUserResources::getWorkLoop() const
+{
+       return getPlatform()->getWorkLoop();
+}
+
+bool
+IOUserResources::matchPropertyTable( OSDictionary * table )
+{
+       return IOResourcesMatchPropertyTable(this, table);
+}
+
+// --
+
 void
 IOService::consoleLockTimer(thread_call_param_t p0, thread_call_param_t p1)
 {
@@ -5507,7 +6290,7 @@ IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage)
                                gIOConsoleBooterLockState = kOSBooleanTrue;
                                break;
                        case kIOScreenLockNoLock:
-                               gIOConsoleBooterLockState = 0;
+                               gIOConsoleBooterLockState = NULL;
                                break;
                        case kIOScreenLockUnlocked:
                        default:
@@ -5519,7 +6302,7 @@ IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage)
        }
 
        if (consoleUsers) {
-               OSNumber * num = 0;
+               OSNumber * num = NULL;
                bool       loginLocked = true;
 
                gIOConsoleLoggedIn = false;
@@ -5536,10 +6319,10 @@ IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage)
                }
 #if HIBERNATION
                if (!loginLocked) {
-                       gIOConsoleBooterLockState = 0;
+                       gIOConsoleBooterLockState = NULL;
                }
                IOLog("IOConsoleUsers: time(%d) %ld->%d, lin %d, llk %d, \n",
-                   (num != 0), gIOConsoleLockTime, (num ? num->unsigned32BitValue() : 0),
+                   (num != NULL), gIOConsoleLockTime, (num ? num->unsigned32BitValue() : 0),
                    gIOConsoleLoggedIn, loginLocked);
 #endif /* HIBERNATION */
                gIOConsoleLockTime = num ? num->unsigned32BitValue() : 0;
@@ -5587,7 +6370,7 @@ IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage)
                gIOChosenEntry->setProperty(kIOScreenLockStateKey, &gIOScreenLockState, sizeof(gIOScreenLockState));
 
                IOLog("IOConsoleUsers: gIOScreenLockState %d, hs %d, bs %d, now %ld, sm 0x%x\n",
-                   gIOScreenLockState, gIOHibernateState, (gIOConsoleBooterLockState != 0), now, systemMessage);
+                   gIOScreenLockState, gIOHibernateState, (gIOConsoleBooterLockState != NULL), now, systemMessage);
        }
 #endif /* HIBERNATION */
 
@@ -5622,12 +6405,12 @@ IOResources::setProperties( OSObject * properties )
        }
 
        dict = OSDynamicCast(OSDictionary, properties);
-       if (0 == dict) {
+       if (NULL == dict) {
                return kIOReturnBadArgument;
        }
 
        iter = OSCollectionIterator::withCollection( dict);
-       if (0 == iter) {
+       if (NULL == iter) {
                return kIOReturnBadArgument;
        }
 
@@ -5736,13 +6519,16 @@ IOService::addLocation( OSDictionary * table )
        OSDictionary *      dict;
 
        if (!table) {
-               return 0;
+               return NULL;
        }
 
        dict = OSDictionary::withCapacity( 1 );
        if (dict) {
-               table->setObject( gIOLocationMatchKey, dict );
+               bool ok = table->setObject( gIOLocationMatchKey, dict );
                dict->release();
+               if (!ok) {
+                       dict = NULL;
+               }
        }
 
        return dict;
@@ -5786,7 +6572,7 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did)
                str = OSDynamicCast(OSString, table->getObject(gIOProviderClassKey));
                if (str) {
                        done++;
-                       match = ((kIOServiceClassDone & options) || (0 != metaCast(str)));
+                       match = ((kIOServiceClassDone & options) || (NULL != metaCast(str)));
 #if MATCH_DEBUG
                        match = (0 != metaCast( str ));
                        if ((kIOServiceClassDone & options) && !match) {
@@ -5801,7 +6587,7 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did)
                obj = table->getObject( gIONameMatchKey );
                if (obj) {
                        done++;
-                       match = compareNames( obj, changesOK ? &matched : 0 );
+                       match = compareNames( obj, changesOK ? &matched : NULL );
                        if (!match) {
                                break;
                        }
@@ -5841,20 +6627,20 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did)
                        if (dict) {
                                nextDict = OSDynamicCast( OSDictionary, obj);
                                if (nextDict) {
-                                       iter = 0;
+                                       iter = NULL;
                                } else {
                                        iter = OSCollectionIterator::withCollection(
                                                OSDynamicCast(OSCollection, obj));
                                }
 
                                while (nextDict
-                                   || (iter && (0 != (nextDict = OSDynamicCast(OSDictionary,
+                                   || (iter && (NULL != (nextDict = OSDynamicCast(OSDictionary,
                                    iter->getNextObject()))))) {
                                        match = dict->isEqualTo( nextDict, nextDict);
                                        if (match) {
                                                break;
                                        }
-                                       nextDict = 0;
+                                       nextDict = NULL;
                                }
                                dict->release();
                                if (iter) {
@@ -5877,20 +6663,20 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did)
                        if (dict) {
                                nextKey = OSDynamicCast( OSString, obj);
                                if (nextKey) {
-                                       iter = 0;
+                                       iter = NULL;
                                } else {
                                        iter = OSCollectionIterator::withCollection(
                                                OSDynamicCast(OSCollection, obj));
                                }
 
                                while (nextKey
-                                   || (iter && (0 != (nextKey = OSDynamicCast(OSString,
+                                   || (iter && (NULL != (nextKey = OSDynamicCast(OSString,
                                    iter->getNextObject()))))) {
-                                       match = (0 != dict->getObject(nextKey));
+                                       match = (NULL != dict->getObject(nextKey));
                                        if (match) {
                                                break;
                                        }
-                                       nextKey = 0;
+                                       nextKey = NULL;
                                }
                                dict->release();
                                if (iter) {
@@ -5927,7 +6713,7 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did)
                num = OSDynamicCast( OSNumber, table->getObject( gIOMatchedServiceCountKey ));
                if (num) {
                        OSIterator *        iter;
-                       IOService *         service = 0;
+                       IOService *         service = NULL;
                        UInt32              serviceCount = 0;
 
                        done++;
@@ -5937,7 +6723,7 @@ IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did)
                                        if (kIOServiceInactiveState & service->__state[0]) {
                                                continue;
                                        }
-                                       if (0 == service->getProperty( gIOMatchCategoryKey )) {
+                                       if (NULL == service->getProperty( gIOMatchCategoryKey )) {
                                                continue;
                                        }
                                        ++serviceCount;
@@ -6135,11 +6921,15 @@ IOService::newUserClient( task_t owningTask, void * securityID,
     UInt32 type, OSDictionary * properties,
     IOUserClient ** handler )
 {
-       const OSSymbol *userClientClass = 0;
+       const OSSymbol *userClientClass = NULL;
        IOUserClient *client;
        OSObject *prop;
        OSObject *temp;
 
+       if (reserved && reserved->uvars && reserved->uvars->userServer) {
+               return reserved->uvars->userServer->serviceNewUserClient(this, owningTask, securityID, type, properties, handler);
+       }
+
        if (kIOReturnSuccess == newUserClient( owningTask, securityID, type, handler )) {
                return kIOReturnSuccess;
        }
@@ -6417,7 +7207,7 @@ IOService::getDeviceMemoryWithIndex( unsigned int index )
        if (array) {
                range = (IODeviceMemory *) array->getObject( index );
        } else {
-               range = 0;
+               range = NULL;
        }
 
        return range;
@@ -6434,7 +7224,7 @@ IOService::mapDeviceMemoryWithIndex( unsigned int index,
        if (range) {
                map = range->map( options );
        } else {
-               map = 0;
+               map = NULL;
        }
 
        return map;
@@ -6539,7 +7329,7 @@ requireMaxCpuDelay(IOService * service, UInt32 ns, UInt32 delayType)
 
                // Check if entry found
                if (kNoReplace != replace) {
-                       entries[replace].fService = 0; // Null the entry
+                       entries[replace].fService = NULL; // Null the entry
                        setCpuDelay = true;
                }
        }
@@ -6665,16 +7455,16 @@ IOService::resolveInterrupt(IOService *nub, int source)
 
        // Get the parents list from the nub.
        array = OSDynamicCast(OSArray, nub->getProperty(gIOInterruptControllersKey));
-       if (array == 0) {
+       if (array == NULL) {
                return kIOReturnNoResources;
        }
 
        // Allocate space for the IOInterruptSources if needed... then return early.
-       if (nub->_interruptSources == 0) {
+       if (nub->_interruptSources == NULL) {
                numSources = array->getCount();
                interruptSources = (IOInterruptSource *)IOMalloc(
                        numSources * sizeofAllIOInterruptSource);
-               if (interruptSources == 0) {
+               if (interruptSources == NULL) {
                        return kIOReturnNoMemory;
                }
 
@@ -6686,22 +7476,22 @@ IOService::resolveInterrupt(IOService *nub, int source)
        }
 
        interruptControllerName = OSDynamicCast(OSSymbol, array->getObject(source));
-       if (interruptControllerName == 0) {
+       if (interruptControllerName == NULL) {
                return kIOReturnNoResources;
        }
 
        interruptController = getPlatform()->lookUpInterruptController(interruptControllerName);
-       if (interruptController == 0) {
+       if (interruptController == NULL) {
                return kIOReturnNoResources;
        }
 
        // Get the interrupt numbers from the nub.
        array = OSDynamicCast(OSArray, nub->getProperty(gIOInterruptSpecifiersKey));
-       if (array == 0) {
+       if (array == NULL) {
                return kIOReturnNoResources;
        }
        data = OSDynamicCast(OSData, array->getObject(source));
-       if (data == 0) {
+       if (data == NULL) {
                return kIOReturnNoResources;
        }
 
@@ -6719,7 +7509,7 @@ IOService::lookupInterrupt(int source, bool resolve, IOInterruptController **int
        IOReturn ret;
 
        /* Make sure the _interruptSources are set */
-       if (_interruptSources == 0) {
+       if (_interruptSources == NULL) {
                ret = resolveInterrupt(this, source);
                if (ret != kIOReturnSuccess) {
                        return ret;
@@ -7209,6 +7999,9 @@ IOService::setAuthorizationID( uint64_t authorizationID )
        return status;
 }
 
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+
 #if __LP64__
 OSMetaClassDefineReservedUsed(IOService, 0);
 OSMetaClassDefineReservedUsed(IOService, 1);
index c4d1c96fd1f0f1fb78fec8339fb5fbca6be887fa..5cdc56a78209b57c92be5b383ea728a1dc7a3784 100644 (file)
@@ -37,6 +37,7 @@
 #include <IOKit/IOCommand.h>
 #include <IOKit/IOTimeStamp.h>
 #include <IOKit/IOReportMacros.h>
+#include <IOKit/IODeviceTreeSupport.h>
 
 #include <IOKit/pwr_mgt/IOPMlog.h>
 #include <IOKit/pwr_mgt/IOPMinformee.h>
@@ -86,36 +87,23 @@ static bool                  gIOPMInitialized       = false;
 static uint32_t              gIOPMBusyRequestCount  = 0;
 static uint32_t              gIOPMWorkInvokeCount   = 0;
 static uint32_t              gIOPMTickleGeneration  = 0;
-static IOWorkLoop *          gIOPMWorkLoop          = 0;
-static IOPMRequestQueue *    gIOPMRequestQueue      = 0;
-static IOPMRequestQueue *    gIOPMReplyQueue        = 0;
-static IOPMWorkQueue *       gIOPMWorkQueue         = 0;
-static IOPMCompletionQueue * gIOPMCompletionQueue   = 0;
-static IOPMRequest *         gIOPMRequest           = 0;
-static IOService *           gIOPMRootNode          = 0;
-static IOPlatformExpert *    gPlatform              = 0;
-
-static char                  gIOSpinDumpKextName[128];
-static char                  gIOSpinDumpDelayType[16];
-static uint32_t              gIOSpinDumpDelayDuration = 0;
-
-static SYSCTL_STRING(_debug, OID_AUTO, swd_kext_name,
-    CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    &gIOSpinDumpKextName, sizeof(gIOSpinDumpKextName), "");
-static SYSCTL_STRING(_debug, OID_AUTO, swd_delay_type,
-    CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    &gIOSpinDumpDelayType, sizeof(gIOSpinDumpDelayType), "");
-static SYSCTL_INT(_debug, OID_AUTO, swd_delay_duration,
-    CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    &gIOSpinDumpDelayDuration, 0, "");
-
-const OSSymbol *             gIOPMPowerClientDevice     = 0;
-const OSSymbol *             gIOPMPowerClientDriver     = 0;
-const OSSymbol *             gIOPMPowerClientChildProxy = 0;
-const OSSymbol *             gIOPMPowerClientChildren   = 0;
-const OSSymbol *             gIOPMPowerClientRootDomain = 0;
-
-static const OSSymbol *      gIOPMPowerClientAdvisoryTickle = 0;
+static IOWorkLoop *          gIOPMWorkLoop          = NULL;
+static IOPMRequestQueue *    gIOPMRequestQueue      = NULL;
+static IOPMRequestQueue *    gIOPMReplyQueue        = NULL;
+static IOPMWorkQueue *       gIOPMWorkQueue         = NULL;
+static IOPMCompletionQueue * gIOPMCompletionQueue   = NULL;
+static IOPMRequest *         gIOPMRequest           = NULL;
+static IOService *           gIOPMRootNode          = NULL;
+static IOPlatformExpert *    gPlatform              = NULL;
+
+
+const OSSymbol *             gIOPMPowerClientDevice     = NULL;
+const OSSymbol *             gIOPMPowerClientDriver     = NULL;
+const OSSymbol *             gIOPMPowerClientChildProxy = NULL;
+const OSSymbol *             gIOPMPowerClientChildren   = NULL;
+const OSSymbol *             gIOPMPowerClientRootDomain = NULL;
+
+static const OSSymbol *      gIOPMPowerClientAdvisoryTickle = NULL;
 static bool                  gIOPMAdvisoryTickleEnabled = true;
 static thread_t              gIOPMWatchDogThread        = NULL;
 uint32_t                     gCanSleepTimeout           = 0;
@@ -367,19 +355,19 @@ IOService::PMinit( void )
                                if (gIOPMWorkLoop->addEventSource(gIOPMRequestQueue) !=
                                    kIOReturnSuccess) {
                                        gIOPMRequestQueue->release();
-                                       gIOPMRequestQueue = 0;
+                                       gIOPMRequestQueue = NULL;
                                }
 
                                if (gIOPMWorkLoop->addEventSource(gIOPMReplyQueue) !=
                                    kIOReturnSuccess) {
                                        gIOPMReplyQueue->release();
-                                       gIOPMReplyQueue = 0;
+                                       gIOPMReplyQueue = NULL;
                                }
 
                                if (gIOPMWorkLoop->addEventSource(gIOPMWorkQueue) !=
                                    kIOReturnSuccess) {
                                        gIOPMWorkQueue->release();
-                                       gIOPMWorkQueue = 0;
+                                       gIOPMWorkQueue = NULL;
                                }
 
                                // Must be added after the work queue, which pushes request
@@ -387,7 +375,7 @@ IOService::PMinit( void )
                                if (gIOPMWorkLoop->addEventSource(gIOPMCompletionQueue) !=
                                    kIOReturnSuccess) {
                                        gIOPMCompletionQueue->release();
-                                       gIOPMCompletionQueue = 0;
+                                       gIOPMCompletionQueue = NULL;
                                }
 
                                gIOPMPowerClientDevice =
@@ -407,9 +395,6 @@ IOService::PMinit( void )
 
                                gIOPMPowerClientRootDomain =
                                    OSSymbol::withCStringNoCopy( "RootDomainPower" );
-
-                               gIOSpinDumpKextName[0] = '\0';
-                               gIOSpinDumpDelayType[0] = '\0';
                        }
 
                        if (gIOPMRequestQueue && gIOPMReplyQueue && gIOPMCompletionQueue) {
@@ -483,10 +468,6 @@ IOService::PMinit( void )
                fDriverCallEntry = thread_call_allocate(
                        (thread_call_func_t) &IOService::pmDriverCallout, this);
                assert(fDriverCallEntry);
-               if (kIOKextSpinDump & gIOKitDebug) {
-                       fSpinDumpTimer = thread_call_allocate(
-                               &IOService::spindump_timer_expired, (thread_call_param_t)this);
-               }
 
                // Check for powerChangeDone override.
                if (OSMemberFunctionCast(void (*)(void),
@@ -523,7 +504,7 @@ void
 IOService::PMfree( void )
 {
        initialized = false;
-       pm_vars = 0;
+       pm_vars = NULL;
 
        if (pwrMgt) {
                assert(fMachineState == kIOPM_Finished);
@@ -568,11 +549,6 @@ IOService::PMfree( void )
                        thread_call_free(fDriverCallEntry);
                        fDriverCallEntry = NULL;
                }
-               if (fSpinDumpTimer) {
-                       thread_call_cancel(fSpinDumpTimer);
-                       thread_call_free(fSpinDumpTimer);
-                       fSpinDumpTimer = NULL;
-               }
                if (fPMLock) {
                        IOLockFree(fPMLock);
                        fPMLock = NULL;
@@ -587,7 +563,7 @@ IOService::PMfree( void )
                }
                if (fDriverCallParamSlots && fDriverCallParamPtr) {
                        IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots);
-                       fDriverCallParamPtr = 0;
+                       fDriverCallParamPtr = NULL;
                        fDriverCallParamSlots = 0;
                }
                if (fResponseArray) {
@@ -605,18 +581,18 @@ IOService::PMfree( void )
                }
                if (fPowerClients) {
                        fPowerClients->release();
-                       fPowerClients = 0;
+                       fPowerClients = NULL;
                }
 
 #if PM_VARS_SUPPORT
                if (fPMVars) {
                        fPMVars->release();
-                       fPMVars = 0;
+                       fPMVars = NULL;
                }
 #endif
 
                pwrMgt->release();
-               pwrMgt = 0;
+               pwrMgt = NULL;
        }
 }
 
@@ -645,7 +621,7 @@ IOService::joinPMtree( IOService * driver )
        IOPlatformExpert *  platform;
 
        platform = getPlatform();
-       assert(platform != 0);
+       assert(platform != NULL);
        platform->PMRegisterDevice(this, driver);
 }
 
@@ -806,8 +782,8 @@ IOService::handlePMstop( IOPMRequest * request )
 IOReturn
 IOService::addPowerChild( IOService * child )
 {
-       IOPowerConnection * connection  = 0;
-       IOPMRequest *       requests[3] = {0, 0, 0};
+       IOPowerConnection * connection  = NULL;
+       IOPMRequest *       requests[3] = {NULL, NULL, NULL};
        OSIterator *        iter;
        bool                ok = true;
 
@@ -839,7 +815,7 @@ IOService::addPowerChild( IOService * child )
                        iter->release();
                }
                if (!ok) {
-                       PM_LOG("%s: %s (%p) is already a child\n",
+                       PM_LOG2("%s: %s (%p) is already a child\n",
                            getName(), child->getName(), OBFUSCATE(child));
                        break;
                }
@@ -1125,7 +1101,7 @@ IOService::registerPowerDriver(
        unsigned long       numberOfStates )
 {
        IOPMRequest *       request;
-       IOPMPSEntry *       powerStatesCopy = 0;
+       IOPMPSEntry *       powerStatesCopy = NULL;
        IOPMPowerStateIndex stateOrder;
        IOReturn            error = kIOReturnSuccess;
 
@@ -1246,6 +1222,40 @@ IOService::handleRegisterPowerDriver( IOPMRequest * request )
                lowestPowerState   = fPowerStates[0].stateOrderToIndex;
                fHighestPowerState = fPowerStates[numberOfStates - 1].stateOrderToIndex;
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+               {
+                       uint32_t        aotFlags;
+                       IOService *     service;
+                       OSObject *      object;
+                       OSData *        data;
+
+                       // Disallow kIOPMAOTPower states unless device tree enabled
+
+                       aotFlags = 0;
+                       service  = this;
+                       while (service && !service->inPlane(gIODTPlane)) {
+                               service = service->getProvider();
+                       }
+                       if (service) {
+                               object = service->copyProperty(kIOPMAOTPowerKey, gIODTPlane);
+                               data = OSDynamicCast(OSData, object);
+                               if (data && (data->getLength() >= sizeof(uint32_t))) {
+                                       aotFlags = ((uint32_t *)data->getBytesNoCopy())[0];
+                               }
+                               OSSafeReleaseNULL(object);
+                       }
+                       if (!aotFlags) {
+                               for (i = 0; i < numberOfStates; i++) {
+                                       if (kIOPMAOTPower & fPowerStates[i].inputPowerFlags) {
+                                               fPowerStates[i].inputPowerFlags  = 0xFFFFFFFF;
+                                               fPowerStates[i].capabilityFlags  = 0;
+                                               fPowerStates[i].outputPowerFlags = 0;
+                                       }
+                               }
+                       }
+               }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
                // OR in all the output power flags
                fMergedOutputPowerFlags = 0;
                fDeviceUsablePowerState = lowestPowerState;
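When the device tree does not opt in, the loop above does not delete the AOT states, it makes them unreachable: capability and output flags are cleared, and an inputPowerFlags of 0xFFFFFFFF can never be satisfied since, on my reading of the power-state selection rule, a state is eligible only when the parent domain supplies every input flag it requires, and no domain asserts all bits. Sketched:

    // Eligibility test (paraphrased): the domain must provide every
    // input flag the state demands.
    static bool
    stateEligible(IOPMPowerFlags domainFlags, IOPMPowerFlags required)
    {
        return (domainFlags & required) == required;
    }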
@@ -1434,7 +1444,7 @@ IOService::handleInterestChanged( IOPMRequest * request )
                        fInsertInterestSet->removeObject(driver);
                }
                fInsertInterestSet->release();
-               fInsertInterestSet = 0;
+               fInsertInterestSet = NULL;
        }
 
        if (fRemoveInterestSet) {
@@ -1451,7 +1461,7 @@ IOService::handleInterestChanged( IOPMRequest * request )
                        fRemoveInterestSet->removeObject(driver);
                }
                fRemoveInterestSet->release();
-               fRemoveInterestSet = 0;
+               fRemoveInterestSet = NULL;
        }
 
        PM_UNLOCK();
@@ -1642,7 +1652,7 @@ IOService::adjustPowerState( uint32_t clamp )
                        /* flags        */ changeFlags,
                        /* power state  */ fDesiredPowerState,
                        /* domain flags */ 0,
-                       /* connection   */ 0,
+                       /* connection   */ NULL,
                        /* parent flags */ 0);
        }
 }
@@ -1656,7 +1666,7 @@ IOService::synchronizePowerTree(
        IOOptionBits    options,
        IOService *     notifyRoot )
 {
-       IOPMRequest *   request_c = 0;
+       IOPMRequest *   request_c = NULL;
        IOPMRequest *   request_s;
 
        if (this != getPMRootDomain()) {
@@ -1666,7 +1676,7 @@ IOService::synchronizePowerTree(
                return kIOPMNotYetInitialized;
        }
 
-       OUR_PMLog(kPMLogCSynchronizePowerTree, options, (notifyRoot != 0));
+       OUR_PMLog(kPMLogCSynchronizePowerTree, options, (notifyRoot != NULL));
 
        if (notifyRoot) {
                IOPMRequest * nr;
@@ -1727,7 +1737,7 @@ IOService::handleSynchronizePowerTree( IOPMRequest * request )
                        (options & kIOPMSyncNoChildNotify),
                        /* power state  */ fCurrentPowerState,
                        /* domain flags */ 0,
-                       /* connection   */ 0,
+                       /* connection   */ NULL,
                        /* parent flags */ 0);
        }
 }
@@ -1965,6 +1975,12 @@ IOService::handlePowerDomainDidChangeTo( IOPMRequest * request )
                myChangeFlags = kIOPMParentInitiated | kIOPMDomainDidChange |
                    (parentChangeFlags & kIOPMRootBroadcastFlags);
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+               if (kIOPMAOTPower & fPowerStates[maxPowerState].inputPowerFlags) {
+                       IOLog("aotPS %s0x%qx[%ld]\n", getName(), getRegistryEntryID(), maxPowerState);
+               }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
                result = startPowerChange(
                        /* flags        */ myChangeFlags,
                        /* power state  */ maxPowerState,
@@ -2339,7 +2355,7 @@ IOService::changePowerStateWithOverrideTo( IOPMPowerStateIndex ordinal,
        request->fRequestTag = tag;
        request->fArg0 = (void *) ordinal;
        request->fArg1 = (void *) gIOPMPowerClientDevice;
-       request->fArg2 = 0;
+       request->fArg2 = NULL;
 #if NOT_READY
        if (action) {
                request->installCompletionAction( action, target, param );
@@ -2441,7 +2457,7 @@ IOService::requestPowerState(
        client->retain();
        request->fArg0 = (void *)(uintptr_t) state;
        request->fArg1 = (void *)            client;
-       request->fArg2 = 0;
+       request->fArg2 = NULL;
 #if NOT_READY
        if (action) {
                request->installCompletionAction( action, target, param );
@@ -2731,6 +2747,15 @@ IOService::computeDesiredState( unsigned long localClamp, bool computeOnly )
                newPowerState = fHighestPowerState;
        }
 
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+       if (getPMRootDomain()->isAOTMode()) {
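+               // A driver that newly asserts kIOPMPreventIdleSleep while in AOT
+               // mode is reported as a system wake event (AOT exit).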
+               if ((kIOPMPreventIdleSleep & fPowerStates[newPowerState].capabilityFlags)
+                   && !(kIOPMPreventIdleSleep & fPowerStates[fDesiredPowerState].capabilityFlags)) {
+                       getPMRootDomain()->claimSystemWakeEvent(this, kIOPMWakeEventAOTExit, getName(), NULL);
+               }
+       }
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+
        fDesiredPowerState = newPowerState;
 
        PM_LOG1("  temp %u, clamp %u, current %u, new %u\n",
@@ -3432,6 +3457,8 @@ IOService::startPowerChange(
        IOPowerConnection *     parentConnection,
        IOPMPowerFlags          parentFlags )
 {
+       uint32_t savedPMActionsParam;
+
        PM_ASSERT_IN_GATE();
        assert( fMachineState == kIOPM_Finished );
        assert( powerState < fNumberOfPowerStates );
@@ -3441,8 +3468,25 @@ IOService::startPowerChange(
        }
 
        fIsPreChange = true;
+       savedPMActionsParam = fPMActions.parameter;
        PM_ACTION_2(actionPowerChangeOverride, &powerState, &changeFlags);
 
+       // rdar://problem/55040032
+       // Schedule a power adjustment after removing the power clamp
+       // to inform our power parent(s) about our latest desired domain
+       // power state. For a self-initiated change, let OurChangeStart()
+       // automatically request parent power when necessary.
+       if (!fAdjustPowerScheduled &&
+           ((changeFlags & kIOPMSelfInitiated) == 0) &&
+           ((fPMActions.parameter & kPMActionsFlagLimitPower) == 0) &&
+           ((savedPMActionsParam  & kPMActionsFlagLimitPower) != 0)) {
+               IOPMRequest * request = acquirePMRequest(this, kIOPMRequestTypeAdjustPowerState);
+               if (request) {
+                       submitPMRequest(request);
+                       fAdjustPowerScheduled = true;
+               }
+       }
+
        if (changeFlags & kIOPMExpireIdleTimer) {
                // Root domain requested removal of tickle influence
                if (StateOrder(fDeviceDesire) > StateOrder(powerState)) {
@@ -3523,7 +3567,7 @@ IOService::notifyInterestedDrivers( void )
                if (fDriverCallParamSlots) {
                        assert(fDriverCallParamPtr);
                        IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots);
-                       fDriverCallParamPtr = 0;
+                       fDriverCallParamPtr = NULL;
                        fDriverCallParamSlots = 0;
                }
 
@@ -3703,7 +3747,7 @@ IOService::notifyChildren( void )
        OSIterator *        iter;
        OSObject *          next;
        IOPowerConnection * connection;
-       OSArray *           children = 0;
+       OSArray *           children = NULL;
        IOPMrootDomain *    rootDomain;
        bool                delayNotify = false;
 
@@ -3763,10 +3807,10 @@ IOService::notifyChildren( void )
 
        if (children && (children->getCount() == 0)) {
                children->release();
-               children = 0;
+               children = NULL;
        }
        if (children) {
-               assert(fNotifyChildArray == 0);
+               assert(fNotifyChildArray == NULL);
                fNotifyChildArray = children;
                MS_PUSH(fMachineState);
 
@@ -3818,7 +3862,7 @@ IOService::notifyChildrenOrdered( void )
                fNotifyChildArray->removeObject(0);
        } else {
                fNotifyChildArray->release();
-               fNotifyChildArray = 0;
+               fNotifyChildArray = NULL;
 
                MS_POP(); // pushed by notifyChildren()
        }
@@ -3853,7 +3897,7 @@ IOService::notifyChildrenDelayed( void )
 
        PM_LOG2("%s: notified delayed children\n", getName());
        fNotifyChildArray->release();
-       fNotifyChildArray = 0;
+       fNotifyChildArray = NULL;
 
        MS_POP(); // pushed by notifyChildren()
 }
@@ -3954,14 +3998,11 @@ IOService::driverSetPowerState( void )
        param = (DriverCallParam *) fDriverCallParamPtr;
        powerState = fHeadNotePowerState;
 
-       callEntry.callMethod = OSMemberFunctionCast(const void *, fControllingDriver, &IOService::setPowerState);
-       if (assertPMDriverCall(&callEntry)) {
+       if (assertPMDriverCall(&callEntry, kIOPMDriverCallMethodSetPowerState)) {
                OUR_PMLogFuncStart(kPMLogProgramHardware, (uintptr_t) this, powerState);
-               start_spindump_timer("SetState");
                clock_get_uptime(&fDriverCallStartTime);
                result = fControllingDriver->setPowerState( powerState, this );
                clock_get_uptime(&end);
-               stop_spindump_timer();
                OUR_PMLogFuncEnd(kPMLogProgramHardware, (uintptr_t) this, (UInt32) result);
 
                deassertPMDriverCall(&callEntry);
@@ -4017,6 +4058,8 @@ IOService::driverInformPowerChange( void )
        AbsoluteTime        end;
        IOReturn            result;
        IOItemCount         count;
+       IOOptionBits        callMethod = (fDriverCallReason == kDriverCallInformPreChange) ?
+           kIOPMDriverCallMethodWillChange : kIOPMDriverCallMethodDidChange;
 
        assert( fDriverCallBusy );
        assert( fDriverCallParamPtr );
@@ -4032,27 +4075,18 @@ IOService::driverInformPowerChange( void )
                informee = (IOPMinformee *) param->Target;
                driver   = informee->whatObject;
 
-               if (fDriverCallReason == kDriverCallInformPreChange) {
-                       callEntry.callMethod = OSMemberFunctionCast(const void *, driver, &IOService::powerStateWillChangeTo);
-               } else {
-                       callEntry.callMethod = OSMemberFunctionCast(const void *, driver, &IOService::powerStateDidChangeTo);
-               }
-               if (assertPMDriverCall(&callEntry, 0, informee)) {
+               if (assertPMDriverCall(&callEntry, callMethod, informee)) {
                        if (fDriverCallReason == kDriverCallInformPreChange) {
                                OUR_PMLogFuncStart(kPMLogInformDriverPreChange, (uintptr_t) this, powerState);
-                               start_spindump_timer("WillChange");
                                clock_get_uptime(&informee->startTime);
                                result = driver->powerStateWillChangeTo(powerFlags, powerState, this);
                                clock_get_uptime(&end);
-                               stop_spindump_timer();
                                OUR_PMLogFuncEnd(kPMLogInformDriverPreChange, (uintptr_t) this, result);
                        } else {
                                OUR_PMLogFuncStart(kPMLogInformDriverPostChange, (uintptr_t) this, powerState);
-                               start_spindump_timer("DidChange");
                                clock_get_uptime(&informee->startTime);
                                result = driver->powerStateDidChangeTo(powerFlags, powerState, this);
                                clock_get_uptime(&end);
-                               stop_spindump_timer();
                                OUR_PMLogFuncEnd(kPMLogInformDriverPostChange, (uintptr_t) this, result);
                        }
 
@@ -4340,7 +4374,7 @@ IOService::all_done( void )
 
                        // inform subclass policy-maker
                        if (fPCDFunctionOverride && fParentsKnowState &&
-                           assertPMDriverCall(&callEntry, kIOPMADC_NoInactiveCheck)) {
+                           assertPMDriverCall(&callEntry, kIOPMDriverCallMethodChangeDone, NULL, kIOPMDriverCallNoInactiveCheck)) {
                                powerChangeDone(prevPowerState);
                                deassertPMDriverCall(&callEntry);
                        }
@@ -4392,7 +4426,7 @@ IOService::all_done( void )
 
                        // inform subclass policy-maker
                        if (fPCDFunctionOverride && fParentsKnowState &&
-                           assertPMDriverCall(&callEntry, kIOPMADC_NoInactiveCheck)) {
+                           assertPMDriverCall(&callEntry, kIOPMDriverCallMethodChangeDone, NULL, kIOPMDriverCallNoInactiveCheck)) {
                                powerChangeDone(prevPowerState);
                                deassertPMDriverCall(&callEntry);
                        }
@@ -4509,7 +4543,7 @@ requestDomainPowerApplier(
        IOService *                     parent;
        IOPMRequestDomainPowerContext * context;
 
-       if ((connection = OSDynamicCast(IOPowerConnection, entry)) == 0) {
+       if ((connection = OSDynamicCast(IOPowerConnection, entry)) == NULL) {
                return;
        }
        parent = (IOService *) connection->copyParentEntry(gIOPowerPlane);
@@ -5203,6 +5237,9 @@ IOService::startSettleTimer( void )
 //*********************************************************************************
 
 #ifndef __LP64__
+#if MACH_ASSERT
+__dead2
+#endif
 void
 IOService::ack_timer_ticked( void )
 {
@@ -5232,14 +5269,40 @@ IOService::ackTimerTick( void )
                                PM_ERROR("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms\n",
                                    fName, OBFUSCATE(this), fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec));
 
-#if DEBUG && CONFIG_EMBEDDED
-                               panic("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms",
-                                   fName, this, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec));
+#if DEBUG || DEVELOPMENT || CONFIG_EMBEDDED
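+                               // Panic on timeout by default; the setpowerstate_panic
+                               // boot-arg can disable it (0 means log only).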
+                               uint32_t panic_allowed = -1;
+                               PE_parse_boot_argn("setpowerstate_panic", &panic_allowed, sizeof(panic_allowed));
+                               if (panic_allowed != 0) {
+                                       // rdar://problem/48743340 - excluding AppleSEPManager from panic
+                                       const char *whitelist = "AppleSEPManager";
+                                       if (strncmp(fName, whitelist, strlen(whitelist))) {
+                                               panic("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms",
+                                                   fName, this, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec));
+                                       }
+                               } else {
+                                       PM_ERROR("setPowerState panic disabled by setpowerstate_panic boot-arg\n");
+                               }
 #else
                                if (gIOKitDebug & kIOLogDebugPower) {
                                        panic("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms",
                                            fName, this, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec));
                                } else {
+                                       // panic for first party kexts
+                                       const void *function_addr = NULL;
+                                       OSKext *kext = NULL;
+                                       function_addr = OSMemberFunctionCast(const void *, fControllingDriver, &IOService::setPowerState);
+                                       kext = OSKext::lookupKextWithAddress((vm_address_t)function_addr);
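+                                       // Attribute the timeout to the kext implementing
+                                       // setPowerState; only first-party drivers panic.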
+                                       if (kext) {
+                                               const char *bundleID = kext->getIdentifierCString();
+                                               const char *apple_prefix = "com.apple";
+                                               const char *kernel_prefix = "__kernel__";
+                                               if (strncmp(bundleID, apple_prefix, strlen(apple_prefix)) == 0 || strncmp(bundleID, kernel_prefix, strlen(kernel_prefix)) == 0) {
+                                                       // first party client
+                                                       panic("%s::setPowerState(%p : %p, %lu -> %lu) timed out after %d ms",
+                                                           fName, this, function_addr, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec));
+                                               }
+                                               kext->release();
+                                       }
                                        // Unblock state machine and pretend driver has acked.
                                        done = true;
                                }
@@ -5283,6 +5346,7 @@ IOService::ackTimerTick( void )
                        if (fHeadNotePendingAcks == 0) {
                                // yes, we can continue
                                done = true;
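+                               // All acks are in; refresh the PM watchdog state for this service.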
+                               getPMRootDomain()->reset_watchdog_timer(this, 0);
                        } else {
                                // no, set timer again
                                start_ack_timer();
@@ -5414,7 +5478,6 @@ IOService::reset_watchdog_timer(IOService *blockedObject, int pendingResponseTim
                        goto exit;
                }
 
-
                for (i = 0; i < fBlockedArray->getCount(); i++) {
                        obj = OSDynamicCast(IOService, fBlockedArray->getObject(i));
                        if (obj && (obj->fPendingResponseDeadline < deadline)) {
@@ -5459,9 +5522,9 @@ IOService::watchdog_timer_expired( thread_call_param_t arg0, thread_call_param_t
 
        gIOPMWatchDogThread = current_thread();
        getPMRootDomain()->sleepWakeDebugTrig(true);
-       gIOPMWatchDogThread = 0;
+       gIOPMWatchDogThread = NULL;
        thread_call_free(me->fWatchdogTimer);
-       me->fWatchdogTimer = 0;
+       me->fWatchdogTimer = NULL;
 
        return;
 }
@@ -5558,108 +5621,6 @@ IOService::ack_timer_expired( thread_call_param_t arg0, thread_call_param_t arg1
        me->release();
 }
 
-//*********************************************************************************
-// [private] start_spindump_timer
-//*********************************************************************************
-
-void
-IOService::start_spindump_timer( const char * delay_type )
-{
-       AbsoluteTime    deadline;
-       boolean_t       pending;
-
-       if (!fSpinDumpTimer || !(kIOKextSpinDump & gIOKitDebug)) {
-               return;
-       }
-
-       if (gIOSpinDumpKextName[0] == '\0' &&
-           !(PE_parse_boot_argn("swd_kext_name", &gIOSpinDumpKextName,
-           sizeof(gIOSpinDumpKextName)))) {
-               return;
-       }
-
-       if (strncmp(gIOSpinDumpKextName, fName, sizeof(gIOSpinDumpKextName)) != 0) {
-               return;
-       }
-
-       if (gIOSpinDumpDelayType[0] == '\0' &&
-           !(PE_parse_boot_argn("swd_delay_type", &gIOSpinDumpDelayType,
-           sizeof(gIOSpinDumpDelayType)))) {
-               strncpy(gIOSpinDumpDelayType, "SetState", sizeof(gIOSpinDumpDelayType));
-       }
-
-       if (strncmp(delay_type, gIOSpinDumpDelayType, sizeof(gIOSpinDumpDelayType)) != 0) {
-               return;
-       }
-
-       if (gIOSpinDumpDelayDuration == 0 &&
-           !(PE_parse_boot_argn("swd_delay_duration", &gIOSpinDumpDelayDuration,
-           sizeof(gIOSpinDumpDelayDuration)))) {
-               gIOSpinDumpDelayDuration = 300;
-       }
-
-       clock_interval_to_deadline(gIOSpinDumpDelayDuration, kMillisecondScale, &deadline);
-
-       retain();
-       pending = thread_call_enter_delayed(fSpinDumpTimer, deadline);
-       if (pending) {
-               release();
-       }
-}
-
-//*********************************************************************************
-// [private] stop_spindump_timer
-//*********************************************************************************
-
-void
-IOService::stop_spindump_timer( void )
-{
-       boolean_t   pending;
-
-       if (!fSpinDumpTimer || !(kIOKextSpinDump & gIOKitDebug)) {
-               return;
-       }
-
-       pending = thread_call_cancel(fSpinDumpTimer);
-       if (pending) {
-               release();
-       }
-}
-
-
-//*********************************************************************************
-// [static] actionSpinDumpTimerExpired
-//
-// Inside PM work loop's gate.
-//*********************************************************************************
-
-IOReturn
-IOService::actionSpinDumpTimerExpired(
-       OSObject * target,
-       void * arg0, void * arg1,
-       void * arg2, void * arg3 )
-{
-       getPMRootDomain()->takeStackshot(false, false, true);
-
-       return kIOReturnSuccess;
-}
-
-//*********************************************************************************
-// spindump_timer_expired
-//
-// Thread call function. Holds a retain while the callout is in flight.
-//*********************************************************************************
-
-void
-IOService::spindump_timer_expired( thread_call_param_t arg0, thread_call_param_t arg1 )
-{
-       IOService * me = (IOService *) arg0;
-
-       if (gIOPMWorkLoop) {
-               gIOPMWorkLoop->runAction(&actionSpinDumpTimerExpired, me);
-       }
-       me->release();
-}
 
 // MARK: -
 // MARK: Client Messaging
@@ -5887,10 +5848,12 @@ IOService::tellClientsWithResponse( int messageType )
        }
 
        context.responseArray    = fResponseArray;
-       context.notifyClients    = 0;
+       context.notifyClients    = NULL;
        context.serialNumber     = fSerialNumber;
        context.messageType      = messageType;
        context.notifyType       = fOutOfBandParameter;
+       context.skippedInDark    = 0;
+       context.notSkippedInDark = 0;
        context.isPreChange      = fIsPreChange;
        context.enableTracing    = false;
        context.us               = this;
@@ -5902,7 +5865,7 @@ IOService::tellClientsWithResponse( int messageType )
            OSMemberFunctionCast(
                IOPMMessageFilter,
                this,
-               &IOPMrootDomain::systemMessageFilter) : 0;
+               &IOPMrootDomain::systemMessageFilter) : NULL;
 
        switch (fOutOfBandParameter) {
        case kNotifyApps:
@@ -5969,6 +5932,12 @@ IOService::tellClientsWithResponse( int messageType )
        }
        fNotifyClientArray = context.notifyClients;
 
+       if (context.skippedInDark) {
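+               // Report how many suspended clients were skipped during this notification pass.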
+               IOLog("tellClientsWithResponse(%s, %d) %d of %d skipped in dark\n",
+                   getIOMessageString(messageType), fOutOfBandParameter,
+                   context.skippedInDark, context.skippedInDark + context.notSkippedInDark);
+       }
+
        // do we have to wait for somebody?
        if (!checkForDone()) {
                OUR_PMLog(kPMLogStartAckTimer, context.maxTimeRequested, 0);
@@ -6027,10 +5996,16 @@ IOService::pmTellAppWithResponse( OSObject * object, void * arg )
 
                        if (proc) {
                                proc_suspended = get_task_pidsuspended((task_t) proc->task);
-                               proc_rele(proc);
-
                                if (proc_suspended) {
                                        logClientIDForNotification(object, context, "PMTellAppWithResponse - Suspended");
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+                               } else if (getPMRootDomain()->isAOTMode() && get_task_suspended((task_t) proc->task)) {
+                                       proc_suspended = true;
+                                       context->skippedInDark++;
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+                               }
+                               proc_rele(proc);
+                               if (proc_suspended) {
                                        return;
                                }
                        }
@@ -6038,16 +6013,17 @@ IOService::pmTellAppWithResponse( OSObject * object, void * arg )
        }
 
        if (context->messageFilter &&
-           !context->messageFilter(context->us, object, context, 0, &waitForReply)) {
+           !context->messageFilter(context->us, object, context, NULL, &waitForReply)) {
                if (kIOLogDebugPower & gIOKitDebug) {
                        logClientIDForNotification(object, context, "DROP App");
                }
                return;
        }
+       context->notSkippedInDark++;
 
        // Create client array (for tracking purposes) only if the service
        // has app clients. Usually only root domain does.
-       if (0 == context->notifyClients) {
+       if (NULL == context->notifyClients) {
                context->notifyClients = OSArray::withCapacity( 32 );
        }
 
@@ -6104,7 +6080,7 @@ IOService::pmTellClientWithResponse( OSObject * object, void * arg )
        uint64_t                        nsec;
 
        if (context->messageFilter &&
-           !context->messageFilter(context->us, object, context, 0, 0)) {
+           !context->messageFilter(context->us, object, context, NULL, NULL)) {
                if ((kIOLogDebugPower & gIOKitDebug) &&
                    (OSDynamicCast(_IOServiceInterestNotifier, object))) {
                        _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object;
@@ -6138,7 +6114,7 @@ IOService::pmTellClientWithResponse( OSObject * object, void * arg )
                    OBFUSCATE(object), OBFUSCATE(notifier->handler));
        }
 
-       if (0 == context->notifyClients) {
+       if (NULL == context->notifyClients) {
                context->notifyClients = OSArray::withCapacity( 32 );
        }
 
@@ -6147,7 +6123,7 @@ IOService::pmTellClientWithResponse( OSObject * object, void * arg )
        notify.stateNumber = context->stateNumber;
        notify.stateFlags  = context->stateFlags;
 
-       if (context->enableTracing && (notifier != 0)) {
+       if (context->enableTracing && (notifier != NULL)) {
                getPMRootDomain()->traceDetail(notifier, true);
        }
 
@@ -6236,9 +6212,40 @@ IOService::pmTellCapabilityAppWithResponse( OSObject * object, void * arg )
                return;
        }
 
+       if (context->us == getPMRootDomain() &&
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+           getPMRootDomain()->isAOTMode()
+#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+           false
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+           ) {
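+               // In AOT mode, suspended tasks cannot respond; skip them and
+               // count them for the skipped-in-dark summary.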
+               OSNumber                *clientID = NULL;
+               boolean_t               proc_suspended = FALSE;
+               proc_t                  proc = NULL;
+               if ((clientID = copyClientIDForNotification(object, context))) {
+                       uint32_t clientPID = clientID->unsigned32BitValue();
+                       clientID->release();
+                       proc = proc_find(clientPID);
+                       if (proc) {
+                               proc_suspended = get_task_pidsuspended((task_t) proc->task);
+                               if (proc_suspended) {
+                                       logClientIDForNotification(object, context, "PMTellCapabilityAppWithResponse - Suspended");
+                               } else if (get_task_suspended((task_t) proc->task)) {
+                                       proc_suspended = true;
+                                       context->skippedInDark++;
+                               }
+                               proc_rele(proc);
+                               if (proc_suspended) {
+                                       return;
+                               }
+                       }
+               }
+       }
+       context->notSkippedInDark++;
+
        // Create client array (for tracking purposes) only if the service
        // has app clients. Usually only root domain does.
-       if (0 == context->notifyClients) {
+       if (NULL == context->notifyClients) {
                context->notifyClients = OSArray::withCapacity( 32 );
        }
 
@@ -6316,7 +6323,7 @@ IOService::pmTellCapabilityClientWithResponse(
 
        memset(&msgArg, 0, sizeof(msgArg));
        if (context->messageFilter &&
-           !context->messageFilter(context->us, object, context, &msgArg, 0)) {
+           !context->messageFilter(context->us, object, context, &msgArg, NULL)) {
                if ((kIOLogDebugPower & gIOKitDebug) &&
                    (OSDynamicCast(_IOServiceInterestNotifier, object))) {
                        _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object;
@@ -6328,7 +6335,7 @@ IOService::pmTellCapabilityClientWithResponse(
                return;
        }
 
-       if (0 == context->notifyClients) {
+       if (NULL == context->notifyClients) {
                context->notifyClients = OSArray::withCapacity( 32 );
        }
        notifier = OSDynamicCast(_IOServiceInterestNotifier, object);
@@ -6356,7 +6363,7 @@ IOService::pmTellCapabilityClientWithResponse(
        msgArg.notifyRef = msgRef;
        msgArg.maxWaitForReply = 0;
 
-       if (context->enableTracing && (notifier != 0)) {
+       if (context->enableTracing && (notifier != NULL)) {
                getPMRootDomain()->traceDetail(notifier, true);
        }
 
@@ -6475,7 +6482,7 @@ IOService::tellClients( int messageType )
            OSMemberFunctionCast(
                IOPMMessageFilter,
                this,
-               &IOPMrootDomain::systemMessageFilter) : 0;
+               &IOPMrootDomain::systemMessageFilter) : NULL;
 
        context.notifyType    = kNotifyPriority;
        applyToInterested( gIOPriorityPowerStateInterest,
@@ -6502,7 +6509,7 @@ tellKernelClientApplier( OSObject * object, void * arg )
        IOPMInterestContext *           context = (IOPMInterestContext *) arg;
 
        if (context->messageFilter &&
-           !context->messageFilter(context->us, object, context, 0, 0)) {
+           !context->messageFilter(context->us, object, context, NULL, NULL)) {
                if ((kIOLogDebugPower & gIOKitDebug) &&
                    (OSDynamicCast(_IOServiceInterestNotifier, object))) {
                        _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object;
@@ -6514,7 +6521,7 @@ tellKernelClientApplier( OSObject * object, void * arg )
                return;
        }
 
-       notify.powerRef     = (void *) 0;
+       notify.powerRef     = (void *) NULL;
        notify.returnValue  = 0;
        notify.stateNumber  = context->stateNumber;
        notify.stateFlags   = context->stateFlags;
@@ -6596,10 +6603,16 @@ tellAppClientApplier( OSObject * object, void * arg )
 
                        if (proc) {
                                proc_suspended = get_task_pidsuspended((task_t) proc->task);
-                               proc_rele(proc);
-
                                if (proc_suspended) {
                                        logClientIDForNotification(object, context, "tellAppClientApplier - Suspended");
+#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146))
+                               } else if (IOService::getPMRootDomain()->isAOTMode() && get_task_suspended((task_t) proc->task)) {
+                                       proc_suspended = true;
+                                       context->skippedInDark++;
+#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */
+                               }
+                               proc_rele(proc);
+                               if (proc_suspended) {
                                        return;
                                }
                        }
@@ -6607,18 +6620,19 @@ tellAppClientApplier( OSObject * object, void * arg )
        }
 
        if (context->messageFilter &&
-           !context->messageFilter(context->us, object, context, 0, 0)) {
+           !context->messageFilter(context->us, object, context, NULL, NULL)) {
                if (kIOLogDebugPower & gIOKitDebug) {
                        logClientIDForNotification(object, context, "DROP App");
                }
                return;
        }
+       context->notSkippedInDark++;
 
        if (kIOLogDebugPower & gIOKitDebug) {
                logClientIDForNotification(object, context, "MESG App");
        }
 
-       context->us->messageClient(context->messageType, object, 0);
+       context->us->messageClient(context->messageType, object, NULL);
 }
 
 //*********************************************************************************
@@ -6659,7 +6673,7 @@ IOService::responseValid( uint32_t refcon, int pid )
        UInt16          serialComponent;
        UInt16          ordinalComponent;
        OSObject *      theFlag;
-       OSObject        *object = 0;
+       OSObject        *object = NULL;
 
        serialComponent  = (refcon >> 16) & 0xFFFF;
        ordinalComponent = (refcon & 0xFFFF);
@@ -6674,7 +6688,7 @@ IOService::responseValid( uint32_t refcon, int pid )
 
        theFlag = fResponseArray->getObject(ordinalComponent);
 
-       if (theFlag == 0) {
+       if (theFlag == NULL) {
                return false;
        }
 
@@ -6727,7 +6741,7 @@ IOService::responseValid( uint32_t refcon, int pid )
        } else if (object) {
                getPMRootDomain()->pmStatsRecordApplicationResponse(
                        gIOPMStatsResponsePrompt,
-                       0, 0, 0, pid, object);
+                       NULL, 0, 0, pid, object);
        }
 
        if (kOSBooleanFalse == theFlag) {
@@ -6762,7 +6776,7 @@ IOService::allowPowerChange( unsigned long refcon )
 
        request->fArg0 = (void *)            refcon;
        request->fArg1 = (void *)(uintptr_t) proc_selfpid();
-       request->fArg2 = (void *)            0;
+       request->fArg2 = (void *)            NULL;
        submitPMRequest( request );
 
        return kIOReturnSuccess;
@@ -6814,6 +6828,23 @@ IOService::cancelPowerChange( unsigned long refcon )
        return kIOReturnSuccess;
 }
 
+//*********************************************************************************
+// cancelIdlePowerDown
+//
+// Internal method to trigger an idle cancel or revert
+//*********************************************************************************
+
+void
+IOService::cancelIdlePowerDown( IOService * service )
+{
+       IOPMRequest * request;
+
+       request = acquirePMRequest(service, kIOPMRequestTypeIdleCancel);
+       if (request) {
+               submitPMRequest(request);
+       }
+}
+
 #ifndef __LP64__
 IOReturn
 IOService::serializedCancelPowerChange2( unsigned long refcon )
@@ -7375,7 +7406,7 @@ IOService::actionPMCompletionQueue(
        IOPMRequest *         request,
        IOPMCompletionQueue * queue )
 {
-       bool            more = (request->getNextRequest() != 0);
+       bool            more = (request->getNextRequest() != NULL);
        IOPMRequest *   root = request->getRootRequest();
 
        if (root && (root != request)) {
@@ -7767,7 +7798,7 @@ IOService::actionPMWorkQueueInvoke( IOPMRequest * request, IOPMWorkQueue * queue
                            fMachineState);
                }
 
-               gIOPMRequest = 0;
+               gIOPMRequest = NULL;
 
                if (fMachineState == kIOPM_Finished) {
                        stop_watchdog_timer();
@@ -7904,7 +7935,7 @@ IOService::actionPMReplyQueue( IOPMRequest * request, IOPMRequestQueue * queue )
                                        getPMRootDomain()->pmStatsRecordApplicationResponse(
                                                gIOPMStatsResponseCancel,
                                                name ? name->getCStringNoCopy() : "", 0,
-                                               0, (int)(uintptr_t) request->fArg1, 0);
+                                               0, (int)(uintptr_t) request->fArg1, NULL);
                                }
                        }
 
@@ -8009,10 +8040,11 @@ IOService::actionPMReplyQueue( IOPMRequest * request, IOPMRequestQueue * queue )
 bool
 IOService::assertPMDriverCall(
        IOPMDriverCallEntry *   entry,
-       IOOptionBits            options,
-       IOPMinformee *          inform )
+       IOOptionBits            method,
+       const IOPMinformee *    inform,
+       IOOptionBits            options )
 {
-       IOService * target = 0;
+       IOService * target = NULL;
        bool        ok = false;
 
        if (!initialized) {
@@ -8025,7 +8057,7 @@ IOService::assertPMDriverCall(
                goto fail;
        }
 
-       if (((options & kIOPMADC_NoInactiveCheck) == 0) && isInactive()) {
+       if (((options & kIOPMDriverCallNoInactiveCheck) == 0) && isInactive()) {
                goto fail;
        }
 
@@ -8039,6 +8071,24 @@ IOService::assertPMDriverCall(
                }
        }
 
+       // Record calling address for sleep failure diagnostics
+       switch (method) {
+       case kIOPMDriverCallMethodSetPowerState:
+               entry->callMethod = OSMemberFunctionCast(const void *, fControllingDriver, &IOService::setPowerState);
+               break;
+       case kIOPMDriverCallMethodWillChange:
+               entry->callMethod = OSMemberFunctionCast(const void *, target, &IOService::powerStateWillChangeTo);
+               break;
+       case kIOPMDriverCallMethodDidChange:
+               entry->callMethod = OSMemberFunctionCast(const void *, target, &IOService::powerStateDidChangeTo);
+               break;
+       case kIOPMDriverCallMethodUnknown:
+       case kIOPMDriverCallMethodSetAggressive:
+       default:
+               entry->callMethod = NULL;
+               break;
+       }
+
        entry->thread = current_thread();
        entry->target = target;
        queue_enter(&fPMDriverCallQueue, entry, IOPMDriverCallEntry *, link);
@@ -8193,9 +8243,9 @@ IOPMRequest *
 IOPMRequest::create( void )
 {
        IOPMRequest * me = OSTypeAlloc(IOPMRequest);
-       if (me && !me->init(0, kIOPMRequestTypeInvalid)) {
+       if (me && !me->init(NULL, kIOPMRequestTypeInvalid)) {
                me->release();
-               me = 0;
+               me = NULL;
        }
        return me;
 }
@@ -8235,14 +8285,14 @@ IOPMRequest::reset( void )
        if (fCompletionAction && (fRequestType == kIOPMRequestTypeQuiescePowerTree)) {
                // Call the completion on PM work loop context
                fCompletionAction(fCompletionTarget, fCompletionParam);
-               fCompletionAction = 0;
+               fCompletionAction = NULL;
        }
 
        fRequestType = kIOPMRequestTypeInvalid;
 
        if (fTarget) {
                fTarget->release();
-               fTarget = 0;
+               fTarget = NULL;
        }
 }
 
@@ -8285,7 +8335,7 @@ IOPMRequest::detachNextRequest( void )
                    (uint32_t) fRequestNext->fWorkWaitCount,
                    fTarget->getName());
 #endif
-               fRequestNext = 0;
+               fRequestNext = NULL;
                ok = true;
        }
        return ok;
@@ -8330,7 +8380,7 @@ IOPMRequest::detachRootRequest( void )
                    (uint32_t) fRequestRoot->fFreeWaitCount,
                    fTarget->getName());
 #endif
-               fRequestRoot = 0;
+               fRequestRoot = NULL;
                ok = true;
        }
        return ok;
@@ -8353,7 +8403,7 @@ IOPMRequestQueue::create( IOService * inOwner, Action inAction )
        IOPMRequestQueue * me = OSTypeAlloc(IOPMRequestQueue);
        if (me && !me->init(inOwner, inAction)) {
                me->release();
-               me = 0;
+               me = NULL;
        }
        return me;
 }
@@ -8367,7 +8417,7 @@ IOPMRequestQueue::init( IOService * inOwner, Action inAction )
 
        queue_init(&fQueue);
        fLock = IOLockAlloc();
-       return fLock != 0;
+       return fLock != NULL;
 }
 
 void
@@ -8375,7 +8425,7 @@ IOPMRequestQueue::free( void )
 {
        if (fLock) {
                IOLockFree(fLock);
-               fLock = 0;
+               fLock = NULL;
        }
        return IOEventSource::free();
 }
@@ -8458,7 +8508,7 @@ IOPMWorkQueue::create( IOService * inOwner, Action invoke, Action retire )
        IOPMWorkQueue * me = OSTypeAlloc(IOPMWorkQueue);
        if (me && !me->init(inOwner, invoke, retire)) {
                me->release();
-               me = 0;
+               me = NULL;
        }
        return me;
 }
@@ -8467,7 +8517,7 @@ bool
 IOPMWorkQueue::init( IOService * inOwner, Action invoke, Action retire )
 {
        if (!invoke || !retire ||
-           !IOEventSource::init(inOwner, (IOEventSourceAction)0)) {
+           !IOEventSource::init(inOwner, (IOEventSourceAction)NULL)) {
                return false;
        }
 
@@ -8562,7 +8612,7 @@ IOPMWorkQueue::checkRequestQueue( queue_head_t * requestQueue, bool * empty )
                }
 
                if (request == fQuiesceRequest) {
-                       fQuiesceRequest = 0;
+                       fQuiesceRequest = NULL;
                }
 
                queue_remove_first(requestQueue, request, typeof(request), fCommandChain);
@@ -8685,7 +8735,7 @@ IOPMWorkQueue::finishQuiesceRequest( IOPMRequest * quiesceRequest )
 {
        if (fQuiesceRequest && (quiesceRequest == fQuiesceRequest) &&
            (fQuiesceStartTime != 0)) {
-               fInvokeAction = 0;
+               fInvokeAction = NULL;
                fQuiesceFinishTime = mach_absolute_time();
        }
 }
@@ -8705,7 +8755,7 @@ IOPMCompletionQueue::create( IOService * inOwner, Action inAction )
        IOPMCompletionQueue * me = OSTypeAlloc(IOPMCompletionQueue);
        if (me && !me->init(inOwner, inAction)) {
                me->release();
-               me = 0;
+               me = NULL;
        }
        return me;
 }
index 9c4f3bb7fccadaf870e561e578da736d41dcd0e3..7d5a2bc54499067642223f8beb5220124fe99382 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -158,7 +158,7 @@ class IOServicePM : public OSObject
        friend class IOService;
        friend class IOPMWorkQueue;
 
-       OSDeclareDefaultStructors( IOServicePM )
+       OSDeclareDefaultStructors( IOServicePM );
 
 private:
 // Link IOServicePM objects on IOPMWorkQueue.
@@ -368,7 +368,6 @@ private:
 #define fWatchdogLock               pwrMgt->WatchdogLock
 #define fBlockedArray               pwrMgt->BlockedArray
 #define fPendingResponseDeadline    pwrMgt->PendingResponseDeadline
-#define fSpinDumpTimer              pwrMgt->SpinDumpTimer
 #define fSettleTimeUS               pwrMgt->SettleTimeUS
 #define fIdleTimerGeneration        pwrMgt->IdleTimerGeneration
 #define fHeadNoteChangeFlags        pwrMgt->HeadNoteChangeFlags
@@ -542,6 +541,8 @@ struct IOPMInterestContext {
        uint32_t                maxTimeRequested;
        uint32_t                messageType;
        uint32_t                notifyType;
+       uint32_t                skippedInDark;
+       uint32_t                notSkippedInDark;
        IOService *             us;
        IOPMPowerStateIndex     stateNumber;
        IOPMPowerFlags          stateFlags;
@@ -552,7 +553,17 @@ struct IOPMInterestContext {
 
 // assertPMDriverCall() options
 enum {
-       kIOPMADC_NoInactiveCheck = 1
+       kIOPMDriverCallNoInactiveCheck = 1
+};
+
+// assertPMDriverCall() method
+enum {
+       kIOPMDriverCallMethodUnknown       = 0,
+       kIOPMDriverCallMethodSetPowerState = 1,
+       kIOPMDriverCallMethodWillChange    = 2,
+       kIOPMDriverCallMethodDidChange     = 3,
+       kIOPMDriverCallMethodChangeDone    = 4,
+       kIOPMDriverCallMethodSetAggressive = 5
 };
 
 //******************************************************************************
@@ -571,7 +582,7 @@ extern const OSSymbol *gIOPMStatsDriverPSChangeSlow;
 
 class IOPMRequest : public IOCommand
 {
-       OSDeclareDefaultStructors( IOPMRequest )
+       OSDeclareDefaultStructors( IOPMRequest );
 
 protected:
        IOService *          fTarget;       // request target
@@ -621,7 +632,7 @@ public:
                        return (IOPMRequest *) this;
                }
 #endif
-               return 0;
+               return NULL;
        }
 
        inline uint32_t
@@ -652,7 +663,7 @@ public:
        isQuiesceType( void ) const
        {
                return (kIOPMRequestTypeQuiescePowerTree == fRequestType) &&
-                      (fCompletionAction != 0) && (fCompletionTarget != 0);
+                      (fCompletionAction != NULL) && (fCompletionTarget != NULL);
        }
 
        inline void
@@ -681,7 +692,7 @@ public:
 
 class IOPMRequestQueue : public IOEventSource
 {
-       OSDeclareDefaultStructors( IOPMRequestQueue )
+       OSDeclareDefaultStructors( IOPMRequestQueue );
 
 public:
        typedef bool (*Action)( IOService *, IOPMRequest *, IOPMRequestQueue * );
@@ -710,7 +721,7 @@ public:
 
 class IOPMWorkQueue : public IOEventSource
 {
-       OSDeclareDefaultStructors( IOPMWorkQueue )
+       OSDeclareDefaultStructors( IOPMWorkQueue );
 
 public:
        typedef bool (*Action)( IOService *, IOPMRequest *, IOPMWorkQueue * );
@@ -752,7 +763,7 @@ public:
 
 class IOPMCompletionQueue : public IOEventSource
 {
-       OSDeclareDefaultStructors( IOPMCompletionQueue )
+       OSDeclareDefaultStructors( IOPMCompletionQueue );
 
 public:
        typedef bool (*Action)( IOService *, IOPMRequest *, IOPMCompletionQueue * );
index 91bc2a1d1751646635c2e9cce84b8a834c79da36..4ae23be1b5568af84c93cd8125d4e78d7d114650 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -84,7 +84,7 @@ class _IOServiceNotifier : public IONotifier
 {
        friend class IOService;
 
-       OSDeclareDefaultStructors(_IOServiceNotifier)
+       OSDeclareDefaultStructors(_IOServiceNotifier);
 
 public:
        OSOrderedSet *                      whence;
@@ -110,7 +110,7 @@ class _IOServiceInterestNotifier : public IONotifier
 {
        friend class IOService;
 
-       OSDeclareDefaultStructors(_IOServiceInterestNotifier)
+       OSDeclareDefaultStructors(_IOServiceInterestNotifier);
 
 public:
        queue_chain_t               chain;
@@ -131,7 +131,7 @@ public:
 
 class _IOServiceNullNotifier : public IONotifier
 {
-       OSDeclareDefaultStructors(_IOServiceNullNotifier)
+       OSDeclareDefaultStructors(_IOServiceNullNotifier);
 
 public:
        virtual void taggedRetain(const void *tag) const APPLE_KEXT_OVERRIDE;
@@ -147,12 +147,10 @@ class _IOConfigThread : public OSObject
 {
        friend class IOService;
 
-       OSDeclareDefaultStructors(_IOConfigThread)
+       OSDeclareDefaultStructors(_IOConfigThread);
 
 public:
-       virtual void free() APPLE_KEXT_OVERRIDE;
-
-       static void configThread( void );
+       static void configThread( int configThreadId );
        static void main( void * arg, wait_result_t result );
 };
 
@@ -168,7 +166,7 @@ class _IOServiceJob : public OSObject
 {
        friend class IOService;
 
-       OSDeclareDefaultStructors(_IOServiceJob)
+       OSDeclareDefaultStructors(_IOServiceJob);
 
 public:
        int                 type;
@@ -184,11 +182,11 @@ class IOResources : public IOService
 {
        friend class IOService;
 
-       OSDeclareDefaultStructors(IOResources)
+       OSDeclareDefaultStructors(IOResources);
 
 public:
        static IOService * resources( void );
-       virtual bool init( OSDictionary * dictionary = 0 ) APPLE_KEXT_OVERRIDE;
+       virtual bool init( OSDictionary * dictionary = NULL ) APPLE_KEXT_OVERRIDE;
        virtual IOReturn newUserClient(task_t owningTask, void * securityID,
            UInt32 type, OSDictionary * properties,
            IOUserClient ** handler) APPLE_KEXT_OVERRIDE;
@@ -197,11 +195,27 @@ public:
        virtual IOReturn setProperties( OSObject * properties ) APPLE_KEXT_OVERRIDE;
 };
 
+class IOUserResources : public IOService
+{
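+       // User-space analog of IOResources; DriverKit drivers match on it.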
+       friend class IOService;
+
+       OSDeclareDefaultStructors(IOUserResources);
+
+public:
+       static IOService * resources( void );
+       virtual bool init( OSDictionary * dictionary = NULL ) APPLE_KEXT_OVERRIDE;
+       virtual IOReturn newUserClient(task_t owningTask, void * securityID,
+           UInt32 type, OSDictionary * properties,
+           IOUserClient ** handler) APPLE_KEXT_OVERRIDE;
+       virtual IOWorkLoop * getWorkLoop() const APPLE_KEXT_OVERRIDE;
+       virtual bool matchPropertyTable( OSDictionary * table ) APPLE_KEXT_OVERRIDE;
+};
+
 class _IOOpenServiceIterator : public OSIterator
 {
        friend class IOService;
 
-       OSDeclareDefaultStructors(_IOOpenServiceIterator)
+       OSDeclareDefaultStructors(_IOOpenServiceIterator);
 
        OSIterator *        iter;
        const IOService *   client;
index 797583bf8a79a92cca2c942b27940a1be675c1be..71d3c681774f1ffa00555830bcf7a6628b6b78a3 100644 (file)
@@ -50,7 +50,7 @@ IOSharedDataQueue *IOSharedDataQueue::withCapacity(UInt32 size)
        if (dataQueue) {
                if (!dataQueue->initWithCapacity(size)) {
                        dataQueue->release();
-                       dataQueue = 0;
+                       dataQueue = NULL;
                }
        }
 
@@ -65,7 +65,7 @@ IOSharedDataQueue::withEntries(UInt32 numEntries, UInt32 entrySize)
        if (dataQueue) {
                if (!dataQueue->initWithEntries(numEntries, entrySize)) {
                        dataQueue->release();
-                       dataQueue = 0;
+                       dataQueue = NULL;
                }
        }
 
@@ -98,7 +98,7 @@ IOSharedDataQueue::initWithCapacity(UInt32 size)
        }
 
        dataQueue = (IODataQueueMemory *)IOMallocAligned(allocSize, PAGE_SIZE);
-       if (dataQueue == 0) {
+       if (dataQueue == NULL) {
                return false;
        }
        bzero(dataQueue, allocSize);
@@ -150,9 +150,9 @@ IOSharedDataQueue::free()
 IOMemoryDescriptor *
 IOSharedDataQueue::getMemoryDescriptor()
 {
-       IOMemoryDescriptor *descriptor = 0;
+       IOMemoryDescriptor *descriptor = NULL;
 
-       if (dataQueue != 0) {
+       if (dataQueue != NULL) {
                descriptor = IOMemoryDescriptor::withAddress(dataQueue, getQueueSize() + DATA_QUEUE_MEMORY_HEADER_SIZE + DATA_QUEUE_MEMORY_APPENDIX_SIZE, kIODirectionOutIn);
        }
 
@@ -163,7 +163,7 @@ IOSharedDataQueue::getMemoryDescriptor()
 IODataQueueEntry *
 IOSharedDataQueue::peek()
 {
-       IODataQueueEntry *entry      = 0;
+       IODataQueueEntry *entry      = NULL;
        UInt32            headOffset;
        UInt32            tailOffset;
 
@@ -177,7 +177,7 @@ IOSharedDataQueue::peek()
        tailOffset = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->tail, __ATOMIC_ACQUIRE);
 
        if (headOffset != tailOffset) {
-               volatile IODataQueueEntry * head = 0;
+               volatile IODataQueueEntry * head = NULL;
                UInt32              headSize     = 0;
                UInt32              headOffset   = dataQueue->head;
                UInt32              queueSize    = getQueueSize();
@@ -239,7 +239,7 @@ IOSharedDataQueue::enqueue(void * data, UInt32 dataSize)
                        entry = (IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail);
 
                        entry->size = dataSize;
-                       memcpy(&entry->data, data, dataSize);
+                       __nochk_memcpy(&entry->data, data, dataSize);
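+                       // __nochk_memcpy skips the fortify object-size check;
+                       // dataSize was already bounds-checked above.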
 
                        // The tail can be out of bound when the size of the new entry
                        // exactly matches the available space at the end of the queue.
@@ -260,7 +260,7 @@ IOSharedDataQueue::enqueue(void * data, UInt32 dataSize)
                                ((IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail))->size = dataSize;
                        }
 
-                       memcpy(&dataQueue->queue->data, data, dataSize);
+                       __nochk_memcpy(&dataQueue->queue->data, data, dataSize);
                        newTail = entrySize;
                } else {
                        return false; // queue is full
@@ -273,7 +273,7 @@ IOSharedDataQueue::enqueue(void * data, UInt32 dataSize)
                        entry = (IODataQueueEntry *)((UInt8 *)dataQueue->queue + tail);
 
                        entry->size = dataSize;
-                       memcpy(&entry->data, data, dataSize);
+                       __nochk_memcpy(&entry->data, data, dataSize);
                        newTail = tail + entrySize;
                } else {
                        return false; // queue is full
@@ -308,7 +308,7 @@ Boolean
 IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize)
 {
        Boolean             retVal          = TRUE;
-       volatile IODataQueueEntry * entry   = 0;
+       volatile IODataQueueEntry *  entry  = NULL;
        UInt32              entrySize       = 0;
        UInt32              headOffset      = 0;
        UInt32              tailOffset      = 0;
@@ -324,7 +324,7 @@ IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize)
        tailOffset = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->tail, __ATOMIC_ACQUIRE);
 
        if (headOffset != tailOffset) {
-               volatile IODataQueueEntry * head = 0;
+               volatile IODataQueueEntry * head = NULL;
                UInt32              headSize     = 0;
                UInt32              queueSize    = getQueueSize();
 
@@ -372,7 +372,7 @@ IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize)
                        // not enough space
                        return false;
                }
-               memcpy(data, (void *)entry->data, entrySize);
+               __nochk_memcpy(data, (void *)entry->data, entrySize);
                *dataSize = entrySize;
        }
 
index 196e91086139d4da7b43ccfb05fa2376db96ff4c..e64a84a45f81567be72d8d504485db5310033bea 100644 (file)
@@ -70,8 +70,12 @@ IOKitInitializeTime( void )
 
        t.tv_sec = 30;
        t.tv_nsec = 0;
+
+// RTC is not present on this target
+#ifndef BCM2837
        IOService::waitForService(
                IOService::resourceMatching("IORTC"), &t );
+#endif
 #if defined(__i386__) || defined(__x86_64__)
        IOService::waitForService(
                IOService::resourceMatching("IONVRAM"), &t );
@@ -116,7 +120,7 @@ iokit_post_constructor_init(void)
 /*****
  * Pointer into bootstrap KLD segment for functions never used past startup.
  */
-void (*record_startup_extensions_function)(void) = 0;
+void (*record_startup_extensions_function)(void) = NULL;
 
 void
 StartIOKit( void * p1, void * p2, void * p3, void * p4 )
@@ -143,6 +147,12 @@ StartIOKit( void * p1, void * p2, void * p3, void * p4 )
        if (PE_parse_boot_argn( "pmtimeout", &debugFlags, sizeof(debugFlags))) {
                gCanSleepTimeout = debugFlags;
        }
+
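+       // The "dk" boot-arg seeds gIODKDebug, the DriverKit debug flags.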
+       if (PE_parse_boot_argn( "dk", &debugFlags, sizeof(debugFlags))) {
+               gIODKDebug = debugFlags;
+       }
+
+
        //
        // Have to start IOKit environment before we attempt to start
        // the C++ runtime environment.  At some stage we have to clean up
@@ -152,6 +162,7 @@ StartIOKit( void * p1, void * p2, void * p3, void * p4 )
        //
        IOLibInit();
        OSlibkernInit();
+       IOMachPortInitialize();
        devsw_init();
 
        gIOProgressBackbufferKey  = OSSymbol::withCStringNoCopy(kIOProgressBackbufferKey);
@@ -162,7 +173,7 @@ StartIOKit( void * p1, void * p2, void * p3, void * p4 )
        rootNub = new IOPlatformExpertDevice;
 
        if (rootNub && rootNub->initWithArgs( p1, p2, p3, p4)) {
-               rootNub->attach( 0 );
+               rootNub->attach( NULL );
 
                /* If the bootstrap segment set up a function to record startup
                 * extensions, call it now.
index 54338fb77efc6c5312dc1c679fad2c452b4ecf84..6bac5ad694792bb1e9b9dbeca1cfd8d5483f1b38 100644 (file)
@@ -168,19 +168,19 @@ oid_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, int arg2, stru
        return error;
 }
 
-SYSCTL_NODE(_debug, OID_AUTO, iokit_statistics, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IOStatistics");
+SYSCTL_NODE(_debug, OID_AUTO, iokit_statistics, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "IOStatistics");
 
 static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, general,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    0, kIOStatisticsGeneral, oid_sysctl, "S", "");
+    NULL, kIOStatisticsGeneral, oid_sysctl, "S", "");
 
 static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, workloop,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    0, kIOStatisticsWorkLoop, oid_sysctl, "S", "");
+    NULL, kIOStatisticsWorkLoop, oid_sysctl, "S", "");
 
 static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, userclient,
     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    0, kIOStatisticsUserClient, oid_sysctl, "S", "");
+    NULL, kIOStatisticsUserClient, oid_sysctl, "S", "");
 
 void
 IOStatistics::initialize()
@@ -1260,7 +1260,7 @@ IOStatistics::getKextNodeFromBacktrace(boolean_t write)
         * overhead. OSBacktrace does many safety checks that
         * are not needed in this situation.
         */
-       btCount = backtrace((uintptr_t*)bt, btCount);
+       btCount = backtrace((uintptr_t*)bt, btCount, NULL);
 
        if (write) {
                IORWLockWrite(lock);
index 8d0a472b12a6568c37322e9129f50296359f5a9c..c65c7c4863a0579e2c8f1efc9edde9bda414ef7a 100644 (file)
@@ -53,7 +53,7 @@ IOSubMemoryDescriptor::withSubRange(IOMemoryDescriptor *        of,
 
        if (self && !self->initSubRange(of, offset, length, (IODirection) options)) {
                self->release();
-               self = 0;
+               self = NULL;
        }
        return self;
 }
@@ -151,6 +151,22 @@ IOSubMemoryDescriptor::setPurgeable( IOOptionBits newState,
        return err;
 }
 
+IOReturn
+IOSubMemoryDescriptor::setOwnership( task_t newOwner,
+    int newLedgerTag,
+    IOOptionBits newLedgerOptions )
+{
+       IOReturn err;
+
+       if (iokit_iomd_setownership_enabled == FALSE) {
+               return kIOReturnUnsupported;
+       }
+
+       err = _parent->setOwnership( newOwner, newLedgerTag, newLedgerOptions );
+
+       return err;
+}
+
 IOReturn
 IOSubMemoryDescriptor::prepare(
        IODirection forDirection)
@@ -182,7 +198,7 @@ IOSubMemoryDescriptor::makeMapping(
        IOByteCount             offset,
        IOByteCount             length )
 {
-       IOMemoryMap * mapping = 0;
+       IOMemoryMap * mapping = NULL;
 
 #ifndef __LP64__
        if (!(kIOMap64Bit & options)) {
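
The new IOSubMemoryDescriptor::setOwnership() simply forwards ledger attribution to the parent descriptor, and bails out with kIOReturnUnsupported while the iokit_iomd_setownership_enabled gate is off. A hedged usage sketch; kIOMemoryLedgerTagDefault is assumed to be the neutral tag from IOMemoryDescriptor.h:

    // Hedged usage sketch: attribute a sub-range's memory to another
    // task's ledger; the helper name is illustrative.
    static IOReturn
    attribute_subrange(IOSubMemoryDescriptor * smd, task_t owner)
    {
        return smd->setOwnership(owner, kIOMemoryLedgerTagDefault, 0);
    }
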
diff --git a/iokit/Kernel/IOSyncer.cpp b/iokit/Kernel/IOSyncer.cpp
index 32449d6f69d99f1af81654022cfdd2703f214925..b4df67d2fb81b69daf13cad838d30b5ebfacb7e5 100644 (file)
@@ -38,7 +38,7 @@ IOSyncer * IOSyncer::create(bool twoRetains)
 
        if (me && !me->init(twoRetains)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
diff --git a/iokit/Kernel/IOTimerEventSource.cpp b/iokit/Kernel/IOTimerEventSource.cpp
index ad6b75455c0637a6619a28f770aa32e6b794cf28..59eb6a0a26af9b6b58376389d532573e8a431a4e 100644 (file)
@@ -100,29 +100,29 @@ do { \
 //
 
 __inline__ void
-IOTimerEventSource::invokeAction(IOTimerEventSource::Action action, IOTimerEventSource * ts,
-    OSObject * owner, IOWorkLoop * workLoop)
+IOTimerEventSource::invokeAction(IOTimerEventSource::Action _action, IOTimerEventSource * ts,
+    OSObject * _owner, IOWorkLoop * _workLoop)
 {
        bool    trace = (gIOKitTrace & kIOTraceTimers) ? true : false;
 
        if (trace) {
                IOTimeStampStartConstant(IODBG_TIMES(IOTIMES_ACTION),
-                   VM_KERNEL_ADDRHIDE(action), VM_KERNEL_ADDRHIDE(owner));
+                   VM_KERNEL_ADDRHIDE(_action), VM_KERNEL_ADDRHIDE(_owner));
        }
 
        if (kActionBlock & flags) {
                ((IOTimerEventSource::ActionBlock) actionBlock)(ts);
        } else {
-               (*action)(owner, ts);
+               (*_action)(_owner, ts);
        }
 
 #if CONFIG_DTRACE
-       DTRACE_TMR3(iotescallout__expire, Action, action, OSObject, owner, void, workLoop);
+       DTRACE_TMR3(iotescallout__expire, Action, _action, OSObject, _owner, void, _workLoop);
 #endif
 
        if (trace) {
                IOTimeStampEndConstant(IODBG_TIMES(IOTIMES_ACTION),
-                   VM_KERNEL_UNSLIDE(action), VM_KERNEL_ADDRHIDE(owner));
+                   VM_KERNEL_UNSLIDE(_action), VM_KERNEL_ADDRHIDE(_owner));
        }
 }
 
@@ -319,19 +319,19 @@ IOTimerEventSource::timerEventSource(uint32_t inOptions, OSObject *inOwner, Acti
 
        if (me && !me->init(inOptions, inOwner, inAction)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
 }
 
 IOTimerEventSource *
-IOTimerEventSource::timerEventSource(uint32_t options, OSObject *inOwner, ActionBlock action)
+IOTimerEventSource::timerEventSource(uint32_t options, OSObject *inOwner, ActionBlock _action)
 {
        IOTimerEventSource * tes;
        tes = IOTimerEventSource::timerEventSource(options, inOwner, (Action) NULL);
        if (tes) {
-               tes->setActionBlock((IOEventSource::ActionBlock) action);
+               tes->setActionBlock((IOEventSource::ActionBlock) _action);
        }
 
        return tes;
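
The renames in IOTimerEventSource.cpp ("action" to "_action" and so on) remove parameter names that shadow the identically named members inherited from IOEventSource. A minimal illustration of the hazard being avoided:

    // Minimal illustration: a parameter spelled like a member hides it
    // for the whole method body.
    class Shadowed {
        int action;
    public:
        void
        set(int action)
        {
            // "action" is the parameter here; the member is reachable
            // only as this->action.
            this->action = action;
        }
    };
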
diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp
index 12ae32416702a74a2c35c1e9403f20c5a111ba18..bbe9448fd3b1eacaf7a155dc077aa7aef8ae83bf 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <IOKit/IOStatisticsPrivate.h>
 #include <IOKit/IOTimeStamp.h>
 #include <IOKit/IODeviceTreeSupport.h>
+#include <IOKit/IOUserServer.h>
 #include <IOKit/system.h>
 #include <libkern/OSDebug.h>
+#include <DriverKit/OSAction.h>
 #include <sys/proc.h>
 #include <sys/kauth.h>
 #include <sys/codesign.h>
 
 #include <mach/sdt.h>
+#include <os/hash.h>
 
 #if CONFIG_MACF
 
@@ -132,29 +135,37 @@ extern "C" {
 #include <vm/vm_map.h>
 } /* extern "C" */
 
+struct IOMachPortHashList;
+
+static_assert(IKOT_MAX_TYPE <= 255);
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 // IOMachPort maps OSObjects to ports, avoiding adding an ivar to OSObject.
-
 class IOMachPort : public OSObject
 {
-       OSDeclareDefaultStructors(IOMachPort)
+       OSDeclareDefaultStructors(IOMachPort);
 public:
-       OSObject *  object;
+       SLIST_ENTRY(IOMachPort) link;
        ipc_port_t  port;
+       OSObject*   object;
        UInt32      mscount;
        UInt8       holdDestroy;
+       UInt8       type;
+
+       static IOMachPort* withObjectAndType(OSObject *obj, ipc_kobject_type_t type);
+
+       static IOMachPortHashList* bucketForObject(OSObject *obj,
+           ipc_kobject_type_t type);
+
+       static IOMachPort* portForObjectInBucket(IOMachPortHashList *bucket, OSObject *obj, ipc_kobject_type_t type);
 
-       static IOMachPort * portForObject( OSObject * obj,
-           ipc_kobject_type_t type );
        static bool noMoreSendersForObject( OSObject * obj,
            ipc_kobject_type_t type, mach_port_mscount_t * mscount );
        static void releasePortForObject( OSObject * obj,
            ipc_kobject_type_t type );
        static void setHoldDestroy( OSObject * obj, ipc_kobject_type_t type );
 
-       static OSDictionary * dictForType( ipc_kobject_type_t type );
-
        static mach_port_name_t makeSendRightForTask( task_t task,
            io_object_t obj, ipc_kobject_type_t type );
 
@@ -165,116 +176,109 @@ public:
 OSDefineMetaClassAndStructors(IOMachPort, OSObject)
 
 static IOLock *         gIOObjectPortLock;
+IOLock *                gIOUserServerLock;
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
-// not in dictForType() for debugging ease
-static OSDictionary *   gIOObjectPorts;
-static OSDictionary *   gIOConnectPorts;
-static OSDictionary *   gIOIdentifierPorts;
+SLIST_HEAD(IOMachPortHashList, IOMachPort);
 
-OSDictionary *
-IOMachPort::dictForType( ipc_kobject_type_t type )
-{
-       OSDictionary **             dict;
+#if CONFIG_EMBEDDED
+#define PORT_HASH_SIZE 256
+#else
+#define PORT_HASH_SIZE 4096
+#endif /* CONFIG_EMBEDDED */
 
-       switch (type) {
-       case IKOT_IOKIT_OBJECT:
-               dict = &gIOObjectPorts;
-               break;
-       case IKOT_IOKIT_CONNECT:
-               dict = &gIOConnectPorts;
-               break;
-       case IKOT_IOKIT_IDENT:
-               dict = &gIOIdentifierPorts;
-               break;
-       default:
-               panic("dictForType %d", type);
-               dict = NULL;
-               break;
-       }
+IOMachPortHashList ports[PORT_HASH_SIZE];
 
-       if (0 == *dict) {
-               *dict = OSDictionary::withCapacity( 1 );
+void
+IOMachPortInitialize(void)
+{
+       for (size_t i = 0; i < PORT_HASH_SIZE; i++) {
+               SLIST_INIT(&ports[i]);
        }
-
-       return *dict;
 }
 
-IOMachPort *
-IOMachPort::portForObject( OSObject * obj,
-    ipc_kobject_type_t type )
+IOMachPortHashList*
+IOMachPort::bucketForObject(OSObject *obj, ipc_kobject_type_t type )
 {
-       IOMachPort *        inst = 0;
-       OSDictionary *      dict;
+       return &ports[os_hash_kernel_pointer(obj) % PORT_HASH_SIZE];
+}
 
-       IOTakeLock( gIOObjectPortLock);
+IOMachPort*
+IOMachPort::portForObjectInBucket(IOMachPortHashList *bucket, OSObject *obj, ipc_kobject_type_t type)
+{
+       IOMachPort *machPort;
 
-       do {
-               dict = dictForType( type );
-               if (!dict) {
-                       continue;
+       SLIST_FOREACH(machPort, bucket, link) {
+               if (machPort->object == obj && machPort->type == type) {
+                       return machPort;
                }
+       }
+       return NULL;
+}
 
-               if ((inst = (IOMachPort *)
-                   dict->getObject((const OSSymbol *) obj ))) {
-                       inst->mscount++;
-                       inst->retain();
-                       continue;
-               }
+IOMachPort*
+IOMachPort::withObjectAndType(OSObject *obj, ipc_kobject_type_t type)
+{
+       IOMachPort *machPort = NULL;
 
-               inst = new IOMachPort;
-               if (inst && !inst->init()) {
-                       inst = 0;
-                       continue;
-               }
+       machPort = new IOMachPort;
+       if (__improbable(machPort && !machPort->init())) {
+               return NULL;
+       }
 
-               inst->port = iokit_alloc_object_port( obj, type );
-               if (inst->port) {
-                       // retains obj
-                       dict->setObject((const OSSymbol *) obj, inst );
-                       inst->mscount++;
-               } else {
-                       inst->release();
-                       inst = 0;
-               }
-       } while (false);
+       machPort->object = obj;
+       machPort->type = (typeof(machPort->type))type;
+       machPort->port = iokit_alloc_object_port(obj, type);
 
-       IOUnlock( gIOObjectPortLock);
+       obj->taggedRetain(OSTypeID(OSCollection));
+       machPort->mscount++;
 
-       return inst;
+       return machPort;
 }
 
 bool
 IOMachPort::noMoreSendersForObject( OSObject * obj,
     ipc_kobject_type_t type, mach_port_mscount_t * mscount )
 {
-       OSDictionary *      dict;
-       IOMachPort *        machPort;
-       IOUserClient *      uc;
-       bool                destroyed = true;
+       IOMachPort *machPort = NULL;
+       IOUserClient *uc;
+       OSAction *action;
+       bool destroyed = true;
 
-       IOTakeLock( gIOObjectPortLock);
+       IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type);
 
-       if ((dict = dictForType( type ))) {
-               obj->retain();
+       obj->retain();
 
-               machPort = (IOMachPort *) dict->getObject((const OSSymbol *) obj );
-               if (machPort) {
-                       destroyed = (machPort->mscount <= *mscount);
-                       if (!destroyed) {
-                               *mscount = machPort->mscount;
-                       } else {
-                               if ((IKOT_IOKIT_CONNECT == type) && (uc = OSDynamicCast(IOUserClient, obj))) {
-                                       uc->noMoreSenders();
-                               }
-                               dict->removeObject((const OSSymbol *) obj );
+       lck_mtx_lock(gIOObjectPortLock);
+
+       machPort = IOMachPort::portForObjectInBucket(bucket, obj, type);
+
+       if (machPort) {
+               destroyed = (machPort->mscount <= *mscount);
+               if (!destroyed) {
+                       *mscount = machPort->mscount;
+                       lck_mtx_unlock(gIOObjectPortLock);
+               } else {
+                       if ((IKOT_IOKIT_CONNECT == type) && (uc = OSDynamicCast(IOUserClient, obj))) {
+                               uc->noMoreSenders();
                        }
+                       SLIST_REMOVE(bucket, machPort, IOMachPort, link);
+
+                       lck_mtx_unlock(gIOObjectPortLock);
+
+                       machPort->release();
+                       obj->taggedRelease(OSTypeID(OSCollection));
                }
-               obj->release();
+       } else {
+               lck_mtx_unlock(gIOObjectPortLock);
+       }
+
+       if ((IKOT_UEXT_OBJECT == type) && (action = OSDynamicCast(OSAction, obj))) {
+               action->Aborted();
        }
 
-       IOUnlock( gIOObjectPortLock);
+       obj->release();
 
        return destroyed;
 }
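
These IOUserClient.cpp hunks replace the three per-type OSDictionary port maps with a fixed open-hashing table: SLIST buckets indexed by os_hash_kernel_pointer(), 256 entries on embedded and 4096 otherwise, with the mapped object pinned by taggedRetain() instead of dictionary membership. (Note that withObjectAndType() as committed returns NULL on a failed init() without releasing the freshly allocated port object.) A standalone sketch of the bucket scheme, with hypothetical names:

    // Standalone sketch of the bucket scheme: a fixed array of singly
    // linked lists indexed by a kernel-pointer hash. Names are hypothetical.
    #include <sys/queue.h>
    #include <os/hash.h>

    #define BUCKET_COUNT 256

    struct entry {
        SLIST_ENTRY(entry) link;
        const void * key;
    };
    SLIST_HEAD(bucket_head, entry);

    static struct bucket_head buckets[BUCKET_COUNT];

    static struct bucket_head *
    bucket_for(const void * key)
    {
        // os_hash_kernel_pointer() mixes the address bits so that aligned,
        // slid kernel pointers spread evenly across the buckets.
        return &buckets[os_hash_kernel_pointer(key) % BUCKET_COUNT];
    }

    static struct entry *
    lookup(const void * key)
    {
        struct entry * e;

        SLIST_FOREACH(e, bucket_for(key), link) {
            if (e->key == key) {
                return e;
            }
        }
        return NULL;
    }
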
@@ -283,76 +287,108 @@ void
 IOMachPort::releasePortForObject( OSObject * obj,
     ipc_kobject_type_t type )
 {
-       OSDictionary *      dict;
-       IOMachPort *        machPort;
+       IOMachPort *machPort;
+       IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type);
 
        assert(IKOT_IOKIT_CONNECT != type);
 
-       IOTakeLock( gIOObjectPortLock);
+       lck_mtx_lock(gIOObjectPortLock);
+
+       machPort = IOMachPort::portForObjectInBucket(bucket, obj, type);
 
-       if ((dict = dictForType( type ))) {
+       if (machPort && !machPort->holdDestroy) {
                obj->retain();
-               machPort = (IOMachPort *) dict->getObject((const OSSymbol *) obj );
-               if (machPort && !machPort->holdDestroy) {
-                       dict->removeObject((const OSSymbol *) obj );
-               }
+               SLIST_REMOVE(bucket, machPort, IOMachPort, link);
+
+               lck_mtx_unlock(gIOObjectPortLock);
+
+               machPort->release();
+               obj->taggedRelease(OSTypeID(OSCollection));
                obj->release();
+       } else {
+               lck_mtx_unlock(gIOObjectPortLock);
        }
-
-       IOUnlock( gIOObjectPortLock);
 }
 
 void
 IOMachPort::setHoldDestroy( OSObject * obj, ipc_kobject_type_t type )
 {
-       OSDictionary *      dict;
        IOMachPort *        machPort;
 
-       IOLockLock( gIOObjectPortLock );
+       IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type);
+       lck_mtx_lock(gIOObjectPortLock);
 
-       if ((dict = dictForType( type ))) {
-               machPort = (IOMachPort *) dict->getObject((const OSSymbol *) obj );
-               if (machPort) {
-                       machPort->holdDestroy = true;
-               }
+       machPort = IOMachPort::portForObjectInBucket(bucket, obj, type);
+
+       if (machPort) {
+               machPort->holdDestroy = true;
        }
 
-       IOLockUnlock( gIOObjectPortLock );
+       lck_mtx_unlock(gIOObjectPortLock);
+}
+
+void
+IOMachPortDestroyUserReferences(OSObject * obj, natural_t type)
+{
+       IOMachPort::releasePortForObject(obj, type);
 }
 
 void
 IOUserClient::destroyUserReferences( OSObject * obj )
 {
+       IOMachPort *machPort;
+
        IOMachPort::releasePortForObject( obj, IKOT_IOKIT_OBJECT );
 
        // panther, 3160200
        // IOMachPort::releasePortForObject( obj, IKOT_IOKIT_CONNECT );
 
-       OSDictionary * dict;
-
-       IOTakeLock( gIOObjectPortLock);
        obj->retain();
+       IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, IKOT_IOKIT_CONNECT);
+       IOMachPortHashList *mappingBucket = NULL;
 
-       if ((dict = IOMachPort::dictForType( IKOT_IOKIT_CONNECT ))) {
-               IOMachPort * port;
-               port = (IOMachPort *) dict->getObject((const OSSymbol *) obj );
-               if (port) {
-                       IOUserClient * uc;
-                       if ((uc = OSDynamicCast(IOUserClient, obj))) {
-                               uc->noMoreSenders();
-                               if (uc->mappings) {
-                                       dict->setObject((const OSSymbol *) uc->mappings, port);
-                                       iokit_switch_object_port(port->port, uc->mappings, IKOT_IOKIT_CONNECT);
+       lck_mtx_lock(gIOObjectPortLock);
 
-                                       uc->mappings->release();
-                                       uc->mappings = 0;
-                               }
-                       }
-                       dict->removeObject((const OSSymbol *) obj );
+       IOUserClient * uc = OSDynamicCast(IOUserClient, obj);
+       if (uc && uc->mappings) {
+               mappingBucket = IOMachPort::bucketForObject(uc->mappings, IKOT_IOKIT_CONNECT);
+       }
+
+       machPort = IOMachPort::portForObjectInBucket(bucket, obj, IKOT_IOKIT_CONNECT);
+
+       if (machPort == NULL) {
+               lck_mtx_unlock(gIOObjectPortLock);
+               goto end;
+       }
+
+       SLIST_REMOVE(bucket, machPort, IOMachPort, link);
+       obj->taggedRelease(OSTypeID(OSCollection));
+
+       if (uc) {
+               uc->noMoreSenders();
+               if (uc->mappings) {
+                       uc->mappings->taggedRetain(OSTypeID(OSCollection));
+                       machPort->object = uc->mappings;
+                       SLIST_INSERT_HEAD(mappingBucket, machPort, link);
+                       iokit_switch_object_port(machPort->port, uc->mappings, IKOT_IOKIT_CONNECT);
+
+                       lck_mtx_unlock(gIOObjectPortLock);
+
+                       uc->mappings->release();
+                       uc->mappings = NULL;
+               } else {
+                       lck_mtx_unlock(gIOObjectPortLock);
+                       machPort->release();
                }
+       } else {
+               lck_mtx_unlock(gIOObjectPortLock);
+               machPort->release();
        }
+
+
+end:
+
        obj->release();
-       IOUnlock( gIOObjectPortLock);
 }
 
 mach_port_name_t
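
A pattern worth noting across noMoreSendersForObject(), releasePortForObject(), and destroyUserReferences() above: the bucket is unlinked while gIOObjectPortLock is held, but release() and taggedRelease() are deferred until after the unlock, because a final release may run ::free() and re-enter IOKit. Reduced to its essentials:

    // The locking shape used above: unlink under the mutex, drop
    // references only after unlocking.
    lck_mtx_lock(gIOObjectPortLock);
    SLIST_REMOVE(bucket, machPort, IOMachPort, link);
    lck_mtx_unlock(gIOObjectPortLock);

    machPort->release();                         // may invoke ::free()
    obj->taggedRelease(OSTypeID(OSCollection));  // drop the bucket's pin
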
@@ -375,7 +411,7 @@ IOMachPort::free( void )
 
 class IOUserIterator : public OSIterator
 {
-       OSDeclareDefaultStructors(IOUserIterator)
+       OSDeclareDefaultStructors(IOUserIterator);
 public:
        OSObject    *       userIteratorObject;
        IOLock      *       lock;
@@ -394,7 +430,7 @@ public:
 
 class IOUserNotification : public IOUserIterator
 {
-       OSDeclareDefaultStructors(IOUserNotification)
+       OSDeclareDefaultStructors(IOUserNotification);
 
 #define holdNotify      userIteratorObject
 
@@ -418,13 +454,13 @@ IOUserIterator::withIterator(OSIterator * iter)
        IOUserIterator * me;
 
        if (!iter) {
-               return 0;
+               return NULL;
        }
 
        me = new IOUserIterator;
        if (me && !me->init()) {
                me->release();
-               me = 0;
+               me = NULL;
        }
        if (!me) {
                return me;
@@ -581,20 +617,31 @@ IOUserClient::finalizeUserReferences(OSObject * obj)
 ipc_port_t
 iokit_port_for_object( io_object_t obj, ipc_kobject_type_t type )
 {
-       IOMachPort * machPort;
-       ipc_port_t   port;
+       IOMachPort *machPort = NULL;
+       ipc_port_t   port = NULL;
 
-       if ((machPort = IOMachPort::portForObject( obj, type ))) {
-               port = machPort->port;
-               if (port) {
-                       iokit_retain_port( port );
-               }
+       IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type);
 
-               machPort->release();
+       lck_mtx_lock(gIOObjectPortLock);
+
+       machPort = IOMachPort::portForObjectInBucket(bucket, obj, type);
+
+       if (__improbable(machPort == NULL)) {
+               machPort = IOMachPort::withObjectAndType(obj, type);
+               if (__improbable(machPort == NULL)) {
+                       goto end;
+               }
+               SLIST_INSERT_HEAD(bucket, machPort, link);
        } else {
-               port = NULL;
+               machPort->mscount++;
        }
 
+       iokit_retain_port(machPort->port);
+       port = machPort->port;
+
+end:
+       lck_mtx_unlock(gIOObjectPortLock);
+
        return port;
 }
 
@@ -621,7 +668,7 @@ iokit_client_died( io_object_t obj, ipc_port_t /* port */,
                if ((map = OSDynamicCast( IOMemoryMap, obj ))) {
                        map->taskDied();
                } else if ((notify = OSDynamicCast( IOUserNotification, obj ))) {
-                       notify->setNotification( 0 );
+                       notify->setNotification( NULL );
                }
        }
 
@@ -633,7 +680,7 @@ iokit_client_died( io_object_t obj, ipc_port_t /* port */,
 
 class IOServiceUserNotification : public IOUserNotification
 {
-       OSDeclareDefaultStructors(IOServiceUserNotification)
+       OSDeclareDefaultStructors(IOServiceUserNotification);
 
        struct PingMsg {
                mach_msg_header_t               msgHdr;
@@ -666,7 +713,7 @@ public:
 
 class IOServiceMessageUserNotification : public IOUserNotification
 {
-       OSDeclareDefaultStructors(IOServiceMessageUserNotification)
+       OSDeclareDefaultStructors(IOServiceMessageUserNotification);
 
        struct PingMsg {
                mach_msg_header_t               msgHdr;
@@ -706,8 +753,8 @@ public:
 
 #undef super
 #define super IOUserIterator
-OSDefineMetaClass( IOUserNotification, IOUserIterator )
-OSDefineAbstractStructors( IOUserNotification, IOUserIterator )
+OSDefineMetaClass( IOUserNotification, IOUserIterator );
+OSDefineAbstractStructors( IOUserNotification, IOUserIterator );
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
@@ -717,7 +764,7 @@ IOUserNotification::free( void )
        if (holdNotify) {
                assert(OSDynamicCast(IONotifier, holdNotify));
                ((IONotifier *)holdNotify)->remove();
-               holdNotify = 0;
+               holdNotify = NULL;
        }
        // can't be in handler now
 
@@ -912,7 +959,7 @@ IOServiceUserNotification::copyNextObject()
                result->retain();
                newSet->removeObject( count - 1);
        } else {
-               result = 0;
+               result = NULL;
                armed = true;
        }
 
@@ -963,7 +1010,7 @@ IOServiceMessageUserNotification::init( mach_port_t port, natural_t type,
 
        pingMsg->msgBody.msgh_descriptor_count = 1;
 
-       pingMsg->ports[0].name              = 0;
+       pingMsg->ports[0].name              = NULL;
        pingMsg->ports[0].disposition       = MACH_MSG_TYPE_MAKE_SEND;
        pingMsg->ports[0].type              = MACH_MSG_PORT_DESCRIPTOR;
 
@@ -1062,7 +1109,7 @@ IOServiceMessageUserNotification::handler( void * ref,
                }
                thisMsg = (typeof(thisMsg))allocMsg;
        } else {
-               allocMsg = 0;
+               allocMsg = NULL;
                thisMsg  = (typeof(thisMsg))stackMsg;
        }
 
@@ -1116,7 +1163,7 @@ IOServiceMessageUserNotification::handler( void * ref,
 OSObject *
 IOServiceMessageUserNotification::getNextObject()
 {
-       return 0;
+       return NULL;
 }
 
 OSObject *
@@ -1138,6 +1185,7 @@ IOUserClient::initialize( void )
 {
        gIOObjectPortLock       = IOLockAlloc();
        gIOUserClientOwnersLock = IOLockAlloc();
+       gIOUserServerLock       = IOLockAlloc();
        assert(gIOObjectPortLock && gIOUserClientOwnersLock);
 }
 
@@ -1178,7 +1226,7 @@ static OSDictionary *
 CopyConsoleUser(UInt32 uid)
 {
        OSArray * array;
-       OSDictionary * user = 0;
+       OSDictionary * user = NULL;
 
        if ((array = OSDynamicCast(OSArray,
            IORegistryEntry::getRegistryRoot()->copyProperty(gIOConsoleUsersKey)))) {
@@ -1202,7 +1250,7 @@ static OSDictionary *
 CopyUserOnConsole(void)
 {
        OSArray * array;
-       OSDictionary * user = 0;
+       OSDictionary * user = NULL;
 
        if ((array = OSDynamicCast(OSArray,
            IORegistryEntry::getRegistryRoot()->copyProperty(gIOConsoleUsersKey)))) {
@@ -1335,29 +1383,31 @@ IOUserClient::clientHasPrivilege( void * securityToken,
        return kr;
 }
 
-OSObject *
-IOUserClient::copyClientEntitlement( task_t task,
-    const char * entitlement )
+OSDictionary *
+IOUserClient::copyClientEntitlements(task_t task)
 {
 #define MAX_ENTITLEMENTS_LEN    (128 * 1024)
 
        proc_t p = NULL;
        pid_t pid = 0;
-       char procname[MAXCOMLEN + 1] = "";
        size_t len = 0;
        void *entitlements_blob = NULL;
        char *entitlements_data = NULL;
        OSObject *entitlements_obj = NULL;
        OSDictionary *entitlements = NULL;
        OSString *errorString = NULL;
-       OSObject *value = NULL;
 
        p = (proc_t)get_bsdtask_info(task);
        if (p == NULL) {
                goto fail;
        }
        pid = proc_pid(p);
-       proc_name(pid, procname, (int)sizeof(procname));
+
+       if (cs_entitlements_dictionary_copy(p, (void **)&entitlements) == 0) {
+               if (entitlements) {
+                       return entitlements;
+               }
+       }
 
        if (cs_entitlements_blob_get(p, &entitlements_blob, &len) != 0) {
                goto fail;
@@ -1373,7 +1423,8 @@ IOUserClient::copyClientEntitlement( task_t task,
         */
        len -= offsetof(CS_GenericBlob, data);
        if (len > MAX_ENTITLEMENTS_LEN) {
-               IOLog("failed to parse entitlements for %s[%u]: %lu bytes of entitlements exceeds maximum of %u\n", procname, pid, len, MAX_ENTITLEMENTS_LEN);
+               IOLog("failed to parse entitlements for %s[%u]: %lu bytes of entitlements exceeds maximum of %u\n",
+                   proc_best_name(p), pid, len, MAX_ENTITLEMENTS_LEN);
                goto fail;
        }
 
@@ -1391,7 +1442,8 @@ IOUserClient::copyClientEntitlement( task_t task,
 
        entitlements_obj = OSUnserializeXML(entitlements_data, len + 1, &errorString);
        if (errorString != NULL) {
-               IOLog("failed to parse entitlements for %s[%u]: %s\n", procname, pid, errorString->getCStringNoCopy());
+               IOLog("failed to parse entitlements for %s[%u]: %s\n",
+                   proc_best_name(p), pid, errorString->getCStringNoCopy());
                goto fail;
        }
        if (entitlements_obj == NULL) {
@@ -1402,12 +1454,7 @@ IOUserClient::copyClientEntitlement( task_t task,
        if (entitlements == NULL) {
                goto fail;
        }
-
-       /* Fetch the entitlement value from the dictionary. */
-       value = entitlements->getObject(entitlement);
-       if (value != NULL) {
-               value->retain();
-       }
+       entitlements_obj = NULL;
 
 fail:
        if (entitlements_data != NULL) {
@@ -1419,6 +1466,28 @@ fail:
        if (errorString != NULL) {
                errorString->release();
        }
+       return entitlements;
+}
+
+OSObject *
+IOUserClient::copyClientEntitlement( task_t task,
+    const char * entitlement )
+{
+       OSDictionary *entitlements;
+       OSObject *value;
+
+       entitlements = copyClientEntitlements(task);
+       if (entitlements == NULL) {
+               return NULL;
+       }
+
+       /* Fetch the entitlement value from the dictionary. */
+       value = entitlements->getObject(entitlement);
+       if (value != NULL) {
+               value->retain();
+       }
+
+       entitlements->release();
        return value;
 }
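
copyClientEntitlement() is now a thin wrapper over the new copyClientEntitlements(), which prefers the cached entitlement dictionary from cs_entitlements_dictionary_copy() and only falls back to parsing the raw XML blob. A hedged usage sketch; the entitlement string is illustrative:

    // Hedged usage sketch of the refactored API.
    static bool
    has_example_entitlement(task_t task)
    {
        bool ok = false;
        OSObject * value = IOUserClient::copyClientEntitlement(task,
            "com.example.driver-access");

        // The wrapper copies the whole dictionary once and returns a
        // single retained value from it.
        if (value != NULL) {
            ok = (value == kOSBooleanTrue);
            value->release();
        }
        return ok;
    }
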
 
@@ -1523,6 +1592,9 @@ IOUserClient::registerOwner(task_t task)
                        owner->uc   = this;
                        queue_enter_first(&owners, owner, IOUserClientOwner *, ucLink);
                        queue_enter_first(task_io_user_clients(task), owner, IOUserClientOwner *, taskLink);
+                       if (messageAppSuspended) {
+                               task_set_message_app_suspended(task, true);
+                       }
                }
        }
 
@@ -1535,13 +1607,25 @@ void
 IOUserClient::noMoreSenders(void)
 {
        IOUserClientOwner * owner;
+       IOUserClientOwner * iter;
+       queue_head_t      * taskque;
+       bool                hasMessageAppSuspended;
 
        IOLockLock(gIOUserClientOwnersLock);
 
        if (owners.next) {
                while (!queue_empty(&owners)) {
                        owner = (IOUserClientOwner *)(void *) queue_first(&owners);
-                       queue_remove(task_io_user_clients(owner->task), owner, IOUserClientOwner *, taskLink);
+                       taskque = task_io_user_clients(owner->task);
+                       queue_remove(taskque, owner, IOUserClientOwner *, taskLink);
+                       hasMessageAppSuspended = false;
+                       queue_iterate(taskque, iter, IOUserClientOwner *, taskLink) {
+                               hasMessageAppSuspended = iter->uc->messageAppSuspended;
+                               if (hasMessageAppSuspended) {
+                                       break;
+                               }
+                       }
+                       task_set_message_app_suspended(owner->task, hasMessageAppSuspended);
                        queue_remove(&owners, owner, IOUserClientOwner *, ucLink);
                        IODelete(owner, IOUserClientOwner, 1);
                }
@@ -1551,6 +1635,55 @@ IOUserClient::noMoreSenders(void)
        IOLockUnlock(gIOUserClientOwnersLock);
 }
 
+
+extern "C" void
+iokit_task_app_suspended_changed(task_t task)
+{
+       queue_head_t      * taskque;
+       IOUserClientOwner * owner;
+       OSSet             * set;
+
+       IOLockLock(gIOUserClientOwnersLock);
+
+       taskque = task_io_user_clients(task);
+       set = NULL;
+       queue_iterate(taskque, owner, IOUserClientOwner *, taskLink) {
+               if (!owner->uc->messageAppSuspended) {
+                       continue;
+               }
+               if (!set) {
+                       set = OSSet::withCapacity(4);
+                       if (!set) {
+                               break;
+                       }
+               }
+               set->setObject(owner->uc);
+       }
+
+       IOLockUnlock(gIOUserClientOwnersLock);
+
+       if (set) {
+               set->iterateObjects(^bool (OSObject * obj) {
+                       IOUserClient      * uc;
+
+                       uc = (typeof(uc))obj;
+#if 0
+                       {
+                               OSString          * str;
+                               str = IOCopyLogNameForPID(task_pid(task));
+                               IOLog("iokit_task_app_suspended_changed(%s) %s %d\n", str ? str->getCStringNoCopy() : "",
+                               uc->getName(), task_is_app_suspended(task));
+                               OSSafeReleaseNULL(str);
+                       }
+#endif
+                       uc->message(kIOMessageTaskAppSuspendedChange, NULL);
+
+                       return false;
+               });
+               set->release();
+       }
+}
+
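
The messageAppSuspended plumbing added here lets a user client ask to be messaged when its owning task's app-suspension state changes: registerOwner() propagates the flag onto the task, noMoreSenders() recomputes it over the remaining owners, and iokit_task_app_suspended_changed() fans kIOMessageTaskAppSuspendedChange out to every interested client. A hedged sketch of the receiving side; the class is hypothetical and its metaclass boilerplate is elided:

    // Hedged sketch: reacting to the suspension message in a client that
    // published kIOUserClientMessageAppSuspendedKey.
    IOReturn
    com_example_UserClient::message(UInt32 type, IOService * provider,
        void * argument)
    {
        if (type == kIOMessageTaskAppSuspendedChange) {
            // The owning task was suspended or resumed; pause or resume
            // any per-client work here.
        }
        return IOUserClient::message(type, provider, argument);
    }
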
 extern "C" kern_return_t
 iokit_task_terminate(task_t task)
 {
@@ -1638,7 +1771,7 @@ IOUserClient::clientClose( void )
 IOService *
 IOUserClient::getService( void )
 {
-       return 0;
+       return NULL;
 }
 
 IOReturn
@@ -1701,8 +1834,8 @@ IOUserClient::mapClientMemory64(
 {
        IOReturn            err;
        IOOptionBits        options = 0;
-       IOMemoryDescriptor * memory = 0;
-       IOMemoryMap *       map = 0;
+       IOMemoryDescriptor * memory = NULL;
+       IOMemoryMap *       map = NULL;
 
        err = clientMemoryForType((UInt32) type, &options, &memory );
 
@@ -1772,13 +1905,13 @@ IOUserClient::adjustPortNameReferencesInTask(task_t task, mach_port_name_t port_
 IOExternalMethod *
 IOUserClient::getExternalMethodForIndex( UInt32 /* index */)
 {
-       return 0;
+       return NULL;
 }
 
 IOExternalAsyncMethod *
 IOUserClient::getExternalAsyncMethodForIndex( UInt32 /* index */)
 {
-       return 0;
+       return NULL;
 }
 
 IOExternalTrap *
@@ -1930,7 +2063,7 @@ IOUserClient::_sendAsyncResult64(OSAsyncReference64 reference,
        replyMsg.msgHdr.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND /*remote*/,
            0 /*local*/);
        replyMsg.msgHdr.msgh_remote_port = replyPort;
-       replyMsg.msgHdr.msgh_local_port  = 0;
+       replyMsg.msgHdr.msgh_local_port  = NULL;
        replyMsg.msgHdr.msgh_id          = kOSNotificationMessageID;
        if (kIOUCAsync64Flag & reference[0]) {
                replyMsg.msgHdr.msgh_size =
@@ -2077,7 +2210,7 @@ is_io_object_get_superclass(
        }
 
        ret = kIOReturnNotFound;
-       meta = 0;
+       meta = NULL;
        do{
                name = OSSymbol::withCString(obj_name);
                if (!name) {
@@ -2128,7 +2261,7 @@ is_io_object_get_bundle_identifier(
        }
 
        ret = kIOReturnNotFound;
-       meta = 0;
+       meta = NULL;
        do{
                name = OSSymbol::withCString(obj_name);
                if (!name) {
@@ -2169,7 +2302,7 @@ is_io_object_conforms_to(
                return kIOReturnBadArgument;
        }
 
-       *conforms = (0 != object->metaCast( className ));
+       *conforms = (NULL != object->metaCast( className ));
 
        return kIOReturnSuccess;
 }
@@ -2385,7 +2518,7 @@ is_io_service_get_matching_services_ool(
        if (KERN_SUCCESS == kr) {
                // must return success after vm_map_copyout() succeeds
                // and mig will copy out objects on success
-               *existing = 0;
+               *existing = NULL;
                *result = internal_io_service_get_matching_services(master_port,
                    (const char *) data, matchingCnt, existing);
                vm_deallocate( kernel_map, data, matchingCnt );
@@ -2467,7 +2600,7 @@ is_io_service_get_matching_service_ool(
        if (KERN_SUCCESS == kr) {
                // must return success after vm_map_copyout() succeeds
                // and mig will copy out objects on success
-               *service = 0;
+               *service = NULL;
                *result = internal_io_service_get_matching_service(master_port,
                    (const char *) data, matchingCnt, service );
                vm_deallocate( kernel_map, data, matchingCnt );
@@ -2499,8 +2632,8 @@ internal_io_service_add_notification(
        bool client64,
        io_object_t * notification )
 {
-       IOServiceUserNotification * userNotify = 0;
-       IONotifier *                notify = 0;
+       IOServiceUserNotification * userNotify = NULL;
+       IONotifier *                notify = NULL;
        const OSSymbol *            sym;
        OSDictionary *              dict;
        IOReturn                    err;
@@ -2546,7 +2679,7 @@ internal_io_service_add_notification(
                if (userNotify && !userNotify->init( port, userMsgType,
                    reference, referenceSize, client64)) {
                        userNotify->release();
-                       userNotify = 0;
+                       userNotify = NULL;
                }
                if (!userNotify) {
                        continue;
@@ -2566,7 +2699,7 @@ internal_io_service_add_notification(
        if ((kIOReturnSuccess != err) && userNotify) {
                userNotify->invalidatePort();
                userNotify->release();
-               userNotify = 0;
+               userNotify = NULL;
        }
 
        if (sym) {
@@ -2683,7 +2816,7 @@ internal_io_service_add_notification_ool(
        if (KERN_SUCCESS == kr) {
                // must return success after vm_map_copyout() succeeds
                // and mig will copy out objects on success
-               *notification = 0;
+               *notification = NULL;
                *result = internal_io_service_add_notification( master_port, notification_type,
                    (char *) data, matchingCnt, wake_port, reference, referenceSize, client64, notification );
                vm_deallocate( kernel_map, data, matchingCnt );
@@ -2770,8 +2903,8 @@ internal_io_service_add_interest_notification(
        bool client64,
        io_object_t * notification )
 {
-       IOServiceMessageUserNotification *  userNotify = 0;
-       IONotifier *                        notify = 0;
+       IOServiceMessageUserNotification *  userNotify = NULL;
+       IONotifier *                        notify = NULL;
        const OSSymbol *                    sym;
        IOReturn                            err;
 
@@ -2787,7 +2920,7 @@ internal_io_service_add_interest_notification(
                            kIOUserNotifyMaxMessageSize,
                            client64 )) {
                                userNotify->release();
-                               userNotify = 0;
+                               userNotify = NULL;
                        }
                        if (!userNotify) {
                                continue;
@@ -2810,7 +2943,7 @@ internal_io_service_add_interest_notification(
        if ((kIOReturnSuccess != err) && userNotify) {
                userNotify->invalidatePort();
                userNotify->release();
-               userNotify = 0;
+               userNotify = NULL;
        }
 
        return err;
@@ -3017,7 +3150,7 @@ is_io_registry_entry_from_path_ool(
        }
 
        map_data = 0;
-       entry    = 0;
+       entry    = NULL;
        res = err = KERN_SUCCESS;
        if (path[0]) {
                cpath = path;
@@ -3158,7 +3291,7 @@ is_io_registry_entry_get_name_in_plane(
        if (planeName[0]) {
                plane = IORegistryEntry::getPlane( planeName );
        } else {
-               plane = 0;
+               plane = NULL;
        }
 
        strncpy( name, entry->getName( plane), sizeof(io_name_t));
@@ -3179,7 +3312,7 @@ is_io_registry_entry_get_location_in_plane(
        if (planeName[0]) {
                plane = IORegistryEntry::getPlane( planeName );
        } else {
-               plane = 0;
+               plane = NULL;
        }
 
        const char * cstr = entry->getLocation( plane );
@@ -3220,7 +3353,7 @@ is_io_registry_entry_get_property_bytes(
        OSNumber    *       off;
        UInt64              offsetBytes;
        unsigned int        len = 0;
-       const void *        bytes = 0;
+       const void *        bytes = NULL;
        IOReturn            ret = kIOReturnSuccess;
 
        CHECK( IORegistryEntry, registry_entry, entry );
@@ -3404,7 +3537,7 @@ GetPropertiesEditor(void                  * reference,
        }
        if (ref->root == container) {
                if (0 != mac_iokit_check_get_property(ref->cred, ref->entry, name->getCStringNoCopy())) {
-                       value = 0;
+                       value = NULL;
                }
        }
        if (value) {
@@ -3425,8 +3558,8 @@ is_io_registry_entry_get_properties_bin(
        kern_return_t              err = kIOReturnSuccess;
        vm_size_t                  len;
        OSSerialize          * s;
-       OSSerialize::Editor    editor = 0;
-       void                 * editRef = 0;
+       OSSerialize::Editor    editor = NULL;
+       void                 * editRef = NULL;
 
        CHECK(IORegistryEntry, registry_entry, entry);
 
@@ -3437,7 +3570,7 @@ is_io_registry_entry_get_properties_bin(
                editRef   = &ref;
                ref.cred  = kauth_cred_get();
                ref.entry = entry;
-               ref.root  = 0;
+               ref.root  = NULL;
        }
 #endif
 
@@ -3717,10 +3850,10 @@ is_io_service_open_extended(
        kern_return_t * result,
        io_object_t *connection )
 {
-       IOUserClient * client = 0;
+       IOUserClient * client = NULL;
        kern_return_t  err = KERN_SUCCESS;
        IOReturn       res = kIOReturnSuccess;
-       OSDictionary * propertiesDict = 0;
+       OSDictionary * propertiesDict = NULL;
        bool           crossEndian;
        bool           disallowAccess;
 
@@ -3792,7 +3925,8 @@ is_io_service_open_extended(
                if (res == kIOReturnSuccess) {
                        assert( OSDynamicCast(IOUserClient, client));
 
-                       client->sharedInstance = (0 != client->getProperty(kIOUserClientSharedInstanceKey));
+                       client->sharedInstance = (NULL != client->getProperty(kIOUserClientSharedInstanceKey));
+                       client->messageAppSuspended = (NULL != client->getProperty(kIOUserClientMessageAppSuspendedKey));
                        client->closed = false;
                        client->lock = IOLockAlloc();
 
@@ -3816,7 +3950,7 @@ is_io_service_open_extended(
                                IOStatisticsClientCall();
                                client->clientClose();
                                client->release();
-                               client = 0;
+                               client = NULL;
                                break;
                        }
                        OSString * creatorName = IOCopyLogNameForPID(proc_selfpid());
@@ -3959,7 +4093,7 @@ is_io_connect_map_memory_into_task
                } else {
                        // keep it with the user client
                        IOLockLock( gIOObjectPortLock);
-                       if (0 == client->mappings) {
+                       if (NULL == client->mappings) {
                                client->mappings = OSSet::withCapacity(2);
                        }
                        if (client->mappings) {
@@ -4006,7 +4140,7 @@ IOMemoryMap *
 IOUserClient::removeMappingForDescriptor(IOMemoryDescriptor * mem)
 {
        OSIterator *  iter;
-       IOMemoryMap * map = 0;
+       IOMemoryMap * map = NULL;
 
        IOLockLock(gIOObjectPortLock);
 
@@ -4039,7 +4173,7 @@ is_io_connect_unmap_memory_from_task
 {
        IOReturn            err;
        IOOptionBits        options = 0;
-       IOMemoryDescriptor * memory = 0;
+       IOMemoryDescriptor * memory = NULL;
        IOMemoryMap *       map;
 
        CHECK( IOUserClient, connection, client );
@@ -4155,8 +4289,8 @@ is_io_connect_method_var_output
 
        IOExternalMethodArguments args;
        IOReturn ret;
-       IOMemoryDescriptor * inputMD  = 0;
-       OSObject *           structureVariableOutputData = 0;
+       IOMemoryDescriptor * inputMD  = NULL;
+       OSObject *           structureVariableOutputData = NULL;
 
        bzero(&args.__reserved[0], sizeof(args.__reserved));
        args.__reservedA = 0;
@@ -4165,7 +4299,7 @@ is_io_connect_method_var_output
        args.selector = selector;
 
        args.asyncWakePort               = MACH_PORT_NULL;
-       args.asyncReference              = 0;
+       args.asyncReference              = NULL;
        args.asyncReferenceCount         = 0;
        args.structureVariableOutputData = &structureVariableOutputData;
 
@@ -4252,8 +4386,8 @@ is_io_connect_method
 
        IOExternalMethodArguments args;
        IOReturn ret;
-       IOMemoryDescriptor * inputMD  = 0;
-       IOMemoryDescriptor * outputMD = 0;
+       IOMemoryDescriptor * inputMD  = NULL;
+       IOMemoryDescriptor * outputMD = NULL;
 
        bzero(&args.__reserved[0], sizeof(args.__reserved));
        args.__reservedA = 0;
@@ -4262,9 +4396,9 @@ is_io_connect_method
        args.selector = selector;
 
        args.asyncWakePort               = MACH_PORT_NULL;
-       args.asyncReference              = 0;
+       args.asyncReference              = NULL;
        args.asyncReferenceCount         = 0;
-       args.structureVariableOutputData = 0;
+       args.structureVariableOutputData = NULL;
 
        args.scalarInput = scalar_input;
        args.scalarInputCount = scalar_inputCnt;
@@ -4344,8 +4478,8 @@ is_io_connect_async_method
 
        IOExternalMethodArguments args;
        IOReturn ret;
-       IOMemoryDescriptor * inputMD  = 0;
-       IOMemoryDescriptor * outputMD = 0;
+       IOMemoryDescriptor * inputMD  = NULL;
+       IOMemoryDescriptor * outputMD = NULL;
 
        bzero(&args.__reserved[0], sizeof(args.__reserved));
        args.__reservedA = 0;
@@ -4362,7 +4496,7 @@ is_io_connect_async_method
        args.asyncReference      = reference;
        args.asyncReferenceCount = referenceCnt;
 
-       args.structureVariableOutputData = 0;
+       args.structureVariableOutputData = NULL;
 
        args.scalarInput = scalar_input;
        args.scalarInputCount = scalar_inputCnt;
@@ -4401,6 +4535,7 @@ is_io_connect_async_method
        IOStatisticsClientCall();
        ret = client->externalMethod( selector, &args );
 
+       *scalar_outputCnt = args.scalarOutputCount;
        *inband_outputCnt = args.structureOutputSize;
        *ool_output_size  = args.structureOutputDescriptorSize;
 
@@ -4547,10 +4682,14 @@ is_io_async_method_scalarI_scalarO(
        io_scalar_inband64_t _output;
        io_async_ref64_t _reference;
 
+       if (referenceCnt > ASYNC_REF64_COUNT) {
+               return kIOReturnBadArgument;
+       }
        bzero(&_output[0], sizeof(_output));
        for (i = 0; i < referenceCnt; i++) {
                _reference[i] = REF64(reference[i]);
        }
+       bzero(&_reference[referenceCnt], (ASYNC_REF64_COUNT - referenceCnt) * sizeof(_reference[0]));
 
        mach_msg_type_number_t struct_outputCnt = 0;
        mach_vm_size_t ool_output_size = 0;
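
This bounds check, repeated in the next three shims, rejects a MIG-supplied referenceCnt larger than the fixed io_async_ref64_t stack array before the copy loop indexes it, and then zeroes the unused tail so no stack garbage reaches the handler. The consolidated pattern:

    // The hardening pattern shared by these four async shims.
    io_async_ref64_t _reference;

    if (referenceCnt > ASYNC_REF64_COUNT) {
        return kIOReturnBadArgument;   // caller-controlled count too large
    }
    for (mach_msg_type_number_t i = 0; i < referenceCnt; i++) {
        _reference[i] = REF64(reference[i]);
    }
    bzero(&_reference[referenceCnt],
        (ASYNC_REF64_COUNT - referenceCnt) * sizeof(_reference[0]));
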
@@ -4592,9 +4731,13 @@ is_io_async_method_scalarI_structureO(
        io_scalar_inband64_t _input;
        io_async_ref64_t _reference;
 
+       if (referenceCnt > ASYNC_REF64_COUNT) {
+               return kIOReturnBadArgument;
+       }
        for (i = 0; i < referenceCnt; i++) {
                _reference[i] = REF64(reference[i]);
        }
+       bzero(&_reference[referenceCnt], (ASYNC_REF64_COUNT - referenceCnt) * sizeof(_reference[0]));
 
        mach_msg_type_number_t scalar_outputCnt = 0;
        mach_vm_size_t ool_output_size = 0;
@@ -4631,9 +4774,13 @@ is_io_async_method_scalarI_structureI(
        io_scalar_inband64_t _input;
        io_async_ref64_t _reference;
 
+       if (referenceCnt > ASYNC_REF64_COUNT) {
+               return kIOReturnBadArgument;
+       }
        for (i = 0; i < referenceCnt; i++) {
                _reference[i] = REF64(reference[i]);
        }
+       bzero(&_reference[referenceCnt], (ASYNC_REF64_COUNT - referenceCnt) * sizeof(_reference[0]));
 
        mach_msg_type_number_t scalar_outputCnt = 0;
        mach_msg_type_number_t inband_outputCnt = 0;
@@ -4672,9 +4819,13 @@ is_io_async_method_structureI_structureO(
        mach_vm_size_t ool_output_size = 0;
        io_async_ref64_t _reference;
 
+       if (referenceCnt > ASYNC_REF64_COUNT) {
+               return kIOReturnBadArgument;
+       }
        for (i = 0; i < referenceCnt; i++) {
                _reference[i] = REF64(reference[i]);
        }
+       bzero(&_reference[referenceCnt], (ASYNC_REF64_COUNT - referenceCnt) * sizeof(_reference[0]));
 
        return is_io_connect_async_method(connect,
                   wake_port, _reference, referenceCnt,
@@ -4853,18 +5004,18 @@ shim_io_connect_method_scalarI_structureO(
                        break;
                case 3:
                        err = (object->*func)(  ARG32(input[0]), ARG32(input[1]), ARG32(input[2]),
-                           output, (void *)outputCount, 0 );
+                           output, (void *)outputCount, NULL );
                        break;
                case 2:
                        err = (object->*func)(  ARG32(input[0]), ARG32(input[1]),
-                           output, (void *)outputCount, 0, 0 );
+                           output, (void *)outputCount, NULL, NULL );
                        break;
                case 1:
                        err = (object->*func)(  ARG32(input[0]),
-                           output, (void *)outputCount, 0, 0, 0 );
+                           output, (void *)outputCount, NULL, NULL, NULL );
                        break;
                case 0:
-                       err = (object->*func)(  output, (void *)outputCount, 0, 0, 0, 0 );
+                       err = (object->*func)(  output, (void *)outputCount, NULL, NULL, NULL, NULL );
                        break;
 
                default:
@@ -4929,21 +5080,21 @@ shim_io_async_method_scalarI_structureO(
                case 3:
                        err = (object->*func)(  reference,
                            ARG32(input[0]), ARG32(input[1]), ARG32(input[2]),
-                           output, (void *)outputCount, 0 );
+                           output, (void *)outputCount, NULL );
                        break;
                case 2:
                        err = (object->*func)(  reference,
                            ARG32(input[0]), ARG32(input[1]),
-                           output, (void *)outputCount, 0, 0 );
+                           output, (void *)outputCount, NULL, NULL );
                        break;
                case 1:
                        err = (object->*func)(  reference,
                            ARG32(input[0]),
-                           output, (void *)outputCount, 0, 0, 0 );
+                           output, (void *)outputCount, NULL, NULL, NULL );
                        break;
                case 0:
                        err = (object->*func)(  reference,
-                           output, (void *)outputCount, 0, 0, 0, 0 );
+                           output, (void *)outputCount, NULL, NULL, NULL, NULL );
                        break;
 
                default:
@@ -5025,21 +5176,21 @@ shim_io_connect_method_scalarI_structureI(
                case 3:
                        err = (object->*func)( ARG32(input[0]), ARG32(input[1]), ARG32(input[2]),
                            inputStruct, (void *)(uintptr_t)inputStructCount,
-                           0 );
+                           NULL );
                        break;
                case 2:
                        err = (object->*func)( ARG32(input[0]), ARG32(input[1]),
                            inputStruct, (void *)(uintptr_t)inputStructCount,
-                           0, 0 );
+                           NULL, NULL );
                        break;
                case 1:
                        err = (object->*func)( ARG32(input[0]),
                            inputStruct, (void *)(uintptr_t)inputStructCount,
-                           0, 0, 0 );
+                           NULL, NULL, NULL );
                        break;
                case 0:
                        err = (object->*func)( inputStruct, (void *)(uintptr_t)inputStructCount,
-                           0, 0, 0, 0 );
+                           NULL, NULL, NULL, NULL );
                        break;
 
                default:
@@ -5103,24 +5254,24 @@ shim_io_async_method_scalarI_structureI(
                        err = (object->*func)(  reference,
                            ARG32(input[0]), ARG32(input[1]), ARG32(input[2]),
                            inputStruct, (void *)(uintptr_t)inputStructCount,
-                           0 );
+                           NULL );
                        break;
                case 2:
                        err = (object->*func)(  reference,
                            ARG32(input[0]), ARG32(input[1]),
                            inputStruct, (void *)(uintptr_t)inputStructCount,
-                           0, 0 );
+                           NULL, NULL );
                        break;
                case 1:
                        err = (object->*func)(  reference,
                            ARG32(input[0]),
                            inputStruct, (void *)(uintptr_t)inputStructCount,
-                           0, 0, 0 );
+                           NULL, NULL, NULL );
                        break;
                case 0:
                        err = (object->*func)(  reference,
                            inputStruct, (void *)(uintptr_t)inputStructCount,
-                           0, 0, 0, 0 );
+                           NULL, NULL, NULL, NULL );
                        break;
 
                default:
@@ -5184,12 +5335,12 @@ shim_io_connect_method_structureI_structureO(
                if (method->count1) {
                        if (method->count0) {
                                err = (object->*func)( input, output,
-                                   (void *)(uintptr_t)inputCount, outputCount, 0, 0 );
+                                   (void *)(uintptr_t)inputCount, outputCount, NULL, NULL );
                        } else {
-                               err = (object->*func)( output, outputCount, 0, 0, 0, 0 );
+                               err = (object->*func)( output, outputCount, NULL, NULL, NULL, NULL );
                        }
                } else {
-                       err = (object->*func)( input, (void *)(uintptr_t)inputCount, 0, 0, 0, 0 );
+                       err = (object->*func)( input, (void *)(uintptr_t)inputCount, NULL, NULL, NULL, NULL );
                }
        }while (false);
 
@@ -5239,14 +5390,14 @@ shim_io_async_method_structureI_structureO(
                        if (method->count0) {
                                err = (object->*func)( reference,
                                    input, output,
-                                   (void *)(uintptr_t)inputCount, outputCount, 0, 0 );
+                                   (void *)(uintptr_t)inputCount, outputCount, NULL, NULL );
                        } else {
                                err = (object->*func)( reference,
-                                   output, outputCount, 0, 0, 0, 0 );
+                                   output, outputCount, NULL, NULL, NULL, NULL );
                        }
                } else {
                        err = (object->*func)( reference,
-                           input, (void *)(uintptr_t)inputCount, 0, 0, 0, 0 );
+                           input, (void *)(uintptr_t)inputCount, NULL, NULL, NULL, NULL );
                }
        }while (false);
 
@@ -5269,7 +5420,7 @@ is_io_catalog_send_data(
 #if NO_KEXTD
        return kIOReturnNotPrivileged;
 #else /* NO_KEXTD */
-       OSObject * obj = 0;
+       OSObject * obj = NULL;
        vm_offset_t data;
        kern_return_t kr = kIOReturnError;
 
@@ -5279,14 +5430,14 @@ is_io_catalog_send_data(
                return kIOReturnNotPrivileged;
        }
 
-       if ((flag != kIOCatalogRemoveKernelLinker &&
+       if ((flag != kIOCatalogRemoveKernelLinker__Removed &&
            flag != kIOCatalogKextdActive &&
            flag != kIOCatalogKextdFinishedLaunching) &&
            (!inData || !inDataCount)) {
                return kIOReturnBadArgument;
        }
 
-       if (!IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-secure-management")) {
+       if (!IOTaskHasEntitlement(current_task(), kOSKextManagementEntitlement)) {
                OSString * taskName = IOCopyLogNameForPID(proc_selfpid());
                IOLog("IOCatalogueSendData(%s): Not entitled\n", taskName ? taskName->getCStringNoCopy() : "");
                OSSafeReleaseNULL(taskName);
@@ -5370,21 +5521,8 @@ is_io_catalog_send_data(
        }
        break;
 
-       case kIOCatalogStartMatching: {
-               OSDictionary * dict;
-
-               dict = OSDynamicCast(OSDictionary, obj);
-               if (dict) {
-                       if (!gIOCatalogue->startMatching( dict )) {
-                               kr = kIOReturnError;
-                       }
-               } else {
-                       kr = kIOReturnBadArgument;
-               }
-       }
-       break;
-
-       case kIOCatalogRemoveKernelLinker:
+       case kIOCatalogStartMatching__Removed:
+       case kIOCatalogRemoveKernelLinker__Removed:
                kr = KERN_NOT_SUPPORTED;
                break;
 
@@ -5404,12 +5542,8 @@ is_io_catalog_send_data(
        case kIOCatalogKextdFinishedLaunching: {
 #if !NO_KEXTD
                if (!gIOKextdClearedBusy) {
-                       IOService * serviceRoot = IOService::getServiceRoot();
-                       if (serviceRoot) {
-                               IOServiceTrace(IOSERVICE_KEXTD_READY, 0, 0, 0, 0);
-                               serviceRoot->adjustBusy(-1);
-                               gIOKextdClearedBusy = true;
-                       }
+                       IOService::kextdLaunched();
+                       gIOKextdClearedBusy = true;
                }
 #endif
                kr = kIOReturnSuccess;
@@ -5604,11 +5738,19 @@ is_io_catalog_reset(
 kern_return_t
 iokit_user_client_trap(struct iokit_user_client_trap_args *args)
 {
-       kern_return_t result = kIOReturnBadArgument;
-       IOUserClient *userClient;
+       kern_return_t  result = kIOReturnBadArgument;
+       IOUserClient * userClient;
+       OSObject     * object;
+       uintptr_t      ref;
 
-       if ((userClient = OSDynamicCast(IOUserClient,
-           iokit_lookup_connect_ref_current_task((mach_port_name_t)(uintptr_t)args->userClientRef)))) {
+       ref = (uintptr_t) args->userClientRef;
+       if ((1ULL << 32) & ref) {
+               object = iokit_lookup_uext_ref_current_task((mach_port_name_t) ref);
+               if (object) {
+                       result = IOUserServerUEXTTrap(object, args->p1, args->p2, args->p3, args->p4, args->p5, args->p6);
+               }
+               OSSafeReleaseNULL(object);
+       } else if ((userClient = OSDynamicCast(IOUserClient, iokit_lookup_connect_ref_current_task((mach_port_name_t) ref)))) {
                IOExternalTrap *trap;
                IOService *target = NULL;
 
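
iokit_user_client_trap() now multiplexes two name spaces: user space sets bit 32 of the client ref to mark a DriverKit (UEXT) object reference, which is resolved with iokit_lookup_uext_ref_current_task() and routed to IOUserServerUEXTTrap(); plain refs keep the classic IOUserClient path. The dispatch, reduced:

    // The dispatch added above: bit 32 of the user-supplied ref selects
    // the DriverKit (UEXT) path without needing a new trap number.
    uintptr_t ref = (uintptr_t) args->userClientRef;

    if (ref & (1ULL << 32)) {
        // DriverKit object: IOUserServerUEXTTrap(object, p1..p6).
    } else {
        // Classic connection: look up the IOUserClient as before.
    }
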
@@ -5704,7 +5846,7 @@ IOUserClient::externalMethod( uint32_t selector, IOExternalMethodArguments * arg
 
        if (args->asyncWakePort) {
                IOExternalAsyncMethod * method;
-               object = 0;
+               object = NULL;
                if (!(method = getAsyncTargetAndMethodForIndex(&object, selector)) || !object) {
                        return kIOReturnUnsupported;
                }
@@ -5751,7 +5893,7 @@ IOUserClient::externalMethod( uint32_t selector, IOExternalMethodArguments * arg
                }
        } else {
                IOExternalMethod *      method;
-               object = 0;
+               object = NULL;
                if (!(method = getTargetAndMethodForIndex(&object, selector)) || !object) {
                        return kIOReturnUnsupported;
                }
diff --git a/iokit/Kernel/IOUserServer.cpp b/iokit/Kernel/IOUserServer.cpp
new file mode 100644 (file)
index 0000000..4ad8eb5
--- /dev/null
@@ -0,0 +1,3462 @@
+/*
+ * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <IOKit/IORPC.h>
+#include <IOKit/IOKitServer.h>
+#include <IOKit/IOKitKeysPrivate.h>
+#include <IOKit/IOUserClient.h>
+#include <IOKit/IOService.h>
+#include <IOKit/IORegistryEntry.h>
+#include <IOKit/IOCatalogue.h>
+#include <IOKit/IOMemoryDescriptor.h>
+#include <IOKit/IOBufferMemoryDescriptor.h>
+#include <IOKit/IOLib.h>
+#include <IOKit/IOBSD.h>
+#include <IOKit/system.h>
+#include <IOKit/IOUserServer.h>
+#include <IOKit/IOInterruptEventSource.h>
+#include <IOKit/IOTimerEventSource.h>
+#include <IOKit/pwr_mgt/RootDomain.h>
+#include <libkern/c++/OSKext.h>
+#include <libkern/OSDebug.h>
+#include <libkern/Block.h>
+#include <sys/proc.h>
+#include "IOKitKernelInternal.h"
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+#include <DriverKit/IODispatchQueue.h>
+#include <DriverKit/OSObject.h>
+#include <DriverKit/OSAction.h>
+#include <DriverKit/IODispatchSource.h>
+#include <DriverKit/IOInterruptDispatchSource.h>
+#include <DriverKit/IOService.h>
+#include <DriverKit/IOMemoryDescriptor.h>
+#include <DriverKit/IOBufferMemoryDescriptor.h>
+#include <DriverKit/IOMemoryMap.h>
+#include <DriverKit/IODataQueueDispatchSource.h>
+#include <DriverKit/IOUserServer.h>
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+#include <System/IODataQueueDispatchSourceShared.h>
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+SInt64    gIODKDebug = kIODKEnable;
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+struct IOPStrings;
+
+class OSUserMetaClass : public OSObject
+{
+       OSDeclareDefaultStructors(OSUserMetaClass);
+public:
+       const OSSymbol    * name;
+       const OSMetaClass * meta;
+       OSUserMetaClass   * superMeta;
+
+       queue_chain_t       link;
+
+       OSClassDescription * description;
+       IOPStrings * queueNames;
+       uint32_t     methodCount;
+       uint64_t   * methods;
+
+       virtual void free() override;
+       virtual kern_return_t Dispatch(const IORPC rpc) APPLE_KEXT_OVERRIDE;
+};
+OSDefineMetaClassAndStructors(OSUserMetaClass, OSObject);
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+class IOUserService : public IOService
+{
+       friend class IOService;
+
+       OSDeclareDefaultStructors(IOUserService)
+
+       virtual bool
+       start(IOService * provider) APPLE_KEXT_OVERRIDE;
+       virtual IOReturn
+       setProperties(OSObject * props) APPLE_KEXT_OVERRIDE;
+};
+
+OSDefineMetaClassAndStructors(IOUserService, IOService)
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+class IOUserUserClient : public IOUserClient
+{
+       OSDeclareDefaultStructors(IOUserUserClient);
+public:
+       task_t          fTask;
+
+       IOReturn                   setTask(task_t task);
+       virtual void           stop(IOService * provider) APPLE_KEXT_OVERRIDE;
+       virtual IOReturn       clientClose(void) APPLE_KEXT_OVERRIDE;
+       virtual IOReturn       setProperties(OSObject * properties) APPLE_KEXT_OVERRIDE;
+       virtual IOReturn       externalMethod(uint32_t selector, IOExternalMethodArguments * args,
+           IOExternalMethodDispatch * dispatch, OSObject * target, void * reference) APPLE_KEXT_OVERRIDE;
+       virtual IOReturn           clientMemoryForType(UInt32 type,
+           IOOptionBits * options,
+           IOMemoryDescriptor ** memory) APPLE_KEXT_OVERRIDE;
+};
+
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+
+bool
+IOUserService::start(IOService * provider)
+{
+       bool     ok = true;
+       IOReturn ret;
+
+       ret = Start(provider);
+       if (kIOReturnSuccess != ret) {
+               return false;
+       }
+
+       return ok;
+}
+
+IOReturn
+IOUserService::setProperties(OSObject * properties)
+{
+       setProperty("USER", properties);
+       return kIOReturnSuccess;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+#undef super
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+struct IODispatchQueue_IVars {
+       IOUserServer * userServer;
+       IODispatchQueue   * queue;
+       queue_chain_t  link;
+       uint64_t       tid;
+
+       mach_port_t    serverPort;
+};
+
+struct OSAction_IVars {
+       OSObject             * target;
+       uint64_t               targetmsgid;
+       uint64_t               msgid;
+       OSActionAbortedHandler abortedHandler;
+       size_t                 referenceSize;
+       void                 * reference[0];
+};
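+
+/*
+ * The zero-length `reference[0]` member makes OSAction_IVars a
+ * variable-length record: each action's opaque reference bytes are
+ * co-allocated immediately after the fixed fields.  A minimal sketch of
+ * the sizing arithmetic (mirroring OSAction::Create below; the variable
+ * names are local to this example):
+ *
+ *   vm_size_t allocsize;
+ *   if (os_add_overflow(referenceSize, sizeof(OSAction_IVars), &allocsize)) {
+ *       return kIOReturnBadArgument;   // reject overflowing sizes
+ *   }
+ *   ivars = (OSAction_IVars *)(uintptr_t) IONewZero(uint8_t, allocsize);
+ *   // GetReference() later hands out &ivars->reference[0]
+ */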
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+kern_return_t
+IMPL(IOService, GetRegistryEntryID)
+{
+       IOReturn ret = kIOReturnSuccess;
+
+       *registryEntryID = getRegistryEntryID();
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IOService, SetName)
+{
+       IOReturn ret = kIOReturnSuccess;
+
+       setName(name);
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IOService, Start)
+{
+       IOReturn ret = kIOReturnSuccess;
+       return ret;
+}
+
+kern_return_t
+IMPL(IOService, RegisterService)
+{
+       IOReturn ret = kIOReturnSuccess;
+
+       registerService();
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IOService, CopyDispatchQueue)
+{
+       IODispatchQueue * result;
+       IOService  * service;
+       IOReturn     ret;
+       uint32_t index;
+
+       ret = kIOReturnNotFound;
+       index = -1U;
+       if (!strcmp("Default", name)) {
+               index = 0;
+       } else if (reserved->uvars->userMeta
+           && reserved->uvars->userMeta->queueNames) {
+               index = reserved->uvars->userServer->stringArrayIndex(reserved->uvars->userMeta->queueNames, name);
+               if (index != -1U) {
+                       index++;
+               }
+       }
+       if (index == -1U) {
+               if ((service = getProvider())) {
+                       ret = service->CopyDispatchQueue(name, queue);
+               }
+       } else {
+               result = reserved->uvars->queueArray[index];
+               if (result) {
+                       result->retain();
+                       *queue = result;
+                       ret = kIOReturnSuccess;
+               }
+       }
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IOService, SetDispatchQueue)
+{
+       IOReturn ret = kIOReturnSuccess;
+       uint32_t index;
+
+       if (kIODKLogSetup & gIODKDebug) {
+               DKLOG(DKS "::SetDispatchQueue(%s)\n", DKN(this), name);
+       }
+       queue->ivars->userServer = reserved->uvars->userServer;
+       index = -1U;
+       if (!strcmp("Default", name)) {
+               index = 0;
+       } else if (reserved->uvars->userMeta
+           && reserved->uvars->userMeta->queueNames) {
+               index = reserved->uvars->userServer->stringArrayIndex(reserved->uvars->userMeta->queueNames, name);
+               if (index != -1U) {
+                       index++;
+               }
+       }
+       if (index == -1U) {
+               ret = kIOReturnBadArgument;
+       } else {
+               reserved->uvars->queueArray[index] = queue;
+               queue->retain();
+       }
+
+       return ret;
+}
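+
+/*
+ * Queue slot layout shared by CopyDispatchQueue and SetDispatchQueue
+ * above: slot 0 always holds the "Default" queue, and queues named in
+ * the class's queueNames array occupy slots 1..count.  With a
+ * hypothetical dext class declaring queue names { "Intr", "Aux" }:
+ *
+ *   queueArray[0] -> "Default"
+ *   queueArray[1] -> "Intr"  (stringArrayIndex() == 0, then +1)
+ *   queueArray[2] -> "Aux"   (stringArrayIndex() == 1, then +1)
+ */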
+
+kern_return_t
+IMPL(IOService, SetProperties)
+{
+       IOReturn ret = kIOReturnUnsupported;
+
+       ret = setProperties(properties);
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IOService, CopyProperties)
+{
+       IOReturn ret = kIOReturnSuccess;
+       *properties = dictionaryWithProperties();
+       return ret;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+kern_return_t
+IMPL(IOMemoryDescriptor, _CopyState)
+{
+       IOReturn ret;
+
+       state->length = _length;
+       state->options = _flags;
+
+       ret = kIOReturnSuccess;
+
+       return ret;
+}
+
+kern_return_t
+IOMemoryDescriptor::GetLength(uint64_t * returnLength)
+{
+       *returnLength = getLength();
+
+       return kIOReturnSuccess;
+}
+
+kern_return_t
+IMPL(IOMemoryDescriptor, CreateMapping)
+{
+       IOReturn          ret;
+       IOMemoryMap     * resultMap;
+       IOOptionBits      koptions;
+       mach_vm_address_t atAddress;
+
+       ret       = kIOReturnSuccess;
+       koptions  = 0;
+       resultMap = NULL;
+
+       if (kIOMemoryMapFixedAddress & options) {
+               atAddress   = address;
+               koptions    = 0;
+       } else {
+               atAddress   = 0;
+               koptions   |= kIOMapAnywhere;
+       }
+
+       if (kIOMemoryMapReadOnly & options || (kIODirectionOut == getDirection())) {
+               if (!reserved || (current_task() != reserved->creator)) {
+                       koptions   |= kIOMapReadOnly;
+               }
+       }
+
+       switch (0xFF00 & options) {
+       case kIOMemoryMapCacheModeDefault:
+               koptions |= kIOMapDefaultCache;
+               break;
+       case kIOMemoryMapCacheModeInhibit:
+               koptions |= kIOMapInhibitCache;
+               break;
+       case kIOMemoryMapCacheModeCopyback:
+               koptions |= kIOMapCopybackCache;
+               break;
+       case kIOMemoryMapCacheModeWriteThrough:
+               koptions |= kIOMapWriteThruCache;
+               break;
+       default:
+               ret = kIOReturnBadArgument;
+       }
+
+       if (kIOReturnSuccess == ret) {
+               resultMap = createMappingInTask(current_task(), atAddress, koptions, offset, length);
+               if (!resultMap) {
+                       ret = kIOReturnError;
+               }
+       }
+
+       *map = resultMap;
+
+       return ret;
+}
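+
+/*
+ * Cache-mode translation above, summarized (user option in the 0xFF00
+ * field -> kernel mapping flag): Default -> kIOMapDefaultCache,
+ * Inhibit -> kIOMapInhibitCache, Copyback -> kIOMapCopybackCache,
+ * WriteThrough -> kIOMapWriteThruCache; any other value is rejected
+ * with kIOReturnBadArgument.  Independently, the mapping is forced
+ * read-only when the descriptor is out-direction or the calling task
+ * is not the buffer's creator.
+ */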
+
+kern_return_t
+IMPL(IOMemoryDescriptor, PrepareForDMA)
+{
+       IOReturn    ret;
+       uint32_t    idx, count;
+       uint64_t    sumLength;
+       uint64_t    lflags;
+
+       if (!device) {
+               return kIOReturnBadArgument;
+       }
+
+       count = *segmentsCount;
+       sumLength = 0;
+       for (idx = 0; idx < count; idx++) {
+#ifdef __LP64__
+               segments[idx].address = getPhysicalSegment(offset, &segments[idx].length);
+#else
+               segments[idx].address = 0;
+#endif
+               if (!segments[idx].address) {
+                       break;
+               }
+               sumLength += segments[idx].length;
+               offset += segments[idx].length;
+       }
+       *returnLength = sumLength;
+       *segmentsCount = idx;
+
+       // translate IOKit direction bits in _flags to kIOMemoryDirection* flags
+       lflags = 0;
+       if (kIODirectionOut & _flags) {
+               lflags |= kIOMemoryDirectionOut;
+       }
+       if (kIODirectionIn  & _flags) {
+               lflags |= kIOMemoryDirectionIn;
+       }
+
+       *flags = lflags;
+       ret = kIOReturnSuccess;
+
+       return ret;
+}
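+
+/*
+ * Worked example of the segment loop above (hypothetical physical
+ * ranges): for a descriptor covering (0x1000, 0x1000) and
+ * (0x3000, 0x800) with a caller-provided capacity of 8 segments, the
+ * loop produces
+ *
+ *   segments[0] = { .address = 0x1000, .length = 0x1000 }
+ *   segments[1] = { .address = 0x3000, .length = 0x800  }
+ *   *segmentsCount = 2, *returnLength = 0x1800
+ *
+ * On non-LP64 kernels the address is forced to 0, so the loop stops
+ * immediately and no segments are returned.
+ */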
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+kern_return_t
+IMPL(IOMemoryMap, _CopyState)
+{
+       IOReturn ret;
+
+       state->offset  = fOffset;
+       state->length  = getLength();
+       state->address = getAddress();
+       state->options = getMapOptions();
+
+       ret = kIOReturnSuccess;
+
+       return ret;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+kern_return_t
+IMPL(IOBufferMemoryDescriptor, Create)
+{
+       IOReturn ret;
+       IOBufferMemoryDescriptor   * bmd;
+       IOMemoryDescriptorReserved * reserved;
+
+       if (options & ~((uint64_t) kIOMemoryDirectionOutIn)) {
+               // no other options currently defined
+               return kIOReturnBadArgument;
+       }
+       options &= kIOMemoryDirectionOutIn;
+       bmd = IOBufferMemoryDescriptor::inTaskWithOptions(
+               kernel_task, options, capacity, alignment);
+
+       *memory = bmd;
+
+       if (!bmd) {
+               return kIOReturnNoMemory;
+       }
+
+       reserved = bmd->getKernelReserved();
+       reserved->creator = current_task();
+       task_reference(reserved->creator);
+
+       ret = kIOReturnSuccess;
+
+       return ret;
+}
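+
+/*
+ * Hypothetical call site for Create (a sketch, assuming the generated
+ * DriverKit parameter order options/capacity/alignment/memory used
+ * above):
+ *
+ *   IOBufferMemoryDescriptor * buf = NULL;
+ *   kern_return_t kr = IOBufferMemoryDescriptor::Create(
+ *       kIOMemoryDirectionOutIn,  // only direction bits are accepted
+ *       16 * 1024,                // capacity in bytes
+ *       0,                        // alignment (0 for default)
+ *       &buf);
+ *
+ * The creating task is recorded in the descriptor's reserved area, so
+ * later CreateMapping calls from any other task are forced read-only.
+ */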
+
+kern_return_t
+IMPL(IOBufferMemoryDescriptor, SetLength)
+{
+       setLength(length);
+       return kIOReturnSuccess;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+kern_return_t
+OSAction::Create(OSAction_Create_Args)
+{
+       kern_return_t ret;
+       ret = OSAction::Create_Call(target, targetmsgid, msgid, referenceSize, action);
+       return ret;
+}
+
+kern_return_t
+IMPL(OSAction, Create)
+{
+       OSAction * inst;
+       vm_size_t  allocsize;
+
+       if (os_add_overflow(referenceSize, sizeof(OSAction_IVars), &allocsize)) {
+               return kIOReturnBadArgument;
+       }
+       inst = OSTypeAlloc(OSAction);
+       if (!inst) {
+               return kIOReturnNoMemory;
+       }
+       inst->ivars = (typeof(inst->ivars))(uintptr_t) IONewZero(uint8_t, allocsize);
+       if (!inst->ivars) {
+               inst->release();
+               return kIOReturnNoMemory;
+       }
+       target->retain();
+       inst->ivars->target        = target;
+       inst->ivars->targetmsgid   = targetmsgid;
+       inst->ivars->msgid         = msgid;
+       inst->ivars->referenceSize = referenceSize;
+
+       *action = inst;
+
+       return kIOReturnSuccess;
+}
+
+void
+OSAction::free()
+{
+       if (ivars) {
+               if (ivars->abortedHandler) {
+                       Block_release(ivars->abortedHandler);
+                       ivars->abortedHandler = NULL;
+               }
+               OSSafeReleaseNULL(ivars->target);
+               IOSafeDeleteNULL(ivars, uint8_t, ivars->referenceSize + sizeof(OSAction_IVars));
+       }
+       return super::free();
+}
+
+void *
+OSAction::GetReference()
+{
+       assert(ivars && ivars->referenceSize);
+       return &ivars->reference[0];
+}
+
+kern_return_t
+OSAction::SetAbortedHandler(OSActionAbortedHandler handler)
+{
+       ivars->abortedHandler = Block_copy(handler);
+       return kIOReturnSuccess;
+}
+
+void
+OSAction::Aborted_Impl(void)
+{
+       if (ivars->abortedHandler) {
+               ivars->abortedHandler();
+       }
+}
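+
+/*
+ * Handler lifecycle: SetAbortedHandler stores a Block_copy() of the
+ * handler so it outlives the caller's stack frame, and OSAction::free()
+ * balances it with Block_release().  Illustrative use only:
+ *
+ *   action->SetAbortedHandler(^{
+ *       // invoked via Aborted_Impl() when the action is aborted
+ *   });
+ */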
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+struct IODispatchSource_IVars {
+       queue_chain_t           link;
+       IODispatchSource      * source;
+       IOUserServer          * server;
+       IODispatchQueue_IVars * queue;
+       bool                    enabled;
+};
+
+bool
+IODispatchSource::init()
+{
+       if (!super::init()) {
+               return false;
+       }
+
+       ivars = IONewZero(IODispatchSource_IVars, 1);
+
+       ivars->source = this;
+
+       return true;
+}
+
+void
+IODispatchSource::free()
+{
+       IOSafeDeleteNULL(ivars, IODispatchSource_IVars, 1);
+       super::free();
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+struct IOInterruptDispatchSource_IVars {
+       IOService    * provider;
+       uint32_t       intIndex;
+       IOSimpleLock * lock;
+       thread_t       waiter;
+       uint64_t       count;
+       uint64_t       time;
+       OSAction     * action;
+       bool           enable;
+};
+
+static void
+IOInterruptDispatchSourceInterrupt(OSObject * target, void * refCon,
+    IOService * nub, int source )
+{
+       IOInterruptDispatchSource_IVars * ivars = (typeof(ivars))refCon;
+       IOInterruptState is;
+
+       is = IOSimpleLockLockDisableInterrupt(ivars->lock);
+       ivars->count++;
+       if (ivars->waiter) {
+               ivars->time = mach_absolute_time();
+               thread_wakeup_thread((event_t) ivars, ivars->waiter);
+               ivars->waiter = NULL;
+       }
+       IOSimpleLockUnlockEnableInterrupt(ivars->lock, is);
+}
+
+kern_return_t
+IMPL(IOInterruptDispatchSource, Create)
+{
+       IOReturn ret;
+       IOInterruptDispatchSource * inst;
+
+       inst = OSTypeAlloc(IOInterruptDispatchSource);
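+       // (inst is dereferenced without a NULL check: the zone
+       // allocation behind OSTypeAlloc is assumed here to succeed)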
+       if (!inst->init()) {
+               inst->free();
+               return kIOReturnNoMemory;
+       }
+
+       inst->ivars->lock = IOSimpleLockAlloc();
+
+       ret = provider->registerInterrupt(index, inst, IOInterruptDispatchSourceInterrupt, inst->ivars);
+       if (kIOReturnSuccess == ret) {
+               inst->ivars->intIndex = index;
+               inst->ivars->provider = provider;
+               *source = inst;
+       }
+       return ret;
+}
+
+bool
+IOInterruptDispatchSource::init()
+{
+       if (!super::init()) {
+               return false;
+       }
+       ivars = IONewZero(IOInterruptDispatchSource_IVars, 1);
+       if (!ivars) {
+               return false;
+       }
+
+       return true;
+}
+
+void
+IOInterruptDispatchSource::free()
+{
+       IOReturn ret;
+
+       if (ivars && ivars->provider) {
+               ret = ivars->provider->unregisterInterrupt(ivars->intIndex);
+               assert(kIOReturnSuccess == ret);
+       }
+
+       IOSafeDeleteNULL(ivars, IOInterruptDispatchSource_IVars, 1);
+
+       super::free();
+}
+
+kern_return_t
+IMPL(IOInterruptDispatchSource, SetHandler)
+{
+       IOReturn ret;
+       OSAction * oldAction;
+
+       oldAction = (typeof(oldAction))ivars->action;
+       if (oldAction && OSCompareAndSwapPtr(oldAction, NULL, &ivars->action)) {
+               oldAction->release();
+       }
+       action->retain();
+       ivars->action = action;
+
+       ret = kIOReturnSuccess;
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IOInterruptDispatchSource, SetEnableWithCompletion)
+{
+       IOReturn ret;
+       IOInterruptState is;
+
+       if (enable == ivars->enable) {
+               return kIOReturnSuccess;
+       }
+
+       if (enable) {
+               is = IOSimpleLockLockDisableInterrupt(ivars->lock);
+               ivars->enable = enable;
+               IOSimpleLockUnlockEnableInterrupt(ivars->lock, is);
+               ret = ivars->provider->enableInterrupt(ivars->intIndex);
+       } else {
+               ret = ivars->provider->disableInterrupt(ivars->intIndex);
+               is = IOSimpleLockLockDisableInterrupt(ivars->lock);
+               ivars->enable = enable;
+               IOSimpleLockUnlockEnableInterrupt(ivars->lock, is);
+       }
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IODispatchSource, SetEnable)
+{
+       return SetEnableWithCompletion(enable, NULL);
+}
+
+kern_return_t
+IMPL(IOInterruptDispatchSource, CheckForWork)
+{
+       IOReturn         ret = kIOReturnNotReady;
+       IOInterruptState is;
+       wait_result_t    waitResult;
+       uint64_t         icount;
+       uint64_t         itime;
+       thread_t         self;
+
+       self = current_thread();
+       icount = 0;
+       do {
+               is = IOSimpleLockLockDisableInterrupt(ivars->lock);
+               if ((icount = ivars->count)) {
+                       itime = ivars->time;
+                       ivars->count = 0;
+                       waitResult = THREAD_AWAKENED;
+               } else if (synchronous) {
+                       assert(NULL == ivars->waiter);
+                       ivars->waiter = self;
+                       waitResult = assert_wait((event_t) ivars, THREAD_INTERRUPTIBLE);
+               }
+               IOSimpleLockUnlockEnableInterrupt(ivars->lock, is);
+               if (synchronous && (waitResult == THREAD_WAITING)) {
+                       waitResult = thread_block(THREAD_CONTINUE_NULL);
+                       if (THREAD_INTERRUPTED == waitResult) {
+                               break;
+                       }
+               }
+       } while (synchronous && !icount);
+
+       if (icount && ivars->action) {
+               ret = InterruptOccurred(rpc, ivars->action, icount, itime);
+       }
+
+       return ret;
+}
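+
+/*
+ * Producer/consumer pairing: IOInterruptDispatchSourceInterrupt above
+ * bumps `count` and wakes any recorded `waiter` under the simple lock;
+ * CheckForWork drains `count` back to 0 and, when `synchronous`, parks
+ * with assert_wait()/thread_block() until an interrupt arrives or the
+ * thread is interrupted.  `icount` therefore reports how many
+ * interrupts were coalesced since the last delivery, and `itime` the
+ * timestamp of the most recent one.
+ */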
+
+void
+IMPL(IOInterruptDispatchSource, InterruptOccurred)
+{
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+kern_return_t
+IOUserServer::waitInterruptTrap(void * p1, void * p2, void * p3, void * p4, void * p5, void * p6)
+{
+       IOReturn         ret = kIOReturnBadArgument;
+       IOInterruptState is;
+       IOInterruptDispatchSource * interrupt;
+       IOInterruptDispatchSource_IVars * ivars;
+       IOInterruptDispatchSourcePayload payload;
+
+       wait_result_t    waitResult;
+       thread_t         self;
+
+       OSObject * object;
+
+       object = iokit_lookup_object_with_port_name((mach_port_name_t)(uintptr_t)p1, IKOT_UEXT_OBJECT, current_task());
+
+       if (!object) {
+               return kIOReturnBadArgument;
+       }
+       if (!(interrupt = OSDynamicCast(IOInterruptDispatchSource, object))) {
+               ret = kIOReturnBadArgument;
+       } else {
+               self = current_thread();
+               ivars = interrupt->ivars;
+               payload.count = 0;
+               do {
+                       is = IOSimpleLockLockDisableInterrupt(ivars->lock);
+                       if ((payload.count = ivars->count)) {
+                               payload.time = ivars->time;
+                               ivars->count = 0;
+                               waitResult = THREAD_AWAKENED;
+                       } else {
+                               assert(NULL == ivars->waiter);
+                               ivars->waiter = self;
+                               waitResult = assert_wait((event_t) ivars, THREAD_INTERRUPTIBLE);
+                       }
+                       IOSimpleLockUnlockEnableInterrupt(ivars->lock, is);
+                       if (waitResult == THREAD_WAITING) {
+                               waitResult = thread_block(THREAD_CONTINUE_NULL);
+                               if (THREAD_INTERRUPTED == waitResult) {
+                                       break;
+                               }
+                       }
+               } while (!payload.count);
+               ret = (payload.count ? kIOReturnSuccess : kIOReturnAborted);
+       }
+
+       if (kIOReturnSuccess == ret) {
+               int copyerr = copyout(&payload, (user_addr_t) p2, sizeof(payload));
+               if (copyerr) {
+                       ret = kIOReturnVMError;
+               }
+       }
+
+       object->release();
+
+       return ret;
+}
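+
+/*
+ * Argument convention for the trap above: p1 carries the mach port
+ * name of an IOInterruptDispatchSource send right, and p2 a user-space
+ * buffer sized for an IOInterruptDispatchSourcePayload; on success the
+ * coalesced interrupt count and the last delivery timestamp are copied
+ * out to p2.  p3 through p6 are unused here.
+ */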
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+kern_return_t
+IMPL(IOUserServer, Create)
+{
+       IOReturn          ret;
+       IOUserServer    * us;
+       const OSSymbol  * sym;
+       OSNumber        * serverTag;
+       io_name_t         rname;
+
+       us = (typeof(us))thread_iokit_tls_get(0);
+       assert(OSDynamicCast(IOUserServer, us));
+       if (kIODKLogSetup & gIODKDebug) {
+               DKLOG(DKS "::Create(" DKS ") %p\n", DKN(us), name, tag, us);
+       }
+       if (!us) {
+               return kIOReturnError;
+       }
+
+       sym       = OSSymbol::withCString(name);
+       serverTag = OSNumber::withNumber(tag, 64);
+
+       us->setProperty(gIOUserServerNameKey, (OSObject *) sym);
+       us->setProperty(gIOUserServerTagKey, serverTag);
+
+       serverTag->release();
+       OSSafeReleaseNULL(sym);
+
+       snprintf(rname, sizeof(rname), "IOUserServer(%s-0x%qx)", name, tag);
+       us->setName(rname);
+
+       us->retain();
+       *server = us;
+       ret = kIOReturnSuccess;
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IOUserServer, Exit)
+{
+       return kIOReturnUnsupported;
+}
+
+kern_return_t
+IMPL(IOUserServer, LoadModule)
+{
+       return kIOReturnUnsupported;
+}
+
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+kern_return_t
+IMPL(IODispatchQueue, Create)
+{
+       IODispatchQueue * result;
+       IOUserServer    * us;
+
+       result = OSTypeAlloc(IODispatchQueue);
+       if (!result) {
+               return kIOReturnNoMemory;
+       }
+       if (!result->init()) {
+               return kIOReturnNoMemory;
+       }
+
+       *queue = result;
+
+       if (!strcmp("Root", name)) {
+               us = (typeof(us))thread_iokit_tls_get(0);
+               assert(OSDynamicCast(IOUserServer, us));
+               us->setRootQueue(result);
+       }
+
+       if (kIODKLogSetup & gIODKDebug) {
+               DKLOG("IODispatchQueue::Create %s %p\n", name, result);
+       }
+
+       return kIOReturnSuccess;
+}
+
+kern_return_t
+IMPL(IODispatchQueue, SetPort)
+{
+       ivars->serverPort = port;
+       return kIOReturnSuccess;
+}
+
+bool
+IODispatchQueue::init()
+{
+       ivars = IONewZero(IODispatchQueue_IVars, 1);
+       if (!ivars) {
+               return false;
+       }
+       ivars->queue = this;
+
+       return true;
+}
+
+void
+IODispatchQueue::free()
+{
+       IOSafeDeleteNULL(ivars, IODispatchQueue_IVars, 1);
+       super::free();
+}
+
+bool
+IODispatchQueue::OnQueue()
+{
+       return false;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+
+kern_return_t
+OSMetaClassBase::Dispatch(IORPC rpc)
+{
+       return kIOReturnUnsupported;
+}
+
+kern_return_t
+OSMetaClassBase::Invoke(IORPC rpc)
+{
+       IOReturn          ret = kIOReturnUnsupported;
+       OSMetaClassBase * object;
+       OSAction        * action;
+       IOService       * service;
+       IOUserServer    * us;
+       IORPCMessage    * message;
+
+       assert(rpc.sendSize >= (sizeof(IORPCMessageMach) + sizeof(IORPCMessage)));
+       message = IORPCMessageFromMach(rpc.message, false);
+       if (!message) {
+               return kIOReturnIPCError;
+       }
+       message->flags |= kIORPCMessageKernel;
+
+       us = NULL;
+       if (!(kIORPCMessageLocalHost & message->flags)) {
+               us = OSDynamicCast(IOUserServer, this);
+               if (!us) {
+                       if ((action = OSDynamicCast(OSAction, this))) {
+                               object = IOUserServer::target(action, message);
+                       } else {
+                               object = this;
+                       }
+                       if ((service = OSDynamicCast(IOService, object))
+                           && service->reserved->uvars) {
+                               // xxx other classes
+                               us = service->reserved->uvars->userServer;
+                       }
+               }
+       }
+       if (us) {
+               message->flags |= kIORPCMessageRemote;
+               ret = us->rpc(rpc);
+               if (kIOReturnSuccess != ret) {
+                       if (kIODKLogIPC & gIODKDebug) {
+                               DKLOG("OSMetaClassBase::Invoke user 0x%x\n", ret);
+                       }
+               }
+       } else {
+               if (kIODKLogIPC & gIODKDebug) {
+                       DKLOG("OSMetaClassBase::Invoke kernel %s 0x%qx\n", getMetaClass()->getClassName(), message->msgid);
+               }
+               ret = Dispatch(rpc);
+       }
+
+       return ret;
+}
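+
+/*
+ * Routing summary for Invoke(): a message whose target is hosted in
+ * the kernel (no user server found, or kIORPCMessageLocalHost set)
+ * goes straight to Dispatch(); otherwise it is tagged
+ * kIORPCMessageRemote and forwarded via the owning IOUserServer's
+ * rpc().  For an OSAction, the action's target object is resolved
+ * first so the user server can be located from the target's uvars.
+ */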
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+struct IOPStrings {
+       uint32_t     dataSize;
+       uint32_t     count;
+       const char   strings[0];
+};
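+
+/*
+ * IOPStrings packs an array of length-prefixed (Pascal-style) strings
+ * rather than NUL-terminated ones.  A hypothetical two-entry array
+ * { "Intr", "Aux" } is laid out as
+ *
+ *   strings[] = { 4,'I','n','t','r', 3,'A','u','x', 0 }
+ *   dataSize  = 10, count = 2
+ *
+ * with a zero length byte terminating the list.  copyInStringArray()
+ * below validates this framing after copying from user space, and
+ * stringArrayIndex() walks it the same way.
+ */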
+
+kern_return_t
+OSUserMetaClass::Dispatch(IORPC rpc)
+{
+       return const_cast<OSMetaClass *>(meta)->Dispatch(rpc);
+}
+
+void
+OSUserMetaClass::free()
+{
+       if (queueNames) {
+               IOFree(queueNames, sizeof(IOPStrings) + queueNames->dataSize * sizeof(char));
+               queueNames = NULL;
+       }
+       if (description) {
+               IOFree(description, description->descriptionSize);
+               description = NULL;
+       }
+       IOSafeDeleteNULL(methods, uint64_t, 2 * methodCount);
+       if (meta) {
+               meta->releaseMetaClass();
+       }
+       if (name) {
+               name->release();
+       }
+       OSObject::free();
+}
+
+/*
+ * Sets the loadTag of the associated OSKext
+ * in the dext task.
+ * NOTE: different instances of the same OSKext
+ * (i.e. the same BundleID loaded in different tasks)
+ * will have the same loadTag.
+ */
+void
+IOUserServer::setTaskLoadTag(OSKext *kext)
+{
+       task_t owningTask;
+       uint32_t loadTag, prev_taskloadTag;
+
+       owningTask = this->fOwningTask;
+       if (!owningTask) {
+               printf("%s: fOwningTask not found\n", __FUNCTION__);
+               return;
+       }
+
+       loadTag = kext->getLoadTag();
+       prev_taskloadTag = set_task_loadTag(owningTask, loadTag);
+       if (prev_taskloadTag) {
+               printf("%s: found the task loadTag already set to %u (set to %u)\n",
+                   __FUNCTION__, prev_taskloadTag, loadTag);
+       }
+}
+
+/*
+ * Sets the OSKext UUID to the UUID of the userspace
+ * dext executable.
+ */
+void
+IOUserServer::setDriverKitUUID(OSKext *kext)
+{
+       task_t task;
+       proc_t p;
+       uuid_t p_uuid, k_uuid;
+       OSData *k_data_uuid;
+       OSData *new_uuid;
+       uuid_string_t       uuid_string = "";
+
+       task = this->fOwningTask;
+       if (!task) {
+               printf("%s: fOwningTask not found\n", __FUNCTION__);
+               return;
+       }
+
+       p = (proc_t)(get_bsdtask_info(task));
+       if (!p) {
+               printf("%s: proc not found\n", __FUNCTION__);
+               return;
+       }
+       proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
+
+       k_data_uuid = kext->copyUUID();
+       if (k_data_uuid) {
+               memcpy(&k_uuid, k_data_uuid->getBytesNoCopy(), sizeof(k_uuid));
+               OSSafeReleaseNULL(k_data_uuid);
+               if (uuid_compare(k_uuid, p_uuid) != 0) {
+                       printf("%s: uuid not matching\n", __FUNCTION__);
+               }
+               return;
+       }
+
+       uuid_unparse(p_uuid, uuid_string);
+       new_uuid = OSData::withBytes(p_uuid, sizeof(p_uuid));
+       kext->setDriverKitUUID(new_uuid);
+}
+
+bool
+IOUserServer::serviceMatchesCDHash(IOService *service)
+{
+       OSObject   *obj               = NULL;
+       bool        result            = false;
+       OSString   *requiredCDHashStr = NULL;
+       const char *requiredCDHash    = NULL;
+       char        taskCDHash[CS_CDHASH_LEN];
+
+       task_t owningTask = this->fOwningTask;
+       if (!owningTask) {
+               printf("%s: fOwningTask not found\n", __FUNCTION__);
+               goto out;
+       }
+
+       obj = service->copyProperty(gIOUserServerCDHashKey);
+       requiredCDHashStr = OSDynamicCast(OSString, obj);
+       if (!requiredCDHashStr) {
+               printf("%s: required cdhash not found as property of personality\n", __FUNCTION__);
+               goto out;
+       }
+
+       requiredCDHash = requiredCDHashStr->getCStringNoCopy();
+       if (!requiredCDHash) {
+               printf("%s: required cdhash unable to be read as string\n", __FUNCTION__);
+               goto out;
+       }
+
+       if (strlen(requiredCDHash) != CS_CDHASH_LEN * 2) {
+               printf("%s: required cdhash string has incorrect length\n", __FUNCTION__);
+               goto out;
+       }
+
+       get_task_cdhash(owningTask, taskCDHash);
+       for (int i = 0; i < (int)CS_CDHASH_LEN * 2; i++) {
+               uint8_t which  = (i + 1) & 0x1; /* 1 for upper nibble, 0 for lower */
+               uint8_t nibble = requiredCDHash[i];
+               uint8_t byte   = taskCDHash[i / 2];
+               if ('0' <= nibble && nibble <= '9') {
+                       nibble -= '0';
+               } else if ('a' <= nibble && nibble <= 'f') {
+                       nibble -= 'a' - 10;
+               } else if ('A' <= nibble && nibble <= 'F') {
+                       nibble -= 'A' - 10;
+               } else {
+                       printf("%s: required cdhash contains invalid token '%c'\n", __FUNCTION__, nibble);
+                       goto out;
+               }
+
+               /*
+                * Decide which half of the byte to compare
+                */
+               if (nibble != (which ? (byte >> 4) : (byte & 0x0f))) {
+                       printf("%s: required cdhash %s in personality does not match service\n", __FUNCTION__, requiredCDHash);
+                       goto out;
+               }
+       }
+
+       result = true;
+out:
+       OSSafeReleaseNULL(obj);
+       return result;
+}
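+
+/*
+ * Worked example of the nibble check above: required cdhash text
+ * beginning "ab12..." is compared against task cdhash bytes
+ * { 0xab, 0x12, ... }.  At i == 0, which == 1 selects the upper
+ * nibble of byte 0 (0xa); at i == 1, which == 0 selects the lower
+ * nibble (0xb); and so on across all CS_CDHASH_LEN * 2 hex digits.
+ */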
+
+bool
+IOUserServer::checkEntitlements(
+       OSDictionary * entitlements, OSObject * prop,
+       IOService * provider, IOService * dext)
+{
+       OSDictionary * matching;
+
+       if (!prop) {
+               return true;
+       }
+       if (!entitlements) {
+               return false;
+       }
+
+       matching = NULL;
+       if (dext) {
+               matching = dext->dictionaryWithProperties();
+               if (!matching) {
+                       return false;
+               }
+       }
+
+       bool allPresent __block;
+       prop->iterateObjects(^bool (OSObject * object) {
+               allPresent = false;
+               object->iterateObjects(^bool (OSObject * object) {
+                       OSString * string;
+                       OSObject * value;
+                       string = OSDynamicCast(OSString, object);
+                       value = entitlements->getObject(string);
+                       if (matching && value) {
+                               matching->setObject(string, value);
+                       }
+                       allPresent = (NULL != value);
+                       return !allPresent;
+               });
+               return allPresent;
+       });
+
+       if (allPresent && matching && provider) {
+               allPresent = provider->matchPropertyTable(matching);
+       }
+
+       OSSafeReleaseNULL(matching);
+       OSSafeReleaseNULL(prop);
+
+       return allPresent;
+}
+
+bool
+IOUserServer::checkEntitlements(IOService * provider, IOService * dext)
+{
+       OSObject     * prop;
+       bool           ok;
+
+       if (!fOwningTask) {
+               return false;
+       }
+
+       prop = provider->copyProperty(gIOServiceDEXTEntitlementsKey);
+       ok = checkEntitlements(fEntitlements, prop, provider, dext);
+       if (!ok) {
+               DKLOG(DKS ": provider entitlements check failed\n", DKN(dext));
+       }
+       if (ok) {
+               prop = dext->copyProperty(gIOServiceDEXTEntitlementsKey);
+               ok = checkEntitlements(fEntitlements, prop, NULL, NULL);
+               if (!ok) {
+                       DKLOG(DKS ": family entitlements check failed\n", DKN(dext));
+               }
+       }
+
+       return ok;
+}
+
+IOReturn
+IOUserServer::exit(const char * reason)
+{
+       DKLOG("%s::exit(%s)\n", getName(), reason);
+       Exit(reason);
+       return kIOReturnSuccess;
+}
+
+OSObjectUserVars *
+IOUserServer::varsForObject(OSObject * obj)
+{
+       IOService * service;
+
+       if ((service = OSDynamicCast(IOService, obj))) {
+               return service->reserved->uvars;
+       }
+
+       return NULL;
+}
+
+IOPStrings *
+IOUserServer::copyInStringArray(const char * string, uint32_t userSize)
+{
+       IOPStrings * array;
+       vm_size_t    alloc;
+       size_t       len;
+       const char * cstr;
+       const char * end;
+
+       if (userSize <= 1) {
+               return NULL;
+       }
+
+       if (os_add_overflow(sizeof(IOPStrings), userSize, &alloc)) {
+               assert(false);
+               return NULL;
+       }
+       if (alloc > 16384) {
+               assert(false);
+               return NULL;
+       }
+       array = (typeof(array))IOMalloc(alloc);
+       if (!array) {
+               return NULL;
+       }
+       array->dataSize = userSize;
+       bcopy(string, (void *) &array->strings[0], userSize);
+
+       array->count = 0;
+       cstr = &array->strings[0];
+       end =  &array->strings[array->dataSize];
+       while ((len = cstr[0])) {
+               cstr++;
+               if ((cstr + len) >= end) {
+                       break;
+               }
+               cstr += len;
+               array->count++;
+       }
+       if (len) {
+               IOFree(array, alloc);
+               array = NULL;
+       }
+
+       return array;
+}
+
+uint32_t
+IOUserServer::stringArrayIndex(IOPStrings * array, const char * look)
+{
+       uint32_t     idx;
+       size_t       len, llen;
+       const char * cstr;
+       const char * end;
+
+       idx  = 0;
+       cstr = &array->strings[0];
+       end  =  &array->strings[array->dataSize];
+       llen = strlen(look);
+       while ((len = cstr[0])) {
+               cstr++;
+               if ((cstr + len) >= end) {
+                       break;
+               }
+               if ((len == llen) && !strncmp(cstr, look, len)) {
+                       return idx;
+               }
+               cstr += len;
+               idx++;
+       }
+
+       return -1U;
+}
+#define kIODispatchQueueStopped ((IODispatchQueue *) -1L)
+
+IODispatchQueue *
+IOUserServer::queueForObject(OSObject * obj, uint64_t msgid)
+{
+       IODispatchQueue  * queue;
+       OSObjectUserVars * uvars;
+       uint64_t           option;
+
+       uvars = varsForObject(obj);
+       if (!uvars) {
+               return NULL;
+       }
+       if (!uvars->queueArray) {
+               if (uvars->stopped) {
+                       return kIODispatchQueueStopped;
+               }
+               return NULL;
+       }
+       queue = uvars->queueArray[0];
+
+       if (uvars->userMeta
+           && uvars->userMeta->methods) {
+               uint32_t idx, baseIdx;
+               uint32_t lim;
+               // bsearch
+               for (baseIdx = 0, lim = uvars->userMeta->methodCount; lim; lim >>= 1) {
+                       idx = baseIdx + (lim >> 1);
+                       if (msgid == uvars->userMeta->methods[idx]) {
+                               option = uvars->userMeta->methods[uvars->userMeta->methodCount + idx];
+                               option &= 0xFF;
+                               if (option < uvars->userMeta->queueNames->count) {
+                                       queue = uvars->queueArray[option + 1];
+                               }
+                               break;
+                       } else if (msgid > uvars->userMeta->methods[idx]) {
+                               // move right
+                               baseIdx += (lim >> 1) + 1;
+                               lim--;
+                       }
+                       // else move left
+               }
+       }
+       return queue;
+}
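+
+/*
+ * Method-to-queue routing: userMeta->methods holds methodCount message
+ * ids sorted ascending, followed by methodCount option words.  The
+ * binary search above locates msgid at index idx and reads
+ * methods[methodCount + idx]; the low 8 bits of that option select a
+ * named queue (slot option + 1), and slot 0 ("Default") is used when
+ * the option is out of range or the msgid is not present.
+ */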
+
+IOReturn
+IOUserServer::objectInstantiate(OSObject * obj, IORPC rpc, IORPCMessage * message)
+{
+       IOReturn         ret;
+       OSString       * str;
+       OSObject       * prop;
+       IOService      * service;
+
+       OSAction       * action;
+       OSObject       * target;
+       uint32_t         queueCount, queueAlloc;
+       const char     * resultClassName;
+       uint64_t         resultFlags;
+
+       size_t             replySize;
+       uint32_t           methodCount;
+       const uint64_t   * methods;
+       IODispatchQueue  * queue;
+       OSUserMetaClass  * userMeta;
+       OSObjectUserVars * uvars;
+       uint32_t           idx;
+       ipc_port_t         sendPort;
+
+       OSObject_Instantiate_Rpl_Content * reply;
+
+       queueCount      = 0;
+       methodCount     = 0;
+       methods         = NULL;
+       str             = NULL;
+       prop            = NULL;
+       userMeta        = NULL;
+       resultClassName = NULL;
+       resultFlags     = 0;
+       ret = kIOReturnUnsupportedMode;
+
+       service = OSDynamicCast(IOService, obj);
+       if (!service) {
+               // xxx other classes hosted
+               resultFlags |= kOSObjectRPCKernel;
+               resultFlags |= kOSObjectRPCRemote;
+       } else {
+               if (service->isInactive()) {
+                       DKLOG(DKS "::instantiate inactive\n", DKN(service));
+                       return kIOReturnOffline;
+               }
+               prop = service->copyProperty(gIOUserClassKey);
+               str = OSDynamicCast(OSString, prop);
+               if (!service->reserved->uvars) {
+                       resultFlags |= kOSObjectRPCRemote;
+                       resultFlags |= kOSObjectRPCKernel;
+               } else if (this != service->reserved->uvars->userServer) {
+                       // remote, use base class
+                       resultFlags |= kOSObjectRPCRemote;
+               }
+               if (service->reserved->uvars && service->reserved->uvars->userServer) {
+                       userMeta = (typeof(userMeta))service->reserved->uvars->userServer->fClasses->getObject(str);
+               }
+       }
+       if (!str && !userMeta) {
+               const OSMetaClass * meta;
+               meta = obj->getMetaClass();
+               while (meta && !userMeta) {
+                       str = (OSString *) meta->getClassNameSymbol();
+                       userMeta = (typeof(userMeta))fClasses->getObject(str);
+                       if (!userMeta) {
+                               meta = meta->getSuperClass();
+                       }
+               }
+       }
+       if (str) {
+               if (!userMeta) {
+                       userMeta = (typeof(userMeta))fClasses->getObject(str);
+               }
+               if (kIODKLogSetup & gIODKDebug) {
+                       DKLOG("userMeta %s %p\n", str->getCStringNoCopy(), userMeta);
+               }
+               if (userMeta) {
+                       if (kOSObjectRPCRemote & resultFlags) {
+                               while (userMeta && !(kOSClassCanRemote & userMeta->description->flags)) {
+                                       userMeta = userMeta->superMeta;
+                               }
+                               if (userMeta) {
+                                       resultClassName = userMeta->description->name;
+                                       ret = kIOReturnSuccess;
+                               }
+                       } else {
+                               service->reserved->uvars->userMeta = userMeta;
+                               queueAlloc = 1;
+                               if (userMeta->queueNames) {
+                                       queueAlloc += userMeta->queueNames->count;
+                               }
+                               service->reserved->uvars->queueArray =
+                                   IONewZero(IODispatchQueue *, queueAlloc);
+                               resultClassName = str->getCStringNoCopy();
+                               ret = kIOReturnSuccess;
+                       }
+               }
+       }
+       OSSafeReleaseNULL(prop);
+
+       IORPCMessageMach * machReply = rpc.reply;
+       replySize = sizeof(OSObject_Instantiate_Rpl);
+
+       if ((kIOReturnSuccess == ret) && (kOSObjectRPCRemote & resultFlags)) {
+               target = obj;
+               if ((action = OSDynamicCast(OSAction, obj))) {
+                       if (action->ivars->referenceSize) {
+                               resultFlags |= kOSObjectRPCKernel;
+                       } else {
+                               resultFlags &= ~kOSObjectRPCKernel;
+                               target = action->ivars->target;
+
+                               queueCount = 1;
+                               queue = queueForObject(target, action->ivars->targetmsgid);
+                               idx = 0;
+                               sendPort = NULL;
+                               if (queue && (kIODispatchQueueStopped != queue)) {
+                                       sendPort = ipc_port_make_send(queue->ivars->serverPort);
+                               }
+                               replySize = sizeof(OSObject_Instantiate_Rpl)
+                                   + queueCount * sizeof(machReply->objects[0])
+                                   + 2 * methodCount * sizeof(reply->methods[0]);
+                               if (replySize > rpc.replySize) {
+                                       assert(false);
+                                       return kIOReturnIPCError;
+                               }
+                               machReply->objects[idx].type        = MACH_MSG_PORT_DESCRIPTOR;
+                               machReply->objects[idx].disposition = MACH_MSG_TYPE_MOVE_SEND;
+                               machReply->objects[idx].name        = sendPort;
+                               machReply->objects[idx].pad2        = 0;
+                               machReply->objects[idx].pad_end     = 0;
+                       }
+               } else {
+                       uvars = varsForObject(target);
+                       if (uvars && uvars->userMeta) {
+                               queueCount = 1;
+                               if (uvars->userMeta->queueNames) {
+                                       queueCount += uvars->userMeta->queueNames->count;
+                               }
+                               methods = &uvars->userMeta->methods[0];
+                               methodCount = uvars->userMeta->methodCount;
+                               replySize = sizeof(OSObject_Instantiate_Rpl)
+                                   + queueCount * sizeof(machReply->objects[0])
+                                   + 2 * methodCount * sizeof(reply->methods[0]);
+                               if (replySize > rpc.replySize) {
+                                       assert(false);
+                                       return kIOReturnIPCError;
+                               }
+                               for (idx = 0; idx < queueCount; idx++) {
+                                       queue = uvars->queueArray[idx];
+                                       sendPort = NULL;
+                                       if (queue) {
+                                               sendPort = ipc_port_make_send(queue->ivars->serverPort);
+                                       }
+                                       machReply->objects[idx].type        = MACH_MSG_PORT_DESCRIPTOR;
+                                       machReply->objects[idx].disposition = MACH_MSG_TYPE_MOVE_SEND;
+                                       machReply->objects[idx].name        = sendPort;
+                                       machReply->objects[idx].pad2        = 0;
+                                       machReply->objects[idx].pad_end     = 0;
+                               }
+                       }
+               }
+       }
+
+       if (kIODKLogIPC & gIODKDebug) {
+               DKLOG("instantiate %s\n", obj->getMetaClass()->getClassName());
+       }
+
+       if (kIOReturnSuccess != ret) {
+               DKLOG("%s: no user class found\n", str ? str->getCStringNoCopy() : obj->getMetaClass()->getClassName());
+               resultClassName = "unknown";
+       }
+
+       machReply->msgh.msgh_id                    = kIORPCVersionCurrentReply;
+       machReply->msgh.msgh_size                  = replySize;
+       machReply->msgh_body.msgh_descriptor_count = queueCount;
+
+       reply = (typeof(reply))IORPCMessageFromMach(machReply, true);
+       if (!reply) {
+               return kIOReturnIPCError;
+       }
+       if (methodCount) {
+               bcopy(methods, &reply->methods[0], methodCount * 2 * sizeof(reply->methods[0]));
+       }
+       reply->__hdr.msgid       = OSObject_Instantiate_ID;
+       reply->__hdr.flags       = kIORPCMessageOneway;
+       reply->__hdr.objectRefs  = 0;
+       reply->__pad             = 0;
+       reply->flags             = resultFlags;
+       strlcpy(reply->classname, resultClassName, sizeof(reply->classname));
+       reply->__result          = ret;
+
+       ret = kIOReturnSuccess;
+
+       return ret;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+IOReturn
+IOUserServer::kernelDispatch(OSObject * obj, IORPC rpc)
+{
+       IOReturn       ret;
+       IORPCMessage * message;
+
+       message = IORPCMessageFromMach(rpc.message, false);
+       if (!message) {
+               return kIOReturnIPCError;
+       }
+
+       if (OSObject_Instantiate_ID == message->msgid) {
+               ret = objectInstantiate(obj, rpc, message);
+               if (kIOReturnSuccess != ret) {
+                       DKLOG("%s: instantiate failed 0x%x\n", obj->getMetaClass()->getClassName(), ret);
+               }
+       } else {
+               if (kIODKLogIPC & gIODKDebug) {
+                       DKLOG("%s::Dispatch kernel 0x%qx\n", obj->getMetaClass()->getClassName(), message->msgid);
+               }
+               ret = obj->Dispatch(rpc);
+               if (kIODKLogIPC & gIODKDebug) {
+                       DKLOG("%s::Dispatch kernel 0x%qx result 0x%x\n", obj->getMetaClass()->getClassName(), message->msgid, ret);
+               }
+       }
+
+       return ret;
+}
+
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+OSObject *
+IOUserServer::target(OSAction * action, IORPCMessage * message)
+{
+       OSObject * object;
+
+       if (message->msgid != action->ivars->msgid) {
+               return action;
+       }
+       object              = action->ivars->target;
+       message->msgid      = action->ivars->targetmsgid;
+       message->objects[0] = (OSObjectRef) object;
+       if (kIORPCMessageRemote & message->flags) {
+               object->retain();
+               action->release();
+       }
+       if (kIODKLogIPC & gIODKDebug) {
+               DKLOG("TARGET %s msg 0x%qx from 0x%qx\n", object->getMetaClass()->getClassName(), message->msgid, action->ivars->msgid);
+       }
+
+       return object;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * */
+
+kern_return_t
+uext_server(ipc_kmsg_t requestkmsg, ipc_kmsg_t * pReply)
+{
+       kern_return_t      ret;
+       IORPCMessageMach * msgin;
+       OSObject         * object;
+       IOUserServer     * server;
+
+       msgin   = (typeof(msgin))ipc_kmsg_msg_header(requestkmsg);
+
+       object = IOUserServer::copyObjectForSendRight(msgin->msgh.msgh_remote_port, IKOT_UEXT_OBJECT);
+       server = OSDynamicCast(IOUserServer, object);
+       if (!server) {
+               OSSafeReleaseNULL(object);
+               return KERN_INVALID_NAME;
+       }
+       ret = server->server(requestkmsg, pReply);
+       object->release();
+
+       return ret;
+}
+
+#define MAX_UEXT_REPLY_SIZE     0x17c0
+
+kern_return_t
+IOUserServer::server(ipc_kmsg_t requestkmsg, ipc_kmsg_t * pReply)
+{
+       kern_return_t      ret;
+       mach_msg_size_t    replyAlloc;
+       ipc_kmsg_t         replykmsg;
+       IORPCMessageMach * msgin;
+       IORPCMessage     * message;
+       IORPCMessageMach * msgout;
+       IORPCMessage     * reply;
+       uint32_t           replySize;
+       OSObject         * object;
+       OSAction         * action;
+       bool               oneway;
+       uint64_t           msgid;
+
+       msgin   = (typeof(msgin))ipc_kmsg_msg_header(requestkmsg);
+       replyAlloc = 0;
+       msgout = NULL;
+       replykmsg = NULL;
+
+       if (msgin->msgh.msgh_size < (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))) {
+               if (kIODKLogIPC & gIODKDebug) {
+                       DKLOG("UEXT notify %o\n", msgin->msgh.msgh_id);
+               }
+               return KERN_NOT_SUPPORTED;
+       }
+
+       if (!(MACH_MSGH_BITS_COMPLEX & msgin->msgh.msgh_bits)) {
+               msgin->msgh_body.msgh_descriptor_count = 0;
+       }
+       message = IORPCMessageFromMach(msgin, false);
+       if (!message) {
+               return kIOReturnIPCError;
+       }
+       ret = copyInObjects(msgin, message, msgin->msgh.msgh_size, true, false);
+       if (kIOReturnSuccess != ret) {
+               if (kIODKLogIPC & gIODKDebug) {
+                       DKLOG("UEXT copyin(0x%x) %x\n", ret, msgin->msgh.msgh_id);
+               }
+               return KERN_NOT_SUPPORTED;
+       }
+
+       if (msgin->msgh_body.msgh_descriptor_count < 1) {
+               return KERN_NOT_SUPPORTED;
+       }
+       object = (OSObject *) message->objects[0];
+       msgid = message->msgid;
+       message->flags &= ~kIORPCMessageKernel;
+       message->flags |= kIORPCMessageRemote;
+
+       if ((action = OSDynamicCast(OSAction, object))) {
+               object = target(action, message);
+               msgid  = message->msgid;
+       }
+
+       oneway = (0 != (kIORPCMessageOneway & message->flags));
+       assert(oneway || (MACH_PORT_NULL != msgin->msgh.msgh_local_port));
+
+       // includes trailer size
+       replyAlloc = oneway ? 0 : MAX_UEXT_REPLY_SIZE;
+       if (replyAlloc) {
+               replykmsg = ipc_kmsg_alloc(replyAlloc);
+               if (replykmsg == NULL) {
+//                     printf("uext_server: dropping request\n");
+                       //      ipc_kmsg_trace_send(request, option);
+                       consumeObjects(message, msgin->msgh.msgh_size);
+                       ipc_kmsg_destroy(requestkmsg);
+                       return KERN_MEMORY_FAILURE;
+               }
+
+               msgout = (typeof(msgout))ipc_kmsg_msg_header(replykmsg);
+               /*
+                * MIG should really assure no data leakage -
+                * but until it does, pessimistically zero the
+                * whole reply buffer.
+                */
+               bzero((void *)msgout, replyAlloc);
+       }
+
+       IORPC rpc = { .message = msgin, .sendSize = msgin->msgh.msgh_size, .reply = msgout, .replySize = replyAlloc };
+
+       if (object) {
+               thread_iokit_tls_set(0, this);
+               ret = kernelDispatch(object, rpc);
+               thread_iokit_tls_set(0, NULL);
+       } else {
+               ret = kIOReturnBadArgument;
+       }
+
+       // release objects
+       consumeObjects(message, msgin->msgh.msgh_size);
+
+       // release ports
+       copyInObjects(msgin, message, msgin->msgh.msgh_size, false, true);
+
+       if (!oneway) {
+               if (kIOReturnSuccess == ret) {
+                       replySize = msgout->msgh.msgh_size;
+                       reply = IORPCMessageFromMach(msgout, true);
+                       if (!reply) {
+                               ret = kIOReturnIPCError;
+                       } else {
+                               ret = copyOutObjects(msgout, reply, replySize, (kIORPCVersionCurrentReply == msgout->msgh.msgh_id) /* =>!InvokeReply */);
+                       }
+               }
+               if (kIOReturnSuccess != ret) {
+                       IORPCMessageErrorReturnContent * errorMsg;
+
+                       msgout->msgh_body.msgh_descriptor_count = 0;
+                       msgout->msgh.msgh_id                    = kIORPCVersionCurrentReply;
+                       errorMsg = (typeof(errorMsg))IORPCMessageFromMach(msgout, true);
+                       errorMsg->hdr.msgid      = message->msgid;
+                       errorMsg->hdr.flags      = kIORPCMessageOneway | kIORPCMessageError;
+                       errorMsg->hdr.objectRefs = 0;
+                       errorMsg->result         = ret;
+                       errorMsg->pad            = 0;
+                       replySize                = sizeof(IORPCMessageErrorReturn);
+               }
+
+               msgout->msgh.msgh_bits = MACH_MSGH_BITS_COMPLEX |
+                   MACH_MSGH_BITS_SET(MACH_MSGH_BITS_LOCAL(msgin->msgh.msgh_bits) /*remote*/, 0 /*local*/, 0, 0);
+
+               msgout->msgh.msgh_remote_port  = msgin->msgh.msgh_local_port;
+               msgout->msgh.msgh_local_port   = MACH_PORT_NULL;
+               msgout->msgh.msgh_voucher_port = (mach_port_name_t) 0;
+               msgout->msgh.msgh_reserved     = 0;
+               msgout->msgh.msgh_size         = replySize;
+       }
+
+       *pReply = replykmsg;
+
+       return oneway ? MIG_NO_REPLY : KERN_SUCCESS;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
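+// Upper bound on how many OSObjectRef slots can fit between the start of
+// message->objects[] and the end of the mach message buffer.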
+#define MAX_OBJECT_COUNT(mach, size, message) \
+       ((((size) + ((uintptr_t) (mach))) - ((uintptr_t) (&message->objects[0]))) / sizeof(OSObjectRef))
+
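+// Trap fast path for simple-reply RPCs: the user payload is copied into a
+// stack buffer, a Mach header and port descriptors are synthesized in front
+// of it, and the target object's Dispatch() runs directly on the calling
+// thread, avoiding a full Mach IPC round trip.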
+kern_return_t
+IOUserServerUEXTTrap(OSObject * object, void * p1, void * p2, void * p3, void * p4, void * p5, void * p6)
+{
+       const user_addr_t msg              = (uintptr_t) p1;
+       size_t            inSize           = (uintptr_t) p2;
+       user_addr_t       out              = (uintptr_t) p3;
+       size_t            outSize          = (uintptr_t) p4;
+       mach_port_name_t  objectName1      = (uintptr_t) p5;
+       size_t            totalSize;
+       OSObject        * objectArg1;
+
+       IORPCMessageMach *  mach;
+       mach_msg_port_descriptor_t * descs;
+
+#pragma pack(4)
+       struct {
+               uint32_t                   pad;
+               IORPCMessageMach           mach;
+               mach_msg_port_descriptor_t objects[2];
+               IOTrapMessageBuffer        buffer;
+       } buffer;
+#pragma pack()
+
+       IOReturn           ret;
+       OSAction         * action;
+       int                copyerr;
+       IORPCMessage     * message;
+       IORPCMessage     * reply;
+       IORPC              rpc;
+       uint64_t           refs;
+       uint32_t           maxObjectCount;
+       size_t             copySize;
+       uint64_t         * replyHdr;
+       uintptr_t          p;
+
+       bzero(&buffer, sizeof(buffer));
+
+       p = (typeof(p)) & buffer.buffer[0];
+       if (os_add_overflow(inSize, outSize, &totalSize)) {
+               return kIOReturnMessageTooLarge;
+       }
+       if (totalSize > sizeof(buffer.buffer)) {
+               return kIOReturnMessageTooLarge;
+       }
+       if (inSize < sizeof(IORPCMessage)) {
+               return kIOReturnIPCError;
+       }
+       copyerr = copyin(msg, &buffer.buffer[0], inSize);
+       if (copyerr) {
+               return kIOReturnVMError;
+       }
+
+       message = (typeof(message))p;
+       refs    = message->objectRefs;
+       if ((refs > 2) || !refs) {
+               return kIOReturnUnsupported;
+       }
+       if (!(kIORPCMessageSimpleReply & message->flags)) {
+               return kIOReturnUnsupported;
+       }
+
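+       // Build the synthetic Mach header and descriptors immediately before
+       // the copied-in payload; buffer.pad/mach/objects reserve the space
+       // these backwards offsets land in.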
+       descs = (typeof(descs))(p - refs * sizeof(*descs));
+       mach  = (typeof(mach))(p - refs * sizeof(*descs) - sizeof(*mach));
+
+       mach->msgh.msgh_id   = kIORPCVersionCurrent;
+       mach->msgh.msgh_size = sizeof(IORPCMessageMach) + refs * sizeof(*descs) + inSize;
+       mach->msgh_body.msgh_descriptor_count = refs;
+
+       rpc.message   = mach;
+       rpc.sendSize  = mach->msgh.msgh_size;
+       rpc.reply     = (IORPCMessageMach *) (p + inSize);
+       rpc.replySize = sizeof(buffer.buffer) - inSize;
+
+       message->objects[0] = 0;
+       if ((action = OSDynamicCast(OSAction, object))) {
+               maxObjectCount = MAX_OBJECT_COUNT(rpc.message, rpc.sendSize, message);
+               if (refs > maxObjectCount) {
+                       return kIOReturnBadArgument;
+               }
+               object = IOUserServer::target(action, message);
+               message->objects[1] = (OSObjectRef) action;
+               if (kIODKLogIPC & gIODKDebug) {
+                       DKLOG("%s::Dispatch(trap) kernel 0x%qx\n", object->getMetaClass()->getClassName(), message->msgid);
+               }
+               ret = object->Dispatch(rpc);
+       } else {
+               objectArg1 = NULL;
+               if (refs > 1) {
+                       objectArg1 = iokit_lookup_uext_ref_current_task(objectName1);
+                       if (!objectArg1) {
+                               return kIOReturnIPCError;
+                       }
+                       message->objects[1] = (OSObjectRef) objectArg1;
+               }
+               if (kIODKLogIPC & gIODKDebug) {
+                       DKLOG("%s::Dispatch(trap) kernel 0x%qx\n", object->getMetaClass()->getClassName(), message->msgid);
+               }
+               ret = object->Dispatch(rpc);
+               if (kIODKLogIPC & gIODKDebug) {
+                       DKLOG("%s::Dispatch(trap) kernel 0x%qx 0x%x\n", object->getMetaClass()->getClassName(), message->msgid, ret);
+               }
+               OSSafeReleaseNULL(objectArg1);
+
+               if (kIOReturnSuccess == ret) {
+                       if (rpc.reply->msgh_body.msgh_descriptor_count) {
+                               return kIOReturnIPCError;
+                       }
+                       reply = IORPCMessageFromMach(rpc.reply, true);
+                       if (!reply) {
+                               return kIOReturnIPCError;
+                       }
+                       copySize = rpc.reply->msgh.msgh_size - (((uintptr_t) reply) - ((uintptr_t) rpc.reply)) + sizeof(uint64_t);
+                       if (copySize > outSize) {
+                               return kIOReturnIPCError;
+                       }
+                       replyHdr = (uint64_t *) reply;
+                       replyHdr--;
+                       replyHdr[0] = copySize;
+                       copyerr = copyout(replyHdr, out, copySize);
+                       if (copyerr) {
+                               return kIOReturnVMError;
+                       }
+               }
+       }
+
+       return ret;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
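+// Send an IORPC from the kernel to the driver's user process: pick the
+// dispatch queue registered for the target object (or the root queue), copy
+// object references out as send rights and, unless the message is oneway,
+// block for the reply and copy its objects back in.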
+IOReturn
+IOUserServer::rpc(IORPC rpc)
+{
+       if (isInactive() && !fRootQueue) {
+               return kIOReturnOffline;
+       }
+
+       IOReturn           ret;
+       IORPCMessage     * message;
+       IORPCMessageMach * mach;
+       mach_msg_id_t      machid;
+       uint32_t           sendSize, replySize;
+       bool               oneway;
+       uint64_t           msgid;
+       IODispatchQueue  * queue;
+       IOService        * service;
+       ipc_port_t         port;
+       ipc_port_t         sendPort;
+
+       queue    = NULL;
+       port     = NULL;
+       sendPort = NULL;
+
+       mach      = rpc.message;
+       sendSize  = rpc.sendSize;
+       replySize = rpc.replySize;
+
+       assert(sendSize >= (sizeof(IORPCMessageMach) + sizeof(IORPCMessage)));
+
+       message = IORPCMessageFromMach(mach, false);
+       if (!message) {
+               return kIOReturnIPCError;
+       }
+       msgid   = message->msgid;
+       machid  = (msgid >> 32);
+
+       if (mach->msgh_body.msgh_descriptor_count < 1) {
+               return kIOReturnNoMedia;
+       }
+
+       IOLockLock(gIOUserServerLock);
+       if ((service = OSDynamicCast(IOService, (OSObject *) message->objects[0]))) {
+               queue = queueForObject(service, msgid);
+       }
+       if (!queue) {
+               queue = fRootQueue;
+       }
+       if (queue && (kIODispatchQueueStopped != queue)) {
+               port = queue->ivars->serverPort;
+       }
+       if (port) {
+               sendPort = ipc_port_make_send(port);
+       }
+       IOLockUnlock(gIOUserServerLock);
+       if (!sendPort) {
+               return kIOReturnNotReady;
+       }
+
+       oneway = (0 != (kIORPCMessageOneway & message->flags));
+
+       ret = copyOutObjects(mach, message, sendSize, false);
+
+       mach->msgh.msgh_bits = MACH_MSGH_BITS_COMPLEX |
+           MACH_MSGH_BITS(MACH_MSG_TYPE_MOVE_SEND, (oneway ? 0 : MACH_MSG_TYPE_MAKE_SEND_ONCE));
+       mach->msgh.msgh_remote_port  = sendPort;
+       mach->msgh.msgh_local_port   = (oneway ? MACH_PORT_NULL : mig_get_reply_port());
+       mach->msgh.msgh_id           = kIORPCVersionCurrent;
+       mach->msgh.msgh_reserved     = 0;
+
+       if (oneway) {
+               ret = mach_msg_send_from_kernel(&mach->msgh, sendSize);
+       } else {
+               assert(replySize >= (sizeof(IORPCMessageMach) + sizeof(IORPCMessage)));
+               ret = mach_msg_rpc_from_kernel(&mach->msgh, sendSize, replySize);
+               if (KERN_SUCCESS == ret) {
+                       if (kIORPCVersionCurrentReply != mach->msgh.msgh_id) {
+                               ret = (MACH_NOTIFY_SEND_ONCE == mach->msgh.msgh_id) ? MIG_SERVER_DIED : MIG_REPLY_MISMATCH;
+                       } else if ((replySize = mach->msgh.msgh_size) < (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))) {
+//                             printf("BAD REPLY SIZE\n");
+                               ret = MIG_BAD_ARGUMENTS;
+                       } else {
+                               if (!(MACH_MSGH_BITS_COMPLEX & mach->msgh.msgh_bits)) {
+                                       mach->msgh_body.msgh_descriptor_count = 0;
+                               }
+                               message = IORPCMessageFromMach(mach, true);
+                               if (!message) {
+                                       ret = kIOReturnIPCError;
+                               } else if (message->msgid != msgid) {
+//                                     printf("BAD REPLY ID\n");
+                                       ret = MIG_BAD_ARGUMENTS;
+                               } else {
+                                       bool isError = (0 != (kIORPCMessageError & message->flags));
+                                       ret = copyInObjects(mach, message, replySize, !isError, true);
+                                       if (kIOReturnSuccess != ret) {
+                                               if (kIODKLogIPC & gIODKDebug) {
+                                                       DKLOG("rpc copyin(0x%x) %x\n", ret, mach->msgh.msgh_id);
+                                               }
+                                               return KERN_NOT_SUPPORTED;
+                                       }
+                                       if (isError) {
+                                               IORPCMessageErrorReturnContent * errorMsg = (typeof(errorMsg))message;
+                                               ret = errorMsg->result;
+                                       }
+                               }
+                       }
+               }
+       }
+
+       return ret;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
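+// Locate the IORPCMessage payload that follows the Mach header and its
+// port/OOL descriptors, validating that each descriptor stays within the
+// message and rejecting messages with an unexpected (obsolete) msgh_id.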
+IORPCMessage *
+IORPCMessageFromMach(IORPCMessageMach * msg, bool reply)
+{
+       mach_msg_size_t              idx, count;
+       mach_msg_port_descriptor_t * desc;
+       mach_msg_port_descriptor_t * maxDesc;
+       size_t                       size, msgsize;
+       bool                         upgrade;
+
+       msgsize = msg->msgh.msgh_size;
+       count   = msg->msgh_body.msgh_descriptor_count;
+       desc    = &msg->objects[0];
+       maxDesc = (typeof(maxDesc))(((uintptr_t) msg) + msgsize);
+       upgrade = (msg->msgh.msgh_id != (reply ? kIORPCVersionCurrentReply : kIORPCVersionCurrent));
+
+       if (upgrade) {
+               OSReportWithBacktrace("obsolete message");
+               return NULL;
+       }
+
+       for (idx = 0; idx < count; idx++) {
+               if (desc >= maxDesc) {
+                       return NULL;
+               }
+               switch (desc->type) {
+               case MACH_MSG_PORT_DESCRIPTOR:
+                       size = sizeof(mach_msg_port_descriptor_t);
+                       break;
+               case MACH_MSG_OOL_DESCRIPTOR:
+                       size = sizeof(mach_msg_ool_descriptor_t);
+                       break;
+               default:
+                       return NULL;
+               }
+               desc = (typeof(desc))(((uintptr_t) desc) + size);
+       }
+       return (IORPCMessage *)(uintptr_t) desc;
+}
+
+ipc_port_t
+IOUserServer::copySendRightForObject(OSObject * object, ipc_kobject_type_t type)
+{
+       ipc_port_t port;
+       ipc_port_t sendPort = NULL;
+
+       port = iokit_port_for_object(object, type);
+       if (port) {
+               sendPort = ipc_port_make_send(port);
+               iokit_release_port(port);
+       }
+
+       return sendPort;
+}
+
+OSObject *
+IOUserServer::copyObjectForSendRight(ipc_port_t port, ipc_kobject_type_t type)
+{
+       OSObject * object;
+       object = iokit_lookup_io_object(port, type);
+       return object;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+// Create a vm_map_copy_t or kalloc'ed data for memory
+// to be copied out. ipc will free after the copyout.
+
+static kern_return_t
+copyoutkdata(const void * data, vm_size_t len, void ** buf)
+{
+       kern_return_t       err;
+       vm_map_copy_t       copy;
+
+       err = vm_map_copyin( kernel_map, CAST_USER_ADDR_T(data), len,
+           false /* src_destroy */, &copy);
+
+       assert( err == KERN_SUCCESS );
+       if (err == KERN_SUCCESS) {
+               *buf = (char *) copy;
+       }
+
+       return err;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
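+// Convert the kernel OSObject references held in a message to wire form:
+// port descriptors become send rights, OOL descriptors carry the object
+// binary-serialized into a vm_map_copy_t. On failure, unwind the rights and
+// copies already created and fail the whole message.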
+IOReturn
+IOUserServer::copyOutObjects(IORPCMessageMach * mach, IORPCMessage * message,
+    size_t size, bool consume)
+{
+       uint64_t           refs;
+       uint32_t           idx, maxObjectCount;
+       ipc_port_t         port;
+       OSObject         * object;
+       size_t             descsize;
+       mach_msg_port_descriptor_t * desc;
+       mach_msg_ool_descriptor_t  * ool;
+       vm_map_copy_t                copy;
+       void                       * address;
+       mach_msg_size_t              length;
+       kern_return_t                kr;
+       OSSerialize                * s;
+
+       refs           = message->objectRefs;
+       maxObjectCount = MAX_OBJECT_COUNT(mach, size, message);
+//     assert(refs <= mach->msgh_body.msgh_descriptor_count);
+//     assert(refs <= maxObjectCount);
+       if (refs > mach->msgh_body.msgh_descriptor_count) {
+               return kIOReturnBadArgument;
+       }
+       if (refs > maxObjectCount) {
+               return kIOReturnBadArgument;
+       }
+
+       desc = &mach->objects[0];
+       for (idx = 0; idx < refs; idx++) {
+               object = (OSObject *) message->objects[idx];
+
+               switch (desc->type) {
+               case MACH_MSG_PORT_DESCRIPTOR:
+                       descsize = sizeof(mach_msg_port_descriptor_t);
+                       port = NULL;
+                       if (object) {
+                               port = copySendRightForObject(object, IKOT_UEXT_OBJECT);
+                               if (!port) {
+                                       break;
+                               }
+                               if (consume) {
+                                       object->release();
+                               }
+                               message->objects[idx] = 0;
+                       }
+//                 desc->type        = MACH_MSG_PORT_DESCRIPTOR;
+                       desc->disposition = MACH_MSG_TYPE_MOVE_SEND;
+                       desc->name        = port;
+                       desc->pad2        = 0;
+                       desc->pad_end     = 0;
+                       break;
+
+               case MACH_MSG_OOL_DESCRIPTOR:
+                       descsize = sizeof(mach_msg_ool_descriptor_t);
+
+                       length = 0;
+                       address = NULL;
+                       if (object) {
+                               s = OSSerialize::binaryWithCapacity(4096);
+                               assert(s);
+                               if (!s) {
+                                       break;
+                               }
+                               s->setIndexed(true);
+                               if (!object->serialize(s)) {
+                                       assert(false);
+                                       descsize = -1UL;
+                                       s->release();
+                                       break;
+                               }
+                               length = s->getLength();
+                               kr = copyoutkdata(s->text(), length, &address);
+                               s->release();
+                               if (KERN_SUCCESS != kr) {
+                                       descsize = -1UL;
+                                       address = NULL;
+                                       length = 0;
+                               }
+                               if (consume) {
+                                       object->release();
+                               }
+                               message->objects[idx] = 0;
+                       }
+                       ool = (typeof(ool))desc;
+//                 ool->type        = MACH_MSG_OOL_DESCRIPTOR;
+                       ool->deallocate  = false;
+                       ool->copy        = MACH_MSG_PHYSICAL_COPY;
+                       ool->size        = length;
+                       ool->address     = address;
+                       break;
+
+               default:
+                       descsize = -1UL;
+                       break;
+               }
+               if (-1UL == descsize) {
+                       break;
+               }
+               desc = (typeof(desc))(((uintptr_t) desc) + descsize);
+       }
+
+       if (idx >= refs) {
+               return kIOReturnSuccess;
+       }
+
+       desc = &mach->objects[0];
+       while (idx--) {
+               switch (desc->type) {
+               case MACH_MSG_PORT_DESCRIPTOR:
+                       descsize = sizeof(mach_msg_port_descriptor_t);
+                       port = desc->name;
+                       if (port) {
+                               ipc_port_release_send(port);
+                       }
+                       break;
+
+               case MACH_MSG_OOL_DESCRIPTOR:
+                       descsize = sizeof(mach_msg_ool_descriptor_t);
+                       ool = (typeof(ool))desc;
+                       copy = (vm_map_copy_t) ool->address;
+                       if (copy) {
+                               vm_map_copy_discard(copy);
+                       }
+                       break;
+
+               default:
+                       descsize = -1UL;
+                       break;
+               }
+               if (-1UL == descsize) {
+                       break;
+               }
+               desc = (typeof(desc))(((uintptr_t) desc) + descsize);
+       }
+
+       return kIOReturnBadArgument;
+}
+
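+// Inverse of copyOutObjects: turn incoming port descriptors back into
+// object references and unserialize OOL payloads, optionally consuming the
+// send rights. On failure, release any objects already recovered.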
+IOReturn
+IOUserServer::copyInObjects(IORPCMessageMach * mach, IORPCMessage * message,
+    size_t size, bool copyObjects, bool consumePorts)
+{
+       uint64_t           refs;
+       uint32_t           idx, maxObjectCount;
+       ipc_port_t         port;
+       OSObject         * object;
+       size_t                       descsize;
+       mach_msg_port_descriptor_t * desc;
+       mach_msg_ool_descriptor_t  * ool;
+       vm_map_address_t             copyoutdata;
+       kern_return_t                kr;
+
+       refs           = message->objectRefs;
+       maxObjectCount = MAX_OBJECT_COUNT(mach, size, message);
+//     assert(refs <= mach->msgh_body.msgh_descriptor_count);
+//     assert(refs <= maxObjectCount);
+       if (refs > mach->msgh_body.msgh_descriptor_count) {
+               return kIOReturnBadArgument;
+       }
+       if (refs > maxObjectCount) {
+               return kIOReturnBadArgument;
+       }
+
+       desc = &mach->objects[0];
+       for (idx = 0; idx < refs; idx++) {
+               switch (desc->type) {
+               case MACH_MSG_PORT_DESCRIPTOR:
+                       descsize = sizeof(mach_msg_port_descriptor_t);
+
+                       object = NULL;
+                       port = desc->name;
+                       if (port) {
+                               if (copyObjects) {
+                                       object = copyObjectForSendRight(port, IKOT_UEXT_OBJECT);
+                                       if (!object) {
+                                               descsize = -1UL;
+                                               break;
+                                       }
+                               }
+                               if (consumePorts) {
+                                       ipc_port_release_send(port);
+                               }
+                       }
+                       break;
+
+               case MACH_MSG_OOL_DESCRIPTOR:
+                       descsize = sizeof(mach_msg_ool_descriptor_t);
+                       ool = (typeof(ool))desc;
+
+                       object = NULL;
+                       if (copyObjects && ool->size && ool->address) {
+                               kr = vm_map_copyout(kernel_map, &copyoutdata, (vm_map_copy_t) ool->address);
+                               if (KERN_SUCCESS == kr) {
+                                       mach_msg_size_t oolsize = ool->size;
+                                       object = OSUnserializeXML((const char *) copyoutdata, oolsize);
+                                       // vm_map_copyout() has consumed the vm_map_copy_t in the message
+                                       ool->size = 0;
+                                       ool->address = NULL;
+                                       // deallocate with the saved length; ool->size was zeroed above
+                                       kr = vm_deallocate(kernel_map, copyoutdata, oolsize);
+                                       assert(KERN_SUCCESS == kr);
+                               }
+                               if (!object) {
+                                       descsize = -1UL;
+                                       break;
+                               }
+                       }
+                       break;
+
+               default:
+                       descsize = -1UL;
+                       break;
+               }
+               if (-1UL == descsize) {
+                       break;
+               }
+               if (copyObjects) {
+                       message->objects[idx] = (OSObjectRef) object;
+               }
+               desc = (typeof(desc))(((uintptr_t) desc) + descsize);
+       }
+
+       if (idx >= refs) {
+               return kIOReturnSuccess;
+       }
+
+       while (idx--) {
+               object = (OSObject *) message->objects[idx];
+               object->release();
+               message->objects[idx] = 0;
+       }
+
+       return kIOReturnBadArgument;
+}
+
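+// Drop the references a dispatched message holds on its objects.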
+IOReturn
+IOUserServer::consumeObjects(IORPCMessage * message, size_t messageSize)
+{
+       uint64_t    refs, idx;
+       OSObject  * object;
+
+       refs   = message->objectRefs;
+       for (idx = 0; idx < refs; idx++) {
+               object = (OSObject *) message->objects[idx];
+               if (object) {
+                       object->release();
+                       message->objects[idx] = 0;
+               }
+       }
+
+       return kIOReturnSuccess;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+bool
+IOUserServer::finalize(IOOptionBits options)
+{
+       OSArray   * services;
+
+       if (kIODKLogSetup & gIODKDebug) {
+               DKLOG("%s::finalize(%p)\n", getName(), this);
+       }
+
+       IOLockLock(gIOUserServerLock);
+       OSSafeReleaseNULL(fRootQueue);
+       IOLockUnlock(gIOUserServerLock);
+
+       services = NULL;
+       IOLockLock(fLock);
+       if (fServices) {
+               services = OSArray::withArray(fServices);
+       }
+       IOLockUnlock(fLock);
+
+       if (services) {
+               services->iterateObjects(^bool (OSObject * obj) {
+                       IOService * service;
+                       IOService * provider;
+                       bool        started = false;
+
+                       service = (IOService *) obj;
+                       if (kIODKLogSetup & gIODKDebug) {
+                               DKLOG("%s::terminate(" DKS ")\n", getName(), DKN(service));
+                       }
+                       if (service->reserved->uvars) {
+                               started = service->reserved->uvars->started;
+                               service->reserved->uvars->serverDied = true;
+                               if (started) {
+                                       provider = service->getProvider();
+                                       serviceDidStop(service, provider);
+                                       service->terminate(kIOServiceTerminateNeedWillTerminate | kIOServiceTerminateWithRematch);
+                               }
+                       }
+                       if (!started) {
+                               DKLOG("%s::terminate(" DKS ") server exit before start()\n", getName(), DKN(service));
+                               serviceStop(service, NULL);
+                       }
+                       return false;
+               });
+               services->release();
+       }
+
+       return IOUserClient::finalize(options);
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#undef super
+#define super IOUserClient
+
+OSDefineMetaClassAndStructors(IOUserServer, IOUserClient)
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
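+// Construct the user server for a DriverKit process. Unless entitlement
+// checking is disabled via gIODKDebug, creation fails when the owning task
+// lacks the DriverKit entitlement (gIODriverKitEntitlementKey).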
+IOUserClient * IOUserServer::withTask(task_t owningTask)
+{
+       IOUserServer * inst;
+
+       inst = new IOUserServer;
+       if (inst && !inst->init()) {
+               inst->release();
+               inst = NULL;
+               return inst;
+       }
+       inst->PMinit();
+
+       inst->fOwningTask = current_task();
+       inst->fEntitlements = IOUserClient::copyClientEntitlements(inst->fOwningTask);
+
+       if (!(kIODKDisableEntitlementChecking & gIODKDebug)) {
+               if (!inst->fEntitlements || !inst->fEntitlements->getObject(gIODriverKitEntitlementKey)) {
+                       proc_t p;
+                       pid_t  pid;
+
+                       p = (proc_t)get_bsdtask_info(inst->fOwningTask);
+                       if (p) {
+                               pid = proc_pid(p);
+                               IOLog(kIODriverKitEntitlementKey " entitlement check failed for %s[%d]\n", proc_best_name(p), pid);
+                       }
+                       inst->release();
+                       inst = NULL;
+                       return inst;
+               }
+       }
+
+       inst->fLock     = IOLockAlloc();
+       inst->fServices = OSArray::withCapacity(4);
+       inst->fClasses  = OSDictionary::withCapacity(16);
+       inst->fClasses->setOptions(OSCollection::kSort, OSCollection::kSort);
+
+       return inst;
+}
+
+IOReturn
+IOUserServer::clientClose(void)
+{
+       terminate();
+       return kIOReturnSuccess;
+}
+
+IOReturn
+IOUserServer::setProperties(OSObject * properties)
+{
+       IOReturn kr = kIOReturnUnsupported;
+       return kr;
+}
+
+void
+IOUserServer::stop(IOService * provider)
+{
+       fOwningTask = TASK_NULL;
+
+       PMstop();
+
+       IOServicePH::serverRemove(this);
+
+       OSSafeReleaseNULL(fRootQueue);
+
+       if (fInterruptLock) {
+               IOSimpleLockFree(fInterruptLock);
+       }
+}
+
+void
+IOUserServer::free()
+{
+       OSSafeReleaseNULL(fEntitlements);
+       OSSafeReleaseNULL(fClasses);
+       if (fLock) {
+               IOLockFree(fLock);
+       }
+       OSSafeReleaseNULL(fServices);
+       IOUserClient::free();
+}
+
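+// Register a class description received from the (untrusted) user process:
+// each offset/size pair is checked for overflow, bounds and overlap before
+// the description, method options and queue names are copied into kernel
+// allocations.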
+IOReturn
+IOUserServer::registerClass(OSClassDescription * desc, uint32_t size, OSUserMetaClass ** pCls)
+{
+       OSUserMetaClass * cls;
+       const OSSymbol  * sym;
+       uint64_t        * methodOptions;
+       const char      * queueNames;
+       uint32_t          methodOptionsEnd, queueNamesEnd;
+       IOReturn          ret = kIOReturnSuccess;
+
+       if (size < sizeof(OSClassDescription)) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+
+       if (kIODKLogSetup & gIODKDebug) {
+               DKLOG("%s::registerClass %s, %d, %d\n", getName(), desc->name, desc->queueNamesSize, desc->methodNamesSize);
+       }
+
+       if (desc->descriptionSize != size) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+       if (os_add_overflow(desc->queueNamesOffset, desc->queueNamesSize, &queueNamesEnd)) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+       if (queueNamesEnd > size) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+       if (os_add_overflow(desc->methodOptionsOffset, desc->methodOptionsSize, &methodOptionsEnd)) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+       if (methodOptionsEnd > size) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+       // overlaps?
+       if ((desc->queueNamesOffset >= desc->methodOptionsOffset) && (desc->queueNamesOffset < methodOptionsEnd)) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+       if ((queueNamesEnd >= desc->methodOptionsOffset) && (queueNamesEnd < methodOptionsEnd)) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+
+       if (desc->methodOptionsSize & ((2 * sizeof(uint64_t)) - 1)) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+       if (sizeof(desc->name) == strnlen(desc->name, sizeof(desc->name))) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+       if (sizeof(desc->superName) == strnlen(desc->superName, sizeof(desc->superName))) {
+               assert(false);
+               return kIOReturnBadArgument;
+       }
+
+       cls = OSTypeAlloc(OSUserMetaClass);
+       assert(cls);
+       if (!cls) {
+               return kIOReturnNoMemory;
+       }
+
+       cls->description = (typeof(cls->description))IOMalloc(size);
+       assert(cls->description);
+       if (!cls->description) {
+               assert(false);
+               cls->release();
+               return kIOReturnNoMemory;
+       }
+       bcopy(desc, cls->description, size);
+
+       cls->methodCount = desc->methodOptionsSize / (2 * sizeof(uint64_t));
+       cls->methods = IONew(uint64_t, 2 * cls->methodCount);
+       if (!cls->methods) {
+               assert(false);
+               cls->release();
+               return kIOReturnNoMemory;
+       }
+
+       methodOptions = (typeof(methodOptions))(((uintptr_t) desc) + desc->methodOptionsOffset);
+       bcopy(methodOptions, cls->methods, 2 * cls->methodCount * sizeof(uint64_t));
+
+       queueNames = (typeof(queueNames))(((uintptr_t) desc) + desc->queueNamesOffset);
+       cls->queueNames = copyInStringArray(queueNames, desc->queueNamesSize);
+
+       sym = OSSymbol::withCString(desc->name);
+       assert(sym);
+       if (!sym) {
+               assert(false);
+               cls->release();
+               return kIOReturnNoMemory;
+       }
+
+       cls->name = sym;
+       cls->meta = OSMetaClass::copyMetaClassWithName(sym);
+       cls->superMeta = OSDynamicCast(OSUserMetaClass, fClasses->getObject(desc->superName));
+       fClasses->setObject(sym, cls);
+       cls->release();
+
+       *pCls = cls;
+
+       return ret;
+}
+
+IOReturn
+IOUserServer::setRootQueue(IODispatchQueue * queue)
+{
+       assert(!fRootQueue);
+       if (fRootQueue) {
+               return kIOReturnStillOpen;
+       }
+       queue->retain();
+       fRootQueue = queue;
+
+       return kIOReturnSuccess;
+}
+
+IOReturn
+IOUserServer::externalMethod(uint32_t selector, IOExternalMethodArguments * args,
+    IOExternalMethodDispatch * dispatch, OSObject * target, void * reference)
+{
+       IOReturn ret = kIOReturnBadArgument;
+       mach_port_name_t portname;
+
+       switch (selector) {
+       case kIOUserServerMethodRegisterClass:
+       {
+               OSUserMetaClass * cls;
+               if (!args->structureInputSize) {
+                       return kIOReturnBadArgument;
+               }
+               if (args->scalarOutputCount != 2) {
+                       return kIOReturnBadArgument;
+               }
+               ret = registerClass((OSClassDescription *) args->structureInput, args->structureInputSize, &cls);
+               if (kIOReturnSuccess == ret) {
+                       portname = iokit_make_send_right(fOwningTask, cls, IKOT_UEXT_OBJECT);
+                       assert(portname);
+                       args->scalarOutput[0] = portname;
+                       args->scalarOutput[1] = kOSObjectRPCRemote;
+               }
+               break;
+       }
+       case kIOUserServerMethodStart:
+       {
+               if (args->scalarOutputCount != 1) {
+                       return kIOReturnBadArgument;
+               }
+               portname = iokit_make_send_right(fOwningTask, this, IKOT_UEXT_OBJECT);
+               assert(portname);
+               args->scalarOutput[0] = portname;
+               ret = kIOReturnSuccess;
+               break;
+       }
+       default:
+               break;
+       }
+
+       return ret;
+}
+
+IOExternalTrap *
+IOUserServer::getTargetAndTrapForIndex( IOService **targetP, UInt32 index )
+{
+       static const IOExternalTrap trapTemplate[] = {
+               { NULL, (IOTrap) & IOUserServer::waitInterruptTrap},
+       };
+       if (index >= (sizeof(trapTemplate) / sizeof(IOExternalTrap))) {
+               return NULL;
+       }
+       *targetP = this;
+       return (IOExternalTrap *)&trapTemplate[index];
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+IOReturn
+IOUserServer::serviceAttach(IOService * service, IOService * provider)
+{
+       IOReturn           ret;
+       OSObjectUserVars * vars;
+       OSObject         * prop;
+       OSString         * str;
+       OSSymbolConstPtr   bundleID;
+       char               execPath[1024];
+
+       vars = IONewZero(OSObjectUserVars, 1);
+       service->reserved->uvars = vars;
+
+       vars->userServer = this;
+       vars->userServer->retain();
+       IOLockLock(fLock);
+       if (-1U == fServices->getNextIndexOfObject(service, 0)) {
+               fServices->setObject(service);
+       }
+       IOLockUnlock(fLock);
+
+       prop = service->copyProperty(gIOUserClassKey);
+       str = OSDynamicCast(OSString, prop);
+       if (str) {
+               service->setName(str);
+       }
+       OSSafeReleaseNULL(prop);
+
+       prop = service->copyProperty(gIOModuleIdentifierKey);
+       bundleID = OSDynamicCast(OSSymbol, prop);
+       if (bundleID) {
+               execPath[0] = 0;
+               bool ok = OSKext::copyUserExecutablePath(bundleID, execPath, sizeof(execPath));
+               if (ok) {
+                       ret = LoadModule(execPath);
+                       if (kIODKLogSetup & gIODKDebug) {
+                               DKLOG("%s::LoadModule 0x%x %s\n", getName(), ret, execPath);
+                       }
+               }
+       }
+       OSSafeReleaseNULL(prop);
+
+       ret = kIOReturnSuccess;
+
+       return ret;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#define kDriverKitUCPrefix "com.apple.developer.driverkit.userclient-access."
+
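+// Create a user client for a DriverKit service. Unless entitlement checking
+// is disabled, the requesting task's entitlements must list the service's
+// bundle identifier under gIODriverKitUserClientEntitlementsKey and satisfy
+// any per-service gIOServiceDEXTEntitlementsKey requirements.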
+IOReturn
+IOUserServer::serviceNewUserClient(IOService * service, task_t owningTask, void * securityID,
+    uint32_t type, OSDictionary * properties, IOUserClient ** handler)
+{
+       IOReturn           ret;
+       IOUserClient     * uc;
+       IOUserUserClient * userUC;
+       OSDictionary     * entitlements;
+       OSObject         * prop;
+       OSObject         * bundleID;
+       bool               ok;
+
+       *handler = NULL;
+       ret = service->NewUserClient(type, &uc);
+       if (kIOReturnSuccess != ret) {
+               return ret;
+       }
+       userUC = OSDynamicCast(IOUserUserClient, uc);
+       if (!userUC) {
+               uc->terminate();
+               OSSafeReleaseNULL(uc);
+               return kIOReturnUnsupported;
+       }
+       userUC->setTask(owningTask);
+
+       if (!(kIODKDisableEntitlementChecking & gIODKDebug)) {
+               entitlements = IOUserClient::copyClientEntitlements(owningTask);
+               bundleID = service->copyProperty(gIOModuleIdentifierKey);
+               ok = (entitlements
+                   && bundleID
+                   && (prop = entitlements->getObject(gIODriverKitUserClientEntitlementsKey)));
+               if (ok) {
+                       bool found __block = false;
+                       ok = prop->iterateObjects(^bool (OSObject * object) {
+                               found = object->isEqualTo(bundleID);
+                               return found;
+                       });
+                       ok = found;
+               }
+               if (ok) {
+                       prop = userUC->copyProperty(gIOServiceDEXTEntitlementsKey);
+                       ok = checkEntitlements(entitlements, prop, NULL, NULL);
+               }
+               OSSafeReleaseNULL(bundleID);
+               OSSafeReleaseNULL(entitlements);
+               if (!ok) {
+                       DKLOG(DKS ":UC entitlements check failed\n", DKN(userUC));
+                       uc->terminate();
+                       OSSafeReleaseNULL(uc);
+                       return kIOReturnNotPermitted;
+               }
+       }
+
+       ret = userUC->Start(service);
+       if (kIOReturnSuccess != ret) {
+               userUC->detach(this);
+               userUC->release();
+               return ret;
+       }
+
+       *handler = userUC;
+
+       return ret;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
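+// Three-entry power state table (off / low power / on) registered for the
+// user server itself and for each of its services.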
+static IOPMPowerState
+    sPowerStates[] = {
+       {   .version                = kIOPMPowerStateVersion1,
+           .capabilityFlags        = 0,
+           .outputPowerCharacter   = 0,
+           .inputPowerRequirement  = 0},
+       {   .version                = kIOPMPowerStateVersion1,
+           .capabilityFlags        = kIOPMLowPower,
+           .outputPowerCharacter   = kIOPMLowPower,
+           .inputPowerRequirement  = kIOPMLowPower},
+       {   .version                = kIOPMPowerStateVersion1,
+           .capabilityFlags        = kIOPMPowerOn,
+           .outputPowerCharacter   = kIOPMPowerOn,
+           .inputPowerRequirement  = kIOPMPowerOn},
+};
+
+IOReturn
+IOUserServer::setPowerState(unsigned long state, IOService * service)
+{
+       if (kIODKLogPM & gIODKDebug) {
+               DKLOG(DKS "::setPowerState(%ld) %d\n", DKN(service), state, fSystemPowerAck);
+       }
+       return kIOPMAckImplied;
+}
+
+IOReturn
+IOUserServer::powerStateWillChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service)
+{
+       IOReturn ret;
+
+       if (service->reserved->uvars) {
+               if (!fSystemOff && !(kIODKDisablePM & gIODKDebug)) {
+                       service->reserved->uvars->willPower = true;
+                       if (kIODKLogPM & gIODKDebug) {
+                               DKLOG(DKS "::powerStateWillChangeTo(%ld) 0x%qx, %d\n", DKN(service), state, fPowerStates, fSystemPowerAck);
+                       }
+                       ret = service->SetPowerState(flags);
+                       if (kIOReturnSuccess == ret) {
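+                               // Defer the ack to user space; the return value is
+                               // the maximum time to wait, in microseconds (20 s).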
+                               return 20 * 1000 * 1000;
+                       }
+               }
+               service->reserved->uvars->willPower = false;
+       }
+
+       return kIOPMAckImplied;
+}
+
+IOReturn
+IOUserServer::powerStateDidChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service)
+{
+       unsigned int idx;
+       bool         pmAck;
+
+       pmAck = false;
+       IOLockLock(fLock);
+       idx = fServices->getNextIndexOfObject(service, 0);
+       if (-1U == idx) {
+               IOLockUnlock(fLock);
+               return kIOPMAckImplied;
+       }
+       assert(idx <= 63);
+
+       if (state) {
+               fPowerStates |= (1ULL << idx);
+       } else {
+               fPowerStates &= ~(1ULL << idx);
+       }
+       if (kIODKLogPM & gIODKDebug) {
+               DKLOG(DKS "::powerStateDidChangeTo(%ld) 0x%qx, %d\n", DKN(service), state, fPowerStates, fSystemPowerAck);
+       }
+       if (!fPowerStates && (pmAck = fSystemPowerAck)) {
+               fSystemPowerAck = false;
+               fSystemOff      = true;
+       }
+       IOLockUnlock(fLock);
+
+       if (pmAck) {
+               IOServicePH::serverAck(this);
+       }
+
+       return kIOPMAckImplied;
+}
+
+kern_return_t
+IMPL(IOService, SetPowerState)
+{
+       if (kIODKLogPM & gIODKDebug) {
+               DKLOG(DKS "::SetPowerState(%d), %d\n", DKN(this), powerFlags, reserved->uvars->willPower);
+       }
+       if (reserved->uvars
+           && reserved->uvars->userServer
+           && reserved->uvars->willPower) {
+               reserved->uvars->willPower = false;
+               acknowledgePowerChange(reserved->uvars->userServer);
+               return kIOReturnSuccess;
+       }
+       return kIOReturnNotReady;
+}
+
+kern_return_t
+IMPL(IOService, ChangePowerState)
+{
+       switch (powerFlags) {
+       case kIOServicePowerCapabilityOff:
+               changePowerStateToPriv(0);
+               break;
+       case kIOServicePowerCapabilityLow:
+               changePowerStateToPriv(1);
+               break;
+       case kIOServicePowerCapabilityOn:
+               changePowerStateToPriv(2);
+               break;
+       default:
+               return kIOReturnBadArgument;
+       }
+
+       return kIOReturnSuccess;
+}
+
+kern_return_t
+IMPL(IOService, Create)
+{
+       OSObject       * inst;
+       IOService      * service;
+       OSString       * str;
+       const OSSymbol * sym;
+       OSObject       * prop;
+       OSDictionary   * properties;
+       kern_return_t    ret;
+
+       if (provider != this) {
+               return kIOReturnUnsupported;
+       }
+
+       ret = kIOReturnUnsupported;
+       inst = NULL;
+       service = NULL;
+
+       prop = copyProperty(propertiesKey);
+       properties = OSDynamicCast(OSDictionary, prop);
+       assert(properties);
+       if (properties) {
+               str = OSDynamicCast(OSString, properties->getObject(gIOClassKey));
+               assert(str);
+               sym = OSSymbol::withString(str);
+               if (sym) {
+                       inst = OSMetaClass::allocClassWithName(sym);
+                       service = OSDynamicCast(IOService, inst);
+                       if (service && service->init(properties) && service->attach(this)) {
+                               reserved->uvars->userServer->serviceAttach(service, this);
+                               ret = kIOReturnSuccess;
+                               *result = service;
+                       }
+                       OSSafeReleaseNULL(sym);
+               }
+       }
+
+       OSSafeReleaseNULL(prop);
+       if (kIOReturnSuccess != ret) {
+               OSSafeReleaseNULL(inst);
+       }
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IOService, NewUserClient)
+{
+       return kIOReturnError;
+}
+
+kern_return_t
+IMPL(IOService, SearchProperty)
+{
+       OSObject * object;
+
+       if (kIOServiceSearchPropertyParents & options) {
+               options = kIORegistryIterateParents | kIORegistryIterateRecursively;
+       } else {
+               options = 0;
+       }
+
+       object = copyProperty(name, IORegistryEntry::getPlane(plane), options);
+       *property = object;
+
+       return object ? kIOReturnSuccess : kIOReturnNotFound;
+}
+
+void
+IOUserServer::systemPower(bool powerOff)
+{
+       OSArray * services;
+
+       if (kIODKLogPM & gIODKDebug) {
+               DKLOG("%s::powerOff(%d) 0x%qx\n", getName(), powerOff, fPowerStates);
+       }
+
+       IOLockLock(fLock);
+       services = OSArray::withArray(fServices);
+
+       if (powerOff) {
+               fSystemPowerAck = (0 != fPowerStates);
+               if (!fSystemPowerAck) {
+                       fSystemOff = true;
+               }
+               IOLockUnlock(fLock);
+
+               if (!fSystemPowerAck) {
+                       IOServicePH::serverAck(this);
+               } else {
+                       if (services) {
+                               services->iterateObjects(^bool (OSObject * obj) {
+                                       IOService * service;
+                                       service = (IOService *) obj;
+                                       if (kIODKLogPM & gIODKDebug) {
+                                               DKLOG("changePowerStateWithOverrideTo(" DKS ", %d)\n", DKN(service), 0);
+                                       }
+                                       service->reserved->uvars->powerOverride = service->getPowerState();
+                                       service->changePowerStateWithOverrideTo(0, 0);
+                                       return false;
+                               });
+                       }
+               }
+       } else {
+               fSystemOff = false;
+               IOLockUnlock(fLock);
+               if (services) {
+                       services->iterateObjects(^bool (OSObject * obj) {
+                               IOService * service;
+                               service = (IOService *) obj;
+                               if (-1U != service->reserved->uvars->powerOverride) {
+                                       if (kIODKLogPM & gIODKDebug) {
+                                               DKLOG("changePowerStateWithOverrideTo(" DKS ", %d)\n", DKN(service), service->reserved->uvars->powerOverride);
+                                       }
+                                       service->changePowerStateWithOverrideTo(service->reserved->uvars->powerOverride, 0);
+                                       service->reserved->uvars->powerOverride = -1U;
+                               }
+                               return false;
+                       });
+               }
+       }
+       OSSafeReleaseNULL(services);
+}
+
+
+
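+// Called once a service's user-space Start() has returned: registers the
+// shared power state table, joins the provider's power plane unless the
+// provider is marked "non-removable", and marks the service started.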
+IOReturn
+IOUserServer::serviceStarted(IOService * service, IOService * provider, bool result)
+{
+       IOReturn    ret;
+       IOService * pmProvider;
+
+       DKLOG(DKS "::start(" DKS ") %s\n", DKN(service), DKN(provider), result ? "ok" : "fail");
+
+       if (!result) {
+               ret = kIOReturnSuccess;
+               return ret;
+       }
+
+       if (!fRootNotifier) {
+               ret = registerPowerDriver(this, sPowerStates, sizeof(sPowerStates) / sizeof(sPowerStates[0]));
+               assert(kIOReturnSuccess == ret);
+               IOServicePH::serverAdd(this);
+               fRootNotifier = true;
+       }
+
+       if (!(kIODKDisablePM & gIODKDebug) && !service->pm_vars) {
+               service->PMinit();
+               ret = service->registerPowerDriver(this, sPowerStates, sizeof(sPowerStates) / sizeof(sPowerStates[0]));
+               assert(kIOReturnSuccess == ret);
+
+               pmProvider = service;
+               while (pmProvider && !pmProvider->inPlane(gIOPowerPlane)) {
+                       pmProvider = pmProvider->getProvider();
+               }
+               if (pmProvider) {
+                       OSObject  * prop;
+                       OSString  * str;
+                       prop = pmProvider->copyProperty("non-removable");
+                       if (prop) {
+                               str = OSDynamicCast(OSString, prop);
+                               if (str && str->isEqualTo("yes")) {
+                                       pmProvider = NULL;
+                               }
+                               prop->release();
+                       }
+               }
+               if (pmProvider) {
+                       IOLockLock(fLock);
+                       unsigned int idx = fServices->getNextIndexOfObject(service, 0);
+                       assert(idx <= 63);
+                       fPowerStates |= (1ULL << idx);
+                       IOLockUnlock(fLock);
+
+                       pmProvider->joinPMtree(service);
+                       service->reserved->uvars->userServerPM = true;
+               }
+       }
+
+       service->registerInterestedDriver(this);
+       service->reserved->uvars->started = true;
+
+       return kIOReturnSuccess;
+}
+
+
+IOReturn
+IOUserServer::serviceOpen(IOService * provider, IOService * client)
+{
+       OSObjectUserVars * uvars;
+
+       uvars = client->reserved->uvars;
+       if (!uvars->openProviders) {
+               uvars->openProviders = OSArray::withObjects((const OSObject **) &provider, 1);
+       } else if (-1U == uvars->openProviders->getNextIndexOfObject(provider, 0)) {
+               uvars->openProviders->setObject(provider);
+       }
+
+       return kIOReturnSuccess;
+}
+
+IOReturn
+IOUserServer::serviceClose(IOService * provider, IOService * client)
+{
+       OSObjectUserVars * uvars;
+       unsigned int       idx;
+
+       uvars = client->reserved->uvars;
+       if (!uvars->openProviders) {
+               return kIOReturnNotOpen;
+       }
+       idx = uvars->openProviders->getNextIndexOfObject(provider, 0);
+       if (-1U == idx) {
+               return kIOReturnNotOpen;
+       }
+       uvars->openProviders->removeObject(idx);
+
+       return kIOReturnSuccess;
+}
+
+
+IOReturn
+IOUserServer::serviceStop(IOService * service, IOService *)
+{
+       IOReturn           ret;
+       uint32_t           idx, queueAlloc;
+       OSObjectUserVars * uvars;
+
+       IOLockLock(fLock);
+       idx = fServices->getNextIndexOfObject(service, 0);
+       if (-1U != idx) {
+               fServices->removeObject(idx);
+               uvars = service->reserved->uvars;
+               uvars->stopped = true;
+       }
+       IOLockUnlock(fLock);
+
+       if (-1U == idx) {
+               return kIOReturnSuccess;
+       }
+
+       IOMachPortDestroyUserReferences(service, IKOT_UEXT_OBJECT);
+
+       if (uvars->queueArray && uvars->userMeta) {
+               queueAlloc = 1;
+               if (uvars->userMeta->queueNames) {
+                       queueAlloc += uvars->userMeta->queueNames->count;
+               }
+               for (idx = 0; idx < queueAlloc; idx++) {
+                       OSSafeReleaseNULL(uvars->queueArray[idx]);
+               }
+               IOSafeDeleteNULL(uvars->queueArray, IODispatchQueue *, queueAlloc);
+       }
+
+       (void) service->deRegisterInterestedDriver(this);
+       if (uvars->userServerPM) {
+               service->PMstop();
+       }
+
+       ret = kIOReturnSuccess;
+       return ret;
+}
+
+void
+IOUserServer::serviceFree(IOService * service)
+{
+       OSObjectUserVars * uvars;
+
+       uvars = service->reserved->uvars;
+       if (!uvars) {
+               return;
+       }
+       OSSafeReleaseNULL(uvars->userServer);
+       IOSafeDeleteNULL(service->reserved->uvars, OSObjectUserVars, 1);
+}
+
+void
+IOUserServer::serviceWillTerminate(IOService * client, IOService * provider, IOOptionBits options)
+{
+       IOReturn ret;
+       bool     willTerminate;
+
+       willTerminate = false;
+       if (client->lockForArbitration(true)) {
+               if (!client->reserved->uvars->serverDied
+                   && !client->reserved->uvars->willTerminate) {
+                       client->reserved->uvars->willTerminate = true;
+                       willTerminate = true;
+               }
+               client->unlockForArbitration();
+       }
+
+       if (willTerminate) {
+               ret = client->Stop(provider);
+               if (kIOReturnSuccess != ret) {
+                       ret = client->IOService::Stop(provider);
+               }
+       }
+}
+
+void
+IOUserServer::serviceDidTerminate(IOService * client, IOService * provider, IOOptionBits options, bool * defer)
+{
+       if (client->lockForArbitration(true)) {
+               client->reserved->uvars->didTerminate = true;
+               if (!client->reserved->uvars->serverDied
+                   && !client->reserved->uvars->stopped) {
+                       *defer = true;
+               }
+               client->unlockForArbitration();
+       }
+}
+
+void
+IOUserServer::serviceDidStop(IOService * client, IOService * provider)
+{
+       bool complete;
+       OSArray * closeArray;
+
+       complete = false;
+       closeArray = NULL;
+
+       if (client->lockForArbitration(true)) {
+               if (client->reserved->uvars
+                   && client->reserved->uvars->willTerminate
+                   && !client->reserved->uvars->stopped) {
+                       client->reserved->uvars->stopped = true;
+                       complete = client->reserved->uvars->didTerminate;
+               }
+
+               if (client->reserved->uvars) {
+                       closeArray = client->reserved->uvars->openProviders;
+                       client->reserved->uvars->openProviders = NULL;
+               }
+               client->unlockForArbitration();
+               if (closeArray) {
+                       closeArray->iterateObjects(^bool (OSObject * obj) {
+                               IOService * toClose;
+                               toClose = OSDynamicCast(IOService, obj);
+                               if (toClose) {
+                                       DKLOG(DKS ":force close (" DKS ")\n", DKN(client), DKN(toClose));
+                                       toClose->close(client);
+                               }
+                               return false;
+                       });
+                       closeArray->release();
+               }
+       }
+       if (complete) {
+               bool defer = false;
+               client->didTerminate(provider, 0, &defer);
+       }
+}
+
+kern_return_t
+IMPL(IOService, Stop)
+{
+       IOUserServer::serviceDidStop(this, provider);
+
+       return kIOReturnSuccess;
+}
+
+kern_return_t
+IMPL(IOInterruptDispatchSource, Cancel)
+{
+       return kIOReturnUnsupported;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#undef super
+#define super IOUserClient
+
+OSDefineMetaClassAndStructors(IOUserUserClient, IOUserClient)
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+IOReturn
+IOUserUserClient::setTask(task_t task)
+{
+       task_reference(task);
+       fTask = task;
+
+       return kIOReturnSuccess;
+}
+
+void
+IOUserUserClient::stop(IOService * provider)
+{
+       if (fTask) {
+               task_deallocate(fTask);
+               fTask = NULL;
+       }
+       super::stop(provider);
+}
+
+IOReturn
+IOUserUserClient::clientClose(void)
+{
+       terminate();
+       return kIOReturnSuccess;
+}
+
+IOReturn
+IOUserUserClient::setProperties(OSObject * properties)
+{
+       IOReturn ret = kIOReturnUnsupported;
+       return ret;
+}
+
+struct IOUserUserClientActionRef {
+       OSAsyncReference64 asyncRef;
+};
+
+void
+IMPL(IOUserClient, KernelCompletion)
+{
+       IOUserUserClientActionRef * ref;
+
+       ref = (typeof(ref))action->GetReference();
+
+       IOUserClient::sendAsyncResult64(ref->asyncRef, status, (io_user_reference_t *) asyncData, asyncDataCount);
+}
+
+kern_return_t
+IMPL(IOUserClient, _ExternalMethod)
+{
+       return kIOReturnUnsupported;
+}
+
+IOReturn
+IOUserUserClient::clientMemoryForType(UInt32 type,
+    IOOptionBits * koptions,
+    IOMemoryDescriptor ** kmemory)
+{
+       IOReturn             kr;
+       uint64_t             options;
+       IOMemoryDescriptor * memory;
+
+       kr = CopyClientMemoryForType(type, &options, &memory);
+
+       *koptions = 0;
+       *kmemory  = NULL;
+       if (kIOReturnSuccess != kr) {
+               return kr;
+       }
+
+       if (kIOUserClientMemoryReadOnly & options) {
+               *koptions |= kIOMapReadOnly;
+       }
+       *kmemory = memory;
+
+       return kr;
+}
+
+IOReturn
+IOUserUserClient::externalMethod(uint32_t selector, IOExternalMethodArguments * args,
+    IOExternalMethodDispatch * dispatch, OSObject * target, void * reference)
+{
+       IOReturn   kr;
+       OSData   * structureInput;
+       OSData   * structureOutput;
+       size_t     copylen;
+       uint64_t   structureOutputSize;
+       OSAction                  * action;
+       IOUserUserClientActionRef * ref;
+
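+       // Wrap the structure input without copying, capture any async wake
+       // reference in an OSAction, size the structure output, then invoke the
+       // user server's _ExternalMethod and copy the results back out.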
+       kr              = kIOReturnUnsupported;
+       structureInput  = NULL;
+       structureOutput = NULL;
+       action          = NULL;
+
+       if (args->structureInputSize) {
+               structureInput = OSData::withBytesNoCopy((void *) args->structureInput, args->structureInputSize);
+       }
+
+       if (MACH_PORT_NULL != args->asyncWakePort) {
+               kr = CreateActionKernelCompletion(sizeof(IOUserUserClientActionRef), &action);
+               assert(KERN_SUCCESS == kr);
+               ref = (typeof(ref))action->GetReference();
+               bcopy(args->asyncReference, &ref->asyncRef[0], args->asyncReferenceCount * sizeof(ref->asyncRef[0]));
+       }
+
+       if (args->structureVariableOutputData) {
+               structureOutputSize = kIOUserClientVariableStructureSize;
+       } else if (args->structureOutputDescriptor) {
+               structureOutputSize = args->structureOutputDescriptor->getLength();
+       } else {
+               structureOutputSize = args->structureOutputSize;
+       }
+
+       kr = _ExternalMethod(selector, &args->scalarInput[0], args->scalarInputCount,
+           structureInput, args->structureInputDescriptor,
+           args->scalarOutput, &args->scalarOutputCount,
+           structureOutputSize, &structureOutput, args->structureOutputDescriptor,
+           action);
+
+       OSSafeReleaseNULL(structureInput);
+       OSSafeReleaseNULL(action);
+
+       if (kIOReturnSuccess != kr) {
+               return kr;
+       }
+       if (structureOutput) {
+               if (args->structureVariableOutputData) {
+                       *args->structureVariableOutputData = structureOutput;
+               } else {
+                       copylen = structureOutput->getLength();
+                       if (copylen > args->structureOutputSize) {
+                               kr = kIOReturnBadArgument;
+                       } else {
+                               bcopy((const void *) structureOutput->getBytesNoCopy(), args->structureOutput, copylen);
+                       }
+                       OSSafeReleaseNULL(structureOutput);
+               }
+       }
+
+       return kr;
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
index a9aff9f93e54355cac32fcb39cc318cad5af7d12..d046938d2aa690f549472f9a73b348225031cbf9 100644 (file)
@@ -206,7 +206,7 @@ IOWorkLoop::workLoopWithOptions(IOOptionBits options)
                me->reserved = IONew(ExpansionData, 1);
                if (!me->reserved) {
                        me->release();
-                       return 0;
+                       return NULL;
                }
                bzero(me->reserved, sizeof(ExpansionData));
                me->reserved->options = options;
@@ -214,7 +214,7 @@ IOWorkLoop::workLoopWithOptions(IOOptionBits options)
 
        if (me && !me->init()) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -250,45 +250,45 @@ IOWorkLoop::free()
 
                for (event = eventChain; event; event = next) {
                        next = event->getNext();
-                       event->setWorkLoop(0);
-                       event->setNext(0);
+                       event->setWorkLoop(NULL);
+                       event->setNext(NULL);
                        event->release();
                }
-               eventChain = 0;
+               eventChain = NULL;
 
                for (event = passiveEventChain; event; event = next) {
                        next = event->getNext();
-                       event->setWorkLoop(0);
-                       event->setNext(0);
+                       event->setWorkLoop(NULL);
+                       event->setNext(NULL);
                        event->release();
                }
-               passiveEventChain = 0;
+               passiveEventChain = NULL;
 
                // Either we have a partial initialization to clean up
                // or the workThread itself is performing hara-kiri.
                // Either way clean up all of our resources and return.
 
                if (controlG) {
-                       controlG->workLoop = 0;
+                       controlG->workLoop = NULL;
                        controlG->release();
-                       controlG = 0;
+                       controlG = NULL;
                }
 
                if (workToDoLock) {
                        IOSimpleLockFree(workToDoLock);
-                       workToDoLock = 0;
+                       workToDoLock = NULL;
                }
 
                if (gateLock) {
                        IORecursiveLockFree(gateLock);
-                       gateLock = 0;
+                       gateLock = NULL;
                }
 
                IOStatisticsUnregisterCounter();
 
                if (reserved) {
                        IODelete(reserved, ExpansionData, 1);
-                       reserved = 0;
+                       reserved = NULL;
                }
 
                super::free();
@@ -457,7 +457,7 @@ restartThread:
 exitThread:
        closeGate();
        thread_t thread = workThread;
-       workThread = 0; // Say we don't have a loop and free ourselves
+       workThread = NULL; // Say we don't have a loop and free ourselves
        openGate();
 
        free();
@@ -589,7 +589,7 @@ IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *)
 
                        inEvent->retain();
                        inEvent->setWorkLoop(this);
-                       inEvent->setNext(0);
+                       inEvent->setNext(NULL);
 
                        /* Check if this is a passive or active event source being added */
                        if (eventSourcePerformsWork(inEvent)) {
@@ -627,7 +627,7 @@ IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *)
                                if (eventChain == inEvent) {
                                        eventChain = inEvent->getNext();
                                } else {
-                                       IOEventSource *event, *next = 0;
+                                       IOEventSource *event, *next = NULL;
 
                                        event = eventChain;
                                        if (event) {
@@ -646,7 +646,7 @@ IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *)
                                if (passiveEventChain == inEvent) {
                                        passiveEventChain = inEvent->getNext();
                                } else {
-                                       IOEventSource *event, *next = 0;
+                                       IOEventSource *event, *next = NULL;
 
                                        event = passiveEventChain;
                                        if (event) {
@@ -663,8 +663,8 @@ IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *)
                                }
                        }
 
-                       inEvent->setWorkLoop(0);
-                       inEvent->setNext(0);
+                       inEvent->setWorkLoop(NULL);
+                       inEvent->setNext(NULL);
                        inEvent->release();
                        SETP(&fFlags, kLoopRestart);
                }
index 646ccec4e38d9224d9175dc3167db21ce10c955c..632b97ae550e5aee6b065414446a36511a14831c 100644 (file)
@@ -93,8 +93,6 @@ RootDomainUserClient::secureSleepSystemOptions(
        int             local_priv = 0;
        int             admin_priv = 0;
        IOReturn        ret = kIOReturnNotPrivileged;
-       OSDictionary    *unserializedOptions =  NULL;
-       OSString        *unserializeErrorString = NULL;
 
        ret = clientHasPrivilege(fOwningTask, kIOClientPrivilegeLocalUser);
        local_priv = (kIOReturnSuccess == ret);
@@ -102,38 +100,38 @@ RootDomainUserClient::secureSleepSystemOptions(
        ret = clientHasPrivilege(fOwningTask, kIOClientPrivilegeAdministrator);
        admin_priv = (kIOReturnSuccess == ret);
 
-
-       if (inOptions) {
-               unserializedOptions = OSDynamicCast( OSDictionary,
-                   OSUnserializeXML((const char *)inOptions, inOptionsSize, &unserializeErrorString));
-
-               if (!unserializedOptions) {
-                       IOLog("IOPMRootDomain SleepSystem unserialization failure: %s\n",
-                           unserializeErrorString ? unserializeErrorString->getCStringNoCopy() : "Unknown");
-               }
-       }
-
        if ((local_priv || admin_priv) && fOwner) {
+               OSString        *unserializeErrorString = NULL;
+               OSObject        *unserializedObject = NULL;
+               OSDictionary    *sleepOptionsDict = NULL; // do not release
+
                proc_t p;
                p = (proc_t)get_bsdtask_info(fOwningTask);
                if (p) {
                        fOwner->setProperty("SleepRequestedByPID", proc_pid(p), 32);
                }
 
-               if (unserializedOptions) {
-                       // Publish Sleep Options in registry under root_domain
-                       fOwner->setProperty( kRootDomainSleepOptionsKey, unserializedOptions);
-
-                       *returnCode = fOwner->sleepSystemOptions( unserializedOptions );
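+               // Unserialize the caller-provided options only after the
+               // privilege check has passed.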
+               if (inOptions) {
+                       unserializedObject = OSUnserializeXML((const char *)inOptions, inOptionsSize, &unserializeErrorString);
+                       sleepOptionsDict = OSDynamicCast( OSDictionary, unserializedObject);
+                       if (!sleepOptionsDict) {
+                               IOLog("IOPMRootDomain SleepSystem unserialization failure: %s\n",
+                                   unserializeErrorString ? unserializeErrorString->getCStringNoCopy() : "Unknown");
+                       }
+               }
 
-                       unserializedOptions->release();
+               if (sleepOptionsDict) {
+                       // Publish Sleep Options in registry under root_domain
+                       fOwner->setProperty( kRootDomainSleepOptionsKey, sleepOptionsDict);
                } else {
                        // No options
                        // Clear any pre-existing options
                        fOwner->removeProperty( kRootDomainSleepOptionsKey );
-
-                       *returnCode = fOwner->sleepSystemOptions( NULL );
                }
+
+               *returnCode = fOwner->sleepSystemOptions( sleepOptionsDict );
+               OSSafeReleaseNULL(unserializedObject);
+               OSSafeReleaseNULL(unserializeErrorString);
        } else {
                *returnCode = kIOReturnNotPrivileged;
        }
@@ -233,7 +231,7 @@ RootDomainUserClient::stop( IOService *provider)
 {
        if (fOwningTask) {
                task_deallocate(fOwningTask);
-               fOwningTask = 0;
+               fOwningTask = NULL;
        }
 
        super::stop(provider);
index aea9ca375fb346a597a69acc576e984acd7da0bb..84276c5b2657f3c3ef8eeea370194809297ea0ee 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -44,7 +44,7 @@
 
 class RootDomainUserClient : public IOUserClient
 {
-       OSDeclareDefaultStructors(RootDomainUserClient)
+       OSDeclareDefaultStructors(RootDomainUserClient);
 
        friend class IOPMrootDomain;
 private:
index 9dda5e80dd26f0db62be1360e9685115bad9f837..2b14eda8f68fc643781e5b4002e354aad0e173e9 100644 (file)
@@ -40,9 +40,3 @@ const char * gIOKernelConfigTables =
     "     'IOProbeScore'    = 0:32;"
     "   }"
     ")";
-
-/* This stuff is no longer used at all but was exported in prior
- * releases, so we'll keep them around for PPC/i386 only.
- * See libkern's OSKext.cpp for other symbols, which have been moved
- * there for sanity.
- */
index c9c3d03b69e777b1a225a4c1a908fd12067f672f..e82361d0baf8477c9c1686a07b4f038bcd98f6ce 100644 (file)
@@ -6,9 +6,9 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
-INSTINC_SUBDIRS = IOKit
+INSTINC_SUBDIRS = IOKit DriverKit System IOKitUser
 
-EXPINC_SUBDIRS = IOKit
+EXPINC_SUBDIRS = IOKit DriverKit
 
 COMP_SUBDIRS = conf
 
diff --git a/iokit/System/IODataQueueDispatchSourceShared.h b/iokit/System/IODataQueueDispatchSourceShared.h
new file mode 100644 (file)
index 0000000..bee716a
--- /dev/null
@@ -0,0 +1,594 @@
+typedef struct _IODataQueueEntry {
+       uint32_t  size;
+       uint8_t   data[0];
+} IODataQueueEntry;
+
+#define DATA_QUEUE_ENTRY_HEADER_SIZE sizeof(IODataQueueEntry)
+
+typedef struct _IODataQueueMemory {
+       volatile uint32_t   head;
+       volatile uint32_t   tail;
+       volatile uint8_t    needServicedCallback;
+       volatile uint8_t    _resv[31];
+       IODataQueueEntry  queue[0];
+} IODataQueueMemory;
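+// head and tail are byte offsets into queue[]; the queue is empty when
+// head == tail, and every entry is padded to a 4-byte multiple.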
+
+struct IODataQueueDispatchSource_IVars {
+       IODataQueueMemory         * dataQueue;
+       IODataQueueDispatchSource * source;
+//    IODispatchQueue           * queue;
+       IOMemoryDescriptor        * memory;
+       OSAction                  * dataAvailableAction;
+       OSAction                  * dataServicedAction;
+       uint64_t                    options;
+       uint32_t                    queueByteCount;
+
+#if !KERNEL
+       bool                        enable;
+       bool                        canceled;
+#endif
+};
+
+bool
+IODataQueueDispatchSource::init()
+{
+       if (!super::init()) {
+               return false;
+       }
+
+       ivars = IONewZero(IODataQueueDispatchSource_IVars, 1);
+       ivars->source = this;
+
+#if !KERNEL
+       kern_return_t ret;
+
+       ret = CopyMemory(&ivars->memory);
+       assert(kIOReturnSuccess == ret);
+
+       uint64_t address;
+       uint64_t length;
+
+       ret = ivars->memory->Map(0, 0, 0, 0, &address, &length);
+       assert(kIOReturnSuccess == ret);
+       ivars->dataQueue = (typeof(ivars->dataQueue))(uintptr_t) address;
+       ivars->queueByteCount = length;
+#endif
+
+       return true;
+}
+
+kern_return_t
+IMPL(IODataQueueDispatchSource, CheckForWork)
+{
+       IOReturn ret = kIOReturnNotReady;
+
+       return ret;
+}
+
+#if KERNEL
+
+kern_return_t
+IMPL(IODataQueueDispatchSource, Create)
+{
+       IODataQueueDispatchSource * inst;
+       IOBufferMemoryDescriptor  * bmd;
+
+       if (3 & queueByteCount) {
+               return kIOReturnBadArgument;
+       }
+       inst = OSTypeAlloc(IODataQueueDispatchSource);
+       if (!inst) {
+               return kIOReturnNoMemory;
+       }
+       if (!inst->init()) {
+               inst->release();
+               return kIOReturnError;
+       }
+
+       bmd = IOBufferMemoryDescriptor::withOptions(
+               kIODirectionOutIn | kIOMemoryKernelUserShared,
+               queueByteCount, page_size);
+       if (!bmd) {
+               inst->release();
+               return kIOReturnNoMemory;
+       }
+       inst->ivars->memory         = bmd;
+       inst->ivars->queueByteCount = queueByteCount;
+       inst->ivars->options        = 0;
+       inst->ivars->dataQueue      = (typeof(inst->ivars->dataQueue))bmd->getBytesNoCopy();
+
+       *source = inst;
+
+       return kIOReturnSuccess;
+}
+
+kern_return_t
+IMPL(IODataQueueDispatchSource, CopyMemory)
+{
+       kern_return_t ret;
+       IOMemoryDescriptor * result;
+
+       result = ivars->memory;
+       if (result) {
+               result->retain();
+               ret = kIOReturnSuccess;
+       } else {
+               ret = kIOReturnNotReady;
+       }
+       *memory = result;
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IODataQueueDispatchSource, CopyDataAvailableHandler)
+{
+       kern_return_t ret;
+       OSAction    * result;
+
+       result = ivars->dataAvailableAction;
+       if (result) {
+               result->retain();
+               ret = kIOReturnSuccess;
+       } else {
+               ret = kIOReturnNotReady;
+       }
+       *action = result;
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IODataQueueDispatchSource, CopyDataServicedHandler)
+{
+       kern_return_t ret;
+       OSAction    * result;
+
+       result = ivars->dataServicedAction;
+       if (result) {
+               result->retain();
+               ret = kIOReturnSuccess;
+       } else {
+               ret = kIOReturnNotReady;
+       }
+       *action = result;
+       return ret;
+}
+
+kern_return_t
+IMPL(IODataQueueDispatchSource, SetDataAvailableHandler)
+{
+       IOReturn ret;
+       OSAction * oldAction;
+
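+       // Atomically detach any previously installed handler before retaining
+       // the new one; if data is already pending, fire the new handler
+       // immediately so the notification edge is not lost.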
+       oldAction = ivars->dataAvailableAction;
+       if (oldAction && OSCompareAndSwapPtr(oldAction, NULL, &ivars->dataAvailableAction)) {
+               oldAction->release();
+       }
+       if (action) {
+               action->retain();
+               ivars->dataAvailableAction = action;
+               if (IsDataAvailable()) {
+                       DataAvailable(ivars->dataAvailableAction);
+               }
+       }
+       ret = kIOReturnSuccess;
+
+       return ret;
+}
+
+kern_return_t
+IMPL(IODataQueueDispatchSource, SetDataServicedHandler)
+{
+       IOReturn ret;
+       OSAction * oldAction;
+
+       oldAction = ivars->dataServicedAction;
+       if (oldAction && OSCompareAndSwapPtr(oldAction, NULL, &ivars->dataServicedAction)) {
+               oldAction->release();
+       }
+       if (action) {
+               action->retain();
+               ivars->dataServicedAction = action;
+       }
+       ret = kIOReturnSuccess;
+
+       return ret;
+}
+
+#endif /* KERNEL */
+
+void
+IODataQueueDispatchSource::SendDataAvailable(void)
+{
+       IOReturn ret;
+
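+       // Fetch and cache the handler on first use, then invoke it if present.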
+       if (!ivars->dataAvailableAction) {
+               ret = CopyDataAvailableHandler(&ivars->dataAvailableAction);
+               if (kIOReturnSuccess != ret) {
+                       ivars->dataAvailableAction = NULL;
+               }
+       }
+       if (ivars->dataAvailableAction) {
+               DataAvailable(ivars->dataAvailableAction);
+       }
+}
+
+void
+IODataQueueDispatchSource::SendDataServiced(void)
+{
+       IOReturn ret;
+
+       if (!ivars->dataServicedAction) {
+               ret = CopyDataServicedHandler(&ivars->dataServicedAction);
+               if (kIOReturnSuccess != ret) {
+                       ivars->dataServicedAction = NULL;
+               }
+       }
+       if (ivars->dataServicedAction) {
+               ivars->dataQueue->needServicedCallback = false;
+               DataServiced(ivars->dataServicedAction);
+       }
+}
+
+kern_return_t
+IMPL(IODataQueueDispatchSource, SetEnableWithCompletion)
+{
+       IOReturn ret;
+
+#if !KERNEL
+       ivars->enable = enable;
+#endif
+
+       ret = kIOReturnSuccess;
+       return ret;
+}
+
+void
+IODataQueueDispatchSource::free()
+{
+       OSSafeReleaseNULL(ivars->memory);
+       OSSafeReleaseNULL(ivars->dataAvailableAction);
+       OSSafeReleaseNULL(ivars->dataServicedAction);
+       IOSafeDeleteNULL(ivars, IODataQueueDispatchSource_IVars, 1);
+       super::free();
+}
+
+kern_return_t
+IMPL(IODataQueueDispatchSource, Cancel)
+{
+       return kIOReturnSuccess;
+}
+
+bool
+IODataQueueDispatchSource::IsDataAvailable(void)
+{
+       IODataQueueMemory *dataQueue = ivars->dataQueue;
+
+       return dataQueue && (dataQueue->head != dataQueue->tail);
+}
+
+kern_return_t
+IODataQueueDispatchSource::Peek(IODataQueueClientDequeueEntryBlock callback)
+{
+       IODataQueueEntry *  entry = NULL;
+       IODataQueueMemory * dataQueue;
+       uint32_t            callerDataSize;
+       uint32_t            dataSize;
+       uint32_t            headOffset;
+       uint32_t            tailOffset;
+
+       dataQueue = ivars->dataQueue;
+       if (!dataQueue) {
+               return kIOReturnNoMemory;
+       }
+
+       // Read head (relaxed; we own it) and tail (acquire; pairs with the enqueuer's release store)
+       headOffset = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->head, __ATOMIC_RELAXED);
+       tailOffset = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->tail, __ATOMIC_ACQUIRE);
+
+       if (headOffset != tailOffset) {
+               IODataQueueEntry *  head        = NULL;
+               uint32_t            headSize    = 0;
+               uint32_t            queueSize   = ivars->queueByteCount;
+
+               if (headOffset > queueSize) {
+                       return kIOReturnError;
+               }
+
+               head     = (IODataQueueEntry *)((uintptr_t)dataQueue->queue + headOffset);
+               callerDataSize = head->size;
+               if (os_add_overflow(3, callerDataSize, &headSize)) {
+                       return kIOReturnError;
+               }
+               headSize &= ~3U;
+
+               // Check if there's enough room before the end of the queue for a header.
+               // If there is room, check if there's enough room to hold the header and
+               // the data.
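+               // Illustrative numbers: with queueSize = 64 and headOffset = 62,
+               // there is no room for even the 4-byte header at the end, so the
+               // entry was written at offset 0 and the reader must wrap.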
+
+               if ((headOffset > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) ||
+                   (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize) ||
+                   (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headSize) ||
+                   (headOffset + headSize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) {
+                       // No room for the header or the data, wrap to the beginning of the queue.
+                       // Note: we wrap even with the UINT32_MAX checks, because we
+                       // have to support a queueSize of UINT32_MAX
+                       entry = dataQueue->queue;
+                       callerDataSize  = entry->size;
+                       if (os_add_overflow(3, callerDataSize, &dataSize)) {
+                               return kIOReturnError;
+                       }
+                       dataSize &= ~3U;
+
+                       if ((dataSize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) ||
+                           (dataSize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) {
+                               return kIOReturnError;
+                       }
+
+                       callback(&entry->data, callerDataSize);
+                       return kIOReturnSuccess;
+               } else {
+                       callback(&head->data, callerDataSize);
+                       return kIOReturnSuccess;
+               }
+       }
+
+       return kIOReturnUnderrun;
+}
+
+kern_return_t
+IODataQueueDispatchSource::Dequeue(IODataQueueClientDequeueEntryBlock callback)
+{
+       kern_return_t ret;
+       bool          sendDataServiced;
+
+       sendDataServiced = false;
+       ret = DequeueWithCoalesce(&sendDataServiced, callback);
+       if (sendDataServiced) {
+               SendDataServiced();
+       }
+       return ret;
+}
+
+kern_return_t
+IODataQueueDispatchSource::DequeueWithCoalesce(bool * sendDataServiced,
+    IODataQueueClientDequeueEntryBlock callback)
+{
+       IOReturn            retVal          = kIOReturnSuccess;
+       IODataQueueEntry *  entry           = NULL;
+       IODataQueueMemory * dataQueue;
+       uint32_t            callerDataSize;
+       uint32_t            dataSize        = 0;
+       uint32_t            headOffset      = 0;
+       uint32_t            tailOffset      = 0;
+       uint32_t            newHeadOffset   = 0;
+
+       dataQueue = ivars->dataQueue;
+       if (!dataQueue) {
+               return kIOReturnNoMemory;
+       }
+
+       // Read head (relaxed; we own it) and tail (acquire; pairs with the enqueuer's release store)
+       headOffset = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->head, __ATOMIC_RELAXED);
+       tailOffset = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->tail, __ATOMIC_ACQUIRE);
+
+       if (headOffset != tailOffset) {
+               IODataQueueEntry *  head        = NULL;
+               uint32_t            headSize    = 0;
+               uint32_t            queueSize   = ivars->queueByteCount;
+
+               if (headOffset > queueSize) {
+                       return kIOReturnError;
+               }
+
+               head = (IODataQueueEntry *)((uintptr_t)dataQueue->queue + headOffset);
+               callerDataSize = head->size;
+               if (os_add_overflow(3, callerDataSize, &headSize)) {
+                       return kIOReturnError;
+               }
+               headSize &= ~3U;
+
+               // Check whether we wrapped around to the beginning:
+               // either there was not even room for the header
+               if ((headOffset > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) ||
+                   (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize) ||
+                   // or there was room for the header, but not for the data
+                   (headOffset + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headSize) ||
+                   (headOffset + headSize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) {
+                       // Note: we have to wrap to the beginning even with the UINT32_MAX checks
+                       // because we have to support a queueSize of UINT32_MAX.
+                       entry           = dataQueue->queue;
+                       callerDataSize  = entry->size;
+
+                       if (os_add_overflow(callerDataSize, 3, &dataSize)) {
+                               return kIOReturnError;
+                       }
+                       dataSize &= ~3U;
+                       if ((dataSize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) ||
+                           (dataSize + DATA_QUEUE_ENTRY_HEADER_SIZE > queueSize)) {
+                               return kIOReturnError;
+                       }
+                       newHeadOffset   = dataSize + DATA_QUEUE_ENTRY_HEADER_SIZE;
+                       // else it is at the end
+               } else {
+                       entry = head;
+
+                       if ((headSize > UINT32_MAX - DATA_QUEUE_ENTRY_HEADER_SIZE) ||
+                           (headSize + DATA_QUEUE_ENTRY_HEADER_SIZE > UINT32_MAX - headOffset) ||
+                           (headSize + DATA_QUEUE_ENTRY_HEADER_SIZE + headOffset > queueSize)) {
+                               return kIOReturnError;
+                       }
+                       newHeadOffset   = headOffset + headSize + DATA_QUEUE_ENTRY_HEADER_SIZE;
+               }
+       } else {
+               // empty queue
+               if (dataQueue->needServicedCallback) {
+                       *sendDataServiced = true;
+               }
+               return kIOReturnUnderrun;
+       }
+
+       callback(&entry->data, callerDataSize);
+       if (dataQueue->needServicedCallback) {
+               *sendDataServiced = true;
+       }
+
+       __c11_atomic_store((_Atomic uint32_t *)&dataQueue->head, newHeadOffset, __ATOMIC_RELEASE);
+
+       if (newHeadOffset == tailOffset) {
+               //
+               // If we are making the queue empty, then we need to make sure
+               // that either the enqueuer notices, or we notice the enqueue
+               // that raced with our making of the queue empty.
+               //
+               __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
+       }
+
+       return retVal;
+}
+
+kern_return_t
+IODataQueueDispatchSource::Enqueue(uint32_t callerDataSize,
+    IODataQueueClientEnqueueEntryBlock callback)
+{
+       kern_return_t ret;
+       bool          sendDataAvailable;
+
+       sendDataAvailable = false;
+       ret = EnqueueWithCoalesce(callerDataSize, &sendDataAvailable, callback);
+       if (sendDataAvailable) {
+               SendDataAvailable();
+       }
+       return ret;
+}
+
+kern_return_t
+IODataQueueDispatchSource::EnqueueWithCoalesce(uint32_t callerDataSize,
+    bool * sendDataAvailable,
+    IODataQueueClientEnqueueEntryBlock callback)
+{
+       IODataQueueMemory * dataQueue;
+       IODataQueueEntry *  entry;
+       uint32_t            head;
+       uint32_t            tail;
+       uint32_t            newTail;
+       uint32_t            dataSize;
+       uint32_t            queueSize;
+       uint32_t            entrySize;
+       IOReturn            retVal = kIOReturnSuccess;
+
+       dataQueue = ivars->dataQueue;
+       if (!dataQueue) {
+               return kIOReturnNoMemory;
+       }
+       queueSize = ivars->queueByteCount;
+
+       // Read tail (relaxed; we own it) and head (acquire; pairs with the dequeuer's release store)
+       tail = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->tail, __ATOMIC_RELAXED);
+       head = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->head, __ATOMIC_ACQUIRE);
+
+       if (os_add_overflow(callerDataSize, 3, &dataSize)) {
+               return kIOReturnOverrun;
+       }
+       dataSize &= ~3U;
+
+       // Check for overflow of entrySize
+       if (os_add_overflow(DATA_QUEUE_ENTRY_HEADER_SIZE, dataSize, &entrySize)) {
+               return kIOReturnOverrun;
+       }
+
+       // Check for underflow of (getQueueSize() - tail)
+       if (queueSize < tail || queueSize < head) {
+               return kIOReturnUnderrun;
+       }
+
+       newTail = tail;
+       if (tail >= head) {
+               // Is there enough room at the end for the entry?
+               if ((entrySize <= (UINT32_MAX - tail)) &&
+                   ((tail + entrySize) <= queueSize)) {
+                       entry = (IODataQueueEntry *)((uintptr_t)dataQueue->queue + tail);
+
+                       callback(&entry->data, callerDataSize);
+
+                       entry->size = callerDataSize;
+
+                       // The tail can be out of bounds when the size of the new entry
+                       // exactly matches the available space at the end of the queue.
+                       // The tail can range from 0 to queueSize inclusive.
+
+                       newTail = tail + entrySize;
+               } else if (head > entrySize) { // Is there enough room at the beginning?
+                       entry = (IODataQueueEntry *)((uintptr_t)dataQueue->queue);
+
+                       callback(&entry->data, callerDataSize);
+
+                       // Wrap around to the beginning, but do not allow the tail to catch
+                       // up to the head.
+
+                       entry->size = callerDataSize;
+
+                       // We need to make sure that there is enough room to set the size before
+                       // doing this. The user client checks for this and will look for the size
+                       // at the beginning if there isn't room for it at the end.
+
+                       if ((queueSize - tail) >= DATA_QUEUE_ENTRY_HEADER_SIZE) {
+                               ((IODataQueueEntry *)((uintptr_t)dataQueue->queue + tail))->size = dataSize;
+                       }
+
+                       newTail = entrySize;
+               } else {
+                       retVal = kIOReturnOverrun; // queue is full
+               }
+       } else {
+               // Do not allow the tail to catch up to the head when the queue is full.
+               // That's why the comparison uses a '>' rather than '>='.
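+               // Illustrative: head = 16, tail = 8, entrySize = 8 would make
+               // newTail == head, indistinguishable from an empty queue, so it
+               // is rejected as an overrun.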
+
+               if ((head - tail) > entrySize) {
+                       entry = (IODataQueueEntry *)((uintptr_t)dataQueue->queue + tail);
+
+                       callback(&entry->data, callerDataSize);
+
+                       entry->size = callerDataSize;
+
+                       newTail = tail + entrySize;
+               } else {
+                       retVal = kIOReturnOverrun; // queue is full
+               }
+       }
+
+       // Publish the new tail, then decide whether a data-available
+       // notification needs to be sent.
+
+       if (retVal == kIOReturnSuccess) {
+               // Publish the data we just enqueued
+               __c11_atomic_store((_Atomic uint32_t *)&dataQueue->tail, newTail, __ATOMIC_RELEASE);
+
+               if (tail != head) {
+                       //
+                       // The memory barrier below pairs with the one in dequeue
+                       // so that either our store to the tail cannot be missed by
+                       // the next dequeue attempt, or we will observe the dequeuer
+                       // making the queue empty.
+                       //
+                       // Of course, if we already think the queue is empty,
+                       // there's no point paying this extra cost.
+                       //
+                       __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
+                       head = __c11_atomic_load((_Atomic uint32_t *)&dataQueue->head, __ATOMIC_RELAXED);
+               }
+
+               if (tail == head) {
+                       // Send notification that data is now available.
+                       *sendDataAvailable = true;
+                       retVal = kIOReturnSuccess;
+               }
+       } else if (retVal == kIOReturnOverrun) {
+               // ask to be notified of Dequeue()
+               dataQueue->needServicedCallback = true;
+               *sendDataAvailable = true;
+       }
+
+       return retVal;
+}
diff --git a/iokit/System/Makefile b/iokit/System/Makefile
new file mode 100644 (file)
index 0000000..2e9bc0d
--- /dev/null
@@ -0,0 +1,24 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+# These are System.framework headers
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+ALL_HDRS = $(shell (cd $(SOURCE); echo *.h))
+
+# INSTINC_SUBDIRS = Headers
+
+EXPINC_SUBDIRS = ${INSTINC_SUBDIRS}
+
+# INSTALL_MI_DIR = .
+
+INSTALL_MI_LCL_LIST = $(ALL_HDRS)
+
+INSTALL_MI_GEN_LIST = $(GENERATED_HEADERS)
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
index 7e7fe8f1dd50edc3609a2564d0844440b53f783c..47be23b23b447bc052b22181ef5079885d158002 100644 (file)
@@ -426,6 +426,27 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused
                data->release();
        }
 
+       if (changed && (newValue >= 6666) && (newValue <= 6669)) {
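+               // Test hook: magic sysctl values 6666-6669 look up the service
+               // named "XHC1" (presumably a USB XHCI controller) and exercise
+               // terminate() / registerService() on it.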
+               OSIterator * iter;
+               IOService  * service;
+
+               service = NULL;
+               iter = IOService::getMatchingServices(IOService::nameMatching("XHC1"));
+               if (iter && (service = (IOService *) iter->getNextObject())) {
+                       if (newValue == 6666) {
+                               IOLog("terminating 0x%qx\n", service->getRegistryEntryID());
+                               service->terminate();
+                       } else if (newValue == 6667) {
+                               IOLog("register 0x%qx\n", service->getRegistryEntryID());
+                               service->registerService();
+                       }
+               }
+               OSSafeReleaseNULL(iter);
+               if (service) {
+                       return 0;
+               }
+       }
+
 
        if (changed && newValue) {
                error = IOWorkLoopTest(newValue);
@@ -444,4 +465,4 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused
 
 SYSCTL_PROC(_kern, OID_AUTO, iokittest,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    0, 0, sysctl_iokittest, "I", "");
+    NULL, 0, sysctl_iokittest, "I", "");
index 17ebe5a233632e2de6b369941fc8c881aaa70e9d..6f4eb396dbfb3eda5f09e49fb6e4c355ddfbbede 100644 (file)
 static IOService *
 di_load_controller( void )
 {
-       OSIterator *    controllerIterator      = 0;
-       OSDictionary *  matchDictionary         = 0;
-       IOService *     controller                      = 0;
+       OSIterator *    controllerIterator      = NULL;
+       OSDictionary *  matchDictionary         = NULL;
+       IOService *     controller                      = NULL;
 
        do {
                IOService::getResourceService()->publishResource("com.apple.AppleDiskImageController.load", kOSBooleanTrue);
@@ -151,11 +151,11 @@ int
 di_root_image(const char *path, char *devname, size_t devsz, dev_t *dev_p)
 {
        IOReturn                        res                             = 0;
-       IOService               *       controller                      = 0;
-       OSString                *       pathString                      = 0;
-       OSNumber                *       myResult                        = 0;
-       OSString                *       myDevName                       = 0;
-       OSNumber                *       myDevT                          = 0;
+       IOService               *       controller                      = NULL;
+       OSString                *       pathString                      = NULL;
+       OSNumber                *       myResult                        = NULL;
+       OSString                *       myDevName                       = NULL;
+       OSNumber                *       myDevT                          = NULL;
 
        // sanity check arguments please
        if (devname) {
@@ -243,11 +243,11 @@ int
 di_root_ramfile_buf(void *buf, size_t bufsz, char *devname, size_t devsz, dev_t *dev_p)
 {
        IOReturn res = 0;
-       IOService *controller = 0;
-       OSNumber *myResult = 0;
-       OSString *myDevName = 0;
-       OSNumber *myDevT = 0;
-       IOMemoryDescriptor *mem = 0;
+       IOService *controller = NULL;
+       OSNumber *myResult = NULL;
+       OSString *myDevName = NULL;
+       OSNumber *myDevT = NULL;
+       IOMemoryDescriptor *mem = NULL;
 
        mem = IOMemoryDescriptor::withAddress(buf, bufsz, kIODirectionInOut);
        assert(mem);
@@ -306,7 +306,7 @@ di_root_ramfile( IORegistryEntry * entry )
        IOMemoryDescriptor *    mem;
        uint64_t                dmgSize;
        uint64_t                remain, length;
-       OSData *                extentData = 0;
+       OSData *                extentData = NULL;
        IOAddressRange *        extentList;
        uint64_t                extentSize;
        uint32_t                extentCount;
index 6ce81657afafa8ccf9a5c205d381ea36f0cd2d95..11514d895466efd57e6be8f54575246b21fff679 100644 (file)
@@ -49,49 +49,21 @@ extern "C" {
 #define ROOTDEVICETIMEOUT       60
 #endif
 
-int panic_on_exception_triage = 0;
-
 extern dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys);
 extern dev_t mdevlookup(int devid);
 extern void mdevremoveall(void);
 extern int mdevgetrange(int devid, uint64_t *base, uint64_t *size);
 extern void di_root_ramfile(IORegistryEntry * entry);
 
-#if CONFIG_EMBEDDED
+#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
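+// Rounds a up to the next multiple of b; correct only when b is a power of two.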
+
 #define IOPOLLED_COREFILE       (CONFIG_KDP_INTERACTIVE_DEBUGGING)
 
 #if defined(XNU_TARGET_OS_BRIDGE)
-
-#define kIOCoreDumpSize         150ULL*1024ULL*1024ULL
-// leave free space on volume:
-#define kIOCoreDumpFreeSize     150ULL*1024ULL*1024ULL
 #define kIOCoreDumpPath         "/private/var/internal/kernelcore"
-
-#else /* defined(XNU_TARGET_OS_BRIDGE) */
-#define kIOCoreDumpMinSize      350ULL*1024ULL*1024ULL
-#define kIOCoreDumpLargeSize    500ULL*1024ULL*1024ULL
-// leave free space on volume:
-#define kIOCoreDumpFreeSize     350ULL*1024ULL*1024ULL
+#else
 #define kIOCoreDumpPath         "/private/var/vm/kernelcore"
-
-#endif /* defined(XNU_TARGET_OS_BRIDGE) */
-
-#elif DEVELOPMENT /* CONFIG_EMBEDDED */
-#define IOPOLLED_COREFILE       1
-// no sizing
-#define kIOCoreDumpSize         0ULL
-#define kIOCoreDumpFreeSize     0ULL
-#else /* CONFIG_EMBEDDED */
-#define IOPOLLED_COREFILE       0
-#endif /* CONFIG_EMBEDDED */
-
-
-#if IOPOLLED_COREFILE
-static bool
-NewKernelCoreMedia(void * target, void * refCon,
-    IOService * newService,
-    IONotifier * notifier);
-#endif /* IOPOLLED_COREFILE */
+#endif
 
 #if CONFIG_KDP_INTERACTIVE_DEBUGGING
 /*
@@ -101,11 +73,21 @@ extern uint64_t kdp_core_ramdisk_addr;
 extern uint64_t kdp_core_ramdisk_size;
 #endif
 
+#if IOPOLLED_COREFILE
+static void IOOpenPolledCoreFile(thread_call_param_t __unused, thread_call_param_t corefilename);
+
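+// The corefile open now runs asynchronously on a thread call, allocated in
+// IOKitBSDInit() with THREAD_CALL_OPTIONS_ONCE, rather than on the caller's
+// thread.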
+thread_call_t corefile_open_call = NULL;
+#endif
+
 kern_return_t
 IOKitBSDInit( void )
 {
        IOService::publishResource("IOBSD");
 
+#if IOPOLLED_COREFILE
+       corefile_open_call = thread_call_allocate_with_options(IOOpenPolledCoreFile, NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
+#endif
+
        return kIOReturnSuccess;
 }
 
@@ -122,8 +104,8 @@ IOServicePublishResource( const char * property, boolean_t value )
 boolean_t
 IOServiceWaitForMatchingResource( const char * property, uint64_t timeout )
 {
-       OSDictionary *      dict = 0;
-       IOService *         match = 0;
+       OSDictionary *      dict = NULL;
+       IOService *         match = NULL;
        boolean_t           found = false;
 
        do {
@@ -150,8 +132,8 @@ IOServiceWaitForMatchingResource( const char * property, uint64_t timeout )
 boolean_t
 IOCatalogueMatchingDriversPresent( const char * property )
 {
-       OSDictionary *      dict = 0;
-       OSOrderedSet *      set = 0;
+       OSDictionary *      dict = NULL;
+       OSOrderedSet *      set = NULL;
        SInt32              generationCount = 0;
        boolean_t           found = false;
 
@@ -181,7 +163,7 @@ OSDictionary *
 IOBSDNameMatching( const char * name )
 {
        OSDictionary *      dict;
-       const OSSymbol *    str = 0;
+       const OSSymbol *    str = NULL;
 
        do {
                dict = IOService::serviceMatching( gIOServiceKey );
@@ -205,7 +187,7 @@ IOBSDNameMatching( const char * name )
                str->release();
        }
 
-       return 0;
+       return NULL;
 }
 
 OSDictionary *
@@ -218,29 +200,29 @@ OSDictionary *
 IONetworkNamePrefixMatching( const char * prefix )
 {
        OSDictionary *       matching;
-       OSDictionary *   propDict = 0;
-       const OSSymbol * str      = 0;
+       OSDictionary *   propDict = NULL;
+       const OSSymbol * str      = NULL;
        char networkType[128];
 
        do {
                matching = IOService::serviceMatching( "IONetworkInterface" );
-               if (matching == 0) {
+               if (matching == NULL) {
                        continue;
                }
 
                propDict = OSDictionary::withCapacity(1);
-               if (propDict == 0) {
+               if (propDict == NULL) {
                        continue;
                }
 
                str = OSSymbol::withCString( prefix );
-               if (str == 0) {
+               if (str == NULL) {
                        continue;
                }
 
                propDict->setObject( "IOInterfaceNamePrefix", (OSObject *) str );
                str->release();
-               str = 0;
+               str = NULL;
 
                // see if we're constrained to netroot off of a specific network type
                if (PE_parse_boot_argn( "network-type", networkType, 128 )) {
@@ -248,7 +230,7 @@ IONetworkNamePrefixMatching( const char * prefix )
                        if (str) {
                                propDict->setObject( "IONetworkRootType", str);
                                str->release();
-                               str = 0;
+                               str = NULL;
                        }
                }
 
@@ -258,7 +240,7 @@ IONetworkNamePrefixMatching( const char * prefix )
                }
 
                propDict->release();
-               propDict = 0;
+               propDict = NULL;
 
                return matching;
        } while (false);
@@ -273,7 +255,7 @@ IONetworkNamePrefixMatching( const char * prefix )
                str->release();
        }
 
-       return 0;
+       return NULL;
 }
 
 static bool
@@ -287,32 +269,32 @@ IORegisterNetworkInterface( IOService * netif )
        // device is handed to BSD.
 
        IOService *    stack;
-       OSNumber *     zero    = 0;
-       OSString *     path    = 0;
-       OSDictionary * dict    = 0;
-       char *         pathBuf = 0;
+       OSNumber *     zero    = NULL;
+       OSString *     path    = NULL;
+       OSDictionary * dict    = NULL;
+       char *         pathBuf = NULL;
        int            len;
        enum { kMaxPathLen = 512 };
 
        do {
                stack = IOService::waitForService(
                        IOService::serviceMatching("IONetworkStack"));
-               if (stack == 0) {
+               if (stack == NULL) {
                        break;
                }
 
                dict = OSDictionary::withCapacity(3);
-               if (dict == 0) {
+               if (dict == NULL) {
                        break;
                }
 
                zero = OSNumber::withNumber((UInt64) 0, 32);
-               if (zero == 0) {
+               if (zero == NULL) {
                        break;
                }
 
                pathBuf = (char *) IOMalloc( kMaxPathLen );
-               if (pathBuf == 0) {
+               if (pathBuf == NULL) {
                        break;
                }
 
@@ -323,7 +305,7 @@ IORegisterNetworkInterface( IOService * netif )
                }
 
                path = OSString::withCStringNoCopy( pathBuf );
-               if (path == 0) {
+               if (path == NULL) {
                        break;
                }
 
@@ -346,7 +328,7 @@ IORegisterNetworkInterface( IOService * netif )
                IOFree(pathBuf, kMaxPathLen);
        }
 
-       return netif->getProperty( kIOBSDNameKey ) != 0;
+       return netif->getProperty( kIOBSDNameKey ) != NULL;
 }
 
 OSDictionary *
@@ -393,7 +375,7 @@ IOOFPathMatching( const char * path, char * buf, int maxLen )
                matching->release();
        }
 
-       return 0;
+       return NULL;
 }
 
 static int didRam = 0;
@@ -406,19 +388,20 @@ IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
        mach_timespec_t     t;
        IOService *         service;
        IORegistryEntry *   regEntry;
-       OSDictionary *      matching = 0;
+       OSDictionary *      matching = NULL;
        OSString *          iostr;
        OSNumber *          off;
-       OSData *            data = 0;
+       OSData *            data = NULL;
 
        UInt32              flags = 0;
        int                 mnr, mjr;
-       const char *        mediaProperty = 0;
+       const char *        mediaProperty = NULL;
        char *              rdBootVar;
        char *              str;
-       const char *        look = 0;
+       const char *        look = NULL;
        int                 len;
        bool                debugInfoPrintedOnce = false;
+       bool                needNetworkKexts = false;
        const char *        uuidStr = NULL;
 
        static int          mountAttempts = 0;
@@ -556,6 +539,7 @@ IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
 
                if (strncmp( look, "en", strlen( "en" )) == 0) {
                        matching = IONetworkNamePrefixMatching( "en" );
+                       needNetworkKexts = true;
                } else if (strncmp( look, "uuid", strlen( "uuid" )) == 0) {
                        char *uuid;
                        OSString *uuidString;
@@ -607,6 +591,12 @@ IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
                }
        }
 
+       char namep[8];
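+       // Kick off any deferred driver matching if we may need network drivers
+       // to find the root device, or are booting to single user ("-s").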
+       if (needNetworkKexts
+           || PE_parse_boot_argn("-s", namep, sizeof(namep))) {
+               IOService::startDeferredMatches();
+       }
+
        do {
                t.tv_sec = ROOTDEVICETIMEOUT;
                t.tv_nsec = 0;
@@ -648,7 +638,7 @@ IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
        if (service
            && service->metaCast( "IONetworkInterface" )
            && !IORegisterNetworkInterface( service )) {
-               service = 0;
+               service = NULL;
        }
 
        if (service) {
@@ -726,7 +716,6 @@ void
 IOSecureBSDRoot(const char * rootName)
 {
 #if CONFIG_EMBEDDED
-       int              tmpInt;
        IOReturn         result;
        IOPlatformExpert *pe;
        OSDictionary     *matching;
@@ -739,20 +728,12 @@ IOSecureBSDRoot(const char * rootName)
        assert(pe);
        // Returns kIOReturnNotPrivileged if the root device is not secure.
        // Returns kIOReturnUnsupported if "SecureRootName" is not implemented.
-       result = pe->callPlatformFunction(functionName, false, (void *)rootName, (void *)0, (void *)0, (void *)0);
+       result = pe->callPlatformFunction(functionName, false, (void *)rootName, (void *)NULL, (void *)NULL, (void *)NULL);
        functionName->release();
        OSSafeReleaseNULL(pe);
 
        if (result == kIOReturnNotPrivileged) {
                mdevremoveall();
-       } else if (result == kIOReturnSuccess) {
-               // If we are booting with a secure root, and we have the right
-               // boot-arg, we will want to panic on exception triage.  This
-               // behavior is intended as a debug aid (we can look at why an
-               // exception occured in the kernel debugger).
-               if (PE_parse_boot_argn("-panic_on_exception_triage", &tmpInt, sizeof(tmpInt))) {
-                       panic_on_exception_triage = 1;
-               }
        }
 
 #endif  // CONFIG_EMBEDDED
@@ -796,13 +777,13 @@ IOBSDGetPlatformUUID( uuid_t uuid, mach_timespec_t timeout )
        IOService * resources;
        OSString *  string;
 
-       resources = IOService::waitForService( IOService::resourceMatching( kIOPlatformUUIDKey ), (timeout.tv_sec || timeout.tv_nsec) ? &timeout : 0 );
-       if (resources == 0) {
+       resources = IOService::waitForService( IOService::resourceMatching( kIOPlatformUUIDKey ), (timeout.tv_sec || timeout.tv_nsec) ? &timeout : NULL );
+       if (resources == NULL) {
                return KERN_OPERATION_TIMED_OUT;
        }
 
        string = (OSString *) IOService::getPlatform()->getProvider()->getProperty( kIOPlatformUUIDKey );
-       if (string == 0) {
+       if (string == NULL) {
                return KERN_NOT_SUPPORTED;
        }
 
@@ -823,179 +804,167 @@ IOBSDGetPlatformUUID( uuid_t uuid, mach_timespec_t timeout )
 
 IOPolledFileIOVars * gIOPolledCoreFileVars;
 kern_return_t gIOPolledCoreFileOpenRet = kIOReturnNotReady;
+IOPolledCoreFileMode_t gIOPolledCoreFileMode = kIOPolledCoreFileModeNotInitialized;
+
 #if IOPOLLED_COREFILE
 
-static IOReturn
-IOOpenPolledCoreFile(const char * filename)
+#if defined(XNU_TARGET_OS_BRIDGE)
+// On bridgeOS allocate a 150MB corefile and leave 150MB free
+#define kIOCoreDumpSize         150ULL*1024ULL*1024ULL
+#define kIOCoreDumpFreeSize     150ULL*1024ULL*1024ULL
+
+#elif CONFIG_EMBEDDED /* defined(XNU_TARGET_OS_BRIDGE) */
+// On embedded devices with >3GB DRAM we allocate a 500MB corefile
+// otherwise allocate a 350MB corefile. Leave 350 MB free
+
+#define kIOCoreDumpMinSize      350ULL*1024ULL*1024ULL
+#define kIOCoreDumpLargeSize    500ULL*1024ULL*1024ULL
+
+#define kIOCoreDumpFreeSize     350ULL*1024ULL*1024ULL
+
+#else /* defined(XNU_TARGET_OS_BRIDGE) */
+// on macOS devices allocate a corefile sized at 1GB / 32GB of DRAM,
+// fallback to a 1GB corefile and leave at least 1GB free
+#define kIOCoreDumpMinSize              1024ULL*1024ULL*1024ULL
+#define kIOCoreDumpIncrementalSize      1024ULL*1024ULL*1024ULL
+
+#define kIOCoreDumpFreeSize     1024ULL*1024ULL*1024ULL
+
+// on older macOS devices we allocate a 1MB file at boot
+// to store a panic time stackshot
+#define kIOStackshotFileSize    1024ULL*1024ULL
+
+#endif /* defined(XNU_TARGET_OS_BRIDGE) */
+
+static IOPolledCoreFileMode_t
+GetCoreFileMode()
+{
+       if (on_device_corefile_enabled()) {
+               return kIOPolledCoreFileModeCoredump;
+       } else if (panic_stackshot_to_disk_enabled()) {
+               return kIOPolledCoreFileModeStackshot;
+       } else {
+               return kIOPolledCoreFileModeDisabled;
+       }
+}
+
+static void
+IOCoreFileGetSize(uint64_t *ideal_size, uint64_t *fallback_size, uint64_t *free_space_to_leave, IOPolledCoreFileMode_t mode)
+{
+       unsigned int requested_corefile_size = 0;
+
+       *ideal_size = *fallback_size = *free_space_to_leave = 0;
+
+#if defined(XNU_TARGET_OS_BRIDGE)
+#pragma unused(mode)
+       *ideal_size = *fallback_size = kIOCoreDumpSize;
+       *free_space_to_leave = kIOCoreDumpFreeSize;
+#elif CONFIG_EMBEDDED /* defined(XNU_TARGET_OS_BRIDGE) */
+#pragma unused(mode)
+       *ideal_size = *fallback_size = kIOCoreDumpMinSize;
+
+       if (max_mem > (3 * 1024ULL * 1024ULL * 1024ULL)) {
+               *ideal_size = kIOCoreDumpLargeSize;
+       }
+
+       *free_space_to_leave = kIOCoreDumpFreeSize;
+#else /* defined(XNU_TARGET_OS_BRIDGE) */
+       if (mode == kIOPolledCoreFileModeCoredump) {
+               *ideal_size = *fallback_size = kIOCoreDumpMinSize;
+               if (kIOCoreDumpIncrementalSize != 0 && max_mem > (32 * 1024ULL * 1024ULL * 1024ULL)) {
+                       *ideal_size = ((ROUNDUP(max_mem, (32 * 1024ULL * 1024ULL * 1024ULL)) / (32 * 1024ULL * 1024ULL * 1024ULL)) * kIOCoreDumpIncrementalSize);
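+                       // Illustrative: 96GB of DRAM rounds up to 96GB; 96 / 32 = 3,
+                       // so a 3GB corefile is requested.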
+               }
+               *free_space_to_leave = kIOCoreDumpFreeSize;
+       } else if (mode == kIOPolledCoreFileModeStackshot) {
+               *ideal_size = *fallback_size = *free_space_to_leave = kIOStackshotFileSize;
+       }
+#endif /* defined(XNU_TARGET_OS_BRIDGE) */
+       // If a custom size was requested, override the ideal and requested sizes
+       if (PE_parse_boot_argn("corefile_size_mb", &requested_corefile_size, sizeof(requested_corefile_size))) {
+               IOLog("Boot-args specify %d MB kernel corefile\n", requested_corefile_size);
+
+               *ideal_size = *fallback_size = (requested_corefile_size * 1024ULL * 1024ULL);
+       }
+
+       return;
+}
+
+static void
+IOOpenPolledCoreFile(thread_call_param_t __unused, thread_call_param_t corefilename)
 {
+       assert(corefilename != NULL);
+
        IOReturn err;
-       unsigned int debug;
-       uint64_t corefile_size_bytes = 0;
+       char *filename = (char *) corefilename;
+       uint64_t corefile_size_bytes = 0, corefile_fallback_size_bytes = 0, free_space_to_leave_bytes = 0;
+       IOPolledCoreFileMode_t mode_to_init = GetCoreFileMode();
 
        if (gIOPolledCoreFileVars) {
-               return kIOReturnBusy;
+               return;
        }
        if (!IOPolledInterface::gMetaClass.getInstanceCount()) {
-               return kIOReturnUnsupported;
+               return;
        }
 
-       debug = 0;
-       PE_parse_boot_argn("debug", &debug, sizeof(debug));
-       if (DB_DISABLE_LOCAL_CORE & debug) {
-               return kIOReturnUnsupported;
+       if (mode_to_init == kIOPolledCoreFileModeDisabled) {
+               gIOPolledCoreFileMode = kIOPolledCoreFileModeDisabled;
+               return;
        }
 
-#if CONFIG_EMBEDDED
-       unsigned int requested_corefile_size = 0;
-       if (PE_parse_boot_argn("corefile_size_mb", &requested_corefile_size, sizeof(requested_corefile_size))) {
-               IOLog("Boot-args specify %d MB kernel corefile\n", requested_corefile_size);
-
-               corefile_size_bytes = (requested_corefile_size * 1024ULL * 1024ULL);
-       }
-#endif
+       // We'll overwrite this once we open the file; setting it now marks
+       // that we made it past initialization.
+       gIOPolledCoreFileMode = kIOPolledCoreFileModeClosed;
 
+       IOCoreFileGetSize(&corefile_size_bytes, &corefile_fallback_size_bytes, &free_space_to_leave_bytes, mode_to_init);
 
        do {
-#if defined(kIOCoreDumpLargeSize)
-               if (0 == corefile_size_bytes) {
-                       // If no custom size was requested and we're on a device with >3GB of DRAM, attempt
-                       // to allocate a large corefile otherwise use a small file.
-                       if (max_mem > (3 * 1024ULL * 1024ULL * 1024ULL)) {
-                               corefile_size_bytes = kIOCoreDumpLargeSize;
-                               err = IOPolledFileOpen(filename,
-                                   kIOPolledFileCreate,
-                                   corefile_size_bytes, kIOCoreDumpFreeSize,
-                                   NULL, 0,
-                                   &gIOPolledCoreFileVars, NULL, NULL, 0);
-                               if (kIOReturnSuccess == err) {
-                                       break;
-                               } else if (kIOReturnNoSpace == err) {
-                                       IOLog("Failed to open corefile of size %llu MB (low disk space)",
-                                           (corefile_size_bytes / (1024ULL * 1024ULL)));
-                                       if (corefile_size_bytes == kIOCoreDumpMinSize) {
-                                               gIOPolledCoreFileOpenRet = err;
-                                               return err;
-                                       }
-                                       // Try to open a smaller corefile (set size and fall-through)
-                                       corefile_size_bytes = kIOCoreDumpMinSize;
-                               } else {
-                                       IOLog("Failed to open corefile of size %llu MB (returned error 0x%x)\n",
-                                           (corefile_size_bytes / (1024ULL * 1024ULL)), err);
-                                       gIOPolledCoreFileOpenRet = err;
-                                       return err;
-                               }
-                       } else {
-                               corefile_size_bytes = kIOCoreDumpMinSize;
+               err = IOPolledFileOpen(filename, kIOPolledFileCreate, corefile_size_bytes, free_space_to_leave_bytes,
+                   NULL, 0, &gIOPolledCoreFileVars, NULL, NULL, NULL);
+               if (kIOReturnSuccess == err) {
+                       break;
+               } else if (kIOReturnNoSpace == err) {
+                       IOLog("Failed to open corefile of size %llu MB (low disk space)\n",
+                           (corefile_size_bytes / (1024ULL * 1024ULL)));
+                       if (corefile_size_bytes == corefile_fallback_size_bytes) {
+                               gIOPolledCoreFileOpenRet = err;
+                               return;
                        }
+               } else {
+                       IOLog("Failed to open corefile of size %llu MB (returned error 0x%x)\n",
+                           (corefile_size_bytes / (1024ULL * 1024ULL)), err);
+                       gIOPolledCoreFileOpenRet = err;
+                       return;
                }
-#else /* defined(kIOCoreDumpLargeSize) */
-               if (0 == corefile_size_bytes) {
-                       corefile_size_bytes = kIOCoreDumpSize;
-               }
-#endif /* defined(kIOCoreDumpLargeSize) */
-               err = IOPolledFileOpen(filename,
-                   kIOPolledFileCreate,
-                   corefile_size_bytes, kIOCoreDumpFreeSize,
-                   NULL, 0,
-                   &gIOPolledCoreFileVars, NULL, NULL, 0);
+
+               err = IOPolledFileOpen(filename, kIOPolledFileCreate, corefile_fallback_size_bytes, free_space_to_leave_bytes,
+                   NULL, 0, &gIOPolledCoreFileVars, NULL, NULL, NULL);
                if (kIOReturnSuccess != err) {
                        IOLog("Failed to open corefile of size %llu MB (returned error 0x%x)\n",
-                           (corefile_size_bytes / (1024ULL * 1024ULL)), err);
+                           (corefile_fallback_size_bytes / (1024ULL * 1024ULL)), err);
                        gIOPolledCoreFileOpenRet = err;
-                       return err;
+                       return;
                }
        } while (false);
 
-       err = IOPolledFilePollersSetup(gIOPolledCoreFileVars, kIOPolledPreflightCoreDumpState);
-       if (kIOReturnSuccess != err) {
-               IOPolledFileClose(&gIOPolledCoreFileVars, NULL, NULL, 0, 0, 0);
+       gIOPolledCoreFileOpenRet = IOPolledFilePollersSetup(gIOPolledCoreFileVars, kIOPolledPreflightCoreDumpState);
+       if (kIOReturnSuccess != gIOPolledCoreFileOpenRet) {
+               IOPolledFileClose(&gIOPolledCoreFileVars, 0, NULL, 0, 0, 0);
                IOLog("IOPolledFilePollersSetup for corefile failed with error: 0x%x\n", err);
-               gIOPolledCoreFileOpenRet = err;
        } else {
                IOLog("Opened corefile of size %llu MB\n", (corefile_size_bytes / (1024ULL * 1024ULL)));
+               gIOPolledCoreFileMode = mode_to_init;
        }
 
-       return err;
+       return;
 }
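The do/while(false) block above collapses the old per-configuration size ladder into a single try-then-fallback flow: attempt the ideal size, and only on kIOReturnNoSpace (when a distinct fallback size exists) retry once at the fallback size. A condensed sketch of that control-flow shape, not an actual helper in this file:

    // Hypothetical condensation of the open logic above.
    static IOReturn
    open_with_fallback(const char *path, uint64_t ideal_bytes, uint64_t fallback_bytes,
        uint64_t free_space_bytes, IOPolledFileIOVars **vars)
    {
        IOReturn err = IOPolledFileOpen(path, kIOPolledFileCreate, ideal_bytes,
            free_space_bytes, NULL, 0, vars, NULL, NULL, NULL);
        if (kIOReturnNoSpace == err && ideal_bytes != fallback_bytes) {
            // Low disk space: retry once at the smaller fallback size.
            err = IOPolledFileOpen(path, kIOPolledFileCreate, fallback_bytes,
                free_space_bytes, NULL, 0, vars, NULL, NULL, NULL);
        }
        return err;
    }

Any error other than kIOReturnNoSpace on the first attempt is final, which matches the early returns in the function above.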
 
 static void
 IOClosePolledCoreFile(void)
 {
        gIOPolledCoreFileOpenRet = kIOReturnNotOpen;
+       gIOPolledCoreFileMode = kIOPolledCoreFileModeClosed;
        IOPolledFilePollersClose(gIOPolledCoreFileVars, kIOPolledPostflightCoreDumpState);
-       IOPolledFileClose(&gIOPolledCoreFileVars, NULL, NULL, 0, 0, 0);
-}
-
-static thread_call_t gIOOpenPolledCoreFileTC;
-static IONotifier  * gIOPolledCoreFileNotifier;
-static IONotifier  * gIOPolledCoreFileInterestNotifier;
-
-static IOReturn
-KernelCoreMediaInterest(void * target, void * refCon,
-    UInt32 messageType, IOService * provider,
-    void * messageArgument, vm_size_t argSize )
-{
-       if (kIOMessageServiceIsTerminated == messageType) {
-               gIOPolledCoreFileInterestNotifier->remove();
-               gIOPolledCoreFileInterestNotifier = 0;
-               IOClosePolledCoreFile();
-       }
-
-       return kIOReturnSuccess;
-}
-
-static void
-OpenKernelCoreMedia(thread_call_param_t p0, thread_call_param_t p1)
-{
-       IOService * newService;
-       OSString  * string;
-       char        filename[16];
-
-       newService = (IOService *) p1;
-       do{
-               if (gIOPolledCoreFileVars) {
-                       break;
-               }
-               string = OSDynamicCast(OSString, newService->getProperty(kIOBSDNameKey));
-               if (!string) {
-                       break;
-               }
-               snprintf(filename, sizeof(filename), "/dev/%s", string->getCStringNoCopy());
-               if (kIOReturnSuccess != IOOpenPolledCoreFile(filename)) {
-                       break;
-               }
-               gIOPolledCoreFileInterestNotifier = newService->registerInterest(
-                       gIOGeneralInterest, &KernelCoreMediaInterest, NULL, 0);
-       }while (false);
-
-       newService->release();
-}
-
-static bool
-NewKernelCoreMedia(void * target, void * refCon,
-    IOService * newService,
-    IONotifier * notifier)
-{
-       static volatile UInt32 onlyOneCorePartition = 0;
-       do{
-               if (!OSCompareAndSwap(0, 1, &onlyOneCorePartition)) {
-                       break;
-               }
-               if (gIOPolledCoreFileVars) {
-                       break;
-               }
-               if (!gIOOpenPolledCoreFileTC) {
-                       break;
-               }
-               newService = newService->getProvider();
-               if (!newService) {
-                       break;
-               }
-               newService->retain();
-               thread_call_enter1(gIOOpenPolledCoreFileTC, newService);
-       }while (false);
-
-       return false;
+       IOPolledFileClose(&gIOPolledCoreFileVars, 0, NULL, 0, 0, 0);
 }
 
 #endif /* IOPOLLED_COREFILE */
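With the matching-notification machinery deleted, opening the corefile now happens on a thread call (see thread_call_enter1(corefile_open_call, ...) in the mount-change hunk below). The allocation site of corefile_open_call is not part of the lines shown here; presumably it is set up once, early on, along these lines using xnu's thread_call API:

    #include <kern/thread_call.h>

    // Sketch only; the real allocation site is outside this hunk.
    static thread_call_t corefile_open_call;

    static void
    corefile_open_call_init(void)
    {
        // IOOpenPolledCoreFile(__unused p0, corefilename) matches
        // thread_call_func_t; param1 carries the filename at enter time.
        corefile_open_call = thread_call_allocate(&IOOpenPolledCoreFile, NULL);
    }

Deferring the open keeps the mount notification path fast, and the thread_call_cancel_wait() added on the unmount/resize path below guarantees no in-flight open races the volume going away.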
@@ -1004,37 +973,6 @@ extern "C" void
 IOBSDMountChange(struct mount * mp, uint32_t op)
 {
 #if IOPOLLED_COREFILE
-
-       OSDictionary * bsdMatching;
-       OSDictionary * mediaMatching;
-       OSString     * string;
-
-       if (!gIOPolledCoreFileNotifier) {
-               do{
-                       if (!gIOOpenPolledCoreFileTC) {
-                               gIOOpenPolledCoreFileTC = thread_call_allocate(&OpenKernelCoreMedia, NULL);
-                       }
-                       bsdMatching = IOService::serviceMatching("IOMediaBSDClient");
-                       if (!bsdMatching) {
-                               break;
-                       }
-                       mediaMatching = IOService::serviceMatching("IOMedia");
-                       string = OSString::withCStringNoCopy("5361644D-6163-11AA-AA11-00306543ECAC");
-                       if (!string || !mediaMatching) {
-                               break;
-                       }
-                       mediaMatching->setObject("Content", string);
-                       string->release();
-                       bsdMatching->setObject(gIOParentMatchKey, mediaMatching);
-                       mediaMatching->release();
-
-                       gIOPolledCoreFileNotifier = IOService::addMatchingNotification(
-                               gIOFirstMatchNotification, bsdMatching,
-                               &NewKernelCoreMedia, NULL, NULL, -1000);
-               }while (false);
-       }
-
-#if CONFIG_EMBEDDED
        uint64_t flags;
        char path[128];
        int pathLen;
@@ -1080,17 +1018,18 @@ IOBSDMountChange(struct mount * mp, uint32_t op)
                if (0 != strncmp(path, kIOCoreDumpPath, pathLen - 1)) {
                        break;
                }
-               IOOpenPolledCoreFile(kIOCoreDumpPath);
+
+               thread_call_enter1(corefile_open_call, (void *) kIOCoreDumpPath);
                break;
 
        case kIOMountChangeUnmount:
        case kIOMountChangeWillResize:
                if (gIOPolledCoreFileVars && (mp == kern_file_mount(gIOPolledCoreFileVars->fileRef))) {
+                       thread_call_cancel_wait(corefile_open_call);
                        IOClosePolledCoreFile();
                }
                break;
        }
-#endif /* CONFIG_EMBEDDED */
 #endif /* IOPOLLED_COREFILE */
 }
 
index b38d804d8b06869b9a97b1eb04083790de69aabd..5d030959c73f7dc0e438b7fab373335956ab5a57 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -37,7 +37,7 @@ IOSKCopyKextIdentifierWithAddress( vm_address_t address );
 const OSSymbol *
 IOSKCopyKextIdentifierWithAddress( vm_address_t address )
 {
-       const OSSymbol * id = 0;
+       const OSSymbol * id = NULL;
 
        OSKext * kext = OSKext::lookupKextWithAddress(address);
        if (kext) {
index 7bd79d9ae78b20528784ccb5d5684d4a6bcb579f..05c4b79cf0cdde381cf245f419ccc25af4fec458 100644 (file)
@@ -23,7 +23,7 @@ endif
 
 $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS)
        $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG)
-       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG);
+       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG)
 
 do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
        $(_v)${MAKE} \
@@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
                SOURCE=$(subst conf/,,$(SOURCE))                        \
                TARGET=${TARGET}                                        \
                OBJPATH=${OBJPATH}                                      \
-               build_all;
+               build_all
 
 do_build_all:: do_all
 
index c29d59de6ea65a2e194cace8dedd74de64504f92..b58cd7ee8d8d2cc2183fd182a31a160d41a50d23 100644 (file)
@@ -17,7 +17,7 @@ include $(MakeInc_def)
 # XXX: CFLAGS
 #
 CFLAGS+= -include meta_features.h -DDRIVER_PRIVATE             \
-       -DIOKIT_KERNEL_PRIVATE -DIOMATCHDEBUG=1 -DIOALLOCDEBUG=1
+       -DIOKIT_KERNEL_PRIVATE -DDRIVERKIT_PRIVATE=1 -DIOMATCHDEBUG=1 -DIOALLOCDEBUG=1
 SFLAGS+= -include meta_features.h
 #-DIOKITDEBUG=-1
 
@@ -48,9 +48,10 @@ IOKitBSDInit.cpo_CXXWARNFLAGS_ADD += -Wno-missing-prototypes -Wno-documentation
 IOPMrootDomain.cpo_CXXWARNFLAGS_ADD += -Wno-missing-prototypes
 
 #
-# Directories for mig generated files
+# Directories for generated files
 #
-COMP_SUBDIRS =
+COMP_SUBDIRS = \
+       DriverKit
 
 #
 #  Make sure we don't remove this by accident if interrupted at the wrong
@@ -96,9 +97,9 @@ $(COMPONENT).filelist: $(OBJS)
                 $(SEG_HACK) -n __HIB -o $${hib_file}__ $${hib_file} || exit 1; \
                 mv $${hib_file}__ $${hib_file} || exit 1; \
        done
-       @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)"
+       $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0))
        $(_v)for obj in ${OBJS}; do     \
-                echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
+                $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
        done > $(COMPONENT).filelist
 
 do_all: $(COMPONENT).filelist
index 5af89b2b08c6ba7b56a856a2a851c9aac480103d..9b2578710346dea730f418479d65731b867245a9 100644 (file)
@@ -8,6 +8,21 @@ OPTIONS/crypto                                         optional crypto
 OPTIONS/config_dtrace                          optional config_dtrace
 OPTIONS/mach_assert                            optional mach_assert
 
+# iig generated implementation files
+
+./DriverKit/OSObject.iig.cpp           optional iokitcpp
+./DriverKit/OSAction.iig.cpp           optional iokitcpp
+./DriverKit/IOService.iig.cpp          optional iokitcpp
+./DriverKit/IOUserClient.iig.cpp               optional iokitcpp
+./DriverKit/IOMemoryDescriptor.iig.cpp         optional iokitcpp
+./DriverKit/IOBufferMemoryDescriptor.iig.cpp           optional iokitcpp
+./DriverKit/IOMemoryMap.iig.cpp                optional iokitcpp
+./DriverKit/IODispatchSource.iig.cpp           optional iokitcpp
+./DriverKit/IODispatchQueue.iig.cpp            optional iokitcpp
+./DriverKit/IOInterruptDispatchSource.iig.cpp          optional iokitcpp
+./DriverKit/IODataQueueDispatchSource.iig.cpp          optional iokitcpp
+./DriverKit/IOUserServer.iig.cpp               optional iokitcpp
+
 # libIOKit
 
 iokit/Kernel/IOHibernateIO.cpp                         optional hibernation
@@ -89,6 +104,8 @@ iokit/Kernel/IORTC.cpp                               optional iokitcpp
 
 iokit/Kernel/IOStringFuncs.c                           standard
 
+iokit/Kernel/IOUserServer.cpp                          optional iokitcpp
+
 # Property tables for kernel-linked objects
 
 iokit/KernelConfigTables.cpp                           optional iokitcpp
index 85cf4998b3792d85ce01eeec8b4f5cb953144e27..f00a3be8fd027cf46a406a76998602f06ec80dec 100644 (file)
@@ -436,45 +436,47 @@ struct kcdata_type_definition {
  * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes
  * in STACKSHOT_KCTYPE_* types.
  */
-#define STACKSHOT_KCTYPE_IOSTATS 0x901u                   /* io_stats_snapshot */
-#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u          /* struct mem_and_io_snapshot */
-#define STACKSHOT_KCCONTAINER_TASK 0x903u
-#define STACKSHOT_KCCONTAINER_THREAD 0x904u
-#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u             /* task_snapshot_v2 */
-#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u           /* thread_snapshot_v2, thread_snapshot_v3 */
-#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u             /* int[] */
-#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u      /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
-#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u               /* char[] */
-#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au           /* struct stack_snapshot_frame32 */
-#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu         /* struct stack_snapshot_frame64 */
-#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu           /* struct stack_snapshot_frame32 */
-#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du         /* struct stack_snapshot_frame64 */
-#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu                  /* boot args string */
-#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu                 /* os version string */
-#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u            /* kernel page size in uint32_t */
-#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u              /* jetsam level in uint32_t */
-#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u     /* timestamp used for the delta stackshot */
-#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u              /* uint32_t */
-#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u            /* uint64_t */
-#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u              /* uint32_t */
-#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u            /* uint64_t */
-#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u          /* uint64_t */
-#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u         /* uint64_t */
-#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u                 /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */
-#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au        /* struct stackshot_duration */
-#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu     /* struct stackshot_fault_stats */
-#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO  0x91cu     /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
-#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du           /* struct stackshot_thread_waitinfo */
-#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu     /* struct thread_group_snapshot or thread_group_snapshot_v2 */
-#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu              /* uint64_t */
-#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */
-#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u          /* uint64_t */
-#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u     /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */
-#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u             /* struct instrs_cycles_snapshot */
-#define STACKSHOT_KCTYPE_USER_STACKTOP 0x924u             /* struct stack_snapshot_stacktop */
-#define STACKSHOT_KCTYPE_ASID 0x925u                      /* uint32_t */
-#define STACKSHOT_KCTYPE_PAGE_TABLES 0x926u               /* uint64_t */
-#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT 0x927u    /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_IOSTATS                     0x901u /* io_stats_snapshot */
+#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS            0x902u /* struct mem_and_io_snapshot */
+#define STACKSHOT_KCCONTAINER_TASK                   0x903u
+#define STACKSHOT_KCCONTAINER_THREAD                 0x904u
+#define STACKSHOT_KCTYPE_TASK_SNAPSHOT               0x905u /* task_snapshot_v2 */
+#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT             0x906u /* thread_snapshot_v2, thread_snapshot_v3 */
+#define STACKSHOT_KCTYPE_DONATING_PIDS               0x907u /* int[] */
+#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO        0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_THREAD_NAME                 0x909u /* char[] */
+#define STACKSHOT_KCTYPE_KERN_STACKFRAME             0x90Au /* struct stack_snapshot_frame32 */
+#define STACKSHOT_KCTYPE_KERN_STACKFRAME64           0x90Bu /* struct stack_snapshot_frame64 */
+#define STACKSHOT_KCTYPE_USER_STACKFRAME             0x90Cu /* struct stack_snapshot_frame32 */
+#define STACKSHOT_KCTYPE_USER_STACKFRAME64           0x90Du /* struct stack_snapshot_frame64 */
+#define STACKSHOT_KCTYPE_BOOTARGS                    0x90Eu /* boot args string */
+#define STACKSHOT_KCTYPE_OSVERSION                   0x90Fu /* os version string */
+#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE              0x910u /* kernel page size in uint32_t */
+#define STACKSHOT_KCTYPE_JETSAM_LEVEL                0x911u /* jetsam level in uint32_t */
+#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP       0x912u /* timestamp used for the delta stackshot */
+#define STACKSHOT_KCTYPE_KERN_STACKLR                0x913u /* uint32_t */
+#define STACKSHOT_KCTYPE_KERN_STACKLR64              0x914u /* uint64_t */
+#define STACKSHOT_KCTYPE_USER_STACKLR                0x915u /* uint32_t */
+#define STACKSHOT_KCTYPE_USER_STACKLR64              0x916u /* uint64_t */
+#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS            0x917u /* uint64_t */
+#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS           0x918u /* uint64_t */
+#define STACKSHOT_KCTYPE_CPU_TIMES                   0x919u /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */
+#define STACKSHOT_KCTYPE_STACKSHOT_DURATION          0x91au /* struct stackshot_duration */
+#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS       0x91bu /* struct stackshot_fault_stats */
+#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO        0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_THREAD_WAITINFO             0x91du /* struct stackshot_thread_waitinfo */
+#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT       0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */
+#define STACKSHOT_KCTYPE_THREAD_GROUP                0x91fu /* uint64_t */
+#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT   0x920u /* struct jetsam_coalition_snapshot */
+#define STACKSHOT_KCTYPE_JETSAM_COALITION            0x921u /* uint64_t */
+#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION       0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */
+#define STACKSHOT_KCTYPE_INSTRS_CYCLES               0x923u /* struct instrs_cycles_snapshot */
+#define STACKSHOT_KCTYPE_USER_STACKTOP               0x924u /* struct stack_snapshot_stacktop */
+#define STACKSHOT_KCTYPE_ASID                        0x925u /* uint32_t */
+#define STACKSHOT_KCTYPE_PAGE_TABLES                 0x926u /* uint64_t */
+#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT      0x927u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL 0x928u /* dispatch queue label */
+#define STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO        0x929u /* struct stackshot_thread_turnstileinfo */
 
 #define STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT 0x940u   /* task_delta_snapshot_v2 */
 #define STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT 0x941u /* thread_delta_snapshot_v* */
@@ -517,6 +519,7 @@ struct user64_dyld_uuid_info {
 };
 
 enum task_snapshot_flags {
+       /* k{User,Kernel}64_p (values 0x1 and 0x2) are defined in generic_snapshot_flags */
        kTaskRsrcFlagged                      = 0x4, // In the EXC_RESOURCE danger zone?
        kTerminatedSnapshot                   = 0x8,
        kPidSuspended                         = 0x10, // true for suspended task
@@ -546,6 +549,7 @@ enum task_snapshot_flags {
 };
 
 enum thread_snapshot_flags {
+       /* k{User,Kernel}64_p (values 0x1 and 0x2) are defined in generic_snapshot_flags */
        kHasDispatchSerial    = 0x4,
        kStacksPCOnly         = 0x8,    /* Stack traces have no frame pointers. */
        kThreadDarwinBG       = 0x10,   /* Thread is darwinbg */
@@ -814,6 +818,18 @@ typedef struct stackshot_thread_waitinfo {
        uint8_t wait_type;      /* The type of object that the thread is waiting on */
 } __attribute__((packed)) thread_waitinfo_t;
 
+typedef struct stackshot_thread_turnstileinfo {
+       uint64_t waiter;        /* The thread that's waiting on the object */
+       uint64_t turnstile_context; /* Associated data (either thread id, or workq addr) */
+       uint8_t turnstile_priority;
+       uint8_t number_of_hops;
+#define STACKSHOT_TURNSTILE_STATUS_UNKNOWN      (1 << 0) /* The final inheritor is unknown (bug?) */
+#define STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ (1 << 1) /* A waitq was found to be locked */
+#define STACKSHOT_TURNSTILE_STATUS_WORKQUEUE    (1 << 2) /* The final inheritor is a workqueue */
+#define STACKSHOT_TURNSTILE_STATUS_THREAD       (1 << 3) /* The final inheritor is a thread */
+       uint64_t turnstile_flags;
+} __attribute__((packed)) thread_turnstileinfo_t;
+
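The new turnstile record is read back out of a parsed stackshot, and the STACKSHOT_TURNSTILE_STATUS_* bits say how to interpret turnstile_context. A hedged sketch of a consumer (printf and the helper name are illustrative; the struct and flags come from the header above):

    #include <stdio.h>

    static void
    describe_turnstile(const thread_turnstileinfo_t *info)
    {
        if (info->turnstile_flags & STACKSHOT_TURNSTILE_STATUS_THREAD) {
            // Final inheritor is a thread: turnstile_context is its thread id.
            printf("waiter 0x%llx blocked behind thread 0x%llx (%u hops)\n",
                info->waiter, info->turnstile_context, info->number_of_hops);
        } else if (info->turnstile_flags & STACKSHOT_TURNSTILE_STATUS_WORKQUEUE) {
            // Final inheritor is a workqueue: turnstile_context is its address.
            printf("waiter 0x%llx blocked behind workqueue 0x%llx\n",
                info->waiter, info->turnstile_context);
        } else if (info->turnstile_flags & STACKSHOT_TURNSTILE_STATUS_UNKNOWN) {
            printf("waiter 0x%llx: final inheritor unknown\n", info->waiter);
        }
    }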
 #define STACKSHOT_WAITOWNER_KERNEL         (UINT64_MAX - 1)
 #define STACKSHOT_WAITOWNER_PORT_LOCKED    (UINT64_MAX - 2)
 #define STACKSHOT_WAITOWNER_PSET_LOCKED    (UINT64_MAX - 3)
@@ -895,6 +911,8 @@ struct crashinfo_proc_uniqidentifierinfo {
 #define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE               0x828 /* uint64_t */
 #define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE_COMPRESSED    0x829 /* uint64_t */
 #define TASK_CRASHINFO_LEDGER_WIRED_MEM                         0x82A /* uint64_t */
+#define TASK_CRASHINFO_PROC_PERSONA_ID                          0x82B /* uid_t */
+#define TASK_CRASHINFO_MEMORY_LIMIT_INCREASE                    0x82C /* uint32_t */
 
 
 
@@ -971,7 +989,7 @@ kcdata_iter_unsafe(void *buffer)
        return iter;
 }
 
-static const kcdata_iter_t kcdata_invalid_iter = { .item = 0, .end = 0 };
+static const kcdata_iter_t kcdata_invalid_iter = { .item = NULL, .end = NULL };
 
 static inline
 int
index e3ef22e33667abc8b174ab984a52ee9c8cbad89f..cafecb6418a27f43b71f28caaaef0bf889a2d1c2 100644 (file)
@@ -832,6 +832,24 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s
                break;
        }
 
+       case STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL: {
+               i = 0;
+               _STRINGTYPE("dispatch_queue_label");
+               setup_type_definition(retval, type_id, i, "dispatch_queue_label");
+               break;
+       }
+
+       case STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO: {
+               i = 0;
+               _SUBTYPE(KC_ST_UINT64, struct stackshot_thread_turnstileinfo, waiter);
+               _SUBTYPE(KC_ST_UINT64, struct stackshot_thread_turnstileinfo, turnstile_context);
+               _SUBTYPE(KC_ST_UINT8, struct stackshot_thread_turnstileinfo, turnstile_priority);
+               _SUBTYPE(KC_ST_UINT8, struct stackshot_thread_turnstileinfo, number_of_hops);
+               _SUBTYPE(KC_ST_UINT64, struct stackshot_thread_turnstileinfo, turnstile_flags);
+               setup_type_definition(retval, type_id, i, "thread_turnstileinfo");
+               break;
+       }
+
        default:
                retval = NULL;
                break;
index a16b8bdcaf5fd8f22d5a0b5c61253b6730b5217a..33575c38dd3c444e6dfb164ac8e0d595098a903e 100644 (file)
@@ -69,6 +69,7 @@
                08B4808C1BF9474A00B4AAE0 /* corpse-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B4808A1BF9473800B4AAE0 /* corpse-sample.plist.gz */; };
                08B9297E1C1CCE8D003B1703 /* stackshot-sample-ths-thread-t in Resources */ = {isa = PBXBuildFile; fileRef = 08B9297C1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t */; };
                08B9297F1C1CCE8D003B1703 /* stackshot-sample-ths-thread-t.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B9297D1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t.plist.gz */; };
+               08C3972F204E0A7C00BDDB3F /* xnu.libkdd.plist in CopyFiles */ = {isa = PBXBuildFile; fileRef = 08C3972E204E0A7500BDDB3F /* xnu.libkdd.plist */; };
                08C9D83D1BFFF8E100DF6C05 /* exitreason-sample in Resources */ = {isa = PBXBuildFile; fileRef = 08C9D83B1BFFF8D500DF6C05 /* exitreason-sample */; };
                08C9D83E1BFFF8E100DF6C05 /* exitreason-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08C9D83C1BFFF8D500DF6C05 /* exitreason-sample.plist.gz */; };
                08CF18FF1BF9B7B100D05813 /* stackshot-sample-tailspin in Resources */ = {isa = PBXBuildFile; fileRef = 08CF18FD1BF9B79E00D05813 /* stackshot-sample-tailspin */; };
                C9DCEF011F01C3810000BD02 /* stackshot-sample-instrs-cycles in Resources */ = {isa = PBXBuildFile; fileRef = C9DCEF001F01C3790000BD02 /* stackshot-sample-instrs-cycles */; };
                C9DCEF021F01C3810000BD02 /* stackshot-sample-instrs-cycles.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = C9DCEEFF1F01C3790000BD02 /* stackshot-sample-instrs-cycles.plist.gz */; };
                C9DE39141ACB5A540020F4A3 /* kcdata_core.m in Sources */ = {isa = PBXBuildFile; fileRef = C9DE39131ACB5A540020F4A3 /* kcdata_core.m */; };
+               F702EC6422AE50DD00FDCF74 /* stackshot-sample-dispatch-queue-label in Resources */ = {isa = PBXBuildFile; fileRef = F702EC6322AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label */; };
+               F702EC6522AE50DD00FDCF74 /* stackshot-sample-dispatch-queue-label.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = F702EC6222AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label.plist.gz */; };
+               F7C20D3322A168620002AD06 /* stackshot-sample-asid-pagetable in Resources */ = {isa = PBXBuildFile; fileRef = F7C20D3122A168610002AD06 /* stackshot-sample-asid-pagetable */; };
+               F7C20D3422A168620002AD06 /* stackshot-sample-asid-pagetable.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = F7C20D3222A168610002AD06 /* stackshot-sample-asid-pagetable.plist.gz */; };
+               F7F2F28422A159F700542597 /* stackshot-sample-turnstileinfo in Resources */ = {isa = PBXBuildFile; fileRef = F7F2F28222A159F700542597 /* stackshot-sample-turnstileinfo */; };
+               F7F2F28522A159F700542597 /* stackshot-sample-turnstileinfo.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = F7F2F28322A159F700542597 /* stackshot-sample-turnstileinfo.plist.gz */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXContainerItemProxy section */
                        );
                        runOnlyForDeploymentPostprocessing = 1;
                };
+               08C3972D204E0A5300BDDB3F /* CopyFiles */ = {
+                       isa = PBXCopyFilesBuildPhase;
+                       buildActionMask = 8;
+                       dstPath = /AppleInternal/CoreOS/BATS/unit_tests;
+                       dstSubfolderSpec = 0;
+                       files = (
+                               08C3972F204E0A7C00BDDB3F /* xnu.libkdd.plist in CopyFiles */,
+                       );
+                       runOnlyForDeploymentPostprocessing = 1;
+               };
 /* End PBXCopyFilesBuildPhase section */
 
 /* Begin PBXFileReference section */
                08B4808A1BF9473800B4AAE0 /* corpse-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "corpse-sample.plist.gz"; path = "tests/corpse-sample.plist.gz"; sourceTree = SOURCE_ROOT; };
                08B9297C1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-sample-ths-thread-t"; path = "tests/stackshot-sample-ths-thread-t"; sourceTree = SOURCE_ROOT; };
                08B9297D1C1CCE7E003B1703 /* stackshot-sample-ths-thread-t.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-ths-thread-t.plist.gz"; path = "tests/stackshot-sample-ths-thread-t.plist.gz"; sourceTree = SOURCE_ROOT; };
+               08C3972E204E0A7500BDDB3F /* xnu.libkdd.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = xnu.libkdd.plist; sourceTree = "<group>"; };
                08C9D83B1BFFF8D500DF6C05 /* exitreason-sample */ = {isa = PBXFileReference; lastKnownFileType = file; name = "exitreason-sample"; path = "tests/exitreason-sample"; sourceTree = SOURCE_ROOT; };
                08C9D83C1BFFF8D500DF6C05 /* exitreason-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "exitreason-sample.plist.gz"; path = "tests/exitreason-sample.plist.gz"; sourceTree = SOURCE_ROOT; };
                08CF18FD1BF9B79E00D05813 /* stackshot-sample-tailspin */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-sample-tailspin"; path = "tests/stackshot-sample-tailspin"; sourceTree = SOURCE_ROOT; };
                C9DCEEFF1F01C3790000BD02 /* stackshot-sample-instrs-cycles.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-instrs-cycles.plist.gz"; sourceTree = "<group>"; };
                C9DCEF001F01C3790000BD02 /* stackshot-sample-instrs-cycles */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-instrs-cycles"; sourceTree = "<group>"; };
                C9DE39131ACB5A540020F4A3 /* kcdata_core.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = kcdata_core.m; sourceTree = "<group>"; };
+               F702EC6222AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-dispatch-queue-label.plist.gz"; sourceTree = "<group>"; };
+               F702EC6322AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-dispatch-queue-label"; sourceTree = "<group>"; };
+               F7C20D3122A168610002AD06 /* stackshot-sample-asid-pagetable */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-asid-pagetable"; sourceTree = "<group>"; };
+               F7C20D3222A168610002AD06 /* stackshot-sample-asid-pagetable.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-asid-pagetable.plist.gz"; sourceTree = "<group>"; };
+               F7F2F28222A159F700542597 /* stackshot-sample-turnstileinfo */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-turnstileinfo"; sourceTree = "<group>"; };
+               F7F2F28322A159F700542597 /* stackshot-sample-turnstileinfo.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-turnstileinfo.plist.gz"; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
                08603F351BF69EDE007D3784 /* tests */ = {
                        isa = PBXGroup;
                        children = (
+                               F7F2F28222A159F700542597 /* stackshot-sample-turnstileinfo */,
+                               F7F2F28322A159F700542597 /* stackshot-sample-turnstileinfo.plist.gz */,
+                               F702EC6322AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label */,
+                               F702EC6222AE500E00FDCF74 /* stackshot-sample-dispatch-queue-label.plist.gz */,
                                13739E8420DB18B600D8D9B9 /* stackshot-with-shared-cache-layout */,
                                13739E8320DB18B500D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz */,
                                C95E4D19204F42C500FD2229 /* stackshot-sample-cpu-times */,
                                08603F3F1BF69F44007D3784 /* kdd_bridge.h */,
                                0843EE911BF6AFB700CD4150 /* stackshot-sample */,
                                08603F361BF69EDE007D3784 /* Tests.swift */,
+                               F7C20D3122A168610002AD06 /* stackshot-sample-asid-pagetable */,
+                               F7C20D3222A168610002AD06 /* stackshot-sample-asid-pagetable.plist.gz */,
                                0843EE931BF6BAB400CD4150 /* stackshot-sample.plist.gz */,
                                08B480741BF8294E00B4AAE0 /* stackshot-sample-new-arrays */,
                                08B480751BF8294E00B4AAE0 /* stackshot-sample-new-arrays.plist.gz */,
                C91C93BE1ACB58B700119B60 = {
                        isa = PBXGroup;
                        children = (
+                               08C3972E204E0A7500BDDB3F /* xnu.libkdd.plist */,
                                08DE68361BFFB71D00BC682F /* kdd */,
                                08DE68351BFFB70900BC682F /* libkdd */,
                                08F1501D1BFEA7AC00F2C89C /* libz.dylib */,
                                08603F301BF69EDE007D3784 /* Sources */,
                                08603F311BF69EDE007D3784 /* Frameworks */,
                                08603F321BF69EDE007D3784 /* Resources */,
+                               08C3972D204E0A5300BDDB3F /* CopyFiles */,
                        );
                        buildRules = (
                        );
                C91C93BF1ACB58B700119B60 /* Project object */ = {
                        isa = PBXProject;
                        attributes = {
+                               DefaultBuildSystemTypeForWorkspace = Latest;
                                LastSwiftUpdateCheck = 0730;
                                LastUpgradeCheck = 0830;
                                ORGANIZATIONNAME = "Vishal Patel";
                        developmentRegion = English;
                        hasScannedForEncodings = 0;
                        knownRegions = (
+                               English,
                                en,
                        );
                        mainGroup = C91C93BE1ACB58B700119B60;
                        isa = PBXResourcesBuildPhase;
                        buildActionMask = 2147483647;
                        files = (
+                               F7F2F28422A159F700542597 /* stackshot-sample-turnstileinfo in Resources */,
                                13739E8620DB18B600D8D9B9 /* stackshot-with-shared-cache-layout in Resources */,
                                084422F82048BABB008A085B /* stackshot-sample-asid in Resources */,
                                084422F92048BABB008A085B /* stackshot-sample-asid.plist.gz in Resources */,
                                18C577C31F96DB5200C67EB3 /* stackshot-sample-thread-groups-flags in Resources */,
                                C9DCEF011F01C3810000BD02 /* stackshot-sample-instrs-cycles in Resources */,
                                C9DCEF021F01C3810000BD02 /* stackshot-sample-instrs-cycles.plist.gz in Resources */,
+                               F7F2F28522A159F700542597 /* stackshot-sample-turnstileinfo.plist.gz in Resources */,
                                088C36E01EF323C300ABB2E0 /* stackshot-sample-thread-policy in Resources */,
                                088C36E11EF323C300ABB2E0 /* stackshot-sample-thread-policy.plist.gz in Resources */,
                                045F7F131D2ADE8000B4808B /* stackshot-with-waitinfo.plist.gz in Resources */,
                                13D6C5D01C4DDDB6005E617C /* corpse-twr-sample in Resources */,
                                C95E4D1B204F42C500FD2229 /* stackshot-sample-cpu-times in Resources */,
                                C9D7B5401D1B41D700F1019D /* xnupost_testconfig-sample in Resources */,
+                               F7C20D3322A168620002AD06 /* stackshot-sample-asid-pagetable in Resources */,
                                081725D51C3F476500371A54 /* stackshot-sample-duration in Resources */,
                                08F2AC0B1FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz in Resources */,
                                081725D61C3F476500371A54 /* stackshot-sample-duration.plist.gz in Resources */,
                                13DBA2681CAB1AD600227EB2 /* stackshot-sample-sharedcachev2.plist.gz in Resources */,
                                08CF19001BF9B7B100D05813 /* stackshot-sample-tailspin.plist.gz in Resources */,
                                13CC08441CB97F8D00EA6069 /* stackshot-fault-stats in Resources */,
+                               F702EC6522AE50DD00FDCF74 /* stackshot-sample-dispatch-queue-label.plist.gz in Resources */,
                                13F3DA9C1C7C1BEE00ACFFCC /* corpse-twr-sample-v2 in Resources */,
                                13D6C5D31C4DDE0D005E617C /* test-twr-sample.plist.gz in Resources */,
+                               F7C20D3422A168620002AD06 /* stackshot-sample-asid-pagetable.plist.gz in Resources */,
                                1862B0351E7A083F0005ADF4 /* stackshot-sample-thread-groups.plist.gz in Resources */,
                                1368F0851C87E06A00940FC6 /* exitreason-codesigning.plist.gz in Resources */,
                                08C9D83D1BFFF8E100DF6C05 /* exitreason-sample in Resources */,
                                08B4807A1BF8297500B4AAE0 /* stackshot-sample-old-arrays in Resources */,
                                08B4807B1BF8297500B4AAE0 /* stackshot-sample-old-arrays.plist.gz in Resources */,
                                0843EE941BF6BAC100CD4150 /* stackshot-sample.plist.gz in Resources */,
+                               F702EC6422AE50DD00FDCF74 /* stackshot-sample-dispatch-queue-label in Resources */,
                                0843EE921BF6AFC600CD4150 /* stackshot-sample in Resources */,
                                08AD0BF11FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz in Resources */,
                                18E592991E9451A20018612A /* stackshot-sample-coalitions.plist.gz in Resources */,
index e073f85a406950ee8f9c068c509f024da6a3f4bb..34f8d9e8eeac36a4cc4cf0b119fe20fe8f351307 100644 (file)
@@ -734,6 +734,66 @@ class Tests: XCTestCase {
         XCTAssert(dict.value(forKeyPath: "kcdata_crashinfo.task_snapshots.0.crashed_threadid")  as? Int == 42)
     }
 
+    func testDispatchQueueLabel() {
+        let buffer = NSMutableData(capacity:1000)!
+
+        var item = kcdata_item()
+        let dql = "houston.we.had.a.problem"
+        var payload32 : UInt32
+
+        item.type = KCDATA_BUFFER_BEGIN_STACKSHOT
+        item.flags = 0
+        item.size = 0
+        buffer.append(&item, length: MemoryLayout<kcdata_item>.size)
+
+        item.type = UInt32(KCDATA_TYPE_CONTAINER_BEGIN)
+        item.flags = 0
+        item.size = UInt32(MemoryLayout<UInt32>.size)
+        buffer.append(&item, length: MemoryLayout<kcdata_item>.size)
+        payload32 = UInt32(STACKSHOT_KCCONTAINER_TASK)
+        buffer.append(&payload32, length:MemoryLayout<UInt32>.size)
+
+        item.type = UInt32(KCDATA_TYPE_CONTAINER_BEGIN)
+        item.flags = 0
+        item.size = UInt32(MemoryLayout<UInt32>.size)
+        buffer.append(&item, length: MemoryLayout<kcdata_item>.size)
+        payload32 = UInt32(STACKSHOT_KCCONTAINER_THREAD)
+        buffer.append(&payload32, length:MemoryLayout<UInt32>.size)
+
+        item.type = UInt32(STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL)
+        item.flags = 0
+        item.size = UInt32(dql.utf8.count + 1)
+        buffer.append(&item, length: MemoryLayout<kcdata_item>.size)
+        dql.utf8CString.withUnsafeBufferPointer({
+            buffer.append($0.baseAddress!, length:dql.utf8.count + 1)
+        })
+
+        item.type = UInt32(KCDATA_TYPE_CONTAINER_END)
+        item.flags = 0
+        item.size = UInt32(MemoryLayout<UInt32>.size)
+        buffer.append(&item, length: MemoryLayout<kcdata_item>.size)
+        payload32 = UInt32(STACKSHOT_KCCONTAINER_THREAD)
+        buffer.append(&payload32, length:MemoryLayout<UInt32>.size)
+
+        item.type = UInt32(KCDATA_TYPE_CONTAINER_END)
+        item.flags = 0
+        item.size = UInt32(MemoryLayout<UInt32>.size)
+        buffer.append(&item, length: MemoryLayout<kcdata_item>.size)
+        payload32 = UInt32(STACKSHOT_KCCONTAINER_TASK)
+        buffer.append(&payload32, length:MemoryLayout<UInt32>.size)
+
+
+        item.type = KCDATA_TYPE_BUFFER_END
+        item.flags = 0
+        item.size = 0
+        buffer.append(&item, length: MemoryLayout<kcdata_item>.size)
+
+        guard let dict = try? self.parseBuffer(buffer)
+            else { XCTFail(); return; }
+
+        XCTAssert(dict.value(forKeyPath: "kcdata_stackshot.task_snapshots.0.thread_snapshots.0.dispatch_queue_label")  as? String == dql)
+    }
+
     func testRepeatedContainer() {
        // repeated container of same name and key should fail
 
@@ -1348,6 +1408,10 @@ class Tests: XCTestCase {
         self.testSampleStackshot("stackshot-sample-coalitions")
     }
 
+    func testSampleTurnstileInfo() {
+        self.testSampleStackshot("stackshot-sample-turnstileinfo")
+    }
+
     func testStackshotSharedcacheV2() {
         self.testSampleStackshot("stackshot-sample-sharedcachev2")
     }
@@ -1400,6 +1464,10 @@ class Tests: XCTestCase {
         self.testSampleStackshot("stackshot-with-shared-cache-layout")
     }
 
+    func testStackshotDispatchQueueLabel() {
+        self.testSampleStackshot("stackshot-sample-dispatch-queue-label")
+    }
+
     func testTrivial() {
     }
 }
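testDispatchQueueLabel above hand-assembles a kcdata stream: every record is a fixed 16-byte item header followed by item.size bytes of payload, and task/thread containers nest by bracketing records with CONTAINER_BEGIN/END pairs whose payload names the container type. The header layout, as declared in kcdata.h:

    struct kcdata_item {
        uint32_t type;   /* e.g. STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL */
        uint32_t size;   /* payload length in bytes */
        uint64_t flags;
        char     data[]; /* payload follows the header */
    };

So the test's buffer reads, in order: BUFFER_BEGIN_STACKSHOT, CONTAINER_BEGIN(task), CONTAINER_BEGIN(thread), the NUL-terminated queue label, CONTAINER_END(thread), CONTAINER_END(task), BUFFER_END, which is why the parser surfaces the label at kcdata_stackshot.task_snapshots.0.thread_snapshots.0.dispatch_queue_label.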
index d6691bafbd5e51ea0dc6eae7a0706196dbd0ed76..fb2f484879bf77af6e8bfd9ff33956ecacf4c50d 100644 (file)
@@ -9,8 +9,8 @@
 #ifndef kdd_bridge_h
 #define kdd_bridge_h
 
-#import "kdd.h"
-#include "kcdata.h"
+#include <kdd/kdd.h>
+#include <kdd/kcdata.h>
 #include <zlib.h>
 
 #endif /* kdd_bridge_h */
diff --git a/libkdd/tests/stackshot-sample-dispatch-queue-label b/libkdd/tests/stackshot-sample-dispatch-queue-label
new file mode 100644 (file)
index 0000000..b57d3fb
Binary files /dev/null and b/libkdd/tests/stackshot-sample-dispatch-queue-label differ
diff --git a/libkdd/tests/stackshot-sample-dispatch-queue-label.plist.gz b/libkdd/tests/stackshot-sample-dispatch-queue-label.plist.gz
new file mode 100644 (file)
index 0000000..2d91577
Binary files /dev/null and b/libkdd/tests/stackshot-sample-dispatch-queue-label.plist.gz differ
diff --git a/libkdd/tests/stackshot-sample-turnstileinfo b/libkdd/tests/stackshot-sample-turnstileinfo
new file mode 100644 (file)
index 0000000..4af0879
Binary files /dev/null and b/libkdd/tests/stackshot-sample-turnstileinfo differ
diff --git a/libkdd/tests/stackshot-sample-turnstileinfo.plist.gz b/libkdd/tests/stackshot-sample-turnstileinfo.plist.gz
new file mode 100644 (file)
index 0000000..76c6617
Binary files /dev/null and b/libkdd/tests/stackshot-sample-turnstileinfo.plist.gz differ
diff --git a/libkdd/xnu.libkdd.plist b/libkdd/xnu.libkdd.plist
new file mode 100644 (file)
index 0000000..d6c6149
--- /dev/null
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>Project</key>
+       <string>libkdd</string>
+
+       <key>RadarComponents</key>
+       <dict>
+               <key>Name</key>
+               <string>xnu</string>
+               <key>Version</key>
+               <string>debugging</string>
+       </dict>
+
+
+       <key>Tests</key>
+       <array>
+               <dict>
+                       <key>TestName</key>
+                       <string>libkdd_tests</string>
+                       <key>AsRoot</key>
+                       <false/>
+                       <key>Arch</key>
+                       <string>platform-native</string>
+                       <key>Command</key>
+                       <array>
+                               <string>BATS_XCTEST_CMD</string>
+                               <string>/AppleInternal/XCTests/com.apple.libkdd/tests.xctest</string>
+                       </array>
+               </dict>
+       </array>
+
+</dict>
+</plist>
index 4fff0ba813a182d45c32fcdba0e7988c646dc707..5a887bc797b6167f38695976c4d810204b67feb3 100644 (file)
@@ -190,9 +190,12 @@ OSKextCancelRequest(
 #pragma mark MIG Functions & Wrappers
 #endif
 /*********************************************************************
-* IMPORTANT: Once we have done the vm_map_copyout(), we *must* return
-* KERN_SUCCESS or the kernel map gets messed up (reason as yet
-* unknown). We use op_result to return the real result of our work.
+* IMPORTANT: vm_map_copyout_size() consumes the requestIn copy
+* object on success. Therefore once it has been invoked successfully,
+* this routine *must* return KERN_SUCCESS, regardless of our actual
+* result. Our contract with the caller is that requestIn must be
+* caller-deallocated if we return an error. We use op_result to return
+* the real result of our work.
 *********************************************************************/
 kern_return_t
 kext_request(
@@ -222,9 +225,9 @@ kext_request(
         * just in case, or MIG will try to copy out bogus data.
         */
        *op_result = KERN_FAILURE;
-       *responseOut = NULL;
+       *responseOut = 0;
        *responseLengthOut = 0;
-       *logDataOut = NULL;
+       *logDataOut = 0;
        *logDataLengthOut = 0;
 
        /* Check for input. Don't discard what isn't there, though.
@@ -238,17 +241,17 @@ kext_request(
                goto finish;
        }
 
-       /* Once we have done the vm_map_copyout(), we *must* return KERN_SUCCESS
-        * or the kernel map gets messed up (reason as yet unknown). We will use
-        * op_result to return the real result of our work.
-        */
-       result = vm_map_copyout(kernel_map, &map_addr, (vm_map_copy_t)requestIn);
+       result = vm_map_copyout_size(kernel_map, &map_addr, (vm_map_copy_t)requestIn, requestLengthIn);
        if (result != KERN_SUCCESS) {
                OSKextLog(/* kext */ NULL,
                    kOSKextLogErrorLevel |
                    kOSKextLogIPCFlag,
                    "vm_map_copyout() failed for request from user space.");
-               vm_map_copy_discard((vm_map_copy_t)requestIn);
+               /*
+                * If we return an error it is our caller's responsibility to
+                * deallocate the requestIn copy object, so do not deallocate it
+                * here. See comment above.
+                */
                goto finish;
        }
        request = CAST_DOWN(char *, map_addr);
@@ -314,7 +317,7 @@ kext_request(
                            kOSKextLogIPCFlag,
                            "Failed to copy response to request from user space.");
                        *op_result = copyin_result; // xxx - should we map to our own code?
-                       *responseOut = NULL;
+                       *responseOut = 0;
                        *responseLengthOut = 0;
                        goto finish;
                }
@@ -334,7 +337,7 @@ kext_request(
                            kOSKextLogIPCFlag,
                            "Failed to copy log data for request from user space.");
                        *op_result = copyin_result; // xxx - should we map to our own code?
-                       *logDataOut = NULL;
+                       *logDataOut = 0;
                        *logDataLengthOut = 0;
                        goto finish;
                }
@@ -392,7 +395,7 @@ kext_weak_symbol_referenced(void)
        panic("A kext referenced an unresolved weak symbol\n");
 }
 
-const void *gOSKextUnresolved = (const void *)&kext_weak_symbol_referenced;
+const void * const gOSKextUnresolved = (const void *)&kext_weak_symbol_referenced;
 
 #if PRAGMA_MARK
 #pragma mark Kernel-Internal C Functions
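The reworked comment and error path above encode a MIG ownership rule worth spelling out: vm_map_copyout_size() consumes the copy object only on success. A minimal skeleton of that contract, with do_real_work() as a hypothetical stand-in for the request processing:

    static kern_return_t
    mig_style_handler(vm_map_copy_t requestIn, vm_map_size_t requestLengthIn,
        kern_return_t *op_result)
    {
        vm_map_address_t map_addr = 0;
        kern_return_t kr = vm_map_copyout_size(kernel_map, &map_addr,
            requestIn, requestLengthIn);
        if (kr != KERN_SUCCESS) {
            // Not consumed: returning an error tells the MIG layer that the
            // caller still owns (and must deallocate) requestIn.
            return kr;
        }
        // Consumed: from here on we must return KERN_SUCCESS and report the
        // real outcome through *op_result instead.
        *op_result = do_real_work(map_addr, requestLengthIn); // hypothetical helper
        vm_deallocate(kernel_map, map_addr, requestLengthIn);
        return KERN_SUCCESS;
    }

This is also why the hunk drops the old vm_map_copy_discard() on the failure path: discarding there and then returning an error would free the copy object twice once the caller deallocates it as well.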
index 92558ac4ccb828833deab6b36c4c31784688ade9..cd04323f3e73a866ed1e300ebda94aedfdf4ae95 100644 (file)
@@ -133,7 +133,7 @@ OSArray::withCapacity(unsigned int capacity)
 
        if (me && !me->initWithCapacity(capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -148,7 +148,7 @@ OSArray::withObjects(const OSObject *objects[],
 
        if (me && !me->initWithObjects(objects, count, capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -162,7 +162,7 @@ OSArray::withArray(const OSArray *array,
 
        if (me && !me->initWithArray(array, capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -403,7 +403,7 @@ OSObject *
 OSArray::getObject(unsigned int index) const
 {
        if (index >= count) {
-               return 0;
+               return NULL;
        } else {
                return (OSObject *) (const_cast<OSMetaClassBase *>(array[index]));
        }
@@ -413,7 +413,7 @@ OSObject *
 OSArray::getLastObject() const
 {
        if (count == 0) {
-               return 0;
+               return NULL;
        } else {
                return (OSObject *) (const_cast<OSMetaClassBase *>(array[count - 1]));
        }
@@ -457,7 +457,7 @@ OSArray::getNextObjectForIterator(void *inIterator, OSObject **ret) const
                *ret = (OSObject *)(const_cast<OSMetaClassBase *> (array[index]));
                return true;
        } else {
-               *ret = 0;
+               *ret = NULL;
                return false;
        }
 }
@@ -503,13 +503,13 @@ OSCollection *
 OSArray::copyCollection(OSDictionary *cycleDict)
 {
        bool allocDict = !cycleDict;
-       OSCollection *ret = 0;
-       OSArray *newArray = 0;
+       OSCollection *ret = NULL;
+       OSArray *newArray = NULL;
 
        if (allocDict) {
                cycleDict = OSDictionary::withCapacity(16);
                if (!cycleDict) {
-                       return 0;
+                       return NULL;
                }
        }
 
@@ -546,7 +546,7 @@ OSArray::copyCollection(OSDictionary *cycleDict)
                ;
 
                ret = newArray;
-               newArray = 0;
+               newArray = NULL;
        } while (false);
 
 abortCopy:
index cabe30ab7ade9b97b901408413bfde1579883289..6918e6954ad3feea5d29ca6b908d2e1a67db7ecc 100644 (file)
@@ -44,8 +44,8 @@ OSMetaClassDefineReservedUnused(OSBoolean, 5);
 OSMetaClassDefineReservedUnused(OSBoolean, 6);
 OSMetaClassDefineReservedUnused(OSBoolean, 7);
 
-static OSBoolean * gOSBooleanTrue  = 0;
-static OSBoolean * gOSBooleanFalse = 0;
+static OSBoolean * gOSBooleanTrue  = NULL;
+static OSBoolean * gOSBooleanFalse = NULL;
 
 OSBoolean * const & kOSBooleanTrue  = gOSBooleanTrue;
 OSBoolean * const & kOSBooleanFalse = gOSBooleanFalse;
index 93a2433e60ab554b3f3bda6caf8624834f481796..cc60901c8a388380d9b0a6426b54482dc3541ef3 100644 (file)
@@ -45,7 +45,7 @@ OSCollectionIterator::initWithCollection(const OSCollection *inColl)
 
        inColl->retain();
        collection = inColl;
-       collIterator = 0;
+       collIterator = NULL;
        initialUpdateStamp = 0;
        valid = false;
 
@@ -59,7 +59,7 @@ OSCollectionIterator::withCollection(const OSCollection *inColl)
 
        if (me && !me->initWithCollection(inColl)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -71,12 +71,12 @@ OSCollectionIterator::free()
        if (collIterator) {
                kfree(collIterator, collection->iteratorSize());
                OSCONTAINER_ACCUMSIZE(-((size_t) collection->iteratorSize()));
-               collIterator = 0;
+               collIterator = NULL;
        }
 
        if (collection) {
                collection->release();
-               collection = 0;
+               collection = NULL;
        }
 
        super::free();
@@ -128,9 +128,9 @@ OSCollectionIterator::getNextObject()
        bool retVal;
 
        if (!isValid()) {
-               return 0;
+               return NULL;
        }
 
        retVal = collection->getNextObjectForIterator(collIterator, &retObj);
-       return (retVal)? retObj : 0;
+       return (retVal)? retObj : NULL;
 }
diff --git a/libkern/c++/OSCompat.cpp b/libkern/c++/OSCompat.cpp
new file mode 100644 (file)
index 0000000..b0fd915
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * Compatibility definitions for I/O Kit smart pointers
+ */
+
+#define LIBKERN_SMART_POINTERS
+
+#include <libkern/c++/OSUnserialize.h>
+#include <libkern/c++/OSString.h>
+
+extern OSObjectPtr
+OSUnserialize(const char *buffer, LIBKERN_RETURNS_RETAINED_ON_ZERO OSString **errorString);
+
+OSObjectPtr
+OSUnserialize(const char *buffer, OSStringPtr *errorString)
+{
+       return OSUnserialize(buffer, OSOutPtr(errorString));
+}
+
+extern OSObjectPtr
+OSUnserializeXML(const char *buffer, LIBKERN_RETURNS_RETAINED_ON_ZERO OSString **errorString);
+
+OSObjectPtr
+OSUnserializeXML(const char *buffer, OSStringPtr *errorString)
+{
+       return OSUnserializeXML(buffer, OSOutPtr(errorString));
+}
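
The new OSCompat.cpp keeps the historical raw-pointer entry points callable while the implementations migrate to smart pointers: each shim forwards the caller's OSStringPtr out-parameter to the underlying OSString** interface through OSOutPtr. A hedged usage sketch; the buffer contents and failure handling are illustrative, not from this commit:

    OSStringPtr errorString;
    OSObjectPtr parsed = OSUnserializeXML(plistText, &errorString);
    if (!parsed && errorString) {
        // errorString is retained on failure, per the
        // LIBKERN_RETURNS_RETAINED_ON_ZERO annotation above
        IOLog("unserialize failed: %s\n", errorString->getCStringNoCopy());
    }
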
diff --git a/libkern/c++/OSData.cpp b/libkern/c++/OSData.cpp
index 92daa9c1e4119db4cc68f20a7946e0bd5ca367f1..08271131293dd3f06e5cde62a2e67df2b39667c3 100644 (file)
--- a/libkern/c++/OSData.cpp
+++ b/libkern/c++/OSData.cpp
@@ -33,6 +33,8 @@ __BEGIN_DECLS
 #include <vm/vm_kern.h>
 __END_DECLS
 
+#define LIBKERN_SMART_POINTERS
+
 #include <libkern/c++/OSData.h>
 #include <libkern/c++/OSSerialize.h>
 #include <libkern/c++/OSLib.h>
@@ -65,7 +67,7 @@ OSData::initWithCapacity(unsigned int inCapacity)
                        } else {
                                kmem_free(kernel_map, (vm_offset_t)data, capacity);
                        }
-                       data = 0;
+                       data = NULL;
                        capacity = 0;
                }
        }
@@ -153,66 +155,61 @@ OSData::initWithData(const OSData *inData,
        }
 }
 
-OSData *
+OSDataPtr
 OSData::withCapacity(unsigned int inCapacity)
 {
-       OSData *me = new OSData;
+       OSDataPtr me = OSDataPtr::alloc();
 
        if (me && !me->initWithCapacity(inCapacity)) {
-               me->release();
-               return 0;
+               return nullptr;
        }
 
        return me;
 }
 
-OSData *
+OSDataPtr
 OSData::withBytes(const void *bytes, unsigned int inLength)
 {
-       OSData *me = new OSData;
+       OSDataPtr me = OSDataPtr::alloc();
 
        if (me && !me->initWithBytes(bytes, inLength)) {
-               me->release();
-               return 0;
+               return nullptr;
        }
        return me;
 }
 
-OSData *
+OSDataPtr
 OSData::withBytesNoCopy(void *bytes, unsigned int inLength)
 {
-       OSData *me = new OSData;
+       OSDataPtr me = OSDataPtr::alloc();
 
        if (me && !me->initWithBytesNoCopy(bytes, inLength)) {
-               me->release();
-               return 0;
+               return nullptr;
        }
 
        return me;
 }
 
-OSData *
+OSDataPtr
 OSData::withData(const OSData *inData)
 {
-       OSData *me = new OSData;
+       OSDataPtr me = OSDataPtr::alloc();
 
        if (me && !me->initWithData(inData)) {
-               me->release();
-               return 0;
+               return nullptr;
        }
 
        return me;
 }
 
-OSData *
+OSDataPtr
 OSData::withData(const OSData *inData,
     unsigned int start, unsigned int inLength)
 {
-       OSData *me = new OSData;
+       OSDataPtr me = OSDataPtr::alloc();
 
        if (me && !me->initWithData(inData, start, inLength)) {
-               me->release();
-               return 0;
+               return nullptr;
        }
 
        return me;
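
The factory rewrites above are the smart-pointer version of the old create/init/release dance: OSDataPtr::alloc() constructs the object already holding one reference, and returning nullptr on a failed init lets the local OSDataPtr destructor drop that reference, which is what the deleted me->release() used to do by hand. A minimal sketch of the shape such a pointer needs for the idiom to work; this is an assumption for illustration, not the actual libkern definition:

    template <typename T>
    struct RetainPtr {      // hypothetical stand-in for OSDataPtr et al.
        T *obj;
        static RetainPtr alloc() { return RetainPtr{new T}; }
        ~RetainPtr() { if (obj) obj->release(); }
        T *operator->() const { return obj; }
        explicit operator bool() const { return obj != nullptr; }
    };
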
@@ -401,7 +398,7 @@ const void *
 OSData::getBytesNoCopy() const
 {
        if (!length) {
-               return 0;
+               return NULL;
        } else {
                return data;
        }
@@ -411,7 +408,7 @@ const void *
 OSData::getBytesNoCopy(unsigned int start,
     unsigned int inLength) const
 {
-       const void *outData = 0;
+       const void *outData = NULL;
 
        if (length
            && start < length
diff --git a/libkern/c++/OSDictionary.cpp b/libkern/c++/OSDictionary.cpp
index d9c75679892c932cfdb6bcabe4b9e147c67d7a29..c928cdd15ff1e4597fa1993d18cc5e6d643c61f1 100644 (file)
--- a/libkern/c++/OSDictionary.cpp
+++ b/libkern/c++/OSDictionary.cpp
@@ -228,7 +228,7 @@ OSDictionary::withCapacity(unsigned int capacity)
 
        if (me && !me->initWithCapacity(capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -244,7 +244,7 @@ OSDictionary::withObjects(const OSObject *objects[],
 
        if (me && !me->initWithObjects(objects, keys, count, capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -260,7 +260,7 @@ OSDictionary::withObjects(const OSObject *objects[],
 
        if (me && !me->initWithObjects(objects, keys, count, capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -274,7 +274,7 @@ OSDictionary::withDictionary(const OSDictionary *dict,
 
        if (me && !me->initWithDictionary(dict, capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -511,7 +511,7 @@ OSDictionary::getObject(const OSSymbol *aKey) const
        unsigned int i, l = 0, r = count;
 
        if (!aKey) {
-               return 0;
+               return NULL;
        }
 
        // if the key exists, return the object
@@ -703,10 +703,10 @@ OSDictionary::getNextObjectForIterator(void *inIterator, OSObject **ret) const
        if (index < count) {
                *ret = (OSObject *) dictionary[index].key;
        } else {
-               *ret = 0;
+               *ret = NULL;
        }
 
-       return *ret != 0;
+       return *ret != NULL;
 }
 
 bool
@@ -788,13 +788,13 @@ OSCollection *
 OSDictionary::copyCollection(OSDictionary *cycleDict)
 {
        bool allocDict = !cycleDict;
-       OSCollection *ret = 0;
-       OSDictionary *newDict = 0;
+       OSCollection *ret = NULL;
+       OSDictionary *newDict = NULL;
 
        if (allocDict) {
                cycleDict = OSDictionary::withCapacity(16);
                if (!cycleDict) {
-                       return 0;
+                       return NULL;
                }
        }
 
@@ -833,7 +833,7 @@ OSDictionary::copyCollection(OSDictionary *cycleDict)
                }
 
                ret = newDict;
-               newDict = 0;
+               newDict = NULL;
        } while (false);
 
 abortCopy:
@@ -855,13 +855,13 @@ OSDictionary::copyKeys(void)
 
        array = OSArray::withCapacity(count);
        if (!array) {
-               return 0;
+               return NULL;
        }
 
        for (unsigned int i = 0; i < count; i++) {
                if (!array->setObject(i, dictionary[i].key)) {
                        array->release();
-                       array = 0;
+                       array = NULL;
                        break;
                }
        }
diff --git a/libkern/c++/OSKext.cpp b/libkern/c++/OSKext.cpp
index d79b4b64e002f6be1dcf4ee4e257d44cd448bfa2..6de6147789e9e57c92a22536fc323f3760bb0b53 100644 (file)
--- a/libkern/c++/OSKext.cpp
+++ b/libkern/c++/OSKext.cpp
@@ -113,6 +113,7 @@ static bool _OSKextInUnloadedPrelinkedKexts(const OSSymbol * theBundleID);
 // We really should add containsObject() & containsCString to OSCollection & subclasses.
 // So few pad slots, though....
 static bool _OSArrayContainsCString(OSArray * array, const char * cString);
+static void OSKextLogKextInfo(OSKext *aKext, uint64_t address, uint64_t size, firehose_tracepoint_code_t code);
 
 /* Prelinked arm kexts do not have VM entries because the method we use to
  * fake an entry (see libsa/bootstrap.cpp:readPrelinkedExtensions()) does
@@ -269,6 +270,7 @@ static OSDictionary       * sExcludeListByID           = NULL;
 static OSKextVersion        sExcludeListVersion        = 0;
 static OSArray            * sLoadedKexts               = NULL;
 static OSArray            * sUnloadedPrelinkedKexts    = NULL;
+static OSArray            * sLoadedDriverKitKexts      = NULL;
 
 // Requests to kextd waiting to be picked up.
 static OSArray            * sKernelRequests            = NULL;
@@ -313,35 +315,35 @@ static OSKext          * sKernelKext             = NULL;
  * binary compatibility.
  */
 kmod_info_t g_kernel_kmod_info = {
-       /* next            */ 0,
-       /* info_version    */ KMOD_INFO_VERSION,
-       /* id              */ 0,             // loadTag: kernel is always 0
-       /* name            */ kOSKextKernelIdentifier, // bundle identifier
-       /* version         */ "0",           // filled in in OSKext::initialize()
-       /* reference_count */ -1,            // never adjusted; kernel never unloads
-       /* reference_list  */ NULL,
-       /* address         */ 0,
-       /* size            */ 0,             // filled in in OSKext::initialize()
-       /* hdr_size        */ 0,
-       /* start           */ 0,
-       /* stop            */ 0
+       .next =            NULL,
+       .info_version =    KMOD_INFO_VERSION,
+       .id =              0,             // loadTag: kernel is always 0
+       .name =            kOSKextKernelIdentifier,// bundle identifier
+       .version =         "0",           // filled in in OSKext::initialize()
+       .reference_count = -1,            // never adjusted; kernel never unloads
+       .reference_list =  NULL,
+       .address =         0,
+       .size =            0,             // filled in in OSKext::initialize()
+       .hdr_size =        0,
+       .start =           NULL,
+       .stop =            NULL
 };
 
 /* Set up a fake kmod_info struct for statically linked kexts that don't have one. */
 
 kmod_info_t invalid_kmod_info = {
-       /* next            */ 0,
-       /* info_version    */ KMOD_INFO_VERSION,
-       /* id              */ UINT32_MAX,
-       /* name            */ "invalid",
-       /* version         */ "0",
-       /* reference_count */ -1,
-       /* reference_list  */ NULL,
-       /* address         */ 0,
-       /* size            */ 0,
-       /* hdr_size        */ 0,
-       /* start           */ 0,
-       /* stop            */ 0
+       .next =            NULL,
+       .info_version =    KMOD_INFO_VERSION,
+       .id =              UINT32_MAX,
+       .name =            "invalid",
+       .version =         "0",
+       .reference_count = -1,
+       .reference_list =  NULL,
+       .address =         0,
+       .size =            0,
+       .hdr_size =        0,
+       .start =           NULL,
+       .stop =            NULL
 };
 
 extern "C" {
@@ -407,8 +409,8 @@ static bool                 sConsiderUnloadsCalled     = false;
 static bool                 sConsiderUnloadsPending    = false;
 
 static unsigned int         sConsiderUnloadDelay       = 60;     // seconds
-static thread_call_t        sUnloadCallout             = 0;
-static thread_call_t        sDestroyLinkContextThread  = 0;      // one-shot, one-at-a-time thread
+static thread_call_t        sUnloadCallout             = NULL;
+static thread_call_t        sDestroyLinkContextThread  = NULL;   // one-shot, one-at-a-time thread
 static bool                 sSystemSleep               = false;  // true when system going to sleep
 static AbsoluteTime         sLastWakeTime;                       // last time we woke up
 
@@ -429,7 +431,7 @@ static IOLock                 * sKextSummariesLock                = NULL;
 extern "C" lck_spin_t           vm_allocation_sites_lock;
 static IOSimpleLock           * sKextAccountsLock = &vm_allocation_sites_lock;
 
-void (*sLoadedKextSummariesUpdated)(void) = OSKextLoadedKextSummariesUpdated;
+void(*const sLoadedKextSummariesUpdated)(void) = OSKextLoadedKextSummariesUpdated;
 OSKextLoadedKextSummaryHeader * gLoadedKextSummaries __attribute__((used)) = NULL;
 uint64_t gLoadedKextSummariesTimestamp __attribute__((used)) = 0;
 static size_t sLoadedKextSummariesAllocSize = 0;
@@ -678,7 +680,7 @@ OSKext::initialize(void)
        OSNumber        * kernelCPUSubtype   = NULL;// must release
        OSKextLogSpec     bootLogFilter      = kOSKextLogSilentFilter;
        bool              setResult          = false;
-       uint64_t        * timestamp          = 0;
+       uint64_t        * timestamp          = NULL;
        char              bootArgBuffer[16];// for PE_parse_boot_argn w/strings
 
        /* This must be the first thing allocated. Everything else grabs this lock.
@@ -694,12 +696,13 @@ OSKext::initialize(void)
 
        sKextsByID = OSDictionary::withCapacity(kOSKextTypicalLoadCount);
        sLoadedKexts = OSArray::withCapacity(kOSKextTypicalLoadCount);
+       sLoadedDriverKitKexts = OSArray::withCapacity(kOSKextTypicalLoadCount);
        sUnloadedPrelinkedKexts = OSArray::withCapacity(kOSKextTypicalLoadCount / 10);
        sKernelRequests = OSArray::withCapacity(0);
        sPostedKextLoadIdentifiers = OSSet::withCapacity(0);
        sAllKextLoadIdentifiers = OSSet::withCapacity(kOSKextTypicalLoadCount);
        sRequestCallbackRecords = OSArray::withCapacity(0);
-       assert(sKextsByID && sLoadedKexts && sKernelRequests &&
+       assert(sKextsByID && sLoadedKexts && sLoadedDriverKitKexts && sKernelRequests &&
            sPostedKextLoadIdentifiers && sAllKextLoadIdentifiers &&
            sRequestCallbackRecords && sUnloadedPrelinkedKexts);
 
@@ -750,11 +753,12 @@ OSKext::initialize(void)
        assert(kernelExecutable);
 
 #if KASLR_KEXT_DEBUG
-       IOLog("kaslr: kernel start 0x%lx end 0x%lx length %lu vm_kernel_slide %llu (0x%016lx) \n",
+       IOLog("kaslr: kernel start 0x%lx end 0x%lx length %lu vm_kernel_slide %lu (0x%016lx) \n",
            (unsigned long)kernelStart,
            (unsigned long)getlastaddr(),
            kernelLength,
-           vm_kernel_slide, vm_kernel_slide);
+           (unsigned long)vm_kernel_slide,
+           (unsigned long)vm_kernel_slide);
 #endif
 
        sKernelKext->loadTag = sNextLoadTag++; // the kernel is load tag 0
@@ -1081,12 +1085,7 @@ void
 OSKext::flushNonloadedKexts(
        Boolean flushPrelinkedKexts)
 {
-       OSSet                * prelinkedKexts  = NULL;// must release
-       OSCollectionIterator * kextIterator    = NULL;// must release
-       OSCollectionIterator * prelinkIterator = NULL; // must release
-       const OSSymbol       * thisID          = NULL;// do not release
-       OSKext               * thisKext        = NULL;// do not release
-       uint32_t               count, i;
+       OSSet                * keepKexts       = NULL;// must release
 
        IORecursiveLockLock(sKextLock);
 
@@ -1100,33 +1099,36 @@ OSKext::flushNonloadedKexts(
        /* If we aren't flushing unused prelinked kexts, we have to put them
         * aside while we flush everything else so make a container for them.
         */
-       if (!flushPrelinkedKexts) {
-               prelinkedKexts = OSSet::withCapacity(0);
-               if (!prelinkedKexts) {
-                       goto finish;
-               }
+       keepKexts = OSSet::withCapacity(16);
+       if (!keepKexts) {
+               goto finish;
        }
 
        /* Set aside prelinked kexts (in-use or not) and break
         * any lingering inter-kext references for nonloaded kexts
         * so they have min. retain counts.
         */
-       kextIterator = OSCollectionIterator::withCollection(sKextsByID);
-       if (!kextIterator) {
-               goto finish;
-       }
-
-       while ((thisID = OSDynamicCast(OSSymbol,
-           kextIterator->getNextObject()))) {
-               thisKext = OSDynamicCast(OSKext, sKextsByID->getObject(thisID));
-
-               if (thisKext) {
-                       if (prelinkedKexts && thisKext->isPrelinked()) {
-                               prelinkedKexts->setObject(thisKext);
-                       }
-                       thisKext->flushDependencies(/* forceIfLoaded */ false);
+       sKextsByID->iterateObjects(^bool (const OSSymbol * thisID __unused, OSObject * obj) {
+               OSKext * thisKext = OSDynamicCast(OSKext, obj);
+               if (!thisKext) {
+                       return false;
                }
-       }
+               if (!flushPrelinkedKexts && thisKext->isPrelinked()) {
+                       keepKexts->setObject(thisKext);
+               }
+               if (!thisKext->declaresExecutable()) {
+                       /*
+                        * Don't unload codeless kexts, because they never appear in the loadedKexts array.
+                        * Requesting one from kextd will load it and then immediately remove it by calling
+                        * flushNonloadedKexts().
+                        * And adding one to loadedKexts breaks code assuming they have kmod_info etc.
+                        */
+                       keepKexts->setObject(thisKext);
+               }
+
+               thisKext->flushDependencies(/* forceIfLoaded */ false);
+               return false;
+       });
 
        /* Dump all the kexts in the ID dictionary; we'll repopulate it shortly.
         */
@@ -1134,33 +1136,30 @@ OSKext::flushNonloadedKexts(
 
        /* Now put the loaded kexts back into the ID dictionary.
         */
-       count = sLoadedKexts->getCount();
-       for (i = 0; i < count; i++) {
-               thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i));
+       sLoadedKexts->iterateObjects(^bool (OSObject * obj) {
+               OSKext * thisKext = OSDynamicCast(OSKext, obj);
+               if (!thisKext) {
+                       return false;
+               }
                sKextsByID->setObject(thisKext->getIdentifierCString(), thisKext);
-       }
+               return false;
+       });
 
-       /* Finally, put back the prelinked kexts if we saved any.
+       /* Finally, put back the kept kexts if we saved any.
         */
-       if (prelinkedKexts) {
-               prelinkIterator = OSCollectionIterator::withCollection(prelinkedKexts);
-               if (!prelinkIterator) {
-                       goto finish;
-               }
-
-               while ((thisKext = OSDynamicCast(OSKext,
-                   prelinkIterator->getNextObject()))) {
-                       sKextsByID->setObject(thisKext->getIdentifierCString(),
-                           thisKext);
+       keepKexts->iterateObjects(^bool (OSObject * obj) {
+               OSKext * thisKext = OSDynamicCast(OSKext, obj);
+               if (!thisKext) {
+                       return false;
                }
-       }
+               sKextsByID->setObject(thisKext->getIdentifierCString(), thisKext);
+               return false;
+       });
 
 finish:
        IORecursiveLockUnlock(sKextLock);
 
-       OSSafeReleaseNULL(prelinkedKexts);
-       OSSafeReleaseNULL(kextIterator);
-       OSSafeReleaseNULL(prelinkIterator);
+       OSSafeReleaseNULL(keepKexts);
 
        return;
 }
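
The flushNonloadedKexts() rewrite drops the heap-allocated OSCollectionIterator objects in favor of OSCollection's block-based iterateObjects(), in which the block returns true to stop early and false to keep going (every block above returns false, so each walk visits the whole collection). The pattern in isolation:

    // Count loaded kexts that declare an executable; returning false
    // continues the iteration, returning true would cut it short.
    __block unsigned int n = 0;
    sLoadedKexts->iterateObjects(^bool (OSObject *obj) {
        OSKext *kext = OSDynamicCast(OSKext, obj);
        if (kext && kext->declaresExecutable()) {
            n++;
        }
        return false;
    });
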
@@ -1525,6 +1524,12 @@ OSKext::initWithPrelinkedInfoDict(
                executableRelPath->retain();
        }
 
+       userExecutableRelPath = OSDynamicCast(OSString,
+           anInfoDict->getObject("CFBundleUEXTExecutable"));
+       if (userExecutableRelPath) {
+               userExecutableRelPath->retain();
+       }
+
        /* Don't need the paths to be in the info dictionary any more.
         */
        anInfoDict->removeObject(kPrelinkBundlePathKey);
@@ -1551,7 +1556,7 @@ OSKext::initWithPrelinkedInfoDict(
 
 #if KASLR_KEXT_DEBUG
                IOLog("kaslr: unslid 0x%lx slid 0x%lx length %u - prelink executable \n",
-                   (unsigned long)ml_static_unslide(data),
+                   (unsigned long)ml_static_unslide((vm_offset_t)data),
                    (unsigned long)data,
                    length);
 #endif
@@ -1568,7 +1573,7 @@ OSKext::initWithPrelinkedInfoDict(
 
 #if KASLR_KEXT_DEBUG
                        IOLog("kaslr: unslid 0x%lx slid 0x%lx - prelink executable source \n",
-                           (unsigned long)ml_static_unslide(srcData),
+                           (unsigned long)ml_static_unslide((vm_offset_t)srcData),
                            (unsigned long)srcData);
 #endif
 
@@ -1630,7 +1635,7 @@ OSKext::initWithPrelinkedInfoDict(
                        kmod_info->address = ml_static_slide(kmod_info->address);
 #if KASLR_KEXT_DEBUG
                        IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info \n",
-                           (unsigned long)ml_static_unslide(kmod_info),
+                           (unsigned long)ml_static_unslide((vm_offset_t)kmod_info),
                            (unsigned long)kmod_info);
                        IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info->address \n",
                            (unsigned long)ml_static_unslide(kmod_info->address),
@@ -2406,6 +2411,11 @@ OSKext::uniquePersonalityProperties(OSDictionary * personalityDict)
        uniqueStringPlistProperty(personalityDict, kCFBundleIdentifierKey);
        uniqueStringPlistProperty(personalityDict, kIOProviderClassKey);
        uniqueStringPlistProperty(personalityDict, gIOClassKey);
+       if (personalityDict->getObject(kCFBundleIdentifierKernelKey)) {
+               uniqueStringPlistProperty(personalityDict, kCFBundleIdentifierKernelKey);
+       } else {
+               personalityDict->setObject(kCFBundleIdentifierKernelKey, personalityDict->getObject(kCFBundleIdentifierKey));
+       }
 
        /* Other commonly used properties.
         */
@@ -2443,10 +2453,12 @@ OSKext::free(void)
        OSSafeReleaseNULL(bundleID);
        OSSafeReleaseNULL(path);
        OSSafeReleaseNULL(executableRelPath);
+       OSSafeReleaseNULL(userExecutableRelPath);
        OSSafeReleaseNULL(dependencies);
        OSSafeReleaseNULL(linkedExecutable);
        OSSafeReleaseNULL(metaClasses);
        OSSafeReleaseNULL(interfaceUUID);
+       OSSafeReleaseNULL(driverKitUUID);
 
        if (isInterface() && kmod_info) {
                kfree(kmod_info, sizeof(kmod_info_t));
@@ -2467,7 +2479,7 @@ OSKext::readMkextArchive(OSData * mkextData,
 {
        OSReturn       result       = kOSKextReturnBadData;
        uint32_t       mkextLength  = 0;
-       mkext_header * mkextHeader  = 0;// do not free
+       mkext_header * mkextHeader  = NULL;// do not free
        uint32_t       mkextVersion = 0;
 
        /* Note default return of kOSKextReturnBadData above.
@@ -2874,7 +2886,7 @@ OSKext::extractMkext2FileData(
 
        OSData      * uncompressedData = NULL;// release on error
 
-       uint8_t     * uncompressedDataBuffer = 0;// do not free
+       uint8_t     * uncompressedDataBuffer = NULL;// do not free
        unsigned long uncompressedSize;
        z_stream      zstream;
        bool          zstream_inited = false;
@@ -3153,6 +3165,7 @@ OSKext::loadFromMkext(
 
        kextIdentifier = OSDynamicCast(OSString,
            requestArgs->getObject(kKextRequestArgumentBundleIdentifierKey));
+
        if (!kextIdentifier) {
                OSKextLog(/* kext */ NULL,
                    kOSKextLogErrorLevel |
@@ -3194,6 +3207,7 @@ OSKext::loadFromMkext(
         */
        result = OSKext::loadKextWithIdentifier(
                kextIdentifier,
+               /* kextRef */ NULL,
                /* allowDefer */ false,
                delayAutounload,
                startKextExcludeLevel,
@@ -3351,17 +3365,20 @@ OSKext *
 OSKext::lookupKextWithLoadTag(uint32_t aTag)
 {
        OSKext * foundKext = NULL;             // returned
-       uint32_t count, i;
+       uint32_t i, j;
+       OSArray *list[2] = {sLoadedKexts, sLoadedDriverKitKexts};
+       uint32_t count[2] = {sLoadedKexts->getCount(), sLoadedDriverKitKexts->getCount()};
 
        IORecursiveLockLock(sKextLock);
 
-       count = sLoadedKexts->getCount();
-       for (i = 0; i < count; i++) {
-               OSKext * thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i));
-               if (thisKext->getLoadTag() == aTag) {
-                       foundKext = thisKext;
-                       foundKext->retain();
-                       goto finish;
+       for (j = 0; j < (sizeof(list) / sizeof(list[0])); j++) {
+               for (i = 0; i < count[j]; i++) {
+                       OSKext * thisKext = OSDynamicCast(OSKext, list[j]->getObject(i));
+                       if (thisKext->getLoadTag() == aTag) {
+                               foundKext = thisKext;
+                               foundKext->retain();
+                               goto finish;
+                       }
                }
        }
 
@@ -3397,6 +3414,19 @@ OSKext::lookupKextWithAddress(vm_address_t address)
                }
        }
 
+       count = sLoadedDriverKitKexts->getCount();
+       for (i = 0; i < count; i++) {
+               OSKext * thisKext = OSDynamicCast(OSKext, sLoadedDriverKitKexts->getObject(i));
+               /*
+                * DriverKitKexts do not have a linkedExecutable,
+                * so we "fake" their address with the LoadTag
+                */
+               if (thisKext->getLoadTag() == address) {
+                       foundKext = thisKext;
+                       foundKext->retain();
+               }
+       }
+
 finish:
        IORecursiveLockUnlock(sKextLock);
 
@@ -3411,6 +3441,7 @@ OSKext::copyKextUUIDForAddress(OSNumber *address)
        OSKext              * kext = NULL;
        uint32_t              baseIdx;
        uint32_t              lim;
+       uint32_t            count, i;
 
        if (!address) {
                return NULL;
@@ -3457,6 +3488,36 @@ OSKext::copyKextUUIDForAddress(OSNumber *address)
        }
        IOSimpleLockUnlock(sKextAccountsLock);
 
+       if (!kext) {
+               /*
+                * Maybe it is a Dext.
+                * DriverKit userspace executables do not have a kernel linkedExecutable,
+                * so we "fake" their address range with the LoadTag.
+                *
+                * This is supposed to be used for logging reasons only. When logd
+                * calls this function it ORs the address with FIREHOSE_TRACEPOINT_PC_KERNEL_MASK, so we
+                * remove it here before checking it against the LoadTag.
+                * We also need to strip FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT, which is set when the log line is emitted.
+                */
+               addr = (uintptr_t)address->unsigned64BitValue() & ~(FIREHOSE_TRACEPOINT_PC_KERNEL_MASK | FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT);
+               IORecursiveLockLock(sKextLock);
+               count = sLoadedDriverKitKexts->getCount();
+               for (i = 0; i < count; i++) {
+                       OSKext   * thisKext     = NULL;
+
+                       thisKext = OSDynamicCast(OSKext, sLoadedDriverKitKexts->getObject(i));
+                       if (!thisKext) {
+                               continue;
+                       }
+                       if (thisKext->getLoadTag() == addr) {
+                               kext = thisKext;
+                               kext->retain();
+                               break;
+                       }
+               }
+               IORecursiveLockUnlock(sKextLock);
+       }
+
        if (kext) {
                uuid = kext->copyTextUUID();
                kext->release();
@@ -3473,36 +3534,38 @@ OSKext *
 OSKext::lookupKextWithUUID(uuid_t wanted)
 {
        OSKext * foundKext = NULL;             // returned
-       uint32_t count, i;
+       uint32_t j, i;
+       OSArray *list[2] = {sLoadedKexts, sLoadedDriverKitKexts};
+       uint32_t count[2] = {sLoadedKexts->getCount(), sLoadedDriverKitKexts->getCount()};
+
 
        IORecursiveLockLock(sKextLock);
 
-       count = sLoadedKexts->getCount();
+       for (j = 0; j < (sizeof(list) / sizeof(list[0])); j++) {
+               for (i = 0; i < count[j]; i++) {
+                       OSKext   * thisKext     = NULL;
 
-       for (i = 0; i < count; i++) {
-               OSKext   * thisKext     = NULL;
-
-               thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i));
-               if (!thisKext) {
-                       continue;
-               }
+                       thisKext = OSDynamicCast(OSKext, list[j]->getObject(i));
+                       if (!thisKext) {
+                               continue;
+                       }
 
-               OSData *uuid_data = thisKext->copyUUID();
-               if (!uuid_data) {
-                       continue;
-               }
+                       OSData *uuid_data = thisKext->copyUUID();
+                       if (!uuid_data) {
+                               continue;
+                       }
 
-               uuid_t uuid;
-               memcpy(&uuid, uuid_data->getBytesNoCopy(), sizeof(uuid));
-               uuid_data->release();
+                       uuid_t uuid;
+                       memcpy(&uuid, uuid_data->getBytesNoCopy(), sizeof(uuid));
+                       uuid_data->release();
 
-               if (0 == uuid_compare(wanted, uuid)) {
-                       foundKext = thisKext;
-                       foundKext->retain();
-                       goto finish;
+                       if (0 == uuid_compare(wanted, uuid)) {
+                               foundKext = thisKext;
+                               foundKext->retain();
+                               goto finish;
+                       }
                }
        }
-
 finish:
        IORecursiveLockUnlock(sKextLock);
 
@@ -3696,16 +3759,20 @@ OSKext::removeKextWithLoadTag(
 {
        OSReturn result    = kOSReturnError;
        OSKext * foundKext = NULL;
-       uint32_t count, i;
+       uint32_t i, j;
+       OSArray *list[2] = {sLoadedKexts, sLoadedDriverKitKexts};
+       uint32_t count[2] = {sLoadedKexts->getCount(), sLoadedDriverKitKexts->getCount()};
+
 
        IORecursiveLockLock(sKextLock);
 
-       count = sLoadedKexts->getCount();
-       for (i = 0; i < count; i++) {
-               OSKext * thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i));
-               if (thisKext->loadTag == loadTag) {
-                       foundKext = thisKext;
-                       break;
+       for (j = 0; j < (sizeof(list) / sizeof(list[0])); j++) {
+               for (i = 0; i < count[j]; i++) {
+                       OSKext * thisKext = OSDynamicCast(OSKext, list[j]->getObject(i));
+                       if (thisKext->loadTag == loadTag) {
+                               foundKext = thisKext;
+                               break;
+                       }
                }
        }
 
@@ -3985,6 +4052,9 @@ OSKext::isCompatibleWithVersion(OSKextVersion aVersion)
 bool
 OSKext::declaresExecutable(void)
 {
+       if (isDriverKit()) {
+               return false;
+       }
        return getPropertyForHostArch(kCFBundleExecutableKey) != NULL;
 }
 
@@ -4216,6 +4286,15 @@ OSKext::copyUUID(void)
                return sKernelKext->copyUUID();
        }
 
+       if (isDriverKit() && infoDict) {
+               if (driverKitUUID) {
+                       driverKitUUID->retain();
+                       return driverKitUUID;
+               } else {
+                       return NULL;
+               }
+       }
+
        /* For real kexts, try to get the UUID from the linked executable,
         * or if it hasn't been linked yet, the unrelocated executable.
         */
@@ -4223,6 +4302,7 @@ OSKext::copyUUID(void)
        if (!theExecutable) {
                theExecutable = getExecutable();
        }
+
        if (!theExecutable) {
                goto finish;
        }
@@ -4279,6 +4359,14 @@ finish:
        return result;
 }
 
+void
+OSKext::setDriverKitUUID(OSData *uuid)
+{
+       if (!OSCompareAndSwapPtr(nullptr, uuid, &driverKitUUID)) {
+               OSSafeReleaseNULL(uuid);
+       }
+}
+
 /*********************************************************************
 *********************************************************************/
 #if defined (__arm__)
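
setDriverKitUUID() is a publish-once helper: OSCompareAndSwapPtr() installs the UUID only while the member is still null, and a caller that loses the race releases its own copy, since ownership transfers only on a successful swap. The same idiom in general form:

    static OSData *slot;    // shared, starts out NULL

    void
    publish_once(OSData *candidate)     // candidate carries one reference
    {
        if (!OSCompareAndSwapPtr(nullptr, candidate, &slot)) {
            // another thread already published; drop our reference
            OSSafeReleaseNULL(candidate);
        }
    }
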
@@ -4511,6 +4599,7 @@ OSKext::loadKextWithIdentifier(
                goto finish;
        }
        result = OSKext::loadKextWithIdentifier(kextIdentifier,
+           NULL /* kextRef */,
            allowDeferFlag, delayAutounloadFlag,
            startOpt, startMatchingOpt, personalityNames);
 
@@ -4524,6 +4613,7 @@ finish:
 OSReturn
 OSKext::loadKextWithIdentifier(
        OSString          * kextIdentifier,
+       OSObject         ** kextRef,
        Boolean             allowDeferFlag,
        Boolean             delayAutounloadFlag,
        OSKextExcludeLevel  startOpt,
@@ -4536,6 +4626,10 @@ OSKext::loadKextWithIdentifier(
        OSDictionary    * loadRequest          = NULL;// must release
        const OSSymbol  * kextIdentifierSymbol = NULL;// must release
 
+       if (kextRef) {
+               *kextRef = NULL;
+       }
+
        IORecursiveLockLock(sKextLock);
 
        if (!kextIdentifier) {
@@ -4638,10 +4732,34 @@ finish:
        OSSafeReleaseNULL(loadRequest);
        OSSafeReleaseNULL(kextIdentifierSymbol);
 
+       if ((kOSReturnSuccess == result) && kextRef) {
+               theKext->retain();
+               theKext->matchingRefCount++;
+               *kextRef = theKext;
+       }
+
        IORecursiveLockUnlock(sKextLock);
 
        return result;
 }
+/*********************************************************************
+*********************************************************************/
+/* static */
+void
+OSKext::dropMatchingReferences(
+       OSSet * kexts)
+{
+       IORecursiveLockLock(sKextLock);
+       kexts->iterateObjects(^bool (OSObject * obj) {
+               OSKext * thisKext = OSDynamicCast(OSKext, obj);
+               if (!thisKext) {
+                       return false;
+               }
+               thisKext->matchingRefCount--;
+               return false;
+       });
+       IORecursiveLockUnlock(sKextLock);
+}
 
 /*********************************************************************
 *********************************************************************/
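
The new kextRef out-parameter and dropMatchingReferences() work as a pair: a successful loadKextWithIdentifier(..., &kextRef, ...) retains the kext and bumps its matchingRefCount, and when driver matching finishes the caller hands the collected kexts back through dropMatchingReferences() to rebalance the count. A hedged caller sketch; the surrounding variables and the OSSet are illustrative:

    OSObject *kextRef = NULL;
    if (kOSReturnSuccess == OSKext::loadKextWithIdentifier(kextIdentifier,
        &kextRef, /* allowDefer */ false, /* delayAutounload */ false,
        startOpt, startMatchingOpt, personalityNames)) {
        // ... matching runs with the kext pinned ...
        OSKext::dropMatchingReferences(matchedKexts);   // OSSet of kexts
        kextRef->release();
    }
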
@@ -4791,6 +4909,13 @@ OSKext::load(
                                    "KextExcludeList was updated to version: %lld", sExcludeListVersion);
                        }
                }
+
+               if (isDriverKit()) {
+                       if (loadTag == 0) {
+                               sLoadedDriverKitKexts->setObject(this);
+                               loadTag = sNextLoadTag++;
+                       }
+               }
                result = kOSReturnSuccess;
                goto loaded;
        }
@@ -4998,24 +5123,6 @@ loaded:
 
 finish:
 
-       /* More hack! If the kext doesn't declare an executable, even if we
-        * "loaded" it, we have to remove any personalities naming it, or we'll
-        * never see the registry go quiet. Errors here do not count for the
-        * load operation itself.
-        *
-        * Note that in every other regard it's perfectly ok for a kext to
-        * not declare an executable and serve only as a package for personalities
-        * naming another kext, so we do have to allow such kexts to be "loaded"
-        * so that those other personalities get added & matched.
-        */
-       if (!declaresExecutable()) {
-               OSKextLog(this,
-                   kOSKextLogStepLevel | kOSKextLogLoadFlag,
-                   "Kext %s has no executable; removing any personalities naming it.",
-                   getIdentifierCString());
-               removePersonalitiesFromCatalog();
-       }
-
        if (result != kOSReturnSuccess) {
                OSKextLog(this,
                    kOSKextLogErrorLevel |
@@ -5079,12 +5186,12 @@ OSKext::lookupSection(const char *segname, const char *secname)
        mh = (kernel_mach_header_t *)linkedExecutable->getBytesNoCopy();
 
        for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) {
-               if (0 != strcmp(seg->segname, segname)) {
+               if (0 != strncmp(seg->segname, segname, sizeof(seg->segname))) {
                        continue;
                }
 
                for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
-                       if (0 == strcmp(sec->sectname, secname)) {
+                       if (0 == strncmp(sec->sectname, secname, sizeof(sec->sectname))) {
                                found_section = sec;
                                goto out;
                        }
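
The strcmp to strncmp change in lookupSection() matters because Mach-O segment and section names are fixed char[16] fields with no guaranteed NUL terminator when a name uses all 16 bytes, so an unbounded strcmp can read past the field. Bounding the comparison by the field size handles both full-width and short names:

    #include <mach-o/loader.h>
    #include <stdbool.h>
    #include <string.h>

    /* Safe even for 16-byte names such as "__llvm_prf_names". */
    static bool
    section_is(const struct section_64 *sec, const char *name)
    {
        return strncmp(sec->sectname, name, sizeof(sec->sectname)) == 0;
    }
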
@@ -5383,7 +5490,7 @@ OSKext::loadExecutable()
        }
 
        /* <rdar://problem/21444003> all callers must be entitled */
-       if (FALSE == IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-secure-management")) {
+       if (FALSE == IOTaskHasEntitlement(current_task(), kOSKextManagementEntitlement)) {
                OSKextLog(this,
                    kOSKextLogErrorLevel | kOSKextLogLoadFlag,
                    "Not entitled to link kext '%s'",
@@ -6291,12 +6398,21 @@ OSKextLogKextInfo(OSKext *aKext, uint64_t address, uint64_t size, firehose_trace
        }
 
        uuid_info->ftui_size    = size;
-       uuid_info->ftui_address = ml_static_unslide(address);
-
+       if (aKext->isDriverKit()) {
+               uuid_info->ftui_address = address;
+       } else {
+               uuid_info->ftui_address = ml_static_unslide(address);
+       }
        firehose_trace_metadata(firehose_stream_metadata, trace_id, stamp, uuid_info, uuid_info_len);
        return;
 }
 
+void
+OSKext::OSKextLogDriverKitInfoLoad(OSKext *kext)
+{
+       OSKextLogKextInfo(kext, kext->getLoadTag(), 1, firehose_tracepoint_code_load);
+}
+
 /*********************************************************************
 *********************************************************************/
 OSReturn
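
Because a dext has no kernel linkedExecutable, the logging path above registers it with firehose using its load tag as a stand-in address and a size of 1; copyKextUUIDForAddress() can then recover the tag from a logd query by masking off the firehose bits first, as its comment explains:

    /* logd ORs in FIREHOSE_TRACEPOINT_PC_KERNEL_MASK, and the dynamic
     * bit is set when the log line is emitted; strip both before
     * comparing against a dext load tag. */
    uintptr_t tag = (uintptr_t)address->unsigned64BitValue()
        & ~(FIREHOSE_TRACEPOINT_PC_KERNEL_MASK | FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT);
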
@@ -6588,6 +6704,15 @@ OSKext::unload(void)
                goto finish;
        }
 
+       if (isDriverKit()) {
+               index = sLoadedDriverKitKexts->getNextIndexOfObject(this, 0);
+               if (index != (unsigned int)-1) {
+                       sLoadedDriverKitKexts->removeObject(index);
+                       OSKextLogKextInfo(this, loadTag, 1, firehose_tracepoint_code_unload);
+                       loadTag = 0;
+               }
+       }
+
        if (!isLoaded()) {
                result = kOSReturnSuccess;
                goto finish;
@@ -6904,7 +7029,7 @@ _OSKextConsiderDestroyingLinkContext(
                            kOSKextLogGeneralFlag,
                            "thread_call_free() failed for kext link context.");
                }
-               sDestroyLinkContextThread = 0;
+               sDestroyLinkContextThread = NULL;
        }
 
        IORecursiveLockUnlock(sKextInnerLock);
@@ -6939,7 +7064,7 @@ OSKext::considerDestroyingLinkContext(void)
         * this thread_call, so don't share it around.
         */
        sDestroyLinkContextThread = thread_call_allocate(
-               &_OSKextConsiderDestroyingLinkContext, 0);
+               &_OSKextConsiderDestroyingLinkContext, NULL);
        if (!sDestroyLinkContextThread) {
                OSKextLog(/* kext */ NULL,
                    kOSKextLogErrorLevel | kOSKextLogGeneralFlag | kOSKextLogLinkFlag,
@@ -7097,7 +7222,7 @@ OSKext::considerUnloads(Boolean rescheduleOnlyFlag)
        IORecursiveLockLock(sKextInnerLock);
 
        if (!sUnloadCallout) {
-               sUnloadCallout = thread_call_allocate(&_OSKextConsiderUnloads, 0);
+               sUnloadCallout = thread_call_allocate(&_OSKextConsiderUnloads, NULL);
        }
 
        /* we only reset delay value for unloading if we already have something
@@ -8497,7 +8622,12 @@ OSKextGrabPgoDataLocked(OSKext *kext,
        size_t metadata_size = 0;
 
        sect_prf_data = kext->lookupSection("__DATA", "__llvm_prf_data");
-       sect_prf_name = kext->lookupSection("__DATA", "__llvm_prf_name");
+       sect_prf_name = kext->lookupSection("__DATA", "__llvm_prf_names");
+       if (!sect_prf_name) {
+               // kextcache sometimes truncates the section name to 15 chars
+               // <rdar://problem/52080551> 16 character section name is truncated to 15 characters by kextcache
+               sect_prf_name = kext->lookupSection("__DATA", "__llvm_prf_name");
+       }
        sect_prf_cnts = kext->lookupSection("__DATA", "__llvm_prf_cnts");
 
        if (!sect_prf_data || !sect_prf_name || !sect_prf_cnts) {
@@ -8664,11 +8794,12 @@ OSKext::copyLoadedKextInfoByUUID(
 {
        OSDictionary * result = NULL;
        OSDictionary * kextInfo = NULL; // must release
-       uint32_t       count, i;
+       uint32_t       max_count, i, j;
        uint32_t       idCount = 0;
        uint32_t       idIndex = 0;
-
        IORecursiveLockLock(sKextLock);
+       OSArray *list[2] = {sLoadedKexts, sLoadedDriverKitKexts};
+       uint32_t count[2] = {sLoadedKexts->getCount(), sLoadedDriverKitKexts->getCount()};
 
 #if CONFIG_MACF
        /* Is the calling process allowed to query kext info? */
@@ -8704,81 +8835,83 @@ OSKext::copyLoadedKextInfoByUUID(
                infoKeys = NULL;
        }
 
-       count = sLoadedKexts->getCount();
-       result = OSDictionary::withCapacity(count);
+       max_count = count[0] + count[1];
+       result = OSDictionary::withCapacity(max_count);
        if (!result) {
                goto finish;
        }
 
-       for (i = 0; i < count; i++) {
-               OSKext       *thisKext     = NULL;// do not release
-               Boolean       includeThis  = true;
-               uuid_t        thisKextUUID;
-               uuid_t        thisKextTextUUID;
-               OSData       *uuid_data;
-               uuid_string_t uuid_key;
-
-               thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i));
-               if (!thisKext) {
-                       continue;
-               }
-
-               uuid_data = thisKext->copyUUID();
-               if (!uuid_data) {
-                       continue;
-               }
-
-               memcpy(&thisKextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextUUID));
-               OSSafeReleaseNULL(uuid_data);
+       for (j = 0; j < (sizeof(list) / sizeof(list[0])); j++) {
+               for (i = 0; i < count[j]; i++) {
+                       OSKext       *thisKext     = NULL;// do not release
+                       Boolean       includeThis  = true;
+                       uuid_t        thisKextUUID;
+                       uuid_t        thisKextTextUUID;
+                       OSData       *uuid_data;
+                       uuid_string_t uuid_key;
 
-               uuid_unparse(thisKextUUID, uuid_key);
+                       thisKext = OSDynamicCast(OSKext, list[j]->getObject(i));
+                       if (!thisKext) {
+                               continue;
+                       }
 
-               uuid_data = thisKext->copyTextUUID();
-               if (!uuid_data) {
-                       continue;
-               }
-               memcpy(&thisKextTextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextTextUUID));
-               OSSafeReleaseNULL(uuid_data);
+                       uuid_data = thisKext->copyUUID();
+                       if (!uuid_data) {
+                               continue;
+                       }
 
-               /* Skip current kext if we have a list of UUIDs and
-                * it isn't in the list.
-                */
-               if (kextIdentifiers) {
-                       includeThis = false;
+                       memcpy(&thisKextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextUUID));
+                       OSSafeReleaseNULL(uuid_data);
 
-                       for (idIndex = 0; idIndex < idCount; idIndex++) {
-                               const OSString* wantedUUID = OSDynamicCast(OSString,
-                                   kextIdentifiers->getObject(idIndex));
+                       uuid_unparse(thisKextUUID, uuid_key);
 
-                               uuid_t uuid;
-                               uuid_parse(wantedUUID->getCStringNoCopy(), uuid);
+                       uuid_data = thisKext->copyTextUUID();
+                       if (!uuid_data) {
+                               continue;
+                       }
+                       memcpy(&thisKextTextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextTextUUID));
+                       OSSafeReleaseNULL(uuid_data);
 
-                               if ((0 == uuid_compare(uuid, thisKextUUID))
-                                   || (0 == uuid_compare(uuid, thisKextTextUUID))) {
-                                       includeThis = true;
-                                       /* Only need to find the first kext if multiple match,
-                                        * ie. asking for the kernel uuid does not need to find
-                                        * interface kexts or builtin static kexts.
-                                        */
-                                       kextIdentifiers->removeObject(idIndex);
-                                       uuid_unparse(uuid, uuid_key);
-                                       break;
+                       /* Skip current kext if we have a list of UUIDs and
+                        * it isn't in the list.
+                        */
+                       if (kextIdentifiers) {
+                               includeThis = false;
+
+                               for (idIndex = 0; idIndex < idCount; idIndex++) {
+                                       const OSString* wantedUUID = OSDynamicCast(OSString,
+                                           kextIdentifiers->getObject(idIndex));
+
+                                       uuid_t uuid;
+                                       uuid_parse(wantedUUID->getCStringNoCopy(), uuid);
+
+                                       if ((0 == uuid_compare(uuid, thisKextUUID))
+                                           || (0 == uuid_compare(uuid, thisKextTextUUID))) {
+                                               includeThis = true;
+                                               /* Only need to find the first kext if multiple match,
+                                                * ie. asking for the kernel uuid does not need to find
+                                                * interface kexts or builtin static kexts.
+                                                */
+                                               kextIdentifiers->removeObject(idIndex);
+                                               uuid_unparse(uuid, uuid_key);
+                                               break;
+                                       }
                                }
                        }
-               }
 
-               if (!includeThis) {
-                       continue;
-               }
+                       if (!includeThis) {
+                               continue;
+                       }
 
-               kextInfo = thisKext->copyInfo(infoKeys);
-               if (kextInfo) {
-                       result->setObject(uuid_key, kextInfo);
-                       kextInfo->release();
-               }
+                       kextInfo = thisKext->copyInfo(infoKeys);
+                       if (kextInfo) {
+                               result->setObject(uuid_key, kextInfo);
+                               kextInfo->release();
+                       }
 
-               if (kextIdentifiers && !kextIdentifiers->getCount()) {
-                       break;
+                       if (kextIdentifiers && !kextIdentifiers->getCount()) {
+                               goto finish;
+                       }
                }
        }
 
@@ -9121,6 +9254,30 @@ OSKext::copyInfo(OSArray * infoKeys)
                                }
                                result->setObject(kOSBundleCPUSubtypeKey, cpuSubtypeNumber);
                        }
+               } else {
+                       if (isDriverKit() && _OSArrayContainsCString(infoKeys, kOSBundleLogStringsKey)) {
+                               osLogDataHeaderRef *header;
+                               char headerBytes[offsetof(osLogDataHeaderRef, sections) + NUM_OS_LOG_SECTIONS * sizeof(header->sections[0])];
+                               bool res;
+
+                               header             = (osLogDataHeaderRef *) headerBytes;
+                               header->version    = OS_LOG_HDR_VERSION;
+                               header->sect_count = NUM_OS_LOG_SECTIONS;
+                               header->sections[OS_LOG_SECT_IDX].sect_offset  = 0;
+                               header->sections[OS_LOG_SECT_IDX].sect_size    = (uint32_t) 0;
+                               header->sections[CSTRING_SECT_IDX].sect_offset = 0;
+                               header->sections[CSTRING_SECT_IDX].sect_size   = (uint32_t) 0;
+
+                               logData = OSData::withBytes(header, (u_int) (sizeof(osLogDataHeaderRef)));
+                               if (!logData) {
+                                       goto finish;
+                               }
+                               res = logData->appendBytes(&(header->sections[0]), (u_int)(header->sect_count * sizeof(header->sections[0])));
+                               if (!res) {
+                                       goto finish;
+                               }
+                               result->setObject(kOSBundleLogStringsKey, logData);
+                       }
                }
        }
 
@@ -9187,6 +9344,29 @@ OSKext::copyInfo(OSArray * infoKeys)
                        result->setObject(kOSBundleExecutablePathKey, executablePathString);
                } else if (flags.builtin) {
                        result->setObject(kOSBundleExecutablePathKey, bundleID);
+               } else if (isDriverKit()) {
+                       if (path) {
+                               // +1 for slash, +1 for \0
+                               uint32_t pathLength = path->getLength();
+                               executablePathCStringSize = pathLength + 2;
+
+                               executablePathCString = (char *)kalloc_tag((executablePathCStringSize) *
+                                   sizeof(char), VM_KERN_MEMORY_OSKEXT);
+                               if (!executablePathCString) {
+                                       goto finish;
+                               }
+                               strlcpy(executablePathCString, path->getCStringNoCopy(), executablePathCStringSize);
+                               executablePathCString[pathLength++] = '/';
+                               executablePathCString[pathLength++] = '\0';
+
+                               executablePathString = OSString::withCString(executablePathCString);
+
+                               if (!executablePathString) {
+                                       goto finish;
+                               }
+
+                               result->setObject(kOSBundleExecutablePathKey, executablePathString);
+                       }
                }
        }
 
@@ -9249,7 +9429,8 @@ OSKext::copyInfo(OSArray * infoKeys)
            _OSArrayContainsCString(infoKeys, kOSBundleExecLoadAddressKey) ||
            _OSArrayContainsCString(infoKeys, kOSBundleExecLoadSizeKey) ||
            _OSArrayContainsCString(infoKeys, kOSBundleWiredSizeKey)) {
-               if (isInterface() || flags.builtin || linkedExecutable) {
+               bool is_dext = isDriverKit();
+               if (isInterface() || flags.builtin || linkedExecutable || is_dext) {
                        /* These go to userspace via serialization, so we don't want any doubts
                         * about their size.
                         */
@@ -9299,6 +9480,15 @@ OSKext::copyInfo(OSArray * infoKeys)
                                } else {
                                        wiredSize = loadSize;
                                }
+                       } else if (is_dext) {
+                               /*
+                                * DriverKit userspace executables do not have a kernel linkedExecutable,
+                                * so we "fake" their address range with the LoadTag.
+                                */
+                               if (loadTag) {
+                                       loadAddress = execLoadAddress = loadTag;
+                                       loadSize = execLoadSize = 1;
+                               }
                        }
 
                        if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleLoadAddressKey)) {
@@ -9492,6 +9682,35 @@ finish:
        return result;
 }
 
+/*********************************************************************
+*********************************************************************/
+/* static */
+bool
+OSKext::copyUserExecutablePath(const OSSymbol * bundleID, char * pathResult, size_t pathSize)
+{
+       bool ok;
+       OSKext * kext;
+
+       IORecursiveLockLock(sKextLock);
+       kext = OSDynamicCast(OSKext, sKextsByID->getObject(bundleID));
+       if (kext) {
+               kext->retain();
+       }
+       IORecursiveLockUnlock(sKextLock);
+
+       if (!kext || !kext->path || !kext->userExecutableRelPath) {
+               OSSafeReleaseNULL(kext);
+               return false;
+       }
+       snprintf(pathResult, pathSize, "%s/Contents/MacOS/%s",
+           kext->path->getCStringNoCopy(),
+           kext->userExecutableRelPath->getCStringNoCopy());
+       ok = true;
+       kext->release();
+
+       return ok;
+}
+
 /*********************************************************************
 *********************************************************************/
 /* static */
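
copyUserExecutablePath() assembles the dext binary's on-disk location from the bundle path plus the relative path captured from CFBundleUEXTExecutable in initWithPrelinkedInfoDict(). A hedged caller sketch; the buffer size is the caller's choice:

    char path[MAXPATHLEN];
    if (OSKext::copyUserExecutablePath(bundleID, path, sizeof(path))) {
        // path now reads <bundle path>/Contents/MacOS/<executable>
    }
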
@@ -9690,6 +9909,64 @@ finish:
        return result;
 }
 
+OSReturn
+OSKext::requestDaemonLaunch(
+       OSString *kextIdentifier,
+       OSString *serverName,
+       OSNumber *serverTag)
+{
+       OSReturn       result        = kOSReturnError;
+       OSDictionary * requestDict   = NULL; // must release
+
+       if (!kextIdentifier || !serverName || !serverTag) {
+               result = kOSKextReturnInvalidArgument;
+               goto finish;
+       }
+
+       IORecursiveLockLock(sKextLock);
+
+       OSKextLog(/* kext */ NULL,
+           kOSKextLogDebugLevel |
+           kOSKextLogGeneralFlag,
+           "Requesting daemon launch for %s with serverName %s and tag %llu",
+           kextIdentifier->getCStringNoCopy(),
+           serverName->getCStringNoCopy(),
+           serverTag->unsigned64BitValue()
+           );
+
+       result = _OSKextCreateRequest(kKextRequestPredicateRequestDaemonLaunch, &requestDict);
+       if (result != kOSReturnSuccess) {
+               goto finish;
+       }
+
+       if (!_OSKextSetRequestArgument(requestDict,
+           kKextRequestArgumentBundleIdentifierKey, kextIdentifier) ||
+           !_OSKextSetRequestArgument(requestDict,
+           kKextRequestArgumentDriverExtensionServerName, serverName) ||
+           !_OSKextSetRequestArgument(requestDict,
+           kKextRequestArgumentDriverExtensionServerTag, serverTag)) {
+               result = kOSKextReturnNoMemory;
+               goto finish;
+       }
+
+       /* Only post the requests after all the other potential failure points
+        * have been passed.
+        */
+       if (!sKernelRequests->setObject(requestDict)) {
+               result = kOSKextReturnNoMemory;
+               goto finish;
+       }
+       OSKext::pingKextd();
+
+       result = kOSReturnSuccess;
+finish:
+       IORecursiveLockUnlock(sKextLock);
+       if (requestDict) {
+               requestDict->release();
+       }
+       return result;
+}
+
 /*********************************************************************
 * Assumes sKextLock is held.
 *********************************************************************/
@@ -11955,6 +12232,20 @@ OSKext::updateActiveAccount(OSKextActiveAccount *accountp)
        accountp->account = this->account;
 }
 
+bool
+OSKext::isDriverKit(void)
+{
+       OSString *bundleType;
+
+       if (infoDict) {
+               bundleType = OSDynamicCast(OSString, infoDict->getObject(kCFBundlePackageTypeKey));
+               if (bundleType && bundleType->isEqualTo(kOSKextBundlePackageTypeDriverKit)) {
+                       return TRUE;
+               }
+       }
+       return FALSE;
+}
+
 extern "C" const vm_allocation_site_t *
 OSKextGetAllocationSiteForCaller(uintptr_t address)
 {
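
isDriverKit() keys off the bundle's CFBundlePackageType. Assuming kOSKextBundlePackageTypeDriverKit names the "DEXT" package-type string (by analogy with "KEXT" for kernel extensions), the check reduces to:

    // Assumption: kOSKextBundlePackageTypeDriverKit expands to "DEXT".
    OSString *type = OSDynamicCast(OSString,
        infoDict->getObject(kCFBundlePackageTypeKey));
    bool isDext = (type && type->isEqualTo("DEXT"));
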
diff --git a/libkern/c++/OSMetaClass.cpp b/libkern/c++/OSMetaClass.cpp
index 7db8d37aaa1d534136e6a4efab248896d37585ce..0d564de95e67b6b51c40c498a140f7e0e8a29d9d 100644 (file)
--- a/libkern/c++/OSMetaClass.cpp
+++ b/libkern/c++/OSMetaClass.cpp
@@ -60,6 +60,7 @@ __BEGIN_DECLS
 #include <kern/thread_call.h>
 #include <kern/host.h>
 #include <mach/mach_interface.h>
+#include <stddef.h>
 
 #if PRAGMA_MARK
 #pragma mark Macros
@@ -144,14 +145,14 @@ OSMetaClassBase::_RESERVEDOSMetaClassBase2()
 {
        panic("OSMetaClassBase::_RESERVEDOSMetaClassBase%d called.", 2);
 }
-#endif /* SLOT_USED */
-
-// As these slots are used move them up inside the #if above
 void
 OSMetaClassBase::_RESERVEDOSMetaClassBase3()
 {
        panic("OSMetaClassBase::_RESERVEDOSMetaClassBase%d called.", 3);
 }
+#endif /* SLOT_USED */
+
+// As these slots are used move them up inside the #if above
 void
 OSMetaClassBase::_RESERVEDOSMetaClassBase4()
 {
@@ -169,13 +170,14 @@ OSMetaClassBase::_RESERVEDOSMetaClassBase6()
 }
 #endif
 
-
 /*********************************************************************
 *********************************************************************/
 
 #if defined(__arm__) || defined(__arm64__)
 
-
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif /* defined(HAS_APPLE_PAC) */
 
 /*
  *  IHI0059A "C++ Application Binary Interface Standard for the ARM 64-bit Architecture":
@@ -194,9 +196,16 @@ OSMetaClassBase::_RESERVEDOSMetaClassBase6()
  */
 
 OSMetaClassBase::_ptf_t
-OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void))
+#if defined(HAS_APPLE_PAC) && __has_feature(ptrauth_type_discriminator)
+OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self __attribute__((unused)),
+    void (OSMetaClassBase::*func)(void), uintptr_t typeDisc)
+#else
+OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self,
+    void (OSMetaClassBase::*func)(void),
+    uintptr_t typeDisc
+    __attribute__((unused)))
+#endif
 {
-       typedef long int ptrdiff_t;
        struct ptmf_t {
                _ptf_t fPFN;
                ptrdiff_t delta;
@@ -210,6 +219,13 @@ OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*
        map.fIn = func;
        pfn     = map.pTMF.fPFN;
 
+#if defined(HAS_APPLE_PAC) && __has_feature(ptrauth_type_discriminator)
+       // Authenticate 'pfn' using the member function pointer type discriminator
+       // and resign it as a C function pointer. 'pfn' can point to either a
+       // non-virtual function or a virtual member function thunk.
+       pfn = ptrauth_auth_function(pfn, ptrauth_key_function_pointer, typeDisc);
+       return pfn;
+#else
        if (map.pTMF.delta & 1) {
                // virtual
                union {
@@ -219,12 +235,33 @@ OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*
                u.fObj = self;
 
                // Virtual member function so dereference table
+#if defined(HAS_APPLE_PAC)
+               // The entity hash is stored in the top 32-bits of the vtable offset of a
+               // member function pointer.
+               uint32_t entity_hash = ((uintptr_t)pfn) >> 32;
+               pfn = (_ptf_t)(((uintptr_t) pfn) & 0xFFFFFFFF);
+
+               // Authenticate the vtable pointer.
+               _ptf_t *vtablep = ptrauth_auth_data(*u.vtablep,
+                   ptrauth_key_cxx_vtable_pointer, 0);
+               // Calculate the address of the vtable entry.
+               _ptf_t *vtentryp = (_ptf_t *)(((uintptr_t)vtablep) + (uintptr_t)pfn);
+               // Load the pointer from the vtable entry.
+               pfn = *vtentryp;
+
+               // Finally, resign the vtable entry as a function pointer.
+               uintptr_t auth_data = ptrauth_blend_discriminator(vtentryp, entity_hash);
+               pfn = ptrauth_auth_and_resign(pfn, ptrauth_key_function_pointer,
+                   auth_data, ptrauth_key_function_pointer, 0);
+#else /* defined(HAS_APPLE_PAC) */
                pfn = *(_ptf_t *)(((uintptr_t)*u.vtablep) + (uintptr_t)pfn);
+#endif /* !defined(HAS_APPLE_PAC) */
                return pfn;
        } else {
                // Not virtual, i.e. plain member func
                return pfn;
        }
+#endif
 }
 
 #endif /* defined(__arm__) || defined(__arm64__) */
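The non-PAC path above relies on the ARM C++ ABI (IHI0059A) encoding of a pointer-to-member-function as a pair of words, where bit 0 of the adjustment word flags a virtual call. A stand-alone sketch of that layout for ARM targets (on x86-64 the Itanium ABI keeps the flag in the low bit of the function word instead, so this prints differently there; reading the inactive union member is the same ABI-specific trick the kernel code uses):

#include <cstdio>
#include <cstddef>

struct Demo {
	void plain() {}
	virtual void virt() {}
};

struct ptmf_t {
	void    (*fPFN)(void);   // code address, or byte offset into the vtable
	ptrdiff_t delta;         // (this-adjustment << 1) | is_virtual
};

union map_t {
	void (Demo::*fIn)(void);
	ptmf_t pTMF;
};

int main(void)
{
	map_t map;
	map.fIn = &Demo::plain;
	printf("plain: delta & 1 = %ld\n", (long)(map.pTMF.delta & 1));  // 0: direct call
	map.fIn = &Demo::virt;
	printf("virt:  delta & 1 = %ld\n", (long)(map.pTMF.delta & 1));  // 1: via vtable
	return 0;
}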
@@ -243,7 +280,32 @@ OSMetaClassBase::safeMetaCast(
        const OSMetaClassBase * me,
        const OSMetaClass     * toType)
 {
-       return (me)? me->metaCast(toType) : 0;
+       return (me)? me->metaCast(toType) : NULL;
+}
+
+/// A helper function to crash with a kernel panic.
+__attribute__((cold, not_tail_called, noreturn))
+static inline void
+panic_crash_fail_cast(const OSMetaClassBase *me,
+    const OSMetaClass *toType)
+{
+       panic("Unexpected cast fail: from %p to %p", me, toType);
+       __builtin_unreachable();
+}
+
+OSMetaClassBase *
+OSMetaClassBase::requiredMetaCast(
+       const OSMetaClassBase * me,
+       const OSMetaClass     * toType)
+{
+       if (!me) {
+               return NULL;
+       }
+       OSMetaClassBase *tmp = safeMetaCast(me, toType);
+       if (!tmp) {
+               panic_crash_fail_cast(me, toType);
+       }
+       return tmp;
 }
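requiredMetaCast() mirrors safeMetaCast() but treats a failed cast as a programming error: a NULL input passes through, while an object of the wrong type panics via panic_crash_fail_cast(). A hedged usage sketch (obj and its provenance are hypothetical):

OSObject * obj = lookupSomeProperty();   // hypothetical; may return NULL
OSData   * d   = (OSData *) OSMetaClassBase::requiredMetaCast(obj, OSTypeID(OSData));
// d is NULL only when obj was NULL; any non-OSData object panics the kernel,
// where OSDynamicCast() would instead quietly return NULL on a type mismatch.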
 
 /*********************************************************************
@@ -254,7 +316,7 @@ OSMetaClassBase::checkTypeInst(
        const OSMetaClassBase * typeinst)
 {
        const OSMetaClass * toType = OSTypeIDInst(typeinst);
-       return typeinst && inst && (0 != inst->metaCast(toType));
+       return typeinst && inst && (NULL != inst->metaCast(toType));
 }
 
 /*********************************************************************
@@ -327,7 +389,7 @@ OSMetaClassBase *
 OSMetaClassBase::metaCast(const OSString * toMetaStr) const
 {
        const OSSymbol  * tempSymb = OSSymbol::withString(toMetaStr);
-       OSMetaClassBase * ret = 0;
+       OSMetaClassBase * ret = NULL;
        if (tempSymb) {
                ret = metaCast(tempSymb);
                tempSymb->release();
@@ -341,7 +403,7 @@ OSMetaClassBase *
 OSMetaClassBase::metaCast(const char * toMetaCStr) const
 {
        const OSSymbol  * tempSymb = OSSymbol::withCString(toMetaCStr);
-       OSMetaClassBase * ret = 0;
+       OSMetaClassBase * ret = NULL;
        if (tempSymb) {
                ret = metaCast(tempSymb);
                tempSymb->release();
@@ -362,13 +424,13 @@ public:
        OSObject * alloc() const;
 };
 OSMetaClassMeta::OSMetaClassMeta()
-       : OSMetaClass("OSMetaClass", 0, sizeof(OSMetaClass))
+       : OSMetaClass("OSMetaClass", NULL, sizeof(OSMetaClass))
 {
 }
 OSObject *
 OSMetaClassMeta::alloc() const
 {
-       return 0;
+       return NULL;
 }
 
 static OSMetaClassMeta sOSMetaClassMeta;
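Most of the remaining hunks in this file, and nearly all of the OSNumber, OSObject, OSOrderedSet, OSSet, OSString, OSSymbol, and OSUnserialize hunks further down, are a mechanical substitution of the literal 0 with NULL in pointer returns, initializers, and comparisons.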
@@ -496,6 +558,7 @@ OSMetaClass::logError(OSReturn error)
 * registration, and OSMetaClass::postModLoad(), which actually
 * records all the class/kext relationships of the new MetaClasses.
 *********************************************************************/
+
 OSMetaClass::OSMetaClass(
        const char        * inClassName,
        const OSMetaClass * inSuperClass,
@@ -568,7 +631,7 @@ OSMetaClass::OSMetaClass(
 *********************************************************************/
 OSMetaClass::~OSMetaClass()
 {
-       OSKext * myKext = reserved ? reserved->kext : 0; // do not release
+       OSKext * myKext = reserved ? reserved->kext : NULL; // do not release
 
        /* Hack alert: 'className' is a C string during early C++ init, and
         * is converted to a real OSSymbol only when we record the OSKext in
@@ -698,7 +761,7 @@ OSMetaClass::preModLoad(const char * kextIdentifier)
                    kalloc_tag(kKModCapacityIncrement * sizeof(OSMetaClass *), VM_KERN_MEMORY_OSKEXT);
                if (!sStalled->classes) {
                        kfree(sStalled, sizeof(*sStalled));
-                       return 0;
+                       return NULL;
                }
                OSMETA_ACCUMSIZE((kKModCapacityIncrement * sizeof(OSMetaClass *)) +
                    sizeof(*sStalled));
@@ -730,8 +793,8 @@ OSReturn
 OSMetaClass::postModLoad(void * loadHandle)
 {
        OSReturn         result     = kOSReturnSuccess;
-       OSSymbol       * myKextName = 0;// must release
-       OSKext         * myKext     = 0;// must release
+       OSSymbol       * myKextName = NULL;// must release
+       OSKext         * myKext     = NULL;// must release
 
        if (!sStalled || loadHandle != sStalled) {
                result = kOSMetaClassInternal;
@@ -882,7 +945,7 @@ finish:
                    sizeof(*sStalled)));
                kfree(sStalled->classes, sStalled->capacity * sizeof(OSMetaClass *));
                kfree(sStalled, sizeof(*sStalled));
-               sStalled = 0;
+               sStalled = NULL;
        }
 
        IOLockUnlock(sStalledClassesLock);
@@ -988,7 +1051,7 @@ OSMetaClass::removeInstance(const OSObject * instance, bool super) const
                        }
                         IOLockLock(sAllClassesLock);
                         reserved->instances->release();
-                        reserved->instances = 0;
+                        reserved->instances = NULL;
                         IOLockUnlock(sAllClassesLock);
                }
        }
@@ -1072,7 +1135,7 @@ OSMetaClass::applyToInstancesOfClassName(
        void * context)
 {
         OSMetaClass  * meta;
-        OSOrderedSet * set = 0;
+        OSOrderedSet * set = NULL;
 
         IOLockLock(sAllClassesLock);
         if (sAllClassesDict
@@ -1144,10 +1207,10 @@ OSMetaClass::removeClasses(OSCollection * metaClasses)
 const OSMetaClass *
 OSMetaClass::getMetaClassWithName(const OSSymbol * name)
 {
-        OSMetaClass * retMeta = 0;
+        OSMetaClass * retMeta = NULL;
 
         if (!name) {
-                return 0;
+                return NULL;
        }
 
         IOLockLock(sAllClassesLock);
@@ -1167,10 +1230,10 @@ OSMetaClass::copyMetaClassWithName(const OSSymbol * name)
         const OSMetaClass * meta;
 
         if (!name) {
-                return 0;
+                return NULL;
        }
 
-        meta = 0;
+        meta = NULL;
         IOLockLock(sAllClassesLock);
         if (sAllClassesDict) {
                 meta = (OSMetaClass *) sAllClassesDict->getObject(name);
@@ -1199,7 +1262,7 @@ OSMetaClass::allocClassWithName(const OSSymbol * name)
         const OSMetaClass * meta;
         OSObject          * result;
 
-        result = 0;
+        result = NULL;
         meta = copyMetaClassWithName(name);
         if (meta) {
                 result = meta->alloc();
@@ -1239,7 +1302,7 @@ OSMetaClass::checkMetaCastWithName(
        const OSSymbol        * name,
        const OSMetaClassBase * in)
 {
-        OSMetaClassBase * result = 0;
+        OSMetaClassBase * result = NULL;
 
         const OSMetaClass * const meta = getMetaClassWithName(name);
 
@@ -1305,11 +1368,12 @@ OSMetaClass::checkMetaCast(
                }
        }
 
-        return 0;
+        return NULL;
 }
 
 /*********************************************************************
 *********************************************************************/
+__dead2
 void
 OSMetaClass::reservedCalled(int ind) const
 {
@@ -1332,7 +1396,7 @@ OSMetaClass::getSuperClass() const
 const OSSymbol *
 OSMetaClass::getKmodName() const
 {
-        OSKext * myKext = reserved ? reserved->kext : 0;
+        OSKext * myKext = reserved ? reserved->kext : NULL;
         if (myKext) {
                 return myKext->getIdentifier();
        }
@@ -1383,7 +1447,7 @@ OSDictionary *
 OSMetaClass::getClassDictionary()
 {
         panic("OSMetaClass::getClassDictionary() is obsoleted.\n");
-        return 0;
+        return NULL;
 }
 
 /*********************************************************************
diff --git a/libkern/c++/OSNumber.cpp b/libkern/c++/OSNumber.cpp
index ffbc9e79359d00bfee6eb82d524769103ff8cca0..6b6a6caae738ae76930fae15f0b0d05f75ba6806 100644 (file)
@@ -85,7 +85,7 @@ OSNumber::withNumber(unsigned long long value,
 
        if (me && !me->init(value, newNumberOfBits)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -98,7 +98,7 @@ OSNumber::withNumber(const char *value, unsigned int newNumberOfBits)
 
        if (me && !me->init(value, newNumberOfBits)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
diff --git a/libkern/c++/OSObject.cpp b/libkern/c++/OSObject.cpp
index de9cc00fa5c2fe94f89b35a8768b3ac43eece2bb..61168bf48cc58f3782be2511f09e1d9c6382ffed 100644 (file)
@@ -50,7 +50,7 @@ __END_DECLS
 /* Class global data */
 OSObject::MetaClass OSObject::gMetaClass;
 const OSMetaClass * const OSObject::metaClass = &OSObject::gMetaClass;
-const OSMetaClass * const OSObject::superClass = 0;
+const OSMetaClass * const OSObject::superClass = NULL;
 
 /* Class member functions - Can't use defaults */
 OSObject::~OSObject()
@@ -64,7 +64,7 @@ OSObject::getMetaClass() const
 OSObject *
 OSObject::MetaClass::alloc() const
 {
-       return 0;
+       return NULL;
 }
 
 /* The OSObject::MetaClass constructor */
@@ -233,13 +233,13 @@ OSObject::taggedRelease(const void *tag, const int when) const
 void
 OSObject::release() const
 {
-       taggedRelease(0);
+       taggedRelease(NULL);
 }
 
 void
 OSObject::retain() const
 {
-       taggedRetain(0);
+       taggedRetain(NULL);
 }
 
 extern "C" void
@@ -257,7 +257,7 @@ osobject_release(void * object)
 void
 OSObject::release(int when) const
 {
-       taggedRelease(0, when);
+       taggedRelease(NULL, when);
 }
 
 bool
@@ -365,3 +365,24 @@ OSObject::OSObject(const OSMetaClass *)
        retainCount = 1;
 //    if (kIOTracking & gIOKitDebug) getMetaClass()->trackedInstance(this);
 }
+
+
+bool
+OSObject::iterateObjects(void * refcon, bool (*callback)(void * refcon, OSObject * object))
+{
+       OSCollection * col;
+       if ((col = OSDynamicCast(OSCollection, this))) {
+               return col->iterateObjects(refcon, callback);
+       }
+       return callback(refcon, this);
+}
+
+bool
+OSObject::iterateObjects(bool (^block)(OSObject * object))
+{
+       OSCollection * col;
+       if ((col = OSDynamicCast(OSCollection, this))) {
+               return col->iterateObjects(block);
+       }
+       return block(this);
+}
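The two new overloads let callers walk a lone object and a collection uniformly: when "this" is an OSCollection the walk is delegated to it, otherwise the callback fires exactly once for the object itself. A hedged usage sketch of the block form (anObject is hypothetical; returning true stops the walk early, matching OSCollection::iterateObjects):

__block unsigned int n = 0;
anObject->iterateObjects(^bool (OSObject * obj __unused) {
	n++;             // counts collection members, or 1 for a lone object
	return false;    // keep iterating
});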
diff --git a/libkern/c++/OSOrderedSet.cpp b/libkern/c++/OSOrderedSet.cpp
index 2b7cd44c8ec47909973c59118ec836fb8fbe3e96..88230f6591c69fcc4415f0ab0f0cd0087b6479ee 100644 (file)
@@ -93,7 +93,7 @@ withCapacity(unsigned int capacity,
 
        if (me && !me->initWithCapacity(capacity, ordering, orderingRef)) {
                me->release();
-               me = 0;
+               me = NULL;
        }
 
        return me;
@@ -298,7 +298,7 @@ OSObject *
 OSOrderedSet::getObject( unsigned int index ) const
 {
        if (index >= count) {
-               return 0;
+               return NULL;
        }
 
 //    if( pri)
@@ -313,7 +313,7 @@ OSOrderedSet::getFirstObject() const
        if (count) {
                return const_cast<OSObject *>((const OSObject *) array[0].obj);
        } else {
-               return 0;
+               return NULL;
        }
 }
 
@@ -323,14 +323,14 @@ OSOrderedSet::getLastObject() const
        if (count) {
                return const_cast<OSObject *>((const OSObject *) array[count - 1].obj);
        } else {
-               return 0;
+               return NULL;
        }
 }
 
 SInt32
 OSOrderedSet::orderObject( const OSMetaClassBase * anObject )
 {
-       return ORDER( anObject, 0 );
+       return ORDER( anObject, NULL );
 }
 
 void *
@@ -399,10 +399,10 @@ getNextObjectForIterator(void *inIterator, OSObject **ret) const
        if (index < count) {
                *ret = const_cast<OSObject *>((const OSObject *) array[index].obj);
        } else {
-               *ret = 0;
+               *ret = NULL;
        }
 
-       return *ret != 0;
+       return *ret != NULL;
 }
 
 
@@ -427,13 +427,13 @@ OSCollection *
 OSOrderedSet::copyCollection(OSDictionary *cycleDict)
 {
        bool allocDict = !cycleDict;
-       OSCollection *ret = 0;
-       OSOrderedSet *newSet = 0;
+       OSCollection *ret = NULL;
+       OSOrderedSet *newSet = NULL;
 
        if (allocDict) {
                cycleDict = OSDictionary::withCapacity(16);
                if (!cycleDict) {
-                       return 0;
+                       return NULL;
                }
        }
 
@@ -474,7 +474,7 @@ OSOrderedSet::copyCollection(OSDictionary *cycleDict)
                ;
 
                ret = newSet;
-               newSet = 0;
+               newSet = NULL;
        } while (false);
 
 abortCopy:
diff --git a/libkern/c++/OSRuntime.cpp b/libkern/c++/OSRuntime.cpp
index 122acda60a68b5254cbeac64150f05dd9e983487..ba1dd30b1ed8e902fe2bba8c5ea9114fc38edc33 100644 (file)
@@ -45,6 +45,10 @@ __BEGIN_DECLS
 #include <libkern/prelink.h>
 #include <stdarg.h>
 
+#if KASAN
+#include <san/kasan.h>
+#endif
+
 #if PRAGMA_MARK
 #pragma mark Constants &c.
 #endif /* PRAGMA_MARK */
@@ -95,12 +99,12 @@ kern_os_malloc(size_t size)
 {
        void *mem;
        if (size == 0) {
-               return 0;
+               return NULL;
        }
 
        mem = kallocp_tag_bt((vm_size_t *)&size, VM_KERN_MEMORY_LIBKERN);
        if (!mem) {
-               return 0;
+               return NULL;
        }
 
 #if OSALLOCDEBUG
@@ -147,13 +151,13 @@ kern_os_realloc(
 
        if (nsize == 0) {
                kfree_addr(addr);
-               return 0;
+               return NULL;
        }
 
        nmem = kallocp_tag_bt((vm_size_t *)&nsize, VM_KERN_MEMORY_LIBKERN);
        if (!nmem) {
                kfree_addr(addr);
-               return 0;
+               return NULL;
        }
 
 #if OSALLOCDEBUG
@@ -177,13 +181,13 @@ kern_os_realloc(
 *********************************************************************/
 
 #if __GNUC__ >= 3
-void
+void __dead2
 __cxa_pure_virtual( void )
 {
        panic("%s", __FUNCTION__);
 }
 #else
-void
+void __dead2
 __pure_virtual( void )
 {
        panic("%s", __FUNCTION__);
@@ -236,6 +240,9 @@ __END_DECLS
 * kern_os C++ Runtime Load/Unload
 *********************************************************************/
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif /* defined(HAS_APPLE_PAC) */
 
 typedef void (*structor_t)(void);
 
@@ -310,6 +317,10 @@ OSRuntimeCallStructorsInSection(
                                        break;
                                }
 
+#if !defined(XXX) && defined(HAS_APPLE_PAC)
+                               structor = __builtin_ptrauth_strip(structor, ptrauth_key_function_pointer);
+                               structor = __builtin_ptrauth_sign_unauthenticated(structor, ptrauth_key_function_pointer, 0);
+#endif
                                (*structor)();
                        } else if (!hit_null_structor) {
                                hit_null_structor = 1;
@@ -393,7 +404,7 @@ OSRuntimeFinalizeCPP(
        segment = firstsegfromheader(header);
 
        for (segment = firstsegfromheader(header);
-           segment != 0;
+           segment != NULL;
            segment = nextsegfromheader(header, segment)) {
                OSRuntimeCallStructorsInSection(theKext, kmodInfo, NULL, segment,
                    sectionNames[kOSSectionNameFinalizer], textStart, textEnd);
@@ -487,7 +498,7 @@ OSRuntimeInitializeCPP(
                 * segment, and invoke the constructors within those sections.
                 */
                for (segment = firstsegfromheader(header);
-                   segment != failure_segment && segment != 0;
+                   segment != failure_segment && segment != NULL;
                    segment = nextsegfromheader(header, segment)) {
                        OSRuntimeCallStructorsInSection(theKext, kmodInfo, NULL, segment,
                            sectionNames[kOSSectionNameFinalizer], textStart, textEnd);
@@ -572,11 +583,42 @@ noexcept
 #endif
 {
        if (ptr) {
+#if KASAN
+               /*
+                * Unpoison the C++ array cookie inserted (but not removed) by the
+                * compiler on new[].
+                */
+               kasan_unpoison_cxx_array_cookie(ptr);
+#endif
                kern_os_free(ptr);
        }
        return;
 }
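The kasan_unpoison_cxx_array_cookie() call exists because new[] of a class with a non-trivial destructor prepends a hidden "array cookie" holding the element count, which KASAN poisons to catch stray accesses; delete[] must unpoison it before the whole block, cookie included, goes back to kern_os_free(). Roughly, under the Itanium C++ ABI (a sketch, not xnu code):

// raw = operator new[](n * sizeof(T) + sizeof(size_t));  // cookie space first
// *(size_t *)raw = n;                  // element count, read back by delete[]
// return (char *)raw + sizeof(size_t);                   // pointer handed out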
 
+#if __cplusplus >= 201103L
+
+void
+operator delete(void * addr, size_t sz) noexcept
+{
+#if OSALLOCDEBUG
+       OSAddAtomic(-sz, &debug_iomalloc_size);
+#endif /* OSALLOCDEBUG */
+       kfree(addr, sz);
+}
+
+void
+operator delete[](void * addr, size_t sz) noexcept
+{
+       if (addr) {
+#if OSALLOCDEBUG
+               OSAddAtomic(-sz, &debug_iomalloc_size);
+#endif /* OSALLOCDEBUG */
+               kfree(addr, sz);
+       }
+}
+
+#endif /* __cplusplus >= 201103L */
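The guarded overloads above add C++14 sized deallocation: when the compiler statically knows the size of the object being deleted, "delete p" lowers to operator delete(p, sizeof *p), letting kfree() skip an allocation-size lookup (the guard tests for C++11, though the overloads were standardized in C++14). A hedged illustration of what the compiler emits:

struct Foo { long a, b; };
Foo * f = new Foo();   // operator new(sizeof(Foo))
delete f;              // operator delete(f, sizeof(Foo)), i.e. kfree(f, 16)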
+
 /* PR-6481964 - The compiler is going to check for size overflows in calls to
  * new[], and if there is an overflow, it will call __throw_length_error.
  * This is an unrecoverable error by the C++ standard, so we must panic here.
@@ -585,7 +627,7 @@ noexcept
  * compiler expects the name to be mangled.
  */
 namespace std {
-void
+void __dead2
 __throw_length_error(const char *msg __unused)
 {
        panic("Size of array created by new[] has overflowed");
diff --git a/libkern/c++/OSSerialize.cpp b/libkern/c++/OSSerialize.cpp
index a0366f02d813ec026e009097de6c87ae7f50e57d..d015efe9e4dd43f2efb260b2a96bbe06a8003614 100644 (file)
@@ -205,7 +205,7 @@ OSSerialize::initWithCapacity(unsigned int inCapacity)
        }
        if (round_page_overflow(inCapacity, &capacity)) {
                tags->release();
-               tags = 0;
+               tags = NULL;
                return false;
        }
 
@@ -217,7 +217,7 @@ OSSerialize::initWithCapacity(unsigned int inCapacity)
        kern_return_t rc = kmem_alloc(kernel_map, (vm_offset_t *)&data, capacity, IOMemoryTag(kernel_map));
        if (rc) {
                tags->release();
-               tags = 0;
+               tags = NULL;
                return false;
        }
        bzero((void *)data, capacity);
@@ -235,7 +235,7 @@ OSSerialize::withCapacity(unsigned int inCapacity)
 
        if (me && !me->initWithCapacity(inCapacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -303,9 +303,8 @@ OSSerialize::ensureCapacity(unsigned int newCapacity)
 void
 OSSerialize::free()
 {
-       if (tags) {
-               tags->release();
-       }
+       OSSafeReleaseNULL(tags);
+       OSSafeReleaseNULL(indexData);
 
        if (data) {
                kmem_free(kernel_map, (vm_offset_t)data, capacity);
@@ -325,7 +324,7 @@ OSSerializer * OSSerializer::forTarget( void * target,
        thing = new OSSerializer;
        if (thing && !thing->init()) {
                thing->release();
-               thing = 0;
+               thing = NULL;
        }
 
        if (thing) {
@@ -352,7 +351,7 @@ OSSerializer::withBlock(
 
        block = Block_copy(callback);
        if (!block) {
-               return 0;
+               return NULL;
        }
 
        serializer = (OSSerializer::forTarget(NULL, &OSSerializer::callbackToBlock, block));
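OSSerializer::withBlock() heap-copies the block with Block_copy() and adapts it to the older target/callback interface through callbackToBlock. A hedged usage sketch, assuming the OSSerializerBlock signature is bool (^)(OSSerialize *):

OSSerializer * s = OSSerializer::withBlock(^bool (OSSerialize * ser) {
	return kOSBooleanTrue->serialize(ser);   // produce output on demand
});
// s->serialize(...) now invokes the block lazily; release s when finished.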
diff --git a/libkern/c++/OSSerializeBinary.cpp b/libkern/c++/OSSerializeBinary.cpp
index b408296c40aa6a70fed1a7138bb85cd32acdf915..a9d9ed61c02c38e150aa953169d269dc6ba75c14 100644 (file)
@@ -51,11 +51,11 @@ OSSerialize::binaryWithCapacity(unsigned int inCapacity,
        OSSerialize *me;
 
        if (inCapacity < sizeof(uint32_t)) {
-               return 0;
+               return NULL;
        }
        me = OSSerialize::withCapacity(inCapacity);
        if (!me) {
-               return 0;
+               return NULL;
        }
 
        me->binary        = true;
@@ -98,17 +98,38 @@ OSSerialize::addBinary(const void * bits, size_t size)
        return true;
 }
 
+void
+OSSerialize::setIndexed(bool index __unused)
+{
+       assert(index && !indexData);
+       indexData = OSData::withCapacity(256);
+       assert(indexData);
+}
+
 bool
 OSSerialize::addBinaryObject(const OSMetaClassBase * o, uint32_t key,
-    const void * bits, size_t size)
+    const void * bits, size_t size,
+    uint32_t * startCollection)
 {
        unsigned int newCapacity;
        size_t       alignSize;
+       size_t       headerSize;
 
        // add to tag array
        tags->setObject(o);
 
-       if (os_add3_overflow(size, sizeof(key), 3, &alignSize)) {
+       headerSize = sizeof(key);
+       if (indexData) {
+               uint32_t offset = length;
+               if (startCollection) {
+                       *startCollection = offset;
+                       headerSize += sizeof(uint32_t);
+               }
+               offset /= sizeof(uint32_t);
+               indexData->appendBytes(&offset, sizeof(offset));
+       }
+
+       if (os_add3_overflow(size, headerSize, 3, &alignSize)) {
                return false;
        }
        alignSize &= ~3L;
@@ -131,14 +152,58 @@ OSSerialize::addBinaryObject(const OSMetaClassBase * o, uint32_t key,
        }
 
        bcopy(&key, &data[length], sizeof(key));
-       bcopy(bits, &data[length + sizeof(key)], size);
+       bcopy(bits, &data[length + headerSize], size);
        length += alignSize;
 
        return true;
 }
 
+void
+OSSerialize::endBinaryCollection(uint32_t startCollection)
+{
+       uint32_t clength;
+
+       if (!indexData) {
+               return;
+       }
+
+       assert(length > startCollection);
+       if (length <= startCollection) {
+               return;
+       }
+
+       clength = length - startCollection;
+       assert(!(clength & 3));
+       clength /= sizeof(uint32_t);
+
+       memcpy(&data[startCollection + sizeof(uint32_t)], &clength, sizeof(clength));
+}
+
 bool
 OSSerialize::binarySerialize(const OSMetaClassBase *o)
+{
+       bool ok;
+       uint32_t header;
+
+       ok = binarySerializeInternal(o);
+       if (!ok) {
+               return ok;
+       }
+
+       if (indexData) {
+               header = indexData->getLength() / sizeof(uint32_t);
+               assert(header <= kOSSerializeDataMask);
+               header <<= 8;
+               header |= kOSSerializeIndexedBinarySignature;
+
+               memcpy(&data[0], &header, sizeof(header));
+       }
+
+       return ok;
+}
+
+bool
+OSSerialize::binarySerializeInternal(const OSMetaClassBase *o)
 {
        OSDictionary * dict;
        OSArray      * array;
@@ -150,13 +215,18 @@ OSSerialize::binarySerialize(const OSMetaClassBase *o)
        OSBoolean    * boo;
 
        unsigned int  tagIdx;
-       uint32_t   i, key;
+       uint32_t   i, key, startCollection;
        size_t     len;
        bool       ok;
 
        tagIdx = tags->getNextIndexOfObject(o, 0);
        // does it exist?
        if (-1U != tagIdx) {
+               if (indexData) {
+                       assert(indexData->getLength() > (tagIdx * sizeof(uint32_t)));
+                       tagIdx = ((const uint32_t *)indexData->getBytesNoCopy())[tagIdx];
+                       assert(tagIdx <= kOSSerializeDataMask);
+               }
                key = (kOSSerializeObject | tagIdx);
                if (endCollection) {
                        endCollection = false;
@@ -168,11 +238,11 @@ OSSerialize::binarySerialize(const OSMetaClassBase *o)
 
        if ((dict = OSDynamicCast(OSDictionary, o))) {
                key = (kOSSerializeDictionary | dict->count);
-               ok = addBinaryObject(o, key, NULL, 0);
+               ok = addBinaryObject(o, key, NULL, 0, &startCollection);
                for (i = 0; ok && (i < dict->count);) {
                        const OSSymbol        * dictKey;
                        const OSMetaClassBase * dictValue;
-                       const OSMetaClassBase * nvalue = 0;
+                       const OSMetaClassBase * nvalue = NULL;
 
                        dictKey = dict->dictionary[i].key;
                        dictValue = dict->dictionary[i].value;
@@ -197,9 +267,10 @@ OSSerialize::binarySerialize(const OSMetaClassBase *o)
                        }
 //                     if (!ok) ok = binarySerialize(kOSBooleanFalse);
                }
+               endBinaryCollection(startCollection);
        } else if ((array = OSDynamicCast(OSArray, o))) {
                key = (kOSSerializeArray | array->count);
-               ok = addBinaryObject(o, key, NULL, 0);
+               ok = addBinaryObject(o, key, NULL, 0, &startCollection);
                for (i = 0; ok && (i < array->count);) {
                        i++;
                        endCollection = (i == array->count);
@@ -209,9 +280,10 @@ OSSerialize::binarySerialize(const OSMetaClassBase *o)
                        }
 //                     if (!ok) ok = binarySerialize(kOSBooleanFalse);
                }
+               endBinaryCollection(startCollection);
        } else if ((set = OSDynamicCast(OSSet, o))) {
                key = (kOSSerializeSet | set->members->count);
-               ok = addBinaryObject(o, key, NULL, 0);
+               ok = addBinaryObject(o, key, NULL, 0, &startCollection);
                for (i = 0; ok && (i < set->members->count);) {
                        i++;
                        endCollection = (i == set->members->count);
@@ -221,27 +293,28 @@ OSSerialize::binarySerialize(const OSMetaClassBase *o)
                        }
 //                     if (!ok) ok = binarySerialize(kOSBooleanFalse);
                }
+               endBinaryCollection(startCollection);
        } else if ((num = OSDynamicCast(OSNumber, o))) {
                key = (kOSSerializeNumber | num->size);
-               ok = addBinaryObject(o, key, &num->value, sizeof(num->value));
+               ok = addBinaryObject(o, key, &num->value, sizeof(num->value), NULL);
        } else if ((boo = OSDynamicCast(OSBoolean, o))) {
                key = (kOSSerializeBoolean | (kOSBooleanTrue == boo));
-               ok = addBinaryObject(o, key, NULL, 0);
+               ok = addBinaryObject(o, key, NULL, 0, NULL);
        } else if ((sym = OSDynamicCast(OSSymbol, o))) {
                len = (sym->getLength() + 1);
                key = (kOSSerializeSymbol | len);
-               ok = addBinaryObject(o, key, sym->getCStringNoCopy(), len);
+               ok = addBinaryObject(o, key, sym->getCStringNoCopy(), len, NULL);
        } else if ((str = OSDynamicCast(OSString, o))) {
-               len = (str->getLength() + 0);
+               len = (str->getLength() + ((indexData != NULL) ? 1 : 0));
                key = (kOSSerializeString | len);
-               ok = addBinaryObject(o, key, str->getCStringNoCopy(), len);
+               ok = addBinaryObject(o, key, str->getCStringNoCopy(), len, NULL);
        } else if ((ldata = OSDynamicCast(OSData, o))) {
                len = ldata->getLength();
                if (ldata->reserved && ldata->reserved->disableSerialization) {
                        len = 0;
                }
                key = (kOSSerializeData | len);
-               ok = addBinaryObject(o, key, ldata->getBytesNoCopy(), len);
+               ok = addBinaryObject(o, key, ldata->getBytesNoCopy(), len, NULL);
        } else {
                return false;
        }
@@ -303,23 +376,28 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin
 
        size_t           bufferPos;
        const uint32_t * next;
-       uint32_t         key, len, wordLen;
+       uint32_t         key, len, wordLen, length;
        bool             end, newCollect, isRef;
        unsigned long long value;
-       bool ok;
+       bool ok, indexed, hasLength;
 
+       indexed = false;
        if (errorString) {
-               *errorString = 0;
+               *errorString = NULL;
        }
+
        if (bufferSize < sizeof(kOSSerializeBinarySignature)) {
                return NULL;
        }
-       if (0 != strcmp(kOSSerializeBinarySignature, buffer)) {
+       if (kOSSerializeIndexedBinarySignature == (((const uint8_t *) buffer)[0])) {
+               indexed = true;
+       } else if (0 != strcmp(kOSSerializeBinarySignature, buffer)) {
                return NULL;
        }
        if (3 & ((uintptr_t) buffer)) {
                return NULL;
        }
+
        bufferPos = sizeof(kOSSerializeBinarySignature);
        next = (typeof(next))(((uintptr_t) buffer) + bufferPos);
 
@@ -329,12 +407,12 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin
        objsIdx   = objsCapacity  = 0;
        stackIdx  = stackCapacity = 0;
 
-       result   = 0;
-       parent   = 0;
-       dict     = 0;
-       array    = 0;
-       set      = 0;
-       sym      = 0;
+       result   = NULL;
+       parent   = NULL;
+       dict     = NULL;
+       array    = NULL;
+       set      = NULL;
+       sym      = NULL;
 
        ok = true;
        while (ok) {
@@ -343,27 +421,31 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin
                        break;
                }
                key = *next++;
+               length = 0;
 
                len = (key & kOSSerializeDataMask);
                wordLen = (len + 3) >> 2;
                end = (0 != (kOSSerializeEndCollecton & key));
                DEBG("key 0x%08x: 0x%04x, %d\n", key, len, end);
 
-               newCollect = isRef = false;
-               o = 0; newDict = 0; newArray = 0; newSet = 0;
+               newCollect = isRef = hasLength = false;
+               o = NULL; newDict = NULL; newArray = NULL; newSet = NULL;
 
                switch (kOSSerializeTypeMask & key) {
                case kOSSerializeDictionary:
                        o = newDict = OSDictionary::withCapacity(len);
                        newCollect = (len != 0);
+                       hasLength  = indexed;
                        break;
                case kOSSerializeArray:
                        o = newArray = OSArray::withCapacity(len);
                        newCollect = (len != 0);
+                       hasLength  = indexed;
                        break;
                case kOSSerializeSet:
                        o = newSet = OSSet::withCapacity(len);
                        newCollect = (len != 0);
+                       hasLength  = indexed;
                        break;
 
                case kOSSerializeObject:
@@ -430,10 +512,18 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin
                        break;
                }
 
-               if (!(ok = (o != 0))) {
+               if (!(ok = (o != NULL))) {
                        break;
                }
 
+               if (hasLength) {
+                       bufferPos += sizeof(*next);
+                       if (!(ok = (bufferPos <= bufferSize))) {
+                               break;
+                       }
+                       length = *next++;
+               }
+
                if (!isRef) {
                        setAtIndex(objs, objsIdx, o);
                        if (!ok) {
@@ -451,7 +541,7 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin
                                sym = OSDynamicCast(OSSymbol, sym);
                                if (!sym && (str = OSDynamicCast(OSString, str))) {
                                        sym = const_cast<OSSymbol *>(OSSymbol::withString(str));
-                                       ok = (sym != 0);
+                                       ok = (sym != NULL);
                                        if (!ok) {
                                                break;
                                        }
@@ -463,7 +553,7 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin
                                if (sym && (sym != str)) {
                                        sym->release();
                                }
-                               sym = 0;
+                               sym = NULL;
                        }
                } else if (array) {
                        ok = array->setObject(o);
@@ -481,7 +571,7 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin
                }
 
                if (end) {
-                       parent = 0;
+                       parent = NULL;
                }
                if (newCollect) {
                        stackIdx++;
@@ -509,12 +599,12 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin
                        if (!parent) {
                                break;
                        }
-                       set   = 0;
-                       dict  = 0;
-                       array = 0;
+                       set   = NULL;
+                       dict  = NULL;
+                       array = NULL;
                        if (!(dict = OSDynamicCast(OSDictionary, parent))) {
                                if (!(array = OSDynamicCast(OSArray, parent))) {
-                                       ok = (0 != (set = OSDynamicCast(OSSet, parent)));
+                                       ok = (NULL != (set = OSDynamicCast(OSSet, parent)));
                                }
                        }
                }
@@ -522,11 +612,11 @@ OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorStrin
        DEBG("ret %p\n", result);
 
        if (!ok) {
-               result = 0;
+               result = NULL;
        }
 
        if (objsCapacity) {
-               for (len = (result != 0); len < objsIdx; len++) {
+               for (len = (result != NULL); len < objsIdx; len++) {
                        objsArray[len]->release();
                }
                kfree(objsArray, objsCapacity  * sizeof(*objsArray));
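Taken together, the OSSerializeBinary.cpp changes above define an indexed variant of the binary serialization stream (a layout inferred from this diff, not an authoritative format description):

word 0:            kOSSerializeIndexedBinarySignature | (object count << 8)
per object:
    key word:      type | flags | len        (unchanged from the plain format)
    length word:   collection size in words  (collections only, indexed only;
                                              patched in by endBinaryCollection)
    payload:       len bytes, padded to a 4-byte boundary

indexData keeps one 32-bit word offset per serialized object, so a back-reference (kOSSerializeObject | tagIdx) resolves through the table instead of replaying the stream; indexed strings additionally carry their NUL terminator, as the (indexData != NULL) length adjustment above shows.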
diff --git a/libkern/c++/OSSet.cpp b/libkern/c++/OSSet.cpp
index 3c7701dcf4abe9970fb98bd687d306676a7dfb4d..ed8b2762cea037e22eed05c465de6bbeadc83152 100644 (file)
@@ -119,7 +119,7 @@ OSSet::withCapacity(unsigned int capacity)
 
        if (me && !me->initWithCapacity(capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -134,7 +134,7 @@ OSSet::withObjects(const OSObject *objects[],
 
        if (me && !me->initWithObjects(objects, count, capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -148,7 +148,7 @@ OSSet::withArray(const OSArray *array,
 
        if (me && !me->initWithArray(array, capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -162,7 +162,7 @@ OSSet::withSet(const OSSet *set,
 
        if (me && !me->initWithSet(set, capacity)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -230,7 +230,7 @@ OSSet::setObject(const OSMetaClassBase *anObject)
 bool
 OSSet::merge(const OSArray * array)
 {
-       const OSMetaClassBase * anObject = 0;
+       const OSMetaClassBase * anObject = NULL;
        bool                    result   = true;
 
        for (int i = 0; (anObject = array->getObject(i)); i++) {
@@ -367,10 +367,10 @@ OSSet::getNextObjectForIterator(void *inIterator, OSObject **ret) const
        if (index < members->count) {
                *ret = members->getObject(index);
        } else {
-               *ret = 0;
+               *ret = NULL;
        }
 
-       return *ret != 0;
+       return *ret != NULL;
 }
 
 bool
@@ -410,13 +410,13 @@ OSCollection *
 OSSet::copyCollection(OSDictionary *cycleDict)
 {
        bool allocDict = !cycleDict;
-       OSCollection *ret = 0;
-       OSSet *newSet = 0;
+       OSCollection *ret = NULL;
+       OSSet *newSet = NULL;
 
        if (allocDict) {
                cycleDict = OSDictionary::withCapacity(16);
                if (!cycleDict) {
-                       return 0;
+                       return NULL;
                }
        }
 
@@ -455,7 +455,7 @@ OSSet::copyCollection(OSDictionary *cycleDict)
                ;
 
                ret = newSet;
-               newSet = 0;
+               newSet = NULL;
        } while (false);
 
 abortCopy:
diff --git a/libkern/c++/OSString.cpp b/libkern/c++/OSString.cpp
index 91fc3cba2fe4b79df7b6a25b12cbb0e25e8c54a4..c5196917c708efa618cd434b376af30f81ce5214 100644 (file)
@@ -165,7 +165,7 @@ OSString::withString(const OSString *aString)
 
        if (me && !me->initWithString(aString)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -178,7 +178,7 @@ OSString::withCString(const char *cString)
 
        if (me && !me->initWithCString(cString)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -191,7 +191,7 @@ OSString::withCStringNoCopy(const char *cString)
 
        if (me && !me->initWithCStringNoCopy(cString)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
@@ -204,7 +204,7 @@ OSString::withStringOfLength(const char *cString, size_t length)
 
        if (me && !me->initWithStringOfLength(cString, length)) {
                me->release();
-               return 0;
+               return NULL;
        }
 
        return me;
diff --git a/libkern/c++/OSSymbol.cpp b/libkern/c++/OSSymbol.cpp
index 455ea10d648b4f8b7cff7d7cf3720c829036c0da..695fda33a432c40676b5363908add6a5f228351d 100644 (file)
@@ -188,7 +188,7 @@ OSSymbolPool::init()
 
        poolGate = lck_rw_alloc_init(IOLockGroup, LCK_ATTR_NULL);
 
-       return poolGate != 0;
+       return poolGate != NULL;
 }
 
 OSSymbolPool::OSSymbolPool(const OSSymbolPool *old)
@@ -197,7 +197,7 @@ OSSymbolPool::OSSymbolPool(const OSSymbolPool *old)
        nBuckets = old->nBuckets;
        buckets = old->buckets;
 
-       poolGate = 0;   // Do not duplicate the poolGate
+       poolGate = NULL; // Do not duplicate the poolGate
 }
 
 OSSymbolPool::~OSSymbolPool()
@@ -250,7 +250,7 @@ OSSymbolPool::nextHashState(OSSymbolPoolState *stateP)
 
        while (!stateP->j) {
                if (!stateP->i) {
-                       return 0;
+                       return NULL;
                }
                stateP->i--;
                thisBucket--;
@@ -319,7 +319,7 @@ OSSymbolPool::findSymbol(const char *cString) const
        j = thisBucket->count;
 
        if (!j) {
-               return 0;
+               return NULL;
        }
 
        if (j == 1) {
@@ -330,7 +330,7 @@ OSSymbolPool::findSymbol(const char *cString) const
                    && probeSymbol->taggedTryRetain(nullptr)) {
                        return probeSymbol;
                }
-               return 0;
+               return NULL;
        }
 
        for (list = thisBucket->symbolP; j--; list++) {
@@ -342,7 +342,7 @@ OSSymbolPool::findSymbol(const char *cString) const
                }
        }
 
-       return 0;
+       return NULL;
 }
 
 OSSymbol *
@@ -432,7 +432,7 @@ OSSymbolPool::removeSymbol(OSSymbol *sym)
                probeSymbol = (OSSymbol *) list;
 
                if (probeSymbol == sym) {
-                       thisBucket->symbolP = 0;
+                       thisBucket->symbolP = NULL;
                        count--;
                        thisBucket->count--;
                        SHRINK_POOL();
diff --git a/libkern/c++/OSUnserialize.cpp b/libkern/c++/OSUnserialize.cpp
index 86f396784c8df94d7ef65f3ea71db68f4d42a68a..d3189324e5b4481f390c4c9b033ad0a4485ba604 100644 (file)
@@ -292,7 +292,7 @@ yylex()
 
                /* copy to null terminated buffer */
                tempString = (char *)malloc(length + 1);
-               if (tempString == 0) {
+               if (tempString == NULL) {
                        printf("OSUnserialize: can't alloc temp memory\n");
                        return 0;
                }
@@ -320,7 +320,7 @@ yylex()
                (void)nextChar();
                /* copy to null terminated buffer */
                tempString = (char *)malloc(length + 1);
-               if (tempString == 0) {
+               if (tempString == NULL) {
                        printf("OSUnserialize: can't alloc temp memory\n");
                        return 0;
                }
@@ -626,9 +626,9 @@ OSUnserialize(const char *buffer, OSString **errorString)
        tags = OSDictionary::withCapacity(128);
        if (yyparse() == 0) {
                object = parsedObject;
-               if (errorString) *errorString = 0;
+               if (errorString) *errorString = NULL;
        } else {
-               object = 0;
+               object = NULL;
                if (errorString)
                        *errorString = OSString::withCString(yyerror_message);
        }
diff --git a/libkern/c++/OSUnserializeXML.cpp b/libkern/c++/OSUnserializeXML.cpp
index 2c53ef4cc37cb38d053aeb3cc22d3bbfd79d40ee..3382460427d3795863f8693141e713be4b4994cb 100644 (file)
@@ -258,7 +258,7 @@ typedef int YYSTYPE;
 
 
 /* Line 216 of yacc.c.  */
-#line 215 "OSUnserializeXML.tab.c"
+#line 212 "OSUnserializeXML.tab.c"
 
 #ifdef short
 # undef short
@@ -549,10 +549,10 @@ static const yytype_int8 yyrhs[] =
 /* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
 static const yytype_uint16 yyrline[] =
 {
-       0, 149, 149, 152, 157, 162, 174, 186, 198, 210,
-       222, 234, 246, 265, 268, 271, 274, 275, 290, 299,
-       311, 314, 317, 320, 323, 326, 329, 332, 339, 342,
-       345, 348, 351
+       0, 146, 146, 149, 154, 159, 171, 183, 195, 207,
+       219, 231, 243, 267, 270, 273, 276, 277, 292, 301,
+       313, 316, 319, 322, 325, 328, 331, 334, 341, 344,
+       347, 350, 353
 };
 #endif
 
@@ -933,7 +933,7 @@ int yydebug;
 
 /* YYINITDEPTH -- initial size of the parser's stacks.  */
 #ifndef YYINITDEPTH
-# define YYINITDEPTH 64
+# define YYINITDEPTH 200
 #endif
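Raising YYINITDEPTH from 64 to 200 gives the generated parser a larger initial stack, so legitimately deep plist nesting no longer forces an early stack reallocation; the YYMAXDEPTH ceiling below is unchanged.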
 
 /* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
@@ -1495,14 +1495,14 @@ yyreduce:
        YY_REDUCE_PRINT(yyn);
        switch (yyn) {
        case 2:
-#line 149 "OSUnserializeXML.y"
+#line 146 "OSUnserializeXML.y"
                { yyerror("unexpected end of buffer");
                  YYERROR;
                  ;}
                break;
 
        case 3:
-#line 152 "OSUnserializeXML.y"
+#line 149 "OSUnserializeXML.y"
                { STATE->parsedObject = (yyvsp[(1) - (1)])->object;
                  (yyvsp[(1) - (1)])->object = 0;
                  freeObject(STATE, (yyvsp[(1) - (1)]));
@@ -1511,14 +1511,14 @@ yyreduce:
                break;
 
        case 4:
-#line 157 "OSUnserializeXML.y"
+#line 154 "OSUnserializeXML.y"
                { yyerror("syntax error");
                  YYERROR;
                  ;}
                break;
 
        case 5:
-#line 162 "OSUnserializeXML.y"
+#line 159 "OSUnserializeXML.y"
                { (yyval) = buildDictionary(STATE, (yyvsp[(1) - (1)]));
 
                  if (!yyval->object) {
@@ -1534,7 +1534,7 @@ yyreduce:
                break;
 
        case 6:
-#line 174 "OSUnserializeXML.y"
+#line 171 "OSUnserializeXML.y"
                { (yyval) = buildArray(STATE, (yyvsp[(1) - (1)]));
 
                  if (!yyval->object) {
@@ -1550,7 +1550,7 @@ yyreduce:
                break;
 
        case 7:
-#line 186 "OSUnserializeXML.y"
+#line 183 "OSUnserializeXML.y"
                { (yyval) = buildSet(STATE, (yyvsp[(1) - (1)]));
 
                  if (!yyval->object) {
@@ -1566,7 +1566,7 @@ yyreduce:
                break;
 
        case 8:
-#line 198 "OSUnserializeXML.y"
+#line 195 "OSUnserializeXML.y"
                { (yyval) = buildString(STATE, (yyvsp[(1) - (1)]));
 
                  if (!yyval->object) {
@@ -1582,7 +1582,7 @@ yyreduce:
                break;
 
        case 9:
-#line 210 "OSUnserializeXML.y"
+#line 207 "OSUnserializeXML.y"
                { (yyval) = buildData(STATE, (yyvsp[(1) - (1)]));
 
                  if (!yyval->object) {
@@ -1598,7 +1598,7 @@ yyreduce:
                break;
 
        case 10:
-#line 222 "OSUnserializeXML.y"
+#line 219 "OSUnserializeXML.y"
                { (yyval) = buildNumber(STATE, (yyvsp[(1) - (1)]));
 
                  if (!yyval->object) {
@@ -1614,7 +1614,7 @@ yyreduce:
                break;
 
        case 11:
-#line 234 "OSUnserializeXML.y"
+#line 231 "OSUnserializeXML.y"
                { (yyval) = buildBoolean(STATE, (yyvsp[(1) - (1)]));
 
                  if (!yyval->object) {
@@ -1630,7 +1630,7 @@ yyreduce:
                break;
 
        case 12:
-#line 246 "OSUnserializeXML.y"
+#line 243 "OSUnserializeXML.y"
                { (yyval) = retrieveObject(STATE, (yyvsp[(1) - (1)])->idref);
                  if ((yyval)) {
                          STATE->retrievedObjectCount++;
@@ -1654,21 +1654,21 @@ yyreduce:
                break;
 
        case 13:
-#line 265 "OSUnserializeXML.y"
+#line 267 "OSUnserializeXML.y"
                { (yyval) = (yyvsp[(1) - (2)]);
                  (yyval)->elements = NULL;
                  ;}
                break;
 
        case 14:
-#line 268 "OSUnserializeXML.y"
+#line 270 "OSUnserializeXML.y"
                { (yyval) = (yyvsp[(1) - (3)]);
                  (yyval)->elements = (yyvsp[(2) - (3)]);
                  ;}
                break;
 
        case 17:
-#line 275 "OSUnserializeXML.y"
+#line 277 "OSUnserializeXML.y"
                { (yyval) = (yyvsp[(2) - (2)]);
                  (yyval)->next = (yyvsp[(1) - (2)]);
 
@@ -1685,7 +1685,7 @@ yyreduce:
                break;
 
        case 18:
-#line 290 "OSUnserializeXML.y"
+#line 292 "OSUnserializeXML.y"
                { (yyval) = (yyvsp[(1) - (2)]);
                  (yyval)->key = (OSSymbol *)(yyval)->object;
                  (yyval)->object = (yyvsp[(2) - (2)])->object;
@@ -1696,7 +1696,7 @@ yyreduce:
                break;
 
        case 19:
-#line 299 "OSUnserializeXML.y"
+#line 301 "OSUnserializeXML.y"
                { (yyval) = buildSymbol(STATE, (yyvsp[(1) - (1)]));
 
 //                               STATE->parsedObjectCount++;
@@ -1708,42 +1708,42 @@ yyreduce:
                break;
 
        case 20:
-#line 311 "OSUnserializeXML.y"
+#line 313 "OSUnserializeXML.y"
                { (yyval) = (yyvsp[(1) - (2)]);
                  (yyval)->elements = NULL;
                  ;}
                break;
 
        case 21:
-#line 314 "OSUnserializeXML.y"
+#line 316 "OSUnserializeXML.y"
                { (yyval) = (yyvsp[(1) - (3)]);
                  (yyval)->elements = (yyvsp[(2) - (3)]);
                  ;}
                break;
 
        case 23:
-#line 320 "OSUnserializeXML.y"
+#line 322 "OSUnserializeXML.y"
                { (yyval) = (yyvsp[(1) - (2)]);
                  (yyval)->elements = NULL;
                  ;}
                break;
 
        case 24:
-#line 323 "OSUnserializeXML.y"
+#line 325 "OSUnserializeXML.y"
                { (yyval) = (yyvsp[(1) - (3)]);
                  (yyval)->elements = (yyvsp[(2) - (3)]);
                  ;}
                break;
 
        case 26:
-#line 329 "OSUnserializeXML.y"
+#line 331 "OSUnserializeXML.y"
                { (yyval) = (yyvsp[(1) - (1)]);
                  (yyval)->next = NULL;
                  ;}
                break;
 
        case 27:
-#line 332 "OSUnserializeXML.y"
+#line 334 "OSUnserializeXML.y"
                { (yyval) = (yyvsp[(2) - (2)]);
                  (yyval)->next = (yyvsp[(1) - (2)]);
                  ;}
@@ -1751,7 +1751,7 @@ yyreduce:
 
 
 /* Line 1267 of yacc.c.  */
-#line 1699 "OSUnserializeXML.tab.c"
+#line 1701 "OSUnserializeXML.tab.c"
        default: break;
        }
        YY_SYMBOL_PRINT("-> $$ =", yyr1[yyn], &yyval, &yyloc);
@@ -1963,7 +1963,7 @@ yyreturn:
 }
 
 
-#line 354 "OSUnserializeXML.y"
+#line 356 "OSUnserializeXML.y"
 
 
 int
@@ -2187,7 +2187,7 @@ getString(parser_state_t *state)
 
        /* copy to null terminated buffer */
        tempString = (char *)malloc(length + 1);
-       if (tempString == 0) {
+       if (tempString == NULL) {
                printf("OSUnserializeXML: can't alloc temp memory\n");
                goto error;
        }
@@ -2324,7 +2324,8 @@ static const signed char __CFPLDataDecodeTable[128] = {
 static void *
 getCFEncodedData(parser_state_t *state, unsigned int *size)
 {
-       int numeq = 0, acc = 0, cntr = 0;
+       int numeq = 0, cntr = 0;
+       unsigned int acc = 0;
        int tmpbufpos = 0, tmpbuflen = 0;
        unsigned char *tmpbuf = (unsigned char *)malloc(DATA_ALLOC_SIZE);
 
@@ -2865,7 +2866,7 @@ buildSymbol(parser_state_t *state, object_t *o)
 {
        OSSymbol *symbol;
 
-       symbol = const_cast<OSSymbol *>(OSSymbol::withCString(o->string));
+       symbol = const_cast < OSSymbol * > (OSSymbol::withCString(o->string));
        if (o->idref >= 0) {
                rememberObject(state, o->idref, symbol);
        }
@@ -2972,7 +2973,8 @@ OSUnserializeXML(const char *buffer, size_t bufferSize, OSString **errorString)
                return 0;
        }
 
-       if (!strcmp(kOSSerializeBinarySignature, buffer)) {
+       if (!strcmp(kOSSerializeBinarySignature, buffer)
+           || (kOSSerializeIndexedBinarySignature == (uint8_t)buffer[0])) {
                return OSUnserializeBinary(buffer, bufferSize, errorString);
        }
 
diff --git a/libkern/c++/OSUnserializeXML.y b/libkern/c++/OSUnserializeXML.y
index 4f1c3cc97e1861fdde4b5534529738c4294c7077..1769fb6317a949639779f89082bbb11f814164ea 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 1999-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
@@ -55,7 +55,7 @@
 //
 //
 
-     
+
 %pure_parser
 
 %{
 #include <libkern/c++/OSContainers.h>
 #include <libkern/c++/OSLib.h>
 
-#define MAX_OBJECTS    65535
+#define MAX_OBJECTS              131071
+#define MAX_REFED_OBJECTS        65535
 
 #define YYSTYPE object_t *
-#define YYPARSE_PARAM  state
-#define YYLEX_PARAM    (parser_state_t *)state
+#define YYPARSE_PARAM   state
+#define YYLEX_PARAM     (parser_state_t *)state
 
 // this is the internal struct used to hold objects on parser stack
 // it represents objects both before and after they have been created
-typedef        struct object {
-       struct object   *next;
-       struct object   *free;
-       struct object   *elements;
-       OSObject        *object;
-       OSSymbol        *key;                   // for dictionary
-       int             size;
-       void            *data;                  // for data
-       char            *string;                // for string & symbol
-       long long       number;                 // for number
-       int             idref;
+typedef struct object {
+       struct object   *next;
+       struct object   *free;
+       struct object   *elements;
+       OSObject        *object;
+       OSSymbol        *key;                   // for dictionary
+       int             size;
+       void            *data;                  // for data
+       char            *string;                // for string & symbol
+       long long       number;                 // for number
+       int             idref;
 } object_t;
 
 // this code is reentrant, this structure contains all
 // state information for the parsing of a single buffer
 typedef struct parser_state {
-       const char      *parseBuffer;           // start of text to be parsed
-       int             parseBufferIndex;       // current index into text
-       int             lineNumber;             // current line number
-       object_t        *objects;               // internal objects in use
-       object_t        *freeObjects;           // internal objects that are free
-       OSDictionary    *tags;                  // used to remember "ID" tags
-       OSString        **errorString;          // parse error with line
-       OSObject        *parsedObject;          // resultant object of parsed text
-       int             parsedObjectCount;
+       const char      *parseBuffer;           // start of text to be parsed
+       int             parseBufferIndex;       // current index into text
+       int             lineNumber;             // current line number
+       object_t        *objects;               // internal objects in use
+       object_t        *freeObjects;           // internal objects that are free
+       OSDictionary    *tags;                  // used to remember "ID" tags
+       OSString        **errorString;          // parse error with line
+       OSObject        *parsedObject;          // resultant object of parsed text
+       int             parsedObjectCount;
+       int             retrievedObjectCount;
 } parser_state_t;
 
-#define STATE          ((parser_state_t *)state)
+#define STATE           ((parser_state_t *)state)
 
-#undef yyerror         
-#define yyerror(s)     OSUnserializeerror(STATE, (s))
-static int             OSUnserializeerror(parser_state_t *state, const char *s);
+#undef yyerror
+#define yyerror(s)      OSUnserializeerror(STATE, (s))
+static int              OSUnserializeerror(parser_state_t *state, const char *s);
 
-static int             yylex(YYSTYPE *lvalp, parser_state_t *state);
+static int              yylex(YYSTYPE *lvalp, parser_state_t *state);
 
-static object_t        *newObject(parser_state_t *state);
-static void            freeObject(parser_state_t *state, object_t *o);
-static void            rememberObject(parser_state_t *state, int tag, OSObject *o);
-static object_t                *retrieveObject(parser_state_t *state, int tag);
-static void            cleanupObjects(parser_state_t *state);
+static object_t         *newObject(parser_state_t *state);
+static void             freeObject(parser_state_t *state, object_t *o);
+static void             rememberObject(parser_state_t *state, int tag, OSObject *o);
+static object_t         *retrieveObject(parser_state_t *state, int tag);
+static void             cleanupObjects(parser_state_t *state);
 
-static object_t                *buildDictionary(parser_state_t *state, object_t *o);
-static object_t                *buildArray(parser_state_t *state, object_t *o);
-static object_t                *buildSet(parser_state_t *state, object_t *o);
-static object_t                *buildString(parser_state_t *state, object_t *o);
-static object_t                *buildSymbol(parser_state_t *state, object_t *o);
-static object_t                *buildData(parser_state_t *state, object_t *o);
-static object_t                *buildNumber(parser_state_t *state, object_t *o);
-static object_t                *buildBoolean(parser_state_t *state, object_t *o);
+static object_t         *buildDictionary(parser_state_t *state, object_t *o);
+static object_t         *buildArray(parser_state_t *state, object_t *o);
+static object_t         *buildSet(parser_state_t *state, object_t *o);
+static object_t         *buildString(parser_state_t *state, object_t *o);
+static object_t         *buildSymbol(parser_state_t *state, object_t *o);
+static object_t         *buildData(parser_state_t *state, object_t *o);
+static object_t         *buildNumber(parser_state_t *state, object_t *o);
+static object_t         *buildBoolean(parser_state_t *state, object_t *o);
 
 #include <libkern/OSRuntime.h>
 
@@ -138,7 +140,7 @@ static object_t             *buildBoolean(parser_state_t *state, object_t *o);
 %token NUMBER
 %token SET
 %token STRING
-%token SYNTAX_ERROR     
+%token SYNTAX_ERROR
 %% /* Grammar rules and actions follow */
 
 input:   /* empty */           { yyerror("unexpected end of buffer");
@@ -240,8 +242,13 @@ object:      dict                  { $$ = buildDictionary(STATE, $1);
                                }
        | idref                 { $$ = retrieveObject(STATE, $1->idref);
                                  if ($$) {
+                                   STATE->retrievedObjectCount++;
                                    $$->object->retain();
-                                 } else { 
+                                   if (STATE->retrievedObjectCount > MAX_REFED_OBJECTS) {
+                                     yyerror("maximum object reference count");
+                                     YYERROR;
+                                   }
+                                 } else {
                                    yyerror("forward reference detected");
                                    YYERROR;
                                  }
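
The hunk above pairs with the new retrievedObjectCount field: every successfully resolved IDREF bumps the counter, and once it passes MAX_REFED_OBJECTS the parse is aborted through yyerror()/YYERROR, bounding the work a hostile plist can trigger with back-references. The limit's actual value is defined earlier in the file and is not shown in this diff; the sketch below uses a labeled placeholder.

    /* sketch of the back-reference cap; the limit value here is a placeholder,
     * not the real MAX_REFED_OBJECTS from the parser source */
    #define MAX_REFED_OBJECTS_PLACEHOLDER 65535

    state->retrievedObjectCount++;
    if (state->retrievedObjectCount > MAX_REFED_OBJECTS_PLACEHOLDER) {
            yyerror("maximum object reference count");
            YYERROR;   /* bison macro: fail the current parse */
    }
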
@@ -285,7 +292,7 @@ pairs:        pair
 pair:    key object            { $$ = $1;
                                  $$->key = (OSSymbol *)$$->object;
                                  $$->object = $2->object;
-                                 $$->next = NULL; 
+                                 $$->next = NULL;
                                  $2->object = 0;
                                  freeObject(STATE, $2);
                                }
@@ -321,8 +328,8 @@ set:          '[' ']'               { $$ = $1;
        | SET
        ;
 
-elements: object               { $$ = $1; 
-                                 $$->next = NULL; 
+elements: object               { $$ = $1;
+                                 $$->next = NULL;
                                }
        | elements object       { $$ = $2;
                                  $$->next = $1;
@@ -351,40 +358,40 @@ string:     STRING
 int
 OSUnserializeerror(parser_state_t * state, const char *s)  /* Called by yyparse on errors */
 {
-    if (state->errorString) {
-       char tempString[128];
-       snprintf(tempString, 128, "OSUnserializeXML: %s near line %d\n", s, state->lineNumber);
-       *(state->errorString) = OSString::withCString(tempString);
-    }
-    
-    return 0;
+       if (state->errorString) {
+               char tempString[128];
+               snprintf(tempString, 128, "OSUnserializeXML: %s near line %d\n", s, state->lineNumber);
+               *(state->errorString) = OSString::withCString(tempString);
+       }
+
+       return 0;
 }
 
-#define TAG_MAX_LENGTH         32
-#define TAG_MAX_ATTRIBUTES     32
-#define TAG_BAD                        0
-#define TAG_START              1
-#define TAG_END                        2
-#define TAG_EMPTY              3
-#define TAG_IGNORE             4
-
-#define currentChar()  (state->parseBuffer[state->parseBufferIndex])
-#define nextChar()     (state->parseBuffer[++state->parseBufferIndex])
-#define prevChar()     (state->parseBuffer[state->parseBufferIndex - 1])
-
-#define isSpace(c)     ((c) == ' ' || (c) == '\t')
-#define isAlpha(c)     (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z'))
-#define isDigit(c)     ((c) >= '0' && (c) <= '9')
-#define isAlphaDigit(c)        ((c) >= 'a' && (c) <= 'f')
-#define isHexDigit(c)  (isDigit(c) || isAlphaDigit(c))
-#define isAlphaNumeric(c) (isAlpha(c) || isDigit(c) || ((c) == '-')) 
+#define TAG_MAX_LENGTH          32
+#define TAG_MAX_ATTRIBUTES      32
+#define TAG_BAD                 0
+#define TAG_START               1
+#define TAG_END                 2
+#define TAG_EMPTY               3
+#define TAG_IGNORE              4
+
+#define currentChar()   (state->parseBuffer[state->parseBufferIndex])
+#define nextChar()      (state->parseBuffer[++state->parseBufferIndex])
+#define prevChar()      (state->parseBuffer[state->parseBufferIndex - 1])
+
+#define isSpace(c)      ((c) == ' ' || (c) == '\t')
+#define isAlpha(c)      (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z'))
+#define isDigit(c)      ((c) >= '0' && (c) <= '9')
+#define isAlphaDigit(c) ((c) >= 'a' && (c) <= 'f')
+#define isHexDigit(c)   (isDigit(c) || isAlphaDigit(c))
+#define isAlphaNumeric(c) (isAlpha(c) || isDigit(c) || ((c) == '-'))
 
 static int
 getTag(parser_state_t *state,
-       char tag[TAG_MAX_LENGTH],
-       int *attributeCount, 
-       char attributes[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH],
-       char values[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH] )
+    char tag[TAG_MAX_LENGTH],
+    int *attributeCount,
+    char attributes[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH],
+    char values[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH] )
 {
        int length = 0;
        int c = currentChar();
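
One behavior to keep in mind while reading the reflowed character classes above: isAlphaDigit() matches only lowercase 'a'-'f', so isHexDigit() accepts 0-9 plus lowercase hex and nothing else. Uppercase hex in an <integer> body stops the number scan, and in a format="hex" <data> payload it errors out. A tiny check under exactly these macro definitions:

    /* sketch: the scanner's hex class is lowercase-only */
    int lower_ok = isHexDigit('f');   /* 1 */
    int upper_ok = isHexDigit('F');   /* 0: 'F' is outside isAlphaDigit() */
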
@@ -392,116 +399,151 @@ getTag(parser_state_t *state,
 
        *attributeCount = 0;
 
-       if (c != '<') return TAG_BAD;
-        c = nextChar();                // skip '<'
+       if (c != '<') {
+               return TAG_BAD;
+       }
+       c = nextChar();         // skip '<'
 
 
        // <!TAG   declarations     >
        // <!--     comments      -->
-        if (c == '!') {
-           c = nextChar();  
-           bool isComment = (c == '-') && ((c = nextChar()) != 0) && (c == '-');
-           if (!isComment && !isAlpha(c)) return TAG_BAD;   // <!1, <!-A, <!eos
-
-           while (c && (c = nextChar()) != 0) {
-               if (c == '\n') state->lineNumber++;
-               if (isComment) {
-                   if (c != '-') continue;
-                   c = nextChar();
-                   if (c != '-') continue;
-                   c = nextChar();
+       if (c == '!') {
+               c = nextChar();
+               bool isComment = (c == '-') && ((c = nextChar()) != 0) && (c == '-');
+               if (!isComment && !isAlpha(c)) {
+                       return TAG_BAD;                      // <!1, <!-A, <!eos
                }
-               if (c == '>') {
-                   (void)nextChar();
-                   return TAG_IGNORE;
+               while (c && (c = nextChar()) != 0) {
+                       if (c == '\n') {
+                               state->lineNumber++;
+                       }
+                       if (isComment) {
+                               if (c != '-') {
+                                       continue;
+                               }
+                               c = nextChar();
+                               if (c != '-') {
+                                       continue;
+                               }
+                               c = nextChar();
+                       }
+                       if (c == '>') {
+                               (void)nextChar();
+                               return TAG_IGNORE;
+                       }
+                       if (isComment) {
+                               break;
+                       }
                }
-               if (isComment) break;
-           }
-           return TAG_BAD;
-       }
-
-       else
-
+               return TAG_BAD;
+       } else
        // <? Processing Instructions  ?>
-        if (c == '?') {
-           while ((c = nextChar()) != 0) {
-               if (c == '\n') state->lineNumber++;
-               if (c != '?') continue;
-               c = nextChar();
-               if (!c) return TAG_IGNORE;
-               if (c == '>') {
-                   (void)nextChar();
-                   return TAG_IGNORE;
+       if (c == '?') {
+               while ((c = nextChar()) != 0) {
+                       if (c == '\n') {
+                               state->lineNumber++;
+                       }
+                       if (c != '?') {
+                               continue;
+                       }
+                       c = nextChar();
+                       if (!c) {
+                               return TAG_IGNORE;
+                       }
+                       if (c == '>') {
+                               (void)nextChar();
+                               return TAG_IGNORE;
+                       }
                }
-           }
-           return TAG_BAD;
-       }
-
-       else
-
-       // </ end tag >    
+               return TAG_BAD;
+       } else
+       // </ end tag >
        if (c == '/') {
-               c = nextChar();         // skip '/'
+               c = nextChar();         // skip '/'
                tagType = TAG_END;
        }
-        if (!isAlpha(c)) return TAG_BAD;
+       if (!isAlpha(c)) {
+               return TAG_BAD;
+       }
 
        /* find end of tag while copying it */
        while (isAlphaNumeric(c)) {
                tag[length++] = c;
                c = nextChar();
-               if (length >= (TAG_MAX_LENGTH - 1)) return TAG_BAD;
+               if (length >= (TAG_MAX_LENGTH - 1)) {
+                       return TAG_BAD;
+               }
        }
 
        tag[length] = 0;
 
 //     printf("tag %s, type %d\n", tag, tagType);
-       
+
        // look for attributes of the form attribute = "value" ...
        while ((c != '>') && (c != '/')) {
-               while (isSpace(c)) c = nextChar();
+               while (isSpace(c)) {
+                       c = nextChar();
+               }
 
                length = 0;
                while (isAlphaNumeric(c)) {
                        attributes[*attributeCount][length++] = c;
-                       if (length >= (TAG_MAX_LENGTH - 1)) return TAG_BAD;
+                       if (length >= (TAG_MAX_LENGTH - 1)) {
+                               return TAG_BAD;
+                       }
                        c = nextChar();
                }
                attributes[*attributeCount][length] = 0;
 
-               while (isSpace(c)) c = nextChar();
-               
-               if (c != '=') return TAG_BAD;
+               while (isSpace(c)) {
+                       c = nextChar();
+               }
+
+               if (c != '=') {
+                       return TAG_BAD;
+               }
                c = nextChar();
-               
-               while (isSpace(c)) c = nextChar();
 
-               if (c != '"') return TAG_BAD;
+               while (isSpace(c)) {
+                       c = nextChar();
+               }
+
+               if (c != '"') {
+                       return TAG_BAD;
+               }
                c = nextChar();
                length = 0;
                while (c != '"') {
                        values[*attributeCount][length++] = c;
-                       if (length >= (TAG_MAX_LENGTH - 1)) return TAG_BAD;
+                       if (length >= (TAG_MAX_LENGTH - 1)) {
+                               return TAG_BAD;
+                       }
                        c = nextChar();
-                       if (!c) return TAG_BAD;
+                       if (!c) {
+                               return TAG_BAD;
+                       }
                }
                values[*attributeCount][length] = 0;
 
                c = nextChar(); // skip closing quote
 
-//             printf("        attribute '%s' = '%s', nextchar = '%c'\n", 
+//             printf("        attribute '%s' = '%s', nextchar = '%c'\n",
 //                    attributes[*attributeCount], values[*attributeCount], c);
 
                (*attributeCount)++;
-               if (*attributeCount >= TAG_MAX_ATTRIBUTES) return TAG_BAD;
+               if (*attributeCount >= TAG_MAX_ATTRIBUTES) {
+                       return TAG_BAD;
+               }
        }
 
        if (c == '/') {
-               c = nextChar();         // skip '/'
+               c = nextChar();         // skip '/'
                tagType = TAG_EMPTY;
        }
-       if (c != '>') return TAG_BAD;
-       c = nextChar();         // skip '>'
+       if (c != '>') {
+               return TAG_BAD;
+       }
+       c = nextChar();         // skip '>'
 
        return tagType;
 }
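
getTag() above is the entire XML scanner: it classifies the next construct as TAG_START, TAG_END (</tag>), TAG_EMPTY (<tag/>), or TAG_IGNORE (<!...> declarations, <!-- comments -->, and <?...?> processing instructions), and returns TAG_BAD for anything malformed or oversized. All copies are bounded by TAG_MAX_LENGTH and TAG_MAX_ATTRIBUTES, so callers can use fixed-size stack buffers, as yylex() does further down:

    /* sketch: how the lexer drives getTag(); buffer sizes come from the limits above */
    char tag[TAG_MAX_LENGTH];
    char attributes[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH];
    char values[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH];
    int attributeCount = 0;

    int tagType = getTag(state, tag, &attributeCount, attributes, values);
    if (tagType == TAG_BAD) {
            /* the lexer turns this into SYNTAX_ERROR */
    }
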
@@ -517,20 +559,24 @@ getString(parser_state_t *state)
        /* find end of string */
 
        while (c != 0) {
-               if (c == '\n') state->lineNumber++;
+               if (c == '\n') {
+                       state->lineNumber++;
+               }
                if (c == '<') {
                        break;
                }
                c = nextChar();
        }
 
-       if (c != '<') return 0;
+       if (c != '<') {
+               return 0;
+       }
 
        length = state->parseBufferIndex - start;
 
        /* copy to null terminated buffer */
        tempString = (char *)malloc(length + 1);
-       if (tempString == 0) {
+       if (tempString == NULL) {
                printf("OSUnserializeXML: can't alloc temp memory\n");
                goto error;
        }
@@ -544,30 +590,48 @@ getString(parser_state_t *state)
                if (c != '&') {
                        tempString[j++] = c;
                } else {
-                       if ((i+3) > length) goto error;
+                       if ((i + 3) > length) {
+                               goto error;
+                       }
                        c = state->parseBuffer[start + i++];
                        if (c == 'l') {
-                               if (state->parseBuffer[start + i++] != 't') goto error;
-                               if (state->parseBuffer[start + i++] != ';') goto error;
+                               if (state->parseBuffer[start + i++] != 't') {
+                                       goto error;
+                               }
+                               if (state->parseBuffer[start + i++] != ';') {
+                                       goto error;
+                               }
                                tempString[j++] = '<';
                                continue;
-                       }       
+                       }
                        if (c == 'g') {
-                               if (state->parseBuffer[start + i++] != 't') goto error;
-                               if (state->parseBuffer[start + i++] != ';') goto error;
+                               if (state->parseBuffer[start + i++] != 't') {
+                                       goto error;
+                               }
+                               if (state->parseBuffer[start + i++] != ';') {
+                                       goto error;
+                               }
                                tempString[j++] = '>';
                                continue;
-                       }       
-                       if ((i+3) > length) goto error;
+                       }
+                       if ((i + 3) > length) {
+                               goto error;
+                       }
                        if (c == 'a') {
-                               if (state->parseBuffer[start + i++] != 'm') goto error;
-                               if (state->parseBuffer[start + i++] != 'p') goto error;
-                               if (state->parseBuffer[start + i++] != ';') goto error;
+                               if (state->parseBuffer[start + i++] != 'm') {
+                                       goto error;
+                               }
+                               if (state->parseBuffer[start + i++] != 'p') {
+                                       goto error;
+                               }
+                               if (state->parseBuffer[start + i++] != ';') {
+                                       goto error;
+                               }
                                tempString[j++] = '&';
                                continue;
                        }
                        goto error;
-               }       
+               }
        }
        tempString[j] = 0;
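
The unescape loop recognizes exactly three entities: &lt;, &gt;, and &amp;. Anything else following an '&' (&quot;, &apos;, numeric references like &#38;) takes the goto error path, and the (i + 3) > length checks ensure the shortest possible entity tail still fits before its bytes are consumed. For instance:

    /* sketch: only these three escapes round-trip through getString() */
    const char *in = "a &lt;= b &amp;&amp; b &gt;= c";
    /* decoded result: "a <= b && b >= c"               */
    /* "&quot;" or "&#38;" would instead hit goto error */
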
 
@@ -576,7 +640,9 @@ getString(parser_state_t *state)
        return tempString;
 
 error:
-       if (tempString) free(tempString);
+       if (tempString) {
+               free(tempString);
+       }
        return 0;
 }
 
@@ -600,7 +666,7 @@ getNumber(parser_state_t *state)
                        negate = true;
                        c = nextChar();
                }
-               while(isDigit(c)) {
+               while (isDigit(c)) {
                        n = (n * base + c - '0');
                        c = nextChar();
                }
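
getNumber() splits into two loops, only partially visible across these hunks: a decimal branch with an optional leading '-', where negation is applied afterwards by casting through long long, and a hex branch gated on isHexDigit(), i.e. lowercase digits only. A sketch of the decimal shape, assuming base is 10 on this path (the variable is set outside the hunk):

    /* sketch: accumulate-then-negate, as in getNumber()'s decimal branch */
    unsigned long long n = 0;
    bool negate = false;
    if (c == '-') {
            negate = true;
            c = nextChar();
    }
    while (isDigit(c)) {
            n = n * 10 + (unsigned long long)(c - '0');   /* base == 10 assumed */
            c = nextChar();
    }
    if (negate) {
            n = (unsigned long long)((long long)n * (long long)-1);
    }
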
@@ -608,7 +674,7 @@ getNumber(parser_state_t *state)
                        n = (unsigned long long)((long long)n * (long long)-1);
                }
        } else {
-               while(isHexDigit(c)) {
+               while (isHexDigit(c)) {
                        if (isDigit(c)) {
                                n = (n * base + c - '0');
                        } else {
@@ -624,22 +690,22 @@ getNumber(parser_state_t *state)
 // taken from CFXMLParsing/CFPropertyList.c
 
 static const signed char __CFPLDataDecodeTable[128] = {
-    /* 000 */ -1, -1, -1, -1, -1, -1, -1, -1,
-    /* 010 */ -1, -1, -1, -1, -1, -1, -1, -1,
-    /* 020 */ -1, -1, -1, -1, -1, -1, -1, -1,
-    /* 030 */ -1, -1, -1, -1, -1, -1, -1, -1,
-    /* ' ' */ -1, -1, -1, -1, -1, -1, -1, -1,
-    /* '(' */ -1, -1, -1, 62, -1, -1, -1, 63,
-    /* '0' */ 52, 53, 54, 55, 56, 57, 58, 59,
-    /* '8' */ 60, 61, -1, -1, -1,  0, -1, -1,
-    /* '@' */ -1,  0,  1,  2,  3,  4,  5,  6,
-    /* 'H' */  7,  8,  9, 10, 11, 12, 13, 14,
-    /* 'P' */ 15, 16, 17, 18, 19, 20, 21, 22,
-    /* 'X' */ 23, 24, 25, -1, -1, -1, -1, -1,
-    /* '`' */ -1, 26, 27, 28, 29, 30, 31, 32,
-    /* 'h' */ 33, 34, 35, 36, 37, 38, 39, 40,
-    /* 'p' */ 41, 42, 43, 44, 45, 46, 47, 48,
-    /* 'x' */ 49, 50, 51, -1, -1, -1, -1, -1
+       /* 000 */ -1, -1, -1, -1, -1, -1, -1, -1,
+       /* 010 */ -1, -1, -1, -1, -1, -1, -1, -1,
+       /* 020 */ -1, -1, -1, -1, -1, -1, -1, -1,
+       /* 030 */ -1, -1, -1, -1, -1, -1, -1, -1,
+       /* ' ' */ -1, -1, -1, -1, -1, -1, -1, -1,
+       /* '(' */ -1, -1, -1, 62, -1, -1, -1, 63,
+       /* '0' */ 52, 53, 54, 55, 56, 57, 58, 59,
+       /* '8' */ 60, 61, -1, -1, -1, 0, -1, -1,
+       /* '@' */ -1, 0, 1, 2, 3, 4, 5, 6,
+       /* 'H' */ 7, 8, 9, 10, 11, 12, 13, 14,
+       /* 'P' */ 15, 16, 17, 18, 19, 20, 21, 22,
+       /* 'X' */ 23, 24, 25, -1, -1, -1, -1, -1,
+       /* '`' */ -1, 26, 27, 28, 29, 30, 31, 32,
+       /* 'h' */ 33, 34, 35, 36, 37, 38, 39, 40,
+       /* 'p' */ 41, 42, 43, 44, 45, 46, 47, 48,
+       /* 'x' */ 49, 50, 51, -1, -1, -1, -1, -1
 };
 
 #define DATA_ALLOC_SIZE 4096
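
__CFPLDataDecodeTable is a base64 alphabet indexed by ASCII code: 'A'-'Z', 'a'-'z', '0'-'9', '+', and '/' map to their 6-bit values, and everything else is -1, which the decoder treats as "skip this byte" (so embedded whitespace and newlines inside a <data> element are harmless). Note that '=' maps to 0 rather than -1, letting padding flow through the accumulator; the separate numeq counter then decides how many of the final bytes to keep. Lookup sketch:

    /* sketch: 6-bit value for an alphabet byte, -1 otherwise; input pre-masked to 7 bits */
    int c = 'Q' & 0x7f;
    signed char v = __CFPLDataDecodeTable[c];   /* 16 for 'Q' */
    if (v < 0) {
            /* not in the alphabet: decoder skips it */
    }
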
@@ -647,103 +713,115 @@ static const signed char __CFPLDataDecodeTable[128] = {
 static void *
 getCFEncodedData(parser_state_t *state, unsigned int *size)
 {
-    int numeq = 0, acc = 0, cntr = 0;
-    int tmpbufpos = 0, tmpbuflen = 0;
-    unsigned char *tmpbuf = (unsigned char *)malloc(DATA_ALLOC_SIZE);
-
-    int c = currentChar();
-    *size = 0;
-       
-    while (c != '<') {
-        c &= 0x7f;
-       if (c == 0) {
+       int numeq = 0, cntr = 0;
+       unsigned int acc = 0;
+       int tmpbufpos = 0, tmpbuflen = 0;
+       unsigned char *tmpbuf = (unsigned char *)malloc(DATA_ALLOC_SIZE);
+
+       int c = currentChar();
+       *size = 0;
+
+       while (c != '<') {
+               c &= 0x7f;
+               if (c == 0) {
+                       free(tmpbuf);
+                       return 0;
+               }
+               if (c == '=') {
+                       numeq++;
+               } else {
+                       numeq = 0;
+               }
+               if (c == '\n') {
+                       state->lineNumber++;
+               }
+               if (__CFPLDataDecodeTable[c] < 0) {
+                       c = nextChar();
+                       continue;
+               }
+               cntr++;
+               acc <<= 6;
+               acc += __CFPLDataDecodeTable[c];
+               if (0 == (cntr & 0x3)) {
+                       if (tmpbuflen <= tmpbufpos + 2) {
+                               tmpbuflen += DATA_ALLOC_SIZE;
+                               tmpbuf = (unsigned char *)realloc(tmpbuf, tmpbuflen);
+                       }
+                       tmpbuf[tmpbufpos++] = (acc >> 16) & 0xff;
+                       if (numeq < 2) {
+                               tmpbuf[tmpbufpos++] = (acc >> 8) & 0xff;
+                       }
+                       if (numeq < 1) {
+                               tmpbuf[tmpbufpos++] = acc & 0xff;
+                       }
+               }
+               c = nextChar();
+       }
+       *size = tmpbufpos;
+       if (*size == 0) {
                free(tmpbuf);
                return 0;
        }
-       if (c == '=') numeq++; else numeq = 0;
-       if (c == '\n') state->lineNumber++;
-        if (__CFPLDataDecodeTable[c] < 0) {
-           c = nextChar();
-            continue;
-       }
-        cntr++;
-        acc <<= 6;
-        acc += __CFPLDataDecodeTable[c];
-        if (0 == (cntr & 0x3)) {
-            if (tmpbuflen <= tmpbufpos + 2) {
-                tmpbuflen += DATA_ALLOC_SIZE;
-               tmpbuf = (unsigned char *)realloc(tmpbuf, tmpbuflen);
-            }
-            tmpbuf[tmpbufpos++] = (acc >> 16) & 0xff;
-            if (numeq < 2)
-                tmpbuf[tmpbufpos++] = (acc >> 8) & 0xff;
-            if (numeq < 1)
-                tmpbuf[tmpbufpos++] = acc & 0xff;
-        }
-       c = nextChar();
-    }
-    *size = tmpbufpos;
-    if (*size == 0) {
-       free(tmpbuf);
-       return 0;
-    }
-    return tmpbuf;
+       return tmpbuf;
 }
 
 static void *
 getHexData(parser_state_t *state, unsigned int *size)
 {
-    int c;
-    unsigned char *d, *start, *lastStart;
+       int c;
+       unsigned char *d, *start, *lastStart;
 
-    start = lastStart = d = (unsigned char *)malloc(DATA_ALLOC_SIZE);
-    c = currentChar();
+       start = lastStart = d = (unsigned char *)malloc(DATA_ALLOC_SIZE);
+       c = currentChar();
 
-    while (c != '<') {
+       while (c != '<') {
+               if (isSpace(c)) {
+                       while ((c = nextChar()) != 0 && isSpace(c)) {
+                       }
+               }
+               ;
+               if (c == '\n') {
+                       state->lineNumber++;
+                       c = nextChar();
+                       continue;
+               }
 
-       if (isSpace(c)) while ((c = nextChar()) != 0 && isSpace(c)) {};
-       if (c == '\n') {
-           state->lineNumber++;
-           c = nextChar();
-           continue;
-       }
+               // get high nibble
+               if (isDigit(c)) {
+                       *d = (c - '0') << 4;
+               } else if (isAlphaDigit(c)) {
+                       *d =  (0xa + (c - 'a')) << 4;
+               } else {
+                       goto error;
+               }
 
-       // get high nibble
-       if (isDigit(c)) {
-           *d = (c - '0') << 4;
-       } else if (isAlphaDigit(c)) {
-           *d =  (0xa + (c - 'a')) << 4;
-       } else {
-           goto error;
-       }
+               // get low nibble
+               c = nextChar();
+               if (isDigit(c)) {
+                       *d |= c - '0';
+               } else if (isAlphaDigit(c)) {
+                       *d |= 0xa + (c - 'a');
+               } else {
+                       goto error;
+               }
 
-       // get low nibble
-       c = nextChar();
-       if (isDigit(c)) {
-           *d |= c - '0';
-       } else if (isAlphaDigit(c)) {
-           *d |= 0xa + (c - 'a');
-       } else {
-           goto error;
-       }
-       
-       d++;
-       if ((d - lastStart) >= DATA_ALLOC_SIZE) {
-           int oldsize = d - start;
-           start = (unsigned char *)realloc(start, oldsize + DATA_ALLOC_SIZE);
-           d = lastStart = start + oldsize;
+               d++;
+               if ((d - lastStart) >= DATA_ALLOC_SIZE) {
+                       int oldsize = d - start;
+                       start = (unsigned char *)realloc(start, oldsize + DATA_ALLOC_SIZE);
+                       d = lastStart = start + oldsize;
+               }
+               c = nextChar();
        }
-       c = nextChar();
-    }
 
-    *size = d - start;
-    return start;
+       *size = d - start;
+       return start;
 
- error:
+error:
 
-    *size = 0;
-    free(start);
-    return 0;
+       *size = 0;
+       free(start);
+       return 0;
 }
 
 static int
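
Two details in the reflowed data decoders are easy to miss. First, acc is now unsigned int rather than int, so acc <<= 6 can no longer shift into the sign bit; every fourth alphabet byte flushes up to three output bytes, with numeq trimming one byte per trailing '='. Second, getHexData() builds each output byte from two lowercase-hex nibbles and grows its buffer in DATA_ALLOC_SIZE steps. (The lone ';' after the whitespace-skip loop, here and again in yylex() below, is a leftover of mechanically reflowing the old "while (...) {};" one-liner; it is just an empty statement.) Nibble assembly, sketched:

    /* sketch: one output byte from two hex characters, lowercase only */
    unsigned char byte;
    if (isDigit(c)) {
            byte = (unsigned char)((c - '0') << 4);          /* high nibble */
    } else if (isAlphaDigit(c)) {
            byte = (unsigned char)((0xa + (c - 'a')) << 4);
    } else {
            goto error;
    }
    c = nextChar();
    if (isDigit(c)) {
            byte |= (unsigned char)(c - '0');                /* low nibble */
    } else if (isAlphaDigit(c)) {
            byte |= (unsigned char)(0xa + (c - 'a'));
    } else {
            goto error;
    }
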
@@ -757,11 +835,15 @@ yylex(YYSTYPE *lvalp, parser_state_t *state)
        char values[TAG_MAX_ATTRIBUTES][TAG_MAX_LENGTH];
        object_t *object;
 
- top:
+top:
        c = currentChar();
 
        /* skip white space  */
-       if (isSpace(c)) while ((c = nextChar()) != 0 && isSpace(c)) {};
+       if (isSpace(c)) {
+               while ((c = nextChar()) != 0 && isSpace(c)) {
+               }
+       }
+       ;
 
        /* keep track of line number, don't return \n's */
        if (c == '\n') {
@@ -771,33 +853,41 @@ yylex(YYSTYPE *lvalp, parser_state_t *state)
        }
 
        // end of the buffer?
-       if (!c) return 0;
+       if (!c) {
+               return 0;
+       }
 
        tagType = getTag(STATE, tag, &attributeCount, attributes, values);
-       if (tagType == TAG_BAD) return SYNTAX_ERROR;
-       if (tagType == TAG_IGNORE) goto top;
+       if (tagType == TAG_BAD) {
+               return SYNTAX_ERROR;
+       }
+       if (tagType == TAG_IGNORE) {
+               goto top;
+       }
 
        // handle allocation and check for "ID" and "IDREF" tags up front
        *lvalp = object = newObject(STATE);
        object->idref = -1;
-       for (i=0; i < attributeCount; i++) {
-           if (attributes[i][0] == 'I' && attributes[i][1] == 'D') {
-               // check for idref's, note: we ignore the tag, for
-               // this to work correctly, all idrefs must be unique
-               // across the whole serialization
-               if (attributes[i][2] == 'R' && attributes[i][3] == 'E' &&
-                   attributes[i][4] == 'F' && !attributes[i][5]) {
-                   if (tagType != TAG_EMPTY) return SYNTAX_ERROR;
-                   object->idref = strtol(values[i], NULL, 0);
-                   return IDREF;
-               }
-               // check for id's
-               if (!attributes[i][2]) {
-                   object->idref = strtol(values[i], NULL, 0);
-               } else {
-                   return SYNTAX_ERROR;
+       for (i = 0; i < attributeCount; i++) {
+               if (attributes[i][0] == 'I' && attributes[i][1] == 'D') {
+                       // check for idref's, note: we ignore the tag, for
+                       // this to work correctly, all idrefs must be unique
+                       // across the whole serialization
+                       if (attributes[i][2] == 'R' && attributes[i][3] == 'E' &&
+                           attributes[i][4] == 'F' && !attributes[i][5]) {
+                               if (tagType != TAG_EMPTY) {
+                                       return SYNTAX_ERROR;
+                               }
+                               object->idref = strtol(values[i], NULL, 0);
+                               return IDREF;
+                       }
+                       // check for id's
+                       if (!attributes[i][2]) {
+                               object->idref = strtol(values[i], NULL, 0);
+                       } else {
+                               return SYNTAX_ERROR;
+                       }
                }
-           }
        }
 
        switch (*tag) {
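
This attribute scan is what implements ID/IDREF sharing: an ID="n" attribute records the parsed object under tag n (rememberObject() is called from the build* routines below), while IDREF="n" must appear on an empty tag and resolves back through retrieveObject(). Because resolution ignores the element name, IDs have to be unique across the whole serialization, as the comment says. A fragment exercising both, written as a C string (names and values illustrative only):

    /* sketch: the second <integer/> re-uses the object parsed for the first */
    const char *xml =
        "<dict>"
          "<key>a</key><integer ID=\"1\" size=\"32\">4096</integer>"
          "<key>b</key><integer IDREF=\"1\"/>"
        "</dict>";
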
@@ -827,7 +917,7 @@ yylex(YYSTYPE *lvalp, parser_state_t *state)
                        }
 
                        bool isHexFormat = false;
-                       for (i=0; i < attributeCount; i++) {
+                       for (i = 0; i < attributeCount; i++) {
                                if (!strcmp(attributes[i], "format") && !strcmp(values[i], "hex")) {
                                        isHexFormat = true;
                                        break;
@@ -835,9 +925,9 @@ yylex(YYSTYPE *lvalp, parser_state_t *state)
                        }
                        // CF encoded is the default form
                        if (isHexFormat) {
-                           object->data = getHexData(STATE, &size);
+                               object->data = getHexData(STATE, &size);
                        } else {
-                           object->data = getCFEncodedData(STATE, &size);
+                               object->data = getCFEncodedData(STATE, &size);
                        }
                        object->size = size;
                        if ((getTag(STATE, tag, &attributeCount, attributes, values) != TAG_END) || strcmp(tag, "data")) {
@@ -856,8 +946,8 @@ yylex(YYSTYPE *lvalp, parser_state_t *state)
                break;
        case 'i':
                if (!strcmp(tag, "integer")) {
-                       object->size = 64;      // default
-                       for (i=0; i < attributeCount; i++) {
+                       object->size = 64;      // default
+                       for (i = 0; i < attributeCount; i++) {
                                if (!strcmp(attributes[i], "size")) {
                                        object->size = strtoul(values[i], NULL, 0);
                                }
@@ -875,13 +965,15 @@ yylex(YYSTYPE *lvalp, parser_state_t *state)
                break;
        case 'k':
                if (!strcmp(tag, "key")) {
-                       if (tagType == TAG_EMPTY) return SYNTAX_ERROR;
+                       if (tagType == TAG_EMPTY) {
+                               return SYNTAX_ERROR;
+                       }
                        object->string = getString(STATE);
                        if (!object->string) {
                                return SYNTAX_ERROR;
                        }
                        if ((getTag(STATE, tag, &attributeCount, attributes, values) != TAG_END)
-                          || strcmp(tag, "key")) {
+                           || strcmp(tag, "key")) {
                                return SYNTAX_ERROR;
                        }
                        return KEY;
@@ -896,8 +988,8 @@ yylex(YYSTYPE *lvalp, parser_state_t *state)
        case 's':
                if (!strcmp(tag, "string")) {
                        if (tagType == TAG_EMPTY) {
-                               object->string = (char *)malloc(1);
-                               object->string[0] = 0;
+                               object->string = (char *)malloc(1);
+                               object->string[0] = 0;
                                return STRING;
                        }
                        object->string = getString(STATE);
@@ -905,7 +997,7 @@ yylex(YYSTYPE *lvalp, parser_state_t *state)
                                return SYNTAX_ERROR;
                        }
                        if ((getTag(STATE, tag, &attributeCount, attributes, values) != TAG_END)
-                          || strcmp(tag, "string")) {
+                           || strcmp(tag, "string")) {
                                return SYNTAX_ERROR;
                        }
                        return STRING;
@@ -960,7 +1052,7 @@ newObject(parser_state_t *state)
                o->free = state->objects;
                state->objects = o;
        }
-       
+
        return o;
 }
 
@@ -968,7 +1060,7 @@ void
 freeObject(parser_state_t * state, object_t *o)
 {
        o->next = state->freeObjects;
-       state->freeObjects = o; 
+       state->freeObjects = o;
 }
 
 void
@@ -1006,7 +1098,7 @@ cleanupObjects(parser_state_t *state)
 // !@$&)(^Q$&*^!$(*!@$_(^%_(*Q#$(_*&!$_(*&!$_(*&!#$(*!@&^!@#%!_!#
 // !@$&)(^Q$&*^!$(*!@$_(^%_(*Q#$(_*&!$_(*&!$_(*&!#$(*!@&^!@#%!_!#
 
-static void 
+static void
 rememberObject(parser_state_t *state, int tag, OSObject *o)
 {
        char key[16];
@@ -1028,7 +1120,9 @@ retrieveObject(parser_state_t *state, int tag)
 //     printf("retrieve key '%s'\n", key);
 
        ref = state->tags->getObject(key);
-       if (!ref) return 0;
+       if (!ref) {
+               return 0;
+       }
 
        o = newObject(state);
        o->object = ref;
@@ -1059,7 +1153,9 @@ buildDictionary(parser_state_t *state, object_t * header)
        }
 
        dict = OSDictionary::withCapacity(count);
-       if (header->idref >= 0) rememberObject(state, header->idref, dict);
+       if (header->idref >= 0) {
+               rememberObject(state, header->idref, dict);
+       }
 
        o = header->elements;
        while (o) {
@@ -1099,7 +1195,9 @@ buildArray(parser_state_t *state, object_t * header)
        }
 
        array = OSArray::withCapacity(count);
-       if (header->idref >= 0) rememberObject(state, header->idref, array);
+       if (header->idref >= 0) {
+               rememberObject(state, header->idref, array);
+       }
 
        o = header->elements;
        while (o) {
@@ -1126,7 +1224,9 @@ buildSet(parser_state_t *state, object_t *header)
        OSSet *set = OSSet::withArray(array, array->getCapacity());
 
        // write over the reference created in buildArray
-       if (header->idref >= 0) rememberObject(state, header->idref, set);
+       if (header->idref >= 0) {
+               rememberObject(state, header->idref, set);
+       }
 
        array->release();
        o->object = set;
@@ -1139,7 +1239,9 @@ buildString(parser_state_t *state, object_t *o)
        OSString *string;
 
        string = OSString::withCString(o->string);
-       if (o->idref >= 0) rememberObject(state, o->idref, string);
+       if (o->idref >= 0) {
+               rememberObject(state, o->idref, string);
+       }
 
        free(o->string);
        o->string = 0;
@@ -1153,8 +1255,10 @@ buildSymbol(parser_state_t *state, object_t *o)
 {
        OSSymbol *symbol;
 
-       symbol = const_cast<OSSymbol *>(OSSymbol::withCString(o->string));
-       if (o->idref >= 0) rememberObject(state, o->idref, symbol);
+       symbol = const_cast < OSSymbol * > (OSSymbol::withCString(o->string));
+       if (o->idref >= 0) {
+               rememberObject(state, o->idref, symbol);
+       }
 
        free(o->string);
        o->string = 0;
@@ -1173,9 +1277,13 @@ buildData(parser_state_t *state, object_t *o)
        } else {
                data = OSData::withCapacity(0);
        }
-       if (o->idref >= 0) rememberObject(state, o->idref, data);
+       if (o->idref >= 0) {
+               rememberObject(state, o->idref, data);
+       }
 
-       if (o->size) free(o->data);
+       if (o->size) {
+               free(o->data);
+       }
        o->data = 0;
        o->object = data;
        return o;
@@ -1186,7 +1294,9 @@ buildNumber(parser_state_t *state, object_t *o)
 {
        OSNumber *number = OSNumber::withNumber(o->number, o->size);
 
-       if (o->idref >= 0) rememberObject(state, o->idref, number);
+       if (o->idref >= 0) {
+               rememberObject(state, o->idref, number);
+       }
 
        o->object = number;
        return o;
@@ -1205,12 +1315,18 @@ OSUnserializeXML(const char *buffer, OSString **errorString)
 {
        OSObject *object;
 
-       if (!buffer) return 0;
+       if (!buffer) {
+               return 0;
+       }
        parser_state_t *state = (parser_state_t *)malloc(sizeof(parser_state_t));
-       if (!state) return 0;
+       if (!state) {
+               return 0;
+       }
 
        // just in case
-       if (errorString) *errorString = NULL;
+       if (errorString) {
+               *errorString = NULL;
+       }
 
        state->parseBuffer = buffer;
        state->parseBufferIndex = 0;
@@ -1221,6 +1337,7 @@ OSUnserializeXML(const char *buffer, OSString **errorString)
        state->errorString = errorString;
        state->parsedObject = 0;
        state->parsedObjectCount = 0;
+       state->retrievedObjectCount = 0;
 
        (void)yyparse((void *)state);
 
@@ -1238,13 +1355,22 @@ OSUnserializeXML(const char *buffer, OSString **errorString)
 OSObject*
 OSUnserializeXML(const char *buffer, size_t bufferSize, OSString **errorString)
 {
-       if (!buffer) return (0);
-    if (bufferSize < sizeof(kOSSerializeBinarySignature)) return (0);
+       if (!buffer) {
+               return 0;
+       }
+       if (bufferSize < sizeof(kOSSerializeBinarySignature)) {
+               return 0;
+       }
 
-       if (!strcmp(kOSSerializeBinarySignature, buffer)) return OSUnserializeBinary(buffer, bufferSize, errorString);
+       if (!strcmp(kOSSerializeBinarySignature, buffer)
+           || (kOSSerializeIndexedBinarySignature == (uint8_t)buffer[0])) {
+               return OSUnserializeBinary(buffer, bufferSize, errorString);
+       }
 
        // XML must be null terminated
-       if (buffer[bufferSize - 1]) return 0;
+       if (buffer[bufferSize - 1]) {
+               return 0;
+       }
 
        return OSUnserializeXML(buffer, errorString);
 }
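
The sized entry point now recognizes two binary encodings before falling back to XML: the existing strcmp() against kOSSerializeBinarySignature, plus a new single-byte probe of buffer[0] against kOSSerializeIndexedBinarySignature; both are handed to OSUnserializeBinary(). The XML fallback still requires buffer[bufferSize - 1] == 0, because the lexer walks the text with nextChar() until it reads a NUL. Caller's-eye view, hedged:

    /* sketch: size-checked unserialization; on failure, errorString (if provided)
     * carries "OSUnserializeXML: <msg> near line N" */
    OSString *errorString = NULL;
    OSObject *obj = OSUnserializeXML(buffer, bufferSize, &errorString);
    if (!obj && errorString) {
            /* inspect/log errorString, then release it */
    }
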
index 7bd79d9ae78b20528784ccb5d5684d4a6bcb579f..05c4b79cf0cdde381cf245f419ccc25af4fec458 100644
@@ -23,7 +23,7 @@ endif
 
 $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS)
        $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG)
-       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG);
+       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG)
 
 do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
        $(_v)${MAKE} \
@@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
                SOURCE=$(subst conf/,,$(SOURCE))                        \
                TARGET=${TARGET}                                        \
                OBJPATH=${OBJPATH}                                      \
-               build_all;
+               build_all
 
 do_build_all:: do_all
 
index b38b73fe1a353954cd33fc1a14831bfecf8cccbc..fa45a7f1f4198dd3b098a8a9d0bfbf3a1106a398 100644
@@ -42,8 +42,8 @@ runtime.cpo_CXXWARNFLAGS_ADD = -Wno-cast-qual
 
 
 # warnings in bison-generated code
-OSUnserializeXML.cpo_CXXWARNFLAGS_ADD += -Wno-uninitialized -Wno-unreachable-code -Wno-unreachable-code-break
-OSUnserialize.cpo_CXXWARNFLAGS_ADD += -Wno-unreachable-code
+OSUnserializeXML.cpo_CXXWARNFLAGS_ADD += -Wno-uninitialized -Wno-unreachable-code -Wno-unreachable-code-break -Wno-zero-as-null-pointer-constant
+OSUnserialize.cpo_CXXWARNFLAGS_ADD += -Wno-unreachable-code -Wno-zero-as-null-pointer-constant
 
 # Runtime support functions don't interact well with LTO (9294679)
 stack_protector.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG)
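
The extra -Wno-zero-as-null-pointer-constant is consistent with the reflowed parser sources, which still use a literal 0 for null pointers in places (e.g. getString()'s "return 0;" paths). In C++ that idiom is exactly what the warning flags:

    // sketch: the legacy idiom the suppression tolerates in bison-generated code
    char *legacy = 0;        // triggers -Wzero-as-null-pointer-constant
    char *modern = nullptr;  // the warning-clean spelling
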
@@ -97,9 +97,9 @@ $(COMPONENT).filelist: $(OBJS)
                $(SEG_HACK) -n __HIB -o $${hib_file}__ $${hib_file} || exit 1; \
                mv $${hib_file}__ $${hib_file} || exit 1; \
        done
-       @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)"
+       $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0))
        $(_v)for obj in ${OBJS}; do     \
-                echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
+                $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
        done > $(COMPONENT).filelist
 
 do_all: $(COMPONENT).filelist
index 5181c01431c00fe866c809a2354afceac2930ecf..95674f7f4c81a6dac08b5db0a6c6764e188e4373 100644
@@ -2,13 +2,13 @@
 
 OPTIONS/libkerncpp                                     optional libkerncpp
 OPTIONS/kdebug                                         optional kdebug
-OPTIONS/gprof                                          optional gprof
 OPTIONS/config_dtrace                                  optional config_dtrace
 OPTIONS/hibernation                                    optional hibernation
 OPTIONS/iotracking                                     optional iotracking
 OPTIONS/networking                                     optional networking
 OPTIONS/crypto                                         optional crypto
 OPTIONS/zlib                                           optional zlib
+OPTIONS/zlibc                                          optional zlibc
 
 # libkern
 
@@ -37,6 +37,7 @@ libkern/c++/OSSymbol.cpp                              optional libkerncpp
 libkern/c++/OSUnserialize.cpp                          optional libkerncpp
 libkern/c++/OSUnserializeXML.cpp                       optional libkerncpp
 libkern/c++/OSSerializeBinary.cpp                      optional libkerncpp
+libkern/c++/OSCompat.cpp                       optional libkerncpp
 
 libkern/OSKextLib.cpp                                  optional libkerncpp
 libkern/mkext.c                                                standard
@@ -70,6 +71,18 @@ libkern/zlib/trees.c                                    optional zlib
 libkern/zlib/uncompr.c                                  optional zlib
 libkern/zlib/zutil.c                                    optional zlib
 
+libkern/zlib/adler32.c                                  optional zlibc
+libkern/zlib/compress.c                                 optional zlibc
+libkern/zlib/z_crc32.c                                  optional zlibc
+libkern/zlib/deflate.c                                  optional zlibc
+libkern/zlib/infback.c                                  optional zlibc
+libkern/zlib/inffast.c                                  optional zlibc
+libkern/zlib/inflate.c                                  optional zlibc
+libkern/zlib/inftrees.c                                 optional zlibc
+libkern/zlib/trees.c                                    optional zlibc
+libkern/zlib/uncompr.c                                  optional zlibc
+libkern/zlib/zutil.c                                    optional zlibc
+
 libkern/crypto/register_crypto.c               optional crypto
 libkern/crypto/corecrypto_sha2.c               standard
 libkern/crypto/corecrypto_sha1.c               optional crypto
index 9aa590e1430b1a4eb68c8f9005bdc6ca5f0eef34..0105da4e78c26902dac68a5560153b51c65ac1a6 100644
@@ -42,9 +42,7 @@ aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1])
                panic("%s: inconsistent size for AES encrypt context", __FUNCTION__);
        }
 
-       cccbc_init(cbc, cx[0].ctx, key_len, key);
-
-       return aes_good;
+       return cccbc_init(cbc, cx[0].ctx, key_len, key);
 }
 
 aes_rval
@@ -54,10 +52,12 @@ aes_encrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigne
        const struct ccmode_cbc *cbc = g_crypto_funcs->ccaes_cbc_encrypt;
        cccbc_iv_decl(cbc->block_size, ctx_iv);
 
-       cccbc_set_iv(cbc, ctx_iv, in_iv);
-       cccbc_update(cbc, cx[0].ctx, ctx_iv, num_blk, in_blk, out_blk); //Actually cbc encrypt.
+       int rc = cccbc_set_iv(cbc, ctx_iv, in_iv);
+       if (rc) {
+               return rc;
+       }
 
-       return aes_good;
+       return cccbc_update(cbc, cx[0].ctx, ctx_iv, num_blk, in_blk, out_blk); //Actually cbc encrypt.
 }
 
 #if defined (__i386__) || defined (__x86_64__) || defined (__arm64__)
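
The theme of these corecrypto-shim hunks: return values from cccbc_init(), cccbc_set_iv(), and cccbc_update() were previously discarded, with the wrapper reporting aes_good unconditionally; now the corecrypto status propagates out, and a failed IV setup short-circuits before any blocks are processed. Caller-side effect, assuming aes_good compares equal to corecrypto's success code (which the change implies):

    /* sketch: key-schedule failures are now visible to callers */
    aes_encrypt_ctx cx[1];
    if (aes_encrypt_key(key, key_len, cx) != aes_good) {
            /* corecrypto rejected the key; this path formerly reported aes_good */
    }
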
@@ -79,9 +79,7 @@ aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1])
                panic("%s: inconsistent size for AES decrypt context", __FUNCTION__);
        }
 
-       cccbc_init(cbc, cx[0].ctx, key_len, key);
-
-       return aes_good;
+       return cccbc_init(cbc, cx[0].ctx, key_len, key);
 }
 
 aes_rval
@@ -91,10 +89,12 @@ aes_decrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigne
        const struct ccmode_cbc *cbc = g_crypto_funcs->ccaes_cbc_decrypt;
        cccbc_iv_decl(cbc->block_size, ctx_iv);
 
-       cccbc_set_iv(cbc, ctx_iv, in_iv);
-       cccbc_update(cbc, cx[0].ctx, ctx_iv, num_blk, in_blk, out_blk); //Actually cbc decrypt.
+       int rc = cccbc_set_iv(cbc, ctx_iv, in_iv);
+       if (rc) {
+               return rc;
+       }
 
-       return aes_good;
+       return cccbc_update(cbc, cx[0].ctx, ctx_iv, num_blk, in_blk, out_blk); //Actually cbc decrypt.
 }
 
 #if defined (__i386__) || defined (__x86_64__) || defined (__arm64__)
@@ -194,7 +194,7 @@ aes_encrypt_aad_gcm(const unsigned char *aad, unsigned int aad_bytes, ccgcm_ctx
                return aes_error;
        }
 
-       return ccgcm_gmac(gcm, ctx, aad_bytes, aad);
+       return ccgcm_aad(gcm, ctx, aad_bytes, aad);
 }
 
 aes_rval
@@ -212,15 +212,17 @@ aes_encrypt_gcm(const unsigned char *in_blk, unsigned int num_bytes,
 aes_rval
 aes_encrypt_finalize_gcm(unsigned char *tag, unsigned int tag_bytes, ccgcm_ctx *ctx)
 {
-       int rc;
        const struct ccmode_gcm *gcm = g_crypto_funcs->ccaes_gcm_encrypt;
        if (!gcm) {
                return aes_error;
        }
 
-       rc = ccgcm_finalize(gcm, ctx, tag_bytes, tag);
-       rc |= ccgcm_reset(gcm, ctx);
-       return rc;
+       int rc = ccgcm_finalize(gcm, ctx, tag_bytes, tag);
+       if (rc) {
+               return rc;
+       }
+
+       return ccgcm_reset(gcm, ctx);
 }
 
 aes_rval
@@ -248,16 +250,17 @@ aes_decrypt_key_with_iv_gcm(const unsigned char *key, int key_len, const unsigne
 aes_rval
 aes_decrypt_set_iv_gcm(const unsigned char *in_iv, unsigned int len, ccgcm_ctx *ctx)
 {
-       int rc;
-
        const struct ccmode_gcm *gcm = g_crypto_funcs->ccaes_gcm_decrypt;
        if (!gcm) {
                return aes_error;
        }
 
-       rc = ccgcm_reset(gcm, ctx);
-       rc |= ccgcm_set_iv(gcm, ctx, len, in_iv);
-       return rc;
+       int rc = ccgcm_reset(gcm, ctx);
+       if (rc) {
+               return rc;
+       }
+
+       return ccgcm_set_iv(gcm, ctx, len, in_iv);
 }
 
 aes_rval
@@ -290,7 +293,7 @@ aes_decrypt_aad_gcm(const unsigned char *aad, unsigned int aad_bytes, ccgcm_ctx
                return aes_error;
        }
 
-       return ccgcm_gmac(gcm, ctx, aad_bytes, aad);
+       return ccgcm_aad(gcm, ctx, aad_bytes, aad);
 }
 
 aes_rval
@@ -308,15 +311,17 @@ aes_decrypt_gcm(const unsigned char *in_blk, unsigned int num_bytes,
 aes_rval
 aes_decrypt_finalize_gcm(unsigned char *tag, unsigned int tag_bytes, ccgcm_ctx *ctx)
 {
-       int rc;
        const struct ccmode_gcm *gcm = g_crypto_funcs->ccaes_gcm_decrypt;
        if (!gcm) {
                return aes_error;
        }
 
-       rc = ccgcm_finalize(gcm, ctx, tag_bytes, tag);
-       rc |= ccgcm_reset(gcm, ctx);
-       return rc;
+       int rc = ccgcm_finalize(gcm, ctx, tag_bytes, tag);
+       if (rc) {
+               return rc;
+       }
+
+       return ccgcm_reset(gcm, ctx);
 }
 
 unsigned
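
Both GCM finalize paths replace "rc = finalize; rc |= reset; return rc;" with sequential checks. OR-merging two nonzero status codes can produce a value equal to neither error, so the new form returns the finalize status verbatim; the behavioral difference is that ccgcm_reset() is no longer attempted once finalization has failed. The shape, reusable wherever two statuses used to be OR-ed:

    /* sketch: preserve the first error code instead of OR-merging */
    int rc = ccgcm_finalize(gcm, ctx, tag_bytes, tag);
    if (rc) {
            return rc;                  /* e.g. tag mismatch, unmodified */
    }
    return ccgcm_reset(gcm, ctx);
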
index 80cd614fdf2cd2834e16d73fce46ff49d640c534..61c2d0d639c8e3b3c2d0d4bf4a5fc456aef43f62 100644
@@ -64,10 +64,10 @@ xts_start(uint32_t cipher __unused, // ignored - we're doing this for xts-aes on
                panic("%s: inconsistent size for AES-XTS context", __FUNCTION__);
        }
 
-       enc->init(enc, xts->enc, keylen, key1, key2);
-       dec->init(dec, xts->dec, keylen, key1, key2);
+       int rc = enc->init(enc, xts->enc, keylen, key1, key2);
+       rc |= dec->init(dec, xts->dec, keylen, key1, key2);
 
-       return 0;         //never fails
+       return rc;
 }
 
 int
@@ -83,10 +83,13 @@ xts_encrypt(const uint8_t *pt, unsigned long ptlen,
                panic("xts encrypt not a multiple of block size\n");
        }
 
-       xtsenc->set_tweak(xts->enc, tweak, iv);
-       xtsenc->xts(xts->enc, tweak, ptlen / 16, pt, ct);
+       int rc = xtsenc->set_tweak(xts->enc, tweak, iv);
+       if (rc) {
+               return rc;
+       }
 
-       return 0; //never fails
+       xtsenc->xts(xts->enc, tweak, ptlen / 16, pt, ct);
+       return 0;
 }
 
 int
@@ -102,10 +105,13 @@ xts_decrypt(const uint8_t *ct, unsigned long ptlen,
                panic("xts decrypt not a multiple of block size\n");
        }
 
-       xtsdec->set_tweak(xts->dec, tweak, iv);
-       xtsdec->xts(xts->dec, tweak, ptlen / 16, ct, pt);
+       int rc = xtsdec->set_tweak(xts->dec, tweak, iv);
+       if (rc) {
+               return rc;
+       }
 
-       return 0; //never fails
+       xtsdec->xts(xts->dec, tweak, ptlen / 16, ct, pt);
+       return 0;
 }
 
 void
index 80406bb02f462fd39794557292e8bfd56f06b664..b77967c491906c2118cac65329477412ec545536 100644
@@ -45,22 +45,22 @@ des_ecb_key_sched(des_cblock *key, des_ecb_key_schedule *ks)
                panic("%s: inconsistent size for DES-ECB context", __FUNCTION__);
        }
 
-       enc->init(enc, ks->enc, CCDES_KEY_SIZE, key);
-       dec->init(dec, ks->dec, CCDES_KEY_SIZE, key);
+       int rc = enc->init(enc, ks->enc, CCDES_KEY_SIZE, key);
+       if (rc) {
+               return rc;
+       }
 
-       /* The old DES interface could return -1 or -2 for weak keys and wrong parity,
-        *  but this was disabled all the time, so we never fail here */
-       return 0;
+       return dec->init(dec, ks->dec, CCDES_KEY_SIZE, key);
 }
 
 /* Simple des - 1 block */
-void
+int
 des_ecb_encrypt(des_cblock *in, des_cblock *out, des_ecb_key_schedule *ks, int enc)
 {
        const struct ccmode_ecb *ecb = enc ? g_crypto_funcs->ccdes_ecb_encrypt : g_crypto_funcs->ccdes_ecb_decrypt;
        ccecb_ctx *ctx = enc ? ks->enc : ks->dec;
 
-       ecb->ecb(ctx, 1, in, out);
+       return ecb->ecb(ctx, 1, in, out);
 }
 
 
@@ -68,7 +68,6 @@ des_ecb_encrypt(des_cblock *in, des_cblock *out, des_ecb_key_schedule *ks, int e
 int
 des3_ecb_key_sched(des_cblock *key, des3_ecb_key_schedule *ks)
 {
-       int rc;
        const struct ccmode_ecb *enc = g_crypto_funcs->cctdes_ecb_encrypt;
        const struct ccmode_ecb *dec = g_crypto_funcs->cctdes_ecb_decrypt;
 
@@ -77,20 +76,22 @@ des3_ecb_key_sched(des_cblock *key, des3_ecb_key_schedule *ks)
                panic("%s: inconsistent size for 3DES-ECB context", __FUNCTION__);
        }
 
-       rc = enc->init(enc, ks->enc, CCDES_KEY_SIZE * 3, key);
-       rc |= dec->init(dec, ks->dec, CCDES_KEY_SIZE * 3, key);
+       int rc = enc->init(enc, ks->enc, CCDES_KEY_SIZE * 3, key);
+       if (rc) {
+               return rc;
+       }
 
-       return rc;
+       return dec->init(dec, ks->dec, CCDES_KEY_SIZE * 3, key);
 }
 
 /* Simple des - 1 block */
-void
+int
 des3_ecb_encrypt(des_cblock *in, des_cblock *out, des3_ecb_key_schedule *ks, int enc)
 {
        const struct ccmode_ecb *ecb = enc ? g_crypto_funcs->cctdes_ecb_encrypt : g_crypto_funcs->cctdes_ecb_decrypt;
        ccecb_ctx *ctx = enc ? ks->enc : ks->dec;
 
-       ecb->ecb(ctx, 1, in, out);
+       return ecb->ecb(ctx, 1, in, out);
 }
 
 /* Raw key helper functions */
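
Note the API change here: des_ecb_encrypt() and des3_ecb_encrypt() go from void to int so the ecb->ecb() status finally reaches callers. Existing callers still compile (ignoring an int result is legal C), but can now check the outcome:

    /* sketch: the one-block DES wrapper can now report failure */
    des_cblock in, out;
    des_ecb_key_schedule ks;
    /* ... assume des_ecb_key_sched(&key, &ks) succeeded ... */
    if (des_ecb_encrypt(&in, &out, &ks, 1 /* encrypt */) != 0) {
            /* corecrypto ECB operation failed */
    }
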
index d6f2503f92a05e7cc52c1e3bc47661fa4d20d7df..41be8924a9f0acaafd92d4db647de6ecb570b4fe 100644
@@ -6,7 +6,9 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
-LCLDIR = /usr/local/include
+INSTALLHDRS_SKIP_HOST = NO
+
+LCLDIR = $(SDKHEADERSROOT)/usr/local/include
 
 KERNELFILES =
 
index 80d5969fa3d600870124784a841bbf34ed078da7..dece91a3786bd5a21f2542501dd744cf3dfa3cd2 100644
@@ -146,7 +146,7 @@ firehose_chunk_tracepoint_try_reserve(firehose_chunk_t fc, uint64_t stamp,
                return FIREHOSE_CHUNK_TRY_RESERVE_FAIL_ENQUEUE;
        }
        if (privptr) {
-               *privptr = fc->fc_start + pos.fcp_private_offs;
+               *privptr = (uint8_t *)((uintptr_t)fc->fc_start + pos.fcp_private_offs);
        }
        return orig.fcp_next_entry_offs;
 }
@@ -157,7 +157,7 @@ firehose_chunk_tracepoint_begin(firehose_chunk_t fc, uint64_t stamp,
     uint16_t pubsize, uint64_t thread_id, long offset)
 {
        firehose_tracepoint_t ft = (firehose_tracepoint_t)
-           __builtin_assume_aligned(fc->fc_start + offset, 8);
+           __builtin_assume_aligned((void *)((uintptr_t)fc->fc_start + (uintptr_t)offset), 8);
        stamp -= fc->fc_timestamp;
        stamp |= (uint64_t)pubsize << 48;
        // The compiler barrier is needed for userland process death handling, see
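
Both firehose fixes make the byte-offset arithmetic on fc_start explicit by round-tripping through uintptr_t before casting to the destination pointer type, plausibly to satisfy stricter cast/alignment diagnostics (the motivating warning is not shown in this diff); the 8-byte __builtin_assume_aligned() promise is unchanged. The pattern in isolation, with base and byte_offset standing in for fc->fc_start and the offsets above:

    /* sketch: explicit integer arithmetic, then one cast to the target type */
    uint8_t *p = (uint8_t *)((uintptr_t)base + (uintptr_t)byte_offset);
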
index c47635c8803f6158c6d5f76d650f69a473049711..22e3def405abfc83d58d382c0a7c87dee80b873f 100644
@@ -36,14 +36,14 @@ __BEGIN_DECLS
  * The lower 8 bits are or-ed in the upper 8 bits of Activity ID and propagated
  * to children activities
  */
-    OS_ENUM(firehose_activity_flags, unsigned long,
-    firehose_activity_flags_default                                         = 0x0000,
+    OS_OPTIONS(firehose_activity_flags, unsigned long,
+    firehose_activity_flags_default             = 0x0000,
 
-    firehose_activity_flags_info_mode                                       = 0x0001,
-    firehose_activity_flags_debug_mode                                      = 0x0002,
-    firehose_activity_flags_stream_live_mode                        = 0x0004,
+    firehose_activity_flags_info_mode           = 0x0001,
+    firehose_activity_flags_debug_mode          = 0x0002,
+    firehose_activity_flags_stream_live_mode    = 0x0004,
 
-    firehose_activity_flags_precise_timestamp                       = 0x0080,
+    firehose_activity_flags_precise_timestamp   = 0x0080,
     );
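
firehose_activity_flags (and firehose_tracepoint_flags further down) switch from OS_ENUM to OS_OPTIONS, which, per the os/base conventions these headers already use, is the macro for bitmask types whose values are meant to be OR-ed together; genuine enumerations such as firehose_stream stay OS_ENUM. Call sites are unchanged:

    /* sketch: option-set values combine bitwise, using the generated typedef */
    firehose_activity_flags_t flags =
        firehose_activity_flags_info_mode | firehose_activity_flags_stream_live_mode;
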
 
 /*!
@@ -69,13 +69,13 @@ typedef uint64_t firehose_activity_id_t;
  * @enum firehose_stream_t
  */
 OS_ENUM(firehose_stream, uint8_t,
-    firehose_stream_persist                                                         = 0,
-    firehose_stream_special                                                         = 1,
-    firehose_stream_memory                                                          = 2,
-    firehose_stream_metadata                                                        = 3,
-    firehose_stream_signpost                                                        = 4,
-    firehose_stream_memory_wifi                                                     = 5,
-    firehose_stream_memory_baseband                                         = 6,
+    firehose_stream_persist                     = 0,
+    firehose_stream_special                     = 1,
+    firehose_stream_memory                      = 2,
+    firehose_stream_metadata                    = 3,
+    firehose_stream_signpost                    = 4,
+    firehose_stream_memory_wifi                 = 5,
+    firehose_stream_memory_baseband             = 6,
 
     _firehose_stream_max,
     );
@@ -87,12 +87,12 @@ OS_ENUM(firehose_stream, uint8_t,
  * Namespaces of tracepoints.
  */
 OS_ENUM(firehose_tracepoint_namespace, uint8_t,
-    firehose_tracepoint_namespace_activity                          = 0x02,
-    firehose_tracepoint_namespace_trace                                     = 0x03,
-    firehose_tracepoint_namespace_log                                       = 0x04,
-    firehose_tracepoint_namespace_metadata                          = 0x05,
-    firehose_tracepoint_namespace_signpost                          = 0x06,
-    firehose_tracepoint_namespace_loss                                      = 0x07,
+    firehose_tracepoint_namespace_activity      = 0x02,
+    firehose_tracepoint_namespace_trace         = 0x03,
+    firehose_tracepoint_namespace_log           = 0x04,
+    firehose_tracepoint_namespace_metadata      = 0x05,
+    firehose_tracepoint_namespace_signpost      = 0x06,
+    firehose_tracepoint_namespace_loss          = 0x07,
     );
 
 /*!
@@ -102,8 +102,8 @@ OS_ENUM(firehose_tracepoint_namespace, uint8_t,
  * Codes of tracepoints.
  */
 OS_ENUM(firehose_tracepoint_code, uint32_t,
-    firehose_tracepoint_code_load                           = 0x01,
-    firehose_tracepoint_code_unload                         = 0x02,
+    firehose_tracepoint_code_load               = 0x01,
+    firehose_tracepoint_code_unload             = 0x02,
     );
 
 /*!
@@ -120,10 +120,10 @@ typedef uint8_t firehose_tracepoint_type_t;
  * @abstract
  * Flags for tracepoints.
  */
-OS_ENUM(firehose_tracepoint_flags, uint16_t,
+OS_OPTIONS(firehose_tracepoint_flags, uint16_t,
     _firehose_tracepoint_flags_base_has_current_aid         = 0x0001,
 #define _firehose_tracepoint_flags_pc_style_mask                 (0x0007 << 1)
-    _firehose_tracepoint_flags_pc_style_none                        = 0x0000 << 1,
+    _firehose_tracepoint_flags_pc_style_none                = 0x0000 << 1,
         _firehose_tracepoint_flags_pc_style_main_exe            = 0x0001 << 1,
         _firehose_tracepoint_flags_pc_style_shared_cache        = 0x0002 << 1,
         _firehose_tracepoint_flags_pc_style_main_plugin         = 0x0003 << 1,
@@ -134,6 +134,25 @@ OS_ENUM(firehose_tracepoint_flags, uint16_t,
         _firehose_tracepoint_flags_base_has_unique_pid          = 0x0010,
     );
 
+/*
+ * Same as _OS_TRACE_DYNAMIC_BIT defined in libtrace/tracepoint_internal.h.
+ * This bit is used by logd to know how to evaluate the format
+ * string.
+ * If it is set, logd assumes that the format is "%s" and the content of the
+ * whole string is passed with the firehose_tracepoint.
+ * Otherwise it tries to find the unformatted string within the text
+ * section of the executable and expects only the content of the variables
+ * on the firehose_tracepoint.
+ */
+#define FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT 0x80000000
+
+/*
+ * Same as KERNEL_MASK defined in logd/logd_main.c.
+ * It is used by logd to mask the pc before calling
+ * OSKextCopyUUIDForAddress.
+ */
+#define FIREHOSE_TRACEPOINT_PC_KERNEL_MASK 0xffff000000000000
+
 /*!
  * @typedef firehose_tracepoint_id_t
  *
@@ -149,8 +168,8 @@ typedef uint64_t firehose_tracepoint_id_t;
  * Types of Activity tracepoints (namespace activity).
  */
 OS_ENUM(_firehose_tracepoint_type_activity, firehose_tracepoint_type_t,
-    _firehose_tracepoint_type_activity_create                       = 0x01,
-    _firehose_tracepoint_type_activity_swap                         = 0x02,
+    _firehose_tracepoint_type_activity_create               = 0x01,
+    _firehose_tracepoint_type_activity_swap                 = 0x02,
     _firehose_tracepoint_type_activity_useraction           = 0x03,
     );
 
@@ -160,7 +179,7 @@ OS_ENUM(_firehose_tracepoint_type_activity, firehose_tracepoint_type_t,
  * @abstract
  * Flags for Activity tracepoints (namespace activity).
  */
-OS_ENUM(_firehose_tracepoint_flags_activity, uint16_t,
+OS_OPTIONS(_firehose_tracepoint_flags_activity, uint16_t,
     _firehose_tracepoint_flags_activity_user_interface      = 0x0100,
     _firehose_tracepoint_flags_activity_has_other_aid       = 0x0200,
     );
@@ -172,11 +191,11 @@ OS_ENUM(_firehose_tracepoint_flags_activity, uint16_t,
  * Types of trace tracepoints (namespace trace).
  */
 OS_ENUM(_firehose_tracepoint_type_trace, firehose_tracepoint_type_t,
-    _firehose_tracepoint_type_trace_default                         = 0x00,
-    _firehose_tracepoint_type_trace_info                            = 0x01,
-    _firehose_tracepoint_type_trace_debug                           = 0x02,
-    _firehose_tracepoint_type_trace_error                           = 0x10,
-    _firehose_tracepoint_type_trace_fault                           = 0x11,
+    _firehose_tracepoint_type_trace_default                 = 0x00,
+    _firehose_tracepoint_type_trace_info                    = 0x01,
+    _firehose_tracepoint_type_trace_debug                   = 0x02,
+    _firehose_tracepoint_type_trace_error                   = 0x10,
+    _firehose_tracepoint_type_trace_fault                   = 0x11,
     );
 
 /*!
@@ -186,11 +205,11 @@ OS_ENUM(_firehose_tracepoint_type_trace, firehose_tracepoint_type_t,
  * Types of Log tracepoints (namespace log).
  */
 OS_ENUM(_firehose_tracepoint_type_log, firehose_tracepoint_type_t,
-    _firehose_tracepoint_type_log_default                           = 0x00,
-    _firehose_tracepoint_type_log_info                                      = 0x01,
-    _firehose_tracepoint_type_log_debug                                     = 0x02,
-    _firehose_tracepoint_type_log_error                                     = 0x10,
-    _firehose_tracepoint_type_log_fault                                     = 0x11,
+    _firehose_tracepoint_type_log_default                   = 0x00,
+    _firehose_tracepoint_type_log_info                      = 0x01,
+    _firehose_tracepoint_type_log_debug                     = 0x02,
+    _firehose_tracepoint_type_log_error                     = 0x10,
+    _firehose_tracepoint_type_log_fault                     = 0x11,
     );
 
 /*!
@@ -199,11 +218,11 @@ OS_ENUM(_firehose_tracepoint_type_log, firehose_tracepoint_type_t,
  * @abstract
  * Flags for Log tracepoints (namespace log).
  */
-OS_ENUM(_firehose_tracepoint_flags_log, uint16_t,
+OS_OPTIONS(_firehose_tracepoint_flags_log, uint16_t,
     _firehose_tracepoint_flags_log_has_private_data         = 0x0100,
     _firehose_tracepoint_flags_log_has_subsystem            = 0x0200,
-    _firehose_tracepoint_flags_log_has_rules                        = 0x0400,
-    _firehose_tracepoint_flags_log_has_oversize                     = 0x0800,
+    _firehose_tracepoint_flags_log_has_rules                = 0x0400,
+    _firehose_tracepoint_flags_log_has_oversize             = 0x0800,
     _firehose_tracepoint_flags_log_has_context_data         = 0x1000,
     );
 
@@ -214,9 +233,9 @@ OS_ENUM(_firehose_tracepoint_flags_log, uint16_t,
  * Types for metadata tracepoints (namespace metadata).
  */
 OS_ENUM(_firehose_tracepoint_type_metadata, firehose_tracepoint_type_t,
-    _firehose_tracepoint_type_metadata_dyld                         = 0x01,
+    _firehose_tracepoint_type_metadata_dyld                 = 0x01,
     _firehose_tracepoint_type_metadata_subsystem            = 0x02,
-    _firehose_tracepoint_type_metadata_kext                         = 0x03,
+    _firehose_tracepoint_type_metadata_kext                 = 0x03,
     );
 
 /*!
@@ -226,7 +245,7 @@ OS_ENUM(_firehose_tracepoint_type_metadata, firehose_tracepoint_type_t,
  * Types of Log tracepoints (namespace signpost).
  */
 OS_ENUM(_firehose_tracepoint_type_signpost, firehose_tracepoint_type_t,
-    _firehose_tracepoint_type_signpost_event                        = 0x00,
+    _firehose_tracepoint_type_signpost_event                = 0x00,
     _firehose_tracepoint_type_signpost_interval_begin       = 0x01,
     _firehose_tracepoint_type_signpost_interval_end         = 0x02,
 
@@ -242,13 +261,13 @@ OS_ENUM(_firehose_tracepoint_type_signpost, firehose_tracepoint_type_t,
  * @abstract
  * Flags for Log tracepoints (namespace signpost).
  *
- * When flags are shared with the log type, they should have the same values.
+ * When flags are shared with the log type, they should have the same values.
  */
-OS_ENUM(_firehose_tracepoint_flags_signpost, uint16_t,
+OS_OPTIONS(_firehose_tracepoint_flags_signpost, uint16_t,
     _firehose_tracepoint_flags_signpost_has_private_data    = 0x0100,
-    _firehose_tracepoint_flags_signpost_has_subsystem               = 0x0200,
-    _firehose_tracepoint_flags_signpost_has_rules                   = 0x0400,
-    _firehose_tracepoint_flags_signpost_has_oversize                = 0x0800,
+    _firehose_tracepoint_flags_signpost_has_subsystem       = 0x0200,
+    _firehose_tracepoint_flags_signpost_has_rules           = 0x0400,
+    _firehose_tracepoint_flags_signpost_has_oversize        = 0x0800,
     _firehose_tracepoint_flags_signpost_has_context_data    = 0x1000,
     );
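The two constants added above invite a consumer-side sketch. The following is illustrative only and not logd's actual code: tracepoint_format_is_dynamic and tracepoint_kernel_pc are hypothetical helpers, and whether the kernel mask is OR-ed in or cleared is an assumption based on the comments in the hunk.

    #include <stdbool.h>
    #include <stdint.h>

    #define FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT 0x80000000
    #define FIREHOSE_TRACEPOINT_PC_KERNEL_MASK 0xffff000000000000

    /* Dynamic bit set: the payload carries the whole formatted string and
     * the format is treated as "%s". Clear: the pc locates the unformatted
     * string in the executable's text section. */
    static bool
    tracepoint_format_is_dynamic(uint32_t fmt)
    {
        return (fmt & FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT) != 0;
    }

    /* Assumption: reconstruct a canonical kernel VA by OR-ing the top bits
     * back in before an OSKextCopyUUIDForAddress-style lookup. */
    static uint64_t
    tracepoint_kernel_pc(uint64_t pc)
    {
        return pc | FIREHOSE_TRACEPOINT_PC_KERNEL_MASK;
    }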
 
index 7866c302e05d4f101f366ea3933530dd405b85d8..8408c83a43b4db0395f926c0b4c3e41b2d3d1be7 100644 (file)
@@ -47,10 +47,6 @@ enum {
 #define ALIGN_TEST(p, t) do{}while(0)
 #endif
 
-// 19831745 - start of big hammer!
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wcast-qual"
-
 /*
  * atomic operations
  *     These are _the_ atomic operations, now implemented via compiler built-ins.
@@ -63,16 +59,14 @@ enum {
 Boolean
 OSCompareAndSwap8(UInt8 oldValue, UInt8 newValue, volatile UInt8 *address)
 {
-       return __c11_atomic_compare_exchange_strong((_Atomic UInt8 *)address, &oldValue, newValue,
-                  memory_order_acq_rel_smp, memory_order_relaxed);
+       return os_atomic_cmpxchg(address, oldValue, newValue, acq_rel);
 }
 
 #undef OSCompareAndSwap16
 Boolean
 OSCompareAndSwap16(UInt16 oldValue, UInt16 newValue, volatile UInt16 *address)
 {
-       return __c11_atomic_compare_exchange_strong((_Atomic UInt16 *)address, &oldValue, newValue,
-                  memory_order_acq_rel_smp, memory_order_relaxed);
+       return os_atomic_cmpxchg(address, oldValue, newValue, acq_rel);
 }
 
 #undef OSCompareAndSwap
@@ -80,8 +74,7 @@ Boolean
 OSCompareAndSwap(UInt32 oldValue, UInt32 newValue, volatile UInt32 *address)
 {
        ALIGN_TEST(address, UInt32);
-       return __c11_atomic_compare_exchange_strong((_Atomic UInt32 *)address, &oldValue, newValue,
-                  memory_order_acq_rel_smp, memory_order_relaxed);
+       return os_atomic_cmpxchg(address, oldValue, newValue, acq_rel);
 }
 
 #undef OSCompareAndSwap64
@@ -96,31 +89,26 @@ OSCompareAndSwap64(UInt64 oldValue, UInt64 newValue, volatile UInt64 *address)
        _Atomic UInt64 *aligned_addr = (_Atomic UInt64 *)(uintptr_t)address;
 
        ALIGN_TEST(address, UInt64);
-       return __c11_atomic_compare_exchange_strong(aligned_addr, &oldValue, newValue,
-                  memory_order_acq_rel_smp, memory_order_relaxed);
+       return os_atomic_cmpxchg(aligned_addr, oldValue, newValue, acq_rel);
 }
 
 #undef OSCompareAndSwapPtr
 Boolean
 OSCompareAndSwapPtr(void *oldValue, void *newValue, void * volatile *address)
 {
-#if __LP64__
-       return OSCompareAndSwap64((UInt64)oldValue, (UInt64)newValue, (volatile UInt64 *)address);
-#else
-       return OSCompareAndSwap((UInt32)oldValue, (UInt32)newValue, (volatile UInt32 *)address);
-#endif
+       return os_atomic_cmpxchg(address, oldValue, newValue, acq_rel);
 }
 
 SInt8
 OSAddAtomic8(SInt32 amount, volatile SInt8 *address)
 {
-       return __c11_atomic_fetch_add((_Atomic SInt8*)address, amount, memory_order_relaxed);
+       return os_atomic_add_orig(address, amount, relaxed);
 }
 
 SInt16
 OSAddAtomic16(SInt32 amount, volatile SInt16 *address)
 {
-       return __c11_atomic_fetch_add((_Atomic SInt16*)address, amount, memory_order_relaxed);
+       return os_atomic_add_orig(address, amount, relaxed);
 }
 
 #undef OSAddAtomic
@@ -128,7 +116,7 @@ SInt32
 OSAddAtomic(SInt32 amount, volatile SInt32 *address)
 {
        ALIGN_TEST(address, UInt32);
-       return __c11_atomic_fetch_add((_Atomic SInt32*)address, amount, memory_order_relaxed);
+       return os_atomic_add_orig(address, amount, relaxed);
 }
 
 #undef OSAddAtomic64
@@ -138,75 +126,69 @@ OSAddAtomic64(SInt64 amount, volatile SInt64 *address)
        _Atomic SInt64* aligned_address = (_Atomic SInt64*)(uintptr_t)address;
 
        ALIGN_TEST(address, SInt64);
-       return __c11_atomic_fetch_add(aligned_address, amount, memory_order_relaxed);
+       return os_atomic_add_orig(aligned_address, amount, relaxed);
 }
 
 #undef OSAddAtomicLong
 long
 OSAddAtomicLong(long theAmount, volatile long *address)
 {
-#ifdef __LP64__
-       return (long)OSAddAtomic64((SInt64)theAmount, (SInt64*)address);
-#else
-       return (long)OSAddAtomic((SInt32)theAmount, address);
-#endif
+       return os_atomic_add_orig(address, theAmount, relaxed);
 }
 
 #undef OSIncrementAtomic
 SInt32
 OSIncrementAtomic(volatile SInt32 * value)
 {
-       return OSAddAtomic(1, value);
+       return os_atomic_inc_orig(value, relaxed);
 }
 
 #undef OSDecrementAtomic
 SInt32
 OSDecrementAtomic(volatile SInt32 * value)
 {
-       return OSAddAtomic(-1, value);
+       return os_atomic_dec_orig(value, relaxed);
 }
 
 #undef OSBitAndAtomic
 UInt32
 OSBitAndAtomic(UInt32 mask, volatile UInt32 * value)
 {
-       return __c11_atomic_fetch_and((_Atomic UInt32*)value, mask, memory_order_relaxed);
+       return os_atomic_and_orig(value, mask, relaxed);
 }
 
 #undef OSBitOrAtomic
 UInt32
 OSBitOrAtomic(UInt32 mask, volatile UInt32 * value)
 {
-       return __c11_atomic_fetch_or((_Atomic UInt32*)value, mask, memory_order_relaxed);
+       return os_atomic_or_orig(value, mask, relaxed);
 }
 
 #undef OSBitXorAtomic
 UInt32
 OSBitXorAtomic(UInt32 mask, volatile UInt32 * value)
 {
-       return __c11_atomic_fetch_xor((_Atomic UInt32*)value, mask, memory_order_relaxed);
+       return os_atomic_xor_orig(value, mask, relaxed);
 }
 
 static Boolean
 OSTestAndSetClear(UInt32 bit, Boolean wantSet, volatile UInt8 * startAddress)
 {
        UInt8           mask = 1;
-       UInt8           oldValue;
+       UInt8           oldValue, newValue;
        UInt8           wantValue;
+       UInt8           *address;
 
-       startAddress += (bit / 8);
+       address = (UInt8 *)(uintptr_t)(startAddress + (bit / 8));
        mask <<= (7 - (bit % 8));
        wantValue = wantSet ? mask : 0;
 
-       do {
-               oldValue = *startAddress;
+       return !os_atomic_rmw_loop(address, oldValue, newValue, relaxed, {
                if ((oldValue & mask) == wantValue) {
-                       break;
+                       os_atomic_rmw_loop_give_up(break);
                }
-       } while (!__c11_atomic_compare_exchange_strong((_Atomic UInt8 *)startAddress,
-           &oldValue, (oldValue & ~mask) | wantValue, memory_order_relaxed, memory_order_relaxed));
-
-       return (oldValue & mask) == wantValue;
+               newValue = (oldValue & ~mask) | wantValue;
+       });
 }
 
 Boolean
@@ -228,31 +210,31 @@ OSTestAndClear(UInt32 bit, volatile UInt8 * startAddress)
 SInt8
 OSIncrementAtomic8(volatile SInt8 * value)
 {
-       return OSAddAtomic8(1, value);
+       return os_atomic_inc_orig(value, relaxed);
 }
 
 SInt8
 OSDecrementAtomic8(volatile SInt8 * value)
 {
-       return OSAddAtomic8(-1, value);
+       return os_atomic_dec_orig(value, relaxed);
 }
 
 UInt8
 OSBitAndAtomic8(UInt32 mask, volatile UInt8 * value)
 {
-       return __c11_atomic_fetch_and((_Atomic UInt8 *)value, mask, memory_order_relaxed);
+       return os_atomic_and_orig(value, mask, relaxed);
 }
 
 UInt8
 OSBitOrAtomic8(UInt32 mask, volatile UInt8 * value)
 {
-       return __c11_atomic_fetch_or((_Atomic UInt8 *)value, mask, memory_order_relaxed);
+       return os_atomic_or_orig(value, mask, relaxed);
 }
 
 UInt8
 OSBitXorAtomic8(UInt32 mask, volatile UInt8 * value)
 {
-       return __c11_atomic_fetch_xor((_Atomic UInt8 *)value, mask, memory_order_relaxed);
+       return os_atomic_xor_orig(value, mask, relaxed);
 }
 
 SInt16
@@ -270,20 +252,17 @@ OSDecrementAtomic16(volatile SInt16 * value)
 UInt16
 OSBitAndAtomic16(UInt32 mask, volatile UInt16 * value)
 {
-       return __c11_atomic_fetch_and((_Atomic UInt16 *)value, mask, memory_order_relaxed);
+       return os_atomic_and_orig(value, mask, relaxed);
 }
 
 UInt16
 OSBitOrAtomic16(UInt32 mask, volatile UInt16 * value)
 {
-       return __c11_atomic_fetch_or((_Atomic UInt16 *)value, mask, memory_order_relaxed);
+       return os_atomic_or_orig(value, mask, relaxed);
 }
 
 UInt16
 OSBitXorAtomic16(UInt32 mask, volatile UInt16 * value)
 {
-       return __c11_atomic_fetch_xor((_Atomic UInt16 *)value, mask, memory_order_relaxed);
+       return os_atomic_xor_orig(value, mask, relaxed);
 }
-
-// 19831745 - end of big hammer!
-#pragma clang diagnostic pop
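The removed __c11 lines throughout this file document exactly what the os_atomic_* wrappers from <machine/atomic.h> provide. As a rough standalone sketch (standard C11 API, not the macro's literal expansion), the 32-bit compare-and-swap amounts to:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    static bool
    cmpxchg_u32(volatile uint32_t *address, uint32_t oldValue, uint32_t newValue)
    {
        /* Strong compare-exchange: true on success. On failure, the observed
         * value written back into oldValue is simply discarded, matching the
         * OSCompareAndSwap() contract. Casting via uintptr_t, as the
         * OSCompareAndSwap64 hunk does, avoids the -Wcast-qual warning the
         * deleted pragma "big hammer" used to suppress. */
        return atomic_compare_exchange_strong_explicit(
            (_Atomic uint32_t *)(uintptr_t)address, &oldValue, newValue,
            memory_order_acq_rel, memory_order_relaxed);
    }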
index b806dce3c5b3380a8c1145f87a36c87b2d27dcf3..402b6a345bdb7fb1fd1fc5cea903524cdac34f93 100644 (file)
@@ -43,6 +43,9 @@
 #include <sys/kdebug.h>
 #include <kern/thread.h>
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif
 
 extern int etext;
 __BEGIN_DECLS
@@ -95,7 +98,7 @@ trace_backtrace(uint32_t debugid, uint32_t debugid2, uintptr_t size, uintptr_t d
                i = 2;
        }
 
-#define safe_bt(a) (uintptr_t)(a<cnt ? bt[a] : 0)
+#define safe_bt(a) (uintptr_t)(a<cnt ? bt[a] : NULL)
        kernel_debug(debugid, data, size, safe_bt(i), safe_bt(i + 1), 0);
        kernel_debug(debugid2, safe_bt(i + 2), safe_bt(i + 3), safe_bt(i + 4), safe_bt(i + 5), 0);
 }
@@ -236,7 +239,7 @@ pad:
        frame = frame_index;
 
        for (; frame_index < maxAddrs; frame_index++) {
-               bt[frame_index] = (void *) 0;
+               bt[frame_index] = (void *) NULL;
        }
 #elif __arm__ || __arm64__
        uint32_t i = 0;
@@ -270,7 +273,12 @@ pad:
                }
 
                // No need to use copyin as this is always a kernel address, see check above
+#if defined(HAS_APPLE_PAC)
+               /* return addresses on stack signed by arm64e ABI */
+               bt[i] = ptrauth_strip((void*)frameb[1], ptrauth_key_return_address); // link register
+#else
                bt[i] = (void*)frameb[1]; // link register
+#endif
                fp = frameb[0];
        } while (++i < maxAddrs);
        frame = i;
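On arm64e, return addresses saved in stack frame records are PAC-signed, hence the ptrauth_strip() above before the saved lr is recorded. A minimal standalone sketch of the same idea; HAS_APPLE_PAC is xnu's configuration macro, and ptrauth_strip / ptrauth_key_return_address come from <ptrauth.h>:

    #include <stdint.h>
    #if defined(HAS_APPLE_PAC)
    #include <ptrauth.h>
    #endif

    /* Given a frame record {saved fp, saved lr}, return the saved lr as a
     * plain code address suitable for symbolication. */
    static void *
    frame_return_address(const uintptr_t frame[2])
    {
    #if defined(HAS_APPLE_PAC)
        /* strip the PAC signature; the address bits are left intact */
        return ptrauth_strip((void *)frame[1], ptrauth_key_return_address);
    #else
        return (void *)frame[1];
    #endif
    }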
index b53725676e506d6482fbf513d648ce809d6fabe5..1a7c2988eb9d2950d263befc7aabfbadba2e500c 100644 (file)
@@ -137,9 +137,8 @@ kxld_create_context(KXLDContext **_context,
        check(logging_callback);
        *_context = NULL;
 
-       context = kxld_alloc(sizeof(*context));
+       context = kxld_calloc(sizeof(*context));
        require_action(context, finish, rval = KERN_RESOURCE_SHORTAGE);
-       bzero(context, sizeof(*context));
 
        context->flags = flags;
        context->allocate_callback = allocate_callback;
@@ -154,9 +153,8 @@ kxld_create_context(KXLDContext **_context,
 
        kxld_set_logging_callback(logging_callback);
 
-       context->kext = kxld_alloc(kxld_kext_sizeof());
+       context->kext = kxld_calloc(kxld_kext_sizeof());
        require_action(context->kext, finish, rval = KERN_RESOURCE_SHORTAGE);
-       bzero(context->kext, kxld_kext_sizeof());
 
        /* Check if we already have an order array for this arch */
 
@@ -166,9 +164,8 @@ kxld_create_context(KXLDContext **_context,
 #else
        /* In userspace, create the dictionary if it doesn't already exist */
        if (!s_order_dict) {
-               s_order_dict = kxld_alloc(sizeof(*s_order_dict));
+               s_order_dict = kxld_calloc(sizeof(*s_order_dict));
                require_action(s_order_dict, finish, rval = KERN_RESOURCE_SHORTAGE);
-               bzero(s_order_dict, sizeof(*s_order_dict));
 
                rval = kxld_dict_init(s_order_dict, kxld_dict_uint32_hash,
                    kxld_dict_uint32_cmp, 0);
@@ -181,9 +178,8 @@ kxld_create_context(KXLDContext **_context,
        /* Create an order array for this arch if needed */
 
        if (!context->section_order) {
-               section_order = kxld_alloc(sizeof(*section_order));
+               section_order = kxld_calloc(sizeof(*section_order));
                require_action(section_order, finish, rval = KERN_RESOURCE_SHORTAGE);
-               bzero(section_order, sizeof(*section_order));
 
 #if KERNEL
                s_section_order = section_order;
@@ -620,8 +616,6 @@ allocate_split_kext(KXLDContext *context, splitKextLinkInfo * link_info)
        linked_object = kxld_page_alloc_untracked(link_info->linkedKextSize);
        require(linked_object, finish);
        link_info->linkedKext = linked_object;
-
-       bzero(linked_object, vmsize);
        rval = KERN_SUCCESS;
 
 finish:
@@ -653,8 +647,14 @@ allocate_kext(KXLDContext *context,
            "Load address %p is not page-aligned.",
            (void *) (uintptr_t) vmaddr));
 
+       /* Zero out the memory before we fill it.  We fill this buffer in a
+        * sparse fashion, and it's simpler to clear it now rather than
+        * track and zero any pieces we didn't touch after we've written
+        * all of the sections to memory.
+        */
        if (flags & kKxldAllocateWritable) {
                linked_object = (u_char *) (u_long) vmaddr;
+               bzero(linked_object, vmsize);
        } else {
                linked_object = kxld_page_alloc_untracked(vmsize);
                require(linked_object, finish);
@@ -664,12 +664,6 @@ allocate_kext(KXLDContext *context,
 
        kxld_kext_set_linked_object_size(context->kext, vmsize);
 
-       /* Zero out the memory before we fill it.  We fill this buffer in a
-        * sparse fashion, and it's simpler to clear it now rather than
-        * track and zero any pieces we didn't touch after we've written
-        * all of the sections to memory.
-        */
-       bzero(linked_object, vmsize);
        *vmaddr_out = vmaddr;
        *vmsize_out = vmsize;
 
index 51c6df6adda66efbb37c666e3ec46318ff498f14..1cec07c853ea593383b41781d65fa020f0c89f76 100644 (file)
@@ -177,12 +177,11 @@ pool_create(size_t capacity)
 {
        KXLDArrayPool *pool = NULL, *rval = NULL;
 
-       pool = kxld_alloc(sizeof(*pool));
+       pool = kxld_calloc(sizeof(*pool));
        require(pool, finish);
 
        pool->buffer = kxld_page_alloc(capacity);
        require(pool->buffer, finish);
-       bzero(pool->buffer, capacity);
 
        rval = pool;
        pool = NULL;
index 116b0ffbb1c20bdb083a07dda14c9f358d2776f2..998ab0242ee4788fdd8a7fa9a25f4dcc9f2e44da 100644 (file)
@@ -47,6 +47,6 @@
  *
  */
 const char * kxld_demangle(const char *str, char **buffer, size_t *length)
-__attribute__((pure, nonnull(1), visibility("hidden")));
+__attribute__((nonnull(1), visibility("hidden")));
 
 #endif /* !_KXLD_DEMANGLE_H_ */
index 51cbb170b38a6c36b107875a041fb70b03be5df1..f83f0f3213139e353a4673d22a4bf5ea60c5b035 100644 (file)
@@ -248,9 +248,8 @@ kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size,
        /* Allocate the symbol table */
 
        if (!object->symtab) {
-               object->symtab = kxld_alloc(kxld_symtab_sizeof());
+               object->symtab = kxld_calloc(kxld_symtab_sizeof());
                require_action(object->symtab, finish, rval = KERN_RESOURCE_SHORTAGE);
-               bzero(object->symtab, kxld_symtab_sizeof());
        }
 
        /* Build the relocator */
index 7a289f7447c8f4a522659baa62fc0c4b02bc3bf8..ac4a8b3199fd1023af7bb8722fcb804d7a763c7a 100644 (file)
@@ -619,8 +619,7 @@ kxld_reloc_export_macho(const KXLDRelocator *relocator,
        if (kaslr_offsets == NULL) {
                kaslr_offsets_index = 0;
                kaslr_offsets_count = locrelocs->nitems + extrelocs->nitems;
-               kaslr_offsets = (uint32_t *)malloc(kaslr_offsets_count * sizeof(*kaslr_offsets));
-               bzero(kaslr_offsets, kaslr_offsets_count * sizeof(*kaslr_offsets));
+               kaslr_offsets = (uint32_t *)calloc(kaslr_offsets_count, sizeof(*kaslr_offsets));
        }
 
        // copies the reloc data into the __LINKEDIT segment
index bc2ace856e87381772e7586b3b296a37ec909099..0072c4197a6f56ccf56e9b52990df89553eade46 100644 (file)
@@ -114,6 +114,10 @@ kxld_sym_init_from_macho64(KXLDSym *sym, char *strtab, const struct nlist_64 *sr
        sym->base_addr = src->n_value;
        sym->link_addr = sym->base_addr;
 
+       if (!strcmp("__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase3Ev", sym->name)) {
+               sym->name = (char *)(uintptr_t) "__ZN15OSMetaClassBase8DispatchE5IORPC";
+       }
+
        rval = init_predicates(sym, src->n_type, src->n_desc);
        require_noerr(rval, finish);
 
index f40a3d173bef9b94ee30a8835153eddbc32d380f..47df25866751af74fe3f0005c7277dbd039b2326 100644 (file)
@@ -144,6 +144,30 @@ kxld_log(KXLDLogSubsystem subsystem, KXLDLogLevel level,
 
 /*******************************************************************************
 *******************************************************************************/
+void *
+kxld_calloc(size_t size)
+{
+       void * ptr = NULL;
+
+#if KERNEL
+       ptr = kalloc(size);
+       if (ptr) {
+               bzero(ptr, size);
+       }
+#else
+       ptr = calloc(1, size);
+#endif
+
+#if DEBUG
+       if (ptr) {
+               ++num_allocations;
+               bytes_allocated += size;
+       }
+#endif
+
+       return ptr;
+}
+
 void *
 kxld_alloc(size_t size)
 {
@@ -187,8 +211,11 @@ kxld_page_alloc_untracked(size_t size)
                        ptr = (void *) addr;
                }
        }
+       if (ptr) {
+               bzero(ptr, size);
+       }
 #else /* !KERNEL */
-       ptr = malloc(size);
+       ptr = calloc(1, size);
 #endif /* KERNEL */
 
        return ptr;
index ec9bb482b5de0a795ce8187059be0f9180dd217d..d11d2dc8b5239eea8b03f2be76b979c0d850c0bc 100644 (file)
@@ -139,6 +139,9 @@ __attribute__((visibility("hidden"), format(printf, 3, 4)));
 * Allocators
 *******************************************************************************/
 
+void * kxld_calloc(size_t size)
+__attribute__((malloc, visibility("hidden")));
+
 void * kxld_alloc(size_t size)
 __attribute__((malloc, visibility("hidden")));
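This declaration publishes the new kxld_calloc() used by the call-site hunks above. A sketch of the refactoring pattern, with a hypothetical struct ctx standing in for the real call sites:

    #include <stddef.h>
    #include <strings.h>

    extern void *kxld_alloc(size_t size);
    extern void *kxld_calloc(size_t size);

    struct ctx { int flags; };   /* hypothetical call-site struct */

    /* The removed pattern: allocate, then clear in a second step. */
    static struct ctx *
    ctx_create_old(void)
    {
        struct ctx *c = kxld_alloc(sizeof(*c));
        if (c) {
            bzero(c, sizeof(*c));
        }
        return c;
    }

    /* The replacement: zero-filled memory in one call. */
    static struct ctx *
    ctx_create_new(void)
    {
        return kxld_calloc(sizeof(struct ctx));
    }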
 
index 95bf5a48c50eea243e7541f7dfb781db014206fc..4ae5cd977a7e04fee423f329f0d6feb7c0e6ccd7 100644 (file)
@@ -26,6 +26,7 @@
 
 #endif /* KERNEL */
 
+#include <machine/atomic.h>
 #include <string.h>
 #include <stdint.h>
 #ifndef os_assumes
@@ -54,8 +55,8 @@ OSAtomicCompareAndSwapInt(int oldi, int newi, int volatile *dst)
        return original == oldi;
 }
 #else
-#define OSAtomicCompareAndSwapLong(_Old, _New, _Ptr) __sync_bool_compare_and_swap(_Ptr, _Old, _New)
-#define OSAtomicCompareAndSwapInt(_Old, _New, _Ptr) __sync_bool_compare_and_swap(_Ptr, _Old, _New)
+#define OSAtomicCompareAndSwapLong(_Old, _New, _Ptr) os_atomic_cmpxchg(_Ptr, _Old, _New, relaxed)
+#define OSAtomicCompareAndSwapInt(_Old, _New, _Ptr) os_atomic_cmpxchg(_Ptr, _Old, _New, relaxed)
 #endif
 
 
index a7b66fe8a94fd001514ae49762aa7f51c324f59e..f40ad217072ee4160b10a4bb8b7917104aca6507 100644 (file)
@@ -34,6 +34,10 @@ DATAFILES = \
        OSReturn.h \
        OSTypes.h
 
+DRIVERKIT_DATAFILES = \
+       OSByteOrder.h \
+       _OSByteOrder.h
+
 KERNELFILES = \
        ${DATAFILES} \
        OSAtomic.h \
@@ -62,6 +66,10 @@ PRIVATE_DATAFILES = \
 
 INSTALL_MI_LIST        = ${DATAFILES}
 
+INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES}
+
+DRIVERKITINCDIR = $(DRIVERKITSDKHEADERSROOT)/usr/local/include
+
 INSTALL_MI_DIR = libkern
 
 INSTALL_MI_LCL_LIST =        \
@@ -85,9 +93,9 @@ EXPORT_MI_GEN_LIST = version.h
 EXPORT_MI_DIR = libkern
 
 version.h: version.h.template $(SRCROOT)/config/MasterVersion
-       @echo "[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)libkern/$@$(Color0) from $(ColorF)$<$(Color0)";
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENERATING$(Color0) $(ColorLF)libkern/$@$(Color0) from $(ColorF)$<$(Color0))
        $(_v)install $(DATA_INSTALL_FLAGS) $< $@
-       $(_v)$(NEWVERS) $@ > /dev/null;
+       $(_v)$(NEWVERS) $@ > /dev/null
 
 include $(MakeInc_rule)
 include $(MakeInc_dir)
index 375151929965ad72142c711f447b71ea85e810c0..e1290895c0fc30f20f4ecf136439196a3fe6cddf 100644 (file)
@@ -37,6 +37,7 @@
 #define _OS_OSATOMIC_H
 
 #include <libkern/OSBase.h>
+#include <string.h>
 
 #if defined(__cplusplus)
 extern "C" {
@@ -53,7 +54,7 @@ extern "C" {
  * -- var is used, but sizeof does not evaluate the
  *    argument, i.e. we're safe against "++" etc. in var --
  */
-#define __SAFE_CAST_PTR(type, var) (((type)(var))+(0/(sizeof(*var) == sizeof(*(type)0) ? 1 : 0)))
+#define __SAFE_CAST_PTR(type, var) (((type)(var))+(0/(sizeof(*var) == sizeof(*(type)NULL) ? 1 : 0)))
 #else
 #define __SAFE_CAST_PTR(type, var) ((type)(var))
 #endif
index 2729d3c83194c40431fd9fb47f9c18e8f9f9c7d2..b9011d07225ec4bf4506849d66bc3081b8d9ae1d 100644 (file)
@@ -261,9 +261,12 @@ __BEGIN_DECLS
 /* Define C-string versions of the CFBundle keys for use in the kernel.
  */
 #define kCFBundleIdentifierKey                  "CFBundleIdentifier"
+#define kCFBundleIdentifierKernelKey            "CFBundleIdentifierKernel"
 #define kCFBundleVersionKey                     "CFBundleVersion"
 #define kCFBundleNameKey                        "CFBundleName"
 #define kCFBundleExecutableKey                  "CFBundleExecutable"
+#define kCFBundlePackageTypeKey                 "CFBundlePackageType"
+#define kCFBundleDriverKitUUIDKey               "CFBundleDriverKitUUID"
 #endif /* KERNEL */
 
 /*!
@@ -339,6 +342,13 @@ __BEGIN_DECLS
  */
 #define kOSKernelResourceKey                    "OSKernelResource"
 
+/*!
+ * @define   kOSKextVariantOverrideKey
+ * @abstract A dictionary with target names as key and a target-specific variant
+ *           name as value.
+ */
+#define kOSKextVariantOverrideKey               "OSKextVariantOverride"
+
 /*!
  * @define   kIOKitPersonalitiesKey
  * @abstract A dictionary of dictionaries used in matching for I/O Kit drivers.
@@ -408,6 +418,20 @@ __BEGIN_DECLS
  */
 #define kOSKextKernelIdentifier                 "__kernel__"
 
+/*!
+ * @define  kOSKextBundlePackageTypeKext
+ * @abstract
+ * The bundle type value for Kernel Extensions.
+ */
+#define kOSKextBundlePackageTypeKext        "KEXT"
+
+/*!
+ * @define  kOSKextBundlePackageTypeDriverKit
+ * @abstract
+ * The bundle type value for Driver Extensions.
+ */
+#define kOSKextBundlePackageTypeDriverKit   "DEXT"
+
 /*!
  * @define   kOSBundleRequiredRoot
  * @abstract
@@ -457,6 +481,19 @@ __BEGIN_DECLS
  */
 #define kOSBundleRequiredConsole                "Console"
 
+/*!
+ * @define   kOSBundleRequiredDriverKit
+ * @abstract
+ * This <code>@link kOSBundleRequiredKey OSBundleRequired@/link</code>
+ * value indicates that the driver extension's (DriverKit driver's)
+ * personalities must be present in the kernel at early boot (specifically
+ * before <code>@link //apple_ref/doc/man/8/kextd kextd(8)@/link</code> starts)
+ * in order to compete with kexts built into the prelinkedkernel. Note that
+ * kextd is still required to launch the user space driver binary. The IOKit
+ * matching will happen during early boot, and the actual driver launch
+ * will happen after kextd starts.
+ */
+#define kOSBundleRequiredDriverKit              "DriverKit"
 
 #if PRAGMA_MARK
 #pragma mark -
@@ -973,7 +1010,7 @@ OSKextResetPgoCounters(void);
  * @/textblock
  * </pre>
  */
-extern const void * gOSKextUnresolved;
+extern const void * const gOSKextUnresolved;
 
 /*!
  * @define OSKextSymbolIsResolved
index 2167f212b03800d16f24cd87fe1d5ee4b2b0962e..a3971be0792dfbca88951700880babc6ec052123 100644 (file)
@@ -60,6 +60,8 @@ typedef uint8_t OSKextExcludeLevel;
 #define kOSKextExcludeKext  (1)
 #define kOSKextExcludeAll   (2)
 
+#define kOSKextManagementEntitlement "com.apple.private.security.kext-management"
+
 #if PRAGMA_MARK
 #pragma mark -
 /********************************************************************/
@@ -698,7 +700,7 @@ Boolean OSKextVersionGetString(
 /********************************************************************/
 #endif
 #ifdef XNU_KERNEL_PRIVATE
-void kext_weak_symbol_referenced(void);
+void kext_weak_symbol_referenced(void) __abortlike;
 #endif /* XNU_KERNEL_PRIVATE */
 
 #if PRAGMA_MARK
index bab81dba965eebd283844e943b3fb80d91904750..0396821f6456d70a400df959ed76b97417274c00 100644 (file)
@@ -44,7 +44,7 @@ enum{
        kOSSerializeEndCollecton = 0x80000000U,
 };
 
-#define kOSSerializeBinarySignature "\323\0\0"
-
+#define kOSSerializeBinarySignature        "\323\0\0"
+#define kOSSerializeIndexedBinarySignature 0x000000D4
 
 #endif /* _OS_OSSERIALIZEBINARY_H */
index 69b376774b1f5d865d142d8e143ad74397eaeb5a..f9e09b101b1f93f4bc126146979ec81b7a6b3802 100644 (file)
@@ -23,6 +23,7 @@ DATAFILES = \
           OSNumber.h \
           OSObject.h \
           OSOrderedSet.h \
+          OSPtr.h \
           OSSerialize.h \
           OSSet.h \
           OSString.h \
index d094b0e08a9168edeeaf0bd6f74feb21920d7f8f..73edeb601b9ed838378294d78679cc8ab79ebd80 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define _OS_OSARRAY_H
 
 #include <libkern/c++/OSCollection.h>
+#include <libkern/c++/OSPtr.h>
 
 class OSSerialize;
+class OSArray;
+
+typedef OSPtr<OSArray> OSArrayPtr;
 
 /*!
  * @header
@@ -90,7 +94,7 @@ class OSArray : public OSCollection
        friend class OSSet;
        friend class OSSerialize;
 
-       OSDeclareDefaultStructors(OSArray)
+       OSDeclareDefaultStructors(OSArray);
 
 #if APPLE_KEXT_ALIGN_CONTAINERS
 
@@ -98,12 +102,12 @@ protected:
        unsigned int             count;
        unsigned int             capacity;
        unsigned int             capacityIncrement;
-       const OSMetaClassBase ** array;
+       OSCollectionTaggedPtr<const OSMetaClassBase> *array;
 
 #else /* APPLE_KEXT_ALIGN_CONTAINERS */
 
 protected:
-       const OSMetaClassBase ** array;
+       OSCollectionTaggedPtr<const OSMetaClassBase> *array;
        unsigned int             count;
        unsigned int             capacity;
        unsigned int             capacityIncrement;
@@ -140,7 +144,7 @@ public:
  * (<i>unlike</i> @link //apple_ref/doc/uid/20001502 CFMutableArray@/link,
  * for which the initial capacity is a hard limit).
  */
-       static OSArray * withCapacity(unsigned int capacity);
+       static OSArrayPtr withCapacity(unsigned int capacity);
 
 
 /*!
@@ -168,7 +172,7 @@ public:
  * (<i>unlike</i> @link //apple_ref/doc/uid/20001502 CFMutableArray@/link,
  * for which the initial capacity is a hard limit).
  */
-       static OSArray * withObjects(
+       static OSArrayPtr withObjects(
                const OSObject * objects[],
                unsigned int     count,
                unsigned int     capacity = 0);
@@ -206,7 +210,7 @@ public:
  * for storage in the new OSArray,
  * not copied.
  */
-       static OSArray * withArray(
+       static OSArrayPtr withArray(
                const OSArray * array,
                unsigned int    capacity = 0);
 
@@ -698,7 +702,7 @@ public:
        virtual unsigned setOptions(
                unsigned   options,
                unsigned   mask,
-               void     * context = 0) APPLE_KEXT_OVERRIDE;
+               void     * context = NULL) APPLE_KEXT_OVERRIDE;
 
 
 /*!
@@ -723,7 +727,7 @@ public:
  * Objects that are not derived from OSCollection are retained
  * rather than copied.
  */
-       OSCollection * copyCollection(OSDictionary * cycleDict = 0) APPLE_KEXT_OVERRIDE;
+       OSCollectionPtr copyCollection(OSDictionary * cycleDict = NULL) APPLE_KEXT_OVERRIDE;
 
        OSMetaClassDeclareReservedUnused(OSArray, 0);
        OSMetaClassDeclareReservedUnused(OSArray, 1);
index 207bb4da853df76fcd327df7c3ea37c9618bce9e..67e3b840beb17ad42d9831f9f718f1c2f93288f1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define _OS_OSBOOLEAN_H
 
 #include <libkern/c++/OSObject.h>
+#include <libkern/c++/OSPtr.h>
 
 class OSString;
+class OSBoolean;
+
+typedef OSPtr<OSBoolean> OSBooleanPtr;
 
 /*!
  * @header
@@ -62,7 +66,7 @@ class OSString;
  */
 class OSBoolean : public OSObject
 {
-       OSDeclareDefaultStructors(OSBoolean)
+       OSDeclareDefaultStructors(OSBoolean);
        friend class OSSerialize;
 
 protected:
@@ -102,7 +106,7 @@ public:
  * <code>@link kOSBooleanFalse kOSBooleanFalse@/link</code>,
  * so that you can always use pointer comparison with OSBoolean objects.
  */
-       static OSBoolean * withBoolean(bool value);
+       static OSBooleanPtr withBoolean(bool value);
 
 /*!
  * @function free
index f6c7e01b7b685b43d1b8f8c7bec661db1ead68f1..67ec1f771301f948fd01ebcfe8e9b24a37a1af95 100644 (file)
 #define _OS_OSCOLLECTION_H
 
 #include <libkern/c++/OSObject.h>
+#include <libkern/c++/OSPtr.h>
 
 class OSDictionary;
+class OSCollection;
+
+typedef OSPtr<OSCollection> OSCollectionPtr;
+
+template <typename T>
+using OSCollectionTaggedPtr = OSTaggedPtr<T, OSCollection>;
 
 
 /*!
@@ -418,7 +425,7 @@ public:
        virtual unsigned setOptions(
                unsigned   options,
                unsigned   mask,
-               void     * context = 0);
+               void     * context = NULL);
        OSMetaClassDeclareReservedUsed(OSCollection, 0);
 
 /*!
@@ -445,7 +452,7 @@ public:
  * Subclasses of OSCollection must override this function
  * to properly support deep copies.
  */
-       virtual OSCollection *copyCollection(OSDictionary * cycleDict = 0);
+       virtual OSCollectionPtr copyCollection(OSDictionary * cycleDict = NULL);
        OSMetaClassDeclareReservedUsed(OSCollection, 1);
 
 /*!
index d82cff5097bb388b8b2c69d97813426873c6bf98..eb57231d99ce36b9eaa5b8eaff62509c03e39ff7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define _OS_OSCOLLECTIONITERATOR_H
 
 #include <libkern/c++/OSIterator.h>
+#include <libkern/c++/OSCollection.h>
+#include <libkern/c++/OSPtr.h>
 
-class OSCollection;
+class OSCollectionIterator;
+
+typedef OSPtr<OSCollectionIterator> OSCollectionIteratorPtr;
 
 /*!
  * @header
@@ -90,11 +94,11 @@ class OSCollection;
  */
 class OSCollectionIterator : public OSIterator
 {
-       OSDeclareDefaultStructors(OSCollectionIterator)
+       OSDeclareDefaultStructors(OSCollectionIterator);
 
 protected:
 // xx-review: Do we want to document these?
-       const OSCollection * collection;
+       OSPtr<const OSCollection> collection;
        void               * collIterator;
        unsigned int         initialUpdateStamp;
        bool                 valid;
@@ -112,7 +116,7 @@ public:
  * @result
  * A new instance of OSCollectionIterator, or <code>NULL</code> on failure.
  */
-       static OSCollectionIterator * withCollection(const OSCollection * inColl);
+       static OSCollectionIteratorPtr withCollection(const OSCollection * inColl);
 
 
 /*!
index b1547ae8eb0086b8527f40099fa97cb7bde24fcf..ed473487a1971d5f7a80fe4d5b03a9845e91a595 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define _OS_OSDATA_H
 
 #include <libkern/c++/OSObject.h>
+#include <libkern/c++/OSPtr.h>
 
+class OSData;
 class OSString;
 
+typedef OSPtr<OSData> OSDataPtr;
+typedef OSPtr<const OSData> OSDataConstPtr;
+
 /*!
  * @header
  *
@@ -75,7 +80,7 @@ class OSData : public OSObject
 {
        friend class OSSerialize;
 
-       OSDeclareDefaultStructors(OSData)
+       OSDeclareDefaultStructors(OSData);
 
 #if APPLE_KEXT_ALIGN_CONTAINERS
 
@@ -135,7 +140,7 @@ public:
  * (<i>unlike</i> @link //apple_ref/doc/uid/20001498 CFMutableData@/link,
  * for which a nonzero initial capacity is a hard limit).
  */
-       static OSData * withCapacity(unsigned int capacity);
+       static OSDataPtr withCapacity(unsigned int capacity);
 
 
 /*!
@@ -158,7 +163,7 @@ public:
  * (<i>unlike</i> @link //apple_ref/doc/uid/20001498 CFMutableData@/link,
  * for which a nonzero initial capacity is a hard limit).
  */
-       static OSData * withBytes(
+       static OSDataPtr withBytes(
                const void   * bytes,
                unsigned int   numBytes);
 
@@ -191,7 +196,7 @@ public:
  * but you can get the byte pointer and
  * modify bytes within the shared buffer.
  */
-       static OSData * withBytesNoCopy(
+       static OSDataPtr withBytesNoCopy(
                void         * bytes,
                unsigned int   numBytes);
 
@@ -215,7 +220,7 @@ public:
  * (<i>unlike</i> @link //apple_ref/doc/uid/20001498 CFMutableData@/link,
  * for which a nonzero initial capacity is a hard limit).
  */
-       static OSData * withData(const OSData * inData);
+       static OSDataPtr withData(const OSData * inData);
 
 
 /*!
@@ -240,7 +245,7 @@ public:
  * (<i>unlike</i> @link //apple_ref/doc/uid/20001498 CFMutableData@/link,
  * for which a nonzero initial capacity is a hard limit).
  */
-       static OSData * withData(
+       static OSDataPtr withData(
                const OSData * inData,
                unsigned int   start,
                unsigned int   numBytes);
index a7dcfcdb0986f165119f44d21ddfc11a92bd9e50..98d2581538250370bc87861ff71edea3dfc0d452 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define _IOKIT_IODICTIONARY_H
 
 #include <libkern/c++/OSCollection.h>
+#include <libkern/c++/OSArray.h>
+#include <libkern/c++/OSPtr.h>
 
 class OSArray;
 class OSSymbol;
 class OSString;
+class OSDictionary;
+
+typedef OSPtr<OSDictionary> OSDictionaryPtr;
 
 /*!
  * @header
@@ -114,7 +119,7 @@ class OSDictionary : public OSCollection
 {
        friend class OSSerialize;
 
-       OSDeclareDefaultStructors(OSDictionary)
+       OSDeclareDefaultStructors(OSDictionary);
 
 #if APPLE_KEXT_ALIGN_CONTAINERS
 
@@ -123,8 +128,8 @@ protected:
        unsigned int   capacity;
        unsigned int   capacityIncrement;
        struct dictEntry {
-               const OSSymbol        * key;
-               const OSMetaClassBase * value;
+               OSCollectionTaggedPtr<const OSSymbol>        key;
+               OSCollectionTaggedPtr<const OSMetaClassBase> value;
 #if XNU_KERNEL_PRIVATE
                static int compare(const void *, const void *);
 #endif
@@ -135,8 +140,8 @@ protected:
 
 protected:
        struct dictEntry {
-               const OSSymbol        * key;
-               const OSMetaClassBase * value;
+               OSCollectionTaggedPtr<const OSSymbol>        key;
+               OSCollectionTaggedPtr<const OSMetaClassBase> value;
 #if XNU_KERNEL_PRIVATE
                static int compare(const void *, const void *);
 #endif
@@ -179,7 +184,7 @@ public:
  * (<i>unlike</i> @link //apple_ref/doc/uid/20001497 CFMutableDictionary@/link,
  * for which the initial capacity is a hard limit).
  */
-       static OSDictionary * withCapacity(unsigned int capacity);
+       static OSDictionaryPtr withCapacity(unsigned int capacity);
 
 
 /*!
@@ -214,7 +219,7 @@ public:
  * @link //apple_ref/doc/uid/20001497 CFMutableDictionary@/link,
  * for which the initial capacity is a hard limit).
  */
-       static OSDictionary * withObjects(
+       static OSDictionaryPtr withObjects(
                const OSObject * objects[],
                const OSSymbol * keys[],
                unsigned int     count,
@@ -252,7 +257,7 @@ public:
  * @link //apple_ref/doc/uid/20001497 CFMutableDictionary@/link,
  * for which the initial capacity is a hard limit).
  */
-       static OSDictionary * withObjects(
+       static OSDictionaryPtr withObjects(
                const OSObject * objects[],
                const OSString * keys[],
                unsigned int     count,
@@ -293,7 +298,7 @@ public:
  * in the new OSDictionary,
  * not copied.
  */
-       static OSDictionary * withDictionary(
+       static OSDictionaryPtr withDictionary(
                const OSDictionary * dict,
                unsigned int         capacity = 0);
 
@@ -898,7 +903,7 @@ public:
        virtual unsigned setOptions(
                unsigned   options,
                unsigned   mask,
-               void     * context = 0) APPLE_KEXT_OVERRIDE;
+               void     * context = NULL) APPLE_KEXT_OVERRIDE;
 
 
 /*!
@@ -924,12 +929,12 @@ public:
  * Objects that are not derived from OSCollection are retained
  * rather than copied.
  */
-       OSCollection * copyCollection(OSDictionary * cycleDict = 0) APPLE_KEXT_OVERRIDE;
+       OSCollectionPtr copyCollection(OSDictionary * cycleDict = NULL) APPLE_KEXT_OVERRIDE;
 
 #if XNU_KERNEL_PRIVATE
        bool setObject(const OSSymbol *aKey, const OSMetaClassBase *anObject, bool onlyAdd);
-       OSArray * copyKeys(void);
        void sortBySymbol(void);
+       OSArrayPtr copyKeys(void);
 #endif /* XNU_KERNEL_PRIVATE */
 
 
index f23dd782cce28c851409052033e58b3a6760019d..a9c049b8721a83398ba08d2681b7e011edc51d0f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -67,7 +67,7 @@
  */
 class OSIterator : public OSObject
 {
-       OSDeclareAbstractStructors(OSIterator)
+       OSDeclareAbstractStructors(OSIterator);
 
 public:
 /*!
index 89821706966077e4962f6790b7d3b72be89ec96b..942788f0c18ab4446ddc737c6ffe8eaa650423ee 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -45,6 +45,7 @@ extern "C" {
 #include <libkern/OSKextLibPrivate.h>
 #include <libkern/c++/OSObject.h>
 #include <libkern/c++/OSContainers.h>
+#include <libkern/c++/OSPtr.h>
 #include <IOKit/IOLocks.h>
 
 /*********************************************************************
@@ -70,6 +71,8 @@ void kxld_log_callback(
 *********************************************************************/
 class OSKext;
 
+typedef OSPtr<OSKext> OSKextPtr;
+
 extern "C" {
 void OSKextLog(
        OSKext         * aKext,
@@ -153,7 +156,7 @@ typedef struct OSKextActiveAccount OSKextActiveAccount;
 /********************************************************************/
 class OSKext : public OSObject
 {
-       OSDeclareDefaultStructors(OSKext)
+       OSDeclareDefaultStructors(OSKext);
 
 #if PRAGMA_MARK
 /**************************************/
@@ -234,11 +237,12 @@ private:
 /*************************
 * Instance variables
 *************************/
-       OSDictionary   * infoDict;
+       OSDictionaryPtr  infoDict;
 
-       const OSSymbol * bundleID;
-       OSString       * path;           // not necessarily correct :-/
-       OSString       * executableRelPath;// relative to bundle
+       OSSymbolConstPtr    bundleID;
+       OSStringPtr    path;           // not necessarily correct :-/
+       OSStringPtr    executableRelPath;// relative to bundle
+       OSStringPtr    userExecutableRelPath;// relative to bundle
 
        OSKextVersion    version;        // parsed
        OSKextVersion    compatibleVersion;// parsed
@@ -250,18 +254,19 @@ private:
                                         // kOSKextInvalidLoadTag invalid
        kmod_info_t    * kmod_info;      // address into linkedExec./alloced for interface
 
-       OSArray        * dependencies;   // kernel resource does not have any;
-                                        // links directly to kernel
+       OSArrayPtr     dependencies;   // kernel resource does not have any;
+                                      // links directly to kernel
 
 /* Only real kexts have these; interface kexts do not.
  */
-       OSData         * linkedExecutable;
-       OSSet          * metaClasses;       // for C++/OSMetaClass kexts
+       OSDataPtr        linkedExecutable;
+       OSSetPtr         metaClasses;       // for C++/OSMetaClass kexts
 
 /* Only interface kexts have these; non-interface kexts can get at them
  * in the linked Executable.
  */
-       OSData         * interfaceUUID;
+       OSDataPtr        interfaceUUID;
+       OSDataPtr        driverKitUUID;
 
        struct {
                unsigned int loggingEnabled:1;
@@ -287,6 +292,8 @@ private:
                unsigned int jettisonLinkeditSeg:1;
        } flags;
 
+       uint32_t matchingRefCount;
+
        struct list_head pendingPgoHead;
        uuid_t instance_uuid;
        OSKextAccount * account;
@@ -303,13 +310,13 @@ private:
  */
 public:
        static void           initialize(void);
-       static OSDictionary * copyKexts(void);
+       static OSDictionaryPtr copyKexts(void);
        static OSReturn       removeKextBootstrap(void);
        static void           willShutdown(void);// called by IOPMrootDomain on shutdown
        static  void reportOSMetaClassInstances(
                const char     * kextIdentifier,
                OSKextLogSpec    msgLogSpec);
-
+       static void OSKextLogDriverKitInfoLoad(OSKext *kext);
 #endif /* XNU_KERNEL_PRIVATE */
 
 private:
@@ -328,14 +335,14 @@ private:
 
 /* Instance life cycle.
  */
-       static OSKext * withBooterData(
+       static OSKextPtr withBooterData(
                OSString * deviceTreeName,
                OSData   * booterData);
        virtual bool initWithBooterData(
                OSString * deviceTreeName,
                OSData   * booterData);
 
-       static OSKext * withPrelinkedInfoDict(
+       static OSKextPtr withPrelinkedInfoDict(
                OSDictionary * infoDict,
                bool doCoalesedSlides);
        virtual bool initWithPrelinkedInfoDict(
@@ -344,7 +351,7 @@ private:
 
        static void setAllVMAttributes(void);
 
-       static OSKext * withMkext2Info(
+       static OSKextPtr withMkext2Info(
                OSDictionary * anInfoDict,
                OSData       * mkextData);
        virtual bool initWithMkext2Info(
@@ -381,7 +388,7 @@ private:
                OSData * mkextData,
                OSNumber * offsetNum,
                const char * entryName);
-       virtual OSData * extractMkext2FileData(
+       virtual OSDataPtr extractMkext2FileData(
                UInt8      * data,
                const char * name,
                uint32_t     compressedSize,
@@ -454,7 +461,7 @@ private:
        virtual OSReturn validateKextMapping(bool startFlag);
        virtual boolean_t verifySegmentMapping(kernel_segment_command_t *seg);
 
-       static OSArray * copyAllKextPersonalities(
+       static OSArrayPtr copyAllKextPersonalities(
                bool filterSafeBootFlag = false);
 
        static  void  setPrelinkedPersonalities(OSArray * personalitiesArray);
@@ -477,21 +484,21 @@ private:
 
 /* Getting info about loaded kexts (kextstat).
  */
-       static  OSDictionary * copyLoadedKextInfo(
+       static  OSDictionaryPtr copyLoadedKextInfo(
                OSArray * kextIdentifiers = NULL,
                OSArray * keys = NULL);
-       static  OSDictionary * copyLoadedKextInfoByUUID(
+       static  OSDictionaryPtr copyLoadedKextInfoByUUID(
                OSArray * kextIdentifiers = NULL,
                OSArray * keys = NULL);
-       static OSData * copyKextUUIDForAddress(OSNumber *address = NULL);
-       virtual OSDictionary * copyInfo(OSArray * keys = NULL);
+       static OSDataPtr copyKextUUIDForAddress(OSNumber *address = NULL);
+       virtual OSDictionaryPtr copyInfo(OSArray * keys = NULL);
 
 /* Logging to user space.
  */
        static OSKextLogSpec setUserSpaceLogFilter(
                OSKextLogSpec  userLogSpec,
                bool           captureFlag = false);
-       static OSArray * clearUserSpaceLogFilter(void);
+       static OSArrayPtr clearUserSpaceLogFilter(void);
        static OSKextLogSpec getUserSpaceLogFilter(void);
 
 /* OSMetaClasses defined by kext.
@@ -513,10 +520,10 @@ private:
 
        static OSReturn dequeueCallbackForRequestTag(
                OSKextRequestTag    requestTag,
-               OSDictionary     ** callbackRecordOut);
+               LIBKERN_RETURNS_RETAINED OSDictionary     ** callbackRecordOut);
        static OSReturn dequeueCallbackForRequestTag(
                OSNumber     *    requestTagNum,
-               OSDictionary ** callbackRecordOut);
+               LIBKERN_RETURNS_RETAINED OSDictionary ** callbackRecordOut);
        static void invokeRequestCallback(
                OSDictionary * callbackRecord,
                OSReturn         requestResult);
@@ -538,6 +545,7 @@ public:
                unsigned int    cnt,
                int          (* printf_func)(const char *fmt, ...),
                uint32_t        flags);
+       bool isDriverKit(void);
 private:
        static OSKextLoadedKextSummary *summaryForAddress(const uintptr_t addr);
        static void *kextForAddress(const void *addr);
@@ -580,12 +588,12 @@ public:
 /**************************************/
 #endif
 public:
-// caller must release
-       static OSKext * lookupKextWithIdentifier(const char * kextIdentifier);
-       static OSKext * lookupKextWithIdentifier(OSString * kextIdentifier);
-       static OSKext * lookupKextWithLoadTag(OSKextLoadTag aTag);
-       static OSKext * lookupKextWithAddress(vm_address_t address);
-       static OSKext * lookupKextWithUUID(uuid_t uuid);
+       // caller must release
+       static OSKextPtr lookupKextWithIdentifier(const char * kextIdentifier);
+       static OSKextPtr lookupKextWithIdentifier(OSString * kextIdentifier);
+       static OSKextPtr lookupKextWithLoadTag(OSKextLoadTag aTag);
+       static OSKextPtr lookupKextWithAddress(vm_address_t address);
+       static OSKextPtr lookupKextWithUUID(uuid_t uuid);
 
        kernel_section_t *lookupSection(const char *segname, const char*secname);
 
@@ -598,20 +606,29 @@ public:
                OSKextExcludeLevel startOpt            = kOSKextExcludeNone,
                OSKextExcludeLevel startMatchingOpt    = kOSKextExcludeAll,
                OSArray          * personalityNames    = NULL);
+
        static OSReturn loadKextWithIdentifier(
                OSString         * kextIdentifier,
+               LIBKERN_RETURNS_RETAINED_ON_ZERO OSObject        ** kextRef,
                Boolean            allowDeferFlag      = true,
                Boolean            delayAutounloadFlag = false,
                OSKextExcludeLevel startOpt            = kOSKextExcludeNone,
                OSKextExcludeLevel startMatchingOpt    = kOSKextExcludeAll,
                OSArray          * personalityNames    = NULL);
+
+       static void dropMatchingReferences(
+               OSSet * kexts);
+
        static OSReturn removeKextWithIdentifier(
                const char * kextIdentifier,
                bool         terminateServicesAndRemovePersonalitiesFlag = false);
        static OSReturn removeKextWithLoadTag(
                OSKextLoadTag loadTag,
                bool          terminateServicesAndRemovePersonalitiesFlag = false);
-
+       static OSReturn requestDaemonLaunch(
+               OSString        * kextIdentifier,
+               OSString        * serverName,
+               OSNumber        * serverTag);
        static OSReturn requestResource(
                const char                    * kextIdentifier,
                const char                    * resourceName,
@@ -647,11 +664,12 @@ public:
 
        virtual OSKextLoadTag      getLoadTag(void);
        virtual void               getSizeInfo(uint32_t *loadSize, uint32_t *wiredSize);
-       virtual OSData           * copyUUID(void);
-       OSData                   * copyTextUUID(void);
-       OSData                   * copyMachoUUID(const kernel_mach_header_t * header);
-       virtual OSArray          * copyPersonalitiesArray(void);
-
+       virtual OSDataPtr          copyUUID(void);
+       OSDataPtr                  copyTextUUID(void);
+       OSDataPtr                  copyMachoUUID(const kernel_mach_header_t * header);
+       virtual OSArrayPtr         copyPersonalitiesArray(void);
+       static bool                copyUserExecutablePath(const OSSymbol * bundleID, char * pathResult, size_t pathSize);
+       virtual void               setDriverKitUUID(OSData *uuid);
 /* This removes personalities naming the kext (by CFBundleIdentifier),
  * not all personalities defined by the kext (IOPersonalityPublisher or CFBundleIdentifier).
  */
index 93c2548e107d8cde58fe8da74ccf60be5499bd9a..358b2ad53f322d2f346cd0c3a1079b1e6521ddc0 100644 (file)
@@ -78,7 +78,11 @@ extern "C" int debug_ivars_size;
 
 #ifndef NULL
 #if defined (__cplusplus)
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#else
 #define NULL 0
+#endif
 #else
 #define NULL ((void *)0)
 #endif
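Making NULL expand to nullptr under C++11 is more than cosmetic: nullptr has its own type and converts only to pointers, so it cannot silently win integer overloads the way a literal 0 can. A small illustrative sketch:

    static void take(int)    { /* (1) */ }
    static void take(void *) { /* (2) */ }

    static void
    demo()
    {
        take(0);        // selects (1): 0 is an int first
        take(nullptr);  // selects (2): nullptr_t converts only to pointers
        // With NULL defined as nullptr, take(NULL) now selects (2);
        // with NULL defined as 0 it silently selected (1).
    }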
index 21b4f40e625647f0ce72371e8f50564e3b151256..03da0e6c29a9975fc2fe06f23f753bcb34d70719 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -32,6 +32,7 @@
 
 #include <libkern/OSReturn.h>
 #include <kern/debug.h>
+#include <ptrauth.h>
 
 /*
  * LIBKERN_ macros below can be used to describe the ownership semantics
  *  attribute applied to a function.
  *  In the former case, it stipulates that the function is returning at "+1",
  *  and in the latter case "+0".
+ *
+ *  LIBKERN_RETURNS_RETAINED and LIBKERN_RETURNS_NOT_RETAINED attributes
+ *  can be also applied to out parameters, in which case they specify
+ *  that an out parameter is written into at +1 or +0 respectively.
+ *  For out parameters of non-void functions an assumption is
+ *  that an out parameter is written into iff the return value is non-zero
+ *  unless the function returns a typedef to kern_return_t,
+ *  in which case it is assumed to be written into on zero value
+ *  (kIOReturnSuccess).
+ *  This can be customized using the attributes
+ *  LIBKERN_RETURNS_RETAINED_ON_ZERO and LIBKERN_RETURNS_RETAINED_ON_NONZERO.
  */
 #if __has_attribute(os_returns_retained)
 #define LIBKERN_RETURNS_RETAINED __attribute__((os_returns_retained))
 #define LIBKERN_CONSUMES_THIS
 #endif
 
+/*
+ * LIBKERN_RETURNS_RETAINED_ON_ZERO is an attribute applicable to out
+ * parameters.
+ * It specifies that an out parameter at +1 is written into an argument iff
+ * the function returns a zero return value.
+ */
+#if __has_attribute(os_returns_retained_on_zero)
+#define LIBKERN_RETURNS_RETAINED_ON_ZERO __attribute__((os_returns_retained_on_zero))
+#else
+#define LIBKERN_RETURNS_RETAINED_ON_ZERO
+#endif
+
+/*
+ * LIBKERN_RETURNS_RETAINED_ON_NON_ZERO is an attribute applicable to out
+ * parameters.
+ * It specifies that an out parameter at +1 is written into an argument iff
+ * the function returns a non-zero return value.
+ */
+#if __has_attribute(os_returns_retained_on_non_zero)
+#define LIBKERN_RETURNS_RETAINED_ON_NONZERO __attribute__((os_returns_retained_on_non_zero))
+#else
+#define LIBKERN_RETURNS_RETAINED_ON_NONZERO
+#endif
+
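A hedged sketch of how these out-parameter annotations compose in practice; the function names here are invented for illustration, and only the LIBKERN_* macros come from this header:

    // kern_return_t return: the annotated out parameter is assumed
    // written at +1 only when the function returns zero (kIOReturnSuccess).
    kern_return_t CopyThing(LIBKERN_RETURNS_RETAINED OSObject **outThing);

    // Other integral returns default to "written iff non-zero"; the
    // *_ON_ZERO variant overrides that default explicitly.
    int FindThing(LIBKERN_RETURNS_RETAINED_ON_ZERO OSObject **outThing);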
 class OSMetaClass;
 class OSObject;
 class OSString;
@@ -101,7 +137,10 @@ class OSSerialize;
 class OSOrderedSet;
 class OSCollection;
 #endif /* XNU_KERNEL_PRIVATE */
-
+struct IORPC;
+class OSInterface
+{
+};
 
 /*!
  * @header
@@ -128,12 +167,12 @@ class OSCollection;
 #else /* XNU_KERNEL_PRIVATE */
 #include <TargetConditionals.h>
 
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 #define APPLE_KEXT_VTABLE_PADDING   0
-#else /* TARGET_OS_EMBEDDED */
+#else /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 /*! @parseOnly */
 #define APPLE_KEXT_VTABLE_PADDING   1
-#endif /* TARGET_OS_EMBEDDED */
+#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 #endif /* XNU_KERNEL_PRIVATE */
 
@@ -160,7 +199,10 @@ class OSCollection;
 #define APPLE_KEXT_DEPRECATED  __attribute__((deprecated))
 
 
-#if __cplusplus >= 201103L
+/*
+ *  <rdar://problem/44872498> AppleUSBAudio builds xnu's libkern headers in user space
+ */
+#if !defined(BUILD_FOR_USERSPACE) && (__cplusplus >= 201103L)
 #define APPLE_KEXT_OVERRIDE                             override
 #if defined(__LP64__)
 #define APPLE_KEXT_COMPATIBILITY_OVERRIDE
@@ -172,7 +214,7 @@ class OSCollection;
 #define APPLE_KEXT_COMPATIBILITY_OVERRIDE
 #endif
 
-#define APPLE_KEXT_WSHADOW_PUSH _Pragma("clang diagnostic push"); \
+#define APPLE_KEXT_WSHADOW_PUSH _Pragma("clang diagnostic push") \
        _Pragma("clang diagnostic ignored \"-Wunknown-warning-option\"") \
        _Pragma("clang diagnostic ignored \"-Wshadow-field\"")
 
@@ -232,6 +274,7 @@ class OSCollection;
  *
  * The run-time type macros and functions of OSMetaClassBase are thread-safe.
  */
+
 class OSMetaClassBase
 {
 public:
@@ -288,6 +331,7 @@ public:
  * <code>@link //apple_ref/cpp/macro/OSCheckTypeInst OSCheckTypeInst@/link</code>.
  */
 #define OSTypeID(type)   (type::metaClass)
+#define OSMTypeID(type)  ((OSMetaClass *) type::metaClass)
 
 
 /*!
@@ -348,6 +392,27 @@ public:
 #define OSDynamicCast(type, inst)   \
     ((type *) OSMetaClassBase::safeMetaCast((inst), OSTypeID(type)))
 
+/*!
+ * @define OSRequiredCast
+ * @hidecontents
+ *
+ * @abstract
+ * Safe type-casting for Libkern C++ objects; panics on failure.
+ * The input parameters are the same as for the {@code OSDynamicCast} macro.
+ *
+ * @result {@code inst} if it is NULL or derived from {@code type};
+ * otherwise triggers a kernel panic.
+ *
+ * @discussion
+ * This macro should be used in place of C-style casts or
+ * <code>@link OSDynamicCast OSDynamicCast@/link</code>
+ * when the caller is absolutely sure that the passed
+ * argument is a subclass of a required type.
+ * It is equivalent to using {@code OSDynamicCast} and crashing with a kernel
+ * panic on cast failure.
+ */
+#define OSRequiredCast(type, inst)  \
+    (type *) OSMetaClassBase::requiredMetaCast((inst), OSTypeID(type))
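For illustration, a minimal sketch (the provider and property names are hypothetical; copyProperty is the usual IORegistryEntry accessor):

    OSObject *prop = provider->copyProperty("SomeKey");
    // NULL passes through unchanged; a non-OSData property panics here
    // instead of silently becoming NULL as it would with OSDynamicCast.
    OSData *data = OSRequiredCast(OSData, prop);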
 
 /*!
  * @define OSCheckTypeInst
@@ -382,7 +447,7 @@ public:
 
 #if defined(__arm__) || defined(__arm64__)
 
-       static _ptf_t _ptmf2ptf(const OSMetaClassBase * self, void (OSMetaClassBase::*func)(void));
+       static _ptf_t _ptmf2ptf(const OSMetaClassBase * self, void (OSMetaClassBase::*func)(void), uintptr_t typeDisc);
 
 #elif defined(__i386__) || defined(__x86_64__)
 
@@ -391,7 +456,8 @@ public:
 // ABI
 
        static inline _ptf_t
-       _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void))
+       _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void),
+           uintptr_t typeDisc __attribute__((unused)))
        {
                union {
                        void (OSMetaClassBase::*fIn)(void);
@@ -451,7 +517,8 @@ public:
  */
 #define OSMemberFunctionCast(cptrtype, self, func)         \
     (cptrtype) OSMetaClassBase::                           \
-       _ptmf2ptf(self, (void (OSMetaClassBase::*)(void)) func)
+       _ptmf2ptf(self, (void (OSMetaClassBase::*)(void)) func,  \
+                 ptrauth_type_discriminator(__typeof__(func)))
 
 protected:
        OSMetaClassBase();
@@ -717,6 +784,31 @@ public:
                const OSMetaClassBase * anObject,
                const OSMetaClass     * toMeta);
 
+/*!
+ * @function requiredMetaCast
+ *
+ * @abstract
+ * Casts an object to the class managed by the given OSMetaClass or
+ * fails with a kernel panic if the cast does not succeed.
+ *
+ * @param anObject A pointer to the object to be cast.
+ * @param toMeta   A pointer to a constant OSMetaClass
+ *                 for the desired target type.
+ *
+ * @result
+ * <code>anObject</code> if the object is derived
+ * from the class managed by <code>toMeta</code>,
+ * <code>NULL</code> if <code>anObject</code> was <code>NULL</code>,
+ * kernel panic otherwise.
+ *
+ * @discussion
+ * It is far more convenient to use
+ * <code>@link OSRequiredCast OSRequiredCast@/link</code>.
+ */
+       static OSMetaClassBase *requiredMetaCast(
+               const OSMetaClassBase * anObject,
+               const OSMetaClass     * toMeta);
+
 /*!
  * @function checkTypeInst
  *
@@ -761,7 +853,7 @@ public:
  * OSObject::taggedRetain(const void *)@/link</code>.
  */
 // WAS: virtual void _RESERVEDOSMetaClassBase0();
-       virtual void taggedRetain(const void * tag = 0) const = 0;
+       virtual void taggedRetain(const void * tag = NULL) const = 0;
 
 
 /*!
@@ -780,7 +872,7 @@ public:
  * OSObject::taggedRelease(const void *)@/link</code>.
  */
 // WAS:  virtual void _RESERVEDOSMetaClassBase1();
-       virtual void taggedRelease(const void * tag = 0) const = 0;
+       virtual void taggedRelease(const void * tag = NULL) const = 0;
 
 protected:
 /*!
@@ -803,10 +895,16 @@ protected:
                const void * tag,
                const int    freeWhen) const = 0;
 
+public:
+       virtual kern_return_t
+       Dispatch(const IORPC rpc);
+
+       kern_return_t
+       Invoke(const IORPC rpc);
+
 private:
 #if APPLE_KEXT_VTABLE_PADDING
 // Virtual Padding
-       virtual void _RESERVEDOSMetaClassBase3();
        virtual void _RESERVEDOSMetaClassBase4();
        virtual void _RESERVEDOSMetaClassBase5();
        virtual void _RESERVEDOSMetaClassBase6();
@@ -901,7 +999,7 @@ typedef bool (*OSMetaClassInstanceApplierFunction)(const OSObject * instance,
  * by the run-time type information system,
  * which handles concurrency and locking internally.
  */
-class OSMetaClass : private OSMetaClassBase
+class OSMetaClass : public OSMetaClassBase
 {
        friend class OSKext;
 #if IOKITSTATS
@@ -1061,7 +1159,7 @@ protected:
  * for as long as its kernel extension is loaded,
  * OSMetaClass does not use reference-counting.
  */
-       virtual void taggedRetain(const void * tag = 0) const;
+       virtual void taggedRetain(const void * tag = NULL) const;
 
 
 /*!
@@ -1078,7 +1176,7 @@ protected:
  * for as long as its kernel extension is loaded,
  * OSMetaClass does not use reference-counting.
  */
-       virtual void taggedRelease(const void * tag = 0) const;
+       virtual void taggedRelease(const void * tag = NULL) const;
 
 
 /*!
@@ -1658,7 +1756,21 @@ public:
  * @param className The name of the C++ class, as a raw token,
  *                  <i>not</i> a string or macro.
  */
-#define OSDeclareCommonStructors(className)                     \
+
+#define _OS_ADD_METAMETHODS(b) _OS_ADD_METAMETHODS_ ## b
+#define _OS_ADD_METAMETHODS_
+#define _OS_ADD_METAMETHODS_dispatch                            \
+    virtual kern_return_t Dispatch(const IORPC rpc) APPLE_KEXT_OVERRIDE;
+
+#define _OS_ADD_METHODS(className, b) _OS_ADD_METHODS_ ## b(className)
+#define _OS_ADD_METHODS_(className)
+#define _OS_ADD_METHODS_dispatch(className)                     \
+    className ## _Methods                                       \
+    className ## _KernelMethods
+
+#define SUPERDISPATCH ((OSDispatchMethod)&super::_Dispatch)
+
+#define OSDeclareCommonStructors(className, dispatch)           \
     private:                                                    \
     static const OSMetaClass * const superClass;                \
     public:                                                     \
@@ -1666,13 +1778,15 @@ public:
        static class MetaClass : public OSMetaClass {           \
        public:                                                 \
            MetaClass();                                        \
-           virtual OSObject *alloc() const;                    \
+           virtual OSObject *alloc() const APPLE_KEXT_OVERRIDE;\
+           _OS_ADD_METAMETHODS(dispatch);                      \
        } gMetaClass;                                           \
        friend class className ::MetaClass;                     \
        virtual const OSMetaClass * getMetaClass() const APPLE_KEXT_OVERRIDE; \
     protected:                                                  \
     className (const OSMetaClass *);                            \
-    virtual ~ className () APPLE_KEXT_OVERRIDE
+    virtual ~ className () APPLE_KEXT_OVERRIDE;                 \
+    _OS_ADD_METHODS(className, dispatch)
 
 
 /*!
@@ -1681,7 +1795,7 @@ public:
  *
  * @abstract
  * Declares run-time type information and functions
- * for a concrete Libkern C++ class.
+ * for a final (non-subclassable) Libkern C++ class.
  *
  * @param className The name of the C++ class, as a raw token,
  *                  <i>not</i> a string or macro.
@@ -1691,13 +1805,20 @@ public:
  * immediately after the opening brace in a class declaration.
  * It leaves the current privacy state as <code>protected:</code>.
  */
-#define OSDeclareDefaultStructors(className)    \
-    OSDeclareCommonStructors(className);        \
+#define _OSDeclareDefaultStructors(className, dispatch)    \
+    OSDeclareCommonStructors(className, dispatch);        \
     public:                                     \
-    className ();                               \
+    className (void);                           \
     protected:
 
 
+#define OSDeclareDefaultStructors(className)   \
+_OSDeclareDefaultStructors(className, )
+
+#define OSDeclareDefaultStructorsWithDispatch(className)   \
+_OSDeclareDefaultStructors(className, dispatch)
+
+
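The WithDispatch variants add a Dispatch override to the class and its MetaClass, hooking the class into the DriverKit RPC path. A hedged sketch of the intended shape, assuming com_example_UserDriver_Methods and com_example_UserDriver_KernelMethods macros are in scope (presumably emitted by the IIG code generator):

    class com_example_UserDriver : public IOService
    {
        OSDeclareDefaultStructorsWithDispatch(com_example_UserDriver);
        // ... normal IOService overrides ...
    };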
 /*!
  * @define OSDeclareAbstractStructors
  * @hidecontents
@@ -1715,19 +1836,25 @@ public:
  * immediately after the opening brace in a class declaration.
  * It leaves the current privacy state as <code>protected:</code>.
  */
-#define OSDeclareAbstractStructors(className)                          \
-    OSDeclareCommonStructors(className);                               \
-    private:                                                           \
-    className (); /* Make primary constructor private in abstract */   \
+#define _OSDeclareAbstractStructors(className, dispatch)                        \
+    OSDeclareCommonStructors(className, dispatch);                              \
+    private:                                                                    \
+    className (void); /* Make primary constructor private in abstract */            \
     protected:
 
+#define OSDeclareAbstractStructors(className)                                   \
+_OSDeclareAbstractStructors(className, )
+
+#define OSDeclareAbstractStructorsWithDispatch(className)                       \
+_OSDeclareAbstractStructors(className, dispatch)
+
 /*!
  * @define OSDeclareFinalStructors
  * @hidecontents
  *
  * @abstract
  * Declares run-time type information and functions
- * for a final (non-subclassable) Libkern C++ class.
+ * for a concrete Libkern C++ class.
  *
  * @param className The name of the C++ class, as a raw token,
  *                  <i>not</i> a string or macro.
@@ -1746,13 +1873,20 @@ public:
  * <b>Warning:</b> Changing a class from "Default" to "Final" will break
  * binary compatibility.
  */
-#define OSDeclareFinalStructors(className)                              \
-       OSDeclareDefaultStructors(className)                            \
-    private:                                                            \
-       void __OSFinalClass(void);                                      \
+#define _OSDeclareFinalStructors(className, dispatch)                           \
+       _OSDeclareDefaultStructors(className, dispatch)                         \
+    private:                                                                    \
+       void __OSFinalClass(void);                                              \
     protected:
 
 
+#define OSDeclareFinalStructors(className)                                      \
+_OSDeclareFinalStructors(className, )
+
+#define OSDeclareFinalStructorsWithDispatch(className)                          \
+_OSDeclareFinalStructors(className, dispatch)
+
+
 /* Not to be included in headerdoc.
  *
  * @define OSDefineMetaClassWithInit
@@ -1805,7 +1939,7 @@ public:
  *                       <i>not</i> a string or macro.
  */
 #define OSDefineAbstractStructors(className, superclassName)        \
-    OSObject * className ::MetaClass::alloc() const { return 0; }
+    OSObject * className ::MetaClass::alloc() const { return NULL; }
 
 
 /* Not to be included in headerdoc.
@@ -1991,7 +2125,7 @@ public:
  *
  * @abstract
  * Defines an OSMetaClass and associated routines
- * for a final (non-subclassable) Libkern C++ class.
+ * for a concrete Libkern C++ class.
  *
  * @param className      The name of the C++ class, as a raw token,
  *                       <i>not</i> a string or macro.
index ebb81a616850f1576d1ad4cf11ea88973f95498d..34a9472d57fe093741e81c6d9cb50863539de02b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -32,6 +32,7 @@
 #define _OS_OSNUMBER_H
 
 #include <libkern/c++/OSObject.h>
+#include <libkern/c++/OSPtr.h>
 
 /*!
  * @header
@@ -40,6 +41,9 @@
  * This header declares the OSNumber container class.
  */
 
+class OSNumber;
+
+typedef OSPtr<OSNumber> OSNumberPtr;
 
 /*!
  * @class OSNumber
@@ -72,7 +76,7 @@ class OSNumber : public OSObject
 {
        friend class OSSerialize;
 
-       OSDeclareDefaultStructors(OSNumber)
+       OSDeclareDefaultStructors(OSNumber);
 
 #if APPLE_KEXT_ALIGN_CONTAINERS
 
@@ -118,7 +122,7 @@ public:
  * and <code>@link addValue addValue@/link</code>,
  * but you can't change the bit size.
  */
-       static OSNumber * withNumber(
+       static OSNumberPtr withNumber(
                unsigned long long value,
                unsigned int       numberOfBits);
 
@@ -153,7 +157,7 @@ public:
  * and <code>@link addValue addValue@/link</code>,
  * but you can't change the bit size.
  */
-       static OSNumber * withNumber(
+       static OSNumberPtr withNumber(
                const char   * valueString,
                unsigned int   numberOfBits);
 
index 036730372427ec57b4856316899243677816fb06..d75fad273f59bd683d24c768b8a7fa9d44e2f6ec 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -35,6 +35,9 @@
 #define _LIBKERN_OSOBJECT_H
 
 #include <libkern/c++/OSMetaClass.h>
+#include <IOKit/IORPC.h>
+#include <DriverKit/OSObject.h>
+#include <libkern/c++/OSPtr.h>
 
 #if defined(__clang__)
 #pragma clang diagnostic ignored "-Woverloaded-virtual"
@@ -42,6 +45,9 @@
 
 class OSSymbol;
 class OSString;
+class OSObject;
+
+typedef OSPtr<OSObject> OSObjectPtr;
 
 
 /*!
@@ -167,11 +173,17 @@ class OSString;
  */
 class OSObject : public OSMetaClassBase
 {
-       OSDeclareAbstractStructors(OSObject)
+       OSDeclareAbstractStructorsWithDispatch(OSObject);
+
 #if IOKITSTATS
        friend class IOStatistics;
 #endif
 
+#ifdef LIBKERN_SMART_POINTERS
+       template<class T, class OSPtrPolicy>
+       friend class os::smart_ptr;
+#endif
+
 private:
 /* Not to be included in headerdoc.
  *
@@ -192,10 +204,10 @@ protected:
  * drops below the specified threshold.
  *
  * @param freeWhen If decrementing the reference count makes it
- *                 >= <code>freeWhen</code>, the object is immediately freed.
+ *                 < <code>freeWhen</code>, the object is immediately freed.
  *
  * @discussion
- * If the receiver has <code>freeWhen</code> or fewer references
+ * If the receiver has fewer than <code>freeWhen</code> references
  * after its reference count is decremented,
  * it is immediately freed.
  *
@@ -215,14 +227,14 @@ protected:
  *
  * @param tag      Used for tracking collection references.
  * @param freeWhen If decrementing the reference count makes it
- *                 >= <code>freeWhen</code>, the object is immediately freed.
+ *                 < <code>freeWhen</code>, the object is immediately freed.
  *
  * @discussion
  * Kernel extensions should not use this function.
  * It is for use by OSCollection and subclasses to track
  * inclusion in collections.
  *
- * If the receiver has <code>freeWhen</code> or fewer references
+ * If the receiver has fewer than <code>freeWhen</code> references
  * after its reference count is decremented,
  * it is immediately freed.
  *
@@ -299,7 +311,13 @@ protected:
  */
        static void operator delete(void * mem, size_t size);
 
+// XXX: eventually we can flip this switch
+//#ifdef LIBKERN_SMART_POINTERS
+#if 0
+private:
+#else
 public:
+#endif
 
 /*!
  * @function operator new
@@ -314,6 +332,7 @@ public:
  */
        static void * operator new(size_t size);
 
+public:
 
 /*!
  * @function getRetainCount
@@ -378,7 +397,7 @@ public:
  * outside the context in which you received it,
  * you should always retain it immediately.
  */
-       virtual void taggedRetain(const void * tag = 0) const APPLE_KEXT_OVERRIDE;
+       virtual void taggedRetain(const void * tag = NULL) const APPLE_KEXT_OVERRIDE;
 
 
 /*!
@@ -396,7 +415,7 @@ public:
  * It is for use by OSCollection and subclasses to track
  * inclusion in collections.
  */
-       virtual void taggedRelease(const void * tag = 0) const APPLE_KEXT_OVERRIDE;
+       virtual void taggedRelease(const void * tag = NULL) const APPLE_KEXT_OVERRIDE;
 // xx-review: used to say, "Remove a reference on this object with this tag, if an attempt is made to remove a reference that isn't associated with this tag the kernel will panic immediately", but I don't see that in the implementation
 
 
@@ -430,7 +449,13 @@ public:
 #endif
 
        bool taggedTryRetain(const void *tag) const;
-#endif
+
+       bool iterateObjects(void * refcon, bool (*callback)(void * refcon, OSObject * object));
+#ifdef __BLOCKS__
+       bool iterateObjects(bool (^block)(OSObject * object));
+#endif /* __BLOCKS__ */
+
+#endif /* XNU_KERNEL_PRIVATE */
 
 // Unused Padding
        OSMetaClassDeclareReservedUnused(OSObject, 0);
index 2a24e321ff1b8aefe025fc1c2d3a5099727510d7..dc1a61d20a143c211b8a0a11fb65801ee59c1207 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define _OS_OSORDEREDSET_H
 
 #include <libkern/c++/OSCollection.h>
+#include <libkern/c++/OSPtr.h>
 #include <libkern/OSTypes.h>
 
 class OSOffset;
+class OSOrderedSet;
+
+typedef OSPtr<OSOrderedSet> OSOrderedSetPtr;
 
 /*!
  * @header
@@ -94,7 +98,7 @@ class OSOffset;
  */
 class OSOrderedSet : public OSCollection
 {
-       OSDeclareDefaultStructors(OSOrderedSet)
+       OSDeclareDefaultStructors(OSOrderedSet);
 
 public:
 /*!
@@ -180,10 +184,10 @@ public:
  * See
  * <code>@link getOrderingRef getOrderingRef@/link</code>.
  */
-       static OSOrderedSet * withCapacity(
+       static OSOrderedSetPtr withCapacity(
                unsigned int      capacity,
-               OSOrderFunction   orderFunc = 0,
-               void            * orderingContext = 0);
+               OSOrderFunction   orderFunc = NULL,
+               void            * orderingContext = NULL);
 
 
 /*!
@@ -231,8 +235,8 @@ public:
  */
        virtual bool initWithCapacity(
                unsigned int      capacity,
-               OSOrderFunction   orderFunc = 0,
-               void            * orderingContext = 0);
+               OSOrderFunction   orderFunc = NULL,
+               void            * orderingContext = NULL);
 
 
 /*!
@@ -728,7 +732,7 @@ public:
        virtual unsigned setOptions(
                unsigned   options,
                unsigned   mask,
-               void     * context = 0) APPLE_KEXT_OVERRIDE;
+               void     * context = NULL) APPLE_KEXT_OVERRIDE;
 
 
 /*!
@@ -753,7 +757,7 @@ public:
  * Objects that are not derived from OSCollection are retained
  * rather than copied.
  */
-       OSCollection *copyCollection(OSDictionary * cycleDict = 0) APPLE_KEXT_OVERRIDE;
+       OSCollectionPtr copyCollection(OSDictionary * cycleDict = NULL) APPLE_KEXT_OVERRIDE;
 
        OSMetaClassDeclareReservedUnused(OSOrderedSet, 0);
        OSMetaClassDeclareReservedUnused(OSOrderedSet, 1);
diff --git a/libkern/libkern/c++/OSPtr.h b/libkern/libkern/c++/OSPtr.h
new file mode 100644 (file)
index 0000000..fb2dc97
--- /dev/null
@@ -0,0 +1,145 @@
+#ifndef _OS_OBJECT_PTR_H
+#define _OS_OBJECT_PTR_H
+
+#include <sys/cdefs.h>
+#include <os/smart_ptr.h>
+
+#if KERNEL
+# include <libkern/c++/OSObject.h>
+#endif
+
+#ifdef LIBKERN_SMART_POINTERS
+
+/*
+ * OSObject pointers (OSPtr)
+ */
+
+struct osobject_policy {
+       static void
+       retain(const OSMetaClassBase *obj)
+       {
+               obj->retain();
+       }
+       static void
+       release(const OSMetaClassBase *obj)
+       {
+               obj->release();
+       }
+       template <class T> static T *
+       alloc()
+       {
+               return OSTypeAlloc(T);
+       }
+       template <class From, class To> static To *
+       dyn_cast(From *p)
+       {
+               return OSDynamicCast(To, p);
+       }
+};
+
+template <class T>
+using OSPtr = os::smart_ptr<T, osobject_policy>;
+
+/*
+ * Tagged OSObject pointers (OSTaggedPtr)
+ */
+
+template <class Tag>
+struct osobject_tagged_policy {
+       static void
+       retain(const OSMetaClassBase *obj)
+       {
+               obj->taggedRetain(OSTypeID(Tag));
+       }
+       static void
+       release(const OSMetaClassBase *obj)
+       {
+               obj->taggedRelease(OSTypeID(Tag));
+       }
+       template <class T> static T *
+       alloc()
+       {
+               return OSTypeAlloc(T);
+       }
+       template <class From, class To> static To *
+       dyn_cast(From *p)
+       {
+               return OSDynamicCast(To, p);
+       }
+};
+
+template <class T, class Tag>
+using OSTaggedPtr = os::smart_ptr<T, osobject_tagged_policy<Tag> >;
+
+/*
+ * Dynamic cast
+ */
+
+template<class T, class U, class P>
+os::smart_ptr<T, P>
+OSDynamicCastPtr(os::smart_ptr<U, P> const &from)
+{
+       return from.template dynamic_pointer_cast<T>();
+}
+
+template<class T, class U, class P>
+os::smart_ptr<T, P>
+OSDynamicCastPtr(os::smart_ptr<U, P> &&from)
+{
+       return os::move(from).template dynamic_pointer_cast<T>();
+}
+
+/*
+ * Creation helpers
+ */
+
+template<class T, class P>
+os::smart_ptr<T, P>
+OSNewObject()
+{
+       return os::smart_ptr<T, P>::alloc();
+}
+
+template<class T, class P>
+os::smart_ptr<T, P>
+OSMakePtr(T *&p)
+{
+       return os::smart_ptr<T, P>(p);
+}
+
+template<class T, class P>
+os::smart_ptr<T, P>
+OSMakePtr(T *&&p)
+{
+       return os::smart_ptr<T, P>(os::move(p));
+}
+
+template<class T, class P>
+os::smart_ptr<T, P>
+OSMakePtr(T *&&p, bool retain)
+{
+       return os::smart_ptr<T, P>(os::move(p), retain);
+}
+
+template<class T, class P>
+static inline T **
+OSOutPtr(os::smart_ptr<T, P> *p)
+{
+       if (p == nullptr) {
+               return nullptr;
+       } else {
+               return p->get_for_out_param();
+       }
+}
+
+#else /* LIBKERN_SMART_POINTERS */
+
+/* Fall back to the smart pointer types just being a simple pointer */
+template<class T, class policy = void>
+using OSPtr = T *;
+
+template <class T, class Tag = void>
+using OSTaggedPtr = T *;
+
+#endif /* LIBKERN_SMART_POINTERS */
+#endif /* _OS_OBJECT_PTR_H */
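Taken together, usage might look like the following sketch; the adopt-without-retain semantics of the two-argument OSMakePtr are inferred from its parameter name, so treat that as an assumption:

    // Allocate through the policy (equivalent to OSTypeAlloc).
    OSPtr<OSArray> array = OSNewObject<OSArray, osobject_policy>();

    // Adopt an existing +1 reference without an extra retain.
    OSData *raw = OSData::withCapacity(16);
    OSPtr<OSData> data =
        OSMakePtr<OSData, osobject_policy>(os::move(raw), false);

    // Checked downcast that keeps the smart-pointer wrapper.
    OSPtr<OSCollection> coll = OSDynamicCastPtr<OSCollection>(array);

Without LIBKERN_SMART_POINTERS, OSPtr<T> is just an alias for T *, so the same declarations compile down to raw-pointer semantics.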
index 061830af23f8d924c5dd032b6b4dbf608ad1e55f..7581625846aa25b6bed04050196e9bce4286cec3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -36,6 +36,13 @@ class OSCollection;
 class OSSet;
 class OSDictionary;
 class OSArray;
+class OSData;
+
+class OSSerializer;
+typedef OSPtr<OSSerializer> OSSerializerPtr;
+
+class OSSerialize;
+typedef OSPtr<OSSerialize> OSSerializePtr;
 
 /*!
  * @header
@@ -44,7 +51,7 @@ class OSArray;
  * This header declares the OSSerialize class.
  */
 
-OSObject *
+OSObjectPtr
 OSUnserializeBinary(const void *buffer, size_t bufferSize);
 
 /*!
@@ -83,7 +90,7 @@ OSUnserializeBinary(const void *buffer, size_t bufferSize);
 
 class OSSerialize : public OSObject
 {
-       OSDeclareDefaultStructors(OSSerialize)
+       OSDeclareDefaultStructors(OSSerialize);
        friend class OSBoolean;
 
 private:
@@ -105,14 +112,18 @@ public:
        typedef void * Editor;
 #endif
 
-       bool   binary;
-       bool   endCollection;
-       Editor editor;
-       void * editRef;
+       bool     binary;
+       bool     endCollection;
+       Editor   editor;
+       void   * editRef;
+       OSData * indexData;
 
        bool binarySerialize(const OSMetaClassBase *o);
+       bool binarySerializeInternal(const OSMetaClassBase *o);
        bool addBinary(const void * data, size_t size);
-       bool addBinaryObject(const OSMetaClassBase * o, uint32_t key, const void * _bits, size_t size);
+       bool addBinaryObject(const OSMetaClassBase * o, uint32_t key, const void * _bits, size_t size,
+           uint32_t * startCollection);
+       void endBinaryCollection(uint32_t startCollection);
 
 public:
 
@@ -132,9 +143,10 @@ public:
  * @discussion
  * The serializer will grow as needed to accommodate more data.
  */
-       static OSSerialize * withCapacity(unsigned int capacity);
+       static OSSerializePtr withCapacity(unsigned int capacity);
 
-       static OSSerialize * binaryWithCapacity(unsigned int inCapacity, Editor editor = 0, void * reference = 0);
+       static OSSerializePtr binaryWithCapacity(unsigned int inCapacity, Editor editor = NULL, void * reference = NULL);
+       void setIndexed(bool index);
 
 /*!
  * @function text
@@ -321,7 +333,7 @@ typedef bool (^OSSerializerBlock)(OSSerialize * serializer);
 
 class OSSerializer : public OSObject
 {
-       OSDeclareDefaultStructors(OSSerializer)
+       OSDeclareDefaultStructors(OSSerializer);
 
        void * target;
        void * ref;
@@ -329,13 +341,13 @@ class OSSerializer : public OSObject
 
 public:
 
-       static OSSerializer * forTarget(
+       static OSSerializerPtr forTarget(
                void * target,
                OSSerializerCallback callback,
-               void * ref = 0);
+               void * ref = NULL);
 
 #ifdef __BLOCKS__
-       static OSSerializer * withBlock(
+       static OSSerializerPtr withBlock(
                OSSerializerBlock callback);
 #endif
 
index bec190e9f5a5a1e66102ce9420572d5b47b00482..9c7718807a0e7b8b7e9501a2cd8af3ebb7deb370 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define _OS_OSSET_H
 
 #include <libkern/c++/OSCollection.h>
+#include <libkern/c++/OSPtr.h>
 
 class OSArray;
+class OSSet;
+
+typedef OSPtr<OSSet> OSSetPtr;
+typedef OSPtr<OSArray> OSArrayPtr;
 
 /*!
  * @header
@@ -86,17 +91,17 @@ class OSSet : public OSCollection
 {
        friend class OSSerialize;
 
-       OSDeclareDefaultStructors(OSSet)
+       OSDeclareDefaultStructors(OSSet);
 
 #if APPLE_KEXT_ALIGN_CONTAINERS
 
 private:
-       OSArray * members;
+       OSArrayPtr members;
 
 #else /* APPLE_KEXT_ALIGN_CONTAINERS */
 
 private:
-       OSArray * members;
+       OSArrayPtr members;
 
 protected:
        struct ExpansionData { };
@@ -135,7 +140,7 @@ public:
  * (<i>unlike</i> @link //apple_ref/doc/uid/20001503 CFMutableSet@/link,
  * for which the initial capacity is a hard limit).
  */
-       static OSSet * withCapacity(unsigned int capacity);
+       static OSSetPtr withCapacity(unsigned int capacity);
 
 
 /*!
@@ -169,7 +174,7 @@ public:
  * The objects in <code>objects</code> are retained for storage in the new set,
  * not copied.
  */
-       static OSSet * withObjects(
+       static OSSetPtr withObjects(
                const OSObject * objects[],
                unsigned int     count,
                unsigned int     capacity = 0);
@@ -207,7 +212,7 @@ public:
  * The objects in <code>array</code> are retained for storage in the new set,
  * not copied.
  */
-       static OSSet * withArray(
+       static OSSetPtr withArray(
                const OSArray * array,
                unsigned int    capacity = 0);
 
@@ -243,7 +248,7 @@ public:
  * The objects in <code>set</code> are retained for storage in the new set,
  * not copied.
  */
-       static OSSet * withSet(const OSSet * set,
+       static OSSetPtr withSet(const OSSet * set,
            unsigned int capacity = 0);
 
 
@@ -749,7 +754,7 @@ public:
  * Child collections' options are changed only if the receiving set's
  * options actually change.
  */
-       virtual unsigned setOptions(unsigned options, unsigned mask, void * context = 0) APPLE_KEXT_OVERRIDE;
+       virtual unsigned setOptions(unsigned options, unsigned mask, void * context = NULL) APPLE_KEXT_OVERRIDE;
 
 
 /*!
@@ -774,7 +779,7 @@ public:
  * Objects that are not derived from OSCollection are retained
  * rather than copied.
  */
-       OSCollection *copyCollection(OSDictionary *cycleDict = 0) APPLE_KEXT_OVERRIDE;
+       OSCollectionPtr copyCollection(OSDictionary *cycleDict = NULL) APPLE_KEXT_OVERRIDE;
 
        OSMetaClassDeclareReservedUnused(OSSet, 0);
        OSMetaClassDeclareReservedUnused(OSSet, 1);
index c761c9d2888c455a8bc2430642f16344b285bae2..925d5a3a46a84d3af145f595cb9e33e02aaa0a6b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define _OS_OSSTRING_H
 
 #include <libkern/c++/OSObject.h>
+#include <libkern/c++/OSPtr.h>
 
 class OSData;
+class OSString;
+
+typedef OSPtr<OSString> OSStringPtr;
+typedef OSPtr<const OSString> OSStringConstPtr;
 
 
 /*!
@@ -102,7 +107,7 @@ enum { kOSStringNoCopy = 0x00000001 };
  */
 class OSString : public OSObject
 {
-       OSDeclareDefaultStructors(OSString)
+       OSDeclareDefaultStructors(OSString);
 
        enum { kMaxStringLength  = 262142 };
 
@@ -145,7 +150,7 @@ public:
  * with the reference count incremented.
  * Changes to one will not be reflected in the other.
  */
-       static OSString * withString(const OSString * aString);
+       static OSStringPtr withString(const OSString * aString);
 
 
 /*!
@@ -162,7 +167,7 @@ public:
  * and with a reference count of 1;
  * <code>NULL</code> on failure.
  */
-       static OSString * withCString(const char * cString);
+       static OSStringPtr withCString(const char * cString);
 
 
 /*!
@@ -191,10 +196,10 @@ public:
  * An OSString object created with this function does not
  * allow changing the string via <code>@link setChar setChar@/link</code>.
  */
-       static OSString * withCStringNoCopy(const char * cString);
+       static OSStringPtr withCStringNoCopy(const char * cString);
 
 #if XNU_KERNEL_PRIVATE
-       static OSString * withStringOfLength(const char *cString, size_t length);
+       static OSStringPtr withStringOfLength(const char *cString, size_t length);
 #endif  /* XNU_KERNEL_PRIVATE */
 
 /*!
index 03490a0266f43a63253ff1a85edaaaeb0d39a151..1ee9792b699d634a8bfb13bd715720fd75610e90 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define _OS_OSSYMBOL_H
 
 #include <libkern/c++/OSString.h>
+#include <libkern/c++/OSPtr.h>
+
+class OSSymbol;
+
+typedef OSPtr<OSSymbol> OSSymbolPtr;
+typedef OSPtr<const OSSymbol> OSSymbolConstPtr;
 
 /*!
  * @header
@@ -82,7 +88,7 @@ class OSSymbol : public OSString
 {
        friend class OSSymbolPool;
 
-       OSDeclareAbstractStructors(OSSymbol)
+       OSDeclareAbstractStructors(OSSymbol);
 
 private:
 
@@ -245,7 +251,7 @@ public:
  * new OSSymbol with a retain count of 1,
  * or increments the retain count of the existing instance.
  */
-       static const OSSymbol * withString(const OSString * aString);
+       static OSSymbolConstPtr withString(const OSString * aString);
 
 
 /*!
@@ -272,7 +278,7 @@ public:
  * new OSSymbol with a retain count of 1,
  * or increments the retain count of the existing instance.
  */
-       static const OSSymbol * withCString(const char * cString);
+       static OSSymbolConstPtr withCString(const char * cString);
 
 
 /*!
@@ -302,7 +308,7 @@ public:
  * new OSSymbol with a retain count of 1,
  * or increments the retain count of the existing instance.
  */
-       static const OSSymbol * withCStringNoCopy(const char * cString);
+       static OSSymbolConstPtr withCStringNoCopy(const char * cString);
 
 /*!
  * @function existingSymbolForString
@@ -321,7 +327,7 @@ public:
  * The returned OSSymbol object is returned with an incremented refcount
  * that needs to be released.
  */
-       static const OSSymbol* existingSymbolForString(const OSString *aString);
+       static OSSymbolConstPtr existingSymbolForString(const OSString *aString);
 
 /*!
  * @function existingSymbolForCString
@@ -340,7 +346,7 @@ public:
  * The returned OSSymbol object is returned with an incremented refcount
  * that needs to be released.
  */
-       static const OSSymbol* existingSymbolForCString(const char *aCString);
+       static OSSymbolConstPtr existingSymbolForCString(const char *aCString);
 
 /*!
  * @function isEqualTo
index a3f8fc378c30f7360f1c529d0290a28a47fe9989..678e48828467a97cb2b32b70a5078f07e12d72ec 100644 (file)
@@ -30,6 +30,9 @@
 #ifndef _OS_OSUNSERIALIZE_H
 #define _OS_OSUNSERIALIZE_H
 
+#include <libkern/c++/OSMetaClass.h>
+#include <libkern/c++/OSString.h>
+
 #include <sys/appleapiopts.h>
 #include <sys/types.h>
 
@@ -64,9 +67,9 @@ class OSString;
  * @discussion
  * <b>Not safe</b> to call in a primary interrupt handler.
  */
-extern "C++" OSObject * OSUnserializeXML(
+extern "C++" OSObjectPtr OSUnserializeXML(
        const char  * buffer,
-       OSString   ** errorString = 0);
+       OSStringPtr * errorString = NULL);
 
 /*!
  * @function OSUnserializeXML
@@ -90,16 +93,16 @@ extern "C++" OSObject * OSUnserializeXML(
  * @discussion
  * <b>Not safe</b> to call in a primary interrupt handler.
  */
-extern "C++" OSObject * OSUnserializeXML(
+extern "C++" OSObjectPtr OSUnserializeXML(
        const char  * buffer,
        size_t        bufferSize,
-       OSString   ** errorString = 0);
+       OSStringPtr   *errorString = NULL);
 
-extern "C++" OSObject *
-OSUnserializeBinary(const char *buffer, size_t bufferSize, OSString **errorString);
+extern "C++" OSObjectPtr
+OSUnserializeBinary(const char *buffer, size_t bufferSize, OSStringPtr *errorString);
 
 #ifdef __APPLE_API_OBSOLETE
-extern OSObject* OSUnserialize(const char *buffer, OSString **errorString = 0);
+extern OSObjectPtr OSUnserialize(const char *buffer, OSStringPtr *errorString = NULL);
 #endif /* __APPLE_API_OBSOLETE */
 
 #endif /* _OS_OSUNSERIALIZE_H */
index e2df46dbe7a4561868c1fcfa79c79be33c7c2bb4..a3efc08def95be72357bb8e458abaa88f7fc402c 100644 (file)
@@ -67,11 +67,11 @@ typedef des_ecb_key_schedule des_key_schedule[1];
 
 /* Single DES ECB - 1 block */
 int des_ecb_key_sched(des_cblock *key, des_ecb_key_schedule *ks);
-void des_ecb_encrypt(des_cblock * in, des_cblock *out, des_ecb_key_schedule *ks, int encrypt);
+int des_ecb_encrypt(des_cblock * in, des_cblock *out, des_ecb_key_schedule *ks, int encrypt);
 
 /* Triple DES ECB - 1 block */
 int des3_ecb_key_sched(des_cblock *key, des3_ecb_key_schedule *ks);
-void des3_ecb_encrypt(des_cblock *block, des_cblock *, des3_ecb_key_schedule *ks, int encrypt);
+int des3_ecb_encrypt(des_cblock *block, des_cblock *, des3_ecb_key_schedule *ks, int encrypt);
 
 int des_is_weak_key(des_cblock *key);
 
index 78bbfa507c8443db9cde5c5f1c56cc2f8acd49a4..19e08c823f700c884b5f9fabd03146907405fcd4 100644 (file)
@@ -10,8 +10,16 @@ DATAFILES = \
           OSByteOrder.h \
          _OSByteOrder.h
 
+DRIVERKIT_DATAFILES = \
+          OSByteOrder.h \
+         _OSByteOrder.h
+
 INSTALL_MD_LIST        = ${DATAFILES}
 
+INSTALL_DRIVERKIT_MD_LIST = ${DRIVERKIT_DATAFILES}
+
+DRIVERKITINCDIR = $(DRIVERKITSDKHEADERSROOT)/usr/local/include
+
 INSTALL_MD_DIR = libkern/i386
 
 EXPORT_MD_LIST = ${DATAFILES}
index 01d749e4f680d446244c1ff04236a6ee7d315c5f..7bf58353d09d7ae30b6e9343cc5d8ff851b4057d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <os/base.h>
 #include <sys/cdefs.h>
 
-#if MACH_KERNEL_PRIVATE
-#define _SYS_TYPES_H_ 1
-#include <sys/kernel_types.h>
-#include <sys/_types/_errno_t.h>
-#else
-#include <sys/kernel_types.h>
-#include <sys/types.h>
-#endif
-
 /*
  * We rely on img4.h's logic for either including sys/types.h or declaring
  * errno_t ourselves. So when building the kernel, include img4.h from our
@@ -63,7 +54,7 @@
  * it can be tested at build-time and not require rev-locked submissions of xnu
  * and AppleImage4.
  */
-#define IMG4_INTERFACE_VERSION (2u)
+#define IMG4_INTERFACE_VERSION (3u)
 
 /*!
  * @typedef img4_init_t
@@ -240,9 +231,6 @@ typedef errno_t (*img4_payload_init_with_vnode_4xnu_t)(
  *
  * @field i4if_v3.nonce_domain_cryptex
  * The {@link IMG4_NONCE_DOMAIN_CRYPTEX} global.
- *
- * @field i4if_v4.environment_init_identity
- * A pointer to the {@link img4_environment_init_identity} function.
  */
 
 typedef struct _img4_interface {
@@ -266,10 +254,14 @@ typedef struct _img4_interface {
        struct {
                img4_payload_init_with_vnode_4xnu_t payload_init_with_vnode_4xnu;
        } i4if_v2;
-       void *__reserved[17];
+       struct {
+               const img4_nonce_domain_t *nonce_domain_pdi;
+               const img4_nonce_domain_t *nonce_domain_cryptex;
+       } i4if_v3;
+       void *__reserved[15];
 } img4_interface_t;
 
-__BEGIN_DECLS;
+__BEGIN_DECLS
 
 /*!
  * @const img4if
@@ -292,6 +284,6 @@ OS_EXPORT OS_NONNULL1
 void
 img4_interface_register(const img4_interface_t *i4);
 
-__END_DECLS;
+__END_DECLS
 
 #endif // __IMG4_INTERFACE_H
index 73f2985fefee13d0e391e323df2ccc8451e52394..4c241e40066f05680eeec262122810edb5d1f657 100644 (file)
@@ -53,7 +53,7 @@ subs_entry_t kext_identifier_prefix_subs[] = {
        { "com.apple.security.", '$' },
        { "com.apple.", '@' },
 
-       { (char *)0, '\0' }
+       { (char *)NULL, '\0' }
 };
 
 /* Substring substitution list. Substrings are replaced with a '!' followed
@@ -71,7 +71,7 @@ subs_entry_t kext_identifier_substring_subs[] = {
        { "Bluetooth", 'B' },
        { "Intel", 'I' },
 
-       { (char *)0, '\0' }
+       { (char *)NULL, '\0' }
 };
 
 __END_DECLS
index 0cd79f5ff10194c670d4fcd4d163e7aa157c319b..fa1697ac0a7a398d3389a71e648a332b747aca65 100644 (file)
@@ -269,6 +269,16 @@ extern "C" {
  */
 #define kKextRequestPredicateRequestKextdExit    "Kextd Exit"
 
+/* Predicate: Dext Daemon Launch
+ * Argument: kKextRequestArgumentBundleIdentifierKey
+ * Argument: IOUserServerName
+ * Response: Asynchronous via a DriverKit daemon checking in
+ * Op result: OSReturn indicating result (see OSKextLib.h)
+ *
+ * Requests kextd to launch a driver extension userspace daemon.
+ */
+#define kKextRequestPredicateRequestDaemonLaunch "Dext Daemon Launch"
+
 #if PRAGMA_MARK
 /********************************************************************/
 #pragma mark -
@@ -436,6 +446,30 @@ extern "C" {
  */
 #define kKextRequestArgumentTerminateIOServicesKey     "Terminate IOServices"
 
+#if PRAGMA_MARK
+#pragma mark Daemon Launch Request Arguments
+#endif
+
+/* Argument: Server tag
+ * Type:     Integer
+ * Default:  N/A
+ *
+ * A DriverKit daemon launch request must include a "server tag" that
+ * is unique to every launch request. Userspace daemons include this
+ * tag in their messages when attempting to rendez-vous with IOKit.
+ */
+#define kKextRequestArgumentDriverExtensionServerTag   "Driver Extension Server Tag"
+
+/* Argument: Server name
+ * Type:     String
+ * Default:  N/A
+ *
+ * A DriverKit daemon launch request must include a "server name" that
+ * can be used to identify what personality the driver is matching on.
+ * This name is also used for the launchd service name of the daemon.
+ */
+#define kKextRequestArgumentDriverExtensionServerName  "Driver Extension Server Name"
+
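Putting the arguments together, a hedged sketch of what a launch request's argument dictionary might contain; the key constants come from this header, while the value types and literal values are assumptions:

    OSDictionary *args = OSDictionary::withCapacity(3);
    args->setObject(kKextRequestArgumentBundleIdentifierKey,
        OSSymbol::withCString("com.example.dext"));
    args->setObject(kKextRequestArgumentDriverExtensionServerTag,
        OSNumber::withNumber(1ULL, 64));    // unique per launch request
    args->setObject(kKextRequestArgumentDriverExtensionServerName,
        OSString::withCString("com.example.dext"));
    // Real code must release the temporaries; setObject retains them.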
 #if PRAGMA_MARK
 #pragma mark Internal Tracking Properties
 #endif
index 3e9849371bf3f208b2cf55cb2cb643b2f3e45fbc..deaf2e3762c4699723a075908b9f6a54c12b457f 100644 (file)
@@ -9,8 +9,15 @@ include $(MakeInc_def)
 DATAFILES = \
           OSByteOrder.h
 
+DRIVERKIT_DATAFILES = \
+          OSByteOrder.h
+
 INSTALL_MI_LIST        = ${DATAFILES}
 
+INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES}
+
+DRIVERKITINCDIR = $(DRIVERKITSDKHEADERSROOT)/usr/local/include
+
 INSTALL_MI_DIR = libkern/machine
 
 EXPORT_MI_LIST = ${DATAFILES}
index 2ffcbabca1f73983a01cba3717a791f0c344e503..443d20a3d03b676b2a6637a0b187e4271955ff4e 100644 (file)
@@ -33,6 +33,7 @@
 extern unsigned long __stack_chk_guard;
 
 /* Called as a consequence on stack corruption */
+__attribute__((noreturn))
 extern void __stack_chk_fail(void);
 
 #endif // _STACK_PROTECTOR_H_
index 15b6636395c5950aa8c22221097217b4eed98c99..5cf38cbc036bc279b5f3ca57c6d10dc8dfd01ddd 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2009-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -408,6 +408,7 @@ void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\
 struct type *name##_RB_REMOVE(struct name *, struct type *);            \
 struct type *name##_RB_INSERT(struct name *, struct type *);            \
 struct type *name##_RB_FIND(struct name *, struct type *);              \
+struct type *name##_RB_NFIND(struct name *, struct type *);             \
 struct type *name##_RB_NEXT(struct type *);                             \
 struct type *name##_RB_MINMAX(struct name *, int);                      \
 struct type *name##_RB_GETPARENT(struct type*);                         \
@@ -422,12 +423,13 @@ _sc_ void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *); \
 _sc_ struct type *name##_RB_REMOVE(struct name *, struct type *);       \
 _sc_ struct type *name##_RB_INSERT(struct name *, struct type *);       \
 _sc_ struct type *name##_RB_FIND(struct name *, struct type *);         \
+_sc_ struct type *name##_RB_NFIND(struct name *, struct type *);        \
 _sc_ struct type *name##_RB_NEXT(struct type *);                        \
 _sc_ struct type *name##_RB_MINMAX(struct name *, int);                 \
 _sc_ struct type *name##_RB_GETPARENT(struct type*);                    \
 _sc_ struct type *name##_RB_SETPARENT(struct type*, struct type*);                      \
 _sc_ int name##_RB_GETCOLOR(struct type*);                      \
-_sc_ void name##_RB_SETCOLOR(struct type*,int);
+_sc_ void name##_RB_SETCOLOR(struct type*,int)
 
 
 /* Main rb operation.
@@ -698,6 +700,28 @@ name##_RB_FIND(struct name *head, struct type *elm)                     \
        return (NULL);                                                  \
 }                                                                       \
                                                                         \
+/* Finds the first node greater than or equal to the search key */      \
+__attribute__((unused))                                                 \
+struct type *                                                           \
+name##_RB_NFIND(struct name *head, struct type *elm)                    \
+{                                                                       \
+       struct type *tmp = RB_ROOT(head);                               \
+       struct type *res = NULL;                                        \
+       int comp;                                                       \
+       while (tmp) {                                                   \
+               comp = cmp(elm, tmp);                                   \
+               if (comp < 0) {                                         \
+                       res = tmp;                                      \
+                       tmp = RB_LEFT(tmp, field);                      \
+               }                                                       \
+               else if (comp > 0)                                      \
+                       tmp = RB_RIGHT(tmp, field);                     \
+               else                                                    \
+                       return (tmp);                                   \
+       }                                                               \
+       return (res);                                                   \
+}                                                                       \
+                                                                        \
 /* ARGSUSED */                                                          \
 struct type *                                                           \
 name##_RB_NEXT(struct type *elm)                                        \
@@ -742,11 +766,11 @@ struct type *name##_RB_PREV(struct type *);
 
 
 #define RB_PROTOTYPE_SC_PREV(_sc_, name, type, field, cmp)              \
-       RB_PROTOTYPE_SC(_sc_, name, type, field, cmp)                   \
-_sc_ struct type *name##_RB_PREV(struct type *);
+       RB_PROTOTYPE_SC(_sc_, name, type, field, cmp);                  \
+_sc_ struct type *name##_RB_PREV(struct type *)
 
 #define RB_GENERATE_PREV(name, type, field, cmp)                        \
-       RB_GENERATE(name, type, field, cmp)                             \
+       RB_GENERATE(name, type, field, cmp);                            \
 struct type *                                                           \
 name##_RB_PREV(struct type *elm)                                        \
 {                                                                       \
@@ -774,6 +798,7 @@ name##_RB_PREV(struct type *elm)                                        \
 #define RB_INSERT(name, x, y)   name##_RB_INSERT(x, y)
 #define RB_REMOVE(name, x, y)   name##_RB_REMOVE(x, y)
 #define RB_FIND(name, x, y)     name##_RB_FIND(x, y)
+#define RB_NFIND(name, x, y)    name##_RB_NFIND(x, y)
 #define RB_NEXT(name, x, y)     name##_RB_NEXT(y)
 #define RB_PREV(name, x, y)     name##_RB_PREV(y)
 #define RB_MIN(name, x)         name##_RB_MINMAX(x, RB_NEGINF)
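For reference, a minimal sketch of the new lower-bound lookup; the node type and comparator are illustrative, and RB_INITIALIZER/RB_GENERATE are assumed from the same header:

    struct node { RB_ENTRY(node) entry; uint64_t key; };
    static int
    node_cmp(struct node *a, struct node *b)
    {
        return (a->key < b->key) ? -1 : (a->key > b->key) ? 1 : 0;
    }
    static RB_HEAD(node_tree, node) head = RB_INITIALIZER(&head);
    RB_GENERATE(node_tree, node, entry, node_cmp);

    static struct node *
    first_at_or_above(uint64_t key)
    {
        struct node q;
        q.key = key;
        /* Exact match, else smallest node greater than key, else NULL. */
        return RB_NFIND(node_tree, &head, &q);
    }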
index 5db93b07d0fd968868e05924efd17e3e838706d3..26c29df2cc804715d267bd4419c967e866b70078 100644 (file)
@@ -6,26 +6,36 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
-LCLDIR = /usr/local/include
-
 KERNELFILES = \
        base.h \
        object.h \
        log.h \
        trace.h \
-       overflow.h
+       overflow.h \
+       smart_ptr.h \
+       cpp_util.h
 
 PRIVATE_KERNELFILES = \
        hash.h \
        object_private.h \
+       ptrtools.h \
        reason_private.h \
-       refcnt.h
+       refcnt.h \
+       refcnt_internal.h
+
+DATAFILES = \
+       overflow.h
+
+DRIVERKIT_DATAFILES = \
+       base.h \
+       overflow.h
 
 PRIVATE_DATAFILES = \
        reason_private.h
 
-INSTALL_MI_LIST        = \
-       overflow.h
+INSTALL_MI_LIST = ${DATAFILES}
+
+INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES}
 
 INSTALL_MI_DIR = os
 
index 62b98b4533e325036b348024c46f2604cc475ca1..bea2772a4b2f0aa134df392d328ba4783aa81ee0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2013 Apple Inc. All rights reserved.
  *
  * @APPLE_APACHE_LICENSE_HEADER_START@
  *
 #define OS_NOESCAPE
 #endif
 
+#if defined(__cplusplus) && defined(__clang__)
+#define OS_FALLTHROUGH [[clang::fallthrough]]
+#else
+#define OS_FALLTHROUGH
+#endif
+
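OS_FALLTHROUGH marks intentional case fall-through so clang's -Wimplicit-fallthrough stays quiet in C++, and compiles away elsewhere; a minimal sketch with hypothetical helpers:

    switch (phase) {
    case PHASE_SETUP:
        prepare();
        OS_FALLTHROUGH;
    case PHASE_RUN:
        run();
        break;
    }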
 #if __has_feature(assume_nonnull)
 #define OS_ASSUME_NONNULL_BEGIN _Pragma("clang assume_nonnull begin")
 #define OS_ASSUME_NONNULL_END   _Pragma("clang assume_nonnull end")
 #define OS_OVERLOADABLE
 #endif
 
-#if __has_feature(objc_fixed_enum) || __has_extension(cxx_strong_enums)
+#if __has_attribute(enum_extensibility)
+#define __OS_ENUM_ATTR __attribute__((enum_extensibility(open)))
+#define __OS_ENUM_ATTR_CLOSED __attribute__((enum_extensibility(closed)))
+#else
+#define __OS_ENUM_ATTR
+#define __OS_ENUM_ATTR_CLOSED
+#endif // __has_attribute(enum_extensibility)
+
+#if __has_attribute(flag_enum)
+/*!
+ * Compile with -Wflag-enum and -Wassign-enum to enforce at definition and
+ * assignment, respectively, i.e. -Wflag-enum prevents you from creating new
+ * enumeration values from illegal values within the enum definition, and
+ * -Wassign-enum prevents you from assigning illegal values to a variable of the
+ * enum type.
+ */
+#define __OS_OPTIONS_ATTR __attribute__((flag_enum))
+#else
+#define __OS_OPTIONS_ATTR
+#endif // __has_attribute(flag_enum)
+
+#if __has_feature(objc_fixed_enum) || __has_extension(cxx_fixed_enum) || \
+        __has_extension(cxx_strong_enums)
 #define OS_ENUM(_name, _type, ...) \
                typedef enum : _type { __VA_ARGS__ } _name##_t
+#define OS_CLOSED_ENUM(_name, _type, ...) \
+               typedef enum : _type { __VA_ARGS__ } \
+                       __OS_ENUM_ATTR_CLOSED _name##_t
+#define OS_OPTIONS(_name, _type, ...) \
+               typedef enum : _type { __VA_ARGS__ } \
+                       __OS_ENUM_ATTR __OS_OPTIONS_ATTR _name##_t
+#define OS_CLOSED_OPTIONS(_name, _type, ...) \
+               typedef enum : _type { __VA_ARGS__ } \
+                       __OS_ENUM_ATTR_CLOSED __OS_OPTIONS_ATTR _name##_t
 #else
+/*!
+ * There is unfortunately no good way in plain C to have both fixed-type enums
+ * and enforcement for clang's enum_extensibility extensions. The primary goal
+ * of these macros is to allow you to define an enum and specify its width in a
+ * single statement, and for plain C that is accomplished by defining an
+ * anonymous enum and then separately typedef'ing the requested type name to the
+ * requested underlying integer type. So the type emitted actually has no
+ * relationship at all to the enum, and therefore while the compiler could
+ * enforce enum extensibility if you used the enum type, it cannot do so if you
+ * use the "_t" type resulting from this expression.
+ *
+ * But we still define a named enum type and decorate it appropriately for you,
+ * so if you really want the enum extensibility enforcement, you can use the
+ * enum type yourself, i.e. when compiling with a C compiler:
+ *
+ *     OS_CLOSED_ENUM(my_type, uint64_t,
+ *         FOO,
+ *         BAR,
+ *         BAZ,
+ *     );
+ *
+ *     my_type_t mt = 98; // legal
+ *     enum my_type emt = 98; // illegal
+ *
+ * But be aware that the underlying enum type's width is subject only to the C
+ * language's guarantees -- namely that it will be compatible with int, char,
+ * and unsigned char. It is not safe to rely on the size of this type.
+ *
+ * When compiling in ObjC or C++, both of the above assignments are illegal.
+ */
+#define __OS_ENUM_C_FALLBACK(_name, _type, ...) \
+               typedef _type _name##_t; enum _name { __VA_ARGS__ }
+
 #define OS_ENUM(_name, _type, ...) \
-               enum { __VA_ARGS__ }; typedef _type _name##_t
-#endif
+               typedef _type _name##_t; enum { __VA_ARGS__ }
+#define OS_CLOSED_ENUM(_name, _type, ...) \
+               __OS_ENUM_C_FALLBACK(_name, _type, ## __VA_ARGS__) \
+               __OS_ENUM_ATTR_CLOSED
+#define OS_OPTIONS(_name, _type, ...) \
+               __OS_ENUM_C_FALLBACK(_name, _type, ## __VA_ARGS__) \
+               __OS_ENUM_ATTR __OS_OPTIONS_ATTR
+#define OS_CLOSED_OPTIONS(_name, _type, ...) \
+               __OS_ENUM_C_FALLBACK(_name, _type, ## __VA_ARGS__) \
+               __OS_ENUM_ATTR_CLOSED __OS_OPTIONS_ATTR
+#endif // __has_feature(objc_fixed_enum) || __has_extension(cxx_strong_enums)
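A small illustration (names invented): one declaration yields a fixed-width my_status_t in every language mode, with closed-enum diagnostics where the compiler supports them:

    OS_CLOSED_ENUM(my_status, uint32_t,
        MY_STATUS_IDLE,
        MY_STATUS_RUNNING,
        MY_STATUS_DONE,
    );

    my_status_t s = MY_STATUS_IDLE;   // legal everywhere
    // s = (my_status_t)42;           // diagnosable only where the closed
                                      // attribute binds to the named type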
 
 #if __has_feature(attribute_availability_swift)
 // equivalent to __SWIFT_UNAVAILABLE from Availability.h
 #ifdef __GNUC__
 #define os_prevent_tail_call_optimization()  __asm__("")
 #define os_is_compile_time_constant(expr)  __builtin_constant_p(expr)
+#ifndef KERNEL
 #define os_compiler_barrier()  __asm__ __volatile__("" ::: "memory")
+#endif
 #else
 #define os_prevent_tail_call_optimization()  do { } while (0)
 #define os_is_compile_time_constant(expr)  0
+#ifndef KERNEL
 #define os_compiler_barrier()  do { } while (0)
 #endif
+#endif
 
 #if __has_attribute(not_tail_called)
 #define OS_NOT_TAIL_CALLED __attribute__((__not_tail_called__))
 #define OS_NOT_TAIL_CALLED
 #endif
 
+/*
+ * LIBKERN_ALWAYS_DESTROY attribute can be applied to global variables with
+ * destructors. It specifies that an object should have its exit-time
+ * destructor run. This attribute is the default unless clang was invoked with
+ * -fno-c++-static-destructors.
+ */
+#if __has_attribute(always_destroy)
+#define LIBKERN_ALWAYS_DESTROY __attribute__((always_destroy))
+#else
+#define LIBKERN_ALWAYS_DESTROY
+#endif
+
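For example (a sketch; the Registry class is hypothetical):

    // Run ~Registry() at exit even when compiling with
    // -fno-c++-static-destructors.
    LIBKERN_ALWAYS_DESTROY static Registry gRegistry;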
 typedef void (*os_function_t)(void *_Nullable);
 
 #ifdef __BLOCKS__
diff --git a/libkern/os/cpp_util.h b/libkern/os/cpp_util.h
new file mode 100644 (file)
index 0000000..dc7236b
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef _OS_CPP_UTIL_H
+#define _OS_CPP_UTIL_H
+
+#include <sys/cdefs.h>
+
+#if __has_feature(cxx_nullptr) && __has_feature(cxx_decltype)
+# define OS_HAS_NULLPTR 1
+#endif
+
+#if __has_feature(cxx_rvalue_references) || __has_extension(cxx_rvalue_references)
+# define OS_HAS_RVALUE_REFERENCES 1
+#endif
+
+namespace os {
+#if OS_HAS_NULLPTR
+typedef decltype(nullptr) nullptr_t;
+#endif
+
+/*
+ * Reference removal
+ */
+
+template <class _T> struct remove_reference       {typedef _T type;};
+template <class _T> struct remove_reference<_T&>  {typedef _T type;};
+template <class _T> struct remove_reference<_T &&> {typedef _T type;};
+template <class _T> using remove_reference_t = typename remove_reference<_T>::type;
+
+/*
+ * Const removal
+ */
+
+template <class _T> struct remove_const           {typedef _T type;};
+template <class _T> struct remove_const<const _T> {typedef _T type;};
+template <class _T> using remove_const_t = typename remove_const<_T>::type;
+
+/*
+ * Move
+ */
+
+template <class _T>
+inline typename remove_reference<_T>::type &&
+move(_T && _t)
+{
+       typedef typename os::remove_reference<_T>::type _U;
+       return static_cast<_U &&>(_t);
+}
+}
+
+#endif /* _OS_CPP_UTIL_H */
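os::move mirrors std::move for kernel code that cannot pull in the C++ standard library. A sketch of the intended pairing with the smart pointers above, assuming os::smart_ptr has a move constructor:

    OSPtr<OSData> a = OSNewObject<OSData, osobject_policy>();
    OSPtr<OSData> b = os::move(a);   // transfers the reference; no extra
                                     // retain/release pair is generated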
index 8a64d5e32c75ddb330a477a1046017d79b6fe785..b01f583220daf4baee8218ee9d4b0064806f2e7b 100644 (file)
@@ -1,3 +1,5 @@
+/* * Copyright (c) 2019 Apple Inc. All rights reserved. */
+
 #include <stddef.h>
 #undef offset
 
@@ -23,6 +25,7 @@
 #include <kern/kalloc.h>
 #include <kern/clock.h>
 #include <kern/assert.h>
+#include <kern/task.h>
 
 #include <firehose/tracepoint_private.h>
 #include <firehose/chunk_private.h>
@@ -56,7 +59,7 @@ extern void bsd_log_lock(void);
 extern void bsd_log_unlock(void);
 extern void logwakeup(struct msgbuf *);
 
-decl_lck_spin_data(extern, oslog_stream_lock)
+decl_lck_spin_data(extern, oslog_stream_lock);
 #define stream_lock() lck_spin_lock(&oslog_stream_lock)
 #define stream_unlock() lck_spin_unlock(&oslog_stream_lock)
 
@@ -106,19 +109,19 @@ oslog_stream_create_buf_entry(oslog_stream_link_type_t type, firehose_tracepoint
 
 static void
 _os_log_with_args_internal(os_log_t oslog __unused, os_log_type_t type __unused,
-    const char *format, va_list args, void *addr, void *dso);
+    const char *format, va_list args, void *addr, void *dso, bool driverKit, bool addcr);
 
 static void
-_os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging);
+_os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging, bool addcr);
 
 static void
 _os_log_to_log_internal(os_log_t oslog, os_log_type_t type,
-    const char *format, va_list args, void *addr, void *dso);
+    const char *format, va_list args, void *addr, void *dso, bool driverKit);
 
 
 static void
 _os_log_actual(os_log_t oslog, os_log_type_t type, const char *format, void
-    *dso, void *addr, os_log_buffer_context_t context);
+    *dso, void *addr, os_log_buffer_context_t context, bool driverKit);
 
 bool
 os_log_info_enabled(os_log_t log __unused)
@@ -152,13 +155,63 @@ _os_log_internal(void *dso, os_log_t log, uint8_t type, const char *message, ...
 
        va_start(args, message);
 
-       _os_log_with_args_internal(log, type, message, args, addr, dso);
+       _os_log_with_args_internal(log, type, message, args, addr, dso, FALSE, FALSE);
 
        va_end(args);
 
        return;
 }
 
+__attribute__((noinline, not_tail_called)) int
+_os_log_internal_driverKit(void *dso, os_log_t log, uint8_t type, const char *message, ...)
+{
+       va_list args;
+       void *addr = __builtin_return_address(0);
+       bool driverKitLog = FALSE;
+
+       /*
+        * We want to be able to identify dexts from the logs.
+        *
+        * Usually the addr is used to determine whether the log line
+        * was generated by a kext or by the kernel main executable.
+        * Logd uses copyKextUUIDForAddress with the addr specified
+        * in the log line to retrieve the kext UUID of the sender.
+        *
+        * Dexts, however, are not loaded in kernel space, so they do
+        * not have a kernel address range.
+        *
+        * To make the same mechanism work, OSKext fakes a kernel
+        * address range for dexts using the loadTag, so we just need
+        * to use the loadTag as addr here to allow logd to retrieve
+        * the correct UUID.
+        *
+        * NOTE: loadTag is populated in the task while the dext is matching,
+        * so log lines generated before matching completes will be
+        * attributed to the kernel main executable.
+        */
+       task_t self_task = current_task();
+
+       /*
+        * Only dexts are supposed to use this log path.
+        */
+       if (!task_is_driver(self_task)) {
+               return EPERM;
+       }
+
+       uint64_t loadTag = get_task_loadTag(self_task);
+       if (loadTag != 0) {
+               driverKitLog = TRUE;
+               addr = (void*) loadTag;
+       }
+       va_start(args, message);
+
+       _os_log_with_args_internal(log, type, message, args, addr, dso, driverKitLog, true);
+
+       va_end(args);
+
+       return 0;
+}
+
 #pragma mark - shim functions
 
 __attribute__((noinline, not_tail_called)) void
@@ -169,12 +222,12 @@ os_log_with_args(os_log_t oslog, os_log_type_t type, const char *format, va_list
                addr = __builtin_return_address(0);
        }
 
-       _os_log_with_args_internal(oslog, type, format, args, addr, NULL);
+       _os_log_with_args_internal(oslog, type, format, args, addr, NULL, FALSE, FALSE);
 }
 
 static void
 _os_log_with_args_internal(os_log_t oslog, os_log_type_t type,
-    const char *format, va_list args, void *addr, void *dso)
+    const char *format, va_list args, void *addr, void *dso, bool driverKit, bool addcr)
 {
        uint32_t  logging_config = atm_get_diagnostic_config();
        boolean_t safe;
@@ -194,16 +247,16 @@ _os_log_with_args_internal(os_log_t oslog, os_log_type_t type,
        }
 
        if (oslog != &_os_log_replay) {
-               _os_log_to_msgbuf_internal(format, args, safe, logging);
+               _os_log_to_msgbuf_internal(format, args, safe, logging, addcr);
        }
 
        if (safe && logging) {
-               _os_log_to_log_internal(oslog, type, format, args, addr, dso);
+               _os_log_to_log_internal(oslog, type, format, args, addr, dso, driverKit);
        }
 }
 
 static void
-_os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging)
+_os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging, bool addcr)
 {
        static int msgbufreplay = -1;
        va_list args_copy;
@@ -279,7 +332,7 @@ _os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool log
        }
 
        va_copy(args_copy, args);
-       vprintf_log_locked(format, args_copy);
+       vprintf_log_locked(format, args_copy, addcr);
        va_end(args_copy);
 
 #if DEVELOPMENT || DEBUG
@@ -297,7 +350,7 @@ _os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool log
 
 static void
 _os_log_to_log_internal(os_log_t oslog, os_log_type_t type,
-    const char *format, va_list args, void *addr, void *dso)
+    const char *format, va_list args, void *addr, void *dso, bool driverKit)
 {
        struct os_log_buffer_context_s context;
        unsigned char buffer_data[OS_LOG_BUFFER_MAX_SIZE] __attribute__((aligned(8)));
@@ -322,10 +375,11 @@ _os_log_to_log_internal(os_log_t oslog, os_log_type_t type,
        if (!_os_trace_addr_in_text_segment(dso, format)) {
                return;
        }
-
-       void *dso_addr = (void *) OSKextKextForAddress(addr);
-       if (dso != dso_addr) {
-               return;
+       if (!driverKit) {
+               void *dso_addr = (void *) OSKextKextForAddress(addr);
+               if (dso != dso_addr) {
+                       return;
+               }
        }
 #endif /* FIREHOSE_USES_SHARED_CACHE */
 
@@ -340,11 +394,11 @@ _os_log_to_log_internal(os_log_t oslog, os_log_type_t type,
 
        va_copy(args_copy, args);
 
-       (void)hw_atomic_add(&oslog_p_total_msgcount, 1);
+       os_atomic_inc(&oslog_p_total_msgcount, relaxed);
        if (_os_log_encode(format, args_copy, 0, &context)) {
-               _os_log_actual(oslog, type, format, dso, addr, &context);
+               _os_log_actual(oslog, type, format, dso, addr, &context, driverKit);
        } else {
-               (void)hw_atomic_add(&oslog_p_error_count, 1);
+               os_atomic_inc(&oslog_p_error_count, relaxed);
        }
 
        va_end(args_copy);
@@ -352,26 +406,37 @@ _os_log_to_log_internal(os_log_t oslog, os_log_type_t type,
 
 static inline size_t
 _os_trace_write_location_for_address(uint8_t buf[static sizeof(uint64_t)],
-    void *dso, const void *address, firehose_tracepoint_flags_t *flags)
+    void *dso, const void *address, firehose_tracepoint_flags_t *flags, __unused bool driverKit)
 {
+       uintptr_t shift_addr = (uintptr_t)address - (uintptr_t)dso;
 #if FIREHOSE_USES_SHARED_CACHE
+
        *flags = _firehose_tracepoint_flags_pc_style_shared_cache;
-       memcpy(buf, (uint32_t[]){ (uintptr_t)address - (uintptr_t)dso },
+       memcpy(buf, (uint32_t[]){ shift_addr },
            sizeof(uint32_t));
        return sizeof(uint32_t);
 
 #else /* FIREHOSE_USES_SHARED_CACHE */
        kernel_mach_header_t *mh = dso;
 
-       if (mh->filetype == MH_EXECUTE) {
+       /*
+        * A driverKit log will have the dso set to MH_EXECUTE
+        * (it is logging from a syscall in the kernel),
+        * but it needs logd to parse the address as an
+        * absolute pc.
+        */
+       if (mh->filetype == MH_EXECUTE && !driverKit) {
                *flags = _firehose_tracepoint_flags_pc_style_main_exe;
-
-               memcpy(buf, (uint32_t[]){ (uintptr_t)address - (uintptr_t)dso },
-                   sizeof(uint32_t));
+               memcpy(buf, (uint32_t[]){ shift_addr }, sizeof(uint32_t));
                return sizeof(uint32_t);
        } else {
                *flags = _firehose_tracepoint_flags_pc_style_absolute;
-               memcpy(buf, (uintptr_t[]){ VM_KERNEL_UNSLIDE(address) }, sizeof(uintptr_t));
+               if (!driverKit) {
+                       shift_addr = VM_KERNEL_UNSLIDE(address);
+               } else {
+                       shift_addr = (uintptr_t) address;
+               }
+               memcpy(buf, (uintptr_t[]){ shift_addr }, sizeof(uintptr_t));
 #if __LP64__
                return 6; // 48 bits are enough
 #else
@@ -402,7 +467,7 @@ _os_log_buffer_pack(uint8_t *buffdata, size_t buffdata_sz,
 
 static void
 _os_log_actual(os_log_t oslog __unused, os_log_type_t type, const char *format,
-    void *dso, void *addr, os_log_buffer_context_t context)
+    void *dso, void *addr, os_log_buffer_context_t context, bool driverKit)
 {
        firehose_stream_t stream;
        firehose_tracepoint_flags_t flags = 0;
@@ -413,7 +478,7 @@ _os_log_actual(os_log_t oslog __unused, os_log_type_t type, const char *format,
        uint64_t thread_id;
 
        // dso == the start of the binary that was loaded
-       addr_len = _os_trace_write_location_for_address(buffdata, dso, addr, &flags);
+       addr_len = _os_trace_write_location_for_address(buffdata, dso, addr, &flags, driverKit);
        buffdata_sz = _os_log_buffer_pack(buffdata + addr_len,
            sizeof(buffdata) - addr_len, context);
        if (buffdata_sz == 0) {
@@ -424,9 +489,16 @@ _os_log_actual(os_log_t oslog __unused, os_log_type_t type, const char *format,
        timestamp = firehose_tracepoint_time(firehose_activity_flags_default);
        thread_id = thread_tid(current_thread());
 
-       // create trace_id after we've set additional flags
-       trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log,
-           type, flags, _os_trace_offset(dso, format, flags));
+       if (driverKit) {
+               // set FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT so logd will not try to find the format string in
+               // the executable text
+               trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log,
+                   type, flags, (uintptr_t) addr | FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT);
+       } else {
+               // create trace_id after we've set additional flags
+               trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log,
+                   type, flags, _os_trace_offset(dso, format, (_firehose_tracepoint_flags_activity_t)flags));
+       }
 
        if (FALSE) {
                firehose_debug_trace(stream, trace_id.ftid_value, timestamp,
@@ -452,7 +524,7 @@ _firehose_trace(firehose_stream_t stream, firehose_tracepoint_id_u ftid,
 
        if (slowpath(ft_size + publen > _firehose_chunk_payload_size)) {
                // We'll need to have some handling here. For now - return 0
-               (void)hw_atomic_add(&oslog_p_error_count, 1);
+               os_atomic_inc(&oslog_p_error_count, relaxed);
                return 0;
        }
 
@@ -474,11 +546,11 @@ out:
        if (!fastpath(ft)) {
                if (oslog_boot_done) {
                        if (stream == firehose_stream_metadata) {
-                               (void)hw_atomic_add(&oslog_p_metadata_dropped_msgcount, 1);
+                               os_atomic_inc(&oslog_p_metadata_dropped_msgcount, relaxed);
                        } else {
                                // If we run out of space in the persistence buffer we're
                                // dropping the message.
-                               (void)hw_atomic_add(&oslog_p_dropped_msgcount, 1);
+                               os_atomic_inc(&oslog_p_dropped_msgcount, relaxed);
                        }
                        return 0;
                }
@@ -489,7 +561,7 @@ out:
                offset = firehose_chunk_tracepoint_try_reserve(fbc, stamp,
                    firehose_stream_persist, 0, publen, 0, NULL);
                if (offset <= 0) {
-                       (void)hw_atomic_add(&oslog_p_boot_dropped_msgcount, 1);
+                       os_atomic_inc(&oslog_p_boot_dropped_msgcount, relaxed);
                        return 0;
                }
 
@@ -497,7 +569,7 @@ out:
                    thread_tid(current_thread()), offset);
                memcpy(ft->ft_data, pubdata, publen);
                firehose_chunk_tracepoint_end(fbc, ft, ftid);
-               (void)hw_atomic_add(&oslog_p_saved_msgcount, 1);
+               os_atomic_inc(&oslog_p_saved_msgcount, relaxed);
                return ftid.ftid_value;
        }
        if (!oslog_boot_done) {
@@ -507,9 +579,9 @@ out:
 
        __firehose_buffer_tracepoint_flush(ft, ftid);
        if (stream == firehose_stream_metadata) {
-               (void)hw_atomic_add(&oslog_p_metadata_saved_msgcount, 1);
+               os_atomic_inc(&oslog_p_metadata_saved_msgcount, relaxed);
        } else {
-               (void)hw_atomic_add(&oslog_p_saved_msgcount, 1);
+               os_atomic_inc(&oslog_p_saved_msgcount, relaxed);
        }
        return ftid.ftid_value;
 }
@@ -567,7 +639,7 @@ firehose_trace_metadata(firehose_stream_t stream, firehose_tracepoint_id_u ftid,
        m_entry = oslog_stream_create_buf_entry(oslog_stream_link_type_metadata, ftid,
            stamp, pubdata, publen);
        if (!m_entry) {
-               (void)hw_atomic_add(&oslog_s_error_count, 1);
+               os_atomic_inc(&oslog_s_error_count, relaxed);
                goto finish;
        }
 
@@ -796,7 +868,7 @@ _test_log_loop(void * arg __unused, wait_result_t wres __unused)
 {
        uint32_t uniqid = RandomULong();
        test_oslog_debug_helper(uniqid, 100);
-       (void)hw_atomic_add(&_test_log_loop_count, 100);
+       os_atomic_add(&_test_log_loop_count, 100, relaxed);
 }
 
 kern_return_t
index 8b58e484e7ea3cae428e42564a7b59b40f326977..2972daca7dd6f519180e17ff2fe664736d6d12ab 100644 (file)
@@ -53,6 +53,16 @@ extern bool startup_serial_logging_active;
 extern uint64_t startup_serial_num_procs;
 #endif /* XNU_KERNEL_PRIVATE */
 
+#ifdef KERNEL
+#define OS_LOG_BUFFER_MAX_SIZE 256
+#else
+#define OS_LOG_BUFFER_MAX_SIZE 1024
+#endif
+
+// The OS_LOG_BUFFER_MAX_SIZE limit includes the metadata that
+// accompanies the log payload in the os_log firehose buffer
+#define OS_LOG_DATA_MAX_SIZE (OS_LOG_BUFFER_MAX_SIZE - 16)
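
A hedged caller-side sketch of applying the limit (variable names and error path are hypothetical):

    // Payloads larger than OS_LOG_DATA_MAX_SIZE cannot fit in one firehose
    // buffer once the 16 bytes of metadata are accounted for.
    if (payload_len > OS_LOG_DATA_MAX_SIZE) {
            return E2BIG;   /* hypothetical rejection path */
    }
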
+
 OS_ALWAYS_INLINE static inline void _os_log_verify_format_str(__unused const char *msg, ...) __attribute__((format(os_log, 1, 2)));
 OS_ALWAYS_INLINE static inline void
 _os_log_verify_format_str(__unused const char *msg, ...)                                       /* placeholder */
@@ -452,6 +462,38 @@ os_log_debug_enabled(os_log_t log);
     __asm__(""); /* avoid tailcall */                                                       \
 })
 
+/*!
+ * @function os_log_driverKit
+ *
+ * @abstract
+ * Log a message using a specific type. This variant should be called only from dexts.
+ *
+ * @discussion
+ * Will log a message with the provided os_log_type_t.
+ *
+ * @param out
+ * Pointer to an int that receives the return status of the log call.
+ *
+ * @param log
+ * Pass OS_LOG_DEFAULT or a log object previously created with os_log_create.
+ *
+ * @param type
+ * Pass a valid type from os_log_type_t.
+ *
+ * @param format
+ * A format string to generate a human-readable log message when the log
+ * line is decoded.  This string must be a constant string, not dynamically
+ * generated.  Supports all standard printf types and %@ (objects).
+ *
+ * @result
+ * Returns EPERM if the caller is not a driverKit process, or 0 on success.
+ */
+#define os_log_driverKit(out, log, type, format, ...) __extension__({                            \
+    _Static_assert(__builtin_constant_p(format), "format string must be constant");         \
+    __attribute__((section("__TEXT,__os_log"))) static const char _os_log_fmt[] = format;   \
+    _os_log_verify_format_str(format, ##__VA_ARGS__);                                       \
+    (*(out)) = _os_log_internal_driverKit(&__dso_handle, log, type, _os_log_fmt, ##__VA_ARGS__);                 \
+    __asm__(""); /* avoid tailcall */                                                       \
+})
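
A minimal sketch of the macro as the DriverKit logging path might invoke it (`devid` is hypothetical; per the contract above, the call fails with EPERM unless the current task is a dext):

    int err = 0;
    os_log_driverKit(&err, OS_LOG_DEFAULT, OS_LOG_TYPE_INFO,
        "matched device %u", devid);
    if (err == EPERM) {
            /* not a driverKit task; this path is reserved for dexts */
    }
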
+
+
 /*!
  * @function os_log_sensitive_debug
  *
@@ -511,6 +553,16 @@ OS_EXPORT OS_NOTHROW
 void
 _os_log_internal(void *dso, os_log_t log, os_log_type_t type, const char *message, ...);
 
+/*!
+ * @function _os_log_internal_driverKit
+ *
+ * @abstract
+ * Internal function used by macros.
+ */
+__WATCHOS_AVAILABLE(6.0) __OSX_AVAILABLE(10.15) __IOS_AVAILABLE(13.0) __TVOS_AVAILABLE(13.0)
+OS_EXPORT OS_NOTHROW
+int
+_os_log_internal_driverKit(void *dso, os_log_t log, os_log_type_t type, const char *message, ...);
 __END_DECLS
 
 #endif /* __os_log_h */
index e07364752b3d817f35c3d2b4a15e3ccfe0558767..ac4b44bdb41e480636801ec1e57c09e280acd37e 100644 (file)
@@ -118,11 +118,6 @@ typedef struct os_log_buffer_value_s {
 typedef struct os_log_buffer_s {
 #define OS_LOG_BUFFER_HAS_PRIVATE 0x1
 #define OS_LOG_BUFFER_HAS_NON_SCALAR 0x2
-#ifdef KERNEL
-#define OS_LOG_BUFFER_MAX_SIZE 256
-#else
-#define OS_LOG_BUFFER_MAX_SIZE 1024
-#endif
        uint8_t flags;
        uint8_t arg_cnt;
        uint8_t content[];
index abf04917a93d8073890bd7861157262dadc90894..f00a6024fe37a803a803f4310449f2ffe0367256 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -60,66 +60,7 @@ __os_warn_unused(__const bool x)
 #define os_mul_overflow(a, b, res) __os_warn_unused(__builtin_mul_overflow((a), (b), (res)))
 
 #else
-
-/* compile-time assertion that 'x' and 'y' are equivalent types */
-#ifdef __cplusplus
-#define __OS_TYPE_CHECK(x, y) do { \
-       __typeof__(x) _x; \
-       __typeof__(y) _y; \
-       (void)(&_x == &_y, "overflow arithmetic: incompatible types"); \
-} while (0)
-#else
-#define __OS_TYPE_CHECK(x, y) do { \
-       _Static_assert(__builtin_types_compatible_p(__typeof(x),__typeof(y)), \
-                       "overflow arithmetic: incompatible types"); \
-} while (0)
-#endif
-
-#define __os_add_overflow_func(T, U, V) _Generic((T),\
-               unsigned:           __builtin_uadd_overflow, \
-               unsigned long:      __builtin_uaddl_overflow, \
-               unsigned long long: __builtin_uaddll_overflow, \
-               int:                __builtin_sadd_overflow, \
-               long:               __builtin_saddl_overflow, \
-               long long:          __builtin_saddll_overflow \
-       )(T,U,V)
-
-#define __os_sub_overflow_func(T, U, V) _Generic((T),\
-               unsigned:           __builtin_usub_overflow, \
-               unsigned long:      __builtin_usubl_overflow, \
-               unsigned long long: __builtin_usubll_overflow, \
-               int:                __builtin_ssub_overflow, \
-               long:               __builtin_ssubl_overflow, \
-               long long:          __builtin_ssubll_overflow \
-       )(T,U,V)
-
-#define __os_mul_overflow_func(T, U, V) _Generic((T),\
-               unsigned:           __builtin_umul_overflow, \
-               unsigned long:      __builtin_umull_overflow, \
-               unsigned long long: __builtin_umulll_overflow, \
-               int:                __builtin_smul_overflow, \
-               long:               __builtin_smull_overflow, \
-               long long:          __builtin_smulll_overflow \
-       )(T,U,V)
-
-#define os_add_overflow(a, b, res) __os_warn_unused(__extension__({ \
-       __OS_TYPE_CHECK((a), (b)); \
-       __OS_TYPE_CHECK((b), *(res)); \
-       __os_add_overflow_func((a), (b), (res)); \
-}))
-
-#define os_sub_overflow(a, b, res) __os_warn_unused(__extension__({ \
-       __OS_TYPE_CHECK((a), (b)); \
-       __OS_TYPE_CHECK((b), *(res)); \
-       __os_sub_overflow_func((a), (b), (res)); \
-}))
-
-#define os_mul_overflow(a, b, res) __os_warn_unused(__extension__({ \
-       __OS_TYPE_CHECK((a), (b)); \
-       __OS_TYPE_CHECK((b), *(res)); \
-       __os_mul_overflow_func((a), (b), (res)); \
-}))
-
+# error os_overflow expects type-generic builtins
 #endif /* __has_builtin(...) */
 
 /* os_add3_overflow(a, b, c) -> (a + b + c) */
@@ -158,6 +99,20 @@ __os_warn_unused(__const bool x)
        _s | _t; \
 }))
 
+/* os_convert_overflow(a) -> a [converted to the result type] */
 #define os_convert_overflow(a, res) os_add_overflow((a), 0, (res))
 
+/* os_inc_overflow(res) -> *res += 1 */
+#define os_inc_overflow(res) __os_warn_unused(__extension__({ \
+       __typeof((res)) _tmp = (res); \
+       os_add_overflow(*_tmp, 1, _tmp); \
+}))
+
+/* os_dec_overflow(res) -> *res -= 1 */
+#define os_dec_overflow(res) __os_warn_unused(__extension__({ \
+       __typeof((res)) _tmp = (res); \
+       os_sub_overflow(*_tmp, 1, _tmp); \
+}))
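
A brief sketch of the checked-arithmetic pattern these helpers enable (names and error paths are hypothetical):

    uint32_t total = base_len;
    if (os_add_overflow(total, chunk_len, &total)) {
            return EOVERFLOW;   /* a + b wrapped around */
    }
    if (os_inc_overflow(&total)) {
            return EOVERFLOW;   /* ++total wrapped around */
    }
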
+
+
 #endif /* _OS_OVERFLOW_H */
diff --git a/libkern/os/ptrtools.h b/libkern/os/ptrtools.h
new file mode 100644 (file)
index 0000000..9aaf436
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _OS_PTRTOOLS_H
+#define _OS_PTRTOOLS_H
+
+/* dereference unaligned pointer 'p' */
+#define os_unaligned_deref(p) ((__os_unaligned_type(p))(p))->val
+
+/* ensure the compiler emits at most one access to 'val' */
+#define os_access_once(val) (*((volatile __typeof__((val)) *)&(val)))
+
+#define __os_unaligned_type(p) struct { __typeof__(*(p)) val; } __attribute__((packed)) *
+
+#endif /* _OS_PTRTOOLS_H */
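
A short sketch of both helpers (buffer, offset, and flag names are hypothetical):

    #include <os/ptrtools.h>

    // Read a 32-bit value from an arbitrarily aligned offset of a byte
    // buffer without relying on natural alignment.
    uint32_t v = os_unaligned_deref((uint32_t *)(buf + off));

    // Force exactly one load of the flag per iteration while polling.
    while (os_access_once(stop_requested) == 0) {
            /* poll (sketch only) */
    }
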
index a83940b07c531a852602766e5da2c9889f64fd05..a4b9b1c3ffda98b8bbd6defce102c39ebeaf9599 100644 (file)
@@ -53,7 +53,7 @@ OS_ENUM(os_reason_libsystem_code, uint64_t,
 int
 os_fault_with_payload(uint32_t reason_namespace, uint64_t reason_code,
     void *payload, uint32_t payload_size, const char *reason_string,
-    uint64_t reason_flags);
+    uint64_t reason_flags) __attribute__((cold));
 
 #endif // !KERNEL
 
index 67deb068f441cabc21abc2e6ed1fbbcb14f631d4..0cbcdf745ef39a8374475491d525f45aaece3181 100644 (file)
@@ -1,16 +1,22 @@
+#if KERNEL
 #include <kern/assert.h>
 #include <kern/debug.h>
 #include <pexpert/pexpert.h>
 #include <kern/btlog.h>
 #include <kern/backtrace.h>
 #include <libkern/libkern.h>
+#endif
+
 #include "refcnt.h"
 
 #define OS_REFCNT_MAX_COUNT     ((os_ref_count_t)0x0FFFFFFFUL)
 
 #if OS_REFCNT_DEBUG
-os_refgrp_decl(static, global_ref_group, "all", NULL);
-static bool ref_debug_enable = false;
+extern struct os_refgrp global_ref_group;
+os_refgrp_decl(, global_ref_group, "all", NULL);
+
+extern bool ref_debug_enable;
+bool ref_debug_enable = false;
 static const size_t ref_log_nrecords = 1000000;
 
 #define REFLOG_BTDEPTH   10
@@ -22,77 +28,75 @@ static const size_t ref_log_nrecords = 1000000;
 # define __debug_only __unused
 #endif /* OS_REFCNT_DEBUG */
 
-static const char *
-ref_grp_name(struct os_refcnt __debug_only *rc)
+void
+os_ref_panic_live(void *rc)
 {
-#if OS_REFCNT_DEBUG
-       if (rc && rc->ref_group && rc->ref_group->grp_name) {
-               return rc->ref_group->grp_name;
-       }
-#endif
-       return "<null>";
+       panic("os_refcnt: unexpected release of final reference (rc=%p)\n", rc);
+       __builtin_unreachable();
 }
 
-__attribute__((cold, noinline, not_tail_called, noreturn))
+__abortlike
 static void
-os_ref_panic_underflow(struct os_refcnt *rc)
+os_ref_panic_underflow(void *rc)
 {
-       panic("os_refcnt: underflow (rc=%p, grp=%s)\n", rc, ref_grp_name(rc));
+       panic("os_refcnt: underflow (rc=%p)\n", rc);
        __builtin_unreachable();
 }
 
-static inline void
-os_ref_check_underflow(struct os_refcnt *rc, os_ref_count_t count)
+__abortlike
+static void
+os_ref_panic_resurrection(void *rc)
 {
-       if (__improbable(count == 0)) {
-               os_ref_panic_underflow(rc);
-       }
+       panic("os_refcnt: attempted resurrection (rc=%p)\n", rc);
+       __builtin_unreachable();
 }
 
-__attribute__((cold, noinline, not_tail_called, noreturn))
+__abortlike
 static void
-os_ref_panic_resurrection(struct os_refcnt *rc)
+os_ref_panic_overflow(void *rc)
 {
-       panic("os_refcnt: used unsafely when zero (rc=%p, grp=%s)\n", rc, ref_grp_name(rc));
+       panic("os_refcnt: overflow (rc=%p)\n", rc);
        __builtin_unreachable();
 }
 
 static inline void
-os_ref_assert_referenced(struct os_refcnt *rc, os_ref_count_t count)
+os_ref_check_underflow(void *rc, os_ref_count_t count)
 {
        if (__improbable(count == 0)) {
-               os_ref_panic_resurrection(rc);
+               os_ref_panic_underflow(rc);
        }
 }
 
-__attribute__((cold, noinline, not_tail_called, noreturn))
-static void
-os_ref_panic_overflow(struct os_refcnt *rc)
+static inline void
+os_ref_check_overflow(os_ref_atomic_t *rc, os_ref_count_t count)
 {
-       panic("os_refcnt: overflow (rc=%p, grp=%s)\n", rc, ref_grp_name(rc));
-       __builtin_unreachable();
+       if (__improbable(count >= OS_REFCNT_MAX_COUNT)) {
+               os_ref_panic_overflow(rc);
+       }
 }
 
 static inline void
-os_ref_check_overflow(struct os_refcnt *rc, os_ref_count_t count)
+os_ref_assert_referenced(void *rc, os_ref_count_t count)
 {
-       if (__improbable(count >= OS_REFCNT_MAX_COUNT)) {
-               os_ref_panic_overflow(rc);
+       if (__improbable(count == 0)) {
+               os_ref_panic_resurrection(rc);
        }
 }
 
-static void
-os_ref_check_retain(struct os_refcnt *rc, os_ref_count_t count)
+static inline void
+os_ref_check_retain(os_ref_atomic_t *rc, os_ref_count_t count)
 {
        os_ref_assert_referenced(rc, count);
        os_ref_check_overflow(rc, count);
 }
 
 #if OS_REFCNT_DEBUG
+#if KERNEL
+__attribute__((cold, noinline))
 static void
 ref_log_op(struct os_refgrp *grp, void *elem, int op)
 {
-       if (!ref_debug_enable || grp == NULL) {
+       if (grp == NULL) {
                return;
        }
 
@@ -102,10 +106,11 @@ ref_log_op(struct os_refgrp *grp, void *elem, int op)
        }
 
        uintptr_t bt[REFLOG_BTDEPTH];
-       uint32_t nframes = backtrace(bt, REFLOG_BTDEPTH);
+       uint32_t nframes = backtrace(bt, REFLOG_BTDEPTH, NULL);
        btlog_add_entry((btlog_t *)grp->grp_log, elem, op, (void **)bt, nframes);
 }
 
+__attribute__((cold, noinline))
 static void
 ref_log_drop(struct os_refgrp *grp, void *elem)
 {
@@ -121,6 +126,7 @@ ref_log_drop(struct os_refgrp *grp, void *elem)
        btlog_remove_entries_for_element(grp->grp_log, elem);
 }
 
+__attribute__((cold, noinline))
 static void
 ref_log_init(struct os_refgrp *grp)
 {
@@ -145,17 +151,30 @@ ref_log_init(struct os_refgrp *grp)
                if (strcmp(g, grp->grp_name) == 0) {
                        /* enable logging on this refgrp */
                        grp->grp_log = btlog_create(ref_log_nrecords, REFLOG_BTDEPTH, true);
-                       assert(grp->grp_log);
                        return;
                }
        }
 }
+#else
+
+#ifndef ref_log_init
+# define ref_log_init(...) do {} while (0)
+#endif
+#ifndef ref_log_op
+# define ref_log_op(...) do {} while (0)
+#endif
+#ifndef ref_log_drop
+# define ref_log_drop(...) do {} while (0)
+#endif
+
+#endif /* KERNEL */
 
 /*
  * attach a new refcnt to a group
  */
+__attribute__((cold, noinline))
 static void
-ref_attach_to_group(struct os_refcnt *rc, struct os_refgrp *grp, os_ref_count_t init_count)
+ref_attach_to_group(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t init_count)
 {
        if (grp == NULL) {
                return;
@@ -181,7 +200,7 @@ ref_attach_to_group(struct os_refcnt *rc, struct os_refgrp *grp, os_ref_count_t
        ref_attach_to_group(rc, grp->grp_parent, init_count);
 }
 
-static inline void
+static void
 ref_retain_group(struct os_refgrp *grp)
 {
        if (grp) {
@@ -191,7 +210,8 @@ ref_retain_group(struct os_refgrp *grp)
        }
 }
 
-static inline void
+__attribute__((cold, noinline))
+static void
 ref_release_group(struct os_refgrp *grp, bool final)
 {
        if (grp) {
@@ -204,48 +224,57 @@ ref_release_group(struct os_refgrp *grp, bool final)
                ref_release_group(grp->grp_parent, final);
        }
 }
-#endif
 
-#undef os_ref_init_count
-void
-os_ref_init_count(struct os_refcnt *rc, struct os_refgrp __debug_only *grp, os_ref_count_t count)
+__attribute__((cold, noinline))
+static void
+ref_init_debug(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t count)
 {
-       atomic_init(&rc->ref_count, count);
+       ref_attach_to_group(rc, grp, count);
 
-#if OS_REFCNT_DEBUG
-       assert(count > 0);
-       if (grp) {
-               rc->ref_group = grp;
-       } else {
-               rc->ref_group = &global_ref_group;
+       for (os_ref_count_t i = 0; i < count; i++) {
+               ref_log_op(grp, (void *)rc, REFLOG_RETAIN);
        }
+}
 
-       ref_attach_to_group(rc, rc->ref_group, count);
+__attribute__((cold, noinline))
+static void
+ref_retain_debug(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp)
+{
+       ref_retain_group(grp);
+       ref_log_op(grp, (void *)rc, REFLOG_RETAIN);
+}
+#endif
 
-       for (os_ref_count_t i = 0; i < count; i++) {
-               ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN);
+void
+os_ref_init_count_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t count)
+{
+       os_ref_check_underflow(rc, count);
+       atomic_init(rc, count);
+
+#if OS_REFCNT_DEBUG
+       if (__improbable(ref_debug_enable && grp)) {
+               ref_init_debug(rc, grp, count);
        }
 #endif
 }
 
 void
-os_ref_retain(struct os_refcnt *rc)
+os_ref_retain_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp)
 {
-       os_ref_count_t old = atomic_fetch_add_explicit(&rc->ref_count, 1, memory_order_relaxed);
+       os_ref_count_t old = atomic_fetch_add_explicit(rc, 1, memory_order_relaxed);
        os_ref_check_retain(rc, old);
 
 #if OS_REFCNT_DEBUG
-       if (__improbable(ref_debug_enable)) {
-               ref_retain_group(rc->ref_group);
-               ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN);
+       if (__improbable(grp && ref_debug_enable)) {
+               ref_retain_debug(rc, grp);
        }
 #endif
 }
 
 bool
-os_ref_retain_try(struct os_refcnt *rc)
+os_ref_retain_try_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp)
 {
-       os_ref_count_t cur = os_ref_get_count(rc);
+       os_ref_count_t cur = os_ref_get_count_internal(rc);
 
        while (1) {
                if (__improbable(cur == 0)) {
@@ -254,83 +283,275 @@ os_ref_retain_try(struct os_refcnt *rc)
 
                os_ref_check_retain(rc, cur);
 
-               if (atomic_compare_exchange_weak_explicit(&rc->ref_count, &cur, cur + 1,
+               if (atomic_compare_exchange_weak_explicit(rc, &cur, cur + 1,
                    memory_order_relaxed, memory_order_relaxed)) {
-#if OS_REFCNT_DEBUG
-                       if (__improbable(ref_debug_enable)) {
-                               ref_retain_group(rc->ref_group);
-                               ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN);
-                       }
-#endif
-                       return true;
+                       break;
                }
        }
+
+#if OS_REFCNT_DEBUG
+       if (__improbable(grp && ref_debug_enable)) {
+               ref_retain_debug(rc, grp);
+       }
+#endif
+
+       return true;
 }
 
-os_ref_count_t
-os_ref_release_explicit(struct os_refcnt *rc, memory_order release_order, memory_order dealloc_order)
+__attribute__((always_inline))
+static inline os_ref_count_t
+_os_ref_release_inline(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp,
+    memory_order release_order, memory_order dealloc_order)
 {
+       os_ref_count_t val;
+
 #if OS_REFCNT_DEBUG
-       /*
-        * Care not to use 'rc' after the decrement because it might be deallocated
-        * under us.
-        */
-       struct os_refgrp *grp = rc->ref_group;
-       if (__improbable(ref_debug_enable)) {
+       if (__improbable(grp && ref_debug_enable)) {
+               /*
+                * Care not to use 'rc' after the decrement because it might be deallocated
+                * under us.
+                */
                ref_log_op(grp, (void *)rc, REFLOG_RELEASE);
        }
 #endif
 
-       os_ref_count_t val = atomic_fetch_sub_explicit(&rc->ref_count, 1, release_order);
+       val = atomic_fetch_sub_explicit(rc, 1, release_order);
        os_ref_check_underflow(rc, val);
        if (__improbable(--val == 0)) {
-               atomic_load_explicit(&rc->ref_count, dealloc_order);
+               atomic_load_explicit(rc, dealloc_order);
+       }
+
 #if OS_REFCNT_DEBUG
-               if (__improbable(ref_debug_enable)) {
+       if (__improbable(grp && ref_debug_enable)) {
+               if (val == 0) {
                        ref_log_drop(grp, (void *)rc); /* rc is only used as an identifier */
                }
+               ref_release_group(grp, !val);
+       }
 #endif
+
+       return val;
+}
+
+__attribute__((noinline))
+os_ref_count_t
+os_ref_release_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp,
+    memory_order release_order, memory_order dealloc_order)
+{
+       // Legacy exported interface with bad codegen due to the barriers
+       // not being immediate
+       //
+       // Also serves as the debug function
+       return _os_ref_release_inline(rc, grp, release_order, dealloc_order);
+}
+
+os_ref_count_t
+os_ref_release_barrier_internal(os_ref_atomic_t *rc,
+    struct os_refgrp * __debug_only grp)
+{
+#if OS_REFCNT_DEBUG
+       if (__improbable(grp && ref_debug_enable)) {
+               return os_ref_release_internal(rc, grp,
+                          memory_order_release, memory_order_acquire);
        }
+#endif
+       return _os_ref_release_inline(rc, NULL,
+                  memory_order_release, memory_order_acquire);
+}
 
+os_ref_count_t
+os_ref_release_relaxed_internal(os_ref_atomic_t *rc,
+    struct os_refgrp * __debug_only grp)
+{
 #if OS_REFCNT_DEBUG
-       if (__improbable(ref_debug_enable)) {
+       if (__improbable(grp && ref_debug_enable)) {
+               return os_ref_release_internal(rc, grp,
+                          memory_order_relaxed, memory_order_relaxed);
+       }
+#endif
+       return _os_ref_release_inline(rc, NULL,
+                  memory_order_relaxed, memory_order_relaxed);
+}
+
+void
+os_ref_retain_locked_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp)
+{
+       os_ref_count_t val = os_ref_get_count_internal(rc);
+       os_ref_check_retain(rc, val);
+       atomic_store_explicit(rc, ++val, memory_order_relaxed);
+
+#if OS_REFCNT_DEBUG
+       if (__improbable(grp && ref_debug_enable)) {
+               ref_retain_debug(rc, grp);
+       }
+#endif
+}
+
+os_ref_count_t
+os_ref_release_locked_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp)
+{
+       os_ref_count_t val = os_ref_get_count_internal(rc);
+       os_ref_check_underflow(rc, val);
+       atomic_store_explicit(rc, --val, memory_order_relaxed);
+
+#if OS_REFCNT_DEBUG
+       if (__improbable(grp && ref_debug_enable)) {
                ref_release_group(grp, !val);
+               ref_log_op(grp, (void *)rc, REFLOG_RELEASE);
+               if (val == 0) {
+                       ref_log_drop(grp, (void *)rc);
+               }
        }
 #endif
 
        return val;
 }
 
+/*
+ * Bitwise API
+ */
+
+os_ref_count_t
+os_ref_get_count_mask(os_ref_atomic_t *rc, os_ref_count_t bits)
+{
+       os_ref_count_t ret;
+       ret = os_ref_get_count_raw(rc);
+       return ret >> bits;
+}
+
+#undef os_ref_init_count_mask
 void
-os_ref_retain_locked(struct os_refcnt *rc)
+os_ref_init_count_mask(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp,
+    os_ref_count_t init_count, os_ref_count_t init_bits, os_ref_count_t b)
 {
-       os_ref_count_t val = atomic_load_explicit(&rc->ref_count, memory_order_relaxed);
-       os_ref_check_retain(rc, val);
-       atomic_store_explicit(&rc->ref_count, ++val, memory_order_relaxed);
+       assert(init_bits < (1U << b));
+       os_ref_check_underflow(rc, init_count);
+       atomic_init(rc, (init_count << b) | init_bits);
+
+#if OS_REFCNT_DEBUG
+       if (__improbable(ref_debug_enable && grp)) {
+               ref_init_debug(rc, grp, init_count);
+       }
+#endif
+}
+
+#undef os_ref_retain_mask
+void
+os_ref_retain_mask(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t bits)
+{
+       os_ref_count_t old = atomic_fetch_add_explicit(rc, 1U << bits, memory_order_relaxed);
+       os_ref_check_overflow(rc, old);
+       os_ref_assert_referenced(rc, old >> bits);
 
 #if OS_REFCNT_DEBUG
-       if (__improbable(ref_debug_enable)) {
-               ref_retain_group(rc->ref_group);
-               ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN);
+       if (__improbable(grp && ref_debug_enable)) {
+               ref_retain_debug(rc, grp);
        }
 #endif
 }
 
+#undef os_ref_release_mask_internal
 os_ref_count_t
-os_ref_release_locked(struct os_refcnt *rc)
+os_ref_release_mask_internal(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t bits,
+    memory_order release_order, memory_order dealloc_order)
 {
-       os_ref_count_t val = atomic_load_explicit(&rc->ref_count, memory_order_relaxed);
+#if OS_REFCNT_DEBUG
+       if (__improbable(grp && ref_debug_enable)) {
+               /*
+                * Care not to use 'rc' after the decrement because it might be deallocated
+                * under us.
+                */
+               ref_log_op(grp, (void *)rc, REFLOG_RELEASE);
+       }
+#endif
+
+       os_ref_count_t val = atomic_fetch_sub_explicit(rc, 1U << bits, release_order);
+       val >>= bits;
        os_ref_check_underflow(rc, val);
-       atomic_store_explicit(&rc->ref_count, --val, memory_order_relaxed);
+       if (__improbable(--val == 0)) {
+               atomic_load_explicit(rc, dealloc_order);
+       }
+
+#if OS_REFCNT_DEBUG
+       if (__improbable(grp && ref_debug_enable)) {
+               if (val == 0) {
+                       ref_log_drop(grp, (void *)rc); /* rc is only used as an identifier */
+               }
+               ref_release_group(grp, !val);
+       }
+#endif
+
+       return val;
+}
+
+#undef os_ref_retain_try_mask
+bool
+os_ref_retain_try_mask(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t bits)
+{
+       os_ref_count_t cur = os_ref_get_count_internal(rc);
+
+       while (1) {
+               if (__improbable((cur >> bits) == 0)) {
+                       return false;
+               }
+
+               os_ref_check_overflow(rc, cur);
+
+               os_ref_count_t next = cur + (1U << bits);
+               if (atomic_compare_exchange_weak_explicit(rc, &cur, next,
+                   memory_order_relaxed, memory_order_relaxed)) {
+                       break;
+               }
+       }
+
+#if OS_REFCNT_DEBUG
+       if (__improbable(grp && ref_debug_enable)) {
+               ref_retain_debug(rc, grp);
+       }
+#endif
+
+       return true;
+}
+
+#undef os_ref_retain_locked_mask
+void
+os_ref_retain_locked_mask(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t bits)
+{
+       os_ref_count_t val = os_ref_get_count_internal(rc);
+
+       os_ref_check_overflow(rc, val);
+       os_ref_assert_referenced(rc, val >> bits);
+
+       val += (1U << bits);
+       atomic_store_explicit(rc, val, memory_order_relaxed);
+
+#if OS_REFCNT_DEBUG
+       if (__improbable(grp && ref_debug_enable)) {
+               ref_retain_debug(rc, grp);
+       }
+#endif
+}
+
+#undef os_ref_release_locked_mask
+os_ref_count_t
+os_ref_release_locked_mask(os_ref_atomic_t *rc, struct os_refgrp * __debug_only grp, os_ref_count_t bits)
+{
+       os_ref_count_t val = os_ref_get_count_internal(rc);
+       os_ref_check_underflow(rc, val >> bits);
+       val -= (1U << bits);
+       atomic_store_explicit(rc, val, memory_order_relaxed);
+
+       val >>= bits;
 
 #if OS_REFCNT_DEBUG
-       if (__improbable(ref_debug_enable)) {
-               ref_release_group(rc->ref_group, !val);
-               ref_log_op(rc->ref_group, (void *)rc, REFLOG_RELEASE);
+       if (__improbable(grp && ref_debug_enable)) {
+               ref_release_group(grp, !val);
+               ref_log_op(grp, (void *)rc, REFLOG_RELEASE);
                if (val == 0) {
-                       ref_log_drop(rc->ref_group, (void *)rc);
+                       ref_log_drop(grp, (void *)rc);
                }
        }
 #endif
+
        return val;
 }
index e306d35522caed0be6f569dda6e230c06ebd9e18..bca8fcdf8150b84d6a74e1e3da2467c7a747e089 100644 (file)
  * operations and requires no external synchronization, whereas the locked flavor
  * assumes the refcnt object is locked by the caller. It is NOT safe to
  * mix-and-match locked and atomic calls.
+ *
+ * 'refgrp's are a way to (hierarchically) group like refcount objects for
+ * debugging purposes. The group keeps track of the total number and aggregate
+ * reference count of member refcounts, and the "rlog=" boot-arg is used to enable
+ * refcount logging by group name. Named groups can be created explicitly with
+ * os_refgrp_decl(), or implicitly by passing NULL for the refgrp when
+ * initializing a refcnt object. In the latter case, the group name is the same as
+ * the function enclosing the init call. Groups are only available on DEV or DEBUG
+ * builds, and are otherwise compiled out.
  */
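
A minimal sketch of the grouping described above (group and structure names are hypothetical; kernel context assumed):

    os_refgrp_decl(static, widget_grp, "widgets", NULL);

    struct widget {
            struct os_refcnt w_refs;
    };

    // Counts initialized against the group are aggregated under "widgets",
    // and booting with rlog=widgets enables logging for just this group.
    os_ref_init(&w->w_refs, &widget_grp);
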
 
 #include <stdatomic.h>
@@ -48,40 +57,33 @@ typedef struct os_refcnt os_refcnt_t;
 
 /* type of the internal counter */
 typedef uint32_t os_ref_count_t;
-
-#if DEVELOPMENT || DEBUG
-# define OS_REFCNT_DEBUG 1
-#else
-# define OS_REFCNT_DEBUG 0
-#endif
+typedef _Atomic(os_ref_count_t) os_ref_atomic_t;
 
 /*
- * Debugging is keyed off ref_group, so leave that field for kexts so that the
- * combination of dev/debug kernel and release kext works.
+ * OS_REF_INITIALIZER
+ * OS_REF_ATOMIC_INITIALIZER
+ *
+ * Static initializers that create refcnt objects with safe initial values for use
+ * between declaration and initialization (os_ref*_init()). Equivalent to zeroing.
  */
-#if XNU_KERNEL_PRIVATE
-# define OS_REFCNT_HAS_GROUP OS_REFCNT_DEBUG
-#else
-# define OS_REFCNT_HAS_GROUP 1
-#endif
 
-struct os_refcnt {
-       _Atomic os_ref_count_t ref_count;
-#if OS_REFCNT_HAS_GROUP
-       struct os_refgrp *ref_group;
+#ifndef KERNEL
+# include <stdlib.h>
+# include <stdio.h>
+# ifndef __improbable
+#  define __improbable(x) x
+# endif
+# ifndef panic
+#  define panic(x, ...) do { fprintf(stderr, x, __VA_ARGS__); abort(); } while (0)
+# endif
 #endif
-};
-
-#if OS_REFCNT_DEBUG
-struct os_refgrp {
-       const char *const grp_name;
-       _Atomic os_ref_count_t grp_children; /* number of refcount objects in group */
-       _Atomic os_ref_count_t grp_count;    /* current reference count of group */
-       _Atomic uint64_t grp_retain_total;
-       _Atomic uint64_t grp_release_total;
-       struct os_refgrp *grp_parent;
-       void *grp_log;                       /* refcount logging context */
-};
+
+#ifndef OS_REFCNT_DEBUG
+# if DEVELOPMENT || DEBUG
+#  define OS_REFCNT_DEBUG 1
+# else
+#  define OS_REFCNT_DEBUG 0
+# endif
 #endif
 
 #if __has_attribute(diagnose_if)
@@ -97,33 +99,16 @@ __BEGIN_DECLS
  * os_ref_init_count: initialize an os_refcnt with a specific count >= 1
  */
 #define os_ref_init(rc, grp) os_ref_init_count((rc), (grp), 1)
-void os_ref_init_count(struct os_refcnt *, struct os_refgrp *, os_ref_count_t count)
+static void os_ref_init_count(struct os_refcnt *, struct os_refgrp *, os_ref_count_t count)
 os_error_if(count == 0, "Reference count must be non-zero initialized");
 
-#if OS_REFCNT_DEBUG
-# define os_refgrp_decl(qual, var, name, parent) \
-       qual struct os_refgrp __attribute__((section("__DATA,__refgrps"))) var = { \
-               .grp_name =          (name), \
-               .grp_children =      ATOMIC_VAR_INIT(0), \
-               .grp_count =         ATOMIC_VAR_INIT(0), \
-               .grp_retain_total =  ATOMIC_VAR_INIT(0), \
-               .grp_release_total = ATOMIC_VAR_INIT(0), \
-               .grp_parent =        (parent), \
-               .grp_log =           NULL, \
-       }
-
-/* Create a default group based on the init() callsite if no explicit group
- * is provided. */
-# define os_ref_init_count(rc, grp, count) ({ \
-               os_refgrp_decl(static, __grp, __func__, NULL); \
-               (os_ref_init_count)((rc), (grp) ? (grp) : &__grp, (count)); \
-       })
-#else
-# define os_refgrp_decl(...)
-# define os_ref_init_count(rc, grp, count) (os_ref_init_count)((rc), NULL, (count))
-#endif /* OS_REFCNT_DEBUG */
+/*
+ * os_refgrp_decl(qual, var, name, parent): declare a refgroup object 'var' with
+ *   given name string and parent group.
+ */
 
 /*
+ *
  * os_ref_retain: acquire a reference (increment reference count by 1) atomically.
  *
  * os_ref_release: release a reference (decrement reference count) atomically and
@@ -138,33 +123,10 @@ os_error_if(count == 0, "Reference count must be non-zero initialized");
  *
  * os_ref_release_live: release a reference that is guaranteed not to be the last one.
  */
-void os_ref_retain(struct os_refcnt *);
-
-os_ref_count_t os_ref_release_explicit(struct os_refcnt *rc,
-    memory_order release_order, memory_order dealloc_order) OS_WARN_RESULT;
-
-static inline os_ref_count_t OS_WARN_RESULT
-os_ref_release(struct os_refcnt *rc)
-{
-       return os_ref_release_explicit(rc, memory_order_release, memory_order_acquire);
-}
-
-static inline os_ref_count_t OS_WARN_RESULT
-os_ref_release_relaxed(struct os_refcnt *rc)
-{
-       return os_ref_release_explicit(rc, memory_order_relaxed, memory_order_relaxed);
-}
-
-static inline void
-os_ref_release_live(struct os_refcnt *rc)
-{
-       if (__improbable(os_ref_release_explicit(rc,
-           memory_order_release, memory_order_relaxed) == 0)) {
-               panic("os_refcnt: unexpected release of final reference (rc=%p)\n", rc);
-               __builtin_unreachable();
-       }
-}
-
+static void os_ref_retain(struct os_refcnt *);
+static os_ref_count_t os_ref_release(struct os_refcnt *) OS_WARN_RESULT;
+static os_ref_count_t os_ref_release_relaxed(struct os_refcnt *) OS_WARN_RESULT;
+static void os_ref_release_live(struct os_refcnt *);
 
 /*
  * os_ref_retain_try: a variant of atomic retain that fails for objects with a
@@ -174,8 +136,7 @@ os_ref_release_live(struct os_refcnt *rc)
  *             for objects stored in a collection, because no lock is required on the
  *             release() side until the object is deallocated.
  */
-bool os_ref_retain_try(struct os_refcnt *) OS_WARN_RESULT;
-
+static bool os_ref_retain_try(struct os_refcnt *) OS_WARN_RESULT;
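
A sketch of the collection pattern mentioned above (`widget_table_find` is hypothetical; the table lock is assumed held across the lookup):

    struct widget *w = widget_table_find(key);
    if (w != NULL && !os_ref_retain_try(&w->w_refs)) {
            /* final reference already being dropped; treat as absent */
            w = NULL;
    }
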
 
 /*
  * os_ref_retain_locked: acquire a reference on an object protected by a held
@@ -185,20 +146,71 @@ bool os_ref_retain_try(struct os_refcnt *) OS_WARN_RESULT;
  * os_ref_release_locked: release a reference on an object protected by a held
  *             lock.
  */
-void os_ref_retain_locked(struct os_refcnt *);
-os_ref_count_t os_ref_release_locked(struct os_refcnt *) OS_WARN_RESULT;
-
+static void os_ref_retain_locked(struct os_refcnt *);
+static os_ref_count_t os_ref_release_locked(struct os_refcnt *) OS_WARN_RESULT;
 
 /*
  * os_ref_get_count: return the current reference count. This is unsafe for
  *             synchronization.
  */
-static inline os_ref_count_t
-os_ref_get_count(struct os_refcnt *rc)
-{
-       return atomic_load_explicit(&rc->ref_count, memory_order_relaxed);
-}
+static os_ref_count_t os_ref_get_count(struct os_refcnt *rc);
+
+
+#if XNU_KERNEL_PRIVATE
+/*
+ * Raw API that uses a plain atomic counter (os_ref_atomic_t) and a separate
+ * refgroup. This can be used in situations where the refcount object must be
+ * fixed size, for example for embedding in structures with ABI stability
+ * requirements.
+ */
+
+#define os_ref_init_raw(rc, grp) os_ref_init_count_raw((rc), (grp), 1)
+static void os_ref_init_count_raw(os_ref_atomic_t *, struct os_refgrp *, os_ref_count_t count)
+os_error_if(count == 0, "Reference count must be non-zero initialized");
+static void os_ref_retain_raw(os_ref_atomic_t *, struct os_refgrp *);
+static os_ref_count_t os_ref_release_raw(os_ref_atomic_t *, struct os_refgrp *) OS_WARN_RESULT;
+static os_ref_count_t os_ref_release_relaxed_raw(os_ref_atomic_t *, struct os_refgrp *) OS_WARN_RESULT;
+static void os_ref_release_live_raw(os_ref_atomic_t *, struct os_refgrp *);
+static bool os_ref_retain_try_raw(os_ref_atomic_t *, struct os_refgrp *) OS_WARN_RESULT;
+static void os_ref_retain_locked_raw(os_ref_atomic_t *, struct os_refgrp *);
+static os_ref_count_t os_ref_release_locked_raw(os_ref_atomic_t *, struct os_refgrp *) OS_WARN_RESULT;
+static os_ref_count_t os_ref_get_count_raw(os_ref_atomic_t *rc);
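
A brief sketch of the fixed-size embedding case this raw API serves (the ABI-stable structure is hypothetical):

    struct abi_object {
            uint64_t        kind;
            os_ref_atomic_t refs;   /* stays 4 bytes in all configurations */
    };

    os_refgrp_decl(static, abi_grp, "abi-objects", NULL);

    os_ref_init_raw(&obj->refs, &abi_grp);
    if (os_ref_release_raw(&obj->refs, &abi_grp) == 0) {
            /* last reference dropped; tear down the object (sketch) */
    }
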
+
+
+/*
+ * Bitwise API: like the raw API, but allows some bits in the refcount value to be
+ * reserved for other purposes. 'b' defines the number of trailing (LSB) reserved
+ * bits, which the refcnt_raw API will never modify (except at init()).
+ *
+ * It is assumed that users of this API always use atomic ops on the
+ * os_ref_atomic_t (or hold a lock for the locked variants), and never modify the
+ * top (32 - 'b') bits.
+ *
+ * Due to guard bits, the maximum reference count is 2^(28 - 'b') - 1, and the
+ * maximum 'b' is 26 bits. This API can also be used just to limit the max
+ * refcount.
+ */
+
+/* Initialize the reference count and reserved bits */
+#define os_ref_init_mask(rc, grp, b) os_ref_init_count_mask((rc), (grp), 1, 0, (b))
+void os_ref_init_count_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t init_count,
+    os_ref_count_t init_bits, os_ref_count_t b)
+os_error_if(init_count == 0, "Reference count must be non-zero initialized")
+os_error_if(b > 26, "Bitwise reference count limited to 26 bits")
+os_error_if(init_bits >= (1U << b), "Bits out of range");
+
+void os_ref_retain_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b);
+static os_ref_count_t os_ref_release_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b) OS_WARN_RESULT;
+static os_ref_count_t os_ref_release_relaxed_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b) OS_WARN_RESULT;
+static void os_ref_release_live_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b);
+bool os_ref_retain_try_mask(os_ref_atomic_t *, struct os_refgrp *grp, os_ref_count_t b) OS_WARN_RESULT;
+void os_ref_retain_locked_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b);
+os_ref_count_t os_ref_release_locked_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b) OS_WARN_RESULT;
+os_ref_count_t os_ref_get_count_mask(os_ref_atomic_t *rc, os_ref_count_t b);
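
A short sketch with two reserved low bits (the flag value is hypothetical; the maximum count drops to 2^26 - 1 here, as described above):

    #define OBJ_FLAG_BITS 2

    os_ref_atomic_t rc;
    /* count = 1; the two caller-owned LSB flag bits start as 0x1 */
    os_ref_init_count_mask(&rc, NULL, 1, 0x1, OBJ_FLAG_BITS);
    os_ref_retain_mask(&rc, NULL, OBJ_FLAG_BITS);
    if (os_ref_release_mask(&rc, NULL, OBJ_FLAG_BITS) == 1) {
            /* one reference remains; the flag bits are untouched */
    }
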
+
+#endif /* XNU_KERNEL_PRIVATE */
 
 __END_DECLS
 
+#include <os/refcnt_internal.h>
 #endif
diff --git a/libkern/os/refcnt_internal.h b/libkern/os/refcnt_internal.h
new file mode 100644 (file)
index 0000000..fdc26ec
--- /dev/null
@@ -0,0 +1,301 @@
+#ifndef _OS_REFCNT_INTERNAL_H
+#define _OS_REFCNT_INTERNAL_H
+
+struct os_refcnt {
+       os_ref_atomic_t ref_count;
+#if OS_REFCNT_DEBUG
+       struct os_refgrp *ref_group;
+#endif
+};
+
+#if OS_REFCNT_DEBUG
+struct os_refgrp {
+       const char *const grp_name;
+       os_ref_atomic_t grp_children;  /* number of refcount objects in group */
+       os_ref_atomic_t grp_count;     /* current reference count of group */
+       _Atomic uint64_t grp_retain_total;
+       _Atomic uint64_t grp_release_total;
+       struct os_refgrp *grp_parent;
+       void *grp_log;                       /* refcount logging context */
+};
+#endif
+
+# define OS_REF_ATOMIC_INITIALIZER ATOMIC_VAR_INIT(0)
+#if OS_REFCNT_DEBUG
+# define OS_REF_INITIALIZER { .ref_count = OS_REF_ATOMIC_INITIALIZER, .ref_group = NULL }
+#else
+# define OS_REF_INITIALIZER { .ref_count = OS_REF_ATOMIC_INITIALIZER }
+#endif
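
A one-line sketch of the static initializer (hypothetical variable; per the refcnt.h comment this is equivalent to zeroing, so it is only a safe placeholder between declaration and os_ref_init()):

    static struct os_refcnt g_refs = OS_REF_INITIALIZER;
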
+
+__BEGIN_DECLS
+
+#if OS_REFCNT_DEBUG
+# define os_ref_if_debug(x, y) x
+#else
+# define os_ref_if_debug(x, y) y
+#endif
+
+void os_ref_init_count_external(os_ref_atomic_t *, struct os_refgrp *, os_ref_count_t);
+void os_ref_retain_external(os_ref_atomic_t *, struct os_refgrp *);
+void os_ref_retain_locked_external(os_ref_atomic_t *, struct os_refgrp *);
+os_ref_count_t os_ref_release_external(os_ref_atomic_t *, struct os_refgrp *,
+    memory_order release_order, memory_order dealloc_order);
+os_ref_count_t os_ref_release_relaxed_external(os_ref_atomic_t *, struct os_refgrp *);
+os_ref_count_t os_ref_release_barrier_external(os_ref_atomic_t *, struct os_refgrp *);
+os_ref_count_t os_ref_release_locked_external(os_ref_atomic_t *, struct os_refgrp *);
+bool os_ref_retain_try_external(os_ref_atomic_t *, struct os_refgrp *);
+
+#if XNU_KERNEL_PRIVATE
+void os_ref_init_count_internal(os_ref_atomic_t *, struct os_refgrp *, os_ref_count_t);
+void os_ref_retain_internal(os_ref_atomic_t *, struct os_refgrp *);
+os_ref_count_t os_ref_release_relaxed_internal(os_ref_atomic_t *, struct os_refgrp *);
+os_ref_count_t os_ref_release_barrier_internal(os_ref_atomic_t *, struct os_refgrp *);
+os_ref_count_t os_ref_release_internal(os_ref_atomic_t *, struct os_refgrp *,
+    memory_order release_order, memory_order dealloc_order);
+bool os_ref_retain_try_internal(os_ref_atomic_t *, struct os_refgrp *);
+void os_ref_retain_locked_internal(os_ref_atomic_t *, struct os_refgrp *);
+os_ref_count_t os_ref_release_locked_internal(os_ref_atomic_t *, struct os_refgrp *);
+#else
+/* For now, the internal and external variants are identical */
+#define os_ref_init_count_internal      os_ref_init_count_external
+#define os_ref_retain_internal          os_ref_retain_external
+#define os_ref_retain_locked_internal   os_ref_retain_locked_external
+#define os_ref_release_internal         os_ref_release_external
+#define os_ref_release_barrier_internal os_ref_release_barrier_external
+#define os_ref_release_relaxed_internal os_ref_release_relaxed_external
+#define os_ref_release_locked_internal  os_ref_release_locked_external
+#define os_ref_retain_try_internal      os_ref_retain_try_external
+#endif
+
+static inline void
+os_ref_init_count(struct os_refcnt *rc, struct os_refgrp * __unused grp, os_ref_count_t count)
+{
+#if OS_REFCNT_DEBUG
+       rc->ref_group = grp;
+#endif
+       os_ref_init_count_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL), count);
+}
+
+static inline void
+os_ref_retain(struct os_refcnt *rc)
+{
+       os_ref_retain_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL));
+}
+
+static inline os_ref_count_t
+os_ref_release_locked(struct os_refcnt *rc)
+{
+       return os_ref_release_locked_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL));
+}
+
+static inline void
+os_ref_retain_locked(struct os_refcnt *rc)
+{
+       os_ref_retain_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL));
+}
+
+static inline bool
+os_ref_retain_try(struct os_refcnt *rc)
+{
+       return os_ref_retain_try_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL));
+}
+
+__deprecated_msg("inefficient codegen, prefer os_ref_release / os_ref_release_relaxed")
+static inline os_ref_count_t OS_WARN_RESULT
+os_ref_release_explicit(struct os_refcnt *rc, memory_order release_order, memory_order dealloc_order)
+{
+       return os_ref_release_internal(&rc->ref_count, os_ref_if_debug(rc->ref_group, NULL),
+                  release_order, dealloc_order);
+}
+
+#if OS_REFCNT_DEBUG
+# define os_refgrp_decl(qual, var, name, parent) \
+       qual struct os_refgrp __attribute__((section("__DATA,__refgrps"))) var = { \
+               .grp_name =          (name), \
+               .grp_children =      ATOMIC_VAR_INIT(0u), \
+               .grp_count =         ATOMIC_VAR_INIT(0u), \
+               .grp_retain_total =  ATOMIC_VAR_INIT(0u), \
+               .grp_release_total = ATOMIC_VAR_INIT(0u), \
+               .grp_parent =        (parent), \
+               .grp_log =           NULL, \
+       }
+# define os_refgrp_decl_extern(var) \
+       extern struct os_refgrp var
+
+/* Create a default group based on the init() callsite if no explicit group
+ * is provided. */
+# define os_ref_init_count(rc, grp, count) ({ \
+               os_refgrp_decl(static, __grp, __func__, NULL); \
+               (os_ref_init_count)((rc), (grp) ? (grp) : &__grp, (count)); \
+       })
+
+#else /* OS_REFCNT_DEBUG */
+
+# define os_refgrp_decl(qual, var, name, parent) extern struct os_refgrp var __attribute__((unused))
+# define os_refgrp_decl_extern(var) extern struct os_refgrp var
+# define os_ref_init_count(rc, grp, count) (os_ref_init_count)((rc), NULL, (count))
+
+#endif /* OS_REFCNT_DEBUG */
+
+#if XNU_KERNEL_PRIVATE
+void os_ref_panic_live(void *rc) __abortlike;
+#else
+__abortlike
+static inline void
+os_ref_panic_live(void *rc)
+{
+       panic("os_refcnt: unexpected release of final reference (rc=%p)\n", rc);
+       __builtin_unreachable();
+}
+#endif
+
+static inline os_ref_count_t OS_WARN_RESULT
+os_ref_release(struct os_refcnt *rc)
+{
+       return os_ref_release_barrier_internal(&rc->ref_count,
+                  os_ref_if_debug(rc->ref_group, NULL));
+}
+
+static inline os_ref_count_t OS_WARN_RESULT
+os_ref_release_relaxed(struct os_refcnt *rc)
+{
+       return os_ref_release_relaxed_internal(&rc->ref_count,
+                  os_ref_if_debug(rc->ref_group, NULL));
+}
+
+static inline void
+os_ref_release_live(struct os_refcnt *rc)
+{
+       if (__improbable(os_ref_release(rc) == 0)) {
+               os_ref_panic_live(rc);
+       }
+}
+
+static inline os_ref_count_t
+os_ref_get_count_internal(os_ref_atomic_t *rc)
+{
+       return atomic_load_explicit(rc, memory_order_relaxed);
+}
+
+static inline os_ref_count_t
+os_ref_get_count(struct os_refcnt *rc)
+{
+       return os_ref_get_count_internal(&rc->ref_count);
+}
+
+
+
+/*
+ * Raw API
+ */
+
+static inline void
+os_ref_init_count_raw(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t count)
+{
+       os_ref_init_count_internal(rc, grp, count);
+}
+
+static inline void
+os_ref_retain_raw(os_ref_atomic_t *rc, struct os_refgrp *grp)
+{
+       os_ref_retain_internal(rc, grp);
+}
+
+static inline os_ref_count_t
+os_ref_release_raw(os_ref_atomic_t *rc, struct os_refgrp *grp)
+{
+       return os_ref_release_barrier_internal(rc, grp);
+}
+
+static inline os_ref_count_t
+os_ref_release_relaxed_raw(os_ref_atomic_t *rc, struct os_refgrp *grp)
+{
+       return os_ref_release_relaxed_internal(rc, grp);
+}
+
+static inline void
+os_ref_release_live_raw(os_ref_atomic_t *rc, struct os_refgrp *grp)
+{
+       if (__improbable(os_ref_release_barrier_internal(rc, grp) == 0)) {
+               os_ref_panic_live(rc);
+       }
+}
+
+static inline bool
+os_ref_retain_try_raw(os_ref_atomic_t *rc, struct os_refgrp *grp)
+{
+       return os_ref_retain_try_internal(rc, grp);
+}
+
+static inline void
+os_ref_retain_locked_raw(os_ref_atomic_t *rc, struct os_refgrp *grp)
+{
+       os_ref_retain_locked_internal(rc, grp);
+}
+
+static inline os_ref_count_t
+os_ref_release_locked_raw(os_ref_atomic_t *rc, struct os_refgrp *grp)
+{
+       return os_ref_release_locked_internal(rc, grp);
+}
+
+static inline os_ref_count_t
+os_ref_get_count_raw(os_ref_atomic_t *rc)
+{
+       return os_ref_get_count_internal(rc);
+}
+
+#if !OS_REFCNT_DEBUG
+/* remove the group argument for non-debug */
+#define os_ref_init_count_raw(rc, grp, count) (os_ref_init_count_raw)((rc), NULL, (count))
+#define os_ref_retain_raw(rc, grp) (os_ref_retain_raw)((rc), NULL)
+#define os_ref_release_raw(rc, grp) (os_ref_release_raw)((rc), NULL)
+#define os_ref_release_relaxed_raw(rc, grp) (os_ref_release_relaxed_raw)((rc), NULL)
+#define os_ref_release_live_raw(rc, grp) (os_ref_release_live_raw)((rc), NULL)
+#define os_ref_retain_try_raw(rc, grp) (os_ref_retain_try_raw)((rc), NULL)
+#define os_ref_retain_locked_raw(rc, grp) (os_ref_retain_locked_raw)((rc), NULL)
+#define os_ref_release_locked_raw(rc, grp) (os_ref_release_locked_raw)((rc), NULL)
+#endif
+
+#if XNU_KERNEL_PRIVATE
+os_ref_count_t os_ref_release_mask_internal(os_ref_atomic_t *rc, struct os_refgrp *grp,
+    os_ref_count_t b, memory_order release_order, memory_order dealloc_order);
+
+static inline os_ref_count_t
+os_ref_release_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b)
+{
+       return os_ref_release_mask_internal(rc, grp, b, memory_order_release, memory_order_acquire);
+}
+
+static inline os_ref_count_t
+os_ref_release_relaxed_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b)
+{
+       return os_ref_release_mask_internal(rc, grp, b, memory_order_relaxed, memory_order_relaxed);
+}
+
+static inline void
+os_ref_release_live_mask(os_ref_atomic_t *rc, struct os_refgrp *grp, os_ref_count_t b)
+{
+       if (__improbable(os_ref_release_mask_internal(rc, grp, b,
+           memory_order_release, memory_order_relaxed) == 0)) {
+               os_ref_panic_live(rc);
+       }
+}
+
+#if !OS_REFCNT_DEBUG
+/* remove the group argument for non-debug */
+#define os_ref_init_count_mask(rc, grp, init_c, init_b, b) (os_ref_init_count_mask)((rc), NULL, (init_c), (init_b), (b))
+#define os_ref_retain_mask(rc, grp, b) (os_ref_retain_mask)((rc), NULL, (b))
+#define os_ref_release_mask(rc, grp, b) (os_ref_release_mask)((rc), NULL, (b))
+#define os_ref_release_relaxed_mask(rc, grp, b) (os_ref_release_relaxed_mask)((rc), NULL, (b))
+#define os_ref_release_live_mask(rc, grp, b) (os_ref_release_live_mask)((rc), NULL, (b))
+#define os_ref_retain_try_mask(rc, grp, b) (os_ref_retain_try_mask)((rc), NULL, (b))
+#define os_ref_release_locked_mask(rc, grp, b) (os_ref_release_locked_mask)((rc), NULL, (b))
+#define os_ref_retain_locked_mask(rc, grp, b) (os_ref_retain_locked_mask)((rc), NULL, (b))
+#endif
+
+#endif
+
+__END_DECLS
+
+#endif /* _OS_REFCNT_INTERNAL_H */
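
The wrappers above encode a deliberate ordering split: os_ref_release decrements with release semantics and applies acquire ordering on the final reference (the memory_order_release / memory_order_acquire pair that os_ref_release_mask passes down), so the thread that deallocates observes every write made while other references were live, while os_ref_release_relaxed skips that cost for callers that never free on zero. The *_mask variants additionally pack flag bits below the count. Here is a minimal user-space sketch of both ideas using std::atomic; toy_release, toy_release_mask, and FLAG_BITS are illustrative names, not part of the XNU API:

#include <atomic>
#include <cassert>
#include <cstdio>

/* Analogue of the barrier release: decrement with release semantics,
 * then acquire-fence before the final reference tears the object down. */
static unsigned
toy_release(std::atomic<unsigned> &rc)
{
	unsigned old = rc.fetch_sub(1, std::memory_order_release);
	assert(old != 0 && "over-release");
	if (old == 1) {
		std::atomic_thread_fence(std::memory_order_acquire);
		/* last reference: safe to free the object here */
	}
	return old - 1;
}

/* Analogue of the *_mask variants: the low FLAG_BITS bits hold state
 * flags, so one reference is worth 1 << FLAG_BITS. (Fence on the final
 * release omitted for brevity.) */
static const unsigned FLAG_BITS = 2;
static const unsigned ONE_REF   = 1u << FLAG_BITS;

static unsigned
toy_release_mask(std::atomic<unsigned> &rc)
{
	unsigned old = rc.fetch_sub(ONE_REF, std::memory_order_release);
	return (old - ONE_REF) >> FLAG_BITS;   /* remaining count, flags stripped */
}

int
main(void)
{
	std::atomic<unsigned> rc{2};
	printf("after release: %u refs left\n", toy_release(rc));

	std::atomic<unsigned> mrc{(2u << FLAG_BITS) | 0x1}; /* count 2, one flag set */
	printf("after masked release: %u refs left\n", toy_release_mask(mrc));
	return 0;
}

Under OS_REFCNT_DEBUG the same wrappers also thread a per-callsite os_refgrp through every operation; the os_ref_init_count macro above synthesizes one statically from __func__ whenever the caller passes no explicit group.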
diff --git a/libkern/os/smart_ptr.h b/libkern/os/smart_ptr.h
new file mode 100644 (file)
index 0000000..5f89c7f
--- /dev/null
@@ -0,0 +1,523 @@
+#ifndef _OS_SMART_POINTER_H
+#define _OS_SMART_POINTER_H
+
+#include <sys/cdefs.h>
+#include <os/cpp_util.h>
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-extensions"
+
+#if __has_attribute(trivial_abi)
+# define OS_TRIVIAL_ABI __attribute__((trivial_abi))
+#else
+# error Smart pointers depend on trivial_abi attribute
+#endif
+
+#if !OS_HAS_RVALUE_REFERENCES
+# error Smart pointers depend on rvalue references
+#endif
+
+/* C++98 compatibility */
+#if !OS_HAS_NULLPTR && !defined(nullptr)
+# define nullptr NULL
+#endif
+
+#ifndef OSPTR_LOG
+# define OSPTR_LOG(x, ...) do {} while(0)
+#endif
+
+namespace os {
+static struct no_retain_t {} no_retain;
+
+template<class T, class Policy>
+class OS_TRIVIAL_ABI smart_ptr
+{
+       template<class U, class OtherPolicy> friend class smart_ptr;
+
+public:
+
+/*
+ * Default constructor, creates a null pointer
+ */
+       smart_ptr() : pointer(nullptr)
+       {
+               OSPTR_LOG("Default construct smart_ptr\n");
+       }
+
+#if OS_HAS_NULLPTR
+/*
+ * Construction from a nullptr
+ */
+       smart_ptr(os::nullptr_t) : pointer(nullptr)
+       {
+               OSPTR_LOG("Construct smart_ptr from null\n");
+       }
+#endif
+
+/*
+ * Construct from a raw pointer, taking a reference to the object
+ */
+       explicit smart_ptr(T *&p) : pointer(p)
+       {
+               OSPTR_LOG("Construct smart_ptr from raw %p\n", pointer);
+               if (pointer != nullptr) {
+                       _retain(pointer);
+               }
+       }
+
+/*
+ * Construct from a raw pointer, without bumping the refcount
+ */
+       explicit smart_ptr(T *&p, no_retain_t) : pointer(p)
+       {
+               OSPTR_LOG("Construct smart_ptr from raw %p no retain\n", pointer);
+       }
+
+/*
+ * Copy constructor from the same smart_ptr type
+ */
+       smart_ptr(smart_ptr const &rhs) : pointer(rhs.pointer)
+       {
+               OSPTR_LOG("Copy construct smart_ptr with %p\n", rhs.pointer);
+               if (pointer != nullptr) {
+                       _retain(pointer);
+               }
+       }
+
+#if !LIBKERN_NO_MEMBER_TEMPLATES
+/*
+ * Allows copy of a smart_ptr<T> from a smart_ptr<U>
+ * if U is convertible to T. For example, if T is a base class of U.
+ */
+       template<class U>
+       smart_ptr(smart_ptr<U, Policy> const &rhs) : pointer(rhs.get())
+       {
+               OSPTR_LOG("Copy construct smart_ptr with compatible %p\n", rhs.pointer);
+               if (pointer != nullptr) {
+                       _retain(pointer);
+               }
+       }
+#endif
+
+/*
+ * Assign to a smart_ptr from a raw pointer
+ */
+       smart_ptr &
+       operator=(T *&rhs)
+       {
+               OSPTR_LOG("Assign smart_ptr with replacing %p with raw %p\n", pointer, rhs);
+               smart_ptr(rhs).swap(*this);
+               return *this;
+       }
+
+#if OS_HAS_NULLPTR
+/*
+ * Assign to a smart_ptr from a null pointer
+ */
+       smart_ptr &
+       operator=(os::nullptr_t)
+       {
+               OSPTR_LOG("Assign smart_ptr to null replacing %p\n", pointer);
+               smart_ptr().swap(*this);
+               return *this;
+       }
+#endif
+
+/*
+ * Assign to a smart_ptr from a smart_ptr of the same type
+ */
+       smart_ptr &
+       operator=(smart_ptr const &rhs)
+       {
+               OSPTR_LOG("Assign smart_ptr replacing %p with %p\n", pointer, rhs.pointer);
+               smart_ptr(rhs).swap(*this);
+               return *this;
+       }
+
+#if !LIBKERN_NO_MEMBER_TEMPLATES
+/*
+ * Allows assignment of a smart_ptr<T> from a smart_ptr<U>
+ * if U is convertible to T. For example, if T is a base class of U.
+ */
+       template <class U>
+       smart_ptr &
+       operator=(smart_ptr<U, Policy> const &rhs)
+       {
+               OSPTR_LOG("Assign smart_ptr to compatible replacing %p with %p\n", pointer, rhs.pointer);
+               smart_ptr(rhs.get()).swap(*this);
+               return *this;
+       }
+#endif
+
+/*
+ * Move support
+ */
+
+#if OS_HAS_RVALUE_REFERENCES
+/*
+ * Move-construct from a different smart_ptr of the same pointer type
+ */
+       smart_ptr(smart_ptr &&rhs) : pointer(rhs.pointer)
+       {
+               OSPTR_LOG("Move construct smart_ptr with %p\n", rhs.pointer);
+               rhs.pointer = nullptr;
+       }
+
+/*
+ * Move-construct from a raw pointer
+ */
+       smart_ptr(T *&&p) : pointer(p)
+       {
+               OSPTR_LOG("Move construct smart_ptr with %p\n", pointer);
+               if (pointer != nullptr) {
+                       _retain(pointer);
+               }
+               p = nullptr;
+       }
+
+/*
+ * Move-construct from a raw pointer without bumping the refcount
+ */
+       smart_ptr(T *&&p, no_retain_t) : pointer(p)
+       {
+               OSPTR_LOG("Move construct smart_ptr with %p no retain\n", pointer);
+               p = nullptr;
+       }
+
+/*
+ * Move-assign to a smart_ptr from a raw pointer
+ */
+       smart_ptr &
+       operator=(T *&&rhs)
+       {
+               OSPTR_LOG("Move assign smart_ptr replacing %p with raw %p\n", pointer, rhs);
+               smart_ptr(os::move(rhs)).swap(*this);
+               rhs = nullptr;
+               return *this;
+       }
+
+/*
+ * Move-assign from a different smart_ptr of the same type
+ */
+       smart_ptr &
+       operator=(smart_ptr &&rhs)
+       {
+               OSPTR_LOG("Move assign smart_ptr replacing %p with %p\n", pointer, rhs.pointer);
+               smart_ptr(os::move(rhs)).swap(*this);
+               return *this;
+       }
+
+/*
+ * Move from a different smart_ptr with a compatible pointer type
+ */
+       template<class U>
+       smart_ptr(smart_ptr<U, Policy> &&rhs) : pointer(rhs.pointer)
+       {
+               OSPTR_LOG("Move construct smart_ptr with compatible %p\n", rhs.pointer);
+               rhs.pointer = nullptr;
+       }
+
+       template<class U>
+       smart_ptr &
+       operator=(smart_ptr<U, Policy> &&rhs)
+       {
+               OSPTR_LOG("Move assign smart_ptr replacing %p with compatible %p\n", pointer, rhs.pointer);
+               smart_ptr(os::move(rhs)).swap(*this);
+               return *this;
+       }
+#endif
+
+/*
+ * Destructor - decreases the object's reference count
+ */
+       ~smart_ptr()
+       {
+               OSPTR_LOG("Destroy smart_ptr with %p\n", pointer);
+               if (pointer) {
+                       _release(pointer);
+               }
+       }
+
+/*
+ * Create a new object of type T and wrap it in a smart_ptr. The object will have
+ * a reference count of 1, so destruction of the smart_ptr will result in the
+ * object being freed if the smart_ptr wasn't copied first.
+ */
+       static inline smart_ptr
+       alloc()
+       {
+               return smart_ptr(_alloc(), no_retain);
+       }
+
+       void
+       reset()
+       {
+               smart_ptr().swap(*this);
+       }
+
+       T *
+       get() const
+       {
+               return pointer;
+       }
+
+       T **
+       get_for_out_param()
+       {
+               reset();
+               return &pointer;
+       }
+
+/*
+ * Take ownership of object from raw pointer
+ */
+       void
+       attach(T *&p)
+       {
+               OSPTR_LOG("Attach smart_ptr with %p\n", p);
+               smart_ptr(p, no_retain).swap(*this);
+       }
+
+       void
+       attach(T *&&p)
+       {
+               OSPTR_LOG("Move attach smart_ptr with %p\n", p);
+               smart_ptr(os::move(p), no_retain).swap(*this);
+       }
+
+/* Return and drop ownership of pointer with NO release() */
+       T *
+       detach()
+       {
+               OSPTR_LOG("Detach smart_ptr with %p\n", pointer);
+               T *ret = pointer;
+               pointer = nullptr;
+               return ret;
+       }
+
+       T *
+       operator->() const
+       {
+               OSPTR_LOG("Dereference smart_ptr with %p\n", pointer);
+               return pointer;
+       }
+
+       explicit
+       operator bool() const
+       {
+               return pointer != nullptr;
+       }
+
+       inline void
+       swap(smart_ptr &p)
+       {
+               T *temp = pointer;
+               pointer = p.pointer;
+               p.pointer = temp;
+       }
+
+/* swap pointers to the same type but with different policies */
+       template<class OtherPolicy>
+       void
+       swap(smart_ptr<T, OtherPolicy> &p)
+       {
+               if (p.pointer) {
+                       _retain(p.pointer);
+               }
+               if (pointer) {
+                       smart_ptr<T, OtherPolicy>::_retain(pointer);
+               }
+
+               T *temp = pointer;
+               pointer = p.pointer;
+               p.pointer = temp;
+
+               if (p.pointer) {
+                       _release(p.pointer);
+               }
+               if (pointer) {
+                       smart_ptr<T, OtherPolicy>::_release(pointer);
+               }
+       }
+
+       template<class U>
+       smart_ptr<U, Policy>
+       const_pointer_cast() const &
+       {
+               OSPTR_LOG("const_pointer_cast smart_ptr with %p\n", pointer);
+               return smart_ptr<U, Policy>(const_cast<U *>(pointer));
+       }
+
+       template <class U>
+       smart_ptr<U, Policy>
+       const_pointer_cast() &&
+       {
+               OSPTR_LOG("const_pointer_cast move smart_ptr with %p\n", pointer);
+               U *newPointer = const_cast<U *>(detach());
+               return smart_ptr<U, Policy>(os::move(newPointer), no_retain);
+       }
+
+       template <class U>
+       smart_ptr<U, Policy>
+       static_pointer_cast() const &
+       {
+               OSPTR_LOG("static_pointer_cast smart_ptr with %p\n", pointer);
+               return smart_ptr<U, Policy>(static_cast<U *>(pointer));
+       }
+
+       template <class U>
+       smart_ptr<U, Policy>
+       static_pointer_cast() &&
+       {
+               OSPTR_LOG("static_pointer_cast move smart_ptr with %p\n", pointer);
+               return smart_ptr<U, Policy>(static_cast<U *>(detach()), no_retain);
+       }
+
+       template <class U>
+       smart_ptr<U, Policy>
+       dynamic_pointer_cast() const &
+       {
+               OSPTR_LOG("dynamic_pointer_cast smart_ptr with %p\n", pointer);
+               return smart_ptr<U, Policy>(Policy::template dyn_cast<T, U>(pointer));
+       }
+
+       template <class U>
+       smart_ptr<U, Policy>
+       dynamic_pointer_cast() &&
+       {
+               OSPTR_LOG("dynamic_pointer_cast move smart_ptr with %p\n", pointer);
+               U *newPointer = Policy::template dyn_cast<T, U>(pointer);
+
+               if (newPointer != nullptr) {
+                       detach();
+               } else {
+                       reset();
+               }
+               return smart_ptr<U, Policy>(os::move(newPointer), no_retain);
+       }
+
+private:
+       static inline void
+       _retain(T *obj)
+       {
+               OSPTR_LOG("    %s with %p\n", __FUNCTION__, obj);
+               Policy::retain(obj);
+       }
+
+       static inline void
+       _release(T *obj)
+       {
+               OSPTR_LOG("    %s with %p\n", __FUNCTION__, obj);
+               Policy::release(obj);
+       }
+
+       static inline T *
+       _alloc()
+       {
+               OSPTR_LOG("    %s\n", __FUNCTION__);
+               return Policy::template alloc<T>();
+       }
+
+       T *pointer;
+};
+
+/*
+ * Comparison
+ */
+
+template<class T, class Policy>
+inline bool
+operator==(smart_ptr<T, Policy> const &a, smart_ptr<T, Policy> const &b)
+{
+       return a.get() == b.get();
+}
+
+template<class T, class Policy>
+inline bool
+operator!=(smart_ptr<T, Policy> const &a, smart_ptr<T, Policy> const &b)
+{
+       return a.get() != b.get();
+}
+
+template<class A, class A_policy, class B, class B_policy>
+inline bool
+operator==(smart_ptr<A, A_policy> const &a, smart_ptr<B, B_policy> const &b)
+{
+       return a.get() == b.get();
+}
+
+template<class A, class A_policy, class B, class B_policy>
+inline bool
+operator!=(smart_ptr<A, A_policy> const &a, smart_ptr<B, B_policy> const &b)
+{
+       return a.get() != b.get();
+}
+
+/*
+ * Comparison with nullptr
+ */
+
+#if OS_HAS_NULLPTR
+template<class T, class Policy>
+inline bool
+operator==(smart_ptr<T, Policy> const &p, os::nullptr_t)
+{
+       return p.get() == nullptr;
+}
+
+template<class T, class Policy>
+inline bool
+operator==(os::nullptr_t, smart_ptr<T, Policy> const &p)
+{
+       return p.get() == nullptr;
+}
+
+template<class T, class Policy>
+inline bool
+operator!=(smart_ptr<T, Policy> const &p, os::nullptr_t)
+{
+       return p.get() != nullptr;
+}
+
+template<class T, class Policy>
+inline bool
+operator!=(os::nullptr_t, smart_ptr<T, Policy> const &p)
+{
+       return p.get() != nullptr;
+}
+#endif
+
+/*
+ * Comparison with raw pointer
+ */
+
+template<class T, class Policy>
+inline bool
+operator==(smart_ptr<T, Policy> const &p, const os::remove_const_t<T> *other)
+{
+       return p.get() == other;
+}
+
+template<class T, class Policy>
+inline bool
+operator==(const os::remove_const_t<T> *other, smart_ptr<T, Policy> const &p)
+{
+       return other == p.get();
+}
+
+template<class T, class Policy>
+inline bool
+operator!=(smart_ptr<T, Policy> const &p, const os::remove_const_t<T> *other)
+{
+       return p.get() != other;
+}
+
+template<class T, class Policy>
+inline bool
+operator!=(const os::remove_const_t<T> *other, smart_ptr<T, Policy> const &p)
+{
+       return other != p.get();
+}
+};
+
+#pragma clang diagnostic pop
+#endif /* _OS_SMART_POINTER_H */
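
Everything smart_ptr does with object lifetime is delegated to the Policy template parameter; reading the private helpers above, a policy must supply static retain(T *) and release(T *), a templated alloc<T>(), and a templated dyn_cast<From, To>() for dynamic_pointer_cast. A stand-alone sketch of a conforming policy for a hypothetical intrusively counted type (Counted and ToyPolicy are illustrative, not libkern types), exercising the calls smart_ptr would make under the hood:

#include <cstdio>

/* Hypothetical intrusively refcounted type. */
struct Counted {
	int refs = 1;
};

/* The Policy contract inferred from smart_ptr above: static retain/release,
 * plus templated alloc<T>() and dyn_cast<From, To>() hooks. */
struct ToyPolicy {
	static void retain(Counted *p)  { ++p->refs; }
	static void release(Counted *p) { if (--p->refs == 0) { delete p; } }

	template <class T>
	static T *alloc() { return new T(); }

	template <class From, class To>
	static To *dyn_cast(From *p) { return static_cast<To *>(p); }
};

int
main(void)
{
	/* What smart_ptr<Counted, ToyPolicy>::alloc(), a copy, and a
	 * destruction would invoke: */
	Counted *c = ToyPolicy::alloc<Counted>(); /* refcount starts at 1 */
	ToyPolicy::retain(c);                     /* copy-constructor path */
	ToyPolicy::release(c);                    /* one smart_ptr destroyed */
	printf("refs now %d\n", c->refs);         /* prints 1 */
	ToyPolicy::release(c);                    /* last reference: freed */
	return 0;
}

Inside the tree one would then write os::smart_ptr<Counted, ToyPolicy> and let the constructors and destructor above issue these retain/release calls automatically; the no_retain tag and attach()/detach() exist to transfer ownership of already-counted raw pointers without an extra retain.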
index 8f33073a22c9e2697c78aa526bba291e6d17dfcc..f46485a1d81893311cd9ee1f3ec02eadb1c0c8bc 100644 (file)
@@ -164,7 +164,7 @@ public:
        ~KLDBootstrap(void);
 };
 
-static KLDBootstrap sBootstrapObject;
+LIBKERN_ALWAYS_DESTROY static KLDBootstrap sBootstrapObject;
 
 /*********************************************************************
 * Set the function pointers for the entry points into the bootstrap
@@ -190,8 +190,8 @@ KLDBootstrap::~KLDBootstrap(void)
        }
 
 
-       record_startup_extensions_function = 0;
-       load_security_extensions_function = 0;
+       record_startup_extensions_function = NULL;
+       load_security_extensions_function = NULL;
 }
 
 /*********************************************************************
@@ -725,7 +725,6 @@ KLDBootstrap::loadSecurityExtensions(void)
        OSCollectionIterator * keyIterator    = NULL;// must release
        OSString             * bundleID       = NULL;// don't release
        OSKext               * theKext        = NULL;// don't release
-       OSBoolean            * isSecurityKext = NULL;// don't release
 
        OSKextLog(/* kext */ NULL,
            kOSKextLogStepLevel |
@@ -761,9 +760,7 @@ KLDBootstrap::loadSecurityExtensions(void)
                        continue;
                }
 
-               isSecurityKext = OSDynamicCast(OSBoolean,
-                   theKext->getPropertyForHostArch(kAppleSecurityExtensionKey));
-               if (isSecurityKext && isSecurityKext->isTrue()) {
+               if (kOSBooleanTrue == theKext->getPropertyForHostArch(kAppleSecurityExtensionKey)) {
                        OSKextLog(/* kext */ NULL,
                            kOSKextLogStepLevel |
                            kOSKextLogLoadFlag,
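
The simplification in this hunk is sound because libkern booleans are a two-instance singleton: every true OSBoolean property is literally the kOSBooleanTrue object, so a pointer identity check subsumes both the old OSDynamicCast type test and isTrue(), and handles NULL or wrong-typed properties for free. A stand-alone illustration of the pattern (Bool, kTrue, and friends are toy stand-ins, not the libkern types):

#include <cstdio>

/* Two canonical instances: pointer identity doubles as a
 * type-and-value test. */
struct Bool { bool v; };
static Bool kTrueObj{true}, kFalseObj{false};
static Bool *const kTrue  = &kTrueObj;
static Bool *const kFalse = &kFalseObj;

struct Obj {};               /* some unrelated property value */
static Obj someOtherValue;

int
main(void)
{
	void *props[] = { kTrue, kFalse, &someOtherValue, nullptr };
	for (void *p : props) {
		/* No cast or null check needed: anything that is not the
		 * canonical true instance -- wrong type, false, or null --
		 * simply fails the identity comparison. */
		printf("%p -> %s\n", p, (p == kTrue) ? "security kext" : "skip");
	}
	return 0;
}

The identity trick would misfire for types with more than one canonical instance, which is why it is reserved for singletons like OSBoolean.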
index 7bd79d9ae78b20528784ccb5d5684d4a6bcb579f..05c4b79cf0cdde381cf245f419ccc25af4fec458 100644 (file)
@@ -23,7 +23,7 @@ endif
 
 $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS)
        $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG)
-       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG);
+       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG)
 
 do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
        $(_v)${MAKE} \
@@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
                SOURCE=$(subst conf/,,$(SOURCE))                        \
                TARGET=${TARGET}                                        \
                OBJPATH=${OBJPATH}                                      \
-               build_all;
+               build_all
 
 do_build_all:: do_all
 
index bc570dde56e775fa736b2e23ca6d8d0afe025211..6288236329bb39e322c1f7d098ea839c6a5d224d 100644 (file)
@@ -69,9 +69,9 @@ $(COMPONENT).filelist: $(OBJS)
                $(SEG_HACK) -n __KLD -o $${kld_file}__ $${kld_file} || exit 1; \
                mv $${kld_file}__ $${kld_file} || exit 1; \
        done
-       @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)"
+       $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0))
        $(_v)for obj in ${OBJS}; do     \
-                echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
+                $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
        done > $(COMPONENT).filelist
 
 
diff --git a/libsa/nonlto.c b/libsa/nonlto.c
new file mode 100644 (file)
index 0000000..68adb07
--- /dev/null
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * Used to produce a tiny non-LTO object file, forcing the linker to convert bitcode into a Mach-O object.
+ */
+
+int __attribute__((__unused__)) __not_used_at_all__;
index 7153bda3ee8340788f81a88b2f4b2fff837603d4..15531c0fe38462d428b2972f8fd1d237767fffb9 100644 (file)
@@ -5,27 +5,61 @@ SUPPORTED_PLATFORMS = macosx iphoneos iphoneosnano tvos appletvos watchos bridge
 ONLY_ACTIVE_ARCH = NO
 DEAD_CODE_STRIPPING = YES
 DEBUG_INFORMATION_FORMAT = dwarf-with-dsym
-INSTALL_PATH = /usr/lib/system
-PUBLIC_HEADERS_FOLDER_PATH = /usr/include
-PRIVATE_HEADERS_FOLDER_PATH = /usr/local/include
-OS_PRIVATE_HEADERS_FOLDER_PATH = /usr/local/include/os
+
+SDK_INSTALL_VARIANT = $(SDK_INSTALL_VARIANT_$(DRIVERKIT))
+SDK_INSTALL_VARIANT_1 = driverkit
+SDK_INSTALL_VARIANT_ = default
+SDK_INSTALL_ROOT = $(SDK_INSTALL_ROOT_$(SDK_INSTALL_VARIANT))
+SDK_INSTALL_ROOT_driverkit = $(DRIVERKITROOT)
+SDK_INSTALL_HEADERS_ROOT = $(SDK_INSTALL_HEADERS_ROOT_$(SDK_INSTALL_VARIANT))
+SDK_INSTALL_HEADERS_ROOT_driverkit = $(SDK_INSTALL_ROOT)/$(SDK_RUNTIME_HEADERS_PREFIX)
+SDK_RUNTIME_HEADERS_PREFIX = Runtime
+
+INSTALL_PATH = $(SDK_INSTALL_ROOT)/usr/lib/system
+PUBLIC_HEADERS_FOLDER_PATH = $(SDK_INSTALL_HEADERS_ROOT)/usr/include
+PRIVATE_HEADERS_FOLDER_PATH = $(SDK_INSTALL_HEADERS_ROOT)/usr/local/include
+OS_PRIVATE_HEADERS_FOLDER_PATH = $(SDK_INSTALL_HEADERS_ROOT)/usr/local/include/os
+OS_PUBLIC_HEADERS_FOLDER_PATH = $(SDK_INSTALL_HEADERS_ROOT)/usr/include/os
 EXECUTABLE_PREFIX = libsystem_
 PRODUCT_NAME = kernel
 ALWAYS_SEARCH_USER_PATHS = NO
 ORDER_FILE[sdk=iphoneos*] = $(SDKROOT)/$(APPLE_INTERNAL_DIR)/OrderFiles/libsystem_kernel.order
-OTHER_CFLAGS = -fdollars-in-identifiers -no-cpp-precomp -fno-common -fno-stack-protector -fno-stack-check -momit-leaf-frame-pointer -DLIBSYSCALL_INTERFACE -D__DARWIN_VERS_1050=1
-OTHER_CFLAGS[sdk=macosx*] = $(inherited) -DSYSCALL_PRE1050
-OTHER_CFLAGS[sdk=macosx*][arch=x86_64*] = $(inherited) -DNO_SYSCALL_LEGACY
-OTHER_CFLAGS[sdk=iphoneos*] = $(inherited) -DNO_SYSCALL_LEGACY
-OTHER_CFLAGS[sdk=watchos*] = $(inherited) -DNO_SYSCALL_LEGACY
-OTHER_CFLAGS[sdk=tvos*] = $(inherited) -DNO_SYSCALL_LEGACY
-OTHER_CFLAGS[sdk=appletvos*] = $(inherited) -DNO_SYSCALL_LEGACY
-OTHER_CFLAGS[sdk=bridgeos*] = $(inherited) -DNO_SYSCALL_LEGACY
+OTHER_CFLAGS = -fdollars-in-identifiers -no-cpp-precomp -fno-common -fno-stack-protector -fno-stack-check -momit-leaf-frame-pointer -DLIBSYSCALL_INTERFACE -D__DARWIN_VERS_1050=1 -DNO_SYSCALL_LEGACY
+OTHER_CFLAGS[sdk=macosx*][arch=i386] = $(inherited) -UNO_SYSCALL_LEGACY -DSYSCALL_PRE1050
+OTHER_CFLAGS[sdk=macosx*][arch=x86_64*] = $(inherited) -DSYSCALL_PRE1050
 GCC_PREPROCESSOR_DEFINITIONS = CF_OPEN_SOURCE CF_EXCLUDE_CSTD_HEADERS DEBUG _FORTIFY_SOURCE=0
-HEADER_SEARCH_PATHS = $(PROJECT_DIR)/mach $(PROJECT_DIR)/os $(PROJECT_DIR)/wrappers $(PROJECT_DIR)/wrappers/string $(PROJECT_DIR)/wrappers/libproc $(PROJECT_DIR)/wrappers/libproc/spawn $(BUILT_PRODUCTS_DIR)/internal_hdr/include $(BUILT_PRODUCTS_DIR)/mig_hdr/local/include $(BUILT_PRODUCTS_DIR)/mig_hdr/include $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
+
+HEADER_SEARCH_PATHS = $(PROJECT_DIR)/mach $(PROJECT_DIR)/os $(PROJECT_DIR)/wrappers $(PROJECT_DIR)/wrappers/string $(PROJECT_DIR)/wrappers/libproc $(PROJECT_DIR)/wrappers/libproc/spawn $(BUILT_PRODUCTS_DIR)/internal_hdr/include $(BUILT_PRODUCTS_DIR)/mig_hdr/local/include $(BUILT_PRODUCTS_DIR)/mig_hdr/include
+SYSTEM_HEADER_SEARCH_PATHS = $(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/System/Library/Frameworks/System.framework/PrivateHeaders $(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/usr/local/include $(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/usr/include
+SYSTEM_FRAMEWORK_SEARCH_PATHS = $(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/System/Library/Frameworks
+
+OTHER_MIGFLAGS = -novouchers -I$(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/System/Library/Frameworks/System.framework/PrivateHeaders -I$(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/usr/local/include -I$(SDKROOT)/$(SDK_INSTALL_HEADERS_ROOT)/usr/include
+
 WARNING_CFLAGS = -Wmost
 GCC_TREAT_WARNINGS_AS_ERRORS = YES
 GCC_WARN_ABOUT_MISSING_NEWLINE = YES
+GCC_NO_COMMON_BLOCKS = YES
+GCC_C_LANGUAGE_STANDARD = gnu11
+CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES
+CLANG_WARN_EMPTY_BODY = YES
+CLANG_WARN_BOOL_CONVERSION = YES
+CLANG_WARN_CONSTANT_CONVERSION = YES
+GCC_WARN_64_TO_32_BIT_CONVERSION = NO
+CLANG_WARN_ENUM_CONVERSION = YES
+CLANG_WARN_INT_CONVERSION = NO
+CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES
+CLANG_WARN_INFINITE_RECURSION = YES
+GCC_WARN_ABOUT_RETURN_TYPE = YES
+CLANG_WARN_STRICT_PROTOTYPES = YES
+CLANG_WARN_COMMA = YES
+GCC_WARN_UNINITIALIZED_AUTOS = YES
+CLANG_WARN_UNREACHABLE_CODE = YES
+GCC_WARN_UNUSED_FUNCTION = YES
+GCC_WARN_UNUSED_PARAMETER = YES
+GCC_WARN_UNUSED_VARIABLE = YES
+CLANG_WARN_RANGE_LOOP_ANALYSIS = YES
+CLANG_WARN_SUSPICIOUS_MOVE = YES
+
 CODE_SIGN_IDENTITY = -
 DYLIB_CURRENT_VERSION = $(RC_ProjectSourceVersion)
 DYLIB_LDFLAGS = -umbrella System -all_load
@@ -34,9 +68,12 @@ DYLIB_LDFLAGS[sdk=watchos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
 DYLIB_LDFLAGS[sdk=tvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
 DYLIB_LDFLAGS[sdk=appletvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
 DYLIB_LDFLAGS[sdk=bridgeos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
-OTHER_LDFLAGS =
+OTHER_LDFLAGS = $(SIMULATOR_LDFLAGS)
+SIMULATOR_LDFLAGS =
+SIMULATOR_LDFLAGS[sdk=macosx*] = -Wl,-simulator_support
 INSTALLHDRS_SCRIPT_PHASE = YES
 INSTALLHDRS_COPY_PHASE = YES
 USE_HEADERMAP = NO
 LINK_WITH_STANDARD_LIBRARIES = NO
 ALWAYS_SEARCH_USER_PATHS = YES
+IS_ZIPPERED = YES
index c5f769437330ba2d20a4319367794779a414fa9e..0fba2db30055df02ab468eeb5e724b919f799f05 100644 (file)
                        name = "MIG headers";
                        productName = "MIG headers";
                };
+               E46CB80621FBAC32005D1E53 /* Libsyscall_driverkit */ = {
+                       isa = PBXAggregateTarget;
+                       buildConfigurationList = E46CB80F21FBAC32005D1E53 /* Build configuration list for PBXAggregateTarget "Libsyscall_driverkit" */;
+                       buildPhases = (
+                       );
+                       dependencies = (
+                               E46CB80721FBAC32005D1E53 /* PBXTargetDependency */,
+                       );
+                       name = Libsyscall_driverkit;
+                       productName = Build;
+               };
 /* End PBXAggregateTarget section */
 
 /* Begin PBXBuildFile section */
                29A59AE2183B0DE000E8B896 /* renameat.c in Sources */ = {isa = PBXBuildFile; fileRef = 29A59AE1183B0DE000E8B896 /* renameat.c */; };
                29A59AE6183B110C00E8B896 /* unlinkat.c in Sources */ = {isa = PBXBuildFile; fileRef = 29A59AE5183B110C00E8B896 /* unlinkat.c */; };
                2BA88DCC1810A3CE00EB63F6 /* coalition.c in Sources */ = {isa = PBXBuildFile; fileRef = 2BA88DCB1810A3CE00EB63F6 /* coalition.c */; };
+               2C4853EC221C82160008D1F5 /* os_channel_event.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C4853EB221C82160008D1F5 /* os_channel_event.c */; settings = {COMPILER_FLAGS = "-fno-builtin"; }; };
                3695DCC91F3D2C5F0072C0B3 /* reboot.c in Sources */ = {isa = PBXBuildFile; fileRef = 3695DCC81F3D2C5A0072C0B3 /* reboot.c */; };
                374A36E314748F1300AAF39D /* varargs_wrappers.s in Sources */ = {isa = PBXBuildFile; fileRef = 374A36E214748EE400AAF39D /* varargs_wrappers.s */; };
                3F538F891A659C5600B37EFD /* persona.c in Sources */ = {isa = PBXBuildFile; fileRef = 3F538F881A659C5600B37EFD /* persona.c */; };
                4BCDD8B220741C2F00FA37A3 /* mach_right.c in Sources */ = {isa = PBXBuildFile; fileRef = 4BCDD8B120741C2F00FA37A3 /* mach_right.c */; };
                4BDD5F1D1891AB2F004BF300 /* mach_approximate_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */; };
                4BDD5F1E1891AB2F004BF300 /* mach_approximate_time.s in Sources */ = {isa = PBXBuildFile; fileRef = 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */; };
+               6E6EAA8121EA5AC6001C5D04 /* restartable.defs in Sources */ = {isa = PBXBuildFile; fileRef = 6E6EAA8021EA5AAE001C5D04 /* restartable.defs */; };
                726D915520ACD7FC0039A2FE /* mach_bridge_remote_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */; };
+               72950DF822418FAC00EFD5E0 /* proc.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = 72AAD8692241878C001511C3 /* proc.h */; };
                729B7D0A15C8938C000E2501 /* carbon_delete.c in Sources */ = {isa = PBXBuildFile; fileRef = FB50F1B315AB7DE700F814BA /* carbon_delete.c */; };
+               72AAD86A22418795001511C3 /* proc.h in Headers */ = {isa = PBXBuildFile; fileRef = 72AAD8692241878C001511C3 /* proc.h */; };
                72B1E6ED190723DB00FB3FA2 /* guarded_open_dprotected_np.c in Sources */ = {isa = PBXBuildFile; fileRef = 72B1E6EC190723DB00FB3FA2 /* guarded_open_dprotected_np.c */; };
+               72D6AFCF22421725004CD782 /* proc.h in Headers */ = {isa = PBXBuildFile; fileRef = 72AAD8692241878C001511C3 /* proc.h */; };
+               72D6AFD122421753004CD782 /* proc.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = 72AAD8692241878C001511C3 /* proc.h */; };
+               72DE4B6A224174D0007844CB /* proc.c in Sources */ = {isa = PBXBuildFile; fileRef = 72DE4B69224174D0007844CB /* proc.c */; };
                72E09E941B444B19006F11A4 /* mach_continuous_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 72FB18801B437F7A00181A5B /* mach_continuous_time.c */; };
                74119F46188F3B6A00C6F48F /* vm_page_size.h in Headers */ = {isa = PBXBuildFile; fileRef = 7466C923170CB99B004557CC /* vm_page_size.h */; };
                7466C924170CBA53004557CC /* vm_page_size.h in Headers */ = {isa = PBXBuildFile; fileRef = 7466C923170CB99B004557CC /* vm_page_size.h */; };
                929FD46F1C5711DB0087B9C8 /* mach_timebase_info.c in Sources */ = {isa = PBXBuildFile; fileRef = 929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */; };
                978228281B8678DC008385AC /* pselect-darwinext.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228271B8678CB008385AC /* pselect-darwinext.c */; };
                978228291B8678DF008385AC /* pselect-darwinext-cancel.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */; };
+               9C4B507422273E0F00F068C1 /* log_data.c in Sources */ = {isa = PBXBuildFile; fileRef = 9C4B507322273E0F00F068C1 /* log_data.c */; };
                9C6DA3D220A3D09F0090330B /* mach_sync_ipc.h in Headers */ = {isa = PBXBuildFile; fileRef = 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */; };
                9C6DA3D320A3D09F0090330B /* mach_sync_ipc.h in Headers */ = {isa = PBXBuildFile; fileRef = 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */; };
                9CCF28271E68E993002EE6CD /* pid_shutdown_networking.c in Sources */ = {isa = PBXBuildFile; fileRef = 9CCF28261E68E993002EE6CD /* pid_shutdown_networking.c */; };
                A50BD5301DDA5500006622C8 /* thread_self_restrict.h in Headers */ = {isa = PBXBuildFile; fileRef = A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */; };
                A59CB95616669EFB00B064B3 /* stack_logging_internal.h in Headers */ = {isa = PBXBuildFile; fileRef = A59CB95516669DB700B064B3 /* stack_logging_internal.h */; };
                A59CB9581666A1A200B064B3 /* munmap.c in Sources */ = {isa = PBXBuildFile; fileRef = A59CB9571666A1A200B064B3 /* munmap.c */; };
+               AE69785A22405C21001445CE /* memory_entry.defs in Sources */ = {isa = PBXBuildFile; fileRef = AE69785922405C21001445CE /* memory_entry.defs */; };
                BA0D9FB1199031AD007E8A73 /* kdebug_trace.c in Sources */ = {isa = PBXBuildFile; fileRef = BA0D9FB0199031AD007E8A73 /* kdebug_trace.c */; };
                BA4414AA18336A5F00AAE813 /* mach in CopyFiles */ = {isa = PBXBuildFile; fileRef = BA4414A51833697C00AAE813 /* mach */; };
                BA4414AB18336A6400AAE813 /* servers in CopyFiles */ = {isa = PBXBuildFile; fileRef = BA4414A6183369A100AAE813 /* servers */; };
                        remoteGlobalIDString = BA4414A1183366E600AAE813;
                        remoteInfo = "MIG headers";
                };
+               E46CB80821FBAC32005D1E53 /* PBXContainerItemProxy */ = {
+                       isa = PBXContainerItemProxy;
+                       containerPortal = 08FB7793FE84155DC02AAC07 /* Project object */;
+                       proxyType = 1;
+                       remoteGlobalIDString = 249C60FE1194747600ED73F3;
+                       remoteInfo = Libmach;
+               };
 /* End PBXContainerItemProxy section */
 
 /* Begin PBXCopyFilesBuildPhase section */
+               72950DF722418F7F00EFD5E0 /* CopyFiles */ = {
+                       isa = PBXCopyFilesBuildPhase;
+                       buildActionMask = 8;
+                       dstPath = "${OS_PUBLIC_HEADERS_FOLDER_PATH}";
+                       dstSubfolderSpec = 0;
+                       files = (
+                               72950DF822418FAC00EFD5E0 /* proc.h in CopyFiles */,
+                       );
+                       runOnlyForDeploymentPostprocessing = 1;
+               };
+               72D6AFD02242173F004CD782 /* CopyFiles */ = {
+                       isa = PBXCopyFilesBuildPhase;
+                       buildActionMask = 8;
+                       dstPath = "${OS_PUBLIC_HEADERS_FOLDER_PATH}";
+                       dstSubfolderSpec = 0;
+                       files = (
+                               72D6AFD122421753004CD782 /* proc.h in CopyFiles */,
+                       );
+                       runOnlyForDeploymentPostprocessing = 1;
+               };
                BA4414A818336A1300AAE813 /* CopyFiles */ = {
                        isa = PBXCopyFilesBuildPhase;
                        buildActionMask = 8;
                29A59AE1183B0DE000E8B896 /* renameat.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = renameat.c; sourceTree = "<group>"; };
                29A59AE5183B110C00E8B896 /* unlinkat.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = unlinkat.c; sourceTree = "<group>"; };
                2BA88DCB1810A3CE00EB63F6 /* coalition.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = coalition.c; sourceTree = "<group>"; };
+               2C4853EB221C82160008D1F5 /* os_channel_event.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = os_channel_event.c; path = skywalk/os_channel_event.c; sourceTree = "<group>"; };
                3695DCC81F3D2C5A0072C0B3 /* reboot.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = reboot.c; sourceTree = "<group>"; };
                374A36E214748EE400AAF39D /* varargs_wrappers.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = varargs_wrappers.s; sourceTree = "<group>"; };
                37DDFB7614748713009D3355 /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = "<group>"; };
                4BCDD8B120741C2F00FA37A3 /* mach_right.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = mach_right.c; sourceTree = "<group>"; };
                4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_approximate_time.c; sourceTree = "<group>"; };
                4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mach_approximate_time.s; sourceTree = "<group>"; };
-               726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = mach_bridge_remote_time.c; path = wrappers/mach_bridge_remote_time.c; sourceTree = "<group>"; };
+               6E6EAA8021EA5AAE001C5D04 /* restartable.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = restartable.defs; sourceTree = "<group>"; };
+               726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = mach_bridge_remote_time.c; sourceTree = "<group>"; };
+               72AAD8692241878C001511C3 /* proc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = proc.h; sourceTree = "<group>"; };
                72B1E6EC190723DB00FB3FA2 /* guarded_open_dprotected_np.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = guarded_open_dprotected_np.c; sourceTree = "<group>"; };
+               72DE4B69224174D0007844CB /* proc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = proc.c; sourceTree = "<group>"; };
                72FB18801B437F7A00181A5B /* mach_continuous_time.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_continuous_time.c; sourceTree = "<group>"; };
                7466C923170CB99B004557CC /* vm_page_size.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = vm_page_size.h; sourceTree = "<group>"; };
                7AE28FDE18AC41B1006A5626 /* csr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = csr.c; sourceTree = "<group>"; };
                929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_timebase_info.c; sourceTree = "<group>"; };
                978228261B8678C2008385AC /* pselect-darwinext-cancel.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext-cancel.c"; sourceTree = "<group>"; };
                978228271B8678CB008385AC /* pselect-darwinext.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext.c"; sourceTree = "<group>"; };
+               9C4B507322273E0F00F068C1 /* log_data.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = log_data.c; sourceTree = "<group>"; };
                9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mach_sync_ipc.h; sourceTree = "<group>"; };
                9CCF28261E68E993002EE6CD /* pid_shutdown_networking.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = pid_shutdown_networking.c; sourceTree = "<group>"; };
                A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = thread_self_restrict.h; sourceTree = "<group>"; };
                A59CB95516669DB700B064B3 /* stack_logging_internal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = stack_logging_internal.h; sourceTree = "<group>"; };
                A59CB9571666A1A200B064B3 /* munmap.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = munmap.c; sourceTree = "<group>"; };
+               AE69785922405C21001445CE /* memory_entry.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = memory_entry.defs; sourceTree = "<group>"; };
                BA0D9FB0199031AD007E8A73 /* kdebug_trace.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kdebug_trace.c; sourceTree = "<group>"; };
                BA4414A51833697C00AAE813 /* mach */ = {isa = PBXFileReference; lastKnownFileType = text; name = mach; path = mig_hdr/include/mach; sourceTree = BUILT_PRODUCTS_DIR; };
                BA4414A6183369A100AAE813 /* servers */ = {isa = PBXFileReference; lastKnownFileType = text; name = servers; path = mig_hdr/include/servers; sourceTree = BUILT_PRODUCTS_DIR; };
                08FB7794FE84155DC02AAC07 /* mach */ = {
                        isa = PBXGroup;
                        children = (
-                               726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */,
                                C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */,
                                24D1158911E672270063D54D /* Platforms */,
                                24D1156511E671B20063D54D /* custom */,
                08FB7795FE84155DC02AAC07 /* mach */ = {
                        isa = PBXGroup;
                        children = (
-                               C9001751206B00850070D674 /* port_descriptions.c */,
                                247A08FF11F8E18000E4693F /* abort.h */,
                                C9D9BCC5114B00600000D8B9 /* clock_priv.defs */,
                                C9D9BCC6114B00600000D8B9 /* clock_reply.defs */,
                                C9D9BCCF114B00600000D8B9 /* err_us.sub */,
                                C9D9BCD0114B00600000D8B9 /* error_codes.c */,
                                C9D9BCD1114B00600000D8B9 /* errorlib.h */,
-                               247A091611F8E7A800E4693F /* exc_catcher.h */,
                                C9D9BCD2114B00600000D8B9 /* exc_catcher_state_identity.c */,
                                C9D9BCD3114B00600000D8B9 /* exc_catcher_state.c */,
                                C9D9BCD4114B00600000D8B9 /* exc_catcher.c */,
+                               247A091611F8E7A800E4693F /* exc_catcher.h */,
                                C9D9BCD5114B00600000D8B9 /* exc.defs */,
                                C9D9BCD6114B00600000D8B9 /* externs.h */,
                                C9D9BCD7114B00600000D8B9 /* fprintf_stderr.c */,
-                               C9D9BCD8114B00600000D8B9 /* mach */,
                                C9D9BCE4114B00600000D8B9 /* host_priv.defs */,
-                               BABA36CA1A856C4700BBBCF7 /* host.c */,
                                C9D9BCE5114B00600000D8B9 /* host_security.defs */,
+                               BABA36CA1A856C4700BBBCF7 /* host.c */,
                                C9D9BCEA114B00600000D8B9 /* lock_set.defs */,
+                               C9D9BCD8114B00600000D8B9 /* mach */,
                                C9D9BCEB114B00600000D8B9 /* mach_error_string.c */,
                                C9D9BCEC114B00600000D8B9 /* mach_error.c */,
                                C9D9BCED114B00600000D8B9 /* mach_host.defs */,
                                4BCDD8B120741C2F00FA37A3 /* mach_right.c */,
                                C9D9BCF3114B00600000D8B9 /* mach_traps.s */,
                                291D3C271354FDD100D46061 /* mach_vm.c */,
-                               E4216C301822D404006F2632 /* mach_voucher.defs */,
                                C9D9BCF4114B00600000D8B9 /* mach_vm.defs */,
+                               AE69785922405C21001445CE /* memory_entry.defs */,
+                               E4216C301822D404006F2632 /* mach_voucher.defs */,
                                C9D9BCF6114B00600000D8B9 /* mig_allocate.c */,
                                C9D9BCF7114B00600000D8B9 /* mig_deallocate.c */,
+                               24484A9311F61D1900E10CD2 /* mig_reply_port.c */,
                                C9D9BCF8114B00600000D8B9 /* mig_reply_setup.c */,
                                C9D9BCF9114B00600000D8B9 /* mig_strncpy.c */,
-                               24484A9311F61D1900E10CD2 /* mig_reply_port.c */,
                                C9D9BCFA114B00600000D8B9 /* ms_thread_switch.c */,
                                C9D9BCFB114B00600000D8B9 /* notify.defs */,
                                C9D9BCFC114B00600000D8B9 /* panic.c */,
+                               C9001751206B00850070D674 /* port_descriptions.c */,
                                C9D9BCFD114B00600000D8B9 /* port_obj.c */,
                                C9D9BD03114B00600000D8B9 /* processor_set.defs */,
                                C9D9BD04114B00600000D8B9 /* processor.defs */,
+                               6E6EAA8021EA5AAE001C5D04 /* restartable.defs */,
                                C9D9BD06114B00600000D8B9 /* semaphore.c */,
                                C9D9BD07114B00600000D8B9 /* servers */,
                                C9D9BD0E114B00600000D8B9 /* slot_name.c */,
                                A59CB95516669DB700B064B3 /* stack_logging_internal.h */,
-                               24484A7311F51E9800E10CD2 /* string.h */,
                                24484A7411F51E9800E10CD2 /* string.c */,
+                               24484A7311F51E9800E10CD2 /* string.h */,
                                C9D9BD0F114B00600000D8B9 /* task.defs */,
                                C962B16D18DBB43F0031244A /* thread_act.c */,
                                C9D9BD10114B00600000D8B9 /* thread_act.defs */,
                                E4D45C2316F856900002AF25 /* mach_absolute_time.s */,
                                4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */,
                                4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */,
+                               726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */,
                                925559911CBBBBB300E527CE /* mach_boottime.c */,
                                72FB18801B437F7A00181A5B /* mach_continuous_time.c */,
                                14FE60EB1B7D3BED00ACB44C /* mach_get_times.c */,
                                3F538F881A659C5600B37EFD /* persona.c */,
                                9CCF28261E68E993002EE6CD /* pid_shutdown_networking.c */,
                                C6BEE9171806840200D25AAB /* posix_sem_obsolete.c */,
+                               72DE4B69224174D0007844CB /* proc.c */,
                                BA9973461C3B4C8A00B14D8C /* quota_obsolete.c */,
                                24B8C2611237F53900D36CC3 /* remove-counter.c */,
                                248AA966122C7CDA0085F5B1 /* rename.c */,
                401BB7141BCAE523005080D3 /* skywalk */ = {
                        isa = PBXGroup;
                        children = (
+                               2C4853EB221C82160008D1F5 /* os_channel_event.c */,
                                405FA3381E0C669D007D66EA /* os_packet.c */,
                                40DD162F1E4ACCAA003297CC /* cpu_copy_in_cksum.s */,
                                409A78301E4EB3D900E0699B /* cpu_in_cksum.s */,
                        isa = PBXGroup;
                        children = (
                                C9C1824F15338C0B00933F23 /* alloc_once.c */,
-                               C9EE57F51669673D00337E4B /* tsd.h */,
+                               9C4B507322273E0F00F068C1 /* log_data.c */,
+                               72AAD8692241878C001511C3 /* proc.h */,
                                A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */,
+                               C9EE57F51669673D00337E4B /* tsd.h */,
                        );
                        path = os;
                        sourceTree = "<group>";
                                A50BD52F1DDA548F006622C8 /* thread_self_restrict.h in Headers */,
                                C6D3EFC116542C510052CF30 /* vm_task.h in Headers */,
                                C6D3EFC216542C510052CF30 /* key_defs.h in Headers */,
+                               72D6AFCF22421725004CD782 /* proc.h in Headers */,
                                C6D3EFC316542C510052CF30 /* ls_defs.h in Headers */,
                                C6D3EFC416542C510052CF30 /* netname_defs.h in Headers */,
                                C6D3EFC516542C510052CF30 /* nm_defs.h in Headers */,
                                A59CB95616669EFB00B064B3 /* stack_logging_internal.h in Headers */,
                                E4D45C3F16FB20D30002AF25 /* spawn.h in Headers */,
                                E4512B8C21AFDF1600673F73 /* mach_right_private.h in Headers */,
+                               72AAD86A22418795001511C3 /* proc.h in Headers */,
                                9C6DA3D220A3D09F0090330B /* mach_sync_ipc.h in Headers */,
                                E4D45C4016FB20DC0002AF25 /* spawn_private.h in Headers */,
                                E4D45C2F16F868ED0002AF25 /* libproc.h in Headers */,
                                C6D3EFCA16542C510052CF30 /* CopyFiles */,
                                BA4414B418336E1A00AAE813 /* Copy Files */,
                                BA4414B718336E5600AAE813 /* CopyFiles */,
+                               72D6AFD02242173F004CD782 /* CopyFiles */,
                                C6D3EFCD16542C510052CF30 /* Sources */,
                        );
                        buildRules = (
                                C63F480B1654203800A1F78F /* CopyFiles */,
                                BA4414A818336A1300AAE813 /* CopyFiles */,
                                BA4414AC18336A7700AAE813 /* CopyFiles */,
+                               72950DF722418F7F00EFD5E0 /* CopyFiles */,
                                D2AAC0610554660B00DB518D /* Sources */,
                                D289988505E68E00004EDB86 /* Frameworks */,
                        );
                                249C61101194755D00ED73F3 /* Build */,
                                24614EF311E7C98600E78584 /* Syscalls */,
                                BA4414A1183366E600AAE813 /* MIG headers */,
+                               E46CB80621FBAC32005D1E53 /* Libsyscall_driverkit */,
                                D2AAC0620554660B00DB518D /* Libsyscall_static */,
                                249C60FE1194747600ED73F3 /* Libsyscall_dynamic */,
                                C6D3EFB216542C510052CF30 /* Libsyscall_headers_Sim */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                        shellPath = /bin/sh;
-                       shellScript = "set -x\n[[ $ACTION == \"installhdrs\" ]] && exit 0\n\nmkdir -p $OBJROOT/sys\n\n$SRCROOT/xcodescripts/create-syscalls.pl \\\n\t$SRCROOT/../bsd/kern/syscalls.master \\\n\t$SRCROOT/custom \\\n\t$SRCROOT/Platforms \\\n\t$MAP_PLATFORM \\\n\t$OBJROOT/sys\n";
+                       shellScript = "set -x\n[[ $ACTION == \"installhdrs\" ]] && exit 0\n# workaround 48125283\n[ -n \"$DRIVERKIT_DEPLOYMENT_TARGET\" ] && unset MACOSX_DEPLOYMENT_TARGET\n\nmkdir -p $OBJROOT/sys\n\n$SRCROOT/xcodescripts/create-syscalls.pl \\\n\t$SRCROOT/../bsd/kern/syscalls.master \\\n\t$SRCROOT/custom \\\n\t$SRCROOT/Platforms \\\n\t$MAP_PLATFORM \\\n\t$OBJROOT/sys\n";
                };
                24614EF611E7C9A000E78584 /* Compile Syscalls */ = {
                        isa = PBXShellScriptBuildPhase;
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                        shellPath = /bin/sh;
-                       shellScript = "set -x\n[[ $ACTION == \"installhdrs\" ]] && exit 0\n\nmkdir -p $OBJROOT/UninstalledProducts\n\n$SRCROOT/xcodescripts/compile-syscalls.pl \\\n\t$OBJROOT/sys/stubs.list \\\n\t$BUILD_ROOT/syscalls.a";
+                       shellScript = "set -x\n[[ $ACTION == \"installhdrs\" ]] && exit 0\n# workaround 48125283\n[ -n \"$DRIVERKIT_DEPLOYMENT_TARGET\" ] && unset MACOSX_DEPLOYMENT_TARGET\n\nmkdir -p $OBJROOT/UninstalledProducts\n\n$SRCROOT/xcodescripts/compile-syscalls.pl \\\n\t$OBJROOT/sys/stubs.list \\\n\t$BUILD_ROOT/syscalls.a\n";
                };
                BA4414A41833672200AAE813 /* Generate MIG Headers */ = {
                        isa = PBXShellScriptBuildPhase;
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                        shellPath = /bin/sh;
-                       shellScript = "\"$PROJECT_DIR\"/xcodescripts/mach_install_mig.sh";
+                       shellScript = "# workaround 48125283\n[ -n \"$DRIVERKIT_DEPLOYMENT_TARGET\" ] && unset MACOSX_DEPLOYMENT_TARGET\n\n\"$PROJECT_DIR\"/xcodescripts/mach_install_mig.sh\n";
                };
 /* End PBXShellScriptBuildPhase section */
 
                                E4D45C2416F856900002AF25 /* __commpage_gettimeofday.c in Sources */,
                                C9D9BD43114B00600000D8B9 /* mig_reply_setup.c in Sources */,
                                24484A9411F61D2B00E10CD2 /* mig_reply_port.c in Sources */,
+                               6E6EAA8121EA5AC6001C5D04 /* restartable.defs in Sources */,
                                C9D9BD44114B00600000D8B9 /* mig_strncpy.c in Sources */,
                                C9D9BD45114B00600000D8B9 /* ms_thread_switch.c in Sources */,
                                C9D9BD47114B00600000D8B9 /* panic.c in Sources */,
                                2485235511582D8F0051B413 /* mach_legacy.c in Sources */,
                                242AB66611EBDC1200107336 /* errno.c in Sources */,
                                4BCDD8B220741C2F00FA37A3 /* mach_right.c in Sources */,
+                               72DE4B6A224174D0007844CB /* proc.c in Sources */,
                                E4D45C2E16F868ED0002AF25 /* libproc.c in Sources */,
                                24A7C5BC11FF8DA6007669EB /* accept.c in Sources */,
                                24A7C5BD11FF8DA6007669EB /* bind.c in Sources */,
                                C962B16E18DBB43F0031244A /* thread_act.c in Sources */,
                                24A7C5C511FF8DA6007669EB /* recvmsg.c in Sources */,
                                24A7C5C611FF8DA6007669EB /* sendmsg.c in Sources */,
+                               AE69785A22405C21001445CE /* memory_entry.defs in Sources */,
                                24A7C5C711FF8DA6007669EB /* sendto.c in Sources */,
                                24A7C5C811FF8DA6007669EB /* setattrlist.c in Sources */,
                                24A7C5C911FF8DA6007669EB /* socketpair.c in Sources */,
                                9002401118FC9A7F00D73BFA /* renamex.c in Sources */,
                                2419382B12135FF6003CDE41 /* chmod.c in Sources */,
                                248BA01D121C56BF008C073F /* connect.c in Sources */,
+                               9C4B507422273E0F00F068C1 /* log_data.c in Sources */,
                                248BA01F121C607E008C073F /* fchmod.c in Sources */,
                                E4D45C3616F86BD80002AF25 /* posix_spawn.c in Sources */,
                                13B598941A142F6400DB2D5A /* stackshot.c in Sources */,
                                402AF43F1E5CD88600F1A4B9 /* cpu_in_cksum_gen.c in Sources */,
                                030B179B135377B400DAD1F0 /* open_dprotected_np.c in Sources */,
                                E4D45C3116F868ED0002AF25 /* proc_listpidspath.c in Sources */,
+                               2C4853EC221C82160008D1F5 /* os_channel_event.c in Sources */,
                                374A36E314748F1300AAF39D /* varargs_wrappers.s in Sources */,
                                291D3C281354FDD100D46061 /* mach_port.c in Sources */,
                                291D3C291354FDD100D46061 /* mach_vm.c in Sources */,
                        target = BA4414A1183366E600AAE813 /* MIG headers */;
                        targetProxy = BA4414B218336D8D00AAE813 /* PBXContainerItemProxy */;
                };
+               E46CB80721FBAC32005D1E53 /* PBXTargetDependency */ = {
+                       isa = PBXTargetDependency;
+                       target = 249C60FE1194747600ED73F3 /* Libsyscall_dynamic */;
+                       targetProxy = E46CB80821FBAC32005D1E53 /* PBXContainerItemProxy */;
+               };
 /* End PBXTargetDependency section */
 
 /* Begin XCBuildConfiguration section */
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
-                               CLANG_ENABLE_OBJC_WEAK = YES;
                                COPY_PHASE_STRIP = NO;
                                INSTALL_PATH = /usr/local/lib/dyld;
+                               SKIP_INSTALL = "$(SKIP_INSTALL_$(SDK_INSTALL_VARIANT))";
+                               SKIP_INSTALL_driverkit = YES;
                                STRIP_INSTALLED_PRODUCT = NO;
                        };
                        name = Release;
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
-                               CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES;
-                               CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-                               CLANG_WARN_BOOL_CONVERSION = YES;
-                               CLANG_WARN_COMMA = YES;
-                               CLANG_WARN_CONSTANT_CONVERSION = YES;
-                               CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-                               CLANG_WARN_EMPTY_BODY = YES;
-                               CLANG_WARN_ENUM_CONVERSION = YES;
-                               CLANG_WARN_INFINITE_RECURSION = YES;
-                               CLANG_WARN_INT_CONVERSION = NO;
-                               CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-                               CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-                               CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-                               CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-                               CLANG_WARN_STRICT_PROTOTYPES = YES;
-                               CLANG_WARN_SUSPICIOUS_MOVE = YES;
-                               CLANG_WARN_UNREACHABLE_CODE = YES;
-                               CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-                               ENABLE_STRICT_OBJC_MSGSEND = YES;
-                               GCC_C_LANGUAGE_STANDARD = gnu99;
-                               GCC_NO_COMMON_BLOCKS = YES;
-                               GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
-                               GCC_WARN_ABOUT_RETURN_TYPE = YES;
-                               GCC_WARN_UNDECLARED_SELECTOR = YES;
-                               GCC_WARN_UNINITIALIZED_AUTOS = YES;
-                               GCC_WARN_UNUSED_FUNCTION = YES;
-                               GCC_WARN_UNUSED_PARAMETER = YES;
-                               GCC_WARN_UNUSED_VARIABLE = YES;
-                               OTHER_MIGFLAGS = "-novouchers";
                        };
                        name = Release;
                };
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
-                               CLANG_ENABLE_OBJC_WEAK = YES;
                                COPY_PHASE_STRIP = YES;
-                               DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
                                MAP_PLATFORM = "$(MAP_PLATFORM_$(PLATFORM_NAME))";
                                MAP_PLATFORM_appletvos = iPhoneOS;
                                MAP_PLATFORM_bridgeos = iPhoneOS;
                                MAP_PLATFORM_iphoneos = iPhoneOS;
                                MAP_PLATFORM_iphoneosnano = iPhoneOS;
-                               MAP_PLATFORM_macosx = MacOSX;
+                               MAP_PLATFORM_macosx = "$(MAP_PLATFORM_macosx_$(SDK_INSTALL_VARIANT))";
+                               MAP_PLATFORM_macosx_default = MacOSX;
+                               MAP_PLATFORM_macosx_driverkit = DriverKit;
                                MAP_PLATFORM_tvos = iPhoneOS;
                                MAP_PLATFORM_watchos = iPhoneOS;
                                PRODUCT_NAME = Syscalls;
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
-                               CLANG_ENABLE_OBJC_WEAK = YES;
-                               OTHER_LDFLAGS = "$(DYLIB_LDFLAGS)";
+                               OTHER_LDFLAGS = (
+                                       "$(SIMULATOR_LDFLAGS)",
+                                       "$(DYLIB_LDFLAGS)",
+                               );
                                VERSION_INFO_PREFIX = "___";
                        };
                        name = Release;
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
-                               CLANG_ENABLE_OBJC_WEAK = YES;
                                COPY_PHASE_STRIP = NO;
                                INSTALLHDRS_COPY_PHASE = NO;
                                PRODUCT_NAME = Build;
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
-                               CLANG_ENABLE_OBJC_WEAK = YES;
                                PRODUCT_NAME = "$(TARGET_NAME)";
                        };
                        name = Release;
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
-                               CLANG_ENABLE_OBJC_WEAK = YES;
                                COPY_PHASE_STRIP = NO;
                                PRODUCT_NAME = Libsyscall_headers_Sim;
                                SKIP_INSTALL = YES;
                        };
                        name = Release;
                };
+               E46CB81021FBAC32005D1E53 /* Release */ = {
+                       isa = XCBuildConfiguration;
+                       baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
+                       buildSettings = {
+                               COPY_PHASE_STRIP = NO;
+                               PRODUCT_NAME = "$(TARGET_NAME)";
+                               STRIP_STYLE = debugging;
+                       };
+                       name = Release;
+               };
 /* End XCBuildConfiguration section */
 
 /* Begin XCConfigurationList section */
                        defaultConfigurationIsVisible = 0;
                        defaultConfigurationName = Release;
                };
+               E46CB80F21FBAC32005D1E53 /* Build configuration list for PBXAggregateTarget "Libsyscall_driverkit" */ = {
+                       isa = XCConfigurationList;
+                       buildConfigurations = (
+                               E46CB81021FBAC32005D1E53 /* Release */,
+                       );
+                       defaultConfigurationIsVisible = 0;
+                       defaultConfigurationName = Release;
+               };
 /* End XCConfigurationList section */
        };
        rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
diff --git a/libsyscall/Platforms/DriverKit/x86_64/syscall.map b/libsyscall/Platforms/DriverKit/x86_64/syscall.map
new file mode 100644 (file)
index 0000000..9aa2064
--- /dev/null
@@ -0,0 +1,61 @@
+_accept$NOCANCEL       ___accept_nocancel
+_aio_suspend$NOCANCEL  ___aio_suspend_nocancel
+_close$NOCANCEL        ___close_nocancel
+_connect$NOCANCEL      ___connect_nocancel
+_fstat ___fstat64
+_fstatat       ___fstatat64
+_fstatfs       ___fstatfs64
+_fsync$NOCANCEL        ___fsync_nocancel
+_getfsstat     ___getfsstat64
+_lstat ___lstat64
+_msgrcv$NOCANCEL       ___msgrcv_nocancel
+_msgsnd$NOCANCEL       ___msgsnd_nocancel
+_msgsys                ___msgsys
+_msync$NOCANCEL        ___msync_nocancel
+_open$NOCANCEL ___open_nocancel
+_openat$NOCANCEL       ___openat_nocancel
+_poll$NOCANCEL ___poll_nocancel
+_pread$NOCANCEL        ___pread_nocancel
+_pwrite$NOCANCEL       ___pwrite_nocancel
+_read$NOCANCEL ___read_nocancel
+_readv$NOCANCEL        ___readv_nocancel
+_recvfrom$NOCANCEL     ___recvfrom_nocancel
+_recvmsg$NOCANCEL      ___recvmsg_nocancel
+_select$DARWIN_EXTSN   ___select
+_select$DARWIN_EXTSN$NOCANCEL  ___select_nocancel
+_sem_wait$NOCANCEL     ___sem_wait_nocancel
+_semsys                ___semsys
+_sendmsg$NOCANCEL      ___sendmsg_nocancel
+_sendto$NOCANCEL       ___sendto_nocancel
+_stat  ___stat64
+_statfs        ___statfs64
+_waitid$NOCANCEL       ___waitid_nocancel
+_write$NOCANCEL        ___write_nocancel
+_writev$NOCANCEL       ___writev_nocancel
+
+_accept        ___accept
+_bind  ___bind
+_connect       ___connect
+_getattrlist   ___getattrlist
+_getpeername   ___getpeername
+_getsockname   ___getsockname
+_lchown        ___lchown
+_listen        ___listen
+_mprotect      ___mprotect
+_msgctl        ___msgctl
+_msync ___msync
+_open  ___open
+_openat        ___openat
+_recvfrom      ___recvfrom
+_recvmsg       ___recvmsg
+_semctl        ___semctl
+_sem_open ___sem_open
+_sendmsg       ___sendmsg
+_sendto        ___sendto
+_setattrlist   ___setattrlist
+_setregid      ___setregid
+_setreuid      ___setreuid
+_shmctl                ___shmctl
+_shmsys        ___shmsys
+_shm_open      ___shm_open
+_socketpair      ___socketpair
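
The map above seeds the DriverKit x86_64 variant of Libsyscall: each line pairs an exported symbol (including its $NOCANCEL and $DARWIN_EXTSN variants) with the stub generated from syscalls.master by create-syscalls.pl, so for example stat() binds to the 64-bit-inode stub ___stat64. A small sketch of the caller-visible effect (plain C; nothing DriverKit-specific appears in the source):

    #include <stdio.h>
    #include <sys/stat.h>

    /* Sketch: this compiles against the symbol _stat, which the map above
     * redirects to the 64-bit-inode stub ___stat64, so struct stat carries
     * 64-bit inode numbers and file sizes. */
    int
    main(void)
    {
            struct stat sb;
            if (stat("/etc/hosts", &sb) == 0) {
                    printf("ino=%llu size=%lld\n",
                        (unsigned long long)sb.st_ino, (long long)sb.st_size);
            }
            return 0;
    }
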
index a62cc1f996821bf905b1ff4a081059e2b37613e5..ffc6a8f2e725d039dbd0ca96372aa6a4c2b59c7b 100644 (file)
@@ -450,6 +450,7 @@ pseudo:                                                                     ;\
    PUSH_FRAME                  %%\
    bl    _##cerror             %%\
    POP_FRAME                   %%\
+   ret                                 %%\
 2:                     
 
 #define MI_GET_ADDRESS(reg,var)  \
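
The added `ret` fixes the error path of this syscall-stub macro: after `bl _##cerror` returns (cerror records the error and arranges the -1 return value), the stub previously fell through into the success code at label `2:`. In C terms the contract is roughly the following (a sketch; `__example_syscall` and the wrapper are hypothetical names):

    #include <errno.h>

    /* Hypothetical raw stub: returns an error number via err_out on failure. */
    extern long __example_syscall(long arg, int *err_out);

    long
    example_wrapper(long arg)
    {
            int err = 0;
            long ret = __example_syscall(arg, &err);
            if (err != 0) {
                    errno = err;    /* what cerror does */
                    return -1;      /* the newly added "ret": stop here rather
                                     * than falling through to the success path */
            }
            return ret;             /* label 2: the success path */
    }
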
index d80b8d6c08c621d6ceb7c447a750ca812c065b3c..f9f4212b8f1654a8983fb9dece60434afd6b9108 100644 (file)
@@ -34,7 +34,7 @@ extern int __getpid(void);
 extern int __kill(int pid, int signum, int posix);
 extern int __exit(int) __attribute__((noreturn));
 
-static inline void __attribute__((noreturn))
+static inline void __attribute__((noreturn, cold))
 abort(void)
 {
        (void)__kill(__getpid(), __SIGABRT, 0);
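
The `cold` attribute added alongside `noreturn` tells the compiler this path is rarely executed, so the call can be laid out off the hot path and branch weights biased against it. A generic illustration on a hypothetical error helper:

    #include <stdlib.h>

    /* Hypothetical helper: 'noreturn' lets callers elide dead code after the
     * call; 'cold' moves the call site out of the hot layout. */
    static void __attribute__((noreturn, cold))
    fatal(int code)
    {
            exit(code);
    }

    int
    checked_divide(int a, int b)
    {
            if (b == 0) {
                    fatal(1);       /* treated as the unlikely branch */
            }
            return a / b;
    }
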
index b5361b8be291118b2ccb57f8c4bbbc505abd0afb..7a105843bb3c2d09c3e55527c7dea11b1232ca9c 100644 (file)
@@ -29,7 +29,7 @@
 
 #include <TargetConditionals.h>
 #include <IOKit/IOReturn.h>
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 #include <IOKit/usb/USB.h>
 #include <IOKit/firewire/IOFireWireLib.h>
 #endif
@@ -97,7 +97,7 @@ static const char * const err_codes_iokit_common[] = {
     "(iokit/common) data was not found",                               // 0x2f0
 };
 
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 static const struct error_sparse_map err_codes_iokit_usb_map[] = {
     err_code_map_entry(kIOUSBCRCErr, kIOUSBDataToggleErr),
     err_code_map_entry(kIOUSBPIDCheckErr, kIOUSBWrongPIDErr),
@@ -203,7 +203,7 @@ static const char * const err_codes_iokit_bluetooth[] = {
     "(iokit/bluetooth) no HCI controller",                             // 003
     "(iokit/bluetooth) changing power states is unsupported",          // 004
 };
-#endif /* !TARGET_OS_EMBEDDED */
+#endif /* !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 static const struct error_sparse_map err_iokit_sub_map[] = {
     err_sub_map_entry(sub_iokit_common,          sub_iokit_pmu),
@@ -220,7 +220,7 @@ static const struct error_subsystem err_iokit_sub[] =
        err_codes_iokit_common_map,
        errlib_count(err_codes_iokit_common_map),
     },
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
     /*  1 */ {
        "(iokit/usb)",                          // 0xe0004000
        errlib_count(err_codes_iokit_usb),
@@ -235,20 +235,20 @@ static const struct error_subsystem err_iokit_sub[] =
        err_codes_iokit_fw_map,
        errlib_count(err_codes_iokit_fw_map),
     },
-#endif /* !TARGET_OS_EMBEDDED */
+#endif /* !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
     /*  3 */ err_iokit_null_sub,               // 0xe000c000
     /*  4 */ { "(iokit/blkstorage)", 0 },      // 0xe0010000
     /*  5 */ { "(iokit/graphics)",   0 },      // 0xe0014000
     /*  6 */ err_iokit_null_sub,               // 0xe0018000
     /*  7 */ err_iokit_null_sub,               // 0xe001c000
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
     /*  8 */ {
        "(iokit/bluetooth)",                    // 0xe0020000
        errlib_count(err_codes_iokit_bluetooth),
        err_codes_iokit_bluetooth,
        NULL, 0,
     },
-#endif /* !TARGET_OS_EMBEDDED */
+#endif /* !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
     /*  9 */ { "(iokit/pmu)",        0 },      // 0xe0024000
     /* -2 */ { "(iokit/vendor)",     0 },      // 0xe0028000
     /* -1 */ { "(iokit/reserved)",   0 },      // 0xe002c000
index b0e82564c281a5c56fb0931378fd0751b7b65a7f..97aa377c71c7706b64cf838123c9a9a8ab942ea0 100644 (file)
@@ -55,10 +55,11 @@ static const char * const err_codes_mach_send[] = {
        /* 15 */        "(ipc/send) invalid msg-type",
        /* 16 */        "(ipc/send) invalid msg-header",
        /* 17 */ "(ipc/send) invalid msg-trailer",
-       /* 18 */ "(ipc/send) unused error",
+       /* 18 */ "(ipc/send) invalid context for reply",
        /* 19 */ "(ipc/send) unused error",
        /* 20 */ "(ipc/send) unused error",
        /* 21 */ "(ipc/send) out-of-line buffer too large",
+       /* 22 */ "(ipc/send) destination does not accept OOL ports",
 };
 
 static const char * const err_codes_mach_rcv[] = {
index 4ec633dfa1f9a47f4cd35aa40babbc06b0cb15f1..4aae84fc27ce6094c071e568aa333efd5a186cec 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  *      Generic error code interface
  */
 
+#include <TargetConditionals.h>
 #include <mach/error.h>
 #include "errorlib.h"
+#if !TARGET_OS_DRIVERKIT
 #include "err_libkern.sub"
 #include "err_iokit.sub"
+#endif // !TARGET_OS_DRIVERKIT
 #include "err_ipc.sub"
 #include "err_kern.sub"
 #include "err_mach_ipc.sub"
 const struct error_system _mach_errors[err_max_system + 1] = {
        /* 0; err_kern */
        {
-               errlib_count(err_os_sub),
-               "(operating system/?) unknown subsystem error",
-               err_os_sub,
+               .max_sub = errlib_count(err_os_sub),
+               .bad_sub = "(operating system/?) unknown subsystem error",
+               .subsystem = err_os_sub,
        },
        /* 1; err_us */
        {
-               errlib_count(err_us_sub),
-               "(user space/?) unknown subsystem error",
-               err_us_sub,
+               .max_sub = errlib_count(err_us_sub),
+               .bad_sub = "(user space/?) unknown subsystem error",
+               .subsystem = err_us_sub,
        },
        /* 2; err_server */
        {
-               errlib_count(err_server_sub),
-               "(server/?) unknown subsystem error",
-               err_server_sub,
+               .max_sub = errlib_count(err_server_sub),
+               .bad_sub = "(server/?) unknown subsystem error",
+               .subsystem = err_server_sub,
        },
        /* 3 (& 3f); err_ipc */
        {
-               errlib_count(err_ipc_sub),
-               "(ipc/?) unknown subsystem error",
-               err_ipc_sub,
+               .max_sub = errlib_count(err_ipc_sub),
+               .bad_sub = "(ipc/?) unknown subsystem error",
+               .subsystem = err_ipc_sub,
        },
        /* 4; err_mach_ipc */
        {
-               errlib_count(err_mach_ipc_sub),
-               "(ipc/?) unknown subsystem error",
-               err_mach_ipc_sub,
+               .max_sub = errlib_count(err_mach_ipc_sub),
+               .bad_sub = "(ipc/?) unknown subsystem error",
+               .subsystem = err_mach_ipc_sub,
        },
 
        /* 0x05 */ errorlib_system_null,
@@ -134,21 +137,25 @@ const struct error_system _mach_errors[err_max_system + 1] = {
        /* 0x34 */ errorlib_system_null, /* 0x35 */ errorlib_system_null,
        /* 0x36 */ errorlib_system_null,
 
+#if !TARGET_OS_DRIVERKIT
        /* 0x37; err_libkern */
        {
-               errlib_count(err_libkern_sub),
-               "(libkern/?) unknown subsystem error",
-               err_libkern_sub,
+               .max_sub = errlib_count(err_libkern_sub),
+               .bad_sub = "(libkern/?) unknown subsystem error",
+               .subsystem = err_libkern_sub,
        },
 
        /* 0x38; err_iokit */
        {
-               errlib_count(err_iokit_sub),
-               "(iokit/?) unknown subsystem error",
-               err_iokit_sub,
-               err_iokit_sub_map,
-               errlib_count(err_iokit_sub_map)
+               .max_sub = errlib_count(err_iokit_sub),
+               .bad_sub = "(iokit/?) unknown subsystem error",
+               .subsystem = err_iokit_sub,
+               .map_table = err_iokit_sub_map,
+               .map_count = errlib_count(err_iokit_sub_map)
        },
+#else
+       /* 0x37 */ errorlib_system_null, /* 0x38 */ errorlib_system_null,
+#endif // TARGET_OS_DRIVERKIT
 
        /* 0x39 */ errorlib_system_null,
        /* 0x3a */ errorlib_system_null, /* 0x3b */ errorlib_system_null,
index f78f3d7a1792991ccaf593311500e9359941cda3..f7ac4e5e3b9326c7b6b0667471da6617ef6796f7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -97,7 +97,7 @@ struct error_subsystem {
        const struct error_sparse_map   *map_table;
        int                             map_count;
 };
-#define errorlib_system_null    { 0, NULL, NULL, NULL, 0 }
+#define errorlib_system_null    { NULL, 0, NULL, NULL, 0 }
 
 struct error_system {
        int                             max_sub;
@@ -106,7 +106,7 @@ struct error_system {
        const struct error_sparse_map   *map_table;
        int                             map_count;
 };
-#define errorlib_sub_null       { NULL, 0, NULL, NULL, 0 }
+#define errorlib_sub_null       { 0, NULL, NULL, NULL, 0 }
 
 extern const struct error_system        _mach_errors[err_max_system + 1];
 char *mach_error_string_int(mach_error_t, boolean_t *);
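
The switch to C99 designated initializers in errorlib.c (previous file) and the swapped positional bodies of the two null macros here make the point well: positional initializers silently re-associate values when a struct's field order changes, while designated ones stay attached to the field names. A hypothetical illustration:

    /* Hypothetical struct mirroring the shape of these tables. */
    struct subsys {
            int          max_code;
            const char  *title;
    };

    /* Positional: breaks silently if max_code and title are ever reordered. */
    static const struct subsys positional = { 3, "(example/?)" };

    /* Designated (C99): meaning stays attached to field names, any order. */
    static const struct subsys designated = { .title = "(example/?)", .max_code = 3 };
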
index a0d9f716531cd44e377d33a3e8bdc280c94d2e22..d24b486b6e2266b5c10b8340f67d68d3ce05461e 100644 (file)
@@ -51,7 +51,7 @@ internal_catch_exception_raise(
 #if defined(__DYNAMIC__)
        static _libkernel_exc_raise_func_t exc_raise_func = (void*)-1;
 
-       if (exc_raise_func == ((void*)-1)) {
+       if (exc_raise_func == ((void*)-1) && _dlsym) {
                exc_raise_func = _dlsym(RTLD_DEFAULT, "catch_exception_raise");
        }
        if (exc_raise_func == 0) {
index deedf57d17b0b5e2bb5a9698875e8cd404a692d2..47ac3d7fb6f66d431b4711ad0fed09b1221f90c5 100644 (file)
@@ -35,7 +35,6 @@
 #include <mach/message.h>
 #include <mach/exception.h>
 #include <mach/mig_errors.h>
-#include <dlfcn.h>
 
 #include "abort.h"
 #include "exc_catcher.h"
@@ -55,7 +54,7 @@ internal_catch_exception_raise_state(
 #if defined(__DYNAMIC__)
        static _libkernel_exc_raise_state_func_t exc_raise_state_func = (void*)-1;
 
-       if (exc_raise_state_func == ((void*)-1)) {
+       if (exc_raise_state_func == ((void*)-1) && _dlsym) {
                exc_raise_state_func = _dlsym(RTLD_DEFAULT, "catch_exception_raise_state");
        }
        if (exc_raise_state_func == 0) {
index 1eac28e6c08c6f754d53bef97da7675249ba7923..1ddaf8c65506247eaaa3b9a0c6dbbc9877135228 100644 (file)
@@ -35,7 +35,6 @@
 #include <mach/message.h>
 #include <mach/exception.h>
 #include <mach/mig_errors.h>
-#include <dlfcn.h>
 
 #include "abort.h"
 #include "exc_catcher.h"
@@ -57,7 +56,7 @@ internal_catch_exception_raise_state_identity(
 #if defined(__DYNAMIC__)
        static _libkernel_exec_raise_state_identity_t exc_raise_state_identity_func = (void*)-1;
 
-       if (exc_raise_state_identity_func == ((void*)-1)) {
+       if (exc_raise_state_identity_func == ((void*)-1) && _dlsym) {
                exc_raise_state_identity_func = _dlsym(RTLD_DEFAULT, "catch_exception_raise_state_identity");
        }
        if (exc_raise_state_identity_func == 0) {
index 2aa1c8423de72f3e12ad62818c7117538a053b8f..6a7ec639e935069ae6da5dd549eac6d3d678a1f1 100644 (file)
@@ -46,7 +46,7 @@ kern_return_t
 host_get_multiuser_config_flags(host_t host __unused,
     uint32_t *multiuser_flags)
 {
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
        volatile uint32_t *multiuser_flag_address = (volatile uint32_t *)(uintptr_t)(_COMM_PAGE_MULTIUSER_CONFIG);
        *multiuser_flags = *multiuser_flag_address;
        return KERN_SUCCESS;
@@ -60,7 +60,7 @@ kern_return_t
 host_check_multiuser_mode(host_t host __unused,
     uint32_t *multiuser_mode)
 {
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
        uint32_t multiuser_flags;
        kern_return_t kr;
 
index 88c2583b75e6aeca2fe0f395634bbca9ffc3811f..1c1b7af7461db5ef798a4f339c0502eb76b6040f 100644 (file)
@@ -363,11 +363,78 @@ mach_msg_destroy(mach_msg_header_t *msg)
                                daddr = (mach_msg_descriptor_t *)(dsc + 1);
                                break;
                        }
+
+                       case MACH_MSG_GUARDED_PORT_DESCRIPTOR: {
+                               mach_msg_guarded_port_descriptor_t *dsc;
+                               mach_msg_guard_flags_t flags;
+                               /*
+                                * Destroy port right carried in the message
+                                */
+                               dsc = &daddr->guarded_port;
+                               flags = dsc->flags;
+                               if ((flags & MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND) == 0) {
+                                       /* Need to unguard before destroying the port */
+                                       mach_port_unguard(mach_task_self_, dsc->name, (uint64_t)dsc->context);
+                               }
+                               mach_msg_destroy_port(dsc->name, dsc->disposition);
+                               daddr = (mach_msg_descriptor_t *)(dsc + 1);
+                               break;
+                       }
                        }
                }
        }
 }
 
+static inline boolean_t
+mach_msg_server_is_recoverable_send_error(kern_return_t kr)
+{
+       switch (kr) {
+       case MACH_SEND_INVALID_DEST:
+       case MACH_SEND_TIMED_OUT:
+       case MACH_SEND_INTERRUPTED:
+               return TRUE;
+       default:
+               /*
+                * Other errors mean that the message may have been partially destroyed
+                * by the kernel, and these can't be recovered and may leak resources.
+                */
+               return FALSE;
+       }
+}
+
+static kern_return_t
+mach_msg_server_mig_return_code(mig_reply_error_t *reply)
+{
+       /*
+        * If the message is complex, it is assumed that the reply was successful,
+        * as the RetCode is where the count of out of line descriptors is.
+        *
+        * If not, we read RetCode.
+        */
+       if (reply->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX) {
+               return KERN_SUCCESS;
+       }
+       return reply->RetCode;
+}
+
+static void
+mach_msg_server_consume_unsent_message(mach_msg_header_t *hdr)
+{
+       /* mach_msg_destroy doesn't handle the local port */
+       mach_port_t port = hdr->msgh_local_port;
+       if (MACH_PORT_VALID(port)) {
+               switch (MACH_MSGH_BITS_LOCAL(hdr->msgh_bits)) {
+               case MACH_MSG_TYPE_MOVE_SEND:
+               case MACH_MSG_TYPE_MOVE_SEND_ONCE:
+                       /* destroy the send/send-once right */
+                       (void) mach_port_deallocate(mach_task_self_, port);
+                       hdr->msgh_local_port = MACH_PORT_NULL;
+                       break;
+               }
+       }
+       mach_msg_destroy(hdr);
+}
+
 /*
  *     Routine:        mach_msg_server_once
  *     Purpose:
@@ -453,15 +520,19 @@ mach_msg_server_once(
 
                (void) (*demux)(&bufRequest->Head, &bufReply->Head);
 
-               if (!(bufReply->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX)) {
-                       if (bufReply->RetCode == MIG_NO_REPLY) {
-                               bufReply->Head.msgh_remote_port = MACH_PORT_NULL;
-                       } else if ((bufReply->RetCode != KERN_SUCCESS) &&
-                           (bufRequest->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX)) {
-                               /* destroy the request - but not the reply port */
-                               bufRequest->Head.msgh_remote_port = MACH_PORT_NULL;
-                               mach_msg_destroy(&bufRequest->Head);
-                       }
+               switch (mach_msg_server_mig_return_code(bufReply)) {
+               case KERN_SUCCESS:
+                       break;
+               case MIG_NO_REPLY:
+                       bufReply->Head.msgh_remote_port = MACH_PORT_NULL;
+                       break;
+               default:
+                       /*
+                        * destroy the request - but not the reply port
+                        * (MIG moved it into the bufReply).
+                        */
+                       bufRequest->Head.msgh_remote_port = MACH_PORT_NULL;
+                       mach_msg_destroy(&bufRequest->Head);
                }
 
                /*
@@ -482,18 +553,13 @@ mach_msg_server_once(
                            bufReply->Head.msgh_size, 0, MACH_PORT_NULL,
                            MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
 
-                       if ((mr != MACH_SEND_INVALID_DEST) &&
-                           (mr != MACH_SEND_TIMED_OUT)) {
-                               goto done_once;
+                       if (mach_msg_server_is_recoverable_send_error(mr)) {
+                               mach_msg_server_consume_unsent_message(&bufReply->Head);
+                               mr = MACH_MSG_SUCCESS;
                        }
-                       mr = MACH_MSG_SUCCESS;
-               }
-               if (bufReply->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX) {
-                       mach_msg_destroy(&bufReply->Head);
                }
        }
 
-done_once:
        voucher_mach_msg_revert(old_state);
 
        (void)vm_deallocate(self,
@@ -530,7 +596,7 @@ mach_msg_server(
        voucher_mach_msg_state_t old_state = VOUCHER_MACH_MSG_STATE_UNCHANGED;
        boolean_t buffers_swapped = FALSE;
 
-       options &= ~(MACH_SEND_MSG | MACH_RCV_MSG | MACH_RCV_VOUCHER | MACH_RCV_OVERWRITE);
+       options &= ~(MACH_SEND_MSG | MACH_RCV_MSG | MACH_RCV_VOUCHER);
 
        reply_alloc = (mach_msg_size_t)round_page((options & MACH_SEND_TRAILER) ?
            (max_size + MAX_TRAILER_SIZE) : max_size);
@@ -578,15 +644,19 @@ mach_msg_server(
 
                        (void) (*demux)(&bufRequest->Head, &bufReply->Head);
 
-                       if (!(bufReply->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX)) {
-                               if (bufReply->RetCode == MIG_NO_REPLY) {
-                                       bufReply->Head.msgh_remote_port = MACH_PORT_NULL;
-                               } else if ((bufReply->RetCode != KERN_SUCCESS) &&
-                                   (bufRequest->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX)) {
-                                       /* destroy the request - but not the reply port */
-                                       bufRequest->Head.msgh_remote_port = MACH_PORT_NULL;
-                                       mach_msg_destroy(&bufRequest->Head);
-                               }
+                       switch (mach_msg_server_mig_return_code(bufReply)) {
+                       case KERN_SUCCESS:
+                               break;
+                       case MIG_NO_REPLY:
+                               bufReply->Head.msgh_remote_port = MACH_PORT_NULL;
+                               break;
+                       default:
+                               /*
+                                * destroy the request - but not the reply port
+                                * (MIG moved it into the bufReply).
+                                */
+                               bufRequest->Head.msgh_remote_port = MACH_PORT_NULL;
+                               mach_msg_destroy(&bufRequest->Head);
                        }
 
                        /*
@@ -628,32 +698,25 @@ mach_msg_server(
                                                &bufRequest->Head, 0);
                                }
 
-                               if ((mr != MACH_SEND_INVALID_DEST) &&
-                                   (mr != MACH_SEND_TIMED_OUT) &&
-                                   (mr != MACH_RCV_TIMED_OUT)) {
+                               /*
+                                * Need to destroy the reply msg in case there was a send timeout or
+                                * invalid destination. The reply msg would have been swapped with the
+                                * request msg if buffers_swapped is true, so destroy the request msg
+                                * instead of the reply msg in such cases.
+                                */
+                               if (mach_msg_server_is_recoverable_send_error(mr)) {
+                                       if (buffers_swapped) {
+                                               mach_msg_server_consume_unsent_message(&bufRequest->Head);
+                                       } else {
+                                               mach_msg_server_consume_unsent_message(&bufReply->Head);
+                                       }
+                               } else if (mr != MACH_RCV_TIMED_OUT) {
                                        voucher_mach_msg_revert(old_state);
                                        old_state = VOUCHER_MACH_MSG_STATE_UNCHANGED;
 
                                        continue;
                                }
                        }
-                       /*
-                        * Need to destroy the reply msg in case if there was a send timeout or
-                        * invalid destination. The reply msg would be swapped with request msg
-                        * if buffers_swapped is true, thus destroy request msg instead of
-                        * reply msg in such cases.
-                        */
-                       if (mr != MACH_RCV_TIMED_OUT) {
-                               if (buffers_swapped) {
-                                       if (bufRequest->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX) {
-                                               mach_msg_destroy(&bufRequest->Head);
-                                       }
-                               } else {
-                                       if (bufReply->Head.msgh_bits & MACH_MSGH_BITS_COMPLEX) {
-                                               mach_msg_destroy(&bufReply->Head);
-                                       }
-                               }
-                       }
                        voucher_mach_msg_revert(old_state);
                        old_state = VOUCHER_MACH_MSG_STATE_UNCHANGED;
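
The refactor above routes both servers through the shared helpers: only MACH_SEND_INVALID_DEST, MACH_SEND_TIMED_OUT and MACH_SEND_INTERRUPTED are treated as recoverable, and the unsent reply is then consumed by the caller, including the local port right that mach_msg_destroy does not release. A standalone sketch of the same send-and-clean-up pattern (buffer construction elided; illustrative, not the library code):

    #include <mach/mach.h>
    #include <mach/mig_errors.h>

    /* Sketch: send a fully formed MIG reply; on a recoverable send failure,
     * destroy the unsent message ourselves, mirroring
     * mach_msg_server_consume_unsent_message() above. */
    static void
    send_reply_or_consume(mig_reply_error_t *reply)
    {
            mach_msg_return_t mr = mach_msg(&reply->Head, MACH_SEND_MSG,
                reply->Head.msgh_size, 0, MACH_PORT_NULL,
                MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);

            switch (mr) {
            case MACH_MSG_SUCCESS:
                    break;
            case MACH_SEND_INVALID_DEST:
            case MACH_SEND_TIMED_OUT:
            case MACH_SEND_INTERRUPTED:
                    /* Kernel did not consume the message: release the local
                     * port right (mach_msg_destroy skips it), then the rest. */
                    if (MACH_PORT_VALID(reply->Head.msgh_local_port)) {
                            switch (MACH_MSGH_BITS_LOCAL(reply->Head.msgh_bits)) {
                            case MACH_MSG_TYPE_MOVE_SEND:
                            case MACH_MSG_TYPE_MOVE_SEND_ONCE:
                                    (void)mach_port_deallocate(mach_task_self(),
                                        reply->Head.msgh_local_port);
                                    reply->Head.msgh_local_port = MACH_PORT_NULL;
                                    break;
                            }
                    }
                    mach_msg_destroy(&reply->Head);
                    break;
            default:
                    /* The message may have been partially destroyed by the
                     * kernel; it cannot be safely reused or destroyed again. */
                    break;
            }
    }
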
 
index 52f731b9997edb02b4c208dd64c6a43cb684782f..57d67509466e616870fe0f203631cc25dbd7ca86 100644 (file)
@@ -33,6 +33,7 @@
 #include <mach/mach_sync_ipc.h>
 #include "tsd.h"
 
+
 kern_return_t
 mach_port_names(
        ipc_space_t task,
@@ -57,7 +58,11 @@ mach_port_type(
 {
        kern_return_t rv;
 
-       rv = _kernelrpc_mach_port_type(task, name, ptype);
+       rv = _kernelrpc_mach_port_type_trap(task, name, ptype);
+
+       if (rv == MACH_SEND_INVALID_DEST) {
+               rv = _kernelrpc_mach_port_type(task, name, ptype);
+       }
 
        return rv;
 }
@@ -246,9 +251,14 @@ mach_port_request_notification(
 {
        kern_return_t rv;
 
-       rv = _kernelrpc_mach_port_request_notification(task, name, msgid,
+       rv = _kernelrpc_mach_port_request_notification_trap(task, name, msgid,
            sync, notify, notifyPoly, previous);
 
+       if (rv == MACH_SEND_INVALID_DEST) {
+               rv = _kernelrpc_mach_port_request_notification(task, name, msgid,
+                   sync, notify, notifyPoly, previous);
+       }
+
        return rv;
 }
 
@@ -744,3 +754,31 @@ thread_destruct_special_reply_port(
                return KERN_INVALID_ARGUMENT;
        }
 }
+
+kern_return_t
+mach_port_guard_with_flags(
+       ipc_space_t             task,
+       mach_port_name_t        name,
+       mach_port_context_t     guard,
+       uint64_t                flags)
+{
+       kern_return_t rv;
+
+       rv = _kernelrpc_mach_port_guard_with_flags(task, name, (uint64_t) guard, flags);
+
+       return rv;
+}
+
+kern_return_t
+mach_port_swap_guard(
+       ipc_space_t             task,
+       mach_port_name_t        name,
+       mach_port_context_t     old_guard,
+       mach_port_context_t     new_guard)
+{
+       kern_return_t rv;
+
+       rv = _kernelrpc_mach_port_swap_guard(task, name, (uint64_t)old_guard, (uint64_t)new_guard);
+
+       return rv;
+}
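
Both wrappers forward directly to the corresponding kernel RPCs. A hedged sketch of a guard lifecycle built from them plus the pre-existing mach_port_unguard (the guard words are arbitrary, and the flags value 0 is an assumption meaning "no special guard flags"):

    #include <mach/mach.h>

    kern_return_t
    guard_lifecycle_sketch(void)
    {
            mach_port_t port = MACH_PORT_NULL;
            kern_return_t kr;

            kr = mach_port_allocate(mach_task_self(),
                MACH_PORT_RIGHT_RECEIVE, &port);
            if (kr != KERN_SUCCESS) {
                    return kr;
            }

            /* Guard the right with an arbitrary context word. */
            kr = mach_port_guard_with_flags(mach_task_self(), port,
                (mach_port_context_t)0x1234, 0 /* assumed: no flags */);
            if (kr == KERN_SUCCESS) {
                    /* Atomically replace the guard value... */
                    kr = mach_port_swap_guard(mach_task_self(), port,
                        (mach_port_context_t)0x1234, (mach_port_context_t)0x5678);
            }
            if (kr == KERN_SUCCESS) {
                    /* ...and unguard (with the matching context) before the
                     * right can be destroyed. */
                    kr = mach_port_unguard(mach_task_self(), port,
                        (mach_port_context_t)0x5678);
            }
            (void)mach_port_mod_refs(mach_task_self(), port,
                MACH_PORT_RIGHT_RECEIVE, -1);
            return kr;
    }
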
index fe89c6513c8eb2cb643cbd307f802383ec616f31..f8fbf921dee843f783b7fe3f3bf991093546c535 100644 (file)
@@ -58,7 +58,7 @@ mach_vm_allocate(
                rv = _kernelrpc_mach_vm_allocate(target, address, size, flags);
        }
 
-       if (__syscall_logger) {
+       if (__syscall_logger && rv == KERN_SUCCESS && !(flags & VM_MAKE_TAG(VM_MEMORY_STACK))) {
                int userTagFlags = flags & VM_FLAGS_ALIAS_MASK;
                __syscall_logger(stack_logging_type_vm_allocate | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0);
        }
@@ -184,7 +184,7 @@ mach_vm_map(
                    offset, copy, cur_protection, max_protection, inheritance);
        }
 
-       if (__syscall_logger) {
+       if (__syscall_logger && rv == KERN_SUCCESS && !(flags & VM_MAKE_TAG(VM_MEMORY_STACK))) {
                int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem;
                int userTagFlags = flags & VM_FLAGS_ALIAS_MASK;
                __syscall_logger(eventTypeFlags | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0);
@@ -213,7 +213,7 @@ mach_vm_remap(
            src_task, src_address, copy, cur_protection, max_protection,
            inheritance);
 
-       if (__syscall_logger) {
+       if (__syscall_logger && rv == KERN_SUCCESS) {
                int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem;
                int userTagFlags = flags & VM_FLAGS_ALIAS_MASK;
                __syscall_logger(eventTypeFlags | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0);
@@ -234,7 +234,7 @@ mach_vm_read(
 
        rv = _kernelrpc_mach_vm_read(target, address, size, data, dataCnt);
 
-       if (__syscall_logger) {
+       if (__syscall_logger && rv == KERN_SUCCESS) {
                int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem;
                // The target argument is the remote task from which data is being read,
                // so pass mach_task_self() as the destination task receiving the allocation.
@@ -263,7 +263,7 @@ vm_map(
        rv = _kernelrpc_vm_map(target, address, size, mask, flags, object,
            offset, copy, cur_protection, max_protection, inheritance);
 
-       if (__syscall_logger) {
+       if (__syscall_logger && rv == KERN_SUCCESS) {
                int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem;
                int userTagFlags = flags & VM_FLAGS_ALIAS_MASK;
                __syscall_logger(eventTypeFlags | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0);
@@ -313,7 +313,7 @@ vm_read(
 
        rv = _kernelrpc_vm_read(target, address, size, data, dataCnt);
 
-       if (__syscall_logger) {
+       if (__syscall_logger && rv == KERN_SUCCESS) {
                int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem;
                // The target argument is the remote task from which data is being read,
                // so pass mach_task_self() as the destination task receiving the allocation.
diff --git a/libsyscall/mach/memory_entry.defs b/libsyscall/mach/memory_entry.defs
new file mode 100644 (file)
index 0000000..1cfbe3d
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#include <mach/memory_entry.defs>
index 035cf223735dd20dbfcdac93dd5057eb4b4e847e..2e086c80d061ccf1a7943b9e67e02238ba244bc7 100644 (file)
@@ -69,8 +69,10 @@ mach_host_special_port_description(int port)
                [HOST_RESOURCE_NOTIFY_PORT] = "resource notify",
                [HOST_CLOSURED_PORT] = "closured",
                [HOST_SYSPOLICYD_PORT] = "syspolicyd",
+               [HOST_FILECOORDINATIOND_PORT] = "filecoordinationd",
+               [HOST_FAIRPLAYD_PORT] = "fairplayd",
        };
-       _Static_assert(HOST_SYSPOLICYD_PORT == HOST_MAX_SPECIAL_PORT,
+       _Static_assert(HOST_FAIRPLAYD_PORT == HOST_MAX_SPECIAL_PORT,
            "all host special ports must have descriptions");
 
        return hsp_descs[port_index];
@@ -149,6 +151,7 @@ mach_host_special_port_for_id(const char *id)
                SP_ENTRY(HOST_RESOURCE_NOTIFY_PORT),
                SP_ENTRY(HOST_CLOSURED_PORT),
                SP_ENTRY(HOST_SYSPOLICYD_PORT),
+               SP_ENTRY(HOST_FILECOORDINATIOND_PORT),
        };
 
        return port_for_id_internal(id, hsp_ids,
diff --git a/libsyscall/mach/restartable.defs b/libsyscall/mach/restartable.defs
new file mode 100644 (file)
index 0000000..d43bda0
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2019 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#include <mach/restartable.defs>
index fefeb9d507192aa189bb1f04e4f88ad8ba46c5f5..8ed90a0e5fbafab9855e86fa19c9f2396b79e4bd 100644 (file)
@@ -43,7 +43,6 @@
  */
 
 #include <mach/mach.h>
-#include <mach-o/arch.h>
 #include <stddef.h>
 
 kern_return_t
diff --git a/libsyscall/os/log_data.c b/libsyscall/os/log_data.c
new file mode 100644 (file)
index 0000000..baaafcf
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/log_data.h>
+
+extern int __log_data(unsigned int tag, unsigned int flags, void *buffer, unsigned int size);
+
+int
+log_data_as_kernel(unsigned int tag, unsigned int flags, void *buffer, unsigned int size)
+{
+       return __log_data(tag, flags, buffer, size);
+}
diff --git a/libsyscall/os/proc.h b/libsyscall/os/proc.h
new file mode 100644 (file)
index 0000000..84a3396
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef __OS_PROC__
+#define __OS_PROC__
+
+#include <stddef.h>
+#include <sys/cdefs.h>
+#include <os/availability.h>
+
+/*!
+ * @header
+ *
+ * @preprocinfo
+ * This is for functions that operate on the calling process alone.
+ */
+
+__BEGIN_DECLS
+
+/*!
+ * @function os_proc_available_memory
+ *
+ * @abstract
+ * Return the number of bytes remaining, at the time of the call, before the
+ * current process will hit its current dirty memory limit.
+ *
+ * @discussion
+ * Developers can query this value efficiently whenever it is needed. The return
+ * value is only a snapshot at the time of the call. Caching the result is not
+ * advised. The result may be instantaneously invalidated by actions taken in
+ * another thread or another framework.
+ *
+ * Memory limits can change during the app life cycle. Make sure to check accordingly.
+ *
+ * The size returned is not representative of the total memory of the device; it
+ * is the current dirty memory limit minus the dirty memory footprint used at the
+ * time of the query.
+ *
+ * This interface allows an app to efficiently consume all available memory resources.
+ * Significant memory use, even under the current memory limit, may still degrade
+ * system-wide performance, up to and including the termination of other apps and
+ * system processes. Take care to use the minimum amount of memory needed to satisfy the
+ * user’s need.
+ *
+ * If you need more information than just the available memory, you can use task_info().
+ * The information returned is equivalent to the task_vm_info.limit_bytes_remaining
+ * field. task_info() is a more expensive call and will return information such
+ * as your phys_footprint, which is used to calculate this function's return value.
+ *
+ * Dirty memory contains data that must be kept in RAM (or the equivalent) even
+ * when unused. It is memory that has been modified.
+ *
+ * @param none
+ *
+ * @result
+ * The remaining bytes. 0 is returned if the calling process is not an app, or
+ * if the calling process has exceeded its memory limit.
+ */
+
+API_UNAVAILABLE(macos) API_AVAILABLE(ios(13.0), tvos(13.0), watchos(6.0), bridgeos(4.0))
+extern
+size_t os_proc_available_memory(void);
+
+__END_DECLS
+
+#endif
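
A hedged usage sketch (iOS-family only: the declaration above marks the call API_UNAVAILABLE on macOS, and it returns 0 for non-app processes or once the limit is exceeded):

    #include <os/proc.h>
    #include <stdio.h>

    int
    main(void)
    {
            size_t headroom = os_proc_available_memory();
            if (headroom == 0) {
                    printf("not an app, or over the dirty memory limit\n");
                    return 0;
            }
            /* The value is a snapshot; budget well below it (illustrative). */
            size_t cache_budget = headroom / 4;
            printf("headroom=%zu budget=%zu\n", headroom, cache_budget);
            return 0;
    }
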
index 474c97aec2976bba7491dc0bd2e3c457e3acac1e..e4ab6d678ac8c65fdebe761c96566037c2784051 100644 (file)
@@ -58,7 +58,7 @@ __attribute__((always_inline))
 static __inline__ unsigned int
 _os_cpu_number(void)
 {
-#if defined(__arm__) && defined(_ARM_ARCH_6)
+#if defined(__arm__)
        uintptr_t p;
        __asm__("mrc    p15, 0, %[p], c13, c0, 3" : [p] "=&r" (p));
        return (unsigned int)(p & 0x3ul);
@@ -116,16 +116,16 @@ __attribute__((always_inline, pure))
 static __inline__ void**
 _os_tsd_get_base(void)
 {
-#if defined(__arm__) && defined(_ARM_ARCH_6)
+#if defined(__arm__)
        uintptr_t tsd;
-       __asm__("mrc p15, 0, %0, c13, c0, 3" : "=r" (tsd));
-       tsd &= ~0x3ul; /* lower 2-bits contain CPU number */
-#elif defined(__arm__) && defined(_ARM_ARCH_5)
-       register uintptr_t tsd asm ("r9");
+       __asm__("mrc p15, 0, %0, c13, c0, 3\n"
+                "bic %0, %0, #0x3\n" : "=r" (tsd));
+       /* lower 2-bits contain CPU number */
 #elif defined(__arm64__)
        uint64_t tsd;
-       __asm__("mrs %0, TPIDRRO_EL0" : "=r" (tsd));
-       tsd &= ~0x7ull;
+       __asm__("mrs %0, TPIDRRO_EL0\n"
+                "bic %0, %0, #0x7\n" : "=r" (tsd));
+       /* lower 3-bits contain CPU number */
 #endif
 
        return (void**)(uintptr_t)tsd;
index 0b295ba10de48b3292f27ecaf32efdee9ecc383a..2f84dba81380bca4825c21db9c87dc53fafdb890 100644 (file)
@@ -54,6 +54,7 @@ fcntl(int fd, int cmd, ...)
        case F_LOG2PHYS:
        case F_LOG2PHYS_EXT:
        case F_GETPATH:
+       case F_GETPATH_NOFIRMLINK:
        case F_GETPATH_MTMINFO:
        case F_GETCODEDIR:
        case F_PATHPKG_CHECK:
@@ -66,6 +67,7 @@ fcntl(int fd, int cmd, ...)
        case F_FINDSIGS:
        case F_TRANSCODEKEY:
        case F_TRIM_ACTIVE_FILE:
+       case F_SPECULATIVE_READ:
        case F_CHECK_LV:
                arg = va_arg(ap, void *);
                break;
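
The wrapper has to know which commands carry a pointer so it can pull the right type off the va_list; F_GETPATH_NOFIRMLINK and F_SPECULATIVE_READ join the pointer-argument group here. A usage sketch for F_GETPATH_NOFIRMLINK, which returns the file's path without firmlink translation:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/param.h>
    #include <unistd.h>

    int
    main(void)
    {
            char path[MAXPATHLEN];
            int fd = open("/etc/hosts", O_RDONLY);
            if (fd < 0) {
                    return 1;
            }
            /* Path without firmlink resolution (e.g. the /System/Volumes/Data
             * form on a macOS 10.15 volume group). */
            if (fcntl(fd, F_GETPATH_NOFIRMLINK, path) != -1) {
                    printf("%s\n", path);
            }
            close(fd);
            return 0;
    }
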
index 33da1103d43d58fbd86a0f3f872e5a48bf5df897..6c183a9f78da5ccd191137bc0738f55b7d6d878d 100644 (file)
@@ -30,6 +30,7 @@
 /* Syscall entry points */
 int __coalition(uint32_t operation, uint64_t *cid, uint32_t flags);
 int __coalition_info(uint32_t operation, uint64_t *cid, void *buffer, size_t *bufsize);
+int __coalition_ledger(uint32_t operation, uint64_t *cid, void *buffer, size_t *bufsize);
 
 int
 coalition_create(uint64_t *cid_out, uint32_t flags)
@@ -67,3 +68,10 @@ coalition_info_set_efficiency(uint64_t cid, uint64_t flags)
        size_t size = sizeof(flags);
        return __coalition_info(COALITION_INFO_SET_EFFICIENCY, &cid, (void *)&flags, &size);
 }
+
+int
+coalition_ledger_set_logical_writes_limit(uint64_t cid, int64_t limit)
+{
+       size_t size = sizeof(limit);
+       return __coalition_ledger(COALITION_LEDGER_SET_LOGICAL_WRITES_LIMIT, &cid, (void *)&limit, &size);
+}
index daa0ea30a0dd0d1a654047600756d03cb2ea9d5f..60b0e26de7b4673c0f6c8ba589b33fefd6afb9c6 100644 (file)
@@ -29,7 +29,7 @@
 
 #include "gethostuuid_private.h"
 
-extern int __gethostuuid(uuid_t, const struct timespec *, int);
+extern int __gethostuuid(uuid_t, const struct timespec *);
 
 static volatile int (*_gethostuuid_callback)(uuid_t) = (void *)0;
 
@@ -38,7 +38,7 @@ gethostuuid(uuid_t uuid, const struct timespec *timeout)
 {
        int result;
 
-       result = __gethostuuid(uuid, timeout, 0);
+       result = __gethostuuid(uuid, timeout);
        if ((result == -1) && (errno == EPERM)) {
                if (_gethostuuid_callback) {
                        result = _gethostuuid_callback(uuid);
@@ -51,11 +51,11 @@ gethostuuid(uuid_t uuid, const struct timespec *timeout)
        return result;
 }
 
-/* SPI to call gethostuuid syscall directly, without fallback */
+/* SPI to call gethostuuid syscall directly, without fallback; requires an entitlement */
 int
 _getprivatesystemidentifier(uuid_t uuid, const struct timespec *timeout)
 {
-       return __gethostuuid(uuid, timeout, 1);
+       return __gethostuuid(uuid, timeout);
 }
 
 int
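
The third argument to __gethostuuid is gone: per the updated comment, the privileged variant is now distinguished by entitlement rather than by a flag. The public entry point is unchanged; a usage sketch:

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>
    #include <uuid/uuid.h>

    int
    main(void)
    {
            uuid_t uuid;
            struct timespec timeout = { .tv_sec = 5, .tv_nsec = 0 };

            if (gethostuuid(uuid, &timeout) == 0) {
                    uuid_string_t str;
                    uuid_unparse(uuid, str);
                    printf("host uuid: %s\n", str);
            } else {
                    perror("gethostuuid");
            }
            return 0;
    }
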
index db097ad446de04d3658833285c8492ef2c89dc39..e09f849cce42cc9eb38abb4201a1860fc8ac7795 100644 (file)
@@ -33,7 +33,7 @@ getiopolicy_np(int iotype, int scope)
        int policy, error;
        struct _iopol_param_t iop_param;
 
-       if ((iotype != IOPOL_TYPE_DISK && iotype != IOPOL_TYPE_VFS_ATIME_UPDATES) ||
+       if ((iotype != IOPOL_TYPE_DISK && iotype != IOPOL_TYPE_VFS_ATIME_UPDATES && iotype != IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES) ||
            (scope != IOPOL_SCOPE_PROCESS && scope != IOPOL_SCOPE_THREAD)) {
                errno = EINVAL;
                policy = -1;
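
getiopolicy_np now also accepts IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES, which reports the process's policy for materializing dataless files on access. A usage sketch:

    #include <stdio.h>
    #include <sys/resource.h>

    int
    main(void)
    {
            int policy = getiopolicy_np(IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES,
                IOPOL_SCOPE_PROCESS);
            if (policy == -1) {
                    perror("getiopolicy_np");
                    return 1;
            }
            printf("dataless-materialization policy: %d\n", policy);
            return 0;
    }
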
index e42794a493f8addd527320554ae9a365f8d0c3b6..d7409d5419169383e98735e9deed9074f1c0b5ce 100644 (file)
@@ -116,6 +116,13 @@ kdebug_is_enabled(uint32_t debugid)
        return TRUE;
 }
 
+bool
+kdebug_using_continuous_time(void)
+{
+       uint32_t state = *((volatile uint32_t *)(uintptr_t)(_COMM_PAGE_KDEBUG_ENABLE));
+       return state & KDEBUG_ENABLE_CONT_TIME;
+}
+
 int
 kdebug_trace(uint32_t debugid, uint64_t arg1, uint64_t arg2, uint64_t arg3,
     uint64_t arg4)
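
kdebug_using_continuous_time() reads the commpage enable word and reports whether trace timestamps are on the continuous timebase. A sketch of taking user-space timestamps on the matching timebase (the prototype is the one added above; where the SDK declares it is not shown in this diff):

    #include <mach/mach_time.h>
    #include <stdbool.h>
    #include <stdint.h>

    extern bool kdebug_using_continuous_time(void);  /* as defined above */

    /* Sketch: timestamp on the same timebase kdebug is using, so user events
     * line up with kernel trace points. */
    static uint64_t
    trace_timestamp(void)
    {
            return kdebug_using_continuous_time()
                ? mach_continuous_time()    /* keeps counting across sleep */
                : mach_absolute_time();
    }
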
index 4c6fc235696db3068a3d00beca4b303c516fc203..8cf27b6ac87931e00fbd4ed0787326da0362b651 100644 (file)
@@ -226,16 +226,16 @@ int
 proc_regionfilename(int pid, uint64_t address, void * buffer, uint32_t buffersize)
 {
        int retval;
-       struct proc_regionwithpathinfo reginfo;
+       struct proc_regionpath path;
 
        if (buffersize < MAXPATHLEN) {
                errno = ENOMEM;
                return 0;
        }
 
-       retval = proc_pidinfo(pid, PROC_PIDREGIONPATHINFO2, (uint64_t)address, &reginfo, sizeof(struct proc_regionwithpathinfo));
+       retval = proc_pidinfo(pid, PROC_PIDREGIONPATH, (uint64_t)address, &path, sizeof(struct proc_regionpath));
        if (retval != -1) {
-               return (int)(strlcpy(buffer, reginfo.prp_vip.vip_path, MAXPATHLEN));
+               return (int)(strlcpy(buffer, path.prpo_path, buffersize));
        }
        return 0;
 }
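
proc_regionfilename() now uses the leaner PROC_PIDREGIONPATH flavor, which returns just the path, and bounds the copy by the caller's buffersize rather than a fixed MAXPATHLEN. Caller-side usage is unchanged:

    #include <libproc.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/param.h>
    #include <unistd.h>

    int
    main(void)
    {
            char path[MAXPATHLEN];  /* must be at least MAXPATHLEN bytes */
            /* Illustrative address inside a file-backed mapping: our own text. */
            uint64_t addr = (uint64_t)(uintptr_t)&main;

            int len = proc_regionfilename(getpid(), addr, path, sizeof(path));
            if (len > 0) {
                    printf("%.*s\n", len, path);
            }
            return 0;
    }
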
@@ -622,7 +622,7 @@ proc_clear_cpulimits(pid_t pid)
        }
 }
 
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 
 int
 proc_setcpu_deadline(pid_t pid, int action, uint64_t deadline)
@@ -739,7 +739,7 @@ proc_can_use_foreground_hw(int pid, uint32_t *reason)
 {
        return __proc_info(PROC_INFO_CALL_CANUSEFGHW, pid, 0, 0, reason, sizeof(*reason));
 }
-#endif /* TARGET_OS_EMBEDDED */
+#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 
 /* Donate importance to adaptive processes from this process */
@@ -748,19 +748,19 @@ proc_donate_importance_boost()
 {
        int rval;
 
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
        rval = __process_policy(PROC_POLICY_SCOPE_PROCESS,
            PROC_POLICY_ACTION_ENABLE,
            PROC_POLICY_APPTYPE,
            PROC_POLICY_IOS_DONATEIMP,
            NULL, getpid(), (uint64_t)0);
-#else /* TARGET_OS_EMBEDDED */
+#else /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
        rval = __process_policy(PROC_POLICY_SCOPE_PROCESS,
            PROC_POLICY_ACTION_SET,
            PROC_POLICY_BOOST,
            PROC_POLICY_IMP_DONATION,
            NULL, getpid(), 0);
-#endif /* TARGET_OS_EMBEDDED */
+#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
        if (rval == 0) {
                return 0;
@@ -903,7 +903,7 @@ proc_denap_assertion_complete(uint64_t assertion_token)
        return proc_importance_assertion_complete(assertion_token);
 }
 
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 
 int
 proc_clear_vmpressure(pid_t pid)
@@ -992,7 +992,7 @@ proc_enable_apptype(pid_t pid, int apptype)
        }
 }
 
-#if !TARGET_IPHONE_SIMULATOR
+#if !TARGET_OS_SIMULATOR
 
 int
 proc_suppress(__unused pid_t pid, __unused uint64_t *generation)
@@ -1000,6 +1000,6 @@ proc_suppress(__unused pid_t pid, __unused uint64_t *generation)
        return 0;
 }
 
-#endif /* !TARGET_IPHONE_SIMULATOR */
+#endif /* !TARGET_OS_SIMULATOR */
 
-#endif /* !TARGET_OS_EMBEDDED */
+#endif /* !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
index 1f4bc60da33b19fe0bd8dd97471510a22048e0f8..c154510f05bbe1a71d0bfb1e740dc8c2fb659b18 100644 (file)
@@ -41,7 +41,7 @@ int proc_clear_cpulimits(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_12_2, __IP
 /* CPU limits, applies to current thread only. 0% unsets limit */
 int proc_setthread_cpupercent(uint8_t percentage, uint32_t ms_refill) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_5_0);
 
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 
 /* CPU monitor action, continued */
 #define PROC_SETCPU_ACTION_SUSPEND      2
@@ -86,7 +86,7 @@ int proc_pidbind(int pid, uint64_t threadid, int bind);
  */
 int proc_can_use_foreground_hw(int pid, uint32_t *reason);
 
-#else /* TARGET_OS_EMBEDDED */
+#else /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 /* resume a process suspended due to low VM resources */
 int proc_clear_vmpressure(pid_t pid);
@@ -113,7 +113,7 @@ int proc_clear_delayidlesleep(void);
 int proc_disable_apptype(pid_t pid, int apptype);
 int proc_enable_apptype(pid_t pid, int apptype);
 
-#endif /* TARGET_OS_EMBEDDED */
+#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 /* mark process as importance donating */
 int proc_donate_importance_boost(void);
@@ -160,7 +160,7 @@ int proc_pidoriginatorinfo(int flavor, void *buffer, int buffersize) __OSX_AVAIL
 
 int proc_listcoalitions(int flavor, int coaltype, void *buffer, int buffersize) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_8_3);
 
-#if !TARGET_IPHONE_SIMULATOR
+#if !TARGET_OS_SIMULATOR
 
 #define PROC_SUPPRESS_SUCCESS                (0)
 #define PROC_SUPPRESS_BAD_ARGUMENTS         (-1)
@@ -168,7 +168,7 @@ int proc_listcoalitions(int flavor, int coaltype, void *buffer, int buffersize)
 #define PROC_SUPPRESS_ALREADY_SUPPRESSED    (-3)
 
 int proc_suppress(pid_t pid, uint64_t *generation);
-#endif /* !TARGET_IPHONE_SIMULATOR */
+#endif /* !TARGET_OS_SIMULATOR */
 
 __END_DECLS
 
index 2c637bfc933740fb4174eaf6dbe650de6741a5bd..5b3b36541fd51f5f185856bcc34b89ef49d5e065 100644 (file)
@@ -178,7 +178,7 @@ _mach_absolute_time:
        movw    ip, #((_COMM_PAGE_TIMEBASE_OFFSET) & 0x0000FFFF)
        movt    ip, #(((_COMM_PAGE_TIMEBASE_OFFSET) >> 16) & 0x0000FFFF)
        ldrb    r0, [ip, #((_COMM_PAGE_USER_TIMEBASE) - (_COMM_PAGE_TIMEBASE_OFFSET))]
-       cmp     r0, #0                          // Are userspace reads supported?
+       cmp     r0, #USER_TIMEBASE_NONE         // Are userspace reads supported?
        beq     _mach_absolute_time_kernel      // If not, go to the kernel
        isb                                     // Prevent speculation on CNTPCT across calls
                                                // (see ARMV7C.b section B8.1.2, ARMv8 section D6.1.2)
@@ -242,7 +242,7 @@ _mach_absolute_time:
        movk    x3, #(((_COMM_PAGE_TIMEBASE_OFFSET) >> 16) & 0x000000000000FFFF), lsl #16
        movk    x3, #((_COMM_PAGE_TIMEBASE_OFFSET) & 0x000000000000FFFF)
        ldrb    w2, [x3, #((_COMM_PAGE_USER_TIMEBASE) - (_COMM_PAGE_TIMEBASE_OFFSET))]
-       cmp     x2, #0                          // Are userspace reads supported?
+       cmp     x2, #USER_TIMEBASE_NONE         // Are userspace reads supported?
        b.eq    _mach_absolute_time_kernel      // If not, go to the kernel
        isb                                     // Prevent speculation on CNTPCT across calls
                                                // (see ARMV7C.b section B8.1.2, ARMv8 section D6.1.2)
@@ -253,7 +253,9 @@ L_mach_absolute_time_user:
        cmp     x1, x2                          // Compare our offset values...
        b.ne    L_mach_absolute_time_user       // If they changed, try again
        add     x0, x0, x1                      // Construct mach_absolute_time
-       ret     
+       ret
+
+
 
        .text
        .align 2
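In C terms, the commpage test both variants now perform is roughly the following sketch (USER_TIMEBASE_NONE and _COMM_PAGE_USER_TIMEBASE come from <machine/cpu_capabilities.h>):

    uint8_t ut = *(volatile uint8_t *)(uintptr_t)_COMM_PAGE_USER_TIMEBASE;
    if (ut == USER_TIMEBASE_NONE) {
        /* userspace counter reads unsupported; trap into the kernel */
    }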
index 353ef0d87d9cef4f548c937fae0b93c47eb1cd75..c128ac1b7fd4bf134edd1f9ffaf4d0d068d54c8b 100644 (file)
@@ -59,11 +59,11 @@ kern_return_t
 _mach_continuous_hwclock(uint64_t *cont_time __unused)
 {
 #if defined(__arm64__)
+#define ISB_SY          0xf
        uint8_t cont_hwclock = *((uint8_t*)_COMM_PAGE_CONT_HWCLOCK);
-       uint64_t timebase;
        if (cont_hwclock) {
-               __asm__ volatile ("isb\n" "mrs %0, CNTPCT_EL0" : "=r"(timebase));
-               *cont_time = timebase;
+               __builtin_arm_isb(ISB_SY);
+               *cont_time = __builtin_arm_rsr64("CNTPCT_EL0");
                return KERN_SUCCESS;
        }
 #endif
index f44b4b1ebf64b9ec7bf2b333b06dee2e39a18c90..4a8a3f19e327d4a946347a6f6e04ee8fa53bc84c 100644 (file)
@@ -64,7 +64,7 @@ mach_get_times(uint64_t* absolute_time, uint64_t* cont_time, struct timespec *tp
                        if (__gettimeofday_with_mach(&tv, NULL, &tbr) < 0) {
                                return KERN_FAILURE;
                        } else if (tbr == 0) {
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
                                // On an old kernel, likely chroot'ed. (remove next year)
                                tbr = mach_absolute_time();
 #else
index 67bee1546a32f1745d020fbfcdf6573fd6e17e87..513543952e18765f98af1291081698aa42edcb49 100644 (file)
 #include "strings.h"
 
 /* syscall entry point */
-int __persona(uint32_t operation, uint32_t flags, struct kpersona_info *info, uid_t *id, size_t *idlen);
+int __persona(uint32_t operation, uint32_t flags, struct kpersona_info *info, uid_t *id, size_t *idlen, char *path);
 
 int
 kpersona_alloc(struct kpersona_info *info, uid_t *id)
 {
        size_t idlen = 1;
-       return __persona(PERSONA_OP_ALLOC, 0, info, id, &idlen);
+       return __persona(PERSONA_OP_ALLOC, 0, info, id, &idlen, NULL);
+}
+
+int
+kpersona_palloc(struct kpersona_info *info, uid_t *id, char path[MAXPATHLEN])
+{
+       size_t idlen = 1;
+       return __persona(PERSONA_OP_PALLOC, 0, info, id, &idlen, path);
 }
 
 int
 kpersona_dealloc(uid_t id)
 {
        size_t idlen = 1;
-       return __persona(PERSONA_OP_DEALLOC, 0, NULL, &id, &idlen);
+       return __persona(PERSONA_OP_DEALLOC, 0, NULL, &id, &idlen, NULL);
 }
 
 int
@@ -53,7 +60,7 @@ kpersona_get(uid_t *id)
        if (p_id == PERSONA_ID_NONE) {
                int ret = 0;
                size_t idlen = 1;
-               ret = __persona(PERSONA_OP_GET, 0, NULL, &p_id, &idlen);
+               ret = __persona(PERSONA_OP_GET, 0, NULL, &p_id, &idlen, NULL);
                if (ret != 0) {
                        return ret;
                }
@@ -62,11 +69,18 @@ kpersona_get(uid_t *id)
        return 0;
 }
 
+int
+kpersona_getpath(uid_t id, char path[MAXPATHLEN])
+{
+       size_t idlen = 1;
+       return __persona(PERSONA_OP_GETPATH, 0, NULL, &id, &idlen, path);
+}
+
 int
 kpersona_info(uid_t id, struct kpersona_info *info)
 {
        size_t idlen = 1;
-       return __persona(PERSONA_OP_INFO, 0, info, &id, &idlen);
+       return __persona(PERSONA_OP_INFO, 0, info, &id, &idlen, NULL);
 }
 
 int
@@ -74,7 +88,7 @@ kpersona_pidinfo(pid_t pid, struct kpersona_info *info)
 {
        size_t idlen = 1;
        uid_t id = (uid_t)pid;
-       return __persona(PERSONA_OP_PIDINFO, 0, info, &id, &idlen);
+       return __persona(PERSONA_OP_PIDINFO, 0, info, &id, &idlen, NULL);
 }
 
 int
@@ -92,7 +106,26 @@ kpersona_find(const char *name, uid_t uid, uid_t *id, size_t *idlen)
        if (name) {
                strlcpy(kinfo.persona_name, name, sizeof(kinfo.persona_name));
        }
-       ret = __persona(PERSONA_OP_FIND, 0, &kinfo, id, idlen);
+       ret = __persona(PERSONA_OP_FIND, 0, &kinfo, id, idlen, NULL);
+       if (ret < 0) {
+               return ret;
+       }
+       return (int)(*idlen);
+}
+
+int
+kpersona_find_by_type(int persona_type, uid_t *id, size_t *idlen)
+{
+       int ret;
+       struct kpersona_info kinfo;
+       kinfo.persona_info_version = PERSONA_INFO_V1;
+       kinfo.persona_type = persona_type;
+       kinfo.persona_id = -1;
+       kinfo.persona_gid = 0;
+       kinfo.persona_ngroups = 0;
+       kinfo.persona_groups[0] = 0;
+       kinfo.persona_name[0] = 0;
+       ret = __persona(PERSONA_OP_FIND_BY_TYPE, 0, &kinfo, id, idlen, NULL);
        if (ret < 0) {
                return ret;
        }
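A hedged sketch of the new lookup; the PERSONA_* type constants live in the private <sys/persona.h>, and the concrete type value below is illustrative only:

    #include <sys/persona.h>

    uid_t ids[16];
    size_t count = sizeof(ids) / sizeof(ids[0]);
    int found = kpersona_find_by_type(1 /* some PERSONA_* type */, ids, &count);
    if (found > 0) {
        /* ids[0..found-1] hold the matching persona IDs */
    }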
diff --git a/libsyscall/wrappers/proc.c b/libsyscall/wrappers/proc.c
new file mode 100644 (file)
index 0000000..ce95bce
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <os/availability.h>
+#include <TargetConditionals.h>
+#include <os/proc.h>
+
+#if !TARGET_OS_OSX
+extern uint64_t __memorystatus_available_memory(void);
+
+size_t
+os_proc_available_memory(void)
+{
+       return (size_t)__memorystatus_available_memory();
+}
+#endif
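The wrapper gives memory-limited (non-macOS) processes a supported way to size work against their jetsam limit; a sketch of a caller (the 64 MiB threshold is illustrative):

    #include <os/proc.h>

    size_t avail = os_proc_available_memory();
    if (avail != 0 && avail < (64u << 20)) {
        /* close to the memory limit: shed caches before allocating more */
    }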
index 1aff8f18221df9b2bffdbae6f272b758230530cf..ac235cbe4da1594e0336e9c91569b90ce9bb8591 100644 (file)
@@ -25,7 +25,7 @@
 #include <unistd.h>
 #include <TargetConditionals.h>
 
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 /*
  * system call stubs are no longer generated for these from
  * syscalls.master. Instead, provide simple stubs here.
@@ -45,4 +45,4 @@ setquota(void)
 {
        return kill(getpid(), SIGSYS);
 }
-#endif /* !TARGET_OS_EMBEDDED */
+#endif /* !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
index 4aee6e02d48ac92297252f92a20796bde5e56d25..7d7762110d027009573092685f196fd7f59cce43 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
diff --git a/libsyscall/wrappers/skywalk/os_channel_event.c b/libsyscall/wrappers/skywalk/os_channel_event.c
new file mode 100644 (file)
index 0000000..2d6fde5
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
index 819863128241b7233fb8f26c0c4c65fe6e6780e6..ae5585b048ed24a32e279881c5aea5959995009f 100644 (file)
@@ -37,6 +37,7 @@
 #include <string.h>
 #include <strings.h>
 #include <mach/port.h>
+#include <mach/mach_param.h> /* for TASK_PORT_REGISTER_MAX */
 #include <mach/exception_types.h>
 #include <mach/coalition.h> /* for COALITION_TYPE_MAX */
 #include <sys/kern_memorystatus.h>
@@ -137,6 +138,8 @@ posix_spawnattr_init(posix_spawnattr_t *attr)
 
                (*psattrp)->psa_persona_info = NULL;
 
+               (*psattrp)->psa_posix_cred_info = NULL;
+
                /*
                 * old coalition field
                 * For backwards compatibility reasons, we set this to 1
@@ -183,6 +186,8 @@ posix_spawnattr_init(posix_spawnattr_t *attr)
 static int posix_spawn_destroyportactions_np(posix_spawnattr_t *);
 static int posix_spawn_destroycoalition_info_np(posix_spawnattr_t *);
 static int posix_spawn_destroypersona_info_np(posix_spawnattr_t *);
+static int posix_spawn_destroyposix_cred_info_np(posix_spawnattr_t *);
+static int posix_spawn_destroymacpolicy_info_np(posix_spawnattr_t *);
 
 int
 posix_spawnattr_destroy(posix_spawnattr_t *attr)
@@ -197,6 +202,8 @@ posix_spawnattr_destroy(posix_spawnattr_t *attr)
        posix_spawn_destroyportactions_np(attr);
        posix_spawn_destroycoalition_info_np(attr);
        posix_spawn_destroypersona_info_np(attr);
+       posix_spawn_destroyposix_cred_info_np(attr);
+       posix_spawn_destroymacpolicy_info_np(attr);
 
        free(psattr);
        *attr = NULL;
@@ -841,6 +848,31 @@ posix_spawn_destroypersona_info_np(posix_spawnattr_t *attr)
        return 0;
 }
 
+/*
+ * posix_spawn_destroyposix_cred_info_np
+ * Description: clean up posix_cred_info struct in posix_spawnattr_t attr
+ */
+static int
+posix_spawn_destroyposix_cred_info_np(posix_spawnattr_t *attr)
+{
+       _posix_spawnattr_t psattr;
+       struct _posix_spawn_posix_cred_info *pspci;
+
+       if (attr == NULL || *attr == NULL) {
+               return EINVAL;
+       }
+
+       psattr = *(_posix_spawnattr_t *)attr;
+       pspci = psattr->psa_posix_cred_info;
+       if (pspci == NULL) {
+               return EINVAL;
+       }
+
+       psattr->psa_posix_cred_info = NULL;
+       free(pspci);
+       return 0;
+}
+
 /*
  * posix_spawn_appendportaction_np
  * Description: append a port action, grow the array if necessary
@@ -1276,7 +1308,69 @@ posix_spawn_file_actions_adddup2(posix_spawn_file_actions_t *file_actions,
 
        psfileact->psfaa_type = PSFA_DUP2;
        psfileact->psfaa_filedes = filedes;
-       psfileact->psfaa_openargs.psfao_oflag = newfiledes;
+       psfileact->psfaa_dup2args.psfad_newfiledes = newfiledes;
+
+       return 0;
+}
+
+/*
+ * posix_spawn_file_actions_add_fileportdup2_np
+ *
+ * Description:        Add a dup2 action to the object referenced by 'file_actions'
+ *             that will attempt to dup2 the file referenced by 'fileport'
+ *             to the descriptor 'newfiledes' in the spawned process.
+ *
+ * Parameters: file_actions            File action object to augment
+ *             fileport                fileport to dup2
+ *             newfiledes              fd to dup2 it to
+ *
+ * Returns:    0                       Success
+ *             EBADF                   fileport isn't a valid port, or the
+ *                                     value specified by newfiledes is
+ *                                     negative or greater than or equal to
+ *                                     {OPEN_MAX}.
+ *             ENOMEM                  Insufficient memory exists to add to
+ *                                     the spawn file actions object.
+ *
+ * NOTIMP:     Allowed failures (checking NOT required):
+ *             EINVAL  The value specified by file_actions is invalid.
+ */
+int
+posix_spawn_file_actions_add_fileportdup2_np(
+       posix_spawn_file_actions_t *file_actions,
+       mach_port_t fileport, int newfiledes)
+{
+       _posix_spawn_file_actions_t *psactsp;
+       _psfa_action_t *psfileact;
+
+       if (file_actions == NULL || *file_actions == NULL) {
+               return EINVAL;
+       }
+
+       psactsp = (_posix_spawn_file_actions_t *)file_actions;
+       /* Range check; required by POSIX */
+       if (!MACH_PORT_VALID(fileport) ||
+           newfiledes < 0 || newfiledes >= OPEN_MAX) {
+               return EBADF;
+       }
+
+       /* If we do not have enough slots, grow the structure */
+       if ((*psactsp)->psfa_act_count == (*psactsp)->psfa_act_alloc) {
+               /* need to grow file actions structure */
+               if (_posix_spawn_file_actions_grow(psactsp)) {
+                       return ENOMEM;
+               }
+       }
+
+       /*
+        * Allocate next available slot and fill it out
+        */
+       psfileact = &(*psactsp)->psfa_act_acts[(*psactsp)->psfa_act_count++];
+
+       psfileact->psfaa_type = PSFA_FILEPORT_DUP2;
+       psfileact->psfaa_fileport = fileport;
+       psfileact->psfaa_dup2args.psfad_newfiledes = newfiledes;
 
        return 0;
 }
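A sketch of the intended flow, pairing the new action with a fileport wrapped around an existing descriptor; fileport_makeport() is the private SPI from <sys/fileport.h>, and error handling is elided:

    #include <spawn.h>
    #include <unistd.h>
    #include <sys/fileport.h>

    mach_port_t fp = MACH_PORT_NULL;
    if (fileport_makeport(fd, &fp) == 0) {
        /* the spawned child sees the fileport's file as its stdout */
        posix_spawn_file_actions_add_fileportdup2_np(&actions, fp, STDOUT_FILENO);
    }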
@@ -1351,6 +1445,117 @@ posix_spawn_file_actions_addinherit_np(posix_spawn_file_actions_t *file_actions,
        return 0;
 }
 
+
+/*
+ * posix_spawn_file_actions_addchdir_np
+ *
+ * Description:        Add a chdir action to the object referenced by 'file_actions'
+ *             that will attempt to change the current working directory
+ *             to the one referenced by 'path' in the spawned process.
+ *
+ * Parameters: file_actions            File action object to augment
+ *             path                    path of the desired working directory
+ *
+ * Returns:    0                       Success
+ *             ENOMEM                  Insufficient memory exists to add to
+ *                                     the spawn file actions object.
+ *             ENAMETOOLONG    The supplied path exceeded PATH_MAX.
+ *
+ * NOTIMP:     Allowed failures (checking NOT required):
+ *             EINVAL  The value specified by file_actions is invalid.
+ */
+int
+posix_spawn_file_actions_addchdir_np(
+       posix_spawn_file_actions_t * __restrict file_actions,
+       const char * __restrict path)
+{
+       _posix_spawn_file_actions_t *psactsp;
+       _psfa_action_t *psfileact;
+
+       if (file_actions == NULL || *file_actions == NULL) {
+               return EINVAL;
+       }
+
+       psactsp = (_posix_spawn_file_actions_t *)file_actions;
+
+       /* If we do not have enough slots, grow the structure */
+       if ((*psactsp)->psfa_act_count == (*psactsp)->psfa_act_alloc) {
+               /* need to grow file actions structure */
+               if (_posix_spawn_file_actions_grow(psactsp)) {
+                       return ENOMEM;
+               }
+       }
+
+       /*
+        * Allocate next available slot and fill it out
+        */
+       psfileact = &(*psactsp)->psfa_act_acts[(*psactsp)->psfa_act_count++];
+
+       psfileact->psfaa_type = PSFA_CHDIR;
+       if (strlcpy(psfileact->psfaa_chdirargs.psfac_path, path, PATH_MAX) >= PATH_MAX) {
+               (*psactsp)->psfa_act_count--;
+               return ENAMETOOLONG;
+       }
+
+       return 0;
+}
+
+
+/*
+ * posix_spawn_file_actions_addfchdir_np
+ *
+ * Description:        Add an fchdir action to the object referenced by 'file_actions'
+ *             that will attempt to change the current working directory
+ *             to the one referenced by the descriptor 'filedes' in the spawned process.
+ *
+ * Parameters: file_actions            File action object to augment
+ *             filedes                 fd to chdir to
+ *
+ * Returns:    0                       Success
+ *             EBADF                   The value specified by filedes is negative
+ *                                     or greater than or equal to {OPEN_MAX}.
+ *             ENOMEM                  Insufficient memory exists to add to
+ *                                     the spawn file actions object.
+ *
+ * NOTIMP:     Allowed failures (checking NOT required):
+ *             EINVAL  The value specified by file_actions is invalid.
+ */
+int
+posix_spawn_file_actions_addfchdir_np(posix_spawn_file_actions_t *file_actions,
+    int filedes)
+{
+       _posix_spawn_file_actions_t *psactsp;
+       _psfa_action_t *psfileact;
+
+       if (file_actions == NULL || *file_actions == NULL) {
+               return EINVAL;
+       }
+
+       psactsp = (_posix_spawn_file_actions_t *)file_actions;
+       /* Range check; in spirit of POSIX */
+       if (filedes < 0 || filedes >= OPEN_MAX) {
+               return EBADF;
+       }
+
+       /* If we do not have enough slots, grow the structure */
+       if ((*psactsp)->psfa_act_count == (*psactsp)->psfa_act_alloc) {
+               /* need to grow file actions structure */
+               if (_posix_spawn_file_actions_grow(psactsp)) {
+                       return ENOMEM;
+               }
+       }
+
+       /*
+        * Allocate next available slot and fill it out
+        */
+       psfileact = &(*psactsp)->psfa_act_acts[(*psactsp)->psfa_act_count++];
+
+       psfileact->psfaa_type = PSFA_FCHDIR;
+       psfileact->psfaa_filedes = filedes;
+
+       return 0;
+}
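Together these let a spawner set the child's initial working directory atomically, instead of a racy chdir() in the parent around posix_spawn(); an illustrative use with assumed arguments:

    #include <spawn.h>

    extern char **environ;

    posix_spawn_file_actions_t fa;
    posix_spawn_file_actions_init(&fa);
    posix_spawn_file_actions_addchdir_np(&fa, "/private/tmp");
    /* or posix_spawn_file_actions_addfchdir_np(&fa, dirfd) given an open fd */
    posix_spawn(&pid, "/bin/ls", &fa, NULL, argv, environ);
    posix_spawn_file_actions_destroy(&fa);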
+
 int
 posix_spawnattr_setcpumonitor_default(posix_spawnattr_t * __restrict attr)
 {
@@ -1393,7 +1598,7 @@ posix_spawnattr_getcpumonitor(posix_spawnattr_t * __restrict attr,
        return 0;
 }
 
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 /*
  * posix_spawnattr_setjetsam
  *
@@ -1427,7 +1632,7 @@ posix_spawnattr_setjetsam(posix_spawnattr_t * __restrict attr,
 
        return posix_spawnattr_setjetsam_ext(attr, flags_ext, priority, memlimit, memlimit);
 }
-#endif /* TARGET_OS_EMBEDDED */
+#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 /*
  * posix_spawnattr_setjetsam_ext
@@ -1520,7 +1725,7 @@ posix_spawnattr_set_importancewatch_port_np(posix_spawnattr_t * __restrict attr,
                        .port_type = PSPA_IMP_WATCHPORTS,
                        .new_port = portarray[i],
                };
-               int err = posix_spawn_appendportaction_np(attr, &action);
+               err = posix_spawn_appendportaction_np(attr, &action);
                if (err) {
                        break;
                }
@@ -1528,6 +1733,28 @@ posix_spawnattr_set_importancewatch_port_np(posix_spawnattr_t * __restrict attr,
        return err;
 }
 
+int
+posix_spawnattr_set_registered_ports_np(posix_spawnattr_t * __restrict attr,
+    mach_port_t portarray[], uint32_t count)
+{
+       int err = 0;
+
+       if (count > TASK_PORT_REGISTER_MAX) {
+               return EINVAL;
+       }
+
+       for (uint32_t i = 0; i < count; i++) {
+               _ps_port_action_t action = {
+                       .port_type = PSPA_REGISTERED_PORTS,
+                       .new_port = portarray[i],
+               };
+               err = posix_spawn_appendportaction_np(attr, &action);
+               if (err) {
+                       break;
+               }
+       }
+       return err;
+}
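This is the spawn-time analogue of mach_ports_register(), bounded by TASK_PORT_REGISTER_MAX; a hedged sketch (the port chosen is illustrative):

    #include <mach/mach_init.h>

    mach_port_t regs[] = { bootstrap_port };
    int err = posix_spawnattr_set_registered_ports_np(&attr, regs,
        sizeof(regs) / sizeof(regs[0]));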
 
 
 static
@@ -1621,6 +1848,31 @@ posix_spawnattr_setmacpolicyinfo_np(posix_spawnattr_t * __restrict attr,
        return 0;
 }
 
+/*
+ * posix_spawn_destroymacpolicy_info_np
+ * Description: clean up the macpolicy struct in posix_spawnattr_t attr
+ */
+static int
+posix_spawn_destroymacpolicy_info_np(posix_spawnattr_t *attr)
+{
+       _posix_spawnattr_t psattr;
+       _posix_spawn_mac_policy_extensions_t psmx;
+
+       if (attr == NULL || *attr == NULL) {
+               return EINVAL;
+       }
+
+       psattr = *(_posix_spawnattr_t *)attr;
+       psmx = psattr->psa_mac_extensions;
+       if (psmx == NULL) {
+               return EINVAL;
+       }
+
+       psattr->psa_mac_extensions = NULL;
+       free(psmx);
+       return 0;
+}
+
 int
 posix_spawnattr_setcoalition_np(const posix_spawnattr_t * __restrict attr,
     uint64_t coalitionid, int type, int role)
@@ -1745,6 +1997,7 @@ posix_spawnattr_set_persona_np(const posix_spawnattr_t * __restrict attr, uid_t
                persona->pspi_gid = 0;
                persona->pspi_ngroups = 0;
                persona->pspi_groups[0] = 0;
+               persona->pspi_gmuid = 0;
 
                psattr->psa_persona_info = persona;
        }
@@ -1864,6 +2117,199 @@ posix_spawnattr_set_max_addr_np(const posix_spawnattr_t * __restrict attr, uint6
        return 0;
 }
 
+static struct _posix_spawn_posix_cred_info *
+_posix_spawnattr_get_posix_creds_info(_posix_spawnattr_t psattr)
+{
+       struct _posix_spawn_posix_cred_info *pspci = psattr->psa_posix_cred_info;
+
+       if (pspci == NULL) {
+               pspci = malloc(sizeof(struct _posix_spawn_posix_cred_info));
+               if (pspci != NULL) {
+                       pspci->pspci_flags = 0;
+                       pspci->pspci_uid = 0;
+                       pspci->pspci_gid = 0;
+                       pspci->pspci_ngroups = 0;
+                       pspci->pspci_groups[0] = 0;
+                       pspci->pspci_gmuid = 0;
+                       pspci->pspci_login[0] = '\0';
+                       psattr->psa_posix_cred_info = pspci;
+               }
+       }
+       return pspci;
+}
+
+int
+posix_spawnattr_set_uid_np(const posix_spawnattr_t *attr, uid_t uid)
+{
+       struct _posix_spawn_posix_cred_info *pspci;
+
+       if (attr == NULL || *attr == NULL) {
+               return EINVAL;
+       }
+
+       pspci = _posix_spawnattr_get_posix_creds_info(*(_posix_spawnattr_t *)attr);
+       if (pspci == NULL) {
+               return ENOMEM;
+       }
+
+       pspci->pspci_uid = uid;
+
+       pspci->pspci_flags |= POSIX_SPAWN_POSIX_CRED_UID;
+
+       return 0;
+}
+
+int
+posix_spawnattr_set_gid_np(const posix_spawnattr_t *attr, gid_t gid)
+{
+       struct _posix_spawn_posix_cred_info *pspci;
+
+       if (attr == NULL || *attr == NULL) {
+               return EINVAL;
+       }
+
+       pspci = _posix_spawnattr_get_posix_creds_info(*(_posix_spawnattr_t *)attr);
+       if (pspci == NULL) {
+               return ENOMEM;
+       }
+
+       pspci->pspci_gid = gid;
+
+       pspci->pspci_flags |= POSIX_SPAWN_POSIX_CRED_GID;
+
+       return 0;
+}
+
+int
+posix_spawnattr_set_groups_np(const posix_spawnattr_t *attr,
+    int ngroups, gid_t *gidarray, uid_t gmuid)
+{
+       struct _posix_spawn_posix_cred_info *pspci;
+
+       if (attr == NULL || *attr == NULL) {
+               return EINVAL;
+       }
+
+       if (gidarray == NULL) {
+               return EINVAL;
+       }
+
+       if (ngroups > NGROUPS || ngroups < 0) {
+               return EINVAL;
+       }
+
+       pspci = _posix_spawnattr_get_posix_creds_info(*(_posix_spawnattr_t *)attr);
+       if (pspci == NULL) {
+               return ENOMEM;
+       }
+
+       pspci->pspci_ngroups = ngroups;
+       for (int i = 0; i < ngroups; i++) {
+               pspci->pspci_groups[i] = gidarray[i];
+       }
+
+       pspci->pspci_gmuid = gmuid;
+
+       pspci->pspci_flags |= POSIX_SPAWN_POSIX_CRED_GROUPS;
+
+       return 0;
+}
+
+int
+posix_spawnattr_set_login_np(const posix_spawnattr_t *attr, const char *login)
+{
+       struct _posix_spawn_posix_cred_info *pspci;
+
+       if (attr == NULL || *attr == NULL) {
+               return EINVAL;
+       }
+
+       if (strlen(login) > MAXLOGNAME) {
+               return ERANGE;
+       }
+
+       pspci = _posix_spawnattr_get_posix_creds_info(*(_posix_spawnattr_t *)attr);
+       if (pspci == NULL) {
+               return ENOMEM;
+       }
+
+       strlcpy(pspci->pspci_login, login, sizeof(pspci->pspci_login));
+
+       pspci->pspci_flags |= POSIX_SPAWN_POSIX_CRED_LOGIN;
+
+       return 0;
+}
+
+/*
+ * posix_spawnattr_set_jetsam_ttr_np
+ *
+ * Description: Pass data about the process's recent relaunch behavior after it was jetsammed.
+ *              The recent history is effectively converted into a histogram and the highest
+ *              frequency bucket defines the "type" of the process. The type is passed along
+ *              to the jetsam code as part of psa_jetsam_flags.
+ *
+ * Parameters: count           Number of entries in the ttrs_millis array
+ *              ttrs_millis     Array of raw data for relaunch behavior
+ *
+ * Returns:     0       Success
+ *              EINVAL  Bad attr pointer or empty data array
+ */
+int
+posix_spawnattr_set_jetsam_ttr_np(const posix_spawnattr_t * __restrict attr, uint32_t count, uint32_t *ttrs_millis)
+{
+       _posix_spawnattr_t psattr;
+
+       /*
+        * Define the bucketizing policy used to generate the histogram. These
+        * values are based on data from various Avg. Joanna runs.
+        */
+       static const uint32_t relaunch_buckets_msecs[POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_BUCKETS] = {
+               5000,
+               10000,
+               UINT32_MAX
+       };
+       static const uint32_t relaunch_jetsam_flags[POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_BUCKETS] = {
+               POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH,
+               POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED,
+               POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW
+       };
+
+       /* Make sure the attr pointer is valid */
+       if (attr == NULL || *attr == NULL) {
+               return EINVAL;
+       }
+
+       /* Make sure the count of entries is non-zero */
+       if (count == 0) {
+               return EINVAL;
+       }
+
+       psattr = *(_posix_spawnattr_t *)attr;
+
+       /* Generate a histogram based on the relaunch data while maintaining highest frequency bucket info */
+       int relaunch_histogram[POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_BUCKETS] = {0};
+       int max_frequency = -1;
+       int highest_frequency_bucket = -1;
+
+       for (uint32_t i = 0; i < count; i++) {
+               /* For each data point passed in via launchd, find the bucket it lands in */
+               for (uint32_t bucket = 0; bucket < POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_BUCKETS; bucket++) {
+                       if (ttrs_millis[i] <= relaunch_buckets_msecs[bucket]) {
+                               relaunch_histogram[bucket]++;
+
+                               /* Check if the bucket is the highest frequency bucket now */
+                               if (relaunch_histogram[bucket] > max_frequency) {
+                                       max_frequency = relaunch_histogram[bucket];
+                                       highest_frequency_bucket = bucket;
+                               }
+                               break;
+                       }
+               }
+       }
+       psattr->psa_jetsam_flags |= relaunch_jetsam_flags[highest_frequency_bucket];
+       return 0;
+}
+
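Worked example for the bucketing above: with ttrs_millis = {3000, 9000, 4000, 60000}, two samples fall in the <=5000 ms bucket, one in the <=10000 ms bucket, and one in the catch-all, so the first bucket has the highest frequency and POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH is OR'ed into psa_jetsam_flags.

The posix-cred setters let a privileged spawner drop credentials at exec time rather than fork-then-setuid; a hedged sketch (uid/gid/login values are illustrative, and the SPI presumably requires privilege):

    #include <spawn.h>
    #include <spawn_private.h>

    posix_spawnattr_t attr;
    posix_spawnattr_init(&attr);
    posix_spawnattr_set_uid_np(&attr, 501);
    posix_spawnattr_set_gid_np(&attr, 20);
    posix_spawnattr_set_login_np(&attr, "worker");
    /* posix_spawn(...), then posix_spawnattr_destroy(&attr) as usual */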
 /*
  * posix_spawn
  *
@@ -1955,6 +2401,10 @@ posix_spawn(pid_t * __restrict pid, const char * __restrict path,
                                ad.persona_info_size = sizeof(struct _posix_spawn_persona_info);
                                ad.persona_info = psattr->psa_persona_info;
                        }
+                       if (psattr->psa_posix_cred_info != NULL) {
+                               ad.posix_cred_info_size = sizeof(struct _posix_spawn_posix_cred_info);
+                               ad.posix_cred_info = psattr->psa_posix_cred_info;
+                       }
                }
                if (file_actions != NULL && *file_actions != NULL) {
                        _posix_spawn_file_actions_t psactsp =
index 7fa01896789059877a46d7ec74d521e9ae23f7ea..1b83c9d960fd75ab764c385b10bafdb15d2ab87d 100644 (file)
@@ -57,73 +57,56 @@ __BEGIN_DECLS
  * a dummy argument name is added.
  */
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawn(pid_t * __restrict, const char * __restrict,
     const posix_spawn_file_actions_t *,
     const posix_spawnattr_t * __restrict,
     char *const __argv[__restrict],
-    char *const __envp[__restrict]) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    char *const __envp[__restrict]) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnp(pid_t * __restrict, const char * __restrict,
     const posix_spawn_file_actions_t *,
     const posix_spawnattr_t * __restrict,
     char *const __argv[__restrict],
-    char *const __envp[__restrict]) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    char *const __envp[__restrict]) __API_AVAILABLE(macos(10.5), ios(2.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
-int     posix_spawn_file_actions_addclose(posix_spawn_file_actions_t *, int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+int     posix_spawn_file_actions_addclose(posix_spawn_file_actions_t *, int) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawn_file_actions_adddup2(posix_spawn_file_actions_t *, int,
-    int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    int) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawn_file_actions_addopen(
        posix_spawn_file_actions_t * __restrict, int,
-       const char * __restrict, int, mode_t) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+       const char * __restrict, int, mode_t) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
-int     posix_spawn_file_actions_destroy(posix_spawn_file_actions_t *) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+int     posix_spawn_file_actions_destroy(posix_spawn_file_actions_t *) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
-int     posix_spawn_file_actions_init(posix_spawn_file_actions_t *) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+int     posix_spawn_file_actions_init(posix_spawn_file_actions_t *) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
-int     posix_spawnattr_destroy(posix_spawnattr_t *) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+int     posix_spawnattr_destroy(posix_spawnattr_t *) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_getsigdefault(const posix_spawnattr_t * __restrict,
-    sigset_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    sigset_t * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_getflags(const posix_spawnattr_t * __restrict,
-    short * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    short * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_getpgroup(const posix_spawnattr_t * __restrict,
-    pid_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    pid_t * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_getsigmask(const posix_spawnattr_t * __restrict,
-    sigset_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    sigset_t * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
-int     posix_spawnattr_init(posix_spawnattr_t *) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+int     posix_spawnattr_init(posix_spawnattr_t *) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_setsigdefault(posix_spawnattr_t * __restrict,
-    const sigset_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    const sigset_t * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
-int     posix_spawnattr_setflags(posix_spawnattr_t *, short) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+int     posix_spawnattr_setflags(posix_spawnattr_t *, short) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
-int     posix_spawnattr_setpgroup(posix_spawnattr_t *, pid_t) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+int     posix_spawnattr_setpgroup(posix_spawnattr_t *, pid_t) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_setsigmask(posix_spawnattr_t * __restrict,
-    const sigset_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    const sigset_t * __restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
 #if 0   /* _POSIX_PRIORITY_SCHEDULING [PS] : not supported */
 int     posix_spawnattr_setschedparam(posix_spawnattr_t * __restrict,
@@ -149,30 +132,30 @@ __END_DECLS
 
 __BEGIN_DECLS
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_getbinpref_np(const posix_spawnattr_t * __restrict,
-    size_t, cpu_type_t *__restrict, size_t *__restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    size_t, cpu_type_t *__restrict, size_t *__restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_setauditsessionport_np(posix_spawnattr_t * __restrict,
-    mach_port_t) __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+    mach_port_t) __API_AVAILABLE(macos(10.6), ios(3.2));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_setbinpref_np(posix_spawnattr_t * __restrict,
-    size_t, cpu_type_t *__restrict, size_t *__restrict) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    size_t, cpu_type_t *__restrict, size_t *__restrict) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_setexceptionports_np(posix_spawnattr_t * __restrict,
     exception_mask_t, mach_port_t,
-    exception_behavior_t, thread_state_flavor_t) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    exception_behavior_t, thread_state_flavor_t) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawnattr_setspecialport_np(posix_spawnattr_t * __restrict,
-    mach_port_t, int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0);
+    mach_port_t, int) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
 
-__WATCHOS_PROHIBITED  __TVOS_PROHIBITED
 int     posix_spawn_file_actions_addinherit_np(posix_spawn_file_actions_t *,
-    int) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3);
+    int) __API_AVAILABLE(macos(10.7), ios(4.3)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0));
+
+int     posix_spawn_file_actions_addchdir_np(posix_spawn_file_actions_t *,
+    const char * __restrict) __API_AVAILABLE(macos(10.15)) __SPI_AVAILABLE(ios(13.0), tvos(13.0), watchos(6.0), bridgeos(4.0));
+
+int     posix_spawn_file_actions_addfchdir_np(posix_spawn_file_actions_t *,
+    int) __API_AVAILABLE(macos(10.15)) __SPI_AVAILABLE(ios(13.0), tvos(13.0), watchos(6.0), bridgeos(4.0));
 
 __END_DECLS
 
index 41878e7467c5626399d730fb46b6fcefdc0c65b6..aa2897d3313aabb46d4282aa260e21fd08d1e9c7 100644 (file)
 #include <Availability.h>
 #include <TargetConditionals.h>
 
-int     posix_spawnattr_getpcontrol_np(const posix_spawnattr_t * __restrict, int * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
-int     posix_spawnattr_setpcontrol_np(posix_spawnattr_t *, const int) __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+int     posix_spawnattr_getpcontrol_np(const posix_spawnattr_t * __restrict, int * __restrict) __API_AVAILABLE(macos(10.6), ios(3.2));
+int     posix_spawnattr_setpcontrol_np(posix_spawnattr_t *, const int) __API_AVAILABLE(macos(10.6), ios(3.2));
 
-int     posix_spawnattr_getprocesstype_np(const posix_spawnattr_t * __restrict, int * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0);
-int     posix_spawnattr_setprocesstype_np(posix_spawnattr_t *, const int) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0);
+int     posix_spawnattr_getprocesstype_np(const posix_spawnattr_t * __restrict, int * __restrict) __API_AVAILABLE(macos(10.8), ios(6.0));
+int     posix_spawnattr_setprocesstype_np(posix_spawnattr_t *, const int) __API_AVAILABLE(macos(10.8), ios(6.0));
 
-int     posix_spawnattr_setcpumonitor(posix_spawnattr_t * __restrict, uint64_t, uint64_t) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0);
-int     posix_spawnattr_getcpumonitor(posix_spawnattr_t * __restrict, uint64_t *, uint64_t *) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_6_0);
-int     posix_spawnattr_setcpumonitor_default(posix_spawnattr_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_6_0);
+int     posix_spawnattr_setcpumonitor(posix_spawnattr_t * __restrict, uint64_t, uint64_t) __API_AVAILABLE(macos(10.8), ios(6.0));
+int     posix_spawnattr_getcpumonitor(posix_spawnattr_t * __restrict, uint64_t *, uint64_t *) __API_AVAILABLE(macos(10.8), ios(6.0));
+int     posix_spawnattr_setcpumonitor_default(posix_spawnattr_t * __restrict) __API_AVAILABLE(macos(10.9), ios(6.0));
 
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 int     posix_spawnattr_setjetsam(posix_spawnattr_t * __restrict attr,
-    short flags, int priority, int memlimit) __OSX_AVAILABLE_STARTING(__MAC_NA, __IPHONE_5_0);
-#endif /* TARGET_OS_EMBEDDED */
+    short flags, int priority, int memlimit) __API_UNAVAILABLE(macos) __API_AVAILABLE(ios(5.0));
+#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 int     posix_spawnattr_setjetsam_ext(posix_spawnattr_t * __restrict attr,
-    short flags, int priority, int memlimit_active, int memlimit_inactive) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
+    short flags, int priority, int memlimit_active, int memlimit_inactive) __API_AVAILABLE(macos(10.11), ios(9.0));
+
+// time-to-relaunch after jetsam, set by launchd
+int     posix_spawnattr_set_jetsam_ttr_np(const posix_spawnattr_t * __restrict attr, uint32_t count, uint32_t *ttrs_millis) __OSX_AVAILABLE_STARTING(__MAC_10_15, __IPHONE_13_0);
 
 int     posix_spawnattr_set_threadlimit_ext(posix_spawnattr_t * __restrict attr,
-    int thread_limit);
+    int thread_limit)  __API_AVAILABLE(macos(10.14), ios(12.0), tvos(12.0), watchos(5.0));
 
 #define POSIX_SPAWN_IMPORTANCE_PORT_COUNT 32
 int     posix_spawnattr_set_importancewatch_port_np(posix_spawnattr_t * __restrict attr,
-    int count, mach_port_t portarray[])  __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_6_0);
+    int count, mach_port_t portarray[])  __API_AVAILABLE(macos(10.9), ios(6.0));
+
+int     posix_spawnattr_set_registered_ports_np(posix_spawnattr_t * __restrict attr, mach_port_t portarray[], uint32_t count) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0));
 
 #define POSIX_SPAWN_MACPOLICYINFO_WITHSIZE 1
-int     posix_spawnattr_getmacpolicyinfo_np(const posix_spawnattr_t * __restrict, const char *, void **, size_t *) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0);
-int     posix_spawnattr_setmacpolicyinfo_np(posix_spawnattr_t * __restrict, const char *, void *, size_t) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0);
+int     posix_spawnattr_getmacpolicyinfo_np(const posix_spawnattr_t * __restrict, const char *, void **, size_t *) __API_AVAILABLE(macos(10.9), ios(7.0));
+int     posix_spawnattr_setmacpolicyinfo_np(posix_spawnattr_t * __restrict, const char *, void *, size_t) __API_AVAILABLE(macos(10.9), ios(7.0));
+
+int     posix_spawnattr_setcoalition_np(const posix_spawnattr_t * __restrict, uint64_t, int, int) __API_AVAILABLE(macos(10.10), ios(8.0));
+
+int     posix_spawnattr_set_qos_clamp_np(const posix_spawnattr_t * __restrict, uint64_t) __API_AVAILABLE(macos(10.10), ios(8.0));
+int     posix_spawnattr_get_qos_clamp_np(const posix_spawnattr_t * __restrict, uint64_t * __restrict) __API_AVAILABLE(macos(10.10), ios(8.0));
+
+int     posix_spawnattr_set_darwin_role_np(const posix_spawnattr_t * __restrict, uint64_t) __API_AVAILABLE(macos(10.11), ios(9.0));
+int     posix_spawnattr_get_darwin_role_np(const posix_spawnattr_t * __restrict, uint64_t * __restrict) __API_AVAILABLE(macos(10.11), ios(9.0));
 
-int     posix_spawnattr_setcoalition_np(const posix_spawnattr_t * __restrict, uint64_t, int, int) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0);
+int     posix_spawnattr_set_persona_np(const posix_spawnattr_t * __restrict, uid_t, uint32_t) __API_AVAILABLE(macos(10.11), ios(9.0));
+int     posix_spawnattr_set_persona_uid_np(const posix_spawnattr_t * __restrict, uid_t) __API_AVAILABLE(macos(10.11), ios(9.0));
+int     posix_spawnattr_set_persona_gid_np(const posix_spawnattr_t * __restrict, gid_t) __API_AVAILABLE(macos(10.11), ios(9.0));
+int     posix_spawnattr_set_persona_groups_np(const posix_spawnattr_t * __restrict, int, gid_t * __restrict, uid_t) __API_AVAILABLE(macos(10.11), ios(9.0));
 
-int     posix_spawnattr_set_qos_clamp_np(const posix_spawnattr_t * __restrict, uint64_t) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0);
-int     posix_spawnattr_get_qos_clamp_np(const posix_spawnattr_t * __restrict, uint64_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0);
+int     posix_spawnattr_set_max_addr_np(const posix_spawnattr_t * __restrict attr, uint64_t max_addr) __API_AVAILABLE(macos(10.14), ios(12.0), tvos(12.0), watchos(5.0));
 
-int     posix_spawnattr_set_darwin_role_np(const posix_spawnattr_t * __restrict, uint64_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
-int     posix_spawnattr_get_darwin_role_np(const posix_spawnattr_t * __restrict, uint64_t * __restrict) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
+int     posix_spawnattr_set_uid_np(const posix_spawnattr_t * __restrict, uid_t) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0));
+int     posix_spawnattr_set_gid_np(const posix_spawnattr_t * __restrict, gid_t) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0));
+int     posix_spawnattr_set_groups_np(const posix_spawnattr_t * __restrict, int, gid_t * __restrict, uid_t) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0));
+int     posix_spawnattr_set_login_np(const posix_spawnattr_t * __restrict, const char * __restrict) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0));
 
-int     posix_spawnattr_set_persona_np(const posix_spawnattr_t * __restrict, uid_t, uint32_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
-int     posix_spawnattr_set_persona_uid_np(const posix_spawnattr_t * __restrict, uid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
-int     posix_spawnattr_set_persona_gid_np(const posix_spawnattr_t * __restrict, gid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
-int     posix_spawnattr_set_persona_groups_np(const posix_spawnattr_t * __restrict, int, gid_t *, uid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
-int     posix_spawnattr_set_max_addr_np(const posix_spawnattr_t * __restrict attr, uint64_t max_addr) __OSX_AVAILABLE_STARTING(__MAC_10_14, __IPHONE_12_0);
+int     posix_spawn_file_actions_add_fileportdup2_np(posix_spawn_file_actions_t * __restrict, mach_port_t, int) __API_AVAILABLE(macos(10.15), ios(13.0), tvos(13.0), watchos(6.0));
 
 #endif /* !defined _SPAWN_PRIVATE_H_*/
index 52082bb687d602d59ee5c06770b32f91820adcbf..dd7719b471d1ebd4f5e8911d2c6ba0275e255311 100644 (file)
@@ -37,7 +37,7 @@ void __abort_with_payload(uint32_t reason_namespace, uint64_t reason_code,
 
 static void abort_with_payload_wrapper_internal(uint32_t reason_namespace, uint64_t reason_code,
     void *payload, uint32_t payload_size, const char *reason_string,
-    uint64_t reason_flags) __attribute__((noreturn));
+    uint64_t reason_flags) __attribute__((noreturn, cold));
 
 /* System call wrappers */
 int
index f0c269132686a403f3f85975b176aeb00823def4..f278f1dba0cb70fb50f2c0fa96bdbeebef1ea207 100755 (executable)
@@ -63,7 +63,9 @@ my @CFLAGS = (
        "-x assembler-with-cpp",
        "-c",
        "-isysroot", $ENV{'SDKROOT'} || "/",
-       "-I".$ENV{"SDKROOT"}."/System/Library/Frameworks/System.framework/PrivateHeaders",
+       "-I".$ENV{"SDKROOT"}."/".$ENV{"SDK_INSTALL_HEADERS_ROOT"}."/usr/include",
+       "-I".$ENV{"SDKROOT"}."/".$ENV{"SDK_INSTALL_HEADERS_ROOT"}."/usr/local/include",
+       "-I".$ENV{"SDKROOT"}."/".$ENV{"SDK_INSTALL_HEADERS_ROOT"}."/System/Library/Frameworks/System.framework/PrivateHeaders",
 );
 
 chomp(my $LIBTOOL = `xcrun -sdk "$ENV{'SDKROOT'}" -find libtool`);
index a4e17d689ec971d5ec83e097c8914f35fed9b8c6..6bf15db9080f1038c36988200f80da1b93169b8b 100755 (executable)
@@ -150,8 +150,10 @@ sub usage {
 # Read the syscall.master file and collect the system call names and number
 # of arguments.  It looks for the NO_SYSCALL_STUB qualifier following the
 # prototype to determine if no automatic stub should be created by Libsystem.
-# System call name that are already prefixed with double-underbar are set as
-# if the NO_SYSCALL_STUB qualifier were specified (whether it is or not).
+#
+# The `sys_` prefix is stripped from syscall names, and is only kept for
+# the kernel symbol in order to avoid namespace clashes and identify
+# syscalls more easily.
 #
 # For the #if lines in syscall.master, all macros are assumed to be defined,
 # except COMPAT_GETFSSTAT (assumed undefined).
@@ -186,6 +188,7 @@ sub readMaster {
         my $no_syscall_stub = /\)\s*NO_SYSCALL_STUB\s*;/;
         my($name, $args) = /\s(\S+)\s*\(([^)]*)\)/;
         next if $name =~ /e?nosys/;
+        $name =~ s/^sys_//;
         $args =~ s/^\s+//;
         $args =~ s/\s+$//;
         my $argbytes = 0;
@@ -330,13 +333,13 @@ sub writeStubForSymbol {
         $arch =~ s/arm64(.*)/arm64/;
         push(@conditions, "defined(__${arch}__)") unless grep { $_ eq $arch } @{$$symbol{except}};
 
-        if($arch == 'arm64') {
+        if($arch eq "arm64") {
             $has_arm64 = 1 unless grep { $_ eq $arch } @{$$symbol{except}};
         }
     }
 
-       my %is_cancel;
-       for (@Cancelable) { $is_cancel{$_} = 1 };
+    my %is_cancel;
+    for (@Cancelable) { $is_cancel{$_} = 1 };
 
     print $f "#define __SYSCALL_32BIT_ARG_BYTES $$symbol{bytes}\n";
     print $f "#include \"SYS.h\"\n\n";
index 40bbcbf74460e0e4b8317b500bc2402273416acd..0761c11fedd85d90870bde68d9b84921f869bdd0 100755 (executable)
@@ -40,6 +40,11 @@ MIG_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach"
 SERVER_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/servers"
 MACH_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/mach"
 MACH_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach"
+MIG_INTERNAL_HEADER_DST="$BUILT_PRODUCTS_DIR/internal_hdr/include/mach"
+MIG_INCFLAGS="-I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/usr/include -I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/usr/local/include"
+MIG_PRIVATE_DEFS_INCFLAGS="-I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/System/Library/Frameworks/System.framework/PrivateHeaders"
+SRC="$SRCROOT/mach"
+FILTER_MIG="$SRCROOT/xcodescripts/filter_mig.awk"
 
 # from old Libsystem makefiles
 MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 1`
@@ -63,11 +68,6 @@ then
        MACHINE_ARCH="i386"
 fi
 
-SRC="$SRCROOT/mach"
-MIG_INTERNAL_HEADER_DST="$BUILT_PRODUCTS_DIR/internal_hdr/include/mach"
-MIG_PRIVATE_DEFS_INCFLAGS="-I${SDKROOT}/System/Library/Frameworks/System.framework/PrivateHeaders"
-FILTER_MIG="$SRCROOT/xcodescripts/filter_mig.awk"
-
 ASROOT=""
 if [ `whoami` = "root" ]; then
        ASROOT="-o 0"
@@ -83,6 +83,7 @@ MIGS="clock.defs
        mach_host.defs
        mach_port.defs
        mach_voucher.defs
+       memory_entry.defs
        processor.defs
        processor_set.defs
        task.defs
@@ -146,7 +147,7 @@ for hdr in $MACH_PRIVATE_HDRS; do
 done
 
 # special case because we only have one to do here
-$MIG -novouchers -arch $MACHINE_ARCH -header "$SERVER_HEADER_DST/netname.h" $SRC/servers/netname.defs
+$MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$SERVER_HEADER_DST/netname.h" $MIG_INCFLAGS $SRC/servers/netname.defs
 
 # install /usr/include/mach mig headers
 
@@ -155,7 +156,7 @@ mkdir -p $MIG_HEADER_OBJ
 
 for mig in $MIGS $MIGS_DUAL_PUBLIC_PRIVATE; do
        MIG_NAME=`basename $mig .defs`
-       $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_HEADER_OBJ/$MIG_NAME.h" $MIG_DEFINES $SRC/$mig
+       $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_HEADER_OBJ/$MIG_NAME.h" $MIG_DEFINES $MIG_INCFLAGS $SRC/$mig
        for filter in $MIG_FILTERS; do
                $FILTER_MIG $SRC/$filter $MIG_HEADER_OBJ/$MIG_NAME.h > $MIG_HEADER_OBJ/$MIG_NAME.tmp.h
                mv $MIG_HEADER_OBJ/$MIG_NAME.tmp.h $MIG_HEADER_OBJ/$MIG_NAME.h
@@ -167,7 +168,7 @@ mkdir -p $MIG_PRIVATE_HEADER_DST
 
 for mig in $MIGS_PRIVATE $MIGS_DUAL_PUBLIC_PRIVATE; do
        MIG_NAME=`basename $mig .defs`
-       $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_PRIVATE_DEFS_INCFLAGS $SRC/$mig
+       $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_INCFLAGS $MIG_PRIVATE_DEFS_INCFLAGS $SRC/$mig
        if [ ! -e "$MIG_HEADER_DST/$MIG_NAME.h" ]; then
                echo "#error $MIG_NAME.h unsupported." > "$MIG_HEADER_DST/$MIG_NAME.h"
        fi
@@ -182,6 +183,6 @@ mkdir -p $MIG_INTERNAL_HEADER_DST
  
 for mig in $MIGS_INTERNAL; do
        MIG_NAME=`basename $mig .defs`
-       $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_INTERNAL_HEADER_DST/${MIG_NAME}_internal.h" $SRC/$mig
+       $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_INTERNAL_HEADER_DST/${MIG_NAME}_internal.h" $MIG_INCFLAGS $SRC/$mig
 done
  
index 3f6f71317537500a96aef226c4cc38e786588791..0c8420d13b839aeecfe77d222ac1ed3a6e42aeaf 100644 (file)
 #
 # Commands for the build environment
 #
+
 ##
 # Verbosity
 ##
+
 ifeq ($(RC_XBS),YES)
-VERBOSE = YES
-else
-VERBOSE = NO
-endif
-ifeq ($(VERBOSE),YES)
-_v =
-_vstdout =
+       VERBOSE = YES
 else
-_v = @
-_vstdout = > /dev/null
+       VERBOSE = NO
 endif
 
-VERBOSE_GENERATED_MAKE_FRAGMENTS = NO
+ECHO = echo
+
+LOG = echo
+makelog = $(info $1)
+ERR = $(ECHO) > /dev/stderr
+
+QUIET ?= 0
+ifneq ($(QUIET),0)
+       LOG = :
+       makelog =
+       ifeq ($(VERBOSE),YES)
+               override VERBOSE = NO
+       endif
+endif
 
 ifeq ($(VERBOSE),YES)
+       _v =
+       _vstdout =
        XCRUN = /usr/bin/xcrun -verbose
 else
+       _v = @
+       _vstdout = > /dev/null
        XCRUN = /usr/bin/xcrun
 endif
 
+VERBOSE_GENERATED_MAKE_FRAGMENTS = NO
+
 SDKROOT ?= macosx
 HOST_SDKROOT ?= macosx
 
@@ -66,6 +80,15 @@ ifeq ($(PLATFORM),)
        endif
 endif
 
+ifeq ($(PLATFORM),MacOSX)
+       ifeq (DriverKit,$(shell echo $(SDKROOT_RESOLVED) | sed 's,^.*/\([^/1-9]*\)[1-9][^/]*\.sdk$$,\1,'))
+               export PLATFORM := DriverKit
+               export DRIVERKIT ?= 1
+               export DRIVERKITROOT ?= /System/DriverKit
+               export DRIVERKITRUNTIMEROOT = $(DRIVERKITROOT)/Runtime
+       endif
+endif
+
 ifeq ($(SDKVERSION),)
      export SDKVERSION := $(shell $(XCRUN) -sdk $(SDKROOT) -show-sdk-version)
 endif
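The sed probe above derives the platform name from the resolved SDK path by stripping the version suffix. Run outside make (so $$ becomes $), it behaves like this sketch; the SDK path is hypothetical:

    $ echo "/Library/Developer/SDKs/DriverKit19.0.sdk" | \
          sed 's,^.*/\([^/1-9]*\)[1-9][^/]*\.sdk$,\1,'
    DriverKit
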
@@ -87,6 +110,9 @@ endif
 ifeq ($(MIGCC),)
        export MIGCC := $(CC)
 endif
+ifeq ($(IIG),)
+       export IIG := $(shell $(XCRUN) -sdk $(SDKROOT) -find iig)
+endif
 ifeq ($(STRIP),)
        export STRIP := $(shell $(XCRUN) -sdk $(SDKROOT) -find strip)
 endif
@@ -123,7 +149,7 @@ endif
 #
 SUPPORTED_EMBEDDED_PLATFORMS := iPhoneOS iPhoneOSNano tvOS AppleTVOS WatchOS BridgeOS
 SUPPORTED_SIMULATOR_PLATFORMS := iPhoneSimulator iPhoneNanoSimulator tvSimulator AppleTVSimulator WatchSimulator
-SUPPORTED_PLATFORMS := MacOSX $(SUPPORTED_SIMULATOR_PLATFORMS) $(SUPPORTED_EMBEDDED_PLATFORMS)
+SUPPORTED_PLATFORMS := MacOSX DriverKit $(SUPPORTED_SIMULATOR_PLATFORMS) $(SUPPORTED_EMBEDDED_PLATFORMS)
 
 # Platform-specific tools
 ifneq ($(filter $(SUPPORTED_EMBEDDED_PLATFORMS),$(PLATFORM)),)
@@ -170,7 +196,6 @@ TOUCH = /usr/bin/touch
 SLEEP = /bin/sleep
 AWK = /usr/bin/awk
 SED = /usr/bin/sed
-ECHO = /bin/echo
 PLUTIL = /usr/bin/plutil
 
 #
index de10f2053181db760d9a8470df197379cc151387..a1030c34a11571ba70f9052775879b426a1865ea 100644 (file)
@@ -1,6 +1,6 @@
 # -*- mode: makefile;-*-
 #
-# Copyright (C) 1999-2017 Apple Inc. All rights reserved.
+# Copyright (C) 1999-2019 Apple Inc. All rights reserved.
 #
 # MakeInc.def contains global definitions for building,
 # linking, and installing files.
@@ -16,6 +16,7 @@ SUPPORTED_ARCH_CONFIGS := X86_64 X86_64H ARM ARM64
 #
 SUPPORTED_KERNEL_CONFIGS = RELEASE DEVELOPMENT DEBUG PROFILE KASAN
 
+
 #
 # Machine Configuration options
 #
@@ -24,7 +25,7 @@ SUPPORTED_X86_64_MACHINE_CONFIGS = NONE
 SUPPORTED_X86_64H_MACHINE_CONFIGS = NONE
 
 SUPPORTED_ARM_MACHINE_CONFIGS = S7002 T8002 T8004
-SUPPORTED_ARM64_MACHINE_CONFIGS = S5L8960X T7000 T7001 S8000 S8001 T8010 T8011 BCM2837
+SUPPORTED_ARM64_MACHINE_CONFIGS = T7000 T7001 S8000 S8001 T8010 T8011 BCM2837
 
 
 #
@@ -50,7 +51,6 @@ COMPONENT_LIST        = osfmk bsd libkern iokit pexpert libsa security san
 COMPONENT      = $(if $(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(firstword $(subst /, ,$(RELATIVE_SOURCE_PATH))))
 COMPONENT_IMPORT_LIST = $(filter-out $(COMPONENT),$(COMPONENT_LIST))
 
-MACHINE_FLAGS_ARM64_S5L8960X = -DARM64_BOARD_CONFIG_S5L8960X
 MACHINE_FLAGS_ARM64_T7000 = -DARM64_BOARD_CONFIG_T7000
 MACHINE_FLAGS_ARM64_T7001 = -DARM64_BOARD_CONFIG_T7001
 MACHINE_FLAGS_ARM_S7002 = -DARM_BOARD_CONFIG_S7002
@@ -67,21 +67,24 @@ MACHINE_FLAGS_ARM64_BCM2837 = -DARM64_BOARD_CONFIG_BCM2837
 # Deployment target flag
 #
 ifeq ($(PLATFORM),MacOSX)
-    DEPLOYMENT_TARGET_FLAGS = -mmacosx-version-min=$(SDKVERSION)
+    DEPLOYMENT_TARGET_FLAGS = -mmacosx-version-min=$(SDKVERSION) -DXNU_TARGET_OS_OSX
     DEPLOYMENT_LINKER_FLAGS = -Wl,-macosx_version_min,$(SDKVERSION)
+else ifeq ($(PLATFORM),DriverKit)
+    DEPLOYMENT_TARGET_FLAGS = -target x86_64-apple-driverkit$(SDKVERSION) -DXNU_TARGET_OS_OSX
+    DEPLOYMENT_LINKER_FLAGS = -Wl,-target,x86_64-apple-driverkit$(SDKVERSION)
 else ifeq ($(PLATFORM),WatchOS)
     DEPLOYMENT_TARGET_FLAGS = -mwatchos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_WATCH
     DEPLOYMENT_LINKER_FLAGS =
 else ifeq ($(PLATFORM),tvOS)
-    DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION)
+    DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_TV
     DEPLOYMENT_LINKER_FLAGS =
 else ifeq ($(PLATFORM),AppleTVOS)
-    DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION)
+    DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_TV
 else ifeq ($(PLATFORM),BridgeOS)
     DEPLOYMENT_TARGET_FLAGS = -mbridgeos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_BRIDGE
     DEPLOYMENT_LINKER_FLAGS =
 else ifneq ($(filter $(SUPPORTED_EMBEDDED_PLATFORMS),$(PLATFORM)),)
-    DEPLOYMENT_TARGET_FLAGS = -miphoneos-version-min=$(SDKVERSION)
+    DEPLOYMENT_TARGET_FLAGS = -miphoneos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_IOS
     DEPLOYMENT_LINKER_FLAGS = -Wl,-ios_version_min,$(SDKVERSION)
 else ifneq ($(filter $(SUPPORTED_SIMULATOR_PLATFORMS),$(PLATFORM)),)
     DEPLOYMENT_TARGET_FLAGS =
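Net effect of this hunk: every platform branch now also stamps an XNU_TARGET_OS_* macro, and the new DriverKit branch uses a full -target triple rather than a -m*-version-min flag. For a hypothetical SDKVERSION of 19.0, the DriverKit case expands to:

    DEPLOYMENT_TARGET_FLAGS = -target x86_64-apple-driverkit19.0 -DXNU_TARGET_OS_OSX
    DEPLOYMENT_LINKER_FLAGS = -Wl,-target,x86_64-apple-driverkit19.0
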
@@ -123,36 +126,30 @@ WERROR := -Werror
 endif
 
 # Shared C/C++ warning flags
+# NOTE: order matters here.  -Wno-xxx goes before opt-in of ones we want
 WARNFLAGS_STD := \
        -Weverything \
        -Wno-pedantic \
        $(WERROR) \
-       -Wno-assign-enum \
+       -Wno-implicit-int-conversion \
+       -Wno-sign-conversion \
+       -Wno-shorten-64-to-32 \
        -Wno-bad-function-cast \
-       -Wno-c++98-compat \
        -Wno-c++-compat \
+       -Wno-c++98-compat \
        -Wno-conditional-uninitialized \
-       -Wno-conversion \
-       -Wnull-conversion \
-       -Wstring-conversion \
-       -Wliteral-conversion \
-       -Wnon-literal-null-conversion \
-       -Wint-conversion \
-       -Wenum-conversion  \
-       -Wfloat-conversion \
-       -Wconstant-conversion \
-       -Wpointer-bool-conversion \
        -Wno-covered-switch-default \
        -Wno-disabled-macro-expansion \
        -Wno-documentation-unknown-command \
+       -Wno-extra-semi-stmt \
        -Wno-format-non-iso \
        -Wno-format-nonliteral \
-       -Wno-reserved-id-macro \
        -Wno-language-extension-token \
        -Wno-missing-variable-declarations \
        -Wno-packed \
        -Wno-padded \
        -Wno-partial-availability \
+       -Wno-reserved-id-macro \
        -Wno-shift-sign-overflow \
        -Wno-switch-enum \
        -Wno-undef \
@@ -169,6 +166,8 @@ WARNFLAGS_STD := $(WARNFLAGS_STD) \
 CWARNFLAGS_STD = \
        $(WARNFLAGS_STD)
 
+
+
 # Can be overridden in Makefile.template or Makefile.$arch
 export CWARNFLAGS ?= $(CWARNFLAGS_STD)
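The ordering NOTE above matters because clang applies warning flags left to right, so a later group flag silently overrides an earlier opt-in. A sketch (foo.c hypothetical):

    # enum-conversion ends up enabled: the opt-in follows the group disable
    clang -Weverything -Wno-conversion -Wenum-conversion -c foo.c
    # enum-conversion ends up disabled: -Wno-conversion re-disables the whole group
    clang -Weverything -Wenum-conversion -Wno-conversion -c foo.c
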
 
@@ -176,13 +175,16 @@ define add_perfile_cflags
 $(1)_CWARNFLAGS_ADD += $2
 endef
 
+define rm_perfile_cflags
+$(1)_CFLAGS_RM += $2
+endef
+
 CXXWARNFLAGS_STD = \
        $(WARNFLAGS_STD) \
        -Wno-c++98-compat-pedantic \
        -Wno-exit-time-destructors \
        -Wno-global-constructors \
-       -Wno-old-style-cast \
-       -Wno-zero-as-null-pointer-constant
+       -Wno-old-style-cast
 
 # Can be overridden in Makefile.template or Makefile.$arch
 export CXXWARNFLAGS ?= $(CXXWARNFLAGS_STD)
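The new rm_perfile_cflags mirrors the existing add_perfile_cflags: both key off an object file name and feed the per-target $@_CWARNFLAGS_ADD / $@_CFLAGS_RM hooks that C_RULE_1A consumes later in this diff. Hypothetical usage (object names and flags invented for illustration):

    $(eval $(call add_perfile_cflags,vm_map.o,-Wno-cast-align))
    $(eval $(call rm_perfile_cflags,lowglobals.o,-flto))
    # which expands to:
    #   vm_map.o_CWARNFLAGS_ADD += -Wno-cast-align
    #   lowglobals.o_CFLAGS_RM  += -flto
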
@@ -203,6 +205,15 @@ ifndef ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG
 export ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query SELECT DISTINCT KernelMachOArchitecture FROM Targets WHERE KernelPlatform IS \"$(CURRENT_MACHINE_CONFIG_LC)\" LIMIT 1 || echo UNKNOWN )
 endif
 
+#
+# This can have false negatives, and is used to avoid calling CTF when we'll build a static KC
+#
+ifndef WILL_BUILD_STATIC_KC
+export WILL_BUILD_STATIC_KC := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH)        \
+                               -query 'SELECT COUNT(*) != 0 FROM Targets WHERE KernelPlatform IS "$(CURRENT_MACHINE_CONFIG_LC)" \
+                                       AND (KernelMachOArchitecture LIKE "arm64e" OR ProductType LIKE "iphone10,%")')
+endif
+
 BUILD_STATIC_LINK := 1
 
 endif
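The WILL_BUILD_STATIC_KC probe above asks the embedded device map whether the current board is arm64e or an iphone10,* product, i.e. whether the build will feed a static kernel cache; per the comment it may report false negatives. Shape of the lookup (tool name, database, and platform are all hypothetical here):

    $ embedded_device_map -db device_map.db -query \
        'SELECT COUNT(*) != 0 FROM Targets WHERE KernelPlatform IS "t8015" \
         AND (KernelMachOArchitecture LIKE "arm64e" OR ProductType LIKE "iphone10,%")'
    1      # non-zero => WILL_BUILD_STATIC_KC=1, and CTF merging is skipped below
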
@@ -249,8 +260,15 @@ BUILD_DSYM := 1
 # probes from the kernel.
 #
 CFLAGS_GEN = $(DEBUG_CFLAGS) -nostdinc \
-       -fno-builtin -fno-common \
-       -fsigned-bitfields $(OTHER_CFLAGS)
+       -ferror-limit=10000 \
+       -fno-builtin \
+       -fno-common \
+       -ftrivial-auto-var-init=zero \
+       -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang \
+       -fsigned-bitfields \
+       -fmerge-all-constants \
+       -fno-c++-static-destructors \
+       $(OTHER_CFLAGS)
 
 CFLAGS_RELEASE =
 CFLAGS_DEVELOPMENT     =
@@ -264,10 +282,10 @@ CFLAGS_X86_64     = -Dx86_64 -DX86_64 -D__X86_64__ -DLP64 \
 CFLAGS_X86_64H = $(CFLAGS_X86_64)
 
 CFLAGS_ARM     = -Darm -DARM -D__ARM__ -DPAGE_SIZE_FIXED \
-                       -fno-strict-aliasing -D__API__=v4
+                       -momit-leaf-frame-pointer -fno-strict-aliasing -D__API__=v4
 
 CFLAGS_ARM64   = -Darm64 -DARM64 -D__ARM64__ -DLP64 -DPAGE_SIZE_FIXED \
-                       -fno-strict-aliasing -D__API__=v4 -mkernel
+                       -momit-leaf-frame-pointer -fno-strict-aliasing -D__API__=v4 -mkernel
 
 CFLAGS_RELEASEX86_64 = -O2
 CFLAGS_DEVELOPMENTX86_64 = -O2
@@ -302,13 +320,15 @@ CFLAGS_PROFILEARM64 = -O2
 SAN=0
 
 ifeq ($(CURRENT_KERNEL_CONFIG),KASAN)
+# KASan kernel config implicitly enables the KASan instrumentation.
+# Instrumentation for other sanitizers is enabled explicitly at build time.
 KASAN = 1
 endif
 
 ifeq ($(KASAN),1)
 SAN=1
 BUILD_LTO = 0
-KASAN_SHIFT_ARM64=0xdffffff800000000
+KASAN_SHIFT_ARM64=0xe000000000000000
 #
 # To calculate the kasan shift, subtract the lowest KVA to sanitize, shifted right by 3 bits,
 # from the base address of the kasan shadow area, (e.g. solve the following equation:
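The comment above (cut off by the hunk boundary) describes the relation shift = shadow_base - (lowest_KVA >> 3), or equivalently SHADOW(addr) = (addr >> 3) + shift. A worked sketch against the new arm64 value; the addresses are hypothetical:

    # SHADOW(addr) = (addr >> 3) + KASAN_SHIFT_ARM64
    # (0xffffffe000000000 >> 3) + 0xe000000000000000 = 0xfffffffc00000000
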
@@ -329,26 +349,38 @@ endif
 
 ifeq ($(UBSAN),1)
 SAN=1
-UBSAN_CHECKS = signed-integer-overflow shift pointer-overflow  # non-fatal (calls runtime, can return)
+UBSAN_CHECKS = signed-integer-overflow shift pointer-overflow bounds object-size # non-fatal (calls runtime, can return)
+# UBSAN_CHECKS = undefined nullability unsigned-integer-overflow # everything
 UBSAN_CHECKS_FATAL =                                           # fatal (calls runtime, must not return)
 UBSAN_CHECKS_TRAP = vla-bound builtin                          # emit a trap instruction (no runtime support)
-UBSAN_DISABLED = bounds object-size
+UBSAN_DISABLED =
 
-ifneq ($(KASAN),1)
-UBSAN_CHECKS += alignment         # UBSan alignment + KASan code size is too large
-UBSAN_CHECKS_FATAL += unreachable # UBSan unreachable doesn't play nice with ASan (40723397)
+UBSAN_DISABLED += vptr function     # requires unsupported C++ runtime
+ifeq ($(KASAN),1)
+# UBSan alignment + KASan code size is too large
+# UBSan unreachable doesn't play nice with ASan (40723397)
+UBSAN_DISABLED += alignment unreachable
 endif
 
 CFLAGS_GEN += -DUBSAN=1
 CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS) $(UBSAN_CHECKS_FATAL) $(UBSAN_CHECKS_TRAP),-fsanitize=$(x))
 CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS_FATAL),-fno-sanitize-recover=$(x))
 CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS_TRAP),-fsanitize-trap=$(x))
+CFLAGS_GEN += $(foreach x,$(UBSAN_DISABLED),-fno-sanitize=$(x))
+endif
+
+ifeq ($(KSANCOV),1)
+# Enable SanitizerCoverage instrumentation in xnu
+SAN = 1
+KSANCOV_CFLAGS := -fsanitize-coverage=trace-pc-guard
+CFLAGS_GEN += $(KSANCOV_CFLAGS) -DKSANCOV=1
 endif
 
 ifeq ($(SAN),1)
 CFLAGS_GEN += -fsanitize-blacklist=$(OBJROOT)/san/kasan-blacklist-$(CURRENT_ARCH_CONFIG_LC)
 endif
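Taken together, the UBSan hunk opts bounds and object-size in (previously on the disabled list) and funnels every disable through UBSAN_DISABLED. For a non-KASan build the foreach lines expand to roughly this flag set (a sketch; UBSAN_CHECKS_FATAL is empty, so no -fno-sanitize-recover flags appear):

    -fsanitize=signed-integer-overflow -fsanitize=shift -fsanitize=pointer-overflow
    -fsanitize=bounds -fsanitize=object-size -fsanitize=vla-bound -fsanitize=builtin
    -fsanitize-trap=vla-bound -fsanitize-trap=builtin
    -fno-sanitize=vptr -fno-sanitize=function
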
 
+
 CFLAGS = $(CFLAGS_GEN) \
                  $($(addsuffix $(CURRENT_MACHINE_CONFIG),MACHINE_FLAGS_$(CURRENT_ARCH_CONFIG)_)) \
                  $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) \
@@ -365,7 +397,7 @@ CFLAGS      = $(CFLAGS_GEN) \
 
 OTHER_CXXFLAGS =
 
-CXXFLAGS_GEN  = -std=gnu++1z -fapple-kext $(OTHER_CXXFLAGS)
+CXXFLAGS_GEN  = -std=gnu++1z -fsized-deallocation -fapple-kext $(OTHER_CXXFLAGS)
 
 CXXFLAGS      = $(CXXFLAGS_GEN) \
                  $($(addsuffix $(CURRENT_ARCH_CONFIG),CXXFLAGS_)) \
@@ -469,7 +501,7 @@ LDFLAGS_KERNEL_RELEASEX86_64 = \
        -Wl,-sectalign,__HIB,__llvm_prf_names,0x1000 \
        -Wl,-sectalign,__HIB,__llvm_prf_data,0x1000 \
        -Wl,-sectalign,__HIB,__textcoal_nt,0x1000 \
-       -Wl,-rename_section,__DATA,__const,__CONST,__constdata \
+       -Wl,-rename_section,__DATA,__const,__DATA_CONST,__const \
        -Wl,-no_zero_fill_sections \
        $(LDFLAGS_NOSTRIP_FLAG)
 
@@ -478,13 +510,10 @@ LDFLAGS_KERNEL_RELEASEX86_64 += \
        -Wl,-sectalign,__HIB,__cstring,0x1000
 endif
 
-ifeq ($(KASAN),1)
+ifeq ($(KSANCOV),1)
 LDFLAGS_KERNEL_RELEASEX86_64 += \
-       -Wl,-sectalign,__HIB,__asan_globals,0x1000 \
-       -Wl,-sectalign,__HIB,__asan_liveness,0x1000 \
-       -Wl,-sectalign,__HIB,__mod_term_func,0x1000 \
-       -Wl,-rename_section,__HIB,__mod_init_func,__NULL,__mod_init_func \
-       -Wl,-rename_section,__HIB,__eh_frame,__NULL,__eh_frame
+       -Wl,-sectalign,__HIB,__sancov_guards,0x1000 \
+       -Wl,-sectalign,__HIB,__sancov_pcs,0x1000
 endif
 
 # Define KERNEL_BASE_OFFSET so known at compile time:
@@ -493,13 +522,18 @@ CFLAGS_X86_64H += -DKERNEL_BASE_OFFSET=$(KERNEL_BASE_OFFSET)
 
 LDFLAGS_KERNEL_DEBUGX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64)
 LDFLAGS_KERNEL_DEVELOPMENTX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64)
-LDFLAGS_KERNEL_KASANX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64)
+LDFLAGS_KERNEL_KASANX86_64 = $(LDFLAGS_KERNEL_DEVELOPMENTX86_64) \
+       -Wl,-sectalign,__HIB,__asan_globals,0x1000 \
+       -Wl,-sectalign,__HIB,__asan_liveness,0x1000 \
+       -Wl,-sectalign,__HIB,__mod_term_func,0x1000 \
+       -Wl,-rename_section,__HIB,__mod_init_func,__NULL,__mod_init_func \
+       -Wl,-rename_section,__HIB,__eh_frame,__NULL,__eh_frame
 LDFLAGS_KERNEL_PROFILEX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64)
 
 LDFLAGS_KERNEL_RELEASEX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64)
 LDFLAGS_KERNEL_DEBUGX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H)
 LDFLAGS_KERNEL_DEVELOPMENTX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H)
-LDFLAGS_KERNEL_KASANX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H)
+LDFLAGS_KERNEL_KASANX86_64H = $(LDFLAGS_KERNEL_KASANX86_64)
 LDFLAGS_KERNEL_PROFILEX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H)
 
 # We preload ___udivmoddi4 in order to work around an issue with building
@@ -512,7 +546,8 @@ LDFLAGS_KERNEL_GENARM = \
        -Wl,-u,___udivmoddi4
 
 LDFLAGS_KERNEL_RELEASEARM     = \
-       $(LDFLAGS_KERNEL_GENARM)
+       $(LDFLAGS_KERNEL_GENARM) \
+       $(LDFLAGS_KERNEL_STRIP_LTO)
 
 LDFLAGS_KERNEL_EXPORTS_RELEASEARM     = \
        -Wl,-exported_symbols_list,$(TARGET)/all-kpi.exp
@@ -580,7 +615,8 @@ LDFLAGS_KERNEL_SEGARM64 ?= \
 
 LDFLAGS_KERNEL_RELEASEARM64     = \
        $(LDFLAGS_KERNEL_GENARM64) \
-       $(LDFLAGS_KERNEL_SEGARM64)
+       $(LDFLAGS_KERNEL_SEGARM64) \
+       $(LDFLAGS_KERNEL_STRIP_LTO)
 
 LDFLAGS_KERNEL_EXPORTS_RELEASEARM64     = \
        -Wl,-exported_symbols_list,$(TARGET)/all-kpi.exp
@@ -608,7 +644,9 @@ LDFLAGS_KERNEL      = $(LDFLAGS_KERNEL_GEN) \
 
 
 LDFLAGS_KERNEL_EXPORTS   =   \
-                 $($(addsuffix $(CURRENT_ARCH_CONFIG), $(addsuffix $(CURRENT_KERNEL_CONFIG),LDFLAGS_KERNEL_EXPORTS_)))
+                 $($(addsuffix $(CURRENT_ARCH_CONFIG), $(addsuffix $(CURRENT_KERNEL_CONFIG),LDFLAGS_KERNEL_EXPORTS_))) \
+                 -Wl,-alias_list,$(TARGET)/all-alias.exp
+
 
 #
 # Default runtime libraries to be linked with the kernel
@@ -619,21 +657,20 @@ LD_KERNEL_ARCHIVES = $(LDFLAGS_KERNEL_SDK) -lfirehose_kernel
 #
 # DTrace support
 #
+ifndef DO_CTFMERGE
+DO_CTFMERGE := 1
 ifeq ($(CURRENT_KERNEL_CONFIG),RELEASE)
 ifneq ($(filter ARM%,$(CURRENT_ARCH_CONFIG)),)
-DO_CTFCONVERT = 0
-DO_CTFMERGE   = 0
-DO_CTFMACHO   = 0
-else
-DO_CTFCONVERT = $(SUPPORTS_CTFCONVERT)
-DO_CTFMERGE   = 1
-DO_CTFMACHO   = $(NEEDS_CTF_MACHOS)
+DO_CTFMERGE := 0
 endif
-else
-DO_CTFCONVERT = $(SUPPORTS_CTFCONVERT)
-DO_CTFMERGE   = 1
-DO_CTFMACHO   = $(NEEDS_CTF_MACHOS)
 endif
+ifneq ($(CURRENT_KERNEL_CONFIG),KASAN)
+ifeq ($(WILL_BUILD_STATIC_KC),1)
+DO_CTFMERGE := 0
+endif
+endif
+endif # DO_CTFMERGE
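The CTF machinery collapses from three knobs (DO_CTFCONVERT, DO_CTFMERGE, DO_CTFMACHO) to the single DO_CTFMERGE, because conversion now runs once over the finished dSYM instead of per object file; the MakeInc.kernel hunk later in this diff implements it. The resulting pipeline, sketched with hypothetical file names and arch:

    ctfconvert -c -l xnu -u /xnu -o kernel.ctf kernel.dSYM/Contents/Resources/DWARF/kernel
    ctfmerge -l xnu -o kernel -Z kernel.ctfdata kernel.ctf
    ctfinsert kernel -arch arm64 kernel.ctfdata -o kernel    # only when ctfdata is non-empty
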
+
 
 #
 # Default INCFLAGS
@@ -693,28 +730,25 @@ else
 USE_LTO = $(LTO_ENABLED_$(CURRENT_KERNEL_CONFIG))
 endif
 
-SUPPORTS_CTFCONVERT    = 0
 ifeq ($(USE_LTO),1)
 CFLAGS_GEN     += -flto
 CXXFLAGS_GEN   += -flto
-LDFLAGS_KERNEL_GEN     += -Wl,-mllvm,-inline-threshold=100 -Wl,-object_path_lto,$(TARGET)/lto.o
+LDFLAGS_KERNEL_LTO     = -Wl,-mllvm,-inline-threshold=100
+LDFLAGS_KERNEL_GEN     += $(LDFLAGS_KERNEL_LTO) -Wl,-object_path_lto,$(TARGET)/lto.o
 LDFLAGS_NOSTRIP_FLAG = -rdynamic
+LDFLAGS_KERNEL_STRIP_LTO = -Wl,-dead_strip,-no_dead_strip_inits_and_terms
+
 CFLAGS_NOLTO_FLAG = -fno-lto
-NEEDS_CTF_MACHOS       = 1
 else
+LDFLAGS_KERNEL_LTO =
 LDFLAGS_NOSTRIP_FLAG =
+LDFLAGS_KERNEL_STRIP_LTO =
 CFLAGS_NOLTO_FLAG =
-ifneq ($(CTFCONVERT),)
-SUPPORTS_CTFCONVERT    = 1
-endif
-NEEDS_CTF_MACHOS       = 0
 endif
 
 ifeq ($(BUILD_JSON_COMPILATION_DATABASE),1)
 BUILD_DSYM     := 0
-DO_CTFCONVERT  := 0
-DO_CTFMERGE    := 0
-DO_CTFMACHO    := 0
+DO_CTFMERGE    := 0
 KCC            = $(JSONCOMPILATIONDB) $(OBJPATH)/compile_commands.json $(PWD) $< $(CC)
 KC++           = $(JSONCOMPILATIONDB) $(OBJPATH)/compile_commands.json $(PWD) $< $(CXX)
 S_KCC          = $(JSONCOMPILATIONDB) $(OBJPATH)/compile_commands.json $(PWD) $< $(CC)
@@ -740,7 +774,15 @@ EXEC_INSTALL_FLAGS = -c -S -m 0755
 #
 # Header file destinations
 #
-FRAMEDIR = /System/Library/Frameworks
+
+ifeq ($(DRIVERKIT),1)
+    SDKHEADERSROOT=$(DRIVERKITRUNTIMEROOT)
+    # only whitelisted headers install outside of the DriverKit Runtime hierarchy
+    DRIVERKITSDKHEADERSROOT=$(DRIVERKITROOT)
+    DRIVERKITFRAMEDIR = $(DRIVERKITROOT)/System/Library/Frameworks
+endif
+
+FRAMEDIR = $(SDKHEADERSROOT)/System/Library/Frameworks
 
 SINCVERS = B
 SINCFRAME = $(FRAMEDIR)/System.framework
@@ -749,11 +791,17 @@ SPINCDIR = $(SINCFRAME)/Versions/$(SINCVERS)/PrivateHeaders
 SRESDIR = $(SINCFRAME)/Versions/$(SINCVERS)/Resources
 
 ifndef INCDIR
-    INCDIR = /usr/include
+    INCDIR = $(SDKHEADERSROOT)/usr/include
+endif
+ifndef DRIVERKITINCDIR
+    DRIVERKITINCDIR = $(DRIVERKITSDKHEADERSROOT)/usr/include
 endif
 ifndef LCLDIR
     LCLDIR = $(SPINCDIR)
 endif
+ifndef DRIVERKITLCLDIR
+    DRIVERKITLCLDIR = $(DRIVERKITSDKHEADERSROOT)/usr/local/include
+endif
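With DRIVERKIT=1 these assignments compose into two parallel install roots: the full header set under the DriverKit runtime hierarchy, and a whitelisted surface directly under the DriverKit root. Expanded from the defaults above:

    FRAMEDIR        = /System/DriverKit/Runtime/System/Library/Frameworks
    INCDIR          = /System/DriverKit/Runtime/usr/include
    DRIVERKITINCDIR = /System/DriverKit/usr/include
    DRIVERKITLCLDIR = /System/DriverKit/usr/local/include
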
 
 KINCVERS = A
 KINCFRAME = $(FRAMEDIR)/Kernel.framework
@@ -761,17 +809,27 @@ KINCDIR = $(KINCFRAME)/Versions/$(KINCVERS)/Headers
 KPINCDIR = $(KINCFRAME)/Versions/$(KINCVERS)/PrivateHeaders
 KRESDIR = $(KINCFRAME)/Versions/$(KINCVERS)/Resources
 
+DKIT_INCVERS = A
+DKIT_INCFRAME = DriverKit.framework
+DKIT_INCDIR = $(DKIT_INCFRAME)/Versions/$(DKIT_INCVERS)/Headers
+DKIT_PINCDIR = $(DKIT_INCFRAME)/Versions/$(DKIT_INCVERS)/PrivateHeaders
+# DriverKit SDK frameworks use shallow bundle structure
+DRIVERKIT_DKIT_INCDIR = $(DKIT_INCFRAME)/Headers
+DRIVERKIT_DKIT_PINCDIR = $(DKIT_INCFRAME)/PrivateHeaders
+
 XNU_PRIVATE_UNIFDEF = -UMACH_KERNEL_PRIVATE -UBSD_KERNEL_PRIVATE -UIOKIT_KERNEL_PRIVATE -ULIBKERN_KERNEL_PRIVATE -ULIBSA_KERNEL_PRIVATE -UPEXPERT_KERNEL_PRIVATE -UXNU_KERNEL_PRIVATE
 
 
 PLATFORM_UNIFDEF = $(foreach x,$(SUPPORTED_PLATFORMS),$(if $(filter $(PLATFORM),$(x)),-DPLATFORM_$(x) $(foreach token,$(PLATFORM_UNIFDEF_BLACKLIST_TOKENS_$(x)),-U$(token)),-UPLATFORM_$(x)))
 
 
-SPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__
-SINCFRAME_UNIFDEF  = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__
-KPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DKERNEL_PRIVATE -DPRIVATE -DKERNEL -U_OPEN_SOURCE_ -U__OPEN_SOURCE__
-KINCFRAME_UNIFDEF  = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UPRIVATE -DKERNEL -D_OPEN_SOURCE_ -D__OPEN_SOURCE__
-DATA_UNIFDEF       = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -D_OPEN_SOURCE_ -D__OPEN_SOURCE__
+SPINCFRAME_UNIFDEF  = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -UDRIVERKIT -U_OPEN_SOURCE_ -U__OPEN_SOURCE__
+SINCFRAME_UNIFDEF   = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -UDRIVERKIT -D_OPEN_SOURCE_ -D__OPEN_SOURCE__
+DKPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -DDRIVERKIT -U_OPEN_SOURCE_ -U__OPEN_SOURCE__
+DKINCFRAME_UNIFDEF  = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -DDRIVERKIT -D_OPEN_SOURCE_ -D__OPEN_SOURCE__
+KPINCFRAME_UNIFDEF  = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DKERNEL_PRIVATE -DKERNEL -DPRIVATE -UDRIVERKIT -U_OPEN_SOURCE_ -U__OPEN_SOURCE__
+KINCFRAME_UNIFDEF   = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -DKERNEL -UPRIVATE -UDRIVERKIT -D_OPEN_SOURCE_ -D__OPEN_SOURCE__
+DATA_UNIFDEF        = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -D_OPEN_SOURCE_ -D__OPEN_SOURCE__
 
 #
 # Component Header file destinations
index 55de6d307d6280655cb4197f9640c4ad9ff9f222..3637091421de43468d96bd0f076c6a9a69b031d1 100644 (file)
@@ -31,11 +31,22 @@ do_build_setup::
        $(_v)$(CAT) > $(OBJPATH)/compile_commands.json < /dev/null
 endif
 
+ifeq ($(BUILD_STATIC_LINK),1)
+ifeq ($(USE_LTO),1)
+# <rdar://problem/46252406>
+# To run LTO in the xnu project while linking the final result in KCB, without losing debugging info,
+# run ld -r on only the LTO bitcode object files to produce one mach-o for KCB to use, which is added
+# to the static link archive, along with the non-LTO objects (not linked, since ld -r on mach-o objects
+# does not preserve DWARF.)
+PRE_LTO=1
+endif
+endif
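Concretely, the PRE_LTO path described above becomes a two-stage link in the .unstripped rule below: ld -r over only the bitcode objects (so LTO code generation happens there, keeping DWARF via the relocatable output), then a conventional link of that single mach-o together with the untouched non-LTO objects. Roughly, with names taken from the rule below and the arch hypothetical:

    # stage 1: partial link of the bitcode objects; LTO runs here
    $(LD) -arch arm64 -r nonlto.o -filelist ltolink.filelist \
          -Wl,-object_path_lto,justlto.o -o justlto.tmp.o
    # stage 2: final link of the mach-o objects plus the LTO result
    $(LD) $(LDFLAGS_KERNEL) -filelist nonltolink.filelist justlto.o -o kernel.unstripped
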
+
 #
 # Rules for the highly parallel "build" phase, where each build configuration
 # writes into their own $(TARGET) independent of other build configs
 #
-# There are 3 primary build outputs:
+# There are 4 primary build outputs:
 # 1) $(KERNEL_FILE_NAME).unstripped    (raw linked kernel, unstripped)
 # 2) $(KERNEL_FILE_NAME)               (stripped kernel, with optional CTF data)
 # 3) $(KERNEL_FILE_NAME).dSYM          (dSYM)
@@ -44,7 +55,7 @@ endif
 ifeq ($(BUILD_STATIC_LINK),1)
 
 KERNEL_STATIC_LINK_TARGETS = \
-        $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a
+       $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a
 
 KERNEL_STATIC_LINK_DST = \
                        $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a             \
@@ -78,45 +89,82 @@ do_build_kernel_dSYM: $(TARGET)/$(KERNEL_FILE_NAME).dSYM
 .CFLAGS: ALWAYS
        $(_v)$(REPLACECONTENTS) $@ $(KCC) $(CFLAGS) $(INCFLAGS)
 
-$(TARGET)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped
-       @echo "$(ColorH)STRIP$(Color0)      $(ColorLF)$(@F)$(Color0)"
+$(TARGET)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped $(TARGET)/$(KERNEL_FILE_NAME).dSYM
+       $(call makelog,$(ColorH)STRIP$(Color0)         $(ColorLF)$(@F)$(Color0))
        $(_v)$(STRIP) $(STRIP_FLAGS) $< -o $@
        $(_v)$(RM) $@.ctfdata
 ifeq ($(DO_CTFMERGE),1)
-       @echo "$(ColorH)CTFMERGE$(Color0)   $(ColorLF)$(@F)$(Color0)"
-       $(_v)$(FIND) $(TARGET)/ -name \*.ctf -size +0 |         \
-               $(XARGS) $(CTFMERGE) -l xnu -o $@ -Z $@.ctfdata || true
-endif
+       $(call makelog,$(ColorH)CTFCONVERT$(Color0)    $(ColorLF)$(@F)$(Color0))
+       $(_v)$(CTFCONVERT) -c -l xnu -u /xnu -o $@.ctf $(TARGET)/$(KERNEL_FILE_NAME).dSYM/Contents/Resources/DWARF/$(KERNEL_FILE_NAME)
+       $(call makelog,$(ColorH)CTFMERGE$(Color0)      $(ColorLF)$(@F)$(Color0))
+       $(_v)$(CTFMERGE) -l xnu -o $@ -Z $@.ctfdata $@.ctf
        $(_v)if [ -s $@.ctfdata ]; then                                                         \
-               echo "$(ColorH)CTFINSERT$(Color0)  $(ColorLF)$(@F)$(Color0)";                                   \
+               $(LOG) "$(ColorH)CTFINSERT$(Color0)     $(ColorLF)$(@F)$(Color0)";              \
                $(CTFINSERT) $@ $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG))                            \
                             $@.ctfdata -o $@;                                                  \
        fi;
+endif
        $(_v)$(LN) $(call function_convert_build_config_to_objdir,$(CURRENT_BUILD_CONFIG))/$(KERNEL_FILE_NAME) $(OBJROOT)/$(KERNEL_FILE_NAME)
 
 $(TARGET)/$(KERNEL_FILE_NAME).dSYM: $(TARGET)/$(KERNEL_FILE_NAME).unstripped
-       $(_v)echo "$(ColorH)DSYMUTIL$(Color0)   $(ColorLF)$(@F)$(Color0)"
+       $(call makelog,$(ColorH)DSYMUTIL$(Color0)      $(ColorLF)$(@F)$(Color0))
        $(_v)$(DSYMUTIL) $(DSYMUTIL_FLAGS) $< -o $@
        $(_v)$(MV) $@/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME).unstripped $@/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME)
        $(_v)$(TOUCH) $@
 
-$(TARGET)/$(KERNEL_FILE_NAME).unstripped: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).filelist)) lastkerneldataconst.o lastkernelconstructor.o $(SRCROOT)/config/version.c $(SRCROOT)/config/MasterVersion .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST))
+$(TARGET)/$(KERNEL_FILE_NAME).unstripped: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).filelist)) lastkerneldataconst.o lastkernelconstructor.o nonlto.o $(SRCROOT)/config/version.c $(SRCROOT)/config/MasterVersion .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST))
        $(_v)${MAKE} -f $(firstword $(MAKEFILE_LIST)) version.o
-       @echo "$(ColorL)LD$(Color0)  $(ColorLF)$(@F)$(Color0)"
+ifeq ($(PRE_LTO),1)
+       $(call makelog,$(ColorL)LTO$(Color0) $(ColorLF)$(@F)$(Color0))
+       $(_v)rm -f ltolink.filelist
+       $(_v)rm -f nonltolink.filelist
+       $(_v)files="$$($(CAT) $(filter %.filelist,$+)) version.o $(filter %.o,$+)"; \
+       for ofile in $$files; \
+       do \
+               hdr=$$(od -An -N 4 -t x4 $$ofile); \
+               if [ $$hdr == "0b17c0de" ]; \
+                       then \
+                               lto="$$lto$$ofile"$$'\n'; \
+                       else \
+                               nonlto="$$nonlto$$ofile"$$'\n'; \
+                       fi; \
+       done; \
+       printf "$$lto" >ltolink.filelist; \
+       printf "$$nonlto" >nonltolink.filelist
+       $(_v)if [ -s ltolink.filelist ]; \
+       then \
+               $(LD) $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) -r nonlto.o -filelist ltolink.filelist $(LDFLAGS_KERNEL_LTO) -Wl,-object_path_lto,$(TARGET)/justlto.o -o $(TARGET)/justlto.tmp.o && \
+               $(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_EXPORTS) -filelist nonltolink.filelist $(TARGET)/justlto.o $(LDFLAGS_KERNEL_STRIP_LTO) -o $@ $(LD_KERNEL_LIBS) $(LD_KERNEL_ARCHIVES); \
+       else \
+               $(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_EXPORTS) -filelist nonltolink.filelist -o $@ $(LD_KERNEL_LIBS) $(LD_KERNEL_ARCHIVES); \
+       fi
+else
+       $(call makelog,$(ColorL)LD$(Color0)  $(ColorLF)$(@F)$(Color0))
        $(_v)$(CAT) $(filter %.filelist,$+) < /dev/null > link.filelist
        $(_v)$(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_EXPORTS) -filelist link.filelist version.o $(filter %.o,$+) -o $@ $(LD_KERNEL_LIBS) $(LD_KERNEL_ARCHIVES)
+endif
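The od probe in the recipe above classifies each object by its first 32-bit word: 0x0b17c0de is the LLVM bitcode wrapper magic, so matching files are routed to the LTO file list and everything else is treated as mach-o. What the probe reads, sketched on a little-endian host with a hypothetical file:

    $ printf '\xde\xc0\x17\x0b' > bc-header.bin    # wrapper magic, byte order on disk
    $ od -An -N 4 -t x4 bc-header.bin
     0b17c0de
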
+
 
-$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).filelist)) lastkerneldataconst.o lastkernelconstructor.o $(TARGET)/$(KERNEL_FILE_NAME).unstripped .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST))
-       @echo "$(ColorL)LIBTOOL$(Color0)    $(ColorLF)$(@F)$(Color0)"
+$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a:   $(TARGET)/$(KERNEL_FILE_NAME).unstripped .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST))
+       $(call makelog,$(ColorL)LIBTOOL$(Color0)    $(ColorLF)$(@F)$(Color0))
        $(_v)$(MKDIR) $(dir $@)
-       $(_v)$(CAT) $(filter %.filelist,$+) < /dev/null > libtool.filelist
-       $(_v)$(LIBTOOL) -ca -filelist libtool.filelist $(filter %.o,$+) version.o -o $@
+ifeq ($(PRE_LTO),1)
+       $(_v)$(LIBTOOL) -ca $(TARGET)/justlto.o -filelist nonltolink.filelist -o $@
+else
+       $(_v)$(LIBTOOL) -ca -filelist link.filelist version.o lastkerneldataconst.o lastkernelconstructor.o -o $@
+endif
        $(_v)cp $(TARGET)/all-kpi.exp $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp
        $(_v)cp $(TARGET)/all-alias.exp $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp
        $(_v)echo "$(LD_KERNEL_ARCHIVES)" >$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives
        $(_v)echo "$(LDFLAGS_KERNEL) $(LD_KERNEL_LIBS)" >$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments
        $(_v)$(LN) $(call function_convert_build_config_to_objdir,$(CURRENT_BUILD_CONFIG))/$(KERNEL_FILE_NAME).link $(OBJROOT)/$(KERNEL_FILE_NAME).link
 
+nonlto.o: .CFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST))
+nonlto.o:   $(SRCROOT)/libsa/nonlto.c
+       ${C_RULE_0}
+       ${C_RULE_1A}$< $(CFLAGS_NOLTO_FLAG)
+       ${C_RULE_2}
+
 -include version.d
 version.o: .CFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST))
 version.o: $(OBJPATH)/version.c
@@ -155,10 +203,10 @@ lastkernelconstructor.o: $(SRCROOT)/libsa/lastkernelconstructor.c
        ${C_RULE_3}
        ${C_RULE_4}
        $(_v)for last_file in ${LAST_FILES};                            \
-        do                                                     \
+       do                                                      \
                $(SEG_HACK) -s __DATA -n __LAST -o $${last_file}__ $${last_file} || exit 1; \
-                mv $${last_file}__ $${last_file} || exit 1;            \
-        done
+               mv $${last_file}__ $${last_file} || exit 1;             \
+       done
 
 #
 # Install rules. Each build config is classified as "primary" (the first
@@ -209,12 +257,11 @@ do_install_xnu_debug_files:       $(DSTROOT)/$(DEVELOPER_EXTRAS_DIR)/README.DEBUG-kern
 
 $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME) ALWAYS
        $(_v)$(MKDIR) $(dir $@)
+       $(call makelog,$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0))")
        $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then      \
-               echo "$(ColorH)INSTALL$(Color0)    $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0))\""; \
                $(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@;                 \
                cmdstatus=$$?;                                          \
        else                                                            \
-               echo "$(ColorH)INSTALL$(Color0)    $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0))\""; \
                $(LIPO) -create $@ $< -output $@;                       \
                cmdstatus=$$?;                                          \
        fi;                                                             \
@@ -224,27 +271,27 @@ ifeq ($(BUILD_STATIC_LINK),1)
 
 $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a ALWAYS
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(call makelog,$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0))
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
 
 $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments ALWAYS
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(call makelog,$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0))
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
 
 $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives ALWAYS
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(call makelog,$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0))
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
 
 $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp ALWAYS
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(call makelog,$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0))
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
 
 $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp ALWAYS
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(call makelog,$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0))
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
 
 # BUILD_STATIC_LINK
@@ -252,12 +299,11 @@ endif
 
 $(SYMROOT)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped ALWAYS
        $(_v)$(MKDIR) $(dir $@)
+       $(call makelog,$(ColorH)INSTALLSYM$(Color0)    $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then              \
-               echo "$(ColorH)INSTALLSYM$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""; \
                $(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@;                         \
                cmdstatus=$$?;                                                  \
        else                                                                    \
-               echo "$(ColorH)INSTALLSYM$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""; \
                $(LIPO) -create $@ $< -output $@;                               \
                cmdstatus=$$?;                                                  \
        fi;                                                                     \
@@ -269,7 +315,7 @@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dS
 $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros:                                          \
 $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
+       $(call makelog,$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)$(CP) -r $< $(dir $@)
        $(_v)$(TOUCH) $@
 
@@ -278,27 +324,26 @@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dS
 $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME):                        \
 $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME)
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
+       $(call makelog,$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
 
 $(DSTROOT)/$(DEVELOPER_EXTRAS_DIR)/README.DEBUG-kernel.txt: $(SRCROOT)/config/README.DEBUG-kernel.txt
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(call makelog,$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0))
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
 
 $(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMINFODIR)/Info.plist $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMINFODIR)/Info.plist: $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMINFODIR)/Info.plist
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
+       $(call makelog,$(ColorH)INSTALLSYM$(Color0)    $(ColorL)dSYM$(Color0) $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
 
 $(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME) $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME) ALWAYS
        $(_v)$(MKDIR) $(dir $@)
+       $(call makelog,$(ColorH)INSTALLSYM$(Color0)    $(ColorL)dSYM$(Color0) $(ColorF)$(@F).dSYM$(ColorF) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then                      \
-               echo "$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F).dSYM$(ColorF) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"";     \
                $(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@;                                 \
                cmdstatus=$$?;                                                          \
        else                                                                            \
-               echo "$(ColorH)INSTALLSYM$(Color0) $(ColorL)dSYM$(Color0) $(ColorF)$(@F).dSYM$(ColorF) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"";     \
                $(LIPO) -create $@ $< -output $@;                                       \
                cmdstatus=$$?;                                                          \
        fi;                                                                             \
@@ -331,6 +376,7 @@ do_install_machine_specific_KDK_dSYM: \
 # symlink during incremental builds and create a new symlink inside
 # the target of the existing symlink
 do_installhdrs_mi:: $(DSTROOT)/$(KRESDIR)/Info.plist
+ifneq ($(INSTALLHDRS_SKIP_HOST),YES)
        $(_v)$(MKDIR) $(DSTROOT)/$(KINCFRAME)
        $(_v)$(MKDIR) $(DSTROOT)/$(KPINCDIR)
        $(_v)$(MKDIR) $(DSTROOT)/$(KRESDIR)
@@ -345,17 +391,20 @@ do_installhdrs_mi:: $(DSTROOT)/$(KRESDIR)/Info.plist
        $(_v)$(RM) $(DSTROOT)/$(KINCFRAME)/Resources
        $(_v)$(LN) Versions/Current/Resources                   \
                   $(DSTROOT)/$(KINCFRAME)/Resources
+endif
 
 $(DSTROOT)/$(KRESDIR)/Info.plist: $(SOURCE)/EXTERNAL_HEADERS/Info.plist
+ifneq ($(INSTALLHDRS_SKIP_HOST),YES)
        $(_v)$(MKDIR) $(DSTROOT)/$(KRESDIR)
        $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@
        $(_v)$(NEWVERS) $@ $(_vstdout)
 ifeq ($(USE_BINARY_PLIST),1)
        $(_v)$(PLUTIL) -convert binary1 -o $@ $@
 endif
+endif
 
 $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(ALIAS_FILE_NAME): ALWAYS
-       $(_v)echo "$(ColorH)ALIAS$(Color0)      $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_ALIAS_MACHINE_CONFIG_LC)$(Color0))\""
+       $(call makelog,$(ColorH)ALIAS$(Color0)         $(ColorF)$(@F)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_MACHINE_CONFIG_LC)$(Color0) $(ColorLF)$(CURRENT_ALIAS_MACHINE_CONFIG_LC)$(Color0))")
        $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME) $@
 
 install_alias: $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(ALIAS_FILE_NAME)
index d2e05a89fea2cdf3c00ef2f4ed107c7629143411..3961872d55f1e88abdfcf00c2c5c6d0659fd4ccc 100644 (file)
@@ -61,29 +61,29 @@ ifeq ($(LOGCOLORS),y)
         ifeq ($(CURRENT_MACHINE_CONFIG),NONE)
             export _MACHINE := $(subst Mac,,$(PLATFORM))
         endif
-        export CMD_MC := \\033[1m$(shell __A="$(CURRENT_ARCH_CONFIG_LC)"; \
+        export CMD_MC := $(shell __A="$(CURRENT_ARCH_CONFIG_LC)"; \
                                          __As=$$((6-$${\#__A})); \
-                                         printf "%-.6s%*.*s %9.9s" \
+                                         printf "\\033[1m%-.6s%*.*s %9.9s\\033[m" \
                                                 "$${__A}" \
                                                 $${__As} $${__As} " " \
-                                                "$(_MACHINE)")\\033[m
+                                                "$(_MACHINE)")
     endif
     # Turn off colored output
-    Color0=\\033[m
+    Color0:=$(shell printf "\\033[m")
     # Start a host command: bold, underlined pink text
-    ColorH=\\033[1;4;35m
+    ColorH:=$(shell printf "\\033[1;4;35m")
     # Start a compilation-related command: bold, underlined blue text
-    ColorC=[$(CMD_MC)] \\033[1;4;34m
+    ColorC:=$(shell printf "[$(CMD_MC)] \\033[1;4;34m")
     # Start a MIG command: bold, green text on light grey background
-    ColorM=[$(CMD_MC)] \\033[1;32;40m
+    ColorM:=$(shell printf "[$(CMD_MC)] \\033[1;32;40m")
     # Start a linking command: bold, white text on blue background
-    ColorL=[$(CMD_MC)] \\033[1;37;44m
+    ColorL:=$(shell printf "[$(CMD_MC)] \\033[1;37;44m")
     # Start a filename: bold, white text
-    ColorF=\\033[1;37m
+    ColorF:=$(shell printf "\\033[1;37m")
     # Start a linked file name: yellow text on light grey background
-    ColorLF=\\033[1;33;40m
+    ColorLF:=$(shell printf "\\033[1;33;40m")
     # Error strings: underlined bold white text on red background
-    ColorErr=\033[1;4;37;41m
+    ColorErr:=$(shell printf "\033[1;4;37;41m")
 endif
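The color variables move from lazily expanded "\\033" strings, which depended on echo interpreting the escapes at recipe time, to immediately assigned $(shell printf ...) values holding the raw ESC bytes, since the new makelog/$(info) path prints its argument verbatim. A sketch:

    ColorH := $(shell printf "\\033[1;4;35m")
    Color0 := $(shell printf "\\033[m")
    $(info $(ColorH)INSTALL$(Color0) foo.h)    # colored without spawning a shell
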
 
 .PHONY: ALWAYS
@@ -112,10 +112,10 @@ $(3)/.UNIFDEF_FLAGS: ALWAYS | $(3)_MKDIR
        $$(_v)$$(REPLACECONTENTS) $$@ $$(UNIFDEF) $(4)
 
 $(1): $(dir $(firstword $(1)))% : $(if $(2),%,$$(SOURCE)/%) | $(3)_MKDIR
-       @echo "$$(ColorH)INSTALLHDR$$(Color0)    $$(ColorF)$$*$$(Color0)"
+       $$(call makelog,$$(ColorH)INSTALLHDR$$(Color0)    $$(ColorF)$$*$$(Color0))
        $$(_v)$$(UNIFDEF) $(4) $$< > ./$(3)/$$*.unifdef.$$$$$$$$;       \
        if [ $$$$? -eq 2 ]; then                                                \
-               echo Parse failure for $$<;                             \
+               $(ERR) Parse failure for $$<;                           \
                exit 1;                                                 \
        fi;                                                             \
        $$(DECOMMENT) ./$(3)/$$*.unifdef.$$$$$$$$ r >                   \
@@ -148,11 +148,11 @@ $(3)/.UNIFDEF_FLAGS: ALWAYS | $(3)_MKDIR
        $$(_v)$$(REPLACECONTENTS) $$@ $$(UNIFDEF) -t $(4)
 
 $(1): $(5)% : $(2) | $(3)_MKDIR
-       @echo "$$(ColorH)INSTALLPY$$(Color0)    $$(ColorF)$$*$$(Color0)"
+       $$(call makelog,$$(ColorH)INSTALLPY$$(Color0)     $$(ColorF)$$*$$(Color0))
        $$(_v)$$(MKDIR) $$(dir $$@) $$(dir ./$(3)/$$*)
        $$(_v)$$(UNIFDEF) -t $(4) $$< > ./$(3)/$$*.unifdef.$$$$$$$$$$(suffix $$*); \
        if [ $$$$? -eq 2 ]; then                                                \
-               echo Parse failure for $$<;                             \
+               $(ERR) Parse failure for $$<;                           \
                exit 1;                                                 \
        fi;                                                             \
        $$(INSTALL) $$(DATA_INSTALL_FLAGS) \
@@ -163,6 +163,37 @@ $(1): $(5)% : $(2) | $(3)_MKDIR
        $$(_v)if [ -n "$(5)" ]; then $$(TOUCH) "$(5)"; fi
 endef
 
+#
+# Empty the install lists of non-host headers if building the host headers alias
+#
+
+ifeq ($(INSTALLHDRS_SKIP_HOST),YES)
+INSTALL_MI_LIST =
+INSTALL_MI_GEN_LIST =
+INSTALL_DRIVERKIT_MI_LIST =
+INSTALL_DRIVERKIT_MI_GEN_LIST =
+INSTALL_KF_MI_LIST =
+INSTALL_KF_MI_GEN_LIST =
+INSTALL_MI_LCL_LIST =
+INSTALL_MI_LCL_GEN_LIST =
+INSTALL_DRIVERKIT_MI_LCL_LIST =
+INSTALL_DRIVERKIT_MI_LCL_GEN_LIST =
+INSTALL_KF_MI_LCL_LIST =
+INSTALL_KF_MI_LCL_GEN_LIST =
+INSTALL_MD_LIST =
+INSTALL_MD_GEN_LIST =
+INSTALL_DRIVERKIT_MD_LIST =
+INSTALL_DRIVERKIT_MD_GEN_LIST =
+INSTALL_KF_MD_LIST =
+INSTALL_KF_MD_GEN_LIST =
+INSTALL_MD_LCL_LIST =
+INSTALL_MD_LCL_GEN_LIST =
+INSTALL_DRIVERKIT_MD_LCL_LIST =
+INSTALL_DRIVERKIT_MD_LCL_GEN_LIST =
+INSTALL_KF_MD_LCL_LIST =
+INSTALL_KF_MD_LCL_GEN_LIST =
+endif
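This guard pairs with the new xnu_headers_host alias added to the top-level Makefile further down in this diff, which exports INSTALLHDRS_SKIP_HOST=YES so an installhdrs pass emits only the host-side subset. Hypothetical invocation:

    $ make RC_ProjectName=xnu_headers_host install    # installhdrs with the lists above emptied
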
+
 #
 # Machine-independent (public) files
 #
@@ -173,6 +204,14 @@ INSTALL_MI_INC_GEN_FILES = $(addprefix $(DSTROOT)/$(INCDIR)/$(INSTALL_MI_DIR)/,
 $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MI_INC_FILES),,incmidir,$(SINCFRAME_UNIFDEF)))
 $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MI_INC_GEN_FILES),1,incmigendir,$(SINCFRAME_UNIFDEF)))
 
+ifeq ($(DRIVERKIT),1)
+INSTALL_DRIVERKIT_MI_INC_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITINCDIR)/$(INSTALL_MI_DIR)/, $(INSTALL_DRIVERKIT_MI_LIST))
+INSTALL_DRIVERKIT_MI_INC_GEN_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITINCDIR)/$(INSTALL_MI_DIR)/, $(INSTALL_DRIVERKIT_MI_GEN_LIST))
+
+$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MI_INC_FILES),,dkincmidir,$(DKINCFRAME_UNIFDEF)))
+$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MI_INC_GEN_FILES),1,dkincmigendir,$(DKINCFRAME_UNIFDEF)))
+endif
+
 INSTALL_KF_MI_FILES = $(addprefix $(DSTROOT)/$(KINCDIR)/$(EXPORT_MI_DIR)/, $(INSTALL_KF_MI_LIST))
 INSTALL_KF_MI_GEN_FILES = $(addprefix $(DSTROOT)/$(KINCDIR)/$(EXPORT_MI_DIR)/, $(INSTALL_KF_MI_GEN_LIST))
 
@@ -189,6 +228,14 @@ INSTALL_MI_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(LCLDIR)/$(INSTALL_MI_DIR)/,
 $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MI_LCL_FILES),,pincmidir,$(SPINCFRAME_UNIFDEF)))
 $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MI_LCL_GEN_FILES),1,pincmigendir,$(SPINCFRAME_UNIFDEF)))
 
+ifeq ($(DRIVERKIT),1)
+INSTALL_DRIVERKIT_MI_LCL_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITLCLDIR)/$(INSTALL_MI_DIR)/, $(INSTALL_DRIVERKIT_MI_LCL_LIST))
+INSTALL_DRIVERKIT_MI_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITLCLDIR)/$(INSTALL_MI_DIR)/, $(INSTALL_DRIVERKIT_MI_LCL_GEN_LIST))
+
+$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MI_LCL_FILES),,dkpincmidir,$(DKPINCFRAME_UNIFDEF)))
+$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MI_LCL_GEN_FILES),1,dkpincmigendir,$(DKPINCFRAME_UNIFDEF)))
+endif
+
 INSTALL_KF_MI_LCL_FILES = $(addprefix $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MI_DIR)/, $(INSTALL_KF_MI_LCL_LIST))
 INSTALL_KF_MI_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MI_DIR)/, $(INSTALL_KF_MI_LCL_GEN_LIST))
 
@@ -205,6 +252,14 @@ INSTALL_MD_INC_GEN_FILES = $(addprefix $(DSTROOT)/$(INCDIR)/$(INSTALL_MD_DIR)/,
 $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MD_INC_FILES),,incdir,$(SINCFRAME_UNIFDEF)))
 $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MD_INC_GEN_FILES),1,incgendir,$(SINCFRAME_UNIFDEF)))
 
+ifeq ($(DRIVERKIT),1)
+INSTALL_DRIVERKIT_MD_INC_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITINCDIR)/$(INSTALL_MD_DIR)/, $(INSTALL_DRIVERKIT_MD_LIST))
+INSTALL_DRIVERKIT_MD_INC_GEN_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITINCDIR)/$(INSTALL_MD_DIR)/, $(INSTALL_DRIVERKIT_MD_GEN_LIST))
+
+$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MD_INC_FILES),,dkincdir,$(DKINCFRAME_UNIFDEF)))
+$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MD_INC_GEN_FILES),1,dkincgendir,$(DKINCFRAME_UNIFDEF)))
+endif
+
 INSTALL_KF_MD_FILES = $(addprefix $(DSTROOT)/$(KINCDIR)/$(EXPORT_MD_DIR)/, $(INSTALL_KF_MD_LIST))
 INSTALL_KF_MD_GEN_FILES = $(addprefix $(DSTROOT)/$(KINCDIR)/$(EXPORT_MD_DIR)/, $(INSTALL_KF_MD_GEN_LIST))
 
@@ -221,6 +276,14 @@ INSTALL_MD_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(LCLDIR)/$(INSTALL_MD_DIR)/,
 $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MD_LCL_FILES),,pincdir,$(SPINCFRAME_UNIFDEF)))
 $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MD_LCL_GEN_FILES),1,pincgendir,$(SPINCFRAME_UNIFDEF)))
 
+ifeq ($(DRIVERKIT),1)
+INSTALL_DRIVERKIT_MD_LCL_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITLCLDIR)/$(INSTALL_MD_DIR)/, $(INSTALL_DRIVERKIT_MD_LCL_LIST))
+INSTALL_DRIVERKIT_MD_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITLCLDIR)/$(INSTALL_MD_DIR)/, $(INSTALL_DRIVERKIT_MD_LCL_GEN_LIST))
+
+$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MD_LCL_FILES),,dkpincdir,$(DKPINCFRAME_UNIFDEF)))
+$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_DRIVERKIT_MD_LCL_GEN_FILES),1,dkpincgendir,$(DKPINCFRAME_UNIFDEF)))
+endif
+
 INSTALL_KF_MD_LCL_FILES = $(addprefix $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MD_DIR)/, $(INSTALL_KF_MD_LCL_LIST))
 INSTALL_KF_MD_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MD_DIR)/, $(INSTALL_KF_MD_LCL_GEN_LIST))
 
@@ -230,14 +293,22 @@ $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_KF_MD_LCL_GEN_FILES),1,kpincge
 .PHONY: do_installhdrs_mi
 
 # Double-colon rule so that MakeInc.kernel can add custom behaviors
-do_installhdrs_mi:: $(INSTALL_MI_INC_FILES) $(INSTALL_MI_INC_GEN_FILES) $(INSTALL_KF_MI_FILES) $(INSTALL_KF_MI_GEN_FILES) \
-                   $(INSTALL_MI_LCL_FILES) $(INSTALL_MI_LCL_GEN_FILES) $(INSTALL_KF_MI_LCL_FILES) $(INSTALL_KF_MI_LCL_GEN_FILES)
+do_installhdrs_mi:: $(INSTALL_MI_INC_FILES) $(INSTALL_MI_INC_GEN_FILES) \
+                   $(INSTALL_DRIVERKIT_MI_INC_FILES) $(INSTALL_DRIVERKIT_MI_INC_GEN_FILES) \
+                   $(INSTALL_KF_MI_FILES) $(INSTALL_KF_MI_GEN_FILES) \
+                   $(INSTALL_MI_LCL_FILES) $(INSTALL_MI_LCL_GEN_FILES) \
+                   $(INSTALL_DRIVERKIT_MI_LCL_FILES) $(INSTALL_DRIVERKIT_MI_LCL_GEN_FILES) \
+                   $(INSTALL_KF_MI_LCL_FILES) $(INSTALL_KF_MI_LCL_GEN_FILES)
        @:
 
 .PHONY: do_installhdrs_md
 
-do_installhdrs_md: $(INSTALL_MD_INC_FILES) $(INSTALL_MD_INC_GEN_FILES) $(INSTALL_KF_MD_FILES) $(INSTALL_KF_MD_GEN_FILES) \
-                  $(INSTALL_MD_LCL_FILES) $(INSTALL_MD_LCL_GEN_FILES) $(INSTALL_KF_MD_LCL_FILES) $(INSTALL_KF_MD_LCL_GEN_FILES)
+do_installhdrs_md: $(INSTALL_MD_INC_FILES) $(INSTALL_MD_INC_GEN_FILES) \
+                  $(INSTALL_DRIVERKIT_MD_INC_FILES) $(INSTALL_DRIVERKIT_MD_INC_GEN_FILES) \
+                  $(INSTALL_KF_MD_FILES) $(INSTALL_KF_MD_GEN_FILES) \
+                  $(INSTALL_MD_LCL_FILES) $(INSTALL_MD_LCL_GEN_FILES) \
+                  $(INSTALL_DRIVERKIT_MD_LCL_FILES) $(INSTALL_DRIVERKIT_MD_LCL_GEN_FILES) \
+                  $(INSTALL_KF_MD_LCL_FILES) $(INSTALL_KF_MD_LCL_GEN_FILES)
        @:
 
 #
@@ -289,7 +360,7 @@ do_exporthdrs_md: $(EXPORT_MD_GEN_INC_FILES)  $(EXPORT_MD_INC_FILES)
 # Compilation rules to generate .o from .s
 #
 
-S_RULE_0=@echo "$(ColorC)AS$(Color0)  $(ColorF)$@$(Color0)"
+S_RULE_0=$(call makelog,$(ColorC)AS$(Color0)  $(ColorF)$@$(Color0))
 S_RULE_1A=$(_v)${S_KCC} -c ${SFLAGS} -MD -MF $(@:o=d) -MP ${$@_SFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS}
 S_RULE_1B=$(<F)
 S_RULE_2=
@@ -297,38 +368,13 @@ S_RULE_2=
 #
 # Compilation rules to generate .o from .c for normal files
 #
-C_RULE_0=@echo "$(ColorC)CC$(Color0)  $(ColorF)$@$(Color0)"
+C_RULE_0=$(call makelog,$(ColorC)CC$(Color0)  $(ColorF)$@$(Color0))
 C_RULE_1A=$(_v)${KCC} -c ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CWARNFLAGS}} -MD -MF $(@:o=d) -MP ${$@_CFLAGS_ADD} ${$@_CWARNFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS}
 C_RULE_1B=$(<F)
-ifeq ($(BUILD_MACHO_OBJ),0)
-C_RULE_2=
-else ifeq ($(DO_CTFCONVERT),1)
-C_RULE_2=$(_v)if [ -z "${$@_SKIP_CTFCONVERT}" ]; then \
-                  ctferr=`${CTFCONVERT} -l xnu -v -o $@.ctf $@ 2>&1 > /dev/null || true`; \
-                  if [ ! -z "$${ctferr}" ]; then \
-                      echo "[$(CMD_MC)] $(ColorErr)$@$(Color0)  $(ColorErr)$${ctferr}$(Color0)"; \
-                  fi; \
-              fi
-else
 C_RULE_2=
-endif
-ifeq ($(DO_CTFMACHO), 1)
-C_CTFRULE_1A=$(_v)${KCC} -o $@.non_lto -c ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CWARNFLAGS}} ${$@_CFLAGS_ADD} ${$@_CWARNFLAGS_ADD} ${INCFLAGS} $(CFLAGS_NOLTO_FLAG) ${$@_INCFLAGS}
-C_CTFRULE_1B=$(<F)
-C_CTFRULE_2=$(_v)if [ -z "${$@_SKIP_CTFCONVERT}" ]; then \
-                     ctferr=`${CTFCONVERT} -l xnu -v -o $@.non_lto.ctf $@.non_lto 2>&1 > /dev/null || true`; \
-                     if [ ! -z "$${ctferr}" ]; then \
-                         echo "[$(CMD_MC)] $(ColorErr)$@$(Color0)  $(ColorErr)$${ctferr}$(Color0)"; \
-                     fi; \
-                 fi
-else
-C_CTFRULE_1A=@true 
-C_CTFRULE_1B=
-C_CTFRULE_2=@true 
-endif
 
-C_RULE_3=@true 
-C_RULE_4A=@true 
+C_RULE_3=
+C_RULE_4A=
 C_RULE_4B=
 
 #
@@ -341,25 +387,18 @@ C_RULE_2_D=${C_RULE_2}
 C_RULE_3_D=${C_RULE_3}
 C_RULE_4A_D=${C_RULE_4A}
 C_RULE_4B_D=${C_RULE_4B}
-C_CTFRULE_1A_D=${C_CTFRULE_1A}
-C_CTFRULE_1B_D=${C_CTFRULE_1B}
-C_CTFRULE_2_D=${C_CTFRULE_2}
-C_CTFRULE_3_D=${C_CTFRULE_3}
 
 #
 # Compilation rules to generate .co from .cp or .cpo from .cpp
 #   The config tool slickly changes the last source filename char to 'o'
 #   for the object filename.
-P_RULE_0=@echo "$(ColorC)C++$(Color0) $(ColorF)$@$(Color0)"
+P_RULE_0=$(call makelog,$(ColorC)C++$(Color0) $(ColorF)$@$(Color0))
 P_RULE_1A=$(_v)${KC++} -o $@ -c ${CXXFLAGS} ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CXXWARNFLAGS}} -MD -MF $(@:o=d) -MP ${$@_CFLAGS_ADD} ${$@_CXXWARNFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS}
 P_RULE_1B=$(<F)
 P_RULE_2=
-P_CTFRULE_1A=@true 
-P_CTFRULE_1B=
-P_CTFRULE_2=@true 
 
-P_RULE_3=@true 
-P_RULE_4A=@true 
+P_RULE_3=
+P_RULE_4A=
 P_RULE_4B=
 
 #
@@ -401,12 +440,12 @@ INSTALL_MAN_DIR:
        $(_v)$(MKDIR) $(DSTROOT)/$(MANDIR)/$(INSTALL_MAN_DIR)
 
 $(INSTALL_MAN_FILES): $(DSTROOT)/$(MANDIR)/$(INSTALL_MAN_DIR)/% : % | INSTALL_MAN_DIR
-       @echo "$(ColorH)MAN$(Color0)        $(ColorF)$*$(Color0)"
+       $(call makelog,$(ColorH)MAN$(Color0)           $(ColorF)$*$(Color0))
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
 
 define MAN_LINKS_RULE_template
 $$(DSTROOT)/$$(MANDIR)/$$(INSTALL_MAN_DIR)/$(2): $$(DSTROOT)/$$(MANDIR)/$$(INSTALL_MAN_DIR)/$(1)
-       @echo "$$(ColorH)MANLINK$$(Color0)    $$(ColorF)$(2)$$(Color0)"
+       $$(call makelog,$$(ColorH)MANLINK$$(Color0)       $$(ColorF)$(2)$$(Color0))
        $(_v)ln -f $$< $$@
 endef
 
index 3c8521dd6350bf97ea140f907057e04c0bc21a8c..dd1070e6422d48966e9e8eaa6f7bbf03b1bf2789 100644 (file)
@@ -88,7 +88,7 @@ override DEFAULT_I386_MACHINE_CONFIG := NONE
 override DEFAULT_X86_64_MACHINE_CONFIG := NONE
 override DEFAULT_X86_64H_MACHINE_CONFIG := NONE
 override DEFAULT_ARM_MACHINE_CONFIG    := T8002
-override DEFAULT_ARM64_MACHINE_CONFIG  := S5L8960X
+override DEFAULT_ARM64_MACHINE_CONFIG  := T7000
 
 # This is typically never specified (TARGET_CONFIGS is used)
 ifndef MACHINE_CONFIGS
@@ -556,6 +556,11 @@ else ifeq ($(RC_ProjectName),xnu_kasan)
 install: install_config install_kernels
 else ifeq ($(RC_ProjectName),xnu_headers_Sim)
 install: installhdrs
+else ifeq ($(RC_ProjectName),xnu_headers_host)
+install: installhdrs
+export INSTALLHDRS_SKIP_HOST=YES
+else ifeq ($(RC_ProjectName),xnu_headers_driverkit)
+install: installhdrs_desktop
 else
 
 install: installhdrs install_textfiles install_config install_kernels install_aliases checkstyle
index b3beb261918043a0c9c9dace56b805bfbca41817..a79b6f8c79447b0d6c84c2608f25e6125f22a03a 100644 (file)
@@ -6,7 +6,6 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
-
 INSTINC_SUBDIRS = \
        mach    \
        atm     \
index 1f56c232344776f23c87e3ecdeaeef55e17ee39f..b5ddcad612d446048db93e48e3ee745d72eb8a26 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -54,7 +54,7 @@
  */
 
 struct UNDReply {
-       decl_lck_mtx_data(, lock)                                /* UNDReply lock */
+       decl_lck_mtx_data(, lock);                               /* UNDReply lock */
        int                             userLandNotificationKey;
        KUNCUserNotificationCallBack    callback;
        boolean_t                       inprogress;
@@ -203,18 +203,11 @@ KUNCGetNotificationID(void)
 
        reply = (UNDReplyRef) kalloc(sizeof(struct UNDReply));
        if (reply != UND_REPLY_NULL) {
-               reply->self_port = ipc_port_alloc_kernel();
-               if (reply->self_port == IP_NULL) {
-                       kfree(reply, sizeof(struct UNDReply));
-                       reply = UND_REPLY_NULL;
-               } else {
-                       lck_mtx_init(&reply->lock, &LockCompatGroup, LCK_ATTR_NULL);
-                       reply->userLandNotificationKey = -1;
-                       reply->inprogress = FALSE;
-                       ipc_kobject_set(reply->self_port,
-                           (ipc_kobject_t)reply,
-                           IKOT_UND_REPLY);
-               }
+               reply->self_port = ipc_kobject_alloc_port((ipc_kobject_t)reply,
+                   IKOT_UND_REPLY, IPC_KOBJECT_ALLOC_NONE);
+               lck_mtx_init(&reply->lock, &LockCompatGroup, LCK_ATTR_NULL);
+               reply->userLandNotificationKey = -1;
+               reply->inprogress = FALSE;
        }
        return (KUNCUserNotificationID) reply;
 }
index 2de33166adaa40523b67bc1e17d750159303424e..be0723a8bcee540978f9fb5d059df9139171e6c0 100644 (file)
@@ -63,7 +63,7 @@ ${COMP_FILES} : ${MIG_TYPES}
 
 ${MIG_KUSRC} : \
        %.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS}        \
                -user    $*.c              \
                -header  $*.h              \
@@ -73,7 +73,7 @@ ${MIG_KUSRC} : \
 
 ${MIG_KSSRC}: \
        %Server.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS}        \
                -user    /dev/null              \
                -header  /dev/null              \
index 38c19f380ab98b1aca070b1da6a41f79753d4da5..ab9975c6edc4023268691421b83faeba99ac796f 100644 (file)
@@ -20,6 +20,7 @@ ARM_HEADER_FILES =    \
                machine_cpu.h \
                machine_cpuid.h \
                machine_routines.h \
+               memory_types.h \
                pal_routines.h \
                pmap_public.h \
                proc_reg.h \
index cf490691536474765aebdff9e8fdfc65e12b41e7..9f2b60169165682de9b68261bf1f327491e51a74 100644 (file)
@@ -85,6 +85,7 @@ extern void sleep_token_buffer_init(void);
 extern vm_offset_t intstack_top;
 #if __arm64__
 extern vm_offset_t excepstack_top;
+extern uint64_t events_per_sec;
 #else
 extern vm_offset_t fiqstack_top;
 #endif
@@ -101,6 +102,7 @@ boolean_t up_style_idle_exit = 0;
 
 
 
+
 #if INTERRUPT_MASKED_DEBUG
 boolean_t interrupt_masked_debug = 1;
 uint64_t interrupt_masked_timeout = 0xd0000;
@@ -135,6 +137,9 @@ unsigned int page_shift_user32; /* for page_size as seen by a 32-bit task */
  * JOP rebasing
  */
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif /* defined(HAS_APPLE_PAC) */
 
 // Note, the following should come from a header from dyld
 static void
@@ -145,6 +150,11 @@ rebase_chain(uintptr_t chainStartAddress, uint64_t stepMultiplier, uintptr_t bas
        do {
                uint64_t value = *(uint64_t*)address;
 
+#if HAS_APPLE_PAC
+               uint16_t diversity = (uint16_t)(value >> 32);
+               bool hasAddressDiversity = (value & (1ULL << 48)) != 0;
+               ptrauth_key key = (ptrauth_key)((value >> 49) & 0x3);
+#endif
                bool isAuthenticated = (value & (1ULL << 63)) != 0;
                bool isRebase = (value & (1ULL << 62)) == 0;
                if (isRebase) {
@@ -153,6 +163,33 @@ rebase_chain(uintptr_t chainStartAddress, uint64_t stepMultiplier, uintptr_t bas
                                uint64_t newValue = (value & 0xFFFFFFFF) + slide;
                                // Add in the offset from the mach_header
                                newValue += baseAddress;
+#if HAS_APPLE_PAC
+                               // We have bits to merge into the discriminator
+                               uintptr_t discriminator = diversity;
+                               if (hasAddressDiversity) {
+                                       // First calculate a new discriminator using the address where we are trying to store the value
+                                       // Only blend if we have a discriminator
+                                       if (discriminator) {
+                                               discriminator = __builtin_ptrauth_blend_discriminator((void*)address, discriminator);
+                                       } else {
+                                               discriminator = address;
+                                       }
+                               }
+                               switch (key) {
+                               case ptrauth_key_asia:
+                                       newValue = (uintptr_t)__builtin_ptrauth_sign_unauthenticated((void*)newValue, ptrauth_key_asia, discriminator);
+                                       break;
+                               case ptrauth_key_asib:
+                                       newValue = (uintptr_t)__builtin_ptrauth_sign_unauthenticated((void*)newValue, ptrauth_key_asib, discriminator);
+                                       break;
+                               case ptrauth_key_asda:
+                                       newValue = (uintptr_t)__builtin_ptrauth_sign_unauthenticated((void*)newValue, ptrauth_key_asda, discriminator);
+                                       break;
+                               case ptrauth_key_asdb:
+                                       newValue = (uintptr_t)__builtin_ptrauth_sign_unauthenticated((void*)newValue, ptrauth_key_asdb, discriminator);
+                                       break;
+                               }
+#endif
                                *(uint64_t*)address = newValue;
                        } else {
                                // Regular pointer which needs to fit in 51-bits of value.
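
A standalone sketch of the signing step added above, assuming <ptrauth.h> and a data-key fixup (the helper is hypothetical). The switch over keys in the hunk exists because the key operand of __builtin_ptrauth_sign_unauthenticated() must be a compile-time constant, so each case instantiates the builtin with a literal key.

#include <ptrauth.h>
#include <stdbool.h>
#include <stdint.h>

static uint64_t
sign_rebased_pointer(uintptr_t slot, uint64_t target,
    uint16_t diversity, bool address_diverse)
{
	uintptr_t disc = diversity;
	if (address_diverse) {
		/* blend the 16-bit diversity into the slot address, falling
		 * back to the raw address when no diversity bits are present */
		disc = diversity
		    ? (uintptr_t)__builtin_ptrauth_blend_discriminator((void *)slot, diversity)
		    : slot;
	}
	return (uint64_t)__builtin_ptrauth_sign_unauthenticated(
		(void *)target, ptrauth_key_asda, disc);
}
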
@@ -190,6 +227,7 @@ rebase_threaded_starts(uint32_t *threadArrayStart, uint32_t *threadArrayEnd,
        return true;
 }
 
+
 /*
  *             Routine:                arm_init
  *             Function:
@@ -222,12 +260,29 @@ arm_init(
        BootArgs = args = &const_boot_args;
 
        cpu_data_init(&BootCpuData);
+#if defined(HAS_APPLE_PAC)
+       /* the bootstrap CPU's process-dependent key for the kernel has been loaded by start.s */
+       BootCpuData.rop_key = KERNEL_ROP_ID;
+#endif /* defined(HAS_APPLE_PAC) */
 
        PE_init_platform(FALSE, args); /* Get platform expert set up */
 
 #if __arm64__
 
 
+#if defined(HAS_APPLE_PAC)
+       boolean_t user_jop = TRUE;
+       PE_parse_boot_argn("user_jop", &user_jop, sizeof(user_jop));
+       if (!user_jop) {
+               args->bootFlags |= kBootFlagsDisableUserJOP;
+       }
+       boolean_t user_ts_jop = TRUE;
+       PE_parse_boot_argn("user_ts_jop", &user_ts_jop, sizeof(user_ts_jop));
+       if (!user_ts_jop) {
+               args->bootFlags |= kBootFlagsDisableUserThreadStateJOP;
+       }
+#endif /* defined(HAS_APPLE_PAC) */
+
        {
                unsigned int    tmp_16k = 0;
 
@@ -339,12 +394,17 @@ arm_init(
 
        rtclock_early_init();
 
+       lck_mod_init();
+
+       /*
+        * Initialize the timer callout world
+        */
+       timer_call_init();
+
        kernel_early_bootstrap();
 
        cpu_init();
 
-       EntropyData.index_ptr = EntropyData.buffer;
-
        processor_bootstrap();
        my_master_proc = master_processor;
 
@@ -366,7 +426,7 @@ arm_init(
        /* Disable if WDT is disabled or no_interrupt_mask_debug in boot-args */
        if (PE_parse_boot_argn("no_interrupt_masked_debug", &interrupt_masked_debug,
            sizeof(interrupt_masked_debug)) || (PE_parse_boot_argn("wdt", &wdt_boot_arg,
-           sizeof(wdt_boot_arg)) && (wdt_boot_arg == -1))) {
+           sizeof(wdt_boot_arg)) && (wdt_boot_arg == -1)) || kern_feature_override(KF_INTERRUPT_MASKED_DEBUG_OVRD)) {
                interrupt_masked_debug = 0;
        }
 
@@ -450,7 +510,26 @@ arm_init(
 #endif
 
        PE_init_platform(TRUE, &BootCpuData);
+
+#if __arm64__
+       if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
+               if (events_per_sec <= 0) {
+                       events_per_sec = 1;
+               } else if (events_per_sec > USEC_PER_SEC) {
+                       events_per_sec = USEC_PER_SEC;
+               }
+       } else {
+#if defined(ARM_BOARD_WFE_TIMEOUT_NS)
+               events_per_sec = NSEC_PER_SEC / ARM_BOARD_WFE_TIMEOUT_NS;
+#else /* !defined(ARM_BOARD_WFE_TIMEOUT_NS) */
+               /* Default to 1usec (or as close as we can get) */
+               events_per_sec = USEC_PER_SEC;
+#endif /* !defined(ARM_BOARD_WFE_TIMEOUT_NS) */
+       }
+#endif
+
        cpu_timebase_init(TRUE);
+       PE_init_cpu();
        fiq_context_bootstrap(TRUE);
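
To make the clamping above concrete: a wfe_events_sec boot-arg of 0 is raised to 1, anything above USEC_PER_SEC (1,000,000) is capped there, and without a boot-arg the default is derived from the board timeout when one is defined. For a hypothetical ARM_BOARD_WFE_TIMEOUT_NS of 2000, that yields NSEC_PER_SEC / 2000 = 500,000 WFE wakeups per second; boards without the define fall back to USEC_PER_SEC, i.e. roughly one event per microsecond.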
 
 
@@ -483,6 +562,7 @@ arm_init_cpu(
        __builtin_arm_wsr("pan", 1);
 #endif
 
+
        cpu_data_ptr->cpu_flags &= ~SleepState;
 #if     __ARM_SMP__ && defined(ARMA7)
        cpu_data_ptr->cpu_CLW_active = 1;
@@ -528,6 +608,7 @@ arm_init_cpu(
                PE_init_platform(TRUE, NULL);
                commpage_update_timebase();
        }
+       PE_init_cpu();
 
        fiq_context_init(TRUE);
        cpu_data_ptr->rtcPop = EndOfAllTime;
index da4a0c3b567f8bf51d5cc32b0ac554a045917d54..3b9c4f31001b8ec258dc5ffa16b8b623cd312c87 100644 (file)
@@ -199,6 +199,20 @@ timer_resync_deadlines(void)
        splx(s);
 }
 
+void
+timer_queue_expire_local(
+       __unused void                   *arg)
+{
+       rtclock_timer_t         *mytimer = &getCpuDatap()->rtclock_timer;
+       uint64_t                abstime;
+
+       abstime = mach_absolute_time();
+       mytimer->has_expired = TRUE;
+       mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime);
+       mytimer->has_expired = FALSE;
+
+       timer_resync_deadlines();
+}
 
 boolean_t
 timer_resort_threshold(__unused uint64_t skew)
index 5a24884580e6be69865603462048ab6f07d4501e..08788e136d91c5a51a5db958092c9c1d4daae23d 100644 (file)
@@ -184,11 +184,12 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
                } else {
                        /* TTE must be reincarnated COARSE. */
                        ppte = (pt_entry_t *)phystokv(avail_start);
+                       pmap_paddr_t l2table = avail_start;
                        avail_start += ARM_PGBYTES;
                        bzero(ppte, ARM_PGBYTES);
 
                        for (i = 0; i < 4; ++i) {
-                               tte[i] = pa_to_tte(kvtophys((vm_offset_t)ppte) + (i * 0x400)) | ARM_TTE_TYPE_TABLE;
+                               tte[i] = pa_to_tte(l2table + (i * 0x400)) | ARM_TTE_TYPE_TABLE;
                        }
                }
 
@@ -343,8 +344,9 @@ arm_vm_prot_init(boot_args * args)
         */
        pmap_paddr_t p = (pmap_paddr_t)(args->topOfKernelData) + (ARM_PGBYTES * 9);
        pt_entry_t *ppte = (pt_entry_t *)phystokv(p);
+       pmap_init_pte_page(kernel_pmap, ppte, HIGH_EXC_VECTORS & ~ARM_TT_L1_PT_OFFMASK, 2, TRUE, FALSE);
 
-       int idx = (HIGH_EXC_VECTORS & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT;
+       int idx = (HIGH_EXC_VECTORS & ARM_TT_L1_PT_OFFMASK) >> ARM_TT_L2_SHIFT;
        pt_entry_t ptmp = ppte[idx];
 
        ptmp = (ptmp & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA);
@@ -367,9 +369,6 @@ arm_vm_prot_finalize(boot_args * args)
 
        arm_vm_page_granular_RWNX(phystokv(args->topOfKernelData) + ARM_PGBYTES * 9, ARM_PGBYTES, FALSE); /* commpage, EVB */
 
-#ifndef  __ARM_L1_PTW__
-       FlushPoC_Dcache();
-#endif
        flush_mmu_tlb();
 }
 
@@ -497,11 +496,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        sectCONSTB = sectDCONST->addr;
        sectSizeCONST = sectDCONST->size;
 
-#if !SECURE_KERNEL
-       /* doconstro is true by default, but we allow a boot-arg to disable it */
-       (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
-#endif
-
        if (doconstro) {
                extern vm_offset_t _lastkerneldataconst;
                extern vm_size_t _lastkerneldataconst_padsize;
@@ -534,25 +528,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
 
        vm_set_page_size();
 
-#ifndef __ARM_L1_PTW__
-       FlushPoC_Dcache();
-#endif
-       set_mmu_ttb(cpu_ttep);
-       set_mmu_ttb_alternate(cpu_ttep);
-       flush_mmu_tlb();
-#if __arm__ && __ARM_USER_PROTECT__
-       {
-               unsigned int ttbr0_val, ttbr1_val, ttbcr_val;
-               thread_t thread = current_thread();
-
-               __asm__ volatile ("mrc p15,0,%0,c2,c0,0\n" : "=r"(ttbr0_val));
-               __asm__ volatile ("mrc p15,0,%0,c2,c0,1\n" : "=r"(ttbr1_val));
-               __asm__ volatile ("mrc p15,0,%0,c2,c0,2\n" : "=r"(ttbcr_val));
-               thread->machine.uptw_ttb = ttbr0_val;
-               thread->machine.kptw_ttb = ttbr1_val;
-               thread->machine.uptw_ttc = ttbcr_val;
-       }
-#endif
        vm_prelink_stext = segPRELINKTEXTB;
        vm_prelink_etext = segPRELINKTEXTB + segSizePRELINKTEXT;
        vm_prelink_sinfo = segPRELINKINFOB;
@@ -591,14 +566,30 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
                ptp = (pt_entry_t *) phystokv(avail_start);
                ptp_phys = (pmap_paddr_t)avail_start;
                avail_start += ARM_PGBYTES;
-               pmap_init_pte_page(kernel_pmap, ptp, va + off, 2, TRUE);
+               pmap_init_pte_page(kernel_pmap, ptp, va + off, 2, TRUE, TRUE);
                tte = &cpu_tte[ttenum(va + off)];
-               *tte     = pa_to_tte((ptp_phys)) | ARM_TTE_TYPE_TABLE;;
-               *(tte + 1) = pa_to_tte((ptp_phys + 0x400)) | ARM_TTE_TYPE_TABLE;;
-               *(tte + 2) = pa_to_tte((ptp_phys + 0x800)) | ARM_TTE_TYPE_TABLE;;
-               *(tte + 3) = pa_to_tte((ptp_phys + 0xC00)) | ARM_TTE_TYPE_TABLE;;
+               *tte     = pa_to_tte((ptp_phys)) | ARM_TTE_TYPE_TABLE;
+               *(tte + 1) = pa_to_tte((ptp_phys + 0x400)) | ARM_TTE_TYPE_TABLE;
+               *(tte + 2) = pa_to_tte((ptp_phys + 0x800)) | ARM_TTE_TYPE_TABLE;
+               *(tte + 3) = pa_to_tte((ptp_phys + 0xC00)) | ARM_TTE_TYPE_TABLE;
        }
 
+       set_mmu_ttb(cpu_ttep);
+       set_mmu_ttb_alternate(cpu_ttep);
+       flush_mmu_tlb();
+#if __arm__ && __ARM_USER_PROTECT__
+       {
+               unsigned int ttbr0_val, ttbr1_val, ttbcr_val;
+               thread_t thread = current_thread();
+
+               __asm__ volatile ("mrc p15,0,%0,c2,c0,0\n" : "=r"(ttbr0_val));
+               __asm__ volatile ("mrc p15,0,%0,c2,c0,1\n" : "=r"(ttbr1_val));
+               __asm__ volatile ("mrc p15,0,%0,c2,c0,2\n" : "=r"(ttbcr_val));
+               thread->machine.uptw_ttb = ttbr0_val;
+               thread->machine.kptw_ttb = ttbr1_val;
+               thread->machine.uptw_ttc = ttbcr_val;
+       }
+#endif
        avail_start = (avail_start + PAGE_MASK) & ~PAGE_MASK;
 
        first_avail = avail_start;
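
The 0x400 stride in both hunks of this file reflects the ARMv7 short-descriptor layout: an L2 (coarse) table is 1KB (256 entries of 4 bytes), so a single 4KB page holds four of them, and four consecutive L1 entries are pointed at the page's quarters. Condensed, the assignment above is:

/* one 4KB page backs four 1KB coarse L2 tables */
for (int i = 0; i < 4; i++) {
	tte[i] = pa_to_tte(ptp_phys + (i * 0x400)) | ARM_TTE_TYPE_TABLE;
}
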
index 380286cded9dddb098292074dc69b3b3623700ff..a6b4c2b8c6f568609bb3e47b4f82589fafff1c06 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+#ifndef _MACHINE_ATOMIC_H
+#error "Do not include <arm/atomic.h> directly, use <machine/atomic.h>"
+#endif
+
 #ifndef _ARM_ATOMIC_H_
 #define _ARM_ATOMIC_H_
 
 #include <mach/boolean.h>
-#include <arm/smp.h>
 
 // Parameter for __builtin_arm_dmb
+#define DMB_OSHLD       0x1
+#define DMB_OSHST       0x2
+#define DMB_OSH         0x3
+#define DMB_NSHLD       0x5
+#define DMB_NSHST       0x6
 #define DMB_NSH         0x7
 #define DMB_ISHLD       0x9
 #define DMB_ISHST       0xa
 #define DMB_ISH         0xb
+#define DMB_LD          0xd
+#define DMB_ST          0xe
 #define DMB_SY          0xf
 
 // Parameter for __builtin_arm_dsb
+#define DSB_OSHLD       0x1
+#define DSB_OSHST       0x2
+#define DSB_OSH         0x3
+#define DSB_NSHLD       0x5
+#define DSB_NSHST       0x6
 #define DSB_NSH         0x7
 #define DSB_ISHLD       0x9
 #define DSB_ISHST       0xa
 #define DSB_ISH         0xb
+#define DSB_LD          0xd
+#define DSB_ST          0xe
 #define DSB_SY          0xf
 
 // Parameter for __builtin_arm_isb
 #define ISB_SY          0xf
 
-#if     __SMP__
-
-#define memory_order_consume_smp memory_order_consume
-#define memory_order_acquire_smp memory_order_acquire
-#define memory_order_release_smp memory_order_release
-#define memory_order_acq_rel_smp memory_order_acq_rel
-#define memory_order_seq_cst_smp memory_order_seq_cst
-
-#else
-
-#define memory_order_consume_smp memory_order_relaxed
-#define memory_order_acquire_smp memory_order_relaxed
-#define memory_order_release_smp memory_order_relaxed
-#define memory_order_acq_rel_smp memory_order_relaxed
-#define memory_order_seq_cst_smp memory_order_relaxed
+#undef OS_ATOMIC_HAS_LLSC
+#define OS_ATOMIC_HAS_LLSC  1
 
+#if defined(__ARM_ARCH_8_2__) && defined(__arm64__)
+#undef OS_ATOMIC_USE_LLSC
+#define OS_ATOMIC_USE_LLSC  0
 #endif
 
+
 /*
- * Atomic operations functions
- *
- * These static functions are designed for inlining
- * It is expected that the memory_order arguments are
- * known at compile time.  This collapses these
- * functions into a simple atomic operation
+ * On armv7 & arm64, we do provide fine grained dependency injection, so
+ * memory_order_dependency maps to relaxed as far as thread fences are concerned
  */
+#undef memory_order_dependency_smp
+#define memory_order_dependency_smp  memory_order_relaxed
 
-static inline boolean_t
-memory_order_has_acquire(enum memory_order ord)
-{
-       switch (ord) {
-       case memory_order_consume:
-       case memory_order_acquire:
-       case memory_order_acq_rel:
-       case memory_order_seq_cst:
-               return TRUE;
-       default:
-               return FALSE;
-       }
-}
-
-static inline boolean_t
-memory_order_has_release(enum memory_order ord)
-{
-       switch (ord) {
-       case memory_order_release:
-       case memory_order_acq_rel:
-       case memory_order_seq_cst:
-               return TRUE;
-       default:
-               return FALSE;
-       }
-}
-
-#ifdef ATOMIC_PRIVATE
-
-#define clear_exclusive()       __builtin_arm_clrex()
-
-__unused static uint32_t
-load_exclusive32(uint32_t *target, enum memory_order ord)
-{
-       uint32_t        value;
+#define os_atomic_clear_exclusive()  __builtin_arm_clrex()
 
 #if __arm__
-       if (memory_order_has_release(ord)) {
-               // Pre-load release barrier
-               atomic_thread_fence(memory_order_release);
-       }
-       value = __builtin_arm_ldrex(target);
-#else
-       if (memory_order_has_acquire(ord)) {
-               value = __builtin_arm_ldaex(target);    // ldaxr
-       } else {
-               value = __builtin_arm_ldrex(target);    // ldxr
-       }
-#endif  // __arm__
-       return value;
-}
-
-__unused static boolean_t
-store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord)
-{
-       boolean_t err;
 
-#if __arm__
-       err = __builtin_arm_strex(value, target);
-       if (memory_order_has_acquire(ord)) {
-               // Post-store acquire barrier
-               atomic_thread_fence(memory_order_acquire);
-       }
-#else
-       if (memory_order_has_release(ord)) {
-               err = __builtin_arm_stlex(value, target);       // stlxr
-       } else {
-               err = __builtin_arm_strex(value, target);       // stxr
-       }
-#endif  // __arm__
-       return !err;
-}
+#define os_atomic_load_exclusive(p, m)  ({ \
+               _os_atomic_basetypeof(p) _r; \
+               _r = __builtin_arm_ldrex(p); \
+               _os_memory_fence_after_atomic(m); \
+               _os_compiler_barrier_after_atomic(m); \
+               _r; \
+})
 
-__unused static uintptr_t
-load_exclusive(uintptr_t *target, enum memory_order ord)
-{
-#if !__LP64__
-       return load_exclusive32((uint32_t *)target, ord);
-#else
-       uintptr_t       value;
+#define os_atomic_store_exclusive(p, v, m)  ({ \
+               _os_compiler_barrier_before_atomic(m); \
+               _os_memory_fence_before_atomic(m); \
+               !__builtin_arm_strex(p, v); \
+})
 
-       if (memory_order_has_acquire(ord)) {
-               value = __builtin_arm_ldaex(target);    // ldaxr
-       } else {
-               value = __builtin_arm_ldrex(target);    // ldxr
-       }
-       return value;
-#endif  // __arm__
-}
-
-__unused static uint8_t
-load_exclusive_acquire8(uint8_t *target)
-{
-       uint8_t value;
-#if __arm__
-       value = __builtin_arm_ldrex(target);
-       __c11_atomic_thread_fence(__ATOMIC_ACQUIRE);
-#else
-       value = __builtin_arm_ldaex(target);    // ldaxr
-       /* "Compiler barrier", no barrier instructions are emitted */
-       atomic_signal_fence(memory_order_acquire);
-#endif
-       return value;
-}
-
-__unused static boolean_t
-store_exclusive(uintptr_t *target, uintptr_t value, enum memory_order ord)
-{
-#if !__LP64__
-       return store_exclusive32((uint32_t *)target, value, ord);
-#else
-       boolean_t err;
-
-       if (memory_order_has_release(ord)) {
-               err = __builtin_arm_stlex(value, target);       // stlxr
-       } else {
-               err = __builtin_arm_strex(value, target);       // stxr
-       }
-       return !err;
-#endif
-}
-
-__unused static boolean_t
-atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval,
-    enum memory_order orig_ord, boolean_t wait)
-{
-       enum memory_order       ord = orig_ord;
-       uintptr_t                       value;
-
-
-#if __arm__
-       ord = memory_order_relaxed;
-       if (memory_order_has_release(orig_ord)) {
-               atomic_thread_fence(memory_order_release);
-       }
-#endif
-       do {
-               value = load_exclusive(target, ord);
-               if (value != oldval) {
-                       if (wait) {
-                               wait_for_event();       // Wait with monitor held
-                       } else {
-                               clear_exclusive();      // Clear exclusive monitor
-                       }
-                       return FALSE;
-               }
-       } while (!store_exclusive(target, newval, ord));
-#if __arm__
-       if (memory_order_has_acquire(orig_ord)) {
-               atomic_thread_fence(memory_order_acquire);
-       }
-#endif
-       return TRUE;
-}
-
-#endif // ATOMIC_PRIVATE
+/*
+ * armv7 override of os_atomic_make_dependency
+ * documentation for os_atomic_make_dependency is in <machine/atomic.h>
+ */
+#undef os_atomic_make_dependency
+#define os_atomic_make_dependency(v) ({ \
+               os_atomic_dependency_t _dep; \
+               __asm__ __volatile__("and %[_dep], %[_v], #0" \
+                               : [_dep] "=r" (_dep.__opaque_zero) : [_v] "r" (v)); \
+               os_compiler_barrier(acquire); \
+               _dep; \
+})
 
-#if __arm__
+/*
+ * armv7 override of os_atomic_rmw_loop
+ * documentation for os_atomic_rmw_loop is in <machine/atomic.h>
+ */
 #undef os_atomic_rmw_loop
 #define os_atomic_rmw_loop(p, ov, nv, m, ...)  ({ \
-               boolean_t _result = FALSE; uint32_t _err = 0; \
-               typeof(atomic_load(p)) *_p = (typeof(atomic_load(p)) *)(p); \
+               int _result = 0; uint32_t _err = 0; \
+               _os_atomic_basetypeof(p) *_p; \
+               _p = (_os_atomic_basetypeof(p) *)(p); \
+               _os_compiler_barrier_before_atomic(m); \
                for (;;) { \
                        ov = __builtin_arm_ldrex(_p); \
                        __VA_ARGS__; \
-                       if (!_err && memory_order_has_release(memory_order_##m)) { \
-       /* only done for the first loop iteration */ \
-                               atomic_thread_fence(memory_order_release); \
+                       if (!_err) { \
+       /* release barrier only done for the first loop iteration */ \
+                               _os_memory_fence_before_atomic(m); \
                        } \
                        _err = __builtin_arm_strex(nv, _p); \
                        if (__builtin_expect(!_err, 1)) { \
-                               if (memory_order_has_acquire(memory_order_##m)) { \
-                                       atomic_thread_fence(memory_order_acquire); \
-                               } \
-                               _result = TRUE; \
+                               _os_memory_fence_after_atomic(m); \
+                               _result = 1; \
                                break; \
                        } \
                } \
+               _os_compiler_barrier_after_atomic(m); \
                _result; \
        })
 
+/*
+ * armv7 override of os_atomic_rmw_loop_give_up
+ * documentation for os_atomic_rmw_loop_give_up is in <machine/atomic.h>
+ */
 #undef os_atomic_rmw_loop_give_up
-#define os_atomic_rmw_loop_give_up(expr) \
-               ({ __builtin_arm_clrex(); expr; __builtin_trap(); })
+#define os_atomic_rmw_loop_give_up(...) \
+               ({ os_atomic_clear_exclusive(); __VA_ARGS__; break; })
+
+#else // __arm64__
+
+#define os_atomic_load_exclusive(p, m)  ({ \
+               _os_atomic_basetypeof(p) _r; \
+               if (memory_order_has_acquire(memory_order_##m##_smp)) { \
+                       _r = __builtin_arm_ldaex(p); \
+               } else { \
+                       _r = __builtin_arm_ldrex(p); \
+               } \
+               _os_compiler_barrier_after_atomic(m); \
+               _r; \
+})
 
-#else
+#define os_atomic_store_exclusive(p, v, m)  ({ \
+               _os_compiler_barrier_before_atomic(m); \
+               (memory_order_has_release(memory_order_##m##_smp) ? \
+                               !__builtin_arm_stlex(p, v) : !__builtin_arm_strex(p, v)); \
+})
 
+/*
+ * arm64 override of os_atomic_make_dependency
+ * documentation for os_atomic_make_dependency is in <machine/atomic.h>
+ */
+#undef os_atomic_make_dependency
+#define os_atomic_make_dependency(v) ({ \
+               os_atomic_dependency_t _dep; \
+               __asm__ __volatile__("and %[_dep], %[_v], xzr" \
+                               : [_dep] "=r" (_dep.__opaque_zero) : [_v] "r" (v)); \
+               os_compiler_barrier(acquire); \
+               _dep; \
+})
+
+#if OS_ATOMIC_USE_LLSC
+
+/*
+ * arm64 (without armv81 atomics) override of os_atomic_rmw_loop
+ * documentation for os_atomic_rmw_loop is in <machine/atomic.h>
+ */
 #undef os_atomic_rmw_loop
 #define os_atomic_rmw_loop(p, ov, nv, m, ...)  ({ \
-               boolean_t _result = FALSE; \
-               typeof(atomic_load(p)) *_p = (typeof(atomic_load(p)) *)(p); \
+               int _result = 0; \
+               _os_atomic_basetypeof(p) *_p; \
+               _p = (_os_atomic_basetypeof(p) *)(p); \
+               _os_compiler_barrier_before_atomic(m); \
                do { \
-                       if (memory_order_has_acquire(memory_order_##m)) { \
+                       if (memory_order_has_acquire(memory_order_##m##_smp)) { \
                                ov = __builtin_arm_ldaex(_p); \
                        } else { \
                                ov = __builtin_arm_ldrex(_p); \
                        } \
                        __VA_ARGS__; \
-                       if (memory_order_has_release(memory_order_##m)) { \
+                       if (memory_order_has_release(memory_order_##m##_smp)) { \
                                _result = !__builtin_arm_stlex(nv, _p); \
                        } else { \
                                _result = !__builtin_arm_strex(nv, _p); \
                        } \
                } while (__builtin_expect(!_result, 0)); \
+               _os_compiler_barrier_after_atomic(m); \
                _result; \
        })
 
+/*
+ * arm64 override of os_atomic_rmw_loop_give_up
+ * documentation for os_atomic_rmw_loop_give_up is in <machine/atomic.h>
+ */
 #undef os_atomic_rmw_loop_give_up
-#define os_atomic_rmw_loop_give_up(expr) \
-               ({ __builtin_arm_clrex(); expr; __builtin_trap(); })
-#endif
+#define os_atomic_rmw_loop_give_up(...) \
+               ({ os_atomic_clear_exclusive(); __VA_ARGS__; break; })
 
-#undef os_atomic_force_dependency_on
-#if defined(__arm64__)
-#define os_atomic_force_dependency_on(p, e) ({ \
-               unsigned long _v; \
-               __asm__("and %x[_v], %x[_e], xzr" : [_v] "=r" (_v) : [_e] "r" (e)); \
-               (typeof(*(p)) *)((char *)(p) + _v); \
-       })
-#else
-#define os_atomic_force_dependency_on(p, e) ({ \
-               unsigned long _v; \
-               __asm__("and %[_v], %[_e], #0" : [_v] "=r" (_v) : [_e] "r" (e)); \
-               (typeof(*(p)) *)((char *)(p) + _v); \
-       })
-#endif // defined(__arm64__)
+#endif // OS_ATOMIC_USE_LLSC
+
+#endif // __arm64__
 
 #endif // _ARM_ATOMIC_H_
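
A caller-side sketch of the LL/SC rmw-loop macros as redefined above (the fetch-max helper is hypothetical; relaxed is one of the bare ordering suffixes from <machine/atomic.h>). Note the new give-up idiom: it clears the exclusive monitor and breaks out of the loop instead of trapping, so an early exit like this is now legal.

static inline uint32_t
fetch_max(uint32_t *p, uint32_t value)
{
	uint32_t ov, nv;
	os_atomic_rmw_loop(p, ov, nv, relaxed, {
		if (ov >= value) {
			/* already at least `value`: leave the monitor, skip the store */
			os_atomic_rmw_loop_give_up(return ov);
		}
		nv = value;
	});
	return ov;  /* value observed before the successful store */
}
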
index f76a19edfe29a8e04d11c560036575a7f19a5c63..e5e64cff787c0f026f486f1202b16ddc1c6e5024 100644 (file)
@@ -64,44 +64,30 @@ flush_dcache(
        boolean_t phys)
 {
        cpu_data_t      *cpu_data_ptr = getCpuDatap();
-
-       if (phys) {
-               pmap_paddr_t    paddr;
-               vm_offset_t     vaddr;
-
-               paddr = CAST_DOWN(pmap_paddr_t, addr);
-               if (!isphysmem(paddr)) {
-                       return;
-               }
-               vaddr = phystokv(paddr);
-               FlushPoC_DcacheRegion((vm_offset_t) vaddr, length);
-
-               if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) {
-                       ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(
-                               cpu_data_ptr->cpu_id, CacheCleanFlushRegion, (unsigned int) paddr, length);
-               }
-               return;
-       }
-       if (cpu_data_ptr->cpu_cache_dispatch == (cache_dispatch_t) NULL) {
-               FlushPoC_DcacheRegion((vm_offset_t) addr, length);
-       } else {
-               addr64_t        paddr;
-               uint32_t        count;
-
-               while (length > 0) {
+       vm_offset_t     vaddr;
+       addr64_t        paddr;
+       vm_size_t       count;
+
+       while (length > 0) {
+               if (phys) {
+                       count = length;
+                       paddr = CAST_DOWN(pmap_paddr_t, addr);
+                       vaddr = phystokv_range(paddr, &count);
+               } else {
+                       paddr = kvtophys(addr);
+                       vaddr = addr;
                        count = PAGE_SIZE - (addr & PAGE_MASK);
                        if (count > length) {
                                count = length;
                        }
-                       FlushPoC_DcacheRegion((vm_offset_t) addr, count);
-                       paddr = kvtophys(addr);
-                       if (paddr) {
-                               ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(
-                                       cpu_data_ptr->cpu_id, CacheCleanFlushRegion, (unsigned int) paddr, count);
-                       }
-                       addr += count;
-                       length -= count;
                }
+               FlushPoC_DcacheRegion(vaddr, (unsigned)count);
+               if (paddr && (cpu_data_ptr->cpu_cache_dispatch != NULL)) {
+                       ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(
+                               cpu_data_ptr->cpu_id, CacheCleanFlushRegion, (unsigned int) paddr, (unsigned)count);
+               }
+               addr += count;
+               length -= count;
        }
        return;
 }
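
The rewritten loop leans on the phystokv_range() contract implied by the hunk: given a physical address, it returns a kernel virtual address and shrinks *count to the largest virtually contiguous run it could translate, so the walk advances one contiguous chunk at a time. A minimal sketch of the same pattern, ignoring the external cache dispatch (helper name hypothetical):

static void
flush_phys_range(pmap_paddr_t paddr, vm_size_t length)
{
	while (length > 0) {
		vm_size_t count = length;
		/* may shrink count to the chunk that is actually mapped */
		vm_offset_t vaddr = phystokv_range(paddr, &count);
		FlushPoC_DcacheRegion(vaddr, (unsigned)count);
		paddr += count;
		length -= count;
	}
}
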
@@ -113,46 +99,30 @@ clean_dcache(
        boolean_t phys)
 {
        cpu_data_t      *cpu_data_ptr = getCpuDatap();
-
-       if (phys) {
-               pmap_paddr_t    paddr;
-               vm_offset_t     vaddr;
-
-               paddr = CAST_DOWN(pmap_paddr_t, addr);
-               if (!isphysmem(paddr)) {
-                       return;
-               }
-
-               vaddr = phystokv(paddr);
-               CleanPoC_DcacheRegion((vm_offset_t) vaddr, length);
-
-               if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) {
-                       ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(
-                               cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) paddr, length);
-               }
-               return;
-       }
-
-       if (cpu_data_ptr->cpu_cache_dispatch == (cache_dispatch_t) NULL) {
-               CleanPoC_DcacheRegion((vm_offset_t) addr, length);
-       } else {
-               addr64_t        paddr;
-               uint32_t        count;
-
-               while (length > 0) {
+       vm_offset_t     vaddr;
+       addr64_t        paddr;
+       vm_size_t       count;
+
+       while (length > 0) {
+               if (phys) {
+                       count = length;
+                       paddr = CAST_DOWN(pmap_paddr_t, addr);
+                       vaddr = phystokv_range(paddr, &count);
+               } else {
+                       paddr = kvtophys(addr);
+                       vaddr = addr;
                        count = PAGE_SIZE - (addr & PAGE_MASK);
                        if (count > length) {
                                count = length;
                        }
-                       CleanPoC_DcacheRegion((vm_offset_t) addr, count);
-                       paddr = kvtophys(addr);
-                       if (paddr) {
-                               ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(
-                                       cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) paddr, count);
-                       }
-                       addr += count;
-                       length -= count;
                }
+               CleanPoC_DcacheRegion(vaddr, (unsigned)count);
+               if (paddr && (cpu_data_ptr->cpu_cache_dispatch != NULL)) {
+                       ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(
+                               cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) paddr, (unsigned)count);
+               }
+               addr += count;
+               length -= count;
        }
        return;
 }
@@ -184,8 +154,6 @@ dcache_incoherent_io_flush64(
        unsigned int remaining,
        unsigned int *res)
 {
-       vm_offset_t vaddr;
-       pmap_paddr_t paddr = CAST_DOWN(pmap_paddr_t, pa);
        cpu_data_t *cpu_data_ptr = getCpuDatap();
 
        if ((cache_info()->c_bulksize_op != 0) && (remaining >= (cache_info()->c_bulksize_op))) {
@@ -199,41 +167,38 @@ dcache_incoherent_io_flush64(
 #endif
                *res = BWOpDone;
        } else {
-               if (isphysmem(paddr)) {
-                       vaddr = phystokv(pa);
-                       {
-                               FlushPoC_DcacheRegion((vm_offset_t) vaddr, size);
-
-                               if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) {
-                                       ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(cpu_data_ptr->cpu_id, CacheCleanFlushRegion, (unsigned int) pa, size);
-                               }
-                       }
-               } else {
-                       /* slow path - pa isn't in the vtop region. Flush one page at a time via cpu_copywindows */
-                       unsigned int wimg_bits, index;
-                       uint32_t count;
-
-                       mp_disable_preemption();
-
-                       while (size > 0) {
+               vm_offset_t     vaddr;
+               pmap_paddr_t    paddr = CAST_DOWN(pmap_paddr_t, pa);
+               vm_size_t       count;
+               unsigned int    wimg_bits, index;
+
+               while (size > 0) {
+                       if (isphysmem(paddr)) {
+                               count = size;
+                               vaddr = phystokv_range(paddr, &count);
+                       } else {
                                count = PAGE_SIZE - (paddr & PAGE_MASK);
                                if (count > size) {
                                        count = size;
                                }
 
                                wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT));
+                               mp_disable_preemption();
                                index = pmap_map_cpu_windows_copy((ppnum_t) (paddr >> PAGE_SHIFT), VM_PROT_READ | VM_PROT_WRITE, wimg_bits);
                                vaddr = pmap_cpu_windows_copy_addr(cpu_number(), index) | (paddr & PAGE_MASK);
-
-                               CleanPoC_DcacheRegion((vm_offset_t) vaddr, count);
-
+                       }
+                       FlushPoC_DcacheRegion(vaddr, (unsigned)count);
+                       if (isphysmem(paddr)) {
+                               if (cpu_data_ptr->cpu_cache_dispatch != NULL) {
+                                       ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(
+                                               cpu_data_ptr->cpu_id, CacheCleanFlushRegion, (unsigned int) paddr, (unsigned)count);
+                               }
+                       } else {
                                pmap_unmap_cpu_windows_copy(index);
-
-                               paddr += count;
-                               size -= count;
+                               mp_enable_preemption();
                        }
-
-                       mp_enable_preemption();
+                       paddr += count;
+                       size -= count;
                }
        }
 
@@ -247,13 +212,12 @@ dcache_incoherent_io_store64(
        unsigned int remaining,
        unsigned int *res)
 {
-       vm_offset_t vaddr;
        pmap_paddr_t paddr = CAST_DOWN(pmap_paddr_t, pa);
        cpu_data_t *cpu_data_ptr = getCpuDatap();
 
        if (isphysmem(paddr)) {
                unsigned int wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT));
-               if ((wimg_bits == VM_WIMG_IO) || (wimg_bits == VM_WIMG_WCOMB)) {
+               if ((wimg_bits == VM_WIMG_IO) || (wimg_bits == VM_WIMG_WCOMB) || (wimg_bits == VM_WIMG_RT)) {
                        return;
                }
        }
@@ -272,41 +236,36 @@ dcache_incoherent_io_store64(
 #endif
                *res = BWOpDone;
        } else {
-               if (isphysmem(paddr)) {
-                       vaddr = phystokv(pa);
-                       {
-                               CleanPoC_DcacheRegion((vm_offset_t) vaddr, size);
-
-                               if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) {
-                                       ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) pa, size);
-                               }
-                       }
-               } else {
-                       /* slow path - pa isn't in the vtop region. Flush one page at a time via cpu_copywindows */
-                       unsigned int wimg_bits, index;
-                       uint32_t count;
-
-                       mp_disable_preemption();
-
-                       while (size > 0) {
+               vm_offset_t     vaddr;
+               vm_size_t       count;
+               unsigned int    wimg_bits, index;
+
+               while (size > 0) {
+                       if (isphysmem(paddr)) {
+                               count = size;
+                               vaddr = phystokv_range(paddr, &count);
+                       } else {
                                count = PAGE_SIZE - (paddr & PAGE_MASK);
                                if (count > size) {
                                        count = size;
                                }
-
                                wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT));
+                               mp_disable_preemption();
                                index = pmap_map_cpu_windows_copy((ppnum_t) (paddr >> PAGE_SHIFT), VM_PROT_READ | VM_PROT_WRITE, wimg_bits);
                                vaddr = pmap_cpu_windows_copy_addr(cpu_number(), index) | (paddr & PAGE_MASK);
-
-                               CleanPoC_DcacheRegion((vm_offset_t) vaddr, count);
-
+                       }
+                       CleanPoC_DcacheRegion(vaddr, (unsigned)count);
+                       if (isphysmem(paddr)) {
+                               if (cpu_data_ptr->cpu_cache_dispatch != NULL) {
+                                       ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch)(
+                                               cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) paddr, (unsigned)count);
+                               }
+                       } else {
                                pmap_unmap_cpu_windows_copy(index);
-
-                               paddr += count;
-                               size -= count;
+                               mp_enable_preemption();
                        }
-
-                       mp_enable_preemption();
+                       paddr += count;
+                       size -= count;
                }
        }
 
@@ -322,13 +281,7 @@ cache_sync_page(
 
        if (isphysmem(paddr)) {
                vm_offset_t     vaddr = phystokv(paddr);
-
-               CleanPoU_DcacheRegion(vaddr, PAGE_SIZE);
-#ifdef  __ARM_IC_NOALIAS_ICACHE__
                InvalidatePoU_IcacheRegion(vaddr, PAGE_SIZE);
-#else
-               InvalidatePoU_Icache();
-#endif
        } else {
                FlushPoC_Dcache();
                InvalidatePoU_Icache();
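
The dropped CleanPoU_DcacheRegion() call pairs with the assembler change later in this commit, where InvalidatePoU_IcacheRegion gains a leading bl to EXT(CleanPoU_DcacheRegion): the clean-before-invalidate step moved into the icache routine itself, so callers like cache_sync_page() no longer perform it by hand. The __ARM_IC_NOALIAS_ICACHE__ fallback to a whole-icache invalidate is likewise gone, leaving the region invalidate unconditional.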
@@ -406,14 +359,10 @@ platform_cache_disable(void)
        uint32_t sctlr_value = 0;
 
        /* Disable dcache allocation. */
-       __asm__ volatile ("mrc p15, 0, %0, c1, c0, 0"
-                          : "=r"(sctlr_value));
-
+       sctlr_value = __builtin_arm_mrc(MRC_SCTLR);
        sctlr_value &= ~SCTLR_DCACHE;
-
-       __asm__ volatile ("mcr p15, 0, %0, c1, c0, 0\n"
-                          "isb"
-                          :: "r"(sctlr_value));
+       __builtin_arm_mcr(MCR_SCTLR(sctlr_value));
+       __builtin_arm_isb(ISB_SY);
 #endif /* (__ARM_ARCH__ < 8) */
 }
 
@@ -421,7 +370,7 @@ void
 platform_cache_idle_enter(
        void)
 {
-#if     __ARM_SMP__
+#if __ARM_SMP__
        platform_cache_disable();
 
        /*
@@ -438,42 +387,39 @@ platform_cache_idle_enter(
 #if (__ARM_ARCH__ < 8)
                cpu_data_t      *cpu_data_ptr = getCpuDatap();
                cpu_data_ptr->cpu_CLW_active = 0;
-               __asm__ volatile ("dmb ish");
+               __builtin_arm_dmb(DMB_ISH);
                cpu_data_ptr->cpu_CLWFlush_req = 0;
                cpu_data_ptr->cpu_CLWClean_req = 0;
                CleanPoC_DcacheRegion((vm_offset_t) cpu_data_ptr, sizeof(cpu_data_t));
 #endif /* (__ARM_ARCH__ < 8) */
        }
-#else
+#else /* !__ARM_SMP__ */
        CleanPoU_Dcache();
-#endif
+#endif /* !__ARM_SMP__ */
 
-#if      defined (__ARM_SMP__) && defined (ARMA7)
+#if defined(__ARM_SMP__) && defined(ARMA7)
        uint32_t actlr_value = 0;
 
        /* Leave the coherency domain */
-       __asm__ volatile ("clrex\n"
-                          "mrc p15, 0, %0, c1, c0, 1\n"
-                          : "=r"(actlr_value));
-
+       __builtin_arm_clrex();
+       actlr_value = __builtin_arm_mrc(MRC_ACTLR);
        actlr_value &= ~0x40;
 
-       __asm__ volatile ("mcr p15, 0, %0, c1, c0, 1\n"
-                         /* Ensures any pending fwd request gets serviced and ends up */
-                          "dsb\n"
-                         /* Forces the processor to re-fetch, so any pending fwd request gets into the core */
-                          "isb\n"
-                         /* Ensures the second possible pending fwd request ends up. */
-                          "dsb\n"
-                          :: "r"(actlr_value));
-#endif
+       __builtin_arm_mcr(MCR_ACTLR(actlr_value));
+       /* Ensures any pending fwd request gets serviced and ends up */
+       __builtin_arm_dsb(DSB_SY);
+       /* Forces the processor to re-fetch, so any pending fwd request gets into the core */
+       __builtin_arm_isb(ISB_SY);
+       /* Ensures the second possible pending fwd request ends up. */
+       __builtin_arm_dsb(DSB_SY);
+#endif /* defined(__ARM_SMP__) && defined(ARMA7) */
 }
 
 void
 platform_cache_idle_exit(
        void)
 {
-#if defined (ARMA7)
+#if defined(ARMA7)
        uint32_t actlr_value = 0;
 
        /* Flush L1 caches and TLB before rejoining the coherency domain */
@@ -491,30 +437,22 @@ platform_cache_idle_exit(
        }
 
        /* Rejoin the coherency domain */
-       __asm__ volatile ("mrc p15, 0, %0, c1, c0, 1\n"
-                          : "=r"(actlr_value));
-
+       actlr_value = __builtin_arm_mrc(MRC_ACTLR);
        actlr_value |= 0x40;
-
-       __asm__ volatile ("mcr p15, 0, %0, c1, c0, 1\n"
-                          "isb\n"
-                          :: "r"(actlr_value));
+       __builtin_arm_mcr(MCR_ACTLR(actlr_value));
+       __builtin_arm_isb(ISB_SY);
 
 #if __ARM_SMP__
        uint32_t sctlr_value = 0;
 
        /* Enable dcache allocation. */
-       __asm__ volatile ("mrc p15, 0, %0, c1, c0, 0\n"
-                          : "=r"(sctlr_value));
-
+       sctlr_value = __builtin_arm_mrc(MRC_SCTLR);
        sctlr_value |= SCTLR_DCACHE;
-
-       __asm__ volatile ("mcr p15, 0, %0, c1, c0, 0\n"
-                          "isb"
-                          :: "r"(sctlr_value));
+       __builtin_arm_mcr(MCR_SCTLR(sctlr_value));
+       __builtin_arm_isb(ISB_SY);
        getCpuDatap()->cpu_CLW_active = 1;
-#endif
-#endif
+#endif /* __ARM_SMP__ */
+#endif /* defined(ARMA7) */
 }
 
 boolean_t
@@ -603,7 +541,7 @@ cache_xcall(unsigned int op)
                } else if (op == LWClean) {
                        target_cdp->cpu_CLWClean_req = abstime;
                }
-               __asm__ volatile ("dmb ish");
+               __builtin_arm_dmb(DMB_ISH);
                if (target_cdp->cpu_CLW_active == 0) {
                        if (op == LWFlush) {
                                target_cdp->cpu_CLWFlush_req = 0x0ULL;
@@ -675,7 +613,7 @@ flush_dcache(
        __unused unsigned length,
        __unused boolean_t phys)
 {
-       __asm__ volatile ("dsb sy");
+       __builtin_arm_dsb(DSB_SY);
 }
 
 void
@@ -684,7 +622,7 @@ clean_dcache(
        __unused unsigned length,
        __unused boolean_t phys)
 {
-       __asm__ volatile ("dsb sy");
+       __builtin_arm_dsb(DSB_SY);
 }
 
 void
@@ -692,7 +630,7 @@ flush_dcache_syscall(
        __unused vm_offset_t va,
        __unused unsigned length)
 {
-       __asm__ volatile ("dsb sy");
+       __builtin_arm_dsb(DSB_SY);
 }
 
 void
@@ -702,7 +640,7 @@ dcache_incoherent_io_flush64(
        __unused unsigned int remaining,
        __unused unsigned int *res)
 {
-       __asm__ volatile ("dsb sy");
+       __builtin_arm_dsb(DSB_SY);
        *res = LWOpDone;
        return;
 }
@@ -714,7 +652,7 @@ dcache_incoherent_io_store64(
        __unused unsigned int remaining,
        __unused unsigned int *res)
 {
-       __asm__ volatile ("dsb sy");
+       __builtin_arm_dsb(DSB_SY);
        *res = LWOpDone;
        return;
 }
@@ -728,12 +666,7 @@ cache_sync_page(
 
        if (isphysmem(paddr)) {
                vm_offset_t     vaddr = phystokv(paddr);
-
-#ifdef  __ARM_IC_NOALIAS_ICACHE__
                InvalidatePoU_IcacheRegion(vaddr, PAGE_SIZE);
-#else
-               InvalidatePoU_Icache();
-#endif
        }
 }
 
index b4e6a94c801d73f48dc750b99cdd33053f1c8591..0b305f48f613be90726ad2ce097f88fce74f0288 100644 (file)
        .globl EXT(invalidate_mmu_cache)
 LEXT(invalidate_mmu_cache)
        mov             r0, #0
+       dsb
        mcr             p15, 0, r0, c7, c7, 0                           // Invalidate caches
+       dsb
+       isb
        bx              lr
 
 /*
@@ -56,7 +59,9 @@ LEXT(invalidate_mmu_cache)
        .globl EXT(invalidate_mmu_dcache)
 LEXT(invalidate_mmu_dcache)
        mov             r0, #0
+       dsb
        mcr             p15, 0, r0, c7, c6, 0                           // Invalidate dcache
+       dsb
        bx              lr
 
 /*
@@ -73,12 +78,13 @@ LEXT(invalidate_mmu_dcache_region)
        add             r1, r1, r2
        sub             r1, r1, #1
        mov             r1, r1, LSR #MMU_CLINE                          // Set cache line counter
+       dsb
 fmdr_loop:
        mcr             p15, 0, r0, c7, c14, 1                          // Invalidate dcache line
        add             r0, r0, #1<<MMU_CLINE                           // Get next cache aligned addr
        subs    r1, r1, #1                                                      // Decrementer cache line counter
        bpl             fmdr_loop                                                       // Loop in counter not null
-       isb
+       dsb
        bx              lr
 
 /*
@@ -93,7 +99,10 @@ fmdr_loop:
 LEXT(InvalidatePoU_Icache)
 LEXT(invalidate_mmu_icache)
        mov     r0, #0
+       dsb
        mcr     p15, 0, r0, c7, c5, 0                           // Invalidate icache
+       dsb
+       isb
        bx              lr
 
 /*
@@ -105,6 +114,9 @@ LEXT(invalidate_mmu_icache)
        .align 2
        .globl EXT(InvalidatePoU_IcacheRegion)
 LEXT(InvalidatePoU_IcacheRegion)
+       push            {r7,lr}
+       mov             r7, sp
+       bl              EXT(CleanPoU_DcacheRegion)
        and             r2, r0, #((1<<MMU_I_CLINE)-1)
        bic             r0, r0, #((1<<MMU_I_CLINE)-1)                   // Cached aligned 
        add             r1, r1, r2
@@ -115,7 +127,9 @@ fmir_loop:
        add             r0, r0, #1<<MMU_I_CLINE                         // Get next cache aligned addr
        subs    r1, r1, #1                                                      // Decrementer cache line counter
        bpl             fmir_loop                                                       // Loop in counter not null
-       bx              lr
+       dsb
+       isb
+       pop             {r7,pc}
 
 /*
  * void CleanPoC_Dcache(void)
@@ -130,6 +144,7 @@ LEXT(CleanPoC_Dcache)
 LEXT(clean_mmu_dcache)
 #if    !defined(__ARM_L1_WT_CACHE__)
        mov             r0, #0
+       dsb
 clean_dcacheway:
 clean_dcacheline:              
        mcr             p15, 0, r0, c7, c10, 2                           // clean dcache line by way/set
@@ -167,6 +182,7 @@ clean_l2dcacheline:
 LEXT(CleanPoU_Dcache)
 #if    !defined(__ARM_PoU_WT_CACHE__)
        mov             r0, #0
+       dsb
 clean_dcacheway_idle:
 clean_dcacheline_idle:         
        mcr             p15, 0, r0, c7, c10, 2                           // clean dcache line by way/set
@@ -192,14 +208,15 @@ LEXT(CleanPoU_DcacheRegion)
 #if    !defined(__ARM_PoU_WT_CACHE__)
 
        and             r2, r0, #((1<<MMU_CLINE)-1)
-       bic             r0, r0, #((1<<MMU_CLINE)-1)                     // Cached aligned 
-       add             r1, r1, r2
-       sub             r1, r1, #1
-       mov             r1, r1, LSR #MMU_CLINE                          // Set cache line counter
+       bic             r3, r0, #((1<<MMU_CLINE)-1)                     // Cached aligned 
+       add             r12, r1, r2
+       sub             r12, r12, #1
+       mov             r12, r12, LSR #MMU_CLINE                                // Set cache line counter
+       dsb
 cudr_loop:
-       mcr             p15, 0, r0, c7, c11, 1                          // Clean dcache line to PoU
-       add             r0, r0, #1<<MMU_CLINE                           // Get next cache aligned addr
-       subs    r1, r1, #1                                                      // Decrementer cache line counter
+       mcr             p15, 0, r3, c7, c11, 1                          // Clean dcache line to PoU
+       add             r3, r3, #1<<MMU_CLINE                           // Get next cache aligned addr
+       subs    r12, r12, #1                                                    // Decrementer cache line counter
        bpl             cudr_loop                                                       // Loop in counter not null
 
 #endif
@@ -240,6 +257,7 @@ ccdr_loop:
        .globl EXT(FlushPoC_Dcache)
 LEXT(FlushPoC_Dcache)
        mov             r0, #0
+       dsb
 cleanflush_dcacheway:
 cleanflush_dcacheline:         
        mcr             p15, 0, r0, c7, c14, 2                           // cleanflush dcache line by way/set
@@ -275,6 +293,7 @@ cleanflush_l2dcacheline:
        .globl EXT(FlushPoU_Dcache)
 LEXT(FlushPoU_Dcache)
        mov             r0, #0
+       dsb
 fpud_way:
 fpud_line:             
        mcr             p15, 0, r0, c7, c14, 2                           // cleanflush dcache line by way/set
@@ -301,6 +320,7 @@ LEXT(FlushPoC_DcacheRegion)
        add             r1, r1, r2
        sub             r1, r1, #1
        mov             r1, r1, LSR #MMU_CLINE                          // Set cache line counter
+       dsb
 cfmdr_loop:
        mcr             p15, 0, r0, c7, c14, 1                          // Clean & invalidate dcache line
        add             r0, r0, #1<<MMU_CLINE                           // Get next cache aligned addr
index a9b48ce7ee7b9824dbcf8538b1e105c05cc97757..2aebfb3ce90b8883313c72dd98bfb5c781252bec 100644 (file)
@@ -42,6 +42,7 @@
 #include <mach/vm_map.h>
 #include <machine/cpu_capabilities.h>
 #include <machine/commpage.h>
+#include <machine/config.h>
 #include <machine/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
@@ -70,6 +71,7 @@ SECURITY_READ_ONLY_LATE(uint32_t)       _cpu_capabilities = 0;
 /* For sysctl access from BSD side */
 extern int      gARMv81Atomics;
 extern int      gARMv8Crc32;
+extern int      gARMv82FHM;
 
 void
 commpage_populate(
@@ -81,6 +83,14 @@ commpage_populate(
        sharedpage_rw_addr = pmap_create_sharedpage();
        commPagePtr = (vm_address_t)_COMM_PAGE_BASE_ADDRESS;
 
+#if __arm64__
+       bcopy(_COMM_PAGE64_SIGNATURE_STRING, (void *)(_COMM_PAGE_SIGNATURE + _COMM_PAGE_RW_OFFSET),
+           MIN(_COMM_PAGE_SIGNATURELEN, strlen(_COMM_PAGE64_SIGNATURE_STRING)));
+#else
+       bcopy(_COMM_PAGE32_SIGNATURE_STRING, (void *)(_COMM_PAGE_SIGNATURE + _COMM_PAGE_RW_OFFSET),
+           MIN(_COMM_PAGE_SIGNATURELEN, strlen(_COMM_PAGE32_SIGNATURE_STRING)));
+#endif
+
        *((uint16_t*)(_COMM_PAGE_VERSION + _COMM_PAGE_RW_OFFSET)) = (uint16_t) _COMM_PAGE_THIS_VERSION;
 
        commpage_init_cpu_capabilities();
@@ -108,14 +118,14 @@ commpage_populate(
        *((uint64_t*)(_COMM_PAGE_MEMORY_SIZE + _COMM_PAGE_RW_OFFSET)) = machine_info.max_mem;
        *((uint32_t*)(_COMM_PAGE_CPUFAMILY + _COMM_PAGE_RW_OFFSET)) = (uint32_t)cpufamily;
        *((uint32_t*)(_COMM_PAGE_DEV_FIRM + _COMM_PAGE_RW_OFFSET)) = (uint32_t)PE_i_can_has_debugger(NULL);
-       *((uint8_t*)(_COMM_PAGE_USER_TIMEBASE + _COMM_PAGE_RW_OFFSET)) = user_timebase_allowed();
+       *((uint8_t*)(_COMM_PAGE_USER_TIMEBASE + _COMM_PAGE_RW_OFFSET)) = user_timebase_type();
        *((uint8_t*)(_COMM_PAGE_CONT_HWCLOCK + _COMM_PAGE_RW_OFFSET)) = user_cont_hwclock_allowed();
        *((uint8_t*)(_COMM_PAGE_KERNEL_PAGE_SHIFT + _COMM_PAGE_RW_OFFSET)) = (uint8_t) page_shift;
 
 #if __arm64__
        *((uint8_t*)(_COMM_PAGE_USER_PAGE_SHIFT_32 + _COMM_PAGE_RW_OFFSET)) = (uint8_t) page_shift_user32;
        *((uint8_t*)(_COMM_PAGE_USER_PAGE_SHIFT_64 + _COMM_PAGE_RW_OFFSET)) = (uint8_t) SIXTEENK_PAGE_SHIFT;
-#elif (__ARM_ARCH_7K__ >= 2) && defined(PLATFORM_WatchOS)
+#elif (__ARM_ARCH_7K__ >= 2)
        /* enforce 16KB alignment for watch targets with new ABI */
        *((uint8_t*)(_COMM_PAGE_USER_PAGE_SHIFT_32 + _COMM_PAGE_RW_OFFSET)) = (uint8_t) SIXTEENK_PAGE_SHIFT;
        *((uint8_t*)(_COMM_PAGE_USER_PAGE_SHIFT_64 + _COMM_PAGE_RW_OFFSET)) = (uint8_t) SIXTEENK_PAGE_SHIFT;
@@ -243,6 +253,12 @@ commpage_cpus( void )
        return cpus;
 }
 
+int
+_get_cpu_capabilities(void)
+{
+       return _cpu_capabilities;
+}
+
 vm_address_t
 _get_commpage_priv_address(void)
 {
@@ -323,7 +339,15 @@ commpage_init_cpu_capabilities( void )
                bits |= kHasARMv8Crc32;
                gARMv8Crc32 = 1;
        }
+       if ((isar0 & ID_AA64ISAR0_EL1_FHM_MASK) >= ID_AA64ISAR0_EL1_FHM_8_2) {
+               bits |= kHasARMv82FHM;
+               gARMv82FHM = 1;
+       }
 #endif
+
+
+
+
        _cpu_capabilities = bits;
 
        *((uint32_t *)(_COMM_PAGE_CPU_CAPABILITIES + _COMM_PAGE_RW_OFFSET)) = _cpu_capabilities;
@@ -513,10 +537,32 @@ commpage_increment_cpu_quiescent_counter(void)
         * the cpu mask, relaxed loads and stores is more efficient.
         */
 #if __LP64__
-       old_gen = atomic_load_explicit(sched_gen, memory_order_relaxed);
-       atomic_store_explicit(sched_gen, old_gen + 1, memory_order_relaxed);
+       old_gen = os_atomic_load(sched_gen, relaxed);
+       os_atomic_store(sched_gen, old_gen + 1, relaxed);
 #else
        old_gen = atomic_fetch_add_explicit(sched_gen, 1, memory_order_relaxed);
 #endif
        return old_gen;
 }
+
+/*
+ * update the commpage to reflect whether dtrace userland probes are enabled
+ */
+void
+commpage_update_dof(boolean_t enabled)
+{
+#if CONFIG_DTRACE
+       *((uint8_t*)(_COMM_PAGE_DTRACE_DOF_ENABLED + _COMM_PAGE_RW_OFFSET)) = (enabled ? 1 : 0);
+#else
+       (void)enabled;
+#endif
+}
+
+/*
+ * update the dyld global config flags
+ */
+void
+commpage_update_dyld_flags(uint64_t value)
+{
+       *((uint64_t*)(_COMM_PAGE_DYLD_SYSTEM_FLAGS + _COMM_PAGE_RW_OFFSET)) = value;
+}
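
A user-side sketch of consuming the new capability bit published above, assuming <machine/cpu_capabilities.h> exposes _COMM_PAGE_CPU_CAPABILITIES and kHasARMv82FHM to user code the way the existing bits are:

#include <machine/cpu_capabilities.h>
#include <stdbool.h>
#include <stdint.h>

static bool
cpu_supports_fhm(void)
{
	/* the comm page is mapped read-only into every task */
	uint32_t caps = *(volatile uint32_t *)_COMM_PAGE_CPU_CAPABILITIES;
	return (caps & kHasARMv82FHM) != 0;
}
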
index 6eeb6379995cc934f01224130697a61f55779068..ee124d4b1fe86c131c898b9ef41726dab0058fc6 100644 (file)
@@ -47,5 +47,7 @@ extern  void    commpage_update_multiuser_config(uint32_t);
 extern  void    commpage_update_boottime(uint64_t boottime_usec);
 extern  void    commpage_set_remotetime_params(double rate, uint64_t base_local_ts, uint64_t base_remote_ts);
 extern uint64_t commpage_increment_cpu_quiescent_counter(void);
+extern  void    commpage_update_dof(boolean_t enabled);
+extern  void    commpage_update_dyld_flags(uint64_t value);
 
 #endif /* _ARM_COMMPAGE_H */
index 4109f698ec55a4635e43af3553e50182f573ddc3..72e8c780048ac24c1779447851976500f99c4f1e 100644 (file)
@@ -305,11 +305,17 @@ cpu_stack_alloc(cpu_data_t *cpu_data_ptr)
 void
 cpu_data_free(cpu_data_t *cpu_data_ptr)
 {
-       if (cpu_data_ptr == &BootCpuData) {
+       if ((cpu_data_ptr == NULL) || (cpu_data_ptr == &BootCpuData)) {
                return;
        }
 
        cpu_processor_free( cpu_data_ptr->cpu_processor);
+       if (CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_vaddr == cpu_data_ptr) {
+               OSDecrementAtomic((SInt32*)&real_ncpus);
+               CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_vaddr = NULL;
+               CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_paddr = 0;
+               __builtin_arm_dmb(DMB_ISH); // Ensure prior stores to cpu array are visible
+       }
        (kfree)((void *)(cpu_data_ptr->intstack_top - INTSTACK_SIZE), INTSTACK_SIZE);
        (kfree)((void *)(cpu_data_ptr->fiqstack_top - FIQSTACK_SIZE), FIQSTACK_SIZE);
        kmem_free(kernel_map, (vm_offset_t)cpu_data_ptr, sizeof(cpu_data_t));
@@ -351,12 +357,6 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
 
        cpu_data_ptr->cpu_signal = SIGPdisabled;
 
-#if DEBUG || DEVELOPMENT
-       cpu_data_ptr->failed_xcall = NULL;
-       cpu_data_ptr->failed_signal = 0;
-       cpu_data_ptr->failed_signal_count = 0;
-#endif
-
        cpu_data_ptr->cpu_get_fiq_handler = NULL;
        cpu_data_ptr->cpu_tbd_hardware_addr = NULL;
        cpu_data_ptr->cpu_tbd_hardware_val = NULL;
@@ -366,6 +366,8 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
        cpu_data_ptr->cpu_sleep_token_last = 0x00000000UL;
        cpu_data_ptr->cpu_xcall_p0 = NULL;
        cpu_data_ptr->cpu_xcall_p1 = NULL;
+       cpu_data_ptr->cpu_imm_xcall_p0 = NULL;
+       cpu_data_ptr->cpu_imm_xcall_p1 = NULL;
 
 #if     __ARM_SMP__ && defined(ARMA7)
        cpu_data_ptr->cpu_CLWFlush_req = 0x0ULL;
@@ -398,6 +400,7 @@ cpu_data_register(cpu_data_t *cpu_data_ptr)
        }
 
        cpu_data_ptr->cpu_number = cpu;
+       __builtin_arm_dmb(DMB_ISH); // Ensure prior stores to cpu data are visible
        CpuDataEntries[cpu].cpu_data_vaddr = cpu_data_ptr;
        CpuDataEntries[cpu].cpu_data_paddr = (void *)ml_vtophys((vm_offset_t)cpu_data_ptr);
        return KERN_SUCCESS;
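
(Illustration of the publish/retract pattern the two new DMB_ISH barriers
implement; condensed from cpu_data_register() and cpu_data_free() above.)

        /* Publish: initialize every field other CPUs may read, make those
         * stores visible, then expose the pointer. */
        cpu_data_ptr->cpu_number = cpu;
        __builtin_arm_dmb(DMB_ISH);             /* init stores before publish */
        CpuDataEntries[cpu].cpu_data_vaddr = cpu_data_ptr;

        /* Retract: hide the pointer first, then fence so no observer can
         * still see the entry once the backing memory is freed. */
        CpuDataEntries[cpu].cpu_data_vaddr = NULL;
        __builtin_arm_dmb(DMB_ISH);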
@@ -420,8 +423,8 @@ cpu_start(int cpu)
 
                cpu_data_ptr->cpu_pmap_cpu_data.cpu_user_pmap = NULL;
 
-               if (cpu_data_ptr->cpu_processor->next_thread != THREAD_NULL) {
-                       first_thread = cpu_data_ptr->cpu_processor->next_thread;
+               if (cpu_data_ptr->cpu_processor->startup_thread != THREAD_NULL) {
+                       first_thread = cpu_data_ptr->cpu_processor->startup_thread;
                } else {
                        first_thread = cpu_data_ptr->cpu_processor->idle_thread;
                }
@@ -594,8 +597,8 @@ void
 machine_track_platform_idle(boolean_t entry)
 {
        if (entry) {
-               (void)__c11_atomic_fetch_add(&cpu_idle_count, 1, __ATOMIC_RELAXED);
+               os_atomic_inc(&cpu_idle_count, relaxed);
        } else {
-               (void)__c11_atomic_fetch_sub(&cpu_idle_count, 1, __ATOMIC_RELAXED);
+               os_atomic_dec(&cpu_idle_count, relaxed);
        }
 }
index c43156cebb05ab70df5d7309c330f7ed2b41adff..b0f2b3fdad8aaea0a84edf28221d089c1306b024 100644 (file)
@@ -35,6 +35,9 @@
 #include <mach/vm_types.h>
 #endif
 
+#define USER_TIMEBASE_NONE   0
+#define USER_TIMEBASE_SPEC   1
+
 /*
  * This is the authoritative way to determine from user mode what
  * implementation-specific processor features are available.
@@ -45,6 +48,8 @@
 /*
  * Bit definitions for _cpu_capabilities:
  */
+#define kHasICDSBShift                  2
+#define kHasICDSB                       0x00000004      // ICache Data Synchronization on DSB enabled (H13)
 #define kHasNeonFP16                    0x00000008      // ARM v8.2 NEON FP16 supported
 #define kCache32                        0x00000010      // cache line size is 32 bytes
 #define kCache64                        0x00000020      // cache line size is 64 bytes
 #define kHasNeon                        0x00000100      // Advanced SIMD is supported
 #define kHasNeonHPFP                    0x00000200      // Advanced SIMD half-precision
 #define kHasVfp                         0x00000400      // VFP is supported
+#define kHasUCNormalMemory              0x00000800      // Uncacheable normal memory type supported
 #define kHasEvent                       0x00001000      // WFE/SEV and periodic event wakeup
 #define kHasFMA                         0x00002000      // Fused multiply add is supported
+#define kHasARMv82FHM                   0x00004000      // Optional ARMv8.2 FMLAL/FMLSL instructions (required in ARMv8.4)
 #define kUP                             0x00008000      // set if (kNumCPUs == 1)
 #define kNumCPUs                        0x00FF0000      // number of CPUs (see _NumCPUs() below)
 #define kHasARMv8Crypto                 0x01000000      // Optional ARMv8 Crypto extensions
 #define kHasARMv81Atomics               0x02000000      // ARMv8.1 Atomic instructions supported
 #define kHasARMv8Crc32                  0x04000000      // Optional ARMv8 crc32 instructions (required in ARMv8.1)
 
-#define kNumCPUsShift           16                      // see _NumCPUs() below
-
+#define kNumCPUsShift                   16              // see _NumCPUs() below
 /*
  * Bit definitions for multiuser_config:
  */
@@ -72,7 +78,9 @@
 #ifndef __ASSEMBLER__
 #include <sys/commpage.h>
 
+__BEGIN_DECLS
 extern int  _get_cpu_capabilities( void );
+__END_DECLS
 
 __inline static
 int
@@ -81,6 +89,7 @@ _NumCPUs( void )
        return (_get_cpu_capabilities() & kNumCPUs) >> kNumCPUsShift;
 }
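
(Illustration: a capability check built from the accessor and bit definitions
in this header; the helper name is hypothetical, and the bit tested is the
ARMv8.2 FHM flag added by this change.)

static inline int
cpu_supports_fp16_fml(void)
{
        /* kHasARMv82FHM is set when ID_AA64ISAR0_EL1.FHM advertises the
         * FMLAL/FMLSL instructions (see commpage_init_cpu_capabilities). */
        return (_get_cpu_capabilities() & kHasARMv82FHM) != 0;
}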
 
+
 typedef struct {
        volatile uint64_t       TimeBase;
        volatile uint32_t       TimeStamp_sec;
@@ -92,7 +101,9 @@ typedef struct {
        volatile uint32_t       TimeBase_shift;
 } commpage_timeofday_data_t;
 
+__BEGIN_DECLS
 extern vm_address_t                             _get_commpage_priv_address(void);
+__END_DECLS
 
 #endif /* __ASSEMBLER__ */
 
@@ -166,6 +177,7 @@ extern vm_address_t                             _get_commpage_priv_address(void)
  * apply _COMM_PAGE_PRIV macro to use these in privileged mode
  */
 #define _COMM_PAGE_SIGNATURE                    (_COMM_PAGE_START_ADDRESS+0x000)        // first few bytes are a signature
+#define _COMM_PAGE_SIGNATURELEN                 (0x10)
 #define _COMM_PAGE_VERSION                      (_COMM_PAGE_START_ADDRESS+0x01E)        // 16-bit version#
 #define _COMM_PAGE_THIS_VERSION                 3                                       // version of the commarea format
 
@@ -188,7 +200,8 @@ extern vm_address_t                             _get_commpage_priv_address(void)
 #define _COMM_PAGE_TIMEBASE_OFFSET              (_COMM_PAGE_START_ADDRESS+0x088)        // uint64_t timebase offset for constructing mach_absolute_time()
 #define _COMM_PAGE_USER_TIMEBASE                (_COMM_PAGE_START_ADDRESS+0x090)        // uint8_t is userspace mach_absolute_time supported (can read the timebase)
 #define _COMM_PAGE_CONT_HWCLOCK                 (_COMM_PAGE_START_ADDRESS+0x091)        // uint8_t is always-on hardware clock present for mach_continuous_time()
-#define _COMM_PAGE_UNUSED0                      (_COMM_PAGE_START_ADDRESS+0x092)        // 6 unused bytes
+#define _COMM_PAGE_DTRACE_DOF_ENABLED           (_COMM_PAGE_START_ADDRESS+0x092)        // uint8_t 0 if userspace DOF disabled, 1 if enabled
+#define _COMM_PAGE_UNUSED0                      (_COMM_PAGE_START_ADDRESS+0x093)        // 5 unused bytes
 #define _COMM_PAGE_CONT_TIMEBASE                (_COMM_PAGE_START_ADDRESS+0x098)        // uint64_t base for mach_continuous_time()
 #define _COMM_PAGE_BOOTTIME_USEC                (_COMM_PAGE_START_ADDRESS+0x0A0)        // uint64_t boottime in microseconds
 
@@ -204,6 +217,7 @@ extern vm_address_t                             _get_commpage_priv_address(void)
 
 #define _COMM_PAGE_NEWTIMEOFDAY_DATA            (_COMM_PAGE_START_ADDRESS+0x120)        // used by gettimeofday(). Currently, sizeof(new_commpage_timeofday_data_t) = 40.
 #define _COMM_PAGE_REMOTETIME_PARAMS            (_COMM_PAGE_START_ADDRESS+0x148)        // used by mach_bridge_remote_time(). Currently, sizeof(struct bt_params) = 24
+#define _COMM_PAGE_DYLD_SYSTEM_FLAGS            (_COMM_PAGE_START_ADDRESS+0x160)        // uint64_t export kern.dyld_system_flags to userspace
 
 // aligning to 128 bytes for cacheline/fabric size
 #define _COMM_PAGE_CPU_QUIESCENT_COUNTER        (_COMM_PAGE_START_ADDRESS+0x180)        // uint64_t, but reserve the whole 128 (0x80) bytes
index 85f1cf13bdc112ac64b59c8fae70bf666a1b1243..327434ece8c53a963414d7afbd9471b7cefd21a9 100644 (file)
@@ -150,7 +150,9 @@ cpu_info(processor_flavor_t flavor, int slot_num, processor_info_t info,
                cpu_stat->vfp_shortv_cnt = 0;
                cpu_stat->data_ex_cnt = cpu_data_ptr->cpu_stat.data_ex_cnt;
                cpu_stat->instr_ex_cnt = cpu_data_ptr->cpu_stat.instr_ex_cnt;
-               cpu_stat->pmi_cnt = cpu_data_ptr->cpu_stat.pmi_cnt;
+#if MONOTONIC
+               cpu_stat->pmi_cnt = cpu_data_ptr->cpu_monotonic.mtc_npmis;
+#endif /* MONOTONIC */
 
                *count = PROCESSOR_CPU_STAT64_COUNT;
 
@@ -207,7 +209,7 @@ cpu_handle_xcall(cpu_data_t *cpu_data_ptr)
        broadcastFunc   xfunc;
        void            *xparam;
 
-       __c11_atomic_thread_fence(memory_order_acquire_smp);
+       os_atomic_thread_fence(acquire);
        /* Come back around if cpu_signal_internal is running on another CPU and has just
        * added SIGPxcall to the pending mask, but hasn't yet assigned the call params. */
        if (cpu_data_ptr->cpu_xcall_p0 != NULL && cpu_data_ptr->cpu_xcall_p1 != NULL) {
@@ -215,14 +217,24 @@ cpu_handle_xcall(cpu_data_t *cpu_data_ptr)
                xparam = cpu_data_ptr->cpu_xcall_p1;
                cpu_data_ptr->cpu_xcall_p0 = NULL;
                cpu_data_ptr->cpu_xcall_p1 = NULL;
-               __c11_atomic_thread_fence(memory_order_acq_rel_smp);
-               hw_atomic_and_noret(&cpu_data_ptr->cpu_signal, ~SIGPxcall);
+               os_atomic_thread_fence(acq_rel);
+               os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPxcall, relaxed);
+               xfunc(xparam);
+       }
+       if (cpu_data_ptr->cpu_imm_xcall_p0 != NULL && cpu_data_ptr->cpu_imm_xcall_p1 != NULL) {
+               xfunc = cpu_data_ptr->cpu_imm_xcall_p0;
+               xparam = cpu_data_ptr->cpu_imm_xcall_p1;
+               cpu_data_ptr->cpu_imm_xcall_p0 = NULL;
+               cpu_data_ptr->cpu_imm_xcall_p1 = NULL;
+               os_atomic_thread_fence(acq_rel);
+               os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPxcallImm, relaxed);
                xfunc(xparam);
        }
 }
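
(Why the handler re-checks: a simplified view of the sender side in
cpu_signal_internal(), showing that the pending bit is set before the
parameters are written; variable names are illustrative.)

        /* Sender: reserve the slot by setting the bit... */
        OSCompareAndSwap(sig & ~SIGPxcall, sig | SIGPxcall, &target->cpu_signal);
        /* ...then publish the parameters. A handler that observes the bit
         * with NULL params simply comes back around on the next signal. */
        target->cpu_xcall_p0 = p0;
        target->cpu_xcall_p1 = p1;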
 
-unsigned int
-cpu_broadcast_xcall(uint32_t *synch,
+static unsigned int
+cpu_broadcast_xcall_internal(unsigned int signal,
+    uint32_t *synch,
     boolean_t self_xcall,
     broadcastFunc func,
     void *parm)
@@ -232,7 +244,7 @@ cpu_broadcast_xcall(uint32_t *synch,
        cpu_data_t      *target_cpu_datap;
        unsigned int    failsig;
        int             cpu;
-       int             max_cpu;
+       int             max_cpu = ml_get_max_cpu_number() + 1;
 
        intr = ml_set_interrupts_enabled(FALSE);
        cpu_data_ptr = getCpuDatap();
@@ -240,19 +252,19 @@ cpu_broadcast_xcall(uint32_t *synch,
        failsig = 0;
 
        if (synch != NULL) {
-               *synch = real_ncpus;
+               *synch = max_cpu;
                assert_wait((event_t)synch, THREAD_UNINT);
        }
 
-       max_cpu = ml_get_max_cpu_number();
-       for (cpu = 0; cpu <= max_cpu; cpu++) {
+       for (cpu = 0; cpu < max_cpu; cpu++) {
                target_cpu_datap = (cpu_data_t *)CpuDataEntries[cpu].cpu_data_vaddr;
 
-               if ((target_cpu_datap == NULL) || (target_cpu_datap == cpu_data_ptr)) {
+               if (target_cpu_datap == cpu_data_ptr) {
                        continue;
                }
 
-               if (KERN_SUCCESS != cpu_signal(target_cpu_datap, SIGPxcall, (void *)func, parm)) {
+               if ((target_cpu_datap == NULL) ||
+                   KERN_SUCCESS != cpu_signal(target_cpu_datap, signal, (void *)func, parm)) {
                        failsig++;
                }
        }
@@ -265,7 +277,7 @@ cpu_broadcast_xcall(uint32_t *synch,
        (void) ml_set_interrupts_enabled(intr);
 
        if (synch != NULL) {
-               if (hw_atomic_sub(synch, (!self_xcall)? failsig + 1 : failsig) == 0) {
+               if (os_atomic_sub(synch, (!self_xcall) ? failsig + 1 : failsig, relaxed) == 0) {
                        clear_wait(current_thread(), THREAD_AWAKENED);
                } else {
                        thread_block(THREAD_CONTINUE_NULL);
@@ -273,14 +285,32 @@ cpu_broadcast_xcall(uint32_t *synch,
        }
 
        if (!self_xcall) {
-               return real_ncpus - failsig - 1;
+               return max_cpu - failsig - 1;
        } else {
-               return real_ncpus - failsig;
+               return max_cpu - failsig;
        }
 }
 
-kern_return_t
-cpu_xcall(int cpu_number, broadcastFunc func, void *param)
+unsigned int
+cpu_broadcast_xcall(uint32_t *synch,
+    boolean_t self_xcall,
+    broadcastFunc func,
+    void *parm)
+{
+       return cpu_broadcast_xcall_internal(SIGPxcall, synch, self_xcall, func, parm);
+}
+
+unsigned int
+cpu_broadcast_immediate_xcall(uint32_t *synch,
+    boolean_t self_xcall,
+    broadcastFunc func,
+    void *parm)
+{
+       return cpu_broadcast_xcall_internal(SIGPxcallImm, synch, self_xcall, func, parm);
+}
+
+static kern_return_t
+cpu_xcall_internal(unsigned int signal, int cpu_number, broadcastFunc func, void *param)
 {
        cpu_data_t      *target_cpu_datap;
 
@@ -288,12 +318,28 @@ cpu_xcall(int cpu_number, broadcastFunc func, void *param)
                return KERN_INVALID_ARGUMENT;
        }
 
+       if (func == NULL || param == NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
        target_cpu_datap = (cpu_data_t*)CpuDataEntries[cpu_number].cpu_data_vaddr;
        if (target_cpu_datap == NULL) {
                return KERN_INVALID_ARGUMENT;
        }
 
-       return cpu_signal(target_cpu_datap, SIGPxcall, (void*)func, param);
+       return cpu_signal(target_cpu_datap, signal, (void*)func, param);
+}
+
+kern_return_t
+cpu_xcall(int cpu_number, broadcastFunc func, void *param)
+{
+       return cpu_xcall_internal(SIGPxcall, cpu_number, func, param);
+}
+
+kern_return_t
+cpu_immediate_xcall(int cpu_number, broadcastFunc func, void *param)
+{
+       return cpu_xcall_internal(SIGPxcallImm, cpu_number, func, param);
 }
 
 static kern_return_t
@@ -320,39 +366,40 @@ cpu_signal_internal(cpu_data_t *target_proc,
                Check_SIGPdisabled = 0;
        }
 
-       if (signal == SIGPxcall) {
+       if ((signal == SIGPxcall) || (signal == SIGPxcallImm)) {
                do {
                        current_signals = target_proc->cpu_signal;
                        if ((current_signals & SIGPdisabled) == SIGPdisabled) {
-#if DEBUG || DEVELOPMENT
-                               target_proc->failed_signal = SIGPxcall;
-                               target_proc->failed_xcall = p0;
-                               OSIncrementAtomicLong(&target_proc->failed_signal_count);
-#endif
                                ml_set_interrupts_enabled(interruptible);
                                return KERN_FAILURE;
                        }
-                       swap_success = OSCompareAndSwap(current_signals & (~SIGPxcall), current_signals | SIGPxcall,
+                       swap_success = OSCompareAndSwap(current_signals & (~signal), current_signals | signal,
                            &target_proc->cpu_signal);
 
+                       if (!swap_success && (signal == SIGPxcallImm) && (target_proc->cpu_signal & SIGPxcallImm)) {
+                               ml_set_interrupts_enabled(interruptible);
+                               return KERN_ALREADY_WAITING;
+                       }
+
                        /* Drain pending xcalls on this cpu; the CPU we're trying to xcall may in turn
                         * be trying to xcall us.  Since interrupts are disabled, that could
                         * deadlock; break the deadlock by draining pending xcalls. */
-                       if (!swap_success && (current_proc->cpu_signal & SIGPxcall)) {
+                       if (!swap_success && (current_proc->cpu_signal & signal)) {
                                cpu_handle_xcall(current_proc);
                        }
                } while (!swap_success);
 
-               target_proc->cpu_xcall_p0 = p0;
-               target_proc->cpu_xcall_p1 = p1;
+               if (signal == SIGPxcallImm) {
+                       target_proc->cpu_imm_xcall_p0 = p0;
+                       target_proc->cpu_imm_xcall_p1 = p1;
+               } else {
+                       target_proc->cpu_xcall_p0 = p0;
+                       target_proc->cpu_xcall_p1 = p1;
+               }
        } else {
                do {
                        current_signals = target_proc->cpu_signal;
                        if ((Check_SIGPdisabled != 0) && (current_signals & Check_SIGPdisabled) == SIGPdisabled) {
-#if DEBUG || DEVELOPMENT
-                               target_proc->failed_signal = signal;
-                               OSIncrementAtomicLong(&target_proc->failed_signal_count);
-#endif
                                ml_set_interrupts_enabled(interruptible);
                                return KERN_FAILURE;
                        }
@@ -424,48 +471,48 @@ cpu_signal_handler_internal(boolean_t disable_signal)
 
        SCHED_STATS_IPI(current_processor());
 
-       cpu_signal = hw_atomic_or(&cpu_data_ptr->cpu_signal, 0);
+       cpu_signal = os_atomic_or(&cpu_data_ptr->cpu_signal, 0, relaxed);
 
        if ((!(cpu_signal & SIGPdisabled)) && (disable_signal == TRUE)) {
-               (void)hw_atomic_or(&cpu_data_ptr->cpu_signal, SIGPdisabled);
+               os_atomic_or(&cpu_data_ptr->cpu_signal, SIGPdisabled, relaxed);
        } else if ((cpu_signal & SIGPdisabled) && (disable_signal == FALSE)) {
-               (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPdisabled);
+               os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPdisabled, relaxed);
        }
 
        while (cpu_signal & ~SIGPdisabled) {
                if (cpu_signal & SIGPdec) {
-                       (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPdec);
+                       os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPdec, relaxed);
                        rtclock_intr(FALSE);
                }
 #if KPERF
                if (cpu_signal & SIGPkptimer) {
-                       (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPkptimer);
+                       os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPkptimer, relaxed);
                        kperf_signal_handler((unsigned int)cpu_data_ptr->cpu_number);
                }
 #endif
-               if (cpu_signal & SIGPxcall) {
+               if (cpu_signal & (SIGPxcall | SIGPxcallImm)) {
                        cpu_handle_xcall(cpu_data_ptr);
                }
                if (cpu_signal & SIGPast) {
-                       (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPast);
+                       os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPast, relaxed);
                        ast_check(cpu_data_ptr->cpu_processor);
                }
                if (cpu_signal & SIGPdebug) {
-                       (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPdebug);
+                       os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPdebug, relaxed);
                        DebuggerXCall(cpu_data_ptr->cpu_int_state);
                }
 #if     __ARM_SMP__ && defined(ARMA7)
                if (cpu_signal & SIGPLWFlush) {
-                       (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPLWFlush);
+                       os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPLWFlush, relaxed);
                        cache_xcall_handler(LWFlush);
                }
                if (cpu_signal & SIGPLWClean) {
-                       (void)hw_atomic_and(&cpu_data_ptr->cpu_signal, ~SIGPLWClean);
+                       os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPLWClean, relaxed);
                        cache_xcall_handler(LWClean);
                }
 #endif
 
-               cpu_signal = hw_atomic_or(&cpu_data_ptr->cpu_signal, 0);
+               cpu_signal = os_atomic_or(&cpu_data_ptr->cpu_signal, 0, relaxed);
        }
 }
 
@@ -499,7 +546,10 @@ cpu_machine_init(void)
        if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) {
                platform_cache_init();
        }
+
+       /* Note: this calls IOCPURunPlatformActiveActions when resuming on boot cpu */
        PE_cpu_machine_init(cpu_data_ptr->cpu_id, !started);
+
        cpu_data_ptr->cpu_flags |= StartedState;
        ml_init_interrupt();
 }
index b99f054e1fad59296242f9f50bab2eb103d5b51b..7b001d176e6a23f8e10df72cbc14a91e361a1623 100644 (file)
@@ -48,7 +48,7 @@
 
 #define current_thread()        current_thread_fast()
 
-static inline __pure2 thread_t
+static inline __attribute__((const)) thread_t
 current_thread_fast(void)
 {
 #if defined(__arm64__)
index ac6569f7c4d7d9ab014698e429eb4d9c3360f31b..8b29c711a7c60225e6cd586c18ef23ed0117f3a6 100644 (file)
@@ -69,13 +69,15 @@ extern  reset_handler_data_t    ResetHandlerData;
 #define MAX_CPUS                        1
 #endif
 
-#define CPUWINDOWS_MAX                  4
+/* Put the static check for cpumap_t here as it's defined in <kern/processor.h> */
+static_assert(sizeof(cpumap_t) * CHAR_BIT >= MAX_CPUS, "cpumap_t bitvector is too small for current MAX_CPUS value");
+
 #ifdef  __arm__
-#define CPUWINDOWS_BASE                 0xFFF00000UL
+#define CPUWINDOWS_BASE_MASK            0xFFF00000UL
 #else
 #define CPUWINDOWS_BASE_MASK            0xFFFFFFFFFFF00000UL
-#define CPUWINDOWS_BASE                 (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK)
 #endif
+#define CPUWINDOWS_BASE                 (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK)
 #define CPUWINDOWS_TOP                  (CPUWINDOWS_BASE + (MAX_CPUS * CPUWINDOWS_MAX * PAGE_SIZE))
 
 typedef struct cpu_data_entry {
@@ -109,8 +111,9 @@ typedef struct {
        uint64_t ipi_cnt_wake;
        uint64_t timer_cnt;
        uint64_t timer_cnt_wake;
-       uint64_t pmi_cnt;
+#if MONOTONIC
        uint64_t pmi_cnt_wake;
+#endif /* MONOTONIC */
        uint64_t undef_ex_cnt;
        uint64_t unaligned_cnt;
        uint64_t vfp_cnt;
@@ -137,11 +140,6 @@ typedef struct cpu_data {
        unsigned int                            cpu_ident;
        cpu_id_t                                cpu_id;
        unsigned volatile int                   cpu_signal;
-#if DEBUG || DEVELOPMENT
-       void                                    *failed_xcall;
-       unsigned int                            failed_signal;
-       volatile long                           failed_signal_count;
-#endif
        void                                    *cpu_cache_dispatch;
        ast_t                                   cpu_pending_ast;
        struct processor                        *cpu_processor;
@@ -223,6 +221,8 @@ typedef struct cpu_data {
 
        void                                    *cpu_xcall_p0;
        void                                    *cpu_xcall_p1;
+       void                                    *cpu_imm_xcall_p0;
+       void                                    *cpu_imm_xcall_p1;
 
 #if     __ARM_SMP__ && defined(ARMA7)
        volatile uint32_t                       cpu_CLW_active;
@@ -278,6 +278,9 @@ typedef struct cpu_data {
                CPU_HALTED,
                CPU_HALTED_WITH_STATE
        }                                       halt_status;
+#if defined(HAS_APPLE_PAC)
+       uint64_t        rop_key;
+#endif /* defined(HAS_APPLE_PAC) */
 } cpu_data_t;
 
 /*
index f40941de5a17535c00afa6fe83d073d43aeeb08a..7a98926000e169d54cd097de7dd5c0c61218050e 100644 (file)
@@ -64,6 +64,7 @@ extern void                     cpu_signal_cancel(
 #define SIGPLWFlush             0x00000020UL            /* Request LWFlush call */
 #define SIGPLWClean             0x00000040UL            /* Request LWClean call */
 #define SIGPkptimer             0x00000100U             /* Request kperf timer */
+#define SIGPxcallImm            0x00000200U             /* Send a cross-call, fail if already pending */
 
 #define SIGPdisabled            0x80000000U             /* Signal disabled */
 
index 147bfaa1dfbf5988fbf62c47d45dea0b415789e0..73f9b0d83417ce766b017477eb9a45e661ba90e1 100644 (file)
 
 typedef struct {
        uint32_t
-
-           Ctype1:3,   /* 2:0 */
-           Ctype2:3,   /* 5:3 */
-           Ctype3:3,   /* 8:6 */
-           Ctypes:15,  /* 6:23 - Don't Care */
-           LoC:3,      /* 26-24 - Level of Coherency */
-           LoU:3,      /* 29:27 - Level of Unification */
-           RAZ:2;      /* 31:30 - Read-As-Zero */
-}               arm_cache_clidr_t;
+           Ctype1:3, /* 2:0 */
+           Ctype2:3, /* 5:3 */
+           Ctype3:3, /* 8:6 */
+           Ctypes:15, /* 23:9 - Don't Care */
+           LoC:3, /* 26:24 - Level of Coherency */
+           LoU:3, /* 29:27 - Level of Unification */
+           RAZ:2; /* 31:30 - Read-As-Zero */
+} arm_cache_clidr_t;
 
 typedef union {
        arm_cache_clidr_t bits;
-       uint32_t        value;
-}               arm_cache_clidr_info_t;
+       uint32_t          value;
+} arm_cache_clidr_info_t;
 
 
 typedef struct {
        uint32_t
-
            LineSize:3, /* 2:0 - Number of words in cache line */
-           Assoc:10,   /* 12:3 - Associativity of cache */
+           Assoc:10, /* 12:3 - Associativity of cache */
            NumSets:15, /* 27:13 - Number of sets in cache */
-           c_type:4;   /* 31:28 - Cache type */
-}               arm_cache_ccsidr_t;
+           c_type:4; /* 31:28 - Cache type */
+} arm_cache_ccsidr_t;
 
 
 typedef union {
        arm_cache_ccsidr_t bits;
-       uint32_t        value;
-}               arm_cache_ccsidr_info_t;
+       uint32_t           value;
+} arm_cache_ccsidr_info_t;
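
(Illustration: deriving cache geometry from a CCSIDR value, assuming the
32-bit encoding declared above, where line size is stored as log2(words) - 2
and ways/sets are stored minus one; the helper is hypothetical.)

static uint32_t
ccsidr_cache_bytes(arm_cache_ccsidr_info_t info)
{
        uint32_t line_bytes = 1U << (info.bits.LineSize + 4);  /* 4 << (n+2) */
        uint32_t ways       = info.bits.Assoc + 1;
        uint32_t sets       = info.bits.NumSets + 1;
        return line_bytes * ways * sets;
}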
 
 /* Statics */
 
@@ -85,17 +83,21 @@ void
 do_cpuid(void)
 {
        cpuid_cpu_info.value = machine_read_midr();
-#if             (__ARM_ARCH__ == 8)
+#if (__ARM_ARCH__ == 8)
 
+#if defined(HAS_APPLE_PAC)
+       cpuid_cpu_info.arm_info.arm_arch = CPU_ARCH_ARMv8E;
+#else /* defined(HAS_APPLE_PAC) */
        cpuid_cpu_info.arm_info.arm_arch = CPU_ARCH_ARMv8;
+#endif /* defined(HAS_APPLE_PAC) */
 
-#elif   (__ARM_ARCH__ == 7)
-  #ifdef __ARM_SUB_ARCH__
+#elif (__ARM_ARCH__ == 7)
+#ifdef __ARM_SUB_ARCH__
        cpuid_cpu_info.arm_info.arm_arch = __ARM_SUB_ARCH__;
-  #else
+#else /* __ARM_SUB_ARCH__ */
        cpuid_cpu_info.arm_info.arm_arch = CPU_ARCH_ARMv7;
-  #endif
-#else
+#endif /* __ARM_SUB_ARCH__ */
+#else /* (__ARM_ARCH__ != 7) && (__ARM_ARCH__ != 8) */
        /* 1176 architecture lives in the extended feature register */
        if (cpuid_cpu_info.arm_info.arm_arch == CPU_ARCH_EXTENDED) {
                arm_isa_feat1_reg isa = machine_read_isa_feat1();
@@ -108,7 +110,7 @@ do_cpuid(void)
                        cpuid_cpu_info.arm_info.arm_arch = CPU_ARCH_ARMv6;
                }
        }
-#endif
+#endif /* (__ARM_ARCH__ != 7) && (__ARM_ARCH__ != 8) */
 }
 
 arm_cpu_info_t *
@@ -176,6 +178,13 @@ cpuid_get_cpufamily(void)
                case CPU_PART_MISTRAL:
                        cpufamily = CPUFAMILY_ARM_MONSOON_MISTRAL;
                        break;
+               case CPU_PART_VORTEX:
+               case CPU_PART_TEMPEST:
+               case CPU_PART_TEMPEST_M9:
+               case CPU_PART_VORTEX_ARUBA:
+               case CPU_PART_TEMPEST_ARUBA:
+                       cpufamily = CPUFAMILY_ARM_VORTEX_TEMPEST;
+                       break;
                default:
                        cpufamily = CPUFAMILY_UNKNOWN;
                        break;
index bc6468a963f28d5394a42f6115885c06d6cad63b..559cde9fcbfeb9f7f9f72138739d02ddd408633d 100644 (file)
 #include <machine/machine_cpuid.h>
 
 typedef struct {
-       uint32_t        arm_rev                 : 4,/* 00:03    revision number  */
-           arm_part                : 12,               /* 04:15        primary part number */
-           arm_arch                : 4,                /* 16:19        architecture            */
-           arm_variant             : 4,                /* 20:23        variant                  */
-           arm_implementor : 8;                /* 24:31        implementor (0x41)  */
+       uint32_t arm_rev : 4,  /* 00:03 revision number */
+           arm_part         : 12,/* 04:15 primary part number */
+           arm_arch         : 4,/* 16:19 architecture */
+           arm_variant      : 4,/* 20:23 variant  */
+           arm_implementor  : 8;/* 24:31 implementor (0x41) */
 } arm_cpuid_bits_t;
 
 typedef union {
-       arm_cpuid_bits_t        arm_info;               /* ARM9xx,  ARM11xx, and later processors */
-       uint32_t                        value;
+       arm_cpuid_bits_t arm_info; /* ARM9xx,  ARM11xx, and later processors */
+       uint32_t         value;
 } arm_cpu_info_t;
 
 /* Implementor codes */
-#define CPU_VID_ARM             0x41    // ARM Limited
-#define CPU_VID_DEC             0x44    // Digital Equipment Corporation
-#define CPU_VID_MOTOROLA        0x4D    // Motorola - Freescale Semiconductor Inc.
-#define CPU_VID_MARVELL 0x56    // Marvell Semiconductor Inc.
-#define CPU_VID_INTEL   0x69    // Intel ARM parts.
-#define CPU_VID_APPLE   0x61    // Apple Inc.
+#define CPU_VID_ARM      0x41 // ARM Limited
+#define CPU_VID_DEC      0x44 // Digital Equipment Corporation
+#define CPU_VID_MOTOROLA 0x4D // Motorola - Freescale Semiconductor Inc.
+#define CPU_VID_MARVELL  0x56 // Marvell Semiconductor Inc.
+#define CPU_VID_INTEL    0x69 // Intel ARM parts.
+#define CPU_VID_APPLE    0x61 // Apple Inc.
 
 
 /* ARM Architecture Codes */
 
-#define CPU_ARCH_ARMv4          0x1             /* ARMv4 */
-#define CPU_ARCH_ARMv4T         0x2             /* ARMv4 + Thumb */
-#define CPU_ARCH_ARMv5          0x3             /* ARMv5 */
-#define CPU_ARCH_ARMv5T         0x4             /* ARMv5 + Thumb */
-#define CPU_ARCH_ARMv5TE        0x5             /* ARMv5 + Thumb + Extensions(?) */
-#define CPU_ARCH_ARMv5TEJ       0x6             /* ARMv5 + Thumb + Extensions(?) + //Jazelle(?) XXX */
-#define CPU_ARCH_ARMv6          0x7             /* ARMv6 */
-#define CPU_ARCH_ARMv7          0x8             /* ARMv7 */
-#define CPU_ARCH_ARMv7f         0x9             /* ARMv7 for Cortex A9 */
-#define CPU_ARCH_ARMv7s         0xa             /* ARMv7 for Swift */
-#define CPU_ARCH_ARMv7k         0xb             /* ARMv7 for Cortex A7 */
+#define CPU_ARCH_ARMv4    0x1 /* ARMv4 */
+#define CPU_ARCH_ARMv4T   0x2 /* ARMv4 + Thumb */
+#define CPU_ARCH_ARMv5    0x3 /* ARMv5 */
+#define CPU_ARCH_ARMv5T   0x4 /* ARMv5 + Thumb */
+#define CPU_ARCH_ARMv5TE  0x5 /* ARMv5 + Thumb + Extensions(?) */
+#define CPU_ARCH_ARMv5TEJ 0x6 /* ARMv5 + Thumb + Extensions(?) + Jazelle(?) XXX */
+#define CPU_ARCH_ARMv6    0x7 /* ARMv6 */
+#define CPU_ARCH_ARMv7    0x8 /* ARMv7 */
+#define CPU_ARCH_ARMv7f   0x9 /* ARMv7 for Cortex A9 */
+#define CPU_ARCH_ARMv7s   0xa /* ARMv7 for Swift */
+#define CPU_ARCH_ARMv7k   0xb /* ARMv7 for Cortex A7 */
 
-#define CPU_ARCH_ARMv8          0xc             /* Subtype for CPU_TYPE_ARM64 */
+#define CPU_ARCH_ARMv8    0xc /* Subtype for CPU_TYPE_ARM64 */
 
+#define CPU_ARCH_ARMv8E   0xd /* ARMv8.3a + Apple Private ISA Subtype for CPU_TYPE_ARM64 */
 
 /* special code indicating we need to look somewhere else for the architecture version */
-#define CPU_ARCH_EXTENDED       0xF
+#define CPU_ARCH_EXTENDED 0xF
 
 /* ARM Part Numbers */
 /*
@@ -89,54 +90,69 @@ typedef union {
  */
 
 /* ARM9 (ARMv4T architecture) */
-#define CPU_PART_920T           0x920
-#define CPU_PART_926EJS         0x926   /* ARM926EJ-S */
+#define CPU_PART_920T               0x920
+#define CPU_PART_926EJS             0x926 /* ARM926EJ-S */
 
 /* ARM11  (ARMv6 architecture) */
-#define CPU_PART_1136JFS        0xB36   /* ARM1136JF-S or ARM1136J-S */
-#define CPU_PART_1176JZFS       0xB76   /* ARM1176JZF-S */
+#define CPU_PART_1136JFS            0xB36 /* ARM1136JF-S or ARM1136J-S */
+#define CPU_PART_1176JZFS           0xB76 /* ARM1176JZF-S */
 
 /* G1 (ARMv7 architecture) */
-#define CPU_PART_CORTEXA5       0xC05
+#define CPU_PART_CORTEXA5           0xC05
 
 /* M7 (ARMv7 architecture) */
-#define CPU_PART_CORTEXA7       0xC07
+#define CPU_PART_CORTEXA7           0xC07
 
 /* H2 H3 (ARMv7 architecture) */
-#define CPU_PART_CORTEXA8       0xC08
+#define CPU_PART_CORTEXA8           0xC08
 
 /* H4 (ARMv7 architecture) */
-#define CPU_PART_CORTEXA9       0xC09
+#define CPU_PART_CORTEXA9           0xC09
 
 /* H5 (SWIFT architecture) */
-#define CPU_PART_SWIFT          0x0
+#define CPU_PART_SWIFT              0x0
 
 /* H6 (ARMv8 architecture) */
-#define CPU_PART_CYCLONE        0x1
+#define CPU_PART_CYCLONE            0x1
 
 /* H7 (ARMv8 architecture) */
-#define CPU_PART_TYPHOON        0x2
+#define CPU_PART_TYPHOON            0x2
 
 /* H7G (ARMv8 architecture) */
-#define CPU_PART_TYPHOON_CAPRI  0x3
+#define CPU_PART_TYPHOON_CAPRI      0x3
 
 /* H8 (ARMv8 architecture) */
-#define CPU_PART_TWISTER        0x4
+#define CPU_PART_TWISTER            0x4
 
 /* H8G H8M (ARMv8 architecture) */
-#define CPU_PART_TWISTER_ELBA_MALTA     0x5
+#define CPU_PART_TWISTER_ELBA_MALTA 0x5
 
 /* H9 (ARMv8 architecture) */
-#define CPU_PART_HURRICANE      0x6
+#define CPU_PART_HURRICANE          0x6
 
 /* H9G (ARMv8 architecture) */
-#define CPU_PART_HURRICANE_MYST 0x7
+#define CPU_PART_HURRICANE_MYST     0x7
 
 /* H10 p-Core (ARMv8 architecture) */
-#define CPU_PART_MONSOON        0x8
+#define CPU_PART_MONSOON            0x8
 
 /* H10 e-Core (ARMv8 architecture) */
-#define CPU_PART_MISTRAL        0x9
+#define CPU_PART_MISTRAL            0x9
+
+/* H11 p-Core (ARMv8 architecture) */
+#define CPU_PART_VORTEX             0xB
+
+/* H11 e-Core (ARMv8 architecture) */
+#define CPU_PART_TEMPEST            0xC
+
+/* M9 e-Core (ARMv8 architecture) */
+#define CPU_PART_TEMPEST_M9         0xF
+
+/* H11G p-Core (ARMv8 architecture) */
+#define CPU_PART_VORTEX_ARUBA       0x10
+
+/* H11G e-Core (ARMv8 architecture) */
+#define CPU_PART_TEMPEST_ARUBA      0x11
 
 
 /* Cache type identification */
@@ -151,24 +167,23 @@ typedef enum {
 } cache_type_t;
 
 typedef struct {
-       boolean_t               c_unified;      /* unified I & D cache? */
-       uint32_t                c_isize;        /* in Bytes (ARM caches can be 0.5 KB) */
-       boolean_t               c_i_ppage;      /* protected page restriction for I cache
-                                                * (see B6-11 in ARM DDI 0100I document). */
-       uint32_t                c_dsize;        /* in Bytes (ARM caches can be 0.5 KB) */
-       boolean_t               c_d_ppage;      /* protected page restriction for I cache
-                                                * (see B6-11 in ARM DDI 0100I document). */
-       cache_type_t    c_type;         /* WB or WT */
-       uint32_t                c_linesz;       /* number of bytes */
-       uint32_t                c_assoc;        /* n-way associativity */
-       uint32_t            c_l2size;   /* L2 size, if present */
-       uint32_t            c_bulksize_op;/* bulk operation size limit. 0 if disabled */
-       uint32_t            c_inner_cache_size; /* inner dache size */
+       boolean_t    c_unified;          /* unified I & D cache? */
+       uint32_t     c_isize;            /* in Bytes (ARM caches can be 0.5 KB) */
+       boolean_t    c_i_ppage;          /* protected page restriction for I cache
+                                         * (see B6-11 in ARM DDI 0100I document). */
+       uint32_t     c_dsize;            /* in Bytes (ARM caches can be 0.5 KB) */
+       boolean_t    c_d_ppage;          /* protected page restriction for D cache
+                                         * (see B6-11 in ARM DDI 0100I document). */
+       cache_type_t c_type;             /* WB or WT */
+       uint32_t     c_linesz;           /* number of bytes */
+       uint32_t     c_assoc;            /* n-way associativity */
+       uint32_t     c_l2size;           /* L2 size, if present */
+       uint32_t     c_bulksize_op;      /* bulk operation size limit. 0 if disabled */
+       uint32_t     c_inner_cache_size; /* inner cache size */
 } cache_info_t;
 
 typedef struct {
        uint32_t
-
            RB:4, /* 3:0 - 32x64-bit media register bank supported: 0x2 */
            SP:4, /* 7:4 - Single precision supported in VFPv3: 0x2 */
            DP:4, /* 11:8 - Double precision supported in VFPv3: 0x2 */
@@ -186,7 +201,6 @@ typedef union {
 
 typedef struct {
        uint32_t
-
            FZ:4, /* 3:0 - Full denormal arithmetic supported for VFP: 0x1 */
            DN:4, /* 7:4 - Propagation of NaN values supported for VFP: 0x1 */
            LS:4, /* 11:8 - Load/store instructions supported for NEON: 0x1 */
@@ -202,14 +216,14 @@ typedef union {
 } arm_mvfr1_info_t;
 
 typedef struct {
-       uint32_t                neon;
-       uint32_t                neon_hpfp;
-       uint32_t                neon_fp16;
+       uint32_t neon;
+       uint32_t neon_hpfp;
+       uint32_t neon_fp16;
 } arm_mvfp_info_t;
 
 #ifdef __cplusplus
 extern "C" {
-#endif
+#endif /* __cplusplus */
 
 extern void do_cpuid(void);
 extern arm_cpu_info_t *cpuid_info(void);
@@ -226,6 +240,6 @@ extern arm_mvfp_info_t *arm_mvfp_info(void);
 
 #ifdef __cplusplus
 }
-#endif
+#endif /* __cplusplus */
 
 #endif // _MACHINE_CPUID_H_
index 435383706329a6243ccc8595c597faebb651b432..5ebbf990bae4b46900de84afa1037a882e0dc05d 100644 (file)
@@ -355,11 +355,11 @@ main(
        DECLARE("BA_TOP_OF_KERNEL_DATA",
            offsetof(struct boot_args, topOfKernelData));
 
-       DECLARE("ENTROPY_INDEX_PTR",
-           offsetof(entropy_data_t, index_ptr));
+       DECLARE("ENTROPY_SAMPLE_COUNT",
+           offsetof(entropy_data_t, sample_count));
        DECLARE("ENTROPY_BUFFER",
            offsetof(entropy_data_t, buffer));
-       DECLARE("ENTROPY_DATA_SIZE", sizeof(struct entropy_data));
+       DECLARE("ENTROPY_BUFFER_INDEX_MASK", ENTROPY_BUFFER_INDEX_MASK);
 
        return 0;
 }
index bae84c7807d562159774c04ca0b03e7918e43c97..1bb8b82d6e238635fd5e5793b9f4c5343834498d 100644 (file)
@@ -69,6 +69,16 @@ extern vm_offset_t      virtual_space_start;     /* Next available kernel VA */
  */
 vm_offset_t
 io_map(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags)
+{
+       return io_map_with_prot(phys_addr, size, flags, VM_PROT_READ | VM_PROT_WRITE);
+}
+
+/*
+ * Allocate and map memory for devices that may need to be mapped before
+ * Mach VM is running. Allows caller to specify mapping protection
+ */
+vm_offset_t
+io_map_with_prot(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags, vm_prot_t prot)
 {
        vm_offset_t     start, start_offset;
 
@@ -87,15 +97,15 @@ io_map(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags)
 
                if (flags == VM_WIMG_WCOMB) {
                        (void) pmap_map_bd_with_options(start, phys_addr, phys_addr + round_page(size),
-                           VM_PROT_READ | VM_PROT_WRITE, PMAP_MAP_BD_WCOMB);
+                           prot, PMAP_MAP_BD_WCOMB);
                } else {
                        (void) pmap_map_bd(start, phys_addr, phys_addr + round_page(size),
-                           VM_PROT_READ | VM_PROT_WRITE);
+                           prot);
                }
        } else {
                (void) kmem_alloc_pageable(kernel_map, &start, round_page(size), VM_KERN_MEMORY_IOKIT);
                (void) pmap_map(start, phys_addr, phys_addr + round_page(size),
-                   VM_PROT_READ | VM_PROT_WRITE, flags);
+                   prot, flags);
        }
 #if KASAN
        kasan_notify_address(start + start_offset, size);
index 4b97c77f5ccec39f1d26d0303387c3013e0ff9ca..1c5ec79a615bbeb6a2dbc953189002b618824d69 100644 (file)
@@ -40,6 +40,13 @@ extern vm_offset_t      io_map(
        vm_map_offset_t         phys_addr,
        vm_size_t               size,
        unsigned int            flags);
+
+extern vm_offset_t      io_map_with_prot(
+       vm_map_offset_t                   phys_addr,
+       vm_size_t                         size,
+       unsigned int                      flags,
+       vm_prot_t                         prot);
+
 extern vm_offset_t io_map_spec(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags);
 #endif  /* __APPLE_API_PRIVATE */
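
(Sketch of a caller; the physical base, size, and WIMG flags are hypothetical.
io_map() itself is now just the VM_PROT_READ | VM_PROT_WRITE special case.)

        /* Map a device aperture read-only, early in boot if necessary. */
        vm_offset_t regs = io_map_with_prot(
                0x200000000ULL,         /* hypothetical physical base */
                PAGE_SIZE,              /* size */
                VM_WIMG_IO,             /* device memory attributes */
                VM_PROT_READ);          /* unlike io_map(), not writable */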
 
index b5c060a8a1fcd8cbb196f3fc7f98652472b2c78a..5d882c13b289413760c75c0726f091f4217f313f 100644 (file)
@@ -282,7 +282,7 @@ kpc_set_running_xcall( void *vstate )
        set_running_configurable(mp_config->cfg_target_mask,
            mp_config->cfg_state_mask);
 
-       if (hw_atomic_sub(&kpc_xcall_sync, 1) == 0) {
+       if (os_atomic_dec(&kpc_xcall_sync, relaxed) == 0) {
                thread_wakeup((event_t) &kpc_xcall_sync);
        }
 }
@@ -674,7 +674,7 @@ kpc_set_reload_xcall(void *vmp_config)
 
        ml_set_interrupts_enabled(enabled);
 
-       if (hw_atomic_sub(&kpc_reload_sync, 1) == 0) {
+       if (os_atomic_dec(&kpc_reload_sync, relaxed) == 0) {
                thread_wakeup((event_t) &kpc_reload_sync);
        }
 }
@@ -749,7 +749,7 @@ kpc_set_config_xcall(void *vmp_config)
                new_config += kpc_popcount(mp_config->pmc_mask);
        }
 
-       if (hw_atomic_sub(&kpc_config_sync, 1) == 0) {
+       if (os_atomic_dec(&kpc_config_sync, relaxed) == 0) {
                thread_wakeup((event_t) &kpc_config_sync);
        }
 }
@@ -795,9 +795,9 @@ kpc_get_curcpu_counters_xcall(void *args)
        r = kpc_get_curcpu_counters(handler->classes, NULL, &handler->buf[offset]);
 
        /* number of counters added by this CPU, needs to be atomic  */
-       hw_atomic_add(&(handler->nb_counters), r);
+       os_atomic_add(&(handler->nb_counters), r, relaxed);
 
-       if (hw_atomic_sub(&kpc_xread_sync, 1) == 0) {
+       if (os_atomic_dec(&kpc_xread_sync, relaxed) == 0) {
                thread_wakeup((event_t) &kpc_xread_sync);
        }
 }
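
(The rendezvous pattern shared by these call sites, sketched with illustrative
names: the initiator seeds a counter with the participant count and sleeps on
it; the last handler to decrement it to zero issues the wakeup.)

        uint32_t sync = ncpus;          /* hypothetical participant count */

        /* ...broadcast work to each CPU; every handler finishes with: */
        if (os_atomic_dec(&sync, relaxed) == 0) {
                thread_wakeup((event_t)&sync);
        }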
index 41941a9d17f013c54652ab2c414c2e108db064f6..ce0e69e4258035932a73b3848214c1164cd75d8f 100644 (file)
@@ -88,7 +88,7 @@ typedef struct _lck_mtx_ {
        union {
                struct {
                        uint16_t                                lck_mtx_waiters;/* Number of waiters */
-                       uint8_t                                 lck_mtx_pri;    /* Priority to inherit */
+                       uint8_t                                 lck_mtx_pri;    /* unused */
                        uint8_t                                 lck_mtx_type;   /* Type */
                };
                struct {
@@ -215,13 +215,13 @@ typedef struct {
 //                                                                     23-30
 #define LCK_RW_TAG_VALID_BIT            31
 
-#define LCK_RW_INTERLOCK                (1 << LCK_RW_INTERLOCK_BIT)
-#define LCK_RW_R_WAITING                (1 << LCK_RW_R_WAITING_BIT)
-#define LCK_RW_W_WAITING                (1 << LCK_RW_W_WAITING_BIT)
-#define LCK_RW_WANT_UPGRADE             (1 << LCK_RW_WANT_UPGRADE_BIT)
-#define LCK_RW_WANT_EXCL                (1 << LCK_RW_WANT_EXCL_BIT)
-#define LCK_RW_TAG_VALID                (1 << LCK_RW_TAG_VALID_BIT)
-#define LCK_RW_PRIV_EXCL                (1 << LCK_RW_PRIV_EXCL_BIT)
+#define LCK_RW_INTERLOCK                (1U << LCK_RW_INTERLOCK_BIT)
+#define LCK_RW_R_WAITING                (1U << LCK_RW_R_WAITING_BIT)
+#define LCK_RW_W_WAITING                (1U << LCK_RW_W_WAITING_BIT)
+#define LCK_RW_WANT_UPGRADE             (1U << LCK_RW_WANT_UPGRADE_BIT)
+#define LCK_RW_WANT_EXCL                (1U << LCK_RW_WANT_EXCL_BIT)
+#define LCK_RW_TAG_VALID                (1U << LCK_RW_TAG_VALID_BIT)
+#define LCK_RW_PRIV_EXCL                (1U << LCK_RW_PRIV_EXCL_BIT)
 #define LCK_RW_SHARED_MASK              (0xffff << LCK_RW_SHARED_READER_OFFSET)
 #define LCK_RW_SHARED_READER    (0x1 << LCK_RW_SHARED_READER_OFFSET)
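
(Worked example of why the U suffix matters here: LCK_RW_TAG_VALID_BIT is 31,
and a signed 1 shifted into the sign bit is undefined behavior.)

        int      bad = 1  << 31;        /* UB: overflows int */
        uint32_t ok  = 1U << 31;        /* well-defined: 0x80000000 */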
 
@@ -257,6 +257,9 @@ typedef struct {
 
 #define PLATFORM_LCK_ILOCK LCK_ILOCK
 
+#if defined(__ARM_ARCH_8_2__)
+#define __ARM_ATOMICS_8_1       1       // ARMv8.1 atomic instructions are available
+#endif
 
 /*
  * Lock state to thread pointer
@@ -273,8 +276,8 @@ typedef struct {
  */
 #define LCK_MTX_THREAD_MASK (~(uintptr_t)(LCK_ILOCK | ARM_LCK_WAITERS))
 
-#define disable_preemption_for_thread(t) ((volatile thread_t)t)->machine.preemption_count++
-#define preemption_disabled_for_thread(t) (((volatile thread_t)t)->machine.preemption_count > 0)
+#define disable_preemption_for_thread(t) os_atomic_store(&(t->machine.preemption_count), t->machine.preemption_count + 1, compiler_acq_rel)
+#define preemption_disabled_for_thread(t) (t->machine.preemption_count > 0)
 
 
 __unused static void
index 5b6917ac3bb6ac1f1c02abf048f72d3958ba23b5..49a261f31c434fbab97525e39bba554e5d6afd28 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -59,7 +59,6 @@
  *     Locking primitives implementation
  */
 
-#define ATOMIC_PRIVATE 1
 #define LOCK_PRIVATE 1
 
 #include <mach_ldebug.h>
@@ -71,7 +70,6 @@
 #include <kern/thread.h>
 #include <kern/processor.h>
 #include <kern/sched_prim.h>
-#include <kern/xpr.h>
 #include <kern/debug.h>
 #include <kern/kcdata.h>
 #include <string.h>
@@ -129,17 +127,6 @@ extern uint64_t dtrace_spin_threshold;
 
 /* Forwards */
 
-
-#if     USLOCK_DEBUG
-/*
- *     Perform simple lock checks.
- */
-int             uslock_check = 1;
-int             max_lock_loops = 100000000;
-decl_simple_lock_data(extern, printf_lock)
-decl_simple_lock_data(extern, panic_lock)
-#endif                          /* USLOCK_DEBUG */
-
 extern unsigned int not_in_kdp;
 
 /*
@@ -165,19 +152,6 @@ typedef void   *pc_t;
  *     Portable lock package implementation of usimple_locks.
  */
 
-#if     USLOCK_DEBUG
-#define USLDBG(stmt)    stmt
-void            usld_lock_init(usimple_lock_t, unsigned short);
-void            usld_lock_pre(usimple_lock_t, pc_t);
-void            usld_lock_post(usimple_lock_t, pc_t);
-void            usld_unlock(usimple_lock_t, pc_t);
-void            usld_lock_try_pre(usimple_lock_t, pc_t);
-void            usld_lock_try_post(usimple_lock_t, pc_t);
-int             usld_lock_common_checks(usimple_lock_t, const char *);
-#else                           /* USLOCK_DEBUG */
-#define USLDBG(stmt)
-#endif                          /* USLOCK_DEBUG */
-
 /*
  * Owner thread pointer when lock held in spin mode
  */
@@ -190,26 +164,24 @@ int             usld_lock_common_checks(usimple_lock_t, const char *);
 #define lck_rw_ilk_lock(lock)   hw_lock_bit  ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
 #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
 
-#define memory_barrier()        __c11_atomic_thread_fence(memory_order_acq_rel_smp)
-#define load_memory_barrier()   __c11_atomic_thread_fence(memory_order_acquire_smp)
-#define store_memory_barrier()  __c11_atomic_thread_fence(memory_order_release_smp)
+#define load_memory_barrier()   os_atomic_thread_fence(acquire)
 
 // Enforce program order of loads and stores.
-#define ordered_load(target, type) \
-               __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed)
-#define ordered_store(target, type, value) \
-               __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed)
-
-#define ordered_load_mtx(lock)                  ordered_load(&(lock)->lck_mtx_data, uintptr_t)
-#define ordered_store_mtx(lock, value)  ordered_store(&(lock)->lck_mtx_data, uintptr_t, (value))
-#define ordered_load_rw(lock)                   ordered_load(&(lock)->lck_rw_data, uint32_t)
-#define ordered_store_rw(lock, value)   ordered_store(&(lock)->lck_rw_data, uint32_t, (value))
-#define ordered_load_rw_owner(lock)             ordered_load(&(lock)->lck_rw_owner, thread_t)
-#define ordered_store_rw_owner(lock, value)     ordered_store(&(lock)->lck_rw_owner, thread_t, (value))
-#define ordered_load_hw(lock)                   ordered_load(&(lock)->lock_data, uintptr_t)
-#define ordered_store_hw(lock, value)   ordered_store(&(lock)->lock_data, uintptr_t, (value))
-#define ordered_load_bit(lock)                  ordered_load((lock), uint32_t)
-#define ordered_store_bit(lock, value)  ordered_store((lock), uint32_t, (value))
+#define ordered_load(target) \
+               os_atomic_load(target, compiler_acq_rel)
+#define ordered_store(target, value) \
+               os_atomic_store(target, value, compiler_acq_rel)
+
+#define ordered_load_mtx(lock)                  ordered_load(&(lock)->lck_mtx_data)
+#define ordered_store_mtx(lock, value)  ordered_store(&(lock)->lck_mtx_data, (value))
+#define ordered_load_rw(lock)                   ordered_load(&(lock)->lck_rw_data)
+#define ordered_store_rw(lock, value)   ordered_store(&(lock)->lck_rw_data, (value))
+#define ordered_load_rw_owner(lock)             ordered_load(&(lock)->lck_rw_owner)
+#define ordered_store_rw_owner(lock, value)     ordered_store(&(lock)->lck_rw_owner, (value))
+#define ordered_load_hw(lock)                   ordered_load(&(lock)->lock_data)
+#define ordered_store_hw(lock, value)   ordered_store(&(lock)->lock_data, (value))
+#define ordered_load_bit(lock)                  ordered_load((lock))
+#define ordered_store_bit(lock, value)  ordered_store((lock), (value))
 
 
 // Prevent the compiler from reordering memory operations around this
@@ -253,11 +225,56 @@ static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait);
  * atomic_exchange_complete() - conclude an exchange
  * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
  */
+__unused static uint32_t
+load_exclusive32(uint32_t *target, enum memory_order ord)
+{
+       uint32_t        value;
+
+#if __arm__
+       if (memory_order_has_release(ord)) {
+               // Pre-load release barrier
+               atomic_thread_fence(memory_order_release);
+       }
+       value = __builtin_arm_ldrex(target);
+#else
+       if (memory_order_has_acquire(ord)) {
+               value = __builtin_arm_ldaex(target);    // ldaxr
+       } else {
+               value = __builtin_arm_ldrex(target);    // ldxr
+       }
+#endif  // __arm__
+       return value;
+}
+
+__unused static boolean_t
+store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord)
+{
+       boolean_t err;
+
+#if __arm__
+       err = __builtin_arm_strex(value, target);
+       if (memory_order_has_acquire(ord)) {
+               // Post-store acquire barrier
+               atomic_thread_fence(memory_order_acquire);
+       }
+#else
+       if (memory_order_has_release(ord)) {
+               err = __builtin_arm_stlex(value, target);       // stlxr
+       } else {
+               err = __builtin_arm_strex(value, target);       // stxr
+       }
+#endif  // __arm__
+       return !err;
+}
+
 static uint32_t
 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
 {
        uint32_t        val;
 
+#if __ARM_ATOMICS_8_1
+       ord = memory_order_relaxed;
+#endif
        val = load_exclusive32(target, ord);
        *previous = val;
        return val;
@@ -266,14 +283,18 @@ atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order
 static boolean_t
 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
 {
+#if __ARM_ATOMICS_8_1
+       return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
+#else
        (void)previous;         // Previous not needed, monitor is held
        return store_exclusive32(target, newval, ord);
+#endif
 }
 
 static void
 atomic_exchange_abort(void)
 {
-       clear_exclusive();
+       os_atomic_clear_exclusive();
 }
 
 static boolean_t
@@ -298,260 +319,113 @@ atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, e
        }
 }
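
(Usage sketch of the begin/complete/abort protocol; the lock word and bits are
borrowed from the rw-lock code in this file.)

        uint32_t data, prev;

        for (;;) {
                data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire);
                if (data & LCK_RW_INTERLOCK) {
                        atomic_exchange_abort();        /* drop the reservation */
                        continue;
                }
                data |= LCK_RW_WANT_EXCL;
                if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire)) {
                        break;                          /* exchange landed */
                }
                /* store-exclusive failed (or the ARMv8.1 CAS lost); retry */
        }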
 
+inline boolean_t
+hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
+{
+       return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
+}
+
 void
 _disable_preemption(void)
 {
-       thread_t        thread = current_thread();
-       unsigned int    count;
+       thread_t     thread = current_thread();
+       unsigned int count  = thread->machine.preemption_count;
 
-       count = thread->machine.preemption_count + 1;
-       ordered_store(&thread->machine.preemption_count, unsigned int, count);
+       count += 1;
+       if (__improbable(count == 0)) {
+               panic("Preemption count overflow");
+       }
+
+       os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
 }
 
-void
-_enable_preemption(void)
+/*
+ * This function checks whether an AST_URGENT has been pended.
+ *
+ * It is called once preemption has been reenabled, which means the thread
+ * may have been preempted right before this was called, and may be running
+ * on a different CPU by the time the check is actually performed.
+ *
+ * This race is however benign: the point of AST_URGENT is to trigger a context
+ * switch, so if one happened, there's nothing left to check for, and AST_URGENT
+ * was cleared in the process.
+ *
+ * It follows that this check cannot have false negatives, which allows us
+ * to avoid fiddling with interrupt state for the vast majority of cases
+ * when the check will actually be negative.
+ */
+static NOINLINE void
+kernel_preempt_check(thread_t thread)
 {
-       thread_t        thread = current_thread();
-       long            state;
-       unsigned int    count;
+       cpu_data_t *cpu_data_ptr;
+       long        state;
+
 #if __arm__
 #define INTERRUPT_MASK PSR_IRQF
 #else   // __arm__
 #define INTERRUPT_MASK DAIF_IRQF
 #endif  // __arm__
 
-       count = thread->machine.preemption_count;
-       if (count == 0) {
-               panic("Preemption count negative");     // Count will go negative when released
-       }
-       count--;
-       if (count > 0) {
-               goto update_count;                      // Preemption is still disabled, just update
-       }
-       state = get_interrupts();                       // Get interrupt state
-       if (state & INTERRUPT_MASK) {
-               goto update_count;                      // Interrupts are already masked, can't take AST here
+       /*
+        * This check is racy and could load from another CPU's pending_ast mask,
+        * but as described above, this can't have false negatives.
+        */
+       cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
+       if (__probable((cpu_data_ptr->cpu_pending_ast & AST_URGENT) == 0)) {
+               return;
        }
-       disable_interrupts_noread();                    // Disable interrupts
-       ordered_store(&thread->machine.preemption_count, unsigned int, count);
-       if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
+
+       /* If interrupts are masked, we can't take an AST here */
+       state = get_interrupts();
+       if ((state & INTERRUPT_MASK) == 0) {
+               disable_interrupts_noread();                    // Disable interrupts
+
+               /*
+                * Reload cpu_data_ptr: a context switch would cause it to change.
+                * Now that interrupts are disabled, this will debounce false positives.
+                */
+               cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
+               if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
 #if __arm__
 #if __ARM_USER_PROTECT__
-               uintptr_t up = arm_user_protect_begin(thread);
+                       uintptr_t up = arm_user_protect_begin(thread);
 #endif  // __ARM_USER_PROTECT__
-               enable_fiq();
+                       enable_fiq();
 #endif  // __arm__
-               ast_taken_kernel();                     // Handle urgent AST
+                       ast_taken_kernel();                 // Handle urgent AST
 #if __arm__
 #if __ARM_USER_PROTECT__
-               arm_user_protect_end(thread, up, TRUE);
+                       arm_user_protect_end(thread, up, TRUE);
 #endif  // __ARM_USER_PROTECT__
-               enable_interrupts();
-               return;                                 // Return early on arm only due to FIQ enabling
+                       enable_interrupts();
+                       return;                             // Return early on arm only due to FIQ enabling
 #endif  // __arm__
-       }
-       restore_interrupts(state);                      // Enable interrupts
-       return;
-
-update_count:
-       ordered_store(&thread->machine.preemption_count, unsigned int, count);
-       return;
-}
-
-int
-get_preemption_level(void)
-{
-       return current_thread()->machine.preemption_count;
-}
-
-#if     __SMP__
-static unsigned int
-hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp));
-#endif
-
-static inline unsigned int
-hw_lock_bit_to_internal(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp))
-{
-       unsigned int success = 0;
-       uint32_t        mask = (1 << bit);
-#if     !__SMP__
-       uint32_t        state;
-#endif
-
-#if     __SMP__
-       if (__improbable(!atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE))) {
-               success = hw_lock_bit_to_contended(lock, mask, timeout LCK_GRP_ARG(grp));
-       } else {
-               success = 1;
-       }
-#else   // __SMP__
-       (void)timeout;
-       state = ordered_load_bit(lock);
-       if (!(mask & state)) {
-               ordered_store_bit(lock, state | mask);
-               success = 1;
-       }
-#endif  // __SMP__
-
-       if (success) {
-               lck_grp_spin_update_held(lock LCK_GRP_ARG(grp));
-       }
-
-       return success;
-}
-
-unsigned
-int
-(hw_lock_bit_to)(hw_lock_bit_t * lock, unsigned int bit, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp))
-{
-       _disable_preemption();
-       return hw_lock_bit_to_internal(lock, bit, timeout LCK_GRP_ARG(grp));
-}
-
-#if     __SMP__
-static unsigned int NOINLINE
-hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp))
-{
-       uint64_t        end = 0;
-       int             i;
-#if CONFIG_DTRACE || LOCK_STATS
-       uint64_t begin = 0;
-       boolean_t stat_enabled = lck_grp_spin_spin_enabled(lock LCK_GRP_ARG(grp));
-#endif /* CONFIG_DTRACE || LOCK_STATS */
-
-#if LOCK_STATS || CONFIG_DTRACE
-       if (__improbable(stat_enabled)) {
-               begin = mach_absolute_time();
-       }
-#endif /* LOCK_STATS || CONFIG_DTRACE */
-       for (;;) {
-               for (i = 0; i < LOCK_SNOOP_SPINS; i++) {
-                       // Always load-exclusive before wfe
-                       // This grabs the monitor and wakes up on a release event
-                       if (atomic_test_and_set32(lock, mask, mask, memory_order_acquire, TRUE)) {
-                               goto end;
-                       }
-               }
-               if (end == 0) {
-                       end = ml_get_timebase() + timeout;
-               } else if (ml_get_timebase() >= end) {
-                       break;
                }
+               restore_interrupts(state);              // Restore interrupt state
        }
-       return 0;
-end:
-#if CONFIG_DTRACE || LOCK_STATS
-       if (__improbable(stat_enabled)) {
-               lck_grp_spin_update_spin(lock LCK_GRP_ARG(grp), mach_absolute_time() - begin);
-       }
-       lck_grp_spin_update_miss(lock LCK_GRP_ARG(grp));
-#endif /* CONFIG_DTRACE || LCK_GRP_STAT */
-
-       return 1;
 }
-#endif  // __SMP__
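
The removed hw_lock_bit_to_internal()/hw_lock_bit_to_contended() pair above implements a timed bit spinlock: an inline atomic test-and-set on the fast path, and a bounded spin loop on contention (on ARM the loop takes the load-exclusive monitor so WFE wakes on a release event). A rough portable sketch of the same shape in C11 atomics, with a simple retry budget standing in for the timebase deadline:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define LOCK_SNOOP_SPINS 100

    /* Try to set `mask` in *lock; true only if this call made the 0 -> 1 transition. */
    static bool
    bit_try_acquire(_Atomic uint32_t *lock, uint32_t mask)
    {
        uint32_t old = atomic_fetch_or_explicit(lock, mask, memory_order_acquire);
        return (old & mask) == 0;
    }

    /* Bounded spin standing in for the kernel's timebase deadline; true on success. */
    static bool
    bit_lock_with_budget(_Atomic uint32_t *lock, unsigned int bit, unsigned long budget)
    {
        uint32_t mask = 1u << bit;
        if (bit_try_acquire(lock, mask)) {
            return true;                            /* uncontended fast path */
        }
        while (budget-- > 0) {                      /* contended path */
            for (int i = 0; i < LOCK_SNOOP_SPINS; i++) {
                if (bit_try_acquire(lock, mask)) {
                    return true;
                }
            }
        }
        return false;                               /* caller panics on timeout */
    }

    static void
    bit_unlock(_Atomic uint32_t *lock, unsigned int bit)
    {
        atomic_fetch_and_explicit(lock, ~(1u << bit), memory_order_release);
    }
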
 
 void
-(hw_lock_bit)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp))
-{
-       if (hw_lock_bit_to(lock, bit, LOCK_PANIC_TIMEOUT, LCK_GRP_PROBEARG(grp))) {
-               return;
-       }
-#if     __SMP__
-       panic("hw_lock_bit(): timed out (%p)", lock);
-#else
-       panic("hw_lock_bit(): interlock held (%p)", lock);
-#endif
-}
-
-void
-(hw_lock_bit_nopreempt)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp))
-{
-       if (__improbable(get_preemption_level() == 0)) {
-               panic("Attempt to take no-preempt bitlock %p in preemptible context", lock);
-       }
-       if (hw_lock_bit_to_internal(lock, bit, LOCK_PANIC_TIMEOUT LCK_GRP_ARG(grp))) {
-               return;
-       }
-#if     __SMP__
-       panic("hw_lock_bit_nopreempt(): timed out (%p)", lock);
-#else
-       panic("hw_lock_bit_nopreempt(): interlock held (%p)", lock);
-#endif
-}
-
-unsigned
-int
-(hw_lock_bit_try)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp))
+_enable_preemption(void)
 {
-       uint32_t        mask = (1 << bit);
-#if     !__SMP__
-       uint32_t        state;
-#endif
-       boolean_t       success = FALSE;
+       thread_t     thread = current_thread();
+       unsigned int count  = thread->machine.preemption_count;
 
-       _disable_preemption();
-#if     __SMP__
-       // TODO: consider weak (non-looping) atomic test-and-set
-       success = atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE);
-#else
-       state = ordered_load_bit(lock);
-       if (!(mask & state)) {
-               ordered_store_bit(lock, state | mask);
-               success = TRUE;
-       }
-#endif  // __SMP__
-       if (!success) {
-               _enable_preemption();
+       if (__improbable(count == 0)) {
+               panic("Preemption count underflow");
        }
+       count -= 1;
 
-       if (success) {
-               lck_grp_spin_update_held(lock LCK_GRP_ARG(grp));
+       os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
+       if (count == 0) {
+               kernel_preempt_check(thread);
        }
-
-       return success;
-}
-
-static inline void
-hw_unlock_bit_internal(hw_lock_bit_t *lock, unsigned int bit)
-{
-       uint32_t        mask = (1 << bit);
-#if     !__SMP__
-       uint32_t        state;
-#endif
-
-#if     __SMP__
-       __c11_atomic_fetch_and((_Atomic uint32_t *)lock, ~mask, memory_order_release);
-       set_event();
-#else   // __SMP__
-       state = ordered_load_bit(lock);
-       ordered_store_bit(lock, state & ~mask);
-#endif  // __SMP__
-#if CONFIG_DTRACE
-       LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, bit);
-#endif
-}
-
-/*
- *     Routine:        hw_unlock_bit
- *
- *             Release spin-lock. The second parameter is the bit number to test and set.
- *             Decrement the preemption level.
- */
-void
-hw_unlock_bit(hw_lock_bit_t * lock, unsigned int bit)
-{
-       hw_unlock_bit_internal(lock, bit);
-       _enable_preemption();
 }
 
-void
-hw_unlock_bit_nopreempt(hw_lock_bit_t * lock, unsigned int bit)
+int
+get_preemption_level(void)
 {
-       if (__improbable(get_preemption_level() == 0)) {
-               panic("Attempt to release no-preempt bitlock %p in preemptible context", lock);
-       }
-       hw_unlock_bit_internal(lock, bit);
+       return current_thread()->machine.preemption_count;
 }
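
_enable_preemption() now splits the common decrement from the cold AST check: the new count is stored unconditionally, and kernel_preempt_check() runs only on the transition to zero. A minimal single-threaded sketch of the counting discipline, with stub names standing in for the per-thread kernel state:

    #include <assert.h>
    #include <stdio.h>

    static unsigned int preemption_count;

    static void
    kernel_preempt_check_stub(void)
    {
        /* The kernel tests cpu_pending_ast for AST_URGENT here. */
        printf("count hit zero: checking for an urgent AST\n");
    }

    static void
    disable_preemption(void)
    {
        preemption_count++;
    }

    static void
    enable_preemption(void)
    {
        assert(preemption_count > 0 && "preemption count underflow");
        if (--preemption_count == 0) {
            kernel_preempt_check_stub();    /* only on the outermost enable */
        }
    }

    int
    main(void)
    {
        disable_preemption();
        disable_preemption();   /* disable/enable pairs nest */
        enable_preemption();    /* 2 -> 1: no check */
        enable_preemption();    /* 1 -> 0: check runs */
        return 0;
    }
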
 
 #if __SMP__
@@ -618,11 +492,12 @@ lck_spin_init(
        lck_grp_t * grp,
        __unused lck_attr_t * attr)
 {
-       hw_lock_init(&lck->hwlock);
        lck->type = LCK_SPIN_TYPE;
-       lck_grp_reference(grp);
-       lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
-       store_memory_barrier();
+       hw_lock_init(&lck->hwlock);
+       if (grp) {
+               lck_grp_reference(grp);
+               lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
+       }
 }
 
 /*
@@ -633,7 +508,6 @@ arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
 {
        lck->type = LCK_SPIN_TYPE;
        hw_lock_init(&lck->hwlock);
-       store_memory_barrier();
 }
 
 
@@ -767,8 +641,10 @@ lck_spin_destroy(
                return;
        }
        lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED;
-       lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
-       lck_grp_deallocate(grp);
+       if (grp) {
+               lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
+               lck_grp_deallocate(grp);
+       }
 }
 
 /*
@@ -794,12 +670,7 @@ usimple_lock_init(
        usimple_lock_t l,
        unsigned short tag)
 {
-#ifndef MACHINE_SIMPLE_LOCK
-       USLDBG(usld_lock_init(l, tag));
-       hw_lock_init(&l->lck_spin_data);
-#else
        simple_lock_init((simple_lock_t) l, tag);
-#endif
 }
 
 
@@ -815,21 +686,7 @@ void
        usimple_lock_t l
        LCK_GRP_ARG(lck_grp_t *grp))
 {
-#ifndef MACHINE_SIMPLE_LOCK
-       pc_t            pc;
-
-       OBTAIN_PC(pc, l);
-       USLDBG(usld_lock_pre(l, pc));
-
-       if (!hw_lock_to(&l->lck_spin_data, LockTimeOut, LCK_GRP_ARG(grp))) {      /* Try to get the lock
-                                                                                  * with a timeout */
-               panic("simple lock deadlock detection - l=%p, cpu=%d, ret=%p", &l, cpu_number(), pc);
-       }
-
-       USLDBG(usld_lock_post(l, pc));
-#else
        simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp));
-#endif
 }
 
 
@@ -846,16 +703,7 @@ void
 (usimple_unlock)(
        usimple_lock_t l)
 {
-#ifndef MACHINE_SIMPLE_LOCK
-       pc_t            pc;
-
-       OBTAIN_PC(pc, l);
-       USLDBG(usld_unlock(l, pc));
-       sync();
-       hw_lock_unlock(&l->lck_spin_data);
-#else
        simple_unlock((simple_lock_t)l);
-#endif
 }
 
 
@@ -877,299 +725,9 @@ int
        usimple_lock_t l
        LCK_GRP_ARG(lck_grp_t *grp))
 {
-#ifndef MACHINE_SIMPLE_LOCK
-       pc_t            pc;
-       unsigned int    success;
-
-       OBTAIN_PC(pc, l);
-       USLDBG(usld_lock_try_pre(l, pc));
-       if ((success = hw_lock_try(&l->lck_spin_data LCK_GRP_ARG(grp)))) {
-               USLDBG(usld_lock_try_post(l, pc));
-       }
-       return success;
-#else
        return simple_lock_try((simple_lock_t) l, grp);
-#endif
-}
-
-#if     USLOCK_DEBUG
-/*
- *     States of a usimple_lock.  The default when initializing
- *     a usimple_lock is setting it up for debug checking.
- */
-#define USLOCK_CHECKED          0x0001  /* lock is being checked */
-#define USLOCK_TAKEN            0x0002  /* lock has been taken */
-#define USLOCK_INIT             0xBAA0  /* lock has been initialized */
-#define USLOCK_INITIALIZED      (USLOCK_INIT|USLOCK_CHECKED)
-#define USLOCK_CHECKING(l)      (uslock_check &&                        \
-                                ((l)->debug.state & USLOCK_CHECKED))
-
-/*
- *     Trace activities of a particularly interesting lock.
- */
-void            usl_trace(usimple_lock_t, int, pc_t, const char *);
-
-
-/*
- *     Initialize the debugging information contained
- *     in a usimple_lock.
- */
-void
-usld_lock_init(
-       usimple_lock_t l,
-       __unused unsigned short tag)
-{
-       if (l == USIMPLE_LOCK_NULL) {
-               panic("lock initialization:  null lock pointer");
-       }
-       l->lock_type = USLOCK_TAG;
-       l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
-       l->debug.lock_cpu = l->debug.unlock_cpu = 0;
-       l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
-       l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
-       l->debug.duration[0] = l->debug.duration[1] = 0;
-       l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
-       l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
-       l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
-}
-
-
-/*
- *     These checks apply to all usimple_locks, not just
- *     those with USLOCK_CHECKED turned on.
- */
-int
-usld_lock_common_checks(
-       usimple_lock_t l,
-       const char *caller)
-{
-       if (l == USIMPLE_LOCK_NULL) {
-               panic("%s:  null lock pointer", caller);
-       }
-       if (l->lock_type != USLOCK_TAG) {
-               panic("%s:  0x%x is not a usimple lock", caller, (integer_t) l);
-       }
-       if (!(l->debug.state & USLOCK_INIT)) {
-               panic("%s:  0x%x is not an initialized lock",
-                   caller, (integer_t) l);
-       }
-       return USLOCK_CHECKING(l);
-}
-
-
-/*
- *     Debug checks on a usimple_lock just before attempting
- *     to acquire it.
- */
-/* ARGSUSED */
-void
-usld_lock_pre(
-       usimple_lock_t l,
-       pc_t pc)
-{
-       const char     *caller = "usimple_lock";
-
-
-       if (!usld_lock_common_checks(l, caller)) {
-               return;
-       }
-
-       /*
-        *      Note that we have a weird case where we are getting a lock when we are
-        *      in the process of putting the system to sleep. We are running with no
-        *      current threads, therefore we can't tell if we are trying to retake a lock
-        *      we have or someone on the other processor has it.  Therefore we just
-        *      ignore this test if the locking thread is 0.
-        */
-
-       if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
-           l->debug.lock_thread == (void *) current_thread()) {
-               printf("%s:  lock 0x%x already locked (at %p) by",
-                   caller, (integer_t) l, l->debug.lock_pc);
-               printf(" current thread %p (new attempt at pc %p)\n",
-                   l->debug.lock_thread, pc);
-               panic("%s", caller);
-       }
-       mp_disable_preemption();
-       usl_trace(l, cpu_number(), pc, caller);
-       mp_enable_preemption();
-}
-
-
-/*
- *     Debug checks on a usimple_lock just after acquiring it.
- *
- *     Pre-emption has been disabled at this point,
- *     so we are safe in using cpu_number.
- */
-void
-usld_lock_post(
-       usimple_lock_t l,
-       pc_t pc)
-{
-       int             mycpu;
-       const char     *caller = "successful usimple_lock";
-
-
-       if (!usld_lock_common_checks(l, caller)) {
-               return;
-       }
-
-       if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
-               panic("%s:  lock 0x%x became uninitialized",
-                   caller, (integer_t) l);
-       }
-       if ((l->debug.state & USLOCK_TAKEN)) {
-               panic("%s:  lock 0x%x became TAKEN by someone else",
-                   caller, (integer_t) l);
-       }
-
-       mycpu = cpu_number();
-       l->debug.lock_thread = (void *) current_thread();
-       l->debug.state |= USLOCK_TAKEN;
-       l->debug.lock_pc = pc;
-       l->debug.lock_cpu = mycpu;
-
-       usl_trace(l, mycpu, pc, caller);
-}
-
-
-/*
- *     Debug checks on a usimple_lock just before
- *     releasing it.  Note that the caller has not
- *     yet released the hardware lock.
- *
- *     Preemption is still disabled, so there's
- *     no problem using cpu_number.
- */
-void
-usld_unlock(
-       usimple_lock_t l,
-       pc_t pc)
-{
-       int             mycpu;
-       const char     *caller = "usimple_unlock";
-
-
-       if (!usld_lock_common_checks(l, caller)) {
-               return;
-       }
-
-       mycpu = cpu_number();
-
-       if (!(l->debug.state & USLOCK_TAKEN)) {
-               panic("%s:  lock 0x%x hasn't been taken",
-                   caller, (integer_t) l);
-       }
-       if (l->debug.lock_thread != (void *) current_thread()) {
-               panic("%s:  unlocking lock 0x%x, owned by thread %p",
-                   caller, (integer_t) l, l->debug.lock_thread);
-       }
-       if (l->debug.lock_cpu != mycpu) {
-               printf("%s:  unlocking lock 0x%x on cpu 0x%x",
-                   caller, (integer_t) l, mycpu);
-               printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
-               panic("%s", caller);
-       }
-       usl_trace(l, mycpu, pc, caller);
-
-       l->debug.unlock_thread = l->debug.lock_thread;
-       l->debug.lock_thread = INVALID_PC;
-       l->debug.state &= ~USLOCK_TAKEN;
-       l->debug.unlock_pc = pc;
-       l->debug.unlock_cpu = mycpu;
 }
 
-
-/*
- *     Debug checks on a usimple_lock just before
- *     attempting to acquire it.
- *
- *     Preemption isn't guaranteed to be disabled.
- */
-void
-usld_lock_try_pre(
-       usimple_lock_t l,
-       pc_t pc)
-{
-       const char     *caller = "usimple_lock_try";
-
-       if (!usld_lock_common_checks(l, caller)) {
-               return;
-       }
-       mp_disable_preemption();
-       usl_trace(l, cpu_number(), pc, caller);
-       mp_enable_preemption();
-}
-
-
-/*
- *     Debug checks on a usimple_lock just after
- *     successfully attempting to acquire it.
- *
- *     Preemption has been disabled by the
- *     lock acquisition attempt, so it's safe
- *     to use cpu_number.
- */
-void
-usld_lock_try_post(
-       usimple_lock_t l,
-       pc_t pc)
-{
-       int             mycpu;
-       const char     *caller = "successful usimple_lock_try";
-
-       if (!usld_lock_common_checks(l, caller)) {
-               return;
-       }
-
-       if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
-               panic("%s:  lock 0x%x became uninitialized",
-                   caller, (integer_t) l);
-       }
-       if ((l->debug.state & USLOCK_TAKEN)) {
-               panic("%s:  lock 0x%x became TAKEN by someone else",
-                   caller, (integer_t) l);
-       }
-
-       mycpu = cpu_number();
-       l->debug.lock_thread = (void *) current_thread();
-       l->debug.state |= USLOCK_TAKEN;
-       l->debug.lock_pc = pc;
-       l->debug.lock_cpu = mycpu;
-
-       usl_trace(l, mycpu, pc, caller);
-}
-
-
-/*
- *     For very special cases, set traced_lock to point to a
- *     specific lock of interest.  The result is a series of
- *     XPRs showing lock operations on that lock.  The lock_seq
- *     value is used to show the order of those operations.
- */
-usimple_lock_t  traced_lock;
-unsigned int    lock_seq;
-
-void
-usl_trace(
-       usimple_lock_t l,
-       int mycpu,
-       pc_t pc,
-       const char *op_name)
-{
-       if (traced_lock == l) {
-               XPR(XPR_SLOCK,
-                   "seq %d, cpu %d, %s @ %x\n",
-                   (integer_t) lock_seq, (integer_t) mycpu,
-                   (integer_t) op_name, (integer_t) pc, 0);
-               lock_seq++;
-       }
-}
-
-
-#endif                          /* USLOCK_DEBUG */
-
 /*
  * The C portion of the shared/exclusive locks package.
  */
@@ -1225,13 +783,13 @@ lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unuse
                if (wait) {
                        wait_for_event();
                } else {
-                       clear_exclusive();
+                       os_atomic_clear_exclusive();
                }
                if (!wait || (mach_absolute_time() >= deadline)) {
                        return FALSE;
                }
        }
-       clear_exclusive();
+       os_atomic_clear_exclusive();
        return TRUE;
 #else
        uint32_t        data;
@@ -1259,7 +817,7 @@ lck_rw_interlock_spin(lck_rw_t *lock)
                if (data & LCK_RW_INTERLOCK) {
                        wait_for_event();
                } else {
-                       clear_exclusive();
+                       os_atomic_clear_exclusive();
                        return;
                }
        }
@@ -1495,6 +1053,8 @@ lck_rw_lock_shared(lck_rw_t *lock)
 
 /*
  *     Routine:        lck_rw_lock_shared_to_exclusive
+ *
+ *     Returns FALSE on failure; in that case the shared lock has been dropped.
  */
 boolean_t
 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
@@ -2505,7 +2065,6 @@ lck_mtx_init(
        {
                lck->lck_mtx_ptr = NULL;                // Clear any padding in the union fields below
                lck->lck_mtx_waiters = 0;
-               lck->lck_mtx_pri = 0;
                lck->lck_mtx_type = LCK_MTX_TYPE;
                ordered_store_mtx(lck, 0);
        }
@@ -2538,7 +2097,6 @@ lck_mtx_init_ext(
                lck->lck_mtx_type = LCK_MTX_TYPE;
        } else {
                lck->lck_mtx_waiters = 0;
-               lck->lck_mtx_pri = 0;
                lck->lck_mtx_type = LCK_MTX_TYPE;
                ordered_store_mtx(lck, 0);
        }
@@ -2627,8 +2185,8 @@ lck_mtx_lock(lck_mtx_t *lock)
        lck_mtx_verify(lock);
        lck_mtx_check_preemption(lock);
        thread = current_thread();
-       if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread),
-           memory_order_acquire_smp, FALSE)) {
+       if (os_atomic_cmpxchg(&lock->lck_mtx_data,
+           0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
 #if     CONFIG_DTRACE
                LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
 #endif /* CONFIG_DTRACE */
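
The fast path above is now a compare-and-swap from 0 (unowned) to the owner-encoded thread state with acquire ordering; unlock later performs the reverse CAS with release ordering, and either CAS failing falls into the contended slow path. A user-space sketch in C11 atomics, with a hypothetical owner encoding:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef struct {
        _Atomic uintptr_t lck_mtx_data;     /* 0 == unowned */
    } mtx_t;

    static bool
    mtx_try_fast_lock(mtx_t *lock, uintptr_t owner_state)
    {
        uintptr_t expected = 0;
        /* acquire on success, like os_atomic_cmpxchg(..., acquire) */
        return atomic_compare_exchange_strong_explicit(&lock->lck_mtx_data,
            &expected, owner_state, memory_order_acquire, memory_order_relaxed);
    }

    static bool
    mtx_try_fast_unlock(mtx_t *lock, uintptr_t owner_state)
    {
        uintptr_t expected = owner_state;   /* owned by us, no waiters bit set */
        /* release on success, like os_atomic_cmpxchg(..., release) */
        return atomic_compare_exchange_strong_explicit(&lock->lck_mtx_data,
            &expected, 0, memory_order_release, memory_order_relaxed);
    }
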
@@ -2647,6 +2205,7 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
        uintptr_t               state;
        int                     waiters = 0;
        spinwait_result_t       sw_res;
+       struct turnstile        *ts = NULL;
 
        /* Loop waiting until I see that the mutex is unowned */
        for (;;) {
@@ -2655,6 +2214,11 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
 
                switch (sw_res) {
                case SPINWAIT_ACQUIRED:
+                       if (ts != NULL) {
+                               interlock_lock(lock);
+                               turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
+                               interlock_unlock(lock);
+                       }
                        goto done;
                case SPINWAIT_INTERLOCK:
                        goto set_owner;
@@ -2668,7 +2232,7 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
                        break;
                }
                ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
-               lck_mtx_lock_wait(lock, holding_thread);
+               lck_mtx_lock_wait(lock, holding_thread, &ts);
                /* returns interlock unlocked */
        }
 
@@ -2678,7 +2242,15 @@ set_owner:
 
        if (state & ARM_LCK_WAITERS) {
                /* Skip lck_mtx_lock_acquire if there are no waiters. */
-               waiters = lck_mtx_lock_acquire(lock);
+               waiters = lck_mtx_lock_acquire(lock, ts);
+               /*
+                * lck_mtx_lock_acquire will call
+                * turnstile_complete
+                */
+       } else {
+               if (ts != NULL) {
+                       turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
+               }
        }
 
        state = LCK_MTX_THREAD_TO_STATE(thread);
@@ -2697,6 +2269,12 @@ set_owner:
 done:
        load_memory_barrier();
 
+       assert(thread->turnstile != NULL);
+
+       if (ts != NULL) {
+               turnstile_cleanup();
+       }
+
 #if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
 #endif /* CONFIG_DTRACE */
@@ -2918,8 +2496,8 @@ lck_mtx_try_lock(lck_mtx_t *lock)
        thread_t        thread = current_thread();
 
        lck_mtx_verify(lock);
-       if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread),
-           memory_order_acquire_smp, FALSE)) {
+       if (os_atomic_cmpxchg(&lock->lck_mtx_data,
+           0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
 #if     CONFIG_DTRACE
                LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
 #endif /* CONFIG_DTRACE */
@@ -2957,7 +2535,7 @@ lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
        state |= LCK_ILOCK;
        ordered_store_mtx(lock, state);
 #endif  // __SMP__
-       waiters = lck_mtx_lock_acquire(lock);
+       waiters = lck_mtx_lock_acquire(lock, NULL);
        state = LCK_MTX_THREAD_TO_STATE(thread);
        if (waiters != 0) {
                state |= ARM_LCK_WAITERS;
@@ -2971,6 +2549,9 @@ lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
        enable_preemption();
 #endif
        load_memory_barrier();
+
+       turnstile_cleanup();
+
        return TRUE;
 }
 
@@ -3046,8 +2627,8 @@ lck_mtx_unlock(lck_mtx_t *lock)
                goto slow_case;
        }
        // Locked as a mutex
-       if (atomic_compare_exchange(&lock->lck_mtx_data, LCK_MTX_THREAD_TO_STATE(thread), 0,
-           memory_order_release_smp, FALSE)) {
+       if (os_atomic_cmpxchg(&lock->lck_mtx_data,
+           LCK_MTX_THREAD_TO_STATE(thread), 0, release)) {
 #if     CONFIG_DTRACE
                LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
 #endif /* CONFIG_DTRACE */
@@ -3061,6 +2642,7 @@ static void NOINLINE
 lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
 {
        uintptr_t       state;
+       boolean_t               cleanup = FALSE;
 
        if (ilk_held) {
                state = ordered_load_mtx(lock);
@@ -3084,13 +2666,17 @@ lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
                ordered_store_mtx(lock, state);
 #endif
                if (state & ARM_LCK_WAITERS) {
-                       lck_mtx_unlock_wakeup(lock, thread);
-                       state = ordered_load_mtx(lock);
-               } else {
-                       assertf(lock->lck_mtx_pri == 0, "pri=0x%x", lock->lck_mtx_pri);
+                       if (lck_mtx_unlock_wakeup(lock, thread)) {
+                               state = ARM_LCK_WAITERS;
+                       } else {
+                               state = 0;
+                       }
+                       cleanup = TRUE;
+                       goto unlock;
                }
        }
        state &= ARM_LCK_WAITERS;   /* Clear state, retain waiters bit */
+unlock:
 #if __SMP__
        state |= LCK_ILOCK;
        ordered_store_mtx(lock, state);
@@ -3099,6 +2685,16 @@ lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
        ordered_store_mtx(lock, state);
        enable_preemption();
 #endif
+       if (cleanup) {
+               /*
+                * Do not do any turnstile operations outside of this block.
+                * lock/unlock may be called at an early stage of boot, single-threaded,
+                * before the turnstile subsystem has been initialized.
+                * Even without contention we can come through the slow path
+                * if the mutex is acquired as a spin lock.
+                */
+               turnstile_cleanup();
+       }
 
 #if     CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
@@ -3165,7 +2761,7 @@ lck_mtx_convert_spin(lck_mtx_t *lock)
        }
        state &= ~(LCK_MTX_THREAD_MASK);                // Clear the spin tag
        ordered_store_mtx(lock, state);
-       waiters = lck_mtx_lock_acquire(lock);   // Acquire to manage priority boosts
+       waiters = lck_mtx_lock_acquire(lock, NULL);   // Acquire to manage priority boosts
        state = LCK_MTX_THREAD_TO_STATE(thread);
        if (waiters != 0) {
                state |= ARM_LCK_WAITERS;
@@ -3178,6 +2774,7 @@ lck_mtx_convert_spin(lck_mtx_t *lock)
        ordered_store_mtx(lock, state);                 // Set ownership
        enable_preemption();
 #endif
+       turnstile_cleanup();
 }
 
 
@@ -3232,13 +2829,8 @@ lck_spin_assert(lck_spin_t *lock, unsigned int type)
                if (holder != 0) {
                        if (holder == thread) {
                                panic("Lock owned by current thread %p = %lx", lock, state);
-                       } else {
-                               panic("Lock %p owned by thread %p", lock, holder);
                        }
                }
-               if (state & LCK_ILOCK) {
-                       panic("Lock bit set %p = %lx", lock, state);
-               }
        } else {
                panic("lck_spin_assert(): invalid arg (%u)", type);
        }
index ce41150c18042a699d0468a9bffd4dc572b39937..1a544b0d8c2545d55a0e702e1e9221fd5819f6f2 100644 (file)
@@ -720,11 +720,6 @@ icache_invalidate_trap:
        dsb             ish
        isb
 #endif
-       mov             r4, r0
-       mov             r5, r1
-       bl              EXT(CleanPoU_DcacheRegion)
-       mov             r0, r4
-       mov             r1, r5
        bl              EXT(InvalidatePoU_IcacheRegion)
        mrc             p15, 0, r9, c13, c0, 4                          // Reload r9 from TPIDRPRW
 #if __ARM_USER_PROTECT__
@@ -1354,15 +1349,14 @@ fleh_irq_handler:
        mrc             p15, 0, r9, c13, c0, 4                          // Reload r9 from TPIDRPRW
        bl              EXT(ml_get_timebase)                            // get current timebase
        LOAD_ADDR(r3, EntropyData)
-       ldr             r2, [r3, ENTROPY_INDEX_PTR]
-       add             r1, r3, ENTROPY_DATA_SIZE
-       add             r2, r2, #4
-       cmp             r2, r1
-       addge   r2, r3, ENTROPY_BUFFER
-       ldr             r4, [r2]
-       eor             r0, r0, r4, ROR #9
-       str             r0, [r2]                                                        // Update gEntropie
-       str             r2, [r3, ENTROPY_INDEX_PTR]
+       ldr             r2, [r3, ENTROPY_SAMPLE_COUNT]
+       add             r1, r2, 1
+       str             r1, [r3, ENTROPY_SAMPLE_COUNT]
+       and             r2, r2, ENTROPY_BUFFER_INDEX_MASK
+       add             r1, r3, ENTROPY_BUFFER
+       ldr             r4, [r1, r2, lsl #2]
+       eor             r0, r0, r4, ror #9
+       str             r0, [r1, r2, lsl #2]                            // Update entropy word
 
 return_from_irq:
        mov             r5, #0
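
The rewritten IRQ entropy path drops the bump-and-wrap index pointer in favor of a free-running sample counter masked into a power-of-two ring. An equivalent C model of the new assembly; the buffer size here is hypothetical, and the ror #9 mixing step is taken from the code above:

    #include <stdint.h>

    #define ENTROPY_BUFFER_COUNT      64u   /* hypothetical; must be a power of two */
    #define ENTROPY_INDEX_MASK        (ENTROPY_BUFFER_COUNT - 1)

    struct entropy_data {
        uint32_t sample_count;              /* free-running, never wrapped manually */
        uint32_t buffer[ENTROPY_BUFFER_COUNT];
    };

    /* Mix one timebase sample into the ring, as the fleh_irq path now does. */
    static void
    entropy_sample(struct entropy_data *e, uint32_t timebase)
    {
        uint32_t idx = e->sample_count++ & ENTROPY_INDEX_MASK;
        uint32_t old = e->buffer[idx];

        /* eor r0, r0, r4, ror #9: XOR with the old word rotated right by 9 */
        e->buffer[idx] = timebase ^ ((old >> 9) | (old << 23));
    }
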
index 883f8a1eaf2eb86fbe9e516778788d37805d7728..cd9ff90216b70eae27c30ec6a657253953814e40 100644 (file)
 
 #define INT_SIZE        (BYTE_SIZE * sizeof (int))
 
+/* machine_routines_asm.s calls these */
+extern int copyin_validate(const user_addr_t, uintptr_t, vm_size_t);
+extern int copyin_user_validate(const user_addr_t, uintptr_t, vm_size_t);
+extern int copyout_validate(uintptr_t, const user_addr_t, vm_size_t);
+extern int copyio_user_validate(int, int, user_addr_t, vm_size_t);
+extern int copyoutstr_prevalidate(const void *, user_addr_t, size_t);
 
 void
 bcopy_phys(addr64_t src, addr64_t dst, vm_size_t bytes)
@@ -572,6 +578,36 @@ copypv(addr64_t source, addr64_t sink, unsigned int size, int which)
  */
 const int copysize_limit_panic = (64 * 1024 * 1024);
 
+static inline bool
+is_kernel_to_kernel_copy(void)
+{
+       return current_thread()->map->pmap == kernel_pmap;
+}
+
+static int
+copy_validate_user(const user_addr_t user_addr, vm_size_t nbytes, bool kern_to_kern_allowed)
+{
+       user_addr_t user_addr_last = user_addr + nbytes;
+       thread_t self = current_thread();
+
+       if (__improbable(!kern_to_kern_allowed && is_kernel_to_kernel_copy())) {
+               return EFAULT;
+       }
+
+       if (__improbable((user_addr_last < user_addr) ||
+           ((user_addr + nbytes) > vm_map_max(self->map)) ||
+           (user_addr < vm_map_min(self->map)))) {
+               return EFAULT;
+       }
+
+       if (__improbable(nbytes > copysize_limit_panic)) {
+               panic("%s(%p, ..., %u) - transfer too large", __func__,
+                   (void *)user_addr, nbytes);
+       }
+
+       return 0;
+}
+
 /*
  * Validate the arguments to copy{in,out} on this platform.
  *
@@ -581,7 +617,7 @@ const int copysize_limit_panic = (64 * 1024 * 1024);
  */
 static int
 copy_validate(const user_addr_t user_addr,
-    uintptr_t kernel_addr, vm_size_t nbytes)
+    uintptr_t kernel_addr, vm_size_t nbytes, bool kern_to_kern_allowed)
 {
        uintptr_t kernel_addr_last = kernel_addr + nbytes;
 
@@ -593,31 +629,42 @@ copy_validate(const user_addr_t user_addr,
                    (void *)user_addr, (void *)kernel_addr, nbytes);
        }
 
-       user_addr_t user_addr_last = user_addr + nbytes;
+       return copy_validate_user(user_addr, nbytes, kern_to_kern_allowed);
+}
 
-       if (__improbable((user_addr_last < user_addr) || ((user_addr + nbytes) > vm_map_max(current_thread()->map)) ||
-           (user_addr < vm_map_min(current_thread()->map)))) {
-               return EFAULT;
-       }
+int
+copyin_validate(const user_addr_t ua, uintptr_t ka, vm_size_t nbytes)
+{
+       return copy_validate(ua, ka, nbytes, true);
+}
 
-       if (__improbable(nbytes > copysize_limit_panic)) {
-               panic("%s(%p, %p, %u) - transfer too large", __func__,
-                   (void *)user_addr, (void *)kernel_addr, nbytes);
-       }
+int
+copyin_user_validate(const user_addr_t ua, uintptr_t ka, vm_size_t nbytes)
+{
+       return copy_validate(ua, ka, nbytes, false);
+}
 
-       return 0;
+int
+copyout_validate(uintptr_t ka, const user_addr_t ua, vm_size_t nbytes)
+{
+       return copy_validate(ua, ka, nbytes, true);
 }
 
 int
-copyin_validate(const user_addr_t ua, uintptr_t ka, vm_size_t nbytes)
+copyio_user_validate(int a __unused, int b __unused,
+    user_addr_t ua, vm_size_t nbytes)
 {
-       return copy_validate(ua, ka, nbytes);
+       return copy_validate_user(ua, nbytes, false);
 }
 
 int
-copyout_validate(uintptr_t ka, const user_addr_t ua, vm_size_t nbytes)
+copyoutstr_prevalidate(const void *__unused kaddr, user_addr_t __unused uaddr, size_t __unused len)
 {
-       return copy_validate(ua, ka, nbytes);
+       if (__improbable(is_kernel_to_kernel_copy())) {
+               return EFAULT;
+       }
+
+       return 0;
 }
 
 #if     MACH_ASSERT
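
copy_validate() now delegates the user-side checks to copy_validate_user(), so entry points without a kernel buffer (the atomic copyio variants) can validate only the user range, and kernel-to-kernel copies can be selectively rejected. The range check is overflow-safe: a wrapped user_addr + nbytes is rejected before the map-bounds comparison. A self-contained model of that check, with hypothetical VA limits standing in for vm_map_min()/vm_map_max():

    #include <errno.h>
    #include <stdint.h>

    typedef uint64_t user_addr_t;

    /* Hypothetical user VA window standing in for vm_map_min()/vm_map_max(). */
    #define USER_VA_MIN 0x0000000000001000ull
    #define USER_VA_MAX 0x0000000fc0000000ull

    static int
    validate_user_range(user_addr_t user_addr, uint64_t nbytes)
    {
        user_addr_t user_addr_last = user_addr + nbytes;

        if (user_addr_last < user_addr ||   /* addition wrapped: reject first */
            user_addr_last > USER_VA_MAX ||
            user_addr < USER_VA_MIN) {
            return EFAULT;
        }
        return 0;
    }
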
index b79253632d0393a6d0b0740a49b26cc7be9a8d3e..a29074a2cf9860f610a403e9f746c2ef433294d0 100644 (file)
@@ -147,6 +147,9 @@ machine_do_mvfpid()
 #else
        cpuid_mvfp_info.neon = 1;
        cpuid_mvfp_info.neon_hpfp = 1;
+#if defined(__ARM_ARCH_8_2__)
+       cpuid_mvfp_info.neon_fp16 = 1;
+#endif /* defined(__ARM_ARCH_8_2__) */
 #endif /* __arm__ */
 }
 
index f201ddcc87b4dd0549c8e4cdd16afdacc8b48146..df89b75005c065994c113a1304d5c9042d7d4aa3 100644 (file)
@@ -50,6 +50,7 @@
 #include <kern/coalition.h>
 #include <pexpert/device_tree.h>
 #include <arm/cpuid_internal.h>
+#include <arm/cpu_capabilities.h>
 
 #include <IOKit/IOPlatformExpert.h>
 
@@ -69,7 +70,9 @@ uint64_t TLockTimeOut;
 uint64_t MutexSpin;
 boolean_t is_clock_configured = FALSE;
 
+#if CONFIG_NONFATAL_ASSERTS
 extern int mach_assert;
+#endif
 extern volatile uint32_t debug_enabled;
 
 void machine_conf(void);
@@ -79,7 +82,9 @@ machine_startup(__unused boot_args * args)
 {
        int boot_arg;
 
+#if CONFIG_NONFATAL_ASSERTS
        PE_parse_boot_argn("assert", &mach_assert, sizeof(mach_assert));
+#endif
 
        if (PE_parse_boot_argn("preempt", &boot_arg, sizeof(boot_arg))) {
                default_preemption_rate = boot_arg;
@@ -222,8 +227,8 @@ ml_init_lock_timeout(void)
 void
 ml_cpu_up(void)
 {
-       hw_atomic_add(&machine_info.physical_cpu, 1);
-       hw_atomic_add(&machine_info.logical_cpu, 1);
+       os_atomic_inc(&machine_info.physical_cpu, relaxed);
+       os_atomic_inc(&machine_info.logical_cpu, relaxed);
 }
 
 /*
@@ -235,8 +240,8 @@ ml_cpu_down(void)
 {
        cpu_data_t      *cpu_data_ptr;
 
-       hw_atomic_sub(&machine_info.physical_cpu, 1);
-       hw_atomic_sub(&machine_info.logical_cpu, 1);
+       os_atomic_dec(&machine_info.physical_cpu, relaxed);
+       os_atomic_dec(&machine_info.logical_cpu, relaxed);
 
        /*
         * If we want to deal with outstanding IPIs, we need to
@@ -617,7 +622,7 @@ ml_processor_register(ml_processor_info_t *in_processor_info,
 #endif
 
        if (!is_boot_cpu) {
-               early_random_cpu_init(this_cpu_datap->cpu_number);
+               random_cpu_init(this_cpu_datap->cpu_number);
        }
 
        return KERN_SUCCESS;
@@ -693,6 +698,16 @@ ml_io_map(
        return io_map(phys_addr, size, VM_WIMG_IO);
 }
 
+/* Map memory-mapped IO space (with the specified protections) */
+vm_offset_t
+ml_io_map_with_prot(
+       vm_offset_t phys_addr,
+       vm_size_t size,
+       vm_prot_t prot)
+{
+       return io_map_with_prot(phys_addr, size, VM_WIMG_IO, prot);
+}
+
 vm_offset_t
 ml_io_map_wcomb(
        vm_offset_t phys_addr,
@@ -728,12 +743,28 @@ vm_offset_t
 ml_static_vtop(
        vm_offset_t vaddr)
 {
-       if (((vm_address_t)(vaddr) - gVirtBase) >= gPhysSize) {
-               panic("ml_static_ptovirt(): illegal vaddr: %p\n", (void*)vaddr);
-       }
+       assertf(((vm_address_t)(vaddr) - gVirtBase) < gPhysSize, "%s: illegal vaddr: %p", __func__, (void*)vaddr);
        return (vm_address_t)(vaddr) - gVirtBase + gPhysBase;
 }
 
+/*
+ * Return the maximum contiguous KVA range that can be accessed from this
+ * physical address.  For arm64, we employ a segmented physical aperture
+ * relocation table which can limit the available range for a given PA to
+ * something less than the extent of physical memory.  But here, we still
+ * have a flat physical aperture, so no such requirement exists.
+ */
+vm_map_address_t
+phystokv_range(pmap_paddr_t pa, vm_size_t *max_len)
+{
+       vm_size_t len = gPhysSize - (pa - gPhysBase);
+       if (*max_len > len) {
+               *max_len = len;
+       }
+       assertf((pa - gPhysBase) < gPhysSize, "%s: illegal PA: 0x%lx", __func__, (unsigned long)pa);
+       return pa - gPhysBase + gVirtBase;
+}
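
A minimal model of the clamp phystokv_range() performs, with hypothetical constants standing in for gPhysBase, gPhysSize, and gVirtBase:

    #include <stdint.h>

    #define PHYS_BASE 0x80000000ull             /* hypothetical gPhysBase */
    #define PHYS_SIZE 0x40000000ull             /* hypothetical gPhysSize: 1 GiB */
    #define VIRT_BASE 0xffffffe000000000ull     /* hypothetical gVirtBase */

    static uint64_t
    phystokv_range_model(uint64_t pa, uint64_t *max_len)
    {
        uint64_t len = PHYS_SIZE - (pa - PHYS_BASE);    /* bytes left in the aperture */

        if (*max_len > len) {
            *max_len = len;                             /* clamp to the contiguous remainder */
        }
        return pa - PHYS_BASE + VIRT_BASE;
    }
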
+
 vm_offset_t
 ml_static_slide(
        vm_offset_t vaddr)
@@ -811,9 +842,6 @@ ml_static_protect(
 
                        ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_NX_MASK)) | arm_prot;
                        *pte_p = ptmp;
-#ifndef  __ARM_L1_PTW__
-                       FlushPoC_DcacheRegion((vm_offset_t) pte_p, sizeof(*pte_p));
-#endif
                }
        }
 
@@ -1142,13 +1170,13 @@ user_cont_hwclock_allowed(void)
        return FALSE;
 }
 
-boolean_t
-user_timebase_allowed(void)
+uint8_t
+user_timebase_type(void)
 {
 #if __ARM_TIME__
-       return TRUE;
+       return USER_TIMEBASE_SPEC;
 #else
-       return FALSE;
+       return USER_TIMEBASE_NONE;
 #endif
 }
 
@@ -1156,7 +1184,7 @@ user_timebase_allowed(void)
  * The following are required for parts of the kernel
  * that cannot resolve these functions as inlines:
  */
-extern thread_t current_act(void);
+extern thread_t current_act(void) __attribute__((const));
 thread_t
 current_act(void)
 {
@@ -1164,7 +1192,7 @@ current_act(void)
 }
 
 #undef current_thread
-extern thread_t current_thread(void);
+extern thread_t current_thread(void) __attribute__((const));
 thread_t
 current_thread(void)
 {
index 545403eeed139cbf7f7b318aad933c9c544b079d..db581e897246ca1c34231b59040126de3f8c84c4 100644 (file)
@@ -446,6 +446,11 @@ vm_offset_t ml_io_map_wcomb(
        vm_offset_t phys_addr,
        vm_size_t size);
 
+vm_offset_t ml_io_map_with_prot(
+       vm_offset_t phys_addr,
+       vm_size_t size,
+       vm_prot_t prot);
+
 void ml_get_bouncepool_info(
        vm_offset_t *phys_addr,
        vm_size_t   *size);
@@ -514,6 +519,17 @@ void bzero_phys(
 
 void bzero_phys_nc(addr64_t src64, vm_size_t bytes);
 
+#if MACH_KERNEL_PRIVATE
+#ifdef __arm64__
+/* Pattern-fill a buffer with zeros or a 32-bit pattern;
+ * the target must be 128-byte aligned and its size a multiple of 128 bytes.
+ * Both variants emit stores with non-temporal properties.
+ */
+void fill32_dczva(addr64_t, vm_size_t);
+void fill32_nt(addr64_t, vm_size_t, uint32_t);
+#endif
+#endif
+
 void ml_thread_policy(
        thread_t thread,
        unsigned policy_id,
@@ -556,6 +572,14 @@ extern uint64_t ml_get_conttime_wake_time(void);
 /* Time since the system was reset (as part of boot/wake) */
 uint64_t ml_get_time_since_reset(void);
 
+/*
+ * Called by ApplePMGR to set wake time.  Units and epoch are identical
+ * to mach_continuous_time().  Has no effect on !HAS_CONTINUOUS_HWCLOCK
+ * chips.  If wake_time == UINT64_MAX, that means the wake time is
+ * unknown and calls to ml_get_time_since_reset() will return UINT64_MAX.
+ */
+void ml_set_reset_time(uint64_t wake_time);
+
 #ifdef XNU_KERNEL_PRIVATE
 /* Just a stub on ARM */
 extern kern_return_t ml_interrupt_prewarm(uint64_t deadline);
@@ -608,6 +632,8 @@ extern int      be_tracing(void);
 typedef void (*broadcastFunc) (void *);
 unsigned int cpu_broadcast_xcall(uint32_t *, boolean_t, broadcastFunc, void *);
 kern_return_t cpu_xcall(int, broadcastFunc, void *);
+unsigned int cpu_broadcast_immediate_xcall(uint32_t *, boolean_t, broadcastFunc, void *);
+kern_return_t cpu_immediate_xcall(int, broadcastFunc, void *);
 
 #ifdef  KERNEL_PRIVATE
 
@@ -932,6 +958,22 @@ typedef enum perfcontrol_callout_stat {
 uint64_t perfcontrol_callout_stat_avg(perfcontrol_callout_type_t type,
     perfcontrol_callout_stat_t stat);
 
+#if defined(HAS_APPLE_PAC)
+#define ONES(x) (BIT((x))-1)
+#define PTR_MASK ONES(64-T1SZ_BOOT)
+#define PAC_MASK ~PTR_MASK
+#define SIGN(p) ((p) & BIT(55))
+#define UNSIGN_PTR(p) \
+       SIGN(p) ? ((p) | PAC_MASK) : ((p) & ~PAC_MASK)
+
+void ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit);
+void ml_task_set_disable_user_jop(task_t task, boolean_t disable_user_jop);
+void ml_thread_set_disable_user_jop(thread_t thread, boolean_t disable_user_jop);
+void ml_set_kernelkey_enabled(boolean_t enable);
+void *ml_auth_ptr_unchecked(void *ptr, unsigned key, uint64_t modifier);
+#endif /* defined(HAS_APPLE_PAC) */
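
UNSIGN_PTR() strips PAC bits by sign-extending from bit 55: a pointer with bit 55 set (a kernel address) gets ones ORed into the non-address bits, anything else gets them cleared. A standalone demonstration, assuming a hypothetical T1SZ_BOOT of 25 (a 39-bit address space) and defining BIT() locally for self-containment:

    #include <stdint.h>
    #include <stdio.h>

    #define BIT(x)        (1ull << (x))
    #define ONES(x)       (BIT(x) - 1)
    #define T1SZ_BOOT     25                /* hypothetical: 39-bit address space */
    #define PTR_MASK      ONES(64 - T1SZ_BOOT)
    #define PAC_MASK      (~PTR_MASK)
    #define SIGN(p)       ((p) & BIT(55))
    #define UNSIGN_PTR(p) (SIGN(p) ? ((p) | PAC_MASK) : ((p) & ~PAC_MASK))

    int
    main(void)
    {
        /* A kernel pointer whose upper (PAC) bits were replaced by a signature. */
        uint64_t signed_ptr = 0x2bf0fffffe241ab8ull;

        /* Bit 55 is set, so the PAC bits are filled with ones: 0xfffffffffe241ab8 */
        printf("unsigned: 0x%llx\n", (unsigned long long)UNSIGN_PTR(signed_ptr));
        return 0;
    }
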
+
+
 
 #endif /* KERNEL_PRIVATE */
 
@@ -940,7 +982,7 @@ void ml_get_power_state(boolean_t *, boolean_t *);
 
 uint32_t get_arm_cpu_version(void);
 boolean_t user_cont_hwclock_allowed(void);
-boolean_t user_timebase_allowed(void);
+uint8_t user_timebase_type(void);
 boolean_t ml_thread_is64bit(thread_t thread);
 
 #ifdef __arm64__
index d175af88d1062c46537638d97c89c3a2cc31fb39..7b7f41411223e359360aed9da6b867b3c54dd0ea 100644 (file)
@@ -621,15 +621,21 @@ LEXT(set_context_id)
        isb
        bx              lr
 
-#define        COPYIO_VALIDATE(NAME)                                           \
-       /* call NAME_validate to check the arguments */                 ;\
-       push            {r0, r1, r2, r7, lr}                            ;\
-       add             r7, sp, #12                                     ;\
-       blx             EXT(NAME##_validate)                            ;\
-       cmp             r0, #0                                          ;\
-       addne           sp, #12                                         ;\
-       popne           {r7, pc}                                        ;\
-       pop             {r0, r1, r2, r7, lr}                            ;\
+/*
+ * arg0: prefix of the external validator function (copyin or copyout)
+ * arg1: 0-based index of highest argument register that must be preserved
+ */
+.macro COPYIO_VALIDATE
+       /* call NAME_validate to check the arguments */
+       push            {r0-r$1, r7, lr}
+       add             r7, sp, #(($1 + 1) * 4)
+       blx             EXT($0_validate)
+       cmp             r0, #0
+       addne           sp, #(($1 + 1) * 4)
+       popne           {r7, pc}
+       pop             {r0-r$1, r7, lr}
+.endmacro
+
 
 #define        COPYIO_SET_RECOVER()                                            \
        /* set recovery address */                                      ;\
@@ -735,7 +741,7 @@ LEXT(copyinstr)
        moveq           r12, #0
        streq           r12, [r3]
        bxeq            lr
-       COPYIO_VALIDATE(copyin)
+       COPYIO_VALIDATE copyin_user, 3
        stmfd   sp!, { r4, r5, r6 }
        
        mov             r6, r3
@@ -786,7 +792,7 @@ copyinstr_error:
        .globl EXT(copyin)
 LEXT(copyin)
        COPYIO_HEADER()
-       COPYIO_VALIDATE(copyin)
+       COPYIO_VALIDATE copyin, 2
        COPYIO_TRY_KERNEL()
        COPYIO_SET_RECOVER()
        COPYIO_MAP_USER()
@@ -803,7 +809,7 @@ LEXT(copyin)
        .globl EXT(copyout)
 LEXT(copyout)
        COPYIO_HEADER()
-       COPYIO_VALIDATE(copyout)
+       COPYIO_VALIDATE copyout, 2
        COPYIO_TRY_KERNEL()
        COPYIO_SET_RECOVER()
        COPYIO_MAP_USER()
@@ -814,34 +820,96 @@ LEXT(copyout)
 
 
 /*
- *  int copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes)
+ *  int copyin_atomic32(const user_addr_t user_addr, uint32_t *kernel_addr)
+ *    r0: user_addr
+ *    r1: kernel_addr
+ */
+       .text
+       .align 2
+       .globl EXT(copyin_atomic32)
+LEXT(copyin_atomic32)
+       tst             r0, #3                  // Test alignment of user address
+       bne             2f
+
+       mov             r2, #4
+       COPYIO_VALIDATE copyin_user, 1
+       COPYIO_SET_RECOVER()
+       COPYIO_MAP_USER()
+
+       ldr             r2, [r0]                // Load word from user
+       str             r2, [r1]                // Store to kernel_addr
+       mov             r0, #0                  // Success
+
+       COPYIO_UNMAP_USER()
+       COPYIO_RESTORE_RECOVER()
+       bx              lr
+2:     // misaligned copyin
+       mov             r0, #EINVAL
+       bx              lr
+
+/*
+ *  int copyin_atomic32_wait_if_equals(const char *src, uint32_t value)
+ *    r0: user_addr
+ *    r1: value
+ */
+       .text
+       .align 2
+       .globl EXT(copyin_atomic32_wait_if_equals)
+LEXT(copyin_atomic32_wait_if_equals)
+       tst             r0, #3                  // Test alignment of user address
+       bne             2f
+
+       mov             r2, r0
+       mov             r3, #4
+       COPYIO_VALIDATE copyio_user, 1          // validate user address (uses r2, r3)
+       COPYIO_SET_RECOVER()
+       COPYIO_MAP_USER()
+
+       ldrex           r2, [r0]
+       cmp             r2, r1
+       movne           r0, #ESTALE
+       bne             1f
+       mov             r0, #0
+       wfe
+1:
+       clrex
+
+       COPYIO_UNMAP_USER()
+       COPYIO_RESTORE_RECOVER()
+       bx              lr
+2:     // misaligned copyin
+       mov             r0, #EINVAL
+       bx              lr
+
+/*
+ *  int copyin_atomic64(const user_addr_t user_addr, uint64_t *kernel_addr)
+ *    r0: user_addr
+ *    r1: kernel_addr
  */
        .text
        .align 2
-       .globl EXT(copyin_word)
-LEXT(copyin_word)
-       cmp             r2, #4                  // Test if size is 4 or 8
-       cmpne           r2, #8
-       bne             L_copyin_invalid
-       sub             r3, r2, #1
-       tst             r0, r3                  // Test alignment of user address
-       bne             L_copyin_invalid
-
-       COPYIO_VALIDATE(copyin)
+       .globl EXT(copyin_atomic64)
+LEXT(copyin_atomic64)
+       tst             r0, #7                  // Test alignment of user address
+       bne             2f
+
+       mov             r2, #8
+       COPYIO_VALIDATE copyin_user, 1
        COPYIO_SET_RECOVER()
        COPYIO_MAP_USER()
 
-       mov             r3, #0                  // Clear high register
-       cmp             r2, #4                  // If size is 4
-       ldreq           r2, [r0]                //      Load word from user
-       ldrdne          r2, r3, [r0]            // Else Load double word from user
+1:     // ldrex/strex retry loop
+       ldrexd          r2, r3, [r0]            // Load double word from user
+       strexd          r5, r2, r3, [r0]        // (the COPYIO_*() macros make r5 safe to use as a scratch register here)
+       cmp             r5, #0
+       bne             1b
        stm             r1, {r2, r3}            // Store to kernel_addr
        mov             r0, #0                  // Success
 
        COPYIO_UNMAP_USER()
        COPYIO_RESTORE_RECOVER()
        bx              lr
-L_copyin_invalid:
+2:     // misaligned copyin
        mov             r0, #EINVAL
        bx              lr
 
@@ -853,6 +921,69 @@ copyio_error:
        ldmfd           sp!, { r4, r5, r6 }
        bx              lr
 
+
+/*
+ *  int copyout_atomic32(uint32_t value, user_addr_t user_addr)
+ *    r0: value
+ *    r1: user_addr
+ */
+       .text
+       .align 2
+       .globl EXT(copyout_atomic32)
+LEXT(copyout_atomic32)
+       tst             r1, #3                  // Test alignment of user address
+       bne             2f
+
+       mov             r2, r1
+       mov             r3, #4
+       COPYIO_VALIDATE copyio_user, 1          // validate user address (uses r2, r3)
+       COPYIO_SET_RECOVER()
+       COPYIO_MAP_USER()
+
+       str             r0, [r1]                // Store word to user
+       mov             r0, #0                  // Success
+
+       COPYIO_UNMAP_USER()
+       COPYIO_RESTORE_RECOVER()
+       bx              lr
+2:     // misaligned copyout
+       mov             r0, #EINVAL
+       bx              lr
+
+
+/*
+ *  int copyout_atomic64(uint64_t value, user_addr_t user_addr)
+ *    r0, r1: value
+ *    r2: user_addr
+ */
+       .text
+       .align 2
+       .globl EXT(copyout_atomic64)
+LEXT(copyout_atomic64)
+       tst             r2, #7                  // Test alignment of user address
+       bne             2f
+
+       mov             r3, #8
+       COPYIO_VALIDATE copyio_user, 2          // validate user address (uses r2, r3)
+       COPYIO_SET_RECOVER()
+       COPYIO_MAP_USER()
+
+1:     // ldrex/strex retry loop
+       ldrexd          r4, r5, [r2]
+       strexd          r3, r0, r1, [r2]        // Atomically store double word to user
+       cmp             r3, #0
+       bne             1b
+
+       mov             r0, #0                  // Success
+
+       COPYIO_UNMAP_USER()
+       COPYIO_RESTORE_RECOVER()
+       bx              lr
+2:     // misaligned copyout
+       mov             r0, #EINVAL
+       bx              lr
+
+
 /*
  * int copyin_kern(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes)
  */
index 2cb596872c68aa5c824a1a9f784a6fc360424e9c..02f73391024b7d2a63fa180ba0e69866b866491d 100644 (file)
@@ -40,6 +40,7 @@
 #include <kern/thread_group.h>
 #include <kern/policy_internal.h>
 #include <machine/config.h>
+#include <machine/atomic.h>
 #include <pexpert/pexpert.h>
 
 #if MONOTONIC
@@ -262,13 +263,13 @@ perfcontrol_callout_counters_end(uint64_t *start_counters,
 {
        uint64_t end_counters[MT_CORE_NFIXED];
        mt_fixed_counts(end_counters);
-       atomic_fetch_add_explicit(&perfcontrol_callout_stats[type][PERFCONTROL_STAT_CYCLES],
-           end_counters[MT_CORE_CYCLES] - start_counters[MT_CORE_CYCLES], memory_order_relaxed);
+       os_atomic_add(&perfcontrol_callout_stats[type][PERFCONTROL_STAT_CYCLES],
+           end_counters[MT_CORE_CYCLES] - start_counters[MT_CORE_CYCLES], relaxed);
 #ifdef MT_CORE_INSTRS
-       atomic_fetch_add_explicit(&perfcontrol_callout_stats[type][PERFCONTROL_STAT_INSTRS],
-           end_counters[MT_CORE_INSTRS] - start_counters[MT_CORE_INSTRS], memory_order_relaxed);
+       os_atomic_add(&perfcontrol_callout_stats[type][PERFCONTROL_STAT_INSTRS],
+           end_counters[MT_CORE_INSTRS] - start_counters[MT_CORE_INSTRS], relaxed);
 #endif /* defined(MT_CORE_INSTRS) */
-       atomic_fetch_add_explicit(&perfcontrol_callout_count[type], 1, memory_order_relaxed);
+       os_atomic_inc(&perfcontrol_callout_count[type], relaxed);
 }
 #endif /* MONOTONIC */
 
@@ -279,7 +280,8 @@ perfcontrol_callout_stat_avg(perfcontrol_callout_type_t type,
        if (!perfcontrol_callout_stats_enabled) {
                return 0;
        }
-       return perfcontrol_callout_stats[type][stat] / perfcontrol_callout_count[type];
+       return os_atomic_load_wide(&perfcontrol_callout_stats[type][stat], relaxed) /
+              os_atomic_load_wide(&perfcontrol_callout_count[type], relaxed);
 }
 
 void
@@ -480,13 +482,16 @@ machine_perfcontrol_deadline_passed(uint64_t deadline)
 /*
  * ml_spin_debug_reset()
  * Reset the timestamp on a thread that has been unscheduled
- * to avoid false alarms.    Alarm will go off if interrupts are held
+ * to avoid false alarms. Alarm will go off if interrupts are held
  * disabled for too long, starting from now.
+ *
+ * Call ml_get_timebase() directly to avoid the extra overhead that
+ * mach_absolute_time() incurs on newer platforms in DEVELOPMENT kernel
+ * configurations.
  */
 void
 ml_spin_debug_reset(thread_t thread)
 {
-       thread->machine.intmask_timestamp = mach_absolute_time();
+       thread->machine.intmask_timestamp = ml_get_timebase();
 }
 
 /*
@@ -519,7 +524,7 @@ ml_check_interrupts_disabled_duration(thread_t thread)
 
        start = thread->machine.intmask_timestamp;
        if (start != 0) {
-               now = mach_absolute_time();
+               now = ml_get_timebase();
 
                if ((now - start) > interrupt_masked_timeout * debug_cpu_performance_degradation_factor) {
                        mach_timebase_info_data_t timebase;
@@ -554,6 +559,7 @@ ml_set_interrupts_enabled(boolean_t enable)
        state = __builtin_arm_rsr("DAIF");
 #endif
        if (enable && (state & INTERRUPT_MASK)) {
+               assert(getCpuDatap()->cpu_int_state == NULL); // Make sure we're not enabling interrupts from primary interrupt context
 #if INTERRUPT_MASKED_DEBUG
                if (interrupt_masked_debug) {
                        // Interrupts are currently masked, we will enable them (after finishing this check)
@@ -588,7 +594,7 @@ ml_set_interrupts_enabled(boolean_t enable)
 #if INTERRUPT_MASKED_DEBUG
                if (interrupt_masked_debug) {
                        // Interrupts were enabled, we just masked them
-                       current_thread()->machine.intmask_timestamp = mach_absolute_time();
+                       current_thread()->machine.intmask_timestamp = ml_get_timebase();
                }
 #endif
        }
@@ -690,6 +696,11 @@ ml_get_time_since_reset(void)
        return ml_get_hwclock();
 }
 
+void
+ml_set_reset_time(__unused uint64_t wake_time)
+{
+}
+
 uint64_t
 ml_get_conttime_wake_time(void)
 {
diff --git a/osfmk/arm/memory_types.h b/osfmk/arm/memory_types.h
new file mode 100644 (file)
index 0000000..59458b6
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _ARM_MEMORY_TYPES_H_
+#define _ARM_MEMORY_TYPES_H_
+
+#include <machine/config.h>
+
+/*
+ * WIMG control
+ */
+#define VM_MEM_INNER                      0x10
+#define VM_MEM_RT                         0x10 // intentionally aliases VM_MEM_INNER; used with mutually exclusive caching policies
+#define VM_MEM_EARLY_ACK                  0x20
+
+#define VM_WIMG_DEFAULT                   (VM_MEM_COHERENT) // 0x2
+#define VM_WIMG_COPYBACK                  (VM_MEM_COHERENT) // 0x2
+#define VM_WIMG_INNERWBACK                (VM_MEM_COHERENT | VM_MEM_INNER) // 0x12
+#define VM_WIMG_IO                        (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED) // 0x7
+#define VM_WIMG_POSTED                    (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED | VM_MEM_EARLY_ACK) // 0x27
+#define VM_WIMG_WTHRU                     (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED) // 0xb
+#define VM_WIMG_WCOMB                     (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) // 0x6
+#define VM_WIMG_RT                        (VM_WIMG_IO | VM_MEM_RT) // 0x17
+#define VM_WIMG_POSTED_REORDERED          (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT | VM_MEM_WRITE_THROUGH | VM_MEM_EARLY_ACK) // 0x2e
+#define VM_WIMG_POSTED_COMBINED_REORDERED (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT | VM_MEM_EARLY_ACK) // 0x26
+
+#endif /* _ARM_MEMORY_TYPES_H_ */
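
The composite values can be sanity-checked against the numeric comments; a small host-side check, assuming the base VM_MEM_* flag values implied by those comments (the base flags themselves are defined elsewhere in the VM headers):

    #include <assert.h>

    /* Base flag values inferred from the numeric comments above. */
    #define VM_MEM_GUARDED        0x1
    #define VM_MEM_COHERENT       0x2
    #define VM_MEM_NOT_CACHEABLE  0x4
    #define VM_MEM_WRITE_THROUGH  0x8
    #define VM_MEM_INNER          0x10
    #define VM_MEM_EARLY_ACK      0x20

    int
    main(void)
    {
        assert((VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED) == 0x7);  /* VM_WIMG_IO */
        assert((VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED) == 0xb);  /* VM_WIMG_WTHRU */
        assert((VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED |
                VM_MEM_EARLY_ACK) == 0x27);                                        /* VM_WIMG_POSTED */
        return 0;
    }
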
index ca995fb37b26f648646ebc0d6e17c603f81bf858..3cd4964bc1c22d5e376c1c9e7d0896552f2a76c7 100644 (file)
@@ -44,24 +44,23 @@ extern void arm_vm_init(uint64_t memory_size, boot_args *args);
 extern void arm_vm_prot_init(boot_args *args);
 extern void arm_vm_prot_finalize(boot_args *args);
 
-
 extern kern_return_t DebuggerXCallEnter(boolean_t);
 extern void DebuggerXCallReturn(void);
 
 #if __arm64__ && DEBUG
 extern void dump_kva_space(void);
-#endif
+#endif /* __arm64__ && DEBUG */
 
 extern void Load_context(thread_t);
 extern void Idle_load_context(void) __attribute__((noreturn));
 extern thread_t Switch_context(thread_t, thread_continue_t, thread_t);
 extern thread_t Shutdown_context(void (*doshutdown)(processor_t), processor_t  processor);
-extern void Call_continuation(thread_continue_t, void *, wait_result_t, boolean_t enable_interrupts);
+extern void __dead2 Call_continuation(thread_continue_t, void *, wait_result_t, boolean_t enable_interrupts);
+
 
 extern void DebuggerCall(unsigned int reason, void *ctx);
 extern void DebuggerXCall(void *ctx);
 
-extern int _copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t max, vm_size_t *actual);
 extern int copyout_kern(const char *kernel_addr, user_addr_t user_addr, vm_size_t nbytes);
 extern int copyin_kern(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes);
 
@@ -85,12 +84,12 @@ extern int copyio_check_user_addr(user_addr_t user_addr, vm_size_t nbytes);
 
 /* Top-Byte-Ignore */
 extern boolean_t user_tbi;
-#define TBI_MASK                0xff00000000000000
-#define user_tbi_enabled()      (user_tbi)
-#define tbi_clear(addr)         ((addr) & ~(TBI_MASK))
+#define TBI_MASK           0xff00000000000000
+#define user_tbi_enabled() (user_tbi)
+#define tbi_clear(addr)    ((addr) & ~(TBI_MASK))
 
-#else
+#else /* !defined(__arm__) && !defined(__arm64__) */
 #error Unknown architecture.
-#endif
+#endif /* defined(__arm__) */
 
 #endif /* _ARM_MISC_PROTOS_H_ */
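The TBI macros above implement ARMv8 Top-Byte-Ignore: hardware ignores bits 63:56 of a load/store address, so userspace may carry a tag there, and the kernel strips it before validating the address. A standalone sketch of the same arithmetic (illustrative, not from the diff):

    #include <stdint.h>
    #include <stdio.h>

    #define TBI_MASK        0xff00000000000000ULL
    #define tbi_clear(addr) ((addr) & ~(TBI_MASK))

    int
    main(void)
    {
            uint64_t tagged = 0x3a00000102030405ULL; /* tag 0x3a in bits 63:56 */
            /* prints 0x102030405: the tag is gone, the address is unchanged */
            printf("0x%llx\n", (unsigned long long)tbi_clear(tagged));
            return 0;
    }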
index f178db28edfa5eaae6594cf335559da92714a7cd..42d753130b6a04e926974a95c3ecd86dd2c819f3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -40,6 +40,9 @@
 #include <pexpert/boot.h>
 #include <pexpert/pexpert.h>
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif
 
 #include <kern/misc_protos.h>
 #include <kern/startup.h>
@@ -135,7 +138,7 @@ extern uint64_t         last_hwaccess_thread;
 extern char  gTargetTypeBuffer[8];
 extern char  gModelTypeBuffer[32];
 
-decl_simple_lock_data(extern, clock_lock)
+decl_simple_lock_data(extern, clock_lock);
 extern struct timeval    gIOLastSleepTime;
 extern struct timeval    gIOLastWakeTime;
 extern boolean_t                 is_clock_configured;
@@ -262,6 +265,10 @@ print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker,
                if (ppn != (ppnum_t)NULL) {
                        if (is_64_bit) {
                                lr = ml_phys_read_double_64(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET64) & PAGE_MASK));
+#if defined(HAS_APPLE_PAC)
+                               /* return addresses on stack will be signed by arm64e ABI */
+                               lr = (addr64_t) ptrauth_strip((void *)lr, ptrauth_key_return_address);
+#endif
                        } else {
                                lr = ml_phys_read_word(((((vm_offset_t)ppn) << PAGE_SHIFT)) | ((fp + FP_LR_OFFSET) & PAGE_MASK));
                        }
@@ -309,8 +316,7 @@ print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker,
 extern void panic_print_vnodes(void);
 
 static void
-do_print_all_backtraces(
-       const char      *message)
+do_print_all_backtraces(const char *message, uint64_t panic_options)
 {
        int             logversion = PANICLOG_VERSION;
        thread_t        cur_thread = current_thread();
@@ -337,7 +343,7 @@ do_print_all_backtraces(
        }
        panic_bt_depth++;
 
-       /* Truncate panic string to 1200 bytes -- WDT log can be ~1100 bytes */
+       /* Truncate panic string to 1200 bytes */
        paniclog_append_noflush("Debugger message: %.1200s\n", message);
        if (debug_enabled) {
                paniclog_append_noflush("Device: %s\n",
@@ -437,8 +443,8 @@ do_print_all_backtraces(
        }
 #endif
 
-       // Just print threads with high CPU usage for WDT timeouts
-       if (strncmp(message, "WDT timeout", 11) == 0) {
+       // Highlight threads that used high amounts of CPU in the panic log if requested (historically requested for watchdog panics)
+       if (panic_options & DEBUGGER_OPTION_PRINT_CPU_USAGE_PANICLOG) {
                thread_t        top_runnable[5] = {0};
                thread_t        thread;
                int                     total_cpu_usage = 0;
@@ -483,7 +489,7 @@ do_print_all_backtraces(
                        }
                } // Loop through highest priority runnable threads
                paniclog_append_noflush("\n");
-       } // Check if message is "WDT timeout"
+       }
 
        // print current task info
        if (VALIDATE_PTR_LIST(cur_thread, cur_thread->task)) {
@@ -557,7 +563,7 @@ do_print_all_backtraces(
                        kdp_snapshot_preflight(-1, stackshot_begin_loc, bytes_remaining - end_marker_bytes,
                            (STACKSHOT_GET_GLOBAL_MEM_STATS | STACKSHOT_SAVE_LOADINFO | STACKSHOT_KCDATA_FORMAT |
                            STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_FROM_PANIC |
-                           STACKSHOT_NO_IO_STATS | STACKSHOT_THREAD_WAITINFO), &kc_panic_data, 0);
+                           STACKSHOT_NO_IO_STATS | STACKSHOT_THREAD_WAITINFO | STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT), &kc_panic_data, 0);
                        err = do_stackshot(NULL);
                        bytes_traced = kdp_stack_snapshot_bytes_traced();
                        if (bytes_traced > 0 && !err) {
@@ -605,7 +611,7 @@ do_print_all_backtraces(
  * Entry to print_all_backtraces is serialized by the debugger lock
  */
 static void
-print_all_backtraces(const char *message)
+print_all_backtraces(const char *message, uint64_t panic_options)
 {
        unsigned int initial_not_in_kdp = not_in_kdp;
 
@@ -620,7 +626,7 @@ print_all_backtraces(const char *message)
         * not_in_kdp.
         */
        not_in_kdp = 0;
-       do_print_all_backtraces(message);
+       do_print_all_backtraces(message, panic_options);
 
        not_in_kdp = initial_not_in_kdp;
 
@@ -663,10 +669,20 @@ panic_print_symbol_name(vm_address_t search)
 
 void
 SavePanicInfo(
-       const char *message, __unused void *panic_data, __unused uint64_t panic_options)
+       const char *message, __unused void *panic_data, uint64_t panic_options)
 {
-       /* This should be initialized by the time we get here */
-       assert(panic_info->eph_panic_log_offset != 0);
+       /*
+        * This should be initialized by the time we get here, but
+        * if it is not, asserting about it will be of no use (it will
+        * come right back to here), so just loop right here and now.
+        * This prevents early-boot panics from becoming recursive and
+        * thus makes them easier to debug. If you attached to a device
+        * and see your PC here, look down a few frames to see your
+        * early-boot panic there.
+        */
+       while (!panic_info || panic_info->eph_panic_log_offset == 0) {
+               ;
+       }
 
        if (panic_options & DEBUGGER_OPTION_PANICLOGANDREBOOT) {
                panic_info->eph_panic_flags  |= EMBEDDED_PANIC_HEADER_FLAG_BUTTON_RESET_PANIC;
@@ -699,7 +715,7 @@ SavePanicInfo(
 
        PanicInfoSaved = TRUE;
 
-       print_all_backtraces(message);
+       print_all_backtraces(message, panic_options);
 
        assert(panic_info->eph_panic_log_len != 0);
        panic_info->eph_other_log_len = PE_get_offset_into_panic_region(debug_buf_ptr) - panic_info->eph_other_log_offset;
@@ -744,6 +760,20 @@ paniclog_flush()
        PE_sync_panic_buffers();
 }
 
+/*
+ * @function _was_in_userspace
+ *
+ * @abstract Placeholder function whose address is used to mark a CPU that
+ * was in userspace before it was IPI'd to enter the Debugger context.
+ *
+ * @discussion This function should never actually be called.
+ */
+static void __attribute__((__noreturn__))
+_was_in_userspace(void)
+{
+       panic("%s: should not have been invoked.", __FUNCTION__);
+}
+
 /*
  * @function DebuggerXCallEnter
  *
@@ -814,7 +844,7 @@ DebuggerXCallEnter(
                        }
 
                        if (KERN_SUCCESS == cpu_signal(target_cpu_datap, SIGPdebug, (void *)NULL, NULL)) {
-                               (void)hw_atomic_add(&debugger_sync, 1);
+                               os_atomic_inc(&debugger_sync, relaxed);
                        } else {
                                cpu_signal_failed = true;
                                kprintf("cpu_signal failed in DebuggerXCallEnter\n");
@@ -951,16 +981,16 @@ DebuggerXCall(
 
        if (save_context) {
                /* Save the interrupted context before acknowledging the signal */
-               *state = *regs;
+               copy_signed_thread_state(state, regs);
        } else if (regs) {
                /* zero old state so machine_trace_thread knows not to backtrace it */
                set_saved_state_fp(state, 0);
-               set_saved_state_pc(state, 0);
+               set_saved_state_pc(state, (register_t)&_was_in_userspace);
                set_saved_state_lr(state, 0);
                set_saved_state_sp(state, 0);
        }
 
-       (void)hw_atomic_sub(&debugger_sync, 1);
+       os_atomic_dec(&debugger_sync, relaxed);
        __builtin_arm_dmb(DMB_ISH);
        while (mp_kdp_trap) {
                ;
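On arm64e, the saved LR values walked by print_one_backtrace() carry a pointer-authentication signature in their upper bits, which is why the hunk above strips them with ptrauth_strip before treating them as code addresses. A portable sketch of that pattern (the shim macro is illustrative; ptrauth_strip and ptrauth_key_return_address are the real <ptrauth.h> interfaces):

    #include <stdint.h>

    #ifndef __has_feature
    #define __has_feature(x) 0
    #endif

    #if __has_feature(ptrauth_calls)
    #include <ptrauth.h>
    #define strip_return_address(p) ptrauth_strip((p), ptrauth_key_return_address)
    #else
    #define strip_return_address(p) (p) /* no PAC bits on non-arm64e targets */
    #endif

    static uint64_t
    backtrace_pc(uint64_t saved_lr)
    {
            /* Remove the signature bits so the value can be symbolicated. */
            return (uint64_t)strip_return_address((void *)(uintptr_t)saved_lr);
    }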
index 8ed24d4eb939d0ad658beef9120c83751201c707..b5825672b9038dc52b03f649c0994b0712cb9444 100644 (file)
@@ -27,6 +27,7 @@
  */
 
 #include <arm/monotonic.h>
+#include <kern/monotonic.h>
 #include <sys/errno.h>
 #include <sys/monotonic.h>
 
@@ -43,6 +44,12 @@ mt_core_snap(__unused unsigned int ctr)
        return 0;
 }
 
+uint64_t
+mt_count_pmis(void)
+{
+       return 0;
+}
+
 struct mt_cpu *
 mt_cur_cpu(void)
 {
index c9056284e3ac4aadf5b4c83032e4891953c1542f..76431251c25d21e789188b07b6a4b8083e21cb47 100644 (file)
@@ -29,6 +29,7 @@
 #define _ARM_PAL_ROUTINES_H
 
 #include <stdint.h>
+#include <string.h>
 
 #if defined(__cplusplus)
 extern "C" {
@@ -58,7 +59,7 @@ static inline void
 pal_get_resource_property(const char **property_name,
     int *property_value)
 {
-       *property_name = 0;
+       *property_name = NULL;
        (void) property_value;
 }
 
index 2ec9f9dcbca8172178306e8a33e2f94f7bb169d4..c03e518b63e191038e290e64d26e1dde85abdcc7 100644 (file)
@@ -115,6 +115,12 @@ machine_switch_context(
        return retval;
 }
 
+boolean_t
+machine_thread_on_core(thread_t thread)
+{
+       return thread->machine.machine_thread_flags & MACHINE_THREAD_FLAGS_ON_CPU;
+}
+
 /*
  * Routine:    machine_thread_create
  *
@@ -143,7 +149,7 @@ machine_thread_create(
        struct pmap *new_pmap = vm_map_pmap(task->map);
 
        thread->machine.kptw_ttb = ((unsigned int) kernel_pmap->ttep) | TTBR_SETUP;
-       thread->machine.asid = new_pmap->asid;
+       thread->machine.asid = new_pmap->hw_asid;
        if (new_pmap->tte_index_max == NTTES) {
                thread->machine.uptw_ttc = 2;
                thread->machine.uptw_ttb = ((unsigned int) new_pmap->ttep) | TTBR_SETUP;
index 8f33cff284001afa2e5fe7159860ef55434d0152..93921c0eb467ba6b577c7fdde4ef506686aa649a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2011-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <mach/machine/vm_types.h>
 
 #include <mach/boolean.h>
+#include <kern/bits.h>
 #include <kern/thread.h>
 #include <kern/sched.h>
 #include <kern/zalloc.h>
 #include <kern/kalloc.h>
 #include <kern/ledger.h>
-#include <kern/misc_protos.h>
 #include <kern/spl.h>
-#include <kern/xpr.h>
 #include <kern/trustcache.h>
 
 #include <os/overflow.h>
 #include <san/kasan.h>
 #include <sys/cdefs.h>
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif
+
+#define PMAP_TT_L0_LEVEL        0x0
+#define PMAP_TT_L1_LEVEL        0x1
+#define PMAP_TT_L2_LEVEL        0x2
+#define PMAP_TT_L3_LEVEL        0x3
+#if (__ARM_VMSA__ == 7)
+#define PMAP_TT_MAX_LEVEL       PMAP_TT_L2_LEVEL
+#else
+#define PMAP_TT_MAX_LEVEL       PMAP_TT_L3_LEVEL
+#endif
+#define PMAP_TT_LEAF_LEVEL      PMAP_TT_MAX_LEVEL
+#define PMAP_TT_TWIG_LEVEL      (PMAP_TT_MAX_LEVEL - 1)
+
+static bool alloc_asid(pmap_t pmap);
+static void free_asid(pmap_t pmap);
+static void flush_mmu_tlb_region_asid_async(vm_offset_t va, unsigned length, pmap_t pmap);
+static void flush_mmu_tlb_tte_asid_async(vm_offset_t va, pmap_t pmap);
+static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
+static pt_entry_t wimg_to_pte(unsigned int wimg);
+
+struct page_table_ops {
+       bool (*alloc_id)(pmap_t pmap);
+       void (*free_id)(pmap_t pmap);
+       void (*flush_tlb_region_async)(vm_offset_t va, unsigned length, pmap_t pmap);
+       void (*flush_tlb_tte_async)(vm_offset_t va, pmap_t pmap);
+       void (*flush_tlb_async)(pmap_t pmap);
+       pt_entry_t (*wimg_to_pte)(unsigned int wimg);
+};
+
+static const struct page_table_ops native_pt_ops =
+{
+       .alloc_id = alloc_asid,
+       .free_id = free_asid,
+       .flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
+       .flush_tlb_tte_async = flush_mmu_tlb_tte_asid_async,
+       .flush_tlb_async = flush_mmu_tlb_full_asid_async,
+       .wimg_to_pte = wimg_to_pte,
+};
+
+#if (__ARM_VMSA__ > 7)
+const struct page_table_level_info pmap_table_level_info_16k[] =
+{
+       [0] = {
+               .size       = ARM_16K_TT_L0_SIZE,
+               .offmask    = ARM_16K_TT_L0_OFFMASK,
+               .shift      = ARM_16K_TT_L0_SHIFT,
+               .index_mask = ARM_16K_TT_L0_INDEX_MASK,
+               .valid_mask = ARM_TTE_VALID,
+               .type_mask  = ARM_TTE_TYPE_MASK,
+               .type_block = ARM_TTE_TYPE_BLOCK
+       },
+       [1] = {
+               .size       = ARM_16K_TT_L1_SIZE,
+               .offmask    = ARM_16K_TT_L1_OFFMASK,
+               .shift      = ARM_16K_TT_L1_SHIFT,
+               .index_mask = ARM_16K_TT_L1_INDEX_MASK,
+               .valid_mask = ARM_TTE_VALID,
+               .type_mask  = ARM_TTE_TYPE_MASK,
+               .type_block = ARM_TTE_TYPE_BLOCK
+       },
+       [2] = {
+               .size       = ARM_16K_TT_L2_SIZE,
+               .offmask    = ARM_16K_TT_L2_OFFMASK,
+               .shift      = ARM_16K_TT_L2_SHIFT,
+               .index_mask = ARM_16K_TT_L2_INDEX_MASK,
+               .valid_mask = ARM_TTE_VALID,
+               .type_mask  = ARM_TTE_TYPE_MASK,
+               .type_block = ARM_TTE_TYPE_BLOCK
+       },
+       [3] = {
+               .size       = ARM_16K_TT_L3_SIZE,
+               .offmask    = ARM_16K_TT_L3_OFFMASK,
+               .shift      = ARM_16K_TT_L3_SHIFT,
+               .index_mask = ARM_16K_TT_L3_INDEX_MASK,
+               .valid_mask = ARM_PTE_TYPE_VALID,
+               .type_mask  = ARM_PTE_TYPE_MASK,
+               .type_block = ARM_TTE_TYPE_L3BLOCK
+       }
+};
+
+const struct page_table_level_info pmap_table_level_info_4k[] =
+{
+       [0] = {
+               .size       = ARM_4K_TT_L0_SIZE,
+               .offmask    = ARM_4K_TT_L0_OFFMASK,
+               .shift      = ARM_4K_TT_L0_SHIFT,
+               .index_mask = ARM_4K_TT_L0_INDEX_MASK,
+               .valid_mask = ARM_TTE_VALID,
+               .type_mask  = ARM_TTE_TYPE_MASK,
+               .type_block = ARM_TTE_TYPE_BLOCK
+       },
+       [1] = {
+               .size       = ARM_4K_TT_L1_SIZE,
+               .offmask    = ARM_4K_TT_L1_OFFMASK,
+               .shift      = ARM_4K_TT_L1_SHIFT,
+               .index_mask = ARM_4K_TT_L1_INDEX_MASK,
+               .valid_mask = ARM_TTE_VALID,
+               .type_mask  = ARM_TTE_TYPE_MASK,
+               .type_block = ARM_TTE_TYPE_BLOCK
+       },
+       [2] = {
+               .size       = ARM_4K_TT_L2_SIZE,
+               .offmask    = ARM_4K_TT_L2_OFFMASK,
+               .shift      = ARM_4K_TT_L2_SHIFT,
+               .index_mask = ARM_4K_TT_L2_INDEX_MASK,
+               .valid_mask = ARM_TTE_VALID,
+               .type_mask  = ARM_TTE_TYPE_MASK,
+               .type_block = ARM_TTE_TYPE_BLOCK
+       },
+       [3] = {
+               .size       = ARM_4K_TT_L3_SIZE,
+               .offmask    = ARM_4K_TT_L3_OFFMASK,
+               .shift      = ARM_4K_TT_L3_SHIFT,
+               .index_mask = ARM_4K_TT_L3_INDEX_MASK,
+               .valid_mask = ARM_PTE_TYPE_VALID,
+               .type_mask  = ARM_PTE_TYPE_MASK,
+               .type_block = ARM_TTE_TYPE_L3BLOCK
+       }
+};
+
+struct page_table_attr {
+       const struct page_table_level_info * const pta_level_info;
+       const struct page_table_ops * const pta_ops;
+       const uintptr_t ap_ro;
+       const uintptr_t ap_rw;
+       const uintptr_t ap_rona;
+       const uintptr_t ap_rwna;
+       const uintptr_t ap_xn;
+       const uintptr_t ap_x;
+       const unsigned int pta_root_level;
+       const unsigned int pta_max_level;
+};
+
+const struct page_table_attr pmap_pt_attr_4k = {
+       .pta_level_info = pmap_table_level_info_4k,
+       .pta_root_level = PMAP_TT_L1_LEVEL,
+       .pta_max_level  = PMAP_TT_L3_LEVEL,
+       .pta_ops = &native_pt_ops,
+       .ap_ro = ARM_PTE_AP(AP_RORO),
+       .ap_rw = ARM_PTE_AP(AP_RWRW),
+       .ap_rona = ARM_PTE_AP(AP_RONA),
+       .ap_rwna = ARM_PTE_AP(AP_RWNA),
+       .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
+       .ap_x = ARM_PTE_PNX,
+};
+
+const struct page_table_attr pmap_pt_attr_16k = {
+       .pta_level_info = pmap_table_level_info_16k,
+       .pta_root_level = PMAP_TT_L1_LEVEL,
+       .pta_max_level  = PMAP_TT_L3_LEVEL,
+       .pta_ops = &native_pt_ops,
+       .ap_ro = ARM_PTE_AP(AP_RORO),
+       .ap_rw = ARM_PTE_AP(AP_RWRW),
+       .ap_rona = ARM_PTE_AP(AP_RONA),
+       .ap_rwna = ARM_PTE_AP(AP_RWNA),
+       .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
+       .ap_x = ARM_PTE_PNX,
+};
+
+#if __ARM_16K_PG__
+const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
+#else /* !__ARM_16K_PG__ */
+const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
+#endif /* !__ARM_16K_PG__ */
+
+
+#else /* (__ARM_VMSA__ > 7) */
+/*
+ * We don't support pmap parameterization for VMSA7, so use an opaque
+ * page_table_attr structure.
+ */
+const struct page_table_attr * const native_pt_attr = NULL;
+#endif /* (__ARM_VMSA__ > 7) */
+
+typedef struct page_table_attr pt_attr_t;
+
+/* Macro for getting pmap attributes; not a function for const propagation. */
+#if ARM_PARAMETERIZED_PMAP
+/* The page table attributes are linked to the pmap */
+#define pmap_get_pt_attr(pmap) ((pmap)->pmap_pt_attr)
+#define pmap_get_pt_ops(pmap) ((pmap)->pmap_pt_attr->pta_ops)
+#else /* !ARM_PARAMETERIZED_PMAP */
+/* The page table attributes are fixed (to allow for const propagation) */
+#define pmap_get_pt_attr(pmap) (native_pt_attr)
+#define pmap_get_pt_ops(pmap) (&native_pt_ops)
+#endif /* !ARM_PARAMETERIZED_PMAP */
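The split above is a classic devirtualization trick: when the pmap is not parameterized, pmap_get_pt_ops() expands to the address of a single const ops table, so the compiler can resolve the function pointers at compile time and inline the TLB/ASID helpers. A toy illustration of the pattern (all names here are invented for the sketch):

    struct toy_ops {
            void (*flush)(void);
    };

    static void toy_flush(void) { /* ... */ }

    static const struct toy_ops toy_native_ops = { .flush = toy_flush };

    struct toy_pmap { const struct toy_ops *ops; };

    #if TOY_PARAMETERIZED
    #define toy_get_ops(p) ((p)->ops)        /* indirect: loaded at run time */
    #else
    #define toy_get_ops(p) (&toy_native_ops) /* const: calls can be inlined  */
    #endif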
+
+#if (__ARM_VMSA__ > 7)
+static inline uint64_t
+pt_attr_ln_size(const pt_attr_t * const pt_attr, unsigned int level)
+{
+       return pt_attr->pta_level_info[level].size;
+}
+
+__unused static inline uint64_t
+pt_attr_ln_shift(const pt_attr_t * const pt_attr, unsigned int level)
+{
+       return pt_attr->pta_level_info[level].shift;
+}
+
+__unused static inline uint64_t
+pt_attr_ln_offmask(const pt_attr_t * const pt_attr, unsigned int level)
+{
+       return pt_attr->pta_level_info[level].offmask;
+}
+
+static inline unsigned int
+pt_attr_twig_level(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->pta_max_level - 1;
+}
+
+static inline unsigned int
+pt_attr_root_level(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->pta_root_level;
+}
+
+static __unused inline uint64_t
+pt_attr_leaf_size(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->pta_level_info[pt_attr->pta_max_level].size;
+}
+
+static __unused inline uint64_t
+pt_attr_leaf_offmask(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->pta_level_info[pt_attr->pta_max_level].offmask;
+}
+
+static inline uint64_t
+pt_attr_leaf_shift(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->pta_level_info[pt_attr->pta_max_level].shift;
+}
+
+static __unused inline uint64_t
+pt_attr_leaf_index_mask(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->pta_level_info[pt_attr->pta_max_level].index_mask;
+}
+
+static inline uint64_t
+pt_attr_twig_size(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->pta_level_info[pt_attr->pta_max_level - 1].size;
+}
+
+static inline uint64_t
+pt_attr_twig_offmask(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->pta_level_info[pt_attr->pta_max_level - 1].offmask;
+}
+
+static inline uint64_t
+pt_attr_twig_shift(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->pta_level_info[pt_attr->pta_max_level - 1].shift;
+}
+
+static __unused inline uint64_t
+pt_attr_twig_index_mask(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->pta_level_info[pt_attr->pta_max_level - 1].index_mask;
+}
+
+static inline uint64_t
+pt_attr_leaf_table_size(const pt_attr_t * const pt_attr)
+{
+       return pt_attr_twig_size(pt_attr);
+}
+
+static inline uint64_t
+pt_attr_leaf_table_offmask(const pt_attr_t * const pt_attr)
+{
+       return pt_attr_twig_offmask(pt_attr);
+}
+
+static inline uintptr_t
+pt_attr_leaf_rw(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->ap_rw;
+}
+
+static inline uintptr_t
+pt_attr_leaf_ro(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->ap_ro;
+}
+
+static inline uintptr_t
+pt_attr_leaf_rona(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->ap_rona;
+}
+
+static inline uintptr_t
+pt_attr_leaf_rwna(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->ap_rwna;
+}
+
+static inline uintptr_t
+pt_attr_leaf_xn(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->ap_xn;
+}
+
+static inline uintptr_t
+pt_attr_leaf_x(const pt_attr_t * const pt_attr)
+{
+       return pt_attr->ap_x;
+}
+
+#else /* (__ARM_VMSA__ > 7) */
+
+static inline unsigned int
+pt_attr_twig_level(__unused const pt_attr_t * const pt_attr)
+{
+       return PMAP_TT_L1_LEVEL;
+}
+
+static inline uint64_t
+pt_attr_twig_size(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_TT_TWIG_SIZE;
+}
+
+static inline uint64_t
+pt_attr_twig_offmask(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_TT_TWIG_OFFMASK;
+}
+
+static inline uint64_t
+pt_attr_twig_shift(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_TT_TWIG_SHIFT;
+}
+
+static __unused inline uint64_t
+pt_attr_twig_index_mask(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_TT_TWIG_INDEX_MASK;
+}
+
+__unused static inline uint64_t
+pt_attr_leaf_size(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_TT_LEAF_SIZE;
+}
+
+__unused static inline uint64_t
+pt_attr_leaf_offmask(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_TT_LEAF_OFFMASK;
+}
+
+static inline uint64_t
+pt_attr_leaf_shift(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_TT_LEAF_SHIFT;
+}
+
+static __unused inline uint64_t
+pt_attr_leaf_index_mask(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_TT_LEAF_INDEX_MASK;
+}
+
+static inline uint64_t
+pt_attr_leaf_table_size(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_TT_L1_PT_SIZE;
+}
+
+static inline uint64_t
+pt_attr_leaf_table_offmask(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_TT_L1_PT_OFFMASK;
+}
+
+static inline uintptr_t
+pt_attr_leaf_rw(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_PTE_AP(AP_RWRW);
+}
+
+static inline uintptr_t
+pt_attr_leaf_ro(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_PTE_AP(AP_RORO);
+}
+
+static inline uintptr_t
+pt_attr_leaf_rona(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_PTE_AP(AP_RONA);
+}
+
+static inline uintptr_t
+pt_attr_leaf_rwna(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_PTE_AP(AP_RWNA);
+}
+
+static inline uintptr_t
+pt_attr_leaf_xn(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_PTE_NX;
+}
+
+#endif /* (__ARM_VMSA__ > 7) */
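One invariant worth calling out in the accessors above: pt_attr_leaf_table_size() returns the twig size because a fully populated leaf table maps exactly the VA span of one twig (next-to-last level) entry. With standard ARMv8 4K-granule constants (assumed here; the ARM_*_TT_* values are defined elsewhere), the arithmetic looks like this:

    #include <assert.h>
    #include <stdint.h>

    #define TOY_L3_SHIFT 12ULL /* leaf: one PTE maps 4 KiB */
    #define TOY_L2_SHIFT 21ULL /* twig: one TTE maps 2 MiB */

    int
    main(void)
    {
            uint64_t leaf_size = 1ULL << TOY_L3_SHIFT;
            uint64_t twig_size = 1ULL << TOY_L2_SHIFT;
            /* 512 8-byte PTEs fill one 4 KiB leaf table and together span
             * one twig entry, i.e. leaf_table_size == twig_size. */
            assert((twig_size / leaf_size) == 512);
            assert((twig_size / leaf_size) * sizeof(uint64_t) == 4096);
            return 0;
    }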
+
+static inline void
+pmap_sync_tlb(bool strong __unused)
+{
+       sync_tlb_flush();
+}
 
 #if MACH_ASSERT
 int vm_footprint_suspend_allowed = 1;
@@ -128,11 +549,11 @@ int panic_on_unsigned_execute = 0;
 
 /* Virtual memory region for early allocation */
 #if     (__ARM_VMSA__ == 7)
-#define VREGION1_START          (VM_HIGH_KERNEL_WINDOW & ~ARM_TT_L1_PT_OFFMASK)
+#define VREGION1_HIGH_WINDOW    (0)
 #else
 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
-#define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
 #endif
+#define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
 
 extern unsigned int not_in_kdp;
@@ -146,7 +567,7 @@ extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
 extern vm_offset_t     static_memory_end;
 
-extern int hard_maxproc;
+extern int maxproc, hard_maxproc;
 
 #if (__ARM_VMSA__ > 7)
 /* The number of address bits one TTBR can cover. */
@@ -179,14 +600,15 @@ vm_object_t     pmap_object = &pmap_object_store;
 
 static struct zone *pmap_zone;  /* zone of pmap structures */
 
-decl_simple_lock_data(, pmaps_lock MARK_AS_PMAP_DATA)
+decl_simple_lock_data(, pmaps_lock MARK_AS_PMAP_DATA);
+decl_simple_lock_data(, tt1_lock MARK_AS_PMAP_DATA);
 unsigned int    pmap_stamp MARK_AS_PMAP_DATA;
 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
 
-decl_simple_lock_data(, pt_pages_lock MARK_AS_PMAP_DATA)
+decl_simple_lock_data(, pt_pages_lock MARK_AS_PMAP_DATA);
 queue_head_t    pt_page_list MARK_AS_PMAP_DATA; /* pt page ptd entries list */
 
-decl_simple_lock_data(, pmap_pages_lock MARK_AS_PMAP_DATA)
+decl_simple_lock_data(, pmap_pages_lock MARK_AS_PMAP_DATA);
 
 typedef struct page_free_entry {
        struct page_free_entry  *next;
@@ -241,6 +663,7 @@ SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set b
 int nx_enabled = 1;                                     /* enable no-execute protection */
 int allow_data_exec  = 0;                               /* No apps may execute data */
 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
+unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
 #else /* DEVELOPMENT || DEBUG */
 const int nx_enabled = 1;                                       /* enable no-execute protection */
 const int allow_data_exec  = 0;                         /* No apps may execute data */
@@ -253,15 +676,16 @@ const int allow_stack_exec = 0;                         /* No apps may execute f
 typedef struct pv_entry {
        struct pv_entry *pve_next;              /* next alias */
        pt_entry_t      *pve_ptep;              /* page table entry */
+}
 #if __arm__ && (__BIGGEST_ALIGNMENT__ > 4)
 /* For the newer ARMv7k ABI, where 64-bit types are 64-bit aligned but
  * pointers are 32-bit: pv_entry must be 8-byte aligned because pt_desc
  * is 64-bit aligned and we often cast from pv_entry to pt_desc.
  */
-__attribute__ ((aligned(8))) pv_entry_t;
+__attribute__ ((aligned(8))) pv_entry_t;
 #else
-pv_entry_t;
+pv_entry_t;
 #endif
 
 #define PV_ENTRY_NULL   ((pv_entry_t *) 0)
@@ -298,10 +722,10 @@ SECURITY_READ_ONLY_LATE(pv_entry_t * *) pv_head_table;           /* array of pv
 
 pv_entry_t              *pv_free_list MARK_AS_PMAP_DATA;
 pv_entry_t              *pv_kern_free_list MARK_AS_PMAP_DATA;
-decl_simple_lock_data(, pv_free_list_lock MARK_AS_PMAP_DATA)
-decl_simple_lock_data(, pv_kern_free_list_lock MARK_AS_PMAP_DATA)
+decl_simple_lock_data(, pv_free_list_lock MARK_AS_PMAP_DATA);
+decl_simple_lock_data(, pv_kern_free_list_lock MARK_AS_PMAP_DATA);
 
-decl_simple_lock_data(, phys_backup_lock)
+decl_simple_lock_data(, phys_backup_lock);
 
 /*
  *             pt_desc - structure to keep info on page assigned to page tables
@@ -321,6 +745,14 @@ decl_simple_lock_data(, phys_backup_lock)
 
 typedef struct pt_desc {
        queue_chain_t                   pt_page;
+       union {
+               struct pmap             *pmap;
+       };
+       /*
+        * Locate this struct towards the end of the pt_desc; our long term
+        * goal is to make this a VLA to avoid wasting memory if we don't need
+        * multiple entries.
+        */
        struct {
                /*
                 * For non-leaf pagetables, should always be PT_DESC_REFCOUNT
@@ -334,13 +766,8 @@ typedef struct pt_desc {
                 * For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU operations are implicitly wired)
                 */
                unsigned short          wiredcnt;
-       } pt_cnt[PT_INDEX_MAX];
-       union {
-               struct pmap             *pmap;
-       };
-       struct {
                vm_offset_t             va;
-       } pt_map[PT_INDEX_MAX];
+       } ptd_info[PT_INDEX_MAX];
 } pt_desc_t;
 
 
@@ -351,7 +778,7 @@ SECURITY_READ_ONLY_LATE(pt_desc_t *) ptd_root_table;
 pt_desc_t               *ptd_free_list MARK_AS_PMAP_DATA = PTD_ENTRY_NULL;
 SECURITY_READ_ONLY_LATE(boolean_t) ptd_preboot = TRUE;
 unsigned int    ptd_free_count MARK_AS_PMAP_DATA = 0;
-decl_simple_lock_data(, ptd_free_list_lock MARK_AS_PMAP_DATA)
+decl_simple_lock_data(, ptd_free_list_lock MARK_AS_PMAP_DATA);
 
 /*
  *     physical page attribute
@@ -377,8 +804,10 @@ SECURITY_READ_ONLY_LATE(pp_attr_t*)     pp_attr_table;
 
 typedef struct pmap_io_range {
        uint64_t addr;
-       uint32_t len;
-       uint32_t wimg; // treated as pp_attr_t
+       uint64_t len;
+       #define PMAP_IO_RANGE_STRONG_SYNC (1UL << 31) // Strong DSB required for pages in this range
+       uint32_t wimg; // lower 16 bits treated as pp_attr_t, upper 16 bits contain additional mapping flags
+       uint32_t signature; // 4CC
 } __attribute__((packed)) pmap_io_range_t;
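Per the comment above, wimg is now a split field: the low 16 bits hold the pp_attr_t cache attribute and the high bits carry flags such as PMAP_IO_RANGE_STRONG_SYNC. Hypothetical accessors making the split explicit (these helpers are not in the diff):

    #include <stdbool.h>
    #include <stdint.h>

    static inline uint16_t
    io_range_wimg(const pmap_io_range_t *r)
    {
            return (uint16_t)(r->wimg & 0xffff); /* pp_attr_t portion */
    }

    static inline bool
    io_range_strong_sync(const pmap_io_range_t *r)
    {
            return (r->wimg & PMAP_IO_RANGE_STRONG_SYNC) != 0; /* bit 31 */
    }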
 
 SECURITY_READ_ONLY_LATE(pmap_io_range_t*)       io_attr_table;
@@ -386,8 +815,6 @@ SECURITY_READ_ONLY_LATE(pmap_io_range_t*)       io_attr_table;
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
 
-SECURITY_READ_ONLY_LATE(pmap_paddr_t)   io_rgn_start = 0;
-SECURITY_READ_ONLY_LATE(pmap_paddr_t)   io_rgn_end = 0;
 SECURITY_READ_ONLY_LATE(unsigned int)   num_io_rgns = 0;
 
 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
@@ -400,8 +827,13 @@ SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
 #endif
 
-/* free address spaces (1 means free) */
-static uint32_t asid_bitmap[MAX_ASID / (sizeof(uint32_t) * NBBY)] MARK_AS_PMAP_DATA;
+#define PMAP_MAX_SW_ASID ((MAX_ASID + MAX_HW_ASID - 1) / MAX_HW_ASID)
+_Static_assert(PMAP_MAX_SW_ASID <= (UINT8_MAX + 1),
+    "VASID bits can't be represented by an 8-bit integer");
+
+decl_simple_lock_data(, asid_lock MARK_AS_PMAP_DATA);
+static bitmap_t asid_bitmap[BITMAP_LEN(MAX_ASID)] MARK_AS_PMAP_DATA;
+
 
 #if     (__ARM_VMSA__ > 7)
 SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
@@ -425,7 +857,7 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 #define pte_set_wired(ptep, wired)                                                                              \
        do {                                                                                                    \
                SInt16  *ptd_wiredcnt_ptr;                                                                      \
-               ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_ptd(ptep)->pt_cnt[ARM_PT_DESC_INDEX(ptep)].wiredcnt);   \
+               ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_ptd(ptep)->ptd_info[ARM_PT_DESC_INDEX(ptep)].wiredcnt);   \
                if (wired) {                                                                                    \
                                *ptep |= ARM_PTE_WIRED;                                                         \
                                OSAddAtomic16(1, ptd_wiredcnt_ptr);                                             \
@@ -473,69 +905,52 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 
 /* PTEP Define Macros */
 
-#if     (__ARM_VMSA__ == 7)
+/* mask for page descriptor index */
+#define ARM_TT_PT_INDEX_MASK            ARM_PGMASK
 
+#if     (__ARM_VMSA__ == 7)
 #define ARM_PT_DESC_INDEX_MASK          0x00000
 #define ARM_PT_DESC_INDEX_SHIFT         0
 
-/*
- * mask for page descriptor index:  4MB per page table
- */
-#define ARM_TT_PT_INDEX_MASK            0xfffU          /* mask for page descriptor index: 4MB per page table  */
-
 /*
  * Shift value used for reconstructing the virtual address for a PTE.
  */
 #define ARM_TT_PT_ADDR_SHIFT            (10U)
 
 #define ptep_get_va(ptep)                                                                               \
-       ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~0xFFF))))))))->pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<<ARM_TT_PT_ADDR_SHIFT))
+       ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_PGMASK))))))))->ptd_info[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<<ARM_TT_PT_ADDR_SHIFT))
 
 #define ptep_get_pmap(ptep)                                                                             \
-       ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~0xFFF))))))))->pmap))
+       ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_PGMASK))))))))->pmap))
 
 #else
 
 #if (ARM_PGSHIFT == 12)
 #define ARM_PT_DESC_INDEX_MASK          ((PAGE_SHIFT_CONST == ARM_PGSHIFT )? 0x00000ULL : 0x03000ULL)
 #define ARM_PT_DESC_INDEX_SHIFT         ((PAGE_SHIFT_CONST == ARM_PGSHIFT )? 0 : 12)
-/*
- * mask for page descriptor index:  2MB per page table
- */
-#define ARM_TT_PT_INDEX_MASK            (0x0fffULL)
 /*
  * Shift value used for reconstructing the virtual address for a PTE.
  */
 #define ARM_TT_PT_ADDR_SHIFT            (9ULL)
-
-/* TODO: Give this a better name/documentation than "other" */
-#define ARM_TT_PT_OTHER_MASK            (0x0fffULL)
-
 #else
 
 #define ARM_PT_DESC_INDEX_MASK          (0x00000)
 #define ARM_PT_DESC_INDEX_SHIFT         (0)
-/*
- * mask for page descriptor index:  32MB per page table
- */
-#define ARM_TT_PT_INDEX_MASK            (0x3fffULL)
 /*
  * Shift value used for reconstructing the virtual address for a PTE.
  */
 #define ARM_TT_PT_ADDR_SHIFT            (11ULL)
-
-/* TODO: Give this a better name/documentation than "other" */
-#define ARM_TT_PT_OTHER_MASK            (0x3fffULL)
 #endif
 
+
 #define ARM_PT_DESC_INDEX(ptep)                                                                         \
        (((unsigned)(ptep) & ARM_PT_DESC_INDEX_MASK) >> ARM_PT_DESC_INDEX_SHIFT)
 
 #define ptep_get_va(ptep)                                                                               \
-       ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_TT_PT_OTHER_MASK))))))))->pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<<ARM_TT_PT_ADDR_SHIFT))
+       ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_PGMASK))))))))->ptd_info[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<<ARM_TT_PT_ADDR_SHIFT))
 
 #define ptep_get_pmap(ptep)                                                                             \
-       ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_TT_PT_OTHER_MASK))))))))->pmap))
+       ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_PGMASK))))))))->pmap))
 
 #endif
 
@@ -558,8 +973,9 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 
 #ifdef  __arm64__
 
-#define PVH_FLAG_IOMMU       0x4UL
-#define PVH_FLAG_IOMMU_TABLE (1ULL << 63)
+/* All flags listed below are stored in the PV head pointer unless otherwise noted */
+#define PVH_FLAG_IOMMU       0x4UL /* Stored in each PTE, or in PV head for single-PTE PV heads */
+#define PVH_FLAG_IOMMU_TABLE (1ULL << 63) /* Stored in each PTE, or in PV head for single-PTE PV heads */
 #define PVH_FLAG_CPU         (1ULL << 62)
 #define PVH_LOCK_BIT         61
 #define PVH_FLAG_LOCK        (1ULL << PVH_LOCK_BIT)
@@ -591,15 +1007,15 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 
 #define pvh_set_flags(h, f)                                                                                     \
        do {                                                                                                    \
-               __c11_atomic_store((_Atomic vm_offset_t *)(h), (*(vm_offset_t *)(h) & ~PVH_HIGH_FLAGS) | (f),   \
-                    memory_order_relaxed);                                                                     \
+               os_atomic_store((vm_offset_t *)(h), (*(vm_offset_t *)(h) & ~PVH_HIGH_FLAGS) | (f),              \
+                    relaxed);                                                                                  \
        } while (0)
 
 #define pvh_update_head(h, e, t)                                                                                \
        do {                                                                                                    \
                assert(*(vm_offset_t *)(h) & PVH_FLAG_LOCK);                                                    \
-               __c11_atomic_store((_Atomic vm_offset_t *)(h), (vm_offset_t)(e) | (t) | PVH_FLAG_LOCK,          \
-                    memory_order_relaxed);                                                                     \
+               os_atomic_store((vm_offset_t *)(h), (vm_offset_t)(e) | (t) | PVH_FLAG_LOCK,                     \
+                    relaxed);                                                                                  \
        } while (0)
 
 #define pvh_update_head_unlocked(h, e, t)                                                                       \
@@ -740,25 +1156,34 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 
 #if     (__ARM_VMSA__ == 7)
 
-#define tte_index(pmap, addr)                                                           \
+#define tte_index(pmap, pt_attr, addr) \
        ttenum((addr))
 
+#define pte_index(pmap, pt_attr, addr) \
+       ptenum((addr))
+
 #else
 
-#define tt0_index(pmap, addr)                                                           \
-       (((addr) & ARM_TT_L0_INDEX_MASK) >> ARM_TT_L0_SHIFT)
+#define ttn_index(pmap, pt_attr, addr, pt_level) \
+       (((addr) & (pt_attr)->pta_level_info[(pt_level)].index_mask) >> (pt_attr)->pta_level_info[(pt_level)].shift)
+
+#define tt0_index(pmap, pt_attr, addr) \
+       ttn_index((pmap), (pt_attr), (addr), PMAP_TT_L0_LEVEL)
 
-#define tt1_index(pmap, addr)                                                           \
-       (((addr) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)
+#define tt1_index(pmap, pt_attr, addr) \
+       ttn_index((pmap), (pt_attr), (addr), PMAP_TT_L1_LEVEL)
 
-#define tt2_index(pmap, addr)                                                           \
-       (((addr) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)
+#define tt2_index(pmap, pt_attr, addr) \
+       ttn_index((pmap), (pt_attr), (addr), PMAP_TT_L2_LEVEL)
 
-#define tt3_index(pmap, addr)                                                           \
-       (((addr) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)
+#define tt3_index(pmap, pt_attr, addr) \
+       ttn_index((pmap), (pt_attr), (addr), PMAP_TT_L3_LEVEL)
 
-#define tte_index(pmap, addr)                                                           \
-       (((addr) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)
+#define tte_index(pmap, pt_attr, addr) \
+       tt2_index((pmap), (pt_attr), (addr))
+
+#define pte_index(pmap, pt_attr, addr) \
+       tt3_index((pmap), (pt_attr), (addr))
 
 #endif
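ttn_index() above is just a mask-and-shift on the VA. A worked example with assumed 4K-granule L2 constants (512 entries indexed by va[29:21]):

    #include <assert.h>
    #include <stdint.h>

    #define TOY_L2_SHIFT      21ULL
    #define TOY_L2_INDEX_MASK (0x1ffULL << TOY_L2_SHIFT)

    int
    main(void)
    {
            uint64_t va  = 0x12345678ULL;
            uint64_t idx = (va & TOY_L2_INDEX_MASK) >> TOY_L2_SHIFT;
            /* va[29:21] of 0x12345678 is 0x91, i.e. L2 entry 145 */
            assert(idx == 0x91);
            return 0;
    }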
 
@@ -810,13 +1235,11 @@ lck_grp_t pmap_lck_grp;
                pmap_unlock_bit((uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD, PVH_LOCK_BIT - (PVH_LOCK_WORD * 32));       \
        } while (0)
 
-#define PMAP_UPDATE_TLBS(pmap, s, e) {                                                  \
-       flush_mmu_tlb_region_asid_async(s, (unsigned)(e - s), pmap);                    \
-       sync_tlb_flush();                                                               \
+#define PMAP_UPDATE_TLBS(pmap, s, e, strong) {                                          \
+       pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (unsigned)(e - s), pmap);      \
+       pmap_sync_tlb(strong);                                                          \
 }
 
-#ifdef  __ARM_L1_PTW__
-
 #define FLUSH_PTE_RANGE(spte, epte)                                                     \
        __builtin_arm_dmb(DMB_ISH);
 
@@ -829,32 +1252,15 @@ lck_grp_t pmap_lck_grp;
 #define FLUSH_PTE_RANGE_STRONG(spte, epte)                                              \
        __builtin_arm_dsb(DSB_ISH);
 
-#else /* __ARM_L1_PTW */
-
-#define FLUSH_PTE_RANGE(spte, epte)                                                     \
-               CleanPoU_DcacheRegion((vm_offset_t)spte,                                \
-                       (vm_offset_t)epte - (vm_offset_t)spte);
-
-#define FLUSH_PTE(pte_p)                                                                \
-       __unreachable_ok_push                                                           \
-       if (TEST_PAGE_RATIO_4)                                                          \
-               FLUSH_PTE_RANGE((pte_p), (pte_p) + 4);                                  \
-       else                                                                            \
-               FLUSH_PTE_RANGE((pte_p), (pte_p) + 1);                                  \
-       CleanPoU_DcacheRegion((vm_offset_t)pte_p, sizeof(pt_entry_t));                  \
-       __unreachable_ok_pop
-
-#define FLUSH_PTE_STRONG(pte_p) FLUSH_PTE(pte_p)
-
-#define FLUSH_PTE_RANGE_STRONG(spte, epte) FLUSH_PTE_RANGE(spte, epte)
-
-#endif /* !defined(__ARM_L1_PTW) */
-
 #define WRITE_PTE_FAST(pte_p, pte_entry)                                                \
        __unreachable_ok_push                                                           \
        if (TEST_PAGE_RATIO_4) {                                                        \
-               if (((unsigned)(pte_p)) & 0x1f)                                         \
-                       panic("WRITE_PTE\n");                                           \
+               if (((unsigned)(pte_p)) & 0x1f) {                                       \
+                       panic("%s: WRITE_PTE_FAST is unaligned, "                       \
+                             "pte_p=%p, pte_entry=%p",                                 \
+                              __FUNCTION__,                                            \
+                              pte_p, (void*)pte_entry);                                \
+               }                                                                       \
                if (((pte_entry) & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {        \
                        *(pte_p) = (pte_entry);                                         \
                        *((pte_p)+1) = (pte_entry);                                     \
@@ -959,9 +1365,6 @@ ppnum_t                 pmap_vtophys(
 void pmap_switch_user_ttb(
        pmap_t pmap);
 
-static void     flush_mmu_tlb_region_asid_async(
-       vm_offset_t va, unsigned length, pmap_t pmap);
-
 static kern_return_t pmap_expand(
        pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
 
@@ -969,7 +1372,7 @@ static int pmap_remove_range(
        pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *, uint32_t *);
 
 static int pmap_remove_range_options(
-       pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *, uint32_t *, int);
+       pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *, uint32_t *, bool *, int);
 
 static tt_entry_t *pmap_tt1_allocate(
        pmap_t, vm_size_t, unsigned int);
@@ -989,15 +1392,6 @@ static kern_return_t pmap_tt_allocate(
 static void pmap_tte_deallocate(
        pmap_t, tt_entry_t *, unsigned int);
 
-#define PMAP_TT_L1_LEVEL        0x1
-#define PMAP_TT_L2_LEVEL        0x2
-#define PMAP_TT_L3_LEVEL        0x3
-#if (__ARM_VMSA__ == 7)
-#define PMAP_TT_MAX_LEVEL       PMAP_TT_L2_LEVEL
-#else
-#define PMAP_TT_MAX_LEVEL       PMAP_TT_L3_LEVEL
-#endif
-
 #ifdef __ARM64_PMAP_SUBPAGE_L1__
 #if (__ARM_VMSA__ <= 7)
 #error This is not supported for old-style page tables
@@ -1024,6 +1418,9 @@ static inline tt_entry_t *pmap_tt2e(
 static inline pt_entry_t *pmap_tt3e(
        pmap_t, vm_map_address_t);
 
+static inline pt_entry_t *pmap_ttne(
+       pmap_t, unsigned int, vm_map_address_t);
+
 static void pmap_unmap_sharedpage(
        pmap_t pmap);
 
@@ -1064,19 +1461,20 @@ static void pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes);
 
 static void pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes);
 
-
 static void pmap_trim_self(pmap_t pmap);
 static void pmap_trim_subord(pmap_t subord);
 
+
 #define PMAP_SUPPORT_PROTOTYPES(__return_type, __function_name, __function_args, __function_index) \
-       static __return_type __function_name##_internal __function_args;
+       static __return_type __function_name##_internal __function_args
 
 PMAP_SUPPORT_PROTOTYPES(
        kern_return_t,
        arm_fast_fault, (pmap_t pmap,
        vm_map_address_t va,
        vm_prot_t fault_type,
-       boolean_t from_user), ARM_FAST_FAULT_INDEX);
+       bool was_af_fault,
+       bool from_user), ARM_FAST_FAULT_INDEX);
 
 
 PMAP_SUPPORT_PROTOTYPES(
@@ -1110,9 +1508,9 @@ PMAP_SUPPORT_PROTOTYPES(
 
 PMAP_SUPPORT_PROTOTYPES(
        pmap_t,
-       pmap_create, (ledger_t ledger,
+       pmap_create_options, (ledger_t ledger,
        vm_map_size_t size,
-       boolean_t is_64bit), PMAP_CREATE_INDEX);
+       unsigned int flags), PMAP_CREATE_INDEX);
 
 PMAP_SUPPORT_PROTOTYPES(
        void,
@@ -1287,6 +1685,7 @@ PMAP_SUPPORT_PROTOTYPES(
 
 
 
+
 void pmap_footprint_suspend(vm_map_t    map,
     boolean_t   suspend);
 PMAP_SUPPORT_PROTOTYPES(
@@ -1389,6 +1788,7 @@ pmap_get_cpu_data(void)
 }
 
 
+
 /* TODO */
 pmap_paddr_t
 pmap_pages_reclaim(
@@ -1398,7 +1798,6 @@ pmap_pages_reclaim(
        unsigned                i;
        pt_desc_t               *ptdp;
 
-
        /*
         * pmap_pages_reclaim() is returning a page by freeing an active pt page.
         * To be eligible, a pt page is assigned to a user pmap. It doesn't have any wired pte
@@ -1442,13 +1841,13 @@ pmap_pages_reclaim(
                                unsigned wiredcnt_acc = 0;
 
                                for (i = 0; i < PT_INDEX_MAX; i++) {
-                                       if (ptdp->pt_cnt[i].refcnt == PT_DESC_REFCOUNT) {
+                                       if (ptdp->ptd_info[i].refcnt == PT_DESC_REFCOUNT) {
                                                /* Do not attempt to free a page that contains an L2 table */
                                                refcnt_acc = 0;
                                                break;
                                        }
-                                       refcnt_acc += ptdp->pt_cnt[i].refcnt;
-                                       wiredcnt_acc += ptdp->pt_cnt[i].wiredcnt;
+                                       refcnt_acc += ptdp->ptd_info[i].refcnt;
+                                       wiredcnt_acc += ptdp->ptd_info[i].wiredcnt;
                                }
                                if ((wiredcnt_acc == 0) && (refcnt_acc != 0)) {
                                        found_page = TRUE;
@@ -1462,21 +1861,25 @@ pmap_pages_reclaim(
                        ptdp = (pt_desc_t *)queue_next((queue_t)ptdp);
                }
                if (!found_page) {
-                       panic("pmap_pages_reclaim(): No eligible page in pt_page_list\n");
+                       panic("%s: No eligible page in pt_page_list", __FUNCTION__);
                } else {
-                       int                                     remove_count = 0;
+                       int                     remove_count = 0;
+                       bool                    need_strong_sync = false;
                        vm_map_address_t        va;
-                       pmap_t                          pmap;
-                       pt_entry_t                      *bpte, *epte;
-                       pt_entry_t                      *pte_p;
-                       tt_entry_t                      *tte_p;
-                       uint32_t                        rmv_spte = 0;
+                       pmap_t                  pmap;
+                       pt_entry_t              *bpte, *epte;
+                       pt_entry_t              *pte_p;
+                       tt_entry_t              *tte_p;
+                       uint32_t                rmv_spte = 0;
 
                        pmap_simple_unlock(&pt_pages_lock);
                        pmap = ptdp->pmap;
                        PMAP_ASSERT_LOCKED(pmap); // pmap lock should be held from loop above
+
+                       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
                        for (i = 0; i < PT_INDEX_MAX; i++) {
-                               va = ptdp->pt_map[i].va;
+                               va = ptdp->ptd_info[i].va;
 
                                /* If the VA is bogus, this may represent an unallocated region
                                 * or one which is in transition (already being freed or expanded).
@@ -1488,15 +1891,9 @@ pmap_pages_reclaim(
                                tte_p = pmap_tte(pmap, va);
                                if ((tte_p != (tt_entry_t *) NULL)
                                    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
-#if     (__ARM_VMSA__ == 7)
-                                       pte_p = (pt_entry_t *) ttetokv(*tte_p);
-                                       bpte = &pte_p[ptenum(va)];
-                                       epte = bpte + PAGE_SIZE / sizeof(pt_entry_t);
-#else
                                        pte_p = (pt_entry_t *) ttetokv(*tte_p);
-                                       bpte = &pte_p[tt3_index(pmap, va)];
+                                       bpte = &pte_p[pte_index(pmap, pt_attr, va)];
                                        epte = bpte + PAGE_SIZE / sizeof(pt_entry_t);
-#endif
                                        /*
                                         * Use PMAP_OPTIONS_REMOVE to clear any
                                         * "compressed" markers and update the
@@ -1510,33 +1907,23 @@ pmap_pages_reclaim(
                                         */
                                        remove_count += pmap_remove_range_options(
                                                pmap, va, bpte, epte,
-                                               &rmv_spte, PMAP_OPTIONS_REMOVE);
-                                       if (ptdp->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt != 0) {
-                                               panic("pmap_pages_reclaim(): ptdp %p, count %d\n", ptdp, ptdp->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt);
+                                               &rmv_spte, &need_strong_sync, PMAP_OPTIONS_REMOVE);
+                                       if (ptdp->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt != 0) {
+                                               panic("%s: ptdp %p, count %d", __FUNCTION__, ptdp, ptdp->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt);
                                        }
-#if     (__ARM_VMSA__ == 7)
-                                       pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L1_LEVEL);
-                                       flush_mmu_tlb_entry_async((va & ~ARM_TT_L1_PT_OFFMASK) | (pmap->asid & 0xff));
-                                       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->asid & 0xff));
-                                       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 2 * ARM_TT_L1_SIZE) | (pmap->asid & 0xff));
-                                       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->asid & 0xff));
-#else
-                                       pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L2_LEVEL);
-                                       flush_mmu_tlb_entry_async(tlbi_addr(va & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
-#endif
+
+                                       pmap_tte_deallocate(pmap, tte_p, PMAP_TT_TWIG_LEVEL);
 
                                        if (remove_count > 0) {
-#if     (__ARM_VMSA__ == 7)
-                                               flush_mmu_tlb_region_asid_async(va, 4 * ARM_TT_L1_SIZE, pmap);
-#else
-                                               flush_mmu_tlb_region_asid_async(va, ARM_TT_L2_SIZE, pmap);
-#endif
+                                               pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, (unsigned int)pt_attr_leaf_table_size(pt_attr), pmap);
+                                       } else {
+                                               pmap_get_pt_ops(pmap)->flush_tlb_tte_async(va, pmap);
                                        }
                                }
                        }
-                       sync_tlb_flush();
                        // Undo the lock we grabbed when we found ptdp above
                        PMAP_UNLOCK(pmap);
+                       pmap_sync_tlb(need_strong_sync);
                }
                pmap_simple_lock(&pmap_pages_lock);
        }
@@ -1671,76 +2058,56 @@ pmap_tt_ledger_debit(
        }
 }
 
-static unsigned int
-alloc_asid(
-       void)
+static bool
+alloc_asid(pmap_t pmap)
 {
-       unsigned int    asid_bitmap_index;
-
-       pmap_simple_lock(&pmaps_lock);
-       for (asid_bitmap_index = 0; asid_bitmap_index < (MAX_ASID / (sizeof(uint32_t) * NBBY)); asid_bitmap_index++) {
-               unsigned int    temp = ffs(asid_bitmap[asid_bitmap_index]);
-               if (temp > 0) {
-                       temp -= 1;
-                       asid_bitmap[asid_bitmap_index] &= ~(1 << temp);
-#if __ARM_KERNEL_PROTECT__
-                       /*
-                        * We need two ASIDs: n and (n | 1).  n is used for EL0,
-                        * (n | 1) for EL1.
-                        */
-                       unsigned int temp2 = temp | 1;
-                       assert(temp2 < MAX_ASID);
-                       assert(temp2 < 32);
-                       assert(temp2 != temp);
-                       assert(asid_bitmap[asid_bitmap_index] & (1 << temp2));
-
-                       /* Grab the second ASID. */
-                       asid_bitmap[asid_bitmap_index] &= ~(1 << temp2);
-#endif /* __ARM_KERNEL_PROTECT__ */
-                       pmap_simple_unlock(&pmaps_lock);
-
-                       /*
-                        * We should never vend out physical ASID 0 through this
-                        * method, as it belongs to the kernel.
-                        */
-                       assert(((asid_bitmap_index * sizeof(uint32_t) * NBBY + temp) % ARM_MAX_ASID) != 0);
+       int vasid;
+       uint16_t hw_asid;
 
-#if __ARM_KERNEL_PROTECT__
-                       /* Or the kernel EL1 ASID. */
-                       assert(((asid_bitmap_index * sizeof(uint32_t) * NBBY + temp) % ARM_MAX_ASID) != 1);
-#endif /* __ARM_KERNEL_PROTECT__ */
-
-                       return asid_bitmap_index * sizeof(uint32_t) * NBBY + temp;
-               }
+       pmap_simple_lock(&asid_lock);
+       vasid = bitmap_first(&asid_bitmap[0], MAX_ASID);
+       if (vasid < 0) {
+               pmap_simple_unlock(&asid_lock);
+               return false;
        }
-       pmap_simple_unlock(&pmaps_lock);
-       /*
-        * ToDo: Add code to deal with pmap with no asid panic for now. Not
-        * an issue with the small config  process hard limit
-        */
-       panic("alloc_asid(): out of ASID number");
-       return MAX_ASID;
+       assert(vasid < MAX_ASID);
+       bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
+       pmap_simple_unlock(&asid_lock);
+       // bitmap_first() returns highest-order bits first, but a 0-based scheme works
+       // slightly better with the collision detection scheme used by pmap_switch_internal().
+       vasid = MAX_ASID - 1 - vasid;
+       hw_asid = vasid % MAX_HW_ASID;
+       pmap->sw_asid = vasid / MAX_HW_ASID;
+       hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
+#if __ARM_KERNEL_PROTECT__
+       hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
+#endif
+       pmap->hw_asid = hw_asid;
+       return true;
 }
 
 static void
-free_asid(
-       int asid)
+free_asid(pmap_t pmap)
 {
-       /* Don't free up any alias of physical ASID 0. */
-       assert((asid % ARM_MAX_ASID) != 0);
-
-       pmap_simple_lock(&pmaps_lock);
-       setbit(asid, (int *) asid_bitmap);
+       unsigned int vasid;
+       uint16_t hw_asid = pmap->hw_asid;
+       assert(hw_asid != 0); // Should not try to free kernel ASID
 
 #if __ARM_KERNEL_PROTECT__
-       assert((asid | 1) < MAX_ASID);
-       assert((asid | 1) != asid);
-       setbit(asid | 1, (int *) asid_bitmap);
-#endif /* __ARM_KERNEL_PROTECT__ */
+       hw_asid >>= 1;
+#endif
+       hw_asid -= 1;
 
-       pmap_simple_unlock(&pmaps_lock);
+       vasid = ((unsigned int)pmap->sw_asid * MAX_HW_ASID) + hw_asid;
+       vasid = MAX_ASID - 1 - vasid;
+
+       pmap_simple_lock(&asid_lock);
+       assert(!bitmap_test(&asid_bitmap[0], vasid));
+       bitmap_set(&asid_bitmap[0], vasid);
+       pmap_simple_unlock(&asid_lock);
 }
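
The new allocator packs one virtual ASID into a (hw_asid, sw_asid) pair and unpacks it symmetrically on free. A minimal standalone sketch of the round-trip, assuming illustrative sizes MAX_ASID = 1024 and MAX_HW_ASID = 255 (the real constants are configuration-dependent, and the __ARM_KERNEL_PROTECT__ doubling is omitted):

    #include <assert.h>
    #include <stdint.h>

    #define EX_MAX_ASID    1024  /* assumed size of the virtual ASID space */
    #define EX_MAX_HW_ASID 255   /* assumed hardware ASIDs per generation */

    static void
    example_asid_round_trip(int bitmap_bit)
    {
        /* alloc_asid(): reverse the bitmap index so low ASIDs go out first. */
        int vasid = EX_MAX_ASID - 1 - bitmap_bit;
        uint16_t hw_asid = (uint16_t)(vasid % EX_MAX_HW_ASID) + 1; /* ASID 0 stays with the kernel */
        uint8_t  sw_asid = (uint8_t)(vasid / EX_MAX_HW_ASID);

        /* free_asid(): invert the packing to recover the bitmap bit. */
        int v = ((int)sw_asid * EX_MAX_HW_ASID) + (hw_asid - 1);
        assert(EX_MAX_ASID - 1 - v == bitmap_bit);
    }
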
 
+
 #ifndef PMAP_PV_LOAD_FACTOR
 #define PMAP_PV_LOAD_FACTOR            1
 #endif
@@ -1931,8 +2298,8 @@ pv_list_free(
 static inline void
 pv_water_mark_check(void)
 {
-       if ((pv_free_count < pv_low_water_mark) || (pv_kern_free_count < pv_kern_low_water_mark)) {
-               if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse)) {
+       if (__improbable((pv_free_count < pv_low_water_mark) || (pv_kern_free_count < pv_kern_low_water_mark))) {
+               if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) {
                        thread_wakeup(&mapping_replenish_event);
                }
        }
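
os_atomic_cmpxchg turns the wakeup into a one-shot gate: of all threads that observe a low pv count, only the one that flips mappingrecurse from 0 to 1 issues the wakeup. A minimal sketch of the same pattern in C11 atomics (names are illustrative, not kernel API):

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic int replenish_pending;  /* stand-in for mappingrecurse */

    static bool
    try_claim_wakeup(void)
    {
        int expected = 0;
        /* Succeeds for exactly one caller per 0 -> 1 transition. */
        return atomic_compare_exchange_strong(&replenish_pending, &expected, 1);
    }
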
@@ -2134,7 +2501,8 @@ mapping_free_prime(void)
        kr = mapping_free_prime_internal();
 
        if (kr != KERN_SUCCESS) {
-               panic("%s: failed, kr=%d", __FUNCTION__, kr);
+               panic("%s: failed, kr=%d",
+                   __FUNCTION__, kr);
        }
 }
 
@@ -2147,7 +2515,8 @@ mapping_adjust(void)
 
        mres = kernel_thread_start_priority((thread_continue_t)mapping_replenish, NULL, MAXPRI_KERNEL, &mapping_replenish_thread);
        if (mres != KERN_SUCCESS) {
-               panic("pmap: mapping_replenish thread creation failed");
+               panic("%s: mapping_replenish thread creation failed",
+                   __FUNCTION__);
        }
        thread_deallocate(mapping_replenish_thread);
 }
@@ -2331,7 +2700,8 @@ ptd_alloc_unlinked(bool reclaim)
                ptd_free_list = (pt_desc_t *)(*(void **)ptdp);
                ptd_free_count--;
        } else {
-               panic("out of ptd entry\n");
+               panic("%s: out of ptd entry",
+                   __FUNCTION__);
        }
 
        if (!ptd_preboot) {
@@ -2343,9 +2713,9 @@ ptd_alloc_unlinked(bool reclaim)
        ptdp->pmap = NULL;
 
        for (i = 0; i < PT_INDEX_MAX; i++) {
-               ptdp->pt_map[i].va = (vm_offset_t)-1;
-               ptdp->pt_cnt[i].refcnt = 0;
-               ptdp->pt_cnt[i].wiredcnt = 0;
+               ptdp->ptd_info[i].va = (vm_offset_t)-1;
+               ptdp->ptd_info[i].refcnt = 0;
+               ptdp->ptd_info[i].wiredcnt = 0;
        }
 
        return ptdp;
@@ -2379,7 +2749,10 @@ ptd_deallocate(pt_desc_t *ptdp)
        pmap_t          pmap = ptdp->pmap;
 
        if (ptd_preboot) {
-               panic("ptd_deallocate(): early boot\n");
+               panic("%s: early boot, "
+                   "ptdp=%p",
+                   __FUNCTION__,
+                   ptdp);
        }
 
        if (ptdp->pt_page.next != NULL) {
@@ -2406,21 +2779,21 @@ ptd_init(
        pt_entry_t *pte_p)
 {
        if (ptdp->pmap != pmap) {
-               panic("ptd_init(): pmap mismatch\n");
+               panic("%s: pmap mismatch, "
+                   "ptdp=%p, pmap=%p, va=%p, level=%u, pte_p=%p",
+                   __FUNCTION__,
+                   ptdp, pmap, (void*)va, level, pte_p);
        }
 
 #if     (__ARM_VMSA__ == 7)
        assert(level == 2);
-       ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~(ARM_TT_L1_PT_OFFMASK);
+       ptdp->ptd_info[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~(ARM_TT_L1_PT_OFFMASK);
 #else
-       if (level == 3) {
-               ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L2_OFFMASK;
-       } else if (level == 2) {
-               ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L1_OFFMASK;
-       }
+       assert(level > pt_attr_root_level(pmap_get_pt_attr(pmap)));
+       ptdp->ptd_info[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~(pt_attr_ln_offmask(pmap_get_pt_attr(pmap), level - 1));
 #endif
        if (level < PMAP_TT_MAX_LEVEL) {
-               ptdp->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt = PT_DESC_REFCOUNT;
+               ptdp->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt = PT_DESC_REFCOUNT;
        }
 }
 
@@ -2442,10 +2815,12 @@ static inline tt_entry_t *
 pmap_tte(pmap_t pmap,
     vm_map_address_t addr)
 {
-       if (!(tte_index(pmap, addr) < pmap->tte_index_max)) {
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
+       if (!(tte_index(pmap, pt_attr, addr) < pmap->tte_index_max)) {
                return (tt_entry_t *)NULL;
        }
-       return &pmap->tte[tte_index(pmap, addr)];
+       return &pmap->tte[tte_index(pmap, pt_attr, addr)];
 }
 
 
@@ -2470,11 +2845,14 @@ pmap_pte(
                return PT_ENTRY_NULL;
        }
        tte = *ttp;
-       #if MACH_ASSERT
+#if MACH_ASSERT
        if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
-               panic("Attempt to demote L1 block: pmap=%p, va=0x%llx, tte=0x%llx\n", pmap, (uint64_t)addr, (uint64_t)tte);
+               panic("%s: Attempt to demote L1 block, tte=0x%lx, "
+                   "pmap=%p, addr=%p",
+                   __FUNCTION__, (unsigned long)tte,
+                   pmap, (void*)addr);
        }
-       #endif
+#endif
        if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
                return PT_ENTRY_NULL;
        }
@@ -2482,8 +2860,75 @@ pmap_pte(
        return ptp;
 }
 
+__unused static inline tt_entry_t *
+pmap_ttne(pmap_t pmap,
+    unsigned int target_level,
+    vm_map_address_t addr)
+{
+       tt_entry_t * ret_ttep = NULL;
+
+       switch (target_level) {
+       case 1:
+               ret_ttep = pmap_tte(pmap, addr);
+               break;
+       case 2:
+               ret_ttep = (tt_entry_t *)pmap_pte(pmap, addr);
+               break;
+       default:
+               panic("%s: bad level, "
+                   "pmap=%p, target_level=%u, addr=%p",
+                   __FUNCTION__,
+                   pmap, target_level, (void *)addr);
+       }
+
+       return ret_ttep;
+}
+
 #else
 
+static inline tt_entry_t *
+pmap_ttne(pmap_t pmap,
+    unsigned int target_level,
+    vm_map_address_t addr)
+{
+       tt_entry_t * ttp = NULL;
+       tt_entry_t * ttep = NULL;
+       tt_entry_t   tte = ARM_TTE_EMPTY;
+       unsigned int cur_level;
+
+       const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
+       ttp = pmap->tte;
+
+       assert(target_level <= pt_attr->pta_max_level);
+
+       for (cur_level = pt_attr->pta_root_level; cur_level <= target_level; cur_level++) {
+               ttep = &ttp[ttn_index(pmap, pt_attr, addr, cur_level)];
+
+               if (cur_level == target_level) {
+                       break;
+               }
+
+               tte = *ttep;
+
+#if MACH_ASSERT
+               if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) == (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID)) {
+                       panic("%s: Attempt to demote L%u block, tte=0x%llx, "
+                           "pmap=%p, target_level=%u, addr=%p",
+                           __FUNCTION__, cur_level, tte,
+                           pmap, target_level, (void*)addr);
+               }
+#endif
+               if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) != (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID)) {
+                       return TT_ENTRY_NULL;
+               }
+
+               ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
+       }
+
+       return ttep;
+}
+
 /*
 *     Given an offset and a map, compute the address of the level 1 translation table entry.
 *     If the translation is invalid then PT_ENTRY_NULL is returned.
@@ -2492,14 +2937,7 @@ static inline tt_entry_t *
 pmap_tt1e(pmap_t pmap,
     vm_map_address_t addr)
 {
-       /* Level 0 currently unused */
-#if __ARM64_TWO_LEVEL_PMAP__
-#pragma unused(pmap, addr)
-       panic("pmap_tt1e called on a two level pmap");
-       return NULL;
-#else
-       return &pmap->tte[tt1_index(pmap, addr)];
-#endif
+       return pmap_ttne(pmap, PMAP_TT_L1_LEVEL, addr);
 }
 
 /*
@@ -2510,26 +2948,7 @@ static inline tt_entry_t *
 pmap_tt2e(pmap_t pmap,
     vm_map_address_t addr)
 {
-#if __ARM64_TWO_LEVEL_PMAP__
-       return &pmap->tte[tt2_index(pmap, addr)];
-#else
-       tt_entry_t     *ttp;
-       tt_entry_t      tte;
-
-       ttp = pmap_tt1e(pmap, addr);
-       tte = *ttp;
-       #if MACH_ASSERT
-       if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) == (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID)) {
-               panic("Attempt to demote L1 block (?!): pmap=%p, va=0x%llx, tte=0x%llx\n", pmap, (uint64_t)addr, (uint64_t)tte);
-       }
-       #endif
-       if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) != (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID)) {
-               return PT_ENTRY_NULL;
-       }
-
-       ttp = &((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, addr)];
-       return (tt_entry_t *)ttp;
-#endif
+       return pmap_ttne(pmap, PMAP_TT_L2_LEVEL, addr);
 }
 
 
@@ -2542,32 +2961,9 @@ pmap_tt3e(
        pmap_t pmap,
        vm_map_address_t addr)
 {
-       pt_entry_t     *ptp;
-       tt_entry_t     *ttp;
-       tt_entry_t      tte;
-
-       ttp = pmap_tt2e(pmap, addr);
-       if (ttp == PT_ENTRY_NULL) {
-               return PT_ENTRY_NULL;
-       }
-
-       tte = *ttp;
-
-#if MACH_ASSERT
-       if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) == (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID)) {
-               panic("Attempt to demote L2 block: pmap=%p, va=0x%llx, tte=0x%llx\n", pmap, (uint64_t)addr, (uint64_t)tte);
-       }
-#endif
-       if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) != (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID)) {
-               return PT_ENTRY_NULL;
-       }
-
-       /* Get third-level (4KB) entry */
-       ptp = &(((pt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt3_index(pmap, addr)]);
-       return ptp;
+       return (pt_entry_t*)pmap_ttne(pmap, PMAP_TT_L3_LEVEL, addr);
 }
 
-
 static inline tt_entry_t *
 pmap_tte(
        pmap_t pmap,
@@ -2576,7 +2972,6 @@ pmap_tte(
        return pmap_tt2e(pmap, addr);
 }
 
-
 static inline pt_entry_t *
 pmap_pte(
        pmap_t pmap,
@@ -2588,6 +2983,10 @@ pmap_pte(
 #endif
 
 
+
+
+
+
 /*
  *      Map memory at initialization.  The physical addresses being
  *      mapped are not managed and are never unmapped.
@@ -2650,6 +3049,12 @@ pmap_map_bd_with_options(
        case PMAP_MAP_BD_POSTED:
                mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
                break;
+       case PMAP_MAP_BD_POSTED_REORDERED:
+               mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
+               break;
+       case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
+               mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
+               break;
        default:
                mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
                break;
@@ -2666,9 +3071,13 @@ pmap_map_bd_with_options(
        while (paddr < end) {
                ptep = pmap_pte(kernel_pmap, vaddr);
                if (ptep == PT_ENTRY_NULL) {
-                       panic("pmap_map_bd");
+                       panic("%s: no PTE for vaddr=%p, "
+                           "virt=%p, start=%p, end=%p, prot=0x%x, options=0x%x",
+                           __FUNCTION__, (void*)vaddr,
+                           (void*)virt, (void*)start, (void*)end, prot, options);
                }
-               assert(!ARM_PTE_IS_COMPRESSED(*ptep));
+
+               assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
                WRITE_PTE_STRONG(ptep, tmplate);
 
                pte_increment_pa(tmplate);
@@ -2717,7 +3126,7 @@ pmap_map_bd(
                if (ptep == PT_ENTRY_NULL) {
                        panic("pmap_map_bd");
                }
-               assert(!ARM_PTE_IS_COMPRESSED(*ptep));
+               assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
                WRITE_PTE_STRONG(ptep, tmplate);
 
                pte_increment_pa(tmplate);
@@ -2763,24 +3172,30 @@ pmap_map_high_window_bd(
        len += offset;
 
        if (len > (va_max - va_start)) {
-               panic("pmap_map_high_window_bd: area too large\n");
+               panic("%s: area too large, "
+                   "pa_start=%p, len=%p, prot=0x%x",
+                   __FUNCTION__,
+                   (void*)pa_start, (void*)len, prot);
        }
 
 scan:
        for (; va_start < va_max; va_start += PAGE_SIZE) {
                ptep = pmap_pte(kernel_pmap, va_start);
-               assert(!ARM_PTE_IS_COMPRESSED(*ptep));
+               assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
                if (*ptep == ARM_PTE_TYPE_FAULT) {
                        break;
                }
        }
        if (va_start > va_max) {
-               panic("pmap_map_high_window_bd: insufficient pages\n");
+               panic("%s: insufficient pages, "
+                   "pa_start=%p, len=%p, prot=0x%x",
+                   __FUNCTION__,
+                   (void*)pa_start, (void*)len, prot);
        }
 
        for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
                ptep = pmap_pte(kernel_pmap, va_end);
-               assert(!ARM_PTE_IS_COMPRESSED(*ptep));
+               assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
                if (*ptep != ARM_PTE_TYPE_FAULT) {
                        va_start = va_end + PAGE_SIZE;
                        goto scan;
@@ -2803,7 +3218,7 @@ scan:
 #endif /* __ARM_KERNEL_PROTECT__ */
                WRITE_PTE_STRONG(ptep, pte);
        }
-       PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len);
+       PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false);
 #if KASAN
        kasan_notify_address(va_start, len);
 #endif
@@ -2835,32 +3250,18 @@ pmap_compute_io_rgns(void)
                        panic("pmap I/O region %u addr 0x%llx is not page-aligned", i, ranges[i].addr);
                }
                if (ranges[i].len & PAGE_MASK) {
-                       panic("pmap I/O region %u length 0x%x is not page-aligned", i, ranges[i].len);
+                       panic("pmap I/O region %u length 0x%llx is not page-aligned", i, ranges[i].len);
                }
                if (os_add_overflow(ranges[i].addr, ranges[i].len, &rgn_end)) {
-                       panic("pmap I/O region %u addr 0x%llx length 0x%x wraps around", i, ranges[i].addr, ranges[i].len);
-               }
-               if ((i == 0) || (ranges[i].addr < io_rgn_start)) {
-                       io_rgn_start = ranges[i].addr;
+                       panic("pmap I/O region %u addr 0x%llx length 0x%llx wraps around", i, ranges[i].addr, ranges[i].len);
                }
-               if ((i == 0) || (rgn_end > io_rgn_end)) {
-                       io_rgn_end = rgn_end;
+               if (((ranges[i].addr <= gPhysBase) && (rgn_end > gPhysBase)) ||
+                   ((ranges[i].addr < avail_end) && (rgn_end >= avail_end)) ||
+                   ((ranges[i].addr > gPhysBase) && (rgn_end < avail_end))) {
+                       panic("pmap I/O region %u addr 0x%llx length 0x%llx overlaps physical memory", i, ranges[i].addr, ranges[i].len);
                }
-               ++num_io_rgns;
-       }
 
-       if (io_rgn_start & PAGE_MASK) {
-               panic("pmap I/O region start is not page-aligned!\n");
-       }
-
-       if (io_rgn_end & PAGE_MASK) {
-               panic("pmap I/O region end is not page-aligned!\n");
-       }
-
-       if (((io_rgn_start <= gPhysBase) && (io_rgn_end > gPhysBase)) ||
-           ((io_rgn_start < avail_end) && (io_rgn_end >= avail_end)) ||
-           ((io_rgn_start > gPhysBase) && (io_rgn_end < avail_end))) {
-               panic("pmap I/O region overlaps physical memory!\n");
+               ++num_io_rgns;
        }
 
        return num_io_rgns * sizeof(*ranges);
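
The three-way panic condition rejects an I/O range that straddles the start of managed memory, straddles its end, or sits entirely inside it; for non-empty ranges that is exactly half-open interval overlap. A compact equivalent, as a sketch:

    #include <stdbool.h>
    #include <stdint.h>

    /* True iff [addr, end) intersects [dram_base, dram_end); assumes end > addr. */
    static bool
    overlaps_dram(uint64_t addr, uint64_t end, uint64_t dram_base, uint64_t dram_end)
    {
        return (addr < dram_end) && (end > dram_base);
    }
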
@@ -2931,51 +3332,48 @@ pmap_get_arm64_prot(
        pmap_t pmap,
        vm_offset_t addr)
 {
-       uint64_t tte;
-       uint64_t tt_type, table_ap, table_xn, table_pxn;
-       uint64_t prot = 0;
-
-       tte = *pmap_tt1e(pmap, addr);
-
-       if (!(tte & ARM_TTE_VALID)) {
-               return 0;
-       }
-
-       tt_type = tte & ARM_TTE_TYPE_MASK;
+       tt_entry_t tte = 0;
+       unsigned int level = 0;
+       uint64_t tte_type = 0;
+       uint64_t effective_prot_bits = 0;
+       uint64_t aggregate_tte = 0;
+       uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
+       const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
 
-       if (tt_type == ARM_TTE_TYPE_BLOCK) {
-               return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID;
-       }
+       for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
+               tte = *pmap_ttne(pmap, level, addr);
 
-       table_ap = (tte >> ARM_TTE_TABLE_APSHIFT) & 0x3;
-       table_xn = tte & ARM_TTE_TABLE_XN;
-       table_pxn = tte & ARM_TTE_TABLE_PXN;
+               if (!(tte & ARM_TTE_VALID)) {
+                       return 0;
+               }
 
-       prot |= (table_ap << ARM_TTE_BLOCK_APSHIFT) | (table_xn ? ARM_TTE_BLOCK_NX : 0) | (table_pxn ? ARM_TTE_BLOCK_PNX : 0);
+               tte_type = tte & ARM_TTE_TYPE_MASK;
 
-       tte = *pmap_tt2e(pmap, addr);
-       if (!(tte & ARM_TTE_VALID)) {
-               return 0;
+               if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
+                   (level == pt_attr->pta_max_level)) {
+                       /* Block or page mapping; both have the same protection bit layout. */
+                       break;
+               } else if (tte_type == ARM_TTE_TYPE_TABLE) {
+                       /* All of the table bits we care about are overrides, so just OR them together. */
+                       aggregate_tte |= tte;
+               }
        }
 
-       tt_type = tte & ARM_TTE_TYPE_MASK;
-
-       if (tt_type == ARM_TTE_TYPE_BLOCK) {
-               return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID;
-       }
+       table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
+       table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
+       table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
 
-       table_ap = (tte >> ARM_TTE_TABLE_APSHIFT) & 0x3;
-       table_xn = tte & ARM_TTE_TABLE_XN;
-       table_pxn = tte & ARM_TTE_TABLE_PXN;
+       /* Start with the PTE bits. */
+       effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
 
-       prot |= (table_ap << ARM_TTE_BLOCK_APSHIFT) | (table_xn ? ARM_TTE_BLOCK_NX : 0) | (table_pxn ? ARM_TTE_BLOCK_PNX : 0);
+       /* Table AP bits mask out block/page AP bits */
+       effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
 
-       tte = *pmap_tt3e(pmap, addr);
-       if (!(tte & ARM_TTE_VALID)) {
-               return 0;
-       }
+       /* XN/PXN bits can be OR'd in. */
+       effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
+       effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
 
-       return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID;
+       return effective_prot_bits;
 }
 #endif /* __arm64__ */
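
The rewritten walk folds every table-level override into aggregate_tte before applying it to the leaf entry: table AP bits can only take permissions away, so they mask the leaf AP field, while table XN/PXN can only add execute-never, so they OR in. A minimal model of those semantics, with the real ARM field layouts simplified away:

    #include <stdint.h>

    typedef struct {
        uint8_t ap;   /* access-permission bits */
        uint8_t xn;   /* user execute-never */
        uint8_t pxn;  /* privileged execute-never */
    } prot_bits_t;

    static prot_bits_t
    apply_table_overrides(prot_bits_t leaf, prot_bits_t tables)
    {
        prot_bits_t out = leaf;
        out.ap  &= (uint8_t)~tables.ap;  /* table AP bits strip leaf permissions */
        out.xn  |= tables.xn;            /* execute-never only accumulates */
        out.pxn |= tables.pxn;
        return out;
    }
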
 
@@ -3012,7 +3410,6 @@ pmap_bootstrap(
        vm_size_t       pp_attr_table_size;
        vm_size_t       io_attr_table_size;
        unsigned int    npages;
-       unsigned int    i;
        vm_map_offset_t maxoffset;
 
        lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
@@ -3028,6 +3425,12 @@ pmap_bootstrap(
         *      Initialize the kernel pmap.
         */
        pmap_stamp = 1;
+#if ARM_PARAMETERIZED_PMAP
+       kernel_pmap->pmap_pt_attr = native_pt_attr;
+#endif /* ARM_PARAMETERIZED_PMAP */
+#if HAS_APPLE_PAC
+       kernel_pmap->disable_jop = 0;
+#endif /* HAS_APPLE_PAC */
        kernel_pmap->tte = cpu_tte;
        kernel_pmap->ttep = cpu_ttep;
 #if (__ARM_VMSA__ > 7)
@@ -3036,7 +3439,7 @@ pmap_bootstrap(
        kernel_pmap->min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
 #endif
        kernel_pmap->max = VM_MAX_KERNEL_ADDRESS;
-       kernel_pmap->ref_count = 1;
+       os_atomic_init(&kernel_pmap->ref_count, 1);
        kernel_pmap->gc_status = 0;
        kernel_pmap->nx_enabled = TRUE;
 #ifdef  __arm64__
@@ -3044,7 +3447,7 @@ pmap_bootstrap(
 #else
        kernel_pmap->is_64bit = FALSE;
 #endif
-       kernel_pmap->stamp = hw_atomic_add(&pmap_stamp, 1);
+       kernel_pmap->stamp = os_atomic_inc(&pmap_stamp, relaxed);
 
        kernel_pmap->nested_region_grand_addr = 0x0ULL;
        kernel_pmap->nested_region_subord_addr = 0x0ULL;
@@ -3054,10 +3457,10 @@ pmap_bootstrap(
 
 #if (__ARM_VMSA__ == 7)
        kernel_pmap->tte_index_max = 4 * NTTES;
-#else
-       kernel_pmap->tte_index_max = (ARM_PGBYTES / sizeof(tt_entry_t));
 #endif
        kernel_pmap->prev_tte = (tt_entry_t *) NULL;
+       kernel_pmap->hw_asid = 0;
+       kernel_pmap->sw_asid = 0;
 
        PMAP_LOCK_INIT(kernel_pmap);
 #if     (__ARM_VMSA__ == 7)
@@ -3096,6 +3499,8 @@ pmap_bootstrap(
        vm_last_phys = trunc_page(avail_end);
 
        simple_lock_init(&pmaps_lock, 0);
+       simple_lock_init(&asid_lock, 0);
+       simple_lock_init(&tt1_lock, 0);
        queue_init(&map_pmap_list);
        queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
        free_page_size_tt_list = TT_FREE_ENTRY_NULL;
@@ -3119,28 +3524,8 @@ pmap_bootstrap(
        virtual_space_start = vstart;
        virtual_space_end = VM_MAX_KERNEL_ADDRESS;
 
-       /* mark all the address spaces in use */
-       for (i = 0; i < MAX_ASID / (sizeof(uint32_t) * NBBY); i++) {
-               asid_bitmap[i] = 0xffffffff;
-       }
-
-       /*
-        * The kernel gets ASID 0, and all aliases of it.  This is
-        * important because ASID 0 is global; if we vend ASID 0
-        * out to a user pmap, those translations will show up in
-        * other processes through the TLB.
-        */
-       for (i = 0; i < MAX_ASID; i += ARM_MAX_ASID) {
-               asid_bitmap[i / (sizeof(uint32_t) * NBBY)] &= ~(1 << (i % (sizeof(uint32_t) * NBBY)));
-
-#if __ARM_KERNEL_PROTECT__
-               assert((i + 1) < MAX_ASID);
-               asid_bitmap[(i + 1) / (sizeof(uint32_t) * NBBY)] &= ~(1 << ((i + 1) % (sizeof(uint32_t) * NBBY)));
-#endif /* __ARM_KERNEL_PROTECT__ */
-       }
+       bitmap_full(&asid_bitmap[0], MAX_ASID);
 
-       kernel_pmap->asid = 0;
-       kernel_pmap->vasid = 0;
 
 
        if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
@@ -3287,7 +3672,8 @@ pmap_free_pages(
 
 boolean_t
 pmap_next_page_hi(
-       ppnum_t * pnum)
+       ppnum_t            * pnum,
+       __unused boolean_t might_free)
 {
        return pmap_next_page(pnum);
 }
@@ -3338,21 +3724,21 @@ pmap_init(
        pv_init();
 
        /*
-        * The value of hard_maxproc may have been scaled, make sure
-        * it is still less than the value of MAX_ASID.
+        * The values of [hard_]maxproc may have been scaled, so clamp
+        * them to ensure they do not exceed MAX_ASID.
         */
-       assert(hard_maxproc < MAX_ASID);
+       if (maxproc > MAX_ASID) {
+               maxproc = MAX_ASID;
+       }
+       if (hard_maxproc > MAX_ASID) {
+               hard_maxproc = MAX_ASID;
+       }
 
 #if CONFIG_PGTRACE
        pmap_pgtrace_init();
 #endif
 }
 
-void
-pmap_pv_fixup(__unused vm_offset_t start, __unused vm_size_t length)
-{
-}
-
 boolean_t
 pmap_verify_free(
        ppnum_t ppnum)
@@ -3398,7 +3784,7 @@ pmap_zone_init(
            PAGE_SIZE, "pmap");
 }
 
-
+__dead2
 void
 pmap_ledger_alloc_init(size_t size)
 {
@@ -3407,17 +3793,15 @@ pmap_ledger_alloc_init(size_t size)
            __func__, size);
 }
 
+__dead2
 ledger_t
 pmap_ledger_alloc(void)
 {
-       ledger_t retval = NULL;
-
        panic("%s: unsupported",
            __func__);
-
-       return retval;
 }
 
+__dead2
 void
 pmap_ledger_free(ledger_t ledger)
 {
@@ -3439,13 +3823,18 @@ pmap_ledger_free(ledger_t ledger)
  *     is bounded by that size.
  */
 MARK_AS_PMAP_TEXT static pmap_t
-pmap_create_internal(
+pmap_create_options_internal(
        ledger_t ledger,
        vm_map_size_t size,
-       boolean_t is_64bit)
+       unsigned int flags)
 {
        unsigned        i;
+       unsigned        tte_index_max;
        pmap_t          p;
+       bool is_64bit = flags & PMAP_CREATE_64BIT;
+#if defined(HAS_APPLE_PAC)
+       bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
+#endif /* defined(HAS_APPLE_PAC) */
 
        /*
         *      A software use-only map doesn't even need a pmap.
@@ -3454,7 +3843,6 @@ pmap_create_internal(
                return PMAP_NULL;
        }
 
-
        /*
         *      Allocate a pmap struct from the pmap_zone.  Then allocate
         *      the translation table of the right size for the pmap.
@@ -3463,7 +3851,7 @@ pmap_create_internal(
                return PMAP_NULL;
        }
 
-       if (is_64bit) {
+       if (flags & PMAP_CREATE_64BIT) {
                p->min = MACH_VM_MIN_ADDRESS;
                p->max = MACH_VM_MAX_ADDRESS;
        } else {
@@ -3471,17 +3859,29 @@ pmap_create_internal(
                p->max = VM_MAX_ADDRESS;
        }
 
+#if defined(HAS_APPLE_PAC)
+       p->disable_jop = disable_jop;
+#endif /* defined(HAS_APPLE_PAC) */
+
        p->nested_region_true_start = 0;
        p->nested_region_true_end = ~0;
 
-       p->ref_count = 1;
+       os_atomic_init(&p->ref_count, 1);
        p->gc_status = 0;
-       p->stamp = hw_atomic_add(&pmap_stamp, 1);
+       p->stamp = os_atomic_inc(&pmap_stamp, relaxed);
        p->nx_enabled = TRUE;
        p->is_64bit = is_64bit;
        p->nested = FALSE;
        p->nested_pmap = PMAP_NULL;
 
+#if ARM_PARAMETERIZED_PMAP
+       p->pmap_pt_attr = native_pt_attr;
+#endif /* ARM_PARAMETERIZED_PMAP */
+
+       if (!pmap_get_pt_ops(p)->alloc_id(p)) {
+               goto id_alloc_fail;
+       }
+
 
 
        p->ledger = ledger;
@@ -3496,26 +3896,26 @@ pmap_create_internal(
        p->tt_entry_free = (tt_entry_t *)0;
 
        p->tte = pmap_tt1_allocate(p, PMAP_ROOT_ALLOC_SIZE, 0);
+       if (!(p->tte)) {
+               goto tt1_alloc_fail;
+       }
+
        p->ttep = ml_static_vtop((vm_offset_t)p->tte);
        PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
 
 #if (__ARM_VMSA__ == 7)
-       p->tte_index_max = NTTES;
+       tte_index_max = p->tte_index_max = NTTES;
 #else
-       p->tte_index_max = (PMAP_ROOT_ALLOC_SIZE / sizeof(tt_entry_t));
+       tte_index_max = (PMAP_ROOT_ALLOC_SIZE / sizeof(tt_entry_t));
 #endif
        p->prev_tte = (tt_entry_t *) NULL;
 
        /* nullify the translation table */
-       for (i = 0; i < p->tte_index_max; i++) {
+       for (i = 0; i < tte_index_max; i++) {
                p->tte[i] = ARM_TTE_TYPE_FAULT;
        }
 
-       FLUSH_PTE_RANGE(p->tte, p->tte + p->tte_index_max);
-
-       /* assign a asid */
-       p->vasid = alloc_asid();
-       p->asid = p->vasid % ARM_MAX_ASID;
+       FLUSH_PTE_RANGE(p->tte, p->tte + tte_index_max);
 
        /*
         *  initialize the rest of the structure
@@ -3545,27 +3945,33 @@ pmap_create_internal(
        pmap_simple_unlock(&pmaps_lock);
 
        return p;
+
+tt1_alloc_fail:
+       pmap_get_pt_ops(p)->free_id(p);
+id_alloc_fail:
+       zfree(pmap_zone, p);
+       return PMAP_NULL;
 }
 
 pmap_t
-pmap_create(
+pmap_create_options(
        ledger_t ledger,
        vm_map_size_t size,
-       boolean_t is_64bit)
+       unsigned int flags)
 {
        pmap_t pmap;
 
-       PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, is_64bit);
+       PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
 
        ledger_reference(ledger);
 
-       pmap = pmap_create_internal(ledger, size, is_64bit);
+       pmap = pmap_create_options_internal(ledger, size, flags);
 
        if (pmap == PMAP_NULL) {
                ledger_dereference(ledger);
        }
 
-       PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid);
+       PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
 
        return pmap;
 }
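
With the boolean is_64bit widened to a flags word, callers can combine options such as PMAP_CREATE_64BIT and PMAP_CREATE_DISABLE_JOP. A hypothetical call site, for illustration only:

    /* Hypothetical usage; ledger setup and error handling elided. */
    pmap_t p = pmap_create_options(ledger, 0, PMAP_CREATE_64BIT);
    if (p == PMAP_NULL) {
        /* ASID exhaustion or root-table allocation failure. */
    }
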
@@ -3625,6 +4031,7 @@ pmap_set_process(
 {
        pmap_set_process_internal(pmap, pid, procname);
 }
+#endif /* MACH_ASSERT */
 
 /*
  * We maintain stats and ledgers so that a task's physical footprint is:
@@ -3637,115 +4044,6 @@ pmap_set_process(
  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
  */
 
-struct {
-       uint64_t        num_pmaps_checked;
-
-       int             phys_footprint_over;
-       ledger_amount_t phys_footprint_over_total;
-       ledger_amount_t phys_footprint_over_max;
-       int             phys_footprint_under;
-       ledger_amount_t phys_footprint_under_total;
-       ledger_amount_t phys_footprint_under_max;
-
-       int             internal_over;
-       ledger_amount_t internal_over_total;
-       ledger_amount_t internal_over_max;
-       int             internal_under;
-       ledger_amount_t internal_under_total;
-       ledger_amount_t internal_under_max;
-
-       int             internal_compressed_over;
-       ledger_amount_t internal_compressed_over_total;
-       ledger_amount_t internal_compressed_over_max;
-       int             internal_compressed_under;
-       ledger_amount_t internal_compressed_under_total;
-       ledger_amount_t internal_compressed_under_max;
-
-       int             iokit_mapped_over;
-       ledger_amount_t iokit_mapped_over_total;
-       ledger_amount_t iokit_mapped_over_max;
-       int             iokit_mapped_under;
-       ledger_amount_t iokit_mapped_under_total;
-       ledger_amount_t iokit_mapped_under_max;
-
-       int             alternate_accounting_over;
-       ledger_amount_t alternate_accounting_over_total;
-       ledger_amount_t alternate_accounting_over_max;
-       int             alternate_accounting_under;
-       ledger_amount_t alternate_accounting_under_total;
-       ledger_amount_t alternate_accounting_under_max;
-
-       int             alternate_accounting_compressed_over;
-       ledger_amount_t alternate_accounting_compressed_over_total;
-       ledger_amount_t alternate_accounting_compressed_over_max;
-       int             alternate_accounting_compressed_under;
-       ledger_amount_t alternate_accounting_compressed_under_total;
-       ledger_amount_t alternate_accounting_compressed_under_max;
-
-       int             page_table_over;
-       ledger_amount_t page_table_over_total;
-       ledger_amount_t page_table_over_max;
-       int             page_table_under;
-       ledger_amount_t page_table_under_total;
-       ledger_amount_t page_table_under_max;
-
-       int             purgeable_volatile_over;
-       ledger_amount_t purgeable_volatile_over_total;
-       ledger_amount_t purgeable_volatile_over_max;
-       int             purgeable_volatile_under;
-       ledger_amount_t purgeable_volatile_under_total;
-       ledger_amount_t purgeable_volatile_under_max;
-
-       int             purgeable_nonvolatile_over;
-       ledger_amount_t purgeable_nonvolatile_over_total;
-       ledger_amount_t purgeable_nonvolatile_over_max;
-       int             purgeable_nonvolatile_under;
-       ledger_amount_t purgeable_nonvolatile_under_total;
-       ledger_amount_t purgeable_nonvolatile_under_max;
-
-       int             purgeable_volatile_compressed_over;
-       ledger_amount_t purgeable_volatile_compressed_over_total;
-       ledger_amount_t purgeable_volatile_compressed_over_max;
-       int             purgeable_volatile_compressed_under;
-       ledger_amount_t purgeable_volatile_compressed_under_total;
-       ledger_amount_t purgeable_volatile_compressed_under_max;
-
-       int             purgeable_nonvolatile_compressed_over;
-       ledger_amount_t purgeable_nonvolatile_compressed_over_total;
-       ledger_amount_t purgeable_nonvolatile_compressed_over_max;
-       int             purgeable_nonvolatile_compressed_under;
-       ledger_amount_t purgeable_nonvolatile_compressed_under_total;
-       ledger_amount_t purgeable_nonvolatile_compressed_under_max;
-
-       int             network_volatile_over;
-       ledger_amount_t network_volatile_over_total;
-       ledger_amount_t network_volatile_over_max;
-       int             network_volatile_under;
-       ledger_amount_t network_volatile_under_total;
-       ledger_amount_t network_volatile_under_max;
-
-       int             network_nonvolatile_over;
-       ledger_amount_t network_nonvolatile_over_total;
-       ledger_amount_t network_nonvolatile_over_max;
-       int             network_nonvolatile_under;
-       ledger_amount_t network_nonvolatile_under_total;
-       ledger_amount_t network_nonvolatile_under_max;
-
-       int             network_volatile_compressed_over;
-       ledger_amount_t network_volatile_compressed_over_total;
-       ledger_amount_t network_volatile_compressed_over_max;
-       int             network_volatile_compressed_under;
-       ledger_amount_t network_volatile_compressed_under_total;
-       ledger_amount_t network_volatile_compressed_under_max;
-
-       int             network_nonvolatile_compressed_over;
-       ledger_amount_t network_nonvolatile_compressed_over_total;
-       ledger_amount_t network_nonvolatile_compressed_over_max;
-       int             network_nonvolatile_compressed_under;
-       ledger_amount_t network_nonvolatile_compressed_under_total;
-       ledger_amount_t network_nonvolatile_compressed_under_max;
-} pmap_ledgers_drift;
-#endif /* MACH_ASSERT */
 
 /*
  *     Retire the given physical map from service.
@@ -3762,7 +4060,9 @@ pmap_destroy_internal(
 
        VALIDATE_PMAP(pmap);
 
-       int32_t ref_count = __c11_atomic_fetch_sub(&pmap->ref_count, 1, memory_order_relaxed) - 1;
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
+       int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
        if (ref_count > 0) {
                return;
        } else if (ref_count < 0) {
@@ -3771,9 +4071,11 @@ pmap_destroy_internal(
                panic("pmap %p: attempt to destroy kernel pmap", pmap);
        }
 
-#if (__ARM_VMSA__ == 7)
        pt_entry_t     *ttep;
-       unsigned int    i;
+
+#if (__ARM_VMSA__ > 7)
+       pmap_unmap_sharedpage(pmap);
+#endif /* (__ARM_VMSA__ > 7) */
 
        pmap_simple_lock(&pmaps_lock);
        while (pmap->gc_status & PMAP_GC_INFLIGHT) {
@@ -3786,9 +4088,14 @@ pmap_destroy_internal(
        queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
        pmap_simple_unlock(&pmaps_lock);
 
+#if (__ARM_VMSA__ == 7)
        if (pmap->cpu_ref != 0) {
-               panic("pmap_destroy(%p): cpu_ref = %u", pmap, pmap->cpu_ref);
+               panic("%s: cpu_ref=%u, "
+                   "pmap=%p",
+                   __FUNCTION__, pmap->cpu_ref,
+                   pmap);
        }
+#endif /* (__ARM_VMSA__ == 7) */
 
        pmap_trim_self(pmap);
 
@@ -3796,6 +4103,9 @@ pmap_destroy_internal(
         *      Free the memory maps, then the
         *      pmap structure.
         */
+#if (__ARM_VMSA__ == 7)
+       unsigned int i = 0;
+
        PMAP_LOCK(pmap);
        for (i = 0; i < pmap->tte_index_max; i++) {
                ttep = &pmap->tte[i];
@@ -3804,91 +4114,57 @@ pmap_destroy_internal(
                }
        }
        PMAP_UNLOCK(pmap);
+#else /* (__ARM_VMSA__ == 7) */
+       vm_map_address_t c;
+       unsigned int level;
+
+       for (level = pt_attr->pta_max_level - 1; level >= pt_attr->pta_root_level; level--) {
+               for (c = pmap->min; c < pmap->max; c += pt_attr_ln_size(pt_attr, level)) {
+                       ttep = pmap_ttne(pmap, level, c);
+
+                       if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
+                               PMAP_LOCK(pmap);
+                               pmap_tte_deallocate(pmap, ttep, level);
+                               PMAP_UNLOCK(pmap);
+                       }
+               }
+       }
+#endif /* (__ARM_VMSA__ == 7) */
+
+
 
        if (pmap->tte) {
+#if (__ARM_VMSA__ == 7)
                pmap_tt1_deallocate(pmap, pmap->tte, pmap->tte_index_max * sizeof(tt_entry_t), 0);
+               pmap->tte_index_max = 0;
+#else /* (__ARM_VMSA__ == 7) */
+               pmap_tt1_deallocate(pmap, pmap->tte, PMAP_ROOT_ALLOC_SIZE, 0);
+#endif /* (__ARM_VMSA__ == 7) */
                pmap->tte = (tt_entry_t *) NULL;
                pmap->ttep = 0;
-               pmap->tte_index_max = 0;
        }
+
+#if (__ARM_VMSA__ == 7)
        if (pmap->prev_tte) {
                pmap_tt1_deallocate(pmap, pmap->prev_tte, PMAP_ROOT_ALLOC_SIZE, 0);
                pmap->prev_tte = (tt_entry_t *) NULL;
        }
+#endif /* (__ARM_VMSA__ == 7) */
+
        assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
 
-       flush_mmu_tlb_asid(pmap->asid);
+       pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
+       sync_tlb_flush();
+
        /* return its asid to the pool */
-       free_asid(pmap->vasid);
+       pmap_get_pt_ops(pmap)->free_id(pmap);
        pmap_check_ledgers(pmap);
 
-
-       if (pmap->nested_region_asid_bitmap) {
-               kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
-       }
-       zfree(pmap_zone, pmap);
-#else /* __ARM_VMSA__ == 7 */
-       pt_entry_t     *ttep;
-       pmap_paddr_t    pa;
-       vm_map_address_t c;
-
-       pmap_unmap_sharedpage(pmap);
-
-       pmap_simple_lock(&pmaps_lock);
-       while (pmap->gc_status & PMAP_GC_INFLIGHT) {
-               pmap->gc_status |= PMAP_GC_WAIT;
-               assert_wait((event_t) &pmap->gc_status, THREAD_UNINT);
-               pmap_simple_unlock(&pmaps_lock);
-               (void) thread_block(THREAD_CONTINUE_NULL);
-               pmap_simple_lock(&pmaps_lock);
-       }
-       queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
-       pmap_simple_unlock(&pmaps_lock);
-
-       pmap_trim_self(pmap);
-
-       /*
-        *      Free the memory maps, then the
-        *      pmap structure.
-        */
-       for (c = pmap->min; c < pmap->max; c += ARM_TT_L2_SIZE) {
-               ttep = pmap_tt2e(pmap, c);
-               if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
-                       PMAP_LOCK(pmap);
-                       pmap_tte_deallocate(pmap, ttep, PMAP_TT_L2_LEVEL);
-                       PMAP_UNLOCK(pmap);
-               }
-       }
-#if !__ARM64_TWO_LEVEL_PMAP__
-       for (c = pmap->min; c < pmap->max; c += ARM_TT_L1_SIZE) {
-               ttep = pmap_tt1e(pmap, c);
-               if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
-                       PMAP_LOCK(pmap);
-                       pmap_tte_deallocate(pmap, ttep, PMAP_TT_L1_LEVEL);
-                       PMAP_UNLOCK(pmap);
-               }
-       }
-#endif
-
-
-       if (pmap->tte) {
-               pa = pmap->ttep;
-               pmap_tt1_deallocate(pmap, (tt_entry_t *)phystokv(pa), PMAP_ROOT_ALLOC_SIZE, 0);
-       }
-
-       assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
-       flush_mmu_tlb_asid((uint64_t)(pmap->asid) << TLBI_ASID_SHIFT);
-       free_asid(pmap->vasid);
-
        if (pmap->nested_region_asid_bitmap) {
                kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
        }
 
-       pmap_check_ledgers(pmap);
-
        zfree(pmap_zone, pmap);
-
-#endif /* __ARM_VMSA__ == 7 */
 }
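
The unified teardown keeps a strict order: unlink the pmap from map_pmap_list, deallocate translation tables deepest level first (so pmap_tte_deallocate never drops a table that still parents live tables), flush the TLB for the pmap's ASID, and only then return the ASID through free_id(); releasing it any earlier could let a new pmap inherit stale translations. Condensed, with calls taken from the diff:

    /* Flow sketch of pmap_destroy_internal(), not a compilable excerpt. */
    // queue_remove(&map_pmap_list, pmap, ...);        /* unlink under pmaps_lock */
    // for (level = max_level - 1; level >= root_level; level--)
    //         pmap_tte_deallocate(pmap, ttep, level); /* deepest tables first */
    // pmap_tt1_deallocate(pmap, pmap->tte, ...);      /* then the root table */
    // pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
    // sync_tlb_flush();                               /* stale entries gone... */
    // pmap_get_pt_ops(pmap)->free_id(pmap);           /* ...so the ASID may recycle */
    // zfree(pmap_zone, pmap);
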
 
 void
@@ -3897,7 +4173,7 @@ pmap_destroy(
 {
        ledger_t ledger;
 
-       PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid);
+       PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
 
        ledger = pmap->ledger;
 
@@ -3918,7 +4194,7 @@ pmap_reference_internal(
 {
        if (pmap != PMAP_NULL) {
                VALIDATE_PMAP(pmap);
-               __c11_atomic_fetch_add(&pmap->ref_count, 1, memory_order_relaxed);
+               os_atomic_inc(&pmap->ref_count, relaxed);
        }
 }
 
@@ -3935,43 +4211,35 @@ pmap_tt1_allocate(
        vm_size_t       size,
        unsigned        option)
 {
-       tt_entry_t              *tt1;
+       tt_entry_t      *tt1 = NULL;
        tt_free_entry_t *tt1_free;
        pmap_paddr_t    pa;
        vm_address_t    va;
        vm_address_t    va_end;
        kern_return_t   ret;
 
-       pmap_simple_lock(&pmaps_lock);
+       pmap_simple_lock(&tt1_lock);
        if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
                free_page_size_tt_count--;
                tt1 = (tt_entry_t *)free_page_size_tt_list;
                free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
-               pmap_simple_unlock(&pmaps_lock);
-               pmap_tt_ledger_credit(pmap, size);
-               return (tt_entry_t *)tt1;
-       }
-       ;
-       if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
+       } else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
                free_two_page_size_tt_count--;
                tt1 = (tt_entry_t *)free_two_page_size_tt_list;
                free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
-               pmap_simple_unlock(&pmaps_lock);
-               pmap_tt_ledger_credit(pmap, size);
-               return (tt_entry_t *)tt1;
-       }
-       ;
-       if (free_tt_count != 0) {
+       } else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
                free_tt_count--;
                tt1 = (tt_entry_t *)free_tt_list;
                free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
-               pmap_simple_unlock(&pmaps_lock);
+       }
+
+       pmap_simple_unlock(&tt1_lock);
+
+       if (tt1 != NULL) {
                pmap_tt_ledger_credit(pmap, size);
                return (tt_entry_t *)tt1;
        }
 
-       pmap_simple_unlock(&pmaps_lock);
-
        ret = pmap_pages_alloc(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));
 
        if (ret == KERN_RESOURCE_SHORTAGE) {
@@ -3980,19 +4248,22 @@ pmap_tt1_allocate(
 
 
        if (size < PAGE_SIZE) {
-               pmap_simple_lock(&pmaps_lock);
-
-               for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + size; va < va_end; va = va + size) {
+               va = phystokv(pa) + size;
+               tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
+               tt_free_entry_t *next_free = NULL;
+               for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
                        tt1_free = (tt_free_entry_t *)va;
-                       tt1_free->next = free_tt_list;
-                       free_tt_list = tt1_free;
-                       free_tt_count++;
+                       tt1_free->next = next_free;
+                       next_free = tt1_free;
                }
+               pmap_simple_lock(&tt1_lock);
+               local_free_list->next = free_tt_list;
+               free_tt_list = next_free;
+               free_tt_count += ((PAGE_SIZE / size) - 1);
                if (free_tt_count > free_tt_max) {
                        free_tt_max = free_tt_count;
                }
-
-               pmap_simple_unlock(&pmaps_lock);
+               pmap_simple_unlock(&tt1_lock);
        }
 
        /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
@@ -4014,9 +4285,8 @@ pmap_tt1_deallocate(
        tt_free_entry_t *tt_entry;
 
        tt_entry = (tt_free_entry_t *)tt;
-       if (not_in_kdp) {
-               pmap_simple_lock(&pmaps_lock);
-       }
+       assert(not_in_kdp);
+       pmap_simple_lock(&tt1_lock);
 
        if (size < PAGE_SIZE) {
                free_tt_count++;
@@ -4045,10 +4315,8 @@ pmap_tt1_deallocate(
                free_two_page_size_tt_list = tt_entry;
        }
 
-       if ((option & PMAP_TT_DEALLOCATE_NOBLOCK) || (!not_in_kdp)) {
-               if (not_in_kdp) {
-                       pmap_simple_unlock(&pmaps_lock);
-               }
+       if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
+               pmap_simple_unlock(&tt1_lock);
                pmap_tt_ledger_debit(pmap, size);
                return;
        }
@@ -4058,13 +4326,13 @@ pmap_tt1_deallocate(
                tt = (tt_entry_t *)free_page_size_tt_list;
                free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
 
-               pmap_simple_unlock(&pmaps_lock);
+               pmap_simple_unlock(&tt1_lock);
 
                pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);
 
                OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
 
-               pmap_simple_lock(&pmaps_lock);
+               pmap_simple_lock(&tt1_lock);
        }
 
        while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
@@ -4072,15 +4340,15 @@ pmap_tt1_deallocate(
                tt = (tt_entry_t *)free_two_page_size_tt_list;
                free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
 
-               pmap_simple_unlock(&pmaps_lock);
+               pmap_simple_unlock(&tt1_lock);
 
                pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);
 
                OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
 
-               pmap_simple_lock(&pmaps_lock);
+               pmap_simple_lock(&tt1_lock);
        }
-       pmap_simple_unlock(&pmaps_lock);
+       pmap_simple_unlock(&tt1_lock);
        pmap_tt_ledger_debit(pmap, size);
 }
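
On the allocation side, pmap_tt1_allocate now links the sub-page chunks into a private chain first and splices the whole chain into free_tt_list with a single tt1_lock round-trip, where the old code took pmaps_lock once per chunk. The splice pattern in isolation, with hypothetical names:

    #include <pthread.h>

    typedef struct ex_free_entry {
        struct ex_free_entry *next;
    } ex_free_entry_t;

    static ex_free_entry_t *ex_free_list;
    static pthread_mutex_t  ex_lock = PTHREAD_MUTEX_INITIALIZER;

    /* head..tail were chained without the lock held; splice them in at once. */
    static void
    splice_chain(ex_free_entry_t *head, ex_free_entry_t *tail)
    {
        pthread_mutex_lock(&ex_lock);
        tail->next   = ex_free_list;  /* old list hangs off the chain's tail */
        ex_free_list = head;          /* chain head becomes the new list head */
        pthread_mutex_unlock(&ex_lock);
    }
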
 
@@ -4177,20 +4445,20 @@ pmap_tt_deallocate(
 
        ptdp = ptep_get_ptd((vm_offset_t)ttp);
 
-       ptdp->pt_map[ARM_PT_DESC_INDEX(ttp)].va = (vm_offset_t)-1;
+       ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].va = (vm_offset_t)-1;
 
-       if ((level < PMAP_TT_MAX_LEVEL) && (ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt == PT_DESC_REFCOUNT)) {
-               ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt = 0;
+       if ((level < PMAP_TT_MAX_LEVEL) && (ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].refcnt == PT_DESC_REFCOUNT)) {
+               ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].refcnt = 0;
        }
 
-       if (ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt != 0) {
-               panic("pmap_tt_deallocate(): ptdp %p, count %d\n", ptdp, ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt);
+       if (ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].refcnt != 0) {
+               panic("pmap_tt_deallocate(): ptdp %p, count %d\n", ptdp, ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].refcnt);
        }
 
-       ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt = 0;
+       ptdp->ptd_info[ARM_PT_DESC_INDEX(ttp)].refcnt = 0;
 
        for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
-               pt_acc_cnt += ptdp->pt_cnt[i].refcnt;
+               pt_acc_cnt += ptdp->ptd_info[i].refcnt;
        }
 
        if (pt_acc_cnt == 0) {
@@ -4261,9 +4529,9 @@ pmap_tte_remove(
                panic("pmap_tte_deallocate(): null tt_entry ttep==%p\n", ttep);
        }
 
-       if (((level + 1) == PMAP_TT_MAX_LEVEL) && (tte_get_ptd(tte)->pt_cnt[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt != 0)) {
+       if (((level + 1) == PMAP_TT_MAX_LEVEL) && (tte_get_ptd(tte)->ptd_info[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt != 0)) {
                panic("pmap_tte_deallocate(): pmap=%p ttep=%p ptd=%p refcnt=0x%x \n", pmap, ttep,
-                   tte_get_ptd(tte), (tte_get_ptd(tte)->pt_cnt[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt));
+                   tte_get_ptd(tte), (tte_get_ptd(tte)->ptd_info[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt));
        }
 
 #if     (__ARM_VMSA__ == 7)
@@ -4311,7 +4579,7 @@ pmap_tte_deallocate(
                        unsigned        i;
 
                        for (i = 0; i < (ARM_PGBYTES / sizeof(*pte_p)); i++, pte_p++) {
-                               if (ARM_PTE_IS_COMPRESSED(*pte_p)) {
+                               if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
                                        panic("pmap_tte_deallocate: tte=0x%llx pmap=%p, pte_p=%p, pte=0x%llx compressed\n",
                                            (uint64_t)tte, pmap, pte_p, (uint64_t)(*pte_p));
                                } else if (((*pte_p) & ARM_PTE_TYPE_MASK) != ARM_PTE_TYPE_FAULT) {
@@ -4353,8 +4621,13 @@ pmap_remove_range(
        pt_entry_t *epte,
        uint32_t *rmv_cnt)
 {
-       return pmap_remove_range_options(pmap, va, bpte, epte, rmv_cnt,
-                  PMAP_OPTIONS_REMOVE);
+       bool need_strong_sync = false;
+       int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, rmv_cnt,
+           &need_strong_sync, PMAP_OPTIONS_REMOVE);
+       if (num_changed > 0) {
+               PMAP_UPDATE_TLBS(pmap, va, va + (PAGE_SIZE * (epte - bpte)), need_strong_sync);
+       }
+       return num_changed;
 }
 
 
@@ -4500,6 +4773,7 @@ pmap_remove_range_options(
        pt_entry_t *bpte,
        pt_entry_t *epte,
        uint32_t *rmv_cnt,
+       bool *need_strong_sync __unused,
        int options)
 {
        pt_entry_t     *cpte;
@@ -4539,7 +4813,7 @@ pmap_remove_range_options(
                while (!managed) {
                        if (pmap != kernel_pmap &&
                            (options & PMAP_OPTIONS_REMOVE) &&
-                           (ARM_PTE_IS_COMPRESSED(spte))) {
+                           (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
                                /*
                                 * "pmap" must be locked at this point,
                                 * so this should not race with another
@@ -4560,7 +4834,7 @@ pmap_remove_range_options(
                                 * our "compressed" markers,
                                 * so let's update it here.
                                 */
-                               if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(cpte)->pt_cnt[ARM_PT_DESC_INDEX(cpte)].refcnt)) <= 0) {
+                               if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(cpte)->ptd_info[ARM_PT_DESC_INDEX(cpte)].refcnt)) <= 0) {
                                        panic("pmap_remove_range_options: over-release of ptdp %p for pte %p\n", ptep_get_ptd(cpte), cpte);
                                }
                                spte = *cpte;
@@ -4586,7 +4860,7 @@ pmap_remove_range_options(
                        UNLOCK_PVH(pai);
                }
 
-               if (ARM_PTE_IS_COMPRESSED(*cpte)) {
+               if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
                        /*
                         * There used to be a valid mapping here but it
                         * has already been removed when the page was
@@ -4598,7 +4872,8 @@ pmap_remove_range_options(
 
                /* remove the translation, do not flush the TLB */
                if (*cpte != ARM_PTE_TYPE_FAULT) {
-                       assert(!ARM_PTE_IS_COMPRESSED(*cpte));
+                       assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
+                       assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
 #if MACH_ASSERT
                        if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
                                panic("pmap_remove_range_options(): cpte=%p ptd=%p pte=0x%llx va=0x%llx\n",
@@ -4611,8 +4886,9 @@ pmap_remove_range_options(
 
                if ((spte != ARM_PTE_TYPE_FAULT) &&
                    (pmap != kernel_pmap)) {
-                       assert(!ARM_PTE_IS_COMPRESSED(spte));
-                       if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(cpte)->pt_cnt[ARM_PT_DESC_INDEX(cpte)].refcnt)) <= 0) {
+                       assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
+                       assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
+                       if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(cpte)->ptd_info[ARM_PT_DESC_INDEX(cpte)].refcnt)) <= 0) {
                                panic("pmap_remove_range_options: over-release of ptdp %p for pte %p\n", ptep_get_ptd(cpte), cpte);
                        }
                        if (rmv_cnt) {
@@ -4777,17 +5053,22 @@ pmap_remove_options_internal(
        vm_map_address_t end,
        int options)
 {
-       int remove_count = 0;
+       int             remove_count = 0;
        pt_entry_t     *bpte, *epte;
        pt_entry_t     *pte_p;
        tt_entry_t     *tte_p;
        uint32_t        rmv_spte = 0;
+       bool            need_strong_sync = false;
+       bool            flush_tte = false;
 
        if (__improbable(end < start)) {
                panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
        }
 
        VALIDATE_PMAP(pmap);
+
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
        PMAP_LOCK(pmap);
 
        tte_p = pmap_tte(pmap, start);
@@ -4799,28 +5080,27 @@ pmap_remove_options_internal(
        if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
                pte_p = (pt_entry_t *) ttetokv(*tte_p);
                bpte = &pte_p[ptenum(start)];
-               epte = bpte + ((end - start) >> ARM_TT_LEAF_SHIFT);
+               epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));
 
                remove_count += pmap_remove_range_options(pmap, start, bpte, epte,
-                   &rmv_spte, options);
+                   &rmv_spte, &need_strong_sync, options);
 
-#if     (__ARM_VMSA__ == 7)
-               if (rmv_spte && (ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) &&
-                   (pmap != kernel_pmap) && (pmap->nested == FALSE)) {
-                       pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L1_LEVEL);
-                       flush_mmu_tlb_entry((start & ~ARM_TT_L1_OFFMASK) | (pmap->asid & 0xff));
-               }
-#else
-               if (rmv_spte && (ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) &&
+               if (rmv_spte && (ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) &&
                    (pmap != kernel_pmap) && (pmap->nested == FALSE)) {
-                       pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L2_LEVEL);
-                       flush_mmu_tlb_entry(tlbi_addr(start & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
+                       pmap_tte_deallocate(pmap, tte_p, pt_attr_twig_level(pt_attr));
+                       flush_tte = true;
                }
-#endif
        }
 
 done:
        PMAP_UNLOCK(pmap);
+
+       if (remove_count > 0) {
+               PMAP_UPDATE_TLBS(pmap, start, end, need_strong_sync);
+       } else if (flush_tte) {
+               pmap_get_pt_ops(pmap)->flush_tlb_tte_async(start, pmap);
+               sync_tlb_flush();
+       }
        return remove_count;
 }
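
TLB invalidation has moved out from under the pmap lock: the removal pass only counts changes and records whether a strong sync is needed, and the caller issues one batched flush for the whole range afterwards. A schematic of the shape, with stand-in helpers:

    /* Schematic only; do_removal() is a stand-in for the range-removal pass. */
    static int
    remove_then_flush(pmap_t pmap, vm_map_address_t start, vm_map_address_t end)
    {
        bool need_strong_sync = false;

        PMAP_LOCK(pmap);
        int changed = do_removal(pmap, start, end, &need_strong_sync); /* no TLBI under the lock */
        PMAP_UNLOCK(pmap);

        if (changed > 0) {
            PMAP_UPDATE_TLBS(pmap, start, end, need_strong_sync);      /* one batched flush */
        }
        return changed;
    }
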
 
@@ -4838,6 +5118,8 @@ pmap_remove_options(
                return;
        }
 
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
        PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
            VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
            VM_KERNEL_ADDRHIDE(end));
@@ -4860,11 +5142,7 @@ pmap_remove_options(
        while (va < end) {
                vm_map_address_t l;
 
-#if     (__ARM_VMSA__ == 7)
-               l = ((va + ARM_TT_L1_SIZE) & ~ARM_TT_L1_OFFMASK);
-#else
-               l = ((va + ARM_TT_L2_SIZE) & ~ARM_TT_L2_OFFMASK);
-#endif
+               l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
                if (l > end) {
                        l = end;
                }
@@ -4874,10 +5152,6 @@ pmap_remove_options(
                va = l;
        }
 
-       if (remove_count > 0) {
-               PMAP_UPDATE_TLBS(pmap, start, end);
-       }
-
        PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
 }
 
@@ -4905,12 +5179,11 @@ pmap_set_pmap(
 #if __ARM_USER_PROTECT__
        if (pmap->tte_index_max == NTTES) {
                thread->machine.uptw_ttc = 2;
-               thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
        } else {
-               thread->machine.uptw_ttc = 1;       \
-               thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
+               thread->machine.uptw_ttc = 1;
        }
-       thread->machine.asid = pmap->asid;
+       thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
+       thread->machine.asid = pmap->hw_asid;
 #endif
 }
 
@@ -4918,9 +5191,9 @@ static void
 pmap_flush_core_tlb_asid(pmap_t pmap)
 {
 #if (__ARM_VMSA__ == 7)
-       flush_core_tlb_asid(pmap->asid);
+       flush_core_tlb_asid(pmap->hw_asid);
 #else
-       flush_core_tlb_asid(((uint64_t) pmap->asid) << TLBI_ASID_SHIFT);
+       flush_core_tlb_asid(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
 #endif
 }
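
pmap_flush_core_tlb_asid() keeps the two encodings apart: ARMv7 TLBIASID consumes the 8-bit ASID directly, while an ARMv8 TLBI operand carries the ASID in bits [63:48], hence TLBI_ASID_SHIFT. A sketch of the 64-bit operand construction, assuming the architectural shift of 48:

    #include <stdint.h>
    #include <stdio.h>

    #define TLBI_ASID_SHIFT 48 /* ASID field of an ARMv8 TLBI operand */

    /* Build the xN operand for, e.g., "tlbi aside1is, xN". */
    static inline uint64_t tlbi_asid_operand(uint16_t hw_asid)
    {
        return (uint64_t)hw_asid << TLBI_ASID_SHIFT;
    }

    int main(void)
    {
        printf("0x%016llx\n", (unsigned long long)tlbi_asid_operand(0x2a));
        return 0;
    }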
 
@@ -4930,34 +5203,42 @@ pmap_switch_internal(
 {
        VALIDATE_PMAP(pmap);
        pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
-       uint32_t         last_asid_high_bits, asid_high_bits;
-       boolean_t        do_asid_flush = FALSE;
+       uint16_t        asid_index = pmap->hw_asid;
+       boolean_t       do_asid_flush = FALSE;
+
+#if __ARM_KERNEL_PROTECT__
+       asid_index >>= 1;
+#endif
 
 #if     (__ARM_VMSA__ == 7)
-       if (not_in_kdp) {
-               pmap_simple_lock(&pmap->tt1_lock);
-       }
+       assert(not_in_kdp);
+       pmap_simple_lock(&pmap->tt1_lock);
 #else
        pmap_t           last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
 #endif
 
-       /* Paranoia. */
-       assert(pmap->asid < (sizeof(cpu_data_ptr->cpu_asid_high_bits) / sizeof(*cpu_data_ptr->cpu_asid_high_bits)));
+#if MAX_ASID > MAX_HW_ASID
+       if (asid_index > 0) {
+               asid_index -= 1;
+               /* Paranoia. */
+               assert(asid_index < (sizeof(cpu_data_ptr->cpu_asid_high_bits) / sizeof(*cpu_data_ptr->cpu_asid_high_bits)));
 
-       /* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
-       asid_high_bits = pmap->vasid >> ARM_ASID_SHIFT;
-       last_asid_high_bits = (uint32_t) cpu_data_ptr->cpu_asid_high_bits[pmap->asid];
+               /* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
+               uint8_t asid_high_bits = pmap->sw_asid;
+               uint8_t last_asid_high_bits = cpu_data_ptr->cpu_asid_high_bits[asid_index];
 
-       if (asid_high_bits != last_asid_high_bits) {
-               /*
-                * If the virtual ASID of the new pmap does not match the virtual ASID
-                * last seen on this CPU for the physical ASID (that was a mouthful),
-                * then this switch runs the risk of aliasing.  We need to flush the
-                * TLB for this phyiscal ASID in this case.
-                */
-               cpu_data_ptr->cpu_asid_high_bits[pmap->asid] = (uint8_t) asid_high_bits;
-               do_asid_flush = TRUE;
+               if (asid_high_bits != last_asid_high_bits) {
+                       /*
+                        * If the virtual ASID of the new pmap does not match the virtual ASID
+                        * last seen on this CPU for the physical ASID (that was a mouthful),
+                        * then this switch runs the risk of aliasing.  We need to flush the
+                        * TLB for this physical ASID in this case.
+                        */
+                       cpu_data_ptr->cpu_asid_high_bits[asid_index] = asid_high_bits;
+                       do_asid_flush = TRUE;
+               }
        }
+#endif /* MAX_ASID > MAX_HW_ASID */
 
        pmap_switch_user_ttb_internal(pmap);
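
The block above serves configurations whose software ASID space (MAX_ASID) is wider than the hardware's (MAX_HW_ASID): each pmap carries a hw_asid plus sw_asid "high bits", and each CPU records the high bits it last ran for a given hardware ASID; a mismatch means the hardware ASID may still cache translations for a different pmap and must be flushed. A self-contained model of that check, with an illustrative table size:

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define HW_ASID_COUNT 256 /* illustrative hardware ASID space */

    /* Per-CPU record of the software "high bits" last seen per hardware
     * ASID, mirroring cpu_data_ptr->cpu_asid_high_bits[]. */
    static uint8_t cpu_asid_high_bits[HW_ASID_COUNT];

    /* Returns true when the incoming pmap aliases a previous user of the
     * same hardware ASID on this CPU, i.e. a TLB flush is required. */
    static bool asid_switch_needs_flush(uint16_t hw_asid, uint8_t sw_asid)
    {
        if (cpu_asid_high_bits[hw_asid] != sw_asid) {
            cpu_asid_high_bits[hw_asid] = sw_asid;
            return true;
        }
        return false;
    }

    int main(void)
    {
        memset(cpu_asid_high_bits, 0, sizeof(cpu_asid_high_bits));
        bool f1 = asid_switch_needs_flush(7, 1); /* first use: flush    */
        bool f2 = asid_switch_needs_flush(7, 1); /* same pmap: no flush */
        bool f3 = asid_switch_needs_flush(7, 2); /* aliased use: flush  */
        return (f1 && !f2 && f3) ? 0 : 1;
    }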
 
@@ -4972,12 +5253,13 @@ pmap_switch_internal(
 #endif
        if (do_asid_flush) {
                pmap_flush_core_tlb_asid(pmap);
+#if DEVELOPMENT || DEBUG
+               os_atomic_inc(&pmap_asid_flushes, relaxed);
+#endif
        }
 
 #if     (__ARM_VMSA__ == 7)
-       if (not_in_kdp) {
-               pmap_simple_unlock(&pmap->tt1_lock);
-       }
+       pmap_simple_unlock(&pmap->tt1_lock);
 #endif
 }
 
@@ -4985,7 +5267,7 @@ void
 pmap_switch(
        pmap_t pmap)
 {
-       PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid);
+       PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
        pmap_switch_internal(pmap);
        PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
 }
@@ -5156,7 +5438,7 @@ pmap_page_protect_options_internal(
                            pmap != kernel_pmap &&
                            (options & PMAP_OPTIONS_COMPRESSOR) &&
                            IS_INTERNAL_PAGE(pai)) {
-                               assert(!ARM_PTE_IS_COMPRESSED(*pte_p));
+                               assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
                                /* mark this PTE as having been "compressed" */
                                tmplate = ARM_PTE_COMPRESSED;
                                if (is_altacct) {
@@ -5170,7 +5452,7 @@ pmap_page_protect_options_internal(
                        if ((*pte_p != ARM_PTE_TYPE_FAULT) &&
                            tmplate == ARM_PTE_TYPE_FAULT &&
                            (pmap != kernel_pmap)) {
-                               if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt)) <= 0) {
+                               if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt)) <= 0) {
                                        panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p\n", ptep_get_ptd(pte_p), pte_p);
                                }
                        }
@@ -5264,58 +5546,29 @@ pmap_page_protect_options_internal(
                        }
                } else {
                        pt_entry_t      spte;
+                       const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
 
                        spte = *pte_p;
 
                        if (pmap == kernel_pmap) {
                                tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
                        } else {
-                               tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RORO));
+                               tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
                        }
 
                        pte_set_was_writeable(tmplate, false);
-
-#if     (__ARM_VMSA__ == 7)
-                       if (set_NX) {
-                               tmplate |= ARM_PTE_NX;
-                       } else {
-                               /*
-                                * While the naive implementation of this would serve to add execute
-                                * permission, this is not how the VM uses this interface, or how
-                                * x86_64 implements it.  So ignore requests to add execute permissions.
-                                */
-#if 0
-                               tmplate &= ~ARM_PTE_NX;
-#else
-                               ;
-#endif
-                       }
-#else
+                       /*
+                        * While the naive implementation of this would serve to add execute
+                        * permission, this is not how the VM uses this interface, or how
+                        * x86_64 implements it.  So ignore requests to add execute permissions.
+                        */
                        if (set_NX) {
-                               tmplate |= ARM_PTE_NX | ARM_PTE_PNX;
-                       } else {
-                               /*
-                                * While the naive implementation of this would serve to add execute
-                                * permission, this is not how the VM uses this interface, or how
-                                * x86_64 implements it.  So ignore requests to add execute permissions.
-                                */
-#if 0
-                               if (pmap == kernel_pmap) {
-                                       tmplate &= ~ARM_PTE_PNX;
-                                       tmplate |= ARM_PTE_NX;
-                               } else {
-                                       tmplate &= ~ARM_PTE_NX;
-                                       tmplate |= ARM_PTE_PNX;
-                               }
-#else
-                               ;
-#endif
+                               tmplate |= pt_attr_leaf_xn(pt_attr);
                        }
-#endif
 
 
                        if (*pte_p != ARM_PTE_TYPE_FAULT &&
-                           !ARM_PTE_IS_COMPRESSED(*pte_p) &&
+                           !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p) &&
                            *pte_p != tmplate) {
                                WRITE_PTE_STRONG(pte_p, tmplate);
                                update = TRUE;
@@ -5325,7 +5578,7 @@ pmap_page_protect_options_internal(
                /* Invalidate TLBs for all CPUs using it */
                if (update) {
                        tlb_flush_needed = TRUE;
-                       flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap);
+                       pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, PAGE_SIZE, pmap);
                }
 
 #ifdef PVH_FLAG_IOMMU
@@ -5436,18 +5689,16 @@ pmap_protect_options_internal(
        unsigned int options,
        __unused void *args)
 {
-       tt_entry_t     *tte_p;
-       pt_entry_t     *bpte_p, *epte_p;
-       pt_entry_t     *pte_p;
-       boolean_t       set_NX = TRUE;
+       const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
+       tt_entry_t      *tte_p;
+       pt_entry_t      *bpte_p, *epte_p;
+       pt_entry_t      *pte_p;
+       boolean_t        set_NX = TRUE;
 #if (__ARM_VMSA__ > 7)
-       boolean_t       set_XO = FALSE;
-#endif
-       boolean_t       should_have_removed = FALSE;
-
-#ifndef __ARM_IC_NOALIAS_ICACHE__
-       boolean_t       InvalidatePoU_Icache_Done = FALSE;
+       boolean_t        set_XO = FALSE;
 #endif
+       boolean_t        should_have_removed = FALSE;
+       bool             need_strong_sync = false;
 
        if (__improbable(end < start)) {
                panic("%s called with bogus range: %p, %p", __func__, (void*)start, (void*)end);
@@ -5518,7 +5769,7 @@ pmap_protect_options_internal(
                        spte = *pte_p;
 
                        if ((spte == ARM_PTE_TYPE_FAULT) ||
-                           ARM_PTE_IS_COMPRESSED(spte)) {
+                           ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
                                continue;
                        }
 
@@ -5549,7 +5800,7 @@ pmap_protect_options_internal(
                        }
 
                        if ((spte == ARM_PTE_TYPE_FAULT) ||
-                           ARM_PTE_IS_COMPRESSED(spte)) {
+                           ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
                                continue;
                        }
 
@@ -5569,11 +5820,11 @@ pmap_protect_options_internal(
 #if DEVELOPMENT || DEBUG
                                if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
                                        force_write = TRUE;
-                                       tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWRW));
+                                       tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
                                } else
 #endif
                                {
-                                       tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RORO));
+                                       tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
                                }
                        }
 
@@ -5587,34 +5838,23 @@ pmap_protect_options_internal(
                         * not allowed to increase
                         * access permissions.
                         */
-#if     (__ARM_VMSA__ == 7)
-                       if (set_NX) {
-                               tmplate |= ARM_PTE_NX;
-                       } else {
-                               /* do NOT clear "NX"! */
-                       }
-#else
                        if (set_NX) {
-                               tmplate |= ARM_PTE_NX | ARM_PTE_PNX;
+                               tmplate |= pt_attr_leaf_xn(pt_attr);
                        } else {
+#if     (__ARM_VMSA__ > 7)
                                if (pmap == kernel_pmap) {
-                                       /*
-                                        * TODO: Run CS/Monitor checks here;
-                                        * should we be clearing PNX here?  Is
-                                        * this just for dtrace?
-                                        */
-                                       tmplate &= ~ARM_PTE_PNX;
+                                       /* do NOT clear "PNX"! */
                                        tmplate |= ARM_PTE_NX;
                                } else {
                                        /* do NOT clear "NX"! */
-                                       tmplate |= ARM_PTE_PNX;
+                                       tmplate |= pt_attr_leaf_x(pt_attr);
                                        if (set_XO) {
                                                tmplate &= ~ARM_PTE_APMASK;
-                                               tmplate |= ARM_PTE_AP(AP_RONA);
+                                               tmplate |= pt_attr_leaf_rona(pt_attr);
                                        }
                                }
-                       }
 #endif
+                       }
 
 #if DEVELOPMENT || DEBUG
                        if (force_write) {
@@ -5658,19 +5898,6 @@ pmap_protect_options_internal(
                        /* We do not expect to write fast fault the entry. */
                        pte_set_was_writeable(tmplate, false);
 
-                       /* TODO: Doesn't this need to worry about PNX? */
-                       if (((spte & ARM_PTE_NX) == ARM_PTE_NX) && (prot & VM_PROT_EXECUTE)) {
-                               CleanPoU_DcacheRegion((vm_offset_t) phystokv(pa), PAGE_SIZE);
-#ifdef  __ARM_IC_NOALIAS_ICACHE__
-                               InvalidatePoU_IcacheRegion((vm_offset_t) phystokv(pa), PAGE_SIZE);
-#else
-                               if (!InvalidatePoU_Icache_Done) {
-                                       InvalidatePoU_Icache();
-                                       InvalidatePoU_Icache_Done = TRUE;
-                               }
-#endif
-                       }
-
                        WRITE_PTE_FAST(pte_p, tmplate);
 
                        if (managed) {
@@ -5678,9 +5905,8 @@ pmap_protect_options_internal(
                                UNLOCK_PVH(pai);
                        }
                }
-
                FLUSH_PTE_RANGE_STRONG(bpte_p, epte_p);
-               PMAP_UPDATE_TLBS(pmap, start, end);
+               PMAP_UPDATE_TLBS(pmap, start, end, need_strong_sync);
        }
 
        PMAP_UNLOCK(pmap);
@@ -5697,6 +5923,8 @@ pmap_protect_options(
 {
        vm_map_address_t l, beg;
 
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
        if ((b | e) & PAGE_MASK) {
                panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx\n",
                    pmap, (uint64_t)b, (uint64_t)e);
@@ -5733,7 +5961,7 @@ pmap_protect_options(
        beg = b;
 
        while (beg < e) {
-               l = ((beg + ARM_TT_TWIG_SIZE) & ~ARM_TT_TWIG_OFFMASK);
+               l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
 
                if (l > e) {
                        l = e;
@@ -5817,7 +6045,7 @@ static inline void
 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t pte, vm_map_address_t v)
 {
        if (pmap != kernel_pmap && ((pte & ARM_PTE_WIRED) != (*pte_p & ARM_PTE_WIRED))) {
-               SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].wiredcnt);
+               SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].wiredcnt);
                if (pte & ARM_PTE_WIRED) {
                        OSAddAtomic16(1, ptd_wiredcnt_ptr);
                        pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
@@ -5829,9 +6057,9 @@ pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t pte, vm_map_address_t
                }
        }
        if (*pte_p != ARM_PTE_TYPE_FAULT &&
-           !ARM_PTE_IS_COMPRESSED(*pte_p)) {
+           !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
                WRITE_PTE_STRONG(pte_p, pte);
-               PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE);
+               PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE, false);
        } else {
                WRITE_PTE(pte_p, pte);
                __builtin_arm_isb(ISB_SY);
@@ -5840,7 +6068,7 @@ pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t pte, vm_map_address_t
        PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + PAGE_SIZE), pte);
 }
 
-static pt_entry_t
+MARK_AS_PMAP_TEXT static pt_entry_t
 wimg_to_pte(unsigned int wimg)
 {
        pt_entry_t pte;
@@ -5855,6 +6083,14 @@ wimg_to_pte(unsigned int wimg)
                pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
                pte |= ARM_PTE_NX | ARM_PTE_PNX;
                break;
+       case VM_WIMG_POSTED_REORDERED:
+               pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
+               pte |= ARM_PTE_NX | ARM_PTE_PNX;
+               break;
+       case VM_WIMG_POSTED_COMBINED_REORDERED:
+               pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
+               pte |= ARM_PTE_NX | ARM_PTE_PNX;
+               break;
        case VM_WIMG_WCOMB:
                pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
                pte |= ARM_PTE_NX | ARM_PTE_PNX;
@@ -6062,6 +6298,8 @@ pmap_enter_options_internal(
 
        VALIDATE_PMAP(pmap);
 
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
        if ((v) & PAGE_MASK) {
                panic("pmap_enter_options() pmap %p v 0x%llx\n",
                    pmap, (uint64_t)v);
@@ -6123,7 +6361,7 @@ Pmap_enter_retry:
 
        spte = *pte_p;
 
-       if (ARM_PTE_IS_COMPRESSED(spte)) {
+       if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
                /*
                 * "pmap" should be locked at this point, so this should
                 * not race with another pmap_enter() or pmap_remove_range().
@@ -6158,7 +6396,6 @@ Pmap_enter_retry:
 
        if ((spte != ARM_PTE_TYPE_FAULT) && (pte_to_pa(spte) != pa)) {
                pmap_remove_range(pmap, v, pte_p, pte_p + 1, 0);
-               PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE);
        }
 
        pte = pa_to_pte(pa) | ARM_PTE_TYPE;
@@ -6171,21 +6408,17 @@ Pmap_enter_retry:
                pte |= ARM_PTE_WIRED;
        }
 
-#if     (__ARM_VMSA__ == 7)
-       if (set_NX) {
-               pte |= ARM_PTE_NX;
-       }
-#else
        if (set_NX) {
-               pte |= ARM_PTE_NX | ARM_PTE_PNX;
+               pte |= pt_attr_leaf_xn(pt_attr);
        } else {
+#if     (__ARM_VMSA__ > 7)
                if (pmap == kernel_pmap) {
                        pte |= ARM_PTE_NX;
                } else {
-                       pte |= ARM_PTE_PNX;
+                       pte |= pt_attr_leaf_x(pt_attr);
                }
-       }
 #endif
+       }
 
        if (pmap == kernel_pmap) {
 #if __ARM_KERNEL_PROTECT__
@@ -6204,12 +6437,12 @@ Pmap_enter_retry:
                }
 #endif
        } else {
-               if (!(pmap->nested)) {
+               if (!pmap->nested) {
                        pte |= ARM_PTE_NG;
                } else if ((pmap->nested_region_asid_bitmap)
                    && (v >= pmap->nested_region_subord_addr)
                    && (v < (pmap->nested_region_subord_addr + pmap->nested_region_size))) {
-                       unsigned int index = (unsigned int)((v - pmap->nested_region_subord_addr)  >> ARM_TT_TWIG_SHIFT);
+                       unsigned int index = (unsigned int)((v - pmap->nested_region_subord_addr)  >> pt_attr_twig_shift(pt_attr));
 
                        if ((pmap->nested_region_asid_bitmap)
                            && testbit(index, (int *)pmap->nested_region_asid_bitmap)) {
@@ -6227,9 +6460,9 @@ Pmap_enter_retry:
                            && (nest_vaddr < (pmap->nested_region_subord_addr + pmap->nested_region_size))
                            && ((nest_pte_p = pmap_pte(pmap->nested_pmap, nest_vaddr)) != PT_ENTRY_NULL)
                            && (*nest_pte_p != ARM_PTE_TYPE_FAULT)
-                           && (!ARM_PTE_IS_COMPRESSED(*nest_pte_p))
+                           && (!ARM_PTE_IS_COMPRESSED(*nest_pte_p, nest_pte_p))
                            && (((*nest_pte_p) & ARM_PTE_NG) != ARM_PTE_NG)) {
-                               unsigned int index = (unsigned int)((v - pmap->nested_region_subord_addr)  >> ARM_TT_TWIG_SHIFT);
+                               unsigned int index = (unsigned int)((v - pmap->nested_region_subord_addr)  >> pt_attr_twig_shift(pt_attr));
 
                                if ((pmap->nested_pmap->nested_region_asid_bitmap)
                                    && !testbit(index, (int *)pmap->nested_pmap->nested_region_asid_bitmap)) {
@@ -6243,33 +6476,33 @@ Pmap_enter_retry:
                        if (pa_valid(pa) && (!pa_test_bits(pa, PP_ATTR_MODIFIED))) {
                                if (fault_type & VM_PROT_WRITE) {
                                        if (set_XO) {
-                                               pte |= ARM_PTE_AP(AP_RWNA);
+                                               pte |= pt_attr_leaf_rwna(pt_attr);
                                        } else {
-                                               pte |= ARM_PTE_AP(AP_RWRW);
+                                               pte |= pt_attr_leaf_rw(pt_attr);
                                        }
                                        pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
                                } else {
                                        if (set_XO) {
-                                               pte |= ARM_PTE_AP(AP_RONA);
+                                               pte |= pt_attr_leaf_rona(pt_attr);
                                        } else {
-                                               pte |= ARM_PTE_AP(AP_RORO);
+                                               pte |= pt_attr_leaf_ro(pt_attr);
                                        }
                                        pa_set_bits(pa, PP_ATTR_REFERENCED);
                                        pte_set_was_writeable(pte, true);
                                }
                        } else {
                                if (set_XO) {
-                                       pte |= ARM_PTE_AP(AP_RWNA);
+                                       pte |= pt_attr_leaf_rwna(pt_attr);
                                } else {
-                                       pte |= ARM_PTE_AP(AP_RWRW);
+                                       pte |= pt_attr_leaf_rw(pt_attr);
                                }
                                pa_set_bits(pa, PP_ATTR_REFERENCED);
                        }
                } else {
                        if (set_XO) {
-                               pte |= ARM_PTE_AP(AP_RONA);
+                               pte |= pt_attr_leaf_rona(pt_attr);
                        } else {
-                               pte |= ARM_PTE_AP(AP_RORO);
+                               pte |= pt_attr_leaf_ro(pt_attr);
                        }
                        pa_set_bits(pa, PP_ATTR_REFERENCED);
                }
@@ -6280,8 +6513,8 @@ Pmap_enter_retry:
        volatile uint16_t *refcnt = NULL;
        volatile uint16_t *wiredcnt = NULL;
        if (pmap != kernel_pmap) {
-               refcnt = &(ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt);
-               wiredcnt = &(ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].wiredcnt);
+               refcnt = &(ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt);
+               wiredcnt = &(ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].wiredcnt);
                /* Bump the wired count to keep the PTE page from being reclaimed.  We need this because
                 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
                 * a new PV entry. */
@@ -6318,7 +6551,7 @@ Pmap_enter_loop:
                 * was dropped, so clear any cache attributes we may have previously set
                 * in the PTE template. */
                pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
-               pte |= wimg_to_pte(wimg_bits);
+               pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits);
 
 
 
@@ -6417,7 +6650,7 @@ Pmap_enter_loop:
                        wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
                }
 
-               pte |= wimg_to_pte(wimg_bits);
+               pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits);
 
                pmap_enter_pte(pmap, pte_p, pte, v);
        }
@@ -6511,8 +6744,19 @@ pmap_change_wiring_internal(
        pte_p = pmap_pte(pmap, v);
        assert(pte_p != PT_ENTRY_NULL);
        pa = pte_to_pa(*pte_p);
-       if (pa_valid(pa)) {
+
+       while (pa_valid(pa)) {
+               pmap_paddr_t new_pa;
+
                LOCK_PVH((int)pa_index(pa));
+               new_pa = pte_to_pa(*pte_p);
+
+               if (pa == new_pa) {
+                       break;
+               }
+
+               UNLOCK_PVH((int)pa_index(pa));
+               pa = new_pa;
        }
 
        if (wired && !pte_is_wired(*pte_p)) {
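
The loop added above closes a race: between reading the PTE's physical address and taking that page's PVH lock, a concurrent update may retarget the PTE, so the address is re-read under the lock and the acquisition retried until it matches (arm_fast_fault_internal later in this diff gains the same shape). A generic sketch of lock-then-revalidate, with the locking and the racing PTE simulated:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t paddr_t;

    /* Toy PTE whose target changes once, simulating a racing update
     * between the first read and the lock acquisition. */
    static int reads;
    static paddr_t read_pte_pa(void)    { return (reads++ == 0) ? 0x1000 : 0x2000; }
    static void lock_page(paddr_t pa)   { printf("lock   0x%llx\n", (unsigned long long)pa); }
    static void unlock_page(paddr_t pa) { printf("unlock 0x%llx\n", (unsigned long long)pa); }

    /* Lock the page the PTE currently maps, retrying when the PTE
     * changed before the lock was held (cf. LOCK_PVH/UNLOCK_PVH). */
    static paddr_t lock_current_page(void)
    {
        paddr_t pa = read_pte_pa();
        for (;;) {
            lock_page(pa);
            paddr_t new_pa = read_pte_pa();
            if (new_pa == pa) {
                return pa; /* the lock covers the page we sampled */
            }
            unlock_page(pa);
            pa = new_pa;
        }
    }

    int main(void)
    {
        (void)lock_current_page();
        return 0;
    }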
@@ -6631,7 +6875,7 @@ pmap_vtophys(
                ppn = (ppnum_t) atop(pte_to_pa(*pte_p) | (va & ARM_PGMASK));
 #if DEVELOPMENT || DEBUG
                if (ppn != 0 &&
-                   ARM_PTE_IS_COMPRESSED(*pte_p)) {
+                   ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
                        panic("pmap_vtophys(%p,0x%llx): compressed pte_p=%p 0x%llx with ppn=0x%x\n",
                            pmap, va, pte_p, (uint64_t) (*pte_p), ppn);
                }
@@ -6650,13 +6894,10 @@ pmap_vtophys(
        tt_entry_t              tte;
        ppnum_t                 ppn = 0;
 
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
        /* Level 0 currently unused */
 
-#if __ARM64_TWO_LEVEL_PMAP__
-       /* We have no L1 entry; go straight to the L2 entry */
-       ttp = pmap_tt2e(pmap, va);
-       tte = *ttp;
-#else
        /* Get first-level (1GB) entry */
        ttp = pmap_tt1e(pmap, va);
        tte = *ttp;
@@ -6664,8 +6905,8 @@ pmap_vtophys(
                return ppn;
        }
 
-       tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, va)];
-#endif
+       tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, pt_attr, va)];
+
        if ((tte & ARM_TTE_VALID) != (ARM_TTE_VALID)) {
                return ppn;
        }
@@ -6674,7 +6915,7 @@ pmap_vtophys(
                ppn = (ppnum_t) atop((tte & ARM_TTE_BLOCK_L2_MASK) | (va & ARM_TT_L2_OFFMASK));
                return ppn;
        }
-       tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt3_index(pmap, va)];
+       tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt3_index(pmap, pt_attr, va)];
        ppn = (ppnum_t) atop((tte & ARM_PTE_MASK) | (va & ARM_TT_L3_OFFMASK));
 #endif
 
@@ -6744,7 +6985,8 @@ pmap_init_pte_page(
        pt_entry_t *pte_p,
        vm_offset_t va,
        unsigned int ttlevel,
-       boolean_t alloc_ptd)
+       boolean_t alloc_ptd,
+       boolean_t clear)
 {
        pt_desc_t   *ptdp = NULL;
        vm_offset_t *pvh;
@@ -6769,10 +7011,12 @@ pmap_init_pte_page(
                panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
        }
 
-       bzero(pte_p, ARM_PGBYTES);
-       // below barrier ensures the page zeroing is visible to PTW before
-       // it is linked to the PTE of previous level
-       __builtin_arm_dmb(DMB_ISHST);
+       if (clear) {
+               bzero(pte_p, ARM_PGBYTES);
+               // below barrier ensures the page zeroing is visible to PTW before
+               // the barrier below ensures the page zeroing is visible to the PTW
+               // before the page is linked into the previous level's TTE
+       }
        ptd_init(ptdp, pmap, va, ttlevel, pte_p);
 }
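
The new 'clear' flag lets callers skip the bzero when a page arrives pre-zeroed, but whenever the clear does happen, the DMB ISHST between zeroing and linking is load-bearing: the MMU table walker is an independent observer, and the store that publishes the table into its parent TTE must not become visible before the zeroing stores. The same publish-after-initialize discipline, modeled with a C11 release store (the kernel needs the explicit barrier precisely because the walker is not a C11 thread):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <string.h>

    #define TABLE_ENTRIES 512

    static uint64_t new_table[TABLE_ENTRIES];
    static _Atomic(uint64_t *) parent_tte;

    /* Zero the table, then publish it; release ordering plays the role
     * of the DMB ISHST in the diff above. */
    static void publish_table(void)
    {
        memset(new_table, 0, sizeof(new_table)); /* bzero(pte_p, ARM_PGBYTES) */
        atomic_store_explicit(&parent_tte, new_table, memory_order_release);
    }

    int main(void)
    {
        publish_table();
        return atomic_load(&parent_tte) == new_table ? 0 : 1;
    }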
 
@@ -6794,14 +7038,15 @@ pmap_expand(
        unsigned int options,
        unsigned int level)
 {
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
 #if     (__ARM_VMSA__ == 7)
        vm_offset_t     pa;
        tt_entry_t              *tte_p;
        tt_entry_t              *tt_p;
        unsigned int    i;
 
-
-       while (tte_index(pmap, v) >= pmap->tte_index_max) {
+       while (tte_index(pmap, pt_attr, v) >= pmap->tte_index_max) {
                tte_p = pmap_tt1_allocate(pmap, 2 * ARM_PGBYTES, ((options & PMAP_OPTIONS_NOWAIT)? PMAP_TT_ALLOCATE_NOWAIT : 0));
                if (tte_p == (tt_entry_t *)0) {
                        return KERN_RESOURCE_SHORTAGE;
@@ -6822,24 +7067,43 @@ pmap_expand(
                        tte_p[i] = ARM_TTE_TYPE_FAULT;
                }
 
-               pmap->prev_tte = pmap->tte;
+               FLUSH_PTE_RANGE(tte_p, tte_p + (2 * NTTES)); // DMB
+
+               /* Ordering is important here, so that pmap_switch_user_ttb() sees things
+                * in the correct sequence:
+                * -- the update of pmap->tte/ttep must happen prior to updating
+                *    pmap->tte_index_max, separated by at least a DMB, so that a context
+                *    switch does not see a 1GB L1 table with a 2GB size;
+                * -- the update of pmap->tte/ttep must also happen prior to setting
+                *    pmap->prev_tte, separated by at least a DMB, so that a context switch
+                *    does not see an L1 table to be freed without also seeing its
+                *    replacement. */
+
+               tt_entry_t *prev_tte = pmap->tte;
+
                pmap->tte = tte_p;
                pmap->ttep = ml_static_vtop((vm_offset_t)pmap->tte);
 
-               FLUSH_PTE_RANGE(pmap->tte, pmap->tte + (2 * NTTES));
+               __builtin_arm_dmb(DMB_ISH);
 
                pmap->tte_index_max = 2 * NTTES;
-               pmap->stamp = hw_atomic_add(&pmap_stamp, 1);
+               pmap->stamp = os_atomic_inc(&pmap_stamp, relaxed);
 
                for (i = 0; i < NTTES; i++) {
-                       pmap->prev_tte[i] = ARM_TTE_TYPE_FAULT;
+                       prev_tte[i] = ARM_TTE_TYPE_FAULT;
                }
 
-               FLUSH_PTE_RANGE(pmap->prev_tte, pmap->prev_tte + NTTES);
+               /* We need a strong flush here because a TLB flush will be
+                * issued from pmap_switch_user_ttb() as soon as this pmap
+                * is no longer active on any CPU.  We need to ensure all
+                * prior stores to the TTE region have retired before that. */
+               FLUSH_PTE_RANGE_STRONG(prev_tte, prev_tte + NTTES); // DSB
+               pmap->prev_tte = prev_tte;
 
                pmap_simple_unlock(&pmap->tt1_lock);
                PMAP_UNLOCK(pmap);
-               pmap_set_pmap(pmap, current_thread());
+               if (current_pmap() == pmap) {
+                       pmap_set_pmap(pmap, current_thread());
+               }
        }
 
        if (level == 1) {
@@ -6896,11 +7160,8 @@ pmap_expand(
                if (pmap_pte(pmap, v) == PT_ENTRY_NULL) {
                        tt_entry_t     *tte_next_p;
 
-                       pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L2_LEVEL, FALSE);
+                       pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L2_LEVEL, FALSE, TRUE);
                        pa = kvtophys((vm_offset_t)tt_p);
-#ifndef  __ARM_L1_PTW__
-                       CleanPoU_DcacheRegion((vm_offset_t) phystokv(pa), PAGE_SIZE);
-#endif
                        tte_p = &pmap->tte[ttenum(v)];
                        for (i = 0, tte_next_p = tte_p; i < 4; i++) {
                                *tte_next_p = pa_to_tte(pa) | ARM_TTE_TYPE_TABLE;
@@ -6923,13 +7184,7 @@ pmap_expand(
        return KERN_SUCCESS;
 #else
        pmap_paddr_t    pa;
-#if __ARM64_TWO_LEVEL_PMAP__
-       /* If we are using a two level page table, we'll start at L2. */
-       unsigned int    ttlevel = 2;
-#else
-       /* Otherwise, we start at L1 (we use 3 levels by default). */
-       unsigned int    ttlevel = 1;
-#endif
+       unsigned int    ttlevel = pt_attr_root_level(pt_attr);
        tt_entry_t              *tte_p;
        tt_entry_t              *tt_p;
 
@@ -6939,50 +7194,24 @@ pmap_expand(
        for (; ttlevel < level; ttlevel++) {
                PMAP_LOCK(pmap);
 
-               if (ttlevel == 1) {
-                       if ((pmap_tt2e(pmap, v) == PT_ENTRY_NULL)) {
-                               PMAP_UNLOCK(pmap);
-                               while (pmap_tt_allocate(pmap, &tt_p, PMAP_TT_L2_LEVEL, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
-                                       if (options & PMAP_OPTIONS_NOWAIT) {
-                                               return KERN_RESOURCE_SHORTAGE;
-                                       }
-                                       VM_PAGE_WAIT();
-                               }
-                               PMAP_LOCK(pmap);
-                               if ((pmap_tt2e(pmap, v) == PT_ENTRY_NULL)) {
-                                       pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L2_LEVEL, FALSE);
-                                       pa = kvtophys((vm_offset_t)tt_p);
-                                       tte_p = pmap_tt1e( pmap, v);
-                                       *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
-                                       PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L1_OFFMASK),
-                                           VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE), *tte_p);
-                                       pa = 0x0ULL;
-                                       tt_p = (tt_entry_t *)NULL;
-                                       if ((pmap == kernel_pmap) && (VM_MIN_KERNEL_ADDRESS < 0x00000000FFFFFFFFULL)) {
-                                               current_pmap()->tte[v >> ARM_TT_L1_SHIFT] = kernel_pmap->tte[v >> ARM_TT_L1_SHIFT];
-                                       }
+               if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
+                       PMAP_UNLOCK(pmap);
+                       while (pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
+                               if (options & PMAP_OPTIONS_NOWAIT) {
+                                       return KERN_RESOURCE_SHORTAGE;
                                }
+                               VM_PAGE_WAIT();
                        }
-               } else if (ttlevel == 2) {
-                       if (pmap_tt3e(pmap, v) == PT_ENTRY_NULL) {
-                               PMAP_UNLOCK(pmap);
-                               while (pmap_tt_allocate(pmap, &tt_p, PMAP_TT_L3_LEVEL, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
-                                       if (options & PMAP_OPTIONS_NOWAIT) {
-                                               return KERN_RESOURCE_SHORTAGE;
-                                       }
-                                       VM_PAGE_WAIT();
-                               }
-                               PMAP_LOCK(pmap);
-                               if ((pmap_tt3e(pmap, v) == PT_ENTRY_NULL)) {
-                                       pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L3_LEVEL, FALSE);
-                                       pa = kvtophys((vm_offset_t)tt_p);
-                                       tte_p = pmap_tt2e( pmap, v);
-                                       *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
-                                       PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L2_OFFMASK),
-                                           VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L2_OFFMASK) + ARM_TT_L2_SIZE), *tte_p);
-                                       pa = 0x0ULL;
-                                       tt_p = (tt_entry_t *)NULL;
-                               }
+                       PMAP_LOCK(pmap);
+                       if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
+                               pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE, TRUE);
+                               pa = kvtophys((vm_offset_t)tt_p);
+                               tte_p = pmap_ttne(pmap, ttlevel, v);
+                               *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
+                               PMAP_TRACE(ttlevel + 1, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
+                                   VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
+                               pa = 0x0ULL;
+                               tt_p = (tt_entry_t *)NULL;
                        }
                }
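
The collapsed loop above replaces the separate L1-to-L2 and L2-to-L3 cases with one level-generic body: if the next-level table is missing, drop the pmap lock to allocate (possibly waiting for pages), then retake the lock and re-check before linking, because another thread may have expanded the same slot in the window. A sketch of that unlock-allocate-relock-recheck shape, with locking and allocation stubbed:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdlib.h>

    static void *slot;          /* *pmap_ttne(pmap, level + 1, v) stand-in */
    static void lock(void)   {} /* PMAP_LOCK stand-in   */
    static void unlock(void) {} /* PMAP_UNLOCK stand-in */

    /* Ensure the next-level table exists, tolerating a concurrent
     * expander that may populate the slot while the lock is dropped. */
    static bool expand_one_level(void)
    {
        lock();
        if (slot == NULL) {
            unlock();
            void *table = calloc(512, 8); /* pmap_tt_allocate() stand-in */
            if (table == NULL) {
                return false;             /* KERN_RESOURCE_SHORTAGE */
            }
            lock();
            if (slot == NULL) {
                slot = table;             /* link into the parent TTE */
                table = NULL;
            }
            free(table); /* surplus if someone beat us to it */
        }
        unlock();
        return true;
    }

    int main(void)
    {
        return expand_one_level() ? 0 : 1;
    }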
 
@@ -7598,21 +7827,29 @@ pmap_switch_user_ttb_internal(
        if ((cpu_data_ptr->cpu_user_pmap != PMAP_NULL)
            && (cpu_data_ptr->cpu_user_pmap != kernel_pmap)) {
                unsigned int    c;
+               tt_entry_t      *tt_entry = cpu_data_ptr->cpu_user_pmap->prev_tte;
 
-               c = hw_atomic_sub((volatile uint32_t *)&cpu_data_ptr->cpu_user_pmap->cpu_ref, 1);
-               if ((c == 0) && (cpu_data_ptr->cpu_user_pmap->prev_tte != 0)) {
+               c = os_atomic_dec(&cpu_data_ptr->cpu_user_pmap->cpu_ref, acq_rel);
+               if ((c == 0) && (tt_entry != NULL)) {
                        /* We saved off the old 1-page tt1 in pmap_expand() in case other cores were still using it.
                         * Now that the user pmap's cpu_ref is 0, we should be able to safely free it.*/
-                       tt_entry_t      *tt_entry;
 
-                       tt_entry = cpu_data_ptr->cpu_user_pmap->prev_tte;
-                       cpu_data_ptr->cpu_user_pmap->prev_tte = (tt_entry_t *) NULL;
+                       cpu_data_ptr->cpu_user_pmap->prev_tte = NULL;
+#if !__ARM_USER_PROTECT__
+                       set_mmu_ttb(kernel_pmap->ttep);
+                       set_context_id(kernel_pmap->hw_asid);
+#endif
+                       /* Now that we can guarantee the old 1-page L1 table is no longer active on any CPU,
+                        * flush any cached intermediate translations that may point to it.  Note that to be truly
+                        * safe from prefetch-related issues, this table PA must have been cleared from TTBR0 prior
+                        * to this call.  __ARM_USER_PROTECT__ effectively guarantees that for all current configurations.*/
+                       flush_mmu_tlb_asid(cpu_data_ptr->cpu_user_pmap->hw_asid);
                        pmap_tt1_deallocate(cpu_data_ptr->cpu_user_pmap, tt_entry, ARM_PGBYTES, PMAP_TT_DEALLOCATE_NOBLOCK);
                }
        }
        cpu_data_ptr->cpu_user_pmap = pmap;
        cpu_data_ptr->cpu_user_pmap_stamp = pmap->stamp;
-       (void) hw_atomic_add((volatile uint32_t *)&pmap->cpu_ref, 1);
+       os_atomic_inc(&pmap->cpu_ref, acq_rel);
 
 #if     MACH_ASSERT && __ARM_USER_PROTECT__
        {
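
On the ARMv7 path the superseded one-page L1 table stays parked in prev_tte until no CPU can still be walking it: every CPU holds a reference on its active user pmap's cpu_ref, and whichever CPU drops the count to zero flushes the stale ASID and frees the table; the acq_rel ordering on the decrement ensures all prior table accesses are visible before the free. A model of that last-one-out reclamation using C11 atomics, with the flush and deallocation stubbed:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    static atomic_uint cpu_ref;
    static void *prev_tte;

    static void flush_asid(void) { puts("flush old ASID"); }

    /* Called as each CPU switches away from the pmap; the last one out
     * reclaims the superseded translation table. */
    static void drop_pmap_ref(void)
    {
        /* fetch_sub returns the old value; old == 1 means we hit zero,
         * matching os_atomic_dec(..., acq_rel) returning 0. */
        if (atomic_fetch_sub_explicit(&cpu_ref, 1,
            memory_order_acq_rel) == 1 && prev_tte != NULL) {
            void *tt = prev_tte;
            prev_tte = NULL;
            flush_asid(); /* flush_mmu_tlb_asid(hw_asid) stand-in */
            free(tt);     /* pmap_tt1_deallocate() stand-in       */
        }
    }

    int main(void)
    {
        prev_tte = malloc(4096);
        atomic_store(&cpu_ref, 2);
        drop_pmap_ref(); /* another CPU still holds a reference */
        drop_pmap_ref(); /* last reference: flush and free      */
        return 0;
    }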
@@ -7646,7 +7883,7 @@ pmap_switch_user_ttb_internal(
        }
 
 #if !__ARM_USER_PROTECT__
-       set_context_id(pmap->asid);
+       set_context_id(pmap->hw_asid);
 #endif
 
 #else /* (__ARM_VMSA__ == 7) */
@@ -7658,16 +7895,33 @@ pmap_switch_user_ttb_internal(
        if (pmap == kernel_pmap) {
                pmap_clear_user_ttb_internal();
        } else {
-               set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->asid) << TTBR_ASID_SHIFT));
+               set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
        }
-#endif
+
+#if defined(HAS_APPLE_PAC) && (__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__)
+       if (!(BootArgs->bootFlags & kBootFlagsDisableJOP) && !(BootArgs->bootFlags & kBootFlagsDisableUserJOP)) {
+               uint64_t sctlr = __builtin_arm_rsr64("SCTLR_EL1");
+               bool jop_enabled = sctlr & SCTLR_JOP_KEYS_ENABLED;
+               if (!jop_enabled && !pmap->disable_jop) {
+                       // turn on JOP
+                       sctlr |= SCTLR_JOP_KEYS_ENABLED;
+                       __builtin_arm_wsr64("SCTLR_EL1", sctlr);
+                       // no ISB necessary because this won't take effect until eret returns to EL0
+               } else if (jop_enabled && pmap->disable_jop) {
+                       // turn off JOP
+                       sctlr &= ~SCTLR_JOP_KEYS_ENABLED;
+                       __builtin_arm_wsr64("SCTLR_EL1", sctlr);
+               }
+       }
+#endif /* HAS_APPLE_PAC && (__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__) */
+#endif /* (__ARM_VMSA__ == 7) */
 }
 
 void
 pmap_switch_user_ttb(
        pmap_t pmap)
 {
-       PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid);
+       PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
        pmap_switch_user_ttb_internal(pmap);
        PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_END);
 }
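
The new HAS_APPLE_PAC block toggles the pointer-authentication (JOP) key enables in SCTLR_EL1 only when the hardware state disagrees with the incoming process's disable_jop flag, sparing a system-register write on the common path; no ISB is needed on enable because the change only matters after the exception return to EL0. A sketch of the avoid-redundant-write shape; the register access is faked here and the key-enable mask is a placeholder rather than the real EnIA/EnIB/EnDA/EnDB layout:

    #include <stdbool.h>
    #include <stdint.h>

    #define SCTLR_JOP_KEYS (1ULL << 31) /* placeholder key-enable mask */

    static uint64_t fake_sctlr; /* stands in for the real system register */

    static uint64_t read_sctlr(void)        { return fake_sctlr; }
    static void     write_sctlr(uint64_t v) { fake_sctlr = v; }

    /* Write SCTLR_EL1 only when the JOP state must actually change. */
    static void sync_jop_state(bool disable_jop)
    {
        uint64_t sctlr = read_sctlr();
        bool enabled = (sctlr & SCTLR_JOP_KEYS) != 0;

        if (!enabled && !disable_jop) {
            write_sctlr(sctlr | SCTLR_JOP_KEYS);  /* turn JOP on  */
        } else if (enabled && disable_jop) {
            write_sctlr(sctlr & ~SCTLR_JOP_KEYS); /* turn JOP off */
        }
    }

    int main(void)
    {
        sync_jop_state(false); /* enables the keys   */
        sync_jop_state(false); /* no redundant write */
        sync_jop_state(true);  /* disables the keys  */
        return 0;
    }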
@@ -7702,16 +7956,16 @@ arm_force_fast_fault_internal(
        vm_prot_t       allow_mode,
        int             options)
 {
-       pmap_paddr_t    phys = ptoa(ppnum);
-       pv_entry_t     *pve_p;
-       pt_entry_t     *pte_p;
-       int             pai;
-       boolean_t       result;
-       pv_entry_t    **pv_h;
-       boolean_t       is_reusable, is_internal;
-       boolean_t       tlb_flush_needed = FALSE;
-       boolean_t       ref_fault;
-       boolean_t       mod_fault;
+       pmap_paddr_t     phys = ptoa(ppnum);
+       pv_entry_t      *pve_p;
+       pt_entry_t      *pte_p;
+       int              pai;
+       boolean_t        result;
+       pv_entry_t     **pv_h;
+       boolean_t        is_reusable, is_internal;
+       boolean_t        tlb_flush_needed = FALSE;
+       boolean_t        ref_fault;
+       boolean_t        mod_fault;
 
        assert(ppnum != vm_page_fictitious_addr);
 
@@ -7759,7 +8013,7 @@ arm_force_fast_fault_internal(
                if (*pte_p == ARM_PTE_EMPTY) {
                        panic("pte is NULL: pte_p=%p ppnum=0x%x\n", pte_p, ppnum);
                }
-               if (ARM_PTE_IS_COMPRESSED(*pte_p)) {
+               if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
                        panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x\n", pte_p, ppnum);
                }
 
@@ -7794,7 +8048,7 @@ arm_force_fast_fault_internal(
                                }
                        } else {
                                if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWRW)) {
-                                       tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RORO));
+                                       tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pmap_get_pt_attr(pmap)));
                                        pte_set_was_writeable(tmplate, true);
                                        update_pte = TRUE;
                                        mod_fault = TRUE;
@@ -7805,9 +8059,9 @@ arm_force_fast_fault_internal(
 
                if (update_pte) {
                        if (*pte_p != ARM_PTE_TYPE_FAULT &&
-                           !ARM_PTE_IS_COMPRESSED(*pte_p)) {
+                           !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
                                WRITE_PTE_STRONG(pte_p, tmplate);
-                               flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap);
+                               pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, PAGE_SIZE, pmap);
                                tlb_flush_needed = TRUE;
                        } else {
                                WRITE_PTE(pte_p, tmplate);
@@ -7990,7 +8244,7 @@ arm_clear_fast_fault(
                                if (pmap == kernel_pmap) {
                                        tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
                                } else {
-                                       tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWRW));
+                                       tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
                                }
                        }
 
@@ -8010,7 +8264,7 @@ arm_clear_fast_fault(
                if (spte != tmplate) {
                        if (spte != ARM_PTE_TYPE_FAULT) {
                                WRITE_PTE_STRONG(pte_p, tmplate);
-                               flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap);
+                               pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, PAGE_SIZE, pmap);
                                tlb_flush_needed = TRUE;
                        } else {
                                WRITE_PTE(pte_p, tmplate);
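
Several hunks here route TLB maintenance through pmap_get_pt_ops(pmap)->flush_tlb_region_async() instead of calling flush_mmu_tlb_region_asid_async() directly, so pmaps with different page-table formats can supply their own maintenance routines behind a single call site. A minimal sketch of that ops-table indirection; the types and the "native" instance are hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t vm_addr_t;
    struct pmap;

    /* Per-format operations, selected once per pmap. */
    struct pt_ops {
        void (*flush_tlb_region_async)(vm_addr_t va, unsigned size,
            struct pmap *p);
    };

    struct pmap {
        const struct pt_ops *ops;
    };

    static void native_flush(vm_addr_t va, unsigned size, struct pmap *p)
    {
        (void)p;
        printf("native flush [0x%llx, +%u)\n", (unsigned long long)va, size);
    }

    static const struct pt_ops native_ops = {
        .flush_tlb_region_async = native_flush,
    };

    int main(void)
    {
        struct pmap pm = { .ops = &native_ops };
        /* Call site mirrors pmap_get_pt_ops(pmap)->flush_tlb_region_async(). */
        pm.ops->flush_tlb_region_async(0x4000, 4096, &pm);
        return 0;
    }
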
@@ -8052,14 +8306,14 @@ arm_fast_fault_internal(
        pmap_t pmap,
        vm_map_address_t va,
        vm_prot_t fault_type,
-       __unused boolean_t from_user)
+       __unused bool was_af_fault,
+       __unused bool from_user)
 {
        kern_return_t   result = KERN_FAILURE;
        pt_entry_t     *ptep;
        pt_entry_t      spte = ARM_PTE_TYPE_FAULT;
        int             pai;
        pmap_paddr_t    pa;
-
        VALIDATE_PMAP(pmap);
 
        PMAP_LOCK(pmap);
@@ -8071,22 +8325,25 @@ arm_fast_fault_internal(
 
        ptep = pmap_pte(pmap, va);
        if (ptep != PT_ENTRY_NULL) {
-               spte = *ptep;
+               while (true) {
+                       spte = *ptep;
 
-               pa = pte_to_pa(spte);
+                       pa = pte_to_pa(spte);
 
-               if ((spte == ARM_PTE_TYPE_FAULT) ||
-                   ARM_PTE_IS_COMPRESSED(spte)) {
-                       PMAP_UNLOCK(pmap);
-                       return result;
-               }
+                       if ((spte == ARM_PTE_TYPE_FAULT) ||
+                           ARM_PTE_IS_COMPRESSED(spte, ptep)) {
+                               PMAP_UNLOCK(pmap);
+                               return result;
+                       }
 
-               if (!pa_valid(pa)) {
-                       PMAP_UNLOCK(pmap);
-                       return result;
+                       if (!pa_valid(pa)) {
+                               PMAP_UNLOCK(pmap);
+                               return result;
+                       }
+                       pai = (int)pa_index(pa);
+                       LOCK_PVH(pai);
+                       break;
                }
-               pai = (int)pa_index(pa);
-               LOCK_PVH(pai);
        } else {
                PMAP_UNLOCK(pmap);
                return result;
@@ -8132,7 +8389,8 @@ arm_fast_fault(
        pmap_t pmap,
        vm_map_address_t va,
        vm_prot_t fault_type,
-       __unused boolean_t from_user)
+       bool was_af_fault,
+       __unused bool from_user)
 {
        kern_return_t   result = KERN_FAILURE;
 
@@ -8163,7 +8421,7 @@ arm_fast_fault(
        }
 #endif
 
-       result = arm_fast_fault_internal(pmap, va, fault_type, from_user);
+       result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
 
 #if (__ARM_VMSA__ == 7)
 done:
@@ -8260,7 +8518,7 @@ pmap_map_globals(
 #endif
        *ptep = pte;
        FLUSH_PTE_RANGE(ptep, (ptep + 1));
-       PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
+       PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false);
 }
 
 vm_offset_t
@@ -8279,16 +8537,19 @@ pmap_map_cpu_windows_copy_internal(
        unsigned int wimg_bits)
 {
        pt_entry_t      *ptep = NULL, pte;
+       pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
        unsigned int    cpu_num;
        unsigned int    i;
        vm_offset_t     cpu_copywindow_vaddr = 0;
+       bool            need_strong_sync = false;
 
-       cpu_num = pmap_get_cpu_data()->cpu_number;
+
+       cpu_num = pmap_cpu_data->cpu_number;
 
        for (i = 0; i < CPUWINDOWS_MAX; i++) {
                cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
                ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
-               assert(!ARM_PTE_IS_COMPRESSED(*ptep));
+               assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
                if (*ptep == ARM_PTE_TYPE_FAULT) {
                        break;
                }
@@ -8316,7 +8577,8 @@ pmap_map_cpu_windows_copy_internal(
         * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
         */
        FLUSH_PTE_STRONG(ptep);
-       PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE);
+       PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i]);
+       pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;
 
        return i;
 }
@@ -8337,8 +8599,9 @@ pmap_unmap_cpu_windows_copy_internal(
        pt_entry_t      *ptep;
        unsigned int    cpu_num;
        vm_offset_t     cpu_copywindow_vaddr = 0;
+       pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
 
-       cpu_num = pmap_get_cpu_data()->cpu_number;
+       cpu_num = pmap_cpu_data->cpu_number;
 
        cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
        /* Issue full-system DSB to ensure prior operations on the per-CPU window
@@ -8347,7 +8610,7 @@ pmap_unmap_cpu_windows_copy_internal(
        __builtin_arm_dsb(DSB_SY);
        ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
        WRITE_PTE_STRONG(ptep, ARM_PTE_TYPE_FAULT);
-       PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE);
+       PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index]);
 }
 
 void
@@ -8400,6 +8663,7 @@ pmap_trim_range(
        addr64_t adjust_offmask;
        tt_entry_t * tte_p;
        pt_entry_t * pte_p;
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
 
        if (__improbable(end < start)) {
                panic("%s: invalid address range, "
@@ -8419,19 +8683,13 @@ pmap_trim_range(
        }
 
        /* Contract the range to TT page boundaries. */
-#if (__ARM_VMSA__ > 7)
-       adjust_offmask = ARM_TT_TWIG_OFFMASK;
-#else /* (__ARM_VMSA__ > 7) */
-       adjust_offmask = ((ARM_TT_TWIG_SIZE * 4) - 1);
-#endif /* (__ARM_VMSA__ > 7) */
-
+       adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
        adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
        adjusted_end = end & ~adjust_offmask;
+       bool modified = false;
 
        /* Iterate over the range, trying to remove TTEs. */
-       for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += ARM_TT_TWIG_SIZE) {
-               bool modified = false;
-
+       for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
                PMAP_LOCK(pmap);
 
                tte_p = pmap_tte(pmap, cur);
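
pmap_trim_range() first contracts [start, end) inward to whole leaf-table boundaries, because only translation-table pages wholly outside the kept region can be trimmed: the start rounds up and the end rounds down by the table's offset mask. A worked sketch of the contraction with an illustrative 2MB mask:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Illustrative leaf-table span: 2MB (pt_attr_leaf_table_offmask). */
        const uint64_t offmask = 0x1FFFFF;

        uint64_t start = 0x30A000, end = 0x9F0000;
        uint64_t adjusted_start = (start + offmask) & ~offmask; /* round up   */
        uint64_t adjusted_end   = end & ~offmask;               /* round down */

        /* 0x30A000 -> 0x400000, 0x9F0000 -> 0x800000: only the fully
         * covered tables in [0x400000, 0x800000) are candidates. */
        printf("[0x%" PRIx64 ", 0x%" PRIx64 ")\n", adjusted_start, adjusted_end);
        return 0;
    }
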
@@ -8443,43 +8701,27 @@ pmap_trim_range(
                if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
                        pte_p = (pt_entry_t *) ttetokv(*tte_p);
 
-#if (__ARM_VMSA__ == 7)
-                       if ((ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) &&
-                           (pmap != kernel_pmap)) {
-                               if (pmap->nested == TRUE) {
-                                       /* Deallocate for the nested map. */
-                                       pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L1_LEVEL);
-                               } else {
-                                       /* Just remove for the parent map. */
-                                       pmap_tte_remove(pmap, tte_p, PMAP_TT_L1_LEVEL);
-                               }
-
-                               flush_mmu_tlb_entry((cur & ~ARM_TT_L1_OFFMASK) | (pmap->asid & 0xff));
-                               modified = true;
-                       }
-#else
-                       if ((ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) &&
+                       if ((ptep_get_ptd(pte_p)->ptd_info[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) &&
                            (pmap != kernel_pmap)) {
                                if (pmap->nested == TRUE) {
                                        /* Deallocate for the nested map. */
-                                       pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L2_LEVEL);
+                                       pmap_tte_deallocate(pmap, tte_p, pt_attr_twig_level(pt_attr));
                                } else {
                                        /* Just remove for the parent map. */
-                                       pmap_tte_remove(pmap, tte_p, PMAP_TT_L2_LEVEL);
+                                       pmap_tte_remove(pmap, tte_p, pt_attr_twig_level(pt_attr));
                                }
 
-                               flush_mmu_tlb_entry(tlbi_addr(cur & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
+                               pmap_get_pt_ops(pmap)->flush_tlb_tte_async(cur, pmap);
                                modified = true;
                        }
-#endif
                }
 
 done:
                PMAP_UNLOCK(pmap);
+       }
 
-               if (modified) {
-                       PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE);
-               }
+       if (modified) {
+               sync_tlb_flush();
        }
 
 #if (__ARM_VMSA__ > 7)
@@ -8525,7 +8767,7 @@ done:
 
                if (remove_tt1e) {
                        pmap_tte_deallocate(pmap, tt1e_p, PMAP_TT_L1_LEVEL);
-                       PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE);
+                       PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE, false);
                }
 
                PMAP_UNLOCK(pmap);
@@ -8571,6 +8813,8 @@ pmap_trim_internal(
        VALIDATE_PMAP(grand);
        VALIDATE_PMAP(subord);
 
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
+
        PMAP_LOCK(subord);
 
        if (!subord->nested) {
@@ -8621,11 +8865,7 @@ pmap_trim_internal(
        }
 
        if ((!subord->nested_bounds_set) && size) {
-#if (__ARM_VMSA__ > 7)
-               adjust_offmask = ARM_TT_TWIG_OFFMASK;
-#else /* (__ARM_VMSA__ > 7) */
-               adjust_offmask = ((ARM_TT_TWIG_SIZE * 4) - 1);
-#endif /* (__ARM_VMSA__ > 7) */
+               adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
 
                subord->nested_region_true_start = nstart;
                subord->nested_region_true_end = nend;
@@ -8729,6 +8969,7 @@ pmap_trim(
        pmap_trim_internal(grand, subord, vstart, nstart, size);
 }
 
+
 /*
  *     kern_return_t pmap_nest(grand, subord, vstart, size)
  *
@@ -8767,19 +9008,17 @@ pmap_nest_internal(
        if (__improbable(os_add_overflow(nstart, size, &nend))) {
                panic("%s: %p nested addr wraps around: 0x%llx + 0x%llx", __func__, subord, nstart, size);
        }
+
        VALIDATE_PMAP(grand);
        VALIDATE_PMAP(subord);
 
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
+       assert(pmap_get_pt_attr(subord) == pt_attr);
 
-#if     (__ARM_VMSA__ == 7)
-       if (((size | vstart | nstart) & ARM_TT_L1_PT_OFFMASK) != 0x0ULL) {
-               return KERN_INVALID_VALUE;      /* Nest 4MB region */
-       }
-#else
-       if (((size | vstart | nstart) & (ARM_TT_L2_OFFMASK)) != 0x0ULL) {
+
+       if (((size | vstart | nstart) & (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL) {
                panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx\n", grand, vstart, nstart, size);
        }
-#endif
 
        if (!subord->nested) {
                panic("%s: subordinate pmap %p is not nestable", __func__, subord);
@@ -8790,7 +9029,7 @@ pmap_nest_internal(
        }
 
        if (subord->nested_region_asid_bitmap == NULL) {
-               nested_region_asid_bitmap_size  = (unsigned int)(size >> ARM_TT_TWIG_SHIFT) / (sizeof(unsigned int) * NBBY);
+               nested_region_asid_bitmap_size  = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY);
 
                nested_region_asid_bitmap = kalloc(nested_region_asid_bitmap_size * sizeof(unsigned int));
                bzero(nested_region_asid_bitmap, nested_region_asid_bitmap_size * sizeof(unsigned int));
@@ -8818,7 +9057,7 @@ pmap_nest_internal(
                new_size =  nend - subord->nested_region_subord_addr;
 
                /* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
-               new_nested_region_asid_bitmap_size  = (unsigned int)((new_size >> ARM_TT_TWIG_SHIFT) / (sizeof(unsigned int) * NBBY)) + 1;
+               new_nested_region_asid_bitmap_size  = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1;
 
                new_nested_region_asid_bitmap = kalloc(new_nested_region_asid_bitmap_size * sizeof(unsigned int));
                PMAP_LOCK(subord);
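
On the sizing arithmetic above: size >> twig_shift counts TTEs, and dividing by the bits in an unsigned int floors, so a region whose TTE count is not a multiple of 32 would otherwise have no bitmap bit for its tail; the regrow path pads by one word, as the comment notes. A worked example with an invented region size:

#include <stdio.h>

#define NBBY 8 /* bits per byte, as in the pmap header */

int main(void)
{
	unsigned long long size = 0x8200000ULL; /* invented region: 65 twigs of 2MB */
	unsigned twig_shift = 21;               /* 4K granule: one twig TTE maps 2MB */

	unsigned ttes    = (unsigned)(size >> twig_shift);                 /* 65 */
	unsigned floored = ttes / (unsigned)(sizeof(unsigned int) * NBBY); /* 2 words = 64 bits: tail TTE lost */
	unsigned padded  = floored + 1;                                    /* 3 words cover all 65 */

	printf("ttes=%u floored=%u padded=%u\n", ttes, floored, padded);
	return 0;
}
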
@@ -8909,17 +9148,17 @@ expand_next:
 
 #else
        nvaddr = (vm_map_offset_t) nstart;
-       num_tte = (unsigned int)(size >> ARM_TT_L2_SHIFT);
+       num_tte = (unsigned int)(size >> pt_attr_twig_shift(pt_attr));
 
        for (i = 0; i < num_tte; i++) {
                if (((subord->nested_region_true_start) > nvaddr) || ((subord->nested_region_true_end) <= nvaddr)) {
                        goto expand_next;
                }
 
-               stte_p = pmap_tt2e(subord, nvaddr);
+               stte_p = pmap_tte(subord, nvaddr);
                if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
                        PMAP_UNLOCK(subord);
-                       kr = pmap_expand(subord, nvaddr, expand_options, PMAP_TT_L3_LEVEL);
+                       kr = pmap_expand(subord, nvaddr, expand_options, PMAP_TT_LEAF_LEVEL);
 
                        if (kr != KERN_SUCCESS) {
                                PMAP_LOCK(grand);
@@ -8929,7 +9168,7 @@ expand_next:
                        PMAP_LOCK(subord);
                }
 expand_next:
-               nvaddr += ARM_TT_L2_SIZE;
+               nvaddr += pt_attr_twig_size(pt_attr);
        }
 #endif
        PMAP_UNLOCK(subord);
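
The expand loop above shows the unlock-allocate-relock discipline: the table page cannot be allocated while the pmap lock is held, and the entry must be re-tested after relocking because another thread may have expanded in the window. A self-contained sketch of the same pattern, using a pthread mutex and calloc as stand-ins for the pmap lock and pmap_expand:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

static int ensure_entry(void **slot)
{
	pthread_mutex_lock(&map_lock);
	while (*slot == NULL) {
		pthread_mutex_unlock(&map_lock); /* cannot allocate under the lock */
		void *page = calloc(1, 4096);    /* stand-in for pmap_expand() */
		if (page == NULL) {
			return -1;               /* propagate a KERN_*-style failure */
		}
		pthread_mutex_lock(&map_lock);
		if (*slot == NULL) {
			*slot = page;            /* we won the race */
		} else {
			free(page);              /* someone else expanded first */
		}
	}
	pthread_mutex_unlock(&map_lock);
	return 0;
}

int main(void)
{
	void *slot = NULL;
	return ensure_entry(&slot);
}
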
@@ -8963,11 +9202,11 @@ nest_next:
                        goto nest_next;
                }
 
-               stte_p = pmap_tt2e(subord, nvaddr);
-               gtte_p = pmap_tt2e(grand, vaddr);
+               stte_p = pmap_tte(subord, nvaddr);
+               gtte_p = pmap_tte(grand, vaddr);
                if (gtte_p == PT_ENTRY_NULL) {
                        PMAP_UNLOCK(grand);
-                       kr = pmap_expand(grand, vaddr, expand_options, PMAP_TT_L2_LEVEL);
+                       kr = pmap_expand(grand, vaddr, expand_options, PMAP_TT_TWIG_LEVEL);
                        PMAP_LOCK(grand);
 
                        if (kr != KERN_SUCCESS) {
@@ -8979,8 +9218,8 @@ nest_next:
                *gtte_p = *stte_p;
 
 nest_next:
-               vaddr += ARM_TT_L2_SIZE;
-               nvaddr += ARM_TT_L2_SIZE;
+               vaddr += pt_attr_twig_size(pt_attr);
+               nvaddr += pt_attr_twig_size(pt_attr);
        }
 #endif
 
@@ -8996,7 +9235,7 @@ done:
         */
        assert((size & 0xFFFFFFFF00000000ULL) == 0);
 #endif
-       PMAP_UPDATE_TLBS(grand, vstart, vend);
+       PMAP_UPDATE_TLBS(grand, vstart, vend, false);
 
        PMAP_UNLOCK(grand);
        return kr;
@@ -9064,15 +9303,11 @@ pmap_unnest_options_internal(
 
        VALIDATE_PMAP(grand);
 
-#if     (__ARM_VMSA__ == 7)
-       if (((size | vaddr) & ARM_TT_L1_PT_OFFMASK) != 0x0ULL) {
-               panic("pmap_unnest(): unaligned request\n");
-       }
-#else
-       if (((size | vaddr) & ARM_TT_L2_OFFMASK) != 0x0ULL) {
-               panic("pmap_unnest(): unaligned request\n");
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
+
+       if (((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
+               panic("pmap_unnest(): unaligned request");
        }
-#endif
 
        if ((option & PMAP_UNNEST_CLEAN) == 0) {
                if (grand->nested_pmap == NULL) {
@@ -9086,11 +9321,11 @@ pmap_unnest_options_internal(
                PMAP_LOCK(grand->nested_pmap);
 
                start = vaddr - grand->nested_region_grand_addr + grand->nested_region_subord_addr;
-               start_index = (unsigned int)((vaddr - grand->nested_region_grand_addr)  >> ARM_TT_TWIG_SHIFT);
-               max_index = (unsigned int)(start_index + (size >> ARM_TT_TWIG_SHIFT));
-               num_tte = (unsigned int)(size >> ARM_TT_TWIG_SHIFT);
+               start_index = (unsigned int)((vaddr - grand->nested_region_grand_addr)  >> pt_attr_twig_shift(pt_attr));
+               max_index = (unsigned int)(start_index + (size >> pt_attr_twig_shift(pt_attr)));
+               num_tte = (unsigned int)(size >> pt_attr_twig_shift(pt_attr));
 
-               for (current_index = start_index, addr = start; current_index < max_index; current_index++, addr += ARM_TT_TWIG_SIZE) {
+               for (current_index = start_index, addr = start; current_index < max_index; current_index++, addr += pt_attr_twig_size(pt_attr)) {
                        pt_entry_t  *bpte, *epte, *cpte;
 
                        if (addr < grand->nested_pmap->nested_region_true_start) {
@@ -9104,7 +9339,7 @@ pmap_unnest_options_internal(
                        }
 
                        bpte = pmap_pte(grand->nested_pmap, addr);
-                       epte = bpte + (ARM_TT_LEAF_INDEX_MASK >> ARM_TT_LEAF_SHIFT);
+                       epte = bpte + (pt_attr_leaf_index_mask(pt_attr) >> pt_attr_leaf_shift(pt_attr));
 
                        if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap)) {
                                setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap);
@@ -9116,7 +9351,7 @@ pmap_unnest_options_internal(
                                        pt_entry_t  spte;
 
                                        if ((*cpte != ARM_PTE_TYPE_FAULT)
-                                           && (!ARM_PTE_IS_COMPRESSED(*cpte))) {
+                                           && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
                                                spte = *cpte;
                                                while (!managed) {
                                                        pa = pte_to_pa(spte);
@@ -9163,9 +9398,9 @@ pmap_unnest_options_internal(
        start = vaddr;
        addr = vaddr;
 
-       num_tte = (unsigned int)(size >> ARM_TT_TWIG_SHIFT);
+       num_tte = (unsigned int)(size >> pt_attr_twig_shift(pt_attr));
 
-       for (i = 0; i < num_tte; i++, addr += ARM_TT_TWIG_SIZE) {
+       for (i = 0; i < num_tte; i++, addr += pt_attr_twig_size(pt_attr)) {
                if (addr < grand->nested_pmap->nested_region_true_start) {
                        /* We haven't reached the interesting range. */
                        continue;
@@ -9182,7 +9417,7 @@ pmap_unnest_options_internal(
 
        tte_p = pmap_tte(grand, start);
        FLUSH_PTE_RANGE_STRONG(tte_p, tte_p + num_tte);
-       PMAP_UPDATE_TLBS(grand, start, vend);
+       PMAP_UPDATE_TLBS(grand, start, vend, false);
 
        PMAP_UNLOCK(grand);
 
@@ -9267,8 +9502,14 @@ pt_fake_zone_info(
  * an ARM small page (4K).
  */
 
-#define ARM_FULL_TLB_FLUSH_THRESHOLD     64
+#define ARM_FULL_TLB_FLUSH_THRESHOLD 64
+
+#if __ARM_RANGE_TLBI__
+#define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
+#define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_16K_TLB_RANGE_PAGES
+#else
 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
+#endif // __ARM_RANGE_TLBI__
 
 static void
 flush_mmu_tlb_region_asid_async(
@@ -9280,7 +9521,7 @@ flush_mmu_tlb_region_asid_async(
        vm_offset_t     end = va + length;
        uint32_t        asid;
 
-       asid = pmap->asid;
+       asid = pmap->hw_asid;
 
        if (length / ARM_SMALL_PAGE_SIZE > ARM_FULL_TLB_FLUSH_THRESHOLD) {
                boolean_t       flush_all = FALSE;
@@ -9312,12 +9553,12 @@ flush_mmu_tlb_region_asid_async(
        flush_mmu_tlb_entries_async(va, end);
 
 #else
-       vm_offset_t             end = va + length;
-       uint32_t                asid;
+       unsigned    npages = length >> pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
+       uint32_t    asid;
 
-       asid = pmap->asid;
+       asid = pmap->hw_asid;
 
-       if ((length >> ARM_TT_L3_SHIFT) > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
+       if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
                boolean_t       flush_all = FALSE;
 
                if ((asid == 0) || (pmap->nested == TRUE)) {
@@ -9330,8 +9571,19 @@ flush_mmu_tlb_region_asid_async(
                }
                return;
        }
+#if __ARM_RANGE_TLBI__
+       if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
+               va = generate_rtlbi_param(npages, asid, va);
+               if (pmap->nested == TRUE) {
+                       flush_mmu_tlb_allrange_async(va);
+               } else {
+                       flush_mmu_tlb_range_async(va);
+               }
+               return;
+       }
+#endif
+       vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
        va = tlbi_asid(asid) | tlbi_addr(va);
-       end = tlbi_asid(asid) | tlbi_addr(end);
        if (pmap->nested == TRUE) {
                flush_mmu_tlb_allentries_async(va, end);
        } else {
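
The rewritten flush path now chooses among three strategies by page count: above the full-flush threshold, invalidate the whole ASID; otherwise, with __ARM_RANGE_TLBI__, issue one ranged TLBI via generate_rtlbi_param; otherwise fall back to per-entry invalidates. A sketch of that decision, with illustrative threshold values (the kernel's come from the ARM64_RANGE_TLB_FLUSH_THRESHOLD and ARM64_FULL_TLB_FLUSH_THRESHOLD definitions above):

#include <stdio.h>

enum flush_kind { FLUSH_PER_ENTRY, FLUSH_RANGE, FLUSH_FULL_ASID };

/* Pick the cheapest invalidation strategy for npages; thresholds invented. */
static enum flush_kind pick_flush(unsigned npages, int have_range_tlbi)
{
	const unsigned full_threshold  = 256; /* beyond this, a per-ASID flush wins */
	const unsigned range_threshold = 1;   /* beyond this, one ranged TLBI beats a loop */

	if (npages > full_threshold) {
		return FLUSH_FULL_ASID;
	}
	if (have_range_tlbi && npages > range_threshold) {
		return FLUSH_RANGE;
	}
	return FLUSH_PER_ENTRY;
}

int main(void)
{
	printf("%d %d %d\n",
	    pick_flush(1, 1),       /* per-entry */
	    pick_flush(64, 1),      /* ranged */
	    pick_flush(10000, 1));  /* full ASID */
	return 0;
}
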
@@ -9341,6 +9593,29 @@ flush_mmu_tlb_region_asid_async(
 #endif
 }
 
+MARK_AS_PMAP_TEXT static void
+flush_mmu_tlb_tte_asid_async(vm_offset_t va, pmap_t pmap)
+{
+#if     (__ARM_VMSA__ == 7)
+       flush_mmu_tlb_entry_async((va & ~ARM_TT_L1_PT_OFFMASK) | (pmap->hw_asid & 0xff));
+       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
+       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 2 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
+       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
+#else
+       flush_mmu_tlb_entry_async(tlbi_addr(va & ~pt_attr_twig_offmask(pmap_get_pt_attr(pmap))) | tlbi_asid(pmap->hw_asid));
+#endif
+}
+
+MARK_AS_PMAP_TEXT static void
+flush_mmu_tlb_full_asid_async(pmap_t pmap)
+{
+#if (__ARM_VMSA__ == 7)
+       flush_mmu_tlb_asid_async(pmap->hw_asid);
+#else /* (__ARM_VMSA__ == 7) */
+       flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT);
+#endif /* (__ARM_VMSA__ == 7) */
+}
+
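
These new helpers pack a virtual address and an ASID into a single TLBI operand via tlbi_addr()/tlbi_asid(), whose definitions this patch moves into the tlb headers. A standalone sketch using the ARMv8 TLBI VAE1-style encoding (VA[55:12] in operand bits [43:0], ASID in bits [63:48]); confirm the exact shifts against arm64/tlb.h:

#include <stdint.h>
#include <stdio.h>

#define TLBI_ADDR_SHIFT 12
#define TLBI_ADDR_MASK  ((1ULL << 44) - 1)
#define TLBI_ASID_SHIFT 48
#define TLBI_ASID_MASK  (0xFFFFULL << TLBI_ASID_SHIFT)

/* Page number of va goes in the low operand bits... */
static uint64_t tlbi_addr(uint64_t va)   { return (va >> TLBI_ADDR_SHIFT) & TLBI_ADDR_MASK; }
/* ...and the ASID occupies the top 16 bits. */
static uint64_t tlbi_asid(uint64_t asid) { return (asid << TLBI_ASID_SHIFT) & TLBI_ASID_MASK; }

int main(void)
{
	uint64_t op = tlbi_asid(0x42) | tlbi_addr(0x0000000123456000ULL);
	printf("TLBI operand: 0x%016llx\n", (unsigned long long)op);
	return 0;
}
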
 void
 flush_mmu_tlb_region(
        vm_offset_t va,
@@ -9350,18 +9625,21 @@ flush_mmu_tlb_region(
        sync_tlb_flush();
 }
 
-static unsigned int
+static pmap_io_range_t*
 pmap_find_io_attr(pmap_paddr_t paddr)
 {
-       pmap_io_range_t find_range = {.addr = paddr, .len = PAGE_SIZE};
+       pmap_io_range_t find_range = {.addr = paddr & ~PAGE_MASK, .len = PAGE_SIZE};
        unsigned int begin = 0, end = num_io_rgns - 1;
-       assert(num_io_rgns > 0);
+       if ((num_io_rgns == 0) || (paddr < io_attr_table[begin].addr) ||
+           (paddr >= (io_attr_table[end].addr + io_attr_table[end].len))) {
+               return NULL;
+       }
 
        for (;;) {
                unsigned int middle = (begin + end) / 2;
                int cmp = cmp_io_rgns(&find_range, &io_attr_table[middle]);
                if (cmp == 0) {
-                       return io_attr_table[middle].wimg;
+                       return &io_attr_table[middle];
                } else if (begin == end) {
                        break;
                } else if (cmp > 0) {
@@ -9370,9 +9648,8 @@ pmap_find_io_attr(pmap_paddr_t paddr)
                        end = middle;
                }
        }
-       ;
 
-       return VM_WIMG_IO;
+       return NULL;
 }
 
 unsigned int
@@ -9386,21 +9663,11 @@ pmap_cache_attributes(
 
        paddr = ptoa(pn);
 
-       if ((paddr >= io_rgn_start) && (paddr < io_rgn_end)) {
-               return pmap_find_io_attr(paddr);
-       }
-
-       if (!pmap_initialized) {
-               if ((paddr >= gPhysBase) && (paddr < gPhysBase + gPhysSize)) {
-                       return VM_WIMG_DEFAULT;
-               } else {
-                       return VM_WIMG_IO;
-               }
-       }
-
+       assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
 
        if (!pa_valid(paddr)) {
-               return VM_WIMG_IO;
+               pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
+               return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
        }
 
        result = VM_WIMG_DEFAULT;
@@ -9572,7 +9839,7 @@ pmap_batch_set_cache_attributes_internal(
        }
 
        return TRUE;
-};
+}
 
 boolean_t
 pmap_batch_set_cache_attributes(
@@ -9656,7 +9923,7 @@ pmap_set_cache_attributes(
        pmap_set_cache_attributes_internal(pn, cacheattr);
 }
 
-void
+MARK_AS_PMAP_TEXT void
 pmap_update_cache_attributes_locked(
        ppnum_t ppnum,
        unsigned attributes)
@@ -9669,6 +9936,8 @@ pmap_update_cache_attributes_locked(
        unsigned int    pai;
        boolean_t       tlb_flush_needed = FALSE;
 
+       PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
+
 #if __ARM_PTE_PHYSMAP__
        vm_offset_t kva = phystokv(phys);
        pte_p = pmap_pte(kernel_pmap, kva);
@@ -9717,10 +9986,10 @@ pmap_update_cache_attributes_locked(
 
                tmplate = *pte_p;
                tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
-               tmplate |= wimg_to_pte(attributes);
+               tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes);
 
                WRITE_PTE_STRONG(pte_p, tmplate);
-               flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap);
+               pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, PAGE_SIZE, pmap);
                tlb_flush_needed = TRUE;
 
 #ifdef PVH_FLAG_IOMMU
@@ -9734,6 +10003,8 @@ cache_skip_pve:
        if (tlb_flush_needed) {
                sync_tlb_flush();
        }
+
+       PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
 }
 
 #if     (__ARM_VMSA__ == 7)
@@ -9822,7 +10093,7 @@ pmap_create_sharedpage(
         * Note that we update parameters of the entry for our unique needs (NG
         * entry, etc.).
         */
-       sharedpage_pmap = pmap_create(NULL, 0x0, FALSE);
+       sharedpage_pmap = pmap_create_options(NULL, 0x0, 0);
        assert(sharedpage_pmap != NULL);
 
        /* The user 64-bit mapping... */
@@ -9843,7 +10114,7 @@ pmap_create_sharedpage(
  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
  * with user controlled TTEs.
  */
-#if (ARM_PGSHIFT == 14) || __ARM64_TWO_LEVEL_PMAP__
+#if (ARM_PGSHIFT == 14)
 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= MACH_VM_MAX_ADDRESS);
 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
 #elif (ARM_PGSHIFT == 12)
@@ -9884,9 +10155,6 @@ pmap_insert_sharedpage_internal(
         * order to nest.
         */
 #if (ARM_PGSHIFT == 12)
-#if __ARM64_TWO_LEVEL_PMAP__
-#error A two level page table with a page shift of 12 is not currently supported
-#endif
        (void)options;
 
        /* Just slam in the L1 entry.  */
@@ -9898,7 +10166,6 @@ pmap_insert_sharedpage_internal(
 
        src_ttep = pmap_tt1e(sharedpage_pmap, sharedpage_vaddr);
 #elif (ARM_PGSHIFT == 14)
-#if !__ARM64_TWO_LEVEL_PMAP__
        /* Allocate for the L2 entry if necessary, and slam it into place. */
        /*
	 * As long as we are using a three level page table, the first level
@@ -9917,7 +10184,6 @@ pmap_insert_sharedpage_internal(
 
                PMAP_LOCK(pmap);
        }
-#endif
 
        ttep = pmap_tt2e(pmap, sharedpage_vaddr);
 
@@ -9934,10 +10200,10 @@ pmap_insert_sharedpage_internal(
        /* TODO: Should we flush in the 64-bit case? */
        flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, kernel_pmap);
 
-#if (ARM_PGSHIFT == 12) && !__ARM64_TWO_LEVEL_PMAP__
-       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid));
+#if (ARM_PGSHIFT == 12)
+       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->hw_asid));
 #elif (ARM_PGSHIFT == 14)
-       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
+       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->hw_asid));
 #endif
        sync_tlb_flush();
 
@@ -9964,9 +10230,6 @@ pmap_unmap_sharedpage(
        }
 
 #if (ARM_PGSHIFT == 12)
-#if __ARM64_TWO_LEVEL_PMAP__
-#error A two level page table with a page shift of 12 is not currently supported
-#endif
        ttep = pmap_tt1e(pmap, sharedpage_vaddr);
 
        if (ttep == NULL) {
@@ -9994,12 +10257,9 @@ pmap_unmap_sharedpage(
        flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, kernel_pmap);
 
 #if (ARM_PGSHIFT == 12)
-#if __ARM64_TWO_LEVEL_PMAP__
-#error A two level page table with a page shift of 12 is not currently supported
-#endif
-       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid));
+       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->hw_asid));
 #elif (ARM_PGSHIFT == 14)
-       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
+       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->hw_asid));
 #endif
        sync_tlb_flush();
 }
@@ -10045,69 +10305,44 @@ pmap_is_empty_internal(
 
        VALIDATE_PMAP(pmap);
 
-       if ((pmap != kernel_pmap) && (not_in_kdp)) {
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+       unsigned int initial_not_in_kdp = not_in_kdp;
+
+       if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
                PMAP_LOCK(pmap);
        }
 
 #if     (__ARM_VMSA__ == 7)
-       if (tte_index(pmap, va_end) >= pmap->tte_index_max) {
-               if ((pmap != kernel_pmap) && (not_in_kdp)) {
+       if (tte_index(pmap, pt_attr, va_end) >= pmap->tte_index_max) {
+               if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
                        PMAP_UNLOCK(pmap);
                }
                return TRUE;
        }
+#endif
 
-       block_start = va_start;
-       tte_p = pmap_tte(pmap, block_start);
-       while (block_start < va_end) {
-               block_end = (block_start + ARM_TT_L1_SIZE) & ~(ARM_TT_L1_OFFMASK);
-               if (block_end > va_end) {
-                       block_end = va_end;
-               }
-
-               if ((*tte_p & ARM_TTE_TYPE_MASK) != 0) {
-                       vm_map_offset_t offset;
-                       ppnum_t phys_page = 0;
-
-                       for (offset = block_start;
-                           offset < block_end;
-                           offset += ARM_PGBYTES) {
-                               // This does a pmap_find_phys() lookup but assumes lock is held
-                               phys_page = pmap_vtophys(pmap, offset);
-                               if (phys_page) {
-                                       if ((pmap != kernel_pmap) && (not_in_kdp)) {
-                                               PMAP_UNLOCK(pmap);
-                                       }
-                                       return FALSE;
-                               }
-                       }
-               }
-
-               block_start = block_end;
-               tte_p++;
-       }
-#else
+       /* TODO: This will be faster if we increment ttep at each level. */
        block_start = va_start;
 
        while (block_start < va_end) {
                pt_entry_t     *bpte_p, *epte_p;
                pt_entry_t     *pte_p;
 
-               block_end = (block_start + ARM_TT_L2_SIZE) & ~ARM_TT_L2_OFFMASK;
+               block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
                if (block_end > va_end) {
                        block_end = va_end;
                }
 
-               tte_p = pmap_tt2e(pmap, block_start);
+               tte_p = pmap_tte(pmap, block_start);
                if ((tte_p != PT_ENTRY_NULL)
                    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
                        pte_p = (pt_entry_t *) ttetokv(*tte_p);
-                       bpte_p = &pte_p[tt3_index(pmap, block_start)];
-                       epte_p = bpte_p + (((block_end - block_start) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT);
+                       bpte_p = &pte_p[pte_index(pmap, pt_attr, block_start)];
+                       epte_p = &pte_p[pte_index(pmap, pt_attr, block_end)];
 
                        for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
                                if (*pte_p != ARM_PTE_EMPTY) {
-                                       if ((pmap != kernel_pmap) && (not_in_kdp)) {
+                                       if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
                                                PMAP_UNLOCK(pmap);
                                        }
                                        return FALSE;
@@ -10116,9 +10351,8 @@ pmap_is_empty_internal(
                }
                block_start = block_end;
        }
-#endif
 
-       if ((pmap != kernel_pmap) && (not_in_kdp)) {
+       if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
                PMAP_UNLOCK(pmap);
        }
 
@@ -10308,18 +10542,13 @@ pmap_query_resident_internal(
                return PMAP_RESIDENT_INVALID;
        }
        if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
-#if     (__ARM_VMSA__ == 7)
-               pte_p = (pt_entry_t *) ttetokv(*tte_p);
-               bpte = &pte_p[ptenum(start)];
-               epte = bpte + atop(end - start);
-#else
+               __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
                pte_p = (pt_entry_t *) ttetokv(*tte_p);
-               bpte = &pte_p[tt3_index(pmap, start)];
-               epte = bpte + ((end - start) >> ARM_TT_L3_SHIFT);
-#endif
+               bpte = &pte_p[pte_index(pmap, pt_attr, start)];
+               epte = &pte_p[pte_index(pmap, pt_attr, end)];
 
                for (; bpte < epte; bpte++) {
-                       if (ARM_PTE_IS_COMPRESSED(*bpte)) {
+                       if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
                                compressed_bytes += ARM_PGBYTES;
                        } else if (pa_valid(pte_to_pa(*bpte))) {
                                resident_bytes += ARM_PGBYTES;
@@ -10356,6 +10585,8 @@ pmap_query_resident(
                return 0;
        }
 
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
        total_resident_bytes = 0;
        compressed_bytes = 0;
 
@@ -10368,7 +10599,7 @@ pmap_query_resident(
                vm_map_address_t l;
                mach_vm_size_t resident_bytes;
 
-               l = ((va + ARM_TT_TWIG_SIZE) & ~ARM_TT_TWIG_OFFMASK);
+               l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
 
                if (l > end) {
                        l = end;
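
The chunking idiom here advances va to the next twig-aligned boundary on each pass and clamps the final partial chunk to end, so every iteration touches at most one page table's worth of address space. Standalone, with an illustrative chunk size:

#include <stdio.h>

int main(void)
{
	unsigned long va = 0x1f000, end = 0x45000;
	unsigned long twig_size = 0x20000; /* illustrative: 128K per chunk */
	unsigned long offmask = twig_size - 1;

	while (va < end) {
		unsigned long l = (va + twig_size) & ~offmask; /* next boundary above va */
		if (l > end) {
			l = end;                               /* clamp the last chunk */
		}
		printf("chunk [0x%lx, 0x%lx)\n", va, l);       /* process one table's worth */
		va = l;
	}
	return 0;
}
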
@@ -10398,10 +10629,8 @@ static void
 pmap_check_ledgers(
        pmap_t pmap)
 {
-       ledger_amount_t bal;
-       int             pid;
-       char            *procname;
-       boolean_t       do_panic;
+       int     pid;
+       char    *procname;
 
        if (pmap->pmap_pid == 0) {
                /*
@@ -10419,73 +10648,10 @@ pmap_check_ledgers(
                return;
        }
 
-       do_panic = FALSE;
        pid = pmap->pmap_pid;
        procname = pmap->pmap_procname;
 
-       pmap_ledgers_drift.num_pmaps_checked++;
-
-#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
-MACRO_BEGIN                                                             \
-       int panic_on_negative = TRUE;                                   \
-       ledger_get_balance(pmap->ledger,                                \
-                          task_ledgers.__LEDGER,                       \
-                          &bal);                                       \
-       ledger_get_panic_on_negative(pmap->ledger,                      \
-                                    task_ledgers.__LEDGER,             \
-                                    &panic_on_negative);               \
-       if (bal != 0) {                                                 \
-               if (panic_on_negative ||                                \
-                   (pmap_ledgers_panic &&                              \
-                    pmap_ledgers_panic_leeway > 0 &&                   \
-                    (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
-                     bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
-                       do_panic = TRUE;                                \
-               }                                                       \
-               printf("LEDGER BALANCE proc %d (%s) "                   \
-                      "\"%s\" = %lld\n",                               \
-                      pid, procname, #__LEDGER, bal);                  \
-               if (bal > 0) {                                          \
-                       pmap_ledgers_drift.__LEDGER##_over++;           \
-                       pmap_ledgers_drift.__LEDGER##_over_total += bal; \
-                       if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
-                               pmap_ledgers_drift.__LEDGER##_over_max = bal; \
-                       }                                               \
-               } else if (bal < 0) {                                   \
-                       pmap_ledgers_drift.__LEDGER##_under++;          \
-                       pmap_ledgers_drift.__LEDGER##_under_total += bal; \
-                       if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
-                               pmap_ledgers_drift.__LEDGER##_under_max = bal; \
-                       }                                               \
-               }                                                       \
-       }                                                               \
-MACRO_END
-
-       LEDGER_CHECK_BALANCE(phys_footprint);
-       LEDGER_CHECK_BALANCE(internal);
-       LEDGER_CHECK_BALANCE(internal_compressed);
-       LEDGER_CHECK_BALANCE(iokit_mapped);
-       LEDGER_CHECK_BALANCE(alternate_accounting);
-       LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
-       LEDGER_CHECK_BALANCE(page_table);
-       LEDGER_CHECK_BALANCE(purgeable_volatile);
-       LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
-       LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
-       LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
-       LEDGER_CHECK_BALANCE(network_volatile);
-       LEDGER_CHECK_BALANCE(network_nonvolatile);
-       LEDGER_CHECK_BALANCE(network_volatile_compressed);
-       LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
-
-       if (do_panic) {
-               if (pmap_ledgers_panic) {
-                       panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
-                           pmap, pid, procname);
-               } else {
-                       printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
-                           pmap, pid, procname);
-               }
-       }
+       vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
 
        PMAP_STATS_ASSERTF(pmap->stats.resident_count == 0, pmap, "stats.resident_count %d", pmap->stats.resident_count);
 #if 00
@@ -10708,7 +10874,7 @@ pmap_pgtrace_enter_clone(pmap_t pmap, vm_map_offset_t va_page, vm_map_offset_t s
                } else {
                        PGTRACE_WRITE_PTE(cptep, *ptep);
                }
-               PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES);
+               PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES, false);
        }
 
        // get ptes for original and clone
@@ -10717,7 +10883,7 @@ pmap_pgtrace_enter_clone(pmap_t pmap, vm_map_offset_t va_page, vm_map_offset_t s
 
        // invalidate original pte and mark it as a pgtrace page
        PGTRACE_WRITE_PTE(ptep, (*ptep | ARM_PTE_PGTRACE) & ~ARM_PTE_TYPE_VALID);
-       PMAP_UPDATE_TLBS(pmap, map->ova, map->ova + ARM_PGBYTES);
+       PMAP_UPDATE_TLBS(pmap, map->ova, map->ova + ARM_PGBYTES, false);
 
        map->cloned = true;
        p->state = DEFINED;
@@ -10766,14 +10932,14 @@ pmap_pgtrace_remove_clone(pmap_t pmap, pmap_paddr_t pa, vm_map_offset_t va)
                ptep = pmap_pte(pmap, map->ova);
                assert(ptep);
                PGTRACE_WRITE_PTE(ptep, *ptep | ARM_PTE_TYPE_VALID);
-               PMAP_UPDATE_TLBS(pmap, va, va + ARM_PGBYTES);
+               PMAP_UPDATE_TLBS(pmap, va, va + ARM_PGBYTES, false);
 
                // revert clone pages
                for (int i = 0; i < 3; i++) {
                        ptep = pmap_pte(kernel_pmap, map->cva[i]);
                        assert(ptep != NULL);
                        PGTRACE_WRITE_PTE(ptep, map->cva_spte[i]);
-                       PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES);
+                       PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES, false);
                }
        }
 
@@ -10828,14 +10994,14 @@ pmap_pgtrace_remove_all_clone(pmap_paddr_t pa)
                        ptep = pmap_pte(map->pmap, map->ova);
                        assert(ptep);
                        PGTRACE_WRITE_PTE(ptep, *ptep | ARM_PTE_TYPE_VALID);
-                       PMAP_UPDATE_TLBS(map->pmap, map->ova, map->ova + ARM_PGBYTES);
+                       PMAP_UPDATE_TLBS(map->pmap, map->ova, map->ova + ARM_PGBYTES, false);
 
                        // revert clone ptes
                        for (int i = 0; i < 3; i++) {
                                ptep = pmap_pte(kernel_pmap, map->cva[i]);
                                assert(ptep != NULL);
                                PGTRACE_WRITE_PTE(ptep, map->cva_spte[i]);
-                               PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES);
+                               PMAP_UPDATE_TLBS(kernel_pmap, map->cva[i], map->cva[i] + ARM_PGBYTES, false);
                        }
 
                        PMAP_UNLOCK(map->pmap);
@@ -10894,6 +11060,7 @@ pmap_pgtrace_clone_from_pa(pmap_t pmap, pmap_paddr_t pa, vm_map_offset_t start_o
        pt_entry_t *ptep;
        tt_entry_t *ttep;
        tt_entry_t tte;
+       __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
 
        pmap_pgtrace_get_search_space(pmap, &min, &max);
 
@@ -10912,12 +11079,6 @@ pmap_pgtrace_clone_from_pa(pmap_t pmap, pmap_paddr_t pa, vm_map_offset_t start_o
                        goto unlock_continue;
                }
 
-#if __ARM64_TWO_LEVEL_PMAP__
-               // check whether we can skip l2
-               ttep = pmap_tt2e(pmap, cur_page);
-               assert(ttep);
-               tte = *ttep;
-#else
                // check whether we can skip l1
                ttep = pmap_tt1e(pmap, cur_page);
                assert(ttep);
@@ -10928,15 +11089,15 @@ pmap_pgtrace_clone_from_pa(pmap_t pmap, pmap_paddr_t pa, vm_map_offset_t start_o
                }
 
                // how about l2
-               tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, cur_page)];
-#endif
+               tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, pt_attr, cur_page)];
+
                if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) != (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID)) {
                        add = ARM_TT_L2_SIZE;
                        goto unlock_continue;
                }
 
                // ptep finally
-               ptep = &(((pt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt3_index(pmap, cur_page)]);
+               ptep = &(((pt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt3_index(pmap, pt_attr, cur_page)]);
                if (ptep == PT_ENTRY_NULL) {
                        add = ARM_TT_L3_SIZE;
                        goto unlock_continue;
@@ -11382,7 +11543,7 @@ pmap_pgtrace_fault(pmap_t pmap, vm_map_offset_t va, arm_saved_state_t *ss)
        } else if ((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE_VALID) {
                // Somehow this cpu's tlb has not updated
                kprintf("%s Somehow this cpu's tlb has not updated?\n", __func__);
-               PMAP_UPDATE_TLBS(pmap, va, va + ARM_PGBYTES);
+               PMAP_UPDATE_TLBS(pmap, va, va + ARM_PGBYTES, false);
 
                PMAP_PGTRACE_UNLOCK(&ints);
                return KERN_SUCCESS;
@@ -11437,7 +11598,7 @@ pmap_pgtrace_fault(pmap_t pmap, vm_map_offset_t va, arm_saved_state_t *ss)
        PMAP_PGTRACE_UNLOCK(&ints);
 
        // Return to next instruction
-       set_saved_state_pc(ss, get_saved_state_pc(ss) + sizeof(uint32_t));
+       add_saved_state_pc(ss, sizeof(uint32_t));
 
        return KERN_SUCCESS;
 }
@@ -11502,7 +11663,7 @@ pmap_query_page_info_internal(
 
        pa = pte_to_pa(*pte);
        if (pa == 0) {
-               if (ARM_PTE_IS_COMPRESSED(*pte)) {
+               if (ARM_PTE_IS_COMPRESSED(*pte, pte)) {
                        disp |= PMAP_QUERY_PAGE_COMPRESSED;
                        if (*pte & ARM_PTE_COMPRESSED_ALT) {
                                disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
@@ -11566,6 +11727,7 @@ pmap_return(boolean_t do_panic, boolean_t do_recurse)
 
 
 
+
 MARK_AS_PMAP_TEXT static void
 pmap_footprint_suspend_internal(
        vm_map_t        map,
@@ -11594,16 +11756,6 @@ pmap_footprint_suspend(
 
 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
 
-struct page_table_level_info {
-       uint64_t size;
-       uint64_t offmask;
-       uint64_t shift;
-       uint64_t index_mask;
-       uint64_t valid_mask;
-       uint64_t type_mask;
-       uint64_t type_block;
-};
-
 struct page_table_dump_header {
        uint64_t pa;
        uint64_t num_entries;
@@ -11611,14 +11763,9 @@ struct page_table_dump_header {
        uint64_t end_va;
 };
 
-struct page_table_level_info page_table_levels[] =
-{ { ARM_TT_L0_SIZE, ARM_TT_L0_OFFMASK, ARM_TT_L0_SHIFT, ARM_TT_L0_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK },
-  { ARM_TT_L1_SIZE, ARM_TT_L1_OFFMASK, ARM_TT_L1_SHIFT, ARM_TT_L1_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK },
-  { ARM_TT_L2_SIZE, ARM_TT_L2_OFFMASK, ARM_TT_L2_SHIFT, ARM_TT_L2_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK },
-  { ARM_TT_L3_SIZE, ARM_TT_L3_OFFMASK, ARM_TT_L3_SHIFT, ARM_TT_L3_INDEX_MASK, ARM_PTE_TYPE_VALID, ARM_PTE_TYPE_MASK, ARM_TTE_TYPE_L3BLOCK } };
-
 static size_t
-pmap_dump_page_tables_recurse(const tt_entry_t *ttp,
+pmap_dump_page_tables_recurse(pmap_t pmap,
+    const tt_entry_t *ttp,
     unsigned int cur_level,
     uint64_t start_va,
     void *bufp,
@@ -11626,10 +11773,12 @@ pmap_dump_page_tables_recurse(const tt_entry_t *ttp,
 {
        size_t bytes_used = 0;
        uint64_t num_entries = ARM_PGBYTES / sizeof(*ttp);
-       uint64_t size = page_table_levels[cur_level].size;
-       uint64_t valid_mask = page_table_levels[cur_level].valid_mask;
-       uint64_t type_mask = page_table_levels[cur_level].type_mask;
-       uint64_t type_block = page_table_levels[cur_level].type_block;
+       const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
+
+       uint64_t size = pt_attr->pta_level_info[cur_level].size;
+       uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
+       uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
+       uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
 
        if (cur_level == arm64_root_pgtable_level) {
                num_entries = arm64_root_pgtable_num_ttes;
@@ -11671,7 +11820,7 @@ pmap_dump_page_tables_recurse(const tt_entry_t *ttp,
 
                        const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
 
-                       size_t recurse_result = pmap_dump_page_tables_recurse(next_tt, cur_level + 1, current_va, (uint8_t*)bufp + bytes_used, buf_end);
+                       size_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1, current_va, (uint8_t*)bufp + bytes_used, buf_end);
 
                        if (recurse_result == 0) {
                                return 0;
@@ -11690,7 +11839,7 @@ pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end)
        if (not_in_kdp) {
                panic("pmap_dump_page_tables must only be called from kernel debugger context");
        }
-       return pmap_dump_page_tables_recurse(pmap->tte, arm64_root_pgtable_level, pmap->min, bufp, buf_end);
+       return pmap_dump_page_tables_recurse(pmap, pmap->tte, arm64_root_pgtable_level, pmap->min, bufp, buf_end);
 }
 
 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
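
pmap_dump_page_tables_recurse now reads its per-level size/valid/type masks from pt_attr->pta_level_info instead of a file-static table, so the dumper follows whatever geometry the pmap was created with. A toy recursive walk driven by such a level-info array (two invented levels; the kernel masks table pointers with ARM_TTE_TABLE_MASK and maps them through phystokv, which this sketch fakes with plain pointers):

#include <stdint.h>
#include <stdio.h>

/* Per-level geometry in the spirit of pta_level_info; fields invented. */
typedef struct {
	uint64_t size;       /* bytes mapped by one entry at this level */
	uint64_t valid_mask;
	uint64_t type_mask;
	uint64_t type_table; /* type bits meaning "points to the next level" */
} level_info_t;

#define NLEVELS 2
static const level_info_t levels[NLEVELS] = {
	{ .size = 1ULL << 20, .valid_mask = 1, .type_mask = 2, .type_table = 2 },
	{ .size = 1ULL << 12, .valid_mask = 1, .type_mask = 2, .type_table = 0 },
};

static void
walk(const uint64_t *table, unsigned nentries, unsigned level, uint64_t va)
{
	for (unsigned i = 0; i < nentries; i++, va += levels[level].size) {
		uint64_t e = table[i];
		if (!(e & levels[level].valid_mask)) {
			continue;
		}
		if ((level + 1 < NLEVELS) &&
		    ((e & levels[level].type_mask) == levels[level].type_table)) {
			/* Strip the type bits to recover the (fake) table pointer. */
			walk((const uint64_t *)(uintptr_t)(e & ~3ULL), 4, level + 1, va);
		} else {
			printf("L%u mapping at va 0x%llx\n", level, (unsigned long long)va);
		}
	}
}

int main(void)
{
	static uint64_t leaf[4] = { 0x5001, 0, 0x6001, 0 };
	uint64_t root[2] = { (uint64_t)(uintptr_t)leaf | 3, 0 };
	walk(root, 2, 0, 0);
	return 0;
}
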
index 50464cd10e0539bef0a493fdb89f66f8dda2117f..3d45185eb340994630f6536c7a79f04b37956d15 100644 (file)
@@ -1,6 +1,5 @@
 /*
- *
- * Copyright (c) 2007-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #ifndef ASSEMBLER
 
 #include <stdatomic.h>
+#include <stdbool.h>
 #include <libkern/section_keywords.h>
 #include <mach/kern_return.h>
 #include <mach/machine/vm_types.h>
 #include <arm/pmap_public.h>
 #include <mach/arm/thread_status.h>
+#if defined(__arm64__)
+#include <arm64/tlb.h>
+#else
+#include <arm/tlb.h>
+#endif
+
+
+#define ASID_SHIFT                  (11)                            /* Shift for 2048 max virtual ASIDs (2048 pmaps) */
+#define MAX_ASID                    (1 << ASID_SHIFT)               /* Max supported ASIDs (can be virtual) */
+#ifndef ARM_ASID_SHIFT
+#define ARM_ASID_SHIFT              (8)                             /* Shift for the maximum ARM ASID value (256) */
+#endif
+#define ARM_MAX_ASID                (1 << ARM_ASID_SHIFT)           /* Max ASIDs supported by the hardware */
+#define NBBY                        8
 
 #if __ARM_KERNEL_PROTECT__
-/*
- * For __ARM_KERNEL_PROTECT__, we need twice as many ASIDs to support having
- * unique EL0 and EL1 ASIDs for each pmap.
- */
-#define ASID_SHIFT                      (12)                            /* Shift for the maximum virtual ASID value (2048)*/
-#else /* __ARM_KERNEL_PROTECT__ */
-#define ASID_SHIFT                      (11)                            /* Shift for the maximum virtual ASID value (2048) */
-#endif /* __ARM_KERNEL_PROTECT__ */
-#define MAX_ASID                        (1 << ASID_SHIFT)               /* Max supported ASIDs (can be virtual) */
-#define ARM_ASID_SHIFT                  (8)                             /* Shift for the maximum ARM ASID value (256) */
-#define ARM_MAX_ASID                    (1 << ARM_ASID_SHIFT)           /* Max ASIDs supported by the hardware */
-#define ASID_VIRT_BITS                  (ASID_SHIFT - ARM_ASID_SHIFT)   /* The number of virtual bits in a virtaul ASID */
-#define NBBY                            8
+#define MAX_HW_ASID ((ARM_MAX_ASID >> 1) - 1)
+#else
+#define MAX_HW_ASID (ARM_MAX_ASID - 1)
+#endif
+
+#ifndef ARM_VMID_SHIFT
+#define ARM_VMID_SHIFT                  (8)
+#endif
+#define ARM_MAX_VMID                    (1 << ARM_VMID_SHIFT)
+
+/* XPRR virtual register map */
+
+#define CPUWINDOWS_MAX              4
 
 struct pmap_cpu_data {
 #if defined(__arm64__)
@@ -72,7 +86,9 @@ struct pmap_cpu_data {
        unsigned int cpu_user_pmap_stamp;
 #endif
        unsigned int cpu_number;
+       bool copywindow_strong_sync[CPUWINDOWS_MAX];
 
+#if MAX_ASID > MAX_HW_ASID
 
        /*
         * This supports overloading of ARM ASIDs by the pmap.  The field needs
@@ -85,7 +101,8 @@ struct pmap_cpu_data {
         * memory by only having enough bits to support MAX_ASID.  However, such
         * an implementation would be more error prone.
         */
-       uint8_t cpu_asid_high_bits[ARM_MAX_ASID];
+       uint8_t cpu_asid_high_bits[MAX_HW_ASID];
+#endif
 };
 typedef struct pmap_cpu_data pmap_cpu_data_t;
 
@@ -134,6 +151,16 @@ typedef uint32_t        pt_entry_t;                                     /* page
 #error unknown arch
 #endif
 
+struct page_table_level_info {
+       const uint64_t size;
+       const uint64_t offmask;
+       const uint64_t shift;
+       const uint64_t index_mask;
+       const uint64_t valid_mask;
+       const uint64_t type_mask;
+       const uint64_t type_block;
+};
+
 
 /* superpages */
 #define SUPERPAGE_NBASEPAGES 1  /* No superpages support */
@@ -174,37 +201,6 @@ typedef uint32_t        pt_entry_t;                                     /* page
 #define NPTES   (ARM_PGBYTES / sizeof(pt_entry_t))
 #endif
 
-extern void sync_tlb_flush(void);
-extern void flush_mmu_tlb_async(void);
-extern void flush_mmu_tlb(void);
-extern void flush_core_tlb_async(void);
-extern void flush_core_tlb(void);
-#if defined(__arm64__)
-extern void flush_mmu_tlb_allentries_async(uint64_t, uint64_t);
-extern void flush_mmu_tlb_allentries(uint64_t, uint64_t);
-extern void flush_mmu_tlb_entry_async(uint64_t);
-extern void flush_mmu_tlb_entry(uint64_t);
-extern void flush_mmu_tlb_entries_async(uint64_t, uint64_t);
-extern void flush_mmu_tlb_entries(uint64_t, uint64_t);
-extern void flush_mmu_tlb_asid_async(uint64_t);
-extern void flush_mmu_tlb_asid(uint64_t);
-extern void flush_core_tlb_asid_async(uint64_t);
-extern void flush_core_tlb_asid(uint64_t);
-
-#define tlbi_addr(x) (((x) >> TLBI_ADDR_SHIFT) & TLBI_ADDR_MASK)
-#define tlbi_asid(x) (((uint64_t)x << TLBI_ASID_SHIFT) & TLBI_ASID_MASK)
-#else
-extern void flush_mmu_tlb_entry_async(uint32_t);
-extern void flush_mmu_tlb_entry(uint32_t);
-extern void flush_mmu_tlb_entries_async(uint32_t, uint32_t);
-extern void flush_mmu_tlb_entries(uint32_t, uint32_t);
-extern void flush_mmu_tlb_mva_entries_async(uint32_t);
-extern void flush_mmu_tlb_mva_entries(uint32_t);
-extern void flush_mmu_tlb_asid_async(uint32_t);
-extern void flush_mmu_tlb_asid(uint32_t);
-extern void flush_core_tlb_asid_async(uint32_t);
-extern void flush_core_tlb_asid(uint32_t);
-#endif
 extern void flush_mmu_tlb_region(vm_offset_t va, unsigned length);
 
 #if defined(__arm64__)
@@ -291,6 +287,9 @@ extern pmap_paddr_t mmu_uvtop(vm_offset_t va);
 #endif /* DEVELOPMENT || DEBUG */
 
 
+/* Forward struct declarations for the pmap data structure */
+struct page_table_attr;
+
 /*
  *     Convert translation/page table entry to kernel virtual address
  */
@@ -298,12 +297,15 @@ extern pmap_paddr_t mmu_uvtop(vm_offset_t va);
 #define ptetokv(a)      (phystokv(pte_to_pa(a)))
 
 struct pmap {
-       tt_entry_t                      *tte;                   /* translation table entries */
+       tt_entry_t              *tte;                   /* translation table entries */
        pmap_paddr_t            ttep;                   /* translation table physical */
        vm_map_address_t        min;                    /* min address in pmap */
        vm_map_address_t        max;                    /* max address in pmap */
+#if ARM_PARAMETERIZED_PMAP
+       const struct page_table_attr * pmap_pt_attr;    /* details about page table layout */
+#endif /* ARM_PARAMETERIZED_PMAP */
        ledger_t                ledger;                 /* ledger tracking phys mappings */
-       decl_simple_lock_data(, lock)            /* lock on map */
+       decl_simple_lock_data(, lock);           /* lock on map */
        struct pmap_statistics  stats;          /* map statistics */
        queue_chain_t           pmaps;                  /* global list of pmaps */
        tt_entry_t                      *tt_entry_free; /* free translation table entries */
@@ -317,19 +319,19 @@ struct pmap {
        unsigned int            *nested_region_asid_bitmap;
 
 #if (__ARM_VMSA__ <= 7)
-       decl_simple_lock_data(, tt1_lock)       /* lock on tt1 */
+       decl_simple_lock_data(, tt1_lock);       /* lock on tt1 */
        unsigned int            cpu_ref;                /* number of cpus using pmap */
+       unsigned int            tte_index_max;          /* max tte index in translation table entries */
 #endif
 
 
-       unsigned int            asid;                   /* address space id */
-       unsigned int            vasid;                  /* Virtual address space id */
        unsigned int            stamp;                  /* creation stamp */
        _Atomic int32_t         ref_count;              /* pmap reference count */
        unsigned int            gc_status;              /* gc status */
        unsigned int            nested_region_asid_bitmap_size;
-       unsigned int            tte_index_max;          /* max tte index in translation table entries */
        uint32_t                nested_no_bounds_refcnt;/* number of pmaps that nested this pmap without bounds set */
+       uint16_t                hw_asid;
+       uint8_t                 sw_asid;
 
 #if MACH_ASSERT
        int                     pmap_pid;
@@ -340,32 +342,17 @@ struct pmap {
        bool            footprint_suspended;
        bool            footprint_was_suspended;
 #endif /* DEVELOPMENT || DEBUG */
-       bool                    nx_enabled;                             /* no execute */
-       bool                    nested;                                 /* is nested */
-       bool                    is_64bit;                               /* is 64bit */
+       bool            nx_enabled;                             /* no execute */
+       bool            nested;                                 /* is nested */
+       bool            is_64bit;                               /* is 64bit */
        bool            nested_has_no_bounds_ref;       /* nested a pmap when the bounds were not set */
        bool            nested_bounds_set;                      /* The nesting bounds have been set */
+#if HAS_APPLE_PAC
+       bool            disable_jop;
+#endif /* HAS_APPLE_PAC */
 };
 
-/* typedef struct pmap *pmap_t; */
-#define PMAP_NULL       ((pmap_t) 0)
-
-
-/*
- * WIMG control
- */
-#define VM_MEM_INNER            0x10
-#define VM_MEM_RT               0x10 // intentionally alias VM_MEM_INNER; will be used with mutually exclusive caching policies
-#define VM_MEM_EARLY_ACK        0x20
-
-#define VM_WIMG_DEFAULT         (VM_MEM_COHERENT)
-#define VM_WIMG_COPYBACK        (VM_MEM_COHERENT)
-#define VM_WIMG_INNERWBACK      (VM_MEM_COHERENT | VM_MEM_INNER)
-#define VM_WIMG_IO              (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED)
-#define VM_WIMG_POSTED          (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED | VM_MEM_EARLY_ACK)
-#define VM_WIMG_WTHRU           (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED)
-#define VM_WIMG_WCOMB           (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT)
-#define VM_WIMG_RT              (VM_WIMG_IO | VM_MEM_RT)
+#define PMAP_VASID(pmap) (((uint32_t)((pmap)->sw_asid) << 16) | pmap->hw_asid)
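
The single asid/vasid pair above becomes an explicit 16-bit hw_asid plus 8-bit sw_asid, and PMAP_VASID recombines them. A worked example of the composition:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t hw_asid = 0x00A5; /* hardware ASID programmed into TTBR/TLBI */
	uint8_t  sw_asid = 0x03;   /* software "generation" disambiguating reuse */

	uint32_t vasid = ((uint32_t)sw_asid << 16) | hw_asid;
	printf("vasid = 0x%06x\n", vasid); /* 0x0300a5 */
	return 0;
}
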
 
 #if VM_DEBUG
 extern int      pmap_list_resident_pages(
@@ -460,15 +447,17 @@ extern vm_map_address_t pmap_map_high_window_bd( vm_offset_t pa, vm_size_t len,
 extern kern_return_t pmap_map_block(pmap_t pmap, addr64_t va, ppnum_t pa, uint32_t size, vm_prot_t prot, int attr, unsigned int flags);
 extern void pmap_map_globals(void);
 
-#define PMAP_MAP_BD_DEVICE      0x1
-#define PMAP_MAP_BD_WCOMB       0x2
-#define PMAP_MAP_BD_POSTED      0x3
-#define PMAP_MAP_BD_MASK        0x3
+#define PMAP_MAP_BD_DEVICE                    0x0
+#define PMAP_MAP_BD_WCOMB                     0x1
+#define PMAP_MAP_BD_POSTED                    0x2
+#define PMAP_MAP_BD_POSTED_REORDERED          0x3
+#define PMAP_MAP_BD_POSTED_COMBINED_REORDERED 0x4
+#define PMAP_MAP_BD_MASK                      0x7
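
The BD option field grows from four encodable values to five (POSTED_REORDERED and POSTED_COMBINED_REORDERED are new), so the mask widens from 0x3 to 0x7 and the values are renumbered from zero. Extracting the option from a flags word (the 0x10 bit here is a hypothetical unrelated flag, shown only to illustrate the masking):

#include <stdio.h>

#define PMAP_MAP_BD_POSTED_COMBINED_REORDERED 0x4
#define PMAP_MAP_BD_MASK                      0x7

int main(void)
{
	int options = 0x10 | PMAP_MAP_BD_POSTED_COMBINED_REORDERED;
	printf("bd option = 0x%x\n", options & PMAP_MAP_BD_MASK); /* -> 0x4 */
	return 0;
}
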
 
 extern vm_map_address_t pmap_map_bd_with_options(vm_map_address_t va, vm_offset_t sa, vm_offset_t ea, vm_prot_t prot, int32_t options);
 extern vm_map_address_t pmap_map_bd(vm_map_address_t va, vm_offset_t sa, vm_offset_t ea, vm_prot_t prot);
 
-extern void pmap_init_pte_page(pmap_t, pt_entry_t *, vm_offset_t, unsigned int ttlevel, boolean_t alloc_ptd);
+extern void pmap_init_pte_page(pmap_t, pt_entry_t *, vm_offset_t, unsigned int ttlevel, boolean_t alloc_ptd, boolean_t clear);
 
 extern boolean_t pmap_valid_address(pmap_paddr_t addr);
 extern void pmap_disable_NX(pmap_t pmap);
@@ -551,7 +540,9 @@ boolean_t pmap_enforces_execute_only(pmap_t pmap);
 #define PMAP_LEDGER_ALLOC_INDEX 66
 #define PMAP_LEDGER_FREE_INDEX 67
 
-#define PMAP_COUNT 68
+
+
+#define PMAP_COUNT 71
 
 #define PMAP_INVALID_CPU_NUM (~0U)
 
index 45536d43ffdf3b77260212874b7d9d4c5021a1af..192bc9d692e7684eba3a127c1baf919a8be32a6c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #endif
 
 #if defined (ARMA7)
-#define __ARM_ARCH__    7
-#define __ARM_SUB_ARCH__ CPU_ARCH_ARMv7k
-#define __ARM_VMSA__    7
-#define __ARM_VFP__     3
+#define __ARM_ARCH__               7
+#define __ARM_SUB_ARCH__           CPU_ARCH_ARMv7k
+#define __ARM_VMSA__               7
+#define __ARM_VFP__                3
 #if defined(__XNU_UP__)
-#define __ARM_SMP__     0
+#define __ARM_SMP__                0
 #else
-#define __ARM_SMP__     1
+#define __ARM_SMP__                1
 /* For SMP kernels, force physical aperture to be mapped at PTE level so that its mappings
  * can be updated to reflect cache attribute changes on alias mappings.  This prevents
  * prefetched physical aperture cachelines from becoming dirty in L1 due to a write to
  * an uncached alias mapping on the same core.  Subsequent uncached writes from another
  * core may not snoop this line, and the dirty line may end up being evicted later to
  * effectively overwrite the uncached writes from other cores. */
-#define __ARM_PTE_PHYSMAP__     1
+#define __ARM_PTE_PHYSMAP__        1
 #endif
 /* __ARMA7_SMP__ controls whether we are consistent with the A7 MP_CORE spec; needed because entities other than
  * the xnu-managed processors may need to snoop our cache operations.
  */
-#define __ARMA7_SMP__   1
-#define __ARM_COHERENT_CACHE__ 1
-#define __ARM_L1_PTW__  1
-#define __ARM_DEBUG__   7
-#define __ARM_USER_PROTECT__    1
-#define __ARM_TIME_TIMEBASE_ONLY__      1
-
-#elif defined (APPLECYCLONE)
-#define __ARM_ARCH__    8
-#define __ARM_VMSA__    8
-#define __ARM_SMP__     1
-#define __ARM_VFP__     4
-#define __ARM_COHERENT_CACHE__ 1
-#define __ARM_COHERENT_IO__ 1
-#define __ARM_IC_NOALIAS_ICACHE__ 1
-#define __ARM_L1_PTW__ 1
-#define __ARM_DEBUG__   7
-#define __ARM_ENABLE_SWAP__ 1
-#define __ARM_V8_CRYPTO_EXTENSIONS__ 1
-#define __ARM64_PMAP_SUBPAGE_L1__ 1
-#define __ARM_KERNEL_PROTECT__ 1
+#define __ARMA7_SMP__              1
+#define __ARM_COHERENT_CACHE__     1
+#define __ARM_DEBUG__              7
+#define __ARM_USER_PROTECT__       1
+#define __ARM_TIME_TIMEBASE_ONLY__ 1
 
 #elif defined (APPLETYPHOON)
-#define __ARM_ARCH__    8
-#define __ARM_VMSA__    8
-#define __ARM_SMP__     1
-#define __ARM_VFP__     4
-#define __ARM_COHERENT_CACHE__ 1
-#define __ARM_COHERENT_IO__ 1
-#define __ARM_IC_NOALIAS_ICACHE__ 1
-#define __ARM_L1_PTW__ 1
-#define __ARM_DEBUG__   7
-#define __ARM_ENABLE_SWAP__ 1
+#define __ARM_ARCH__                 8
+#define __ARM_VMSA__                 8
+#define __ARM_SMP__                  1
+#define __ARM_VFP__                  4
+#define __ARM_COHERENT_CACHE__       1
+#define __ARM_COHERENT_IO__          1
+#define __ARM_IC_NOALIAS_ICACHE__    1
+#define __ARM_DEBUG__                7
+#define __ARM_ENABLE_SWAP__          1
 #define __ARM_V8_CRYPTO_EXTENSIONS__ 1
-#define __ARM64_PMAP_SUBPAGE_L1__ 1
-#define __ARM_KERNEL_PROTECT__ 1
+#define __ARM64_PMAP_SUBPAGE_L1__    1
+#define __ARM_KERNEL_PROTECT__       1
 
 #elif defined (APPLETWISTER)
-#define __ARM_ARCH__    8
-#define __ARM_VMSA__    8
-#define __ARM_SMP__     1
-#define __ARM_VFP__     4
-#define __ARM_COHERENT_CACHE__ 1
-#define __ARM_COHERENT_IO__ 1
-#define __ARM_IC_NOALIAS_ICACHE__ 1
-#define __ARM_L1_PTW__ 1
-#define __ARM_DEBUG__   7
-#define __ARM_ENABLE_SWAP__ 1
+#define __ARM_ARCH__                 8
+#define __ARM_VMSA__                 8
+#define __ARM_SMP__                  1
+#define __ARM_VFP__                  4
+#define __ARM_COHERENT_CACHE__       1
+#define __ARM_COHERENT_IO__          1
+#define __ARM_IC_NOALIAS_ICACHE__    1
+#define __ARM_DEBUG__                7
+#define __ARM_ENABLE_SWAP__          1
 #define __ARM_V8_CRYPTO_EXTENSIONS__ 1
-#define __ARM_16K_PG__  1
-#define __ARM64_PMAP_SUBPAGE_L1__ 1
-#define __ARM_KERNEL_PROTECT__ 1
+#define __ARM_16K_PG__               1
+#define __ARM64_PMAP_SUBPAGE_L1__    1
+#define __ARM_KERNEL_PROTECT__       1
 
 #elif defined (APPLEHURRICANE)
-#define __ARM_ARCH__    8
-#define __ARM_VMSA__    8
-#define __ARM_SMP__     1
-#define __ARM_VFP__     4
-#define __ARM_COHERENT_CACHE__ 1
-#define __ARM_COHERENT_IO__ 1
-#define __ARM_IC_NOALIAS_ICACHE__ 1
-#define __ARM_L1_PTW__ 1
-#define __ARM_DEBUG__   7
-#define __ARM_ENABLE_SWAP__ 1
+#define __ARM_ARCH__                 8
+#define __ARM_VMSA__                 8
+#define __ARM_SMP__                  1
+#define __ARM_VFP__                  4
+#define __ARM_COHERENT_CACHE__       1
+#define __ARM_COHERENT_IO__          1
+#define __ARM_IC_NOALIAS_ICACHE__    1
+#define __ARM_DEBUG__                7
+#define __ARM_ENABLE_SWAP__          1
 #define __ARM_V8_CRYPTO_EXTENSIONS__ 1
-#define __ARM_16K_PG__  1
-#define __ARM64_PMAP_SUBPAGE_L1__ 1
-#define __ARM_KERNEL_PROTECT__ 1
-#define __ARM_GLOBAL_SLEEP_BIT__ 1
-#define __ARM_PAN_AVAILABLE__ 1
+#define __ARM_16K_PG__               1
+#define __ARM64_PMAP_SUBPAGE_L1__    1
+#define __ARM_KERNEL_PROTECT__       1
+#define __ARM_GLOBAL_SLEEP_BIT__     1
+#define __ARM_PAN_AVAILABLE__        1
 
 #elif defined (APPLEMONSOON)
-#define __ARM_ARCH__    8
-#define __ARM_VMSA__    8
-#define __ARM_SMP__     1
-#define __ARM_AMP__     1
-#define __ARM_VFP__     4
-#define __ARM_COHERENT_CACHE__ 1
-#define __ARM_COHERENT_IO__ 1
-#define __ARM_IC_NOALIAS_ICACHE__ 1
-#define __ARM_L1_PTW__ 1
-#define __ARM_DEBUG__   7
-#define __ARM_ENABLE_SWAP__ 1
-#define __ARM_V8_CRYPTO_EXTENSIONS__ 1
-#define __ARM_16K_PG__  1
-#define __ARM64_PMAP_SUBPAGE_L1__ 1
-#define __ARM_KERNEL_PROTECT__ 1
-#define __ARM_GLOBAL_SLEEP_BIT__ 1
-#define __ARM_PAN_AVAILABLE__ 1
-#define __ARM_WKDM_ISA_AVAILABLE__ 1
-#define __PLATFORM_WKDM_ALIGNMENT_MASK__ (0x3FULL)
+#define __ARM_ARCH__                         8
+#define __ARM_VMSA__                         8
+#define __ARM_SMP__                          1
+#define __ARM_AMP__                          1
+#define __ARM_VFP__                          4
+#define __ARM_COHERENT_CACHE__               1
+#define __ARM_COHERENT_IO__                  1
+#define __ARM_IC_NOALIAS_ICACHE__            1
+#define __ARM_DEBUG__                        7
+#define __ARM_ENABLE_SWAP__                  1
+#define __ARM_V8_CRYPTO_EXTENSIONS__         1
+#define __ARM_16K_PG__                       1
+#define __ARM64_PMAP_SUBPAGE_L1__            1
+#define __ARM_KERNEL_PROTECT__               1
+#define __ARM_GLOBAL_SLEEP_BIT__             1
+#define __ARM_PAN_AVAILABLE__                1
+#define __ARM_WKDM_ISA_AVAILABLE__           1
+#define __PLATFORM_WKDM_ALIGNMENT_MASK__     (0x3FULL)
 #define __PLATFORM_WKDM_ALIGNMENT_BOUNDARY__ (64)
-#define __ARM_CLUSTER_COUNT__ 2
+#define __ARM_CLUSTER_COUNT__                2
 
 #elif defined (BCM2837)
-#define __ARM_ARCH__    8
-#define __ARM_VMSA__    8
-#define __ARM_SMP__     1
-#define __ARM_VFP__             4
-#define __ARM_COHERENT_CACHE__ 1
-#define __ARM_L1_PTW__ 1
-#define __ARM_DEBUG__   7
+#define __ARM_ARCH__              8
+#define __ARM_VMSA__              8
+#define __ARM_SMP__               1
+#define __ARM_VFP__               4
+#define __ARM_COHERENT_CACHE__    1
+#define __ARM_DEBUG__             7
 #define __ARM64_PMAP_SUBPAGE_L1__ 1
 #else
 #error processor not supported
 #endif
 
+#if __ARM_42BIT_PA_SPACE__
+/* For now, force the issue! */
+#undef __ARM64_PMAP_SUBPAGE_L1__
+#endif /* __ARM_42BIT_PA_SPACE__ */
+
 #if __ARM_KERNEL_PROTECT__
 /*
  * This feature is not currently implemented for 32-bit ARM CPU architectures.
  */
 #if __arm__
 #error __ARM_KERNEL_PROTECT__ is not supported on ARM32
-#endif
+#endif /* __arm__ */
 #endif /* __ARM_KERNEL_PROTECT__ */
 
 #if defined(ARM_BOARD_WFE_TIMEOUT_NS)
 #define __ARM_ENABLE_WFE_ 1
-#else
+#else /* defined(ARM_BOARD_WFE_TIMEOUT_NS) */
 #define __ARM_ENABLE_WFE_ 0
-#endif
+#endif /* defined(ARM_BOARD_WFE_TIMEOUT_NS) */
 
+/*
+ * The clutch scheduler is enabled only on non-AMP platforms for now.
+ */
+#if !__ARM_AMP__ && CONFIG_CLUTCH
+#define CONFIG_SCHED_CLUTCH 1
+#else /* !__ARM_AMP__ && CONFIG_CLUTCH */
+#define CONFIG_SCHED_CLUTCH 0
+#endif /* !__ARM_AMP__ && CONFIG_CLUTCH */
+
+#if __ARM_AMP__ || CONFIG_SCHED_CLUTCH
+#define CONFIG_THREAD_GROUPS 1
+#else /* __ARM_AMP__ || CONFIG_SCHED_CLUTCH */
 #define CONFIG_THREAD_GROUPS 0
+#endif
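
The two derived options above interlock: the clutch scheduler is compiled in only for non-AMP parts, and thread groups are compiled in whenever either AMP hardware or the clutch scheduler is present. A compile-time sanity check one could add (not present in the source) makes the invariants explicit:

#if __ARM_AMP__ && CONFIG_SCHED_CLUTCH
#error "the clutch scheduler is not expected on AMP parts yet"
#endif
#if CONFIG_SCHED_CLUTCH && !CONFIG_THREAD_GROUPS
#error "the clutch scheduler requires thread groups"
#endif

On APPLEMONSOON, which defines __ARM_AMP__ above, this resolves to CONFIG_SCHED_CLUTCH 0 and CONFIG_THREAD_GROUPS 1.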
 
+#ifdef XNU_KERNEL_PRIVATE
 
-#ifdef  XNU_KERNEL_PRIVATE
-
-#if     __ARM_VFP__
-#define ARM_VFP_DEBUG   0
-#endif
+#if __ARM_VFP__
+#define ARM_VFP_DEBUG 0
+#endif /* __ARM_VFP__ */
 
-#endif
+#endif /* XNU_KERNEL_PRIVATE */
 
 
 
 /*
  * Flags
  */
-#define PSR_NF                  0x80000000      /* Negative/Less than */
-#define PSR_ZF                  0x40000000      /* Zero */
-#define PSR_CF                  0x20000000      /* Carry/Borrow/Extend */
-#define PSR_VF                  0x10000000      /* Overflow */
-#define PSR_QF                  0x08000000      /* saturation flag (QADD ARMv5) */
+#define PSR_NF 0x80000000 /* Negative/Less than */
+#define PSR_ZF 0x40000000 /* Zero */
+#define PSR_CF 0x20000000 /* Carry/Borrow/Extend */
+#define PSR_VF 0x10000000 /* Overflow */
+#define PSR_QF 0x08000000 /* saturation flag (QADD ARMv5) */
 
 /*
  * Modified execution mode flags
  */
-#define PSR_JF                  0x01000000      /* Jazelle flag (BXJ ARMv5) */
-#define PSR_EF                  0x00000200      /* mixed-endian flag (SETEND ARMv6) */
-#define PSR_AF                  0x00000100      /* precise abort flag (ARMv6) */
-#define PSR_TF                  0x00000020      /* thumb flag (BX ARMv4T) */
-#define PSR_TFb                          5      /* thumb flag (BX ARMv4T) */
+#define PSR_JF  0x01000000 /* Jazelle flag (BXJ ARMv5) */
+#define PSR_EF  0x00000200 /* mixed-endian flag (SETEND ARMv6) */
+#define PSR_AF  0x00000100 /* precise abort flag (ARMv6) */
+#define PSR_TF  0x00000020 /* thumb flag (BX ARMv4T) */
+#define PSR_TFb 5          /* thumb flag (BX ARMv4T) */
 
 /*
  * Interrupts
  */
-#define PSR_IRQFb                        7      /* IRQ : 0 = IRQ enable */
-#define PSR_IRQF                0x00000080      /* IRQ : 0 = IRQ enable */
-#define PSR_FIQF                0x00000040      /* FIQ : 0 = FIQ enable */
+#define PSR_IRQFb 7          /* IRQ : 0 = IRQ enable */
+#define PSR_IRQF  0x00000080 /* IRQ : 0 = IRQ enable */
+#define PSR_FIQF  0x00000040 /* FIQ : 0 = FIQ enable */
 
 /*
  * CPU mode
  */
-#define PSR_USER_MODE           0x00000010      /* User mode */
-#define PSR_FIQ_MODE            0x00000011      /* FIQ mode */
-#define PSR_IRQ_MODE            0x00000012      /* IRQ mode */
-#define PSR_SVC_MODE            0x00000013      /* Supervisor mode */
-#define PSR_ABT_MODE            0x00000017      /* Abort mode */
-#define PSR_UND_MODE            0x0000001B      /* Undefined mode */
+#define PSR_USER_MODE 0x00000010 /* User mode */
+#define PSR_FIQ_MODE  0x00000011 /* FIQ mode */
+#define PSR_IRQ_MODE  0x00000012 /* IRQ mode */
+#define PSR_SVC_MODE  0x00000013 /* Supervisor mode */
+#define PSR_ABT_MODE  0x00000017 /* Abort mode */
+#define PSR_UND_MODE  0x0000001B /* Undefined mode */
 
-#define PSR_MODE_MASK           0x0000001F
-#define PSR_IS_KERNEL(psr)       (((psr) & PSR_MODE_MASK) != PSR_USER_MODE)
-#define PSR_IS_USER(psr)         (((psr) & PSR_MODE_MASK) == PSR_USER_MODE)
+#define PSR_MODE_MASK      0x0000001F
+#define PSR_IS_KERNEL(psr) (((psr) & PSR_MODE_MASK) != PSR_USER_MODE)
+#define PSR_IS_USER(psr)   (((psr) & PSR_MODE_MASK) == PSR_USER_MODE)
 
-#define PSR_USERDFLT            PSR_USER_MODE
-#define PSR_USER_MASK           (PSR_AF | PSR_IRQF | PSR_FIQF | PSR_MODE_MASK)
-#define PSR_USER_SET            PSR_USER_MODE
+#define PSR_USERDFLT  PSR_USER_MODE
+#define PSR_USER_MASK (PSR_AF | PSR_IRQF | PSR_FIQF | PSR_MODE_MASK)
+#define PSR_USER_SET  PSR_USER_MODE
 
-#define PSR_INTMASK             PSR_IRQF        /* Interrupt disable */
+#define PSR_INTMASK PSR_IRQF /* Interrupt disable */
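
Since the mode lives entirely in bits [4:0], classifying a saved status register is a mask-and-compare. A minimal sketch (came_from_user is a hypothetical helper name):

#include <stdint.h>

static int
came_from_user(uint32_t spsr)
{
	/* (spsr & PSR_MODE_MASK) == PSR_USER_MODE, i.e. bits [4:0] == 0x10 */
	return PSR_IS_USER(spsr);
}
/* e.g. PSR_IS_USER(0x00000010) is true; PSR_IS_KERNEL(0x00000013 /+ SVC +/) is true */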
 
 /*
  * FPEXC: Floating-Point Exception Register
  */
 
-#define FPEXC_EX                0x80000000      /* Exception status */
-#define FPEXC_EX_BIT            31
-#define FPEXC_EN                0x40000000      /* VFP : 1 = EN enable */
-#define FPEXC_EN_BIT            30
+#define FPEXC_EX     0x80000000 /* Exception status */
+#define FPEXC_EX_BIT 31
+#define FPEXC_EN     0x40000000 /* VFP : 1 = EN enable */
+#define FPEXC_EN_BIT 30
 
 
 /*
  * FPSCR: Floating-point Status and Control Register
  */
 
-#define FPSCR_DN                0x02000000      /* Default NaN */
-#define FPSCR_FZ                0x01000000      /* Flush to zero */
+#define FPSCR_DN      0x02000000          /* Default NaN */
+#define FPSCR_FZ      0x01000000          /* Flush to zero */
 
-#define FPSCR_DEFAULT           FPSCR_DN | FPSCR_FZ
+#define FPSCR_DEFAULT FPSCR_DN | FPSCR_FZ
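
One hazard worth noting: FPSCR_DEFAULT expands without parentheses, so a tighter-binding operator changes the parse.

/* x & FPSCR_DEFAULT expands to (x & FPSCR_DN) | FPSCR_FZ, because & binds
 * tighter than |.  The intended value itself is
 *   FPSCR_DN | FPSCR_FZ = 0x02000000 | 0x01000000 = 0x03000000,
 * so callers combining it with other operators should write (FPSCR_DEFAULT). */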
 
 
 /*
  * IFSR: Instruction Fault Status Register
  * DFSR: Data Fault Status Register
  */
-#define FSR_ALIGN               0x00000001      /* Alignment */
-#define FSR_DEBUG               0x00000002      /* Debug (watch/break) */
-#define FSR_ICFAULT             0x00000004      /* Fault on instruction cache maintenance */
-#define FSR_SFAULT              0x00000005      /* Translation Section */
-#define FSR_PFAULT              0x00000007      /* Translation Page */
-#define FSR_SACCESS             0x00000003      /* Section access */
-#define FSR_PACCESS             0x00000006      /* Page Access */
-#define FSR_SDOM                0x00000009      /* Domain Section */
-#define FSR_PDOM                0x0000000B      /* Domain Page */
-#define FSR_SPERM               0x0000000D      /* Permission Section */
-#define FSR_PPERM               0x0000000F      /* Permission Page */
-#define FSR_EXT                 0x00001000      /* External (Implementation Defined Classification) */
-
-#define FSR_MASK                0x0000040F      /* Valid bits */
-#define FSR_ALIGN_MASK          0x0000040D      /* Valid bits to check align */
-
-#define DFSR_WRITE              0x00000800      /* write data abort fault */
+#define FSR_ALIGN      0x00000001 /* Alignment */
+#define FSR_DEBUG      0x00000002 /* Debug (watch/break) */
+#define FSR_ICFAULT    0x00000004 /* Fault on instruction cache maintenance */
+#define FSR_SFAULT     0x00000005 /* Translation Section */
+#define FSR_PFAULT     0x00000007 /* Translation Page */
+#define FSR_SACCESS    0x00000003 /* Section access */
+#define FSR_PACCESS    0x00000006 /* Page Access */
+#define FSR_SDOM       0x00000009 /* Domain Section */
+#define FSR_PDOM       0x0000000B /* Domain Page */
+#define FSR_SPERM      0x0000000D /* Permission Section */
+#define FSR_PPERM      0x0000000F /* Permission Page */
+#define FSR_EXT        0x00001000 /* External (Implementation Defined Classification) */
+
+#define FSR_MASK       0x0000040F /* Valid bits */
+#define FSR_ALIGN_MASK 0x0000040D /* Valid bits to check align */
+
+#define DFSR_WRITE     0x00000800 /* write data abort fault */
 
 #if defined (ARMA7) || defined (APPLE_ARM64_ARCH_FAMILY) || defined (BCM2837)
 
-#define TEST_FSR_VMFAULT(status)        \
-                               (((status) == FSR_PFAULT)       \
-                               || ((status) == FSR_PPERM)      \
-                               || ((status) == FSR_SFAULT)     \
-                               || ((status) == FSR_SPERM)      \
-                               || ((status) == FSR_ICFAULT)    \
-                               || ((status) == FSR_SACCESS)    \
-                               || ((status) == FSR_PACCESS))
+#define TEST_FSR_VMFAULT(status) \
+       (((status) == FSR_PFAULT)     \
+       || ((status) == FSR_PPERM)    \
+       || ((status) == FSR_SFAULT)   \
+       || ((status) == FSR_SPERM)    \
+       || ((status) == FSR_ICFAULT)  \
+       || ((status) == FSR_SACCESS)  \
+       || ((status) == FSR_PACCESS))
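
A minimal sketch of the intended use (dabort_is_vm_fault is a hypothetical helper, not from this header): mask the fault status register down to its valid status bits, then let the macro match the translation, permission, and access encodings the VM fault path can service.

#include <stdint.h>

static int
dabort_is_vm_fault(uint32_t dfsr)
{
	uint32_t status = dfsr & FSR_MASK;  /* keep bits [3:0] and bit 10 (0x40F) */
	return TEST_FSR_VMFAULT(status);
}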
 
 #define TEST_FSR_TRANSLATION_FAULT(status)      \
                                (((status) == FSR_SFAULT)       \
 #if defined (ARMA7)
 
 /* I-Cache */
-#define MMU_I_CLINE     5               /* cache line size as 1<<MMU_I_CLINE (32) */
-
-/* D-Cache */
-#define MMU_CSIZE       15              /* cache size as 1<<MMU_CSIZE (32K) */
-#define MMU_CLINE       6               /* cache line size as 1<<MMU_CLINE (64) */
-#define MMU_NWAY        2               /* set associativity 1<<MMU_NWAY (4) */
-#define MMU_I7SET       6               /* cp15 c7 set incrementer 1<<MMU_I7SET */
-#define MMU_I7WAY       30              /* cp15 c7 way incrementer 1<<MMU_I7WAY */
-
-#define MMU_SWAY        (MMU_CSIZE - MMU_NWAY)  /* set size 1<<MMU_SWAY */
-#define MMU_NSET        (MMU_SWAY - MMU_CLINE)  /* lines per way 1<<MMU_NSET */
-
-#define __ARM_L2CACHE__ 1
-
-#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__        /* cache size as 1<<MMU_CSIZE */
-#define L2_CLINE        6               /* cache line size as 1<<MMU_CLINE (64) */
-#define L2_NWAY         3               /* set associativity 1<<MMU_NWAY (8) */
-#define L2_I7SET        6               /* cp15 c7 set incrementer 1<<MMU_I7SET */
-#define L2_I7WAY        29              /* cp15 c7 way incrementer 1<<MMU_I7WAY */
-#define L2_I9WAY        29              /* cp15 c9 way incrementer 1<<MMU_I9WAY */
-
-#define L2_SWAY (L2_CSIZE - L2_NWAY)    /* set size 1<<MMU_SWAY */
-#define L2_NSET (L2_SWAY - L2_CLINE)    /* lines per way 1<<MMU_NSET */
-
-#elif defined (APPLECYCLONE)
-
-/* I-Cache */
-#define MMU_I_CLINE     6               /* cache line size as 1<<MMU_I_CLINE (64) */
+#define MMU_I_CLINE     5                      /* cache line size as 1<<MMU_I_CLINE (32) */
 
 /* D-Cache */
-#define MMU_CSIZE       16              /* cache size as 1<<MMU_CSIZE (64K) */
-#define MMU_CLINE       6               /* cache line size as 1<<MMU_CLINE (64) */
-#define MMU_NWAY        1               /* set associativity 1<<MMU_NWAY (2) */
-#define MMU_I7SET       6               /* cp15 c7 set incrementer 1<<MMU_I7SET */
-#define MMU_I7WAY       31              /* cp15 c7 way incrementer 1<<MMU_I7WAY */
-#define MMU_I9WAY       31              /* cp15 c9 way incrementer 1<<MMU_I9WAY */
+#define MMU_CSIZE       15                     /* cache size as 1<<MMU_CSIZE (32K) */
+#define MMU_CLINE       6                      /* cache line size as 1<<MMU_CLINE (64) */
+#define MMU_NWAY        2                      /* set associativity 1<<MMU_NWAY (4) */
+#define MMU_I7SET       6                      /* cp15 c7 set incrementer 1<<MMU_I7SET */
+#define MMU_I7WAY       30                     /* cp15 c7 way incrementer 1<<MMU_I7WAY */
 
-#define MMU_SWAY        (MMU_CSIZE - MMU_NWAY)  /* set size 1<<MMU_SWAY */
-#define MMU_NSET        (MMU_SWAY - MMU_CLINE)  /* lines per way 1<<MMU_NSET */
+#define MMU_SWAY        (MMU_CSIZE - MMU_NWAY) /* set size 1<<MMU_SWAY */
+#define MMU_NSET        (MMU_SWAY - MMU_CLINE) /* lines per way 1<<MMU_NSET */
 
 #define __ARM_L2CACHE__ 1
 
-#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__        /* cache size as 1<<L2_CSIZE */
-#define L2_CLINE        6               /* cache line size as 1<<L2_CLINE (64) */
-#define L2_NWAY         3               /* set associativity 1<<L2_NWAY (8) */
-#define L2_I7SET        6               /* cp15 c7 set incrementer 1<<L2_I7SET */
-#define L2_I7WAY        29              /* cp15 c7 way incrementer 1<<L2_I7WAY */
-#define L2_I9WAY        29              /* cp15 c9 way incrementer 1<<L2_I9WAY */
+#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__ /* cache size as 1<<L2_CSIZE */
+#define L2_CLINE        6                        /* cache line size as 1<<L2_CLINE (64) */
+#define L2_NWAY         3                        /* set associativity 1<<L2_NWAY (8) */
+#define L2_I7SET        6                        /* cp15 c7 set incrementer 1<<L2_I7SET */
+#define L2_I7WAY        29                       /* cp15 c7 way incrementer 1<<L2_I7WAY */
+#define L2_I9WAY        29                       /* cp15 c9 way incrementer 1<<L2_I9WAY */
 
-#define L2_SWAY (L2_CSIZE - L2_NWAY)    /* set size 1<<L2_SWAY */
-#define L2_NSET (L2_SWAY - L2_CLINE)    /* lines per way 1<<L2_NSET */
+#define L2_SWAY         (L2_CSIZE - L2_NWAY)     /* set size 1<<L2_SWAY */
+#define L2_NSET         (L2_SWAY - L2_CLINE)     /* lines per way 1<<L2_NSET */
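
All of these geometry constants are log2 values, so each parenthesized number in the comments follows by simple subtraction. Worked out for the Cortex-A7 D-cache values above (MMU_CSIZE 15, MMU_NWAY 2, MMU_CLINE 6):

way size      = 1 << MMU_SWAY = 1 << (15 - 2) = 8 KB
lines per way = 1 << MMU_NSET = 1 << (13 - 6) = 128   /* 128 * 64 B = 8 KB */
total         = 4 ways * 8 KB = 32 KB         = 1 << MMU_CSIZE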
 
 #elif defined (APPLETYPHOON)
 
 /* I-Cache */
-#define MMU_I_CLINE     6               /* cache line size as 1<<MMU_I_CLINE (64) */
+#define MMU_I_CLINE 6                      /* cache line size as 1<<MMU_I_CLINE (64) */
 
 /* D-Cache */
-#define MMU_CSIZE       16              /* cache size as 1<<MMU_CSIZE (64K) */
-#define MMU_CLINE       6               /* cache line size as 1<<MMU_CLINE (64) */
-#define MMU_NWAY        1               /* set associativity 1<<MMU_NWAY (2) */
-#define MMU_I7SET       6               /* cp15 c7 set incrementer 1<<MMU_I7SET */
-#define MMU_I7WAY       31              /* cp15 c7 way incrementer 1<<MMU_I7WAY */
-#define MMU_I9WAY       31              /* cp15 c9 way incrementer 1<<MMU_I9WAY */
+#define MMU_CSIZE   16                     /* cache size as 1<<MMU_CSIZE (64K) */
+#define MMU_CLINE   6                      /* cache line size as 1<<MMU_CLINE (64) */
+#define MMU_NWAY    1                      /* set associativity 1<<MMU_NWAY (2) */
+#define MMU_I7SET   6                      /* cp15 c7 set incrementer 1<<MMU_I7SET */
+#define MMU_I7WAY   31                     /* cp15 c7 way incrementer 1<<MMU_I7WAY */
+#define MMU_I9WAY   31                     /* cp15 c9 way incrementer 1<<MMU_I9WAY */
 
-#define MMU_SWAY        (MMU_CSIZE - MMU_NWAY)  /* set size 1<<MMU_SWAY */
-#define MMU_NSET        (MMU_SWAY - MMU_CLINE)  /* lines per way 1<<MMU_NSET */
+#define MMU_SWAY    (MMU_CSIZE - MMU_NWAY) /* set size 1<<MMU_SWAY */
+#define MMU_NSET    (MMU_SWAY - MMU_CLINE) /* lines per way 1<<MMU_NSET */
 
 #define __ARM_L2CACHE__ 1
 
-#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__        /* cache size as 1<<L2_CSIZE */
-#define L2_CLINE        6               /* cache line size as 1<<L2_CLINE (64) */
-#define L2_NWAY         3               /* set associativity 1<<L2_NWAY (8) */
-#define L2_I7SET        6               /* cp15 c7 set incrementer 1<<L2_I7SET */
-#define L2_I7WAY        29              /* cp15 c7 way incrementer 1<<L2_I7WAY */
-#define L2_I9WAY        29              /* cp15 c9 way incrementer 1<<L2_I9WAY */
+#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__ /* cache size as 1<<L2_CSIZE */
+#define L2_CLINE        6                        /* cache line size as 1<<L2_CLINE (64) */
+#define L2_NWAY         3                        /* set associativity 1<<L2_NWAY (8) */
+#define L2_I7SET        6                        /* cp15 c7 set incrementer 1<<L2_I7SET */
+#define L2_I7WAY        29                       /* cp15 c7 way incrementer 1<<L2_I7WAY */
+#define L2_I9WAY        29                       /* cp15 c9 way incrementer 1<<L2_I9WAY */
 
-#define L2_SWAY (L2_CSIZE - L2_NWAY)    /* set size 1<<L2_SWAY */
-#define L2_NSET (L2_SWAY - L2_CLINE)    /* lines per way 1<<L2_NSET */
+#define L2_SWAY         (L2_CSIZE - L2_NWAY)     /* set size 1<<L2_SWAY */
+#define L2_NSET         (L2_SWAY - L2_CLINE)     /* lines per way 1<<L2_NSET */
 
 #elif defined (APPLETWISTER)
 
 /* I-Cache */
-#define MMU_I_CLINE     6               /* cache line size as 1<<MMU_I_CLINE (64) */
+#define MMU_I_CLINE 6                      /* cache line size as 1<<MMU_I_CLINE (64) */
 
 /* D-Cache */
-#define MMU_CSIZE       16              /* cache size as 1<<MMU_CSIZE (64K) */
-#define MMU_CLINE       6               /* cache line size is 1<<MMU_CLINE (64) */
-#define MMU_NWAY        2               /* set associativity 1<<MMU_NWAY (4) */
-#define MMU_I7SET       6               /* cp15 c7 set incrementer 1<<MMU_I7SET */
-#define MMU_I7WAY       30              /* cp15 c7 way incrementer 1<<MMU_I7WAY */
-#define MMU_I9WAY       30              /* cp15 c9 way incrementer 1<<MMU_I9WAY */
+#define MMU_CSIZE   16                     /* cache size as 1<<MMU_CSIZE (64K) */
+#define MMU_CLINE   6                      /* cache line size is 1<<MMU_CLINE (64) */
+#define MMU_NWAY    2                      /* set associativity 1<<MMU_NWAY (4) */
+#define MMU_I7SET   6                      /* cp15 c7 set incrementer 1<<MMU_I7SET */
+#define MMU_I7WAY   30                     /* cp15 c7 way incrementer 1<<MMU_I7WAY */
+#define MMU_I9WAY   30                     /* cp15 c9 way incrementer 1<<MMU_I9WAY */
 
-#define MMU_SWAY        (MMU_CSIZE - MMU_NWAY)  /* set size 1<<MMU_SWAY */
-#define MMU_NSET        (MMU_SWAY - MMU_CLINE)  /* lines per way 1<<MMU_NSET */
+#define MMU_SWAY    (MMU_CSIZE - MMU_NWAY) /* set size 1<<MMU_SWAY */
+#define MMU_NSET    (MMU_SWAY - MMU_CLINE) /* lines per way 1<<MMU_NSET */
 
 /* L2-Cache */
 #define __ARM_L2CACHE__ 1
  * For reasons discussed in the platform expert code, we round the reported
  * L2 size to 4MB, and adjust the other parameters accordingly.
  */
-#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__        /* cache size as 1<<L2_CSIZE */
-#define L2_CLINE        6               /* cache line size as 1<<L2_CSIZE (64) */
-#define L2_NWAY         4               /* set associativity as 1<<L2_CLINE (16, is actually 12) */
-#define L2_I7SET        6               /* cp15 c7 set incrementer 1<<L2_I7SET */
-#define L2_I7WAY        28              /* cp15 c7 way incrementer 1<<L2_I7WAY */
-#define L2_I9WAY        28              /* cp15 c9 way incremenber 1<<L2_I9WAY */
+#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__ /* cache size as 1<<L2_CSIZE */
+#define L2_CLINE        6                        /* cache line size as 1<<L2_CLINE (64) */
+#define L2_NWAY         4                        /* set associativity as 1<<L2_NWAY (16, is actually 12) */
+#define L2_I7SET        6                        /* cp15 c7 set incrementer 1<<L2_I7SET */
+#define L2_I7WAY        28                       /* cp15 c7 way incrementer 1<<L2_I7WAY */
+#define L2_I9WAY        28                       /* cp15 c9 way incrementer 1<<L2_I9WAY */
 
-#define L2_SWAY (L2_CSIZE - L2_NWAY)            /* set size 1<<L2_SWAY */
-#define L2_NSET (L2_SWAY - L2_CLINE)            /* lines per way 1<<L2_NSET */
+#define L2_SWAY         (L2_CSIZE - L2_NWAY)     /* set size 1<<L2_SWAY */
+#define L2_NSET         (L2_SWAY - L2_CLINE)     /* lines per way 1<<L2_NSET */
 
 #elif defined (APPLEHURRICANE)
 
 /* I-Cache */
-#define MMU_I_CLINE     6               /* cache line size as 1<<MMU_I_CLINE (64) */
+#define MMU_I_CLINE 6                      /* cache line size as 1<<MMU_I_CLINE (64) */
 
 /* D-Cache */
-#define MMU_CSIZE       16              /* cache size as 1<<MMU_CSIZE (64K) */
-#define MMU_CLINE       6               /* cache line size is 1<<MMU_CLINE (64) */
-#define MMU_NWAY        2               /* set associativity 1<<MMU_NWAY (4) */
-#define MMU_I7SET       6               /* cp15 c7 set incrementer 1<<MMU_I7SET */
-#define MMU_I7WAY       30              /* cp15 c7 way incrementer 1<<MMU_I7WAY */
-#define MMU_I9WAY       30              /* cp15 c9 way incrementer 1<<MMU_I9WAY */
+#define MMU_CSIZE   16                     /* cache size as 1<<MMU_CSIZE (64K) */
+#define MMU_CLINE   6                      /* cache line size is 1<<MMU_CLINE (64) */
+#define MMU_NWAY    2                      /* set associativity 1<<MMU_NWAY (4) */
+#define MMU_I7SET   6                      /* cp15 c7 set incrementer 1<<MMU_I7SET */
+#define MMU_I7WAY   30                     /* cp15 c7 way incrementer 1<<MMU_I7WAY */
+#define MMU_I9WAY   30                     /* cp15 c9 way incrementer 1<<MMU_I9WAY */
 
-#define MMU_SWAY        (MMU_CSIZE - MMU_NWAY)  /* set size 1<<MMU_SWAY */
-#define MMU_NSET        (MMU_SWAY - MMU_CLINE)  /* lines per way 1<<MMU_NSET */
+#define MMU_SWAY    (MMU_CSIZE - MMU_NWAY) /* set size 1<<MMU_SWAY */
+#define MMU_NSET    (MMU_SWAY - MMU_CLINE) /* lines per way 1<<MMU_NSET */
 
 /* L2-Cache */
 #define __ARM_L2CACHE__ 1
  * For reasons discussed in the platform expert code, we round the reported
  * L2 size to 4MB, and adjust the other parameters accordingly.
  */
-#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__        /* cache size as 1<<L2_CSIZE */
-#define L2_CLINE        6               /* cache line size as 1<<L2_CSIZE (64) */
-#define L2_NWAY         4               /* set associativity as 1<<L2_CLINE (16, is actually 12) */
-#define L2_I7SET        6               /* cp15 c7 set incrementer 1<<L2_I7SET */
-#define L2_I7WAY        28              /* cp15 c7 way incrementer 1<<L2_I7WAY */
-#define L2_I9WAY        28              /* cp15 c9 way incremenber 1<<L2_I9WAY */
+#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__ /* cache size as 1<<L2_CSIZE */
+#define L2_CLINE        6                        /* cache line size as 1<<L2_CLINE (64) */
+#define L2_NWAY         4                        /* set associativity as 1<<L2_NWAY (16, is actually 12) */
+#define L2_I7SET        6                        /* cp15 c7 set incrementer 1<<L2_I7SET */
+#define L2_I7WAY        28                       /* cp15 c7 way incrementer 1<<L2_I7WAY */
+#define L2_I9WAY        28                       /* cp15 c9 way incrementer 1<<L2_I9WAY */
 
-#define L2_SWAY (L2_CSIZE - L2_NWAY)            /* set size 1<<L2_SWAY */
-#define L2_NSET (L2_SWAY - L2_CLINE)            /* lines per way 1<<L2_NSET */
+#define L2_SWAY         (L2_CSIZE - L2_NWAY)     /* set size 1<<L2_SWAY */
+#define L2_NSET         (L2_SWAY - L2_CLINE)     /* lines per way 1<<L2_NSET */
 
 #elif defined (APPLEMONSOON)
 
 /* I-Cache, 96KB for Monsoon, 48KB for Mistral, 6-way. */
-#define MMU_I_CLINE     6               /* cache line size as 1<<MMU_I_CLINE (64) */
+#define MMU_I_CLINE 6                      /* cache line size as 1<<MMU_I_CLINE (64) */
 
 /* D-Cache, 64KB for Monsoon, 32KB for Mistral, 4-way. */
-#define MMU_CSIZE       16              /* cache size as 1<<MMU_CSIZE (64K) */
-#define MMU_CLINE       6               /* cache line size is 1<<MMU_CLINE (64) */
-#define MMU_NWAY        2               /* set associativity 1<<MMU_NWAY (4) */
-#define MMU_I7SET       6               /* cp15 c7 set incrementer 1<<MMU_I7SET */
-#define MMU_I7WAY       30              /* cp15 c7 way incrementer 1<<MMU_I7WAY */
-#define MMU_I9WAY       30              /* cp15 c9 way incrementer 1<<MMU_I9WAY */
+#define MMU_CSIZE   16                     /* cache size as 1<<MMU_CSIZE (64K) */
+#define MMU_CLINE   6                      /* cache line size is 1<<MMU_CLINE (64) */
+#define MMU_NWAY    2                      /* set associativity 1<<MMU_NWAY (4) */
+#define MMU_I7SET   6                      /* cp15 c7 set incrementer 1<<MMU_I7SET */
+#define MMU_I7WAY   30                     /* cp15 c7 way incrementer 1<<MMU_I7WAY */
+#define MMU_I9WAY   30                     /* cp15 c9 way incrementer 1<<MMU_I9WAY */
 
-#define MMU_SWAY        (MMU_CSIZE - MMU_NWAY)  /* set size 1<<MMU_SWAY */
-#define MMU_NSET        (MMU_SWAY - MMU_CLINE)  /* lines per way 1<<MMU_NSET */
+#define MMU_SWAY    (MMU_CSIZE - MMU_NWAY) /* set size 1<<MMU_SWAY */
+#define MMU_NSET    (MMU_SWAY - MMU_CLINE) /* lines per way 1<<MMU_NSET */
 
 /* L2-Cache */
 #define __ARM_L2CACHE__ 1
  * TODO: Our L2 caches have different line sizes.  I begin to suspect
  * this may be a problem.
  */
-#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__        /* cache size as 1<<L2_CSIZE */
-#define L2_CLINE        7               /* cache line size as 1<<L2_CLINE (128) */
-#define L2_NWAY         4               /* set associativity as 1<<L2_NWAY (16) */
-#define L2_I7SET        6               /* TODO: cp15 c7 set incrementer 1<<L2_I7SET */
-#define L2_I7WAY        28              /* TODO: cp15 c7 way incrementer 1<<L2_I7WAY */
-#define L2_I9WAY        28              /* TODO: cp15 c9 way incremenber 1<<L2_I9WAY */
+#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__ /* cache size as 1<<L2_CSIZE */
+#define L2_CLINE        7                        /* cache line size as 1<<L2_CLINE (128) */
+#define L2_NWAY         4                        /* set associativity as 1<<L2_NWAY (16) */
+#define L2_I7SET        6                        /* TODO: cp15 c7 set incrementer 1<<L2_I7SET */
+#define L2_I7WAY        28                       /* TODO: cp15 c7 way incrementer 1<<L2_I7WAY */
+#define L2_I9WAY        28                       /* TODO: cp15 c9 way incrementer 1<<L2_I9WAY */
 
-#define L2_SWAY (L2_CSIZE - L2_NWAY)            /* set size 1<<L2_SWAY */
-#define L2_NSET (L2_SWAY - L2_CLINE)            /* lines per way 1<<L2_NSET */
+#define L2_SWAY         (L2_CSIZE - L2_NWAY)     /* set size 1<<L2_SWAY */
+#define L2_NSET         (L2_SWAY - L2_CLINE)     /* lines per way 1<<L2_NSET */
 
 #elif defined (BCM2837) /* Raspberry Pi 3 */
 
 /* I-Cache. We don't have a detailed spec, so we just follow the ARM technical reference. */
-#define MMU_I_CLINE     6
+#define MMU_I_CLINE 6
 
 /* D-Cache. */
-#define MMU_CSIZE       15
-#define MMU_CLINE       6
-#define MMU_NWAY        4
+#define MMU_CSIZE   15
+#define MMU_CLINE   6
+#define MMU_NWAY    2
 
-#define MMU_I7SET       6
-#define MMU_I7WAY       30
-#define MMU_I9WAY       30
+#define MMU_I7SET   6
+#define MMU_I7WAY   30
+#define MMU_I9WAY   30
 
-#define MMU_SWAY        (MMU_CSIZE - MMU_NWAY)
-#define MMU_NSET        (MMU_SWAY - MMU_CLINE)
+#define MMU_SWAY    (MMU_CSIZE - MMU_NWAY)
+#define MMU_NSET    (MMU_SWAY - MMU_CLINE)
 
 #define __ARM_L2CACHE__ 1
 
 #define L2_I7SET        6
 #define L2_I7WAY        28
 #define L2_I9WAY        28
-#define L2_SWAY (L2_CSIZE - L2_NWAY)
-#define L2_NSET (L2_SWAY - L2_CLINE)
+#define L2_SWAY         (L2_CSIZE - L2_NWAY)
+#define L2_NSET         (L2_SWAY - L2_CLINE)
 
 #else
 #error processor not supported
  * |0|TE|AFE|TRE|NMFI|0|EE|VE|11|FI|UWXN|WXN|1|HA|1|0|RR| V| I| Z|SW|000|1|C15BEN|11|C|A|M|
  * +-+--+---+---+----+-+--+--+--+--+----+---+-+--+-+-+--+--+--+--+--+---+-+------+--+-+-+-+
  *
- *             TE                              Thumb Exception enable
- *             AFE                             Access flag enable
- *             TRE                             TEX remap enable
- *             NMFI                    Non-maskable FIQ (NMFI) support
- *             EE                              Exception Endianness
- *             VE                              Interrupt Vectors Enable
- *             FI                              Fast interrupts configuration enable
- *             ITD                             IT Disable
- *             UWXN                    Unprivileged write permission implies PL1 XN
- *             WXN                             Write permission implies XN
- *             HA                              Hardware Access flag enable
- *             RR                              Round Robin select
- *             V                               High exception vectors
- *             I                               Instruction cache enable
- *             Z                               Branch prediction enable
- *             SW                              SWP/SWPB enable
- *             C15BEN                  CP15 barrier enable
- *             C                               Cache enable
- *             A                               Alignment check enable
- *             M                               MMU enable
+ * Where:
+ *   TE:     Thumb Exception enable
+ *   AFE:    Access flag enable
+ *   TRE:    TEX remap enable
+ *   NMFI:   Non-maskable FIQ (NMFI) support
+ *   EE:     Exception Endianness
+ *   VE:     Interrupt Vectors Enable
+ *   FI:     Fast interrupts configuration enable
+ *   ITD:    IT Disable
+ *   UWXN:   Unprivileged write permission implies PL1 XN
+ *   WXN:    Write permission implies XN
+ *   HA:     Hardware Access flag enable
+ *   RR:     Round Robin select
+ *   V:      High exception vectors
+ *   I:      Instruction cache enable
+ *   Z:      Branch prediction enable
+ *   SW:     SWP/SWPB enable
+ *   C15BEN: CP15 barrier enable
+ *   C:      Cache enable
+ *   A:      Alignment check enable
+ *   M:      MMU enable
  */
 
-#define SCTLR_RESERVED                                  0x82DD8394
+#define SCTLR_RESERVED 0x82DD8394
 
-#define SCTLR_ENABLE                                    0x00000001      /* MMU enable */
-#define SCTLR_ALIGN                                             0x00000002      /* Alignment check enable */
-#define SCTLR_DCACHE                                    0x00000004      /* Data or Unified Cache enable */
-#define SCTLR_BEN                                               0x00000040      /* CP15 barrier enable */
-#define SCTLR_SW                                                0x00000400      /* SWP/SWPB Enable */
-#define SCTLR_PREDIC                                    0x00000800      /* Branch prediction enable */
-#define SCTLR_ICACHE                                    0x00001000      /* Instruction cache enabled. */
-#define SCTLR_HIGHVEC                                   0x00002000      /* Vector table at 0xffff0000 */
-#define SCTLR_RROBIN                                    0x00004000      /* Round Robin replacement */
-#define SCTLR_HA                                                0x00020000      /* Hardware Access flag enable */
-#define SCTLR_NMFI                                              0x08000000      /* Non-maskable FIQ */
-#define SCTLR_TRE                                               0x10000000      /* TEX remap enable */
-#define SCTLR_AFE                                               0x20000000      /* Access flag enable */
-#define SCTLR_TE                                                0x40000000      /* Thumb Exception enable */
+#define SCTLR_ENABLE   0x00000001 /* MMU enable */
+#define SCTLR_ALIGN    0x00000002 /* Alignment check enable */
+#define SCTLR_DCACHE   0x00000004 /* Data or Unified Cache enable */
+#define SCTLR_BEN      0x00000040 /* CP15 barrier enable */
+#define SCTLR_SW       0x00000400 /* SWP/SWPB Enable */
+#define SCTLR_PREDIC   0x00000800 /* Branch prediction enable */
+#define SCTLR_ICACHE   0x00001000 /* Instruction cache enabled. */
+#define SCTLR_HIGHVEC  0x00002000 /* Vector table at 0xffff0000 */
+#define SCTLR_RROBIN   0x00004000 /* Round Robin replacement */
+#define SCTLR_HA       0x00020000 /* Hardware Access flag enable */
+#define SCTLR_NMFI     0x08000000 /* Non-maskable FIQ */
+#define SCTLR_TRE      0x10000000 /* TEX remap enable */
+#define SCTLR_AFE      0x20000000 /* Access flag enable */
+#define SCTLR_TE       0x40000000 /* Thumb Exception enable */
 
-#define SCTLR_DEFAULT                                   (SCTLR_AFE|SCTLR_TRE|SCTLR_HIGHVEC|SCTLR_ICACHE|SCTLR_PREDIC|SCTLR_DCACHE|SCTLR_ENABLE)
+#define SCTLR_DEFAULT \
+       (SCTLR_AFE|SCTLR_TRE|SCTLR_HIGHVEC|SCTLR_ICACHE|SCTLR_PREDIC|SCTLR_DCACHE|SCTLR_ENABLE)
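
For reference, the default ORs together to a single constant; worked out from the values above:

SCTLR_DEFAULT = 0x20000000   /* SCTLR_AFE     */
              | 0x10000000   /* SCTLR_TRE     */
              | 0x00002000   /* SCTLR_HIGHVEC */
              | 0x00001000   /* SCTLR_ICACHE  */
              | 0x00000800   /* SCTLR_PREDIC  */
              | 0x00000004   /* SCTLR_DCACHE  */
              | 0x00000001   /* SCTLR_ENABLE  */
              = 0x30003805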
 
 
 /*
  * +---------------------------------------------------------------+
  */
 
-#define PRRR_NS1                                                0x00080000
-#define PRRR_NS0                                                0x00040000
-#define PRRR_DS1                                                0x00020000
-#define PRRR_DS0                                                0x00010000
-#define PRRR_NOSn_ISH(region)                   (0x1<<((region)+24))
+#define PRRR_NS1 0x00080000
+#define PRRR_NS0 0x00040000
+#define PRRR_DS1 0x00020000
+#define PRRR_DS0 0x00010000
+
+#define PRRR_NOSn_ISH(region) (0x1<<((region)+24))
 
 #if defined (ARMA7)
-#define PRRR_SETUP                      (0x1F08022A)
+#define PRRR_SETUP (0x1F08022A)
 #else
 #error processor not supported
 #endif
  * +---------------------------------------------------------------+
  */
 
-#define NMRR_DISABLED                                   0x0     /*  Non-cacheable */
-#define NMRR_WRITEBACK                                  0x1     /*  Write-Back, Write-Allocate */
-#define NMRR_WRITETHRU                                  0x2     /*  Write-Through, no Write-Allocate */
-#define NMRR_WRITEBACKNO                                0x3     /*  Write-Back, no Write-Allocate */
+#define NMRR_DISABLED    0x0 /* Non-cacheable */
+#define NMRR_WRITEBACK   0x1 /* Write-Back, Write-Allocate */
+#define NMRR_WRITETHRU   0x2 /* Write-Through, no Write-Allocate */
+#define NMRR_WRITEBACKNO 0x3 /* Write-Back, no Write-Allocate */
 
 #if defined (ARMA7)
-#define NMRR_SETUP                      (0x01210121)
+#define NMRR_SETUP (0x01210121)
 #else
 #error processor not supported
 #endif
  *
  */
 
-#define TTBR_IRGN_DISBALED                              0x00000000      /* inner non-cacheable */
-#define TTBR_IRGN_WRITEBACK                             0x00000040      /* inner write back and allocate */
-#define TTBR_IRGN_WRITETHRU                             0x00000001      /* inner write thru */
-#define TTBR_IRGN_WRITEBACKNO                   0x00000041      /* inner write back no allocate */
+#define TTBR_IRGN_DISBALED    0x00000000 /* inner non-cacheable */
+#define TTBR_IRGN_WRITEBACK   0x00000040 /* inner write back and allocate */
+#define TTBR_IRGN_WRITETHRU   0x00000001 /* inner write thru */
+#define TTBR_IRGN_WRITEBACKNO 0x00000041 /* inner write back no allocate */
 
-#define TTBR_RGN_DISBALED                               0x00000000      /* outer non-cacheable */
-#define TTBR_RGN_WRITEBACK                              0x00000008      /* outer write back and allocate */
-#define TTBR_RGN_WRITETHRU                              0x00000010      /* outer write thru outer cache */
-#define TTBR_RGN_WRITEBACKNO                    0x00000018      /* outer write back no allocate */
+#define TTBR_RGN_DISBALED     0x00000000 /* outer non-cacheable */
+#define TTBR_RGN_WRITEBACK    0x00000008 /* outer write back and allocate */
+#define TTBR_RGN_WRITETHRU    0x00000010 /* outer write thru outer cache */
+#define TTBR_RGN_WRITEBACKNO  0x00000018 /* outer write back no allocate */
 
-#define TTBR_SHARED                                             0x00000002      /* Shareable memory atribute */
-#define TTBR_SHARED_NOTOUTER                    0x00000020      /* Outer not shareable memory atribute */
+#define TTBR_SHARED           0x00000002 /* Shareable memory attribute */
+#define TTBR_SHARED_NOTOUTER  0x00000020 /* Outer not shareable memory attribute */
 
 #if defined (ARMA7)
-#define TTBR_SETUP      (TTBR_RGN_WRITEBACK|TTBR_IRGN_WRITEBACK|TTBR_SHARED)
+#define TTBR_SETUP (TTBR_RGN_WRITEBACK|TTBR_IRGN_WRITEBACK|TTBR_SHARED)
 #else
 #error processor not supported
 #endif
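
Worked out from the values above, the ARMA7 setup selects write-back, write-allocate caching for both the inner and outer domains and marks the walk shareable:

TTBR_SETUP = TTBR_RGN_WRITEBACK | TTBR_IRGN_WRITEBACK | TTBR_SHARED
           = 0x00000008         | 0x00000040          | 0x00000002
           = 0x0000004A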
 /*
  * TTBCR: Translation Table Base Control register
  *
- *     31    3 2 0
- *     +----------+
- *     | zero | N |
- *     +----------+
+ *  31   3 2 0
+ * +----------+
+ * | zero | N |
+ * +----------+
  *
  * If N=0, always use translation table base register 0.  Otherwise, if
  * bits [31:32-N] of the address are all zero, use base register 0.  Otherwise,
  * Writing to it updates the boundary for TTB0. (0=16KB, 1=8KB, 2=4KB, etc...)
  */
 
-#define TTBCR_N_1GB_TTB0                                0x2     /* 1 GB TTB0, 3GB TTB1 */
-#define TTBCR_N_2GB_TTB0                                0x1     /* 2 GB TTB0, 2GB TTB1 */
-#define TTBCR_N_4GB_TTB0                                0x0     /* 4 GB TTB0 */
-#define TTBCR_N_MASK                                    0x3
+#define TTBCR_N_1GB_TTB0 0x2 /* 1 GB TTB0, 3GB TTB1 */
+#define TTBCR_N_2GB_TTB0 0x1 /* 2 GB TTB0, 2GB TTB1 */
+#define TTBCR_N_4GB_TTB0 0x0 /* 4 GB TTB0 */
+#define TTBCR_N_MASK     0x3
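
A minimal sketch of the N semantics described above (va_uses_ttb0 is a hypothetical helper): with N = TTBCR_N_1GB_TTB0 (2), TTB0 translates only addresses whose top two bits are zero, i.e. the bottom 1 GB, and everything else goes to TTB1.

#include <stdint.h>

static int
va_uses_ttb0(uint32_t va, uint32_t n)
{
	/* N == 0 always selects TTB0; otherwise bits [31:32-N] must be zero */
	return n == 0 || (va >> (32 - n)) == 0;
}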
 
 
 
  * DACR: Domain Access Control register
  */
 
-#define DAC_FAULT                                               0x0     /* invalid domain - everyone loses */
-#define DAC_CLIENT                                              0x1     /* client domain - use AP bits */
-#define DAC_RESERVE                                             0x2     /* reserved domain - undefined */
-#define DAC_MANAGER                                             0x3     /* manager domain - all access */
-#define DACR_SET(dom, x)                                ((x)<<((dom)<<1))
+#define DAC_FAULT   0x0 /* invalid domain - everyone loses */
+#define DAC_CLIENT  0x1 /* client domain - use AP bits */
+#define DAC_RESERVE 0x2 /* reserved domain - undefined */
+#define DAC_MANAGER 0x3 /* manager domain - all access */
 
+#define DACR_SET(dom, x) ((x)<<((dom)<<1))
 
-#define ARM_DOM_DEFAULT                                 0                       /* domain that forces AP use */
-#define ARM_DAC_SETUP                                   0x1
+
+#define ARM_DOM_DEFAULT 0   /* domain that forces AP use */
+#define ARM_DAC_SETUP   0x1
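
Worked out from the macros above, ARM_DAC_SETUP is exactly DACR_SET(ARM_DOM_DEFAULT, DAC_CLIENT), which puts domain 0 under AP-bit control and leaves every other domain faulting:

DACR_SET(ARM_DOM_DEFAULT, DAC_CLIENT)
  = DAC_CLIENT << (0 << 1)
  = 0x1
  = ARM_DAC_SETUP      /* domains 1..15 stay 0b00 == DAC_FAULT */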
 
 /*
- *     ARM 2-level Page Table support
+ * ARM 2-level Page Table support
  */
 
 /*
  *  Memory Attribute Index
  */
-#define CACHE_ATTRINDX_WRITEBACK                0x0     /* cache enabled, buffer enabled */
-#define CACHE_ATTRINDX_WRITECOMB                0x1     /* no cache, buffered writes */
-#define CACHE_ATTRINDX_WRITETHRU                0x2     /* cache enabled, buffer disabled */
-#define CACHE_ATTRINDX_DISABLE                  0x3     /* no cache, no buffer */
-#define CACHE_ATTRINDX_INNERWRITEBACK           0x4     /* inner cache enabled, buffer enabled, write allocate */
-#define CACHE_ATTRINDX_POSTED                   CACHE_ATTRINDX_DISABLE
-#define CACHE_ATTRINDX_DEFAULT                  CACHE_ATTRINDX_WRITEBACK
+#define CACHE_ATTRINDX_WRITEBACK                 0x0 /* cache enabled, buffer enabled */
+#define CACHE_ATTRINDX_WRITECOMB                 0x1 /* no cache, buffered writes */
+#define CACHE_ATTRINDX_WRITETHRU                 0x2 /* cache enabled, buffer disabled */
+#define CACHE_ATTRINDX_DISABLE                   0x3 /* no cache, no buffer */
+#define CACHE_ATTRINDX_INNERWRITEBACK            0x4 /* inner cache enabled, buffer enabled, write allocate */
+#define CACHE_ATTRINDX_POSTED                    CACHE_ATTRINDX_DISABLE
+#define CACHE_ATTRINDX_POSTED_REORDERED          CACHE_ATTRINDX_DISABLE
+#define CACHE_ATTRINDX_POSTED_COMBINED_REORDERED CACHE_ATTRINDX_DISABLE
+#define CACHE_ATTRINDX_DEFAULT                   CACHE_ATTRINDX_WRITEBACK
 
 
 /*
  * Access protection bit values
  */
-#define AP_RWNA                                                 0x0     /* priv=read-write, user=no-access  */
-#define AP_RWRW                                                 0x1     /* priv=read-write, user=read-write */
-#define AP_RONA                                                 0x2     /* priv=read-only , user=no-access  */
-#define AP_RORO                                                 0x3     /* priv=read-only , user=read-only  */
+#define AP_RWNA 0x0 /* priv=read-write, user=no-access  */
+#define AP_RWRW 0x1 /* priv=read-write, user=read-write */
+#define AP_RONA 0x2 /* priv=read-only , user=no-access  */
+#define AP_RORO 0x3 /* priv=read-only , user=read-only  */
 
 /*
  *  L1 Translation table
  *  4096 32-bit entries of 1MB of address space.
  */
 
-#define ARM_TT_L1_SIZE                                  0x00100000      /* size of area covered by a tte */
-#define ARM_TT_L1_OFFMASK                               0x000FFFFF      /* offset within an L1 entry */
-#define ARM_TT_L1_TABLE_OFFMASK                 0x000FFFFF      /* offset within an L1 entry */
-#define ARM_TT_L1_BLOCK_OFFMASK                 0x000FFFFF      /* offset within an L1 entry */
-#define ARM_TT_L1_SUPER_OFFMASK                 0x00FFFFFF      /* offset within an L1 entry */
-#define ARM_TT_L1_SHIFT                                 20                      /* page descriptor shift */
-#define ARM_TT_L1_INDEX_MASK                    0xfff00000      /* mask for getting index in L1 table from virtual address */
+#define ARM_TT_L1_SIZE          0x00100000              /* size of area covered by a tte */
+#define ARM_TT_L1_OFFMASK       0x000FFFFF              /* offset within an L1 entry */
+#define ARM_TT_L1_TABLE_OFFMASK 0x000FFFFF              /* offset within an L1 entry */
+#define ARM_TT_L1_BLOCK_OFFMASK 0x000FFFFF              /* offset within an L1 entry */
+#define ARM_TT_L1_SUPER_OFFMASK 0x00FFFFFF              /* offset within an L1 entry */
+#define ARM_TT_L1_SHIFT         20                      /* page descriptor shift */
+#define ARM_TT_L1_INDEX_MASK    0xfff00000              /* mask for getting index in L1 table from virtual address */
 
-#define ARM_TT_L1_PT_SIZE                       (4 * ARM_TT_L1_SIZE)    /* 4 L1 table entries required to consume 1 L2 pagetable page */
-#define ARM_TT_L1_PT_OFFMASK                    (ARM_TT_L1_PT_SIZE - 1)
+#define ARM_TT_L1_PT_SIZE       (4 * ARM_TT_L1_SIZE)    /* 4 L1 table entries required to consume 1 L2 pagetable page */
+#define ARM_TT_L1_PT_OFFMASK    (ARM_TT_L1_PT_SIZE - 1)
 
 /*
  *  L2 Translation table
  *  256 32-bit entries, each mapping 4KB (2^12) of address space.
  */
 
-#define ARM_TT_L2_SIZE                                  0x00001000      /* size of area covered by a tte */
-#define ARM_TT_L2_OFFMASK                               0x00000FFF      /* offset within an L2 entry */
-#define ARM_TT_L2_SHIFT                                 12                      /* page descriptor shift */
-#define ARM_TT_L2_INDEX_MASK                    0x000ff000      /* mask for getting index in L2 table from virtual address */
+#define ARM_TT_L2_SIZE       0x00001000 /* size of area covered by a tte */
+#define ARM_TT_L2_OFFMASK    0x00000FFF /* offset within an L2 entry */
+#define ARM_TT_L2_SHIFT      12         /* page descriptor shift */
+#define ARM_TT_L2_INDEX_MASK 0x000ff000 /* mask for getting index in L2 table from virtual address */
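
A minimal sketch (va_indices is a hypothetical helper) showing how the L1/L2 masks and shifts above decompose a 32-bit virtual address:

#include <stdint.h>

static void
va_indices(uint32_t va, uint32_t *l1, uint32_t *l2, uint32_t *off)
{
	*l1  = (va & 0xfff00000) >> 20;  /* ARM_TT_L1_INDEX_MASK, ARM_TT_L1_SHIFT */
	*l2  = (va & 0x000ff000) >> 12;  /* ARM_TT_L2_INDEX_MASK, ARM_TT_L2_SHIFT */
	*off =  va & 0x00000FFF;         /* ARM_TT_L2_OFFMASK */
}
/* e.g. va 0x80321ABC -> l1 0x803, l2 0x21, offset 0xABC */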
 
 /*
  * Convenience definitions for:
  *
  *   My apologies to any botanists who may be reading this.
  */
-#define ARM_TT_LEAF_SIZE                                ARM_TT_L2_SIZE
-#define ARM_TT_LEAF_OFFMASK                             ARM_TT_L2_OFFMASK
-#define ARM_TT_LEAF_SHIFT                               ARM_TT_L2_SHIFT
-#define ARM_TT_LEAF_INDEX_MASK                  ARM_TT_L2_INDEX_MASK
+#define ARM_TT_LEAF_SIZE       ARM_TT_L2_SIZE
+#define ARM_TT_LEAF_OFFMASK    ARM_TT_L2_OFFMASK
+#define ARM_TT_LEAF_SHIFT      ARM_TT_L2_SHIFT
+#define ARM_TT_LEAF_INDEX_MASK ARM_TT_L2_INDEX_MASK
 
-#define ARM_TT_TWIG_SIZE                                ARM_TT_L1_SIZE
-#define ARM_TT_TWIG_OFFMASK                             ARM_TT_L1_OFFMASK
-#define ARM_TT_TWIG_SHIFT                               ARM_TT_L1_SHIFT
-#define ARM_TT_TWIG_INDEX_MASK                  ARM_TT_L1_INDEX_MASK
+#define ARM_TT_TWIG_SIZE       ARM_TT_L1_SIZE
+#define ARM_TT_TWIG_OFFMASK    ARM_TT_L1_OFFMASK
+#define ARM_TT_TWIG_SHIFT      ARM_TT_L1_SHIFT
+#define ARM_TT_TWIG_INDEX_MASK ARM_TT_L1_INDEX_MASK
 
-#define ARM_TT_ROOT_SIZE                                ARM_TT_L1_SIZE
-#define ARM_TT_ROOT_OFFMASK                             ARM_TT_L1_OFFMASK
-#define ARM_TT_ROOT_SHIFT                               ARM_TT_L1_SHIFT
-#define ARM_TT_ROOT_INDEX_MASK                  ARM_TT_L1_INDEX_MASK
+#define ARM_TT_ROOT_SIZE       ARM_TT_L1_SIZE
+#define ARM_TT_ROOT_OFFMASK    ARM_TT_L1_OFFMASK
+#define ARM_TT_ROOT_SHIFT      ARM_TT_L1_SHIFT
+#define ARM_TT_ROOT_INDEX_MASK ARM_TT_L1_INDEX_MASK
 
 /*
- *     Level 1 Translation Table Entry
+ * Level 1 Translation Table Entry
  *
- *     page table entry
+ * page table entry
  *
- *     31                   10 9 8  5  4  2  0
- *     +----------------------+-+----+--+--+--+
- *     | page table base addr | |dom |XN|00|01|
- *     +----------------------+-+----+--+--+--+
+ *  31                  10 9 8  5  4  2  0
+ * +----------------------+-+----+--+--+--+
+ * | page table base addr | |dom |XN|00|01|
+ * +----------------------+-+----+--+--+--+
  *
- *     direct (1MB) section entry
+ * direct (1MB) section entry
  *
- *     31         20 18    15  12 10 9 8  5  4  2  0
- *     +------------+--+-+-+-+---+--+-+----+--+--+--+
- *     | base addr  |00|G|S|A|TEX|AP| |dom |XN|CB|10|
- *     +------------+--+-+-+-+---+--+-+----+--+--+--+
+ *  31        20 18    15  12 10 9 8  5  4  2  0
+ * +------------+--+-+-+-+---+--+-+----+--+--+--+
+ * | base addr  |00|G|S|A|TEX|AP| |dom |XN|CB|10|
+ * +------------+--+-+-+-+---+--+-+----+--+--+--+
  *
  *  super (16MB) section entry
  *
- *     31      24 23  18    15  12 10 9 8  5  4  2  0
- *     +---------+------+-+-+-+---+--+-+----+--+--+--+
- *     |base addr|000001|G|S|A|TEX|AP| |dom |XN|CB|10|
- *     +---------+------+-+-+-+---+--+-+----+--+--+--+
+ *  31     24 23  18    15  12 10 9 8  5  4  2  0
+ * +---------+------+-+-+-+---+--+-+----+--+--+--+
+ * |base addr|000001|G|S|A|TEX|AP| |dom |XN|CB|10|
+ * +---------+------+-+-+-+---+--+-+----+--+--+--+
  *
  * where:
- *     'G' is the notGlobal bit
- *     'S' is the shared bit
- *     'A' in the access permission extension (APX) bit
- *     'TEX' remap register control bits
- *     'AP' is the access protection
- *     'dom' is the domain for the translation
- *     'XN' is the eXecute Never bit
- *     'CB' is the cache/buffer attribute
+ *   'G' is the notGlobal bit
+ *   'S' is the shared bit
+ *   'A' is the access permission extension (APX) bit
+ *   'TEX' remap register control bits
+ *   'AP' is the access protection
+ *   'dom' is the domain for the translation
+ *   'XN' is the eXecute Never bit
+ *   'CB' is the cache/buffer attribute
  */
 
-#define ARM_TTE_EMPTY                           0x00000000                                      /* unasigned entry */
-
-#define ARM_TTE_TYPE_FAULT                      0x00000000                                      /* fault entry type */
-#define ARM_TTE_TYPE_TABLE                      0x00000001                                      /* page table type */
-#define ARM_TTE_TYPE_BLOCK                      0x00000002                                      /* section entry type */
-#define ARM_TTE_TYPE_MASK                       0x00000003                                      /* mask for extracting the type */
+#define ARM_TTE_EMPTY            0x00000000                     /* unassigned entry */
 
-#define ARM_TTE_BLOCK_NGSHIFT           17
-#define ARM_TTE_BLOCK_NG_MASK           0x00020000                               /* mask to determine notGlobal bit */
-#define ARM_TTE_BLOCK_NG                        0x00020000                               /* value for a per-process mapping */
+#define ARM_TTE_TYPE_FAULT       0x00000000                     /* fault entry type */
+#define ARM_TTE_TYPE_TABLE       0x00000001                     /* page table type */
+#define ARM_TTE_TYPE_BLOCK       0x00000002                     /* section entry type */
+#define ARM_TTE_TYPE_MASK        0x00000003                     /* mask for extracting the type */
 
-#define ARM_TTE_BLOCK_SHSHIFT           16
-#define ARM_TTE_BLOCK_SH_MASK           0x00010000                                      /* shared (SMP) mapping mask */
-#define ARM_TTE_BLOCK_SH                        0x00010000                                      /* shared (SMP) mapping */
+#define ARM_TTE_BLOCK_NGSHIFT    17
+#define ARM_TTE_BLOCK_NG_MASK    0x00020000                     /* mask to determine notGlobal bit */
+#define ARM_TTE_BLOCK_NG         0x00020000                     /* value for a per-process mapping */
 
-#define ARM_TTE_BLOCK_CBSHIFT           2
-#define ARM_TTE_BLOCK_CB(x)                     ((x) << ARM_TTE_BLOCK_CBSHIFT)
-#define ARM_TTE_BLOCK_CB_MASK           (3<< ARM_TTE_BLOCK_CBSHIFT)
+#define ARM_TTE_BLOCK_SHSHIFT    16
+#define ARM_TTE_BLOCK_SH_MASK    0x00010000                     /* shared (SMP) mapping mask */
+#define ARM_TTE_BLOCK_SH         0x00010000                     /* shared (SMP) mapping */
 
-#define ARM_TTE_BLOCK_AP0SHIFT          10
-#define ARM_TTE_BLOCK_AP0                       (1<<ARM_TTE_BLOCK_AP0SHIFT)
-#define ARM_TTE_BLOCK_AP0_MASK          (1<<ARM_TTE_BLOCK_AP0SHIFT)
+#define ARM_TTE_BLOCK_CBSHIFT    2
+#define ARM_TTE_BLOCK_CB(x)      ((x) << ARM_TTE_BLOCK_CBSHIFT)
+#define ARM_TTE_BLOCK_CB_MASK    (3<< ARM_TTE_BLOCK_CBSHIFT)
 
-#define ARM_TTE_BLOCK_AP1SHIFT          11
-#define ARM_TTE_BLOCK_AP1                       (1<<ARM_TTE_BLOCK_AP1SHIFT)
-#define ARM_TTE_BLOCK_AP1_MASK          (1<<ARM_TTE_BLOCK_AP1SHIFT)
+#define ARM_TTE_BLOCK_AP0SHIFT   10
+#define ARM_TTE_BLOCK_AP0        (1<<ARM_TTE_BLOCK_AP0SHIFT)
+#define ARM_TTE_BLOCK_AP0_MASK   (1<<ARM_TTE_BLOCK_AP0SHIFT)
 
-#define ARM_TTE_BLOCK_AP2SHIFT          15
-#define ARM_TTE_BLOCK_AP2                       (1<<ARM_TTE_BLOCK_AP2SHIFT)
-#define ARM_TTE_BLOCK_AP2_MASK          (1<<ARM_TTE_BLOCK_AP2SHIFT)
+#define ARM_TTE_BLOCK_AP1SHIFT   11
+#define ARM_TTE_BLOCK_AP1        (1<<ARM_TTE_BLOCK_AP1SHIFT)
+#define ARM_TTE_BLOCK_AP1_MASK   (1<<ARM_TTE_BLOCK_AP1SHIFT)
 
+#define ARM_TTE_BLOCK_AP2SHIFT   15
+#define ARM_TTE_BLOCK_AP2        (1<<ARM_TTE_BLOCK_AP2SHIFT)
+#define ARM_TTE_BLOCK_AP2_MASK   (1<<ARM_TTE_BLOCK_AP2SHIFT)
 
 /* access protections */
-#define ARM_TTE_BLOCK_AP(ap)            ((((ap)&0x1)<<ARM_TTE_BLOCK_AP1SHIFT)   \
-                                                                       | ((((ap)>>1)&0x1)<<ARM_TTE_BLOCK_AP2SHIFT))
+#define ARM_TTE_BLOCK_AP(ap) \
+       ((((ap)&0x1)<<ARM_TTE_BLOCK_AP1SHIFT) |    \
+       ((((ap)>>1)&0x1)<<ARM_TTE_BLOCK_AP2SHIFT))
 
 /* mask access protections */
-#define ARM_TTE_BLOCK_APMASK            (ARM_TTE_BLOCK_AP1_MASK \
-                                                                       | ARM_TTE_BLOCK_AP2_MASK)
+#define ARM_TTE_BLOCK_APMASK \
+       (ARM_TTE_BLOCK_AP1_MASK | ARM_TTE_BLOCK_AP2_MASK)
 
-#define ARM_TTE_BLOCK_AF                        ARM_TTE_BLOCK_AP0                       /* value for access */
-#define ARM_TTE_BLOCK_AFMASK            ARM_TTE_BLOCK_AP0_MASK          /* access mask */
+#define ARM_TTE_BLOCK_AF         ARM_TTE_BLOCK_AP0             /* value for access */
+#define ARM_TTE_BLOCK_AFMASK     ARM_TTE_BLOCK_AP0_MASK        /* access mask */
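
Worked out from the macros above, packing a two-bit AP_* value splits it across the non-adjacent AP1 and AP2 bits; for example:

ARM_TTE_BLOCK_AP(AP_RORO)                        /* ap == 0x3 */
  = ((0x3 & 0x1) << 11) | (((0x3 >> 1) & 0x1) << 15)
  = 0x00000800 | 0x00008000
  = 0x00008800                                   /* priv RO, user RO */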
 
-#define ARM_TTE_TABLE_MASK                      0xFFFFFC00                                      /* mask for a L2 page table entry */
-#define ARM_TTE_TABLE_SHIFT                     10                                                      /* shift for  L2 page table phys address */
+#define ARM_TTE_TABLE_MASK       0xFFFFFC00                    /* mask for an L2 page table entry */
+#define ARM_TTE_TABLE_SHIFT      10                            /* shift for L2 page table phys address */
 
-#define ARM_TTE_BLOCK_L1_MASK           0xFFF00000                                      /* mask to extract phys address from L1 section entry */
-#define ARM_TTE_BLOCK_L1_SHIFT          20                                                      /* shift for 1MB section phys address */
+#define ARM_TTE_BLOCK_L1_MASK    0xFFF00000                    /* mask to extract phys address from L1 section entry */
+#define ARM_TTE_BLOCK_L1_SHIFT   20                            /* shift for 1MB section phys address */
 
-#define ARM_TTE_SUPER_L1_MASK           0xFF000000                                      /* mask to extract phys address from L1 super entry */
-#define ARM_TTE_SUPER_L1_SHIFT          24                                                      /* shift for 16MB section phys address */
+#define ARM_TTE_SUPER_L1_MASK    0xFF000000                    /* mask to extract phys address from L1 super entry */
+#define ARM_TTE_SUPER_L1_SHIFT   24                            /* shift for 16MB section phys address */
 
-#define ARM_TTE_BLOCK_SUPER                     0x00040000                                      /* make section a 16MB section */
-#define ARM_TTE_BLOCK_SUPER_MASK        0x00F40000                                      /* make section a 16MB section */
+#define ARM_TTE_BLOCK_SUPER      0x00040000                    /* make section a 16MB section */
+#define ARM_TTE_BLOCK_SUPER_MASK 0x00F40000                    /* make section a 16MB section */
 
-#define ARM_TTE_BLOCK_NXSHIFT           4
-#define ARM_TTE_BLOCK_NX                        0x00000010                                      /* section is no execute */
-#define ARM_TTE_BLOCK_NX_MASK           0x00000010                                      /* mask for extracting no execute bit */
-#define ARM_TTE_BLOCK_PNX               ARM_TTE_BLOCK_NX
+#define ARM_TTE_BLOCK_NXSHIFT    4
+#define ARM_TTE_BLOCK_NX         0x00000010                    /* section is no execute */
+#define ARM_TTE_BLOCK_NX_MASK    0x00000010                    /* mask for extracting no execute bit */
+#define ARM_TTE_BLOCK_PNX        ARM_TTE_BLOCK_NX
 
-#define ARM_TTE_BLOCK_TEX0SHIFT         12
-#define ARM_TTE_BLOCK_TEX0                      (1<<ARM_TTE_BLOCK_TEX0SHIFT)
-#define ARM_TTE_BLOCK_TEX0_MASK         (1<<ARM_TTE_BLOCK_TEX0SHIFT)
+#define ARM_TTE_BLOCK_TEX0SHIFT  12
+#define ARM_TTE_BLOCK_TEX0       (1<<ARM_TTE_BLOCK_TEX0SHIFT)
+#define ARM_TTE_BLOCK_TEX0_MASK  (1<<ARM_TTE_BLOCK_TEX0SHIFT)
 
-#define ARM_TTE_BLOCK_TEX1SHIFT         13
-#define ARM_TTE_BLOCK_TEX1                      (1<<ARM_TTE_BLOCK_TEX1SHIFT)
-#define ARM_TTE_BLOCK_TEX1_MASK         (1<<ARM_TTE_BLOCK_TEX1SHIFT)
+#define ARM_TTE_BLOCK_TEX1SHIFT  13
+#define ARM_TTE_BLOCK_TEX1       (1<<ARM_TTE_BLOCK_TEX1SHIFT)
+#define ARM_TTE_BLOCK_TEX1_MASK  (1<<ARM_TTE_BLOCK_TEX1SHIFT)
 
-#define ARM_TTE_BLOCK_TEX2SHIFT         14
-#define ARM_TTE_BLOCK_TEX2                      (1<<ARM_TTE_BLOCK_TEX2SHIFT)
-#define ARM_TTE_BLOCK_TEX2_MASK         (1<<ARM_TTE_BLOCK_TEX2SHIFT)
+#define ARM_TTE_BLOCK_TEX2SHIFT  14
+#define ARM_TTE_BLOCK_TEX2       (1<<ARM_TTE_BLOCK_TEX2SHIFT)
+#define ARM_TTE_BLOCK_TEX2_MASK  (1<<ARM_TTE_BLOCK_TEX2SHIFT)
 
 
 /* mask memory attributes index */
-#define ARM_TTE_BLOCK_ATTRINDX(i)       ((((i)&0x3)<<ARM_TTE_BLOCK_CBSHIFT)     \
-                                                                       | ((((i)>>2)&0x1)<<ARM_TTE_BLOCK_TEX0SHIFT))
+#define ARM_TTE_BLOCK_ATTRINDX(i) \
+       ((((i)&0x3)<<ARM_TTE_BLOCK_CBSHIFT) |      \
+       ((((i)>>2)&0x1)<<ARM_TTE_BLOCK_TEX0SHIFT))
 
 /* mask memory attributes index */
-#define ARM_TTE_BLOCK_ATTRINDXMASK      (ARM_TTE_BLOCK_CB_MASK  \
-                                                                       | ARM_TTE_BLOCK_TEX0_MASK)
+#define ARM_TTE_BLOCK_ATTRINDXMASK \
+       (ARM_TTE_BLOCK_CB_MASK | ARM_TTE_BLOCK_TEX0_MASK)
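
Worked out from the macros above: the low two bits of a CACHE_ATTRINDX_* index land in the CB field and the third bit in TEX0; for example:

ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK)   /* index 0x4 */
  = ((0x4 & 0x3) << 2) | (((0x4 >> 2) & 0x1) << 12)
  = 0x00000000 | 0x00001000
  = 0x00001000
/* and ARM_TTE_BLOCK_ATTRINDXMASK = 0x0000000C | 0x00001000 = 0x0000100C */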
 
 
 /*
  *
  * The following page table entry types are possible:
  *
- *     fault page entry
- *     31                                      2  0
- *     +----------------------------------------+--+
- *     |    ignored                             |00|
- *     +----------------------------------------+--+
+ * fault page entry
+ *  31                                     2  0
+ * +----------------------------------------+--+
+ * |    ignored                             |00|
+ * +----------------------------------------+--+
  *
- *     large (64KB) page entry
- *     31             16 15  12     9   6  4 3 2  0
- *     +----------------+--+---+-+-+-+---+--+-+-+--+
- *     | base phys addr |XN|TEX|G|S|A|000|AP|C|B|01|
- *     +----------------+--+---+-+-+-+---+--+-+-+--+
+ * large (64KB) page entry
+ *  31            16 15  12     9   6  4 3 2  0
+ * +----------------+--+---+-+-+-+---+--+-+-+--+
+ * | base phys addr |XN|TEX|G|S|A|000|AP|C|B|01|
+ * +----------------+--+---+-+-+-+---+--+-+-+--+
  *
- *     small (4KB) page entry
- *     31                    12     9   6  4 3 2 1  0
- *     +-----------------------+-+-+-+---+--+-+-+-+--+
- *     | base phys addr        |G|S|A|TEX|AP|C|B|1|XN|
- *     +-----------------------+-+-+-+---+--+-+-+-+--+
+ * small (4KB) page entry
+ *  31                   12     9   6  4 3 2 1  0
+ * +-----------------------+-+-+-+---+--+-+-+-+--+
+ * | base phys addr        |G|S|A|TEX|AP|C|B|1|XN|
+ * +-----------------------+-+-+-+---+--+-+-+-+--+
  *
  * also where:
- *     'XN' is the eXecute Never bit
- *     'G' is the notGlobal (process-specific) bit
- *     'S' is the shared bit
- *     'A' in the access permission extension (ATX) bit
- *     'TEX' remap register control bits
- *     'AP' is the access protection
- *     'dom' is the domain for the translation
- *     'C' is the cache attribute
- *     'B' is the write buffer attribute
+ *   'XN' is the eXecute Never bit
+ *   'G' is the notGlobal (process-specific) bit
+ *   'S' is the shared bit
+ *   'A' is the access permission extension (APX) bit
+ *   'TEX' remap register control bits
+ *   'AP' is the access protection
+ *   'dom' is the domain for the translation
+ *   'C' is the cache attribute
+ *   'B' is the write buffer attribute
  */
 
-#define PTE_SHIFT                                       2                                               /* shift width of a pte (sizeof(pte) == (1 << PTE_SHIFT)) */
-#define PTE_PGENTRIES                           (1024 >> PTE_SHIFT)             /* number of ptes per page */
-
-#define ARM_PTE_EMPTY                                   0x00000000                      /* unasigned - invalid entry */
-
 /* markers for (invalid) PTE for a page sent to compressor */
-#define ARM_PTE_COMPRESSED      ARM_PTE_TEX1    /* compressed... */
-#define ARM_PTE_COMPRESSED_ALT  ARM_PTE_TEX2    /* ... and was "alt_acct" */
+#define ARM_PTE_COMPRESSED      ARM_PTE_TEX1                                  /* compressed... */
+#define ARM_PTE_COMPRESSED_ALT  ARM_PTE_TEX2                                  /* ... and was "alt_acct" */
 #define ARM_PTE_COMPRESSED_MASK (ARM_PTE_COMPRESSED | ARM_PTE_COMPRESSED_ALT)
-#define ARM_PTE_IS_COMPRESSED(x)                                        \
-       ((((x) & 0x3) == 0) &&          /* PTE is not valid... */       \
-        ((x) & ARM_PTE_COMPRESSED) &&  /* ...has "compressed" marker" */ \
-        ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
-         (panic("compressed PTE %p 0x%x has extra bits 0x%x: corrupted?", \
-                &(x), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
+#define ARM_PTE_IS_COMPRESSED(x, p) \
+       ((((x) & 0x3) == 0) &&                   /* PTE is not valid... */         \
+        ((x) & ARM_PTE_COMPRESSED) &&           /* ...has "compressed" marker */  \
+        ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */            \
+         (panic("compressed PTE %p 0x%x has extra bits 0x%x: corrupted?",         \
+                (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
+
+#define PTE_SHIFT              2                      /* shift width of a pte (sizeof(pte) == (1 << PTE_SHIFT)) */
+#define PTE_PGENTRIES          (1024 >> PTE_SHIFT)    /* number of ptes per page */
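The two-argument form of ARM_PTE_IS_COMPRESSED above takes the PTE's address separately, so the consistency panic can report where the entry lives even when the caller passes a local snapshot of its value. A minimal sketch of a caller, assuming xnu's usual pt_entry_t/boolean_t kernel types; the wrapper function itself is hypothetical:

```c
/*
 * Hedged sketch (not part of this diff): pte_p is the entry's real
 * location, spte a snapshot of its value; the address is used only
 * if the corruption panic fires.
 */
static boolean_t
pte_is_compressed_sketch(pt_entry_t *pte_p)
{
	pt_entry_t spte = *pte_p;               /* snapshot the entry */

	return ARM_PTE_IS_COMPRESSED(spte, pte_p);
}
```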
 
-#define ARM_PTE_TYPE_FAULT                              0x00000000                      /* fault entry type */
-#define ARM_PTE_TYPE                                    0x00000002                      /* small page entry type */
-#define ARM_PTE_TYPE_MASK                               0x00000002                      /* mask to get pte type */
+#define ARM_PTE_EMPTY          0x00000000             /* unassigned - invalid entry */
 
-#define ARM_PTE_NG_MASK                                 0x00000800                      /* mask to determine notGlobal bit */
-#define ARM_PTE_NG                                              0x00000800                       /* value for a per-process mapping */
+#define ARM_PTE_TYPE_FAULT     0x00000000             /* fault entry type */
+#define ARM_PTE_TYPE_VALID     0x00000002             /* valid L2 entry */
+#define ARM_PTE_TYPE           0x00000002             /* small page entry type */
+#define ARM_PTE_TYPE_MASK      0x00000002             /* mask to get pte type */
 
-#define ARM_PTE_SHSHIFT                                 10
-#define ARM_PTE_SHMASK                                  0x00000400                       /* shared (SMP) mapping mask */
-#define ARM_PTE_SH                                      0x00000400                       /* shared (SMP) mapping */
+#define ARM_PTE_NG_MASK        0x00000800             /* mask to determine notGlobal bit */
+#define ARM_PTE_NG             0x00000800             /* value for a per-process mapping */
 
-#define ARM_PTE_CBSHIFT                                 2
-#define ARM_PTE_CB(x)                                   ((x)<<ARM_PTE_CBSHIFT)
-#define ARM_PTE_CB_MASK                                 (0x3<<ARM_PTE_CBSHIFT)
+#define ARM_PTE_SHSHIFT        10
+#define ARM_PTE_SHMASK         0x00000400             /* shared (SMP) mapping mask */
+#define ARM_PTE_SH             0x00000400             /* shared (SMP) mapping */
 
-#define ARM_PTE_AP0SHIFT                                4
-#define ARM_PTE_AP0                                             (1<<ARM_PTE_AP0SHIFT)
-#define ARM_PTE_AP0_MASK                                (1<<ARM_PTE_AP0SHIFT)
+#define ARM_PTE_CBSHIFT        2
+#define ARM_PTE_CB(x)          ((x)<<ARM_PTE_CBSHIFT)
+#define ARM_PTE_CB_MASK        (0x3<<ARM_PTE_CBSHIFT)
 
-#define ARM_PTE_AP1SHIFT                                5
-#define ARM_PTE_AP1                                             (1<<ARM_PTE_AP1SHIFT)
-#define ARM_PTE_AP1_MASK                                (1<<ARM_PTE_AP1SHIFT)
+#define ARM_PTE_AP0SHIFT       4
+#define ARM_PTE_AP0            (1<<ARM_PTE_AP0SHIFT)
+#define ARM_PTE_AP0_MASK       (1<<ARM_PTE_AP0SHIFT)
 
-#define ARM_PTE_AP2SHIFT                                9
-#define ARM_PTE_AP2                                             (1<<ARM_PTE_AP2SHIFT)
-#define ARM_PTE_AP2_MASK                                (1<<ARM_PTE_AP2SHIFT)
+#define ARM_PTE_AP1SHIFT       5
+#define ARM_PTE_AP1            (1<<ARM_PTE_AP1SHIFT)
+#define ARM_PTE_AP1_MASK       (1<<ARM_PTE_AP1SHIFT)
+
+#define ARM_PTE_AP2SHIFT       9
+#define ARM_PTE_AP2            (1<<ARM_PTE_AP2SHIFT)
+#define ARM_PTE_AP2_MASK       (1<<ARM_PTE_AP2SHIFT)
 
 /* access protections */
-#define ARM_PTE_AP(ap)                                  ((((ap)&0x1)<<ARM_PTE_AP1SHIFT) \
-                                                                               | ((((ap)>>1)&0x1)<<ARM_PTE_AP2SHIFT))
+#define ARM_PTE_AP(ap) \
+       ((((ap)&0x1)<<ARM_PTE_AP1SHIFT) |    \
+       ((((ap)>>1)&0x1)<<ARM_PTE_AP2SHIFT))
 
 /* mask access protections */
-#define ARM_PTE_APMASK                                  (ARM_PTE_AP1_MASK       \
-                                                                               | ARM_PTE_AP2_MASK)
+#define ARM_PTE_APMASK \
+       (ARM_PTE_AP1_MASK | ARM_PTE_AP2_MASK)
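ARM_PTE_AP() scatters a 2-bit access-permission value across two non-adjacent PTE bits: bit 0 lands in AP1 (bit 5) and bit 1 in AP2 (bit 9). An illustrative check of the expansion, compiled against this header (the assertions are not from the diff):

```c
#include <assert.h>

/* Illustrative only: numeric expansion of the AP packing macros. */
static void
ap_encoding_example(void)
{
	assert(ARM_PTE_AP(1) == (1u << 5));                   /* AP1 only */
	assert(ARM_PTE_AP(2) == (1u << 9));                   /* AP2 only */
	assert(ARM_PTE_APMASK == ((1u << 5) | (1u << 9)));    /* both fields */
}
```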
 
-#define ARM_PTE_AF                                              ARM_PTE_AP0                     /* value for access */
-#define ARM_PTE_AFMASK                                  ARM_PTE_AP0_MASK        /* access mask */
+#define ARM_PTE_AF             ARM_PTE_AP0            /* value for access */
+#define ARM_PTE_AFMASK         ARM_PTE_AP0_MASK       /* access mask */
 
-#define ARM_PTE_PAGE_MASK                               0xFFFFF000                      /* mask for a small page */
-#define ARM_PTE_PAGE_SHIFT                              12                                      /* page shift for 4KB page */
+#define ARM_PTE_PAGE_MASK      0xFFFFF000             /* mask for a small page */
+#define ARM_PTE_PAGE_SHIFT     12                     /* page shift for 4KB page */
 
-#define ARM_PTE_NXSHIFT                                 0
-#define ARM_PTE_NX                                              0x00000001                       /* small page no execute */
-#define ARM_PTE_NX_MASK                                 (1<<ARM_PTE_NXSHIFT)
+#define ARM_PTE_NXSHIFT        0
+#define ARM_PTE_NX             0x00000001             /* small page no execute */
+#define ARM_PTE_NX_MASK        (1<<ARM_PTE_NXSHIFT)
 
-#define ARM_PTE_PNXSHIFT                                0
-#define ARM_PTE_PNX                                             0x00000000                      /* no privilege execute. not impl */
-#define ARM_PTE_PNX_MASK                                (0<<ARM_PTE_NXSHIFT)
+#define ARM_PTE_PNXSHIFT       0
+#define ARM_PTE_PNX            0x00000000             /* no privilege execute. not impl */
+#define ARM_PTE_PNX_MASK       (0<<ARM_PTE_NXSHIFT)
 
-#define ARM_PTE_TEX0SHIFT                               6
-#define ARM_PTE_TEX0                                    (1<<ARM_PTE_TEX0SHIFT)
-#define ARM_PTE_TEX0_MASK                               (1<<ARM_PTE_TEX0SHIFT)
+#define ARM_PTE_TEX0SHIFT      6
+#define ARM_PTE_TEX0           (1<<ARM_PTE_TEX0SHIFT)
+#define ARM_PTE_TEX0_MASK      (1<<ARM_PTE_TEX0SHIFT)
 
-#define ARM_PTE_TEX1SHIFT                               7
-#define ARM_PTE_TEX1                                    (1<<ARM_PTE_TEX1SHIFT)
-#define ARM_PTE_TEX1_MASK                               (1<<ARM_PTE_TEX1SHIFT)
+#define ARM_PTE_TEX1SHIFT      7
+#define ARM_PTE_TEX1           (1<<ARM_PTE_TEX1SHIFT)
+#define ARM_PTE_TEX1_MASK      (1<<ARM_PTE_TEX1SHIFT)
 
-#define ARM_PTE_WRITEABLESHIFT                  ARM_PTE_TEX1SHIFT
-#define ARM_PTE_WRITEABLE                               ARM_PTE_TEX1
-#define ARM_PTE_WRITEABLE_MASK                  ARM_PTE_TEX1_MASK
+#define ARM_PTE_WRITEABLESHIFT ARM_PTE_TEX1SHIFT
+#define ARM_PTE_WRITEABLE      ARM_PTE_TEX1
+#define ARM_PTE_WRITEABLE_MASK ARM_PTE_TEX1_MASK
 
-#define ARM_PTE_TEX2SHIFT                               8
-#define ARM_PTE_TEX2                                    (1<<ARM_PTE_TEX2SHIFT)
-#define ARM_PTE_TEX2_MASK                               (1<<ARM_PTE_TEX2SHIFT)
+#define ARM_PTE_TEX2SHIFT      8
+#define ARM_PTE_TEX2           (1<<ARM_PTE_TEX2SHIFT)
+#define ARM_PTE_TEX2_MASK      (1<<ARM_PTE_TEX2SHIFT)
 
-#define ARM_PTE_WIREDSHIFT                              ARM_PTE_TEX2SHIFT
-#define ARM_PTE_WIRED                                   ARM_PTE_TEX2
-#define ARM_PTE_WIRED_MASK                              ARM_PTE_TEX2_MASK
+#define ARM_PTE_WIREDSHIFT     ARM_PTE_TEX2SHIFT
+#define ARM_PTE_WIRED          ARM_PTE_TEX2
+#define ARM_PTE_WIRED_MASK     ARM_PTE_TEX2_MASK
 
 /* mask memory attributes index */
-#define ARM_PTE_ATTRINDX(indx)                  ((((indx)&0x3)<<ARM_PTE_CBSHIFT)        \
-                                                                               | ((((indx)>>2)&0x1)<<ARM_PTE_TEX0SHIFT))
+#define ARM_PTE_ATTRINDX(indx) \
+       ((((indx)&0x3)<<ARM_PTE_CBSHIFT) |      \
+       ((((indx)>>2)&0x1)<<ARM_PTE_TEX0SHIFT))
 
 /* mask memory attributes index */
-#define ARM_PTE_ATTRINDXMASK                    (ARM_PTE_CB_MASK        \
-                                                                               | ARM_PTE_TEX0_MASK)
-
-#define ARM_SMALL_PAGE_SIZE                     (4096)                          /* 4KB */
-#define ARM_LARGE_PAGE_SIZE                     (64*1024)                       /* 64KB */
-#define ARM_SECTION_SIZE                                (1024*1024)                     /* 1MB */
-#define ARM_SUPERSECTION_SIZE                   (16*1024*1024)          /* 16MB */
-
+#define ARM_PTE_ATTRINDXMASK \
+       (ARM_PTE_CB_MASK | ARM_PTE_TEX0_MASK)
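Both ATTRINDX macros (block- and page-level) split a 3-bit memory-attribute index the same way: the low two bits select the C/B field and bit 2 selects TEX0. A hedged worked example, assuming this header is in scope:

```c
#include <assert.h>

/* Illustrative only: index 0b101 -> C/B = 0b01 (bit 2), TEX0 = 1 (bit 6). */
static void
attrindx_encoding_example(void)
{
	assert(ARM_PTE_ATTRINDX(5) == ((1u << 2) | (1u << 6)));
	assert(ARM_PTE_ATTRINDXMASK == ((0x3u << 2) | (1u << 6)));
}
```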
+
+#define ARM_SMALL_PAGE_SIZE    (4096)         /* 4KB */
+#define ARM_LARGE_PAGE_SIZE    (64*1024)      /* 64KB */
+#define ARM_SECTION_SIZE       (1024*1024)    /* 1MB */
+#define ARM_SUPERSECTION_SIZE  (16*1024*1024) /* 16MB */
+
+#define TLBI_ADDR_SHIFT (12)
+#define TLBI_ADDR_SIZE  (20)
+#define TLBI_ADDR_MASK  (((1ULL << TLBI_ADDR_SIZE) - 1))
+#define TLBI_ASID_SHIFT (0)
+#define TLBI_ASID_SIZE  (8)
+#define TLBI_ASID_MASK  (((1ULL << TLBI_ASID_SIZE) - 1))
 #endif
 
 /*
  * Format of the Debug Status and Control Register (DBGDSCR)
  */
-#define ARM_DBGDSCR_RXFULL                                      (1 << 30)
-#define ARM_DBGDSCR_TXFULL                                      (1 << 29)
-#define ARM_DBGDSCR_RXFULL_1                            (1 << 27)
-#define ARM_DBGDSCR_TXFULL_1                            (1 << 26)
-#define ARM_DBGDSCR_PIPEADV                                     (1 << 25)
-#define ARM_DBGDSCR_INSTRCOMPL_1                        (1 << 24)
-#define ARM_DBGDSCR_EXTDCCMODE_MASK                     (3 << 20)
-#define ARM_DBGDSCR_EXTDCCMODE_NONBLOCKING      (0 << 20)
-#define ARM_DBGDSCR_EXTDCCMODE_STALL            (1 << 20)
-#define ARM_DBGDSCR_EXTDCCMODE_FAST                     (1 << 20)
-#define ARM_DBGDSCR_ADADISCARD                          (1 << 19)
-#define ARM_DBGDSCR_NS                                          (1 << 18)
-#define ARM_DBGDSCR_SPNIDDIS                            (1 << 17)
-#define ARM_DBGDSCR_SPIDDIS                                     (1 << 16)
-#define ARM_DBGDSCR_MDBGEN                                      (1 << 15)
-#define ARM_DBGDSCR_HDBGEN                                      (1 << 14)
-#define ARM_DBGDSCR_ITREN                                       (1 << 13)
-#define ARM_DBGDSCR_UDCCDIS                                     (1 << 12)
-#define ARM_DBGDSCR_INTDIS                                      (1 << 11)
-#define ARM_DBGDSCR_DBGACK                                      (1 << 10)
-#define ARM_DBGDSCR_DBGNOPWRDWN                         (1 << 9)
-#define ARM_DBGDSCR_UND_1                                       (1 << 8)
-#define ARM_DBGDSCR_ADABORT_1                           (1 << 7)
-#define ARM_DBGDSCR_SDABORT_1                           (1 << 6)
-#define ARM_DBGDSCR_MOE_MASK                            (15 << 2)
-#define ARM_DBGDSCR_MOE_HALT_REQUEST            (0 << 2)
-#define ARM_DBGDSCR_MOE_BREAKPOINT                      (1 << 2)
-#define ARM_DBGDSCR_MOE_ASYNC_WATCHPOINT        (2 << 2)
-#define ARM_DBGDSCR_MOE_BKPT_INSTRUCTION        (3 << 2)
-#define ARM_DBGDSCR_MOE_EXT_DEBUG_REQ           (4 << 2)
-#define ARM_DBGDSCR_MOE_VECTOR_CATCH            (5 << 2)
-#define ARM_DBGDSCR_MOE_DSIDE_ABORT                     (6 << 2)
-#define ARM_DBGDSCR_MOE_ISIDE_ABORT                     (7 << 2)
-#define ARM_DBGDSCR_MOE_OS_UNLOCK_CATCH         (8 << 2)
-#define ARM_DBGDSCR_MOE_SYNC_WATCHPOINT         (10 << 2)
-
-#define ARM_DBGDSCR_RESTARTED                           (1 << 1)
-#define ARM_DBGDSCR_HALTED                                      (1 << 0)
+#define ARM_DBGDSCR_RXFULL                 (1 << 30)
+#define ARM_DBGDSCR_TXFULL                 (1 << 29)
+#define ARM_DBGDSCR_RXFULL_1               (1 << 27)
+#define ARM_DBGDSCR_TXFULL_1               (1 << 26)
+#define ARM_DBGDSCR_PIPEADV                (1 << 25)
+#define ARM_DBGDSCR_INSTRCOMPL_1           (1 << 24)
+#define ARM_DBGDSCR_EXTDCCMODE_MASK        (3 << 20)
+#define ARM_DBGDSCR_EXTDCCMODE_NONBLOCKING (0 << 20)
+#define ARM_DBGDSCR_EXTDCCMODE_STALL       (1 << 20)
+#define ARM_DBGDSCR_EXTDCCMODE_FAST        (1 << 20)
+#define ARM_DBGDSCR_ADADISCARD             (1 << 19)
+#define ARM_DBGDSCR_NS                     (1 << 18)
+#define ARM_DBGDSCR_SPNIDDIS               (1 << 17)
+#define ARM_DBGDSCR_SPIDDIS                (1 << 16)
+#define ARM_DBGDSCR_MDBGEN                 (1 << 15)
+#define ARM_DBGDSCR_HDBGEN                 (1 << 14)
+#define ARM_DBGDSCR_ITREN                  (1 << 13)
+#define ARM_DBGDSCR_UDCCDIS                (1 << 12)
+#define ARM_DBGDSCR_INTDIS                 (1 << 11)
+#define ARM_DBGDSCR_DBGACK                 (1 << 10)
+#define ARM_DBGDSCR_DBGNOPWRDWN            (1 << 9)
+#define ARM_DBGDSCR_UND_1                  (1 << 8)
+#define ARM_DBGDSCR_ADABORT_1              (1 << 7)
+#define ARM_DBGDSCR_SDABORT_1              (1 << 6)
+#define ARM_DBGDSCR_MOE_MASK               (15 << 2)
+#define ARM_DBGDSCR_MOE_HALT_REQUEST       (0 << 2)
+#define ARM_DBGDSCR_MOE_BREAKPOINT         (1 << 2)
+#define ARM_DBGDSCR_MOE_ASYNC_WATCHPOINT   (2 << 2)
+#define ARM_DBGDSCR_MOE_BKPT_INSTRUCTION   (3 << 2)
+#define ARM_DBGDSCR_MOE_EXT_DEBUG_REQ      (4 << 2)
+#define ARM_DBGDSCR_MOE_VECTOR_CATCH       (5 << 2)
+#define ARM_DBGDSCR_MOE_DSIDE_ABORT        (6 << 2)
+#define ARM_DBGDSCR_MOE_ISIDE_ABORT        (7 << 2)
+#define ARM_DBGDSCR_MOE_OS_UNLOCK_CATCH    (8 << 2)
+#define ARM_DBGDSCR_MOE_SYNC_WATCHPOINT    (10 << 2)
+
+#define ARM_DBGDSCR_RESTARTED              (1 << 1)
+#define ARM_DBGDSCR_HALTED                 (1 << 0)
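The MOE field of DBGDSCR records why the core entered debug state. A hedged sketch of decoding it with the masks above; the register read itself is elided and the function is hypothetical:

```c
#include <stdint.h>

/* Hedged sketch: map the method-of-entry field to a name. */
static const char *
dbgdscr_moe_name(uint32_t dbgdscr)
{
	switch (dbgdscr & ARM_DBGDSCR_MOE_MASK) {
	case ARM_DBGDSCR_MOE_HALT_REQUEST:     return "halt request";
	case ARM_DBGDSCR_MOE_BREAKPOINT:       return "breakpoint";
	case ARM_DBGDSCR_MOE_ASYNC_WATCHPOINT: return "async watchpoint";
	case ARM_DBGDSCR_MOE_BKPT_INSTRUCTION: return "BKPT instruction";
	case ARM_DBGDSCR_MOE_EXT_DEBUG_REQ:    return "external debug request";
	case ARM_DBGDSCR_MOE_VECTOR_CATCH:     return "vector catch";
	default:                               return "other";
	}
}
```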
 
 /*
  * Format of the Debug & Watchpoint Breakpoint Value and Control Registers
  * Using ARMv7 names; ARMv6 and ARMv6.1 are bit-compatible
  */
-#define ARM_DBG_VR_ADDRESS_MASK             0xFFFFFFFC  /* BVR & WVR */
-#define ARM_DBGBVR_CONTEXTID_MASK           0xFFFFFFFF  /* BVR only  */
+#define ARM_DBG_VR_ADDRESS_MASK             0xFFFFFFFC /* BVR & WVR */
+#define ARM_DBGBVR_CONTEXTID_MASK           0xFFFFFFFF /* BVR only  */
 
-#define ARM_DBG_CR_ADDRESS_MASK_MASK        0x1F000000  /* BCR & WCR */
-#define ARM_DBGBCR_MATCH_MASK               (1 << 22)   /* BCR only  */
+#define ARM_DBG_CR_ADDRESS_MASK_MASK        0x1F000000 /* BCR & WCR */
+#define ARM_DBGBCR_MATCH_MASK               (1 << 22)  /* BCR only  */
 #define ARM_DBGBCR_MATCH_MATCH              (0 << 22)
 #define ARM_DBGBCR_MATCH_MISMATCH           (1 << 22)
-#define ARM_DBGBCR_TYPE_MASK                (1 << 21)   /* BCR only */
+#define ARM_DBGBCR_TYPE_MASK                (1 << 21)  /* BCR only */
 #define ARM_DBGBCR_TYPE_IVA                 (0 << 21)
 #define ARM_DBGBCR_TYPE_CONTEXTID           (1 << 21)
-#define ARM_DBG_CR_LINKED_MASK              (1 << 20)   /* BCR & WCR */
+#define ARM_DBG_CR_LINKED_MASK              (1 << 20)  /* BCR & WCR */
 #define ARM_DBG_CR_LINKED_LINKED            (1 << 20)
 #define ARM_DBG_CR_LINKED_UNLINKED          (0 << 20)
-#define ARM_DBG_CR_LINKED_BRP_MASK          0x000F0000  /* BCR & WCR */
-#define ARM_DBG_CR_SECURITY_STATE_MASK      (3 << 14)   /* BCR & WCR */
+#define ARM_DBG_CR_LINKED_BRP_MASK          0x000F0000 /* BCR & WCR */
+#define ARM_DBG_CR_SECURITY_STATE_MASK      (3 << 14)  /* BCR & WCR */
 #define ARM_DBG_CR_SECURITY_STATE_BOTH      (0 << 14)
 #define ARM_DBG_CR_SECURITY_STATE_NONSECURE (1 << 14)
 #define ARM_DBG_CR_SECURITY_STATE_SECURE    (2 << 14)
-#define ARM_DBG_CR_HIGHER_MODE_MASK         (1 << 13)   /* BCR & WCR */
+#define ARM_DBG_CR_HIGHER_MODE_MASK         (1 << 13)  /* BCR & WCR */
 #define ARM_DBG_CR_HIGHER_MODE_ENABLE       (1 << 13)
 #define ARM_DBG_CR_HIGHER_MODE_DISABLE      (0 << 13)
-#define ARM_DBGWCR_BYTE_ADDRESS_SELECT_MASK 0x00001FE0  /* WCR only  */
-#define ARM_DBG_CR_BYTE_ADDRESS_SELECT_MASK 0x000001E0  /* BCR & WCR */
-#define ARM_DBGWCR_ACCESS_CONTROL_MASK      (3 << 3)    /* WCR only */
+#define ARM_DBGWCR_BYTE_ADDRESS_SELECT_MASK 0x00001FE0 /* WCR only  */
+#define ARM_DBG_CR_BYTE_ADDRESS_SELECT_MASK 0x000001E0 /* BCR & WCR */
+#define ARM_DBGWCR_ACCESS_CONTROL_MASK      (3 << 3)   /* WCR only */
 #define ARM_DBCWCR_ACCESS_CONTROL_LOAD      (1 << 3)
 #define ARM_DBCWCR_ACCESS_CONTROL_STORE     (2 << 3)
 #define ARM_DBCWCR_ACCESS_CONTROL_ANY       (3 << 3)
-#define ARM_DBG_CR_MODE_CONTROL_MASK        (3 << 1)    /* BCR & WCR */
-#define ARM_DBG_CR_MODE_CONTROL_U_S_S       (0 << 1)    /* BCR only  */
-#define ARM_DBG_CR_MODE_CONTROL_PRIVILEGED  (1 << 1)    /* BCR & WCR */
-#define ARM_DBG_CR_MODE_CONTROL_USER        (2 << 1)    /* BCR & WCR */
-#define ARM_DBG_CR_MODE_CONTROL_ANY         (3 << 1)    /* BCR & WCR */
-#define ARM_DBG_CR_ENABLE_MASK              (1 << 0)    /* BCR & WCR */
+#define ARM_DBG_CR_MODE_CONTROL_MASK        (3 << 1)   /* BCR & WCR */
+#define ARM_DBG_CR_MODE_CONTROL_U_S_S       (0 << 1)   /* BCR only  */
+#define ARM_DBG_CR_MODE_CONTROL_PRIVILEGED  (1 << 1)   /* BCR & WCR */
+#define ARM_DBG_CR_MODE_CONTROL_USER        (2 << 1)   /* BCR & WCR */
+#define ARM_DBG_CR_MODE_CONTROL_ANY         (3 << 1)   /* BCR & WCR */
+#define ARM_DBG_CR_ENABLE_MASK              (1 << 0)   /* BCR & WCR */
 #define ARM_DBG_CR_ENABLE_ENABLE            (1 << 0)
 #define ARM_DBG_CR_ENABLE_DISABLE           (0 << 0)
 
 /*
  * Format of the Device Power-down and Reset Status Register (DBGPRSR)
  */
-#define ARM_DBGPRSR_STICKY_RESET_STATUS         (1 << 3)
-#define ARM_DBGPRSR_RESET_STATUS                        (1 << 2)
-#define ARM_DBGPRSR_STICKY_POWERDOWN_STATUS     (1 << 1)
-#define ARM_DBGPRSR_POWERUP_STATUS                      (1 << 0)
+#define ARM_DBGPRSR_STICKY_RESET_STATUS     (1 << 3)
+#define ARM_DBGPRSR_RESET_STATUS            (1 << 2)
+#define ARM_DBGPRSR_STICKY_POWERDOWN_STATUS (1 << 1)
+#define ARM_DBGPRSR_POWERUP_STATUS          (1 << 0)
 
 /*
  * Format of the OS Lock Access (DBGOSLAR) and Lock Access Registers (DBGLAR)
  */
-#define ARM_DBG_LOCK_ACCESS_KEY                         0xC5ACCE55
+#define ARM_DBG_LOCK_ACCESS_KEY 0xC5ACCE55
 
 /* ARMv7 Debug register map */
-#define ARM_DEBUG_OFFSET_DBGDIDR                        (0x000)
-#define ARM_DEBUG_OFFSET_DBGWFAR                        (0x018)
-#define ARM_DEBUG_OFFSET_DBGVCR                         (0x01C)
-#define ARM_DEBUG_OFFSET_DBGECR                         (0x024)
-#define ARM_DEBUG_OFFSET_DBGDSCCR                       (0x028)
-#define ARM_DEBUG_OFFSET_DBGDSMCR                       (0x02C)
-#define ARM_DEBUG_OFFSET_DBGDTRRX                       (0x080)
-#define ARM_DEBUG_OFFSET_DBGITR                         (0x084) /* Write-only */
-#define ARM_DEBUG_OFFSET_DBGPCSR                        (0x084) /* Read-only */
-#define ARM_DEBUG_OFFSET_DBGDSCR                        (0x088)
-#define ARM_DEBUG_OFFSET_DBGDTRTX                       (0x08C)
-#define ARM_DEBUG_OFFSET_DBGDRCR                        (0x090)
-#define ARM_DEBUG_OFFSET_DBGBVR                         (0x100) /* 0x100 - 0x13C */
-#define ARM_DEBUG_OFFSET_DBGBCR                         (0x140) /* 0x140 - 0x17C */
-#define ARM_DEBUG_OFFSET_DBGWVR                         (0x180) /* 0x180 - 0x1BC */
-#define ARM_DEBUG_OFFSET_DBGWCR                         (0x1C0) /* 0x1C0 - 0x1FC */
-#define ARM_DEBUG_OFFSET_DBGOSLAR                       (0x300)
-#define ARM_DEBUG_OFFSET_DBGOSLSR                       (0x304)
-#define ARM_DEBUG_OFFSET_DBGOSSRR                       (0x308)
-#define ARM_DEBUG_OFFSET_DBGPRCR                        (0x310)
-#define ARM_DEBUG_OFFSET_DBGPRSR                        (0x314)
-#define ARM_DEBUG_OFFSET_DBGITCTRL                      (0xF00)
-#define ARM_DEBUG_OFFSET_DBGCLAIMSET            (0xFA0)
-#define ARM_DEBUG_OFFSET_DBGCLAIMCLR            (0xFA4)
-#define ARM_DEBUG_OFFSET_DBGLAR                         (0xFB0)
-#define ARM_DEBUG_OFFSET_DBGLSR                         (0xFB4)
-#define ARM_DEBUG_OFFSET_DBGAUTHSTATUS          (0xFB8)
-#define ARM_DEBUG_OFFSET_DBGDEVID                       (0xFC8)
-#define ARM_DEBUG_OFFSET_DBGDEVTYPE                     (0xFCC)
-#define ARM_DEBUG_OFFSET_DBGPID0                        (0xFD0)
-#define ARM_DEBUG_OFFSET_DBGPID1                        (0xFD4)
-#define ARM_DEBUG_OFFSET_DBGPID2                        (0xFD8)
-#define ARM_DEBUG_OFFSET_DBGPID3                        (0xFDA)
-#define ARM_DEBUG_OFFSET_DBGPID4                        (0xFDC)
-#define ARM_DEBUG_OFFSET_DBGCID0                        (0xFF0)
-#define ARM_DEBUG_OFFSET_DBGCID1                        (0xFF4)
-#define ARM_DEBUG_OFFSET_DBGCID2                        (0xFF8)
-#define ARM_DEBUG_OFFSET_DBGCID3                        (0xFFA)
-#define ARM_DEBUG_OFFSET_DBGCID4                        (0xFFC)
+#define ARM_DEBUG_OFFSET_DBGDIDR       (0x000)
+#define ARM_DEBUG_OFFSET_DBGWFAR       (0x018)
+#define ARM_DEBUG_OFFSET_DBGVCR        (0x01C)
+#define ARM_DEBUG_OFFSET_DBGECR        (0x024)
+#define ARM_DEBUG_OFFSET_DBGDSCCR      (0x028)
+#define ARM_DEBUG_OFFSET_DBGDSMCR      (0x02C)
+#define ARM_DEBUG_OFFSET_DBGDTRRX      (0x080)
+#define ARM_DEBUG_OFFSET_DBGITR        (0x084) /* Write-only */
+#define ARM_DEBUG_OFFSET_DBGPCSR       (0x084) /* Read-only */
+#define ARM_DEBUG_OFFSET_DBGDSCR       (0x088)
+#define ARM_DEBUG_OFFSET_DBGDTRTX      (0x08C)
+#define ARM_DEBUG_OFFSET_DBGDRCR       (0x090)
+#define ARM_DEBUG_OFFSET_DBGBVR        (0x100) /* 0x100 - 0x13C */
+#define ARM_DEBUG_OFFSET_DBGBCR        (0x140) /* 0x140 - 0x17C */
+#define ARM_DEBUG_OFFSET_DBGWVR        (0x180) /* 0x180 - 0x1BC */
+#define ARM_DEBUG_OFFSET_DBGWCR        (0x1C0) /* 0x1C0 - 0x1FC */
+#define ARM_DEBUG_OFFSET_DBGOSLAR      (0x300)
+#define ARM_DEBUG_OFFSET_DBGOSLSR      (0x304)
+#define ARM_DEBUG_OFFSET_DBGOSSRR      (0x308)
+#define ARM_DEBUG_OFFSET_DBGPRCR       (0x310)
+#define ARM_DEBUG_OFFSET_DBGPRSR       (0x314)
+#define ARM_DEBUG_OFFSET_DBGITCTRL     (0xF00)
+#define ARM_DEBUG_OFFSET_DBGCLAIMSET   (0xFA0)
+#define ARM_DEBUG_OFFSET_DBGCLAIMCLR   (0xFA4)
+#define ARM_DEBUG_OFFSET_DBGLAR        (0xFB0)
+#define ARM_DEBUG_OFFSET_DBGLSR        (0xFB4)
+#define ARM_DEBUG_OFFSET_DBGAUTHSTATUS (0xFB8)
+#define ARM_DEBUG_OFFSET_DBGDEVID      (0xFC8)
+#define ARM_DEBUG_OFFSET_DBGDEVTYPE    (0xFCC)
+#define ARM_DEBUG_OFFSET_DBGPID0       (0xFD0)
+#define ARM_DEBUG_OFFSET_DBGPID1       (0xFD4)
+#define ARM_DEBUG_OFFSET_DBGPID2       (0xFD8)
+#define ARM_DEBUG_OFFSET_DBGPID3       (0xFDA)
+#define ARM_DEBUG_OFFSET_DBGPID4       (0xFDC)
+#define ARM_DEBUG_OFFSET_DBGCID0       (0xFF0)
+#define ARM_DEBUG_OFFSET_DBGCID1       (0xFF4)
+#define ARM_DEBUG_OFFSET_DBGCID2       (0xFF8)
+#define ARM_DEBUG_OFFSET_DBGCID3       (0xFFA)
+#define ARM_DEBUG_OFFSET_DBGCID4       (0xFFC)
 
 /*
  * Media and VFP Feature Register 1 (MVFR1)
  */
-#define MVFR_ASIMD_HPFP                         0x00100000UL
+#define MVFR_ASIMD_HPFP 0x00100000UL
 
 /*
 * Main ID Register (MIDR)
 */
 #define MIDR_IMP_SHIFT  24
 #define MIDR_IMP_MASK   (0xff << MIDR_IMP_SHIFT)
 
+#ifdef __arm__
+
+/* Macros meant to make __builtin_arm_* functions easier to use. */
+#define MRC_SCTLR    15,0,1,0,0
+#define MCR_SCTLR(x) 15,0,(x),1,0,0
+
+#define MRC_ACTLR    15,0,1,0,1
+#define MCR_ACTLR(x) 15,0,(x),1,0,1
+
+#endif /* __arm__ */
+
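These operand lists match the argument order of clang's __builtin_arm_mrc (coproc, opc1, CRn, CRm, opc2) and __builtin_arm_mcr (coproc, opc1, value, CRn, CRm, opc2), so SCTLR (p15, c1, c0, 0) can be read and written without spelling the encoding at every call site. A hedged arm32-only sketch; the wrapper names are hypothetical:

```c
#include <stdint.h>

static inline uint32_t
read_sctlr_sketch(void)
{
	return __builtin_arm_mrc(MRC_SCTLR);    /* MRC p15, 0, <Rt>, c1, c0, 0 */
}

static inline void
write_sctlr_sketch(uint32_t v)
{
	__builtin_arm_mcr(MCR_SCTLR(v));        /* MCR p15, 0, <Rt>, c1, c0, 0 */
}
```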
 #endif /* _ARM_PROC_REG_H_ */
index 3f4d5c91a9fa2a07ca231750d84a290d94f1e843..b0dd0d928e7b674a48afee50d736f7a3c86ac201 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 #ifdef MACH_KERNEL_PRIVATE
 
-typedef uint32_t hw_lock_bit_t;
-
-#if LOCK_STATS
-extern void     hw_lock_bit(
-       hw_lock_bit_t *,
-       unsigned int,
-       lck_grp_t*);
-
-extern void     hw_lock_bit_nopreempt(
-       hw_lock_bit_t *,
-       unsigned int,
-       lck_grp_t*);
-
-extern unsigned int hw_lock_bit_try(
-       hw_lock_bit_t *,
-       unsigned int,
-       lck_grp_t*);
-
-extern unsigned int hw_lock_bit_to(
-       hw_lock_bit_t *,
-       unsigned int,
-       uint32_t,
-       lck_grp_t*);
-
-#else
-extern void     hw_lock_bit(
-       hw_lock_bit_t *,
-       unsigned int);
-#define hw_lock_bit(lck, bit, grp) hw_lock_bit(lck, bit)
-
-extern void     hw_lock_bit_nopreempt(
-       hw_lock_bit_t *,
-       unsigned int);
-#define hw_lock_bit_nopreempt(lck, bit, grp) hw_lock_bit_nopreempt(lck, bit)
-
-extern unsigned int hw_lock_bit_try(
-       hw_lock_bit_t *,
-       unsigned int);
-#define hw_lock_bit_try(lck, bit, grp) hw_lock_bit_try(lck, bit)
-
-extern unsigned int hw_lock_bit_to(
-       hw_lock_bit_t *,
-       unsigned int,
-       uint32_t);
-#define hw_lock_bit_to(lck, bit, timeout, grp) hw_lock_bit_to(lck, bit, timeout)
-
-#endif /* LOCK_STATS */
-
-extern void     hw_unlock_bit(
-       hw_lock_bit_t *,
-       unsigned int);
-
-extern void     hw_unlock_bit_nopreempt(
-       hw_lock_bit_t *,
-       unsigned int);
-
-#define hw_lock_bit_held(l, b) (((*(l))&(1<<b))!=0)
-
-
 extern uint32_t LockTimeOut;                    /* Number of hardware ticks of a lock timeout */
 extern uint32_t LockTimeOutUsec;                /* Number of microseconds for lock timeout */
 
-/*
- * USLOCK_DEBUG is broken on ARM and has been disabled.
- * There are no callers to any of the usld_lock functions and data structures
- * don't match between between usimple_lock_data_t and lck_spin_t
- */
-
-/*
- #if MACH_LDEBUG
- #define USLOCK_DEBUG 1
- #else
- #define USLOCK_DEBUG 0
- #endif
- */
-
-#if     !USLOCK_DEBUG
-
 typedef lck_spin_t usimple_lock_data_t, *usimple_lock_t;
-
-#else
-
-typedef struct uslock_debug {
-       void                    *lock_pc;       /* pc where lock operation began    */
-       void                    *lock_thread;   /* thread that acquired lock */
-       unsigned long   duration[2];
-       unsigned short  state;
-       unsigned char   lock_cpu;
-       void                    *unlock_thread; /* last thread to release lock */
-       unsigned char   unlock_cpu;
-       void                    *unlock_pc;     /* pc where lock operation ended    */
-} uslock_debug;
-
-typedef struct {
-       hw_lock_data_t  interlock;      /* must be first... see lock.c */
-       unsigned short  lock_type;      /* must be second... see lock.c */
-#define USLOCK_TAG      0x5353
-       uslock_debug    debug;
-} usimple_lock_data_t, *usimple_lock_t;
-
-#endif  /* USLOCK_DEBUG */
-
-#else
+#else /* MACH_KERNEL_PRIVATE */
 
 #if defined(__arm__)
 typedef struct slock {
@@ -205,7 +107,7 @@ typedef usimple_lock_data_t     *simple_lock_t;
 typedef usimple_lock_data_t     simple_lock_data_t;
 
 #define decl_simple_lock_data(class, name) \
-       class   simple_lock_data_t      name;
+       class   simple_lock_data_t      name
 
 #endif  /* !defined(decl_simple_lock_data) */
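With the trailing semicolon removed from decl_simple_lock_data, each use site supplies its own, so the expansion no longer produces a stray empty declaration after the variable. Illustrative use (not from the diff):

```c
/* Expands to: static simple_lock_data_t example_lock; */
decl_simple_lock_data(static, example_lock);
```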
 
index ced8c0d3eb9c1474a6ef32f2fb6a3338b3e41eb7..2ecd28c66904131e3e7bdc1852be0118e4fb3c55 100644 (file)
@@ -244,8 +244,9 @@ doneveqp:
        add             r5, r4, r5, LSL #2                                      // convert to tte pointer
 
        add             r6, r4, PGBYTES * 9                                     // get page table base (past 4 + 4 + 1 tte/pte pages)
+       add             r6, r6, #0xc00                                          // adjust to last 1MB section
        mov             r7, #(ARM_TTE_TABLE_MASK & 0xFFFF)      // ARM_TTE_TABLE_MASK low halfword
-       movt    r7, #(ARM_TTE_TABLE_MASK >> 16)         // ARM_TTE_TABLE_MASK top halfword 
+       movt            r7, #(ARM_TTE_TABLE_MASK >> 16)         // ARM_TTE_TABLE_MASK top halfword 
        and             r11, r6, r7                                                     // apply mask
        orr             r11, r11, #ARM_TTE_TYPE_TABLE           // mark it as a coarse page table
        str             r11, [r5]                                                       // store tte entry for page table
index 8fffe7c1ca05e0912e980b784c7cc17a502c9101..bdfcf5a6ba531e72109febe813d29e58ffb6d4ce 100644 (file)
@@ -63,10 +63,11 @@ void
 /* __private_extern__ */
 unsigned int    _MachineStateCount[] = {
         /* FLAVOR_LIST */ 0,
-       ARM_THREAD_STATE_COUNT,
-       ARM_VFP_STATE_COUNT,
-       ARM_EXCEPTION_STATE_COUNT,
-       ARM_DEBUG_STATE_COUNT
+       [ARM_THREAD_STATE]    = ARM_THREAD_STATE_COUNT,
+       [ARM_VFP_STATE]       = ARM_VFP_STATE_COUNT,
+       [ARM_EXCEPTION_STATE] = ARM_EXCEPTION_STATE_COUNT,
+       [ARM_DEBUG_STATE]     = ARM_DEBUG_STATE_COUNT,
+       [ARM_PAGEIN_STATE]    = ARM_PAGEIN_STATE_COUNT,
 };
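Switching _MachineStateCount to C99 designated initializers ties each count to the flavor constant that indexes it, so entries stay correct even if the constants are renumbered or sparse (as the newly added ARM_PAGEIN_STATE is here). A self-contained illustration with made-up names and values:

```c
/* Illustrative only: gaps between designators are zero-filled. */
enum { FLAVOR_A = 1, FLAVOR_B = 4 };

static const unsigned int counts[] = {
	[FLAVOR_A] = 17,        /* index 1 */
	[FLAVOR_B] = 9,         /* index 4; indices 2-3 default to 0 */
};
```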
 
 extern zone_t ads_zone;
@@ -139,6 +140,18 @@ machine_thread_get_state(
                *count = 4;
                break;
 
+       case THREAD_STATE_FLAVOR_LIST_10_15:
+               if (*count < 5)
+                       return (KERN_INVALID_ARGUMENT);
+
+               tstate[0] = ARM_THREAD_STATE;
+               tstate[1] = ARM_VFP_STATE;
+               tstate[2] = ARM_EXCEPTION_STATE;
+               tstate[3] = ARM_DEBUG_STATE;
+               tstate[4] = ARM_PAGEIN_STATE;
+               *count = 5;
+               break;
+
        case ARM_THREAD_STATE:{
                        struct arm_thread_state *state;
                        struct arm_saved_state *saved_state;
@@ -237,6 +250,20 @@ machine_thread_get_state(
                         break;
                }
 
+       case ARM_PAGEIN_STATE:{
+               arm_pagein_state_t *state;
+
+               if (*count < ARM_PAGEIN_STATE_COUNT) {
+                       return (KERN_INVALID_ARGUMENT);
+               }
+                       
+               state = (arm_pagein_state_t *)tstate;
+               state->__pagein_error = thread->t_pagein_error;
+
+               *count = ARM_PAGEIN_STATE_COUNT;
+               break;
+       }
+
        default:
                return (KERN_INVALID_ARGUMENT);
        }
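From user space the new flavor is retrieved like any other thread state. A hedged sketch, assuming ARM_PAGEIN_STATE, ARM_PAGEIN_STATE_COUNT, and arm_pagein_state_t are exported through the SDK's <mach/thread_status.h>; the wrapper is hypothetical:

```c
#include <mach/mach.h>

kern_return_t
get_pagein_error_sketch(thread_act_t thread, int *error_out)
{
	arm_pagein_state_t state;
	mach_msg_type_number_t count = ARM_PAGEIN_STATE_COUNT;
	kern_return_t kr;

	kr = thread_get_state(thread, ARM_PAGEIN_STATE,
	    (thread_state_t)&state, &count);
	if (kr == KERN_SUCCESS) {
		*error_out = state.__pagein_error;
	}
	return kr;
}
```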
@@ -456,17 +483,30 @@ machine_thread_set_state(
        return (KERN_SUCCESS);
 }
 
+mach_vm_address_t
+machine_thread_pc(thread_t thread)
+{
+       struct arm_saved_state *ss = get_user_regs(thread);
+       return (mach_vm_address_t)get_saved_state_pc(ss);
+}
+
+void
+machine_thread_reset_pc(thread_t thread, mach_vm_address_t pc)
+{
+       set_saved_state_pc(get_user_regs(thread), (register_t)pc);
+}
+
 /*
  * Routine:    machine_thread_state_initialize
  *
  */
 kern_return_t
 machine_thread_state_initialize(
-                               thread_t thread)
+       thread_t thread)
 {
        struct arm_saved_state *savestate;
 
-       savestate = (struct arm_saved_state *) & thread->machine.PcbData;
+       savestate = (struct arm_saved_state *) &thread->machine.PcbData;
        bzero((char *) savestate, sizeof(struct arm_saved_state));
        savestate->cpsr = PSR_USERDFLT;
 
index 2558ed3bb6624cc19a3c0d6ab545e7fa530129dd..7bdcec8af6874a8d0acd4194f3c968248387934a 100644 (file)
  * Machine dependant task fields
  */
 
+#if defined(HAS_APPLE_PAC)
+#define MACHINE_TASK \
+       void*                   task_debug; \
+       uint64_t rop_pid; \
+       boolean_t disable_user_jop;
+#else
 #define MACHINE_TASK \
        void*                   task_debug;
+#endif
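MACHINE_TASK is spliced into struct task by the machine-independent task definition; on HAS_APPLE_PAC targets it now carries a per-task return-oriented-programming (ROP) signing diversifier and a flag to opt the task out of user JOP keys. An illustrative expansion (the real struct lives in osfmk/kern/task.h):

```c
/* Illustrative only: the fields the PAC variant contributes. */
struct task_md_fields_sketch {
	void      *task_debug;          /* debug state, as before */
	uint64_t   rop_pid;             /* per-task PAC diversifier for return addresses */
	boolean_t  disable_user_jop;    /* opt this task out of user JOP keys */
};
```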
index 46a603dcce9d51837aea671f5b368f95a3369544..f17ae451dad75e997cdf1bc7250f047be86013b4 100644 (file)
 #include <mach/arm/vm_types.h>
 #include <mach/thread_status.h>
 
-#ifdef  MACH_KERNEL_PRIVATE
+#ifdef MACH_KERNEL_PRIVATE
 #include <arm/cpu_data.h>
 #include <arm/proc_reg.h>
 #endif
 
 #if __ARM_VFP__
 
-#define VFPSAVE_ALIGN   16
-#define VFPSAVE_ATTRIB  __attribute__ ((aligned (VFPSAVE_ALIGN)))
-#define THREAD_ALIGN    VFPSAVE_ALIGN
+#define VFPSAVE_ALIGN  16
+#define VFPSAVE_ATTRIB __attribute__((aligned (VFPSAVE_ALIGN)))
+#define THREAD_ALIGN   VFPSAVE_ALIGN
 
 /*
  * vector floating point saved state
  */
 struct arm_vfpsaved_state {
-       uint32_t    r[64];
-       uint32_t    fpscr;
-       uint32_t    fpexc;
+       uint32_t r[64];
+       uint32_t fpscr;
+       uint32_t fpexc;
 };
 #endif
 
 struct perfcontrol_state {
-       uint64_t        opaque[8] __attribute__((aligned(8)));
+       uint64_t opaque[8] __attribute__((aligned(8)));
 };
 
 /*
@@ -94,7 +94,7 @@ struct perfcontrol_state {
  */
 extern unsigned int _MachineStateCount[];
 
-#ifdef  MACH_KERNEL_PRIVATE
+#ifdef MACH_KERNEL_PRIVATE
 #if __arm64__
 typedef arm_context_t machine_thread_kernel_state;
 #else
@@ -104,104 +104,82 @@ typedef struct arm_saved_state machine_thread_kernel_state;
 
 struct machine_thread {
 #if __arm64__
-       arm_context_t                           *contextData;                           /* allocated user context */
-       arm_saved_state_t                       *upcb;                                  /* pointer to user GPR state */
-       arm_neon_saved_state_t                  *uNeon;                                 /* pointer to user VFP state */
+       arm_context_t *           contextData;             /* allocated user context */
+       arm_saved_state_t *       upcb;                    /* pointer to user GPR state */
+       arm_neon_saved_state_t *  uNeon;                   /* pointer to user VFP state */
 #elif __arm__
-       struct arm_saved_state          PcbData;
+       struct arm_saved_state    PcbData;
 #if __ARM_VFP__
-       struct arm_vfpsaved_state       uVFPdata VFPSAVE_ATTRIB;
-       struct arm_vfpsaved_state       kVFPdata VFPSAVE_ATTRIB;
+       struct arm_vfpsaved_state uVFPdata VFPSAVE_ATTRIB;
+       struct arm_vfpsaved_state kVFPdata VFPSAVE_ATTRIB;
 #endif /* __ARM_VFP__ */
 
 #else
 #error Unknown arch
 #endif
+
 #if __ARM_USER_PROTECT__
-       unsigned int                            uptw_ttc;
-       unsigned int                            uptw_ttb;
-       unsigned int                            kptw_ttb;
-       unsigned int                            asid;
+       unsigned int              uptw_ttc;
+       unsigned int              uptw_ttb;
+       unsigned int              kptw_ttb;
+       unsigned int              asid;
 #endif
 
-       vm_offset_t                             kstackptr;                                      /* top of kernel stack */
-       struct cpu_data                         *CpuDatap;                                      /* current per cpu data */
-       unsigned int                            preemption_count;                       /* preemption count */
+       vm_offset_t               kstackptr;                  /* top of kernel stack */
+#if defined(HAS_APPLE_PAC)
+       uint64_t                  rop_pid;
+       boolean_t                 disable_user_jop;
+#endif
+       struct cpu_data *         CpuDatap;                   /* current per cpu data */
+       unsigned int              preemption_count;           /* preemption count */
 
 #if __ARM_SMP__
 #define MACHINE_THREAD_FLAGS_ON_CPU (0x1)
 
-       uint8_t                                 machine_thread_flags;
+       uint8_t                   machine_thread_flags;
 #endif /* __ARM_SMP__ */
 
-       arm_debug_state_t                       *DebugData;
-       mach_vm_address_t                       cthread_self;                           /* for use of cthread package */
-       mach_vm_address_t                       cthread_data;                           /* for use of cthread package */
+       arm_debug_state_t *       DebugData;
+       mach_vm_address_t         cthread_self;               /* for use of cthread package */
+       mach_vm_address_t         cthread_data;               /* for use of cthread package */
 
-       struct perfcontrol_state        perfctrl_state;
+       struct perfcontrol_state  perfctrl_state;
 #if __arm64__
-       uint64_t                                energy_estimate_nj;
+       uint64_t                  energy_estimate_nj;
 #endif
 
 #if INTERRUPT_MASKED_DEBUG
-       uint64_t                            intmask_timestamp;                  /* timestamp of when interrupts were masked */
+       uint64_t                  intmask_timestamp;          /* timestamp of when interrupts were masked */
 #endif
 };
 #endif
 
-extern struct arm_saved_state           *get_user_regs(thread_t);
-extern struct arm_saved_state           *find_user_regs(thread_t);
-extern struct arm_saved_state           *find_kern_regs(thread_t);
-extern struct arm_vfpsaved_state        *find_user_vfp(thread_t);
+extern struct arm_saved_state *    get_user_regs(thread_t);
+extern struct arm_saved_state *    find_user_regs(thread_t);
+extern struct arm_saved_state *    find_kern_regs(thread_t);
+extern struct arm_vfpsaved_state * find_user_vfp(thread_t);
 #if defined(__arm__)
-extern arm_debug_state_t                        *find_debug_state(thread_t);
+extern arm_debug_state_t *         find_debug_state(thread_t);
 #elif defined(__arm64__)
-extern arm_debug_state32_t                      *find_debug_state32(thread_t);
-extern arm_debug_state64_t                      *find_debug_state64(thread_t);
-extern arm_neon_saved_state_t                   *get_user_neon_regs(thread_t);
+extern arm_debug_state32_t *       find_debug_state32(thread_t);
+extern arm_debug_state64_t *       find_debug_state64(thread_t);
+extern arm_neon_saved_state_t *    get_user_neon_regs(thread_t);
 #else
 #error unknown arch
 #endif
 
 #define FIND_PERFCONTROL_STATE(th) (&th->machine.perfctrl_state)
 
-#ifdef  MACH_KERNEL_PRIVATE
+#ifdef MACH_KERNEL_PRIVATE
 #if __ARM_VFP__
-extern void     vfp_state_initialize(struct arm_vfpsaved_state *vfp_state);
-extern void     vfp_save(struct arm_vfpsaved_state *vfp_ss);
-extern void     vfp_load(struct arm_vfpsaved_state *vfp_ss);
-extern void     toss_live_vfp(void *vfp_fc);
+extern void vfp_state_initialize(struct arm_vfpsaved_state *vfp_state);
+extern void vfp_save(struct arm_vfpsaved_state *vfp_ss);
+extern void vfp_load(struct arm_vfpsaved_state *vfp_ss);
 #endif /* __ARM_VFP__ */
-extern void     arm_debug_set(arm_debug_state_t *debug_state);
+extern void arm_debug_set(arm_debug_state_t *debug_state);
 #if defined(__arm64__)
-extern void     arm_debug_set32(arm_debug_state_t *debug_state);
-extern void     arm_debug_set64(arm_debug_state_t *debug_state);
-
-kern_return_t handle_get_arm_thread_state(
-       thread_state_t tstate,
-       mach_msg_type_number_t * count,
-       const arm_saved_state_t *saved_state);
-kern_return_t handle_get_arm32_thread_state(
-       thread_state_t tstate,
-       mach_msg_type_number_t * count,
-       const arm_saved_state_t *saved_state);
-kern_return_t handle_get_arm64_thread_state(
-       thread_state_t tstate,
-       mach_msg_type_number_t * count,
-       const arm_saved_state_t *saved_state);
-
-kern_return_t handle_set_arm_thread_state(
-       const thread_state_t tstate,
-       mach_msg_type_number_t count,
-       arm_saved_state_t *saved_state);
-kern_return_t handle_set_arm32_thread_state(
-       const thread_state_t tstate,
-       mach_msg_type_number_t count,
-       arm_saved_state_t *saved_state);
-kern_return_t handle_set_arm64_thread_state(
-       const thread_state_t tstate,
-       mach_msg_type_number_t count,
-       arm_saved_state_t *saved_state);
+extern void arm_debug_set32(arm_debug_state_t *debug_state);
+extern void arm_debug_set64(arm_debug_state_t *debug_state);
 #endif
 #endif /* MACH_KERNEL_PRIVATE */
 
@@ -209,17 +187,11 @@ extern void *act_thread_csave(void);
 extern void act_thread_catt(void *ctx);
 extern void act_thread_cfree(void *ctx);
 
-/*
- * Return address of the function that called current function, given
- *     address of the first parameter of current function.
- */
-#define GET_RETURN_PC(addr)     (((vm_offset_t *)0))
 
 /*
- * Defining this indicates that MD code will supply an exception()
- * routine, conformant with kern/exception.c (dependency alert!)
- * but which does wonderfully fast, machine-dependent magic.
+ * Return address of the function that called current function, given
+ * address of the first parameter of current function.
  */
-#define MACHINE_FAST_EXCEPTION 1
+#define GET_RETURN_PC(addr) (__builtin_return_address(0))
 
-#endif  /* _ARM_THREAD_H_ */
+#endif /* _ARM_THREAD_H_ */
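GET_RETURN_PC previously expanded to a null pointer constant; it now ignores its argument and asks the compiler for the caller's return address. A hedged sketch of use (noinline keeps the builtin meaningful):

```c
#include <stdio.h>

__attribute__((noinline))
static void
report_caller(void)
{
	printf("called from %p\n", GET_RETURN_PC(NULL));
}
```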
diff --git a/osfmk/arm/tlb.h b/osfmk/arm/tlb.h
new file mode 100644 (file)
index 0000000..793b50c
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#pragma once
+
+#include <arm/proc_reg.h>
+
+#define tlbi_addr(x) ((((x) >> 12) & TLBI_ADDR_MASK) << TLBI_ADDR_SHIFT)
+#define tlbi_asid(x) (((uintptr_t)(x) & TLBI_ASID_MASK) << TLBI_ASID_SHIFT)
+
+extern void sync_tlb_flush(void);
+extern void flush_mmu_tlb_async(void);
+extern void flush_mmu_tlb(void);
+extern void flush_core_tlb_async(void);
+extern void flush_core_tlb(void);
+extern void flush_mmu_tlb_entry_async(uint32_t);
+extern void flush_mmu_tlb_entry(uint32_t);
+extern void flush_mmu_tlb_entries_async(uint32_t, uint32_t);
+extern void flush_mmu_tlb_entries(uint32_t, uint32_t);
+extern void flush_mmu_tlb_mva_entries_async(uint32_t);
+extern void flush_mmu_tlb_mva_entries(uint32_t);
+extern void flush_mmu_tlb_asid_async(uint32_t);
+extern void flush_mmu_tlb_asid(uint32_t);
+extern void flush_core_tlb_asid_async(uint32_t);
+extern void flush_core_tlb_asid(uint32_t);
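tlbi_addr() and tlbi_asid() compose the operand for an MVA+ASID TLB invalidate: the page number stays in bits [31:12] and the ASID occupies bits [7:0]. A hedged sketch; write_tlbimvais() is a hypothetical stand-in for the actual coprocessor write:

```c
#include <stdint.h>

static inline void
flush_one_entry_sketch(uint32_t va, uint32_t asid)
{
	/* page-aligned MVA in [31:12], ASID in [7:0] */
	uint32_t operand = (uint32_t)(tlbi_addr(va) | tlbi_asid(asid));

	write_tlbimvais(operand);       /* hypothetical sysreg write */
}
```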
index 2605951b2a1460e4c8999262695404d959b01bad..608593c869423594e1893b590bdbe4435f6fc29d 100644 (file)
@@ -102,7 +102,7 @@ perfCallback tempDTraceTrapHook = NULL; /* Pointer to DTrace fbt trap hook routi
 void            sleh_undef(struct arm_saved_state *, struct arm_vfpsaved_state *);
 void            sleh_abort(struct arm_saved_state *, int);
 static kern_return_t sleh_alignment(struct arm_saved_state *);
-static void     panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *regs);
+static void panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *regs);
 
 int             sleh_alignment_count = 0;
 int             trap_on_alignment_fault = 0;
@@ -243,7 +243,7 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u
                         * can see the original state of this thread).
                         */
                        vm_offset_t kstackptr = current_thread()->machine.kstackptr;
-                       *((arm_saved_state_t *) kstackptr) = *regs;
+                       copy_signed_thread_state((arm_saved_state_t *)kstackptr, regs);
 
                        DebuggerCall(exception, regs);
                        (void) ml_set_interrupts_enabled(intr);
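Replacing the raw struct assignment with copy_signed_thread_state() matters on targets with pointer authentication, where integrity-protected fields of the saved state must be re-signed for their new storage location rather than bit-copied; elsewhere it reduces to a plain copy. A hedged sketch of the idea; sign_state_for() is a hypothetical stand-in:

```c
/* Hedged sketch: why a plain assignment is not always enough. */
static void
copy_signed_state_sketch(arm_saved_state_t *dst, const arm_saved_state_t *src)
{
	*dst = *src;            /* copy the raw register file */
	sign_state_for(dst);    /* hypothetical: re-bind any PAC signature to dst */
}
```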
@@ -274,7 +274,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
        int             status;
        int             debug_status = 0;
        int             spsr;
-       int             exc;
+       int             exc = EXC_BAD_ACCESS;
        mach_exception_data_type_t codes[2];
        vm_map_t        map;
        vm_map_address_t vaddr;
@@ -309,7 +309,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
 
        if (ml_at_interrupt_context()) {
 #if CONFIG_DTRACE
-               if (!(thread->options & TH_OPT_DTRACE))
+               if (!(thread->t_dtrace_inprobe))
 #endif /* CONFIG_DTRACE */
                {
                        panic_with_thread_kernel_state("sleh_abort at interrupt context", regs);
@@ -404,7 +404,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
                        (void) ml_set_interrupts_enabled(intr);
                } else if (TEST_FSR_VMFAULT(status)) {
 #if CONFIG_DTRACE
-                       if (thread->options & TH_OPT_DTRACE) {  /* Executing under dtrace_probe? */
+                       if (thread->t_dtrace_inprobe) {  /* Executing under dtrace_probe? */
                                if (dtrace_tally_fault(fault_addr)) { /* Should a fault under dtrace be ignored? */
                                        /* Point to next instruction */
                                        regs->pc += ((regs->cpsr & PSR_TF) && !IS_THUMB32(*((uint16_t*) (regs->pc)))) ? 2 : 4;
@@ -428,7 +428,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
 
                        if (!TEST_FSR_TRANSLATION_FAULT(status)) {
                                /* check to see if it is just a pmap ref/modify fault */
-                               result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE);
+                               result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, (status == FSR_PACCESS), FALSE);
                                if (result == KERN_SUCCESS) {
                                        goto exit;
                                }
@@ -470,22 +470,18 @@ sleh_abort(struct arm_saved_state * regs, int type)
                }
                intr = ml_set_interrupts_enabled(FALSE);
 
-               panic_plain("kernel abort type %d: fault_type=0x%x, fault_addr=0x%x\n"
+               panic_plain("kernel abort type %d at pc 0x%08x, lr 0x%08x: fault_type=0x%x, fault_addr=0x%x\n"
                    "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
                    "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
                    "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
                    "r12:  0x%08x  sp: 0x%08x  lr: 0x%08x  pc: 0x%08x\n"
                    "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n",
-                   type, fault_type, fault_addr,
+                   type, regs->pc, regs->lr, fault_type, fault_addr,
                    regs->r[0], regs->r[1], regs->r[2], regs->r[3],
                    regs->r[4], regs->r[5], regs->r[6], regs->r[7],
                    regs->r[8], regs->r[9], regs->r[10], regs->r[11],
                    regs->r[12], regs->sp, regs->lr, regs->pc,
                    regs->cpsr, regs->fsr, regs->far);
-
-               (void) ml_set_interrupts_enabled(intr);
-
-               goto exit;
        }
        /* Fault in user mode */
 
@@ -493,7 +489,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
                map = thread->map;
 
 #if CONFIG_DTRACE
-               if (thread->options & TH_OPT_DTRACE) {  /* Executing under dtrace_probe? */
+               if (thread->t_dtrace_inprobe) {  /* Executing under dtrace_probe? */
                        if (dtrace_tally_fault(fault_addr)) { /* Should a user mode fault under dtrace be ignored? */
                                if (recover) {
                                        regs->pc = recover;
@@ -519,7 +515,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
 
                if (!TEST_FSR_TRANSLATION_FAULT(status)) {
                        /* check to see if it is just a pmap ref/modify fault */
-                       result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, TRUE);
+                       result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, (status == FSR_PACCESS), TRUE);
                        if (result == KERN_SUCCESS) {
                                goto exception_return;
                        }
@@ -534,22 +530,27 @@ sleh_abort(struct arm_saved_state * regs, int type)
                if (result == KERN_SUCCESS || result == KERN_ABORTED) {
                        goto exception_return;
                }
-               exc = EXC_BAD_ACCESS;
+
+               /*
+                * KERN_FAILURE here means preemption was disabled when we called vm_fault.
+                * That should never happen for a page fault from user space.
+                */
+               if (__improbable(result == KERN_FAILURE)) {
+                       panic("vm_fault() KERN_FAILURE from user fault on thread %p", thread);
+               }
+
                codes[0] = result;
        } else if ((status & FSR_ALIGN_MASK) == FSR_ALIGN) {
                if (sleh_alignment(regs) == KERN_SUCCESS) {
                        goto exception_return;
                }
-               exc = EXC_BAD_ACCESS;
                codes[0] = EXC_ARM_DA_ALIGN;
        } else if (status == FSR_DEBUG) {
                exc = EXC_BREAKPOINT;
                codes[0] = EXC_ARM_DA_DEBUG;
        } else if ((status == FSR_SDOM) || (status == FSR_PDOM)) {
-               exc = EXC_BAD_ACCESS;
-               codes[0] = KERN_INVALID_ADDRESS;
+               panic_with_thread_kernel_state("Unexpected domain fault", regs);
        } else {
-               exc = EXC_BAD_ACCESS;
                codes[0] = KERN_FAILURE;
        }
 
@@ -857,16 +858,17 @@ interrupt_stats(void)
        SCHED_STATS_INTERRUPT(current_processor());
 }
 
+__dead2
 static void
 panic_with_thread_kernel_state(const char *msg, struct arm_saved_state *regs)
 {
-       panic_plain("%s (saved state:%p)\n"
+       panic_plain("%s at pc 0x%08x, lr 0x%08x (saved state:%p)\n"
            "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
            "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
            "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
            "r12:  0x%08x  sp: 0x%08x  lr: 0x%08x  pc: 0x%08x\n"
            "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n",
-           msg, regs,
+           msg, regs->pc, regs->lr, regs,
            regs->r[0], regs->r[1], regs->r[2], regs->r[3],
            regs->r[4], regs->r[5], regs->r[6], regs->r[7],
            regs->r[8], regs->r[9], regs->r[10], regs->r[11],
index d3a07cb1080cf23405ef0f9cb5cc9b9845e145b6..fa179c8b53c0b497e0d72d41935c2415bba2dc89 100644 (file)
         || (((op) & THUMB_SIMD_VFP_MASK3) == THUMB_SIMD_VFP_CODE3))
 
 extern boolean_t arm_force_fast_fault(ppnum_t, vm_prot_t, int, void *);
-extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, boolean_t);
+extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, bool, bool);
 
 /*
  * Determines if the aborted instruction is read or write operation
diff --git a/osfmk/arm/xpr.h b/osfmk/arm/xpr.h
deleted file mode 100644 (file)
index 82904b1..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2007 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-
-/*
- *     Machine dependent module for the XPR tracing facility.
- */
-
-#define XPR_TIMESTAMP   (0)
index f7ec7555e9aa68e490ed35d9475ed01940573200..ec8b119014d377da46e7637b6e119813a816abcd 100644 (file)
@@ -11,7 +11,8 @@ ARM_HEADER_FILES = \
                lowglobals.h            \
                machine_cpuid.h         \
                machine_machdep.h       \
-               proc_reg.h
+               proc_reg.h              \
+               tlb.h                   \
 
 INSTALL_MD_DIR = arm64
 
@@ -23,7 +24,7 @@ INSTALL_KF_MD_LIST = $(ARM_HEADER_FILES)
 
 INSTALL_KF_MD_LCL_LIST = machine_kpc.h machine_remote_time.h monotonic.h pgtrace.h $(ARM_HEADER_FILES)
 
-EXPORT_MD_LIST = machine_cpuid.h machine_kpc.h machine_remote_time.h monotonic.h proc_reg.h pgtrace.h asm.h
+EXPORT_MD_LIST = machine_cpuid.h machine_kpc.h machine_remote_time.h monotonic.h proc_reg.h pgtrace.h asm.h tlb.h
 
 EXPORT_MD_DIR = arm64
 
index bfad29bf552e93bcd5afd0762cab607cb8c5fd77..8f6a0cbb75dc0488f833044380bd5813d7a79fbe 100644 (file)
 #include <mach_kdp.h>
 #include <debug.h>
 
+#include <kern/assert.h>
+#include <kern/misc_protos.h>
+#include <kern/monotonic.h>
 #include <mach/vm_types.h>
 #include <mach/vm_param.h>
-#include <kern/misc_protos.h>
-#include <kern/assert.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 
-#include <arm/atomic.h>
+#include <machine/atomic.h>
 #include <arm64/proc_reg.h>
 #include <arm64/lowglobals.h>
 #include <arm/cpu_data_internal.h>
@@ -118,8 +119,8 @@ SECURITY_READ_ONLY_LATE(unsigned long) gPhysSize;
 SECURITY_READ_ONLY_LATE(unsigned long) gT0Sz = T0SZ_BOOT;
 SECURITY_READ_ONLY_LATE(unsigned long) gT1Sz = T1SZ_BOOT;
 
-/* 23543331 - step 1 of kext / kernel __TEXT and __DATA colocation is to move 
- * all kexts before the kernel.  This is only for arm64 devices and looks 
+/* 23543331 - step 1 of kext / kernel __TEXT and __DATA colocation is to move
+ * all kexts before the kernel.  This is only for arm64 devices and looks
  * something like the following:
  * -- vmaddr order --
  * 0xffffff8004004000 __PRELINK_TEXT
@@ -188,6 +189,7 @@ SECURITY_READ_ONLY_LATE(vm_offset_t)                  segEXTRADATA;
 SECURITY_READ_ONLY_LATE(unsigned long)           segSizeEXTRADATA;
 
 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWESTTEXT;
+SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWEST;
 
 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segTEXTB;
 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeTEXT;
@@ -266,19 +268,13 @@ SECURITY_READ_ONLY_LATE(vm_offset_t)     static_memory_end;
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)    avail_start;
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)    avail_end;
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)    real_avail_end;
+SECURITY_READ_ONLY_LATE(unsigned long)   real_phys_size;
 
 #if __ARM_KERNEL_PROTECT__
 extern void ExceptionVectorsBase;
 extern void ExceptionVectorsEnd;
 #endif /* __ARM_KERNEL_PROTECT__ */
 
-#if defined(KERNEL_INTEGRITY_KTRR)
-#if __ARM64_TWO_LEVEL_PMAP__
-/* We could support this configuration, but it adds memory overhead. */
-#error This configuration is not supported
-#endif
-#endif
-
 typedef struct {
        pmap_paddr_t pa;
        vm_map_address_t va;
@@ -297,6 +293,7 @@ phystokv(pmap_paddr_t pa)
                if ((pa >= ptov_table[i].pa) && (pa < (ptov_table[i].pa + ptov_table[i].len)))
                        return (pa - ptov_table[i].pa + ptov_table[i].va);
        }
+       assertf((pa - gPhysBase) < real_phys_size, "%s: illegal PA: 0x%llx", __func__, (uint64_t)pa);
        return (pa - gPhysBase + gVirtBase);
 }
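
The phystokv() change above layers a bounds assert over a two-step translation: a small table of slid ranges is consulted first, then the fixed linear offset is applied. A minimal sketch of that lookup pattern, with simplified stand-in types in place of the kernel's pmap_paddr_t/vm_map_address_t:

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t paddr_t;                 /* stand-in for pmap_paddr_t     */
    typedef uint64_t vaddr_t;                 /* stand-in for vm_map_address_t */

    typedef struct { paddr_t pa; vaddr_t va; uint64_t len; } ptov_entry_t;

    #define PTOV_TABLE_SIZE 8                 /* illustrative size only        */
    static ptov_entry_t ptov_table[PTOV_TABLE_SIZE];
    static paddr_t gPhysBase;                 /* linear-map physical base      */
    static vaddr_t gVirtBase;                 /* linear-map virtual base       */
    static uint64_t real_phys_size;           /* bound enforced by the assert  */

    static vaddr_t
    phys_to_virt(paddr_t pa)
    {
        /* Ranges that were slid away from the linear map match first. */
        for (unsigned i = 0; (i < PTOV_TABLE_SIZE) && (ptov_table[i].len != 0); i++) {
            if ((pa >= ptov_table[i].pa) &&
                (pa < ptov_table[i].pa + ptov_table[i].len)) {
                return pa - ptov_table[i].pa + ptov_table[i].va;
            }
        }
        /* Fall back to the fixed offset; a PA outside managed physical
         * memory is a caller bug, which the new assertf() flags early. */
        assert((pa - gPhysBase) < real_phys_size);
        return pa - gPhysBase + gVirtBase;
    }
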
 
@@ -315,6 +312,7 @@ phystokv_range(pmap_paddr_t pa, vm_size_t *max_len)
        len = PAGE_SIZE - (pa & PAGE_MASK);
        if (*max_len > len)
                *max_len = len;
+       assertf((pa - gPhysBase) < real_phys_size, "%s: illegal PA: 0x%llx", __func__, (uint64_t)pa);
        return (pa - gPhysBase + gVirtBase);
 }
 
@@ -325,8 +323,7 @@ ml_static_vtop(vm_offset_t va)
                if ((va >= ptov_table[i].va) && (va < (ptov_table[i].va + ptov_table[i].len)))
                        return (va - ptov_table[i].va + ptov_table[i].pa);
        }
-       if (((vm_address_t)(va) - gVirtBase) >= gPhysSize)
-               panic("ml_static_vtop(): illegal VA: %p\n", (void*)va);
+       assertf(((vm_address_t)(va) - gVirtBase) < gPhysSize, "%s: illegal VA: %p", __func__, (void*)va);
        return ((vm_address_t)(va) - gVirtBase + gPhysBase);
 }
 
@@ -446,7 +443,6 @@ void dump_kva_space() {
 
        printf("Root page table: %s\n", root_static ? "Static" : "Dynamic");
 
-#if !__ARM64_TWO_LEVEL_PMAP__
        for(unsigned int i=0; i<TTE_PGENTRIES; i++) {
                pmap_paddr_t cur;
                boolean_t cur_ro;
@@ -471,9 +467,6 @@ void dump_kva_space() {
                tot_rosz += rosz;
                tot_rwsz += rwsz;
        }
-#else
-       dump_kva_l2(kva_base, cpu_tte, 0, &tot_rosz, &tot_rwsz);
-#endif /* !_ARM64_TWO_LEVEL_PMAP__ */
 
        printf("L2 Address space mapped: Static %lluMB Dynamic %lluMB Total %lluMB\n",
          tot_rosz >> 20,
@@ -503,10 +496,8 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte)
        vm_offset_t ptpage = 0;
        tt_entry_t * ttp = root_ttp;
 
-#if !__ARM64_TWO_LEVEL_PMAP__
        tt_entry_t * l1_ttep = NULL;
        tt_entry_t l1_tte = 0;
-#endif
 
        tt_entry_t * l2_ttep = NULL;
        tt_entry_t l2_tte = 0;
@@ -517,7 +508,6 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte)
         * Walk the target page table to find the PTE for the given virtual
         * address.  Allocate any page table pages needed to do this.
         */
-#if !__ARM64_TWO_LEVEL_PMAP__
        l1_ttep = ttp + ((vaddr & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
        l1_tte = *l1_ttep;
 
@@ -532,7 +522,6 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte)
        }
 
        ttp = (tt_entry_t *)phystokv(l1_tte & ARM_TTE_TABLE_MASK);
-#endif
 
        l2_ttep = ttp + ((vaddr & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
        l2_tte = *l2_ttep;
@@ -566,6 +555,10 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte)
        *ptep = pte;
 }
 
+#endif // __ARM_KERNEL_PROTECT__
+
+#if __ARM_KERNEL_PROTECT__
+
 /*
  * arm_vm_kernel_el0_map:
  *   vaddr: The target virtual address
@@ -611,7 +604,6 @@ arm_vm_kernel_pte(vm_offset_t vaddr)
        pt_entry_t * ptep = NULL;
        pt_entry_t pte = 0;
 
-#if !__ARM64_TWO_LEVEL_PMAP__
        ttep = ttp + ((vaddr & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
        tte = *ttep;
 
@@ -627,7 +619,6 @@ arm_vm_kernel_pte(vm_offset_t vaddr)
        }
 
        ttp = (tt_entry_t *)phystokv(tte & ARM_TTE_TABLE_MASK);
-#endif
        ttep = ttp + ((vaddr & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
        tte = *ttep;
 
@@ -736,11 +727,9 @@ static void arm_replace_identity_map(boot_args * args)
        vm_offset_t addr;
        pmap_paddr_t paddr;
 
-#if !__ARM64_TWO_LEVEL_PMAP__
        pmap_paddr_t l1_ptp_phys = 0;
        tt_entry_t *l1_ptp_virt = NULL;
        tt_entry_t *tte1 = NULL;
-#endif
        pmap_paddr_t l2_ptp_phys = 0;
        tt_entry_t *l2_ptp_virt = NULL;
        tt_entry_t *tte2 = NULL;
@@ -795,18 +784,17 @@ tt_entry_t *arm_kva_to_tte(vm_offset_t);
 tt_entry_t *
 arm_kva_to_tte(vm_offset_t va)
 {
-#if __ARM64_TWO_LEVEL_PMAP__
-       tt_entry_t *tte2;
-       tte2 = cpu_tte + L2_TABLE_INDEX(va);
-#else
        tt_entry_t *tte1, *tte2;
        tte1 = cpu_tte + L1_TABLE_INDEX(va);
        tte2 = L2_TABLE_VA(tte1) + L2_TABLE_INDEX(va);
-#endif
+
        return tte2;
 }
 
 
+#define ARM64_GRANULE_ALLOW_BLOCK (1 << 0)
+#define ARM64_GRANULE_ALLOW_HINT (1 << 1)
+
 /*
  * arm_vm_page_granular_helper updates protections at the L3 level.  It will (if
 * necessary) allocate a page for the L3 table and update the corresponding L2
@@ -815,13 +803,13 @@ arm_kva_to_tte(vm_offset_t va)
  * not be invoked from a context that does not do L2 iteration separately (basically,
  * don't call this except from arm_vm_page_granular_prot).
  *
- * bool force_page_granule: true: will force page level mappings for this entry
- *                        false: will try to use block level mappings
+ * unsigned granule: 0 => force to page granule, or a combination of
+ * ARM64_GRANULE_* flags declared above.
  */
 
 static void
 arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, pmap_paddr_t pa_offset,
-                            int pte_prot_APX, int pte_prot_XN, bool force_page_granule,
+                            int pte_prot_APX, int pte_prot_XN, unsigned granule,
                             pt_entry_t **deferred_pte, pt_entry_t *deferred_ptmp)
 {
        if (va & ARM_TT_L2_OFFMASK) { /* ragged edge hanging over a ARM_TT_L2_SIZE  boundary */
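
The two hunks above replace the old force_page_granule boolean with a small flag set, so callers can allow 2MB block mappings and the contiguous-hint bit independently. A compact sketch of the three resulting policies (protect_region and the flag values mirror the definitions above but are stand-ins, not the kernel functions):

    #define GRANULE_ALLOW_BLOCK (1 << 0)   /* may map with 2MB L2 blocks       */
    #define GRANULE_ALLOW_HINT  (1 << 1)   /* may set the contiguous-hint bit  */

    static void
    protect_region(unsigned long start, unsigned long size, unsigned granule)
    {
        if (!(granule & GRANULE_ALLOW_BLOCK)) {
            /* granule == 0: force page-size (L3) mappings throughout,
             * e.g. for guard pages that must be exactly one page wide. */
        } else {
            /* Aligned middles may use L2 block entries ... */
            if (granule & GRANULE_ALLOW_HINT) {
                /* ... and runs of aligned PTEs may additionally set
                 * ARM_PTE_HINT so the TLB can coalesce them, but only on
                 * mappings that are not yet live (see the assert below). */
            }
        }
        (void)start; (void)size;
    }
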
@@ -886,9 +874,13 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
                                 * be fully covered by this mapping request.
                                 */
                                if ((va >= round_up_pte_hint_address(start)) && (round_up_pte_hint_address(va + 1) <= _end) &&
-                                   !force_page_granule && use_contiguous_hint) {
+                                   (granule & ARM64_GRANULE_ALLOW_HINT) && use_contiguous_hint) {
                                        assert((va & ((1 << ARM_PTE_HINT_ADDR_SHIFT) - 1)) == ((pa & ((1 << ARM_PTE_HINT_ADDR_SHIFT) - 1))));
                                        ptmp |= ARM_PTE_HINT;
+                                       /* Do not attempt to reapply the hint bit to an already-active mapping.
+                                        * This very likely means we're attempting to change attributes on an already-active mapping,
+                                        * which violates the requirement of the hint bit. */
+                                       assert(!kva_active || (ppte[i] == ARM_PTE_TYPE_FAULT));
                                }
                                /*
                                 * Do not change the contiguous bit on an active mapping.  Even in a single-threaded
@@ -899,18 +891,18 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
                                 */
                                assert(!kva_active || (ppte[i] == ARM_PTE_TYPE_FAULT) || ((ppte[i] & ARM_PTE_HINT) == (ptmp & ARM_PTE_HINT)));
 
-                               /* 
+                               /*
                                 * If we reach an entry that maps the current pte page, delay updating it until the very end.
                                 * Otherwise we might end up making the PTE page read-only, leading to a fault later on in
                                 * this function if we manage to outrun the TLB.  This can happen on KTRR-enabled devices when
                                 * marking segDATACONST read-only.  Mappings for this region may straddle a PT page boundary,
                                 * so we must also defer assignment of the following PTE.  We will assume that if the region
                                 * were to require one or more full L3 pages, it would instead use L2 blocks where possible,
-                                * therefore only requiring at most one L3 page at the beginning and one at the end. 
+                                * therefore only requiring at most one L3 page at the beginning and one at the end.
                                 */
                                if (kva_active && ((pt_entry_t*)(phystokv(pa)) == ppte)) {
-                                       assert(recursive_pte == NULL);  
-                                       assert(!force_page_granule);
+                                       assert(recursive_pte == NULL);
+                                       assert(granule & ARM64_GRANULE_ALLOW_BLOCK);
                                        recursive_pte = &ppte[i];
                                        recursive_ptmp = ptmp;
                                } else if ((deferred_pte != NULL) && (&ppte[i] == &recursive_pte[1])) {
@@ -940,7 +932,7 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
 static void
 arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa_offset,
                           int tte_prot_XN, int pte_prot_APX, int pte_prot_XN,
-                          bool force_page_granule)
+                          unsigned granule)
 {
        pt_entry_t *deferred_pte = NULL, deferred_ptmp = 0;
        vm_offset_t _end = start + size;
@@ -950,19 +942,19 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa
                return;
 
        if (align_start > _end) {
-               arm_vm_page_granular_helper(start, _end, start, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, NULL, NULL);
+               arm_vm_page_granular_helper(start, _end, start, pa_offset, pte_prot_APX, pte_prot_XN, granule, NULL, NULL);
                return;
        }
 
-       arm_vm_page_granular_helper(start, align_start, start, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, &deferred_pte, &deferred_ptmp);
+       arm_vm_page_granular_helper(start, align_start, start, pa_offset, pte_prot_APX, pte_prot_XN, granule, &deferred_pte, &deferred_ptmp);
 
        while ((_end - align_start) >= ARM_TT_L2_SIZE) {
-               if (force_page_granule)
+               if (!(granule & ARM64_GRANULE_ALLOW_BLOCK)) {
                        arm_vm_page_granular_helper(align_start, align_start+ARM_TT_L2_SIZE, align_start + 1, pa_offset,
-                                                   pte_prot_APX, pte_prot_XN, force_page_granule, NULL, NULL);
-               else {
+                                                   pte_prot_APX, pte_prot_XN, granule, NULL, NULL);
+               } else {
                        pmap_paddr_t pa = align_start - gVirtBase + gPhysBase - pa_offset;
-                       assert((pa & ARM_TT_L2_OFFMASK) == 0); 
+                       assert((pa & ARM_TT_L2_OFFMASK) == 0);
                        tt_entry_t *tte2;
                        tt_entry_t tmplate;
 
@@ -973,7 +965,7 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa
                                        | ARM_TTE_VALID | ARM_TTE_BLOCK_AF | ARM_TTE_BLOCK_NX
                                        | ARM_TTE_BLOCK_AP(pte_prot_APX) | ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY)
                                        | ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
-                               
+
 #if __ARM_KERNEL_PROTECT__
                                tmplate = tmplate | ARM_TTE_BLOCK_NG;
 #endif /* __ARM_KERNEL_PROTECT__ */
@@ -987,34 +979,28 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa
        }
 
        if (align_start < _end)
-               arm_vm_page_granular_helper(align_start, _end, _end, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, &deferred_pte, &deferred_ptmp);
+               arm_vm_page_granular_helper(align_start, _end, _end, pa_offset, pte_prot_APX, pte_prot_XN, granule, &deferred_pte, &deferred_ptmp);
 
        if (deferred_pte != NULL)
                *deferred_pte = deferred_ptmp;
 }
 
 static inline void
-arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, bool force_page_granule)
-{
-       arm_vm_page_granular_prot(start, size, 0, 1, AP_RONA, 1, force_page_granule);
-}
-
-static inline void
-arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, bool force_page_granule)
+arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, unsigned granule)
 {
-       arm_vm_page_granular_prot(start, size, 0, 0, AP_RONA, 0, force_page_granule);
+       arm_vm_page_granular_prot(start, size, 0, 1, AP_RONA, 1, granule);
 }
 
 static inline void
-arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, bool force_page_granule)
+arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, unsigned granule)
 {
-       arm_vm_page_granular_prot(start, size, 0, 1, AP_RWNA, 1, force_page_granule);
+       arm_vm_page_granular_prot(start, size, 0, 0, AP_RONA, 0, granule);
 }
 
 static inline void
-arm_vm_page_granular_RWX(vm_offset_t start, unsigned long size, bool force_page_granule)
+arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, unsigned granule)
 {
-       arm_vm_page_granular_prot(start, size, 0, 0, AP_RWNA, 0, force_page_granule);
+       arm_vm_page_granular_prot(start, size, 0, 1, AP_RWNA, 1, granule);
 }
 
 /* used in the chosen/memory-map node, populated by iBoot. */
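
Each renamed wrapper above still reduces to a single arm_vm_page_granular_prot() call; only the (tte XN, AP, pte XN) triple differs. A condensed view of that mapping, assuming AP_RONA/AP_RWNA mean kernel read-only vs. kernel read-write with no user access (the encodings below are placeholders, not the kernel constants):

    typedef enum { PROT_RNX, PROT_ROX, PROT_RWNX } region_prot_t;

    struct prot_bits {
        int tte_xn;   /* execute-never at the L2 (block) level */
        int ap;       /* access permissions: 0 = RO, 1 = RW    */
        int pte_xn;   /* execute-never at the L3 (page) level  */
    };

    static struct prot_bits
    prot_to_bits(region_prot_t p)
    {
        switch (p) {
        case PROT_RNX:  return (struct prot_bits){ 1, 0, 1 };  /* read-only,  no-exec */
        case PROT_ROX:  return (struct prot_bits){ 0, 0, 0 };  /* read-only,  exec    */
        case PROT_RWNX:
        default:        return (struct prot_bits){ 1, 1, 1 };  /* read-write, no-exec */
        }
    }
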
@@ -1023,7 +1009,6 @@ typedef struct MemoryMapFileInfo {
        size_t length;
 } MemoryMapFileInfo;
 
-
 void
 arm_vm_prot_init(boot_args * args)
 {
@@ -1037,6 +1022,8 @@ arm_vm_prot_init(boot_args * args)
        segEXTRADATA = segLOWESTTEXT;
        segSizeEXTRADATA = 0;
 
+       segLOWEST = segLOWESTTEXT;
+
        DTEntry memory_map;
        MemoryMapFileInfo *trustCacheRange;
        unsigned int trustCacheRangeSize;
@@ -1052,24 +1039,38 @@ arm_vm_prot_init(boot_args * args)
                segEXTRADATA = phystokv(trustCacheRange->paddr);
                segSizeEXTRADATA = trustCacheRange->length;
 
-               arm_vm_page_granular_RNX(segEXTRADATA, segSizeEXTRADATA, FALSE);
+               if (segEXTRADATA <= segLOWEST) {
+                       segLOWEST = segEXTRADATA;
+               }
+#if !(DEBUG || DEVELOPMENT)
+
+
+               else {
+                       panic("EXTRADATA is in an unexpected place: %#lx > %#lx", segEXTRADATA, segLOWEST);
+               }
+#endif /* !(DEBUG || DEVELOPMENT) */
+
+               arm_vm_page_granular_RNX(segEXTRADATA, segSizeEXTRADATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
+
        }
 
        /* Map coalesced kext TEXT segment RWNX for now */
-       arm_vm_page_granular_RWNX(segPRELINKTEXTB, segSizePRELINKTEXT, FALSE); // Refined in OSKext::readPrelinkedExtensions
+       arm_vm_page_granular_RWNX(segPRELINKTEXTB, segSizePRELINKTEXT, ARM64_GRANULE_ALLOW_BLOCK); // Refined in OSKext::readPrelinkedExtensions
 
        /* Map coalesced kext DATA_CONST segment RWNX (could be empty) */
-       arm_vm_page_granular_RWNX(segPLKDATACONSTB, segSizePLKDATACONST, FALSE); // Refined in OSKext::readPrelinkedExtensions
+       arm_vm_page_granular_RWNX(segPLKDATACONSTB, segSizePLKDATACONST, ARM64_GRANULE_ALLOW_BLOCK); // Refined in OSKext::readPrelinkedExtensions
 
-       /* Map coalesced kext TEXT_EXEC segment RWX (could be empty) */
-       arm_vm_page_granular_ROX(segPLKTEXTEXECB, segSizePLKTEXTEXEC, FALSE); // Refined in OSKext::readPrelinkedExtensions
+       /* Map coalesced kext TEXT_EXEC segment RX (could be empty) */
+       arm_vm_page_granular_ROX(segPLKTEXTEXECB, segSizePLKTEXTEXEC, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Refined in OSKext::readPrelinkedExtensions
 
        /* if new segments not present, set space between PRELINK_TEXT and xnu TEXT to RWNX
         * otherwise we no longer expect any space between the coalesced kext read only segments and xnu rosegments
         */
        if (!segSizePLKDATACONST && !segSizePLKTEXTEXEC) {
-               if (segSizePRELINKTEXT)
-                       arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT), FALSE);
+               if (segSizePRELINKTEXT) {
+                       arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT),
+                           ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
+               }
        } else {
                /*
                 * If we have the new segments, we should still protect the gap between kext
@@ -1077,7 +1078,8 @@ arm_vm_prot_init(boot_args * args)
                 * exists.
                 */
                if ((segPLKDATACONSTB + segSizePLKDATACONST) < segTEXTB) {
-                       arm_vm_page_granular_RWNX(segPLKDATACONSTB + segSizePLKDATACONST, segTEXTB - (segPLKDATACONSTB + segSizePLKDATACONST), FALSE);
+                       arm_vm_page_granular_RWNX(segPLKDATACONSTB + segSizePLKDATACONST, segTEXTB - (segPLKDATACONSTB + segSizePLKDATACONST),
+                           ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
                }
        }
 
@@ -1088,39 +1090,38 @@ arm_vm_prot_init(boot_args * args)
         *
         * TEXT segment contains mach headers and other non-executable data. This will become RONX later.
         */
-       arm_vm_page_granular_RNX(segTEXTB, segSizeTEXT, FALSE);
+       arm_vm_page_granular_RNX(segTEXTB, segSizeTEXT, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
 
        /* Can DATACONST start out and stay RNX?
         * NO, stuff in this segment gets modified during startup (viz. mac_policy_init()/mac_policy_list)
         * Make RNX in prot_finalize
         */
-       arm_vm_page_granular_RWNX(segDATACONSTB, segSizeDATACONST, FALSE);
+       arm_vm_page_granular_RWNX(segDATACONSTB, segSizeDATACONST, ARM64_GRANULE_ALLOW_BLOCK);
 
-       /* TEXTEXEC contains read only executable code: becomes ROX in prot_finalize */
-       arm_vm_page_granular_RWX(segTEXTEXECB, segSizeTEXTEXEC, FALSE);
+       arm_vm_page_granular_ROX(segTEXTEXECB, segSizeTEXTEXEC, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
 
 
        /* DATA segment will remain RWNX */
-       arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, FALSE);
+       arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
 
-       arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, TRUE);
-       arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, TRUE);
-       arm_vm_page_granular_RNX((vm_offset_t)&intstack_high_guard, PAGE_MAX_SIZE, TRUE);
-       arm_vm_page_granular_RNX((vm_offset_t)&excepstack_high_guard, PAGE_MAX_SIZE, TRUE);
+       arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, 0);
+       arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, 0);
+       arm_vm_page_granular_RNX((vm_offset_t)&intstack_high_guard, PAGE_MAX_SIZE, 0);
+       arm_vm_page_granular_RNX((vm_offset_t)&excepstack_high_guard, PAGE_MAX_SIZE, 0);
 
-       arm_vm_page_granular_ROX(segKLDB, segSizeKLD, FALSE);
-       arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, FALSE);
-       arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, FALSE); // Coalesced kext LINKEDIT segment
-       arm_vm_page_granular_ROX(segLASTB, segSizeLAST, FALSE); // __LAST may be empty, but we cannot assume this
+       arm_vm_page_granular_ROX(segKLDB, segSizeKLD, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
+       arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
+       arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Coalesced kext LINKEDIT segment
+       arm_vm_page_granular_ROX(segLASTB, segSizeLAST, ARM64_GRANULE_ALLOW_BLOCK); // __LAST may be empty, but we cannot assume this
 
-       arm_vm_page_granular_RWNX(segPRELINKDATAB, segSizePRELINKDATA, FALSE); // Prelink __DATA for kexts (RW data)
+       arm_vm_page_granular_RWNX(segPRELINKDATAB, segSizePRELINKDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Prelink __DATA for kexts (RW data)
 
        if (segSizePLKLLVMCOV > 0)
-               arm_vm_page_granular_RWNX(segPLKLLVMCOVB, segSizePLKLLVMCOV, FALSE); // LLVM code coverage data
+               arm_vm_page_granular_RWNX(segPLKLLVMCOVB, segSizePLKLLVMCOV, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // LLVM code coverage data
 
-       arm_vm_page_granular_RWNX(segPRELINKINFOB, segSizePRELINKINFO, FALSE); /* PreLinkInfoDictionary */
+       arm_vm_page_granular_RWNX(segPRELINKINFOB, segSizePRELINKINFO, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); /* PreLinkInfoDictionary */
 
-       arm_vm_page_granular_RNX(phystokv(args->topOfKernelData), BOOTSTRAP_TABLE_SIZE, FALSE); // Boot page tables; they should not be mutable.
+       arm_vm_page_granular_RNX(phystokv(args->topOfKernelData), BOOTSTRAP_TABLE_SIZE, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Boot page tables; they should not be mutable.
 }
 
 /*
@@ -1152,7 +1153,7 @@ SECURITY_READ_ONLY_LATE(static unsigned int) ptov_index = 0;
 #define ROUND_TWIG(addr) (((addr) + ARM_TT_TWIG_OFFMASK) & ~(ARM_TT_TWIG_OFFMASK))
 
 static void
-arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap_base, vm_map_address_t orig_va, vm_size_t len, int pte_prot_APX, boolean_t force_page_granule)
+arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap_base, vm_map_address_t orig_va, vm_size_t len, int pte_prot_APX, unsigned granule)
 {
        pmap_paddr_t pa_offset;
 
@@ -1163,7 +1164,7 @@ arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap
                temp_ptov_table[ptov_index].va = physmap_base;
        else
                temp_ptov_table[ptov_index].va = temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len;
-       if (!force_page_granule) {
+       if (granule & ARM64_GRANULE_ALLOW_BLOCK) {
                vm_map_address_t orig_offset = temp_ptov_table[ptov_index].pa & ARM_TT_TWIG_OFFMASK;
                vm_map_address_t new_offset = temp_ptov_table[ptov_index].va & ARM_TT_TWIG_OFFMASK;
                if (new_offset < orig_offset)
@@ -1173,8 +1174,8 @@ arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap
        }
        assert((temp_ptov_table[ptov_index].va & ARM_PGMASK) == 0);
        temp_ptov_table[ptov_index].len = round_page(len);
-       pa_offset = temp_ptov_table[ptov_index].va - orig_va; 
-       arm_vm_page_granular_prot(temp_ptov_table[ptov_index].va, temp_ptov_table[ptov_index].len, pa_offset, 1, pte_prot_APX, 1, force_page_granule);
+       pa_offset = temp_ptov_table[ptov_index].va - orig_va;
+       arm_vm_page_granular_prot(temp_ptov_table[ptov_index].va, temp_ptov_table[ptov_index].len, pa_offset, 1, pte_prot_APX, 1, granule);
        ++ptov_index;
 }
 
@@ -1186,18 +1187,20 @@ arm_vm_physmap_init(boot_args *args, vm_map_address_t physmap_base, vm_map_addre
        bzero(temp_ptov_table, sizeof(temp_ptov_table));
 
        // Will be handed back to VM layer through ml_static_mfree() in arm_vm_prot_finalize()
-       arm_vm_physmap_slide(temp_ptov_table, physmap_base, gVirtBase, segEXTRADATA - gVirtBase, AP_RWNA, FALSE);
+       arm_vm_physmap_slide(temp_ptov_table, physmap_base, gVirtBase, segLOWEST - gVirtBase, AP_RWNA,
+           ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
 
-       arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern, FALSE); /* Device Tree, RAM Disk (if present), bootArgs */
+       arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern,
+           ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); /* Device Tree, RAM Disk (if present), bootArgs */
 
        arm_vm_physmap_slide(temp_ptov_table, physmap_base, (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE - gPhysBase + gVirtBase),
-                            real_avail_end - (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE), AP_RWNA, FALSE); // rest of physmem
+           real_avail_end - (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE), AP_RWNA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // rest of physmem
 
        assert((temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len) <= dynamic_memory_begin);
 
        // Sort in descending order of segment length.  LUT traversal is linear, so largest (most likely used)
        // segments should be placed earliest in the table to optimize lookup performance.
-       qsort(temp_ptov_table, PTOV_TABLE_SIZE, sizeof(temp_ptov_table[0]), cmp_ptov_entries); 
+       qsort(temp_ptov_table, PTOV_TABLE_SIZE, sizeof(temp_ptov_table[0]), cmp_ptov_entries);
 
        memcpy(ptov_table, temp_ptov_table, sizeof(ptov_table));
 }
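
cmp_ptov_entries itself lies outside this hunk, so the comparator below is an assumption about its behavior inferred from the comment: sort descending by length, so the largest (most frequently hit) range is scanned first by the linear lookup.

    #include <stdlib.h>

    typedef struct { unsigned long pa, va, len; } ptov_entry_t;

    /* Descending by len: longer segments sort toward index 0. */
    static int
    cmp_ptov_entries(const void *a, const void *b)
    {
        const ptov_entry_t *e1 = a;
        const ptov_entry_t *e2 = b;
        if (e1->len < e2->len) { return  1; }
        if (e1->len > e2->len) { return -1; }
        return 0;
    }

    /* usage: qsort(table, PTOV_TABLE_SIZE, sizeof(table[0]), cmp_ptov_entries); */
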
@@ -1222,7 +1225,7 @@ arm_vm_prot_finalize(boot_args * args __unused)
         * should be immediately followed by XNU's TEXT segment
         */
 
-       ml_static_mfree(phystokv(gPhysBase), segEXTRADATA - gVirtBase);
+       ml_static_mfree(phystokv(gPhysBase), segLOWEST - gVirtBase);
 
        /*
         * KTRR support means we will be mucking with these pages and trying to
@@ -1233,18 +1236,9 @@ arm_vm_prot_finalize(boot_args * args __unused)
                ml_static_mfree(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT));
        }
 
-       /*
-        * LowResetVectorBase patching should be done by now, so tighten executable
-        * protections.
-        */
-       arm_vm_page_granular_ROX(segTEXTEXECB, segSizeTEXTEXEC, FALSE);
-
        /* tighten permissions on kext read only data and code */
-       if (segSizePLKDATACONST && segSizePLKTEXTEXEC) {
-               arm_vm_page_granular_RNX(segPRELINKTEXTB, segSizePRELINKTEXT, FALSE);
-               arm_vm_page_granular_ROX(segPLKTEXTEXECB, segSizePLKTEXTEXEC, FALSE);
-               arm_vm_page_granular_RNX(segPLKDATACONSTB, segSizePLKDATACONST, FALSE);
-       }
+       arm_vm_page_granular_RNX(segPRELINKTEXTB, segSizePRELINKTEXT, ARM64_GRANULE_ALLOW_BLOCK);
+       arm_vm_page_granular_RNX(segPLKDATACONSTB, segSizePLKDATACONST, ARM64_GRANULE_ALLOW_BLOCK);
 
        cpu_stack_alloc(&BootCpuData);
        arm64_replace_bootstack(&BootCpuData);
@@ -1259,7 +1253,7 @@ arm_vm_prot_finalize(boot_args * args __unused)
        /*
         * __LAST,__pinst should no longer be executable.
         */
-       arm_vm_page_granular_RNX(segLASTB, segSizeLAST, FALSE);
+       arm_vm_page_granular_RNX(segLASTB, segSizeLAST, ARM64_GRANULE_ALLOW_BLOCK);
 
        /*
         * Must wait until all other region permissions are set before locking down DATA_CONST
@@ -1268,11 +1262,8 @@ arm_vm_prot_finalize(boot_args * args __unused)
         */
 #endif
 
-       arm_vm_page_granular_RNX(segDATACONSTB, segSizeDATACONST, FALSE);
+       arm_vm_page_granular_RNX(segDATACONSTB, segSizeDATACONST, ARM64_GRANULE_ALLOW_BLOCK);
 
-#ifndef __ARM_L1_PTW__
-       FlushPoC_Dcache();
-#endif
        __builtin_arm_dsb(DSB_ISH);
        flush_mmu_tlb();
 }
@@ -1302,7 +1293,16 @@ set_tbi(void)
                user_tbi = ((tbi & TBI_USER) == TBI_USER);
        old_tcr = new_tcr = get_tcr();
        new_tcr |= (user_tbi) ? TCR_TBI0_TOPBYTE_IGNORED : 0;
+
+#if !defined(HAS_APPLE_PAC)
+       /*
+        * arm_vm_init() runs after rebase_threaded_starts(), so enabling TBI1
+        * at this point will break the computed pointer signatures.  TBID1
+        * could help mitigate this problem, but for now we'll just disable
+        * kernel TBI if PAC is being used.
+        */
        new_tcr |= (tbi & TBI_KERNEL) ? TCR_TBI1_TOPBYTE_IGNORED : 0;
+#endif
 
        if (old_tcr != new_tcr) {
                set_tcr(new_tcr);
@@ -1317,19 +1317,8 @@ set_tbi(void)
 void
 arm_vm_init(uint64_t memory_size, boot_args * args)
 {
-#if !__ARM64_TWO_LEVEL_PMAP__
        vm_map_address_t va_l1, va_l1_end;
        tt_entry_t       *cpu_l1_tte;
-#else
-       /*
-        * If we are using two level page tables, rather than the
-        * 3 level page tables that xnu defaults to for ARM64,
-        * then a great deal of the code in this path becomes
-        * redundant.  As a result, most of the logic having to
-        * do with L1 pages will be excluded from such
-        * configurations in this function.
-        */
-#endif
        vm_map_address_t va_l2, va_l2_end;
        tt_entry_t       *cpu_l2_tte;
        pmap_paddr_t     boot_ttep;
@@ -1345,8 +1334,22 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         */
        gVirtBase = args->virtBase;
        gPhysBase = args->physBase;
-       gPhysSize = args->memSize;
-       mem_size = args->memSize;
+#if KASAN
+       real_phys_size = args->memSize + (shadow_ptop - shadow_pbase);
+#else
+       real_phys_size = args->memSize;
+#endif
+       /*
+        * Ensure the physical region we specify for the VM to manage ends on a
+        * software page boundary.  Note that the software page size (PAGE_SIZE)
+        * may be a multiple of the hardware page size specified in ARM_PGBYTES.
+        * We must round the reported memory size down to the nearest PAGE_SIZE
+        * boundary to ensure the VM does not try to manage a page it does not
+        * completely own.  The KASAN shadow region, if present, is managed entirely
+        * in units of the hardware page size and should not need similar treatment.
+        */
+       gPhysSize = mem_size = ((gPhysBase + args->memSize) & ~PAGE_MASK) - gPhysBase;
+
        if ((memory_size != 0) && (mem_size > memory_size))
                mem_size = memory_size;
        if (mem_size >= ((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 4))
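
A worked example of the rounding introduced above, assuming 16KB software pages (PAGE_SIZE) over 4KB hardware pages; the base and size constants are made up for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 0x4000ULL          /* 16KB software page */
    #define PAGE_MASK (PAGE_SIZE - 1)

    int
    main(void)
    {
        uint64_t phys_base = 0x800000000ULL;   /* 16KB-aligned           */
        uint64_t mem_size  = 0x3FFFA000ULL;    /* not a 16KB multiple    */
        /* trim so the last software page is fully backed */
        uint64_t trimmed   = ((phys_base + mem_size) & ~PAGE_MASK) - phys_base;
        printf("managed size: 0x%llx (dropped 0x%llx)\n",
            (unsigned long long)trimmed,
            (unsigned long long)(mem_size - trimmed));  /* drops 0x2000 */
        return 0;
    }
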
@@ -1403,21 +1406,14 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        /*
         * Initialize l1 page table page
         */
-#if __ARM64_TWO_LEVEL_PMAP__
-       /*
-        * If we're using a two level page table, we still need to
-        * set the cpu_ttep to avail_start, as this will be the root
-        * of our page table regardless of how many levels we are
-        * using.
-        */
-#endif
        cpu_tte = (tt_entry_t *)alloc_ptpage(TRUE);
        cpu_ttep = kvtophys((vm_offset_t)cpu_tte);
        bzero(cpu_tte, ARM_PGBYTES);
        avail_end = gPhysBase + mem_size;
+       assert(!(avail_end & PAGE_MASK));
 
 #if KASAN
-       real_avail_end = avail_end + (shadow_ptop - shadow_pbase);
+       real_avail_end = gPhysBase + real_phys_size;
 #else
        real_avail_end = avail_end;
 #endif
@@ -1429,9 +1425,8 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         *
         *   the so called physical aperture should be statically mapped
         */
-#if !__ARM64_TWO_LEVEL_PMAP__
        va_l1 = gVirtBase;
-       va_l1_end = dynamic_memory_begin; 
+       va_l1_end = dynamic_memory_begin;
        cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
 
        while (va_l1 < va_l1_end) {
@@ -1450,7 +1445,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
                va_l1 += ARM_TT_L1_SIZE;
                cpu_l1_tte++;
        }
-#endif
 
 #if __ARM_KERNEL_PROTECT__
        /* Expand the page tables to prepare for the EL0 mappings. */
@@ -1519,14 +1513,9 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         *   cover this address range:
         *     LOW_GLOBAL_BASE_ADDRESS + 2MB
         */
-#if __ARM64_TWO_LEVEL_PMAP__
-       va_l2 = LOW_GLOBAL_BASE_ADDRESS;
-       cpu_l2_tte = cpu_tte + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
-#else
        va_l1 = va_l2 = LOW_GLOBAL_BASE_ADDRESS;
        cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
        cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
-#endif
        ptpage_vaddr = alloc_ptpage(TRUE);
        *cpu_l2_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN;
        bzero((void *)ptpage_vaddr, ARM_PGBYTES);
@@ -1536,7 +1525,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         *   cover this address range:
         *    KERNEL_DYNAMIC_ADDR - VM_MAX_KERNEL_ADDRESS
         */
-#if !__ARM64_TWO_LEVEL_PMAP__
        va_l1 = dynamic_memory_begin;
        va_l1_end = VM_MAX_KERNEL_ADDRESS;
        cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
@@ -1557,20 +1545,26 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
                va_l1 += ARM_TT_L1_SIZE;
                cpu_l1_tte++;
        }
-#endif
 
 #if KASAN
        /* record the extent of the physmap */
        physmap_vbase = physmap_base;
        physmap_vtop = static_memory_end;
        kasan_init();
-#endif
+#endif /* KASAN */
+
+#if MONOTONIC
+       mt_early_init();
+#endif /* MONOTONIC */
 
        set_tbi();
-       set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
 
        arm_vm_physmap_init(args, physmap_base, dynamic_memory_begin);
        set_mmu_ttb_alternate(cpu_ttep & TTBR_BADDR_MASK);
+
+
+       set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
+
        flush_mmu_tlb();
        kva_active = TRUE;
        // global table pointers may need to be different due to physical aperture remapping
@@ -1600,7 +1594,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
 
        mem_segments = (mem_size + 0x0FFFFFFF) >> 28;
 
-#if !__ARM64_TWO_LEVEL_PMAP__
        va_l1 = dynamic_memory_begin;
        va_l1_end = va_l1 + ((2 + (mem_segments * 10)) << 20);
        va_l1_end += round_page(args->Video.v_height * args->Video.v_rowBytes);
@@ -1620,13 +1613,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
                }
 
                cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
-#else
-               va_l2 = dynamic_memory_begin;
-               va_l2_end = va_l2 + ((2 + (mem_segments * 10)) << 20);
-               va_l2_end += round_page(args->Video.v_height * args->Video.v_rowBytes);
-               va_l2_end = (va_l2_end + 0x00000000007FFFFFULL) & 0xFFFFFFFFFF800000ULL;
-               cpu_l2_tte = cpu_tte + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
-#endif
 
                while (va_l2 < va_l2_end) {
                        pt_entry_t *    ptp;
@@ -1636,25 +1622,23 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
                        ptp = (pt_entry_t *) alloc_ptpage(FALSE);
                        ptp_phys = (pmap_paddr_t)kvtophys((vm_offset_t)ptp);
 
-                       pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE);
+                       pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE, TRUE);
 
                        *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN;
 
                        va_l2 += ARM_TT_L2_SIZE;
                        cpu_l2_tte++;
                };
-#if !__ARM64_TWO_LEVEL_PMAP__
+
                va_l1 = va_l2_end;
                cpu_l1_tte++;
        }
-#endif
 
        /*
         * Initialize l3 page table pages :
         *   cover this address range:
         *   (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VM_MAX_KERNEL_ADDRESS
         */
-#if !__ARM64_TWO_LEVEL_PMAP__
        va_l1 = VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK;
        va_l1_end = VM_MAX_KERNEL_ADDRESS;
 
@@ -1672,11 +1656,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
                }
 
                cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
-#else
-               va_l2 = VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK;
-               va_l2_end = VM_MAX_KERNEL_ADDRESS;
-               cpu_l2_tte = cpu_tte + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
-#endif
 
                while (va_l2 < va_l2_end) {
                        pt_entry_t *    ptp;
@@ -1686,18 +1665,17 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
                        ptp = (pt_entry_t *) alloc_ptpage(FALSE);
                        ptp_phys = (pmap_paddr_t)kvtophys((vm_offset_t)ptp);
 
-                       pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE);
+                       pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE, TRUE);
 
                        *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN;
 
                        va_l2 += ARM_TT_L2_SIZE;
                        cpu_l2_tte++;
                };
-#if !__ARM64_TWO_LEVEL_PMAP__
+
                va_l1 = va_l2_end;
                cpu_l1_tte++;
        }
-#endif
 
 #if __ARM64_PMAP_SUBPAGE_L1__ && __ARM_16K_PG__
        /*
index fb2c1ea8a5520a1807064230949461e82ee0c43a..3bced40de936bed4d257151aa42aa8848b3702f8 100644 (file)
        movk $0, #((($1) >> 00) & 0x000000000000FFFF), lsl #00
 .endmacro
 
+.macro MOV32
+       movz $0, #((($1) >> 16) & 0x000000000000FFFF), lsl #16
+       movk $0, #((($1) >> 00) & 0x000000000000FFFF), lsl #00
+.endmacro
+
 .macro ARM64_STACK_PROLOG
 #if __has_feature(ptrauth_returns)
        pacibsp
 
 #ifdef  XNU_KERNEL_PRIVATE
 .macro PANIC_UNIMPLEMENTED
-       bl _panic_unimplemented
+       bl EXT(panic_unimplemented)
 .endmacro
 #endif
 
index f40b6bfca3e7aa021d9470abf92b00ee2dc56413..0a76b1cafbfc21b532612c37963214546364e886 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -143,7 +143,17 @@ mach_syscall(struct arm_saved_state *state)
 {
        kern_return_t retval;
        mach_call_t mach_call;
-       struct mach_call_args args = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+       struct mach_call_args args = {
+               .arg1 = 0,
+               .arg2 = 0,
+               .arg3 = 0,
+               .arg4 = 0,
+               .arg5 = 0,
+               .arg6 = 0,
+               .arg7 = 0,
+               .arg8 = 0,
+               .arg9 = 0
+       };
        int call_number = get_saved_state_svc_number(state);
        int64_t exc_code;
        int argc;
index a673abaf302d8465c480dd33a1fb4556a72fbb11..87caca6e81fe0f081f39a32103572949ca4d7fe1 100644 (file)
        .globl EXT(invalidate_mmu_icache)
 LEXT(InvalidatePoU_Icache)
 LEXT(invalidate_mmu_icache)
+       dsb             sy
        ic              ialluis                                                         // Invalidate icache
        dsb             sy
        isb             sy
+L_imi_done:
        ret
 
 /*
@@ -57,6 +59,10 @@ LEXT(invalidate_mmu_icache)
        .align 2
        .globl EXT(InvalidatePoU_IcacheRegion)
 LEXT(InvalidatePoU_IcacheRegion)
+       ARM64_STACK_PROLOG
+       PUSH_FRAME
+       bl              EXT(CleanPoU_DcacheRegion)
+#if __ARM_IC_NOALIAS_ICACHE__
        mov             x9, #((1<<MMU_I_CLINE)-1) 
        and             x2, x0, x9
        bic             x0, x0, x9                                                      // Cached aligned
@@ -70,7 +76,12 @@ L_ipui_loop:
        b.pl    L_ipui_loop                                                     // Loop in counter not null
        dsb             sy
        isb             sy
-       ret
+L_ipui_done:
+#else
+       bl              EXT(InvalidatePoU_Icache)
+#endif
+       POP_FRAME
+       ARM64_STACK_EPILOG
 
 
 /*
@@ -90,6 +101,7 @@ LEXT(CleanPoC_Dcache)
        mov             x9, #(1 << MMU_I7SET)
        mov             x10, #(1 << (MMU_NSET + MMU_I7SET))
        mov             x11, #(1 << MMU_I7WAY)
+       dmb             sy
 L_cpcd_dcacheway:
 L_cpcd_dcacheline:
        dc              csw, x0                                                         // clean dcache line by way/set
@@ -97,13 +109,14 @@ L_cpcd_dcacheline:
        tst             x0, #(1 << (MMU_NSET + MMU_I7SET))      // look for overflow
        b.eq    L_cpcd_dcacheline
        bic             x0, x0, x10                                                     // clear set overflow
-       adds    x0, x0, x11                                                     // increment way
+       adds    w0, w0, w11                                                     // increment way
        b.cc    L_cpcd_dcacheway                                        // loop
 #if __ARM_L2CACHE__
        mov             x0, #2
        mov             x9, #(1 << L2_I7SET)
        mov             x10, #(1 << (L2_NSET + L2_I7SET))
        mov             x11, #(1 << L2_I7WAY)
+       dsb             sy
 L_cpcd_l2dcacheway:
 L_cpcd_l2dcacheline:
        dc              csw, x0                                                         // clean dcache line by way/set
@@ -111,7 +124,7 @@ L_cpcd_l2dcacheline:
        tst             x0, #(1 << (L2_NSET + L2_I7SET))        // look for overflow
        b.eq    L_cpcd_l2dcacheline
        bic             x0, x0, x10                                                     // clear set overflow
-       adds    x0, x0, x11                                                     // increment way
+       adds    w0, w0, w11                                                     // increment way
        b.cc    L_cpcd_l2dcacheway                                      // loop
 #endif
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
@@ -134,6 +147,7 @@ LEXT(CleanPoU_Dcache)
        mov             x9, #(1 << MMU_I7SET)
        mov             x10, #(1 << (MMU_NSET + MMU_I7SET))
        mov             x11, #(1 << MMU_I7WAY)
+       dmb             sy
 L_cpud_dcacheway:
 L_cpud_dcacheline:
        dc              csw, x0                                                         // clean dcache line by way/set
@@ -141,7 +155,7 @@ L_cpud_dcacheline:
        tst             x0, #(1 << (MMU_NSET + MMU_I7SET))      // look for overflow
        b.eq    L_cpud_dcacheline
        bic             x0, x0, x10                                                     // clear set overflow
-       adds    x0, x0, x11                                                     // increment way
+       adds    w0, w0, w11                                                     // increment way
        b.cc    L_cpud_dcacheway                                        // loop
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
        dsb sy
@@ -161,14 +175,15 @@ LEXT(CleanPoU_DcacheRegion)
 #else /* !defined(APPLE_ARM64_ARCH_FAMILY) */
        mov             x9, #((1<<MMU_CLINE)-1)
        and             x2, x0, x9
-       bic             x0, x0, x9                                                      // Cached aligned
-       add             x1, x1, x2
-       sub             x1, x1, #1
-       lsr             x1, x1, #MMU_CLINE                                      // Set cache line counter
+       bic             x3, x0, x9                                                      // Cached aligned
+       add             x4, x1, x2
+       sub             x4, x4, #1
+       lsr             x4, x4, #MMU_CLINE                                      // Set cache line counter
+       dmb             sy
 L_cpudr_loop:
-       dc              cvau, x0                                                        // Clean dcache line to PoU 
-       add             x0, x0, #(1<<MMU_CLINE)                         // Get next cache aligned addr
-       subs    x1, x1, #1                                                      // Decrementer cache line counter
+       dc              cvau, x3                                                        // Clean dcache line to PoU 
+       add             x3, x3, #(1<<MMU_CLINE)                         // Get next cache aligned addr
+       subs    x4, x4, #1                                                      // Decrementer cache line counter
        b.pl    L_cpudr_loop                                            // Loop in counter not null
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
        dsb             sy
@@ -182,8 +197,6 @@ L_cpudr_loop:
        .text
        .align 2
 LEXT(CleanPoC_DcacheRegion_internal)
-       ARM64_STACK_PROLOG
-       PUSH_FRAME
        mov             x9, #((1<<MMU_CLINE)-1)
        and             x2, x0, x9
        bic             x0, x0, x9                                                      // Cached aligned
@@ -205,8 +218,7 @@ L_cpcdr_loop:
        subs    x1, x1, #1                                                      // Decrementer cache line counter
        b.pl    L_cpcdr_loop                                            // Loop in counter not null
        dsb             sy
-       POP_FRAME
-       ARM64_STACK_EPILOG
+       ret
 
 /*
  *     void CleanPoC_DcacheRegion(vm_offset_t va, unsigned length)
@@ -289,6 +301,7 @@ LEXT(FlushPoC_Dcache)
        mov             x9, #(1 << MMU_I7SET)
        mov             x10, #(1 << (MMU_NSET + MMU_I7SET))
        mov             x11, #(1 << MMU_I7WAY)
+       dmb             sy
 L_fpcd_dcacheway:
 L_fpcd_dcacheline:
        dc              cisw, x0                                                        // clean invalidate dcache line by way/set
@@ -296,9 +309,10 @@ L_fpcd_dcacheline:
        tst             x0, #(1 << (MMU_NSET + MMU_I7SET))      // look for overflow
        b.eq    L_fpcd_dcacheline
        bic             x0, x0, x10                                                     // clear set overflow
-       adds    x0, x0, x11                                                     // increment way
+       adds    w0, w0, w11                                                     // increment way
        b.cc    L_fpcd_dcacheway                                        // loop
 #if __ARM_L2CACHE__
+       dsb             sy
        mov             x0, #2
        mov             x9, #(1 << L2_I7SET)
        mov             x10, #(1 << (L2_NSET + L2_I7SET))
@@ -310,7 +324,7 @@ L_fpcd_l2dcacheline:
        tst             x0, #(1 << (L2_NSET + L2_I7SET))        // look for overflow
        b.eq    L_fpcd_l2dcacheline
        bic             x0, x0, x10                                                     // clear set overflow
-       adds    x0, x0, x11                                                     // increment way
+       adds    w0, w0, w11                                                     // increment way
        b.cc    L_fpcd_l2dcacheway                                      // loop
 #endif
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
@@ -333,6 +347,7 @@ LEXT(FlushPoU_Dcache)
        mov             x9, #(1 << MMU_I7SET)
        mov             x10, #(1 << (MMU_NSET + MMU_I7SET))
        mov             x11, #(1 << MMU_I7WAY)
+       dmb             sy
 L_fpud_way:
 L_fpud_line:
        dc              cisw, x0                                                        // clean invalidate dcache line by way/set
@@ -340,7 +355,7 @@ L_fpud_line:
        tst             x0, #1 << (MMU_NSET + MMU_I7SET)        // look for overflow
        b.eq    L_fpud_line
        bic             x0, x0, x10                                                     // clear set overflow
-       adds    x0, x0, x11                                                     // increment way
+       adds    w0, w0, w11                                                     // increment way
        b.cc    L_fpud_way                                                      // loop
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
        dsb             sy
@@ -364,6 +379,7 @@ LEXT(FlushPoC_DcacheRegion)
        add             x1, x1, x2
        sub             x1, x1, #1
        lsr             x1, x1, #MMU_CLINE                                      // Set cache line counter
+       dmb             sy
 L_fpcdr_loop:
        dc              civac, x0                                                       // Clean invalidate dcache line to PoC
        add             x0, x0, #(1<<MMU_CLINE)                         // Get next cache aligned addr
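
The pattern these hunks enforce, cleaning the dcache to the point of unification before invalidating the icache, with barriers on either side, is the standard arm64 recipe for making freshly written instructions visible to the fetch unit. Compilers expose the userspace analogue of the whole sequence as a builtin:

    #include <stddef.h>

    /* After writing code into buf, make it fetchable: on arm64 this expands
     * to DC CVAU / DSB / IC IVAU / DSB / ISB over the range. */
    static void
    make_code_visible(char *buf, size_t len)
    {
        __builtin___clear_cache(buf, buf + len);
    }
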
index 031e5a396b023721d4ce127fed6db4f5aada5681..af47b7201f2366592b827819cd50396e3fcecf81 100644 (file)
 extern int _bcopyin(const char *src, char *dst, vm_size_t len);
 extern int _bcopyinstr(const char *src, char *dst, vm_size_t max, vm_size_t *actual);
 extern int _bcopyout(const char *src, char *dst, vm_size_t len);
-extern int _copyin_word(const char *src, uint64_t *dst, vm_size_t len);
+extern int _copyin_atomic32(const char *src, uint32_t *dst);
+extern int _copyin_atomic32_wait_if_equals(const char *src, uint32_t dst);
+extern int _copyin_atomic64(const char *src, uint64_t *dst);
+extern int _copyout_atomic32(uint32_t u32, const char *dst);
+extern int _copyout_atomic64(uint64_t u64, const char *dst);
+
+extern int copyoutstr_prevalidate(const void *kaddr, user_addr_t uaddr, size_t len);
 
 extern pmap_t kernel_pmap;
 
 /* On by default, optionally disabled by boot-arg */
 extern boolean_t copyio_zalloc_check;
 
-typedef enum copyio_type {
-       COPYIO_IN,
-       COPYIO_IN_WORD,
-       COPYIO_INSTR,
-       COPYIO_OUT,
-} copyio_type_t;
+/*!
+ * @typedef copyio_flags_t
+ *
+ * @const COPYIO_IN
+ * The copy is user -> kernel.
+ * One of COPYIO_IN or COPYIO_OUT should always be specified.
+ *
+ * @const COPYIO_OUT
+ * The copy is kernel -> user.
+ * One of COPYIO_IN or COPYIO_OUT should always be specified.
+ *
+ * @const COPYIO_ALLOW_KERNEL_TO_KERNEL
+ * The "user_address" is allowed to be in the VA space of the kernel.
+ *
+ * @const COPYIO_VALIDATE_USER_ONLY
+ * There isn't really a kernel address used, and only the user address
+ * needs to be validated.
+ *
+ * @const COPYIO_ATOMIC
+ * The copyio operation is atomic; ensure that it is properly aligned.
+ */
+__options_decl(copyio_flags_t, uint32_t, {
+       COPYIO_IN                       = 0x0001,
+       COPYIO_OUT                      = 0x0002,
+       COPYIO_ALLOW_KERNEL_TO_KERNEL   = 0x0004,
+       COPYIO_VALIDATE_USER_ONLY       = 0x0008,
+       COPYIO_ATOMIC                   = 0x0010,
+});
 
 static inline void
 user_access_enable(void)
 {
 #if __ARM_PAN_AVAILABLE__
+       assert(__builtin_arm_rsr("pan") != 0);
        __builtin_arm_wsr("pan", 0);
 #endif  /* __ARM_PAN_AVAILABLE__ */
 }
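
All of the copy routines below follow the same discipline: validate first, then open the PAN window only for the duration of the raw copy. A sketch of the bracketing (copy_user_bytes is a hypothetical stand-in for the _bcopyin/_bcopyout family; the enable/disable helpers mirror the ones defined above):

    extern void user_access_enable(void);   /* PAN off: user VAs accessible */
    extern void user_access_disable(void);  /* PAN back on                  */
    extern int  copy_user_bytes(const char *src, char *dst, unsigned long n); /* hypothetical */

    static int
    copy_with_pan_window(const char *user_src, char *kernel_dst, unsigned long n)
    {
        int result;

        user_access_enable();
        result = copy_user_bytes(user_src, kernel_dst, n);
        user_access_disable();              /* never return with the window open */
        return result;
    }
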
@@ -70,78 +99,94 @@ user_access_disable(void)
 #endif  /* __ARM_PAN_AVAILABLE__ */
 }
 
+/*
+ * Copy sizes bigger than this value will cause a kernel panic.
+ *
+ * Yes, this is an arbitrary fixed limit, but it's almost certainly
+ * a programming error to be copying more than this amount between
+ * user and wired kernel memory in a single invocation on this
+ * platform.
+ */
+const int copysize_limit_panic = (64 * 1024 * 1024);
+
+static inline bool
+is_kernel_to_kernel_copy()
+{
+       return current_thread()->map->pmap == kernel_pmap;
+}
+
+/*
+ * Validate the arguments to copy{in,out} on this platform.
+ *
+ * Returns EXDEV when the current thread's pmap is the kernel's,
+ * which is non-fatal for certain routines.
+ */
 static int
-copyio(copyio_type_t copytype, const char *src, char *dst,
-    vm_size_t nbytes, vm_size_t *lencopied)
+copy_validate(const user_addr_t user_addr, uintptr_t kernel_addr,
+    vm_size_t nbytes, copyio_flags_t flags)
 {
-       int result = 0;
-       vm_size_t bytes_copied = 0;
-       vm_size_t kernel_buf_size = 0;
-       void * kernel_addr = NULL;
+       thread_t self = current_thread();
 
-       /* Reject TBI addresses */
-       if (copytype == COPYIO_OUT) {
-               if ((uintptr_t)dst & TBI_MASK) {
-                       return EINVAL;
-               }
-       } else {
-               if ((uintptr_t)src & TBI_MASK) {
+       user_addr_t user_addr_last;
+       uintptr_t kernel_addr_last;
+
+       if (__improbable(nbytes > copysize_limit_panic)) {
+               panic("%s(%p, %p, %lu) - transfer too large", __func__,
+                   (void *)user_addr, (void *)kernel_addr, nbytes);
+       }
+
+       if (__improbable((user_addr < vm_map_min(self->map)) ||
+           os_add_overflow(user_addr, nbytes, &user_addr_last) ||
+           (user_addr_last > vm_map_max(self->map)))) {
+               return EFAULT;
+       }
+
+       if (flags & COPYIO_ATOMIC) {
+               if (__improbable(user_addr & (nbytes - 1))) {
                        return EINVAL;
                }
        }
 
-       if (__probable(copyio_zalloc_check)) {
-               if (copytype == COPYIO_IN || copytype == COPYIO_INSTR || copytype == COPYIO_IN_WORD) {
-                       kernel_addr = (void*)dst;
-               } else if (copytype == COPYIO_OUT) {
-                       kernel_addr = (void*)(uintptr_t)src;
+       if ((flags & COPYIO_VALIDATE_USER_ONLY) == 0) {
+               if (__improbable((kernel_addr < VM_MIN_KERNEL_ADDRESS) ||
+                   os_add_overflow(kernel_addr, nbytes, &kernel_addr_last) ||
+                   (kernel_addr_last > VM_MAX_KERNEL_ADDRESS))) {
+                       panic("%s(%p, %p, %lu) - kaddr not in kernel", __func__,
+                           (void *)user_addr, (void *)kernel_addr, nbytes);
                }
-               if (kernel_addr) {
-                       kernel_buf_size = zone_element_size(kernel_addr, NULL);
-               }
-               if (__improbable(kernel_buf_size && kernel_buf_size < nbytes)) {
-                       panic("copyio: kernel buffer %p has size %lu < nbytes %lu", kernel_addr, kernel_buf_size, nbytes);
+       }
+
+       if (is_kernel_to_kernel_copy()) {
+               if (__improbable((flags & COPYIO_ALLOW_KERNEL_TO_KERNEL) == 0)) {
+                       return EFAULT;
                }
+               return EXDEV;
        }
 
-#if KASAN
-       /* For user copies, asan-check the kernel-side buffer */
-       if (copytype == COPYIO_IN || copytype == COPYIO_INSTR || copytype == COPYIO_IN_WORD) {
-               __asan_storeN((uintptr_t)dst, nbytes);
-       } else if (copytype == COPYIO_OUT) {
-               __asan_loadN((uintptr_t)src, nbytes);
+       if (__improbable(user_addr & TBI_MASK)) {
+               return EINVAL;
        }
-#endif
 
-       user_access_enable();
+       if ((flags & COPYIO_VALIDATE_USER_ONLY) == 0) {
+               if (__probable(copyio_zalloc_check)) {
+                       vm_size_t kernel_buf_size = zone_element_size((void *)kernel_addr, NULL);
+                       if (__improbable(kernel_buf_size && kernel_buf_size < nbytes)) {
+                               panic("copyio_preflight: kernel buffer 0x%lx has size %lu < nbytes %lu",
+                                   kernel_addr, kernel_buf_size, nbytes);
+                       }
+               }
 
-       /* Select copy routines based on direction:
-        *   COPYIO_IN - Use unprivileged loads to read from user address
-        *   COPYIO_OUT - Use unprivleged stores to write to user address
-        */
-
-       switch (copytype) {
-       case COPYIO_IN:
-               result = _bcopyin(src, dst, nbytes);
-               break;
-       case COPYIO_INSTR:
-               result = _bcopyinstr(src, dst, nbytes, &bytes_copied);
-               if (result != EFAULT) {
-                       *lencopied = bytes_copied;
+#if KASAN
+               /* For user copies, asan-check the kernel-side buffer */
+               if (flags & COPYIO_IN) {
+                       __asan_storeN(kernel_addr, nbytes);
+               } else {
+                       __asan_loadN(kernel_addr, nbytes);
+                       kasan_check_uninitialized((vm_address_t)kernel_addr, nbytes);
                }
-               break;
-       case COPYIO_IN_WORD:
-               result = _copyin_word(src, (uint64_t *)(uintptr_t)dst, nbytes);
-               break;
-       case COPYIO_OUT:
-               result = _bcopyout(src, dst, nbytes);
-               break;
-       default:
-               result = EINVAL;
+#endif
        }
-
-       user_access_disable();
-       return result;
+       return 0;
 }
 
 int
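
One detail worth spelling out from copy_validate() above: the COPYIO_ATOMIC alignment test relies on nbytes being a power of two (the callers pass 4 or 8), so nbytes - 1 is a mask of exactly the low address bits:

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    atomic_copy_aligned(uint64_t user_addr, uint64_t nbytes)
    {
        /* valid only for power-of-two nbytes, as in copy_validate() */
        return (user_addr & (nbytes - 1)) == 0;
    }

    /* atomic_copy_aligned(0x16b04fd34, 4) -> true  (accepted)
     * atomic_copy_aligned(0x16b04fd36, 8) -> false (EINVAL)   */
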
@@ -165,67 +210,123 @@ copyin(const user_addr_t user_addr, void *kernel_addr, vm_size_t nbytes)
 {
        int result;
 
-       if (nbytes == 0) {
+       if (__improbable(nbytes == 0)) {
                return 0;
        }
 
-       result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes);
-       if (result) {
+       result = copy_validate(user_addr, (uintptr_t)kernel_addr, nbytes,
+           COPYIO_IN | COPYIO_ALLOW_KERNEL_TO_KERNEL);
+       if (result == EXDEV) {
+               return copyin_kern(user_addr, kernel_addr, nbytes);
+       }
+       if (__improbable(result)) {
                return result;
        }
 
-       if (current_thread()->map->pmap == kernel_pmap) {
-               return copyin_kern(user_addr, kernel_addr, nbytes);
-       } else {
-               return copyio(COPYIO_IN, (const char *)(uintptr_t)user_addr, kernel_addr, nbytes, NULL);
-       }
+       user_access_enable();
+       result = _bcopyin((const char *)user_addr, kernel_addr, nbytes);
+       user_access_disable();
+       return result;
 }
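
Not from the diff: a minimal caller sketch against the copyin() signature shown above. The syscall-style handler and the argument struct are hypothetical.

    /* Hypothetical user-supplied argument block. */
    struct my_args {
            uint64_t buf;
            uint64_t len;
    };

    static int
    my_handler(user_addr_t uap)
    {
            struct my_args args;
            int err = copyin(uap, &args, sizeof(args));

            if (err) {
                    return err;  /* EFAULT, or EINVAL for a tagged pointer */
            }
            /* args is now a stable kernel-side copy; validate and use it. */
            return 0;
    }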
 
 /*
- * copyin_word
- * Read an aligned value from userspace as a single memory transaction.
- * This function supports userspace synchronization features
+ * copy{in,out}_atomic{32,64}
+ * Read or store an aligned value from userspace as a single memory transaction.
+ * These functions support userspace synchronization features
  */
 int
-copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes)
+copyin_atomic32(const user_addr_t user_addr, uint32_t *kernel_addr)
 {
-       int                     result;
+       int result = copy_validate(user_addr, (uintptr_t)kernel_addr, 4,
+           COPYIO_IN | COPYIO_ATOMIC);
+       if (__improbable(result)) {
+               return result;
+       }
+       user_access_enable();
+       result = _copyin_atomic32((const char *)user_addr, kernel_addr);
+       user_access_disable();
+       return result;
+}
 
-       /* Verify sizes */
-       if ((nbytes != 4) && (nbytes != 8)) {
-               return EINVAL;
+int
+copyin_atomic32_wait_if_equals(const user_addr_t user_addr, uint32_t value)
+{
+       int result = copy_validate(user_addr, 0, 4,
+           COPYIO_OUT | COPYIO_ATOMIC | COPYIO_VALIDATE_USER_ONLY);
+       if (__improbable(result)) {
+               return result;
        }
+       user_access_enable();
+       result = _copyin_atomic32_wait_if_equals((const char *)user_addr, value);
+       user_access_disable();
+       return result;
+}
 
-       /* Test alignment */
-       if (user_addr & (nbytes - 1)) {
-               return EINVAL;
+int
+copyin_atomic64(const user_addr_t user_addr, uint64_t *kernel_addr)
+{
+       int result = copy_validate(user_addr, (uintptr_t)kernel_addr, 8,
+           COPYIO_IN | COPYIO_ATOMIC);
+       if (__improbable(result)) {
+               return result;
        }
+       user_access_enable();
+       result = _copyin_atomic64((const char *)user_addr, kernel_addr);
+       user_access_disable();
+       return result;
+}
 
-       result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes);
-       if (result) {
+int
+copyout_atomic32(uint32_t value, user_addr_t user_addr)
+{
+       int result = copy_validate(user_addr, 0, 4,
+           COPYIO_OUT | COPYIO_ATOMIC | COPYIO_VALIDATE_USER_ONLY);
+       if (__improbable(result)) {
                return result;
        }
+       user_access_enable();
+       result = _copyout_atomic32(value, (const char *)user_addr);
+       user_access_disable();
+       return result;
+}
 
-       return copyio(COPYIO_IN_WORD, (const char *)user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL);
+int
+copyout_atomic64(uint64_t value, user_addr_t user_addr)
+{
+       int result = copy_validate(user_addr, 0, 8,
+           COPYIO_OUT | COPYIO_ATOMIC | COPYIO_VALIDATE_USER_ONLY);
+       if (__improbable(result)) {
+               return result;
+       }
+       user_access_enable();
+       result = _copyout_atomic64(value, (const char *)user_addr);
+       user_access_disable();
+       return result;
 }
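
Not from the diff: how a caller might use the atomic routines for the userspace-synchronization purpose the comment above describes. A hedged sketch: the waiter logic is hypothetical, and the alignment requirement is inherited from the old copyin_word() contract (natural alignment, here 4 bytes for the 32-bit variants).

    uint32_t cur;
    int err = copyin_atomic32(user_lock_addr, &cur);  /* one 4-byte transaction */

    if (err == 0 && cur == expected) {
            /* Park until the user-visible word stops equaling `expected`. */
            err = copyin_atomic32_wait_if_equals(user_lock_addr, expected);
    }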
 
 int
 copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied)
 {
        int result;
+       vm_size_t bytes_copied = 0;
 
        *lencopied = 0;
-       if (nbytes == 0) {
+       if (__improbable(nbytes == 0)) {
                return ENAMETOOLONG;
        }
 
-       result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes);
-
-       if (result) {
+       result = copy_validate(user_addr, (uintptr_t)kernel_addr, nbytes, COPYIO_IN);
+       if (__improbable(result)) {
                return result;
        }
-
-       return copyio(COPYIO_INSTR, (const char *)(uintptr_t)user_addr, kernel_addr, nbytes, lencopied);
+       user_access_enable();
+       result = _bcopyinstr((const char *)user_addr, kernel_addr, nbytes,
+           &bytes_copied);
+       user_access_disable();
+       if (result != EFAULT) {
+               *lencopied = bytes_copied;
+       }
+       return result;
 }
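
Not from the diff: a caller sketch for copyinstr(), reflecting the contract visible above — nbytes == 0 yields ENAMETOOLONG, and *lencopied is updated only when the copy did not fault.

    char path[MAXPATHLEN];
    vm_size_t copied = 0;
    int err = copyinstr(user_path, path, sizeof(path), &copied);

    if (err == 0) {
            /* path[] is NUL-terminated; in the BSD convention, copied
             * counts the bytes copied including the terminating NUL. */
    } else if (err == ENAMETOOLONG) {
            /* string did not fit in sizeof(path) (or nbytes was 0) */
    }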
 
 int
@@ -237,69 +338,26 @@ copyout(const void *kernel_addr, user_addr_t user_addr, vm_size_t nbytes)
                return 0;
        }
 
-       result = copyout_validate((uintptr_t)kernel_addr, user_addr, nbytes);
-       if (result) {
-               return result;
-       }
-
-       if (current_thread()->map->pmap == kernel_pmap) {
+       result = copy_validate(user_addr, (uintptr_t)kernel_addr, nbytes,
+           COPYIO_OUT | COPYIO_ALLOW_KERNEL_TO_KERNEL);
+       if (result == EXDEV) {
                return copyout_kern(kernel_addr, user_addr, nbytes);
-       } else {
-               return copyio(COPYIO_OUT, kernel_addr, (char *)(uintptr_t)user_addr, nbytes, NULL);
        }
+       if (__improbable(result)) {
+               return result;
+       }
+       user_access_enable();
+       result = _bcopyout(kernel_addr, (char *)user_addr, nbytes);
+       user_access_disable();
+       return result;
 }
 
-
-/*
- * Copy sizes bigger than this value will cause a kernel panic.
- *
- * Yes, this is an arbitrary fixed limit, but it's almost certainly
- * a programming error to be copying more than this amount between
- * user and wired kernel memory in a single invocation on this
- * platform.
- */
-const int copysize_limit_panic = (64 * 1024 * 1024);
-
-/*
- * Validate the arguments to copy{in,out} on this platform.
- */
-static int
-copy_validate(const user_addr_t user_addr,
-    uintptr_t kernel_addr, vm_size_t nbytes)
+int
+copyoutstr_prevalidate(const void *__unused kaddr, user_addr_t __unused uaddr, size_t __unused len)
 {
-       uintptr_t kernel_addr_last = kernel_addr + nbytes;
-
-       if (__improbable(kernel_addr < VM_MIN_KERNEL_ADDRESS ||
-           kernel_addr > VM_MAX_KERNEL_ADDRESS ||
-           kernel_addr_last < kernel_addr ||
-           kernel_addr_last > VM_MAX_KERNEL_ADDRESS)) {
-               panic("%s(%p, %p, %lu) - kaddr not in kernel", __func__,
-                   (void *)user_addr, (void *)kernel_addr, nbytes);
-       }
-
-       user_addr_t user_addr_last = user_addr + nbytes;
-
-       if (__improbable((user_addr_last < user_addr) || ((user_addr + nbytes) > vm_map_max(current_thread()->map)) ||
-           (user_addr < vm_map_min(current_thread()->map)))) {
+       if (__improbable(is_kernel_to_kernel_copy())) {
                return EFAULT;
        }
 
-       if (__improbable(nbytes > copysize_limit_panic)) {
-               panic("%s(%p, %p, %lu) - transfer too large", __func__,
-                   (void *)user_addr, (void *)kernel_addr, nbytes);
-       }
-
        return 0;
 }
-
-int
-copyin_validate(const user_addr_t ua, uintptr_t ka, vm_size_t nbytes)
-{
-       return copy_validate(ua, ka, nbytes);
-}
-
-int
-copyout_validate(uintptr_t ka, const user_addr_t ua, vm_size_t nbytes)
-{
-       return copy_validate(ua, ka, nbytes);
-}
index 483d4673bda3b6a631120e912c1d18d8aaf8f3dd..2360e698264a7eaedcdc6aa166d8c748b4be786f 100644 (file)
@@ -84,10 +84,10 @@ extern void exc_vectors_table;
 
 extern void __attribute__((noreturn)) arm64_prepare_for_sleep(void);
 extern void arm64_force_wfi_clock_gate(void);
-#if (defined(APPLECYCLONE) || defined(APPLETYPHOON))
-// <rdar://problem/15827409> CPU1 Stuck in WFIWT Because of MMU Prefetch
-extern void cyclone_typhoon_prepare_for_wfi(void);
-extern void cyclone_typhoon_return_from_wfi(void);
+#if defined(APPLETYPHOON)
+// <rdar://problem/15827409>
+extern void typhoon_prepare_for_wfi(void);
+extern void typhoon_return_from_wfi(void);
 #endif
 
 
@@ -116,6 +116,8 @@ static uint64_t wfi_delay = 0;
 
 #endif /* DEVELOPMENT || DEBUG */
 
+static bool idle_wfe_to_deadline = false;
+
 #if __ARM_GLOBAL_SLEEP_BIT__
 volatile boolean_t arm64_stall_sleep = TRUE;
 #endif
@@ -136,6 +138,7 @@ static boolean_t coresight_debug_enabled = FALSE;
 
 #if defined(CONFIG_XNUPOST)
 void arm64_ipi_test_callback(void *);
+void arm64_immediate_ipi_test_callback(void *);
 
 void
 arm64_ipi_test_callback(void *parm)
@@ -148,12 +151,23 @@ arm64_ipi_test_callback(void *parm)
        *ipi_test_data = cpu_data->cpu_number;
 }
 
-uint64_t arm64_ipi_test_data[MAX_CPUS];
+void
+arm64_immediate_ipi_test_callback(void *parm)
+{
+       volatile uint64_t *ipi_test_data = parm;
+       cpu_data_t *cpu_data;
+
+       cpu_data = getCpuDatap();
+
+       *ipi_test_data = cpu_data->cpu_number + MAX_CPUS;
+}
+
+uint64_t arm64_ipi_test_data[MAX_CPUS * 2];
 
 void
 arm64_ipi_test()
 {
-       volatile uint64_t *ipi_test_data;
+       volatile uint64_t *ipi_test_data, *immediate_ipi_test_data;
        uint32_t timeout_ms = 100;
        uint64_t then, now, delta;
        int current_cpu_number = getCpuDatap()->cpu_number;
@@ -169,19 +183,34 @@ arm64_ipi_test()
 
        for (unsigned int i = 0; i < MAX_CPUS; ++i) {
                ipi_test_data = &arm64_ipi_test_data[i];
+               immediate_ipi_test_data = &arm64_ipi_test_data[i + MAX_CPUS];
                *ipi_test_data = ~i;
                kern_return_t error = cpu_xcall((int)i, (void *)arm64_ipi_test_callback, (void *)(uintptr_t)ipi_test_data);
                if (error != KERN_SUCCESS) {
                        panic("CPU %d was unable to IPI CPU %u: error %d", current_cpu_number, i, error);
                }
 
+               while ((error = cpu_immediate_xcall((int)i, (void *)arm64_immediate_ipi_test_callback,
+                   (void *)(uintptr_t)immediate_ipi_test_data)) == KERN_ALREADY_WAITING) {
+                       now = mach_absolute_time();
+                       absolutetime_to_nanoseconds(now - then, &delta);
+                       if ((delta / NSEC_PER_MSEC) > timeout_ms) {
+                               panic("CPU %d was unable to immediate-IPI CPU %u within %dms", current_cpu_number, i, timeout_ms);
+                       }
+               }
+
+               if (error != KERN_SUCCESS) {
+                       panic("CPU %d was unable to immediate-IPI CPU %u: error %d", current_cpu_number, i, error);
+               }
+
                then = mach_absolute_time();
 
-               while (*ipi_test_data != i) {
+               while ((*ipi_test_data != i) || (*immediate_ipi_test_data != (i + MAX_CPUS))) {
                        now = mach_absolute_time();
                        absolutetime_to_nanoseconds(now - then, &delta);
                        if ((delta / NSEC_PER_MSEC) > timeout_ms) {
-                               panic("CPU %d tried to IPI CPU %d but didn't get correct response within %dms, respose: %llx", current_cpu_number, i, timeout_ms, *ipi_test_data);
+                               panic("CPU %d tried to IPI CPU %d but didn't get correct responses within %dms, responses: %llx, %llx",
+                                   current_cpu_number, i, timeout_ms, *ipi_test_data, *immediate_ipi_test_data);
                        }
                }
        }
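
Not from the diff: the poll-with-timeout idiom the test uses twice above, rendered as a standalone userspace program. Userspace converts ticks with mach_timebase_info() where the kernel calls absolutetime_to_nanoseconds(); the helper name is illustrative.

    #include <mach/mach_time.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Spin until cond() holds or timeout_ms elapses; false means timeout. */
    static bool
    poll_with_timeout(bool (*cond)(void), uint64_t timeout_ms)
    {
            mach_timebase_info_data_t tb;
            mach_timebase_info(&tb);

            uint64_t then = mach_absolute_time();
            while (!cond()) {
                    uint64_t ns = (mach_absolute_time() - then) * tb.numer / tb.denom;
                    if (ns / 1000000 > timeout_ms) {
                            return false;
                    }
            }
            return true;
    }

    static bool ready(void) { return true; }

    int
    main(void)
    {
            printf("%d\n", poll_with_timeout(ready, 100));
            return 0;
    }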
@@ -271,7 +300,29 @@ cpu_sleep(void)
 
        CleanPoC_Dcache();
 
+       /* This calls:
+        *
+        * IOCPURunPlatformQuiesceActions when sleeping the boot cpu
+        * ml_arm_sleep() on all CPUs
+        *
+        * It does not return.
+        */
        PE_cpu_machine_quiesce(cpu_data_ptr->cpu_id);
+       /*NOTREACHED*/
+}
+
+/*
+ *     Routine:        cpu_interrupt_is_pending
+ *     Function:       Returns the value of ISR.  Due to how this register is
+ *                     is implemented, this returns 0 if there are no
+ *                     interrupts pending, so it can be used as a boolean test.
+ */
+static int
+cpu_interrupt_is_pending(void)
+{
+       uint64_t isr_value;
+       isr_value = __builtin_arm_rsr64("ISR_EL1");
+       return (int)isr_value;
 }
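
Not from the diff: the WFE-until-interrupt idiom that cpu_idle() adopts below, collected in one place. It mirrors the diff's own code and is EL1-only, so it is a sketch rather than runnable userspace code.

    /* Doze with WFE until ISR_EL1 reports a pending interrupt; the raw
     * register read doubles as a boolean per the comment above. */
    if (idle_wfe_to_deadline && arm64_wfe_allowed()) {
            while (__builtin_arm_rsr64("ISR_EL1") == 0) {
                    __builtin_arm_wfe();
            }
    }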
 
 /*
@@ -287,9 +338,20 @@ cpu_idle(void)
        if ((!idle_enable) || (cpu_data_ptr->cpu_signal & SIGPdisabled)) {
                Idle_load_context();
        }
+
        if (!SetIdlePop()) {
+               /* If a deadline is pending, wait for it to elapse. */
+               if (idle_wfe_to_deadline) {
+                       if (arm64_wfe_allowed()) {
+                               while (!cpu_interrupt_is_pending()) {
+                                       __builtin_arm_wfe();
+                               }
+                       }
+               }
+
                Idle_load_context();
        }
+
        lastPop = cpu_data_ptr->rtcPop;
 
        pmap_switch_user_ttb(kernel_pmap);
@@ -335,16 +397,16 @@ cpu_idle(void)
                }
 #endif /* DEVELOPMENT || DEBUG */
 
-#if defined(APPLECYCLONE) || defined(APPLETYPHOON)
+#if defined(APPLETYPHOON)
                // <rdar://problem/15827409> CPU1 Stuck in WFIWT Because of MMU Prefetch
-               cyclone_typhoon_prepare_for_wfi();
+               typhoon_prepare_for_wfi();
 #endif
                __builtin_arm_dsb(DSB_SY);
                __builtin_arm_wfi();
 
-#if defined(APPLECYCLONE) || defined(APPLETYPHOON)
+#if defined(APPLETYPHOON)
                // <rdar://problem/15827409> CPU1 Stuck in WFIWT Because of MMU Prefetch
-               cyclone_typhoon_return_from_wfi();
+               typhoon_return_from_wfi();
 #endif
 
 #if DEVELOPMENT || DEBUG
@@ -471,7 +533,9 @@ cpu_init(void)
        cdp->cpu_stat.irq_ex_cnt_wake = 0;
        cdp->cpu_stat.ipi_cnt_wake = 0;
        cdp->cpu_stat.timer_cnt_wake = 0;
+#if MONOTONIC
        cdp->cpu_stat.pmi_cnt_wake = 0;
+#endif /* MONOTONIC */
        cdp->cpu_running = TRUE;
        cdp->cpu_sleep_token_last = cdp->cpu_sleep_token;
        cdp->cpu_sleep_token = 0x0UL;
@@ -517,11 +581,16 @@ cpu_stack_alloc(cpu_data_t *cpu_data_ptr)
 void
 cpu_data_free(cpu_data_t *cpu_data_ptr)
 {
-       if (cpu_data_ptr == &BootCpuData) {
+       if ((cpu_data_ptr == NULL) || (cpu_data_ptr == &BootCpuData)) {
                return;
        }
 
        cpu_processor_free( cpu_data_ptr->cpu_processor);
+       if (CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_vaddr == cpu_data_ptr) {
+               CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_vaddr = NULL;
+               CpuDataEntries[cpu_data_ptr->cpu_number].cpu_data_paddr = 0;
+               __builtin_arm_dmb(DMB_ISH); // Ensure prior stores to cpu array are visible
+       }
        (kfree)((void *)(cpu_data_ptr->intstack_top - INTSTACK_SIZE), INTSTACK_SIZE);
        (kfree)((void *)(cpu_data_ptr->excepstack_top - EXCEPSTACK_SIZE), EXCEPSTACK_SIZE);
        kmem_free(kernel_map, (vm_offset_t)cpu_data_ptr, sizeof(cpu_data_t));
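
Not from the diff: a portable C11 model of the ordering the two __builtin_arm_dmb(DMB_ISH) barriers enforce (here on teardown, and in cpu_data_register() below on publication). Release/acquire is the portable analogue; the names are illustrative.

    #include <stdatomic.h>
    #include <stddef.h>

    struct cpu_data { int cpu_number; };

    /* Analogue of a CpuDataEntries[] slot: a reader must never see the
     * published pointer before the pointee's initialization. */
    static _Atomic(struct cpu_data *) cpu_entry;

    static void
    publish(struct cpu_data *cdp)
    {
            cdp->cpu_number = 3;                  /* initialize first ...  */
            atomic_store_explicit(&cpu_entry, cdp,
                memory_order_release);            /* ... then make visible */
    }

    static void
    retire(void)
    {
            /* Teardown mirrors cpu_data_free(): unpublish and fence
             * before the backing memory is freed. */
            atomic_store_explicit(&cpu_entry, NULL, memory_order_release);
    }

    static struct cpu_data *
    lookup(void)
    {
            return atomic_load_explicit(&cpu_entry, memory_order_acquire);
    }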
@@ -561,12 +630,6 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
 
        cpu_data_ptr->cpu_signal = SIGPdisabled;
 
-#if DEBUG || DEVELOPMENT
-       cpu_data_ptr->failed_xcall = NULL;
-       cpu_data_ptr->failed_signal = 0;
-       cpu_data_ptr->failed_signal_count = 0;
-#endif
-
        cpu_data_ptr->cpu_get_fiq_handler = NULL;
        cpu_data_ptr->cpu_tbd_hardware_addr = NULL;
        cpu_data_ptr->cpu_tbd_hardware_val = NULL;
@@ -576,6 +639,8 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
        cpu_data_ptr->cpu_sleep_token_last = 0x00000000UL;
        cpu_data_ptr->cpu_xcall_p0 = NULL;
        cpu_data_ptr->cpu_xcall_p1 = NULL;
+       cpu_data_ptr->cpu_imm_xcall_p0 = NULL;
+       cpu_data_ptr->cpu_imm_xcall_p1 = NULL;
 
        for (i = 0; i < CORESIGHT_REGIONS; ++i) {
                cpu_data_ptr->coresight_base[i] = 0;
@@ -594,6 +659,9 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
        cpu_data_ptr->cpu_exc_vectors = (vm_offset_t)&exc_vectors_table;
 #endif /* __ARM_KERNEL_PROTECT__ */
 
+#if defined(HAS_APPLE_PAC)
+       cpu_data_ptr->rop_key = 0;
+#endif
 }
 
 kern_return_t
@@ -607,6 +675,7 @@ cpu_data_register(cpu_data_t *cpu_data_ptr)
        }
 #endif
 
+       __builtin_arm_dmb(DMB_ISH); // Ensure prior stores to cpu data are visible
        CpuDataEntries[cpu].cpu_data_vaddr = cpu_data_ptr;
        CpuDataEntries[cpu].cpu_data_paddr = (void *)ml_vtophys((vm_offset_t)cpu_data_ptr);
        return KERN_SUCCESS;
@@ -630,8 +699,8 @@ cpu_start(int cpu)
 
                cpu_data_ptr->cpu_pmap_cpu_data.cpu_nested_pmap = NULL;
 
-               if (cpu_data_ptr->cpu_processor->next_thread != THREAD_NULL) {
-                       first_thread = cpu_data_ptr->cpu_processor->next_thread;
+               if (cpu_data_ptr->cpu_processor->startup_thread != THREAD_NULL) {
+                       first_thread = cpu_data_ptr->cpu_processor->startup_thread;
                } else {
                        first_thread = cpu_data_ptr->cpu_processor->idle_thread;
                }
@@ -675,6 +744,9 @@ cpu_timebase_init(boolean_t from_boot)
                 * This ensures that mach_absolute_time() stops ticking across sleep.
                 */
                rtclock_base_abstime = wake_abstime - ml_get_hwclock();
+       } else if (from_boot) {
+               /* On initial boot, initialize time_since_reset to CNTPCT_EL0. */
+               ml_set_reset_time(ml_get_hwclock());
        }
 
        cdp->cpu_decrementer = 0x7FFFFFFFUL;
@@ -717,6 +789,7 @@ ml_arm_sleep(void)
                 * the abstime value we'll use when we resume.
                 */
                wake_abstime = ml_get_timebase();
+               ml_set_reset_time(UINT64_MAX);
        } else {
                CleanPoU_Dcache();
        }
@@ -841,6 +914,8 @@ cpu_machine_idle_init(boolean_t from_boot)
                        break;
                }
 
+               PE_parse_boot_argn("idle_wfe_to_deadline", &idle_wfe_to_deadline, sizeof(idle_wfe_to_deadline));
+
                ResetHandlerData.assist_reset_handler = 0;
                ResetHandlerData.cpu_data_entries = ml_static_vtop((vm_offset_t)CpuDataEntries);
 
@@ -898,9 +973,9 @@ void
 machine_track_platform_idle(boolean_t entry)
 {
        if (entry) {
-               (void)__c11_atomic_fetch_add(&cpu_idle_count, 1, __ATOMIC_RELAXED);
+               os_atomic_inc(&cpu_idle_count, relaxed);
        } else {
-               (void)__c11_atomic_fetch_sub(&cpu_idle_count, 1, __ATOMIC_RELAXED);
+               os_atomic_dec(&cpu_idle_count, relaxed);
        }
 }
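
Not from the diff: the os_atomic_* wrappers adopted throughout this commit are thin veneers over C11 atomics with an explicit ordering argument, and — matching the old hw_atomic_add/hw_atomic_sub — they return the new value, which is why os_atomic_dec(...) == 0 works as a last-one-out test elsewhere in the diff. A standalone equivalent:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint32_t cpu_idle_count;

    int
    main(void)
    {
            /* Roughly what os_atomic_inc(&cpu_idle_count, relaxed) does,
             * modulo the wrapper returning the incremented value. */
            uint32_t after = atomic_fetch_add_explicit(&cpu_idle_count, 1,
                memory_order_relaxed) + 1;
            atomic_fetch_sub_explicit(&cpu_idle_count, 1, memory_order_relaxed);
            printf("after inc: %u, now: %u\n", after, (uint32_t)cpu_idle_count);
            return 0;
    }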
 
index 7aa9614a13f52877f779a5e12c4c0c7fa2a4cbd5..06aeca99ef4cd839fcb7784f4211c869b17182dc 100644 (file)
@@ -27,6 +27,7 @@
  */
 #include <machine/asm.h>
 #include <arm64/machine_machdep.h>
+#include <arm64/machine_routines_asm.h>
 #include <arm64/proc_reg.h>
 #include "assym.s"
 
        stp             x25, x26, [$0, SS64_X25]
        stp             x27, x28, [$0, SS64_X27]
        stp             fp, lr, [$0, SS64_FP]
+#ifdef HAS_APPLE_PAC
+       stp             x0, x1, [sp, #-16]!
+       stp             x2, x3, [sp, #-16]!
+       stp             x4, x5, [sp, #-16]!
+
+       /*
+        * Arg0: The ARM context pointer
+        * Arg1: PC value to sign
+        * Arg2: CPSR value to sign
+        * Arg3: LR to sign
+        */
+       mov             x0, $0
+       ldr             x1, [x0, SS64_PC]
+       ldr             w2, [x0, SS64_CPSR]
+       mov             x3, lr
+       mov             x4, x16
+       mov             x5, x17
+       bl              EXT(ml_sign_thread_state)
+
+       ldp             x4, x5, [sp], #16
+       ldp             x2, x3, [sp], #16
+       ldp             x0, x1, [sp], #16
+       ldp             fp, lr, [$0, SS64_FP]
+#endif /* defined(HAS_APPLE_PAC) */
        mov             $1, sp
        str             $1, [$0, SS64_SP]
 
  *   arg1 - Scratch register
  */
 .macro load_general_registers
+       mov             x20, x0
+       mov             x21, x1
+       mov             x22, x2
+
+       mov             x0, $0
+       AUTH_THREAD_STATE_IN_X0 x23, x24, x25, x26, x27
 
-       ldp             x16, x17, [$0, SS64_X16]
+       mov             x0, x20
+       mov             x1, x21
+       mov             x2, x22
+
+       // Skip x16, x17 - already loaded + authed by AUTH_THREAD_STATE_IN_X0
        ldp             x19, x20, [$0, SS64_X19]
        ldp             x21, x22, [$0, SS64_X21]
        ldp             x23, x24, [$0, SS64_X23]
        ldp             x25, x26, [$0, SS64_X25]
        ldp             x27, x28, [$0, SS64_X27]
-       ldp             fp, lr, [$0, SS64_FP]
+       ldr             fp, [$0, SS64_FP]
+       // Skip lr - already loaded + authed by AUTH_THREAD_STATE_IN_X0
        ldr             $1, [$0, SS64_SP]
        mov             sp, $1
 
        ldr             d15,[$0, NS64_D15]
 .endmacro
 
+
 /*
  * set_thread_registers
  *
        mov             x18, $1                                                         // ... and trash reserved x18
 .endmacro
 
+#if defined(HAS_APPLE_PAC)
+/*
+ * set_process_dependent_keys
+ *
+ * Updates process dependent keys during context switch if necessary
+ *  The per-CPU rop_key is initialized in arm_init() for the bootstrap processor
+ *  and in cpu_data_init() for slave processors.
+ *
+ *  arg0 - New thread pointer/Current CPU key
+ *  arg1 - Scratch register: New Thread Key
+ *  arg2 - Scratch register: Current CPU Data pointer
+ */
+.macro set_process_dependent_keys
+       ldr             $1, [$0, TH_ROP_PID]
+       ldr             $2, [$0, ACT_CPUDATAP]
+       ldr             $0, [$2, CPU_ROP_KEY]
+       cmp             $0, $1
+       b.eq    1f
+       str             $1, [$2, CPU_ROP_KEY]
+       msr             APIBKeyLo_EL1, $1
+       add             $1, $1, #1
+       msr             APIBKeyHi_EL1, $1
+       add             $1, $1, #1
+       msr             APDBKeyLo_EL1, $1
+       add             $1, $1, #1
+       msr             APDBKeyHi_EL1, $1
+       isb     sy
+1:
+.endmacro
+#endif /* defined(HAS_APPLE_PAC) */
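
Not from the diff: a hedged C paraphrase of the set_process_dependent_keys macro above. The authoritative version is the assembly; the struct fields match the assym offsets defined later in this commit (TH_ROP_PID, ACT_CPUDATAP, CPU_ROP_KEY), and the MSR writes appear as comments because they have no C equivalent.

    static void
    set_process_dependent_keys_c(thread_t thread)
    {
            uint64_t key = thread->machine.rop_pid;
            cpu_data_t *cdp = thread->machine.CpuDatap;

            if (cdp->rop_key == key) {
                    return;  /* keys already live on this CPU; skip the MSRs */
            }
            cdp->rop_key = key;
            /* Program the four key halves from consecutive values, then
             * resynchronize the pipeline:
             *     msr APIBKeyLo_EL1, key        msr APIBKeyHi_EL1, key + 1
             *     msr APDBKeyLo_EL1, key + 2    msr APDBKeyHi_EL1, key + 3
             *     isb sy
             */
    }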
 
 /*
  * void     machine_load_context(thread_t        thread)
@@ -135,6 +202,9 @@ LEXT(machine_load_context)
        set_thread_registers    x0, x1, x2
        ldr             x1, [x0, TH_KSTACKPTR]                          // Get top of kernel stack
        load_general_registers  x1, x2
+#ifdef HAS_APPLE_PAC
+       set_process_dependent_keys      x0, x1, x2
+#endif
        mov             x0, #0                                                          // Clear argument to thread_continue
        ret
 
@@ -158,6 +228,9 @@ LEXT(Call_continuation)
        mov             sp, x5                                                          // Set stack pointer
        mov             fp, #0                                                          // Clear the frame pointer
 
+#if defined(HAS_APPLE_PAC)
+       set_process_dependent_keys      x4, x5, x6
+#endif
 
     mov x20, x0  //continuation
     mov x21, x1  //continuation parameter
@@ -165,12 +238,16 @@ LEXT(Call_continuation)
 
     cbz x3, 1f
     mov x0, #1
-    bl _ml_set_interrupts_enabled
+    bl EXT(ml_set_interrupts_enabled)
 1:
 
        mov             x0, x21                                                         // Set the first parameter
        mov             x1, x22                                                         // Set the wait result arg
+#ifdef HAS_APPLE_PAC
+       blraaz  x20                                                                     // Branch to the continuation
+#else
        blr             x20                                                                     // Branch to the continuation
+#endif
        mrs             x0, TPIDR_EL1                                           // Get the current thread pointer
        b               EXT(thread_terminate)                           // Kill the thread
 
@@ -192,6 +269,9 @@ Lswitch_threads:
        set_thread_registers    x2, x3, x4
        ldr             x3, [x2, TH_KSTACKPTR]
        load_general_registers  x3, x4
+#if defined(HAS_APPLE_PAC)
+       set_process_dependent_keys      x2, x3, x4
+#endif
        ret
 
 /*
@@ -212,7 +292,6 @@ LEXT(Shutdown_context)
        mov             sp, x12
        b               EXT(cpu_doshutdown)
 
-
 /*
  *     thread_t Idle_context(void)
  *
@@ -242,6 +321,9 @@ LEXT(Idle_load_context)
        mrs             x0, TPIDR_EL1                                           // Get thread pointer
        ldr             x1, [x0, TH_KSTACKPTR]                          // Get the top of the kernel stack
        load_general_registers  x1, x2
+#ifdef HAS_APPLE_PAC
+       set_process_dependent_keys      x0, x1, x2
+#endif
        ret
 
        .align  2
@@ -249,3 +331,5 @@ LEXT(Idle_load_context)
 LEXT(machine_set_current_thread)
        set_thread_registers x0, x1, x2
        ret
+
+
index 666efc2d33f4fe4b5bde3a3384bbf882f0a2cffd..7aa70d824183451be6a09bde05829f87709d7822 100644 (file)
@@ -115,7 +115,7 @@ ml_dbgwrap_halt_cpu(int cpu_index, uint64_t timeout_ns)
                return DBGWRAP_ERR_SELF_HALT;
        }
 
-       if (!hw_compare_and_store((uint32_t)-1, (unsigned int)curcpu, &halt_from_cpu) &&
+       if (!os_atomic_cmpxchg(&halt_from_cpu, (uint32_t)-1, (unsigned int)curcpu, acq_rel) &&
            (halt_from_cpu != (uint32_t)curcpu)) {
                return DBGWRAP_ERR_INPROGRESS;
        }
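
Not from the diff: note the conversion swaps argument order — hw_compare_and_store(old, new, addr) becomes os_atomic_cmpxchg(addr, old, new, order). The claim pattern in C11 terms, runnable standalone:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint32_t halt_from_cpu = (uint32_t)-1;

    /* Claim the halt slot for curcpu; fails if another CPU already owns it. */
    static bool
    claim(uint32_t curcpu)
    {
            uint32_t expected = (uint32_t)-1;
            return atomic_compare_exchange_strong_explicit(&halt_from_cpu,
                &expected, curcpu, memory_order_acq_rel, memory_order_acquire);
    }

    int
    main(void)
    {
            printf("cpu1: %d\n", claim(1));  /* 1: slot was free */
            printf("cpu2: %d\n", claim(2));  /* 0: cpu1 owns it  */
            return 0;
    }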
@@ -155,7 +155,7 @@ ml_dbgwrap_stuff_instr(cpu_data_t *cdp, uint32_t instr, uint64_t timeout_ns, dbg
        uint64_t deadline = mach_absolute_time() + interval;
 
 #if DEVELOPMENT || DEBUG
-       uint32_t stuffed_instr_index = hw_atomic_add(&stuffed_instr_count, 1);
+       uint32_t stuffed_instr_index = os_atomic_inc(&stuffed_instr_count, relaxed);
        stuffed_instrs[(stuffed_instr_index - 1) % MAX_STUFFED_INSTRS] = instr;
 #endif
 
diff --git a/osfmk/arm64/exception_asm.h b/osfmk/arm64/exception_asm.h
new file mode 100644 (file)
index 0000000..41bfa1f
--- /dev/null
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _PEXPERT_ARM_BOARD_CONFIG_H
+#include <pexpert/arm64/board_config.h>
+#endif
+
+
+/*
+ * INIT_SAVED_STATE_FLAVORS
+ *
+ * Initializes the saved state flavors of a new saved state structure
+ *  arg0 - saved state pointer
+ *  arg1 - 32-bit scratch reg
+ *  arg2 - 32-bit scratch reg
+ */
+.macro INIT_SAVED_STATE_FLAVORS
+mov            $1, ARM_SAVED_STATE64                                   // Set saved state to 64-bit flavor
+mov            $2, ARM_SAVED_STATE64_COUNT
+stp            $1, $2, [$0, SS_FLAVOR]
+mov            $1, ARM_NEON_SAVED_STATE64                              // Set neon state to 64-bit flavor
+str            $1, [$0, NS_FLAVOR]
+mov            $1, ARM_NEON_SAVED_STATE64_COUNT
+str            $1, [$0, NS_COUNT]
+.endmacro
+
+/*
+ * SPILL_REGISTERS
+ *
+ * Spills the current set of registers (excluding x0, x1, sp, fp) to the specified
+ * save area.
+ *   x0 - Address of the save area
+ */
+
+.macro SPILL_REGISTERS
+stp            x2, x3, [x0, SS64_X2]                                   // Save remaining GPRs
+stp            x4, x5, [x0, SS64_X4]
+stp            x6, x7, [x0, SS64_X6]
+stp            x8, x9, [x0, SS64_X8]
+stp            x10, x11, [x0, SS64_X10]
+stp            x12, x13, [x0, SS64_X12]
+stp            x14, x15, [x0, SS64_X14]
+stp            x16, x17, [x0, SS64_X16]
+stp            x18, x19, [x0, SS64_X18]
+stp            x20, x21, [x0, SS64_X20]
+stp            x22, x23, [x0, SS64_X22]
+stp            x24, x25, [x0, SS64_X24]
+stp            x26, x27, [x0, SS64_X26]
+str            x28, [x0, SS64_X28]
+
+/* Save arm_neon_saved_state64 */
+
+stp            q0, q1, [x0, NS64_Q0]
+stp            q2, q3, [x0, NS64_Q2]
+stp            q4, q5, [x0, NS64_Q4]
+stp            q6, q7, [x0, NS64_Q6]
+stp            q8, q9, [x0, NS64_Q8]
+stp            q10, q11, [x0, NS64_Q10]
+stp            q12, q13, [x0, NS64_Q12]
+stp            q14, q15, [x0, NS64_Q14]
+stp            q16, q17, [x0, NS64_Q16]
+stp            q18, q19, [x0, NS64_Q18]
+stp            q20, q21, [x0, NS64_Q20]
+stp            q22, q23, [x0, NS64_Q22]
+stp            q24, q25, [x0, NS64_Q24]
+stp            q26, q27, [x0, NS64_Q26]
+stp            q28, q29, [x0, NS64_Q28]
+stp            q30, q31, [x0, NS64_Q30]
+
+mrs            lr, ELR_EL1                                                     // Get exception link register
+mrs            x23, SPSR_EL1                                                   // Load CPSR into var reg x23
+mrs            x24, FPSR
+mrs            x25, FPCR
+
+#if defined(HAS_APPLE_PAC)
+/* Save x1 and LR to preserve across call */
+mov            x21, x1
+mov            x20, lr
+
+/*
+ * Create thread state signature
+ *
+ * Arg0: The ARM context pointer
+ * Arg1: The PC value to sign
+ * Arg2: The CPSR value to sign
+ * Arg3: The LR value to sign
+ * Arg4: The X16 value to sign
+ * Arg5: The X17 value to sign
+ */
+mov            x1, lr
+mov            w2, w23
+ldr            x3, [x0, SS64_LR]
+mov            x4, x16
+mov            x5, x17
+bl             _ml_sign_thread_state
+
+mov            lr, x20
+mov            x1, x21
+#endif /* defined(HAS_APPLE_PAC) */
+
+str            lr, [x0, SS64_PC]                                               // Save ELR to PCB
+str            w23, [x0, SS64_CPSR]                                    // Save CPSR to PCB
+str            w24, [x0, NS64_FPSR]
+str            w25, [x0, NS64_FPCR]
+
+mrs            x20, FAR_EL1
+mrs            x21, ESR_EL1
+
+str            x20, [x0, SS64_FAR]
+str            w21, [x0, SS64_ESR]
+.endmacro
+
+.macro DEADLOOP
+b      .
+.endmacro
index faf7f88437594f4e82a1c43d9026abf4b9a74d2f..8dfdecddaedf90221c0655fa52dfeff252e0105d 100644 (file)
  */
 
 #define DECLARE(SYM, VAL) \
-       __asm("DEFINITION__define__" SYM ":\t .ascii \"%0\"" : : "n"  ((u_long)(VAL)))
+       __asm("DEFINITION__define__" SYM ":\t .ascii \"%0\"" : : "i"  ((u_long)(VAL)))
 
 
-int     main(
-       int             argc,
-       char            ** argv);
+int main(int     argc,
+    char ** argv);
 
 int
-main(
-       int     argc,
-       char    **argv)
+main(int     argc,
+    char ** argv)
 {
-       DECLARE("T_PREFETCH_ABT", T_PREFETCH_ABT);
-       DECLARE("T_DATA_ABT", T_DATA_ABT);
-
        DECLARE("AST_URGENT", AST_URGENT);
-       DECLARE("AST_PREEMPTION", AST_PREEMPTION);
 
        DECLARE("TH_RECOVER", offsetof(struct thread, recover));
-       DECLARE("TH_CONTINUATION", offsetof(struct thread, continuation));
-       DECLARE("TH_KERNEL_STACK", offsetof(struct thread, kernel_stack));
        DECLARE("TH_KSTACKPTR", offsetof(struct thread, machine.kstackptr));
-       DECLARE("THREAD_UTHREAD", offsetof(struct thread, uthread));
-
-       DECLARE("TASK_MACH_EXC_PORT",
-           offsetof(struct task, exc_actions[EXC_MACH_SYSCALL].port));
+#if defined(HAS_APPLE_PAC)
+       DECLARE("TH_ROP_PID", offsetof(struct thread, machine.rop_pid));
+       DECLARE("TH_DISABLE_USER_JOP", offsetof(struct thread, machine.disable_user_jop));
+#endif /* defined(HAS_APPLE_PAC) */
 
        /* These fields are being added on demand */
-       DECLARE("ACT_TASK", offsetof(struct thread, task));
        DECLARE("ACT_CONTEXT", offsetof(struct thread, machine.contextData));
-       DECLARE("ACT_UPCB", offsetof(struct thread, machine.upcb));
-//     DECLARE("ACT_PCBDATA",  offsetof(struct thread, machine.contextData.ss));
-       DECLARE("ACT_UNEON", offsetof(struct thread, machine.uNeon));
-//     DECLARE("ACT_NEONDATA", offsetof(struct thread, machine.contextData.ns));
        DECLARE("TH_CTH_SELF", offsetof(struct thread, machine.cthread_self));
        DECLARE("TH_CTH_DATA", offsetof(struct thread, machine.cthread_data));
        DECLARE("ACT_PREEMPT_CNT", offsetof(struct thread, machine.preemption_count));
        DECLARE("ACT_CPUDATAP", offsetof(struct thread, machine.CpuDatap));
-       DECLARE("ACT_MAP", offsetof(struct thread, map));
        DECLARE("ACT_DEBUGDATA", offsetof(struct thread, machine.DebugData));
        DECLARE("TH_IOTIER_OVERRIDE", offsetof(struct thread, iotier_override));
        DECLARE("TH_RWLOCK_CNT", offsetof(struct thread, rwlock_count));
-       DECLARE("TH_SCHED_FLAGS", offsetof(struct thread, sched_flags));
-       DECLARE("TH_SFLAG_RW_PROMOTED_BIT", TH_SFLAG_RW_PROMOTED_BIT);
-
-       DECLARE("TH_MACH_SYSCALLS", offsetof(struct thread, syscalls_mach));
-       DECLARE("TH_UNIX_SYSCALLS", offsetof(struct thread, syscalls_unix));
-       DECLARE("TASK_BSD_INFO", offsetof(struct task, bsd_info));
 
-       DECLARE("MACH_TRAP_TABLE_COUNT", MACH_TRAP_TABLE_COUNT);
-       DECLARE("MACH_TRAP_TABLE_ENTRY_SIZE", sizeof(mach_trap_t));
-
-       DECLARE("MAP_PMAP", offsetof(struct _vm_map, pmap));
+#if defined(HAS_APPLE_PAC)
+       DECLARE("TASK_ROP_PID", offsetof(struct task, rop_pid));
+#endif /* defined(HAS_APPLE_PAC) */
 
        DECLARE("ARM_CONTEXT_SIZE", sizeof(arm_context_t));
 
-       DECLARE("CONTEXT_SS", offsetof(arm_context_t, ss));
        DECLARE("SS_FLAVOR", offsetof(arm_context_t, ss.ash.flavor));
-       DECLARE("ARM_SAVED_STATE32", ARM_SAVED_STATE32);
        DECLARE("ARM_SAVED_STATE64", ARM_SAVED_STATE64);
        DECLARE("ARM_SAVED_STATE64_COUNT", ARM_SAVED_STATE64_COUNT);
 
-       DECLARE("SS32_W0", offsetof(arm_context_t, ss.ss_32.r[0]));
-       DECLARE("SS32_W2", offsetof(arm_context_t, ss.ss_32.r[2]));
-       DECLARE("SS32_W4", offsetof(arm_context_t, ss.ss_32.r[4]));
-       DECLARE("SS32_W6", offsetof(arm_context_t, ss.ss_32.r[6]));
-       DECLARE("SS32_W8", offsetof(arm_context_t, ss.ss_32.r[8]));
-       DECLARE("SS32_W10", offsetof(arm_context_t, ss.ss_32.r[10]));
-       DECLARE("SS32_W12", offsetof(arm_context_t, ss.ss_32.r[12]));
-       DECLARE("SS32_SP", offsetof(arm_context_t, ss.ss_32.sp));
-       DECLARE("SS32_LR", offsetof(arm_context_t, ss.ss_32.lr));
-       DECLARE("SS32_PC", offsetof(arm_context_t, ss.ss_32.pc));
-       DECLARE("SS32_CPSR", offsetof(arm_context_t, ss.ss_32.cpsr));
-       DECLARE("SS32_VADDR", offsetof(arm_context_t, ss.ss_32.far));
-       DECLARE("SS32_STATUS", offsetof(arm_context_t, ss.ss_32.esr));
-
        DECLARE("SS64_X0", offsetof(arm_context_t, ss.ss_64.x[0]));
        DECLARE("SS64_X2", offsetof(arm_context_t, ss.ss_64.x[2]));
        DECLARE("SS64_X4", offsetof(arm_context_t, ss.ss_64.x[4]));
@@ -203,25 +166,15 @@ main(
        DECLARE("SS64_CPSR", offsetof(arm_context_t, ss.ss_64.cpsr));
        DECLARE("SS64_FAR", offsetof(arm_context_t, ss.ss_64.far));
        DECLARE("SS64_ESR", offsetof(arm_context_t, ss.ss_64.esr));
+#if defined(HAS_APPLE_PAC)
+       DECLARE("SS64_JOPHASH", offsetof(arm_context_t, ss.ss_64.jophash));
+#endif /* defined(HAS_APPLE_PAC) */
 
-       DECLARE("CONTEXT_NS", offsetof(arm_context_t, ns));
        DECLARE("NS_FLAVOR", offsetof(arm_context_t, ns.nsh.flavor));
        DECLARE("NS_COUNT", offsetof(arm_context_t, ns.nsh.count));
-       DECLARE("ARM_NEON_SAVED_STATE32", ARM_NEON_SAVED_STATE32);
        DECLARE("ARM_NEON_SAVED_STATE64", ARM_NEON_SAVED_STATE64);
        DECLARE("ARM_NEON_SAVED_STATE64_COUNT", ARM_NEON_SAVED_STATE64_COUNT);
 
-       DECLARE("NS32_Q0", offsetof(arm_context_t, ns.ns_32.v.q[0]));
-       DECLARE("NS32_Q2", offsetof(arm_context_t, ns.ns_32.v.q[2]));
-       DECLARE("NS32_Q4", offsetof(arm_context_t, ns.ns_32.v.q[4]));
-       DECLARE("NS32_Q6", offsetof(arm_context_t, ns.ns_32.v.q[6]));
-       DECLARE("NS32_Q8", offsetof(arm_context_t, ns.ns_32.v.q[8]));
-       DECLARE("NS32_Q10", offsetof(arm_context_t, ns.ns_32.v.q[10]));
-       DECLARE("NS32_Q12", offsetof(arm_context_t, ns.ns_32.v.q[12]));
-       DECLARE("NS32_Q14", offsetof(arm_context_t, ns.ns_32.v.q[14]));
-       DECLARE("NS32_FPSR", offsetof(arm_context_t, ns.ns_32.fpsr));
-       DECLARE("NS32_FPCR", offsetof(arm_context_t, ns.ns_32.fpcr));
-
        DECLARE("NS64_D8", offsetof(arm_context_t, ns.ns_64.v.d[8]));
        DECLARE("NS64_D9", offsetof(arm_context_t, ns.ns_64.v.d[9]));
        DECLARE("NS64_D10", offsetof(arm_context_t, ns.ns_64.v.d[10]));
@@ -250,126 +203,45 @@ main(
        DECLARE("NS64_FPSR", offsetof(arm_context_t, ns.ns_64.fpsr));
        DECLARE("NS64_FPCR", offsetof(arm_context_t, ns.ns_64.fpcr));
 
+
        DECLARE("PGBYTES", ARM_PGBYTES);
        DECLARE("PGSHIFT", ARM_PGSHIFT);
-       DECLARE("PGMASK", ARM_PGMASK);
 
-       DECLARE("VM_MIN_ADDRESS", VM_MIN_ADDRESS);
-       DECLARE("VM_MAX_ADDRESS", VM_MAX_ADDRESS);
        DECLARE("VM_MIN_KERNEL_ADDRESS", VM_MIN_KERNEL_ADDRESS);
-       DECLARE("VM_MAX_KERNEL_ADDRESS", VM_MAX_KERNEL_ADDRESS);
-       DECLARE("KERNELBASE", VM_MIN_KERNEL_ADDRESS);
        DECLARE("KERNEL_STACK_SIZE", KERNEL_STACK_SIZE);
        DECLARE("TBI_MASK", TBI_MASK);
 
-       DECLARE("KERN_INVALID_ADDRESS", KERN_INVALID_ADDRESS);
+       DECLARE("MAX_CPUS", MAX_CPUS);
 
+       DECLARE("cdeSize", sizeof(struct cpu_data_entry));
 
-       DECLARE("MAX_CPUS", MAX_CPUS);
+       DECLARE("cdSize", sizeof(struct cpu_data));
 
-       DECLARE("cdeSize",
-           sizeof(struct cpu_data_entry));
-
-       DECLARE("cdSize",
-           sizeof(struct cpu_data));
-
-       DECLARE("CPU_ACTIVE_THREAD",
-           offsetof(cpu_data_t, cpu_active_thread));
-       DECLARE("CPU_ACTIVE_STACK",
-           offsetof(cpu_data_t, cpu_active_stack));
-       DECLARE("CPU_ISTACKPTR",
-           offsetof(cpu_data_t, istackptr));
-       DECLARE("CPU_INTSTACK_TOP",
-           offsetof(cpu_data_t, intstack_top));
-       DECLARE("CPU_EXCEPSTACKPTR",
-           offsetof(cpu_data_t, excepstackptr));
-       DECLARE("CPU_EXCEPSTACK_TOP",
-           offsetof(cpu_data_t, excepstack_top));
+       DECLARE("CPU_ACTIVE_THREAD", offsetof(cpu_data_t, cpu_active_thread));
+       DECLARE("CPU_ISTACKPTR", offsetof(cpu_data_t, istackptr));
+       DECLARE("CPU_INTSTACK_TOP", offsetof(cpu_data_t, intstack_top));
+       DECLARE("CPU_EXCEPSTACK_TOP", offsetof(cpu_data_t, excepstack_top));
 #if __ARM_KERNEL_PROTECT__
-       DECLARE("CPU_EXC_VECTORS",
-           offsetof(cpu_data_t, cpu_exc_vectors));
+       DECLARE("CPU_EXC_VECTORS", offsetof(cpu_data_t, cpu_exc_vectors));
 #endif /* __ARM_KERNEL_PROTECT__ */
-       DECLARE("CPU_NUMBER_GS",
-           offsetof(cpu_data_t, cpu_number));
-       DECLARE("CPU_IDENT",
-           offsetof(cpu_data_t, cpu_ident));
-       DECLARE("CPU_RUNNING",
-           offsetof(cpu_data_t, cpu_running));
-       DECLARE("CPU_MCOUNT_OFF",
-           offsetof(cpu_data_t, cpu_mcount_off));
-       DECLARE("CPU_PENDING_AST",
-           offsetof(cpu_data_t, cpu_pending_ast));
-       DECLARE("CPU_PROCESSOR",
-           offsetof(cpu_data_t, cpu_processor));
-       DECLARE("CPU_CACHE_DISPATCH",
-           offsetof(cpu_data_t, cpu_cache_dispatch));
-       DECLARE("CPU_BASE_TIMEBASE",
-           offsetof(cpu_data_t, cpu_base_timebase));
-       DECLARE("CPU_DECREMENTER",
-           offsetof(cpu_data_t, cpu_decrementer));
-       DECLARE("CPU_GET_DECREMENTER_FUNC",
-           offsetof(cpu_data_t, cpu_get_decrementer_func));
-       DECLARE("CPU_SET_DECREMENTER_FUNC",
-           offsetof(cpu_data_t, cpu_set_decrementer_func));
-       DECLARE("CPU_GET_FIQ_HANDLER",
-           offsetof(cpu_data_t, cpu_get_fiq_handler));
-       DECLARE("CPU_TBD_HARDWARE_ADDR",
-           offsetof(cpu_data_t, cpu_tbd_hardware_addr));
-       DECLARE("CPU_TBD_HARDWARE_VAL",
-           offsetof(cpu_data_t, cpu_tbd_hardware_val));
-       DECLARE("CPU_INT_STATE",
-           offsetof(cpu_data_t, cpu_int_state));
-       DECLARE("INTERRUPT_HANDLER",
-           offsetof(cpu_data_t, interrupt_handler));
-       DECLARE("INTERRUPT_TARGET",
-           offsetof(cpu_data_t, interrupt_target));
-       DECLARE("INTERRUPT_REFCON",
-           offsetof(cpu_data_t, interrupt_refCon));
-       DECLARE("INTERRUPT_NUB",
-           offsetof(cpu_data_t, interrupt_nub));
-       DECLARE("INTERRUPT_SOURCE",
-           offsetof(cpu_data_t, interrupt_source));
-       DECLARE("CPU_USER_DEBUG",
-           offsetof(cpu_data_t, cpu_user_debug));
-       DECLARE("CPU_STAT_IRQ",
-           offsetof(cpu_data_t, cpu_stat.irq_ex_cnt));
-       DECLARE("CPU_STAT_IRQ_WAKE",
-           offsetof(cpu_data_t, cpu_stat.irq_ex_cnt_wake));
-       DECLARE("CPU_RESET_HANDLER",
-           offsetof(cpu_data_t, cpu_reset_handler));
-       DECLARE("CPU_RESET_ASSIST",
-           offsetof(cpu_data_t, cpu_reset_assist));
-       DECLARE("CPU_REGMAP_PADDR",
-           offsetof(cpu_data_t, cpu_regmap_paddr));
-       DECLARE("CPU_PHYS_ID",
-           offsetof(cpu_data_t, cpu_phys_id));
-       DECLARE("RTCLOCK_DATAP",
-           offsetof(cpu_data_t, rtclock_datap));
-       DECLARE("CLUSTER_MASTER",
-           offsetof(cpu_data_t, cluster_master));
-
-       DECLARE("RTCLOCKDataSize",
-           sizeof(rtclock_data_t));
-       DECLARE("RTCLOCK_ADJ_ABSTIME_LOW",
-           offsetof(rtclock_data_t, rtc_adj.abstime_val.low));
-       DECLARE("RTCLOCK_ADJ_ABSTIME_HIGH",
-           offsetof(rtclock_data_t, rtc_adj.abstime_val.high));
-       DECLARE("RTCLOCK_BASE_ABSTIME_LOW",
-           offsetof(rtclock_data_t, rtc_base.abstime_val.low));
-       DECLARE("RTCLOCK_BASE_ABSTIME_HIGH",
-           offsetof(rtclock_data_t, rtc_base.abstime_val.high));
-
-       DECLARE("SIGPdec", SIGPdec);
-
-       DECLARE("rhdSize",
-           sizeof(struct reset_handler_data));
+       DECLARE("CPU_NUMBER_GS", offsetof(cpu_data_t, cpu_number));
+       DECLARE("CPU_PENDING_AST", offsetof(cpu_data_t, cpu_pending_ast));
+       DECLARE("CPU_INT_STATE", offsetof(cpu_data_t, cpu_int_state));
+       DECLARE("CPU_USER_DEBUG", offsetof(cpu_data_t, cpu_user_debug));
+       DECLARE("CPU_STAT_IRQ", offsetof(cpu_data_t, cpu_stat.irq_ex_cnt));
+       DECLARE("CPU_STAT_IRQ_WAKE", offsetof(cpu_data_t, cpu_stat.irq_ex_cnt_wake));
+       DECLARE("CPU_RESET_HANDLER", offsetof(cpu_data_t, cpu_reset_handler));
+       DECLARE("CPU_PHYS_ID", offsetof(cpu_data_t, cpu_phys_id));
+       DECLARE("CLUSTER_MASTER", offsetof(cpu_data_t, cluster_master));
+
+       DECLARE("RTCLOCKDataSize", sizeof(rtclock_data_t));
+
+       DECLARE("rhdSize", sizeof(struct reset_handler_data));
 #if WITH_CLASSIC_S2R || !__arm64__
-       DECLARE("stSize",
-           sizeof(SleepToken));
-#endif
+       DECLARE("stSize", sizeof(SleepToken));
+#endif /* WITH_CLASSIC_S2R || !__arm64__ */
 
        DECLARE("CPU_DATA_ENTRIES", offsetof(struct reset_handler_data, cpu_data_entries));
-       DECLARE("ASSIST_RESET_HANDLER", offsetof(struct reset_handler_data, assist_reset_handler));
 
        DECLARE("CPU_DATA_PADDR", offsetof(struct cpu_data_entry, cpu_data_paddr));
 
@@ -378,53 +250,19 @@ main(
 
        DECLARE("PAGE_MAX_SIZE", PAGE_MAX_SIZE);
 
-       DECLARE("TIMER_TSTAMP",
-           offsetof(struct timer, tstamp));
-       DECLARE("THREAD_TIMER",
-           offsetof(struct processor, processor_data.thread_timer));
-       DECLARE("KERNEL_TIMER",
-           offsetof(struct processor, processor_data.kernel_timer));
-       DECLARE("SYSTEM_STATE",
-           offsetof(struct processor, processor_data.system_state));
-       DECLARE("USER_STATE",
-           offsetof(struct processor, processor_data.user_state));
-       DECLARE("CURRENT_STATE",
-           offsetof(struct processor, processor_data.current_state));
-
-       DECLARE("SYSTEM_TIMER",
-           offsetof(struct thread, system_timer));
-       DECLARE("USER_TIMER",
-           offsetof(struct thread, user_timer));
-
-#if !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME
-       DECLARE("PRECISE_USER_KERNEL_TIME",
-           offsetof(struct thread, precise_user_kernel_time));
-#endif
-
-       DECLARE("BA_VIRT_BASE",
-           offsetof(struct boot_args, virtBase));
-       DECLARE("BA_PHYS_BASE",
-           offsetof(struct boot_args, physBase));
-       DECLARE("BA_MEM_SIZE",
-           offsetof(struct boot_args, memSize));
-       DECLARE("BA_TOP_OF_KERNEL_DATA",
-           offsetof(struct boot_args, topOfKernelData));
-       DECLARE("BA_DEVICE_TREE",
-           offsetof(struct boot_args, deviceTreeP));
-       DECLARE("BA_DEVICE_TREE_LENGTH",
-           offsetof(struct boot_args, deviceTreeLength));
-       DECLARE("BA_BOOT_FLAGS",
-           offsetof(struct boot_args, bootFlags));
-
-       DECLARE("ENTROPY_INDEX_PTR",
-           offsetof(entropy_data_t, index_ptr));
-       DECLARE("ENTROPY_BUFFER",
-           offsetof(entropy_data_t, buffer));
-       DECLARE("ENTROPY_DATA_SIZE", sizeof(struct entropy_data));
+       DECLARE("BA_VIRT_BASE", offsetof(struct boot_args, virtBase));
+       DECLARE("BA_PHYS_BASE", offsetof(struct boot_args, physBase));
+       DECLARE("BA_MEM_SIZE", offsetof(struct boot_args, memSize));
+       DECLARE("BA_TOP_OF_KERNEL_DATA", offsetof(struct boot_args, topOfKernelData));
+       DECLARE("BA_BOOT_FLAGS", offsetof(struct boot_args, bootFlags));
 
        DECLARE("SR_RESTORE_TCR_EL1", offsetof(struct sysreg_restore, tcr_el1));
 
 
 
+#if defined(HAS_APPLE_PAC)
+       DECLARE("CPU_ROP_KEY", offsetof(cpu_data_t, rop_key));
+#endif /* defined(HAS_APPLE_PAC) */
+
        return 0;
 }
diff --git a/osfmk/arm64/gxf_exceptions.s b/osfmk/arm64/gxf_exceptions.s
new file mode 100644 (file)
index 0000000..669f288
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+
+#include <machine/asm.h>
+#include <arm64/machine_routines_asm.h>
+#include <arm64/proc_reg.h>
+#include <pexpert/arm64/board_config.h>
+#include <mach/exception_types.h>
+#include "assym.s"
+#include <arm64/exception_asm.h>
+
+
+/* vim: set ts=4: */
index 25a328da830e70024000762112a3aaf45c419a43..3a5a4d444a95260aa1900a2cf29d1824dfa5f088 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -37,6 +37,8 @@
 #include <stdint.h>
 #include <sys/errno.h>
 
+#if APPLE_ARM64_ARCH_FAMILY
+
 #if MONOTONIC
 #include <kern/monotonic.h>
 #endif /* MONOTONIC */
@@ -180,12 +182,10 @@ void kpc_pmi_handler(unsigned int ctr);
 #define SREG_PMC8 "S3_2_c15_c9_0"
 #define SREG_PMC9 "S3_2_c15_c10_0"
 
-#if !defined(APPLECYCLONE)
 #define SREG_PMMMAP   "S3_2_c15_c15_0"
 #define SREG_PMTRHLD2 "S3_2_c15_c14_0"
 #define SREG_PMTRHLD4 "S3_2_c15_c13_0"
 #define SREG_PMTRHLD6 "S3_2_c15_c12_0"
-#endif
 
 /*
 * The low 8 bits of a configuration word select the event to program on
@@ -217,11 +217,11 @@ void kpc_pmi_handler(unsigned int ctr);
  * All: PMCR2-4, OPMAT0-1, OPMSK0-1.
  * Typhoon/Twister/Hurricane: PMMMAP, PMTRHLD2/4/6.
  */
-#if defined(APPLECYCLONE)
+#if HAS_EARLY_APPLE_CPMU
 #define RAWPMU_CONFIG_COUNT 7
-#else
+#else /* HAS_EARLY_APPLE_CPMU */
 #define RAWPMU_CONFIG_COUNT 11
-#endif
+#endif /* !HAS_EARLY_APPLE_CPMU */
 
 /* TODO: allocate dynamically */
 static uint64_t saved_PMCR[MAX_CPUS][2];
@@ -243,100 +243,89 @@ static boolean_t whitelist_disabled = TRUE;
 static boolean_t whitelist_disabled = FALSE;
 #endif
 
-/* List of counter events that are allowed externally */
+#define CPMU_CORE_CYCLE 0x02
+
+#if HAS_EARLY_APPLE_CPMU
+
+#define CPMU_BIU_UPSTREAM_CYCLE 0x19
+#define CPMU_BIU_DOWNSTREAM_CYCLE 0x1a
+#define CPMU_L2C_AGENT_LD 0x22
+#define CPMU_L2C_AGENT_LD_MISS 0x23
+#define CPMU_L2C_AGENT_ST 0x24
+#define CPMU_L2C_AGENT_ST_MISS 0x25
+#define CPMU_INST_A32 0x78
+#define CPMU_INST_THUMB 0x79
+#define CPMU_INST_A64 0x7a
+#define CPMU_INST_BRANCH 0x7b
+#define CPMU_SYNC_DC_LOAD_MISS 0xb4
+#define CPMU_SYNC_DC_STORE_MISS 0xb5
+#define CPMU_SYNC_DTLB_MISS 0xb6
+#define CPMU_SYNC_ST_HIT_YNGR_LD 0xb9
+#define CPMU_SYNC_BR_ANY_MISP 0xc0
+#define CPMU_FED_IC_MISS_DEM 0xce
+#define CPMU_FED_ITLB_MISS 0xcf
+
+#else /* HAS_EARLY_APPLE_CPMU */
+
+#if HAS_CPMU_BIU_EVENTS
+#define CPMU_BIU_UPSTREAM_CYCLE 0x13
+#define CPMU_BIU_DOWNSTREAM_CYCLE 0x14
+#endif /* HAS_CPMU_BIU_EVENTS */
+
+#if HAS_CPMU_L2C_EVENTS
+#define CPMU_L2C_AGENT_LD 0x1a
+#define CPMU_L2C_AGENT_LD_MISS 0x1b
+#define CPMU_L2C_AGENT_ST 0x1c
+#define CPMU_L2C_AGENT_ST_MISS 0x1d
+#endif /* HAS_CPMU_L2C_EVENTS */
+
+#define CPMU_INST_A32 0x8a
+#define CPMU_INST_THUMB 0x8b
+#define CPMU_INST_A64 0x8c
+#define CPMU_INST_BRANCH 0x8d
+#define CPMU_SYNC_DC_LOAD_MISS 0xbf
+#define CPMU_SYNC_DC_STORE_MISS 0xc0
+#define CPMU_SYNC_DTLB_MISS 0xc1
+#define CPMU_SYNC_ST_HIT_YNGR_LD 0xc4
+#define CPMU_SYNC_BR_ANY_MISP 0xcb
+#define CPMU_FED_IC_MISS_DEM 0xd3
+#define CPMU_FED_ITLB_MISS 0xd4
+
+#endif /* !HAS_EARLY_APPLE_CPMU */
+
+/* List of counter events that are allowed to be used by third parties. */
 static kpc_config_t whitelist[] = {
-       0,    /* NO_EVENT */
-
-#if defined(APPLECYCLONE)
-       0x02, /* CORE_CYCLE */
-       0x19, /* BIU_UPSTREAM_CYCLE */
-       0x1a, /* BIU_DOWNSTREAM_CYCLE */
-       0x22, /* L2C_AGENT_LD */
-       0x23, /* L2C_AGENT_LD_MISS */
-       0x24, /* L2C_AGENT_ST */
-       0x25, /* L2C_AGENT_ST_MISS */
-       0x78, /* INST_A32 */
-       0x79, /* INST_THUMB */
-       0x7a, /* INST_A64 */
-       0x7b, /* INST_BRANCH */
-       0xb4, /* SYNC_DC_LOAD_MISS */
-       0xb5, /* SYNC_DC_STORE_MISS */
-       0xb6, /* SYNC_DTLB_MISS */
-       0xb9, /* SYNC_ST_HIT_YNGR_LD */
-       0xc0, /* SYNC_BR_ANY_MISP */
-       0xce, /* FED_IC_MISS_DEM */
-       0xcf, /* FED_ITLB_MISS */
-
-#elif defined(APPLETYPHOON)
-       0x02, /* CORE_CYCLE */
-       0x13, /* BIU_UPSTREAM_CYCLE */
-       0x14, /* BIU_DOWNSTREAM_CYCLE */
-       0x1a, /* L2C_AGENT_LD */
-       0x1b, /* L2C_AGENT_LD_MISS */
-       0x1c, /* L2C_AGENT_ST */
-       0x1d, /* L2C_AGENT_ST_MISS */
-       0x8a, /* INST_A32 */
-       0x8b, /* INST_THUMB */
-       0x8c, /* INST_A64 */
-       0x8d, /* INST_BRANCH */
-       0xbf, /* SYNC_DC_LOAD_MISS */
-       0xc0, /* SYNC_DC_STORE_MISS */
-       0xc1, /* SYNC_DTLB_MISS */
-       0xc4, /* SYNC_ST_HIT_YNGR_LD */
-       0xcb, /* SYNC_BR_ANY_MISP */
-       0xd3, /* FED_IC_MISS_DEM */
-       0xd4, /* FED_ITLB_MISS */
-
-#elif defined(APPLETWISTER) || defined(APPLEHURRICANE)
-       0x02, /* CORE_CYCLE */
-       0x1a, /* L2C_AGENT_LD */
-       0x1b, /* L2C_AGENT_LD_MISS */
-       0x1c, /* L2C_AGENT_ST */
-       0x1d, /* L2C_AGENT_ST_MISS */
-       0x8a, /* INST_A32 */
-       0x8b, /* INST_THUMB */
-       0x8c, /* INST_A64 */
-       0x8d, /* INST_BRANCH */
-       0xbf, /* SYNC_DC_LOAD_MISS */
-       0xc0, /* SYNC_DC_STORE_MISS */
-       0xc1, /* SYNC_DTLB_MISS */
-       0xc4, /* SYNC_ST_HIT_YNGR_LD */
-       0xcb, /* SYNC_BR_ANY_MISP */
-       0xd3, /* FED_IC_MISS_DEM */
-       0xd4, /* FED_ITLB_MISS */
-
-#elif defined(APPLEMONSOON)
-       0x02, /* CORE_CYCLE */
-       0x8a, /* INST_A32 */
-       0x8b, /* INST_THUMB */
-       0x8c, /* INST_A64 */
-       0x8d, /* INST_BRANCH */
-       0xbf, /* SYNC_DC_LOAD_MISS */
-       0xc0, /* SYNC_DC_STORE_MISS */
-       0xc1, /* SYNC_DTLB_MISS */
-       0xc4, /* SYNC_ST_HIT_YNGR_LD */
-       0xcb, /* SYNC_BR_ANY_MISP */
-       0xd3, /* FED_IC_MISS_DEM */
-       0xd4, /* FED_ITLB_MISS */
+       0, /* NO_EVENT */
 
-#else
-       /* An unknown CPU gets a trivial { NO_EVENT } whitelist. */
-#endif
+       CPMU_CORE_CYCLE,
+
+#if HAS_CPMU_BIU_EVENTS
+       CPMU_BIU_UPSTREAM_CYCLE, CPMU_BIU_DOWNSTREAM_CYCLE,
+#endif /* HAS_CPMU_BIU_EVENTS */
+
+#if HAS_CPMU_L2C_EVENTS
+       CPMU_L2C_AGENT_LD, CPMU_L2C_AGENT_LD_MISS, CPMU_L2C_AGENT_ST,
+       CPMU_L2C_AGENT_ST_MISS,
+#endif /* HAS_CPMU_L2C_EVENTS */
+
+       CPMU_INST_A32, CPMU_INST_THUMB, CPMU_INST_A64, CPMU_INST_BRANCH,
+       CPMU_SYNC_DC_LOAD_MISS, CPMU_SYNC_DC_STORE_MISS,
+       CPMU_SYNC_DTLB_MISS, CPMU_SYNC_ST_HIT_YNGR_LD,
+       CPMU_SYNC_BR_ANY_MISP, CPMU_FED_IC_MISS_DEM, CPMU_FED_ITLB_MISS,
 };
-#define WHITELIST_COUNT (sizeof(whitelist)/sizeof(*whitelist))
+#define WHITELIST_COUNT (sizeof(whitelist) / sizeof(whitelist[0]))
+#define EVENT_MASK 0xff
 
-static boolean_t
+static bool
 config_in_whitelist(kpc_config_t cfg)
 {
-       unsigned int i;
-
-       for (i = 0; i < WHITELIST_COUNT; i++) {
-               if (cfg == whitelist[i]) {
-                       return TRUE;
+       for (unsigned int i = 0; i < WHITELIST_COUNT; i++) {
+               /* Strip off any EL configuration bits -- just look at the event. */
+               if ((cfg & EVENT_MASK) == whitelist[i]) {
+                       return true;
                }
        }
-
-       return FALSE;
+       return false;
 }
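
Not from the diff: a trimmed, standalone rendering of the new whitelist check, showing why masking with EVENT_MASK matters — EL-filtering bits in the upper configuration bits no longer defeat the match.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t kpc_config_t;

    #define EVENT_MASK      0xff
    #define CPMU_CORE_CYCLE 0x02

    static const kpc_config_t whitelist[] = { 0, CPMU_CORE_CYCLE };
    #define WHITELIST_COUNT (sizeof(whitelist) / sizeof(whitelist[0]))

    static bool
    config_in_whitelist(kpc_config_t cfg)
    {
            for (unsigned int i = 0; i < WHITELIST_COUNT; i++) {
                    /* Strip EL configuration bits; compare only the event. */
                    if ((cfg & EVENT_MASK) == whitelist[i]) {
                            return true;
                    }
            }
            return false;
    }

    int
    main(void)
    {
            /* 0x10002 still matches event 0x02 despite the upper bits. */
            printf("%d\n", config_in_whitelist(0x10002));
            return 0;
    }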
 
 #ifdef KPC_DEBUG
@@ -784,7 +773,7 @@ kpc_set_running_xcall( void *vstate )
        set_running_configurable(mp_config->cfg_target_mask,
            mp_config->cfg_state_mask);
 
-       if (hw_atomic_sub(&kpc_xcall_sync, 1) == 0) {
+       if (os_atomic_dec(&kpc_xcall_sync, relaxed) == 0) {
                thread_wakeup((event_t) &kpc_xcall_sync);
        }
 }
@@ -802,9 +791,9 @@ kpc_get_curcpu_counters_xcall(void *args)
        int r = kpc_get_curcpu_counters(handler->classes, NULL, &handler->buf[offset]);
 
        /* number of counters added by this CPU, needs to be atomic  */
-       hw_atomic_add(&(handler->nb_counters), r);
+       os_atomic_add(&(handler->nb_counters), r, relaxed);
 
-       if (hw_atomic_sub(&kpc_xread_sync, 1) == 0) {
+       if (os_atomic_dec(&kpc_xread_sync, relaxed) == 0) {
                thread_wakeup((event_t) &kpc_xread_sync);
        }
 }
@@ -939,7 +928,7 @@ kpc_set_config_xcall(void *vmp_config)
                new_config += RAWPMU_CONFIG_COUNT;
        }
 
-       if (hw_atomic_sub(&kpc_config_sync, 1) == 0) {
+       if (os_atomic_dec(&kpc_config_sync, relaxed) == 0) {
                thread_wakeup((event_t) &kpc_config_sync);
        }
 }
@@ -1010,7 +999,7 @@ kpc_set_reload_xcall(void *vmp_config)
 
        ml_set_interrupts_enabled(enabled);
 
-       if (hw_atomic_sub(&kpc_reload_sync, 1) == 0) {
+       if (os_atomic_dec(&kpc_reload_sync, relaxed) == 0) {
                thread_wakeup((event_t) &kpc_reload_sync);
        }
 }
@@ -1124,3 +1113,165 @@ kpc_get_pmu_version(void)
 {
        return KPC_PMU_ARM_APPLE;
 }
+
+#else /* APPLE_ARM64_ARCH_FAMILY */
+
+/* We don't currently support non-Apple arm64 PMU configurations like PMUv3 */
+
+void
+kpc_arch_init(void)
+{
+       /* No-op */
+}
+
+uint32_t
+kpc_get_classes(void)
+{
+       return 0;
+}
+
+uint32_t
+kpc_fixed_count(void)
+{
+       return 0;
+}
+
+uint32_t
+kpc_configurable_count(void)
+{
+       return 0;
+}
+
+uint32_t
+kpc_fixed_config_count(void)
+{
+       return 0;
+}
+
+uint32_t
+kpc_configurable_config_count(uint64_t pmc_mask __unused)
+{
+       return 0;
+}
+
+int
+kpc_get_fixed_config(kpc_config_t *configv __unused)
+{
+       return 0;
+}
+
+uint64_t
+kpc_fixed_max(void)
+{
+       return 0;
+}
+
+uint64_t
+kpc_configurable_max(void)
+{
+       return 0;
+}
+
+int
+kpc_get_configurable_config(kpc_config_t *configv __unused, uint64_t pmc_mask __unused)
+{
+       return ENOTSUP;
+}
+
+int
+kpc_get_configurable_counters(uint64_t *counterv __unused, uint64_t pmc_mask __unused)
+{
+       return ENOTSUP;
+}
+
+int
+kpc_get_fixed_counters(uint64_t *counterv __unused)
+{
+       return 0;
+}
+
+boolean_t
+kpc_is_running_fixed(void)
+{
+       return FALSE;
+}
+
+boolean_t
+kpc_is_running_configurable(uint64_t pmc_mask __unused)
+{
+       return FALSE;
+}
+
+int
+kpc_set_running_arch(struct kpc_running_remote *mp_config __unused)
+{
+       return ENOTSUP;
+}
+
+int
+kpc_set_period_arch(struct kpc_config_remote *mp_config __unused)
+{
+       return ENOTSUP;
+}
+
+int
+kpc_set_config_arch(struct kpc_config_remote *mp_config __unused)
+{
+       return ENOTSUP;
+}
+
+void
+kpc_idle(void)
+{
+       // do nothing
+}
+
+void
+kpc_idle_exit(void)
+{
+       // do nothing
+}
+
+int
+kpc_get_all_cpus_counters(uint32_t classes __unused, int *curcpu __unused, uint64_t *buf __unused)
+{
+       return 0;
+}
+
+int
+kpc_set_sw_inc( uint32_t mask __unused )
+{
+       return ENOTSUP;
+}
+
+int
+kpc_get_pmu_version(void)
+{
+       return KPC_PMU_ERROR;
+}
+
+uint32_t
+kpc_rawpmu_config_count(void)
+{
+       return 0;
+}
+
+int
+kpc_get_rawpmu_config(__unused kpc_config_t *configv)
+{
+       return 0;
+}
+
+int
+kpc_disable_whitelist( int val __unused )
+{
+       return 0;
+}
+
+int
+kpc_get_whitelist_disabled( void )
+{
+       return 0;
+}
+
+#endif /* !APPLE_ARM64_ARCH_FAMILY */
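
These stubs let machine-independent kpc code run unchanged where there is no supported PMU: counts and class masks read back as zero, and anything that would touch hardware state fails with ENOTSUP. A hypothetical caller-side sketch of the intended contract:

        /* Hypothetical caller: treat KPC_PMU_ERROR as "no PMU on this
         * platform", not as a hard failure. */
        if (kpc_get_pmu_version() == KPC_PMU_ERROR) {
                return 0; /* nothing to configure */
        }
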
index 6a8d109f785184ec72be4639a88740552a7171bf..5edaf67f1711d32ee4b51684c5885492e286d857 100644
  */
 
 #include <machine/asm.h>
+#include <arm64/machine_routines_asm.h>
 #include <arm64/proc_reg.h>
 #include <pexpert/arm64/board_config.h>
 #include <mach/exception_types.h>
 #include <mach_kdp.h>
 #include <config_dtrace.h>
 #include "assym.s"
+#include <arm64/exception_asm.h>
 
 #if __ARM_KERNEL_PROTECT__
 #include <arm/pmap.h>
 #endif
 
 
-/*
- * INIT_SAVED_STATE_FLAVORS
- *
- * Initializes the saved state flavors of a new saved state structure
- *  arg0 - saved state pointer
- *  arg1 - 32-bit scratch reg
- *  arg2 - 32-bit scratch reg
- */
-.macro INIT_SAVED_STATE_FLAVORS
-       mov             $1, ARM_SAVED_STATE64                           // Set saved state to 64-bit flavor
-       mov             $2, ARM_SAVED_STATE64_COUNT
-       stp             $1, $2, [$0, SS_FLAVOR]
-       mov             $1, ARM_NEON_SAVED_STATE64                      // Set neon state to 64-bit flavor
-       str             $1, [$0, NS_FLAVOR]
-       mov             $1, ARM_NEON_SAVED_STATE64_COUNT
-       str             $1, [$0, NS_COUNT]
-.endmacro
-
-
-/*
- * SPILL_REGISTERS
- *
- * Spills the current set of registers (excluding x0 and x1) to the specified
- * save area.
- *   x0 - Address of the save area
- */
-.macro SPILL_REGISTERS
-       stp             x2, x3, [x0, SS64_X2]                           // Save remaining GPRs
-       stp             x4, x5, [x0, SS64_X4]
-       stp             x6, x7, [x0, SS64_X6]
-       stp             x8, x9, [x0, SS64_X8]
-       stp             x10, x11, [x0, SS64_X10]
-       stp             x12, x13, [x0, SS64_X12]
-       stp             x14, x15, [x0, SS64_X14]
-       stp             x16, x17, [x0, SS64_X16]
-       stp             x18, x19, [x0, SS64_X18]
-       stp             x20, x21, [x0, SS64_X20]
-       stp             x22, x23, [x0, SS64_X22]
-       stp             x24, x25, [x0, SS64_X24]
-       stp             x26, x27, [x0, SS64_X26]
-       str             x28, [x0, SS64_X28]
-
-       /* Save arm_neon_saved_state64 */
-
-       stp             q0, q1, [x0, NS64_Q0]
-       stp             q2, q3, [x0, NS64_Q2]
-       stp             q4, q5, [x0, NS64_Q4]
-       stp             q6, q7, [x0, NS64_Q6]
-       stp             q8, q9, [x0, NS64_Q8]
-       stp             q10, q11, [x0, NS64_Q10]
-       stp             q12, q13, [x0, NS64_Q12]
-       stp             q14, q15, [x0, NS64_Q14]
-       stp             q16, q17, [x0, NS64_Q16]
-       stp             q18, q19, [x0, NS64_Q18]
-       stp             q20, q21, [x0, NS64_Q20]
-       stp             q22, q23, [x0, NS64_Q22]
-       stp             q24, q25, [x0, NS64_Q24]
-       stp             q26, q27, [x0, NS64_Q26]
-       stp             q28, q29, [x0, NS64_Q28]
-       stp             q30, q31, [x0, NS64_Q30]
-
-       mrs             lr,  ELR_EL1                                            // Get exception link register
-       mrs             x23, SPSR_EL1                                           // Load CPSR into var reg x23
-       mrs             x24, FPSR
-       mrs             x25, FPCR
-
-
-       str             lr, [x0, SS64_PC]                                       // Save ELR to PCB
-       str             w23, [x0, SS64_CPSR]                            // Save CPSR to PCB
-       str             w24, [x0, NS64_FPSR]
-       str             w25, [x0, NS64_FPCR]
-
-       mrs             x20, FAR_EL1
-       mrs             x21, ESR_EL1
-       str             x20, [x0, SS64_FAR]
-       str             w21, [x0, SS64_ESR]
-.endmacro
-
-
 #define        CBF_DISABLE     0
 #define        CBF_ENABLE      1
 
        .align 3
        .globl EXT(exc_vectors_table)
 LEXT(exc_vectors_table)
-       /* Table of exception handlers. */
-       .quad Lel1_sp0_synchronous_vector_long
-       .quad Lel1_sp0_irq_vector_long
-       .quad Lel1_sp0_fiq_vector_long
-       .quad Lel1_sp0_serror_vector_long
-       .quad Lel1_sp1_synchronous_vector_long
-       .quad Lel1_sp1_irq_vector_long
-       .quad Lel1_sp1_fiq_vector_long
-       .quad Lel1_sp1_serror_vector_long
-       .quad Lel0_synchronous_vector_64_long
-       .quad Lel0_irq_vector_64_long
-       .quad Lel0_fiq_vector_64_long
-       .quad Lel0_serror_vector_64_long
+       /* Table of exception handlers.
+         * These handlers sometimes contain deadloops. 
+         * It's nice to have symbols for them when debugging. */
+       .quad el1_sp0_synchronous_vector_long
+       .quad el1_sp0_irq_vector_long
+       .quad el1_sp0_fiq_vector_long
+       .quad el1_sp0_serror_vector_long
+       .quad el1_sp1_synchronous_vector_long
+       .quad el1_sp1_irq_vector_long
+       .quad el1_sp1_fiq_vector_long
+       .quad el1_sp1_serror_vector_long
+       .quad el0_synchronous_vector_64_long
+       .quad el0_irq_vector_64_long
+       .quad el0_fiq_vector_64_long
+       .quad el0_serror_vector_64_long
 #endif /* __ARM_KERNEL_PROTECT__ */
 
        .text
@@ -234,66 +159,66 @@ LEXT(exc_vectors_table)
        .globl EXT(ExceptionVectorsBase)
 LEXT(ExceptionVectorsBase)
 Lel1_sp0_synchronous_vector:
-       BRANCH_TO_KVA_VECTOR Lel1_sp0_synchronous_vector_long, 0
+       BRANCH_TO_KVA_VECTOR el1_sp0_synchronous_vector_long, 0
 
        .text
        .align 7
 Lel1_sp0_irq_vector:
-       BRANCH_TO_KVA_VECTOR Lel1_sp0_irq_vector_long, 1
+       BRANCH_TO_KVA_VECTOR el1_sp0_irq_vector_long, 1
 
        .text
        .align 7
 Lel1_sp0_fiq_vector:
-       BRANCH_TO_KVA_VECTOR Lel1_sp0_fiq_vector_long, 2
+       BRANCH_TO_KVA_VECTOR el1_sp0_fiq_vector_long, 2
 
        .text
        .align 7
 Lel1_sp0_serror_vector:
-       BRANCH_TO_KVA_VECTOR Lel1_sp0_serror_vector_long, 3
+       BRANCH_TO_KVA_VECTOR el1_sp0_serror_vector_long, 3
 
        .text
        .align 7
 Lel1_sp1_synchronous_vector:
-       BRANCH_TO_KVA_VECTOR Lel1_sp1_synchronous_vector_long, 4
+       BRANCH_TO_KVA_VECTOR el1_sp1_synchronous_vector_long, 4
 
        .text
        .align 7
 Lel1_sp1_irq_vector:
-       BRANCH_TO_KVA_VECTOR Lel1_sp1_irq_vector_long, 5
+       BRANCH_TO_KVA_VECTOR el1_sp1_irq_vector_long, 5
 
        .text
        .align 7
 Lel1_sp1_fiq_vector:
-       BRANCH_TO_KVA_VECTOR Lel1_sp1_fiq_vector_long, 6
+       BRANCH_TO_KVA_VECTOR el1_sp1_fiq_vector_long, 6
 
        .text
        .align 7
 Lel1_sp1_serror_vector:
-       BRANCH_TO_KVA_VECTOR Lel1_sp1_serror_vector, 7
+       BRANCH_TO_KVA_VECTOR el1_sp1_serror_vector_long, 7
 
        .text
        .align 7
 Lel0_synchronous_vector_64:
        MAP_KERNEL
-       BRANCH_TO_KVA_VECTOR Lel0_synchronous_vector_64_long, 8
+       BRANCH_TO_KVA_VECTOR el0_synchronous_vector_64_long, 8
 
        .text
        .align 7
 Lel0_irq_vector_64:
        MAP_KERNEL
-       BRANCH_TO_KVA_VECTOR Lel0_irq_vector_64_long, 9
+       BRANCH_TO_KVA_VECTOR el0_irq_vector_64_long, 9
 
        .text
        .align 7
 Lel0_fiq_vector_64:
        MAP_KERNEL
-       BRANCH_TO_KVA_VECTOR Lel0_fiq_vector_64_long, 10
+       BRANCH_TO_KVA_VECTOR el0_fiq_vector_64_long, 10
 
        .text
        .align 7
 Lel0_serror_vector_64:
        MAP_KERNEL
-       BRANCH_TO_KVA_VECTOR Lel0_serror_vector_64_long, 11
+       BRANCH_TO_KVA_VECTOR el0_serror_vector_64_long, 11
 
        /* Fill out the rest of the page */
        .align 12
@@ -313,7 +238,7 @@ Lel0_serror_vector_64:
        mov             x0, sp                                                          // Copy saved state pointer to x0
 .endmacro
 
-Lel1_sp0_synchronous_vector_long:
+el1_sp0_synchronous_vector_long:
        sub             sp, sp, ARM_CONTEXT_SIZE                        // Make space on the exception stack
        stp             x0, x1, [sp, SS64_X0]                           // Save x0, x1 to the stack
        mrs             x1, ESR_EL1                                                     // Get the exception syndrome
@@ -331,35 +256,35 @@ Lkernel_stack_valid:
        ldp             x0, x1, [sp, SS64_X0]                           // Restore x0, x1
        add             sp, sp, ARM_CONTEXT_SIZE                        // Restore SP1
        EL1_SP0_VECTOR
-       adrp    x1, fleh_synchronous@page                       // Load address for fleh
-       add             x1, x1, fleh_synchronous@pageoff
+       adrp    x1, EXT(fleh_synchronous)@page                  // Load address for fleh
+       add             x1, x1, EXT(fleh_synchronous)@pageoff
        b               fleh_dispatch64
 
-Lel1_sp0_irq_vector_long:
+el1_sp0_irq_vector_long:
        EL1_SP0_VECTOR
        mrs             x1, TPIDR_EL1
        ldr             x1, [x1, ACT_CPUDATAP]
        ldr             x1, [x1, CPU_ISTACKPTR]
        mov             sp, x1
-       adrp    x1, fleh_irq@page                                       // Load address for fleh
-       add             x1, x1, fleh_irq@pageoff
+       adrp    x1, EXT(fleh_irq)@page                                  // Load address for fleh
+       add             x1, x1, EXT(fleh_irq)@pageoff
        b               fleh_dispatch64
 
-Lel1_sp0_fiq_vector_long:
+el1_sp0_fiq_vector_long:
        // ARM64_TODO write optimized decrementer
        EL1_SP0_VECTOR
        mrs             x1, TPIDR_EL1
        ldr             x1, [x1, ACT_CPUDATAP]
        ldr             x1, [x1, CPU_ISTACKPTR]
        mov             sp, x1
-       adrp    x1, fleh_fiq@page                                       // Load address for fleh
-       add             x1, x1, fleh_fiq@pageoff
+       adrp    x1, EXT(fleh_fiq)@page                                  // Load address for fleh
+       add             x1, x1, EXT(fleh_fiq)@pageoff
        b               fleh_dispatch64
 
-Lel1_sp0_serror_vector_long:
+el1_sp0_serror_vector_long:
        EL1_SP0_VECTOR
-       adrp    x1, fleh_serror@page                            // Load address for fleh
-       add             x1, x1, fleh_serror@pageoff
+       adrp    x1, EXT(fleh_serror)@page                               // Load address for fleh
+       add             x1, x1, EXT(fleh_serror)@pageoff
        b               fleh_dispatch64
 
 .macro EL1_SP1_VECTOR
@@ -372,7 +297,7 @@ Lel1_sp0_serror_vector_long:
        mov             x0, sp                                                          // Copy saved state pointer to x0
 .endmacro
 
-Lel1_sp1_synchronous_vector_long:
+el1_sp1_synchronous_vector_long:
        b               check_exception_stack
 Lel1_sp1_synchronous_valid_stack:
 #if defined(KERNEL_INTEGRITY_KTRR)
@@ -384,27 +309,60 @@ Lel1_sp1_synchronous_vector_continue:
        add             x1, x1, fleh_synchronous_sp1@pageoff
        b               fleh_dispatch64
 
-Lel1_sp1_irq_vector_long:
+el1_sp1_irq_vector_long:
        EL1_SP1_VECTOR
        adrp    x1, fleh_irq_sp1@page
        add             x1, x1, fleh_irq_sp1@pageoff
        b               fleh_dispatch64
 
-Lel1_sp1_fiq_vector_long:
+el1_sp1_fiq_vector_long:
        EL1_SP1_VECTOR
        adrp    x1, fleh_fiq_sp1@page
        add             x1, x1, fleh_fiq_sp1@pageoff
        b               fleh_dispatch64
 
-Lel1_sp1_serror_vector_long:
+el1_sp1_serror_vector_long:
        EL1_SP1_VECTOR
        adrp    x1, fleh_serror_sp1@page
        add             x1, x1, fleh_serror_sp1@pageoff
        b               fleh_dispatch64
 
+#if defined(HAS_APPLE_PAC) && !(__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__)
+/**
+ * On these CPUs, SCTLR_CP15BEN_ENABLED is res0, and SCTLR_{ITD,SED}_DISABLED are res1.
+ * The rest of the bits in SCTLR_EL1_DEFAULT | SCTLR_PACIB_ENABLED are set in common_start.
+ */
+#define SCTLR_EL1_INITIAL      (SCTLR_EL1_DEFAULT | SCTLR_PACIB_ENABLED)
+#define SCTLR_EL1_EXPECTED     ((SCTLR_EL1_INITIAL | SCTLR_SED_DISABLED | SCTLR_ITD_DISABLED) & ~SCTLR_CP15BEN_ENABLED)
+#endif
+
 .macro EL0_64_VECTOR
        mov             x18, #0                                                 // Zero x18 to avoid leaking data to user SS
        stp             x0, x1, [sp, #-16]!                                     // Save x0 and x1 to the exception stack
+#if defined(HAS_APPLE_PAC) && !(__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__)
+       // enable JOP for kernel
+       adrp    x0, EXT(const_boot_args)@page
+       add             x0, x0, EXT(const_boot_args)@pageoff
+       ldr             x0, [x0, BA_BOOT_FLAGS]
+       and             x0, x0, BA_BOOT_FLAGS_DISABLE_JOP
+       cbnz    x0, 1f
+       // if disable jop is set, don't touch SCTLR (it's already off)
+       // if (!boot_args->kernel_jop_disable) {
+       mrs             x0, SCTLR_EL1
+       tbnz    x0, SCTLR_PACIA_ENABLED_SHIFT, 1f
+       //      turn on jop for kernel if it isn't already on
+       //      if (!jop_running) {
+       MOV64   x1, SCTLR_JOP_KEYS_ENABLED
+       orr             x0, x0, x1
+       msr             SCTLR_EL1, x0
+       isb             sy
+       MOV64   x1, SCTLR_EL1_EXPECTED | SCTLR_JOP_KEYS_ENABLED
+       cmp             x0, x1
+       bne             .
+       //      }
+       // }
+1:
+#endif /* defined(HAS_APPLE_PAC) && !(__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__) */
        mrs             x0, TPIDR_EL1                                           // Load the thread register
        mrs             x1, SP_EL0                                                      // Load the user stack pointer
        add             x0, x0, ACT_CONTEXT                                     // Calculate where we store the user context pointer
@@ -421,42 +379,42 @@ Lel1_sp1_serror_vector_long:
 .endmacro
 
 
-Lel0_synchronous_vector_64_long:
+el0_synchronous_vector_64_long:
        EL0_64_VECTOR
        mrs             x1, TPIDR_EL1                                           // Load the thread register
        ldr             x1, [x1, TH_KSTACKPTR]                          // Load the top of the kernel stack to x1
        mov             sp, x1                                                          // Set the stack pointer to the kernel stack
-       adrp    x1, fleh_synchronous@page                       // Load address for fleh
-       add             x1, x1, fleh_synchronous@pageoff
+       adrp    x1, EXT(fleh_synchronous)@page                  // Load address for fleh
+       add             x1, x1, EXT(fleh_synchronous)@pageoff
        b               fleh_dispatch64
 
-Lel0_irq_vector_64_long:
+el0_irq_vector_64_long:
        EL0_64_VECTOR
        mrs             x1, TPIDR_EL1
        ldr             x1, [x1, ACT_CPUDATAP]
        ldr             x1, [x1, CPU_ISTACKPTR]
        mov             sp, x1                                                          // Set the stack pointer to the kernel stack
-       adrp    x1, fleh_irq@page                                       // load address for fleh
-       add             x1, x1, fleh_irq@pageoff
+       adrp    x1, EXT(fleh_irq)@page                                  // load address for fleh
+       add             x1, x1, EXT(fleh_irq)@pageoff
        b               fleh_dispatch64
 
-Lel0_fiq_vector_64_long:
+el0_fiq_vector_64_long:
        EL0_64_VECTOR
        mrs             x1, TPIDR_EL1
        ldr             x1, [x1, ACT_CPUDATAP]
        ldr             x1, [x1, CPU_ISTACKPTR]
        mov             sp, x1                                                          // Set the stack pointer to the kernel stack
-       adrp    x1, fleh_fiq@page                                       // load address for fleh
-       add             x1, x1, fleh_fiq@pageoff
+       adrp    x1, EXT(fleh_fiq)@page                                  // load address for fleh
+       add             x1, x1, EXT(fleh_fiq)@pageoff
        b               fleh_dispatch64
 
-Lel0_serror_vector_64_long:
+el0_serror_vector_64_long:
        EL0_64_VECTOR
        mrs             x1, TPIDR_EL1                                           // Load the thread register
        ldr             x1, [x1, TH_KSTACKPTR]                          // Load the top of the kernel stack to x1
        mov             sp, x1                                                          // Set the stack pointer to the kernel stack
-       adrp    x1, fleh_serror@page                            // load address for fleh
-       add             x1, x1, fleh_serror@pageoff
+       adrp    x1, EXT(fleh_serror)@page                               // load address for fleh
+       add             x1, x1, EXT(fleh_serror)@pageoff
        b               fleh_dispatch64
 
 
@@ -583,7 +541,7 @@ check_ktrr_sctlr_trap:
        .align 2
 fleh_dispatch64:
        /* Save arm_saved_state64 */
-       SPILL_REGISTERS
+       SPILL_REGISTERS KERNEL_MODE
 
        /* If exception is from userspace, zero unused registers */
        and             x23, x23, #(PSR64_MODE_EL_MASK)
@@ -640,7 +598,8 @@ fleh_dispatch64:
 
        .text
        .align 2
-fleh_synchronous:
+       .global EXT(fleh_synchronous)
+LEXT(fleh_synchronous)
        mrs             x1, ESR_EL1                                                     // Load exception syndrome
        mrs             x2, FAR_EL1                                                     // Load fault address
 
@@ -724,7 +683,8 @@ Lfleh_sync_load_lr:
 
        .text
        .align 2
-fleh_irq:
+       .global EXT(fleh_irq)
+LEXT(fleh_irq)
        BEGIN_INTERRUPT_HANDLER
        PUSH_FRAME
        bl              EXT(sleh_irq)
@@ -742,7 +702,8 @@ LEXT(fleh_fiq_generic)
 
        .text
        .align 2
-fleh_fiq:
+       .global EXT(fleh_fiq)
+LEXT(fleh_fiq)
        BEGIN_INTERRUPT_HANDLER
        PUSH_FRAME
        bl              EXT(sleh_fiq)
@@ -754,7 +715,8 @@ fleh_fiq:
 
        .text
        .align 2
-fleh_serror:
+       .global EXT(fleh_serror)
+LEXT(fleh_serror)
        mrs             x1, ESR_EL1                                                     // Load exception syndrome
        mrs             x2, FAR_EL1                                                     // Load fault address
 
@@ -820,31 +782,27 @@ Lsp1_serror_str:
        .text
        .align 2
 exception_return_dispatch:
-       ldr             w0, [x21, SS_FLAVOR]                    // x0 = (threadIs64Bit) ? ss_64.cpsr : ss_32.cpsr
-       cmp             x0, ARM_SAVED_STATE64
-       ldr             w1, [x21, SS64_CPSR]
-       ldr             w2, [x21, SS32_CPSR]
-       csel    w0, w1, w2, eq
-       tbnz    w0, PSR64_MODE_EL_SHIFT, return_to_kernel // Test for low bit of EL, return to kernel if set
+       ldr             w0, [x21, SS64_CPSR]
+       tst             w0, PSR64_MODE_EL_MASK
+       b.ne    return_to_kernel // return to kernel if M[3:2] > 0
        b               return_to_user
 
        .text
        .align 2
 return_to_kernel:
-       tbnz    w0, #DAIF_IRQF_SHIFT, Lkernel_skip_ast_taken    // Skip AST check if IRQ disabled
-       msr             DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF)           // Disable interrupts
-       mrs             x0, TPIDR_EL1                                                           // Load thread pointer
-       ldr             w1, [x0, ACT_PREEMPT_CNT]                                       // Load preemption count
-       cbnz    x1, Lkernel_skip_ast_taken                                      // If preemption disabled, skip AST check
-       ldr             x1, [x0, ACT_CPUDATAP]                                          // Get current CPU data pointer
-       ldr             x2, [x1, CPU_PENDING_AST]                                       // Get ASTs
-       tst             x2, AST_URGENT                                                          // If no urgent ASTs, skip ast_taken
-       b.eq    Lkernel_skip_ast_taken
-       mov             sp, x21                                                                         // Switch to thread stack for preemption
+       tbnz    w0, #DAIF_IRQF_SHIFT, exception_return  // Skip AST check if IRQ disabled
+       mrs             x3, TPIDR_EL1                           // Load thread pointer
+       ldr             w1, [x3, ACT_PREEMPT_CNT]               // Load preemption count
+       msr             DAIFSet, #DAIFSC_ALL                    // Disable exceptions
+       cbnz    x1, exception_return_unint_tpidr_x3     // If preemption disabled, skip AST check
+       ldr             x1, [x3, ACT_CPUDATAP]                  // Get current CPU data pointer
+       ldr             x2, [x1, CPU_PENDING_AST]               // Get ASTs
+       tst             x2, AST_URGENT                          // If no urgent ASTs, skip ast_taken
+       b.eq    exception_return_unint_tpidr_x3
+       mov             sp, x21                                 // Switch to thread stack for preemption
        PUSH_FRAME
-       bl              EXT(ast_taken_kernel)                                           // Handle AST_URGENT
+       bl              EXT(ast_taken_kernel)                   // Handle AST_URGENT
        POP_FRAME
-Lkernel_skip_ast_taken:
        b               exception_return
 
        .text
@@ -870,26 +828,33 @@ LEXT(thread_exception_return)
        .text
 return_to_user:
 check_user_asts:
-       msr             DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF)           // Disable interrupts
        mrs             x3, TPIDR_EL1                                                           // Load thread pointer
 
        movn            w2, #0
        str             w2, [x3, TH_IOTIER_OVERRIDE]                    // Reset IO tier override to -1 before returning to user
 
+#if MACH_ASSERT
        ldr             w0, [x3, TH_RWLOCK_CNT]
-       cbz             w0, 1f                                                          // Detect unbalance RW lock/unlock
+       cbz             w0, 1f                                          // Detect unbalanced RW lock/unlock
        b               rwlock_count_notzero
 1:
+       ldr             w0, [x3, ACT_PREEMPT_CNT]
+       cbz             w0, 1f
+       b               preempt_count_notzero
+1:
+#endif
        
-       ldr             x4, [x3, ACT_CPUDATAP]                                          // Get current CPU data pointer
-       ldr             x0, [x4, CPU_PENDING_AST]                                       // Get ASTs
-       cbnz    x0, user_take_ast                                                       // If pending ASTs, go service them
+       msr             DAIFSet, #DAIFSC_ALL                            // Disable exceptions
+       ldr             x4, [x3, ACT_CPUDATAP]                          // Get current CPU data pointer
+       ldr             x0, [x4, CPU_PENDING_AST]                       // Get ASTs
+       cbnz    x0, user_take_ast                                       // If pending ASTs, go service them
        
 #if    !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME
+       mov             x19, x3                                         // Preserve thread pointer across function call
        PUSH_FRAME
        bl              EXT(timer_state_event_kernel_to_user)
        POP_FRAME
-       mrs             x3, TPIDR_EL1                                                           // Reload thread pointer
+       mov             x3, x19
 #endif  /* !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME */
 
 #if (CONFIG_KERNEL_INTEGRITY && KERNEL_INTEGRITY_WT)
@@ -923,6 +888,7 @@ check_user_asts:
        ldr             x0, [x3, ACT_DEBUGDATA]
        orr             x1, x1, x0                                                      // Thread debug state and live debug state both NULL?
        cbnz    x1, user_set_debug_state_and_return     // If one or the other non-null, go set debug state
+       b               exception_return_unint_tpidr_x3
 
        //
        // Fall through from return_to_user to exception_return.
@@ -932,7 +898,9 @@ check_user_asts:
 
 exception_return:
        msr             DAIFSet, #DAIFSC_ALL                            // Disable exceptions
+exception_return_unint:
        mrs             x3, TPIDR_EL1                                   // Load thread pointer
+exception_return_unint_tpidr_x3:
        mov             sp, x21                                         // Reload the pcb pointer
 
        /* ARM64_TODO Reserve x18 until we decide what to do with it */
@@ -960,18 +928,42 @@ Lskip_el0_eret_mapping:
 #endif /* __ARM_KERNEL_PROTECT__ */
 
 Lexception_return_restore_registers:
-       /* Restore special register state */
-       ldr             x0, [sp, SS64_PC]                                       // Get the return address
-       ldr             w1, [sp, SS64_CPSR]                                     // Get the return CPSR
-       ldr             w2, [sp, NS64_FPSR]
-       ldr             w3, [sp, NS64_FPCR]
-
-       msr             ELR_EL1, x0                                                     // Load the return address into ELR
-       msr             SPSR_EL1, x1                                            // Load the return CPSR into SPSR
-       msr             FPSR, x2
-       msr             FPCR, x3                                                        // Synchronized by ERET
-
        mov     x0, sp                                                          // x0 = &pcb
+       // Loads authed $x0->ss_64.pc into x1 and $x0->ss_64.cpsr into w2
+       AUTH_THREAD_STATE_IN_X0 x20, x21, x22, x23, x24
+
+/* Restore special register state */
+       ldr             w3, [sp, NS64_FPSR]
+       ldr             w4, [sp, NS64_FPCR]
+
+       msr             ELR_EL1, x1                                                     // Load the return address into ELR
+       msr             SPSR_EL1, x2                                            // Load the return CPSR into SPSR
+       msr             FPSR, x3
+       msr             FPCR, x4                                                        // Synchronized by ERET
+
+#if defined(HAS_APPLE_PAC) && !(__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__)
+       /* if eret to userspace, disable JOP */
+       tbnz    w2, PSR64_MODE_EL_SHIFT, Lskip_disable_jop
+       adrp    x4, EXT(const_boot_args)@page
+       add             x4, x4, EXT(const_boot_args)@pageoff
+       ldr             x4, [x4, BA_BOOT_FLAGS]
+       and             x1, x4, BA_BOOT_FLAGS_DISABLE_JOP
+       cbnz    x1, Lskip_disable_jop // if global JOP disabled, don't touch SCTLR (kernel JOP is already off)
+       and             x1, x4, BA_BOOT_FLAGS_DISABLE_USER_JOP
+       cbnz    x1, Ldisable_jop // if global user JOP disabled, always turn off JOP regardless of thread flag (kernel running with JOP on)
+       mrs             x2, TPIDR_EL1
+       ldr             x2, [x2, TH_DISABLE_USER_JOP]
+       cbz             x2, Lskip_disable_jop // if thread has JOP enabled, leave it on (kernel running with JOP on)
+Ldisable_jop:
+       MOV64   x1, SCTLR_JOP_KEYS_ENABLED
+       mrs             x4, SCTLR_EL1
+       bic             x4, x4, x1
+       msr             SCTLR_EL1, x4
+       MOV64   x1, SCTLR_EL1_EXPECTED
+       cmp             x4, x1
+       bne             .
+Lskip_disable_jop:
+#endif /* defined(HAS_APPLE_PAC) && !(__APCFG_SUPPORTED__ || __APSTS_SUPPORTED__)*/
 
        /* Restore arm_neon_saved_state64 */
        ldp             q0, q1, [x0, NS64_Q0]
@@ -1001,14 +993,15 @@ Lexception_return_restore_registers:
        ldp             x10, x11, [x0, SS64_X10]
        ldp             x12, x13, [x0, SS64_X12]
        ldp             x14, x15, [x0, SS64_X14]
-       ldp             x16, x17, [x0, SS64_X16]
+       // Skip x16, x17 - already loaded + authed by AUTH_THREAD_STATE_IN_X0
        ldp             x18, x19, [x0, SS64_X18]
        ldp             x20, x21, [x0, SS64_X20]
        ldp             x22, x23, [x0, SS64_X22]
        ldp             x24, x25, [x0, SS64_X24]
        ldp             x26, x27, [x0, SS64_X26]
        ldr             x28, [x0, SS64_X28]
-       ldp             fp, lr, [x0, SS64_FP]
+       ldr             fp, [x0, SS64_FP]
+       // Skip lr - already loaded + authed by AUTH_THREAD_STATE_IN_X0
 
        // Restore stack pointer and our last two GPRs
        ldr             x1, [x0, SS64_SP]
@@ -1052,18 +1045,18 @@ user_take_ast:
        PUSH_FRAME
        bl              EXT(ast_taken_user)                                                     // Handle all ASTs, may return via continuation
        POP_FRAME
-       mrs             x3, TPIDR_EL1                                                           // Reload thread pointer
        b               check_user_asts                                                         // Now try again
 
 user_set_debug_state_and_return:
+
+
        ldr             x4, [x3, ACT_CPUDATAP]                          // Get current CPU data pointer
        isb                                                                                     // Synchronize context
        PUSH_FRAME
        bl              EXT(arm_debug_set)                                      // Establish thread debug state in live regs
        POP_FRAME
        isb
-       mrs             x3, TPIDR_EL1                                           // Reload thread pointer
-       b               exception_return                        // And continue
+       b               exception_return_unint                                  // Continue, reloading the thread pointer
 
        .text
        .align 2
@@ -1077,6 +1070,7 @@ L_underflow_str:
        .asciz "Preemption count negative on thread %p"
 .align 2
 
+#if MACH_ASSERT
        .text
        .align 2
 rwlock_count_notzero:
@@ -1089,6 +1083,21 @@ rwlock_count_notzero:
 
 L_rwlock_count_notzero_str:
        .asciz "RW lock count not 0 on thread %p (%u)"
+
+       .text
+       .align 2
+preempt_count_notzero:
+       mrs             x0, TPIDR_EL1
+       str             x0, [sp, #-16]!                                         // We'll print thread pointer
+       ldr             w0, [x0, ACT_PREEMPT_CNT]
+       str             w0, [sp, #8]
+       adr             x0, L_preempt_count_notzero_str                                 // Format string
+       CALL_EXTERN panic                                                       // Game over
+
+L_preempt_count_notzero_str:
+       .asciz "preemption count not 0 on thread %p (%u)"
+#endif /* MACH_ASSERT */
+
 .align 2
 
 #if __ARM_KERNEL_PROTECT__
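
The JOP toggling woven through EL0_64_VECTOR and the exception-return path above is easier to audit restated in C. The sketch below mirrors the branch structure of the eret-to-user sequence; it is a reading aid only, with the boot-args bits and the per-thread flag declared as stand-ins for the symbols the assembly references:

        #include <stdbool.h>
        #include <stdint.h>

        /* Stand-ins for the flag bits and thread field used above. */
        extern const uint64_t BA_BOOT_FLAGS_DISABLE_JOP;
        extern const uint64_t BA_BOOT_FLAGS_DISABLE_USER_JOP;
        extern uint64_t boot_flags;              /* const_boot_args boot flags */
        extern bool thread_disable_user_jop;     /* TH_DISABLE_USER_JOP */

        /* Should the eret-to-user path clear SCTLR_JOP_KEYS_ENABLED? */
        static bool
        should_run_disable_jop_sequence(void)
        {
                if (boot_flags & BA_BOOT_FLAGS_DISABLE_JOP) {
                        return false;   /* JOP globally off; SCTLR already clear */
                }
                if (boot_flags & BA_BOOT_FLAGS_DISABLE_USER_JOP) {
                        return true;    /* every user thread runs with JOP off */
                }
                return thread_disable_user_jop; /* per-thread opt-out */
        }
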
index 00aae153f91456b7876e02d7606407dd28c5ff74..495cc7c03d9fb24197330c33df18a0b553020f68 100644
@@ -520,7 +520,7 @@ ml_phys_write_double_64(addr64_t paddr64, unsigned long long data)
 void
 setbit(int bitno, int *s)
 {
-       s[bitno / INT_SIZE] |= 1 << (bitno % INT_SIZE);
+       s[bitno / INT_SIZE] |= 1U << (bitno % INT_SIZE);
 }
 
 /*
@@ -529,7 +529,7 @@ setbit(int bitno, int *s)
 void
 clrbit(int bitno, int *s)
 {
-       s[bitno / INT_SIZE] &= ~(1 << (bitno % INT_SIZE));
+       s[bitno / INT_SIZE] &= ~(1U << (bitno % INT_SIZE));
 }
 
 /*
@@ -538,7 +538,7 @@ clrbit(int bitno, int *s)
 int
 testbit(int bitno, int *s)
 {
-       return s[bitno / INT_SIZE] & (1 << (bitno % INT_SIZE));
+       return s[bitno / INT_SIZE] & (1U << (bitno % INT_SIZE));
 }
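
The 1 to 1U change in all three helpers fixes undefined behavior: when bitno % INT_SIZE is 31, shifting a signed 1 into the sign bit is undefined in C, while 1U << 31 is a well-defined unsigned value. A self-contained illustration (INT_SIZE assumed to be the bit width of int, as in this file):

        #include <limits.h>

        #define INT_SIZE ((int)(sizeof(int) * CHAR_BIT)) /* assumed definition */

        void
        setbit_sketch(int bitno, int *s)
        {
                /* 1U << 31 yields 0x80000000u; 1 << 31 overflows a signed int,
                 * which is undefined behavior (it often "works", but UBSan
                 * flags it and optimizers may assume it cannot happen). */
                s[bitno / INT_SIZE] |= 1U << (bitno % INT_SIZE);
        }
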
 
 /*
@@ -765,17 +765,18 @@ ml_thread_policy(
        //      kprintf("ml_thread_policy() unimplemented\n");
 }
 
+__dead2
 void
-panic_unimplemented()
+panic_unimplemented(void)
 {
        panic("Not yet implemented.");
 }
 
 /* ARM64_TODO <rdar://problem/9198953> */
-void abort(void);
+void abort(void) __dead2;
 
 void
-abort()
+abort(void)
 {
        panic("Abort.");
 }
index 095ec6dab9e7caf9e7002d0bb6ab4d662a450477..defc45b4497bfe5e993c64a17d87340eb832e409 100644
@@ -26,7 +26,7 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 #include <kern/misc_protos.h>
-#include <stdatomic.h>
+#include <machine/atomic.h>
 #include <mach/mach_time.h>
 #include <mach/clock_types.h>
 #include <kern/clock.h>
@@ -34,6 +34,7 @@
 #include <arm64/machine_remote_time.h>
 #include <sys/kdebug.h>
 #include <arm/machine_routines.h>
+#include <kern/remote_time.h>
 
 lck_spin_t *bt_spin_lock = NULL;
 _Atomic uint32_t bt_init_flag = 0;
@@ -41,14 +42,14 @@ _Atomic uint32_t bt_init_flag = 0;
 extern lck_spin_t *ts_conversion_lock;
 extern void mach_bridge_add_timestamp(uint64_t remote_timestamp, uint64_t local_timestamp);
 extern void bt_calibration_thread_start(void);
+extern void bt_params_add(struct bt_params *params);
 
 void
 mach_bridge_init_timestamp(void)
 {
        /* This function should be called only once by the driver
         *  implementing the interrupt handler for receiving timestamps */
-       if (bt_init_flag) {
-               assert(!bt_init_flag);
+       if (os_atomic_load(&bt_init_flag, relaxed)) {
                return;
        }
 
@@ -59,7 +60,7 @@ mach_bridge_init_timestamp(void)
        bt_spin_lock = lck_spin_alloc_init(bt_lck_grp, NULL);
        ts_conversion_lock = lck_spin_alloc_init(bt_lck_grp, NULL);
 
-       atomic_store(&bt_init_flag, 1);
+       os_atomic_store(&bt_init_flag, 1, release);
 
        /* Start the kernel thread only after all the locks have been initialized */
        bt_calibration_thread_start();
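
The release store on bt_init_flag above pairs with the acquire loads added below: any thread that observes the flag set is also guaranteed to observe the lock initialization that preceded the store. The same publish pattern in portable C11 atomics (a sketch, not the kernel's os_atomic implementation):

        #include <stdatomic.h>
        #include <stdbool.h>

        static atomic_uint init_flag;
        static int shared_state;        /* stands in for the spin locks */

        void
        init_once(void)
        {
                if (atomic_load_explicit(&init_flag, memory_order_relaxed)) {
                        return;         /* already initialized */
                }
                shared_state = 42;      /* initialize everything first... */
                atomic_store_explicit(&init_flag, 1,
                    memory_order_release);      /* ...then publish */
        }

        bool
        ready(void)
        {
                /* Acquire pairs with the release above: if the flag is seen,
                 * shared_state == 42 is seen too. */
                return atomic_load_explicit(&init_flag,
                           memory_order_acquire) != 0;
        }
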
@@ -74,8 +75,8 @@ mach_bridge_recv_timestamps(uint64_t remoteTimestamp, uint64_t localTimestamp)
        assert(ml_at_interrupt_context() == TRUE);
 
        /* Ensure the locks have been initialized */
-       if (!bt_init_flag) {
-               assert(bt_init_flag != 0);
+       if (!os_atomic_load(&bt_init_flag, acquire)) {
+               panic("%s called before mach_bridge_init_timestamp", __func__);
                return;
        }
 
@@ -87,3 +88,27 @@ mach_bridge_recv_timestamps(uint64_t remoteTimestamp, uint64_t localTimestamp)
 
        return;
 }
+
+/*
+ * Set the conversion parameters, calculated externally, that
+ * mach_bridge_remote_time() needs.
+ */
+void
+mach_bridge_set_params(uint64_t local_timestamp, uint64_t remote_timestamp, double rate)
+{
+       /* Ensure the locks have been initialized */
+       if (!os_atomic_load(&bt_init_flag, acquire)) {
+               panic("%s called before mach_bridge_init_timestamp", __func__);
+               return;
+       }
+
+       struct bt_params params = {};
+       params.base_local_ts = local_timestamp;
+       params.base_remote_ts = remote_timestamp;
+       params.rate = rate;
+       lck_spin_lock(ts_conversion_lock);
+       bt_params_add(&params);
+       lck_spin_unlock(ts_conversion_lock);
+       KDBG(MACHDBG_CODE(DBG_MACH_CLOCK, MACH_BRIDGE_TS_PARAMS), params.base_local_ts,
+           params.base_remote_ts, *(uint64_t *)((void *)&params.rate));
+}
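
A (base_local_ts, base_remote_ts) anchor plus a rate is exactly what a piecewise-linear clock conversion needs; mach_bridge_remote_time() presumably evaluates something like the following (a sketch of the math only, with illustrative names):

        #include <stdint.h>

        struct bt_params_sketch {
                uint64_t base_local_ts;
                uint64_t base_remote_ts;
                double   rate;
        };

        /* remote = base_remote + rate * (local - base_local) */
        static uint64_t
        remote_time_sketch(const struct bt_params_sketch *p, uint64_t local_ts)
        {
                double delta = (double)(int64_t)(local_ts - p->base_local_ts);
                return p->base_remote_ts + (uint64_t)(int64_t)(delta * p->rate);
        }
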
index ee4db2f3b934884aaddcdaa379545dbdd684c27d..1083a4b41cc4b72368b4a636c8b5baa22a85e3fb 100644
@@ -34,6 +34,7 @@
 __BEGIN_DECLS
 void mach_bridge_recv_timestamps(uint64_t bridgeTimestamp, uint64_t localTimestamp);
 void mach_bridge_init_timestamp(void);
+void mach_bridge_set_params(uint64_t local_timestamp, uint64_t remote_timestamp, double rate);
 __END_DECLS
 
 #endif /* MACHINE_ARM64_REMOTE_TIME_H */
index b426e7eb6b8744809531cddb4bf977b625513a6e..13aca14c10786b4251c8dde006b52e8b99a91e78 100644
 #include <arm/caches_internal.h>
 #include <arm/misc_protos.h>
 #include <arm/machdep_call.h>
+#include <arm/machine_routines.h>
 #include <arm/rtclock.h>
 #include <arm/cpuid_internal.h>
+#include <arm/cpu_capabilities.h>
 #include <console/serial_protos.h>
 #include <kern/machine.h>
 #include <prng/random.h>
 #include <kern/startup.h>
 #include <kern/thread.h>
+#include <kern/timer_queue.h>
 #include <mach/machine.h>
 #include <machine/atomic.h>
 #include <vm/pmap.h>
@@ -64,6 +67,7 @@
 #endif
 
 
+
 static int max_cpus_initialized = 0;
 #define MAX_CPUS_SET    0x1
 #define MAX_CPUS_WAIT   0x2
@@ -76,10 +80,12 @@ boolean_t is_clock_configured = FALSE;
 
 uint32_t yield_delay_us = 0; /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
 
+#if CONFIG_NONFATAL_ASSERTS
 extern int mach_assert;
+#endif
 extern volatile uint32_t debug_enabled;
 
-extern vm_offset_t   segEXTRADATA;
+extern vm_offset_t   segLOWEST;
 extern vm_offset_t   segLOWESTTEXT;
 extern vm_offset_t   segLASTB;
 extern unsigned long segSizeLAST;
@@ -108,12 +114,14 @@ void ml_lockdown_run_handler(void);
 uint32_t get_arm_cpu_version(void);
 
 
+__dead2
 void
-ml_cpu_signal(unsigned int cpu_id __unused)
+ml_cpu_signal(unsigned int cpu_mpidr __unused)
 {
        panic("Platform does not support ACC Fast IPI");
 }
 
+__dead2
 void
 ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
 {
@@ -127,14 +135,16 @@ ml_cpu_signal_deferred_get_timer()
        return 0;
 }
 
+__dead2
 void
-ml_cpu_signal_deferred(unsigned int cpu_id __unused)
+ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
 {
        panic("Platform does not support ACC Fast IPI deferral");
 }
 
+__dead2
 void
-ml_cpu_signal_retract(unsigned int cpu_id __unused)
+ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
 {
        panic("Platform does not support ACC Fast IPI retraction");
 }
@@ -142,9 +152,9 @@ ml_cpu_signal_retract(unsigned int cpu_id __unused)
 void
 machine_idle(void)
 {
-       __asm__ volatile ("msr DAIFSet, %[mask]" ::[mask] "i" (DAIFSC_IRQF | DAIFSC_FIQF));
+       __builtin_arm_wsr("DAIFSet", (DAIFSC_IRQF | DAIFSC_FIQF));
        Idle_context();
-       __asm__ volatile ("msr DAIFClr, %[mask]" ::[mask] "i" (DAIFSC_IRQF | DAIFSC_FIQF));
+       __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF));
 }
 
 void
@@ -234,15 +244,11 @@ user_cont_hwclock_allowed(void)
        return FALSE;
 }
 
-/*
- * user_timebase_allowed()
- *
- * Indicates whether we allow EL0 to read the physical timebase (CNTPCT_EL0).
- */
-boolean_t
-user_timebase_allowed(void)
+
+uint8_t
+user_timebase_type(void)
 {
-       return TRUE;
+       return USER_TIMEBASE_SPEC;
 }
 
 boolean_t
@@ -357,9 +363,9 @@ lock_mmu(uint64_t begin, uint64_t end)
        __builtin_arm_isb(ISB_SY);
        flush_mmu_tlb();
 
-#else
+#else /* defined(KERNEL_INTEGRITY_KTRR) */
 #error KERNEL_INTEGRITY config error
-#endif
+#endif /* defined(KERNEL_INTEGRITY_KTRR) */
 }
 
 static void
@@ -403,7 +409,7 @@ rorgn_lockdown(void)
        assert_unlocked();
 
        /* [x] - Use final method of determining all kernel text range or expect crashes */
-       ktrr_begin = segEXTRADATA;
+       ktrr_begin = segLOWEST;
        assert(ktrr_begin && gVirtBase && gPhysBase);
 
        ktrr_begin = kvtophys(ktrr_begin);
@@ -451,7 +457,9 @@ machine_startup(__unused boot_args * args)
        int boot_arg;
 
 
+#if CONFIG_NONFATAL_ASSERTS
        PE_parse_boot_argn("assert", &mach_assert, sizeof(mach_assert));
+#endif
 
        if (PE_parse_boot_argn("preempt", &boot_arg, sizeof(boot_arg))) {
                default_preemption_rate = boot_arg;
@@ -649,8 +657,8 @@ ml_init_lock_timeout(void)
 void
 ml_cpu_up(void)
 {
-       hw_atomic_add(&machine_info.physical_cpu, 1);
-       hw_atomic_add(&machine_info.logical_cpu, 1);
+       os_atomic_inc(&machine_info.physical_cpu, relaxed);
+       os_atomic_inc(&machine_info.logical_cpu, relaxed);
 }
 
 /*
@@ -662,8 +670,8 @@ ml_cpu_down(void)
 {
        cpu_data_t      *cpu_data_ptr;
 
-       hw_atomic_sub(&machine_info.physical_cpu, 1);
-       hw_atomic_sub(&machine_info.logical_cpu, 1);
+       os_atomic_dec(&machine_info.physical_cpu, relaxed);
+       os_atomic_dec(&machine_info.logical_cpu, relaxed);
 
        /*
         * If we want to deal with outstanding IPIs, we need to
@@ -678,6 +686,16 @@ ml_cpu_down(void)
         */
        cpu_data_ptr = getCpuDatap();
        cpu_data_ptr->cpu_running = FALSE;
+
+       if (cpu_data_ptr != &BootCpuData) {
+               /*
+                * Move all of this cpu's timers to the master/boot cpu,
+                * and poke it in case there's a sooner deadline for it to schedule.
+                */
+               timer_queue_shutdown(&cpu_data_ptr->rtclock_timer.queue);
+               cpu_xcall(BootCpuData.cpu_number, &timer_queue_expire_local, NULL);
+       }
+
        cpu_signal_handler_internal(TRUE);
 }
 
@@ -1085,7 +1103,7 @@ ml_processor_register(ml_processor_info_t *in_processor_info,
 #endif /* KPC */
 
        if (!is_boot_cpu) {
-               early_random_cpu_init(this_cpu_datap->cpu_number);
+               random_cpu_init(this_cpu_datap->cpu_number);
                // now let next CPU register itself
                OSIncrementAtomic((SInt32*)&real_ncpus);
        }
@@ -1164,6 +1182,16 @@ ml_io_map(
        return io_map(phys_addr, size, VM_WIMG_IO);
 }
 
+/* Map memory-mapped IO space (with protections specified) */
+vm_offset_t
+ml_io_map_with_prot(
+       vm_offset_t phys_addr,
+       vm_size_t size,
+       vm_prot_t prot)
+{
+       return io_map_with_prot(phys_addr, size, VM_WIMG_IO, prot);
+}
+
 vm_offset_t
 ml_io_map_wcomb(
        vm_offset_t phys_addr,
@@ -1308,9 +1336,6 @@ ml_static_protect(
                                                }
                                        }
                                }
-#ifndef  __ARM_L1_PTW__
-                               FlushPoC_DcacheRegion( trunc_page_32(pte_p), 4 * sizeof(*pte_p));
-#endif
                        } else {
                                ptmp = *pte_p;
 
@@ -1319,10 +1344,6 @@ ml_static_protect(
                                        ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
                                        *pte_p = ptmp;
                                }
-
-#ifndef  __ARM_L1_PTW__
-                               FlushPoC_DcacheRegion( trunc_page_32(pte_p), sizeof(*pte_p));
-#endif
                        }
                        __unreachable_ok_pop
                }
@@ -1601,9 +1622,8 @@ ml_get_hwclock()
        // ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
        // "Reads of CNTPCT[_EL0] can occur speculatively and out of order relative
        // to other instructions executed on the same processor."
-       __asm__ volatile ("isb\n"
-                          "mrs %0, CNTPCT_EL0"
-                          : "=r"(timebase));
+       __builtin_arm_isb(ISB_SY);
+       timebase = __builtin_arm_rsr64("CNTPCT_EL0");
 
        return timebase;
 }
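
The inline assembly becomes compiler builtins, but the semantics are unchanged: the ISB is still mandatory because CNTPCT_EL0 reads may otherwise execute speculatively and out of order. A sketch of the same pattern for timing a region (clang on arm64; 0xf is the SY option of __builtin_arm_isb):

        #include <stdint.h>

        static inline uint64_t
        read_hwclock_sketch(void)
        {
                __builtin_arm_isb(0xf);                   /* isb sy: order the read */
                return __builtin_arm_rsr64("CNTPCT_EL0"); /* physical count */
        }

        /* uint64_t t0 = read_hwclock_sketch();
         * ... region under test ...
         * uint64_t ticks = read_hwclock_sketch() - t0; */
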
@@ -1678,7 +1698,13 @@ cache_trap_recover()
 static void
 set_cache_trap_recover(thread_t thread)
 {
+#if defined(HAS_APPLE_PAC)
+       thread->recover = (vm_address_t)ptrauth_auth_and_resign(&cache_trap_recover,
+           ptrauth_key_function_pointer, 0,
+           ptrauth_key_function_pointer, ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER));
+#else /* defined(HAS_APPLE_PAC) */
        thread->recover = (vm_address_t)cache_trap_recover;
+#endif /* defined(HAS_APPLE_PAC) */
 }
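
With HAS_APPLE_PAC, the recovery pointer is stored re-signed under a discriminator blended from the destination address, so a signed value copied into some other recover slot fails authentication when used. A hedged arm64e userland analogue of the resign pattern (DISC_RECOVER is an illustrative stand-in for the kernel's PAC_DISCRIMINATOR_RECOVER):

        #include <ptrauth.h>

        #define DISC_RECOVER 0x1e02 /* illustrative stand-in */

        static void
        set_recover_sketch(void **slot, void *handler)
        {
                /* Authenticate handler as a plain function pointer, then
                 * re-sign it with address diversity: the new discriminator
                 * mixes in the slot's own address, so the value verifies
                 * only when read back from that exact location. */
                *slot = ptrauth_auth_and_resign(handler,
                    ptrauth_key_function_pointer, 0,
                    ptrauth_key_function_pointer,
                    ptrauth_blend_discriminator(slot, DISC_RECOVER));
        }
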
 
 static void
@@ -1742,14 +1768,8 @@ icache_invalidate_trap(vm_map_address_t start, vm_map_size_t size)
 
        set_cache_trap_recover(thread);
 
-       CleanPoU_DcacheRegion(start, (uint32_t) size);
-
        /* Invalidate iCache to point of unification */
-#if __ARM_IC_NOALIAS_ICACHE__
        InvalidatePoU_IcacheRegion(start, (uint32_t)size);
-#else
-       InvalidatePoU_Icache();
-#endif
 
        /* Restore recovery function */
        thread->recover = old_recover;
@@ -1814,7 +1834,7 @@ _enable_timebase_event_stream(uint32_t bit_index)
         * If the SOC supports it (and it isn't broken), enable
         * EL0 access to the physical timebase register.
         */
-       if (user_timebase_allowed()) {
+       if (user_timebase_type() != USER_TIMEBASE_NONE) {
                cntkctl |= CNTKCTL_EL1_PL0PCTEN;
        }
 
@@ -1832,6 +1852,8 @@ _enable_virtual_timer(void)
        __asm__ volatile ("msr CNTP_CTL_EL0, %0" : : "r"(cntvctl));
 }
 
+uint64_t events_per_sec = 0;
+
 void
 fiq_context_init(boolean_t enable_fiq __unused)
 {
@@ -1847,16 +1869,10 @@ fiq_context_bootstrap(boolean_t enable_fiq)
 {
 #if defined(APPLE_ARM64_ARCH_FAMILY) || defined(BCM2837)
        /* Could fill in our own ops here, if we needed them */
-       uint64_t        ticks_per_sec, ticks_per_event, events_per_sec;
+       uint64_t        ticks_per_sec, ticks_per_event;
        uint32_t        bit_index;
 
        ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
-#if defined(ARM_BOARD_WFE_TIMEOUT_NS)
-       events_per_sec = 1000000000 / ARM_BOARD_WFE_TIMEOUT_NS;
-#else
-       /* Default to 1usec (or as close as we can get) */
-       events_per_sec = 1000000;
-#endif
        ticks_per_event = ticks_per_sec / events_per_sec;
        bit_index = flsll(ticks_per_event) - 1; /* Highest bit set */
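
A worked example of the event-stream math, assuming a 24 MHz timebase and the 1000000 events/sec default that the deleted lines used:

        #include <stdint.h>
        #include <strings.h>    /* flsll, as on BSD/macOS */

        static uint32_t
        event_stream_bit_sketch(void)
        {
                uint64_t ticks_per_sec   = 24000000;    /* assumed */
                uint64_t events_per_sec  = 1000000;     /* old default */
                uint64_t ticks_per_event = ticks_per_sec / events_per_sec; /* 24 */
                return flsll(ticks_per_event) - 1;      /* flsll(24) = 5, so 4 */
        }

        /* The event stream fires on each toggle of counter bit 4: every
         * 2^4 = 16 ticks, i.e. 16 / 24 MHz ≈ 0.67 us, the closest
         * power-of-two period not exceeding the requested 1 us. */
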
 
@@ -1988,7 +2004,7 @@ timer_state_event_kernel_to_user(void)
  * The following are required for parts of the kernel
  * that cannot resolve these functions as inlines:
  */
-extern thread_t current_act(void);
+extern thread_t current_act(void) __attribute__((const));
 thread_t
 current_act(void)
 {
@@ -1996,7 +2012,7 @@ current_act(void)
 }
 
 #undef current_thread
-extern thread_t current_thread(void);
+extern thread_t current_thread(void) __attribute__((const));
 thread_t
 current_thread(void)
 {
@@ -2057,3 +2073,59 @@ ex_cb_invoke(
        return EXCB_ACTION_NONE;
 }
 
+#if defined(HAS_APPLE_PAC)
+void
+ml_task_set_disable_user_jop(task_t task, boolean_t disable_user_jop)
+{
+       assert(task);
+       task->disable_user_jop = disable_user_jop;
+}
+
+void
+ml_thread_set_disable_user_jop(thread_t thread, boolean_t disable_user_jop)
+{
+       assert(thread);
+       thread->machine.disable_user_jop = disable_user_jop;
+}
+
+void
+ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
+{
+       if (inherit) {
+               task->rop_pid = parent_task->rop_pid;
+       } else {
+               task->rop_pid = early_random();
+       }
+}
+#endif /* defined(HAS_APPLE_PAC) */
+
+
+#if defined(HAS_APPLE_PAC)
+
+/*
+ * ml_auth_ptr_unchecked: call this instead of ptrauth_auth_data
+ * intrinsic when you don't want to trap on auth failure.
+ *
+ */
+
+void *
+ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier)
+{
+       switch (key & 0x3) {
+       case ptrauth_key_asia:
+               asm volatile ("autia %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier));
+               break;
+       case ptrauth_key_asib:
+               asm volatile ("autib %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier));
+               break;
+       case ptrauth_key_asda:
+               asm volatile ("autda %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier));
+               break;
+       case ptrauth_key_asdb:
+               asm volatile ("autdb %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier));
+               break;
+       }
+
+       return ptr;
+}
+#endif /* defined(HAS_APPLE_PAC) */
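
Per the comment above, this variant is for callers that want authentication without a trap: on a bad signature the aut* instructions leave an error code in the pointer's extension bits instead of faulting, so the caller can validate the result itself. A hedged usage sketch (strip-and-compare is one common way to detect a corrupted result):

        #include <ptrauth.h>
        #include <stdint.h>

        static void *
        auth_or_null_sketch(void *signed_ptr, uint64_t modifier)
        {
                void *p = ml_auth_ptr_unchecked(signed_ptr, ptrauth_key_asda,
                    modifier);
                /* A failed autda flips a high bit, so the value no longer
                 * equals its canonical (stripped) form. */
                if (p != ptrauth_strip(p, ptrauth_key_asda)) {
                        return NULL;
                }
                return p;
        }
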
diff --git a/osfmk/arm64/machine_routines_asm.h b/osfmk/arm64/machine_routines_asm.h
new file mode 100644
index 0000000..7f5f8ed
--- /dev/null
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <pexpert/arm64/board_config.h>
+#include "assym.s"
+
+#ifndef __ASSEMBLER__
+#error "This header should only be used in .s files"
+#endif
+
+/**
+ * Loads the following values from the thread_kernel_state pointer in x0:
+ *
+ * x1: $x0->ss_64.pc
+ * w2: $x0->ss_64.cpsr
+ * x16: $x0->ss_64.x16
+ * x17: $x0->ss_64.x17
+ * lr: $x0->ss_64.lr
+ *
+ * On CPUs with PAC support, this macro will auth the above values with ml_check_signed_state().
+ *
+ * arg0 - scratch register 1
+ * arg1 - scratch register 2
+ * arg2 - scratch register 3
+ * arg3 - scratch register 4
+ * arg4 - scratch register 5
+ */
+/* BEGIN IGNORE CODESTYLE */
+.macro  AUTH_THREAD_STATE_IN_X0
+       ldr             x1, [x0, SS64_PC]
+       ldr             w2, [x0, SS64_CPSR]
+       ldp             x16, x17, [x0, SS64_X16]
+
+#if defined(HAS_APPLE_PAC)
+       // Save x3-x5 to preserve across call
+       mov             $2, x3
+       mov             $3, x4
+       mov             $4, x5
+
+       /*
+       * Arg0: The ARM context pointer (already in x0)
+       * Arg1: PC to check (loaded above)
+       * Arg2: CPSR to check (loaded above)
+       * Arg3: the LR to check
+       *
+       * Stash saved state PC and CPSR in other registers to avoid reloading potentially unauthed
+       * values from memory.  (ml_check_signed_state will clobber x1 and x2.)
+       */
+       mov             $0, x1
+       mov             $1, x2
+       ldr             x3, [x0, SS64_LR]
+       mov             x4, x16
+       mov             x5, x17
+       bl              EXT(ml_check_signed_state)
+       mov             x1, $0
+       mov             x2, $1
+
+       // LR was already loaded/authed earlier; reloading it would fetch a potentially unauthed value
+       mov             lr, x3
+       mov             x3, $2
+       mov             x4, $3
+       mov             x5, $4
+#else
+       ldr             lr, [x0, SS64_LR]
+#endif /* defined(HAS_APPLE_PAC) */
+.endmacro
+/* END IGNORE CODESTYLE */
+
+/* vim: set ft=asm: */
index 08756dc8da9c5695264f8b0932f75214dabd338f..64fd61152e622e2d9c860b6991de65c9741e780c 100644
 #include "assym.s"
 
 
+#if defined(HAS_APPLE_PAC)
+/*
+ * void
+ * ml_set_kernelkey_enabled(boolean_t enable)
+ *
+ * Toggle pointer auth kernel domain key diversification. Assembly to prevent compiler reordering.
+ *
+ */
+
+       .align 2
+       .globl EXT(ml_set_kernelkey_enabled)
+LEXT(ml_set_kernelkey_enabled)
+       mrs             x1, ARM64_REG_APCTL_EL1
+       orr             x2, x1, #APCTL_EL1_KernKeyEn
+       and     x1, x1, #~APCTL_EL1_KernKeyEn
+       cmp             w0, #0
+       csel    x1, x1, x2, eq
+       msr             ARM64_REG_APCTL_EL1, x1
+       isb
+       ret
+
+#endif /* defined(HAS_APPLE_PAC) */
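
The csel sequence computes both candidate APCTL values and picks one by the flag, avoiding a branch between the mrs and msr. The same read-modify-write in C, as a reading aid (the register accessors and bit position are stand-ins):

        #include <stdbool.h>
        #include <stdint.h>

        /* Stand-ins for mrs/msr ARM64_REG_APCTL_EL1. */
        extern uint64_t read_apctl(void);
        extern void write_apctl(uint64_t);
        #define APCTL_KERN_KEY_EN (1ULL << 1)   /* placeholder bit position */

        void
        set_kernelkey_enabled_sketch(bool enable)
        {
                uint64_t apctl = read_apctl();
                apctl = enable ? (apctl | APCTL_KERN_KEY_EN)
                               : (apctl & ~APCTL_KERN_KEY_EN);
                write_apctl(apctl);     /* the real routine follows with isb */
        }
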
+
+
 
 /*     uint32_t get_fpscr(void):
  *             Returns (FPSR | FPCR).
@@ -131,369 +155,6 @@ Lupdate_mdscr_panic_str:
        .asciz "MDSCR.KDE was set"
 
 
-#if __ARM_KERNEL_PROTECT__
-/*
- * __ARM_KERNEL_PROTECT__ adds two complications to TLB management:
- *
- * 1. As each pmap has two ASIDs, every TLB operation that targets an ASID must
- *   target both ASIDs for the pmap that owns the target ASID.
- *
- * 2. Any TLB operation targeting the kernel_pmap ASID (ASID 0) must target all
- *   ASIDs (as kernel_pmap mappings may be referenced while using an ASID that
- *   belongs to another pmap).  We expect these routines to be called with the
- *   EL0 ASID for the target; not the EL1 ASID.
- */
-#endif /* __ARM_KERNEL_PROTECT__ */
-
-.macro SYNC_TLB_FLUSH
-       dsb     ish
-       isb     sy
-.endmacro
-
-
-/*
- *     void sync_tlb_flush(void)
- *
- *             Synchronize one or more prior TLB flush operations
- */
-       .text
-       .align 2
-       .globl EXT(sync_tlb_flush)
-LEXT(sync_tlb_flush)
-       SYNC_TLB_FLUSH
-       ret
-
-
-.macro FLUSH_MMU_TLB
-       tlbi    vmalle1is
-.endmacro
-/*
- *     void flush_mmu_tlb_async(void)
- *
- *             Flush all TLBs, don't wait for completion
- */
-       .text
-       .align 2
-       .globl EXT(flush_mmu_tlb_async)
-LEXT(flush_mmu_tlb_async)
-       FLUSH_MMU_TLB
-       ret
-
-/*
- *     void flush_mmu_tlb(void)
- *
- *             Flush all TLBs
- */
-       .text
-       .align 2
-       .globl EXT(flush_mmu_tlb)
-LEXT(flush_mmu_tlb)
-       FLUSH_MMU_TLB
-       SYNC_TLB_FLUSH
-       ret
-
-.macro FLUSH_CORE_TLB
-       tlbi    vmalle1
-.endmacro
-
-/*
- *     void flush_core_tlb_async(void)
- *
- *             Flush local core TLB, don't wait for completion
- */
-       .text
-       .align 2
-       .globl EXT(flush_core_tlb_async)
-LEXT(flush_core_tlb_async)
-       FLUSH_CORE_TLB
-       ret
-
-/*
- *     void flush_core_tlb(void)
- *
- *             Flush local core TLB
- */
-       .text
-       .align 2
-       .globl EXT(flush_core_tlb)
-LEXT(flush_core_tlb)
-       FLUSH_CORE_TLB
-       SYNC_TLB_FLUSH
-       ret
-
-.macro FLUSH_MMU_TLB_ALLENTRIES
-#if __ARM_16K_PG__
-       and             x0, x0, #~0x3
-
-       /*
-        * The code below is not necessarily correct.  From an overview of
-        * the client code, the expected contract for TLB flushes is that
-        * we will expand from an "address, length" pair to "start address,
-        * end address" in the course of a TLB flush.  This suggests that
-        * a flush for "X, X+4" is actually only asking for a flush of a
-        * single 16KB page.  At the same time, we'd like to be prepared
-        * for bad inputs (X, X+3), so add 3 and then truncate the 4KB page
-        * number to a 16KB page boundary.  This should deal correctly with
-        * unaligned inputs.
-        *
-        * If our expecations about client behavior are wrong however, this
-        * will lead to occasional TLB corruption on platforms with 16KB
-        * pages.
-        */
-       add             x1, x1, #0x3
-       and             x1, x1, #~0x3
-#endif
-1: // Lflush_mmu_tlb_allentries_loop:
-       tlbi    vaae1is, x0
-       add             x0, x0, #(ARM_PGBYTES / 4096)   // Units are 4KB pages, as defined by the ISA
-       cmp             x0, x1
-       b.lt    1b // Lflush_mmu_tlb_allentries_loop
-.endmacro
-
-/*
- *     void flush_mmu_tlb_allentries_async(uint64_t, uint64_t)
- *
- *             Flush TLB entries, don't wait for completion
- */
-       .text
-       .align 2
-       .globl EXT(flush_mmu_tlb_allentries_async)
-LEXT(flush_mmu_tlb_allentries_async)
-       FLUSH_MMU_TLB_ALLENTRIES
-       ret
-
-/*
- *     void flush_mmu_tlb_allentries(uint64_t, uint64_t)
- *
- *             Flush TLB entries
- */
-       .globl EXT(flush_mmu_tlb_allentries)
-LEXT(flush_mmu_tlb_allentries)
-       FLUSH_MMU_TLB_ALLENTRIES
-       SYNC_TLB_FLUSH
-       ret
-
-.macro FLUSH_MMU_TLB_ENTRY
-#if __ARM_KERNEL_PROTECT__
-       /*
-        * If we are flushing ASID 0, this is a kernel operation.  With this
-        * ASID scheme, this means we should flush all ASIDs.
-        */
-       lsr             x2, x0, #TLBI_ASID_SHIFT
-       cmp             x2, #0
-       b.eq            1f // Lflush_mmu_tlb_entry_globally
-
-       bic             x0, x0, #(1 << TLBI_ASID_SHIFT)
-       tlbi    vae1is, x0
-       orr             x0, x0, #(1 << TLBI_ASID_SHIFT)
-#endif /* __ARM_KERNEL_PROTECT__ */
-       tlbi    vae1is, x0
-#if __ARM_KERNEL_PROTECT__
-       b               2f // Lflush_mmu_tlb_entry_done
-1: // Lflush_mmu_tlb_entry_globally:
-       tlbi    vaae1is, x0
-2: // Lflush_mmu_tlb_entry_done
-#endif /* __ARM_KERNEL_PROTECT__ */
-.endmacro
-/*
- *     void flush_mmu_tlb_entry_async(uint64_t)
- *
- *             Flush TLB entry, don't wait for completion
- */
-       .text
-       .align 2
-       .globl EXT(flush_mmu_tlb_entry_async)
-LEXT(flush_mmu_tlb_entry_async)
-       FLUSH_MMU_TLB_ENTRY
-       ret
-
-/*
- *     void flush_mmu_tlb_entry(uint64_t)
- *
- *             Flush TLB entry
- */
-       .text
-       .align 2
-       .globl EXT(flush_mmu_tlb_entry)
-LEXT(flush_mmu_tlb_entry)
-       FLUSH_MMU_TLB_ENTRY
-       SYNC_TLB_FLUSH
-       ret
-
-.macro FLUSH_MMU_TLB_ENTRIES
-#if __ARM_16K_PG__
-       and             x0, x0, #~0x3
-
-       /*
-        * The code below is not necessarily correct.  From an overview of
-        * the client code, the expected contract for TLB flushes is that
-        * we will expand from an "address, length" pair to "start address,
-        * end address" in the course of a TLB flush.  This suggests that
-        * a flush for "X, X+4" is actually only asking for a flush of a
-        * single 16KB page.  At the same time, we'd like to be prepared
-        * for bad inputs (X, X+3), so add 3 and then truncate the 4KB page
-        * number to a 16KB page boundary.  This should deal correctly with
-        * unaligned inputs.
-        *
-        * If our expectations about client behavior are wrong, however, this
-        * will lead to occasional TLB corruption on platforms with 16KB
-        * pages.
-        */
-       add             x1, x1, #0x3
-       and             x1, x1, #~0x3
-#endif /* __ARM_16K_PG__ */
-#if __ARM_KERNEL_PROTECT__
-       /*
-        * If we are flushing ASID 0, this is a kernel operation.  With this
-        * ASID scheme, this means we should flush all ASIDs.
-        */
-       lsr             x2, x0, #TLBI_ASID_SHIFT
-       cmp             x2, #0
-       b.eq            2f // Lflush_mmu_tlb_entries_globally_loop
-
-       bic             x0, x0, #(1 << TLBI_ASID_SHIFT)
-#endif /* __ARM_KERNEL_PROTECT__ */
-1: // Lflush_mmu_tlb_entries_loop
-       tlbi    vae1is, x0
-#if __ARM_KERNEL_PROTECT__
-       orr             x0, x0, #(1 << TLBI_ASID_SHIFT)
-       tlbi    vae1is, x0
-       bic             x0, x0, #(1 << TLBI_ASID_SHIFT)
-#endif /* __ARM_KERNEL_PROTECT__ */
-       add             x0, x0, #(ARM_PGBYTES / 4096)   // Units are pages
-       cmp             x0, x1
-       b.lt            1b // Lflush_mmu_tlb_entries_loop
-#if __ARM_KERNEL_PROTECT__
-       b               3f // Lflush_mmu_tlb_entries_done
-2: // Lflush_mmu_tlb_entries_globally_loop:
-       tlbi    vaae1is, x0
-       add             x0, x0, #(ARM_PGBYTES / 4096)   // Units are pages
-       cmp             x0, x1
-       b.lt            2b // Lflush_mmu_tlb_entries_globally_loop
-3: // Lflush_mmu_tlb_entries_done
-#endif /* __ARM_KERNEL_PROTECT__ */
-.endmacro
-
-/*
- *     void flush_mmu_tlb_entries_async(uint64_t, uint64_t)
- *
- *             Flush TLB entries, don't wait for completion
- */
-       .text
-       .align 2
-       .globl EXT(flush_mmu_tlb_entries_async)
-LEXT(flush_mmu_tlb_entries_async)
-       FLUSH_MMU_TLB_ENTRIES
-       ret
-
-/*
- *     void flush_mmu_tlb_entries(uint64_t, uint64_t)
- *
- *             Flush TLB entries
- */
-       .text
-       .align 2
-       .globl EXT(flush_mmu_tlb_entries)
-LEXT(flush_mmu_tlb_entries)
-       FLUSH_MMU_TLB_ENTRIES
-       SYNC_TLB_FLUSH
-       ret
-
-.macro FLUSH_MMU_TLB_ASID
-#if __ARM_KERNEL_PROTECT__
-       /*
-        * If we are flushing ASID 0, this is a kernel operation.  With this
-        * ASID scheme, this means we should flush all ASIDs.
-        */
-       lsr             x1, x0, #TLBI_ASID_SHIFT
-       cmp             x1, #0
-       b.eq            1f // Lflush_mmu_tlb_globally
-
-       bic             x0, x0, #(1 << TLBI_ASID_SHIFT)
-       tlbi    aside1is, x0
-       orr             x0, x0, #(1 << TLBI_ASID_SHIFT)
-#endif /* __ARM_KERNEL_PROTECT__ */
-       tlbi    aside1is, x0
-#if __ARM_KERNEL_PROTECT__
-       b               2f // Lflush_mmu_tlb_asid_done
-1: // Lflush_mmu_tlb_globally:
-       tlbi    vmalle1is
-2: // Lflush_mmu_tlb_asid_done:
-#endif /* __ARM_KERNEL_PROTECT__ */
-.endmacro
-
-/*
- *     void flush_mmu_tlb_asid_async(uint64_t)
- *
- *             Flush TLB entries for requested asid, don't wait for completion
- */
-       .text
-       .align 2
-       .globl EXT(flush_mmu_tlb_asid_async)
-LEXT(flush_mmu_tlb_asid_async)
-       FLUSH_MMU_TLB_ASID
-       ret
-
-/*
- *     void flush_mmu_tlb_asid(uint64_t)
- *
- *             Flush TLB entries for requested asid
- */
-       .text
-       .align 2
-       .globl EXT(flush_mmu_tlb_asid)
-LEXT(flush_mmu_tlb_asid)
-       FLUSH_MMU_TLB_ASID
-       SYNC_TLB_FLUSH
-       ret
-
-.macro FLUSH_CORE_TLB_ASID
-#if __ARM_KERNEL_PROTECT__
-       /*
-        * If we are flushing ASID 0, this is a kernel operation.  With this
-        * ASID scheme, this means we should flush all ASIDs.
-        */
-       lsr             x1, x0, #TLBI_ASID_SHIFT
-       cmp             x1, #0
-       b.eq            1f // Lflush_core_tlb_asid_globally
-
-       bic             x0, x0, #(1 << TLBI_ASID_SHIFT)
-       tlbi    aside1, x0
-       orr             x0, x0, #(1 << TLBI_ASID_SHIFT)
-#endif /* __ARM_KERNEL_PROTECT__ */
-       tlbi    aside1, x0
-#if __ARM_KERNEL_PROTECT__
-       b               2f // Lflush_core_tlb_asid_done
-1: // Lflush_core_tlb_asid_globally:
-       tlbi    vmalle1
-2: // Lflush_core_tlb_asid_done:
-#endif /* __ARM_KERNEL_PROTECT__ */
-.endmacro
-
-/*
- *     void flush_core_tlb_asid_async(uint64_t)
- *
- *             Flush TLB entries for core for requested asid, don't wait for completion
- */
-       .text
-       .align 2
-       .globl EXT(flush_core_tlb_asid_async)
-LEXT(flush_core_tlb_asid_async)
-       FLUSH_CORE_TLB_ASID
-       ret
-/*
- *     void flush_core_tlb_asid(uint64_t)
- *
- *             Flush TLB entries for core for requested asid
- */
-       .text
-       .align 2
-       .globl EXT(flush_core_tlb_asid)
-LEXT(flush_core_tlb_asid)
-       FLUSH_CORE_TLB_ASID
-       SYNC_TLB_FLUSH
-       ret
-
 /*
  *     Set MMU Translation Table Base Alternate
  */
@@ -566,7 +227,7 @@ LEXT(set_tcr)
        cbnz    x1, L_set_tcr_panic
 #if defined(KERNEL_INTEGRITY_KTRR)
        mov             x1, lr
-       bl              _pinst_set_tcr
+       bl              EXT(pinst_set_tcr)
        mov             lr, x1
 #else
        msr             TCR_EL1, x0
@@ -598,7 +259,7 @@ L_set_locked_reg_panic_str:
 #else
 #if defined(KERNEL_INTEGRITY_KTRR)
        mov             x1, lr
-       bl              _pinst_set_tcr
+       bl              EXT(pinst_set_tcr)
        mov             lr, x1
 #else
        msr             TCR_EL1, x0
@@ -683,6 +344,11 @@ L_mmu_kvtop_wpreflight_invalid:
        mrs             $0, TPIDR_EL1                                   // Load thread pointer
        adrp    $2, $3@page                                             // Load the recovery handler address
        add             $2, $2, $3@pageoff
+#if defined(HAS_APPLE_PAC)
+       add             $1, $0, TH_RECOVER
+       movk    $1, #PAC_DISCRIMINATOR_RECOVER, lsl 48
+       pacia   $2, $1                                                  // Sign with IAKey + blended discriminator
+#endif
 
        ldr             $1, [$0, TH_RECOVER]                    // Save previous recovery handler
        str             $2, [$0, TH_RECOVER]                    // Set new signed recovery handler
@@ -744,35 +410,94 @@ LEXT(_bcopyin)
        ARM64_STACK_EPILOG
 
 /*
- * int _copyin_word(const char *src, uint64_t *dst, vm_size_t len)
+ * int _copyin_atomic32(const char *src, uint32_t *dst)
  */
        .text
        .align 2
-       .globl EXT(_copyin_word)
-LEXT(_copyin_word)
+       .globl EXT(_copyin_atomic32)
+LEXT(_copyin_atomic32)
        ARM64_STACK_PROLOG
        PUSH_FRAME
        SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
-       cmp             x2, #4
-       b.eq    L_copyin_word_4
-       cmp             x2, #8
-       b.eq    L_copyin_word_8
-       mov             x0, EINVAL
-       b               L_copying_exit
-L_copyin_word_4:
        ldr             w8, [x0]
-       b               L_copyin_word_store
-L_copyin_word_8:
+       str             w8, [x1]
+       mov             x0, #0
+       CLEAR_RECOVERY_HANDLER x10, x11
+       POP_FRAME
+       ARM64_STACK_EPILOG
+
+/*
+ * int _copyin_atomic32_wait_if_equals(const char *src, uint32_t value)
+ */
+       .text
+       .align 2
+       .globl EXT(_copyin_atomic32_wait_if_equals)
+LEXT(_copyin_atomic32_wait_if_equals)
+       ARM64_STACK_PROLOG
+       PUSH_FRAME
+       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       ldxr            w8, [x0]
+       cmp             w8, w1
+       mov             x0, ESTALE
+       b.ne            1f
+       mov             x0, #0
+       wfe
+1:
+       clrex
+       CLEAR_RECOVERY_HANDLER x10, x11
+       POP_FRAME
+       ARM64_STACK_EPILOG
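
The contract of this routine, sketched in C (the real version runs with a fault-recovery handler installed, and arms the exclusive monitor with ldxr so that a store to the word delivers the wfe wakeup event):

    #include <errno.h>  /* ESTALE */
    #include <stdint.h>

    /* Load a user 32-bit word; if it no longer equals 'value',
     * return ESTALE so the caller re-evaluates. Otherwise return 0
     * after waiting (the wfe in the assembly above). */
    static int
    copyin_atomic32_wait_if_equals_sketch(const uint32_t *src, uint32_t value)
    {
            uint32_t cur = __atomic_load_n(src, __ATOMIC_RELAXED);
            if (cur != value) {
                    return ESTALE;
            }
            /* assembly: wfe with the exclusive monitor armed, then clrex */
            return 0;
    }
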
+
+/*
+ * int _copyin_atomic64(const char *src, uint64_t *dst)
+ */
+       .text
+       .align 2
+       .globl EXT(_copyin_atomic64)
+LEXT(_copyin_atomic64)
+       ARM64_STACK_PROLOG
+       PUSH_FRAME
+       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
        ldr             x8, [x0]
-L_copyin_word_store:
        str             x8, [x1]
        mov             x0, #0
        CLEAR_RECOVERY_HANDLER x10, x11
-L_copying_exit:
        POP_FRAME
        ARM64_STACK_EPILOG
 
 
+/*
+ * int _copyout_atomic32(uint32_t value, char *dst)
+ */
+       .text
+       .align 2
+       .globl EXT(_copyout_atomic32)
+LEXT(_copyout_atomic32)
+       ARM64_STACK_PROLOG
+       PUSH_FRAME
+       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       str             w0, [x1]
+       mov             x0, #0
+       CLEAR_RECOVERY_HANDLER x10, x11
+       POP_FRAME
+       ARM64_STACK_EPILOG
+
+/*
+ * int _copyout_atomic64(uint64_t value, char *dst)
+ */
+       .text
+       .align 2
+       .globl EXT(_copyout_atomic64)
+LEXT(_copyout_atomic64)
+       ARM64_STACK_PROLOG
+       PUSH_FRAME
+       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       str             x0, [x1]
+       mov             x0, #0
+       CLEAR_RECOVERY_HANDLER x10, x11
+       POP_FRAME
+       ARM64_STACK_EPILOG
+
 
 /*
  * int _bcopyout(const char *src, char *dst, vm_size_t len)
@@ -825,6 +550,11 @@ LEXT(_bcopyinstr)
        mrs             x10, TPIDR_EL1                          // Get thread pointer
        ldr             x11, [x10, TH_RECOVER]          // Save previous recover
 
+#if defined(HAS_APPLE_PAC)
+       add             x5, x10, TH_RECOVER             // Sign new pointer with IAKey + blended discriminator
+       movk    x5, #PAC_DISCRIMINATOR_RECOVER, lsl 48
+       pacia   x4, x5
+#endif
        str             x4, [x10, TH_RECOVER]           // Store new recover
 
        mov             x4, #0                                          // x4 - total bytes copied
@@ -940,8 +670,8 @@ LEXT(arm_debug_set_cp14)
 LEXT(arm64_prepare_for_sleep)
        PUSH_FRAME
 
-#if defined(APPLECYCLONE) || defined(APPLETYPHOON)
-       // <rdar://problem/15827409> CPU1 Stuck in WFIWT Because of MMU Prefetch
+#if defined(APPLETYPHOON)
+       // <rdar://problem/15827409>
        mrs             x0, ARM64_REG_HID2                              // Read HID2
        orr             x0, x0, #(ARM64_REG_HID2_disMMUmtlbPrefetch)    // Set HID.DisableMTLBPrefetch
        msr             ARM64_REG_HID2, x0                              // Write HID2
@@ -1022,16 +752,16 @@ LEXT(arm64_force_wfi_clock_gate)
 
 
 
-#if defined(APPLECYCLONE) || defined(APPLETYPHOON)
+#if defined(APPLETYPHOON)
 
        .text
        .align 2
-       .globl EXT(cyclone_typhoon_prepare_for_wfi)
+       .globl EXT(typhoon_prepare_for_wfi)
 
-LEXT(cyclone_typhoon_prepare_for_wfi)
+LEXT(typhoon_prepare_for_wfi)
        PUSH_FRAME
 
-       // <rdar://problem/15827409> CPU1 Stuck in WFIWT Because of MMU Prefetch
+       // <rdar://problem/15827409>
        mrs             x0, ARM64_REG_HID2                              // Read HID2
        orr             x0, x0, #(ARM64_REG_HID2_disMMUmtlbPrefetch)    // Set HID.DisableMTLBPrefetch
        msr             ARM64_REG_HID2, x0                              // Write HID2
@@ -1044,11 +774,11 @@ LEXT(cyclone_typhoon_prepare_for_wfi)
 
        .text
        .align 2
-       .globl EXT(cyclone_typhoon_return_from_wfi)
-LEXT(cyclone_typhoon_return_from_wfi)
+       .globl EXT(typhoon_return_from_wfi)
+LEXT(typhoon_return_from_wfi)
        PUSH_FRAME
 
-       // <rdar://problem/15827409> CPU1 Stuck in WFIWT Because of MMU Prefetch
+       // <rdar://problem/15827409>
        mrs             x0, ARM64_REG_HID2                              // Read HID2
        mov             x1, #(ARM64_REG_HID2_disMMUmtlbPrefetch)        //
       bic             x0, x0, x1                                      // Clear HID.DisableMTLBPrefetch
@@ -1204,7 +934,7 @@ LEXT(arm64_replace_bootstack)
        // Set SP_EL1 to exception stack
 #if defined(KERNEL_INTEGRITY_KTRR)
        mov             x1, lr
-       bl              _pinst_spsel_1
+       bl              EXT(pinst_spsel_1)
        mov             lr, x1
 #else
        msr             SPSel, #1
@@ -1233,5 +963,84 @@ LEXT(monitor_call)
        ret
 #endif
 
+#ifdef HAS_APPLE_PAC
+/**
+ * void ml_sign_thread_state(arm_saved_state_t *ss, uint64_t pc,
+ *                                                      uint32_t cpsr, uint64_t lr, uint64_t x16,
+ *                                                      uint64_t x17)
+ */
+       .text
+       .align 2
+       .globl EXT(ml_sign_thread_state)
+LEXT(ml_sign_thread_state)
+       pacga   x1, x1, x0              /* PC hash (gkey + &arm_saved_state) */
+       /*
+        * Mask off the carry flag so we don't need to re-sign when that flag is
+        * touched by the system call return path.
+        */
+       bic             x2, x2, PSR_CF
+       pacga   x1, x2, x1              /* SPSR hash (gkey + pc hash) */
+       pacga   x1, x3, x1              /* LR Hash (gkey + spsr hash) */
+       pacga   x1, x4, x1              /* X16 hash (gkey + lr hash) */
+       pacga   x1, x5, x1              /* X17 hash (gkey + x16 hash) */
+       str             x1, [x0, SS64_JOPHASH]
+       ret
+
+/**
+ * void ml_check_signed_state(arm_saved_state_t *ss, uint64_t pc,
+ *                                                       uint32_t cpsr, uint64_t lr, uint64_t x16,
+ *                                                       uint64_t x17)
+ */
+       .text
+       .align 2
+       .globl EXT(ml_check_signed_state)
+LEXT(ml_check_signed_state)
+       pacga   x1, x1, x0              /* PC hash (gkey + &arm_saved_state) */
+       /*
+        * Mask off the carry flag so we don't need to re-sign when that flag is
+        * touched by the system call return path.
+        */
+       bic             x2, x2, PSR_CF
+       pacga   x1, x2, x1              /* SPSR hash (gkey + pc hash) */
+       pacga   x1, x3, x1              /* LR Hash (gkey + spsr hash) */
+       pacga   x1, x4, x1              /* X16 hash (gkey + lr hash) */
+       pacga   x1, x5, x1              /* X17 hash (gkey + x16 hash) */
+       ldr             x2, [x0, SS64_JOPHASH]
+       cmp             x1, x2
+       b.ne    Lcheck_hash_panic
+       ret
+Lcheck_hash_panic:
+       mov             x1, x0
+       adr             x0, Lcheck_hash_str
+       CALL_EXTERN panic_with_thread_kernel_state
+Lcheck_hash_str:
+       .asciz "JOP Hash Mismatch Detected (PC, CPSR, or LR corruption)"
+#endif /* HAS_APPLE_PAC */
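
Both routines compute the same chained hash over (pc, cpsr, lr, x16, x17); a C sketch of the chaining, with mix() standing in for the pacga instruction (pacga is keyed by the kernel G-key in hardware, so this mixer is purely illustrative):

    #include <stdint.h>

    #define PSR_CF 0x20000000u /* carry flag, masked as noted above */

    /* Stand-in for pacga: a keyed 64-bit hash of (data, salt). */
    static uint64_t
    mix(uint64_t data, uint64_t salt)
    {
            data ^= salt + 0x9e3779b97f4a7c15ULL + (data << 6) + (data >> 2);
            return data * 0xff51afd7ed558ccdULL;
    }

    static uint64_t
    thread_state_hash(uint64_t ss_addr, uint64_t pc, uint32_t cpsr,
        uint64_t lr, uint64_t x16, uint64_t x17)
    {
            uint64_t h = mix(pc, ss_addr);  /* PC, salted with &ss */
            h = mix(cpsr & ~PSR_CF, h);     /* SPSR, carry masked  */
            h = mix(lr, h);                 /* LR                  */
            h = mix(x16, h);
            h = mix(x17, h);
            return h; /* sign: store at SS64_JOPHASH; check: compare, panic on mismatch */
    }
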
+
+       .text
+       .align 2
+       .globl EXT(fill32_dczva)
+LEXT(fill32_dczva)
+0:
+       dc      zva, x0
+       add     x0, x0, #64
+       subs    x1, x1, #64
+       b.hi    0b
+       ret
+
+       .text
+       .align 2
+       .globl EXT(fill32_nt)
+LEXT(fill32_nt)
+       dup.4s  v0, w2
+0:
+       stnp    q0, q0, [x0]
+       stnp    q0, q0, [x0, #0x20]
+       stnp    q0, q0, [x0, #0x40]
+       stnp    q0, q0, [x0, #0x60]
+       add     x0, x0, #128
+       subs    x1, x1, #128
+       b.hi    0b
+       ret
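
C equivalents of the two fill loops (a sketch; as in the assembly, the caller is responsible for alignment and for sizes that are nonzero multiples of the store granule):

    #include <stdint.h>
    #include <string.h>

    /* fill32_dczva: zero 'size' bytes 64 at a time via dc zva;
     * addr and size are assumed 64-byte aligned. */
    static void
    fill32_dczva_sketch(void *addr, uint64_t size)
    {
            memset(addr, 0, size);
    }

    /* fill32_nt: splat a 32-bit pattern using non-temporal 128-byte
     * stores (stnp); addr and size are assumed 128-byte aligned. */
    static void
    fill32_nt_sketch(uint32_t *addr, uint64_t size, uint32_t pattern)
    {
            for (uint64_t i = 0; i < size / sizeof(uint32_t); i++) {
                    addr[i] = pattern; /* stnp bypasses the caches */
            }
    }
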
 
 /* vim: set sw=4 ts=4: */
index 878c879733c45d93cde6232d6e02b5bba90b3ed6..25895247fc7b1bf92ef21fc53cefe1f9f22323a5 100644 (file)
@@ -284,6 +284,11 @@ core_idle(__unused cpu_data_t *cpu)
 
 #pragma mark common hooks
 
+void
+mt_early_init(void)
+{
+}
+
 void
 mt_cpu_idle(cpu_data_t *cpu)
 {
@@ -332,15 +337,26 @@ mt_wake_per_core(void)
 {
 }
 
+uint64_t
+mt_count_pmis(void)
+{
+       uint64_t npmis = 0;
+       int max_cpu = ml_get_max_cpu_number();
+       for (int i = 0; i <= max_cpu; i++) {
+               cpu_data_t *cpu = (cpu_data_t *)CpuDataEntries[i].cpu_data_vaddr;
+               npmis += cpu->cpu_monotonic.mtc_npmis;
+       }
+       return npmis;
+}
+
 static void
 mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0)
 {
        assert(cpu != NULL);
        assert(ml_get_interrupts_enabled() == FALSE);
 
-       os_atomic_inc(&mt_pmis, relaxed);
-       cpu->cpu_stat.pmi_cnt++;
-       cpu->cpu_stat.pmi_cnt_wake++;
+       cpu->cpu_monotonic.mtc_npmis += 1;
+       cpu->cpu_stat.pmi_cnt_wake += 1;
 
 #if MONOTONIC_DEBUG
        if (!PMCR0_PMI(pmcr0)) {
@@ -444,7 +460,7 @@ mt_microstackshot_start_remote(__unused void *arg)
 
        core_set_enabled();
 
-       if (hw_atomic_sub(&mt_xc_sync, 1) == 0) {
+       if (os_atomic_dec(&mt_xc_sync, relaxed) == 0) {
                thread_wakeup((event_t)&mt_xc_sync);
        }
 }
index 5904e612ffc4f938f234a2fe50395327bf74ed36..4303f45fe81c5967db4fadb0327b8d38416aebb4 100644 (file)
@@ -61,7 +61,6 @@
 
 #include <sys/kdebug.h>
 
-
 #define USER_SS_ZONE_ALLOC_SIZE (0x4000)
 
 extern int debug_task;
@@ -70,7 +69,7 @@ zone_t ads_zone;     /* zone for debug_state area */
 zone_t user_ss_zone; /* zone for user arm_context_t allocations */
 
 /*
- * Routine:    consider_machine_collect
+ * Routine: consider_machine_collect
  *
  */
 void
@@ -80,7 +79,7 @@ consider_machine_collect(void)
 }
 
 /*
- * Routine:    consider_machine_adjust
+ * Routine: consider_machine_adjust
  *
  */
 void
@@ -88,22 +87,22 @@ consider_machine_adjust(void)
 {
 }
 
+
 /*
- * Routine:    machine_switch_context
+ * Routine: machine_switch_context
  *
  */
 thread_t
-machine_switch_context(
-                      thread_t old,
-                      thread_continue_t continuation,
-                      thread_t new)
+machine_switch_context(thread_t old,
+                       thread_continue_t continuation,
+                       thread_t new)
 {
        thread_t retval;
-       pmap_t          new_pmap;
-       cpu_data_t      *cpu_data_ptr;
+       pmap_t       new_pmap;
+       cpu_data_t * cpu_data_ptr;
 
-#define machine_switch_context_kprintf(x...)   /* kprintf("machine_switch_con
-                                                * text: " x) */
+#define machine_switch_context_kprintf(x...) \
+       /* kprintf("machine_switch_context: " x) */
 
        cpu_data_ptr = getCpuDatap();
        if (old == new)
@@ -112,10 +111,12 @@ machine_switch_context(
        kpc_off_cpu(old);
 
 
+
        new_pmap = new->map->pmap;
        if (old->map->pmap != new_pmap)
                pmap_switch(new_pmap);
 
+
        new->machine.CpuDatap = cpu_data_ptr;
 
        /* TODO: Should this be ordered? */
@@ -130,19 +131,25 @@ machine_switch_context(
        return retval;
 }
 
+boolean_t
+machine_thread_on_core(thread_t thread)
+{
+       return thread->machine.machine_thread_flags & MACHINE_THREAD_FLAGS_ON_CPU;
+}
+
 /*
- * Routine:    machine_thread_create
+ * Routine: machine_thread_create
  *
  */
 kern_return_t
-machine_thread_create(
-                     thread_t thread,
-                     task_t task)
+machine_thread_create(thread_t thread,
+                      task_t task)
 {
        arm_context_t *thread_user_ss = NULL;
        kern_return_t result = KERN_SUCCESS;
 
-#define machine_thread_create_kprintf(x...)    /* kprintf("machine_thread_create: " x) */
+#define machine_thread_create_kprintf(x...) \
+       /* kprintf("machine_thread_create: " x) */
 
        machine_thread_create_kprintf("thread = %x\n", thread);
 
@@ -152,6 +159,10 @@ machine_thread_create(
        thread->machine.preemption_count = 0;
        thread->machine.cthread_self = 0;
        thread->machine.cthread_data = 0;
+#if defined(HAS_APPLE_PAC)
+       thread->machine.rop_pid = task->rop_pid;
+       thread->machine.disable_user_jop = task->disable_user_jop;
+#endif
 
 
        if (task != kernel_task) {
@@ -159,7 +170,8 @@ machine_thread_create(
                thread->machine.contextData = (arm_context_t *)zalloc(user_ss_zone);
 
                if (!thread->machine.contextData) {
-                       return KERN_FAILURE;
+                       result = KERN_FAILURE;
+                       goto done;
                }
 
                thread->machine.upcb = &thread->machine.contextData->ss;
@@ -176,34 +188,38 @@ machine_thread_create(
                        thread->machine.uNeon->nsh.flavor = ARM_NEON_SAVED_STATE32;
                        thread->machine.uNeon->nsh.count = ARM_NEON_SAVED_STATE32_COUNT;
                }
+
        } else {
                thread->machine.upcb = NULL;
                thread->machine.uNeon = NULL;
                thread->machine.contextData = NULL;
        }
 
-       bzero(&thread->machine.perfctrl_state, sizeof(thread->machine.perfctrl_state));
 
+       bzero(&thread->machine.perfctrl_state, sizeof(thread->machine.perfctrl_state));
        result = machine_thread_state_initialize(thread);
 
+done:
        if (result != KERN_SUCCESS) {
                thread_user_ss = thread->machine.contextData;
-               thread->machine.upcb = NULL;
-               thread->machine.uNeon = NULL;
-               thread->machine.contextData = NULL;
-               zfree(user_ss_zone, thread_user_ss);
+
+               if (thread_user_ss) {
+                       thread->machine.upcb = NULL;
+                       thread->machine.uNeon = NULL;
+                       thread->machine.contextData = NULL;
+                       zfree(user_ss_zone, thread_user_ss);
+               }
        }
 
        return result;
 }
 
 /*
- * Routine:    machine_thread_destroy
+ * Routine: machine_thread_destroy
  *
  */
 void
-machine_thread_destroy(
-                      thread_t thread)
+machine_thread_destroy(thread_t thread)
 {
        arm_context_t *thread_user_ss;
 
@@ -213,6 +229,8 @@ machine_thread_destroy(
                thread->machine.upcb = NULL;
                thread->machine.uNeon = NULL;
                thread->machine.contextData = NULL;
+
+
                zfree(user_ss_zone, thread_user_ss);
        }
 
@@ -227,7 +245,7 @@ machine_thread_destroy(
 
 
 /*
- * Routine:    machine_thread_init
+ * Routine: machine_thread_init
  *
  */
 void
@@ -251,11 +269,12 @@ machine_thread_init(void)
                             CONFIG_THREAD_MAX * (sizeof(arm_context_t)),
                             USER_SS_ZONE_ALLOC_SIZE,
                             "user save state");
+
 }
 
 
 /*
- * Routine:    get_useraddr
+ * Routine: get_useraddr
  *
  */
 user_addr_t
@@ -265,17 +284,16 @@ get_useraddr()
 }
 
 /*
- * Routine:    machine_stack_detach
+ * Routine: machine_stack_detach
  *
  */
 vm_offset_t
-machine_stack_detach(
-                    thread_t thread)
+machine_stack_detach(thread_t thread)
 {
-       vm_offset_t     stack;
+       vm_offset_t stack;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_DETACH),
-                    (uintptr_t)thread_tid(thread), thread->priority, thread->sched_pri, 0, 0);
+                    (uintptr_t)thread_tid(thread), thread->priority, thread->sched_pri, 0, 0);
 
        stack = thread->kernel_stack;
        thread->kernel_stack = 0;
@@ -286,21 +304,22 @@ machine_stack_detach(
 
 
 /*
- * Routine:    machine_stack_attach
+ * Routine: machine_stack_attach
  *
  */
 void
-machine_stack_attach(
-                    thread_t thread,
-                    vm_offset_t stack)
+machine_stack_attach(thread_t thread,
+                     vm_offset_t stack)
 {
        struct arm_context *context;
        struct arm_saved_state64 *savestate;
+       uint32_t current_el;
 
-#define machine_stack_attach_kprintf(x...)     /* kprintf("machine_stack_attach: " x) */
+#define machine_stack_attach_kprintf(x...) \
+       /* kprintf("machine_stack_attach: " x) */
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_ATTACH),
-                    (uintptr_t)thread_tid(thread), thread->priority, thread->sched_pri, 0, 0);
+                    (uintptr_t)thread_tid(thread), thread->priority, thread->sched_pri, 0, 0);
 
        thread->kernel_stack = stack;
        thread->machine.kstackptr = stack + kernel_stack_size - sizeof(struct thread_kernel_state);
@@ -308,28 +327,66 @@ machine_stack_attach(
 
        machine_stack_attach_kprintf("kstackptr: %lx\n", (vm_address_t)thread->machine.kstackptr);
 
+       current_el = (uint32_t) __builtin_arm_rsr64("CurrentEL");
        context = &((thread_kernel_state_t) thread->machine.kstackptr)->machine;
        savestate = saved_state64(&context->ss);
        savestate->fp = 0;
-       savestate->lr = (uintptr_t)thread_continue;
        savestate->sp = thread->machine.kstackptr;
-       savestate->cpsr = PSR64_KERNEL_DEFAULT;
+#if defined(HAS_APPLE_PAC)
+       /* Sign the initial kernel stack saved state */
+       const uint32_t default_cpsr = PSR64_KERNEL_DEFAULT & ~PSR64_MODE_EL_MASK;
+       asm volatile (
+               "mov    x0, %[ss]"                              "\n"
+
+               "mov    x1, xzr"                                "\n"
+               "str    x1, [x0, %[SS64_PC]]"                   "\n"
+
+               "mov    x2, %[default_cpsr_lo]"                 "\n"
+               "movk   x2, %[default_cpsr_hi], lsl #16"        "\n"
+               "mrs    x3, CurrentEL"                          "\n"
+               "orr    w2, w2, w3"                             "\n"
+               "str    w2, [x0, %[SS64_CPSR]]"                 "\n"
+
+               "adrp   x3, _thread_continue@page"              "\n"
+               "add    x3, x3, _thread_continue@pageoff"       "\n"
+               "str    x3, [x0, %[SS64_LR]]"                   "\n"
+
+               "mov    x4, xzr"                                "\n"
+               "mov    x5, xzr"                                "\n"
+               "stp    x4, x5, [x0, %[SS64_X16]]"              "\n"
+
+               "mov    x6, lr"                                 "\n"
+               "bl     _ml_sign_thread_state"                  "\n"
+               "mov    lr, x6"                                 "\n"
+               :
+               : [ss]                  "r"(&context->ss),
+                 [default_cpsr_lo]     "M"(default_cpsr & 0xFFFF),
+                 [default_cpsr_hi]     "M"(default_cpsr >> 16),
+                 [SS64_X16]            "i"(offsetof(struct arm_saved_state, ss_64.x[16])),
+                 [SS64_PC]             "i"(offsetof(struct arm_saved_state, ss_64.pc)),
+                 [SS64_CPSR]           "i"(offsetof(struct arm_saved_state, ss_64.cpsr)),
+                 [SS64_LR]             "i"(offsetof(struct arm_saved_state, ss_64.lr))
+               : "x0", "x1", "x2", "x3", "x4", "x5", "x6"
+       );
+#else
+       savestate->lr = (uintptr_t)thread_continue;
+       savestate->cpsr = (PSR64_KERNEL_DEFAULT & ~PSR64_MODE_EL_MASK) | current_el;
+#endif /* defined(HAS_APPLE_PAC) */
        machine_stack_attach_kprintf("thread = %p pc = %llx, sp = %llx\n", thread, savestate->lr, savestate->sp);
 }
 
 
 /*
- * Routine:    machine_stack_handoff
+ * Routine: machine_stack_handoff
  *
  */
 void
-machine_stack_handoff(
-                     thread_t old,
-                     thread_t new)
+machine_stack_handoff(thread_t old,
+                      thread_t new)
 {
-       vm_offset_t     stack;
-       pmap_t          new_pmap;
-       cpu_data_t      *cpu_data_ptr;
+       vm_offset_t  stack;
+       pmap_t       new_pmap;
+       cpu_data_t * cpu_data_ptr;
 
        kpc_off_cpu(old);
 
@@ -344,10 +401,12 @@ machine_stack_handoff(
        }
 
 
+
        new_pmap = new->map->pmap;
        if (old->map->pmap != new_pmap)
                pmap_switch(new_pmap);
 
+
        new->machine.CpuDatap = cpu_data_ptr;
 
        /* TODO: Should this be ordered? */
@@ -362,17 +421,17 @@ machine_stack_handoff(
 
 
 /*
- * Routine:    call_continuation
+ * Routine: call_continuation
  *
  */
 void
-call_continuation(
-       thread_continue_t continuation,
-       void *parameter,
-       wait_result_t wresult,
-       boolean_t enable_interrupts)
+call_continuation(thread_continue_t continuation,
+                  void *parameter,
+                  wait_result_t wresult,
+                  boolean_t enable_interrupts)
 {
-#define call_continuation_kprintf(x...)        /* kprintf("call_continuation_kprintf:" x) */
+#define call_continuation_kprintf(x...) \
+       /* kprintf("call_continuation_kprintf:" x) */
 
        call_continuation_kprintf("thread = %p continuation = %p, stack = %p\n", current_thread(), continuation, current_thread()->machine.kstackptr);
        Call_continuation(continuation, parameter, wresult, enable_interrupts);
@@ -398,12 +457,12 @@ call_continuation(
 
 void arm_debug_set32(arm_debug_state_t *debug_state)
 {
-       struct cpu_data         *cpu_data_ptr;
-       arm_debug_info_t        *debug_info = arm_debug_info();
-       boolean_t               intr, set_mde = 0;
-       arm_debug_state_t       off_state;
-       uint32_t                        i;
-       uint64_t                all_ctrls = 0;
+       struct cpu_data *  cpu_data_ptr;
+       arm_debug_info_t * debug_info    = arm_debug_info();
+       boolean_t          intr, set_mde = 0;
+       arm_debug_state_t  off_state;
+       uint32_t           i;
+       uint64_t           all_ctrls = 0;
 
        intr = ml_set_interrupts_enabled(FALSE);
        cpu_data_ptr = getCpuDatap();
@@ -550,16 +609,14 @@ void arm_debug_set32(arm_debug_state_t *debug_state)
        } else {
                update_mdscr(0x8000, 0);
        }
-               
+
        /*
         * Software debug single step enable
         */
        if (debug_state->uds.ds32.mdscr_el1 & 0x1) {
                update_mdscr(0x8000, 1); // ~MDE | SS : no brk/watch while single stepping (which we've set)
 
-               set_saved_state_cpsr((current_thread()->machine.upcb), 
-                       get_saved_state_cpsr((current_thread()->machine.upcb)) | PSR64_SS);
-
+               mask_saved_state_cpsr(current_thread()->machine.upcb, PSR64_SS, 0);
        } else {
 
                update_mdscr(0x1, 0);
@@ -577,12 +634,12 @@ void arm_debug_set32(arm_debug_state_t *debug_state)
 
 void arm_debug_set64(arm_debug_state_t *debug_state)
 {
-       struct cpu_data         *cpu_data_ptr;
-       arm_debug_info_t        *debug_info = arm_debug_info();
-       boolean_t               intr, set_mde = 0;
-       arm_debug_state_t       off_state;
-       uint32_t                        i;
-       uint64_t                        all_ctrls = 0;
+       struct cpu_data *  cpu_data_ptr;
+       arm_debug_info_t * debug_info    = arm_debug_info();
+       boolean_t          intr, set_mde = 0;
+       arm_debug_state_t  off_state;
+       uint32_t           i;
+       uint64_t           all_ctrls = 0;
 
        intr = ml_set_interrupts_enabled(FALSE);
        cpu_data_ptr = getCpuDatap();
@@ -727,7 +784,7 @@ void arm_debug_set64(arm_debug_state_t *debug_state)
        if (set_mde) {
                update_mdscr(0, 0x8000); // MDSCR_EL1[MDE]
        }
-               
+
        /*
         * Software debug single step enable
         */
@@ -735,9 +792,7 @@ void arm_debug_set64(arm_debug_state_t *debug_state)
 
                update_mdscr(0x8000, 1); // ~MDE | SS : no brk/watch while single stepping (which we've set)
 
-               set_saved_state_cpsr((current_thread()->machine.upcb), 
-                       get_saved_state_cpsr((current_thread()->machine.upcb)) | PSR64_SS);
-
+               mask_saved_state_cpsr(current_thread()->machine.upcb, PSR64_SS, 0);
        } else {
 
                update_mdscr(0x1, 0);
@@ -779,7 +834,7 @@ void arm_debug_set(arm_debug_state_t *debug_state)
 boolean_t
 debug_legacy_state_is_valid(arm_legacy_debug_state_t *debug_state)
 {
-       arm_debug_info_t        *debug_info = arm_debug_info();
+       arm_debug_info_t *debug_info = arm_debug_info();
        uint32_t i;
        for (i = 0; i < debug_info->num_breakpoint_pairs; i++) {
                if (0 != debug_state->bcr[i] && VM_MAX_ADDRESS32 <= debug_state->bvr[i])
@@ -796,7 +851,7 @@ debug_legacy_state_is_valid(arm_legacy_debug_state_t *debug_state)
 boolean_t
 debug_state_is_valid32(arm_debug_state32_t *debug_state)
 {
-       arm_debug_info_t        *debug_info = arm_debug_info();
+       arm_debug_info_t *debug_info = arm_debug_info();
        uint32_t i;
        for (i = 0; i < debug_info->num_breakpoint_pairs; i++) {
                if (0 != debug_state->bcr[i] && VM_MAX_ADDRESS32 <= debug_state->bvr[i])
@@ -813,7 +868,7 @@ debug_state_is_valid32(arm_debug_state32_t *debug_state)
 boolean_t
 debug_state_is_valid64(arm_debug_state64_t *debug_state)
 {
-       arm_debug_info_t        *debug_info = arm_debug_info();
+       arm_debug_info_t *debug_info = arm_debug_info();
        uint32_t i;
        for (i = 0; i < debug_info->num_breakpoint_pairs; i++) {
                if (0 != debug_state->bcr[i] && MACH_VM_MAX_ADDRESS <= debug_state->bvr[i])
@@ -832,38 +887,33 @@ debug_state_is_valid64(arm_debug_state64_t *debug_state)
  * is ignored in the case of ARM -- Is this the right assumption?
  */
 void
-copy_legacy_debug_state(
-               arm_legacy_debug_state_t *src,
-               arm_legacy_debug_state_t *target,
-               __unused boolean_t all)
+copy_legacy_debug_state(arm_legacy_debug_state_t * src,
+                        arm_legacy_debug_state_t * target,
+                        __unused boolean_t         all)
 {
        bcopy(src, target, sizeof(arm_legacy_debug_state_t));
 }
 
 void
-copy_debug_state32(
-               arm_debug_state32_t *src,
-               arm_debug_state32_t *target,
-               __unused boolean_t all)
+copy_debug_state32(arm_debug_state32_t * src,
+                   arm_debug_state32_t * target,
+                   __unused boolean_t    all)
 {
        bcopy(src, target, sizeof(arm_debug_state32_t));
 }
 
 void
-copy_debug_state64(
-               arm_debug_state64_t *src,
-               arm_debug_state64_t *target,
-               __unused boolean_t all)
+copy_debug_state64(arm_debug_state64_t * src,
+                   arm_debug_state64_t * target,
+                   __unused boolean_t    all)
 {
        bcopy(src, target, sizeof(arm_debug_state64_t));
 }
 
 kern_return_t
-machine_thread_set_tsd_base(
-       thread_t                        thread,
-       mach_vm_offset_t        tsd_base)
+machine_thread_set_tsd_base(thread_t         thread,
+                            mach_vm_offset_t tsd_base)
 {
-
        if (thread->task == kernel_task) {
                return KERN_INVALID_ARGUMENT;
        }
index 5f6e474fb6057a5bb2d6ec65a5407fad61bc03bd..9026e45f11cf0d6ba27e9237022ec1b77d8f43df 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2011-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -63,7 +63,6 @@
 #include <kern/thread.h>
 #include <kern/processor.h>
 #include <kern/sched_prim.h>
-#include <kern/xpr.h>
 #include <kern/debug.h>
 #include <string.h>
 #include <tests/xnupost.h>
@@ -85,6 +84,15 @@ kern_return_t arm64_lock_test(void);
 kern_return_t arm64_munger_test(void);
 kern_return_t ex_cb_test(void);
 kern_return_t arm64_pan_test(void);
+kern_return_t arm64_late_pan_test(void);
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+kern_return_t arm64_ropjop_test(void);
+#endif
+#if HAS_TWO_STAGE_SPR_LOCK
+kern_return_t arm64_spr_lock_test(void);
+extern void arm64_msr_lock_test(uint64_t);
+#endif
 
 // exception handler ignores this fault address during PAN test
 #if __ARM_PAN_AVAILABLE__
@@ -1060,8 +1068,163 @@ ex_cb_test()
        return KERN_SUCCESS;
 }
 
+#if defined(HAS_APPLE_PAC)
+
+/*
+ *
+ *  arm64_ropjop_test - basic xnu ROP/JOP test plan
+ *
+ *  - assert ROP/JOP configured and running status match
+ *  - assert all AppleMode ROP/JOP features enabled
+ *  - ensure ROP/JOP keys are set and diversified
+ *  - sign a KVA (the address of this function), assert it was signed (changed)
+ *  - authenticate the newly signed KVA
+ *  - assert the authed KVA is the original KVA
+ *  - corrupt a signed ptr, auth it, ensure auth failed
+ *  - assert the failed authIB of corrupted pointer is tagged
+ *
+ */
+
+kern_return_t
+arm64_ropjop_test()
+{
+       T_LOG("Testing ROP/JOP");
+
+       /* how is ROP/JOP configured */
+       boolean_t config_rop_enabled = TRUE;
+       boolean_t config_jop_enabled = !(BootArgs->bootFlags & kBootFlagsDisableJOP);
+
+
+       /* assert all AppleMode ROP/JOP features enabled */
+       uint64_t apctl = __builtin_arm_rsr64(ARM64_REG_APCTL_EL1);
+#if __APSTS_SUPPORTED__
+       uint64_t apsts = __builtin_arm_rsr64(ARM64_REG_APSTS_EL1);
+       T_ASSERT(apsts & APSTS_EL1_MKEYVld, NULL);
+#else
+       T_ASSERT(apctl & APCTL_EL1_MKEYVld, NULL);
+#endif /* __APSTS_SUPPORTED__ */
+       T_ASSERT(apctl & APCTL_EL1_AppleMode, NULL);
+       T_ASSERT(apctl & APCTL_EL1_KernKeyEn, NULL);
+
+       /* ROP/JOP keys enabled current status */
+       bool status_jop_enabled, status_rop_enabled;
+#if __APSTS_SUPPORTED__ /* H13+ */
+       // TODO: update unit test to understand ROP/JOP enabled config for H13+
+       status_jop_enabled = status_rop_enabled = apctl & APCTL_EL1_EnAPKey1;
+#elif __APCFG_SUPPORTED__ /* H12 */
+       uint64_t apcfg_el1 = __builtin_arm_rsr64(APCFG_EL1);
+       status_jop_enabled = status_rop_enabled = apcfg_el1 & APCFG_EL1_ELXENKEY;
+#else /* !__APCFG_SUPPORTED__ H11 */
+       uint64_t sctlr_el1 = __builtin_arm_rsr64("SCTLR_EL1");
+       status_jop_enabled = sctlr_el1 & SCTLR_PACIA_ENABLED;
+       status_rop_enabled = sctlr_el1 & SCTLR_PACIB_ENABLED;
+#endif /* __APSTS_SUPPORTED__ */
+
+       /* assert configured and running status match */
+       T_ASSERT(config_rop_enabled == status_rop_enabled, NULL);
+       T_ASSERT(config_jop_enabled == status_jop_enabled, NULL);
+
+
+       if (config_jop_enabled) {
+               /* jop key */
+               uint64_t apiakey_hi = __builtin_arm_rsr64(ARM64_REG_APIAKEYHI_EL1);
+               uint64_t apiakey_lo = __builtin_arm_rsr64(ARM64_REG_APIAKEYLO_EL1);
+
+               /* ensure JOP key is set and diversified */
+               T_EXPECT(apiakey_hi != KERNEL_ROP_ID && apiakey_lo != KERNEL_ROP_ID, NULL);
+               T_EXPECT(apiakey_hi != 0 && apiakey_lo != 0, NULL);
+       }
+
+       if (config_rop_enabled) {
+               /* rop key */
+               uint64_t apibkey_hi = __builtin_arm_rsr64(ARM64_REG_APIBKEYHI_EL1);
+               uint64_t apibkey_lo = __builtin_arm_rsr64(ARM64_REG_APIBKEYLO_EL1);
+
+               /* ensure ROP key is set and diversified */
+               T_EXPECT(apibkey_hi != KERNEL_ROP_ID && apibkey_lo != KERNEL_ROP_ID, NULL);
+               T_EXPECT(apibkey_hi != 0 && apibkey_lo != 0, NULL);
+
+               /* sign a KVA (the address of this function) */
+               uint64_t kva_signed = (uint64_t) ptrauth_sign_unauthenticated((void *)&config_rop_enabled, ptrauth_key_asib, 0);
+
+               /* assert it was signed (changed) */
+               T_EXPECT(kva_signed != (uint64_t)&config_rop_enabled, NULL);
+
+               /* authenticate the newly signed KVA */
+               uint64_t kva_authed = (uint64_t) ml_auth_ptr_unchecked((void *)kva_signed, ptrauth_key_asib, 0);
+
+               /* assert the authed KVA is the original KVA */
+               T_EXPECT(kva_authed == (uint64_t)&config_rop_enabled, NULL);
+
+               /* corrupt a signed ptr, auth it, ensure auth failed */
+               uint64_t kva_corrupted = kva_signed ^ 1;
+
+               /* authenticate the corrupted pointer */
+               kva_authed = (uint64_t) ml_auth_ptr_unchecked((void *)kva_corrupted, ptrauth_key_asib, 0);
+
+               /* when AuthIB fails, bits 62:61 will be set to 2'b10 */
+               uint64_t auth_fail_mask = 3ULL << 61;
+               uint64_t authib_fail = 2ULL << 61;
+
+               /* assert the failed authIB of corrupted pointer is tagged */
+               T_EXPECT((kva_authed & auth_fail_mask) == authib_fail, NULL);
+       }
+
+       return KERN_SUCCESS;
+}
+#endif /* defined(HAS_APPLE_PAC) */
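
A worked check of the bit arithmetic in the test above (the pointer value is made up; bit 63 stays set for a kernel pointer, and the error code lands in the two bits below it):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t auth_fail_mask = 3ULL << 61; /* 0x6000000000000000 */
            uint64_t authib_fail    = 2ULL << 61; /* 0x4000000000000000 */
            uint64_t failed_ptr     = 0xC000000012345678ULL;
            assert((failed_ptr & auth_fail_mask) == authib_fail);
            return 0;
    }
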
 
 #if __ARM_PAN_AVAILABLE__
+
+struct pan_test_thread_args {
+       volatile bool join;
+};
+
+static void
+arm64_pan_test_thread(void *arg, wait_result_t __unused wres)
+{
+       T_ASSERT(__builtin_arm_rsr("pan") != 0, NULL);
+
+       struct pan_test_thread_args *args = arg;
+
+       for (processor_t p = processor_list; p != NULL; p = p->processor_list) {
+               thread_bind(p);
+               thread_block(THREAD_CONTINUE_NULL);
+               kprintf("Running PAN test on cpu %d\n", p->cpu_id);
+               arm64_pan_test();
+       }
+
+       /* unbind thread from specific cpu */
+       thread_bind(PROCESSOR_NULL);
+       thread_block(THREAD_CONTINUE_NULL);
+
+       while (!args->join) {
+               ;
+       }
+
+       thread_wakeup(args);
+}
+
+kern_return_t
+arm64_late_pan_test()
+{
+       thread_t thread;
+       kern_return_t kr;
+
+       struct pan_test_thread_args args;
+       args.join = false;
+
+       kr = kernel_thread_start(arm64_pan_test_thread, &args, &thread);
+       assert(kr == KERN_SUCCESS);
+
+       thread_deallocate(thread);
+
+       assert_wait(&args, THREAD_UNINT);
+       args.join = true;
+       thread_block(THREAD_CONTINUE_NULL);
+       return KERN_SUCCESS;
+}
+
 kern_return_t
 arm64_pan_test()
 {
@@ -1069,6 +1232,9 @@ arm64_pan_test()
 
        T_LOG("Testing PAN.");
 
+
+       T_ASSERT((__builtin_arm_rsr("SCTLR_EL1") & SCTLR_PAN_UNCHANGED) == 0, "SCTLR_EL1.SPAN must be cleared");
+
        T_ASSERT(__builtin_arm_rsr("pan") != 0, NULL);
 
        pan_exception_level = 0;
@@ -1107,9 +1273,10 @@ arm64_pan_test()
        pan_ro_addr = 0;
 
        __builtin_arm_wsr("pan", 1);
+
        return KERN_SUCCESS;
 }
-#endif
+#endif /* __ARM_PAN_AVAILABLE__ */
 
 
 kern_return_t
@@ -1125,3 +1292,44 @@ arm64_munger_test()
        return 0;
 }
 
+
+#if HAS_TWO_STAGE_SPR_LOCK
+
+#define STR1(x) #x
+#define STR(x) STR1(x)
+
+volatile vm_offset_t spr_lock_test_addr;
+volatile uint32_t spr_lock_exception_esr;
+
+kern_return_t
+arm64_spr_lock_test()
+{
+       processor_t p;
+
+       for (p = processor_list; p != NULL; p = p->processor_list) {
+               thread_bind(p);
+               thread_block(THREAD_CONTINUE_NULL);
+               T_LOG("Running SPR lock test on cpu %d\n", p->cpu_id);
+
+               uint64_t orig_value = __builtin_arm_rsr64(STR(ARM64_REG_HID8));
+               spr_lock_test_addr = (vm_offset_t)VM_KERNEL_STRIP_PTR(arm64_msr_lock_test);
+               spr_lock_exception_esr = 0;
+               arm64_msr_lock_test(~orig_value);
+               T_EXPECT(spr_lock_exception_esr != 0, "MSR write generated synchronous abort");
+
+               uint64_t new_value = __builtin_arm_rsr64(STR(ARM64_REG_HID8));
+               T_EXPECT(orig_value == new_value, "MSR write did not succeed");
+
+               spr_lock_test_addr = 0;
+       }
+
+       /* unbind thread from specific cpu */
+       thread_bind(PROCESSOR_NULL);
+       thread_block(THREAD_CONTINUE_NULL);
+
+       T_PASS("Done running SPR lock tests");
+
+       return KERN_SUCCESS;
+}
+
+#endif /* HAS_TWO_STAGE_SPR_LOCK */
diff --git a/osfmk/arm64/platform_tests_asm.s b/osfmk/arm64/platform_tests_asm.s
new file mode 100644 (file)
index 0000000..5ec159e
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <arm64/asm.h>
+#include <pexpert/arm64/board_config.h>
+
+#if HAS_TWO_STAGE_SPR_LOCK
+       .text
+       .align 2
+       .globl EXT(arm64_msr_lock_test)
+LEXT(arm64_msr_lock_test)
+       msr             ARM64_REG_HID8, x0
+       ret
+#endif
index ee13e1844255469b2bc4232e054c73c35ad8ccb7..10c7aa5671739b8b56bc014cc6e6a69380884d7e 100644 (file)
  * +-+-+-+-+-+---+---+--+--+----------+-+-+-+-+-+-----+
  *
  * where:
- *     NZCV    Comparison flags
- *     PAN             Privileged Access Never
- *  SS         Single step
- *     IL              Illegal state
- *     DAIF    Interrupt masks
- *     M               Mode field
+ *   NZCV: Comparison flags
+ *   PAN:  Privileged Access Never
+ *   SS:   Single step
+ *   IL:   Illegal state
+ *   DAIF: Interrupt masks
+ *   M:    Mode field
  */
 
-#define PSR64_NZCV_SHIFT                28
-#define PSR64_NZCV_MASK                 (1 << PSR64_NZCV_SHIFT)
+#define PSR64_NZCV_SHIFT 28
+#define PSR64_NZCV_MASK  (1 << PSR64_NZCV_SHIFT)
 
-#define PSR64_N_SHIFT                   31
-#define PSR64_N                                 (1 << PSR64_N_SHIFT)
+#define PSR64_N_SHIFT    31
+#define PSR64_N          (1 << PSR64_N_SHIFT)
 
-#define PSR64_Z_SHIFT                   30
-#define PSR64_Z                                 (1 << PSR64_Z_SHIFT)
+#define PSR64_Z_SHIFT    30
+#define PSR64_Z          (1 << PSR64_Z_SHIFT)
 
-#define PSR64_C_SHIFT                   29
-#define PSR64_C                                 (1 << PSR64_C_SHIFT)
+#define PSR64_C_SHIFT    29
+#define PSR64_C          (1 << PSR64_C_SHIFT)
 
-#define PSR64_V_SHIFT                   28
-#define PSR64_V                                 (1 << PSR64_V_SHIFT)
+#define PSR64_V_SHIFT    28
+#define PSR64_V          (1 << PSR64_V_SHIFT)
 
-#define PSR64_PAN_SHIFT                 22
-#define PSR64_PAN                               (1 << PSR64_PAN_SHIFT)
+#define PSR64_PAN_SHIFT  22
+#define PSR64_PAN        (1 << PSR64_PAN_SHIFT)
 
-#define PSR64_SS_SHIFT                  21
-#define PSR64_SS                                (1 << PSR64_SS_SHIFT)
+#define PSR64_SS_SHIFT   21
+#define PSR64_SS         (1 << PSR64_SS_SHIFT)
 
-#define PSR64_IL_SHIFT                  20
-#define PSR64_IL                                (1 << PSR64_IL_SHIFT)
+#define PSR64_IL_SHIFT   20
+#define PSR64_IL         (1 << PSR64_IL_SHIFT)
 
 /*
  * msr DAIF, Xn and mrs Xn, DAIF transfer into
  * and out of bits 9:6
  */
-#define DAIF_DEBUG_SHIFT                9
-#define DAIF_DEBUGF                             (1 << DAIF_DEBUG_SHIFT)
+#define DAIF_DEBUG_SHIFT      9
+#define DAIF_DEBUGF           (1 << DAIF_DEBUG_SHIFT)
 
-#define DAIF_ASYNC_SHIFT                8
-#define DAIF_ASYNCF                             (1 << DAIF_ASYNC_SHIFT)
+#define DAIF_ASYNC_SHIFT      8
+#define DAIF_ASYNCF           (1 << DAIF_ASYNC_SHIFT)
 
-#define DAIF_IRQF_SHIFT                 7
-#define DAIF_IRQF                               (1 << DAIF_IRQF_SHIFT)
+#define DAIF_IRQF_SHIFT       7
+#define DAIF_IRQF             (1 << DAIF_IRQF_SHIFT)
 
-#define DAIF_FIQF_SHIFT                 6
-#define DAIF_FIQF                               (1 << DAIF_FIQF_SHIFT)
+#define DAIF_FIQF_SHIFT       6
+#define DAIF_FIQF             (1 << DAIF_FIQF_SHIFT)
 
-#define DAIF_ALL                                (DAIF_DEBUGF | DAIF_ASYNCF | DAIF_IRQF | DAIF_FIQF)
-#define DAIF_STANDARD_DISABLE   (DAIF_ASYNCF | DAIF_IRQF | DAIF_FIQF)
+#define DAIF_ALL              (DAIF_DEBUGF | DAIF_ASYNCF | DAIF_IRQF | DAIF_FIQF)
+#define DAIF_STANDARD_DISABLE (DAIF_ASYNCF | DAIF_IRQF | DAIF_FIQF)
 
-#define SPSR_INTERRUPTS_ENABLED(x)      (!(x & DAIF_FIQF))
+#define SPSR_INTERRUPTS_ENABLED(x) (!(x & DAIF_FIQF))
 
 /*
  * msr DAIFSet, Xn, and msr DAIFClr, Xn transfer
  * from bits 3:0.
  */
-#define DAIFSC_DEBUGF                   (1 << 3)
-#define DAIFSC_ASYNCF                   (1 << 2)
-#define DAIFSC_IRQF                             (1 << 1)
-#define DAIFSC_FIQF                             (1 << 0)
-#define DAIFSC_ALL                              (DAIFSC_DEBUGF | DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF)
+#define DAIFSC_DEBUGF           (1 << 3)
+#define DAIFSC_ASYNCF           (1 << 2)
+#define DAIFSC_IRQF             (1 << 1)
+#define DAIFSC_FIQF             (1 << 0)
+#define DAIFSC_ALL              (DAIFSC_DEBUGF | DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF)
 #define DAIFSC_STANDARD_DISABLE (DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF)
 
 /*
  * ARM64_TODO: unify with ARM?
  */
-#define PSR64_CF                0x20000000      /* Carry/Borrow/Extend */
+#define PSR64_CF         0x20000000 /* Carry/Borrow/Extend */
 
-#define PSR64_MODE_MASK                 0x1F
+#define PSR64_MODE_MASK         0x1F
 
-#define PSR64_MODE_USER32_THUMB         0x20
+#define PSR64_MODE_USER32_THUMB 0x20
 
-#define PSR64_MODE_RW_SHIFT             4
-#define PSR64_MODE_RW_64                0
-#define PSR64_MODE_RW_32                (0x1 << PSR64_MODE_RW_SHIFT)
+#define PSR64_MODE_RW_SHIFT     4
+#define PSR64_MODE_RW_64        0
+#define PSR64_MODE_RW_32        (0x1 << PSR64_MODE_RW_SHIFT)
 
-#define PSR64_MODE_EL_SHIFT             2
-#define PSR64_MODE_EL_MASK              (0x3 << PSR64_MODE_EL_SHIFT)
-#define PSR64_MODE_EL3                  (0x3 << PSR64_MODE_EL_SHIFT)
-#define PSR64_MODE_EL1                  (0x1 << PSR64_MODE_EL_SHIFT)
-#define PSR64_MODE_EL0                  0
+#define PSR64_MODE_EL_SHIFT     2
+#define PSR64_MODE_EL_MASK      (0x3 << PSR64_MODE_EL_SHIFT)
+#define PSR64_MODE_EL3          (0x3 << PSR64_MODE_EL_SHIFT)
+#define PSR64_MODE_EL2          (0x2 << PSR64_MODE_EL_SHIFT)
+#define PSR64_MODE_EL1          (0x1 << PSR64_MODE_EL_SHIFT)
+#define PSR64_MODE_EL0          0
 
-#define PSR64_MODE_SPX                  0x1
-#define PSR64_MODE_SP0                  0
+#define PSR64_MODE_SPX          0x1
+#define PSR64_MODE_SP0          0
 
-#define PSR64_USER32_DEFAULT            (PSR64_MODE_RW_32 | PSR64_MODE_EL0 | PSR64_MODE_SP0)
-#define PSR64_USER64_DEFAULT            (PSR64_MODE_RW_64 | PSR64_MODE_EL0 | PSR64_MODE_SP0)
-#define PSR64_KERNEL_DEFAULT    (DAIF_STANDARD_DISABLE | PSR64_MODE_RW_64 | PSR64_MODE_EL1 | PSR64_MODE_SP0)
+#define PSR64_USER32_DEFAULT    (PSR64_MODE_RW_32 | PSR64_MODE_EL0 | PSR64_MODE_SP0)
+#define PSR64_USER64_DEFAULT    (PSR64_MODE_RW_64 | PSR64_MODE_EL0 | PSR64_MODE_SP0)
+#define PSR64_KERNEL_STANDARD   (DAIF_STANDARD_DISABLE | PSR64_MODE_RW_64 | PSR64_MODE_EL1 | PSR64_MODE_SP0)
+#if __ARM_PAN_AVAILABLE__
+#define PSR64_KERNEL_DEFAULT    (PSR64_KERNEL_STANDARD | PSR64_PAN)
+#else
+#define PSR64_KERNEL_DEFAULT    PSR64_KERNEL_STANDARD
+#endif
 
-#define PSR64_IS_KERNEL(x)              ((x & PSR64_MODE_EL_MASK) == PSR64_MODE_EL1)
-#define PSR64_IS_USER(x)                ((x & PSR64_MODE_EL_MASK) == PSR64_MODE_EL0)
+#define PSR64_IS_KERNEL(x)      ((x & PSR64_MODE_EL_MASK) > PSR64_MODE_EL0)
+#define PSR64_IS_USER(x)        ((x & PSR64_MODE_EL_MASK) == PSR64_MODE_EL0)
 
-#define PSR64_IS_USER32(x)              (PSR64_IS_USER(x) && (x & PSR64_MODE_RW_32))
-#define PSR64_IS_USER64(x)              (PSR64_IS_USER(x) && !(x & PSR64_MODE_RW_32))
+#define PSR64_IS_USER32(x)      (PSR64_IS_USER(x) && (x & PSR64_MODE_RW_32))
+#define PSR64_IS_USER64(x)      (PSR64_IS_USER(x) && !(x & PSR64_MODE_RW_32))
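
A small usage sketch of the mode predicates above (the macro values mirror the definitions in this header):

    #include <stdint.h>
    #include <stdio.h>

    #define PSR64_MODE_EL_SHIFT 2
    #define PSR64_MODE_EL_MASK  (0x3 << PSR64_MODE_EL_SHIFT)
    #define PSR64_MODE_EL0      0
    #define PSR64_MODE_RW_32    (0x1 << 4)
    #define PSR64_IS_KERNEL(x)  ((x & PSR64_MODE_EL_MASK) > PSR64_MODE_EL0)
    #define PSR64_IS_USER(x)    ((x & PSR64_MODE_EL_MASK) == PSR64_MODE_EL0)
    #define PSR64_IS_USER32(x)  (PSR64_IS_USER(x) && (x & PSR64_MODE_RW_32))

    int main(void)
    {
            uint32_t spsr = 0x4; /* EL1 with SP0: kernel-mode state */
            printf("kernel=%d user32=%d\n",
                PSR64_IS_KERNEL(spsr) ? 1 : 0,
                PSR64_IS_USER32(spsr) ? 1 : 0);
            return 0; /* prints kernel=1 user32=0 */
    }
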
 
 
 
  * System Control Register (SCTLR)
  */
 
-#define SCTLR_RESERVED                  ((3 << 28) | (1 << 22) | (1 << 20) | (1 << 11))
+#define SCTLR_RESERVED     ((3ULL << 28) | (1ULL << 22) | (1ULL << 20) | (1ULL << 11))
+#if defined(HAS_APPLE_PAC)
+
+// 31    PACIA_ENABLED AddPACIA and AuthIA functions enabled
+#define SCTLR_PACIA_ENABLED_SHIFT 31
+#define SCTLR_PACIA_ENABLED       (1ULL << SCTLR_PACIA_ENABLED_SHIFT)
+// 30    PACIB_ENABLED AddPACIB and AuthIB functions enabled
+#define SCTLR_PACIB_ENABLED       (1ULL << 30)
+// 29:28 RES1 11
+// 27    PACDA_ENABLED AddPACDA and AuthDA functions enabled
+#define SCTLR_PACDA_ENABLED       (1ULL << 27)
+// 13    PACDB_ENABLED  AddPACDB and AuthDB functions enabled
+#define SCTLR_PACDB_ENABLED       (1ULL << 13)
 
-// 26          UCI             User Cache Instructions
-#define SCTLR_UCI_ENABLED               (1 << 26)
+#define SCTLR_JOP_KEYS_ENABLED (SCTLR_PACIA_ENABLED | SCTLR_PACDA_ENABLED | SCTLR_PACDB_ENABLED)
+#endif /* defined(HAS_APPLE_PAC) */
 
-// 25          EE              Exception Endianness
-#define SCTLR_EE_BIG_ENDIAN             (1 << 25)
+// 26    UCI User Cache Instructions
+#define SCTLR_UCI_ENABLED         (1ULL << 26)
 
-// 24          E0E             EL0 Endianness
-#define SCTLR_E0E_BIG_ENDIAN    (1 << 24)
+// 25    EE             Exception Endianness
+#define SCTLR_EE_BIG_ENDIAN       (1ULL << 25)
 
-// 23          SPAN    Set PAN
-#define SCTLR_PAN_UNCHANGED             (1 << 23)
+// 24    E0E            EL0 Endianness
+#define SCTLR_E0E_BIG_ENDIAN      (1ULL << 24)
 
-// 22          RES1    1
-// 21          RES0    0
-// 20          RES1    1
+// 23    SPAN           Set PAN
+#define SCTLR_PAN_UNCHANGED       (1ULL << 23)
 
-// 19          WXN             Writeable implies eXecute Never
-#define SCTLR_WXN_ENABLED               (1 << 19)
+// 22    RES1           1
+// 21    RES0           0
+// 20    RES1           1
 
-// 18          nTWE    Not trap WFE from EL0
-#define SCTLR_nTWE_WFE_ENABLED  (1 << 18)
+// 19    WXN            Writeable implies eXecute Never
+#define SCTLR_WXN_ENABLED         (1ULL << 19)
 
-// 17          RES0    0
+// 18    nTWE           Not trap WFE from EL0
+#define SCTLR_nTWE_WFE_ENABLED    (1ULL << 18)
 
-// 16          nTWI    Not trap WFI from EL0
-#define SCTRL_nTWI_WFI_ENABLED  (1 << 16)
+// 17    RES0           0
 
-// 15          UCT             User Cache Type register (CTR_EL0)
-#define SCTLR_UCT_ENABLED               (1 << 15)
+// 16    nTWI           Not trap WFI from EL0
+#define SCTRL_nTWI_WFI_ENABLED    (1ULL << 16)
 
-// 14          DZE             User Data Cache Zero (DC ZVA)
-#define SCTLR_DZE_ENABLED               (1 << 14)
+// 15    UCT            User Cache Type register (CTR_EL0)
+#define SCTLR_UCT_ENABLED         (1ULL << 15)
 
-// 13          PACDB_ENABLED            AddPACDB and AuthDB functions enabled
-#define SCTLR_PACDB_ENABLED             (1 << 13)
+// 14    DZE            User Data Cache Zero (DC ZVA)
+#define SCTLR_DZE_ENABLED         (1ULL << 14)
 
-// 12          I               Instruction cache enable
-#define SCTLR_I_ENABLED                 (1 << 12)
+// 12    I              Instruction cache enable
+#define SCTLR_I_ENABLED           (1ULL << 12)
 
-// 11          RES1    1
-// 10          RES0    0
+// 11    RES1           1
+// 10    RES0           0
 
-// 9           UMA             User Mask Access
-#define SCTLR_UMA_ENABLED               (1 << 9)
+// 9     UMA            User Mask Access
+#define SCTLR_UMA_ENABLED         (1ULL << 9)
 
-// 8           SED             SETEND Disable
-#define SCTLR_SED_DISABLED              (1 << 8)
+// 8     SED            SETEND Disable
+#define SCTLR_SED_DISABLED        (1ULL << 8)
 
-// 7           ITD             IT Disable
-#define SCTLR_ITD_DISABLED              (1 << 7)
+// 7     ITD            IT Disable
+#define SCTLR_ITD_DISABLED        (1ULL << 7)
 
-// 6           RES0    0
+// 6     RES0           0
 
-// 5           CP15BEN CP15 Barrier ENable
-#define SCTLR_CP15BEN_ENABLED   (1 << 5)
+// 5     CP15BEN        CP15 Barrier ENable
+#define SCTLR_CP15BEN_ENABLED     (1ULL << 5)
 
-// 4           SA0             Stack Alignment check for EL0
-#define SCTLR_SA0_ENABLED               (1 << 4)
+// 4     SA0            Stack Alignment check for EL0
+#define SCTLR_SA0_ENABLED         (1ULL << 4)
 
-// 3           SA              Stack Alignment check
-#define SCTLR_SA_ENABLED                (1 << 3)
+// 3     SA             Stack Alignment check
+#define SCTLR_SA_ENABLED          (1ULL << 3)
 
-// 2           C               Cache enable
-#define SCTLR_C_ENABLED                 (1 << 2)
+// 2     C              Cache enable
+#define SCTLR_C_ENABLED           (1ULL << 2)
 
-// 1           A               Alignment check
-#define SCTLR_A_ENABLED                 (1 << 1)
+// 1     A              Alignment check
+#define SCTLR_A_ENABLED           (1ULL << 1)
 
-// 0           M               MMU enable
-#define SCTLR_M_ENABLED                 (1 << 0)
+// 0     M              MMU enable
+#define SCTLR_M_ENABLED           (1ULL << 0)
 
-#define SCTLR_EL1_DEFAULT               (SCTLR_RESERVED | SCTLR_UCI_ENABLED | SCTLR_nTWE_WFE_ENABLED | SCTLR_DZE_ENABLED | \
-                                               SCTLR_I_ENABLED | SCTLR_SED_DISABLED | SCTLR_CP15BEN_ENABLED |             \
-                                               SCTLR_SA0_ENABLED | SCTLR_SA_ENABLED | SCTLR_C_ENABLED | SCTLR_M_ENABLED)
+#define SCTLR_EL1_DEFAULT \
+       (SCTLR_RESERVED | SCTLR_UCI_ENABLED | SCTLR_nTWE_WFE_ENABLED | SCTLR_DZE_ENABLED | \
+        SCTLR_I_ENABLED | SCTLR_SED_DISABLED | SCTLR_CP15BEN_ENABLED |                    \
+        SCTLR_SA0_ENABLED | SCTLR_SA_ENABLED | SCTLR_C_ENABLED | SCTLR_M_ENABLED)
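
For illustration only (not part of this commit): on PAC-capable parts the boot-time SCTLR value would presumably layer the JOP keys onto this default. The macro name below is hypothetical; only SCTLR_EL1_DEFAULT and SCTLR_JOP_KEYS_ENABLED come from this header.

/* Hedged sketch: compose a PAC-aware SCTLR boot value. */
#if defined(HAS_APPLE_PAC)
#define SCTLR_EL1_EXAMPLE_BOOT (SCTLR_EL1_DEFAULT | SCTLR_JOP_KEYS_ENABLED)
#else
#define SCTLR_EL1_EXAMPLE_BOOT SCTLR_EL1_DEFAULT
#endif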
 
 /*
  * Coprocessor Access Control Register (CPACR)
  * +---+---+------+------+--------------------+
  *
  * where:
- *     TTA             Trace trap
- *     FPEN    Floating point enable
+ *   TTA:  Trace trap
+ *   FPEN: Floating point enable
  */
-#define CPACR_TTA_SHIFT                         28
-#define CPACR_TTA                                       (1 << CPACR_TTA_SHIFT)
+#define CPACR_TTA_SHIFT     28
+#define CPACR_TTA           (1 << CPACR_TTA_SHIFT)
 
-#define CPACR_FPEN_SHIFT                        20
-#define CPACR_FPEN_EL0_TRAP                     (0x1 << CPACR_FPEN_SHIFT)
-#define CPACR_FPEN_ENABLE                       (0x3 << CPACR_FPEN_SHIFT)
+#define CPACR_FPEN_SHIFT    20
+#define CPACR_FPEN_EL0_TRAP (0x1 << CPACR_FPEN_SHIFT)
+#define CPACR_FPEN_ENABLE   (0x3 << CPACR_FPEN_SHIFT)
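
As a usage sketch (assumed, not from this commit): FPEN is a two-bit field, so CPACR_FPEN_ENABLE (0b11) doubles as the field mask; writing it grants EL0 and EL1 untrapped FP/SIMD access, while CPACR_FPEN_EL0_TRAP (0b01) traps only EL0.

/* Hedged sketch: enable FP/SIMD access via CPACR_EL1. */
static inline void cpacr_fp_enable_example(void)
{
	uint64_t cpacr;  /* assumes <stdint.h> types */
	__asm__ volatile ("mrs %0, CPACR_EL1" : "=r"(cpacr));
	cpacr &= ~((uint64_t)CPACR_FPEN_ENABLE);  /* clear the 2-bit FPEN field */
	cpacr |= CPACR_FPEN_ENABLE;               /* 0b11: no FP trapping */
	__asm__ volatile ("msr CPACR_EL1, %0" : : "r"(cpacr));
}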
 
 /*
  *  FPSR: Floating Point Status Register
  * +--+--+--+--+--+-------------------+---+--+---+---+---+---+---+
  */
 
-#define FPSR_N_SHIFT    31
-#define FPSR_Z_SHIFT    30
-#define FPSR_C_SHIFT    29
-#define FPSR_V_SHIFT    28
-#define FPSR_QC_SHIFT   27
-#define FPSR_IDC_SHIFT  7
-#define FPSR_IXC_SHIFT  4
-#define FPSR_UFC_SHIFT  3
-#define FPSR_OFC_SHIFT  2
-#define FPSR_DZC_SHIFT  1
-#define FPSR_IOC_SHIFT  0
-#define FPSR_N          (1 << FPSR_N_SHIFT)
-#define FPSR_Z          (1 << FPSR_Z_SHIFT)
-#define FPSR_C          (1 << FPSR_C_SHIFT)
-#define FPSR_V          (1 << FPSR_V_SHIFT)
-#define FPSR_QC         (1 << FPSR_QC_SHIFT)
-#define FPSR_IDC        (1 << FPSR_IDC_SHIFT)
-#define FPSR_IXC        (1 << FPSR_IXC_SHIFT)
-#define FPSR_UFC        (1 << FPSR_UFC_SHIFT)
-#define FPSR_OFC        (1 << FPSR_OFC_SHIFT)
-#define FPSR_DZC        (1 << FPSR_DZC_SHIFT)
-#define FPSR_IOC        (1 << FPSR_IOC_SHIFT)
+#define FPSR_N_SHIFT   31
+#define FPSR_Z_SHIFT   30
+#define FPSR_C_SHIFT   29
+#define FPSR_V_SHIFT   28
+#define FPSR_QC_SHIFT  27
+#define FPSR_IDC_SHIFT 7
+#define FPSR_IXC_SHIFT 4
+#define FPSR_UFC_SHIFT 3
+#define FPSR_OFC_SHIFT 2
+#define FPSR_DZC_SHIFT 1
+#define FPSR_IOC_SHIFT 0
+#define FPSR_N         (1 << FPSR_N_SHIFT)
+#define FPSR_Z         (1 << FPSR_Z_SHIFT)
+#define FPSR_C         (1 << FPSR_C_SHIFT)
+#define FPSR_V         (1 << FPSR_V_SHIFT)
+#define FPSR_QC        (1 << FPSR_QC_SHIFT)
+#define FPSR_IDC       (1 << FPSR_IDC_SHIFT)
+#define FPSR_IXC       (1 << FPSR_IXC_SHIFT)
+#define FPSR_UFC       (1 << FPSR_UFC_SHIFT)
+#define FPSR_OFC       (1 << FPSR_OFC_SHIFT)
+#define FPSR_DZC       (1 << FPSR_DZC_SHIFT)
+#define FPSR_IOC       (1 << FPSR_IOC_SHIFT)
 
 /*
  * A mask for all of the bits that are not RAZ for FPSR; this
  * is primarily for converting between a 32-bit view of NEON state
  * (FPSCR) and a 64-bit view of NEON state (FPSR, FPCR).
  */
-#define FPSR_MASK       (FPSR_N | FPSR_Z | FPSR_C | FPSR_V | FPSR_QC | \
-                        FPSR_IDC | FPSR_IXC | FPSR_UFC | FPSR_OFC | \
-                        FPSR_DZC | FPSR_IOC)
+#define FPSR_MASK \
+       (FPSR_N | FPSR_Z | FPSR_C | FPSR_V | FPSR_QC | FPSR_IDC | FPSR_IXC | \
+        FPSR_UFC | FPSR_OFC | FPSR_DZC | FPSR_IOC)
 
 /*
  *  FPCR: Floating Point Control Register
  * +-----+---+--+--+-----+------+--+---+---+--+---+---+---+---+---+--------+
  */
 
-#define FPCR_AHP_SHIFT          26
-#define FPCR_DN_SHIFT           25
-#define FPCR_FZ_SHIFT           24
-#define FPCR_RMODE_SHIFT        22
-#define FPCR_STRIDE_SHIFT       20
-#define FPCR_LEN_SHIFT          16
-#define FPCR_IDE_SHIFT          15
-#define FPCR_IXE_SHIFT          12
-#define FPCR_UFE_SHIFT          11
-#define FPCR_OFE_SHIFT          10
-#define FPCR_DZE_SHIFT          9
-#define FPCR_IOE_SHIFT          8
-#define FPCR_AHP                (1 << FPCR_AHP_SHIFT)
-#define FPCR_DN                 (1 << FPCR_DN_SHIFT)
-#define FPCR_FZ                 (1 << FPCR_FZ_SHIFT)
-#define FPCR_RMODE              (0x3 << FPCR_RMODE_SHIFT)
-#define FPCR_STRIDE             (0x3 << FPCR_STRIDE_SHIFT)
-#define FPCR_LEN                (0x7 << FPCR_LEN_SHIFT)
-#define FPCR_IDE                (1 << FPCR_IDE_SHIFT)
-#define FPCR_IXE                (1 << FPCR_IXE_SHIFT)
-#define FPCR_UFE                (1 << FPCR_UFE_SHIFT)
-#define FPCR_OFE                (1 << FPCR_OFE_SHIFT)
-#define FPCR_DZE                (1 << FPCR_DZE_SHIFT)
-#define FPCR_IOE                (1 << FPCR_IOE_SHIFT)
-#define FPCR_DEFAULT            (FPCR_DN)
-#define FPCR_DEFAULT_32         (FPCR_DN|FPCR_FZ)
+#define FPCR_AHP_SHIFT    26
+#define FPCR_DN_SHIFT     25
+#define FPCR_FZ_SHIFT     24
+#define FPCR_RMODE_SHIFT  22
+#define FPCR_STRIDE_SHIFT 20
+#define FPCR_LEN_SHIFT    16
+#define FPCR_IDE_SHIFT    15
+#define FPCR_IXE_SHIFT    12
+#define FPCR_UFE_SHIFT    11
+#define FPCR_OFE_SHIFT    10
+#define FPCR_DZE_SHIFT    9
+#define FPCR_IOE_SHIFT    8
+#define FPCR_AHP          (1 << FPCR_AHP_SHIFT)
+#define FPCR_DN           (1 << FPCR_DN_SHIFT)
+#define FPCR_FZ           (1 << FPCR_FZ_SHIFT)
+#define FPCR_RMODE        (0x3 << FPCR_RMODE_SHIFT)
+#define FPCR_STRIDE       (0x3 << FPCR_STRIDE_SHIFT)
+#define FPCR_LEN          (0x7 << FPCR_LEN_SHIFT)
+#define FPCR_IDE          (1 << FPCR_IDE_SHIFT)
+#define FPCR_IXE          (1 << FPCR_IXE_SHIFT)
+#define FPCR_UFE          (1 << FPCR_UFE_SHIFT)
+#define FPCR_OFE          (1 << FPCR_OFE_SHIFT)
+#define FPCR_DZE          (1 << FPCR_DZE_SHIFT)
+#define FPCR_IOE          (1 << FPCR_IOE_SHIFT)
+#define FPCR_DEFAULT      (FPCR_DN)
+#define FPCR_DEFAULT_32   (FPCR_DN|FPCR_FZ)
 
 /*
  * A mask for all of the bits that are not RAZ for FPCR; this
  * is primarily for converting between a 32-bit view of NEON state
  * (FPSCR) and a 64-bit view of NEON state (FPSR, FPCR).
  */
-#define FPCR_MASK               (FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE | \
-                                FPCR_STRIDE | FPCR_LEN | FPCR_IDE | FPCR_IXE | \
-                                FPCR_UFE | FPCR_OFE | FPCR_DZE | FPCR_IOE)
+#define FPCR_MASK \
+       (FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE | FPCR_STRIDE | FPCR_LEN | \
+        FPCR_IDE | FPCR_IXE | FPCR_UFE | FPCR_OFE | FPCR_DZE | FPCR_IOE)
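
The two masks exist precisely for the FPSCR conversion the comments mention: the status and control bit positions do not overlap, so splitting and merging the 32-bit view is pure masking. A minimal sketch (helper names are hypothetical):

/* Hedged sketch: convert between FPSCR and the (FPSR, FPCR) pair. */
static inline void fpscr_split_example(uint32_t fpscr, uint32_t *fpsr, uint32_t *fpcr)
{
	*fpsr = fpscr & FPSR_MASK;  /* status: N/Z/C/V/QC and cumulative exception flags */
	*fpcr = fpscr & FPCR_MASK;  /* control: AHP/DN/FZ/RMODE and exception enables */
}

static inline uint32_t fpscr_merge_example(uint32_t fpsr, uint32_t fpcr)
{
	return (fpsr & FPSR_MASK) | (fpcr & FPCR_MASK);
}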
 
 /*
  * Translation Control Register (TCR)
  * | zero |TBI1|TBI0|AS|z| IPS | TG1 | SH1 |ORGN1|IRGN1|EPD1|A1| T1SZ | TG0 | SH0 |ORGN0|IRGN0|EPD0|z|T0SZ|
  * +------+----+----+--+-+-----+-----+-----+-----+-----+----+--+------+-----+-----+-----+-----+----+-+----+
  *
- *     TBI1    Top Byte Ignored for TTBR1 region
- *     TBI0    Top Byte Ignored for TTBR0 region
- *     AS              ASID Size
- *     IPS             Physical Address Size limit
- *     TG1             Granule Size for TTBR1 region
- *     SH1             Shareability for TTBR1 region
- *  ORGN1      Outer Cacheability for TTBR1 region
- *  IRGN1      Inner Cacheability for TTBR1 region
- *     EPD1    Translation table walk disable for TTBR1
- *     A1              ASID selection from TTBR1 enable
- *     T1SZ    Virtual address size for TTBR1
- *     TG0             Granule Size for TTBR0 region
- *     SH0             Shareability for TTBR0 region
- *  ORGN0      Outer Cacheability for TTBR0 region
- *  IRGN0      Inner Cacheability for TTBR0 region
- *     T0SZ    Virtual address size for TTBR0
+ * TBI1:  Top Byte Ignored for TTBR1 region
+ * TBI0:  Top Byte Ignored for TTBR0 region
+ * AS:    ASID Size
+ * IPS:   Physical Address Size limit
+ * TG1:   Granule Size for TTBR1 region
+ * SH1:   Shareability for TTBR1 region
+ * ORGN1: Outer Cacheability for TTBR1 region
+ * IRGN1: Inner Cacheability for TTBR1 region
+ * EPD1:  Translation table walk disable for TTBR1
+ * A1:    ASID selection from TTBR1 enable
+ * T1SZ:  Virtual address size for TTBR1
+ * TG0:   Granule Size for TTBR0 region
+ * SH0:   Shareability for TTBR0 region
+ * ORGN0: Outer Cacheability for TTBR0 region
+ * IRGN0: Inner Cacheability for TTBR0 region
+ * T0SZ:  Virtual address size for TTBR0
  */
 
-#define TCR_T0SZ_SHIFT                          0ULL
-#define TCR_TSZ_BITS                            6ULL
-#define TCR_TSZ_MASK                            ((1ULL << TCR_TSZ_BITS) - 1ULL)
+#define TCR_T0SZ_SHIFT          0ULL
+#define TCR_TSZ_BITS            6ULL
+#define TCR_TSZ_MASK            ((1ULL << TCR_TSZ_BITS) - 1ULL)
 
-#define TCR_IRGN0_SHIFT                         8ULL
-#define TCR_IRGN0_DISABLED                      (0ULL << TCR_IRGN0_SHIFT)
-#define TCR_IRGN0_WRITEBACK                     (1ULL << TCR_IRGN0_SHIFT)
-#define TCR_IRGN0_WRITETHRU                     (2ULL << TCR_IRGN0_SHIFT)
-#define TCR_IRGN0_WRITEBACKNO           (3ULL << TCR_IRGN0_SHIFT)
+#define TCR_IRGN0_SHIFT         8ULL
+#define TCR_IRGN0_DISABLED      (0ULL << TCR_IRGN0_SHIFT)
+#define TCR_IRGN0_WRITEBACK     (1ULL << TCR_IRGN0_SHIFT)
+#define TCR_IRGN0_WRITETHRU     (2ULL << TCR_IRGN0_SHIFT)
+#define TCR_IRGN0_WRITEBACKNO   (3ULL << TCR_IRGN0_SHIFT)
 
-#define TCR_ORGN0_SHIFT                         10ULL
-#define TCR_ORGN0_DISABLED                      (0ULL << TCR_ORGN0_SHIFT)
-#define TCR_ORGN0_WRITEBACK                     (1ULL << TCR_ORGN0_SHIFT)
-#define TCR_ORGN0_WRITETHRU                     (2ULL << TCR_ORGN0_SHIFT)
-#define TCR_ORGN0_WRITEBACKNO           (3ULL << TCR_ORGN0_SHIFT)
+#define TCR_ORGN0_SHIFT         10ULL
+#define TCR_ORGN0_DISABLED      (0ULL << TCR_ORGN0_SHIFT)
+#define TCR_ORGN0_WRITEBACK     (1ULL << TCR_ORGN0_SHIFT)
+#define TCR_ORGN0_WRITETHRU     (2ULL << TCR_ORGN0_SHIFT)
+#define TCR_ORGN0_WRITEBACKNO   (3ULL << TCR_ORGN0_SHIFT)
 
-#define TCR_SH0_SHIFT                           12ULL
-#define TCR_SH0_NONE                            (0ULL << TCR_SH0_SHIFT)
-#define TCR_SH0_OUTER                           (2ULL << TCR_SH0_SHIFT)
-#define TCR_SH0_INNER                           (3ULL << TCR_SH0_SHIFT)
+#define TCR_SH0_SHIFT           12ULL
+#define TCR_SH0_NONE            (0ULL << TCR_SH0_SHIFT)
+#define TCR_SH0_OUTER           (2ULL << TCR_SH0_SHIFT)
+#define TCR_SH0_INNER           (3ULL << TCR_SH0_SHIFT)
 
-#define TCR_TG0_GRANULE_SHIFT           (14ULL)
+#define TCR_TG0_GRANULE_SHIFT   (14ULL)
 
-#define TCR_TG0_GRANULE_4KB                     (0ULL << TCR_TG0_GRANULE_SHIFT)
-#define TCR_TG0_GRANULE_64KB            (1ULL << TCR_TG0_GRANULE_SHIFT)
-#define TCR_TG0_GRANULE_16KB            (2ULL << TCR_TG0_GRANULE_SHIFT)
+#define TCR_TG0_GRANULE_4KB     (0ULL << TCR_TG0_GRANULE_SHIFT)
+#define TCR_TG0_GRANULE_64KB    (1ULL << TCR_TG0_GRANULE_SHIFT)
+#define TCR_TG0_GRANULE_16KB    (2ULL << TCR_TG0_GRANULE_SHIFT)
 
 #if __ARM_16K_PG__
-#define TCR_TG0_GRANULE_SIZE            (TCR_TG0_GRANULE_16KB)
+#define TCR_TG0_GRANULE_SIZE    (TCR_TG0_GRANULE_16KB)
 #else
-#define TCR_TG0_GRANULE_SIZE            (TCR_TG0_GRANULE_4KB)
+#define TCR_TG0_GRANULE_SIZE    (TCR_TG0_GRANULE_4KB)
 #endif
 
-#define TCR_T1SZ_SHIFT                          16ULL
+#define TCR_T1SZ_SHIFT          16ULL
 
-#define TCR_A1_ASID1                            (1ULL << 22ULL)
-#define TCR_EPD1_TTBR1_DISABLED         (1ULL << 23ULL)
+#define TCR_A1_ASID1            (1ULL << 22ULL)
+#define TCR_EPD1_TTBR1_DISABLED (1ULL << 23ULL)
 
-#define TCR_IRGN1_SHIFT                         24ULL
-#define TCR_IRGN1_DISABLED                      (0ULL << TCR_IRGN1_SHIFT)
-#define TCR_IRGN1_WRITEBACK                     (1ULL << TCR_IRGN1_SHIFT)
-#define TCR_IRGN1_WRITETHRU                     (2ULL << TCR_IRGN1_SHIFT)
-#define TCR_IRGN1_WRITEBACKNO           (3ULL << TCR_IRGN1_SHIFT)
+#define TCR_IRGN1_SHIFT          24ULL
+#define TCR_IRGN1_DISABLED       (0ULL << TCR_IRGN1_SHIFT)
+#define TCR_IRGN1_WRITEBACK      (1ULL << TCR_IRGN1_SHIFT)
+#define TCR_IRGN1_WRITETHRU      (2ULL << TCR_IRGN1_SHIFT)
+#define TCR_IRGN1_WRITEBACKNO    (3ULL << TCR_IRGN1_SHIFT)
 
-#define TCR_ORGN1_SHIFT                         26ULL
-#define TCR_ORGN1_DISABLED                      (0ULL << TCR_ORGN1_SHIFT)
-#define TCR_ORGN1_WRITEBACK                     (1ULL << TCR_ORGN1_SHIFT)
-#define TCR_ORGN1_WRITETHRU                     (2ULL << TCR_ORGN1_SHIFT)
-#define TCR_ORGN1_WRITEBACKNO           (3ULL << TCR_ORGN1_SHIFT)
+#define TCR_ORGN1_SHIFT          26ULL
+#define TCR_ORGN1_DISABLED       (0ULL << TCR_ORGN1_SHIFT)
+#define TCR_ORGN1_WRITEBACK      (1ULL << TCR_ORGN1_SHIFT)
+#define TCR_ORGN1_WRITETHRU      (2ULL << TCR_ORGN1_SHIFT)
+#define TCR_ORGN1_WRITEBACKNO    (3ULL << TCR_ORGN1_SHIFT)
 
-#define TCR_SH1_SHIFT                           28ULL
-#define TCR_SH1_NONE                            (0ULL << TCR_SH1_SHIFT)
-#define TCR_SH1_OUTER                           (2ULL << TCR_SH1_SHIFT)
-#define TCR_SH1_INNER                           (3ULL << TCR_SH1_SHIFT)
+#define TCR_SH1_SHIFT            28ULL
+#define TCR_SH1_NONE             (0ULL << TCR_SH1_SHIFT)
+#define TCR_SH1_OUTER            (2ULL << TCR_SH1_SHIFT)
+#define TCR_SH1_INNER            (3ULL << TCR_SH1_SHIFT)
 
-#define TCR_TG1_GRANULE_SHIFT           30ULL
+#define TCR_TG1_GRANULE_SHIFT    30ULL
 
-#define TCR_TG1_GRANULE_16KB            (1ULL << TCR_TG1_GRANULE_SHIFT)
-#define TCR_TG1_GRANULE_4KB                     (2ULL << TCR_TG1_GRANULE_SHIFT)
-#define TCR_TG1_GRANULE_64KB            (3ULL << TCR_TG1_GRANULE_SHIFT)
+#define TCR_TG1_GRANULE_16KB     (1ULL << TCR_TG1_GRANULE_SHIFT)
+#define TCR_TG1_GRANULE_4KB      (2ULL << TCR_TG1_GRANULE_SHIFT)
+#define TCR_TG1_GRANULE_64KB     (3ULL << TCR_TG1_GRANULE_SHIFT)
 
 #if __ARM_16K_PG__
-#define TCR_TG1_GRANULE_SIZE            (TCR_TG1_GRANULE_16KB)
+#define TCR_TG1_GRANULE_SIZE     (TCR_TG1_GRANULE_16KB)
 #else
-#define TCR_TG1_GRANULE_SIZE            (TCR_TG1_GRANULE_4KB)
+#define TCR_TG1_GRANULE_SIZE     (TCR_TG1_GRANULE_4KB)
 #endif
 
-#define TCR_IPS_SHIFT                           32ULL
-#define TCR_IPS_32BITS                          (0ULL << TCR_IPS_SHIFT)
-#define TCR_IPS_36BITS                          (1ULL << TCR_IPS_SHIFT)
-#define TCR_IPS_40BITS                          (2ULL << TCR_IPS_SHIFT)
-#define TCR_IPS_42BITS                          (3ULL << TCR_IPS_SHIFT)
-#define TCR_IPS_44BITS                          (4ULL << TCR_IPS_SHIFT)
-#define TCR_IPS_48BITS                          (5ULL << TCR_IPS_SHIFT)
-
-#define TCR_AS_16BIT_ASID                       (1ULL << 36)
-#define TCR_TBI0_TOPBYTE_IGNORED        (1ULL << 37)
-#define TCR_TBI1_TOPBYTE_IGNORED        (1ULL << 38)
+#define TCR_IPS_SHIFT            32ULL
+#define TCR_IPS_32BITS           (0ULL << TCR_IPS_SHIFT)
+#define TCR_IPS_36BITS           (1ULL << TCR_IPS_SHIFT)
+#define TCR_IPS_40BITS           (2ULL << TCR_IPS_SHIFT)
+#define TCR_IPS_42BITS           (3ULL << TCR_IPS_SHIFT)
+#define TCR_IPS_44BITS           (4ULL << TCR_IPS_SHIFT)
+#define TCR_IPS_48BITS           (5ULL << TCR_IPS_SHIFT)
+
+#define TCR_AS_16BIT_ASID        (1ULL << 36)
+#define TCR_TBI0_TOPBYTE_IGNORED (1ULL << 37)
+#define TCR_TBI1_TOPBYTE_IGNORED (1ULL << 38)
+#define TCR_TBID0_TBI_DATA_ONLY  (1ULL << 51)
+#define TCR_TBID1_TBI_DATA_ONLY  (1ULL << 52)
+
+#if defined(HAS_APPLE_PAC)
+#define TCR_TBID0_ENABLE         TCR_TBID0_TBI_DATA_ONLY
+#else
+#define TCR_TBID0_ENABLE         0
+#endif
 
 /*
  * Multiprocessor Affinity Register (MPIDR_EL1)
  * +---------------------------------+--+-----+--+-----+----+----+
  *
  * where
- *     U               Uniprocessor
- *     MT              Multi-threading at lowest affinity level
- *     Aff2    "1" - PCORE, "0" - ECORE
- *     Aff1    Cluster ID
- *     Aff0    CPU ID
+ *   U:    Uniprocessor
+ *   MT:   Multi-threading at lowest affinity level
+ *   Aff2: "1" - PCORE, "0" - ECORE
+ *   Aff1: Cluster ID
+ *   Aff0: CPU ID
  */
-#define MPIDR_AFF0_MASK                         0xFF
-#define MPIDR_AFF1_MASK                         0xFF00
-#define MPIDR_AFF1_SHIFT                        8
-#define MPIDR_AFF2_MASK                         0xFF0000
-#define MPIDR_AFF2_SHIFT                        16
-
-/*
- * We currently use a 3 level page table (rather than the full 4
- * level page table).  As a result, we do not have the full 48-bits
- * of address space per TTBR (although the 16KB granule size lets us
- * get very close).
- */
-#if __ARM64_TWO_LEVEL_PMAP__ && !__ARM_16K_PG__
-#error ARM64 does not currently support a 2 level page table with 4KB pages
-#endif /* __ARM64_TWO_LEVEL_PMAP__ */
+#define MPIDR_AFF0_SHIFT 0
+#define MPIDR_AFF0_WIDTH 8
+#define MPIDR_AFF0_MASK  (((1 << MPIDR_AFF0_WIDTH) - 1) << MPIDR_AFF0_SHIFT)
+#define MPIDR_AFF1_SHIFT 8
+#define MPIDR_AFF1_WIDTH 8
+#define MPIDR_AFF1_MASK  (((1 << MPIDR_AFF1_WIDTH) - 1) << MPIDR_AFF1_SHIFT)
+#define MPIDR_AFF2_SHIFT 16
+#define MPIDR_AFF2_WIDTH 8
+#define MPIDR_AFF2_MASK  (((1 << MPIDR_AFF2_WIDTH) - 1) << MPIDR_AFF2_SHIFT)
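
A decoding sketch (assumed helper names, not from this commit): with SHIFT/WIDTH/MASK triplets, extracting an affinity level is a single mask-and-shift.

/* Hedged sketch: pull the affinity fields out of MPIDR_EL1. */
static inline uint32_t mpidr_cpu_id_example(uint64_t mpidr)
{
	return (uint32_t)((mpidr & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT);  /* Aff0: CPU ID */
}

static inline uint32_t mpidr_cluster_id_example(uint64_t mpidr)
{
	return (uint32_t)((mpidr & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT);  /* Aff1: cluster ID */
}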
 
 /*
  * TXSZ indicates the size of the range a TTBR covers.  Currently,
  */
 #endif /* __ARM_KERNEL_PROTECT__ */
 #ifdef __ARM_16K_PG__
-#if __ARM64_TWO_LEVEL_PMAP__
-#define T0SZ_BOOT                                               28ULL
-#elif __ARM64_PMAP_SUBPAGE_L1__
-#define T0SZ_BOOT                                               25ULL
-#else /* __ARM64_TWO_LEVEL_PMAP__ */
-#define T0SZ_BOOT                                               17ULL
-#endif /* __ARM64_TWO_LEVEL_PMAP__ */
+#if __ARM64_PMAP_SUBPAGE_L1__
+#define T0SZ_BOOT 25ULL
+#else /* !__ARM64_PMAP_SUBPAGE_L1__ */
+#define T0SZ_BOOT 17ULL
+#endif /* !__ARM64_PMAP_SUBPAGE_L1__ */
 #else /* __ARM_16K_PG__ */
 #if __ARM64_PMAP_SUBPAGE_L1__
-#define T0SZ_BOOT                                               26ULL
+#define T0SZ_BOOT 26ULL
 #else /* __ARM64_PMAP_SUBPAGE_L1__ */
-#define T0SZ_BOOT                                               25ULL
+#define T0SZ_BOOT 25ULL
 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
 #endif /* __ARM_16K_PG__ */
 
 #if defined(APPLE_ARM64_ARCH_FAMILY)
 /* T0SZ must be the same as T1SZ */
-#define T1SZ_BOOT                                               T0SZ_BOOT
+#define T1SZ_BOOT T0SZ_BOOT
 #else /* defined(APPLE_ARM64_ARCH_FAMILY) */
 #ifdef __ARM_16K_PG__
-#if __ARM64_TWO_LEVEL_PMAP__
-#define T1SZ_BOOT                                               28ULL
-#elif __ARM64_PMAP_SUBPAGE_L1__
-#define T1SZ_BOOT                                               25ULL
-#else /* __ARM64_TWO_LEVEL_PMAP__ */
-#define T1SZ_BOOT                                               17ULL
-#endif /* __ARM64_TWO_LEVEL_PMAP__ */
+#if __ARM64_PMAP_SUBPAGE_L1__
+#define T1SZ_BOOT 25ULL
+#else /* !__ARM64_PMAP_SUBPAGE_L1__ */
+#define T1SZ_BOOT 17ULL
+#endif /* !__ARM64_PMAP_SUBPAGE_L1__ */
 #else /* __ARM_16K_PG__ */
 #if __ARM64_PMAP_SUBPAGE_L1__
-#define T1SZ_BOOT                                               26ULL
+#define T1SZ_BOOT 26ULL
 #else /* __ARM64_PMAP_SUBPAGE_L1__ */
-#define T1SZ_BOOT                                               25ULL
+#define T1SZ_BOOT 25ULL
 #endif /*__ARM64_PMAP_SUBPAGE_L1__*/
 #endif /* __ARM_16K_PG__ */
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
 
-#define TCR_EL1_BASE    (TCR_IPS_40BITS | \
-                                                TCR_SH0_OUTER | TCR_ORGN0_WRITEBACK |  TCR_IRGN0_WRITEBACK | (T0SZ_BOOT << TCR_T0SZ_SHIFT) | (TCR_TG0_GRANULE_SIZE) |\
-                                                TCR_SH1_OUTER | TCR_ORGN1_WRITEBACK |  TCR_IRGN1_WRITEBACK | (TCR_TG1_GRANULE_SIZE))
+#if __ARM_42BIT_PA_SPACE__
+#define TCR_IPS_VALUE TCR_IPS_42BITS
+#else /* !__ARM_42BIT_PA_SPACE__ */
+#define TCR_IPS_VALUE TCR_IPS_40BITS
+#endif /* !__ARM_42BIT_PA_SPACE__ */
+
+#define TCR_EL1_BASE \
+       (TCR_IPS_VALUE | TCR_SH0_OUTER | TCR_ORGN0_WRITEBACK |         \
+        TCR_IRGN0_WRITEBACK | (T0SZ_BOOT << TCR_T0SZ_SHIFT) |          \
+        (TCR_TG0_GRANULE_SIZE) | TCR_SH1_OUTER | TCR_ORGN1_WRITEBACK | \
+        TCR_IRGN1_WRITEBACK | (TCR_TG1_GRANULE_SIZE) |                 \
+        TCR_TBI0_TOPBYTE_IGNORED | (TCR_TBID0_ENABLE))
 
 #if __ARM_KERNEL_PROTECT__
-#define TCR_EL1_BOOT    (TCR_EL1_BASE | \
-                                                (T1SZ_BOOT << TCR_T1SZ_SHIFT) | TCR_TBI0_TOPBYTE_IGNORED)
-#define T1SZ_USER       (T1SZ_BOOT + 1)
-#define TCR_EL1_USER    (TCR_EL1_BASE | (T1SZ_USER << TCR_T1SZ_SHIFT) | TCR_TBI0_TOPBYTE_IGNORED)
+#define TCR_EL1_BOOT (TCR_EL1_BASE | (T1SZ_BOOT << TCR_T1SZ_SHIFT))
+#define T1SZ_USER (T1SZ_BOOT + 1)
+#define TCR_EL1_USER (TCR_EL1_BASE | (T1SZ_USER << TCR_T1SZ_SHIFT))
 #else
-#define TCR_EL1_BOOT    (TCR_EL1_BASE | \
-                                                (T1SZ_BOOT << TCR_T1SZ_SHIFT))
+#define TCR_EL1_BOOT (TCR_EL1_BASE | (T1SZ_BOOT << TCR_T1SZ_SHIFT))
 #endif /* __ARM_KERNEL_PROTECT__ */
 
+
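To make the TxSZ arithmetic concrete (a sketch; the macro below is hypothetical): TxSZ encodes the size of a TTBR's region as 64 minus the VA width, so T0SZ_BOOT = 17ULL (16KB granule, full L1) yields a 47-bit, 128TB region, while the 4KB configurations above yield 38 or 39 bits.

/* Hedged sketch: VA width implied by a TxSZ value. */
#define TCR_EXAMPLE_VA_BITS(tsz) (64ULL - (tsz))
/* e.g. TCR_EXAMPLE_VA_BITS(T0SZ_BOOT) == 47 when T0SZ_BOOT == 17ULL */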
 /*
  * Translation Table Base Register (TTBR)
  *
  * +--------+------------------+------+
  *
  */
-#define TTBR_ASID_SHIFT                 48
-#define TTBR_ASID_MASK                  0xffff000000000000
+#define TTBR_ASID_SHIFT 48
+#define TTBR_ASID_MASK  0xffff000000000000
 
-#define TTBR_BADDR_MASK                 0x0000ffffffffffff
+#define TTBR_BADDR_MASK 0x0000ffffffffffff
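
A composition sketch (assumed helper, not from this commit): a TTBR value packs the 16-bit ASID above the translation table base address.

/* Hedged sketch: build a TTBR value from a table base and an ASID. */
static inline uint64_t ttbr_make_example(uint64_t table_pa, uint16_t asid)
{
	return ((uint64_t)asid << TTBR_ASID_SHIFT) | (table_pa & TTBR_BADDR_MASK);
}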
 
 /*
  * Memory Attribute Indirection Register
  *
  */
 
-#define MAIR_ATTR_SHIFT(x)                              (8*(x))
+#define MAIR_ATTR_SHIFT(x)          (8*(x))
 
 /* Strongly ordered or device memory attributes */
-#define MAIR_OUTER_STRONGLY_ORDERED             0x0
-#define MAIR_OUTER_DEVICE                               0x0
+#define MAIR_OUTER_STRONGLY_ORDERED 0x0
+#define MAIR_OUTER_DEVICE           0x0
 
-#define MAIR_INNER_STRONGLY_ORDERED             0x0
-#define MAIR_INNER_DEVICE                               0x4
+#define MAIR_INNER_STRONGLY_ORDERED 0x0
+#define MAIR_INNER_DEVICE           0x4
 
 /* Normal memory attributes */
-#define MAIR_OUTER_NON_CACHEABLE                0x40
-#define MAIR_OUTER_WRITE_THROUGH                0x80
-#define MAIR_OUTER_WRITE_BACK                   0xc0
+#define MAIR_OUTER_NON_CACHEABLE    0x40
+#define MAIR_OUTER_WRITE_THROUGH    0x80
+#define MAIR_OUTER_WRITE_BACK       0xc0
 
-#define MAIR_INNER_NON_CACHEABLE                0x4
-#define MAIR_INNER_WRITE_THROUGH                0x8
-#define MAIR_INNER_WRITE_BACK                   0xc
+#define MAIR_INNER_NON_CACHEABLE    0x4
+#define MAIR_INNER_WRITE_THROUGH    0x8
+#define MAIR_INNER_WRITE_BACK       0xc
 
 /* Allocate policy for cacheable memory */
-#define MAIR_OUTER_WRITE_ALLOCATE               0x10
-#define MAIR_OUTER_READ_ALLOCATE                0x20
+#define MAIR_OUTER_WRITE_ALLOCATE   0x10
+#define MAIR_OUTER_READ_ALLOCATE    0x20
 
-#define MAIR_INNER_WRITE_ALLOCATE               0x1
-#define MAIR_INNER_READ_ALLOCATE                0x2
+#define MAIR_INNER_WRITE_ALLOCATE   0x1
+#define MAIR_INNER_READ_ALLOCATE    0x2
 
 /* Memory Attribute Encoding */
 
-/* Device memory types:
- *  G (gathering): multiple reads/writes can be combined
- *  R (reordering): reads or writes may reach device out of program order
- *  E (early-acknowledge): writes may return immediately (e.g. PCIe posted writes)
+/*
+ * Device memory types:
+ * G (gathering): multiple reads/writes can be combined
+ * R (reordering): reads or writes may reach device out of program order
+ * E (early-acknowledge): writes may return immediately (e.g. PCIe posted writes)
  */
-#define MAIR_DISABLE            0x00            /* Device Memory, nGnRnE (strongly ordered) */
-#define MAIR_POSTED             0x04            /* Device Memory, nGnRE (strongly ordered, posted writes) */
-#define MAIR_WRITECOMB          0x44            /* Normal Memory, Outer Non-Cacheable, Inner Non-Cacheable */
-#define MAIR_WRITETHRU          0xBB            /* Normal Memory, Outer Write-through, Inner Write-through */
-#define MAIR_WRITEBACK          0xFF            /* Normal Memory, Outer Write-back, Inner Write-back */
-#define MAIR_INNERWRITEBACK     0x4F            /* Normal Memory, Outer Non-Cacheable, Inner Write-back */
+#define MAIR_DISABLE                   0x00 /* Device Memory, nGnRnE (strongly ordered) */
+#define MAIR_POSTED                    0x04 /* Device Memory, nGnRE (strongly ordered, posted writes) */
+#define MAIR_POSTED_REORDERED          0x08 /* Device Memory, nGRE (reorderable, posted writes) */
+#define MAIR_POSTED_COMBINED_REORDERED 0x0C /* Device Memory, GRE (reorderable, gathered writes, posted writes) */
+#define MAIR_WRITECOMB                 0x44 /* Normal Memory, Outer Non-Cacheable, Inner Non-Cacheable */
+#define MAIR_WRITETHRU                 0xBB /* Normal Memory, Outer Write-through, Inner Write-through */
+#define MAIR_WRITEBACK                 0xFF /* Normal Memory, Outer Write-back, Inner Write-back */
+#define MAIR_INNERWRITEBACK            0x4F /* Normal Memory, Outer Non-Cacheable, Inner Write-back */
 
 
 /*
- *     ARM 4-level Page Table support - 2*1024TB (2^48) of address space
+ * ARM 4-level Page Table support - 2 x 256TB (2^48 per TTBR) of address space
  */
 
 
 /*
  *  Memory Attribute Index
  */
-#define CACHE_ATTRINDX_WRITEBACK                0x0     /* cache enabled, buffer enabled */
-#define CACHE_ATTRINDX_WRITECOMB                0x1     /* no cache, buffered writes */
-#define CACHE_ATTRINDX_WRITETHRU                0x2     /* cache enabled, buffer disabled */
-#define CACHE_ATTRINDX_DISABLE                  0x3     /* no cache, no buffer */
-#define CACHE_ATTRINDX_INNERWRITEBACK           0x4     /* inner cache enabled, buffer enabled, write allocate */
-#define CACHE_ATTRINDX_POSTED                   0x5     /* no cache, no buffer, posted writes */
-#define CACHE_ATTRINDX_DEFAULT                  CACHE_ATTRINDX_WRITEBACK
+#define CACHE_ATTRINDX_WRITEBACK                 0x0 /* cache enabled, buffer enabled  (normal memory) */
+#define CACHE_ATTRINDX_WRITECOMB                 0x1 /* no cache, buffered writes (normal memory) */
+#define CACHE_ATTRINDX_WRITETHRU                 0x2 /* cache enabled, buffer disabled (normal memory) */
+#define CACHE_ATTRINDX_DISABLE                   0x3 /* no cache, no buffer (device memory) */
+#define CACHE_ATTRINDX_INNERWRITEBACK            0x4 /* inner cache enabled, buffer enabled, write allocate (normal memory) */
+#define CACHE_ATTRINDX_POSTED                    0x5 /* no cache, no buffer, posted writes (device memory) */
+#define CACHE_ATTRINDX_POSTED_REORDERED          0x6 /* no cache, reorderable access, posted writes (device memory) */
+#define CACHE_ATTRINDX_POSTED_COMBINED_REORDERED 0x7 /* no cache, write gathering, reorderable access, posted writes (device memory) */
+#define CACHE_ATTRINDX_DEFAULT                   CACHE_ATTRINDX_WRITEBACK
+
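These indices only mean something once MAIR_EL1 is programmed so that each index selects the matching attribute byte. A hedged sketch of that pairing follows (the macro name is hypothetical, and this is not necessarily the exact value xnu programs):

/* Hedged sketch: place one attribute byte per index via MAIR_ATTR_SHIFT(). */
#define MAIR_EL1_EXAMPLE \
	(((uint64_t)MAIR_WRITEBACK << MAIR_ATTR_SHIFT(CACHE_ATTRINDX_WRITEBACK)) | \
	 ((uint64_t)MAIR_WRITECOMB << MAIR_ATTR_SHIFT(CACHE_ATTRINDX_WRITECOMB)) | \
	 ((uint64_t)MAIR_WRITETHRU << MAIR_ATTR_SHIFT(CACHE_ATTRINDX_WRITETHRU)) | \
	 ((uint64_t)MAIR_DISABLE   << MAIR_ATTR_SHIFT(CACHE_ATTRINDX_DISABLE)))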
 
 /*
- *      Access protection bit values (TTEs and PTEs)
+ * Access protection bit values (TTEs and PTEs), stage 1
+ *
+ * Bit 1 controls access type (1=RO, 0=RW), bit 0 controls user (1=access, 0=no access)
  */
-#define AP_RWNA                                                 0x0     /* priv=read-write, user=no-access */
-#define AP_RWRW                                                 0x1     /* priv=read-write, user=read-write */
-#define AP_RONA                                                 0x2     /* priv=read-only, user=no-access */
-#define AP_RORO                                                 0x3     /* priv=read-only, user=read-only */
-#define AP_MASK                                                 0x3     /* mask to find ap bits */
+#define AP_RWNA 0x0 /* priv=read-write, user=no-access */
+#define AP_RWRW 0x1 /* priv=read-write, user=read-write */
+#define AP_RONA 0x2 /* priv=read-only, user=no-access */
+#define AP_RORO 0x3 /* priv=read-only, user=read-only */
+#define AP_MASK 0x3 /* mask to find ap bits */
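
Given that bit layout, permission checks reduce to bit tests; a sketch (assumed helper names):

/* Hedged sketch: decode stage-1 AP bits. */
static inline int ap_user_accessible_example(uint64_t ap)
{
	return ((ap & AP_MASK) & 0x1) != 0;  /* bit 0: EL0 may access */
}

static inline int ap_read_only_example(uint64_t ap)
{
	return ((ap & AP_MASK) & 0x2) != 0;  /* bit 1: read-only */
}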
 
 /*
  * Shareability attributes
  */
-#define SH_NONE                                                 0x0     /* Non shareable  */
-#define SH_NONE                                                 0x0     /* Device shareable */
-#define SH_DEVICE                                               0x2     /* Normal memory Inner non shareable - Outer non shareable */
-#define SH_OUTER_MEMORY                                 0x2 /* Normal memory Inner shareable - Outer shareable */
-#define SH_INNER_MEMORY                                 0x3 /* Normal memory Inner shareable - Outer non shareable */
+#define SH_NONE         0x0 /* Non shareable (also used for device memory) */
+#define SH_DEVICE       0x2 /* Normal memory Inner non shareable - Outer non shareable */
+#define SH_OUTER_MEMORY 0x2 /* Normal memory Inner shareable - Outer shareable */
+#define SH_INNER_MEMORY 0x3 /* Normal memory Inner shareable - Outer non shareable */
 
 
 /*
  * ARM Page Granule
  */
-#ifdef  __ARM_16K_PG__
+#ifdef __ARM_16K_PG__
 #define ARM_PGSHIFT 14
 #else
 #define ARM_PGSHIFT 12
 #define ARM_PGBYTES (1 << ARM_PGSHIFT)
 #define ARM_PGMASK  (ARM_PGBYTES-1)
 
-
 /*
  *  L0 Translation table
  *
  *    Covers 256TB (2^48) of address space.
  */
 
-#ifdef __ARM_16K_PG__
-#define ARM_TT_L0_SIZE                                  0x0000800000000000ULL           /* size of area covered by a tte */
-#define ARM_TT_L0_OFFMASK                               0x00007fffffffffffULL           /* offset within an L0 entry */
-#define ARM_TT_L0_SHIFT                                 47                                                      /* page descriptor shift */
-#define ARM_TT_L0_INDEX_MASK                    0x0000800000000000ULL           /* mask for getting index in L0 table from virtual address */
-#else
-#define ARM_TT_L0_SIZE                                  0x0000008000000000ULL           /* size of area covered by a tte */
-#define ARM_TT_L0_OFFMASK                               0x0000007fffffffffULL           /* offset within an L0 entry */
-#define ARM_TT_L0_SHIFT                                 39                                                      /* page descriptor shift */
-#define ARM_TT_L0_INDEX_MASK                    0x0000ff8000000000ULL           /* mask for getting index in L0 table from virtual address */
-#endif
+/* 16K L0 */
+#define ARM_16K_TT_L0_SIZE       0x0000800000000000ULL /* size of area covered by a tte */
+#define ARM_16K_TT_L0_OFFMASK    0x00007fffffffffffULL /* offset within an L0 entry */
+#define ARM_16K_TT_L0_SHIFT      47                    /* page descriptor shift */
+#define ARM_16K_TT_L0_INDEX_MASK 0x0000800000000000ULL /* mask for getting index in L0 table from virtual address */
+
+/* 4K L0 */
+#define ARM_4K_TT_L0_SIZE       0x0000008000000000ULL /* size of area covered by a tte */
+#define ARM_4K_TT_L0_OFFMASK    0x0000007fffffffffULL /* offset within an L0 entry */
+#define ARM_4K_TT_L0_SHIFT      39                    /* page descriptor shift */
+#define ARM_4K_TT_L0_INDEX_MASK 0x0000ff8000000000ULL /* mask for getting index in L0 table from virtual address */
 
 /*
  *  L1 Translation table
  *    Covers 128TB (2^47) of address space.
  */
 
-#ifdef __ARM_16K_PG__
-#define ARM_TT_L1_SIZE                                  0x0000001000000000ULL           /* size of area covered by a tte */
-#define ARM_TT_L1_OFFMASK                               0x0000000fffffffffULL           /* offset within an L1 entry */
-#define ARM_TT_L1_SHIFT                                 36                                                      /* page descriptor shift */
+/* 16K L1 */
+#define ARM_16K_TT_L1_SIZE       0x0000001000000000ULL /* size of area covered by a tte */
+#define ARM_16K_TT_L1_OFFMASK    0x0000000fffffffffULL /* offset within an L1 entry */
+#define ARM_16K_TT_L1_SHIFT      36                    /* page descriptor shift */
 #ifdef __ARM64_PMAP_SUBPAGE_L1__
 /* This config supports 512GB per TTBR. */
-#define ARM_TT_L1_INDEX_MASK                    0x0000007000000000ULL           /* mask for getting index into L1 table from virtual address */
+#define ARM_16K_TT_L1_INDEX_MASK 0x0000007000000000ULL /* mask for getting index into L1 table from virtual address */
 #else /* __ARM64_PMAP_SUBPAGE_L1__ */
-#define ARM_TT_L1_INDEX_MASK                    0x00007ff000000000ULL           /* mask for getting index into L1 table from virtual address */
+#define ARM_16K_TT_L1_INDEX_MASK 0x00007ff000000000ULL /* mask for getting index into L1 table from virtual address */
 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
-#else /* __ARM_16K_PG__ */
-#define ARM_TT_L1_SIZE                                  0x0000000040000000ULL           /* size of area covered by a tte */
-#define ARM_TT_L1_OFFMASK                               0x000000003fffffffULL           /* offset within an L1 entry */
-#define ARM_TT_L1_SHIFT                                 30                                                      /* page descriptor shift */
+
+/* 4K L1 */
+#define ARM_4K_TT_L1_SIZE       0x0000000040000000ULL /* size of area covered by a tte */
+#define ARM_4K_TT_L1_OFFMASK    0x000000003fffffffULL /* offset within an L1 entry */
+#define ARM_4K_TT_L1_SHIFT      30                    /* page descriptor shift */
 #ifdef __ARM64_PMAP_SUBPAGE_L1__
 /* This config supports 256GB per TTBR. */
-#define ARM_TT_L1_INDEX_MASK                    0x0000003fc0000000ULL           /* mask for getting index into L1 table from virtual address */
+#define ARM_4K_TT_L1_INDEX_MASK 0x0000003fc0000000ULL /* mask for getting index into L1 table from virtual address */
 #else /* __ARM64_PMAP_SUBPAGE_L1__ */
-#define ARM_TT_L1_INDEX_MASK                    0x0000007fc0000000ULL           /* mask for getting index into L1 table from virtual address */
+#define ARM_4K_TT_L1_INDEX_MASK 0x0000007fc0000000ULL /* mask for getting index into L1 table from virtual address */
 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
-#endif
 
 /* some sugar for getting pointers to page tables and entries */
 
 #define L2_TABLE_INDEX(va) (((va) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)
 #define L3_TABLE_INDEX(va) (((va) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)
 
-#define L2_TABLE_VA(tte) ((tt_entry_t*) phystokv((*(tte)) & ARM_TTE_TABLE_MASK))
+#define L2_TABLE_VA(tte)  ((tt_entry_t*) phystokv((*(tte)) & ARM_TTE_TABLE_MASK))
 #define L3_TABLE_VA(tte2) ((pt_entry_t*) phystokv((*(tte2)) & ARM_TTE_TABLE_MASK))
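
A usage sketch (assumed helper; tt_entry_t, pt_entry_t and phystokv() are the kernel's own types and helpers): chaining the sugar above walks one level of the table.

/* Hedged sketch: locate the L3 PTE mapping 'va' given its L2 table entry. */
static inline pt_entry_t *l3_pte_for_va_example(tt_entry_t *l2_tte, uint64_t va)
{
	pt_entry_t *l3_table = L3_TABLE_VA(l2_tte);  /* phys pointer -> kernel VA */
	return l3_table + L3_TABLE_INDEX(va);        /* index by VA bits 20:12 (4K) or 24:14 (16K) */
}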
 
 /*
  *    Covers 64GB (2^36) of address space.
  */
 
-#ifdef __ARM_16K_PG__
-#define ARM_TT_L2_SIZE                                  0x0000000002000000ULL           /* size of area covered by a tte */
-#define ARM_TT_L2_OFFMASK                               0x0000000001ffffffULL           /* offset within an L2 entry */
-#define ARM_TT_L2_SHIFT                                 25                                                      /* page descriptor shift */
-#define ARM_TT_L2_INDEX_MASK                    0x0000000ffe000000ULL           /* mask for getting index in L2 table from virtual address */
-#else
-#define ARM_TT_L2_SIZE                                  0x0000000000200000ULL           /* size of area covered by a tte */
-#define ARM_TT_L2_OFFMASK                               0x00000000001fffffULL           /* offset within an L2 entry */
-#define ARM_TT_L2_SHIFT                                 21                                                      /* page descriptor shift */
-#define ARM_TT_L2_INDEX_MASK                    0x000000003fe00000ULL           /* mask for getting index in L2 table from virtual address */
-#endif
+/* 16K L2 */
+#define ARM_16K_TT_L2_SIZE       0x0000000002000000ULL /* size of area covered by a tte */
+#define ARM_16K_TT_L2_OFFMASK    0x0000000001ffffffULL /* offset within an L2 entry */
+#define ARM_16K_TT_L2_SHIFT      25                    /* page descriptor shift */
+#define ARM_16K_TT_L2_INDEX_MASK 0x0000000ffe000000ULL /* mask for getting index in L2 table from virtual address */
+
+/* 4K L2 */
+#define ARM_4K_TT_L2_SIZE       0x0000000000200000ULL /* size of area covered by a tte */
+#define ARM_4K_TT_L2_OFFMASK    0x00000000001fffffULL /* offset within an L2 entry */
+#define ARM_4K_TT_L2_SHIFT      21                    /* page descriptor shift */
+#define ARM_4K_TT_L2_INDEX_MASK 0x000000003fe00000ULL /* mask for getting index in L2 table from virtual address */
 
 /*
  *  L3 Translation table
  *    Covers 32MB (2^25) of address space.
  */
 
+/* 16K L3 */
+#define ARM_16K_TT_L3_SIZE       0x0000000000004000ULL /* size of area covered by a tte */
+#define ARM_16K_TT_L3_OFFMASK    0x0000000000003fffULL /* offset within L3 PTE */
+#define ARM_16K_TT_L3_SHIFT      14                    /* page descriptor shift */
+#define ARM_16K_TT_L3_INDEX_MASK 0x0000000001ffc000ULL /* mask for page descriptor index */
+
+/* 4K L3 */
+#define ARM_4K_TT_L3_SIZE       0x0000000000001000ULL /* size of area covered by a tte */
+#define ARM_4K_TT_L3_OFFMASK    0x0000000000000fffULL /* offset within L3 PTE */
+#define ARM_4K_TT_L3_SHIFT      12                    /* page descriptor shift */
+#define ARM_4K_TT_L3_INDEX_MASK 0x00000000001ff000ULL /* mask for page descriptor index */
+
 #ifdef __ARM_16K_PG__
-#define ARM_TT_L3_SIZE                                  0x0000000000004000ULL           /* size of area covered by a tte */
-#define ARM_TT_L3_OFFMASK                               0x0000000000003fffULL           /* offset within L3 PTE */
-#define ARM_TT_L3_SHIFT                                 14                                                      /* page descriptor shift */
-#define ARM_TT_L3_INDEX_MASK                    0x0000000001ffc000ULL           /* mask for page descriptor index */
-#else
-#define ARM_TT_L3_SIZE                                  0x0000000000001000ULL           /* size of area covered by a tte */
-#define ARM_TT_L3_OFFMASK                               0x0000000000000fffULL           /* offset within L3 PTE */
-#define ARM_TT_L3_SHIFT                                 12                                                      /* page descriptor shift */
-#define ARM_TT_L3_INDEX_MASK                    0x00000000001ff000ULL           /* mask for page descriptor index */
-#endif
+
+/* Native L0 defines */
+#define ARM_TT_L0_SIZE       ARM_16K_TT_L0_SIZE
+#define ARM_TT_L0_OFFMASK    ARM_16K_TT_L0_OFFMASK
+#define ARM_TT_L0_SHIFT      ARM_16K_TT_L0_SHIFT
+#define ARM_TT_L0_INDEX_MASK ARM_16K_TT_L0_INDEX_MASK
+
+/* Native L1 defines */
+#define ARM_TT_L1_SIZE       ARM_16K_TT_L1_SIZE
+#define ARM_TT_L1_OFFMASK    ARM_16K_TT_L1_OFFMASK
+#define ARM_TT_L1_SHIFT      ARM_16K_TT_L1_SHIFT
+#define ARM_TT_L1_INDEX_MASK ARM_16K_TT_L1_INDEX_MASK
+
+/* Native L2 defines */
+#define ARM_TT_L2_SIZE       ARM_16K_TT_L2_SIZE
+#define ARM_TT_L2_OFFMASK    ARM_16K_TT_L2_OFFMASK
+#define ARM_TT_L2_SHIFT      ARM_16K_TT_L2_SHIFT
+#define ARM_TT_L2_INDEX_MASK ARM_16K_TT_L2_INDEX_MASK
+
+/* Native L3 defines */
+#define ARM_TT_L3_SIZE       ARM_16K_TT_L3_SIZE
+#define ARM_TT_L3_OFFMASK    ARM_16K_TT_L3_OFFMASK
+#define ARM_TT_L3_SHIFT      ARM_16K_TT_L3_SHIFT
+#define ARM_TT_L3_INDEX_MASK ARM_16K_TT_L3_INDEX_MASK
+
+#else /* !__ARM_16K_PG__ */
+
+/* Native L0 defines */
+#define ARM_TT_L0_SIZE       ARM_4K_TT_L0_SIZE
+#define ARM_TT_L0_OFFMASK    ARM_4K_TT_L0_OFFMASK
+#define ARM_TT_L0_SHIFT      ARM_4K_TT_L0_SHIFT
+#define ARM_TT_L0_INDEX_MASK ARM_4K_TT_L0_INDEX_MASK
+
+/* Native L1 defines */
+#define ARM_TT_L1_SIZE       ARM_4K_TT_L1_SIZE
+#define ARM_TT_L1_OFFMASK    ARM_4K_TT_L1_OFFMASK
+#define ARM_TT_L1_SHIFT      ARM_4K_TT_L1_SHIFT
+#define ARM_TT_L1_INDEX_MASK ARM_4K_TT_L1_INDEX_MASK
+
+/* Native L2 defines */
+#define ARM_TT_L2_SIZE       ARM_4K_TT_L2_SIZE
+#define ARM_TT_L2_OFFMASK    ARM_4K_TT_L2_OFFMASK
+#define ARM_TT_L2_SHIFT      ARM_4K_TT_L2_SHIFT
+#define ARM_TT_L2_INDEX_MASK ARM_4K_TT_L2_INDEX_MASK
+
+/* Native L3 defines */
+#define ARM_TT_L3_SIZE       ARM_4K_TT_L3_SIZE
+#define ARM_TT_L3_OFFMASK    ARM_4K_TT_L3_OFFMASK
+#define ARM_TT_L3_SHIFT      ARM_4K_TT_L3_SHIFT
+#define ARM_TT_L3_INDEX_MASK ARM_4K_TT_L3_INDEX_MASK
+
+#endif /* !__ARM_16K_PG__ */
 
 /*
  * Convenience definitions for:
  *
  *   My apologies to any botanists who may be reading this.
  */
-#define ARM_TT_LEAF_SIZE                                ARM_TT_L3_SIZE
-#define ARM_TT_LEAF_OFFMASK                             ARM_TT_L3_OFFMASK
-#define ARM_TT_LEAF_SHIFT                               ARM_TT_L3_SHIFT
-#define ARM_TT_LEAF_INDEX_MASK                  ARM_TT_L3_INDEX_MASK
-
-#define ARM_TT_TWIG_SIZE                                ARM_TT_L2_SIZE
-#define ARM_TT_TWIG_OFFMASK                             ARM_TT_L2_OFFMASK
-#define ARM_TT_TWIG_SHIFT                               ARM_TT_L2_SHIFT
-#define ARM_TT_TWIG_INDEX_MASK                  ARM_TT_L2_INDEX_MASK
-
-#if __ARM64_TWO_LEVEL_PMAP__
-#define ARM_TT_ROOT_SIZE                                ARM_TT_L2_SIZE
-#define ARM_TT_ROOT_OFFMASK                             ARM_TT_L2_OFFMASK
-#define ARM_TT_ROOT_SHIFT                               ARM_TT_L2_SHIFT
-#define ARM_TT_ROOT_INDEX_MASK                  ARM_TT_L2_INDEX_MASK
-#else
-#define ARM_TT_ROOT_SIZE                                ARM_TT_L1_SIZE
-#define ARM_TT_ROOT_OFFMASK                             ARM_TT_L1_OFFMASK
-#define ARM_TT_ROOT_SHIFT                               ARM_TT_L1_SHIFT
-#define ARM_TT_ROOT_INDEX_MASK                  ARM_TT_L1_INDEX_MASK
-#endif
+#define ARM_TT_LEAF_SIZE       ARM_TT_L3_SIZE
+#define ARM_TT_LEAF_OFFMASK    ARM_TT_L3_OFFMASK
+#define ARM_TT_LEAF_SHIFT      ARM_TT_L3_SHIFT
+#define ARM_TT_LEAF_INDEX_MASK ARM_TT_L3_INDEX_MASK
+
+#define ARM_TT_TWIG_SIZE       ARM_TT_L2_SIZE
+#define ARM_TT_TWIG_OFFMASK    ARM_TT_L2_OFFMASK
+#define ARM_TT_TWIG_SHIFT      ARM_TT_L2_SHIFT
+#define ARM_TT_TWIG_INDEX_MASK ARM_TT_L2_INDEX_MASK
+
+#define ARM_TT_ROOT_SIZE       ARM_TT_L1_SIZE
+#define ARM_TT_ROOT_OFFMASK    ARM_TT_L1_OFFMASK
+#define ARM_TT_ROOT_SHIFT      ARM_TT_L1_SHIFT
+#define ARM_TT_ROOT_INDEX_MASK ARM_TT_L1_INDEX_MASK
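
A splitting sketch using the level-independent aliases (the macro names below are hypothetical):

/* Hedged sketch: decompose a VA into root/twig/leaf table indices. */
#define VA_ROOT_INDEX_EXAMPLE(va) (((va) & ARM_TT_ROOT_INDEX_MASK) >> ARM_TT_ROOT_SHIFT)
#define VA_TWIG_INDEX_EXAMPLE(va) (((va) & ARM_TT_TWIG_INDEX_MASK) >> ARM_TT_TWIG_SHIFT)
#define VA_LEAF_INDEX_EXAMPLE(va) (((va) & ARM_TT_LEAF_INDEX_MASK) >> ARM_TT_LEAF_SHIFT)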
 
 /*
  * 4KB granule size:
  * +-----+------+--+---+----+------+----------------------+------+--+--+----+----+--+-------+-+-+
  *
  * where:
- *     'nG'            notGlobal bit
- *     'SH'            Shareability field
- *     'AP'            access protection
- *     'XN'            eXecute Never bit
- *     'PXN'           Privilege eXecute Never bit
- *     'NS'            Non-Secure bit
- *     'HINT'          16 entry continuguous output hint
- *     'AttrIdx'       Memory Attribute Index
+ *   nG:      notGlobal bit
+ *   SH:      Shareability field
+ *   AP:      access protection
+ *   XN:      eXecute Never bit
+ *   PXN:     Privilege eXecute Never bit
+ *   NS:      Non-Secure bit
+ *   HINT:    16-entry contiguous output hint
+ *   AttrIdx: Memory Attribute Index
  */
 
-#define TTE_SHIFT                                       3                                               /* shift width of a tte (sizeof(tte) == (1 << TTE_SHIFT)) */
+#define TTE_SHIFT                   3                              /* shift width of a tte (sizeof(tte) == (1 << TTE_SHIFT)) */
 #ifdef __ARM_16K_PG__
-#define TTE_PGENTRIES                           (16384 >> TTE_SHIFT)    /* number of ttes per page */
+#define TTE_PGENTRIES               (16384 >> TTE_SHIFT)           /* number of ttes per page */
 #else
-#define TTE_PGENTRIES                           (4096 >> TTE_SHIFT)             /* number of ttes per page */
+#define TTE_PGENTRIES               (4096 >> TTE_SHIFT)            /* number of ttes per page */
 #endif
 
-#define ARM_TTE_MAX                                     (TTE_PGENTRIES)
+#define ARM_TTE_MAX                 (TTE_PGENTRIES)
 
-#define ARM_TTE_EMPTY                           0x0000000000000000ULL   /* unasigned - invalid entry */
-#define ARM_TTE_TYPE_FAULT                      0x0000000000000000ULL   /* unasigned - invalid entry */
+#define ARM_TTE_EMPTY               0x0000000000000000ULL          /* unassigned - invalid entry */
+#define ARM_TTE_TYPE_FAULT          0x0000000000000000ULL          /* unassigned - invalid entry */
 
-#define ARM_TTE_VALID                           0x0000000000000001ULL   /* valid entry */
+#define ARM_TTE_VALID               0x0000000000000001ULL          /* valid entry */
 
-#define ARM_TTE_TYPE_MASK                       0x0000000000000002ULL   /* mask for extracting the type */
-#define ARM_TTE_TYPE_TABLE                      0x0000000000000002ULL   /* page table type */
-#define ARM_TTE_TYPE_BLOCK                      0x0000000000000000ULL   /* block entry type */
-#define ARM_TTE_TYPE_L3BLOCK            0x0000000000000002ULL
-#define ARM_TTE_TYPE_MASK                       0x0000000000000002ULL   /* mask for extracting the type */
+#define ARM_TTE_TYPE_MASK           0x0000000000000002ULL          /* mask for extracting the type */
+#define ARM_TTE_TYPE_TABLE          0x0000000000000002ULL          /* page table type */
+#define ARM_TTE_TYPE_BLOCK          0x0000000000000000ULL          /* block entry type */
+#define ARM_TTE_TYPE_L3BLOCK        0x0000000000000002ULL
 
 #ifdef __ARM_16K_PG__
-/* Note that L0/L1 block entries are disallowed for the 16KB granule size; what are we doing with these? */
-#define ARM_TTE_BLOCK_SHIFT                     12                                              /* entry shift for a 16KB L3 TTE entry */
-#define ARM_TTE_BLOCK_L0_SHIFT          ARM_TT_L0_SHIFT                 /* block shift for 128TB section */
-#define ARM_TTE_BLOCK_L1_MASK           0x0000fff000000000ULL   /* mask to extract phys address from L1 block entry */
-#define ARM_TTE_BLOCK_L1_SHIFT          ARM_TT_L1_SHIFT                 /* block shift for 64GB section */
-#define ARM_TTE_BLOCK_L2_MASK           0x0000fffffe000000ULL   /* mask to extract phys address from Level 2 Translation Block entry */
-#define ARM_TTE_BLOCK_L2_SHIFT          ARM_TT_L2_SHIFT                 /* block shift for 32MB section */
+/*
+ * Note that L0/L1 block entries are disallowed for the 16KB granule size; what
+ * are we doing with these?
+ */
+#define ARM_TTE_BLOCK_SHIFT         12                             /* entry shift for a 16KB L3 TTE entry */
+#define ARM_TTE_BLOCK_L0_SHIFT      ARM_TT_L0_SHIFT                /* block shift for 128TB section */
+#define ARM_TTE_BLOCK_L1_MASK       0x0000fff000000000ULL          /* mask to extract phys address from L1 block entry */
+#define ARM_TTE_BLOCK_L1_SHIFT      ARM_TT_L1_SHIFT                /* block shift for 64GB section */
+#define ARM_TTE_BLOCK_L2_MASK       0x0000fffffe000000ULL          /* mask to extract phys address from Level 2 Translation Block entry */
+#define ARM_TTE_BLOCK_L2_SHIFT      ARM_TT_L2_SHIFT                /* block shift for 32MB section */
 #else
-#define ARM_TTE_BLOCK_SHIFT                     12                                              /* entry shift for a 4KB L3 TTE entry */
-#define ARM_TTE_BLOCK_L0_SHIFT          ARM_TT_L0_SHIFT                 /* block shift for 2048GB section */
-#define ARM_TTE_BLOCK_L1_MASK           0x0000ffffc0000000ULL   /* mask to extract phys address from L1 block entry */
-#define ARM_TTE_BLOCK_L1_SHIFT          ARM_TT_L1_SHIFT                 /* block shift for 1GB section */
-#define ARM_TTE_BLOCK_L2_MASK           0x0000ffffffe00000ULL   /* mask to extract phys address from Level 2 Translation Block entry */
-#define ARM_TTE_BLOCK_L2_SHIFT          ARM_TT_L2_SHIFT                 /* block shift for 2MB section */
+#define ARM_TTE_BLOCK_SHIFT         12                             /* entry shift for a 4KB L3 TTE entry */
+#define ARM_TTE_BLOCK_L0_SHIFT      ARM_TT_L0_SHIFT                /* block shift for 2048GB section */
+#define ARM_TTE_BLOCK_L1_MASK       0x0000ffffc0000000ULL          /* mask to extract phys address from L1 block entry */
+#define ARM_TTE_BLOCK_L1_SHIFT      ARM_TT_L1_SHIFT                /* block shift for 1GB section */
+#define ARM_TTE_BLOCK_L2_MASK       0x0000ffffffe00000ULL          /* mask to extract phys address from Level 2 Translation Block entry */
+#define ARM_TTE_BLOCK_L2_SHIFT      ARM_TT_L2_SHIFT                /* block shift for 2MB section */
 #endif
 
-#define ARM_TTE_BLOCK_APSHIFT           6
-#define ARM_TTE_BLOCK_AP(x)                     ((x)<<ARM_TTE_BLOCK_APSHIFT) /* access protection */
-#define ARM_TTE_BLOCK_APMASK            (0x3 << ARM_TTE_BLOCK_APSHIFT)
+#define ARM_TTE_BLOCK_APSHIFT       6
+#define ARM_TTE_BLOCK_AP(x)         ((x)<<ARM_TTE_BLOCK_APSHIFT)   /* access protection */
+#define ARM_TTE_BLOCK_APMASK        (0x3 << ARM_TTE_BLOCK_APSHIFT)
 
-#define ARM_TTE_BLOCK_ATTRINDX(x)       ((x) << 2)                              /* memory attributes index */
-#define ARM_TTE_BLOCK_ATTRINDXMASK      (0x7ULL << 2)                   /* mask memory attributes index */
+#define ARM_TTE_BLOCK_ATTRINDX(x)   ((x) << 2)                     /* memory attributes index */
+#define ARM_TTE_BLOCK_ATTRINDXMASK  (0x7ULL << 2)                  /* mask memory attributes index */
 
-#define ARM_TTE_BLOCK_SH(x)                     ((x) << 8)                              /* access shared */
-#define ARM_TTE_BLOCK_SHMASK            (0x3ULL << 8)                   /* mask access shared */
+#define ARM_TTE_BLOCK_SH(x)         ((x) << 8)                     /* access shared */
+#define ARM_TTE_BLOCK_SHMASK        (0x3ULL << 8)                  /* mask access shared */
 
-#define ARM_TTE_BLOCK_AF                        0x0000000000000400ULL   /* value for access */
-#define ARM_TTE_BLOCK_AFMASK            0x0000000000000400ULL   /* access mask */
+#define ARM_TTE_BLOCK_AF            0x0000000000000400ULL          /* value for access */
+#define ARM_TTE_BLOCK_AFMASK        0x0000000000000400ULL          /* access mask */
 
-#define ARM_TTE_BLOCK_NG                        0x0000000000000800ULL   /* value for a global mapping */
-#define ARM_TTE_BLOCK_NG_MASK           0x0000000000000800ULL   /* notGlobal mapping mask */
+#define ARM_TTE_BLOCK_NG            0x0000000000000800ULL          /* value for a not-global (nG) mapping */
+#define ARM_TTE_BLOCK_NG_MASK       0x0000000000000800ULL          /* notGlobal mapping mask */
 
-#define ARM_TTE_BLOCK_NS                        0x0000000000000020ULL   /* value for a secure mapping */
-#define ARM_TTE_BLOCK_NS_MASK           0x0000000000000020ULL   /* notSecure mapping mask */
+#define ARM_TTE_BLOCK_NS            0x0000000000000020ULL          /* value for a non-secure (NS) mapping */
+#define ARM_TTE_BLOCK_NS_MASK       0x0000000000000020ULL          /* notSecure mapping mask */
 
-#define ARM_TTE_BLOCK_PNX                       0x0020000000000000ULL   /* value for privilege no execute bit */
-#define ARM_TTE_BLOCK_PNXMASK           0x0020000000000000ULL   /* privilege no execute mask */
+#define ARM_TTE_BLOCK_PNX           0x0020000000000000ULL          /* value for privilege no execute bit */
+#define ARM_TTE_BLOCK_PNXMASK       0x0020000000000000ULL          /* privilege no execute mask */
 
-#define ARM_TTE_BLOCK_NX                        0x0040000000000000ULL   /* value for no execute */
-#define ARM_TTE_BLOCK_NXMASK            0x0040000000000000ULL   /* no execute mask */
+#define ARM_TTE_BLOCK_NX            0x0040000000000000ULL          /* value for no execute */
+#define ARM_TTE_BLOCK_NXMASK        0x0040000000000000ULL          /* no execute mask */
 
-#define ARM_TTE_BLOCK_WIRED                     0x0080000000000000ULL   /* value for software wired bit */
-#define ARM_TTE_BLOCK_WIREDMASK         0x0080000000000000ULL   /* software wired mask */
+#define ARM_TTE_BLOCK_WIRED         0x0400000000000000ULL          /* value for software wired bit */
+#define ARM_TTE_BLOCK_WIREDMASK     0x0400000000000000ULL          /* software wired mask */
 
-#define ARM_TTE_BLOCK_WRITEABLE         0x0100000000000000ULL   /* value for software writeable bit */
-#define ARM_TTE_BLOCK_WRITEABLEMASK     0x0100000000000000ULL   /* software writeable mask */
+#define ARM_TTE_BLOCK_WRITEABLE     0x0800000000000000ULL          /* value for software writeable bit */
+#define ARM_TTE_BLOCK_WRITEABLEMASK 0x0800000000000000ULL          /* software writeable mask */
 
 #ifdef __ARM_16K_PG__
 /*
- * TODO: Do we care about the low bits being unused?  It should technically work either way, but masking them out should be future proof;
- * it is only a matter of time before someone wants to shove something into the free bits.
+ * TODO: Do we care about the low bits being unused?  It should technically
+ * work either way, but masking them out should be future proof; it is only a
+ * matter of time before someone wants to shove something into the free bits.
  */
-#define ARM_TTE_TABLE_MASK                      (0x0000ffffffffc000ULL) /* mask for extracting pointer to next table (works at any level) */
+#define ARM_TTE_TABLE_MASK          (0x0000ffffffffc000ULL)        /* mask for extracting pointer to next table (works at any level) */
 #else
-#define ARM_TTE_TABLE_MASK                      (0x0000fffffffff000ULL) /* mask for extracting pointer to next table (works at any level) */
+#define ARM_TTE_TABLE_MASK          (0x0000fffffffff000ULL)        /* mask for extracting pointer to next table (works at any level) */
 #endif
 
-#define ARM_TTE_TABLE_APSHIFT           61
-#define ARM_TTE_TABLE_AP(x)                     ((x)<<TTE_BLOCK_APSHIFT) /* access protection */
+#define ARM_TTE_TABLE_APSHIFT       61
+#define ARM_TTE_TABLE_AP(x)         ((x)<<ARM_TTE_TABLE_APSHIFT)   /* access protection */
 
-#define ARM_TTE_TABLE_NS                        0x8000000000000020ULL   /* value for a secure mapping */
-#define ARM_TTE_TABLE_NS_MASK           0x8000000000000020ULL   /* notSecure mapping mask */
+#define ARM_TTE_TABLE_NS            0x8000000000000020ULL          /* value for a non-secure (NS) mapping */
+#define ARM_TTE_TABLE_NS_MASK       0x8000000000000020ULL          /* notSecure mapping mask */
 
-#define ARM_TTE_TABLE_XN                        0x1000000000000000ULL   /* value for no execute */
-#define ARM_TTE_TABLE_XNMASK            0x1000000000000000ULL   /* no execute mask */
+#define ARM_TTE_TABLE_XN            0x1000000000000000ULL          /* value for no execute */
+#define ARM_TTE_TABLE_XNMASK        0x1000000000000000ULL          /* no execute mask */
 
-#define ARM_TTE_TABLE_PXN                       0x0800000000000000ULL   /* value for privilege no execute bit */
-#define ARM_TTE_TABLE_PXNMASK           0x0800000000000000ULL   /* privilege execute mask */
+#define ARM_TTE_TABLE_PXN           0x0800000000000000ULL          /* value for privilege no execute bit */
+#define ARM_TTE_TABLE_PXNMASK       0x0800000000000000ULL          /* privilege no-execute mask */
 
 #if __ARM_KERNEL_PROTECT__
-#define ARM_TTE_BOOT_BLOCK                      (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID |  ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY)        \
-                                                                        | ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | ARM_TTE_BLOCK_AF \
-                                                                        | ARM_TTE_BLOCK_NG)
+#define ARM_TTE_BOOT_BLOCK \
+       (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID | ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY) | \
+        ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | ARM_TTE_BLOCK_AF | ARM_TTE_BLOCK_NG)
 #else /* __ARM_KERNEL_PROTECT__ */
-#define ARM_TTE_BOOT_BLOCK                      (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID |  ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY)        \
-                                                                        | ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | ARM_TTE_BLOCK_AF)
+#define ARM_TTE_BOOT_BLOCK \
+       (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID | ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY) | \
+        ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | ARM_TTE_BLOCK_AF)
 #endif /* __ARM_KERNEL_PROTECT__ */
 
-#define ARM_TTE_BOOT_TABLE                      (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID )
+#define ARM_TTE_BOOT_TABLE (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID)
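
For illustration, these bits compose with a physical address as follows; make_boot_table_tte is a hypothetical helper (not part of this change), assuming table_pa is the page-aligned physical address of the next-level table:

    static inline uint64_t
    make_boot_table_tte(uint64_t table_pa)
    {
            /* keep only the output-address bits of the next-level table,
             * then mark the entry as a valid table descriptor */
            return (table_pa & ARM_TTE_TABLE_MASK) | ARM_TTE_BOOT_TABLE;
    }
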
 /*
  *  L3 Translation table
  *
  */
 
 #ifdef __ARM_16K_PG__
-#define ARM_PTE_SIZE                            0x0000000000004000ULL           /* size of area covered by a tte */
-#define ARM_PTE_OFFMASK                         0x0000000000003fffULL           /* offset within pte area */
-#define ARM_PTE_SHIFT                           14                                                      /* page descriptor shift */
-#define ARM_PTE_MASK                            0x0000ffffffffc000ULL           /* mask for output address in PTE */
+#define ARM_PTE_SIZE    0x0000000000004000ULL /* size of area covered by a tte */
+#define ARM_PTE_OFFMASK 0x0000000000003fffULL /* offset within pte area */
+#define ARM_PTE_SHIFT   14                    /* page descriptor shift */
+#define ARM_PTE_MASK    0x0000ffffffffc000ULL /* mask for output address in PTE */
 #else
-#define ARM_PTE_SIZE                            0x0000000000001000ULL           /* size of area covered by a tte */
-#define ARM_PTE_OFFMASK                         0x0000000000000fffULL           /* offset within pte area */
-#define ARM_PTE_SHIFT                           12                                                      /* page descriptor shift */
-#define ARM_PTE_MASK                            0x0000fffffffff000ULL           /* mask for output address in PTE */
+#define ARM_PTE_SIZE    0x0000000000001000ULL /* size of area covered by a tte */
+#define ARM_PTE_OFFMASK 0x0000000000000fffULL /* offset within pte area */
+#define ARM_PTE_SHIFT   12                    /* page descriptor shift */
+#define ARM_PTE_MASK    0x0000fffffffff000ULL /* mask for output address in PTE */
 #endif
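
As a sketch of how these masks combine (hypothetical helper, valid for either granule size since the macros track it): the physical page base comes from the PTE and the byte offset from the virtual address.

    static inline uint64_t
    l3_pte_to_pa(uint64_t pte, uint64_t va)
    {
            /* physical page base from the PTE, byte offset from the VA */
            return (pte & ARM_PTE_MASK) | (va & ARM_PTE_OFFMASK);
    }
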
 
 /*
  *
  * The following page table entry types are possible:
  *
- *     fault page entry
- *     63                            2  0
- *     +------------------------------+--+
- *     |    ignored                   |00|
- *     +------------------------------+--+
+ * fault page entry
+ *  63                            2  0
+ * +------------------------------+--+
+ * |    ignored                   |00|
+ * +------------------------------+--+
  *
  *
  *  63 59 58  55 54  53   52 51  48 47                  12 11 10 9  8 7  6  5 4     2 1 0
  * +-----+------+--+---+----+------+----------------------+--+--+----+----+--+-------+-+-+
  * | ign |sw use|XN|PXN|HINT| zero | OutputAddress[47:12] |nG|AF| SH | AP |NS|AttrIdx|1|V|
  * +-----+------+--+---+----+------+----------------------+--+--+----+----+--+-------+-+-+
  *
  * where:
- *     'nG'            notGlobal bit
- *     'SH'            Shareability field
- *     'AP'            access protection
- *     'XN'            eXecute Never bit
- *     'PXN'           Privilege eXecute Never bit
- *     'NS'            Non-Secure bit
- *     'HINT'          16 entry continuguous output hint
- *     'AttrIdx'       Memory Attribute Index
+ *   nG:      notGlobal bit
+ *   SH:      Shareability field
+ *   AP:      access protection
+ *   XN:      eXecute Never bit
+ *   PXN:     Privilege eXecute Never bit
+ *   NS:      Non-Secure bit
+ *   HINT:    16-entry contiguous output hint
+ *   AttrIdx: Memory Attribute Index
  */
 
-#define PTE_SHIFT                                       3                                               /* shift width of a pte (sizeof(pte) == (1 << PTE_SHIFT)) */
+#define PTE_SHIFT               3                     /* shift width of a pte (sizeof(pte) == (1 << PTE_SHIFT)) */
 #ifdef __ARM_16K_PG__
-#define PTE_PGENTRIES                           (16384 >> PTE_SHIFT)    /* number of ptes per page */
+#define PTE_PGENTRIES           (16384 >> PTE_SHIFT)  /* number of ptes per page */
 #else
-#define PTE_PGENTRIES                           (4096 >> PTE_SHIFT)             /* number of ptes per page */
+#define PTE_PGENTRIES           (4096 >> PTE_SHIFT)   /* number of ptes per page */
 #endif
 
-#define ARM_PTE_EMPTY                           0x0000000000000000ULL   /* unasigned - invalid entry */
+#define ARM_PTE_EMPTY           0x0000000000000000ULL /* unassigned - invalid entry */
 
 /* markers for (invalid) PTE for a page sent to compressor */
-#define ARM_PTE_COMPRESSED              0x8000000000000000ULL   /* compressed... */
-#define ARM_PTE_COMPRESSED_ALT          0x4000000000000000ULL   /* ... and was "alt_acct" */
-#define ARM_PTE_COMPRESSED_MASK         0xC000000000000000ULL
-#define ARM_PTE_IS_COMPRESSED(x)                                        \
-       ((((x) & 0x3) == 0) &&          /* PTE is not valid... */       \
-        ((x) & ARM_PTE_COMPRESSED) &&  /* ...has "compressed" marker" */ \
-        ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
-         (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
-                &(x), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
-
-#define ARM_PTE_TYPE                            0x0000000000000003ULL   /* valid L3 entry: includes bit #1 (counterintuitively) */
-#define ARM_PTE_TYPE_VALID                      0x0000000000000003ULL   /* valid L3 entry: includes bit #1 (counterintuitively) */
-#define ARM_PTE_TYPE_FAULT                      0x0000000000000000ULL   /* invalid L3 entry */
-#define ARM_PTE_TYPE_MASK                       0x0000000000000002ULL   /* mask to get pte type */
+#define ARM_PTE_COMPRESSED      0x8000000000000000ULL /* compressed... */
+#define ARM_PTE_COMPRESSED_ALT  0x4000000000000000ULL /* ... and was "alt_acct" */
+#define ARM_PTE_COMPRESSED_MASK 0xC000000000000000ULL
+
+#define ARM_PTE_IS_COMPRESSED(x, p) \
+       ((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
+        ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker */       \
+        ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
+        (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
+               (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
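
The macro now threads the PTE's own address through as a second argument so the panic() arm can report where the corrupted entry lives. A hypothetical caller (sketch only, assuming 64-bit PTEs):

    static inline int
    pte_is_compressed_alt(uint64_t *ptep)
    {
            uint64_t pte = *ptep;
            /* (p) is only evaluated if the corruption panic fires */
            return ARM_PTE_IS_COMPRESSED(pte, ptep) &&
                   ((pte & ARM_PTE_COMPRESSED_ALT) != 0);
    }
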
+
+#define ARM_PTE_TYPE               0x0000000000000003ULL /* valid L3 entry: includes bit #1 (counterintuitively) */
+#define ARM_PTE_TYPE_VALID         0x0000000000000003ULL /* valid L3 entry: includes bit #1 (counterintuitively) */
+#define ARM_PTE_TYPE_FAULT         0x0000000000000000ULL /* invalid L3 entry */
+#define ARM_PTE_TYPE_MASK          0x0000000000000002ULL /* mask to get pte type */
 
 #ifdef __ARM_16K_PG__
 /* TODO: What does the shift mean here? */
-#define ARM_PTE_PAGE_MASK                       0x0000FFFFFFFFC000ULL   /* mask for 16KB page */
+#define ARM_PTE_PAGE_MASK          0x0000FFFFFFFFC000ULL /* mask for 16KB page */
 #else
-#define ARM_PTE_PAGE_MASK                       0x0000FFFFFFFFF000ULL   /* mask for  4KB page */
-#define ARM_PTE_PAGE_SHIFT                      12                                              /* page shift for 4KB page */
+#define ARM_PTE_PAGE_MASK          0x0000FFFFFFFFF000ULL /* mask for  4KB page */
+#define ARM_PTE_PAGE_SHIFT         12                    /* page shift for 4KB page */
 #endif
 
-#define ARM_PTE_AP(x)                           ((x) << 6)                              /* access protections */
-#define ARM_PTE_APMASK                          (0x3ULL << 6)                   /* mask access protections */
-#define ARM_PTE_EXTRACT_AP(x)           (((x) >> 6) & 0x3ULL)   /* extract access protections from PTE */
+#define ARM_PTE_AP(x)              ((x) << 6)            /* access protections */
+#define ARM_PTE_APMASK             (0x3ULL << 6)         /* mask access protections */
+#define ARM_PTE_EXTRACT_AP(x)      (((x) >> 6) & 0x3ULL) /* extract access protections from PTE */
 
-#define ARM_PTE_ATTRINDX(x)                     ((x) << 2)                              /* memory attributes index */
-#define ARM_PTE_ATTRINDXMASK            (0x7ULL << 2)                   /* mask memory attributes index */
+#define ARM_PTE_ATTRINDX(x)        ((x) << 2)            /* memory attributes index */
+#define ARM_PTE_ATTRINDXMASK       (0x7ULL << 2)         /* mask memory attributes index */
 
-#define ARM_PTE_SH(x)                           ((x) << 8)                              /* access shared */
-#define ARM_PTE_SHMASK                          (0x3ULL << 8)                   /* mask access shared */
+#define ARM_PTE_SH(x)              ((x) << 8)            /* access shared */
+#define ARM_PTE_SHMASK             (0x3ULL << 8)         /* mask access shared */
 
-#define ARM_PTE_AF                                      0x0000000000000400ULL   /* value for access */
-#define ARM_PTE_AFMASK                          0x0000000000000400ULL   /* access mask */
+#define ARM_PTE_AF                 0x0000000000000400ULL /* value for access */
+#define ARM_PTE_AFMASK             0x0000000000000400ULL /* access mask */
 
-#define ARM_PTE_NG                                      0x0000000000000800ULL   /* value for a global mapping */
-#define ARM_PTE_NG_MASK                         0x0000000000000800ULL   /* notGlobal mapping mask */
+#define ARM_PTE_NG                 0x0000000000000800ULL /* value for a global mapping */
+#define ARM_PTE_NG_MASK            0x0000000000000800ULL /* notGlobal mapping mask */
 
-#define ARM_PTE_NS                                      0x0000000000000020ULL   /* value for a secure mapping */
-#define ARM_PTE_NS_MASK                         0x0000000000000020ULL   /* notSecure mapping mask */
+#define ARM_PTE_NS                 0x0000000000000020ULL /* value for a secure mapping */
+#define ARM_PTE_NS_MASK            0x0000000000000020ULL /* notSecure mapping mask */
 
-#define ARM_PTE_HINT                            0x0010000000000000ULL   /* value for contiguous entries hint */
-#define ARM_PTE_HINT_MASK                       0x0010000000000000ULL   /* mask for contiguous entries hint */
+#define ARM_PTE_HINT               0x0010000000000000ULL /* value for contiguous entries hint */
+#define ARM_PTE_HINT_MASK          0x0010000000000000ULL /* mask for contiguous entries hint */
 
 #if __ARM_16K_PG__
-#define ARM_PTE_HINT_ENTRIES            128ULL                                  /* number of entries the hint covers */
-#define ARM_PTE_HINT_ENTRIES_SHIFT      7ULL                                    /* shift to construct the number of entries */
-#define ARM_PTE_HINT_ADDR_MASK          0x0000FFFFFFE00000ULL                   /* mask to extract the starting hint address */
-#define ARM_PTE_HINT_ADDR_SHIFT         21                                      /* shift for the hint address */
-#define ARM_KVA_HINT_ADDR_MASK          0xFFFFFFFFFFE00000ULL                   /* mask to extract the starting hint address */
+#define ARM_PTE_HINT_ENTRIES       128ULL                /* number of entries the hint covers */
+#define ARM_PTE_HINT_ENTRIES_SHIFT 7ULL                  /* shift to construct the number of entries */
+#define ARM_PTE_HINT_ADDR_MASK     0x0000FFFFFFE00000ULL /* mask to extract the starting hint address */
+#define ARM_PTE_HINT_ADDR_SHIFT    21                    /* shift for the hint address */
+#define ARM_KVA_HINT_ADDR_MASK     0xFFFFFFFFFFE00000ULL /* mask to extract the starting hint address */
 #else
-#define ARM_PTE_HINT_ENTRIES            16ULL                                   /* number of entries the hint covers */
-#define ARM_PTE_HINT_ENTRIES_SHIFT      4ULL                                    /* shift to construct the number of entries */
-#define ARM_PTE_HINT_ADDR_MASK          0x0000FFFFFFFF0000ULL                   /* mask to extract the starting hint address */
-#define ARM_PTE_HINT_ADDR_SHIFT         16                                      /* shift for the hint address */
-#define ARM_KVA_HINT_ADDR_MASK          0xFFFFFFFFFFFF0000ULL                   /* mask to extract the starting hint address */
+#define ARM_PTE_HINT_ENTRIES       16ULL                 /* number of entries the hint covers */
+#define ARM_PTE_HINT_ENTRIES_SHIFT 4ULL                  /* shift to construct the number of entries */
+#define ARM_PTE_HINT_ADDR_MASK     0x0000FFFFFFFF0000ULL /* mask to extract the starting hint address */
+#define ARM_PTE_HINT_ADDR_SHIFT    16                    /* shift for the hint address */
+#define ARM_KVA_HINT_ADDR_MASK     0xFFFFFFFFFFFF0000ULL /* mask to extract the starting hint address */
 #endif
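
As a sketch of what the address masks are for (hypothetical helper): a hinted mapping covers ARM_PTE_HINT_ENTRIES naturally-aligned pages, so the base of the span falls out of a single mask:

    static inline uint64_t
    hint_span_base(uint64_t kva)
    {
            /* contiguous-hint spans are naturally aligned; mask down to the base */
            return kva & ARM_KVA_HINT_ADDR_MASK;
    }
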
 
-#define ARM_PTE_PNX                                     0x0020000000000000ULL   /* value for privilege no execute bit */
-#define ARM_PTE_PNXMASK                         0x0020000000000000ULL   /* privilege no execute mask */
+#define ARM_PTE_PNX                0x0020000000000000ULL /* value for privilege no execute bit */
+#define ARM_PTE_PNXMASK            0x0020000000000000ULL /* privilege no execute mask */
 
-#define ARM_PTE_NX                                      0x0040000000000000ULL   /* value for no execute bit */
-#define ARM_PTE_NXMASK                          0x0040000000000000ULL   /* no execute mask */
+#define ARM_PTE_NX                 0x0040000000000000ULL /* value for no execute bit */
+#define ARM_PTE_NXMASK             0x0040000000000000ULL /* no execute mask */
 
-#define ARM_PTE_WIRED                           0x0080000000000000ULL   /* value for software wired bit */
-#define ARM_PTE_WIRED_MASK                      0x0080000000000000ULL   /* software wired mask */
+#define ARM_PTE_WIRED              0x0400000000000000ULL /* value for software wired bit */
+#define ARM_PTE_WIRED_MASK         0x0400000000000000ULL /* software wired mask */
 
-#define ARM_PTE_WRITEABLE                       0x0100000000000000ULL   /* value for software writeable bit */
-#define ARM_PTE_WRITEABLE_MASK          0x0100000000000000ULL   /* software writeable mask */
+#define ARM_PTE_WRITEABLE          0x0800000000000000ULL /* value for software writeable bit */
+#define ARM_PTE_WRITEABLE_MASK     0x0800000000000000ULL /* software writeable mask */
 
 #if CONFIG_PGTRACE
-#define ARM_PTE_PGTRACE             0x0200000000000000ULL   /* value for software trace bit */
-#define ARM_PTE_PGTRACE_MASK        0x0200000000000000ULL   /* software trace mask */
+#define ARM_PTE_PGTRACE            0x0200000000000000ULL /* value for software trace bit */
+#define ARM_PTE_PGTRACE_MASK       0x0200000000000000ULL /* software trace mask */
 #endif
 
-#define ARM_PTE_BOOT_PAGE_BASE                  (ARM_PTE_TYPE_VALID |  ARM_PTE_SH(SH_OUTER_MEMORY) \
-                                                                        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | ARM_PTE_AF)
+#define ARM_PTE_BOOT_PAGE_BASE \
+       (ARM_PTE_TYPE_VALID | ARM_PTE_SH(SH_OUTER_MEMORY) |       \
+        ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | ARM_PTE_AF)
 
 #if __ARM_KERNEL_PROTECT__
-#define ARM_PTE_BOOT_PAGE                       (ARM_PTE_BOOT_PAGE_BASE | ARM_PTE_NG)
+#define ARM_PTE_BOOT_PAGE (ARM_PTE_BOOT_PAGE_BASE | ARM_PTE_NG)
 #else /* __ARM_KERNEL_PROTECT__ */
-#define ARM_PTE_BOOT_PAGE                       (ARM_PTE_BOOT_PAGE_BASE)
+#define ARM_PTE_BOOT_PAGE (ARM_PTE_BOOT_PAGE_BASE)
 #endif /* __ARM_KERNEL_PROTECT__ */
 
 /*
  * TLBI appears to only deal in 4KB page addresses, so give
  * it an explicit shift of 12.
  */
+#define TLBI_ADDR_SHIFT (0)
 #define TLBI_ADDR_SIZE  (44)
 #define TLBI_ADDR_MASK  ((1ULL << TLBI_ADDR_SIZE) - 1)
-#define TLBI_ADDR_SHIFT (12)
 #define TLBI_ASID_SHIFT (48)
 #define TLBI_ASID_SIZE  (16)
-#define TLBI_ASID_MASK  (((1ULL << TLBI_ASID_SIZE) - 1) << TLBI_ASID_SHIFT)
+#define TLBI_ASID_MASK  (((1ULL << TLBI_ASID_SIZE) - 1))
+
+#define RTLBI_ADDR_SIZE (37)
+#define RTLBI_ADDR_MASK ((1ULL << RTLBI_ADDR_SIZE) - 1)
+#define RTLBI_ADDR_SHIFT ARM_TT_L3_SHIFT
+#define RTLBI_TG ((uint64_t)(((ARM_TT_L3_SHIFT - 12) >> 1) + 1) << 46)
+#define RTLBI_SCALE_SHIFT (44)
+#define RTLBI_NUM_SHIFT (39)
 
 /*
  * Exception Syndrome Register
  * |  EC  |IL|       ISS        |
  * +------+--+------------------+
  *
- *     EC - Exception Class
- *     IL - Instruction Length
- *  ISS- Instruction Specific Syndrome
+ * EC  - Exception Class
+ * IL  - Instruction Length
+ * ISS - Instruction Specific Syndrome
  *
  * Note: The ISS can have many forms. These are defined separately below.
  */
 
-#define ESR_EC_SHIFT                            26
-#define ESR_EC_MASK                                     (0x3F << ESR_EC_SHIFT)
-#define ESR_EC(x)                                       ((x & ESR_EC_MASK) >> ESR_EC_SHIFT)
+#define ESR_EC_SHIFT           26
+#define ESR_EC_MASK            (0x3FULL << ESR_EC_SHIFT)
+#define ESR_EC(x)              ((x & ESR_EC_MASK) >> ESR_EC_SHIFT)
 
-#define ESR_IL_SHIFT                            25
-#define ESR_IL                                          (1 << ESR_IL_SHIFT)
+#define ESR_IL_SHIFT           25
+#define ESR_IL                 (1 << ESR_IL_SHIFT)
 
-#define ESR_INSTR_IS_2BYTES(x)          (!(x & ESR_IL))
+#define ESR_INSTR_IS_2BYTES(x) (!(x & ESR_IL))
 
-#define ESR_ISS_MASK                            0x01FFFFFF
-#define ESR_ISS(x)                                      (x & ESR_ISS_MASK)
+#define ESR_ISS_MASK           0x01FFFFFF
+#define ESR_ISS(x)             (x & ESR_ISS_MASK)
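
Decoding follows mechanically from the layout above; the helpers below are hypothetical sketches, not part of this change:

    static inline uint32_t
    esr_get_class(uint32_t esr)
    {
            return (uint32_t)ESR_EC(esr);  /* bits 31:26: exception class */
    }

    static inline uint32_t
    esr_get_iss(uint32_t esr)
    {
            return ESR_ISS(esr);           /* bits 24:0: class-specific syndrome */
    }
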
 
 #ifdef __ASSEMBLER__
 /* Define only the classes we need to test in the exception vectors. */
-#define ESR_EC_IABORT_EL1                       0x21
-#define ESR_EC_DABORT_EL1                       0x25
-#define ESR_EC_SP_ALIGN                         0x26
+#define ESR_EC_IABORT_EL1      0x21
+#define ESR_EC_DABORT_EL1      0x25
+#define ESR_EC_SP_ALIGN        0x26
 #else
 typedef enum {
-       ESR_EC_UNCATEGORIZED                    = 0x00,
-       ESR_EC_WFI_WFE                                  = 0x01,
-       ESR_EC_MCR_MRC_CP15_TRAP                = 0x03,
-       ESR_EC_MCRR_MRRC_CP15_TRAP              = 0x04,
-       ESR_EC_MCR_MRC_CP14_TRAP                = 0x05,
-       ESR_EC_LDC_STC_CP14_TRAP                = 0x06,
-       ESR_EC_TRAP_SIMD_FP                             = 0x07,
-       ESR_EC_MCRR_MRRC_CP14_TRAP              = 0x0c,
-       ESR_EC_ILLEGAL_INSTR_SET                = 0x0e,
-       ESR_EC_SVC_32                                   = 0x11,
-       ESR_EC_SVC_64                                   = 0x15,
-       ESR_EC_MSR_TRAP                                 = 0x18,
-       ESR_EC_IABORT_EL0                               = 0x20,
-       ESR_EC_IABORT_EL1                               = 0x21,
-       ESR_EC_PC_ALIGN                                 = 0x22,
-       ESR_EC_DABORT_EL0                               = 0x24,
-       ESR_EC_DABORT_EL1                               = 0x25,
-       ESR_EC_SP_ALIGN                                 = 0x26,
-       ESR_EC_FLOATING_POINT_32                = 0x28,
-       ESR_EC_FLOATING_POINT_64                = 0x2C,
-       ESR_EC_BKPT_REG_MATCH_EL0               = 0x30, // Breakpoint Debug event taken to the EL from a lower EL.
-       ESR_EC_BKPT_REG_MATCH_EL1               = 0x31, // Breakpoint Debug event taken to the EL from the EL.
-       ESR_EC_SW_STEP_DEBUG_EL0                = 0x32, // Software Step Debug event taken to the EL from a lower EL.
-       ESR_EC_SW_STEP_DEBUG_EL1                = 0x33, // Software Step Debug event taken to the EL from the EL.
-       ESR_EC_WATCHPT_MATCH_EL0                = 0x34, // Watchpoint Debug event taken to the EL from a lower EL.
-       ESR_EC_WATCHPT_MATCH_EL1                = 0x35, // Watchpoint Debug event taken to the EL from the EL.
-       ESR_EC_BKPT_AARCH32                             = 0x38,
-       ESR_EC_BRK_AARCH64                              = 0x3C
+       ESR_EC_UNCATEGORIZED       = 0x00,
+       ESR_EC_WFI_WFE             = 0x01,
+       ESR_EC_MCR_MRC_CP15_TRAP   = 0x03,
+       ESR_EC_MCRR_MRRC_CP15_TRAP = 0x04,
+       ESR_EC_MCR_MRC_CP14_TRAP   = 0x05,
+       ESR_EC_LDC_STC_CP14_TRAP   = 0x06,
+       ESR_EC_TRAP_SIMD_FP        = 0x07,
+       ESR_EC_MCRR_MRRC_CP14_TRAP = 0x0c,
+       ESR_EC_ILLEGAL_INSTR_SET   = 0x0e,
+       ESR_EC_SVC_32              = 0x11,
+       ESR_EC_SVC_64              = 0x15,
+       ESR_EC_MSR_TRAP            = 0x18,
+       ESR_EC_IABORT_EL0          = 0x20,
+       ESR_EC_IABORT_EL1          = 0x21,
+       ESR_EC_PC_ALIGN            = 0x22,
+       ESR_EC_DABORT_EL0          = 0x24,
+       ESR_EC_DABORT_EL1          = 0x25,
+       ESR_EC_SP_ALIGN            = 0x26,
+       ESR_EC_FLOATING_POINT_32   = 0x28,
+       ESR_EC_FLOATING_POINT_64   = 0x2C,
+       ESR_EC_BKPT_REG_MATCH_EL0  = 0x30, // Breakpoint Debug event taken to the EL from a lower EL.
+       ESR_EC_BKPT_REG_MATCH_EL1  = 0x31, // Breakpoint Debug event taken to the EL from the EL.
+       ESR_EC_SW_STEP_DEBUG_EL0   = 0x32, // Software Step Debug event taken to the EL from a lower EL.
+       ESR_EC_SW_STEP_DEBUG_EL1   = 0x33, // Software Step Debug event taken to the EL from the EL.
+       ESR_EC_WATCHPT_MATCH_EL0   = 0x34, // Watchpoint Debug event taken to the EL from a lower EL.
+       ESR_EC_WATCHPT_MATCH_EL1   = 0x35, // Watchpoint Debug event taken to the EL from the EL.
+       ESR_EC_BKPT_AARCH32        = 0x38,
+       ESR_EC_BRK_AARCH64         = 0x3C,
 } esr_exception_class_t;
 
 typedef enum {
-       FSC_TRANSLATION_FAULT_L0                = 0x04,
-       FSC_TRANSLATION_FAULT_L1                = 0x05,
-       FSC_TRANSLATION_FAULT_L2                = 0x06,
-       FSC_TRANSLATION_FAULT_L3                = 0x07,
-       FSC_ACCESS_FLAG_FAULT_L1                = 0x09,
-       FSC_ACCESS_FLAG_FAULT_L2                = 0x0A,
-       FSC_ACCESS_FLAG_FAULT_L3                = 0x0B,
-       FSC_PERMISSION_FAULT_L1                 = 0x0D,
-       FSC_PERMISSION_FAULT_L2                 = 0x0E,
-       FSC_PERMISSION_FAULT_L3                 = 0x0F,
-       FSC_SYNC_EXT_ABORT                              = 0x10,
-       FSC_ASYNC_EXT_ABORT                             = 0x11,
-       FSC_SYNC_EXT_ABORT_TT_L1                = 0x15,
-       FSC_SYNC_EXT_ABORT_TT_L2                = 0x16,
-       FSC_SYNC_EXT_ABORT_TT_L3                = 0x17,
-       FSC_SYNC_PARITY                                 = 0x18,
-       FSC_ASYNC_PARITY                                = 0x19,
-       FSC_SYNC_PARITY_TT_L1                   = 0x1D,
-       FSC_SYNC_PARITY_TT_L2                   = 0x1E,
-       FSC_SYNC_PARITY_TT_L3                   = 0x1F,
-       FSC_ALIGNMENT_FAULT                             = 0x21,
-       FSC_DEBUG_FAULT                                 = 0x22
+       FSC_TRANSLATION_FAULT_L0   = 0x04,
+       FSC_TRANSLATION_FAULT_L1   = 0x05,
+       FSC_TRANSLATION_FAULT_L2   = 0x06,
+       FSC_TRANSLATION_FAULT_L3   = 0x07,
+       FSC_ACCESS_FLAG_FAULT_L1   = 0x09,
+       FSC_ACCESS_FLAG_FAULT_L2   = 0x0A,
+       FSC_ACCESS_FLAG_FAULT_L3   = 0x0B,
+       FSC_PERMISSION_FAULT_L1    = 0x0D,
+       FSC_PERMISSION_FAULT_L2    = 0x0E,
+       FSC_PERMISSION_FAULT_L3    = 0x0F,
+       FSC_SYNC_EXT_ABORT         = 0x10,
+       FSC_ASYNC_EXT_ABORT        = 0x11,
+       FSC_SYNC_EXT_ABORT_TT_L1   = 0x15,
+       FSC_SYNC_EXT_ABORT_TT_L2   = 0x16,
+       FSC_SYNC_EXT_ABORT_TT_L3   = 0x17,
+       FSC_SYNC_PARITY            = 0x18,
+       FSC_ASYNC_PARITY           = 0x19,
+       FSC_SYNC_PARITY_TT_L1      = 0x1D,
+       FSC_SYNC_PARITY_TT_L2      = 0x1E,
+       FSC_SYNC_PARITY_TT_L3      = 0x1F,
+       FSC_ALIGNMENT_FAULT        = 0x21,
+       FSC_DEBUG_FAULT            = 0x22
 } fault_status_t;
 #endif /* __ASSEMBLER__ */
 
@@ -1295,19 +1388,19 @@ typedef enum {
  * +---+-----------------+--+------+
  *
  * where:
- *     ISV             Instruction syndrome valid
- *     EX              Exclusive access
- *     IFSC    Instruction Fault Status Code
+ *   ISV:  Instruction syndrome valid
+ *   EX:   Exclusive access
+ *   IFSC: Instruction Fault Status Code
  */
 
-#define ISS_SSDE_ISV_SHIFT                      24
-#define ISS_SSDE_ISV                            (0x1 << ISS_SSDE_ISV_SHIFT)
+#define ISS_SSDE_ISV_SHIFT 24
+#define ISS_SSDE_ISV       (0x1 << ISS_SSDE_ISV_SHIFT)
 
-#define ISS_SSDE_EX_SHIFT                       6
-#define ISS_SSDE_EX                                     (0x1 << ISS_SSDE_EX_SHIFT)
+#define ISS_SSDE_EX_SHIFT  6
+#define ISS_SSDE_EX        (0x1 << ISS_SSDE_EX_SHIFT)
 
-#define ISS_SSDE_FSC_MASK                       0x3F
-#define ISS_SSDE_FSC(x)                         (x & ISS_SSDE_FSC_MASK)
+#define ISS_SSDE_FSC_MASK  0x3F
+#define ISS_SSDE_FSC(x)    (x & ISS_SSDE_FSC_MASK)
 
 /*
  * Instruction Abort ISS (EL1)
@@ -1317,15 +1410,15 @@ typedef enum {
  * +---------------+--+---+------+
  *
  * where:
- *     EA              External Abort type
- *     IFSC    Instruction Fault Status Code
+ *   EA:   External Abort type
+ *   IFSC: Instruction Fault Status Code
  */
 
-#define ISS_IA_EA_SHIFT                         9
-#define ISS_IA_EA                                       (0x1 << ISS_IA_EA_SHIFT)
+#define ISS_IA_EA_SHIFT 9
+#define ISS_IA_EA       (0x1 << ISS_IA_EA_SHIFT)
 
-#define ISS_IA_FSC_MASK                         0x3F
-#define ISS_IA_FSC(x)                           (x & ISS_IA_FSC_MASK)
+#define ISS_IA_FSC_MASK 0x3F
+#define ISS_IA_FSC(x)   (x & ISS_IA_FSC_MASK)
 
 
 /*
@@ -1337,59 +1430,98 @@ typedef enum {
  * +---------------+--+--+-+---+----+
  *
  * where:
- *     EA              External Abort type
- *     CM              Cache Maintenance operation
- *     WnR             Write not Read
- *     DFSC    Data Fault Status Code
+ *   EA:   External Abort type
+ *   CM:   Cache Maintenance operation
+ *   WnR:  Write not Read
+ *   DFSC: Data Fault Status Code
  */
-#define ISS_DA_EA_SHIFT                         9
-#define ISS_DA_EA                                       (0x1 << ISS_DA_EA_SHIFT)
+#define ISS_DA_EA_SHIFT  9
+#define ISS_DA_EA        (0x1 << ISS_DA_EA_SHIFT)
 
-#define ISS_DA_CM_SHIFT                         8
-#define ISS_DA_CM                                       (0x1 << ISS_DA_CM_SHIFT)
+#define ISS_DA_CM_SHIFT  8
+#define ISS_DA_CM        (0x1 << ISS_DA_CM_SHIFT)
 
-#define ISS_DA_WNR_SHIFT                        6
-#define ISS_DA_WNR                                      (0x1 << ISS_DA_WNR_SHIFT)
+#define ISS_DA_WNR_SHIFT 6
+#define ISS_DA_WNR       (0x1 << ISS_DA_WNR_SHIFT)
+
+#define ISS_DA_FSC_MASK  0x3F
+#define ISS_DA_FSC(x)    (x & ISS_DA_FSC_MASK)
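
One subtlety in this layout: WnR also reads as 1 for cache-maintenance aborts, so CM has to be consulted before trusting it. A hypothetical sketch:

    static inline int
    iss_da_is_write(uint32_t iss)
    {
            /* WnR reads as a write for CM aborts too; require CM clear */
            return (iss & ISS_DA_WNR) && !(iss & ISS_DA_CM);
    }
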
+
+/*
+ * Floating Point Exception ISS (EL1)
+ *
+ * 24  23 22            8  7      4   3   2   1   0
+ * +-+---+---------------+---+--+---+---+---+---+---+
+ * |0|TFV|000000000000000|IDF|00|IXF|UFF|OFF|DZF|IOF|
+ * +-+---+---------------+---+--+---+---+---+---+---+
+ *
+ * where:
+ *   TFV: Trapped Fault Valid
+ *   IDF: Input Denormal Exception
+ *   IXF: Inexact Exception
+ *   UFF: Underflow Exception
+ *   OFF: Overflow Exception
+ *   DZF: Divide by Zero Exception
+ *   IOF: Invalid Operation Exception
+ */
+#define ISS_FP_TFV_SHIFT 23
+#define ISS_FP_TFV       (0x1 << ISS_FP_TFV_SHIFT)
+
+#define ISS_FP_IDF_SHIFT 7
+#define ISS_FP_IDF       (0x1 << ISS_FP_IDF_SHIFT)
+
+#define ISS_FP_IXF_SHIFT 4
+#define ISS_FP_IXF       (0x1 << ISS_FP_IXF_SHIFT)
+
+#define ISS_FP_UFF_SHIFT 3
+#define ISS_FP_UFF       (0x1 << ISS_FP_UFF_SHIFT)
+
+#define ISS_FP_OFF_SHIFT 2
+#define ISS_FP_OFF       (0x1 << ISS_FP_OFF_SHIFT)
+
+#define ISS_FP_DZF_SHIFT 1
+#define ISS_FP_DZF       (0x1 << ISS_FP_DZF_SHIFT)
+
+#define ISS_FP_IOF_SHIFT 0
+#define ISS_FP_IOF       (0x1 << ISS_FP_IOF_SHIFT)
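
These bits suggest a straightforward decoder; the sketch below is hypothetical and assumes the individual fault bits are only meaningful when TFV is set, per the layout above:

    static inline const char *
    iss_fp_describe(uint32_t iss)
    {
            if (!(iss & ISS_FP_TFV)) {  /* fault bits below need TFV set */
                    return "unknown FP exception";
            }
            if (iss & ISS_FP_IOF) { return "invalid operation"; }
            if (iss & ISS_FP_DZF) { return "divide by zero"; }
            if (iss & ISS_FP_OFF) { return "overflow"; }
            if (iss & ISS_FP_UFF) { return "underflow"; }
            if (iss & ISS_FP_IXF) { return "inexact"; }
            if (iss & ISS_FP_IDF) { return "input denormal"; }
            return "unknown FP exception";
    }
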
 
-#define ISS_DA_FSC_MASK                         0x3F
-#define ISS_DA_FSC(x)                           (x & ISS_DA_FSC_MASK)
 
 /*
  * Physical Address Register (EL1)
  */
-#define PAR_F_SHIFT                                     0
-#define PAR_F                                           (0x1 << PAR_F_SHIFT)
+#define PAR_F_SHIFT 0
+#define PAR_F       (0x1 << PAR_F_SHIFT)
 
-#define PLATFORM_SYSCALL_TRAP_NO                0x80000000
+#define PLATFORM_SYSCALL_TRAP_NO 0x80000000
 
-#define ARM64_SYSCALL_CODE_REG_NUM                      (16)
+#define ARM64_SYSCALL_CODE_REG_NUM (16)
 
-#define ARM64_CLINE_SHIFT                       6
+#define ARM64_CLINE_SHIFT 6
 
 #if defined(APPLE_ARM64_ARCH_FAMILY)
-#define L2CERRSTS_DATSBEESV     (1ULL << 2)     /* L2C data single bit ECC error */
-#define L2CERRSTS_DATDBEESV     (1ULL << 4)     /* L2C data double bit ECC error */
+#define L2CERRSTS_DATSBEESV (1ULL << 2) /* L2C data single bit ECC error */
+#define L2CERRSTS_DATDBEESV (1ULL << 4) /* L2C data double bit ECC error */
 #endif
 
 /*
  * Timer definitions.
  */
-#define CNTKCTL_EL1_PL0PTEN                             (0x1 << 9)              /* 1: EL0 access to physical timer regs permitted */
-#define CNTKCTL_EL1_PL0VTEN                             (0x1 << 8)              /* 1: EL0 access to virtual timer regs permitted */
-#define CNTKCTL_EL1_EVENTI_MASK                 (0x000000f0)    /* Mask for bits describing which bit to use for triggering event stream */
-#define CNTKCTL_EL1_EVENTI_SHIFT                (0x4)                   /* Shift for same */
-#define CNTKCTL_EL1_EVENTDIR                    (0x1 << 3)              /* 1: one-to-zero transition of specified bit causes event */
-#define CNTKCTL_EL1_EVNTEN                              (0x1 << 2)              /* 1: enable event stream */
-#define CNTKCTL_EL1_PL0VCTEN                    (0x1 << 1)              /* 1: EL0 access to physical timebase + frequency reg enabled */
-#define CNTKCTL_EL1_PL0PCTEN                    (0x1 << 0)              /* 1: EL0 access to virtual timebase + frequency reg enabled */
-
-#define CNTV_CTL_EL0_ISTATUS            (0x1 << 2)              /* (read only): whether interrupt asserted */
-#define CNTV_CTL_EL0_IMASKED            (0x1 << 1)              /* 1: interrupt masked */
-#define CNTV_CTL_EL0_ENABLE                     (0x1 << 0)              /* 1: virtual timer enabled */
-
-#define CNTP_CTL_EL0_ISTATUS            CNTV_CTL_EL0_ISTATUS
-#define CNTP_CTL_EL0_IMASKED            CNTV_CTL_EL0_IMASKED
-#define CNTP_CTL_EL0_ENABLE                     CNTV_CTL_EL0_ENABLE
+#define CNTKCTL_EL1_PL0PTEN      (0x1 << 9)           /* 1: EL0 access to physical timer regs permitted */
+#define CNTKCTL_EL1_PL0VTEN      (0x1 << 8)           /* 1: EL0 access to virtual timer regs permitted */
+#define CNTKCTL_EL1_EVENTI_MASK  (0x000000f0)         /* Mask for bits describing which bit to use for triggering event stream */
+#define CNTKCTL_EL1_EVENTI_SHIFT (0x4)                /* Shift for same */
+#define CNTKCTL_EL1_EVENTDIR     (0x1 << 3)           /* 1: one-to-zero transition of specified bit causes event */
+#define CNTKCTL_EL1_EVNTEN       (0x1 << 2)           /* 1: enable event stream */
+#define CNTKCTL_EL1_PL0VCTEN     (0x1 << 1)           /* 1: EL0 access to virtual timebase + frequency reg enabled */
+#define CNTKCTL_EL1_PL0PCTEN     (0x1 << 0)           /* 1: EL0 access to physical timebase + frequency reg enabled */
+
+#define CNTV_CTL_EL0_ISTATUS     (0x1 << 2)           /* (read only): whether interrupt asserted */
+#define CNTV_CTL_EL0_IMASKED     (0x1 << 1)           /* 1: interrupt masked */
+#define CNTV_CTL_EL0_ENABLE      (0x1 << 0)           /* 1: virtual timer enabled */
+
+#define CNTP_CTL_EL0_ISTATUS     CNTV_CTL_EL0_ISTATUS
+#define CNTP_CTL_EL0_IMASKED     CNTV_CTL_EL0_IMASKED
+#define CNTP_CTL_EL0_ENABLE      CNTV_CTL_EL0_ENABLE
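
For illustration only, a hypothetical sketch of arming the virtual timer with these bits; it assumes clang's __builtin_arm_wsr64() and the architectural CNTV_CVAL_EL0 compare register:

    static inline void
    cntv_arm_absolute(uint64_t deadline)
    {
            /* program an absolute compare value, then enable the timer
             * with its interrupt left unmasked */
            __builtin_arm_wsr64("CNTV_CVAL_EL0", deadline);
            __builtin_arm_wsr64("CNTV_CTL_EL0", CNTV_CTL_EL0_ENABLE);
    }
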
 
 /*
  * At present all other uses of ARM_DBG_* are shared bit-compatibly with the 32-bit definitions.
@@ -1397,28 +1529,36 @@ typedef enum {
  */
 #define ARM_DBG_VR_ADDRESS_MASK64 0xFFFFFFFFFFFFFFFCull /* BVR & WVR */
 
-#define MIDR_EL1_REV_SHIFT                      0
-#define MIDR_EL1_REV_MASK                       (0xf << MIDR_EL1_REV_SHIFT)
-#define MIDR_EL1_PNUM_SHIFT                     4
-#define MIDR_EL1_PNUM_MASK                      (0xfff << MIDR_EL1_PNUM_SHIFT)
-#define MIDR_EL1_ARCH_SHIFT                     16
-#define MIDR_EL1_ARCH_MASK                      (0xf << MIDR_EL1_ARCH_SHIFT)
-#define MIDR_EL1_VAR_SHIFT                      20
-#define MIDR_EL1_VAR_MASK                       (0xf << MIDR_EL1_VAR_SHIFT)
-#define MIDR_EL1_IMP_SHIFT                      24
-#define MIDR_EL1_IMP_MASK                       (0xff << MIDR_EL1_IMP_SHIFT)
+#define MIDR_EL1_REV_SHIFT  0
+#define MIDR_EL1_REV_MASK   (0xf << MIDR_EL1_REV_SHIFT)
+#define MIDR_EL1_PNUM_SHIFT 4
+#define MIDR_EL1_PNUM_MASK  (0xfff << MIDR_EL1_PNUM_SHIFT)
+#define MIDR_EL1_ARCH_SHIFT 16
+#define MIDR_EL1_ARCH_MASK  (0xf << MIDR_EL1_ARCH_SHIFT)
+#define MIDR_EL1_VAR_SHIFT  20
+#define MIDR_EL1_VAR_MASK   (0xf << MIDR_EL1_VAR_SHIFT)
+#define MIDR_EL1_IMP_SHIFT  24
+#define MIDR_EL1_IMP_MASK   (0xff << MIDR_EL1_IMP_SHIFT)
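
Field extraction is mechanical; two hypothetical helpers as a sketch:

    static inline uint32_t
    midr_get_part(uint32_t midr)
    {
            return (midr & MIDR_EL1_PNUM_MASK) >> MIDR_EL1_PNUM_SHIFT;
    }

    static inline uint32_t
    midr_get_variant(uint32_t midr)
    {
            return (midr & MIDR_EL1_VAR_MASK) >> MIDR_EL1_VAR_SHIFT;
    }
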
 
 /*
  * CoreSight debug registers
  */
-#define CORESIGHT_ED    0
-#define CORESIGHT_CTI   1
-#define CORESIGHT_PMU   2
-#define CORESIGHT_UTT   3 /* Not truly a coresight thing, but at a fixed convenient location right after the coresight region */
+#define CORESIGHT_ED  0
+#define CORESIGHT_CTI 1
+#define CORESIGHT_PMU 2
+#define CORESIGHT_UTT 3 /* Not truly a coresight thing, but at a fixed convenient location right after the coresight region */
+
+#define CORESIGHT_OFFSET(x) ((x) * 0x10000)
+#define CORESIGHT_REGIONS   4
+#define CORESIGHT_SIZE      0x1000
+
 
-#define CORESIGHT_OFFSET(x)     ((x) * 0x10000)
-#define CORESIGHT_REGIONS       4
-#define CORESIGHT_SIZE          0x1000
 
 
 /*
@@ -1430,30 +1570,75 @@ typedef enum {
  * +----------+--------+------+------+------+-----+------+
  */
 
-#define ID_AA64ISAR0_EL1_ATOMIC_OFFSET  20
-#define ID_AA64ISAR0_EL1_ATOMIC_MASK    (0xfull << ID_AA64ISAR0_EL1_ATOMIC_OFFSET)
-#define ID_AA64ISAR0_EL1_ATOMIC_8_1     (2ull << ID_AA64ISAR0_EL1_ATOMIC_OFFSET)
+#define ID_AA64ISAR0_EL1_FHM_OFFSET    48
+#define ID_AA64ISAR0_EL1_FHM_MASK      (0xfull << ID_AA64ISAR0_EL1_FHM_OFFSET)
+#define ID_AA64ISAR0_EL1_FHM_8_2       (1ull << ID_AA64ISAR0_EL1_FHM_OFFSET)
+
+#define ID_AA64ISAR0_EL1_ATOMIC_OFFSET 20
+#define ID_AA64ISAR0_EL1_ATOMIC_MASK   (0xfull << ID_AA64ISAR0_EL1_ATOMIC_OFFSET)
+#define ID_AA64ISAR0_EL1_ATOMIC_8_1    (2ull << ID_AA64ISAR0_EL1_ATOMIC_OFFSET)
+
+#define ID_AA64ISAR0_EL1_CRC32_OFFSET  16
+#define ID_AA64ISAR0_EL1_CRC32_MASK    (0xfull << ID_AA64ISAR0_EL1_CRC32_OFFSET)
+#define ID_AA64ISAR0_EL1_CRC32_EN      (1ull << ID_AA64ISAR0_EL1_CRC32_OFFSET)
+
+#define ID_AA64ISAR0_EL1_SHA2_OFFSET   12
+#define ID_AA64ISAR0_EL1_SHA2_MASK     (0xfull << ID_AA64ISAR0_EL1_SHA2_OFFSET)
+#define ID_AA64ISAR0_EL1_SHA2_EN       (1ull << ID_AA64ISAR0_EL1_SHA2_OFFSET)
 
-#define ID_AA64ISAR0_EL1_CRC32_OFFSET   16
-#define ID_AA64ISAR0_EL1_CRC32_MASK     (0xfull << ID_AA64ISAR0_EL1_CRC32_OFFSET)
-#define ID_AA64ISAR0_EL1_CRC32_EN       (1ull << ID_AA64ISAR0_EL1_CRC32_OFFSET)
+#define ID_AA64ISAR0_EL1_SHA1_OFFSET   8
+#define ID_AA64ISAR0_EL1_SHA1_MASK     (0xfull << ID_AA64ISAR0_EL1_SHA1_OFFSET)
+#define ID_AA64ISAR0_EL1_SHA1_EN       (1ull << ID_AA64ISAR0_EL1_SHA1_OFFSET)
 
-#define ID_AA64ISAR0_EL1_SHA2_OFFSET    12
-#define ID_AA64ISAR0_EL1_SHA2_MASK      (0xfull << ID_AA64ISAR0_EL1_SHA2_OFFSET)
-#define ID_AA64ISAR0_EL1_SHA2_EN        (1ull << ID_AA64ISAR0_EL1_SHA2_OFFSET)
+#define ID_AA64ISAR0_EL1_AES_OFFSET    4
+#define ID_AA64ISAR0_EL1_AES_MASK      (0xfull << ID_AA64ISAR0_EL1_AES_OFFSET)
+#define ID_AA64ISAR0_EL1_AES_EN        (1ull << ID_AA64ISAR0_EL1_AES_OFFSET)
+#define ID_AA64ISAR0_EL1_AES_PMULL_EN  (2ull << ID_AA64ISAR0_EL1_AES_OFFSET)
 
-#define ID_AA64ISAR0_EL1_SHA1_OFFSET    8
-#define ID_AA64ISAR0_EL1_SHA1_MASK      (0xfull << ID_AA64ISAR0_EL1_SHA1_OFFSET)
-#define ID_AA64ISAR0_EL1_SHA1_EN        (1ull << ID_AA64ISAR0_EL1_SHA1_OFFSET)
 
-#define ID_AA64ISAR0_EL1_AES_OFFSET     4
-#define ID_AA64ISAR0_EL1_AES_MASK       (0xfull << ID_AA64ISAR0_EL1_AES_OFFSET)
-#define ID_AA64ISAR0_EL1_AES_EN         (1ull << ID_AA64ISAR0_EL1_AES_OFFSET)
-#define ID_AA64ISAR0_EL1_AES_PMULL_EN   (2ull << ID_AA64ISAR0_EL1_AES_OFFSET)
+#if __APCFG_SUPPORTED__
+/*
+ * APCFG_EL1
+ *
+ *  63       2 1 0
+ * +----------+-+-+
+ * | reserved |K|R|
+ * +----------+-+-+
+ *
+ * where:
+ *   R: Reserved
+ *   K: ElXEnKey - Enable ARMv8.3-defined {IA,IB,DA,DB} keys when CPU is
+ *                 operating in EL1 (or higher) and when under Apple-Mode
+ */
 
+#define APCFG_EL1_ELXENKEY_OFFSET      1
+#define APCFG_EL1_ELXENKEY_MASK        (0x1ULL << APCFG_EL1_ELXENKEY_OFFSET)
+#define APCFG_EL1_ELXENKEY             APCFG_EL1_ELXENKEY_MASK
+#endif /* __APCFG_SUPPORTED__ */
+
+#define APSTATE_G_SHIFT  (0)
+#define APSTATE_P_SHIFT  (1)
+#define APSTATE_A_SHIFT  (2)
+
+#ifdef __APSTS_SUPPORTED__
+#define APCTL_EL1_AppleMode  (1ULL << 0)
+#define APCTL_EL1_KernKeyEn  (1ULL << 1)
+#define APCTL_EL1_EnAPKey0   (1ULL << 2)
+#define APCTL_EL1_EnAPKey1   (1ULL << 3)
+#define APSTS_EL1_MKEYVld    (1ULL << 0)
+#else
+#define APCTL_EL1_AppleMode  (1ULL << 0)
+#define APCTL_EL1_MKEYVld    (1ULL << 1)
+#define APCTL_EL1_KernKeyEn  (1ULL << 2)
+#endif
 
 
 
+#if defined(HAS_APPLE_PAC)
+// The value of ptrauth_string_discriminator("recover"), hardcoded so it can be used from assembly code
+#define PAC_DISCRIMINATOR_RECOVER    0x1e02
+#endif
+
 #ifdef __ASSEMBLER__
 
 /*
@@ -1463,14 +1648,14 @@ typedef enum {
  * Where the "variant" is the major number and the "revision" is the minor number.
  *
  * For example:
- *      Cyclone A0 is variant 0, revision 0, i.e. 0.
- *     Cyclone B0 is variant 1, revision 0, i.e. 0x10
+ *   Cyclone A0 is variant 0, revision 0, i.e. 0.
+ *   Cyclone B0 is variant 1, revision 0, i.e. 0x10
  * $0 - register to place value in
  */
 .macro GET_MIDR_CPU_VERSION
-mrs     $0, MIDR_EL1                                            // Read MIDR_EL1 for CPUID
-bfi             $0, $0, #(MIDR_EL1_VAR_SHIFT - 4), #4           // move bits 3:0 (revision) to 19:16 (below variant) to get values adjacent
-ubfx    $0, $0, #(MIDR_EL1_VAR_SHIFT - 4), #8           // And extract the concatenated bitstring to beginning of register
+mrs  $0, MIDR_EL1                                  // Read MIDR_EL1 for CPUID
+bfi  $0, $0, #(MIDR_EL1_VAR_SHIFT - 4), #4         // move bits 3:0 (revision) to 19:16 (below variant) to get values adjacent
+ubfx $0, $0, #(MIDR_EL1_VAR_SHIFT - 4), #8         // And extract the concatenated bitstring to beginning of register
 .endmacro
 
 /*
@@ -1483,8 +1668,8 @@ ubfx    $0, $0, #(MIDR_EL1_VAR_SHIFT - 4), #8           // And extract the conca
  */
 .macro SKIP_IF_CPU_VERSION_GREATER_OR_EQUAL
 GET_MIDR_CPU_VERSION $0
-cmp     $0, $1
-b.pl    $2                      // Unsigned "greater or equal"
+cmp  $0, $1
+b.pl $2                         // Unsigned "greater or equal"
 .endmacro
 
 /*
@@ -1497,8 +1682,8 @@ b.pl    $2                      // Unsigned "greater or equal"
  */
 .macro SKIP_IF_CPU_VERSION_LESS_THAN
 GET_MIDR_CPU_VERSION $0
-cmp     $0, $1
-b.mi    $2                      // Unsigned "strictly less than"
+cmp  $0, $1
+b.mi $2                         // Unsigned "strictly less than"
 .endmacro
 
 #endif /* __ASSEMBLER__ */
index 2ad70b7c811da6dc8aa6f5348d872c201cbd32df..705e31444adf02a3d31bfea04c397b22725f2141 100644 (file)
@@ -44,7 +44,7 @@
 #include <mach/machine/thread_status.h>
 
 #include <machine/atomic.h>
-#include <machine/machlimits.h>
+#include <machine/limits.h>
 
 #include <pexpert/arm/protos.h>
 
        assert(TEST_CONTEXT32_SANITY(context) || TEST_CONTEXT64_SANITY(context))
 
 
-#define COPYIN(src, dst, size)                                  \
-       (PSR64_IS_KERNEL(get_saved_state_cpsr(state)))  ?   \
-               copyin_kern(src, dst, size)                     \
-       :                                                       \
-               copyin(src, dst, size)
+#define COPYIN(src, dst, size)                           \
+       (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) ? \
+       copyin_kern(src, dst, size) :                    \
+       copyin(src, dst, size)
 
-#define COPYOUT(src, dst, size)                                 \
-       (PSR64_IS_KERNEL(get_saved_state_cpsr(state)))  ?   \
-               copyout_kern(src, dst, size)                    \
-       :                                                       \
-               copyout(src, dst, size)
+#define COPYOUT(src, dst, size)                          \
+       (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) ? \
+       copyout_kern(src, dst, size)                   : \
+       copyout(src, dst, size)
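
Both macros expand against a local named state, so callers must have the saved state in scope under exactly that name. A hypothetical wrapper (sketch only; assumes the usual copyin() return convention):

    static int
    read_user_u32(arm_saved_state_t *state, user_addr_t uaddr, uint32_t *out)
    {
            /* selects copyin_kern() when the saved CPSR indicates EL1 */
            return COPYIN(uaddr, (char *)out, sizeof(*out));
    }
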
 
 // Below is for concatenating a string param to a string literal
 #define STR1(x) #x
 #define STR(x) STR1(x)
 
-void panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss);
+void panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss) __abortlike;
 
-void sleh_synchronous_sp1(arm_context_t *, uint32_t, vm_offset_t);
+void sleh_synchronous_sp1(arm_context_t *, uint32_t, vm_offset_t) __abortlike;
 void sleh_synchronous(arm_context_t *, uint32_t, vm_offset_t);
 void sleh_irq(arm_saved_state_t *);
 void sleh_fiq(arm_saved_state_t *);
 void sleh_serror(arm_context_t *context, uint32_t esr, vm_offset_t far);
-void sleh_invalid_stack(arm_context_t *context, uint32_t esr, vm_offset_t far);
+void sleh_invalid_stack(arm_context_t *context, uint32_t esr, vm_offset_t far) __dead2;
 
 static void sleh_interrupt_handler_prologue(arm_saved_state_t *, unsigned int type);
 static void sleh_interrupt_handler_epilogue(void);
@@ -113,10 +111,10 @@ static void handle_mach_continuous_time_trap(arm_saved_state_t *);
 
 static void handle_msr_trap(arm_saved_state_t *state, uint32_t iss);
 
-extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, boolean_t);
+extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, bool, bool);
 
-static void handle_uncategorized(arm_saved_state_t *, boolean_t);
-static void handle_breakpoint(arm_saved_state_t *);
+static void handle_uncategorized(arm_saved_state_t *);
+static void handle_breakpoint(arm_saved_state_t *) __dead2;
 
 typedef void (*abort_inspector_t)(uint32_t, fault_status_t *, vm_prot_t *);
 static void inspect_instruction_abort(uint32_t, fault_status_t *, vm_prot_t *);
@@ -130,18 +128,19 @@ typedef void (*abort_handler_t)(arm_saved_state_t *, uint32_t, vm_offset_t, faul
 static void handle_user_abort(arm_saved_state_t *, uint32_t, vm_offset_t, fault_status_t, vm_prot_t, vm_offset_t);
 static void handle_kernel_abort(arm_saved_state_t *, uint32_t, vm_offset_t, fault_status_t, vm_prot_t, vm_offset_t);
 
-static void handle_pc_align(arm_saved_state_t *ss);
-static void handle_sp_align(arm_saved_state_t *ss);
-static void handle_sw_step_debug(arm_saved_state_t *ss);
-static void handle_wf_trap(arm_saved_state_t *ss);
+static void handle_pc_align(arm_saved_state_t *ss) __dead2;
+static void handle_sp_align(arm_saved_state_t *ss) __dead2;
+static void handle_sw_step_debug(arm_saved_state_t *ss) __dead2;
+static void handle_wf_trap(arm_saved_state_t *ss) __dead2;
+static void handle_fp_trap(arm_saved_state_t *ss, uint32_t esr) __dead2;
 
-static void handle_watchpoint(vm_offset_t fault_addr);
+static void handle_watchpoint(vm_offset_t fault_addr) __dead2;
 
 static void handle_abort(arm_saved_state_t *, uint32_t, vm_offset_t, vm_offset_t, abort_inspector_t, abort_handler_t);
 
-static void handle_user_trapped_instruction32(arm_saved_state_t *, uint32_t esr);
+static void handle_user_trapped_instruction32(arm_saved_state_t *, uint32_t esr) __dead2;
 
-static void handle_simd_trap(arm_saved_state_t *, uint32_t esr);
+static void handle_simd_trap(arm_saved_state_t *, uint32_t esr) __dead2;
 
 extern void mach_kauth_cred_uthread_update(void);
 void   mach_syscall_trace_exit(unsigned int retval, unsigned int call_number);
@@ -160,8 +159,11 @@ mach_syscall(struct arm_saved_state*);
 extern kern_return_t dtrace_user_probe(arm_saved_state_t* regs);
 extern boolean_t dtrace_tally_fault(user_addr_t);
 
-/* Traps for userland processing. Can't include bsd/sys/fasttrap_isa.h, so copy and paste the trap instructions
- *  over from that file. Need to keep these in sync! */
+/*
+ * Traps for userland processing. Can't include bsd/sys/fasttrap_isa.h, so copy
+ * and paste the trap instructions over from that file. Need to keep these in
+ * sync!
+ */
 #define FASTTRAP_ARM32_INSTR 0xe7ffdefc
 #define FASTTRAP_THUMB32_INSTR 0xdefc
 #define FASTTRAP_ARM64_INSTR 0xe7eeee7e
@@ -174,6 +176,7 @@ extern boolean_t dtrace_tally_fault(user_addr_t);
 perfCallback tempDTraceTrapHook = NULL; /* Pointer to DTrace fbt trap hook routine */
 #endif
 
+
 #if CONFIG_PGTRACE
 extern boolean_t pgtrace_enabled;
 #endif
@@ -187,16 +190,21 @@ extern volatile char pan_fault_value;
 #endif
 #endif
 
-#if defined(APPLECYCLONE)
-#define CPU_NAME        "Cyclone"
-#elif defined(APPLETYPHOON)
-#define CPU_NAME        "Typhoon"
+#if HAS_TWO_STAGE_SPR_LOCK
+#ifdef CONFIG_XNUPOST
+extern volatile vm_offset_t spr_lock_test_addr;
+extern volatile uint32_t spr_lock_exception_esr;
+#endif
+#endif
+
+#if defined(APPLETYPHOON)
+#define CPU_NAME "Typhoon"
 #elif defined(APPLETWISTER)
-#define CPU_NAME        "Twister"
+#define CPU_NAME "Twister"
 #elif defined(APPLEHURRICANE)
-#define CPU_NAME        "Hurricane"
+#define CPU_NAME "Hurricane"
 #else
-#define CPU_NAME        "Unknown"
+#define CPU_NAME "Unknown"
 #endif
 
 #if (CONFIG_KERNEL_INTEGRITY && defined(KERNEL_INTEGRITY_WT))
@@ -224,6 +232,7 @@ __ror(unsigned value, unsigned shift)
               (unsigned)(value) << ((unsigned)(sizeof(unsigned) * CHAR_BIT) - (unsigned)(shift));
 }
 
+__dead2
 static void
 arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_offset_t far)
 {
@@ -268,6 +277,9 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o
            (void *)l2c_err_sts, (void *)l2c_err_adr, (void *)l2c_err_inf);
 #else // !defined(NO_ECORE) && !defined(HAS_MIGSTS)
        uint64_t llc_err_sts, llc_err_adr, llc_err_inf, mpidr;
+#if defined(HAS_DPC_ERR)
+       uint64_t dpc_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_DPC_ERR_STS));
+#endif // defined(HAS_DPC_ERR)
 
        mpidr = __builtin_arm_rsr64("MPIDR_EL1");
 
@@ -286,10 +298,17 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o
        llc_err_inf = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_INF));
 
        panic_plain("Unhandled " CPU_NAME
-           " implementation specific error. state=%p esr=%#x far=%p p-core?%d\n"
+           " implementation specific error. state=%p esr=%#x far=%p p-core?%d"
+#if defined(HAS_DPC_ERR)
+           " dpc_err_sts:%p"
+#endif
+           "\n"
            "\tlsu_err_sts:%p, fed_err_sts:%p, mmu_err_sts:%p\n"
            "\tllc_err_sts:%p, llc_err_adr:%p, llc_err_inf:%p\n",
            state, esr, (void *)far, !!(mpidr & MPIDR_PNE),
+#if defined(HAS_DPC_ERR)
+           (void *)dpc_err_sts,
+#endif
            (void *)lsu_err_sts, (void *)fed_err_sts, (void *)mmu_err_sts,
            (void *)llc_err_sts, (void *)llc_err_adr, (void *)llc_err_inf);
 #endif
@@ -345,7 +364,7 @@ kernel_integrity_error_handler(uint32_t esr, vm_offset_t far)
 static void
 arm64_platform_error(arm_saved_state_t *state, uint32_t esr, vm_offset_t far)
 {
-       cpu_data_t      *cdp = getCpuDatap();
+       cpu_data_t *cdp = getCpuDatap();
 
 #if CONFIG_KERNEL_INTEGRITY
        kernel_integrity_error_handler(esr, far);
@@ -366,7 +385,7 @@ panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss)
        ss_valid = is_saved_state64(ss);
        arm_saved_state64_t *state = saved_state64(ss);
 
-       panic_plain("%s (saved state: %p%s)\n"
+       panic_plain("%s at pc 0x%016llx, lr 0x%016llx (saved state: %p%s)\n"
            "\t  x0: 0x%016llx  x1:  0x%016llx  x2:  0x%016llx  x3:  0x%016llx\n"
            "\t  x4: 0x%016llx  x5:  0x%016llx  x6:  0x%016llx  x7:  0x%016llx\n"
            "\t  x8: 0x%016llx  x9:  0x%016llx  x10: 0x%016llx  x11: 0x%016llx\n"
@@ -376,7 +395,7 @@ panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss)
            "\t  x24: 0x%016llx x25: 0x%016llx  x26: 0x%016llx  x27: 0x%016llx\n"
            "\t  x28: 0x%016llx fp:  0x%016llx  lr:  0x%016llx  sp:  0x%016llx\n"
            "\t  pc:  0x%016llx cpsr: 0x%08x         esr: 0x%08x          far: 0x%016llx\n",
-           msg, ss, (ss_valid ? "" : " INVALID"),
+           msg, state->pc, state->lr, ss, (ss_valid ? "" : " INVALID"),
            state->x[0], state->x[1], state->x[2], state->x[3],
            state->x[4], state->x[5], state->x[6], state->x[7],
            state->x[8], state->x[9], state->x[10], state->x[11],
@@ -388,12 +407,11 @@ panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss)
            state->pc, state->cpsr, state->esr, state->far);
 }
 
-
 void
 sleh_synchronous_sp1(arm_context_t *context, uint32_t esr, vm_offset_t far __unused)
 {
-       esr_exception_class_t   class = ESR_EC(esr);
-       arm_saved_state_t       *state = &context->ss;
+       esr_exception_class_t  class = ESR_EC(esr);
+       arm_saved_state_t     *state = &context->ss;
 
        switch (class) {
        case ESR_EC_UNCATEGORIZED:
@@ -409,19 +427,51 @@ sleh_synchronous_sp1(arm_context_t *context, uint32_t esr, vm_offset_t far __unu
        }
 }
 
+#if defined(HAS_TWO_STAGE_SPR_LOCK) && defined(CONFIG_XNUPOST)
+static bool
+handle_msr_write_from_xnupost(arm_saved_state_t *state, uint32_t esr)
+{
+       user_addr_t pc = get_saved_state_pc(state);
+       if ((spr_lock_test_addr != 0) && (pc == spr_lock_test_addr)) {
+               spr_lock_exception_esr = esr;
+               set_saved_state_pc(state, pc + 4);
+               return true;
+       }
+
+       return false;
+}
+#endif
+
 void
 sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
 {
-       esr_exception_class_t   class = ESR_EC(esr);
-       arm_saved_state_t               *state = &context->ss;
-       vm_offset_t                             recover = 0, recover_saved = 0;
-       thread_t                                thread = current_thread();
+       esr_exception_class_t  class   = ESR_EC(esr);
+       arm_saved_state_t    * state   = &context->ss;
+       vm_offset_t            recover = 0;
+       thread_t               thread  = current_thread();
+#if MACH_ASSERT
+       int                    preemption_level = get_preemption_level();
+#endif
 
        ASSERT_CONTEXT_SANITY(context);
 
+       if (__improbable(ESR_INSTR_IS_2BYTES(esr))) {
+               /*
+                * We no longer support 32-bit, which means no 2-byte
+                * instructions.
+                */
+               if (PSR64_IS_USER(get_saved_state_cpsr(state))) {
+                       panic("Exception on 2-byte instruction, "
+                           "context=%p, esr=%#x, far=%p",
+                           context, esr, (void *)far);
+               } else {
+                       panic_with_thread_kernel_state("Exception on 2-byte instruction", state);
+               }
+       }
+
        /* Don't run exception handler with recover handler set in case of double fault */
        if (thread->recover) {
-               recover = recover_saved = thread->recover;
+               recover = thread->recover;
                thread->recover = (vm_offset_t)NULL;
        }
 
@@ -441,7 +491,7 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
 
        case ESR_EC_DABORT_EL0:
                handle_abort(state, esr, far, recover, inspect_data_abort, handle_user_abort);
-               assert(0); /* Unreachable */
+               thread_exception_return();
 
        case ESR_EC_MSR_TRAP:
                handle_msr_trap(state, ESR_ISS(esr));
@@ -449,7 +499,7 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
 
        case ESR_EC_IABORT_EL0:
                handle_abort(state, esr, far, recover, inspect_instruction_abort, handle_user_abort);
-               assert(0); /* Unreachable */
+               thread_exception_return();
 
        case ESR_EC_IABORT_EL1:
 
@@ -457,8 +507,7 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
 
        case ESR_EC_PC_ALIGN:
                handle_pc_align(state);
-               assert(0); /* Unreachable */
-               break;
+               __builtin_unreachable();
 
        case ESR_EC_DABORT_EL1:
                handle_abort(state, esr, far, recover, inspect_data_abort, handle_kernel_abort);
@@ -467,103 +516,61 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
        case ESR_EC_UNCATEGORIZED:
                assert(!ESR_ISS(esr));
 
-               handle_uncategorized(&context->ss, ESR_INSTR_IS_2BYTES(esr));
-               /* TODO: Uncomment this after stackshot uses a brk instruction
-                * rather than an undefined instruction, as stackshot is the
-                * only case where we want to return to the first-level handler.
-                */
-               //assert(0); /* Unreachable */
+#if defined(HAS_TWO_STAGE_SPR_LOCK) && defined(CONFIG_XNUPOST)
+               if (handle_msr_write_from_xnupost(state, esr)) {
+                       break;
+               }
+#endif
+               handle_uncategorized(&context->ss);
                break;
 
        case ESR_EC_SP_ALIGN:
                handle_sp_align(state);
-               assert(0); /* Unreachable */
-               break;
+               __builtin_unreachable();
 
        case ESR_EC_BKPT_AARCH32:
                handle_breakpoint(state);
-               assert(0); /* Unreachable */
-               break;
+               __builtin_unreachable();
 
        case ESR_EC_BRK_AARCH64:
                if (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) {
-                       kprintf("Breakpoint instruction exception from kernel.  Hanging here (by design).\n");
-                       for (;;) {
-                               ;
-                       }
-
-                       __unreachable_ok_push
-                       DebuggerCall(EXC_BREAKPOINT, &context->ss);
-                       break;
-                       __unreachable_ok_pop
+                       panic_with_thread_kernel_state("Break instruction exception from kernel. Panic (by design)", state);
                } else {
                        handle_breakpoint(state);
-                       assert(0); /* Unreachable */
                }
+               __builtin_unreachable();
 
        case ESR_EC_BKPT_REG_MATCH_EL0:
                if (FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) {
                        handle_breakpoint(state);
-                       assert(0); /* Unreachable */
                }
                panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p",
                    class, state, class, esr, (void *)far);
-               assert(0); /* Unreachable */
-               break;
+               __builtin_unreachable();
 
        case ESR_EC_BKPT_REG_MATCH_EL1:
-               if (!PE_i_can_has_debugger(NULL) && FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) {
-                       kprintf("Hardware Breakpoint Debug exception from kernel.  Hanging here (by design).\n");
-                       for (;;) {
-                               ;
-                       }
-
-                       __unreachable_ok_push
-                       DebuggerCall(EXC_BREAKPOINT, &context->ss);
-                       break;
-                       __unreachable_ok_pop
-               }
-               panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p",
-                   class, state, class, esr, (void *)far);
-               assert(0); /* Unreachable */
-               break;
+               panic_with_thread_kernel_state("Hardware Breakpoint Debug exception from kernel. Panic (by design)", state);
+               __builtin_unreachable();
 
        case ESR_EC_SW_STEP_DEBUG_EL0:
                if (FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) {
                        handle_sw_step_debug(state);
-                       assert(0); /* Unreachable */
                }
                panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p",
                    class, state, class, esr, (void *)far);
-               assert(0); /* Unreachable */
-               break;
+               __builtin_unreachable();
 
        case ESR_EC_SW_STEP_DEBUG_EL1:
-               if (!PE_i_can_has_debugger(NULL) && FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) {
-                       kprintf("Software Step Debug exception from kernel.  Hanging here (by design).\n");
-                       for (;;) {
-                               ;
-                       }
-
-                       __unreachable_ok_push
-                       DebuggerCall(EXC_BREAKPOINT, &context->ss);
-                       break;
-                       __unreachable_ok_pop
-               }
-               panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p",
-                   class, state, class, esr, (void *)far);
-               assert(0); /* Unreachable */
-               break;
+               panic_with_thread_kernel_state("Software Step Debug exception from kernel. Panic (by design)", state);
+               __builtin_unreachable();
 
        case ESR_EC_WATCHPT_MATCH_EL0:
                if (FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) {
                        handle_watchpoint(far);
-                       assert(0); /* Unreachable */
                }
                panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p",
                    class, state, class, esr, (void *)far);
-               assert(0); /* Unreachable */
-               break;
+               __builtin_unreachable();
 
        case ESR_EC_WATCHPT_MATCH_EL1:
                /*
@@ -576,13 +583,11 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
                }
                panic("Unsupported Class %u event code. state=%p class=%u esr=%u far=%p",
                    class, state, class, esr, (void *)far);
-               assert(0); /* Unreachable */
-               break;
+               __builtin_unreachable();
 
        case ESR_EC_TRAP_SIMD_FP:
                handle_simd_trap(state, esr);
-               assert(0);
-               break;
+               __builtin_unreachable();
 
        case ESR_EC_ILLEGAL_INSTR_SET:
                if (EXCB_ACTION_RERUN !=
@@ -590,10 +595,9 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
                        // instruction is not re-executed
                        panic("Illegal instruction set exception. state=%p class=%u esr=%u far=%p spsr=0x%x",
                            state, class, esr, (void *)far, get_saved_state_cpsr(state));
-                       assert(0);
                }
                // must clear this fault in PSR to re-run
-               set_saved_state_cpsr(state, get_saved_state_cpsr(state) & (~PSR64_IL));
+               mask_saved_state_cpsr(state, 0, PSR64_IL);
                break;
 
        case ESR_EC_MCR_MRC_CP15_TRAP:
@@ -602,25 +606,32 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
        case ESR_EC_LDC_STC_CP14_TRAP:
        case ESR_EC_MCRR_MRRC_CP14_TRAP:
                handle_user_trapped_instruction32(state, esr);
-               assert(0);
-               break;
+               __builtin_unreachable();
 
        case ESR_EC_WFI_WFE:
                // Use of WFI or WFE instruction when they have been disabled for EL0
                handle_wf_trap(state);
-               assert(0);      /* Unreachable */
-               break;
+               __builtin_unreachable();
+
+       case ESR_EC_FLOATING_POINT_64:
+               handle_fp_trap(state, esr);
+               __builtin_unreachable();
+
 
        default:
                panic("Unsupported synchronous exception. state=%p class=%u esr=%u far=%p",
                    state, class, esr, (void *)far);
-               assert(0); /* Unreachable */
-               break;
+               __builtin_unreachable();
        }
 
-       if (recover_saved) {
-               thread->recover = recover_saved;
+       if (recover) {
+               thread->recover = recover;
        }
+#if MACH_ASSERT
+       if (preemption_level != get_preemption_level()) {
+               panic("synchronous exception changed preemption level from %d to %d", preemption_level, get_preemption_level());
+       }
+#endif
 }
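
The rewrite above replaces each "assert(0); break;" tail with __builtin_unreachable(), which tells the compiler the preceding handler cannot return, and adds a MACH_ASSERT epilogue verifying that a synchronous exception leaves the preemption level untouched. A minimal sketch of both ideas, with preemption_level and get_preemption_level() as stand-ins for the kernel primitives:

    #include <stdio.h>
    #include <stdlib.h>

    static int preemption_level;                 /* stand-in for per-CPU state */
    static int get_preemption_level(void) { return preemption_level; }

    static _Noreturn void
    handle_fatal(int class)
    {
            fprintf(stderr, "fatal exception class %d\n", class);
            abort();
    }

    static void
    dispatch(int class)
    {
            int entry_level = get_preemption_level();

            switch (class) {
            case 1:
                    handle_fatal(class);
                    __builtin_unreachable();     /* handler never returns */
            default:
                    break;
            }

            /* Mirrors the new MACH_ASSERT check: a handler that does return
             * must leave the preemption level exactly as it found it. */
            if (entry_level != get_preemption_level()) {
                    fprintf(stderr, "preemption level changed\n");
                    abort();
            }
    }

    int main(void) { dispatch(0); return 0; }
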
 
 /*
@@ -628,21 +639,14 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
  * ARM64_TODO: For now, we assume this is for undefined instruction exceptions.
  */
 static void
-handle_uncategorized(arm_saved_state_t *state, boolean_t instrLen2)
+handle_uncategorized(arm_saved_state_t *state)
 {
        exception_type_t           exception = EXC_BAD_INSTRUCTION;
-       mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED};
-       mach_msg_type_number_t     numcodes = 2;
-       uint32_t                   instr = 0;
-
-       if (instrLen2) {
-               uint16_t instr16 = 0;
-               COPYIN(get_saved_state_pc(state), (char *)&instr16, sizeof(instr16));
+       mach_exception_data_type_t codes[2]  = {EXC_ARM_UNDEFINED};
+       mach_msg_type_number_t     numcodes  = 2;
+       uint32_t                   instr     = 0;
 
-               instr = instr16;
-       } else {
-               COPYIN(get_saved_state_pc(state), (char *)&instr, sizeof(instr));
-       }
+       COPYIN(get_saved_state_pc(state), (char *)&instr, sizeof(instr));
 
 #if CONFIG_DTRACE
        if (tempDTraceTrapHook && (tempDTraceTrapHook(exception, state, 0, 0) == KERN_SUCCESS)) {
@@ -697,7 +701,7 @@ handle_uncategorized(arm_saved_state_t *state, boolean_t instrLen2)
                         */
                        kstackptr = (vm_offset_t) current_thread()->machine.kstackptr;
                        if (kstackptr) {
-                               ((thread_kernel_state_t) kstackptr)->machine.ss = *state;
+                               copy_signed_thread_state(&((thread_kernel_state_t) kstackptr)->machine.ss, state);
                        }
 
                        /* Hop into the debugger (typically either due to a
@@ -714,68 +718,50 @@ handle_uncategorized(arm_saved_state_t *state, boolean_t instrLen2)
        }
 
        /*
-        * Check for GDB  breakpoint via illegal opcode.
+        * Check for GDB breakpoint via illegal opcode.
         */
-       if (instrLen2) {
-               if (IS_THUMB_GDB_TRAP(instr)) {
-                       exception = EXC_BREAKPOINT;
-                       codes[0] = EXC_ARM_BREAKPOINT;
-                       codes[1] = instr;
-               } else {
-                       codes[1] = instr;
-               }
+       if (IS_ARM_GDB_TRAP(instr)) {
+               exception = EXC_BREAKPOINT;
+               codes[0] = EXC_ARM_BREAKPOINT;
+               codes[1] = instr;
        } else {
-               if (IS_ARM_GDB_TRAP(instr)) {
-                       exception = EXC_BREAKPOINT;
-                       codes[0] = EXC_ARM_BREAKPOINT;
-                       codes[1] = instr;
-               } else if (IS_THUMB_GDB_TRAP((instr & 0xFFFF))) {
-                       exception = EXC_BREAKPOINT;
-                       codes[0] = EXC_ARM_BREAKPOINT;
-                       codes[1] = instr & 0xFFFF;
-               } else if (IS_THUMB_GDB_TRAP((instr >> 16))) {
-                       exception = EXC_BREAKPOINT;
-                       codes[0] = EXC_ARM_BREAKPOINT;
-                       codes[1] = instr >> 16;
-               } else {
-                       codes[1] = instr;
-               }
+               codes[1] = instr;
        }
 
        exception_triage(exception, codes, numcodes);
-       assert(0); /* NOTREACHED */
+       __builtin_unreachable();
 }
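
With the thumb (instrLen2) path gone, handle_uncategorized only needs the single A32/A64 check: if the copied-in word is GDB's trap opcode, the exception is upgraded from EXC_BAD_INSTRUCTION to EXC_BREAKPOINT with codes {EXC_ARM_BREAKPOINT, instr}. A sketch of the check, assuming the conventional GDB ARM trap opcode (IS_ARM_GDB_TRAP's actual definition lives elsewhere in xnu):

    #include <stdbool.h>
    #include <stdint.h>

    /* Assumed value: GDB's traditional ARM breakpoint, a permanently
     * undefined instruction encoding. */
    #define GDB_ARM_TRAP_OPCODE 0xE7FFDEFEu

    static bool
    is_arm_gdb_trap(uint32_t instr)
    {
            return instr == GDB_ARM_TRAP_OPCODE;
    }
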
 
 static void
 handle_breakpoint(arm_saved_state_t *state)
 {
-       exception_type_t                        exception = EXC_BREAKPOINT;
-       mach_exception_data_type_t      codes[2] = {EXC_ARM_BREAKPOINT};
-       mach_msg_type_number_t          numcodes = 2;
+       exception_type_t           exception = EXC_BREAKPOINT;
+       mach_exception_data_type_t codes[2]  = {EXC_ARM_BREAKPOINT};
+       mach_msg_type_number_t     numcodes  = 2;
 
        codes[1] = get_saved_state_pc(state);
        exception_triage(exception, codes, numcodes);
-       assert(0); /* NOTREACHED */
+       __builtin_unreachable();
 }
 
 static void
 handle_watchpoint(vm_offset_t fault_addr)
 {
-       exception_type_t                        exception = EXC_BREAKPOINT;
-       mach_exception_data_type_t      codes[2] = {EXC_ARM_DA_DEBUG};
-       mach_msg_type_number_t          numcodes = 2;
+       exception_type_t           exception = EXC_BREAKPOINT;
+       mach_exception_data_type_t codes[2]  = {EXC_ARM_DA_DEBUG};
+       mach_msg_type_number_t     numcodes  = 2;
 
        codes[1] = fault_addr;
        exception_triage(exception, codes, numcodes);
-       assert(0); /* NOTREACHED */
+       __builtin_unreachable();
 }
 
 static void
 handle_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr, vm_offset_t recover,
     abort_inspector_t inspect_abort, abort_handler_t handler)
 {
-       fault_status_t          fault_code;
-       vm_prot_t                       fault_type;
+       fault_status_t fault_code;
+       vm_prot_t      fault_type;
 
        inspect_abort(ESR_ISS(esr), &fault_code, &fault_type);
        handler(state, esr, fault_addr, fault_code, fault_type, recover);
@@ -819,7 +805,7 @@ handle_pc_align(arm_saved_state_t *ss)
        codes[1] = get_saved_state_pc(ss);
 
        exception_triage(exc, codes, numcodes);
-       assert(0); /* NOTREACHED */
+       __builtin_unreachable();
 }
 
 static void
@@ -838,22 +824,59 @@ handle_sp_align(arm_saved_state_t *ss)
        codes[1] = get_saved_state_sp(ss);
 
        exception_triage(exc, codes, numcodes);
-       assert(0); /* NOTREACHED */
+       __builtin_unreachable();
 }
 
 static void
-handle_wf_trap(arm_saved_state_t *ss)
+handle_wf_trap(arm_saved_state_t *state)
 {
        exception_type_t exc;
        mach_exception_data_type_t codes[2];
        mach_msg_type_number_t numcodes = 2;
+       uint32_t instr = 0;
+
+       COPYIN(get_saved_state_pc(state), (char *)&instr, sizeof(instr));
 
        exc = EXC_BAD_INSTRUCTION;
        codes[0] = EXC_ARM_UNDEFINED;
-       codes[1] = get_saved_state_sp(ss);
+       codes[1] = instr;
+
+       exception_triage(exc, codes, numcodes);
+       __builtin_unreachable();
+}
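
handle_wf_trap previously reported the stack pointer in codes[1]; it now copies in the faulting WFI/WFE instruction, matching the other undefined-instruction paths. COPYIN is a wrapper over the copyin machinery; a self-contained sketch of the idea, with a memcpy stand-in for the real fault-safe copy:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    typedef uintptr_t user_addr_t;

    /* Stand-in for xnu's copyin(): the real one validates the user range
     * and recovers from faults; this stub just copies. 0 means success. */
    static int
    copyin(user_addr_t uaddr, void *kaddr, size_t len)
    {
            memcpy(kaddr, (const void *)uaddr, len);
            return 0;
    }

    static uint32_t
    fetch_faulting_instruction(user_addr_t pc)
    {
            uint32_t instr = 0;
            (void)copyin(pc, &instr, sizeof(instr)); /* best effort; 0 if unreadable */
            return instr;
    }
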
+
+static void
+handle_fp_trap(arm_saved_state_t *state, uint32_t esr)
+{
+       exception_type_t exc = EXC_ARITHMETIC;
+       mach_exception_data_type_t codes[2];
+       mach_msg_type_number_t numcodes = 2;
+       uint32_t instr = 0;
+
+       /* The floating point trap flags are only valid if TFV is set. */
+       if (!(esr & ISS_FP_TFV)) {
+               codes[0] = EXC_ARM_FP_UNDEFINED;
+       } else if (esr & ISS_FP_UFF) {
+               codes[0] = EXC_ARM_FP_UF;
+       } else if (esr & ISS_FP_OFF) {
+               codes[0] = EXC_ARM_FP_OF;
+       } else if (esr & ISS_FP_IOF) {
+               codes[0] = EXC_ARM_FP_IO;
+       } else if (esr & ISS_FP_DZF) {
+               codes[0] = EXC_ARM_FP_DZ;
+       } else if (esr & ISS_FP_IDF) {
+               codes[0] = EXC_ARM_FP_ID;
+       } else if (esr & ISS_FP_IXF) {
+               codes[0] = EXC_ARM_FP_IX;
+       } else {
+               panic("Unrecognized floating point exception, state=%p, esr=%#x", state, esr);
+       }
+
+       COPYIN(get_saved_state_pc(state), (char *)&instr, sizeof(instr));
+       codes[1] = instr;
 
        exception_triage(exc, codes, numcodes);
-       assert(0); /* NOTREACHED */
+       __builtin_unreachable();
 }
 
 
@@ -876,8 +899,7 @@ handle_sw_step_debug(arm_saved_state_t *state)
                panic_with_thread_kernel_state("SW_STEP_DEBUG exception thread DebugData is NULL.", state);
        }
 
-       set_saved_state_cpsr((thread->machine.upcb),
-           get_saved_state_cpsr((thread->machine.upcb)) & ~(PSR64_SS | DAIF_IRQF | DAIF_FIQF));
+       mask_saved_state_cpsr(thread->machine.upcb, 0, PSR64_SS | DAIF_IRQF | DAIF_FIQF);
 
        // Special encoding for gdb single step event on ARM
        exc = EXC_BREAKPOINT;
@@ -885,7 +907,7 @@ handle_sw_step_debug(arm_saved_state_t *state)
        codes[1] = 0;
 
        exception_triage(exc, codes, numcodes);
-       assert(0); /* NOTREACHED */
+       __builtin_unreachable();
 }
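
Several hunks in this file replace the read-modify-write idiom set_saved_state_cpsr(s, get_saved_state_cpsr(s) & ~bits) with mask_saved_state_cpsr(s, set_bits, clear_bits), a single accessor that can also re-sign the saved state on pointer-authentication hardware. Its visible effect is plain bit set/clear; a sketch against a raw cpsr field (the set-before-clear ordering is an assumption and only matters if the two masks overlap):

    #include <stdint.h>

    struct saved_state { uint32_t cpsr; };

    /* mask_saved_state_cpsr(state, 0, PSR64_IL) at the call sites above
     * therefore just clears PSR64_IL. */
    static void
    mask_saved_state_cpsr(struct saved_state *s, uint32_t set_bits, uint32_t clear_bits)
    {
            s->cpsr = (s->cpsr | set_bits) & ~clear_bits;
    }
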
 
 static int
@@ -958,14 +980,48 @@ is_parity_error(fault_status_t status)
        }
 }
 
+static void
+set_saved_state_pc_to_recovery_handler(arm_saved_state_t *iss, vm_offset_t recover)
+{
+#if defined(HAS_APPLE_PAC)
+       thread_t thread = current_thread();
+       const uintptr_t disc = ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER);
+       const char *panic_msg = "Illegal thread->recover value %p";
+
+       MANIPULATE_SIGNED_THREAD_STATE(iss,
+           // recover = (vm_offset_t)ptrauth_auth_data((void *)recover, ptrauth_key_function_pointer,
+           //     ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER));
+           "mov        x1, %[recover]          \n"
+           "mov        x6, %[disc]             \n"
+           "autia      x1, x6                  \n"
+           // if (recover != (vm_offset_t)ptrauth_strip((void *)recover, ptrauth_key_function_pointer)) {
+           "mov        x6, x1                  \n"
+           "xpaci      x6                      \n"
+           "cmp        x1, x6                  \n"
+           "beq        1f                      \n"
+           //         panic("Illegal thread->recover value %p", (void *)recover);
+           "mov        x0, %[panic_msg]        \n"
+           "bl         _panic                  \n"
+           // }
+           "1:                                 \n"
+           "str        x1, [x0, %[SS64_PC]]    \n",
+           [recover]     "r"(recover),
+           [disc]        "r"(disc),
+           [panic_msg]   "r"(panic_msg)
+           );
+#else
+       set_saved_state_pc(iss, recover);
+#endif
+}
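
The commented C in the assembly above is the contract: authenticate thread->recover against its storage-address-blended discriminator, panic if the signature fails (a failed auth leaves poison bits that stripping would remove), and only then install the value as the continuation PC. It must run inside MANIPULATE_SIGNED_THREAD_STATE so the re-signed state stays self-consistent. Roughly the same logic in C, as a sketch using clang's <ptrauth.h> intrinsics (the 0x1e02 discriminator is a hypothetical stand-in for PAC_DISCRIMINATOR_RECOVER):

    #include <ptrauth.h>
    #include <stdint.h>

    static uintptr_t
    authed_recover(void **recover_slot, uintptr_t recover)
    {
            uintptr_t authed = (uintptr_t)ptrauth_auth_data((void *)recover,
                ptrauth_key_function_pointer,
                ptrauth_blend_discriminator(recover_slot, 0x1e02 /* assumed */));

            /* After a successful auth the value equals its stripped form;
             * a forged pointer keeps poison bits and fails this test. */
            if (authed != (uintptr_t)ptrauth_strip((void *)authed,
                ptrauth_key_function_pointer)) {
                    /* panic("Illegal thread->recover value %p", ...) */
            }
            return authed;
    }
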
+
 static void
 handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr,
     fault_status_t fault_code, vm_prot_t fault_type, vm_offset_t recover)
 {
-       exception_type_t                exc = EXC_BAD_ACCESS;
-       mach_exception_data_type_t      codes[2];
-       mach_msg_type_number_t          numcodes = 2;
-       thread_t                        thread = current_thread();
+       exception_type_t           exc      = EXC_BAD_ACCESS;
+       mach_exception_data_type_t codes[2];
+       mach_msg_type_number_t     numcodes = 2;
+       thread_t                   thread   = current_thread();
 
        (void)esr;
        (void)state;
@@ -988,21 +1044,18 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr
                }
 
 #if CONFIG_DTRACE
-               if (thread->options & TH_OPT_DTRACE) {  /* Executing under dtrace_probe? */
+               if (thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */
                        if (dtrace_tally_fault(vm_fault_addr)) { /* Should a user mode fault under dtrace be ignored? */
                                if (recover) {
-                                       set_saved_state_pc(state, recover);
+                                       set_saved_state_pc_to_recovery_handler(state, recover);
                                } else {
-                                       boolean_t intr = ml_set_interrupts_enabled(FALSE);
+                                       ml_set_interrupts_enabled(FALSE);
                                        panic_with_thread_kernel_state("copyin/out has no recovery point", state);
-                                       (void) ml_set_interrupts_enabled(intr);
                                }
                                return;
                        } else {
-                               boolean_t intr = ml_set_interrupts_enabled(FALSE);
+                               ml_set_interrupts_enabled(FALSE);
                                panic_with_thread_kernel_state("Unexpected UMW page fault under dtrace_probe", state);
-                               (void) ml_set_interrupts_enabled(intr);
-                               return;
                        }
                }
 #else
@@ -1022,7 +1075,7 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr
                /* check to see if it is just a pmap ref/modify fault */
 
                if ((result != KERN_SUCCESS) && !is_translation_fault(fault_code)) {
-                       result = arm_fast_fault(map->pmap, trunc_page(vm_fault_addr), fault_type, TRUE);
+                       result = arm_fast_fault(map->pmap, trunc_page(vm_fault_addr), fault_type, (fault_code == FSC_ACCESS_FLAG_FAULT_L3), TRUE);
                }
                if (result != KERN_SUCCESS) {
                        {
@@ -1033,8 +1086,15 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr
                        }
                }
                if (result == KERN_SUCCESS || result == KERN_ABORTED) {
-                       thread_exception_return();
-                       /* NOTREACHED */
+                       return;
+               }
+
+               /*
+                * vm_fault() should never return KERN_FAILURE for page faults from user space.
+                * If it does, we're leaking preemption disables somewhere in the kernel.
+                */
+               if (__improbable(result == KERN_FAILURE)) {
+                       panic("vm_fault() KERN_FAILURE from user fault on thread %p", thread);
                }
 
                codes[0] = result;
@@ -1044,8 +1104,7 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr
 #if defined(APPLE_ARM64_ARCH_FAMILY)
                if (fault_code == FSC_SYNC_PARITY) {
                        arm64_platform_error(state, esr, fault_addr);
-                       thread_exception_return();
-                       /* NOTREACHED */
+                       return;
                }
 #else
                panic("User parity error.");
@@ -1056,7 +1115,7 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr
 
        codes[1] = fault_addr;
        exception_triage(exc, codes, numcodes);
-       assert(0); /* NOTREACHED */
+       __builtin_unreachable();
 }
 
 #if __ARM_PAN_AVAILABLE__
@@ -1091,26 +1150,24 @@ static void
 handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr,
     fault_status_t fault_code, vm_prot_t fault_type, vm_offset_t recover)
 {
-       thread_t                thread = current_thread();
+       thread_t thread = current_thread();
        (void)esr;
 
 #if CONFIG_DTRACE
-       if (is_vm_fault(fault_code) && thread->options & TH_OPT_DTRACE) {       /* Executing under dtrace_probe? */
+       if (is_vm_fault(fault_code) && thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */
                if (dtrace_tally_fault(fault_addr)) { /* Should a fault under dtrace be ignored? */
                        /*
                         * Point to next instruction, or recovery handler if set.
                         */
                        if (recover) {
-                               set_saved_state_pc(state, recover);
+                               set_saved_state_pc_to_recovery_handler(state, recover);
                        } else {
-                               set_saved_state_pc(state, get_saved_state_pc(state) + 4);
+                               add_saved_state_pc(state, 4);
                        }
                        return;
                } else {
-                       boolean_t intr = ml_set_interrupts_enabled(FALSE);
+                       ml_set_interrupts_enabled(FALSE);
                        panic_with_thread_kernel_state("Unexpected page fault under dtrace_probe", state);
-                       (void) ml_set_interrupts_enabled(intr);
-                       return;
                }
        }
 #endif
@@ -1122,9 +1179,9 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
 #endif
 
        if (is_vm_fault(fault_code)) {
-               kern_return_t   result = KERN_FAILURE;
-               vm_map_t        map;
-               int             interruptible;
+               kern_return_t result = KERN_FAILURE;
+               vm_map_t      map;
+               int           interruptible;
 
                /*
                 * Ensure no faults in the physical aperture. This could happen if
@@ -1141,7 +1198,8 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
                        // that PAN is re-enabled for the exception handler and that
                        // accessing the test address produces a PAN fault.
                        pan_fault_value = *(char *)pan_test_addr;
-                       set_saved_state_pc(state, get_saved_state_pc(state) + 4);
+                       __builtin_arm_wsr("pan", 1); // turn PAN back on after the nested exception cleared it for this context
+                       add_saved_state_pc(state, 4);
                        return;
                }
 #endif
@@ -1174,7 +1232,7 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
 
                /* check to see if it is just a pmap ref/modify fault */
                if (!is_translation_fault(fault_code)) {
-                       result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE);
+                       result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, (fault_code == FSC_ACCESS_FLAG_FAULT_L3), FALSE);
                        if (result == KERN_SUCCESS) {
                                return;
                        }
@@ -1197,7 +1255,7 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
                 *  If we have a recover handler, invoke it now.
                 */
                if (recover) {
-                       set_saved_state_pc(state, recover);
+                       set_saved_state_pc_to_recovery_handler(state, recover);
                        return;
                }
 
@@ -1211,11 +1269,11 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
                                // the exception handler
                                if (pan_exception_level == 1) {
                                        pan_fault_value = *(char *)pan_test_addr;
+                                       __builtin_arm_wsr("pan", 1); // turn PAN back on after the nested exception cleared it for this context
                                }
                                // this fault address is used for PAN test
                                // disable PAN and rerun
-                               set_saved_state_cpsr(state,
-                                   get_saved_state_cpsr(state) & (~PSR64_PAN));
+                               mask_saved_state_cpsr(state, 0, PSR64_PAN);
                                return;
                        }
 #endif
@@ -1228,6 +1286,10 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
                panic_with_thread_kernel_state("Unexpected abort while on interrupt stack.", state);
 #endif
        } else if (is_alignment_fault(fault_code)) {
+               if (recover) {
+                       set_saved_state_pc_to_recovery_handler(state, recover);
+                       return;
+               }
                panic_with_thread_kernel_state("Unaligned kernel data abort.", state);
        } else if (is_parity_error(fault_code)) {
 #if defined(APPLE_ARM64_ARCH_FAMILY)
@@ -1250,9 +1312,9 @@ extern void syscall_trace(struct arm_saved_state * regs);
 static void
 handle_svc(arm_saved_state_t *state)
 {
-       int trap_no = get_saved_state_svc_number(state);
-       thread_t thread = current_thread();
-       struct proc *p;
+       int      trap_no = get_saved_state_svc_number(state);
+       thread_t thread  = current_thread();
+       struct   proc *p;
 
 #define handle_svc_kprintf(x...) /* kprintf("handle_svc: " x) */
 
@@ -1311,9 +1373,9 @@ static void
 handle_msr_trap(arm_saved_state_t *state, uint32_t iss)
 {
        exception_type_t           exception = EXC_BAD_INSTRUCTION;
-       mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED};
-       mach_msg_type_number_t     numcodes = 2;
-       uint32_t                   instr = 0;
+       mach_exception_data_type_t codes[2]  = {EXC_ARM_UNDEFINED};
+       mach_msg_type_number_t     numcodes  = 2;
+       uint32_t                   instr     = 0;
 
        (void)iss;
 
@@ -1335,9 +1397,9 @@ static void
 handle_user_trapped_instruction32(arm_saved_state_t *state, uint32_t esr)
 {
        exception_type_t           exception = EXC_BAD_INSTRUCTION;
-       mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED};
-       mach_msg_type_number_t     numcodes = 2;
-       uint32_t                   instr = 0;
+       mach_exception_data_type_t codes[2]  = {EXC_ARM_UNDEFINED};
+       mach_msg_type_number_t     numcodes  = 2;
+       uint32_t                   instr;
 
        if (is_saved_state64(state)) {
                panic("ESR (0x%x) for instruction trapped from U32, but saved state is 64-bit.", esr);
@@ -1351,15 +1413,16 @@ handle_user_trapped_instruction32(arm_saved_state_t *state, uint32_t esr)
        codes[1] = instr;
 
        exception_triage(exception, codes, numcodes);
+       __builtin_unreachable();
 }
 
 static void
 handle_simd_trap(arm_saved_state_t *state, uint32_t esr)
 {
        exception_type_t           exception = EXC_BAD_INSTRUCTION;
-       mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED};
-       mach_msg_type_number_t     numcodes = 2;
-       uint32_t                   instr = 0;
+       mach_exception_data_type_t codes[2]  = {EXC_ARM_UNDEFINED};
+       mach_msg_type_number_t     numcodes  = 2;
+       uint32_t                   instr     = 0;
 
        if (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) {
                panic("ESR (0x%x) for SIMD trap from userland, actually came from kernel?", esr);
@@ -1369,20 +1432,23 @@ handle_simd_trap(arm_saved_state_t *state, uint32_t esr)
        codes[1] = instr;
 
        exception_triage(exception, codes, numcodes);
+       __builtin_unreachable();
 }
 
 void
 sleh_irq(arm_saved_state_t *state)
 {
-       uint64_t     timestamp            = 0;
-       uint32_t     old_entropy_data     = 0;
-       uint32_t *   old_entropy_data_ptr = NULL;
-       uint32_t *   new_entropy_data_ptr = NULL;
-       cpu_data_t * cdp                  = getCpuDatap();
-#if DEVELOPMENT || DEBUG
+       uint64_t     timestamp                = 0;
+       uint32_t     old_entropy_data         = 0;
+       uint32_t     old_entropy_sample_count = 0;
+       size_t       entropy_index            = 0;
+       uint32_t *   entropy_data_ptr         = NULL;
+       cpu_data_t * cdp                      = getCpuDatap();
+#if MACH_ASSERT
        int preemption_level = get_preemption_level();
 #endif
 
+
        sleh_interrupt_handler_prologue(state, DBG_INTR_TYPE_OTHER);
 
        /* Run the registered interrupt handler. */
@@ -1401,21 +1467,18 @@ sleh_irq(arm_saved_state_t *state)
         * is the entire point of the entropy buffer, we will not worry about
         * these races for now.
         */
-       old_entropy_data_ptr = EntropyData.index_ptr;
-       new_entropy_data_ptr = old_entropy_data_ptr + 1;
-
-       if (new_entropy_data_ptr >= &EntropyData.buffer[ENTROPY_BUFFER_SIZE]) {
-               new_entropy_data_ptr = EntropyData.buffer;
-       }
+       old_entropy_sample_count = EntropyData.sample_count;
+       EntropyData.sample_count += 1;
 
-       EntropyData.index_ptr = new_entropy_data_ptr;
+       entropy_index = old_entropy_sample_count & ENTROPY_BUFFER_INDEX_MASK;
+       entropy_data_ptr = EntropyData.buffer + entropy_index;
 
        /* Mix the timestamp data and the old data together. */
-       old_entropy_data = *old_entropy_data_ptr;
-       *old_entropy_data_ptr = (uint32_t)timestamp ^ __ror(old_entropy_data, 9);
+       old_entropy_data = *entropy_data_ptr;
+       *entropy_data_ptr = (uint32_t)timestamp ^ __ror(old_entropy_data, 9);
 
        sleh_interrupt_handler_epilogue();
-#if DEVELOPMENT || DEBUG
+#if MACH_ASSERT
        if (preemption_level != get_preemption_level()) {
                panic("irq handler %p changed preemption level from %d to %d", cdp->interrupt_handler, preemption_level, get_preemption_level());
        }
@@ -1426,7 +1489,7 @@ void
 sleh_fiq(arm_saved_state_t *state)
 {
        unsigned int type   = DBG_INTR_TYPE_UNKNOWN;
-#if DEVELOPMENT || DEBUG
+#if MACH_ASSERT
        int preemption_level = get_preemption_level();
 #endif
 
@@ -1469,7 +1532,7 @@ sleh_fiq(arm_saved_state_t *state)
        }
 
        sleh_interrupt_handler_epilogue();
-#if DEVELOPMENT || DEBUG
+#if MACH_ASSERT
        if (preemption_level != get_preemption_level()) {
                panic("fiq type %u changed preemption level from %d to %d", type, preemption_level, get_preemption_level());
        }
@@ -1479,14 +1542,14 @@ sleh_fiq(arm_saved_state_t *state)
 void
 sleh_serror(arm_context_t *context, uint32_t esr, vm_offset_t far)
 {
-       arm_saved_state_t               *state = &context->ss;
-#if DEVELOPMENT || DEBUG
+       arm_saved_state_t *state = &context->ss;
+#if MACH_ASSERT
        int preemption_level = get_preemption_level();
 #endif
 
        ASSERT_CONTEXT_SANITY(context);
        arm64_platform_error(state, esr, far);
-#if DEVELOPMENT || DEBUG
+#if MACH_ASSERT
        if (preemption_level != get_preemption_level()) {
                panic("serror changed preemption level from %d to %d", preemption_level, get_preemption_level());
        }
@@ -1494,13 +1557,12 @@ sleh_serror(arm_context_t *context, uint32_t esr, vm_offset_t far)
 }
 
 void
-mach_syscall_trace_exit(
-       unsigned int retval,
-       unsigned int call_number)
+mach_syscall_trace_exit(unsigned int retval,
+    unsigned int call_number)
 {
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-           MACHDBG_CODE(DBG_MACH_EXCP_SC, (call_number)) | DBG_FUNC_END,
-           retval, 0, 0, 0, 0);
+           MACHDBG_CODE(DBG_MACH_EXCP_SC, (call_number)) |
+           DBG_FUNC_END, retval, 0, 0, 0, 0);
 }
 
 __attribute__((noreturn))
@@ -1516,11 +1578,11 @@ thread_syscall_return(kern_return_t error)
        assert(is_saved_state64(state));
        saved_state64(state)->x[0] = error;
 
-#if DEBUG || DEVELOPMENT
+#if MACH_ASSERT
        kern_allocation_name_t
        prior __assert_only = thread_get_kernel_state(thread)->allocation_name;
        assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior));
-#endif /* DEBUG || DEVELOPMENT */
+#endif /* MACH_ASSERT */
 
        if (kdebug_enable) {
                /* Invert syscall number (negative for a mach syscall) */
@@ -1540,7 +1602,7 @@ syscall_trace(
 static void
 sleh_interrupt_handler_prologue(arm_saved_state_t *state, unsigned int type)
 {
-       uint64_t     is_user = PSR64_IS_USER(get_saved_state_cpsr(state));
+       uint64_t is_user = PSR64_IS_USER(get_saved_state_cpsr(state));
 
        uint64_t pc = is_user ? get_saved_state_pc(state) :
            VM_KERNEL_UNSLIDE(get_saved_state_pc(state));
diff --git a/osfmk/arm64/start.s b/osfmk/arm64/start.s
index f709217cf72a4db21e83200bf9056bb628c16c7b..4e964ca8bce3b519c6b6c7b36bab23721d55e8cc 100644
 #include <mach_assert.h>
 #include <machine/asm.h>
 #include "assym.s"
+#include <arm64/exception_asm.h>
 
 #if __ARM_KERNEL_PROTECT__
 #include <arm/pmap.h>
 #endif /* __ARM_KERNEL_PROTECT__ */
 
 
+
 .macro MSR_VBAR_EL1_X0
 #if defined(KERNEL_INTEGRITY_KTRR)
        mov     x1, lr
@@ -52,7 +54,7 @@
 #if defined(KERNEL_INTEGRITY_KTRR)
        mov             x0, x1
        mov             x1, lr
-       bl              _pinst_set_tcr
+       bl              EXT(pinst_set_tcr)
        mov             lr, x1
 #else
        msr             TCR_EL1, x1
@@ -62,7 +64,7 @@
 .macro MSR_TTBR1_EL1_X0
 #if defined(KERNEL_INTEGRITY_KTRR)
        mov             x1, lr
-       bl              _pinst_set_ttbr1
+       bl              EXT(pinst_set_ttbr1)
        mov             lr, x1
 #else
        msr             TTBR1_EL1, x0
@@ -74,9 +76,9 @@
        mov             x1, lr
 
        // This may abort, do so on SP1
-       bl              _pinst_spsel_1
+       bl              EXT(pinst_spsel_1)
 
-       bl              _pinst_set_sctlr
+       bl              EXT(pinst_set_sctlr)
        msr             SPSel, #0                                                                       // Back to SP0
        mov             lr, x1
 #else
        .align 12
        .globl EXT(LowResetVectorBase)
 LEXT(LowResetVectorBase)
-       // Preserve x0 for start_first_cpu, if called
+       /*
+        * On reset, both RVBAR_EL1 and VBAR_EL1 point here.  SPSel.SP is 1,
+        * so on reset the CPU will jump to offset 0x0 and on exceptions
+        * the CPU will jump to offset 0x200, 0x280, 0x300, or 0x380.
+        * In order for both the reset vector and exception vectors to
+        * coexist in the same space, the reset code is moved to the end
+        * of the exception vector area.
+        */
+       b               EXT(reset_vector)
 
+       /* EL1 SP1: These vectors trap errors during early startup on non-boot CPUs. */
+       .align  9
+       b               .
+       .align  7
+       b               .
+       .align  7
+       b               .
+       .align  7
+       b               .
+
+       .align  7
+       .globl EXT(reset_vector)
+LEXT(reset_vector)
+       // Preserve x0 for start_first_cpu, if called
        // Unlock the core for debugging
        msr             OSLAR_EL1, xzr
        msr             DAIFSet, #(DAIFSC_ALL)                          // Disable all interrupts
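
The offsets named in the comment follow directly from the architectural vector table: out of reset SPSel is 1, so any early exception uses the "current EL with SPx" entries, and each entry is 0x80 bytes long. That is why the code parks spin loops at 0x200 through 0x380 and tucks reset_vector in after them. For reference, the AArch64 VBAR_ELx layout:

    /* AArch64 VBAR_ELx vector table; each entry is 0x80 bytes long. */
    enum vbar_offset {
            VEC_SYNC_SP0  = 0x000, VEC_IRQ_SP0  = 0x080,
            VEC_FIQ_SP0   = 0x100, VEC_SERR_SP0 = 0x180,
            VEC_SYNC_SPX  = 0x200, VEC_IRQ_SPX  = 0x280, /* SPSel=1 lands here */
            VEC_FIQ_SPX   = 0x300, VEC_SERR_SPX = 0x380,
            VEC_SYNC_LO64 = 0x400, /* from lower EL, AArch64 */
            VEC_SYNC_LO32 = 0x600, /* from lower EL, AArch32 */
    };
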
@@ -122,6 +146,11 @@ LEXT(LowResetVectorBase)
         * If either values are zero, we're debugging kernel so skip programming KTRR.
         */
 
+       /* spin until bootstrap core has completed machine lockdown */
+       adrp    x17, EXT(lockdown_done)@page
+1:
+       ldr     x18, [x17, EXT(lockdown_done)@pageoff]
+       cbz     x18, 1b
 
        // load stashed rorgn_begin
        adrp    x17, EXT(rorgn_begin)@page
@@ -144,7 +173,7 @@ LEXT(LowResetVectorBase)
        mov             x17, #1
        msr             ARM64_REG_KTRR_LOCK_EL1, x17
 Lskip_ktrr:
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) */
 
        // Process reset handlers
        adrp    x19, EXT(ResetHandlerData)@page                 // Get address of the reset handler data
@@ -158,14 +187,14 @@ Lcheck_cpu_data_entry:
        cbz             x21, Lnext_cpu_data_entry
        ldr             w2, [x21, CPU_PHYS_ID]                          // Load cpu phys id
        cmp             x0, x2                                          // Compare cpu data phys cpu and MPIDR_EL1 phys cpu
-       b.eq            Lfound_cpu_data_entry                           // Branch if match
+       b.eq    Lfound_cpu_data_entry                           // Branch if match
 Lnext_cpu_data_entry:
        add             x1, x1, #16                                     // Increment to the next cpu data entry
        cmp             x1, x3
-       b.eq            Lskip_cpu_reset_handler                         // Not found
+       b.eq    Lskip_cpu_reset_handler                         // Not found
        b               Lcheck_cpu_data_entry   // loop
 Lfound_cpu_data_entry:
-       adrp            x20, EXT(const_boot_args)@page
+       adrp    x20, EXT(const_boot_args)@page
        add             x20, x20, EXT(const_boot_args)@pageoff
        ldr             x0, [x21, CPU_RESET_HANDLER]            // Call CPU reset handler
        cbz             x0, Lskip_cpu_reset_handler
@@ -178,7 +207,7 @@ Lfound_cpu_data_entry:
        adrp    x2, EXT(start_cpu)@page
        add             x2, x2, EXT(start_cpu)@pageoff
        cmp             x0, x2
-       bne     Lskip_cpu_reset_handler
+       bne             Lskip_cpu_reset_handler
 1:
 
 
@@ -196,12 +225,7 @@ Lfound_cpu_data_entry:
 Lskip_cpu_reset_handler:
        b               .                                                                       // Hang if the handler is NULL or returns
 
-       .align  3
-       .globl  EXT(ResetHandlerData)
-LEXT(ResetHandlerData)
-       .space  (rhdSize_NUM),0         // (filled with 0s)
-
-       .align  3
+       .align 3
        .global EXT(LowResetVectorEnd)
 LEXT(LowResetVectorEnd)
        .global EXT(SleepToken)
@@ -210,6 +234,13 @@ LEXT(SleepToken)
        .space  (stSize_NUM),0
 #endif
 
+       .section __DATA_CONST,__const
+       .align  3
+       .globl  EXT(ResetHandlerData)
+LEXT(ResetHandlerData)
+       .space  (rhdSize_NUM),0         // (filled with 0s)
+       .text
+
 
 /*
  * __start trampoline is located at a position relative to LowResetVectorBase
@@ -276,6 +307,7 @@ LEXT(LowExceptionVectorBase)
 .align ARM_PGSHIFT
 .globl EXT(bootstrap_instructions)
 LEXT(bootstrap_instructions)
+
 #endif /* defined(KERNEL_INTEGRITY_KTRR)*/
        .align 2
        .globl EXT(resume_idle_cpu)
@@ -311,6 +343,7 @@ start_cpu:
        ldr             x25, [x20, BA_TOP_OF_KERNEL_DATA]       // Get the top of the kernel data
        ldr             x26, [x20, BA_BOOT_FLAGS]                       // Get the kernel boot flags
 
+
        // Set TPIDRRO_EL0 with the CPU number
        ldr             x0, [x21, CPU_NUMBER_GS]
        msr             TPIDRRO_EL0, x0
@@ -322,7 +355,7 @@ start_cpu:
        // Set SP_EL1 to exception stack
 #if defined(KERNEL_INTEGRITY_KTRR)
        mov             x1, lr
-       bl              _pinst_spsel_1
+       bl              EXT(pinst_spsel_1)
        mov             lr, x1
 #else
        msr             SPSel, #1
@@ -452,6 +485,7 @@ LEXT(start_first_cpu)
        // Unlock the core for debugging
        msr             OSLAR_EL1, xzr
        msr             DAIFSet, #(DAIFSC_ALL)                          // Disable all interrupts
+
        mov             x20, x0
        mov             x21, #0
 
@@ -481,7 +515,7 @@ LEXT(start_first_cpu)
 
        // Set SP_EL1 to exception stack
 #if defined(KERNEL_INTEGRITY_KTRR)
-       bl              _pinst_spsel_1
+       bl              EXT(pinst_spsel_1)
 #else
        msr             SPSel, #1
 #endif
@@ -511,23 +545,13 @@ LEXT(start_first_cpu)
         *      Page 3 - KVA L1 table
         *      Page 4 - KVA L2 table
         */
-#if __ARM64_TWO_LEVEL_PMAP__
-       /*
-        * If we are using a two level scheme, we don't need the L1 entries, so:
-        *      Page 1 - V=P L2 table
-        *      Page 2 - KVA L2 table
-        */
-#endif
 
        // Invalidate all entries in the bootstrap page tables
        mov             x0, #(ARM_TTE_EMPTY)                            // Load invalid entry template
        mov             x1, x25                                                         // Start at top of kernel
        mov             x2, #(TTE_PGENTRIES)                            // Load number of entries per page
-#if __ARM64_TWO_LEVEL_PMAP__
-       lsl             x2, x2, #1                                                      // Shift by 1 for num entries on 2 pages
-#else
        lsl             x2, x2, #2                                                      // Shift by 2 for num entries on 4 pages
-#endif
+
 Linvalidate_bootstrap:                                                 // do {
        str             x0, [x1], #(1 << TTE_SHIFT)                     //   Invalidate and advance
        subs    x2, x2, #1                                                      //   entries--
@@ -603,6 +627,7 @@ Linvalidate_bootstrap:                                                      // do {
        /* Ensure TTEs are visible */
        dsb             ish
 
+
        b               common_start
 
 /*
@@ -659,6 +684,10 @@ common_start:
        orr             x0, x0, x1
        mov             x1, #(MAIR_POSTED << MAIR_ATTR_SHIFT(CACHE_ATTRINDX_POSTED))
        orr             x0, x0, x1
+       mov             x1, #(MAIR_POSTED_REORDERED << MAIR_ATTR_SHIFT(CACHE_ATTRINDX_POSTED_REORDERED))
+       orr             x0, x0, x1
+       mov             x1, #(MAIR_POSTED_COMBINED_REORDERED << MAIR_ATTR_SHIFT(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED))
+       orr             x0, x0, x1
        msr             MAIR_EL1, x0
 
 #if defined(APPLEHURRICANE)
@@ -686,6 +715,7 @@ common_start:
 #endif
 
 
+
 #ifndef __ARM_IC_NOALIAS_ICACHE__
        /* Invalidate the TLB and icache on systems that do not guarantee that the
         * caches are invalidated on reset.
@@ -723,14 +753,108 @@ common_start:
 1:
        MSR_VBAR_EL1_X0
 
+1:
+#ifdef HAS_APPLE_PAC
+#ifdef __APSTS_SUPPORTED__
+       mrs             x0, ARM64_REG_APSTS_EL1
+       and             x1, x0, #(APSTS_EL1_MKEYVld)
+       cbz             x1, 1b                                                                          // Poll APSTS_EL1.MKEYVld
+       mrs             x0, ARM64_REG_APCTL_EL1
+       orr             x0, x0, #(APCTL_EL1_AppleMode)
+       orr             x0, x0, #(APCTL_EL1_KernKeyEn)
+       and             x0, x0, #~(APCTL_EL1_EnAPKey0)
+       msr             ARM64_REG_APCTL_EL1, x0
+#else
+       mrs             x0, ARM64_REG_APCTL_EL1
+       and             x1, x0, #(APCTL_EL1_MKEYVld)
+       cbz             x1, 1b                                                                          // Poll APCTL_EL1.MKEYVld
+       orr             x0, x0, #(APCTL_EL1_AppleMode)
+       orr             x0, x0, #(APCTL_EL1_KernKeyEn)
+       msr             ARM64_REG_APCTL_EL1, x0
+#endif /* APSTS_SUPPORTED */
+
+       /* ISB necessary to ensure APCTL_EL1_AppleMode logic enabled before proceeding */
+       isb             sy
+       /* Load static kernel key diversification values */
+       ldr             x0, =KERNEL_ROP_ID
+       /* set ROP key. must write at least once to pickup mkey per boot diversification */
+       msr             APIBKeyLo_EL1, x0
+       add             x0, x0, #1
+       msr             APIBKeyHi_EL1, x0
+       add             x0, x0, #1
+       msr             APDBKeyLo_EL1, x0
+       add             x0, x0, #1
+       msr             APDBKeyHi_EL1, x0
+       add             x0, x0, #1
+       msr             ARM64_REG_KERNELKEYLO_EL1, x0
+       add             x0, x0, #1
+       msr             ARM64_REG_KERNELKEYHI_EL1, x0
+       /* set JOP key. must write at least once to pickup mkey per boot diversification */
+       add             x0, x0, #1
+       msr             APIAKeyLo_EL1, x0
+       add             x0, x0, #1
+       msr             APIAKeyHi_EL1, x0
+       add             x0, x0, #1
+       msr             APDAKeyLo_EL1, x0
+       add             x0, x0, #1
+       msr             APDAKeyHi_EL1, x0
+       /* set G key */
+       add             x0, x0, #1
+       msr             APGAKeyLo_EL1, x0
+       add             x0, x0, #1
+       msr             APGAKeyHi_EL1, x0
+
+       // Enable caches, MMU, ROP and JOP
+       mov             x0, #(SCTLR_EL1_DEFAULT & 0xFFFF)
+       mov             x1, #(SCTLR_EL1_DEFAULT & 0xFFFF0000)
+       orr             x0, x0, x1
+       orr             x0, x0, #(SCTLR_PACIB_ENABLED) /* IB is ROP */
+
+#if DEBUG || DEVELOPMENT
+       and             x2, x26, BA_BOOT_FLAGS_DISABLE_JOP
+#if __APCFG_SUPPORTED__
+       // for APCFG systems, JOP keys are always on for EL1 unless ELXENKEY is cleared.
+       // JOP keys for EL0 will be toggled on the first time we pmap_switch to a pmap that has JOP enabled
+       cbz             x2, Lenable_mmu
+       mrs             x3, APCFG_EL1
+       and             x3, x3, #~(APCFG_EL1_ELXENKEY)
+       msr             APCFG_EL1, x3
+#else /* __APCFG_SUPPORTED__ */
+       cbnz    x2, Lenable_mmu
+#endif /* __APCFG_SUPPORTED__ */
+#endif /* DEBUG || DEVELOPMENT */
+
+#if !__APCFG_SUPPORTED__
+       MOV64   x1, SCTLR_JOP_KEYS_ENABLED
+       orr     x0, x0, x1
+#endif /* !__APCFG_SUPPORTED__ */
+Lenable_mmu:
+#else  /* HAS_APPLE_PAC */
 
        // Enable caches and MMU
        mov             x0, #(SCTLR_EL1_DEFAULT & 0xFFFF)
        mov             x1, #(SCTLR_EL1_DEFAULT & 0xFFFF0000)
        orr             x0, x0, x1
+#endif /* HAS_APPLE_PAC */
        MSR_SCTLR_EL1_X0
        isb             sy
 
+       MOV32   x1, SCTLR_EL1_DEFAULT
+#if HAS_APPLE_PAC
+       orr             x1, x1, #(SCTLR_PACIB_ENABLED)
+#if !__APCFG_SUPPORTED__
+       MOV64   x2, SCTLR_JOP_KEYS_ENABLED
+#if (DEBUG || DEVELOPMENT)
+       // Ignore the JOP bits, since we can't predict at compile time whether BA_BOOT_FLAGS_DISABLE_JOP is set
+       bic             x0, x0, x2
+#else
+       orr             x1, x1, x2
+#endif /* (DEBUG || DEVELOPMENT) */
+#endif /* !__APCFG_SUPPORTED__ */
+#endif /* HAS_APPLE_PAC */
+       cmp             x0, x1
+       bne             .
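
The key programming above follows a simple pattern: each successive 64-bit key half is written with KERNEL_ROP_ID, KERNEL_ROP_ID+1, and so on, and the final compare-and-hang (bne .) refuses to proceed if SCTLR_EL1 did not accept the expected ROP/JOP enables. Static seeds are safe here because AppleMode mixes a per-boot master key into every programmed value. The seeding pattern as a sketch:

    #include <stdint.h>

    #define KEY_HALVES 12 /* APIB, APDB, KERNELKEY, APIA, APDA, APGA; lo/hi each */

    /* Writes seed, seed+1, ... into the key halves in programming order.
     * Hardware diversification (the AppleMode master key) makes the
     * effective keys boot-unique even though the seed is a constant. */
    static void
    seed_keys(uint64_t out[KEY_HALVES], uint64_t seed)
    {
            for (unsigned i = 0; i < KEY_HALVES; i++) {
                    out[i] = seed + i;
            }
    }
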
+
 #if (!CONFIG_KERNEL_INTEGRITY || (CONFIG_KERNEL_INTEGRITY && !defined(KERNEL_INTEGRITY_WT)))
        /* Watchtower
         *
@@ -756,27 +880,24 @@ common_start:
        ARM64_WRITE_EP_SPR x15, x12, ARM64_REG_EHID4, ARM64_REG_HID4
 #endif  // APPLE_ARM64_ARCH_FAMILY
 
-#if defined(APPLECYCLONE) || defined(APPLETYPHOON)
+#if defined(APPLETYPHOON)
        //
-       // Cyclone/Typhoon-Specific initialization
-       // For tunable summary, see <rdar://problem/13503621> Alcatraz/H6: Confirm Cyclone CPU tunables have been set
+       // Typhoon-Specific initialization
+       // For tunable summary, see <rdar://problem/13503621>
        //
 
        //
        // Disable LSP flush with context switch to work around bug in LSP
-       // that can cause Cyclone to wedge when CONTEXTIDR is written.
-       // <rdar://problem/12387704> Innsbruck11A175: panic(cpu 0 caller 0xffffff800024e30c): "wait queue deadlock - wq=0xffffff805a7a63c0, cpu=0\n"
+       // that can cause Typhoon to wedge when CONTEXTIDR is written.
+       // <rdar://problem/12387704>
        //
 
        mrs             x12, ARM64_REG_HID0
        orr             x12, x12, ARM64_REG_HID0_LoopBuffDisb
        msr             ARM64_REG_HID0, x12
-       
+
        mrs             x12, ARM64_REG_HID1
        orr             x12, x12, ARM64_REG_HID1_rccDisStallInactiveIexCtl
-#if defined(APPLECYCLONE)
-       orr             x12, x12, ARM64_REG_HID1_disLspFlushWithContextSwitch
-#endif
        msr             ARM64_REG_HID1, x12
 
        mrs             x12, ARM64_REG_HID3
@@ -796,7 +917,7 @@ common_start:
 #endif // ARM64_BOARD_CONFIG_T7001
        msr             ARM64_REG_HID8, x12
        isb             sy
-#endif // APPLECYCLONE || APPLETYPHOON
+#endif // APPLETYPHOON
 
 #if defined(APPLETWISTER)
 
@@ -955,6 +1076,11 @@ Lskip_skye_post_a1_workarounds:
 #endif /* defined(APPLEMONSOON) */
 
 
+
+
+
+
+
        // If x21 != 0, we're doing a warm reset, so we need to trampoline to the kernel pmap.
        cbnz    x21, Ltrampoline
 
@@ -969,7 +1095,7 @@ Lskip_skye_post_a1_workarounds:
        // x0: boot args
        // x1: KVA page table phys base
        mrs     x1, TTBR1_EL1
-       bl      _kasan_bootstrap
+       bl      EXT(kasan_bootstrap)
 
        mov     x0, x20
        mov     lr, x21
@@ -1024,6 +1150,7 @@ arm_init_tramp:
         *  +---Kernel Base---+
         */
 
+
        mov             x19, lr
        // Convert CPU data PA to VA and set as first argument
        mov             x0, x21
diff --git a/osfmk/arm64/status.c b/osfmk/arm64/status.c
index 5a69eabc437db5ad77fffb2a5896d49f8d4d3489..41d213e69033432226072226d9032b67eedab1ad 100644
 #include <arm/vmparam.h>
 #include <arm/cpu_data_internal.h>
 #include <arm64/proc_reg.h>
+#if __has_feature(ptrauth_calls)
+#include <ptrauth.h>
+#endif
 
 struct arm_vfpv2_state {
-       __uint32_t        __r[32];
-       __uint32_t        __fpscr;
+       __uint32_t __r[32];
+       __uint32_t __fpscr;
 };
 
-typedef struct arm_vfpv2_state  arm_vfpv2_state_t;
+typedef struct arm_vfpv2_state arm_vfpv2_state_t;
 
-#define ARM_VFPV2_STATE_COUNT ((mach_msg_type_number_t) \
-       (sizeof (arm_vfpv2_state_t)/sizeof(uint32_t)))
+#define ARM_VFPV2_STATE_COUNT \
+       ((mach_msg_type_number_t)(sizeof (arm_vfpv2_state_t)/sizeof(uint32_t)))
 
 /*
  * Forward definitions
@@ -55,31 +58,19 @@ void thread_set_parent(thread_t parent, int pid);
  * Maps state flavor to number of words in the state:
  */
 /* __private_extern__ */
-unsigned int    _MachineStateCount[] = {
-       /* FLAVOR_LIST */ 0,
-       ARM_UNIFIED_THREAD_STATE_COUNT,
-       ARM_VFP_STATE_COUNT,
-       ARM_EXCEPTION_STATE_COUNT,
-       ARM_DEBUG_STATE_COUNT,
-       /* THREAD_STATE_NONE (legacy) */ 0,
-       ARM_THREAD_STATE64_COUNT,
-       ARM_EXCEPTION_STATE64_COUNT,
-       /* THREAD_STATE_LAST (legacy) */ 0,
-       ARM_THREAD_STATE32_COUNT,
-       /* UNALLOCATED */ 0,
-       /* UNALLOCATED */ 0,
-       /* UNALLOCATED */ 0,
-       /* UNALLOCATED */ 0,
-       ARM_DEBUG_STATE32_COUNT,
-       ARM_DEBUG_STATE64_COUNT,
-       ARM_NEON_STATE_COUNT,
-       ARM_NEON_STATE64_COUNT,
-       /* UNALLOCATED */ 0,
-       /* UNALLOCATED */ 0,
-       /* ARM_SAVED_STATE32_COUNT */ 0,
-       /* ARM_SAVED_STATE64_COUNT */ 0,
-       /* ARM_NEON_SAVED_STATE32_COUNT */ 0,
-       /* ARM_NEON_SAVED_STATE64_COUNT */ 0,
+unsigned int _MachineStateCount[] = {
+       [ARM_UNIFIED_THREAD_STATE] = ARM_UNIFIED_THREAD_STATE_COUNT,
+       [ARM_VFP_STATE] = ARM_VFP_STATE_COUNT,
+       [ARM_EXCEPTION_STATE] = ARM_EXCEPTION_STATE_COUNT,
+       [ARM_DEBUG_STATE] = ARM_DEBUG_STATE_COUNT,
+       [ARM_THREAD_STATE64] = ARM_THREAD_STATE64_COUNT,
+       [ARM_EXCEPTION_STATE64] = ARM_EXCEPTION_STATE64_COUNT,
+       [ARM_THREAD_STATE32] = ARM_THREAD_STATE32_COUNT,
+       [ARM_DEBUG_STATE32] = ARM_DEBUG_STATE32_COUNT,
+       [ARM_DEBUG_STATE64] = ARM_DEBUG_STATE64_COUNT,
+       [ARM_NEON_STATE] = ARM_NEON_STATE_COUNT,
+       [ARM_NEON_STATE64] = ARM_NEON_STATE64_COUNT,
+       [ARM_PAGEIN_STATE] = ARM_PAGEIN_STATE_COUNT,
 };
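
Rewriting _MachineStateCount with designated initializers binds each count to its flavor constant, so adding ARM_PAGEIN_STATE or reordering entries cannot silently shift the table the way the old positional, comment-driven form could, and any flavor not listed reads as zero. The idiom in miniature:

    #include <stdio.h>

    enum flavor { FLAVOR_A = 1, FLAVOR_B = 6, FLAVOR_MAX };

    /* Unlisted indices (0 and 2..5) are implicitly zero. */
    static const unsigned state_count[] = {
            [FLAVOR_A] = 17,
            [FLAVOR_B] = 68,
    };

    int main(void)
    {
            for (unsigned f = 0; f < FLAVOR_MAX; f++) {
                    printf("flavor %u -> %u words\n", f, state_count[f]);
            }
            return 0;
    }
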
 
 extern zone_t ads_zone;
@@ -89,7 +80,8 @@ extern zone_t ads_zone;
  * Copy values from saved_state to ts64.
  */
 void
-saved_state_to_thread_state64(const arm_saved_state_t *saved_state, arm_thread_state64_t *ts64)
+saved_state_to_thread_state64(const arm_saved_state_t * saved_state,
+    arm_thread_state64_t *    ts64)
 {
        uint32_t i;
 
@@ -109,7 +101,8 @@ saved_state_to_thread_state64(const arm_saved_state_t *saved_state, arm_thread_s
  * Copy values from ts64 to saved_state
  */
 void
-thread_state64_to_saved_state(const arm_thread_state64_t *ts64, arm_saved_state_t *saved_state)
+thread_state64_to_saved_state(const arm_thread_state64_t * ts64,
+    arm_saved_state_t *          saved_state)
 {
        uint32_t i;
 
@@ -124,13 +117,13 @@ thread_state64_to_saved_state(const arm_thread_state64_t *ts64, arm_saved_state_
                set_saved_state_reg(saved_state, i, ts64->x[i]);
        }
 }
-#endif
 
-kern_return_t
-handle_get_arm32_thread_state(
-       thread_state_t tstate,
-       mach_msg_type_number_t * count,
-       const arm_saved_state_t *saved_state)
+#endif /* __arm64__ */
+
+static kern_return_t
+handle_get_arm32_thread_state(thread_state_t            tstate,
+    mach_msg_type_number_t *  count,
+    const arm_saved_state_t * saved_state)
 {
        if (*count < ARM_THREAD_STATE32_COUNT) {
                return KERN_INVALID_ARGUMENT;
@@ -144,11 +137,10 @@ handle_get_arm32_thread_state(
        return KERN_SUCCESS;
 }
 
-kern_return_t
-handle_get_arm64_thread_state(
-       thread_state_t tstate,
-       mach_msg_type_number_t * count,
-       const arm_saved_state_t *saved_state)
+static kern_return_t
+handle_get_arm64_thread_state(thread_state_t            tstate,
+    mach_msg_type_number_t *  count,
+    const arm_saved_state_t * saved_state)
 {
        if (*count < ARM_THREAD_STATE64_COUNT) {
                return KERN_INVALID_ARGUMENT;
@@ -163,11 +155,10 @@ handle_get_arm64_thread_state(
 }
 
 
-kern_return_t
-handle_get_arm_thread_state(
-       thread_state_t tstate,
-       mach_msg_type_number_t * count,
-       const arm_saved_state_t *saved_state)
+static kern_return_t
+handle_get_arm_thread_state(thread_state_t            tstate,
+    mach_msg_type_number_t *  count,
+    const arm_saved_state_t * saved_state)
 {
        /* In an arm64 world, this flavor can be used to retrieve the thread
         * state of a 32-bit or 64-bit thread into a unified structure, but we
@@ -196,11 +187,11 @@ handle_get_arm_thread_state(
        return KERN_SUCCESS;
 }
 
-kern_return_t
-handle_set_arm32_thread_state(
-       const thread_state_t tstate,
-       mach_msg_type_number_t count,
-       arm_saved_state_t *saved_state)
+
+static kern_return_t
+handle_set_arm32_thread_state(const thread_state_t   tstate,
+    mach_msg_type_number_t count,
+    arm_saved_state_t *    saved_state)
 {
        if (count != ARM_THREAD_STATE32_COUNT) {
                return KERN_INVALID_ARGUMENT;
@@ -210,11 +201,10 @@ handle_set_arm32_thread_state(
        return KERN_SUCCESS;
 }
 
-kern_return_t
-handle_set_arm64_thread_state(
-       const thread_state_t tstate,
-       mach_msg_type_number_t count,
-       arm_saved_state_t *saved_state)
+static kern_return_t
+handle_set_arm64_thread_state(const thread_state_t   tstate,
+    mach_msg_type_number_t count,
+    arm_saved_state_t *    saved_state)
 {
        if (count != ARM_THREAD_STATE64_COUNT) {
                return KERN_INVALID_ARGUMENT;
@@ -225,11 +215,10 @@ handle_set_arm64_thread_state(
 }
 
 
-kern_return_t
-handle_set_arm_thread_state(
-       const thread_state_t tstate,
-       mach_msg_type_number_t count,
-       arm_saved_state_t *saved_state)
+static kern_return_t
+handle_set_arm_thread_state(const thread_state_t   tstate,
+    mach_msg_type_number_t count,
+    arm_saved_state_t *    saved_state)
 {
        /* In an arm64 world, this flavor can be used to set the thread state of a
         * 32-bit or 64-bit thread from a unified structure, but we need to support
@@ -262,6 +251,7 @@ handle_set_arm_thread_state(
        return KERN_SUCCESS;
 }
 
+
 /*
  * Translate thread state arguments to userspace representation
  */
@@ -273,9 +263,80 @@ machine_thread_state_convert_to_user(
        thread_state_t tstate,
        mach_msg_type_number_t *count)
 {
+#if __has_feature(ptrauth_calls)
+       arm_thread_state64_t *ts64;
+
+       switch (flavor) {
+       case ARM_THREAD_STATE:
+       {
+               arm_unified_thread_state_t *unified_state = (arm_unified_thread_state_t *)tstate;
+
+               if (*count < ARM_UNIFIED_THREAD_STATE_COUNT || !is_thread_state64(unified_state)) {
+                       return KERN_SUCCESS;
+               }
+               ts64 = thread_state64(unified_state);
+               break;
+       }
+       case ARM_THREAD_STATE64:
+       {
+               if (*count < ARM_THREAD_STATE64_COUNT) {
+                       return KERN_SUCCESS;
+               }
+               ts64 = (arm_thread_state64_t *)tstate;
+               break;
+       }
+       default:
+               return KERN_SUCCESS;
+       }
+
+       // Note that kernel threads never have disable_user_jop set
+       if (current_thread()->machine.disable_user_jop || !thread_is_64bit_addr(current_thread()) ||
+           thread->machine.disable_user_jop || !thread_is_64bit_addr(thread) ||
+           (BootArgs->bootFlags & kBootFlagsDisableUserThreadStateJOP)) {
+               ts64->flags = __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH;
+               return KERN_SUCCESS;
+       }
+
+       ts64->flags = 0;
+       if (ts64->lr) {
+               // lr might contain an IB-signed return address (strip is a no-op on unsigned addresses)
+               uintptr_t stripped_lr = (uintptr_t)ptrauth_strip((void *)ts64->lr,
+                   ptrauth_key_return_address);
+               if (ts64->lr != stripped_lr) {
+                       // Need to allow already-signed lr value to round-trip as is
+                       ts64->flags |= __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR;
+               }
+               // Note that an IB-signed return address that happens to have a 0 signature value
+               // will round-trip correctly even if IA-signed again below (and IA-authd later)
+       }
+
+       if (BootArgs->bootFlags & kBootFlagsDisableUserJOP) {
+               return KERN_SUCCESS;
+       }
+
+       if (ts64->pc) {
+               ts64->pc = (uintptr_t)pmap_sign_user_ptr((void*)ts64->pc,
+                   ptrauth_key_process_independent_code, ptrauth_string_discriminator("pc"));
+       }
+       if (ts64->lr && !(ts64->flags & __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR)) {
+               ts64->lr = (uintptr_t)pmap_sign_user_ptr((void*)ts64->lr,
+                   ptrauth_key_process_independent_code, ptrauth_string_discriminator("lr"));
+       }
+       if (ts64->sp) {
+               ts64->sp = (uintptr_t)pmap_sign_user_ptr((void*)ts64->sp,
+                   ptrauth_key_process_independent_data, ptrauth_string_discriminator("sp"));
+       }
+       if (ts64->fp) {
+               ts64->fp = (uintptr_t)pmap_sign_user_ptr((void*)ts64->fp,
+                   ptrauth_key_process_independent_data, ptrauth_string_discriminator("fp"));
+       }
+
+       return KERN_SUCCESS;
+#else
        // No conversion to userspace representation on this platform
        (void)thread; (void)flavor; (void)tstate; (void)count;
        return KERN_SUCCESS;
+#endif /* __has_feature(ptrauth_calls) */
 }
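
On pointer-authentication hardware the conversion signs pc and lr with the process-independent code key and sp/fp with the data key, each under a distinct string discriminator, and uses ts64->flags to let an already-IB-signed lr round-trip unchanged. The sign/auth round trip in miniature, as a sketch with clang's <ptrauth.h> (the real pmap_sign_user_ptr/pmap_auth_user_ptr additionally switch between user and kernel key sets, which this omits):

    #include <ptrauth.h>

    /* ptrauth_string_discriminator("pc") is a compile-time 16-bit hash, so
     * signer and authenticator derive the same discriminator independently. */
    static void *
    export_pc(void *pc)
    {
            return ptrauth_sign_unauthenticated(pc,
                ptrauth_key_process_independent_code,
                ptrauth_string_discriminator("pc"));
    }

    static void *
    import_pc(void *signed_pc)
    {
            /* Yields an invalid (poisoned) pointer if the signature is bad. */
            return ptrauth_auth_data(signed_pc,
                ptrauth_key_process_independent_code,
                ptrauth_string_discriminator("pc"));
    }
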
 
 /*
@@ -289,9 +350,94 @@ machine_thread_state_convert_from_user(
        thread_state_t tstate,
        mach_msg_type_number_t count)
 {
+#if __has_feature(ptrauth_calls)
+       arm_thread_state64_t *ts64;
+
+       switch (flavor) {
+       case ARM_THREAD_STATE:
+       {
+               arm_unified_thread_state_t *unified_state = (arm_unified_thread_state_t *)tstate;
+
+               if (count < ARM_UNIFIED_THREAD_STATE_COUNT || !is_thread_state64(unified_state)) {
+                       return KERN_SUCCESS;
+               }
+               ts64 = thread_state64(unified_state);
+               break;
+       }
+       case ARM_THREAD_STATE64:
+       {
+               if (count != ARM_THREAD_STATE64_COUNT) {
+                       return KERN_SUCCESS;
+               }
+               ts64 = (arm_thread_state64_t *)tstate;
+               break;
+       }
+       default:
+               return KERN_SUCCESS;
+       }
+
+       // Note that kernel threads never have disable_user_jop set
+       if (current_thread()->machine.disable_user_jop || !thread_is_64bit_addr(current_thread())) {
+               if (thread->machine.disable_user_jop || !thread_is_64bit_addr(thread)) {
+                       ts64->flags = __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH;
+                       return KERN_SUCCESS;
+               }
+               // A JOP-disabled process must not set thread state on a JOP-enabled process
+               return KERN_PROTECTION_FAILURE;
+       }
+
+       if (ts64->flags & __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) {
+               if (thread->machine.disable_user_jop || !thread_is_64bit_addr(thread) ||
+                   (BootArgs->bootFlags & kBootFlagsDisableUserThreadStateJOP)) {
+                       return KERN_SUCCESS;
+               }
+               // Disallow setting unsigned thread state on JOP-enabled processes.
+               // Ignore the flag and treat the thread state arguments as signed; ptrauth
+               // poisoning will cause the resulting thread state to be invalid
+               ts64->flags &= ~__DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH;
+       }
+
+       if (ts64->flags & __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR) {
+               // lr might contain an IB-signed return address (strip is a no-op on unsigned addresses)
+               uintptr_t stripped_lr = (uintptr_t)ptrauth_strip((void *)ts64->lr,
+                   ptrauth_key_return_address);
+               if (ts64->lr == stripped_lr) {
+                       // Don't allow an unsigned pointer to be passed through as is. Ignore the
+                       // flag and treat it as IA-signed below (where auth failure may poison the value).
+                       ts64->flags &= ~__DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR;
+               }
+               // Note that an IB-signed return address that happens to have a 0 signature value
+               // will also have been IA-signed (without this flag being set) and so will IA-auth
+               // correctly below.
+       }
+
+       if (BootArgs->bootFlags & kBootFlagsDisableUserJOP) {
+               return KERN_SUCCESS;
+       }
+
+       if (ts64->pc) {
+               ts64->pc = (uintptr_t)pmap_auth_user_ptr((void*)ts64->pc,
+                   ptrauth_key_process_independent_code, ptrauth_string_discriminator("pc"));
+       }
+       if (ts64->lr && !(ts64->flags & __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR)) {
+               ts64->lr = (uintptr_t)pmap_auth_user_ptr((void*)ts64->lr,
+                   ptrauth_key_process_independent_code, ptrauth_string_discriminator("lr"));
+       }
+       if (ts64->sp) {
+               ts64->sp = (uintptr_t)pmap_auth_user_ptr((void*)ts64->sp,
+                   ptrauth_key_process_independent_data, ptrauth_string_discriminator("sp"));
+       }
+       if (ts64->fp) {
+               ts64->fp = (uintptr_t)pmap_auth_user_ptr((void*)ts64->fp,
+                   ptrauth_key_process_independent_data, ptrauth_string_discriminator("fp"));
+       }
+
+       return KERN_SUCCESS;
+#else
        // No conversion from userspace representation on this platform
        (void)thread; (void)flavor; (void)tstate; (void)count;
        return KERN_SUCCESS;
+#endif /* __has_feature(ptrauth_calls) */
 }
 
 /*
@@ -303,9 +449,27 @@ machine_thread_siguctx_pointer_convert_to_user(
        __assert_only thread_t thread,
        user_addr_t *uctxp)
 {
+#if __has_feature(ptrauth_calls)
+       if (current_thread()->machine.disable_user_jop || !thread_is_64bit_addr(current_thread())) {
+               assert(thread->machine.disable_user_jop || !thread_is_64bit_addr(thread));
+               return KERN_SUCCESS;
+       }
+
+       if (BootArgs->bootFlags & kBootFlagsDisableUserJOP) {
+               return KERN_SUCCESS;
+       }
+
+       if (*uctxp) {
+               *uctxp = (uintptr_t)pmap_sign_user_ptr((void*)*uctxp,
+                   ptrauth_key_process_independent_data, ptrauth_string_discriminator("uctx"));
+       }
+
+       return KERN_SUCCESS;
+#else
        // No conversion to userspace representation on this platform
        (void)thread; (void)uctxp;
        return KERN_SUCCESS;
+#endif /* __has_feature(ptrauth_calls) */
 }
 
 /*
@@ -318,21 +482,41 @@ machine_thread_function_pointers_convert_from_user(
        user_addr_t *fptrs,
        uint32_t count)
 {
+#if __has_feature(ptrauth_calls)
+       if (current_thread()->machine.disable_user_jop || !thread_is_64bit_addr(current_thread())) {
+               assert(thread->machine.disable_user_jop || !thread_is_64bit_addr(thread));
+               return KERN_SUCCESS;
+       }
+
+       if (BootArgs->bootFlags & kBootFlagsDisableUserJOP) {
+               return KERN_SUCCESS;
+       }
+
+       while (count--) {
+               if (*fptrs) {
+                       *fptrs = (uintptr_t)pmap_auth_user_ptr((void*)*fptrs,
+                           ptrauth_key_function_pointer, 0);
+               }
+               fptrs++;
+       }
+
+       return KERN_SUCCESS;
+#else
        // No conversion from userspace representation on this platform
        (void)thread; (void)fptrs; (void)count;
        return KERN_SUCCESS;
+#endif /* __has_feature(ptrauth_calls) */
 }
 
 /*
- * Routine:    machine_thread_get_state
+ * Routine: machine_thread_get_state
  *
  */
 kern_return_t
-machine_thread_get_state(
-       thread_t thread,
-       thread_flavor_t flavor,
-       thread_state_t tstate,
-       mach_msg_type_number_t * count)
+machine_thread_get_state(thread_t                 thread,
+    thread_flavor_t          flavor,
+    thread_state_t           tstate,
+    mach_msg_type_number_t * count)
 {
        switch (flavor) {
        case THREAD_STATE_FLAVOR_LIST:
@@ -359,6 +543,19 @@ machine_thread_get_state(
                *count = 4;
                break;
 
+       case THREAD_STATE_FLAVOR_LIST_10_15:
+               if (*count < 5) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+
+               tstate[0] = ARM_THREAD_STATE;
+               tstate[1] = ARM_VFP_STATE;
+               tstate[2] = thread_is_64bit_data(thread) ? ARM_EXCEPTION_STATE64 : ARM_EXCEPTION_STATE;
+               tstate[3] = thread_is_64bit_data(thread) ? ARM_DEBUG_STATE64 : ARM_DEBUG_STATE32;
+               tstate[4] = ARM_PAGEIN_STATE;
+               *count = 5;
+               break;
+
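The count handling for this flavor follows the usual Mach convention: the caller passes in its buffer capacity and gets back the number of entries written. A hedged userspace sketch of querying the new flavor list (thread_get_state() is the public Mach API; assumes THREAD_STATE_FLAVOR_LIST_10_15 is exported to user headers, th is an assumed thread port):

    #include <mach/mach.h>

    static kern_return_t
    list_flavors_10_15(thread_act_t th)
    {
            thread_state_flavor_t flavors[5];
            mach_msg_type_number_t count = 5;   /* capacity in, entries out */

            kern_return_t kr = thread_get_state(th, THREAD_STATE_FLAVOR_LIST_10_15,
                (thread_state_t)flavors, &count);
            /* on success, count == 5 and flavors[4] == ARM_PAGEIN_STATE per the code above */
            return kr;
    }
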
        case ARM_THREAD_STATE:
        {
                kern_return_t rn = handle_get_arm_thread_state(tstate, count, thread->machine.upcb);
@@ -512,7 +709,7 @@ machine_thread_get_state(
        case ARM_VFP_STATE:{
                struct arm_vfp_state *state;
                arm_neon_saved_state32_t *thread_state;
-               unsigned int    max;
+               unsigned int max;
 
                if (*count < ARM_VFP_STATE_COUNT) {
                        if (*count < ARM_VFPV2_STATE_COUNT) {
@@ -581,6 +778,22 @@ machine_thread_get_state(
                break;
        }
 
+
+       case ARM_PAGEIN_STATE: {
+               arm_pagein_state_t *state;
+
+               if (*count < ARM_PAGEIN_STATE_COUNT) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+
+               state = (arm_pagein_state_t *)tstate;
+               state->__pagein_error = thread->t_pagein_error;
+
+               *count = ARM_PAGEIN_STATE_COUNT;
+               break;
+       }
+
+
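Reading the new flavor from userspace would follow the same pattern as the other getters; a sketch, assuming the arm_pagein_state_t definition (with the __pagein_error field filled above) is exported to user headers:

    #include <mach/mach.h>
    #include <mach/thread_status.h>

    static int
    read_pagein_error(thread_act_t th)
    {
            arm_pagein_state_t ps;
            mach_msg_type_number_t count = ARM_PAGEIN_STATE_COUNT;

            if (thread_get_state(th, ARM_PAGEIN_STATE,
                (thread_state_t)&ps, &count) != KERN_SUCCESS) {
                    return 0;
            }
            return ps.__pagein_error;
    }
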
        default:
                return KERN_INVALID_ARGUMENT;
        }
@@ -589,15 +802,14 @@ machine_thread_get_state(
 
 
 /*
- * Routine:    machine_thread_get_kern_state
+ * Routine: machine_thread_get_kern_state
  *
  */
 kern_return_t
-machine_thread_get_kern_state(
-       thread_t thread,
-       thread_flavor_t flavor,
-       thread_state_t tstate,
-       mach_msg_type_number_t * count)
+machine_thread_get_kern_state(thread_t                 thread,
+    thread_flavor_t          flavor,
+    thread_state_t           tstate,
+    mach_msg_type_number_t * count)
 {
        /*
         * This works only for an interrupted kernel thread
@@ -670,15 +882,14 @@ machine_thread_switch_addrmode(thread_t thread)
 extern long long arm_debug_get(void);
 
 /*
- * Routine:    machine_thread_set_state
+ * Routine: machine_thread_set_state
  *
  */
 kern_return_t
-machine_thread_set_state(
-       thread_t thread,
-       thread_flavor_t flavor,
-       thread_state_t tstate,
-       mach_msg_type_number_t count)
+machine_thread_set_state(thread_t               thread,
+    thread_flavor_t        flavor,
+    thread_state_t         tstate,
+    mach_msg_type_number_t count)
 {
        kern_return_t rn;
 
@@ -762,7 +973,6 @@ machine_thread_set_state(
                        }
                }
 
-
                if (!enabled) {
                        arm_debug_state32_t *thread_state = find_debug_state32(thread);
                        if (thread_state != NULL) {
@@ -902,7 +1112,7 @@ machine_thread_set_state(
        {
                arm_debug_state64_t *state;
                boolean_t enabled = FALSE;
-               unsigned int    i;
+               unsigned int i;
 
                if (count != ARM_DEBUG_STATE64_COUNT) {
                        return KERN_INVALID_ARGUMENT;
@@ -957,7 +1167,7 @@ machine_thread_set_state(
                        for (i = 0; i < 16; i++) {
                                /* set appropriate privilege; mask out unknown bits */
                                thread_state->bcr[i] = (state->bcr[i] & (0         /* Was ARM_DBG_CR_ADDRESS_MASK_MASK deprecated in v8 */
-                                   | 0                                                  /* Was ARM_DBGBCR_MATCH_MASK, ignored in AArch64 state */
+                                   | 0                             /* Was ARM_DBGBCR_MATCH_MASK, ignored in AArch64 state */
                                    | ARM_DBG_CR_BYTE_ADDRESS_SELECT_MASK
                                    | ARM_DBG_CR_ENABLE_MASK))
                                    | ARM_DBGBCR_TYPE_IVA
@@ -1055,19 +1265,32 @@ machine_thread_set_state(
                break;
        }
 
+
        default:
                return KERN_INVALID_ARGUMENT;
        }
        return KERN_SUCCESS;
 }
 
+mach_vm_address_t
+machine_thread_pc(thread_t thread)
+{
+       struct arm_saved_state *ss = get_user_regs(thread);
+       return (mach_vm_address_t)get_saved_state_pc(ss);
+}
+
+void
+machine_thread_reset_pc(thread_t thread, mach_vm_address_t pc)
+{
+       set_saved_state_pc(get_user_regs(thread), (register_t)pc);
+}
+
 /*
- * Routine:    machine_thread_state_initialize
+ * Routine: machine_thread_state_initialize
  *
  */
 kern_return_t
-machine_thread_state_initialize(
-       thread_t thread)
+machine_thread_state_initialize(thread_t thread)
 {
        arm_context_t *context = thread->machine.contextData;
 
@@ -1090,19 +1313,24 @@ machine_thread_state_initialize(
 
        thread->machine.DebugData = NULL;
 
+#if defined(HAS_APPLE_PAC)
+       /* Sign the initial user-space thread state */
+       if (thread->machine.upcb != NULL) {
+               ml_sign_thread_state(thread->machine.upcb, 0, 0, 0, 0, 0);
+       }
+#endif /* defined(HAS_APPLE_PAC) */
 
        return KERN_SUCCESS;
 }
 
 /*
- * Routine:    machine_thread_dup
+ * Routine: machine_thread_dup
  *
  */
 kern_return_t
-machine_thread_dup(
-       thread_t self,
-       thread_t target,
-       __unused boolean_t is_corpse)
+machine_thread_dup(thread_t self,
+    thread_t target,
+    __unused boolean_t is_corpse)
 {
        struct arm_saved_state *self_saved_state;
        struct arm_saved_state *target_saved_state;
@@ -1113,46 +1341,47 @@ machine_thread_dup(
        self_saved_state = self->machine.upcb;
        target_saved_state = target->machine.upcb;
        bcopy(self_saved_state, target_saved_state, sizeof(struct arm_saved_state));
+#if defined(HAS_APPLE_PAC)
+       if (!is_corpse && is_saved_state64(self_saved_state)) {
+               check_and_sign_copied_thread_state(target_saved_state, self_saved_state);
+       }
+#endif /* defined(HAS_APPLE_PAC) */
 
        return KERN_SUCCESS;
 }
 
 /*
- * Routine:    get_user_regs
+ * Routine: get_user_regs
  *
  */
 struct arm_saved_state *
-get_user_regs(
-       thread_t thread)
+get_user_regs(thread_t thread)
 {
        return thread->machine.upcb;
 }
 
 arm_neon_saved_state_t *
-get_user_neon_regs(
-       thread_t thread)
+get_user_neon_regs(thread_t thread)
 {
        return thread->machine.uNeon;
 }
 
 /*
- * Routine:    find_user_regs
+ * Routine: find_user_regs
  *
  */
 struct arm_saved_state *
-find_user_regs(
-       thread_t thread)
+find_user_regs(thread_t thread)
 {
        return thread->machine.upcb;
 }
 
 /*
- * Routine:    find_kern_regs
+ * Routine: find_kern_regs
  *
  */
 struct arm_saved_state *
-find_kern_regs(
-       thread_t thread)
+find_kern_regs(thread_t thread)
 {
        /*
         * This works only for an interrupted kernel thread
@@ -1165,8 +1394,7 @@ find_kern_regs(
 }
 
 arm_debug_state32_t *
-find_debug_state32(
-       thread_t thread)
+find_debug_state32(thread_t thread)
 {
        if (thread && thread->machine.DebugData) {
                return &(thread->machine.DebugData->uds.ds32);
@@ -1176,8 +1404,7 @@ find_debug_state32(
 }
 
 arm_debug_state64_t *
-find_debug_state64(
-       thread_t thread)
+find_debug_state64(thread_t thread)
 {
        if (thread && thread->machine.DebugData) {
                return &(thread->machine.DebugData->uds.ds64);
@@ -1187,19 +1414,18 @@ find_debug_state64(
 }
 
 /*
- * Routine:    thread_userstack
+ * Routine: thread_userstack
  *
  */
 kern_return_t
-thread_userstack(
-       __unused thread_t thread,
-       int flavor,
-       thread_state_t tstate,
-       unsigned int count,
-       mach_vm_offset_t * user_stack,
-       int *customstack,
-       boolean_t is_64bit_data
-       )
+thread_userstack(__unused thread_t  thread,
+    int                flavor,
+    thread_state_t     tstate,
+    unsigned int       count,
+    mach_vm_offset_t * user_stack,
+    int *              customstack,
+    boolean_t          is_64bit_data
+    )
 {
        register_t sp;
 
@@ -1267,9 +1493,8 @@ thread_userstack(
  * thread, if otherwise unknown.
  */
 kern_return_t
-thread_userstackdefault(
-       mach_vm_offset_t *default_user_stack,
-       boolean_t is64bit)
+thread_userstackdefault(mach_vm_offset_t * default_user_stack,
+    boolean_t          is64bit)
 {
        if (is64bit) {
                *default_user_stack = USRSTACK64;
@@ -1281,11 +1506,12 @@ thread_userstackdefault(
 }
 
 /*
- * Routine:    thread_setuserstack
+ * Routine: thread_setuserstack
  *
  */
 void
-thread_setuserstack(thread_t thread, mach_vm_address_t user_stack)
+thread_setuserstack(thread_t          thread,
+    mach_vm_address_t user_stack)
 {
        struct arm_saved_state *sv;
 
@@ -1297,11 +1523,12 @@ thread_setuserstack(thread_t thread, mach_vm_address_t user_stack)
 }
 
 /*
- * Routine:    thread_adjuserstack
+ * Routine: thread_adjuserstack
  *
  */
 uint64_t
-thread_adjuserstack(thread_t thread, int adjust)
+thread_adjuserstack(thread_t thread,
+    int      adjust)
 {
        struct arm_saved_state *sv;
        uint64_t sp;
@@ -1316,11 +1543,12 @@ thread_adjuserstack(thread_t thread, int adjust)
 }
 
 /*
- * Routine:    thread_setentrypoint
+ * Routine: thread_setentrypoint
  *
  */
 void
-thread_setentrypoint(thread_t thread, mach_vm_offset_t entry)
+thread_setentrypoint(thread_t         thread,
+    mach_vm_offset_t entry)
 {
        struct arm_saved_state *sv;
 
@@ -1332,17 +1560,16 @@ thread_setentrypoint(thread_t thread, mach_vm_offset_t entry)
 }
 
 /*
- * Routine:    thread_entrypoint
+ * Routine: thread_entrypoint
  *
  */
 kern_return_t
-thread_entrypoint(
-       __unused thread_t thread,
-       int flavor,
-       thread_state_t tstate,
-       unsigned int count __unused,
-       mach_vm_offset_t * entry_point
-       )
+thread_entrypoint(__unused thread_t  thread,
+    int                flavor,
+    thread_state_t     tstate,
+    unsigned int       count __unused,
+    mach_vm_offset_t * entry_point
+    )
 {
        switch (flavor) {
        case ARM_THREAD_STATE:
@@ -1388,13 +1615,12 @@ thread_entrypoint(
 
 
 /*
- * Routine:    thread_set_child
+ * Routine: thread_set_child
  *
  */
 void
-thread_set_child(
-       thread_t child,
-       int pid)
+thread_set_child(thread_t child,
+    int      pid)
 {
        struct arm_saved_state *child_state;
 
@@ -1406,13 +1632,12 @@ thread_set_child(
 
 
 /*
- * Routine:    thread_set_parent
+ * Routine: thread_set_parent
  *
  */
 void
-thread_set_parent(
-       thread_t parent,
-       int pid)
+thread_set_parent(thread_t parent,
+    int      pid)
 {
        struct arm_saved_state *parent_state;
 
@@ -1431,10 +1656,10 @@ struct arm_act_context {
 };
 
 /*
- * Routine:    act_thread_csave
+ * Routine: act_thread_csave
  *
  */
-void           *
+void *
 act_thread_csave(void)
 {
        struct arm_act_context *ic;
@@ -1459,13 +1684,13 @@ act_thread_csave(void)
                val = ARM_NEON_STATE64_COUNT;
                kret = machine_thread_get_state(thread,
                    ARM_NEON_STATE64,
-                   (thread_state_t) &ic->ns,
+                   (thread_state_t)&ic->ns,
                    &val);
        } else {
                val = ARM_NEON_STATE_COUNT;
                kret = machine_thread_get_state(thread,
                    ARM_NEON_STATE,
-                   (thread_state_t) &ic->ns,
+                   (thread_state_t)&ic->ns,
                    &val);
        }
        if (kret != KERN_SUCCESS) {
@@ -1477,11 +1702,11 @@ act_thread_csave(void)
 }
 
 /*
- * Routine:    act_thread_catt
+ * Routine: act_thread_catt
  *
  */
 void
-act_thread_catt(void *ctx)
+act_thread_catt(void * ctx)
 {
        struct arm_act_context *ic;
        kern_return_t   kret;
@@ -1501,12 +1726,12 @@ act_thread_catt(void *ctx)
        if (thread_is_64bit_data(thread)) {
                kret = machine_thread_set_state(thread,
                    ARM_NEON_STATE64,
-                   (thread_state_t) &ic->ns,
+                   (thread_state_t)&ic->ns,
                    ARM_NEON_STATE64_COUNT);
        } else {
                kret = machine_thread_set_state(thread,
                    ARM_NEON_STATE,
-                   (thread_state_t) &ic->ns,
+                   (thread_state_t)&ic->ns,
                    ARM_NEON_STATE_COUNT);
        }
        if (kret != KERN_SUCCESS) {
@@ -1518,7 +1743,7 @@ out:
 }
 
 /*
- * Routine:    act_thread_catt
+ * Routine: act_thread_cfree
  *
  */
 void
@@ -1528,7 +1753,8 @@ act_thread_cfree(void *ctx)
 }
 
 kern_return_t
-thread_set_wq_state32(thread_t thread, thread_state_t tstate)
+thread_set_wq_state32(thread_t       thread,
+    thread_state_t tstate)
 {
        arm_thread_state_t *state;
        struct arm_saved_state *saved_state;
@@ -1565,7 +1791,8 @@ thread_set_wq_state32(thread_t thread, thread_state_t tstate)
 }
 
 kern_return_t
-thread_set_wq_state64(thread_t thread, thread_state_t tstate)
+thread_set_wq_state64(thread_t       thread,
+    thread_state_t tstate)
 {
        arm_thread_state64_t *state;
        struct arm_saved_state *saved_state;
diff --git a/osfmk/arm64/tlb.h b/osfmk/arm64/tlb.h
new file mode 100644 (file)
index 0000000..eb1face
--- /dev/null
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#pragma once
+
+#include <arm64/proc_reg.h>
+#include <machine/atomic.h>
+
+#define tlbi_addr(x) ((((x) >> 12) & TLBI_ADDR_MASK) << TLBI_ADDR_SHIFT)
+#define tlbi_asid(x) (((uintptr_t)(x) & TLBI_ASID_MASK) << TLBI_ASID_SHIFT)
+
+#if __ARM_KERNEL_PROTECT__
+/*
+ * __ARM_KERNEL_PROTECT__ adds two complications to TLB management:
+ *
+ * 1. As each pmap has two ASIDs, every TLB operation that targets an ASID must
+ *   target both ASIDs for the pmap that owns the target ASID.
+ *
+ * 2. Any TLB operation targeting the kernel_pmap ASID (ASID 0) must target all
+ *   ASIDs (as kernel_pmap mappings may be referenced while using an ASID that
+ *   belongs to another pmap).  We expect these routines to be called with the
+ *   EL0 ASID for the target, not the EL1 ASID.
+ */
+#endif /* __ARM_KERNEL_PROTECT__ */
+
+static inline void
+sync_tlb_flush(void)
+{
+       __builtin_arm_dsb(DSB_ISH);
+       __builtin_arm_isb(ISB_SY);
+}
+
+// flush_mmu_tlb: full TLB flush on all cores
+static inline void
+flush_mmu_tlb_async(void)
+{
+       asm volatile ("tlbi vmalle1is");
+}
+
+static inline void
+flush_mmu_tlb(void)
+{
+       flush_mmu_tlb_async();
+       sync_tlb_flush();
+}
+
+// flush_core_tlb: full TLB flush on local core only
+static inline void
+flush_core_tlb_async(void)
+{
+       asm volatile ("tlbi vmalle1");
+}
+
+static inline void
+flush_core_tlb(void)
+{
+       flush_core_tlb_async();
+       sync_tlb_flush();
+}
+
+// flush_mmu_tlb_allentries_async: flush entries that map VA range, all ASIDS, all cores
+// start and end are in units of 4K pages.
+static inline void
+flush_mmu_tlb_allentries_async(uint64_t start, uint64_t end)
+{
+#if __ARM_16K_PG__
+       start = start & ~0x3ULL;
+
+       /*
+        * The code below is not necessarily correct.  From an overview of
+        * the client code, the expected contract for TLB flushes is that
+        * we will expand from an "address, length" pair to "start address,
+        * end address" in the course of a TLB flush.  This suggests that
+        * a flush for "X, X+4" is actually only asking for a flush of a
+        * single 16KB page.  At the same time, we'd like to be prepared
+        * for bad inputs (X, X+3), so add 3 and then truncate the 4KB page
+        * number to a 16KB page boundary.  This should deal correctly with
+        * unaligned inputs.
+        *
+        * If our expectations about client behavior are wrong, however, this
+        * will lead to occasional TLB corruption on platforms with 16KB
+        * pages.
+        */
+       end = (end + 0x3ULL) & ~0x3ULL;
+#endif // __ARM_16K_PG__
+       for (; start < end; start += (ARM_PGBYTES / 4096)) {
+               asm volatile ("tlbi vaae1is, %0" : : "r"(start));
+       }
+}
+
+static inline void
+flush_mmu_tlb_allentries(uint64_t start, uint64_t end)
+{
+       flush_mmu_tlb_allentries_async(start, end);
+       sync_tlb_flush();
+}
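+
Concretely, with 16KB pages ARM_PGBYTES / 4096 == 4, so the loop steps through 4K page numbers four at a time, and the rounding above makes unaligned requests cover every 16KB page they touch. A worked example with illustrative values:

    /*
     * start = 0x1002, end = 0x1006       (4K page numbers, both unaligned)
     *   start & ~0x3ULL         -> 0x1000
     *   (end + 0x3ULL) & ~0x3   -> 0x1008
     * loop issues "tlbi vaae1is" for 0x1000 and 0x1004,
     * i.e. both 16KB pages overlapped by the original range.
     */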
+
+// flush_mmu_tlb_entry: flush TLB entries that map a VA and ASID, all cores
+// Will also flush global entries that match the VA
+static inline void
+flush_mmu_tlb_entry_async(uint64_t val)
+{
+#if __ARM_KERNEL_PROTECT__
+       uint64_t asid = val >> TLBI_ASID_SHIFT;
+       if (asid == 0) {
+               asm volatile ("tlbi vaae1is, %0" : : "r"(val));
+               return;
+       }
+       val = val & ~(1ULL << TLBI_ASID_SHIFT);
+       asm volatile ("tlbi vae1is, %0" : : "r"(val));
+       val = val | (1ULL << TLBI_ASID_SHIFT);
+#endif /* __ARM_KERNEL_PROTECT__ */
+       asm volatile ("tlbi vae1is, %0" : : "r"(val));
+}
+
+static inline void
+flush_mmu_tlb_entry(uint64_t val)
+{
+       flush_mmu_tlb_entry_async(val);
+       sync_tlb_flush();
+}
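+
Putting the packing macros at the top of this header together with this primitive: a caller flushing a single VA under a given ASID builds the operand with the ASID in the top 16 bits, as the vae1is form expects. A sketch, not part of this header:

    static inline void
    flush_one_entry(vm_offset_t va, uint16_t asid)
    {
            /* VA page bits in the low field, ASID in the top 16 bits */
            flush_mmu_tlb_entry(tlbi_addr(va) | tlbi_asid(asid));
    }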
+
+// flush_mmu_tlb_entries: flush TLB entries that map a VA range and ASID, all cores
+// start and end must have the ASID in the high 16 bits, with the VA in units of 4K in the lowest bits
+// Will also flush global entries that match the VA range
+static inline void
+flush_mmu_tlb_entries_async(uint64_t start, uint64_t end)
+{
+#if __ARM_16K_PG__
+       start = start & ~0x3ULL;
+
+       /*
+        * The code below is not necessarily correct.  From an overview of
+        * the client code, the expected contract for TLB flushes is that
+        * we will expand from an "address, length" pair to "start address,
+        * end address" in the course of a TLB flush.  This suggests that
+        * a flush for "X, X+4" is actually only asking for a flush of a
+        * single 16KB page.  At the same time, we'd like to be prepared
+        * for bad inputs (X, X+3), so add 3 and then truncate the 4KB page
+        * number to a 16KB page boundary.  This should deal correctly with
+        * unaligned inputs.
+        *
+        * If our expectations about client behavior are wrong, however, this
+        * will lead to occasional TLB corruption on platforms with 16KB
+        * pages.
+        */
+       end = (end + 0x3ULL) & ~0x3ULL;
+#endif // __ARM_16K_PG__
+#if __ARM_KERNEL_PROTECT__
+       uint64_t asid = start >> TLBI_ASID_SHIFT;
+       /*
+        * If we are flushing ASID 0, this is a kernel operation.  With this
+        * ASID scheme, this means we should flush all ASIDs.
+        */
+       if (asid == 0) {
+               for (; start < end; start += (ARM_PGBYTES / 4096)) {
+                       asm volatile ("tlbi vaae1is, %0" : : "r"(start));
+               }
+               return;
+       }
+       start = start | (1ULL << TLBI_ASID_SHIFT);
+       end = end | (1ULL << TLBI_ASID_SHIFT);
+       for (; start < end; start += (ARM_PGBYTES / 4096)) {
+               start = start & ~(1ULL << TLBI_ASID_SHIFT);
+               asm volatile ("tlbi vae1is, %0" : : "r"(start));
+               start = start | (1ULL << TLBI_ASID_SHIFT);
+               asm volatile ("tlbi vae1is, %0" : : "r"(start));
+       }
+#else
+       for (; start < end; start += (ARM_PGBYTES / 4096)) {
+               asm volatile ("tlbi vae1is, %0" : : "r"(start));
+       }
+#endif /* __ARM_KERNEL_PROTECT__ */
+}
+
+static inline void
+flush_mmu_tlb_entries(uint64_t start, uint64_t end)
+{
+       flush_mmu_tlb_entries_async(start, end);
+       sync_tlb_flush();
+}
+
+// flush_mmu_tlb_asid: flush all entries that match an ASID, on all cores
+// ASID must be in high 16 bits of argument
+// Will not flush global entries
+static inline void
+flush_mmu_tlb_asid_async(uint64_t val)
+{
+#if __ARM_KERNEL_PROTECT__
+       /*
+        * If we are flushing ASID 0, this is a kernel operation.  With this
+        * ASID scheme, this means we should flush all ASIDs.
+        */
+       uint64_t asid = val >> TLBI_ASID_SHIFT;
+       if (asid == 0) {
+               asm volatile ("tlbi vmalle1is");
+               return;
+       }
+       val = val & ~(1ULL << TLBI_ASID_SHIFT);
+       asm volatile ("tlbi aside1is, %0" : : "r"(val));
+       val = val | (1ULL << TLBI_ASID_SHIFT);
+#endif /* __ARM_KERNEL_PROTECT__ */
+       asm volatile ("tlbi aside1is, %0" : : "r"(val));
+}
+
+static inline void
+flush_mmu_tlb_asid(uint64_t val)
+{
+       flush_mmu_tlb_asid_async(val);
+       sync_tlb_flush();
+}
+
+// flush_core_tlb_asid: flush all entries that match an ASID, local core only
+// ASID must be in high 16 bits of argument
+// Will not flush global entries
+static inline void
+flush_core_tlb_asid_async(uint64_t val)
+{
+#if __ARM_KERNEL_PROTECT__
+       /*
+        * If we are flushing ASID 0, this is a kernel operation.  With this
+        * ASID scheme, this means we should flush all ASIDs.
+        */
+       uint64_t asid = val >> TLBI_ASID_SHIFT;
+       if (asid == 0) {
+               asm volatile ("tlbi vmalle1");
+               return;
+       }
+       val = val & ~(1ULL << TLBI_ASID_SHIFT);
+       asm volatile ("tlbi aside1, %0" : : "r"(val));
+       val = val | (1ULL << TLBI_ASID_SHIFT);
+#endif /* __ARM_KERNEL_PROTECT__ */
+       asm volatile ("tlbi aside1, %0" : : "r"(val));
+}
+
+static inline void
+flush_core_tlb_asid(uint64_t val)
+{
+       flush_core_tlb_asid_async(val);
+       sync_tlb_flush();
+}
+
+#if __ARM_RANGE_TLBI__
+#if __ARM_KERNEL_PROTECT__
+       #error __ARM_RANGE_TLBI__ + __ARM_KERNEL_PROTECT__ is not currently supported
+#endif
+
+#define ARM64_16K_TLB_RANGE_PAGES (1ULL << 21)
+#define rtlbi_addr(x) (((x) >> RTLBI_ADDR_SHIFT) & RTLBI_ADDR_MASK)
+#define rtlbi_scale(x) ((uint64_t)(x) << RTLBI_SCALE_SHIFT)
+#define rtlbi_num(x) ((uint64_t)(x) << RTLBI_NUM_SHIFT)
+
+/**
+ * Given the number of pages to invalidate, generate the correct parameter to
+ * pass to any of the TLBI by range methods.
+ */
+static inline uint64_t
+generate_rtlbi_param(ppnum_t npages, uint32_t asid, vm_offset_t va)
+{
+       /**
+        * Per the armv8.4 RTLBI extension spec, the range encoded in the rtlbi register operand is defined by:
+        * BaseADDR <= VA < BaseADDR+((NUM+1)*2^(5*SCALE+1) * Translation_Granule_Size)
+        */
+       unsigned order = (sizeof(npages) * 8) - __builtin_clz(npages - 1) - 1;
+       unsigned scale = ((order ? order : 1) - 1) / 5;
+       unsigned granule = 1 << ((5 * scale) + 1);
+       unsigned num = (((npages + granule - 1) & ~(granule - 1)) / granule) - 1;
+       return tlbi_asid(asid) | RTLBI_TG | rtlbi_scale(scale) | rtlbi_num(num) | rtlbi_addr(va);
+}
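+
A worked instance of this encoding (note the arithmetic assumes npages >= 2, since __builtin_clz(0) is undefined):

    /*
     * npages = 2048:
     *   order   = 32 - clz(2047) - 1 = 32 - 21 - 1 = 10
     *   scale   = (10 - 1) / 5       = 1
     *   granule = 1 << (5*1 + 1)     = 64
     *   num     = (2048 / 64) - 1    = 31
     * covered: (num + 1) * 2^(5*scale + 1) = 32 * 64 = 2048 pages, exactly
     * the request; non-power-of-two counts round up to the next multiple
     * of the granule.
     */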
+
+// flush_mmu_tlb_range: flush TLB entries that map a VA range using a single instruction
+// The argument should be encoded according to generate_rtlbi_param().
+// Follows the same ASID matching behavior as flush_mmu_tlb_entries()
+static inline void
+flush_mmu_tlb_range_async(uint64_t val)
+{
+       asm volatile ("tlbi rvae1is, %0" : : "r"(val));
+}
+
+static inline void
+flush_mmu_tlb_range(uint64_t val)
+{
+       flush_mmu_tlb_range_async(val);
+       sync_tlb_flush();
+}
+
+// flush_mmu_tlb_allrange: flush TLB entries that map a VA range using a single instruction
+// The argument should be encoded according to generate_rtlbi_param().
+// Follows the same ASID matching behavior as flush_mmu_tlb_allentries()
+static inline void
+flush_mmu_tlb_allrange_async(uint64_t val)
+{
+       asm volatile ("tlbi rvaae1is, %0" : : "r"(val));
+}
+
+static inline void
+flush_mmu_tlb_allrange(uint64_t val)
+{
+       flush_mmu_tlb_allrange_async(val);
+       sync_tlb_flush();
+}
+
+#endif // __ARM_RANGE_TLBI__
+
+
index aa1f67f54ccc650bcaca46e47e13cb6a84faa198..88863e3b2ef552ffc2625dee69780e0a13df82d5 100644 (file)
@@ -54,7 +54,7 @@ ${MIGINCLUDES} : ${MIG_TYPES}
 
 ${MIG_UUHDRS} : \
        %.h : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)$(MIG) $(MIGFLAGS)         \
                -server /dev/null       \
                -user /dev/null         \
@@ -63,7 +63,7 @@ ${MIG_UUHDRS} : \
 
 ${MIG_USHDRS} : \
        %_server.h : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)$(MIG) $(MIGFLAGS)         \
                -server /dev/null       \
                -user /dev/null         \
@@ -101,7 +101,7 @@ ${COMP_FILES} : ${MIG_TYPES}
 
 ${MIG_KUSRC} : \
        %_user.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS}        \
                -user    $*_user.c              \
                -header  $*.h              \
@@ -111,7 +111,7 @@ ${MIG_KUSRC} : \
 
 ${MIG_KSSRC}: \
        %_server.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS}        \
                -user    /dev/null              \
                -header  /dev/null              \
index 69fc9f73841f4b699467cefd6e11e6e266784db1..ff11110809612f47fc061bfe613cf0ae25537c31 100644 (file)
@@ -76,16 +76,13 @@ static void atm_hash_table_init(void);
 static kern_return_t atm_value_hash_table_insert(atm_value_t new_atm_value);
 static void atm_value_hash_table_delete(atm_value_t atm_value);
 static atm_value_t get_atm_value_from_aid(aid_t aid) __unused;
-static void atm_value_get_ref(atm_value_t atm_value);
 static kern_return_t atm_listener_insert(atm_value_t atm_value, atm_task_descriptor_t task_descriptor, atm_guard_t guard);
 static void atm_listener_delete_all(atm_value_t atm_value);
 static atm_task_descriptor_t atm_task_descriptor_alloc_init(mach_port_t trace_buffer, uint64_t buffer_size, __assert_only task_t task);
-static void atm_descriptor_get_reference(atm_task_descriptor_t task_descriptor);
 static void atm_task_descriptor_dealloc(atm_task_descriptor_t task_descriptor);
 static kern_return_t atm_value_unregister(atm_value_t atm_value, atm_task_descriptor_t task_descriptor, atm_guard_t guard);
 static kern_return_t atm_value_register(atm_value_t atm_value, atm_task_descriptor_t task_descriptor, atm_guard_t guard);
 static kern_return_t atm_listener_delete(atm_value_t atm_value, atm_task_descriptor_t task_descriptor, atm_guard_t guard);
-static void atm_link_get_reference(atm_link_object_t link_object) __unused;
 static void atm_link_dealloc(atm_link_object_t link_object);
 
 kern_return_t
@@ -136,7 +133,7 @@ atm_release(ipc_voucher_attr_manager_t __assert_only manager);
 /*
  * communication channel from voucher system to ATM
  */
-struct ipc_voucher_attr_manager atm_manager = {
+const struct ipc_voucher_attr_manager atm_manager = {
        .ivam_release_value    = atm_release_value,
        .ivam_get_value        = atm_get_value,
        .ivam_extract_content  = atm_extract_content,
@@ -609,7 +606,7 @@ atm_value_alloc_init(aid_t aid)
        queue_init(&new_atm_value->listeners);
        new_atm_value->sync = 1;
        new_atm_value->listener_count = 0;
-       new_atm_value->reference_count = 1;
+       os_ref_init(&new_atm_value->reference_count, NULL);
        lck_mtx_init(&new_atm_value->listener_lock, &atm_lock_grp, &atm_lock_attr);
 
 #if DEVELOPMENT || DEBUG
@@ -658,24 +655,19 @@ get_subaid()
 static void
 atm_value_dealloc(atm_value_t atm_value)
 {
-       if (0 < atm_value_release_internal(atm_value)) {
-               return;
-       }
-
-       assert(atm_value->reference_count == 0);
-
-       /* Free up the atm value and also remove all the listeners. */
-       atm_listener_delete_all(atm_value);
+       if (os_ref_release(&atm_value->reference_count) == 0) {
+               /* Free up the atm value and also remove all the listeners. */
+               atm_listener_delete_all(atm_value);
 
-       lck_mtx_destroy(&atm_value->listener_lock, &atm_lock_grp);
+               lck_mtx_destroy(&atm_value->listener_lock, &atm_lock_grp);
 
 #if DEVELOPMENT || DEBUG
-       lck_mtx_lock(&atm_values_list_lock);
-       queue_remove(&atm_values_list, atm_value, atm_value_t, value_elt);
-       lck_mtx_unlock(&atm_values_list_lock);
+               lck_mtx_lock(&atm_values_list_lock);
+               queue_remove(&atm_values_list, atm_value, atm_value_t, value_elt);
+               lck_mtx_unlock(&atm_values_list_lock);
 #endif
-       zfree(atm_value_zone, atm_value);
-       return;
+               zfree(atm_value_zone, atm_value);
+       }
 }
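
Every conversion in this file follows the same os/refcnt.h shape: os_ref_init() starts the count at 1 for the creating reference, os_ref_retain() replaces the hand-rolled hw_atomic_add(), and os_ref_release() returns the post-drop count so only the last releaser frees. Schematically (widget and widget_zone are hypothetical names):

    struct widget {
            os_refcnt_t ref;
    };

    static struct widget *
    widget_create(void)
    {
            struct widget *w = zalloc(widget_zone);
            os_ref_init(&w->ref, NULL);     /* count starts at 1 */
            return w;
    }

    static void
    widget_release(struct widget *w)
    {
            /* os_ref_release() returns the count remaining after this drop */
            if (os_ref_release(&w->ref) == 0) {
                    zfree(widget_zone, w);  /* last reference: tear down */
            }
    }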
 
 
@@ -780,7 +772,7 @@ get_atm_value_from_aid(aid_t aid)
                         * Aid found. Increase ref count and return
                         * the atm value structure.
                         */
-                       atm_value_get_ref(next);
+                       os_ref_retain(&next->reference_count);
                        lck_mtx_unlock(&hash_list_head->hash_list_lock);
                        return next;
                }
@@ -790,18 +782,6 @@ get_atm_value_from_aid(aid_t aid)
 }
 
 
-/*
- * Routine: atm_value_get_ref
- * Purpose: Get a reference on atm value.
- * Returns: None.
- */
-static void
-atm_value_get_ref(atm_value_t atm_value)
-{
-       atm_value_reference_internal(atm_value);
-}
-
-
 /*
  * Routine: atm_listener_insert
  * Purpose: Insert a listener to an atm value.
@@ -822,11 +802,11 @@ atm_listener_insert(
 
        new_link_object = (atm_link_object_t) zalloc(atm_link_objects_zone);
        new_link_object->descriptor = task_descriptor;
-       new_link_object->reference_count = 1;
+       os_ref_init(&new_link_object->reference_count, NULL);
        new_link_object->guard = guard;
 
        /* Get a reference on the task descriptor */
-       atm_descriptor_get_reference(task_descriptor);
+       os_ref_retain(&task_descriptor->reference_count);
        queue_init(&free_listeners);
        listener_count = atm_value->listener_count;
 
@@ -857,7 +837,7 @@ atm_listener_insert(
 
                if (elem->descriptor == task_descriptor) {
                        /* Increment reference count on Link object. */
-                       atm_link_get_reference(elem);
+                       os_ref_retain(&elem->reference_count);
 
                        /* Replace the guard with the new one, the old guard is anyways on unregister path. */
                        elem->guard = guard;
@@ -945,16 +925,16 @@ atm_listener_delete(
                        if (elem->guard == guard) {
                                KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO,
                                    (ATM_VALUE_UNREGISTERED))) | DBG_FUNC_NONE,
-                                   VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, guard, elem->reference_count, 0);
+                                   VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, guard, os_ref_get_count(&elem->reference_count), 0);
                                elem->guard = 0;
                                kr = KERN_SUCCESS;
                        } else {
                                KERNEL_DEBUG_CONSTANT((ATM_CODE(ATM_UNREGISTER_INFO,
                                    (ATM_VALUE_DIFF_MAILBOX))) | DBG_FUNC_NONE,
-                                   VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, elem->guard, elem->reference_count, 0);
+                                   VM_KERNEL_ADDRPERM(atm_value), atm_value->aid, elem->guard, os_ref_get_count(&elem->reference_count), 0);
                                kr = KERN_INVALID_VALUE;
                        }
-                       if (0 == atm_link_object_release_internal(elem)) {
+                       if (os_ref_release(&elem->reference_count) == 0) {
                                queue_remove(&atm_value->listeners, elem, atm_link_object_t, listeners_element);
                                queue_enter(&free_listeners, elem, atm_link_object_t, listeners_element);
                                atm_listener_count_decr_internal(atm_value);
@@ -992,7 +972,7 @@ atm_task_descriptor_alloc_init(
 
        new_task_descriptor->trace_buffer = trace_buffer;
        new_task_descriptor->trace_buffer_size = buffer_size;
-       new_task_descriptor->reference_count = 1;
+       os_ref_init(&new_task_descriptor->reference_count, NULL);
        new_task_descriptor->flags = 0;
        lck_mtx_init(&new_task_descriptor->lock, &atm_lock_grp, &atm_lock_attr);
 
@@ -1007,18 +987,6 @@ atm_task_descriptor_alloc_init(
 }
 
 
-/*
- * Routine: atm_descriptor_get_reference
- * Purpose: Get a reference count on task descriptor.
- * Returns: None.
- */
-static void
-atm_descriptor_get_reference(atm_task_descriptor_t task_descriptor)
-{
-       atm_task_desc_reference_internal(task_descriptor);
-}
-
-
 /*
  * Routine: atm_task_descriptor_dealloc
  * Purpose: Drops the reference on atm descriptor.
@@ -1027,34 +995,17 @@ atm_descriptor_get_reference(atm_task_descriptor_t task_descriptor)
 static void
 atm_task_descriptor_dealloc(atm_task_descriptor_t task_descriptor)
 {
-       if (0 < atm_task_desc_release_internal(task_descriptor)) {
-               return;
-       }
-
-       assert(task_descriptor->reference_count == 0);
-
+       if (os_ref_release(&task_descriptor->reference_count) == 0) {
 #if DEVELOPMENT || DEBUG
-       lck_mtx_lock(&atm_descriptors_list_lock);
-       queue_remove(&atm_descriptors_list, task_descriptor, atm_task_descriptor_t, descriptor_elt);
-       lck_mtx_unlock(&atm_descriptors_list_lock);
+               lck_mtx_lock(&atm_descriptors_list_lock);
+               queue_remove(&atm_descriptors_list, task_descriptor, atm_task_descriptor_t, descriptor_elt);
+               lck_mtx_unlock(&atm_descriptors_list_lock);
 #endif
-       /* release the send right for the named memory entry */
-       ipc_port_release_send(task_descriptor->trace_buffer);
-       lck_mtx_destroy(&task_descriptor->lock, &atm_lock_grp);
-       zfree(atm_descriptors_zone, task_descriptor);
-       return;
-}
-
-
-/*
- * Routine: atm_link_get_reference
- * Purpose: Get a reference count on atm link object.
- * Returns: None.
- */
-static void
-atm_link_get_reference(atm_link_object_t link_object)
-{
-       atm_link_object_reference_internal(link_object);
+               /* release the send right for the named memory entry */
+               ipc_port_release_send(task_descriptor->trace_buffer);
+               lck_mtx_destroy(&task_descriptor->lock, &atm_lock_grp);
+               zfree(atm_descriptors_zone, task_descriptor);
+       }
 }
 
 
index ea1cbce7c549d900d0aefd34cde0641e09efae81..a8a4aace6414f511a82ab3f13c58a69175943221 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -32,6 +32,7 @@
 #include <stdint.h>
 #include <mach/mach_types.h>
 #include <atm/atm_types.h>
+#include <os/refcnt.h>
 
 #ifdef  MACH_KERNEL_PRIVATE
 
 typedef mach_voucher_attr_value_handle_t atm_voucher_id_t;
 
 struct atm_task_descriptor {
-       decl_lck_mtx_data(, lock)                /* lock to protect reference count */
+       decl_lck_mtx_data(, lock);                /* lock to protect reference count */
        mach_port_t     trace_buffer;           /* named memory entry registered by user */
        uint64_t        trace_buffer_size;      /* size of the trace_buffer registered */
-       uint32_t        reference_count;
+       os_refcnt_t     reference_count;
        uint8_t         flags;
 #if DEVELOPMENT || DEBUG
        task_t          task;                   /* task pointer for debugging purposes */
@@ -60,42 +61,31 @@ struct atm_task_descriptor {
 #endif
 };
 
-#define atm_task_desc_reference_internal(elem)  \
-       (hw_atomic_add(&(elem)->reference_count, 1))
-
-#define atm_task_desc_release_internal(elem)    \
-       (hw_atomic_sub(&(elem)->reference_count, 1))
-
 typedef struct atm_task_descriptor *atm_task_descriptor_t;
 #define ATM_TASK_DESCRIPTOR_NULL NULL
 
 struct atm_value {
        aid_t            aid;                   /* activity id */
        queue_head_t     listeners;             /* List of listeners who register for this activity */
-       decl_lck_mtx_data(, listener_lock)      /* Lock to protect listener list */
+       decl_lck_mtx_data(, listener_lock);      /* Lock to protect listener list */
        queue_chain_t    vid_hash_elt;          /* Next hash element in the global hash table */
 #if DEVELOPMENT || DEBUG
        queue_chain_t    value_elt;             /* global chain of all values */
 #endif
        uint32_t         sync;                  /* Made ref count given to voucher sub system. */
-       uint32_t         listener_count;        /* Number of Listerners listening on the value. */
-       uint32_t         reference_count;       /* use count on the atm value, 1 taken by the global hash table */
-};
-
-#define atm_value_reference_internal(elem)      \
-       (hw_atomic_add(&(elem)->reference_count, 1))
 
-#define atm_value_release_internal(elem)        \
-       (hw_atomic_sub(&(elem)->reference_count, 1))
+       uint32_t         listener_count;
+       os_refcnt_t      reference_count;               /* use count on the atm value, 1 taken by the global hash table */
+};
 
 #define atm_listener_count_incr_internal(elem)  \
-       (hw_atomic_add(&(elem)->listener_count, 1))
+       (os_atomic_inc(&(elem)->listener_count, relaxed))
 
 #define atm_listener_count_decr_internal(elem)  \
-       (hw_atomic_sub(&(elem)->listener_count, 1))
+       (os_atomic_dec(&(elem)->listener_count, relaxed))
 
 #define atm_sync_reference_internal(elem)       \
-       (hw_atomic_add(&(elem)->sync, 1))
+       (os_atomic_inc(&(elem)->sync, relaxed))
 
 typedef struct atm_value *atm_value_t;
 #define ATM_VALUE_NULL NULL
@@ -107,20 +97,14 @@ struct atm_link_object {
        atm_task_descriptor_t  descriptor;
        queue_chain_t          listeners_element;    /* Head is atm_value->listeners. */
        atm_guard_t            guard;                /* Guard registered by the user for an activity. */
-       uint32_t               reference_count;      /* Refernece count for link object */
+       os_refcnt_t            reference_count;
 };
 
 typedef struct atm_link_object *atm_link_object_t;
 
-#define atm_link_object_reference_internal(elem)        \
-       (hw_atomic_add(&(elem)->reference_count, 1))
-
-#define atm_link_object_release_internal(elem)  \
-       (hw_atomic_sub(&(elem)->reference_count, 1))
-
 struct atm_value_hash {
        queue_head_t    hash_list;
-       decl_lck_mtx_data(, hash_list_lock)     /* lock to protect bucket list. */
+       decl_lck_mtx_data(, hash_list_lock);    /* lock to protect bucket list. */
 };
 
 typedef struct atm_value_hash *atm_value_hash_t;
index 2f4246c0f2478116cb57a308fadd622499f10833..27dee2fdbd1f023ddff7cd3ef01534a315b142a6 100644 (file)
@@ -52,7 +52,7 @@ ${MIGINCLUDES} : ${MIG_TYPES}
 
 ${MIG_UUHDRS} : \
        %.h : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)$(MIG) $(MIGFLAGS)         \
                -server /dev/null       \
                -user /dev/null         \
@@ -61,7 +61,7 @@ ${MIG_UUHDRS} : \
 
 ${MIG_USHDRS} : \
        %_server.h : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)$(MIG) $(MIGFLAGS)         \
                -server /dev/null       \
                -user /dev/null         \
@@ -97,7 +97,7 @@ ${COMP_FILES} : ${MIG_TYPES}
 
 ${MIG_KUSRC} : \
        %_user.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS}        \
                -user    $*_user.c              \
                -header  $*.h              \
@@ -107,7 +107,7 @@ ${MIG_KUSRC} : \
 
 ${MIG_KSSRC}: \
        %_server.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS}        \
                -user    /dev/null              \
                -header  /dev/null              \
index ef4d2977d6f7fe294e774eeef1ed1f56b86fc663..a281a029aea4f0b56d17b95bd66ad551189f5587 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -59,6 +59,10 @@ static zone_t bank_task_zone, bank_account_zone;
 #define CAST_TO_BANK_ACCOUNT(x) ((bank_account_t)((void *)(x)))
 
 ipc_voucher_attr_control_t  bank_voucher_attr_control;    /* communication channel from ATM to voucher system */
+struct persona;
+extern struct persona *system_persona, *proxy_system_persona;
+uint32_t persona_get_id(struct persona *persona);
+extern int unique_persona;
 
 #if DEVELOPMENT || DEBUG
 queue_head_t bank_tasks_list;
@@ -66,11 +70,11 @@ queue_head_t bank_accounts_list;
 #endif
 
 static ledger_template_t bank_ledger_template = NULL;
-struct _bank_ledger_indices bank_ledgers = { -1, -1 };
+struct _bank_ledger_indices bank_ledgers = { .cpu_time = -1, .energy = -1 };
 
 static bank_task_t bank_task_alloc_init(task_t task);
 static bank_account_t bank_account_alloc_init(bank_task_t bank_holder, bank_task_t bank_merchant,
-    bank_task_t bank_secureoriginator, bank_task_t bank_proximateprocess, struct thread_group* banktg);
+    bank_task_t bank_secureoriginator, bank_task_t bank_proximateprocess, struct thread_group* banktg, uint32_t persona_id);
 static bank_task_t get_bank_task_context(task_t task, boolean_t initialize);
 static void bank_task_dealloc(bank_task_t bank_task, mach_voucher_attr_value_reference_t sync);
 static kern_return_t bank_account_dealloc_with_sync(bank_account_t bank_account, mach_voucher_attr_value_reference_t sync);
@@ -80,11 +84,15 @@ static ledger_t bank_get_bank_task_ledger_with_ref(bank_task_t bank_task);
 static void bank_destroy_bank_task_ledger(bank_task_t bank_task);
 static void init_bank_ledgers(void);
 static boolean_t bank_task_is_propagate_entitled(task_t t);
+static boolean_t bank_task_is_persona_modify_entitled(task_t t);
 static struct thread_group *bank_get_bank_task_thread_group(bank_task_t bank_task __unused);
 static struct thread_group *bank_get_bank_account_thread_group(bank_account_t bank_account __unused);
+static boolean_t bank_verify_persona_id(uint32_t persona_id);
 
 static lck_spin_t g_bank_task_lock_data;    /* lock to protect task->bank_context transition */
 
+static uint32_t disable_persona_propogate_check = 0;
+
 #define global_bank_task_lock_init() \
        lck_spin_init(&g_bank_task_lock_data, &bank_lock_grp, &bank_lock_attr)
 #define global_bank_task_lock_destroy() \
@@ -105,7 +113,8 @@ extern uint32_t proc_getgid(void *p);
 extern void proc_getexecutableuuid(void *p, unsigned char *uuidbuf, unsigned long size);
 extern int kauth_cred_issuser(void *cred);
 extern void* kauth_cred_get(void);
-
+extern void* persona_lookup(uint32_t id);
+extern void persona_put(void* persona);
 
 kern_return_t
 bank_release_value(
@@ -155,7 +164,7 @@ bank_release(ipc_voucher_attr_manager_t __assert_only manager);
 /*
  * communication channel from voucher system to ATM
  */
-struct ipc_voucher_attr_manager bank_manager = {
+const struct ipc_voucher_attr_manager bank_manager = {
        .ivam_release_value    = bank_release_value,
        .ivam_get_value        = bank_get_value,
        .ivam_extract_content  = bank_extract_content,
@@ -232,6 +241,15 @@ bank_init()
                panic("BANK subsystem initialization failed");
        }
 
+
+#if DEVELOPMENT || DEBUG
+       uint32_t disable_persona_propogate_check_bootarg = 0;
+       if (PE_parse_boot_argn("disable_persona_propogate_check", &disable_persona_propogate_check_bootarg,
+           sizeof(disable_persona_propogate_check_bootarg))) {
+               disable_persona_propogate_check = (disable_persona_propogate_check_bootarg != 0) ? 1 : 0;
+       }
+#endif
+
        kprintf("BANK subsystem is initialized\n");
        return;
 }
@@ -303,6 +321,8 @@ bank_release_value(
 
 /*
  * Routine: bank_get_value
+ *
+ * This function uses the recipe to create a bank attribute for a voucher.
  */
 kern_return_t
 bank_get_value(
@@ -311,13 +331,12 @@ bank_get_value(
        mach_voucher_attr_recipe_command_t                command,
        mach_voucher_attr_value_handle_array_t        prev_values,
        mach_msg_type_number_t                        prev_value_count,
-       mach_voucher_attr_content_t          __unused recipe,
-       mach_voucher_attr_content_size_t     __unused recipe_size,
+       mach_voucher_attr_content_t                   recipe,
+       mach_voucher_attr_content_size_t              recipe_size,
        mach_voucher_attr_value_handle_t             *out_value,
        mach_voucher_attr_value_flags_t              *out_flags,
        ipc_voucher_t                                            *out_value_voucher)
 {
-       bank_task_t bank_task = BANK_TASK_NULL;
        bank_task_t bank_holder = BANK_TASK_NULL;
        bank_task_t bank_merchant = BANK_TASK_NULL;
        bank_task_t bank_secureoriginator = BANK_TASK_NULL;
@@ -331,6 +350,7 @@ bank_get_value(
        mach_msg_type_number_t i;
        struct thread_group *thread_group = NULL;
        struct thread_group *cur_thread_group = NULL;
+       uint32_t persona_id = proc_persona_id(NULL);
 
        assert(MACH_VOUCHER_ATTR_KEY_BANK == key);
        assert(manager == &bank_manager);
@@ -342,13 +362,107 @@ bank_get_value(
        switch (command) {
        case MACH_VOUCHER_ATTR_BANK_CREATE:
 
-               /* Return the default task value instead of bank task */
+               /* It returns the default task value. This value is later replaced by
+                * an actual bank task reference via a recipe with the
+                * MACH_VOUCHER_ATTR_SEND_PREPROCESS command.
+                */
                *out_value = BANK_ELEMENT_TO_HANDLE(BANK_DEFAULT_TASK_VALUE);
                *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST;
                break;
 
+       case MACH_VOUCHER_ATTR_BANK_MODIFY_PERSONA:
+
+               /* It creates a bank account attribute value with a new persona id
+                * and auto-redeems it on behalf of the bank_holder.
+                */
+               *out_value = BANK_ELEMENT_TO_HANDLE(BANK_DEFAULT_VALUE);
+
+               for (i = 0; i < prev_value_count; i++) {
+                       bank_handle = prev_values[i];
+                       bank_element = HANDLE_TO_BANK_ELEMENT(bank_handle);
+
+                       /* Expect a pre-processed attribute value */
+                       if (bank_element == BANK_DEFAULT_VALUE || bank_element == BANK_DEFAULT_TASK_VALUE) {
+                               continue;
+                       }
+
+                       if (!bank_task_is_persona_modify_entitled(current_task())) {
+                               return KERN_NO_ACCESS;
+                       }
+
+                       struct persona_modify_info pmi = {};
+                       if (recipe_size == sizeof(pmi)) {
+                               memcpy((void *)&pmi, recipe, sizeof(pmi));
+                               persona_id = pmi.persona_id;
+                       } else {
+                               return KERN_INVALID_ARGUMENT;
+                       }
+
+                       /* Verify if the persona id is valid */
+                       if (!bank_verify_persona_id(persona_id)) {
+                               return KERN_INVALID_ARGUMENT;
+                       }
+
+                       /* Update the persona id only if the bank element is a bank task.
+                        * This ensures that the bank_holder can be trusted.
+                        */
+                       if (bank_element->be_type == BANK_TASK) {
+                               bank_holder = CAST_TO_BANK_TASK(bank_element);
+                               /* Ensure that the requestor validated by userspace matches
+                                * the bank_holder
+                                */
+                               if (pmi.unique_pid != bank_holder->bt_unique_pid) {
+                                       return KERN_INVALID_CAPABILITY;
+                               }
+                               bank_merchant = bank_holder;
+                               bank_secureoriginator = bank_holder;
+                               bank_proximateprocess = bank_holder;
+                               thread_group = bank_get_bank_task_thread_group(bank_holder);
+                       } else if (bank_element->be_type == BANK_ACCOUNT) {
+                               return KERN_INVALID_ARGUMENT;
+                       } else {
+                               panic("Bogus bank type: %d passed in get_value\n", bank_element->be_type);
+                       }
+
+                       /* Change the persona-id to the holder task's persona-id if the task was not spawned in the system persona */
+                       if (unique_persona &&
+                           bank_merchant->bt_persona_id != persona_get_id(system_persona) &&
+                           bank_merchant->bt_persona_id != persona_get_id(proxy_system_persona)) {
+                               persona_id = bank_merchant->bt_persona_id;
+                       }
+
+                       if (bank_holder->bt_persona_id == persona_id) {
+                               lck_mtx_lock(&bank_holder->bt_acc_to_pay_lock);
+                               bank_task_made_reference(bank_holder);
+                               if (bank_holder->bt_voucher_ref == 0) {
+                                       /* Take a ref for voucher system, if voucher system does not have a ref */
+                                       bank_task_reference(bank_holder);
+                                       bank_holder->bt_voucher_ref = 1;
+                               }
+                               lck_mtx_unlock(&bank_holder->bt_acc_to_pay_lock);
+
+                               *out_value = BANK_ELEMENT_TO_HANDLE(bank_holder);
+                               return kr;
+                       }
+
+                       bank_account = bank_account_alloc_init(bank_holder, bank_merchant,
+                           bank_secureoriginator, bank_proximateprocess,
+                           thread_group, persona_id);
+                       if (bank_account == BANK_ACCOUNT_NULL) {
+                               return KERN_RESOURCE_SHORTAGE;
+                       }
+
+                       *out_value = BANK_ELEMENT_TO_HANDLE(bank_account);
+                       return kr;
+               }
+               break;
+
        case MACH_VOUCHER_ATTR_AUTO_REDEEM:
 
+               /* It creates a bank account with the bank_merchant set to the current task.
+                * A bank attribute voucher needs to be redeemed before it can be adopted by
+                * its threads.
+                */
                for (i = 0; i < prev_value_count; i++) {
                        bank_handle = prev_values[i];
                        bank_element = HANDLE_TO_BANK_ELEMENT(bank_handle);
@@ -364,12 +478,14 @@ bank_get_value(
                                bank_secureoriginator = bank_holder;
                                bank_proximateprocess = bank_holder;
                                thread_group = bank_get_bank_task_thread_group(bank_holder);
+                               persona_id = bank_holder->bt_persona_id;
                        } else if (bank_element->be_type == BANK_ACCOUNT) {
                                old_bank_account = CAST_TO_BANK_ACCOUNT(bank_element);
                                bank_holder = old_bank_account->ba_holder;
                                bank_secureoriginator = old_bank_account->ba_secureoriginator;
                                bank_proximateprocess = old_bank_account->ba_proximateprocess;
                                thread_group = bank_get_bank_account_thread_group(old_bank_account);
+                               persona_id = old_bank_account->ba_so_persona_id;
                        } else {
                                panic("Bogus bank type: %d passed in get_value\n", bank_element->be_type);
                        }
@@ -386,11 +502,19 @@ bank_get_value(
                                thread_group = cur_thread_group;
                        }
 
+                       /* Change the persona-id to the current task's persona-id if the task was not spawned in the system persona */
+                       if (unique_persona &&
+                           bank_merchant->bt_persona_id != persona_get_id(system_persona) &&
+                           bank_merchant->bt_persona_id != persona_get_id(proxy_system_persona)) {
+                               persona_id = bank_merchant->bt_persona_id;
+                       }
+
                        /* Check if trying to redeem for self task, return the default bank task */
                        if (bank_holder == bank_merchant &&
                            bank_holder == bank_secureoriginator &&
                            bank_holder == bank_proximateprocess &&
-                           thread_group == cur_thread_group) {
+                           thread_group == cur_thread_group &&
+                           persona_id == bank_holder->bt_persona_id) {
                                *out_value = BANK_ELEMENT_TO_HANDLE(BANK_DEFAULT_TASK_VALUE);
                                *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST;
                                return kr;
@@ -398,7 +522,7 @@ bank_get_value(
 
                        bank_account = bank_account_alloc_init(bank_holder, bank_merchant,
                            bank_secureoriginator, bank_proximateprocess,
-                           thread_group);
+                           thread_group, persona_id);
                        if (bank_account == BANK_ACCOUNT_NULL) {
                                return KERN_RESOURCE_SHORTAGE;
                        }
@@ -429,11 +553,13 @@ bank_get_value(
                                bank_holder = CAST_TO_BANK_TASK(bank_element);
                                bank_secureoriginator = bank_holder;
                                thread_group = bank_get_bank_task_thread_group(bank_holder);
+                               persona_id = bank_holder->bt_persona_id;
                        } else if (bank_element->be_type == BANK_ACCOUNT) {
                                old_bank_account = CAST_TO_BANK_ACCOUNT(bank_element);
                                bank_holder = old_bank_account->ba_holder;
                                bank_secureoriginator = old_bank_account->ba_secureoriginator;
                                thread_group = bank_get_bank_account_thread_group(old_bank_account);
+                               persona_id = old_bank_account->ba_so_persona_id;
                        } else {
                                panic("Bogus bank type: %d passed in get_value\n", bank_element->be_type);
                        }
@@ -448,21 +574,24 @@ bank_get_value(
                        /*
                         * If the process doesn't have secure persona entitlement,
                         * then replace the secure originator to current task.
+                        * Also update the persona_id to match that of the secure originator.
                         */
                        if (bank_merchant->bt_hasentitlement == 0) {
                                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                                    (BANK_CODE(BANK_ACCOUNT_INFO, (BANK_SECURE_ORIGINATOR_CHANGED))) | DBG_FUNC_NONE,
                                    bank_secureoriginator->bt_pid, bank_merchant->bt_pid, 0, 0, 0);
                                bank_secureoriginator = bank_merchant;
+                               persona_id = bank_merchant->bt_persona_id;
                        }
 
                        bank_proximateprocess = bank_merchant;
 
-                       /* Check if trying to redeem for self task, return the bank task */
+                       /* Check if trying to pre-process for self task, return the bank task */
                        if (bank_holder == bank_merchant &&
                            bank_holder == bank_secureoriginator &&
                            bank_holder == bank_proximateprocess &&
-                           thread_group == cur_thread_group) {
+                           thread_group == cur_thread_group &&
+                           persona_id == bank_holder->bt_persona_id) {
                                lck_mtx_lock(&bank_holder->bt_acc_to_pay_lock);
                                bank_task_made_reference(bank_holder);
                                if (bank_holder->bt_voucher_ref == 0) {
@@ -477,7 +606,7 @@ bank_get_value(
                        }
                        bank_account = bank_account_alloc_init(bank_holder, bank_merchant,
                            bank_secureoriginator, bank_proximateprocess,
-                           thread_group);
+                           thread_group, persona_id);
                        if (bank_account == BANK_ACCOUNT_NULL) {
                                return KERN_RESOURCE_SHORTAGE;
                        }
@@ -490,7 +619,9 @@ bank_get_value(
                break;
 
        case MACH_VOUCHER_ATTR_REDEEM:
-
+               /* This command expects that the bank attribute has been auto-redeemed
+                * and returns a reference to that bank account value.
+                */
                for (i = 0; i < prev_value_count; i++) {
                        bank_handle = prev_values[i];
                        bank_element = HANDLE_TO_BANK_ELEMENT(bank_handle);
@@ -499,24 +630,32 @@ bank_get_value(
                                continue;
                        }
 
-                       task = current_task();
                        if (bank_element == BANK_DEFAULT_TASK_VALUE) {
                                *out_value = BANK_ELEMENT_TO_HANDLE(BANK_DEFAULT_TASK_VALUE);
                                *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST;
                                return kr;
                        }
-                       if (bank_element->be_type == BANK_TASK) {
-                               bank_task = CAST_TO_BANK_TASK(bank_element);
-                               panic("Found a bank task in MACH_VOUCHER_ATTR_REDEEM: %p", bank_task);
 
+                       task = current_task();
+                       if (bank_element->be_type == BANK_TASK) {
+                               bank_holder = CAST_TO_BANK_TASK(bank_element);
+                               if (bank_holder == get_bank_task_context(task, FALSE)) {
+                                       *out_value = BANK_ELEMENT_TO_HANDLE(BANK_DEFAULT_TASK_VALUE);
+                                       *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST;
+                               } else {
+                                       kr = KERN_INVALID_CAPABILITY;
+                               }
                                return kr;
                        } else if (bank_element->be_type == BANK_ACCOUNT) {
                                bank_account = CAST_TO_BANK_ACCOUNT(bank_element);
                                bank_merchant = bank_account->ba_merchant;
                                if (bank_merchant != get_bank_task_context(task, FALSE)) {
-                                       panic("Found another bank task: %p as a bank merchant\n", bank_merchant);
+                                       /* This error can be used to verify if the task can
+                                        * adopt the voucher.
+                                        */
+                                       kr = KERN_INVALID_CAPABILITY;
+                                       return kr;
                                }
-
                                bank_account_made_reference(bank_account);
                                *out_value = BANK_ELEMENT_TO_HANDLE(bank_account);
                                return kr;
@@ -591,14 +730,13 @@ bank_extract_content(
                            bank_account->ba_holder->bt_pid,
                            bank_account->ba_merchant->bt_pid,
                            bank_account->ba_secureoriginator->bt_pid,
-                           bank_account->ba_secureoriginator->bt_persona_id,
+                           bank_account->ba_so_persona_id,
                            bank_account->ba_proximateprocess->bt_pid,
                            bank_account->ba_proximateprocess->bt_persona_id);
                } else {
                        panic("Bogus bank type: %d passed in get_value\n", bank_element->be_type);
                }
 
-
                memcpy(&out_recipe[0], buf, strlen(buf) + 1);
                *out_command = MACH_VOUCHER_ATTR_BANK_NULL;
                *in_out_recipe_size = (mach_voucher_attr_content_size_t)strlen(buf) + 1;
@@ -610,7 +748,7 @@ bank_extract_content(
 
 /*
  * Routine: bank_command
- * Purpose: Execute a command against a set of ATM values.
+ * Purpose: Execute a command against a set of bank values.
  * Returns: KERN_SUCCESS: On successful execution of command.
  *           KERN_FAILURE: On failure.
  */
@@ -635,6 +773,7 @@ bank_command(
        mach_voucher_attr_value_handle_t bank_handle;
        mach_msg_type_number_t i;
        int32_t pid;
+       uint32_t persona_id;
 
        assert(MACH_VOUCHER_ATTR_KEY_BANK == key);
        assert(manager == &bank_manager);
@@ -714,6 +853,42 @@ bank_command(
                *out_content_size = 0;
                return KERN_INVALID_VALUE;
 
+       case BANK_PERSONA_ID:
+
+               if ((sizeof(persona_id)) > *out_content_size) {
+                       *out_content_size = 0;
+                       return KERN_NO_SPACE;
+               }
+
+               for (i = 0; i < value_count; i++) {
+                       bank_handle = values[i];
+                       bank_element = HANDLE_TO_BANK_ELEMENT(bank_handle);
+                       if (bank_element == BANK_DEFAULT_VALUE) {
+                               continue;
+                       }
+
+                       if (bank_element == BANK_DEFAULT_TASK_VALUE) {
+                               bank_element = CAST_TO_BANK_ELEMENT(get_bank_task_context(current_task(), FALSE));
+                       }
+
+                       if (bank_element->be_type == BANK_TASK) {
+                               bank_task = CAST_TO_BANK_TASK(bank_element);
+                               persona_id = bank_task->bt_persona_id;
+                       } else if (bank_element->be_type == BANK_ACCOUNT) {
+                               bank_account = CAST_TO_BANK_ACCOUNT(bank_element);
+                               persona_id = bank_account->ba_so_persona_id;
+                       } else {
+                               panic("Bogus bank type: %d passed in voucher_command\n", bank_element->be_type);
+                       }
+
+                       memcpy(out_content, &persona_id, sizeof(persona_id));
+                       *out_content_size = (mach_voucher_attr_content_size_t)sizeof(persona_id);
+                       return KERN_SUCCESS;
+               }
+               /* In the case of no value, return error KERN_INVALID_VALUE */
+               *out_content_size = 0;
+               return KERN_INVALID_VALUE;
+
        default:
                return KERN_INVALID_ARGUMENT;
        }
@@ -787,22 +962,40 @@ bank_task_alloc_init(task_t task)
 
 /*
  * Routine: proc_is_propagate_entitled
- * Purpose: Check if the process has persona propagate entitlement.
+ * Purpose: Check if the process is allowed to propagate secure originator.
  * Returns: TRUE if entitled.
  *          FALSE if not.
  */
 static boolean_t
 bank_task_is_propagate_entitled(task_t t)
 {
-       /* Return TRUE if root process */
-       if (0 == kauth_cred_issuser(kauth_cred_get())) {
-               /* If it's a non-root process, it needs to have the entitlement for secure originator propagation */
-               boolean_t entitled = FALSE;
-               entitled = IOTaskHasEntitlement(t, ENTITLEMENT_PERSONA_PROPAGATE);
-               return entitled;
-       } else {
+       /* Check if it has an entitlement which disallows secure originator propagation */
+       boolean_t entitled = FALSE;
+       entitled = IOTaskHasEntitlement(t, ENTITLEMENT_PERSONA_NO_PROPAGATE);
+       if (entitled) {
+               return FALSE;
+       }
+
+       /* If it's a platform binary, allow propagation by default */
+       if (disable_persona_propogate_check || (t->t_flags & TF_PLATFORM)) {
                return TRUE;
        }
+
+       return FALSE;
+}
+
+/*
+ * Routine: bank_task_is_persona_modify_entitled
+ * Purpose: Check if the process has persona modify entitlement.
+ * Returns: TRUE if entitled.
+ *          FALSE if not.
+ */
+static boolean_t
+bank_task_is_persona_modify_entitled(task_t t)
+{
+       boolean_t entitled = FALSE;
+       entitled = IOTaskHasEntitlement(t, ENTITLEMENT_PERSONA_MODIFY);
+       return entitled;
 }
 
 /*
@@ -817,7 +1010,8 @@ bank_account_alloc_init(
        bank_task_t bank_merchant,
        bank_task_t bank_secureoriginator,
        bank_task_t bank_proximateprocess,
-       struct thread_group *thread_group)
+       struct thread_group *thread_group,
+       uint32_t persona_id)
 {
        bank_account_t new_bank_account;
        bank_account_t bank_account;
@@ -845,6 +1039,7 @@ bank_account_alloc_init(
        new_bank_account->ba_holder = bank_holder;
        new_bank_account->ba_secureoriginator = bank_secureoriginator;
        new_bank_account->ba_proximateprocess = bank_proximateprocess;
+       new_bank_account->ba_so_persona_id = persona_id;
 
        /* Iterate through accounts need to pay list to find the existing entry */
        lck_mtx_lock(&bank_holder->bt_acc_to_pay_lock);
@@ -852,7 +1047,8 @@ bank_account_alloc_init(
                if (bank_account->ba_merchant != bank_merchant ||
                    bank_account->ba_secureoriginator != bank_secureoriginator ||
                    bank_account->ba_proximateprocess != bank_proximateprocess ||
-                   bank_get_bank_account_thread_group(bank_account) != thread_group) {
+                   bank_get_bank_account_thread_group(bank_account) != thread_group ||
+                   bank_account->ba_so_persona_id != persona_id) {
                        continue;
                }
 
@@ -1405,8 +1601,7 @@ bank_serviced_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energ
  * Routine: bank_get_voucher_bank_account
  * Purpose: Get the bank account from the voucher.
  * Returns: bank_account if bank_account attribute present in voucher.
- *          NULL on no attribute, no bank_element, or if holder and merchant bank accounts
- *          and voucher thread group and current thread group are the same.
+ *          NULL if there is no attribute or no bank_element.
  */
 static bank_account_t
 bank_get_voucher_bank_account(ipc_voucher_t voucher)
@@ -1439,23 +1634,7 @@ bank_get_voucher_bank_account(ipc_voucher_t voucher)
                return BANK_ACCOUNT_NULL;
        } else if (bank_element->be_type == BANK_ACCOUNT) {
                bank_account = CAST_TO_BANK_ACCOUNT(bank_element);
-               /*
-                * Return BANK_ACCOUNT_NULL if the ba_holder is same as ba_merchant
-                * and bank account thread group is same as current thread group
-                * i.e. ba_merchant's thread group.
-                *
-                * The bank account might have ba_holder same as ba_merchant but different
-                * thread group if daemon sends a voucher to an App and then App sends the
-                * same voucher back to the daemon (IPC code will replace thread group in the
-                * voucher to App's thread group when it gets auto redeemed by the App).
-                */
-               if (bank_account->ba_holder != bank_account->ba_merchant ||
-                   bank_get_bank_account_thread_group(bank_account) !=
-                   bank_get_bank_task_thread_group(bank_account->ba_merchant)) {
-                       return bank_account;
-               } else {
-                       return BANK_ACCOUNT_NULL;
-               }
+               return bank_account;
        } else {
                panic("Bogus bank type: %d passed in bank_get_voucher_bank_account\n", bank_element->be_type);
        }
@@ -1544,30 +1723,61 @@ bank_get_bank_account_thread_group(bank_account_t bank_account __unused)
 }
 
 /*
- * Routine: bank_get_bank_ledger_and_thread_group
- * Purpose: Get the bankledger (chit) and thread group from the voucher.
- * Returns: bankledger and thread group if bank_account attribute present in voucher.
- *
+ * Routine: bank_get_bank_ledger_thread_group_and_persona
+ * Purpose: Get the bankledger (chit), thread group and persona id from the voucher.
+ * Returns: bankledger and thread group if a bank_account attribute is
+ *          present in the voucher, and the persona_id in any case.
  */
 kern_return_t
-bank_get_bank_ledger_and_thread_group(
+bank_get_bank_ledger_thread_group_and_persona(
        ipc_voucher_t     voucher,
        ledger_t          *bankledger,
-       struct thread_group **banktg)
+       struct thread_group **banktg,
+       uint32_t *persona_id)
 {
        bank_account_t bank_account;
+       bank_task_t bank_task;
        struct thread_group *thread_group = NULL;
 
        bank_account = bank_get_voucher_bank_account(voucher);
-       *bankledger = bank_get_bank_account_ledger(bank_account);
-       thread_group = bank_get_bank_account_thread_group(bank_account);
+       bank_task = get_bank_task_context(current_task(), FALSE);
+       if (persona_id != NULL) {
+               if (bank_account != BANK_ACCOUNT_NULL) {
+                       *persona_id = bank_account->ba_so_persona_id;
+               } else {
+                       *persona_id = bank_task->bt_persona_id;
+               }
+       }
+       /*
+        * Use BANK_ACCOUNT_NULL if the ba_holder is the same as ba_merchant
+        * and the bank account thread group is the same as the current thread
+        * group, i.e. ba_merchant's thread group.
+        *
+        * The bank account might have ba_holder same as ba_merchant but a different
+        * thread group if a daemon sends a voucher to an App and the App then sends
+        * the same voucher back to the daemon (the IPC code replaces the thread group
+        * in the voucher with the App's thread group when the App auto-redeems it).
+        */
+       if ((bank_account != NULL) &&
+           (bank_account->ba_holder == bank_account->ba_merchant) &&
+           (bank_get_bank_account_thread_group(bank_account) ==
+           bank_get_bank_task_thread_group(bank_account->ba_merchant))) {
+               bank_account = BANK_ACCOUNT_NULL;
+       }
 
-       /* Return NULL thread group if voucher has current task's thread group */
-       if (thread_group == bank_get_bank_task_thread_group(
-                   get_bank_task_context(current_task(), FALSE))) {
-               thread_group = NULL;
+       if (bankledger != NULL) {
+               *bankledger = bank_get_bank_account_ledger(bank_account);
+       }
+
+       if (banktg != NULL) {
+               thread_group = bank_get_bank_account_thread_group(bank_account);
+
+               /* Return NULL thread group if voucher has current task's thread group */
+               if (thread_group == bank_get_bank_task_thread_group(bank_task)) {
+                       thread_group = NULL;
+               }
+               *banktg = thread_group;
        }
-       *banktg = thread_group;
        return KERN_SUCCESS;
 }
 
@@ -1645,3 +1855,23 @@ bank_swap_thread_bank_ledger(thread_t thread __unused, ledger_t new_ledger __unu
                    effective_energy_consumed);
        }
 }
+
+/*
+ * Routine: bank_verify_persona_id
+ * Purpose: Verifies if the persona id is valid
+ *
+ * The caller should check if the task is entitled
+ * to do the lookup.
+ */
+static boolean_t
+bank_verify_persona_id(uint32_t persona_id)
+{
+       /* A successful lookup implies that the persona id is valid */
+       void *persona = persona_lookup(persona_id);
+       if (!persona) {
+               return FALSE;
+       }
+       persona_put(persona);
+
+       return TRUE;
+}
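
The new MACH_VOUCHER_ATTR_BANK_MODIFY_PERSONA case above consumes a struct
persona_modify_info carried as the recipe content, requires the
com.apple.private.persona.modify entitlement, and rejects the request unless
unique_pid matches the bank holder. A minimal, hypothetical userspace sketch
of driving the new command (the helper name, the voucher argument and the
target ids are assumptions; host_create_mach_voucher() and
mach_voucher_attr_recipe_data_t are the standard voucher-recipe interfaces):

    #include <mach/mach.h>
    #include <bank/bank_types.h>

    /* Hypothetical: re-redeem `bank_voucher` under a different persona.
     * The voucher's bank attribute must resolve to the holder's bank task. */
    static kern_return_t
    redeem_with_persona(mach_port_name_t bank_voucher, uint32_t target_persona,
        uint64_t holder_unique_pid, ipc_voucher_t *voucher_out)
    {
            struct {
                    mach_voucher_attr_recipe_data_t recipe;
                    struct persona_modify_info      pmi;   /* recipe content */
            } __attribute__((packed)) r = {
                    .recipe = {
                            .key              = MACH_VOUCHER_ATTR_KEY_BANK,
                            .command          = MACH_VOUCHER_ATTR_BANK_MODIFY_PERSONA,
                            .previous_voucher = bank_voucher,
                            .content_size     = sizeof(struct persona_modify_info),
                    },
                    .pmi = {
                            .persona_id = target_persona,
                            .unique_pid = holder_unique_pid, /* must match the holder */
                    },
            };

            return host_create_mach_voucher(mach_host_self(),
                (mach_voucher_attr_raw_recipe_array_t)&r, sizeof(r), voucher_out);
    }

On success bank_get_value() hands back either the holder's own bank task (when
the persona id is unchanged) or a bank account freshly allocated by
bank_account_alloc_init() with the new persona id.
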
index f20d0995098970a1f74124f21f05509272d607da..f78a64dda94f5a66b506b542a0095baa8a41a588 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -69,8 +69,8 @@ struct bank_task {
        ledger_t                  bt_ledger;               /* Ledger of the customer task */
        queue_head_t              bt_accounts_to_pay;      /* List of accounts worked for me and need to pay */
        queue_head_t              bt_accounts_to_charge;   /* List of accounts I did work and need to charge */
-       decl_lck_mtx_data(, bt_acc_to_pay_lock)            /* Lock to protect accounts to pay list */
-       decl_lck_mtx_data(, bt_acc_to_charge_lock)         /* Lock to protect accounts to charge list */
+       decl_lck_mtx_data(, bt_acc_to_pay_lock);           /* Lock to protect accounts to pay list */
+       decl_lck_mtx_data(, bt_acc_to_charge_lock);        /* Lock to protect accounts to charge list */
        uint8_t                   bt_hasentitlement;       /* If the secure persona entitlement is set on the task */
 #if DEVELOPMENT || DEBUG
        queue_chain_t             bt_global_elt;           /* Element on the global bank task chain */
@@ -108,13 +108,13 @@ typedef struct bank_task * bank_task_t;
                (OSAddAtomic(-(num), &(elem)->bt_refs))
 
 #define bank_task_made_reference(elem)  \
-               (hw_atomic_add(&(elem)->bt_made, 1) - 1)
+               (os_atomic_inc_orig(&(elem)->bt_made, relaxed))
 
 #define bank_task_made_release(elem)    \
-               (hw_atomic_sub(&(elem)->bt_made, 1) + 1)
+               (os_atomic_dec_orig(&(elem)->bt_made, relaxed))
 
 #define bank_task_made_release_num(elem, num)   \
-               (hw_atomic_sub(&(elem)->bt_made, (num)) + (num))
+               (os_atomic_sub_orig(&(elem)->bt_made, (num), relaxed))
 
 
 struct bank_account {
@@ -129,6 +129,8 @@ struct bank_account {
 #if DEVELOPMENT || DEBUG
        queue_chain_t       ba_global_elt;           /* Element on the global account chain */
 #endif
+       uint32_t            ba_so_persona_id;        /* Persona ID of ba_secureoriginator,
+                                                     *  unless modified by an entitled process */
 };
 
 #define ba_type             ba_elem.be_type
@@ -153,13 +155,13 @@ typedef struct bank_account * bank_account_t;
                (OSAddAtomic(-(num), &(elem)->ba_refs))
 
 #define bank_account_made_reference(elem)       \
-               (hw_atomic_add(&(elem)->ba_made, 1) - 1)
+               (os_atomic_inc_orig(&(elem)->ba_made, relaxed))
 
 #define bank_account_made_release(elem)         \
-               (hw_atomic_sub(&(elem)->ba_made, 1) + 1)
+               (os_atomic_dec_orig(&(elem)->ba_made, relaxed))
 
 #define bank_account_made_release_num(elem, num)        \
-               (hw_atomic_sub(&(elem)->ba_made, (num)) + (num))
+               (os_atomic_sub_orig(&(elem)->ba_made, (num), relaxed))
 
 struct _bank_ledger_indices {
        int cpu_time;
@@ -175,8 +177,8 @@ extern void bank_billed_balance_safe(task_t task, uint64_t *cpu_time, uint64_t *
 extern void bank_billed_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energy);
 extern void bank_serviced_balance_safe(task_t task, uint64_t *cpu_time, uint64_t *energy);
 extern void bank_serviced_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energy);
-extern kern_return_t bank_get_bank_ledger_and_thread_group(ipc_voucher_t voucher,
-    ledger_t *bankledger, struct thread_group **banktg);
+extern kern_return_t bank_get_bank_ledger_thread_group_and_persona(ipc_voucher_t voucher,
+    ledger_t *bankledger, struct thread_group **banktg, uint32_t *persona_id);
 extern void bank_swap_thread_bank_ledger(thread_t thread, ledger_t ledger);
 
 #endif /* MACH_KERNEL_PRIVATE */
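
With the widened signature, bank_get_bank_ledger_thread_group_and_persona()
null-checks each out-parameter individually, so in-kernel callers that need
only one of the three results can pass NULL for the others. A hypothetical
caller sketch (the `voucher` variable is an assumption):

    /* Only the persona id is needed; skip the ledger and thread group. */
    uint32_t persona_id = 0;
    kern_return_t kr = bank_get_bank_ledger_thread_group_and_persona(
            voucher, NULL, NULL, &persona_id);

Note also that the bt_made/ba_made macros now return the pre-increment value
via os_atomic_inc_orig() and friends, which matches the old
hw_atomic_add(&x, 1) - 1 arithmetic exactly.
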
index 51c40830d56728df4a870fa342134fb19d643bf9..7483b2a68e389fe9372438798c2a10a70d65179a 100644 (file)
 
 #define MACH_VOUCHER_ATTR_BANK_NULL             ((mach_voucher_attr_recipe_command_t)601)
 #define MACH_VOUCHER_ATTR_BANK_CREATE           ((mach_voucher_attr_recipe_command_t)610)
+#define MACH_VOUCHER_ATTR_BANK_MODIFY_PERSONA   ((mach_voucher_attr_recipe_command_t)611)
 
 #define MACH_VOUCHER_BANK_CONTENT_SIZE (500)
 
 typedef uint32_t bank_action_t;
 #define BANK_ORIGINATOR_PID     0x1
 #define BANK_PERSONA_TOKEN      0x2
+#define BANK_PERSONA_ID         0x3
 
 struct proc_persona_info {
        uint64_t unique_pid;
@@ -57,8 +59,15 @@ struct persona_token {
        struct proc_persona_info proximate;
 };
 
+struct persona_modify_info {
+       uint32_t persona_id;
+       uint64_t unique_pid;
+};
+
 #ifdef PRIVATE
-#define ENTITLEMENT_PERSONA_PROPAGATE "com.apple.private.personas.propagate"
+/* Redeem a bank voucher on behalf of another process while changing the persona */
+#define ENTITLEMENT_PERSONA_MODIFY    "com.apple.private.persona.modify"
+#define ENTITLEMENT_PERSONA_NO_PROPAGATE "com.apple.private.personas.no.propagate"
 #endif /* PRIVATE */
 
 #endif /* _BANK_BANK_TYPES_H_ */
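
BANK_PERSONA_ID pairs with the bank_command() case added earlier: the kernel
copies a single uint32_t persona id out of the voucher's bank attribute. A
hypothetical userspace query (the voucher name `v` is assumed to carry a bank
attribute; mach_voucher_attr_command() is the standard attribute-command call):

    uint32_t persona_id = 0;
    mach_voucher_attr_content_size_t size = sizeof(persona_id);

    kern_return_t kr = mach_voucher_attr_command(v,
        MACH_VOUCHER_ATTR_KEY_BANK, BANK_PERSONA_ID,
        NULL, 0,                                  /* no input content */
        (mach_voucher_attr_content_t)&persona_id, &size);
    if (kr == KERN_SUCCESS && size == sizeof(persona_id)) {
            /* persona_id holds ba_so_persona_id (or the bank task's id) */
    }
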
index 7bd79d9ae78b20528784ccb5d5684d4a6bcb579f..05c4b79cf0cdde381cf245f419ccc25af4fec458 100644 (file)
@@ -23,7 +23,7 @@ endif
 
 $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS)
        $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG)
-       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG);
+       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG)
 
 do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
        $(_v)${MAKE} \
@@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
                SOURCE=$(subst conf/,,$(SOURCE))                        \
                TARGET=${TARGET}                                        \
                OBJPATH=${OBJPATH}                                      \
-               build_all;
+               build_all
 
 do_build_all:: do_all
 
index d296cb6a72917e91169b825a0dbd4e59dbca5ef6..2db9fb56696973f3fa95945222e2a549aadd2399 100644 (file)
@@ -76,6 +76,14 @@ OBJS_NO_SIGN_COMPARE =               \
 $(foreach file,$(OBJS_NO_CAST_ALIGN),$(eval $(call add_perfile_cflags,$(file),-Wno-cast-align)))
 $(foreach file,$(OBJS_NO_SIGN_COMPARE),$(eval $(call add_perfile_cflags,$(file),-Wno-sign-compare)))
 
+ifeq ($(KSANCOV),1)
+# Don't instrument functions called by the ksancov runtime. SanitizerCoverage does
+# not support blacklists, so exclude the whole file.
+machine_routines.o_CFLAGS_RM = $(KSANCOV_CFLAGS)
+machine_routines_common.o_CFLAGS_RM = $(KSANCOV_CFLAGS)
+pcb_native.o_CFLAGS_RM = $(KSANCOV_CFLAGS)
+endif
+
 #
 # XXX: INCFLAGS to include libsa prototypes
 #
@@ -138,9 +146,9 @@ $(COMPONENT).filelist: $(OBJS)
                 $(SEG_HACK) -n __HIB -o $${hib_file}__ $${hib_file} || exit 1; \
                 mv $${hib_file}__ $${hib_file} || exit 1; \
        done
-       @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)"
+       $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0))
        $(_v)for obj in ${OBJS}; do     \
-                echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
+                $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
        done > $(COMPONENT).filelist
 
 do_all: $(COMPONENT).filelist
@@ -162,7 +170,7 @@ endif
 -include genassym.d
 genassym.o: .CFLAGS $(firstword $(MAKEFILE_LIST))
 genassym.o: $(SOURCE_DIR)/$(COMPONENT)/$(GENASSYM_LOCATION)/genassym.c
-       @echo "[$(CMD_MC)] $(ColorH)GENASSYM$(Color0)      $(ColorLF)$<$(Color0)"
+       $(call makelog,[$(CMD_MC)] $(ColorH)GENASSYM$(Color0)      $(ColorLF)$<$(Color0))
        $(_v)${GENASSYM_KCC} ${CFLAGS} ${CFLAGS_NOLTO_FLAG} -MD -S -o ${@} ${INCFLAGS} $<
 
 assym.s: genassym.o
index 57759351c04767edcd3b13f2d826c13cc02f278d..98df09944b1d797e2538a138111b66c07692e556 100644 (file)
@@ -2,7 +2,7 @@
 #BEGIN Machine dependent Makefile fragment for x86_64
 ######################################################################
 
-CWARNFLAGS = $(CWARNFLAGS_STD) -Wshorten-64-to-32
+CWARNFLAGS = $(CWARNFLAGS_STD) -Wshorten-64-to-32 -Wno-atomic-implicit-seq-cst
 
 # Files that must go in the __HIB segment:
 UNCONFIGURED_HIB_FILES=                                        \
index d7a06429a83adec0186cccec9ee2adc9b183722d..4c6f803a06a924f0f5a5e99c6bec6e0935a14e9a 100644 (file)
@@ -48,7 +48,6 @@ OPTIONS/mach_pagemap          optional mach_pagemap
 OPTIONS/mach_vm_debug          optional mach_vm_debug
 OPTIONS/mach_page_hash_stats    optional mach_page_hash_stats
 OPTIONS/mig_debug              optional mig_debug
-OPTIONS/xpr_debug              optional xpr_debug
 OPTIONS/zone_debug             optional zone_debug
 OPTIONS/vm_cpm                 optional vm_cpm
 OPTIONS/task_swapper           optional task_swapper
@@ -112,6 +111,7 @@ osfmk/ipc/mach_msg.c                        standard
 osfmk/ipc/mach_port.c                  standard
 osfmk/ipc/mig_log.c                    optional mig_debug
 osfmk/kern/affinity.c                  standard
+osfmk/kern/arcade.c                            optional config_arcade
 osfmk/kern/ast.c                       standard
 osfmk/kern/audit_sessionport.c         optional config_audit
 osfmk/kern/backtrace.c                 standard
@@ -123,6 +123,7 @@ osfmk/kern/coalition.c                      optional config_coalitions
 osfmk/kern/counters.c                  standard
 osfmk/kern/cpu_quiesce.c               optional config_quiesce_counter
 osfmk/kern/debug.c                     standard
+osfmk/kern/ecc_logging.c                       optional config_ecc_logging
 osfmk/kern/energy_perf.c               standard
 osfmk/kern/exception.c         standard
 osfmk/kern/extmod_statistics.c         standard
@@ -136,24 +137,26 @@ osfmk/kern/ipc_misc.c                     standard
 osfmk/kern/ipc_sync.c                  standard
 osfmk/kern/ipc_tt.c                    standard
 osfmk/kern/kalloc.c                    standard
-osfmk/kern/ecc_logging.c                       optional config_ecc_logging
 osfmk/kern/ktrace_background_notify.c  standard
 osfmk/kern/ledger.c                    standard
 osfmk/kern/locks.c                     standard
 osfmk/kern/tlock.c                     standard
 osfmk/kern/ltable.c                    standard
-osfmk/kern/machine.c                   standard
 osfmk/kern/mach_node.c                 standard
+osfmk/kern/machine.c                   standard
 osfmk/kern/mk_sp.c                     standard
 osfmk/kern/mk_timer.c          standard
+osfmk/kern/mpsc_queue.c                standard
 osfmk/kern/page_decrypt.c      standard
 osfmk/kern/printf.c                    standard
 osfmk/kern/priority.c                  standard
 osfmk/kern/priority_queue.c            standard
 osfmk/kern/processor.c         standard
 osfmk/kern/processor_data.c            standard
+osfmk/kern/restartable.c               standard
 osfmk/kern/sched_average.c             standard
 osfmk/kern/sched_dualq.c       optional config_sched_multiq
+osfmk/kern/sched_clutch.c      optional config_clutch
 osfmk/kern/sched_prim.c                standard
 osfmk/kern/sched_proto.c       optional config_sched_proto
 osfmk/kern/sched_traditional.c optional config_sched_traditional
@@ -172,7 +175,9 @@ osfmk/kern/task.c                   standard
 osfmk/kern/task_policy.c       standard
 osfmk/kern/task_swap.c         standard
 osfmk/kern/test_lock.c         optional development
-osfmk/kern/test_lock.c          optional debug
+osfmk/kern/test_lock.c         optional debug
+osfmk/kern/test_mpsc_queue.c   optional development
+osfmk/kern/test_mpsc_queue.c   optional debug
 osfmk/kern/thread.c                    standard
 osfmk/kern/thread_act.c                standard
 osfmk/kern/thread_call.c       standard
@@ -184,7 +189,6 @@ osfmk/kern/turnstile.c      standard
 osfmk/kern/ux_handler.c                standard
 osfmk/kern/waitq.c                     standard
 osfmk/kern/work_interval.c             standard
-osfmk/kern/xpr.c                       optional xpr_debug
 osfmk/kern/zalloc.c                    standard
 osfmk/kern/zcache.c            optional config_zcache
 osfmk/kern/gzalloc.c           optional config_gzalloc
@@ -214,6 +218,7 @@ osfmk/kern/copyout_shim.c   optional copyout_shim
 ./mach/memory_entry_server.c           standard
 ./mach/memory_object_control_server.c  standard
 ./mach/resource_notify_user.c          standard
+./mach/restartable_server.c            standard
 ./mach/upl_server.c                    standard
 ./mach/audit_triggers_user.c           standard
 ./mach/task_access_user.c              standard
@@ -234,6 +239,12 @@ osfmk/atm/atm.c                    optional config_atm
 osfmk/voucher/ipc_pthread_priority.c           standard
 ./mach/coalition_notification_user.c   optional config_coalitions
 ./mach/sysdiagnose_notification_user.c optional config_sysdiagnose
+./mach/sysdiagnose_notification_user.c optional config_sysdiagnose
+./mach/vfs_nspace_user.c standard
+./mach/fairplayd_notification_user.c optional config_arcade
+./mach/arcade_upcall_user.c optional config_arcade
+./mach/arcade_register_server.c optional config_arcade
+
 #
 # For now, no external pagers
 #
@@ -317,19 +328,17 @@ osfmk/console/video_console.c     optional        video_console
 osfmk/kern/telemetry.c                 optional config_telemetry
 
 # Built-in corecrypto for early_random():
-osfmk/corecrypto/cc/src/cc_clear.c                     standard
-osfmk/corecrypto/cc/src/cc_cmp_safe.c                          standard
-osfmk/corecrypto/cc/src/cc_try_abort.c                 standard
-osfmk/corecrypto/ccdbrg/src/ccdrbg_nisthmac.c          standard
-osfmk/corecrypto/ccdigest/src/ccdigest_init.c          standard
-osfmk/corecrypto/ccdigest/src/ccdigest_update.c                standard
-osfmk/corecrypto/cchmac/src/cchmac.c                   standard
-osfmk/corecrypto/cchmac/src/cchmac_init.c              standard
-osfmk/corecrypto/cchmac/src/cchmac_update.c            standard
-osfmk/corecrypto/cchmac/src/cchmac_final.c             standard
+osfmk/corecrypto/cc/src/cc_clear.c     standard
+osfmk/corecrypto/cc/src/cc_cmp_safe.c  standard
+osfmk/corecrypto/cc/src/cc_abort.c     standard
+osfmk/corecrypto/ccdbrg/src/ccdrbg_nisthmac.c  standard
+osfmk/corecrypto/ccdigest/src/ccdigest_init.c  standard
+osfmk/corecrypto/ccdigest/src/ccdigest_update.c        standard
+osfmk/corecrypto/cchmac/src/cchmac.c   standard
+osfmk/corecrypto/cchmac/src/cchmac_init.c      standard
+osfmk/corecrypto/cchmac/src/cchmac_update.c    standard
+osfmk/corecrypto/cchmac/src/cchmac_final.c     standard
 osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c      standard
-osfmk/corecrypto/ccsha1/src/ccsha1_eay.c               standard
-osfmk/corecrypto/ccsha1/src/ccsha1_initial_state.c     standard
 
 osfmk/corecrypto/ccsha2/src/ccsha256_di.c      standard
 osfmk/corecrypto/ccsha2/src/ccsha256_initial_state.c   standard
@@ -337,4 +346,4 @@ osfmk/corecrypto/ccsha2/src/ccsha256_K.c    standard
 osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c    standard
 osfmk/corecrypto/ccsha2/src/ccsha256_ltc_di.c  standard
 
-osfmk/prng/prng_random.c               standard
+osfmk/prng/prng_random.c       standard
index 11ca5662789aedc5141433ea33d63bfcc60f6518..777e24bc6f6163e09d358cddfd839833c5961e42 100644 (file)
@@ -72,6 +72,3 @@ osfmk/kperf/arm/kperf_mp.c      optional kperf
 osfmk/arm/kpc_arm.c            optional kpc
 
 osfmk/arm/monotonic_arm.c optional monotonic
-
-# Support for early_random()
-osfmk/corecrypto/ccn/src/arm/ccn_set.s standard
index 239a78423bb9f7bf7f47f152e61f0720603a5129..2fb849e8b01ac542c1f5750c9aabe4cbe2f0f226 100644 (file)
@@ -36,6 +36,7 @@ osfmk/arm/io_map.c            standard
 osfmk/arm64/loose_ends.c       standard
 osfmk/arm/locks_arm.c  standard
 osfmk/arm64/locore.s   standard
+osfmk/arm64/gxf_exceptions.s   standard
 osfmk/arm64/lowmem_vectors.c   standard
 osfmk/arm64/sleh.c                     standard
 osfmk/arm64/start.s    standard
@@ -77,14 +78,12 @@ osfmk/arm64/kpc.c           optional kpc
 
 osfmk/arm64/monotonic_arm64.c optional monotonic
 
-osfmk/arm64/platform_tests.c   optional config_xnupost
+osfmk/arm64/platform_tests.c                   optional config_xnupost
+osfmk/arm64/platform_tests_asm.s               optional config_xnupost
 
 osfmk/arm64/alternate_debugger.c               optional alternate_debugger
 osfmk/arm64/alternate_debugger_asm.s           optional alternate_debugger
 
-# Support for early_random()
-osfmk/corecrypto/ccn/src/ccn_set.c             standard
-
 osfmk/arm64/pgtrace.c           standard
 osfmk/arm64/pgtrace_decoder.c   optional config_pgtrace_nonkext
 osfmk/arm64/machine_remote_time.c optional config_mach_bridge_recv_time
index dfe06058dfebb148a3512e8906e0e60c4b9a7967..dd89ec483f0e6d940b46a98694ad90a2fd306279 100644 (file)
@@ -2,8 +2,6 @@ OPTIONS/fb                              optional fb
 
 OPTIONS/debug                  optional debug
 
-OPTIONS/gprof          optional gprof
-
 osfmk/vm/vm_apple_protect.c     standard
 
 #osfmk/x86_64/hi_res_clock_map.c       optional hi_res_clock
@@ -85,12 +83,6 @@ osfmk/i386/acpi.c            standard
 
 osfmk/i386/mtrr.c              optional    config_mtrr
 
-
-#osfmk/profiling/x86_64/profile-md.c   optional gprof
-#osfmk/profiling/x86_64/profile-asm.s  optional gprof
-#osfmk/profiling/profile-kgmon.c               optional gprof
-#osfmk/profiling/profile-mk.c          optional gprof
-
 osfmk/kdp/ml/x86_64/kdp_machdep.c      optional        mach_kdp
 osfmk/kdp/ml/x86_64/kdp_vm.c           optional        mach_kdp
 osfmk/kdp/ml/i386/kdp_x86_common.c     optional        mach_kdp
index 160ab8fb1802ee4d735aa6c08d4f157900d9009c..cc887d62d03d185395cb4082902fa47b496f478f 100644 (file)
@@ -310,10 +310,10 @@ _cnputs(char * c, int size)
        }
 
        while (size-- > 0) {
-               cons_ops[cons_ops_index].putc(0, 0, *c);
                if (*c == '\n') {
                        cons_ops[cons_ops_index].putc(0, 0, '\r');
                }
+               cons_ops[cons_ops_index].putc(0, 0, *c);
                c++;
        }
 
@@ -407,7 +407,7 @@ console_ring_try_empty(void)
                boolean_t state = ml_set_interrupts_enabled(FALSE);
 
                /* Indicate that we're in the process of writing a block of data to the console. */
-               (void)hw_atomic_add(&console_output, 1);
+               os_atomic_inc(&console_output, relaxed);
 
                simple_lock_try_lock_loop(&console_ring.write_lock, LCK_GRP_NULL);
 
@@ -430,7 +430,7 @@ console_ring_try_empty(void)
 
                simple_unlock(&console_ring.write_lock);
 
-               (void)hw_atomic_sub(&console_output, 1);
+               os_atomic_dec(&console_output, relaxed);
 
                simple_unlock(&console_ring.read_lock);
 
@@ -658,7 +658,7 @@ vcgetc(__unused int l, __unused int u, __unused boolean_t wait, __unused boolean
 {
        char c;
 
-       if (0 == (*PE_poll_input)(0, &c)) {
+       if (0 == PE_stub_poll_input(0, &c)) {
                return c;
        } else {
                return 0;
@@ -681,7 +681,7 @@ alloc_free_func(void * arg, wait_result_t wres __unused)
        T_LOG("Doing %d iterations of console cpu alloc and free.", count);
 
        while (count-- > 0) {
-               (void)hw_atomic_add(&cons_test_ops_count, 1);
+               os_atomic_inc(&cons_test_ops_count, relaxed);
                cbp = (console_buf_t *)console_cpu_alloc(0);
                if (cbp == NULL) {
                        T_ASSERT_NOTNULL(cbp, "cpu allocation failed");
@@ -702,7 +702,7 @@ log_to_console_func(void * arg __unused, wait_result_t wres __unused)
        uint64_t thread_id = current_thread()->thread_id;
        char somedata[10] = "123456789";
        for (int i = 0; i < 26; i++) {
-               (void)hw_atomic_add(&cons_test_ops_count, 1);
+               os_atomic_inc(&cons_test_ops_count, relaxed);
                printf(" thid: %llu printf iteration %d\n", thread_id, i);
                cnputc_unbuffered((char)('A' + i));
                cnputc_unbuffered('\n');
index fb2d0d6c506b86a071e7f4abf06e00e1a5abb1f4..7a8cfeb00c86dcaee98413cd24daa8fd9867ee85 100644 (file)
@@ -72,9 +72,9 @@ serial_keyboard_start(void)
 {
        /* Go see if there are any characters pending now */
        serial_keyboard_poll();
-       panic("serial_keyboard_start: we can't get back here\n");
 }
 
+__dead2
 void
 serial_keyboard_poll(void)
 {
index 68f4a21e7d5b537a1dc364797d0feec3faa5c9a3..e508ee10bf6b548743423fd509f5cc17fe9d604b 100644 (file)
@@ -41,8 +41,8 @@ extern "C" {
 
 
 void serial_keyboard_init(void);
-void serial_keyboard_start(void);
-void serial_keyboard_poll(void);
+void serial_keyboard_start(void) __dead2;
+void serial_keyboard_poll(void) __dead2;
 
 extern uint32_t serialmode;
 
index 2034f1e51755faacc551a9e3411b33935e06dd9a..6a2131ca07d3f260a4576697118bd7025068a2cb 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * @OSF_FREE_COPYRIGHT@
- * 
+ *
  */
 /*
  * @APPLE_FREE_COPYRIGHT@
  */
 /*
- *     NetBSD: ite.c,v 1.16 1995/07/17 01:24:34 briggs Exp     
+ *     NetBSD: ite.c,v 1.16 1995/07/17 01:24:34 briggs Exp
  *
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1990, 1993
@@ -196,7 +196,7 @@ MACRO_END
 #endif
 
 /*
-# Attribute codes: 
+# Attribute codes:
 # 00=none 01=bold 04=underscore 05=blink 07=reverse 08=concealed
 # Text color codes:
 # 30=black 31=red 32=green 33=yellow 34=blue 35=magenta 36=cyan 37=white
@@ -241,7 +241,7 @@ enum vt100state_e {
 } gc_vt100state = ESnormal;
 
 
-enum 
+enum
 {
     /* secs */
     kProgressAcquireDelay   = 0,
@@ -295,7 +295,7 @@ static void gc_set_tab_stop(unsigned int column, boolean_t enabled);
 static void gc_show_cursor(unsigned int xx, unsigned int yy);
 static void gc_update_color(int color, boolean_t fore);
 
-static void 
+static void
 gc_clear_line(unsigned int xx, unsigned int yy, int which)
 {
        unsigned int start, end, i;
@@ -329,7 +329,7 @@ gc_clear_line(unsigned int xx, unsigned int yy, int which)
        }
 }
 
-static void 
+static void
 gc_clear_screen(unsigned int xx, unsigned int yy, int top, unsigned int bottom,
                int which)
 {
@@ -538,7 +538,7 @@ gc_paint_char(unsigned int xx, unsigned int yy, unsigned char ch, int attrs)
        if ( xx < gc_buffer_columns && yy < gc_buffer_rows )
        {
                uint32_t index = (yy * gc_buffer_columns) + xx;
- 
+
                gc_buffer_attributes[index] = attrs;
                gc_buffer_characters[index] = ch;
                gc_buffer_colorcodes[index] = gc_color_code;
@@ -547,7 +547,7 @@ gc_paint_char(unsigned int xx, unsigned int yy, unsigned char ch, int attrs)
        gc_ops.paint_char(xx, yy, ch, attrs, 0, 0);
 }
 
-static void 
+static void
 gc_putchar(char ch)
 {
        if (!ch) {
@@ -1871,7 +1871,7 @@ static int8_t                     vc_uiscale = 1;
 vc_progress_user_options        vc_progress_options;
 vc_progress_user_options        vc_user_options;
 
-decl_simple_lock_data(,vc_progress_lock)
+decl_simple_lock_data(,vc_progress_lock);
 
 #if !CONFIG_EMBEDDED
 static int                     vc_progress_withmeter = 3;
@@ -3306,6 +3306,3 @@ vc_set_progressmeter(int new_value)
 }
 
 #endif /* !CONFIG_EMBEDDED */
-
-
-
diff --git a/osfmk/corecrypto/cc/src/cc_abort.c b/osfmk/corecrypto/cc/src/cc_abort.c
new file mode 100644 (file)
index 0000000..726af16
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ *  cc_abort.c
+ *  corecrypto
+ *
+ *  Created on 3/9/2019
+ *
+ *  Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <corecrypto/cc_priv.h>
+
+//cc_abort() is implemented to comply with FIPS 140-2: it is called when the
+//DRBG produces two equal consecutive blocks. See rdar://19129408
+
+#if !CC_PROVIDES_ABORT
+
+#error "This environment does not provide an abort()/panic()-like function"
+
+#elif CC_KERNEL
+
+#include <kern/debug.h>
+void
+cc_abort(const char * msg)
+{
+       panic("%s", msg);
+}
+
+#elif CC_USE_L4
+
+#include <sys/panic.h>
+#include <stdarg.h>
+void
+cc_abort(const char * msg)
+{
+       sys_panic(msg);
+}
+
+#elif CC_RTKIT
+
+#include <RTK_platform.h>
+void
+cc_abort(const char * msg)
+{
+       RTK_abort("%s", msg);
+}
+
+#else
+
+#include <stdlib.h>
+void
+cc_abort(const char * msg CC_UNUSED)
+{
+       abort();
+}
+
+#endif
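
cc_abort() supersedes the deleted cc_try_abort() (below), which could silently
return on platforms without a panic primitive; the replacement instead turns a
missing abort into a build-time #error. A hypothetical sketch of the FIPS
140-2 continuous-test call site described in the header comment, using
cc_cmp_safe() from this same commit (the buffer names are assumptions):

    /* Abort if the DRBG emits two identical consecutive output blocks. */
    if (cc_cmp_safe(block_nbytes, prev_block, cur_block) == 0) {
            cc_abort("ccdrbg: generated two equal consecutive blocks");
    }
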
index 1733f9a538bffddccad45590cfe8718dac416f6e..db21af6c91cf5d657c452df05d33732a713e6c13 100644 (file)
@@ -66,6 +66,6 @@ cc_clear(size_t len, void *dst)
 /* This is an alternative for clang that should work
  *  void cc_clear(size_t len, void *dst) __attribute__ ((optnone))
  *  {
- *  cc_zero(len,dst);
+ *  cc_clear(len,dst);
  *  }
  */
index ee9efab1184959f5d7b5727728f6548d6e81bd32..dcafb1e11d4b706d049c1648de0c4cbbc8d2b9c3 100644 (file)
@@ -44,6 +44,6 @@ cc_cmp_safe(size_t num, const void * ptr1, const void * ptr2)
        for (i = 0; i < num; i++) {
                flag |= (s[i] ^ t[i]);
        }
-       HEAVISIDE_STEP_UINT8(flag, flag); // flag=(flag==0)?0:1;
+       CC_HEAVISIDE_STEP(flag, flag); // flag=(flag==0)?0:1;
        return flag; // 0 iff all bytes were equal, 1 if there is any difference
 }
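
CC_HEAVISIDE_STEP collapses the accumulated XOR difference to exactly 0 or 1
without branching, keeping the comparison constant-time. A hypothetical
branch-free equivalent of what the macro computes for a byte (the actual
corecrypto definition may differ):

    /* r = (x == 0) ? 0 : 1, with no data-dependent branch. */
    static inline uint8_t
    heaviside_step_u8(uint8_t x)
    {
            return (uint8_t)((x | (uint8_t)-x) >> 7);
    }
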
diff --git a/osfmk/corecrypto/cc/src/cc_try_abort.c b/osfmk/corecrypto/cc/src/cc_try_abort.c
deleted file mode 100644 (file)
index 31a07ba..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- *  cc_try_abort.c
- *  corecrypto
- *
- *  Created on 7/16/2015
- *
- *  Copyright (c) 2014,2015 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include <corecrypto/cc_priv.h>
-
-//cc_try_abort() is implemented to comply with by FIPS 140-2, when DRBG produces
-//two equal consecutive blocks. See radar 19129408
-
-#if CC_KERNEL
-#include <kern/debug.h>
-void
-cc_try_abort(const char * msg CC_UNUSED, ...)
-{
-       panic("%s", msg);
-}
-
-#elif CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKIT || CC_RTKITROM
-void
-cc_try_abort(const char * msg CC_UNUSED, ...)
-{
-       //Do nothing and return because we don't have panic() in those
-       //environments. Make sure you return error, when using cc_try_abort() in above environments
-}
-
-#else
-#include <stdlib.h>
-void
-cc_try_abort(const char * msg CC_UNUSED, ...)
-{
-       abort();
-}
-#endif
index 5757bd413b049156f26b3cdf57791c5906ded0a7..bb85bbf14558638ae9292967262e8111512d8f73 100644 (file)
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+#include <stdbool.h>
+
+#include <corecrypto/cc_priv.h>
 #include <corecrypto/ccdrbg.h>
 #include <corecrypto/cchmac.h>
 #include <corecrypto/ccsha2.h>
-#include <corecrypto/cc_priv.h>
 #include <corecrypto/cc_macros.h>
 
-// Test vectors at:
-//      http://csrc.nist.gov/groups/STM/cavp/#05
-//      http://csrc.nist.gov/groups/STM/cavp/documents/drbg/drbgtestvectors.zip
-//
-
-/*
- *  This HMAC DBRG is described in:
- *
- *  SP 800-90 A Rev. 1 (2nd Draft)
- *  DRAFT Recommendation for Random Number Generation Using Deterministic Random Bit Generators
- *  April 2014
- *
- *
- *  See in particular
- *  - 10.1.2 HMAC_DRBG (p 45)
- *  - B.2 HMAC_DRBGExample (p 83)
- *
- *  We support maximum security strength of 256 bits
- *  Note that the example in B.2 is very limited, refer to §10.1.2 for more
- */
-
-/*
- *  The Get_entropy_input function is specified in pseudocode in [SP 800-90C] for various RBG constructions;
- *  however, in general, the function has the following meaning:
- *  Get_entropy_input: A function that is used to obtain entropy input. The function call is:
- *  (status, entropy_input) = Get_entropy_input (min_entropy, min_ length, max_ length, prediction_resistance_request),
- *  which requests a string of bits (entropy_input) with at least min_entropy bits of entropy. The length for the string
- *  shall be equal to or greater than min_length bits, and less than or equal to max_length bits. The
- *  prediction_resistance_request parameter indicates whether or not prediction resistance is to be provided during the request
- *  (i.e., whether fresh entropy is required). A status code is also returned from the function.
- */
-
-/*
- *  Check the validity of the input parameters.
- *  1. If (requested_instantiation_security_strength > 256), then Return (“Invalid
- *  requested_instantiation_security_strength”, −1).
- *  2. If (len (personalization_string) > 160), then Return (“Personalization_string
- *  too long”, −1)
- *  Comment: Set the security_strength to one of the valid security strengths.
- *  3. If (requested_security_strength ≤ 112), then security_strength = 112 Else (requested_ security_strength ≤ 128), then security_strength = 128 Else (requested_ security_strength ≤ 192), then security_strength = 192 Else security_strength = 256.
- *  Comment: Get the entropy_input and the nonce.
- *  4. min_entropy = 1.5 × security_strength.
- *  5. (status, entropy_input) = Get_entropy_input (min_entropy, 1000).
- *  6. If (status ≠ “Success”), then Return (status, −1).
- */
+// This HMAC DRBG is described in:
 
-/*
- *  1. highest_supported_security_strength = 256.
- *  2. Output block (outlen) = 256 bits.
- *  3. Required minimum entropy for the entropy input at instantiation = 3/2 security_strength (this includes the entropy required for the nonce).
- *  4. Seed length (seedlen) = 440 bits.
- *  5. Maximum number of bits per request (max_number_of_bits_per_request) = 7500
- *  bits.
- *  6. Reseed_interval (reseed_ interval) = 10,000 requests.
- *  7. Maximum length of the personalization string (max_personalization_string_length) = 160 bits.
- *  8. Maximum length of the entropy input (max _length) = 1000 bits.
- */
+// NIST SP 800-90A Rev. 1
+// Recommendation for Random Number Generation Using Deterministic Random Bit Generators
+// June 2015
 
-//
-// Defines below based on 10.1, Table 2: Definitions for Hash-Based DRBG Mechanisms (p 39)
-//
+// See in particular:
+// - 9 DRBG Mechanism Functions
+// - 10.1.2 HMAC_DRBG
+// - B.2 HMAC_DRBGExample
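As context for the reworked implementation below, here is a minimal usage sketch of this DRBG through the public ccdrbg interface. It is a sketch under assumptions: it uses the ccdrbg_init/ccdrbg_generate/ccdrbg_done wrappers from <corecrypto/ccdrbg.h>, and the seed material shown is a placeholder — a real caller must supply genuine entropy of at least output_size/2 bytes.

#include <corecrypto/ccdrbg.h>
#include <corecrypto/ccsha2.h>

static int hmac_drbg_example(uint8_t *out, size_t out_nbytes)
{
	struct ccdrbg_info info;
	struct ccdrbg_nisthmac_custom custom = {
		.di = ccsha256_di(), // HMAC-SHA-256: 32-byte output blocks
		.strictFIPS = 1,     // enforce the reseed interval
	};
	ccdrbg_factory_nisthmac(&info, &custom);

	uint64_t state[128]; // demo-only backing store; real code allocates info.size bytes
	struct ccdrbg_state *ctx = (struct ccdrbg_state *)state;

	// Placeholder seed material -- NOT real entropy.
	uint8_t entropy[32] = { 0 };
	uint8_t nonce[16] = { 0 };

	// out_nbytes must not exceed CCDRBG_MAX_REQUEST_SIZE.
	int status = ccdrbg_init(&info, ctx, sizeof(entropy), entropy,
	    sizeof(nonce), nonce, 0, NULL);
	if (status == CCDRBG_STATUS_OK) {
		status = ccdrbg_generate(&info, ctx, out_nbytes, out, 0, NULL);
	}
	ccdrbg_done(&info, ctx);
	return status;
}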
 
-#define NH_MAX_OUTPUT_BLOCK_SIZE    (CCSHA512_OUTPUT_SIZE)          // 512 bits, i.e. 64 bytes (CCSHA512_OUTPUT_SIZE)
-#define NH_MAX_KEY_SIZE             (CCSHA512_OUTPUT_SIZE)          // 512 bits, i.e. 64 bytes (CCSHA512_OUTPUT_SIZE)
+#define NISTHMAC_MAX_OUTPUT_SIZE (CCSHA512_OUTPUT_SIZE)
 
-#define MIN_REQ_ENTROPY(di)            ((di)->output_size/2)
+#define MIN_REQ_ENTROPY(di) ((di)->output_size / 2)
 
 struct ccdrbg_nisthmac_state {
-       const struct ccdrbg_nisthmac_custom *custom; //ccdrbg_nisthmac_state does not need to store ccdrbg_info. ccdrbg_nisthmac_custom is sufficient
-       size_t bytesLeft;
-       uint64_t reseed_counter; // the reseed counter should be able to hold 2^48. size_t might be smaller than 48 bits
-       size_t  vsize;
-       size_t  keysize;
-       uint8_t v[2 * NH_MAX_OUTPUT_BLOCK_SIZE];
-       uint8_t *vptr;
-       uint8_t *nextvptr;
-       uint8_t key[NH_MAX_KEY_SIZE];
+       const struct ccdrbg_nisthmac_custom *custom;
+       uint8_t key[NISTHMAC_MAX_OUTPUT_SIZE];
+       uint8_t V[NISTHMAC_MAX_OUTPUT_SIZE];
+       uint64_t reseed_counter;
 };
 
 #define DRBG_NISTHMAC_DEBUG 0
 
-
 #if DRBG_NISTHMAC_DEBUG
-#include "cc_debug.h"
+#include <corecrypto/cc_debug.h>
 
 static void
-dumpState(const char *label, struct ccdrbg_nisthmac_state *state)
+dump_state(const char *label, struct ccdrbg_nisthmac_state *drbg_ctx)
 {
-       //cc_print(label, state->vsize, state->nextvptr);
-       cc_print(label, state->vsize, state->vptr);
-       cc_print(label, state->keysize, state->key);
+       size_t outlen = drbg_ctx->custom->di->output_size;
+
+       cc_print(label, outlen, drbg_ctx->key);
+       cc_print(label, outlen, drbg_ctx->V);
 }
 #endif
 
+// See NIST SP 800-90A, Rev. 1, 9.4
+static void
+done(struct ccdrbg_state *ctx)
+{
+       cc_clear(sizeof(struct ccdrbg_nisthmac_state), ctx);
+}
+
+// See NIST SP 800-90A, Rev. 1, 10.1.2.2
+static void
+update(struct ccdrbg_state *ctx, unsigned ndata, ...)
+{
+       struct ccdrbg_nisthmac_state *drbg_ctx = (struct ccdrbg_nisthmac_state *)ctx;
+       const struct ccdigest_info *info = drbg_ctx->custom->di;
+       size_t outlen = info->output_size;
+       size_t data_nbytes = 0;
+       va_list args;
 
-static void done(struct ccdrbg_state *drbg);
+       cchmac_di_decl(info, hmac_ctx);
 
-/*
- *  NIST SP 800-90A, Rev. 1 HMAC_DRBG April 2014, p 46
- *
- *  HMAC_DRBG_Update (provided_data, K, V):
- *  1. provided_data: The data to be used.
- *  2. K: The current value of Key.
- *  3. V: The current value of V.
- *  Output:
- *  1. K: The new value for Key.
- *  2. V: The new value for V.
- *
- *  HMAC_DRBG Update Process:
- *
- *  1. K = HMAC (K, V || 0x00 || provided_data).
- *  2. V=HMAC(K,V).
- *  3. If (provided_data = Null), then return K and V.
- *  4. K = HMAC (K, V || 0x01 || provided_data).
- *  5. V=HMAC(K,V).
- *  6. Return K and V.
- */
+       for (uint8_t b = 0; b < 2; b += 1) {
+               cchmac_init(info, hmac_ctx, outlen, drbg_ctx->key);
 
-// was: size_t providedDataLength, const void *providedData
+               cchmac_update(info, hmac_ctx, outlen, drbg_ctx->V);
 
-/*
- *  To handle the case where we have three strings that are concatenated,
- *  we pass in three (ptr, len) pairs
- */
+               cchmac_update(info, hmac_ctx, sizeof(b), &b);
 
-static int
-hmac_dbrg_update(struct ccdrbg_state *drbg,
-    size_t daLen, const void *da,
-    size_t dbLen, const void *db,
-    size_t dcLen, const void *dc
-    )
-{
-       int rc = CCDRBG_STATUS_ERROR;
-       struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg;
-       const struct ccdigest_info *di = state->custom->di;
+               va_start(args, ndata);
 
-       const unsigned char cZero = 0x00;
-       const unsigned char cOne  = 0x01;
+               for (unsigned i = 0; i < ndata; i += 1) {
+                       size_t nbytes = va_arg(args, size_t);
+                       const void *buf = va_arg(args, const void *);
 
-       cchmac_ctx_decl(di->state_size, di->block_size, ctx);
-       cchmac_init(di, ctx, state->keysize, state->key);
+                       cchmac_update(info, hmac_ctx, nbytes, buf);
 
-       // 1. K = HMAC (K, V || 0x00 || provided_data).
-       cchmac_update(di, ctx, state->vsize, state->vptr);
-       cchmac_update(di, ctx, 1, &cZero);
-       if (da && daLen) {
-               cchmac_update(di, ctx, daLen, da);
-       }
-       if (db && dbLen) {
-               cchmac_update(di, ctx, dbLen, db);
-       }
-       if (dc && dcLen) {
-               cchmac_update(di, ctx, dcLen, dc);
-       }
-       cchmac_final(di, ctx, state->key);
-
-       // One parameter must be non-empty, or return
-       if (((da && daLen) || (db && dbLen) || (dc && dcLen))) {
-               //  2. V=HMAC(K,V).
-               cchmac(di, state->keysize, state->key, state->vsize, state->vptr, state->vptr);
-               //  4. K = HMAC (K, V || 0x01 || provided_data).
-               cchmac_init(di, ctx, state->keysize, state->key);
-               cchmac_update(di, ctx, state->vsize, state->vptr);
-               cchmac_update(di, ctx, 1, &cOne);
-               if (da && daLen) {
-                       cchmac_update(di, ctx, daLen, da);
-               }
-               if (db && dbLen) {
-                       cchmac_update(di, ctx, dbLen, db);
+                       data_nbytes += nbytes;
                }
-               if (dc && dcLen) {
-                       cchmac_update(di, ctx, dcLen, dc);
-               }
-               cchmac_final(di, ctx, state->key);
-       }
-       //  If additional data 5. V=HMAC(K,V)
-       //  If no additional data, this is step 2. V=HMAC(K,V).
-       state->bytesLeft = 0;
-
-       // FIPS 140-2 4.9.2 Conditional Tests
-       // "the first n-bit block generated after power-up, initialization, or reset shall not be used, but shall be saved for comparison with the next n-bit block to be generated"
-       // Generate the first block and the second block. Compare for FIPS and discard the first block
-       // We keep the second block as the first set of data to be returned
-       cchmac(di, state->keysize, state->key, state->vsize, state->vptr, state->vptr); // First block
-       cchmac(di, state->keysize, state->key, state->vsize, state->vptr, state->nextvptr); // First to be returned
-       if (0 == cc_cmp_safe(state->vsize, state->vptr, state->nextvptr)) {
-               //The world as we know it has come to an end
-               //the DRBG data structure is zeroized. subsequent calls to
-               //DRBG ends up in NULL dereferencing and/or unpredictable state.
-               //catastrophic error in SP 800-90A
-               done(drbg);
-               rc = CCDRBG_STATUS_ABORT;
-               cc_try_abort(NULL);
-               goto errOut;
-       }
-       rc = CCDRBG_STATUS_OK;
-errOut:
-       return rc;
-}
-
-//make sure state is initialized, before calling this function
-static int
-validate_inputs(struct ccdrbg_nisthmac_state *state,
-    size_t entropyLength,
-    size_t additionalInputLength,
-    size_t psLength)
-{
-       int rc;
-       const struct ccdrbg_nisthmac_custom *custom = state->custom;
-       const struct ccdigest_info *di  = custom->di;
 
-       rc = CCDRBG_STATUS_ERROR;
-       //buffer size checks
-       cc_require(di->output_size <= sizeof(state->v) / 2, end); //digest size too long
-       cc_require(di->output_size <= sizeof(state->key), end); //digest size too long
+               va_end(args);
 
-       //NIST SP800 compliance checks
-       //the following maximum checks are redundant if long is 32 bits.
+               cchmac_final(info, hmac_ctx, drbg_ctx->key);
 
-       rc = CCDRBG_STATUS_PARAM_ERROR;
-       cc_require(psLength <= CCDRBG_MAX_PSINPUT_SIZE, end); //personalization string too long
-       cc_require(entropyLength <= CCDRBG_MAX_ENTROPY_SIZE, end); //supplied too much entropy
-       cc_require(additionalInputLength <= CCDRBG_MAX_ADDITIONALINPUT_SIZE, end); //additional input too long
-       cc_require(entropyLength >= MIN_REQ_ENTROPY(di), end); //supplied too little entropy
+               cchmac(info, outlen, drbg_ctx->key, outlen, drbg_ctx->V, drbg_ctx->V);
 
-       cc_require(di->output_size <= NH_MAX_OUTPUT_BLOCK_SIZE, end); //the requested security strength is not supported
+               if (data_nbytes == 0) {
+                       break;
+               }
+       }
 
-       rc = CCDRBG_STATUS_OK;
-end:
-       return rc;
+       cchmac_di_clear(info, hmac_ctx);
 }
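The variadic loop above is a compact rendering of the update process that the old sources spelled out inline. For reference, the spec's steps (SP 800-90A Rev. 1, 10.1.2.2) map onto it as follows, with the loop counter b doubling as the 0x00/0x01 domain-separation byte:

// HMAC_DRBG_Update(provided_data, K, V):
//   1. K = HMAC(K, V || 0x00 || provided_data)   <- iteration b == 0
//   2. V = HMAC(K, V)
//   3. If provided_data is empty, return (K, V)  <- the data_nbytes == 0 break
//   4. K = HMAC(K, V || 0x01 || provided_data)   <- iteration b == 1
//   5. V = HMAC(K, V)
//   6. Return (K, V)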
 
-/*
- *  NIST SP 800-90A, Rev. 1 April 2014 B.2.2, p 84
- *
- *  HMAC_DRBG_Instantiate_algorithm (...):
- *  Input: bitstring (entropy_input, personalization_string).
- *  Output: bitstring (V, Key), integer reseed_counter.
- *
- *  Process:
- *  1. seed_material = entropy_input || personalization_string.
- *  2. Set Key to outlen bits of zeros.
- *  3. Set V to outlen/8 bytes of 0x01.
- *  4. (Key, V) = HMAC_DRBG_Update (seed_material, Key, V).
- *  5. reseed_counter = 1.
- *  6. Return (V, Key, reseed_counter).
- */
-
-// This version does not do memory allocation
-//SP800-90 A: Required minimum entropy for instantiate and reseed=security_strength
+static bool
+entropy_isvalid(size_t entropy_nbytes, const struct ccdigest_info *info)
+{
+       return (entropy_nbytes <= CCDRBG_MAX_ENTROPY_SIZE) && (entropy_nbytes >= MIN_REQ_ENTROPY(info));
+}
 
+// See NIST SP 800-90A, Rev. 1, 9.1 and 10.1.2.3
 static int
-hmac_dbrg_instantiate_algorithm(struct ccdrbg_state *drbg,
-    size_t entropyLength, const void *entropy,
-    size_t nonceLength, const void *nonce,
-    size_t psLength, const void *ps)
+init(const struct ccdrbg_info *info,
+    struct ccdrbg_state *ctx,
+    size_t entropy_nbytes,
+    const void *entropy,
+    size_t nonce_nbytes,
+    const void *nonce,
+    size_t ps_nbytes,
+    const void *ps)
 {
-       // TODO: The NIST code passes nonce (i.e. HMAC key) to generate, but cc interface isn't set up that way
-       struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg;
+       struct ccdrbg_nisthmac_state *drbg_ctx = (struct ccdrbg_nisthmac_state *)ctx;
+       drbg_ctx->custom = info->custom;
+       const struct ccdigest_info *digest_info = drbg_ctx->custom->di;
+       size_t outlen = digest_info->output_size;
 
-       // 1. seed_material = entropy_input || nonce || personalization_string.
+       int status = CCDRBG_STATUS_PARAM_ERROR;
+       cc_require(outlen <= NISTHMAC_MAX_OUTPUT_SIZE, out);
+       cc_require(entropy_isvalid(entropy_nbytes, digest_info), out);
+       cc_require(ps_nbytes <= CCDRBG_MAX_PSINPUT_SIZE, out);
 
-       // 2. Set Key to outlen bits of zeros.
-       cc_zero(state->keysize, state->key);
+       status = CCDRBG_STATUS_OK;
 
-       // 3. Set V to outlen/8 bytes of 0x01.
-       CC_MEMSET(state->vptr, 0x01, state->vsize);
+       cc_memset(drbg_ctx->key, 0, outlen);
+       cc_memset(drbg_ctx->V, 1, outlen);
 
-       // 4. (Key, V) = HMAC_DRBG_Update (seed_material, Key, V).
-       hmac_dbrg_update(drbg, entropyLength, entropy, nonceLength, nonce, psLength, ps);
+       update(ctx, 3, entropy_nbytes, entropy, nonce_nbytes, nonce, ps_nbytes, ps);
 
-       // 5. reseed_counter = 1.
-       state->reseed_counter = 1;
+       drbg_ctx->reseed_counter = 1;
 
-       return CCDRBG_STATUS_OK;
+out:
+       return status;
 }
 
-//  In NIST terminology, the nonce is the HMAC key and ps is the personalization string
-//  We assume that the caller has passed in
-//      min_entropy = NH_REQUIRED_MIN_ENTROPY(security_strength)
-//  bytes of entropy
+static bool
+add_isvalid(size_t add_nbytes)
+{
+       return add_nbytes <= CCDRBG_MAX_ADDITIONALINPUT_SIZE;
+}
 
+// See NIST SP 800-90A, Rev. 1, 9.2 and 10.1.2.4
 static int
-init(const struct ccdrbg_info *info, struct ccdrbg_state *drbg,
-    size_t entropyLength, const void* entropy,
-    size_t nonceLength, const void* nonce,
-    size_t psLength, const void* ps)
+reseed(struct ccdrbg_state *ctx, size_t entropy_nbytes, const void *entropy, size_t add_nbytes, const void *add)
 {
-       struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg;
-       state->bytesLeft = 0;
-       state->custom = info->custom; //we only need to get the custom parameter from the info structure.
-
-       int rc = validate_inputs(state, entropyLength, 0, psLength);
-       if (rc != CCDRBG_STATUS_OK) {
-               //clear everything if cannot initialize. The idea is that if the caller doesn't check the output of init() and init() fails,
-               //the system crashes by NULL dereferencing after a call to generate, rather than generating bad random numbers.
-               done(drbg);
-               return rc;
-       }
-
-       const struct ccdigest_info *di = state->custom->di;
-       state->vsize = di->output_size;
-       state->keysize = di->output_size;
-       state->vptr = state->v;
-       state->nextvptr = state->v + state->vsize;
-
-       // 7. (V, Key, reseed_counter) = HMAC_DRBG_Instantiate_algorithm (entropy_input, personalization_string).
-       hmac_dbrg_instantiate_algorithm(drbg, entropyLength, entropy, nonceLength, nonce, psLength, ps);
+       struct ccdrbg_nisthmac_state *drbg_ctx = (struct ccdrbg_nisthmac_state *)ctx;
+       const struct ccdigest_info *digest_info = drbg_ctx->custom->di;
 
-#if DRBG_NISTHMAC_DEBUG
-       dumpState("Init: ", state);
-#endif
-       return CCDRBG_STATUS_OK;
-}
+       int status = CCDRBG_STATUS_PARAM_ERROR;
+       cc_require(entropy_isvalid(entropy_nbytes, digest_info), out);
+       cc_require(add_isvalid(add_nbytes), out);
 
-/*
- *  10.1.2.4 Reseeding an HMAC_DRBG Instantiation
- *  Notes for the reseed function specified in Section 9.2:
- *  The reseeding of an HMAC_DRBG instantiation requires a call to the Reseed_function specified in Section 9.2.
- *  Process step 6 of that function calls the reseed algorithm specified in this section. The values for min_length
- *  are provided in Table 2 of Section 10.1.
- *
- *  The reseed algorithm:
- *  Let HMAC_DRBG_Update be the function specified in Section 10.1.2.2. The following process or its equivalent
- *  shall be used as the reseed algorithm for this DRBG mechanism (see step 6 of the reseed process in Section 9.2):
- *
- *  HMAC_DRBG_Reseed_algorithm (working_state, entropy_input, additional_input):
- *  1.  working_state: The current values for V, Key and reseed_counter (see Section 10.1.2.1).
- *  2.  entropy_input: The string of bits obtained from the source of entropy input.
- *  3.  additional_input: The additional input string received from the consuming application.
- *  Note that the length of the additional_input string may be zero.
- *
- *  Output:
- *  1.  new_working_state: The new values for V, Key and reseed_counter. HMAC_DRBG Reseed Process:
- *  1.  seed_material = entropy_input || additional_input.
- *  2.  (Key, V) = HMAC_DRBG_Update (seed_material, Key, V). 3. reseed_counter = 1.
- *  4.  Return V, Key and reseed_counter as the new_working_state.
- */
+       status = CCDRBG_STATUS_OK;
 
-static int
-reseed(struct ccdrbg_state *drbg,
-    size_t entropyLength, const void *entropy,
-    size_t additionalLength, const void *additional)
-{
-       struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg;
-       int rc = validate_inputs(state, entropyLength, additionalLength, 0);
-       if (rc != CCDRBG_STATUS_OK) {
-               return rc;
-       }
+       update(ctx, 2, entropy_nbytes, entropy, add_nbytes, add);
 
-       int rx = hmac_dbrg_update(drbg, entropyLength, entropy, additionalLength, additional, 0, NULL);
-       state->reseed_counter = 1;
+       drbg_ctx->reseed_counter = 1;
 
-#if DRBG_NISTHMAC_DEBUG
-       dumpState("Reseed: ", state);
-#endif
-       return rx;
+out:
+       return status;
 }
 
-/*
- *  HMAC_DRBG_Generate_algorithm:
- *  Input: bitstring (V, Key), integer (reseed_counter, requested_number_of_bits).
- *  Output: string status, bitstring (pseudorandom_bits, V, Key), integer reseed_counter.
- *
- *  Process:
- *  1.      If (reseed_counter ≥ 10,000), then Return (“Reseed required”, Null, V, Key, reseed_counter).
- *  2.      temp = Null.
- *  3.      While (len (temp) < requested_no_of_bits) do:
- *  3.1         V = HMAC (Key, V).
- *  3.2         temp = temp || V.
- *  4.      pseudorandom_bits = Leftmost (requested_no_of_bits) of temp.
- *  5.      (Key, V) = HMAC_DRBG_Update (Null, Key, V).
- *  6.      reseed_counter = reseed_counter + 1.
- *  7.      Return (“Success”, pseudorandom_bits, V, Key, reseed_counter).
- */
-
+// See NIST SP 800-90A, Rev. 1, 9.3 and 10.1.2.5
 static int
-validate_gen_params(uint64_t reseed_counter, size_t dataOutLength, size_t additionalLength)
+generate(struct ccdrbg_state *ctx, size_t out_nbytes, void *out, size_t add_nbytes, const void *add)
 {
-       int rc = CCDRBG_STATUS_PARAM_ERROR;
-
-       // Zero byte in one request is a valid use-case (21208820)
-       cc_require(dataOutLength <= CCDRBG_MAX_REQUEST_SIZE, end); //Requested too many bytes in one request
-       cc_require(additionalLength <= CCDRBG_MAX_ADDITIONALINPUT_SIZE, end); //Additional input too long
+       struct ccdrbg_nisthmac_state *drbg_ctx = (struct ccdrbg_nisthmac_state *)ctx;
+       const struct ccdigest_info *info = drbg_ctx->custom->di;
+       size_t outlen = info->output_size;
 
-       // 1. If (reseed_counter > 2^48), then Return (“Reseed required”, Null, V, Key, reseed_counter).
-       rc = CCDRBG_STATUS_NEED_RESEED;
-       cc_require(reseed_counter <= CCDRBG_RESEED_INTERVAL, end); //Reseed required
+       int status = CCDRBG_STATUS_PARAM_ERROR;
+       cc_require(out_nbytes <= CCDRBG_MAX_REQUEST_SIZE, out);
+       cc_require(add_isvalid(add_nbytes), out);
 
-       rc = CCDRBG_STATUS_OK;
+       status = CCDRBG_STATUS_NEED_RESEED;
+       cc_require(drbg_ctx->reseed_counter <= CCDRBG_RESEED_INTERVAL || !drbg_ctx->custom->strictFIPS, out);
 
-end:
-       return rc;
-}
-
-static int
-generate(struct ccdrbg_state *drbg, size_t dataOutLength, void *dataOut,
-    size_t additionalLength, const void *additional)
-{
-       struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg;
-       const struct ccdrbg_nisthmac_custom *custom = state->custom;
-       const struct ccdigest_info *di = custom->di;
+       status = CCDRBG_STATUS_OK;
 
-       int rc = validate_gen_params(state->reseed_counter, dataOutLength, additional == NULL?0:additionalLength);
-       if (rc != CCDRBG_STATUS_OK) {
-               return rc;
+       if (add_nbytes > 0) {
+               update(ctx, 1, add_nbytes, add);
        }
 
-       // 2. If additional_input ≠ Null, then (Key, V) = HMAC_DRBG_Update (additional_input, Key, V).
-       if (additional && additionalLength) {
-               hmac_dbrg_update(drbg, additionalLength, additional, 0, NULL, 0, NULL);
-       }
+       uint8_t *out_bytes = out;
+       uint8_t Vprev[NISTHMAC_MAX_OUTPUT_SIZE];
 
-       // hmac_dbrg_generate_algorithm
-       char *outPtr = (char *) dataOut;
-       while (dataOutLength > 0) {
-               if (!state->bytesLeft) {
-                       //  5. V=HMAC(K,V).
-                       cchmac(di, state->keysize, state->key, state->vsize, state->nextvptr, state->vptr); // Won't be returned
-                       // FIPS 140-2 4.9.2 Conditional Tests
-                       // "Each subsequent generation of an n-bit block shall be compared with the previously generated block. The test shall fail if any two compared n-bit blocks are equal."
-                       if (0 == cc_cmp_safe(state->vsize, state->vptr, state->nextvptr)) {
-                               //The world as we know it has come to an end
-                               //the DRBG data structure is zeroized. subsequent calls to
-                               //DRBG ends up in NULL dereferencing and/or unpredictable state.
-                               //catastrophic error in SP 800-90A
-                               done(drbg);
-                               rc = CCDRBG_STATUS_ABORT;
-                               cc_try_abort(NULL);
-                               goto errOut;
-                       }
-                       CC_SWAP(state->nextvptr, state->vptr);
-                       state->bytesLeft = state->vsize;
-#if DRBG_NISTHMAC_DEBUG
-                       cc_print("generate blk: ", state->vsize, state->vptr);
-#endif
+       while (out_nbytes > 0) {
+               cc_memcpy(Vprev, drbg_ctx->V, outlen);
+               cchmac(info, outlen, drbg_ctx->key, outlen, drbg_ctx->V, drbg_ctx->V);
+
+               // See FIPS 140-2, 4.9.2 Conditional Tests
+               if (cc_cmp_safe(outlen, Vprev, drbg_ctx->V) == 0) {
+                       done(ctx);
+                       status = CCDRBG_STATUS_ABORT;
+                       cc_try_abort(NULL);
+                       goto out;
                }
-               size_t outLength = dataOutLength > state->bytesLeft ? state->bytesLeft : dataOutLength;
-               CC_MEMCPY(outPtr, state->vptr, outLength);
-               state->bytesLeft -= outLength;
-               outPtr += outLength;
-               dataOutLength -= outLength;
-       }
 
-       // 6. (Key, V) = HMAC_DRBG_Update (additional_input, Key, V).
-       hmac_dbrg_update(drbg, additionalLength, additional, 0, NULL, 0, NULL);
+               size_t n = CC_MIN(out_nbytes, outlen);
+               cc_memcpy(out_bytes, drbg_ctx->V, n);
 
-       // 7. reseed_counter = reseed_counter + 1.
-       state->reseed_counter++;
+               out_bytes += n;
+               out_nbytes -= n;
+       }
 
-#if DRBG_NISTHMAC_DEBUG
-       dumpState("generate end: ", state);
-       cc_print("generate end nxt: ", state->vsize, state->nextvptr);
-#endif
-       rc = CCDRBG_STATUS_OK;
-errOut:
-       return rc;
-}
+       update(ctx, 1, add_nbytes, add);
 
-static void
-done(struct ccdrbg_state *drbg)
-{
-       struct ccdrbg_nisthmac_state *state = (struct ccdrbg_nisthmac_state *)drbg;
-       cc_clear(sizeof(struct ccdrbg_nisthmac_state), state); //clear v, key as well as internal variables
-}
+       drbg_ctx->reseed_counter += 1;
 
-struct ccdrbg_info ccdrbg_nisthmac_info = {
-       .size = sizeof(struct ccdrbg_nisthmac_state) + sizeof(struct ccdrbg_nisthmac_custom),
-       .init = init,
-       .reseed = reseed,
-       .generate = generate,
-       .done = done,
-       .custom = NULL
-};
+out:
+       cc_clear(outlen, Vprev);
+       return status;
+}
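Two things gate a generate call above: the FIPS 140-2 4.9.2 continuous test (the old comments quoted it: each n-bit block "shall be compared with the previously generated block", failing if any two compared blocks are equal) and, when strictFIPS is set, the reseed interval. A hypothetical caller handles the latter like this — info and ctx are as in the earlier sketch, and fresh is a buffer of newly gathered entropy:

	int status = ccdrbg_generate(&info, ctx, sizeof(buf), buf, 0, NULL);
	if (status == CCDRBG_STATUS_NEED_RESEED) {
		// More than CCDRBG_RESEED_INTERVAL requests since the last (re)seed.
		status = ccdrbg_reseed(&info, ctx, sizeof(fresh), fresh, 0, NULL);
		if (status == CCDRBG_STATUS_OK) {
			status = ccdrbg_generate(&info, ctx, sizeof(buf), buf, 0, NULL);
		}
	}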
 
-/* This initializes an info object with the right options */
 void
 ccdrbg_factory_nisthmac(struct ccdrbg_info *info, const struct ccdrbg_nisthmac_custom *custom)
 {
index 9dc776366096da7c6723a4b96da5c1ea990d6670..0ba7548415120dfdbd1f16bcbbcd93237d5b0f7f 100644 (file)
@@ -32,7 +32,7 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#include <corecrypto/ccdigest.h>
+#include <corecrypto/ccdigest_priv.h>
 #include <corecrypto/cc_priv.h>
 
 void
index 94b29a17292b24e079b284d4e7c847633ac0fbac..6856c4e7430e052642d3f654b113f8f7d9c09d46 100644 (file)
 #include <corecrypto/cc_priv.h>
 
 void
-ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx,
-    size_t len, const void *data)
+ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx, size_t len, const void *data)
 {
        const char * data_ptr = data;
        size_t nblocks, nbytes;
 
+       // Sanity check to recover from ctx corruptions.
+       if (ccdigest_num(di, ctx) >= di->block_size) {
+               ccdigest_num(di, ctx) = 0;
+       }
+
        while (len > 0) {
                if (ccdigest_num(di, ctx) == 0 && len > di->block_size) {
                        //low-end processors are slow on division
@@ -59,13 +63,10 @@ ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx,
                        di->compress(ccdigest_state(di, ctx), nblocks, data_ptr);
                        len -= nbytes;
                        data_ptr += nbytes;
-                       ccdigest_nbits(di, ctx) += nbytes * 8;
+                       ccdigest_nbits(di, ctx) += (uint64_t) (nbytes) * 8;
                } else {
-                       size_t n = di->block_size - ccdigest_num(di, ctx);
-                       if (len < n) {
-                               n = len;
-                       }
-                       CC_MEMCPY(ccdigest_data(di, ctx) + ccdigest_num(di, ctx), data_ptr, n);
+                       size_t n = CC_MIN(di->block_size - ccdigest_num(di, ctx), len);
+                       cc_memcpy(ccdigest_data(di, ctx) + ccdigest_num(di, ctx), data_ptr, n);
                        /* typecast: less than block size, will always fit into an int */
                        ccdigest_num(di, ctx) += (unsigned int)n;
                        len -= n;
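The buffering logic above maintains the invariant that chunking never changes the result: whole blocks are compressed directly, and partial input is staged in the context until a block fills. A minimal check of that invariant — a sketch assuming the ccdigest_init/ccdigest_final helpers and the one-shot ccdigest() from <corecrypto/ccdigest.h>:

	const struct ccdigest_info *di = ccsha256_di();
	const char *msg = "corecrypto";
	uint8_t once[CCSHA256_OUTPUT_SIZE], split[CCSHA256_OUTPUT_SIZE];

	ccdigest(di, 10, msg, once);           // one shot

	ccdigest_di_decl(di, ctx);
	ccdigest_init(di, ctx);
	ccdigest_update(di, ctx, 4, msg);      // "core"
	ccdigest_update(di, ctx, 6, msg + 4);  // "crypto"
	ccdigest_final(di, ctx, split);
	ccdigest_di_clear(di, ctx);

	// once and split are identical: cc_cmp_safe(sizeof(once), once, split) == 0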
index dc72c7adb5f28597b2c6e9bec9ab08faa3e033c4..bb25887fecbc49f13b697eebc9493146341ddba5 100644 (file)
@@ -32,6 +32,7 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+#include <corecrypto/ccdigest_priv.h>
 #include <corecrypto/cchmac.h>
 #include <corecrypto/ccn.h>
 #include <corecrypto/cc_priv.h>
index 4eba5b23ab49d92eae1599fe5f6f99263b436376..1d5d799f554feae57bfeb037903ca653077237f5 100644 (file)
@@ -32,6 +32,7 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+#include <corecrypto/ccdigest_priv.h>
 #include <corecrypto/cchmac.h>
 #include <corecrypto/ccn.h>
 #include <corecrypto/cc_priv.h>
@@ -67,7 +68,7 @@ cchmac_init(const struct ccdigest_info *di, cchmac_ctx_t hc,
        }
        /* Fill remainder of cchmac_data(di, hc) with opad. */
        if (key_len < di->block_size) {
-               CC_MEMSET(cchmac_data(di, hc) + key_len, 0x5c, di->block_size - key_len);
+               cc_memset(cchmac_data(di, hc) + key_len, 0x5c, di->block_size - key_len);
        }
 
        /* Set cchmac_ostate32(di, hc) to the state of the first round of the
diff --git a/osfmk/corecrypto/ccn/src/ccn_set.c b/osfmk/corecrypto/ccn/src/ccn_set.c
deleted file mode 100644 (file)
index 4cd06a5..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- *  ccn_set.c
- *  corecrypto
- *
- *  Created on 02/17/2012
- *
- *  Copyright (c) 2012,2014,2015 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include <corecrypto/ccn.h>
-#include <corecrypto/cc_priv.h>
-
-#if !CCN_SET_ASM
-void
-ccn_set(cc_size n, cc_unit *r, const cc_unit *s)
-{
-       CC_MEMMOVE(r, s, ccn_sizeof_n(n));
-}
-#endif
index 8b30793d57e6d9e371e1aa900e777748b69d3748..56234ff33742a663b02352ba1ab528213b979f40 100644 (file)
 
 /* This can be used for SHA1, SHA256 and SHA224 */
 void
-ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t ctx,
-    unsigned char *digest)
+ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned char *digest)
 {
-       ccdigest_nbits(di, ctx) += ccdigest_num(di, ctx) * 8;
-       ccdigest_data(di, ctx)[ccdigest_num(di, ctx)++] = 0x80;
+       // Sanity check to recover from ctx corruptions.
+       if (ccdigest_num(di, ctx) >= di->block_size) {
+               ccdigest_num(di, ctx) = 0;
+       }
+
+       // Clone the state.
+       ccdigest_di_decl(di, tmp);
+       cc_memcpy(tmp, ctx, ccdigest_di_size(di));
+
+       ccdigest_nbits(di, tmp) += ccdigest_num(di, tmp) * 8;
+       ccdigest_data(di, tmp)[ccdigest_num(di, tmp)++] = 0x80;
 
        /* If we don't have at least 8 bytes (for the length) left we need to add
         *  a second block. */
-       if (ccdigest_num(di, ctx) > 64 - 8) {
-               while (ccdigest_num(di, ctx) < 64) {
-                       ccdigest_data(di, ctx)[ccdigest_num(di, ctx)++] = 0;
+       if (ccdigest_num(di, tmp) > 64 - 8) {
+               while (ccdigest_num(di, tmp) < 64) {
+                       ccdigest_data(di, tmp)[ccdigest_num(di, tmp)++] = 0;
                }
-               di->compress(ccdigest_state(di, ctx), 1, ccdigest_data(di, ctx));
-               ccdigest_num(di, ctx) = 0;
+               di->compress(ccdigest_state(di, tmp), 1, ccdigest_data(di, tmp));
+               ccdigest_num(di, tmp) = 0;
        }
 
       /* pad up to block_size minus 8 with 0s */
-       while (ccdigest_num(di, ctx) < 64 - 8) {
-               ccdigest_data(di, ctx)[ccdigest_num(di, ctx)++] = 0;
+       while (ccdigest_num(di, tmp) < 64 - 8) {
+               ccdigest_data(di, tmp)[ccdigest_num(di, tmp)++] = 0;
        }
 
-       CC_STORE64_BE(ccdigest_nbits(di, ctx), ccdigest_data(di, ctx) + 64 - 8);
-       di->compress(ccdigest_state(di, ctx), 1, ccdigest_data(di, ctx));
+       CC_STORE64_BE(ccdigest_nbits(di, tmp), ccdigest_data(di, tmp) + 64 - 8);
+       di->compress(ccdigest_state(di, tmp), 1, ccdigest_data(di, tmp));
 
        /* copy output */
        for (unsigned int i = 0; i < di->output_size / 4; i++) {
-               CC_STORE32_BE(ccdigest_state_u32(di, ctx)[i], digest + (4 * i));
+               CC_STORE32_BE(ccdigest_state_u32(di, tmp)[i], digest + (4 * i));
        }
+
+       ccdigest_di_clear(di, tmp);
 }
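The padding appended here is standard Merkle-Damgård strengthening for the 64-byte-block digests this helper serves (SHA-1, SHA-224/256): a lone 0x80 byte, zeros, then the 64-bit big-endian bit count. For a 3-byte message such as "abc", the single final block is laid out as:

//  offset 0..2    61 62 63                   message bytes
//  offset 3       80                         terminator bit
//  offset 4..55   00 ... 00                  52 zero bytes of padding
//  offset 56..63  00 00 00 00 00 00 00 18    bit length 24, big-endian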
index 59c6acc041bc2d780ed3fab884b4d813a31bf9ae..f055084b0c48a1b4b475db50eb616d8add8cbbb7 100644 (file)
@@ -37,8 +37,6 @@
 
 #include <corecrypto/ccdigest.h>
 
-void ccdigest_final_common(const struct ccdigest_info *di,
-    ccdigest_ctx_t ctx, void *digest);
 void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t,
     unsigned char *digest);
 void ccdigest_final_64le(const struct ccdigest_info *di, ccdigest_ctx_t,
diff --git a/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c b/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c
deleted file mode 100644 (file)
index 22941eb..0000000
+++ /dev/null
@@ -1,309 +0,0 @@
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to.  The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *    "This product includes cryptographic software written by
- *     Eric Young (eay@cryptsoft.com)"
- *    The word 'cryptographic' can be left out if the rouines from the library
- *    being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- *    the apps directory (application code) you must include an acknowledgement:
- *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed.  i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-
-#include <corecrypto/ccsha1.h>
-#include "ccsha1_internal.h"
-#include <corecrypto/cc_priv.h>
-#include <corecrypto/ccdigest_priv.h>
-#include "ccdigest_internal.h"
-
-
-#ifndef SHA_LONG_LOG2
-#define SHA_LONG_LOG2   2       /* default to 32 bits */
-#endif
-
-
-#define ROTATE(b, n) CC_ROLc(b, n)
-
-#define Xupdate(a, ix, ia, ib, ic, id)       ( (a)=(ia^ib^ic^id),\
-                                         ix=(a)=ROTATE((a),1)  \
-                                       )
-
-#define MD32_REG_T uint32_t
-
-#define HOST_c2l(data, l) CC_LOAD32_BE(l, data); data+=4;
-
-#define K_00_19 0x5a827999
-#define K_20_39 0x6ed9eba1
-#define K_40_59 0x8f1bbcdc
-#define K_60_79 0xca62c1d6
-
-/* As  pointed out by Wei Dai <weidai@eskimo.com>, F() below can be
- * simplified to the code in F_00_19.  Wei attributes these optimisations
- * to Peter Gutmann's SHS code, and he attributes it to Rich Schroeppel.
- * #define F(x,y,z) (((x) & (y))  |  ((~(x)) & (z)))
- * I've just become aware of another tweak to be made, again from Wei Dai,
- * in F_40_59, (x&a)|(y&a) -> (x|y)&a
- */
-#define F_00_19(b, c, d)  ((((c) ^ (d)) & (b)) ^ (d))
-#define F_20_39(b, c, d)  ((b) ^ (c) ^ (d))
-#define F_40_59(b, c, d)  (((b) & (c)) | (((b)|(c)) & (d)))
-#define F_60_79(b, c, d)  F_20_39(b,c,d)
-
-#define BODY_00_15(i, a, b, c, d, e, f, xi) \
-       (f)=xi+(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
-       (b)=ROTATE((b),30);
-
-#define BODY_16_19(i, a, b, c, d, e, f, xi, xa, xb, xc, xd) \
-       Xupdate(f,xi,xa,xb,xc,xd); \
-       (f)+=(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
-       (b)=ROTATE((b),30);
-
-#define BODY_20_31(i, a, b, c, d, e, f, xi, xa, xb, xc, xd) \
-       Xupdate(f,xi,xa,xb,xc,xd); \
-       (f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
-       (b)=ROTATE((b),30);
-
-#define BODY_32_39(i, a, b, c, d, e, f, xa, xb, xc, xd) \
-       Xupdate(f,xa,xa,xb,xc,xd); \
-       (f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
-       (b)=ROTATE((b),30);
-
-#define BODY_40_59(i, a, b, c, d, e, f, xa, xb, xc, xd) \
-       Xupdate(f,xa,xa,xb,xc,xd); \
-       (f)+=(e)+K_40_59+ROTATE((a),5)+F_40_59((b),(c),(d)); \
-       (b)=ROTATE((b),30);
-
-#define BODY_60_79(i, a, b, c, d, e, f, xa, xb, xc, xd) \
-       Xupdate(f,xa,xa,xb,xc,xd); \
-       (f)=xa+(e)+K_60_79+ROTATE((a),5)+F_60_79((b),(c),(d)); \
-       (b)=ROTATE((b),30);
-
-#ifdef X
-#undef X
-#endif
-
-#ifndef MD32_XARRAY
-/*
- * Originally X was an array. As it's automatic it's natural
- * to expect RISC compiler to accomodate at least part of it in
- * the register bank, isn't it? Unfortunately not all compilers
- * "find" this expectation reasonable:-( On order to make such
- * compilers generate better code I replace X[] with a bunch of
- * X0, X1, etc. See the function body below...
- *                                     <appro@fy.chalmers.se>
- */
-# define X(i)   XX##i
-#else
-/*
- * However! Some compilers (most notably HP C) get overwhelmed by
- * that many local variables so that we have to have the way to
- * fall down to the original behavior.
- */
-# define X(i)   XX[i]
-#endif
-
-static void
-sha1_compress(ccdigest_state_t s, size_t num, const void *buf)
-{
-       const unsigned char *data = buf;
-       register uint32_t A, B, C, D, E, T, l;
-#ifndef MD32_XARRAY
-       uint32_t    XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7,
-           XX8, XX9, XX10, XX11, XX12, XX13, XX14, XX15;
-#else
-       uint32_t    XX[16];
-#endif
-       uint32_t *state = ccdigest_u32(s);
-
-       A = state[0];
-       B = state[1];
-       C = state[2];
-       D = state[3];
-       E = state[4];
-
-       for (;;) {
-               HOST_c2l(data, l); X( 0) = l; HOST_c2l(data, l); X( 1) = l;
-               BODY_00_15( 0, A, B, C, D, E, T, X( 0)); HOST_c2l(data, l); X( 2) = l;
-               BODY_00_15( 1, T, A, B, C, D, E, X( 1)); HOST_c2l(data, l); X( 3) = l;
-               BODY_00_15( 2, E, T, A, B, C, D, X( 2)); HOST_c2l(data, l); X( 4) = l;
-               BODY_00_15( 3, D, E, T, A, B, C, X( 3)); HOST_c2l(data, l); X( 5) = l;
-               BODY_00_15( 4, C, D, E, T, A, B, X( 4)); HOST_c2l(data, l); X( 6) = l;
-               BODY_00_15( 5, B, C, D, E, T, A, X( 5)); HOST_c2l(data, l); X( 7) = l;
-               BODY_00_15( 6, A, B, C, D, E, T, X( 6)); HOST_c2l(data, l); X( 8) = l;
-               BODY_00_15( 7, T, A, B, C, D, E, X( 7)); HOST_c2l(data, l); X( 9) = l;
-               BODY_00_15( 8, E, T, A, B, C, D, X( 8)); HOST_c2l(data, l); X(10) = l;
-               BODY_00_15( 9, D, E, T, A, B, C, X( 9)); HOST_c2l(data, l); X(11) = l;
-               BODY_00_15(10, C, D, E, T, A, B, X(10)); HOST_c2l(data, l); X(12) = l;
-               BODY_00_15(11, B, C, D, E, T, A, X(11)); HOST_c2l(data, l); X(13) = l;
-               BODY_00_15(12, A, B, C, D, E, T, X(12)); HOST_c2l(data, l); X(14) = l;
-               BODY_00_15(13, T, A, B, C, D, E, X(13)); HOST_c2l(data, l); X(15) = l;
-               BODY_00_15(14, E, T, A, B, C, D, X(14));
-               BODY_00_15(15, D, E, T, A, B, C, X(15));
-
-               BODY_16_19(16, C, D, E, T, A, B, X( 0), X( 0), X( 2), X( 8), X(13));
-               BODY_16_19(17, B, C, D, E, T, A, X( 1), X( 1), X( 3), X( 9), X(14));
-               BODY_16_19(18, A, B, C, D, E, T, X( 2), X( 2), X( 4), X(10), X(15));
-               BODY_16_19(19, T, A, B, C, D, E, X( 3), X( 3), X( 5), X(11), X( 0));
-
-               BODY_20_31(20, E, T, A, B, C, D, X( 4), X( 4), X( 6), X(12), X( 1));
-               BODY_20_31(21, D, E, T, A, B, C, X( 5), X( 5), X( 7), X(13), X( 2));
-               BODY_20_31(22, C, D, E, T, A, B, X( 6), X( 6), X( 8), X(14), X( 3));
-               BODY_20_31(23, B, C, D, E, T, A, X( 7), X( 7), X( 9), X(15), X( 4));
-               BODY_20_31(24, A, B, C, D, E, T, X( 8), X( 8), X(10), X( 0), X( 5));
-               BODY_20_31(25, T, A, B, C, D, E, X( 9), X( 9), X(11), X( 1), X( 6));
-               BODY_20_31(26, E, T, A, B, C, D, X(10), X(10), X(12), X( 2), X( 7));
-               BODY_20_31(27, D, E, T, A, B, C, X(11), X(11), X(13), X( 3), X( 8));
-               BODY_20_31(28, C, D, E, T, A, B, X(12), X(12), X(14), X( 4), X( 9));
-               BODY_20_31(29, B, C, D, E, T, A, X(13), X(13), X(15), X( 5), X(10));
-               BODY_20_31(30, A, B, C, D, E, T, X(14), X(14), X( 0), X( 6), X(11));
-               BODY_20_31(31, T, A, B, C, D, E, X(15), X(15), X( 1), X( 7), X(12));
-
-               BODY_32_39(32, E, T, A, B, C, D, X( 0), X( 2), X( 8), X(13));
-               BODY_32_39(33, D, E, T, A, B, C, X( 1), X( 3), X( 9), X(14));
-               BODY_32_39(34, C, D, E, T, A, B, X( 2), X( 4), X(10), X(15));
-               BODY_32_39(35, B, C, D, E, T, A, X( 3), X( 5), X(11), X( 0));
-               BODY_32_39(36, A, B, C, D, E, T, X( 4), X( 6), X(12), X( 1));
-               BODY_32_39(37, T, A, B, C, D, E, X( 5), X( 7), X(13), X( 2));
-               BODY_32_39(38, E, T, A, B, C, D, X( 6), X( 8), X(14), X( 3));
-               BODY_32_39(39, D, E, T, A, B, C, X( 7), X( 9), X(15), X( 4));
-
-               BODY_40_59(40, C, D, E, T, A, B, X( 8), X(10), X( 0), X( 5));
-               BODY_40_59(41, B, C, D, E, T, A, X( 9), X(11), X( 1), X( 6));
-               BODY_40_59(42, A, B, C, D, E, T, X(10), X(12), X( 2), X( 7));
-               BODY_40_59(43, T, A, B, C, D, E, X(11), X(13), X( 3), X( 8));
-               BODY_40_59(44, E, T, A, B, C, D, X(12), X(14), X( 4), X( 9));
-               BODY_40_59(45, D, E, T, A, B, C, X(13), X(15), X( 5), X(10));
-               BODY_40_59(46, C, D, E, T, A, B, X(14), X( 0), X( 6), X(11));
-               BODY_40_59(47, B, C, D, E, T, A, X(15), X( 1), X( 7), X(12));
-               BODY_40_59(48, A, B, C, D, E, T, X( 0), X( 2), X( 8), X(13));
-               BODY_40_59(49, T, A, B, C, D, E, X( 1), X( 3), X( 9), X(14));
-               BODY_40_59(50, E, T, A, B, C, D, X( 2), X( 4), X(10), X(15));
-               BODY_40_59(51, D, E, T, A, B, C, X( 3), X( 5), X(11), X( 0));
-               BODY_40_59(52, C, D, E, T, A, B, X( 4), X( 6), X(12), X( 1));
-               BODY_40_59(53, B, C, D, E, T, A, X( 5), X( 7), X(13), X( 2));
-               BODY_40_59(54, A, B, C, D, E, T, X( 6), X( 8), X(14), X( 3));
-               BODY_40_59(55, T, A, B, C, D, E, X( 7), X( 9), X(15), X( 4));
-               BODY_40_59(56, E, T, A, B, C, D, X( 8), X(10), X( 0), X( 5));
-               BODY_40_59(57, D, E, T, A, B, C, X( 9), X(11), X( 1), X( 6));
-               BODY_40_59(58, C, D, E, T, A, B, X(10), X(12), X( 2), X( 7));
-               BODY_40_59(59, B, C, D, E, T, A, X(11), X(13), X( 3), X( 8));
-
-               BODY_60_79(60, A, B, C, D, E, T, X(12), X(14), X( 4), X( 9));
-               BODY_60_79(61, T, A, B, C, D, E, X(13), X(15), X( 5), X(10));
-               BODY_60_79(62, E, T, A, B, C, D, X(14), X( 0), X( 6), X(11));
-               BODY_60_79(63, D, E, T, A, B, C, X(15), X( 1), X( 7), X(12));
-               BODY_60_79(64, C, D, E, T, A, B, X( 0), X( 2), X( 8), X(13));
-               BODY_60_79(65, B, C, D, E, T, A, X( 1), X( 3), X( 9), X(14));
-               BODY_60_79(66, A, B, C, D, E, T, X( 2), X( 4), X(10), X(15));
-               BODY_60_79(67, T, A, B, C, D, E, X( 3), X( 5), X(11), X( 0));
-               BODY_60_79(68, E, T, A, B, C, D, X( 4), X( 6), X(12), X( 1));
-               BODY_60_79(69, D, E, T, A, B, C, X( 5), X( 7), X(13), X( 2));
-               BODY_60_79(70, C, D, E, T, A, B, X( 6), X( 8), X(14), X( 3));
-               BODY_60_79(71, B, C, D, E, T, A, X( 7), X( 9), X(15), X( 4));
-               BODY_60_79(72, A, B, C, D, E, T, X( 8), X(10), X( 0), X( 5));
-               BODY_60_79(73, T, A, B, C, D, E, X( 9), X(11), X( 1), X( 6));
-               BODY_60_79(74, E, T, A, B, C, D, X(10), X(12), X( 2), X( 7));
-               BODY_60_79(75, D, E, T, A, B, C, X(11), X(13), X( 3), X( 8));
-               BODY_60_79(76, C, D, E, T, A, B, X(12), X(14), X( 4), X( 9));
-               BODY_60_79(77, B, C, D, E, T, A, X(13), X(15), X( 5), X(10));
-               BODY_60_79(78, A, B, C, D, E, T, X(14), X( 0), X( 6), X(11));
-               BODY_60_79(79, T, A, B, C, D, E, X(15), X( 1), X( 7), X(12));
-
-               state[0] = (state[0] + E) & 0xffffffff;
-               state[1] = (state[1] + T) & 0xffffffff;
-               state[2] = (state[2] + A) & 0xffffffff;
-               state[3] = (state[3] + B) & 0xffffffff;
-               state[4] = (state[4] + C) & 0xffffffff;
-
-               if (--num <= 0) {
-                       break;
-               }
-
-               A = state[0];
-               B = state[1];
-               C = state[2];
-               D = state[3];
-               E = state[4];
-       }
-}
-
-const struct ccdigest_info ccsha1_eay_di = {
-       .output_size = CCSHA1_OUTPUT_SIZE,
-       .state_size = CCSHA1_STATE_SIZE,
-       .block_size = CCSHA1_BLOCK_SIZE,
-       .oid_size = ccoid_sha1_len,
-       .oid = CC_DIGEST_OID_SHA1,
-       .initial_state = ccsha1_initial_state,
-       .compress = sha1_compress,
-       .final = ccdigest_final_64be,
-};
diff --git a/osfmk/corecrypto/ccsha1/src/ccsha1_initial_state.c b/osfmk/corecrypto/ccsha1/src/ccsha1_initial_state.c
deleted file mode 100644 (file)
index f72ecfd..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- *  ccsha1_initial_state.c
- *  corecrypto
- *
- *  Created on 12/07/2010
- *
- *  Copyright (c) 2010,2015 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include <corecrypto/ccsha1.h>
-#include <corecrypto/cc_priv.h>
-
-const uint32_t ccsha1_initial_state[5] = {
-       0x67452301,
-       0xefcdab89,
-       0x98badcfe,
-       0x10325476,
-       0xc3d2e1f0
-};
diff --git a/osfmk/corecrypto/ccsha1/src/ccsha1_internal.h b/osfmk/corecrypto/ccsha1/src/ccsha1_internal.h
deleted file mode 100644 (file)
index 323bbb2..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- *  ccsha1_internal.h
- *  corecrypto
- *
- *  Created on 12/19/2017
- *
- *  Copyright (c) 2017 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#ifndef _CORECRYPTO_CCSHA1_INTERNAL_H_
-#define _CORECRYPTO_CCSHA1_INTERNAL_H_
-
-#include <corecrypto/ccdigest.h>
-#include <corecrypto/cc_config.h>
-
-extern const uint32_t ccsha1_initial_state[5];
-
-#if CCSHA1_VNG_INTEL && defined(__x86_64__)
-extern const struct ccdigest_info ccsha1_vng_intel_AVX2_di;
-extern const struct ccdigest_info ccsha1_vng_intel_AVX1_di;
-#endif
-
-#endif /* _CORECRYPTO_CCSHA1_INTERNAL_H_ */
index d31a9402c878c55e6a37d8d78a9077ee966cc535..2a61b8413b9bc3613877f5a36fc3f18d23c1bd4b 100644 (file)
@@ -47,16 +47,20 @@ ccsha256_di(void)
 #if defined (__x86_64__)
        if (CC_HAS_AVX512_AND_IN_KERNEL()) {
                return &ccsha256_vng_intel_SupplementalSSE3_di;
-       } else {
-               return CC_HAS_AVX2() ? &ccsha256_vng_intel_AVX2_di :
-                      ((CC_HAS_AVX1() ? &ccsha256_vng_intel_AVX1_di :
-                      &ccsha256_vng_intel_SupplementalSSE3_di));
+       } else
+#if CC_ACCELERATECRYPTO
+       { return &ccsha256_vng_intel_di; // use AccelerateCrypto
        }
+#else
+       { return CC_HAS_AVX2() ? &ccsha256_vng_intel_AVX2_di :
+                ((CC_HAS_AVX1() ? &ccsha256_vng_intel_AVX1_di :
+                &ccsha256_vng_intel_SupplementalSSE3_di)); }
+#endif
 #else
        return &ccsha256_vng_intel_SupplementalSSE3_di;
 #endif
-#elif  CCSHA2_VNG_ARMV7NEON
-       return &ccsha256_vng_armv7neon_di;
+#elif  CCSHA2_VNG_ARM
+       return &ccsha256_vng_arm_di;
 #elif CCSHA256_ARMV6M_ASM
        return &ccsha256_v6m_di;
 #else
index 6c84aa4d442c0fe543bc3cec19ad66bb45ee579b..0b6ea34de6d90f4c86c208d602b8e0db3b6caa21 100644 (file)
 
 #if !CC_KERNEL || !CC_USE_ASM
 
-// Various logical functions
-#define Ch(x, y, z)       (z ^ (x & (y ^ z)))
-#define Maj(x, y, z)      (((x | y) & z) | (x & y))
-#define S(x, n)         ror((x),(n))
-#define R(x, n)         ((x)>>(n))
-
-#define Sigma0(x)       (S(x, 2) ^ S(x, 13) ^ S(x, 22))
-#define Sigma1(x)       (S(x, 6) ^ S(x, 11) ^ S(x, 25))
-
-#define Gamma0(x)       (S(x, 7)  ^ S(x, 18) ^ R(x, 3))
-#define Gamma1(x)       (S(x, 17) ^ S(x, 19) ^ R(x, 10))
-
-//It is better if the following macros are defined as inline functions,
-//but I found some compilers do not inline them.
-#ifdef __CC_ARM
-    #define ror(val, shift) __ror(val,shift)
+#if CCSHA2_SHA256_USE_SHA512_K
+#define K(i) ((uint32_t)(ccsha512_K[i] >> 32))
 #else
-    #define ror(val, shift) ((val >> shift) | (val << (32 - shift)))
+#define K(i) ccsha256_K[i]
 #endif
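This K(i) indirection is sound because both constant tables derive from the same numbers: the SHA-2 round constants are fractional parts of the cube roots of the first primes, so each of SHA-256's 64 constants is exactly the high 32 bits of the corresponding SHA-512 constant. CC_SMALL_CODE builds that already link the 64-bit table can therefore drop the 32-bit one. A quick self-check sketch, assuming the usual table declarations from the internal headers:

#include <assert.h>
#include <stdint.h>

extern const uint32_t ccsha256_K[64]; // assumed declarations
extern const uint64_t ccsha512_K[80];

static void check_shared_constants(void)
{
	for (int i = 0; i < 64; i++) {
		// e.g. i == 0: 0x428a2f98d728ae22 >> 32 == 0x428a2f98
		assert((uint32_t)(ccsha512_K[i] >> 32) == ccsha256_K[i]);
	}
}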
 
-#ifdef __CC_ARM
-    #define byte_swap32(x) __rev(x)
-#elif defined(__clang__) && !defined(_MSC_VER)
-    #define byte_swap32(x) __builtin_bswap32(x);
-#else
-   #define byte_swap32(x) ((ror(x, 8) & 0xff00ff00) | (ror(x, 24) & 0x00ff00ff))
-#endif
+// Various logical functions
+#define Ch(x, y, z) (z ^ (x & (y ^ z)))
+#define Maj(x, y, z) (((x | y) & z) | (x & y))
+#define S(x, n) CC_RORc(x, n)
+#define R(x, n) ((x) >> (n))
+#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
+#define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
+#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
+#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
 
-#if CC_HANDLE_UNALIGNED_DATA
-    #define set_W(i) CC_LOAD32_BE(W[i], buf + (4*(i)))
-#else
-    #define set_W(i) W[i] = byte_swap32(buf[i])
-#endif
+#define set_W(i) CC_LOAD32_BE(W[i], buf + (4 * (i)))
 
 // the round function
-#define RND(a, b, c, d, e, f, g, h, i)                                 \
-    t0 = h + Sigma1(e) + Ch(e, f, g) + ccsha256_K[i] + W[i];   \
-    t1 = Sigma0(a) + Maj(a, b, c);                             \
-    d += t0;                                                   \
-    h  = t0 + t1;
+#define RND(a, b, c, d, e, f, g, h, i)              \
+    t0 = h + Sigma1(e) + Ch(e, f, g) + K(i) + W[i]; \
+    t1 = Sigma0(a) + Maj(a, b, c);                  \
+    d += t0;                                        \
+    h = t0 + t1;
 
 // compress 512-bits
 void
 ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *in)
 {
        uint32_t W[64], t0, t1;
-       uint32_t S0, S1, S2, S3, S4, S5, S6, S7;
+       uint32_t S[8];
        int i;
        uint32_t *s = ccdigest_u32(state);
-#if CC_HANDLE_UNALIGNED_DATA
        const unsigned char *buf = in;
-#else
-       const uint32_t *buf = in;
-#endif
 
        while (nblocks--) {
                // schedule W 0..15
-               set_W(0); set_W(1); set_W(2); set_W(3); set_W(4); set_W(5); set_W(6); set_W(7);
-               set_W(8); set_W(9); set_W(10); set_W(11); set_W(12); set_W(13); set_W(14); set_W(15);
+               for (i = 0; i < 16; i += 1) {
+                       set_W(i);
+               }
 
                // schedule W 16..63
-               for (i = 16; i < 64; i++) {
+               for (; i < 64; i++) {
                        W[i] = Gamma1(W[i - 2]) + W[i - 7] + Gamma0(W[i - 15]) + W[i - 16];
                }
 
                // copy state into S
-               S0 = s[0];
-               S1 = s[1];
-               S2 = s[2];
-               S3 = s[3];
-               S4 = s[4];
-               S5 = s[5];
-               S6 = s[6];
-               S7 = s[7];
+               S[0] = s[0];
+               S[1] = s[1];
+               S[2] = s[2];
+               S[3] = s[3];
+               S[4] = s[4];
+               S[5] = s[5];
+               S[6] = s[6];
+               S[7] = s[7];
 
                // Compress
+#if CC_SMALL_CODE
+               for (i = 0; i < 64; i += 1) {
+                       t0 = S[7] + Sigma1(S[4]) + Ch(S[4], S[5], S[6]) + K(i) + W[i];
+                       t1 = Sigma0(S[0]) + Maj(S[0], S[1], S[2]);
+                       S[7] = S[6];
+                       S[6] = S[5];
+                       S[5] = S[4];
+                       S[4] = S[3] + t0;
+                       S[3] = S[2];
+                       S[2] = S[1];
+                       S[1] = S[0];
+                       S[0] = t0 + t1;
+               }
+#else
                for (i = 0; i < 64; i += 8) {
-                       RND(S0, S1, S2, S3, S4, S5, S6, S7, i + 0);
-                       RND(S7, S0, S1, S2, S3, S4, S5, S6, i + 1);
-                       RND(S6, S7, S0, S1, S2, S3, S4, S5, i + 2);
-                       RND(S5, S6, S7, S0, S1, S2, S3, S4, i + 3);
-                       RND(S4, S5, S6, S7, S0, S1, S2, S3, i + 4);
-                       RND(S3, S4, S5, S6, S7, S0, S1, S2, i + 5);
-                       RND(S2, S3, S4, S5, S6, S7, S0, S1, i + 6);
-                       RND(S1, S2, S3, S4, S5, S6, S7, S0, i + 7);
+                       RND(S[0], S[1], S[2], S[3], S[4], S[5], S[6], S[7], i + 0);
+                       RND(S[7], S[0], S[1], S[2], S[3], S[4], S[5], S[6], i + 1);
+                       RND(S[6], S[7], S[0], S[1], S[2], S[3], S[4], S[5], i + 2);
+                       RND(S[5], S[6], S[7], S[0], S[1], S[2], S[3], S[4], i + 3);
+                       RND(S[4], S[5], S[6], S[7], S[0], S[1], S[2], S[3], i + 4);
+                       RND(S[3], S[4], S[5], S[6], S[7], S[0], S[1], S[2], i + 5);
+                       RND(S[2], S[3], S[4], S[5], S[6], S[7], S[0], S[1], i + 6);
+                       RND(S[1], S[2], S[3], S[4], S[5], S[6], S[7], S[0], i + 7);
                }
+#endif
 
                // feedback
-               s[0] += S0;
-               s[1] += S1;
-               s[2] += S2;
-               s[3] += S3;
-               s[4] += S4;
-               s[5] += S5;
-               s[6] += S6;
-               s[7] += S7;
+               s[0] += S[0];
+               s[1] += S[1];
+               s[2] += S[2];
+               s[3] += S[3];
+               s[4] += S[4];
+               s[5] += S[5];
+               s[6] += S[6];
+               s[7] += S[7];
 
                buf += CCSHA256_BLOCK_SIZE / sizeof(buf[0]);
        }
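The K(i) indirection is what lets CC_SMALL_CODE builds drop the 256-byte ccsha256_K table: both constant tables are derived from the fractional parts of the cube roots of the first primes, so each 32-bit SHA-256 constant is exactly the top half of the corresponding 64-bit SHA-512 constant. A standalone sketch (not xnu code) that checks the relationship for the first prime:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        long double f = cbrtl(2.0L);             /* cube root of the first prime */
        f -= floorl(f);                          /* keep the fractional part */
        uint32_t k256 = (uint32_t)ldexpl(f, 32); /* first 32 fraction bits */
        uint64_t k512 = (uint64_t)ldexpl(f, 64); /* first 64 fraction bits */

        /* Both print 428a2f98: K(0) equals ccsha512_K[0] >> 32. */
        printf("%08x %08x\n", k256, (uint32_t)(k512 >> 32));
        return 0;
}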
index 7bf64bc04b86074bb7143b7d10685da7279a2d77..5a174ab687029b2ecd82169d0383e0fcd9a3612f 100644 (file)
 
 #include <corecrypto/ccdigest.h>
 
+#ifndef CCSHA2_DISABLE_SHA512
+#define CCSHA2_DISABLE_SHA512 0
+#endif
+
+#define CCSHA2_SHA256_USE_SHA512_K (CC_SMALL_CODE && !CCSHA2_DISABLE_SHA512)
+
 extern const struct ccdigest_info ccsha256_v6m_di;
 void ccsha256_v6m_compress(ccdigest_state_t state, size_t nblocks, const void *buf);
 
@@ -45,12 +51,20 @@ void ccsha512_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *i
 
 #if  CCSHA2_VNG_INTEL
 #if defined __x86_64__
-void ccsha256_vng_intel_avx2_compress(ccdigest_state_t state, size_t nblocks, const void *in);
-void ccsha256_vng_intel_avx1_compress(ccdigest_state_t state, size_t nblocks, const void *in);
-void ccsha256_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, const void *in);
-void ccsha512_vng_intel_avx2_compress(ccdigest_state_t state, size_t nblocks, const void *in);
-void ccsha512_vng_intel_avx1_compress(ccdigest_state_t state, size_t nblocks, const void *in);
-void ccsha512_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, const void *in);
+void ccsha256_vng_intel_avx2_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha256_vng_intel_avx2_compress");
+void ccsha256_vng_intel_avx1_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha256_vng_intel_avx1_compress");
+void ccsha256_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha256_vng_intel_sse3_compress");
+void ccsha512_vng_intel_avx2_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha512_vng_intel_avx2_compress");
+void ccsha512_vng_intel_avx1_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha512_vng_intel_avx1_compress");
+void ccsha512_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha512_vng_intel_ssse3_compress");
+
+#if CC_ACCELERATECRYPTO
+// AccelerateCrypto
+extern const struct ccdigest_info ccsha224_vng_intel_di;
+extern const struct ccdigest_info ccsha256_vng_intel_di;
+extern const struct ccdigest_info ccsha384_vng_intel_di;
+extern const struct ccdigest_info ccsha512_vng_intel_di;
+#endif
 
 extern const struct ccdigest_info ccsha224_vng_intel_AVX2_di;
 extern const struct ccdigest_info ccsha224_vng_intel_AVX1_di;
@@ -63,14 +77,7 @@ extern const struct ccdigest_info ccsha512_vng_intel_AVX2_di;
 extern const struct ccdigest_info ccsha512_vng_intel_AVX1_di;
 extern const struct ccdigest_info ccsha512_vng_intel_SupplementalSSE3_di;
 #endif
-void ccsha256_vng_intel_sse3_compress(ccdigest_state_t state, size_t nblocks, const void *in);
-#endif
-
-#if  CCSHA2_VNG_ARMV7NEON
-extern const struct ccdigest_info ccsha384_vng_arm64_di;
-extern const struct ccdigest_info ccsha384_vng_armv7neon_di;
-extern const struct ccdigest_info ccsha512_vng_arm64_di;
-extern const struct ccdigest_info ccsha512_vng_armv7neon_di;
+void ccsha256_vng_intel_sse3_compress(ccdigest_state_t state, size_t nblocks, const void *in) __asm__("_ccsha256_vng_intel_sse3_compress");
 #endif
 
 extern const uint32_t ccsha256_K[64];
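The __asm__("...") suffixes added above pin the assembler-level symbol each declaration binds to, so the C identifier can differ from the label exported by the hand-written assembly routine. A minimal sketch of the mechanism (hypothetical names):

#include <stdio.h>

/* The asm label, not the C identifier, is what the object file exports. */
int add_impl(int a, int b) __asm__("renamed_add");

int
add_impl(int a, int b)          /* emitted under the label "renamed_add" */
{
        return a + b;
}

int
main(void)
{
        printf("%d\n", add_impl(2, 3));   /* prints 5 */
        return 0;
}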
index 73448a1f5717d8d77db4067a25eba0780e53e2cb..040f331cee70d2cacf7e37ca7327bc4aaa410030 100644 (file)
@@ -407,7 +407,7 @@ task_generate_corpse(
        ipc_port_t corpse_port;
        ipc_port_t old_notify;
 
-       if (task == kernel_task || task == TASK_NULL || task == current_task()) {
+       if (task == kernel_task || task == TASK_NULL) {
                return KERN_INVALID_ARGUMENT;
        }
 
@@ -446,7 +446,7 @@ task_generate_corpse(
        assert(IP_NULL != corpse_port);
 
        ip_lock(corpse_port);
-       assert(ip_active(corpse_port));
+       require_ip_active(corpse_port);
        ipc_port_nsrequest(corpse_port, corpse_port->ip_mscount, ipc_port_make_sonce_locked(corpse_port), &old_notify);
        /* port unlocked */
 
@@ -579,6 +579,7 @@ task_generate_corpse_internal(
            is_64bit_data,
            t_flags,
            TPF_NONE,
+           TWF_NONE,
            &new_task);
        if (kr != KERN_SUCCESS) {
                goto error_task_generate_corpse;
index a264a60a9ff6392f0e9af6a8688ec8372b63add3..c51a3bbd0640ae88586b6b88c4715680a059835f 100644 (file)
@@ -75,7 +75,7 @@ extern kern_return_t task_crashinfo_destroy(kcdata_descriptor_t data);
 
 extern void corpses_init(void);
 
-extern unsigned long total_corpses_count(void);
+extern unsigned long total_corpses_count(void) __attribute__((pure));
 extern boolean_t corpses_enabled(void);
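The __attribute__((pure)) added to total_corpses_count() promises the compiler that the call has no side effects and that its result depends only on readable global state, so repeated calls with no intervening stores may be merged. A hedged sketch of the idiom (hypothetical names):

static unsigned long ncorpses_sketch;   /* stands in for the real counter */

/* Pure: no side effects; result depends only on arguments and globals. */
unsigned long count_sketch(void) __attribute__((pure));

unsigned long
count_sketch(void)
{
        return ncorpses_sketch;
}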
 
 extern kern_return_t task_generate_corpse_internal(
index f7cad85e26a3ea8f7dc5e8ec1880da024ff2f877..1a6d194dfa956b3600f62b1a2e08b59147ecf284 100644 (file)
@@ -39,7 +39,7 @@ ${MIGINCLUDES} : ${MIG_TYPES}
 
 ${MIG_UUHDRS} : \
        %.h : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)$(MIG) $(MIGFLAGS)         \
                -server /dev/null       \
                -user /dev/null         \
@@ -48,7 +48,7 @@ ${MIG_UUHDRS} : \
 
 ${MIG_USHDRS} : \
        %_server.h : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)$(MIG) $(MIGFLAGS)         \
                -server /dev/null       \
                -user /dev/null         \
@@ -99,7 +99,7 @@ ${COMP_FILES} : ${MIG_TYPES}
 
 ${MIG_KUSRC} : \
        %_user.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS}        \
                -user    $*_user.c              \
                -header  $*.h              \
@@ -109,7 +109,7 @@ ${MIG_KUSRC} : \
 
 ${MIG_KSSRC}: \
        %_server.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS}        \
                -user    /dev/null              \
                -header  /dev/null              \
index c6e070a05ba6d6af06b9e402225791433b17ee47..2f23d12310ff4a95a4d7fedaa02eb3e83530a182 100644 (file)
@@ -45,7 +45,7 @@ COMP_FILES    = ${DEVICE_FILES}
 do_build_all:: $(COMP_FILES)
 
 ${DEVICE_FILES}: device.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS}   \
        -header /dev/null                       \
        -user /dev/null                         \
index c0a92c6e74dbeae468adc8a42765149093b677db..8b1fedd41154e8ba2b66df288b51655b48d7f29e 100644 (file)
@@ -90,12 +90,10 @@ lck_mtx_t iokit_obj_to_port_binding_lock;
 void
 device_service_create(void)
 {
-       master_device_port = ipc_port_alloc_kernel();
-       if (master_device_port == IP_NULL) {
-               panic("can't allocate master device port");
-       }
+       master_device_port = ipc_kobject_alloc_port(
+               (ipc_kobject_t)&master_device_kobject, IKOT_MASTER_DEVICE,
+               IPC_KOBJECT_ALLOC_NONE);
 
-       ipc_kobject_set(master_device_port, (ipc_kobject_t)&master_device_kobject, IKOT_MASTER_DEVICE);
        kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT,
            ipc_port_make_send(master_device_port));
 
index 625eabe0a9f0e30c42d1871ac850c598a2f6a946..e73e85c501fc5cf8145f1ae648c594796c57cbf2 100644 (file)
@@ -74,8 +74,21 @@ extern mach_port_t      master_device_port;
 #define DEVICE_PAGER_COHERENT           0x2
 #define DEVICE_PAGER_CACHE_INHIB        0x4
 #define DEVICE_PAGER_WRITE_THROUGH      0x8
-#define DEVICE_PAGER_EARLY_ACK          0x20
 #define DEVICE_PAGER_CONTIGUOUS         0x100
 #define DEVICE_PAGER_NOPHYSCACHE        0x200
 
+#ifdef XNU_KERNEL_PRIVATE
+#include <vm/memory_types.h>
+
+_Static_assert(((DEVICE_PAGER_CONTIGUOUS | DEVICE_PAGER_NOPHYSCACHE) & VM_WIMG_MASK) == 0,
+    "device pager flags overlap WIMG mask");
+
+/* Assert on the backwards-compatible DEVICE_PAGER* values */
+_Static_assert(DEVICE_PAGER_GUARDED == VM_MEM_GUARDED, "DEVICE_PAGER_GUARDED != VM_MEM_GUARDED");
+_Static_assert(DEVICE_PAGER_COHERENT == VM_MEM_COHERENT, "DEVICE_PAGER_COHERENT != VM_MEM_COHERENT");
+_Static_assert(DEVICE_PAGER_CACHE_INHIB == VM_MEM_NOT_CACHEABLE, "DEVICE_PAGER_CACHE_INHIB != VM_MEM_NOT_CACHEABLE");
+_Static_assert(DEVICE_PAGER_WRITE_THROUGH == VM_MEM_WRITE_THROUGH, "DEVICE_PAGER_WRITE_THROUGH != VM_MEM_WRITE_THROUGH");
+
+#endif /* XNU_KERNEL_PRIVATE */
+
 #endif  /* _DEVICE_DEVICE_PORT_H_ */
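The pattern above generalizes: when a public header must mirror flag values defined privately elsewhere, a _Static_assert converts any future drift into a build failure instead of a silent runtime mismatch. A minimal sketch with hypothetical names:

#define PUBLIC_FLAG_GUARDED    0x1   /* exported, ABI-frozen encoding */
#define INTERNAL_MEM_GUARDED   0x1   /* private, kernel-internal encoding */

_Static_assert(PUBLIC_FLAG_GUARDED == INTERNAL_MEM_GUARDED,
    "public and internal flag encodings diverged");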
index 28a649ce8316186a5bb57769c20dc3259fc92f13..f1cc26e1e2916d391fef5bfcc32d7063ff98d151 100644 (file)
@@ -72,7 +72,7 @@
 #include <mach/port.h>
 
 #if PRIVATE
-#define IOKIT_SERVER_VERSION    20150715
+#define IOKIT_SERVER_VERSION    20190423
 #endif
 
 
@@ -121,12 +121,14 @@ typedef uint64_t                io_async_ref64_t[8];
 
 typedef struct IOObject * io_object_t;
 typedef io_object_t io_connect_t;
+typedef io_object_t uext_object_t;
 
 extern void iokit_remove_reference( io_object_t obj );
 extern void iokit_remove_connect_reference( io_object_t obj );
 
 extern io_object_t iokit_lookup_object_port( ipc_port_t port );
 extern io_connect_t iokit_lookup_connect_port( ipc_port_t port );
+extern uext_object_t iokit_lookup_uext_object_port( ipc_port_t port );
 
 extern ipc_port_t iokit_make_object_port( io_object_t obj );
 extern ipc_port_t iokit_make_connect_port( io_connect_t obj );
index 487eee3365eabc82023b5c86d05a9acedda4008b..15866381eeef43ee44971b7f7fef2f7db23aefce 100644 (file)
@@ -75,7 +75,7 @@
  * Lookup a device by its port.
  * Doesn't consume the naked send right; produces a device reference.
  */
-static io_object_t
+io_object_t
 iokit_lookup_io_object(ipc_port_t port, ipc_kobject_type_t type)
 {
        io_object_t     obj;
@@ -111,6 +111,13 @@ iokit_lookup_connect_port(
        return iokit_lookup_io_object(port, IKOT_IOKIT_CONNECT);
 }
 
+MIGEXTERN io_object_t
+iokit_lookup_uext_object_port(
+       ipc_port_t      port)
+{
+       return iokit_lookup_io_object(port, IKOT_UEXT_OBJECT);
+}
+
 static io_object_t
 iokit_lookup_object_in_space_with_port_name(mach_port_name_t name, ipc_kobject_type_t type, ipc_space_t space)
 {
@@ -120,16 +127,16 @@ iokit_lookup_object_in_space_with_port_name(mach_port_name_t name, ipc_kobject_t
                ipc_port_t port;
                kern_return_t kr;
 
-               kr = ipc_object_translate(space, name, MACH_PORT_RIGHT_SEND, (ipc_object_t *)&port);
+               kr = ipc_port_translate_send(space, name, &port);
 
                if (kr == KERN_SUCCESS) {
                        assert(IP_VALID(port));
-
+                       require_ip_active(port);
                        ip_reference(port);
                        ip_unlock(port);
 
                        iokit_lock_port(port);
-                       if (ip_active(port) && (ip_kotype(port) == type)) {
+                       if (ip_kotype(port) == type) {
                                obj = (io_object_t) port->ip_kobject;
                                iokit_add_reference(obj, type);
                        }
@@ -154,6 +161,12 @@ iokit_lookup_connect_ref_current_task(mach_port_name_t name)
        return iokit_lookup_object_in_space_with_port_name(name, IKOT_IOKIT_CONNECT, current_space());
 }
 
+EXTERN io_object_t
+iokit_lookup_uext_ref_current_task(mach_port_name_t name)
+{
+       return iokit_lookup_object_in_space_with_port_name(name, IKOT_UEXT_OBJECT, current_space());
+}
+
 EXTERN void
 iokit_retain_port( ipc_port_t port )
 {
@@ -233,32 +246,15 @@ int gIOKitPortCount;
 EXTERN ipc_port_t
 iokit_alloc_object_port( io_object_t obj, ipc_kobject_type_t type )
 {
-       ipc_port_t          notify;
-       ipc_port_t          port;
-
-       do {
-               /* Allocate port, keeping a reference for it. */
-               port = ipc_port_alloc_kernel();
-               if (port == IP_NULL) {
-                       continue;
-               }
-
-               /* set kobject & type */
-               ipc_kobject_set( port, (ipc_kobject_t) obj, type);
-
-               /* Request no-senders notifications on the port. */
-               ip_lock( port);
-               notify = ipc_port_make_sonce_locked( port);
-               ipc_port_nsrequest( port, 1, notify, &notify);
-               /* port unlocked */
-               assert( notify == IP_NULL);
-               gIOKitPortCount++;
-       } while (FALSE);
-
-       return port;
+       /* Allocate port, keeping a reference for it. */
+       gIOKitPortCount++;
+       ipc_kobject_alloc_options_t options = IPC_KOBJECT_ALLOC_NSREQUEST;
+       if (type == IKOT_IOKIT_CONNECT) {
+               options |= IPC_KOBJECT_ALLOC_IMMOVABLE_SEND;
+       }
+       return ipc_kobject_alloc_port((ipc_kobject_t) obj, type, options);
 }
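Read as a pattern, the rewrite folds allocation, kobject binding, and the no-senders request into one call whose behavior is selected by option flags, with connect ports additionally getting an immovable send right so the right cannot be moved out of the receiving task. A hedged caller sketch, assuming only the xnu-internal API visible in this diff:

/* Sketch: types and constants are the xnu-internal ones shown above. */
static ipc_port_t
alloc_kobject_port_sketch(ipc_kobject_t obj, ipc_kobject_type_t type)
{
        ipc_kobject_alloc_options_t opts = IPC_KOBJECT_ALLOC_NSREQUEST;

        if (type == IKOT_IOKIT_CONNECT) {
                opts |= IPC_KOBJECT_ALLOC_IMMOVABLE_SEND;  /* pin the send right */
        }
        return ipc_kobject_alloc_port(obj, type, opts);
}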
 
-
 EXTERN kern_return_t
 iokit_destroy_object_port( ipc_port_t port )
 {
@@ -304,8 +300,12 @@ iokit_make_send_right( task_t task, io_object_t obj, ipc_kobject_type_t type )
 
        if (IP_VALID( sendPort )) {
                kern_return_t   kr;
-               kr = ipc_object_copyout( task->itk_space, (ipc_object_t) sendPort,
-                   MACH_MSG_TYPE_PORT_SEND, TRUE, &name);
+               // Remove once <rdar://problem/45522961> is fixed.
+               // We need to make ith_knote NULL as ipc_object_copyout() uses
+               // thread-argument-passing and its value should not be garbage
+               current_thread()->ith_knote = ITH_KNOTE_NULL;
+               kr = ipc_object_copyout( task->itk_space, ip_to_object(sendPort),
+                   MACH_MSG_TYPE_PORT_SEND, NULL, NULL, &name);
                if (kr != KERN_SUCCESS) {
                        ipc_port_release_send( sendPort );
                        name = MACH_PORT_NULL;
@@ -339,7 +339,7 @@ iokit_no_senders( mach_no_senders_notification_t * notification )
        ipc_kobject_type_t  type = IKOT_NONE;
        ipc_port_t          notify;
 
-       port = (ipc_port_t) notification->not_header.msgh_remote_port;
+       port = notification->not_header.msgh_remote_port;
 
        // convert a port to io_object_t.
        if (IP_VALID(port)) {
@@ -349,7 +349,8 @@ iokit_no_senders( mach_no_senders_notification_t * notification )
                        type = ip_kotype( port );
                        if ((IKOT_IOKIT_OBJECT == type)
                            || (IKOT_IOKIT_CONNECT == type)
-                           || (IKOT_IOKIT_IDENT == type)) {
+                           || (IKOT_IOKIT_IDENT == type)
+                           || (IKOT_UEXT_OBJECT == type)) {
                                iokit_add_reference( obj, IKOT_IOKIT_OBJECT );
                        } else {
                                obj = NULL;
@@ -449,6 +450,10 @@ IOMapPages(vm_map_t map, mach_vm_address_t va, mach_vm_address_t pa,
        case kIOMapPostedWrite:
                flags = VM_WIMG_POSTED;
                break;
+
+       case kIOMapRealTimeCache:
+               flags = VM_WIMG_RT;
+               break;
        }
 
        pmap_set_cache_attributes(pagenum, flags);
@@ -513,6 +518,10 @@ IOProtectCacheMode(vm_map_t __unused map, mach_vm_address_t __unused va,
        case kIOMapPostedWrite:
                flags = VM_WIMG_POSTED;
                break;
+
+       case kIOMapRealTimeCache:
+               flags = VM_WIMG_RT;
+               break;
        }
 
        pmap_flush_context_init(&pmap_flush_context_storage);
index 83666b479e70e224069d00575d3e5d77e8e49081..3f42fac823cc44a509c724650b713df8bed25b01 100644 (file)
@@ -41,7 +41,7 @@ COMP_FILES = ${MIG_KUSRC}
 do_build_all:: $(COMP_FILES)
 
 ${MIG_KUSRC} : gssd_mach.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS}   \
                -user    gssd_mach.c            \
                -header  gssd_mach.h            \
index 19781ce3bb9aa2d13e339b79afa78ff791e35d59..f6b970194709183d2fd65b363bb582ac27136393 100644 (file)
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 
+#include <IOKit/IOBSD.h>
 #include <IOKit/IOPlatformExpert.h>
 #include <IOKit/IOHibernatePrivate.h>
 
@@ -198,11 +199,10 @@ extern void             kdp_snapshot_preflight(int pid, void *tracebuf,
     boolean_t enable_faulting);
 extern int              kdp_stack_snapshot_bytes_traced(void);
 
-#if DEVELOPMENT || DEBUG
 vm_offset_t panic_stackshot_buf = 0;
-size_t panic_stackshot_len = 0;
-#endif
+size_t panic_stackshot_buf_len = 0;
 
+size_t panic_stackshot_len = 0;
+
 /*
  * Backtrace a single frame.
  */
@@ -765,7 +765,8 @@ uint64_t panic_restart_timeout = ~(0ULL);
 void
 RecordPanicStackshot()
 {
-       int err = 0, bytes_traced = 0, bytes_used = 0, bytes_remaining = 0;
+       int err = 0;
+       size_t bytes_traced = 0, bytes_used = 0, bytes_remaining = 0;
        char *stackshot_begin_loc = NULL;
 
        /* Don't re-enter this code if we panic here */
@@ -786,136 +787,125 @@ RecordPanicStackshot()
                return;
        }
 
-       /*
-        * Try to capture an in memory panic_stackshot (enabled during boot
-        * on systems with co-processors).
-        */
-       if (extended_debug_log_enabled) {
-               if (stackshot_active()) {
-                       panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_NESTED;
-                       panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
-                       kdb_printf("Panicked during stackshot, skipping panic stackshot\n");
-                       return;
-               } else {
-                       stackshot_begin_loc = debug_buf_ptr;
-
-                       bytes_remaining = debug_buf_size - (unsigned int)((uintptr_t)stackshot_begin_loc - (uintptr_t)debug_buf_base);
-                       err = kcdata_memory_static_init(&kc_panic_data, (mach_vm_address_t)stackshot_begin_loc,
-                           KCDATA_BUFFER_BEGIN_STACKSHOT, bytes_remaining, KCFLAG_USE_MEMCOPY);
-                       if (err != KERN_SUCCESS) {
-                               panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR;
-                               panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
-                               kdb_printf("Failed to initialize kcdata buffer for in-memory panic stackshot, skipping ...\n");
-                               return;
-                       }
-
-                       kdp_snapshot_preflight(-1, (void *) stackshot_begin_loc, bytes_remaining,
-                           (STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_SAVE_LOADINFO | STACKSHOT_KCDATA_FORMAT |
-                           STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_FROM_PANIC |
-                           STACKSHOT_NO_IO_STATS | STACKSHOT_THREAD_WAITINFO), &kc_panic_data, 0);
-                       err = do_stackshot(NULL);
-                       bytes_traced = (int) kdp_stack_snapshot_bytes_traced();
-                       bytes_used = (int) kcdata_memory_get_used_bytes(&kc_panic_data);
-
-                       if ((err != KERN_SUCCESS) && (bytes_used > 0)) {
-                               /*
-                                * We ran out of space while trying to capture a stackshot, try again without user frames.
-                                * It's not safe to log from here, but append a flag to the panic flags.
-                                */
-                               panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_KERNEL_ONLY;
-                               panic_stackshot_reset_state();
-
-                               /* Erase the stackshot data (this region is pre-populated with the NULL character) */
-                               memset(stackshot_begin_loc, '\0', bytes_used);
-
-                               err = kcdata_memory_static_init(&kc_panic_data, (mach_vm_address_t)stackshot_begin_loc,
-                                   KCDATA_BUFFER_BEGIN_STACKSHOT, bytes_remaining, KCFLAG_USE_MEMCOPY);
-                               if (err != KERN_SUCCESS) {
-                                       panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR;
-                                       panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
-                                       kdb_printf("Failed to re-initialize kcdata buffer for kernel only in-memory panic stackshot, skipping ...\n");
-                                       return;
-                               }
-
-                               kdp_snapshot_preflight(-1, (void *) stackshot_begin_loc, bytes_remaining, (STACKSHOT_KCDATA_FORMAT |
-                                   STACKSHOT_NO_IO_STATS | STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY |
-                                   STACKSHOT_FROM_PANIC | STACKSHOT_THREAD_WAITINFO), &kc_panic_data, 0);
-                               err = do_stackshot(NULL);
-                               bytes_traced = (int) kdp_stack_snapshot_bytes_traced();
-                               bytes_used = (int) kcdata_memory_get_used_bytes(&kc_panic_data);
-                       }
-
-                       if (err == KERN_SUCCESS) {
-                               debug_buf_ptr += bytes_traced;
-                               panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_SUCCEEDED;
-                               panic_info->mph_stackshot_offset = PE_get_offset_into_panic_region(stackshot_begin_loc);
-                               panic_info->mph_stackshot_len = bytes_traced;
-
-                               panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
-                               kdb_printf("\n** In Memory Panic Stackshot Succeeded ** Bytes Traced %d **\n", bytes_traced);
-                       } else {
-                               if (bytes_used > 0) {
-                                       /* Erase the stackshot data (this region is pre-populated with the NULL character) */
-                                       memset(stackshot_begin_loc, '\0', bytes_used);
-                                       panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_INCOMPLETE;
-
-                                       panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
-                                       kdb_printf("\n** In Memory Panic Stackshot Incomplete ** Bytes Filled %d ** Err %d\n", bytes_used, err);
-                               } else {
-                                       bzero(stackshot_begin_loc, bytes_used);
-                                       panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR;
-
-                                       panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
-                                       kdb_printf("\n** In Memory Panic Stackshot Failed ** Bytes Traced %d, err %d\n", bytes_traced, err);
-                               }
-                       }
-               }
-#if DEVELOPMENT || DEBUG
-               if (panic_stackshot_buf != 0) {
-                       /* We're going to try to take another stackshot, reset the state. */
-                       panic_stackshot_reset_state();
-               }
-#endif /* DEVELOPMENT || DEBUG */
-       } else {
+       if (stackshot_active()) {
+               panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_NESTED;
                panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
-       }
-
-#if DEVELOPMENT || DEBUG
-
-       if (panic_stackshot_buf == 0) {
-               kdb_printf("No stackshot buffer allocated for file backed panic stackshot, skipping...\n");
+               kdb_printf("Panicked during stackshot, skipping panic stackshot\n");
                return;
        }
 
-       if (stackshot_active()) {
-               kdb_printf("Panicked during stackshot, skipping file backed panic stackshot\n");
+       /* Try to capture an in memory panic_stackshot */
+       if (extended_debug_log_enabled) {
+               /* On coprocessor systems we write this into the extended debug log */
+               stackshot_begin_loc = debug_buf_ptr;
+               bytes_remaining = debug_buf_size - (unsigned int)((uintptr_t)stackshot_begin_loc - (uintptr_t)debug_buf_base);
+       } else if (panic_stackshot_buf != 0) {
+               /* On other systems we use the panic stackshot_buf */
+               stackshot_begin_loc = (char *) panic_stackshot_buf;
+               bytes_remaining = panic_stackshot_buf_len;
+       } else {
+               panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
                return;
        }
 
-       err = kcdata_memory_static_init(&kc_panic_data, (mach_vm_address_t)panic_stackshot_buf, KCDATA_BUFFER_BEGIN_STACKSHOT,
-           PANIC_STACKSHOT_BUFSIZE, KCFLAG_USE_MEMCOPY);
+
+       err = kcdata_memory_static_init(&kc_panic_data, (mach_vm_address_t)stackshot_begin_loc,
+           KCDATA_BUFFER_BEGIN_STACKSHOT, (unsigned int) bytes_remaining, KCFLAG_USE_MEMCOPY);
        if (err != KERN_SUCCESS) {
-               kdb_printf("Failed to initialize kcdata buffer for file backed panic stackshot, skipping ...\n");
+               panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR;
+               panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
+               kdb_printf("Failed to initialize kcdata buffer for in-memory panic stackshot, skipping ...\n");
                return;
        }
 
-       kdp_snapshot_preflight(-1, (void *) panic_stackshot_buf, PANIC_STACKSHOT_BUFSIZE, (STACKSHOT_GET_GLOBAL_MEM_STATS | STACKSHOT_SAVE_LOADINFO | STACKSHOT_KCDATA_FORMAT |
-           STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_FROM_PANIC | STACKSHOT_NO_IO_STATS
-           | STACKSHOT_THREAD_WAITINFO), &kc_panic_data, 0);
+       uint32_t stackshot_flags = (STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_SAVE_LOADINFO | STACKSHOT_KCDATA_FORMAT |
+           STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_FROM_PANIC |
+           STACKSHOT_NO_IO_STATS | STACKSHOT_THREAD_WAITINFO);
+#if DEVELOPMENT
+       /*
+        * Include the shared cache layout in panic stackshots on DEVELOPMENT kernels so that we can symbolicate
+        * panic stackshots from corefiles.
+        */
+       stackshot_flags |= STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT;
+#endif
+
+       kdp_snapshot_preflight(-1, (void *) stackshot_begin_loc, (uint32_t) bytes_remaining, stackshot_flags, &kc_panic_data, 0);
        err = do_stackshot(NULL);
        bytes_traced = (int) kdp_stack_snapshot_bytes_traced();
-       if (bytes_traced > 0 && !err) {
+       bytes_used = (int) kcdata_memory_get_used_bytes(&kc_panic_data);
+
+       if ((err != KERN_SUCCESS) && (bytes_used > 0)) {
+               /*
+                * We ran out of space while trying to capture a stackshot, try again without user frames.
+                * It's not safe to log from here (in case we're writing in the middle of the debug buffer on coprocessor systems)
+                * but append a flag to the panic flags.
+                */
+               panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_KERNEL_ONLY;
+               panic_stackshot_reset_state();
+
+               /* Erase the stackshot data (this region is pre-populated with the NULL character) */
+               memset(stackshot_begin_loc, '\0', bytes_used);
+
+               err = kcdata_memory_static_init(&kc_panic_data, (mach_vm_address_t)stackshot_begin_loc,
+                   KCDATA_BUFFER_BEGIN_STACKSHOT, (unsigned int) bytes_remaining, KCFLAG_USE_MEMCOPY);
+               if (err != KERN_SUCCESS) {
+                       panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR;
+                       panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
+                       kdb_printf("Failed to re-initialize kcdata buffer for kernel only in-memory panic stackshot, skipping ...\n");
+                       return;
+               }
+
+               stackshot_flags = (STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_KCDATA_FORMAT | STACKSHOT_FROM_PANIC |
+                   STACKSHOT_NO_IO_STATS | STACKSHOT_THREAD_WAITINFO | STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY);
+#if DEVELOPMENT
+               /*
+                * Include the shared cache layout in panic stackshots on DEVELOPMENT kernels so that we can symbolicate
+                * panic stackshots from corefiles.
+                */
+               stackshot_flags |= STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT;
+#endif
+
+               kdp_snapshot_preflight(-1, (void *) stackshot_begin_loc, (uint32_t) bytes_remaining, stackshot_flags, &kc_panic_data, 0);
+               err = do_stackshot(NULL);
+               bytes_traced = (int) kdp_stack_snapshot_bytes_traced();
+               bytes_used = (int) kcdata_memory_get_used_bytes(&kc_panic_data);
+       }
+
+       if (err == KERN_SUCCESS) {
+               if (extended_debug_log_enabled) {
+                       debug_buf_ptr += bytes_traced;
+               }
+               panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_SUCCEEDED;
+               panic_info->mph_stackshot_offset = PE_get_offset_into_panic_region(stackshot_begin_loc);
+               panic_info->mph_stackshot_len = (uint32_t) bytes_traced;
+
+               panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
+               kdb_printf("\n** In Memory Panic Stackshot Succeeded ** Bytes Traced %zu **\n", bytes_traced);
+
+               /* Used by the code that writes the buffer to disk */
+               panic_stackshot_buf = (vm_offset_t) stackshot_begin_loc;
                panic_stackshot_len = bytes_traced;
-               kdb_printf("File backed panic stackshot succeeded, length: %u bytes\n", bytes_traced);
+
+               if (!extended_debug_log_enabled && (gIOPolledCoreFileMode == kIOPolledCoreFileModeStackshot)) {
+                       /* System configured to write panic stackshot to disk */
+                       kern_dump(KERN_DUMP_STACKSHOT_DISK);
+               }
        } else {
-               bytes_used = (int) kcdata_memory_get_used_bytes(&kc_panic_data);
                if (bytes_used > 0) {
-                       kdb_printf("File backed panic stackshot incomplete, consumed %u bytes, error : %d \n", bytes_used, err);
+                       /* Erase the stackshot data (this region is pre-populated with the NULL character) */
+                       memset(stackshot_begin_loc, '\0', bytes_used);
+                       panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_INCOMPLETE;
+
+                       panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
+                       kdb_printf("\n** In Memory Panic Stackshot Incomplete ** Bytes Filled %zu ** Err %d\n", bytes_used, err);
                } else {
-                       kdb_printf("File backed panic stackshot incomplete, consumed %u bytes, error : %d \n", bytes_used, err);
+                       bzero(stackshot_begin_loc, bytes_used);
+                       panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_STACKSHOT_FAILED_ERROR;
+
+                       panic_info->mph_other_log_offset = PE_get_offset_into_panic_region(debug_buf_ptr);
+                       kdb_printf("\n** In Memory Panic Stackshot Failed ** Bytes Traced %zu, err %d\n", bytes_traced, err);
                }
        }
-#endif /* DEVELOPMENT || DEBUG */
 
        return;
 }
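Stripped of the kcdata plumbing, the flow above is capture-with-fallback: attempt the full stackshot, and on overflow erase the partial output and retry in a cheaper kernel-only mode. A self-contained sketch of that pattern (the capture callback is hypothetical, not the xnu stackshot API):

#include <stddef.h>
#include <string.h>

/* Returns 0 on success, nonzero on overflow; *used reports bytes written. */
typedef int (*capture_fn)(char *buf, size_t len, int kernel_only, size_t *used);

static int
capture_with_fallback(capture_fn cap, char *buf, size_t len, size_t *used)
{
        int err = cap(buf, len, 0, used);          /* full capture first */

        if (err != 0 && *used > 0) {
                memset(buf, '\0', *used);          /* erase the partial output */
                err = cap(buf, len, 1, used);      /* kernel-only retry */
        }
        return err;
}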
@@ -991,9 +981,7 @@ SavePanicInfo(
         * Flush the panic log again with the stackshot or any relevant logging
         * from when we tried to capture it.
         */
-       if (extended_debug_log_enabled) {
-               paniclog_flush_internal(kPaniclogFlushStackshot);
-       }
+       paniclog_flush_internal(kPaniclogFlushStackshot);
 }
 
 void
@@ -1261,7 +1249,7 @@ panic_i386_backtrace(void *_frame, int nframes, const char *msg, boolean_t regdu
        boolean_t old_doprnt_hide_pointers = doprnt_hide_pointers;
 
        if (pbtcpu != cn) {
-               hw_atomic_add(&pbtcnt, 1);
+               os_atomic_inc(&pbtcnt, relaxed);
                /* Spin on print backtrace lock, which serializes output
                 * Continue anyway if a timeout occurs.
                 */
@@ -1382,7 +1370,7 @@ out:
         * event of panics on multiple processors.
         */
        hw_lock_unlock(&pbtlock);
-       hw_atomic_sub(&pbtcnt, 1);
+       os_atomic_dec(&pbtcnt, relaxed);
        /* Wait for other processors to complete output
         * Timeout and continue after PBT_TIMEOUT_CYCLES.
         */
@@ -1562,7 +1550,7 @@ print_launchd_info(void)
        int             cn = cpu_number();
 
        if (pbtcpu != cn) {
-               hw_atomic_add(&pbtcnt, 1);
+               os_atomic_inc(&pbtcnt, relaxed);
                /* Spin on print backtrace lock, which serializes output
                 * Continue anyway if a timeout occurs.
                 */
@@ -1581,7 +1569,7 @@ print_launchd_info(void)
         * event of panics on multiple processors.
         */
        hw_lock_unlock(&pbtlock);
-       hw_atomic_sub(&pbtcnt, 1);
+       os_atomic_dec(&pbtcnt, relaxed);
        /* Wait for other processors to complete output
         * Timeout and continue after PBT_TIMEOUT_CYCLES.
         */
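os_atomic_inc and os_atomic_dec with relaxed ordering correspond to C11 fetch-add/sub with memory_order_relaxed: the update is atomic across CPUs but imposes no ordering on surrounding accesses, which is all this in-flight counter needs. A C11 sketch of the equivalent:

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t pbtcnt_sketch;   /* stands in for pbtcnt */

static inline void
enter_backtrace(void)
{
        atomic_fetch_add_explicit(&pbtcnt_sketch, 1, memory_order_relaxed);
}

static inline void
leave_backtrace(void)
{
        atomic_fetch_sub_explicit(&pbtcnt_sketch, 1, memory_order_relaxed);
}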
index 12e9c602505bfd67b06a0832f22148d05b5541b5..e46ad574887ec9d67ec7492b92dfad2b98183b1d 100644 (file)
@@ -25,6 +25,7 @@ EXPORT_ONLY_FILES =   \
                    locks_i386_inlines.h \
                    machine_routines.h \
                    machine_cpu.h \
+                   memory_types.h \
                    mtrr.h \
                    mp.h \
                    mp_desc.h \
@@ -32,7 +33,7 @@ EXPORT_ONLY_FILES =   \
                    pal_native.h \
                    pal_routines.h \
                    pal_hibernate.h \
-                       panic_hooks.h \
+                   panic_hooks.h \
                    pmCPU.h \
                    pmap.h \
                    proc_reg.h \
index 10ab92123f4390aa28d8e466c546e7c78be78c37..ee93c2eb8d27c78782773d01c73df0202babe796 100644 (file)
@@ -367,17 +367,21 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon)
        /* Restart timer interrupts */
        rtc_timer_start();
 
-#if HIBERNATION
 
+#if MONOTONIC
+       mt_cpu_up(cdp);
+#endif /* MONOTONIC */
+
+#if HIBERNATION
        kprintf("ret from acpi_sleep_cpu hib=%d\n", did_hibernate);
-#endif
+#endif /* HIBERNATION */
 
 #if CONFIG_SLEEP
        /* Because we don't save the bootstrap page, and we share it
         * between sleep and mp slave init, we need to recreate it
         * after coming back from sleep or hibernate */
        install_real_mode_bootstrap(slave_pstart);
-#endif
+#endif /* CONFIG_SLEEP */
 }
 
 /*
@@ -402,9 +406,17 @@ acpi_idle_kernel(acpi_sleep_callback func, void *refcon)
 
        assert(cpu_number() == master_cpu);
 
+#if MONOTONIC
+       mt_cpu_down(cpu_datap(0));
+#endif /* MONOTONIC */
+
        /* Cancel any pending deadline */
        setPop(0);
-       while (lapic_is_interrupting(LAPIC_TIMER_VECTOR)) {
+       while (lapic_is_interrupting(LAPIC_TIMER_VECTOR)
+#if MONOTONIC
+           || lapic_is_interrupting(LAPIC_VECTOR(PERFCNT))
+#endif /* MONOTONIC */
+           ) {
                (void) ml_set_interrupts_enabled(TRUE);
                setPop(0);
                ml_set_interrupts_enabled(FALSE);
index b04ac6a7d3ac0a32f08b03fa6603aeec26e89e69..50905a62e69503076ab975b11945312fbbfc137f 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * @OSF_COPYRIGHT@
  */
-/* 
+/*
  * Mach Operating System
  * Copyright (c) 1991,1990,1989 Carnegie Mellon University
  * All Rights Reserved.
- * 
+ *
  * Permission to use, copy, modify and distribute this software and its
  * documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
- * 
+ *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- * 
+ *
  * Carnegie Mellon requests users of this software to return to
- * 
+ *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
- * 
+ *
  * any improvements or extensions that they make and grant Carnegie Mellon
  * the rights to redistribute these changes.
  */
 #ifndef        _I386_ASM_H_
 #define        _I386_ASM_H_
 
-#ifdef _KERNEL
-#include <gprof.h>
-#endif /* _KERNEL */
-
-#if    defined(MACH_KERNEL) || defined(_KERNEL)
-#include <gprof.h>
-#endif /* MACH_KERNEL || _KERNEL */
-
 #if defined(__i386__)
 
 #define S_PC    (%esp)
 #define data16 .byte 0x66
 #define addr16 .byte 0x67
 
-#if !GPROF
 #define MCOUNT
 
-#elif defined(__SHARED__)
+#if defined(__SHARED__)
 #define MCOUNT         ; .data;\
                        .align ALIGN;\
                        LBc(x, 8) .long 0;\
                        Egaddr(%eax,_mcount_ptr);\
                        Gpop;\
                        call *(%eax);
-
-#else  /* !GPROF, !__SHARED__ */
-#define MCOUNT         ; call mcount;
-#endif /* GPROF */
+#endif /* __SHARED__ */
 
 #ifdef __ELF__
 #define ELF_FUNC(x)    .type x,@function
        leaq    (%rsp), %rsi                    ;\
        call    EXT(fn)                         ;\
        mov     (%rsp), %rsp
-       
+
 #define CCALL(fn)                               \
        mov     %rsp, %r12                      ;\
        and     $0xFFFFFFFFFFFFFFF0, %rsp       ;\
index 75ce5c5a94e0882729e9bde8656de18a666a5be6..bd2a0c703ce2e2cec27d48a9f36a933c6ace3ffd 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#ifndef _I386_ATOMIC_H_
-#define _I386_ATOMIC_H_
-
-#include <i386/smp.h>
-
-#if     __SMP__
-
-#define memory_order_consume_smp memory_order_consume
-#define memory_order_acquire_smp memory_order_acquire
-#define memory_order_release_smp memory_order_release
-#define memory_order_acq_rel_smp memory_order_acq_rel
-#define memory_order_seq_cst_smp memory_order_seq_cst
-
-#else
-
-#define memory_order_consume_smp memory_order_relaxed
-#define memory_order_acquire_smp memory_order_relaxed
-#define memory_order_release_smp memory_order_relaxed
-#define memory_order_acq_rel_smp memory_order_relaxed
-#define memory_order_seq_cst_smp memory_order_relaxed
-
+#ifndef _MACHINE_ATOMIC_H
+#error "Do not include <i386/atomic.h> directly, use <machine/atomic.h>"
 #endif
 
-#ifdef ATOMIC_PRIVATE
-
-static inline boolean_t
-atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval,
-    enum memory_order ord, boolean_t wait)
-{
-       (void)wait;
-       return __c11_atomic_compare_exchange_strong((_Atomic uintptr_t *)target, &oldval, newval, ord, memory_order_relaxed);
-}
-
-static inline boolean_t
-atomic_compare_exchange32(uint32_t *target, uint32_t oldval, uint32_t newval,
-    enum memory_order ord, boolean_t wait)
-{
-       (void)wait;
-       return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &oldval, newval, ord, memory_order_relaxed);
-}
+#ifndef _I386_ATOMIC_H_
+#define _I386_ATOMIC_H_
 
-#endif // ATOMIC_PRIVATE
+/* No special configuration for Intel */
 
 #endif // _I386_ATOMIC_H_
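The #error guard enforces an umbrella-header discipline: the machine-specific file is reachable only through the portable wrapper, which defines the guard macro before including it. A sketch of the idiom with hypothetical file names:

/* portable/atomic.h (hypothetical wrapper) */
#ifndef _MACHINE_ATOMIC_H
#define _MACHINE_ATOMIC_H
#include "arch/atomic.h"   /* safe: the guard macro is already defined */
#endif

/* arch/atomic.h (hypothetical machine header) */
#ifndef _MACHINE_ATOMIC_H
#error "Do not include arch/atomic.h directly, use portable/atomic.h"
#endif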
index a1d3b4965cf0ca1a8d860f12baa3d0bbd85ce57c..039a31bb63c4918a5cf27916f45ea52a8fdf27e4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -451,7 +451,17 @@ mach_call_munger(x86_saved_state_t *state)
        int call_number;
        mach_call_t mach_call;
        kern_return_t retval;
-       struct mach_call_args args = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+       struct mach_call_args args = {
+               .arg1 = 0,
+               .arg2 = 0,
+               .arg3 = 0,
+               .arg4 = 0,
+               .arg5 = 0,
+               .arg6 = 0,
+               .arg7 = 0,
+               .arg8 = 0,
+               .arg9 = 0
+       };
        x86_saved_state32_t     *regs;
 
        struct uthread *ut = get_bsdthread_info(current_thread());
@@ -542,7 +552,17 @@ mach_call_munger64(x86_saved_state_t *state)
        int call_number;
        int argc;
        mach_call_t mach_call;
-       struct mach_call_args args = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+       struct mach_call_args args = {
+               .arg1 = 0,
+               .arg2 = 0,
+               .arg3 = 0,
+               .arg4 = 0,
+               .arg5 = 0,
+               .arg6 = 0,
+               .arg7 = 0,
+               .arg8 = 0,
+               .arg9 = 0
+       };
        x86_saved_state64_t     *regs;
 
        struct uthread *ut = get_bsdthread_info(current_thread());
@@ -574,8 +594,7 @@ mach_call_munger64(x86_saved_state_t *state)
        argc = mach_trap_table[call_number].mach_trap_arg_count;
        if (argc) {
                int args_in_regs = MIN(6, argc);
-
-               memcpy(&args.arg1, &regs->rdi, args_in_regs * sizeof(syscall_arg_t));
+               __nochk_memcpy(&args.arg1, &regs->rdi, args_in_regs * sizeof(syscall_arg_t));
 
                if (argc > 6) {
                        int copyin_count;
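The switch to designated initializers is defensive: naming every field keeps the explicit zeroing correct even if the structure's fields are later reordered, and any field left unnamed is still zero-initialized under C rules. Minimal sketch:

#include <assert.h>

struct args_sketch { int a, b, c; };

int
main(void)
{
        /* .c is deliberately omitted: unnamed fields are zero-initialized. */
        struct args_sketch x = { .a = 1, .b = 2 };
        assert(x.c == 0);
        return 0;
}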
index 67f1ea084845c30bae0896c29635ee6cb7d068f0..2c4a40d839e281773e4a1aa89829a48d87f610a5 100644 (file)
@@ -375,6 +375,16 @@ commpage_init_cpu_capabilities( void )
                    CPUID_LEAF7_FEATURE_AVX512IFMA);
                setif(bits, kHasAVX512VBMI, cpuid_leaf7_features() &
                    CPUID_LEAF7_FEATURE_AVX512VBMI);
+               setif(bits, kHasVAES, cpuid_leaf7_features() &
+                   CPUID_LEAF7_FEATURE_VAES);
+               setif(bits, kHasVPCLMULQDQ, cpuid_leaf7_features() &
+                   CPUID_LEAF7_FEATURE_VPCLMULQDQ);
+               setif(bits, kHasAVX512VNNI, cpuid_leaf7_features() &
+                   CPUID_LEAF7_FEATURE_AVX512VNNI);
+               setif(bits, kHasAVX512BITALG, cpuid_leaf7_features() &
+                   CPUID_LEAF7_FEATURE_AVX512BITALG);
+               setif(bits, kHasAVX512VPOPCNTDQ, cpuid_leaf7_features() &
+                   CPUID_LEAF7_FEATURE_AVX512VPCDQ);
        }
 
        uint64_t misc_enable = rdmsr64(MSR_IA32_MISC_ENABLE);
@@ -482,6 +492,7 @@ commpage_stuff_routine(
        commpage_stuff(rd->commpage_address, rd->code_address, rd->code_length);
 }
 
+
 /* Fill in the 32- or 64-bit commpage.  Called once for each.
  */
 
@@ -568,7 +579,7 @@ commpage_populate( void )
            _COMM_PAGE32_BASE_ADDRESS,
            &time_data32,
            &gtod_time_data32,
-           "commpage 32-bit",
+           _COMM_PAGE32_SIGNATURE_STRING,
            VM_PROT_READ);
 #ifndef __LP64__
        pmap_commpage32_init((vm_offset_t) commPagePtr32, _COMM_PAGE32_BASE_ADDRESS,
@@ -584,7 +595,7 @@ commpage_populate( void )
                    _COMM_PAGE32_START_ADDRESS,                     /* commpage addresses are relative to 32-bit commpage placement */
                    &time_data64,
                    &gtod_time_data64,
-                   "commpage 64-bit",
+                   _COMM_PAGE64_SIGNATURE_STRING,
                    VM_PROT_READ);
 #ifndef __LP64__
                pmap_commpage64_init((vm_offset_t) commPagePtr64, _COMM_PAGE64_BASE_ADDRESS,
@@ -883,6 +894,54 @@ commpage_update_atm_diagnostic_config(uint32_t diagnostic_config)
        }
 }
 
+/*
+ * update the commpage with if dtrace user land probes are enabled
+ */
+void
+commpage_update_dof(boolean_t enabled)
+{
+#if CONFIG_DTRACE
+       char *cp;
+
+       cp = commPagePtr32;
+       if (cp) {
+               cp += (_COMM_PAGE_DTRACE_DOF_ENABLED - _COMM_PAGE32_BASE_ADDRESS);
+               *cp = (enabled ? 1 : 0);
+       }
+
+       cp = commPagePtr64;
+       if (cp) {
+               cp += (_COMM_PAGE_DTRACE_DOF_ENABLED - _COMM_PAGE32_START_ADDRESS);
+               *cp = (enabled ? 1 : 0);
+       }
+#else
+       (void)enabled;
+#endif
+}
+
+
+/*
+ * update the dyld global config flags
+ */
+void
+commpage_update_dyld_flags(uint64_t value)
+{
+       char *cp;
+
+       cp = commPagePtr32;
+       if (cp) {
+               cp += (_COMM_PAGE_DYLD_SYSTEM_FLAGS - _COMM_PAGE32_BASE_ADDRESS);
+               *(uint64_t *)cp = value;
+       }
+
+       cp = commPagePtr64;
+       if (cp) {
+               cp += (_COMM_PAGE_DYLD_SYSTEM_FLAGS - _COMM_PAGE32_BASE_ADDRESS);
+               *(uint64_t *)cp = value;
+       }
+}
+
+
 /*
  * update the commpage data for last known value of mach_absolute_time()
  */
index 2dc782686964c3a1d8b88f62cfddd5560fc4d31f..2bf2a41f97ac24af9650d56fa2851e3b03eb40ef 100644 (file)
@@ -157,6 +157,8 @@ extern  void    commpage_update_mach_continuous_time(uint64_t sleeptime);
 extern  void    commpage_update_boottime(uint64_t boottime_usec);
 extern  void    commpage_update_kdebug_state(void);
 extern  void    commpage_update_atm_diagnostic_config(uint32_t);
+extern  void    commpage_update_dof(boolean_t enabled);
+extern  void    commpage_update_dyld_flags(uint64_t value);
 extern  void    commpage_post_ucode_update(void);
 
 extern  uint32_t        commpage_is_in_pfz32(uint32_t);
index 517c229c8e84ae3a71180b9cb686eb6314de9079..bad6b10166e0e4aed4e79136f1a83a29ef6e8e71 100644 (file)
@@ -93,6 +93,7 @@ cpu_sleep(void)
 {
        cpu_data_t      *cdp = current_cpu_datap();
 
+       /* This calls IOCPURunPlatformQuiesceActions when sleeping the boot cpu */
        PE_cpu_machine_quiesce(cdp->cpu_id);
 
        cpu_thread_halt();
index 991bf1b3485cab8a96e6f7c9eab4165fb59ac65e..89f8fc52d4c12f1f0da31f3a5b9a6a02fee63920 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -73,7 +73,6 @@
 #define kHasADX                 0x0000000400000000ULL
 #define kHasMPX                 0x0000001000000000ULL
 #define kHasSGX                 0x0000002000000000ULL
-#if !defined(RC_HIDE_XNU_J137)
 #define kHasAVX512F             0x0000004000000000ULL
 #define kHasAVX512CD            0x0000008000000000ULL
 #define kHasAVX512DQ            0x0000010000000000ULL
 #define kHasAVX512IFMA          0x0000040000000000ULL
 #define kHasAVX512VBMI          0x0000080000000000ULL
 #define kHasAVX512VL            0x0000100000000000ULL
-#endif /* not RC_HIDE_XNU_J137 */
+#define kHasVAES                0x0000200000000000ULL
+#define kHasVPCLMULQDQ          0x0000400000000000ULL
+#define kHasAVX512VNNI          0x0000800000000000ULL
+#define kHasAVX512BITALG        0x0001000000000000ULL
+#define kHasAVX512VPOPCNTDQ     0x0002000000000000ULL
 
 
 #ifndef __ASSEMBLER__
@@ -192,7 +195,7 @@ _NumCPUs( void )
 
 #define _COMM_PAGE_ACTIVE_CPUS          (_COMM_PAGE_START_ADDRESS+0x034)        /* uint8_t number of active CPUs (hw.activecpu) */
 #define _COMM_PAGE_PHYSICAL_CPUS        (_COMM_PAGE_START_ADDRESS+0x035)        /* uint8_t number of physical CPUs (hw.physicalcpu_max) */
-#define _COMM_PAGE_LOGICAL_CPUS (_COMM_PAGE_START_ADDRESS+0x036)        /* uint8_t number of logical CPUs (hw.logicalcpu_max) */
+#define _COMM_PAGE_LOGICAL_CPUS         (_COMM_PAGE_START_ADDRESS+0x036)        /* uint8_t number of logical CPUs (hw.logicalcpu_max) */
 #define _COMM_PAGE_UNUSED1              (_COMM_PAGE_START_ADDRESS+0x037)        /* 1 unused bytes */
 #define _COMM_PAGE_MEMORY_SIZE          (_COMM_PAGE_START_ADDRESS+0x038)        /* uint64_t max memory size */
 
@@ -200,7 +203,8 @@ _NumCPUs( void )
 #define _COMM_PAGE_KDEBUG_ENABLE        (_COMM_PAGE_START_ADDRESS+0x044)        /* uint32_t export "kdebug_enable" to userspace */
 #define _COMM_PAGE_ATM_DIAGNOSTIC_CONFIG        (_COMM_PAGE_START_ADDRESS+0x48) /* uint32_t export "atm_diagnostic_config" to userspace */
 
-#define _COMM_PAGE_UNUSED2              (_COMM_PAGE_START_ADDRESS+0x04C)        /* [0x4C,0x50) unused */
+#define _COMM_PAGE_DTRACE_DOF_ENABLED   (_COMM_PAGE_START_ADDRESS+0x04C)        /* uint8_t 0 if userspace DOF disabled, 1 if enabled */
+#define _COMM_PAGE_UNUSED2              (_COMM_PAGE_START_ADDRESS+0x04D)        /* [0x4D,0x50) unused */
 
 #define _COMM_PAGE_TIME_DATA_START      (_COMM_PAGE_START_ADDRESS+0x050)        /* base of offsets below (_NT_SCALE etc) */
 #define _COMM_PAGE_NT_TSC_BASE          (_COMM_PAGE_START_ADDRESS+0x050)        /* used by nanotime() */
@@ -221,6 +225,9 @@ _NumCPUs( void )
 #define _COMM_PAGE_BOOTTIME_USEC        (_COMM_PAGE_START_ADDRESS+0x0C8)        /* uint64_t boottime */
 #define _COMM_PAGE_NEWTIMEOFDAY_DATA    (_COMM_PAGE_START_ADDRESS+0x0D0)        /* used by gettimeofday(). Currently, sizeof(new_commpage_timeofday_data_t) = 40 */
 
+/* Packed values resume at the next cacheline */
+#define _COMM_PAGE_DYLD_SYSTEM_FLAGS    (_COMM_PAGE_START_ADDRESS+0x100)        /* uint64_t export kern.dyld_system_flags to userspace */
+
 #define _COMM_PAGE_END                  (_COMM_PAGE_START_ADDRESS+0xfff)        /* end of common page */
 
 /* Warning: kernel commpage.h has a matching c typedef for the following.  They must be kept in sync.  */
index da7919cdafd63b5f8a69dcb77cb80b27a20cfacd..a479eaea8fe601497a8a8a7a6fe091bda21c66bd 100644 (file)
@@ -34,6 +34,7 @@
 #define I386_CPU_DATA
 
 #include <mach_assert.h>
+#include <machine/atomic.h>
 
 #include <kern/assert.h>
 #include <kern/kern_types.h>
@@ -436,7 +437,7 @@ get_active_thread_volatile(void)
        CPU_DATA_GET(cpu_active_thread, thread_t)
 }
 
-static inline __pure2 thread_t
+static inline __attribute__((const)) thread_t
 get_active_thread(void)
 {
        CPU_DATA_GET(cpu_active_thread, thread_t)
@@ -630,6 +631,7 @@ disable_preemption_internal(void)
 {
        assert(get_preemption_level() >= 0);
 
+       os_compiler_barrier(release);
 #if defined(__clang__)
        cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL;
        cpu_data->cpu_preemption_level++;
@@ -638,6 +640,7 @@ disable_preemption_internal(void)
             :
             : "i" (offsetof(cpu_data_t, cpu_preemption_level)));
 #endif
+       os_compiler_barrier(acquire);
        pltrace(FALSE);
 }
 
@@ -646,6 +649,7 @@ enable_preemption_internal(void)
 {
        assert(get_preemption_level() > 0);
        pltrace(TRUE);
+       os_compiler_barrier(release);
 #if defined(__clang__)
        cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL;
        if (0 == --cpu_data->cpu_preemption_level) {
@@ -660,6 +664,7 @@ enable_preemption_internal(void)
                         : "i" (offsetof(cpu_data_t, cpu_preemption_level))
                         : "eax", "ecx", "edx", "cc", "memory");
 #endif
+       os_compiler_barrier(acquire);
 }
 
 static inline void
@@ -668,6 +673,7 @@ enable_preemption_no_check(void)
        assert(get_preemption_level() > 0);
 
        pltrace(TRUE);
+       os_compiler_barrier(release);
 #if defined(__clang__)
        cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL;
        cpu_data->cpu_preemption_level--;
@@ -677,6 +683,7 @@ enable_preemption_no_check(void)
                         : "i" (offsetof(cpu_data_t, cpu_preemption_level))
                         : "cc", "memory");
 #endif
+       os_compiler_barrier(acquire);
 }
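The os_compiler_barrier(release)/(acquire) pair brackets the counter update so the compiler can neither sink earlier accesses below the change nor hoist later ones above it; no CPU fence is required because each cpu_preemption_level is only accessed from its own CPU. A portable analogue using C11 signal fences (compiler-only barriers):

#include <stdatomic.h>

static int preemption_level_sketch;   /* stands in for the per-CPU counter */

static inline void
disable_preemption_sketch(void)
{
        atomic_signal_fence(memory_order_release);  /* constrains the compiler only */
        preemption_level_sketch++;
        atomic_signal_fence(memory_order_acquire);
}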
 
 static inline void
index 75da80e5f3cc86ae5de201eb4b54f88faf6a593d..0fafb3aad65d1e2a230a925ac1c93b66c9043830 100644 (file)
@@ -255,8 +255,6 @@ do_cwas(i386_cpu_info_t *cpuinfo, boolean_t on_slave)
         * Workaround for reclaiming perf counter 3 due to TSX memory ordering erratum.
         * This workaround does not support being forcibly set (since an MSR must be
         * enumerated, lest we #GP when forced to access it.)
-        * When RTM_FORCE_FORCE is enabled all RTM transactions on the logical CPU will
-        * forcefully abort, but the general purpose counter 3 will report correct values.
         */
        if (cpuid_wa_required(CPU_INTEL_TSXFA) == CWA_ON) {
                wrmsr64(MSR_IA32_TSX_FORCE_ABORT,
@@ -929,43 +927,46 @@ cpuid_set_info(void)
        }
        /* cpuid_set_cache_info must be invoked after set_generic_info */
 
-       if (info_p->cpuid_cpufamily == CPUFAMILY_INTEL_PENRYN) {
-               cpuid_set_cache_info(info_p);
-       }
-
        /*
         * Find the number of enabled cores and threads
         * (which determines whether SMT/Hyperthreading is active).
         */
-       switch (info_p->cpuid_cpufamily) {
-       case CPUFAMILY_INTEL_PENRYN:
-               info_p->core_count   = info_p->cpuid_cores_per_package;
-               info_p->thread_count = info_p->cpuid_logical_per_package;
-               break;
-       case CPUFAMILY_INTEL_WESTMERE: {
-               uint64_t msr = rdmsr64(MSR_CORE_THREAD_COUNT);
-               info_p->core_count   = bitfield32((uint32_t)msr, 19, 16);
-               info_p->thread_count = bitfield32((uint32_t)msr, 15, 0);
-               break;
-       }
-       default: {
-               uint64_t msr = rdmsr64(MSR_CORE_THREAD_COUNT);
-               if (msr == 0) {
-                       /* Provide a non-zero default for some VMMs */
-                       msr = (1 << 16) + 1;
-               }
-               info_p->core_count   = bitfield32((uint32_t)msr, 31, 16);
-               info_p->thread_count = bitfield32((uint32_t)msr, 15, 0);
-               break;
-       }
-       }
-       if (info_p->core_count == 0) {
-               info_p->core_count   = info_p->cpuid_cores_per_package;
-               info_p->thread_count = info_p->cpuid_logical_per_package;
-       }
 
-       if (info_p->cpuid_cpufamily != CPUFAMILY_INTEL_PENRYN) {
+       if (0 != (info_p->cpuid_features & CPUID_FEATURE_VMM) &&
+           PE_parse_boot_argn("-nomsr35h", NULL, 0)) {
+               info_p->core_count = 1;
+               info_p->thread_count = 1;
                cpuid_set_cache_info(info_p);
+       } else {
+               switch (info_p->cpuid_cpufamily) {
+               case CPUFAMILY_INTEL_PENRYN:
+                       cpuid_set_cache_info(info_p);
+                       info_p->core_count   = info_p->cpuid_cores_per_package;
+                       info_p->thread_count = info_p->cpuid_logical_per_package;
+                       break;
+               case CPUFAMILY_INTEL_WESTMERE: {
+                       uint64_t msr = rdmsr64(MSR_CORE_THREAD_COUNT);
+                       if (0 == msr) {
+                               /* Provide a non-zero default for some VMMs */
+                               msr = (1 << 16) | 1;
+                       }
+                       info_p->core_count   = bitfield32((uint32_t)msr, 19, 16);
+                       info_p->thread_count = bitfield32((uint32_t)msr, 15, 0);
+                       cpuid_set_cache_info(info_p);
+                       break;
+               }
+               default: {
+                       uint64_t msr = rdmsr64(MSR_CORE_THREAD_COUNT);
+                       if (0 == msr) {
+                               /* Provide a non-zero default for some VMMs */
+                               msr = (1 << 16) | 1;
+                       }
+                       info_p->core_count   = bitfield32((uint32_t)msr, 31, 16);
+                       info_p->thread_count = bitfield32((uint32_t)msr, 15, 0);
+                       cpuid_set_cache_info(info_p);
+                       break;
+               }
+               }
        }
 
        DBG("cpuid_set_info():\n");
@@ -1093,7 +1094,11 @@ static struct table {
        {CPUID_LEAF7_FEATURE_OSPKE, "OSPKE"},
        {CPUID_LEAF7_FEATURE_WAITPKG, "WAITPKG"},
        {CPUID_LEAF7_FEATURE_GFNI, "GFNI"},
-       {CPUID_LEAF7_FEATURE_AVX512VPCDQ, "AVX512VPCDQ"},
+       {CPUID_LEAF7_FEATURE_VAES, "VAES"},
+       {CPUID_LEAF7_FEATURE_VPCLMULQDQ, "VPCLMULQDQ"},
+       {CPUID_LEAF7_FEATURE_AVX512VNNI, "AVX512VNNI"},
+       {CPUID_LEAF7_FEATURE_AVX512BITALG, "AVX512BITALG"},
+       {CPUID_LEAF7_FEATURE_AVX512VPCDQ, "AVX512VPOPCNTDQ"},
        {CPUID_LEAF7_FEATURE_RDPID, "RDPID"},
        {CPUID_LEAF7_FEATURE_CLDEMOTE, "CLDEMOTE"},
        {CPUID_LEAF7_FEATURE_MOVDIRI, "MOVDIRI"},
@@ -1104,6 +1109,7 @@ static struct table {
     leaf7_extfeature_map[] = {
        { CPUID_LEAF7_EXTFEATURE_AVX5124VNNIW, "AVX5124VNNIW" },
        { CPUID_LEAF7_EXTFEATURE_AVX5124FMAPS, "AVX5124FMAPS" },
+       { CPUID_LEAF7_EXTFEATURE_FSREPMOV, "FSREPMOV" },
        { CPUID_LEAF7_EXTFEATURE_MDCLEAR, "MDCLEAR" },
        { CPUID_LEAF7_EXTFEATURE_TSXFA, "TSXFA" },
        { CPUID_LEAF7_EXTFEATURE_IBRS, "IBRS" },
diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h
index 3af0e20ef16b590524a10787031f1ba411b0e443..c80308084014e33309c1b99ffd4ec18388994a7b 100644
  * Leaf 7, subleaf 0 additional features.
  * Bits returned in %ebx:%ecx to a CPUID request with {%eax,%ecx} of (0x7,0x0}:
  */
-#define CPUID_LEAF7_FEATURE_RDWRFSGS _Bit(0)    /* FS/GS base read/write */
-#define CPUID_LEAF7_FEATURE_TSCOFF   _Bit(1)    /* TSC thread offset */
-#define CPUID_LEAF7_FEATURE_SGX      _Bit(2)    /* Software Guard eXtensions */
-#define CPUID_LEAF7_FEATURE_BMI1     _Bit(3)    /* Bit Manipulation Instrs, set 1 */
-#define CPUID_LEAF7_FEATURE_HLE      _Bit(4)    /* Hardware Lock Elision*/
-#define CPUID_LEAF7_FEATURE_AVX2     _Bit(5)    /* AVX2 Instructions */
-#define CPUID_LEAF7_FEATURE_FDPEO    _Bit(6)    /* x87 FPU Data Pointer updated only on x87 exceptions */
-#define CPUID_LEAF7_FEATURE_SMEP     _Bit(7)    /* Supervisor Mode Execute Protect */
-#define CPUID_LEAF7_FEATURE_BMI2     _Bit(8)    /* Bit Manipulation Instrs, set 2 */
-#define CPUID_LEAF7_FEATURE_ERMS     _Bit(9)    /* Enhanced Rep Movsb/Stosb */
-#define CPUID_LEAF7_FEATURE_INVPCID  _Bit(10)   /* INVPCID intruction, TDB */
-#define CPUID_LEAF7_FEATURE_RTM      _Bit(11)   /* RTM */
-#define CPUID_LEAF7_FEATURE_PQM      _Bit(12)   /* Platform Qos Monitoring */
-#define CPUID_LEAF7_FEATURE_FPU_CSDS _Bit(13)   /* FPU CS/DS deprecation */
-#define CPUID_LEAF7_FEATURE_MPX      _Bit(14)   /* Memory Protection eXtensions */
-#define CPUID_LEAF7_FEATURE_PQE      _Bit(15)   /* Platform Qos Enforcement */
-#define CPUID_LEAF7_FEATURE_AVX512F  _Bit(16)   /* AVX512F instructions */
-#define CPUID_LEAF7_FEATURE_AVX512DQ _Bit(17)   /* AVX512DQ instructions */
-#define CPUID_LEAF7_FEATURE_RDSEED   _Bit(18)   /* RDSEED Instruction */
-#define CPUID_LEAF7_FEATURE_ADX      _Bit(19)   /* ADX Instructions */
-#define CPUID_LEAF7_FEATURE_SMAP     _Bit(20)   /* Supervisor Mode Access Protect */
-#define CPUID_LEAF7_FEATURE_AVX512IFMA _Bit(21) /* AVX512IFMA instructions */
-#define CPUID_LEAF7_FEATURE_CLFSOPT  _Bit(23)   /* CLFSOPT */
-#define CPUID_LEAF7_FEATURE_CLWB     _Bit(24)   /* CLWB */
-#define CPUID_LEAF7_FEATURE_IPT      _Bit(25)   /* Intel Processor Trace */
-#define CPUID_LEAF7_FEATURE_AVX512CD _Bit(28)   /* AVX512CD instructions */
-#define CPUID_LEAF7_FEATURE_SHA      _Bit(29)   /* SHA instructions */
-#define CPUID_LEAF7_FEATURE_AVX512BW _Bit(30)   /* AVX512BW instructions */
-#define CPUID_LEAF7_FEATURE_AVX512VL _Bit(31)   /* AVX512VL instructions */
-
-#define CPUID_LEAF7_FEATURE_PREFETCHWT1 _HBit(0)/* Prefetch Write/T1 hint */
-#define CPUID_LEAF7_FEATURE_AVX512VBMI  _HBit(1)/* AVX512VBMI instructions */
-#define CPUID_LEAF7_FEATURE_UMIP        _HBit(2) /* User Mode Instruction Prevention */
-#define CPUID_LEAF7_FEATURE_PKU         _HBit(3) /* Protection Keys for Usermode */
-#define CPUID_LEAF7_FEATURE_OSPKE       _HBit(4) /* OS has enabled PKE */
-#define CPUID_LEAF7_FEATURE_WAITPKG     _HBit(5) /* WAITPKG instructions */
-#define CPUID_LEAF7_FEATURE_GFNI        _HBit(8) /* Galois Field New Instructions */
-#define CPUID_LEAF7_FEATURE_AVX512VPCDQ _HBit(14) /* AVX512 VPOPCNTDQ instruction */
-#define CPUID_LEAF7_FEATURE_RDPID       _HBit(22) /* RDPID and IA32_TSC_AUX */
-#define CPUID_LEAF7_FEATURE_CLDEMOTE    _HBit(25) /* Cache line demote */
-#define CPUID_LEAF7_FEATURE_MOVDIRI     _HBit(27) /* MOVDIRI instruction */
-#define CPUID_LEAF7_FEATURE_MOVDIRI64B  _HBit(28) /* MOVDIRI64B instruction */
-#define CPUID_LEAF7_FEATURE_SGXLC       _HBit(30) /* SGX Launch Configuration */
+#define CPUID_LEAF7_FEATURE_RDWRFSGS   _Bit(0)    /* FS/GS base read/write */
+#define CPUID_LEAF7_FEATURE_TSCOFF     _Bit(1)    /* TSC thread offset */
+#define CPUID_LEAF7_FEATURE_SGX        _Bit(2)    /* Software Guard eXtensions */
+#define CPUID_LEAF7_FEATURE_BMI1       _Bit(3)    /* Bit Manipulation Instrs, set 1 */
+#define CPUID_LEAF7_FEATURE_HLE        _Bit(4)    /* Hardware Lock Elision */
+#define CPUID_LEAF7_FEATURE_AVX2       _Bit(5)    /* AVX2 Instructions */
+#define CPUID_LEAF7_FEATURE_FDPEO      _Bit(6)    /* x87 FPU Data Pointer updated only on x87 exceptions */
+#define CPUID_LEAF7_FEATURE_SMEP       _Bit(7)    /* Supervisor Mode Execute Protect */
+#define CPUID_LEAF7_FEATURE_BMI2       _Bit(8)    /* Bit Manipulation Instrs, set 2 */
+#define CPUID_LEAF7_FEATURE_ERMS       _Bit(9)    /* Enhanced Rep Movsb/Stosb */
+#define CPUID_LEAF7_FEATURE_INVPCID    _Bit(10)   /* INVPCID instruction, TBD */
+#define CPUID_LEAF7_FEATURE_RTM        _Bit(11)   /* RTM */
+#define CPUID_LEAF7_FEATURE_PQM        _Bit(12)   /* Platform Qos Monitoring */
+#define CPUID_LEAF7_FEATURE_FPU_CSDS   _Bit(13)   /* FPU CS/DS deprecation */
+#define CPUID_LEAF7_FEATURE_MPX        _Bit(14)   /* Memory Protection eXtensions */
+#define CPUID_LEAF7_FEATURE_PQE        _Bit(15)   /* Platform Qos Enforcement */
+#define CPUID_LEAF7_FEATURE_AVX512F    _Bit(16)   /* AVX512F instructions */
+#define CPUID_LEAF7_FEATURE_AVX512DQ   _Bit(17)   /* AVX512DQ instructions */
+#define CPUID_LEAF7_FEATURE_RDSEED     _Bit(18)   /* RDSEED Instruction */
+#define CPUID_LEAF7_FEATURE_ADX        _Bit(19)   /* ADX Instructions */
+#define CPUID_LEAF7_FEATURE_SMAP       _Bit(20)   /* Supervisor Mode Access Protect */
+#define CPUID_LEAF7_FEATURE_AVX512IFMA _Bit(21)   /* AVX512IFMA instructions */
+#define CPUID_LEAF7_FEATURE_CLFSOPT    _Bit(23)   /* CLFSOPT */
+#define CPUID_LEAF7_FEATURE_CLWB       _Bit(24)   /* CLWB */
+#define CPUID_LEAF7_FEATURE_IPT        _Bit(25)   /* Intel Processor Trace */
+#define CPUID_LEAF7_FEATURE_AVX512CD   _Bit(28)   /* AVX512CD instructions */
+#define CPUID_LEAF7_FEATURE_SHA        _Bit(29)   /* SHA instructions */
+#define CPUID_LEAF7_FEATURE_AVX512BW   _Bit(30)   /* AVX512BW instructions */
+#define CPUID_LEAF7_FEATURE_AVX512VL   _Bit(31)   /* AVX512VL instructions */
+
+#define CPUID_LEAF7_FEATURE_PREFETCHWT1  _HBit(0)  /* Prefetch Write/T1 hint */
+#define CPUID_LEAF7_FEATURE_AVX512VBMI   _HBit(1)  /* AVX512VBMI instructions */
+#define CPUID_LEAF7_FEATURE_UMIP         _HBit(2)  /* User Mode Instruction Prevention */
+#define CPUID_LEAF7_FEATURE_PKU          _HBit(3)  /* Protection Keys for Usermode */
+#define CPUID_LEAF7_FEATURE_OSPKE        _HBit(4)  /* OS has enabled PKE */
+#define CPUID_LEAF7_FEATURE_WAITPKG      _HBit(5)  /* WAITPKG instructions */
+#define CPUID_LEAF7_FEATURE_GFNI         _HBit(8)  /* Galois Field New Instructions */
+#define CPUID_LEAF7_FEATURE_VAES         _HBit(9)  /* Vector-encoded AES */
+#define CPUID_LEAF7_FEATURE_VPCLMULQDQ   _HBit(10) /* Vector Carryless-multiply */
+#define CPUID_LEAF7_FEATURE_AVX512VNNI   _HBit(11) /* AVX512 Vector Neural Net Instructions */
+#define CPUID_LEAF7_FEATURE_AVX512BITALG _HBit(12) /* AVX512 VPOPCNT{B,W} and VPSHUFBITQMB */
+#define CPUID_LEAF7_FEATURE_AVX512VPCDQ  _HBit(14) /* AVX512 VPOPCNTDQ instruction */
+#define CPUID_LEAF7_FEATURE_RDPID        _HBit(22) /* RDPID and IA32_TSC_AUX */
+#define CPUID_LEAF7_FEATURE_CLDEMOTE     _HBit(25) /* Cache line demote */
+#define CPUID_LEAF7_FEATURE_MOVDIRI      _HBit(27) /* MOVDIRI instruction */
+#define CPUID_LEAF7_FEATURE_MOVDIRI64B   _HBit(28) /* MOVDIRI64B instruction */
+#define CPUID_LEAF7_FEATURE_SGXLC        _HBit(30) /* SGX Launch Configuration */
 
 /*
  * Values in EDX returned by CPUID Leaf 7, subleaf 0
  */
 #define CPUID_LEAF7_EXTFEATURE_AVX5124VNNIW     _Bit(2)         /* AVX512_4VNNIW */
 #define CPUID_LEAF7_EXTFEATURE_AVX5124FMAPS     _Bit(3)         /* AVX512_4FMAPS */
+#define CPUID_LEAF7_EXTFEATURE_FSREPMOV         _Bit(4)         /* Fast Short REP MOV */
 #define CPUID_LEAF7_EXTFEATURE_MDCLEAR          _Bit(10)        /* Overloaded VERW / L1D_FLUSH */
 #define CPUID_LEAF7_EXTFEATURE_TSXFA            _Bit(13)        /* TSX RTM_FORCE_ABORT MSR */
 #define CPUID_LEAF7_EXTFEATURE_IBRS             _Bit(26)        /* IBRS / IBPB */
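
The newly reported ECX bits (VAES, VPCLMULQDQ, AVX512VNNI, AVX512BITALG) all come from CPUID leaf 7, subleaf 0, at the _HBit() positions defined above. A userspace sketch of probing them with the GCC/Clang intrinsic header:

#include <cpuid.h>      /* GCC/Clang: __get_cpuid_count() */
#include <stdio.h>

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
		return 1;                              /* leaf 7 not supported */
	}
	printf("VAES:       %u\n", (ecx >> 9) & 1);    /* _HBit(9)  */
	printf("VPCLMULQDQ: %u\n", (ecx >> 10) & 1);   /* _HBit(10) */
	printf("AVX512VNNI: %u\n", (ecx >> 11) & 1);   /* _HBit(11) */
	return 0;
}
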
diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c
index 0ac53d48c30e63f6e7e0e8cf80a044cc92f3730e..82ab4423f91e3a704bfb6c94f1a62f6099d997ba 100644
@@ -259,21 +259,6 @@ vzeroupper(void)
 
 static boolean_t fpu_thread_promote_avx512(thread_t);   /* Forward */
 
-/*
- * Define a wrapper for bcopy to defeat destination size checka.
- * This is needed to treat repeated objects such as
- *     _STRUCT_XMM_REG         fpu_ymmh0;
- *     ...
- *     _STRUCT_XMM_REG         fpu_ymmh7;
- * as an array and to copy like so:
- *     bcopy_nockch(src,&dst->fpu_ymmh0,8*sizeof(_STRUCT_XMM_REG));
- * without the compiler throwing a __builtin__memmove_chk error.
- */
-static inline void
-bcopy_nochk(void *_src, void *_dst, size_t _len)
-{
-       bcopy(_src, _dst, _len);
-}
 
 /*
  * Furthermore, make compile-time asserts that no padding creeps into structures
@@ -878,7 +863,7 @@ Retry:
 
                state->fpu_mxcsr &= mxcsr_capability_mask;
 
-               bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]);
+               __nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]);
 
                switch (thread_xstate(thr_act)) {
                case UNDEFINED_FULL:
@@ -906,9 +891,9 @@ Retry:
                        iavx->_xh.xcomp_bv  = 0;
 
                        if (f == x86_AVX_STATE32) {
-                               bcopy_nochk(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
                        } else if (f == x86_AVX_STATE64) {
-                               bcopy_nochk(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
                        } else {
                                iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
                        }
@@ -932,23 +917,23 @@ Retry:
 
                        switch (f) {
                        case x86_AVX512_STATE32:
-                               bcopy_nochk(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
-                               bcopy_nochk(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
-                               bcopy_nochk(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
+                               __nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
+                               __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
                                DBG_AVX512_STATE(iavx);
                                break;
                        case x86_AVX_STATE32:
-                               bcopy_nochk(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
                                break;
                        case x86_AVX512_STATE64:
-                               bcopy_nochk(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
-                               bcopy_nochk(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
-                               bcopy_nochk(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
-                               bcopy_nochk(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
+                               __nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
+                               __nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
+                               __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
                                DBG_AVX512_STATE(iavx);
                                break;
                        case x86_AVX_STATE64:
-                               bcopy_nochk(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
                                break;
                        }
                        break;
@@ -1024,7 +1009,7 @@ fpu_get_fxstate(
                 * No valid floating-point state.
                 */
 
-               bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw,
+               __nochk_bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw,
                    fp_state_size[FP]);
 
                simple_unlock(&pcb->lock);
@@ -1047,7 +1032,7 @@ fpu_get_fxstate(
                (void)ml_set_interrupts_enabled(intr);
        }
        if (ifps->fp_valid) {
-               bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]);
+               __nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]);
                switch (thread_xstate(thr_act)) {
                case UNDEFINED_FULL:
                case FP_FULL:
@@ -1065,9 +1050,9 @@ fpu_get_fxstate(
                        struct x86_avx_thread_state *iavx = (void *) ifps;
                        x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
                        if (f == x86_AVX_STATE32) {
-                               bcopy_nochk(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
                        } else if (f == x86_AVX_STATE64) {
-                               bcopy_nochk(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
                        }
                        break;
                }
@@ -1081,23 +1066,23 @@ fpu_get_fxstate(
                        } xs = { .ts = tstate };
                        switch (f) {
                        case x86_AVX512_STATE32:
-                               bcopy_nochk(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
-                               bcopy_nochk(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG));
-                               bcopy_nochk(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
+                               __nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG));
+                               __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
                                DBG_AVX512_STATE(iavx);
                                break;
                        case x86_AVX_STATE32:
-                               bcopy_nochk(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
                                break;
                        case x86_AVX512_STATE64:
-                               bcopy_nochk(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
-                               bcopy_nochk(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG));
-                               bcopy_nochk(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG));
-                               bcopy_nochk(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
+                               __nochk_bcopy(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG));
+                               __nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG));
+                               __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
                                DBG_AVX512_STATE(iavx);
                                break;
                        case x86_AVX_STATE64:
-                               bcopy_nochk(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
+                               __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
                                break;
                        }
                        break;
@@ -1163,7 +1148,7 @@ fpu_dup_fxstate(
                if (ifps->fp_valid) {
                        child->machine.ifps = new_ifps;
                        child->machine.xstate = xstate;
-                       bcopy((char *)(ppcb->ifps),
+                       __nochk_bcopy((char *)(ppcb->ifps),
                            (char *)(child->machine.ifps),
                            fp_state_size[xstate]);
 
@@ -1249,7 +1234,7 @@ fpnoextflt(void)
 
        if (pcb->ifps == 0 && !get_interrupt_level()) {
                ifps = fp_state_alloc(xstate);
-               bcopy((char *)&initial_fp_state, (char *)ifps,
+               __nochk_bcopy((char *)&initial_fp_state, (char *)ifps,
                    fp_state_size[xstate]);
                if (!thread_is_64bit_addr(thr_act)) {
                        ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32;
@@ -1555,7 +1540,7 @@ fpu_savearea_promote_avx512(thread_t thread)
 
        /* Allocate an AVX512 savearea and copy AVX state into it */
        if (pcb->xstate != AVX512) {
-               bcopy(ifps, ifps512, fp_state_size[AVX]);
+               __nochk_bcopy(ifps, ifps512, fp_state_size[AVX]);
                pcb->ifps = ifps512;
                pcb->xstate = AVX512;
                ifps512 = NULL;
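
The file-local bcopy_nochk() deleted at the top of this diff is replaced throughout by __nochk_bcopy(), presumably a shared definition with the same purpose. A minimal sketch of that purpose, assuming all it must do is keep fortify-style object-size checking from firing when a run of adjacent, identically-typed struct fields is copied as one array:

#include <string.h>

/* noinline keeps the compiler from re-deriving the destination's object
 * size at the call site and raising a __builtin___memmove_chk error. */
__attribute__((noinline)) static void
nochk_bcopy_sketch(const void *src, void *dst, size_t len)
{
	memmove(dst, src, len);         /* bcopy(s, d, n) == memmove(d, s, n) */
}
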
diff --git a/osfmk/i386/genassym.c b/osfmk/i386/genassym.c
index f2e340ab7e1c96449d3ce1657650e9dbfe27c736..b691ae36b91b0f823143ed80dd65fe85858c1675 100644
@@ -81,7 +81,6 @@
 #include <i386/cpu_capabilities.h>
 #include <i386/cpuid.h>
 #include <i386/pmCPU.h>
-#include <i386/pmap.h>
 #include <mach/i386/vm_param.h>
 #include <mach/i386/thread_status.h>
 #include <machine/commpage.h>
        __asm("DEFINITION__define__" SYM ":\t .ascii \"%0\"" : : "n"  ((u_int)(VAL)))
 
 #define DECLAREULL(SYM, VAL) \
-       __asm("DEFINITION__define__" SYM ":\t .ascii \"%0\"" : : "n"  ((unsigned long long)(VAL)))
+       __asm("DEFINITION__define__" SYM ":\t .ascii \"%0\"" : : "i"  ((unsigned long long)(VAL)))
 
 int     main(
        int             argc,
@@ -153,7 +152,6 @@ main(
        DECLARE("TH_CONTINUATION", offsetof(struct thread, continuation));
        DECLARE("TH_KERNEL_STACK", offsetof(struct thread, kernel_stack));
        DECLARE("TH_MUTEX_COUNT", offsetof(struct thread, mutex_count));
-       DECLARE("TH_WAS_PROMOTED_ON_WAKEUP", offsetof(struct thread, was_promoted_on_wakeup));
        DECLARE("TH_IOTIER_OVERRIDE", offsetof(struct thread, iotier_override));
 
        DECLARE("TH_SYSCALLS_MACH", offsetof(struct thread, syscalls_mach));
diff --git a/osfmk/i386/hibernate_i386.c b/osfmk/i386/hibernate_i386.c
index d88bb1897312998551dbac9b9ade126787a7444a..2c7a177f1f1bdab0a4c361b790cc0190f84d6d9b 100644
@@ -166,6 +166,7 @@ hibernate_page_list_allocate(boolean_t log)
        }
 
        if (num_banks >= MAX_BANKS) {
+               HIBLOG("%s error, num_banks exceeds MAX_BANKS(0x%x)\n", __FUNCTION__, MAX_BANKS);
                return NULL;
        }
 
diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c
index 1e8810b1792e78eb4d89566350b69ad200c68f7f..f1b9f1e1204e8a468346354eec85fb45b787de33 100644
@@ -70,7 +70,6 @@
 #include <kern/startup.h>
 #include <kern/clock.h>
 #include <kern/pms.h>
-#include <kern/xpr.h>
 #include <kern/cpu_data.h>
 #include <kern/processor.h>
 #include <sys/kdebug.h>
@@ -142,7 +141,12 @@ pml4_entry_t            *IdlePML4;
 int                     kernPhysPML4Index;
 int                     kernPhysPML4EntryCount;
 
-int                     allow_64bit_proc_LDT_ops;
+/*
+ * These are 4K mapping page table pages from KPTphys[] that we wound
+ * up not using. They get ml_static_mfree()'d once the VM is initialized.
+ */
+ppnum_t                 released_PT_ppn = 0;
+uint32_t                released_PT_cnt = 0;
 
 char *physfree;
 void idt64_remap(void);
@@ -397,6 +401,109 @@ Idle_PTs_init(void)
        set_cr3_raw((uintptr_t)ID_MAP_VTOP(IdlePML4));
 }
 
+/*
+ * Release any still unused, preallocated boot kernel page tables.
+ * start..end is the VA range currently unused.
+ */
+void
+Idle_PTs_release(vm_offset_t start, vm_offset_t end)
+{
+       uint32_t i;
+       uint32_t index_start;
+       uint32_t index_limit;
+       ppnum_t pn_first;
+       ppnum_t pn;
+       uint32_t cnt;
+
+       /*
+        * Align start to the next large page boundary
+        */
+       start = ((start + I386_LPGMASK) & ~I386_LPGMASK);
+
+       /*
+        * convert start into an index in KPTphys[]
+        */
+       index_start = (uint32_t)((start - KERNEL_BASE) >> PAGE_SHIFT);
+
+       /*
+        * Find the ending index in KPTphys[]
+        */
+       index_limit = (uint32_t)((end - KERNEL_BASE) >> PAGE_SHIFT);
+
+       if (index_limit > NKPT * PTE_PER_PAGE) {
+               index_limit = NKPT * PTE_PER_PAGE;
+       }
+
+       /*
+        * Make sure all the 4K page tables are empty.
+        * If not, panic a development/debug kernel.
+        * On a production kernel, since this would stop us from booting,
+        * just abort the operation.
+        */
+       for (i = index_start; i < index_limit; ++i) {
+               assert(KPTphys[i] == 0);
+               if (KPTphys[i] != 0) {
+                       return;
+               }
+       }
+
+       /*
+        * Now figure out the indices into the 2nd level page tables, IdlePTD[].
+        */
+       index_start >>= PTPGSHIFT;
+       index_limit >>= PTPGSHIFT;
+       if (index_limit > NPGPTD * PTE_PER_PAGE) {
+               index_limit = NPGPTD * PTE_PER_PAGE;
+       }
+
+       if (index_limit <= index_start) {
+               return;
+       }
+
+
+       /*
+        * Now check the pages referenced from Level 2 tables.
+        * They should be contiguous, assert fail if not on development/debug.
+        * In production, just fail the removal to allow the system to boot.
+        */
+       pn_first = 0;
+       cnt = 0;
+       for (i = index_start; i < index_limit; ++i) {
+               assert(IdlePTD[i] != 0);
+               if (IdlePTD[i] == 0) {
+                       return;
+               }
+
+               pn = (ppnum_t)((PG_FRAME & IdlePTD[i]) >> PTSHIFT);
+               if (cnt == 0) {
+                       pn_first = pn;
+               } else {
+                       assert(pn == pn_first + cnt);
+                       if (pn != pn_first + cnt) {
+                               return;
+                       }
+               }
+               ++cnt;
+       }
+
+       /*
+        * Good to go, clear the level 2 entries and invalidate the TLB
+        */
+       for (i = index_start; i < index_limit; ++i) {
+               IdlePTD[i] = 0;
+       }
+       set_cr3_raw(get_cr3_raw());
+
+       /*
+        * Remember these PFNs to be released later in pmap_lowmem_finalize()
+        */
+       released_PT_ppn = pn_first;
+       released_PT_cnt = cnt;
+#if DEVELOPMENT || DEBUG
+       printf("Idle_PTs_release %d pages from PFN 0x%x\n", released_PT_cnt, released_PT_ppn);
+#endif
+}
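
The round-up at the top of Idle_PTs_release() is the usual add-mask-then-clear idiom. A worked check, assuming 2 MiB large pages (I386_LPGMASK == 0x1FFFFF):

#include <assert.h>
#include <stdint.h>

#define I386_LPGBYTES   (2ULL << 20)    /* 2 MiB large page */
#define I386_LPGMASK    (I386_LPGBYTES - 1)

int
main(void)
{
	uint64_t start = 0x301000;      /* example VA offset inside a large page */
	uint64_t up = (start + I386_LPGMASK) & ~I386_LPGMASK;

	assert(up == 0x400000);         /* rounded up to the next 2 MiB boundary */
	return 0;
}
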
+
 extern void vstart_trap_handler;
 
 #define BOOT_TRAP_VECTOR(t)                             \
@@ -485,9 +592,8 @@ vstart(vm_offset_t boot_args_start)
                lphysfree = kernelBootArgs->kaddr + kernelBootArgs->ksize;
                physfree = (void *)(uintptr_t)((lphysfree + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1));
 
-#if DEVELOPMENT || DEBUG
                pal_serial_init();
-#endif
+
                DBG("revision      0x%x\n", kernelBootArgs->Revision);
                DBG("version       0x%x\n", kernelBootArgs->Version);
                DBG("command line  %s\n", kernelBootArgs->CommandLine);
@@ -595,6 +701,14 @@ i386_init(void)
 #endif
 
        master_cpu = 0;
+
+       lck_mod_init();
+
+       /*
+        * Initialize the timer callout world
+        */
+       timer_call_init();
+
        cpu_init();
 
        postcode(CPU_INIT_D);
@@ -613,11 +727,6 @@ i386_init(void)
                dgWork.dgFlags = 0;
        }
 
-       if (!PE_parse_boot_argn("ldt64", &allow_64bit_proc_LDT_ops,
-           sizeof(allow_64bit_proc_LDT_ops))) {
-               allow_64bit_proc_LDT_ops = 0;
-       }
-
        serialmode = 0;
        if (PE_parse_boot_argn("serial", &serialmode, sizeof(serialmode))) {
                /* We want a serial keyboard and/or console */
@@ -696,6 +805,11 @@ i386_init(void)
 
        kernel_debug_string_early("power_management_init");
        power_management_init();
+
+#if MONOTONIC
+       mt_cpu_up(cpu_datap(0));
+#endif /* MONOTONIC */
+
        processor_bootstrap();
        thread_bootstrap();
 
@@ -705,7 +819,7 @@ i386_init(void)
        pstate_trace();
 }
 
-static void
+static void __dead2
 do_init_slave(boolean_t fast_restart)
 {
        void    *init_param     = FULL_SLAVE_INIT;
@@ -761,6 +875,12 @@ do_init_slave(boolean_t fast_restart)
        cpu_thread_init();      /* not strictly necessary */
 
        cpu_init();     /* Sets cpu_running which starter cpu waits for */
+
+
+#if MONOTONIC
+       mt_cpu_up(current_cpu_datap());
+#endif /* MONOTONIC */
+
        slave_main(init_param);
 
        panic("do_init_slave() returned from slave_main()");
@@ -861,7 +981,8 @@ doublemap_init(uint8_t randL3)
         */
 
        dblmap_dist = dblmap_base - hdescb;
-       idt64_hndl_table0[1] = DBLMAP(idt64_hndl_table0[1]);
+       idt64_hndl_table0[1] = DBLMAP(idt64_hndl_table0[1]);    /* 64-bit exit trampoline */
+       idt64_hndl_table0[3] = DBLMAP(idt64_hndl_table0[3]);    /* 32-bit exit trampoline */
        idt64_hndl_table0[6] = (uint64_t)(uintptr_t)&kernel_stack_mask;
 
        extern cpu_data_t cpshadows[], scdatas[];
diff --git a/osfmk/i386/i386_vm_init.c b/osfmk/i386/i386_vm_init.c
index 63a1b46ef811b30bec746464d7664e3e08339f6d..a4edd4259ad5e2ec4e706f56e24df2e125709ef2 100644
@@ -77,6 +77,7 @@
 #include <mach/thread_status.h>
 #include <pexpert/i386/efi.h>
 #include <i386/i386_lowmem.h>
+#include <i386/misc_protos.h>
 #include <x86_64/lowglobals.h>
 #include <i386/pal_routines.h>
 
@@ -119,9 +120,19 @@ vm_offset_t vm_kernel_builtinkmod_text_end;
 #define MAXLORESERVE    (32 * 1024 * 1024)
 
 ppnum_t         max_ppnum = 0;
-ppnum_t         lowest_lo = 0;
-ppnum_t         lowest_hi = 0;
-ppnum_t         highest_hi = 0;
+
+/*
+ * pmap_high_used* are the highest range of physical memory used for kernel
+ * internals (page tables, vm_pages) via pmap_steal_memory() that don't
+ * need to be encrypted in hibernation images. There can be one gap in
+ * the middle of this due to fragmentation when using a mix of small
+ * and large pages.  In that case, the fragment lives between the high
+ * and middle ranges.
+ */
+ppnum_t pmap_high_used_top = 0;
+ppnum_t pmap_high_used_bottom = 0;
+ppnum_t pmap_middle_used_top = 0;
+ppnum_t pmap_middle_used_bottom = 0;
 
 enum {PMAP_MAX_RESERVED_RANGES = 32};
 uint32_t pmap_reserved_pages_allocated = 0;
@@ -168,6 +179,12 @@ uint64_t firmware_MMIO_bytes;
  */
 extern void     *last_kernel_symbol;
 
+#define LG_PPNUM_PAGES (I386_LPGBYTES >> PAGE_SHIFT)
+#define LG_PPNUM_MASK (I386_LPGMASK >> PAGE_SHIFT)
+
+/* set so no region large page fragment pages exist */
+#define RESET_FRAG(r) (((r)->alloc_frag_up = 1), ((r)->alloc_frag_down = 0))
+
 boolean_t       memmap = FALSE;
 #if     DEBUG || DEVELOPMENT
 static void
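
The new helper macros convert the large-page constants into page-number units. A quick standalone check of the arithmetic, assuming 4 KiB base pages (PAGE_SHIFT == 12) and 2 MiB large pages:

#include <assert.h>

#define PAGE_SHIFT      12
#define I386_LPGBYTES   (2u * 1024 * 1024)
#define I386_LPGMASK    (I386_LPGBYTES - 1)
#define LG_PPNUM_PAGES  (I386_LPGBYTES >> PAGE_SHIFT)   /* 4K pages per large page */
#define LG_PPNUM_MASK   (I386_LPGMASK >> PAGE_SHIFT)

int
main(void)
{
	assert(LG_PPNUM_PAGES == 512);
	assert(LG_PPNUM_MASK == 0x1ff);
	return 0;
}
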
@@ -181,11 +198,14 @@ kprint_memmap(vm_offset_t maddr, unsigned int msize, unsigned int mcount)
        addr64_t             efi_start, efi_end;
 
        for (j = 0; j < pmap_memory_region_count; j++, p++) {
-               kprintf("pmap region %d type %d base 0x%llx alloc_up 0x%llx alloc_down 0x%llx top 0x%llx\n",
+               kprintf("pmap region %d type %d base 0x%llx alloc_up 0x%llx alloc_down 0x%llx"
+                   " alloc_frag_up 0x%llx alloc_frag_down 0x%llx top 0x%llx\n",
                    j, p->type,
                    (addr64_t) p->base << I386_PGSHIFT,
                    (addr64_t) p->alloc_up << I386_PGSHIFT,
                    (addr64_t) p->alloc_down << I386_PGSHIFT,
+                   (addr64_t) p->alloc_frag_up << I386_PGSHIFT,
+                   (addr64_t) p->alloc_frag_down << I386_PGSHIFT,
                    (addr64_t) p->end   << I386_PGSHIFT);
                region_start = (addr64_t) p->base << I386_PGSHIFT;
                region_end = ((addr64_t) p->end << I386_PGSHIFT) - 1;
@@ -314,7 +334,7 @@ i386_vm_init(uint64_t   maxmem,
        segDATA = getsegbynamefromheader(&_mh_execute_header,
            "__DATA");
        segCONST = getsegbynamefromheader(&_mh_execute_header,
-           "__CONST");
+           "__DATA_CONST");
        cursectTEXT = lastsectTEXT = firstsect(segTEXT);
        /* Discover the last TEXT section within the TEXT segment */
        while ((cursectTEXT = nextsect(segTEXT, cursectTEXT)) != NULL) {
@@ -554,6 +574,7 @@ i386_vm_init(uint64_t   maxmem,
                                    (top < vm_kernel_base_page)) {
                                        pmptr->alloc_up = pmptr->base;
                                        pmptr->alloc_down = pmptr->end;
+                                       RESET_FRAG(pmptr);
                                        pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
                                } else {
                                        /*
@@ -561,6 +582,7 @@ i386_vm_init(uint64_t   maxmem,
                                         */
                                        pmptr->alloc_up = top + 1;
                                        pmptr->alloc_down = top;
+                                       RESET_FRAG(pmptr);
                                }
                                pmptr->type = pmap_type;
                                pmptr->attribute = mptr->Attribute;
@@ -574,6 +596,7 @@ i386_vm_init(uint64_t   maxmem,
                                pmptr->end = (fap - 1);
                                pmptr->alloc_up = pmptr->end + 1;
                                pmptr->alloc_down = pmptr->end;
+                               RESET_FRAG(pmptr);
                                pmptr->type = pmap_type;
                                pmptr->attribute = mptr->Attribute;
                                /*
@@ -587,6 +610,7 @@ i386_vm_init(uint64_t   maxmem,
                                pmptr->type = pmap_type;
                                pmptr->attribute = mptr->Attribute;
                                pmptr->alloc_down = pmptr->end = top;
+                               RESET_FRAG(pmptr);
 
                                if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
                                        pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
@@ -599,6 +623,7 @@ i386_vm_init(uint64_t   maxmem,
                                pmptr->type = pmap_type;
                                pmptr->attribute = mptr->Attribute;
                                pmptr->alloc_down = pmptr->end = top;
+                               RESET_FRAG(pmptr);
                                if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
                                        pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
                                }
@@ -621,6 +646,7 @@ i386_vm_init(uint64_t   maxmem,
                            (pmptr->base == (prev_pmptr->end + 1))) {
                                prev_pmptr->end = pmptr->end;
                                prev_pmptr->alloc_down = pmptr->alloc_down;
+                               RESET_FRAG(pmptr);
                        } else {
                                pmap_memory_region_count++;
                                prev_pmptr = pmptr;
@@ -692,6 +718,7 @@ i386_vm_init(uint64_t   maxmem,
                        if (pages_to_use == 0) {
                                pmap_memory_regions[cur_region].end = cur_end;
                                pmap_memory_regions[cur_region].alloc_down = cur_end;
+                               RESET_FRAG(&pmap_memory_regions[cur_region]);
                        }
 
                        cur_region++;
@@ -772,113 +799,220 @@ pmap_free_pages(void)
        return (unsigned int)avail_remaining;
 }
 
-
 boolean_t pmap_next_page_reserved(ppnum_t *);
 
 /*
  * Pick a page from a "kernel private" reserved range; works around
- * errata on some hardware.
+ * errata on some hardware. EFI marks pages which can't be used for
+ * certain kinds of I/O-ish activities as reserved. We reserve them for
+ * kernel internal usage and prevent them from ever going onto the
+ * regular free list.
  */
 boolean_t
-pmap_next_page_reserved(ppnum_t *pn)
+pmap_next_page_reserved(
+       ppnum_t              *pn)
 {
+       uint32_t             n;
+       pmap_memory_region_t *region;
+       uint32_t             reserved_index;
+
        if (pmap_reserved_ranges) {
-               uint32_t n;
-               pmap_memory_region_t *region;
                for (n = 0; n < pmap_last_reserved_range_index; n++) {
-                       uint32_t reserved_index = pmap_reserved_range_indices[n];
+                       reserved_index = pmap_reserved_range_indices[n];
                        region = &pmap_memory_regions[reserved_index];
                        if (region->alloc_up <= region->alloc_down) {
                                *pn = region->alloc_up++;
-                               avail_remaining--;
-
-                               if (*pn > max_ppnum) {
-                                       max_ppnum = *pn;
-                               }
+                       } else if (region->alloc_frag_up <= region->alloc_frag_down) {
+                               *pn = region->alloc_frag_up++;
+                       } else {
+                               continue;
+                       }
+                       avail_remaining--;
 
-                               if (lowest_lo == 0 || *pn < lowest_lo) {
-                                       lowest_lo = *pn;
-                               }
+                       if (*pn > max_ppnum) {
+                               max_ppnum = *pn;
+                       }
 
-                               pmap_reserved_pages_allocated++;
+                       pmap_reserved_pages_allocated++;
 #if DEBUG
-                               if (region->alloc_up > region->alloc_down) {
-                                       kprintf("Exhausted reserved range index: %u, base: 0x%x end: 0x%x, type: 0x%x, attribute: 0x%llx\n", reserved_index, region->base, region->end, region->type, region->attribute);
-                               }
-#endif
-                               return TRUE;
+                       if (region->alloc_up > region->alloc_down) {
+                               kprintf("Exhausted reserved range index: %u, base: 0x%x end: 0x%x, type: 0x%x, attribute: 0x%llx\n", reserved_index, region->base, region->end, region->type, region->attribute);
                        }
+#endif
+                       return TRUE;
                }
        }
        return FALSE;
 }
 
+/*
+ * Return the highest large page available. Fails once there are no more large pages.
+ */
+kern_return_t
+pmap_next_page_large(
+       ppnum_t              *pn)
+{
+       int                  r;
+       pmap_memory_region_t *region;
+       ppnum_t              frag_start;
+       ppnum_t              lgpg;
+
+       if (avail_remaining < LG_PPNUM_PAGES) {
+               return KERN_FAILURE;
+       }
+
+       for (r = pmap_memory_region_count - 1; r >= 0; r--) {
+               region = &pmap_memory_regions[r];
+
+               /*
+                * First check if there is enough memory.
+                */
+               if (region->alloc_down < region->alloc_up ||
+                   (region->alloc_down - region->alloc_up + 1) < LG_PPNUM_PAGES) {
+                       continue;
+               }
+
+               /*
+                * Find the starting large page, creating a fragment if needed.
+                */
+               if ((region->alloc_down & LG_PPNUM_MASK) == LG_PPNUM_MASK) {
+                       lgpg = (region->alloc_down & ~LG_PPNUM_MASK);
+               } else {
+                       /* Can only have 1 fragment per region at a time */
+                       if (region->alloc_frag_up <= region->alloc_frag_down) {
+                               continue;
+                       }
+
+                       /* Check for enough room below any fragment. */
+                       frag_start = (region->alloc_down & ~LG_PPNUM_MASK);
+                       if (frag_start < region->alloc_up ||
+                           frag_start - region->alloc_up < LG_PPNUM_PAGES) {
+                               continue;
+                       }
+
+                       lgpg = frag_start - LG_PPNUM_PAGES;
+                       region->alloc_frag_up = frag_start;
+                       region->alloc_frag_down = region->alloc_down;
+               }
+
+               *pn = lgpg;
+               region->alloc_down = lgpg - 1;
+
+
+               avail_remaining -= LG_PPNUM_PAGES;
+               if (*pn + LG_PPNUM_MASK > max_ppnum) {
+                       max_ppnum = *pn + LG_PPNUM_MASK;
+               }
+
+               return KERN_SUCCESS;
+       }
+       return KERN_FAILURE;
+}
 
 boolean_t
 pmap_next_page_hi(
-       ppnum_t *pn)
+       ppnum_t              *pn,
+       boolean_t            might_free)
 {
        pmap_memory_region_t *region;
-       int     n;
+       int                  n;
 
-       if (pmap_next_page_reserved(pn)) {
+       if (!might_free && pmap_next_page_reserved(pn)) {
                return TRUE;
        }
 
        if (avail_remaining) {
                for (n = pmap_memory_region_count - 1; n >= 0; n--) {
                        region = &pmap_memory_regions[n];
-
-                       if (region->alloc_down >= region->alloc_up) {
+                       if (region->alloc_frag_up <= region->alloc_frag_down) {
+                               *pn = region->alloc_frag_down--;
+                       } else if (region->alloc_down >= region->alloc_up) {
                                *pn = region->alloc_down--;
-                               avail_remaining--;
-
-                               if (*pn > max_ppnum) {
-                                       max_ppnum = *pn;
-                               }
-
-                               if (lowest_lo == 0 || *pn < lowest_lo) {
-                                       lowest_lo = *pn;
-                               }
-
-                               if (lowest_hi == 0 || *pn < lowest_hi) {
-                                       lowest_hi = *pn;
-                               }
+                       } else {
+                               continue;
+                       }
 
-                               if (*pn > highest_hi) {
-                                       highest_hi = *pn;
-                               }
+                       avail_remaining--;
 
-                               return TRUE;
+                       if (*pn > max_ppnum) {
+                               max_ppnum = *pn;
                        }
+
+                       return TRUE;
                }
        }
        return FALSE;
 }
 
+/*
+ * Record which high pages have been allocated so far,
+ * so that pmap_init() can mark them PMAP_NOENCRYPT, which
+ * makes hibernation faster.
+ *
+ * Because of the code in pmap_next_page_large(), we could
+ * theoretically have fragments in several regions.
+ * In practice that just doesn't happen. The last pmap region
+ * is normally the largest and will satisfy all pmap_next_hi/large()
+ * allocations. Since this information is used as an optimization
+ * and it's ok to be conservative, we'll just record the information
+ * for the final region.
+ */
+void
+pmap_hi_pages_done(void)
+{
+       pmap_memory_region_t *r;
+
+       r = &pmap_memory_regions[pmap_memory_region_count - 1];
+       pmap_high_used_top = r->end;
+       if (r->alloc_frag_up <= r->alloc_frag_down) {
+               pmap_high_used_bottom = r->alloc_frag_down + 1;
+               pmap_middle_used_top = r->alloc_frag_up - 1;
+               if (r->alloc_up <= r->alloc_down) {
+                       pmap_middle_used_bottom = r->alloc_down + 1;
+               } else {
+                       pmap_high_used_bottom = r->base;
+               }
+       } else {
+               if (r->alloc_up <= r->alloc_down) {
+                       pmap_high_used_bottom = r->alloc_down + 1;
+               } else {
+                       pmap_high_used_bottom = r->base;
+               }
+       }
+#if     DEBUG || DEVELOPMENT
+       kprintf("pmap_high_used_top      0x%x\n", pmap_high_used_top);
+       kprintf("pmap_high_used_bottom   0x%x\n", pmap_high_used_bottom);
+       kprintf("pmap_middle_used_top    0x%x\n", pmap_middle_used_top);
+       kprintf("pmap_middle_used_bottom 0x%x\n", pmap_middle_used_bottom);
+#endif
+}
 
+/*
+ * Return the next available page from lowest memory for general use.
+ */
 boolean_t
 pmap_next_page(
-       ppnum_t *pn)
+       ppnum_t              *pn)
 {
+       pmap_memory_region_t *region;
+
        if (avail_remaining) {
                while (pmap_memory_region_current < pmap_memory_region_count) {
-                       if (pmap_memory_regions[pmap_memory_region_current].alloc_up >
-                           pmap_memory_regions[pmap_memory_region_current].alloc_down) {
+                       region = &pmap_memory_regions[pmap_memory_region_current];
+                       if (region->alloc_up <= region->alloc_down) {
+                               *pn = region->alloc_up++;
+                       } else if (region->alloc_frag_up <= region->alloc_frag_down) {
+                               *pn = region->alloc_frag_up++;
+                       } else {
                                pmap_memory_region_current++;
                                continue;
                        }
-                       *pn = pmap_memory_regions[pmap_memory_region_current].alloc_up++;
                        avail_remaining--;
 
                        if (*pn > max_ppnum) {
                                max_ppnum = *pn;
                        }
 
-                       if (lowest_lo == 0 || *pn < lowest_lo) {
-                               lowest_lo = *pn;
-                       }
-
                        return TRUE;
                }
        }
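
After this rework each pmap region effectively carries two windows: the main [alloc_up, alloc_down] range consumed from both ends, plus at most one [alloc_frag_up, alloc_frag_down] fragment left behind when pmap_next_page_large() carved an aligned large page out of the top. A hedged model of the low-end consumer; field names mirror the diff, but the struct itself is hypothetical:

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t ppnum_t;

struct region {                                 /* stand-in for pmap_memory_region_t */
	ppnum_t alloc_up, alloc_down;           /* main window, non-empty while up <= down */
	ppnum_t alloc_frag_up, alloc_frag_down; /* optional fragment, same convention */
};

/* Hand out the next low page: main window first, then the fragment,
 * matching the order in the reworked pmap_next_page(). */
static bool
next_page_low(struct region *r, ppnum_t *pn)
{
	if (r->alloc_up <= r->alloc_down) {
		*pn = r->alloc_up++;
		return true;
	}
	if (r->alloc_frag_up <= r->alloc_frag_down) {
		*pn = r->alloc_frag_up++;
		return true;
	}
	return false;
}
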
diff --git a/osfmk/i386/lapic.h b/osfmk/i386/lapic.h
index 9a046a2f58b68b0a16090cf71b4323c19eb6da23..9f06f52bceeb1c69893d66cdd0466f9237fbe2ab 100644
@@ -183,11 +183,11 @@ typedef struct {
 extern  lapic_ops_table_t *lapic_ops;
 
 #define LAPIC_INIT()                    lapic_ops->init();
-#define LAPIC_WRITE(reg, val)            lapic_ops->write(reg, val)
+#define LAPIC_WRITE(reg, val)           lapic_ops->write(reg, val)
 #define LAPIC_READ(reg)                 lapic_ops->read(reg)
-#define LAPIC_READ_OFFSET(reg, off)      LAPIC_READ((reg)+(off))
+#define LAPIC_READ_OFFSET(reg, off)     LAPIC_READ((lapic_register_t)((reg)+(off)))
 #define LAPIC_READ_ICR()                lapic_ops->read_icr()
-#define LAPIC_WRITE_ICR(dst, cmd)        lapic_ops->write_icr(dst, cmd)
+#define LAPIC_WRITE_ICR(dst, cmd)       lapic_ops->write_icr(dst, cmd)
 
 typedef enum {
        periodic,
diff --git a/osfmk/i386/locks.h b/osfmk/i386/locks.h
index 21e74d712d7f926764daa3fb0cf071f8efbc52b4..e553bc4a0098bffcb26dca722afe3f47c8051ce4 100644
@@ -81,10 +81,10 @@ typedef struct _lck_mtx_ {
                                struct {
                                        volatile uint32_t
                                            lck_mtx_waiters:16,
-                                           lck_mtx_pri:8,
+                                           lck_mtx_pri:8, // unused
                                            lck_mtx_ilocked:1,
                                            lck_mtx_mlocked:1,
-                                           lck_mtx_promoted:1,
+                                           lck_mtx_promoted:1, // unused
                                            lck_mtx_spin:1,
                                            lck_mtx_is_ext:1,
                                            lck_mtx_pad3:3;
@@ -107,7 +107,6 @@ typedef struct _lck_mtx_ {
 #define LCK_MTX_PRIORITY_MSK            0x00ff0000
 #define LCK_MTX_ILOCKED_MSK             0x01000000
 #define LCK_MTX_MLOCKED_MSK             0x02000000
-#define LCK_MTX_PROMOTED_MSK            0x04000000
 #define LCK_MTX_SPIN_MSK                0x08000000
 
 /* This pattern must subsume the interlocked, mlocked and spin bits */
@@ -124,7 +123,8 @@ typedef enum lck_mtx_spinwait_ret_type {
 } lck_mtx_spinwait_ret_type_t;
 
 extern lck_mtx_spinwait_ret_type_t              lck_mtx_lock_spinwait_x86(lck_mtx_t *mutex);
-extern void                                     lck_mtx_lock_wait_x86(lck_mtx_t *mutex);
+struct turnstile;
+extern void                                     lck_mtx_lock_wait_x86(lck_mtx_t *mutex, struct turnstile **ts);
 extern void                                     lck_mtx_lock_acquire_x86(lck_mtx_t *mutex);
 
 extern void                                     lck_mtx_lock_slow(lck_mtx_t *lock);
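
With the promoted bit retired, the live layout counts from the low-order field: waiters occupy bits 15:0, the unused pri byte bits 23:16, then ilocked at bit 24, mlocked at 25, the now maskless promoted at 26, and spin at 27. A standalone cross-check of the surviving masks:

#include <assert.h>

#define LCK_MTX_ILOCKED_MSK     0x01000000
#define LCK_MTX_MLOCKED_MSK     0x02000000
#define LCK_MTX_SPIN_MSK        0x08000000

int
main(void)
{
	assert(LCK_MTX_ILOCKED_MSK == 1u << 24);
	assert(LCK_MTX_MLOCKED_MSK == 1u << 25);
	assert(LCK_MTX_SPIN_MSK    == 1u << 27);   /* bit 26 was PROMOTED */
	return 0;
}
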
diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c
index 5f693ff515312a7cfad3019412468b2de8082e63..c5b0d303748e16513487e649c2403d71f5df1b48 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -61,7 +61,6 @@
  *     Locking primitives implementation
  */
 
-#define ATOMIC_PRIVATE 1
 #define LOCK_PRIVATE 1
 
 #include <mach_ldebug.h>
@@ -75,7 +74,6 @@
 #include <kern/cpu_data.h>
 #include <kern/cpu_number.h>
 #include <kern/sched_prim.h>
-#include <kern/xpr.h>
 #include <kern/debug.h>
 #include <string.h>
 
 #include <sys/kdebug.h>
 #include <i386/locks_i386_inlines.h>
 
-#if     CONFIG_DTRACE
-#define DTRACE_RW_SHARED        0x0     //reader
-#define DTRACE_RW_EXCL          0x1     //writer
-#define DTRACE_NO_FLAG          0x0     //not applicable
+#if    CONFIG_DTRACE
+#define DTRACE_RW_SHARED       0x0     //reader
+#define DTRACE_RW_EXCL         0x1     //writer
+#define DTRACE_NO_FLAG         0x0     //not applicable
 #endif /* CONFIG_DTRACE */
 
-#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
-#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
-#define LCK_RW_LCK_SHARED_CODE          0x102
-#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
-#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
-#define LCK_RW_LCK_EX_TO_SH_CODE        0x105
+#define        LCK_RW_LCK_EXCLUSIVE_CODE       0x100
+#define        LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
+#define        LCK_RW_LCK_SHARED_CODE          0x102
+#define        LCK_RW_LCK_SH_TO_EX_CODE        0x103
+#define        LCK_RW_LCK_SH_TO_EX1_CODE       0x104
+#define        LCK_RW_LCK_EX_TO_SH_CODE        0x105
 
-#define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
-#define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
-#define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
-#define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
-#define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
-#define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
-#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
-#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
+#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
+#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
+#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
+#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
+#define LCK_RW_LCK_SHARED_SPIN_CODE    0x110
+#define LCK_RW_LCK_SHARED_WAIT_CODE    0x111
+#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE  0x112
+#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE  0x113
 
 
-#define ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
+#define        ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
 
-unsigned int LcksOpts = 0;
+unsigned int LcksOpts=0;
 
 #if DEVELOPMENT || DEBUG
 unsigned int LckDisablePreemptCheck = 0;
@@ -120,15 +118,15 @@ unsigned int LckDisablePreemptCheck = 0;
 
 /* Forwards */
 
-#if     USLOCK_DEBUG
+#if    USLOCK_DEBUG
 /*
  *     Perform simple lock checks.
  */
-int     uslock_check = 1;
-int     max_lock_loops  = 100000000;
-decl_simple_lock_data(extern, printf_lock)
-decl_simple_lock_data(extern, panic_lock)
-#endif  /* USLOCK_DEBUG */
+int    uslock_check = 1;
+int    max_lock_loops  = 100000000;
+decl_simple_lock_data(extern , printf_lock);
+decl_simple_lock_data(extern , panic_lock);
+#endif /* USLOCK_DEBUG */
 
 extern unsigned int not_in_kdp;
 
@@ -137,23 +135,23 @@ extern unsigned int not_in_kdp;
  *     of the various lock routines.  However, this information
  *     is only used for debugging and statistics.
  */
-typedef void    *pc_t;
-#define INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)
-#define INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)
-#if     ANY_LOCK_DEBUG
-#define OBTAIN_PC(pc)   ((pc) = GET_RETURN_PC())
-#define DECL_PC(pc)     pc_t pc;
-#else   /* ANY_LOCK_DEBUG */
+typedef void   *pc_t;
+#define        INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)
+#define        INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)
+#if    ANY_LOCK_DEBUG
+#define        OBTAIN_PC(pc)   ((pc) = GET_RETURN_PC())
+#define DECL_PC(pc)    pc_t pc;
+#else  /* ANY_LOCK_DEBUG */
 #define DECL_PC(pc)
-#ifdef  lint
+#ifdef lint
 /*
  *     Eliminate lint complaints about unused local pc variables.
  */
-#define OBTAIN_PC(pc)   ++pc
-#else   /* lint */
-#define OBTAIN_PC(pc)
-#endif  /* lint */
-#endif  /* USLOCK_DEBUG */
+#define        OBTAIN_PC(pc)   ++pc
+#else  /* lint */
+#define        OBTAIN_PC(pc)
+#endif /* lint */
+#endif /* USLOCK_DEBUG */
 
 /*
  * atomic exchange API is a low level abstraction of the operations
@@ -168,10 +166,10 @@ typedef void    *pc_t;
 static uint32_t
 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
 {
-       uint32_t        val;
+       uint32_t        val;
 
-       (void)ord;                      // Memory order not used
-       val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed);
+       (void)ord;                      // Memory order not used
+       val = os_atomic_load(target, relaxed);
        *previous = val;
        return val;
 }
@@ -183,48 +181,50 @@ atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval,
 }
 
 static void
-atomic_exchange_abort(void)
-{
-}
+atomic_exchange_abort(void) { }
 
 static boolean_t
 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
 {
-       uint32_t        value, prev;
+       uint32_t        value, prev;
 
-       for (;;) {
+       for ( ; ; ) {
                value = atomic_exchange_begin32(target, &prev, ord);
                if (value & test_mask) {
-                       if (wait) {
+                       if (wait)
                                cpu_pause();
-                       } else {
+                       else
                                atomic_exchange_abort();
-                       }
                        return FALSE;
                }
                value |= set_mask;
-               if (atomic_exchange_complete32(target, prev, value, ord)) {
+               if (atomic_exchange_complete32(target, prev, value, ord))
                        return TRUE;
-               }
        }
 }
 
+inline boolean_t
+hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
+{
+       return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
+}
+
 /*
  *     Portable lock package implementation of usimple_locks.
  */
 
-#if     USLOCK_DEBUG
-#define USLDBG(stmt)    stmt
-void            usld_lock_init(usimple_lock_t, unsigned short);
-void            usld_lock_pre(usimple_lock_t, pc_t);
-void            usld_lock_post(usimple_lock_t, pc_t);
-void            usld_unlock(usimple_lock_t, pc_t);
-void            usld_lock_try_pre(usimple_lock_t, pc_t);
-void            usld_lock_try_post(usimple_lock_t, pc_t);
-int             usld_lock_common_checks(usimple_lock_t, char *);
-#else   /* USLOCK_DEBUG */
-#define USLDBG(stmt)
-#endif  /* USLOCK_DEBUG */
+#if    USLOCK_DEBUG
+#define        USLDBG(stmt)    stmt
+void           usld_lock_init(usimple_lock_t, unsigned short);
+void           usld_lock_pre(usimple_lock_t, pc_t);
+void           usld_lock_post(usimple_lock_t, pc_t);
+void           usld_unlock(usimple_lock_t, pc_t);
+void           usld_lock_try_pre(usimple_lock_t, pc_t);
+void           usld_lock_try_post(usimple_lock_t, pc_t);
+int            usld_lock_common_checks(usimple_lock_t, char *);
+#else  /* USLOCK_DEBUG */
+#define        USLDBG(stmt)
+#endif /* USLOCK_DEBUG */
 
 /*
  * Forward definitions
@@ -240,7 +240,7 @@ void lck_rw_clear_promotions_x86(thread_t thread);
 static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
 static boolean_t lck_rw_grab_want(lck_rw_t *lock);
 static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
-static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, int prior_lock_state, boolean_t indirect);
+static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect);
 static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
 static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
 static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state);
@@ -254,16 +254,15 @@ static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint3
  */
 lck_spin_t *
 lck_spin_alloc_init(
-       lck_grp_t       *grp,
-       lck_attr_t      *attr)
+       lck_grp_t       *grp,
+       lck_attr_t      *attr)
 {
-       lck_spin_t      *lck;
+       lck_spin_t      *lck;
 
-       if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) {
+       if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
                lck_spin_init(lck, grp, attr);
-       }
 
-       return lck;
+       return(lck);
 }
 
 /*
@@ -271,8 +270,8 @@ lck_spin_alloc_init(
  */
 void
 lck_spin_free(
-       lck_spin_t      *lck,
-       lck_grp_t       *grp)
+       lck_spin_t      *lck,
+       lck_grp_t       *grp)
 {
        lck_spin_destroy(lck, grp);
        kfree(lck, sizeof(lck_spin_t));
@@ -283,13 +282,15 @@ lck_spin_free(
  */
 void
 lck_spin_init(
-       lck_spin_t      *lck,
-       lck_grp_t       *grp,
-       __unused lck_attr_t     *attr)
+       lck_spin_t      *lck,
+       lck_grp_t       *grp,
+       __unused lck_attr_t     *attr)
 {
        usimple_lock_init((usimple_lock_t) lck, 0);
-       lck_grp_reference(grp);
-       lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
+       if (grp) {
+               lck_grp_reference(grp);
+               lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
+       }
 }
 
 /*
@@ -297,15 +298,16 @@ lck_spin_init(
  */
 void
 lck_spin_destroy(
-       lck_spin_t      *lck,
-       lck_grp_t       *grp)
+       lck_spin_t      *lck,
+       lck_grp_t       *grp)
 {
-       if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
+       if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
                return;
-       }
        lck->interlock = LCK_SPIN_TAG_DESTROYED;
-       lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
-       lck_grp_deallocate(grp);
+       if (grp) {
+               lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
+               lck_grp_deallocate(grp);
+       }
        return;
 }
 
@@ -314,8 +316,8 @@ lck_spin_destroy(
  */
 void
 lck_spin_lock_grp(
-       lck_spin_t      *lck,
-       lck_grp_t       *grp)
+       lck_spin_t      *lck,
+       lck_grp_t       *grp)
 {
 #pragma unused(grp)
        usimple_lock((usimple_lock_t) lck, grp);
@@ -323,7 +325,7 @@ lck_spin_lock_grp(
 
 void
 lck_spin_lock(
-       lck_spin_t      *lck)
+       lck_spin_t      *lck)
 {
        usimple_lock((usimple_lock_t) lck, NULL);
 }
@@ -333,24 +335,24 @@ lck_spin_lock(
  */
 void
 lck_spin_unlock(
-       lck_spin_t      *lck)
+       lck_spin_t      *lck)
 {
        usimple_unlock((usimple_lock_t) lck);
 }
 
 boolean_t
 lck_spin_try_lock_grp(
-       lck_spin_t      *lck,
-       lck_grp_t       *grp)
+       lck_spin_t      *lck,
+       lck_grp_t       *grp)
 {
 #pragma unused(grp)
        boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
-#if     DEVELOPMENT || DEBUG
+#if    DEVELOPMENT || DEBUG
        if (lrval) {
                pltrace(FALSE);
        }
 #endif
-       return lrval;
+       return(lrval);
 }
 
 
@@ -359,15 +361,15 @@ lck_spin_try_lock_grp(
  */
 boolean_t
 lck_spin_try_lock(
-       lck_spin_t      *lck)
+       lck_spin_t      *lck)
 {
        boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
-#if     DEVELOPMENT || DEBUG
+#if    DEVELOPMENT || DEBUG
        if (lrval) {
                pltrace(FALSE);
        }
 #endif
-       return lrval;
+       return(lrval);
 }
 
 /*
@@ -397,8 +399,6 @@ lck_spin_assert(lck_spin_t *lock, unsigned int type)
                if (__improbable(holder != THREAD_NULL)) {
                        if (holder == thread) {
                                panic("Lock owned by current thread %p = %lx", lock, state);
-                       } else {
-                               panic("Lock %p owned by thread %p", lock, holder);
                        }
                }
        }
@@ -410,8 +410,7 @@ lck_spin_assert(lck_spin_t *lock, unsigned int type)
  *      Returns: TRUE if lock is acquired.
  */
 boolean_t
-kdp_lck_spin_is_acquired(lck_spin_t *lck)
-{
+kdp_lck_spin_is_acquired(lck_spin_t *lck) {
        if (not_in_kdp) {
                panic("panic: spinlock acquired check done outside of kernel debugger");
        }
@@ -425,23 +424,21 @@ kdp_lck_spin_is_acquired(lck_spin_t *lck)
  */
 void
 usimple_lock_init(
-       usimple_lock_t  l,
-       __unused unsigned short tag)
+       usimple_lock_t  l,
+       __unused unsigned short tag)
 {
-#ifndef MACHINE_SIMPLE_LOCK
+#ifndef        MACHINE_SIMPLE_LOCK
        USLDBG(usld_lock_init(l, tag));
        hw_lock_init(&l->interlock);
 #else
-       simple_lock_init((simple_lock_t)l, tag);
+       simple_lock_init((simple_lock_t)l,tag);
 #endif
 }
 
 volatile uint32_t spinlock_owner_cpu = ~0;
 volatile usimple_lock_t spinlock_timed_out;
 
-uint32_t
-spinlock_timeout_NMI(uintptr_t thread_addr)
-{
+uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
        uint32_t i;
 
        for (i = 0; i < real_ncpus; i++) {
@@ -467,22 +464,21 @@ spinlock_timeout_NMI(uintptr_t thread_addr)
  */
 void
 (usimple_lock)(
-       usimple_lock_t  l
+       usimple_lock_t  l
        LCK_GRP_ARG(lck_grp_t *grp))
 {
-#ifndef MACHINE_SIMPLE_LOCK
+#ifndef        MACHINE_SIMPLE_LOCK
        DECL_PC(pc);
 
        OBTAIN_PC(pc);
        USLDBG(usld_lock_pre(l, pc));
 
-       if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
+       if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0))   {
                boolean_t uslock_acquired = FALSE;
                while (machine_timeout_suspended()) {
                        enable_preemption();
-                       if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) {
+                       if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp)))
                                break;
-                       }
                }
 
                if (uslock_acquired == FALSE) {
@@ -491,11 +487,11 @@ void
                        spinlock_timed_out = l;
                        lock_cpu = spinlock_timeout_NMI(lowner);
                        panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
-                           l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
+                             l, lowner,  current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
                }
        }
 #if DEVELOPMENT || DEBUG
-       pltrace(FALSE);
+               pltrace(FALSE);
 #endif
 
        USLDBG(usld_lock_post(l, pc));
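
When the TSC-bounded acquisition above times out, the code does not panic at once: while machine_timeout_suspended() reports that timeouts are administratively suspended (for example under the debugger), it re-enables preemption and retries, and only a final failure records the owner and panics. A hedged stand-alone sketch of that retry shape; hw_lock_to() and machine_timeout_suspended() are kernel-internal, so the stubs below are invented:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Invented stubs standing in for hw_lock_to() / machine_timeout_suspended(). */
static bool try_lock_with_timeout(uint64_t timeout) { (void)timeout; return true; }
static bool timeouts_suspended(void) { return false; }

static void
timed_spin_acquire(uint64_t timeout)
{
    if (!try_lock_with_timeout(timeout)) {
        bool acquired = false;
        /* Timeouts may be suspended (debugger attached, machine going
         * to sleep): keep retrying rather than declaring a deadlock. */
        while (timeouts_suspended()) {
            if ((acquired = try_lock_with_timeout(timeout))) {
                break;
            }
        }
        if (!acquired) {
            /* The kernel gathers the owner thread and CPU and panics. */
            fprintf(stderr, "spinlock acquisition timed out\n");
            abort();
        }
    }
}

int main(void)
{
    timed_spin_acquire(4000);   /* stub always succeeds; the shape is the point */
    return 0;
}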
@@ -517,15 +513,15 @@ void
  */
 void
 usimple_unlock(
-       usimple_lock_t  l)
+       usimple_lock_t  l)
 {
-#ifndef MACHINE_SIMPLE_LOCK
+#ifndef        MACHINE_SIMPLE_LOCK
        DECL_PC(pc);
 
        OBTAIN_PC(pc);
        USLDBG(usld_unlock(l, pc));
 #if DEVELOPMENT || DEBUG
-       pltrace(TRUE);
+               pltrace(TRUE);
 #endif
        hw_lock_unlock(&l->interlock);
 #else
@@ -548,11 +544,11 @@ usimple_unlock(
  */
 unsigned int
 usimple_lock_try(
-       usimple_lock_t  l,
+       usimple_lock_t  l,
        lck_grp_t *grp)
 {
-#ifndef MACHINE_SIMPLE_LOCK
-       unsigned int    success;
+#ifndef        MACHINE_SIMPLE_LOCK
+       unsigned int    success;
        DECL_PC(pc);
 
        OBTAIN_PC(pc);
@@ -561,48 +557,81 @@ usimple_lock_try(
 #if DEVELOPMENT || DEBUG
                pltrace(FALSE);
 #endif
-               USLDBG(usld_lock_try_post(l, pc));
+       USLDBG(usld_lock_try_post(l, pc));
        }
        return success;
 #else
-       return simple_lock_try((simple_lock_t)l, grp);
+       return(simple_lock_try((simple_lock_t)l, grp));
 #endif
 }
 
 /*
- * Acquire a usimple_lock while polling for pending TLB flushes
+ * Acquire a usimple_lock while polling for pending cpu signals
  * and spinning on a lock.
  *
  */
-void
-usimple_lock_try_lock_loop(usimple_lock_t l, lck_grp_t *grp)
+unsigned int
+(usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
+       uint64_t deadline
+       LCK_GRP_ARG(lck_grp_t *grp))
 {
        boolean_t istate = ml_get_interrupts_enabled();
+
+       if (deadline < mach_absolute_time()) {
+               return 0;
+       }
+
        while (!simple_lock_try(l, grp)) {
-               if (!istate) {
-                       handle_pending_TLB_flushes();
+               if (!istate)
+                       cpu_signal_handler(NULL);
+               
+               if (deadline < mach_absolute_time()) {
+                       return 0;
                }
+
                cpu_pause();
        }
+
+       return 1;
+}
+
+void
+(usimple_lock_try_lock_loop)(usimple_lock_t l
+       LCK_GRP_ARG(lck_grp_t *grp))
+{
+       usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
 }
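
usimple_lock_try_lock_loop() above is just the deadline variant saturated to ULLONG_MAX. The point of the deadline variant is that a CPU spinning with interrupts disabled still drains pending cross-CPU signals, so the sender of those signals cannot deadlock against the spin. A self-contained sketch under invented names (mach_absolute_time() and cpu_signal_handler() are kernel-only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static uint64_t fake_time;                      /* stands in for mach_absolute_time() */
static void drain_pending_cpu_signals(void) { } /* stands in for cpu_signal_handler(NULL) */

/* Shape of usimple_lock_try_lock_mp_signal_safe_loop_deadline(): returns
 * 0 if the deadline passes before the lock is won, 1 on acquisition. */
static int
try_lock_until(atomic_flag *lock, uint64_t deadline, bool interrupts_enabled)
{
    if (deadline < fake_time) {
        return 0;
    }
    while (atomic_flag_test_and_set_explicit(lock, memory_order_acquire)) {
        if (!interrupts_enabled) {
            drain_pending_cpu_signals();  /* keep signalling CPUs unblocked */
        }
        if (deadline < fake_time) {
            return 0;
        }
        fake_time++;                      /* sketch: time advances as we spin */
    }
    return 1;
}

int main(void)
{
    atomic_flag lock = ATOMIC_FLAG_INIT;
    return try_lock_until(&lock, 100, true) ? 0 : 1;
}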
 
-#if     USLOCK_DEBUG
+unsigned int
+(usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
+       uint64_t duration
+       LCK_GRP_ARG(lck_grp_t *grp))
+{
+       uint64_t deadline;
+       uint64_t base_at = mach_absolute_time();
+       uint64_t duration_at;
+
+       nanoseconds_to_absolutetime(duration, &duration_at);
+       deadline = base_at + duration_at;
+       if (deadline < base_at) {
+               /* deadline has overflowed, make it saturate */
+               deadline = ULLONG_MAX;
+       }
+
+       return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
+}
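
The duration variant only converts nanoseconds into an absolute deadline, saturating on overflow: since unsigned 64-bit addition wraps modulo 2^64, base + d < base holds exactly when the sum overflowed. The check in isolation (the timebase conversion performed by nanoseconds_to_absolutetime() is elided; units are assumed pre-converted):

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* base_at + duration_at, clamped to ULLONG_MAX ("never") on wrap. */
static uint64_t
saturating_deadline(uint64_t base_at, uint64_t duration_at)
{
    uint64_t deadline = base_at + duration_at;
    if (deadline < base_at) {   /* unsigned wrap => overflow */
        deadline = ULLONG_MAX;
    }
    return deadline;
}

int main(void)
{
    printf("%llu\n", (unsigned long long)saturating_deadline(10, 5));
    printf("%llu\n", (unsigned long long)saturating_deadline(UINT64_MAX - 1, 5));
    return 0;
}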
+
+#if    USLOCK_DEBUG
 /*
  *     States of a usimple_lock.  The default when initializing
  *     a usimple_lock is setting it up for debug checking.
  */
-#define USLOCK_CHECKED          0x0001          /* lock is being checked */
-#define USLOCK_TAKEN            0x0002          /* lock has been taken */
-#define USLOCK_INIT             0xBAA0          /* lock has been initialized */
-#define USLOCK_INITIALIZED      (USLOCK_INIT|USLOCK_CHECKED)
-#define USLOCK_CHECKING(l)      (uslock_check &&                        \
-                                ((l)->debug.state & USLOCK_CHECKED))
-
-/*
- *     Trace activities of a particularly interesting lock.
- */
-void    usl_trace(usimple_lock_t, int, pc_t, const char *);
-
+#define        USLOCK_CHECKED          0x0001          /* lock is being checked */
+#define        USLOCK_TAKEN            0x0002          /* lock has been taken */
+#define        USLOCK_INIT             0xBAA0          /* lock has been initialized */
+#define        USLOCK_INITIALIZED      (USLOCK_INIT|USLOCK_CHECKED)
+#define        USLOCK_CHECKING(l)      (uslock_check &&                        \
+                                ((l)->debug.state & USLOCK_CHECKED))
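
These state bits compose: USLOCK_INITIALIZED is INIT|CHECKED, and USLOCK_TAKEN is toggled around each hold, which is why usld_lock_post() can assert that masking TAKEN off must yield exactly USLOCK_INITIALIZED. A tiny stand-alone check of that invariant:

#include <stdint.h>
#include <stdio.h>

#define USLOCK_CHECKED     0x0001
#define USLOCK_TAKEN       0x0002
#define USLOCK_INIT        0xBAA0
#define USLOCK_INITIALIZED (USLOCK_INIT | USLOCK_CHECKED)

int main(void)
{
    uint16_t state = USLOCK_INITIALIZED;

    state |= USLOCK_TAKEN;      /* acquire */
    printf("%s\n", ((state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)
        ? "consistent" : "became uninitialized");
    state &= (uint16_t)~USLOCK_TAKEN;   /* release */
    return 0;
}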
 
 /*
  *     Initialize the debugging information contained
@@ -610,12 +639,11 @@ void    usl_trace(usimple_lock_t, int, pc_t, const char *);
  */
 void
 usld_lock_init(
-       usimple_lock_t  l,
-       __unused unsigned short tag)
+       usimple_lock_t  l,
+       __unused unsigned short tag)
 {
-       if (l == USIMPLE_LOCK_NULL) {
+       if (l == USIMPLE_LOCK_NULL)
                panic("lock initialization:  null lock pointer");
-       }
        l->lock_type = USLOCK_TAG;
        l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
        l->debug.lock_cpu = l->debug.unlock_cpu = 0;
@@ -634,18 +662,15 @@ usld_lock_init(
  */
 int
 usld_lock_common_checks(
-       usimple_lock_t  l,
-       char            *caller)
+       usimple_lock_t  l,
+       char            *caller)
 {
-       if (l == USIMPLE_LOCK_NULL) {
+       if (l == USIMPLE_LOCK_NULL)
                panic("%s:  null lock pointer", caller);
-       }
-       if (l->lock_type != USLOCK_TAG) {
+       if (l->lock_type != USLOCK_TAG)
                panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
-       }
-       if (!(l->debug.state & USLOCK_INIT)) {
+       if (!(l->debug.state & USLOCK_INIT))
                panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
-       }
        return USLOCK_CHECKING(l);
 }
 
@@ -657,15 +682,14 @@ usld_lock_common_checks(
 /* ARGSUSED */
 void
 usld_lock_pre(
-       usimple_lock_t  l,
-       pc_t            pc)
+       usimple_lock_t  l,
+       pc_t            pc)
 {
-       char    caller[] = "usimple_lock";
+       char    caller[] = "usimple_lock";
 
 
-       if (!usld_lock_common_checks(l, caller)) {
+       if (!usld_lock_common_checks(l, caller))
                return;
-       }
 
 /*
 *     Note that we have a weird case where we are getting a lock when we are
@@ -678,13 +702,12 @@ usld_lock_pre(
        if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
            l->debug.lock_thread == (void *) current_thread()) {
                printf("%s:  lock %p already locked (at %p) by",
-                   caller, l, l->debug.lock_pc);
+                     caller, l, l->debug.lock_pc);
                printf(" current thread %p (new attempt at pc %p)\n",
-                   l->debug.lock_thread, pc);
+                      l->debug.lock_thread, pc);
                panic("%s", caller);
        }
        mp_disable_preemption();
-       usl_trace(l, cpu_number(), pc, caller);
        mp_enable_preemption();
 }
 
@@ -697,33 +720,28 @@ usld_lock_pre(
  */
 void
 usld_lock_post(
-       usimple_lock_t  l,
-       pc_t            pc)
+       usimple_lock_t  l,
+       pc_t            pc)
 {
-       int     mycpu;
-       char    caller[] = "successful usimple_lock";
+       int     mycpu;
+       char    caller[] = "successful usimple_lock";
 
 
-       if (!usld_lock_common_checks(l, caller)) {
+       if (!usld_lock_common_checks(l, caller))
                return;
-       }
 
-       if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
+       if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
                panic("%s:  lock %p became uninitialized",
-                   caller, l);
-       }
-       if ((l->debug.state & USLOCK_TAKEN)) {
+                     caller, l);
+       if ((l->debug.state & USLOCK_TAKEN))
                panic("%s:  lock 0x%p became TAKEN by someone else",
-                   caller, l);
-       }
+                     caller, l);
 
        mycpu = cpu_number();
        l->debug.lock_thread = (void *)current_thread();
        l->debug.state |= USLOCK_TAKEN;
        l->debug.lock_pc = pc;
        l->debug.lock_cpu = mycpu;
-
-       usl_trace(l, mycpu, pc, caller);
 }
 
 
@@ -737,34 +755,30 @@ usld_lock_post(
  */
 void
 usld_unlock(
-       usimple_lock_t  l,
-       pc_t            pc)
+       usimple_lock_t  l,
+       pc_t            pc)
 {
-       int     mycpu;
-       char    caller[] = "usimple_unlock";
+       int     mycpu;
+       char    caller[] = "usimple_unlock";
 
 
-       if (!usld_lock_common_checks(l, caller)) {
+       if (!usld_lock_common_checks(l, caller))
                return;
-       }
 
        mycpu = cpu_number();
 
-       if (!(l->debug.state & USLOCK_TAKEN)) {
+       if (!(l->debug.state & USLOCK_TAKEN))
                panic("%s:  lock 0x%p hasn't been taken",
-                   caller, l);
-       }
-       if (l->debug.lock_thread != (void *) current_thread()) {
+                     caller, l);
+       if (l->debug.lock_thread != (void *) current_thread())
                panic("%s:  unlocking lock 0x%p, owned by thread %p",
-                   caller, l, l->debug.lock_thread);
-       }
+                     caller, l, l->debug.lock_thread);
        if (l->debug.lock_cpu != mycpu) {
                printf("%s:  unlocking lock 0x%p on cpu 0x%x",
-                   caller, l, mycpu);
+                      caller, l, mycpu);
                printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
                panic("%s", caller);
        }
-       usl_trace(l, mycpu, pc, caller);
 
        l->debug.unlock_thread = l->debug.lock_thread;
        l->debug.lock_thread = INVALID_PC;
@@ -782,17 +796,13 @@ usld_unlock(
  */
 void
 usld_lock_try_pre(
-       usimple_lock_t  l,
-       pc_t            pc)
+       usimple_lock_t  l,
+       __unused pc_t   pc)
 {
-       char    caller[] = "usimple_lock_try";
+       char    caller[] = "usimple_lock_try";
 
-       if (!usld_lock_common_checks(l, caller)) {
+       if (!usld_lock_common_checks(l, caller))
                return;
-       }
-       mp_disable_preemption();
-       usl_trace(l, cpu_number(), pc, caller);
-       mp_enable_preemption();
 }
 
 
@@ -806,79 +816,45 @@ usld_lock_try_pre(
  */
 void
 usld_lock_try_post(
-       usimple_lock_t  l,
-       pc_t            pc)
+       usimple_lock_t  l,
+       pc_t            pc)
 {
-       int     mycpu;
-       char    caller[] = "successful usimple_lock_try";
+       int     mycpu;
+       char    caller[] = "successful usimple_lock_try";
 
-       if (!usld_lock_common_checks(l, caller)) {
+       if (!usld_lock_common_checks(l, caller))
                return;
-       }
 
-       if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
+       if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
                panic("%s:  lock 0x%p became uninitialized",
-                   caller, l);
-       }
-       if ((l->debug.state & USLOCK_TAKEN)) {
+                     caller, l);
+       if ((l->debug.state & USLOCK_TAKEN))
                panic("%s:  lock 0x%p became TAKEN by someone else",
-                   caller, l);
-       }
+                     caller, l);
 
        mycpu = cpu_number();
        l->debug.lock_thread = (void *) current_thread();
        l->debug.state |= USLOCK_TAKEN;
        l->debug.lock_pc = pc;
        l->debug.lock_cpu = mycpu;
-
-       usl_trace(l, mycpu, pc, caller);
 }
-
-
-/*
- *     For very special cases, set traced_lock to point to a
- *     specific lock of interest.  The result is a series of
- *     XPRs showing lock operations on that lock.  The lock_seq
- *     value is used to show the order of those operations.
- */
-usimple_lock_t          traced_lock;
-unsigned int            lock_seq;
-
-void
-usl_trace(
-       usimple_lock_t  l,
-       int             mycpu,
-       pc_t            pc,
-       const char *    op_name)
-{
-       if (traced_lock == l) {
-               XPR(XPR_SLOCK,
-                   "seq %d, cpu %d, %s @ %x\n",
-                   (uintptr_t) lock_seq, (uintptr_t) mycpu,
-                   (uintptr_t) op_name, (uintptr_t) pc, 0);
-               lock_seq++;
-       }
-}
-
-
-#endif  /* USLOCK_DEBUG */
+#endif /* USLOCK_DEBUG */
 
 /*
  *      Routine:        lck_rw_alloc_init
  */
 lck_rw_t *
 lck_rw_alloc_init(
-       lck_grp_t       *grp,
-       lck_attr_t      *attr)
-{
-       lck_rw_t        *lck;
+       lck_grp_t       *grp,
+       lck_attr_t      *attr) {
+       lck_rw_t        *lck;
 
        if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
                bzero(lck, sizeof(lck_rw_t));
                lck_rw_init(lck, grp, attr);
        }
 
-       return lck;
+       return(lck);
 }
 
 /*
@@ -886,9 +862,8 @@ lck_rw_alloc_init(
  */
 void
 lck_rw_free(
-       lck_rw_t        *lck,
-       lck_grp_t       *grp)
-{
+       lck_rw_t        *lck,
+       lck_grp_t       *grp) {
        lck_rw_destroy(lck, grp);
        kfree(lck, sizeof(lck_rw_t));
 }
@@ -898,12 +873,12 @@ lck_rw_free(
  */
 void
 lck_rw_init(
-       lck_rw_t        *lck,
-       lck_grp_t       *grp,
-       lck_attr_t      *attr)
+       lck_rw_t        *lck,
+       lck_grp_t       *grp,
+       lck_attr_t      *attr)
 {
-       lck_attr_t      *lck_attr = (attr != LCK_ATTR_NULL) ?
-           attr : &LockDefaultLckAttr;
+       lck_attr_t      *lck_attr = (attr != LCK_ATTR_NULL) ?
+                                       attr : &LockDefaultLckAttr;
 
        hw_lock_byte_init(&lck->lck_rw_interlock);
        lck->lck_rw_want_write = FALSE;
@@ -913,7 +888,7 @@ lck_rw_init(
        lck->lck_r_waiting = lck->lck_w_waiting = 0;
        lck->lck_rw_tag = 0;
        lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
-           LCK_ATTR_RW_SHARED_PRIORITY) == 0);
+                               LCK_ATTR_RW_SHARED_PRIORITY) == 0);
 
        lck_grp_reference(grp);
        lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
@@ -924,12 +899,11 @@ lck_rw_init(
  */
 void
 lck_rw_destroy(
-       lck_rw_t        *lck,
-       lck_grp_t       *grp)
+       lck_rw_t        *lck,
+       lck_grp_t       *grp)
 {
-       if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
+       if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
                return;
-       }
 #if MACH_LDEBUG
        lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
 #endif
@@ -955,7 +929,7 @@ lck_rw_destroy(
 static inline boolean_t
 lck_interlock_lock(lck_rw_t *lck)
 {
-       boolean_t       istate;
+       boolean_t       istate;
 
        istate = ml_set_interrupts_enabled(FALSE);
        hw_lock_byte_lock(&lck->lck_rw_interlock);
@@ -978,18 +952,16 @@ lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
 static inline void
 lck_rw_lock_pause(boolean_t interrupts_enabled)
 {
-       if (!interrupts_enabled) {
+       if (!interrupts_enabled)
                handle_pending_TLB_flushes();
-       }
        cpu_pause();
 }
 
 static inline boolean_t
 lck_rw_held_read_or_upgrade(lck_rw_t *lock)
 {
-       if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {
+       if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE))
                return TRUE;
-       }
        return FALSE;
 }
 
@@ -1004,7 +976,7 @@ lck_rw_deadline_for_spin(lck_rw_t *lck)
                if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
                        /*
                         * there are already threads waiting on this lock... this
-                        * implies that they have spun beyond their deadlines waiting for
+                        * implies that they have spun beyond their deadlines waiting for 
                         * the desired state to show up so we will not bother spinning at this time...
                         *   or
                         * the current number of threads sharing this lock exceeds our capacity to run them
@@ -1012,12 +984,11 @@ lck_rw_deadline_for_spin(lck_rw_t *lck)
                         * to be at 0, we'll not bother spinning since the latency for this to happen is
                         * unpredictable...
                         */
-                       return mach_absolute_time();
+                       return (mach_absolute_time());
                }
-               return mach_absolute_time() + MutexSpin;
-       } else {
-               return mach_absolute_time() + (1LL * 1000000000LL);
-       }
+               return (mach_absolute_time() + MutexSpin);
+       } else
+               return (mach_absolute_time() + (100000LL * 1000000000LL));
 }
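
lck_rw_deadline_for_spin() encodes a three-tier policy: a lock that may not sleep must spin indefinitely; a sleepable lock with queued waiters (or more readers than CPUs can run) should not spin at all; otherwise it spins for the short adaptive MutexSpin window. A sketch of the decision under assumed names; the "indefinite" encoding is illustrative (the kernel uses a very large offset rather than UINT64_MAX):

#include <stdbool.h>
#include <stdint.h>

static uint64_t now;                    /* stands in for mach_absolute_time() */
static const uint64_t mutex_spin = 1000;

static uint64_t
spin_deadline(bool can_sleep, bool have_waiters,
    uint32_t shared_count, uint32_t max_cpus)
{
    if (!can_sleep) {
        return UINT64_MAX;              /* blocking forbidden: spin forever */
    }
    if (have_waiters || shared_count > max_cpus) {
        return now;                     /* already expired: block immediately */
    }
    return now + mutex_spin;            /* bounded adaptive spin */
}

int main(void)
{
    return spin_deadline(true, false, 2, 8) == now + mutex_spin ? 0 : 1;
}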
 
 
@@ -1036,13 +1007,12 @@ lck_rw_interlock_spin(lck_rw_t *lock)
 static boolean_t
 lck_rw_grab_want(lck_rw_t *lock)
 {
-       uint32_t        data, prev;
+       uint32_t        data, prev;
 
-       for (;;) {
+       for ( ; ; ) {
                data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
-               if ((data & LCK_RW_INTERLOCK) == 0) {
+               if ((data & LCK_RW_INTERLOCK) == 0)
                        break;
-               }
                atomic_exchange_abort();
                lck_rw_interlock_spin(lock);
        }
@@ -1057,13 +1027,12 @@ lck_rw_grab_want(lck_rw_t *lock)
 static boolean_t
 lck_rw_grab_shared(lck_rw_t *lock)
 {
-       uint32_t        data, prev;
+       uint32_t        data, prev;
 
-       for (;;) {
+       for ( ; ; ) {
                data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
-               if ((data & LCK_RW_INTERLOCK) == 0) {
+               if ((data & LCK_RW_INTERLOCK) == 0)
                        break;
-               }
                atomic_exchange_abort();
                lck_rw_interlock_spin(lock);
        }
@@ -1082,19 +1051,19 @@ lck_rw_grab_shared(lck_rw_t *lock)
  */
 static void
 lck_rw_lock_exclusive_gen(
-       lck_rw_t        *lck)
+       lck_rw_t        *lck)
 {
-       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
-       uint64_t        deadline = 0;
-       int             slept = 0;
-       int             gotlock = 0;
-       int             lockheld = 0;
-       wait_result_t   res = 0;
-       boolean_t       istate = -1;
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
+       uint64_t        deadline = 0;
+       int             slept = 0;
+       int             gotlock = 0;
+       int             lockheld = 0;
+       wait_result_t   res = 0;
+       boolean_t       istate = -1;
 
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        boolean_t dtrace_ls_initialized = FALSE;
-       boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
+       boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
        uint64_t wait_interval = 0;
        int readers_at_sleep = 0;
 #endif
@@ -1102,8 +1071,9 @@ lck_rw_lock_exclusive_gen(
        /*
         *      Try to acquire the lck_rw_want_write bit.
         */
-       while (!lck_rw_grab_want(lck)) {
-#if     CONFIG_DTRACE
+       while ( !lck_rw_grab_want(lck)) {
+
+#if    CONFIG_DTRACE
                if (dtrace_ls_initialized == FALSE) {
                        dtrace_ls_initialized = TRUE;
                        dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
@@ -1119,39 +1089,38 @@ lck_rw_lock_exclusive_gen(
                        }
                }
 #endif
-               if (istate == -1) {
+               if (istate == -1)
                        istate = ml_get_interrupts_enabled();
-               }
 
                deadline = lck_rw_deadline_for_spin(lck);
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
 
-               while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
+               while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
                        lck_rw_lock_pause(istate);
-               }
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
 
-               if (gotlock) {
+               if (gotlock)
                        break;
-               }
                /*
                 * if we get here, the deadline has expired w/o us
                 * being able to grab the lock exclusively
                 * check to see if we're allowed to do a thread_block
                 */
                if (lck->lck_rw_can_sleep) {
+
                        istate = lck_interlock_lock(lck);
 
                        if (lck->lck_rw_want_write) {
+
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
 
                                lck->lck_w_waiting = TRUE;
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
                                res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
-                                   THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);
 
                                if (res == THREAD_WAITING) {
@@ -1178,7 +1147,8 @@ lck_rw_lock_exclusive_gen(
         * and the interlock not held, we are safe to proceed
         */
        while (lck_rw_held_read_or_upgrade(lck)) {
-#if     CONFIG_DTRACE
+
+#if    CONFIG_DTRACE
                /*
                 * Either sleeping or spinning is happening, start
                 * a timing of our delay interval now.  If we set it
@@ -1200,29 +1170,27 @@ lck_rw_lock_exclusive_gen(
                        }
                }
 #endif
-               if (istate == -1) {
+               if (istate == -1)
                        istate = ml_get_interrupts_enabled();
-               }
 
                deadline = lck_rw_deadline_for_spin(lck);
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
 
-               while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
+               while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
                        lck_rw_lock_pause(istate);
-               }
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
 
-               if (!lockheld) {
+               if ( !lockheld)
                        break;
-               }
                /*
                 * if we get here, the deadline has expired w/o us
                 * being able to grab the lock exclusively
                 * check to see if we're allowed to do a thread_block
                 */
                if (lck->lck_rw_can_sleep) {
+
                        istate = lck_interlock_lock(lck);
 
                        if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
@@ -1232,7 +1200,7 @@ lck_rw_lock_exclusive_gen(
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
                                res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
-                                   THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);
 
                                if (res == THREAD_WAITING) {
@@ -1252,7 +1220,7 @@ lck_rw_lock_exclusive_gen(
                }
        }
 
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        /*
         * Decide what latencies we suffered that are Dtrace events.
         * If we have set wait_interval, then we either spun or slept.
@@ -1286,46 +1254,40 @@ lck_rw_lock_exclusive_gen(
  *      Routine:        lck_rw_done
  */
 
-lck_rw_type_t
-lck_rw_done(lck_rw_t *lock)
+lck_rw_type_t lck_rw_done(lck_rw_t *lock)
 {
-       uint32_t        data, prev;
+       uint32_t        data, prev;
 
-       for (;;) {
+       for ( ; ; ) {
                data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
-               if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
+               if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
                        atomic_exchange_abort();
                        lck_rw_interlock_spin(lock);
                        continue;
                }
                if (data & LCK_RW_SHARED_MASK) {
                        data -= LCK_RW_SHARED_READER;
-                       if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
+                       if ((data & LCK_RW_SHARED_MASK) == 0)   /* if reader count has now gone to 0, check for waiters */
                                goto check_waiters;
-                       }
-               } else {                                        /* if reader count == 0, must be exclusive lock */
+               } else {                                        /* if reader count == 0, must be exclusive lock */
                        if (data & LCK_RW_WANT_UPGRADE) {
                                data &= ~(LCK_RW_WANT_UPGRADE);
                        } else {
-                               if (data & LCK_RW_WANT_WRITE) {
+                               if (data & LCK_RW_WANT_WRITE)
                                        data &= ~(LCK_RW_WANT_EXCL);
-                               } else {                                /* lock is not 'owned', panic */
+                               else                                    /* lock is not 'owned', panic */
                                        panic("Releasing non-exclusive RW lock without a reader refcount!");
-                               }
                        }
 check_waiters:
                        if (prev & LCK_RW_W_WAITING) {
                                data &= ~(LCK_RW_W_WAITING);
-                               if ((prev & LCK_RW_PRIV_EXCL) == 0) {
+                               if ((prev & LCK_RW_PRIV_EXCL) == 0)
                                        data &= ~(LCK_RW_R_WAITING);
-                               }
-                       } else {
+                       } else
                                data &= ~(LCK_RW_R_WAITING);
-                       }
                }
-               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
+               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
                        break;
-               }
                cpu_pause();
        }
        return lck_rw_done_gen(lock, prev);
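
lck_rw_done() finishes the release with a compare-and-swap and then hands the pre-release word ("prev") to lck_rw_done_gen(), which decides purely from that snapshot whether a waiter must be woken. A sketch of such a decode with invented mask values (xnu's real LCK_RW_* bit layout lives in the locks header and differs):

#include <stdint.h>
#include <stdio.h>

#define RW_SHARED_MASK  0x0000ffffu   /* reader refcount (invented layout) */
#define RW_W_WAITING    0x00040000u
#define RW_R_WAITING    0x00080000u

int main(void)
{
    /* "prev": the lock word captured by the release CAS, i.e. the state
     * before the reader count dropped or the want bit was cleared. */
    uint32_t prev = RW_W_WAITING | 2;   /* two readers, one writer queued */

    if ((prev & RW_SHARED_MASK) == 1 && (prev & RW_W_WAITING)) {
        printf("last reader gone: wake a writer\n");
    } else if (!(prev & RW_W_WAITING) && (prev & RW_R_WAITING)) {
        printf("wake waiting readers\n");
    } else {
        printf("nothing to wake yet\n");  /* a reader remains after this release */
    }
    return 0;
}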
@@ -1336,13 +1298,13 @@ check_waiters:
  *
  *     called from lck_rw_done()
  *     prior_lock_state is the value in the 1st
- *      word of the lock at the time of a successful
+ *     word of the lock at the time of a successful
  *     atomic compare and exchange with the new value...
- *      it represents the state of the lock before we
+ *     it represents the state of the lock before we
  *     decremented the rw_shared_count or cleared either
- *      rw_want_upgrade or rw_want_write and
+ *     rw_want_upgrade or rw_want_write and
  *     the lck_x_waiting bits...  since the wrapper
- *      routine has already changed the state atomically,
+ *     routine has already changed the state atomically, 
  *     we just need to decide if we should
  *     wake up anyone and what value to return... we do
  *     this by examining the state of the lock before
@@ -1410,16 +1372,15 @@ lck_rw_done_gen(
  */
 void
 lck_rw_unlock(
-       lck_rw_t        *lck,
-       lck_rw_type_t   lck_rw_type)
+       lck_rw_t        *lck,
+       lck_rw_type_t   lck_rw_type)
 {
-       if (lck_rw_type == LCK_RW_TYPE_SHARED) {
+       if (lck_rw_type == LCK_RW_TYPE_SHARED)
                lck_rw_unlock_shared(lck);
-       } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
+       else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
                lck_rw_unlock_exclusive(lck);
-       } else {
+       else
                panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
-       }
 }
 
 
@@ -1428,16 +1389,15 @@ lck_rw_unlock(
  */
 void
 lck_rw_unlock_shared(
-       lck_rw_t        *lck)
+       lck_rw_t        *lck)
 {
-       lck_rw_type_t   ret;
+       lck_rw_type_t   ret;
 
        assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
        ret = lck_rw_done(lck);
 
-       if (ret != LCK_RW_TYPE_SHARED) {
+       if (ret != LCK_RW_TYPE_SHARED)
                panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
-       }
 }
 
 
@@ -1446,15 +1406,14 @@ lck_rw_unlock_shared(
  */
 void
 lck_rw_unlock_exclusive(
-       lck_rw_t        *lck)
+       lck_rw_t        *lck)
 {
-       lck_rw_type_t   ret;
+       lck_rw_type_t   ret;
 
        ret = lck_rw_done(lck);
 
-       if (ret != LCK_RW_TYPE_EXCLUSIVE) {
+       if (ret != LCK_RW_TYPE_EXCLUSIVE)
                panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
-       }
 }
 
 
@@ -1463,16 +1422,15 @@ lck_rw_unlock_exclusive(
  */
 void
 lck_rw_lock(
-       lck_rw_t        *lck,
-       lck_rw_type_t   lck_rw_type)
+       lck_rw_t        *lck,
+       lck_rw_type_t   lck_rw_type)
 {
-       if (lck_rw_type == LCK_RW_TYPE_SHARED) {
+       if (lck_rw_type == LCK_RW_TYPE_SHARED)
                lck_rw_lock_shared(lck);
-       } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
+       else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
                lck_rw_lock_exclusive(lck);
-       } else {
+       else
                panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
-       }
 }
 
 /*
@@ -1481,10 +1439,10 @@ lck_rw_lock(
 void
 lck_rw_lock_shared(lck_rw_t *lock)
 {
-       uint32_t        data, prev;
+       uint32_t        data, prev;
 
        current_thread()->rwlock_count++;
-       for (;;) {
+       for ( ; ; ) {
                data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
                if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
                        atomic_exchange_abort();
@@ -1497,14 +1455,13 @@ lck_rw_lock_shared(lck_rw_t *lock)
                        break;
                }
                data += LCK_RW_SHARED_READER;
-               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
+               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
                        break;
-               }
                cpu_pause();
        }
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
-#endif  /* CONFIG_DTRACE */
+#endif /* CONFIG_DTRACE */
        return;
 }
 
@@ -1517,24 +1474,25 @@ lck_rw_lock_shared(lck_rw_t *lock)
  */
 static void
 lck_rw_lock_shared_gen(
-       lck_rw_t        *lck)
+       lck_rw_t        *lck)
 {
-       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
-       uint64_t        deadline = 0;
-       int             gotlock = 0;
-       int             slept = 0;
-       wait_result_t   res = 0;
-       boolean_t       istate = -1;
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
+       uint64_t        deadline = 0;
+       int             gotlock = 0;
+       int             slept = 0;
+       wait_result_t   res = 0;
+       boolean_t       istate = -1;
 
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        uint64_t wait_interval = 0;
        int readers_at_sleep = 0;
        boolean_t dtrace_ls_initialized = FALSE;
        boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
 #endif
 
-       while (!lck_rw_grab_shared(lck)) {
-#if     CONFIG_DTRACE
+       while ( !lck_rw_grab_shared(lck)) {
+
+#if    CONFIG_DTRACE
                if (dtrace_ls_initialized == FALSE) {
                        dtrace_ls_initialized = TRUE;
                        dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
@@ -1550,43 +1508,42 @@ lck_rw_lock_shared_gen(
                        }
                }
 #endif
-               if (istate == -1) {
+               if (istate == -1)
                        istate = ml_get_interrupts_enabled();
-               }
 
                deadline = lck_rw_deadline_for_spin(lck);
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
-                   trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
+                            trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
 
-               while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
+               while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
                        lck_rw_lock_pause(istate);
-               }
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
-                   trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
+                            trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
 
-               if (gotlock) {
+               if (gotlock)
                        break;
-               }
                /*
                 * if we get here, the deadline has expired w/o us
                 * being able to grab the lock for read
                 * check to see if we're allowed to do a thread_block
                 */
                if (lck->lck_rw_can_sleep) {
+
                        istate = lck_interlock_lock(lck);
 
                        if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
                            ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
+
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
-                                   trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
+                                            trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
 
                                lck->lck_r_waiting = TRUE;
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
                                res = assert_wait(RW_LOCK_READER_EVENT(lck),
-                                   THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);
 
                                if (res == THREAD_WAITING) {
@@ -1594,7 +1551,7 @@ lck_rw_lock_shared_gen(
                                        slept++;
                                }
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
-                                   trace_lck, res, slept, 0, 0);
+                                            trace_lck, res, slept, 0, 0);
                        } else {
                                lck->lck_rw_shared_count++;
                                lck_interlock_unlock(lck, istate);
@@ -1603,7 +1560,7 @@ lck_rw_lock_shared_gen(
                }
        }
 
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        if (dtrace_ls_enabled == TRUE) {
                if (slept == 0) {
                        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
@@ -1627,27 +1584,28 @@ lck_rw_lock_exclusive(lck_rw_t *lock)
 {
        current_thread()->rwlock_count++;
        if (atomic_test_and_set32(&lock->data,
-           (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
-           LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
-#if     CONFIG_DTRACE
+               (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
+               LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
+#if    CONFIG_DTRACE
                LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
-#endif  /* CONFIG_DTRACE */
-       } else {
+#endif /* CONFIG_DTRACE */
+       } else
                lck_rw_lock_exclusive_gen(lock);
-       }
 }
 
 
 /*
  *     Routine:        lck_rw_lock_shared_to_exclusive
+ *
+ *     False is returned upon failure; in this case the shared lock is dropped.
  */
 
 boolean_t
 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
 {
-       uint32_t        data, prev;
+       uint32_t        data, prev;
 
-       for (;;) {
+       for ( ; ; ) {
                data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
                if (data & LCK_RW_INTERLOCK) {
                        atomic_exchange_abort();
@@ -1656,26 +1614,22 @@ lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
                }
                if (data & LCK_RW_WANT_UPGRADE) {
                        data -= LCK_RW_SHARED_READER;
-                       if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
-                               data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
-                       }
-                       if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
+                       if ((data & LCK_RW_SHARED_MASK) == 0)           /* we were the last reader */
+                               data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
+                       if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
                                return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
-                       }
                } else {
-                       data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
-                       data -= LCK_RW_SHARED_READER;           /* and shed our read count */
-                       if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
+                       data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
+                       data -= LCK_RW_SHARED_READER;           /* and shed our read count */
+                       if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
                                break;
-                       }
                }
                cpu_pause();
        }
-       /* we now own the WANT_UPGRADE */
-       if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
-               lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
-       }
-#if     CONFIG_DTRACE
+                                               /* we now own the WANT_UPGRADE */
+       if (data & LCK_RW_SHARED_MASK)          /* check to see if all of the readers are drained */
+               lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
+#if    CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
 #endif
        return TRUE;
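
The upgrade path above has exactly two outcomes: if another thread already owns LCK_RW_WANT_UPGRADE, the caller sheds its read count and fails (so the shared hold is gone); otherwise it claims LCK_RW_WANT_UPGRADE, sheds its read count, and waits for any remaining readers to drain. A hedged sketch of that decision with C11 atomics and invented masks:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define RW_SHARED_MASK  0x0000ffffu   /* reader refcount (invented layout) */
#define RW_WANT_UPGRADE 0x00010000u

/* true: caller owns the upgrade (must_wait says whether readers remain);
 * false: another upgrader won, and our read hold was dropped. */
static bool
try_upgrade(_Atomic uint32_t *lockword, bool *must_wait)
{
    uint32_t prev = atomic_load_explicit(lockword, memory_order_relaxed);
    for (;;) {
        uint32_t next;
        if (prev & RW_WANT_UPGRADE) {
            next = prev - 1;            /* lost the race: shed our read count */
        } else {
            next = (prev - 1) | RW_WANT_UPGRADE;
        }
        if (atomic_compare_exchange_weak_explicit(lockword, &prev, next,
            memory_order_acquire, memory_order_relaxed)) {
            if (prev & RW_WANT_UPGRADE) {
                return false;           /* shared lock dropped */
            }
            *must_wait = (next & RW_SHARED_MASK) != 0;
            return true;
        }
    }
}

int main(void)
{
    _Atomic uint32_t word = 3;          /* three readers; we are one of them */
    bool wait = false;
    return (try_upgrade(&word, &wait) && wait) ? 0 : 1;
}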
@@ -1692,12 +1646,12 @@ lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
  */
 static boolean_t
 lck_rw_lock_shared_to_exclusive_failure(
-       lck_rw_t        *lck,
-       uint32_t        prior_lock_state)
+       lck_rw_t        *lck,
+       uint32_t        prior_lock_state)
 {
-       lck_rw_t        *fake_lck;
-       thread_t        thread = current_thread();
-       uint32_t        rwlock_count;
+       lck_rw_t        *fake_lck;
+       thread_t        thread = current_thread();
+       uint32_t        rwlock_count;
 
        /* Check if dropping the lock means that we need to unpromote */
        rwlock_count = thread->rwlock_count--;
@@ -1723,9 +1677,9 @@ lck_rw_lock_shared_to_exclusive_failure(
        }
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
-           VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
+                    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
 
-       return FALSE;
+       return (FALSE);
 }
 
 
@@ -1739,16 +1693,16 @@ lck_rw_lock_shared_to_exclusive_failure(
  */
 static boolean_t
 lck_rw_lock_shared_to_exclusive_success(
-       lck_rw_t        *lck)
+       lck_rw_t        *lck)
 {
-       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
-       uint64_t        deadline = 0;
-       int             slept = 0;
-       int             still_shared = 0;
-       wait_result_t   res;
-       boolean_t       istate = -1;
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
+       uint64_t        deadline = 0;
+       int             slept = 0;
+       int             still_shared = 0;
+       wait_result_t   res;
+       boolean_t       istate = -1;
 
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        uint64_t wait_interval = 0;
        int readers_at_sleep = 0;
        boolean_t dtrace_ls_initialized = FALSE;
@@ -1756,7 +1710,8 @@ lck_rw_lock_shared_to_exclusive_success(
 #endif
 
        while (lck->lck_rw_shared_count != 0) {
-#if     CONFIG_DTRACE
+
+#if    CONFIG_DTRACE
                if (dtrace_ls_initialized == FALSE) {
                        dtrace_ls_initialized = TRUE;
                        dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
@@ -1772,42 +1727,40 @@ lck_rw_lock_shared_to_exclusive_success(
                        }
                }
 #endif
-               if (istate == -1) {
+               if (istate == -1)
                        istate = ml_get_interrupts_enabled();
-               }
 
                deadline = lck_rw_deadline_for_spin(lck);
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
-                   trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
+                            trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
 
-               while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
+               while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
                        lck_rw_lock_pause(istate);
-               }
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
-                   trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
+                            trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
 
-               if (!still_shared) {
+               if ( !still_shared)
                        break;
-               }
                /*
                 * if we get here, the deadline has expired w/o
                 * the rw_shared_count having drained to 0
                 * check to see if we're allowed to do a thread_block
                 */
                if (lck->lck_rw_can_sleep) {
+
                        istate = lck_interlock_lock(lck);
 
                        if (lck->lck_rw_shared_count != 0) {
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
-                                   trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
+                                            trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
 
                                lck->lck_w_waiting = TRUE;
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
                                res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
-                                   THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);
 
                                if (res == THREAD_WAITING) {
@@ -1815,14 +1768,14 @@ lck_rw_lock_shared_to_exclusive_success(
                                        slept++;
                                }
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
-                                   trace_lck, res, slept, 0, 0);
+                                            trace_lck, res, slept, 0, 0);
                        } else {
                                lck_interlock_unlock(lck, istate);
                                break;
                        }
                }
        }
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        /*
         * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
         */
@@ -1837,37 +1790,33 @@ lck_rw_lock_shared_to_exclusive_success(
        }
        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
 #endif
-       return TRUE;
+       return (TRUE);
 }
 
 /*
  *     Routine:        lck_rw_lock_exclusive_to_shared
  */
 
-void
-lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
+void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
 {
-       uint32_t        data, prev;
+       uint32_t        data, prev;
 
-       for (;;) {
+       for ( ; ; ) {
                data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
                if (data & LCK_RW_INTERLOCK) {
                        atomic_exchange_abort();
-                       lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
+                       lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
                        continue;
                }
                data += LCK_RW_SHARED_READER;
-               if (data & LCK_RW_WANT_UPGRADE) {
+               if (data & LCK_RW_WANT_UPGRADE)
                        data &= ~(LCK_RW_WANT_UPGRADE);
-               } else {
+               else
                        data &= ~(LCK_RW_WANT_EXCL);
-               }
-               if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
+               if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL)))
                        data &= ~(LCK_RW_W_WAITING);
-               }
-               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
+               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
                        break;
-               }
                cpu_pause();
        }
        return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
@@ -1876,7 +1825,7 @@ lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
 
 /*
  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
- *      Function:
+ *     Function:
  *             assembly fast path has already dropped
  *             our exclusive state and bumped lck_rw_shared_count
  *             all we need to do here is determine if anyone
@@ -1884,16 +1833,16 @@ lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
  */
 static void
 lck_rw_lock_exclusive_to_shared_gen(
-       lck_rw_t        *lck,
-       uint32_t        prior_lock_state)
+       lck_rw_t        *lck,
+       uint32_t        prior_lock_state)
 {
-       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
-       lck_rw_t                *fake_lck;
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
+       lck_rw_t                *fake_lck;
 
        fake_lck = (lck_rw_t *)&prior_lock_state;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
-           trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
+                            trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
 
        /*
         * don't wake up anyone waiting to take the lock exclusively
@@ -1903,12 +1852,11 @@ lck_rw_lock_exclusive_to_shared_gen(
         * wake up any waiting readers if we don't have any writers waiting,
         * or the lock is NOT marked as rw_priv_excl (writers have privilege)
         */
-       if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
+       if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
                thread_wakeup(RW_LOCK_READER_EVENT(lck));
-       }
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
-           trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
+                            trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
 
 #if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
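
The downgrade keeps a read hold, so writers are deliberately never woken here; readers are woken unless a writer is queued and the lock is writer-priority (rw_priv_excl). The predicate in isolation:

#include <stdbool.h>
#include <stdio.h>

/* Wake-readers test used on exclusive->shared downgrade: never wake
 * writers (we still hold a read count), and only wake readers when no
 * writer-priority writer is queued ahead of them. */
static bool
should_wake_readers(bool priv_excl, bool w_waiting, bool r_waiting)
{
    return !(priv_excl && w_waiting) && r_waiting;
}

int main(void)
{
    printf("%d\n", should_wake_readers(true,  true,  true));   /* 0 */
    printf("%d\n", should_wake_readers(false, true,  true));   /* 1 */
    printf("%d\n", should_wake_readers(true,  false, true));   /* 1 */
    return 0;
}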
@@ -1921,29 +1869,27 @@ lck_rw_lock_exclusive_to_shared_gen(
  */
 boolean_t
 lck_rw_try_lock(
-       lck_rw_t        *lck,
-       lck_rw_type_t   lck_rw_type)
-{
-       if (lck_rw_type == LCK_RW_TYPE_SHARED) {
-               return lck_rw_try_lock_shared(lck);
-       } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
-               return lck_rw_try_lock_exclusive(lck);
-       } else {
+       lck_rw_t        *lck,
+       lck_rw_type_t   lck_rw_type)
+{
+       if (lck_rw_type == LCK_RW_TYPE_SHARED)
+               return(lck_rw_try_lock_shared(lck));
+       else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
+               return(lck_rw_try_lock_exclusive(lck));
+       else
                panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
-       }
-       return FALSE;
+       return(FALSE);
 }
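
Caller-side, lck_rw_try_lock dispatches on the requested type. A hedged usage
sketch (my_rw stands for an already-initialized lck_rw_t):

        if (lck_rw_try_lock(&my_rw, LCK_RW_TYPE_SHARED)) {
                /* read the shared state */
                lck_rw_done(&my_rw);            /* drops whichever hold was taken */
        } else {
                /* lock busy: defer, or fall back to the blocking lck_rw_lock */
        }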
 
 /*
  *     Routine:        lck_rw_try_lock_shared
  */
 
-boolean_t
-lck_rw_try_lock_shared(lck_rw_t *lock)
+boolean_t lck_rw_try_lock_shared(lck_rw_t *lock)
 {
-       uint32_t        data, prev;
+       uint32_t        data, prev;
 
-       for (;;) {
+       for ( ; ; ) {
                data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
                if (data & LCK_RW_INTERLOCK) {
                        atomic_exchange_abort();
@@ -1952,19 +1898,18 @@ lck_rw_try_lock_shared(lck_rw_t *lock)
                }
                if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
                        atomic_exchange_abort();
-                       return FALSE;                   /* lock is busy */
+                       return FALSE;                   /* lock is busy */
                }
-               data += LCK_RW_SHARED_READER;           /* Increment reader refcount */
-               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
+               data += LCK_RW_SHARED_READER;           /* Increment reader refcount */
+               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
                        break;
-               }
                cpu_pause();
        }
        current_thread()->rwlock_count++;
        /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
-#endif  /* CONFIG_DTRACE */
+#endif /* CONFIG_DTRACE */
        return TRUE;
 }
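
The loop above is the atomic_exchange_begin32/complete32 retry pattern used
throughout this file. A condensed sketch of its shape (cannot_proceed and
apply_update are placeholders for the per-caller test and edit):

        for (;;) {
                data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
                if (cannot_proceed(data)) {             /* e.g. interlock or writer bits set */
                        atomic_exchange_abort();        /* abandon this attempt */
                        return FALSE;
                }
                data = apply_update(data);              /* e.g. data += LCK_RW_SHARED_READER */
                if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
                        break;                          /* word still matched prev: update landed */
                cpu_pause();                            /* raced with another CPU; retry */
        }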
 
@@ -1973,12 +1918,11 @@ lck_rw_try_lock_shared(lck_rw_t *lock)
  *     Routine:        lck_rw_try_lock_exclusive
  */
 
-boolean_t
-lck_rw_try_lock_exclusive(lck_rw_t *lock)
+boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock)
 {
-       uint32_t        data, prev;
+       uint32_t        data, prev;
 
-       for (;;) {
+       for ( ; ; ) {
                data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
                if (data & LCK_RW_INTERLOCK) {
                        atomic_exchange_abort();
@@ -1987,27 +1931,26 @@ lck_rw_try_lock_exclusive(lck_rw_t *lock)
                }
                if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
                        atomic_exchange_abort();
-                       return FALSE;                           /* can't get it */
+                       return FALSE;                           /* can't get it */
                }
                data |= LCK_RW_WANT_EXCL;
-               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
+               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
                        break;
-               }
                cpu_pause();
        }
 
        current_thread()->rwlock_count++;
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
-#endif  /* CONFIG_DTRACE */
+#endif /* CONFIG_DTRACE */
        return TRUE;
 }
 
 
 void
 lck_rw_assert(
-       lck_rw_t        *lck,
-       unsigned int    type)
+       lck_rw_t        *lck,
+       unsigned int    type)
 {
        switch (type) {
        case LCK_RW_ASSERT_SHARED:
@@ -2017,7 +1960,7 @@ lck_rw_assert(
                break;
        case LCK_RW_ASSERT_EXCLUSIVE:
                if ((lck->lck_rw_want_write ||
-                   lck->lck_rw_want_upgrade) &&
+                    lck->lck_rw_want_upgrade) &&
                    lck->lck_rw_shared_count == 0) {
                        return;
                }
@@ -2031,8 +1974,8 @@ lck_rw_assert(
                break;
        case LCK_RW_ASSERT_NOTHELD:
                if (!(lck->lck_rw_want_write ||
-                   lck->lck_rw_want_upgrade ||
-                   lck->lck_rw_shared_count != 0)) {
+                         lck->lck_rw_want_upgrade ||
+                         lck->lck_rw_shared_count != 0)) {
                        return;
                }
                break;
@@ -2044,6 +1987,9 @@ lck_rw_assert(
 }
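
A one-line usage sketch for the assert KPI (my_rw is hypothetical):

        lck_rw_assert(&my_rw, LCK_RW_ASSERT_EXCLUSIVE);        /* panics unless held exclusive */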
 
 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
+#if MACH_LDEBUG
+__dead2
+#endif
 void
 lck_rw_clear_promotions_x86(thread_t thread)
 {
@@ -2077,8 +2023,7 @@ lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
  */
 boolean_t
-kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
-{
+kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
        if (not_in_kdp) {
                panic("panic: rw lock exclusive check done outside of kernel debugger");
        }
@@ -2112,10 +2057,6 @@ kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
  * Intel lock invariants:
  *
  * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
- * lck_mtx_pri: contains the max priority of all waiters during a contention period
- *      not cleared on last unlock, but stomped over on next first contention
- * lck_mtx_promoted: set when the current lock owner has been promoted
- *      cleared when lock owner unlocks, set on acquire or wait.
  *
  * The lock owner is promoted to the max priority of all its waiters only if it
  * was a lower priority when it acquired or was an owner when a waiter waited.
@@ -2131,7 +2072,7 @@ kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
  *       on acquire.
  */
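
To make the promotion rule concrete, a hedged illustration (the helper is
hypothetical; the kernel applies this through the turnstile inheritor, not a
function like this):

        /* Effective priority of a contended-mutex owner under inheritance. */
        static int
        owner_effective_pri(int owner_pri, int max_waiter_pri)
        {
                /* e.g. an owner at pri 31 with waiters at 47 and 60 runs at 60 */
                return (max_waiter_pri > owner_pri) ? max_waiter_pri : owner_pri;
        }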
 
-#ifdef  MUTEX_ZONE
+#ifdef MUTEX_ZONE
 extern zone_t lck_mtx_zone;
 #endif
 
@@ -2140,20 +2081,18 @@ extern zone_t lck_mtx_zone;
  */
 lck_mtx_t *
 lck_mtx_alloc_init(
-       lck_grp_t       *grp,
-       lck_attr_t      *attr)
+       lck_grp_t       *grp,
+       lck_attr_t      *attr)
 {
-       lck_mtx_t       *lck;
-#ifdef  MUTEX_ZONE
-       if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) {
+       lck_mtx_t       *lck;
+#ifdef MUTEX_ZONE
+       if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
                lck_mtx_init(lck, grp, attr);
-       }
 #else
-       if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) {
+       if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
                lck_mtx_init(lck, grp, attr);
-       }
 #endif
-       return lck;
+       return(lck);
 }
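
A caller-side lifecycle sketch for the alloc/free pair (group name is made up):

        lck_grp_t *grp = lck_grp_alloc_init("com.example.locks", LCK_GRP_ATTR_NULL);
        lck_mtx_t *mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);

        lck_mtx_lock(mtx);
        /* ... critical section ... */
        lck_mtx_unlock(mtx);

        lck_mtx_free(mtx, grp);         /* destroys the lock, then frees its storage */
        lck_grp_free(grp);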
 
 /*
@@ -2161,11 +2100,11 @@ lck_mtx_alloc_init(
  */
 void
 lck_mtx_free(
-       lck_mtx_t       *lck,
-       lck_grp_t       *grp)
+       lck_mtx_t       *lck,
+       lck_grp_t       *grp)
 {
        lck_mtx_destroy(lck, grp);
-#ifdef  MUTEX_ZONE
+#ifdef MUTEX_ZONE
        zfree(lck_mtx_zone, lck);
 #else
        kfree(lck, sizeof(lck_mtx_t));
@@ -2177,9 +2116,9 @@ lck_mtx_free(
  */
 static void
 lck_mtx_ext_init(
-       lck_mtx_ext_t   *lck,
-       lck_grp_t       *grp,
-       lck_attr_t      *attr)
+       lck_mtx_ext_t   *lck,
+       lck_grp_t       *grp,
+       lck_attr_t      *attr)
 {
        bzero((void *)lck, sizeof(lck_mtx_ext_t));
 
@@ -2190,9 +2129,8 @@ lck_mtx_ext_init(
 
        lck->lck_mtx_grp = grp;
 
-       if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
+       if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
                lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
-       }
 
        lck->lck_mtx.lck_mtx_is_ext = 1;
        lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
@@ -2203,18 +2141,17 @@ lck_mtx_ext_init(
  */
 void
 lck_mtx_init(
-       lck_mtx_t       *lck,
-       lck_grp_t       *grp,
-       lck_attr_t      *attr)
+       lck_mtx_t       *lck,
+       lck_grp_t       *grp,
+       lck_attr_t      *attr)
 {
-       lck_mtx_ext_t   *lck_ext;
-       lck_attr_t      *lck_attr;
+       lck_mtx_ext_t   *lck_ext;
+       lck_attr_t      *lck_attr;
 
-       if (attr != LCK_ATTR_NULL) {
+       if (attr != LCK_ATTR_NULL)
                lck_attr = attr;
-       } else {
+       else
                lck_attr = &LockDefaultLckAttr;
-       }
 
        if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
                if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
@@ -2236,18 +2173,17 @@ lck_mtx_init(
  */
 void
 lck_mtx_init_ext(
-       lck_mtx_t       *lck,
-       lck_mtx_ext_t   *lck_ext,
-       lck_grp_t       *grp,
-       lck_attr_t      *attr)
+       lck_mtx_t       *lck,
+       lck_mtx_ext_t   *lck_ext,
+       lck_grp_t       *grp,
+       lck_attr_t      *attr)
 {
-       lck_attr_t      *lck_attr;
+       lck_attr_t      *lck_attr;
 
-       if (attr != LCK_ATTR_NULL) {
+       if (attr != LCK_ATTR_NULL)
                lck_attr = attr;
-       } else {
+       else
                lck_attr = &LockDefaultLckAttr;
-       }
 
        if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
                lck_mtx_ext_init(lck_ext, grp, lck_attr);
@@ -2289,14 +2225,13 @@ lck_mtx_lock_mark_destroyed(
  */
 void
 lck_mtx_destroy(
-       lck_mtx_t       *lck,
-       lck_grp_t       *grp)
+       lck_mtx_t       *lck,
+       lck_grp_t       *grp)
 {
        boolean_t indirect;
 
-       if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
+       if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
                return;
-       }
 #if MACH_LDEBUG
        lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
 #endif
@@ -2304,9 +2239,8 @@ lck_mtx_destroy(
 
        lck_mtx_lock_mark_destroyed(lck, indirect);
 
-       if (indirect) {
+       if (indirect)
                kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
-       }
        lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
        lck_grp_deallocate(grp);
        return;
@@ -2328,7 +2262,7 @@ __attribute__((always_inline))
 static boolean_t
 get_indirect_mutex(
        lck_mtx_t       **lock,
-       uint32_t        *state)
+       uint32_t        *state)
 {
        *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
        *state = ordered_load_mtx_state(*lock);
@@ -2336,23 +2270,22 @@ get_indirect_mutex(
 }
 
 /*
- * Routine:     lck_mtx_unlock_slow
+ * Routine:    lck_mtx_unlock_slow
  *
  * Unlocks a mutex held by current thread.
  *
- * It will wake up waiters if necessary and
- * drop promotions.
+ * It will wake up waiters if necessary.
  *
  * Interlock can be held.
  */
 __attribute__((noinline))
 void
 lck_mtx_unlock_slow(
-       lck_mtx_t       *lock)
+       lck_mtx_t       *lock)
 {
-       thread_t        thread;
-       uint32_t        state, prev;
-       boolean_t       indirect = FALSE;
+       thread_t        thread;
+       uint32_t        state, prev;
+       boolean_t       indirect = FALSE;
 
        state = ordered_load_mtx_state(lock);
 
@@ -2365,15 +2298,13 @@ lck_mtx_unlock_slow(
 
 #if DEVELOPMENT | DEBUG
        thread_t owner = (thread_t)lock->lck_mtx_owner;
-       if (__improbable(owner != thread)) {
-               return lck_mtx_owner_check_panic(lock);
-       }
+       if(__improbable(owner != thread))
+               lck_mtx_owner_check_panic(lock);
 #endif
 
        /* check if it is held as a spinlock */
-       if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
+       if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0))
                goto unlock;
-       }
 
        lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
 
@@ -2384,24 +2315,24 @@ unlock:
        ordered_store_mtx_owner(lock, 0);
        /* keep original state in prev for later evaluation */
        prev = state;
-       /* release interlock, promotion and clear spin flag */
-       state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK | LCK_MTX_PROMOTED_MSK));
-       if ((state & LCK_MTX_WAITERS_MSK)) {
-               state -= LCK_MTX_WAITER;        /* decrement waiter count */
-       }
-       ordered_store_mtx_state_release(lock, state);           /* since I own the interlock, I don't need an atomic update */
 
+       if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
 #if     MACH_LDEBUG
-       /* perform lock statistics after drop to prevent delay */
-       if (thread) {
-               thread->mutex_count--;          /* lock statistic */
+               if (thread)
+                       thread->mutex_count--;
+#endif
+               return lck_mtx_unlock_wakeup_tail(lock, state, indirect);
        }
-#endif  /* MACH_LDEBUG */
 
-       /* check if there are waiters to wake up or priority to drop */
-       if ((prev & (LCK_MTX_PROMOTED_MSK | LCK_MTX_WAITERS_MSK))) {
-               return lck_mtx_unlock_wakeup_tail(lock, prev, indirect);
-       }
+       /* release interlock, promotion and clear spin flag */
+       state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
+       ordered_store_mtx_state_release(lock, state);           /* since I own the interlock, I don't need an atomic update */
+
+#if    MACH_LDEBUG
+       /* perform lock statistics after drop to prevent delay */
+       if (thread)
+               thread->mutex_count--;          /* lock statistic */
+#endif /* MACH_LDEBUG */
 
        /* re-enable preemption */
        lck_mtx_unlock_finish_inline(lock, FALSE);
@@ -2409,19 +2340,18 @@ unlock:
        return;
 }
 
-#define LCK_MTX_LCK_WAIT_CODE           0x20
-#define LCK_MTX_LCK_WAKEUP_CODE         0x21
-#define LCK_MTX_LCK_SPIN_CODE           0x22
-#define LCK_MTX_LCK_ACQUIRE_CODE        0x23
-#define LCK_MTX_LCK_DEMOTE_CODE         0x24
+#define        LCK_MTX_LCK_WAIT_CODE           0x20
+#define        LCK_MTX_LCK_WAKEUP_CODE         0x21
+#define        LCK_MTX_LCK_SPIN_CODE           0x22
+#define        LCK_MTX_LCK_ACQUIRE_CODE        0x23
+#define LCK_MTX_LCK_DEMOTE_CODE                0x24
 
 /*
  * Routine:    lck_mtx_unlock_wakeup_tail
  *
  * Invoked on unlock when there is
  * contention, i.e. the assembly routine sees
- * that mutex->lck_mtx_waiters != 0 or
- * that mutex->lck_mtx_promoted != 0
+ * that mutex->lck_mtx_waiters != 0
  *
  * neither the mutex or interlock is held
  *
@@ -2431,7 +2361,6 @@ unlock:
  *
  * assembly routine previously did the following to mutex:
  * (after saving the state in prior_lock_state)
- *      cleared lck_mtx_promoted
  *      decremented lck_mtx_waiters if nonzero
  *
  * This function needs to be called as a tail call
@@ -2439,151 +2368,94 @@ unlock:
  */
 __attribute__((noinline))
 static void
-lck_mtx_unlock_wakeup_tail(
-       lck_mtx_t       *mutex,
-       int             prior_lock_state,
-       boolean_t       indirect)
+lck_mtx_unlock_wakeup_tail (
+       lck_mtx_t       *mutex,
+       uint32_t        state,
+       boolean_t       indirect)
 {
-       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
-       lck_mtx_t               fake_lck;
+       struct turnstile *ts;
 
-       /*
-        * prior_lock state is a snapshot of the 2nd word of the
-        * lock in question... we'll fake up a lock with the bits
-        * copied into place and carefully not access anything
-        * beyond whats defined in the second word of a lck_mtx_t
-        */
-       fake_lck.lck_mtx_state = prior_lock_state;
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
+       kern_return_t did_wake;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
-           trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
+               trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
 
-       if (__probable(fake_lck.lck_mtx_waiters)) {
-               kern_return_t did_wake;
+       ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
 
-               if (fake_lck.lck_mtx_waiters > 1) {
-                       did_wake = thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
-               } else {
-                       did_wake = thread_wakeup_one(LCK_MTX_EVENT(mutex));
-               }
-               /*
-                * The waiters count always precisely matches the number of threads on the waitqueue.
-                * i.e. we should never see ret == KERN_NOT_WAITING.
-                */
-               assert(did_wake == KERN_SUCCESS);
+       if (mutex->lck_mtx_waiters > 1) {
+               /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the woken thread */
+               did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
+       } else {
+               did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
+               turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
        }
+       assert(did_wake == KERN_SUCCESS);
 
-       /* When lck_mtx_promoted was set, then I as the owner definitely have a promotion */
-       if (__improbable(fake_lck.lck_mtx_promoted)) {
-               thread_t thread = current_thread();
-
-               spl_t s = splsched();
-               thread_lock(thread);
-
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
-                   thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
-               assert(thread->was_promoted_on_wakeup == 0);
-               assert(thread->promotions > 0);
-
-               assert_promotions_invariant(thread);
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+       turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
 
-               if (--thread->promotions == 0) {
-                       sched_thread_unpromote(thread, trace_lck);
-               }
+       state -= LCK_MTX_WAITER;
+       state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
+       ordered_store_mtx_state_release(mutex, state);
 
-               assert_promotions_invariant(thread);
+       assert(current_thread()->turnstile != NULL);
 
-               thread_unlock(thread);
-               splx(s);
-       }
+       turnstile_cleanup();
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
-           trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
+                 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
 
        lck_mtx_unlock_finish_inline(mutex, indirect);
 }
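
The ordering in the new wakeup path is the point of the change. A condensed
sketch of the turnstile protocol it follows (event and more_waiters stand for
the values computed above; not a drop-in):

        ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
        /* wake one waiter; pass the push along only if others remain queued */
        waitq_wakeup64_one(&ts->ts_waitq, event, THREAD_AWAKENED,
            more_waiters ? WAITQ_PROMOTE_ON_WAKE : WAITQ_ALL_PRIORITIES);
        if (!more_waiters)
                turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
        turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
        turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
        /* deferred priority updates may only run once the interlock is dropped */
        turnstile_cleanup();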
 
 /*
- * Routine:     lck_mtx_lock_acquire_x86
+ * Routine:    lck_mtx_lock_acquire_x86
  *
  * Invoked on acquiring the mutex when there is
  * contention (i.e. the assembly routine sees that
- * that mutex->lck_mtx_waiters != 0 or
- * thread->was_promoted_on_wakeup != 0)...
+ * that mutex->lck_mtx_waiters != 0
  *
  * mutex is owned...  interlock is held... preemption is disabled
  */
 __attribute__((always_inline))
 static void
 lck_mtx_lock_acquire_inline(
-       lck_mtx_t       *mutex)
+       lck_mtx_t       *mutex,
+       struct turnstile *ts)
 {
-       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
-       integer_t               priority;
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
-           trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
-
-       if (mutex->lck_mtx_waiters) {
-               priority = mutex->lck_mtx_pri;
-       } else {
-               priority = 0; /* not worth resetting lck_mtx_pri here, it will be reset by next waiter */
-       }
-       /* the priority must have been set correctly by wait */
-       assert(priority <= MAXPRI_PROMOTE);
-       assert(priority == 0 || priority >= BASEPRI_DEFAULT);
-
-       /* if the mutex wasn't owned, then the owner wasn't promoted */
-       assert(mutex->lck_mtx_promoted == 0);
+                    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
 
        thread_t thread = (thread_t)mutex->lck_mtx_owner;       /* faster than current_thread() */
+       assert(thread->waiting_for_mutex == NULL);
 
-       if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
-               spl_t s = splsched();
-               thread_lock(thread);
-
-               if (thread->was_promoted_on_wakeup) {
-                       assert(thread->promotions > 0);
+       if (mutex->lck_mtx_waiters > 0) {
+               if (ts == NULL) {
+                       ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
                }
 
-               /* Intel only promotes if priority goes up */
-               if (thread->sched_pri < priority && thread->promotion_priority < priority) {
-                       /* Remember that I need to drop this promotion on unlock */
-                       mutex->lck_mtx_promoted = 1;
+               turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+       }
 
-                       if (thread->promotions++ == 0) {
-                               /* This is the first promotion for the owner */
-                               sched_thread_promote_to_pri(thread, priority, trace_lck);
-                       } else {
-                               /*
-                                * Holder was previously promoted due to a different mutex,
-                                * raise to match this one.
-                                * Or, this thread was promoted on wakeup but someone else
-                                * later contended on mutex at higher priority before we got here
-                                */
-                               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
-                       }
-               }
+       if (ts != NULL) {
+               turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
+       }
 
-               if (thread->was_promoted_on_wakeup) {
-                       thread->was_promoted_on_wakeup = 0;
-                       if (--thread->promotions == 0) {
-                               sched_thread_unpromote(thread, trace_lck);
-                       }
-               }
+       assert(current_thread()->turnstile != NULL);
 
-               thread_unlock(thread);
-               splx(s);
-       }
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
-           trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
+                    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
 }
 
 void
 lck_mtx_lock_acquire_x86(
-       lck_mtx_t       *mutex)
+       lck_mtx_t       *mutex)
 {
-       return lck_mtx_lock_acquire_inline(mutex);
+       return lck_mtx_lock_acquire_inline(mutex, NULL);
 }
 
 /*
@@ -2595,19 +2467,20 @@ lck_mtx_lock_acquire_x86(
 __attribute__((noinline))
 static void
 lck_mtx_lock_acquire_tail(
-       lck_mtx_t       *mutex,
-       boolean_t       indirect)
+       lck_mtx_t       *mutex,
+       boolean_t       indirect,
+       struct turnstile *ts)
 {
-       lck_mtx_lock_acquire_inline(mutex);
-       lck_mtx_lock_finish_inline(mutex, ordered_load_mtx_state(mutex), indirect);
+       lck_mtx_lock_acquire_inline(mutex, ts);
+       lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect);
 }
 
 __attribute__((noinline))
 static boolean_t
 lck_mtx_try_lock_acquire_tail(
-       lck_mtx_t       *mutex)
+       lck_mtx_t       *mutex)
 {
-       lck_mtx_lock_acquire_inline(mutex);
+       lck_mtx_lock_acquire_inline(mutex, NULL);
        lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
 
        return TRUE;
@@ -2616,9 +2489,9 @@ lck_mtx_try_lock_acquire_tail(
 __attribute__((noinline))
 static void
 lck_mtx_convert_spin_acquire_tail(
-       lck_mtx_t       *mutex)
+       lck_mtx_t       *mutex)
 {
-       lck_mtx_lock_acquire_inline(mutex);
+       lck_mtx_lock_acquire_inline(mutex, NULL);
        lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
 }
 
@@ -2640,7 +2513,7 @@ lck_mtx_interlock_lock_set_and_clear_flags(
        uint32_t state, prev;
        state = *new_state;
 
-       for (;;) {
+       for ( ; ; ) {
                /* have to wait for interlock to clear */
                while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
                        cpu_pause();
@@ -2648,12 +2521,11 @@ lck_mtx_interlock_lock_set_and_clear_flags(
                }
                prev = state;                                   /* prev contains snapshot for exchange */
                state |= LCK_MTX_ILOCKED_MSK | xor_flags;       /* pick up interlock */
-               state &= ~and_flags;                            /* clear flags */
+               state &= ~and_flags;                            /* clear flags */
 
                disable_preemption();
-               if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+               if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire))
                        break;
-               }
                enable_preemption();
                cpu_pause();
                state = ordered_load_mtx_state(mutex);
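
The interlock acquisition above has this shape, with the preemption window
called out (sketch only):

        for (;;) {
                while (state & LCK_MTX_ILOCKED_MSK) {   /* wait for the interlock to clear */
                        cpu_pause();
                        state = ordered_load_mtx_state(mutex);
                }
                prev = state;
                state |= LCK_MTX_ILOCKED_MSK;
                disable_preemption();                   /* never hold the interlock preemptibly */
                if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire))
                        break;                          /* interlock taken */
                enable_preemption();                    /* lost the race: undo, then retry */
                cpu_pause();
                state = ordered_load_mtx_state(mutex);
        }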
@@ -2692,12 +2564,12 @@ lck_mtx_interlock_try_lock_set_flags(
        if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
                return 0;
        }
-       prev = state;                                   /* prev contains snapshot for exchange */
-       state |= LCK_MTX_ILOCKED_MSK | or_flags;        /* pick up interlock */
+       prev = state;                                   /* prev contains snapshot for exchange */
+       state |= LCK_MTX_ILOCKED_MSK | or_flags;        /* pick up interlock */
        disable_preemption();
-       if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
-               *new_state = state;
-               return 1;
+       if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
+                       *new_state = state;
+                       return 1;
        }
 
        enable_preemption();
@@ -2717,7 +2589,7 @@ lck_mtx_interlock_try_lock_disable_interrupts(
        lck_mtx_t *mutex,
        boolean_t *istate)
 {
-       uint32_t        state;
+       uint32_t        state;
 
        *istate = ml_set_interrupts_enabled(FALSE);
        state = ordered_load_mtx_state(mutex);
@@ -2749,6 +2621,7 @@ lck_mtx_lock_contended(
        lck_mtx_spinwait_ret_type_t ret;
        uint32_t state;
        thread_t thread;
+       struct turnstile *ts = NULL;
 
 try_again:
 
@@ -2768,7 +2641,7 @@ try_again:
                        lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
                }
 
-       /* just fall through case LCK_MTX_SPINWAIT_SPUN */
+               /* just fall through to case LCK_MTX_SPINWAIT_SPUN */
        case LCK_MTX_SPINWAIT_SPUN:
                /*
                 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
@@ -2781,12 +2654,13 @@ try_again:
                        if (indirect) {
                                lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
                        }
-                       lck_mtx_lock_wait_x86(lock);
+                       lck_mtx_lock_wait_x86(lock, &ts);
                        /*
                         * interlock is not held here.
                         */
                        goto try_again;
                } else {
+
                        /* grab the mutex */
                        state |= LCK_MTX_MLOCKED_MSK;
                        ordered_store_mtx_state_release(lock, state);
@@ -2818,12 +2692,22 @@ try_again:
 
        /* mutex has been acquired */
        thread = (thread_t)lock->lck_mtx_owner;
-       if (state & LCK_MTX_WAITERS_MSK || thread->was_promoted_on_wakeup) {
-               return lck_mtx_lock_acquire_tail(lock, indirect);
+       if (state & LCK_MTX_WAITERS_MSK) {
+               /*
+                * lck_mtx_lock_acquire_tail will call
+                * turnstile_complete.
+                */
+               return lck_mtx_lock_acquire_tail(lock, indirect, ts);
        }
 
+       if (ts != NULL) {
+               turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
+       }
+
+       assert(current_thread()->turnstile != NULL);
+
        /* release the interlock */
-       lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
+       lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect);
 }
 
 /*
@@ -2831,7 +2715,7 @@ try_again:
  * panic to optimize compiled code.
  */
 
-__attribute__((noinline))
+__attribute__((noinline)) __abortlike
 static void
 lck_mtx_destroyed(
        lck_mtx_t       *lock)
@@ -2856,7 +2740,7 @@ lck_mtx_lock_wait_interlock_to_clear(
 {
        uint32_t state;
 
-       for (;;) {
+       for ( ; ; ) {
                cpu_pause();
                state = ordered_load_mtx_state(lock);
                if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
@@ -2878,7 +2762,7 @@ lck_mtx_try_lock_wait_interlock_to_clear(
 {
        uint32_t state;
 
-       for (;;) {
+       for ( ; ; ) {
                cpu_pause();
                state = ordered_load_mtx_state(lock);
                if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
@@ -2906,9 +2790,9 @@ void
 lck_mtx_lock_slow(
        lck_mtx_t       *lock)
 {
-       boolean_t       indirect = FALSE;
-       uint32_t        state;
-       int             first_miss = 0;
+       boolean_t       indirect = FALSE;
+       uint32_t        state;
+       int             first_miss = 0;
 
        state = ordered_load_mtx_state(lock);
 
@@ -2922,14 +2806,14 @@ lck_mtx_lock_slow(
 
 
                /* is the mutex already held and not indirect */
-               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
+               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
                        /* no, must have been the mutex */
                        return lck_mtx_lock_contended(lock, indirect, &first_miss);
                }
 
                /* check to see if it is marked destroyed */
                if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
-                       return lck_mtx_destroyed(lock);
+                       lck_mtx_destroyed(lock);
                }
 
                /* Is this an indirect mutex? */
@@ -2940,7 +2824,7 @@ lck_mtx_lock_slow(
                        lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
 
                        if (state & LCK_MTX_SPIN_MSK) {
-                               /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
+                                /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
                                assert(state & LCK_MTX_ILOCKED_MSK);
                                lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
                        }
@@ -2966,7 +2850,7 @@ lck_mtx_lock_slow(
 
 #if MACH_LDEBUG
        if (thread) {
-               thread->mutex_count++;          /* lock statistic */
+               thread->mutex_count++;          /* lock statistic */
        }
 #endif
        /*
@@ -2974,7 +2858,7 @@ lck_mtx_lock_slow(
         * inherit their priority.
         */
        if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
-               return lck_mtx_lock_acquire_tail(lock, indirect);
+               return lck_mtx_lock_acquire_tail(lock, indirect, NULL);
        }
 
        /* release the interlock */
@@ -3003,13 +2887,13 @@ lck_mtx_try_lock_slow(
                 */
 
                /* is the mutex already held and not indirect */
-               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
+               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
                        return FALSE;
                }
 
                /* check to see if it is marked destroyed */
                if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
-                       return lck_mtx_try_destroyed(lock);
+                       lck_mtx_try_destroyed(lock);
                }
 
                /* Is this an indirect mutex? */
@@ -3021,9 +2905,8 @@ lck_mtx_try_lock_slow(
                }
 
                if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
-                       if (indirect) {
+                       if (indirect)
                                lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
-                       }
                        return FALSE;
                }
        }
@@ -3031,9 +2914,8 @@ lck_mtx_try_lock_slow(
        /* no - can't be INDIRECT, DESTROYED or locked */
        while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
                if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
-                       if (indirect) {
+                       if (indirect)
                                lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
-                       }
                        return FALSE;
                }
        }
@@ -3046,7 +2928,7 @@ lck_mtx_try_lock_slow(
 
 #if MACH_LDEBUG
        if (thread) {
-               thread->mutex_count++;          /* lock statistic */
+               thread->mutex_count++;          /* lock statistic */
        }
 #endif
        /*
@@ -3061,12 +2943,13 @@ lck_mtx_try_lock_slow(
        lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
 
        return TRUE;
+
 }
 
 __attribute__((noinline))
 void
 lck_mtx_lock_spin_slow(
-       lck_mtx_t       *lock)
+       lck_mtx_t       *lock)
 {
        boolean_t       indirect = FALSE;
        uint32_t        state;
@@ -3084,14 +2967,14 @@ lck_mtx_lock_spin_slow(
 
 
                /* is the mutex already held and not indirect */
-               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
+               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
                        /* no, must have been the mutex */
                        return lck_mtx_lock_contended(lock, indirect, &first_miss);
                }
 
                /* check to see if it is marked destroyed */
                if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
-                       return lck_mtx_destroyed(lock);
+                       lck_mtx_destroyed(lock);
                }
 
                /* Is this an indirect mutex? */
@@ -3102,7 +2985,7 @@ lck_mtx_lock_spin_slow(
                        lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
 
                        if (state & LCK_MTX_SPIN_MSK) {
-                               /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
+                                /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
                                assert(state & LCK_MTX_ILOCKED_MSK);
                                lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
                        }
@@ -3114,7 +2997,7 @@ lck_mtx_lock_spin_slow(
        }
 
        /* no - can't be INDIRECT, DESTROYED or locked */
-       while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
+       while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) {
                if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
                        return lck_mtx_lock_contended(lock, indirect, &first_miss);
                }
@@ -3132,7 +3015,7 @@ lck_mtx_lock_spin_slow(
        }
 #endif
 
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
 #endif
        /* return with the interlock held and preemption disabled */
@@ -3159,13 +3042,13 @@ lck_mtx_try_lock_spin_slow(
                 */
 
                /* is the mutex already held and not indirect */
-               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
+               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
                        return FALSE;
                }
 
                /* check to see if it is marked destroyed */
                if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
-                       return lck_mtx_try_destroyed(lock);
+                       lck_mtx_try_destroyed(lock);
                }
 
                /* Is this an indirect mutex? */
@@ -3177,9 +3060,8 @@ lck_mtx_try_lock_spin_slow(
                }
 
                if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
-                       if (indirect) {
+                       if (indirect)
                                lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
-                       }
                        return FALSE;
                }
        }
@@ -3187,9 +3069,8 @@ lck_mtx_try_lock_spin_slow(
        /* no - can't be INDIRECT, DESTROYED or locked */
        while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
                if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
-                       if (indirect) {
+                       if (indirect)
                                lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
-                       }
                        return FALSE;
                }
        }
@@ -3202,7 +3083,7 @@ lck_mtx_try_lock_spin_slow(
 
 #if MACH_LDEBUG
        if (thread) {
-               thread->mutex_count++;          /* lock statistic */
+               thread->mutex_count++;          /* lock statistic */
        }
 #endif
 
@@ -3210,12 +3091,13 @@ lck_mtx_try_lock_spin_slow(
        LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
 #endif
        return TRUE;
+
 }
 
 __attribute__((noinline))
 void
 lck_mtx_convert_spin(
-       lck_mtx_t       *lock)
+       lck_mtx_t       *lock)
 {
        uint32_t state;
 
@@ -3253,7 +3135,7 @@ lck_mtx_convert_spin(
 
 static inline boolean_t
 lck_mtx_lock_grab_mutex(
-       lck_mtx_t       *lock)
+       lck_mtx_t       *lock)
 {
        uint32_t state;
 
@@ -3271,7 +3153,7 @@ lck_mtx_lock_grab_mutex(
 
 #if MACH_LDEBUG
        if (thread) {
-               thread->mutex_count++;          /* lock statistic */
+               thread->mutex_count++;          /* lock statistic */
        }
 #endif
        return TRUE;
@@ -3280,8 +3162,8 @@ lck_mtx_lock_grab_mutex(
 __attribute__((noinline))
 void
 lck_mtx_assert(
-       lck_mtx_t       *lock,
-       unsigned int    type)
+       lck_mtx_t       *lock,
+       unsigned int    type)
 {
        thread_t thread, owner;
        uint32_t state;
@@ -3296,19 +3178,17 @@ lck_mtx_assert(
        owner = (thread_t)lock->lck_mtx_owner;
 
        if (type == LCK_MTX_ASSERT_OWNED) {
-               if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
+               if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))
                        panic("mutex (%p) not owned\n", lock);
-               }
        } else {
-               assert(type == LCK_MTX_ASSERT_NOTOWNED);
-               if (owner == thread) {
+               assert (type == LCK_MTX_ASSERT_NOTOWNED);
+               if (owner == thread)
                        panic("mutex (%p) owned\n", lock);
-               }
        }
 }
 
 /*
- * Routine:     lck_mtx_lock_spinwait_x86
+ * Routine:    lck_mtx_lock_spinwait_x86
  *
  * Invoked trying to acquire a mutex when there is contention but
  * the holder is running on another processor. We spin for up to a maximum
@@ -3322,18 +3202,18 @@ lck_mtx_assert(
 __attribute__((noinline))
 lck_mtx_spinwait_ret_type_t
 lck_mtx_lock_spinwait_x86(
-       lck_mtx_t       *mutex)
+       lck_mtx_t       *mutex)
 {
-       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
-       thread_t        holder;
-       uint64_t        overall_deadline;
-       uint64_t        check_owner_deadline;
-       uint64_t        cur_time;
-       lck_mtx_spinwait_ret_type_t             retval = LCK_MTX_SPINWAIT_SPUN;
-       int             loopcount = 0;
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
+       thread_t        holder;
+       uint64_t        overall_deadline;
+       uint64_t        check_owner_deadline;
+       uint64_t        cur_time;
+       lck_mtx_spinwait_ret_type_t             retval = LCK_MTX_SPINWAIT_SPUN;
+       int             loopcount = 0;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
-           trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
+                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
 
        cur_time = mach_absolute_time();
        overall_deadline = cur_time + MutexSpin;
@@ -3354,12 +3234,11 @@ lck_mtx_lock_spinwait_x86(
                }
                cur_time = mach_absolute_time();
 
-               if (cur_time >= overall_deadline) {
+               if (cur_time >= overall_deadline)
                        break;
-               }
 
                if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
-                       boolean_t       istate;
+                       boolean_t       istate;
 
                        /*
                         * We will repeatedly peek at the state of the lock while spinning,
@@ -3372,16 +3251,18 @@ lck_mtx_lock_spinwait_x86(
                         * This is safe because it is a "try_lock", if we can't acquire
                         * the interlock we re-enable the interrupts and fail, so it is
                         * ok to call it even if the interlock was already held.
-                        */
+                       */
                        if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) {
+
                                if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
-                                       if (!(holder->machine.specFlags & OnProc) ||
-                                           (holder->state & TH_IDLE)) {
+
+                                       if ( !(holder->machine.specFlags & OnProc) ||
+                                            (holder->state & TH_IDLE)) {
+
                                                lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
 
-                                               if (loopcount == 0) {
+                                               if (loopcount == 0)
                                                        retval = LCK_MTX_SPINWAIT_NO_SPIN;
-                                               }
                                                break;
                                        }
                                }
@@ -3393,31 +3274,32 @@ lck_mtx_lock_spinwait_x86(
                cpu_pause();
 
                loopcount++;
+
        } while (TRUE);
 
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        /*
         * We've already kept a count via overall_deadline of how long we spun.
         * If dtrace is active, then we compute backwards to decide how
         * long we spun.
         *
         * Note that we record a different probe id depending on whether
-        * this is a direct or indirect mutex.  This allows us to
+        * this is a direct or indirect mutex.  This allows us to 
         * penalize only lock groups that have debug/stats enabled
         * with dtrace processing if desired.
         */
        if (__probable(mutex->lck_mtx_is_ext == 0)) {
                LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
-                   mach_absolute_time() - (overall_deadline - MutexSpin));
+                       mach_absolute_time() - (overall_deadline - MutexSpin));
        } else {
                LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
-                   mach_absolute_time() - (overall_deadline - MutexSpin));
+                       mach_absolute_time() - (overall_deadline - MutexSpin));
        }
        /* The lockstat acquire event is recorded by the assembly code beneath us. */
 #endif
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
-           trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
+                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
 
        return retval;
 }
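
The spin policy reduces to this skeleton (owner_off_core_or_idle is a
placeholder for the OnProc/TH_IDLE peek done under the interlock above):

        overall_deadline = mach_absolute_time() + MutexSpin;    /* hard cap on spinning */
        do {
                if (lck_mtx_lock_grab_mutex(mutex))
                        return LCK_MTX_SPINWAIT_ACQUIRED;       /* won the lock while spinning */
                if (mach_absolute_time() >= overall_deadline)
                        break;                  /* spun long enough: go block */
                if (owner_off_core_or_idle(mutex))
                        break;                  /* holder not running: spinning is wasted work */
                cpu_pause();
        } while (TRUE);
        return LCK_MTX_SPINWAIT_SPUN;           /* caller decides whether to wait */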
@@ -3425,7 +3307,7 @@ lck_mtx_lock_spinwait_x86(
 
 
 /*
- * Routine:     lck_mtx_lock_wait_x86
+ * Routine:    lck_mtx_lock_wait_x86
  *
  * Invoked in order to wait on contention.
  *
@@ -3452,100 +3334,60 @@ lck_mtx_lock_spinwait_x86(
  */
 __attribute__((noinline))
 void
-lck_mtx_lock_wait_x86(
-       lck_mtx_t       *mutex)
+lck_mtx_lock_wait_x86 (
+       lck_mtx_t       *mutex,
+       struct turnstile **ts)
 {
-#if     CONFIG_DTRACE
+       thread_t self = current_thread();
+
+#if    CONFIG_DTRACE
        uint64_t sleep_start = 0;
 
        if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
                sleep_start = mach_absolute_time();
        }
 #endif
-       thread_t self = current_thread();
-       assert(self->waiting_for_mutex == NULL);
-
-       self->waiting_for_mutex = mutex;
-
        __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
-           trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
-           mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
-
-       integer_t waiter_pri = self->sched_pri;
-       waiter_pri = MAX(waiter_pri, self->base_pri);
-       waiter_pri = MAX(waiter_pri, BASEPRI_DEFAULT);
-       waiter_pri = MIN(waiter_pri, MAXPRI_PROMOTE);
-
-       assert(mutex->lck_mtx_pri <= MAXPRI_PROMOTE);
+                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
+                    mutex->lck_mtx_waiters, 0, 0);
 
-       /* Re-initialize lck_mtx_pri if this is the first contention */
-       if (mutex->lck_mtx_waiters == 0 || mutex->lck_mtx_pri <= waiter_pri) {
-               mutex->lck_mtx_pri = waiter_pri;
-       }
+       assert(self->waiting_for_mutex == NULL);
+       self->waiting_for_mutex = mutex;
+       mutex->lck_mtx_waiters++;
 
        thread_t holder = (thread_t)mutex->lck_mtx_owner;
-
        assert(holder != NULL);
 
        /*
-        * Intel only causes a promotion when priority needs to change,
-        * reducing thread lock holds but leaving us vulnerable to the holder
-        * dropping priority.
+        * lck_mtx_lock_wait_x86 might be called in a loop. Call prepare just once and reuse
+        * the same turnstile while looping; the matching turnstile_complete will be called
+        * by lck_mtx_lock_contended when finally acquiring the lock.
         */
-       if (holder->sched_pri < mutex->lck_mtx_pri) {
-               int promote_pri = mutex->lck_mtx_pri;
-
-               spl_t s = splsched();
-               thread_lock(holder);
-
-               /* Check again in case sched_pri changed */
-               if (holder->sched_pri < promote_pri && holder->promotion_priority < promote_pri) {
-                       if (mutex->lck_mtx_promoted == 0) {
-                               /* This is the first promotion for this mutex */
-                               mutex->lck_mtx_promoted = 1;
-
-                               if (holder->promotions++ == 0) {
-                                       /* This is the first promotion for holder */
-                                       sched_thread_promote_to_pri(holder, promote_pri, trace_lck);
-                               } else {
-                                       /*
-                                        * Holder was previously promoted due to a different mutex,
-                                        * check if it needs to raise to match this one
-                                        */
-                                       sched_thread_update_promotion_to_pri(holder, promote_pri,
-                                           trace_lck);
-                               }
-                       } else {
-                               /*
-                                * Holder was previously promoted due to this mutex,
-                                * check if the pri needs to go up
-                                */
-                               sched_thread_update_promotion_to_pri(holder, promote_pri, trace_lck);
-                       }
-               }
-
-               thread_unlock(holder);
-               splx(s);
+       if (*ts == NULL) {
+               *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
        }
 
-       mutex->lck_mtx_waiters++;
-
+       struct turnstile *turnstile = *ts;
        thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
-       assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
+       turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
+
+       waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
 
        lck_mtx_ilk_unlock(mutex);
 
+       turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
+
        thread_block(THREAD_CONTINUE_NULL);
 
        self->waiting_for_mutex = NULL;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
-           trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
-           mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
+                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
+                    mutex->lck_mtx_waiters, 0, 0);
 
-#if     CONFIG_DTRACE
+#if    CONFIG_DTRACE
        /*
         * Record the Dtrace lockstat probe for blocking, block time
         * measured from when we were entered.
@@ -3568,7 +3410,7 @@ lck_mtx_lock_wait_x86(
  *      Returns: TRUE if lock is acquired.
  */
 boolean_t
-kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t     *lck)
+kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t    *lck)
 {
        if (not_in_kdp) {
                panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
@@ -3594,17 +3436,17 @@ void
 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
 {
        lck_rw_t *rwlck = NULL;
-       switch (waitinfo->wait_type) {
-       case kThreadWaitKernelRWLockRead:
-               rwlck = READ_EVENT_TO_RWLOCK(event);
-               break;
-       case kThreadWaitKernelRWLockWrite:
-       case kThreadWaitKernelRWLockUpgrade:
-               rwlck = WRITE_EVENT_TO_RWLOCK(event);
-               break;
-       default:
-               panic("%s was called with an invalid blocking type", __FUNCTION__);
-               break;
+       switch(waitinfo->wait_type) {
+               case kThreadWaitKernelRWLockRead:
+                       rwlck = READ_EVENT_TO_RWLOCK(event);
+                       break;
+               case kThreadWaitKernelRWLockWrite:
+               case kThreadWaitKernelRWLockUpgrade:
+                       rwlck = WRITE_EVENT_TO_RWLOCK(event);
+                       break;
+               default:
+                       panic("%s was called with an invalid blocking type", __FUNCTION__);
+                       break;
        }
        waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
        waitinfo->owner = 0;
index a7e188072bc6e2d33ee5d16b14ee5e7e70f1e895..b10b70febabe617c349abad93307c3c2f2b0a241 100644 (file)
 
 #include <kern/locks.h>
 #include <kern/lock_stat.h>
+#include <kern/turnstile.h>
 
 // Enforce program order of loads and stores.
-#define ordered_load(target) _Generic( (target),\
-               uint32_t* : __c11_atomic_load((_Atomic uint32_t* )(target), memory_order_relaxed), \
-               uintptr_t*: __c11_atomic_load((_Atomic uintptr_t*)(target), memory_order_relaxed) )
-#define ordered_store_release(target, value) _Generic( (target),\
-               uint32_t* : __c11_atomic_store((_Atomic uint32_t* )(target), (value), memory_order_release_smp), \
-               uintptr_t*: __c11_atomic_store((_Atomic uintptr_t*)(target), (value), memory_order_release_smp) )
-#define ordered_store_volatile(target, value) _Generic( (target),\
-               volatile uint32_t* : __c11_atomic_store((_Atomic volatile uint32_t* )(target), (value), memory_order_relaxed), \
-               volatile uintptr_t*: __c11_atomic_store((_Atomic volatile uintptr_t*)(target), (value), memory_order_relaxed) )
+#define ordered_load(target) os_atomic_load(target, compiler_acq_rel)
+#define ordered_store_release(target, value) ({ \
+               os_atomic_store(target, value, release); \
+               os_compiler_barrier(); \
+})
 
 /* Enforce program order of loads and stores. */
 #define ordered_load_mtx_state(lock)                    ordered_load(&(lock)->lck_mtx_state)
 #define ordered_store_mtx_state_release(lock, value)            ordered_store_release(&(lock)->lck_mtx_state, (value))
-#define ordered_store_mtx_owner(lock, value)    ordered_store_volatile(&(lock)->lck_mtx_owner, (value))
+#define ordered_store_mtx_owner(lock, value)    os_atomic_store(&(lock)->lck_mtx_owner, (value), compiler_acq_rel)
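
For orientation, how these wrappers combine in the unlock path (sketch, mirrors
lck_mtx_unlock_slow):

        uint32_t state = ordered_load_mtx_state(lock);  /* program-ordered load */
        ordered_store_mtx_owner(lock, 0);               /* clear the owner first */
        state &= ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK);
        ordered_store_mtx_state_release(lock, state);   /* release-publish the new state */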
 
 #if DEVELOPMENT | DEBUG
-void lck_mtx_owner_check_panic(lck_mtx_t       *mutex);
+void lck_mtx_owner_check_panic(lck_mtx_t       *mutex) __abortlike;
 #endif
 
 __attribute__((always_inline))
@@ -85,6 +82,29 @@ lck_mtx_lock_finish_inline(
 #endif
 }
 
+__attribute__((always_inline))
+static inline void
+lck_mtx_lock_finish_inline_with_cleanup(
+       lck_mtx_t       *mutex,
+       uint32_t        state,
+       boolean_t       indirect)
+{
+       assert(state & LCK_MTX_ILOCKED_MSK);
+
+       /* release the interlock and re-enable preemption */
+       lck_mtx_ilk_unlock_inline(mutex, state);
+
+       turnstile_cleanup();
+
+#if     CONFIG_DTRACE
+       if (indirect) {
+               LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, mutex, 0);
+       } else {
+               LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, mutex, 0);
+       }
+#endif
+}
+
 __attribute__((always_inline))
 static inline void
 lck_mtx_try_lock_finish_inline(
index fb0562fe87777de1403d9c6e579f400af07ef0b2..5720cf7e22850d1b062b36a96c563704fb6a0537 100644 (file)
@@ -26,7 +26,6 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#define ATOMIC_PRIVATE 1
 #define LOCK_PRIVATE 1
 
 #include <mach_ldebug.h>
@@ -39,7 +38,6 @@
 #include <kern/cpu_data.h>
 #include <kern/cpu_number.h>
 #include <kern/sched_prim.h>
-#include <kern/xpr.h>
 #include <kern/debug.h>
 #include <string.h>
 
@@ -138,7 +136,7 @@ lck_mtx_lock(
        state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK;
 
        disable_preemption();
-       if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+       if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) {
                enable_preemption();
                return lck_mtx_lock_slow(lock);
        }
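
The same fast-path shape recurs in the try, spin, and unlock entry points below.
An annotated sketch (fast_path_mask stands for the bits that must be clear for
the fast path to apply):

        prev  = state & ~fast_path_mask;        /* expected value: unowned, no waiters */
        state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK;
        disable_preemption();
        if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) {
                enable_preemption();
                return lck_mtx_lock_slow(lock); /* anything unusual takes the slow path */
        }
        /* owned: record the owner, then re-enable preemption */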
@@ -192,7 +190,7 @@ lck_mtx_try_lock(
        state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK;
 
        disable_preemption();
-       if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+       if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) {
                enable_preemption();
                return lck_mtx_try_lock_slow(lock);
        }
@@ -255,7 +253,7 @@ lck_mtx_lock_spin_always(
        state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK;
 
        disable_preemption();
-       if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+       if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) {
                enable_preemption();
                return lck_mtx_lock_spin_slow(lock);
        }
@@ -342,7 +340,7 @@ lck_mtx_try_lock_spin_always(
        state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK;
 
        disable_preemption();
-       if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+       if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) {
                enable_preemption();
                return lck_mtx_try_lock_spin_slow(lock);
        }
@@ -395,7 +393,7 @@ lck_mtx_try_lock_spin(
  * Unlocks a mutex held by the current thread.
  * It tries the fast path first, and falls
  * through to the slow path in case waiters need to
- * be woken up or promotions need to be dropped.
+ * be woken up.
  *
  * Interlock can be held, and the slow path will
  * unlock the mutex for this case.
@@ -417,7 +415,7 @@ lck_mtx_unlock(
         * Only full mutex will go through the fast path
         * (if the lock was acquired as a spinlock it will
         * fall through the slow path).
-        * If there are waiters or promotions it will fall
+        * If there are waiters it will fall
         * through to the slow path.
         * If it is indirect it will fall through to the slow path.
         */
@@ -426,7 +424,7 @@ lck_mtx_unlock(
         * Fast path state:
         * interlock not held, no waiters, no promotion and mutex held.
         */
-       prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_WAITERS_MSK | LCK_MTX_PROMOTED_MSK);
+       prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_WAITERS_MSK);
        prev |= LCK_MTX_MLOCKED_MSK;
 
        state = prev | LCK_MTX_ILOCKED_MSK;
@@ -435,7 +433,7 @@ lck_mtx_unlock(
        disable_preemption();
 
        /* the memory order needs to be acquire because it is acquiring the interlock */
-       if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+       if (!os_atomic_cmpxchg(&lock->lck_mtx_state, prev, state, acquire)) {
                enable_preemption();
                return lck_mtx_unlock_slow(lock);
        }
@@ -445,7 +443,7 @@ lck_mtx_unlock(
 #if DEVELOPMENT | DEBUG
        thread_t owner = (thread_t)lock->lck_mtx_owner;
        if (__improbable(owner != current_thread())) {
-               return lck_mtx_owner_check_panic(lock);
+               lck_mtx_owner_check_panic(lock);
        }
 #endif
 
index 7d4568ed99673da5d2221a82ce53168520f5eec2..84bfb4c405a531181cf2d0a3f8fd0f61cc6595be 100644 (file)
@@ -147,10 +147,9 @@ ml_static_unslide(
        return VM_KERNEL_UNSLIDE(vaddr);
 }
 
-
 /*
- *     Routine:        ml_static_mfree
- *     Function:
+ * Reclaim memory, by virtual address, that was used in early boot and is no longer needed
+ * by the kernel.
  */
 void
 ml_static_mfree(
@@ -160,28 +159,43 @@ ml_static_mfree(
        addr64_t vaddr_cur;
        ppnum_t ppn;
        uint32_t freed_pages = 0;
+       vm_size_t map_size;
 
        assert(vaddr >= VM_MIN_KERNEL_ADDRESS);
 
        assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */
 
-       for (vaddr_cur = vaddr;
-           vaddr_cur < round_page_64(vaddr + size);
-           vaddr_cur += PAGE_SIZE) {
+       for (vaddr_cur = vaddr; vaddr_cur < round_page_64(vaddr + size);) {
+               map_size = pmap_query_pagesize(kernel_pmap, vaddr_cur);
+
+               /* just skip if nothing mapped here */
+               if (map_size == 0) {
+                       vaddr_cur += PAGE_SIZE;
+                       continue;
+               }
+
+               /*
+                * Can't free from the middle of a large page.
+                */
+               assert((vaddr_cur & (map_size - 1)) == 0);
+
                ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
-               if (ppn != (vm_offset_t)NULL) {
-                       kernel_pmap->stats.resident_count++;
-                       if (kernel_pmap->stats.resident_count >
-                           kernel_pmap->stats.resident_max) {
-                               kernel_pmap->stats.resident_max =
-                                   kernel_pmap->stats.resident_count;
+               assert(ppn != (ppnum_t)NULL);
+
+               pmap_remove(kernel_pmap, vaddr_cur, vaddr_cur + map_size);
+               while (map_size > 0) {
+                       if (++kernel_pmap->stats.resident_count > kernel_pmap->stats.resident_max) {
+                               kernel_pmap->stats.resident_max = kernel_pmap->stats.resident_count;
                        }
-                       pmap_remove(kernel_pmap, vaddr_cur, vaddr_cur + PAGE_SIZE);
+
                        assert(pmap_valid_page(ppn));
                        if (IS_MANAGED_PAGE(ppn)) {
                                vm_page_create(ppn, (ppn + 1));
                                freed_pages++;
                        }
+                       map_size -= PAGE_SIZE;
+                       vaddr_cur += PAGE_SIZE;
+                       ppn++;
                }
        }
        vm_page_lockspin_queues();
@@ -371,6 +385,7 @@ ml_get_power_state(boolean_t *icp, boolean_t *pidlep)
 }
 
 /* Generate a fake interrupt */
+__dead2
 void
 ml_cause_interrupt(void)
 {
@@ -429,6 +444,7 @@ machine_signal_idle(
        cpu_interrupt(processor->cpu_id);
 }
 
+__dead2
 void
 machine_signal_idle_deferred(
        __unused processor_t processor)
@@ -436,6 +452,7 @@ machine_signal_idle_deferred(
        panic("Unimplemented");
 }
 
+__dead2
 void
 machine_signal_idle_cancel(
        __unused processor_t processor)
@@ -567,7 +584,7 @@ ml_processor_register(
        /* allocate and initialize other per-cpu structures */
        if (!boot_cpu) {
                mp_cpus_call_cpu_init(cpunum);
-               early_random_cpu_init(cpunum);
+               random_cpu_init(cpunum);
        }
 
        /* output arg */
@@ -868,7 +885,7 @@ ml_cpu_down(void)
  * The following are required for parts of the kernel
  * that cannot resolve these functions as inlines:
  */
-extern thread_t current_act(void);
+extern thread_t current_act(void) __attribute__((const));
 thread_t
 current_act(void)
 {
@@ -876,7 +893,7 @@ current_act(void)
 }
 
 #undef current_thread
-extern thread_t current_thread(void);
+extern thread_t current_thread(void) __attribute__((const));
 thread_t
 current_thread(void)
 {
@@ -1045,11 +1062,8 @@ ml_entropy_collect(void)
        assert(cpu_number() == master_cpu);
 
        /* update buffer pointer cyclically */
-       if (EntropyData.index_ptr - EntropyData.buffer == ENTROPY_BUFFER_SIZE) {
-               ep = EntropyData.index_ptr = EntropyData.buffer;
-       } else {
-               ep = EntropyData.index_ptr++;
-       }
+       ep = EntropyData.buffer + (EntropyData.sample_count & ENTROPY_BUFFER_INDEX_MASK);
+       EntropyData.sample_count += 1;
 
        rdtsc_nofence(tsc_lo, tsc_hi);
        *ep = ror32(*ep, 9) ^ tsc_lo;
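
The rewritten indexing drops the wrap-around pointer comparison in favor of a free-running sample counter masked down to a slot index, which works because the buffer length is a power of two: the counter's low bits cycle through every slot in order. The scheme in isolation (sizes and names illustrative):

    #include <stdint.h>

    #define MY_ENTROPY_SLOTS 64u                 /* must be a power of two */
    #define MY_ENTROPY_MASK  (MY_ENTROPY_SLOTS - 1)

    static uint32_t my_buffer[MY_ENTROPY_SLOTS];
    static uint32_t my_sample_count;

    static uint32_t *
    my_next_entropy_slot(void)
    {
            /* the counter's low bits select the slot; counter overflow is
             * harmless because only the low bits are ever used */
            uint32_t *ep = &my_buffer[my_sample_count & MY_ENTROPY_MASK];
            my_sample_count += 1;
            return ep;
    }
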
index 28018871b3c0023878b460a241e6ff6c4d694dcd..b2f1e478fd9834583e812cdd70ea1327bd2eb0a4 100644 (file)
@@ -381,7 +381,6 @@ void interrupt_reset_latency_stats(void);
 void interrupt_populate_latency_stats(char *, unsigned);
 void ml_get_power_state(boolean_t *, boolean_t *);
 
-void timer_queue_expire_local(void*);
 void timer_queue_expire_rescan(void*);
 void ml_timer_evaluate(void);
 boolean_t ml_timer_forced_evaluation(void);
diff --git a/osfmk/i386/memory_types.h b/osfmk/i386/memory_types.h
new file mode 100644 (file)
index 0000000..808a4a7
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _I386_MEMORY_TYPES_H_
+#define _I386_MEMORY_TYPES_H_
+
+#define VM_WIMG_COPYBACK                  VM_MEM_COHERENT
+#define VM_WIMG_COPYBACKLW                VM_WIMG_COPYBACK
+#define VM_WIMG_DEFAULT                   VM_MEM_COHERENT
+/* ?? intel ?? */
+#define VM_WIMG_IO                        (VM_MEM_COHERENT |      \
+                                         VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED)
+#define VM_WIMG_POSTED                    VM_WIMG_IO
+#define VM_WIMG_POSTED_REORDERED          VM_WIMG_IO
+#define VM_WIMG_POSTED_COMBINED_REORDERED VM_WIMG_IO
+#define VM_WIMG_WTHRU                     (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED)
+/* write combining mode, aka store gather */
+#define VM_WIMG_WCOMB                     (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT)
+#define VM_WIMG_INNERWBACK       VM_MEM_COHERENT
+#define VM_WIMG_RT               VM_WIMG_WCOMB
+
+#endif /* _I386_MEMORY_TYPES_H_ */
index 8a0165905709432006a0304e3b676da1e34a24b2..c12a9aaf720f736c2f053ff6da275bcd1ededa9b 100644 (file)
@@ -117,19 +117,15 @@ extern void     rtc_sleep_wakeup(uint64_t base);
 
 extern void     rtc_timer_start(void);
 
-extern void     rtc_clock_stepping(
-       uint32_t new_frequency,
-       uint32_t old_frequency);
-extern void     rtc_clock_stepped(
-       uint32_t new_frequency,
-       uint32_t old_frequency);
 extern void     rtc_clock_napped(uint64_t, uint64_t);
 extern void     rtc_clock_adjust(uint64_t);
 
 extern void     pmap_lowmem_finalize(void);
 
 thread_t Switch_context(thread_t, thread_continue_t, thread_t);
-thread_t Shutdown_context(thread_t thread, void (*doshutdown)(processor_t), processor_t  processor);
+
+__not_tail_called thread_t
+Shutdown_context(thread_t thread, void (*doshutdown)(processor_t), processor_t  processor);
 
 #ifdef __x86_64__
 uint64_t x86_64_pre_sleep(void);
@@ -150,6 +146,10 @@ copy_debug_state64(x86_debug_state64_t *src, x86_debug_state64_t *target, boolea
 
 extern void act_machine_switch_pcb(thread_t old, thread_t new);
 
+extern void Idle_PTs_release(vm_offset_t start, vm_offset_t end);
+extern ppnum_t released_PT_ppn;
+extern uint32_t released_PT_cnt;
+
 /* Fast-restart parameters */
 #define FULL_SLAVE_INIT (NULL)
 #define FAST_SLAVE_INIT ((void *)(uintptr_t)1)
index 428f6151cca5b225fa34c1648a8aacc70247ade5..b6654cc393b0f2ebf1ebb7196cc710041069e804 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -32,7 +32,6 @@
 #include <mach_kdp.h>
 #include <kdp/kdp_internal.h>
 #include <mach_ldebug.h>
-#include <gprof.h>
 
 #include <mach/mach_types.h>
 #include <mach/kern_return.h>
@@ -56,8 +55,6 @@
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 
-#include <profiling/profile-mk.h>
-
 #include <i386/bit_routines.h>
 #include <i386/proc_reg.h>
 #include <i386/cpu_threads.h>
@@ -188,24 +185,6 @@ boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler,
 void i386_start_cpu(int lapic_id, int cpu_num);
 void i386_send_NMI(int cpu);
 void NMIPI_enable(boolean_t);
-#if GPROF
-/*
- * Initialize dummy structs for profiling. These aren't used but
- * allows hertz_tick() to be built with GPROF defined.
- */
-struct profile_vars _profile_vars;
-struct profile_vars *_profile_vars_cpus[MAX_CPUS] = { &_profile_vars };
-#define GPROF_INIT()                                                    \
-{                                                                       \
-       int     i;                                                      \
-                                                                        \
-       /* Hack to initialize pointers to unused profiling structs */   \
-       for (i = 1; i < MAX_CPUS; i++)                          \
-               _profile_vars_cpus[i] = &_profile_vars;                 \
-}
-#else
-#define GPROF_INIT()
-#endif /* GPROF */
 
 static lck_grp_t        smp_lck_grp;
 static lck_grp_attr_t   smp_lck_grp_attr;
@@ -245,7 +224,6 @@ smp_init(void)
 
        cpu_thread_init();
 
-       GPROF_INIT();
        DBGLOG_CPU_INIT(master_cpu);
 
        mp_cpus_call_init();
@@ -1500,12 +1478,9 @@ mp_broadcast(
         * signal other processors, which will call mp_broadcast_action()
         */
        mp_bc_count = real_ncpus;                       /* assume max possible active */
-       mp_bc_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, *mp_broadcast_action, NULL) + 1;
+       mp_bc_ncpus = mp_cpus_call(CPUMASK_ALL, NOSYNC, *mp_broadcast_action, NULL);
        atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */
 
-       /* call executor function on this cpu */
-       mp_broadcast_action(NULL);
-
        /* block for other cpus to have run action_func */
        if (mp_bc_ncpus > 1) {
                thread_block(THREAD_CONTINUE_NULL);
index e63c9f4c4fc1a8090802f32010b5f1b3e0ffcfae..43e8085e0c5bce17e8c9d3d718dd9b1c41193643 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -65,6 +65,7 @@
 
 #include <i386/apic.h>
 #include <i386/mp_events.h>
+#include <machine/limits.h>
 
 #define MAX_CPUS        64              /* 8 * sizeof(cpumask_t) */
 
 #include <mach/i386/thread_status.h>
 #include <mach/vm_types.h>
 #include <kern/simple_lock.h>
+#include <kern/assert.h>
 
 __BEGIN_DECLS
 
 extern kern_return_t intel_startCPU(int slot_num);
 extern kern_return_t intel_startCPU_fast(int slot_num);
-extern void i386_init_slave(void);
-extern void i386_init_slave_fast(void);
+extern void i386_init_slave(void) __dead2;
+extern void i386_init_slave_fast(void) __dead2;
 extern void smp_init(void);
 
 extern void cpu_interrupt(int cpu);
@@ -90,7 +92,7 @@ __END_DECLS
 
 extern  unsigned int    real_ncpus;             /* real number of cpus */
 extern  unsigned int    max_ncpus;              /* max number of cpus */
-decl_simple_lock_data(extern, kdb_lock)  /* kdb lock           */
+decl_simple_lock_data(extern, kdb_lock); /* kdb lock           */
 
 __BEGIN_DECLS
 
@@ -153,6 +155,9 @@ typedef enum    {KDP_XCPU_NONE = 0xffff, KDP_CURRENT_LCPU = 0xfffe} kdp_cpu_t;
 
 typedef uint32_t cpu_t;
 typedef volatile uint64_t cpumask_t;
+
+static_assert(sizeof(cpumask_t) * CHAR_BIT >= MAX_CPUS, "cpumask_t bitvector is too small for current MAX_CPUS value");
+
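
The new static_assert turns a latent mismatch into a build break: cpumask_t must provide at least one bit per possible CPU, or masks would silently truncate. The same guard in isolation, with stand-in names:

    #include <assert.h>   /* static_assert (C11) */
    #include <limits.h>   /* CHAR_BIT */
    #include <stdint.h>

    typedef uint64_t my_cpumask_t;   /* illustrative stand-in */
    #define MY_MAX_CPUS 64

    static_assert(sizeof(my_cpumask_t) * CHAR_BIT >= MY_MAX_CPUS,
        "my_cpumask_t has fewer bits than MY_MAX_CPUS");

    static inline my_cpumask_t
    my_cpu_to_cpumask(int cpu)
    {
            return (my_cpumask_t)1 << cpu;   /* one bit per CPU */
    }
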
 static inline cpumask_t
 cpu_to_cpumask(cpu_t cpu)
 {
index ad97efdaa94eefc2518f0a3a5de6470aa10bedb8..ac756a7f9e9ef995986f245cef1cf3aeb85a346b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -224,11 +224,11 @@ cldt_t *dyn_ldts;
  * in the uber-space remapping window on the kernel.
  */
 struct fake_descriptor64 kernel_ldt_desc64 = {
-       0,
-       LDTSZ_MIN*sizeof(struct fake_descriptor) - 1,
-       0,
-       ACC_P | ACC_PL_K | ACC_LDT,
-       0
+       .offset64 = 0,
+       .lim_or_seg = LDTSZ_MIN * sizeof(struct fake_descriptor) - 1,
+       .size_or_IST = 0,
+       .access = ACC_P | ACC_PL_K | ACC_LDT,
+       .reserved = 0
 };
 
 /*
@@ -236,11 +236,11 @@ struct fake_descriptor64 kernel_ldt_desc64 = {
  * It follows the pattern of the KERNEL_LDT.
  */
 struct fake_descriptor64 kernel_tss_desc64 = {
-       0,
-       sizeof(struct x86_64_tss) - 1,
-       0,
-       ACC_P | ACC_PL_K | ACC_TSS,
-       0
+       .offset64 = 0,
+       .lim_or_seg = sizeof(struct x86_64_tss) - 1,
+       .size_or_IST = 0,
+       .access = ACC_P | ACC_PL_K | ACC_TSS,
+       .reserved = 0
 };
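
Converting these descriptors to designated initializers is a robustness change rather than a functional one: positional initializers silently misassign values if the struct layout is ever reordered, while named fields keep each value pinned to its slot and zero-fill anything left unnamed. A minimal illustration with a hypothetical struct:

    #include <stdint.h>

    struct my_desc64 {              /* illustrative layout only */
            uint32_t offset64;
            uint16_t lim_or_seg;
            uint8_t  size_or_IST;
            uint8_t  access;
            uint32_t reserved;
    };

    static struct my_desc64 d = {
            .offset64   = 0,
            .lim_or_seg = 0x7f,
            .access     = 0x89,     /* unnamed fields default to zero */
    };
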
 
 /*
@@ -499,9 +499,6 @@ cpu_desc_load(cpu_data_t *cdp)
        postcode(CPU_DESC_LOAD_TSS);
        set_tr(KERNEL_TSS);
 
-#if GPROF // Hack to enable mcount to work on K64
-       __asm__ volatile ("mov %0, %%gs" : : "rm" ((unsigned short)(KERNEL_DS)));
-#endif
        postcode(CPU_DESC_LOAD_EXIT);
 }
 
@@ -511,11 +508,8 @@ cpu_desc_load(cpu_data_t *cdp)
 void
 cpu_syscall_init(cpu_data_t *cdp)
 {
-#if MONOTONIC
-       mt_cpu_up(cdp);
-#else /* MONOTONIC */
 #pragma unused(cdp)
-#endif /* !MONOTONIC */
+
        wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS);
        wrmsr64(MSR_IA32_SYSENTER_EIP, DBLMAP((uintptr_t) hi64_sysenter));
        wrmsr64(MSR_IA32_SYSENTER_ESP, current_cpu_datap()->cpu_desc_index.cdi_sstku);
index 03972d40ba1fce489fea39a9033f93162b1c43ea..bfb34d31b06b116a58b8c83bec19b0b13a3c2c55 100644 (file)
@@ -123,7 +123,7 @@ void pal_thread_terminate_self(thread_t thread);
 void pal_ast_check(thread_t thread);
 
 /* Called by sync_iss_to_iks */
-extern void pal_get_kern_regs( x86_saved_state_t *state );
+extern void pal_get_kern_regs( x86_saved_state_t *state ) __dead2;
 
 /*
  * Platform-specific hlt/sti.
index fe5d56b8e12d1f367c80ea76ad2744544e5635b7..9ece881bd134066069eb2a935eed3b78757fe424 100644 (file)
  * Maps state flavor to number of words in the state:
  */
 unsigned int _MachineStateCount[] = {
-       [x86_THREAD_STATE32]      = x86_THREAD_STATE32_COUNT,
-       [x86_THREAD_STATE64]      = x86_THREAD_STATE64_COUNT,
-       [x86_THREAD_FULL_STATE64] = x86_THREAD_FULL_STATE64_COUNT,
-       [x86_THREAD_STATE]        = x86_THREAD_STATE_COUNT,
-       [x86_FLOAT_STATE32]       = x86_FLOAT_STATE32_COUNT,
-       [x86_FLOAT_STATE64]       = x86_FLOAT_STATE64_COUNT,
-       [x86_FLOAT_STATE]         = x86_FLOAT_STATE_COUNT,
-       [x86_EXCEPTION_STATE32]   = x86_EXCEPTION_STATE32_COUNT,
-       [x86_EXCEPTION_STATE64]   = x86_EXCEPTION_STATE64_COUNT,
-       [x86_EXCEPTION_STATE]     = x86_EXCEPTION_STATE_COUNT,
-       [x86_DEBUG_STATE32]       = x86_DEBUG_STATE32_COUNT,
-       [x86_DEBUG_STATE64]       = x86_DEBUG_STATE64_COUNT,
-       [x86_DEBUG_STATE]         = x86_DEBUG_STATE_COUNT,
-       [x86_AVX_STATE32]         = x86_AVX_STATE32_COUNT,
-       [x86_AVX_STATE64]         = x86_AVX_STATE64_COUNT,
-       [x86_AVX_STATE]           = x86_AVX_STATE_COUNT,
-#if !defined(RC_HIDE_XNU_J137)
-       [x86_AVX512_STATE32]      = x86_AVX512_STATE32_COUNT,
-       [x86_AVX512_STATE64]      = x86_AVX512_STATE64_COUNT,
-       [x86_AVX512_STATE]        = x86_AVX512_STATE_COUNT,
-#endif /* not RC_HIDE_XNU_J137 */
+       [x86_THREAD_STATE32]            = x86_THREAD_STATE32_COUNT,
+       [x86_THREAD_STATE64]            = x86_THREAD_STATE64_COUNT,
+       [x86_THREAD_FULL_STATE64]       = x86_THREAD_FULL_STATE64_COUNT,
+       [x86_THREAD_STATE]              = x86_THREAD_STATE_COUNT,
+       [x86_FLOAT_STATE32]             = x86_FLOAT_STATE32_COUNT,
+       [x86_FLOAT_STATE64]             = x86_FLOAT_STATE64_COUNT,
+       [x86_FLOAT_STATE]               = x86_FLOAT_STATE_COUNT,
+       [x86_EXCEPTION_STATE32]         = x86_EXCEPTION_STATE32_COUNT,
+       [x86_EXCEPTION_STATE64]         = x86_EXCEPTION_STATE64_COUNT,
+       [x86_EXCEPTION_STATE]           = x86_EXCEPTION_STATE_COUNT,
+       [x86_DEBUG_STATE32]             = x86_DEBUG_STATE32_COUNT,
+       [x86_DEBUG_STATE64]             = x86_DEBUG_STATE64_COUNT,
+       [x86_DEBUG_STATE]               = x86_DEBUG_STATE_COUNT,
+       [x86_AVX_STATE32]               = x86_AVX_STATE32_COUNT,
+       [x86_AVX_STATE64]               = x86_AVX_STATE64_COUNT,
+       [x86_AVX_STATE]                 = x86_AVX_STATE_COUNT,
+       [x86_AVX512_STATE32]            = x86_AVX512_STATE32_COUNT,
+       [x86_AVX512_STATE64]            = x86_AVX512_STATE64_COUNT,
+       [x86_AVX512_STATE]              = x86_AVX512_STATE_COUNT,
+       [x86_PAGEIN_STATE]              = x86_PAGEIN_STATE_COUNT
 };
 
 zone_t          iss_zone;               /* zone for saved_state area */
 zone_t          ids_zone;               /* zone for debug_state area */
 
-extern int      allow_64bit_proc_LDT_ops;
-
 /* Forward */
 
 extern void             Thread_continue(void);
@@ -485,6 +482,12 @@ machine_switch_context(
        return Switch_context(old, continuation, new);
 }
 
+boolean_t
+machine_thread_on_core(thread_t thread)
+{
+       return thread->machine.specFlags & OnProc;
+}
+
 thread_t
 machine_processor_shutdown(
        thread_t        thread,
@@ -855,15 +858,10 @@ machine_thread_set_state(
                state = (x86_saved_state32_t *) tstate;
 
                /*
-                * Allow a thread in a 64-bit process to set
-                * 32-bit state iff the code segment originates
-                * in the LDT (the implication is that only
-                * 32-bit code segments are allowed there, so
-                * setting 32-bit state implies a switch to
-                * compatibility mode on resume-to-user).
+                * Refuse to allow 64-bit processes to set
+                * 32-bit state.
                 */
-               if (thread_is_64bit_addr(thr_act) &&
-                   thr_act->task->i386_ldt == 0) {
+               if (thread_is_64bit_addr(thr_act)) {
                        return KERN_INVALID_ARGUMENT;
                }
 
@@ -996,38 +994,34 @@ machine_thread_set_state(
 
        case x86_FLOAT_STATE32:
        case x86_AVX_STATE32:
-#if !defined(RC_HIDE_XNU_J137)
        case x86_AVX512_STATE32:
-#endif /* not RC_HIDE_XNU_J137 */
-               {
-                       if (count != _MachineStateCount[flavor]) {
-                               return KERN_INVALID_ARGUMENT;
-                       }
-
-                       if (thread_is_64bit_addr(thr_act)) {
-                               return KERN_INVALID_ARGUMENT;
-                       }
+       {
+               if (count != _MachineStateCount[flavor]) {
+                       return KERN_INVALID_ARGUMENT;
+               }
 
-                       return fpu_set_fxstate(thr_act, tstate, flavor);
+               if (thread_is_64bit_addr(thr_act)) {
+                       return KERN_INVALID_ARGUMENT;
                }
 
+               return fpu_set_fxstate(thr_act, tstate, flavor);
+       }
+
        case x86_FLOAT_STATE64:
        case x86_AVX_STATE64:
-#if !defined(RC_HIDE_XNU_J137)
        case x86_AVX512_STATE64:
-#endif /* not RC_HIDE_XNU_J137 */
-               {
-                       if (count != _MachineStateCount[flavor]) {
-                               return KERN_INVALID_ARGUMENT;
-                       }
-
-                       if (!thread_is_64bit_addr(thr_act)) {
-                               return KERN_INVALID_ARGUMENT;
-                       }
+       {
+               if (count != _MachineStateCount[flavor]) {
+                       return KERN_INVALID_ARGUMENT;
+               }
 
-                       return fpu_set_fxstate(thr_act, tstate, flavor);
+               if (!thread_is_64bit_addr(thr_act)) {
+                       return KERN_INVALID_ARGUMENT;
                }
 
+               return fpu_set_fxstate(thr_act, tstate, flavor);
+       }
+
        case x86_FLOAT_STATE:
        {
                x86_float_state_t       *state;
@@ -1049,37 +1043,35 @@ machine_thread_set_state(
        }
 
        case x86_AVX_STATE:
-#if !defined(RC_HIDE_XNU_J137)
        case x86_AVX512_STATE:
-#endif
-               {
-                       x86_avx_state_t       *state;
-
-                       if (count != _MachineStateCount[flavor]) {
-                               return KERN_INVALID_ARGUMENT;
-                       }
+       {
+               x86_avx_state_t       *state;
 
-                       state = (x86_avx_state_t *)tstate;
-                       /* Flavors are defined to have sequential values: 32-bit, 64-bit, non-specific */
-                       /* 64-bit flavor? */
-                       if (state->ash.flavor == (flavor - 1) &&
-                           state->ash.count == _MachineStateCount[flavor - 1] &&
-                           thread_is_64bit_addr(thr_act)) {
-                               return fpu_set_fxstate(thr_act,
-                                          (thread_state_t)&state->ufs.as64,
-                                          flavor - 1);
-                       }
-                       /* 32-bit flavor? */
-                       if (state->ash.flavor == (flavor - 2) &&
-                           state->ash.count == _MachineStateCount[flavor - 2] &&
-                           !thread_is_64bit_addr(thr_act)) {
-                               return fpu_set_fxstate(thr_act,
-                                          (thread_state_t)&state->ufs.as32,
-                                          flavor - 2);
-                       }
+               if (count != _MachineStateCount[flavor]) {
                        return KERN_INVALID_ARGUMENT;
                }
 
+               state = (x86_avx_state_t *)tstate;
+               /* Flavors are defined to have sequential values: 32-bit, 64-bit, non-specific */
+               /* 64-bit flavor? */
+               if (state->ash.flavor == (flavor - 1) &&
+                   state->ash.count == _MachineStateCount[flavor - 1] &&
+                   thread_is_64bit_addr(thr_act)) {
+                       return fpu_set_fxstate(thr_act,
+                                  (thread_state_t)&state->ufs.as64,
+                                  flavor - 1);
+               }
+               /* 32-bit flavor? */
+               if (state->ash.flavor == (flavor - 2) &&
+                   state->ash.count == _MachineStateCount[flavor - 2] &&
+                   !thread_is_64bit_addr(thr_act)) {
+                       return fpu_set_fxstate(thr_act,
+                                  (thread_state_t)&state->ufs.as32,
+                                  flavor - 2);
+               }
+               return KERN_INVALID_ARGUMENT;
+       }
+
        case x86_THREAD_STATE32:
        {
                if (count != x86_THREAD_STATE32_COUNT) {
@@ -1108,15 +1100,16 @@ machine_thread_set_state(
 
        case x86_THREAD_FULL_STATE64:
        {
-               if (!allow_64bit_proc_LDT_ops) {
+               if (count != x86_THREAD_FULL_STATE64_COUNT) {
                        return KERN_INVALID_ARGUMENT;
                }
 
-               if (count != x86_THREAD_FULL_STATE64_COUNT) {
+               if (!thread_is_64bit_addr(thr_act)) {
                        return KERN_INVALID_ARGUMENT;
                }
 
-               if (!thread_is_64bit_addr(thr_act)) {
+               /* If this process does not have a custom LDT, return failure */
+               if (thr_act->task->i386_ldt == 0) {
                        return KERN_INVALID_ARGUMENT;
                }
 
@@ -1139,7 +1132,7 @@ machine_thread_set_state(
                        return set_thread_state64(thr_act, &state->uts.ts64, FALSE);
                } else if (state->tsh.flavor == x86_THREAD_FULL_STATE64 &&
                    state->tsh.count == x86_THREAD_FULL_STATE64_COUNT &&
-                   thread_is_64bit_addr(thr_act)) {
+                   thread_is_64bit_addr(thr_act) && thr_act->task->i386_ldt != 0) {
                        return set_thread_state64(thr_act, &state->uts.ts64, TRUE);
                } else if (state->tsh.flavor == x86_THREAD_STATE32 &&
                    state->tsh.count == x86_THREAD_STATE32_COUNT &&
@@ -1207,6 +1200,30 @@ machine_thread_set_state(
        return KERN_SUCCESS;
 }
 
+mach_vm_address_t
+machine_thread_pc(thread_t thr_act)
+{
+       if (thread_is_64bit_addr(thr_act)) {
+               return (mach_vm_address_t)USER_REGS64(thr_act)->isf.rip;
+       } else {
+               return (mach_vm_address_t)USER_REGS32(thr_act)->eip;
+       }
+}
+
+void
+machine_thread_reset_pc(thread_t thr_act, mach_vm_address_t pc)
+{
+       pal_register_cache_state(thr_act, DIRTY);
+
+       if (thread_is_64bit_addr(thr_act)) {
+               if (!IS_USERADDR64_CANONICAL(pc)) {
+                       pc = 0;
+               }
+               USER_REGS64(thr_act)->isf.rip = (uint64_t)pc;
+       } else {
+               USER_REGS32(thr_act)->eip = (uint32_t)pc;
+       }
+}
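
machine_thread_reset_pc clamps a non-canonical 64-bit PC to 0 instead of installing it, since resuming at a non-canonical address would fault on the return to user space. One common way to express the x86-64 canonical test (illustrative, not the kernel's IS_USERADDR64_CANONICAL macro, which also restricts to the user half of the address space):

    #include <stdbool.h>
    #include <stdint.h>

    /* Canonical with 48-bit VAs: bits 63..48 must replicate bit 47. Sign-extend
     * from bit 47 and compare (assumes arithmetic right shift, as on Clang/GCC). */
    static bool
    my_is_canonical(uint64_t addr)
    {
            return (uint64_t)((int64_t)(addr << 16) >> 16) == addr;
    }
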
 
 
 /*
@@ -1268,7 +1285,6 @@ machine_thread_get_state(
                break;
        }
 
-#if !defined(RC_HIDE_XNU_J137)
        case THREAD_STATE_FLAVOR_LIST_10_13:
        {
                if (*count < 6) {
@@ -1286,7 +1302,24 @@ machine_thread_get_state(
                break;
        }
 
-#endif
+       case THREAD_STATE_FLAVOR_LIST_10_15:
+       {
+               if (*count < 7) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+
+               tstate[0] = x86_THREAD_STATE;
+               tstate[1] = x86_FLOAT_STATE;
+               tstate[2] = x86_EXCEPTION_STATE;
+               tstate[3] = x86_DEBUG_STATE;
+               tstate[4] = x86_AVX_STATE;
+               tstate[5] = x86_AVX512_STATE;
+               tstate[6] = x86_PAGEIN_STATE;
+
+               *count = 7;
+               break;
+       }
+
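
From user space, a flavor list is retrieved with an ordinary thread_get_state call whose output array enumerates the flavors the kernel accepts. A hypothetical probe of the new 10.15 list, assuming the standard Mach user-space interface:

    #include <mach/mach.h>
    #include <stdio.h>

    static void
    dump_flavor_list(thread_act_t thread)
    {
            natural_t flavors[16];
            mach_msg_type_number_t count = 16;

            if (thread_get_state(thread, THREAD_STATE_FLAVOR_LIST_10_15,
                (thread_state_t)flavors, &count) == KERN_SUCCESS) {
                    for (mach_msg_type_number_t i = 0; i < count; i++) {
                            printf("supported flavor: %u\n", flavors[i]);
                    }
            }
    }
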
        case x86_SAVED_STATE32:
        {
                x86_saved_state32_t     *state;
@@ -1407,70 +1440,64 @@ machine_thread_get_state(
        }
 
        case x86_AVX_STATE32:
-#if !defined(RC_HIDE_XNU_J137)
        case x86_AVX512_STATE32:
-#endif
-               {
-                       if (*count != _MachineStateCount[flavor]) {
-                               return KERN_INVALID_ARGUMENT;
-                       }
+       {
+               if (*count != _MachineStateCount[flavor]) {
+                       return KERN_INVALID_ARGUMENT;
+               }
 
-                       if (thread_is_64bit_addr(thr_act)) {
-                               return KERN_INVALID_ARGUMENT;
-                       }
+               if (thread_is_64bit_addr(thr_act)) {
+                       return KERN_INVALID_ARGUMENT;
+               }
 
-                       *count = _MachineStateCount[flavor];
+               *count = _MachineStateCount[flavor];
 
-                       return fpu_get_fxstate(thr_act, tstate, flavor);
-               }
+               return fpu_get_fxstate(thr_act, tstate, flavor);
+       }
 
        case x86_AVX_STATE64:
-#if !defined(RC_HIDE_XNU_J137)
        case x86_AVX512_STATE64:
-#endif
-               {
-                       if (*count != _MachineStateCount[flavor]) {
-                               return KERN_INVALID_ARGUMENT;
-                       }
+       {
+               if (*count != _MachineStateCount[flavor]) {
+                       return KERN_INVALID_ARGUMENT;
+               }
 
-                       if (!thread_is_64bit_addr(thr_act)) {
-                               return KERN_INVALID_ARGUMENT;
-                       }
+               if (!thread_is_64bit_addr(thr_act)) {
+                       return KERN_INVALID_ARGUMENT;
+               }
 
-                       *count = _MachineStateCount[flavor];
+               *count = _MachineStateCount[flavor];
 
-                       return fpu_get_fxstate(thr_act, tstate, flavor);
-               }
+               return fpu_get_fxstate(thr_act, tstate, flavor);
+       }
 
        case x86_AVX_STATE:
-#if !defined(RC_HIDE_XNU_J137)
        case x86_AVX512_STATE:
-#endif
-               {
-                       x86_avx_state_t         *state;
-                       thread_state_t          fstate;
-
-                       if (*count < _MachineStateCount[flavor]) {
-                               return KERN_INVALID_ARGUMENT;
-                       }
+       {
+               x86_avx_state_t         *state;
+               thread_state_t          fstate;
 
-                       *count = _MachineStateCount[flavor];
-                       state = (x86_avx_state_t *)tstate;
+               if (*count < _MachineStateCount[flavor]) {
+                       return KERN_INVALID_ARGUMENT;
+               }
 
-                       bzero((char *)state, *count * sizeof(int));
+               *count = _MachineStateCount[flavor];
+               state = (x86_avx_state_t *)tstate;
 
-                       if (thread_is_64bit_addr(thr_act)) {
-                               flavor -= 1; /* 64-bit flavor */
-                               fstate = (thread_state_t) &state->ufs.as64;
-                       } else {
-                               flavor -= 2; /* 32-bit flavor */
-                               fstate = (thread_state_t) &state->ufs.as32;
-                       }
-                       state->ash.flavor = flavor;
-                       state->ash.count  = _MachineStateCount[flavor];
+               bzero((char *)state, *count * sizeof(int));
 
-                       return fpu_get_fxstate(thr_act, fstate, flavor);
+               if (thread_is_64bit_addr(thr_act)) {
+                       flavor -= 1;         /* 64-bit flavor */
+                       fstate = (thread_state_t) &state->ufs.as64;
+               } else {
+                       flavor -= 2;         /* 32-bit flavor */
+                       fstate = (thread_state_t) &state->ufs.as32;
                }
+               state->ash.flavor = flavor;
+               state->ash.count  = _MachineStateCount[flavor];
+
+               return fpu_get_fxstate(thr_act, fstate, flavor);
+       }
 
        case x86_THREAD_STATE32:
        {
@@ -1506,15 +1533,16 @@ machine_thread_get_state(
 
        case x86_THREAD_FULL_STATE64:
        {
-               if (!allow_64bit_proc_LDT_ops) {
+               if (*count < x86_THREAD_FULL_STATE64_COUNT) {
                        return KERN_INVALID_ARGUMENT;
                }
 
-               if (*count < x86_THREAD_FULL_STATE64_COUNT) {
+               if (!thread_is_64bit_addr(thr_act)) {
                        return KERN_INVALID_ARGUMENT;
                }
 
-               if (!thread_is_64bit_addr(thr_act)) {
+               /* If this process does not have a custom LDT, return failure */
+               if (thr_act->task->i386_ldt == 0) {
                        return KERN_INVALID_ARGUMENT;
                }
 
@@ -1680,6 +1708,20 @@ machine_thread_get_state(
                *count = x86_DEBUG_STATE_COUNT;
                break;
        }
+
+       case x86_PAGEIN_STATE:
+       {
+               if (*count < x86_PAGEIN_STATE_COUNT) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+
+               x86_pagein_state_t *state = (void *)tstate;
+
+               state->__pagein_error = thr_act->t_pagein_error;
+
+               *count = x86_PAGEIN_STATE_COUNT;
+               break;
+       }
        default:
                return KERN_INVALID_ARGUMENT;
        }
@@ -1981,15 +2023,16 @@ machine_stack_attach(
        thread_initialize_kernel_state(thread);
 
        statep = STACK_IKS(stack);
-#if defined(__x86_64__)
-       statep->k_rip = (unsigned long) Thread_continue;
-       statep->k_rbx = (unsigned long) thread_continue;
-       statep->k_rsp = (unsigned long) STACK_IKS(stack);
-#else
-       statep->k_eip = (unsigned long) Thread_continue;
-       statep->k_ebx = (unsigned long) thread_continue;
-       statep->k_esp = (unsigned long) STACK_IKS(stack);
-#endif
+
+       /*
+        * Reset the state of the thread to resume from a continuation,
+        * including resetting the stack and frame pointer to avoid backtracers
+        * seeing this temporary state and attempting to walk the defunct stack.
+        */
+       statep->k_rbp = (uint64_t) 0;
+       statep->k_rip = (uint64_t) Thread_continue;
+       statep->k_rbx = (uint64_t) thread_continue;
+       statep->k_rsp = (uint64_t) STACK_IKS(stack);
 
        return;
 }
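
Zeroing k_rbp matters because frame-pointer backtracers walk the chain of saved rbp values and stop at NULL; with the frame pointer cleared, a thread parked on a continuation yields an empty backtrace rather than a walk into the defunct stack. A sketch of such a walker:

    #include <stdint.h>
    #include <stddef.h>

    struct my_frame {                   /* x86-64 frame layout, illustrative */
            struct my_frame *fp;        /* caller's saved rbp */
            uintptr_t        ra;        /* return address */
    };

    static int
    my_backtrace(struct my_frame *fp, uintptr_t *out, int max)
    {
            int n = 0;
            while (fp != NULL && n < max) {   /* NULL fp terminates the walk */
                    out[n++] = fp->ra;
                    fp = fp->fp;
            }
            return n;
    }
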
index ff49e3fe2bbbeb95ad645e0f42f10ffc2a40e1da..d0f1040fea6d9093a4a7958b769f31ee073e188a 100644 (file)
@@ -750,7 +750,7 @@ pmThreadGetUrgency(uint64_t *rt_period, uint64_t *rt_deadline)
        thread_urgency_t urgency;
        uint64_t        arg1, arg2;
 
-       urgency = thread_get_urgency(current_processor()->next_thread, &arg1, &arg2);
+       urgency = thread_get_urgency(THREAD_NULL, &arg1, &arg2);
 
        if (urgency == THREAD_URGENCY_REAL_TIME) {
                if (rt_period != NULL) {
index bd932f8e20c101a750a9bbea595053b0b8b5eb08..06f61e536ac1c8c093fc0db8fcf080e97b0aa0d2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -226,17 +226,6 @@ extern int      kernPhysPML4EntryCount;
 #define KERNEL_BASE                     (0ULL - (NBPML4 * KERNEL_PML4_COUNT))
 #define KERNEL_BASEMENT                 (KERNEL_BASE - NBPML4)  /* Basement uses one PML4 entry */
 
-#define VM_WIMG_COPYBACK        VM_MEM_COHERENT
-#define VM_WIMG_COPYBACKLW      VM_WIMG_COPYBACK
-#define VM_WIMG_DEFAULT         VM_MEM_COHERENT
-/* ?? intel ?? */
-#define VM_WIMG_IO              (VM_MEM_COHERENT |      \
-                               VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED)
-#define VM_WIMG_POSTED          VM_WIMG_IO
-#define VM_WIMG_WTHRU           (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED)
-/* write combining mode, aka store gather */
-#define VM_WIMG_WCOMB           (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT)
-#define VM_WIMG_INNERWBACK      VM_MEM_COHERENT
 /*
  * Pte related macros
  */
@@ -324,37 +313,11 @@ extern int      kernPhysPML4EntryCount;
 
 #define INTEL_PTE_COMPRESSED_MASK (INTEL_PTE_COMPRESSED | \
                                   INTEL_PTE_COMPRESSED_ALT | INTEL_PTE_SWLOCK)
-#define PTE_IS_COMPRESSED(x, ptep)                                        \
-       ((((x) & INTEL_PTE_VALID) == 0) && /* PTE is not valid... */    \
+#define PTE_IS_COMPRESSED(x, ptep, pmap, vaddr)                            \
+       ((((x) & INTEL_PTE_VALID) == 0) && /* PTE is not valid... */       \
         ((x) & INTEL_PTE_COMPRESSED) && /* ...has "compressed" marker */ \
-        ((!((x) & ~INTEL_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
-         (panic_compressed_pte_corrupt((x), &(x), (ptep)), FALSE)))
-
-static inline void
-panic_compressed_pte_corrupt(uint64_t pte, uint64_t *pte_addr, uint64_t *ptep)
-{
-       uint64_t *adj_pteps[2];
-       int pteidx = ((uintptr_t)ptep & INTEL_OFFMASK) / sizeof(pt_entry_t);
-       /*
-        * Grab pointers to PTEs on either side of the PTE in question, unless we're at the start of
-        * a PT (grab pointers to the next and next-next PTEs) or the end of a PT (grab the previous
-        * 2 PTEs).
-        */
-       if (pteidx == 0) {
-               adj_pteps[0] = ptep + 1;
-               adj_pteps[1] = ptep + 2;
-       } else if (pteidx == (NPTPG - 1)) {
-               adj_pteps[0] = ptep - 2;
-               adj_pteps[1] = ptep - 1;
-       } else {
-               adj_pteps[0] = ptep - 1;
-               adj_pteps[1] = ptep + 1;
-       }
-
-       panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted? Adjacent PTEs: 0x%llx@%p, 0x%llx@%p",
-           pte_addr, pte, pte & ~INTEL_PTE_COMPRESSED_MASK, *adj_pteps[0], adj_pteps[0], *adj_pteps[1], adj_pteps[1]);
-       /*NOTREACHED*/
-}
+        ((!((x) & ~INTEL_PTE_COMPRESSED_MASK)) || /* ...no other bits */  \
+         pmap_compressed_pte_corruption_repair((x), &(x), (ptep), (pmap), (vaddr))))
 
 #define pa_to_pte(a)            ((a) & INTEL_PTE_PFN) /* XXX */
 #define pte_to_pa(p)            ((p) & INTEL_PTE_PFN) /* XXX */
@@ -519,6 +482,7 @@ PHYSMAP_PTOV_check(void *paddr)
 }
 
 #define PHYSMAP_PTOV(x) (PHYSMAP_PTOV_check((void*) (x)))
+#define phystokv(x) ((vm_offset_t)(PHYSMAP_PTOV(x)))
 #if MACH_KERNEL_PRIVATE
 extern uint64_t dblmap_base, dblmap_max, dblmap_dist;
 
@@ -580,21 +544,24 @@ struct pmap {
        pml4_entry_t    *pm_pml4;       /* VKA of top level */
        pml4_entry_t    *pm_upml4;      /* Shadow VKA of top level */
        pmap_paddr_t    pm_eptp;        /* EPTP */
+
        task_map_t      pm_task_map;
        boolean_t       pagezero_accessible;
 #define PMAP_PCID_MAX_CPUS      MAX_CPUS        /* Must be a multiple of 8 */
        pcid_t          pmap_pcid_cpus[PMAP_PCID_MAX_CPUS];
        volatile uint8_t pmap_pcid_coherency_vector[PMAP_PCID_MAX_CPUS];
        boolean_t       pm_shared;
+       os_refcnt_t     ref_count;
+       pdpt_entry_t    *pm_pdpt;       /* KVA of 3rd level page */
        vm_object_t     pm_obj;         /* object to hold pde's */
        vm_object_t     pm_obj_pdpt;    /* holds pdpt pages */
        vm_object_t     pm_obj_pml4;    /* holds pml4 pages */
 #if     DEVELOPMENT || DEBUG
        int             nx_enabled;
 #endif
-       int             ref_count;
        ledger_t        ledger;         /* ledger tracking phys mappings */
        struct pmap_statistics  stats;  /* map statistics */
+       uint64_t        corrected_compressed_ptes_count;
 #if MACH_ASSERT
        boolean_t       pmap_stats_assert;
        int             pmap_pid;
@@ -647,10 +614,12 @@ extern void         pmap_put_mapwindow(mapwindow_t *map);
 #endif
 
 typedef struct pmap_memory_regions {
-       ppnum_t base;           /* first page of this region */
-       ppnum_t alloc_up;       /* pages below this one have been "stolen" */
-       ppnum_t alloc_down;     /* pages above this one have been "stolen" */
-       ppnum_t end;            /* last page of this region */
+       ppnum_t base;            /* first page of this region */
+       ppnum_t alloc_up;        /* pages below this one have been "stolen" */
+       ppnum_t alloc_down;      /* pages above this one have been "stolen" */
+       ppnum_t alloc_frag_up;   /* low page of fragment after large page alloc */
+       ppnum_t alloc_frag_down; /* high page of fragment after large page alloc */
+       ppnum_t end;             /* last page of this region */
        uint32_t type;
        uint64_t attribute;
 } pmap_memory_region_t;
@@ -786,6 +755,7 @@ extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_s
     uint64_t *, int *, int *, int *);
 extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__printflike(1, 2));
 
+extern void x86_64_protect_data_const(void);
 /*
  *     Macros for speed.
  */
index 17c6e29479aeae3056b73a6a6329b7bb1f9b9f07..9bfec8a48af4ed093703ee70628c3e415236840d 100644 (file)
@@ -291,10 +291,21 @@ __private_extern__ void
 pmap_pagetable_corruption_msg_log(int (*log_func)(const char * fmt, ...)__printflike(1, 2))
 {
        if (pmap_pagetable_corruption_incidents > 0) {
-               int i, e = MIN(pmap_pagetable_corruption_incidents, PMAP_PAGETABLE_CORRUPTION_MAX_LOG);
+               int i, j, e = MIN(pmap_pagetable_corruption_incidents, PMAP_PAGETABLE_CORRUPTION_MAX_LOG);
                (*log_func)("%u pagetable corruption incident(s) detected, timeout: %u\n", pmap_pagetable_corruption_incidents, pmap_pagetable_corruption_timeout);
                for (i = 0; i < e; i++) {
-                       (*log_func)("Incident 0x%x, reason: 0x%x, action: 0x%x, time: 0x%llx\n", pmap_pagetable_corruption_records[i].incident, pmap_pagetable_corruption_records[i].reason, pmap_pagetable_corruption_records[i].action, pmap_pagetable_corruption_records[i].abstime);
+                       (*log_func)("Incident 0x%x, reason: 0x%x, action: 0x%x, time: 0x%llx\n",
+                           pmap_pagetable_corruption_records[i].incident,
+                           pmap_pagetable_corruption_records[i].reason,
+                           pmap_pagetable_corruption_records[i].action,
+                           pmap_pagetable_corruption_records[i].abstime);
+
+                       if (pmap_pagetable_corruption_records[i].adj_ptes_count > 0) {
+                               for (j = 0; j < pmap_pagetable_corruption_records[i].adj_ptes_count; j++) {
+                                       (*log_func)("\tAdjacent PTE[%d] = 0x%llx\n", j,
+                                           pmap_pagetable_corruption_records[i].adj_ptes[j]);
+                               }
+                       }
                }
        }
 }
index abf263a1ffd189752e4c4a54e176f3ce706de9c1..5928bda3fddcc817e74e4ffe81fe9efec24af9dc 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -303,10 +303,10 @@ extern uint32_t npvhashmask;
 extern pv_hashed_entry_t        *pv_hash_table;  /* hash lists */
 extern pv_hashed_entry_t        pv_hashed_free_list;
 extern pv_hashed_entry_t        pv_hashed_kern_free_list;
-decl_simple_lock_data(extern, pv_hashed_free_list_lock)
-decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
-decl_simple_lock_data(extern, pv_hash_table_lock)
-decl_simple_lock_data(extern, phys_backup_lock)
+decl_simple_lock_data(extern, pv_hashed_free_list_lock);
+decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock);
+decl_simple_lock_data(extern, pv_hash_table_lock);
+decl_simple_lock_data(extern, phys_backup_lock);
 
 extern zone_t           pv_hashed_list_zone;    /* zone of pv_hashed_entry
                                                  * structures */
@@ -342,7 +342,7 @@ PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep)
        simple_unlock(&pv_hashed_free_list_lock);
 
        if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
-               if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse)) {
+               if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) {
                        thread_wakeup(&mapping_replenish_event);
                }
        }
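
The conversion from hw_compare_and_store to os_atomic_cmpxchg preserves the single-waker idiom: only the thread that wins the 0-to-1 transition on mappingrecurse issues the wakeup, so the replenish thread is signalled once no matter how many CPUs cross the low-water mark at the same time. The idiom in portable C11 (names illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic int my_recurse_flag;

    /* True for exactly one caller until the flag is reset by the replenisher. */
    static bool
    my_should_wake_replenisher(void)
    {
            int expected = 0;
            return atomic_compare_exchange_strong_explicit(&my_recurse_flag,
                &expected, 1, memory_order_acq_rel, memory_order_acquire);
    }
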
@@ -375,7 +375,7 @@ PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e)
        simple_unlock(&pv_hashed_kern_free_list_lock);
 
        if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
-               if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse)) {
+               if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) {
                        thread_wakeup(&mapping_replenish_event);
                }
        }
@@ -507,9 +507,14 @@ extern uint64_t pde_mapped_size;
 extern char             *pmap_phys_attributes;
 extern ppnum_t          last_managed_page;
 
-extern ppnum_t  lowest_lo;
-extern ppnum_t  lowest_hi;
-extern ppnum_t  highest_hi;
+/*
+ * Used to record high memory allocated to the kernel before
+ * pmap_init() gets called.
+ */
+extern ppnum_t pmap_high_used_top;
+extern ppnum_t pmap_high_used_bottom;
+extern ppnum_t pmap_middle_used_top;
+extern ppnum_t pmap_middle_used_bottom;
 
 /*
  * when spinning through pmap_remove
@@ -643,13 +648,14 @@ popcnt1(uint64_t distance)
  */
 
 typedef enum {
-       PTE_VALID               = 0x0,
-       PTE_INVALID             = 0x1,
-       PTE_RSVD                = 0x2,
-       PTE_SUPERVISOR          = 0x4,
-       PTE_BITFLIP             = 0x8,
-       PV_BITFLIP              = 0x10,
-       PTE_INVALID_CACHEABILITY = 0x20
+       PTE_VALID                = 0x0,
+       PTE_INVALID              = 0x1,
+       PTE_RSVD                 = 0x2,
+       PTE_SUPERVISOR           = 0x4,
+       PTE_BITFLIP              = 0x8,
+       PV_BITFLIP               = 0x10,
+       PTE_INVALID_CACHEABILITY = 0x20,
+       PTE_NXBITFLIP            = 0x40
 } pmap_pagetable_corruption_t;
 
 typedef enum {
@@ -680,6 +686,9 @@ typedef struct {
        pmap_t pvpmap;
        vm_map_offset_t pvva;
        uint64_t abstime;
+       int adj_ptes_count;
+#define PMPTCR_MAX_ADJ_PTES (2)
+       uint64_t adj_ptes[PMPTCR_MAX_ADJ_PTES];
 } pmap_pagetable_corruption_record_t;
 
 extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
@@ -687,10 +696,21 @@ extern uint64_t pmap_pagetable_corruption_last_abstime;
 extern thread_call_t    pmap_pagetable_corruption_log_call;
 extern boolean_t pmap_pagetable_corruption_timeout;
 
-static inline void
-pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva)
+static inline pmap_pagetable_corruption_action_t
+pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason,
+    pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep,
+    ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva, int adj_pteps_cnt, uint64_t **adj_pteps)
 {
        uint32_t pmap_pagetable_corruption_log_index;
+       uint64_t curtime = mach_absolute_time();
+
+       if ((curtime - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
+               pmap_pagetable_corruption_timeout = TRUE;
+               action = PMAP_ACTION_ASSERT;
+       } else {
+               pmap_pagetable_corruption_last_abstime = curtime;
+       }
+
        pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
@@ -701,9 +721,17 @@ pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corru
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
        pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
-       pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
+       pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = curtime;
+       if (adj_pteps_cnt > 0 && adj_pteps != NULL) {
+               pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count = MIN(adj_pteps_cnt, PMPTCR_MAX_ADJ_PTES);
+               for (int i = 0; i < pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count; i++) {
+                       pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes[i] = *adj_pteps[i];
+               }
+       }
        /* Asynchronously log */
        thread_call_enter(pmap_pagetable_corruption_log_call);
+
+       return action;
 }
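
Returning the (possibly escalated) action from pmap_pagetable_corruption_log centralizes the rate-limit policy that previously lived in the caller: incidents arriving within the minimum interval escalate to PMAP_ACTION_ASSERT, on the theory that rapidly repeating "corruption" is a real failure rather than a stray bit flip. The gate in isolation (illustrative names):

    #include <stdbool.h>
    #include <stdint.h>

    static uint64_t my_last_incident_time;
    static uint64_t my_min_interval;

    static bool
    my_incident_too_soon(uint64_t now)
    {
            if (now - my_last_incident_time < my_min_interval) {
                    return true;              /* escalate instead of repairing again */
            }
            my_last_incident_time = now;      /* accept; restart the window */
            return false;
    }
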
 
 static inline pmap_pagetable_corruption_action_t
@@ -797,14 +825,49 @@ pmap_cpc_exit:
                action = PMAP_ACTION_ASSERT;
        }
 
-       if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
-               action = PMAP_ACTION_ASSERT;
-               pmap_pagetable_corruption_timeout = TRUE;
+       return pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva, 0, 0);
+}
+
+static inline boolean_t
+pmap_compressed_pte_corruption_repair(uint64_t pte, uint64_t *pte_addr, uint64_t *ptep, pmap_t pmap,
+    vm_map_offset_t vaddr)
+{
+       uint64_t *adj_pteps[2];
+       int pteidx = ((uintptr_t)ptep & INTEL_OFFMASK) / sizeof(pt_entry_t);
+       pmap_pagetable_corruption_action_t action = PMAP_ACTION_IGNORE;
+
+       /*
+        * Grab pointers to PTEs on either side of the PTE in question, unless we're at the start of
+        * a PT (grab pointers to the next and next-next PTEs) or the end of a PT (grab the previous
+        * 2 PTEs).
+        */
+       if (pteidx == 0) {
+               adj_pteps[0] = ptep + 1;
+               adj_pteps[1] = ptep + 2;
+       } else if (pteidx == (NPTPG - 1)) {
+               adj_pteps[0] = ptep - 2;
+               adj_pteps[1] = ptep - 1;
        } else {
-               pmap_pagetable_corruption_last_abstime = mach_absolute_time();
+               adj_pteps[0] = ptep - 1;
+               adj_pteps[1] = ptep + 1;
        }
-       pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
-       return action;
+
+       /*
+        * Since a compressed PTE no longer has a physical page (and thus no PV data) associated
+        * with it, we cannot pass the pv data to pmap_pagetable_corruption_log, so instead we
+        * supply adjacent PTEs for logging.
+        */
+       if (pmap_pagetable_corruption_log(ROOT_ABSENT, (pte & INTEL_PTE_NX) ? PTE_NXBITFLIP : PTE_BITFLIP,
+           action, pmap, vaddr, ptep, (ppnum_t)~0UL, 0, 0, sizeof(adj_pteps) / sizeof(adj_pteps[0]),
+           adj_pteps) != PMAP_ACTION_ASSERT) {
+               /* Correct the flipped bit(s) and continue */
+               pmap_store_pte(ptep, pte & INTEL_PTE_COMPRESSED_MASK);
+               pmap->corrected_compressed_ptes_count++;
+               return TRUE; /* Returning TRUE to indicate this is now a valid compressed PTE (we hope) */
+       }
+
+       panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted? Adjacent PTEs: 0x%llx@%p, 0x%llx@%p",
+           pte_addr, pte, pte & ~INTEL_PTE_COMPRESSED_MASK, *adj_pteps[0], adj_pteps[0], *adj_pteps[1], adj_pteps[1]);
+       /*NOTREACHED*/
 }
 
 /*
index 93169df60932b661fbfe8ca52122bcedf08688ba..eae2bf321a1e292ea7210371d66210af073c29e6 100644 (file)
@@ -342,7 +342,7 @@ pmap_find_phys(pmap_t pmap, addr64_t va)
                mp_disable_preemption();
        }
 
-       if (!pmap->ref_count) {
+       if (os_ref_get_count(&pmap->ref_count) == 0) {
                goto pfp_exit;
        }
 
@@ -640,7 +640,7 @@ Retry:
        old_pa_locked = FALSE;
 
        if (old_pa == 0 &&
-           PTE_IS_COMPRESSED(*pte, pte)) {
+           PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr)) {
                /*
                 * "pmap" should be locked at this point, so this should
                 * not race with another pmap_enter() or pmap_remove_range().
@@ -1261,7 +1261,7 @@ pmap_remove_range_options(
                pa = pte_to_pa(p);
                if (pa == 0) {
                        if ((options & PMAP_OPTIONS_REMOVE) &&
-                           (PTE_IS_COMPRESSED(p, cpte))) {
+                           (PTE_IS_COMPRESSED(p, cpte, pmap, vaddr))) {
                                assert(pmap != kernel_pmap);
                                /* one less "compressed"... */
                                stats_compressed++;
@@ -1322,7 +1322,7 @@ check_pte_for_compressed_marker:
                         * loop above, so check again.
                         */
                        if ((options & PMAP_OPTIONS_REMOVE) &&
-                           (PTE_IS_COMPRESSED(*cpte, cpte))) {
+                           (PTE_IS_COMPRESSED(*cpte, cpte, pmap, vaddr))) {
                                assert(pmap != kernel_pmap);
                                /* one less "compressed"... */
                                stats_compressed++;
@@ -1724,7 +1724,7 @@ pmap_page_protect_options(
                        if (pmap != kernel_pmap &&
                            (options & PMAP_OPTIONS_COMPRESSOR) &&
                            IS_INTERNAL_PAGE(pai)) {
-                               assert(!PTE_IS_COMPRESSED(*pte, pte));
+                               assert(!PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr));
                                /* mark this PTE as having been "compressed" */
                                new_pte_value = PTE_COMPRESSED;
                                if (IS_ALTACCT_PAGE(pai, pv_e)) {
@@ -2525,7 +2525,7 @@ pmap_query_page_info(
 
        pa = pte_to_pa(*pte);
        if (pa == 0) {
-               if (PTE_IS_COMPRESSED(*pte, pte)) {
+               if (PTE_IS_COMPRESSED(*pte, pte, pmap, va)) {
                        disp |= PMAP_QUERY_PAGE_COMPRESSED;
                        if (*pte & PTE_COMPRESSED_ALT) {
                                disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
@@ -2581,6 +2581,7 @@ pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstar
        return;
 }
 
+__dead2
 void
 pmap_ledger_alloc_init(size_t size)
 {
@@ -2589,15 +2590,15 @@ pmap_ledger_alloc_init(size_t size)
            __func__, size);
 }
 
+__dead2
 ledger_t
 pmap_ledger_alloc(void)
 {
        panic("%s: unsupported",
            __func__);
-
-       return NULL;
 }
 
+__dead2
 void
 pmap_ledger_free(ledger_t ledger)
 {
index fa269748c3550bb5363d988c2471fc8a52ae5eb1..bc6fa6524b9d48e82c5213f8d69cb71d462815ab 100644 (file)
@@ -227,20 +227,6 @@ rtc_clock_adjust(uint64_t tsc_base_delta)
        rtc_nanotime_set_commpage(rntp);
 }
 
-void
-rtc_clock_stepping(__unused uint32_t new_frequency,
-    __unused uint32_t old_frequency)
-{
-       panic("rtc_clock_stepping unsupported");
-}
-
-void
-rtc_clock_stepped(__unused uint32_t new_frequency,
-    __unused uint32_t old_frequency)
-{
-       panic("rtc_clock_stepped unsupported");
-}
-
 /*
  * rtc_sleep_wakeup:
  *
index 720b743e877b00b961063a687a2f3f5de4ccf6de..2fe8a84623c793ec2af5cfaccc6b057a87df1e14 100644 (file)
@@ -145,14 +145,14 @@ rtc_lapic_set_tsc_deadline_timer(uint64_t deadline, uint64_t now)
  * Definitions for timer operations table
  */
 
-rtc_timer_t     rtc_timer_lapic  = {
-       rtc_lapic_config_timer,
-       rtc_lapic_set_timer
+rtc_timer_t rtc_timer_lapic = {
+       .rtc_config = rtc_lapic_config_timer,
+       .rtc_set    = rtc_lapic_set_timer,
 };
 
-rtc_timer_t     rtc_timer_tsc_deadline  = {
-       rtc_lapic_config_tsc_deadline_timer,
-       rtc_lapic_set_tsc_deadline_timer
+rtc_timer_t rtc_timer_tsc_deadline = {
+       .rtc_config = rtc_lapic_config_tsc_deadline_timer,
+       .rtc_set    = rtc_lapic_set_tsc_deadline_timer,
 };
 
 rtc_timer_t     *rtc_timer = &rtc_timer_lapic; /* defaults to LAPIC timer */
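
[The rtc_timer ops tables above move from positional to designated initializers. A small sketch of why that matters for function-pointer tables; the types here are illustrative, not the kernel's:]

#include <stdint.h>

typedef struct {
        void (*rtc_config)(void);
        void (*rtc_set)(uint64_t deadline, uint64_t now);
} rtc_ops_sketch_t;

static void cfg_sketch(void) { }
static void set_sketch(uint64_t d, uint64_t n) { (void)d; (void)n; }

/* Fields are matched by name, so reordering or inserting members in
 * the struct cannot silently bind a function to the wrong slot. */
static const rtc_ops_sketch_t ops_sketch = {
        .rtc_config = cfg_sketch,
        .rtc_set    = set_sketch,
};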
index b9298397d514216814490ac71f4d9e759e36fc05..4fd4d0677d10f0da716ce7560a606405eb5806db 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -120,7 +120,7 @@ typedef usimple_lock_data_t     *simple_lock_t;
 typedef usimple_lock_data_t     simple_lock_data_t;
 
 #define decl_simple_lock_data(class, name) \
-       class   simple_lock_data_t      name;
+       class   simple_lock_data_t      name
 
 #endif  /* !defined(decl_simple_lock_data) */
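
[Dropping the trailing semicolon from decl_simple_lock_data means the macro now reads like an ordinary declaration and the invocation supplies the only semicolon; with the old form, `decl_simple_lock_data(static, l);` expanded to a stray empty declaration that newer compilers warn about. A compilable sketch, with a stand-in lock type:]

/* stand-in for simple_lock_data_t */
typedef struct { int opaque; } simple_lock_data_sketch_t;

#define decl_simple_lock_data_sketch(class, name) \
        class simple_lock_data_sketch_t name

decl_simple_lock_data_sketch(static, sketch_lock);  /* exactly one ';' */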
 
index 74da242cf0ee939e0e7b5628c2d6b732ed6322b4..1c1e8e92615a92b87e573b36d3cca1cc3026087b 100644 (file)
@@ -157,6 +157,8 @@ struct machine_thread {
        int                     physwindow_busy;
 #endif
 
+       uint32_t                last_xcpm_ttd;
+       uint8_t                 last_xcpm_index;
        int                     mthr_do_segchk;
 };
 typedef struct machine_thread *pcb_t;
index e9ef4dea79b9c4c7dca7b45dbde1dea5a440f926..bfc24c4aa93e45faf1235cf34669a3eabfacc287 100644 (file)
@@ -117,7 +117,7 @@ extern void kprint_state(x86_saved_state64_t *saved_state);
  * Forward declarations
  */
 static void user_page_fault_continue(kern_return_t kret);
-static void panic_trap(x86_saved_state64_t *saved_state, uint32_t pl, kern_return_t fault_result);
+static void panic_trap(x86_saved_state64_t *saved_state, uint32_t pl, kern_return_t fault_result) __dead2;
 static void set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip);
 
 #if CONFIG_DTRACE
@@ -709,7 +709,7 @@ kernel_trap(
        case T_PAGE_FAULT:
 
 #if CONFIG_DTRACE
-               if (thread != THREAD_NULL && thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */
+               if (thread != THREAD_NULL && thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */
                        if (dtrace_tally_fault(vaddr)) { /* Should a fault under dtrace be ignored? */
                                /*
                                 * DTrace has "anticipated" the possibility of this fault, and has
@@ -878,12 +878,6 @@ panic_trap(x86_saved_state64_t *regs, uint32_t pl, kern_return_t fault_result)
            potential_smap_fault ? " SMAP fault" : "",
            pl,
            fault_result);
-       /*
-        * This next statement is not executed,
-        * but it's needed to stop the compiler using tail call optimization
-        * for the panic call - which confuses the subsequent backtrace.
-        */
-       cr0 = 0;
 }
 
 #if CONFIG_DTRACE
@@ -1124,6 +1118,14 @@ user_trap(
                        /*NOTREACHED*/
                }
 
+               /*
+                * For a user trap, vm_fault() should never return KERN_FAILURE.
+                * If it does, we're leaking preemption disables somewhere in the kernel.
+                */
+               if (__improbable(kret == KERN_FAILURE)) {
+                       panic("vm_fault() KERN_FAILURE from user fault on thread %p", thread);
+               }
+
                user_page_fault_continue(kret);
        }       /* NOTREACHED */
        break;
@@ -1153,7 +1155,6 @@ user_trap(
 
        default:
                panic("Unexpected user trap, type %d", type);
-               return;
        }
        /* Note: Codepaths that directly return from user_trap() have pending
         * ASTs processed in locore
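
[Several hunks in this commit annotate panic paths __dead2 (XNU's spelling of __attribute__((noreturn))), which is why the dead `cr0 = 0;` store and the `return` after panic() could be deleted: the compiler now knows control never comes back. A sketch of the attribute's effect, with abort() standing in for panic():]

#include <stdlib.h>

#define __dead2_sketch __attribute__((noreturn))

static void __dead2_sketch
panic_sketch(void)
{
        abort();   /* stand-in for panic(); never returns */
}

static int
classify_sketch(int type)
{
        switch (type) {
        case 0:
                return 0;
        default:
                panic_sketch();
                /* no unreachable `return` needed after a noreturn call */
        }
}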
index fc7df3dfbf8e02039a3f727a73a6508259576446..5601e64f6e542d9ca7ae1e6a3b5f0277d2ab57b3 100644 (file)
@@ -132,8 +132,8 @@ extern void             user_trap(x86_saved_state_t *regs);
 
 extern void             interrupt(x86_saved_state_t *regs);
 
-extern void             panic_double_fault64(x86_saved_state_t *regs);
-extern void             panic_machine_check64(x86_saved_state_t *regs);
+extern void             panic_double_fault64(x86_saved_state_t *regs) __abortlike;
+extern void             panic_machine_check64(x86_saved_state_t *regs) __abortlike;
 
 typedef kern_return_t (*perfCallback)(
        int                     trapno,
index 82f5c5168d6577c8c5041297292898f60d70c930..b5613be392f826c0318a547bda4d2924d3a83c47 100644 (file)
 
 extern void kprintf_break_lock(void);
 extern void kprint_state(x86_saved_state64_t *saved_state);
-void panic_64(x86_saved_state_t *, int, const char *, boolean_t);
 
 extern volatile int panic_double_fault_cpu;
 
@@ -109,7 +108,7 @@ extern volatile int panic_double_fault_cpu;
 /*
  * K64 debug - fatal handler for debug code in the trap vectors.
  */
-extern void
+extern void __dead2
 panic_idt64(x86_saved_state_t *rsp);
 void
 panic_idt64(x86_saved_state_t *rsp)
@@ -120,7 +119,8 @@ panic_idt64(x86_saved_state_t *rsp)
 #endif
 
 
-void
+__dead2
+static void
 panic_64(x86_saved_state_t *sp, __unused int pc, __unused const char *msg, boolean_t do_mca_dump)
 {
        /* Set postcode (DEBUG only) */
index 29334339aa86396e6a565bd865ee64e6278de408..fa5c0ce237307b22538541a60ff4aaf89bdbda95 100644 (file)
@@ -77,6 +77,9 @@
 #include <i386/seg.h>
 #include <i386/thread.h>
 
+#include <IOKit/IOBSD.h> /* for IOTaskHasEntitlement */
+#include <sys/csr.h> /* for csr_check */
+
 #include <sys/errno.h>
 
 static void user_ldt_set_action(void *);
@@ -85,7 +88,7 @@ static int i386_set_ldt_impl(uint32_t *retval, uint64_t start_sel, uint64_t desc
 static int i386_get_ldt_impl(uint32_t *retval, uint64_t start_sel, uint64_t descs,
     uint64_t num_sels);
 
-extern int allow_64bit_proc_LDT_ops;
+#define LDT_IN_64BITPROC_ENTITLEMENT "com.apple.security.ldt-in-64bit-process"
 
 /*
  * Add the descriptors to the LDT, starting with
@@ -441,8 +444,9 @@ i386_set_ldt64(
        uint64_t                descs,  /* out */
        uint64_t                num_sels)
 {
-       if (!allow_64bit_proc_LDT_ops) {
-               return EINVAL;
+       if (csr_check(CSR_ALLOW_UNTRUSTED_KEXTS) != 0 &&
+           !IOTaskHasEntitlement(current_task(), LDT_IN_64BITPROC_ENTITLEMENT)) {
+               return EPERM;
        }
 
        return i386_set_ldt_impl(retval, start_sel, descs, num_sels);
@@ -468,8 +472,9 @@ i386_get_ldt64(
        uint64_t                descs,  /* out */
        uint64_t                num_sels)
 {
-       if (!allow_64bit_proc_LDT_ops) {
-               return EINVAL;
+       if (csr_check(CSR_ALLOW_UNTRUSTED_KEXTS) != 0 &&
+           !IOTaskHasEntitlement(current_task(), LDT_IN_64BITPROC_ENTITLEMENT)) {
+               return EPERM;
        }
 
        return i386_get_ldt_impl(retval, start_sel, descs, num_sels);
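
[The i386_{set,get}_ldt64 hunks replace a global boot-arg gate with a runtime policy: the call is allowed when SIP permits untrusted kexts (csr_check() returns 0) or when the task holds the com.apple.security.ldt-in-64bit-process entitlement, and fails with EPERM (not EINVAL) otherwise. A condensed sketch of that decision, with the two checks abstracted into booleans:]

#include <errno.h>
#include <stdbool.h>

static int
ldt64_op_allowed_sketch(bool sip_allows_untrusted_kexts,
    bool task_has_entitlement)
{
        if (!sip_allows_untrusted_kexts && !task_has_entitlement) {
                return EPERM;   /* was EINVAL under the boot-arg scheme */
        }
        return 0;               /* proceed to i386_*_ldt_impl() */
}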
diff --git a/osfmk/i386/xpr.h b/osfmk/i386/xpr.h
deleted file mode 100644 (file)
index 3c7449a..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * Mach Operating System
- * Copyright (c) 1991,1990,1989 Carnegie Mellon University
- * All Rights Reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
- * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
- * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- *  School of Computer Science
- *  Carnegie Mellon University
- *  Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- */
-
-/*
- *     File:   xpr.h
- *
- *     Machine dependent module for the XPR tracing facility.
- */
-
-#define XPR_TIMESTAMP   (0)
index f63a3f5462feae70a1720d074aff8b0106951692..dab496ef861bff8c00d50f54ccb10082a4fa42a2 100644 (file)
@@ -125,6 +125,8 @@ struct ipc_entry {
 #define IE_BITS_TYPE_MASK       0x001f0000      /* 5 bits of capability type */
 #define IE_BITS_TYPE(bits)      ((bits) & IE_BITS_TYPE_MASK)
 
+#define IE_BITS_EXTYPE_MASK     0x00200000      /* 1 bit for extended capability */
+
 #ifndef NO_PORT_GEN
 #define IE_BITS_GEN_MASK        0xff000000      /* 8 bits for generation */
 #define IE_BITS_GEN(bits)       ((bits) & IE_BITS_GEN_MASK)
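
[The new IE_BITS_EXTYPE_MASK claims bit 21 of ie_bits, between the 5-bit capability-type field at bits 16-20 and the 8-bit generation in the top byte. A sketch of the layout as mask tests; the constants are copied from the header, the helper name is illustrative:]

#define IE_BITS_TYPE_MASK_SK    0x001f0000u  /* 5 bits: capability type */
#define IE_BITS_EXTYPE_MASK_SK  0x00200000u  /* 1 bit: extended capability */
#define IE_BITS_GEN_MASK_SK     0xff000000u  /* 8 bits: generation */

static inline int
ie_bits_has_extype_sketch(unsigned bits)
{
        return (bits & IE_BITS_EXTYPE_MASK_SK) != 0;
}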
index 86d03a586faa2b58061532e4a48e10cc53cc6bfa..44d1efed80fcbe0237e64c769033a238b4aa28f6 100644 (file)
@@ -88,14 +88,14 @@ static lck_spin_t ipc_importance_lock_data;     /* single lock for now */
        lck_spin_assert(&ipc_importance_lock_data, LCK_ASSERT_OWNED)
 
 #if IIE_REF_DEBUG
-#define incr_ref_counter(x) (hw_atomic_add(&(x), 1))
+#define incr_ref_counter(x) (os_atomic_inc(&(x), relaxed))
 
 static inline
 uint32_t
 ipc_importance_reference_internal(ipc_importance_elem_t elem)
 {
        incr_ref_counter(elem->iie_refs_added);
-       return hw_atomic_add(&elem->iie_bits, 1) & IIE_REFS_MASK;
+       return os_atomic_inc(&elem->iie_bits, relaxed) & IIE_REFS_MASK;
 }
 
 static inline
@@ -103,7 +103,7 @@ uint32_t
 ipc_importance_release_internal(ipc_importance_elem_t elem)
 {
        incr_ref_counter(elem->iie_refs_dropped);
-       return hw_atomic_sub(&elem->iie_bits, 1) & IIE_REFS_MASK;
+       return os_atomic_dec(&elem->iie_bits, relaxed) & IIE_REFS_MASK;
 }
 
 static inline
@@ -730,7 +730,7 @@ ipc_importance_task_propagate_helper(
                }
 
                /* determine the task importance to adjust as result (if any) */
-               port = (ipc_port_t) hdr->msgh_remote_port;
+               port = hdr->msgh_remote_port;
                assert(IP_VALID(port));
                ip_lock(port);
                temp_task_imp = IIT_NULL;
@@ -1477,7 +1477,7 @@ ipc_importance_task_drop_legacy_external_assertion(ipc_importance_task_t task_im
 }
 
 
-
+#if LEGACY_IMPORTANCE_DELIVERY
 /* Transfer an assertion to legacy userspace responsibility */
 static kern_return_t
 ipc_importance_task_externalize_legacy_assertion(ipc_importance_task_t task_imp, uint32_t count, __unused int sender_pid)
@@ -1515,6 +1515,7 @@ ipc_importance_task_externalize_legacy_assertion(ipc_importance_task_t task_imp,
 
        return KERN_SUCCESS;
 }
+#endif /* LEGACY_IMPORTANCE_DELIVERY */
 
 /*
  *     Routine:        ipc_importance_task_update_live_donor
@@ -2221,6 +2222,7 @@ ipc_importance_check_circularity(
        int assertcnt = 0;
        ipc_port_t base;
        struct turnstile *send_turnstile = TURNSTILE_NULL;
+       struct task_watchport_elem *watchport_elem = NULL;
 
        assert(port != IP_NULL);
        assert(dest != IP_NULL);
@@ -2308,7 +2310,7 @@ ipc_importance_check_circularity(
 
                /* port (== base) is in limbo */
 
-               assert(ip_active(port));
+               require_ip_active(port);
                assert(port->ip_receiver_name == MACH_PORT_NULL);
                assert(port->ip_destination == IP_NULL);
 
@@ -2318,7 +2320,7 @@ ipc_importance_check_circularity(
 
                        /* base is in transit or in limbo */
 
-                       assert(ip_active(base));
+                       require_ip_active(base);
                        assert(base->ip_receiver_name == MACH_PORT_NULL);
 
                        next = base->ip_destination;
@@ -2347,10 +2349,18 @@ not_circular:
        /* port is in limbo */
        imq_lock(&port->ip_messages);
 
-       assert(ip_active(port));
+       require_ip_active(port);
        assert(port->ip_receiver_name == MACH_PORT_NULL);
        assert(port->ip_destination == IP_NULL);
 
+       /* Port is being enqueued in a kmsg, remove the watchport boost in order to push on destination port */
+       watchport_elem = ipc_port_clear_watchport_elem_internal(port);
+
+       /* Check if the port is being enqueued as a part of sync bootstrap checkin */
+       if (dest->ip_specialreply && dest->ip_sync_bootstrap_checkin) {
+               port->ip_sync_bootstrap_checkin = 1;
+       }
+
        ip_reference(dest);
        port->ip_destination = dest;
 
@@ -2403,7 +2413,7 @@ not_circular:
 
                /* port is in transit */
 
-               assert(ip_active(dest));
+               require_ip_active(dest);
                assert(dest->ip_receiver_name == MACH_PORT_NULL);
                assert(dest->ip_destination != IP_NULL);
                assert(dest->ip_tempowner == 0);
@@ -2451,6 +2461,18 @@ not_circular:
 
        ip_unlock(base);
 
+       /* All locks dropped, call turnstile_update_inheritor_complete for source port's turnstile */
+       if (send_turnstile) {
+               turnstile_update_inheritor_complete(send_turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
+
+               /* Take the mq lock to call turnstile complete */
+               imq_lock(&port->ip_messages);
+               turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL, TURNSTILE_SYNC_IPC);
+               send_turnstile = TURNSTILE_NULL;
+               imq_unlock(&port->ip_messages);
+               turnstile_cleanup();
+       }
+
        /*
         * Transfer assertions now that the ports are unlocked.
         * Avoid extra overhead if transferring to/from the same task.
@@ -2480,18 +2502,6 @@ not_circular:
                ipc_importance_unlock();
        }
 
-       /* All locks dropped, call turnstile_update_inheritor_complete for source port's turnstile */
-       if (send_turnstile) {
-               turnstile_update_inheritor_complete(send_turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
-
-               /* Take the mq lock to call turnstile complete */
-               imq_lock(&port->ip_messages);
-               turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL);
-               send_turnstile = TURNSTILE_NULL;
-               imq_unlock(&port->ip_messages);
-               turnstile_cleanup();
-       }
-
        if (imp_task != IIT_NULL) {
                ipc_importance_task_release(imp_task);
        }
@@ -2500,6 +2510,10 @@ not_circular:
                ipc_importance_task_release(release_imp_task);
        }
 
+       if (watchport_elem) {
+               task_watchport_elem_deallocate(watchport_elem);
+       }
+
        return FALSE;
 }
 
@@ -2518,7 +2532,7 @@ ipc_importance_send(
        ipc_kmsg_t              kmsg,
        mach_msg_option_t       option)
 {
-       ipc_port_t port = (ipc_port_t) kmsg->ikm_header->msgh_remote_port;
+       ipc_port_t port = kmsg->ikm_header->msgh_remote_port;
        boolean_t port_lock_dropped = FALSE;
        ipc_importance_elem_t elem;
        task_t task;
@@ -3154,11 +3168,14 @@ ipc_importance_receive(
        ipc_kmsg_t              kmsg,
        mach_msg_option_t       option)
 {
+       int impresult = -1;
+
+#if IMPORTANCE_TRACE || LEGACY_IMPORTANCE_DELIVERY
+       task_t task_self = current_task();
        unsigned int sender_pid = ((mach_msg_max_trailer_t *)
            ((vm_offset_t)kmsg->ikm_header +
            round_msg(kmsg->ikm_header->msgh_size)))->msgh_audit.val[5];
-       task_t task_self = current_task();
-       int impresult = -1;
+#endif
 
        /* convert to a voucher with an inherit importance attribute? */
        if ((option & MACH_RCV_VOUCHER) != 0) {
@@ -3239,14 +3256,17 @@ ipc_importance_receive(
 
                /* With kmsg unlinked, can safely examine message importance attribute. */
                if (MACH_MSGH_BITS_RAISED_IMPORTANCE(kmsg->ikm_header->msgh_bits)) {
-                       ipc_importance_task_t task_imp = task_self->task_imp_base;
                        ipc_port_t port = kmsg->ikm_header->msgh_remote_port;
+#if LEGACY_IMPORTANCE_DELIVERY
+                       ipc_importance_task_t task_imp = task_self->task_imp_base;
 
                        /* The owner of receive right might have changed, take the internal assertion */
                        if (KERN_SUCCESS == ipc_importance_task_hold_internal_assertion(task_imp, 1)) {
                                ipc_importance_task_externalize_legacy_assertion(task_imp, 1, sender_pid);
                                impresult = 1;
-                       } else {
+                       } else
+#endif
+                       {
                                /* The importance boost never applied to task (clear the bit) */
                                kmsg->ikm_header->msgh_bits &= ~MACH_MSGH_BITS_RAISEIMP;
                                impresult = 0;
@@ -3409,7 +3429,7 @@ static void
 ipc_importance_manager_release(
        ipc_voucher_attr_manager_t              manager);
 
-struct ipc_voucher_attr_manager ipc_importance_manager = {
+const struct ipc_voucher_attr_manager ipc_importance_manager = {
        .ivam_release_value =   ipc_importance_release_value,
        .ivam_get_value =       ipc_importance_get_value,
        .ivam_extract_content = ipc_importance_extract_content,
@@ -3792,6 +3812,7 @@ ipc_importance_command(
  *             reference granted back at registration time, and that reference is never
  *             dropped, this should never be called.
  */
+__abortlike
 static void
 ipc_importance_manager_release(
        ipc_voucher_attr_manager_t              __assert_only manager)
index 9f69a6af193c14d2ab2d09f8333a6050013d9cd0..16ca8ed40f5ad3afb5cd46f7311f8ccf2f4d6b0f 100644 (file)
@@ -95,10 +95,10 @@ struct ipc_importance_elem {
 
 #if !IIE_REF_DEBUG
 #define ipc_importance_reference_internal(elem)         \
-       (hw_atomic_add(&(elem)->iie_bits, 1) & IIE_REFS_MASK)
+       (os_atomic_inc(&(elem)->iie_bits, relaxed) & IIE_REFS_MASK)
 
 #define ipc_importance_release_internal(elem)           \
-       (hw_atomic_sub(&(elem)->iie_bits, 1) & IIE_REFS_MASK)
+       (os_atomic_dec(&(elem)->iie_bits, relaxed) & IIE_REFS_MASK)
 #endif
 
 struct ipc_importance_task {
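
[Both ipc_importance.c and this header swap hw_atomic_add/hw_atomic_sub for os_atomic_inc/os_atomic_dec with relaxed ordering. Like the calls they replace, these return the new value, so masking with IIE_REFS_MASK still sees the post-update count. Modeled with C11 atomics; names suffixed _sketch are illustrative:]

#include <stdatomic.h>
#include <stdint.h>

static inline uint32_t
os_atomic_inc_sketch(_Atomic uint32_t *p)
{
        /* fetch_add returns the old value; add 1 to mirror "returns new" */
        return atomic_fetch_add_explicit(p, 1, memory_order_relaxed) + 1;
}

static inline uint32_t
os_atomic_dec_sketch(_Atomic uint32_t *p)
{
        return atomic_fetch_sub_explicit(p, 1, memory_order_relaxed) - 1;
}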
index 4e45ca60e99a902c3b76e4a2b5f2821f2e4ccbfd..ca4bcee84c00aa3cf87f1f07eeb8baa2ed878b99 100644 (file)
@@ -77,6 +77,7 @@
 #include <mach/kern_return.h>
 
 #include <kern/kern_types.h>
+#include <kern/arcade.h>
 #include <kern/kalloc.h>
 #include <kern/simple_lock.h>
 #include <kern/mach_param.h>
@@ -125,6 +126,8 @@ vm_size_t ipc_kmsg_max_body_space = ((IPC_KMSG_MAX_SPACE * 3) / 4 - MAX_TRAILER_
 int ipc_space_max;
 int ipc_port_max;
 int ipc_pset_max;
+int prioritize_launch = 1;
+int enforce_strict_reply = 0;
 
 
 lck_grp_t               ipc_lck_grp;
@@ -143,6 +146,8 @@ void
 ipc_bootstrap(void)
 {
        kern_return_t kr;
+       int prioritize_launch_bootarg;
+       int strict_reply_bootarg;
 
        lck_grp_attr_setdefault(&ipc_lck_grp_attr);
        lck_grp_init(&ipc_lck_grp, "ipc", &ipc_lck_grp_attr);
@@ -171,6 +176,7 @@ ipc_bootstrap(void)
        /* cant charge callers for port allocations (references passed) */
        zone_change(ipc_object_zones[IOT_PORT], Z_CALLERACCT, FALSE);
        zone_change(ipc_object_zones[IOT_PORT], Z_NOENCRYPT, TRUE);
+       zone_change(ipc_object_zones[IOT_PORT], Z_CLEARMEMORY, TRUE);
 
        ipc_object_zones[IOT_PORT_SET] =
            zinit(sizeof(struct ipc_pset),
@@ -178,6 +184,7 @@ ipc_bootstrap(void)
            sizeof(struct ipc_pset),
            "ipc port sets");
        zone_change(ipc_object_zones[IOT_PORT_SET], Z_NOENCRYPT, TRUE);
+       zone_change(ipc_object_zones[IOT_PORT_SET], Z_CLEARMEMORY, TRUE);
 
        /*
         * Create the basic ipc_kmsg_t zone (the one we also cache)
@@ -216,6 +223,17 @@ ipc_bootstrap(void)
        semaphore_init();
        mk_timer_init();
        host_notify_init();
+
+#if CONFIG_ARCADE
+       arcade_init();
+#endif
+
+       if (PE_parse_boot_argn("prioritize_launch", &prioritize_launch_bootarg, sizeof(prioritize_launch_bootarg))) {
+               prioritize_launch = !!prioritize_launch_bootarg;
+       }
+       if (PE_parse_boot_argn("ipc_strict_reply", &strict_reply_bootarg, sizeof(strict_reply_bootarg))) {
+               enforce_strict_reply = !!strict_reply_bootarg;
+       }
 }
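
[ipc_bootstrap() now seeds two policy globals from boot-args; PE_parse_boot_argn() writes the int only when the argument is present, and `!!` collapses any nonzero setting to exactly 1. A sketch of the pattern with a toy parser standing in for the real one, which scans the kernel boot line:]

#include <string.h>

static int prioritize_launch_sk = 1;   /* default: enabled */

/* toy stand-in: pretend the boot line contains "prioritize_launch=7" */
static int
parse_boot_argn_sketch(const char *name, int *out, size_t len)
{
        (void)len;
        if (strcmp(name, "prioritize_launch") == 0) {
                *out = 7;
                return 1;      /* argument present */
        }
        return 0;              /* absent: *out untouched, default kept */
}

static void
bootstrap_sketch(void)
{
        int v;
        if (parse_boot_argn_sketch("prioritize_launch", &v, sizeof(v))) {
                prioritize_launch_sk = !!v;   /* any nonzero -> exactly 1 */
        }
}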
 
 /*
index 803b25bc291d24e79b3901a2e2e1d4ba4712dc0f..f1611fc821036fccebd36c445a33bd5a1ce6daf0 100644 (file)
@@ -96,7 +96,7 @@
 
 #include <pthread/priority_private.h>
 
-#include <machine/machlimits.h>
+#include <machine/limits.h>
 
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
@@ -168,6 +168,7 @@ typedef union{
        mach_msg_legacy_port_descriptor_t                     port;
        mach_msg_ool_descriptor32_t           out_of_line32;
        mach_msg_ool_ports_descriptor32_t     ool_ports32;
+       mach_msg_guarded_port_descriptor32_t  guarded_port32;
        mach_msg_type_descriptor_t                    type;
 } mach_msg_legacy_descriptor_t;
 
@@ -471,7 +472,15 @@ ipc_msg_print_untyped64(
                            dsc->deallocate ? "DEALLOC" : "");
                        break;
                }
+               case MACH_MSG_GUARDED_PORT_DESCRIPTOR: {
+                       mach_msg_guarded_port_descriptor_t *dsc;
 
+                       dsc = (mach_msg_guarded_port_descriptor_t *)&saddr->guarded_port;
+                       kprintf("    GUARDED_PORT name = %p flags = 0x%x disp = ", dsc->name, dsc->flags);
+                       ipc_print_type_name64(dsc->disposition);
+                       kprintf("\n");
+                       break;
+               }
                default: {
                        kprintf("    UNKNOWN DESCRIPTOR 0x%x\n", type);
                        break;
@@ -568,8 +577,9 @@ MACRO_END
 #define KMSG_TRACE_FLAG_TIMER      0x200000
 #define KMSG_TRACE_FLAG_SEMA       0x400000
 #define KMSG_TRACE_FLAG_DTMPOWNER  0x800000
+#define KMSG_TRACE_FLAG_GUARDED_DESC 0x1000000
 
-#define KMSG_TRACE_FLAGS_MASK      0xffffff
+#define KMSG_TRACE_FLAGS_MASK      0x1ffffff
 #define KMSG_TRACE_FLAGS_SHIFT     8
 
 #define KMSG_TRACE_PORTS_MASK      0xff
@@ -577,7 +587,6 @@ MACRO_END
 
 #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
 #include <stdint.h>
-extern boolean_t kdebug_debugid_enabled(uint32_t debugid);
 
 void
 ipc_kmsg_trace_send(ipc_kmsg_t kmsg,
@@ -591,7 +600,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg,
 
        int kotype = 0;
        uint32_t msg_size = 0;
-       uint32_t msg_flags = KMSG_TRACE_FLAG_TRACED;
+       uint64_t msg_flags = KMSG_TRACE_FLAG_TRACED;
        uint32_t num_ports = 0;
        uint32_t send_pid, dst_pid;
 
@@ -610,7 +619,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg,
 
        msg = kmsg->ikm_header;
 
-       dst_port = (ipc_port_t)(msg->msgh_remote_port);
+       dst_port = msg->msgh_remote_port;
        if (!IPC_PORT_VALID(dst_port)) {
                return;
        }
@@ -658,7 +667,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg,
                msg_flags |= KMSG_TRACE_FLAG_SND64;
        }
 
-       src_port = (ipc_port_t)(msg->msgh_local_port);
+       src_port = msg->msgh_local_port;
        if (src_port) {
                if (src_port->ip_messages.imq_qlimit != MACH_PORT_QLIMIT_DEFAULT) {
                        msg_flags |= KMSG_TRACE_FLAG_SRC_NDFLTQ;
@@ -735,6 +744,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg,
        case IKOT_IOKIT_CONNECT:
        case IKOT_IOKIT_OBJECT:
        case IKOT_IOKIT_IDENT:
+       case IKOT_UEXT_OBJECT:
                msg_flags |= KMSG_TRACE_FLAG_IOKIT;
                break;
        default:
@@ -806,6 +816,12 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg,
                                        msg_size -= 16;
                                }
                        } break;
+                       case MACH_MSG_GUARDED_PORT_DESCRIPTOR:
+                               num_ports++;
+                               msg_flags |= KMSG_TRACE_FLAG_GUARDED_DESC;
+                               if (is_task_64bit) {
+                                       msg_size -= 16;
+                               }
                        default:
                                break;
                        }
@@ -818,7 +834,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg,
        trailer = (mach_msg_trailer_t *)((vm_offset_t)msg +
            round_msg((vm_offset_t)msg->msgh_size));
        if (trailer->msgh_trailer_size <= sizeof(mach_msg_security_trailer_t)) {
-               extern security_token_t KERNEL_SECURITY_TOKEN;
+               extern const security_token_t KERNEL_SECURITY_TOKEN;
                mach_msg_security_trailer_t *strailer;
                strailer = (mach_msg_security_trailer_t *)trailer;
                /*
@@ -873,6 +889,31 @@ mach_msg_return_t ipc_kmsg_copyin_body(
        vm_map_t            map,
        mach_msg_option_t   *optionp);
 
+
+extern int enforce_strict_reply;
+
+static void
+ipc_kmsg_link_reply_context_locked(
+       ipc_port_t reply_port,
+       ipc_port_t voucher_port);
+
+static kern_return_t
+ipc_kmsg_validate_reply_port_locked(
+       ipc_port_t reply_port,
+       mach_msg_option_t options);
+
+static mach_msg_return_t
+ipc_kmsg_validate_reply_context_locked(
+       mach_msg_option_t option,
+       ipc_port_t dest_port,
+       ipc_voucher_t voucher,
+       mach_port_name_t voucher_name);
+
+/* we can't include the BSD <sys/persona.h> header here... */
+#ifndef PERSONA_ID_NONE
+#define PERSONA_ID_NONE ((uint32_t)-1)
+#endif
+
 /*
  *     We keep a per-processor cache of kernel message buffers.
  *     The cache saves the overhead/locking of using kalloc/kfree.
@@ -899,7 +940,7 @@ ipc_kmsg_alloc(
        /*
         * LP64support -
         * Pad the allocation in case we need to expand the
-        * message descrptors for user spaces with pointers larger than
+        * message descriptors for user spaces with pointers larger than
         * the kernel's own, or vice versa.  We don't know how many descriptors
         * there are yet, so just assume the whole body could be
         * descriptors (if there could be any at all).
@@ -1298,10 +1339,10 @@ ipc_kmsg_clean_body(
                        /*
                         * Destroy port rights carried in the message
                         */
-                       if (!IO_VALID((ipc_object_t) dsc->name)) {
+                       if (!IP_VALID(dsc->name)) {
                                continue;
                        }
-                       ipc_object_destroy((ipc_object_t) dsc->name, dsc->disposition);
+                       ipc_object_destroy(ip_to_object(dsc->name), dsc->disposition);
                        break;
                }
                case MACH_MSG_OOL_VOLATILE_DESCRIPTOR:
@@ -1354,8 +1395,20 @@ ipc_kmsg_clean_body(
                            (vm_size_t) dsc->count * sizeof(mach_port_t));
                        break;
                }
+               case MACH_MSG_GUARDED_PORT_DESCRIPTOR: {
+                       mach_msg_guarded_port_descriptor_t *dsc = (typeof(dsc)) & saddr->guarded_port;
+
+                       /*
+                        * Destroy port rights carried in the message
+                        */
+                       if (!IP_VALID(dsc->name)) {
+                               continue;
+                       }
+                       ipc_object_destroy(ip_to_object(dsc->name), dsc->disposition);
+                       break;
+               }
                default: {
-                       _ipc_kmsg_clean_invalid_desc++; /* don't understand this type of descriptor */
+                       _ipc_kmsg_clean_invalid_desc++;         /* don't understand this type of descriptor */
                }
                }
        }
@@ -1388,16 +1441,16 @@ ipc_kmsg_clean_partial(
        /* deal with importance chain while we still have dest and voucher references */
        ipc_importance_clean(kmsg);
 
-       object = (ipc_object_t) kmsg->ikm_header->msgh_remote_port;
+       object = ip_to_object(kmsg->ikm_header->msgh_remote_port);
        assert(IO_VALID(object));
        ipc_object_destroy_dest(object, MACH_MSGH_BITS_REMOTE(mbits));
 
-       object = (ipc_object_t) kmsg->ikm_header->msgh_local_port;
+       object = ip_to_object(kmsg->ikm_header->msgh_local_port);
        if (IO_VALID(object)) {
                ipc_object_destroy(object, MACH_MSGH_BITS_LOCAL(mbits));
        }
 
-       object = (ipc_object_t) kmsg->ikm_voucher;
+       object = ip_to_object(kmsg->ikm_voucher);
        if (IO_VALID(object)) {
                assert(MACH_MSGH_BITS_VOUCHER(mbits) == MACH_MSG_TYPE_MOVE_SEND);
                ipc_object_destroy(object, MACH_MSG_TYPE_PORT_SEND);
@@ -1431,17 +1484,17 @@ ipc_kmsg_clean(
        ipc_importance_clean(kmsg);
 
        mbits = kmsg->ikm_header->msgh_bits;
-       object = (ipc_object_t) kmsg->ikm_header->msgh_remote_port;
+       object = ip_to_object(kmsg->ikm_header->msgh_remote_port);
        if (IO_VALID(object)) {
                ipc_object_destroy_dest(object, MACH_MSGH_BITS_REMOTE(mbits));
        }
 
-       object = (ipc_object_t) kmsg->ikm_header->msgh_local_port;
+       object = ip_to_object(kmsg->ikm_header->msgh_local_port);
        if (IO_VALID(object)) {
                ipc_object_destroy(object, MACH_MSGH_BITS_LOCAL(mbits));
        }
 
-       object = (ipc_object_t) kmsg->ikm_voucher;
+       object = ip_to_object(kmsg->ikm_voucher);
        if (IO_VALID(object)) {
                assert(MACH_MSGH_BITS_VOUCHER(mbits) == MACH_MSG_TYPE_MOVE_SEND);
                ipc_object_destroy(object, MACH_MSG_TYPE_PORT_SEND);
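
[The ipc_kmsg_clean* hunks replace bare `(ipc_object_t)` casts with ip_to_object()/ip_object_to_port(). A sketch of why typed converters beat raw casts, assuming (as in XNU) that the port embeds its object header as the first member; the struct shapes here are illustrative:]

typedef struct io_sketch { unsigned io_bits; } *io_sketch_t;
typedef struct port_sketch { struct io_sketch ip_object; } *port_sketch_t;

/* The compiler now checks the argument type; a stray cast from an
 * unrelated pointer no longer compiles silently. */
static inline io_sketch_t
ip_to_object_sketch(port_sketch_t port)
{
        return &port->ip_object;
}

static inline port_sketch_t
ip_object_to_port_sketch(io_sketch_t io)
{
        return (port_sketch_t)(void *)io;   /* object is first member */
}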
@@ -1688,7 +1741,7 @@ ipc_kmsg_get_from_kernel(
        assert(size >= sizeof(mach_msg_header_t));
        assert((size & 3) == 0);
 
-       dest_port = (ipc_port_t)msg->msgh_remote_port;
+       dest_port = msg->msgh_remote_port;
 
        msg_and_trailer_size = size + MAX_TRAILER_SIZE;
 
@@ -1812,10 +1865,26 @@ ipc_kmsg_send(
 
        ipc_voucher_send_preprocessing(kmsg);
 
-       port = (ipc_port_t) kmsg->ikm_header->msgh_remote_port;
+       port = kmsg->ikm_header->msgh_remote_port;
        assert(IP_VALID(port));
        ip_lock(port);
 
+       /*
+        * If the destination has been guarded with a reply context, and the
+        * sender is consuming a send-once right, then assume this is a reply
+        * to an RPC and we need to validate that this sender is currently in
+        * the correct context.
+        */
+       if (enforce_strict_reply && port->ip_reply_context != 0 &&
+           ((option & MACH_SEND_KERNEL) == 0) &&
+           MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits) == MACH_MSG_TYPE_PORT_SEND_ONCE) {
+               error = ipc_kmsg_validate_reply_context_locked(option, port, th->ith_voucher, th->ith_voucher_name);
+               if (error != MACH_MSG_SUCCESS) {
+                       ip_unlock(port);
+                       return error;
+               }
+       }
+
 #if IMPORTANCE_INHERITANCE
 retry:
 #endif /* IMPORTANCE_INHERITANCE */
@@ -1856,7 +1925,7 @@ retry:
                 *      ipc_port_dealloc_kernel clears ip_receiver
                 *      before destroying a kernel port.
                 */
-               assert(ip_active(port));
+               require_ip_active(port);
                port->ip_messages.imq_seqno++;
                ip_unlock(port);
 
@@ -1872,7 +1941,7 @@ retry:
 
                /* restart the KMSG_INFO tracing for the reply message */
                KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_INFO) | DBG_FUNC_START);
-               port = (ipc_port_t) kmsg->ikm_header->msgh_remote_port;
+               port = kmsg->ikm_header->msgh_remote_port;
                assert(IP_VALID(port));
                ip_lock(port);
                /* fall thru with reply - same options */
@@ -1906,7 +1975,7 @@ retry:
                 */
                imq_lock(&port->ip_messages);
 
-               set_ip_srp_msg_sent(port);
+               ipc_special_reply_port_msg_sent(port);
 
                ip_unlock(port);
 
@@ -2159,19 +2228,207 @@ ipc_kmsg_set_qos(
        }
 
        kr = KERN_SUCCESS;
-       if ((options & MACH_SEND_SYNC_OVERRIDE)) {
-               if (IP_VALID(special_reply_port) &&
-                   MACH_MSGH_BITS_LOCAL(kmsg->ikm_header->msgh_bits) == MACH_MSG_TYPE_PORT_SEND_ONCE) {
+
+       if (IP_VALID(special_reply_port) &&
+           MACH_MSGH_BITS_LOCAL(kmsg->ikm_header->msgh_bits) == MACH_MSG_TYPE_PORT_SEND_ONCE) {
+               if ((options & MACH_SEND_SYNC_OVERRIDE)) {
+                       boolean_t sync_bootstrap_checkin = !!(options & MACH_SEND_SYNC_BOOTSTRAP_CHECKIN);
                        /*
                         * Link the destination port to special reply port and make sure that
                         * dest port has a send turnstile, else allocate one.
                         */
-                       ipc_port_link_special_reply_port(special_reply_port, dest_port);
+                       ipc_port_link_special_reply_port(special_reply_port, dest_port, sync_bootstrap_checkin);
                }
        }
        return kr;
 }
 
+static inline void
+ipc_kmsg_allow_immovable_send(
+       ipc_kmsg_t   kmsg,
+       ipc_entry_t  dest_entry)
+{
+       ipc_object_t object = dest_entry->ie_object;
+       /*
+        *      If the dest port is a kobject, allow copyin of immovable send
+        *      rights in the message body to succeed
+        */
+       if (IO_VALID(object) && io_is_kobject(object)) {
+               kmsg->ikm_flags |= IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND;
+       }
+}
+
+/*
+ *     Routine:        ipc_kmsg_link_reply_context_locked
+ *     Purpose:
+ *             Link any required context from the sending voucher
+ *             to the reply port. The ipc_kmsg_copyin function will
+ *             enforce that the sender calls mach_msg in this context.
+ *     Conditions:
+ *             reply port is locked
+ */
+static void
+ipc_kmsg_link_reply_context_locked(
+       ipc_port_t reply_port,
+       ipc_port_t voucher_port)
+{
+       kern_return_t __assert_only kr;
+       uint32_t persona_id = 0;
+       ipc_voucher_t voucher;
+
+       ip_lock_held(reply_port);
+
+       if (!ip_active(reply_port)) {
+               return;
+       }
+
+       voucher = convert_port_to_voucher(voucher_port);
+
+       kr = bank_get_bank_ledger_thread_group_and_persona(voucher, NULL, NULL, &persona_id);
+       assert(kr == KERN_SUCCESS);
+       ipc_voucher_release(voucher);
+
+       if (persona_id == 0 || persona_id == PERSONA_ID_NONE) {
+               /* there was no persona context to record */
+               return;
+       }
+
+       /*
+        * Set the persona_id as the context on the reply port.
+        * This will force the thread that replies to have adopted a voucher
+        * with a matching persona.
+        */
+       reply_port->ip_reply_context = persona_id;
+
+       return;
+}
+
+static kern_return_t
+ipc_kmsg_validate_reply_port_locked(ipc_port_t reply_port, mach_msg_option_t options)
+{
+       ip_lock_held(reply_port);
+
+       if (!ip_active(reply_port)) {
+               /*
+                * Ideally, we would enforce that the reply receive right is
+                * active, but asynchronous XPC cancellation destroys the
+                * receive right, so we just have to return success here.
+                */
+               return KERN_SUCCESS;
+       }
+
+       if (options & MACH_SEND_MSG) {
+               /*
+                * If the reply port is active, then it should not be
+                * in-transit, and the receive right should be in the caller's
+                * IPC space.
+                */
+               if (!reply_port->ip_receiver_name || reply_port->ip_receiver != current_task()->itk_space) {
+                       return KERN_INVALID_CAPABILITY;
+               }
+
+               /*
+                * A port used as a reply port in an RPC should have exactly 1
+                * extant send-once right which we either just made or are
+                * moving as part of the IPC.
+                */
+               if (reply_port->ip_sorights != 1) {
+                       return KERN_INVALID_CAPABILITY;
+               }
+               /*
+                * XPC uses an extra send-right to keep the name of the reply
+                * right around through cancellation.  That makes it harder to
+                * enforce a particular semantic here, so for now, we say that
+                * you can have a maximum of 1 send right (in addition to your
+                * send once right). In the future, it would be great to lock
+                * this down even further.
+                */
+               if (reply_port->ip_srights > 1) {
+                       return KERN_INVALID_CAPABILITY;
+               }
+
+               /*
+                * The sender can also specify that the receive right should
+                * be immovable. Note that this check only applies to
+                * send-only operations. Combined send/receive or rcv-only
+                * operations can specify an immovable receive right by
+                * opting into guarded descriptors (MACH_RCV_GUARDED_DESC)
+                * and using the MACH_MSG_STRICT_REPLY options flag.
+                */
+               if (MACH_SEND_REPLY_IS_IMMOVABLE(options)) {
+                       if (!reply_port->ip_immovable_receive) {
+                               return KERN_INVALID_CAPABILITY;
+                       }
+               }
+       }
+
+       /*
+        * don't enforce this yet: need a better way of indicating the
+        * receiver wants this...
+        */
+#if 0
+       if (MACH_RCV_WITH_IMMOVABLE_REPLY(options)) {
+               if (!reply_port->ip_immovable_receive) {
+                       return KERN_INVALID_CAPABILITY;
+               }
+       }
+#endif /* 0  */
+
+       return KERN_SUCCESS;
+}
+
+/*
+ *     Routine:        ipc_kmsg_validate_reply_context_locked
+ *     Purpose:
+ *             Validate that the current thread is running in the context
+ *             required by the destination port.
+ *     Conditions:
+ *             dest_port is locked
+ *     Returns:
+ *             MACH_MSG_SUCCESS on success.
+ *             On error, an EXC_GUARD exception is also raised.
+ *             This function *always* resets the port reply context.
+ */
+static mach_msg_return_t
+ipc_kmsg_validate_reply_context_locked(
+       mach_msg_option_t option,
+       ipc_port_t dest_port,
+       ipc_voucher_t voucher,
+       mach_port_name_t voucher_name)
+{
+       uint32_t dest_ctx = dest_port->ip_reply_context;
+       dest_port->ip_reply_context = 0;
+
+       if (!ip_active(dest_port)) {
+               return MACH_MSG_SUCCESS;
+       }
+
+       if (voucher == IPC_VOUCHER_NULL || !MACH_PORT_VALID(voucher_name)) {
+               if ((option & MACH_SEND_KERNEL) == 0) {
+                       mach_port_guard_exception(voucher_name, 0,
+                           (MPG_FLAGS_STRICT_REPLY_INVALID_VOUCHER | dest_ctx),
+                           kGUARD_EXC_STRICT_REPLY);
+               }
+               return MACH_SEND_INVALID_CONTEXT;
+       }
+
+       kern_return_t __assert_only kr;
+       uint32_t persona_id = 0;
+       kr = bank_get_bank_ledger_thread_group_and_persona(voucher, NULL, NULL, &persona_id);
+       assert(kr == KERN_SUCCESS);
+
+       if (dest_ctx != persona_id) {
+               if ((option & MACH_SEND_KERNEL) == 0) {
+                       mach_port_guard_exception(voucher_name, 0,
+                           (MPG_FLAGS_STRICT_REPLY_MISMATCHED_PERSONA | ((((uint64_t)persona_id << 32) & MPG_FLAGS_STRICT_REPLY_MASK) | dest_ctx)),
+                           kGUARD_EXC_STRICT_REPLY);
+               }
+               return MACH_SEND_INVALID_CONTEXT;
+       }
+
+       return MACH_MSG_SUCCESS;
+}
+
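
[Condensed logic of ipc_kmsg_validate_reply_context_locked() above: the destination port remembers the persona id recorded at link time, the check is one-shot (the context is cleared whether or not it matches), and the reply only goes through when the replying thread's voucher carries the same persona. A sketch with the voucher lookup abstracted to a plain integer:]

#include <stdbool.h>
#include <stdint.h>

static bool
reply_context_ok_sketch(uint32_t *port_reply_ctx, uint32_t voucher_persona)
{
        uint32_t want = *port_reply_ctx;   /* caller checked this is nonzero */
        *port_reply_ctx = 0;               /* one-shot: always reset */
        return want == voucher_persona;    /* else MACH_SEND_INVALID_CONTEXT */
}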
 /*
  *     Routine:        ipc_kmsg_copyin_header
  *     Purpose:
@@ -2283,6 +2540,23 @@ ipc_kmsg_copyin_header(
                }
        }
 
+       if (enforce_strict_reply && MACH_SEND_WITH_STRICT_REPLY(*optionp) &&
+           (!MACH_PORT_VALID(reply_name) ||
+           ((reply_type != MACH_MSG_TYPE_MAKE_SEND_ONCE) && (reply_type != MACH_MSG_TYPE_MOVE_SEND_ONCE))
+           )) {
+               /*
+                * The caller cannot enforce a reply context with an invalid
+                * reply port name, or a non-send_once reply disposition.
+                */
+               is_write_unlock(space);
+               if ((*optionp & MACH_SEND_KERNEL) == 0) {
+                       mach_port_guard_exception(reply_name, 0,
+                           (MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_DISP | reply_type),
+                           kGUARD_EXC_STRICT_REPLY);
+               }
+               return MACH_SEND_INVALID_REPLY;
+       }
+
        /*
         *      Handle combinations of validating destination and reply; along
         *      with copying in destination, reply, and voucher in an atomic way.
@@ -2298,6 +2572,8 @@ ipc_kmsg_copyin_header(
                if (dest_entry == IE_NULL) {
                        goto invalid_dest;
                }
+               /* Check if dest port allows immovable send rights to be sent in the kmsg body */
+               ipc_kmsg_allow_immovable_send(kmsg, dest_entry);
 
                /*
                 *      Make sure a future copyin of the reply port will succeed.
@@ -2316,7 +2592,7 @@ ipc_kmsg_copyin_header(
                                goto invalid_reply;
                        }
                        assert(dest_entry != reply_entry); /* names are not equal */
-                       if (!ipc_right_copyin_check(space, reply_name, reply_entry, reply_type)) {
+                       if (!ipc_right_copyin_check_reply(space, reply_name, reply_entry, reply_type)) {
                                goto invalid_reply;
                        }
                }
@@ -2329,14 +2605,13 @@ ipc_kmsg_copyin_header(
                 *      the copyins can be blamed on the destination.
                 */
                kr = ipc_right_copyin_two(space, dest_name, dest_entry,
-                   dest_type, voucher_type,
-                   &dest_port, &dest_soright,
+                   dest_type, voucher_type, &dest_port, &dest_soright,
                    &release_port);
                if (kr != KERN_SUCCESS) {
                        assert(kr != KERN_INVALID_CAPABILITY);
                        goto invalid_dest;
                }
-               voucher_port = (ipc_port_t)dest_port;
+               voucher_port = ip_object_to_port(dest_port);
 
                /*
                 * could not have been one of these dispositions,
@@ -2354,7 +2629,7 @@ ipc_kmsg_copyin_header(
                        kr = ipc_right_copyin(space, reply_name, reply_entry,
                            reply_type, IPC_RIGHT_COPYIN_FLAGS_DEADOK,
                            &reply_port, &reply_soright,
-                           &release_port, &assertcnt);
+                           &release_port, &assertcnt, 0, NULL);
                        assert(assertcnt == 0);
                        assert(kr == KERN_SUCCESS);
                }
@@ -2371,16 +2646,24 @@ ipc_kmsg_copyin_header(
                        if (dest_entry == IE_NULL) {
                                goto invalid_dest;
                        }
+                       ipc_kmsg_allow_immovable_send(kmsg, dest_entry);
+
                        reply_entry = dest_entry;
                        assert(reply_type != 0); /* because name not null */
 
+                       /*
+                        *      Pre-validate that the reply right can be copied in by itself
+                        */
+                       if (!ipc_right_copyin_check_reply(space, reply_name, reply_entry, reply_type)) {
+                               goto invalid_reply;
+                       }
+
                        /*
                         *      Do the joint copyin of the dest disposition and
                         *      reply disposition from the one entry/port.
                         */
                        kr = ipc_right_copyin_two(space, dest_name, dest_entry,
-                           dest_type, reply_type,
-                           &dest_port, &dest_soright,
+                           dest_type, reply_type, &dest_port, &dest_soright,
                            &release_port);
                        if (kr == KERN_INVALID_CAPABILITY) {
                                goto invalid_reply;
@@ -2420,6 +2703,7 @@ ipc_kmsg_copyin_header(
                                goto invalid_dest;
                        }
                        assert(dest_entry != voucher_entry);
+                       ipc_kmsg_allow_immovable_send(kmsg, dest_entry);
 
                        /*
                         *      Make sure reply port entry is valid before dest copyin.
@@ -2435,7 +2719,7 @@ ipc_kmsg_copyin_header(
                                assert(dest_entry != reply_entry); /* names are not equal */
                                assert(reply_type != 0); /* because reply_name not null */
 
-                               if (!ipc_right_copyin_check(space, reply_name, reply_entry, reply_type)) {
+                               if (!ipc_right_copyin_check_reply(space, reply_name, reply_entry, reply_type)) {
                                        goto invalid_reply;
                                }
                        }
@@ -2444,9 +2728,10 @@ ipc_kmsg_copyin_header(
                         *      copyin the destination.
                         */
                        kr = ipc_right_copyin(space, dest_name, dest_entry,
-                           dest_type, IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE,
+                           dest_type, (IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND |
+                           IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE),
                            &dest_port, &dest_soright,
-                           &release_port, &assertcnt);
+                           &release_port, &assertcnt, 0, NULL);
                        assert(assertcnt == 0);
                        if (kr != KERN_SUCCESS) {
                                goto invalid_dest;
@@ -2462,12 +2747,12 @@ ipc_kmsg_copyin_header(
                                kr = ipc_right_copyin(space, reply_name, reply_entry,
                                    reply_type, IPC_RIGHT_COPYIN_FLAGS_DEADOK,
                                    &reply_port, &reply_soright,
-                                   &release_port, &assertcnt);
+                                   &release_port, &assertcnt, 0, NULL);
                                assert(assertcnt == 0);
                                assert(kr == KERN_SUCCESS);
                        } else {
                                /* convert invalid name to equivalent ipc_object type */
-                               reply_port = (ipc_object_t)CAST_MACH_NAME_TO_PORT(reply_name);
+                               reply_port = ip_to_object(CAST_MACH_NAME_TO_PORT(reply_name));
                        }
                }
 
@@ -2481,11 +2766,11 @@ ipc_kmsg_copyin_header(
                            (ipc_object_t *)&voucher_port,
                            &voucher_soright,
                            &voucher_release_port,
-                           &assertcnt);
+                           &assertcnt, 0, NULL);
                        assert(assertcnt == 0);
                        assert(KERN_SUCCESS == kr);
                        assert(IP_VALID(voucher_port));
-                       assert(ip_active(voucher_port));
+                       require_ip_active(voucher_port);
                }
        }
 
@@ -2538,7 +2823,7 @@ ipc_kmsg_copyin_header(
        if (((*optionp & MACH_SEND_NOTIFY) != 0) &&
            dest_type != MACH_MSG_TYPE_PORT_SEND_ONCE &&
            dest_entry != IE_NULL && dest_entry->ie_request != IE_REQ_NONE) {
-               ipc_port_t dport = (ipc_port_t)dest_port;
+               ipc_port_t dport = ip_object_to_port(dest_port);
 
                assert(dport != IP_NULL);
                ip_lock(dport);
@@ -2577,7 +2862,7 @@ ipc_kmsg_copyin_header(
         * destination port.
         */
        if (needboost == TRUE) {
-               ipc_port_t dport = (ipc_port_t)dest_port;
+               ipc_port_t dport = ip_object_to_port(dest_port);
 
                /* dport still locked from above */
                if (ipc_port_importance_delta(dport, IPID_OPTION_SENDPOSSIBLE, 1) == FALSE) {
@@ -2607,8 +2892,8 @@ ipc_kmsg_copyin_header(
        }
 
        msg->msgh_bits = MACH_MSGH_BITS_SET(dest_type, reply_type, voucher_type, mbits);
-       msg->msgh_remote_port = (ipc_port_t)dest_port;
-       msg->msgh_local_port = (ipc_port_t)reply_port;
+       msg->msgh_remote_port = ip_object_to_port(dest_port);
+       msg->msgh_local_port = ip_object_to_port(reply_port);
 
        /* capture the qos value(s) for the kmsg */
        ipc_kmsg_set_qos(kmsg, *optionp, override);
@@ -2621,6 +2906,37 @@ ipc_kmsg_copyin_header(
                ip_release(voucher_release_port);
        }
 
+       if (enforce_strict_reply && MACH_SEND_WITH_STRICT_REPLY(*optionp) && IP_VALID(msg->msgh_local_port)) {
+               /*
+                * We've already validated that the reply disposition is a
+                * [make/move] send-once. Ideally, we should enforce that the
+                * reply port is also not dead, but XPC asynchronous
+                * cancellation can make the reply port dead before we
+                * actually make it to the mach_msg send.
+                *
+                * Here, we ensure that if we have a non-dead reply port, then
+                * the reply port's receive right should not be in-transit,
+                * and should live in the caller's IPC space.
+                */
+               ipc_port_t rport = msg->msgh_local_port;
+               ip_lock(rport);
+               kr = ipc_kmsg_validate_reply_port_locked(rport, *optionp);
+               ip_unlock(rport);
+               if (kr != KERN_SUCCESS) {
+                       /*
+                        * no descriptors have been copied in yet, but the
+                        * full header has been copied in: clean it up
+                        */
+                       ipc_kmsg_clean_partial(kmsg, 0, NULL, 0, 0);
+                       if ((*optionp & MACH_SEND_KERNEL) == 0) {
+                               mach_port_guard_exception(reply_name, 0,
+                                   (MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_PORT | kr),
+                                   kGUARD_EXC_STRICT_REPLY);
+                       }
+                       return MACH_SEND_INVALID_REPLY;
+               }
+       }
+
        return MACH_MSG_SUCCESS;
 
 invalid_reply:
@@ -2655,19 +2971,7 @@ invalid_dest:
        return MACH_SEND_INVALID_DEST;
 }
 
-mach_msg_descriptor_t *ipc_kmsg_copyin_port_descriptor(
-       volatile mach_msg_port_descriptor_t *dsc,
-       mach_msg_legacy_port_descriptor_t *user_dsc,
-       ipc_space_t space,
-       ipc_object_t dest,
-       ipc_kmsg_t kmsg,
-       mach_msg_option_t *optionp,
-       mach_msg_return_t *mr);
-
-void ipc_print_type_name(
-       int type_name);
-
-mach_msg_descriptor_t *
+static mach_msg_descriptor_t *
 ipc_kmsg_copyin_port_descriptor(
        volatile mach_msg_port_descriptor_t *dsc,
        mach_msg_legacy_port_descriptor_t *user_dsc_in,
@@ -2688,9 +2992,9 @@ ipc_kmsg_copyin_port_descriptor(
 
        name = (mach_port_name_t)user_dsc->name;
        if (MACH_PORT_VALID(name)) {
-               kern_return_t kr = ipc_object_copyin(space, name, user_disp, &object);
+               kern_return_t kr = ipc_object_copyin(space, name, user_disp, &object, 0, NULL, kmsg->ikm_flags);
                if (kr != KERN_SUCCESS) {
-                       if ((*optionp & MACH_SEND_KERNEL) == 0) {
+                       if (((*optionp & MACH_SEND_KERNEL) == 0) && (kr == KERN_INVALID_RIGHT)) {
                                mach_port_guard_exception(name, 0, 0, kGUARD_EXC_SEND_INVALID_RIGHT);
                        }
                        *mr = MACH_SEND_INVALID_RIGHT;
@@ -2698,34 +3002,23 @@ ipc_kmsg_copyin_port_descriptor(
                }
 
                if ((result_disp == MACH_MSG_TYPE_PORT_RECEIVE) &&
-                   ipc_port_check_circularity((ipc_port_t) object,
-                   (ipc_port_t) dest)) {
+                   ipc_port_check_circularity(ip_object_to_port(object),
+                   ip_object_to_port(dest))) {
                        kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR;
                }
-               dsc->name = (ipc_port_t) object;
+               dsc->name = ip_object_to_port(object);
        } else {
                dsc->name = CAST_MACH_NAME_TO_PORT(name);
        }
        dsc->disposition = result_disp;
        dsc->type = MACH_MSG_PORT_DESCRIPTOR;
 
-       dsc->pad_end = 0; // debug, unnecessary
+       dsc->pad_end = 0;         // debug, unnecessary
 
        return (mach_msg_descriptor_t *)(user_dsc_in + 1);
 }
 
-mach_msg_descriptor_t * ipc_kmsg_copyin_ool_descriptor(
-       mach_msg_ool_descriptor_t *dsc,
-       mach_msg_descriptor_t *user_dsc,
-       int is_64bit,
-       vm_offset_t *paddr,
-       vm_map_copy_t *copy,
-       vm_size_t *space_needed,
-       vm_map_t map,
-       mach_msg_option_t *optionp,
-       mach_msg_return_t *mr);
-
-mach_msg_descriptor_t *
+static mach_msg_descriptor_t *
 ipc_kmsg_copyin_ool_descriptor(
        mach_msg_ool_descriptor_t *dsc,
        mach_msg_descriptor_t *user_dsc,
@@ -2828,18 +3121,7 @@ ipc_kmsg_copyin_ool_descriptor(
        return user_dsc;
 }
 
-mach_msg_descriptor_t * ipc_kmsg_copyin_ool_ports_descriptor(
-       mach_msg_ool_ports_descriptor_t *dsc,
-       mach_msg_descriptor_t *user_dsc,
-       int is_64bit,
-       vm_map_t map,
-       ipc_space_t space,
-       ipc_object_t dest,
-       ipc_kmsg_t kmsg,
-       mach_msg_option_t *optionp,
-       mach_msg_return_t *mr);
-
-mach_msg_descriptor_t *
+static mach_msg_descriptor_t *
 ipc_kmsg_copyin_ool_ports_descriptor(
        mach_msg_ool_ports_descriptor_t *dsc,
        mach_msg_descriptor_t *user_dsc,
@@ -2950,11 +3232,11 @@ ipc_kmsg_copyin_ool_ports_descriptor(
                ipc_object_t object;
 
                if (!MACH_PORT_VALID(name)) {
-                       objects[i] = (ipc_object_t)CAST_MACH_NAME_TO_PORT(name);
+                       objects[i] = ip_to_object(CAST_MACH_NAME_TO_PORT(name));
                        continue;
                }
 
-               kern_return_t kr = ipc_object_copyin(space, name, user_disp, &object);
+               kern_return_t kr = ipc_object_copyin(space, name, user_disp, &object, 0, NULL, kmsg->ikm_flags);
 
                if (kr != KERN_SUCCESS) {
                        unsigned int j;
@@ -2967,7 +3249,7 @@ ipc_kmsg_copyin_ool_ports_descriptor(
                        }
                        kfree(data, ports_length);
                        dsc->address = NULL;
-                       if ((*optionp & MACH_SEND_KERNEL) == 0) {
+                       if (((*optionp & MACH_SEND_KERNEL) == 0) && (kr == KERN_INVALID_RIGHT)) {
                                mach_port_guard_exception(name, 0, 0, kGUARD_EXC_SEND_INVALID_RIGHT);
                        }
                        *mr = MACH_SEND_INVALID_RIGHT;
@@ -2975,9 +3257,8 @@ ipc_kmsg_copyin_ool_ports_descriptor(
                }
 
                if ((dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) &&
-                   ipc_port_check_circularity(
-                           (ipc_port_t) object,
-                           (ipc_port_t) dest)) {
+                   ipc_port_check_circularity(ip_object_to_port(object),
+                   ip_object_to_port(dest))) {
                        kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR;
                }
 
@@ -2987,6 +3268,74 @@ ipc_kmsg_copyin_ool_ports_descriptor(
        return user_dsc;
 }
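
Before the new copyin routine below, a hedged user-space sketch of what it consumes: a complex message whose body carries one guarded port descriptor. `dest` and `recv_right` are assumed to exist; the field names follow the ones the kernel reads on copyin, and the UNGUARDED_ON_SEND / zero-context combination is one the validation in ipc_kmsg_copyin_body accepts:

    struct {
        mach_msg_header_t                  header;
        mach_msg_body_t                    body;
        mach_msg_guarded_port_descriptor_t desc;
    } msg = { };

    msg.header.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, 0, 0,
        MACH_MSGH_BITS_COMPLEX);
    msg.header.msgh_size        = sizeof(msg);
    msg.header.msgh_remote_port = dest;                 /* assumed send right */
    msg.body.msgh_descriptor_count = 1;

    msg.desc.name        = recv_right;                  /* assumed receive right */
    msg.desc.disposition = MACH_MSG_TYPE_MOVE_RECEIVE;  /* only disposition accepted for now */
    msg.desc.type        = MACH_MSG_GUARDED_PORT_DESCRIPTOR;
    msg.desc.flags       = MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND;
    msg.desc.context     = 0;                           /* must be 0 with UNGUARDED_ON_SEND */

    mach_msg(&msg.header, MACH_SEND_MSG, msg.header.msgh_size,
        0, MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
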
 
+static mach_msg_descriptor_t *
+ipc_kmsg_copyin_guarded_port_descriptor(
+       mach_msg_guarded_port_descriptor_t *dsc,
+       mach_msg_descriptor_t *user_addr,
+       int is_64bit,
+       ipc_space_t space,
+       ipc_object_t dest,
+       ipc_kmsg_t kmsg,
+       mach_msg_option_t *optionp,
+       mach_msg_return_t *mr)
+{
+       mach_msg_descriptor_t       *user_dsc;
+       mach_msg_type_name_t        disp;
+       mach_msg_type_name_t        result_disp;
+       mach_port_name_t            name;
+       mach_msg_guard_flags_t      guard_flags;
+       ipc_object_t                object;
+       mach_port_context_t         context;
+
+       if (!is_64bit) {
+               mach_msg_guarded_port_descriptor32_t *user_gp_dsc = (typeof(user_gp_dsc))user_addr;
+               name = user_gp_dsc->name;
+               guard_flags = user_gp_dsc->flags;
+               disp = user_gp_dsc->disposition;
+               context = user_gp_dsc->context;
+               user_dsc = (mach_msg_descriptor_t *)(user_gp_dsc + 1);
+       } else {
+               mach_msg_guarded_port_descriptor64_t *user_gp_dsc = (typeof(user_gp_dsc))user_addr;
+               name = user_gp_dsc->name;
+               guard_flags = user_gp_dsc->flags;
+               disp = user_gp_dsc->disposition;
+               context = user_gp_dsc->context;
+               user_dsc = (mach_msg_descriptor_t *)(user_gp_dsc + 1);
+       }
+
+       guard_flags &= MACH_MSG_GUARD_FLAGS_MASK;
+       result_disp = ipc_object_copyin_type(disp);
+
+       if (MACH_PORT_VALID(name)) {
+               kern_return_t kr = ipc_object_copyin(space, name, disp, &object, context, &guard_flags, kmsg->ikm_flags);
+               if (kr != KERN_SUCCESS) {
+                       if (((*optionp & MACH_SEND_KERNEL) == 0) && (kr == KERN_INVALID_RIGHT)) {
+                               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_SEND_INVALID_RIGHT);
+                       }
+                       *mr = MACH_SEND_INVALID_RIGHT;
+                       return NULL;
+               }
+
+               if ((result_disp == MACH_MSG_TYPE_PORT_RECEIVE) &&
+                   ipc_port_check_circularity(ip_object_to_port(object),
+                   ip_object_to_port(dest))) {
+                       kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR;
+               }
+               dsc->name = ip_object_to_port(object);
+       } else {
+               dsc->name = CAST_MACH_NAME_TO_PORT(name);
+       }
+       dsc->flags = guard_flags;
+       dsc->disposition = result_disp;
+       dsc->type = MACH_MSG_GUARDED_PORT_DESCRIPTOR;
+
+#if __LP64__
+       dsc->pad_end = 0;         // debug, unnecessary
+#endif
+       return user_dsc;
+}
+
+
 /*
  *     Routine:        ipc_kmsg_copyin_body
  *     Purpose:
@@ -3007,6 +3356,7 @@ ipc_kmsg_copyin_ool_ports_descriptor(
  *             MACH_SEND_MSG_TOO_SMALL Body is too small for types/data.
  *             MACH_SEND_INVALID_RT_OOL_SIZE OOL Buffer too large for RT
  *             MACH_MSG_INVALID_RT_DESCRIPTOR Dealloc and RT are incompatible
+ *             MACH_SEND_NO_GRANT_DEST Dest port doesn't accept ports in body
  */
 
 mach_msg_return_t
@@ -3018,27 +3368,33 @@ ipc_kmsg_copyin_body(
 {
        ipc_object_t                dest;
        mach_msg_body_t             *body;
-       mach_msg_descriptor_t       *daddr, *naddr;
+       mach_msg_descriptor_t       *daddr, *naddr, *end;
        mach_msg_descriptor_t       *user_addr, *kern_addr;
        mach_msg_type_number_t      dsc_count;
        boolean_t                   is_task_64bit = (map->max_offset > VM_MAX_ADDRESS);
        boolean_t                   complex = FALSE;
+       boolean_t                   contains_port_desc = FALSE;
        vm_size_t                   space_needed = 0;
        vm_offset_t                 paddr = 0;
        vm_map_copy_t               copy = VM_MAP_COPY_NULL;
        mach_msg_type_number_t      i;
        mach_msg_return_t           mr = MACH_MSG_SUCCESS;
+       ipc_port_t                  remote_port = kmsg->ikm_header->msgh_remote_port;
 
        vm_size_t           descriptor_size = 0;
 
        mach_msg_type_number_t total_ool_port_count = 0;
+       mach_msg_guard_flags_t guard_flags = 0;
+       mach_port_context_t context;
+       mach_msg_type_name_t disp;
 
        /*
         * Determine if the target is a kernel port.
         */
-       dest = (ipc_object_t) kmsg->ikm_header->msgh_remote_port;
+       dest = ip_to_object(remote_port);
        body = (mach_msg_body_t *) (kmsg->ikm_header + 1);
        naddr = (mach_msg_descriptor_t *) (body + 1);
+       end = (mach_msg_descriptor_t *) ((vm_offset_t)kmsg->ikm_header + kmsg->ikm_header->msgh_size);
 
        dsc_count = body->msgh_descriptor_count;
        if (dsc_count == 0) {
@@ -3059,10 +3415,16 @@ ipc_kmsg_copyin_body(
 
                /* make sure the descriptor fits in the message */
                if (is_task_64bit) {
+                       if ((mach_msg_descriptor_t*)((vm_offset_t)daddr + 12) > end) {
+                               mr = MACH_SEND_MSG_TOO_SMALL;
+                               goto clean_message;
+                       }
+
                        switch (daddr->type.type) {
                        case MACH_MSG_OOL_DESCRIPTOR:
                        case MACH_MSG_OOL_VOLATILE_DESCRIPTOR:
                        case MACH_MSG_OOL_PORTS_DESCRIPTOR:
+                       case MACH_MSG_GUARDED_PORT_DESCRIPTOR:
                                descriptor_size += 16;
                                naddr = (typeof(naddr))((vm_offset_t)daddr + 16);
                                break;
@@ -3076,8 +3438,7 @@ ipc_kmsg_copyin_body(
                        naddr = (typeof(naddr))((vm_offset_t)daddr + 12);
                }
 
-               if (naddr > (mach_msg_descriptor_t *)
-                   ((vm_offset_t)kmsg->ikm_header + kmsg->ikm_header->msgh_size)) {
+               if (naddr > end) {
                        mr = MACH_SEND_MSG_TOO_SMALL;
                        goto clean_message;
                }
@@ -3125,6 +3486,7 @@ ipc_kmsg_copyin_body(
                                mr = MACH_SEND_TOO_LARGE;
                                goto clean_message;
                        }
+                       contains_port_desc = TRUE;
                        break;
                case MACH_MSG_OOL_PORTS_DESCRIPTOR:
                        ool_port_count = (is_task_64bit) ?
@@ -3142,6 +3504,35 @@ ipc_kmsg_copyin_body(
                                mr = MACH_SEND_TOO_LARGE;
                                goto clean_message;
                        }
+                       contains_port_desc = TRUE;
+                       break;
+               case MACH_MSG_GUARDED_PORT_DESCRIPTOR:
+                       guard_flags = (is_task_64bit) ?
+                           ((mach_msg_guarded_port_descriptor64_t *)daddr)->flags :
+                           ((mach_msg_guarded_port_descriptor32_t *)daddr)->flags;
+                       context = (is_task_64bit) ?
+                           ((mach_msg_guarded_port_descriptor64_t *)daddr)->context :
+                           ((mach_msg_guarded_port_descriptor32_t *)daddr)->context;
+                       disp = (is_task_64bit) ?
+                           ((mach_msg_guarded_port_descriptor64_t *)daddr)->disposition :
+                           ((mach_msg_guarded_port_descriptor32_t *)daddr)->disposition;
+
+                       /* Only MACH_MSG_TYPE_MOVE_RECEIVE is supported for now */
+                       if (!guard_flags || ((guard_flags & ~MACH_MSG_GUARD_FLAGS_MASK) != 0) ||
+                           ((guard_flags & MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND) && (context != 0)) ||
+                           (disp != MACH_MSG_TYPE_MOVE_RECEIVE)) {
+                               /*
+                                * Invalid guard flags, context or disposition
+                                */
+                               mr = MACH_SEND_INVALID_TYPE;
+                               goto clean_message;
+                       }
+                       if (os_add_overflow(total_ool_port_count, 1, &total_ool_port_count)) {
+                               /* Overflow detected */
+                               mr = MACH_SEND_TOO_LARGE;
+                               goto clean_message;
+                       }
+                       contains_port_desc = TRUE;
                        break;
                }
        }
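
Collapsed into a standalone predicate, the guarded-descriptor acceptance test above reads as follows (a sketch that mirrors this hunk; no such helper exists in the tree):

    static bool
    guarded_desc_acceptable(mach_msg_guard_flags_t guard_flags,
        mach_port_context_t context, mach_msg_type_name_t disp)
    {
        if (guard_flags == 0) {
            return false;    /* guarding must actually be requested */
        }
        if ((guard_flags & ~MACH_MSG_GUARD_FLAGS_MASK) != 0) {
            return false;    /* unknown flag bits are rejected */
        }
        if ((guard_flags & MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND) && context != 0) {
            return false;    /* an unguarded send cannot carry a guard context */
        }
        return disp == MACH_MSG_TYPE_MOVE_RECEIVE;  /* only move-receive for now */
    }
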
@@ -3152,6 +3543,16 @@ ipc_kmsg_copyin_body(
                goto clean_message;
        }
 
+       /*
+        * Check if dest is a no-grant port; since this bit is set only on
+        * port construction and cannot be unset later, we can peek at the
+        * bit without paying the cost of locking the port.
+        */
+       if (contains_port_desc && remote_port->ip_no_grant) {
+               mr = MACH_SEND_NO_GRANT_DEST;
+               goto clean_message;
+       }
+
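
From the sender's side the new check surfaces as a distinct mach_msg return code. A hedged handling sketch, with `msg` being any complex message as above:

    mach_msg_return_t mr = mach_msg(&msg.header, MACH_SEND_MSG,
        msg.header.msgh_size, 0, MACH_PORT_NULL,
        MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
    if (mr == MACH_SEND_NO_GRANT_DEST) {
        /* destination was constructed no-grant: it never accepts port,
         * OOL-ports, or guarded-port descriptors in a message body */
    }
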
        /*
         * Allocate space in the pageable kernel ipc copy map for all the
         * ool data that is to be physically copied.  Map is marked wait for
@@ -3168,7 +3569,8 @@ ipc_kmsg_copyin_body(
        /* user_addr = just after base as it was copied in */
        user_addr = (mach_msg_descriptor_t *)((vm_offset_t)kmsg->ikm_header + sizeof(mach_msg_base_t));
 
-       /* Shift the mach_msg_base_t down to make room for dsc_count*16bytes of descriptors */
+       /* Shift the mach_msg_base_t down to make room for dsc_count * 16 bytes
+        * of descriptors on 64-bit kernels.  For example, three 12-byte
+        * descriptors from a 32-bit task occupy descriptor_size = 36 bytes,
+        * but need 3 * 16 = 48 bytes in the kernel, so dsc_adjust = 12.
+        */
        if (descriptor_size != 16 * dsc_count) {
                vm_offset_t dsc_adjust = 16 * dsc_count - descriptor_size;
 
@@ -3205,6 +3607,12 @@ ipc_kmsg_copyin_body(
                        kern_addr++;
                        complex = TRUE;
                        break;
+               case MACH_MSG_GUARDED_PORT_DESCRIPTOR:
+                       user_addr = ipc_kmsg_copyin_guarded_port_descriptor((mach_msg_guarded_port_descriptor_t *)kern_addr,
+                           user_addr, is_task_64bit, space, dest, kmsg, optionp, &mr);
+                       kern_addr++;
+                       complex = TRUE;
+                       break;
                default:
                        /* Invalid descriptor */
                        mr = MACH_SEND_INVALID_TYPE;
@@ -3218,7 +3626,7 @@ ipc_kmsg_copyin_body(
                            paddr, space_needed);
                        goto out;
                }
-       } /* End of loop */
+       }         /* End of loop */
 
        if (!complex) {
                kmsg->ikm_header->msgh_bits &= ~MACH_MSGH_BITS_COMPLEX;
@@ -3295,7 +3703,6 @@ ipc_kmsg_copyin(
        }
 
        mr = ipc_kmsg_copyin_body( kmsg, space, map, optionp);
-
        /* unreachable if !DEBUG */
        __unreachable_ok_push
        if (DEBUG_KPRINT_SYSCALL_PREDICATE(DEBUG_KPRINT_SYSCALL_IPC_MASK)) {
@@ -3330,8 +3737,9 @@ ipc_kmsg_copyin_from_kernel(
        mach_msg_bits_t bits = kmsg->ikm_header->msgh_bits;
        mach_msg_type_name_t rname = MACH_MSGH_BITS_REMOTE(bits);
        mach_msg_type_name_t lname = MACH_MSGH_BITS_LOCAL(bits);
-       ipc_object_t remote = (ipc_object_t) kmsg->ikm_header->msgh_remote_port;
-       ipc_object_t local = (ipc_object_t) kmsg->ikm_header->msgh_local_port;
+       ipc_object_t remote = ip_to_object(kmsg->ikm_header->msgh_remote_port);
+       ipc_object_t local = ip_to_object(kmsg->ikm_header->msgh_local_port);
+       ipc_port_t dest = kmsg->ikm_header->msgh_remote_port;
 
        /* translate the destination and reply ports */
        if (!IO_VALID(remote)) {
@@ -3364,6 +3772,30 @@ ipc_kmsg_copyin_from_kernel(
                        return MACH_MSG_SUCCESS;
                }
        }
+
+       /*
+        * Check if the remote port accepts ports in the body.
+        */
+       if (dest->ip_no_grant) {
+               mach_msg_descriptor_t   *saddr;
+               mach_msg_body_t         *body;
+               mach_msg_type_number_t  i, count;
+
+               body = (mach_msg_body_t *) (kmsg->ikm_header + 1);
+               saddr = (mach_msg_descriptor_t *) (body + 1);
+               count = body->msgh_descriptor_count;
+
+               for (i = 0; i < count; i++, saddr++) {
+                       switch (saddr->type.type) {
+                       case MACH_MSG_PORT_DESCRIPTOR:
+                       case MACH_MSG_OOL_PORTS_DESCRIPTOR:
+                       case MACH_MSG_GUARDED_PORT_DESCRIPTOR:
+                               /* no descriptors have been copied in yet */
+                               ipc_kmsg_clean_partial(kmsg, 0, NULL, 0, 0);
+                               return MACH_SEND_NO_GRANT_DEST;
+                       }
+               }
+       }
        {
                mach_msg_descriptor_t   *saddr;
                mach_msg_body_t         *body;
@@ -3384,7 +3816,7 @@ ipc_kmsg_copyin_from_kernel(
 
                                /* this is really the type SEND, SEND_ONCE, etc. */
                                name = dsc->disposition;
-                               object = (ipc_object_t) dsc->name;
+                               object = ip_to_object(dsc->name);
                                dsc->disposition = ipc_object_copyin_type(name);
 
                                if (!IO_VALID(object)) {
@@ -3398,10 +3830,10 @@ ipc_kmsg_copyin_from_kernel(
                                /* assert when the new kobject model is in place since*/
                                /* ports will not be used in kernel to kernel chats   */
 
-                               if (((ipc_port_t)remote)->ip_receiver != ipc_space_kernel) {
+                               if (ip_object_to_port(remote)->ip_receiver != ipc_space_kernel) {
                                        if ((dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) &&
-                                           ipc_port_check_circularity((ipc_port_t) object,
-                                           (ipc_port_t) remote)) {
+                                           ipc_port_check_circularity(ip_object_to_port(object),
+                                           ip_object_to_port(remote))) {
                                                kmsg->ikm_header->msgh_bits |=
                                                    MACH_MSGH_BITS_CIRCULAR;
                                        }
@@ -3440,9 +3872,36 @@ ipc_kmsg_copyin_from_kernel(
                                        ipc_object_copyin_from_kernel(object, name);
 
                                        if ((dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) &&
-                                           ipc_port_check_circularity(
-                                                   (ipc_port_t) object,
-                                                   (ipc_port_t) remote)) {
+                                           ipc_port_check_circularity(ip_object_to_port(object),
+                                           ip_object_to_port(remote))) {
+                                               kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR;
+                                       }
+                               }
+                               break;
+                       }
+                       case MACH_MSG_GUARDED_PORT_DESCRIPTOR: {
+                               mach_msg_guarded_port_descriptor_t *dsc = (typeof(dsc)) & saddr->guarded_port;
+                               mach_msg_type_name_t disp = dsc->disposition;
+                               ipc_object_t object = ip_to_object(dsc->name);
+                               dsc->disposition = ipc_object_copyin_type(disp);
+                               assert(dsc->flags == 0);
+
+                               if (!IO_VALID(object)) {
+                                       break;
+                               }
+
+                               ipc_object_copyin_from_kernel(object, disp);
+                               /*
+                                * avoid circularity when the destination is also
+                                * the kernel.  This check should be changed into an
+                                * assert when the new kobject model is in place since
+                                * ports will not be used in kernel to kernel chats
+                                */
+
+                               if (ip_object_to_port(remote)->ip_receiver != ipc_space_kernel) {
+                                       if ((dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) &&
+                                           ipc_port_check_circularity(ip_object_to_port(object),
+                                           ip_object_to_port(remote))) {
                                                kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR;
                                        }
                                }
@@ -3467,8 +3926,9 @@ ipc_kmsg_copyin_from_kernel_legacy(
        mach_msg_bits_t bits = kmsg->ikm_header->msgh_bits;
        mach_msg_type_name_t rname = MACH_MSGH_BITS_REMOTE(bits);
        mach_msg_type_name_t lname = MACH_MSGH_BITS_LOCAL(bits);
-       ipc_object_t remote = (ipc_object_t) kmsg->ikm_header->msgh_remote_port;
-       ipc_object_t local = (ipc_object_t) kmsg->ikm_header->msgh_local_port;
+       ipc_object_t remote = ip_to_object(kmsg->ikm_header->msgh_remote_port);
+       ipc_object_t local = ip_to_object(kmsg->ikm_header->msgh_local_port);
+       ipc_port_t dest = kmsg->ikm_header->msgh_remote_port;
 
        /* translate the destination and reply ports */
        if (!IO_VALID(remote)) {
@@ -3501,6 +3961,28 @@ ipc_kmsg_copyin_from_kernel_legacy(
                        return MACH_MSG_SUCCESS;
                }
        }
+
+       if (dest->ip_no_grant) {
+               mach_msg_descriptor_t   *saddr;
+               mach_msg_body_t         *body;
+               mach_msg_type_number_t  i, count;
+
+               body = (mach_msg_body_t *) (kmsg->ikm_header + 1);
+               saddr = (mach_msg_descriptor_t *) (body + 1);
+               count = body->msgh_descriptor_count;
+
+               for (i = 0; i < count; i++, saddr++) {
+                       switch (saddr->type.type) {
+                       case MACH_MSG_PORT_DESCRIPTOR:
+                       case MACH_MSG_OOL_PORTS_DESCRIPTOR:
+                       case MACH_MSG_GUARDED_PORT_DESCRIPTOR:
+                               /* no descriptors have been copied in yet */
+                               ipc_kmsg_clean_partial(kmsg, 0, NULL, 0, 0);
+                               return MACH_SEND_NO_GRANT_DEST;
+                       }
+               }
+       }
+
        {
                mach_msg_legacy_descriptor_t    *saddr;
                mach_msg_descriptor_t   *daddr;
@@ -3533,9 +4015,9 @@ ipc_kmsg_copyin_from_kernel_legacy(
 
                                /* this is really the type SEND, SEND_ONCE, etc. */
                                name = dsc->disposition;
-                               object = (ipc_object_t) CAST_MACH_NAME_TO_PORT(dsc->name);
+                               object = ip_to_object(CAST_MACH_NAME_TO_PORT(dsc->name));
                                dest_dsc->disposition = ipc_object_copyin_type(name);
-                               dest_dsc->name = (mach_port_t)object;
+                               dest_dsc->name = ip_object_to_port(object);
                                dest_dsc->type = MACH_MSG_PORT_DESCRIPTOR;
 
                                if (!IO_VALID(object)) {
@@ -3549,10 +4031,10 @@ ipc_kmsg_copyin_from_kernel_legacy(
                                /* assert when the new kobject model is in place since*/
                                /* ports will not be used in kernel to kernel chats   */
 
-                               if (((ipc_port_t)remote)->ip_receiver != ipc_space_kernel) {
+                               if (ip_object_to_port(remote)->ip_receiver != ipc_space_kernel) {
                                        if ((dest_dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) &&
-                                           ipc_port_check_circularity((ipc_port_t) object,
-                                           (ipc_port_t) remote)) {
+                                           ipc_port_check_circularity(ip_object_to_port(object),
+                                           ip_object_to_port(remote))) {
                                                kmsg->ikm_header->msgh_bits |=
                                                    MACH_MSGH_BITS_CIRCULAR;
                                        }
@@ -3610,9 +4092,8 @@ ipc_kmsg_copyin_from_kernel_legacy(
                                        ipc_object_copyin_from_kernel(object, name);
 
                                        if ((disposition == MACH_MSG_TYPE_PORT_RECEIVE) &&
-                                           ipc_port_check_circularity(
-                                                   (ipc_port_t) object,
-                                                   (ipc_port_t) remote)) {
+                                           ipc_port_check_circularity(ip_object_to_port(object),
+                                           ip_object_to_port(remote))) {
                                                kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS_CIRCULAR;
                                        }
                                }
@@ -3625,6 +4106,46 @@ ipc_kmsg_copyin_from_kernel_legacy(
                                dest_dsc->count = port_count;
                                break;
                        }
+                       case MACH_MSG_GUARDED_PORT_DESCRIPTOR: {
+                               mach_msg_type_name_t  disp;
+                               ipc_object_t object;
+                               mach_msg_guarded_port_descriptor32_t   *dsc;
+                               mach_msg_guarded_port_descriptor_t  *dest_dsc;
+
+                               dsc = (typeof(dsc)) & saddr->guarded_port32;
+                               dest_dsc = &daddr->guarded_port;
+
+                               disp = dsc->disposition;
+                               object = ip_to_object(CAST_MACH_NAME_TO_PORT(dsc->name));
+                               assert(dsc->flags == 0);
+                               assert(dsc->context == 0);
+
+                               dest_dsc->disposition = ipc_object_copyin_type(disp);
+                               dest_dsc->name = ip_object_to_port(object);
+                               dest_dsc->type = MACH_MSG_GUARDED_PORT_DESCRIPTOR;
+                               dest_dsc->flags = 0;
+
+                               if (!IO_VALID(object)) {
+                                       break;
+                               }
+
+                               ipc_object_copyin_from_kernel(object, disp);
+
+                               /* CDY avoid circularity when the destination is also */
+                               /* the kernel.  This check should be changed into an  */
+                               /* assert when the new kobject model is in place since*/
+                               /* ports will not be used in kernel to kernel chats   */
+
+                               if (ip_object_to_port(remote)->ip_receiver != ipc_space_kernel) {
+                                       if ((dest_dsc->disposition == MACH_MSG_TYPE_PORT_RECEIVE) &&
+                                           ipc_port_check_circularity(ip_object_to_port(object),
+                                           ip_object_to_port(remote))) {
+                                               kmsg->ikm_header->msgh_bits |=
+                                                   MACH_MSGH_BITS_CIRCULAR;
+                                       }
+                               }
+                               break;
+                       }
                        default: {
 #if     MACH_ASSERT
                                panic("ipc_kmsg_copyin_from_kernel:  bad descriptor");
@@ -3671,7 +4192,7 @@ ipc_kmsg_copyout_header(
 {
        mach_msg_header_t *msg = kmsg->ikm_header;
        mach_msg_bits_t mbits = msg->msgh_bits;
-       ipc_port_t dest = (ipc_port_t) msg->msgh_remote_port;
+       ipc_port_t dest = msg->msgh_remote_port;
 
        assert(IP_VALID(dest));
 
@@ -3744,12 +4265,14 @@ ipc_kmsg_copyout_header(
 
                                /* Is there already an entry we can use? */
                                if ((reply_type != MACH_MSG_TYPE_PORT_SEND_ONCE) &&
-                                   ipc_right_reverse(space, (ipc_object_t) reply, &reply_name, &entry)) {
+                                   ipc_right_reverse(space, ip_to_object(reply), &reply_name, &entry)) {
                                        /* reply port is locked and active */
                                        assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE);
                                } else {
                                        ip_lock(reply);
                                        if (!ip_active(reply)) {
+                                               /* clear the context value */
+                                               reply->ip_reply_context = 0;
                                                ip_unlock(reply);
 
                                                release_reply_port = reply;
@@ -3764,14 +4287,42 @@ ipc_kmsg_copyout_header(
                                        ipc_entry_claim(space, &reply_name, &entry);
                                        assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE);
                                        assert(entry->ie_object == IO_NULL);
-                                       entry->ie_object = (ipc_object_t) reply;
+                                       entry->ie_object = ip_to_object(reply);
                                }
 
                                /* space and reply port are locked and active */
-                               ip_reference(reply); /* hold onto the reply port */
+                               ip_reference(reply);         /* hold onto the reply port */
+
+                               /*
+                                * If the receiver would like to enforce strict reply
+                                * semantics, and the message looks like it expects a reply,
+                                * and contains a voucher, then link the context in the
+                                * voucher with the reply port so that the next message sent
+                                * to the reply port must come from a thread that has a
+                                * matching context (voucher).
+                                */
+                               if (enforce_strict_reply && MACH_RCV_WITH_STRICT_REPLY(option) && IP_VALID(voucher)) {
+                                       if (ipc_kmsg_validate_reply_port_locked(reply, option) != KERN_SUCCESS) {
+                                               /* if the receiver isn't happy with the reply port: fail the receive. */
+                                               ip_unlock(reply);
+                                               ipc_entry_dealloc(space, reply_name, entry);
+                                               is_write_unlock(space);
+                                               ip_release(reply);
+                                               return MACH_RCV_INVALID_REPLY;
+                                       }
+                                       ipc_kmsg_link_reply_context_locked(reply, voucher);
+                               } else {
+                                       /*
+                                        * if the receiver did not choose to participate
+                                        * in the strict reply/RPC, then don't enforce
+                                        * anything (as this could lead to booby-trapped
+                                        * messages that kill the server).
+                                        */
+                                       reply->ip_reply_context = 0;
+                               }
 
                                kr = ipc_right_copyout(space, reply_name, entry,
-                                   reply_type, TRUE, (ipc_object_t) reply);
+                                   reply_type, NULL, NULL, ip_to_object(reply));
                                assert(kr == KERN_SUCCESS);
                                /* reply port is unlocked */
                        } else {
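
Condensed, the strict-reply linkage added above is a three-way decision; this control-flow sketch only restates the hunk (enforce_strict_reply is the global gate this diff consults):

    if (enforce_strict_reply && MACH_RCV_WITH_STRICT_REPLY(option) && IP_VALID(voucher)) {
        if (ipc_kmsg_validate_reply_port_locked(reply, option) != KERN_SUCCESS) {
            return MACH_RCV_INVALID_REPLY;   /* receiver rejects this reply port */
        }
        ipc_kmsg_link_reply_context_locked(reply, voucher);  /* bind voucher context */
    } else {
        reply->ip_reply_context = 0;         /* receiver opted out: enforce nothing */
    }
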
@@ -3798,7 +4349,7 @@ done_with_reply:
                                if ((option & MACH_RCV_VOUCHER) != 0) {
                                        ipc_entry_t entry;
 
-                                       if (ipc_right_reverse(space, (ipc_object_t) voucher,
+                                       if (ipc_right_reverse(space, ip_to_object(voucher),
                                            &voucher_name, &entry)) {
                                                /* voucher port locked */
                                                assert(entry->ie_bits & MACH_PORT_TYPE_SEND);
@@ -3808,16 +4359,15 @@ done_with_reply:
                                                ipc_entry_claim(space, &voucher_name, &entry);
                                                assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE);
                                                assert(entry->ie_object == IO_NULL);
-                                               entry->ie_object = (ipc_object_t) voucher;
+                                               entry->ie_object = ip_to_object(voucher);
                                                ip_lock(voucher);
                                        }
                                        /* space is locked and active */
-
-                                       assert(ip_active(voucher));
+                                       require_ip_active(voucher);
                                        assert(ip_kotype(voucher) == IKOT_VOUCHER);
                                        kr = ipc_right_copyout(space, voucher_name, entry,
-                                           MACH_MSG_TYPE_MOVE_SEND, TRUE,
-                                           (ipc_object_t) voucher);
+                                           MACH_MSG_TYPE_MOVE_SEND, NULL, NULL,
+                                           ip_to_object(voucher));
                                        /* voucher port is unlocked */
                                } else {
                                        voucher_type = MACH_MSGH_BITS_ZERO;
@@ -3909,7 +4459,7 @@ done_with_voucher:
                 */
 
                if (ip_active(dest)) {
-                       ipc_object_copyout_dest(space, (ipc_object_t) dest,
+                       ipc_object_copyout_dest(space, ip_to_object(dest),
                            dest_type, &dest_name);
                        /* dest is unlocked */
                } else {
@@ -3946,11 +4496,6 @@ done_with_voucher:
                        }
                }
 
-               if (IP_VALID(release_voucher_port)) {
-                       ipc_port_release_send(release_voucher_port);
-               }
-
-
                if ((option & MACH_RCV_VOUCHER) != 0) {
                        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_MSG_RECV) | DBG_FUNC_NONE,
                            VM_KERNEL_ADDRPERM((uintptr_t)kmsg),
@@ -3967,6 +4512,10 @@ done_with_voucher:
                            0);
                }
 
+               if (IP_VALID(release_voucher_port)) {
+                       ipc_port_release_send(release_voucher_port);
+               }
+
                msg->msgh_bits = MACH_MSGH_BITS_SET(reply_type, dest_type,
                    voucher_type, mbits);
                msg->msgh_local_port = CAST_MACH_NAME_TO_PORT(dest_name);
@@ -3999,6 +4548,8 @@ ipc_kmsg_copyout_object(
        ipc_space_t             space,
        ipc_object_t            object,
        mach_msg_type_name_t    msgt_name,
+       mach_port_context_t     *context,
+       mach_msg_guard_flags_t  *guard_flags,
        mach_port_name_t        *namep)
 {
        kern_return_t kr;
@@ -4008,7 +4559,7 @@ ipc_kmsg_copyout_object(
                return MACH_MSG_SUCCESS;
        }
 
-       kr = ipc_object_copyout(space, object, msgt_name, TRUE, namep);
+       kr = ipc_object_copyout(space, object, msgt_name, context, guard_flags, namep);
        if (kr != KERN_SUCCESS) {
                ipc_object_destroy(object, msgt_name);
 
@@ -4028,12 +4579,7 @@ ipc_kmsg_copyout_object(
        return MACH_MSG_SUCCESS;
 }
 
-mach_msg_descriptor_t *
-ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc,
-    mach_msg_descriptor_t *user_dsc,
-    ipc_space_t space,
-    kern_return_t *mr);
-mach_msg_descriptor_t *
+static mach_msg_descriptor_t *
 ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc,
     mach_msg_descriptor_t *dest_dsc,
     ipc_space_t space,
@@ -4043,18 +4589,15 @@ ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc,
        mach_port_name_t            name;
        mach_msg_type_name_t                disp;
 
-
        /* Copyout port right carried in the message */
        port = dsc->port.name;
        disp = dsc->port.disposition;
        *mr |= ipc_kmsg_copyout_object(space,
-           (ipc_object_t)port,
-           disp,
-           &name);
+           ip_to_object(port), disp, NULL, NULL, &name);
 
        if (current_task() == kernel_task) {
                mach_msg_port_descriptor_t *user_dsc = (typeof(user_dsc))dest_dsc;
-               user_dsc--; // point to the start of this port descriptor
+               user_dsc--;         // point to the start of this port descriptor
                bzero((void *)user_dsc, sizeof(*user_dsc));
                user_dsc->name = CAST_MACH_NAME_TO_PORT(name);
                user_dsc->disposition = disp;
@@ -4062,7 +4605,7 @@ ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc,
                dest_dsc = (typeof(dest_dsc))user_dsc;
        } else {
                mach_msg_legacy_port_descriptor_t *user_dsc = (typeof(user_dsc))dest_dsc;
-               user_dsc--; // point to the start of this port descriptor
+               user_dsc--;         // point to the start of this port descriptor
                bzero((void *)user_dsc, sizeof(*user_dsc));
                user_dsc->name = CAST_MACH_PORT_TO_NAME(name);
                user_dsc->disposition = disp;
@@ -4252,16 +4795,16 @@ ipc_kmsg_copyout_ool_ports_descriptor(mach_msg_ool_ports_descriptor_t *dsc,
                 * for those rights out to user-space.
                 */
                if (rcv_addr != 0) {
-                       mach_port_t *objects = (mach_port_t *) dsc->address;
+                       ipc_object_t *objects = (ipc_object_t *) dsc->address;
                        mach_port_name_t *names = (mach_port_name_t *) dsc->address;
 
                        /* copyout port rights carried in the message */
 
                        for (i = 0; i < count; i++) {
-                               ipc_object_t object = (ipc_object_t)objects[i];
+                               ipc_object_t object = objects[i];
 
                                *mr |= ipc_kmsg_copyout_object(space, object,
-                                   disp, &names[i]);
+                                   disp, NULL, NULL, &names[i]);
                        }
 
                        /* copyout to memory allocated above */
@@ -4325,6 +4868,81 @@ ipc_kmsg_copyout_ool_ports_descriptor(mach_msg_ool_ports_descriptor_t *dsc,
        return user_dsc;
 }
 
+static mach_msg_descriptor_t *
+ipc_kmsg_copyout_guarded_port_descriptor(
+       mach_msg_guarded_port_descriptor_t *dsc,
+       mach_msg_descriptor_t *dest_dsc,
+       int is_64bit,
+       __unused ipc_kmsg_t  kmsg,
+       ipc_space_t space,
+       mach_msg_option_t option,
+       kern_return_t *mr)
+{
+       mach_port_t                 port;
+       mach_port_name_t            name = MACH_PORT_NULL;
+       mach_msg_type_name_t        disp;
+       mach_msg_guard_flags_t      guard_flags;
+       mach_port_context_t         context;
+
+       /* Copyout port right carried in the message */
+       port = dsc->name;
+       disp = dsc->disposition;
+       guard_flags = dsc->flags;
+       context = 0;
+
+       /* Currently kernel_task doesn't support receiving guarded port descriptors */
+       struct knote *kn = current_thread()->ith_knote;
+       if ((kn != ITH_KNOTE_PSEUDO) && (((option & MACH_RCV_GUARDED_DESC) == 0) ||
+           (current_task() == kernel_task))) {
+#if DEVELOPMENT || DEBUG
+               if (current_task() != kernel_task) {
+                       /*
+                        * Simulated crash, used while debugging, that tells the
+                        * receiver it must opt into receiving guarded descriptors.
+                        */
+                       mach_port_guard_exception(current_thread()->ith_receiver_name, 0, 0, kGUARD_EXC_RCV_GUARDED_DESC);
+               }
+#endif
+               KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_DESTROY_GUARDED_DESC), current_thread()->ith_receiver_name,
+                   VM_KERNEL_ADDRPERM(port), disp, guard_flags);
+               ipc_object_destroy(ip_to_object(port), disp);
+               mach_msg_legacy_port_descriptor_t *user_dsc = (typeof(user_dsc))dest_dsc;
+               user_dsc--;         // point to the start of this port descriptor
+               bzero((void *)user_dsc, sizeof(*user_dsc));
+               user_dsc->name = name;
+               user_dsc->disposition = disp;
+               user_dsc->type = MACH_MSG_PORT_DESCRIPTOR;
+               dest_dsc = (typeof(dest_dsc))user_dsc;
+       } else {
+               *mr |= ipc_kmsg_copyout_object(space,
+                   ip_to_object(port), disp, &context, &guard_flags, &name);
+
+               if (!is_64bit) {
+                       mach_msg_guarded_port_descriptor32_t *user_dsc = (typeof(user_dsc))dest_dsc;
+                       user_dsc--;         // point to the start of this port descriptor
+                       bzero((void *)user_dsc, sizeof(*user_dsc));
+                       user_dsc->name = name;
+                       user_dsc->flags = guard_flags;
+                       user_dsc->disposition = disp;
+                       user_dsc->type = MACH_MSG_GUARDED_PORT_DESCRIPTOR;
+                       user_dsc->context = CAST_DOWN_EXPLICIT(uint32_t, context);
+                       dest_dsc = (typeof(dest_dsc))user_dsc;
+               } else {
+                       mach_msg_guarded_port_descriptor64_t *user_dsc = (typeof(user_dsc))dest_dsc;
+                       user_dsc--;         // point to the start of this port descriptor
+                       bzero((void *)user_dsc, sizeof(*user_dsc));
+                       user_dsc->name = name;
+                       user_dsc->flags = guard_flags;
+                       user_dsc->disposition = disp;
+                       user_dsc->type = MACH_MSG_GUARDED_PORT_DESCRIPTOR;
+                       user_dsc->context = context;
+                       dest_dsc = (typeof(dest_dsc))user_dsc;
+               }
+       }
+
+       return (mach_msg_descriptor_t *)dest_dsc;
+}
+
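
Receivers must opt in with MACH_RCV_GUARDED_DESC, or the branch above destroys the right and delivers a plain port descriptor with a null name. A hedged user-space sketch; `port` is assumed to exist, and the trailing mach_port_unguard call is one plausible (unverified) use of the returned context:

    struct {
        mach_msg_header_t                  header;
        mach_msg_body_t                    body;
        mach_msg_guarded_port_descriptor_t desc;
        mach_msg_trailer_t                 trailer;
    } rcv = { };

    mach_msg_return_t mr = mach_msg(&rcv.header,
        MACH_RCV_MSG | MACH_RCV_GUARDED_DESC,   /* opt into guarded descriptors */
        0, sizeof(rcv), port,
        MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);

    if (mr == MACH_MSG_SUCCESS &&
        rcv.desc.type == MACH_MSG_GUARDED_PORT_DESCRIPTOR) {
        /* rcv.desc.context now guards the received right; presenting it
         * again is assumed to be how the guard is dropped */
        mach_port_unguard(mach_task_self(), rcv.desc.name, rcv.desc.context);
    }
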
 /*
  *     Routine:        ipc_kmsg_copyout_body
  *     Purpose:
@@ -4349,6 +4967,7 @@ ipc_kmsg_copyout_body(
        ipc_kmsg_t              kmsg,
        ipc_space_t             space,
        vm_map_t                map,
+       mach_msg_option_t       option,
        mach_msg_body_t         *slist)
 {
        mach_msg_body_t             *body;
@@ -4390,6 +5009,10 @@ ipc_kmsg_copyout_body(
                        user_dsc = ipc_kmsg_copyout_ool_ports_descriptor(
                                (mach_msg_ool_ports_descriptor_t *)&kern_dsc[i], user_dsc, is_task_64bit, map, space, kmsg, &mr);
                        break;
+               case MACH_MSG_GUARDED_PORT_DESCRIPTOR:
+                       user_dsc = ipc_kmsg_copyout_guarded_port_descriptor(
+                               (mach_msg_guarded_port_descriptor_t *)&kern_dsc[i], user_dsc, is_task_64bit, kmsg, space, option, &mr);
+                       break;
                default: {
                        panic("untyped IPC copyout body: invalid message descriptor");
                }
@@ -4448,6 +5071,7 @@ ipc_kmsg_copyout_size(
                        case MACH_MSG_OOL_DESCRIPTOR:
                        case MACH_MSG_OOL_VOLATILE_DESCRIPTOR:
                        case MACH_MSG_OOL_PORTS_DESCRIPTOR:
+                       case MACH_MSG_GUARDED_PORT_DESCRIPTOR:
                                if (!is_task_64bit) {
                                        send_size -= DESC_SIZE_ADJUSTMENT;
                                }
@@ -4495,7 +5119,7 @@ ipc_kmsg_copyout(
        }
 
        if (kmsg->ikm_header->msgh_bits & MACH_MSGH_BITS_COMPLEX) {
-               mr = ipc_kmsg_copyout_body(kmsg, space, map, slist);
+               mr = ipc_kmsg_copyout_body(kmsg, space, map, option, slist);
 
                if (mr != MACH_MSG_SUCCESS) {
                        mr |= MACH_RCV_BODY_ERROR;
@@ -4533,9 +5157,9 @@ ipc_kmsg_copyout_pseudo(
        mach_msg_body_t         *slist)
 {
        mach_msg_bits_t mbits = kmsg->ikm_header->msgh_bits;
-       ipc_object_t dest = (ipc_object_t) kmsg->ikm_header->msgh_remote_port;
-       ipc_object_t reply = (ipc_object_t) kmsg->ikm_header->msgh_local_port;
-       ipc_object_t voucher = (ipc_object_t) kmsg->ikm_voucher;
+       ipc_object_t dest = ip_to_object(kmsg->ikm_header->msgh_remote_port);
+       ipc_object_t reply = ip_to_object(kmsg->ikm_header->msgh_local_port);
+       ipc_object_t voucher = ip_to_object(kmsg->ikm_voucher);
        mach_msg_type_name_t dest_type = MACH_MSGH_BITS_REMOTE(mbits);
        mach_msg_type_name_t reply_type = MACH_MSGH_BITS_LOCAL(mbits);
        mach_msg_type_name_t voucher_type = MACH_MSGH_BITS_VOUCHER(mbits);
@@ -4560,8 +5184,8 @@ ipc_kmsg_copyout_pseudo(
        ipc_importance_assert_clean(kmsg);
 #endif
 
-       mr = (ipc_kmsg_copyout_object(space, dest, dest_type, &dest_name) |
-           ipc_kmsg_copyout_object(space, reply, reply_type, &reply_name));
+       mr = (ipc_kmsg_copyout_object(space, dest, dest_type, NULL, NULL, &dest_name) |
+           ipc_kmsg_copyout_object(space, reply, reply_type, NULL, NULL, &reply_name));
 
        kmsg->ikm_header->msgh_bits = mbits & MACH_MSGH_BITS_USER;
        kmsg->ikm_header->msgh_remote_port = CAST_MACH_NAME_TO_PORT(dest_name);
@@ -4571,12 +5195,12 @@ ipc_kmsg_copyout_pseudo(
                assert(voucher_type == MACH_MSG_TYPE_MOVE_SEND);
 
                kmsg->ikm_voucher = IP_NULL;
-               mr |= ipc_kmsg_copyout_object(space, voucher, voucher_type, &voucher_name);
+               mr |= ipc_kmsg_copyout_object(space, voucher, voucher_type, NULL, NULL, &voucher_name);
                kmsg->ikm_header->msgh_voucher_port = voucher_name;
        }
 
        if (mbits & MACH_MSGH_BITS_COMPLEX) {
-               mr |= ipc_kmsg_copyout_body(kmsg, space, map, slist);
+               mr |= ipc_kmsg_copyout_body(kmsg, space, map, 0, slist);
        }
 
        return mr;
@@ -4606,9 +5230,9 @@ ipc_kmsg_copyout_dest(
        mach_port_name_t dest_name, reply_name, voucher_name;
 
        mbits = kmsg->ikm_header->msgh_bits;
-       dest = (ipc_object_t) kmsg->ikm_header->msgh_remote_port;
-       reply = (ipc_object_t) kmsg->ikm_header->msgh_local_port;
-       voucher = (ipc_object_t) kmsg->ikm_voucher;
+       dest = ip_to_object(kmsg->ikm_header->msgh_remote_port);
+       reply = ip_to_object(kmsg->ikm_header->msgh_local_port);
+       voucher = ip_to_object(kmsg->ikm_voucher);
        voucher_name = kmsg->ikm_header->msgh_voucher_port;
        dest_type = MACH_MSGH_BITS_REMOTE(mbits);
        reply_type = MACH_MSGH_BITS_LOCAL(mbits);
@@ -4639,7 +5263,7 @@ ipc_kmsg_copyout_dest(
                assert(voucher_type == MACH_MSG_TYPE_MOVE_SEND);
 
                kmsg->ikm_voucher = IP_NULL;
-               ipc_object_destroy((ipc_object_t)voucher, voucher_type);
+               ipc_object_destroy(voucher, voucher_type);
                voucher_name = MACH_PORT_NULL;
        }
 
@@ -4682,7 +5306,7 @@ ipc_kmsg_copyout_to_kernel(
        mach_msg_type_name_t reply_type;
        mach_port_name_t dest_name;
 
-       dest = (ipc_object_t) kmsg->ikm_header->msgh_remote_port;
+       dest = ip_to_object(kmsg->ikm_header->msgh_remote_port);
        reply = kmsg->ikm_header->msgh_local_port;
        dest_type = MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits);
        reply_type = MACH_MSGH_BITS_LOCAL(kmsg->ikm_header->msgh_bits);
@@ -4741,7 +5365,7 @@ ipc_kmsg_copyout_to_kernel_legacy(
        mach_msg_type_name_t reply_type;
        mach_port_name_t dest_name;
 
-       dest = (ipc_object_t) kmsg->ikm_header->msgh_remote_port;
+       dest = ip_to_object(kmsg->ikm_header->msgh_remote_port);
        reply = kmsg->ikm_header->msgh_local_port;
        dest_type = MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits);
        reply_type = MACH_MSGH_BITS_LOCAL(kmsg->ikm_header->msgh_bits);
@@ -4843,9 +5467,20 @@ ipc_kmsg_copyout_to_kernel_legacy(
                        dest_dsc->type = type;
                        break;
                }
+               case MACH_MSG_GUARDED_PORT_DESCRIPTOR: {
+                       mach_msg_guarded_port_descriptor_t *source_dsc = (typeof(source_dsc)) & saddr->guarded_port;
+                       mach_msg_guarded_port_descriptor32_t *dest_dsc = &daddr->guarded_port32;
+
+                       dest_dsc->name = CAST_MACH_PORT_TO_NAME(source_dsc->name);
+                       dest_dsc->disposition = source_dsc->disposition;
+                       dest_dsc->flags = 0;
+                       dest_dsc->type = MACH_MSG_GUARDED_PORT_DESCRIPTOR;
+                       dest_dsc->context = 0;
+                       break;
+               }
                default: {
 #if     MACH_ASSERT
-                       panic("ipc_kmsg_copyin_from_kernel:  bad descriptor");
+                       panic("ipc_kmsg_copyout_to_kernel_legacy: bad descriptor");
 #endif  /* MACH_ASSERT */
                }
                }
@@ -4945,3 +5580,12 @@ done:
 
        return trailer->msgh_trailer_size;
 }
+
+mach_msg_header_t *
+ipc_kmsg_msg_header(ipc_kmsg_t kmsg)
+{
+       if (NULL == kmsg) {
+               return NULL;
+       }
+       return kmsg->ikm_header;
+}
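
The new accessor gives code outside ipc_kmsg.c a null-safe way to reach the header without knowing the kmsg layout. Trivial usage sketch:

    mach_msg_header_t *hdr = ipc_kmsg_msg_header(kmsg);
    if (hdr != NULL) {
        mach_msg_id_t id = hdr->msgh_id;  /* kmsg == NULL simply yields NULL */
    }
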
index 74c31f1b418cd3c368c1e97aadc752757c057ce4..68b7c4016b8f1d333accb78a2e3851e31a6d9124 100644
 #include <ipc/ipc_object.h>
 #include <sys/kdebug.h>
 
+typedef uint32_t ipc_kmsg_flags_t;
+
+#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1       /* Dest port contains an immovable send right */
+
 /*
  *     This structure is only the header for a kmsg buffer;
  *     the actual buffer is normally larger.  The rest of the buffer
 
 struct ipc_kmsg {
        mach_msg_size_t            ikm_size;
+       ipc_kmsg_flags_t           ikm_flags;
        struct ipc_kmsg            *ikm_next;        /* next message on port/discard queue */
        struct ipc_kmsg            *ikm_prev;        /* prev message on port/discard queue */
        mach_msg_header_t          *ikm_header;
@@ -165,6 +170,7 @@ MACRO_END
 #define ikm_init(kmsg, size)                                    \
 MACRO_BEGIN                                                     \
        (kmsg)->ikm_size = (size);                                  \
+       (kmsg)->ikm_flags = 0;                                      \
        (kmsg)->ikm_prealloc = IP_NULL;                             \
        (kmsg)->ikm_voucher = IP_NULL;                              \
        (kmsg)->ikm_importance = IIE_NULL;                          \
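
ikm_flags is zeroed in ikm_init and then travels with the message; the copyin hunks earlier pass it straight into ipc_object_copyin. A sketch of the consuming pattern implied there (the actual enforcement lives in ipc_object_copyin; the comment on the define above states when the flag is set):

    if (kmsg->ikm_flags & IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND) {
        /* destination port holds an immovable send right, so copyin
         * may accept a right that would otherwise be refused */
    }
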
@@ -348,6 +354,8 @@ extern mach_msg_return_t ipc_kmsg_copyout_object(
        ipc_space_t             space,
        ipc_object_t            object,
        mach_msg_type_name_t    msgt_name,
+       mach_port_context_t     *context,
+       mach_msg_guard_flags_t  *guard_flags,
        mach_port_name_t        *namep);
 
 /* Copyout the header and body to a user message */
@@ -363,6 +371,7 @@ extern mach_msg_return_t ipc_kmsg_copyout_body(
        ipc_kmsg_t              kmsg,
        ipc_space_t             space,
        vm_map_t                map,
+       mach_msg_option_t       option,
        mach_msg_body_t         *slist);
 
 /* Copyout port rights and out-of-line memory to a user message,
@@ -407,4 +416,7 @@ extern void ipc_kmsg_trace_send(ipc_kmsg_t kmsg,
 #define ipc_kmsg_trace_send(a, b) do { } while (0)
 #endif
 
+extern mach_msg_header_t *
+    ipc_kmsg_msg_header(ipc_kmsg_t);
+
 #endif  /* _IPC_IPC_KMSG_H_ */
index 2a66425982f0347965e2e59b9baba47b0591d7c6..4bec21084b2c88a77d49f5edc94f72c2342e4a4c 100644
@@ -84,6 +84,7 @@
 #include <kern/thread.h>
 #include <kern/waitq.h>
 
+#include <ipc/port.h>
 #include <ipc/ipc_mqueue.h>
 #include <ipc/ipc_kmsg.h>
 #include <ipc/ipc_port.h>
@@ -106,7 +107,7 @@ int ipc_mqueue_full;            /* address is event for queue space */
 int ipc_mqueue_rcv;             /* address is event for message arrival */
 
 /* forward declarations */
-void ipc_mqueue_receive_results(wait_result_t result);
+static void ipc_mqueue_receive_results(wait_result_t result);
 static void ipc_mqueue_peek_on_thread(
        ipc_mqueue_t        port_mq,
        mach_msg_option_t   option,
@@ -132,6 +133,7 @@ ipc_mqueue_init(
                mqueue->imq_seqno = 0;
                mqueue->imq_msgcount = 0;
                mqueue->imq_qlimit = MACH_PORT_QLIMIT_DEFAULT;
+               mqueue->imq_context = 0;
                mqueue->imq_fullwaiters = FALSE;
 #if MACH_FLIPC
                mqueue->imq_fport = FPORT_NULL;
@@ -417,6 +419,26 @@ leave:
        return KERN_SUCCESS;
 }
 
+
+/*
+ *     Routine:        ipc_mqueue_has_klist
+ *     Purpose:
+ *             Returns whether the given mqueue imq_klist field can be used as a klist.
+ */
+static inline bool
+ipc_mqueue_has_klist(ipc_mqueue_t mqueue)
+{
+       ipc_object_t object = imq_to_object(mqueue);
+       if (io_otype(object) != IOT_PORT) {
+               return true;
+       }
+       ipc_port_t port = ip_from_mq(mqueue);
+       if (port->ip_specialreply) {
+               return false;
+       }
+       return port->ip_sync_link_state == PORT_SYNC_LINK_ANY;
+}
+
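
The predicate above replaces the old IMQ_KLIST_VALID macro. Restated, imq_klist is safe to treat as a klist exactly when:

    - the object is a port set (always), or
    - the object is a port that is not a special reply port and whose
      ip_sync_link_state is PORT_SYNC_LINK_ANY; in any other state the
      field is being reused for sync-IPC bookkeeping (the "nefarious
      purposes" the comment below alludes to).
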
 /*
  *     Routine:        ipc_mqueue_changed
  *     Purpose:
@@ -429,7 +451,7 @@ ipc_mqueue_changed(
        ipc_space_t     space,
        ipc_mqueue_t    mqueue)
 {
-       if (IMQ_KLIST_VALID(mqueue) && SLIST_FIRST(&mqueue->imq_klist)) {
+       if (ipc_mqueue_has_klist(mqueue) && SLIST_FIRST(&mqueue->imq_klist)) {
                /*
                 * Indicate that this message queue is vanishing
                 *
@@ -440,7 +462,7 @@ ipc_mqueue_changed(
                 * The new process may want to register the port it gets back with an
                 * EVFILT_MACHPORT filter again, and may have pending sync IPC on this
                 * port pending already, in which case we want the imq_klist field to be
-                * reusable for nefarious purposes (see IMQ_SET_INHERITOR).
+                * reusable for nefarious purposes.
                 *
                 * Fortunately, we really don't need this linkage anymore after this
                 * point as EV_VANISHED / EV_EOF will be the last thing delivered ever.
@@ -458,6 +480,11 @@ ipc_mqueue_changed(
                 */
                assert(space);
                knote_vanish(&mqueue->imq_klist, is_active(space));
+       }
+
+       if (io_otype(imq_to_object(mqueue)) == IOT_PORT) {
+               ipc_port_adjust_sync_link_state_locked(ip_from_mq(mqueue), PORT_SYNC_LINK_ANY, NULL);
+       } else {
                klist_init(&mqueue->imq_klist);
        }
 
@@ -516,7 +543,6 @@ ipc_mqueue_send(
                thread_t cur_thread = current_thread();
                ipc_port_t port = ip_from_mq(mqueue);
                struct turnstile *send_turnstile = TURNSTILE_NULL;
-               turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
                uint64_t deadline;
 
                /*
@@ -544,17 +570,8 @@ ipc_mqueue_send(
                    port_send_turnstile_address(port),
                    TURNSTILE_NULL, TURNSTILE_SYNC_IPC);
 
-               /* Check if the port in is in transit, get the destination port's turnstile */
-               if (ip_active(port) &&
-                   port->ip_receiver_name == MACH_PORT_NULL &&
-                   port->ip_destination != NULL) {
-                       inheritor = port_send_turnstile(port->ip_destination);
-               } else {
-                       inheritor = ipc_port_get_inheritor(port);
-               }
-
-               turnstile_update_inheritor(send_turnstile, inheritor,
-                   TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
+               ipc_port_send_update_inheritor(port, send_turnstile,
+                   TURNSTILE_DELAYED_UPDATE);
 
                wresult = waitq_assert_wait64_leeway(
                        &send_turnstile->ts_waitq,
@@ -575,7 +592,7 @@ ipc_mqueue_send(
 
                /* Call turnstile complete with interlock held */
                imq_lock(mqueue);
-               turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL);
+               turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL, TURNSTILE_SYNC_IPC);
                imq_unlock(mqueue);
 
                /* Call cleanup after dropping the interlock */
@@ -636,11 +653,13 @@ ipc_mqueue_override_send(
                ipc_kmsg_t first = ipc_kmsg_queue_first(&mqueue->imq_messages);
 
                if (first && ipc_kmsg_override_qos(&mqueue->imq_messages, first, override)) {
-                       ipc_port_t port = ip_from_mq(mqueue);
+                       ipc_object_t object = imq_to_object(mqueue);
+                       assert(io_otype(object) == IOT_PORT);
+                       ipc_port_t port = ip_object_to_port(object);
                        if (ip_active(port) &&
                            port->ip_receiver_name != MACH_PORT_NULL &&
                            is_active(port->ip_receiver) &&
-                           IMQ_KLIST_VALID(mqueue)) {
+                           ipc_mqueue_has_klist(mqueue)) {
                                KNOTE(&mqueue->imq_klist, 0);
                        }
                }
@@ -787,11 +806,13 @@ ipc_mqueue_post(
                        if (mqueue->imq_msgcount > 0) {
                                if (ipc_kmsg_enqueue_qos(&mqueue->imq_messages, kmsg)) {
                                        /* if the space is dead there is no point calling KNOTE */
-                                       ipc_port_t port = ip_from_mq(mqueue);
+                                       ipc_object_t object = imq_to_object(mqueue);
+                                       assert(io_otype(object) == IOT_PORT);
+                                       ipc_port_t port = ip_object_to_port(object);
                                        if (ip_active(port) &&
                                            port->ip_receiver_name != MACH_PORT_NULL &&
                                            is_active(port->ip_receiver) &&
-                                           IMQ_KLIST_VALID(mqueue)) {
+                                           ipc_mqueue_has_klist(mqueue)) {
                                                KNOTE(&mqueue->imq_klist, 0);
                                        }
                                }
@@ -902,7 +923,7 @@ out_unlock:
 }
 
 
-/* static */ void
+static void
 ipc_mqueue_receive_results(wait_result_t saved_wait_result)
 {
        thread_t                self = current_thread();
@@ -1077,7 +1098,6 @@ ipc_mqueue_receive_on_thread(
        wait_result_t           wresult;
        uint64_t                deadline;
        struct turnstile        *rcv_turnstile = TURNSTILE_NULL;
-       turnstile_inheritor_t   inheritor = NULL;
 
        /* called with mqueue locked */
 
@@ -1179,8 +1199,10 @@ ipc_mqueue_receive_on_thread(
        }
 
        /*
-        * Threads waiting on a port (not portset)
-        * will wait on port's receive turnstile.
+        * Threads waiting on a special reply port
+        * (as opposed to a port set or a regular port)
+        * will wait on its receive turnstile.
+        *
         * Donate waiting thread's turnstile and
         * setup inheritor for special reply port.
         * Based on the state of the special reply
@@ -1195,18 +1217,14 @@ ipc_mqueue_receive_on_thread(
         * will be converted to the turnstile waitq
         * in waitq_assert_wait instead of global waitqs.
         */
-       if (imq_is_queue(mqueue)) {
+       if (imq_is_queue(mqueue) && ip_from_mq(mqueue)->ip_specialreply) {
                ipc_port_t port = ip_from_mq(mqueue);
                rcv_turnstile = turnstile_prepare((uintptr_t)port,
                    port_rcv_turnstile_address(port),
                    TURNSTILE_NULL, TURNSTILE_SYNC_IPC);
 
-               if (port->ip_specialreply) {
-                       inheritor = ipc_port_get_special_reply_port_inheritor(port);
-               }
-
-               turnstile_update_inheritor(rcv_turnstile, inheritor,
-                   (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_DELAYED_UPDATE));
+               ipc_port_recv_update_inheritor(port, rcv_turnstile,
+                   TURNSTILE_DELAYED_UPDATE);
        }
 
        thread_set_pending_block_hint(thread, kThreadWaitPortReceive);
@@ -1592,7 +1610,7 @@ ipc_mqueue_set_gather_member_names(
 
                /* only receive rights can be members of port sets */
                if ((entry->ie_bits & MACH_PORT_TYPE_RECEIVE) != MACH_PORT_TYPE_NONE) {
-                       __IGNORE_WCASTALIGN(ipc_port_t port = (ipc_port_t)entry->ie_object);
+                       ipc_port_t port = ip_object_to_port(entry->ie_object);
                        ipc_mqueue_t mq = &port->ip_messages;
 
                        assert(IP_VALID(port));
@@ -1780,6 +1798,7 @@ ipc_mqueue_copyin(
        ipc_object_t            *objectp)
 {
        ipc_entry_t entry;
+       ipc_entry_bits_t bits;
        ipc_object_t object;
        ipc_mqueue_t mqueue;
 
@@ -1795,24 +1814,23 @@ ipc_mqueue_copyin(
                return MACH_RCV_INVALID_NAME;
        }
 
+       bits = entry->ie_bits;
        object = entry->ie_object;
 
-       if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) {
-               ipc_port_t port;
+       if (bits & MACH_PORT_TYPE_RECEIVE) {
+               ipc_port_t port = ip_object_to_port(object);
 
-               __IGNORE_WCASTALIGN(port = (ipc_port_t) object);
                assert(port != IP_NULL);
 
                ip_lock(port);
-               assert(ip_active(port));
+               require_ip_active(port);
                assert(port->ip_receiver_name == name);
                assert(port->ip_receiver == space);
                is_read_unlock(space);
                mqueue = &port->ip_messages;
-       } else if (entry->ie_bits & MACH_PORT_TYPE_PORT_SET) {
-               ipc_pset_t pset;
+       } else if (bits & MACH_PORT_TYPE_PORT_SET) {
+               ipc_pset_t pset = ips_object_to_pset(object);
 
-               __IGNORE_WCASTALIGN(pset = (ipc_pset_t) object);
                assert(pset != IPS_NULL);
 
                ips_lock(pset);
@@ -1822,6 +1840,10 @@ ipc_mqueue_copyin(
                mqueue = &pset->ips_messages;
        } else {
                is_read_unlock(space);
+               /* guard exception if we never held the receive right in this entry */
+               if ((bits & MACH_PORT_TYPE_EX_RECEIVE) == 0) {
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_RCV_INVALID_NAME);
+               }
                return MACH_RCV_INVALID_NAME;
        }
 
@@ -1837,3 +1859,19 @@ ipc_mqueue_copyin(
        *mqueuep = mqueue;
        return MACH_MSG_SUCCESS;
 }
+
+void
+imq_lock(ipc_mqueue_t mq)
+{
+       ipc_object_t object = imq_to_object(mq);
+       ipc_object_validate(object);
+       waitq_lock(&(mq)->imq_wait_queue);
+}
+
+unsigned int
+imq_lock_try(ipc_mqueue_t mq)
+{
+       ipc_object_t object = imq_to_object(mq);
+       ipc_object_validate(object);
+       return waitq_lock_try(&(mq)->imq_wait_queue);
+}
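/*
 * Editorial sketch (not part of the change; example_peek_msgcount is a
 * hypothetical caller): what moving imq_lock() out of line buys.  Every
 * mqueue lock acquisition now resolves the mqueue back to its enclosing
 * ipc_object and has the zone allocator vouch for it, so a forged or
 * type-confused mqueue pointer panics in ipc_object_validate() instead
 * of "locking" memory it does not own.  Call sites compile unchanged:
 */
static void
example_peek_msgcount(ipc_mqueue_t mq, uint16_t *countp)
{
        imq_lock(mq);                   /* validates, then takes the waitq lock */
        *countp = mq->imq_msgcount;     /* mq proven to be a live IPC object */
        imq_unlock(mq);                 /* plain macro, as before */
}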
index 140ce1dfdee887833e6a48ff4b7ef83263b12158..4e6fb324031b7392dfec4f43669ae2ee847c16b3 100644 (file)
@@ -90,6 +90,9 @@ typedef struct ipc_mqueue {
                        mach_port_name_t        receiver_name;
                        uint16_t                msgcount;
                        uint16_t                qlimit;
+#ifdef __LP64__
+                       uint32_t                qcontext;
+#endif
 #if MACH_FLIPC
                        struct flipc_port       *fport; // Null for local port, or ptr to flipc port
 #endif
@@ -99,30 +102,32 @@ typedef struct ipc_mqueue {
                } pset;
        } data;
        union {
+               /*
+                * Port Sets:
+                *   only use imq_klist
+                *
+                * Special Reply Ports (ip_specialreply == true):
+                *   only use imq_srp_owner_thread
+                *
+                * Ports, based on ip_sync_link_state, use:
+                * - PORT_SYNC_LINK_ANY:            imq_klist
+                * - PORT_SYNC_LINK_WORKLOOP_KNOTE: imq_inheritor_knote
+                * - PORT_SYNC_LINK_WORKLOOP_STASH: imq_inheritor_turnstile
+                * - PORT_SYNC_LINK_RCV_THREAD:     imq_inheritor_thread_ref
+                */
                struct klist imq_klist;
-               uintptr_t imq_inheritor;
+               struct knote *imq_inheritor_knote;
+               struct turnstile *imq_inheritor_turnstile;
+               thread_t imq_inheritor_thread_ref;
+               thread_t imq_srp_owner_thread;
        };
+#ifndef __LP64__
+       uint32_t qcontext;
+#endif
 } *ipc_mqueue_t;
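/*
 * Editorial sketch (toy_imq_sync_inheritor is hypothetical): how the typed
 * union above decodes, following the comment.  The real decoding lives in
 * ipc_port_send_update_inheritor() in ipc_port.c.
 */
static inline void *
toy_imq_sync_inheritor(ipc_port_t port, ipc_mqueue_t mq)
{
        if (port->ip_specialreply) {
                return mq->imq_srp_owner_thread;     /* owner doing checkin, if any */
        }
        switch (port->ip_sync_link_state) {
        case PORT_SYNC_LINK_WORKLOOP_KNOTE:
                return mq->imq_inheritor_knote;      /* knote copied out "through" */
        case PORT_SYNC_LINK_WORKLOOP_STASH:
                return mq->imq_inheritor_turnstile;  /* stashed workloop turnstile */
        case PORT_SYNC_LINK_RCV_THREAD:
                return mq->imq_inheritor_thread_ref; /* receiving thread */
        default: /* PORT_SYNC_LINK_ANY: ordinary knote list */
                return &mq->imq_klist;
        }
}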
 
 #define IMQ_NULL                ((ipc_mqueue_t) 0)
 
-/*
- * When a receive right is in flight, before it can ever be registered with
- * a new knote, its imq_klist field can be overloaded to hold a pointer
- * to the knote that the port is pushing on through its turnstile.
- *
- * if IMQ_KLIST_VALID() returns true, then the imq_klist field can be used,
- * else IMQ_INHERITOR() can be used to get the pointer to the knote currently
- * being the port turnstile inheritor.
- */
-#define IMQ_KLIST_VALID(imq) (((imq)->imq_inheritor & 1) == 0)
-#define IMQ_INHERITOR(imq) ((struct turnstile *)((imq)->imq_inheritor ^ 1))
-#define IMQ_SET_INHERITOR(imq, inheritor) \
-MACRO_BEGIN                                                                   \
-               assert(((imq)->imq_inheritor & 1) || SLIST_EMPTY(&(imq)->imq_klist)); \
-               ((imq)->imq_inheritor = (uintptr_t)(inheritor) | 1);                  \
-MACRO_END
-
 #define imq_wait_queue          data.port.waitq
 #define imq_messages            data.port.messages
 #define imq_msgcount            data.port.msgcount
@@ -133,6 +138,16 @@ MACRO_END
 #define imq_fport               data.port.fport
 #endif
 
+/*
+ * The qcontext structure member fills a 32-bit padding gap in ipc_mqueue.
+ * However, those 32 bits sit in slightly different places on 32-bit and
+ * 64-bit systems.
+ */
+#ifdef __LP64__
+#define imq_context             data.port.qcontext
+#else
+#define imq_context             qcontext
+#endif
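/*
 * Editorial toy (struct toy_layout is illustrative, not kernel code):
 * on LP64 the 32-bit qcontext rides in the tail padding of the port half
 * of the union, so it costs nothing; on ILP32 there is no such gap, so it
 * trails the unions instead.
 */
struct toy_layout {
        union {
                struct {
                        void     *waitq_like;  /* 8 bytes on LP64 */
                        uint16_t  msgcount;
                        uint16_t  qlimit;
#ifdef __LP64__
                        uint32_t  qcontext;    /* reuses the 4-byte tail pad */
#endif
                } port;
        } data;
#ifndef __LP64__
        uint32_t qcontext;                     /* no pad to reuse on ILP32 */
#endif
};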
+
 /*
  * we can use the 'eventmask' bits of the waitq b/c
  * they are only used by global queues
@@ -146,28 +161,24 @@ MACRO_END
 #define imq_is_queue(mq)        waitq_is_queue(&(mq)->imq_wait_queue)
 #define imq_is_valid(mq)        waitq_is_valid(&(mq)->imq_wait_queue)
 
-#define imq_lock(mq)            waitq_lock(&(mq)->imq_wait_queue)
-#define imq_lock_try(mq)        waitq_lock_try(&(mq)->imq_wait_queue)
 #define imq_unlock(mq)          waitq_unlock(&(mq)->imq_wait_queue)
 #define imq_held(mq)            waitq_held(&(mq)->imq_wait_queue)
 #define imq_valid(mq)           waitq_valid(&(mq)->imq_wait_queue)
 
+extern void imq_lock(ipc_mqueue_t mq);
+extern unsigned int imq_lock_try(ipc_mqueue_t mq);
+
 /*
  * Get an ipc_mqueue pointer from a waitq pointer. These are traditionally the
  * same pointer, but this conversion makes no assumptions on union structure
  * member positions - it should allow the waitq to move around in either the
  * port-set mqueue or the port mqueue independently.
  */
-#define imq_from_waitq(waitq)   (waitq_is_set(waitq) ? \
-                                       ((struct ipc_mqueue *)((void *)( \
-                                               (uintptr_t)(waitq) - \
-                                               __offsetof(struct ipc_mqueue, imq_set_queue)) \
-                                       )) : \
-                                       ((struct ipc_mqueue *)((void *)( \
-                                               (uintptr_t)(waitq) - \
-                                               __offsetof(struct ipc_mqueue, imq_wait_queue)) \
-                                       )) \
-                                )
+#define imq_from_waitq(waitq)  (waitq_is_set(waitq) ? \
+               __container_of(waitq, struct ipc_mqueue, imq_set_queue.wqset_q) : \
+               __container_of(waitq, struct ipc_mqueue, imq_wait_queue))
+
+#define imq_to_object(mq) ip_to_object(ip_from_mq(mq))
 
 extern void imq_reserve_and_lock(ipc_mqueue_t mq,
     uint64_t *reserved_prepost);
index 1730c5b41aa2bbc8d636509152bb93b3e1e6f13d..f677c6e28c498b6004a7a692464e4dd02855204b 100644 (file)
@@ -158,7 +158,7 @@ void
 ipc_notify_send_once(
        ipc_port_t      port)
 {
-       ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_SR_NONE, FALSE);
+       ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_RESET_BOOSTRAP_CHECKIN, FALSE);
 
        (void)mach_notify_send_once(port);
        /* send-once right consumed */
index 27a1cca4af9c77bd2ad5a4e48df8a758d3501ae8..76fc96b8e5822d3001df3470a273badce659e290 100644 (file)
@@ -128,7 +128,7 @@ ipc_object_release(
  *             Look up an object in a space.
  *     Conditions:
  *             Nothing locked before.  If successful, the object
- *             is returned locked.  The caller doesn't get a ref.
+ *             is returned active and locked.  The caller doesn't get a ref.
  *     Returns:
  *             KERN_SUCCESS            Object returned locked.
  *             KERN_INVALID_TASK       The space is dead.
@@ -146,6 +146,10 @@ ipc_object_translate(
        ipc_object_t object;
        kern_return_t kr;
 
+       if (!MACH_PORT_RIGHT_VALID_TRANSLATE(right)) {
+               return KERN_INVALID_RIGHT;
+       }
+
        kr = ipc_right_lookup_read(space, name, &entry);
        if (kr != KERN_SUCCESS) {
                return kr;
@@ -163,6 +167,11 @@ ipc_object_translate(
        io_lock(object);
        is_read_unlock(space);
 
+       if (!io_active(object)) {
+               io_unlock(object);
+               return KERN_INVALID_NAME;
+       }
+
        *objectp = object;
        return KERN_SUCCESS;
 }
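/*
 * Editorial note: the two additions above tighten the contract documented
 * earlier -- the requested right is validated before the lookup, and the
 * object is re-checked for liveness under its own lock, so callers of
 * ipc_object_translate() now receive it "active and locked" rather than
 * merely locked.
 */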
@@ -193,8 +202,9 @@ ipc_object_translate_two(
 {
        ipc_entry_t entry1;
        ipc_entry_t entry2;
-       ipc_object_t object;
+       ipc_object_t object1, object2;
        kern_return_t kr;
+       boolean_t doguard = TRUE;
 
        kr = ipc_right_lookup_two_read(space, name1, &entry1, name2, &entry2);
        if (kr != KERN_SUCCESS) {
@@ -203,26 +213,52 @@ ipc_object_translate_two(
        /* space is read-locked and active */
 
        if ((entry1->ie_bits & MACH_PORT_TYPE(right1)) == MACH_PORT_TYPE_NONE) {
+               /* If looking for receive, and the entry used to hold one, give a pass on EXC_GUARD */
+               if ((right1 & MACH_PORT_RIGHT_RECEIVE) == MACH_PORT_RIGHT_RECEIVE &&
+                   (entry1->ie_bits & MACH_PORT_TYPE_EX_RECEIVE) == MACH_PORT_TYPE_EX_RECEIVE) {
+                       doguard = FALSE;
+               }
                is_read_unlock(space);
-               mach_port_guard_exception(name1, 0, 0, kGUARD_EXC_INVALID_RIGHT);
+               if (doguard) {
+                       mach_port_guard_exception(name1, 0, 0, kGUARD_EXC_INVALID_RIGHT);
+               }
                return KERN_INVALID_RIGHT;
        }
 
        if ((entry2->ie_bits & MACH_PORT_TYPE(right2)) == MACH_PORT_TYPE_NONE) {
+               /* If looking for receive, and the entry used to hold one, give a pass on EXC_GUARD */
+               if ((right2 & MACH_PORT_RIGHT_RECEIVE) == MACH_PORT_RIGHT_RECEIVE &&
+                   (entry2->ie_bits & MACH_PORT_TYPE_EX_RECEIVE) == MACH_PORT_TYPE_EX_RECEIVE) {
+                       doguard = FALSE;
+               }
                is_read_unlock(space);
-               mach_port_guard_exception(name2, 0, 0, kGUARD_EXC_INVALID_RIGHT);
+               if (doguard) {
+                       mach_port_guard_exception(name2, 0, 0, kGUARD_EXC_INVALID_RIGHT);
+               }
                return KERN_INVALID_RIGHT;
        }
 
-       object = entry1->ie_object;
-       assert(object != IO_NULL);
-       io_lock(object);
-       *objectp1 = object;
+       object1 = entry1->ie_object;
+       assert(object1 != IO_NULL);
+       io_lock(object1);
+       if (!io_active(object1)) {
+               io_unlock(object1);
+               is_read_unlock(space);
+               return KERN_INVALID_NAME;
+       }
 
-       object = entry2->ie_object;
-       assert(object != IO_NULL);
-       io_lock(object);
-       *objectp2 = object;
+       object2 = entry2->ie_object;
+       assert(object2 != IO_NULL);
+       io_lock(object2);
+       if (!io_active(object2)) {
+               io_unlock(object1);
+               io_unlock(object2);
+               is_read_unlock(space);
+               return KERN_INVALID_NAME;
+       }
+
+       *objectp1 = object1;
+       *objectp2 = object2;
 
        is_read_unlock(space);
        return KERN_SUCCESS;
@@ -343,11 +379,11 @@ ipc_object_alloc(
        }
 
        if (otype == IOT_PORT) {
-               ipc_port_t port = (ipc_port_t)object;
+               ipc_port_t port = ip_object_to_port(object);
 
                bzero((char *)port, sizeof(*port));
        } else if (otype == IOT_PORT_SET) {
-               ipc_pset_t pset = (ipc_pset_t)object;
+               ipc_pset_t pset = ips_object_to_pset(object);
 
                bzero((char *)pset, sizeof(*pset));
        }
@@ -365,10 +401,10 @@ ipc_object_alloc(
        entry->ie_object = object;
        ipc_entry_modified(space, *namep, entry);
 
+       object->io_bits = io_makebits(TRUE, otype, 0);
        io_lock(object);
 
        object->io_references = 1; /* for entry, not caller */
-       object->io_bits = io_makebits(TRUE, otype, 0);
 
        *objectp = object;
        return KERN_SUCCESS;
@@ -412,11 +448,11 @@ ipc_object_alloc_name(
        }
 
        if (otype == IOT_PORT) {
-               ipc_port_t port = (ipc_port_t)object;
+               ipc_port_t port = ip_object_to_port(object);
 
                bzero((char *)port, sizeof(*port));
        } else if (otype == IOT_PORT_SET) {
-               ipc_pset_t pset = (ipc_pset_t)object;
+               ipc_pset_t pset = ips_object_to_pset(object);
 
                bzero((char *)pset, sizeof(*pset));
        }
@@ -438,16 +474,31 @@ ipc_object_alloc_name(
        entry->ie_object = object;
        ipc_entry_modified(space, name, entry);
 
+       object->io_bits = io_makebits(TRUE, otype, 0);
+
        io_lock(object);
        is_write_unlock(space);
 
        object->io_references = 1; /* for entry, not caller */
-       object->io_bits = io_makebits(TRUE, otype, 0);
 
        *objectp = object;
        return KERN_SUCCESS;
 }
 
+/*     Routine:        ipc_object_validate
+ *     Purpose:
+ *             Validates an ipc port or port set as belonging to the correct
+ *             zone.
+ */
+
+void
+ipc_object_validate(
+       ipc_object_t    object)
+{
+       int otype = (io_otype(object) == IOT_PORT_SET) ? IOT_PORT_SET : IOT_PORT;
+       zone_require(object, ipc_object_zones[otype]);
+}
+
 /*
  *     Routine:        ipc_object_copyin_type
  *     Purpose:
@@ -500,7 +551,10 @@ ipc_object_copyin(
        ipc_space_t             space,
        mach_port_name_t        name,
        mach_msg_type_name_t    msgt_name,
-       ipc_object_t            *objectp)
+       ipc_object_t            *objectp,
+       mach_port_context_t     context,
+       mach_msg_guard_flags_t  *guard_flags,
+       ipc_kmsg_flags_t        kmsg_flags)
 {
        ipc_entry_t entry;
        ipc_port_t soright;
@@ -508,6 +562,11 @@ ipc_object_copyin(
        kern_return_t kr;
        int assertcnt = 0;
 
+       ipc_right_copyin_flags_t irc_flags = IPC_RIGHT_COPYIN_FLAGS_DEADOK;
+       if (kmsg_flags & IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND) {
+               irc_flags |= IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND;
+       }
+
        /*
         *      Could first try a read lock when doing
         *      MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND,
@@ -522,10 +581,12 @@ ipc_object_copyin(
 
        release_port = IP_NULL;
        kr = ipc_right_copyin(space, name, entry,
-           msgt_name, IPC_RIGHT_COPYIN_FLAGS_DEADOK,
+           msgt_name, irc_flags,
            objectp, &soright,
            &release_port,
-           &assertcnt);
+           &assertcnt,
+           context,
+           guard_flags);
        if (IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE) {
                ipc_entry_dealloc(space, name, entry);
        }
@@ -585,17 +646,17 @@ ipc_object_copyin_from_kernel(
 
        switch (msgt_name) {
        case MACH_MSG_TYPE_MOVE_RECEIVE: {
-               ipc_port_t port = (ipc_port_t) object;
+               ipc_port_t port = ip_object_to_port(object);
 
                ip_lock(port);
                imq_lock(&port->ip_messages);
-               assert(ip_active(port));
+               require_ip_active(port);
                if (port->ip_destination != IP_NULL) {
                        assert(port->ip_receiver == ipc_space_kernel);
+                       assert(port->ip_immovable_receive == 0);
 
                        /* relevant part of ipc_port_clear_receiver */
-                       ipc_port_set_mscount(port, 0);
-
+                       port->ip_mscount = 0;
                        port->ip_receiver_name = MACH_PORT_NULL;
                        port->ip_destination = IP_NULL;
                }
@@ -605,7 +666,7 @@ ipc_object_copyin_from_kernel(
        }
 
        case MACH_MSG_TYPE_COPY_SEND: {
-               ipc_port_t port = (ipc_port_t) object;
+               ipc_port_t port = ip_object_to_port(object);
 
                ip_lock(port);
                if (ip_active(port)) {
@@ -618,7 +679,7 @@ ipc_object_copyin_from_kernel(
        }
 
        case MACH_MSG_TYPE_MAKE_SEND: {
-               ipc_port_t port = (ipc_port_t) object;
+               ipc_port_t port = ip_object_to_port(object);
 
                ip_lock(port);
                if (ip_active(port)) {
@@ -636,26 +697,25 @@ ipc_object_copyin_from_kernel(
 
        case MACH_MSG_TYPE_MOVE_SEND: {
                /* move naked send right into the message */
-               assert(((ipc_port_t)object)->ip_srights);
+               assert(ip_object_to_port(object)->ip_srights);
                break;
        }
 
        case MACH_MSG_TYPE_MAKE_SEND_ONCE: {
-               ipc_port_t port = (ipc_port_t) object;
+               ipc_port_t port = ip_object_to_port(object);
 
                ip_lock(port);
                if (ip_active(port)) {
                        assert(port->ip_receiver_name != MACH_PORT_NULL);
                }
-               port->ip_sorights++;
-               ip_reference(port);
+               ipc_port_make_sonce_locked(port);
                ip_unlock(port);
                break;
        }
 
        case MACH_MSG_TYPE_MOVE_SEND_ONCE: {
                /* move naked send-once right into the message */
-               assert(((ipc_port_t)object)->ip_sorights);
+               assert(ip_object_to_port(object)->ip_sorights);
                break;
        }
 
@@ -685,15 +745,15 @@ ipc_object_destroy(
 
        switch (msgt_name) {
        case MACH_MSG_TYPE_PORT_SEND:
-               ipc_port_release_send((ipc_port_t) object);
+               ipc_port_release_send(ip_object_to_port(object));
                break;
 
        case MACH_MSG_TYPE_PORT_SEND_ONCE:
-               ipc_notify_send_once((ipc_port_t) object);
+               ipc_notify_send_once(ip_object_to_port(object));
                break;
 
        case MACH_MSG_TYPE_PORT_RECEIVE:
-               ipc_port_release_receive((ipc_port_t) object);
+               ipc_port_release_receive(ip_object_to_port(object));
                break;
 
        default:
@@ -721,15 +781,15 @@ ipc_object_destroy_dest(
 
        switch (msgt_name) {
        case MACH_MSG_TYPE_PORT_SEND:
-               ipc_port_release_send((ipc_port_t) object);
+               ipc_port_release_send(ip_object_to_port(object));
                break;
 
        case MACH_MSG_TYPE_PORT_SEND_ONCE:
                if (io_active(object) &&
-                   !ip_full_kernel((ipc_port_t) object)) {
-                       ipc_notify_send_once((ipc_port_t) object);
+                   !ip_full_kernel(ip_object_to_port(object))) {
+                       ipc_notify_send_once(ip_object_to_port(object));
                } else {
-                       ipc_port_release_sonce((ipc_port_t) object);
+                       ipc_port_release_sonce(ip_object_to_port(object));
                }
                break;
 
@@ -738,6 +798,98 @@ ipc_object_destroy_dest(
        }
 }
 
+/*
+ *     Routine:        ipc_object_insert_send_right
+ *     Purpose:
+ *             Insert a send right into an object already in the space.
+ *             The specified name must already point to a valid object.
+ *
+ *             Note: This really is a combined copyin()/copyout()
+ *             that avoids most of the overhead of being implemented that way.
+ *
+ *             This is the fastpath for mach_port_insert_right.
+ *
+ *     Conditions:
+ *             Nothing locked.
+ *
+ *             msgt_name must be MACH_MSG_TYPE_MAKE_SEND or
+ *             MACH_MSG_TYPE_COPY_SEND.
+ *
+ *     Returns:
+ *             KERN_SUCCESS            Copied out object, consumed ref.
+ *             KERN_INVALID_TASK       The space is dead.
+ *             KERN_INVALID_NAME       Name doesn't exist in space.
+ *             KERN_INVALID_CAPABILITY The object is dead.
+ *             KERN_RIGHT_EXISTS       Space has rights under another name.
+ */
+kern_return_t
+ipc_object_insert_send_right(
+       ipc_space_t             space,
+       mach_port_name_t        name,
+       mach_msg_type_name_t    msgt_name)
+{
+       ipc_entry_bits_t bits;
+       ipc_object_t object;
+       ipc_entry_t entry;
+       kern_return_t kr;
+
+       assert(msgt_name == MACH_MSG_TYPE_MAKE_SEND ||
+           msgt_name == MACH_MSG_TYPE_COPY_SEND);
+
+       kr = ipc_right_lookup_write(space, name, &entry);
+       if (kr != KERN_SUCCESS) {
+               return kr;
+       }
+       /* space is write-locked and active */
+
+       if (!IO_VALID(entry->ie_object)) {
+               is_write_unlock(space);
+               return KERN_INVALID_CAPABILITY;
+       }
+
+       bits = entry->ie_bits;
+       object = entry->ie_object;
+
+       io_lock(object);
+       if (!io_active(object)) {
+               kr = KERN_INVALID_CAPABILITY;
+       } else if (msgt_name == MACH_MSG_TYPE_MAKE_SEND) {
+               if (bits & MACH_PORT_TYPE_RECEIVE) {
+                       ipc_port_t port = ip_object_to_port(object);
+                       port->ip_mscount++;
+                       if ((bits & MACH_PORT_TYPE_SEND) == 0) {
+                               port->ip_srights++;
+                               bits |= MACH_PORT_TYPE_SEND;
+                       }
+                       /* leave urefs pegged to maximum if it overflowed */
+                       if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) {
+                               bits += 1; /* increment urefs */
+                       }
+                       entry->ie_bits = bits;
+                       ipc_entry_modified(space, name, entry);
+                       kr = KERN_SUCCESS;
+               } else {
+                       kr = KERN_INVALID_RIGHT;
+               }
+       } else { // MACH_MSG_TYPE_COPY_SEND
+               if (bits & MACH_PORT_TYPE_SEND) {
+                       /* leave urefs pegged to maximum if it overflowed */
+                       if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) {
+                               entry->ie_bits = bits + 1; /* increment urefs */
+                       }
+                       ipc_entry_modified(space, name, entry);
+                       kr = KERN_SUCCESS;
+               } else {
+                       kr = KERN_INVALID_RIGHT;
+               }
+       }
+
+       io_unlock(object);
+       is_write_unlock(space);
+
+       return kr;
+}
+
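/*
 * Editorial usage sketch (example_bump_send_right is a hypothetical
 * caller, not from the change): the routine's contract is that `name`
 * already denotes the port, so the fastpath either bumps the send right
 * and urefs under that same name or reports why it could not, and the
 * caller simply maps the result out.
 */
static kern_return_t
example_bump_send_right(ipc_space_t space, mach_port_name_t name)
{
        kern_return_t kr;

        kr = ipc_object_insert_send_right(space, name, MACH_MSG_TYPE_COPY_SEND);
        /*
         * KERN_SUCCESS:            urefs on the existing send right bumped
         * KERN_INVALID_RIGHT:      entry lacks the needed right
         * KERN_INVALID_CAPABILITY: the port is dead
         */
        return kr;
}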
 /*
  *     Routine:        ipc_object_copyout
  *     Purpose:
@@ -760,7 +912,8 @@ ipc_object_copyout(
        ipc_space_t             space,
        ipc_object_t            object,
        mach_msg_type_name_t    msgt_name,
-       boolean_t               overflow,
+       mach_port_context_t     *context,
+       mach_msg_guard_flags_t  *guard_flags,
        mach_port_name_t        *namep)
 {
        struct knote *kn = current_thread()->ith_knote;
@@ -773,7 +926,7 @@ ipc_object_copyout(
 
        if (ITH_KNOTE_VALID(kn, msgt_name)) {
                filt_machport_turnstile_prepare_lazily(kn,
-                   msgt_name, (ipc_port_t)object);
+                   msgt_name, ip_object_to_port(object));
        }
 
        is_write_lock(space);
@@ -822,7 +975,7 @@ ipc_object_copyout(
        /* space is write-locked and active, object is locked and active */
 
        kr = ipc_right_copyout(space, name, entry,
-           msgt_name, overflow, object);
+           msgt_name, context, guard_flags, object);
 
        /* object is unlocked */
        is_write_unlock(space);
@@ -857,14 +1010,12 @@ ipc_object_copyout_name(
        ipc_space_t             space,
        ipc_object_t            object,
        mach_msg_type_name_t    msgt_name,
-       boolean_t               overflow,
        mach_port_name_t        name)
 {
        mach_port_name_t oname;
        ipc_entry_t oentry;
        ipc_entry_t entry;
        kern_return_t kr;
-       struct knote *kn = current_thread()->ith_knote;
 
 #if IMPORTANCE_INHERITANCE
        int assertcnt = 0;
@@ -874,11 +1025,6 @@ ipc_object_copyout_name(
        assert(IO_VALID(object));
        assert(io_otype(object) == IOT_PORT);
 
-       if (ITH_KNOTE_VALID(kn, msgt_name)) {
-               filt_machport_turnstile_prepare_lazily(kn,
-                   msgt_name, (ipc_port_t)object);
-       }
-
        kr = ipc_entry_alloc_name(space, name, &entry);
        if (kr != KERN_SUCCESS) {
                return kr;
@@ -931,7 +1077,7 @@ ipc_object_copyout_name(
         * port has assertions (and the task wants them).
         */
        if (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE) {
-               ipc_port_t port = (ipc_port_t)object;
+               ipc_port_t port = ip_object_to_port(object);
 
                if (space->is_task != TASK_NULL) {
                        task_imp = space->is_task->task_imp_base;
@@ -951,7 +1097,7 @@ ipc_object_copyout_name(
 #endif /* IMPORTANCE_INHERITANCE */
 
        kr = ipc_right_copyout(space, name, entry,
-           msgt_name, overflow, object);
+           msgt_name, NULL, NULL, object);
 
        /* object is unlocked */
        is_write_unlock(space);
@@ -994,8 +1140,6 @@ ipc_object_copyout_dest(
        assert(IO_VALID(object));
        assert(io_active(object));
 
-       io_release(object);
-
        /*
         *      If the space is the receiver/owner of the object,
         *      then we quietly consume the right and return
@@ -1005,7 +1149,7 @@ ipc_object_copyout_dest(
 
        switch (msgt_name) {
        case MACH_MSG_TYPE_PORT_SEND: {
-               ipc_port_t port = (ipc_port_t) object;
+               ipc_port_t port = ip_object_to_port(object);
                ipc_port_t nsrequest = IP_NULL;
                mach_port_mscount_t mscount;
 
@@ -1021,16 +1165,20 @@ ipc_object_copyout_dest(
                        nsrequest = port->ip_nsrequest;
                        port->ip_nsrequest = IP_NULL;
                        mscount = port->ip_mscount;
-                       ip_unlock(port);
+                       ipc_port_clear_sync_rcv_thread_boost_locked(port);
+                       /* port unlocked */
                        ipc_notify_no_senders(nsrequest, mscount);
                } else {
-                       ip_unlock(port);
+                       ipc_port_clear_sync_rcv_thread_boost_locked(port);
+                       /* port unlocked */
                }
+
+               ip_release(port);
                break;
        }
 
        case MACH_MSG_TYPE_PORT_SEND_ONCE: {
-               ipc_port_t port = (ipc_port_t) object;
+               ipc_port_t port = ip_object_to_port(object);
 
                assert(port->ip_sorights > 0);
 
@@ -1039,7 +1187,9 @@ ipc_object_copyout_dest(
 
                        port->ip_sorights--;
                        name = port->ip_receiver_name;
-                       ip_unlock(port);
+                       ipc_port_clear_sync_rcv_thread_boost_locked(port);
+                       /* port unlocked */
+                       ip_release(port);
                } else {
                        /*
                         *      A very bizarre case.  The message
@@ -1050,7 +1200,6 @@ ipc_object_copyout_dest(
                         *      so generate a send-once notification.
                         */
 
-                       ip_reference(port); /* restore ref */
                        ip_unlock(port);
 
                        ipc_notify_send_once(port);
@@ -1069,52 +1218,30 @@ ipc_object_copyout_dest(
 }
 
 /*
- *     Routine:        ipc_object_rename
+ *     Routine:        io_lock
  *     Purpose:
- *             Rename an entry in a space.
- *     Conditions:
- *             Nothing locked.
- *     Returns:
- *             KERN_SUCCESS            Renamed the entry.
- *             KERN_INVALID_TASK       The space was dead.
- *             KERN_INVALID_NAME       oname didn't denote an entry.
- *             KERN_NAME_EXISTS        nname already denoted an entry.
- *             KERN_RESOURCE_SHORTAGE  Couldn't allocate new entry.
+ *             Validate, then acquire a lock on an ipc object
  */
 
-kern_return_t
-ipc_object_rename(
-       ipc_space_t             space,
-       mach_port_name_t        oname,
-       mach_port_name_t        nname)
+void
+io_lock(ipc_object_t io)
 {
-       ipc_entry_t oentry, nentry;
-       kern_return_t kr;
-
-       kr = ipc_entry_alloc_name(space, nname, &nentry);
-       if (kr != KERN_SUCCESS) {
-               return kr;
-       }
-
-       /* space is write-locked and active */
-
-       if (ipc_right_inuse(space, nname, nentry)) {
-               /* space is unlocked */
-               return KERN_NAME_EXISTS;
-       }
-
-       /* don't let ipc_entry_lookup see the uninitialized new entry */
+       ipc_object_validate(io);
+       lck_spin_lock_grp(&(io)->io_lock_data, &ipc_lck_grp);
+}
 
-       if ((oname == nname) ||
-           ((oentry = ipc_entry_lookup(space, oname)) == IE_NULL)) {
-               ipc_entry_dealloc(space, nname, nentry);
-               is_write_unlock(space);
-               return KERN_INVALID_NAME;
-       }
+/*
+ *     Routine:        io_lock_try
+ *     Purpose:
+ *             Validate, then try to acquire a lock on an object;
+ *             fail if the lock is already held
+ */
 
-       kr = ipc_right_rename(space, oname, oentry, nname, nentry);
-       /* space is unlocked */
-       return kr;
+boolean_t
+io_lock_try(ipc_object_t io)
+{
+       ipc_object_validate(io);
+       return lck_spin_try_lock_grp(&(io)->io_lock_data, &ipc_lck_grp);
 }
 
 /*
@@ -1126,11 +1253,8 @@ io_free(
        unsigned int    otype,
        ipc_object_t    object)
 {
-       ipc_port_t      port;
-
        if (otype == IOT_PORT) {
-               port = (ipc_port_t) object;
-               ipc_port_finalize(port);
+               ipc_port_finalize(ip_object_to_port(object));
        }
        io_lock_destroy(object);
        zfree(ipc_object_zones[otype], object);
index 2e23f5681c25e68ec6e339a406ce99bd6653d0a4..77ddc133353c14e642f6350420de4cf031facc18 100644 (file)
@@ -99,8 +99,8 @@ typedef natural_t ipc_object_type_t;
 struct ipc_object {
        ipc_object_bits_t io_bits;
        ipc_object_refs_t io_references;
-       lck_spin_t      io_lock_data;
-};
+       lck_spin_t        io_lock_data;
+} __attribute__((aligned(8)));
 
 /*
  * If another object type needs to participate in io_kotype()-based
@@ -131,7 +131,8 @@ struct ipc_object_header {
  *     definitions in ipc_port.h.
  */
 #define IO_BITS_PORT_INFO       0x0000f000      /* stupid port tricks */
-#define IO_BITS_KOTYPE          0x00000fff      /* used by the object */
+#define IO_BITS_KOTYPE          0x000007ff      /* used by the object */
+#define IO_BITS_KOBJECT         0x00000800      /* port belongs to a kobject */
 #define IO_BITS_OTYPE           0x7fff0000      /* determines a zone */
 #define IO_BITS_ACTIVE          0x80000000      /* is object alive? */
 
@@ -139,6 +140,7 @@ struct ipc_object_header {
 
 #define io_otype(io)            (((io)->io_bits & IO_BITS_OTYPE) >> 16)
 #define io_kotype(io)           ((io)->io_bits & IO_BITS_KOTYPE)
+#define io_is_kobject(io)       (((io)->io_bits & IO_BITS_KOBJECT) != IKOT_NONE)
 
 #define io_makebits(active, otype, kotype)      \
        (((active) ? IO_BITS_ACTIVE : 0) | ((otype) << 16) | (kotype))
@@ -151,6 +153,7 @@ struct ipc_object_header {
 #define IOT_NUMBER              2               /* number of types used */
 
 extern zone_t ipc_object_zones[IOT_NUMBER];
+extern lck_grp_t        ipc_lck_grp;
 
 #define io_alloc(otype)         \
                ((ipc_object_t) zalloc(ipc_object_zones[(otype)]))
@@ -167,15 +170,18 @@ extern void     io_free(
        lck_spin_init(&(io)->io_lock_data, &ipc_lck_grp, &ipc_lck_attr)
 #define io_lock_destroy(io) \
        lck_spin_destroy(&(io)->io_lock_data, &ipc_lck_grp)
-#define io_lock(io) \
-       lck_spin_lock_grp(&(io)->io_lock_data, &ipc_lck_grp)
-#define io_lock_try(io) \
-       lck_spin_try_lock_grp(&(io)->io_lock_data, &ipc_lck_grp)
+#define io_lock_held(io) \
+       LCK_SPIN_ASSERT(&(io)->io_lock_data, LCK_ASSERT_OWNED)
 #define io_lock_held_kdp(io) \
        kdp_lck_spin_is_acquired(&(io)->io_lock_data)
 #define io_unlock(io) \
        lck_spin_unlock(&(io)->io_lock_data)
 
+extern void io_lock(
+       ipc_object_t io);
+extern boolean_t io_lock_try(
+       ipc_object_t io);
+
 #define _VOLATILE_ volatile
 
 /* Sanity check the ref count.  If it is 0, we may be doubly zfreeing.
@@ -191,7 +197,7 @@ extern void     io_free(
  * and zfree modifies that to point to the next free zone element.
  */
 #define IO_MAX_REFERENCES                                               \
-       (unsigned)(~0 ^ (1 << (sizeof(int)*BYTE_SIZE - 1)))
+       (unsigned)(~0 ^ (1U << (sizeof(int)*BYTE_SIZE - 1)))
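/* For a 32-bit int: ~0 ^ (1U << 31) == 0xffffffff ^ 0x80000000 == 0x7fffffff. */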
 
 static inline void
 io_reference(ipc_object_t io)
@@ -199,8 +205,10 @@ io_reference(ipc_object_t io)
        ipc_object_refs_t new_io_references;
        ipc_object_refs_t old_io_references;
 
-       assert((io)->io_references > 0 &&
-           (io)->io_references < IO_MAX_REFERENCES);
+       if ((io)->io_references == 0 ||
+           (io)->io_references >= IO_MAX_REFERENCES) {
+               panic("%s: reference count %u is invalid\n", __func__, (io)->io_references);
+       }
 
        do {
                old_io_references = (io)->io_references;
@@ -219,8 +227,10 @@ io_release(ipc_object_t io)
        ipc_object_refs_t new_io_references;
        ipc_object_refs_t old_io_references;
 
-       assert((io)->io_references > 0 &&
-           (io)->io_references < IO_MAX_REFERENCES);
+       if ((io)->io_references == 0 ||
+           (io)->io_references >= IO_MAX_REFERENCES) {
+               panic("%s: reference count %u is invalid\n", __func__, (io)->io_references);
+       }
 
        do {
                old_io_references = (io)->io_references;
@@ -277,6 +287,10 @@ extern kern_return_t ipc_object_translate_two(
        mach_port_right_t       right2,
        ipc_object_t            *objectp2);
 
+/* Validate an object as belonging to the correct zone */
+extern void ipc_object_validate(
+       ipc_object_t object);
+
 /* Allocate a dead-name entry */
 extern kern_return_t
 ipc_object_alloc_dead(
@@ -315,7 +329,10 @@ extern kern_return_t ipc_object_copyin(
        ipc_space_t             space,
        mach_port_name_t        name,
        mach_msg_type_name_t    msgt_name,
-       ipc_object_t            *objectp);
+       ipc_object_t            *objectp,
+       mach_port_context_t     context,
+       mach_msg_guard_flags_t  *guard_flags,
+       uint32_t                kmsg_flags);
 
 /* Copyin a naked capability from the kernel */
 extern void ipc_object_copyin_from_kernel(
@@ -332,12 +349,19 @@ extern void ipc_object_destroy_dest(
        ipc_object_t            object,
        mach_msg_type_name_t    msgt_name);
 
+/* Insert a send right into an object already in the current space */
+extern kern_return_t ipc_object_insert_send_right(
+       ipc_space_t             space,
+       mach_port_name_t        name,
+       mach_msg_type_name_t    msgt_name);
+
 /* Copyout a capability, placing it into a space */
 extern kern_return_t ipc_object_copyout(
        ipc_space_t             space,
        ipc_object_t            object,
        mach_msg_type_name_t    msgt_name,
-       boolean_t               overflow,
+       mach_port_context_t     *context,
+       mach_msg_guard_flags_t  *guard_flags,
        mach_port_name_t        *namep);
 
 /* Copyout a capability with a name, placing it into a space */
@@ -345,7 +369,6 @@ extern kern_return_t ipc_object_copyout_name(
        ipc_space_t             space,
        ipc_object_t            object,
        mach_msg_type_name_t    msgt_name,
-       boolean_t               overflow,
        mach_port_name_t        name);
 
 /* Translate/consume the destination right of a message */
@@ -355,10 +378,4 @@ extern void ipc_object_copyout_dest(
        mach_msg_type_name_t    msgt_name,
        mach_port_name_t        *namep);
 
-/* Rename an entry in a space */
-extern kern_return_t ipc_object_rename(
-       ipc_space_t             space,
-       mach_port_name_t        oname,
-       mach_port_name_t        nname);
-
 #endif  /* _IPC_IPC_OBJECT_H_ */
index ee9a7571e557e2bfb34c3c386cfe4ea9d16faac0..cd8c04b817db9f067c89650b18f7f37ca8d325b6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <ipc/ipc_notify.h>
 #include <ipc/ipc_table.h>
 #include <ipc/ipc_importance.h>
-#include <machine/machlimits.h>
+#include <machine/limits.h>
 #include <kern/turnstile.h>
 
 #include <security/mac_mach_internal.h>
 
 #include <string.h>
 
-decl_lck_spin_data(, ipc_port_multiple_lock_data)
+decl_lck_spin_data(, ipc_port_multiple_lock_data);
 ipc_port_timestamp_t    ipc_port_timestamp_data;
 int ipc_portbt;
+extern int prioritize_launch;
 
 #if     MACH_ASSERT
 void    ipc_port_init_debug(
@@ -114,6 +115,14 @@ void    ipc_port_callstack_init_debug(
 
 #endif  /* MACH_ASSERT */
 
+static void
+ipc_port_send_turnstile_recompute_push_locked(
+       ipc_port_t port);
+
+static thread_t
+ipc_port_get_watchport_inheritor(
+       ipc_port_t port);
+
 void
 ipc_port_release(ipc_port_t port)
 {
@@ -180,7 +189,7 @@ ipc_port_request_alloc(
        *importantp = FALSE;
 #endif /* IMPORTANCE_INHERITANCE */
 
-       assert(ip_active(port));
+       require_ip_active(port);
        assert(name != MACH_PORT_NULL);
        assert(soright != IP_NULL);
 
@@ -248,8 +257,7 @@ ipc_port_request_grow(
 {
        ipc_table_size_t its;
        ipc_port_request_t otable, ntable;
-
-       assert(ip_active(port));
+       require_ip_active(port);
 
        otable = port->ip_requests;
        if (otable == IPR_NULL) {
@@ -366,7 +374,7 @@ ipc_port_request_sparm(
        if (index != IE_REQ_NONE) {
                ipc_port_request_t ipr, table;
 
-               assert(ip_active(port));
+               require_ip_active(port);
 
                table = port->ip_requests;
                assert(table != IPR_NULL);
@@ -456,7 +464,7 @@ ipc_port_request_cancel(
        ipc_port_request_t ipr, table;
        ipc_port_t request = IP_NULL;
 
-       assert(ip_active(port));
+       require_ip_active(port);
        table = port->ip_requests;
        assert(table != IPR_NULL);
 
@@ -492,8 +500,7 @@ ipc_port_pdrequest(
        ipc_port_t      *previousp)
 {
        ipc_port_t previous;
-
-       assert(ip_active(port));
+       require_ip_active(port);
 
        previous = port->ip_pdrequest;
        port->ip_pdrequest = notify;
@@ -523,8 +530,7 @@ ipc_port_nsrequest(
 {
        ipc_port_t previous;
        mach_port_mscount_t mscount;
-
-       assert(ip_active(port));
+       require_ip_active(port);
 
        previous = port->ip_nsrequest;
        mscount = port->ip_mscount;
@@ -579,7 +585,7 @@ ipc_port_clear_receiver(
 
        /*
         * Send anyone waiting on the port's queue directly away.
-        * Also clear the mscount and seqno.
+        * Also clear the mscount, seqno, and guard bits.
         */
        imq_lock(mqueue);
        if (port->ip_receiver_name) {
@@ -590,6 +596,11 @@ ipc_port_clear_receiver(
        port->ip_mscount = 0;
        mqueue->imq_seqno = 0;
        port->ip_context = port->ip_guarded = port->ip_strict_guard = 0;
+       /*
+        * clear the immovable bit so the port can move back to anyone listening
+        * for the port destroy notification
+        */
+       port->ip_immovable_receive = 0;
 
        if (should_destroy) {
                /*
@@ -644,6 +655,7 @@ ipc_port_init(
 
        port->ip_premsg = IKM_NULL;
        port->ip_context = 0;
+       port->ip_reply_context = 0;
 
        port->ip_sprequests  = 0;
        port->ip_spimportant = 0;
@@ -652,12 +664,17 @@ ipc_port_init(
 
        port->ip_guarded      = 0;
        port->ip_strict_guard = 0;
+       port->ip_immovable_receive = 0;
+       port->ip_no_grant    = 0;
+       port->ip_immovable_send = 0;
        port->ip_impcount    = 0;
 
        port->ip_specialreply = 0;
        port->ip_sync_link_state = PORT_SYNC_LINK_ANY;
+       port->ip_sync_bootstrap_checkin = 0;
+       port->ip_watchport_elem = NULL;
 
-       reset_ip_srp_bits(port);
+       ipc_special_reply_port_bits_reset(port);
 
        port->ip_send_turnstile = TURNSTILE_NULL;
 
@@ -682,20 +699,26 @@ ipc_port_init(
 kern_return_t
 ipc_port_alloc(
        ipc_space_t             space,
+       bool                    make_send_right,
        mach_port_name_t        *namep,
        ipc_port_t              *portp)
 {
        ipc_port_t port;
        mach_port_name_t name;
        kern_return_t kr;
+       mach_port_type_t type = MACH_PORT_TYPE_RECEIVE;
+       mach_port_urefs_t urefs = 0;
 
 #if     MACH_ASSERT
        uintptr_t buf[IP_CALLSTACK_MAX];
        ipc_port_callstack_init_debug(&buf[0], IP_CALLSTACK_MAX);
 #endif /* MACH_ASSERT */
 
-       kr = ipc_object_alloc(space, IOT_PORT,
-           MACH_PORT_TYPE_RECEIVE, 0,
+       if (make_send_right) {
+               type |= MACH_PORT_TYPE_SEND;
+               urefs = 1;
+       }
+       kr = ipc_object_alloc(space, IOT_PORT, type, urefs,
            &name, (ipc_object_t *) &port);
        if (kr != KERN_SUCCESS) {
                return kr;
@@ -704,6 +727,12 @@ ipc_port_alloc(
        /* port and space are locked */
        ipc_port_init(port, space, name);
 
+       if (make_send_right) {
+               /* ipc_object_alloc() already made the entry reference */
+               port->ip_srights++;
+               port->ip_mscount++;
+       }
+
 #if     MACH_ASSERT
        ipc_port_init_debug(port, &buf[0], IP_CALLSTACK_MAX);
 #endif  /* MACH_ASSERT */
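/*
 * Editorial usage sketch (example_alloc_with_send is a hypothetical
 * caller): with make_send_right set, a single call yields a name denoting
 * both the receive right and a send right with one uref, replacing the
 * old two-step of allocating the receive right and then inserting
 * MACH_MSG_TYPE_MAKE_SEND on it.
 */
static kern_return_t
example_alloc_with_send(ipc_space_t space, mach_port_name_t *namep)
{
        ipc_port_t port;
        kern_return_t kr;

        kr = ipc_port_alloc(space, true, namep, &port);
        if (kr == KERN_SUCCESS) {
                ip_unlock(port);  /* per the locking comments above, the port
                                   * comes back locked from the alloc path */
        }
        return kr;
}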
@@ -898,6 +927,7 @@ ipc_port_destroy(ipc_port_t port)
        ipc_mqueue_t mqueue;
        ipc_kmsg_t kmsg;
        boolean_t special_reply = port->ip_specialreply;
+       struct task_watchport_elem *watchport_elem = NULL;
 
 #if IMPORTANCE_INHERITANCE
        ipc_importance_task_t release_imp_task = IIT_NULL;
@@ -906,10 +936,13 @@ ipc_port_destroy(ipc_port_t port)
        natural_t assertcnt = 0;
 #endif /* IMPORTANCE_INHERITANCE */
 
-       assert(ip_active(port));
+       require_ip_active(port);
        /* port->ip_receiver_name is garbage */
        /* port->ip_receiver/port->ip_destination is garbage */
 
+       /* clear any reply-port context */
+       port->ip_reply_context = 0;
+
        /* check for a backup port */
        pdrequest = port->ip_pdrequest;
 
@@ -944,20 +977,27 @@ ipc_port_destroy(ipc_port_t port)
 
                /* we assume the ref for pdrequest */
                port->ip_pdrequest = IP_NULL;
-               ip_unlock(port);
+
+               imq_lock(&port->ip_messages);
+               watchport_elem = ipc_port_clear_watchport_elem_internal(port);
+               ipc_port_send_turnstile_recompute_push_locked(port);
+               /* mqueue and port unlocked */
 
                if (special_reply) {
                        ipc_port_adjust_special_reply_port(port,
                            IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE, FALSE);
                }
+
+               if (watchport_elem) {
+                       task_watchport_elem_deallocate(watchport_elem);
+                       watchport_elem = NULL;
+               }
                /* consumes our refs for port and pdrequest */
                ipc_notify_port_destroyed(pdrequest, port);
 
                goto drop_assertions;
        }
 
-       nsrequest = port->ip_nsrequest;
-
        /*
         * The mach_msg_* paths don't hold a port lock, they only hold a
         * reference to the port object. If a thread raced us and is now
@@ -973,6 +1013,11 @@ ipc_port_destroy(ipc_port_t port)
        assert(port->ip_in_pset == 0);
        assert(port->ip_mscount == 0);
 
+       imq_lock(&port->ip_messages);
+       watchport_elem = ipc_port_clear_watchport_elem_internal(port);
+       imq_unlock(&port->ip_messages);
+       nsrequest = port->ip_nsrequest;
+
        /*
         * If the port has a preallocated message buffer and that buffer
         * is not inuse, free it.  If it has an inuse one, then the kmsg
@@ -988,14 +1033,26 @@ ipc_port_destroy(ipc_port_t port)
                assert(kmsg != IKM_NULL);
                inuse_port = ikm_prealloc_inuse_port(kmsg);
                ipc_kmsg_clear_prealloc(kmsg, port);
-               ip_unlock(port);
+
+               imq_lock(&port->ip_messages);
+               ipc_port_send_turnstile_recompute_push_locked(port);
+               /* mqueue and port unlocked */
+
                if (inuse_port != IP_NULL) {
                        assert(inuse_port == port);
                } else {
                        ipc_kmsg_free(kmsg);
                }
        } else {
-               ip_unlock(port);
+               imq_lock(&port->ip_messages);
+               ipc_port_send_turnstile_recompute_push_locked(port);
+               /* mqueue and port unlocked */
+       }
+
+       /* Deallocate the watchport element */
+       if (watchport_elem) {
+               task_watchport_elem_deallocate(watchport_elem);
+               watchport_elem = NULL;
        }
 
        /* unlink the kmsg from special reply port */
@@ -1077,6 +1134,7 @@ ipc_port_check_circularity(
        return ipc_importance_check_circularity(port, dest);
 #else
        ipc_port_t base;
+       struct task_watchport_elem *watchport_elem = NULL;
 
        assert(port != IP_NULL);
        assert(dest != IP_NULL);
@@ -1134,8 +1192,7 @@ ipc_port_check_circularity(
                ipc_port_multiple_unlock();
 
                /* port (== base) is in limbo */
-
-               assert(ip_active(port));
+               require_ip_active(port);
                assert(port->ip_receiver_name == MACH_PORT_NULL);
                assert(port->ip_destination == IP_NULL);
 
@@ -1144,8 +1201,7 @@ ipc_port_check_circularity(
                        ipc_port_t next;
 
                        /* dest is in transit or in limbo */
-
-                       assert(ip_active(base));
+                       require_ip_active(base);
                        assert(base->ip_receiver_name == MACH_PORT_NULL);
 
                        next = base->ip_destination;
@@ -1170,11 +1226,18 @@ not_circular:
        imq_lock(&port->ip_messages);
 
        /* port is in limbo */
-
-       assert(ip_active(port));
+       require_ip_active(port);
        assert(port->ip_receiver_name == MACH_PORT_NULL);
        assert(port->ip_destination == IP_NULL);
 
+       /* Clear the watchport boost */
+       watchport_elem = ipc_port_clear_watchport_elem_internal(port);
+
+       /* Check if the port is being enqueued as a part of sync bootstrap checkin */
+       if (dest->ip_specialreply && dest->ip_sync_bootstrap_checkin) {
+               port->ip_sync_bootstrap_checkin = 1;
+       }
+
        ip_reference(dest);
        port->ip_destination = dest;
 
@@ -1185,6 +1248,13 @@ not_circular:
                    port_send_turnstile_address(port),
                    TURNSTILE_NULL, TURNSTILE_SYNC_IPC);
 
+               /*
+                * This is what ipc_port_adjust_port_locked() would do,
+                * but we also need to drop more locks before
+                * calling turnstile_update_inheritor_complete().
+                */
+               ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
+
                turnstile_update_inheritor(send_turnstile, port_send_turnstile(dest),
                    (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE));
 
@@ -1204,8 +1274,7 @@ not_circular:
                }
 
                /* port is in transit */
-
-               assert(ip_active(dest));
+               require_ip_active(dest);
                assert(dest->ip_receiver_name == MACH_PORT_NULL);
                assert(dest->ip_destination != IP_NULL);
 
@@ -1227,35 +1296,153 @@ not_circular:
 
                /* Take the mq lock to call turnstile complete */
                imq_lock(&port->ip_messages);
-               turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL);
+               turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL, TURNSTILE_SYNC_IPC);
                send_turnstile = TURNSTILE_NULL;
                imq_unlock(&port->ip_messages);
                turnstile_cleanup();
        }
 
+       if (watchport_elem) {
+               task_watchport_elem_deallocate(watchport_elem);
+       }
+
        return FALSE;
 #endif /* !IMPORTANCE_INHERITANCE */
 }
 
-struct turnstile *
-ipc_port_get_inheritor(ipc_port_t port)
+/*
+ * Update the recv turnstile inheritor for a port.
+ *
+ * Sync IPC through the port receive turnstile only happens for the special
+ * reply port case. It has three sub-cases:
+ *
+ * 1. a send-once right is in transit, and pushes on the send turnstile of its
+ *    destination mqueue.
+ *
+ * 2. a send-once right has been stashed on a knote it was copied out "through",
+ *    as the first such copied out port.
+ *
+ * 3. a send-once right has been stashed on a knote it was copied out "through",
+ *    as the second or later copied out port.
+ */
+void
+ipc_port_recv_update_inheritor(
+       ipc_port_t port,
+       struct turnstile *rcv_turnstile,
+       turnstile_update_flags_t flags)
 {
-       ipc_mqueue_t mqueue = &port->ip_messages;
+       struct turnstile *inheritor = TURNSTILE_NULL;
        struct knote *kn;
 
-       assert(imq_held(mqueue));
+       if (ip_active(port) && port->ip_specialreply) {
+               imq_held(&port->ip_messages);
+
+               switch (port->ip_sync_link_state) {
+               case PORT_SYNC_LINK_PORT:
+                       if (port->ip_sync_inheritor_port != NULL) {
+                               inheritor = port_send_turnstile(port->ip_sync_inheritor_port);
+                       }
+                       break;
 
-       if (!IMQ_KLIST_VALID(mqueue)) {
-               return IMQ_INHERITOR(mqueue);
+               case PORT_SYNC_LINK_WORKLOOP_KNOTE:
+                       kn = port->ip_sync_inheritor_knote;
+                       inheritor = filt_ipc_kqueue_turnstile(kn);
+                       break;
+
+               case PORT_SYNC_LINK_WORKLOOP_STASH:
+                       inheritor = port->ip_sync_inheritor_ts;
+                       break;
+               }
        }
 
-       SLIST_FOREACH(kn, &port->ip_messages.imq_klist, kn_selnext) {
-               if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) {
-                       return filt_machport_kqueue_turnstile(kn);
+       turnstile_update_inheritor(rcv_turnstile, inheritor,
+           flags | TURNSTILE_INHERITOR_TURNSTILE);
+}
+
+/*
+ * Update the send turnstile inheritor for a port.
+ *
+ * The port send turnstile can be linked for sync IPC for 7 possible reasons:
+ *
+ * 1. a special reply port is part of sync IPC for bootstrap checkin and needs
+ *    to push on the thread doing the sync IPC.
+ *
+ * 2. a receive right is in transit, and pushes on the send turnstile of its
+ *    destination mqueue.
+ *
+ * 3. the port was passed as an exec watchport and is pushing on the main
+ *    thread of the task.
+ *
+ * 4. a receive right has been stashed on a knote it was copied out "through",
+ *    as the first such copied out port (same as PORT_SYNC_LINK_WORKLOOP_KNOTE
+ *    for the special reply port)
+ *
+ * 5. a receive right has been stashed on a knote it was copied out "through",
+ *    as the second or later copied out port (same as
+ *    PORT_SYNC_LINK_WORKLOOP_STASH for the special reply port)
+ *
+ * 6. a receive right has been copied out as a part of sync bootstrap checkin
+ *    and needs to push on the thread doing the sync bootstrap checkin.
+ *
+ * 7. the receive right is monitored by a knote, and pushes on any such knote
+ *    that is registered on a workloop. filt_machport makes sure that if such
+ *    a knote exists, it is kept as the first item in the knote list, so we
+ *    never need to walk the list.
+ */
+void
+ipc_port_send_update_inheritor(
+       ipc_port_t port,
+       struct turnstile *send_turnstile,
+       turnstile_update_flags_t flags)
+{
+       ipc_mqueue_t mqueue = &port->ip_messages;
+       turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
+       struct knote *kn;
+       turnstile_update_flags_t inheritor_flags = TURNSTILE_INHERITOR_TURNSTILE;
+
+       assert(imq_held(mqueue));
+
+       if (!ip_active(port)) {
+               /* this port is no longer active, it should not push anywhere */
+       } else if (port->ip_specialreply) {
+               /* Case 1. */
+               if (port->ip_sync_bootstrap_checkin && prioritize_launch) {
+                       inheritor = port->ip_messages.imq_srp_owner_thread;
+                       inheritor_flags = TURNSTILE_INHERITOR_THREAD;
+               }
+       } else if (port->ip_receiver_name == MACH_PORT_NULL &&
+           port->ip_destination != NULL) {
+               /* Case 2. */
+               inheritor = port_send_turnstile(port->ip_destination);
+       } else if (port->ip_watchport_elem != NULL) {
+               /* Case 3. */
+               if (prioritize_launch) {
+                       assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
+                       inheritor = ipc_port_get_watchport_inheritor(port);
+                       inheritor_flags = TURNSTILE_INHERITOR_THREAD;
+               }
+       } else if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE) {
+               /* Case 4. */
+               inheritor = filt_ipc_kqueue_turnstile(mqueue->imq_inheritor_knote);
+       } else if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_STASH) {
+               /* Case 5. */
+               inheritor = mqueue->imq_inheritor_turnstile;
+       } else if (port->ip_sync_link_state == PORT_SYNC_LINK_RCV_THREAD) {
+               /* Case 6. */
+               if (prioritize_launch) {
+                       inheritor = port->ip_messages.imq_inheritor_thread_ref;
+                       inheritor_flags = TURNSTILE_INHERITOR_THREAD;
+               }
+       } else if ((kn = SLIST_FIRST(&mqueue->imq_klist))) {
+               /* Case 7. Push on a workloop that is interested */
+               if (filt_machport_kqueue_has_turnstile(kn)) {
+                       assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
+                       inheritor = filt_ipc_kqueue_turnstile(kn);
                }
        }
 
-       return TURNSTILE_NULL;
+       turnstile_update_inheritor(send_turnstile, inheritor,
+           flags | inheritor_flags);
 }
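The routine above folds the seven cases into a single (inheritor, flags) pair and then makes exactly one turnstile_update_inheritor() call. A minimal user-space sketch of that shape follows; every name in it (inh_kind_t, port_state, pick_inheritor) is invented for illustration and is not an XNU type, and cases 3 through 7 are elided:

    #include <stdio.h>

    typedef enum { INH_NONE, INH_TURNSTILE, INH_THREAD } inh_kind_t;

    struct port_state {        /* toy stand-in for the relevant ipc_port bits */
        int active;
        int specialreply;
        int bootstrap_checkin;
        int in_transit;
    };

    /*
     * Pick at most one inheritor: each case is reachable only when every
     * earlier predicate failed, mirroring the if/else-if cascade above.
     */
    static inh_kind_t
    pick_inheritor(const struct port_state *p)
    {
        if (!p->active) {
            return INH_NONE;          /* dead ports push nowhere */
        } else if (p->specialreply && p->bootstrap_checkin) {
            return INH_THREAD;        /* case 1 */
        } else if (p->in_transit) {
            return INH_TURNSTILE;     /* case 2 */
        }
        return INH_NONE;              /* cases 3..7 elided in this sketch */
    }

    int
    main(void)
    {
        struct port_state p = { 1, 0, 0, 1 };
        printf("inheritor kind: %d\n", pick_inheritor(&p));
        return 0;
    }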
 
 /*
@@ -1271,7 +1458,6 @@ void
 ipc_port_send_turnstile_prepare(ipc_port_t port)
 {
        struct turnstile *turnstile = TURNSTILE_NULL;
-       struct turnstile *inheritor = TURNSTILE_NULL;
        struct turnstile *send_turnstile = TURNSTILE_NULL;
 
 retry_alloc:
@@ -1290,22 +1476,9 @@ retry_alloc:
                    turnstile, TURNSTILE_SYNC_IPC);
                turnstile = TURNSTILE_NULL;
 
-               /*
-                * if port in transit, setup linkage for its turnstile,
-                * otherwise the link it to WL turnstile.
-                */
-               if (ip_active(port) &&
-                   port->ip_receiver_name == MACH_PORT_NULL &&
-                   port->ip_destination != IP_NULL) {
-                       assert(port->ip_receiver_name == MACH_PORT_NULL);
-                       assert(port->ip_destination != IP_NULL);
+               ipc_port_send_update_inheritor(port, send_turnstile,
+                   TURNSTILE_IMMEDIATE_UPDATE);
 
-                       inheritor = port_send_turnstile(port->ip_destination);
-               } else {
-                       inheritor = ipc_port_get_inheritor(port);
-               }
-               turnstile_update_inheritor(send_turnstile, inheritor,
-                   TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE);
                /* turnstile complete will be called in ipc_port_send_turnstile_complete */
        }
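ipc_port_send_turnstile_prepare() above uses the optimistic-allocation idiom: allocate with no locks held, then install under the lock only if nobody raced us. A hedged user-space equivalent, with a pthread mutex in place of imq_lock and malloc in place of turnstile_alloc (the kernel's retry loop is elided):

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static void *installed;    /* plays the role of port_send_turnstile(port) */

    static void *
    prepare(void)
    {
        void *fresh = malloc(64);   /* allocate with no locks held */
        void *result;

        pthread_mutex_lock(&lock);
        if (installed == NULL) {
            installed = fresh;      /* we won the race: keep ours */
            fresh = NULL;
        }
        result = installed;
        pthread_mutex_unlock(&lock);

        free(fresh);                /* we lost the race: discard ours */
        return result;
    }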
 
@@ -1343,7 +1516,7 @@ ipc_port_send_turnstile_complete(ipc_port_t port)
        port_send_turnstile(port)->ts_port_ref--;
        if (port_send_turnstile(port)->ts_port_ref == 0) {
                turnstile_complete((uintptr_t)port, port_send_turnstile_address(port),
-                   &turnstile);
+                   &turnstile, TURNSTILE_SYNC_IPC);
                assert(turnstile != TURNSTILE_NULL);
        }
        imq_unlock(&port->ip_messages);
@@ -1355,6 +1528,20 @@ ipc_port_send_turnstile_complete(ipc_port_t port)
        }
 }
 
+/*
+ *     Routine:        ipc_port_rcv_turnstile
+ *     Purpose:
+ *             Get the port's receive turnstile
+ *
+ *     Conditions:
+ *             mqueue locked or thread waiting on turnstile is locked.
+ */
+static struct turnstile *
+ipc_port_rcv_turnstile(ipc_port_t port)
+{
+       return turnstile_lookup_by_proprietor((uintptr_t)port, TURNSTILE_SYNC_IPC);
+}
+
 
 /*
  *     Routine:        ipc_port_rcv_turnstile_waitq
@@ -1384,21 +1571,6 @@ ipc_port_rcv_turnstile_waitq(struct waitq *waitq)
 }
 
 
-/*
- *     Routine:        ipc_port_rcv_turnstile
- *     Purpose:
- *             Get the port's receive turnstile
- *
- *     Conditions:
- *             mqueue locked or thread waiting on turnstile is locked.
- */
-struct turnstile *
-ipc_port_rcv_turnstile(ipc_port_t port)
-{
-       return turnstile_lookup_by_proprietor((uintptr_t)port);
-}
-
-
 /*
  *     Routine:        ipc_port_link_special_reply_port
  *     Purpose:
@@ -1411,7 +1583,8 @@ ipc_port_rcv_turnstile(ipc_port_t port)
 void
 ipc_port_link_special_reply_port(
        ipc_port_t special_reply_port,
-       ipc_port_t dest_port)
+       ipc_port_t dest_port,
+       boolean_t sync_bootstrap_checkin)
 {
        boolean_t drop_turnstile_ref = FALSE;
 
@@ -1422,6 +1595,10 @@ ipc_port_link_special_reply_port(
        ip_lock(special_reply_port);
        imq_lock(&special_reply_port->ip_messages);
 
+       if (sync_bootstrap_checkin && special_reply_port->ip_specialreply) {
+               special_reply_port->ip_sync_bootstrap_checkin = 1;
+       }
+
        /* Check if we need to drop the acquired turnstile ref on dest port */
        if (!special_reply_port->ip_specialreply ||
            special_reply_port->ip_sync_link_state != PORT_SYNC_LINK_ANY ||
@@ -1446,14 +1623,14 @@ ipc_port_link_special_reply_port(
 
 #if DEVELOPMENT || DEBUG
 inline void
-reset_ip_srp_bits(ipc_port_t special_reply_port)
+ipc_special_reply_port_bits_reset(ipc_port_t special_reply_port)
 {
        special_reply_port->ip_srp_lost_link = 0;
        special_reply_port->ip_srp_msg_sent = 0;
 }
 
-inline void
-reset_ip_srp_msg_sent(ipc_port_t special_reply_port)
+static inline void
+ipc_special_reply_port_msg_sent_reset(ipc_port_t special_reply_port)
 {
        if (special_reply_port->ip_specialreply == 1) {
                special_reply_port->ip_srp_msg_sent = 0;
@@ -1461,15 +1638,15 @@ reset_ip_srp_msg_sent(ipc_port_t special_reply_port)
 }
 
 inline void
-set_ip_srp_msg_sent(ipc_port_t special_reply_port)
+ipc_special_reply_port_msg_sent(ipc_port_t special_reply_port)
 {
        if (special_reply_port->ip_specialreply == 1) {
                special_reply_port->ip_srp_msg_sent = 1;
        }
 }
 
-inline void
-set_ip_srp_lost_link(ipc_port_t special_reply_port)
+static inline void
+ipc_special_reply_port_lost_link(ipc_port_t special_reply_port)
 {
        if (special_reply_port->ip_specialreply == 1 && special_reply_port->ip_srp_msg_sent == 0) {
                special_reply_port->ip_srp_lost_link = 1;
@@ -1478,25 +1655,25 @@ set_ip_srp_lost_link(ipc_port_t special_reply_port)
 
 #else /* DEVELOPMENT || DEBUG */
 inline void
-reset_ip_srp_bits(__unused ipc_port_t special_reply_port)
+ipc_special_reply_port_bits_reset(__unused ipc_port_t special_reply_port)
 {
        return;
 }
 
-inline void
-reset_ip_srp_msg_sent(__unused ipc_port_t special_reply_port)
+static inline void
+ipc_special_reply_port_msg_sent_reset(__unused ipc_port_t special_reply_port)
 {
        return;
 }
 
 inline void
-set_ip_srp_msg_sent(__unused ipc_port_t special_reply_port)
+ipc_special_reply_port_msg_sent(__unused ipc_port_t special_reply_port)
 {
        return;
 }
 
-inline void
-set_ip_srp_lost_link(__unused ipc_port_t special_reply_port)
+static inline void
+ipc_special_reply_port_lost_link(__unused ipc_port_t special_reply_port)
 {
        return;
 }
@@ -1505,10 +1682,11 @@ set_ip_srp_lost_link(__unused ipc_port_t special_reply_port)
 /*
  *     Routine:        ipc_port_adjust_special_reply_port_locked
  *     Purpose:
- *             If the special port has a turnstile, update it's inheritor.
+ *             If the special port has a turnstile, update its inheritor.
  *     Condition:
  *             Special reply port locked on entry.
  *             Special reply port unlocked on return.
+ *             The passed-in port is a special reply port.
  *     Returns:
  *             None.
  */
@@ -1522,21 +1700,30 @@ ipc_port_adjust_special_reply_port_locked(
        ipc_port_t dest_port = IPC_PORT_NULL;
        int sync_link_state = PORT_SYNC_LINK_NO_LINKAGE;
        turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
-       struct turnstile *dest_ts = TURNSTILE_NULL, *ts = TURNSTILE_NULL;
+       struct turnstile *ts = TURNSTILE_NULL;
 
+       assert(special_reply_port->ip_specialreply);
+
+       ip_lock_held(special_reply_port); // ip_sync_link_state is touched
        imq_lock(&special_reply_port->ip_messages);
 
        if (flags & IPC_PORT_ADJUST_SR_RECEIVED_MSG) {
-               reset_ip_srp_msg_sent(special_reply_port);
+               ipc_special_reply_port_msg_sent_reset(special_reply_port);
+       }
+
+       if (flags & IPC_PORT_ADJUST_UNLINK_THREAD) {
+               special_reply_port->ip_messages.imq_srp_owner_thread = NULL;
+       }
+
+       if (flags & IPC_PORT_ADJUST_RESET_BOOSTRAP_CHECKIN) {
+               special_reply_port->ip_sync_bootstrap_checkin = 0;
        }
 
        /* Check if the special reply port is marked non-special */
-       if (special_reply_port->ip_specialreply == 0 ||
-           special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
+       if (special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
                if (get_turnstile) {
                        turnstile_complete((uintptr_t)special_reply_port,
-                           port_rcv_turnstile_address(special_reply_port),
-                           NULL);
+                           port_rcv_turnstile_address(special_reply_port), NULL, TURNSTILE_SYNC_IPC);
                }
                imq_unlock(&special_reply_port->ip_messages);
                ip_unlock(special_reply_port);
@@ -1546,32 +1733,23 @@ ipc_port_adjust_special_reply_port_locked(
                return;
        }
 
-       /* Clear thread's special reply port and clear linkage */
-       if (flags & IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY) {
-               /* This option should only be specified by a non blocking thread */
-               assert(get_turnstile == FALSE);
-               special_reply_port->ip_specialreply = 0;
-
-               reset_ip_srp_bits(special_reply_port);
-
-               /* Check if need to break linkage */
-               if (special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_NO_LINKAGE) {
-                       imq_unlock(&special_reply_port->ip_messages);
-                       ip_unlock(special_reply_port);
-                       return;
-               }
-       } else if (flags & IPC_PORT_ADJUST_SR_LINK_WORKLOOP) {
-               if (special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_ANY ||
-                   special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_PORT) {
-                       if (ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_SEND_ONCE)) {
-                               inheritor = filt_machport_stash_port(kn, special_reply_port,
-                                   &sync_link_state);
-                       }
+       if (flags & IPC_PORT_ADJUST_SR_LINK_WORKLOOP) {
+               if (ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_SEND_ONCE)) {
+                       inheritor = filt_machport_stash_port(kn, special_reply_port,
+                           &sync_link_state);
                }
        } else if (flags & IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE) {
                sync_link_state = PORT_SYNC_LINK_ANY;
        }
 
+       /* Check if need to break linkage */
+       if (!get_turnstile && sync_link_state == PORT_SYNC_LINK_NO_LINKAGE &&
+           special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_NO_LINKAGE) {
+               imq_unlock(&special_reply_port->ip_messages);
+               ip_unlock(special_reply_port);
+               return;
+       }
+
        switch (special_reply_port->ip_sync_link_state) {
        case PORT_SYNC_LINK_PORT:
                dest_port = special_reply_port->ip_sync_inheritor_port;
@@ -1581,7 +1759,6 @@ ipc_port_adjust_special_reply_port_locked(
                special_reply_port->ip_sync_inheritor_knote = NULL;
                break;
        case PORT_SYNC_LINK_WORKLOOP_STASH:
-               dest_ts = special_reply_port->ip_sync_inheritor_ts;
                special_reply_port->ip_sync_inheritor_ts = NULL;
                break;
        }
@@ -1593,12 +1770,11 @@ ipc_port_adjust_special_reply_port_locked(
                special_reply_port->ip_sync_inheritor_knote = kn;
                break;
        case PORT_SYNC_LINK_WORKLOOP_STASH:
-               turnstile_reference(inheritor);
                special_reply_port->ip_sync_inheritor_ts = inheritor;
                break;
        case PORT_SYNC_LINK_NO_LINKAGE:
                if (flags & IPC_PORT_ADJUST_SR_ENABLE_EVENT) {
-                       set_ip_srp_lost_link(special_reply_port);
+                       ipc_special_reply_port_lost_link(special_reply_port);
                }
                break;
        }
@@ -1606,14 +1782,13 @@ ipc_port_adjust_special_reply_port_locked(
        /* Get thread's turnstile donated to special reply port */
        if (get_turnstile) {
                turnstile_complete((uintptr_t)special_reply_port,
-                   port_rcv_turnstile_address(special_reply_port),
-                   NULL);
+                   port_rcv_turnstile_address(special_reply_port), NULL, TURNSTILE_SYNC_IPC);
        } else {
                ts = ipc_port_rcv_turnstile(special_reply_port);
                if (ts) {
                        turnstile_reference(ts);
-                       turnstile_update_inheritor(ts, inheritor,
-                           (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE));
+                       ipc_port_recv_update_inheritor(special_reply_port, ts,
+                           TURNSTILE_IMMEDIATE_UPDATE);
                }
        }
 
@@ -1628,22 +1803,18 @@ ipc_port_adjust_special_reply_port_locked(
                turnstile_deallocate_safe(ts);
        }
 
-       /* Release the ref on the dest port and it's turnstile */
+       /* Release the ref on the dest port and its turnstile */
        if (dest_port) {
                ipc_port_send_turnstile_complete(dest_port);
                /* release the reference on the dest port */
                ip_release(dest_port);
        }
-
-       if (dest_ts) {
-               turnstile_deallocate_safe(dest_ts);
-       }
 }
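Note the shape of the dest_port handling above: the linkage is severed while both locks are held, but the reference is released only after unlocking, because the release itself may take further locks. A small sketch of that deferral pattern, using toy obj/holder types rather than XNU's:

    #include <pthread.h>
    #include <stdlib.h>

    struct obj { int refs; };
    static void obj_unref(struct obj *o) { if (--o->refs == 0) { free(o); } }

    struct holder {
        pthread_mutex_t lock;
        struct obj *dep;          /* reference owned by the holder */
    };

    static void
    unhook(struct holder *h)
    {
        struct obj *victim;

        pthread_mutex_lock(&h->lock);
        victim = h->dep;          /* detach while protected */
        h->dep = NULL;
        pthread_mutex_unlock(&h->lock);

        if (victim) {
            obj_unref(victim);    /* safe here: no locks held */
        }
    }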
 
 /*
  *     Routine:        ipc_port_adjust_special_reply_port
  *     Purpose:
- *             If the special port has a turnstile, update it's inheritor.
+ *             If the special port has a turnstile, update its inheritor.
  *     Condition:
  *             Nothing locked.
  *     Returns:
@@ -1655,39 +1826,310 @@ ipc_port_adjust_special_reply_port(
        uint8_t flags,
        boolean_t get_turnstile)
 {
-       ip_lock(special_reply_port);
-       ipc_port_adjust_special_reply_port_locked(special_reply_port, NULL, flags, get_turnstile);
-       /* special_reply_port unlocked */
+       if (special_reply_port->ip_specialreply) {
+               ip_lock(special_reply_port);
+               ipc_port_adjust_special_reply_port_locked(special_reply_port, NULL,
+                   flags, get_turnstile);
+               /* special_reply_port unlocked */
+       }
+       if (get_turnstile) {
+               assert(current_thread()->turnstile != TURNSTILE_NULL);
+       }
 }
 
 /*
- *     Routine:        ipc_port_get_special_reply_port_inheritor
+ *     Routine:        ipc_port_adjust_sync_link_state_locked
  *     Purpose:
- *             Returns the current inheritor of the special reply port
+ *             Update the sync link state of the port and the
+ *             turnstile inheritor.
  *     Condition:
- *             mqueue is locked, port is a special reply port
+ *             Port and mqueue locked on entry.
+ *             Port and mqueue locked on return.
  *     Returns:
- *             the current inheritor
+ *             None.
  */
-turnstile_inheritor_t
-ipc_port_get_special_reply_port_inheritor(
-       ipc_port_t port)
+void
+ipc_port_adjust_sync_link_state_locked(
+       ipc_port_t port,
+       int sync_link_state,
+       turnstile_inheritor_t inheritor)
 {
-       assert(port->ip_specialreply);
-       imq_held(&port->ip_messages);
-
        switch (port->ip_sync_link_state) {
-       case PORT_SYNC_LINK_PORT:
-               if (port->ip_sync_inheritor_port != NULL) {
-                       return port_send_turnstile(port->ip_sync_inheritor_port);
-               }
-               break;
+       case PORT_SYNC_LINK_RCV_THREAD:
+               /* deallocate the thread reference for the inheritor */
+               thread_deallocate_safe(port->ip_messages.imq_inheritor_thread_ref);
+       /* Fall through */
+
+       default:
+               klist_init(&port->ip_messages.imq_klist);
+       }
+
+       switch (sync_link_state) {
        case PORT_SYNC_LINK_WORKLOOP_KNOTE:
-               return filt_machport_stashed_special_reply_port_turnstile(port);
+               port->ip_messages.imq_inheritor_knote = inheritor;
+               break;
        case PORT_SYNC_LINK_WORKLOOP_STASH:
-               return port->ip_sync_inheritor_ts;
+               port->ip_messages.imq_inheritor_turnstile = inheritor;
+               break;
+       case PORT_SYNC_LINK_RCV_THREAD:
+               /* The thread could exit without clearing port state, so take a thread ref */
+               thread_reference((thread_t)inheritor);
+               port->ip_messages.imq_inheritor_thread_ref = inheritor;
+               break;
+       default:
+               klist_init(&port->ip_messages.imq_klist);
+               sync_link_state = PORT_SYNC_LINK_ANY;
+       }
+
+       port->ip_sync_link_state = sync_link_state;
+}
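ipc_port_adjust_sync_link_state_locked() is a two-phase retag: the first switch releases whatever the old state owned, the second acquires whatever the new state needs. A compact user-space model of that teardown/setup split; the refcounted obj stands in for a thread_t, and the klist reinitialization in the default paths is dropped:

    #include <assert.h>
    #include <stdlib.h>

    struct obj { int refs; };                     /* toy refcounted thread_t */
    static void obj_ref(struct obj *o)   { o->refs++; }
    static void obj_unref(struct obj *o) { if (--o->refs == 0) { free(o); } }

    enum link { LINK_ANY, LINK_THREAD };

    struct port {             /* models ip_sync_link_state + imq_inheritor_* */
        enum link state;
        struct obj *thread;   /* meaningful only while state == LINK_THREAD */
    };

    static void
    retag(struct port *p, enum link new_state, struct obj *inheritor)
    {
        if (p->state == LINK_THREAD) {   /* teardown: drop what the old tag owned */
            obj_unref(p->thread);
            p->thread = NULL;
        }
        if (new_state == LINK_THREAD) {  /* setup: take what the new tag needs */
            assert(inheritor != NULL);
            obj_ref(inheritor);          /* owner may exit; hold our own ref */
            p->thread = inheritor;
        }
        p->state = new_state;
    }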
+
+
+/*
+ *     Routine:        ipc_port_adjust_port_locked
+ *     Purpose:
+ *             If the port has a turnstile, update its inheritor.
+ *     Condition:
+ *             Port locked on entry.
+ *             Port unlocked on return.
+ *     Returns:
+ *             None.
+ */
+void
+ipc_port_adjust_port_locked(
+       ipc_port_t port,
+       struct knote *kn,
+       boolean_t sync_bootstrap_checkin)
+{
+       int sync_link_state = PORT_SYNC_LINK_ANY;
+       turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
+
+       ip_lock_held(port); // ip_sync_link_state is touched
+       imq_held(&port->ip_messages);
+
+       assert(!port->ip_specialreply);
+
+       if (kn) {
+               inheritor = filt_machport_stash_port(kn, port, &sync_link_state);
+               if (sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE) {
+                       inheritor = kn;
+               }
+       } else if (sync_bootstrap_checkin) {
+               inheritor = current_thread();
+               sync_link_state = PORT_SYNC_LINK_RCV_THREAD;
        }
-       return TURNSTILE_INHERITOR_NULL;
+
+       ipc_port_adjust_sync_link_state_locked(port, sync_link_state, inheritor);
+       port->ip_sync_bootstrap_checkin = 0;
+
+       ipc_port_send_turnstile_recompute_push_locked(port);
+       /* port and mqueue unlocked */
+}
+
+/*
+ *     Routine:        ipc_port_clear_sync_rcv_thread_boost_locked
+ *     Purpose:
+ *             If the port is pushing on rcv thread, clear it.
+ *     Condition:
+ *             Port locked on entry
+ *             mqueue is not locked.
+ *             Port unlocked on return.
+ *     Returns:
+ *             None.
+ */
+void
+ipc_port_clear_sync_rcv_thread_boost_locked(
+       ipc_port_t port)
+{
+       ip_lock_held(port); // ip_sync_link_state is touched
+
+       if (port->ip_sync_link_state != PORT_SYNC_LINK_RCV_THREAD) {
+               ip_unlock(port);
+               return;
+       }
+
+       imq_lock(&port->ip_messages);
+       ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
+
+       ipc_port_send_turnstile_recompute_push_locked(port);
+       /* port and mqueue unlocked */
+}
+
+/*
+ *     Routine:        ipc_port_add_watchport_elem_locked
+ *     Purpose:
+ *             Transfer the turnstile boost of watchport to task calling exec.
+ *     Condition:
+ *             Port locked on entry.
+ *             Port unlocked on return.
+ *     Returns:
+ *             KERN_SUCCESS on success.
+ *             KERN_FAILURE otherwise.
+ */
+kern_return_t
+ipc_port_add_watchport_elem_locked(
+       ipc_port_t                 port,
+       struct task_watchport_elem *watchport_elem,
+       struct task_watchport_elem **old_elem)
+{
+       ip_lock_held(port);
+       imq_held(&port->ip_messages);
+
+       /* Watchport boost only works for non-special active ports mapped in an ipc space */
+       if (!ip_active(port) || port->ip_specialreply ||
+           port->ip_receiver_name == MACH_PORT_NULL) {
+               imq_unlock(&port->ip_messages);
+               ip_unlock(port);
+               return KERN_FAILURE;
+       }
+
+       if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) {
+               /* Sever the linkage if the port was pushing on knote */
+               ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
+       }
+
+       *old_elem = port->ip_watchport_elem;
+       port->ip_watchport_elem = watchport_elem;
+
+       ipc_port_send_turnstile_recompute_push_locked(port);
+       /* port and mqueue unlocked */
+       return KERN_SUCCESS;
+}
+
+/*
+ *     Routine:        ipc_port_clear_watchport_elem_internal_conditional_locked
+ *     Purpose:
+ *             Remove the turnstile boost of watchport and recompute the push.
+ *     Condition:
+ *             Port locked on entry.
+ *             Port unlocked on return.
+ *     Returns:
+ *             KERN_SUCCESS on success.
+ *             KERN_FAILURE otherwise.
+ */
+kern_return_t
+ipc_port_clear_watchport_elem_internal_conditional_locked(
+       ipc_port_t                 port,
+       struct task_watchport_elem *watchport_elem)
+{
+       ip_lock_held(port);
+       imq_held(&port->ip_messages);
+
+       if (port->ip_watchport_elem != watchport_elem) {
+               imq_unlock(&port->ip_messages);
+               ip_unlock(port);
+               return KERN_FAILURE;
+       }
+
+       ipc_port_clear_watchport_elem_internal(port);
+       ipc_port_send_turnstile_recompute_push_locked(port);
+       /* port and mqueue unlocked */
+       return KERN_SUCCESS;
+}
+
+/*
+ *     Routine:        ipc_port_replace_watchport_elem_conditional_locked
+ *     Purpose:
+ *             Replace the turnstile boost of watchport and recompute the push.
+ *     Condition:
+ *             Port locked on entry.
+ *             Port unlocked on return.
+ *     Returns:
+ *             KERN_SUCCESS on success.
+ *             KERN_FAILURE otherwise.
+ */
+kern_return_t
+ipc_port_replace_watchport_elem_conditional_locked(
+       ipc_port_t                 port,
+       struct task_watchport_elem *old_watchport_elem,
+       struct task_watchport_elem *new_watchport_elem)
+{
+       ip_lock_held(port);
+       imq_held(&port->ip_messages);
+
+       if (port->ip_watchport_elem != old_watchport_elem) {
+               imq_unlock(&port->ip_messages);
+               ip_unlock(port);
+               return KERN_FAILURE;
+       }
+
+       port->ip_watchport_elem = new_watchport_elem;
+       ipc_port_send_turnstile_recompute_push_locked(port);
+       /* port and mqueue unlocked */
+       return KERN_SUCCESS;
+}
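The watchport routines above share one idiom: compare the stored element against the caller's expectation under the lock, and fail rather than clobber a concurrent update. In user-space terms it is a compare-and-swap performed under a mutex, sketched here with invented names:

    #include <pthread.h>

    static pthread_mutex_t port_lock = PTHREAD_MUTEX_INITIALIZER;
    static void *cur_elem;              /* models port->ip_watchport_elem */

    /* Replace only if the caller's view is still current. */
    static int
    replace_if(void *expected, void *replacement)
    {
        int ok;

        pthread_mutex_lock(&port_lock);
        ok = (cur_elem == expected);
        if (ok) {
            cur_elem = replacement;
        }
        pthread_mutex_unlock(&port_lock);
        return ok;                      /* 0 plays the role of KERN_FAILURE */
    }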
+
+/*
+ *     Routine:        ipc_port_clear_watchport_elem_internal
+ *     Purpose:
+ *             Remove the turnstile boost of watchport.
+ *     Condition:
+ *             Port locked on entry.
+ *             Port locked on return.
+ *     Returns:
+ *             Old task_watchport_elem returned.
+ */
+struct task_watchport_elem *
+ipc_port_clear_watchport_elem_internal(
+       ipc_port_t                 port)
+{
+       struct task_watchport_elem *watchport_elem;
+
+       ip_lock_held(port);
+       imq_held(&port->ip_messages);
+
+       watchport_elem = port->ip_watchport_elem;
+       port->ip_watchport_elem = NULL;
+
+       return watchport_elem;
+}
+
+/*
+ *     Routine:        ipc_port_send_turnstile_recompute_push_locked
+ *     Purpose:
+ *             Update send turnstile inheritor of port and recompute the push.
+ *     Condition:
+ *             Port locked on entry.
+ *             Port unlocked on return.
+ *     Returns:
+ *             None.
+ */
+static void
+ipc_port_send_turnstile_recompute_push_locked(
+       ipc_port_t port)
+{
+       struct turnstile *send_turnstile = port_send_turnstile(port);
+       if (send_turnstile) {
+               turnstile_reference(send_turnstile);
+               ipc_port_send_update_inheritor(port, send_turnstile,
+                   TURNSTILE_IMMEDIATE_UPDATE);
+       }
+       imq_unlock(&port->ip_messages);
+       ip_unlock(port);
+
+       if (send_turnstile) {
+               turnstile_update_inheritor_complete(send_turnstile,
+                   TURNSTILE_INTERLOCK_NOT_HELD);
+               turnstile_deallocate_safe(send_turnstile);
+       }
+}
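ipc_port_send_turnstile_recompute_push_locked() shows the standard turnstile discipline: pin the turnstile with a reference while the locks are held, do the cheap inheritor update there, then finish the potentially lock-taking propagation and drop the reference after unlocking. A self-contained sketch with stand-in types (none of these helpers are XNU APIs):

    #include <pthread.h>
    #include <stdlib.h>

    struct obj { int refs; };
    static void obj_ref(struct obj *o)   { o->refs++; }
    static void obj_unref(struct obj *o) { if (--o->refs == 0) { free(o); } }

    struct port {
        pthread_mutex_t lock;
        struct obj *turnstile;
    };

    static void cheap_update(struct obj *o)        { (void)o; /* under the lock */ }
    static void expensive_propagate(struct obj *o) { (void)o; /* after unlock   */ }

    static void
    recompute_push(struct port *p)
    {
        struct obj *ts;

        pthread_mutex_lock(&p->lock);
        ts = p->turnstile;
        if (ts) {
            obj_ref(ts);                 /* pin it so unlocking is safe */
            cheap_update(ts);
        }
        pthread_mutex_unlock(&p->lock);

        if (ts) {
            expensive_propagate(ts);     /* may take other locks; ours is free */
            obj_unref(ts);
        }
    }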
+
+/*
+ *     Routine:        ipc_port_get_watchport_inheritor
+ *     Purpose:
+ *             Returns inheritor for watchport.
+ *
+ *     Conditions:
+ *             mqueue locked.
+ *     Returns:
+ *             watchport inheritor.
+ */
+static thread_t
+ipc_port_get_watchport_inheritor(
+       ipc_port_t port)
+{
+       imq_held(&port->ip_messages);
+       return port->ip_watchport_elem->twe_task->watchports->tw_thread;
 }
 
 /*
@@ -1951,51 +2393,6 @@ ipc_port_importance_delta(
 }
 #endif /* IMPORTANCE_INHERITANCE */
 
-/*
- *     Routine:        ipc_port_lookup_notify
- *     Purpose:
- *             Make a send-once notify port from a receive right.
- *             Returns IP_NULL if name doesn't denote a receive right.
- *     Conditions:
- *             The space must be locked (read or write) and active.
- *              Being the active space, we can rely on thread server_id
- *             context to give us the proper server level sub-order
- *             within the space.
- */
-
-ipc_port_t
-ipc_port_lookup_notify(
-       ipc_space_t             space,
-       mach_port_name_t        name)
-{
-       ipc_port_t port;
-       ipc_entry_t entry;
-
-       assert(is_active(space));
-
-       entry = ipc_entry_lookup(space, name);
-       if (entry == IE_NULL) {
-               return IP_NULL;
-       }
-       if ((entry->ie_bits & MACH_PORT_TYPE_RECEIVE) == 0) {
-               return IP_NULL;
-       }
-
-       __IGNORE_WCASTALIGN(port = (ipc_port_t) entry->ie_object);
-       assert(port != IP_NULL);
-
-       ip_lock(port);
-       assert(ip_active(port));
-       assert(port->ip_receiver_name == name);
-       assert(port->ip_receiver == space);
-
-       ip_reference(port);
-       port->ip_sorights++;
-       ip_unlock(port);
-
-       return port;
-}
-
 /*
  *     Routine:        ipc_port_make_send_locked
  *     Purpose:
@@ -2008,7 +2405,7 @@ ipc_port_t
 ipc_port_make_send_locked(
        ipc_port_t      port)
 {
-       assert(ip_active(port));
+       require_ip_active(port);
        port->ip_mscount++;
        port->ip_srights++;
        ip_reference(port);
@@ -2031,9 +2428,7 @@ ipc_port_make_send(
 
        ip_lock(port);
        if (ip_active(port)) {
-               port->ip_mscount++;
-               port->ip_srights++;
-               ip_reference(port);
+               ipc_port_make_send_locked(port);
                ip_unlock(port);
                return port;
        }
@@ -2041,6 +2436,22 @@ ipc_port_make_send(
        return IP_DEAD;
 }
 
+/*
+ *     Routine:        ipc_port_copy_send_locked
+ *     Purpose:
+ *             Make a naked send right from another naked send right.
+ *     Conditions:
+ *             port locked and active.
+ */
+void
+ipc_port_copy_send_locked(
+       ipc_port_t      port)
+{
+       assert(port->ip_srights > 0);
+       port->ip_srights++;
+       ip_reference(port);
+}
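This hunk, and the ipc_port_make_send()/ipc_port_make_sonce() hunks around it, all apply the same refactor: the unlocked entry point locks, checks liveness, and delegates to a _locked helper so there is a single mutation site. Sketched generically with a toy obj type:

    #include <pthread.h>

    struct obj {
        pthread_mutex_t lock;
        int active;
        int srights;
    };

    /* _locked flavor: caller holds the lock and has verified liveness. */
    static void
    obj_copy_ref_locked(struct obj *o)
    {
        o->srights++;
    }

    /* Unlocked wrapper: lock, validate, delegate to the _locked flavor. */
    static int
    obj_copy_ref(struct obj *o)
    {
        int ok;

        pthread_mutex_lock(&o->lock);
        ok = o->active;
        if (ok) {
            obj_copy_ref_locked(o);
        }
        pthread_mutex_unlock(&o->lock);
        return ok;
    }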
+
 /*
  *     Routine:        ipc_port_copy_send
  *     Purpose:
@@ -2065,10 +2476,7 @@ ipc_port_copy_send(
 
        ip_lock(port);
        if (ip_active(port)) {
-               assert(port->ip_srights > 0);
-
-               ip_reference(port);
-               port->ip_srights++;
+               ipc_port_copy_send_locked(port);
                sright = port;
        } else {
                sright = IP_DEAD;
@@ -2097,44 +2505,8 @@ ipc_port_copyout_send(
        if (IP_VALID(sright)) {
                kern_return_t kr;
 
-               kr = ipc_object_copyout(space, (ipc_object_t) sright,
-                   MACH_MSG_TYPE_PORT_SEND, TRUE, &name);
-               if (kr != KERN_SUCCESS) {
-                       ipc_port_release_send(sright);
-
-                       if (kr == KERN_INVALID_CAPABILITY) {
-                               name = MACH_PORT_DEAD;
-                       } else {
-                               name = MACH_PORT_NULL;
-                       }
-               }
-       } else {
-               name = CAST_MACH_PORT_TO_NAME(sright);
-       }
-
-       return name;
-}
-
-/*
- *     Routine:        ipc_port_copyout_name_send
- *     Purpose:
- *             Copyout a naked send right (possibly null/dead) to given name,
- *             or if that fails, destroy the right.
- *     Conditions:
- *             Nothing locked.
- */
-
-mach_port_name_t
-ipc_port_copyout_name_send(
-       ipc_port_t      sright,
-       ipc_space_t     space,
-       mach_port_name_t name)
-{
-       if (IP_VALID(sright)) {
-               kern_return_t kr;
-
-               kr = ipc_object_copyout_name(space, (ipc_object_t) sright,
-                   MACH_MSG_TYPE_PORT_SEND, TRUE, name);
+               kr = ipc_object_copyout(space, ip_to_object(sright),
+                   MACH_MSG_TYPE_PORT_SEND, NULL, NULL, &name);
                if (kr != KERN_SUCCESS) {
                        ipc_port_release_send(sright);
 
@@ -2212,7 +2584,7 @@ ipc_port_t
 ipc_port_make_sonce_locked(
        ipc_port_t      port)
 {
-       assert(ip_active(port));
+       require_ip_active(port);
        port->ip_sorights++;
        ip_reference(port);
        return port;
@@ -2236,8 +2608,7 @@ ipc_port_make_sonce(
 
        ip_lock(port);
        if (ip_active(port)) {
-               port->ip_sorights++;
-               ip_reference(port);
+               ipc_port_make_sonce_locked(port);
                ip_unlock(port);
                return port;
        }
@@ -2267,7 +2638,7 @@ ipc_port_release_sonce(
                return;
        }
 
-       ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_SR_NONE, FALSE);
+       ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_RESET_BOOSTRAP_CHECKIN, FALSE);
 
        ip_lock(port);
 
@@ -2302,7 +2673,7 @@ ipc_port_release_receive(
        }
 
        ip_lock(port);
-       assert(ip_active(port));
+       require_ip_active(port);
        assert(port->ip_receiver_name == MACH_PORT_NULL);
        dest = port->ip_destination;
 
@@ -2330,7 +2701,7 @@ ipc_port_alloc_special(
 {
        ipc_port_t port;
 
-       __IGNORE_WCASTALIGN(port = (ipc_port_t) io_alloc(IOT_PORT));
+       port = ip_object_to_port(io_alloc(IOT_PORT));
        if (port == IP_NULL) {
                return IP_NULL;
        }
@@ -2341,7 +2712,7 @@ ipc_port_alloc_special(
 #endif /* MACH_ASSERT */
 
        bzero((char *)port, sizeof(*port));
-       io_lock_init(&port->ip_object);
+       io_lock_init(ip_to_object(port));
        port->ip_references = 1;
        port->ip_object.io_bits = io_makebits(TRUE, IOT_PORT, 0);
 
@@ -2369,7 +2740,7 @@ ipc_port_dealloc_special(
        __assert_only ipc_space_t       space)
 {
        ip_lock(port);
-       assert(ip_active(port));
+       require_ip_active(port);
 //     assert(port->ip_receiver_name != MACH_PORT_NULL);
        assert(port->ip_receiver == space);
 
@@ -2384,7 +2755,7 @@ ipc_port_dealloc_special(
        imq_unlock(&port->ip_messages);
 
        /* relevant part of ipc_port_clear_receiver */
-       ipc_port_set_mscount(port, 0);
+       port->ip_mscount = 0;
        port->ip_messages.imq_seqno = 0;
 
        ipc_port_destroy(port);
@@ -2447,7 +2818,7 @@ kdp_mqueue_send_find_owner(struct waitq * waitq, __assert_only event64_t event,
        assert(waitq_is_turnstile_queue(waitq));
 
        turnstile = waitq_to_turnstile(waitq);
-       ipc_port_t port     = (ipc_port_t)turnstile->ts_proprietor; /* we are blocking on send */
+       ipc_port_t port = (ipc_port_t)turnstile->ts_proprietor; /* we are blocking on send */
        assert(kdp_is_in_zone(port, "ipc ports"));
 
        waitinfo->owner = 0;
index 971f77821caff0632b0c8ba30aac712f2e8b26b3..3fccd24605f414e654703ff330dbc5c0ab96bbe1 100644 (file)
--- a/osfmk/ipc/ipc_port.h
+++ b/osfmk/ipc/ipc_port.h
@@ -139,9 +139,10 @@ struct ipc_port {
        union {
                struct ipc_kmsg *premsg;
                struct turnstile *send_turnstile;
-               SLIST_ENTRY(ipc_port) dealloc_elm;
        } kdata2;
 
+       struct task_watchport_elem *ip_watchport_elem;
+
        mach_vm_address_t ip_context;
 
        natural_t ip_sprequests:1,      /* send-possible requests outstanding */
@@ -151,8 +152,12 @@ struct ipc_port {
            ip_guarded:1,               /* port guarded (use context value as guard) */
            ip_strict_guard:1,          /* Strict guarding; Prevents user manipulation of context values directly */
            ip_specialreply:1,          /* port is a special reply port */
-           ip_sync_link_state:3,       /* link the special reply port to destination port/ Workloop */
-           ip_impcount:22;             /* number of importance donations in nested queue */
+           ip_sync_link_state:3,       /* link the port to a destination port or workloop */
+           ip_sync_bootstrap_checkin:1,/* port part of sync bootstrap checkin, push on thread doing the checkin */
+           ip_immovable_receive:1,     /* the receive right cannot be moved out of a space, until it is destroyed */
+           ip_no_grant:1,              /* Port won't accept complex messages containing (ool) port descriptors */
+           ip_immovable_send:1,        /* No send(once) rights to this port can be moved out of a space */
+           ip_impcount:18;             /* number of importance donations in nested queue */
 
        mach_port_mscount_t ip_mscount;
        mach_port_rights_t ip_srights;
@@ -175,10 +180,10 @@ struct ipc_port {
 
 
 #define ip_references           ip_object.io_references
-#define ip_bits                 ip_object.io_bits
 
 #define ip_receiver_name        ip_messages.imq_receiver_name
 #define ip_in_pset              ip_messages.imq_in_pset
+#define ip_reply_context        ip_messages.imq_context
 
 #define ip_receiver             data.receiver
 #define ip_destination          data.destination
@@ -192,7 +197,6 @@ struct ipc_port {
 
 #define ip_premsg               kdata2.premsg
 #define ip_send_turnstile       kdata2.send_turnstile
-#define ip_dealloc_elm          kdata2.dealloc_elm
 
 #define port_send_turnstile(port)       (IP_PREALLOC(port) ? (port)->ip_premsg->ikm_turnstile : (port)->ip_send_turnstile)
 
@@ -208,11 +212,12 @@ MACRO_END
 #define port_send_turnstile_address(port)                    \
 (IP_PREALLOC(port) ? &((port)->ip_premsg->ikm_turnstile) : &((port)->ip_send_turnstile))
 
-#define port_rcv_turnstile_address(port) (NULL)
+#define port_rcv_turnstile_address(port) \
+       (NULL)
 
 
 /*
- * SYNC IPC state flags for special reply port.
+ * SYNC IPC state flags for special reply port / receive right.
  *
  * PORT_SYNC_LINK_ANY
  *    Special reply port is not linked to any other port
@@ -237,26 +242,34 @@ MACRO_END
  *    Message sent to special reply port, do
  *    not allow any linkages till receive is
  *    complete.
+ *
+ * PORT_SYNC_LINK_RCV_THREAD
+ *    Receive right copied out as part of sync bootstrap checkin,
+ *    push on the thread which copied out the port.
  */
 #define PORT_SYNC_LINK_ANY              (0)
 #define PORT_SYNC_LINK_PORT             (0x1)
 #define PORT_SYNC_LINK_WORKLOOP_KNOTE   (0x2)
 #define PORT_SYNC_LINK_WORKLOOP_STASH   (0x3)
 #define PORT_SYNC_LINK_NO_LINKAGE       (0x4)
+#define PORT_SYNC_LINK_RCV_THREAD       (0x5)
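A hypothetical debugging helper, not present in XNU, that names these states for log output; it assumes the PORT_SYNC_LINK_* defines above are in scope:

    static const char *
    port_sync_link_name(int state)
    {
        switch (state) {
        case PORT_SYNC_LINK_ANY:            return "ANY";
        case PORT_SYNC_LINK_PORT:           return "PORT";
        case PORT_SYNC_LINK_WORKLOOP_KNOTE: return "WORKLOOP_KNOTE";
        case PORT_SYNC_LINK_WORKLOOP_STASH: return "WORKLOOP_STASH";
        case PORT_SYNC_LINK_NO_LINKAGE:     return "NO_LINKAGE";
        case PORT_SYNC_LINK_RCV_THREAD:     return "RCV_THREAD";
        default:                            return "unknown";
        }
    }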
 
 #define IP_NULL                 IPC_PORT_NULL
 #define IP_DEAD                 IPC_PORT_DEAD
 #define IP_VALID(port)          IPC_PORT_VALID(port)
 
-#define ip_active(port)         io_active(&(port)->ip_object)
-#define ip_lock_init(port)      io_lock_init(&(port)->ip_object)
-#define ip_lock(port)           io_lock(&(port)->ip_object)
-#define ip_lock_try(port)       io_lock_try(&(port)->ip_object)
-#define ip_lock_held_kdp(port)  io_lock_held_kdp(&(port)->ip_object)
-#define ip_unlock(port)         io_unlock(&(port)->ip_object)
+#define ip_object_to_port(io)   __container_of(io, struct ipc_port, ip_object)
+#define ip_to_object(port)      (&(port)->ip_object)
+#define ip_active(port)         io_active(ip_to_object(port))
+#define ip_lock_init(port)      io_lock_init(ip_to_object(port))
+#define ip_lock_held(port)      io_lock_held(ip_to_object(port))
+#define ip_lock(port)           io_lock(ip_to_object(port))
+#define ip_lock_try(port)       io_lock_try(ip_to_object(port))
+#define ip_lock_held_kdp(port)  io_lock_held_kdp(ip_to_object(port))
+#define ip_unlock(port)         io_unlock(ip_to_object(port))
 
-#define ip_reference(port)      io_reference(&(port)->ip_object)
-#define ip_release(port)        io_release(&(port)->ip_object)
+#define ip_reference(port)      io_reference(ip_to_object(port))
+#define ip_release(port)        io_release(ip_to_object(port))
 
 /* get an ipc_port pointer from an ipc_mqueue pointer */
 #define ip_from_mq(mq) \
@@ -265,7 +278,8 @@ MACRO_END
 #define ip_reference_mq(mq)     ip_reference(ip_from_mq(mq))
 #define ip_release_mq(mq)       ip_release(ip_from_mq(mq))
 
-#define ip_kotype(port)         io_kotype(&(port)->ip_object)
+#define ip_kotype(port)         io_kotype(ip_to_object(port))
+#define ip_is_kobject(port)     io_is_kobject(ip_to_object(port))
 
 #define ip_full_kernel(port)    imq_full_kernel(&(port)->ip_messages)
 #define ip_full(port)           imq_full(&(port)->ip_messages)
@@ -279,18 +293,18 @@ MACRO_END
  * therefore cannot be blocked waiting for memory themselves).
  */
 #define IP_BIT_PREALLOC         0x00008000      /* preallocated mesg */
-#define IP_PREALLOC(port)       ((port)->ip_bits & IP_BIT_PREALLOC)
+#define IP_PREALLOC(port)       ((port)->ip_object.io_bits & IP_BIT_PREALLOC)
 
 #define IP_SET_PREALLOC(port, kmsg)                                     \
 MACRO_BEGIN                                                             \
-       (port)->ip_bits |= IP_BIT_PREALLOC;                             \
+       (port)->ip_object.io_bits |= IP_BIT_PREALLOC;                   \
        (port)->ip_premsg = (kmsg);                                     \
 MACRO_END
 
 #define IP_CLEAR_PREALLOC(port, kmsg)                                   \
 MACRO_BEGIN                                                             \
        assert((port)->ip_premsg == kmsg);                              \
-       (port)->ip_bits &= ~IP_BIT_PREALLOC;                            \
+       (port)->ip_object.io_bits &= ~IP_BIT_PREALLOC;                  \
        (port)->ip_premsg = IKM_NULL;                                   \
 MACRO_END
 
@@ -364,15 +378,34 @@ extern ipc_port_timestamp_t ipc_port_timestamp(void);
 
 #define IP_TIMESTAMP_ORDER(one, two)    ((int) ((one) - (two)) < 0)
 
+static inline void
+require_ip_active(ipc_port_t port)
+{
+       if (!ip_active(port)) {
+               panic("Using inactive port %p", port);
+       }
+}
+
+static inline kern_return_t
+ipc_port_translate(
+       ipc_space_t                     space,
+       mach_port_name_t                name,
+       mach_port_right_t               right,
+       ipc_port_t                     *portp)
+{
+       ipc_object_t object;
+       kern_return_t kr;
+
+       kr = ipc_object_translate(space, name, right, &object);
+       *portp = (kr == KERN_SUCCESS) ? ip_object_to_port(object) : IP_NULL;
+       return kr;
+}
+
 #define ipc_port_translate_receive(space, name, portp)                  \
-               ipc_object_translate((space), (name),                   \
-                                    MACH_PORT_RIGHT_RECEIVE,           \
-                                    (ipc_object_t *) (portp))
+       ipc_port_translate((space), (name), MACH_PORT_RIGHT_RECEIVE, portp)
 
 #define ipc_port_translate_send(space, name, portp)                     \
-               ipc_object_translate((space), (name),                   \
-                                    MACH_PORT_RIGHT_SEND,              \
-                                    (ipc_object_t *) (portp))
+       ipc_port_translate((space), (name), MACH_PORT_RIGHT_SEND, portp)
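The rewritten macros route through the typed ipc_port_translate() inline instead of casting &port to (ipc_object_t *) at every call site. The benefit of hoisting the conversion into one helper can be shown with a generic, runnable user-space sketch; the object/port types and lookup functions here are invented:

    #include <stdio.h>

    typedef struct object { int kind; } *object_t;
    typedef struct port   { struct object obj; } *port_t;

    static int
    object_lookup(int name, object_t *out)      /* generic layer */
    {
        static struct port p;
        (void)name;
        *out = &p.obj;
        return 0;
    }

    /* Typed wrapper: convert in one well-defined place instead of
     * casting a port_t* to object_t* at every call site. */
    static int
    port_lookup(int name, port_t *portp)
    {
        object_t o;
        int kr = object_lookup(name, &o);
        *portp = (kr == 0) ? (port_t)o : NULL;  /* obj is the first member */
        return kr;
    }

    int
    main(void)
    {
        port_t p;
        if (port_lookup(42, &p) == 0) {
            printf("got port %p\n", (void *)p);
        }
        return 0;
    }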
 
 /* Allocate a notification request slot */
 #if IMPORTANCE_INHERITANCE
@@ -421,23 +454,6 @@ extern boolean_t ipc_port_request_sparm(
        mach_msg_option_t         option,
        mach_msg_priority_t       override);
 
-/* Macros for manipulating a port's dead name notificaiton requests */
-#define ipc_port_request_rename(port, index, oname, nname)              \
-MACRO_BEGIN                                                             \
-       ipc_port_request_t ipr, table;                                  \
-                                                                        \
-       assert(ip_active(port));                                        \
-                                                                        \
-       table = port->ip_requests;                                      \
-       assert(table != IPR_NULL);                                      \
-                                                                        \
-       ipr = &table[index];                                            \
-       assert(ipr->ipr_name == oname);                                 \
-                                                                        \
-       ipr->ipr_name = nname;                                          \
-MACRO_END
-
-
 /* Make a port-deleted request */
 extern void ipc_port_pdrequest(
        ipc_port_t      port,
@@ -451,13 +467,6 @@ extern void ipc_port_nsrequest(
        ipc_port_t              notify,
        ipc_port_t              *previousp);
 
-#define ipc_port_set_mscount(port, mscount)                             \
-MACRO_BEGIN                                                             \
-       assert(ip_active(port));                                        \
-                                                                        \
-       (port)->ip_mscount = (mscount);                                 \
-MACRO_END
-
 /* Prepare a receive right for transmission/destruction */
 extern boolean_t ipc_port_clear_receiver(
        ipc_port_t              port,
@@ -472,6 +481,7 @@ extern void ipc_port_init(
 /* Allocate a port */
 extern kern_return_t ipc_port_alloc(
        ipc_space_t             space,
+       bool                    make_send_right,
        mach_port_name_t        *namep,
        ipc_port_t              *portp);
 
@@ -511,35 +521,40 @@ enum {
 void
 ipc_port_link_special_reply_port(
        ipc_port_t special_reply_port,
-       ipc_port_t dest_port);
+       ipc_port_t dest_port,
+       boolean_t sync_bootstrap_checkin);
 
 #define IPC_PORT_ADJUST_SR_NONE                      0
-#define IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY       0x1
-#define IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE        0x2
-#define IPC_PORT_ADJUST_SR_LINK_WORKLOOP             0x4
-
+#define IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE        0x1
+#define IPC_PORT_ADJUST_SR_LINK_WORKLOOP             0x2
+#define IPC_PORT_ADJUST_UNLINK_THREAD                0x4
 #define IPC_PORT_ADJUST_SR_RECEIVED_MSG              0x8
 #define IPC_PORT_ADJUST_SR_ENABLE_EVENT              0x10
+#define IPC_PORT_ADJUST_RESET_BOOSTRAP_CHECKIN       0x20
 
 void
-reset_ip_srp_bits(ipc_port_t special_reply_port);
-
-void
-reset_ip_srp_msg_sent(ipc_port_t special_reply_port);
+ipc_special_reply_port_bits_reset(ipc_port_t special_reply_port);
 
 void
-set_ip_srp_msg_sent(ipc_port_t special_reply_port);
+ipc_special_reply_port_msg_sent(ipc_port_t special_reply_port);
 
 void
-set_ip_srp_lost_link(ipc_port_t special_reply_port);
+ipc_special_reply_port_msg_sent(ipc_port_t special_reply_port);
 
 /* Adjust special reply port linkage */
-void ipc_port_adjust_special_reply_port_locked(
+void
+ipc_port_adjust_special_reply_port_locked(
        ipc_port_t special_reply_port,
        struct knote *kn,
        uint8_t flags,
        boolean_t get_turnstile);
 
+void
+ipc_port_adjust_sync_link_state_locked(
+       ipc_port_t port,
+       int sync_link_state,
+       turnstile_inheritor_t inheritor);
+
 /* Adjust special reply port linkage */
 void
 ipc_port_adjust_special_reply_port(
@@ -547,9 +562,36 @@ ipc_port_adjust_special_reply_port(
        uint8_t flags,
        boolean_t get_turnstile);
 
-turnstile_inheritor_t
-ipc_port_get_special_reply_port_inheritor(
-       ipc_port_t special_reply_port);
+void
+ipc_port_adjust_port_locked(
+       ipc_port_t port,
+       struct knote *kn,
+       boolean_t sync_bootstrap_checkin);
+
+void
+ipc_port_clear_sync_rcv_thread_boost_locked(
+       ipc_port_t port);
+
+kern_return_t
+ipc_port_add_watchport_elem_locked(
+       ipc_port_t                 port,
+       struct task_watchport_elem *watchport_elem,
+       struct task_watchport_elem **old_elem);
+
+kern_return_t
+ipc_port_clear_watchport_elem_internal_conditional_locked(
+       ipc_port_t                 port,
+       struct task_watchport_elem *watchport_elem);
+
+kern_return_t
+ipc_port_replace_watchport_elem_conditional_locked(
+       ipc_port_t                 port,
+       struct task_watchport_elem *old_watchport_elem,
+       struct task_watchport_elem *new_watchport_elem);
+
+struct task_watchport_elem *
+ipc_port_clear_watchport_elem_internal(
+       ipc_port_t                 port);
 
 void
 ipc_port_send_turnstile_prepare(ipc_port_t port);
@@ -560,9 +602,6 @@ ipc_port_send_turnstile_complete(ipc_port_t port);
 struct waitq *
 ipc_port_rcv_turnstile_waitq(struct waitq *waitq);
 
-struct turnstile *
-ipc_port_rcv_turnstile(ipc_port_t port);
-
 /* apply importance delta to port only */
 extern mach_port_delta_t
 ipc_port_impcount_delta(
@@ -586,11 +625,6 @@ ipc_port_importance_delta(
        mach_port_delta_t       delta);
 #endif /* IMPORTANCE_INHERITANCE */
 
-/* Make a send-once notify port from a receive right */
-extern ipc_port_t ipc_port_lookup_notify(
-       ipc_space_t             space,
-       mach_port_name_t        name);
-
 /* Make a naked send right from a receive right - port locked and active */
 extern ipc_port_t ipc_port_make_send_locked(
        ipc_port_t      port);
@@ -599,6 +633,10 @@ extern ipc_port_t ipc_port_make_send_locked(
 extern ipc_port_t ipc_port_make_send(
        ipc_port_t      port);
 
+/* Make a naked send right from another naked send right - port locked and active */
+extern void ipc_port_copy_send_locked(
+       ipc_port_t      port);
+
 /* Make a naked send right from another naked send right */
 extern ipc_port_t ipc_port_copy_send(
        ipc_port_t      port);
@@ -608,12 +646,6 @@ extern mach_port_name_t ipc_port_copyout_send(
        ipc_port_t      sright,
        ipc_space_t     space);
 
-/* Copyout a naked send right to given name */
-extern mach_port_name_t ipc_port_copyout_name_send(
-       ipc_port_t      sright,
-       ipc_space_t     space,
-       mach_port_name_t name);
-
 #endif /* MACH_KERNEL_PRIVATE */
 
 #if KERNEL_PRIVATE
@@ -670,8 +702,13 @@ extern void ipc_port_track_dealloc(
 extern void ipc_port_debug_init(void);
 #endif  /* MACH_ASSERT */
 
-extern struct turnstile *ipc_port_get_inheritor(
-       ipc_port_t port);
+extern void ipc_port_recv_update_inheritor(ipc_port_t port,
+    struct turnstile *turnstile,
+    turnstile_update_flags_t flags);
+
+extern void ipc_port_send_update_inheritor(ipc_port_t port,
+    struct turnstile *turnstile,
+    turnstile_update_flags_t flags);
 
 #define ipc_port_alloc_kernel()         \
                ipc_port_alloc_special(ipc_space_kernel)
index c14e98a792f291abf2b25cfa8d8872123a38b049..523c496600894c3a89d45013e8b6b53a129bb5e2 100644 (file)
--- a/osfmk/ipc/ipc_pset.c
+++ b/osfmk/ipc/ipc_pset.c
@@ -175,14 +175,14 @@ ipc_pset_alloc_special(
        assert(space->is_table == IE_NULL);
        assert(!is_active(space));
 
-       __IGNORE_WCASTALIGN(pset = (ipc_pset_t)io_alloc(IOT_PORT_SET));
+       pset = ips_object_to_pset(io_alloc(IOT_PORT_SET));
        if (pset == IPS_NULL) {
                return IPS_NULL;
        }
 
        bzero((char *)pset, sizeof(*pset));
 
-       io_lock_init(&pset->ips_object);
+       io_lock_init(ips_to_object(pset));
        pset->ips_references = 1;
        pset->ips_object.io_bits = io_makebits(TRUE, IOT_PORT_SET, 0);
 
@@ -205,7 +205,7 @@ ipc_pset_member(
        ipc_pset_t      pset,
        ipc_port_t      port)
 {
-       assert(ip_active(port));
+       require_ip_active(port);
 
        return ipc_mqueue_member(&port->ip_messages, &pset->ips_messages);
 }
@@ -230,7 +230,7 @@ ipc_pset_add(
        kern_return_t kr;
 
        assert(ips_active(pset));
-       assert(ip_active(port));
+       require_ip_active(port);
 
        kr = ipc_mqueue_add(&port->ip_messages, &pset->ips_messages,
            reserved_link, reserved_prepost);
@@ -256,8 +256,7 @@ ipc_pset_remove(
        ipc_port_t        port)
 {
        kern_return_t kr;
-
-       assert(ip_active(port));
+       require_ip_active(port);
 
        if (port->ip_in_pset == 0) {
                return KERN_NOT_IN_SET;
@@ -299,7 +298,7 @@ ipc_pset_lazy_allocate(
        }
 
        psobj = entry->ie_object;
-       __IGNORE_WCASTALIGN(pset = (ipc_pset_t) psobj);
+       pset = ips_object_to_pset(psobj);
        assert(pset != NULL);
        ipc_mqueue_t set_mqueue = &pset->ips_messages;
        struct waitq_set *wqset =  &set_mqueue->imq_set_queue;
@@ -384,7 +383,7 @@ ipc_pset_destroy(
 /*
  * Kqueue EVFILT_MACHPORT support
  *
- * - kn_ptr.p_mqueue points to the monitored mqueue
+ * - kn_mqueue points to the monitored mqueue
  *
  * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
  *   that can be used to direct-deliver messages when
@@ -422,12 +421,18 @@ filt_machport_adjust_qos(struct knote *kn, ipc_kmsg_t first)
 }
 
 struct turnstile *
-filt_machport_kqueue_turnstile(struct knote *kn)
+filt_ipc_kqueue_turnstile(struct knote *kn)
 {
-       if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) {
-               return kqueue_turnstile(knote_get_kq(kn));
-       }
-       return TURNSTILE_NULL;
+       assert(kn->kn_filter == EVFILT_MACHPORT || kn->kn_filter == EVFILT_WORKLOOP);
+       return kqueue_turnstile(knote_get_kq(kn));
+}
+
+bool
+filt_machport_kqueue_has_turnstile(struct knote *kn)
+{
+       assert(kn->kn_filter == EVFILT_MACHPORT);
+       return ((kn->kn_sfflags & MACH_RCV_MSG) || (kn->kn_sfflags & MACH_RCV_SYNC_PEEK))
+              && (kn->kn_flags & EV_DISPATCH);
 }
 
 /*
@@ -444,15 +449,24 @@ filt_machport_kqueue_turnstile(struct knote *kn)
 struct turnstile *
 filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
 {
-       struct turnstile *ts = filt_machport_kqueue_turnstile(kn);
+       struct turnstile *ts = TURNSTILE_NULL;
 
-       if (!ts) {
+       if (kn->kn_filter == EVFILT_WORKLOOP) {
+               assert(kn->kn_mqueue == NULL);
+               kn->kn_mqueue = &port->ip_messages;
+               ip_reference(port);
+               if (link) {
+                       *link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
+               }
+               ts = filt_ipc_kqueue_turnstile(kn);
+       } else if (!filt_machport_kqueue_has_turnstile(kn)) {
                if (link) {
                        *link = PORT_SYNC_LINK_NO_LINKAGE;
                }
        } else if (kn->kn_ext[3] == 0) {
                ip_reference(port);
                kn->kn_ext[3] = (uintptr_t)port;
+               ts = filt_ipc_kqueue_turnstile(kn);
                if (link) {
                        *link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
                }
@@ -466,19 +480,6 @@ filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
        return ts;
 }
 
-struct turnstile *
-filt_machport_stashed_special_reply_port_turnstile(ipc_port_t port)
-{
-       struct knote *kn = port->ip_sync_inheritor_knote;
-
-       assert(port->ip_specialreply);
-       assert(port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE);
-       if (kn->kn_ext[3] == (uint64_t)port) {
-               return kqueue_turnstile(knote_get_kq(kn));
-       }
-       return kn->kn_hook;
-}
-
 /*
  * Lazily prepare a turnstile so that filt_machport_stash_port()
  * can be called with the mqueue lock held.
@@ -500,11 +501,15 @@ filt_machport_turnstile_prepare_lazily(
        /* This is called from within filt_machportprocess */
        assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));
 
-       struct turnstile *ts = filt_machport_kqueue_turnstile(kn);
-       if (ts == TURNSTILE_NULL || kn->kn_ext[3] == 0 || kn->kn_hook) {
+       if (!filt_machport_kqueue_has_turnstile(kn)) {
+               return;
+       }
+
+       if (kn->kn_ext[3] == 0 || kn->kn_hook) {
                return;
        }
 
+       struct turnstile *ts = filt_ipc_kqueue_turnstile(kn);
        if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) ||
            (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
                struct turnstile *kn_ts = turnstile_alloc();
@@ -516,6 +521,67 @@ filt_machport_turnstile_prepare_lazily(
        }
 }
 
+static void
+filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port,
+    ipc_mqueue_t mqueue)
+{
+       struct turnstile *ts = TURNSTILE_NULL;
+
+       ip_lock(port);
+       if (port->ip_specialreply) {
+               /*
+                * If the reply has been sent to the special reply port already,
+                * then the special reply port may already be reused to do something
+                * entirely different.
+                *
+                * However, the only reason for it to still point to this knote is
+                * that it's still waiting for a reply, so when this is the case,
+                * neuter the linkage.
+                */
+               if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
+                   port->ip_sync_inheritor_knote == kn) {
+                       ipc_port_adjust_special_reply_port_locked(port, NULL,
+                           (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
+               } else {
+                       ip_unlock(port);
+               }
+       } else {
+               /*
+                * For receive rights, if their IMQ_KNOTE() is still this
+                * knote, then sever the link.
+                */
+               imq_lock(mqueue);
+               if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
+                   mqueue->imq_inheritor_knote == kn) {
+                       ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
+                       ts = port_send_turnstile(port);
+               }
+               if (ts) {
+                       turnstile_reference(ts);
+                       turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
+                           TURNSTILE_IMMEDIATE_UPDATE);
+               }
+               imq_unlock(mqueue);
+               ip_unlock(port);
+
+               if (ts) {
+                       turnstile_update_inheritor_complete(ts,
+                           TURNSTILE_INTERLOCK_NOT_HELD);
+                       turnstile_deallocate(ts);
+               }
+       }
+
+       ip_release(port);
+}
+
+void
+filt_wldetach_sync_ipc(struct knote *kn)
+{
+       ipc_mqueue_t mqueue = kn->kn_mqueue;
+       filt_machport_turnstile_complete_port(kn, ip_from_mq(mqueue), mqueue);
+       kn->kn_mqueue = NULL;
+}
+
 /*
  * Other half of filt_machport_turnstile_prepare_lazily()
  *
@@ -524,75 +590,20 @@ filt_machport_turnstile_prepare_lazily(
 static void
 filt_machport_turnstile_complete(struct knote *kn)
 {
-       struct turnstile *ts = TURNSTILE_NULL;
-
        if (kn->kn_ext[3]) {
                ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
-               ipc_mqueue_t mqueue = &port->ip_messages;
-
-               ip_lock(port);
-               if (port->ip_specialreply) {
-                       /*
-                        * If the reply has been sent to the special reply port already,
-                        * then the special reply port may already be reused to do something
-                        * entirely different.
-                        *
-                        * However, the only reason for it to still point to this knote is
-                        * that it's still waiting for a reply, so when this is the case,
-                        * neuter the linkage.
-                        */
-                       if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
-                           port->ip_sync_inheritor_knote == kn) {
-                               ipc_port_adjust_special_reply_port_locked(port, NULL,
-                                   (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
-                       } else {
-                               ip_unlock(port);
-                       }
-               } else {
-                       struct turnstile *kq_ts = kqueue_turnstile(knote_get_kq(kn));
-
-                       /*
-                        * For receive rights, if their IMQ_INHERITOR() is still this
-                        * workloop, then sever the link.
-                        *
-                        * It has a theoretical hole: if the port is sent again to a new
-                        * receive right that is also monitored by the same kqueue,
-                        * we would sever the link incorrectly.
-                        *
-                        * However this would be a REALLY cumbersome thing to do.
-                        */
-                       imq_lock(mqueue);
-                       if (!IMQ_KLIST_VALID(mqueue) && IMQ_INHERITOR(mqueue) == kq_ts) {
-                               turnstile_deallocate_safe(kq_ts);
-                               klist_init(&mqueue->imq_klist);
-                               ts = port_send_turnstile(port);
-                       }
-                       if (ts) {
-                               turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
-                                   TURNSTILE_IMMEDIATE_UPDATE);
-                               turnstile_reference(ts);
-                       }
-                       imq_unlock(mqueue);
-                       ip_unlock(port);
-
-                       if (ts) {
-                               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
-                               turnstile_deallocate(ts);
-                       }
-               }
-
-               ip_release(port);
+               filt_machport_turnstile_complete_port(kn, port, &port->ip_messages);
                kn->kn_ext[3] = 0;
        }
 
        if (kn->kn_hook) {
-               ts = kn->kn_hook;
+               struct turnstile *ts = kn->kn_hook;
 
                turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
                    TURNSTILE_IMMEDIATE_UPDATE);
                turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
 
-               turnstile_complete((uintptr_t)kn, (struct turnstile **)&kn->kn_hook, &ts);
+               turnstile_complete((uintptr_t)kn, (struct turnstile **)&kn->kn_hook, &ts, TURNSTILE_KNOTE);
                turnstile_cleanup();
 
                assert(ts);
@@ -600,16 +611,105 @@ filt_machport_turnstile_complete(struct knote *kn)
        }
 }
 
+static void
+filt_machport_link(ipc_mqueue_t mqueue, struct knote *kn)
+{
+       struct knote *hd = SLIST_FIRST(&mqueue->imq_klist);
+
+       if (hd && filt_machport_kqueue_has_turnstile(kn)) {
+               SLIST_INSERT_AFTER(hd, kn, kn_selnext);
+       } else {
+               SLIST_INSERT_HEAD(&mqueue->imq_klist, kn, kn_selnext);
+       }
+}
+
+static void
+filt_machport_unlink(ipc_mqueue_t mqueue, struct knote *kn)
+{
+       struct knote **knprev;
+
+       KNOTE_DETACH(&mqueue->imq_klist, kn);
+
+       /* make sure the first knote is a knote we can push on */
+       SLIST_FOREACH_PREVPTR(kn, knprev, &mqueue->imq_klist, kn_selnext) {
+               if (filt_machport_kqueue_has_turnstile(kn)) {
+                       *knprev = SLIST_NEXT(kn, kn_selnext);
+                       SLIST_INSERT_HEAD(&mqueue->imq_klist, kn, kn_selnext);
+                       break;
+               }
+       }
+}
+
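filt_machport_link() and filt_machport_unlink() manage the ordering of imq_klist so that, per the comment below, the first knote is one "we can push on". A self-contained sketch of the same list mechanics using the BSD <sys/queue.h> macros, for readers unfamiliar with SLIST_FOREACH_PREVPTR (demo types only; has_turnstile stands in for filt_machport_kqueue_has_turnstile()):

        #include <stdbool.h>
        #include <sys/queue.h>  /* BSD queue macros, as used by the kernel klist */

        struct demo_knote {
                bool has_turnstile;     /* stands in for filt_machport_kqueue_has_turnstile() */
                SLIST_ENTRY(demo_knote) link;
        };
        SLIST_HEAD(demo_klist, demo_knote);

        /* Mirror of filt_machport_link(): a knote that could itself become the
         * pusher is slotted behind the existing head instead of displacing it. */
        static void
        demo_link(struct demo_klist *kl, struct demo_knote *kn)
        {
                struct demo_knote *hd = SLIST_FIRST(kl);

                if (hd && kn->has_turnstile) {
                        SLIST_INSERT_AFTER(hd, kn, link);
                } else {
                        SLIST_INSERT_HEAD(kl, kn, link);
                }
        }

        /* Mirror of filt_machport_unlink(): after removal, promote the first
         * push-capable knote back to the head of the list. */
        static void
        demo_unlink(struct demo_klist *kl, struct demo_knote *kn)
        {
                struct demo_knote *it, **prevp;

                SLIST_REMOVE(kl, kn, demo_knote, link);

                SLIST_FOREACH_PREVPTR(it, prevp, kl, link) {
                        if (it->has_turnstile) {
                                *prevp = SLIST_NEXT(it, link);
                                SLIST_INSERT_HEAD(kl, it, link);
                                break;
                        }
                }
        }
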
+int
+filt_wlattach_sync_ipc(struct knote *kn)
+{
+       mach_port_name_t name = (mach_port_name_t)kn->kn_id;
+       ipc_space_t space = current_space();
+       ipc_entry_t entry;
+       ipc_port_t port = IP_NULL;
+       int error = 0;
+
+       if (ipc_right_lookup_read(space, name, &entry) != KERN_SUCCESS) {
+               return ENOENT;
+       }
+
+       /* space is read-locked */
+
+       if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) {
+               port = ip_object_to_port(entry->ie_object);
+               if (port->ip_specialreply) {
+                       error = ENOENT;
+               }
+       } else if (entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE) {
+               port = ip_object_to_port(entry->ie_object);
+               if (!port->ip_specialreply) {
+                       error = ENOENT;
+               }
+       } else {
+               error = ENOENT;
+       }
+       if (error) {
+               is_read_unlock(space);
+               return error;
+       }
+
+       ip_lock(port);
+       is_read_unlock(space);
+
+       if (port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
+               ip_unlock(port);
+               /*
+                * We cannot start a sync IPC inheritance chain, only further an existing one.

+                * Note: this can also happen if the inheritance chain broke
+                * because the original requestor died.
+                */
+               return ENOENT;
+       }
+
+       if (port->ip_specialreply) {
+               ipc_port_adjust_special_reply_port_locked(port, kn,
+                   IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE);
+       } else {
+               ipc_port_adjust_port_locked(port, kn, FALSE);
+       }
+
+       /* make sure the port was stashed */
+       assert(kn->kn_mqueue == &port->ip_messages);
+
+       /* port has been unlocked by ipc_port_adjust_* */
+
+       return 0;
+}
+
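filt_wlattach_sync_ipc() accepts exactly two shapes of right, and rejects everything else with ENOENT: a regular receive right (not a special reply port), or a send-once right that *is* a special reply port. A hypothetical condensation of that dispatch, assuming the same ie_bits/ip_specialreply fields (sketch only; the real function interleaves these checks with the space lock):

        /* Hypothetical condensation of the validation above (space read-locked). */
        static int
        demo_classify_sync_ipc_right(ipc_entry_t entry, ipc_port_t port)
        {
                if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) {
                        /* receive right: must be a regular port */
                        return port->ip_specialreply ? ENOENT : 0;
                }
                if (entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE) {
                        /* send-once right: must be a special reply port */
                        return port->ip_specialreply ? 0 : ENOENT;
                }
                return ENOENT;  /* other right types cannot carry sync IPC linkage */
        }
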
 static int
 filt_machportattach(
        struct knote *kn,
-       __unused struct kevent_internal_s *kev)
+       __unused struct kevent_qos_s *kev)
 {
-       mach_port_name_t name = (mach_port_name_t)kn->kn_kevent.ident;
+       mach_port_name_t name = (mach_port_name_t)kn->kn_id;
        uint64_t wq_link_id = waitq_link_reserve(NULL);
        ipc_space_t space = current_space();
        ipc_kmsg_t first;
-       struct turnstile *turnstile = TURNSTILE_NULL;
        struct turnstile *send_turnstile = TURNSTILE_NULL;
 
        int error;
@@ -621,132 +721,174 @@ filt_machportattach(
        kn->kn_flags &= ~EV_EOF;
        kn->kn_ext[3] = 0;
 
-       if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) {
+       if (filt_machport_kqueue_has_turnstile(kn)) {
                /*
                 * If the filter is likely to support sync IPC override,
                 * and it happens to be attaching to a workloop,
                 * make sure the workloop has an allocated turnstile.
                 */
-               turnstile = kqueue_alloc_turnstile(knote_get_kq(kn));
+               kqueue_alloc_turnstile(knote_get_kq(kn));
        }
 
+lookup_again:
        kr = ipc_right_lookup_read(space, name, &entry);
 
-check_lookup:
-       if (kr == KERN_SUCCESS) {
-               /* space is read-locked and active */
-
-               if (entry->ie_bits & MACH_PORT_TYPE_PORT_SET) {
-                       ipc_pset_t pset;
-
-                       if (knote_link_waitqset_should_lazy_alloc(kn)) {
-                               is_read_unlock(space);
-
-                               /*
-                                * We need to link the portset of the kn,
-                                * to insure that the link is allocated before taking
-                                * any spinlocks.
-                                */
-                               knote_link_waitqset_lazy_alloc(kn);
-
-                               /*
-                                * We had to drop the space lock because knote_link_waitqset_lazy_alloc()
-                                * could have allocated memory. The ipc_right_lookup_read()
-                                * function returns with the space locked, so we need to revalidate state.
-                                */
-                               kr = ipc_right_lookup_read(space, name, &entry);
-                               if (!(kr == KERN_SUCCESS) || !(entry->ie_bits & MACH_PORT_TYPE_PORT_SET)) {
-                                       goto check_lookup;
-                               }
-                       }
+       if (kr != KERN_SUCCESS) {
+               error = ENOENT;
+               goto out;
+       }
 
-                       __IGNORE_WCASTALIGN(pset = (ipc_pset_t)entry->ie_object);
-                       mqueue = &pset->ips_messages;
-                       ips_reference(pset);
+       /* space is read-locked and active */
 
-                       imq_lock(mqueue);
-                       kn->kn_ptr.p_mqueue = mqueue;
+       if ((entry->ie_bits & MACH_PORT_TYPE_PORT_SET) &&
+           knote_link_waitqset_should_lazy_alloc(kn)) {
+               is_read_unlock(space);
 
+               /*
+                * We need to link the portset of the kn,
+                * to ensure that the link is allocated before taking
+                * any spinlocks.
+                *
+                * Because we have to drop the space lock so that
+                * knote_link_waitqset_lazy_alloc() can allocate memory,
+                * we will need to redo the lookup.
+                */
+               knote_link_waitqset_lazy_alloc(kn);
+               goto lookup_again;
+       }
+
+       if (entry->ie_bits & MACH_PORT_TYPE_PORT_SET) {
+               ipc_pset_t pset;
+
+               pset = ips_object_to_pset(entry->ie_object);
+               mqueue = &pset->ips_messages;
+               ips_reference(pset);
+
+               imq_lock(mqueue);
+               kn->kn_mqueue = mqueue;
+
+               /*
+                * Bind the portset wait queue directly to knote/kqueue.
+                * This allows us to just use wait_queue foo to effect a wakeup,
+                * rather than having to call knote() from the Mach code on each
+                * message.  We still attach the knote to the mqueue klist for
+                * NOTE_REVOKE purposes only.
+                */
+               error = knote_link_waitq(kn, &mqueue->imq_wait_queue, &wq_link_id);
+               if (!error) {
+                       filt_machport_link(mqueue, kn);
+                       imq_unlock(mqueue);
+               } else {
+                       kn->kn_mqueue = IMQ_NULL;
+                       imq_unlock(mqueue);
+                       ips_release(pset);
+               }
+
+               is_read_unlock(space);
+
+               /*
+                * linked knotes are marked stay-active and therefore don't
+                * need an indication of their fired state to be returned
+                * from the attach operation.
+                */
+       } else if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) {
+               ipc_port_t port = ip_object_to_port(entry->ie_object);
+
+               if (port->ip_specialreply) {
                        /*
-                        * Bind the portset wait queue directly to knote/kqueue.
-                        * This allows us to just use wait_queue foo to effect a wakeup,
-                        * rather than having to call knote() from the Mach code on each
-                        * message.  We still attach the knote to the mqueue klist for
-                        * NOTE_REVOKE purposes only.
+                        * Registering for kevents on special reply ports
+                        * isn't supported for two reasons:
+                        *
+                        * 1. it really makes very little sense for a port that
+                        *    is supposed to be used synchronously
+                        *
+                        * 2. their mqueue's imq_klist field will be used to
+                        *    store the receive turnstile, so we can't possibly
+                        *    attach them anyway.
                         */
-                       error = knote_link_waitq(kn, &mqueue->imq_wait_queue, &wq_link_id);
-                       if (!error) {
-                               assert(IMQ_KLIST_VALID(mqueue));
-                               KNOTE_ATTACH(&mqueue->imq_klist, kn);
-                               imq_unlock(mqueue);
-                       } else {
-                               kn->kn_ptr.p_mqueue = IMQ_NULL;
-                               imq_unlock(mqueue);
-                               ips_release(pset);
-                       }
-
                        is_read_unlock(space);
+                       error = ENOTSUP;
+                       goto out;
+               }
+
+               mqueue = &port->ip_messages;
+               ip_reference(port);
+
+               /*
+                * attach knote to port and determine result
+                * If the filter requested direct message receipt,
+                * we may need to adjust the qos of the knote to
+                * reflect the requested and override qos of the
+                * first message in the queue.
+                */
+               ip_lock(port);
+               imq_lock(mqueue);
 
+               kn->kn_mqueue = mqueue;
+               if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) {
                        /*
-                        * linked knotes are marked stay-active and therefore don't
-                        * need an indication of their fired state to be returned
-                        * from the attach operation.
+                        * We're attaching a port that used to have an IMQ_KNOTE,
+                        * clobber this state; we'll fix up its turnstile inheritor below.
                         */
-               } else if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) {
-                       ipc_port_t port;
+                       ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
+               }
+               filt_machport_link(mqueue, kn);
+
+               if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) {
+                       result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first);
+               }
 
-                       __IGNORE_WCASTALIGN(port = (ipc_port_t)entry->ie_object);
-                       mqueue = &port->ip_messages;
-                       ip_reference(port);
+               /*
+                * Update the port's turnstile inheritor
+                *
+                * Unlike filt_machportdetach(), we don't have to care about races for
+                * turnstile_workloop_pusher_info(): filt_machport_link() doesn't affect
+                * already pushing knotes, and if the current one becomes the new
+                * pusher, it'll only be visible when turnstile_workloop_pusher_info()
+                * returns.
+                */
+               send_turnstile = port_send_turnstile(port);
+               if (send_turnstile) {
+                       turnstile_reference(send_turnstile);
+                       ipc_port_send_update_inheritor(port, send_turnstile,
+                           TURNSTILE_IMMEDIATE_UPDATE);
 
                        /*
-                        * attach knote to port and determine result
-                        * If the filter requested direct message receipt,
-                        * we may need to adjust the qos of the knote to
-                        * reflect the requested and override qos of the
-                        * first message in the queue.
+                        * rdar://problem/48861190
+                        *
+                        * When a listener connection resumes a peer,
+                        * updating the inheritor above has moved the push
+                        * from the current thread to the workloop.
+                        *
+                        * However, we haven't told the workloop yet
+                        * that it needs a thread request, and we risk
+                        * being preempted as soon as we drop the space
+                        * lock below.
+                        *
+                        * To avoid this, disable preemption and let kevent
+                        * reenable it after it takes the kqlock.
                         */
-                       imq_lock(mqueue);
-                       kn->kn_ptr.p_mqueue = mqueue;
-                       if (!IMQ_KLIST_VALID(mqueue)) {
-                               /*
-                                * We're attaching a port that used to have an IMQ_INHERITOR,
-                                * clobber this state, and set the inheritor of its turnstile
-                                * to the kqueue it's now attached to.
-                                */
-                               turnstile_deallocate_safe(IMQ_INHERITOR(mqueue));
-                               klist_init(&mqueue->imq_klist);
-                       }
-                       KNOTE_ATTACH(&mqueue->imq_klist, kn);
-
-                       /* Update the port's turnstile inheritor */
-                       send_turnstile = port_send_turnstile(port);
-                       if (send_turnstile) {
-                               turnstile_reference(send_turnstile);
-                               turnstile_update_inheritor(send_turnstile, turnstile,
-                                   (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE));
-                       }
+                       disable_preemption();
+                       result |= FILTER_THREADREQ_NODEFEER;
+               }
 
-                       if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) {
-                               result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first);
-                       }
-                       imq_unlock(mqueue);
-                       is_read_unlock(space);
-                       if (send_turnstile) {
-                               turnstile_update_inheritor_complete(send_turnstile,
-                                   TURNSTILE_INTERLOCK_NOT_HELD);
-                               turnstile_deallocate(send_turnstile);
-                       }
+               imq_unlock(mqueue);
+               ip_unlock(port);
 
-                       error = 0;
-               } else {
-                       is_read_unlock(space);
-                       error = ENOTSUP;
+               is_read_unlock(space);
+               if (send_turnstile) {
+                       turnstile_update_inheritor_complete(send_turnstile,
+                           TURNSTILE_INTERLOCK_NOT_HELD);
+                       turnstile_deallocate_safe(send_turnstile);
                }
+
+               error = 0;
        } else {
-               error = ENOENT;
+               is_read_unlock(space);
+               error = ENOTSUP;
        }
 
+out:
        waitq_link_release(wq_link_id);
 
        /* bail out on errors */
@@ -758,18 +900,17 @@ check_lookup:
        return result;
 }
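The lookup_again label above replaces the old check_lookup goto with a cleaner drop-lock/allocate/retry idiom: knote_link_waitqset_lazy_alloc() may allocate (and so block), which is illegal under the space lock, so the lock is dropped and the entire lookup is simply redone rather than revalidated piecemeal. The general shape, with lazy_alloc_needed() as a hypothetical stand-in predicate (not compilable as-is):

        for (;;) {
                kr = ipc_right_lookup_read(space, name, &entry);
                if (kr != KERN_SUCCESS) {
                        return ENOENT;
                }
                /* space is read-locked */
                if (!lazy_alloc_needed(kn, entry)) {
                        break;                          /* proceed, still locked */
                }
                is_read_unlock(space);
                knote_link_waitqset_lazy_alloc(kn);     /* may allocate and block */
                /* anything may have changed while unlocked: redo the lookup */
        }
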
 
-/* NOT proud of these - we should have a stricter relationship between mqueue and ipc object */
-#define mqueue_to_pset(mq) ((ipc_pset_t)((uintptr_t)mq-offsetof(struct ipc_pset, ips_messages)))
-#define mqueue_to_port(mq) ((ipc_port_t)((uintptr_t)mq-offsetof(struct ipc_port, ip_messages)))
-#define mqueue_to_object(mq) (((ipc_object_t)(mq)) - 1)
-
+/* Validate imq_to_object implementation "works" */
+_Static_assert(offsetof(struct ipc_pset, ips_messages) ==
+    offsetof(struct ipc_port, ip_messages),
+    "Make sure the mqueue aliases in both ports and psets");
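The deleted mqueue_to_* macros did the offset arithmetic by hand; __container_of is the usual offsetof-based back-pointer idiom, and the _Static_assert guarantees that a single imq_to_object() works for both ports and psets because ip_messages and ips_messages sit at the same offset. A freestanding sketch of the idiom (demo types, not the xnu ones):

        #include <stddef.h>

        /* The offsetof-based back-pointer idiom that __container_of expands to. */
        #define demo_container_of(ptr, type, member) \
                ((type *)(void *)((char *)(ptr) - offsetof(type, member)))

        struct demo_port {
                long refs;
                long messages;          /* plays the role of ip_messages / ips_messages */
        };

        /* Recover the enclosing object from a pointer to its member,
         * as ip_from_mq()/imq_to_object() do for mqueues. */
        static struct demo_port *
        demo_port_from_mq(long *mq)
        {
                return demo_container_of(mq, struct demo_port, messages);
        }
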
 
 static void
 filt_machportdetach(
        struct knote *kn)
 {
-       ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
-       ipc_object_t object = mqueue_to_object(mqueue);
+       ipc_mqueue_t mqueue = kn->kn_mqueue;
+       ipc_object_t object = imq_to_object(mqueue);
        struct turnstile *send_turnstile = TURNSTILE_NULL;
 
        filt_machport_turnstile_complete(kn);
@@ -780,24 +921,36 @@ filt_machportdetach(
                 * ipc_mqueue_changed() already unhooked this knote from the mqueue,
                 */
        } else {
-               assert(IMQ_KLIST_VALID(mqueue));
-               KNOTE_DETACH(&mqueue->imq_klist, kn);
-       }
+               ipc_port_t port = IP_NULL;
 
-       if (io_otype(object) == IOT_PORT) {
-               ipc_port_t port = ip_from_mq(mqueue);
+               /*
+                * When the knote being detached is the first one in the list,
+                * then unlinking the knote *and* updating the turnstile inheritor
+                * need to happen atomically with respect to the callers of
+                * turnstile_workloop_pusher_info().
+                *
+                * The caller of turnstile_workloop_pusher_info() will use the kq req
+                * lock (and hence the kqlock), so we just need to hold the kqlock too.
+                */
+               if (io_otype(object) == IOT_PORT) {
+                       port = ip_object_to_port(object);
+                       assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
+                       if (kn == SLIST_FIRST(&mqueue->imq_klist)) {
+                               send_turnstile = port_send_turnstile(port);
+                       }
+               }
+
+               filt_machport_unlink(mqueue, kn);
 
-               send_turnstile = port_send_turnstile(port);
                if (send_turnstile) {
                        turnstile_reference(send_turnstile);
-                       turnstile_update_inheritor(send_turnstile,
-                           ipc_port_get_inheritor(port),
-                           TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE);
+                       ipc_port_send_update_inheritor(port, send_turnstile,
+                           TURNSTILE_IMMEDIATE_UPDATE);
                }
        }
 
        /* Clear the knote pointer once the knote has been removed from turnstile */
-       kn->kn_ptr.p_mqueue = IMQ_NULL;
+       kn->kn_mqueue = IMQ_NULL;
        imq_unlock(mqueue);
 
        if (send_turnstile) {
@@ -833,7 +986,7 @@ filt_machportdetach(
 static int
 filt_machportevent(struct knote *kn, long hint __assert_only)
 {
-       ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
+       ipc_mqueue_t mqueue = kn->kn_mqueue;
        ipc_kmsg_t first;
        int result = 0;
 
@@ -853,9 +1006,9 @@ filt_machportevent(struct knote *kn, long hint __assert_only)
 static int
 filt_machporttouch(
        struct knote *kn,
-       struct kevent_internal_s *kev)
+       struct kevent_qos_s *kev)
 {
-       ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
+       ipc_mqueue_t mqueue = kn->kn_mqueue;
        ipc_kmsg_t first;
        int result = 0;
 
@@ -892,15 +1045,12 @@ filt_machporttouch(
 }
 
 static int
-filt_machportprocess(
-       struct knote *kn,
-       struct filt_process_s *process_data,
-       struct kevent_internal_s *kev)
+filt_machportprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
-       ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
-       ipc_object_t object = mqueue_to_object(mqueue);
+       ipc_mqueue_t mqueue = kn->kn_mqueue;
+       ipc_object_t object = imq_to_object(mqueue);
        thread_t self = current_thread();
-       boolean_t used_filtprocess_data = FALSE;
+       kevent_ctx_t kectx = NULL;
 
        wait_result_t wresult;
        mach_msg_option_t option;
@@ -908,7 +1058,7 @@ filt_machportprocess(
        mach_msg_size_t size;
 
        /* Capture current state */
-       *kev = kn->kn_kevent;
+       knote_fill_kevent(kn, kev, MACH_PORT_NULL);
        kev->ext[3] = 0; /* hide our port reference from userspace */
 
        /* If already deallocated/moved return one last EOF event */
@@ -922,7 +1072,7 @@ filt_machportprocess(
         * name of the port and sizeof the waiting message.
         */
        option = kn->kn_sfflags & (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
-           MACH_RCV_TRAILER_MASK | MACH_RCV_VOUCHER);
+           MACH_RCV_TRAILER_MASK | MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY);
 
        if (option & MACH_RCV_MSG) {
                addr = (mach_vm_address_t) kn->kn_ext[0];
@@ -932,13 +1082,12 @@ filt_machportprocess(
                 * If the kevent didn't specify a buffer and length, carve a buffer
                 * from the filter processing data according to the flags.
                 */
-               if (size == 0 && process_data != NULL) {
-                       used_filtprocess_data = TRUE;
-
-                       addr = (mach_vm_address_t)process_data->fp_data_out;
-                       size = (mach_msg_size_t)process_data->fp_data_resid;
+               if (size == 0) {
+                       kectx = kevent_get_context(self);
+                       addr  = (mach_vm_address_t)kectx->kec_data_out;
+                       size  = (mach_msg_size_t)kectx->kec_data_resid;
                        option |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
-                       if (process_data->fp_flags & KEVENT_FLAG_STACK_DATA) {
+                       if (kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) {
                                option |= MACH_RCV_STACK;
                        }
                }
@@ -1037,16 +1186,15 @@ filt_machportprocess(
         * store the address used in the knote and adjust the residual and
         * other parameters for future use.
         */
-       if (used_filtprocess_data) {
-               assert(process_data->fp_data_resid >= size);
-               process_data->fp_data_resid -= size;
-               if ((process_data->fp_flags & KEVENT_FLAG_STACK_DATA) == 0) {
-                       kev->ext[0] = process_data->fp_data_out;
-                       process_data->fp_data_out += size;
+       if (kectx) {
+               assert(kectx->kec_data_resid >= size);
+               kectx->kec_data_resid -= size;
+               if ((kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) == 0) {
+                       kev->ext[0] = kectx->kec_data_out;
+                       kectx->kec_data_out += size;
                } else {
                        assert(option & MACH_RCV_STACK);
-                       kev->ext[0] = process_data->fp_data_out +
-                           process_data->fp_data_resid;
+                       kev->ext[0] = kectx->kec_data_out + kectx->kec_data_resid;
                }
        }
 
@@ -1081,7 +1229,7 @@ filt_machportprocess(
 static int
 filt_machportpeek(struct knote *kn)
 {
-       ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
+       ipc_mqueue_t mqueue = kn->kn_mqueue;
 
        return ipc_mqueue_set_peek(mqueue) ? FILTER_ACTIVE : 0;
 }
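These six callbacks make up the EVFILT_MACHPORT implementation. Elsewhere in this file they are wired into the kevent machinery through a struct filterops table; it plausibly looks like the sketch below (field names as in bsd/sys/event.h of this vintage; illustrative, not quoted from this commit):

        const struct filterops machport_filtops = {
                .f_adjusts_qos    = true,   /* filter may adjust knote QoS (MACH_RCV_MSG) */
                .f_extended_codes = true,   /* callbacks return FILTER_* action codes */
                .f_attach  = filt_machportattach,
                .f_detach  = filt_machportdetach,
                .f_event   = filt_machportevent,
                .f_touch   = filt_machporttouch,
                .f_process = filt_machportprocess,
                .f_peek    = filt_machportpeek,
        };
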
index f0e5df94231350155527d0519c6b4be46427244e..efdcbcf8193ea08e1d6abbee24d77a60725b1f60 100644 (file)
@@ -84,13 +84,15 @@ struct ipc_pset {
 
 #define ips_references          ips_object.io_references
 
-#define ips_active(pset)        io_active(&(pset)->ips_object)
-#define ips_lock(pset)          io_lock(&(pset)->ips_object)
-#define ips_lock_try(pset)      io_lock_try(&(pset)->ips_object)
-#define ips_lock_held_kdp(pset) io_lock_held_kdp(&(pset)->ips_object)
-#define ips_unlock(pset)        io_unlock(&(pset)->ips_object)
-#define ips_reference(pset)     io_reference(&(pset)->ips_object)
-#define ips_release(pset)       io_release(&(pset)->ips_object)
+#define ips_object_to_pset(io)  __container_of(io, struct ipc_pset, ips_object)
+#define ips_to_object(pset)     (&(pset)->ips_object)
+#define ips_active(pset)        io_active(ips_to_object(pset))
+#define ips_lock(pset)          io_lock(ips_to_object(pset))
+#define ips_lock_try(pset)      io_lock_try(ips_to_object(pset))
+#define ips_lock_held_kdp(pset) io_lock_held_kdp(ips_to_object(pset))
+#define ips_unlock(pset)        io_unlock(ips_to_object(pset))
+#define ips_reference(pset)     io_reference(ips_to_object(pset))
+#define ips_release(pset)       io_release(ips_to_object(pset))
 
 /* get an ipc_pset pointer from an ipc_mqueue pointer */
 #define ips_from_mq(mq) \
@@ -144,11 +146,11 @@ extern void ipc_pset_destroy(
        ipc_pset_t      pset);
 
 #if MACH_KERNEL_PRIVATE
-extern struct turnstile *filt_machport_kqueue_turnstile(
+extern struct turnstile *filt_ipc_kqueue_turnstile(
+       struct knote *kn);
+bool
+filt_machport_kqueue_has_turnstile(
        struct knote *kn);
-
-extern struct turnstile *filt_machport_stashed_special_reply_port_turnstile(
-       ipc_port_t port);
 
 extern void filt_machport_turnstile_prepare_lazily(
        struct knote *kn,
index 08a79c30033b94cab239c5349871a21d3d5be93b..00a256e177fa5ddd57bbb21b392495cbe90e0eb1 100644 (file)
@@ -204,7 +204,7 @@ ipc_right_reverse(
        assert(is_active(space));
        assert(io_otype(object) == IOT_PORT);
 
-       port = (ipc_port_t) object;
+       port = ip_object_to_port(object);
 
        ip_lock(port);
        if (!ip_active(port)) {
@@ -221,17 +221,17 @@ ipc_right_reverse(
 
                assert(entry != IE_NULL);
                assert(entry->ie_bits & MACH_PORT_TYPE_RECEIVE);
-               assert(port == (ipc_port_t) entry->ie_object);
+               assert(port == ip_object_to_port(entry->ie_object));
 
                *namep = name;
                *entryp = entry;
                return TRUE;
        }
 
-       if (ipc_hash_lookup(space, (ipc_object_t) port, namep, entryp)) {
+       if (ipc_hash_lookup(space, ip_to_object(port), namep, entryp)) {
                assert((entry = *entryp) != IE_NULL);
                assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_SEND);
-               assert(port == (ipc_port_t) entry->ie_object);
+               assert(port == ip_object_to_port(entry->ie_object));
 
                return TRUE;
        }
@@ -301,7 +301,7 @@ ipc_right_request_alloc(
                if (entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) {
                        ipc_port_request_index_t new_request;
 
-                       port = (ipc_port_t) entry->ie_object;
+                       port = ip_object_to_port(entry->ie_object);
                        assert(port != IP_NULL);
 
                        if (!ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
@@ -448,8 +448,8 @@ ipc_right_request_cancel(
 {
        ipc_port_t previous;
 
-       assert(ip_active(port));
-       assert(port == (ipc_port_t) entry->ie_object);
+       require_ip_active(port);
+       assert(port == ip_object_to_port(entry->ie_object));
 
        if (entry->ie_request == IE_REQ_NONE) {
                return IP_NULL;
@@ -511,7 +511,7 @@ ipc_right_check(
        ipc_entry_bits_t bits;
 
        assert(is_active(space));
-       assert(port == (ipc_port_t) entry->ie_object);
+       assert(port == ip_object_to_port(entry->ie_object));
 
        ip_lock(port);
        if (ip_active(port) ||
@@ -545,7 +545,7 @@ ipc_right_check(
         */
 
        if ((bits & MACH_PORT_TYPE_SEND) != 0) {
-               ipc_hash_delete(space, (ipc_object_t)port, name, entry);
+               ipc_hash_delete(space, ip_to_object(port), name, entry);
        }
 
        /* convert entry to dead name */
@@ -622,7 +622,7 @@ ipc_right_terminate(
                break;
 
        case MACH_PORT_TYPE_PORT_SET: {
-               ipc_pset_t pset = (ipc_pset_t) entry->ie_object;
+               ipc_pset_t pset = ips_object_to_pset(entry->ie_object);
 
                assert(entry->ie_request == IE_REQ_NONE);
                assert(pset != IPS_NULL);
@@ -637,7 +637,7 @@ ipc_right_terminate(
        case MACH_PORT_TYPE_RECEIVE:
        case MACH_PORT_TYPE_SEND_RECEIVE:
        case MACH_PORT_TYPE_SEND_ONCE: {
-               ipc_port_t port = (ipc_port_t) entry->ie_object;
+               ipc_port_t port = ip_object_to_port(entry->ie_object);
                ipc_port_t request;
                ipc_port_t nsrequest = IP_NULL;
                mach_port_mscount_t mscount = 0;
@@ -673,6 +673,7 @@ ipc_right_terminate(
                        ipc_port_destroy(port); /* clears receiver, consumes our ref, unlocks */
                } else if (type & MACH_PORT_TYPE_SEND_ONCE) {
                        assert(port->ip_sorights > 0);
+                       port->ip_reply_context = 0;
                        ip_unlock(port);
 
                        ipc_notify_send_once(port); /* consumes our ref */
@@ -736,7 +737,7 @@ ipc_right_destroy(
                break;
 
        case MACH_PORT_TYPE_PORT_SET: {
-               ipc_pset_t pset = (ipc_pset_t) entry->ie_object;
+               ipc_pset_t pset = ips_object_to_pset(entry->ie_object);
 
                assert(entry->ie_request == IE_REQ_NONE);
                assert(pset != IPS_NULL);
@@ -756,7 +757,7 @@ ipc_right_destroy(
        case MACH_PORT_TYPE_RECEIVE:
        case MACH_PORT_TYPE_SEND_RECEIVE:
        case MACH_PORT_TYPE_SEND_ONCE: {
-               ipc_port_t port = (ipc_port_t) entry->ie_object;
+               ipc_port_t port = ip_object_to_port(entry->ie_object);
                ipc_port_t nsrequest = IP_NULL;
                mach_port_mscount_t mscount = 0;
                ipc_port_t request;
@@ -764,8 +765,7 @@ ipc_right_destroy(
                assert(port != IP_NULL);
 
                if (type == MACH_PORT_TYPE_SEND) {
-                       ipc_hash_delete(space, (ipc_object_t) port,
-                           name, entry);
+                       ipc_hash_delete(space, ip_to_object(port), name, entry);
                }
 
                ip_lock(port);
@@ -813,12 +813,13 @@ ipc_right_destroy(
                }
 
                if (type & MACH_PORT_TYPE_RECEIVE) {
-                       assert(ip_active(port));
+                       require_ip_active(port);
                        assert(port->ip_receiver == space);
 
                        ipc_port_destroy(port); /* clears receiver, consumes our ref, unlocks */
                } else if (type & MACH_PORT_TYPE_SEND_ONCE) {
                        assert(port->ip_sorights > 0);
+                       port->ip_reply_context = 0;
                        ip_unlock(port);
 
                        ipc_notify_send_once(port); /* consumes our ref */
@@ -885,7 +886,7 @@ ipc_right_dealloc(
                assert(IE_BITS_UREFS(bits) == 0);
                assert(entry->ie_request == IE_REQ_NONE);
 
-               pset = (ipc_pset_t) entry->ie_object;
+               pset = ips_object_to_pset(entry->ie_object);
                assert(pset != IPS_NULL);
 
                entry->ie_object = IO_NULL;
@@ -929,7 +930,7 @@ dead_name:
 
                assert(IE_BITS_UREFS(bits) == 1);
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
@@ -941,6 +942,13 @@ dead_name:
 
                assert(port->ip_sorights > 0);
 
+               /*
+                * clear any reply context:
+                * no one will be sending the response because we are destroying
+                * the single, outstanding send once right.
+                */
+               port->ip_reply_context = 0;
+
                request = ipc_right_request_cancel_macro(space, port, name, entry);
                ip_unlock(port);
 
@@ -965,7 +973,7 @@ dead_name:
 
                assert(IE_BITS_UREFS(bits) > 0);
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
@@ -988,8 +996,7 @@ dead_name:
 
                        request = ipc_right_request_cancel_macro(space, port,
                            name, entry);
-                       ipc_hash_delete(space, (ipc_object_t) port,
-                           name, entry);
+                       ipc_hash_delete(space, ip_to_object(port), name, entry);
 
                        ip_unlock(port);
                        entry->ie_object = IO_NULL;
@@ -1022,11 +1029,11 @@ dead_name:
 
                assert(IE_BITS_UREFS(bits) > 0);
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                ip_lock(port);
-               assert(ip_active(port));
+               require_ip_active(port);
                assert(port->ip_receiver_name == name);
                assert(port->ip_receiver == space);
                assert(port->ip_srights > 0);
@@ -1129,7 +1136,7 @@ ipc_right_delta(
                        goto invalid_value;
                }
 
-               pset = (ipc_pset_t) entry->ie_object;
+               pset = ips_object_to_pset(entry->ie_object);
                assert(pset != IPS_NULL);
 
                entry->ie_object = IO_NULL;
@@ -1147,7 +1154,9 @@ ipc_right_delta(
                ipc_port_t request = IP_NULL;
 
                if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) {
-                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
+                       if ((bits & MACH_PORT_TYPE_EX_RECEIVE) == 0) {
+                               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
+                       }
                        goto invalid_right;
                }
 
@@ -1159,7 +1168,7 @@ ipc_right_delta(
                        goto invalid_value;
                }
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                /*
@@ -1169,7 +1178,7 @@ ipc_right_delta(
                 */
 
                ip_lock(port);
-               assert(ip_active(port));
+               require_ip_active(port);
                assert(port->ip_receiver_name == name);
                assert(port->ip_receiver == space);
 
@@ -1202,7 +1211,8 @@ ipc_right_delta(
                                 */
                                ipc_entry_modified(space, name, entry);
                                entry->ie_bits &= ~MACH_PORT_TYPE_RECEIVE;
-                               ipc_hash_insert(space, (ipc_object_t) port,
+                               entry->ie_bits |= MACH_PORT_TYPE_EX_RECEIVE;
+                               ipc_hash_insert(space, ip_to_object(port),
                                    name, entry);
                                ip_reference(port);
                        } else {
@@ -1214,7 +1224,7 @@ ipc_right_delta(
                                 *      port is destroyed "first".
                                 */
                                bits &= ~IE_BITS_TYPE_MASK;
-                               bits |= MACH_PORT_TYPE_DEAD_NAME;
+                               bits |= (MACH_PORT_TYPE_DEAD_NAME | MACH_PORT_TYPE_EX_RECEIVE);
                                if (entry->ie_request) {
                                        entry->ie_request = IE_REQ_NONE;
                                        /* if urefs are pegged due to overflow, leave them pegged */
@@ -1255,7 +1265,7 @@ ipc_right_delta(
                assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_ONCE);
                assert(IE_BITS_UREFS(bits) == 1);
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
@@ -1277,6 +1287,13 @@ ipc_right_delta(
                        goto success;
                }
 
+               /*
+                * clear any reply context:
+                * no one will be sending the response b/c we are destroying
+                * the single, outstanding send once right.
+                */
+               port->ip_reply_context = 0;
+
                request = ipc_right_request_cancel_macro(space, port, name, entry);
                ip_unlock(port);
 
@@ -1298,7 +1315,7 @@ ipc_right_delta(
                mach_port_urefs_t urefs;
 
                if (bits & MACH_PORT_TYPE_SEND_RIGHTS) {
-                       port = (ipc_port_t) entry->ie_object;
+                       port = ip_object_to_port(entry->ie_object);
                        assert(port != IP_NULL);
 
                        if (!ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
@@ -1372,7 +1389,15 @@ ipc_right_delta(
 
                if ((bits & MACH_PORT_TYPE_SEND) == 0) {
                        /* invalid right exception only when not live/dead confusion */
-                       if ((bits & MACH_PORT_TYPE_DEAD_NAME) == 0) {
+                       if ((bits & MACH_PORT_TYPE_DEAD_NAME) == 0
+#if !defined(AE_MAKESENDRIGHT_FIXED)
+                           /*
+                           * AE tries to add a single send right without knowing if it already owns one.
+                            * But if it doesn't, it should own the receive right and delta should be 1.
+                            */
+                           && (((bits & MACH_PORT_TYPE_RECEIVE) == 0) || (delta != 1))
+#endif
+                           ) {
                                mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                        }
                        goto invalid_right;
@@ -1380,7 +1405,7 @@ ipc_right_delta(
 
                /* maximum urefs for send is MACH_PORT_UREFS_MAX */
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
@@ -1444,7 +1469,7 @@ ipc_right_delta(
 
                                request = ipc_right_request_cancel_macro(space, port,
                                    name, entry);
-                               ipc_hash_delete(space, (ipc_object_t) port,
+                               ipc_hash_delete(space, ip_to_object(port),
                                    name, entry);
 
                                ip_unlock(port);
@@ -1541,8 +1566,17 @@ ipc_right_destruct(
 
        assert(is_active(space));
 
-       if (((bits & MACH_PORT_TYPE_RECEIVE) == 0) ||
-           (srdelta && ((bits & MACH_PORT_TYPE_SEND) == 0))) {
+       if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) {
+               is_write_unlock(space);
+
+               /* No exception if we used to have receive and held entry since */
+               if ((bits & MACH_PORT_TYPE_EX_RECEIVE) == 0) {
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
+               }
+               return KERN_INVALID_RIGHT;
+       }
+
+       if (srdelta && (bits & MACH_PORT_TYPE_SEND) == 0) {
                is_write_unlock(space);
                mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                return KERN_INVALID_RIGHT;
@@ -1552,11 +1586,11 @@ ipc_right_destruct(
                goto invalid_value;
        }
 
-       port = (ipc_port_t) entry->ie_object;
+       port = ip_object_to_port(entry->ie_object);
        assert(port != IP_NULL);
 
        ip_lock(port);
-       assert(ip_active(port));
+       require_ip_active(port);
        assert(port->ip_receiver_name == name);
        assert(port->ip_receiver == space);
 
@@ -1639,7 +1673,8 @@ ipc_right_destruct(
                         */
                        ipc_entry_modified(space, name, entry);
                        entry->ie_bits &= ~MACH_PORT_TYPE_RECEIVE;
-                       ipc_hash_insert(space, (ipc_object_t) port,
+                       entry->ie_bits |= MACH_PORT_TYPE_EX_RECEIVE;
+                       ipc_hash_insert(space, ip_to_object(port),
                            name, entry);
                        ip_reference(port);
                } else {
@@ -1651,7 +1686,7 @@ ipc_right_destruct(
                         *      port is destroyed "first".
                         */
                        bits &= ~IE_BITS_TYPE_MASK;
-                       bits |= MACH_PORT_TYPE_DEAD_NAME;
+                       bits |= (MACH_PORT_TYPE_DEAD_NAME | MACH_PORT_TYPE_EX_RECEIVE);
                        if (entry->ie_request) {
                                entry->ie_request = IE_REQ_NONE;
                                if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) {
@@ -1719,14 +1754,14 @@ ipc_right_info(
 
        bits = entry->ie_bits;
        request = entry->ie_request;
-       port = (ipc_port_t) entry->ie_object;
+       port = ip_object_to_port(entry->ie_object);
 
        if (bits & MACH_PORT_TYPE_RECEIVE) {
                assert(IP_VALID(port));
 
                if (request != IE_REQ_NONE) {
                        ip_lock(port);
-                       assert(ip_active(port));
+                       require_ip_active(port);
                        type |= ipc_port_request_type(port, name, request);
                        ip_unlock(port);
                }
@@ -1761,27 +1796,29 @@ ipc_right_info(
 }
 
 /*
- *     Routine:        ipc_right_copyin_check
+ *     Routine:        ipc_right_copyin_check_reply
  *     Purpose:
- *             Check if a subsequent ipc_right_copyin would succeed.
+ *             Check if a subsequent ipc_right_copyin would succeed. Used only
+ *             by ipc_kmsg_copyin_header to check if reply_port can be copied in.
+ *             If the reply port is an immovable send right, it errors out.
  *     Conditions:
  *             The space is locked (read or write) and active.
  */
 
 boolean_t
-ipc_right_copyin_check(
+ipc_right_copyin_check_reply(
        __assert_only ipc_space_t       space,
-       __unused mach_port_name_t       name,
-       ipc_entry_t                     entry,
-       mach_msg_type_name_t            msgt_name)
+       mach_port_name_t                reply_name,
+       ipc_entry_t                     reply_entry,
+       mach_msg_type_name_t            reply_type)
 {
        ipc_entry_bits_t bits;
-       ipc_port_t port;
+       ipc_port_t reply_port;
 
-       bits = entry->ie_bits;
+       bits = reply_entry->ie_bits;
        assert(is_active(space));
 
-       switch (msgt_name) {
+       switch (reply_type) {
        case MACH_MSG_TYPE_MAKE_SEND:
                if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) {
                        return FALSE;
@@ -1795,17 +1832,8 @@ ipc_right_copyin_check(
                break;
 
        case MACH_MSG_TYPE_MOVE_RECEIVE:
-               if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) {
-                       return FALSE;
-               }
-               if (io_kotype(entry->ie_object) != IKOT_NONE) {
-                       return FALSE;
-               }
-               port = (ipc_port_t) entry->ie_object;
-               if (port->ip_specialreply) {
-                       return FALSE;
-               }
-               break;
+               /* ipc_kmsg_copyin_header already filters it out */
+               return FALSE;
 
        case MACH_MSG_TYPE_COPY_SEND:
        case MACH_MSG_TYPE_MOVE_SEND:
@@ -1818,19 +1846,29 @@ ipc_right_copyin_check(
                        return FALSE;
                }
 
-               port = (ipc_port_t) entry->ie_object;
-               assert(port != IP_NULL);
+               reply_port = ip_object_to_port(reply_entry->ie_object);
+               assert(reply_port != IP_NULL);
 
                /*
                 * active status peek to avoid checks that will be skipped
                 * on copyin for dead ports.  Lock not held, so will not be
                 * atomic (but once dead, there's no going back).
                 */
-               if (!ip_active(port)) {
+               if (!ip_active(reply_port)) {
                        break;
                }
 
-               if (msgt_name == MACH_MSG_TYPE_MOVE_SEND_ONCE) {
+               /*
+                * Can't copyin a send right that is marked immovable. This bit
+                * is set only during port creation and never unset, so it can
+                * be read without a lock.
+                */
+               if (reply_port->ip_immovable_send) {
+                       mach_port_guard_exception(reply_name, 0, 0, kGUARD_EXC_IMMOVABLE);
+                       return FALSE;
+               }
+
+               if (reply_type == MACH_MSG_TYPE_MOVE_SEND_ONCE) {
                        if ((bits & MACH_PORT_TYPE_SEND_ONCE) == 0) {
                                return FALSE;
                        }
@@ -1850,6 +1888,40 @@ ipc_right_copyin_check(
        return TRUE;
 }
 
+/*
+ *     Routine:        ipc_right_copyin_check_guard_locked
+ *     Purpose:
+ *             Check if the port is guarded and the guard
+ *             value matches the one passed in the arguments.
+ *             If MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND is set,
+ *             check if the port is unguarded.
+ *     Conditions:
+ *             The port is locked.
+ *     Returns:
+ *             KERN_SUCCESS            Port is either unguarded
+ *                                     or guarded with expected value
+ *             KERN_INVALID_ARGUMENT   Port is either already unguarded or the guard does not match.
+ *                                     This also raises an EXC_GUARD exception.
+ */
+static kern_return_t
+ipc_right_copyin_check_guard_locked(
+       mach_port_name_t name,
+       ipc_port_t port,
+       mach_port_context_t context,
+       mach_msg_guard_flags_t *guard_flags)
+{
+       mach_msg_guard_flags_t flags = *guard_flags;
+       if ((flags & MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND) && !port->ip_guarded && !context) {
+               return KERN_SUCCESS;
+       } else if (port->ip_guarded && (port->ip_context == context)) {
+               return KERN_SUCCESS;
+       }
+
+       /* Incorrect guard; Raise exception */
+       mach_port_guard_exception(name, context, port->ip_context, kGUARD_EXC_INCORRECT_GUARD);
+       return KERN_INVALID_ARGUMENT;
+}
+
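The check above reduces to a small decision over (flags, ip_guarded, ip_context). A pure-function sketch of the same logic with demo types (DEMO_UNGUARDED_ON_SEND models MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND):

        #include <stdbool.h>
        #include <stdint.h>

        #define DEMO_UNGUARDED_ON_SEND  0x1u

        /* true => copyin may proceed; false => the caller raises
         * kGUARD_EXC_INCORRECT_GUARD, as in the routine above. */
        static bool
        demo_guard_check(uint32_t flags, bool guarded,
            uint64_t port_context, uint64_t caller_context)
        {
                if ((flags & DEMO_UNGUARDED_ON_SEND) && !guarded && caller_context == 0) {
                        return true;    /* sender declared the port unguarded */
                }
                if (guarded && port_context == caller_context) {
                        return true;    /* guard value matches */
                }
                return false;
        }
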
 /*
  *     Routine:        ipc_right_copyin
  *     Purpose:
@@ -1871,26 +1943,32 @@ ipc_right_copyin_check(
  *     Returns:
  *             KERN_SUCCESS            Acquired an object, possibly IO_DEAD.
  *             KERN_INVALID_RIGHT      Name doesn't denote correct right.
+ *             KERN_INVALID_CAPABILITY Trying to move a kobject port or an immovable right
+ *             KERN_INVALID_ARGUMENT   Port is unguarded or guard mismatch
  */
 
 kern_return_t
 ipc_right_copyin(
-       ipc_space_t              space,
-       mach_port_name_t         name,
-       ipc_entry_t              entry,
-       mach_msg_type_name_t     msgt_name,
-       ipc_right_copyin_flags_t flags,
-       ipc_object_t             *objectp,
-       ipc_port_t               *sorightp,
-       ipc_port_t               *releasep,
-       int                      *assertcntp)
+       ipc_space_t                space,
+       mach_port_name_t           name,
+       ipc_entry_t                entry,
+       mach_msg_type_name_t       msgt_name,
+       ipc_right_copyin_flags_t   flags,
+       ipc_object_t               *objectp,
+       ipc_port_t                 *sorightp,
+       ipc_port_t                 *releasep,
+       int                        *assertcntp,
+       mach_port_context_t        context,
+       mach_msg_guard_flags_t     *guard_flags)
 {
        ipc_entry_bits_t bits;
        ipc_port_t port;
+       kern_return_t kr;
+       boolean_t deadok = (flags & IPC_RIGHT_COPYIN_FLAGS_DEADOK) ? TRUE : FALSE;
+       boolean_t allow_imm_send = (flags & IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND) ? TRUE : FALSE;
 
        *releasep = IP_NULL;
        *assertcntp = 0;
-       boolean_t deadok = (flags & IPC_RIGHT_COPYIN_FLAGS_DEADOK) ? TRUE : FALSE;
 
        bits = entry->ie_bits;
 
@@ -1902,20 +1980,17 @@ ipc_right_copyin(
                        goto invalid_right;
                }
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                ip_lock(port);
-               assert(ip_active(port));
                assert(port->ip_receiver_name == name);
                assert(port->ip_receiver == space);
 
-               port->ip_mscount++;
-               port->ip_srights++;
-               ip_reference(port);
+               ipc_port_make_send_locked(port);
                ip_unlock(port);
 
-               *objectp = (ipc_object_t) port;
+               *objectp = ip_to_object(port);
                *sorightp = IP_NULL;
                break;
        }
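The open-coded reference bumps removed in this case are folded into helpers. Judging from the deleted lines, ipc_port_make_send_locked() is equivalent to the following (sketch; the port must be locked):

        /* Equivalent of the deleted open-coded sequence (port locked). */
        static inline void
        demo_make_send_locked(ipc_port_t port)
        {
                port->ip_mscount++;     /* make-send count */
                port->ip_srights++;     /* outstanding send rights */
                ip_reference(port);     /* each right holds an object reference */
        }

Per the other deleted sites below, ipc_port_copy_send_locked() differs only in not bumping ip_mscount, and ipc_port_make_sonce_locked() bumps ip_sorights instead.
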
@@ -1925,19 +2000,18 @@ ipc_right_copyin(
                        goto invalid_right;
                }
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                ip_lock(port);
-               assert(ip_active(port));
+               require_ip_active(port);
                assert(port->ip_receiver_name == name);
                assert(port->ip_receiver == space);
 
-               port->ip_sorights++;
-               ip_reference(port);
+               ipc_port_make_sonce_locked(port);
                ip_unlock(port);
 
-               *objectp = (ipc_object_t) port;
+               *objectp = ip_to_object(port);
                *sorightp = IP_NULL;
                break;
        }
@@ -1963,24 +2037,41 @@ ipc_right_copyin(
                         * situation which is, "This is a valid receive right,
                         * but it's also a kobject and you can't move it."
                         */
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE);
                        return KERN_INVALID_CAPABILITY;
                }
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                ip_lock(port);
-               assert(ip_active(port));
+               require_ip_active(port);
                assert(port->ip_receiver_name == name);
                assert(port->ip_receiver == space);
 
+               if (port->ip_immovable_receive) {
+                       assert(port->ip_receiver != ipc_space_kernel);
+                       ip_unlock(port);
+                       assert(current_task() != kernel_task);
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE);
+                       return KERN_INVALID_CAPABILITY;
+               }
+
+               if (guard_flags != NULL) {
+                       kr = ipc_right_copyin_check_guard_locked(name, port, context, guard_flags);
+                       if (kr != KERN_SUCCESS) {
+                               ip_unlock(port);
+                               return kr;
+                       }
+               }
+
                if (bits & MACH_PORT_TYPE_SEND) {
                        assert(IE_BITS_TYPE(bits) ==
                            MACH_PORT_TYPE_SEND_RECEIVE);
                        assert(IE_BITS_UREFS(bits) > 0);
                        assert(port->ip_srights > 0);
 
-                       ipc_hash_insert(space, (ipc_object_t) port,
+                       ipc_hash_insert(space, ip_to_object(port),
                            name, entry);
                        ip_reference(port);
                } else {
@@ -1992,9 +2083,15 @@ ipc_right_copyin(
                        entry->ie_object = IO_NULL;
                }
                entry->ie_bits = bits & ~MACH_PORT_TYPE_RECEIVE;
+               entry->ie_bits |= MACH_PORT_TYPE_EX_RECEIVE;
                ipc_entry_modified(space, name, entry);
 
+               /* ipc_port_clear_receiver unguards the port and clears the ip_immovable_receive bit */
                (void)ipc_port_clear_receiver(port, FALSE); /* don't destroy the port/mqueue */
+               if (guard_flags != NULL) {
+                       /* this flag will be cleared during copyout */
+                       *guard_flags = *guard_flags | MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND;
+               }
 
 #if IMPORTANCE_INHERITANCE
                /*
@@ -2017,7 +2114,7 @@ ipc_right_copyin(
 
                ip_unlock(port);
 
-               *objectp = (ipc_object_t) port;
+               *objectp = ip_to_object(port);
                *sorightp = request;
                break;
        }
@@ -2035,7 +2132,7 @@ ipc_right_copyin(
 
                assert(IE_BITS_UREFS(bits) > 0);
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
@@ -2053,13 +2150,16 @@ ipc_right_copyin(
                        goto invalid_right;
                }
 
-               assert(port->ip_srights > 0);
+               if (!allow_imm_send && port->ip_immovable_send) {
+                       ip_unlock(port);
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE);
+                       return KERN_INVALID_CAPABILITY;
+               }
 
-               port->ip_srights++;
-               ip_reference(port);
+               ipc_port_copy_send_locked(port);
                ip_unlock(port);
 
-               *objectp = (ipc_object_t) port;
+               *objectp = ip_to_object(port);
                *sorightp = IP_NULL;
                break;
        }
@@ -2079,7 +2179,7 @@ ipc_right_copyin(
 
                assert(IE_BITS_UREFS(bits) > 0);
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
@@ -2097,9 +2197,14 @@ ipc_right_copyin(
                        goto invalid_right;
                }
 
-               assert(port->ip_srights > 0);
+               if (!allow_imm_send && port->ip_immovable_send) {
+                       ip_unlock(port);
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE);
+                       return KERN_INVALID_CAPABILITY;
+               }
 
                if (IE_BITS_UREFS(bits) == 1) {
+                       assert(port->ip_srights > 0);
                        if (bits & MACH_PORT_TYPE_RECEIVE) {
                                assert(port->ip_receiver_name == name);
                                assert(port->ip_receiver == space);
@@ -2113,7 +2218,7 @@ ipc_right_copyin(
 
                                request = ipc_right_request_cancel_macro(space, port,
                                    name, entry);
-                               ipc_hash_delete(space, (ipc_object_t) port,
+                               ipc_hash_delete(space, ip_to_object(port),
                                    name, entry);
                                entry->ie_object = IO_NULL;
                                /* transfer entry's reference to caller */
@@ -2121,8 +2226,7 @@ ipc_right_copyin(
                        entry->ie_bits = bits & ~
                            (IE_BITS_UREFS_MASK | MACH_PORT_TYPE_SEND);
                } else {
-                       port->ip_srights++;
-                       ip_reference(port);
+                       ipc_port_copy_send_locked(port);
                        /* if urefs are pegged due to overflow, leave them pegged */
                        if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) {
                                entry->ie_bits = bits - 1; /* decrement urefs */
@@ -2132,7 +2236,7 @@ ipc_right_copyin(
                ipc_entry_modified(space, name, entry);
                ip_unlock(port);
 
-               *objectp = (ipc_object_t) port;
+               *objectp = ip_to_object(port);
                *sorightp = request;
                break;
        }
@@ -2152,7 +2256,7 @@ ipc_right_copyin(
 
                assert(IE_BITS_UREFS(bits) > 0);
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
                if (ipc_right_check(space, port, name, entry, flags)) {
@@ -2175,6 +2279,12 @@ ipc_right_copyin(
                        goto invalid_right;
                }
 
+               if (!allow_imm_send && port->ip_immovable_send) {
+                       ip_unlock(port);
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE);
+                       return KERN_INVALID_CAPABILITY;
+               }
+
                assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_ONCE);
                assert(IE_BITS_UREFS(bits) == 1);
                assert(port->ip_sorights > 0);
@@ -2186,7 +2296,7 @@ ipc_right_copyin(
                entry->ie_bits = bits & ~
                    (IE_BITS_UREFS_MASK | MACH_PORT_TYPE_SEND_ONCE);
                ipc_entry_modified(space, name, entry);
-               *objectp = (ipc_object_t) port;
+               *objectp = ip_to_object(port);
                *sorightp = request;
                break;
        }
@@ -2235,93 +2345,6 @@ move_dead:
        return KERN_SUCCESS;
 }
 
-/*
- *     Routine:        ipc_right_copyin_undo
- *     Purpose:
- *             Undoes the effects of an ipc_right_copyin
- *             of a send/send-once right that is dead.
- *             (Object is either IO_DEAD or a dead port.)
- *     Conditions:
- *             The space is write-locked and active.
- */
-
-void
-ipc_right_copyin_undo(
-       ipc_space_t             space,
-       mach_port_name_t        name,
-       ipc_entry_t             entry,
-       mach_msg_type_name_t    msgt_name,
-       ipc_object_t            object,
-       ipc_port_t              soright)
-{
-       ipc_entry_bits_t bits;
-
-       bits = entry->ie_bits;
-
-       assert(is_active(space));
-
-       assert((msgt_name == MACH_MSG_TYPE_MOVE_SEND) ||
-           (msgt_name == MACH_MSG_TYPE_COPY_SEND) ||
-           (msgt_name == MACH_MSG_TYPE_MOVE_SEND_ONCE));
-
-       if (soright != IP_NULL) {
-               assert((msgt_name == MACH_MSG_TYPE_MOVE_SEND) ||
-                   (msgt_name == MACH_MSG_TYPE_MOVE_SEND_ONCE));
-               assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE);
-               assert(object != IO_DEAD);
-
-               entry->ie_bits = ((bits & ~IE_BITS_RIGHT_MASK) |
-                   MACH_PORT_TYPE_DEAD_NAME | 2);
-       } else if (IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE) {
-               assert((msgt_name == MACH_MSG_TYPE_MOVE_SEND) ||
-                   (msgt_name == MACH_MSG_TYPE_MOVE_SEND_ONCE));
-
-               entry->ie_bits = ((bits & ~IE_BITS_RIGHT_MASK) |
-                   MACH_PORT_TYPE_DEAD_NAME | 1);
-       } else if (IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME) {
-               assert(object == IO_DEAD);
-               assert(IE_BITS_UREFS(bits) > 0);
-
-               if (msgt_name != MACH_MSG_TYPE_COPY_SEND) {
-                       assert(IE_BITS_UREFS(bits) <= MACH_PORT_UREFS_MAX);
-                       /* if urefs are pegged due to overflow, leave them pegged */
-                       if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) {
-                               entry->ie_bits = bits + 1; /* increment urefs */
-                       }
-               }
-       } else {
-               assert((msgt_name == MACH_MSG_TYPE_MOVE_SEND) ||
-                   (msgt_name == MACH_MSG_TYPE_COPY_SEND));
-               assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND);
-               assert(object != IO_DEAD);
-               assert(entry->ie_object == object);
-               assert(IE_BITS_UREFS(bits) > 0);
-
-               if (msgt_name != MACH_MSG_TYPE_COPY_SEND) {
-                       assert(IE_BITS_UREFS(bits) <= MACH_PORT_UREFS_MAX);
-                       /* if urefs are pegged due to overflow, leave them pegged */
-                       if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) {
-                               entry->ie_bits = bits + 1; /* increment urefs */
-                       }
-               }
-
-               /*
-                *      May as well convert the entry to a dead name.
-                *      (Or if it is a compat entry, destroy it.)
-                */
-
-               (void) ipc_right_check(space, (ipc_port_t) object,
-                   name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE);
-               /* object is dead so it is not locked */
-       }
-       ipc_entry_modified(space, name, entry);
-       /* release the reference acquired by copyin */
-
-       if (object != IO_DEAD) {
-               io_release(object);
-       }
-}
-
 /*
  *     Routine:        ipc_right_copyin_two_move_sends
  *     Purpose:
@@ -2365,7 +2388,7 @@ ipc_right_copyin_two_move_sends(
                goto invalid_right;
        }
 
-       port = (ipc_port_t) entry->ie_object;
+       port = ip_object_to_port(entry->ie_object);
        assert(port != IP_NULL);
 
        if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
@@ -2374,45 +2397,57 @@ ipc_right_copyin_two_move_sends(
        }
        /* port is locked and active */
 
-       assert(port->ip_srights > 0);
+       if (urefs > 2) {
+               /*
+                * We are moving 2 urefs as naked send rights, which decomposes into:
+                * - two copy-sends (which do not affect the make-send count)
+                * - decrementing the local urefs twice.
+                */
+               ipc_port_copy_send_locked(port);
+               ipc_port_copy_send_locked(port);
+               /* if urefs are pegged due to overflow, leave them pegged */
+               if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) {
+                       entry->ie_bits = bits - 2; /* decrement urefs */
+               }
+       } else {
+               /*
+                * We have exactly 2 send rights for this port in this space,
+                * which means that we will liberate the naked send right held
+                * by this entry.
+                *
+                * However, the refcounting rule for entries is that naked send rights
+                * held on behalf of spaces do not carry an associated port reference,
+                * so we need to donate one ...
+                */
+               ipc_port_copy_send_locked(port);
 
-       if (urefs == 2) {
                if (bits & MACH_PORT_TYPE_RECEIVE) {
                        assert(port->ip_receiver_name == name);
                        assert(port->ip_receiver == space);
                        assert(IE_BITS_TYPE(bits) ==
                            MACH_PORT_TYPE_SEND_RECEIVE);
 
-                       port->ip_srights++;
-                       ip_reference(port);
+                       /* ... that we inject manually when the entry stays alive */
                        ip_reference(port);
                } else {
                        assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND);
 
+                       /* ... that we steal from the entry when it dies */
                        request = ipc_right_request_cancel_macro(space, port,
                            name, entry);
 
-                       port->ip_srights++;
-                       ip_reference(port);
-                       ipc_hash_delete(space, (ipc_object_t) port,
+                       ipc_hash_delete(space, ip_to_object(port),
                            name, entry);
                        entry->ie_object = IO_NULL;
                }
+
                entry->ie_bits = bits & ~(IE_BITS_UREFS_MASK | MACH_PORT_TYPE_SEND);
-       } else {
-               port->ip_srights += 2;
-               ip_reference(port);
-               ip_reference(port);
-               /* if urefs are pegged due to overflow, leave them pegged */
-               if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) {
-                       entry->ie_bits = bits - 2; /* decrement urefs */
-               }
        }
        ipc_entry_modified(space, name, entry);
 
        ip_unlock(port);
 
-       *objectp = (ipc_object_t) port;
+       *objectp = ip_to_object(port);
        *sorightp = request;
        return KERN_SUCCESS;
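
A worked example of the accounting in the two branches above, with hypothetical starting counts:

	/*
	 * urefs == 5, ip_srights == 1, two MOVE_SENDs requested:
	 *   urefs > 2 branch: two copy-sends leave ip_srights == 3 with two
	 *   new port references for the message; entry urefs drop 5 -> 3 and
	 *   the entry's own naked send right is untouched.
	 *
	 * urefs == 2 (the minimum for this path):
	 *   one copy-send leaves ip_srights == 2 with one new port reference;
	 *   the entry's existing naked send right becomes the message's
	 *   second right, and the reference behind it is either stolen from
	 *   the dying entry (pure SEND case) or donated by the explicit
	 *   ip_reference() (SEND_RECEIVE case, where the entry survives to
	 *   hold the receive right).
	 */
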
 
@@ -2430,6 +2465,7 @@ invalid_right:
  *     Conditions:
  *             The space is write-locked and active.
  *             The object is returned with two refs/rights.
+ *             msgt_one refers to the destination's disposition (dest_type).
  *     Returns:
  *             KERN_SUCCESS            Acquired an object.
  *             KERN_INVALID_RIGHT      Name doesn't denote correct right(s).
@@ -2437,14 +2473,14 @@ invalid_right:
  */
 kern_return_t
 ipc_right_copyin_two(
-       ipc_space_t             space,
-       mach_port_name_t        name,
-       ipc_entry_t             entry,
-       mach_msg_type_name_t    msgt_one,
-       mach_msg_type_name_t    msgt_two,
-       ipc_object_t            *objectp,
-       ipc_port_t              *sorightp,
-       ipc_port_t              *releasep)
+       ipc_space_t               space,
+       mach_port_name_t          name,
+       ipc_entry_t               entry,
+       mach_msg_type_name_t      msgt_one,
+       mach_msg_type_name_t      msgt_two,
+       ipc_object_t              *objectp,
+       ipc_port_t                *sorightp,
+       ipc_port_t                *releasep)
 {
        kern_return_t kr;
        int assertcnt = 0;
@@ -2452,14 +2488,6 @@ ipc_right_copyin_two(
        assert(MACH_MSG_TYPE_PORT_ANY_SEND(msgt_one));
        assert(MACH_MSG_TYPE_PORT_ANY_SEND(msgt_two));
 
-
-       /*
-        * Pre-validate the second disposition is possible all by itself.
-        */
-       if (!ipc_right_copyin_check(space, name, entry, msgt_two)) {
-               return KERN_INVALID_CAPABILITY;
-       }
-
        /*
         *      This is a little tedious to make atomic, because
         *      there are 25 combinations of valid dispositions.
@@ -2491,9 +2519,9 @@ ipc_right_copyin_two(
                ipc_object_t object_two;
 
                kr = ipc_right_copyin(space, name, entry,
-                   msgt_one, IPC_RIGHT_COPYIN_FLAGS_NONE,
+                   msgt_one, IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND,
                    objectp, sorightp, releasep,
-                   &assertcnt);
+                   &assertcnt, 0, NULL);
                assert(assertcnt == 0);
                if (kr != KERN_SUCCESS) {
                        return kr;
@@ -2512,7 +2540,7 @@ ipc_right_copyin_two(
                kr = ipc_right_copyin(space, name, entry,
                    msgt_two, IPC_RIGHT_COPYIN_FLAGS_NONE,
                    &object_two, sorightp, releasep,
-                   &assertcnt);
+                   &assertcnt, 0, NULL);
                assert(assertcnt == 0);
                assert(kr == KERN_SUCCESS);
                assert(*sorightp == IP_NULL);
@@ -2550,9 +2578,9 @@ ipc_right_copyin_two(
                }
 
                kr = ipc_right_copyin(space, name, entry,
-                   msgt_name, IPC_RIGHT_COPYIN_FLAGS_NONE,
+                   msgt_name, IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND,
                    objectp, sorightp, releasep,
-                   &assertcnt);
+                   &assertcnt, 0, NULL);
                assert(assertcnt == 0);
                if (kr != KERN_SUCCESS) {
                        return kr;
@@ -2563,7 +2591,7 @@ ipc_right_copyin_two(
                 *      that's OK.  Neither right will be usable to send
                 *      a message anyway.
                 */
-               (void)ipc_port_copy_send((ipc_port_t)*objectp);
+               (void)ipc_port_copy_send(ip_object_to_port(*objectp));
        }
 
        return KERN_SUCCESS;
@@ -2580,9 +2608,9 @@ ipc_right_copyin_two(
  *             because user-reference overflow isn't a possibility.
  *
  *             If copying out the object would cause the user-reference
- *             count in the entry to overflow, and overflow is TRUE,
- *             then instead the user-reference count is left pegged
- *             to its maximum value and the copyout succeeds anyway.
+ *             count in the entry to overflow, then the user-reference
+ *             count is left pegged to its maximum value and the copyout
+ *             succeeds anyway.
  *     Conditions:
  *             The space is write-locked and active.
  *             The object is locked and active.
@@ -2597,7 +2625,8 @@ ipc_right_copyout(
        mach_port_name_t        name,
        ipc_entry_t             entry,
        mach_msg_type_name_t    msgt_name,
-       __unused boolean_t      overflow,
+       mach_port_context_t     *context,
+       mach_msg_guard_flags_t  *guard_flags,
        ipc_object_t            object)
 {
        ipc_entry_bits_t bits;
@@ -2610,7 +2639,7 @@ ipc_right_copyout(
        assert(io_active(object));
        assert(entry->ie_object == object);
 
-       port = (ipc_port_t) object;
+       port = ip_object_to_port(object);
 
        switch (msgt_name) {
        case MACH_MSG_TYPE_PORT_SEND_ONCE:
@@ -2673,8 +2702,7 @@ ipc_right_copyout(
 
                        /* entry is locked holding ref, so can use port */
 
-                       ipc_hash_insert(space, (ipc_object_t) port,
-                           name, entry);
+                       ipc_hash_insert(space, ip_to_object(port), name, entry);
                }
 
                entry->ie_bits = (bits | MACH_PORT_TYPE_SEND) + 1; /* increment urefs */
@@ -2683,9 +2711,6 @@ ipc_right_copyout(
 
        case MACH_MSG_TYPE_PORT_RECEIVE: {
                ipc_port_t dest;
-               turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
-               struct turnstile *ts = TURNSTILE_NULL;
-
 #if IMPORTANCE_INHERITANCE
                natural_t assertcnt = port->ip_impcount;
 #endif /* IMPORTANCE_INHERITANCE */
@@ -2699,50 +2724,49 @@ ipc_right_copyout(
                port->ip_receiver_name = name;
                port->ip_receiver = space;
 
-               assert((bits & MACH_PORT_TYPE_RECEIVE) == 0);
+               struct knote *kn = current_thread()->ith_knote;
 
-               /* Update the port's turnstile linkage to WL turnstile */
-               ts = port_send_turnstile(port);
-               if (ts) {
-                       struct knote *kn = current_thread()->ith_knote;
-                       if (ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_RECEIVE)) {
-                               inheritor = filt_machport_stash_port(kn, port, NULL);
-                               if (inheritor) {
-                                       turnstile_reference(inheritor);
-                                       IMQ_SET_INHERITOR(&port->ip_messages, inheritor);
-                               }
+               if ((guard_flags != NULL) && ((*guard_flags & MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE) != 0)) {
+                       assert(port->ip_immovable_receive == 0);
+                       port->ip_guarded = 1;
+                       port->ip_strict_guard = 0;
+                       /* pseudo receive shouldn't set the receive right as immovable in the sender's space */
+                       if (kn != ITH_KNOTE_PSEUDO) {
+                               port->ip_immovable_receive = 1;
                        }
-                       turnstile_reference(ts);
-                       turnstile_update_inheritor(ts, inheritor,
-                           (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE));
+                       port->ip_context = current_thread()->ith_msg_addr;
+                       *context = port->ip_context;
+                       *guard_flags = *guard_flags & ~MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND;
                }
 
-               imq_unlock(&port->ip_messages);
-
+               assert((bits & MACH_PORT_TYPE_RECEIVE) == 0);
                if (bits & MACH_PORT_TYPE_SEND) {
                        assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND);
                        assert(IE_BITS_UREFS(bits) > 0);
                        assert(port->ip_srights > 0);
-
-                       ip_unlock(port);
-                       ip_release(port);
-
-                       /* entry is locked holding ref, so can use port */
-                       ipc_hash_delete(space, (ipc_object_t) port, name, entry);
                } else {
                        assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE);
                        assert(IE_BITS_UREFS(bits) == 0);
+               }
 
-                       /* transfer ref to entry */
-                       ip_unlock(port);
+               boolean_t sync_bootstrap_checkin = FALSE;
+               if (kn != ITH_KNOTE_PSEUDO && port->ip_sync_bootstrap_checkin) {
+                       sync_bootstrap_checkin = TRUE;
                }
-               entry->ie_bits = bits | MACH_PORT_TYPE_RECEIVE;
-               ipc_entry_modified(space, name, entry);
+               if (!ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_RECEIVE)) {
+                       kn = NULL;
+               }
+               ipc_port_adjust_port_locked(port, kn, sync_bootstrap_checkin);
+               /* port & message queue are unlocked */
 
-               if (ts) {
-                       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
-                       turnstile_deallocate_safe(ts);
+               if (bits & MACH_PORT_TYPE_SEND) {
+                       ip_release(port);
+
+                       /* entry is locked holding ref, so can use port */
+                       ipc_hash_delete(space, ip_to_object(port), name, entry);
                }
+               entry->ie_bits = bits | MACH_PORT_TYPE_RECEIVE;
+               ipc_entry_modified(space, name, entry);
 
                if (dest != IP_NULL) {
 #if IMPORTANCE_INHERITANCE
@@ -2769,137 +2793,3 @@ ipc_right_copyout(
        }
        return KERN_SUCCESS;
 }
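
The receive-right arm of this routine is the other half of the guard handshake started at copyin: a receive right sent with MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE arrives guarded and immovable, with the receiver's message-buffer address installed as the guard context. A hedged user-space sketch of receiving such a right through the guarded descriptor (descriptor layout and the MACH_RCV_GUARDED_DESC option as declared in this release's mach/message.h; variable names are illustrative):

	#include <mach/mach.h>

	typedef struct {
		mach_msg_header_t                  header;
		mach_msg_body_t                    body;
		mach_msg_guarded_port_descriptor_t port;
		mach_msg_trailer_t                 trailer;
	} guarded_rcv_msg_t;

	static void
	receive_guarded_right(mach_port_t rcv_port)
	{
		guarded_rcv_msg_t msg;

		if (mach_msg(&msg.header, MACH_RCV_MSG | MACH_RCV_GUARDED_DESC,
		    0, sizeof(msg), rcv_port, MACH_MSG_TIMEOUT_NONE,
		    MACH_PORT_NULL) != MACH_MSG_SUCCESS) {
			return;
		}

		/* msg.port.name is now a guarded, immovable receive right in
		 * this space; msg.port.context carries the guard the kernel
		 * installed during ipc_right_copyout above. */
	}
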
-
-/*
- *     Routine:        ipc_right_rename
- *     Purpose:
- *             Transfer an entry from one name to another.
- *             The old entry is deallocated.
- *     Conditions:
- *             The space is write-locked and active.
- *             The new entry is unused.  Upon return,
- *             the space is unlocked.
- *     Returns:
- *             KERN_SUCCESS            Moved entry to new name.
- */
-
-kern_return_t
-ipc_right_rename(
-       ipc_space_t             space,
-       mach_port_name_t        oname,
-       ipc_entry_t             oentry,
-       mach_port_name_t        nname,
-       ipc_entry_t             nentry)
-{
-       ipc_port_request_index_t request = oentry->ie_request;
-       ipc_entry_bits_t bits = oentry->ie_bits;
-       ipc_object_t object = oentry->ie_object;
-       ipc_port_t release_port = IP_NULL;
-
-       assert(is_active(space));
-       assert(oname != nname);
-
-       /*
-        *      If IE_BITS_COMPAT, we can't allow the entry to be renamed
-        *      if the port is dead.  (This would foil ipc_port_destroy.)
-        *      Instead we should fail because oentry shouldn't exist.
-        *      Note IE_BITS_COMPAT implies ie_request != 0.
-        */
-
-       if (request != IE_REQ_NONE) {
-               ipc_port_t port;
-
-               assert(bits & MACH_PORT_TYPE_PORT_RIGHTS);
-               port = (ipc_port_t) object;
-               assert(port != IP_NULL);
-
-               if (ipc_right_check(space, port, oname, oentry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
-                       request = IE_REQ_NONE;
-                       object = IO_NULL;
-                       bits = oentry->ie_bits;
-                       release_port = port;
-                       assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME);
-                       assert(oentry->ie_request == IE_REQ_NONE);
-               } else {
-                       /* port is locked and active */
-
-                       ipc_port_request_rename(port, request, oname, nname);
-                       ip_unlock(port);
-                       oentry->ie_request = IE_REQ_NONE;
-               }
-       }
-
-       /* initialize nentry before letting ipc_hash_insert see it */
-
-       assert((nentry->ie_bits & IE_BITS_RIGHT_MASK) == 0);
-       nentry->ie_bits |= bits & IE_BITS_RIGHT_MASK;
-       nentry->ie_request = request;
-       nentry->ie_object = object;
-
-       switch (IE_BITS_TYPE(bits)) {
-       case MACH_PORT_TYPE_SEND: {
-               ipc_port_t port;
-
-               port = (ipc_port_t) object;
-               assert(port != IP_NULL);
-
-               /* remember, there are no other share entries possible */
-               /* or we can't do the rename.  Therefore we do not need */
-               /* to check the other subspaces */
-               ipc_hash_delete(space, (ipc_object_t) port, oname, oentry);
-               ipc_hash_insert(space, (ipc_object_t) port, nname, nentry);
-               break;
-       }
-
-       case MACH_PORT_TYPE_RECEIVE:
-       case MACH_PORT_TYPE_SEND_RECEIVE: {
-               ipc_port_t port;
-
-               port = (ipc_port_t) object;
-               assert(port != IP_NULL);
-
-               ip_lock(port);
-               imq_lock(&port->ip_messages);
-               assert(ip_active(port));
-               assert(port->ip_receiver_name == oname);
-               assert(port->ip_receiver == space);
-
-               port->ip_receiver_name = nname;
-               imq_unlock(&port->ip_messages);
-               ip_unlock(port);
-               break;
-       }
-
-       case MACH_PORT_TYPE_PORT_SET: {
-               ipc_pset_t pset;
-
-               pset = (ipc_pset_t) object;
-               assert(pset != IPS_NULL);
-
-               ips_lock(pset);
-               assert(ips_active(pset));
-
-               ips_unlock(pset);
-               break;
-       }
-
-       case MACH_PORT_TYPE_SEND_ONCE:
-       case MACH_PORT_TYPE_DEAD_NAME:
-               break;
-
-       default:
-               panic("ipc_right_rename: strange rights");
-       }
-
-       assert(oentry->ie_request == IE_REQ_NONE);
-       oentry->ie_object = IO_NULL;
-       ipc_entry_dealloc(space, oname, oentry);
-       ipc_entry_modified(space, nname, nentry);
-       is_write_unlock(space);
-
-       if (release_port != IP_NULL) {
-               ip_release(release_port);
-       }
-
-       return KERN_SUCCESS;
-}
index d995aef3a11aecdf147ea53c3890572c7786c7c7..a3049efc70a38bc7297ed376152d10ba14d7af42 100644 (file)
@@ -78,7 +78,7 @@ typedef uint32_t ipc_right_copyin_flags_t;
 
 #define IPC_RIGHT_COPYIN_FLAGS_NONE                   0x0
 #define IPC_RIGHT_COPYIN_FLAGS_DEADOK                 0x1
-#define IPC_RIGHT_COPYIN_FLAGS_RESERVED               0x2
+#define IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND   0x2
 #define IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE   0x4   /* allow copyin of a send once right to a dead port with no dead name requests */
 
 /* Find an entry in a space, given the name */
@@ -180,44 +180,37 @@ extern kern_return_t ipc_right_info(
        mach_port_type_t        *typep,
        mach_port_urefs_t       *urefsp);
 
-/* Check if a subsequent ipc_right_copyin would succeed */
-extern boolean_t ipc_right_copyin_check(
-       ipc_space_t             space,
-       mach_port_name_t        name,
-       ipc_entry_t             entry,
-       mach_msg_type_name_t    msgt_name);
+/* Check if a subsequent ipc_right_copyin of the reply port will succeed */
+extern boolean_t ipc_right_copyin_check_reply(
+       ipc_space_t              space,
+       mach_port_name_t         reply_name,
+       ipc_entry_t              reply_entry,
+       mach_msg_type_name_t     reply_type);
 
 /* Copyin a capability from a space */
 extern kern_return_t ipc_right_copyin(
-       ipc_space_t              space,
-       mach_port_name_t         name,
-       ipc_entry_t              entry,
-       mach_msg_type_name_t     msgt_name,
-       ipc_right_copyin_flags_t flags,
-       ipc_object_t             *objectp,
-       ipc_port_t               *sorightp,
-       ipc_port_t               *releasep,
-       int                      *assertcntp);
-
-/* Undo the effects of an ipc_right_copyin */
-extern void ipc_right_copyin_undo(
-       ipc_space_t             space,
-       mach_port_name_t        name,
-       ipc_entry_t             entry,
-       mach_msg_type_name_t    msgt_name,
-       ipc_object_t            object,
-       ipc_port_t              soright);
+       ipc_space_t               space,
+       mach_port_name_t          name,
+       ipc_entry_t               entry,
+       mach_msg_type_name_t      msgt_name,
+       ipc_right_copyin_flags_t  flags,
+       ipc_object_t              *objectp,
+       ipc_port_t                *sorightp,
+       ipc_port_t                *releasep,
+       int                       *assertcntp,
+       mach_port_context_t       context,
+       mach_msg_guard_flags_t    *guard_flags);
 
 /* Copyin a pair of dispositions from a space */
 extern kern_return_t ipc_right_copyin_two(
-       ipc_space_t             space,
-       mach_port_name_t        name,
-       ipc_entry_t             entry,
-       mach_msg_type_name_t    msgt_one,
-       mach_msg_type_name_t    msgt_two,
-       ipc_object_t            *objectp,
-       ipc_port_t              *sorightp,
-       ipc_port_t              *releasep);
+       ipc_space_t               space,
+       mach_port_name_t          name,
+       ipc_entry_t               entry,
+       mach_msg_type_name_t      msgt_one,
+       mach_msg_type_name_t      msgt_two,
+       ipc_object_t              *objectp,
+       ipc_port_t                *sorightp,
+       ipc_port_t                *releasep);
 
 /* Copyout a capability to a space */
 extern kern_return_t ipc_right_copyout(
@@ -225,15 +218,8 @@ extern kern_return_t ipc_right_copyout(
        mach_port_name_t        name,
        ipc_entry_t             entry,
        mach_msg_type_name_t    msgt_name,
-       boolean_t               overflow,
+       mach_port_context_t     *context,
+       mach_msg_guard_flags_t  *guard_flags,
        ipc_object_t            object);
 
-/* Reanme a capability */
-extern kern_return_t ipc_right_rename(
-       ipc_space_t             space,
-       mach_port_name_t        oname,
-       ipc_entry_t             oentry,
-       mach_port_name_t        nname,
-       ipc_entry_t             nentry);
-
 #endif  /* _IPC_IPC_RIGHT_H_ */
index c0a2d1d15d4e10ce53e78249536b8c9f61c602a1..161c55403a9953d58381dee0ed66e56b9417eef7 100644 (file)
@@ -185,7 +185,6 @@ extern lck_attr_t       ipc_lck_attr;
                                                        &ipc_lck_grp)
 
 #define is_write_lock(is)       lck_spin_lock_grp(&(is)->is_lock_data, &ipc_lck_grp)
-#define is_write_lock_try(is)   lck_spin_try_lock_grp(&(is)->is_lock_data, &ipc_lck_grp)
 #define is_write_unlock(is)     lck_spin_unlock(&(is)->is_lock_data)
 #define is_write_sleep(is)      lck_spin_sleep_grp(&(is)->is_lock_data,     \
                                                        LCK_SLEEP_DEFAULT,                                      \
@@ -245,6 +244,14 @@ extern void ipc_space_rand_freelist(
 
 /* Generate a new gencount rollover point from a space's entropy pool */
 extern ipc_entry_bits_t ipc_space_get_rollpoint(ipc_space_t space);
+
+/* Allocate a new port/set/dead-name in a space */
+extern kern_return_t mach_port_allocate_internal(
+       ipc_space_t       space,
+       mach_port_right_t right,
+       mach_port_qos_t   *qosp,
+       mach_port_name_t  *namep);
+
 #endif /* MACH_KERNEL_PRIVATE */
 #endif /* __APPLE_API_PRIVATE */
 
index ff6da560511d4cc38cd6355b6303a715f2593cf2..eeb226a87cd6cb13df49df38dd94f9b94a0e7bf9 100644 (file)
@@ -208,12 +208,14 @@ ipc_voucher_init(void)
            sizeof(struct ipc_voucher),
            "ipc vouchers");
        zone_change(ipc_voucher_zone, Z_NOENCRYPT, TRUE);
+       zone_change(ipc_voucher_zone, Z_CLEARMEMORY, TRUE);
 
        ipc_voucher_attr_control_zone = zinit(sizeof(struct ipc_voucher_attr_control),
            attr_manager_max * sizeof(struct ipc_voucher_attr_control),
            sizeof(struct ipc_voucher_attr_control),
            "ipc voucher attr controls");
        zone_change(ipc_voucher_attr_control_zone, Z_NOENCRYPT, TRUE);
+       zone_change(ipc_voucher_attr_control_zone, Z_CLEARMEMORY, TRUE);
 
        /* initialize voucher hash */
        ivht_lock_init();
@@ -318,7 +320,7 @@ iv_dealloc(ipc_voucher_t iv, boolean_t unhash)
         * is gone.  We can just discard it now.
         */
        if (IP_VALID(port)) {
-               assert(ip_active(port));
+               require_ip_active(port);
                assert(port->ip_srights == 0);
 
                ipc_port_dealloc_kernel(port);
@@ -404,6 +406,7 @@ convert_port_to_voucher(
        ipc_port_t      port)
 {
        if (IP_VALID(port)) {
+               zone_require(port, ipc_object_zones[IOT_PORT]);
                ipc_voucher_t voucher = (ipc_voucher_t) port->ip_kobject;
 
                /*
@@ -415,8 +418,9 @@ convert_port_to_voucher(
                        return IV_NULL;
                }
 
-               assert(ip_active(port));
+               require_ip_active(port);
 
+               zone_require(voucher, ipc_voucher_zone);
                ipc_voucher_reference(voucher);
                return voucher;
        }
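
The zone_require() calls threaded through these converters are type-confusion hardening: each one panics unless the pointer was genuinely allocated from the named zone, so a forged kobject cannot impersonate a voucher. The pattern in miniature, mirroring the converter above:

	/* validate both layers before trusting either pointer */
	zone_require(port, ipc_object_zones[IOT_PORT]);    /* really a port    */
	ipc_voucher_t v = (ipc_voucher_t)port->ip_kobject;
	zone_require(v, ipc_voucher_zone);                 /* really a voucher */
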
@@ -477,26 +481,19 @@ ipc_voucher_release(ipc_voucher_t voucher)
  * Purpose:
  *     Called whenever the Mach port system detects no-senders
  *     on the voucher port.
- *
- *     Each time the send-right count goes positive, a no-senders
- *     notification is armed (and a voucher reference is donated).
- *     So, each notification that comes in must release a voucher
- *     reference.  If more send rights have been added since it
- *     fired (asynchronously), they will be protected by a different
- *     reference hold.
  */
 void
 ipc_voucher_notify(mach_msg_header_t *msg)
 {
        mach_no_senders_notification_t *notification = (void *)msg;
        ipc_port_t port = notification->not_header.msgh_remote_port;
-       ipc_voucher_t iv;
 
-       assert(ip_active(port));
+       require_ip_active(port);
        assert(IKOT_VOUCHER == ip_kotype(port));
-       iv = (ipc_voucher_t)port->ip_kobject;
 
-       ipc_voucher_release(iv);
+       /* consume the reference donated by convert_voucher_to_port */
+       zone_require((ipc_voucher_t)port->ip_kobject, ipc_voucher_zone);
+       ipc_voucher_release((ipc_voucher_t)port->ip_kobject);
 }
 
 /*
@@ -505,48 +502,22 @@ ipc_voucher_notify(mach_msg_header_t *msg)
 ipc_port_t
 convert_voucher_to_port(ipc_voucher_t voucher)
 {
-       ipc_port_t      port, send;
-
        if (IV_NULL == voucher) {
                return IP_NULL;
        }
 
+       zone_require(voucher, ipc_voucher_zone);
        assert(os_ref_get_count(&voucher->iv_refs) > 0);
 
-       /* create a port if needed */
-       port = voucher->iv_port;
-       if (!IP_VALID(port)) {
-               port = ipc_port_alloc_kernel();
-               assert(IP_VALID(port));
-               ipc_kobject_set_atomically(port, (ipc_kobject_t) voucher, IKOT_VOUCHER);
-
-               /* If we lose the race, deallocate and pick up the other guy's port */
-               if (!OSCompareAndSwapPtr(IP_NULL, port, &voucher->iv_port)) {
-                       ipc_port_dealloc_kernel(port);
-                       port = voucher->iv_port;
-                       assert(ip_kotype(port) == IKOT_VOUCHER);
-                       assert(port->ip_kobject == (ipc_kobject_t)voucher);
-               }
-       }
-
-       ip_lock(port);
-       assert(ip_active(port));
-       send = ipc_port_make_send_locked(port);
-
-       if (1 == port->ip_srights) {
-               ipc_port_t old_notify;
-
-               /* transfer our ref to the port, and arm the no-senders notification */
-               assert(IP_NULL == port->ip_nsrequest);
-               ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify);
-               /* port unlocked */
-               assert(IP_NULL == old_notify);
-       } else {
-               /* piggyback on the existing port reference, so consume ours */
-               ip_unlock(port);
+       /*
+        * make a send right and donate our reference for ipc_voucher_notify
+        * if this is the first send right
+        */
+       if (!ipc_kobject_make_send_lazy_alloc_port(&voucher->iv_port,
+           (ipc_kobject_t)voucher, IKOT_VOUCHER)) {
                ipc_voucher_release(voucher);
        }
-       return send;
+       return voucher->iv_port;
 }
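
Both this converter and the attr-control one below now funnel through a single helper; its contract, as inferred from these call sites (a summary, not the full definition):

	/*
	 * ipc_kobject_make_send_lazy_alloc_port(&store, kobject, IKOT_...)
	 *   - lazily allocates the kobject port into *store on first use;
	 *   - returns with a send right made on that port;
	 *   - returns TRUE when that was the first send right, meaning a
	 *     no-senders notification was armed and the caller's object
	 *     reference was donated to back it;
	 *   - returns FALSE when it piggybacked on existing send rights,
	 *     so the caller must drop its own reference itself (hence the
	 *     release above).
	 */
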
 
 #define ivace_reset_data(ivace_elem, next_index) {       \
@@ -650,7 +621,7 @@ ivac_dealloc(ipc_voucher_attr_control_t ivac)
         * is gone.  We can just discard it now.
         */
        if (IP_VALID(port)) {
-               assert(ip_active(port));
+               require_ip_active(port);
                assert(port->ip_srights == 0);
 
                ipc_port_dealloc_kernel(port);
@@ -699,6 +670,7 @@ convert_port_to_voucher_attr_control(
        ipc_port_t      port)
 {
        if (IP_VALID(port)) {
+               zone_require(port, ipc_object_zones[IOT_PORT]);
                ipc_voucher_attr_control_t ivac = (ipc_voucher_attr_control_t) port->ip_kobject;
 
                /*
@@ -710,35 +682,32 @@ convert_port_to_voucher_attr_control(
                if (ip_kotype(port) != IKOT_VOUCHER_ATTR_CONTROL) {
                        return IVAC_NULL;
                }
+               require_ip_active(port);
 
-               assert(ip_active(port));
-
+               zone_require(ivac, ipc_voucher_attr_control_zone);
                ivac_reference(ivac);
                return ivac;
        }
        return IVAC_NULL;
 }
 
+/*
+ * Routine:    ipc_voucher_attr_control_notify
+ * Purpose:
+ *     Called whenever the Mach port system detects no-senders
+ *     on the voucher attr control port.
+ */
 void
 ipc_voucher_attr_control_notify(mach_msg_header_t *msg)
 {
        mach_no_senders_notification_t *notification = (void *)msg;
        ipc_port_t port = notification->not_header.msgh_remote_port;
-       ipc_voucher_attr_control_t ivac;
 
+       require_ip_active(port);
        assert(IKOT_VOUCHER_ATTR_CONTROL == ip_kotype(port));
-       ip_lock(port);
-       assert(ip_active(port));
 
-       /* if no new send rights, drop a control reference */
-       if (port->ip_mscount == notification->not_count) {
-               ivac = (ipc_voucher_attr_control_t)port->ip_kobject;
-               ip_unlock(port);
-
-               ivac_release(ivac);
-       } else {
-               ip_unlock(port);
-       }
+       /* release the reference donated by convert_voucher_attr_control_to_port */
+       ivac_release((ipc_voucher_attr_control_t)port->ip_kobject);
 }
 
 /*
@@ -747,48 +716,21 @@ ipc_voucher_attr_control_notify(mach_msg_header_t *msg)
 ipc_port_t
 convert_voucher_attr_control_to_port(ipc_voucher_attr_control_t control)
 {
-       ipc_port_t      port, send;
-
        if (IVAC_NULL == control) {
                return IP_NULL;
        }
 
-       /* create a port if needed */
-       port = control->ivac_port;
-       if (!IP_VALID(port)) {
-               port = ipc_port_alloc_kernel();
-               assert(IP_VALID(port));
-               if (OSCompareAndSwapPtr(IP_NULL, port, &control->ivac_port)) {
-                       ip_lock(port);
-                       ipc_kobject_set_atomically(port, (ipc_kobject_t) control, IKOT_VOUCHER_ATTR_CONTROL);
-               } else {
-                       ipc_port_dealloc_kernel(port);
-                       port = control->ivac_port;
-                       ip_lock(port);
-                       assert(ip_kotype(port) == IKOT_VOUCHER_ATTR_CONTROL);
-                       assert(port->ip_kobject == (ipc_kobject_t)control);
-               }
-       } else {
-               ip_lock(port);
-       }
+       zone_require(control, ipc_voucher_attr_control_zone);
 
-       assert(ip_active(port));
-       send = ipc_port_make_send_locked(port);
-
-       if (1 == port->ip_srights) {
-               ipc_port_t old_notify;
-
-               /* transfer our ref to the port, and arm the no-senders notification */
-               assert(IP_NULL == port->ip_nsrequest);
-               ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify);
-               assert(IP_NULL == old_notify);
-               /* ipc_port_nsrequest unlocks the port */
-       } else {
-               /* piggyback on the existing port reference, so consume ours */
-               ip_unlock(port);
+       /*
+        * make a send right and donate our reference for
+        * ipc_voucher_attr_control_notify if this is the first send right
+        */
+       if (!ipc_kobject_make_send_lazy_alloc_port(&control->ivac_port,
+           (ipc_kobject_t)control, IKOT_VOUCHER_ATTR_CONTROL)) {
                ivac_release(control);
        }
-       return send;
+       return control->ivac_port;
 }
 
 /*
@@ -1213,7 +1155,7 @@ ivgt_lookup(iv_index_t key_index,
 }
 
 /*
- *     Routine:        ipc_replace_voucher_value
+ *     Routine:        ipc_replace_voucher_value
  *     Purpose:
  *             Replace the <voucher, key> value with the results of
  *             running the supplied command through the resource
@@ -1307,7 +1249,7 @@ ipc_replace_voucher_value(
 }
 
 /*
- *     Routine:        ipc_directly_replace_voucher_value
+ *     Routine:        ipc_directly_replace_voucher_value
  *     Purpose:
  *             Replace the <voucher, key> value with the value-handle
  *             supplied directly by the attribute manager.
@@ -1513,8 +1455,7 @@ ipc_execute_voucher_recipe_command(
 
                        new_value = *(mach_voucher_attr_value_handle_t *)(void *)content;
                        kr = ipc_directly_replace_voucher_value(voucher,
-                           key,
-                           new_value);
+                           key, new_value);
                        if (KERN_SUCCESS != kr) {
                                return kr;
                        }
@@ -1592,7 +1533,7 @@ ipc_execute_voucher_recipe_command(
 }
 
 /*
- *     Routine:        iv_checksum
+ *     Routine:        iv_checksum
  *     Purpose:
  *             Compute the voucher sum.  This is more position-
  *             relevant than many other checksums - important for
@@ -1622,7 +1563,7 @@ iv_checksum(ipc_voucher_t voucher, boolean_t *emptyp)
 }
 
 /*
- *     Routine:        iv_dedup
+ *     Routine:        iv_dedup
  *     Purpose:
  *             See if the set of values represented by this new voucher
  *             already exist in another voucher.  If so return a reference
@@ -1787,7 +1728,7 @@ iv_dedup(ipc_voucher_t new_iv)
 }
 
 /*
- *     Routine:        ipc_create_mach_voucher
+ *     Routine:        ipc_create_mach_voucher
  *     Purpose:
  *             Create a new mach voucher and initialize it with the
  *             value(s) created by having the appropriate resource
@@ -1858,7 +1799,7 @@ ipc_create_mach_voucher(
 }
 
 /*
- *     Routine:        ipc_voucher_attr_control_create_mach_voucher
+ *     Routine:        ipc_voucher_attr_control_create_mach_voucher
  *     Purpose:
  *             Create a new mach voucher and initialize it with the
  *             value(s) created by having the appropriate resource
@@ -1945,7 +1886,7 @@ ipc_voucher_attr_control_create_mach_voucher(
 }
 
 /*
- *      ipc_register_well_known_mach_voucher_attr_manager
+ *     ipc_register_well_known_mach_voucher_attr_manager
  *
  *     Register the resource manager responsible for a given key value.
  */
@@ -2007,7 +1948,7 @@ ipc_register_well_known_mach_voucher_attr_manager(
 }
 
 /*
- *      Routine:       mach_voucher_extract_attr_content
+ *     Routine:        mach_voucher_extract_attr_content
  *     Purpose:
  *             Extract the content for a given <voucher, key> pair.
  *
@@ -2070,14 +2011,12 @@ mach_voucher_extract_attr_content(
        /* callout to manager */
 
        kr = (manager->ivam_extract_content)(manager, key,
-           vals, vals_count,
-           &command,
-           content, in_out_size);
+           vals, vals_count, &command, content, in_out_size);
        return kr;
 }
 
 /*
- *      Routine:       mach_voucher_extract_attr_recipe
+ *     Routine:        mach_voucher_extract_attr_recipe
  *     Purpose:
  *             Extract a recipe for a given <voucher, key> pair.
  *
@@ -2163,7 +2102,7 @@ mach_voucher_extract_attr_recipe(
 
 
 /*
- *     Routine:        mach_voucher_extract_all_attr_recipes
+ *     Routine:        mach_voucher_extract_all_attr_recipes
  *     Purpose:
  *             Extract all the (non-default) contents for a given voucher,
  *             building up a recipe that could be provided to a future
@@ -2253,7 +2192,7 @@ mach_voucher_extract_all_attr_recipes(
 }
 
 /*
- *     Routine:        mach_voucher_debug_info
+ *     Routine:        mach_voucher_debug_info
  *     Purpose:
  *             Extract all the (non-default) contents for a given mach port name,
  *             building up a recipe that could be provided to a future
@@ -2284,6 +2223,10 @@ mach_voucher_debug_info(
        kern_return_t kr;
        ipc_port_t port = MACH_PORT_NULL;
 
+       if (space == IS_NULL) {
+               return KERN_INVALID_TASK;
+       }
+
        if (!MACH_PORT_VALID(voucher_name)) {
                return KERN_INVALID_ARGUMENT;
        }
@@ -2307,7 +2250,7 @@ mach_voucher_debug_info(
 #endif
 
 /*
- *      Routine:       mach_voucher_attr_command
+ *     Routine:        mach_voucher_attr_command
  *     Purpose:
  *             Invoke an attribute-specific command through this voucher.
  *
@@ -2380,7 +2323,7 @@ mach_voucher_attr_command(
 }
 
 /*
- *      Routine:       mach_voucher_attr_control_get_values
+ *     Routine:        mach_voucher_attr_control_get_values
  *     Purpose:
  *             For a given voucher, get the value handle associated with the
  *             specified attribute manager.
@@ -2416,7 +2359,7 @@ mach_voucher_attr_control_get_values(
 }
 
 /*
- *      Routine:       mach_voucher_attr_control_create_mach_voucher
+ *     Routine:        mach_voucher_attr_control_create_mach_voucher
  *     Purpose:
  *             Create a new mach voucher and initialize it by processing the
  *             supplied recipe(s).
@@ -2510,7 +2453,7 @@ mach_voucher_attr_control_create_mach_voucher(
 }
 
 /*
- *      Routine:       host_create_mach_voucher
+ *     Routine:        host_create_mach_voucher
  *     Purpose:
  *             Create a new mach voucher and initialize it by processing the
  *             supplied recipe(s).
@@ -2598,10 +2541,10 @@ host_create_mach_voucher(
 }
 
 /*
- *      Routine:       host_register_well_known_mach_voucher_attr_manager
+ *     Routine:        host_register_well_known_mach_voucher_attr_manager
  *     Purpose:
  *             Register the user-level resource manager responsible for a given
- *              key value.
+ *             key value.
  *     Conditions:
  *             The manager port passed in has to be converted/wrapped
  *             in an ipc_voucher_attr_manager_t structure and then call the
@@ -2650,7 +2593,7 @@ host_register_well_known_mach_voucher_attr_manager(
 }
 
 /*
- *      Routine:       host_register_mach_voucher_attr_manager
+ *     Routine:        host_register_mach_voucher_attr_manager
  *     Purpose:
  *             Register the user-space resource manager and return a
  *             dynamically allocated key.
@@ -3025,7 +2968,7 @@ static void
 user_data_release(
        ipc_voucher_attr_manager_t              manager);
 
-struct ipc_voucher_attr_manager user_data_manager = {
+const struct ipc_voucher_attr_manager user_data_manager = {
        .ivam_release_value =   user_data_release_value,
        .ivam_get_value =       user_data_get_value,
        .ivam_extract_content = user_data_extract_content,
@@ -3048,7 +2991,7 @@ ipc_voucher_attr_control_t test_control;
 #endif
 
 /*
- *     Routine:        user_data_release_value
+ *     Routine:        user_data_release_value
  *     Purpose:
  *             Release a made reference on a specific value managed by
  *             this voucher attribute manager.
@@ -3086,7 +3029,7 @@ user_data_release_value(
 }
 
 /*
- *     Routine:        user_data_checksum
+ *     Routine:        user_data_checksum
  *     Purpose:
  *             Provide a rudimentary checksum for the data presented
  *             to these voucher attribute managers.
@@ -3107,7 +3050,7 @@ user_data_checksum(
 }
 
 /*
- *     Routine:        user_data_dedup
+ *     Routine:        user_data_dedup
  *     Purpose:
  *             See if the content represented by this request already exists
  *             in another user data element.  If so return a made reference
index 19df67bf7fc492b93999d907febcb88e5bf8f4a2..cf1d90c0b281d69285d8f1d4045528ffd7561062 100644 (file)
@@ -256,7 +256,7 @@ mach_port_space_info(
                iin->iin_type = IE_BITS_TYPE(bits);
                if ((entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) != MACH_PORT_TYPE_NONE &&
                    entry->ie_request != IE_REQ_NONE) {
-                       __IGNORE_WCASTALIGN(ipc_port_t port = (ipc_port_t) entry->ie_object);
+                       ipc_port_t port = ip_object_to_port(entry->ie_object);
 
                        assert(IP_VALID(port));
                        ip_lock(port);
@@ -488,7 +488,7 @@ mach_port_kobject(
                return KERN_INVALID_RIGHT;
        }
 
-       __IGNORE_WCASTALIGN(port = (ipc_port_t) entry->ie_object);
+       port = ip_object_to_port(entry->ie_object);
        assert(port != IP_NULL);
 
        ip_lock(port);
@@ -501,18 +501,18 @@ mach_port_kobject(
 
        *typep = (unsigned int) ip_kotype(port);
        kaddr = (mach_vm_address_t)port->ip_kobject;
-       ip_unlock(port);
-
+       *addrp = 0;
 #if (DEVELOPMENT || DEBUG)
-       if (0 != kaddr && is_ipc_kobject(*typep)) {
+       if (kaddr && ip_is_kobject(port)) {
                *addrp = VM_KERNEL_UNSLIDE_OR_PERM(kaddr);
-       } else
+       }
 #endif
-       *addrp = 0;
+       ip_unlock(port);
 
        return KERN_SUCCESS;
 }
 #endif /* MACH_IPC_DEBUG */
+
 /*
  *     Routine:        mach_port_kernel_object [Legacy kernel call]
  *     Purpose:
index 52bf7b11c6c723050a3218329bd86c11ebb5f9d5..603f841d199b732c27f079b4b3fe26770be49246 100644 (file)
@@ -277,18 +277,30 @@ _kernelrpc_mach_port_insert_right_trap(struct _kernelrpc_mach_port_insert_right_
                goto done;
        }
 
+       if (args->name == args->poly) {
+               switch (args->polyPoly) {
+               case MACH_MSG_TYPE_MAKE_SEND:
+               case MACH_MSG_TYPE_COPY_SEND:
+                       /* fast path for MAKE_SEND / COPY_SEND, the most common cases */
+                       rv = ipc_object_insert_send_right(task->itk_space, args->poly,
+                           args->polyPoly);
+                       goto done;
+
+               default:
+                       break;
+               }
+       }
+
        rv = ipc_object_copyin(task->itk_space, args->poly, args->polyPoly,
-           (ipc_object_t *)&port);
+           (ipc_object_t *)&port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
        if (rv != KERN_SUCCESS) {
                goto done;
        }
        disp = ipc_object_copyin_type(args->polyPoly);
 
        rv = mach_port_insert_right(task->itk_space, args->name, port, disp);
-       if (rv != KERN_SUCCESS) {
-               if (IO_VALID((ipc_object_t)port)) {
-                       ipc_object_destroy((ipc_object_t)port, disp);
-               }
+       if (rv != KERN_SUCCESS && IP_VALID(port)) {
+               ipc_object_destroy(ip_to_object(port), disp);
        }
 
 done:
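
The fast path above only fires when the new name and the donor right coincide with a make/copy-send disposition; everything else falls through to the generic copyin. The user-space pattern this accelerates (standard API, hypothetical names):

	#include <mach/mach.h>

	mach_port_t port = MACH_PORT_NULL;
	mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port);
	/* name == poly with a MAKE_SEND disposition: the new fast path */
	mach_port_insert_right(mach_task_self(), port, port,
	    MACH_MSG_TYPE_MAKE_SEND);
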
@@ -472,6 +484,88 @@ done:
        return rv;
 }
 
+int
+_kernelrpc_mach_port_type_trap(struct _kernelrpc_mach_port_type_args *args)
+{
+       task_t task = port_name_to_task(args->target);
+       int rv = MACH_SEND_INVALID_DEST;
+       mach_port_type_t type;
+
+       if (task != current_task()) {
+               goto done;
+       }
+
+       rv = mach_port_type(task->itk_space, args->name, &type);
+       if (rv == KERN_SUCCESS) {
+               rv = copyout(&type, args->ptype, sizeof(type));
+       }
+
+done:
+       if (task) {
+               task_deallocate(task);
+       }
+       return rv;
+}
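
This new trap backs the existing mach_port_type() routine with a trap (rather than MIG) path; user-visible behavior is unchanged. Typical use (standard API, wrapped in an illustrative helper):

	#include <mach/mach.h>

	/* TRUE when the named right has gone dead underneath us */
	static boolean_t
	right_is_dead(mach_port_name_t name)
	{
		mach_port_type_t type = 0;

		return mach_port_type(mach_task_self(), name, &type) == KERN_SUCCESS &&
		    (type & MACH_PORT_TYPE_DEAD_NAME);
	}
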
+
+int
+_kernelrpc_mach_port_request_notification_trap(
+       struct _kernelrpc_mach_port_request_notification_args *args)
+{
+       task_t task = port_name_to_task(args->target);
+       int rv = MACH_SEND_INVALID_DEST;
+       ipc_port_t notify, previous;
+       mach_msg_type_name_t disp;
+       mach_port_name_t previous_name = MACH_PORT_NULL;
+
+       if (task != current_task()) {
+               goto done;
+       }
+
+       disp = ipc_object_copyin_type(args->notifyPoly);
+       if (disp != MACH_MSG_TYPE_PORT_SEND_ONCE) {
+               goto done;
+       }
+
+       if (MACH_PORT_VALID(args->notify)) {
+               rv = ipc_object_copyin(task->itk_space, args->notify, args->notifyPoly,
+                   (ipc_object_t *)&notify, 0, NULL, 0);
+       } else {
+               notify = CAST_MACH_NAME_TO_PORT(args->notify);
+       }
+       if (rv != KERN_SUCCESS) {
+               goto done;
+       }
+
+       rv = mach_port_request_notification(task->itk_space, args->name,
+           args->msgid, args->sync, notify, &previous);
+       if (rv != KERN_SUCCESS) {
+               ipc_object_destroy(ip_to_object(notify), disp);
+               goto done;
+       }
+
+       if (IP_VALID(previous)) {
+               // Remove once <rdar://problem/45522961> is fixed.
+               // We need to set ith_knote to NULL because ipc_object_copyout()
+               // passes arguments via the thread, so its value must not be garbage.
+               current_thread()->ith_knote = ITH_KNOTE_NULL;
+               rv = ipc_object_copyout(task->itk_space, ip_to_object(previous),
+                   MACH_MSG_TYPE_PORT_SEND_ONCE, NULL, NULL, &previous_name);
+               if (rv != KERN_SUCCESS) {
+                       ipc_object_destroy(ip_to_object(previous),
+                           MACH_MSG_TYPE_PORT_SEND_ONCE);
+                       goto done;
+               }
+       }
+
+       rv = copyout(&previous_name, args->previous, sizeof(previous_name));
+
+done:
+       if (task) {
+               task_deallocate(task);
+       }
+       return rv;
+}
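
Likewise a trap version of mach_port_request_notification(); note it only accepts a send-once disposition for the notify right, matching the disp check above. A typical caller arming a dead-name notification (standard API; names are hypothetical):

	#include <mach/mach.h>
	#include <mach/notify.h>

	static kern_return_t
	watch_for_death(mach_port_name_t watched, mach_port_t notify_port)
	{
		mach_port_t previous = MACH_PORT_NULL;
		kern_return_t kr = mach_port_request_notification(mach_task_self(),
		    watched, MACH_NOTIFY_DEAD_NAME, 0 /* sync */, notify_port,
		    MACH_MSG_TYPE_MAKE_SEND_ONCE, &previous);

		if (kr == KERN_SUCCESS && MACH_PORT_VALID(previous)) {
			/* a prior notification right was displaced; drop it */
			mach_port_deallocate(mach_task_self(), previous);
		}
		return kr;
	}
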
+
 kern_return_t
 host_create_mach_voucher_trap(struct host_create_mach_voucher_args *args)
 {
index c66b20303dbb43007e1bba757cbbee1476f9c20f..e4a901230a1b3f5bef49d3332e783842e920f9a7 100644 (file)
@@ -155,8 +155,8 @@ mach_msg_rcv_link_special_reply_port(
 void
 mach_msg_receive_results_complete(ipc_object_t object);
 
-security_token_t KERNEL_SECURITY_TOKEN = KERNEL_SECURITY_TOKEN_VALUE;
-audit_token_t KERNEL_AUDIT_TOKEN = KERNEL_AUDIT_TOKEN_VALUE;
+const security_token_t KERNEL_SECURITY_TOKEN = KERNEL_SECURITY_TOKEN_VALUE;
+const audit_token_t KERNEL_AUDIT_TOKEN = KERNEL_AUDIT_TOKEN_VALUE;
 
 mach_msg_format_0_trailer_t trailer_template = {
        /* mach_msg_trailer_type_t */ MACH_MSG_TRAILER_FORMAT_0,
@@ -568,14 +568,13 @@ mach_msg_overwrite_trap(
 
                mr = ipc_mqueue_copyin(space, rcv_name, &mqueue, &object);
                if (mr != MACH_MSG_SUCCESS) {
-                       mach_port_guard_exception(rcv_name, 0, 0, kGUARD_EXC_RCV_INVALID_NAME);
                        return mr;
                }
                /* hold ref for object */
 
                if ((option & MACH_RCV_SYNC_WAIT) && !(option & MACH_SEND_SYNC_OVERRIDE)) {
                        ipc_port_t special_reply_port;
-                       __IGNORE_WCASTALIGN(special_reply_port = (ipc_port_t) object);
+                       special_reply_port = ip_object_to_port(object);
                        /* link the special reply port to the destination */
                        mr = mach_msg_rcv_link_special_reply_port(special_reply_port,
                            (mach_port_name_t)override);
@@ -635,20 +634,19 @@ mach_msg_rcv_link_special_reply_port(
                return MACH_RCV_INVALID_NOTIFY;
        }
 
-       kr = ipc_object_copyin(current_space(),
-           dest_name_port, MACH_MSG_TYPE_COPY_SEND,
-           (ipc_object_t *) &dest_port);
+       kr = ipc_port_translate_send(current_space(), dest_name_port, &dest_port);
+       if (kr == KERN_SUCCESS) {
+               ip_reference(dest_port);
+               ip_unlock(dest_port);
 
-       /*
-        * The receive right of dest port might have gone away,
-        * do not fail the receive in that case.
-        */
-       if (kr == KERN_SUCCESS && IP_VALID(dest_port)) {
+               /*
+                * The receive right of dest port might have gone away,
+                * do not fail the receive in that case.
+                */
                ipc_port_link_special_reply_port(special_reply_port,
-                   dest_port);
+                   dest_port, FALSE);
 
-               /* release the send right */
-               ipc_port_release_send(dest_port);
+               ip_release(dest_port);
        }
        return MACH_MSG_SUCCESS;
 }
@@ -672,7 +670,7 @@ mach_msg_receive_results_complete(ipc_object_t object)
        boolean_t get_turnstile = self->turnstile ? FALSE : TRUE;
 
        if (io_otype(object) == IOT_PORT) {
-               __IGNORE_WCASTALIGN(port = (ipc_port_t) object);
+               port = ip_object_to_port(object);
        } else {
                assert(self->turnstile != TURNSTILE_NULL);
                return;
index 41089c94198dfc4e78b1f8d18a6c8c22e06cdbe5..9f4d8b677246bfa3c8584d257936f7a35b800e08 100644 (file)
 #endif
 
 
-/*
- * Forward declarations
- */
-void mach_port_names_helper(
-       ipc_port_timestamp_t    timestamp,
-       ipc_entry_t             entry,
-       mach_port_name_t        name,
-       mach_port_name_t        *names,
-       mach_port_type_t        *types,
-       ipc_entry_num_t         *actualp);
-
-void mach_port_gst_helper(
-       ipc_pset_t              pset,
-       ipc_entry_num_t         maxnames,
-       mach_port_name_t        *names,
-       ipc_entry_num_t         *actualp);
-
-/* Needs port locked */
-void mach_port_get_status_helper(
-       ipc_port_t              port,
-       mach_port_status_t      *status);
-
 /* Zeroed template of qos flags */
 
 static mach_port_qos_t  qos_template;
@@ -138,8 +116,7 @@ static mach_port_qos_t  qos_template;
  *     Conditions:
  *             Space containing entry is [at least] read-locked.
  */
-
-void
+static void
 mach_port_names_helper(
        ipc_port_timestamp_t    timestamp,
        ipc_entry_t             entry,
@@ -156,14 +133,14 @@ mach_port_names_helper(
 
        bits = entry->ie_bits;
        request = entry->ie_request;
-       __IGNORE_WCASTALIGN(port = (ipc_port_t) entry->ie_object);
+       port = ip_object_to_port(entry->ie_object);
 
        if (bits & MACH_PORT_TYPE_RECEIVE) {
                assert(IP_VALID(port));
 
                if (request != IE_REQ_NONE) {
                        ip_lock(port);
-                       assert(ip_active(port));
+                       require_ip_active(port);
                        type |= ipc_port_request_type(port, name, request);
                        ip_unlock(port);
                }
@@ -462,7 +439,6 @@ mach_port_type(
 
        kr = ipc_right_lookup_write(space, name, &entry);
        if (kr != KERN_SUCCESS) {
-               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME);
                return kr;
        }
 
@@ -634,11 +610,15 @@ mach_port_allocate_qos(
 /*
  *     Routine:        mach_port_allocate_full [kernel call]
  *     Purpose:
- *             Allocates a right in a space.  Supports all of the
- *             special cases, such as specifying a subsystem,
- *             a specific name, a real-time port, etc.
- *             The name may be any legal name in the space that doesn't
+ *             Allocates a right in a space.  Supports the
+ *             special case of specifying a name.  The name may
+ *             be any legal name in the space that doesn't
 *             currently denote a right.
+ *
+ *             While we no longer support users requesting
+ *             preallocated messages for the port, we still
+ *             check for errors in such requests and then
+ *             just clear the request.
  *     Conditions:
  *             Nothing locked.
  *     Returns:
@@ -658,7 +638,6 @@ mach_port_allocate_full(
        mach_port_qos_t         *qosp,
        mach_port_name_t        *namep)
 {
-       ipc_kmsg_t              kmsg = IKM_NULL;
        kern_return_t           kr;
 
        if (space == IS_NULL) {
@@ -675,38 +654,85 @@ mach_port_allocate_full(
                }
        }
 
+       /*
+        * Don't actually honor prealloc requests from user-space
+        * (for security reasons, and because it isn't guaranteed anyway).
+        * Keep old errors for legacy reasons.
+        */
        if (qosp->prealloc) {
                if (qosp->len > MACH_MSG_SIZE_MAX - MAX_TRAILER_SIZE) {
                        return KERN_RESOURCE_SHORTAGE;
-               } else {
-                       mach_msg_size_t size = qosp->len + MAX_TRAILER_SIZE;
-
-                       if (right != MACH_PORT_RIGHT_RECEIVE) {
-                               return KERN_INVALID_VALUE;
-                       }
-
-                       kmsg = (ipc_kmsg_t)ipc_kmsg_prealloc(size);
-                       if (kmsg == IKM_NULL) {
-                               return KERN_RESOURCE_SHORTAGE;
-                       }
                }
+               if (right != MACH_PORT_RIGHT_RECEIVE) {
+                       return KERN_INVALID_VALUE;
+               }
+               qosp->prealloc = 0;
        }
 
+       kr = mach_port_allocate_internal(space, right, qosp, namep);
+       return kr;
+}
+
+
+/*
+ *     Routine:        mach_port_allocate_internal [kernel private]
+ *     Purpose:
+ *             Allocates a right in a space.  Supports all of the
+ *             special cases, a specific name, a real-time port, etc.
+ *             The name may be any legal name in the space that doesn't
+ *             currently denote a right.
+ *     Conditions:
+ *             Nothing locked.
+ *     Returns:
+ *             KERN_SUCCESS            The right is allocated.
+ *             KERN_INVALID_TASK       The space is null.
+ *             KERN_INVALID_TASK       The space is dead.
+ *             KERN_INVALID_VALUE      "right" isn't a legal kind of right.
+ *             KERN_RESOURCE_SHORTAGE  Couldn't allocate memory.
+ *             KERN_NO_SPACE           No room in space for another right.
+ */
+kern_return_t
+mach_port_allocate_internal(
+       ipc_space_t             space,
+       mach_port_right_t       right,
+       mach_port_qos_t         *qosp,
+       mach_port_name_t        *namep)
+{
+       kern_return_t   kr;
+
+       assert(space != IS_NULL);
+
        switch (right) {
        case MACH_PORT_RIGHT_RECEIVE:
        {
+               ipc_kmsg_t      kmsg = IKM_NULL;
                ipc_port_t      port;
 
+               /*
+                * For in-kernel uses, only allow small (from the kmsg zone)
+                * preallocated messages for the port.
+                */
+               if (qosp->prealloc) {
+                       mach_msg_size_t size = qosp->len;
+
+                       if (size > IKM_SAVED_MSG_SIZE - MAX_TRAILER_SIZE) {
+                               panic("mach_port_allocate_internal: too large a prealloc kmsg");
+                       }
+                       kmsg = (ipc_kmsg_t)ipc_kmsg_prealloc(size + MAX_TRAILER_SIZE);
+                       if (kmsg == IKM_NULL) {
+                               return KERN_RESOURCE_SHORTAGE;
+                       }
+               }
+
                if (qosp->name) {
                        kr = ipc_port_alloc_name(space, *namep, &port);
                } else {
-                       kr = ipc_port_alloc(space, namep, &port);
+                       kr = ipc_port_alloc(space, FALSE, namep, &port);
                }
                if (kr == KERN_SUCCESS) {
                        if (kmsg != IKM_NULL) {
                                ipc_kmsg_set_prealloc(kmsg, port);
                        }
-
                        ip_unlock(port);
                } else if (kmsg != IKM_NULL) {
                        ipc_kmsg_free(kmsg);
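A hedged sketch of how an in-kernel caller might request a preallocated message through the qos structure handled above (field names are taken from this code; the length is illustrative and, per the panic check, must leave room for the trailer within IKM_SAVED_MSG_SIZE):

    mach_port_qos_t qos = {
        .name = 0,        /* let the kernel choose the name */
        .prealloc = 1,    /* attach a preallocated kmsg */
        .len = 128,       /* payload size; trailer added internally */
    };
    mach_port_name_t name;
    kern_return_t kr;

    kr = mach_port_allocate_internal(space, MACH_PORT_RIGHT_RECEIVE,
        &qos, &name);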
@@ -878,7 +904,6 @@ mach_port_get_refs(
 
        kr = ipc_right_lookup_write(space, name, &entry);
        if (kr != KERN_SUCCESS) {
-               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME);
                return kr;
        }
 
@@ -1112,8 +1137,7 @@ mach_port_set_mscount(
        }
        /* port is locked and active */
 
-       ipc_port_set_mscount(port, mscount);
-
+       port->ip_mscount = mscount;
        ip_unlock(port);
        return KERN_SUCCESS;
 }
@@ -1327,7 +1351,7 @@ mach_port_get_set_status(
                }
 
                /* just use a portset reference from here on out */
-               __IGNORE_WCASTALIGN(pset = (ipc_pset_t) psobj);
+               pset = ips_object_to_pset(psobj);
                ips_reference(pset);
                ips_unlock(pset);
 
@@ -1420,9 +1444,9 @@ mach_port_move_member(
        mach_port_name_t        member,
        mach_port_name_t        after)
 {
-       ipc_entry_t entry;
+       ipc_object_t port_obj, ps_obj;
        ipc_port_t port;
-       ipc_pset_t nset;
+       ipc_pset_t nset = IPS_NULL;
        kern_return_t kr;
        uint64_t wq_link_id = 0;
        uint64_t wq_reserved_prepost = 0;
@@ -1458,54 +1482,34 @@ mach_port_move_member(
                }
        }
 
-       kr = ipc_right_lookup_read(space, member, &entry);
+       if (after != MACH_PORT_NULL) {
+               kr = ipc_object_translate_two(space,
+                   member, MACH_PORT_RIGHT_RECEIVE, &port_obj,
+                   after, MACH_PORT_RIGHT_PORT_SET, &ps_obj);
+       } else {
+               kr = ipc_object_translate(space,
+                   member, MACH_PORT_RIGHT_RECEIVE, &port_obj);
+       }
        if (kr != KERN_SUCCESS) {
                goto done;
        }
-       /* space is read-locked and active */
 
-       if ((entry->ie_bits & MACH_PORT_TYPE_RECEIVE) == 0) {
-               is_read_unlock(space);
-               kr = KERN_INVALID_RIGHT;
-               goto done;
+       port = ip_object_to_port(port_obj);
+       if (after != MACH_PORT_NULL) {
+               nset = ips_object_to_pset(ps_obj);
        }
+       /* port and nset are locked */
 
-       __IGNORE_WCASTALIGN(port = (ipc_port_t) entry->ie_object);
-       assert(port != IP_NULL);
-
-       if (after == MACH_PORT_NULL) {
-               nset = IPS_NULL;
-       } else {
-               entry = ipc_entry_lookup(space, after);
-               if (entry == IE_NULL) {
-                       is_read_unlock(space);
-                       kr = KERN_INVALID_NAME;
-                       goto done;
-               }
-
-               if ((entry->ie_bits & MACH_PORT_TYPE_PORT_SET) == 0) {
-                       is_read_unlock(space);
-                       kr = KERN_INVALID_RIGHT;
-                       goto done;
-               }
-
-               __IGNORE_WCASTALIGN(nset = (ipc_pset_t) entry->ie_object);
-               assert(nset != IPS_NULL);
-       }
-       ip_lock(port);
-       assert(ip_active(port));
        ipc_pset_remove_from_all(port);
 
-       if (nset != IPS_NULL) {
-               ips_lock(nset);
+       if (after != MACH_PORT_NULL) {
                kr = ipc_pset_add(nset, port, &wq_link_id, &wq_reserved_prepost);
                ips_unlock(nset);
        }
+
        ip_unlock(port);
-       is_read_unlock(space);
 
 done:
-
        /*
         * on success the ipc_pset_add() will consume the wq_link_id
         * value (resetting it to 0), so this function is always safe to call.
@@ -1552,6 +1556,7 @@ done:
  *             KERN_INVALID_CAPABILITY The notify port is dead.
  *     MACH_NOTIFY_PORT_DESTROYED:
  *             KERN_INVALID_VALUE      Sync isn't zero.
+ *             KERN_FAILURE            Re-registering for this notification is not allowed
  *     MACH_NOTIFY_DEAD_NAME:
  *             KERN_RESOURCE_SHORTAGE  Couldn't allocate memory.
  *             KERN_INVALID_ARGUMENT   Name denotes dead name, but
@@ -1592,7 +1597,7 @@ mach_port_request_notification(
                        return kr;
                }
 
-               port = (ipc_port_t) entry->ie_object;
+               port = ip_object_to_port(entry->ie_object);
 
                if (port->ip_subsystem != NULL) {
                        is_write_unlock(space);
@@ -1606,7 +1611,7 @@ mach_port_request_notification(
 
        switch (id) {
        case MACH_NOTIFY_PORT_DESTROYED: {
-               ipc_port_t port, previous;
+               ipc_port_t port;
 
                if (sync != 0) {
                        return KERN_INVALID_VALUE;
@@ -1628,10 +1633,16 @@ mach_port_request_notification(
                        return KERN_INVALID_RIGHT;
                }
 
-               ipc_port_pdrequest(port, notify, &previous);
-               /* port is unlocked */
+               /* Allow only one registration of this notification */
+               if (port->ip_pdrequest != IP_NULL) {
+                       ip_unlock(port);
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_KERN_FAILURE);
+                       return KERN_FAILURE;
+               }
 
-               *previousp = previous;
+               ipc_port_pdrequest(port, notify, previousp);
+               /* port is unlocked */
+               assert(*previousp == IP_NULL);
                break;
        }
 
@@ -1728,12 +1739,12 @@ mach_port_insert_right(
                return KERN_INVALID_VALUE;
        }
 
-       if (!IO_VALID((ipc_object_t) poly)) {
+       if (!IP_VALID(poly)) {
                return KERN_INVALID_CAPABILITY;
        }
 
-       return ipc_object_copyout_name(space, (ipc_object_t) poly,
-                  polyPoly, FALSE, name);
+       return ipc_object_copyout_name(space, ip_to_object(poly),
+                  polyPoly, name);
 }
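From user space this routine is commonly paired with mach_port_allocate(); a minimal sketch using the standard <mach/mach.h> prototypes:

    #include <mach/mach.h>

    mach_port_t port = MACH_PORT_NULL;
    kern_return_t kr;

    kr = mach_port_allocate(mach_task_self(),
        MACH_PORT_RIGHT_RECEIVE, &port);
    if (kr == KERN_SUCCESS) {
        /* publish a send right under the same name */
        kr = mach_port_insert_right(mach_task_self(), port, port,
            MACH_MSG_TYPE_MAKE_SEND);
    }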
 
 /*
@@ -1779,7 +1790,8 @@ mach_port_extract_right(
                return KERN_INVALID_RIGHT;
        }
 
-       kr = ipc_object_copyin(space, name, msgt_name, (ipc_object_t *) poly);
+       kr = ipc_object_copyin(space, name, msgt_name, (ipc_object_t *) poly, 0, NULL,
+           IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
 
        if (kr == KERN_SUCCESS) {
                *polyPoly = ipc_object_copyin_type(msgt_name);
@@ -1797,7 +1809,7 @@ mach_port_extract_right(
  *     Returns:
  *             None.
  */
-void
+static void
 mach_port_get_status_helper(
        ipc_port_t              port,
        mach_port_status_t      *statusp)
@@ -1830,6 +1842,12 @@ mach_port_get_status_helper(
                if (port->ip_strict_guard) {
                        statusp->mps_flags |= MACH_PORT_STATUS_FLAG_STRICT_GUARD;
                }
+               if (port->ip_immovable_receive) {
+                       statusp->mps_flags |= MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE;
+               }
+       }
+       if (port->ip_no_grant) {
+               statusp->mps_flags |= MACH_PORT_STATUS_FLAG_NO_GRANT;
        }
        return;
 }
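The flags populated here surface to user space through mach_port_get_attributes() with the MACH_PORT_RECEIVE_STATUS flavor; a minimal sketch:

    mach_port_status_t status;
    mach_msg_type_number_t count = MACH_PORT_RECEIVE_STATUS_COUNT;
    kern_return_t kr;

    kr = mach_port_get_attributes(mach_task_self(), port,
        MACH_PORT_RECEIVE_STATUS, (mach_port_info_t)&status, &count);
    if (kr == KERN_SUCCESS &&
        (status.mps_flags & MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE)) {
        /* the receive right cannot be moved out of this task */
    }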
@@ -2038,7 +2056,7 @@ mach_port_set_attributes(
                 * associated it with a kobject already (timer, host_notify target),
                 * or is a special reply port.
                 */
-               if (is_ipc_kobject(ip_kotype(port)) || port->ip_specialreply) {
+               if (ip_is_kobject(port) || port->ip_specialreply) {
                        ip_unlock(port);
                        return KERN_INVALID_ARGUMENT;
                }
@@ -2097,7 +2115,7 @@ mach_port_set_attributes(
                 * it with a kobject already (timer, host_notify target),
                 * or is a special reply port.
                 */
-               if (is_ipc_kobject(ip_kotype(port)) || port->ip_specialreply) {
+               if (ip_is_kobject(port) || port->ip_specialreply) {
                        ip_unlock(port);
                        return KERN_INVALID_ARGUMENT;
                }
@@ -2175,8 +2193,8 @@ mach_port_insert_member(
        assert(psobj != IO_NULL);
        assert(obj != IO_NULL);
 
-       __IGNORE_WCASTALIGN(kr = ipc_pset_add((ipc_pset_t)psobj, (ipc_port_t)obj,
-           &wq_link_id, &wq_reserved_prepost));
+       kr = ipc_pset_add(ips_object_to_pset(psobj), ip_object_to_port(obj),
+           &wq_link_id, &wq_reserved_prepost);
 
        io_unlock(psobj);
        io_unlock(obj);
@@ -2236,7 +2254,7 @@ mach_port_extract_member(
        assert(psobj != IO_NULL);
        assert(obj != IO_NULL);
 
-       __IGNORE_WCASTALIGN(kr = ipc_pset_remove((ipc_pset_t)psobj, (ipc_port_t)obj));
+       kr = ipc_pset_remove(ips_object_to_pset(psobj), ip_object_to_port(obj));
 
        io_unlock(psobj);
        io_unlock(obj);
@@ -2288,15 +2306,25 @@ static kern_return_t
 mach_port_guard_locked(
        ipc_port_t              port,
        uint64_t                guard,
-       boolean_t               strict)
+       uint64_t                flags)
 {
        if (port->ip_context) {
                return KERN_INVALID_ARGUMENT;
        }
 
+       int strict = (flags & MPG_STRICT) ? 1 : 0;
+       int immovable_receive = (flags & MPG_IMMOVABLE_RECEIVE) ? 1 : 0;
+
+       imq_lock(&port->ip_messages);
        port->ip_context = guard;
        port->ip_guarded = 1;
-       port->ip_strict_guard = (strict)?1:0;
+       port->ip_strict_guard = strict;
+       /* ip_immovable_receive bit is sticky and can't be un-guarded */
+       if (!port->ip_immovable_receive) {
+               port->ip_immovable_receive = immovable_receive;
+       }
+       imq_unlock(&port->ip_messages);
+
        return KERN_SUCCESS;
 }
 
@@ -2330,8 +2358,12 @@ mach_port_unguard_locked(
                return KERN_INVALID_ARGUMENT;
        }
 
+       imq_lock(&port->ip_messages);
        port->ip_context = 0;
        port->ip_guarded = port->ip_strict_guard = 0;
+       /* Don't clear the ip_immovable_receive bit */
+       imq_unlock(&port->ip_messages);
+
        return KERN_SUCCESS;
 }
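A user-space sketch of the guard lifecycle these helpers implement (the guard value is caller-chosen and illustrative):

    mach_port_context_t guard = 0x4a11;    /* illustrative */
    kern_return_t kr;

    kr = mach_port_guard(mach_task_self(), port, guard, TRUE /* strict */);
    /* ... the same value must be presented to remove the guard ... */
    kr = mach_port_unguard(mach_task_self(), port, guard);

Note that, per the comment above, ip_immovable_receive is sticky: unguarding clears the context and strict bits but never the immovability of the receive right.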
 
@@ -2359,7 +2391,13 @@ mach_port_guard_exception(
        EXC_GUARD_ENCODE_TARGET(code, name);
        mach_exception_subcode_t subcode = (uint64_t)portguard;
        thread_t t = current_thread();
-       thread_guard_violation(t, code, subcode);
+       boolean_t fatal = FALSE;
+       if (t->task->task_exc_guard & TASK_EXC_GUARD_MP_FATAL) {
+               fatal = TRUE;
+       } else if (reason <= MAX_FATAL_kGUARD_EXC_CODE) {
+               fatal = TRUE;
+       }
+       thread_guard_violation(t, code, subcode, fatal);
 }
 
 
@@ -2392,6 +2430,8 @@ mach_port_guard_ast(thread_t t,
        case kGUARD_EXC_SET_CONTEXT:
        case kGUARD_EXC_UNGUARDED:
        case kGUARD_EXC_INCORRECT_GUARD:
+       case kGUARD_EXC_IMMOVABLE:
+       case kGUARD_EXC_STRICT_REPLY:
                task_exception_notify(EXC_GUARD, code, subcode);
                task_bsdtask_kill(task);
                break;
@@ -2465,14 +2505,22 @@ mach_port_construct(
        }
 
        /* Allocate a new port in the IPC space */
-       kr = ipc_port_alloc(space, name, &port);
+       kr = ipc_port_alloc(space, (options->flags & MPO_INSERT_SEND_RIGHT),
+           name, &port);
        if (kr != KERN_SUCCESS) {
                return kr;
        }
 
        /* Port locked and active */
        if (options->flags & MPO_CONTEXT_AS_GUARD) {
-               kr = mach_port_guard_locked(port, (uint64_t) context, (options->flags & MPO_STRICT));
+               uint64_t flags = 0;
+               if (options->flags & MPO_STRICT) {
+                       flags |= MPG_STRICT;
+               }
+               if (options->flags & MPO_IMMOVABLE_RECEIVE) {
+                       flags |= MPG_IMMOVABLE_RECEIVE;
+               }
+               kr = mach_port_guard_locked(port, (uint64_t) context, flags);
                /* A newly allocated and locked port should always be guarded successfully */
                assert(kr == KERN_SUCCESS);
        } else {
@@ -2513,23 +2561,12 @@ mach_port_construct(
                }
        }
 
-       if (options->flags & MPO_INSERT_SEND_RIGHT) {
-               kr = ipc_object_copyin(space, *name, MACH_MSG_TYPE_MAKE_SEND, (ipc_object_t *)&port);
-               if (kr != KERN_SUCCESS) {
-                       goto cleanup;
-               }
-
-               kr = mach_port_insert_right(space, *name, port, MACH_MSG_TYPE_PORT_SEND);
-               if (kr != KERN_SUCCESS) {
-                       goto cleanup;
-               }
-       }
-
        return KERN_SUCCESS;
 
 cleanup:
        /* Attempt to destroy port. If its already destroyed by some other thread, we're done */
-       (void) mach_port_destruct(space, *name, 0, context);
+       (void) mach_port_destruct(space, *name,
+           (options->flags & MPO_INSERT_SEND_RIGHT) ? -1 : 0, context);
        return kr;
 }
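A user-space sketch exercising the options handled above, assuming the mach_port_construct() prototype and MPO_* flags from <mach/port.h> (the guard value is illustrative):

    mach_port_options_t opts = {
        .flags = MPO_CONTEXT_AS_GUARD | MPO_STRICT | MPO_INSERT_SEND_RIGHT,
    };
    mach_port_name_t name = MACH_PORT_NULL;
    mach_port_context_t guard = 0xfeed;    /* illustrative */
    kern_return_t kr;

    kr = mach_port_construct(mach_task_self(), &opts, guard, &name);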
 
@@ -2604,6 +2641,7 @@ mach_port_guard(
 {
        kern_return_t           kr;
        ipc_port_t              port;
+       uint64_t flags = 0;
 
        if (space == IS_NULL) {
                return KERN_INVALID_TASK;
@@ -2624,7 +2662,11 @@ mach_port_guard(
        }
 
        /* Port locked and active */
-       kr = mach_port_guard_locked(port, guard, strict);
+       if (strict) {
+               flags = MPG_STRICT;
+       }
+
+       kr = mach_port_guard_locked(port, guard, flags);
        ip_unlock(port);
 
        if (KERN_INVALID_ARGUMENT == kr) {
@@ -2681,3 +2723,131 @@ mach_port_unguard(
 
        return kr;
 }
+
+/*
+ *     Routine:        mach_port_guard_with_flags [kernel call]
+ *     Purpose:
+ *             Guard a mach port with specified guard value and guard flags.
+ *             The context field of the port is used as the guard.
+ *     Conditions:
+ *             Caller should hold the receive right for the port
+ *     Returns:
+ *             KERN_SUCCESS            The port is guarded.
+ *             KERN_INVALID_TASK       The space is null.
+ *             KERN_INVALID_TASK       The space is dead.
+ *             KERN_INVALID_NAME       The name doesn't denote a right.
+ *             KERN_INVALID_RIGHT      The right isn't correct.
+ *             KERN_INVALID_ARGUMENT   Port already contains a context/guard.
+ *             KERN_INVALID_CAPABILITY Cannot set MPG_IMMOVABLE_RECEIVE flag for a port with
+ *                                     a movable port-destroyed notification port
+ */
+kern_return_t
+mach_port_guard_with_flags(
+       ipc_space_t             space,
+       mach_port_name_t        name,
+       uint64_t                guard,
+       uint64_t                flags)
+{
+       kern_return_t           kr;
+       ipc_port_t              port;
+
+       if (space == IS_NULL) {
+               return KERN_INVALID_TASK;
+       }
+
+       if (!MACH_PORT_VALID(name)) {
+               return KERN_INVALID_NAME;
+       }
+
+       kr = ipc_port_translate_receive(space, name, &port);
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0,
+                   ((KERN_INVALID_NAME == kr) ?
+                   kGUARD_EXC_INVALID_NAME :
+                   kGUARD_EXC_INVALID_RIGHT));
+               return kr;
+       }
+
+       /* Port locked and active */
+       kr = mach_port_guard_locked(port, guard, flags);
+       ip_unlock(port);
+
+       if (KERN_INVALID_ARGUMENT == kr) {
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_ARGUMENT);
+       }
+
+       return kr;
+}
+
+/*
+ *     Routine:        mach_port_swap_guard [kernel call]
+ *     Purpose:
+ *             Swap guard value.
+ *     Conditions:
+ *             Port should already be guarded.
+ *     Returns:
+ *             KERN_SUCCESS            The guard value is swapped.
+ *             KERN_INVALID_TASK       The space is null.
+ *             KERN_INVALID_TASK       The space is dead.
+ *             KERN_INVALID_NAME       The name doesn't denote a right.
+ *             KERN_INVALID_RIGHT      The right isn't correct.
+ *             KERN_INVALID_ARGUMENT   Port doesn't contain a guard, is strictly guarded,
+ *                                     or the old_guard doesn't match the context
+ */
+kern_return_t
+mach_port_swap_guard(
+       ipc_space_t             space,
+       mach_port_name_t        name,
+       uint64_t                old_guard,
+       uint64_t                new_guard)
+{
+       kern_return_t           kr;
+       ipc_port_t              port;
+
+       if (space == IS_NULL) {
+               return KERN_INVALID_TASK;
+       }
+
+       if (!MACH_PORT_VALID(name)) {
+               return KERN_INVALID_NAME;
+       }
+
+       kr = ipc_port_translate_receive(space, name, &port);
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0,
+                   ((KERN_INVALID_NAME == kr) ?
+                   kGUARD_EXC_INVALID_NAME :
+                   kGUARD_EXC_INVALID_RIGHT));
+               return kr;
+       }
+
+       /* Port locked and active */
+       if (!port->ip_guarded) {
+               ip_unlock(port);
+               mach_port_guard_exception(name, old_guard, 0, kGUARD_EXC_UNGUARDED);
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       if (port->ip_strict_guard) {
+               uint64_t portguard = port->ip_context;
+               ip_unlock(port);
+               /* For strictly guarded ports, disallow overwriting context; Raise Exception */
+               mach_port_guard_exception(name, old_guard, portguard, kGUARD_EXC_SET_CONTEXT);
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       if (port->ip_context != old_guard) {
+               uint64_t portguard = port->ip_context;
+               ip_unlock(port);
+               mach_port_guard_exception(name, old_guard, portguard, kGUARD_EXC_INCORRECT_GUARD);
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       imq_lock(&port->ip_messages);
+       port->ip_context = new_guard;
+       imq_unlock(&port->ip_messages);
+
+       ip_unlock(port);
+
+       return KERN_SUCCESS;
+}
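Taken together, a hedged user-space sketch of the flag-based guard and the guard swap (assuming these calls are exposed with the prototypes above; the guard values are illustrative):

    uint64_t old_guard = 0x1111, new_guard = 0x2222;
    kern_return_t kr;

    kr = mach_port_guard_with_flags(mach_task_self(), port,
        old_guard, MPG_IMMOVABLE_RECEIVE);
    /* a non-strict guard may be rotated without unguarding first */
    kr = mach_port_swap_guard(mach_task_self(), port, old_guard, new_guard);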
index 9646da05f4c998f1a6b380e1fd9ac876ada31992..eb0431b9553cdab7e28c5ddf7586720a5629d588 100644 (file)
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+#include <sys/cdefs.h>
+
 typedef enum {
        KDP_EVENT_ENTER,
        KDP_EVENT_EXIT,
        KDP_EVENT_PANICLOG
 } kdp_event_t;
 
+__BEGIN_DECLS
 typedef void (*kdp_callout_fn_t)(void *arg, kdp_event_t event);
 
 /*
@@ -43,3 +46,5 @@ typedef void (*kdp_callout_fn_t)(void *arg, kdp_event_t event);
  * non-trivial service.
  */
 extern void kdp_register_callout(kdp_callout_fn_t fn, void *arg);
+
+__END_DECLS
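A sketch of a driver registering a callout through this interface (the callback body is illustrative; it runs in debugger/panic context, so it must stay trivial and lock-free):

    static void
    my_kdp_callout(void *arg, kdp_event_t event)
    {
        if (event == KDP_EVENT_PANICLOG) {
            /* e.g., quiesce hardware owned by this driver */
        }
    }

    /* during driver initialization */
    kdp_register_callout(my_kdp_callout, NULL);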
index 08edfb7c64bb37f354952b7e4042896c2024d119..5971b1cfe29116bf3b8c420e97e8cac3b2137ea0 100644 (file)
@@ -737,12 +737,11 @@ kernel_pmap_present_mapping(uint64_t vaddr, uint64_t * pvincr, uintptr_t * pvphy
     if (ppn && pvphysaddr)
     {
         uint64_t phys = ptoa_64(ppn);
-#if defined(__arm__) || defined(__arm64__)
-        if (isphysmem(phys))        *pvphysaddr = phystokv(phys);
-#else
-        if (physmap_enclosed(phys)) *pvphysaddr = (uintptr_t)PHYSMAP_PTOV(phys);
-#endif
-        else                        ppn = 0;
+        if (physmap_enclosed(phys)) {
+            *pvphysaddr = phystokv(phys);
+        } else {
+            ppn = 0;
+        }
     }
 
     return (ppn);
@@ -758,16 +757,18 @@ pmap_traverse_present_mappings(pmap_t __unused pmap,
     IOReturn        ret;
     vm_map_offset_t vcurstart, vcur;
     uint64_t        vincr = 0;
-    vm_map_offset_t debug_start;
-    vm_map_offset_t debug_end;
+    vm_map_offset_t debug_start = trunc_page((vm_map_offset_t) debug_buf_base);
+    vm_map_offset_t debug_end = round_page((vm_map_offset_t) (debug_buf_base + debug_buf_size));
+#if defined(XNU_TARGET_OS_BRIDGE)
+    vm_map_offset_t macos_panic_start = trunc_page((vm_map_offset_t) macos_panic_base);
+    vm_map_offset_t macos_panic_end = round_page((vm_map_offset_t) (macos_panic_base + macos_panic_size));
+#endif
+
     boolean_t       lastvavalid;
 #if defined(__arm__) || defined(__arm64__)
     vm_page_t m = VM_PAGE_NULL;
 #endif
 
-    debug_start = trunc_page((vm_map_offset_t) debug_buf_base);
-    debug_end   = round_page((vm_map_offset_t) (debug_buf_base + debug_buf_size));
-
 #if defined(__x86_64__)
     assert(!is_ept_pmap(pmap));
 #endif
@@ -827,8 +828,12 @@ pmap_traverse_present_mappings(pmap_t __unused pmap,
        if (ppn != 0)
        {
            if (((vcur < debug_start) || (vcur >= debug_end))
-               && !(EFI_VALID_PAGE(ppn) ||
-                    pmap_valid_page(ppn)))
+               && !(EFI_VALID_PAGE(ppn) || pmap_valid_page(ppn))
+#if defined(XNU_TARGET_OS_BRIDGE)
+               // include the macOS panic region if it's mapped
+               && ((vcur < macos_panic_start) || (vcur >= macos_panic_end))
+#endif
+               )
            {
                /* not something we want */
                ppn = 0;
@@ -1170,7 +1175,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant)
 
        assert (existing_log_size <= debug_buf_size);
 
-       if (kd_variant == KERN_DUMP_DISK) {
+       if ((kd_variant == KERN_DUMP_DISK) || (kd_variant == KERN_DUMP_STACKSHOT_DISK)) {
                /* Open the file for output */
                if ((ret = (*outproc)(KDP_WRQ, NULL, 0, NULL)) != kIOReturnSuccess) {
                        kern_coredump_log(NULL, "outproc(KDP_WRQ, NULL, 0, NULL) returned 0x%x\n", ret);
@@ -1184,7 +1189,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant)
        bzero(&outvars, sizeof(outvars));
        outvars.outproc = outproc;
 
-       if (kd_variant == KERN_DUMP_DISK) {
+       if ((kd_variant == KERN_DUMP_DISK) || (kd_variant == KERN_DUMP_STACKSHOT_DISK)) {
                outvars.zoutput     = kdp_core_zoutput;
                /* Space for file header, panic log, core log */
                foffset = (KERN_COREDUMP_HEADERSIZE + existing_log_size + KERN_COREDUMP_MAXDEBUGLOGSIZE +
@@ -1215,6 +1220,35 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant)
        kern_coredump_log(NULL, "%s", (kd_variant == KERN_DUMP_DISK) ? "Writing local cores..." :
                       "Transmitting kernel state, please wait:\n");
 
+
+#if defined(__x86_64__)
+       if (((kd_variant == KERN_DUMP_STACKSHOT_DISK) || (kd_variant == KERN_DUMP_DISK)) && ((panic_stackshot_buf != 0) && (panic_stackshot_len != 0))) {
+               uint64_t compressed_stackshot_len = 0;
+
+               if ((ret = kdp_reset_output_vars(&outvars, panic_stackshot_len)) != KERN_SUCCESS) {
+                       kern_coredump_log(NULL, "Failed to reset outvars for stackshot with len 0x%zx, returned 0x%x\n", panic_stackshot_len, ret);
+                       dump_succeeded = FALSE;
+               } else if ((ret = kdp_core_output(&outvars, panic_stackshot_len, (void *)panic_stackshot_buf)) != KERN_SUCCESS) {
+                       kern_coredump_log(NULL, "Failed to write panic stackshot to file, kdp_core_output(outvars, %lu, %p) returned 0x%x\n",
+                                      panic_stackshot_len, (void *) panic_stackshot_buf, ret);
+                       dump_succeeded = FALSE;
+               } else if ((ret = kdp_core_output(&outvars, 0, NULL)) != KERN_SUCCESS) {
+                       kern_coredump_log(NULL, "Failed to flush stackshot data : kdp_core_output(%p, 0, NULL) returned 0x%x\n", &outvars, ret);
+                       dump_succeeded = FALSE;
+               } else if ((ret = kern_dump_record_file(&outvars, "panic_stackshot.kcdata", foffset, &compressed_stackshot_len)) != KERN_SUCCESS) {
+                       kern_coredump_log(NULL, "Failed to record panic stackshot in corefile header, kern_dump_record_file returned 0x%x\n", ret);
+                       dump_succeeded = FALSE;
+               } else {
+                       kern_coredump_log(NULL, "Recorded panic stackshot in corefile at offset 0x%llx, compressed to %llu bytes\n", foffset, compressed_stackshot_len);
+                       foffset = roundup((foffset + compressed_stackshot_len), KERN_COREDUMP_BEGIN_FILEBYTES_ALIGN);
+                       if ((ret = kern_dump_seek_to_next_file(&outvars, foffset)) != kIOReturnSuccess) {
+                               kern_coredump_log(NULL, "Failed to seek to stackshot file offset 0x%llx, kern_dump_seek_to_next_file returned 0x%x\n", foffset, ret);
+                               dump_succeeded = FALSE;
+                       }
+               }
+       }
+#endif
+
        if (kd_variant == KERN_DUMP_DISK) {
                /*
                 * Dump co-processors as well, foffset will be overwritten with the
@@ -1223,7 +1257,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant)
                if (kern_do_coredump(&outvars, FALSE, foffset, &foffset) != 0) {
                        dump_succeeded = FALSE;
                }
-       } else {
+       } else if (kd_variant != KERN_DUMP_STACKSHOT_DISK) {
                /* Only the kernel */
                if (kern_do_coredump(&outvars, TRUE, foffset, &foffset) != 0) {
                        dump_succeeded = FALSE;
@@ -1231,34 +1265,6 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant)
        }
 
        if (kd_variant == KERN_DUMP_DISK) {
-#if defined(__x86_64__) && (DEVELOPMENT || DEBUG)
-               /* Write the macOS panic stackshot on its own to a separate 'corefile' */
-               if (panic_stackshot_buf && panic_stackshot_len) {
-                       uint64_t compressed_stackshot_len = 0;
-
-                       /* Seek to the offset of the next 'file' (foffset provided/updated from kern_do_coredump) */
-                       if ((ret = kern_dump_seek_to_next_file(&outvars, foffset)) != kIOReturnSuccess) {
-                               kern_coredump_log(NULL, "Failed to seek to stackshot file offset 0x%llx, kern_dump_seek_to_next_file returned 0x%x\n", foffset, ret);
-                               dump_succeeded = FALSE;
-                       } else if ((ret = kdp_reset_output_vars(&outvars, panic_stackshot_len)) != KERN_SUCCESS) {
-                               kern_coredump_log(NULL, "Failed to reset outvars for stackshot with len 0x%zx, returned 0x%x\n", panic_stackshot_len, ret);
-                               dump_succeeded = FALSE;
-                       } else if ((ret = kdp_core_output(&outvars, panic_stackshot_len, (void *)panic_stackshot_buf)) != KERN_SUCCESS) {
-                               kern_coredump_log(NULL, "Failed to write panic stackshot to file, kdp_coreoutput(outvars, %lu, %p) returned 0x%x\n",
-                                              panic_stackshot_len, (void *) panic_stackshot_buf, ret);
-                               dump_succeeded = FALSE;
-                       } else if ((ret = kdp_core_output(&outvars, 0, NULL)) != KERN_SUCCESS) {
-                               kern_coredump_log(NULL, "Failed to flush stackshot data : kdp_core_output(%p, 0, NULL) returned 0x%x\n", &outvars, ret);
-                               dump_succeeded = FALSE;
-                       } else if ((ret = kern_dump_record_file(&outvars, "panic_stackshot.kcdata", foffset, &compressed_stackshot_len)) != KERN_SUCCESS) {
-                               kern_coredump_log(NULL, "Failed to record panic stackshot in corefile header, kern_dump_record_file returned 0x%x\n", ret);
-                               dump_succeeded = FALSE;
-                       } else {
-                               kern_coredump_log(NULL, "Recorded panic stackshot in corefile at offset 0x%llx, compressed to %llu bytes\n", foffset, compressed_stackshot_len);
-                       }
-               }
-#endif /* defined(__x86_64__) && (DEVELOPMENT || DEBUG) */
-
                /* Write the debug log -- first seek to the end of the corefile header */
                foffset = KERN_COREDUMP_HEADERSIZE;
                if ((ret = (*outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) {
@@ -1356,14 +1362,14 @@ kern_dump(enum kern_dump_type kd_variant)
 #if KASAN
        kasan_disable();
 #endif
-       if (kd_variant == KERN_DUMP_DISK) {
+       if ((kd_variant == KERN_DUMP_DISK) || (kd_variant == KERN_DUMP_STACKSHOT_DISK)) {
                if (dumped_local) return (0);
                if (local_dump_in_progress) return (-1);
                local_dump_in_progress = TRUE;
 #if CONFIG_EMBEDDED
                hwsd_info->xhsdci_status = XHSDCI_STATUS_KERNEL_BUSY;
 #endif
-               ret = do_kern_dump(&kern_dump_disk_proc, KERN_DUMP_DISK);
+               ret = do_kern_dump(&kern_dump_disk_proc, kd_variant);
                if (ret == 0) {
                        dumped_local = TRUE;
                        kern_dump_successful = TRUE;
@@ -1548,12 +1554,6 @@ kdp_core_init(void)
        PE_consistent_debug_register(kDbgIdAstrisConnection, kvtophys((vm_offset_t) hwsd_info), sizeof(pmap_paddr_t));
        PE_consistent_debug_register(kDbgIdAstrisConnectionVers, CUR_XNU_HWSDCI_STRUCT_VERS, sizeof(uint32_t));
 #endif /* CONFIG_EMBEDDED */
-
-#if defined(__x86_64__) && (DEVELOPMENT || DEBUG)
-       /* Allocate space in the kernel map for the panic stackshot */
-       kr = kmem_alloc(kernel_map, &panic_stackshot_buf, PANIC_STACKSHOT_BUFSIZE, VM_KERN_MEMORY_DIAG);
-       assert (KERN_SUCCESS == kr);
-#endif /* defined(__x86_64__) && (DEVELOPMENT || DEBUG) */
 }
 
 #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */
index d69d92b5f6e23034e95c3f59404d779d68d5a33a..a297107f3a73d6d310ca00e8a89ca4a03a757313 100644 (file)
@@ -137,6 +137,7 @@ enum kern_dump_type {
 #if CONFIG_EMBEDDED
        KERN_DUMP_HW_SHMEM_DBG, /* coordinated hardware shared memory debugger core dump */
 #endif
+       KERN_DUMP_STACKSHOT_DISK, /* local, stackshot on device coredump */
 };
 
 int kern_dump(enum kern_dump_type kd_variant);
index edb879981b7d80e139988002e4063b953486e262..ea5d3d5000dc44692d71fbca47e81079a22122e9 100644 (file)
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+#include <sys/cdefs.h>
+
 /*
  * Ethernet debugger header file
  */
 
+__BEGIN_DECLS
 typedef void (*kdp_send_t)(void * pkt, unsigned int pkt_len);
 typedef void (*kdp_receive_t)(void * pkt, unsigned int * pkt_len,
     unsigned int timeout);
@@ -39,3 +42,4 @@ kdp_register_send_receive(kdp_send_t send, kdp_receive_t receive);
 
 void
 kdp_unregister_send_receive(kdp_send_t send, kdp_receive_t receive);
+__END_DECLS
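A sketch of an Ethernet driver hooking the debugger transport (the handler bodies are illustrative; both run polled, with interrupts disabled):

    static void
    my_dbg_send(void *pkt, unsigned int pkt_len)
    {
        /* transmit the frame by polling the hardware */
    }

    static void
    my_dbg_receive(void *pkt, unsigned int *pkt_len, unsigned int timeout)
    {
        /* poll for a frame; copy it into pkt and set *pkt_len,
         * or set *pkt_len = 0 on timeout */
    }

    /* when the interface is brought up */
    kdp_register_send_receive(my_dbg_send, my_dbg_receive);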
index c8636563a81c3ca88f2925b889ff4ec6d5358799..260f10ddb6582551635124cfd0543fd27912d5fb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -265,7 +265,7 @@ static void kdp_serial_send(void *rpkt, unsigned int rpkt_len);
 #endif
 
 static uint32_t kdp_current_ip_address = 0;
-static struct kdp_ether_addr kdp_current_mac_address = {{0, 0, 0, 0, 0, 0}};
+static struct kdp_ether_addr kdp_current_mac_address = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}};
 static void *kdp_current_ifp;
 
 static void kdp_handler( void *);
@@ -282,12 +282,12 @@ static boolean_t router_specified = FALSE;
 static boolean_t corename_specified = FALSE;
 static unsigned int panicd_port = CORE_REMOTE_PORT;
 
-static struct kdp_ether_addr etherbroadcastaddr = {{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}};
+static struct kdp_ether_addr etherbroadcastaddr = {.ether_addr_octet = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}};
 
-static struct kdp_ether_addr router_mac = {{0, 0, 0, 0, 0, 0}};
-static struct kdp_ether_addr destination_mac = {{0, 0, 0, 0, 0, 0}};
-static struct kdp_ether_addr temp_mac = {{0, 0, 0, 0, 0, 0}};
-static struct kdp_ether_addr current_resolved_MAC = {{0, 0, 0, 0, 0, 0}};
+static struct kdp_ether_addr router_mac = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}};
+static struct kdp_ether_addr destination_mac = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}};
+static struct kdp_ether_addr temp_mac = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}};
+static struct kdp_ether_addr current_resolved_MAC = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}};
 
 static boolean_t flag_panic_dump_in_progress = FALSE;
 static boolean_t flag_router_mac_initialized = FALSE;
@@ -703,7 +703,7 @@ void
 kdp_set_interface(void *ifp, const struct kdp_ether_addr *macaddr)
 {
        char kdpstr[80];
-       struct kdp_in_addr addr = { 0 };
+       struct kdp_in_addr addr = { .s_addr = 0 };
        unsigned int len;
 
        kdp_current_ifp = ifp;
@@ -1556,18 +1556,33 @@ create_panic_header(unsigned int request, const char *corename,
 
        if (request == KDP_WRQ) {
                char *cp;
+               size_t length_remaining = (sizeof(pkt.data) - pkt.off), bytes_filled = 0;
 
                cp = coreh->th_u.tu_rpl;
-               cp += strlcpy(cp, corename, KDP_MAXPACKET);
+               bytes_filled = strlcpy(cp, corename, length_remaining);
+               cp += bytes_filled;
                *cp++ = '\0';
-               cp += strlcpy(cp, mode, KDP_MAXPACKET - strlen(corename));
+               /* account for the extra NULL character that has been added historically */
+               length_remaining -= (bytes_filled + 1);
+
+               bytes_filled = strlcpy(cp, mode, length_remaining);
+               cp += bytes_filled;
+               *cp++ = '\0';
+               /* account for the extra NULL character that has been added historically */
+               length_remaining -= (bytes_filled + 1);
+
+               bytes_filled = strlcpy(cp, KDP_FEATURE_MASK_STRING, length_remaining);
+               cp += bytes_filled;
                *cp++ = '\0';
-               cp += strlcpy(cp, KDP_FEATURE_MASK_STRING, sizeof(KDP_FEATURE_MASK_STRING));
-               *cp++ = '\0'; /* Redundant */
+               /* account for the extra NULL character that has been added historically */
+               length_remaining -= (bytes_filled + 1);
+
                bcopy(&kdp_crashdump_feature_mask, cp, sizeof(kdp_crashdump_feature_mask));
                kdp_crashdump_pkt_size = KDP_LARGE_CRASHDUMP_PKT_SIZE;
-               PE_parse_boot_argn("kdp_crashdump_pkt_size", &kdp_crashdump_pkt_size, sizeof(kdp_crashdump_pkt_size));
                cp += sizeof(kdp_crashdump_feature_mask);
+               length_remaining -= sizeof(kdp_crashdump_feature_mask);
+
+               PE_parse_boot_argn("kdp_crashdump_pkt_size", &kdp_crashdump_pkt_size, sizeof(kdp_crashdump_pkt_size));
                *(uint32_t *)cp = htonl(kdp_crashdump_pkt_size);
        } else {
                coreh->th_block = htonl((unsigned int) block);
@@ -1803,6 +1818,7 @@ kdp_get_xnu_version(char *versionbuf)
        char vstr[20];
        int retval = -1;
        char *vptr;
+       size_t length_remaining = (sizeof(pkt.data) - pkt.off);
 
        strlcpy(vstr, "custom", 10);
        if (kdp_machine_vm_read((mach_vm_address_t)(uintptr_t)version, versionbuf, 128)) {
@@ -1823,7 +1839,7 @@ kdp_get_xnu_version(char *versionbuf)
                        retval = 0;
                }
        }
-       strlcpy(versionbuf, vstr, KDP_MAXPACKET);
+       strlcpy(versionbuf, vstr, length_remaining);
        return retval;
 }
 
@@ -2279,7 +2295,7 @@ kdp_init(void)
 #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */
 
 #if !(MACH_KDP && CONFIG_KDP_INTERACTIVE_DEBUGGING)
-static struct kdp_ether_addr kdp_current_mac_address = {{0, 0, 0, 0, 0, 0}};
+static struct kdp_ether_addr kdp_current_mac_address = {.ether_addr_octet = {0, 0, 0, 0, 0, 0}};
 
 /* XXX ugly forward declares to stop warnings */
 void *kdp_get_interface(void);
index 1e1cb028c7f98341e221ec8ccb93392e49a2868a..a324da1721e7771b2879c2459162286cbff71d9c 100644 (file)
@@ -41,6 +41,9 @@
 #include <libkern/OSAtomic.h>
 #include <vm/vm_map.h>
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif
 
 #define KDP_TEST_HARNESS 0
 #if KDP_TEST_HARNESS
@@ -364,7 +367,7 @@ kdp_trap(unsigned int exception, struct arm_saved_state * saved_state)
         * increment for both of them.
         */
        if ((instr == GDB_TRAP_INSTR1) || (instr == GDB_TRAP_INSTR2)) {
-               set_saved_state_pc(saved_state, get_saved_state_pc(saved_state) + 4);
+               add_saved_state_pc(saved_state, 4);
        }
 #else
 #error Unknown architecture.
@@ -722,6 +725,10 @@ machine_trace_thread64(thread_t thread,
                }
 
                prevlr = *(uint64_t *)kern_virt_addr;
+#if defined(HAS_APPLE_PAC)
+               /* return addresses on stack signed by arm64e ABI */
+               prevlr = (uint64_t) ptrauth_strip((void *)prevlr, ptrauth_key_return_address);
+#endif
                if (!user_p) {
                        prevlr = VM_KERNEL_UNSLIDE(prevlr);
                }
index 6050ad502ae8ccb6c55b2936564ff00b5549c9d7..d0c41d90ab08651ab4627180884b5e275d41863a 100644 (file)
@@ -94,7 +94,7 @@ uint32_t coredump_registered_count = 0;
 struct kern_coredump_core *kernel_helper = NULL;
 
 static struct kern_coredump_core *
-kern_register_coredump_helper_internal(int kern_coredump_config_vers, kern_coredump_callback_config *kc_callbacks,
+kern_register_coredump_helper_internal(int kern_coredump_config_vers, const kern_coredump_callback_config *kc_callbacks,
     void *refcon, const char *core_description, boolean_t xnu_callback, boolean_t is64bit,
     uint32_t mh_magic, cpu_type_t cpu_type, cpu_subtype_t cpu_subtype)
 {
@@ -166,7 +166,7 @@ kern_register_coredump_helper_internal(int kern_coredump_config_vers, kern_cored
 }
 
 kern_return_t
-kern_register_coredump_helper(int kern_coredump_config_vers, kern_coredump_callback_config *kc_callbacks,
+kern_register_coredump_helper(int kern_coredump_config_vers, const kern_coredump_callback_config *kc_callbacks,
     void *refcon, const char *core_description, boolean_t is64bit, uint32_t mh_magic,
     cpu_type_t cpu_type, cpu_subtype_t cpu_subtype)
 {
@@ -720,7 +720,7 @@ kern_do_coredump(void *core_outvars, boolean_t kernel_only, uint64_t first_file_
 #else /* CONFIG_KDP_INTERACTIVE_DEBUGGING */
 
 kern_return_t
-kern_register_coredump_helper(int kern_coredump_config_vers, kern_coredump_callback_config *kc_callbacks, void* refcon,
+kern_register_coredump_helper(int kern_coredump_config_vers, const kern_coredump_callback_config *kc_callbacks, void* refcon,
     const char *core_description, boolean_t is64bit, uint32_t mh_magic,
     cpu_type_t cpu_type, cpu_subtype_t cpu_subtype)
 {
index 46bf717d3645f774c63f0e21906be2bd5a1f57de..3d736c1f880946bbcd46b1bf1203d757544c074c 100644 (file)
@@ -169,7 +169,7 @@ typedef struct {
  * coredump infrastructure. In addition to the callback config and version of the config
  * structure, a description of the core should be provided -- i.e.: AP
  */
-kern_return_t kern_register_coredump_helper(int kern_coredump_config_vers, kern_coredump_callback_config *kc_callbacks, void *refcon,
+kern_return_t kern_register_coredump_helper(int kern_coredump_config_vers, const kern_coredump_callback_config *kc_callbacks, void *refcon,
     const char *core_description, boolean_t is64bit, uint32_t mh_magic, cpu_type_t cpu_type, cpu_subtype_t cpu_subtype);
 
 #if PRIVATE
index c58d9a7f13408f722a332caac00212e22cac7fcb..9f53a26ec78ac256ed127c849f112eb1e32078a5 100644 (file)
@@ -14,16 +14,19 @@ DATAFILES = \
        kcdata.h
 
 PRIVATE_DATAFILES = \
+       arithmetic_128.h  \
+       block_hint.h \
        cs_blobs.h \
-       trustcache.h \
        debug.h \
        ecc.h \
-       block_hint.h \
        lock_stat.h \
        monotonic.h \
-       arithmetic_128.h  \
-       turnstile.h \
-       remote_time.h
+       remote_time.h \
+       restartable.h \
+       sched_clutch.h \
+       trustcache.h \
+       turnstile.h
+
 
 EXPORT_FILES = \
        affinity.h \
@@ -33,6 +36,7 @@ EXPORT_FILES = \
        bits.h \
        btlog.h \
        call_entry.h \
+       circle_queue.h \
        clock.h \
        coalition.h \
        cpu_number.h \
@@ -59,6 +63,7 @@ EXPORT_FILES = \
        policy_internal.h \
        processor.h \
        queue.h \
+       mpsc_queue.h \
        priority_queue.h \
        sched_prim.h \
        sfi.h \
@@ -80,6 +85,7 @@ PRIVATE_EXPORT_FILES = \
        copyout_shim.h
 
 XNU_ONLY_EXPORTS = \
+       arcade.h \
        cpu_quiesce.h \
        ipc_kobject.h \
        ux_handler.h
diff --git a/osfmk/kern/arcade.c b/osfmk/kern/arcade.c
new file mode 100644 (file)
index 0000000..48c4b60
--- /dev/null
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <mach/mach_types.h>
+#include <kern/kern_types.h>
+#include <mach/notify.h>
+#include <mach/resource_monitors.h>
+
+#include <mach/host_special_ports.h>
+#include <mach/mach_host_server.h>
+#include <mach/host_priv_server.h>
+#include <mach/fairplayd_notification.h>
+#include <mach/arcade_upcall.h>
+
+#include <kern/kern_types.h>
+#include <kern/assert.h>
+#include <kern/kalloc.h>
+#include <kern/host.h>
+#include <kern/ast.h>
+#include <kern/task.h>
+
+#include <kern/arcade.h>
+#include <mach/arcade_register_server.h>
+
+#include <IOKit/IOBSD.h>
+
+#if !defined(MAXPATHLEN)
+#define MAXPATHLEN 4096
+#endif
+
+extern struct proc *current_proc(void);
+extern int proc_pidpathinfo_internal(struct proc *p, uint64_t arg,
+    char *buffer, uint32_t buffersize,
+    int32_t *retval);
+extern off_t proc_getexecutableoffset(struct proc *p);
+
+/*
+ * Simple structure to represent a handle for the Arcade registration.
+ *
+ * This registration is done with an independent kobject callback, rather
+ * than a reply, so that we execute it in the context of the user-space
+ * server replying (in order to do an entitlement check on the reply).
+ *
+ * We cache the resulting upcall port until it fails, and then we go
+ * get another one.
+ */
+struct arcade_register {
+       ipc_port_t ar_port;
+};
+typedef struct arcade_register *arcade_register_t;
+
+static struct arcade_register arcade_register_global;
+
+void
+arcade_prepare(task_t task, thread_t thread)
+{
+       /* Platform binaries are exempt */
+       if (task->t_flags & TF_PLATFORM) {
+               return;
+       }
+
+       /* Check to see if the task has the arcade entitlement */
+       if (!IOTaskHasEntitlement(task, "com.apple.developer.arcade-operations")) {
+               return;
+       }
+
+       /* Others will stop in the AST to make an upcall */
+       thread_ast_set(thread, AST_ARCADE);
+}
+
+static lck_grp_attr_t *arcade_upcall_lck_grp_attr;
+static lck_grp_t *arcade_upcall_lck_grp;
+static lck_mtx_t arcade_upcall_mutex;
+
+static ipc_port_t arcade_upcall_port = IP_NULL;
+static boolean_t arcade_upcall_refresh_in_progress = FALSE;
+static boolean_t arcade_upcall_refresh_waiters = FALSE;
+
+void
+arcade_init(void)
+{
+       ipc_port_t port;
+
+       arcade_upcall_lck_grp_attr = lck_grp_attr_alloc_init();
+       arcade_upcall_lck_grp = lck_grp_alloc_init("arcade_upcall", arcade_upcall_lck_grp_attr);
+       lck_mtx_init(&arcade_upcall_mutex, arcade_upcall_lck_grp, NULL);
+
+       /* Initialize the global arcade_register kobject and associated port */
+       port = ipc_kobject_alloc_port((ipc_kobject_t)&arcade_register_global,
+           IKOT_ARCADE_REG, IPC_KOBJECT_ALLOC_MAKE_SEND);
+       arcade_register_global.ar_port = port;
+}
+
+arcade_register_t
+convert_port_to_arcade_register(
+       ipc_port_t              port)
+{
+       arcade_register_t arcade_reg = ARCADE_REG_NULL;
+
+       if (IP_VALID(port)) {
+               /* No need to lock port because of how refs managed */
+               if (ip_kotype(port) == IKOT_ARCADE_REG) {
+                       assert(ip_active(port));
+                       arcade_reg = (arcade_register_t)port->ip_kobject;
+                       assert(arcade_reg == &arcade_register_global);
+                       assert(arcade_reg->ar_port == port);
+               }
+       }
+       return arcade_reg;
+}
+
+ipc_port_t
+convert_arcade_register_to_port(
+       arcade_register_t arcade_reg)
+{
+       ipc_port_t port = IP_NULL;
+
+       if (arcade_reg == &arcade_register_global) {
+               port = arcade_reg->ar_port;
+       }
+       return port;
+}
+
+kern_return_t
+arcade_register_new_upcall(
+       arcade_register_t arcade_reg,
+       mach_port_t port)
+{
+       if (arcade_reg == ARCADE_REG_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+       assert(arcade_reg == &arcade_register_global);
+
+       /* Check to see if this is the real arcade subscription service */
+       if (!IOTaskHasEntitlement(current_task(), "com.apple.arcade.fpsd")) {
+               return KERN_INVALID_VALUE;
+       }
+
+       lck_mtx_lock(&arcade_upcall_mutex);
+
+       if (arcade_upcall_refresh_in_progress) {
+               /* If we have an old arcade upcall port, discard it */
+               if (IP_VALID(arcade_upcall_port)) {
+                       ipc_port_release_send(arcade_upcall_port);
+                       arcade_upcall_port = IP_NULL;
+               }
+               arcade_upcall_port = port; /* owns send right */
+
+               /* Wake up anyone waiting for the update */
+               lck_mtx_unlock(&arcade_upcall_mutex);
+               thread_wakeup(&arcade_upcall_port);
+               return KERN_SUCCESS;
+       }
+
+       lck_mtx_unlock(&arcade_upcall_mutex);
+       return KERN_FAILURE;
+}
+
+
+static kern_return_t
+arcade_upcall_refresh(uint64_t deadline)
+{
+       ipc_port_t fairplayd_port = IP_NULL;
+       wait_result_t wr = THREAD_NOT_WAITING;
+       kern_return_t kr;
+
+       LCK_MTX_ASSERT(&arcade_upcall_mutex, LCK_MTX_ASSERT_OWNED);
+
+       /* If someone else is doing the update, wait for them */
+       if (arcade_upcall_refresh_in_progress) {
+               arcade_upcall_refresh_waiters = TRUE;
+               wr = lck_mtx_sleep(&arcade_upcall_mutex, LCK_SLEEP_DEFAULT,
+                   &arcade_upcall_refresh_in_progress, THREAD_INTERRUPTIBLE);
+               goto out;
+       }
+
+       arcade_upcall_refresh_in_progress = TRUE;
+
+       /* If we have an old arcade upcall port, discard it */
+       if (IP_VALID(arcade_upcall_port)) {
+               ipc_port_release_send(arcade_upcall_port);
+               arcade_upcall_port = IP_NULL;
+       }
+
+#if 0
+       if (host_get_fairplayd_port(host_priv_self(), &fairplayd_port) != KERN_SUCCESS) {
+               panic("arcade_upcall_refresh(get fairplayd)");
+       }
+#else
+       /* Temporary hack because launchd is rejecting the other special port number */
+       if (host_get_unfreed_port(host_priv_self(), &fairplayd_port) != KERN_SUCCESS) {
+               panic("arcade_upcall_refresh(get fairplayd)");
+       }
+#endif
+
+       /* If no valid fairplayd port registered, we're done */
+       if (!IP_VALID(fairplayd_port)) {
+               goto finish_in_progress;
+       }
+
+       /*
+        * Send a fairplayd notification to request a new arcade upcall port.
+        * Pass along a send right to the arcade_register kobject to complete
+        * the registration.
+        */
+       ipc_port_t port = convert_arcade_register_to_port(&arcade_register_global);
+       kr = fairplayd_arcade_request(fairplayd_port, port);
+
+       ipc_port_release_send(fairplayd_port);
+
+       switch (kr) {
+       case MACH_MSG_SUCCESS:
+               break;
+       default:
+               goto finish_in_progress;
+       }
+
+       /*
+        * Wait on the arcade upcall port to get registered through the
+        * registration kobject waiting with a deadline here.
+        */
+       wr = lck_mtx_sleep_deadline(&arcade_upcall_mutex, LCK_SLEEP_DEFAULT,
+           &arcade_upcall_port, THREAD_INTERRUPTIBLE, deadline);
+
+finish_in_progress:
+       arcade_upcall_refresh_in_progress = FALSE;
+
+       /* Wakeup any waiters */
+       if (arcade_upcall_refresh_waiters) {
+               arcade_upcall_refresh_waiters = FALSE;
+               thread_wakeup_with_result(&arcade_upcall_refresh_in_progress, wr);
+       }
+
+out:
+       switch (wr) {
+       case THREAD_AWAKENED:
+               return KERN_SUCCESS;
+       default:
+               return KERN_FAILURE;
+       }
+}
+
+static kern_return_t
+__MAKING_UPCALL_TO_ARCADE_VALIDATION_SERVICE__(mach_port_t port,
+    vm_map_copy_t path,
+    vm_size_t pathlen,
+    off_t offset,
+    boolean_t *should_killp)
+{
+       mach_msg_type_number_t len = (mach_msg_type_number_t)pathlen;
+       return arcade_upcall(port, (vm_offset_t)path, len, offset, should_killp);
+}
+
+void
+arcade_ast(__unused thread_t thread)
+{
+       ipc_port_t port;
+       uint64_t deadline;
+       kern_return_t kr;
+       int retval;
+
+       /* Determine the deadline */
+       clock_interval_to_deadline(10, NSEC_PER_SEC, &deadline);
+
+restart:
+       lck_mtx_lock(&arcade_upcall_mutex);
+       port = ipc_port_copy_send(arcade_upcall_port);
+       /*
+        * If the arcade_upcall_port was inactive, "port" will be IP_DEAD.
+        * Otherwise, it holds a send right to the arcade_upcall_port.
+        */
+
+       while (!IP_VALID(port)) {
+               /*
+                * Refresh the arcade upcall port. If that gives up,
+                * give up ourselves.
+                */
+               kr = arcade_upcall_refresh(deadline);
+               if (kr != KERN_SUCCESS) {
+                       lck_mtx_unlock(&arcade_upcall_mutex);
+                       goto fail;
+               }
+               port = ipc_port_copy_send(arcade_upcall_port);
+       }
+       lck_mtx_unlock(&arcade_upcall_mutex);
+
+       /* We have an upcall port send right */
+
+       /* Gather the data we need to send in the upcall */
+       off_t offset;
+       struct proc *p = current_proc();
+       char *path;
+       vm_map_copy_t copy;
+
+       kr = kmem_alloc(ipc_kernel_map, (vm_offset_t *)&path, MAXPATHLEN, VM_KERN_MEMORY_IPC);
+       if (kr != KERN_SUCCESS) {
+               ipc_port_release_send(port);
+               return;
+       }
+       bzero(path, MAXPATHLEN);
+       retval = proc_pidpathinfo_internal(p, 0, path, MAXPATHLEN, NULL);
+       assert(!retval);
+       kr = vm_map_unwire(ipc_kernel_map,
+           vm_map_trunc_page((vm_offset_t)path, VM_MAP_PAGE_MASK(ipc_kernel_map)),
+           vm_map_round_page((vm_offset_t)path + MAXPATHLEN, VM_MAP_PAGE_MASK(ipc_kernel_map)),
+           FALSE);
+       assert(kr == KERN_SUCCESS);
+       kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)path, MAXPATHLEN, TRUE, &copy);
+       assert(kr == KERN_SUCCESS);
+
+       offset = proc_getexecutableoffset(p);
+
+       /* MAKE THE UPCALL */
+       boolean_t should_kill = TRUE;
+       kr = __MAKING_UPCALL_TO_ARCADE_VALIDATION_SERVICE__(port, copy, MAXPATHLEN, offset, &should_kill);
+       ipc_port_release_send(port);
+
+       switch (kr) {
+       case MACH_SEND_INVALID_DEST:
+               vm_map_copy_discard(copy);
+       /* fall thru */
+       case MIG_SERVER_DIED:
+               goto restart;
+       case KERN_SUCCESS:
+               if (should_kill == TRUE) {
+                       /*
+                        * Invalid subscription. The UI has already presented the
+                        * reason the process did not launch.
+                        */
+                       task_terminate_internal(current_task());
+               }
+               break;
+       default:
+fail:
+               /*
+                * Failure of the subscription validation mechanism, not a rejection
+                * for a missing subscription. There will be no indication WHY this
+                * process didn't launch. We might want this to be an exit_with_reason()
+                * in the future.
+                */
+               task_terminate_internal(current_task());
+               break;
+       }
+}
diff --git a/osfmk/kern/arcade.h b/osfmk/kern/arcade.h
new file mode 100644 (file)
index 0000000..b3a230f
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _KERN_ARCADE_H_
+#define _KERN_ARCADE_H_
+
+#include <mach/mach_types.h>
+#include <kern/kern_types.h>
+
+#include <libkern/section_keywords.h>
+
+
+#if XNU_KERNEL_PRIVATE
+
+struct arcade_register;
+
+extern void arcade_init(void);
+
+extern void arcade_ast(thread_t thread);
+
+extern void arcade_prepare(task_t task, thread_t thread);
+
+extern void arcade_register_notify(mach_msg_header_t *msg);
+
+extern void arcade_register_reference(arcade_register_t arcade_reg);
+
+extern void arcade_register_release(arcade_register_t arcade_reg);
+
+extern mach_port_t convert_arcade_register_to_port(arcade_register_t arcade_reg);
+
+extern arcade_register_t convert_port_to_arcade_register(mach_port_t port);
+
+#endif /* XNU_KERNEL_PRIVATE */
+
+#endif /* _KERN_ARCADE_H_ */
index e6da6c5a3708fa9610e804e493f2e473735484b1..80ca1193ec8f9793474085312a77120208216511 100644 (file)
@@ -70,6 +70,9 @@
 
 __BEGIN_DECLS
 /* Assert error */
+#if !CONFIG_NONFATAL_ASSERTS
+__abortlike
+#endif
 extern void     Assert(
        const char      *file,
        int             line,
index 21fb3f55414250b814a43c5359cfaf3cbdc88906..d0e3415297d97e78c30557f286a1ab8fbf6e9137 100644 (file)
@@ -62,6 +62,7 @@
 #include <kern/sched_prim.h>
 #include <kern/thread.h>
 #include <kern/processor.h>
+#include <kern/restartable.h>
 #include <kern/spl.h>
 #include <kern/sfi.h>
 #if CONFIG_TELEMETRY
 #include <security/mac_mach_internal.h> // for MACF AST hook
 #include <stdatomic.h>
 
+#if CONFIG_ARCADE
+#include <kern/arcade.h>
+#endif
+
 static void __attribute__((noinline, noreturn, disable_tail_calls))
 thread_preempted(__unused void* parameter, __unused wait_result_t result)
 {
@@ -217,6 +222,13 @@ ast_taken_user(void)
        }
 #endif
 
+#if CONFIG_ARCADE
+       if (reasons & AST_ARCADE) {
+               thread_ast_clear(thread, AST_ARCADE);
+               arcade_ast(thread);
+       }
+#endif
+
        if (reasons & AST_APC) {
                thread_ast_clear(thread, AST_APC);
                thread_apc_ast(thread);
@@ -237,6 +249,11 @@ ast_taken_user(void)
                kperf_kpc_thread_ast(thread);
        }
 
+       if (reasons & AST_RESET_PCS) {
+               thread_ast_clear(thread, AST_RESET_PCS);
+               thread_reset_pcs_ast(thread);
+       }
+
        if (reasons & AST_KEVENT) {
                thread_ast_clear(thread, AST_KEVENT);
                uint16_t bits = atomic_exchange(&thread->kevent_ast_bits, 0);
@@ -319,8 +336,7 @@ ast_taken_user(void)
        assert((thread->sched_flags & TH_SFLAG_PROMOTED) == 0);
        assert((thread->sched_flags & TH_SFLAG_DEPRESS) == 0);
 
-       assert(thread->promotions == 0);
-       assert(thread->was_promoted_on_wakeup == 0);
+       assert(thread->kern_promotion_schedpri == 0);
        assert(thread->waiting_for_mutex == NULL);
        assert(thread->rwlock_count == 0);
 }
index 23fe592184d70fba3c68248169229defc5f4a00c..8487484ab1a29cbe8aa69d19d0da7c5dcf9ba9dc 100644 (file)
@@ -118,7 +118,8 @@ typedef uint32_t ast_t;
 #define AST_BSD                 0x80
 #define AST_KPERF               0x100   /* kernel profiling */
 #define AST_MACF                0x200   /* MACF user ret pending */
-/* 0x400, 0x800 unused */
+#define AST_RESET_PCS           0x400   /* restartable ranges */
+#define AST_ARCADE              0x800   /* arcade subscription support */
 #define AST_GUARD               0x1000
 #define AST_TELEMETRY_USER      0x2000  /* telemetry sample requested on interrupt from userspace */
 #define AST_TELEMETRY_KERNEL    0x4000  /* telemetry sample requested on interrupt from kernel */
@@ -140,7 +141,8 @@ typedef uint32_t ast_t;
                AST_TELEMETRY_PMI | AST_TELEMETRY_IO)
 
 /* Per-thread ASTs follow the thread at context-switch time. */
-#define AST_PER_THREAD  (AST_APC | AST_BSD | AST_MACF | AST_LEDGER | AST_GUARD | AST_TELEMETRY_ALL | AST_KEVENT)
+#define AST_PER_THREAD  (AST_APC | AST_BSD | AST_MACF | AST_RESET_PCS | \
+       AST_ARCADE | AST_LEDGER | AST_GUARD | AST_TELEMETRY_ALL | AST_KEVENT)
 
 /* Handle AST_URGENT detected while in the kernel */
 extern void ast_taken_kernel(void);
@@ -180,8 +182,8 @@ extern void ast_propagate(thread_t thread);
  *
  *     See act_set_ast() for an example.
  */
-#define thread_ast_set(act, reason)     (hw_atomic_or_noret(&(act)->ast, (reason)))
-#define thread_ast_clear(act, reason)   (hw_atomic_and_noret(&(act)->ast, ~(reason)))
+#define thread_ast_set(act, reason)     ((void)os_atomic_or(&(act)->ast, (reason), relaxed))
+#define thread_ast_clear(act, reason)   ((void)os_atomic_andnot(&(act)->ast, (reason), relaxed))
 
 #ifdef MACH_BSD
 
@@ -197,5 +199,7 @@ extern void dtrace_ast(void);
 
 extern void kevent_ast(thread_t thread, uint16_t bits);
 extern void act_set_astkevent(thread_t thread, uint16_t bits);
+extern uint16_t act_clear_astkevent(thread_t thread, uint16_t bits);
+extern void act_set_ast_reset_pcs(thread_t thread);
 
 #endif  /* _KERN_AST_H_ */
index d67c3edaf13904988d8dd6c34b482cb8c3b6d4d1..eb51597a266d78fb5262de5030ef0ee0f381e70b 100644 (file)
 ipc_port_t
 audit_session_mksend(struct auditinfo_addr *aia_p, ipc_port_t *sessionport)
 {
-       ipc_port_t sendport = IPC_PORT_NULL;
-       ipc_port_t port;
-
-       /*
-        * If we don't have an existing session port, then create one.
-        */
-       port = *sessionport;
-       if (!IP_VALID(port)) {
-               ipc_port_t new_port = ipc_port_alloc_kernel();
-               if (!IP_VALID(new_port)) {
-                       return new_port;
-               }
-               ipc_kobject_set(new_port, (ipc_kobject_t)aia_p, IKOT_AU_SESSIONPORT);
-               if (!OSCompareAndSwapPtr(port, new_port, sessionport)) {
-                       ipc_port_dealloc_kernel(new_port);
-               }
-               port = *sessionport;
-       }
-
-       assert(ip_active(port) && IKOT_AU_SESSIONPORT == ip_kotype(port));
-       sendport = ipc_port_make_send(port);
-
-       /*
-        * If we don't have a no-senders notification outstanding against
-        * the port, take a reference on the session and request one.
-        */
-       if (IP_NULL == port->ip_nsrequest) {
-               ipc_port_t notifyport;
-
-               audit_session_aiaref(aia_p);
-
-
-               ip_lock(port);
-               /* Need a send-once right for the target of the notification */
-               notifyport = ipc_port_make_sonce_locked(port);
-               /* Request a no-senders notification (at the new make-send threshold) */
-               ipc_port_nsrequest(port, port->ip_mscount, notifyport, &notifyport);
-               /* port unlocked */
-
-               if (IP_NULL != notifyport) {
-                       /* race requesting notification */
-                       audit_session_aiaunref(aia_p);
-                       ipc_port_release_sonce(notifyport);
-               }
+       audit_session_aiaref(aia_p);
+       if (!ipc_kobject_make_send_lazy_alloc_port(sessionport,
+           (ipc_kobject_t)aia_p, IKOT_AU_SESSIONPORT)) {
+               audit_session_aiaunref(aia_p);
        }
 
-       return sendport;
+       return *sessionport;
 }
 
 
@@ -129,7 +89,7 @@ audit_session_porttoaia(ipc_port_t port)
        if (IP_VALID(port)) {
                ip_lock(port);
                if (IKOT_AU_SESSIONPORT == ip_kotype(port)) {
-                       assert(ip_active(port));
+                       require_ip_active(port);
                        aia_p = (struct auditinfo_addr *)port->ip_kobject;
                }
                ip_unlock(port);
@@ -147,53 +107,21 @@ audit_session_porttoaia(ipc_port_t port)
  * Parameters: msg             A Mach no-senders notification message.
  *
  * Notes: It is possible that new send rights are created after a
- *       no-senders notification has been sent (i.e. via audit_session_mksend).
- *       We check the port's mscount against the notification's not_count
- *       to detect when this happens, and re-arm the notification in that
- *       case.
- *
- *       In the normal case (no new senders), we first mark the port
- *       as dying by setting its object type to IKOT_NONE so that
- *       audit_session_mksend will no longer use it to create
- *       additional send rights.  We can then safely call
- *       audit_session_port_destroy with no locks.
+ *       no-senders notification has been sent, but they will be protected
+ *       by another aia reference.
  */
 void
 audit_session_nosenders(mach_msg_header_t *msg)
 {
        mach_no_senders_notification_t *notification = (void *)msg;
        ipc_port_t port = notification->not_header.msgh_remote_port;
-       ipc_port_t notifyport;
        struct auditinfo_addr *port_aia_p = NULL;
 
+       require_ip_active(port);
        assert(IKOT_AU_SESSIONPORT == ip_kotype(port));
-       ip_lock(port);
-       assert(ip_active(port));
        port_aia_p = (struct auditinfo_addr *)port->ip_kobject;
        assert(NULL != port_aia_p);
 
-       /*
-        * if new send rights have been made since the last notify
-        * request, re-arm the notification with the new threshold.
-        */
-       if (port->ip_mscount > notification->not_count) {
-               notifyport = ipc_port_make_sonce_locked(port);
-               ipc_port_nsrequest(port, port->ip_mscount, notifyport, &notifyport);
-               /* port unlocked */
-
-               if (IP_NULL != notifyport) {
-                       /* race re-arming the notification */
-                       ipc_port_release_sonce(notifyport);
-                       audit_session_aiaunref(port_aia_p);
-               }
-               return;
-       }
-
-       /*
-        * Otherwise, no more extant send rights, so release the
-        * reference held on the session by those send rights.
-        */
-       ip_unlock(port);
        audit_session_aiaunref(port_aia_p);
 }
 
@@ -203,7 +131,7 @@ audit_session_portdestroy(ipc_port_t *sessionport)
        ipc_port_t port = *sessionport;
 
        if (IP_VALID(port)) {
-               assert(ip_active(port));
+               require_ip_active(port);
                assert(IKOT_AU_SESSIONPORT == ip_kotype(port));
                ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE);
                ipc_port_dealloc_kernel(port);
index d99787543c67b1e0eb7dae048d2dd7135cc4967b..82daadce69ac429ec5bdac663b793653e76029ec 100644 (file)
 #include <arm/cpu_data_internal.h>
 #endif
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif
 
 
-uint32_t __attribute__((noinline))
-backtrace(uintptr_t *bt, uint32_t max_frames)
+unsigned int __attribute__((noinline))
+backtrace(uintptr_t *bt, unsigned int max_frames, bool *was_truncated_out)
 {
-       return backtrace_frame(bt, max_frames, __builtin_frame_address(0));
+       return backtrace_frame(bt, max_frames, __builtin_frame_address(0),
+           was_truncated_out);
 }
 
 /*
@@ -57,12 +61,13 @@ backtrace(uintptr_t *bt, uint32_t max_frames)
  * inlined, it doesn't record the frame of the function it's inside (because
  * there's no stack frame).
  */
-uint32_t __attribute__((noinline, not_tail_called))
-backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame)
+unsigned int __attribute__((noinline, not_tail_called))
+backtrace_frame(uintptr_t *bt, unsigned int max_frames, void *start_frame,
+    bool *was_truncated_out)
 {
        thread_t thread = current_thread();
        uintptr_t *fp;
-       uint32_t frame_index = 0;
+       unsigned int frame_index = 0;
        uintptr_t top, bottom;
        bool in_valid_stack;
 
@@ -98,7 +103,12 @@ backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame)
                        break;
                }
 
+#if defined(HAS_APPLE_PAC)
+               /* return addresses signed by arm64e ABI */
+               bt[frame_index++] = (uintptr_t) ptrauth_strip((void *)ret_addr, ptrauth_key_return_address);
+#else /* defined(HAS_APPLE_PAC) */
                bt[frame_index++] = ret_addr;
+#endif /* !defined(HAS_APPLE_PAC) */
 
                /* stacks grow down; backtracing should be moving to higher addresses */
                if (next_fp <= fp) {
@@ -107,6 +117,15 @@ backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame)
                fp = next_fp;
        }
 
+       /* NULL-terminate the list, if space is available */
+       if (frame_index != max_frames) {
+               bt[frame_index] = 0;
+       }
+
+       if (fp != NULL && frame_index == max_frames && was_truncated_out) {
+               *was_truncated_out = true;
+       }
+
        return frame_index;
 #undef IN_STK_BOUNDS
 }
@@ -197,8 +216,9 @@ interrupted_kernel_pc_fp(uintptr_t *pc, uintptr_t *fp)
 #error "interrupted_kernel_pc_fp: unsupported architecture"
 #endif /* !defined(__arm__) */
 
-uint32_t
-backtrace_interrupted(uintptr_t *bt, uint32_t max_frames)
+unsigned int
+backtrace_interrupted(uintptr_t *bt, unsigned int max_frames,
+    bool *was_truncated_out)
 {
        uintptr_t pc;
        uintptr_t fp;
@@ -218,32 +238,32 @@ backtrace_interrupted(uintptr_t *bt, uint32_t max_frames)
                return 1;
        }
 
-       return backtrace_frame(bt + 1, max_frames - 1, (void *)fp) + 1;
+       return backtrace_frame(bt + 1, max_frames - 1, (void *)fp,
+           was_truncated_out) + 1;
 }
 
 int
-backtrace_user(uintptr_t *bt, uint32_t max_frames, uint32_t *frames_out,
-    bool *user_64_out)
+backtrace_user(uintptr_t *bt, unsigned int max_frames,
+    unsigned int *frames_out, bool *user_64_out, bool *was_truncated_out)
 {
-       return backtrace_thread_user(current_thread(), bt, max_frames, frames_out,
-                  user_64_out);
+       return backtrace_thread_user(current_thread(), bt, max_frames,
+           frames_out, user_64_out, was_truncated_out);
 }
 
 int
-backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames,
-    uint32_t *frames_out, bool *user_64_out)
+backtrace_thread_user(void *thread, uintptr_t *bt, unsigned int max_frames,
+    unsigned int *frames_out, bool *user_64_out, bool *was_truncated_out)
 {
        bool user_64;
-       uintptr_t pc, fp, next_fp;
+       uintptr_t pc = 0, fp = 0, next_fp = 0;
        vm_map_t map = NULL, old_map = NULL;
-       uint32_t frame_index = 0;
+       unsigned int frame_index = 0;
        int err = 0;
-       size_t frame_size;
+       size_t frame_size = 0;
 
        assert(bt != NULL);
        assert(max_frames > 0);
        assert(frames_out != NULL);
-       assert(user_64_out != NULL);
 
 #if defined(__x86_64__)
 
@@ -297,10 +317,6 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames,
 #error "backtrace_thread_user: unsupported architecture"
 #endif /* !defined(__arm__) */
 
-       if (max_frames == 0) {
-               goto out;
-       }
-
        bt[frame_index++] = pc;
 
        if (frame_index >= max_frames) {
@@ -327,7 +343,7 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames,
                } u32;
        } frame;
 
-       frame_size = 2 * (user_64 ? sizeof(uint64_t) : sizeof(uint32_t));
+       frame_size = 2 * (user_64 ? 8 : 4);
 
        /* switch to the correct map, for copyin */
        if (thread != current_thread()) {
@@ -343,6 +359,9 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames,
        while (fp != 0 && frame_index < max_frames) {
                err = copyin(fp, (char *)&frame, frame_size);
                if (err) {
+                       if (was_truncated_out) {
+                               *was_truncated_out = true;
+                       }
                        goto out;
                }
 
@@ -353,7 +372,13 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames,
                }
 
                uintptr_t ret_addr = user_64 ? frame.u64.ret : frame.u32.ret;
+#if defined(HAS_APPLE_PAC)
+               /* return addresses signed by arm64e ABI */
+               bt[frame_index++] = (uintptr_t)ptrauth_strip((void *)ret_addr,
+                   ptrauth_key_return_address);
+#else /* defined(HAS_APPLE_PAC) */
                bt[frame_index++] = ret_addr;
+#endif /* !defined(HAS_APPLE_PAC) */
 
                /* stacks grow down; backtracing should be moving to higher addresses */
                if (next_fp <= fp) {
@@ -368,7 +393,19 @@ out:
                vm_map_deallocate(map);
        }
 
-       *user_64_out = user_64;
+       /* NULL-terminate the list, if space is available */
+       if (frame_index != max_frames) {
+               bt[frame_index] = 0;
+       }
+
+       if (fp != 0 && frame_index == max_frames && was_truncated_out) {
+               *was_truncated_out = true;
+       }
+
+       if (user_64_out) {
+               *user_64_out = user_64;
+       }
+
        *frames_out = frame_index;
        return err;
 #undef INVALID_USER_FP
index 8bdafcddb96073cc0947a82b4277f2cff02486ed..8b56b26df075a4334f0a06f63f2d11fe184d491f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -26,8 +26,8 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#ifndef BACKTRACE_H
-#define BACKTRACE_H
+#ifndef KERN_BACKTRACE_H
+#define KERN_BACKTRACE_H
 
 #include <stdbool.h>
 #include <stdint.h>
 
 __BEGIN_DECLS
 
-/*
- * Backtrace the current thread, storing up to max_frames return addresses in
- * bt.  Returns the number of return addresses stored.
+/*!
+ * @function backtrace
+ *
+ * @abstract backtrace the current thread's kernel stack
+ *
+ * @discussion Backtrace the kernel stack of the current thread, storing up
+ * to btlen return addresses in bt.  Returns the number of return addresses
+ * stored and sets was_truncated to true if it is non-NULL and the backtrace was
+ * truncated to fit in the provided space.  The backtrace starts at the calling
+ * function.  A zero will be stored after the return addresses in the buffer,
+ * if space allows.
+ *
+ * @param bt Clients must provide a buffer in which to store the return
+ * addresses.
+ *
+ * @param btlen Along with the buffer, its length (in terms of uintptr_t) must
+ * also be provided.
+ *
+ * @param was_truncated Optionally, clients can provide a boolean out-parameter
+ * that will be set to true if the backtrace was truncated due to a lack of
+ * buffer space.
+ *
+ * @return The number of return addresses written to bt is returned.  The
+ * function cannot return an error.
  */
-uint32_t backtrace(uintptr_t *bt, uint32_t max_frames)
+unsigned int backtrace(uintptr_t *bt, unsigned int btlen, bool *was_truncated)
 __attribute__((noinline));
 
-/*
- * Backtrace the current thread starting at the frame pointer start_fp, storing
- * up to max_frames return addresses in bt.  Returns the number of return
- * addresses stored.
+/*!
+ * @function backtrace_frame
+ *
+ * @abstract backtrace the current thread's kernel stack from a frame pointer
+ *
+ * @discussion Backtrace the kernel stack of the current thread from the given
+ * frame pointer startfp, storing up to btlen return addresses in bt.  Returns
+ * the number of return addresses written and sets was_truncated to true if it is
+ * non-NULL and the backtrace was truncated to fit in the provided space.  The
+ * frame pointer provided must point to a valid frame on the current thread's
+ * stack.
+ *
+ * @param startfp The frame pointer to start backtracing from is required, and
+ * must point to a valid frame on the current thread's stack.
+ *
+ * @seealso backtrace
  */
-uint32_t backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame)
+unsigned int backtrace_frame(uintptr_t *bt, unsigned int btlen, void *startfp,
+    bool *was_truncated)
 __attribute__((noinline, not_tail_called));
 
-/*
- * Backtrace the kernel stack of the context that was interrupted, storing up
- * to max_frames return addresses in bt.  Returns 0 on success, and non-zero
- * otherwise.  On success, the number of frames written is stored at the value
- * pointed to by frames_out.
+/*!
+ * @function backtrace_interrupted
+ *
+ * @abstract backtrace the interrupted context
+ *
+ * @discussion Backtrace the kernel stack of the interrupted thread, storing up
+ * to btlen return addresses in bt.  This function must be called from interrupt
+ * context.
  *
- * Must be called from interrupt context.
+ * @seealso backtrace
  */
-uint32_t backtrace_interrupted(uintptr_t *bt, uint32_t max_frames);
+unsigned int backtrace_interrupted(uintptr_t *bt, unsigned int btlen,
+    bool *was_truncated);
 
-/*
- * Backtrace the user stack of the current thread, storing up to max_frames
- * return addresses in bt.  Returns 0 on success, and non-zero otherwise.  On
- * success, the number of frames written is stored at the value pointed to by
- * frames_out and the value pointed to by user_64_out is set true if the user
- * space thread was running in 64-bit mode, and false otherwise.
+/*!
+ * @function backtrace_user
+ *
+ * @abstract backtrace the current thread's user space stack
+ *
+ * @discussion Backtrace the user stack of the current thread, storing up to
+ * btlen return addresses in bt.  This function cannot be called on a kernel
+ * thread, nor can it be called from interrupt context or with interrupts
+ * disabled.
  *
- * Must not be called from interrupt context or with interrupts disabled.
+ * @param btwritten On success, the number of return addresses written is stored
+ * here.
+ *
+ * @param user64 On success, true is stored here if user space was running in
+ * 64-bit mode, and false is stored otherwise.
+ *
+ * @return Returns 0 on success and an errno value on error.
+ *
+ * @seealso backtrace
  */
-int backtrace_user(uintptr_t *bt, uint32_t max_frames, uint32_t *frames_out,
-    bool *user_64_out);
+int backtrace_user(uintptr_t *bt, unsigned int btlen, unsigned int *btwritten,
+    bool *user64, bool *was_truncated);
 
 /*
- * Backtrace the user stack of the given thread, storing up to max_frames return
- * addresses in bt.  Returns 0 on success, and non-zero otherwise.  On success,
- * the number of frames written is stored at the value pointed to by frames_out
- * and the value pointed to by user_64_out is set true if the user space thread
- * was running in 64-bit mode, and false otherwise.
+ * @function backtrace_thread_user
+ *
+ * @abstract backtrace a given thread's user space stack
+ *
+ * @discussion Backtrace the user stack of the given thread, storing up to btlen
+ * return addresses in bt.  This function cannot be called on a kernel thread,
+ * nor can it be called from interrupt context or with interrupts disabled.
+ *
+ * @param thread The user thread to backtrace is required.
  *
- * Must not be called from interrupt context or with interrupts disabled.
+ * @see backtrace_user
  */
-int backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames,
-    uint32_t *frames_out, bool *user_64_out);
+int backtrace_thread_user(void *thread, uintptr_t *bt, unsigned int btlen,
+    unsigned int *btwritten, bool *user64, bool *was_truncated);
 
 __END_DECLS
 
-#endif /* !defined(BACKTRACE_H) */
+#endif /* !defined(KERN_BACKTRACE_H) */
index 47db873e12ad20ff229a131c445769839c261c59..00dbc4b78de8eb6564632ff3447a3d31fbec4dcf 100644 (file)
@@ -31,6 +31,7 @@
 #ifndef __BITS_H__
 #define __BITS_H__
 
+#include <kern/assert.h>
 #include <kern/kalloc.h>
 #include <stdbool.h>
 #include <stdint.h>
index 7f351fe989c1d10e325ecc2b26a0dc12b6d2340f..25fb8477e48223737aaa8af4eadabd089df90db9 100644 (file)
@@ -48,6 +48,7 @@ typedef enum thread_snapshot_wait_flags {
        kThreadWaitParkedWorkQueue      = 0x0f,
        kThreadWaitWorkloopSyncWait     = 0x10,
        kThreadWaitOnProcess            = 0x11,
+       kThreadWaitSleepWithInheritor   = 0x12,
 } __attribute__((packed)) block_hint_t;
 
 _Static_assert(sizeof(block_hint_t) <= sizeof(short),
@@ -70,6 +71,8 @@ extern void kdp_pthread_find_owner(thread_t thread, thread_waitinfo_t *waitinfo)
 extern void *kdp_pthread_get_thread_kwq(thread_t thread);
 extern void kdp_workloop_sync_wait_find_owner(thread_t thread, event64_t event, thread_waitinfo_t *waitinfo);
 extern void kdp_wait4_find_process(thread_t thread, event64_t event, thread_waitinfo_t *waitinfo);
+extern void kdp_sleep_with_inheritor_find_owner(struct waitq * waitq, __unused event64_t event, thread_waitinfo_t * waitinfo);
+extern void kdp_turnstile_fill_tsinfo(struct turnstile *ts, thread_turnstileinfo_t *tsinfo);
 
 #endif /* XNU_KERNEL_PRIVATE */
 
index add2c1d51e0ea5d2fbada9ea75c4039b0932b7ea..e89c8cfb8378e97b6453115fc9fb1e1f107e6503 100644 (file)
@@ -44,6 +44,7 @@
 #include <vm/vm_protos.h> /* last */
 #include <sys/resource.h>
 #include <sys/signal.h>
+#include <sys/errno.h>
 
 #if MONOTONIC
 #include <kern/monotonic.h>
@@ -51,6 +52,7 @@
 #endif /* MONOTONIC */
 
 #include <machine/limits.h>
+#include <sys/codesign.h> /* CS_CDHASH_LEN */
 
 #undef thread_should_halt
 
@@ -68,13 +70,15 @@ int fill_task_rusage(task_t task, rusage_info_current *ri);
 int fill_task_io_rusage(task_t task, rusage_info_current *ri);
 int fill_task_qos_rusage(task_t task, rusage_info_current *ri);
 void fill_task_monotonic_rusage(task_t task, rusage_info_current *ri);
-uint64_t get_task_logical_writes(task_t task);
+uint64_t get_task_logical_writes(task_t task, boolean_t external);
 void fill_task_billed_usage(task_t task, rusage_info_current *ri);
 void task_bsdtask_kill(task_t);
 
 extern uint64_t get_dispatchqueue_serialno_offset_from_proc(void *p);
+extern uint64_t get_dispatchqueue_label_offset_from_proc(void *p);
 extern uint64_t proc_uniqueid(void *p);
 extern int proc_pidversion(void *p);
+extern int proc_getcdhash(void *p, char *cdhash);
 
 #if MACH_BSD
 extern void psignal(void *, int);
@@ -124,6 +128,20 @@ get_bsdthread_info(thread_t th)
        return th->uthread;
 }
 
+/*
+ * This is used to remember any FS error from VNOP_PAGEIN code when
+ * invoked under vm_fault(). The value is an errno-style value. It can
+ * be retrieved by exception handlers using thread_get_state().
+ */
+void
+set_thread_pagein_error(thread_t th, int error)
+{
+       assert(th == current_thread());
+       if (error == 0 || th->t_pagein_error == 0) {
+               th->t_pagein_error = error;
+       }
+}
+
 #if defined(__x86_64__)
 /*
  * Returns non-zero if the thread has a non-NULL task
@@ -313,23 +331,6 @@ get_task_ipcspace(task_t t)
        return t->itk_space;
 }
 
-int
-get_task_numactivethreads(task_t task)
-{
-       thread_t        inc;
-       int num_active_thr = 0;
-       task_lock(task);
-
-       for (inc  = (thread_t)(void *)queue_first(&task->threads);
-           !queue_end(&task->threads, (queue_entry_t)inc); inc = (thread_t)(void *)queue_next(&inc->task_threads)) {
-               if (inc->active) {
-                       num_active_thr++;
-               }
-       }
-       task_unlock(task);
-       return num_active_thr;
-}
-
 int
 get_task_numacts(task_t t)
 {
@@ -689,6 +690,18 @@ get_task_cpu_time(task_t task)
        return 0;
 }
 
+uint32_t
+get_task_loadTag(task_t task)
+{
+       return os_atomic_load(&task->loadTag, relaxed);
+}
+
+uint32_t
+set_task_loadTag(task_t task, uint32_t loadTag)
+{
+       return os_atomic_xchg(&task->loadTag, loadTag, relaxed);
+}
+
 /*
  *
  */
@@ -1007,8 +1020,8 @@ fill_taskthreadinfo(task_t task, uint64_t thaddr, bool thuniqueid, struct proc_t
                                err = 1;
                                goto out;
                        }
-                       ptinfo->pth_user_time = ((basic_info.user_time.seconds * (integer_t)NSEC_PER_SEC) + (basic_info.user_time.microseconds * (integer_t)NSEC_PER_USEC));
-                       ptinfo->pth_system_time = ((basic_info.system_time.seconds * (integer_t)NSEC_PER_SEC) + (basic_info.system_time.microseconds * (integer_t)NSEC_PER_USEC));
+                       ptinfo->pth_user_time = (((uint64_t)basic_info.user_time.seconds * NSEC_PER_SEC) + ((uint64_t)basic_info.user_time.microseconds * NSEC_PER_USEC));
+                       ptinfo->pth_system_time = (((uint64_t)basic_info.system_time.seconds * NSEC_PER_SEC) + ((uint64_t)basic_info.system_time.microseconds * NSEC_PER_USEC));
 
                        ptinfo->pth_cpu_usage = basic_info.cpu_usage;
                        ptinfo->pth_policy = basic_info.policy;
@@ -1078,14 +1091,17 @@ fill_task_rusage(task_t task, rusage_info_current *ri)
 {
        struct task_power_info powerinfo;
 
+       uint64_t runnable_time = 0;
+
        assert(task != TASK_NULL);
        task_lock(task);
 
-       task_power_info_locked(task, &powerinfo, NULL, NULL);
+       task_power_info_locked(task, &powerinfo, NULL, NULL, &runnable_time);
        ri->ri_pkg_idle_wkups = powerinfo.task_platform_idle_wakeups;
        ri->ri_interrupt_wkups = powerinfo.task_interrupt_wakeups;
        ri->ri_user_time = powerinfo.total_user;
        ri->ri_system_time = powerinfo.total_system;
+       ri->ri_runnable_time = runnable_time;
 
        ledger_get_balance(task->ledger, task_ledgers.phys_footprint,
            (ledger_amount_t *)&ri->ri_phys_footprint);
@@ -1175,12 +1191,19 @@ fill_task_monotonic_rusage(task_t task, rusage_info_current *ri)
 }
 
 uint64_t
-get_task_logical_writes(task_t task)
+get_task_logical_writes(task_t task, boolean_t external)
 {
        assert(task != TASK_NULL);
        struct ledger_entry_info lei;
 
        task_lock(task);
+
+       if (external == FALSE) {
+               ledger_get_entry_info(task->ledger, task_ledgers.logical_writes, &lei);
+       } else {
+               ledger_get_entry_info(task->ledger, task_ledgers.logical_writes_to_external, &lei);
+       }
+
-       ledger_get_entry_info(task->ledger, task_ledgers.logical_writes, &lei);
 
        task_unlock(task);
@@ -1199,6 +1222,18 @@ get_task_dispatchqueue_serialno_offset(task_t task)
        return dq_serialno_offset;
 }
 
+uint64_t
+get_task_dispatchqueue_label_offset(task_t task)
+{
+       uint64_t dq_label_offset = 0;
+
+       if (task->bsd_info) {
+               dq_label_offset = get_dispatchqueue_label_offset_from_proc(task->bsd_info);
+       }
+
+       return dq_label_offset;
+}
+
 uint64_t
 get_task_uniqueid(task_t task)
 {
@@ -1226,3 +1261,37 @@ get_task_crash_label(task_t task)
        return task->crash_label;
 }
 #endif
+
+int
+fill_taskipctableinfo(task_t task, uint32_t *table_size, uint32_t *table_free)
+{
+       ipc_space_t space = task->itk_space;
+       if (space == NULL) {
+               return -1;
+       }
+
+       is_read_lock(space);
+       if (!is_active(space)) {
+               is_read_unlock(space);
+               return -1;
+       }
+
+       *table_size = space->is_table_size;
+       *table_free = space->is_table_free;
+
+       is_read_unlock(space);
+
+       return 0;
+}
+
+int
+get_task_cdhash(task_t task, char cdhash[static CS_CDHASH_LEN])
+{
+       int result = 0;
+
+       task_lock(task);
+       result = task->bsd_info ? proc_getcdhash(task->bsd_info, cdhash) : ESRCH;
+       task_unlock(task);
+
+       return result;
+}
index 584be02cf5532d1a6115c32522761a03cead0b64..93f6e31176d033e5563613aa9580eed05c5f6841 100644 (file)
@@ -157,7 +157,6 @@ lookup_btrecord_byhash(btlog_t *btlog, uint32_t md5_hash, void *bt[], size_t btc
        recindex = btlog->head;
        record = lookup_btrecord(btlog, recindex);
        while (recindex != BTLOG_RECORDINDEX_NONE) {
-               assert(record->bthash);
                assert(!TAILQ_EMPTY(&record->element_record_queue));
                if (record->bthash == md5_hash) {
                        /*
@@ -677,8 +676,6 @@ retry:
        hashidx = calculate_hashidx_for_element((uintptr_t)element, btlog);
        hashelem = btlog_get_elem_from_freelist(btlog);
 
-       assert(record->bthash);
-
        hashelem->elem = ~((uintptr_t)element);
        hashelem->operation = record->operation;
        hashelem->recindex = recindex;
diff --git a/osfmk/kern/circle_queue.h b/osfmk/kern/circle_queue.h
new file mode 100644 (file)
index 0000000..4ec2af2
--- /dev/null
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_CIRCLE_QUEUE_H_
+#define _KERN_CIRCLE_QUEUE_H_
+
+#include <kern/queue.h>
+#include <kern/assert.h>
+
+__BEGIN_DECLS
+
+/*
+ * Circle Queue Management APIs
+ *
+ * These are similar to the queues from queue.h,
+ * but the circle queue head is a single pointer to the first element
+ * of the queue.
+ */
+
+typedef struct circle_queue_head {
+       queue_entry_t head;
+} circle_queue_head_t, *circle_queue_t;
+
+static inline bool
+circle_queue_empty(circle_queue_t cq)
+{
+       return cq->head == NULL;
+}
+
+static inline queue_entry_t
+circle_queue_first(circle_queue_t cq)
+{
+       return cq->head;
+}
+
+static inline queue_entry_t
+circle_queue_last(circle_queue_t cq)
+{
+       queue_entry_t elt = circle_queue_first(cq);
+       if (elt) {
+               __builtin_assume(elt->prev != NULL);
+               return elt->prev;
+       }
+       return NULL;
+}
+
+static inline queue_entry_t
+circle_queue_next(circle_queue_t cq, queue_entry_t elt)
+{
+       return elt->next == cq->head ? NULL : elt->next;
+}
+
+static inline size_t
+circle_queue_length(circle_queue_t cq)
+{
+       queue_entry_t elt = circle_queue_first(cq);
+       size_t n = 0;
+
+       for (; elt; elt = circle_queue_next(cq, elt)) {
+               n++;
+       }
+       return n;
+}
+
+static inline void
+circle_enqueue_tail(circle_queue_t cq, queue_entry_t elt)
+{
+       queue_entry_t head = circle_queue_first(cq);
+       queue_entry_t tail = circle_queue_last(cq);
+
+       if (head == NULL) {
+               cq->head = elt->next = elt->prev = elt;
+       } else {
+               elt->next = head;
+               elt->prev = tail;
+               tail->next = elt;
+               head->prev = elt;
+       }
+}
+
+static inline void
+circle_enqueue_head(circle_queue_t cq, queue_entry_t elt)
+{
+       circle_enqueue_tail(cq, elt);
+       cq->head = elt;
+}
+
+static inline void
+circle_dequeue(circle_queue_t cq, queue_entry_t elt)
+{
+       queue_entry_t elt_prev = elt->prev;
+       queue_entry_t elt_next = elt->next;
+
+       if (elt == elt_next) {
+               assert(cq->head == elt);
+               cq->head = NULL;
+       } else {
+               elt_prev->next = elt_next;
+               elt_next->prev = elt_prev;
+               if (cq->head == elt) {
+                       cq->head = elt_next;
+               }
+       }
+       __DEQUEUE_ELT_CLEANUP(elt);
+}
+
+static inline queue_entry_t
+circle_dequeue_head(circle_queue_t cq)
+{
+       queue_entry_t elt = circle_queue_first(cq);
+       if (elt) {
+               circle_dequeue(cq, elt);
+       }
+       return elt;
+}
+
+static inline queue_entry_t
+circle_dequeue_tail(circle_queue_t cq)
+{
+       queue_entry_t elt = circle_queue_last(cq);
+       if (elt) {
+               circle_dequeue(cq, elt);
+       }
+       return elt;
+}
+
+/*
+ *     Macro:          cqe_element
+ *     Function:
+ *             Convert a circle_queue_entry_t pointer to a queue element pointer.
+ *             Get a pointer to the user-defined element containing
+ *             a given circle_queue_entry_t
+ *     Header:
+ *             <type> * cqe_element(circle_queue_entry_t qe, <type>, field)
+ *                     qe      - queue entry to convert
+ *                     <type>  - what's in the queue (e.g., struct some_data)
+ *                     <field> - is the chain field in <type>
+ *     Note:
+ *             Do not use pointer types for <type>
+ */
+#define cqe_element(qe, type, field) __container_of(qe, type, field)
+
+/*
+ *     Macro:          cqe_foreach
+ *     Function:
+ *             Iterate over each queue_entry_t structure.
+ *             Generates a 'for' loop, setting 'qe' to
+ *             each queue_entry_t in the queue.
+ *     Header:
+ *             cqe_foreach(queue_entry_t qe, circle_queue_t head)
+ *                     qe   - iteration variable
+ *                     head - pointer to circle_queue_head_t (head of queue)
+ *     Note:
+ *             This should only be used with Method 1 queue iteration (linkage chains)
+ */
+#define cqe_foreach(qe, head) \
+       for (qe = circle_queue_first(head); qe; qe = circle_queue_next(head, qe))
+
+/*
+ *     Macro:          cqe_foreach_safe
+ *     Function:
+ *             Safely iterate over each queue_entry_t structure.
+ *
+ *             Use this iterator macro if you plan to remove the
+ *             queue_entry_t, qe, from the queue during the
+ *             iteration.
+ *     Header:
+ *             cqe_foreach_safe(queue_entry_t qe, circle_queue_t head)
+ *                     qe   - iteration variable
+ *                     head - pointer to circle_queue_head_t (head of queue)
+ *     Note:
+ *             This should only be used with Method 1 queue iteration (linkage chains)
+ */
+#define cqe_foreach_safe(qe, head) \
+       for (queue_entry_t _ne, _qe = circle_queue_first(head); \
+            (qe = _qe) && (_ne = circle_queue_next(head, _qe), 1); \
+            _qe = _ne)
+
+/*
+ *     Macro:          cqe_foreach_element
+ *     Function:
+ *             Iterate over each _element_ in a queue
+ *             where each queue_entry_t points to another
+ *             queue_entry_t, i.e., managed by the [de|en]queue_head/
+ *             [de|en]queue_tail / remqueue / etc. function.
+ *     Header:
+ *             cqe_foreach_element(<type> *elt, circle_queue_t head, <field>)
+ *                     elt     - iteration variable
+ *                     <type>  - what's in the queue (e.g., struct some_data)
+ *                     <field> - is the chain field in <type>
+ *     Note:
+ *             This should only be used with Method 1 queue iteration (linkage chains)
+ */
+#define cqe_foreach_element(elt, head, field) \
+       for (queue_entry_t _qe = circle_queue_first(head); \
+            _qe && (elt = cqe_element(_qe, typeof(*(elt)), field), 1); \
+            _qe = circle_queue_next(head, _qe))
+
+/*
+ *     Macro:          cqe_foreach_element_safe
+ *     Function:
+ *             Safely iterate over each _element_ in a queue
+ *             where each queue_entry_t points to another
+ *             queue_entry_t, i.e., managed by the [de|en]queue_head/
+ *             [de|en]queue_tail / remqueue / etc. function.
+ *
+ *             Use this iterator macro if you plan to remove the
+ *             element, elt, from the queue during the iteration.
+ *     Header:
+ *             cqe_foreach_element_safe(<type> *elt, circle_queue_t head, <field>)
+ *                     elt     - iteration variable
+ *                     <type>  - what's in the queue (e.g., struct some_data)
+ *                     <field> - is the chain field in <type>
+ *     Note:
+ *             This should only be used with Method 1 queue iteration (linkage chains)
+ */
+#define cqe_foreach_element_safe(elt, head, field) \
+       for (queue_entry_t _ne, _qe = circle_queue_first(head); \
+            _qe && (elt = cqe_element(_qe, typeof(*(elt)), field), \
+            _ne = circle_queue_next(head, _qe), 1); \
+            _qe = _ne)
+
+/* Dequeue an element from head, or return NULL if the queue is empty */
+#define cqe_dequeue_head(head, type, field) ({ \
+       queue_entry_t _tmp_entry = circle_dequeue_head((head)); \
+       type *_tmp_element = (type*) NULL; \
+       if (_tmp_entry != (queue_entry_t) NULL) \
+               _tmp_element = cqe_element(_tmp_entry, type, field); \
+       _tmp_element; \
+})
+
+/* Dequeue an element from tail, or return NULL if the queue is empty */
+#define cqe_dequeue_tail(head, type, field) ({ \
+       queue_entry_t _tmp_entry = circle_dequeue_tail((head)); \
+       type *_tmp_element = (type*) NULL; \
+       if (_tmp_entry != (queue_entry_t) NULL) \
+               _tmp_element = cqe_element(_tmp_entry, type, field); \
+       _tmp_element; \
+})
+
+/* Peek at the first element, or return NULL if the queue is empty */
+#define cqe_queue_first(head, type, field) ({ \
+       queue_entry_t _tmp_entry = circle_queue_first((head)); \
+       type *_tmp_element = (type*) NULL; \
+       if (_tmp_entry != (queue_entry_t) NULL) \
+               _tmp_element = cqe_element(_tmp_entry, type, field); \
+       _tmp_element; \
+})
+
+/* Peek at the next element, or return NULL if it is last */
+#define cqe_queue_next(elt, head, type, field) ({ \
+       queue_entry_t _tmp_entry = circle_queue_next((head), (elt)); \
+       type *_tmp_element = (type*) NULL; \
+       if (_tmp_entry != (queue_entry_t) NULL) \
+               _tmp_element = cqe_element(_tmp_entry, type, field); \
+       _tmp_element; \
+})
+
+/* Peek at the tail element, or return NULL if the queue is empty */
+#define cqe_queue_last(head, type, field) ({ \
+       queue_entry_t _tmp_entry = circle_queue_last((head)); \
+       type *_tmp_element = (type*) NULL; \
+       if (_tmp_entry != (queue_entry_t) NULL) \
+               _tmp_element = cqe_element(_tmp_entry, type, field); \
+       _tmp_element; \
+})
+
+/*
+ *     Macro:          circle_queue_init
+ *     Function:
+ *             Initialize the given circle queue.
+ *     Header:
+ *             void circle_queue_init(q)
+ *                     circle_queue_t          q;      \* MODIFIED *\
+ */
+#define circle_queue_init(q)   \
+MACRO_BEGIN             \
+       (q)->head = NULL; \
+MACRO_END
+
+__END_DECLS
+
+#endif  /* _KERN_CIRCLE_QUEUE_H_ */
index 578a7f6a64b484060dc8c77edc8164ff43c5f2d2..6801e0f310c2646b582c8be2a0ca3bd3db359464 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -89,7 +89,7 @@
 uint32_t        hz_tick_interval = 1;
 static uint64_t has_monotonic_clock = 0;
 
-decl_simple_lock_data(, clock_lock)
+decl_simple_lock_data(, clock_lock);
 lck_grp_attr_t * settime_lock_grp_attr;
 lck_grp_t * settime_lock_grp;
 lck_attr_t * settime_lock_attr;
@@ -295,7 +295,7 @@ static void print_all_clock_variables_internal(const char *, struct clock_calend
  *
  *     The trick is to use a generation count and set the low bit when it is
  *     being updated/read; by doing this, we guarantee, through use of the
- *     hw_atomic functions, that the generation is incremented when the bit
+ *     os_atomic functions, that the generation is incremented when the bit
  *     is cleared atomically (by using a 1 bit add).
  */
 static struct unlocked_clock_calend {
@@ -1673,7 +1673,7 @@ clock_get_calendar_nanotime_nowait(
                 * off the "in progress" bit to get the current generation
                 * count.
                 */
-               (void)hw_atomic_and(&stable.gen, ~(uint32_t)1);
+               os_atomic_andnot(&stable.gen, 1, relaxed);
 
                /*
                 * If an update _is_ in progress, the generation count will be
@@ -1712,7 +1712,7 @@ clock_track_calend_nowait(void)
                 * will flag an update in progress to an async caller trying
                 * to examine the contents.
                 */
-               (void)hw_atomic_or(&flipflop[i].gen, 1);
+               os_atomic_or(&flipflop[i].gen, 1, relaxed);
 
                flipflop[i].calend = tmp;
 
@@ -1722,7 +1722,7 @@ clock_track_calend_nowait(void)
                 * count after taking a copy while in progress, the count
                 * will be off by two.
                 */
-               (void)hw_atomic_add(&flipflop[i].gen, 1);
+               os_atomic_inc(&flipflop[i].gen, relaxed);
        }
 }
 
index 430a2da536a6d7e0620cc0b665c938fadf686f66..3ec6264d88adcc4e66906b0d143bb27bd43cebf2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -96,7 +96,7 @@ typedef struct alarm    alarm_data_t;
 #define ALARM_DONE      4               /* alarm has expired */
 
 /* local data declarations */
-decl_simple_lock_data(static, alarm_lock)        /* alarm synchronization */
+decl_simple_lock_data(static, alarm_lock);       /* alarm synchronization */
 static struct   zone            *alarm_zone;    /* zone for user alarms */
 static struct   alarm           *alrmfree;              /* alarm free list pointer */
 static struct   alarm           *alrmdone;              /* alarm done list pointer */
@@ -139,9 +139,10 @@ kern_return_t   rtclock_getattr(
        mach_msg_type_number_t  *count);
 
 SECURITY_READ_ONLY_EARLY(struct clock_ops) sysclk_ops = {
-       NULL, rtclock_init,
-       rtclock_gettime,
-       rtclock_getattr,
+       .c_config   = NULL,
+       .c_init     = rtclock_init,
+       .c_gettime  = rtclock_gettime,
+       .c_getattr  = rtclock_getattr,
 };
 
 kern_return_t   calend_gettime(
@@ -153,20 +154,26 @@ kern_return_t   calend_getattr(
        mach_msg_type_number_t  *count);
 
 SECURITY_READ_ONLY_EARLY(struct clock_ops) calend_ops = {
-       NULL, NULL,
-       calend_gettime,
-       calend_getattr,
+       .c_config   = NULL,
+       .c_init     = NULL,
+       .c_gettime  = calend_gettime,
+       .c_getattr  = calend_getattr,
 };
 
 /*
  * List of clock devices.
  */
-SECURITY_READ_ONLY_LATE(struct  clock) clock_list[] = {
-       /* SYSTEM_CLOCK */
-       { &sysclk_ops, 0, 0 },
-
-       /* CALENDAR_CLOCK */
-       { &calend_ops, 0, 0 }
+SECURITY_READ_ONLY_LATE(struct clock) clock_list[] = {
+       [SYSTEM_CLOCK] = {
+               .cl_ops     = &sysclk_ops,
+               .cl_service = IPC_PORT_NULL,
+               .cl_control = IPC_PORT_NULL,
+       },
+       [CALENDAR_CLOCK] = {
+               .cl_ops     = &calend_ops,
+               .cl_service = IPC_PORT_NULL,
+               .cl_control = IPC_PORT_NULL,
+       },
 };
 int     clock_count = sizeof(clock_list) / sizeof(clock_list[0]);
 
index 0db480817f1654f7de5859081e947c22c686126e..025a2c3f1cbe26b93e24cb2775a5b69619ee2418 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <mach/boolean.h>
 
 #include <kern/coalition.h>
+#include <kern/exc_resource.h>
 #include <kern/host.h>
 #include <kern/kalloc.h>
 #include <kern/ledger.h>
 #include <kern/mach_param.h> /* for TASK_CHUNK */
+#if MONOTONIC
+#include <kern/monotonic.h>
+#endif /* MONOTONIC */
+#include <kern/policy_internal.h>
 #include <kern/task.h>
 #include <kern/thread_group.h>
 #include <kern/zalloc.h>
 #include <mach/host_priv.h>
 #include <mach/host_special_ports.h>
 
+#include <os/log.h>
+
 #include <sys/errno.h>
 
 /*
  * BSD interface functions
  */
 int coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, int list_sz);
-boolean_t coalition_is_leader(task_t task, int coal_type, coalition_t *coal);
+coalition_t task_get_coalition(task_t task, int type);
+boolean_t coalition_is_leader(task_t task, coalition_t coal);
 task_t coalition_get_leader(coalition_t coal);
 int coalition_get_task_count(coalition_t coal);
 uint64_t coalition_get_page_count(coalition_t coal, int *ntasks);
@@ -61,6 +69,14 @@ int coalition_get_pid_list(coalition_t coal, uint32_t rolemask, int sort_order,
 /* defined in task.c */
 extern ledger_template_t task_ledger_template;
 
+/*
+ * Templates; task template is copied due to potential allocation limits on
+ * task ledgers.
+ */
+ledger_template_t coalition_task_ledger_template = NULL;
+ledger_template_t coalition_ledger_template = NULL;
+
+extern int      proc_selfpid(void);
 /*
  * Coalition zone needs limits. We expect there will be as many coalitions as
  * tasks (same order of magnitude), so use the task zone's limits.
@@ -175,6 +191,10 @@ static void          i_coal_resource_iterate_tasks(coalition_t coal, void *ctx,
 static_assert(COALITION_NUM_THREAD_QOS_TYPES == THREAD_QOS_LAST);
 
 struct i_resource_coalition {
+       /*
+        * This keeps track of resource utilization of tasks that are no longer active
+        * in the coalition and is updated when a task is removed from the coalition.
+        */
        ledger_t ledger;
        uint64_t bytesread;
        uint64_t byteswritten;
@@ -184,9 +204,15 @@ struct i_resource_coalition {
        uint64_t logical_deferred_writes;
        uint64_t logical_invalidated_writes;
        uint64_t logical_metadata_writes;
+       uint64_t logical_immediate_writes_to_external;
+       uint64_t logical_deferred_writes_to_external;
+       uint64_t logical_invalidated_writes_to_external;
+       uint64_t logical_metadata_writes_to_external;
        uint64_t cpu_ptime;
        uint64_t cpu_time_eqos[COALITION_NUM_THREAD_QOS_TYPES];      /* cpu time per effective QoS class */
        uint64_t cpu_time_rqos[COALITION_NUM_THREAD_QOS_TYPES];      /* cpu time per requested QoS class */
+       uint64_t cpu_instructions;
+       uint64_t cpu_cycles;
 
        uint64_t task_count;      /* tasks that have started in this coalition */
        uint64_t dead_task_count; /* tasks that have exited in this coalition;
@@ -200,6 +226,11 @@ struct i_resource_coalition {
        uint64_t time_nonempty;
 
        queue_head_t tasks;         /* List of active tasks in the coalition */
+       /*
+        * This ledger is used for triggering resource exceptions. For the tracked resources, this is updated
+        * when the member tasks' resource usage changes.
+        */
+       ledger_t resource_monitor_ledger;
 };
 
 /*
@@ -212,7 +243,7 @@ static kern_return_t i_coal_jetsam_adopt_task(coalition_t coal, task_t task);
 static kern_return_t i_coal_jetsam_remove_task(coalition_t coal, task_t task);
 static kern_return_t i_coal_jetsam_set_taskrole(coalition_t coal,
     task_t task, int role);
-static int           i_coal_jetsam_get_taskrole(coalition_t coal, task_t task);
+int           i_coal_jetsam_get_taskrole(coalition_t coal, task_t task);
 static void          i_coal_jetsam_iterate_tasks(coalition_t coal, void *ctx,
     void (*callback)(coalition_t, void *, task_t));
 
@@ -256,7 +287,7 @@ struct coalition {
 
        queue_chain_t coalitions;   /* global list of coalitions */
 
-       decl_lck_mtx_data(, lock)    /* Coalition lock. */
+       decl_lck_mtx_data(, lock);    /* Coalition lock. */
 
        /* put coalition type-specific structures here */
        union {
@@ -316,6 +347,178 @@ static const struct coalition_type
 #endif /* CONFIG_EMBEDDED */
 
 
+/*
+ *
+ * Coalition ledger implementation
+ *
+ */
+
+struct coalition_ledger_indices coalition_ledgers =
+{.logical_writes = -1, };
+void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_COALITION_IS_CAUSING_TOO_MUCH_IO(int flavor);
+
+ledger_t
+coalition_ledger_get_from_task(task_t task)
+{
+       ledger_t ledger = LEDGER_NULL;
+       coalition_t coal = task->coalition[COALITION_TYPE_RESOURCE];
+
+       if (coal != NULL && (!queue_empty(&task->task_coalition[COALITION_TYPE_RESOURCE]))) {
+               ledger = coal->r.resource_monitor_ledger;
+               ledger_reference(ledger);
+       }
+       return ledger;
+}
+
+
+enum {
+       COALITION_IO_LEDGER_ENABLE,
+       COALITION_IO_LEDGER_DISABLE
+};
+
+void
+coalition_io_monitor_ctl(struct coalition *coalition, uint32_t flags, int64_t limit)
+{
+       ledger_t ledger = coalition->r.resource_monitor_ledger;
+
+       if (flags == COALITION_IO_LEDGER_ENABLE) {
+               /* Configure the logical I/O ledger */
+               ledger_set_limit(ledger, coalition_ledgers.logical_writes, (limit * 1024 * 1024), 0);
+               ledger_set_period(ledger, coalition_ledgers.logical_writes, (COALITION_LEDGER_MONITOR_INTERVAL_SECS * NSEC_PER_SEC));
+       } else if (flags == COALITION_IO_LEDGER_DISABLE) {
+               ledger_disable_refill(ledger, coalition_ledgers.logical_writes);
+               ledger_disable_callback(ledger, coalition_ledgers.logical_writes);
+       }
+}
+
+int
+coalition_ledger_set_logical_writes_limit(struct coalition *coalition, int64_t limit)
+{
+       int error = 0;
+
+       /*  limit = -1 will be used to disable the limit and the callback */
+       if (limit > COALITION_MAX_LOGICAL_WRITES_LIMIT || limit == 0 || limit < -1) {
+               error = EINVAL;
+               goto out;
+       }
+
+       coalition_lock(coalition);
+       if (limit == -1) {
+               coalition_io_monitor_ctl(coalition, COALITION_IO_LEDGER_DISABLE, limit);
+       } else {
+               coalition_io_monitor_ctl(coalition, COALITION_IO_LEDGER_ENABLE, limit);
+       }
+       coalition_unlock(coalition);
+out:
+       return error;
+}
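A minimal usage sketch for this interface (illustrative only; the caller name
and error handling are assumptions, not part of this diff):

    /* Enable a 512 MB limit per refill interval on a resource coalition,
     * then disable monitoring again. `coal` is assumed to be a valid
     * COALITION_TYPE_RESOURCE coalition obtained elsewhere. */
    static int
    example_set_io_limit(struct coalition *coal)
    {
            int err;

            /* limit is in MB; the refill period is
             * COALITION_LEDGER_MONITOR_INTERVAL_SECS (24 hours) */
            err = coalition_ledger_set_logical_writes_limit(coal, 512);
            if (err != 0) {
                    return err; /* EINVAL for 0, values below -1, or limits above 16 TB */
            }

            /* -1 disables both the limit and the exceeded callback */
            return coalition_ledger_set_logical_writes_limit(coal, -1);
    }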
+
+void __attribute__((noinline))
+SENDING_NOTIFICATION__THIS_COALITION_IS_CAUSING_TOO_MUCH_IO(int flavor)
+{
+       int pid = proc_selfpid();
+       ledger_amount_t new_limit;
+       task_t task = current_task();
+       struct ledger_entry_info lei;
+       kern_return_t kr;
+       ledger_t ledger;
+       struct coalition *coalition = task->coalition[COALITION_TYPE_RESOURCE];
+
+       assert(coalition != NULL);
+       ledger = coalition->r.resource_monitor_ledger;
+
+       switch (flavor) {
+       case FLAVOR_IO_LOGICAL_WRITES:
+               ledger_get_entry_info(ledger, coalition_ledgers.logical_writes, &lei);
+               trace_resource_violation(RMON_LOGWRITES_VIOLATED, &lei);
+               break;
+       default:
+               goto Exit;
+       }
+
+       os_log(OS_LOG_DEFAULT, "Coalition [%lld] caught causing excessive I/O (flavor: %d). Task I/O: %lld MB. [Limit : %lld MB per %lld secs]. Triggered by process [%d]\n",
+           coalition->id, flavor, (lei.lei_balance / (1024 * 1024)), (lei.lei_limit / (1024 * 1024)),
+           (lei.lei_refill_period / NSEC_PER_SEC), pid);
+
+       kr = send_resource_violation(send_disk_writes_violation, task, &lei, kRNFlagsNone);
+       if (kr) {
+               os_log(OS_LOG_DEFAULT, "ERROR %#x returned from send_resource_violation(disk_writes, ...)\n", kr);
+       }
+
+       /*
+        * Continue to monitor the coalition after it hits the initial limit, but increase
+        * the limit exponentially so that we don't spam the listener.
+        */
+       new_limit = (lei.lei_limit / 1024 / 1024) * 4;
+       coalition_lock(coalition);
+       if (new_limit > COALITION_MAX_LOGICAL_WRITES_LIMIT) {
+               coalition_io_monitor_ctl(coalition, COALITION_IO_LEDGER_DISABLE, -1);
+       } else {
+               coalition_io_monitor_ctl(coalition, COALITION_IO_LEDGER_ENABLE, new_limit);
+       }
+       coalition_unlock(coalition);
+
+Exit:
+       return;
+}
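As a worked example of the back-off: with an initial limit of 1024 MB per
24-hour refill period, successive violations raise the limit to 4096, 16384,
65536 MB and so on, quadrupling each time; once the computed next limit
exceeds COALITION_MAX_LOGICAL_WRITES_LIMIT (2^24 MB, i.e. 16 TB), monitoring
is disabled entirely rather than re-armed.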
+
+void
+coalition_io_rate_exceeded(int warning, const void *param0, __unused const void *param1)
+{
+       if (warning == 0) {
+               SENDING_NOTIFICATION__THIS_COALITION_IS_CAUSING_TOO_MUCH_IO((int)param0);
+       }
+}
+
+void
+init_coalition_ledgers(void)
+{
+       ledger_template_t t;
+       assert(coalition_ledger_template == NULL);
+
+       if ((t = ledger_template_create("Per-coalition ledgers")) == NULL) {
+               panic("couldn't create coalition ledger template");
+       }
+
+       coalition_ledgers.logical_writes = ledger_entry_add(t, "logical_writes", "res", "bytes");
+
+       if (coalition_ledgers.logical_writes < 0) {
+               panic("couldn't create entries for coalition ledger template");
+       }
+
+       ledger_set_callback(t, coalition_ledgers.logical_writes, coalition_io_rate_exceeded, (void *)FLAVOR_IO_LOGICAL_WRITES, NULL);
+       ledger_template_complete(t);
+
+       coalition_task_ledger_template = ledger_template_copy(task_ledger_template, "Coalition task ledgers");
+
+       if (coalition_task_ledger_template == NULL) {
+               panic("couldn't create coalition task ledger template");
+       }
+
+       ledger_template_complete(coalition_task_ledger_template);
+
+       coalition_ledger_template = t;
+}
+
+void
+coalition_io_ledger_update(task_t task, int32_t flavor, boolean_t is_credit, uint32_t io_size)
+{
+       ledger_t ledger;
+       coalition_t coal = task->coalition[COALITION_TYPE_RESOURCE];
+
+       assert(coal != NULL);
+       ledger = coal->r.resource_monitor_ledger;
+       if (LEDGER_VALID(ledger)) {
+               if (flavor == FLAVOR_IO_LOGICAL_WRITES) {
+                       if (is_credit) {
+                               ledger_credit(ledger, coalition_ledgers.logical_writes, io_size);
+                       } else {
+                               ledger_debit(ledger, coalition_ledgers.logical_writes, io_size);
+                       }
+               }
+       }
+}
+
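The call sites that feed this accounting live in the task I/O paths, outside
this diff; a hedged sketch of what such a caller might look like
(`task_io_account_write` is a hypothetical name):

    /* Credit the coalition ledger for a completed logical write, or debit
     * it when a previously-counted write is invalidated. */
    static void
    task_io_account_write(task_t task, uint32_t io_size, boolean_t invalidated)
    {
            coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES,
                invalidated ? FALSE : TRUE, io_size);
    }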
 static void
 coalition_notify_user(uint64_t id, uint32_t flags)
 {
@@ -341,12 +544,18 @@ i_coal_resource_init(coalition_t coal, boolean_t privileged)
 {
        (void)privileged;
        assert(coal && coal->type == COALITION_TYPE_RESOURCE);
-       coal->r.ledger = ledger_instantiate(task_ledger_template,
+       coal->r.ledger = ledger_instantiate(coalition_task_ledger_template,
            LEDGER_CREATE_ACTIVE_ENTRIES);
        if (coal->r.ledger == NULL) {
                return KERN_RESOURCE_SHORTAGE;
        }
 
+       coal->r.resource_monitor_ledger = ledger_instantiate(coalition_ledger_template,
+           LEDGER_CREATE_ACTIVE_ENTRIES);
+       if (coal->r.resource_monitor_ledger == NULL) {
+               return KERN_RESOURCE_SHORTAGE;
+       }
+
        queue_init(&coal->r.tasks);
 
        return KERN_SUCCESS;
@@ -356,7 +565,9 @@ static void
 i_coal_resource_dealloc(coalition_t coal)
 {
        assert(coal && coal->type == COALITION_TYPE_RESOURCE);
+
        ledger_dereference(coal->r.ledger);
+       ledger_dereference(coal->r.resource_monitor_ledger);
 }
 
 static kern_return_t
@@ -429,12 +640,24 @@ i_coal_resource_remove_task(coalition_t coal, task_t task)
 #else
                cr->energy += task_energy(task);
 #endif
-               cr->logical_immediate_writes += task->task_immediate_writes;
-               cr->logical_deferred_writes += task->task_deferred_writes;
-               cr->logical_invalidated_writes += task->task_invalidated_writes;
-               cr->logical_metadata_writes += task->task_metadata_writes;
+               cr->logical_immediate_writes += task->task_writes_counters_internal.task_immediate_writes;
+               cr->logical_deferred_writes += task->task_writes_counters_internal.task_deferred_writes;
+               cr->logical_invalidated_writes += task->task_writes_counters_internal.task_invalidated_writes;
+               cr->logical_metadata_writes += task->task_writes_counters_internal.task_metadata_writes;
+               cr->logical_immediate_writes_to_external += task->task_writes_counters_external.task_immediate_writes;
+               cr->logical_deferred_writes_to_external += task->task_writes_counters_external.task_deferred_writes;
+               cr->logical_invalidated_writes_to_external += task->task_writes_counters_external.task_invalidated_writes;
+               cr->logical_metadata_writes_to_external += task->task_writes_counters_external.task_metadata_writes;
                cr->cpu_ptime += task_cpu_ptime(task);
                task_update_cpu_time_qos_stats(task, cr->cpu_time_eqos, cr->cpu_time_rqos);
+#if MONOTONIC
+               uint64_t counts[MT_CORE_NFIXED] = {};
+               (void)mt_fixed_task_counts(task, counts);
+               cr->cpu_cycles += counts[MT_CORE_CYCLES];
+#if defined(MT_CORE_INSTRS)
+               cr->cpu_instructions += counts[MT_CORE_INSTRS];
+#endif /* defined(MT_CORE_INSTRS) */
+#endif /* MONOTONIC */
        }
 
        /* remove the task from the coalition's list */
@@ -498,7 +721,7 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us
                }
        }
 
-       ledger_t sum_ledger = ledger_instantiate(task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES);
+       ledger_t sum_ledger = ledger_instantiate(coalition_task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES);
        if (sum_ledger == LEDGER_NULL) {
                return KERN_RESOURCE_SHORTAGE;
        }
@@ -518,6 +741,10 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us
        uint64_t logical_deferred_writes = coal->r.logical_deferred_writes;
        uint64_t logical_invalidated_writes = coal->r.logical_invalidated_writes;
        uint64_t logical_metadata_writes = coal->r.logical_metadata_writes;
+       uint64_t logical_immediate_writes_to_external = coal->r.logical_immediate_writes_to_external;
+       uint64_t logical_deferred_writes_to_external = coal->r.logical_deferred_writes_to_external;
+       uint64_t logical_invalidated_writes_to_external = coal->r.logical_invalidated_writes_to_external;
+       uint64_t logical_metadata_writes_to_external = coal->r.logical_metadata_writes_to_external;
        int64_t cpu_time_billed_to_me = 0;
        int64_t cpu_time_billed_to_others = 0;
        int64_t energy_billed_to_me = 0;
@@ -527,6 +754,9 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us
        memcpy(cpu_time_eqos, coal->r.cpu_time_eqos, sizeof(cpu_time_eqos));
        uint64_t cpu_time_rqos[COALITION_NUM_THREAD_QOS_TYPES];
        memcpy(cpu_time_rqos, coal->r.cpu_time_rqos, sizeof(cpu_time_rqos));
+       uint64_t cpu_instructions = coal->r.cpu_instructions;
+       uint64_t cpu_cycles = coal->r.cpu_cycles;
+
        /*
         * Add to that all the active tasks' ledgers. Tasks cannot deallocate
         * out from under us, since we hold the coalition lock.
@@ -549,12 +779,25 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us
 #else
                energy += task_energy(task);
 #endif
-               logical_immediate_writes += task->task_immediate_writes;
-               logical_deferred_writes += task->task_deferred_writes;
-               logical_invalidated_writes += task->task_invalidated_writes;
-               logical_metadata_writes += task->task_metadata_writes;
+               logical_immediate_writes += task->task_writes_counters_internal.task_immediate_writes;
+               logical_deferred_writes += task->task_writes_counters_internal.task_deferred_writes;
+               logical_invalidated_writes += task->task_writes_counters_internal.task_invalidated_writes;
+               logical_metadata_writes += task->task_writes_counters_internal.task_metadata_writes;
+               logical_immediate_writes_to_external += task->task_writes_counters_external.task_immediate_writes;
+               logical_deferred_writes_to_external += task->task_writes_counters_external.task_deferred_writes;
+               logical_invalidated_writes_to_external += task->task_writes_counters_external.task_invalidated_writes;
+               logical_metadata_writes_to_external += task->task_writes_counters_external.task_metadata_writes;
+
                cpu_ptime += task_cpu_ptime(task);
                task_update_cpu_time_qos_stats(task, cpu_time_eqos, cpu_time_rqos);
+#if MONOTONIC
+               uint64_t counts[MT_CORE_NFIXED] = {};
+               (void)mt_fixed_task_counts(task, counts);
+               cpu_cycles += counts[MT_CORE_CYCLES];
+#if defined(MT_CORE_INSTRS)
+               cpu_instructions += counts[MT_CORE_INSTRS];
+#endif /* defined(MT_CORE_INSTRS) */
+#endif /* MONOTONIC */
        }
 
        kr = ledger_get_balance(sum_ledger, task_ledgers.cpu_time_billed_to_me, (int64_t *)&cpu_time_billed_to_me);
@@ -620,9 +863,15 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us
        cru_out->logical_deferred_writes = logical_deferred_writes;
        cru_out->logical_invalidated_writes = logical_invalidated_writes;
        cru_out->logical_metadata_writes = logical_metadata_writes;
+       cru_out->logical_immediate_writes_to_external = logical_immediate_writes_to_external;
+       cru_out->logical_deferred_writes_to_external = logical_deferred_writes_to_external;
+       cru_out->logical_invalidated_writes_to_external = logical_invalidated_writes_to_external;
+       cru_out->logical_metadata_writes_to_external = logical_metadata_writes_to_external;
        cru_out->cpu_ptime = cpu_ptime;
        cru_out->cpu_time_eqos_len = COALITION_NUM_THREAD_QOS_TYPES;
        memcpy(cru_out->cpu_time_eqos, cpu_time_eqos, sizeof(cru_out->cpu_time_eqos));
+       cru_out->cpu_cycles = cpu_cycles;
+       cru_out->cpu_instructions = cpu_instructions;
        ledger_dereference(sum_ledger);
        sum_ledger = LEDGER_NULL;
 
@@ -776,7 +1025,7 @@ i_coal_jetsam_set_taskrole(coalition_t coal, task_t task, int role)
        return KERN_SUCCESS;
 }
 
-static int
+int
 i_coal_jetsam_get_taskrole(coalition_t coal, task_t task)
 {
        struct i_jetsam_coalition *cj;
@@ -1176,7 +1425,7 @@ task_coalition_adjust_focal_count(task_t task, int count, uint32_t *new_count)
                return FALSE;
        }
 
-       *new_count = hw_atomic_add(&coal->focal_task_count, count);
+       *new_count = os_atomic_add(&coal->focal_task_count, count, relaxed);
        assert(*new_count != UINT32_MAX);
        return TRUE;
 }
@@ -1200,7 +1449,7 @@ task_coalition_adjust_nonfocal_count(task_t task, int count, uint32_t *new_count
                return FALSE;
        }
 
-       *new_count = hw_atomic_add(&coal->nonfocal_task_count, count);
+       *new_count = os_atomic_add(&coal->nonfocal_task_count, count, relaxed);
        assert(*new_count != UINT32_MAX);
        return TRUE;
 }
@@ -1672,6 +1921,8 @@ coalitions_init(void)
 
        init_task_ledgers();
 
+       init_coalition_ledgers();
+
        for (i = 0, ctype = &s_coalition_types[0]; i < COALITION_NUM_TYPES; ctype++, i++) {
                /* verify the entry in the global coalition types array */
                if (ctype->type != i ||
@@ -1735,47 +1986,38 @@ coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, int list_sz)
 }
 
 /*
- * Jetsam coalition interface
- *
+ * Return the coalition of the given type to which the task belongs.
  */
-boolean_t
-coalition_is_leader(task_t task, int coal_type, coalition_t *coal)
+coalition_t
+task_get_coalition(task_t task, int coal_type)
 {
        coalition_t c;
-       boolean_t ret;
-
-       if (coal) { /* handle the error cases gracefully */
-               *coal = COALITION_NULL;
-       }
-
-       if (!task) {
-               return FALSE;
-       }
 
-       if (coal_type > COALITION_TYPE_MAX) {
-               return FALSE;
+       if (task == NULL || coal_type > COALITION_TYPE_MAX) {
+               return COALITION_NULL;
        }
 
        c = task->coalition[coal_type];
-       if (!c) {
-               return FALSE;
-       }
+       assert(c == COALITION_NULL || (int)c->type == coal_type);
+       return c;
+}
 
-       assert((int)c->type == coal_type);
+/*
+ * Report if the given task is the leader of the given jetsam coalition.
+ */
+boolean_t
+coalition_is_leader(task_t task, coalition_t coal)
+{
+       boolean_t ret = FALSE;
 
-       coalition_lock(c);
+       if (coal != COALITION_NULL) {
+               coalition_lock(coal);
 
-       if (coal) {
-               *coal = c;
-       }
+               ret = (coal->type == COALITION_TYPE_JETSAM && coal->j.leader == task);
 
-       ret = FALSE;
-       if (c->type == COALITION_TYPE_JETSAM && c->j.leader == task) {
-               ret = TRUE;
+               coalition_unlock(coal);
        }
 
-       coalition_unlock(c);
-
        return ret;
 }
 
index 29da7719cadaf8852d630f6b21be57088fa85f47..5afbf004bd50be3299a33cbf1a4a72b98ec5ba86 100644 (file)
@@ -79,6 +79,23 @@ void coalition_for_each_task(coalition_t coal, void *ctx,
 
 void coalition_set_efficient(coalition_t coal);
 
+/*  Coalition ledger  */
+struct coalition_ledger_indices {
+       int logical_writes;
+};
+void init_coalition_ledgers(void);
+int coalition_ledger_set_logical_writes_limit(coalition_t coal, int64_t limit);
+void coalition_io_monitor_ctl(struct coalition *coalition, uint32_t flags, int64_t limit);
+ledger_t coalition_ledger_get_from_task(task_t task);
+void coalition_io_rate_exceeded(int warning, const void *param0, __unused const void *param1);
+void coalition_io_ledger_update(task_t task, int32_t flavor, boolean_t is_credit, uint32_t io_size);
+
+/* Max limit for coalition logical_writes ledger in MB. Setting to 16 TB */
+#define COALITION_MAX_LOGICAL_WRITES_LIMIT ((ledger_amount_t)(1ULL << 24))
+/* logical_writes ledger's refill time interval */
+#define COALITION_LEDGER_MONITOR_INTERVAL_SECS (24 * 60 * 60)
+
+
 typedef void (*coalition_iterate_fn_t)(void*, int, coalition_t);
 kern_return_t coalition_iterate_stackshot(coalition_iterate_fn_t callout, void *arg, uint32_t coalition_type);
 
index bd04dc7da8c1f8edd4f783f0f2847ecabc89cb6e..57c43f5b4ddd0b19c31e8ab91c46904b443941f6 100644 (file)
@@ -80,7 +80,7 @@ static uint64_t cpu_checkin_last_commit;
 #define CPU_CHECKIN_MIN_INTERVAL_US     4000 /* 4ms */
 #define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
 static uint64_t cpu_checkin_min_interval;
-uint32_t cpu_checkin_min_interval_us;
+static uint32_t cpu_checkin_min_interval_us;
 
 #if __LP64__
 static_assert(MAX_CPUS <= 32);
@@ -134,6 +134,12 @@ cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
        cpu_checkin_min_interval = abstime;
 }
 
+uint32_t
+cpu_quiescent_counter_get_min_interval_us(void)
+{
+       return cpu_checkin_min_interval_us;
+}
+
 
 /*
  * Called when all running CPUs have checked in.
@@ -151,7 +157,7 @@ cpu_quiescent_counter_commit(uint64_t ctime)
 
        cpu_checkin_last_commit = ctime;
 
-       old_state = os_atomic_and(&cpu_quiescing_checkin_state, ~CPU_CHECKIN_MASK, release);
+       old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);
 
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
 }
@@ -272,8 +278,8 @@ cpu_quiescent_counter_leave(uint64_t ctime)
 
        checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);
 
-       checkin_mask_t orig_state = os_atomic_and_orig(&cpu_quiescing_checkin_state,
-           ~mask, acq_rel);
+       checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
+           mask, acq_rel);
 
        assert((orig_state & cpu_expected_bit(cpuid)));
 
index 1c95370425b2f71614551fb59aa5d4934dc8b5ef..261669a2d045894ff1e47c32606d52ae2fb6d2bd 100644 (file)
@@ -54,8 +54,8 @@ extern void cpu_quiescent_counter_ast(void);
 extern void cpu_quiescent_counter_init(void);
 
 /* use of these is guarded by the config */
-extern uint32_t cpu_checkin_min_interval_us;
 extern void cpu_quiescent_counter_set_min_interval_us(uint32_t new_value);
+extern uint32_t cpu_quiescent_counter_get_min_interval_us(void);
 
 #else /* CONFIG_QUIESCE_COUNTER */
 
index dd7a28996cff343a375377379207c8e97ffb0d5d..8578f687adbc7f8f2d76a1ee0f127f580eab4e36 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -73,6 +73,7 @@
 #include <kern/kern_cdata.h>
 #include <kern/zalloc.h>
 #include <vm/vm_kern.h>
+#include <vm/vm_map.h>
 #include <vm/pmap.h>
 #include <stdarg.h>
 #include <stdatomic.h>
@@ -85,6 +86,8 @@
 #include <kern/processor.h>
 
 #if defined(__i386__) || defined(__x86_64__)
+#include <IOKit/IOBSD.h>
+
 #include <i386/cpu_threads.h>
 #include <i386/pmCPU.h>
 #endif
@@ -173,24 +176,27 @@ uint64_t debugger_panic_options = 0;
 const char *debugger_message = NULL;
 unsigned long debugger_panic_caller = 0;
 
-void panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void *ctx,
-    uint64_t panic_options_mask, void *panic_data, unsigned long panic_caller);
-static void kdp_machine_reboot_type(unsigned int type);
-__attribute__((noreturn)) void panic_spin_forever(void);
+void panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args,
+    unsigned int reason, void *ctx, uint64_t panic_options_mask, void *panic_data,
+    unsigned long panic_caller) __dead2;
+static void kdp_machine_reboot_type(unsigned int type, uint64_t debugger_flags);
+void panic_spin_forever(void) __dead2;
 extern kern_return_t do_stackshot(void);
+extern void PE_panic_hook(const char*);
 
+#if CONFIG_NONFATAL_ASSERTS
 int mach_assert = 1;
+#endif
 
 #define NESTEDDEBUGGERENTRYMAX 5
+static unsigned int max_debugger_entry_count = NESTEDDEBUGGERENTRYMAX;
 
 #if CONFIG_EMBEDDED
 #define DEBUG_BUF_SIZE (4096)
 #define KDBG_TRACE_PANIC_FILENAME "/var/log/panic.trace"
 #else
-/*
- * EXTENDED_/DEBUG_BUF_SIZE can't grow without updates to SMC and iBoot to store larger panic logs on co-processor systems */
 #define DEBUG_BUF_SIZE ((3 * PAGE_SIZE) + offsetof(struct macos_panic_header, mph_data))
-#define EXTENDED_DEBUG_BUF_SIZE 0x0013ff80
+/* EXTENDED_DEBUG_BUF_SIZE definition is now in debug.h */
 static_assert(((EXTENDED_DEBUG_BUF_SIZE % PANIC_FLUSH_BOUNDARY) == 0), "Extended debug buf size must match SMC alignment requirements");
 #define KDBG_TRACE_PANIC_FILENAME "/var/tmp/panic.trace"
 #endif
@@ -257,6 +263,14 @@ int kext_assertions_enable =
     FALSE;
 #endif
 
+/*
+ * Maintain the physically-contiguous carveout for the `phys_carveout_mb`
+ * boot-arg.
+ */
+SECURITY_READ_ONLY_LATE(vm_offset_t) phys_carveout = 0;
+SECURITY_READ_ONLY_LATE(uintptr_t) phys_carveout_pa = 0;
+SECURITY_READ_ONLY_LATE(size_t) phys_carveout_size = 0;
+
 void
 panic_init(void)
 {
@@ -269,9 +283,11 @@ panic_init(void)
                uuid_unparse_upper(*(uuid_t *)uuid, kernel_uuid_string);
        }
 
+#if CONFIG_NONFATAL_ASSERTS
        if (!PE_parse_boot_argn("assertions", &mach_assert, sizeof(mach_assert))) {
                mach_assert = 1;
        }
+#endif
 
        /*
         * Initialize the value of the debug boot-arg
@@ -298,6 +314,11 @@ panic_init(void)
 #endif
 #endif /* CONFIG_EMBEDDED */
        }
+
+       if (!PE_parse_boot_argn("nested_panic_max", &max_debugger_entry_count, sizeof(max_debugger_entry_count))) {
+               max_debugger_entry_count = NESTEDDEBUGGERENTRYMAX;
+       }
+
 #endif /* ((CONFIG_EMBEDDED && MACH_KDP) || defined(__x86_64__)) */
 
 #if DEVELOPMENT || DEBUG
@@ -342,6 +363,15 @@ extended_debug_log_init(void)
        debug_buf_size = (EXTENDED_DEBUG_BUF_SIZE - offsetof(struct macos_panic_header, mph_data));
 
        extended_debug_log_enabled = TRUE;
+
+       /*
+        * Insert a compiler barrier so we don't free the other panic stackshot buffer
+        * until after we've marked the new one as available
+        */
+       __compiler_barrier();
+       kmem_free(kernel_map, panic_stackshot_buf, panic_stackshot_buf_len);
+       panic_stackshot_buf = 0;
+       panic_stackshot_buf_len = 0;
 }
 #endif /* defined (__x86_64__) */
 
@@ -358,14 +388,64 @@ debug_log_init(void)
        debug_buf_ptr = debug_buf_base;
        debug_buf_size = gPanicSize - sizeof(struct embedded_panic_header);
 #else
+       kern_return_t kr = KERN_SUCCESS;
        bzero(panic_info, DEBUG_BUF_SIZE);
 
        assert(debug_buf_base != NULL);
        assert(debug_buf_ptr != NULL);
        assert(debug_buf_size != 0);
+
+       /*
+        * We allocate a buffer to store a panic-time stackshot. If we later discover that this is a
+        * system that supports flushing a stackshot via an extended debug log (see above), we'll free this memory
+        * as it's not necessary on this platform. This information won't be available until the IOPlatform has come
+        * up.
+        */
+       kr = kmem_alloc(kernel_map, &panic_stackshot_buf, PANIC_STACKSHOT_BUFSIZE, VM_KERN_MEMORY_DIAG);
+       assert(kr == KERN_SUCCESS);
+       if (kr == KERN_SUCCESS) {
+               panic_stackshot_buf_len = PANIC_STACKSHOT_BUFSIZE;
+       }
 #endif
 }
 
+void
+phys_carveout_init(void)
+{
+       if (!PE_i_can_has_debugger(NULL)) {
+               return;
+       }
+
+       unsigned int phys_carveout_mb = 0;
+
+       if (!PE_parse_boot_argn("phys_carveout_mb", &phys_carveout_mb,
+           sizeof(phys_carveout_mb))) {
+               return;
+       }
+       if (phys_carveout_mb == 0) {
+               return;
+       }
+
+       size_t size = 0;
+       if (os_mul_overflow(phys_carveout_mb, 1024 * 1024, &size)) {
+               printf("phys_carveout_mb size overflowed (%uMB)\n",
+                   phys_carveout_mb);
+               return;
+       }
+
+       kern_return_t kr = kmem_alloc_contig(kernel_map, &phys_carveout, size,
+           VM_MAP_PAGE_MASK(kernel_map), 0, 0, KMA_NOPAGEWAIT,
+           VM_KERN_MEMORY_DIAG);
+       if (kr != KERN_SUCCESS) {
+               printf("failed to allocate %uMB for phys_carveout_mb: %u\n",
+                   phys_carveout_mb, (unsigned int)kr);
+               return;
+       }
+
+       phys_carveout_pa = kvtophys(phys_carveout);
+       phys_carveout_size = size;
+}
+
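The carveout is only set up when PE_i_can_has_debugger() allows it and the
boot-arg requests it; for example, booting with the line below (an
illustrative value) would attempt to reserve 64 MB of physically contiguous
memory at startup:

    phys_carveout_mb=64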
 static void
 DebuggerLock()
 {
@@ -373,7 +453,7 @@ DebuggerLock()
        int debugger_exp_cpu = DEBUGGER_NO_CPU;
        assert(ml_get_interrupts_enabled() == FALSE);
 
-       if (debugger_cpu == my_cpu) {
+       if (atomic_load(&debugger_cpu) == my_cpu) {
                return;
        }
 
@@ -387,7 +467,7 @@ DebuggerLock()
 static void
 DebuggerUnlock()
 {
-       assert(debugger_cpu == cpu_number());
+       assert(atomic_load_explicit(&debugger_cpu, memory_order_relaxed) == cpu_number());
 
        /*
         * We don't do an atomic exchange here in case
@@ -396,7 +476,7 @@ DebuggerUnlock()
         * lock so we can simply store DEBUGGER_NO_CPU and follow with
         * a barrier.
         */
-       debugger_cpu = DEBUGGER_NO_CPU;
+       atomic_store(&debugger_cpu, DEBUGGER_NO_CPU);
        OSMemoryBarrier();
 
        return;
@@ -486,10 +566,12 @@ Assert(
        const char      *expression
        )
 {
+#if CONFIG_NONFATAL_ASSERTS
        if (!mach_assert) {
                kprintf("%s:%d non-fatal Assertion: %s", file, line, expression);
                return;
        }
+#endif
 
        panic_plain("%s:%d Assertion failed: %s", file, line, expression);
 }
@@ -513,7 +595,7 @@ DebuggerWithContext(unsigned int reason, void *ctx, const char *message,
 
        CPUDEBUGGERCOUNT++;
 
-       if (CPUDEBUGGERCOUNT > NESTEDDEBUGGERENTRYMAX) {
+       if (CPUDEBUGGERCOUNT > max_debugger_entry_count) {
                static boolean_t in_panic_kprintf = FALSE;
 
                /* Notify any listeners that we've started a panic */
@@ -522,12 +604,12 @@ DebuggerWithContext(unsigned int reason, void *ctx, const char *message,
                if (!in_panic_kprintf) {
                        in_panic_kprintf = TRUE;
                        kprintf("Detected nested debugger entry count exceeding %d\n",
-                           NESTEDDEBUGGERENTRYMAX);
+                           max_debugger_entry_count);
                        in_panic_kprintf = FALSE;
                }
 
                if (!panicDebugging) {
-                       kdp_machine_reboot_type(kPEPanicRestartCPU);
+                       kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_options_mask);
                }
 
                panic_spin_forever();
@@ -689,8 +771,11 @@ void
 panic_with_thread_context(unsigned int reason, void *ctx, uint64_t debugger_options_mask, thread_t thread, const char *str, ...)
 {
        va_list panic_str_args;
+       __assert_only os_ref_count_t th_ref_count;
 
        assert_thread_magic(thread);
+       th_ref_count = os_ref_get_count(&thread->ref_count);
+       assertf(th_ref_count > 0, "panic_with_thread_context called with invalid thread %p with refcount %u", thread, th_ref_count);
 
        /* Take a reference on the thread so it doesn't disappear by the time we try to backtrace it */
        thread_reference(thread);
@@ -718,17 +803,12 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign
 
        if (ml_wants_panic_trap_to_debugger()) {
                ml_panic_trap_to_debugger(panic_format_str, panic_args, reason, ctx, panic_options_mask, panic_caller);
-
-               /*
-                * This should not return, but we return here for the tail call
-                * as it simplifies the backtrace.
-                */
-               return;
+               __builtin_trap();
        }
 
        CPUDEBUGGERCOUNT++;
 
-       if (CPUDEBUGGERCOUNT > NESTEDDEBUGGERENTRYMAX) {
+       if (CPUDEBUGGERCOUNT > max_debugger_entry_count) {
                static boolean_t in_panic_kprintf = FALSE;
 
                /* Notify any listeners that we've started a panic */
@@ -737,12 +817,12 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign
                if (!in_panic_kprintf) {
                        in_panic_kprintf = TRUE;
                        kprintf("Detected nested debugger entry count exceeding %d\n",
-                           NESTEDDEBUGGERENTRYMAX);
+                           max_debugger_entry_count);
                        in_panic_kprintf = FALSE;
                }
 
                if (!panicDebugging) {
-                       kdp_machine_reboot_type(kPEPanicRestartCPU);
+                       kdp_machine_reboot_type(kPEPanicRestartCPU, panic_options_mask);
                }
 
                panic_spin_forever();
@@ -752,11 +832,7 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign
        DEBUGGER_DEBUGGING_NESTED_PANIC_IF_REQUESTED((panic_options_mask & DEBUGGER_OPTION_RECURPANIC_ENTRY));
 #endif
 
-#if CONFIG_EMBEDDED
-       if (PE_arm_debug_panic_hook) {
-               PE_arm_debug_panic_hook(panic_format_str);
-       }
-#endif
+       PE_panic_hook(panic_format_str);
 
 #if defined (__x86_64__)
        plctrace_disable();
@@ -805,11 +881,11 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign
         * Not reached.
         */
        panic_stop();
+       __builtin_unreachable();
 }
 
-__attribute__((noreturn))
 void
-panic_spin_forever()
+panic_spin_forever(void)
 {
        paniclog_append_noflush("\nPlease go to https://panic.apple.com to report this panic\n");
 
@@ -818,17 +894,21 @@ panic_spin_forever()
 }
 
 static void
-kdp_machine_reboot_type(unsigned int type)
+kdp_machine_reboot_type(unsigned int type, uint64_t debugger_flags)
 {
        printf("Attempting system restart...");
-       PEHaltRestart(type);
+       if ((type == kPEPanicRestartCPU) && (debugger_flags & DEBUGGER_OPTION_SKIP_PANICEND_CALLOUTS)) {
+               PEHaltRestart(kPEPanicRestartCPUNoPanicEndCallouts);
+       } else {
+               PEHaltRestart(type);
+       }
        halt_all_cpus(TRUE);
 }
 
 void
 kdp_machine_reboot(void)
 {
-       kdp_machine_reboot_type(kPEPanicRestartCPU);
+       kdp_machine_reboot_type(kPEPanicRestartCPU, 0);
 }
 
 /*
@@ -930,7 +1010,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
 
                /* DEBUGGER_OPTION_PANICLOGANDREBOOT is used for two finger resets on embedded so we get a paniclog */
                if (debugger_panic_options & DEBUGGER_OPTION_PANICLOGANDREBOOT) {
-                       PEHaltRestart(kPEPanicRestartCPU);
+                       PEHaltRestart(kPEPanicRestartCPUNoCallouts);
                }
        }
 
@@ -942,14 +1022,14 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
         */
        if ((debugger_panic_options & DEBUGGER_OPTION_SKIP_LOCAL_COREDUMP) &&
            (debug_boot_arg & DB_REBOOT_POST_CORE)) {
-               kdp_machine_reboot_type(kPEPanicRestartCPU);
+               kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options);
        }
 
        /*
         * Consider generating a local corefile if the infrastructure is configured
         * and we haven't disabled on-device coredumps.
         */
-       if (!(debug_boot_arg & DB_DISABLE_LOCAL_CORE)) {
+       if (on_device_corefile_enabled()) {
                if (!kdp_has_polled_corefile()) {
                        if (debug_boot_arg & (DB_KERN_DUMP_ON_PANIC | DB_KERN_DUMP_ON_NMI)) {
                                paniclog_append_noflush("skipping local kernel core because core file could not be opened prior to panic (error : 0x%x)",
@@ -992,13 +1072,13 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
                         */
                        if ((debug_boot_arg & DB_REBOOT_POST_CORE) &&
                            ((ret == 0) || (debugger_panic_options & DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT))) {
-                               kdp_machine_reboot_type(kPEPanicRestartCPU);
+                               kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options);
                        }
                }
        }
 
        if (debug_boot_arg & DB_REBOOT_ALWAYS) {
-               kdp_machine_reboot_type(kPEPanicRestartCPU);
+               kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options);
        }
 
        /* If KDP is configured, try to trap to the debugger */
@@ -1025,7 +1105,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
 #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */
 
        if (!panicDebugging) {
-               kdp_machine_reboot_type(kPEPanicRestartCPU);
+               kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options);
        }
 
        panic_spin_forever();
@@ -1372,7 +1452,24 @@ panic_display_disk_errors(void)
                panic_disk_error_description[sizeof(panic_disk_error_description) - 1] = '\0';
                paniclog_append_noflush("Root disk errors: \"%s\"\n", panic_disk_error_description);
        }
-};
+}
+
+static void
+panic_display_shutdown_status(void)
+{
+#if defined(__i386__) || defined(__x86_64__)
+       paniclog_append_noflush("System shutdown begun: %s\n", IOPMRootDomainGetWillShutdown() ? "YES" : "NO");
+       if (gIOPolledCoreFileMode == kIOPolledCoreFileModeNotInitialized) {
+               paniclog_append_noflush("Panic diags file unavailable, panic occurred prior to initialization\n");
+       } else if (gIOPolledCoreFileMode != kIOPolledCoreFileModeDisabled) {
+               /*
+                * If we haven't marked the corefile as explicitly disabled, and we've made it past initialization, then we know the current
+                * system was configured to use disk based diagnostics at some point.
+                */
+               paniclog_append_noflush("Panic diags file available: %s (0x%x)\n", (gIOPolledCoreFileMode != kIOPolledCoreFileModeClosed) ? "YES" : "NO", kdp_polled_corefile_error());
+       }
+#endif
+}
 
 extern const char version[];
 extern char osversion[];
@@ -1401,6 +1498,7 @@ panic_display_system_configuration(boolean_t launchd_exit)
                }
                panic_display_model_name();
                panic_display_disk_errors();
+               panic_display_shutdown_status();
                if (!launchd_exit) {
                        panic_display_uptime();
                        panic_display_zprint();
@@ -1528,7 +1626,8 @@ kern_feature_override(uint32_t fmask)
 {
        if (kern_feature_overrides == 0) {
                uint32_t fdisables = 0;
-               /* Expected to be first invoked early, in a single-threaded
+               /*
+                * Expected to be first invoked early, in a single-threaded
                 * environment
                 */
                if (PE_parse_boot_argn("validation_disables", &fdisables, sizeof(fdisables))) {
@@ -1540,3 +1639,32 @@ kern_feature_override(uint32_t fmask)
        }
        return (kern_feature_overrides & fmask) == fmask;
 }
+
+boolean_t
+on_device_corefile_enabled(void)
+{
+       assert(debug_boot_arg_inited);
+#if CONFIG_KDP_INTERACTIVE_DEBUGGING
+       if ((debug_boot_arg != 0) && !(debug_boot_arg & DB_DISABLE_LOCAL_CORE)) {
+               return TRUE;
+       }
+#endif
+       return FALSE;
+}
+
+boolean_t
+panic_stackshot_to_disk_enabled(void)
+{
+       assert(debug_boot_arg_inited);
+#if defined(__x86_64__)
+       if (PEGetCoprocessorVersion() < kCoprocessorVersion2) {
+               /* Only enabled on pre-Gibraltar machines where it hasn't been disabled explicitly */
+               if ((debug_boot_arg != 0) && (debug_boot_arg & DB_DISABLE_STACKSHOT_TO_DISK)) {
+                       return FALSE;
+               }
+
+               return TRUE;
+       }
+#endif
+       return FALSE;
+}
index 7e82f1b34d995e39618ee8f33e550919286edf66..57effae1ccecd86795c68527a8d2a48c28b26dba 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -217,8 +217,8 @@ enum micro_snapshot_flags {
  * Flags used in the following assortment of snapshots.
  */
 enum generic_snapshot_flags {
-       kUser64_p                       = 0x1,
-       kKernel64_p             = 0x2
+       kUser64_p               = 0x1, /* Userspace uses 64 bit pointers */
+       kKernel64_p             = 0x2  /* The kernel uses 64 bit pointers */
 };
 
 #define VM_PRESSURE_TIME_WINDOW 5 /* seconds */
@@ -270,6 +270,7 @@ enum {
 #define KF_MATV_OVRD (0x8)
 #define KF_STACKSHOT_OVRD (0x10)
 #define KF_COMPRSV_OVRD (0x20)
+#define KF_INTERRUPT_MASKED_DEBUG_OVRD (0x40)
 
 boolean_t kern_feature_override(uint32_t fmask);
 
@@ -351,6 +352,35 @@ struct macos_panic_header {
 #define MACOS_PANIC_HEADER_FLAG_COREDUMP_FAILED               0x200
 #define MACOS_PANIC_HEADER_FLAG_STACKSHOT_KERNEL_ONLY         0x400
 
+/*
+ * Any change to the below structure should mirror the structure defined in MacEFIFirmware
+ * (and vice versa)
+ */
+
+struct efi_aurr_panic_header {
+       uint32_t efi_aurr_magic;
+       uint32_t efi_aurr_crc;
+       uint32_t efi_aurr_version;
+       uint32_t efi_aurr_reset_cause;
+       uint32_t efi_aurr_reset_log_offset;
+       uint32_t efi_aurr_reset_log_len;
+       char efi_aurr_panic_data[];
+} __attribute__((packed));
+
+/*
+ * EXTENDED_/DEBUG_BUF_SIZE can't grow without updates to SMC and iBoot to store larger panic logs on co-processor systems
+ */
+#define EXTENDED_DEBUG_BUF_SIZE 0x0013ff80
+
+#define EFI_AURR_PANIC_STRING_MAX_LEN 112
+#define EFI_AURR_EXTENDED_LOG_SIZE (EXTENDED_DEBUG_BUF_SIZE - sizeof(struct efi_aurr_panic_header) - EFI_AURR_PANIC_STRING_MAX_LEN)
+
+struct efi_aurr_extended_panic_log {
+       char efi_aurr_extended_log_buf[EFI_AURR_EXTENDED_LOG_SIZE];
+       uint32_t efi_aurr_log_tail; /* Circular buffer indices */
+       uint32_t efi_aurr_log_head; /* ditto */
+} __attribute__((packed));
+
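A sketch of reasoning about the circular log, under the assumption (not
spelled out in this header) that head and tail are byte offsets into
efi_aurr_extended_log_buf with head as the next write position:

    /* Bytes currently occupied by the circular extended log. */
    static inline uint32_t
    efi_aurr_log_used(const struct efi_aurr_extended_panic_log *log)
    {
            if (log->efi_aurr_log_head >= log->efi_aurr_log_tail) {
                    return log->efi_aurr_log_head - log->efi_aurr_log_tail;
            }
            return (uint32_t)EFI_AURR_EXTENDED_LOG_SIZE -
                (log->efi_aurr_log_tail - log->efi_aurr_log_head);
    }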
 #endif /* __APPLE_API_UNSTABLE */
 #endif /* __APPLE_API_PRIVATE */
 
@@ -358,7 +388,8 @@ struct macos_panic_header {
 
 __BEGIN_DECLS
 
-extern void panic(const char *string, ...) __printflike(1, 2);
+__abortlike __printflike(1, 2)
+extern void panic(const char *string, ...);
 
 __END_DECLS
 
@@ -445,6 +476,7 @@ enum {
                                         * release bridgeOS.
                                         */
 #define DB_REBOOT_ALWAYS        0x100000 /* Don't wait for debugger connection */
+#define DB_DISABLE_STACKSHOT_TO_DISK 0x200000 /* Disable writing stackshot to local disk */
 
 /*
  * Values for a 64-bit mask that's passed to the debugger.
@@ -460,6 +492,8 @@ enum {
 #define DEBUGGER_OPTION_SKIP_LOCAL_COREDUMP         0x80ULL /* don't try to save local coredumps for this panic */
 #define DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT    0x100ULL /* attempt to save coredump. always reboot */
 #define DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE   0x200ULL /* backtrace the specified thread in the paniclog (x86 only) */
+#define DEBUGGER_OPTION_PRINT_CPU_USAGE_PANICLOG    0x400ULL /* print extra CPU usage data in the panic log */
+#define DEBUGGER_OPTION_SKIP_PANICEND_CALLOUTS      0x800ULL /* (bridgeOS) skip the kPEPanicEnd callouts -- don't wait for x86 to finish sending panic data */
 
 #define DEBUGGER_INTERNAL_OPTIONS_MASK              (DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE)
 
@@ -472,12 +506,20 @@ __BEGIN_DECLS
 #define PANIC_LOCATION __FILE__ ":" LINE_NUMBER(__LINE__)
 
 #if CONFIG_EMBEDDED
-#define panic(ex, ...) (panic)(# ex, ## __VA_ARGS__)
+#define panic(ex, ...)  ({ \
+               __asm__("" ::: "memory"); \
+               (panic)(# ex, ## __VA_ARGS__); \
+       })
 #else
-#define panic(ex, ...) (panic)(# ex "@" PANIC_LOCATION, ## __VA_ARGS__)
+#define panic(ex, ...)  ({ \
+               __asm__("" ::: "memory"); \
+               (panic)(# ex "@" PANIC_LOCATION, ## __VA_ARGS__); \
+       })
 #endif
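The empty asm statement in both macro variants is a compiler-level memory
barrier: the "memory" clobber forces pending stores to be emitted before the
call, presumably so globals and buffers are coherent for the paniclog and
corefile. A small illustration (assumed, not from this diff):

    some_global = diagnostic_value;  /* store is forced out to memory...  */
    __asm__("" ::: "memory");        /* ...before control reaches panic() */
    (panic)("example");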
 
-void panic_with_options(unsigned int reason, void *ctx, uint64_t debugger_options_mask, const char *str, ...);
+__abortlike __printflike(4, 5)
+void panic_with_options(unsigned int reason, void *ctx,
+    uint64_t debugger_options_mask, const char *str, ...);
 void Debugger(const char * message);
 void populate_model_name(char *);
 
@@ -497,7 +539,9 @@ __END_DECLS
 #if defined (__x86_64__)
 struct thread;
 
-void panic_with_thread_context(unsigned int reason, void *ctx, uint64_t debugger_options_mask, struct thread* th, const char *str, ...);
+__abortlike __printflike(5, 6)
+void panic_with_thread_context(unsigned int reason, void *ctx,
+    uint64_t debugger_options_mask, struct thread* th, const char *str, ...);
 #endif
 
 /* limit the max size to a reasonable length */
@@ -535,6 +579,19 @@ extern "C" {
 kern_return_t
 stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags,
     uint64_t delta_since_timestamp, unsigned *bytes_traced);
+
+/*
+ * Returns whether on device corefiles are enabled based on the build
+ * and boot configuration.
+ */
+boolean_t on_device_corefile_enabled(void);
+
+/*
+ * Returns whether panic stackshot to disk is enabled based on the build
+ * and boot configuration.
+ */
+boolean_t panic_stackshot_to_disk_enabled(void);
+
 #ifdef __cplusplus
 }
 #endif
@@ -542,11 +599,16 @@ stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags,
 #if !CONFIG_EMBEDDED
 extern char debug_buf[];
 extern boolean_t coprocessor_paniclog_flush;
-extern boolean_t extended_debug_log_enabled;;
+extern boolean_t extended_debug_log_enabled;
 #endif /* !CONFIG_EMBEDDED */
 
 extern char     *debug_buf_base;
 
+#if defined(XNU_TARGET_OS_BRIDGE)
+extern uint64_t macos_panic_base;
+extern unsigned int macos_panic_size;
+#endif /* defined(XNU_TARGET_OS_BRIDGE) */
+
 extern char     kernel_uuid_string[];
 extern char     panic_disk_error_description[];
 extern size_t   panic_disk_error_description_size;
@@ -587,10 +649,22 @@ extern const char       *debugger_panic_str;
 extern char *debug_buf_ptr;
 extern unsigned int debug_buf_size;
 
-extern void     debug_log_init(void);
-extern void     debug_putc(char);
+extern void debug_log_init(void);
+extern void debug_putc(char);
+
+extern void panic_init(void);
+
+/*
+ * Initialize the physical carveout requested with the `phys_carveout_mb`
+ * boot-arg.  This should only be called at kernel startup, when physically
+ * contiguous pages are plentiful.
+ */
+extern void phys_carveout_init(void);
+
+extern uintptr_t phys_carveout_pa;
+extern size_t phys_carveout_size;
+
 
-extern void     panic_init(void);
 
 #if defined (__x86_64__)
 extern void extended_debug_log_init(void);
@@ -598,12 +672,12 @@ extern void extended_debug_log_init(void);
 int     packA(char *inbuf, uint32_t length, uint32_t buflen);
 void    unpackA(char *inbuf, uint32_t length);
 
-#if DEVELOPMENT || DEBUG
 #define PANIC_STACKSHOT_BUFSIZE (1024 * 1024)
 
 extern uintptr_t panic_stackshot_buf;
+extern size_t panic_stackshot_buf_len;
+
 extern size_t panic_stackshot_len;
-#endif /* DEVELOPMENT || DEBUG */
 #endif /* defined (__x86_64__) */
 
 void    SavePanicInfo(const char *message, void *panic_data, uint64_t panic_options);
index 18ec562563e5b29b4dcc1cc1791364045ecf2fe6..1084cdf0d21bc077114989201c6aad5c19d9e759 100644 (file)
@@ -56,9 +56,9 @@
  */
 
 #define EXC_GUARD_DECODE_GUARD_TYPE(code) \
-       (((code) >> 61) & 0x7ull)
+       ((((uint64_t)(code)) >> 61) & 0x7ull)
 #define EXC_GUARD_DECODE_GUARD_FLAVOR(code) \
-       (((code) >> 32) & 0x1fffffff)
+       ((((uint64_t)(code)) >> 32) & 0x1fffffff)
 #define EXC_GUARD_DECODE_GUARD_TARGET(code) \
        ((uint32_t)(code))
 
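A worked decode under these macros, using an illustrative value: for
code = 0x2000000400001234ULL, GUARD_TYPE(code) is 0x1 (bits 63..61),
GUARD_FLAVOR(code) is 0x4 (bits 60..32, masked to 29 bits), and
GUARD_TARGET(code) is 0x1234 (the low 32 bits). The uint64_t casts added
above keep the shifts well-defined even if a caller passes a narrower or
signed code type.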
index d6d6ffbf2cf12e31f6ad62211ed6dd2189f6eb4c..c059f1c50f5075e52d11f6b7dd02a5dc57030ff3 100644 (file)
@@ -92,7 +92,7 @@
 
 #include <pexpert/pexpert.h>
 
-extern int panic_on_exception_triage;
+bool panic_on_exception_triage = false;
 
 unsigned long c_thr_exc_raise = 0;
 unsigned long c_thr_exc_raise_state = 0;
@@ -123,6 +123,23 @@ kern_return_t bsd_exception(
        mach_msg_type_number_t  codeCnt);
 #endif /* MACH_BSD */
 
+/*
+ * Routine: exception_init
+ * Purpose:
+ *   Global initialization of state for exceptions.
+ * Conditions:
+ *   None.
+ */
+void
+exception_init(void)
+{
+       int tmp = 0;
+
+       if (PE_parse_boot_argn("-panic_on_exception_triage", &tmp, sizeof(tmp))) {
+               panic_on_exception_triage = true;
+       }
+}
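With this change the policy is opt-in at boot rather than a build-time
extern; for example (illustrative usage on a development machine):

    nvram boot-args="-panic_on_exception_triage"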
+
 /*
  *     Routine:        exception_deliver
  *     Purpose:
@@ -204,7 +221,7 @@ exception_deliver(
        lck_mtx_unlock(mutex);
 
        code64 = (behavior & MACH_EXCEPTION_CODES);
-       behavior &= ~MACH_EXCEPTION_CODES;
+       behavior &= ~MACH_EXCEPTION_MASK;
 
        if (!code64) {
                small_code[0] = CAST_DOWN_EXPLICIT(exception_data_type_t, code[0]);
@@ -230,17 +247,12 @@ exception_deliver(
 #endif
 
        if (behavior != EXCEPTION_STATE) {
-               if (thread != current_thread() || exception == EXC_CORPSE_NOTIFY) {
-                       task_reference(task);
-                       task_port = convert_task_to_port(task);
-                       /* task ref consumed */
-                       thread_reference(thread);
-                       thread_port = convert_thread_to_port(thread);
-                       /* thread ref consumed */
-               } else {
-                       task_port = retrieve_task_self_fast(thread->task);
-                       thread_port = retrieve_thread_self_fast(thread);
-               }
+               task_reference(task);
+               task_port = convert_task_to_port(task);
+               /* task ref consumed */
+               thread_reference(thread);
+               thread_port = convert_thread_to_port(thread);
+               /* thread ref consumed */
        }
 
        switch (behavior) {
index 0f5a81effebc5217c97585a48f9948d543a43bff..163994656c697d54eef14d0b6fb6b32ee46427c7 100644 (file)
@@ -50,6 +50,9 @@ struct exception_action {
        struct label            *label;         /* MAC label associated with action */
 };
 
+/* Initialize global state needed for exceptions. */
+extern void exception_init(void);
+
 /* Make an up-call to a thread's exception server */
 extern kern_return_t exception_triage(
        exception_type_t        exception,
index 9e0cd1e960c3bb09c1615eb80f98c05de5642aa5..71ef32fae90d152b3a292ac3f45864bb12f007dd 100644 (file)
@@ -60,13 +60,14 @@ hibernate_alloc_page_lists(
 
        page_list = hibernate_page_list_allocate(TRUE);
        if (!page_list) {
+               HIBLOG("%s: failed for page_list\n", __FUNCTION__);
                retval = KERN_RESOURCE_SHORTAGE;
                goto done;
        }
        page_list_wired = hibernate_page_list_allocate(FALSE);
        if (!page_list_wired) {
                kfree(page_list, page_list->list_size);
-
+               HIBLOG("%s: failed for page_list_wired\n", __FUNCTION__);
                retval = KERN_RESOURCE_SHORTAGE;
                goto done;
        }
@@ -74,7 +75,7 @@ hibernate_alloc_page_lists(
        if (!page_list_pal) {
                kfree(page_list, page_list->list_size);
                kfree(page_list_wired, page_list_wired->list_size);
-
+               HIBLOG("%s: failed for page_list_pal\n", __FUNCTION__);
                retval = KERN_RESOURCE_SHORTAGE;
                goto done;
        }
index 3d3e853ef333998435557944e207775415c34fe1..e336fcc09f1f6edaa183ede2bb0e087ce4c9d439 100644 (file)
@@ -95,6 +95,9 @@
 #include <vm/vm_purgeable_internal.h>
 #include <vm/vm_pageout.h>
 
+#include <IOKit/IOBSD.h> // IOTaskHasEntitlement
+#include <IOKit/IOKitKeys.h> // DriverKit entitlement strings
+
 
 #if CONFIG_ATM
 #include <atm/atm_internal.h>
@@ -340,11 +343,19 @@ host_info(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_type_num
                user_arch_info = (host_preferred_user_arch_t)info;
 
 #if defined(PREFERRED_USER_CPU_TYPE) && defined(PREFERRED_USER_CPU_SUBTYPE)
-               user_arch_info->cpu_type = PREFERRED_USER_CPU_TYPE;
-               user_arch_info->cpu_subtype = PREFERRED_USER_CPU_SUBTYPE;
+               cpu_type_t preferred_cpu_type;
+               cpu_subtype_t preferred_cpu_subtype;
+               if (!PE_get_default("kern.preferred_cpu_type", &preferred_cpu_type, sizeof(cpu_type_t))) {
+                       preferred_cpu_type = PREFERRED_USER_CPU_TYPE;
+               }
+               if (!PE_get_default("kern.preferred_cpu_subtype", &preferred_cpu_subtype, sizeof(cpu_subtype_t))) {
+                       preferred_cpu_subtype = PREFERRED_USER_CPU_SUBTYPE;
+               }
+               user_arch_info->cpu_type    = preferred_cpu_type;
+               user_arch_info->cpu_subtype = preferred_cpu_subtype;
 #else
-               int master_id = master_processor->cpu_id;
-               user_arch_info->cpu_type = slot_type(master_id);
+               int master_id               = master_processor->cpu_id;
+               user_arch_info->cpu_type    = slot_type(master_id);
                user_arch_info->cpu_subtype = slot_subtype(master_id);
 #endif
 
@@ -1314,6 +1325,10 @@ host_set_special_port(host_priv_t host_priv, int id, ipc_port_t port)
                return KERN_INVALID_ARGUMENT;
        }
 
+       if (task_is_driver(current_task())) {
+               return KERN_NO_ACCESS;
+       }
+
 #if CONFIG_MACF
        if (mac_task_check_set_host_special_port(current_task(), id, port) != 0) {
                return KERN_NO_ACCESS;
@@ -1341,6 +1356,17 @@ host_get_special_port(host_priv_t host_priv, __unused int node, int id, ipc_port
                return KERN_INVALID_ARGUMENT;
        }
 
+       task_t task = current_task();
+       if (task && task_is_driver(task) && id > HOST_MAX_SPECIAL_KERNEL_PORT) {
+               /* allow HID drivers to get the sysdiagnose port for keychord handling */
+               if (IOTaskHasEntitlement(task, kIODriverKitHIDFamilyEventServiceEntitlementKey) &&
+                   id == HOST_SYSDIAGNOSE_PORT) {
+                       goto get_special_port;
+               }
+               return KERN_NO_ACCESS;
+       }
+
+get_special_port:
        host_lock(host_priv);
        port = realhost.special[id];
        *portp = ipc_port_copy_send(port);
index 480eb4bf8fb5c1ce142693de83ced2b363a4821c..8ada4462afff7f9f933e5a1d835ed8852ef2c5ef 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -77,7 +77,7 @@
 #include <mach/vm_statistics.h>
 
 struct  host {
-       decl_lck_mtx_data(, lock)                /* lock to protect exceptions */
+       decl_lck_mtx_data(, lock);               /* lock to protect exceptions */
        ipc_port_t special[HOST_MAX_SPECIAL_PORT + 1];
        struct exception_action exc_actions[EXC_TYPES_COUNT];
 };
index 27c8bc750b238d8c5d4bdf8c29ecb3ffbeaf7ac9..dfb2703b4013ccb8a9747102c0eae8b97cebd94e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -37,7 +37,7 @@
 
 #include "mach/host_notify_reply.h"
 
-decl_lck_mtx_data(, host_notify_lock)
+decl_lck_mtx_data(, host_notify_lock);
 
 lck_mtx_ext_t                   host_notify_lock_ext;
 lck_grp_t                               host_notify_lock_grp;
index 683076b2fe6f51707e3d9c41ae145efa8291b960..74a06ea76e562d4282fa195a7343e41451c4da49 100644 (file)
@@ -26,6 +26,7 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+#include <kern/ast.h>
 #include <kern/locks.h>
 #include <kern/task.h>
 #include <kern/thread.h>
@@ -230,3 +231,9 @@ hv_thread_trap(uint64_t index, uint64_t arg)
 {
        return HV_TRAP_DISPATCH(HV_THREAD_TRAP, index, hv_get_thread_target(), arg);
 }
+
+boolean_t
+hv_ast_pending(void)
+{
+       return current_cpu_datap()->cpu_pending_ast & (AST_APC | AST_BSD);
+}
index 72d5bd2cf86d5bb850803fabca758044b489a8d0..a945a18b967fc29f129c7e9ffd7e1f913b7f0735 100644 (file)
@@ -80,6 +80,7 @@ extern void hv_release_callbacks(void);
 extern void hv_suspend(void);
 extern kern_return_t hv_task_trap(uint64_t index, uint64_t arg);
 extern kern_return_t hv_thread_trap(uint64_t index, uint64_t arg);
+extern boolean_t hv_ast_pending(void);
 
 #if defined(__cplusplus)
 }
index 502f876a4b61a3ecdb43ab8f6fe76de7954978d3..800c7b857e390eccbdb61f66a7b9b23deb97e068 100644 (file)
@@ -194,7 +194,7 @@ port_name_to_clock(
        if (ipc_port_translate_send(space, clock_name, &port) != KERN_SUCCESS) {
                return clock;
        }
-       if (ip_active(port) && (ip_kotype(port) == IKOT_CLOCK)) {
+       if (ip_kotype(port) == IKOT_CLOCK) {
                clock = (clock_t) port->ip_kobject;
        }
        ip_unlock(port);
index c3b4a4516077309f558c11a64fc33efe5c8614bf..2b1b29008a0e1b13db182fff8f1efd5a7812f721 100644 (file)
@@ -106,32 +106,17 @@ ipc_host_init(void)
        /*
         *      Allocate and set up the two host ports.
         */
-       port = ipc_port_alloc_kernel();
-       if (port == IP_NULL) {
-               panic("ipc_host_init");
-       }
-
-       ipc_kobject_set(port, (ipc_kobject_t) &realhost, IKOT_HOST_SECURITY);
-       kernel_set_special_port(&realhost, HOST_SECURITY_PORT,
-           ipc_port_make_send(port));
-
-       port = ipc_port_alloc_kernel();
-       if (port == IP_NULL) {
-               panic("ipc_host_init");
-       }
-
-       ipc_kobject_set(port, (ipc_kobject_t) &realhost, IKOT_HOST);
-       kernel_set_special_port(&realhost, HOST_PORT,
-           ipc_port_make_send(port));
+       port = ipc_kobject_alloc_port((ipc_kobject_t) &realhost, IKOT_HOST_SECURITY,
+           IPC_KOBJECT_ALLOC_MAKE_SEND);
+       kernel_set_special_port(&realhost, HOST_SECURITY_PORT, port);
 
-       port = ipc_port_alloc_kernel();
-       if (port == IP_NULL) {
-               panic("ipc_host_init");
-       }
+       port = ipc_kobject_alloc_port((ipc_kobject_t) &realhost, IKOT_HOST,
+           IPC_KOBJECT_ALLOC_MAKE_SEND);
+       kernel_set_special_port(&realhost, HOST_PORT, port);
 
-       ipc_kobject_set(port, (ipc_kobject_t) &realhost, IKOT_HOST_PRIV);
-       kernel_set_special_port(&realhost, HOST_PRIV_PORT,
-           ipc_port_make_send(port));
+       port = ipc_kobject_alloc_port((ipc_kobject_t) &realhost, IKOT_HOST_PRIV,
+           IPC_KOBJECT_ALLOC_MAKE_SEND);
+       kernel_set_special_port(&realhost, HOST_PRIV_PORT, port);
 
        /* the rest of the special ports will be set up later */
 
@@ -297,7 +282,7 @@ convert_port_to_host(
                if (ip_kotype(port) == IKOT_HOST ||
                    ip_kotype(port) == IKOT_HOST_PRIV) {
                        host = (host_t) port->ip_kobject;
-                       assert(ip_active(port));
+                       require_ip_active(port);
                }
        }
        return host;
@@ -583,7 +568,7 @@ host_set_exception_ports(
        }
 
        if (IP_VALID(new_port)) {
-               switch (new_behavior & ~MACH_EXCEPTION_CODES) {
+               switch (new_behavior & ~MACH_EXCEPTION_MASK) {
                case EXCEPTION_DEFAULT:
                case EXCEPTION_STATE:
                case EXCEPTION_STATE_IDENTITY:
index 2d63117cf7b5978f562be8470a38acc256219ff3..d2e0c17469a9c06fe7392639d33f74d117a9dae2 100644 (file)
 #include <mach/vm32_map_server.h>
 #endif
 #include <mach/thread_act_server.h>
+#include <mach/restartable_server.h>
 
 #include <mach/exc_server.h>
 #include <mach/mach_exc_server.h>
 
 #include <UserNotification/UNDReplyServer.h>
 
+#if     CONFIG_ARCADE
+#include <mach/arcade_register_server.h>
+#endif
+
 #if     CONFIG_AUDIT
 #include <kern/audit_sessionport.h>
 #endif
 #include <kern/host_notify.h>
 #include <kern/mk_timer.h>
 #include <kern/misc_protos.h>
+
+#if CONFIG_ARCADE
+#include <kern/arcade.h>
+#endif /* CONFIG_ARCADE */
+
 #include <ipc/ipc_kmsg.h>
 #include <ipc/ipc_port.h>
 #include <ipc/ipc_voucher.h>
@@ -195,6 +205,7 @@ const struct mig_subsystem *mig_e[] = {
        (const struct mig_subsystem *)&mach_voucher_subsystem,
        (const struct mig_subsystem *)&mach_voucher_attr_control_subsystem,
        (const struct mig_subsystem *)&memory_entry_subsystem,
+       (const struct mig_subsystem *)&task_restartable_subsystem,
 
 #if     XK_PROXY
        (const struct mig_subsystem *)&do_uproxy_xk_uproxy_subsystem,
@@ -207,6 +218,9 @@ const struct mig_subsystem *mig_e[] = {
 #endif  /* MCMSG && iPSC860 */
        (const struct mig_subsystem *)&catch_exc_subsystem,
        (const struct mig_subsystem *)&catch_mach_exc_subsystem,
+#if CONFIG_ARCADE
+       (const struct mig_subsystem *)&arcade_register_subsystem,
+#endif
 };
 
 void
@@ -272,7 +286,6 @@ ipc_kobject_server(
        mach_msg_size_t reply_size;
        ipc_kmsg_t reply;
        kern_return_t kr;
-       ipc_port_t *destp;
        ipc_port_t  replyp = IPC_PORT_NULL;
        mach_msg_format_0_trailer_t *trailer;
        mig_hash_t *ptr;
@@ -280,9 +293,25 @@ ipc_kobject_server(
        uint32_t exec_token;
        boolean_t exec_token_changed = FALSE;
        int request_msgh_id = request->ikm_header->msgh_id;
+       natural_t ikot;
+       ipc_port_t port;
 
+       reply = NULL;
+       port = request->ikm_header->msgh_remote_port;
+       if (IP_VALID(port)) {
+               ikot = ip_kotype(port);
+       } else {
+               ikot = IKOT_UNKNOWN;
+       }
+       if (ikot == IKOT_UEXT_OBJECT) {
+               kr = uext_server(request, &reply);
+               if ((MIG_NO_REPLY == kr) || (KERN_SUCCESS == kr)) {
+                       ipc_kmsg_trace_send(request, option);
+                       goto msgdone;
+               }
+       }
        /*
-        * Find out corresponding mig_hash entry if any
+        * Find the corresponding mig_hash entry, if any
         */
        {
                unsigned int i = (unsigned int)MIG_HASH(request_msgh_id);
@@ -297,7 +326,7 @@ ipc_kobject_server(
                        reply_size = mig_reply_size;
                } else {
                        reply_size = ptr->size;
-#if     MACH_COUNTER
+#if     MACH_COUNTERS
                        ptr->callcount++;
 #endif
                }
@@ -353,8 +382,7 @@ ipc_kobject_server(
                         * Check if the port is a task port; if it is, snapshot the
                         * task exec token before the mig routine call.
                         */
-                       ipc_port_t port = request->ikm_header->msgh_remote_port;
-                       if (IP_VALID(port) && ip_kotype(port) == IKOT_TASK) {
+                       if (ikot == IKOT_TASK) {
                                task = convert_port_to_task_with_exec_token(port, &exec_token);
                        }
 
@@ -386,31 +414,39 @@ ipc_kobject_server(
                kernel_task->messages_sent++;
        }
 
+       if (!(reply->ikm_header->msgh_bits & MACH_MSGH_BITS_COMPLEX) &&
+           ((mig_reply_error_t *) reply->ikm_header)->RetCode != KERN_SUCCESS) {
+               kr = ((mig_reply_error_t *) reply->ikm_header)->RetCode;
+       } else {
+               kr = KERN_SUCCESS;
+       }
+
+msgdone:
        /*
         *      Destroy destination. The following code differs from
         *      ipc_object_destroy in that we release the send-once
         *      right instead of generating a send-once notification
-        *      (which would bring us here again, creating a loop).
+        *      (which would bring us here again, creating a loop).
         *      It also differs in that we only expect send or
         *      send-once rights, never receive rights.
         *
         *      We set msgh_remote_port to IP_NULL so that the kmsg
         *      destroy routines don't try to destroy the port twice.
         */
-       destp = (ipc_port_t *) &request->ikm_header->msgh_remote_port;
        switch (MACH_MSGH_BITS_REMOTE(request->ikm_header->msgh_bits)) {
        case MACH_MSG_TYPE_PORT_SEND:
-               ipc_port_release_send(*destp);
+               ipc_port_release_send(request->ikm_header->msgh_remote_port);
+               request->ikm_header->msgh_remote_port = IP_NULL;
                break;
 
        case MACH_MSG_TYPE_PORT_SEND_ONCE:
-               ipc_port_release_sonce(*destp);
+               ipc_port_release_sonce(request->ikm_header->msgh_remote_port);
+               request->ikm_header->msgh_remote_port = IP_NULL;
                break;
 
        default:
                panic("ipc_kobject_server: strange destination rights");
        }
-       *destp = IP_NULL;
 
        /*
         *      Destroy voucher.  The kernel MIG servers never take ownership
@@ -423,13 +459,6 @@ ipc_kobject_server(
                request->ikm_voucher = IP_NULL;
        }
 
-       if (!(reply->ikm_header->msgh_bits & MACH_MSGH_BITS_COMPLEX) &&
-           ((mig_reply_error_t *) reply->ikm_header)->RetCode != KERN_SUCCESS) {
-               kr = ((mig_reply_error_t *) reply->ikm_header)->RetCode;
-       } else {
-               kr = KERN_SUCCESS;
-       }
-
        if ((kr == KERN_SUCCESS) || (kr == MIG_NO_REPLY)) {
                /*
                 *      The server function is responsible for the contents
@@ -449,18 +478,23 @@ ipc_kobject_server(
                ipc_kmsg_destroy(request);
        }
 
-       replyp = (ipc_port_t)reply->ikm_header->msgh_remote_port;
-
        if (kr == MIG_NO_REPLY) {
                /*
                 *      The server function will send a reply message
                 *      using the reply port right, which it has saved.
                 */
 
-               ipc_kmsg_free(reply);
-
+               if (reply) {
+                       ipc_kmsg_free(reply);
+               }
                return IKM_NULL;
-       } else if (!IP_VALID(replyp)) {
+       }
+
+       if (reply) {
+               replyp = reply->ikm_header->msgh_remote_port;
+       }
+
+       if (!IP_VALID(replyp)) {
                /*
                 *      Can't queue the reply message if the destination
                 *      (the reply port) isn't valid.
@@ -567,10 +601,122 @@ ipc_kobject_set_atomically(
 {
        assert(type == IKOT_NONE || ip_active(port));
 #if     MACH_ASSERT
-       port->ip_spares[2] = (port->ip_bits & IO_BITS_KOTYPE);
+       port->ip_spares[2] = (port->ip_object.io_bits & IO_BITS_KOTYPE);
 #endif  /* MACH_ASSERT */
-       port->ip_bits = (port->ip_bits & ~IO_BITS_KOTYPE) | type;
+       port->ip_object.io_bits = (port->ip_object.io_bits & ~IO_BITS_KOTYPE) | type;
        port->ip_kobject = kobject;
+       if (type != IKOT_NONE) {
+               /* Once set, this bit can never be unset */
+               port->ip_object.io_bits |= IO_BITS_KOBJECT;
+       }
+}
+
+/*
+ *     Routine:        ipc_kobject_alloc_port
+ *     Purpose:
+ *             Allocate a kobject port of the specified type in the kernel space.
+ *
+ *             This function never fails.
+ *
+ *     Conditions:
+ *             No locks held (memory is allocated)
+ */
+ipc_port_t
+ipc_kobject_alloc_port(
+       ipc_kobject_t           kobject,
+       ipc_kobject_type_t      type,
+       ipc_kobject_alloc_options_t     options)
+{
+       ipc_port_t port = ipc_port_alloc_kernel();
+
+       if (port == IP_NULL) {
+               panic("ipc_kobject_alloc_port(): failed to allocate port");
+       }
+
+       ipc_kobject_set_atomically(port, kobject, type);
+
+       if (options & IPC_KOBJECT_ALLOC_MAKE_SEND) {
+               ipc_port_make_send_locked(port);
+       }
+       if (options & IPC_KOBJECT_ALLOC_NSREQUEST) {
+               ipc_port_make_sonce_locked(port);
+               port->ip_nsrequest = port;
+       }
+       if (options & IPC_KOBJECT_ALLOC_NO_GRANT) {
+               port->ip_no_grant = 1;
+       }
+       if (options & IPC_KOBJECT_ALLOC_IMMOVABLE_SEND) {
+               port->ip_immovable_send = 1;
+       }
+
+       return port;
+}
+
+/*
+ *     Routine:        ipc_kobject_make_send_lazy_alloc_port
+ *     Purpose:
+ *             Make a send right for a kobject port.
+ *
+ *             A location owning this port is passed in port_store.
+ *             If no port exists, a port is made lazily.
+ *
+ *             A send right is made for the port; if it is the first send
+ *             right (which can happen repeatedly over the port's lifetime),
+ *             the no-more-senders notification is rearmed.
+ *
+ *             When the notification is armed, the kobject must donate
+ *             one of its references to the port. It is expected that the
+ *             no-more-senders notification will consume this reference.
+ *
+ *     Returns:
+ *             TRUE if a notification was armed
+ *             FALSE otherwise
+ *
+ *     Conditions:
+ *             Nothing is locked, memory can be allocated.
+ *             The caller must be able to donate a kobject reference to the port.
+ */
+boolean_t
+ipc_kobject_make_send_lazy_alloc_port(
+       ipc_port_t              *port_store,
+       ipc_kobject_t           kobject,
+       ipc_kobject_type_t      type)
+{
+       ipc_port_t port, previous;
+       boolean_t rc = FALSE;
+
+       port = os_atomic_load(port_store, dependency);
+
+       if (!IP_VALID(port)) {
+               port = ipc_kobject_alloc_port(kobject, type,
+                   IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
+               if (os_atomic_cmpxchgv(port_store, IP_NULL, port, &previous, release)) {
+                       return TRUE;
+               }
+
+               // undo what ipc_kobject_alloc_port() did above
+               port->ip_nsrequest = IP_NULL;
+               port->ip_mscount = 0;
+               port->ip_sorights = 0;
+               port->ip_srights = 0;
+               ip_release(port);
+               ip_release(port);
+               ipc_port_dealloc_kernel(port);
+
+               port = previous;
+       }
+
+       ip_lock(port);
+       ipc_port_make_send_locked(port);
+       if (port->ip_srights == 1) {
+               ipc_port_make_sonce_locked(port);
+               assert(port->ip_nsrequest == IP_NULL);
+               port->ip_nsrequest = port;
+               rc = TRUE;
+       }
+       ip_unlock(port);
+
+       return rc;
 }
 
 /*
@@ -616,7 +762,7 @@ ipc_kobject_notify(
        mach_msg_header_t *reply_header)
 {
        mach_msg_max_trailer_t * trailer;
-       ipc_port_t port = (ipc_port_t) request_header->msgh_remote_port;
+       ipc_port_t port = request_header->msgh_remote_port;
 
        ((mig_reply_error_t *) reply_header)->RetCode = MIG_NO_REPLY;
 
@@ -706,6 +852,7 @@ ipc_kobject_notify(
        case IKOT_IOKIT_OBJECT:
        case IKOT_IOKIT_CONNECT:
        case IKOT_IOKIT_IDENT:
+       case IKOT_UEXT_OBJECT:
        {
                return iokit_notify(request_header);
        }
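A caller-side sketch of the contract documented above for ipc_kobject_make_send_lazy_alloc_port(). The 'foo' kobject is hypothetical; the shape mirrors the convert_semaphore_to_port() and convert_mig_object_to_port() conversions later in this commit:

	/* consumes one donated foo reference, returns a send right */
	ipc_port_t
	convert_foo_to_port(foo_t foo)
	{
		if (foo == FOO_NULL) {
			return IP_NULL;
		}
		/*
		 * TRUE: this made the first send right, and the donated
		 * reference is parked on the port until the no-more-senders
		 * notification fires.
		 * FALSE: a send right already existed, so give the donated
		 * reference back.
		 */
		if (!ipc_kobject_make_send_lazy_alloc_port(&foo->port,
		    (ipc_kobject_t)foo, IKOT_FOO)) {
			foo_dereference(foo);
		}
		return foo->port;
	}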
index 95f1507762f77a414b6b8ac930c32dfd10440f9b..4431f29ca9f88da1c4df8bf08b9ef995ac3b2648 100644 (file)
@@ -130,17 +130,16 @@ typedef natural_t       ipc_kobject_type_t;
 #define IKOT_VOUCHER_ATTR_CONTROL       38
 #define IKOT_WORK_INTERVAL              39
 #define IKOT_UX_HANDLER                 40
+#define IKOT_UEXT_OBJECT                41
+#define IKOT_ARCADE_REG                 42
 
 /*
  * Add new entries here and adjust IKOT_UNKNOWN.
  * Please keep ipc/ipc_object.c:ikot_print_array up to date.
  */
-#define IKOT_UNKNOWN                    41      /* magic catchall       */
+#define IKOT_UNKNOWN                    43      /* magic catchall       */
 #define IKOT_MAX_TYPE   (IKOT_UNKNOWN+1)        /* # of IKOT_ types    */
 
-
-#define is_ipc_kobject(ikot)    ((ikot) != IKOT_NONE)
-
 #ifdef MACH_KERNEL_PRIVATE
 
 /*
@@ -149,27 +148,56 @@ typedef natural_t       ipc_kobject_type_t;
  */
 
 /* Dispatch a kernel server function */
-extern ipc_kmsg_t       ipc_kobject_server(
-       ipc_kmsg_t           request,
-       mach_msg_option_t    option);
+extern ipc_kmsg_t ipc_kobject_server(
+       ipc_kmsg_t                  request,
+       mach_msg_option_t           option);
 
 /* Make a port represent a kernel object of the given type */
-extern void             ipc_kobject_set(
-       ipc_port_t                      port,
-       ipc_kobject_t           kobject,
-       ipc_kobject_type_t      type);
+extern void ipc_kobject_set(
+       ipc_port_t                  port,
+       ipc_kobject_t               kobject,
+       ipc_kobject_type_t          type);
+
+extern void ipc_kobject_set_atomically(
+       ipc_port_t                  port,
+       ipc_kobject_t               kobject,
+       ipc_kobject_type_t          type);
+
+__options_decl(ipc_kobject_alloc_options_t, uint32_t, {
+       /* Just make the naked port */
+       IPC_KOBJECT_ALLOC_NONE      = 0x00000000,
+       /* Make a send right */
+       IPC_KOBJECT_ALLOC_MAKE_SEND = 0x00000001,
+       /* Register for no-more-senders */
+       IPC_KOBJECT_ALLOC_NSREQUEST = 0x00000002,
+       /* Make it a no-grant port */
+       IPC_KOBJECT_ALLOC_NO_GRANT  = 0x00000004,
+       /* Make all the send rights immovable */
+       IPC_KOBJECT_ALLOC_IMMOVABLE_SEND = 0x00000008,
+});
+
+/* Allocates a kobject port, never fails */
+extern ipc_port_t ipc_kobject_alloc_port(
+       ipc_kobject_t               kobject,
+       ipc_kobject_type_t          type,
+       ipc_kobject_alloc_options_t options);
+
+/* Makes a send right, lazily allocating a kobject port, arming for no-senders, never fails */
+extern boolean_t ipc_kobject_make_send_lazy_alloc_port(
+       ipc_port_t                 *port_store,
+       ipc_kobject_t               kobject,
+       ipc_kobject_type_t          type) __result_use_check;
 
-extern void             ipc_kobject_set_atomically(
-       ipc_port_t                      port,
-       ipc_kobject_t           kobject,
-       ipc_kobject_type_t      type);
 
 /* Release any kernel object resources associated with a port */
-extern void             ipc_kobject_destroy(
-       ipc_port_t                      port);
+extern void ipc_kobject_destroy(
+       ipc_port_t                  port);
 
 #define null_conversion(port)   (port)
 
+extern kern_return_t
+uext_server(ipc_kmsg_t request, ipc_kmsg_t * reply);
+
 #endif /* MACH_KERNEL_PRIVATE */
 
 #endif /* KERNEL_PRIVATE */
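The options declared with __options_decl() above are independent single-bit flags, so callers compose them with bitwise OR and ipc_kobject_alloc_port() tests each bit on its own; fileport_alloc() later in this commit does exactly this:

	ipc_port_t port = ipc_kobject_alloc_port((ipc_kobject_t)fg, IKOT_FILEPORT,
	    IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
	/* port now carries a send right and an armed no-more-senders request */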
index 4770d8b878a4ca21088af0011b3ff42b53594be0..722384a00bf235c0e359bc26abbea8e727d1e4ea 100644 (file)
@@ -420,10 +420,9 @@ mach_msg_rpc_from_kernel_body(
 
        for (;;) {
                ipc_mqueue_t mqueue;
-               ipc_object_t object;
 
                assert(reply->ip_in_pset == 0);
-               assert(ip_active(reply));
+               require_ip_active(reply);
 
                /* JMM - why this check? */
                if (!self->active && !self->inspection) {
@@ -445,8 +444,7 @@ mach_msg_rpc_from_kernel_body(
                kmsg = self->ith_kmsg;
                seqno = self->ith_seqno;
 
-               __IGNORE_WCASTALIGN(object = (ipc_object_t) reply);
-               mach_msg_receive_results_complete(object);
+               mach_msg_receive_results_complete(ip_to_object(reply));
 
                if (mr == MACH_MSG_SUCCESS) {
                        break;
@@ -586,6 +584,13 @@ mach_msg_destroy_from_kernel_proper(mach_msg_header_t *msg)
                        kfree(dsc->address, (vm_size_t) dsc->count * sizeof(mach_port_t));
                        break;
                }
+               case MACH_MSG_GUARDED_PORT_DESCRIPTOR: {
+                       mach_msg_guarded_port_descriptor_t *dsc = (mach_msg_guarded_port_descriptor_t *)&daddr->guarded_port;
+                       if (IO_VALID((ipc_object_t) dsc->name)) {
+                               ipc_object_destroy((ipc_object_t) dsc->name, dsc->disposition);
+                       }
+                       break;
+               }
                default:
                        break;
                }
@@ -633,7 +638,7 @@ mach_msg_overwrite(
 
                if ((send_size & 3) ||
                    send_size < sizeof(mach_msg_header_t) ||
-                   (send_size < sizeof(mach_msg_body_t) && (msg->msgh_bits & MACH_MSGH_BITS_COMPLEX))) {
+                   (send_size < sizeof(mach_msg_base_t) && (msg->msgh_bits & MACH_MSGH_BITS_COMPLEX))) {
                        return MACH_SEND_MSG_TOO_SMALL;
                }
 
@@ -962,7 +967,13 @@ mig_object_deallocate(
        mig_object_t    mig_object)
 {
        assert(mig_object != MIG_OBJECT_NULL);
-       mig_object->pVtbl->Release((IMIGObject *)mig_object);
+       ipc_port_t port = mig_object->port;
+       if (mig_object->pVtbl->Release((IMIGObject *)mig_object) == 0) {
+               if (IP_VALID(port)) {
+                       assert(!port->ip_srights);
+                       ipc_port_dealloc_kernel(port);
+               }
+       }
 }
 
 /*
@@ -981,56 +992,20 @@ ipc_port_t
 convert_mig_object_to_port(
        mig_object_t    mig_object)
 {
-       ipc_port_t      port;
-       boolean_t       deallocate = TRUE;
-
        if (mig_object == MIG_OBJECT_NULL) {
                return IP_NULL;
        }
 
-       port = mig_object->port;
-       while ((port == IP_NULL) ||
-           ((port = ipc_port_make_send(port)) == IP_NULL)) {
-               ipc_port_t      previous;
-
-               /*
-                * Either the port was never set up, or it was just
-                * deallocated out from under us by the no-senders
-                * processing.  In either case, we must:
-                *      Attempt to make one
-                *      Arrange for no senders
-                *      Try to atomically register it with the object
-                *              Destroy it if we are raced.
-                */
-               port = ipc_port_alloc_kernel();
-               ip_lock(port);
-               ipc_kobject_set_atomically(port,
-                   (ipc_kobject_t) mig_object,
-                   IKOT_MIG);
-
-               /* make a sonce right for the notification */
-               port->ip_sorights++;
-               ip_reference(port);
-
-               ipc_port_nsrequest(port, 1, port, &previous);
-               /* port unlocked */
-
-               assert(previous == IP_NULL);
-
-               if (OSCompareAndSwapPtr((void *)IP_NULL, (void *)port,
-                   (void * volatile *)&mig_object->port)) {
-                       deallocate = FALSE;
-               } else {
-                       ipc_port_dealloc_kernel(port);
-                       port = mig_object->port;
-               }
-       }
-
-       if (deallocate) {
-               mig_object->pVtbl->Release((IMIGObject *)mig_object);
+       /*
+        * make a send right and donate our reference for mig_object_no_senders
+        * if this is the first send right
+        */
+       if (!ipc_kobject_make_send_lazy_alloc_port(&mig_object->port,
+           (ipc_kobject_t) mig_object, IKOT_MIG)) {
+               mig_object_deallocate(mig_object);
        }
 
-       return port;
+       return mig_object->port;
 }
 
 
@@ -1082,59 +1057,18 @@ convert_port_to_mig_object(
  *             Base implementation of a no-senders notification handler
  *             for MIG objects. If there truly are no more senders, must
  *             destroy the port and drop its reference on the object.
- *     Returns:
- *             TRUE  - port deallocate and reference dropped
- *             FALSE - more senders arrived, re-registered for notification
  *     Conditions:
  *             Nothing locked.
  */
-
-boolean_t
+void
 mig_object_no_senders(
-       ipc_port_t              port,
-       mach_port_mscount_t     mscount)
+       ipc_port_t              port)
 {
-       mig_object_t            mig_object;
-
-       ip_lock(port);
-       if (port->ip_mscount > mscount) {
-               ipc_port_t      previous;
-
-               /*
-                * Somebody created new send rights while the
-                * notification was in-flight.  Just create a
-                * new send-once right and re-register with
-                * the new (higher) mscount threshold.
-                */
-               /* make a sonce right for the notification */
-               port->ip_sorights++;
-               ip_reference(port);
-               ipc_port_nsrequest(port, mscount, port, &previous);
-               /* port unlocked */
-
-               assert(previous == IP_NULL);
-               return FALSE;
-       }
-
-       /*
-        * Clear the port pointer while we have it locked.
-        */
-       mig_object = (mig_object_t)port->ip_kobject;
-       mig_object->port = IP_NULL;
+       require_ip_active(port);
+       assert(IKOT_MIG == ip_kotype(port));
 
-       /*
-        * Bring the sequence number and mscount in
-        * line with ipc_port_destroy assertion.
-        */
-       port->ip_mscount = 0;
-       port->ip_messages.imq_seqno = 0;
-       ipc_port_destroy(port); /* releases lock */
-
-       /*
-        * Release the port's reference on the object.
-        */
-       mig_object->pVtbl->Release((IMIGObject *)mig_object);
-       return TRUE;
+       /* consume the reference donated by convert_mig_object_to_port */
+       mig_object_deallocate((mig_object_t)port->ip_kobject);
 }
 
 /*
index 3fb5a8cba8dc95117423a895f7d5e7e081125b96..a4fad67e18dd2356c0d0a98b70488f97358dd4ac 100644 (file)
@@ -167,13 +167,20 @@ extern mach_msg_return_t mach_msg_send_from_kernel_with_options_legacy(
        mach_msg_size_t         send_size,
        mach_msg_option_t       option,
        mach_msg_timeout_t      timeout_val);
-#endif /* XNU_KERNEL_PRIVATE */
 
+extern mach_msg_return_t mach_msg_send_from_kernel_with_options(
+       mach_msg_header_t       *msg,
+       mach_msg_size_t         send_size,
+       mach_msg_option_t       option,
+       mach_msg_timeout_t      timeout_val)
+__XNU_INTERNAL(mach_msg_send_from_kernel_with_options);
+#else
 extern mach_msg_return_t mach_msg_send_from_kernel_with_options(
        mach_msg_header_t       *msg,
        mach_msg_size_t         send_size,
        mach_msg_option_t       option,
        mach_msg_timeout_t      timeout_val);
+#endif /* XNU_KERNEL_PRIVATE */
 
 __END_DECLS
 
@@ -229,9 +236,8 @@ extern mig_object_t convert_port_to_mig_object(
        ipc_port_t              port,
        const MIGIID            *iid);
 
-boolean_t mig_object_no_senders(
-       ipc_port_t              port,
-       mach_port_mscount_t     mscount);
+extern void mig_object_no_senders(
+       ipc_port_t              port);
 
 #endif  /* MACH_KERNEL_PRIVATE */
 
index 655d385e89cd8a1266f780483c8516e8f57bcd9e..16c3c5a51b560b62f62d4a504a338001e180db1a 100644 (file)
@@ -52,27 +52,8 @@ extern void fileport_releasefg(struct fileglob *);
 ipc_port_t
 fileport_alloc(struct fileglob *fg)
 {
-       ipc_port_t fileport;
-       ipc_port_t sendport;
-       ipc_port_t notifyport;
-
-       fileport = ipc_port_alloc_kernel();
-       if (fileport == IP_NULL) {
-               goto out;
-       }
-
-       ipc_kobject_set(fileport, (ipc_kobject_t)fg, IKOT_FILEPORT);
-       ip_lock(fileport); /* unlocked by ipc_port_nsrequest */
-       notifyport = ipc_port_make_sonce_locked(fileport);
-       ipc_port_nsrequest(fileport, 1, notifyport, &notifyport);
-
-       sendport = ipc_port_make_send(fileport);
-       if (!IP_VALID(sendport)) {
-               panic("Couldn't allocate send right for fileport!\n");
-       }
-
-out:
-       return fileport;
+       return ipc_kobject_alloc_port((ipc_kobject_t)fg, IKOT_FILEPORT,
+                  IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
 }
 
 
@@ -174,7 +155,8 @@ fileport_invoke(task_t task, mach_port_name_t name,
        struct fileglob *fg;
 
        kr = ipc_object_copyin(task->itk_space, name,
-           MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&fileport);
+           MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&fileport, 0, NULL,
+           IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
        if (kr != KERN_SUCCESS) {
                return kr;
        }
index d09b42157bac44b9004769995da2ae6278d3b098..7f65888d55716088ecc58deff1108b776a8edc81 100644 (file)
@@ -65,8 +65,7 @@ port_name_to_semaphore(
                return KERN_INVALID_NAME;
        }
 
-       kr = ipc_object_translate(current_space(), name, MACH_PORT_RIGHT_SEND,
-           (ipc_object_t *) &kern_port);
+       kr = ipc_port_translate_send(current_space(), name, &kern_port);
        if (kr != KERN_SUCCESS) {
                *semaphorep = SEMAPHORE_NULL;
                return kr;
@@ -108,7 +107,7 @@ convert_port_to_semaphore(ipc_port_t port)
                 * keeps the semaphore bound to the port (and active).
                 */
                if (ip_kotype(port) == IKOT_SEMAPHORE) {
-                       assert(ip_active(port));
+                       require_ip_active(port);
                        semaphore = (semaphore_t) port->ip_kobject;
                        semaphore_reference(semaphore);
                        return semaphore;
@@ -132,47 +131,19 @@ convert_port_to_semaphore(ipc_port_t port)
 ipc_port_t
 convert_semaphore_to_port(semaphore_t semaphore)
 {
-       ipc_port_t port, send;
-
        if (semaphore == SEMAPHORE_NULL) {
                return IP_NULL;
        }
 
-       /* caller is donating a reference */
-       port = semaphore->port;
-
-       if (!IP_VALID(port)) {
-               port = ipc_port_alloc_kernel();
-               assert(IP_VALID(port));
-               ipc_kobject_set_atomically(port, (ipc_kobject_t) semaphore, IKOT_SEMAPHORE);
-
-               /* If we lose the race, deallocate and pick up the other guy's port */
-               if (!OSCompareAndSwapPtr(IP_NULL, port, &semaphore->port)) {
-                       ipc_port_dealloc_kernel(port);
-                       port = semaphore->port;
-                       assert(ip_kotype(port) == IKOT_SEMAPHORE);
-                       assert(port->ip_kobject == (ipc_kobject_t)semaphore);
-               }
-       }
-
-       ip_lock(port);
-       assert(ip_active(port));
-       send = ipc_port_make_send_locked(port);
-
-       if (1 == port->ip_srights) {
-               ipc_port_t old_notify;
-
-               /* transfer our ref to the port, and arm the no-senders notification */
-               assert(IP_NULL == port->ip_nsrequest);
-               ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify);
-               /* port unlocked */
-               assert(IP_NULL == old_notify);
-       } else {
-               /* piggyback on the existing port reference, so consume ours */
-               ip_unlock(port);
+       /*
+        * make a send right and donate our reference for
+        * semaphore_notify if this is the first send right
+        */
+       if (!ipc_kobject_make_send_lazy_alloc_port(&semaphore->port,
+           (ipc_kobject_t) semaphore, IKOT_SEMAPHORE)) {
                semaphore_dereference(semaphore);
        }
-       return send;
+       return semaphore->port;
 }
 
 /*
@@ -194,13 +165,11 @@ semaphore_notify(mach_msg_header_t *msg)
 {
        mach_no_senders_notification_t *notification = (void *)msg;
        ipc_port_t port = notification->not_header.msgh_remote_port;
-       semaphore_t semaphore;
 
-       assert(ip_active(port));
+       require_ip_active(port);
        assert(IKOT_SEMAPHORE == ip_kotype(port));
-       semaphore = (semaphore_t)port->ip_kobject;
 
-       semaphore_dereference(semaphore);
+       semaphore_dereference((semaphore_t)port->ip_kobject);
 }
 
 lock_set_t
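Taken together, the two hunks above form the whole lifecycle of the donated reference (the delivery of the notification through ipc_kobject_notify() is the existing kernel mechanism, not part of this diff):

	/*
	 * 1. convert_semaphore_to_port() consumes the caller's semaphore
	 *    reference: the first send right parks it on the port (armed
	 *    no-more-senders request); later conversions hand it back via
	 *    semaphore_dereference().
	 * 2. When the last send right dies, the no-more-senders message
	 *    reaches semaphore_notify(), which drops the parked reference
	 *    so the semaphore itself can be destroyed.
	 */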
index 03fdc53bf28246622f80cbbccd32d950d5fef6c2..7d0384cf272ea27fed789306480749e068b19e43 100644 (file)
@@ -431,10 +431,8 @@ ipc_task_reset(
        struct label *unset_label = mac_exc_create_label();
 #endif
 
-       new_kport = ipc_port_alloc_kernel();
-       if (new_kport == IP_NULL) {
-               panic("ipc_task_reset");
-       }
+       new_kport = ipc_kobject_alloc_port((ipc_kobject_t)task, IKOT_TASK,
+           IPC_KOBJECT_ALLOC_MAKE_SEND);
 
        itk_lock(task);
 
@@ -443,6 +441,7 @@ ipc_task_reset(
        if (old_kport == IP_NULL) {
                /* the task is already terminated (can this happen?) */
                itk_unlock(task);
+               ipc_port_release_send(new_kport);
                ipc_port_dealloc_kernel(new_kport);
 #if CONFIG_MACF
                mac_exc_free_label(unset_label);
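The added ipc_port_release_send() is needed because ipc_kobject_alloc_port(..., IPC_KOBJECT_ALLOC_MAKE_SEND) now hands back a port whose send right is already made, so the bail-out path must undo both rights, in order:

	ipc_port_release_send(new_kport);       /* drop the pre-made send right */
	ipc_port_dealloc_kernel(new_kport);     /* then destroy the receive right */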
@@ -450,9 +449,8 @@ ipc_task_reset(
                return;
        }
 
-       task->itk_self = new_kport;
        old_sself = task->itk_sself;
-       task->itk_sself = ipc_port_make_send(new_kport);
+       task->itk_sself = task->itk_self = new_kport;
 
        /* Set the old kport to IKOT_NONE and update the exec token while under the port lock */
        ip_lock(old_kport);
@@ -460,8 +458,6 @@ ipc_task_reset(
        task->exec_token += 1;
        ip_unlock(old_kport);
 
-       ipc_kobject_set(new_kport, (ipc_kobject_t) task, IKOT_TASK);
-
        for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) {
                old_exc_actions[i] = IP_NULL;
 
@@ -519,18 +515,13 @@ ipc_thread_init(
 {
        ipc_port_t      kport;
 
-       kport = ipc_port_alloc_kernel();
-       if (kport == IP_NULL) {
-               panic("ipc_thread_init");
-       }
+       kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, IKOT_THREAD,
+           IPC_KOBJECT_ALLOC_MAKE_SEND);
 
-       thread->ith_self = kport;
-       thread->ith_sself = ipc_port_make_send(kport);
+       thread->ith_sself = thread->ith_self = kport;
        thread->ith_special_reply_port = NULL;
        thread->exc_actions = NULL;
 
-       ipc_kobject_set(kport, (ipc_kobject_t)thread, IKOT_THREAD);
-
 #if IMPORTANCE_INHERITANCE
        thread->ith_assertions = 0;
 #endif
@@ -582,6 +573,11 @@ ipc_thread_disable(
        if (kport != IP_NULL) {
                ipc_kobject_set(kport, IKO_NULL, IKOT_NONE);
        }
+
+       /* unbind the thread special reply port */
+       if (IP_VALID(thread->ith_special_reply_port)) {
+               ipc_port_unbind_special_reply_port(thread, TRUE);
+       }
 }
 
 /*
@@ -623,11 +619,6 @@ ipc_thread_terminate(
        assert(thread->ith_assertions == 0);
 #endif
 
-       /* unbind the thread special reply port */
-       if (IP_VALID(thread->ith_special_reply_port)) {
-               ipc_port_unbind_special_reply_port(thread, TRUE);
-       }
-
        assert(ipc_kmsg_queue_empty(&thread->ith_messages));
 
        if (thread->ith_rpc_reply != IP_NULL) {
@@ -663,18 +654,18 @@ ipc_thread_reset(
        struct label *new_label = mac_exc_create_label();
 #endif
 
-       new_kport = ipc_port_alloc_kernel();
-       if (new_kport == IP_NULL) {
-               panic("ipc_task_reset");
-       }
+       new_kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, IKOT_THREAD,
+           IPC_KOBJECT_ALLOC_MAKE_SEND);
 
        thread_mtx_lock(thread);
 
        old_kport = thread->ith_self;
+       old_sself = thread->ith_sself;
 
        if (old_kport == IP_NULL && thread->inspection == FALSE) {
                /* the thread is already terminated (can this happen?) */
                thread_mtx_unlock(thread);
+               ipc_port_release_send(new_kport);
                ipc_port_dealloc_kernel(new_kport);
 #if CONFIG_MACF
                mac_exc_free_label(new_label);
@@ -682,13 +673,10 @@ ipc_thread_reset(
                return;
        }
 
-       thread->ith_self = new_kport;
-       old_sself = thread->ith_sself;
-       thread->ith_sself = ipc_port_make_send(new_kport);
+       thread->ith_sself = thread->ith_self = new_kport;
        if (old_kport != IP_NULL) {
                ipc_kobject_set(old_kport, IKO_NULL, IKOT_NONE);
        }
-       ipc_kobject_set(new_kport, (ipc_kobject_t) thread, IKOT_THREAD);
 
        /*
         * Only ports that were set by root-owned processes
@@ -754,6 +742,7 @@ ipc_port_t
 retrieve_task_self_fast(
        task_t          task)
 {
+       __assert_only ipc_port_t sright;
        ipc_port_t port;
 
        assert(task == current_task());
@@ -763,12 +752,8 @@ retrieve_task_self_fast(
 
        if ((port = task->itk_sself) == task->itk_self) {
                /* no interposing */
-
-               ip_lock(port);
-               assert(ip_active(port));
-               ip_reference(port);
-               port->ip_srights++;
-               ip_unlock(port);
+               sright = ipc_port_copy_send(port);
+               assert(sright == port);
        } else {
                port = ipc_port_copy_send(port);
        }
@@ -793,6 +778,7 @@ ipc_port_t
 retrieve_thread_self_fast(
        thread_t                thread)
 {
+       __assert_only ipc_port_t sright;
        ipc_port_t port;
 
        assert(thread == current_thread());
@@ -803,12 +789,8 @@ retrieve_thread_self_fast(
 
        if ((port = thread->ith_sself) == thread->ith_self) {
                /* no interposing */
-
-               ip_lock(port);
-               assert(ip_active(port));
-               ip_reference(port);
-               port->ip_srights++;
-               ip_unlock(port);
+               sright = ipc_port_copy_send(port);
+               assert(sright == port);
        } else {
                port = ipc_port_copy_send(port);
        }
@@ -886,7 +868,7 @@ mach_reply_port(
        mach_port_name_t name;
        kern_return_t kr;
 
-       kr = ipc_port_alloc(current_task()->itk_space, &name, &port);
+       kr = ipc_port_alloc(current_task()->itk_space, FALSE, &name, &port);
        if (kr == KERN_SUCCESS) {
                ip_unlock(port);
        } else {
@@ -913,7 +895,6 @@ thread_get_special_reply_port(
 {
        ipc_port_t port;
        mach_port_name_t name;
-       mach_port_name_t send_name;
        kern_return_t kr;
        thread_t thread = current_thread();
 
@@ -925,25 +906,10 @@ thread_get_special_reply_port(
                }
        }
 
-       kr = ipc_port_alloc(current_task()->itk_space, &name, &port);
+       kr = ipc_port_alloc(current_task()->itk_space, TRUE, &name, &port);
        if (kr == KERN_SUCCESS) {
                ipc_port_bind_special_reply_port_locked(port);
-
-               /* Make a send right and insert it in the space at specified name */
-               ipc_port_make_send_locked(port);
                ip_unlock(port);
-               send_name = ipc_port_copyout_name_send(port, current_task()->itk_space, name);
-               /*
-                * If insertion of send right failed, userland is doing something bad, error out.
-                * The space was marked inactive or the receive right just inserted above at the
-                * given name was moved, in either case do not try to deallocate the receive right.
-                */
-               if (send_name == MACH_PORT_NULL || send_name == MACH_PORT_DEAD) {
-                       if (IP_VALID(thread->ith_special_reply_port)) {
-                               ipc_port_unbind_special_reply_port(thread, TRUE);
-                       }
-                       name = MACH_PORT_NULL;
-               }
        } else {
                name = MACH_PORT_NULL;
        }
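The boolean added to ipc_port_alloc() is presumably a make-send-right flag: passing TRUE asks for the send right to be created and bound under the same name at allocation time, which is what lets this hunk delete the fallible ipc_port_copyout_name_send() fallback. A sketch of the two call sites in this file, with the signature assumed from the diff:

	/* mach_reply_port(): receive right only */
	kr = ipc_port_alloc(current_task()->itk_space, FALSE, &name, &port);

	/* thread_get_special_reply_port(): receive and send right, one name */
	kr = ipc_port_alloc(current_task()->itk_space, TRUE, &name, &port);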
@@ -971,8 +937,9 @@ ipc_port_bind_special_reply_port_locked(
        thread->ith_special_reply_port = port;
        port->ip_specialreply = 1;
        port->ip_sync_link_state = PORT_SYNC_LINK_ANY;
+       port->ip_messages.imq_srp_owner_thread = thread;
 
-       reset_ip_srp_bits(port);
+       ipc_special_reply_port_bits_reset(port);
 }
 
 /*
@@ -1003,7 +970,7 @@ ipc_port_unbind_special_reply_port(
 
        thread->ith_special_reply_port = NULL;
        ipc_port_adjust_special_reply_port_locked(special_reply_port, NULL,
-           IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY, FALSE);
+           IPC_PORT_ADJUST_UNLINK_THREAD, FALSE);
        /* port unlocked */
 
        ip_release(special_reply_port);
@@ -1214,6 +1181,10 @@ task_set_special_port(
                return KERN_INVALID_ARGUMENT;
        }
 
+       if (task_is_driver(current_task())) {
+               return KERN_NO_ACCESS;
+       }
+
        switch (which) {
        case TASK_KERNEL_PORT:
                whichp = &task->itk_sself;
@@ -1546,21 +1517,32 @@ convert_port_to_locked_task_inspect(ipc_port_t port)
        return TASK_INSPECT_NULL;
 }
 
-
-/*
- *     Routine:        convert_port_to_task
- *     Purpose:
- *             Convert from a port to a task.
- *             Doesn't consume the port ref; produces a task ref,
- *             which may be null.
- *     Conditions:
- *             Nothing locked.
- */
-task_t
-convert_port_to_task(
-       ipc_port_t              port)
+static task_t
+convert_port_to_task_locked(
+       ipc_port_t              port,
+       uint32_t                *exec_token)
 {
-       return convert_port_to_task_with_exec_token(port, NULL);
+       task_t          task = TASK_NULL;
+
+       ip_lock_held(port);
+       require_ip_active(port);
+
+       if (ip_kotype(port) == IKOT_TASK) {
+               task_t ct = current_task();
+               task = (task_t)port->ip_kobject;
+               assert(task != TASK_NULL);
+
+               if (task_conversion_eval(ct, task)) {
+                       return TASK_NULL;
+               }
+
+               if (exec_token) {
+                       *exec_token = task->exec_token;
+               }
+               task_reference_internal(task);
+       }
+
+       return task;
 }
 
 /*
@@ -1582,30 +1564,32 @@ convert_port_to_task_with_exec_token(
 
        if (IP_VALID(port)) {
                ip_lock(port);
-
-               if (ip_active(port) &&
-                   ip_kotype(port) == IKOT_TASK) {
-                       task_t ct = current_task();
-                       task = (task_t)port->ip_kobject;
-                       assert(task != TASK_NULL);
-
-                       if (task_conversion_eval(ct, task)) {
-                               ip_unlock(port);
-                               return TASK_NULL;
-                       }
-
-                       if (exec_token) {
-                               *exec_token = task->exec_token;
-                       }
-                       task_reference_internal(task);
+               if (ip_active(port)) {
+                       task = convert_port_to_task_locked(port, exec_token);
                }
-
                ip_unlock(port);
        }
 
        return task;
 }
 
+/*
+ *     Routine:        convert_port_to_task
+ *     Purpose:
+ *             Convert from a port to a task.
+ *             Doesn't consume the port ref; produces a task ref,
+ *             which may be null.
+ *     Conditions:
+ *             Nothing locked.
+ */
+task_t
+convert_port_to_task(
+       ipc_port_t              port)
+{
+       return convert_port_to_task_with_exec_token(port, NULL);
+}
+
+
 /*
  *     Routine:        convert_port_to_task_name
  *     Purpose:
@@ -1639,6 +1623,25 @@ convert_port_to_task_name(
        return task;
 }
 
+static task_inspect_t
+convert_port_to_task_inspect_locked(
+       ipc_port_t              port)
+{
+       task_inspect_t task = TASK_INSPECT_NULL;
+
+       ip_lock_held(port);
+       require_ip_active(port);
+
+       if (ip_kotype(port) == IKOT_TASK) {
+               task = (task_inspect_t)port->ip_kobject;
+               assert(task != TASK_INSPECT_NULL);
+
+               task_reference_internal(task);
+       }
+
+       return task;
+}
+
 /*
  *     Routine:        convert_port_to_task_inspect
  *     Purpose:
@@ -1656,15 +1659,9 @@ convert_port_to_task_inspect(
 
        if (IP_VALID(port)) {
                ip_lock(port);
-
-               if (ip_active(port) &&
-                   ip_kotype(port) == IKOT_TASK) {
-                       task = (task_inspect_t)port->ip_kobject;
-                       assert(task != TASK_INSPECT_NULL);
-
-                       task_reference_internal(task);
+               if (ip_active(port)) {
+                       task = convert_port_to_task_inspect_locked(port);
                }
-
                ip_unlock(port);
        }
 
@@ -1814,29 +1811,54 @@ convert_port_to_map(
  *             Nothing locked.
  */
 
-thread_t
-convert_port_to_thread(
-       ipc_port_t              port)
+static thread_t
+convert_port_to_thread_locked(
+       ipc_port_t               port,
+       port_to_thread_options_t options)
 {
        thread_t        thread = THREAD_NULL;
 
-       if (IP_VALID(port)) {
-               ip_lock(port);
+       ip_lock_held(port);
+       require_ip_active(port);
 
-               if (ip_active(port) &&
-                   ip_kotype(port) == IKOT_THREAD) {
-                       thread = (thread_t)port->ip_kobject;
-                       assert(thread != THREAD_NULL);
+       if (ip_kotype(port) == IKOT_THREAD) {
+               thread = (thread_t)port->ip_kobject;
+               assert(thread != THREAD_NULL);
 
+               if (options & PORT_TO_THREAD_NOT_CURRENT_THREAD) {
+                       if (thread == current_thread()) {
+                               return THREAD_NULL;
+                       }
+               }
+
+               if (options & PORT_TO_THREAD_IN_CURRENT_TASK) {
+                       if (thread->task != current_task()) {
+                               return THREAD_NULL;
+                       }
+               } else {
                        /* Use task conversion rules for thread control conversions */
                        if (task_conversion_eval(current_task(), thread->task) != KERN_SUCCESS) {
-                               ip_unlock(port);
                                return THREAD_NULL;
                        }
-
-                       thread_reference_internal(thread);
                }
 
+               thread_reference_internal(thread);
+       }
+
+       return thread;
+}
+
+thread_t
+convert_port_to_thread(
+       ipc_port_t              port)
+{
+       thread_t        thread = THREAD_NULL;
+
+       if (IP_VALID(port)) {
+               ip_lock(port);
+               if (ip_active(port)) {
+                       thread = convert_port_to_thread_locked(port, PORT_TO_THREAD_NONE);
+               }
                ip_unlock(port);
        }
 
@@ -1899,28 +1921,21 @@ convert_thread_inspect_to_port(thread_inspect_t thread)
  *             A name of MACH_PORT_NULL is valid for the null thread.
  *     Conditions:
  *             Nothing locked.
- *
- *     TODO: Could this be faster if it were ipc_port_translate_send based, like thread_switch?
- *           We could avoid extra lock/unlock and extra ref operations on the port.
  */
 thread_t
 port_name_to_thread(
-       mach_port_name_t        name)
+       mach_port_name_t         name,
+       port_to_thread_options_t options)
 {
        thread_t        thread = THREAD_NULL;
        ipc_port_t      kport;
+       kern_return_t kr;
 
        if (MACH_PORT_VALID(name)) {
-               if (ipc_object_copyin(current_space(), name,
-                   MACH_MSG_TYPE_COPY_SEND,
-                   (ipc_object_t *)&kport) != KERN_SUCCESS) {
-                       return THREAD_NULL;
-               }
-
-               thread = convert_port_to_thread(kport);
-
-               if (IP_VALID(kport)) {
-                       ipc_port_release_send(kport);
+               kr = ipc_port_translate_send(current_space(), name, &kport);
+               if (kr == KERN_SUCCESS) {
+                       thread = convert_port_to_thread_locked(kport, options);
+                       ip_unlock(kport);
                }
        }
 
@@ -1931,22 +1946,15 @@ task_t
 port_name_to_task(
        mach_port_name_t name)
 {
-       ipc_port_t kern_port;
+       ipc_port_t kport;
        kern_return_t kr;
        task_t task = TASK_NULL;
 
        if (MACH_PORT_VALID(name)) {
-               kr = ipc_object_copyin(current_space(), name,
-                   MACH_MSG_TYPE_COPY_SEND,
-                   (ipc_object_t *) &kern_port);
-               if (kr != KERN_SUCCESS) {
-                       return TASK_NULL;
-               }
-
-               task = convert_port_to_task(kern_port);
-
-               if (IP_VALID(kern_port)) {
-                       ipc_port_release_send(kern_port);
+               kr = ipc_port_translate_send(current_space(), name, &kport);
+               if (kr == KERN_SUCCESS) {
+                       task = convert_port_to_task_locked(kport, NULL);
+                       ip_unlock(kport);
                }
        }
        return task;
@@ -1956,22 +1964,15 @@ task_inspect_t
 port_name_to_task_inspect(
        mach_port_name_t name)
 {
-       ipc_port_t kern_port;
+       ipc_port_t kport;
        kern_return_t kr;
        task_inspect_t ti = TASK_INSPECT_NULL;
 
        if (MACH_PORT_VALID(name)) {
-               kr = ipc_object_copyin(current_space(), name,
-                   MACH_MSG_TYPE_COPY_SEND,
-                   (ipc_object_t *)&kern_port);
-               if (kr != KERN_SUCCESS) {
-                       return TASK_NULL;
-               }
-
-               ti = convert_port_to_task_inspect(kern_port);
-
-               if (IP_VALID(kern_port)) {
-                       ipc_port_release_send(kern_port);
+               kr = ipc_port_translate_send(current_space(), name, &kport);
+               if (kr == KERN_SUCCESS) {
+                       ti = convert_port_to_task_inspect_locked(kport);
+                       ip_unlock(kport);
                }
        }
        return ti;
@@ -2070,12 +2071,8 @@ convert_task_suspension_token_to_port(
        task_lock(task);
        if (task->active) {
                if (task->itk_resume == IP_NULL) {
-                       task->itk_resume = ipc_port_alloc_kernel();
-                       if (!IP_VALID(task->itk_resume)) {
-                               panic("failed to create resume port");
-                       }
-
-                       ipc_kobject_set(task->itk_resume, (ipc_kobject_t) task, IKOT_TASK_RESUME);
+                       task->itk_resume = ipc_kobject_alloc_port((ipc_kobject_t) task,
+                           IKOT_TASK_RESUME, IPC_KOBJECT_ALLOC_NONE);
                }
 
                /*
@@ -2232,7 +2229,7 @@ thread_set_exception_ports(
        }
 
        if (IP_VALID(new_port)) {
-               switch (new_behavior & ~MACH_EXCEPTION_CODES) {
+               switch (new_behavior & ~MACH_EXCEPTION_MASK) {
                case EXCEPTION_DEFAULT:
                case EXCEPTION_STATE:
                case EXCEPTION_STATE_IDENTITY:
@@ -2327,7 +2324,7 @@ task_set_exception_ports(
        }
 
        if (IP_VALID(new_port)) {
-               switch (new_behavior & ~MACH_EXCEPTION_CODES) {
+               switch (new_behavior & ~MACH_EXCEPTION_MASK) {
                case EXCEPTION_DEFAULT:
                case EXCEPTION_STATE:
                case EXCEPTION_STATE_IDENTITY:
@@ -2452,7 +2449,7 @@ thread_swap_exception_ports(
        }
 
        if (IP_VALID(new_port)) {
-               switch (new_behavior & ~MACH_EXCEPTION_CODES) {
+               switch (new_behavior & ~MACH_EXCEPTION_MASK) {
                case EXCEPTION_DEFAULT:
                case EXCEPTION_STATE:
                case EXCEPTION_STATE_IDENTITY:
@@ -2573,7 +2570,7 @@ task_swap_exception_ports(
        }
 
        if (IP_VALID(new_port)) {
-               switch (new_behavior & ~MACH_EXCEPTION_CODES) {
+               switch (new_behavior & ~MACH_EXCEPTION_MASK) {
                case EXCEPTION_DEFAULT:
                case EXCEPTION_STATE:
                case EXCEPTION_STATE_IDENTITY:
index ce6d746e320f5214861774272f47cac7f65cdcc3..5ad86b9992dd14eb7991d169f49f187ef0d21f22 100644 (file)
@@ -175,8 +175,15 @@ extern thread_t convert_port_to_thread(
 extern thread_inspect_t convert_port_to_thread_inspect(
        ipc_port_t              port);
 
+__options_decl(port_to_thread_options_t, uint32_t, {
+       PORT_TO_THREAD_NONE               = 0x0000,
+       PORT_TO_THREAD_IN_CURRENT_TASK    = 0x0001,
+       PORT_TO_THREAD_NOT_CURRENT_THREAD = 0x0002,
+});
+
 extern thread_t port_name_to_thread(
-       mach_port_name_t        port_name);
+       mach_port_name_t            port_name,
+       port_to_thread_options_t    options);
 
 /* Deallocate a space ref produced by convert_port_to_space */
 extern void space_deallocate(
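The new options are single-bit filters applied inside convert_port_to_thread_locked(); a hypothetical caller that only accepts another thread of its own task would combine them:

	/* reject threads of foreign tasks and the calling thread itself */
	thread = port_name_to_thread(name,
	    PORT_TO_THREAD_IN_CURRENT_TASK | PORT_TO_THREAD_NOT_CURRENT_THREAD);

	/* PORT_TO_THREAD_NONE preserves the old, unfiltered behavior */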
index 63b8aaffb88f45ba8b30c8a47046644eb1bf531a..ffe8d7658ae754edfaa683d4330d3d5151a98fd7 100644 (file)
@@ -96,11 +96,11 @@ vm_size_t kalloc_kernmap_size;  /* size of kallocs that can come from kernel map
 /* how many times we couldn't allocate out of kalloc_map and fell back to kernel_map */
 unsigned long kalloc_fallback_count;
 
-unsigned int kalloc_large_inuse;
-vm_size_t    kalloc_large_total;
-vm_size_t    kalloc_large_max;
-vm_size_t    kalloc_largest_allocated = 0;
-uint64_t    kalloc_large_sum;
+uint_t     kalloc_large_inuse;
+vm_size_t  kalloc_large_total;
+vm_size_t  kalloc_large_max;
+vm_size_t  kalloc_largest_allocated = 0;
+uint64_t   kalloc_large_sum;
 
 int     kalloc_fake_zone_index = -1; /* index of our fake zone in statistics arrays */
 
@@ -191,8 +191,9 @@ KALLOC_ZINFO_SFREE(vm_size_t bytes)
  * 4096       Y                    N                   N
  * 6144       N                    N                   N
  * 8192       Y                    N                   N
+ * 12288      N                    X                   X
  * 16384      N                    N                   N
- * 32768      N                    N                   N
+ * 32768      X                    N                   N
  *
  */
 static const struct kalloc_zone_config {
@@ -300,8 +301,8 @@ static const struct kalloc_zone_config {
        KZC_ENTRY(4096, true),
        KZC_ENTRY(6144, false),
        KZC_ENTRY(8192, true),
-       KZC_ENTRY(16384, false),
-       KZC_ENTRY(32768, false),
+       KZC_ENTRY(12288, false),
+       KZC_ENTRY(16384, false)
 
 #endif /* CONFIG_EMBEDDED */
 
@@ -407,13 +408,7 @@ kalloc_init(
        kalloc_map_min = min;
        kalloc_map_max = min + kalloc_map_size - 1;
 
-       /*
-        * Create zones up to a least 4 pages because small page-multiples are
-        * common allocations.  Also ensure that zones up to size 16KB bytes exist.
-        * This is desirable because messages are allocated with kalloc(), and
-        * messages up through size 8192 are common.
-        */
-       kalloc_max = PAGE_SIZE << 2;
+       kalloc_max = (k_zone_config[MAX_K_ZONE - 1].kzc_size << 1);
        if (kalloc_max < KiB(16)) {
                kalloc_max = KiB(16);
        }
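With kalloc_max derived from the zone table instead of hard-coded, the embedded configuration above works out as:

	kalloc_max = k_zone_config[MAX_K_ZONE - 1].kzc_size << 1
	           = 16384 << 1 = 32768    /* 32 KiB, per the embedded table above */

so the KiB(16) floor only takes effect if the largest configured zone ever drops below 8 KiB.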
@@ -674,6 +669,7 @@ vm_size_t
        DTRACE_VM3(kfree, vm_size_t, -1, vm_size_t, size, void*, addr);
 
        kalloc_spin_lock();
+       assert(kalloc_large_total >= size);
        kalloc_large_total -= size;
        kalloc_large_inuse--;
        kalloc_unlock();
@@ -685,9 +681,9 @@ vm_size_t
 
 void *
 kalloc_canblock(
-       vm_size_t              * psize,
-       boolean_t              canblock,
-       vm_allocation_site_t * site)
+       vm_size_t             *psize,
+       boolean_t             canblock,
+       vm_allocation_site_t *site)
 {
        zone_t z;
        vm_size_t size;
@@ -724,6 +720,8 @@ kalloc_canblock(
                /* large allocation - use guard pages instead of small redzones */
                size = round_page(req_size + 2 * PAGE_SIZE);
                assert(size >= MAX_SIZE_ZDLUT && size >= kalloc_max_prerounded);
+#else
+               size = round_page(size);
 #endif
 
                if (size >= kalloc_kernmap_size) {
@@ -760,6 +758,7 @@ kalloc_canblock(
                        }
 
                        kalloc_large_inuse++;
+                       assert(kalloc_large_total + size >= kalloc_large_total); /* no wrap around */
                        kalloc_large_total += size;
                        kalloc_large_sum += size;
 
@@ -775,7 +774,7 @@ kalloc_canblock(
                /* fixup the return address to skip the redzone */
                addr = (void *)kasan_alloc((vm_offset_t)addr, size, req_size, PAGE_SIZE);
 #else
-               *psize = round_page(size);
+               *psize = size;
 #endif
                DTRACE_VM3(kalloc, vm_size_t, size, vm_size_t, *psize, void*, addr);
                return addr;
@@ -863,6 +862,7 @@ void
                kmem_free(alloc_map, (vm_offset_t)data, size);
                kalloc_spin_lock();
 
+               assert(kalloc_large_total >= size);
                kalloc_large_total -= size;
                kalloc_large_inuse--;
 
@@ -949,7 +949,7 @@ OSMalloc_Tagref(
                panic("OSMalloc_Tagref():'%s' has bad state 0x%08X\n", tag->OSMT_name, tag->OSMT_state);
        }
 
-       (void)hw_atomic_add(&tag->OSMT_refcnt, 1);
+       os_atomic_inc(&tag->OSMT_refcnt, relaxed);
 }
 
 void
@@ -960,8 +960,8 @@ OSMalloc_Tagrele(
                panic("OSMalloc_Tagref():'%s' has bad state 0x%08X\n", tag->OSMT_name, tag->OSMT_state);
        }
 
-       if (hw_atomic_sub(&tag->OSMT_refcnt, 1) == 0) {
-               if (hw_compare_and_store(OSMT_VALID | OSMT_RELEASED, OSMT_VALID | OSMT_RELEASED, &tag->OSMT_state)) {
+       if (os_atomic_dec(&tag->OSMT_refcnt, relaxed) == 0) {
+               if (os_atomic_cmpxchg(&tag->OSMT_state, OSMT_VALID | OSMT_RELEASED, OSMT_VALID | OSMT_RELEASED, acq_rel)) {
                        OSMalloc_tag_spin_lock();
                        (void)remque((queue_entry_t)tag);
                        OSMalloc_tag_unlock();
@@ -976,11 +976,11 @@ void
 OSMalloc_Tagfree(
        OSMallocTag            tag)
 {
-       if (!hw_compare_and_store(OSMT_VALID, OSMT_VALID | OSMT_RELEASED, &tag->OSMT_state)) {
+       if (!os_atomic_cmpxchg(&tag->OSMT_state, OSMT_VALID, OSMT_VALID | OSMT_RELEASED, acq_rel)) {
                panic("OSMalloc_Tagfree():'%s' has bad state 0x%08X \n", tag->OSMT_name, tag->OSMT_state);
        }
 
-       if (hw_atomic_sub(&tag->OSMT_refcnt, 1) == 0) {
+       if (os_atomic_dec(&tag->OSMT_refcnt, relaxed) == 0) {
                OSMalloc_tag_spin_lock();
                (void)remque((queue_entry_t)tag);
                OSMalloc_tag_unlock();
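A summary of the atomic-API mapping these hunks apply. Semantics assumed here, consistent with the unchanged "== 0" tests: os_atomic_inc()/os_atomic_dec() return the new value, os_atomic_cmpxchg() returns true on success, and the target pointer moves to the first argument:

	(void)hw_atomic_add(&x, 1);          ->  os_atomic_inc(&x, relaxed);
	hw_atomic_sub(&x, 1) == 0            ->  os_atomic_dec(&x, relaxed) == 0
	hw_compare_and_store(o, n, &x)       ->  os_atomic_cmpxchg(&x, o, n, acq_rel)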
index 85cf4998b3792d85ce01eeec8b4f5cb953144e27..f00a3be8fd027cf46a406a76998602f06ec80dec 100644 (file)
@@ -436,45 +436,47 @@ struct kcdata_type_definition {
  * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes
  * in STACKSHOT_KCTYPE_* types.
  */
-#define STACKSHOT_KCTYPE_IOSTATS 0x901u                   /* io_stats_snapshot */
-#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u          /* struct mem_and_io_snapshot */
-#define STACKSHOT_KCCONTAINER_TASK 0x903u
-#define STACKSHOT_KCCONTAINER_THREAD 0x904u
-#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u             /* task_snapshot_v2 */
-#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u           /* thread_snapshot_v2, thread_snapshot_v3 */
-#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u             /* int[] */
-#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u      /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
-#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u               /* char[] */
-#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au           /* struct stack_snapshot_frame32 */
-#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu         /* struct stack_snapshot_frame64 */
-#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu           /* struct stack_snapshot_frame32 */
-#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du         /* struct stack_snapshot_frame64 */
-#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu                  /* boot args string */
-#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu                 /* os version string */
-#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u            /* kernel page size in uint32_t */
-#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u              /* jetsam level in uint32_t */
-#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u     /* timestamp used for the delta stackshot */
-#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u              /* uint32_t */
-#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u            /* uint64_t */
-#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u              /* uint32_t */
-#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u            /* uint64_t */
-#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u          /* uint64_t */
-#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u         /* uint64_t */
-#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u                 /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */
-#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au        /* struct stackshot_duration */
-#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu     /* struct stackshot_fault_stats */
-#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO  0x91cu     /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
-#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du           /* struct stackshot_thread_waitinfo */
-#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu     /* struct thread_group_snapshot or thread_group_snapshot_v2 */
-#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu              /* uint64_t */
-#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */
-#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u          /* uint64_t */
-#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u     /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */
-#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u             /* struct instrs_cycles_snapshot */
-#define STACKSHOT_KCTYPE_USER_STACKTOP 0x924u             /* struct stack_snapshot_stacktop */
-#define STACKSHOT_KCTYPE_ASID 0x925u                      /* uint32_t */
-#define STACKSHOT_KCTYPE_PAGE_TABLES 0x926u               /* uint64_t */
-#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT 0x927u    /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_IOSTATS                     0x901u /* io_stats_snapshot */
+#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS            0x902u /* struct mem_and_io_snapshot */
+#define STACKSHOT_KCCONTAINER_TASK                   0x903u
+#define STACKSHOT_KCCONTAINER_THREAD                 0x904u
+#define STACKSHOT_KCTYPE_TASK_SNAPSHOT               0x905u /* task_snapshot_v2 */
+#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT             0x906u /* thread_snapshot_v2, thread_snapshot_v3 */
+#define STACKSHOT_KCTYPE_DONATING_PIDS               0x907u /* int[] */
+#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO        0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_THREAD_NAME                 0x909u /* char[] */
+#define STACKSHOT_KCTYPE_KERN_STACKFRAME             0x90Au /* struct stack_snapshot_frame32 */
+#define STACKSHOT_KCTYPE_KERN_STACKFRAME64           0x90Bu /* struct stack_snapshot_frame64 */
+#define STACKSHOT_KCTYPE_USER_STACKFRAME             0x90Cu /* struct stack_snapshot_frame32 */
+#define STACKSHOT_KCTYPE_USER_STACKFRAME64           0x90Du /* struct stack_snapshot_frame64 */
+#define STACKSHOT_KCTYPE_BOOTARGS                    0x90Eu /* boot args string */
+#define STACKSHOT_KCTYPE_OSVERSION                   0x90Fu /* os version string */
+#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE              0x910u /* kernel page size in uint32_t */
+#define STACKSHOT_KCTYPE_JETSAM_LEVEL                0x911u /* jetsam level in uint32_t */
+#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP       0x912u /* timestamp used for the delta stackshot */
+#define STACKSHOT_KCTYPE_KERN_STACKLR                0x913u /* uint32_t */
+#define STACKSHOT_KCTYPE_KERN_STACKLR64              0x914u /* uint64_t */
+#define STACKSHOT_KCTYPE_USER_STACKLR                0x915u /* uint32_t */
+#define STACKSHOT_KCTYPE_USER_STACKLR64              0x916u /* uint64_t */
+#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS            0x917u /* uint64_t */
+#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS           0x918u /* uint64_t */
+#define STACKSHOT_KCTYPE_CPU_TIMES                   0x919u /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */
+#define STACKSHOT_KCTYPE_STACKSHOT_DURATION          0x91au /* struct stackshot_duration */
+#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS       0x91bu /* struct stackshot_fault_stats */
+#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO        0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_THREAD_WAITINFO             0x91du /* struct stackshot_thread_waitinfo */
+#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT       0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */
+#define STACKSHOT_KCTYPE_THREAD_GROUP                0x91fu /* uint64_t */
+#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT   0x920u /* struct jetsam_coalition_snapshot */
+#define STACKSHOT_KCTYPE_JETSAM_COALITION            0x921u /* uint64_t */
+#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION       0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32_t */
+#define STACKSHOT_KCTYPE_INSTRS_CYCLES               0x923u /* struct instrs_cycles_snapshot */
+#define STACKSHOT_KCTYPE_USER_STACKTOP               0x924u /* struct stack_snapshot_stacktop */
+#define STACKSHOT_KCTYPE_ASID                        0x925u /* uint32_t */
+#define STACKSHOT_KCTYPE_PAGE_TABLES                 0x926u /* uint64_t */
+#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT      0x927u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL 0x928u /* dispatch queue label */
+#define STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO        0x929u /* struct stackshot_thread_turnstileinfo */
 
 #define STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT 0x940u   /* task_delta_snapshot_v2 */
 #define STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT 0x941u /* thread_delta_snapshot_v* */
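These tags are what kcdata consumers key on. As a purely illustrative sketch (the demo struct is invented; real parsing goes through libkdd's iterator API), a consumer dispatching on the two tags added above might look like:

#include <stdint.h>
#include <stdio.h>

struct demo_kcdata_item {
	uint32_t    type;	/* one of the STACKSHOT_KCTYPE_* tags */
	uint32_t    size;	/* payload size in bytes */
	const void *data;	/* payload */
};

static void
demo_handle_item(const struct demo_kcdata_item *item)
{
	switch (item->type) {
	case 0x928u:	/* STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL */
		printf("queue label: %.*s\n", (int)item->size,
		    (const char *)item->data);
		break;
	case 0x929u:	/* STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO */
		printf("turnstile info record, %u bytes\n", item->size);
		break;
	default:
		break;	/* unknown tags are skipped, not fatal */
	}
}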
@@ -517,6 +519,7 @@ struct user64_dyld_uuid_info {
 };
 
 enum task_snapshot_flags {
+       /* k{User,Kernel}64_p (values 0x1 and 0x2) are defined in generic_snapshot_flags */
        kTaskRsrcFlagged                      = 0x4, // In the EXC_RESOURCE danger zone?
        kTerminatedSnapshot                   = 0x8,
        kPidSuspended                         = 0x10, // true for suspended task
@@ -546,6 +549,7 @@ enum task_snapshot_flags {
 };
 
 enum thread_snapshot_flags {
+       /* k{User,Kernel}64_p (values 0x1 and 0x2) are defined in generic_snapshot_flags */
        kHasDispatchSerial    = 0x4,
        kStacksPCOnly         = 0x8,    /* Stack traces have no frame pointers. */
        kThreadDarwinBG       = 0x10,   /* Thread is darwinbg */
@@ -814,6 +818,18 @@ typedef struct stackshot_thread_waitinfo {
        uint8_t wait_type;      /* The type of object that the thread is waiting on */
 } __attribute__((packed)) thread_waitinfo_t;
 
+typedef struct stackshot_thread_turnstileinfo {
+       uint64_t waiter;        /* The thread that's waiting on the object */
+       uint64_t turnstile_context; /* Associated data (either thread id, or workq addr) */
+       uint8_t turnstile_priority;
+       uint8_t number_of_hops;
+#define STACKSHOT_TURNSTILE_STATUS_UNKNOWN      (1 << 0) /* The final inheritor is unknown (bug?) */
+#define STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ (1 << 1) /* A waitq was found to be locked */
+#define STACKSHOT_TURNSTILE_STATUS_WORKQUEUE    (1 << 2) /* The final inheritor is a workqueue */
+#define STACKSHOT_TURNSTILE_STATUS_THREAD       (1 << 3) /* The final inheritor is a thread */
+       uint64_t turnstile_flags;
+} __attribute__((packed)) thread_turnstileinfo_t;
+
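A hypothetical userspace consumer of this record — assuming the header defining thread_turnstileinfo_t and the flags above is included — would test the STACKSHOT_TURNSTILE_STATUS_* bits to see what the final inheritor was:

#include <stdint.h>
#include <stdio.h>
/* assumes the kcdata header defining thread_turnstileinfo_t and the
 * STACKSHOT_TURNSTILE_STATUS_* flags above has been included */

static void
demo_print_turnstileinfo(const thread_turnstileinfo_t *tsinfo)
{
	printf("waiter tid %llu, %u hop(s), priority %u\n",
	    (unsigned long long)tsinfo->waiter,
	    tsinfo->number_of_hops, tsinfo->turnstile_priority);

	if (tsinfo->turnstile_flags & STACKSHOT_TURNSTILE_STATUS_THREAD) {
		printf("  final inheritor: thread 0x%llx\n",
		    (unsigned long long)tsinfo->turnstile_context);
	} else if (tsinfo->turnstile_flags & STACKSHOT_TURNSTILE_STATUS_WORKQUEUE) {
		printf("  final inheritor: workqueue at 0x%llx\n",
		    (unsigned long long)tsinfo->turnstile_context);
	} else if (tsinfo->turnstile_flags & STACKSHOT_TURNSTILE_STATUS_UNKNOWN) {
		printf("  final inheritor unknown\n");
	}
}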
 #define STACKSHOT_WAITOWNER_KERNEL         (UINT64_MAX - 1)
 #define STACKSHOT_WAITOWNER_PORT_LOCKED    (UINT64_MAX - 2)
 #define STACKSHOT_WAITOWNER_PSET_LOCKED    (UINT64_MAX - 3)
@@ -895,6 +911,8 @@ struct crashinfo_proc_uniqidentifierinfo {
 #define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE               0x828 /* uint64_t */
 #define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE_COMPRESSED    0x829 /* uint64_t */
 #define TASK_CRASHINFO_LEDGER_WIRED_MEM                         0x82A /* uint64_t */
+#define TASK_CRASHINFO_PROC_PERSONA_ID                          0x82B /* uid_t */
+#define TASK_CRASHINFO_MEMORY_LIMIT_INCREASE                    0x82C /* uint32_t */
 
 
 
@@ -971,7 +989,7 @@ kcdata_iter_unsafe(void *buffer)
        return iter;
 }
 
-static const kcdata_iter_t kcdata_invalid_iter = { .item = 0, .end = 0 };
+static const kcdata_iter_t kcdata_invalid_iter = { .item = NULL, .end = NULL };
 
 static inline
 int
index 05ab16ce91ea2ffe5e16e26e49df6d2e7c8eadf8..8a9c4cd75920fde2c3b40204f73d056ac9b5d10f 100644 (file)
@@ -125,10 +125,13 @@ static int              kdp_stackshot_kcdata_format(int pid, uint32_t trace_flag
 uint32_t                kdp_stack_snapshot_bytes_traced(void);
 static void             kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap);
 static boolean_t        kdp_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, boolean_t try_fault, uint32_t *kdp_fault_result);
+static int              kdp_copyin_string(task_t task, uint64_t addr, char *buf, int buf_sz, boolean_t try_fault, uint32_t *kdp_fault_results);
 static boolean_t        kdp_copyin_word(task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, uint32_t *kdp_fault_results);
 static uint64_t         proc_was_throttled_from_task(task_t task);
 static void             stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_t * waitinfo);
 static int              stackshot_thread_has_valid_waitinfo(thread_t thread);
+static void             stackshot_thread_turnstileinfo(thread_t thread, thread_turnstileinfo_t *tsinfo);
+static int              stackshot_thread_has_valid_turnstileinfo(thread_t thread);
 
 #if CONFIG_COALITIONS
 static void             stackshot_coalition_jetsam_count(void *arg, int i, coalition_t coal);
@@ -148,6 +151,7 @@ static uint64_t         proc_did_throttle_from_task(task_t task);
 extern void             proc_name_kdp(task_t task, char * buf, int size);
 extern int              proc_threadname_kdp(void * uth, char * buf, size_t size);
 extern void             proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t * tv_usec, uint64_t * abstime);
+extern boolean_t        proc_binary_uuid_kdp(task_t task, uuid_t uuid);
 extern int              memorystatus_get_pressure_status_kdp(void);
 extern void             memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit);
 
@@ -238,6 +242,8 @@ SECURITY_READ_ONLY_LATE(static uint32_t) max_tracebuf_size = SANE_TRACEBUF_SIZE;
 #define ROUNDUP(x, y)            ((((x)+(y)-1)/(y))*(y))
 #endif
 
+#define STACKSHOT_QUEUE_LABEL_MAXSIZE  64
+
 /*
  * Initialize the mutex governing access to the stack snapshot subsystem
  * and other stackshot related bits.
@@ -1023,18 +1029,48 @@ kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_fla
                }
        }
 
-       if (task_pid > 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) {
-               uint32_t uuid_info_size       = (uint32_t)(task_64bit_addr ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
-               uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
+       if (save_loadinfo_p && task_pid > 0 && (uuid_info_count < MAX_LOADINFOS)) {
+               uint32_t copied_uuid_count = 0;
+               uint32_t uuid_info_size = (uint32_t)(task_64bit_addr ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
+               uint32_t uuid_info_array_size = 0;
 
-               kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO),
-                   uuid_info_size, uuid_info_count, &out_addr));
+               /* If we found some UUID information, first try to copy it in -- this will only be non-zero if we had a pmap above */
+               if (uuid_info_count > 0) {
+                       uuid_info_array_size = uuid_info_count * uuid_info_size;
 
-               /* Copy in the UUID info array
-                * It may be nonresident, in which case just fix up nloadinfos to 0 in the task_snap
-                */
-               if (have_pmap && !kdp_copyin(task->map, uuid_info_addr, (void *)out_addr, uuid_info_array_size, should_fault, &kdp_fault_results)) {
-                       bzero((void *)out_addr, uuid_info_array_size);
+                       kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO),
+                           uuid_info_size, uuid_info_count, &out_addr));
+
+                       if (!kdp_copyin(task->map, uuid_info_addr, (void *)out_addr, uuid_info_array_size, should_fault, &kdp_fault_results)) {
+                               bzero((void *)out_addr, uuid_info_array_size);
+                       } else {
+                               copied_uuid_count = uuid_info_count;
+                       }
+               }
+
+               uuid_t binary_uuid;
+               if (!copied_uuid_count && proc_binary_uuid_kdp(task, binary_uuid)) {
+                       /* We failed to copy in the UUID information; fall back to the UUID of the main binary recorded in the proc */
+                       if (uuid_info_array_size == 0) {
+                               /* We just need to store one UUID */
+                               uuid_info_array_size = uuid_info_size;
+                               kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO),
+                                   uuid_info_size, 1, &out_addr));
+                       }
+
+                       if (task_64bit_addr) {
+                               struct user64_dyld_uuid_info *uuid_info = (struct user64_dyld_uuid_info *)out_addr;
+                               uint64_t image_load_address = task->mach_header_vm_address;
+
+                               stackshot_memcpy(&uuid_info->imageUUID, binary_uuid, sizeof(uuid_t));
+                               stackshot_memcpy(&uuid_info->imageLoadAddress, &image_load_address, sizeof(image_load_address));
+                       } else {
+                               struct user32_dyld_uuid_info *uuid_info = (struct user32_dyld_uuid_info *)out_addr;
+                               uint32_t image_load_address = (uint32_t) task->mach_header_vm_address;
+
+                               stackshot_memcpy(&uuid_info->imageUUID, binary_uuid, sizeof(uuid_t));
+                               stackshot_memcpy(&uuid_info->imageLoadAddress, &image_load_address, sizeof(image_load_address));
+                       }
                }
        } else if (task_pid == 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) {
                uintptr_t image_load_address;
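The shape of the fallback added above is simple: if the dyld image list cannot be copied in, emit one record for the main binary so the snapshot still identifies the process. A self-contained sketch (the mirror struct is hypothetical; the real ones are user32/user64_dyld_uuid_info):

#include <stdint.h>
#include <string.h>

typedef unsigned char demo_uuid_t[16];

/* hypothetical mirror of user64_dyld_uuid_info */
struct demo_dyld_uuid_info64 {
	uint64_t    imageLoadAddress;
	demo_uuid_t imageUUID;
};

static void
demo_record_main_binary(struct demo_dyld_uuid_info64 *out,
    const demo_uuid_t binary_uuid, uint64_t mach_header_vm_address)
{
	/* one record: the main binary's UUID and its load address */
	memcpy(out->imageUUID, binary_uuid, sizeof(demo_uuid_t));
	out->imageLoadAddress = mach_header_vm_address;
}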
@@ -1197,7 +1233,7 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace
 
 #if __arm__ || __arm64__
        if (collect_asid && have_pmap) {
-               uint32_t asid = task->map->pmap->asid;
+               uint32_t asid = PMAP_VASID(task->map->pmap);
                kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_ASID, sizeof(uint32_t), &out_addr));
                stackshot_memcpy((void*)out_addr, &asid, sizeof(asid));
        }
@@ -1300,7 +1336,7 @@ kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t
 
 #if __arm__ || __arm64__
        if (collect_asid && have_pmap) {
-               uint32_t asid = task->map->pmap->asid;
+               uint32_t asid = PMAP_VASID(task->map->pmap);
                kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_ASID, sizeof(uint32_t), &out_addr));
                stackshot_memcpy((void*)out_addr, &asid, sizeof(asid));
        }
@@ -1375,7 +1411,8 @@ kcdata_record_thread_snapshot(
        cur_thread_snap = (struct thread_snapshot_v4 *)out_addr;
 
        /* Populate the thread snapshot header */
-       cur_thread_snap->ths_thread_id      = thread_tid(thread);
+       cur_thread_snap->ths_ss_flags = 0;
+       cur_thread_snap->ths_thread_id = thread_tid(thread);
        cur_thread_snap->ths_wait_event = VM_KERNEL_UNSLIDE_OR_PERM(thread->wait_event);
        cur_thread_snap->ths_continuation = VM_KERNEL_UNSLIDE(thread->continuation);
        cur_thread_snap->ths_total_syscalls = thread->syscalls_mach + thread->syscalls_unix;
@@ -1400,6 +1437,27 @@ kcdata_record_thread_snapshot(
                                        cur_thread_snap->ths_ss_flags |= kHasDispatchSerial;
                                        cur_thread_snap->ths_dqserialnum = dqserialnum;
                                }
+
+                               /* try copying in the queue label */
+                               uint64_t label_offs = get_task_dispatchqueue_label_offset(task);
+                               if (label_offs) {
+                                       uint64_t dqlabeladdr = dqaddr + label_offs;
+                                       uint64_t actual_dqlabeladdr = 0;
+
+                                       copyin_ok = kdp_copyin_word(task, dqlabeladdr, &actual_dqlabeladdr, FALSE, NULL);
+                                       if (copyin_ok && actual_dqlabeladdr != 0) {
+                                               char label_buf[STACKSHOT_QUEUE_LABEL_MAXSIZE];
+                                               int len;
+
+                                               bzero(label_buf, STACKSHOT_QUEUE_LABEL_MAXSIZE * sizeof(char));
+                                               len = kdp_copyin_string(task, actual_dqlabeladdr, label_buf, STACKSHOT_QUEUE_LABEL_MAXSIZE, FALSE, NULL);
+                                               if (len > 0) {
+                                                       mach_vm_address_t label_addr = 0;
+                                                       kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL, len, &label_addr));
+                                                       stackshot_strlcpy((char*)label_addr, &label_buf[0], len);
+                                               }
+                                       }
+                               }
                        }
                }
        }
@@ -1415,7 +1473,6 @@ kcdata_record_thread_snapshot(
                cur_thread_snap->ths_sys_time = 0;
        }
 
-       cur_thread_snap->ths_ss_flags = 0;
        if (thread->thread_tag & THREAD_TAG_MAINTHREAD) {
                cur_thread_snap->ths_ss_flags |= kThreadMain;
        }
@@ -1658,7 +1715,9 @@ classify_thread(thread_t thread, boolean_t * thread_on_core_p, uint32_t trace_fl
        processor_t last_processor = thread->last_processor;
 
        boolean_t thread_on_core =
-           (last_processor != PROCESSOR_NULL && last_processor->state == PROCESSOR_RUNNING && last_processor->active_thread == thread);
+           (last_processor != PROCESSOR_NULL &&
+           (last_processor->state == PROCESSOR_SHUTDOWN || last_processor->state == PROCESSOR_RUNNING) &&
+           last_processor->active_thread == thread);
 
        *thread_on_core_p = thread_on_core;
 
@@ -1694,6 +1753,7 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task)
        int num_delta_thread_snapshots = 0;
        int num_nonrunnable_threads    = 0;
        int num_waitinfo_threads       = 0;
+       int num_turnstileinfo_threads  = 0;
 
        uint64_t task_start_abstime    = 0;
        boolean_t task_delta_stackshot = FALSE;
@@ -1701,6 +1761,13 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task)
        boolean_t some_thread_ran = FALSE;
        unaligned_u64 *task_snap_ss_flags = NULL;
 
+#if INTERRUPT_MASKED_DEBUG && MONOTONIC
+       uint64_t task_begin_cpu_cycle_count = 0;
+       if (!panic_stackshot) {
+               task_begin_cpu_cycle_count = mt_cur_cpu_cycles();
+       }
+#endif
+
        if ((task == NULL) || !ml_validate_nofault((vm_offset_t)task, sizeof(struct task))) {
                error = KERN_FAILURE;
                goto error_exit;
@@ -1783,8 +1850,14 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task)
                        /* We want to report owner information regardless of whether a thread
                         * has changed since the last delta, whether it's a normal stackshot,
                         * or whether it's nonrunnable */
-                       if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread)) {
-                               num_waitinfo_threads++;
+                       if (save_owner_info) {
+                               if (stackshot_thread_has_valid_waitinfo(thread)) {
+                                       num_waitinfo_threads++;
+                               }
+
+                               if (stackshot_thread_has_valid_turnstileinfo(thread)) {
+                                       num_turnstileinfo_threads++;
+                               }
                        }
                }
 
@@ -1806,8 +1879,10 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task)
                        nonrunnable_tids = (uint64_t *)out_addr;
                }
 
-               thread_waitinfo_t *thread_waitinfo = NULL;
-               int current_waitinfo_index         = 0;
+               thread_waitinfo_t *thread_waitinfo           = NULL;
+               thread_turnstileinfo_t *thread_turnstileinfo = NULL;
+               int current_waitinfo_index              = 0;
+               int current_turnstileinfo_index         = 0;
 
                if (num_waitinfo_threads > 0) {
                        kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_WAITINFO,
@@ -1815,7 +1890,15 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task)
                        thread_waitinfo = (thread_waitinfo_t *)out_addr;
                }
 
-               if (num_delta_thread_snapshots > 0 || num_nonrunnable_threads > 0 || num_waitinfo_threads > 0) {
+               if (num_turnstileinfo_threads > 0) {
+                       /* get space for the turnstile info */
+                       kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO,
+                           sizeof(thread_turnstileinfo_t), num_turnstileinfo_threads, &out_addr));
+                       thread_turnstileinfo = (thread_turnstileinfo_t *)out_addr;
+               }
+
+               if (num_delta_thread_snapshots > 0 || num_nonrunnable_threads > 0 ||
+                   num_waitinfo_threads > 0 || num_turnstileinfo_threads > 0) {
                        queue_iterate(&task->threads, thread, thread_t, task_threads)
                        {
                                if (active_kthreads_only_p && thread->kernel_stack == 0) {
@@ -1823,10 +1906,18 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task)
                                }
 
                                /* If we want owner info, we should capture it regardless of its classification */
-                               if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread)) {
-                                       stackshot_thread_wait_owner_info(
-                                               thread,
-                                               &thread_waitinfo[current_waitinfo_index++]);
+                               if (save_owner_info) {
+                                       if (stackshot_thread_has_valid_waitinfo(thread)) {
+                                               stackshot_thread_wait_owner_info(
+                                                       thread,
+                                                       &thread_waitinfo[current_waitinfo_index++]);
+                                       }
+
+                                       if (stackshot_thread_has_valid_turnstileinfo(thread)) {
+                                               stackshot_thread_turnstileinfo(
+                                                       thread,
+                                                       &thread_turnstileinfo[current_turnstileinfo_index++]);
+                                       }
                                }
 
                                boolean_t thread_on_core;
@@ -1883,6 +1974,13 @@ kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task)
                        kcd_exit_on_error(kcdata_record_shared_cache_info(stackshot_kcdata_p, task, task_snap_ss_flags));
                        kcd_exit_on_error(kcdata_record_uuid_info(stackshot_kcdata_p, task, ctx->trace_flags, have_pmap, task_snap_ss_flags));
                }
+
+#if INTERRUPT_MASKED_DEBUG && MONOTONIC
+               if (!panic_stackshot) {
+                       kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - task_begin_cpu_cycle_count),
+                           "task_cpu_cycle_count"));
+               }
+#endif
                /* mark end of task snapshot data */
                kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK,
                    task_uniqueid));
@@ -1906,6 +2004,14 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac
        uint32_t length_to_copy = 0, tmp32 = 0;
        abs_time = mach_absolute_time();
 
+#if INTERRUPT_MASKED_DEBUG && MONOTONIC
+       uint64_t stackshot_begin_cpu_cycle_count = 0;
+
+       if (!panic_stackshot) {
+               stackshot_begin_cpu_cycle_count = mt_cur_cpu_cycles();
+       }
+#endif
+
        /* process the flags */
        boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
        boolean_t use_fault_path          = ((trace_flags & (STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_ENABLE_BT_FAULTING)) != 0);
@@ -2020,6 +2126,15 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac
 #if CONFIG_COALITIONS
        int num_coalitions = 0;
        struct jetsam_coalition_snapshot *coalitions = NULL;
+
+#if INTERRUPT_MASKED_DEBUG && MONOTONIC
+       uint64_t coalition_begin_cpu_cycle_count = 0;
+
+       if (!panic_stackshot && (trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS)) {
+               coalition_begin_cpu_cycle_count = mt_cur_cpu_cycles();
+       }
+#endif
+
        /* Iterate over coalitions */
        if (trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) {
                if (coalition_iterate_stackshot(stackshot_coalition_jetsam_count, &num_coalitions, COALITION_TYPE_JETSAM) != KERN_SUCCESS) {
@@ -2037,6 +2152,12 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac
                        goto error_exit;
                }
        }
+#if INTERRUPT_MASKED_DEBUG && MONOTONIC
+       if (!panic_stackshot && (coalition_begin_cpu_cycle_count != 0)) {
+               kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - coalition_begin_cpu_cycle_count),
+                   "coalitions_cpu_cycle_count"));
+       }
+#endif
 #else
        trace_flags &= ~(STACKSHOT_SAVE_JETSAM_COALITIONS);
 #endif /* CONFIG_COALITIONS */
@@ -2089,6 +2210,13 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac
 
        kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, trace_flags, "stackshot_out_flags"));
 
+#if INTERRUPT_MASKED_DEBUG && MONOTONIC
+       if (!panic_stackshot) {
+               kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - stackshot_begin_cpu_cycle_count),
+                   "stackshot_total_cpu_cycle_cnt"));
+       }
+#endif
+
        kcd_exit_on_error(kcdata_write_buffer_end(stackshot_kcdata_p));
 
        /*  === END of populating stackshot data === */
@@ -2294,7 +2422,7 @@ boolean_t
 kdp_copyin_word(
        task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, uint32_t *kdp_fault_results)
 {
-       if (task_has_64Bit_data(task)) {
+       if (task_has_64Bit_addr(task)) {
                return kdp_copyin(task->map, addr, result, sizeof(uint64_t), try_fault, kdp_fault_results);
        } else {
                uint32_t buf;
@@ -2304,6 +2432,46 @@ kdp_copyin_word(
        }
 }
 
+int
+kdp_copyin_string(
+       task_t task, uint64_t addr, char *buf, int buf_sz, boolean_t try_fault, uint32_t *kdp_fault_results)
+{
+       int i;
+       uint64_t validated = 0, valid_from;
+       uint64_t phys_src, phys_dest;
+
+       for (i = 0; i < buf_sz; i++) {
+               if (validated == 0) {
+                       valid_from = i;
+                       phys_src = kdp_find_phys(task->map, addr + i, try_fault, kdp_fault_results);
+                       phys_dest = kvtophys((vm_offset_t)&buf[i]);
+                       uint64_t src_rem = PAGE_SIZE - (phys_src & PAGE_MASK);
+                       uint64_t dst_rem = PAGE_SIZE - (phys_dest & PAGE_MASK);
+                       if (phys_src && phys_dest) {
+                               validated = MIN(src_rem, dst_rem);
+                               if (validated) {
+                                       bcopy_phys(phys_src, phys_dest, 1);
+                                       validated--;
+                               } else {
+                                       return 0;
+                               }
+                       } else {
+                               return 0;
+                       }
+               } else {
+                       bcopy_phys(phys_src + (i - valid_from), phys_dest + (i - valid_from), 1);
+                       validated--;
+               }
+
+               if (buf[i] == '\0') {
+                       return i + 1;
+               }
+       }
+
+       /* ran out of space */
+       return -1;
+}
+
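The contract of kdp_copyin_string, stripped of the physical-page bookkeeping, is: copy at most buf_sz bytes, stop at the NUL, return the length including the NUL, 0 if the source faults, and -1 if the buffer fills without finding a terminator. A plain-memory sketch of that contract (names hypothetical):

#include <stddef.h>

static int
demo_copyin_string(const char *src, size_t src_valid, char *buf, int buf_sz)
{
	for (int i = 0; i < buf_sz; i++) {
		if ((size_t)i >= src_valid) {
			return 0;	/* source no longer mapped/validated */
		}
		buf[i] = src[i];
		if (buf[i] == '\0') {
			return i + 1;	/* success: length includes the NUL */
		}
	}
	return -1;	/* ran out of space before the terminator */
}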
 boolean_t
 kdp_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, boolean_t try_fault, uint32_t *kdp_fault_results)
 {
@@ -2416,13 +2584,7 @@ machine_trace_thread_get_kva(vm_offset_t cur_target_addr, vm_map_t map, uint32_t
                if (cur_phys_addr == 0) {
                        return 0;
                }
-#if __x86_64__
-               kern_virt_target_addr = (vm_offset_t) PHYSMAP_PTOV(cur_phys_addr);
-#elif __arm__ || __arm64__
                kern_virt_target_addr = phystokv(cur_phys_addr);
-#else
-#error Oh come on... we should really unify the physical -> kernel virtual interface
-#endif
                prev_target_page = cur_target_page;
                prev_target_kva = (kern_virt_target_addr & ~PAGE_MASK);
                validate_next_addr = FALSE;
@@ -2526,11 +2688,33 @@ stackshot_thread_has_valid_waitinfo(thread_t thread)
        }
 }
 
+/* Determine if a thread has turnstileinfo that stackshot can provide */
+static int
+stackshot_thread_has_valid_turnstileinfo(thread_t thread)
+{
+       struct turnstile *ts = thread_get_waiting_turnstile(thread);
+
+       return stackshot_thread_has_valid_waitinfo(thread) &&
+              ts != TURNSTILE_NULL;
+}
+
+static void
+stackshot_thread_turnstileinfo(thread_t thread, thread_turnstileinfo_t *tsinfo)
+{
+       struct turnstile *ts;
+
+       /* acquire turnstile information and store it in the stackshot */
+       ts = thread_get_waiting_turnstile(thread);
+       tsinfo->waiter = thread_tid(thread);
+       kdp_turnstile_fill_tsinfo(ts, tsinfo);
+}
+
 static void
 stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_t *waitinfo)
 {
-       waitinfo->waiter    = thread_tid(thread);
-       waitinfo->wait_type = thread->block_hint;
+       waitinfo->waiter        = thread_tid(thread);
+       waitinfo->wait_type     = thread->block_hint;
+
        switch (waitinfo->wait_type) {
        case kThreadWaitKernelMutex:
                kdp_lck_mtx_find_owner(thread->waitq, thread->wait_event, waitinfo);
@@ -2564,6 +2748,9 @@ stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_t *waitinfo)
        case kThreadWaitOnProcess:
                kdp_wait4_find_process(thread, thread->wait_event, waitinfo);
                break;
+       case kThreadWaitSleepWithInheritor:
+               kdp_sleep_with_inheritor_find_owner(thread->waitq, thread->wait_event, waitinfo);
+               break;
        default:
                waitinfo->owner = 0;
                waitinfo->context = 0;
index 556f67273695b7a51efc084f2ad88237a06d7dfd..3e9f12902f739951413e6077f92f0efd2bc40f09 100644 (file)
@@ -81,7 +81,7 @@ typedef int wait_result_t;
 #define THREAD_NOT_WAITING      10              /* thread didn't need to wait */
 
 typedef void (*thread_continue_t)(void *, wait_result_t);
-#define THREAD_CONTINUE_NULL    ((thread_continue_t) 0)
+#define THREAD_CONTINUE_NULL    ((thread_continue_t) NULL)
 
 /*
  * Interruptible flag for waits.
index 287a213828be764bec675bbe484cd8514d52ee56..10ff9f8deb000e6304ecef7fca05d5ef1985db1d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -126,7 +126,7 @@ kmod_control(
 {
        NOT_SUPPORTED_KERNEL();
        return KERN_NOT_SUPPORTED;
-};
+}
 
 /********************************************************************/
 kern_return_t
index 7f7bba13942c5ed6b50a1b8305dece66b2fa44e7..a0d9258723e56353e20ea92345d272f593fa152e 100644 (file)
@@ -34,6 +34,7 @@
 #include <kern/kalloc.h>
 #include <kern/task.h>
 #include <kern/thread.h>
+#include <kern/coalition.h>
 
 #include <kern/processor.h>
 #include <kern/machine.h>
@@ -96,6 +97,7 @@ struct entry_template {
 };
 
 lck_grp_t ledger_lck_grp;
+os_refgrp_decl(static, ledger_refgrp, "ledger", NULL);
 
 /*
  * Modifying the reference count, table size, or table contents requires
@@ -206,6 +208,41 @@ ledger_template_create(const char *name)
        return template;
 }
 
+ledger_template_t
+ledger_template_copy(ledger_template_t template, const char *name)
+{
+       struct entry_template * new_entries = NULL;
+       ledger_template_t new_template = ledger_template_create(name);
+
+       if (new_template == NULL) {
+               return new_template;
+       }
+
+       template_lock(template);
+       assert(template->lt_initialized);
+
+       new_entries = (struct entry_template *)
+           kalloc(sizeof(struct entry_template) * template->lt_table_size);
+
+       if (new_entries) {
+               /* Copy the template entries. */
+               bcopy(template->lt_entries, new_entries, sizeof(struct entry_template) * template->lt_table_size);
+               kfree(new_template->lt_entries, sizeof(struct entry_template) * new_template->lt_table_size);
+
+               new_template->lt_entries = new_entries;
+               new_template->lt_table_size = template->lt_table_size;
+               new_template->lt_cnt = template->lt_cnt;
+       } else {
+               /* Tear down the new template; we've failed. :( */
+               ledger_template_dereference(new_template);
+               new_template = NULL;
+       }
+
+       template_unlock(template);
+
+       return new_template;
+}
+
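The copy routine's structure — create a fresh template, snapshot the source's entry table under the source's lock, swap the storage in, and tear everything down on allocation failure — generalizes to any refcounted table. A minimal standalone sketch (all names hypothetical):

#include <stdlib.h>
#include <string.h>

struct demo_table {
	size_t  size;		/* capacity, in entries */
	size_t  cnt;		/* entries in use */
	int    *entries;
};

static struct demo_table *
demo_table_copy(const struct demo_table *src)
{
	struct demo_table *dst = calloc(1, sizeof(*dst));
	if (dst == NULL) {
		return NULL;
	}

	int *entries = malloc(src->size * sizeof(entries[0]));
	if (entries == NULL) {
		free(dst);	/* tear down the new table; we've failed */
		return NULL;
	}

	/* snapshot the source (the kernel does this under template_lock) */
	memcpy(entries, src->entries, src->size * sizeof(entries[0]));
	dst->entries = entries;
	dst->size = src->size;
	dst->cnt = src->cnt;
	return dst;
}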
 void
 ledger_template_dereference(ledger_template_t template)
 {
@@ -214,6 +251,8 @@ ledger_template_dereference(ledger_template_t template)
        template_unlock(template);
 
        if (template->lt_refs == 0) {
+               kfree(template->lt_entries, sizeof(struct entry_template) * template->lt_table_size);
+               lck_mtx_destroy(&template->lt_lock, &ledger_lck_grp);
                kfree(template, sizeof(*template));
        }
 }
@@ -385,7 +424,7 @@ ledger_instantiate(ledger_template_t template, int entry_type)
 
        ledger->l_template = template;
        ledger->l_id = ledger_cnt++;
-       os_ref_init(&ledger->l_refs, NULL);
+       os_ref_init(&ledger->l_refs, &ledger_refgrp);
        ledger->l_size = (int32_t)cnt;
 
        template_lock(template);
@@ -433,35 +472,25 @@ flag_clear(volatile uint32_t *flags, uint32_t bit)
 /*
  * Take a reference on a ledger
  */
-kern_return_t
+void
 ledger_reference(ledger_t ledger)
 {
        if (!LEDGER_VALID(ledger)) {
-               return KERN_INVALID_ARGUMENT;
-       }
-       os_ref_retain(&ledger->l_refs);
-       return KERN_SUCCESS;
-}
-
-int
-ledger_reference_count(ledger_t ledger)
-{
-       if (!LEDGER_VALID(ledger)) {
-               return -1;
+               return;
        }
 
-       return os_ref_get_count(&ledger->l_refs);
+       os_ref_retain(&ledger->l_refs);
 }
 
 /*
  * Remove a reference on a ledger.  If this is the last reference,
  * deallocate the unused ledger.
  */
-kern_return_t
+void
 ledger_dereference(ledger_t ledger)
 {
        if (!LEDGER_VALID(ledger)) {
-               return KERN_INVALID_ARGUMENT;
+               return;
        }
 
        if (os_ref_release(&ledger->l_refs) == 0) {
@@ -471,8 +500,6 @@ ledger_dereference(ledger_t ledger)
                        pmap_ledger_free(ledger);
                }
        }
-
-       return KERN_SUCCESS;
 }
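Dropping the kern_return_t from ledger_reference/ledger_dereference means callers can no longer mishandle a status they never checked anyway; an invalid ledger is simply ignored. A standalone C11 sketch of that shape (hypothetical names; note that a bare refcount needs release-on-drop plus an acquire fence before the free):

#include <stdatomic.h>
#include <stdlib.h>

struct demo_ledger {
	_Atomic int refs;
	/* ... entries ... */
};

static void
demo_ledger_reference(struct demo_ledger *l)
{
	if (l == NULL) {	/* stand-in for !LEDGER_VALID(ledger) */
		return;
	}
	atomic_fetch_add_explicit(&l->refs, 1, memory_order_relaxed);
}

static void
demo_ledger_dereference(struct demo_ledger *l)
{
	if (l == NULL) {
		return;
	}
	/* release on the drop, acquire before the free */
	if (atomic_fetch_sub_explicit(&l->refs, 1, memory_order_release) == 1) {
		atomic_thread_fence(memory_order_acquire);
		free(l);
	}
}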
 
 /*
@@ -828,7 +855,7 @@ ledger_rollup(ledger_t to_ledger, ledger_t from_ledger)
 {
        int i;
 
-       assert(to_ledger->l_template == from_ledger->l_template);
+       assert(to_ledger->l_template->lt_cnt == from_ledger->l_template->lt_cnt);
 
        for (i = 0; i < to_ledger->l_size; i++) {
                ledger_rollup_entry(to_ledger, from_ledger, i);
@@ -847,7 +874,7 @@ ledger_rollup_entry(ledger_t to_ledger, ledger_t from_ledger, int entry)
 {
        struct ledger_entry *from_le, *to_le;
 
-       assert(to_ledger->l_template == from_ledger->l_template);
+       assert(to_ledger->l_template->lt_cnt == from_ledger->l_template->lt_cnt);
        if (ENTRY_VALID(from_ledger, entry) && ENTRY_VALID(to_ledger, entry)) {
                from_le = &from_ledger->l_entries[entry];
                to_le   =   &to_ledger->l_entries[entry];
@@ -1305,6 +1332,7 @@ ledger_ast(thread_t thread)
 {
        struct ledger   *l = thread->t_ledger;
        struct ledger   *thl;
+       struct ledger   *coalition_ledger;
        uint32_t        block;
        uint64_t        now;
        uint8_t         task_flags;
@@ -1388,6 +1416,11 @@ top:
        }
        block |= ledger_check_needblock(l, now);
 
+       coalition_ledger = coalition_ledger_get_from_task(task);
+       if (LEDGER_VALID(coalition_ledger)) {
+               block |= ledger_check_needblock(coalition_ledger, now);
+       }
+       ledger_dereference(coalition_ledger);
        /*
         * If we are supposed to block on the availability of one or more
         * resources, find the first entry in deficit for which we should wait.
@@ -1453,7 +1486,7 @@ ledger_check_needblock(ledger_t l, uint64_t now)
                if (le->le_flags & LF_REFILL_SCHEDULED) {
                        assert(!(le->le_flags & LF_TRACKING_MAX));
 
-                       if ((le->_le.le_refill.le_last_refill + le->_le.le_refill.le_refill_period) > now) {
+                       if ((le->_le.le_refill.le_last_refill + le->_le.le_refill.le_refill_period) <= now) {
                                ledger_refill(now, l, i);
                                if (limit_exceeded(le) == FALSE) {
                                        continue;
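The comparison flip above is the substantive fix in this hunk: a scheduled refill is due once a full period has elapsed since the last one, i.e. when last_refill + period is <= now, not >. As a predicate (field names abbreviated):

#include <stdbool.h>
#include <stdint.h>

static bool
refill_due(uint64_t le_last_refill, uint64_t le_refill_period, uint64_t now)
{
	/* due once a full refill period has elapsed */
	return le_last_refill + le_refill_period <= now;
}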
index 3e3e6c32349b6e2aa4dc4da037c7a1deac172edf..9be77bb0c282192605e3396fef0b7cbc5b14d2e1 100644 (file)
@@ -96,7 +96,7 @@ struct ledger_entry {
 
 struct ledger {
        uint64_t                l_id;
-       struct os_refcnt        l_refs;
+       os_refcnt_t             l_refs;
        int32_t                 l_size;
        struct ledger_template *l_template;
        struct ledger_entry     l_entries[0] __attribute__((aligned(8)));
@@ -141,6 +141,7 @@ typedef void (*ledger_callback_t)(int warning, const void * param0, const void *
 extern void ledger_init(void);
 
 extern ledger_template_t ledger_template_create(const char *name);
+extern ledger_template_t ledger_template_copy(ledger_template_t template, const char *name);
 extern void ledger_template_dereference(ledger_template_t template);
 extern int ledger_entry_add(ledger_template_t template, const char *key,
     const char *group, const char *units);
@@ -207,9 +208,8 @@ extern kern_return_t ledger_rollup_entry(ledger_t to_ledger, ledger_t from_ledge
 
 extern void ledger_ast(thread_t thread);
 
-extern int ledger_reference_count(ledger_t ledger);
-extern kern_return_t ledger_reference(ledger_t ledger);
-extern kern_return_t ledger_dereference(ledger_t ledger);
+extern void ledger_reference(ledger_t ledger);
+extern void ledger_dereference(ledger_t ledger);
 
 /* Support for ledger() syscall */
 #ifdef LEDGER_DEBUG
index 56472c560c47479acf8c7e4692df0d936bd3f7d3..e677ded1a8e3521e47093e5547e1de4df7b87ae7 100644 (file)
@@ -31,8 +31,7 @@
 #include <kern/queue.h>
 #include <mach/mach_types.h>
 
-#define LCK_GRP_NULL    (lck_grp_t *)0
-
+#define LCK_GRP_NULL    (lck_grp_t *)NULL
 
 typedef unsigned int    lck_type_t;
 
@@ -42,6 +41,7 @@ typedef unsigned int    lck_type_t;
 
 #if XNU_KERNEL_PRIVATE
 
+#include <os/refcnt.h>
 /*
  * Arguments wrapped in LCK_GRP_ARG() will be elided
  * when LOCK_STATS is not set.
@@ -86,7 +86,7 @@ typedef struct _lck_grp_stats_ {
 
 typedef struct _lck_grp_ {
        queue_chain_t           lck_grp_link;
-       uint32_t                lck_grp_refcnt;
+       os_refcnt_t             lck_grp_refcnt;
        uint32_t                lck_grp_spincnt;
        uint32_t                lck_grp_mtxcnt;
        uint32_t                lck_grp_rwcnt;
@@ -99,6 +99,7 @@ typedef struct _lck_grp_ {
 typedef struct _lck_grp_ lck_grp_t;
 #endif /* XNU_KERNEL_PRIVATE */
 
+
 #ifdef  MACH_KERNEL_PRIVATE
 typedef struct _lck_grp_attr_ {
        uint32_t        grp_attr_val;
@@ -113,7 +114,7 @@ extern lck_grp_attr_t  LockDefaultGroupAttr;
 typedef struct __lck_grp_attr__ lck_grp_attr_t;
 #endif /* MACH_KERNEL_PRIVATE */
 
-#define LCK_GRP_ATTR_NULL       (lck_grp_attr_t *)0
+#define LCK_GRP_ATTR_NULL       (lck_grp_attr_t *)NULL
 
 __BEGIN_DECLS
 
@@ -157,7 +158,6 @@ extern  void                    lck_grp_lckcnt_incr(
 extern  void                    lck_grp_lckcnt_decr(
        lck_grp_t               *grp,
        lck_type_t              lck_type);
-
 #endif /* MACH_KERNEL_PRIVATE */
 
 #endif /* _KERN_LOCK_GROUP_H */
index 04106709bb2315ba99b6ef74a3b6e4aab8953cb9..78aee369cc6eb3b890350fec66e17eff3b9d7f5f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -54,7 +54,6 @@
  * the rights to redistribute these changes.
  */
 
-#define ATOMIC_PRIVATE 1
 #define LOCK_PRIVATE 1
 
 #include <mach_ldebug.h>
 #define ALIGN_TEST(p, t) do{}while(0)
 #endif
 
-/* Silence the volatile to _Atomic cast warning */
-#define ATOMIC_CAST(t, p) ((_Atomic t*)(uintptr_t)(p))
-
-/* Enforce program order of loads and stores. */
-#define ordered_load(target, type) \
-               __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed)
-#define ordered_store(target, type, value) \
-               __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed)
-
-#define ordered_load_hw(lock)                   ordered_load(&(lock)->lock_data, uintptr_t)
-#define ordered_store_hw(lock, value)   ordered_store(&(lock)->lock_data, uintptr_t, (value))
-
 #define NOINLINE                __attribute__((noinline))
 
+#define ordered_load_hw(lock)          os_atomic_load(&(lock)->lock_data, compiler_acq_rel)
+#define ordered_store_hw(lock, value)  os_atomic_store(&(lock)->lock_data, (value), compiler_acq_rel)
+
 
 queue_head_t     lck_grp_queue;
 unsigned int     lck_grp_cnt;
 
-decl_lck_mtx_data(, lck_grp_lock)
+decl_lck_mtx_data(, lck_grp_lock);
 static lck_mtx_ext_t lck_grp_lock_ext;
 
 SECURITY_READ_ONLY_LATE(boolean_t) spinlock_timeout_panic = TRUE;
@@ -175,7 +165,7 @@ lck_mod_init(
                LockCompatGroup.lck_grp_attr |= LCK_GRP_ATTR_TIME_STAT;
        }
 
-       LockCompatGroup.lck_grp_refcnt = 1;
+       os_ref_init(&LockCompatGroup.lck_grp_refcnt, NULL);
 
        enqueue_tail(&lck_grp_queue, (queue_entry_t)&LockCompatGroup);
        lck_grp_cnt = 1;
@@ -228,7 +218,7 @@ void
 lck_grp_attr_setstat(
        lck_grp_attr_t  *attr)
 {
-       (void)hw_atomic_or(&attr->grp_attr_val, LCK_GRP_ATTR_STAT);
+       os_atomic_or(&attr->grp_attr_val, LCK_GRP_ATTR_STAT, relaxed);
 }
 
 
@@ -307,7 +297,7 @@ lck_grp_init(lck_grp_t * grp, const char * grp_name, lck_grp_attr_t * attr)
 #endif /* LOCK_STATS */
        }
 
-       grp->lck_grp_refcnt = 1;
+       os_ref_init(&grp->lck_grp_refcnt, NULL);
 
        lck_mtx_lock(&lck_grp_lock);
        enqueue_tail(&lck_grp_queue, (queue_entry_t)grp);
@@ -339,7 +329,7 @@ void
 lck_grp_reference(
        lck_grp_t       *grp)
 {
-       (void)hw_atomic_add(&grp->lck_grp_refcnt, 1);
+       os_ref_retain(&grp->lck_grp_refcnt);
 }
 
 
@@ -351,9 +341,11 @@ void
 lck_grp_deallocate(
        lck_grp_t       *grp)
 {
-       if (hw_atomic_sub(&grp->lck_grp_refcnt, 1) == 0) {
-               kfree(grp, sizeof(lck_grp_t));
+       if (os_ref_release(&grp->lck_grp_refcnt) != 0) {
+               return;
        }
+
+       kfree(grp, sizeof(lck_grp_t));
 }
 
 /*
@@ -381,7 +373,7 @@ lck_grp_lckcnt_incr(
                return panic("lck_grp_lckcnt_incr(): invalid lock type: %d\n", lck_type);
        }
 
-       (void)hw_atomic_add(lckcnt, 1);
+       os_atomic_inc(lckcnt, relaxed);
 }
 
 /*
@@ -411,7 +403,7 @@ lck_grp_lckcnt_decr(
                return;
        }
 
-       updated = (int)hw_atomic_sub(lckcnt, 1);
+       updated = os_atomic_dec(lckcnt, relaxed);
        assert(updated >= 0);
 }
 
@@ -467,7 +459,7 @@ void
 lck_attr_setdebug(
        lck_attr_t      *attr)
 {
-       (void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_DEBUG);
+       os_atomic_or(&attr->lck_attr_val, LCK_ATTR_DEBUG, relaxed);
 }
 
 /*
@@ -477,7 +469,7 @@ void
 lck_attr_cleardebug(
        lck_attr_t      *attr)
 {
-       (void)hw_atomic_and(&attr->lck_attr_val, ~LCK_ATTR_DEBUG);
+       os_atomic_andnot(&attr->lck_attr_val, LCK_ATTR_DEBUG, relaxed);
 }
 
 
@@ -488,7 +480,7 @@ void
 lck_attr_rw_shared_priority(
        lck_attr_t      *attr)
 {
-       (void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_RW_SHARED_PRIORITY);
+       os_atomic_or(&attr->lck_attr_val, LCK_ATTR_RW_SHARED_PRIORITY, relaxed);
 }
 
 
@@ -513,6 +505,31 @@ hw_lock_init(hw_lock_t lock)
        ordered_store_hw(lock, 0);
 }
 
+#if     __SMP__
+static inline bool
+hw_lock_trylock_contended(hw_lock_t lock, uintptr_t newval)
+{
+#if OS_ATOMIC_USE_LLSC
+       uintptr_t oldval;
+       os_atomic_rmw_loop(&lock->lock_data, oldval, newval, acquire, {
+               if (oldval != 0) {
+                       wait_for_event(); // clears the monitor so we don't need give_up()
+                       return false;
+               }
+       });
+       return true;
+#else // !OS_ATOMIC_USE_LLSC
+#if OS_ATOMIC_HAS_LLSC
+       uintptr_t oldval = os_atomic_load_exclusive(&lock->lock_data, relaxed);
+       if (oldval != 0) {
+               wait_for_event(); // clears the monitor so we don't need give_up()
+               return false;
+       }
+#endif // OS_ATOMIC_HAS_LLSC
+       return os_atomic_cmpxchg(&lock->lock_data, 0, newval, acquire);
+#endif // !OS_ATOMIC_USE_LLSC
+}
+
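Without LL/SC and a WFE-style event monitor, the closest portable C11 rendering of this helper is a relaxed peek (cheap while the cache line stays shared) followed by a cmpxchg-acquire; the wait_for_event() parking is what the os_atomic_rmw_loop / load-exclusive branches buy on arm64. A hedged sketch:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool
demo_trylock_contended(_Atomic uintptr_t *lock_data, uintptr_t newval)
{
	uintptr_t oldval = atomic_load_explicit(lock_data, memory_order_relaxed);
	if (oldval != 0) {
		return false;	/* still held; caller keeps spinning */
	}
	/* acquire on success so the critical section can't float above */
	return atomic_compare_exchange_strong_explicit(lock_data, &oldval,
	    newval, memory_order_acquire, memory_order_relaxed);
}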
 /*
  *     Routine: hw_lock_lock_contended
  *
@@ -520,8 +537,6 @@ hw_lock_init(hw_lock_t lock)
  *     timeout is in mach_absolute_time ticks. Called with
  *     preemption disabled.
  */
-
-#if     __SMP__
 static unsigned int NOINLINE
 hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean_t do_panic LCK_GRP_ARG(lck_grp_t *grp))
 {
@@ -551,8 +566,7 @@ hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean
                                continue;
                        }
 #endif
-                       if (atomic_compare_exchange(&lock->lock_data, 0, data,
-                           memory_order_acquire_smp, TRUE)) {
+                       if (hw_lock_trylock_contended(lock, data)) {
 #if CONFIG_DTRACE || LOCK_STATS
                                if (__improbable(stat_enabled)) {
                                        lck_grp_spin_update_spin(lock LCK_GRP_ARG(grp), mach_absolute_time() - begin);
@@ -578,6 +592,42 @@ hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean
 }
 #endif  // __SMP__
 
+void *
+hw_wait_while_equals(void **address, void *current)
+{
+#if     __SMP__
+       void *v;
+       uint64_t end = 0;
+
+       for (;;) {
+               for (int i = 0; i < LOCK_SNOOP_SPINS; i++) {
+                       cpu_pause();
+#if OS_ATOMIC_HAS_LLSC
+                       v = os_atomic_load_exclusive(address, relaxed);
+                       if (__probable(v != current)) {
+                               os_atomic_clear_exclusive();
+                               return v;
+                       }
+                       wait_for_event();
+#else
+                       v = os_atomic_load(address, relaxed);
+                       if (__probable(v != current)) {
+                               return v;
+                       }
+#endif // OS_ATOMIC_HAS_LLSC
+               }
+               if (end == 0) {
+                       end = ml_get_timebase() + LOCK_PANIC_TIMEOUT;
+               } else if (ml_get_timebase() >= end) {
+                       panic("Wait while equals timeout @ *%p == %p", address, v);
+               }
+       }
+#else // !__SMP__
+       panic("Value at %p is %p", address, current);
+       __builtin_unreachable();
+#endif // !__SMP__
+}
+
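Callers presumably look something like the fragment below (node and TRANSIENT_MARKER are hypothetical): block until the observed pointer moves off a known transient value, with the arm64 path parked in WFE rather than burning the whole snoop loop.

	/* returns only once *address no longer equals TRANSIENT_MARKER */
	void *next = hw_wait_while_equals((void **)&node->next, TRANSIENT_MARKER);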
 static inline void
 hw_lock_lock_internal(hw_lock_t lock, thread_t thread LCK_GRP_ARG(lck_grp_t *grp))
 {
@@ -585,14 +635,12 @@ hw_lock_lock_internal(hw_lock_t lock, thread_t thread LCK_GRP_ARG(lck_grp_t *grp
 
        state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
 #if     __SMP__
-
 #if     LOCK_PRETEST
        if (ordered_load_hw(lock)) {
                goto contended;
        }
 #endif  // LOCK_PRETEST
-       if (atomic_compare_exchange(&lock->lock_data, 0, state,
-           memory_order_acquire_smp, TRUE)) {
+       if (hw_lock_trylock_contended(lock, state)) {
                goto end;
        }
 #if     LOCK_PRETEST
@@ -659,14 +707,12 @@ int
        disable_preemption_for_thread(thread);
        state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
 #if     __SMP__
-
 #if     LOCK_PRETEST
        if (ordered_load_hw(lock)) {
                goto contended;
        }
 #endif  // LOCK_PRETEST
-       if (atomic_compare_exchange(&lock->lock_data, 0, state,
-           memory_order_acquire_smp, TRUE)) {
+       if (hw_lock_trylock_contended(lock, state)) {
                success = 1;
                goto end;
        }
@@ -704,8 +750,8 @@ hw_lock_try_internal(hw_lock_t lock, thread_t thread LCK_GRP_ARG(lck_grp_t *grp)
                goto failed;
        }
 #endif  // LOCK_PRETEST
-       success = atomic_compare_exchange(&lock->lock_data, 0, LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK,
-           memory_order_acquire_smp, FALSE);
+       success = os_atomic_cmpxchg(&lock->lock_data, 0,
+           LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK, acquire);
 #else
        if (lock->lock_data == 0) {
                lock->lock_data = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
@@ -754,7 +800,7 @@ int
 static inline void
 hw_lock_unlock_internal(hw_lock_t lock)
 {
-       __c11_atomic_store((_Atomic uintptr_t *)&lock->lock_data, 0, memory_order_release_smp);
+       os_atomic_store(&lock->lock_data, 0, release);
 #if __arm__ || __arm64__
        // ARM tests are only for open-source exclusion
        set_event();
@@ -790,6 +836,198 @@ hw_lock_held(hw_lock_t lock)
        return ordered_load_hw(lock) != 0;
 }
 
+#if     __SMP__
+static unsigned int
+hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp));
+#endif
+
+static inline unsigned int
+hw_lock_bit_to_internal(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp))
+{
+       unsigned int success = 0;
+       uint32_t        mask = (1 << bit);
+#if     !__SMP__
+       uint32_t        state;
+#endif
+
+#if     __SMP__
+       if (__improbable(!hw_atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE))) {
+               success = hw_lock_bit_to_contended(lock, mask, timeout LCK_GRP_ARG(grp));
+       } else {
+               success = 1;
+       }
+#else   // __SMP__
+       (void)timeout;
+       state = ordered_load_bit(lock);
+       if (!(mask & state)) {
+               ordered_store_bit(lock, state | mask);
+               success = 1;
+       }
+#endif  // __SMP__
+
+       if (success) {
+               lck_grp_spin_update_held(lock LCK_GRP_ARG(grp));
+       }
+
+       return success;
+}
+
+unsigned
+int
+(hw_lock_bit_to)(hw_lock_bit_t * lock, unsigned int bit, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp))
+{
+       _disable_preemption();
+       return hw_lock_bit_to_internal(lock, bit, timeout LCK_GRP_ARG(grp));
+}
+
+#if     __SMP__
+static unsigned int NOINLINE
+hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp))
+{
+       uint64_t        end = 0;
+       int             i;
+#if CONFIG_DTRACE || LOCK_STATS
+       uint64_t begin = 0;
+       boolean_t stat_enabled = lck_grp_spin_spin_enabled(lock LCK_GRP_ARG(grp));
+#endif /* CONFIG_DTRACE || LOCK_STATS */
+
+#if LOCK_STATS || CONFIG_DTRACE
+       if (__improbable(stat_enabled)) {
+               begin = mach_absolute_time();
+       }
+#endif /* LOCK_STATS || CONFIG_DTRACE */
+       for (;;) {
+               for (i = 0; i < LOCK_SNOOP_SPINS; i++) {
+                       // Always load-exclusive before wfe
+                       // This grabs the monitor and wakes up on a release event
+                       if (hw_atomic_test_and_set32(lock, mask, mask, memory_order_acquire, TRUE)) {
+                               goto end;
+                       }
+               }
+               if (end == 0) {
+                       end = ml_get_timebase() + timeout;
+               } else if (ml_get_timebase() >= end) {
+                       break;
+               }
+       }
+       return 0;
+end:
+#if CONFIG_DTRACE || LOCK_STATS
+       if (__improbable(stat_enabled)) {
+               lck_grp_spin_update_spin(lock LCK_GRP_ARG(grp), mach_absolute_time() - begin);
+       }
+       lck_grp_spin_update_miss(lock LCK_GRP_ARG(grp));
+#endif /* CONFIG_DTRACE || LOCK_STATS */
+
+       return 1;
+}
+#endif  // __SMP__
+
+void
+(hw_lock_bit)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp))
+{
+       if (hw_lock_bit_to(lock, bit, LOCK_PANIC_TIMEOUT, LCK_GRP_PROBEARG(grp))) {
+               return;
+       }
+#if     __SMP__
+       panic("hw_lock_bit(): timed out (%p)", lock);
+#else
+       panic("hw_lock_bit(): interlock held (%p)", lock);
+#endif
+}
+
+void
+(hw_lock_bit_nopreempt)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp))
+{
+       if (__improbable(get_preemption_level() == 0)) {
+               panic("Attempt to take no-preempt bitlock %p in preemptible context", lock);
+       }
+       if (hw_lock_bit_to_internal(lock, bit, LOCK_PANIC_TIMEOUT LCK_GRP_ARG(grp))) {
+               return;
+       }
+#if     __SMP__
+       panic("hw_lock_bit_nopreempt(): timed out (%p)", lock);
+#else
+       panic("hw_lock_bit_nopreempt(): interlock held (%p)", lock);
+#endif
+}
+
+unsigned
+int
+(hw_lock_bit_try)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp))
+{
+       uint32_t        mask = (1 << bit);
+#if     !__SMP__
+       uint32_t        state;
+#endif
+       boolean_t       success = FALSE;
+
+       _disable_preemption();
+#if     __SMP__
+       // TODO: consider weak (non-looping) atomic test-and-set
+       success = hw_atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE);
+#else
+       state = ordered_load_bit(lock);
+       if (!(mask & state)) {
+               ordered_store_bit(lock, state | mask);
+               success = TRUE;
+       }
+#endif  // __SMP__
+       if (!success) {
+               _enable_preemption();
+       }
+
+       if (success) {
+               lck_grp_spin_update_held(lock LCK_GRP_ARG(grp));
+       }
+
+       return success;
+}
+
+static inline void
+hw_unlock_bit_internal(hw_lock_bit_t *lock, unsigned int bit)
+{
+       uint32_t        mask = (1 << bit);
+#if     !__SMP__
+       uint32_t        state;
+#endif
+
+#if     __SMP__
+       os_atomic_andnot(lock, mask, release);
+#if __arm__
+       set_event();
+#endif
+#else   // __SMP__
+       state = ordered_load_bit(lock);
+       ordered_store_bit(lock, state & ~mask);
+#endif  // __SMP__
+#if CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, bit);
+#endif
+}
+
+/*
+ *     Routine:        hw_unlock_bit
+ *
+ *             Release spin-lock. The second parameter is the bit number to clear.
+ *             Decrement the preemption level.
+ */
+void
+hw_unlock_bit(hw_lock_bit_t * lock, unsigned int bit)
+{
+       hw_unlock_bit_internal(lock, bit);
+       _enable_preemption();
+}
+
+void
+hw_unlock_bit_nopreempt(hw_lock_bit_t * lock, unsigned int bit)
+{
+       if (__improbable(get_preemption_level() == 0)) {
+               panic("Attempt to release no-preempt bitlock %p in preemptible context", lock);
+       }
+       hw_unlock_bit_internal(lock, bit);
+}
+
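In C11 terms the bit-lock paths above reduce to a fetch_or-acquire to take the bit and a fetch_and-release to drop it; the kernel's hw_atomic_test_and_set32 additionally uses load-exclusive/WFE so a contended attempt can avoid the store and park the core. A hedged standalone sketch:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool
demo_bit_lock_try(_Atomic uint32_t *lock, unsigned int bit)
{
	uint32_t mask = 1u << bit;

	/* the lock is ours iff the bit was previously clear */
	uint32_t prev = atomic_fetch_or_explicit(lock, mask, memory_order_acquire);
	return (prev & mask) == 0;
}

static void
demo_bit_unlock(_Atomic uint32_t *lock, unsigned int bit)
{
	/* clear only our bit, leaving neighbouring bits intact */
	atomic_fetch_and_explicit(lock, ~(1u << bit), memory_order_release);
}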
 /*
  * Routine:    lck_spin_sleep
  */
@@ -983,37 +1221,9 @@ lck_mtx_sleep_deadline(
  * The lock owner is always promoted to the max priority of all its waiters.
  * Max priority is capped at MAXPRI_PROMOTE.
  *
- * lck_mtx_pri being set implies that the lock owner is promoted to at least lck_mtx_pri
- *      This prevents the thread from dropping in priority while holding a mutex
- *      (note: Intel locks currently don't do this, to avoid thread lock churn)
- *
- * thread->promotions has a +1 for every mutex currently promoting the thread
- * and 1 for was_promoted_on_wakeup being set.
- * TH_SFLAG_PROMOTED is set on a thread whenever it has any promotions
- * from any mutex (i.e. thread->promotions != 0)
- *
- * was_promoted_on_wakeup is set on a thread which is woken up by a mutex when
- * it raises the priority of the woken thread to match lck_mtx_pri.
- * It can be set for multiple iterations of wait, fail to acquire, re-wait, etc
- * was_promoted_on_wakeup being set always implies a +1 promotions count.
- *
  * The last waiter is not given a promotion when it wakes up or acquires the lock.
  * When the last waiter is waking up, a new contender can always come in and
  * steal the lock without having to wait for the last waiter to make forward progress.
- *
- * lck_mtx_waiters has a +1 for every waiter currently between wait and acquire
- * This prevents us from asserting that every wakeup wakes up a thread.
- * This also causes excess thread_wakeup calls in the unlock path.
- * It can only be fooled into thinking there are more waiters than are
- * actually blocked, not less.
- * It does allows us to reduce the complexity of the lock state.
- *
- * This also means that a starved bg thread as the last waiter could end up
- * keeping the lock in the contended state for a long period of time, which
- * may keep lck_mtx_pri artificially high for a very long time even though
- * it is not participating or blocking anyone else.
- * Intel locks don't have this problem because they can go uncontended
- * as soon as there are no blocked threads involved.
  */
 
 /*
@@ -1034,9 +1244,10 @@ lck_mtx_sleep_deadline(
 void
 lck_mtx_lock_wait(
        lck_mtx_t                       *lck,
-       thread_t                        holder)
+       thread_t                        holder,
+       struct turnstile                **ts)
 {
-       thread_t                self = current_thread();
+       thread_t                thread = current_thread();
        lck_mtx_t               *mutex;
        __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
 
@@ -1057,64 +1268,27 @@ lck_mtx_lock_wait(
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
            trace_lck, (uintptr_t)thread_tid(thread), 0, 0, 0);
 
-       spl_t s = splsched();
-       thread_lock(holder);
-
-       assert_promotions_invariant(holder);
-
-       if ((holder->sched_flags & TH_SFLAG_DEPRESS) == 0) {
-               assert(holder->sched_pri >= mutex->lck_mtx_pri);
-       }
-
-       integer_t priority = self->sched_pri;
-       priority = MAX(priority, self->base_pri);
-       priority = MAX(priority, BASEPRI_DEFAULT);
-       priority = MIN(priority, MAXPRI_PROMOTE);
-
-       if (mutex->lck_mtx_pri == 0) {
-               /* This is the first promotion for this mutex */
-               if (holder->promotions++ == 0) {
-                       /* This is the first promotion for holder */
-                       sched_thread_promote_to_pri(holder, priority, trace_lck);
-               } else {
-                       /* Holder was previously promoted due to a different mutex, raise to match this one */
-                       sched_thread_update_promotion_to_pri(holder, priority, trace_lck);
-               }
-       } else {
-               /* Holder was previously promoted due to this mutex, check if the pri needs to go up */
-               sched_thread_update_promotion_to_pri(holder, priority, trace_lck);
-       }
-
-       assert(holder->promotions > 0);
-       assert(holder->promotion_priority >= priority);
-
-       if ((holder->sched_flags & TH_SFLAG_DEPRESS) == 0) {
-               assert(holder->sched_pri >= mutex->lck_mtx_pri);
-       }
-
-       assert_promotions_invariant(holder);
-
-       thread_unlock(holder);
-       splx(s);
+       assert(thread->waiting_for_mutex == NULL);
+       thread->waiting_for_mutex = mutex;
+       mutex->lck_mtx_waiters++;
 
-       if (mutex->lck_mtx_pri < priority) {
-               mutex->lck_mtx_pri = priority;
+       if (*ts == NULL) {
+               *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
        }
 
-       if (self->waiting_for_mutex == NULL) {
-               self->waiting_for_mutex = mutex;
-               mutex->lck_mtx_waiters++;
-       }
+       struct turnstile *turnstile = *ts;
+       thread_set_pending_block_hint(thread, kThreadWaitKernelMutex);
+       turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
 
-       assert(self->waiting_for_mutex == mutex);
+       waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
 
-       thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
-       assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
        lck_mtx_ilk_unlock(mutex);
 
+       turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
+
        thread_block(THREAD_CONTINUE_NULL);
 
-       assert(mutex->lck_mtx_waiters > 0);
+       thread->waiting_for_mutex = NULL;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
 #if     CONFIG_DTRACE
@@ -1146,11 +1320,11 @@ lck_mtx_lock_wait(
  */
 int
 lck_mtx_lock_acquire(
-       lck_mtx_t               *lck)
+       lck_mtx_t               *lck,
+       struct turnstile        *ts)
 {
        thread_t                thread = current_thread();
        lck_mtx_t               *mutex;
-       integer_t               priority;
 
        if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
                mutex = lck;
@@ -1158,79 +1332,19 @@ lck_mtx_lock_acquire(
                mutex = &lck->lck_mtx_ptr->lck_mtx;
        }
 
-       /*
-        * If waiting_for_mutex is set, then this thread was previously blocked waiting on this lock
-        * If it's un-set, then this thread stole the lock from another waiter.
-        */
-       if (thread->waiting_for_mutex == mutex) {
-               assert(mutex->lck_mtx_waiters > 0);
-
-               thread->waiting_for_mutex = NULL;
-               mutex->lck_mtx_waiters--;
-       }
-
        assert(thread->waiting_for_mutex == NULL);
 
        if (mutex->lck_mtx_waiters > 0) {
-               priority = mutex->lck_mtx_pri;
-       } else {
-               /* I was the last waiter, so the mutex is no longer promoted or contended */
-               mutex->lck_mtx_pri = 0;
-               priority = 0;
-       }
-
-       if (priority || thread->was_promoted_on_wakeup) {
-               __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
-
-               /*
-                * Note: was_promoted_on_wakeup can happen for multiple wakeups in a row without
-                * an intervening acquire if a thread keeps failing to acquire the lock
-                *
-                * If priority is true but not promoted on wakeup,
-                * then this is a lock steal of a promoted mutex, so it needs a ++ of promotions.
-                *
-                * If promoted on wakeup is true, but priority is not,
-                * then this is the last owner, and the last owner does not need a promotion.
-                */
-
-               spl_t s = splsched();
-               thread_lock(thread);
-
-               assert_promotions_invariant(thread);
-
-               if (thread->was_promoted_on_wakeup) {
-                       assert(thread->promotions > 0);
+               if (ts == NULL) {
+                       ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
                }
 
-               if (priority) {
-                       if (thread->promotions++ == 0) {
-                               /* This is the first promotion for holder */
-                               sched_thread_promote_to_pri(thread, priority, trace_lck);
-                       } else {
-                               /*
-                                * Holder was previously promoted due to a different mutex, raise to match this one
-                                * Or, this thread was promoted on wakeup but someone else later contended on mutex
-                                * at higher priority before we got here
-                                */
-                               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
-                       }
-               }
-
-               if (thread->was_promoted_on_wakeup) {
-                       thread->was_promoted_on_wakeup = 0;
-                       if (--thread->promotions == 0) {
-                               sched_thread_unpromote(thread, trace_lck);
-                       }
-               }
-
-               assert_promotions_invariant(thread);
-
-               if (priority && (thread->sched_flags & TH_SFLAG_DEPRESS) == 0) {
-                       assert(thread->sched_pri >= priority);
-               }
+               turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+       }
 
-               thread_unlock(thread);
-               splx(s);
+       if (ts != NULL) {
+               turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
        }
 
        return mutex->lck_mtx_waiters;
@@ -1243,11 +1357,10 @@ lck_mtx_lock_acquire(
  *
  * Called with the interlock locked.
  *
- * TODO: the 'waiters' flag does not indicate waiters exist on the waitqueue,
- * it indicates waiters exist between wait and acquire.
- * This means that here we may do extra unneeded wakeups.
+ * NOTE: callers should call turnstile_cleanup after
+ * dropping the interlock.
  */
-void
+boolean_t
 lck_mtx_unlock_wakeup(
        lck_mtx_t                       *lck,
        thread_t                        holder)
@@ -1255,6 +1368,8 @@ lck_mtx_unlock_wakeup(
        thread_t                thread = current_thread();
        lck_mtx_t               *mutex;
        __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
+       struct turnstile *ts;
+       kern_return_t did_wake;
 
        if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
                mutex = lck;
@@ -1270,88 +1385,29 @@ lck_mtx_unlock_wakeup(
            trace_lck, (uintptr_t)thread_tid(thread), 0, 0, 0);
 
        assert(mutex->lck_mtx_waiters > 0);
-       assert(thread->was_promoted_on_wakeup == 0);
        assert(thread->waiting_for_mutex == NULL);
 
-       /*
-        * The waiters count does not precisely match the number of threads on the waitqueue,
-        * therefore we cannot assert that we actually wake up a thread here
-        */
+       ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
+
        if (mutex->lck_mtx_waiters > 1) {
-               thread_wakeup_one_with_pri(LCK_MTX_EVENT(lck), lck->lck_mtx_pri);
+               /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the woken-up thread */
+               did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
        } else {
-               thread_wakeup_one(LCK_MTX_EVENT(lck));
+               did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
+               turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
        }
+       assert(did_wake == KERN_SUCCESS);
 
-       /* When mutex->lck_mtx_pri is set, it means means I as the owner have a promotion. */
-       if (mutex->lck_mtx_pri) {
-               spl_t s = splsched();
-               thread_lock(thread);
-
-               assert(thread->promotions > 0);
-
-               assert_promotions_invariant(thread);
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+       turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
 
-               if (--thread->promotions == 0) {
-                       sched_thread_unpromote(thread, trace_lck);
-               }
-
-               assert_promotions_invariant(thread);
-
-               thread_unlock(thread);
-               splx(s);
-       }
+       mutex->lck_mtx_waiters--;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
-}
-
-/*
- * Callout from the waitqueue code from inside thread_wakeup_one_with_pri
- * At splsched, thread is pulled from waitq, still locked, not on runqueue yet
- *
- * We always make sure to set the promotion flag, even if the thread is already at this priority,
- * so that it doesn't go down.
- */
-void
-lck_mtx_wakeup_adjust_pri(thread_t thread, integer_t priority)
-{
-       assert(priority <= MAXPRI_PROMOTE);
-       assert(thread->waiting_for_mutex != NULL);
-
-       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(thread->waiting_for_mutex);
-
-       assert_promotions_invariant(thread);
-
-       if (thread->was_promoted_on_wakeup) {
-               /* Thread was previously promoted, but contended again */
-               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
-               return;
-       }
-
-       if (thread->promotions > 0 && priority <= thread->promotion_priority) {
-               /*
-                * Thread is already promoted to the right level, no need to do more
-                * I can draft off of another promotion here, which is OK
-                * because I know the thread will soon run acquire to get its own promotion
-                */
-               assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED);
-               return;
-       }
-
-       thread->was_promoted_on_wakeup = 1;
-
-       if (thread->promotions++ == 0) {
-               /* This is the first promotion for this thread */
-               sched_thread_promote_to_pri(thread, priority, trace_lck);
-       } else {
-               /* Holder was previously promoted due to a different mutex, raise to match this one */
-               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
-       }
 
-       assert_promotions_invariant(thread);
+       return mutex->lck_mtx_waiters > 0;
 }
 
-
 /*
  * Routine:     mutex_pause
  *
@@ -1703,56 +1759,1495 @@ host_lockgroup_info(
 }
 
 /*
- * Atomic primitives, prototyped in kern/simple_lock.h
- * Noret versions are more efficient on some architectures
+ * sleep_with_inheritor and wakeup_with_inheritor KPI
+ *
+ * Functions that allow sleeping on an event while using a turnstile to propagate the priority of the sleeping threads to
+ * the latest thread specified as inheritor.
+ *
+ * Inheritor management is delegated to the caller: the caller needs to store a thread identifier to pass to these functions to specify upon whom
+ * to direct the push. The inheritor cannot run in user space while holding a push from an event, so it is the caller's responsibility to call
+ * wakeup_with_inheritor from the inheritor before it runs in userspace, or to specify another inheritor before letting the old inheritor run in userspace.
+ *
+ * sleep_with_inheritor requires a locking primitive to be held when invoked, but wakeup_with_inheritor and change_sleep_inheritor do not.
+ *
+ * Turnstiles require a non-blocking primitive as interlock to synchronize manipulation of the turnstile data structure, therefore sleep_with_inheritor, change_sleep_inheritor and
+ * wakeup_with_inheritor require the same interlock to manipulate turnstiles.
+ * If sleep_with_inheritor is associated with a locking primitive that can block (like lck_mtx_t or lck_rw_t), a handoff to a non-blocking primitive is required before
+ * invoking any turnstile operation.
+ *
+ * All functions save the turnstile associated with the event in the turnstile kernel hash table and use the turnstile kernel hash table bucket
+ * spinlock as the turnstile interlock. Because we do not want to keep interrupts disabled while holding the bucket interlock, a new turnstile kernel hash table
+ * is instantiated for this KPI to manage the hash without interrupts disabled.
+ * Also:
+ * - all events on the system that hash to the same bucket will contend on the same spinlock.
+ * - every event will have a dedicated wait_queue.
+ *
+ * Different locking primitives can be associated with sleep_with_inheritor as long as the primitive_lock() and primitive_unlock() functions are provided to
+ * sleep_with_inheritor_turnstile to perform the handoff with the bucket spinlock.
  */
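A possible usage sketch of the KPI described above, assuming kernel context. struct obj, its fields, and the busy/owner protocol are hypothetical; lck_mtx_sleep_with_inheritor and wakeup_one_with_inheritor are the functions introduced later in this change.

/* hypothetical example object; not XNU API */
struct obj {
	lck_mtx_t       lock;
	thread_t        owner;          /* current inheritor for the obj event */
	bool            busy;
};

static void
obj_acquire(struct obj *o)
{
	lck_mtx_lock(&o->lock);
	while (o->busy) {
		/* sleep and push our priority onto the current owner */
		lck_mtx_sleep_with_inheritor(&o->lock, LCK_SLEEP_DEFAULT,
		    (event_t)o, o->owner, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
	}
	o->busy = true;
	o->owner = current_thread();
	lck_mtx_unlock(&o->lock);
}

static void
obj_release(struct obj *o)
{
	lck_mtx_lock(&o->lock);
	o->busy = false;
	o->owner = NULL;
	lck_mtx_unlock(&o->lock);
	/* LCK_WAKE_DEFAULT transfers the push to the woken thread */
	wakeup_one_with_inheritor((event_t)o, THREAD_AWAKENED,
	    LCK_WAKE_DEFAULT, NULL);
}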
 
-uint32_t
-hw_atomic_add(volatile uint32_t *dest, uint32_t delt)
+kern_return_t
+wakeup_with_inheritor_and_turnstile_type(event_t event, turnstile_type_t type, wait_result_t result, bool wake_one, lck_wake_action_t action, thread_t *thread_wokenup)
 {
-       ALIGN_TEST(dest, uint32_t);
-       return __c11_atomic_fetch_add(ATOMIC_CAST(uint32_t, dest), delt, memory_order_relaxed) + delt;
-}
+       uint32_t index;
+       struct turnstile *ts = NULL;
+       kern_return_t ret = KERN_NOT_WAITING;
+       int priority;
+       thread_t wokeup;
 
-uint32_t
-hw_atomic_sub(volatile uint32_t *dest, uint32_t delt)
-{
-       ALIGN_TEST(dest, uint32_t);
-       return __c11_atomic_fetch_sub(ATOMIC_CAST(uint32_t, dest), delt, memory_order_relaxed) - delt;
-}
+       /*
+        * the hash bucket spinlock is used as turnstile interlock
+        */
+       turnstile_hash_bucket_lock((uintptr_t)event, &index, type);
 
-uint32_t
-hw_atomic_or(volatile uint32_t *dest, uint32_t mask)
-{
-       ALIGN_TEST(dest, uint32_t);
-       return __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t, dest), mask, memory_order_relaxed) | mask;
-}
+       ts = turnstile_prepare((uintptr_t)event, NULL, TURNSTILE_NULL, type);
 
+       if (wake_one) {
+               if (action == LCK_WAKE_DEFAULT) {
+                       priority = WAITQ_PROMOTE_ON_WAKE;
+               } else {
+                       assert(action == LCK_WAKE_DO_NOT_TRANSFER_PUSH);
+                       priority = WAITQ_ALL_PRIORITIES;
+               }
+
+               /*
+                * WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor
+                * if it finds a thread
+                */
+               wokeup = waitq_wakeup64_identify(&ts->ts_waitq, CAST_EVENT64_T(event), result, priority);
+               if (wokeup != NULL) {
+                       if (thread_wokenup != NULL) {
+                               *thread_wokenup = wokeup;
+                       } else {
+                               thread_deallocate_safe(wokeup);
+                       }
+                       ret = KERN_SUCCESS;
+                       if (action == LCK_WAKE_DO_NOT_TRANSFER_PUSH) {
+                               goto complete;
+                       }
+               } else {
+                       if (thread_wokenup != NULL) {
+                               *thread_wokenup = NULL;
+                       }
+                       turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
+                       ret = KERN_NOT_WAITING;
+               }
+       } else {
+               ret = waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
+               turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
+       }
+
+       /*
+        * turnstile_update_inheritor_complete could be called while holding the interlock.
+        * In this case the new inheritor is either NULL, or a thread that has just been woken up
+        * and has not blocked yet, because it is racing on the same interlock used here
+        * after the wait.
+        * So there is no chain to update for the new inheritor.
+        *
+        * However, unless the current thread is the old inheritor,
+        * the old inheritor can be blocked and require a chain update.
+        *
+        * The chain should be short because kernel turnstiles cannot have user turnstiles
+        * chained after them.
+        *
+        * We could optimize this by asking the turnstile to tell us
+        * whether the old inheritor needs an update, and drop the lock
+        * only in that case.
+        */
+       turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0);
+
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+
+       turnstile_hash_bucket_lock((uintptr_t)NULL, &index, type);
+
+complete:
+       turnstile_complete((uintptr_t)event, NULL, NULL, type);
+
+       turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0);
+
+       turnstile_cleanup();
+
+       return ret;
+}
+
+static wait_result_t
+sleep_with_inheritor_and_turnstile_type(event_t event,
+    thread_t inheritor,
+    wait_interrupt_t interruptible,
+    uint64_t deadline,
+    turnstile_type_t type,
+    void (^primitive_lock)(void),
+    void (^primitive_unlock)(void))
+{
+       wait_result_t ret;
+       uint32_t index;
+       struct turnstile *ts = NULL;
+
+       /*
+        * the hash bucket spinlock is used as turnstile interlock,
+        * lock it before releasing the primitive lock
+        */
+       turnstile_hash_bucket_lock((uintptr_t)event, &index, type);
+
+       primitive_unlock();
+
+       ts = turnstile_prepare((uintptr_t)event, NULL, TURNSTILE_NULL, type);
+
+       thread_set_pending_block_hint(current_thread(), kThreadWaitSleepWithInheritor);
+       /*
+        * We need TURNSTILE_DELAYED_UPDATE because we will call
+        * waitq_assert_wait64 after.
+        */
+       turnstile_update_inheritor(ts, inheritor, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
+
+       ret = waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(event), interruptible, deadline);
+
+       turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0);
+
+       /*
+        * Update new and old inheritor chains outside the interlock;
+        */
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+
+       if (ret == THREAD_WAITING) {
+               ret = thread_block(THREAD_CONTINUE_NULL);
+       }
+
+       turnstile_hash_bucket_lock((uintptr_t)NULL, &index, type);
+
+       turnstile_complete((uintptr_t)event, NULL, NULL, type);
+
+       turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0);
+
+       turnstile_cleanup();
+
+       primitive_lock();
+
+       return ret;
+}
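The function above depends on taking the hash-bucket interlock before running primitive_unlock(), so no wakeup can slip in between dropping the caller's lock and asserting the wait. The same handoff shape in a minimal userspace sketch, assuming pthreads; primitive, bucket, and fire_event are illustrative names.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t primitive = PTHREAD_MUTEX_INITIALIZER;   /* stands in for the caller's lock */
static pthread_mutex_t bucket = PTHREAD_MUTEX_INITIALIZER;      /* stands in for the bucket spinlock */
static pthread_cond_t  waitq = PTHREAD_COND_INITIALIZER;
static bool            event_fired = false;

/* called with `primitive` held; returns with it held again */
static void
sleep_with_handoff(void)
{
	pthread_mutex_lock(&bucket);            /* take the interlock first...   */
	pthread_mutex_unlock(&primitive);       /* ...then drop the primitive    */
	while (!event_fired) {
		pthread_cond_wait(&waitq, &bucket);
	}
	pthread_mutex_unlock(&bucket);
	pthread_mutex_lock(&primitive);         /* re-acquire, as LCK_SLEEP_DEFAULT does */
}

static void
fire_event(void)
{
	pthread_mutex_lock(&bucket);
	event_fired = true;
	pthread_cond_signal(&waitq);
	pthread_mutex_unlock(&bucket);
}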
+
+kern_return_t
+change_sleep_inheritor_and_turnstile_type(event_t event,
+    thread_t inheritor,
+    turnstile_type_t type)
+{
+       uint32_t index;
+       struct turnstile *ts = NULL;
+       kern_return_t ret = KERN_SUCCESS;
+
+       /*
+        * the hash bucket spinlock is used as turnstile interlock
+        */
+       turnstile_hash_bucket_lock((uintptr_t)event, &index, type);
+
+       ts = turnstile_prepare((uintptr_t)event, NULL, TURNSTILE_NULL, type);
+
+       if (!turnstile_has_waiters(ts)) {
+               ret = KERN_NOT_WAITING;
+       }
+
+       /*
+        * We will not call an assert_wait later so use TURNSTILE_IMMEDIATE_UPDATE
+        */
+       turnstile_update_inheritor(ts, inheritor, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+
+       turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0);
+
+       /*
+        * update the chains outside the interlock
+        */
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+
+       turnstile_hash_bucket_lock((uintptr_t)NULL, &index, type);
+
+       turnstile_complete((uintptr_t)event, NULL, NULL, type);
+
+       turnstile_hash_bucket_unlock((uintptr_t)NULL, &index, type, 0);
+
+       turnstile_cleanup();
+
+       return ret;
+}
+
+typedef void (^void_block_void)(void);
+
+/*
+ * sleep_with_inheritor functions with lck_mtx_t as locking primitive.
+ */
+
+wait_result_t
+lck_mtx_sleep_with_inheritor_and_turnstile_type(lck_mtx_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline, turnstile_type_t type)
+{
+       LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED);
+
+       if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
+               return sleep_with_inheritor_and_turnstile_type(event,
+                          inheritor,
+                          interruptible,
+                          deadline,
+                          type,
+                          ^{;},
+                          ^{lck_mtx_unlock(lock);});
+       } else if (lck_sleep_action & LCK_SLEEP_SPIN) {
+               return sleep_with_inheritor_and_turnstile_type(event,
+                          inheritor,
+                          interruptible,
+                          deadline,
+                          type,
+                          ^{lck_mtx_lock_spin(lock);},
+                          ^{lck_mtx_unlock(lock);});
+       } else if (lck_sleep_action & LCK_SLEEP_SPIN_ALWAYS) {
+               return sleep_with_inheritor_and_turnstile_type(event,
+                          inheritor,
+                          interruptible,
+                          deadline,
+                          type,
+                          ^{lck_mtx_lock_spin_always(lock);},
+                          ^{lck_mtx_unlock(lock);});
+       } else {
+               return sleep_with_inheritor_and_turnstile_type(event,
+                          inheritor,
+                          interruptible,
+                          deadline,
+                          type,
+                          ^{lck_mtx_lock(lock);},
+                          ^{lck_mtx_unlock(lock);});
+       }
+}
+
+/*
+ * Name: lck_spin_sleep_with_inheritor
+ *
+ * Description: deschedule the current thread and wait on the waitq associated with event to be woken up.
+ *              While waiting, the sched priority of the waiting thread will contribute to the push of the event that will
+ *              be directed to the inheritor specified.
+ *              An interruptible mode and deadline can be specified to return earlier from the wait.
+ *
+ * Args:
+ *   Arg1: lck_spin_t lock used to protect the sleep. The lock will be dropped while sleeping and reacquired before returning according to the sleep action specified.
+ *   Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK.
+ *   Arg3: event to wait on.
+ *   Arg4: thread to propagate the event push to.
+ *   Arg5: interruptible flag for wait.
+ *   Arg6: deadline for wait.
+ *
+ * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified.
+ *             Lock will be dropped while waiting.
+ *             The inheritor specified cannot run in user space until another inheritor is specified for the event or a
+ *             wakeup for the event is called.
+ *
+ * Returns: result of the wait.
+ */
+wait_result_t
+lck_spin_sleep_with_inheritor(
+       lck_spin_t *lock,
+       lck_sleep_action_t lck_sleep_action,
+       event_t event,
+       thread_t inheritor,
+       wait_interrupt_t interruptible,
+       uint64_t deadline)
+{
+       if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
+               return sleep_with_inheritor_and_turnstile_type(event, inheritor,
+                          interruptible, deadline, TURNSTILE_SLEEP_INHERITOR,
+                          ^{}, ^{ lck_spin_unlock(lock); });
+       } else {
+               return sleep_with_inheritor_and_turnstile_type(event, inheritor,
+                          interruptible, deadline, TURNSTILE_SLEEP_INHERITOR,
+                          ^{ lck_spin_lock(lock); }, ^{ lck_spin_unlock(lock); });
+       }
+}
+
+/*
+ * Name: lck_mtx_sleep_with_inheritor
+ *
+ * Description: deschedule the current thread and wait on the waitq associated with event to be woken up.
+ *              While waiting, the sched priority of the waiting thread will contribute to the push of the event that will
+ *              be directed to the inheritor specified.
+ *              An interruptible mode and deadline can be specified to return earlier from the wait.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the sleep. The lock will be dropped while sleeping and reacquired before returning according to the sleep action specified.
+ *   Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK, LCK_SLEEP_SPIN, LCK_SLEEP_SPIN_ALWAYS.
+ *   Arg3: event to wait on.
+ *   Arg4: thread to propagate the event push to.
+ *   Arg5: interruptible flag for wait.
+ *   Arg6: deadline for wait.
+ *
+ * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified.
+ *             Lock will be dropped while waiting.
+ *             The inheritor specified cannot run in user space until another inheritor is specified for the event or a
+ *             wakeup for the event is called.
+ *
+ * Returns: result of the wait.
+ */
+wait_result_t
+lck_mtx_sleep_with_inheritor(lck_mtx_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline)
+{
+       return lck_mtx_sleep_with_inheritor_and_turnstile_type(lock, lck_sleep_action, event, inheritor, interruptible, deadline, TURNSTILE_SLEEP_INHERITOR);
+}
+
+/*
+ * sleep_with_inheritor functions with lck_rw_t as locking primitive.
+ */
+
+wait_result_t
+lck_rw_sleep_with_inheritor_and_turnstile_type(lck_rw_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline, turnstile_type_t type)
+{
+       __block lck_rw_type_t lck_rw_type = LCK_RW_TYPE_EXCLUSIVE;
+
+       LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD);
+
+       if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
+               return sleep_with_inheritor_and_turnstile_type(event,
+                          inheritor,
+                          interruptible,
+                          deadline,
+                          type,
+                          ^{;},
+                          ^{lck_rw_type = lck_rw_done(lock);});
+       } else if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
+               return sleep_with_inheritor_and_turnstile_type(event,
+                          inheritor,
+                          interruptible,
+                          deadline,
+                          type,
+                          ^{lck_rw_lock(lock, lck_rw_type);},
+                          ^{lck_rw_type = lck_rw_done(lock);});
+       } else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
+               return sleep_with_inheritor_and_turnstile_type(event,
+                          inheritor,
+                          interruptible,
+                          deadline,
+                          type,
+                          ^{lck_rw_lock_exclusive(lock);},
+                          ^{lck_rw_type = lck_rw_done(lock);});
+       } else {
+               return sleep_with_inheritor_and_turnstile_type(event,
+                          inheritor,
+                          interruptible,
+                          deadline,
+                          type,
+                          ^{lck_rw_lock_shared(lock);},
+                          ^{lck_rw_type = lck_rw_done(lock);});
+       }
+}
+
+/*
+ * Name: lck_rw_sleep_with_inheritor
+ *
+ * Description: deschedule the current thread and wait on the waitq associated with event to be woken up.
+ *              While waiting, the sched priority of the waiting thread will contribute to the push of the event that will
+ *              be directed to the inheritor specified.
+ *              An interruptible mode and deadline can be specified to return earlier from the wait.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the sleep. The lock will be dropped while sleeping and reacquired before returning according to the sleep action specified.
+ *   Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE.
+ *   Arg3: event to wait on.
+ *   Arg4: thread to propagate the event push to.
+ *   Arg5: interruptible flag for wait.
+ *   Arg6: deadline for wait.
+ *
+ * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified.
+ *             Lock will be dropped while waiting.
+ *             The inheritor specified cannot run in user space until another inheritor is specified for the event or a
+ *             wakeup for the event is called.
+ *
+ * Returns: result of the wait.
+ */
+wait_result_t
+lck_rw_sleep_with_inheritor(lck_rw_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline)
+{
+       return lck_rw_sleep_with_inheritor_and_turnstile_type(lock, lck_sleep_action, event, inheritor, interruptible, deadline, TURNSTILE_SLEEP_INHERITOR);
+}
+
+/*
+ * wakeup_with_inheritor functions are independent from the locking primitive.
+ */
+
+/*
+ * Name: wakeup_one_with_inheritor
+ *
+ * Description: wake up one waiter for event if any. The thread woken up will be the one with the highest sched priority waiting on event.
+ *              The push for the event will be transferred from the last inheritor to the woken up thread if LCK_WAKE_DEFAULT is specified.
+ *              If LCK_WAKE_DO_NOT_TRANSFER_PUSH is specified the push will not be transferred.
+ *
+ * Args:
+ *   Arg1: event to wake from.
+ *   Arg2: wait result to pass to the woken up thread.
+ *   Arg3: wake flag. LCK_WAKE_DEFAULT or LCK_WAKE_DO_NOT_TRANSFER_PUSH.
+ *   Arg4: pointer for storing the thread woken up.
+ *
+ * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise.
+ *
+ * Conditions: The new inheritor woken up cannot run in user space until another inheritor is specified for the event or a
+ *             wakeup for the event is called.
+ *             A reference for the woken-up thread is acquired.
+ *             NOTE: this cannot be called from interrupt context.
+ */
+kern_return_t
+wakeup_one_with_inheritor(event_t event, wait_result_t result, lck_wake_action_t action, thread_t *thread_wokenup)
+{
+       return wakeup_with_inheritor_and_turnstile_type(event,
+                  TURNSTILE_SLEEP_INHERITOR,
+                  result,
+                  TRUE,
+                  action,
+                  thread_wokenup);
+}
+
+/*
+ * Name: wakeup_all_with_inheritor
+ *
+ * Description: wake up all waiters waiting for event. The old inheritor will lose the push.
+ *
+ * Args:
+ *   Arg1: event to wake from.
+ *   Arg2: wait result to pass to the woken up threads.
+ *
+ * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise.
+ *
+ * Conditions: NOTE: this cannot be called from interrupt context.
+ */
+kern_return_t
+wakeup_all_with_inheritor(event_t event, wait_result_t result)
+{
+       return wakeup_with_inheritor_and_turnstile_type(event,
+                  TURNSTILE_SLEEP_INHERITOR,
+                  result,
+                  FALSE,
+                  0,
+                  NULL);
+}
+
+/*
+ * change_sleep_inheritor is independent from the locking primitive.
+ */
+
+/*
+ * Name: change_sleep_inheritor
+ *
+ * Description: Redirect the push of the waiting threads of event to the new inheritor specified.
+ *
+ * Args:
+ *   Arg1: event to redirect the push.
+ *   Arg2: new inheritor for event.
+ *
+ * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise.
+ *
+ * Conditions: In case of success, the new inheritor cannot run in user space until another inheritor is specified for the event or a
+ *             wakeup for the event is called.
+ *             NOTE: this cannot be called from interrupt context.
+ */
+kern_return_t
+change_sleep_inheritor(event_t event, thread_t inheritor)
+{
+       return change_sleep_inheritor_and_turnstile_type(event,
+                  inheritor,
+                  TURNSTILE_SLEEP_INHERITOR);
+}
+
+void
+kdp_sleep_with_inheritor_find_owner(struct waitq * waitq, __unused event64_t event, thread_waitinfo_t * waitinfo)
+{
+       assert(waitinfo->wait_type == kThreadWaitSleepWithInheritor);
+       assert(waitq_is_turnstile_queue(waitq));
+       waitinfo->owner = 0;
+       waitinfo->context = 0;
+
+       if (waitq_held(waitq)) {
+               return;
+       }
+
+       struct turnstile *turnstile = waitq_to_turnstile(waitq);
+       assert(turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD);
+       waitinfo->owner = thread_tid(turnstile->ts_inheritor);
+}
+
+typedef void (*void_func_void)(void);
+
+static kern_return_t
+gate_try_close(gate_t *gate)
+{
+       uintptr_t state;
+       thread_t holder;
+       kern_return_t ret;
+       __assert_only bool waiters;
+       thread_t thread = current_thread();
+
+       if (os_atomic_cmpxchg(&gate->gate_data, 0, GATE_THREAD_TO_STATE(thread), acquire)) {
+               return KERN_SUCCESS;
+       }
+
+       gate_ilock(gate);
+       state = ordered_load_gate(gate);
+       holder = GATE_STATE_TO_THREAD(state);
+
+       if (holder == NULL) {
+               waiters = gate_has_waiters(state);
+               assert(waiters == FALSE);
+
+               state = GATE_THREAD_TO_STATE(current_thread());
+               state |= GATE_ILOCK;
+               ordered_store_gate(gate, state);
+               ret = KERN_SUCCESS;
+       } else {
+               if (holder == current_thread()) {
+                       panic("Trying to close a gate already owned by current thread %p", current_thread());
+               }
+               ret = KERN_FAILURE;
+       }
+
+       gate_iunlock(gate);
+       return ret;
+}
+
+static void
+gate_close(gate_t* gate)
+{
+       uintptr_t state;
+       thread_t holder;
+       __assert_only bool waiters;
+       thread_t thread = current_thread();
+
+       if (os_atomic_cmpxchg(&gate->gate_data, 0, GATE_THREAD_TO_STATE(thread), acquire)) {
+               return;
+       }
+
+       gate_ilock(gate);
+       state = ordered_load_gate(gate);
+       holder = GATE_STATE_TO_THREAD(state);
+
+       if (holder != NULL) {
+               panic("Closing a gate already owned by %p from current thread %p", holder, current_thread());
+       }
+
+       waiters = gate_has_waiters(state);
+       assert(waiters == FALSE);
+
+       state = GATE_THREAD_TO_STATE(thread);
+       state |= GATE_ILOCK;
+       ordered_store_gate(gate, state);
+
+       gate_iunlock(gate);
+}
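gate_data packs the holder's thread pointer together with the GATE_ILOCK and GATE_WAITERS flag bits in a single word, which is what lets the uncontended close/open paths above be a single os_atomic_cmpxchg. A sketch of that encoding, with assumed bit values; the real GATE_* macros live in the locks headers and may differ.

#include <assert.h>
#include <stdint.h>

#define GATE_ILOCK_BIT   0x1UL  /* assumed bit value: interlock held */
#define GATE_WAITERS_BIT 0x2UL  /* assumed bit value: threads are waiting */
#define GATE_FLAGS_MASK  (GATE_ILOCK_BIT | GATE_WAITERS_BIT)

/* stand-in for thread_t in this sketch */
typedef struct fake_thread { int id; } fake_thread_t;

static uintptr_t
thread_to_state(fake_thread_t *t)
{
	uintptr_t p = (uintptr_t)t;
	/* works because thread pointers are aligned, so the low bits are free */
	assert((p & GATE_FLAGS_MASK) == 0);
	return p;
}

static fake_thread_t *
state_to_thread(uintptr_t state)
{
	/* mask off the flag bits to recover the holder pointer */
	return (fake_thread_t *)(state & ~GATE_FLAGS_MASK);
}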
+
+static void
+gate_open_turnstile(gate_t *gate)
+{
+       struct turnstile *ts = NULL;
+
+       ts = turnstile_prepare((uintptr_t)gate, &gate->turnstile, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
+       waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(GATE_EVENT(gate)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
+       turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+       turnstile_complete((uintptr_t)gate, &gate->turnstile, NULL, TURNSTILE_KERNEL_MUTEX);
+       /*
+        * We can do the cleanup while holding the interlock.
+        * It is ok because:
+        * 1. current_thread is the previous inheritor and it is running
+        * 2. new inheritor is NULL.
+        * => No chain of turnstiles needs to be updated.
+        */
+       turnstile_cleanup();
+}
+
+static void
+gate_open(gate_t *gate)
+{
+       uintptr_t state;
+       thread_t holder;
+       bool waiters;
+       thread_t thread = current_thread();
+
+       if (os_atomic_cmpxchg(&gate->gate_data, GATE_THREAD_TO_STATE(thread), 0, release)) {
+               return;
+       }
+
+       gate_ilock(gate);
+       state = ordered_load_gate(gate);
+       holder = GATE_STATE_TO_THREAD(state);
+       waiters = gate_has_waiters(state);
+
+       if (holder != thread) {
+               panic("Opening gate owned by %p from current thread %p", holder, thread);
+       }
+
+       if (waiters) {
+               gate_open_turnstile(gate);
+       }
+
+       state = GATE_ILOCK;
+       ordered_store_gate(gate, state);
+
+       gate_iunlock(gate);
+}
+
+static kern_return_t
+gate_handoff_turnstile(gate_t *gate,
+    int flags,
+    thread_t *thread_woken_up,
+    bool *waiters)
+{
+       struct turnstile *ts = NULL;
+       kern_return_t ret = KERN_FAILURE;
+       thread_t hp_thread;
+
+       ts = turnstile_prepare((uintptr_t)gate, &gate->turnstile, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
+       /*
+        * Wake up the higest priority thread waiting on the gate
+        */
+       hp_thread = waitq_wakeup64_identify(&ts->ts_waitq, CAST_EVENT64_T(GATE_EVENT(gate)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
+
+       if (hp_thread != NULL) {
+               /*
+                * In this case waitq_wakeup64_identify has called turnstile_update_inheritor for us
+                */
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+               *thread_woken_up = hp_thread;
+               *waiters = turnstile_has_waiters(ts);
+               /*
+                * Note: hp_thread is the new holder and the new inheritor.
+                * In case there are no more waiters, it doesn't need to be the inheritor,
+                * and it shouldn't be by the time it finishes the wait, so that its next open or
+                * handoff can go through the fast path.
+                * We could set the inheritor to NULL here, or the new holder itself can set it
+                * on its way back from the sleep. In the latter case there are more chances that
+                * new waiters will come by, avoiding the operation altogether.
+                */
+               ret = KERN_SUCCESS;
+       } else {
+               /*
+                * Waiters may have been woken up by an interrupt and still not
+                * have updated gate->waiters, so we couldn't find them on the waitq.
+                * Update the inheritor to NULL here, so that the current thread can return to userspace
+                * independently of when the interrupted waiters finish their wait.
+                */
+               if (flags == GATE_HANDOFF_OPEN_IF_NO_WAITERS) {
+                       turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
+                       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+               }
+               // there are no waiters.
+               ret = KERN_NOT_WAITING;
+       }
+
+       turnstile_complete((uintptr_t)gate, &gate->turnstile, NULL, TURNSTILE_KERNEL_MUTEX);
+
+       /*
+        * We can do the cleanup while holding the interlock.
+        * It is ok because:
+        * 1. current_thread is the previous inheritor and it is running
+        * 2. new inheritor is NULL or it is a just-woken-up thread that will race to acquire the lock
+        *    of the gate before trying to sleep.
+        * => No chain of turnstiles needs to be updated.
+        */
+       turnstile_cleanup();
+
+       return ret;
+}
+
+static kern_return_t
+gate_handoff(gate_t *gate,
+    int flags)
+{
+       kern_return_t ret;
+       thread_t new_holder = NULL;
+       uintptr_t state;
+       thread_t holder;
+       bool waiters;
+       thread_t thread = current_thread();
+
+       assert(flags == GATE_HANDOFF_OPEN_IF_NO_WAITERS || flags == GATE_HANDOFF_DEFAULT);
+
+       if (flags == GATE_HANDOFF_OPEN_IF_NO_WAITERS) {
+               if (os_atomic_cmpxchg(&gate->gate_data, GATE_THREAD_TO_STATE(thread), 0, release)) {
+                       //gate opened but there were no waiters, so return KERN_NOT_WAITING.
+                       return KERN_NOT_WAITING;
+               }
+       }
+
+       gate_ilock(gate);
+       state = ordered_load_gate(gate);
+       holder = GATE_STATE_TO_THREAD(state);
+       waiters = gate_has_waiters(state);
+
+       if (holder != current_thread()) {
+               panic("Handing off gate owned by %p from current thread %p", holder, current_thread());
+       }
+
+       if (waiters) {
+               ret = gate_handoff_turnstile(gate, flags, &new_holder, &waiters);
+               if (ret == KERN_SUCCESS) {
+                       state = GATE_THREAD_TO_STATE(new_holder);
+                       if (waiters) {
+                               state |= GATE_WAITERS;
+                       }
+               } else {
+                       if (flags == GATE_HANDOFF_OPEN_IF_NO_WAITERS) {
+                               state = 0;
+                       }
+               }
+       } else {
+               if (flags == GATE_HANDOFF_OPEN_IF_NO_WAITERS) {
+                       state = 0;
+               }
+               ret = KERN_NOT_WAITING;
+       }
+       state |= GATE_ILOCK;
+       ordered_store_gate(gate, state);
+
+       gate_iunlock(gate);
+
+       if (new_holder) {
+               thread_deallocate(new_holder);
+       }
+       return ret;
+}
+
+static void_func_void
+gate_steal_turnstile(gate_t *gate,
+    thread_t new_inheritor)
+{
+       struct turnstile *ts = NULL;
+
+       ts = turnstile_prepare((uintptr_t)gate, &gate->turnstile, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
+
+       turnstile_update_inheritor(ts, new_inheritor, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+       turnstile_complete((uintptr_t)gate, &gate->turnstile, NULL, TURNSTILE_KERNEL_MUTEX);
+
+       /*
+        * turnstile_cleanup might need to update the chain of the old holder.
+        * This operation should happen without the turnstile interlock held.
+        */
+       return turnstile_cleanup;
+}
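gate_steal_turnstile (like gate_wait_turnstile below) returns turnstile_cleanup as a function pointer so the caller can run it only after dropping the gate interlock. The same deferred-work shape in a minimal portable-C sketch; all names are illustrative.

#include <stdio.h>

typedef void (*void_func_void)(void);

static void
deferred_cleanup(void)
{
	printf("ran after the interlock was dropped\n");
}

/* does its work "under the interlock" and hands back what must run after */
static void_func_void
locked_part(void)
{
	/* ... manipulate state that the interlock protects ... */
	return deferred_cleanup;
}

int
main(void)
{
	void_func_void after_unlock = locked_part();    /* interlock "held" here    */
	/* interlock "dropped" here */
	after_unlock();                                 /* safe to update chains now */
	return 0;
}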
+
+static void
+gate_steal(gate_t *gate)
+{
+       uintptr_t state;
+       thread_t holder;
+       thread_t thread = current_thread();
+       bool waiters;
+
+       void_func_void func_after_interlock_unlock;
+
+       gate_ilock(gate);
+       state = ordered_load_gate(gate);
+       holder = GATE_STATE_TO_THREAD(state);
+       waiters = gate_has_waiters(state);
+
+       assert(holder != NULL);
+       state = GATE_THREAD_TO_STATE(thread) | GATE_ILOCK;
+       if (waiters) {
+               state |= GATE_WAITERS;
+               ordered_store_gate(gate, state);
+               func_after_interlock_unlock = gate_steal_turnstile(gate, thread);
+               gate_iunlock(gate);
+
+               func_after_interlock_unlock();
+       } else {
+               ordered_store_gate(gate, state);
+               gate_iunlock(gate);
+       }
+}
+
+static void_func_void
+gate_wait_turnstile(gate_t *gate,
+    wait_interrupt_t interruptible,
+    uint64_t deadline,
+    thread_t holder,
+    wait_result_t* wait,
+    bool* waiters)
+{
+       struct turnstile *ts;
+       uintptr_t state;
+
+       ts = turnstile_prepare((uintptr_t)gate, &gate->turnstile, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
+
+       turnstile_update_inheritor(ts, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
+       waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(GATE_EVENT(gate)), interruptible, deadline);
+
+       gate_iunlock(gate);
+
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+
+       *wait = thread_block(THREAD_CONTINUE_NULL);
+
+       gate_ilock(gate);
+
+       *waiters = turnstile_has_waiters(ts);
+
+       if (!*waiters) {
+               /*
+                * We want to enable the fast path as soon as we see that there are no more waiters.
+                * On the fast path the holder will not do any turnstile operations.
+                * Set the inheritor as NULL here.
+                *
+                * NOTE: if it was an open operation that woke this thread up, the inheritor has
+                * already been set to NULL.
+                */
+               state = ordered_load_gate(gate);
+               holder = GATE_STATE_TO_THREAD(state);
+               if (holder &&
+                   ((*wait != THREAD_AWAKENED) ||     // thread interrupted or timedout
+                   holder == current_thread())) {     // thread was woken up and it is the new holder
+                       turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
+                       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+               }
+       }
+
+       turnstile_complete((uintptr_t)gate, &gate->turnstile, NULL, TURNSTILE_KERNEL_MUTEX);
+
+       /*
+        * turnstile_cleanup might need to update the chain of the old holder.
+        * This operation should happen without the turnstile primitive interlock held.
+        */
+       return turnstile_cleanup;
+}
+
+static gate_wait_result_t
+gate_wait(gate_t* gate,
+    wait_interrupt_t interruptible,
+    uint64_t deadline,
+    void (^primitive_unlock)(void),
+    void (^primitive_lock)(void))
+{
+       gate_wait_result_t ret;
+       void_func_void func_after_interlock_unlock;
+       wait_result_t wait_result;
+       uintptr_t state;
+       thread_t holder;
+       bool waiters;
+
+       gate_ilock(gate);
+       state = ordered_load_gate(gate);
+       holder = GATE_STATE_TO_THREAD(state);
+
+       if (holder == NULL) {
+               panic("Trying to wait on open gate thread %p gate %p", current_thread(), gate);
+       }
+
+       state |= GATE_WAITERS;
+       ordered_store_gate(gate, state);
+
+       /*
+        * Release the primitive lock before any
+        * turnstile operation. Turnstile
+        * does not support a blocking primitive as
+        * interlock.
+        *
+        * In this way, concurrent threads will be
+        * able to acquire the primitive lock
+        * but will still wait for me through the
+        * gate interlock.
+        */
+       primitive_unlock();
+
+       func_after_interlock_unlock = gate_wait_turnstile(gate,
+           interruptible,
+           deadline,
+           holder,
+           &wait_result,
+           &waiters);
+
+       state = ordered_load_gate(gate);
+       holder = GATE_STATE_TO_THREAD(state);
+
+       switch (wait_result) {
+       case THREAD_INTERRUPTED:
+       case THREAD_TIMED_OUT:
+               assert(holder != current_thread());
+
+               if (waiters) {
+                       state |= GATE_WAITERS;
+               } else {
+                       state &= ~GATE_WAITERS;
+               }
+               ordered_store_gate(gate, state);
+
+               if (wait_result == THREAD_INTERRUPTED) {
+                       ret = GATE_INTERRUPTED;
+               } else {
+                       ret = GATE_TIMED_OUT;
+               }
+               break;
+       default:
+               /*
+                * Note it is possible that even if the gate was handed off to
+                * me, someone called gate_steal() before I woke up.
+                *
+                * It is also possible that the gate was opened, but someone
+                * closed it while I was waking up.
+                *
+                * In both cases we return GATE_OPENED: the gate was opened to me
+                * at one point, and it is the caller's responsibility to check again
+                * whether the gate is open.
+                */
+               if (holder == current_thread()) {
+                       ret = GATE_HANDOFF;
+               } else {
+                       ret = GATE_OPENED;
+               }
+               break;
+       }
+
+       gate_iunlock(gate);
+
+       /*
+        * turnstile func that needs to be executed without
+        * holding the primitive interlock
+        */
+       func_after_interlock_unlock();
+
+       primitive_lock();
+
+       return ret;
+}
+
+static void
+gate_assert(gate_t *gate, int flags)
+{
+       uintptr_t state;
+       thread_t holder;
+
+       gate_ilock(gate);
+       state = ordered_load_gate(gate);
+       holder = GATE_STATE_TO_THREAD(state);
+
+       switch (flags) {
+       case GATE_ASSERT_CLOSED:
+               assert(holder != NULL);
+               break;
+       case GATE_ASSERT_OPEN:
+               assert(holder == NULL);
+               break;
+       case GATE_ASSERT_HELD:
+               assert(holder == current_thread());
+               break;
+       default:
+               panic("invalid %s flag %d", __func__, flags);
+       }
+
+       gate_iunlock(gate);
+}
+
+static void
+gate_init(gate_t *gate)
+{
+       gate->gate_data = 0;
+       gate->turnstile = NULL;
+}
+
+static void
+gate_destroy(__assert_only gate_t *gate)
+{
+       assert(gate->gate_data == 0);
+       assert(gate->turnstile == NULL);
+}
+
+/*
+ * Name: lck_rw_gate_init
+ *
+ * Description: initializes a variable declared with decl_lck_rw_gate_data.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ */
+void
+lck_rw_gate_init(lck_rw_t *lock, gate_t *gate)
+{
+       (void) lock;
+       gate_init(gate);
+}
+
+/*
+ * Name: lck_rw_gate_destroy
+ *
+ * Description: destroys a variable previously initialized.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ */
+void
+lck_rw_gate_destroy(lck_rw_t *lock, gate_t *gate)
+{
+       (void) lock;
+       gate_destroy(gate);
+}
+
+/*
+ * Name: lck_rw_gate_try_close
+ *
+ * Description: Tries to close the gate.
+ *              In case of success the current thread will be set as
+ *              the holder of the gate.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *
+ * Returns:
+ *          KERN_SUCCESS in case the gate was successfully closed. The current thread is the new holder
+ *          of the gate.
+ *          A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on
+ *          to wake up possible waiters on the gate before returning to userspace.
+ *          If the intent is to conditionally probe the gate before waiting, the lock must not be dropped
+ *          between the calls to lck_rw_gate_try_close() and lck_rw_gate_wait().
+ *
+ *          KERN_FAILURE in case the gate was already closed. Will panic if the current thread was already the holder of the gate.
+ *          lck_rw_gate_wait() should be called instead if the intent is to unconditionally wait on this gate.
+ *          The calls to lck_rw_gate_try_close() and lck_rw_gate_wait() should
+ *          be done without dropping the lock that is protecting the gate in between.
+ */
+int
+lck_rw_gate_try_close(__assert_only lck_rw_t *lock, gate_t *gate)
+{
+       LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD);
+
+       return gate_try_close(gate);
+}
+
+/*
+ * Name: lck_rw_gate_close
+ *
+ * Description: Closes the gate. The current thread will be set as
+ *              the holder of the gate. Will panic if the gate is already closed.
+ *              A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on
+ *              to wake up possible waiters on the gate before returning to userspace.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The gate must be open.
+ *
+ */
+void
+lck_rw_gate_close(__assert_only lck_rw_t *lock, gate_t *gate)
+{
+       LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD);
+
+       gate_close(gate);
+}
+
+/*
+ * Name: lck_rw_gate_open
+ *
+ * Description: Opens the gate and wakes up possible waiters.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The current thread must be the holder of the gate.
+ *
+ */
+void
+lck_rw_gate_open(__assert_only lck_rw_t *lock, gate_t *gate)
+{
+       LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD);
+
+       gate_open(gate);
+}
+
+/*
+ * Name: lck_rw_gate_handoff
+ *
+ * Description: Tries to transfer the ownership of the gate. The waiter with highest sched
+ *              priority will be selected as the new holder of the gate, and woken up,
+ *              with the gate remaining in the closed state throughout.
+ *              If no waiters are present, the gate will be kept closed and KERN_NOT_WAITING
+ *              will be returned.
+ *              GATE_HANDOFF_OPEN_IF_NO_WAITERS flag can be used to specify if the gate should be opened in
+ *              case no waiters were found.
+ *
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *   Arg3: flags - GATE_HANDOFF_DEFAULT or GATE_HANDOFF_OPEN_IF_NO_WAITERS
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The current thread must be the holder of the gate.
+ *
+ * Returns:
+ *          KERN_SUCCESS in case one of the waiters became the new holder.
+ *          KERN_NOT_WAITING in case there were no waiters.
+ *
+ */
+kern_return_t
+lck_rw_gate_handoff(__assert_only lck_rw_t *lock, gate_t *gate, int flags)
+{
+       LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD);
+
+       return gate_handoff(gate, flags);
+}
+
+/*
+ * Name: lck_rw_gate_steal
+ *
+ * Description: Steals the ownership of the gate. It sets the current thread as the
+ *              new holder of the gate.
+ *              A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on
+ *              to wake up possible waiters on the gate before returning to userspace.
+ *              NOTE: the previous holder should not call lck_rw_gate_open() or lck_rw_gate_handoff()
+ *              anymore.
+ *
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The gate must be closed and the current thread must not already be the holder.
+ *
+ */
+void
+lck_rw_gate_steal(__assert_only lck_rw_t *lock, gate_t *gate)
+{
+       LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD);
+
+       gate_steal(gate);
+}
+
+/*
+ * Name: lck_rw_gate_wait
+ *
+ * Description: Waits for the current thread to become the holder of the gate or for the
+ *              gate to become open. An interruptible mode and deadline can be specified
+ *              to return earlier from the wait.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *   Arg3: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE.
+ *   Arg4: interruptible flag for wait.
+ *   Arg5: deadline
+ *
+ * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified.
+ *             Lock will be dropped while waiting.
+ *             The gate must be closed.
+ *
+ * Returns: Reason why the thread was woken up.
+ *          GATE_HANDOFF - the current thread was handed off the ownership of the gate.
+ *                         A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on
+ *                         to wake up possible waiters on the gate before returning to userspace.
+ *          GATE_OPENED - the gate was opened by the holder.
+ *          GATE_TIMED_OUT - the thread was woken up by a timeout.
+ *          GATE_INTERRUPTED - the thread was interrupted while sleeping.
+ *
+ */
+gate_wait_result_t
+lck_rw_gate_wait(lck_rw_t *lock, gate_t *gate, lck_sleep_action_t lck_sleep_action, wait_interrupt_t interruptible, uint64_t deadline)
+{
+       __block lck_rw_type_t lck_rw_type = LCK_RW_TYPE_EXCLUSIVE;
+
+       LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD);
+
+       if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
+               return gate_wait(gate,
+                          interruptible,
+                          deadline,
+                          ^{lck_rw_type = lck_rw_done(lock);},
+                          ^{;});
+       } else if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
+               return gate_wait(gate,
+                          interruptible,
+                          deadline,
+                          ^{lck_rw_type = lck_rw_done(lock);},
+                          ^{lck_rw_lock(lock, lck_rw_type);});
+       } else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
+               return gate_wait(gate,
+                          interruptible,
+                          deadline,
+                          ^{lck_rw_type = lck_rw_done(lock);},
+                          ^{lck_rw_lock_exclusive(lock);});
+       } else {
+               return gate_wait(gate,
+                          interruptible,
+                          deadline,
+                          ^{lck_rw_type = lck_rw_done(lock);},
+                          ^{lck_rw_lock_shared(lock);});
+       }
+}
+
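Taken together, lck_rw_gate_try_close(), lck_rw_gate_wait() and lck_rw_gate_open() support the conditional-probe pattern the comments above describe. A minimal sketch, assuming a hypothetical consumer (the struct and its field names are illustrative, and TIMEOUT_WAIT_FOREVER is used as the no-deadline value):

    struct my_obj {
            decl_lck_rw_data(, rwlock);
            decl_lck_rw_gate_data(, gate);
    };

    static void
    my_obj_enter(struct my_obj *obj)
    {
            lck_rw_lock_exclusive(&obj->rwlock);

            while (lck_rw_gate_try_close(&obj->rwlock, &obj->gate) != KERN_SUCCESS) {
                    /* per the contract, the lock is not dropped between try_close and wait */
                    gate_wait_result_t res = lck_rw_gate_wait(&obj->rwlock, &obj->gate,
                        LCK_SLEEP_DEFAULT, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
                    if (res == GATE_HANDOFF) {
                            break;  /* ownership was handed directly to us */
                    }
                    /* GATE_OPENED: the gate was opened, loop and try to close it again */
            }

            /* ... we are now the gate holder ... */

            lck_rw_gate_open(&obj->rwlock, &obj->gate);
            lck_rw_unlock_exclusive(&obj->rwlock);
    }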
+/*
+ * Name: lck_rw_gate_assert
+ *
+ * Description: asserts that the gate is in the specified state.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *   Arg3: flags to specify the assert type.
+ *         GATE_ASSERT_CLOSED - the gate is currently closed
+ *         GATE_ASSERT_OPEN - the gate is currently opened
+ *         GATE_ASSERT_HELD - the gate is currently closed and the current thread is the holder
+ */
+void
+lck_rw_gate_assert(__assert_only lck_rw_t *lock, gate_t *gate, int flags)
+{
+       LCK_RW_ASSERT(lock, LCK_RW_ASSERT_HELD);
+
+       gate_assert(gate, flags);
+       return;
+}
+
+/*
+ * Name: lck_mtx_gate_init
+ *
+ * Description: initializes a variable declared with decl_lck_mtx_gate_data.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ */
+void
+lck_mtx_gate_init(lck_mtx_t *lock, gate_t *gate)
+{
+       (void) lock;
+       gate_init(gate);
+}
+
+/*
+ * Name: lck_mtx_gate_destroy
+ *
+ * Description: destroys a variable previously initialized
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ */
+void
+lck_mtx_gate_destroy(lck_mtx_t *lock, gate_t *gate)
+{
+       (void) lock;
+       gate_destroy(gate);
+}
+
+/*
+ * Name: lck_mtx_gate_try_close
+ *
+ * Description: Tries to close the gate.
+ *              In case of success the current thread will be set as
+ *              the holder of the gate.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *
+ * Returns:
+ *          KERN_SUCCESS in case the gate was successfully closed. The current thread is the new holder
+ *          of the gate.
+ *          A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on
+ *          to wake up possible waiters on the gate before returning to userspace.
+ *          If the intent is to conditionally probe the gate before waiting, the lock must not be dropped
+ *          between the calls to lck_mtx_gate_try_close() and lck_mtx_gate_wait().
+ *
+ *          KERN_FAILURE in case the gate was already closed. Will panic if the current thread was already the holder of the gate.
+ *          lck_mtx_gate_wait() should be called instead if the intent is to unconditionally wait on this gate.
+ *          The calls to lck_mtx_gate_try_close() and lck_mtx_gate_wait() should
+ *          be done without dropping the lock that is protecting the gate in between.
+ */
+int
+lck_mtx_gate_try_close(__assert_only lck_mtx_t *lock, gate_t *gate)
+{
+       LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED);
+
+       return gate_try_close(gate);
+}
+
+/*
+ * Name: lck_mtx_gate_close
+ *
+ * Description: Closes the gate. The current thread will be set as
+ *              the holder of the gate. Will panic if the gate is already closed.
+ *              A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on
+ *              to wake up possible waiters on the gate before returning to userspace.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The gate must be open.
+ *
+ */
 void
-hw_atomic_or_noret(volatile uint32_t *dest, uint32_t mask)
+lck_mtx_gate_close(__assert_only lck_mtx_t *lock, gate_t *gate)
 {
-       ALIGN_TEST(dest, uint32_t);
-       __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t, dest), mask, memory_order_relaxed);
+       LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED);
+
+       gate_close(gate);
+}
+
+/*
+ * Name: lck_mtx_gate_open
+ *
+ * Description: Opens the gate and wakes up possible waiters.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The current thread must be the holder of the gate.
+ *
+ */
+void
+lck_mtx_gate_open(__assert_only lck_mtx_t *lock, gate_t *gate)
+{
+       LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED);
+
+       gate_open(gate);
 }
 
-uint32_t
-hw_atomic_and(volatile uint32_t *dest, uint32_t mask)
+/*
+ * Name: lck_mtx_gate_handoff
+ *
+ * Description: Tries to transfer the ownership of the gate. The waiter with highest sched
+ *              priority will be selected as the new holder of the gate, and woken up,
+ *              with the gate remaining in the closed state throughout.
+ *              If no waiters are present, the gate will be kept closed and KERN_NOT_WAITING
+ *              will be returned.
+ *              GATE_HANDOFF_OPEN_IF_NO_WAITERS flag can be used to specify if the gate should be opened in
+ *              case no waiters were found.
+ *
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *   Arg3: flags - GATE_HANDOFF_DEFAULT or GATE_HANDOFF_OPEN_IF_NO_WAITERS
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The current thread must be the holder of the gate.
+ *
+ * Returns:
+ *          KERN_SUCCESS in case one of the waiters became the new holder.
+ *          KERN_NOT_WAITING in case there were no waiters.
+ *
+ */
+kern_return_t
+lck_mtx_gate_handoff(__assert_only lck_mtx_t *lock, gate_t *gate, int flags)
 {
-       ALIGN_TEST(dest, uint32_t);
-       return __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t, dest), mask, memory_order_relaxed) & mask;
+       LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED);
+
+       return gate_handoff(gate, flags);
 }
 
+/*
+ * Name: lck_mtx_gate_steal
+ *
+ * Description: Steals the ownership of the gate. It sets the current thread as the
+ *              new holder of the gate.
+ *              A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on
+ *              to wake up possible waiters on the gate before returning to userspace.
+ *              NOTE: the previous holder should not call lck_mtx_gate_open() or lck_mtx_gate_handoff()
+ *              anymore.
+ *
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The gate must be closed and the current thread must not already be the holder.
+ *
+ */
 void
-hw_atomic_and_noret(volatile uint32_t *dest, uint32_t mask)
+lck_mtx_gate_steal(__assert_only lck_mtx_t *lock, gate_t *gate)
+{
+       LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED);
+
+       gate_steal(gate);
+}
+
+/*
+ * Name: lck_mtx_gate_wait
+ *
+ * Description: Waits for the current thread to become the holder of the gate or for the
+ *              gate to become open. An interruptible mode and deadline can be specified
+ *              to return earlier from the wait.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *   Arg3: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK, LCK_SLEEP_SPIN, LCK_SLEEP_SPIN_ALWAYS.
+ *   Arg4: interruptible flag for wait.
+ *   Arg5: deadline
+ *
+ * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified.
+ *             Lock will be dropped while waiting.
+ *             The gate must be closed.
+ *
+ * Returns: Reason why the thread was woken up.
+ *          GATE_HANDOFF - the current thread was handed off the ownership of the gate.
+ *                         A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on
+ *                         to wake up possible waiters on the gate before returning to userspace.
+ *          GATE_OPENED - the gate was opened by the holder.
+ *          GATE_TIMED_OUT - the thread was woken up by a timeout.
+ *          GATE_INTERRUPTED - the thread was interrupted while sleeping.
+ *
+ */
+gate_wait_result_t
+lck_mtx_gate_wait(lck_mtx_t *lock, gate_t *gate, lck_sleep_action_t lck_sleep_action, wait_interrupt_t interruptible, uint64_t deadline)
 {
-       ALIGN_TEST(dest, uint32_t);
-       __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t, dest), mask, memory_order_relaxed);
+       LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED);
+
+       if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
+               return gate_wait(gate,
+                          interruptible,
+                          deadline,
+                          ^{lck_mtx_unlock(lock);},
+                          ^{;});
+       } else if (lck_sleep_action & LCK_SLEEP_SPIN) {
+               return gate_wait(gate,
+                          interruptible,
+                          deadline,
+                          ^{lck_mtx_unlock(lock);},
+                          ^{lck_mtx_lock_spin(lock);});
+       } else if (lck_sleep_action & LCK_SLEEP_SPIN_ALWAYS) {
+               return gate_wait(gate,
+                          interruptible,
+                          deadline,
+                          ^{lck_mtx_unlock(lock);},
+                          ^{lck_mtx_lock_spin_always(lock);});
+       } else {
+               return gate_wait(gate,
+                          interruptible,
+                          deadline,
+                          ^{lck_mtx_unlock(lock);},
+                          ^{lck_mtx_lock(lock);});
+       }
 }
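On the release side, a holder can hand the closed gate straight to the highest-priority waiter instead of opening it. A hedged sketch of a mutex-protected variant of the earlier example (the field names remain illustrative):

    static void
    my_obj_exit(struct my_obj *obj)
    {
            lck_mtx_lock(&obj->mutex);
            lck_mtx_gate_assert(&obj->mutex, &obj->gate, GATE_ASSERT_HELD);

            /* transfer ownership if anyone is waiting, otherwise open the gate */
            kern_return_t kr = lck_mtx_gate_handoff(&obj->mutex, &obj->gate,
                GATE_HANDOFF_OPEN_IF_NO_WAITERS);
            assert(kr == KERN_SUCCESS || kr == KERN_NOT_WAITING);

            lck_mtx_unlock(&obj->mutex);
    }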
 
-uint32_t
-hw_compare_and_store(uint32_t oldval, uint32_t newval, volatile uint32_t *dest)
+/*
+ * Name: lck_mtx_gate_assert
+ *
+ * Description: asserts that the gate is in the specified state.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *   Arg3: flags to specify the assert type.
+ *         GATE_ASSERT_CLOSED - the gate is currently closed
+ *         GATE_ASSERT_OPEN - the gate is currently opened
+ *         GATE_ASSERT_HELD - the gate is currently closed and the current thread is the holder
+ */
+void
+lck_mtx_gate_assert(__assert_only lck_mtx_t *lock, gate_t *gate, int flags)
 {
-       ALIGN_TEST(dest, uint32_t);
-       return __c11_atomic_compare_exchange_strong(ATOMIC_CAST(uint32_t, dest), &oldval, newval,
-                  memory_order_acq_rel_smp, memory_order_relaxed);
+       LCK_MTX_ASSERT(lock, LCK_MTX_ASSERT_OWNED);
+
+       gate_assert(gate, flags);
 }
index dd5f3a54af87ba19329e6af6750865062c15f2e5..51c1da4c13d061d7f4bf825bc00f6b8a63e28dad 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -58,6 +58,10 @@ typedef unsigned int            lck_sleep_action_t;
 
 #define LCK_SLEEP_MASK          0x3f    /* Valid actions */
 
+typedef unsigned int            lck_wake_action_t;
+
+#define LCK_WAKE_DEFAULT                0x00 /* If waiters are present, transfer their push to the woken-up thread */
+#define LCK_WAKE_DO_NOT_TRANSFER_PUSH   0x01 /* Do not transfer the waiters' push when waking up */
 
 #ifdef  MACH_KERNEL_PRIVATE
 typedef struct _lck_attr_ {
@@ -75,7 +79,7 @@ extern lck_attr_t      LockDefaultLckAttr;
 typedef struct __lck_attr__ lck_attr_t;
 #endif
 
-#define LCK_ATTR_NULL (lck_attr_t *)0
+#define LCK_ATTR_NULL (lck_attr_t *)NULL
 
 __BEGIN_DECLS
 
@@ -118,7 +122,7 @@ extern  void                    lck_attr_rw_shared_priority(
 extern  void                    lck_attr_free(
        lck_attr_t              *attr);
 
-#define decl_lck_spin_data(class, name)     class lck_spin_t name;
+#define decl_lck_spin_data(class, name)     class lck_spin_t name
 
 extern lck_spin_t               *lck_spin_alloc_init(
        lck_grp_t               *grp,
@@ -191,7 +195,7 @@ extern void lck_mtx_init_ext(lck_mtx_t *lck, struct _lck_mtx_ext_ *lck_ext,
 #endif
 
 
-#define decl_lck_mtx_data(class, name)     class lck_mtx_t name;
+#define decl_lck_mtx_data(class, name)     class lck_mtx_t name
 
 extern lck_mtx_t                *lck_mtx_alloc_init(
        lck_grp_t               *grp,
@@ -227,16 +231,594 @@ extern wait_result_t    lck_mtx_sleep_deadline(
        event_t                         event,
        wait_interrupt_t        interruptible,
        uint64_t                        deadline);
+
+#ifdef KERNEL_PRIVATE
+/*
+ * Name: lck_spin_sleep_with_inheritor
+ *
+ * Description: deschedule the current thread and wait on the waitq associated with event to be woken up.
+ *              While waiting, the sched priority of the waiting thread will contribute to the push of the event that will
+ *              be directed to the inheritor specified.
+ *              An interruptible mode and deadline can be specified to return earlier from the wait.
+ *
+ * Args:
+ *   Arg1: lck_spin_t lock used to protect the sleep. The lock will be dropped while sleeping and reacquired before returning according to the sleep action specified.
+ *   Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK.
+ *   Arg3: event to wait on.
+ *   Arg4: thread to propagate the event push to.
+ *   Arg5: interruptible flag for wait.
+ *   Arg6: deadline for wait.
+ *
+ * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified.
+ *             Lock will be dropped while waiting.
+ *             The inheritor specified cannot run in user space until another inheritor is specified for the event or a
+ *             wakeup for the event is called.
+ *
+ * Returns: result of the wait.
+ */
+extern wait_result_t lck_spin_sleep_with_inheritor(lck_spin_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline);
+
+/*
+ * Name: lck_mtx_sleep_with_inheritor
+ *
+ * Description: deschedule the current thread and wait on the waitq associated with event to be woken up.
+ *              While waiting, the sched priority of the waiting thread will contribute to the push of the event that will
+ *              be directed to the inheritor specified.
+ *              An interruptible mode and deadline can be specified to return earlier from the wait.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the sleep. The lock will be dropped while sleeping and reacquired before returning according to the sleep action specified.
+ *   Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK, LCK_SLEEP_SPIN, LCK_SLEEP_SPIN_ALWAYS.
+ *   Arg3: event to wait on.
+ *   Arg4: thread to propagate the event push to.
+ *   Arg5: interruptible flag for wait.
+ *   Arg6: deadline for wait.
+ *
+ * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified.
+ *             Lock will be dropped while waiting.
+ *             The inheritor specified cannot run in user space until another inheritor is specified for the event or a
+ *             wakeup for the event is called.
+ *
+ * Returns: result of the wait.
+ */
+extern wait_result_t lck_mtx_sleep_with_inheritor(lck_mtx_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline);
+
+/*
+ * Name: lck_rw_sleep_with_inheritor
+ *
+ * Description: deschedule the current thread and wait on the waitq associated with event to be woken up.
+ *              While waiting, the sched priority of the waiting thread will contribute to the push of the event that will
+ *              be directed to the inheritor specified.
+ *              An interruptible mode and deadline can be specified to return earlier from the wait.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the sleep. The lock will be dropped while sleeping and reacquired before returning according to the sleep action specified.
+ *   Arg2: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE.
+ *   Arg3: event to wait on.
+ *   Arg4: thread to propagate the event push to.
+ *   Arg5: interruptible flag for wait.
+ *   Arg6: deadline for wait.
+ *
+ * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified.
+ *             Lock will be dropped while waiting.
+ *             The inheritor specified cannot run in user space until another inheritor is specified for the event or a
+ *             wakeup for the event is called.
+ *
+ * Returns: result of the wait.
+ */
+extern wait_result_t lck_rw_sleep_with_inheritor(lck_rw_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline);
+
+/*
+ * Name: wakeup_one_with_inheritor
+ *
+ * Description: wake up one waiter for event, if any. The thread woken up will be the one with the highest sched priority waiting on event.
+ *              The push for the event will be transferred from the last inheritor to the woken up thread.
+ *
+ * Args:
+ *   Arg1: event to wake from.
+ *   Arg2: wait result to pass to the woken up thread.
+ *   Arg3: wake action. LCK_WAKE_DEFAULT or LCK_WAKE_DO_NOT_TRANSFER_PUSH.
+ *   Arg4: pointer for storing the woken-up thread.
+ *
+ * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise.
+ *
+ * Conditions: The newly woken-up inheritor cannot run in user space until another inheritor is specified for the event or a
+ *             wakeup for the event is called.
+ *             A reference for the woken-up thread is acquired.
+ *             NOTE: this cannot be called from interrupt context.
+ */
+extern kern_return_t wakeup_one_with_inheritor(event_t event, wait_result_t result, lck_wake_action_t action, thread_t *thread_wokenup);
+
+/*
+ * Name: wakeup_all_with_inheritor
+ *
+ * Description: wake up all waiters waiting for event. The old inheritor will lose the push.
+ *
+ * Args:
+ *   Arg1: event to wake from.
+ *   Arg2: wait result to pass to the woken up threads.
+ *
+ * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise.
+ *
+ * Conditions: NOTE: this cannot be called from interrupt context.
+ */
+extern kern_return_t wakeup_all_with_inheritor(event_t event, wait_result_t result);
+
+/*
+ * Name: change_sleep_inheritor
+ *
+ * Description: Redirect the push of the waiting threads of event to the new inheritor specified.
+ *
+ * Args:
+ *   Arg1: event to redirect the push.
+ *   Arg2: new inheritor for event.
+ *
+ * Returns: KERN_NOT_WAITING if no threads were waiting, KERN_SUCCESS otherwise.
+ *
+ * Conditions: In case of success, the new inheritor cannot run in user space until another inheritor is specified for the event or a
+ *             wakeup for the event is called.
+ *             NOTE: this cannot be called from interrupt context.
+ */
+extern kern_return_t change_sleep_inheritor(event_t event, thread_t inheritor);
+
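A hedged sketch of the inheritor API above, guarding a mutex-protected busy flag (the struct fields and the ownership bookkeeping are illustrative):

    /* waiter side: sleep on the event, pushing on the current owner */
    lck_mtx_lock(&obj->mutex);
    while (obj->busy) {
            lck_mtx_sleep_with_inheritor(&obj->mutex, LCK_SLEEP_DEFAULT,
                (event_t)obj, obj->owner, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
    }
    obj->busy = true;
    obj->owner = current_thread();
    lck_mtx_unlock(&obj->mutex);

    /* owner side: clear the flag and transfer the push to one woken waiter */
    lck_mtx_lock(&obj->mutex);
    obj->busy = false;
    thread_t woken = THREAD_NULL;
    if (wakeup_one_with_inheritor((event_t)obj, THREAD_AWAKENED,
        LCK_WAKE_DEFAULT, &woken) == KERN_SUCCESS) {
            obj->owner = woken;
            thread_deallocate(woken);  /* drop the reference acquired for the caller */
    }
    lck_mtx_unlock(&obj->mutex);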
+/*
+ * gate structure
+ */
+typedef struct gate {
+       uintptr_t gate_data; // thread holder, interlock bit and waiter bit
+       struct turnstile *turnstile; // protected by the interlock bit
+} gate_t;
+
+#define GATE_ILOCK_BIT   0
+#define GATE_WAITERS_BIT 1
+
+#define GATE_ILOCK (1 << GATE_ILOCK_BIT)
+#define GATE_WAITERS (1 << GATE_WAITERS_BIT)
+
+#define gate_ilock(gate) hw_lock_bit((hw_lock_bit_t*)(&(gate)->gate_data), GATE_ILOCK_BIT, LCK_GRP_NULL)
+#define gate_iunlock(gate) hw_unlock_bit((hw_lock_bit_t*)(&(gate)->gate_data), GATE_ILOCK_BIT)
+#define gate_has_waiters(state) ((state & GATE_WAITERS) != 0)
+#define ordered_load_gate(gate) os_atomic_load(&(gate)->gate_data, compiler_acq_rel)
+#define ordered_store_gate(gate, value)  os_atomic_store(&(gate)->gate_data, value, compiler_acq_rel)
+
+#define GATE_THREAD_MASK (~(uintptr_t)(GATE_ILOCK | GATE_WAITERS))
+#define GATE_STATE_TO_THREAD(state) (thread_t)(state & GATE_THREAD_MASK)
+#define GATE_THREAD_TO_STATE(thread) ((uintptr_t)thread)
+
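thread_t allocations are pointer-aligned, so the two low bits of gate_data are free to carry the interlock and waiters flags alongside the holder. A sketch of how a loaded state word decodes with the macros above:

    uintptr_t state  = ordered_load_gate(gate);
    thread_t  holder = GATE_STATE_TO_THREAD(state);  /* masks off the two low bits */

    if (holder == THREAD_NULL) {
            /* gate is open */
    } else if (gate_has_waiters(state)) {
            /* gate is closed and at least one thread is parked on the turnstile */
    }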
+/*
+ * Possible gate_wait_result_t values.
+ */
+typedef int gate_wait_result_t;
+#define GATE_HANDOFF       0
+#define GATE_OPENED        1
+#define GATE_TIMED_OUT     2
+#define GATE_INTERRUPTED   3
+
+/*
+ * Gate flags used by gate_assert
+ */
+#define GATE_ASSERT_CLOSED         0
+#define GATE_ASSERT_OPEN           1
+#define GATE_ASSERT_HELD           2
+
+/*
+ * Gate flags used by gate_handoff
+ */
+#define GATE_HANDOFF_DEFAULT                    0
+#define GATE_HANDOFF_OPEN_IF_NO_WAITERS         1
+
+#define GATE_EVENT(gate)     ((event_t) gate)
+#define EVENT_TO_GATE(event) ((gate_t *) event)
+
+/*
+ * Name: decl_lck_rw_gate_data
+ *
+ * Description: declares a gate variable with specified storage class.
+ *              The gate itself will be stored in this variable and it is the caller's responsibility
+ *              to ensure that this variable's memory is going to be accessible by all threads that will use
+ *              the gate.
+ *              Every gate function will require a pointer to this variable as a parameter. The same pointer should
+ *              be used in every thread.
+ *
+ *              The variable needs to be initialized once with lck_rw_gate_init() and destroyed once with
+ *              lck_rw_gate_destroy() when not needed anymore.
+ *
+ *              The gate will be used in conjunction with a lck_rw_t.
+ *
+ * Args:
+ *   Arg1: storage class.
+ *   Arg2: variable name.
+ */
+#define decl_lck_rw_gate_data(class, name)                              class gate_t name
+
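For example, a gate embedded next to the lck_rw_t that protects it (a sketch; note the explicit semicolons, which the decl_* macro change in this diff now requires):

    struct my_obj {
            decl_lck_rw_data(, rwlock);      /* the lock protecting the gate */
            decl_lck_rw_gate_data(, gate);   /* gate stored alongside it */
    };

    void
    my_obj_setup(struct my_obj *obj, lck_grp_t *grp, lck_attr_t *attr)
    {
            lck_rw_init(&obj->rwlock, grp, attr);
            lck_rw_gate_init(&obj->rwlock, &obj->gate);
    }

    void
    my_obj_teardown(struct my_obj *obj, lck_grp_t *grp)
    {
            lck_rw_gate_destroy(&obj->rwlock, &obj->gate);
            lck_rw_destroy(&obj->rwlock, grp);
    }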
+/*
+ * Name: lck_rw_gate_init
+ *
+ * Description: initializes a variable declared with decl_lck_rw_gate_data.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ */
+extern void lck_rw_gate_init(lck_rw_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_rw_gate_destroy
+ *
+ * Description: destroys a variable previously initialized.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ */
+extern void lck_rw_gate_destroy(lck_rw_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_rw_gate_try_close
+ *
+ * Description: Tries to close the gate.
+ *              In case of success the current thread will be set as
+ *              the holder of the gate.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *
+ * Returns:
+ *          KERN_SUCCESS in case the gate was successfully closed. The current thread is the new holder
+ *          of the gate.
+ *          A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on
+ *          to wake up possible waiters on the gate before returning to userspace.
+ *          If the intent is to conditionally probe the gate before waiting, the lock must not be dropped
+ *          between the calls to lck_rw_gate_try_close() and lck_rw_gate_wait().
+ *
+ *          KERN_FAILURE in case the gate was already closed. Will panic if the current thread was already the holder of the gate.
+ *          lck_rw_gate_wait() should be called instead if the intent is to unconditionally wait on this gate.
+ *          The calls to lck_rw_gate_try_close() and lck_rw_gate_wait() should
+ *          be done without dropping the lock that is protecting the gate in between.
+ */
+extern kern_return_t lck_rw_gate_try_close(lck_rw_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_rw_gate_close
+ *
+ * Description: Closes the gate. The current thread will be set as
+ *              the holder of the gate. Will panic if the gate is already closed.
+ *              A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on
+ *              to wake up possible waiters on the gate before returning to userspace.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The gate must be open.
+ *
+ */
+extern void lck_rw_gate_close(lck_rw_t *lock, gate_t *gate);
+
+
+/*
+ * Name: lck_rw_gate_open
+ *
+ * Description: Opens the gate and wakes up possible waiters.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The current thread must be the holder of the gate.
+ *
+ */
+extern void lck_rw_gate_open(lck_rw_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_rw_gate_handoff
+ *
+ * Description: Tries to transfer the ownership of the gate. The waiter with highest sched
+ *              priority will be selected as the new holder of the gate, and woken up,
+ *              with the gate remaining in the closed state throughout.
+ *              If no waiters are present, the gate will be kept closed and KERN_NOT_WAITING
+ *              will be returned.
+ *              GATE_HANDOFF_OPEN_IF_NO_WAITERS flag can be used to specify if the gate should be opened in
+ *              case no waiters were found.
+ *
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *   Arg3: flags - GATE_HANDOFF_DEFAULT or GATE_HANDOFF_OPEN_IF_NO_WAITERS
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The current thread must be the holder of the gate.
+ *
+ * Returns:
+ *          KERN_SUCCESS in case one of the waiters became the new holder.
+ *          KERN_NOT_WAITING in case there were no waiters.
+ *
+ */
+extern kern_return_t lck_rw_gate_handoff(lck_rw_t *lock, gate_t *gate, int flags);
+
+/*
+ * Name: lck_rw_gate_steal
+ *
+ * Description: Steals the ownership of the gate. It sets the current thread as the
+ *              new holder of the gate.
+ *              A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on
+ *              to wake up possible waiters on the gate before returning to userspace.
+ *              NOTE: the previous holder should not call lck_rw_gate_open() or lck_rw_gate_handoff()
+ *              anymore.
+ *
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The gate must be closed and the current thread must not already be the holder.
+ *
+ */
+extern void lck_rw_gate_steal(lck_rw_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_rw_gate_wait
+ *
+ * Description: Waits for the current thread to become the holder of the gate or for the
+ *              gate to become open. An interruptible mode and deadline can be specified
+ *              to return earlier from the wait.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *   Arg3: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE.
+ *   Arg4: interruptible flag for wait.
+ *   Arg5: deadline
+ *
+ * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified.
+ *             Lock will be dropped while waiting.
+ *             The gate must be closed.
+ *
+ * Returns: Reason why the thread was woken up.
+ *          GATE_HANDOFF - the current thread was handed off the ownership of the gate.
+ *                         A matching lck_rw_gate_open() or lck_rw_gate_handoff() needs to be called later on
+ *                         to wake up possible waiters on the gate before returning to userspace.
+ *          GATE_OPENED - the gate was opened by the holder.
+ *          GATE_TIMED_OUT - the thread was woken up by a timeout.
+ *          GATE_INTERRUPTED - the thread was interrupted while sleeping.
+ *
+ */
+extern gate_wait_result_t lck_rw_gate_wait(lck_rw_t *lock, gate_t *gate, lck_sleep_action_t lck_sleep_action, wait_interrupt_t interruptible, uint64_t deadline);
+
+/*
+ * Name: lck_rw_gate_assert
+ *
+ * Description: asserts that the gate is in the specified state.
+ *
+ * Args:
+ *   Arg1: lck_rw_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_rw_gate_data.
+ *   Arg3: flags to specify the assert type.
+ *         GATE_ASSERT_CLOSED - the gate is currently closed
+ *         GATE_ASSERT_OPEN - the gate is currently opened
+ *         GATE_ASSERT_HELD - the gate is currently closed and the current thread is the holder
+ */
+extern void lck_rw_gate_assert(lck_rw_t *lock, gate_t *gate, int flags);
+
+/*
+ * Name: decl_lck_mtx_gate_data
+ *
+ * Description: declares a gate variable with specified storage class.
+ *              The gate itself will be stored in this variable and it is the caller's responsibility
+ *              to ensure that this variable's memory is going to be accessible by all threads that will use
+ *              the gate.
+ *              Every gate function will require a pointer to this variable as a parameter. The same pointer should
+ *              be used in every thread.
+ *
+ *              The variable needs to be initialized once with lck_mtx_gate_init() and destroyed once with
+ *              lck_mtx_gate_destroy() when not needed anymore.
+ *
+ *              The gate will be used in conjunction with a lck_mtx_t.
+ *
+ * Args:
+ *   Arg1: storage class.
+ *   Arg2: variable name.
+ */
+#define decl_lck_mtx_gate_data(class, name)                             class gate_t name
+
+/*
+ * Name: lck_mtx_gate_init
+ *
+ * Description: initializes a variable declared with decl_lck_mtx_gate_data.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ */
+extern void lck_mtx_gate_init(lck_mtx_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_mtx_gate_destroy
+ *
+ * Description: destroys a variable previously initialized
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ */
+extern void lck_mtx_gate_destroy(lck_mtx_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_mtx_gate_try_close
+ *
+ * Description: Tries to close the gate.
+ *              In case of success the current thread will be set as
+ *              the holder of the gate.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *
+ * Returns:
+ *          KERN_SUCCESS in case the gate was successfully closed. The current thread is the new holder
+ *          of the gate.
+ *          A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on
+ *          to wake up possible waiters on the gate before returning to userspace.
+ *          If the intent is to conditionally probe the gate before waiting, the lock must not be dropped
+ *          between the calls to lck_mtx_gate_try_close() and lck_mtx_gate_wait().
+ *
+ *          KERN_FAILURE in case the gate was already closed. Will panic if the current thread was already the holder of the gate.
+ *          lck_mtx_gate_wait() should be called instead if the intent is to unconditionally wait on this gate.
+ *          The calls to lck_mtx_gate_try_close() and lck_mtx_gate_wait() should
+ *          be done without dropping the lock that is protecting the gate in between.
+ */
+extern kern_return_t lck_mtx_gate_try_close(lck_mtx_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_mtx_gate_close
+ *
+ * Description: Closes the gate. The current thread will be set as
+ *              the holder of the gate. Will panic if the gate is already closed.
+ *              A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on
+ *              to wake up possible waiters on the gate before returning to userspace.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The gate must be open.
+ *
+ */
+extern void lck_mtx_gate_close(lck_mtx_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_mtx_gate_open
+ *
+ * Description: Opens the gate and wakes up possible waiters.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The current thread must be the holder of the gate.
+ *
+ */
+extern void lck_mtx_gate_open(lck_mtx_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_mtx_gate_handoff
+ *
+ * Description: Tries to transfer the ownership of the gate. The waiter with highest sched
+ *              priority will be selected as the new holder of the gate, and woken up,
+ *              with the gate remaining in the closed state throughout.
+ *              If no waiters are present, the gate will be kept closed and KERN_NOT_WAITING
+ *              will be returned.
+ *              GATE_HANDOFF_OPEN_IF_NO_WAITERS flag can be used to specify if the gate should be opened in
+ *              case no waiters were found.
+ *
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *   Arg3: flags - GATE_HANDOFF_DEFAULT or GATE_HANDOFF_OPEN_IF_NO_WAITERS
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The current thread must be the holder of the gate.
+ *
+ * Returns:
+ *          KERN_SUCCESS in case one of the waiters became the new holder.
+ *          KERN_NOT_WAITING in case there were no waiters.
+ *
+ */
+extern kern_return_t lck_mtx_gate_handoff(lck_mtx_t *lock, gate_t *gate, int flags);
+
+/*
+ * Name: lck_mtx_gate_steal
+ *
+ * Description: Steals the ownership of the gate. It sets the current thread as the
+ *              new holder of the gate.
+ *              A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on
+ *              to wake up possible waiters on the gate before returning to userspace.
+ *              NOTE: the previous holder should not call lck_mtx_gate_open() or lck_mtx_gate_handoff()
+ *              anymore.
+ *
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *
+ * Conditions: Lock must be held. Returns with the lock held.
+ *             The gate must be closed and the current thread must not already be the holder.
+ *
+ */
+extern void lck_mtx_gate_steal(lck_mtx_t *lock, gate_t *gate);
+
+/*
+ * Name: lck_mtx_gate_wait
+ *
+ * Description: Waits for the current thread to become the holder of the gate or for the
+ *              gate to become open. An interruptible mode and deadline can be specified
+ *              to return earlier from the wait.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *   Arg3: sleep action. LCK_SLEEP_DEFAULT, LCK_SLEEP_UNLOCK, LCK_SLEEP_SPIN, LCK_SLEEP_SPIN_ALWAYS.
+ *   Arg4: interruptible flag for wait.
+ *   Arg5: deadline
+ *
+ * Conditions: Lock must be held. Returns with the lock held according to the sleep action specified.
+ *             Lock will be dropped while waiting.
+ *             The gate must be closed.
+ *
+ * Returns: Reason why the thread was woken up.
+ *          GATE_HANDOFF - the current thread was handed off the ownership of the gate.
+ *                         A matching lck_mtx_gate_open() or lck_mtx_gate_handoff() needs to be called later on
+ *                         to wake up possible waiters on the gate before returning to userspace.
+ *          GATE_OPENED - the gate was opened by the holder.
+ *          GATE_TIMED_OUT - the thread was woken up by a timeout.
+ *          GATE_INTERRUPTED - the thread was interrupted while sleeping.
+ *
+ */
+extern gate_wait_result_t lck_mtx_gate_wait(lck_mtx_t *lock, gate_t *gate, lck_sleep_action_t lck_sleep_action, wait_interrupt_t interruptible, uint64_t deadline);
+
+/*
+ * Name: lck_mtx_gate_assert
+ *
+ * Description: asserts that the gate is in the specified state.
+ *
+ * Args:
+ *   Arg1: lck_mtx_t lock used to protect the gate.
+ *   Arg2: pointer to the gate data declared with decl_lck_mtx_gate_data.
+ *   Arg3: flags to specify the assert type.
+ *         GATE_ASSERT_CLOSED - the gate is currently closed
+ *         GATE_ASSERT_OPEN - the gate is currently opened
+ *         GATE_ASSERT_HELD - the gate is currently closed and the current thread is the holder
+ */
+extern void lck_mtx_gate_assert(lck_mtx_t *lock, gate_t *gate, int flags);
+
+
+#endif //KERNEL_PRIVATE
+
 #if DEVELOPMENT || DEBUG
+#define FULL_CONTENDED 0
+#define HALF_CONTENDED 1
+#define MAX_CONTENDED  2
+
 extern void             erase_all_test_mtx_stats(void);
 extern int              get_test_mtx_stats_string(char* buffer, int buffer_size);
 extern void             lck_mtx_test_init(void);
 extern void             lck_mtx_test_lock(void);
 extern void             lck_mtx_test_unlock(void);
 extern int              lck_mtx_test_mtx_uncontended(int iter, char* buffer, int buffer_size);
-extern int              lck_mtx_test_mtx_contended(int iter, char* buffer, int buffer_size);
+extern int              lck_mtx_test_mtx_contended(int iter, char* buffer, int buffer_size, int type);
 extern int              lck_mtx_test_mtx_uncontended_loop_time(int iter, char* buffer, int buffer_size);
-extern int              lck_mtx_test_mtx_contended_loop_time(int iter, char* buffer, int buffer_size);
+extern int              lck_mtx_test_mtx_contended_loop_time(int iter, char* buffer, int buffer_size, int type);
 #endif
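A hedged sketch of driving the extended test hooks with the new contention-type parameter (the iteration count and buffer size are illustrative):

    char buf[1024];

    lck_mtx_test_init();
    (void)lck_mtx_test_mtx_contended(100000, buf, sizeof(buf), FULL_CONTENDED);
    (void)lck_mtx_test_mtx_contended(100000, buf, sizeof(buf), HALF_CONTENDED);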
 #ifdef  KERNEL_PRIVATE
 
@@ -310,14 +892,17 @@ __END_DECLS
 #define LCK_MTX_ASSERT_NOTOWNED LCK_ASSERT_NOTOWNED
 
 #ifdef  MACH_KERNEL_PRIVATE
+struct turnstile;
 extern void                             lck_mtx_lock_wait(
        lck_mtx_t               *lck,
-       thread_t                holder);
+       thread_t                holder,
+       struct turnstile        **ts);
 
 extern int                              lck_mtx_lock_acquire(
-       lck_mtx_t               *lck);
+       lck_mtx_t               *lck,
+       struct turnstile        *ts);
 
-extern void                             lck_mtx_unlock_wakeup(
+extern  boolean_t                            lck_mtx_unlock_wakeup(
        lck_mtx_t               *lck,
        thread_t                holder);
 
@@ -331,7 +916,7 @@ extern void lck_mtx_wakeup_adjust_pri(thread_t thread, integer_t priority);
 
 #endif
 
-#define decl_lck_rw_data(class, name)     class lck_rw_t name;
+#define decl_lck_rw_data(class, name)     class lck_rw_t name
 
 typedef unsigned int     lck_rw_type_t;
 
index 9b1e47c72a554a6244317829d9c5106ebb611dec..bc05894c3131601ca898e06454babd52f8110c86 100644 (file)
@@ -85,15 +85,15 @@ struct lt_elem {
 };
 
 /* reference count bits should _always_ be the low-order bits */
-#define LT_BITS_REFCNT_MASK  (0x1FFFFFFF)
+#define LT_BITS_REFCNT_MASK  (0x1FFFFFFFU)
 #define LT_BITS_REFCNT_SHIFT (0)
 #define LT_BITS_REFCNT       (LT_BITS_REFCNT_MASK << LT_BITS_REFCNT_SHIFT)
 
-#define LT_BITS_TYPE_MASK    (0x3)
+#define LT_BITS_TYPE_MASK    (0x3U)
 #define LT_BITS_TYPE_SHIFT   (29)
 #define LT_BITS_TYPE         (LT_BITS_TYPE_MASK << LT_BITS_TYPE_SHIFT)
 
-#define LT_BITS_VALID_MASK   (0x1)
+#define LT_BITS_VALID_MASK   (0x1U)
 #define LT_BITS_VALID_SHIFT  (31)
 #define LT_BITS_VALID        (LT_BITS_VALID_MASK << LT_BITS_VALID_SHIFT)
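The new U suffixes are load-bearing: LT_BITS_VALID_MASK is shifted to bit 31, and 0x1 << 31 with a plain signed literal is undefined behavior in C, while 0x1U << 31 is well defined. A sketch of the field accesses these masks support (the accessor names are hypothetical):

    static inline uint32_t
    lt_bits_type(uint32_t bits)
    {
            return (bits >> LT_BITS_TYPE_SHIFT) & LT_BITS_TYPE_MASK;
    }

    static inline bool
    lt_bits_valid(uint32_t bits)
    {
            return (bits & LT_BITS_VALID) != 0;  /* 0x1U << 31: no signed overflow */
    }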
 
index a61ee71dba714f16ffaa1d6152bcfab11ec06f81..c90a8ea6dc2b4f43b4d7b1eb413fd8b3cea223f7 100644 (file)
@@ -85,6 +85,7 @@
 #include <kern/processor.h>
 #include <kern/queue.h>
 #include <kern/sched.h>
+#include <kern/startup.h>
 #include <kern/task.h>
 #include <kern/thread.h>
 
@@ -111,8 +112,14 @@ extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t);
 struct machine_info     machine_info;
 
 /* Forwards */
-void                    processor_doshutdown(
-       processor_t                     processor);
+static void
+processor_doshutdown(processor_t processor);
+
+static void
+processor_offline(void * parameter, __unused wait_result_t result);
+
+static void
+processor_offline_intstack(processor_t processor) __dead2;
 
 /*
  *     processor_up:
@@ -126,19 +133,32 @@ processor_up(
 {
        processor_set_t         pset;
        spl_t                           s;
+       boolean_t pset_online = false;
 
        s = splsched();
        init_ast_check(processor);
        pset = processor->processor_set;
        pset_lock(pset);
+       if (pset->online_processor_count == 0) {
+               /* About to bring the first processor of a pset online */
+               pset_online = true;
+       }
        ++pset->online_processor_count;
        pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
-       (void)hw_atomic_add(&processor_avail_count, 1);
+       os_atomic_inc(&processor_avail_count, relaxed);
        if (processor->is_recommended) {
-               (void)hw_atomic_add(&processor_avail_count_user, 1);
+               os_atomic_inc(&processor_avail_count_user, relaxed);
        }
        commpage_update_active_cpus();
-       pset_unlock(pset);
+       if (pset_online) {
+               /* New pset is coming up online; callout to the
+                * scheduler in case it wants to adjust runqs.
+                */
+               SCHED(pset_made_schedulable)(processor, pset, true);
+               /* pset lock dropped */
+       } else {
+               pset_unlock(pset);
+       }
        ml_cpu_up();
        splx(s);
 
@@ -252,20 +272,22 @@ processor_shutdown(
 /*
  * Called with interrupts disabled.
  */
-void
+static void
 processor_doshutdown(
-       processor_t                     processor)
+       processor_t processor)
 {
-       thread_t                        old_thread, self = current_thread();
-       processor_t                     prev;
-       processor_set_t                 pset;
+       thread_t self = current_thread();
 
        /*
         *      Get onto the processor to shutdown
         */
-       prev = thread_bind(processor);
+       processor_t prev = thread_bind(processor);
        thread_block(THREAD_CONTINUE_NULL);
 
+       /* interrupts still disabled */
+       assert(ml_get_interrupts_enabled() == FALSE);
+
+       assert(processor == current_processor());
        assert(processor->state == PROCESSOR_SHUTDOWN);
 
 #if CONFIG_DTRACE
@@ -283,88 +305,127 @@ processor_doshutdown(
        }
 #endif
 
-       pset = processor->processor_set;
+       processor_set_t pset = processor->processor_set;
+
        pset_lock(pset);
        pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
        --pset->online_processor_count;
-       (void)hw_atomic_sub(&processor_avail_count, 1);
+       os_atomic_dec(&processor_avail_count, relaxed);
        if (processor->is_recommended) {
-               (void)hw_atomic_sub(&processor_avail_count_user, 1);
+               os_atomic_dec(&processor_avail_count_user, relaxed);
        }
        commpage_update_active_cpus();
        SCHED(processor_queue_shutdown)(processor);
        /* pset lock dropped */
        SCHED(rt_queue_shutdown)(processor);
 
+       thread_bind(prev);
+
+       /* interrupts still disabled */
+
        /*
-        * Continue processor shutdown in shutdown context.
-        *
-        * We save the current context in machine_processor_shutdown in such a way
-        * that when this thread is next invoked it will return from here instead of
-        * from the machine_switch_context() in thread_invoke like a normal context switch.
-        *
-        * As such, 'old_thread' is neither the idle thread nor the current thread - it's whatever
-        * thread invoked back to this one. (Usually, it's another processor's idle thread.)
-        *
-        * TODO: Make this a real thread_run of the idle_thread, so we don't have to keep this in sync
-        * with thread_invoke.
+        * Continue processor shutdown on the processor's idle thread.
+        * The handoff won't fail because the idle thread has a reserved stack.
+        * Switching to the idle thread leaves interrupts disabled,
+        * so we can't accidentally take an interrupt after the context switch.
         */
-       thread_bind(prev);
-       old_thread = machine_processor_shutdown(self, processor_offline, processor);
+       thread_t shutdown_thread = processor->idle_thread;
+       shutdown_thread->continuation = processor_offline;
+       shutdown_thread->parameter = processor;
 
-       thread_dispatch(old_thread, self);
+       thread_run(self, NULL, NULL, shutdown_thread);
 }
 
 /*
- * Complete the shutdown and place the processor offline.
- *
- * Called at splsched in the shutdown context.
- * This performs a minimal thread_invoke() to the idle thread,
- * so it needs to be kept in sync with what thread_invoke() does.
+ * Called in the context of the idle thread to shut down the processor
  *
- * The onlining half of this is done in load_context().
+ * A shut-down processor looks like it's 'running' the idle thread parked
+ * in this routine, but it's actually been powered off and has no hardware state.
  */
-void
+static void
 processor_offline(
-       processor_t                     processor)
+       void * parameter,
+       __unused wait_result_t result)
 {
+       processor_t processor = (processor_t) parameter;
+       thread_t self = current_thread();
+       __assert_only thread_t old_thread = THREAD_NULL;
+
        assert(processor == current_processor());
-       assert(processor->active_thread == current_thread());
+       assert(self->state & TH_IDLE);
+       assert(processor->idle_thread == self);
+       assert(ml_get_interrupts_enabled() == FALSE);
+       assert(self->continuation == NULL);
+       assert(processor->processor_offlined == false);
 
-       thread_t old_thread = processor->active_thread;
-       thread_t new_thread = processor->idle_thread;
+       bool enforce_quiesce_safety = gEnforceQuiesceSafety;
 
-       if (!new_thread->kernel_stack) {
-               /* the idle thread has a reserved stack, so this will never fail */
-               if (!stack_alloc_try(new_thread)) {
-                       panic("processor_offline");
-               }
+       /*
+        * Scheduling is now disabled for this processor.
+        * Ensure that primitives that need scheduling (like mutexes) know this.
+        */
+       if (enforce_quiesce_safety) {
+               disable_preemption();
        }
 
-       processor->active_thread = new_thread;
-       processor_state_update_idle(processor);
-       processor->starting_pri = IDLEPRI;
-       processor->deadline = UINT64_MAX;
-       new_thread->last_processor = processor;
+       /* convince slave_main to come back here */
+       processor->processor_offlined = true;
+
+       /*
+        * Switch to the interrupt stack and shut down the processor.
+        *
+        * When the processor comes back, it will eventually call load_context which
+        * restores the context saved by machine_processor_shutdown, returning here.
+        */
+       old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor);
+
+       /* old_thread should be NULL because we got here through Load_context */
+       assert(old_thread == THREAD_NULL);
+
+       assert(processor == current_processor());
+       assert(processor->idle_thread == current_thread());
 
-       uint64_t ctime = mach_absolute_time();
+       assert(ml_get_interrupts_enabled() == FALSE);
+       assert(self->continuation == NULL);
 
-       processor->last_dispatch = ctime;
-       old_thread->last_run_time = ctime;
+       /* Extract the machine_param value stashed by slave_main */
+       void * machine_param = self->parameter;
+       self->parameter = NULL;
 
-       /* Update processor->thread_timer and ->kernel_timer to point to the new thread */
-       processor_timer_switch_thread(ctime, &new_thread->system_timer);
-       PROCESSOR_DATA(processor, kernel_timer) = &new_thread->system_timer;
-       timer_stop(PROCESSOR_DATA(processor, current_state), ctime);
+       /* Re-initialize the processor */
+       slave_machine_init(machine_param);
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-           MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
-           old_thread->reason, (uintptr_t)thread_tid(new_thread),
-           old_thread->sched_pri, new_thread->sched_pri, 0);
+       assert(processor->processor_offlined == true);
+       processor->processor_offlined = false;
 
-       machine_set_current_thread(new_thread);
+       if (enforce_quiesce_safety) {
+               enable_preemption();
+       }
+
+       /*
+        * Now that the processor is back, invoke the idle thread to find out what to do next.
+        * idle_thread will enable interrupts.
+        */
+       thread_block(idle_thread);
+       /*NOTREACHED*/
+}
+
+/*
+ * Complete the shutdown and place the processor offline.
+ *
+ * Called at splsched in the shutdown context
+ * (i.e. on the idle thread, on the interrupt stack)
+ *
+ * The onlining half of this is done in load_context().
+ */
+static void
+processor_offline_intstack(
+       processor_t processor)
+{
+       assert(processor == current_processor());
+       assert(processor->active_thread == current_thread());
 
-       thread_dispatch(old_thread, new_thread);
+       timer_stop(PROCESSOR_DATA(processor, current_state), processor->last_dispatch);
 
        cpu_quiescent_counter_leave(processor->last_dispatch);
 
index 9dbb6eb2b9d8deae1f60d0e285bdd2922630d145..2c285e5c43281c71b18e0a08e37e47db5120fa61 100644 (file)
  * Machine support declarations.
  */
 
-extern void             processor_up(
-       processor_t             processor);
+extern void processor_up(
+       processor_t processor);
 
-extern void             processor_offline(
-       processor_t             processor);
-
-extern void             processor_start_thread(void *machine_param);
+extern void processor_start_thread(void *machine_param,
+    wait_result_t result);
 
 /*
  * Must be implemented in machine dependent code.
index 37b89450d348cf0f85e4264bd4d6e435f5d4e583..9f51a4b6fad3399b5e651742d7438ea80b415c0f 100644 (file)
@@ -64,3 +64,17 @@ memset_s(void *s, size_t smax, int c, size_t n)
 
        return err;
 }
+
+int
+timingsafe_bcmp(const void *b1, const void *b2, size_t n)
+{
+       const unsigned char *p1 = b1, *p2 = b2;
+       unsigned char ret = 0;
+
+       for (; n > 0; n--) {
+               ret |= *p1++ ^ *p2++;
+       }
+
+       /* map zero to zero and nonzero to one */
+       return (ret + 0xff) >> 8;
+}
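The closing expression maps the accumulated XOR to a bcmp-style result without a data-dependent branch: when ret is 0, ret + 0xff is 0xff and the shift yields 0; any nonzero ret (1..255) pushes the sum to at least 0x100, so the shift yields 1. A hedged usage sketch (the function and tag size are illustrative):

    static int
    verify_tag(const uint8_t expected[16], const uint8_t computed[16])
    {
            /* every byte is examined regardless of where the first mismatch
             * occurs, so the comparison time does not leak its position */
            return timingsafe_bcmp(expected, computed, 16) == 0 ? 0 : EAUTH;
    }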
index c1dee4267321a63a45a23f2c5c0f30ea98df17d0..a2585e4731d5c7617291dfa0e832eedb636e22c8 100644 (file)
@@ -86,17 +86,44 @@ extern int testbit(
        int             which,
        int             *bitmap);
 
-/* Move an aligned 32 or 64-bit word from user space to kernel space
+/*
+ * Move an aligned 32 or 64-bit word from user space to kernel space
  * using a single read instruction
+ */
+extern int copyin_atomic32(
+       const user_addr_t   user_addr,
+       uint32_t            *kernel_addr);
+
+extern int copyin_atomic64(
+       const user_addr_t   user_addr,
+       uint64_t            *kernel_addr);
+
+/*
+ * Does an atomic copyin at the specified user address, compares
+ * it to the passed-in value, and if they match, waits.
  *
- * when reading a 32-bit word, the value is 0-extended into the kernel space
- * 64-bit buffer passed as `kernel_addr`
- * (think `*kernel_addr = *(uint32_t *)user_addr`)
+ * This is used to implement adaptive spinning for userspace synchronization.
+ *
+ * Returns:
+ * 0:       the value matched, and it paused efficiently for the platform
+ * ESTALE:  the value didn't match, and it returned immediately
+ * other:   the copyin failed (EFAULT, EINVAL, ...)
  */
-extern int copyin_word(
+extern int copyin_atomic32_wait_if_equals(
        const user_addr_t   user_addr,
-       uint64_t            *kernel_addr,
-       vm_size_t           nbytes);
+       uint32_t            value);
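
A hedged sketch of how the three return classes drive an adaptive spin; the
wrapper name and the spin bound below are illustrative, not part of this
interface:

    static int
    example_adaptive_spin(user_addr_t uaddr, uint32_t spin_val)
    {
            for (int i = 0; i < 10; i++) {  /* arbitrary spin budget */
                    int err = copyin_atomic32_wait_if_equals(uaddr, spin_val);
                    if (err == ESTALE) {
                            return 0;       /* value changed: stop spinning */
                    }
                    if (err != 0) {
                            return err;     /* the copyin itself failed */
                    }
                    /* err == 0: still equal, and we paused efficiently */
            }
            return 0;   /* budget exhausted; caller should block for real */
    }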
+
+/*
+ * Move a 32 or 64-bit word from kernel space to user space
+ * using a single write instruction
+ */
+extern int copyout_atomic32(
+       uint32_t            u32,
+       user_addr_t         user_addr);
+
+extern int copyout_atomic64(
+       uint64_t            u64,
+       user_addr_t         user_addr);
 
 /* Move a NUL-terminated string from a user space to kernel space */
 extern int copyinstr(
@@ -121,9 +148,6 @@ extern int copyoutmsg(
 extern void inval_copy_windows(thread_t);
 extern void copy_window_fault(thread_t, vm_map_t, int);
 
-extern int copyin_validate(const user_addr_t, uintptr_t, vm_size_t);
-extern int copyout_validate(uintptr_t, const user_addr_t, vm_size_t);
-
 extern int sscanf(const char *input, const char *fmt, ...) __scanflike(2, 3);
 
 /* sprintf() is being deprecated. Please use snprintf() instead. */
index 883a1e31b745c139ffe62a72140019b09e0ae52b..8de9c9012cf57260872fd372b9c30fd0bc629829 100644 (file)
@@ -49,7 +49,9 @@
 static zone_t           mk_timer_zone;
 
 static mach_port_qos_t mk_timer_qos = {
-       FALSE, TRUE, 0, sizeof(mk_timer_expire_msg_t)
+       .name       = FALSE,
+       .prealloc   = TRUE,
+       .len        = sizeof(mk_timer_expire_msg_t),
 };
 
 static void     mk_timer_expire(
@@ -71,7 +73,7 @@ mk_timer_create_trap(
                return MACH_PORT_NULL;
        }
 
-       result = mach_port_allocate_qos(myspace, MACH_PORT_RIGHT_RECEIVE,
+       result = mach_port_allocate_internal(myspace, MACH_PORT_RIGHT_RECEIVE,
            &mk_timer_qos, &name);
        if (result == KERN_SUCCESS) {
                result = ipc_port_translate_receive(myspace, name, &port);
index e8fcde1642d46d853ab69b241432d7ad89b746c1..9b744407bc41f1b03dc55a05891bf128d4c73002 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2017-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -28,6 +28,8 @@
 #ifndef KERN_MONOTONIC_H
 #define KERN_MONOTONIC_H
 
+#if MONOTONIC
+
 #include <stdbool.h>
 #include <stdint.h>
 #include <sys/cdefs.h>
@@ -156,4 +158,6 @@ __END_DECLS
 
 #endif /* MACH_KERNEL_PRIVATE */
 
+#endif /* MONOTONIC */
+
 #endif /* !defined(KERN_MONOTONIC_H) */
diff --git a/osfmk/kern/mpsc_queue.c b/osfmk/kern/mpsc_queue.c
new file mode 100644 (file)
index 0000000..4784b0d
--- /dev/null
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <machine/machine_cpu.h>
+#include <kern/locks.h>
+#include <kern/mpsc_queue.h>
+#include <kern/thread.h>
+
+#pragma mark Single Consumer calls
+
+__attribute__((noinline))
+static mpsc_queue_chain_t
+_mpsc_queue_wait_for_enqueuer(struct mpsc_queue_chain *_Atomic *ptr)
+{
+       return hw_wait_while_equals((void **)ptr, NULL);
+}
+
+void
+mpsc_queue_restore_batch(mpsc_queue_head_t q, mpsc_queue_chain_t first,
+    mpsc_queue_chain_t last)
+{
+       mpsc_queue_chain_t head = os_atomic_load(&q->mpqh_head.mpqc_next, relaxed);
+
+       os_atomic_store(&last->mpqc_next, head, relaxed);
+
+       if (head == NULL &&
+           !os_atomic_cmpxchg(&q->mpqh_tail, &q->mpqh_head, last, release)) {
+               head = os_atomic_load(&q->mpqh_head.mpqc_next, relaxed);
+               if (__improbable(head == NULL)) {
+                       head = _mpsc_queue_wait_for_enqueuer(&q->mpqh_head.mpqc_next);
+               }
+               os_atomic_store(&last->mpqc_next, head, relaxed);
+       }
+
+       os_atomic_store(&q->mpqh_head.mpqc_next, first, relaxed);
+}
+
+mpsc_queue_chain_t
+mpsc_queue_dequeue_batch(mpsc_queue_head_t q, mpsc_queue_chain_t *tail_out,
+    os_atomic_dependency_t dependency)
+{
+       mpsc_queue_chain_t head, tail;
+
+       q = os_atomic_inject_dependency(q, dependency);
+
+       tail = os_atomic_load(&q->mpqh_tail, relaxed);
+       if (__improbable(tail == &q->mpqh_head)) {
+               *tail_out = NULL;
+               return NULL;
+       }
+
+       head = os_atomic_load(&q->mpqh_head.mpqc_next, relaxed);
+       if (__improbable(head == NULL)) {
+               head = _mpsc_queue_wait_for_enqueuer(&q->mpqh_head.mpqc_next);
+       }
+       os_atomic_store(&q->mpqh_head.mpqc_next, NULL, relaxed);
+       /*
+        * 22708742: set tail to &q->mpqh_head with release, so that NULL write
+        * to head above doesn't clobber the head set by concurrent enqueuer
+        *
+        * The other half of the seq_cst is required to pair with any enqueuer that
+        * contributed to an element in this list (pairs with the release fence in
+        * __mpsc_queue_append_update_tail()).
+        *
+        * Making this seq_cst instead of acq_rel makes mpsc_queue_append*()
+        * visibility transitive (when items hop from one queue to the next)
+        * which is expected by clients implicitly.
+        *
+        * Note that this is the same number of fences that a traditional lock
+        * would have, but as a once-per-batch cost.
+        */
+       *tail_out = os_atomic_xchg(&q->mpqh_tail, &q->mpqh_head, seq_cst);
+
+       return head;
+}
+
+mpsc_queue_chain_t
+mpsc_queue_batch_next(mpsc_queue_chain_t cur, mpsc_queue_chain_t tail)
+{
+       mpsc_queue_chain_t elm = NULL;
+       if (cur == tail || cur == NULL) {
+               return elm;
+       }
+
+       elm = os_atomic_load(&cur->mpqc_next, relaxed);
+       if (__improbable(elm == NULL)) {
+               elm = _mpsc_queue_wait_for_enqueuer(&cur->mpqc_next);
+       }
+       return elm;
+}
+
+#pragma mark "GCD"-like facilities
+
+static void _mpsc_daemon_queue_drain(mpsc_daemon_queue_t, thread_t);
+static void _mpsc_daemon_queue_enqueue(mpsc_daemon_queue_t, mpsc_queue_chain_t);
+
+/* thread based queues */
+
+static void
+_mpsc_queue_thread_continue(void *param, wait_result_t wr __unused)
+{
+       mpsc_daemon_queue_t dq = param;
+
+       assert(dq->mpd_thread == current_thread());
+       _mpsc_daemon_queue_drain(dq, dq->mpd_thread);
+       thread_block_parameter(_mpsc_queue_thread_continue, dq);
+}
+
+static void
+_mpsc_queue_thread_wakeup(mpsc_daemon_queue_t dq)
+{
+       thread_wakeup_thread((event_t)dq, dq->mpd_thread);
+}
+
+static kern_return_t
+_mpsc_daemon_queue_init_with_thread(mpsc_daemon_queue_t dq,
+    mpsc_daemon_invoke_fn_t invoke, int pri, const char *name,
+    mpsc_daemon_queue_kind_t kind)
+{
+       kern_return_t kr;
+
+       *dq = (struct mpsc_daemon_queue){
+               .mpd_kind   = kind,
+               .mpd_invoke = invoke,
+               .mpd_queue  = MPSC_QUEUE_INITIALIZER(dq->mpd_queue),
+               .mpd_chain  = { MPSC_QUEUE_NOTQUEUED_MARKER },
+       };
+
+       kr = kernel_thread_create(_mpsc_queue_thread_continue, dq, pri,
+           &dq->mpd_thread);
+       if (kr == KERN_SUCCESS) {
+               thread_set_thread_name(dq->mpd_thread, name);
+               thread_start_in_assert_wait(dq->mpd_thread, (event_t)dq, THREAD_UNINT);
+               thread_deallocate(dq->mpd_thread);
+       }
+       return kr;
+}
+
+kern_return_t
+mpsc_daemon_queue_init_with_thread(mpsc_daemon_queue_t dq,
+    mpsc_daemon_invoke_fn_t invoke, int pri, const char *name)
+{
+       return _mpsc_daemon_queue_init_with_thread(dq, invoke, pri, name,
+                  MPSC_QUEUE_KIND_THREAD);
+}
+
+/* thread-call based queues */
+
+static void
+_mpsc_queue_thread_call_drain(thread_call_param_t arg0,
+    thread_call_param_t arg1 __unused)
+{
+       _mpsc_daemon_queue_drain((mpsc_daemon_queue_t)arg0, NULL);
+}
+
+static void
+_mpsc_queue_thread_call_wakeup(mpsc_daemon_queue_t dq)
+{
+       thread_call_enter(dq->mpd_call);
+}
+
+void
+mpsc_daemon_queue_init_with_thread_call(mpsc_daemon_queue_t dq,
+    mpsc_daemon_invoke_fn_t invoke, thread_call_priority_t pri)
+{
+       *dq = (struct mpsc_daemon_queue){
+               .mpd_kind   = MPSC_QUEUE_KIND_THREAD_CALL,
+               .mpd_invoke = invoke,
+               .mpd_queue  = MPSC_QUEUE_INITIALIZER(dq->mpd_queue),
+               .mpd_chain  = { MPSC_QUEUE_NOTQUEUED_MARKER },
+       };
+       dq->mpd_call = thread_call_allocate_with_options(
+               _mpsc_queue_thread_call_drain, dq, pri, THREAD_CALL_OPTIONS_ONCE);
+}
+
+/* nested queues */
+
+void
+mpsc_daemon_queue_nested_invoke(mpsc_queue_chain_t elm,
+    __unused mpsc_daemon_queue_t tq)
+{
+       mpsc_daemon_queue_t dq;
+       dq = mpsc_queue_element(elm, struct mpsc_daemon_queue, mpd_chain);
+       _mpsc_daemon_queue_drain(dq, NULL);
+}
+
+static void
+_mpsc_daemon_queue_nested_wakeup(mpsc_daemon_queue_t dq)
+{
+       _mpsc_daemon_queue_enqueue(dq->mpd_target, &dq->mpd_chain);
+}
+
+void
+mpsc_daemon_queue_init_with_target(mpsc_daemon_queue_t dq,
+    mpsc_daemon_invoke_fn_t invoke, mpsc_daemon_queue_t target)
+{
+       *dq = (struct mpsc_daemon_queue){
+               .mpd_kind   = MPSC_QUEUE_KIND_NESTED,
+               .mpd_invoke = invoke,
+               .mpd_target = target,
+               .mpd_queue  = MPSC_QUEUE_INITIALIZER(dq->mpd_queue),
+               .mpd_chain  = { MPSC_QUEUE_NOTQUEUED_MARKER },
+       };
+}
+
+/* enqueue, drain & cancelation */
+
+static void
+_mpsc_daemon_queue_drain(mpsc_daemon_queue_t dq, thread_t self)
+{
+       mpsc_daemon_invoke_fn_t invoke = dq->mpd_invoke;
+       mpsc_daemon_queue_kind_t kind = dq->mpd_kind;
+       mpsc_queue_chain_t head, cur, tail;
+       mpsc_daemon_queue_state_t st;
+
+       if (kind == MPSC_QUEUE_KIND_THREAD_CRITICAL) {
+               self->options |= TH_OPT_SYSTEM_CRITICAL;
+       }
+
+again:
+       /*
+        * Most of the time we're woken up because we're dirty.
+        * In that case, this atomic xor sets DRAINING and clears WAKEUP in a
+        * single atomic operation.
+        *
+        * However, if we're woken up for cancelation, the state may be reduced to
+        * the CANCELED bit set only, and then the xor will actually set WAKEUP.
+        * We need to correct this and clear it back to avoid looping below.
+        * This is safe to do as no one is allowed to enqueue more work after
+        * cancelation has happened.
+        *
+        * We use `st` as a dependency token to pair with the release fence in
+        * _mpsc_daemon_queue_enqueue() which gives us the guarantee that the update
+        * to the tail of the MPSC queue that made it non empty is visible to us.
+        */
+       st = os_atomic_xor(&dq->mpd_state,
+           MPSC_QUEUE_STATE_DRAINING | MPSC_QUEUE_STATE_WAKEUP, dependency);
+       assert(st & MPSC_QUEUE_STATE_DRAINING);
+       if (__improbable(st & MPSC_QUEUE_STATE_WAKEUP)) {
+               assert(st & MPSC_QUEUE_STATE_CANCELED);
+               os_atomic_andnot(&dq->mpd_state, MPSC_QUEUE_STATE_WAKEUP, relaxed);
+       }
+
+       os_atomic_dependency_t dep = os_atomic_make_dependency((uintptr_t)st);
+       while ((head = mpsc_queue_dequeue_batch(&dq->mpd_queue, &tail, dep))) {
+               mpsc_queue_batch_foreach_safe(cur, head, tail) {
+                       os_atomic_store(&cur->mpqc_next,
+                           MPSC_QUEUE_NOTQUEUED_MARKER, relaxed);
+                       invoke(cur, dq);
+               }
+       }
+
+       if (self) {
+               assert_wait((event_t)dq, THREAD_UNINT);
+       }
+
+       /*
+        * Unlike GCD no fence is necessary here: there is no concept similar
+        * to "dispatch_sync()" that would require changes this thread made to be
+        * visible to other threads as part of the mpsc_daemon_queue machinery.
+        *
+        * Making updates that happened on the daemon queue visible to other threads
+        * is the responsibility of the client.
+        */
+       st = os_atomic_andnot(&dq->mpd_state, MPSC_QUEUE_STATE_DRAINING, relaxed);
+
+       /*
+        * A wakeup has happened while we were draining,
+        * which means that the queue did an [ empty -> non empty ]
+        * transition during our drain.
+        *
+        * Chances are we already observed and drained everything,
+        * but we need to be absolutely sure, so start a drain again
+        * as the enqueuer observed the DRAINING bit and has skipped calling
+        * _mpsc_daemon_queue_wakeup().
+        */
+       if (__improbable(st & MPSC_QUEUE_STATE_WAKEUP)) {
+               if (self) {
+                       clear_wait(self, THREAD_AWAKENED);
+               }
+               goto again;
+       }
+
+       /* dereferencing `dq` past this point is unsafe */
+
+       if (kind == MPSC_QUEUE_KIND_THREAD_CRITICAL) {
+               self->options &= ~TH_OPT_SYSTEM_CRITICAL;
+       }
+
+       if (__improbable(st & MPSC_QUEUE_STATE_CANCELED)) {
+               thread_wakeup(&dq->mpd_state);
+               if (self) {
+                       clear_wait(self, THREAD_AWAKENED);
+                       thread_terminate_self();
+                       __builtin_unreachable();
+               }
+       }
+}
+
+static void
+_mpsc_daemon_queue_wakeup(mpsc_daemon_queue_t dq)
+{
+       switch (dq->mpd_kind) {
+       case MPSC_QUEUE_KIND_NESTED:
+               _mpsc_daemon_queue_nested_wakeup(dq);
+               break;
+       case MPSC_QUEUE_KIND_THREAD:
+       case MPSC_QUEUE_KIND_THREAD_CRITICAL:
+               _mpsc_queue_thread_wakeup(dq);
+               break;
+       case MPSC_QUEUE_KIND_THREAD_CALL:
+               _mpsc_queue_thread_call_wakeup(dq);
+               break;
+       default:
+               panic("mpsc_queue[%p]: invalid kind (%d)", dq, dq->mpd_kind);
+       }
+}
+
+static void
+_mpsc_daemon_queue_enqueue(mpsc_daemon_queue_t dq, mpsc_queue_chain_t elm)
+{
+       mpsc_daemon_queue_state_t st;
+
+       if (mpsc_queue_append(&dq->mpd_queue, elm)) {
+               /*
+                * Pairs with the acquire fence in _mpsc_daemon_queue_drain().
+                */
+               st = os_atomic_or_orig(&dq->mpd_state, MPSC_QUEUE_STATE_WAKEUP, release);
+               if (__improbable(st & MPSC_QUEUE_STATE_CANCELED)) {
+                       panic("mpsc_queue[%p]: use after cancelation", dq);
+               }
+
+               if ((st & (MPSC_QUEUE_STATE_DRAINING | MPSC_QUEUE_STATE_WAKEUP)) == 0) {
+                       _mpsc_daemon_queue_wakeup(dq);
+               }
+       }
+}
+
+void
+mpsc_daemon_enqueue(mpsc_daemon_queue_t dq, mpsc_queue_chain_t elm,
+    mpsc_queue_options_t options)
+{
+       if (options & MPSC_QUEUE_DISABLE_PREEMPTION) {
+               disable_preemption();
+       }
+
+       _mpsc_daemon_queue_enqueue(dq, elm);
+
+       if (options & MPSC_QUEUE_DISABLE_PREEMPTION) {
+               enable_preemption();
+       }
+}
+
+void
+mpsc_daemon_queue_cancel_and_wait(mpsc_daemon_queue_t dq)
+{
+       mpsc_daemon_queue_state_t st;
+
+       assert_wait((event_t)&dq->mpd_state, THREAD_UNINT);
+
+       st = os_atomic_or_orig(&dq->mpd_state, MPSC_QUEUE_STATE_CANCELED, relaxed);
+       if (__improbable(st & MPSC_QUEUE_STATE_CANCELED)) {
+               panic("mpsc_queue[%p]: cancelled twice (%x)", dq, st);
+       }
+
+       if (dq->mpd_kind == MPSC_QUEUE_KIND_NESTED && st == 0) {
+               clear_wait(current_thread(), THREAD_AWAKENED);
+       } else {
+               disable_preemption();
+               _mpsc_daemon_queue_wakeup(dq);
+               enable_preemption();
+               thread_block(THREAD_CONTINUE_NULL);
+       }
+
+       switch (dq->mpd_kind) {
+       case MPSC_QUEUE_KIND_NESTED:
+               dq->mpd_target = NULL;
+               break;
+       case MPSC_QUEUE_KIND_THREAD:
+       case MPSC_QUEUE_KIND_THREAD_CRITICAL:
+               dq->mpd_thread = NULL;
+               break;
+       case MPSC_QUEUE_KIND_THREAD_CALL:
+               thread_call_cancel_wait(dq->mpd_call);
+               thread_call_free(dq->mpd_call);
+               dq->mpd_call = NULL;
+               break;
+       default:
+               panic("mpsc_queue[%p]: invalid kind (%d)", dq, dq->mpd_kind);
+       }
+       dq->mpd_kind = MPSC_QUEUE_KIND_UNKNOWN;
+}
+
+#pragma mark deferred deallocation daemon
+
+static struct mpsc_daemon_queue thread_deferred_deallocation_queue;
+
+void
+thread_deallocate_daemon_init(void)
+{
+       kern_return_t kr;
+
+       kr = _mpsc_daemon_queue_init_with_thread(&thread_deferred_deallocation_queue,
+           mpsc_daemon_queue_nested_invoke, MINPRI_KERNEL,
+           "daemon.deferred-deallocation", MPSC_QUEUE_KIND_THREAD_CRITICAL);
+       if (kr != KERN_SUCCESS) {
+               panic("thread_deallocate_daemon_init: creating daemon failed (%d)", kr);
+       }
+}
+
+void
+thread_deallocate_daemon_register_queue(mpsc_daemon_queue_t dq,
+    mpsc_daemon_invoke_fn_t invoke)
+{
+       mpsc_daemon_queue_init_with_target(dq, invoke,
+           &thread_deferred_deallocation_queue);
+}
diff --git a/osfmk/kern/mpsc_queue.h b/osfmk/kern/mpsc_queue.h
new file mode 100644 (file)
index 0000000..a2a6218
--- /dev/null
@@ -0,0 +1,671 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_MPSC_QUEUE_H_
+#define _KERN_MPSC_QUEUE_H_
+
+#ifdef XNU_KERNEL_PRIVATE
+
+#include <machine/atomic.h>
+#include <kern/macro_help.h>
+#include <kern/thread_call.h>
+
+#endif // XNU_KERNEL_PRIVATE
+
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+
+/*!
+ * @typedef struct mpsc_queue_chain
+ *
+ * @brief
+ * Type for the intrusive linkage used by MPSC queues.
+ */
+typedef struct mpsc_queue_chain {
+       struct mpsc_queue_chain *_Atomic mpqc_next;
+} *mpsc_queue_chain_t;
+
+/*!
+ * @typedef struct mpsc_queue_head
+ *
+ * @brief
+ * The type for a multi-producer single-consumer queue.
+ *
+ * @discussion
+ * MPSC queues allow producers to be unaffected by other producers or by the
+ * consumer, which in turn means that having producers in interrupt context
+ * does not require that other producers disable interrupts, as a traditional
+ * spinlock-based approach would.
+ *
+ * These queues shine when data is produced from the entire system and is
+ * consumed from a single serial context (logging, tracing, ...).
+ * mpsc_daemon_queue_t is provided as a fully ready/easy-to-use pre-packaged
+ * solution for these common use cases.
+ *
+ * - mpsc_queue_append() can be used to append a single item
+ * - mpsc_queue_append_list() can be used to append a batch of items at once.
+ *
+ * Functions for the consumer side assume proper serialization that is not
+ * provided by the MPSC queue itself. Dequeuing doesn't require preemption
+ * to be disabled.
+ *
+ * <h2>Algorithm</h2>
+ *
+ * The base of the enqueue algorithm is a single atomic exchange (first half,
+ * called __mpsc_queue_append_update_tail) and a list fixup (2nd half, called
+ * __mpsc_queue_append_update_prev).
+ *
+ * Graphically, enqueuing `X` looks like this, with each step being done
+ * atomically (for the empty queue case, `tail` points to `head`):
+ *
+ *     | orig state          | update_tail         | update_prev         |
+ *     +---------------------+---------------------+---------------------+
+ *     |                     |                     |                     |
+ *     | head -> e1 -> e2 -. | head -> e1 -> e2 -. | head -> e1 -> e2 -. |
+ *     |                   | |                   | |                   | |
+ *     |         ,- ... <--' |         ,- ... <--' |         ,- ... <--' |
+ *     |         |           |         |           |         |           |
+ *     |         v           |         v           |         v           |
+ *     | tail -> eN -> NULL  | tail    eN -> NULL  | tail    eN          |
+ *     |                     |   |                 |   |     |           |
+ *     |                     |   |                 |   |     v           |
+ *     |         X -> NULL   |   `---> X -> NULL   |   '---> X -> NULL   |
+ *     |                     |                     |                     |
+ *     +---------------------+---------------------+---------------------+
+ *
+ *
+ * There is a small 1-instruction gap of inconsistency which makes the chosen
+ * algorithm non-linearizable, and requires enqueuers to disable preemption
+ * during the enqueue so as not to starve the consumer forever.
+ *
+ * As far as memory visibility is concerned, enqueuing uses a release fence in
+ * update_tail which pairs with memory fences in mpsc_queue_dequeue_batch().
+ *
+ * Note: as far as the in-memory layout goes, this structure is equivalent
+ *       to a BSD <sys/queue.h> STAILQ. However, because of the inconsistency
+ *       window and the memory ordering concerns, it is incorrect to use
+ *       STAILQ macros on an MPSC queue.
+ */
+typedef struct mpsc_queue_head {
+       struct mpsc_queue_chain mpqh_head;
+       struct mpsc_queue_chain *_Atomic mpqh_tail;
+} *mpsc_queue_head_t;
+
+/*!
+ * @macro MPSC_QUEUE_INITIALIZER
+ *
+ * @brief
+ * Macro to use in static initializers for mpsc queues.
+ *
+ * @param head
+ * The name of the variable to initialize.
+ */
+#define MPSC_QUEUE_INITIALIZER(head)   { .mpqh_tail = &(head).mpqh_head }
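
For example, a hypothetical statically allocated queue (mirroring how
mpd_queue is initialized in mpsc_queue.c) would be declared as:

    static struct mpsc_queue_head example_queue =
        MPSC_QUEUE_INITIALIZER(example_queue);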
+
+#ifdef XNU_KERNEL_PRIVATE
+
+/*!
+ * @function mpsc_queue_init
+ *
+ * @brief
+ * Dynamically initialize an mpsc queue.
+ *
+ * @discussion
+ * This initialization assumes that the object holding the queue head
+ * is initialized before it can be made visible to other threads/cores.
+ *
+ * @param q
+ * The queue to initialize.
+ */
+static inline void
+mpsc_queue_init(mpsc_queue_head_t q)
+{
+       os_atomic_init(&q->mpqh_head.mpqc_next, NULL);
+       os_atomic_init(&q->mpqh_tail, &q->mpqh_head);
+}
+
+/*!
+ * @typedef enum mpsc_queue_options
+ */
+typedef enum mpsc_queue_options {
+       MPSC_QUEUE_NONE                = 0,
+       MPSC_QUEUE_DISABLE_PREEMPTION  = 1 << 0,
+} mpsc_queue_options_t;
+
+/*!
+ * @const MPSC_QUEUE_NOTQUEUED_MARKER
+ *
+ * @brief
+ * Magical marker that implementations can use to poison the chain pointer of
+ * elements not on any MPSC queue.
+ */
+#define MPSC_QUEUE_NOTQUEUED_MARKER ((mpsc_queue_chain_t)~0ul)
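
A hedged sketch of the poisoning pattern, with an illustrative element type:
initialize the linkage to the marker, and an "is this element queued?" check
becomes a single relaxed load:

    struct example_item {
            struct mpsc_queue_chain ei_chain;
            /* ... payload ... */
    };

    static inline bool
    example_item_is_queued(struct example_item *it)
    {
            return os_atomic_load(&it->ei_chain.mpqc_next, relaxed) !=
                   MPSC_QUEUE_NOTQUEUED_MARKER;
    }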
+
+/*!
+ * @macro mpsc_queue_element
+ *
+ * @brief
+ * Macro to find the pointer of an element back from its MPSC chain linkage.
+ */
+#define mpsc_queue_element(ptr, type, field) __container_of(ptr, type, field)
+
+
+#pragma mark Advanced Multi Producer calls
+
+/**
+ * @function __mpsc_queue_append_update_tail
+ *
+ * @brief
+ * First half of the enqueue operation onto a multi-producer single-consumer
+ * queue.
+ *
+ * @discussion
+ * This function is available for algorithms that need to do things (such as
+ * taking a refcount) before calling __mpsc_queue_append_update_prev().
+ *
+ * Preemption should be disabled before calling
+ * __mpsc_queue_append_update_tail(), and remain disabled until
+ * __mpsc_queue_append_update_prev() has returned.
+ *
+ * @param q
+ * The queue to update.
+ *
+ * @param elm
+ * The element to append to `q`.
+ *
+ * @returns
+ * A token to later pass to __mpsc_queue_append_update_prev()
+ * to complete the enqueue.
+ */
+static inline mpsc_queue_chain_t
+__mpsc_queue_append_update_tail(mpsc_queue_head_t q, mpsc_queue_chain_t elm)
+{
+       os_atomic_store(&elm->mpqc_next, NULL, relaxed);
+       return os_atomic_xchg(&q->mpqh_tail, elm, release);
+}
+
+/**
+ * @function __mpsc_queue_append_was_empty
+ *
+ * @brief
+ * Tests whether the queue was empty at the time
+ * __mpsc_queue_append_update_tail() was called.
+ *
+ * @param q
+ * The queue to test emptiness for.
+ *
+ * @param prev
+ * The token returned by __mpsc_queue_append_update_tail().
+ *
+ * @returns
+ * Whether the queue was empty (true) or not (false).
+ */
+static inline bool
+__mpsc_queue_append_was_empty(mpsc_queue_head_t q, mpsc_queue_chain_t prev)
+{
+       return &q->mpqh_head == prev;
+}
+
+/**
+ * @function __mpsc_queue_append_update_prev
+ *
+ * @brief
+ * Second half of the enqueue operation onto a multi-producer single-consumer
+ * queue.
+ *
+ * @discussion
+ * This function is available for algorithms that need to do things (such as
+ * taking a refcount) before calling __mpsc_queue_append_update_prev().
+ *
+ * Preemption should be disabled before calling
+ * __mpsc_queue_append_update_tail(), and remain disabled until
+ * __mpsc_queue_append_update_prev() has returned.
+ *
+ * @param prev
+ * The token returned by __mpsc_queue_append_update_tail().
+ *
+ * @param elm
+ * The element to append to the queue.
+ */
+static inline void
+__mpsc_queue_append_update_prev(mpsc_queue_chain_t prev, mpsc_queue_chain_t elm)
+{
+       os_atomic_store(&prev->mpqc_next, elm, relaxed);
+}
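
A hedged sketch of why the two halves are exposed separately: work such as
taking a reference can be slotted between the tail exchange and the pointer
fixup, with preemption disabled across the whole sequence as required above.
The object and helper names are illustrative:

    mpsc_queue_chain_t prev;

    disable_preemption();
    prev = __mpsc_queue_append_update_tail(q, &obj->chain);
    example_obj_retain(obj);            /* illustrative: consumer drops it */
    __mpsc_queue_append_update_prev(prev, &obj->chain);
    if (__mpsc_queue_append_was_empty(q, prev)) {
            example_wake_consumer(q);   /* illustrative wakeup */
    }
    enable_preemption();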
+
+
+#pragma mark Multi Producer calls
+
+/**
+ * @function mpsc_queue_append_list
+ *
+ * @brief
+ * Enqueues a list of elements onto a queue.
+ *
+ * @discussion
+ * This enqueues a list that has to be fully formed from `first` to `last`
+ * at the end of `q`.
+ *
+ * Preemption should be disabled when calling mpsc_queue_append_list().
+ *
+ * @param q
+ * The queue to update.
+ *
+ * @param first
+ * The first of the list elements being appended.
+ *
+ * @param last
+ * The last of the list elements being appended.
+ */
+static inline bool
+mpsc_queue_append_list(mpsc_queue_head_t q, mpsc_queue_chain_t first,
+    mpsc_queue_chain_t last)
+{
+       mpsc_queue_chain_t prev = __mpsc_queue_append_update_tail(q, last);
+       __mpsc_queue_append_update_prev(prev, first);
+       return __mpsc_queue_append_was_empty(q, prev);
+}
+
+/**
+ * @function mpsc_queue_append
+ *
+ * @brief
+ * Enqueues an element onto a queue.
+ *
+ * @discussion
+ * Preemption should be disabled when calling mpsc_queue_append().
+ *
+ * @param q    the queue to update
+ * @param elm  the element to append
+ */
+static inline bool
+mpsc_queue_append(mpsc_queue_head_t q, mpsc_queue_chain_t elm)
+{
+       return mpsc_queue_append_list(q, elm, elm);
+}
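
The boolean result is what drives consumer wakeups: only the producer that
performs the empty to non-empty transition needs to notify, which is how
_mpsc_daemon_queue_enqueue() in mpsc_queue.c uses it. A minimal hedged
sketch (the element's chain field is illustrative):

    disable_preemption();
    if (mpsc_queue_append(q, &elm->chain)) {
            /* queue went empty -> non-empty: wake the single consumer */
            thread_wakeup((event_t)q);
    }
    enable_preemption();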
+
+
+#pragma mark Single Consumer calls
+
+/**
+ * @function mpsc_queue_dequeue_batch()
+ *
+ * @brief
+ * Atomically empty a queue at once and return the batch head and tail.
+ *
+ * @discussion
+ * Consumer function, must be called in a serialized way with respect to any
+ * other consumer function.
+ *
+ * @param q
+ * The queue
+ *
+ * @param tail
+ * An out pointer filled with the last element captured.
+ *
+ * @param dependency
+ * A dependency token (to rely on consume / hardware dependencies).
+ * When not trying to take advantage of hardware dependencies, just pass NULL.
+ *
+ * @returns
+ * The first element of the batch if any, or NULL if the queue was empty.
+ */
+mpsc_queue_chain_t
+mpsc_queue_dequeue_batch(mpsc_queue_head_t q, mpsc_queue_chain_t *tail,
+    os_atomic_dependency_t dependency);
+
+/**
+ * @function mpsc_queue_batch_next()
+ *
+ * @brief
+ * Function used to consume an element from a batch dequeued with
+ * mpsc_queue_dequeue_batch().
+ *
+ * @discussion
+ * Once a batch has been dequeued, there is no need to hold the consumer lock
+ * anymore to consume it.
+ *
+ * mpsc_queue_batch_foreach_safe() is the preferred interface to consume
+ * the whole batch.
+ *
+ * @param cur
+ * The current inspected element of the batch (must be the batch head or
+ * a value returned by mpsc_queue_batch_next()).
+ *
+ * @param tail
+ * The last element of the batch.
+ *
+ * @returns
+ * The next element if any.
+ */
+mpsc_queue_chain_t
+mpsc_queue_batch_next(mpsc_queue_chain_t cur, mpsc_queue_chain_t tail);
+
+/**
+ * @macro mpsc_queue_batch_foreach_safe
+ *
+ * @brief
+ * Macro used to enumerate a batch dequeued with mpsc_queue_dequeue_batch().
+ *
+ * @param item
+ * The item being currently visited.
+ *
+ * @param head
+ * The first element of the batch.
+ *
+ * @param tail
+ * The last element of the batch.
+ */
+#define mpsc_queue_batch_foreach_safe(item, head, tail) \
+               for (mpsc_queue_chain_t __tmp, __item = (head), __tail = (tail); \
+                               __tmp = mpsc_queue_batch_next(__item, __tail), (item) = __item; \
+                               __item = __tmp)
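
A hedged consumer-side sketch tying the batch calls together; the element
type and free routine are illustrative, and passing OS_ATOMIC_DEPENDENCY_NONE
from <machine/atomic.h> is assumed to be the way to opt out of hardware
dependencies:

    mpsc_queue_chain_t head, tail, it;

    head = mpsc_queue_dequeue_batch(q, &tail, OS_ATOMIC_DEPENDENCY_NONE);
    mpsc_queue_batch_foreach_safe(it, head, tail) {
            struct example_item *e =
                mpsc_queue_element(it, struct example_item, ei_chain);
            example_item_free(e);   /* safe: the next pointer was loaded first */
    }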
+
+/**
+ * @function mpsc_queue_restore_batch()
+ *
+ * @brief
+ * Restores a batch at the head of the queue.
+ *
+ * @discussion
+ * Consumer function, must be called in a serialized way with respect to any
+ * other consumer function.
+ *
+ * @param q
+ * The queue
+ *
+ * @param first
+ * The first element to put back.
+ *
+ * @param last
+ * The last element to put back.
+ * It is the responsibility of the caller to ensure the linkages from first to
+ * last are properly set up before calling this function.
+ */
+void
+mpsc_queue_restore_batch(mpsc_queue_head_t q, mpsc_queue_chain_t first,
+    mpsc_queue_chain_t last);
+
+
+#pragma mark "GCD"-like facilities
+
+/*!
+ * @typedef struct mpsc_daemon_queue
+ *
+ * @brief
+ * Daemon queues are a ready-to-use packaging of the low-level MPSC queue
+ * primitive.
+ *
+ * @discussion
+ * The raw MPSC queue primitive requires handling the queue's state
+ * transitions and the dequeuing yourself, which is a non-trivial task.
+ *
+ * Daemon queues are a simple packaged solution that allows MPSC queues to
+ * form hierarchies (mostly for layering purposes), and to be serviced at the
+ * bottom of such a hierarchy by a thread or a thread call.
+ *
+ * Daemon queues assume homogeneous items, and are set up with an `invoke`
+ * callback that is called by the dequeuer on every item as it is dequeued.
+ */
+typedef struct mpsc_daemon_queue *mpsc_daemon_queue_t;
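
Pulling the pieces together, a hedged sketch of the adoption pattern using
the initializers and enqueue call declared below; every name in it is
illustrative:

    struct example_log_entry {
            struct mpsc_queue_chain ele_chain;
            /* ... payload ... */
    };

    static struct mpsc_daemon_queue example_log_dq;

    static void
    example_log_invoke(mpsc_queue_chain_t e, mpsc_daemon_queue_t dq __unused)
    {
            struct example_log_entry *ele =
                mpsc_queue_element(e, struct example_log_entry, ele_chain);
            /* consume and free `ele` */
    }

    /* once, at subsystem initialization: */
    mpsc_daemon_queue_init_with_thread(&example_log_dq, example_log_invoke,
        MINPRI_KERNEL, "example.log");

    /* from any producer, including interrupt context: */
    mpsc_daemon_enqueue(&example_log_dq, &ele->ele_chain,
        MPSC_QUEUE_DISABLE_PREEMPTION);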
+
+/*!
+ * @typedef mpsc_daemon_invoke_fn_t
+ *
+ * @brief
+ * The type for MPSC Daemon Queues invoke callbacks.
+ */
+typedef void (*mpsc_daemon_invoke_fn_t)(mpsc_queue_chain_t elm,
+    mpsc_daemon_queue_t dq);
+
+/*!
+ * @enum mpsc_daemon_queue_kind
+ *
+ * @brief
+ * Internal type, not to be used by clients.
+ */
+typedef enum mpsc_daemon_queue_kind {
+       MPSC_QUEUE_KIND_UNKNOWN,
+       MPSC_QUEUE_KIND_NESTED,
+       MPSC_QUEUE_KIND_THREAD,
+       MPSC_QUEUE_KIND_THREAD_CRITICAL,
+       MPSC_QUEUE_KIND_THREAD_CALL,
+} mpsc_daemon_queue_kind_t;
+
+/*!
+ * @enum mpsc_daemon_queue_state
+ *
+ * @brief
+ * Internal type, not to be used by clients.
+ */
+typedef enum mpsc_daemon_queue_state {
+       MPSC_QUEUE_STATE_DRAINING = 0x0001,
+       MPSC_QUEUE_STATE_WAKEUP   = 0x0002,
+       MPSC_QUEUE_STATE_CANCELED = 0x0004,
+} mpsc_daemon_queue_state_t;
+
+struct mpsc_daemon_queue {
+       mpsc_daemon_queue_kind_t    mpd_kind;
+       mpsc_daemon_queue_state_t _Atomic mpd_state;
+       mpsc_daemon_invoke_fn_t     mpd_invoke;
+       union {
+               mpsc_daemon_queue_t     mpd_target;
+               struct thread          *mpd_thread;
+               struct thread_call     *mpd_call;
+       };
+       struct mpsc_queue_head      mpd_queue;
+       struct mpsc_queue_chain     mpd_chain;
+};
+
+/*!
+ * @function mpsc_daemon_queue_init_with_thread
+ *
+ * @brief
+ * Sets up a daemon queue to be a base queue drained by a kernel thread.
+ *
+ * @discussion
+ * The function will allocate the thread and start it in assert_wait.
+ *
+ * @param dq
+ * The queue to initialize
+ *
+ * @param invoke
+ * The invoke function called on individual items on the queue during drain.
+ *
+ * @param pri
+ * The scheduler priority for the created thread.
+ *
+ * @param name
+ * The name to give to the created thread.
+ *
+ * @returns
+ * Whether creating the thread was successful.
+ */
+kern_return_t
+mpsc_daemon_queue_init_with_thread(mpsc_daemon_queue_t dq,
+    mpsc_daemon_invoke_fn_t invoke, int pri, const char *name);
+
+
+/*!
+ * @function mpsc_daemon_queue_init_with_thread_call
+ *
+ * @brief
+ * Sets up a daemon queue to be a base queue drained by a thread call.
+ *
+ * @param dq
+ * The queue to initialize
+ *
+ * @param invoke
+ * The invoke function called on individual items on the queue during drain.
+ *
+ * @param pri
+ * The priority the thread call will run at.
+ */
+void
+mpsc_daemon_queue_init_with_thread_call(mpsc_daemon_queue_t dq,
+    mpsc_daemon_invoke_fn_t invoke, thread_call_priority_t pri);
+
+/*!
+ * @function mpsc_daemon_queue_init_with_target
+ *
+ * @brief
+ * Sets up a daemon queue to target another daemon queue.
+ *
+ * @discussion
+ * The targeting relationship is useful for subsystem layering purposes only.
+ * Because draining a given queue is atomic with respect to its target, target
+ * queue hierarchies are prone to starvation.
+ *
+ * @param dq
+ * The queue to initialize
+ *
+ * @param invoke
+ * The invoke function called on individual items on the queue during drain.
+ *
+ * @param target
+ * The target queue of the initialized queue, which has to be initialized with
+ * the mpsc_daemon_queue_nested_invoke invoke handler.
+ */
+void
+mpsc_daemon_queue_init_with_target(mpsc_daemon_queue_t dq,
+    mpsc_daemon_invoke_fn_t invoke, mpsc_daemon_queue_t target);
+
+/*!
+ * @function mpsc_daemon_queue_nested_invoke
+ *
+ * @brief
+ * The invoke function to pass to mpsc_daemon_queue_init_* when a queue is meant
+ * to be targeted by other queues.
+ */
+void
+mpsc_daemon_queue_nested_invoke(mpsc_queue_chain_t elm,
+    mpsc_daemon_queue_t dq);
+
+/*!
+ * @function mpsc_daemon_queue_cancel_and_wait
+ *
+ * @brief
+ * Cancels the queue so that the object owning it can be destroyed.
+ *
+ * @discussion
+ * This interface will cancel the queue and wait synchronously for the
+ * cancelation to have taken effect, possibly waiting on elements currently
+ * draining.
+ *
+ * Sending objects to the daemon queue after cancelation is undefined.
+ *
+ * Calling this function multiple times is undefined.
+ *
+ * Tearing down daemon queue hierarchies is the responsibility of the adopter.
+ */
+void
+mpsc_daemon_queue_cancel_and_wait(mpsc_daemon_queue_t dq);
+
+/*!
+ * @function mpsc_daemon_enqueue
+ *
+ * @brief
+ * Send ("async") an item to a given daemon on a given queue.
+ *
+ * @discussion
+ * It is the responsibility of the caller to ensure preemption is disabled when
+ * this call is made.
+ *
+ * @param dq
+ * The daemon queue to enqueue the element onto.
+ *
+ * @param elm
+ * The item to enqueue.
+ *
+ * @param options
+ * Options applicable to the enqueue. In particular, passing
+ * MPSC_QUEUE_DISABLE_PREEMPTION makes sure preemption is properly disabled
+ * during the enqueue.
+ */
+void
+mpsc_daemon_enqueue(mpsc_daemon_queue_t dq, mpsc_queue_chain_t elm,
+    mpsc_queue_options_t options);
+
+
+#pragma mark Deferred deallocation daemon
+
+/*!
+ * @function thread_deallocate_daemon_init
+ *
+ * @brief
+ * Initializes the deferred deallocation daemon, called by thread_daemon_init().
+ *
+ * @discussion
+ * The deferred deallocation daemon is a kernel thread based daemon queue that
+ * is targeted by nested daemon queues.
+ *
+ * It is used to perform deferred deallocation for objects that can't safely be
+ * deallocated from the context where the deallocation should normally occur.
+ *
+ * Subsystems using it are for example: turnstiles, workqueues, threads.
+ *
+ * @warning
+ * New queues should be added to this daemon with great care,
+ * as abusing it can lead to an unbounded amount of kernel work.
+ */
+void
+thread_deallocate_daemon_init(void);
+
+/*!
+ * @function thread_deallocate_daemon_register_queue
+ *
+ * @brief
+ * Dynamically register a queue for deferred deletion with the deferred
+ * deallocation daemon.
+ *
+ * @param dq
+ * The daemon queue to register with the deferred deallocation daemon.
+ *
+ * @param invoke
+ * The callback called on every element of this queue by the deallocation
+ * daemon.
+ */
+void
+thread_deallocate_daemon_register_queue(mpsc_daemon_queue_t dq,
+    mpsc_daemon_invoke_fn_t invoke);
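
For instance, a subsystem whose objects cannot be freed from their natural
context might register, at its own initialization time, a queue such as
(names illustrative):

    static struct mpsc_daemon_queue example_free_dq;

    thread_deallocate_daemon_register_queue(&example_free_dq,
        example_free_invoke);       /* illustrative invoke callback */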
+
+
+#pragma mark tests
+#if DEBUG || DEVELOPMENT
+
+int
+mpsc_test_pingpong(uint64_t count, uint64_t *out);
+
+#endif /* DEBUG || DEVELOPMENT */
+
+#endif /* XNU_KERNEL_PRIVATE */
+
+__END_DECLS
+
+#endif /* _KERN_MPSC_QUEUE_H_ */
index 0a2e47e3507f8058746b40a46ac835188448dcc7..094113569051db0b571e1dde29574fff1e6d2a71 100644 (file)
@@ -41,7 +41,8 @@
 #include <mach/task_policy.h>
 #include <kern/task.h>
 #include <kern/ledger.h>
-
+#include <sys/kdebug.h>
+#include <kern/sched_prim.h>
 /*
  ******************************
  * XNU-internal functionality
@@ -74,42 +75,41 @@ extern kern_return_t task_importance(task_t task, integer_t importance);
 /* flavors (also DBG_IMPORTANCE subclasses  0x20 - 0x3F) */
 
 /* internal or external, thread or task */
-#define TASK_POLICY_DARWIN_BG           0x21
-#define TASK_POLICY_IOPOL               0x22
-#define TASK_POLICY_IO                  0x23
-#define TASK_POLICY_PASSIVE_IO          0x24
+#define TASK_POLICY_DARWIN_BG           IMP_TASK_POLICY_DARWIN_BG
+#define TASK_POLICY_IOPOL               IMP_TASK_POLICY_IOPOL
+#define TASK_POLICY_IO                  IMP_TASK_POLICY_IO
+#define TASK_POLICY_PASSIVE_IO          IMP_TASK_POLICY_PASSIVE_IO
 
 /* internal, task only */
-#define TASK_POLICY_DARWIN_BG_IOPOL     0x27
+#define TASK_POLICY_DARWIN_BG_IOPOL     IMP_TASK_POLICY_DARWIN_BG_IOPOL
 
 /* task-only attributes */
-#define TASK_POLICY_TAL                 0x28
-#define TASK_POLICY_BOOST               0x29
-#define TASK_POLICY_ROLE                0x2A
+#define TASK_POLICY_TAL                 IMP_TASK_POLICY_TAL
+#define TASK_POLICY_BOOST               IMP_TASK_POLICY_BOOST
+#define TASK_POLICY_ROLE                IMP_TASK_POLICY_ROLE
 /* unused                               0x2B */
-#define TASK_POLICY_TERMINATED          0x2C
-#define TASK_POLICY_NEW_SOCKETS_BG      0x2D
-#define TASK_POLICY_SUP_ACTIVE          0x2E
-#define TASK_POLICY_LATENCY_QOS         0x2F
-#define TASK_POLICY_THROUGH_QOS         0x30
-#define TASK_POLICY_WATCHERS_BG         0x31
-
-#define TASK_POLICY_SFI_MANAGED         0x34
-#define TASK_POLICY_ALL_SOCKETS_BG      0x37
-
-#define TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS  0x39 /* latency as value1, throughput as value2 */
-#define TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS  0x3A /* latency as value1, throughput as value2 */
+#define TASK_POLICY_TERMINATED          IMP_TASK_POLICY_TERMINATED
+#define TASK_POLICY_NEW_SOCKETS_BG      IMP_TASK_POLICY_NEW_SOCKETS_BG
+#define TASK_POLICY_SUP_ACTIVE          IMP_TASK_POLICY_SUP_ACTIVE
+#define TASK_POLICY_LATENCY_QOS         IMP_TASK_POLICY_LATENCY_QOS
+#define TASK_POLICY_THROUGH_QOS         IMP_TASK_POLICY_THROUGH_QOS
+#define TASK_POLICY_WATCHERS_BG         IMP_TASK_POLICY_WATCHERS_BG
+#define TASK_POLICY_SFI_MANAGED         IMP_TASK_POLICY_SFI_MANAGED
+#define TASK_POLICY_ALL_SOCKETS_BG      IMP_TASK_POLICY_ALL_SOCKETS_BG
+
+#define TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS IMP_TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS /* latency as value1, throughput as value2 */
+#define TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS  IMP_TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS /* latency as value1, throughput as value2 */
 
 /* thread-only attributes */
-#define TASK_POLICY_PIDBIND_BG          0x32
+#define TASK_POLICY_PIDBIND_BG          IMP_TASK_POLICY_PIDBIND_BG
 /* unused                               0x33 */
-#define TASK_POLICY_QOS                 0x35
-#define TASK_POLICY_QOS_OVERRIDE        0x36
-#define TASK_POLICY_QOS_AND_RELPRIO     0x38 /* QoS as value1, relative priority as value2 */
-#define TASK_POLICY_QOS_WORKQ_OVERRIDE  0x3B
-#define TASK_POLICY_QOS_PROMOTE         0x3C
-#define TASK_POLICY_QOS_IPC_OVERRIDE    0x3D
-// was TASK_POLICY_QOS_SYNC_IPC_OVERRIDE 0x3E
+#define TASK_POLICY_QOS                 0x35 /* Used only as a convenience for getter */
+#define TASK_POLICY_QOS_OVERRIDE        IMP_TASK_POLICY_QOS_OVERRIDE
+#define TASK_POLICY_QOS_AND_RELPRIO     IMP_TASK_POLICY_QOS_AND_RELPRIO /* QoS as value1, relative priority as value2 */
+#define TASK_POLICY_QOS_WORKQ_OVERRIDE  IMP_TASK_POLICY_QOS_WORKQ_OVERRIDE
+#define TASK_POLICY_QOS_PROMOTE         IMP_TASK_POLICY_QOS_PROMOTE
+#define TASK_POLICY_QOS_KEVENT_OVERRIDE IMP_TASK_POLICY_QOS_KEVENT_OVERRIDE
+#define TASK_POLICY_QOS_SERVICER_OVERRIDE IMP_TASK_POLICY_QOS_SERVICER_OVERRIDE
 
 #define TASK_POLICY_MAX                 0x3F
 
@@ -133,8 +133,8 @@ extern int  proc_task_role_to_darwin_role(int task_role);
 
 /* Functions used by kern_exec.c */
 extern void task_set_main_thread_qos(task_t task, thread_t main_thread);
-extern void proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role,
-    ipc_port_t * portwatch_ports, int portwatch_count);
+extern void proc_set_task_spawnpolicy(task_t task, thread_t thread, int apptype, int qos_clamp, int role,
+    ipc_port_t * portwatch_ports, uint32_t portwatch_count);
 extern void proc_inherit_task_role(task_t new_task, task_t old_task);
 
 /* IO Throttle tiers */
@@ -167,6 +167,9 @@ extern int task_get_apptype(task_t);
 extern void proc_apply_task_networkbg(void * bsd_info, thread_t thread);
 #endif /* MACH_BSD */
 
+extern void thread_freeze_base_pri(thread_t thread);
+extern bool thread_unfreeze_base_pri(thread_t thread);
+
 /* Functions used by pthread_shims.c */
 extern int proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid,
     int override_qos, boolean_t first_override_for_resource,
@@ -245,15 +248,21 @@ extern kern_return_t thread_policy_set_internal(thread_t thread, thread_policy_f
     thread_policy_t policy_info, mach_msg_type_number_t count);
 
 extern boolean_t thread_recompute_user_promotion_locked(thread_t thread);
+extern boolean_t thread_recompute_kernel_promotion_locked(thread_t thread);
 extern thread_qos_t thread_user_promotion_qos_for_pri(int priority);
 
 extern void thread_set_exec_promotion(thread_t thread);
 extern void thread_clear_exec_promotion(thread_t thread);
 
-/* for IPC override management */
-extern void thread_add_ipc_override(thread_t thread, uint32_t qos_override);
-extern void thread_update_ipc_override(thread_t thread, uint32_t qos_override);
-extern void thread_drop_ipc_override(thread_t thread);
+/* for servicer override management (workloops only) */
+extern void thread_add_servicer_override(thread_t thread, uint32_t qos_override);
+extern void thread_update_servicer_override(thread_t thread, uint32_t qos_override);
+extern void thread_drop_servicer_override(thread_t thread);
+
+/* for generic kevent override management */
+extern void thread_add_kevent_override(thread_t thread, uint32_t qos_override);
+extern void thread_update_kevent_override(thread_t thread, uint32_t qos_override);
+extern void thread_drop_kevent_override(thread_t thread);
 
 /* for ipc_pset.c */
 extern thread_qos_t thread_get_requested_qos(thread_t thread, int *relpri);
@@ -280,7 +289,8 @@ typedef struct task_pend_token {
            tpt_update_throttle     :1,
            tpt_update_thread_sfi   :1,
            tpt_force_recompute_pri :1,
-           tpt_update_tg_ui_flag   :1;
+           tpt_update_tg_ui_flag   :1,
+           tpt_update_turnstile    :1;
 } *task_pend_token_t;
 
 extern void task_policy_update_complete_unlocked(task_t task, task_pend_token_t pend_token);
index 9fb14d2628975d70f4dac53a71f22f5e15c86228..0feea0aeb5540ee6899887768c7a75c1e1ff67e9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <arm/cpu_data_internal.h>
 #endif
 
+#ifdef HAS_APPLE_PAC
+#include <mach/vm_param.h>
+#include <ptrauth.h>
+#endif /* HAS_APPLE_PAC */
 
 #define isdigit(d) ((d) >= '0' && (d) <= '9')
 #define Ctod(c) ((c) - '0')
@@ -256,6 +260,11 @@ __doprnt(
        char    c;
        int             capitals;
        int             long_long;
+       enum {
+               INT,
+               SHORT,
+               CHAR,
+       } numeric_type = INT;
        int             nprinted = 0;
 
        while ((c = *fmt) != '\0') {
@@ -269,6 +278,7 @@ __doprnt(
                fmt++;
 
                long_long = 0;
+               numeric_type = INT;
                length = 0;
                prec = -1;
                ladjust = FALSE;
@@ -337,6 +347,13 @@ __doprnt(
                                long_long = 1;
                                c = *++fmt;
                        }
+               } else if (c == 'h') {
+                       c = *++fmt;
+                       numeric_type = SHORT;
+                       if (c == 'h') {
+                               numeric_type = CHAR;
+                               c = *++fmt;
+                       }
                } else if (c == 'q' || c == 'L') {
                        long_long = 1;
                        c = *++fmt;
@@ -521,6 +538,7 @@ __doprnt(
 
                case 'u':
                        truncate = _doprnt_truncates;
+               /* FALLTHROUGH */
                case 'U':
                        base = 10;
                        goto print_unsigned;
@@ -530,6 +548,7 @@ __doprnt(
                        if (sizeof(int) < sizeof(void *)) {
                                long_long = 1;
                        }
+               /* FALLTHROUGH */
                case 'x':
                        truncate = _doprnt_truncates;
                        base = 16;
@@ -542,12 +561,14 @@ __doprnt(
 
                case 'r':
                        truncate = _doprnt_truncates;
+               /* FALLTHROUGH */
                case 'R':
                        base = radix;
                        goto print_signed;
 
                case 'n':
                        truncate = _doprnt_truncates;
+               /* FALLTHROUGH */
                case 'N':
                        base = radix;
                        goto print_unsigned;
@@ -558,6 +579,16 @@ print_signed:
                        } else {
                                n = va_arg(argp, int);
                        }
+                       switch (numeric_type) {
+                       case SHORT:
+                               n = (short)n;
+                               break;
+                       case CHAR:
+                               n = (char)n;
+                               break;
+                       default:
+                               break;
+                       }
                        if (n >= 0) {
                                u = n;
                                sign_char = plus_sign;
@@ -573,6 +604,16 @@ print_unsigned:
                        } else {
                                u = va_arg(argp, unsigned int);
                        }
+                       switch (numeric_type) {
+                       case SHORT:
+                               u = (unsigned short)u;
+                               break;
+                       case CHAR:
+                               u = (unsigned char)u;
+                               break;
+                       default:
+                               break;
+                       }
                        goto print_num;
 
 print_num:
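
The casts matter because C's default argument promotions pass char and short
varargs as int, so sign-extended high bits would otherwise leak into the
unsigned conversions. A hedged illustration of what the new truncation
changes:

    short s = (short)0xabcd;    /* negative as a short */
    printf("%hx\n", s);
    /* s is promoted to int 0xffffabcd in the va_list; the new
     * (unsigned short) truncation prints "abcd" rather than "ffffabcd". */
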
@@ -591,6 +632,13 @@ print_num:
                                        const char* strp = str;
                                        int strl = sizeof(str) - 1;
 
+#ifdef HAS_APPLE_PAC
+                                       /**
+                                        * Strip out the pointer authentication code before
+                                        * checking whether the pointer is a kernel address.
+                                        */
+                                       u = (unsigned long long)VM_KERNEL_STRIP_PTR(u);
+#endif /* HAS_APPLE_PAC */
 
                                        if (u >= VM_MIN_KERNEL_AND_KEXT_ADDRESS && u <= VM_MAX_KERNEL_ADDRESS) {
                                                while (*strp != '\0') {
@@ -681,7 +729,19 @@ dummy_putc(int ch, void *arg)
 {
        void (*real_putc)(char) = arg;
 
-       real_putc(ch);
+       /*
+        * Attempts to panic (or otherwise log to console) during early boot
+        * can result in _doprnt() and _doprnt_log() being called from
+        * _kprintf() before PE_init_kprintf() has been called. This causes
+        * the "putc" param to _doprnt() and _doprnt_log() to be passed as
+        * NULL. That NULL makes its way here, and we would try to jump to it.
+        * Given that this happens at very early boot, there is no easy way to
+        * report the problem (we are likely already panicking), so we just do
+        * nothing instead of crashing.
+        */
+       if (real_putc) {
+               real_putc(ch);
+       }
 }
 
 void
@@ -710,11 +770,11 @@ _doprnt_log(
 boolean_t       new_printf_cpu_number = FALSE;
 #endif  /* MP_PRINTF */
 
-decl_simple_lock_data(, printf_lock)
-decl_simple_lock_data(, bsd_log_spinlock)
+decl_simple_lock_data(, printf_lock);
+decl_simple_lock_data(, bsd_log_spinlock);
 
 lck_grp_t oslog_stream_lock_grp;
-decl_lck_spin_data(, oslog_stream_lock)
+decl_lck_spin_data(, oslog_stream_lock);
 void oslog_lock_init(void);
 
 extern void bsd_log_init(void);
index 26c60c043989b1d3d4dab875283436c5f6adcfa1..5ac1ce75682771a2cdec2bc0ae407397e584ead3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -76,6 +76,8 @@
 #include <kern/ledger.h>
 #include <machine/machparam.h>
 #include <kern/machine.h>
+#include <kern/policy_internal.h>
+#include <kern/sched_clutch.h>
 
 #ifdef CONFIG_MACH_APPROXIMATE_TIME
 #include <machine/commpage.h>  /* for commpage_update_mach_approximate_time */
@@ -85,8 +87,6 @@
 #include <kern/monotonic.h>
 #endif /* MONOTONIC */
 
-static void sched_update_thread_bucket(thread_t thread);
-
 /*
  *     thread_quantum_expire:
  *
@@ -156,6 +156,7 @@ thread_quantum_expire(
         */
        if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) &&
            !(thread->sched_flags & TH_SFLAG_PROMOTED) &&
+           !(thread->kern_promotion_schedpri != 0) &&
            !(thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) &&
            !(thread->options & TH_OPT_SYSTEM_CRITICAL)) {
                uint64_t new_computation;
@@ -278,6 +279,10 @@ sched_set_thread_base_priority(thread_t thread, int priority)
        }
 
        int old_base_pri = thread->base_pri;
+       thread->req_base_pri = priority;
+       if (thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN) {
+               priority = MAX(priority, old_base_pri);
+       }
        thread->base_pri = priority;
 
        if ((thread->state & TH_RUN) == TH_RUN) {
@@ -301,11 +306,49 @@ sched_set_thread_base_priority(thread_t thread, int priority)
                machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
                    ctime, PERFCONTROL_CALLOUT_WAKE_UNSAFE, thread);
        }
-       sched_update_thread_bucket(thread);
+#if !CONFIG_SCHED_CLUTCH
+       /* For the clutch scheduler, this operation is done in set_sched_pri() */
+       SCHED(update_thread_bucket)(thread);
+#endif /* !CONFIG_SCHED_CLUTCH */
 
        thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
 }
 
+/*
+ *     sched_set_kernel_thread_priority:
+ *
+ *     Set the absolute base priority of the thread
+ *     and reset its scheduled priority.
+ *
+ *     Called with the thread unlocked.
+ */
+void
+sched_set_kernel_thread_priority(thread_t thread, int new_priority)
+{
+       spl_t s = splsched();
+
+       thread_lock(thread);
+
+       assert(thread->sched_mode != TH_MODE_REALTIME);
+       assert(thread->effective_policy.thep_qos == THREAD_QOS_UNSPECIFIED);
+
+       if (new_priority > thread->max_priority) {
+               new_priority = thread->max_priority;
+       }
+#if CONFIG_EMBEDDED
+       if (new_priority < MAXPRI_THROTTLE) {
+               new_priority = MAXPRI_THROTTLE;
+       }
+#endif /* CONFIG_EMBEDDED */
+
+       thread->importance = new_priority - thread->task_priority;
+
+       sched_set_thread_base_priority(thread, new_priority);
+
+       thread_unlock(thread);
+       splx(s);
+}
+
 /*
  *     thread_recompute_sched_pri:
  *
@@ -342,6 +385,14 @@ thread_recompute_sched_pri(thread_t thread, set_sched_pri_options_t options)
                        priority = DEPRESSPRI;
                }
 
+               if (thread->kern_promotion_schedpri > 0) {
+                       priority = MAX(priority, thread->kern_promotion_schedpri);
+
+                       if (sched_mode != TH_MODE_REALTIME) {
+                               priority = MIN(priority, MAXPRI_PROMOTE);
+                       }
+               }
+
                if (sched_flags & TH_SFLAG_PROMOTED) {
                        priority = MAX(priority, thread->promotion_priority);
 
@@ -412,6 +463,15 @@ lightweight_update_priority(thread_t thread)
 
                thread->cpu_delta += delta;
 
+#if CONFIG_SCHED_CLUTCH
+               /*
+                * Update the CPU usage for the thread group to which the thread belongs.
+                * The implementation assumes that the thread ran for the entire delta
+                * as part of the same thread group.
+                */
+               sched_clutch_cpu_usage_update(thread, delta);
+#endif /* CONFIG_SCHED_CLUTCH */
+
                priority = sched_compute_timeshare_priority(thread);
 
                if (priority != thread->sched_pri) {
@@ -427,17 +487,40 @@ lightweight_update_priority(thread_t thread)
  *     is  usage = (usage >> shift1) +/- (usage >> abs(shift2))  where the
 *     +/- is determined by the sign of shift2.
  */
-struct shift_data {
-       int     shift1;
-       int     shift2;
-};
 
-#define SCHED_DECAY_TICKS       32
-static struct shift_data        sched_decay_shifts[SCHED_DECAY_TICKS] = {
-       {1, 1}, {1, 3}, {1, -3}, {2, -7}, {3, 5}, {3, -5}, {4, -8}, {5, 7},
-       {5, -7}, {6, -10}, {7, 10}, {7, -9}, {8, -11}, {9, 12}, {9, -11}, {10, -13},
-       {11, 14}, {11, -13}, {12, -15}, {13, 17}, {13, -15}, {14, -17}, {15, 19}, {16, 18},
-       {16, -19}, {17, 22}, {18, 20}, {18, -20}, {19, 26}, {20, 22}, {20, -22}, {21, -27}
+const struct shift_data        sched_decay_shifts[SCHED_DECAY_TICKS] = {
+       { .shift1 = 1, .shift2 = 1 },
+       { .shift1 = 1, .shift2 = 3 },
+       { .shift1 = 1, .shift2 = -3 },
+       { .shift1 = 2, .shift2 = -7 },
+       { .shift1 = 3, .shift2 = 5 },
+       { .shift1 = 3, .shift2 = -5 },
+       { .shift1 = 4, .shift2 = -8 },
+       { .shift1 = 5, .shift2 = 7 },
+       { .shift1 = 5, .shift2 = -7 },
+       { .shift1 = 6, .shift2 = -10 },
+       { .shift1 = 7, .shift2 = 10 },
+       { .shift1 = 7, .shift2 = -9 },
+       { .shift1 = 8, .shift2 = -11 },
+       { .shift1 = 9, .shift2 = 12 },
+       { .shift1 = 9, .shift2 = -11 },
+       { .shift1 = 10, .shift2 = -13 },
+       { .shift1 = 11, .shift2 = 14 },
+       { .shift1 = 11, .shift2 = -13 },
+       { .shift1 = 12, .shift2 = -15 },
+       { .shift1 = 13, .shift2 = 17 },
+       { .shift1 = 13, .shift2 = -15 },
+       { .shift1 = 14, .shift2 = -17 },
+       { .shift1 = 15, .shift2 = 19 },
+       { .shift1 = 16, .shift2 = 18 },
+       { .shift1 = 16, .shift2 = -19 },
+       { .shift1 = 17, .shift2 = 22 },
+       { .shift1 = 18, .shift2 = 20 },
+       { .shift1 = 18, .shift2 = -20 },
+       { .shift1 = 19, .shift2 = 26 },
+       { .shift1 = 20, .shift2 = 22 },
+       { .shift1 = 20, .shift2 = -22 },
+       { .shift1 = 21, .shift2 = -27 }
 };
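
Each entry approximates multiplying the usage by (5/8)^ticks using two shifts; for example, ticks == 2 selects {1, -3}, giving u/2 - u/8 = 0.375u against an exact (5/8)^2 = 0.390625. A hedged helper mirroring how update_priority() below applies an entry:

        static uint32_t
        decay_usage(uint32_t usage, uint32_t ticks)     /* illustrative helper */
        {
                if (ticks >= SCHED_DECAY_TICKS) {
                        return 0;       /* treated as fully decayed */
                }
                const struct shift_data *sd = &sched_decay_shifts[ticks];
                if (sd->shift2 > 0) {
                        return (usage >> sd->shift1) + (usage >> sd->shift2);
                }
                return (usage >> sd->shift1) - (usage >> -sd->shift2);
        }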
 
 /*
@@ -447,7 +530,9 @@ static struct shift_data        sched_decay_shifts[SCHED_DECAY_TICKS] = {
  */
 extern int sched_pri_decay_band_limit;
 
-#ifdef CONFIG_EMBEDDED
+
+/* Only use the decay floor logic on embedded non-clutch schedulers */
+#if CONFIG_EMBEDDED && !CONFIG_SCHED_CLUTCH
 
 int
 sched_compute_timeshare_priority(thread_t thread)
@@ -479,7 +564,7 @@ sched_compute_timeshare_priority(thread_t thread)
        return priority;
 }
 
-#else /* CONFIG_EMBEDDED */
+#else /* CONFIG_EMBEDDED && !CONFIG_SCHED_CLUTCH */
 
 int
 sched_compute_timeshare_priority(thread_t thread)
@@ -496,7 +581,7 @@ sched_compute_timeshare_priority(thread_t thread)
        return priority;
 }
 
-#endif /* CONFIG_EMBEDDED */
+#endif /* CONFIG_EMBEDDED && !CONFIG_SCHED_CLUTCH */
 
 /*
  *     can_update_priority
@@ -556,7 +641,16 @@ update_priority(
                thread->cpu_usage += delta + thread->cpu_delta;
                thread->cpu_delta = 0;
 
-               struct shift_data *shiftp = &sched_decay_shifts[ticks];
+#if CONFIG_SCHED_CLUTCH
+               /*
+                * Update the CPU usage for the thread group to which the thread belongs.
+                * The implementation assumes that the thread ran for the entire delta
+                * as part of the same thread group.
+                */
+               sched_clutch_cpu_usage_update(thread, delta);
+#endif /* CONFIG_SCHED_CLUTCH */
+
+               const struct shift_data *shiftp = &sched_decay_shifts[ticks];
 
                if (shiftp->shift2 > 0) {
                        thread->cpu_usage =   (thread->cpu_usage >> shiftp->shift1) +
@@ -589,7 +683,11 @@ update_priority(
         * values. The updated pri_shift would be used to calculate the
         * new priority of the thread.
         */
+#if CONFIG_SCHED_CLUTCH
+       thread->pri_shift = sched_clutch_thread_pri_shift(thread, thread->th_sched_bucket);
+#else /* CONFIG_SCHED_CLUTCH */
        thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket];
+#endif /* CONFIG_SCHED_CLUTCH */
 
        /* Recompute scheduled priority if appropriate. */
        if (thread->sched_mode == TH_MODE_TIMESHARE) {
@@ -603,9 +701,13 @@ update_priority(
 /*
  * TH_BUCKET_RUN is a count of *all* runnable non-idle threads.
  * Each other bucket is a count of the runnable non-idle threads
- * with that property.
+ * with that property. All updates to these counts should be
+ * performed with os_atomic_* operations.
+ *
+ * For the clutch scheduler, this global bucket is used only for
+ * keeping the total global run count.
  */
-volatile uint32_t       sched_run_buckets[TH_BUCKET_MAX];
+uint32_t       sched_run_buckets[TH_BUCKET_MAX];
 
 static void
 sched_incr_bucket(sched_bucket_t bucket)
@@ -613,7 +715,7 @@ sched_incr_bucket(sched_bucket_t bucket)
        assert(bucket >= TH_BUCKET_FIXPRI &&
            bucket <= TH_BUCKET_SHARE_BG);
 
-       hw_atomic_add(&sched_run_buckets[bucket], 1);
+       os_atomic_inc(&sched_run_buckets[bucket], relaxed);
 }
 
 static void
@@ -622,19 +724,17 @@ sched_decr_bucket(sched_bucket_t bucket)
        assert(bucket >= TH_BUCKET_FIXPRI &&
            bucket <= TH_BUCKET_SHARE_BG);
 
-       assert(sched_run_buckets[bucket] > 0);
+       assert(os_atomic_load(&sched_run_buckets[bucket], relaxed) > 0);
 
-       hw_atomic_sub(&sched_run_buckets[bucket], 1);
+       os_atomic_dec(&sched_run_buckets[bucket], relaxed);
 }
 
-/* TH_RUN & !TH_IDLE controls whether a thread has a run count */
-
 uint32_t
 sched_run_incr(thread_t thread)
 {
        assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
 
-       uint32_t new_count = hw_atomic_add(&sched_run_buckets[TH_BUCKET_RUN], 1);
+       uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
 
        sched_incr_bucket(thread->th_sched_bucket);
 
@@ -648,12 +748,12 @@ sched_run_decr(thread_t thread)
 
        sched_decr_bucket(thread->th_sched_bucket);
 
-       uint32_t new_count = hw_atomic_sub(&sched_run_buckets[TH_BUCKET_RUN], 1);
+       uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
 
        return new_count;
 }
 
-static void
+void
 sched_update_thread_bucket(thread_t thread)
 {
        sched_bucket_t old_bucket = thread->th_sched_bucket;
@@ -718,7 +818,7 @@ sched_set_thread_mode(thread_t thread, sched_mode_t new_mode)
 
        thread->sched_mode = new_mode;
 
-       sched_update_thread_bucket(thread);
+       SCHED(update_thread_bucket)(thread);
 }
 
 /*
@@ -789,95 +889,6 @@ sched_thread_mode_undemote(thread_t thread, uint32_t reason)
        }
 }
 
-/*
- * Promote thread to a specific priority
- *
- * Promotion must not last past syscall boundary
- * Clients must always pair promote and unpromote 1:1
- *
- * Called at splsched with thread locked
- */
-void
-sched_thread_promote_to_pri(thread_t    thread,
-    int         priority,
-    __kdebug_only uintptr_t   trace_obj /* already unslid */)
-{
-       assert((thread->sched_flags & TH_SFLAG_PROMOTED) != TH_SFLAG_PROMOTED);
-       assert(thread->promotion_priority == 0);
-       assert(priority <= MAXPRI_PROMOTE);
-       assert(priority > 0);
-
-       KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTED),
-           thread_tid(thread), trace_obj, priority);
-
-       thread->sched_flags |= TH_SFLAG_PROMOTED;
-       thread->promotion_priority = priority;
-
-       thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
-}
-
-
-/*
- * Update a pre-existing priority promotion to have a higher priority floor
- * Priority can only go up from the previous value
- * Update must occur while a promotion is active
- *
- * Called at splsched with thread locked
- */
-void
-sched_thread_update_promotion_to_pri(thread_t   thread,
-    int        priority,
-    __kdebug_only uintptr_t  trace_obj /* already unslid */)
-{
-       assert(thread->promotions > 0);
-       assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED);
-       assert(thread->promotion_priority > 0);
-       assert(priority <= MAXPRI_PROMOTE);
-
-       if (thread->promotion_priority < priority) {
-               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTED_UPDATE),
-                   thread_tid(thread), trace_obj, priority);
-
-               thread->promotion_priority = priority;
-               thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
-       }
-}
-
-/*
- * End a priority promotion
- * Demotes a thread back to its expected priority without the promotion in place
- *
- * Called at splsched with thread locked
- */
-void
-sched_thread_unpromote(thread_t     thread,
-    __kdebug_only uintptr_t    trace_obj /* already unslid */)
-{
-       assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED);
-       assert(thread->promotion_priority > 0);
-
-       KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UNPROMOTED),
-           thread_tid(thread), trace_obj, 0);
-
-       thread->sched_flags &= ~TH_SFLAG_PROMOTED;
-       thread->promotion_priority = 0;
-
-       thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
-}
-
-/* called with thread locked */
-void
-assert_promotions_invariant(thread_t thread)
-{
-       if (thread->promotions > 0) {
-               assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED);
-       }
-
-       if (thread->promotions == 0) {
-               assert((thread->sched_flags & TH_SFLAG_PROMOTED) != TH_SFLAG_PROMOTED);
-       }
-}
-
 /*
  * Promote thread to have a sched pri floor for a specific reason
  *
index dcb7d76a8258c6ce1b02fa1cf785639dcc0fb92c..fc35f70a3182de987a82a24b002242ccde9f51d7 100644 (file)
@@ -741,8 +741,8 @@ priority_queue_entry_increase(struct priority_queue *que, priority_queue_entry_t
  *              <type *> min element
  */
 #define priority_queue_min(q, type, field) ({                                                                   \
-       assert(pqueue_is_min_heap(que));                                                                         \
-       priority_queue_entry_key(pqueue_unpack_root(q), type, field);                                           \
+       assert(pqueue_is_min_heap(q));                                                                          \
+       pqe_element(pqueue_unpack_root(q), type, field);                                                        \
 })
 
 /*
@@ -807,7 +807,7 @@ priority_queue_entry_increase(struct priority_queue *que, priority_queue_entry_t
  *              <type *> min element
  */
 #define priority_queue_remove_min(q, type, field, cmp_fn) ({                                                    \
-       assert(pqueue_is_min_heap(que));                                                                         \
+       assert(pqueue_is_min_heap(q));                                                                         \
        pqe_element(pqueue_remove_root(q, pqueue_unpack_root(q), cmp_fn), type, field);                         \
 })
 
index 486efc100f6589b8ec95415243c3611879b34361..85c506f04dc6e19d18ace459a05ce86aa58e8590 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -94,7 +94,7 @@
 
 struct processor_set    pset0;
 struct pset_node                pset_node0;
-decl_simple_lock_data(static, pset_node_lock)
+decl_simple_lock_data(static, pset_node_lock);
 
 lck_grp_t pset_lck_grp;
 
@@ -105,13 +105,13 @@ int                                             tasks_count;
 int                                             terminated_tasks_count;
 queue_head_t                    threads;
 int                                             threads_count;
-decl_lck_mtx_data(, tasks_threads_lock)
-decl_lck_mtx_data(, tasks_corpse_lock)
+decl_lck_mtx_data(, tasks_threads_lock);
+decl_lck_mtx_data(, tasks_corpse_lock);
 
 processor_t                             processor_list;
 unsigned int                    processor_count;
 static processor_t              processor_list_tail;
-decl_simple_lock_data(, processor_list_lock)
+decl_simple_lock_data(, processor_list_lock);
 
 uint32_t                                processor_avail_count;
 uint32_t                                processor_avail_count_user;
@@ -198,7 +198,7 @@ processor_init(
        assert(cpu_id < MAX_SCHED_CPUS);
 
        processor->state = PROCESSOR_OFF_LINE;
-       processor->active_thread = processor->next_thread = processor->idle_thread = THREAD_NULL;
+       processor->active_thread = processor->startup_thread = processor->idle_thread = THREAD_NULL;
        processor->processor_set = pset;
        processor_state_update_idle(processor);
        processor->starting_pri = MINPRI;
@@ -207,10 +207,11 @@ processor_init(
        processor->quantum_end = UINT64_MAX;
        processor->deadline = UINT64_MAX;
        processor->first_timeslice = FALSE;
+       processor->processor_offlined = false;
        processor->processor_primary = processor; /* no SMT relationship known at this point */
        processor->processor_secondary = NULL;
-       processor->is_SMT = FALSE;
-       processor->is_recommended = (pset->recommended_bitmask & (1ULL << cpu_id)) ? TRUE : FALSE;
+       processor->is_SMT = false;
+       processor->is_recommended = true;
        processor->processor_self = IP_NULL;
        processor_data_init(processor);
        processor->processor_list = NULL;
@@ -221,6 +222,9 @@ processor_init(
        s = splsched();
        pset_lock(pset);
        bit_set(pset->cpu_bitmask, cpu_id);
+       bit_set(pset->recommended_bitmask, cpu_id);
+       bit_set(pset->primary_map, cpu_id);
+       bit_set(pset->cpu_state_map[PROCESSOR_OFF_LINE], cpu_id);
        if (pset->cpu_set_count++ == 0) {
                pset->cpu_set_low = pset->cpu_set_hi = cpu_id;
        } else {
@@ -402,10 +406,9 @@ pset_init(
        pset->cpu_set_count = 0;
        pset->last_chosen = -1;
        pset->cpu_bitmask = 0;
-       pset->recommended_bitmask = ~0ULL;
-       pset->primary_map = ~0ULL;
-       pset->cpu_state_map[PROCESSOR_OFF_LINE] = ~0ULL;
-       for (uint i = PROCESSOR_SHUTDOWN; i < PROCESSOR_STATE_LEN; i++) {
+       pset->recommended_bitmask = 0;
+       pset->primary_map = 0;
+       for (uint i = 0; i < PROCESSOR_STATE_LEN; i++) {
                pset->cpu_state_map[i] = 0;
        }
        pset->pending_AST_URGENT_cpu_mask = 0;
@@ -662,8 +665,8 @@ processor_start(
         *      start up thread.
         */
        if (processor->active_thread == THREAD_NULL &&
-           processor->next_thread == THREAD_NULL) {
-               result = kernel_thread_create((thread_continue_t)processor_start_thread, NULL, MAXPRI_KERNEL, &thread);
+           processor->startup_thread == THREAD_NULL) {
+               result = kernel_thread_create(processor_start_thread, NULL, MAXPRI_KERNEL, &thread);
                if (result != KERN_SUCCESS) {
                        s = splsched();
                        pset_lock(pset);
@@ -677,7 +680,7 @@ processor_start(
                s = splsched();
                thread_lock(thread);
                thread->bound_processor = processor;
-               processor->next_thread = thread;
+               processor->startup_thread = thread;
                thread->state = TH_RUN;
                thread->last_made_runnable_time = mach_absolute_time();
                thread_unlock(thread);
@@ -1416,9 +1419,39 @@ pset_reference(
        return;
 }
 
+
+#if CONFIG_SCHED_CLUTCH
+
+/*
+ * The clutch scheduler decides the recommendation of a thread based
+ * on its thread group's properties and recommendations. The only thread-
+ * level property it examines is the thread's bucket, used to implement
+ * the policy of not running Utility & BG buckets on the P-cores. Any
+ * other policy added to this routine may need to be reflected in places
+ * such as sched_clutch_hierarchy_thread_pset() &
+ * sched_clutch_migrate_thread_group(), which rely on the recommendations
+ * being correct.
+ *
+ * Note: The current implementation does not support the TH_SFLAG_ECORE_ONLY &
+ * TH_SFLAG_PCORE_ONLY flags, which are used by debugging utilities. Similar
+ * functionality could be implemented by putting these flags on a thread
+ * group rather than on individual threads.
+ *
+ */
 pset_cluster_type_t
 recommended_pset_type(thread_t thread)
 {
        (void)thread;
        return PSET_SMP;
 }
+
+#else /* CONFIG_SCHED_CLUTCH */
+
+pset_cluster_type_t
+recommended_pset_type(thread_t thread)
+{
+       (void)thread;
+       return PSET_SMP;
+}
+
+#endif /* CONFIG_SCHED_CLUTCH */
index 223aae3b79518116b4842f157f45da7b83530547..06e54544c4d332ec32e51518e3000b43c04177d1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -83,6 +83,9 @@
 #include <mach/sfi_class.h>
 #include <kern/processor_data.h>
 #include <kern/cpu_quiesce.h>
+#include <kern/sched_clutch.h>
+#include <kern/assert.h>
+#include <machine/limits.h>
 
 /*
  *     Processor state is accessed by locking the scheduling lock
  */
 #endif
 
-#define PROCESSOR_OFF_LINE              0       /* Not available */
-#define PROCESSOR_SHUTDOWN              1       /* Going off-line */
-#define PROCESSOR_START                 2       /* Being started */
-/*                                      3         Formerly Inactive (unavailable) */
-#define PROCESSOR_IDLE                  4       /* Idle (available) */
-#define PROCESSOR_DISPATCHING   5       /* Dispatching (idle -> active) */
-#define PROCESSOR_RUNNING               6       /* Normal execution */
-#define PROCESSOR_STATE_LEN             (PROCESSOR_RUNNING+1)
+typedef enum {
+       PROCESSOR_OFF_LINE      = 0,    /* Not available */
+       PROCESSOR_SHUTDOWN      = 1,    /* Going off-line */
+       PROCESSOR_START         = 2,    /* Being started */
+       PROCESSOR_UNUSED        = 3,    /* Formerly Inactive (unavailable) */
+       PROCESSOR_IDLE          = 4,    /* Idle (available) */
+       PROCESSOR_DISPATCHING   = 5,    /* Dispatching (idle -> active) */
+       PROCESSOR_RUNNING       = 6,    /* Normal execution */
+       PROCESSOR_STATE_LEN     = (PROCESSOR_RUNNING + 1)
+} processor_state_t;
 
 typedef enum {
        PSET_SMP,
@@ -160,10 +165,10 @@ struct processor_set {
 #define SCHED_PSET_TLOCK (1)
 #if __SMP__
 #if     defined(SCHED_PSET_TLOCK)
-       /* TODO: reorder struct for temporal cache locality */
+/* TODO: reorder struct for temporal cache locality */
        __attribute__((aligned(128))) lck_ticket_t      sched_lock;
 #else /* SCHED_PSET_TLOCK*/
-       __attribute__((aligned(128))) simple_lock_data_t        sched_lock;
+       __attribute__((aligned(128))) lck_spin_t        sched_lock;     /* lock for above */
 #endif /* SCHED_PSET_TLOCK*/
 #endif
 
@@ -171,6 +176,9 @@ struct processor_set {
        struct run_queue        pset_runq;      /* runq for this processor set */
 #endif
        struct rt_queue         rt_runq;        /* realtime runq for this processor set */
+#if CONFIG_SCHED_CLUTCH
+       struct sched_clutch_root                pset_clutch_root; /* clutch hierarchy root */
+#endif /* CONFIG_SCHED_CLUTCH */
 
 #if defined(CONFIG_SCHED_TRADITIONAL)
        int                                     pset_runq_bound_count;
@@ -221,16 +229,16 @@ extern struct pset_node pset_node0;
 
 extern queue_head_t             tasks, terminated_tasks, threads, corpse_tasks; /* Terminated tasks are ONLY for stackshot */
 extern int                              tasks_count, terminated_tasks_count, threads_count;
-decl_lck_mtx_data(extern, tasks_threads_lock)
-decl_lck_mtx_data(extern, tasks_corpse_lock)
+decl_lck_mtx_data(extern, tasks_threads_lock);
+decl_lck_mtx_data(extern, tasks_corpse_lock);
 
 struct processor {
-       int                     state;                  /* See above */
+       processor_state_t       state;                  /* See above */
        bool                    is_SMT;
        bool                    is_recommended;
        struct thread           *active_thread;         /* thread running on processor */
-       struct thread           *next_thread;           /* next thread when dispatched */
        struct thread           *idle_thread;           /* this processor's idle thread. */
+       struct thread           *startup_thread;
 
        processor_set_t         processor_set;  /* assigned set */
 
@@ -255,6 +263,7 @@ struct processor {
 
        uint64_t                        deadline;               /* current deadline */
        bool                    first_timeslice;        /* has the quantum expired since context switch */
+       bool                    processor_offlined;        /* has the processor been explicitly processor_offline'ed */
        bool                    must_idle;              /* Needs to be forced idle as next selected thread is allowed on this processor */
 
        processor_t             processor_primary;      /* pointer to primary processor for
@@ -279,7 +288,7 @@ struct processor {
 };
 
 extern processor_t              processor_list;
-decl_simple_lock_data(extern, processor_list_lock)
+decl_simple_lock_data(extern, processor_list_lock);
 
 #define MAX_SCHED_CPUS          64 /* Maximum number of CPUs supported by the scheduler.  bits.h:bitmap_*() macros need to be used to support greater than 64 */
 extern processor_t              processor_array[MAX_SCHED_CPUS]; /* array indexed by cpuid */
@@ -304,20 +313,16 @@ extern lck_grp_t pset_lck_grp;
 #define pset_unlock(p)                  lck_ticket_unlock(&(p)->sched_lock)
 #define pset_assert_locked(p)           lck_ticket_assert_owned(&(p)->sched_lock)
 #else /* SCHED_PSET_TLOCK*/
-#define pset_lock(p)                    simple_lock(&(p)->sched_lock, &pset_lck_grp)
-#define pset_unlock(p)                  simple_unlock(&(p)->sched_lock)
-#define pset_lock_init(p)               simple_lock_init(&(p)->sched_lock, 0)
-#if defined(__arm__) || defined(__arm64__)
+#define pset_lock_init(p)               lck_spin_init(&(p)->sched_lock, &pset_lck_grp, NULL)
+#define pset_lock(p)                    lck_spin_lock_grp(&(p)->sched_lock, &pset_lck_grp)
+#define pset_unlock(p)                  lck_spin_unlock(&(p)->sched_lock)
 #define pset_assert_locked(p)           LCK_SPIN_ASSERT(&(p)->sched_lock, LCK_ASSERT_OWNED)
-#else /* arm || arm64 */
-/* See <rdar://problem/39630910> pset_lock() should be converted to use lck_spin_lock() instead of simple_lock() */
-#define pset_assert_locked(p)           do { (void)p; } while(0)
-#endif /* !arm && !arm64 */
-#endif /* !SCHED_PSET_TLOCK */
+#endif /* !SCHED_PSET_TLOCK */
+
 #define rt_lock_lock(p)                 simple_lock(&SCHED(rt_runq)(p)->rt_lock, &pset_lck_grp)
 #define rt_lock_unlock(p)               simple_unlock(&SCHED(rt_runq)(p)->rt_lock)
 #define rt_lock_init(p)                 simple_lock_init(&SCHED(rt_runq)(p)->rt_lock, 0)
-#else /* !SMP */
+#else
 #define pset_lock(p)                    do { (void)p; } while(0)
 #define pset_unlock(p)                  do { (void)p; } while(0)
 #define pset_lock_init(p)               do { (void)p; } while(0)
@@ -468,6 +473,8 @@ extern unsigned int             processor_count;
 extern processor_t      cpu_to_processor(int cpu);
 
 extern kern_return_t    enable_smt_processors(bool enable);
+
+extern boolean_t        processor_in_panic_context(processor_t processor);
 __END_DECLS
 
 #endif /* KERNEL_PRIVATE */
index 01a7386750d02895ab75678367b42be6ea3c492d..b658db17f1f14b1b8ab05b436342492cfdb0a84d 100644 (file)
@@ -47,3 +47,10 @@ processor_data_init(
 
        PROCESSOR_DATA(processor, debugger_state).db_current_op = DBOP_NONE;
 }
+
+boolean_t
+processor_in_panic_context(
+       processor_t             processor)
+{
+       return PROCESSOR_DATA(processor, debugger_state).db_entry_count > 0;
+}
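
A hedged example of consulting the new predicate (the call site is illustrative):

        /* Avoid non-essential work while this CPU is in the debugger/panic path. */
        if (processor_in_panic_context(current_processor())) {
                return;
        }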
index 6c2f21ec53449bb3f03c0d6ec8f01d4e1ba4b4dd..bee13ded0359200248c3e074d32578966f33281e 100644 (file)
@@ -36,6 +36,8 @@
  * #include kern/processor.h instead of this file.
  */
 
+#ifdef XNU_KERNEL_PRIVATE
+
 #ifdef MACH_KERNEL_PRIVATE
 
 #include <ipc/ipc_kmsg.h>
@@ -150,4 +152,8 @@ MACRO_END
 
 #endif /* MACH_KERNEL_PRIVATE */
 
+extern boolean_t processor_in_panic_context(processor_t processor);
+
+#endif /* XNU_KERNEL_PRIVATE */
+
 #endif /* _KERN_PROCESSOR_DATA_H_ */
index 6af62629fddcab41e67364c78915b880e364641a..68032cbe7f7414bd19c7ab467bc16faeaf6b881b 100644 (file)
@@ -71,6 +71,7 @@
 #include <kern/macro_help.h>
 
 #include <sys/cdefs.h>
+#include <string.h>
 
 __BEGIN_DECLS
 
@@ -231,14 +232,14 @@ __QUEUE_ELT_VALIDATE(queue_entry_t elt)
 {
        queue_entry_t   elt_next, elt_prev;
 
-       if (__improbable(elt == (queue_entry_t)0)) {
+       if (__improbable(elt == (queue_entry_t)NULL)) {
                panic("Invalid queue element %p", elt);
        }
 
        elt_next = elt->next;
        elt_prev = elt->prev;
 
-       if (__improbable(elt_next == (queue_entry_t)0 || elt_prev == (queue_entry_t)0)) {
+       if (__improbable(elt_next == (queue_entry_t)NULL || elt_prev == (queue_entry_t)NULL)) {
                panic("Invalid queue element pointers for %p: next %p prev %p", elt, elt_next, elt_prev);
        }
        if (__improbable(elt_next->prev != elt || elt_prev->next != elt)) {
@@ -250,8 +251,8 @@ __QUEUE_ELT_VALIDATE(queue_entry_t elt)
 static inline void
 __DEQUEUE_ELT_CLEANUP(queue_entry_t elt)
 {
-       (elt)->next = (queue_entry_t) 0;
-       (elt)->prev = (queue_entry_t) 0;
+       (elt)->next = (queue_entry_t)NULL;
+       (elt)->prev = (queue_entry_t)NULL;
 }
 #else
 #define __QUEUE_ELT_VALIDATE(elt) do { } while (0)
@@ -292,7 +293,7 @@ static __inline__ queue_entry_t
 dequeue_head(
        queue_t que)
 {
-       queue_entry_t   elt = (queue_entry_t) 0;
+       queue_entry_t   elt = (queue_entry_t)NULL;
        queue_entry_t   new_head;
 
        if (que->next != que) {
@@ -311,7 +312,7 @@ static __inline__ queue_entry_t
 dequeue_tail(
        queue_t que)
 {
-       queue_entry_t   elt = (queue_entry_t) 0;
+       queue_entry_t   elt = (queue_entry_t)NULL;
        queue_entry_t   new_tail;
 
        if (que->prev != que) {
@@ -449,8 +450,7 @@ re_queue_tail(queue_t que, queue_entry_t elt)
  *     Note:
  *             Do not use pointer types for <type>
  */
-#define qe_element(qe, type, field) \
-       ((type *)((void *)((char *)(qe) - __offsetof(type, field))))
+#define qe_element(qe, type, field) __container_of(qe, type, field)
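
The macro now delegates to __container_of. A short sketch of the pattern it implements; the struct, field, and queue names are illustrative:

        struct my_item {
                int           value;
                queue_chain_t link;             /* embedded linkage */
        };

        /* Recover the enclosing struct from an entry pulled off a queue
         * (assumes item_queue is non-empty). */
        queue_entry_t qe = dequeue_head(&item_queue);
        struct my_item *item = qe_element(qe, struct my_item, link);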
 
 /*
  *     Macro:          qe_foreach
index 11022ed4d9bb7afa9e9f74cc42cf3c9117639680..0fa8aa49117307113dc164dccfb90556118f32d7 100644 (file)
@@ -25,7 +25,6 @@
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
-#include <stdatomic.h>
 #include <mach/mach_time.h>
 #include <mach/clock_types.h>
 #include <kern/misc_protos.h>
@@ -39,6 +38,7 @@
 #include <kern/kern_types.h>
 #include <kern/thread.h>
 #include <machine/commpage.h>
+#include <machine/atomic.h>
 
 #if CONFIG_MACH_BRIDGE_SEND_TIME
 
@@ -58,9 +58,9 @@ uint32_t mach_bridge_timer_enable(uint32_t new_value, int change);
 extern void mach_bridge_send_timestamp(uint64_t);
 
 void
-mach_bridge_timer_maintenance()
+mach_bridge_timer_maintenance(void)
 {
-       if (!bt_init_flag) {
+       if (!os_atomic_load(&bt_init_flag, acquire)) {
                return;
        }
 
@@ -81,7 +81,7 @@ done:
 void
 mach_bridge_timer_init(void)
 {
-       assert(!bt_init_flag);
+       assert(!os_atomic_load(&bt_init_flag, relaxed));
        /* Initialize the lock */
        static lck_grp_t *bt_lck_grp = NULL;
        bt_lck_grp = lck_grp_alloc_init("bridgetimestamp", LCK_GRP_ATTR_NULL);
@@ -97,7 +97,7 @@ uint32_t
 mach_bridge_timer_enable(uint32_t new_value, int change)
 {
        uint32_t current_value = 0;
-       assert(bt_init_flag == 1);
+       assert(os_atomic_load(&bt_init_flag, relaxed));
        lck_spin_lock(bt_maintenance_lock);
        if (change) {
                bt_enable_flag = new_value;
@@ -119,6 +119,7 @@ mach_bridge_timer_enable(uint32_t new_value, int change)
 void mach_bridge_add_timestamp(uint64_t remote_timestamp, uint64_t local_timestamp);
 void bt_calibration_thread_start(void);
 lck_spin_t *ts_conversion_lock = NULL;
+void bt_params_add(struct bt_params *params);
 
 /* function called by sysctl */
 struct bt_params bt_params_get_latest(void);
@@ -140,7 +141,7 @@ static uint64_t received_remote_timestamp = 0;
 static struct bt_params bt_params_hist[BT_PARAMS_COUNT] = {};
 static int bt_params_idx = -1;
 
-static inline void
+void
 bt_params_add(struct bt_params *params)
 {
        lck_spin_assert(ts_conversion_lock, LCK_ASSERT_OWNED);
@@ -149,6 +150,7 @@ bt_params_add(struct bt_params *params)
        bt_params_hist[bt_params_idx] = *params;
 }
 
+#if defined(XNU_TARGET_OS_BRIDGE)
 static inline struct bt_params*
 bt_params_find(uint64_t local_ts)
 {
@@ -169,6 +171,20 @@ bt_params_find(uint64_t local_ts)
 
        return NULL;
 }
+#endif /* defined(XNU_TARGET_OS_BRIDGE) */
+
+static inline struct bt_params
+bt_params_get_latest_locked(void)
+{
+       lck_spin_assert(ts_conversion_lock, LCK_ASSERT_OWNED);
+
+       struct bt_params latest_params = {};
+       if (bt_params_idx >= 0) {
+               latest_params = bt_params_hist[bt_params_idx];
+       }
+
+       return latest_params;
+}
 
 struct bt_params
 bt_params_get_latest(void)
@@ -176,11 +192,9 @@ bt_params_get_latest(void)
        struct bt_params latest_params = {};
 
        /* Check if ts_conversion_lock has been initialized */
-       if (atomic_load(&bt_init_flag)) {
+       if (os_atomic_load(&bt_init_flag, acquire)) {
                lck_spin_lock(ts_conversion_lock);
-               if (bt_params_idx >= 0) {
-                       latest_params = bt_params_hist[bt_params_idx];
-               }
+               latest_params = bt_params_get_latest_locked();
                lck_spin_unlock(ts_conversion_lock);
        }
        return latest_params;
@@ -472,7 +486,9 @@ bt_calibration_thread_start(void)
  * the local time.
  *
  * If local_timestamp = 0, then the remote_timestamp is calculated
- * corresponding to the current mach_absolute_time. Monotonicity of
+ * corresponding to the current mach_absolute_time.
+ *
+ * If XNU_TARGET_OS_BRIDGE is defined, then monotonicity of
  * predicted time is guaranteed only for recent local_timestamp values
  * less than the current mach_absolute_time by up to 1 second.
  *
@@ -499,27 +515,31 @@ mach_bridge_remote_time(uint64_t local_timestamp)
        /* neither the send or receive side of the bridge is defined: echo the input */
        return local_timestamp;
 #else
-       if (!atomic_load(&bt_init_flag)) {
+       if (!os_atomic_load(&bt_init_flag, acquire)) {
                return 0;
        }
 
+       uint64_t remote_timestamp = 0;
+
        lck_spin_lock(ts_conversion_lock);
        uint64_t now = mach_absolute_time();
-
-       uint64_t remote_timestamp = 0;
-       uint64_t local_timestamp_ns = 0;
        if (!local_timestamp) {
                local_timestamp = now;
-       } else if (local_timestamp > now) {
-               goto out_unlock;
        }
-       absolutetime_to_nanoseconds(local_timestamp, &local_timestamp_ns);
-       struct bt_params *params = bt_params_find(local_timestamp_ns);
-       remote_timestamp = mach_bridge_compute_timestamp(local_timestamp_ns, params);
-
-out_unlock:
+#if defined(XNU_TARGET_OS_BRIDGE)
+       uint64_t local_timestamp_ns = 0;
+       if (local_timestamp < now) {
+               absolutetime_to_nanoseconds(local_timestamp, &local_timestamp_ns);
+               struct bt_params *params = bt_params_find(local_timestamp_ns);
+               remote_timestamp = mach_bridge_compute_timestamp(local_timestamp_ns, params);
+       }
+#else
+       struct bt_params params = bt_params_get_latest_locked();
+       remote_timestamp = mach_bridge_compute_timestamp(local_timestamp, &params);
+#endif /* defined(XNU_TARGET_OS_BRIDGE) */
        lck_spin_unlock(ts_conversion_lock);
        KDBG(MACHDBG_CODE(DBG_MACH_CLOCK, MACH_BRIDGE_REMOTE_TIME), local_timestamp, remote_timestamp, now);
+
        return remote_timestamp;
 #endif /* !defined(CONFIG_MACH_BRIDGE_RECV_TIME) */
 #endif /* defined(CONFIG_MACH_BRIDGE_SEND_TIME) */
index dc1d04a6154616863627f29b3724c072dadfc93e..020e845b476bb7bc9f392d1cf87e69855bfaf718 100644 (file)
@@ -55,7 +55,12 @@ mach_bridge_compute_timestamp(uint64_t local_ts_ns, struct bt_params *params)
         */
        int64_t remote_ts = 0;
        int64_t rate_prod = 0;
-       rate_prod = (int64_t)(params->rate * (double)((int64_t)local_ts_ns - (int64_t)params->base_local_ts));
+       /* When rate == 1.0, skip the double math to avoid int64_t-to-double precision loss */
+       if (params->rate != 1.0) {
+               rate_prod = (int64_t)(params->rate * (double)((int64_t)local_ts_ns - (int64_t)params->base_local_ts));
+       } else {
+               rate_prod = (int64_t)local_ts_ns - (int64_t)params->base_local_ts;
+       }
        if (os_add_overflow((int64_t)params->base_remote_ts, rate_prod, &remote_ts)) {
                return 0;
        }
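
A worked example with purely illustrative numbers, showing why the rate == 1.0 branch matters:

        struct bt_params p = {
                .base_local_ts  = 1000,
                .base_remote_ts = 5000,
                .rate           = 1.0,
        };
        /* rate == 1.0 takes the integer-only branch:
         * remote = 5000 + (2001000 - 1000) = 2005000 exactly.
         * With rate = 1.000001 the double path would compute
         * 5000 + 1.000001 * 2000000 ~= 2005002, rounding through a double. */
        uint64_t remote = mach_bridge_compute_timestamp(2001000, &p);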
diff --git a/osfmk/kern/restartable.c b/osfmk/kern/restartable.c
new file mode 100644 (file)
index 0000000..c4e0a7a
--- /dev/null
@@ -0,0 +1,450 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <mach/mach_types.h>
+#include <mach/task.h>
+
+#include <kern/ast.h>
+#include <kern/kalloc.h>
+#include <kern/kern_types.h>
+#include <kern/mach_param.h>
+#include <kern/machine.h>
+#include <kern/misc_protos.h>
+#include <kern/processor.h>
+#include <kern/queue.h>
+#include <kern/restartable.h>
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <kern/waitq.h>
+
+#include <os/hash.h>
+#include <os/refcnt.h>
+
+/**
+ * @file osfmk/kern/restartable.c
+ *
+ * @brief
+ * This module implements restartable userspace functions.
+ *
+ * @discussion
+ * task_restartable_ranges_register() allows a task to configure
+ * its restartable ranges. This may be done only once per task,
+ * and only before the task has created its second thread.
+ *
+ * task_restartable_ranges_synchronize() can later be used to trigger
+ * restarts for threads with a PC in a restartable region.
+ *
+ * It is implemented with an AST (AST_RESET_PCS) that will cause threads
+ * as they return to userspace to reset PCs in a restartable region
+ * to the recovery offset of this region.
+ *
+ * Because signal delivery would mask the proper saved PC for threads,
+ * sigreturn also forcefully sets the AST and will go through the logic
+ * every single time.
+ */
+
+typedef int (*cmpfunc_t)(const void *a, const void *b);
+extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
+
+struct restartable_ranges {
+       queue_chain_t            rr_link;
+       os_refcnt_t              rr_ref;
+       uint32_t                 rr_count;
+       uint32_t                 rr_hash;
+       task_restartable_range_t rr_ranges[];
+};
+
+#if DEBUG || DEVELOPMENT
+#define RR_HASH_SIZE   256
+#else
+// On release kernels, userspace should have shared caches and a single registration
+#define RR_HASH_SIZE    16
+#endif
+
+static queue_head_t rr_hash[RR_HASH_SIZE];
+lck_spin_t rr_spinlock;
+lck_grp_t rr_lock_grp;
+
+#define rr_lock()   lck_spin_lock_grp(&rr_spinlock, &rr_lock_grp)
+#define rr_unlock() lck_spin_unlock(&rr_spinlock);
+
+#pragma mark internals
+
+/**
+ * @function _ranges_cmp
+ *
+ * @brief
+ * Compares two ranges together.
+ */
+static int
+_ranges_cmp(const void *_r1, const void *_r2)
+{
+       const task_restartable_range_t *r1 = _r1;
+       const task_restartable_range_t *r2 = _r2;
+
+       if (r1->location != r2->location) {
+               return r1->location < r2->location ? -1 : 1;
+       }
+       if (r1->length == r2->length) {
+               return 0;
+       }
+       return r1->length < r2->length ? -1 : 1;
+}
+
+/**
+ * @function _ranges_validate
+ *
+ * @brief
+ * Validates an array of PC ranges for wraps and intersections.
+ *
+ * @discussion
+ * This sorts and modifies the input.
+ *
+ * The ranges must:
+ * - not wrap around,
+ * - have a length/recovery offset within a page of the range start
+ *
+ * @returns
+ * - KERN_SUCCESS:          ranges are valid
+ * - KERN_INVALID_ARGUMENT: ranges are invalid
+ */
+static kern_return_t
+_ranges_validate(task_t task, task_restartable_range_t *ranges, uint32_t count)
+{
+       qsort(ranges, count, sizeof(task_restartable_range_t), _ranges_cmp);
+       uint64_t limit = task_has_64Bit_data(task) ? UINT64_MAX : UINT32_MAX;
+       uint64_t end, recovery;
+
+       for (size_t i = 0; i < count; i++) {
+               if (ranges[i].length > TASK_RESTARTABLE_OFFSET_MAX ||
+                   ranges[i].recovery_offs > TASK_RESTARTABLE_OFFSET_MAX) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+               if (ranges[i].flags) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+               if (os_add_overflow(ranges[i].location, ranges[i].length, &end)) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+               if (os_add_overflow(ranges[i].location, ranges[i].recovery_offs, &recovery)) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+               if (ranges[i].location > limit || end > limit || recovery > limit) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+               if (i + 1 < count && end > ranges[i + 1].location) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+       }
+
+       return KERN_SUCCESS;
+}
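
For example (values illustrative), these two ranges overlap once sorted — the first ends at 0x1100, past the second's start — so _ranges_validate() returns KERN_INVALID_ARGUMENT:

        task_restartable_range_t bad[] = {
                { .location = 0x1000, .length = 0x100, .recovery_offs = 0x80, .flags = 0 },
                { .location = 0x1080, .length = 0x100, .recovery_offs = 0x80, .flags = 0 },
        };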
+
+/**
+ * @function _ranges_lookup
+ *
+ * @brief
+ * Lookup the left side of a range for a given PC within a set of ranges.
+ *
+ * @returns
+ * - 0: no PC range found
+ * - the left-side of the range.
+ */
+__attribute__((always_inline))
+static mach_vm_address_t
+_ranges_lookup(struct restartable_ranges *rr, mach_vm_address_t pc)
+{
+       task_restartable_range_t *ranges = rr->rr_ranges;
+       uint32_t l = 0, r = rr->rr_count;
+
+       if (pc <= ranges[0].location) {
+               return 0;
+       }
+       if (pc >= ranges[r - 1].location + ranges[r - 1].length) {
+               return 0;
+       }
+
+       while (l < r) {
+               uint32_t i = (r + l) / 2;
+               mach_vm_address_t location = ranges[i].location;
+
+               if (pc <= location) {
+                       /* if the PC is exactly at pc_start, no reset is needed */
+                       r = i;
+               } else if (location + ranges[i].length <= pc) {
+                       /* if the PC is exactly at the end, it's out of the function */
+                       l = i + 1;
+               } else {
+                       /* else it's strictly in the range, return the recovery pc */
+                       return location + ranges[i].recovery_offs;
+               }
+       }
+
+       return 0;
+}
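
Boundary behavior of the lookup, with illustrative numbers:

        /* For a single registered range {location = 0x1000, length = 0x40,
         * recovery_offs = 0x20}:
         *   _ranges_lookup(rr, 0x1010) == 0x1020   (inside: return recovery PC)
         *   _ranges_lookup(rr, 0x1000) == 0        (exactly at entry: no reset)
         *   _ranges_lookup(rr, 0x1040) == 0        (exactly at end: out of range)
         */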
+
+/**
+ * @function _restartable_ranges_dispose
+ *
+ * @brief
+ * Helper to dispose of a range that has reached a 0 refcount.
+ */
+__attribute__((noinline))
+static void
+_restartable_ranges_dispose(struct restartable_ranges *rr, bool hash_remove)
+{
+       if (hash_remove) {
+               rr_lock();
+               remqueue(&rr->rr_link);
+               rr_unlock();
+       }
+       kfree(rr, sizeof(*rr) + rr->rr_count * sizeof(task_restartable_range_t));
+}
+
+/**
+ * @function _restartable_ranges_equals
+ *
+ * @brief
+ * Helper to compare two restartable ranges.
+ */
+static bool
+_restartable_ranges_equals(
+       const struct restartable_ranges *rr1,
+       const struct restartable_ranges *rr2)
+{
+       size_t rr1_size = rr1->rr_count * sizeof(task_restartable_range_t);
+       return rr1->rr_hash == rr2->rr_hash &&
+              rr1->rr_count == rr2->rr_count &&
+              memcmp(rr1->rr_ranges, rr2->rr_ranges, rr1_size) == 0;
+}
+
+/**
+ * @function _restartable_ranges_create
+ *
+ * @brief
+ * Helper to create a uniqued restartable range.
+ *
+ * @returns
+ * - KERN_SUCCESS
+ * - KERN_INVALID_ARGUMENT: the validation of the new ranges failed.
+ * - KERN_RESOURCE_SHORTAGE: too many ranges, out of memory
+ */
+static kern_return_t
+_restartable_ranges_create(task_t task, task_restartable_range_t *ranges,
+    uint32_t count, struct restartable_ranges **rr_storage)
+{
+       struct restartable_ranges *rr, *rr_found, *rr_base;
+       queue_head_t *head;
+       uint32_t base_count, total_count;
+       size_t base_size, size;
+       kern_return_t kr;
+
+       rr_base = *rr_storage;
+       base_count = rr_base ? rr_base->rr_count : 0;
+       base_size = sizeof(task_restartable_range_t) * base_count;
+       size = sizeof(task_restartable_range_t) * count;
+
+       if (os_add_overflow(base_count, count, &total_count)) {
+               return KERN_INVALID_ARGUMENT;
+       }
+       if (total_count > 1024) {
+               return KERN_RESOURCE_SHORTAGE;
+       }
+
+       rr = kalloc(sizeof(*rr) + base_size + size);
+       if (rr == NULL) {
+               return KERN_RESOURCE_SHORTAGE;
+       }
+
+       queue_chain_init(rr->rr_link);
+       os_ref_init(&rr->rr_ref, NULL);
+       rr->rr_count = total_count;
+       if (base_size) {
+               memcpy(rr->rr_ranges, rr_base->rr_ranges, base_size);
+       }
+       memcpy(rr->rr_ranges + base_count, ranges, size);
+       kr = _ranges_validate(task, rr->rr_ranges, total_count);
+       if (kr) {
+               _restartable_ranges_dispose(rr, false);
+               return kr;
+       }
+       rr->rr_hash = os_hash_jenkins(rr->rr_ranges,
+           rr->rr_count * sizeof(task_restartable_range_t));
+
+       head = &rr_hash[rr->rr_hash % RR_HASH_SIZE];
+
+       rr_lock();
+       queue_iterate(head, rr_found, struct restartable_ranges *, rr_link) {
+               if (_restartable_ranges_equals(rr, rr_found) &&
+                   os_ref_retain_try(&rr_found->rr_ref)) {
+                       goto found;
+               }
+       }
+
+       enqueue_tail(head, &rr->rr_link);
+       rr_found = rr;
+
+found:
+       if (rr_base && os_ref_release_relaxed(&rr_base->rr_ref) == 0) {
+               remqueue(&rr_base->rr_link);
+       } else {
+               rr_base = NULL;
+       }
+       rr_unlock();
+
+       *rr_storage = rr_found;
+
+       if (rr_found != rr) {
+               _restartable_ranges_dispose(rr, false);
+       }
+       if (rr_base) {
+               _restartable_ranges_dispose(rr_base, false);
+       }
+       return KERN_SUCCESS;
+}
+
+#pragma mark extern interfaces
+
+void
+restartable_ranges_release(struct restartable_ranges *rr)
+{
+       if (os_ref_release_relaxed(&rr->rr_ref) == 0) {
+               _restartable_ranges_dispose(rr, true);
+       }
+}
+
+void
+thread_reset_pcs_ast(thread_t thread)
+{
+       task_t task = thread->task;
+       struct restartable_ranges *rr;
+       mach_vm_address_t pc;
+
+       /*
+        * Because restartable_ranges are set while the task has only one thread
+        * and cannot be mutated afterwards, no lock is required to read the field.
+        */
+       rr = task->restartable_ranges;
+       if (rr) {
+               /* pairs with the barrier in task_restartable_ranges_synchronize() */
+               os_atomic_thread_fence(acquire);
+
+               pc = _ranges_lookup(rr, machine_thread_pc(thread));
+
+               if (pc) {
+                       machine_thread_reset_pc(thread, pc);
+               }
+       }
+}
+
+void
+restartable_init(void)
+{
+       lck_grp_init(&rr_lock_grp, "restartable ranges", LCK_GRP_ATTR_NULL);
+       lck_spin_init(&rr_spinlock, &rr_lock_grp, LCK_ATTR_NULL);
+       for (size_t i = 0; i < RR_HASH_SIZE; i++) {
+               queue_head_init(rr_hash[i]);
+       }
+}
+
+#pragma mark MiG interfaces
+
+kern_return_t
+task_restartable_ranges_register(
+       task_t                    task,
+       task_restartable_range_t *ranges,
+       mach_msg_type_number_t    count)
+{
+       kern_return_t kr;
+       thread_t th;
+
+       if (task != current_task()) {
+               return KERN_FAILURE;
+       }
+
+       kr = _ranges_validate(task, ranges, count);
+
+       if (kr == KERN_SUCCESS) {
+               task_lock(task);
+
+               queue_iterate(&task->threads, th, thread_t, task_threads) {
+                       if (th != current_thread()) {
+                               kr = KERN_NOT_SUPPORTED;
+                               break;
+                       }
+               }
+#if !DEBUG && !DEVELOPMENT
+               /*
+                * For security reasons, on release kernels, only allow for this to be
+                * configured once.
+                *
+                * But to be able to test the feature we need to relax this for
+                * dev kernels.
+                */
+               if (task->restartable_ranges) {
+                       kr = KERN_NOT_SUPPORTED;
+               }
+#endif
+               if (kr == KERN_SUCCESS) {
+                       kr = _restartable_ranges_create(task, ranges, count,
+                           &task->restartable_ranges);
+               }
+               task_unlock(task);
+       }
+
+       return kr;
+}
+
+kern_return_t
+task_restartable_ranges_synchronize(task_t task)
+{
+       thread_t thread;
+
+       if (task != current_task()) {
+               return KERN_FAILURE;
+       }
+
+       /* pairs with the barrier in thread_reset_pcs_ast() */
+       os_atomic_thread_fence(release);
+
+       task_lock(task);
+
+       if (task->restartable_ranges) {
+               queue_iterate(&task->threads, thread, thread_t, task_threads) {
+                       if (thread != current_thread()) {
+                               thread_mtx_lock(thread);
+                               act_set_ast_reset_pcs(thread);
+                               thread_mtx_unlock(thread);
+                       }
+               }
+       }
+
+       task_unlock(task);
+
+       return KERN_SUCCESS;
+}
diff --git a/osfmk/kern/restartable.h b/osfmk/kern/restartable.h
new file mode 100644 (file)
index 0000000..af6ba4d
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_RESTARTABLE_H_
+#define _KERN_RESTARTABLE_H_
+
+#include <sys/cdefs.h>
+#include <mach/message.h>
+#include <mach/task.h>
+
+__BEGIN_DECLS
+
+/*!
+ * @typedef task_restartable_range_t
+ *
+ * @brief
+ * Describes a userspace recoverable range.
+ *
+ * @field location
+ * The pointer to the beginning of a restartable section.
+ *
+ * @field length
+ * The length of the critical section anchored at location.
+ *
+ * @field recovery_offs
+ * The offset from the initial location that should be used for the recovery
+ * codepath.
+ *
+ * @field flags
+ * Currently unused, pass 0.
+ */
+typedef struct {
+       mach_vm_address_t location;
+       unsigned short    length;
+       unsigned short    recovery_offs;
+       unsigned int      flags;
+} task_restartable_range_t;
+
+typedef task_restartable_range_t *task_restartable_range_array_t;
+
+/*!
+ * @function task_restartable_ranges_register
+ *
+ * @brief
+ * Register a set of restartable ranges for the current task.
+ *
+ * @param task
+ * The task to operate on
+ *
+ * @param ranges
+ * An array of address ranges for which PC resets are performed.
+ *
+ * @param count
+ * The number of address ranges.
+ *
+ * @returns
+ * - KERN_SUCCESS on success
+ * - KERN_FAILURE if the task isn't the current one
+ * - KERN_INVALID_ARGUMENT for various invalid inputs
+ * - KERN_NOT_SUPPORTED the request is not supported (a second registration on
+ *   release kernels, or registration after the task has become multithreaded)
+ * - KERN_RESOURCE_SHORTAGE if not enough memory
+ */
+extern kern_return_t task_restartable_ranges_register(
+       task_t                         task,
+       task_restartable_range_array_t ranges,
+       mach_msg_type_number_t         count);
+
+/*!
+ * @function task_restartable_ranges_synchronize
+ *
+ * @brief
+ * Require all threads in the task to reset their PC
+ * if it falls within a restartable range.
+ *
+ * @param task
+ * The task to operate on (needs to be current task)
+ *
+ * @returns
+ * - KERN_SUCCESS
+ * - KERN_FAILURE if the task isn't the current one
+ */
+extern kern_return_t task_restartable_ranges_synchronize(task_t task);
+
+/*!
+ * @const TASK_RESTARTABLE_OFFSET_MAX
+ * The maximum value length / recovery_offs can have.
+ */
+#define TASK_RESTARTABLE_OFFSET_MAX  4096u
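
A hypothetical userspace sketch of the two calls together; the critical_* labels are assumed to be emitted by the client around its restartable section and are not part of this interface:

        extern char critical_begin[], critical_restart[], critical_end[];

        task_restartable_range_t range = {
                .location      = (mach_vm_address_t)critical_begin,
                .length        = (unsigned short)(critical_end - critical_begin),
                .recovery_offs = (unsigned short)(critical_restart - critical_begin),
                .flags         = 0,
        };
        kern_return_t kr = task_restartable_ranges_register(mach_task_self(), &range, 1);
        /* ... later, kick any thread currently inside the range onto the
         * recovery path: */
        kr = task_restartable_ranges_synchronize(mach_task_self());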
+
+#ifdef KERNEL_PRIVATE
+
+struct restartable_ranges;
+
+/**
+ * @function restartable_init
+ *
+ * @brief
+ * Initializes the restartable module.
+ */
+extern void restartable_init(void);
+
+/**
+ * @function restartable_ranges_release
+ *
+ * @brief
+ * Release a reference on a restartable range.
+ */
+extern void restartable_ranges_release(struct restartable_ranges *ranges);
+
+/**
+ * @function thread_reset_pcs_ast
+ *
+ * @brief
+ * Perform the work at the AST boundary to reset thread PCs.
+ */
+extern void thread_reset_pcs_ast(struct thread *thread);
+
+#endif // KERNEL_PRIVATE
+
+__END_DECLS
+
+#endif  /* _KERN_RESTARTABLE_H_ */
index f3c1c88a290d7212e0d0c183027548e746cccbda..be8727dd060091cb8dc4f6419c0309748c718972 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <mach/policy.h>
 #include <kern/kern_types.h>
 #include <kern/smp.h>
-#include <kern/queue.h>
+#include <kern/circle_queue.h>
 #include <kern/macro_help.h>
 #include <kern/timer_call.h>
 #include <kern/ast.h>
 #include <kern/kalloc.h>
 #include <kern/bits.h>
 
-#define NRQS            128                             /* 128 levels per run queue */
+#define NRQS_MAX        (128)                           /* maximum number of priority levels */
 
-#define MAXPRI          (NRQS-1)
+#define MAXPRI          (NRQS_MAX-1)
 #define MINPRI          0                               /* lowest legal priority schedulable */
 #define IDLEPRI         MINPRI                          /* idle thread priority */
 #define NOPRI           -1
  */
 
 #define BASEPRI_RTQUEUES        (BASEPRI_REALTIME + 1)                          /* 97 */
-#define BASEPRI_REALTIME        (MAXPRI - (NRQS / 4) + 1)                       /* 96 */
+#define BASEPRI_REALTIME        (MAXPRI - (NRQS_MAX / 4) + 1)                   /* 96 */
 
 #define MAXPRI_KERNEL           (BASEPRI_REALTIME - 1)                          /* 95 */
 #define BASEPRI_PREEMPT_HIGH    (BASEPRI_PREEMPT + 1)                           /* 93 */
 #define BASEPRI_VM              (BASEPRI_PREEMPT - 1)                           /* 91 */
 
 #define BASEPRI_KERNEL          (MINPRI_KERNEL + 1)                             /* 81 */
-#define MINPRI_KERNEL           (MAXPRI_KERNEL - (NRQS / 8) + 1)                /* 80 */
+#define MINPRI_KERNEL           (MAXPRI_KERNEL - (NRQS_MAX / 8) + 1)            /* 80 */
 
 #define MAXPRI_RESERVED         (MINPRI_KERNEL - 1)                             /* 79 */
 #define BASEPRI_GRAPHICS        (MAXPRI_RESERVED - 3)                           /* 76 */
-#define MINPRI_RESERVED         (MAXPRI_RESERVED - (NRQS / 8) + 1)              /* 64 */
+#define MINPRI_RESERVED         (MAXPRI_RESERVED - (NRQS_MAX / 8) + 1)          /* 64 */
 
 #define MAXPRI_USER             (MINPRI_RESERVED - 1)                           /* 63 */
 #define BASEPRI_CONTROL         (BASEPRI_DEFAULT + 17)                          /* 48 */
 #define BASEPRI_FOREGROUND      (BASEPRI_DEFAULT + 16)                          /* 47 */
 #define BASEPRI_BACKGROUND      (BASEPRI_DEFAULT + 15)                          /* 46 */
 #define BASEPRI_USER_INITIATED  (BASEPRI_DEFAULT +  6)                          /* 37 */
-#define BASEPRI_DEFAULT         (MAXPRI_USER - (NRQS / 4))                      /* 31 */
+#define BASEPRI_DEFAULT         (MAXPRI_USER - (NRQS_MAX / 4))                  /* 31 */
 #define MAXPRI_SUPPRESSED       (BASEPRI_DEFAULT - 3)                           /* 28 */
 #define BASEPRI_UTILITY         (BASEPRI_DEFAULT - 11)                          /* 20 */
 #define MAXPRI_THROTTLE         (MINPRI + 4)                                    /*  4 */
 #define MINPRI_EXEC             (BASEPRI_DEFAULT)       /* floor when in exec state */
 #define MINPRI_WAITQ            (BASEPRI_DEFAULT)       /* floor when in waitq handover state */
 
+#define NRQS                    (BASEPRI_REALTIME)      /* Non-realtime levels for runqs */
+
+/* Ensure that NRQS is large enough to represent all non-realtime threads; even promoted ones */
+_Static_assert((NRQS == (MAXPRI_PROMOTE + 1)), "Runqueues are too small to hold all non-realtime threads");
 
 /* Type used for thread->sched_mode and saved_mode */
 typedef enum {
@@ -183,14 +187,25 @@ typedef enum {
        TH_MODE_TIMESHARE,                                      /* use timesharing algorithm */
 } sched_mode_t;
 
+/*
+ * Since the clutch scheduler organizes threads based on the thread group
+ * and the scheduling bucket, it is important not to mix threads from multiple
+ * priority bands into the same bucket. To achieve that, the clutch scheduler
+ * effectively uses one scheduling bucket per QoS level.
+ */
+
 /* Buckets used for load calculation */
 typedef enum {
-       TH_BUCKET_RUN = 0,      /* All runnable threads */
-       TH_BUCKET_FIXPRI,       /* Fixed-priority */
-       TH_BUCKET_SHARE_FG,     /* Timeshare thread above BASEPRI_DEFAULT */
-       TH_BUCKET_SHARE_DF,     /* Timeshare thread between BASEPRI_DEFAULT and BASEPRI_UTILITY */
-       TH_BUCKET_SHARE_UT,     /* Timeshare thread between BASEPRI_UTILITY and MAXPRI_THROTTLE */
-       TH_BUCKET_SHARE_BG,     /* Timeshare thread between MAXPRI_THROTTLE and MINPRI */
+       TH_BUCKET_FIXPRI = 0,                   /* Fixed-priority */
+       TH_BUCKET_SHARE_FG,                     /* Timeshare thread above BASEPRI_DEFAULT */
+#if CONFIG_SCHED_CLUTCH
+       TH_BUCKET_SHARE_IN,                     /* Timeshare thread between BASEPRI_USER_INITIATED and BASEPRI_DEFAULT */
+#endif /* CONFIG_SCHED_CLUTCH */
+       TH_BUCKET_SHARE_DF,                     /* Timeshare thread between BASEPRI_DEFAULT and BASEPRI_UTILITY */
+       TH_BUCKET_SHARE_UT,                     /* Timeshare thread between BASEPRI_UTILITY and MAXPRI_THROTTLE */
+       TH_BUCKET_SHARE_BG,                     /* Timeshare thread between MAXPRI_THROTTLE and MINPRI */
+       TH_BUCKET_RUN,                          /* All runnable threads */
+       TH_BUCKET_SCHED_MAX = TH_BUCKET_RUN,    /* Maximum schedulable buckets */
        TH_BUCKET_MAX,
 } sched_bucket_t;
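
A hedged sketch of how a timeshare thread's base priority could map onto the clutch bucket layout, with boundaries inferred from the comments above (the scheduler's real mapping lives elsewhere in the clutch code):

        #if CONFIG_SCHED_CLUTCH
        static sched_bucket_t
        bucket_for_base_pri(int pri)    /* illustrative only */
        {
                if (pri > BASEPRI_USER_INITIATED) {
                        return TH_BUCKET_SHARE_FG;
                }
                if (pri > BASEPRI_DEFAULT) {
                        return TH_BUCKET_SHARE_IN;
                }
                if (pri > BASEPRI_UTILITY) {
                        return TH_BUCKET_SHARE_DF;
                }
                if (pri > MAXPRI_THROTTLE) {
                        return TH_BUCKET_SHARE_UT;
                }
                return TH_BUCKET_SHARE_BG;
        }
        #endif /* CONFIG_SCHED_CLUTCH */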
 
@@ -200,18 +215,18 @@ typedef enum {
 #define invalid_pri(pri) ((pri) < MINPRI || (pri) > MAXPRI)
 
 struct runq_stats {
-       uint64_t                                count_sum;
-       uint64_t                                last_change_timestamp;
+       uint64_t                count_sum;
+       uint64_t                last_change_timestamp;
 };
 
 #if defined(CONFIG_SCHED_TIMESHARE_CORE) || defined(CONFIG_SCHED_PROTO)
 
 struct run_queue {
-       int                                     highq;                          /* highest runnable queue */
-       bitmap_t                                bitmap[BITMAP_LEN(NRQS)];       /* run queue bitmap array */
-       int                                     count;                          /* # of threads total */
-       int                                     urgency;                        /* level of preemption urgency */
-       queue_head_t            queues[NRQS];           /* one for each priority */
+       int                     highq;                          /* highest runnable queue */
+       bitmap_t                bitmap[BITMAP_LEN(NRQS)];       /* run queue bitmap array */
+       int                     count;                          /* # of threads total */
+       int                     urgency;                        /* level of preemption urgency */
+       circle_queue_head_t     queues[NRQS];           /* one for each priority */
 
        struct runq_stats       runq_stats;
 };
@@ -236,7 +251,7 @@ struct rt_queue {
        _Atomic int             count;                          /* # of threads total */
        queue_head_t            queue;                          /* all runnable RT threads */
 #if __SMP__
-       decl_simple_lock_data(, rt_lock)
+       decl_simple_lock_data(, rt_lock);
 #endif
        struct runq_stats       runq_stats;
 };
@@ -393,10 +408,17 @@ extern uint32_t         avenrun[3], mach_factor[3];
 extern uint64_t         max_unsafe_computation;
 extern uint64_t         max_poll_computation;
 
-extern volatile uint32_t sched_run_buckets[TH_BUCKET_MAX];
+extern uint32_t         sched_run_buckets[TH_BUCKET_MAX];
 
 extern uint32_t sched_run_incr(thread_t thread);
 extern uint32_t sched_run_decr(thread_t thread);
+extern void sched_update_thread_bucket(thread_t thread);
+
+#define SCHED_DECAY_TICKS       32
+struct shift_data {
+       int     shift1;
+       int     shift2;
+};
 
 /*
  *     thread_timer_delta macro takes care of both thread timers.
index 709803b9e9a55d0d623437044e8077691ad86aa4..a6a855c9ff1114d95a2c0f07dd60ae50e254f50e 100644 (file)
@@ -174,7 +174,8 @@ static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul <<
 #define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
 
 /*
- * Routine to capture the latest runnable counts and update sched_load */
+ * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
+ */
 void
 compute_sched_load(void)
 {
@@ -187,12 +188,12 @@ compute_sched_load(void)
        uint32_t ncpus = processor_avail_count;
        uint32_t load_now[TH_BUCKET_MAX];
 
-       load_now[TH_BUCKET_RUN]      = sched_run_buckets[TH_BUCKET_RUN];
-       load_now[TH_BUCKET_FIXPRI]   = sched_run_buckets[TH_BUCKET_FIXPRI];
-       load_now[TH_BUCKET_SHARE_FG] = sched_run_buckets[TH_BUCKET_SHARE_FG];
-       load_now[TH_BUCKET_SHARE_DF] = sched_run_buckets[TH_BUCKET_SHARE_DF];
-       load_now[TH_BUCKET_SHARE_UT] = sched_run_buckets[TH_BUCKET_SHARE_UT];
-       load_now[TH_BUCKET_SHARE_BG] = sched_run_buckets[TH_BUCKET_SHARE_BG];
+       load_now[TH_BUCKET_RUN]      = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
+       load_now[TH_BUCKET_FIXPRI]   = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
+       load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
+       load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
+       load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
+       load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);
 
        assert(load_now[TH_BUCKET_RUN] >= 0);
        assert(load_now[TH_BUCKET_FIXPRI] >= 0);
@@ -285,7 +286,7 @@ compute_sched_load(void)
 void
 compute_averages(uint64_t stdelta)
 {
-       uint32_t nthreads = sched_run_buckets[TH_BUCKET_RUN] - 1;
+       uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
        uint32_t ncpus = processor_avail_count;
 
        /* Update the global pri_shifts based on the latest values */
diff --git a/osfmk/kern/sched_clutch.c b/osfmk/kern/sched_clutch.c
new file mode 100644 (file)
index 0000000..7a246a0
--- /dev/null
@@ -0,0 +1,2174 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <mach/mach_types.h>
+#include <mach/machine.h>
+#include <machine/machine_routines.h>
+#include <machine/sched_param.h>
+#include <machine/machine_cpu.h>
+#include <kern/kern_types.h>
+#include <kern/debug.h>
+#include <kern/machine.h>
+#include <kern/misc_protos.h>
+#include <kern/processor.h>
+#include <kern/queue.h>
+#include <kern/sched.h>
+#include <kern/sched_prim.h>
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <kern/sched_clutch.h>
+#include <machine/atomic.h>
+#include <kern/sched_clutch.h>
+#include <sys/kdebug.h>
+
+
+#if CONFIG_SCHED_CLUTCH
+
+/* Forward declarations of static routines */
+
+/* Root level hierarchy management */
+static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
+static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t);
+static void sched_clutch_root_pri_update(sched_clutch_root_t);
+static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t);
+static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
+static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
+
+/* Root bucket level hierarchy management */
+static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
+static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);
+static int sched_clutch_root_bucket_pri_compare(sched_clutch_root_bucket_t, sched_clutch_root_bucket_t);
+
+/* Clutch bucket level hierarchy management */
+static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t);
+static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t);
+static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
+static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
+static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
+
+static void sched_clutch_bucket_cpu_usage_update(sched_clutch_bucket_t, uint64_t);
+static void sched_clutch_bucket_cpu_blocked_update(sched_clutch_bucket_t, uint64_t);
+static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
+static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
+
+/* Clutch timeshare properties updates */
+static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
+static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
+static void sched_clutch_bucket_cpu_adjust(sched_clutch_bucket_t);
+static void sched_clutch_bucket_timeshare_update(sched_clutch_bucket_t);
+static boolean_t sched_thread_sched_pri_promoted(thread_t);
+/* Clutch membership management */
+static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
+static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t);
+static thread_t sched_clutch_thread_highest(sched_clutch_root_t);
+
+/* Clutch properties updates */
+static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
+static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
+static int sched_clutch_root_priority(sched_clutch_root_t);
+
+
+/* Helper debugging routines */
+static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
+
+
+
+/*
+ * Global priority queue comparator routine for root buckets. The
+ * routine implements the priority queue as a minimum deadline queue
+ * to achieve EDF scheduling.
+ */
+priority_queue_compare_fn_t sched_clutch_root_bucket_compare;
+
+
+/*
+ * Special markers for buckets that have invalid WCELs/quantums etc.
+ */
+#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
+#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
+
+/*
+ * Root level bucket WCELs
+ *
+ * The root level bucket selection algorithm is an Earliest Deadline
+ * First (EDF) algorithm where the deadline for a bucket is defined
+ * by its worst-case execution latency (WCEL) and the timestamp at
+ * which the bucket was made runnable.
+ */
+static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
+       SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
+       0,                                              /* FG */
+       37500,                                          /* IN (37.5ms) */
+       75000,                                          /* DF (75ms) */
+       150000,                                         /* UT (150ms) */
+       250000                                          /* BG (250ms) */
+};
+static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
+
+/*
+ * Root level bucket warp
+ *
+ * Each root level bucket has a warp value associated with it as well.
+ * The warp value allows the root bucket to effectively warp ahead of
+ * lower priority buckets for a limited time even if it has a later
+ * deadline. The warping behavior provides extra (but limited)
+ * opportunity for high priority buckets to remain responsive.
+ */
+
+/* Special warp deadline value to indicate that the bucket has not used any warp yet */
+#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED    (SCHED_CLUTCH_INVALID_TIME_64)
+
+/* Warp window durations for various tiers */
+static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
+       SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
+       8000,                                           /* FG (8ms)*/
+       4000,                                           /* IN (4ms) */
+       2000,                                           /* DF (2ms) */
+       1000,                                           /* UT (1ms) */
+       0                                               /* BG (0ms) */
+};
+static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
+
+/*
+ * Thread level quantum
+ *
+ * The algorithm defines quantums for threads at various buckets. This
+ * (combined with the root level bucket quantums) restricts how much
+ * the lower priority levels can preempt the higher priority threads.
+ */
+static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
+       10000,                                          /* FIXPRI (10ms) */
+       10000,                                          /* FG (10ms) */
+       8000,                                           /* IN (8ms) */
+       6000,                                           /* DF (6ms) */
+       4000,                                           /* UT (4ms) */
+       2000                                            /* BG (2ms) */
+};
+static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
+
+enum sched_clutch_state {
+       SCHED_CLUTCH_STATE_EMPTY = 0,
+       SCHED_CLUTCH_STATE_RUNNABLE,
+};
+
+/*
+ * sched_clutch_us_to_abstime()
+ *
+ * Initializer for converting all durations in usec to abstime
+ */
+static void
+sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
+{
+       for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
+               if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
+                       abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
+               } else {
+                       clock_interval_to_absolutetime_interval(us_vals[i],
+                           NSEC_PER_USEC, &abstime_vals[i]);
+               }
+       }
+}
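As a usage sketch, each tuning table above is converted to abstime exactly once at scheduler-init time. The wrapper below is hypothetical; the actual caller (presumably the clutch scheduler's init path) is outside this excerpt.

    /* Hypothetical init-time conversion of the tuning tables to abstime. */
    static void
    example_clutch_tunables_init(void)
    {
            sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
            sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
            sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
    }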
+
+#if DEVELOPMENT || DEBUG
+
+/*
+ * sched_clutch_hierarchy_locked_assert()
+ *
+ * Debugging helper routine. Asserts that the hierarchy is locked. The locking
+ * for the hierarchy depends on where the hierarchy is hooked. The current
+ * implementation hooks the hierarchy at the pset, so the hierarchy is locked
+ * using the pset lock.
+ */
+static inline void
+sched_clutch_hierarchy_locked_assert(
+       sched_clutch_root_t root_clutch)
+{
+       pset_assert_locked(root_clutch->scr_pset);
+}
+
+#else /* DEVELOPMENT || DEBUG */
+
+static inline void
+sched_clutch_hierarchy_locked_assert(
+       __unused sched_clutch_root_t root_clutch)
+{
+}
+
+#endif /* DEVELOPMENT || DEBUG */
+
+/*
+ * sched_clutch_thr_count_inc()
+ *
+ * Increment thread count at a hierarchy level with overflow checks.
+ */
+static void
+sched_clutch_thr_count_inc(
+       uint16_t *thr_count)
+{
+       if (__improbable(os_inc_overflow(thr_count))) {
+               panic("sched_clutch thread count overflowed!");
+       }
+}
+
+/*
+ * sched_clutch_thr_count_dec()
+ *
+ * Decrement thread count at a hierarchy level with underflow checks.
+ */
+static void
+sched_clutch_thr_count_dec(
+       uint16_t *thr_count)
+{
+       if (__improbable(os_dec_overflow(thr_count))) {
+               panic("sched_clutch thread count underflowed!");
+       }
+}
+
+
+/*
+ * sched_clutch_root_init()
+ *
+ * Routine to initialize the scheduler hierarchy root.
+ */
+static void
+sched_clutch_root_init(
+       sched_clutch_root_t root_clutch,
+       processor_set_t pset)
+{
+       root_clutch->scr_thr_count = 0;
+       root_clutch->scr_priority = NOPRI;
+       root_clutch->scr_urgency = 0;
+       root_clutch->scr_pset = pset;
+
+       /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
+       queue_init(&root_clutch->scr_clutch_buckets);
+
+       /* Initialize the queue which maintains all runnable foreign clutch buckets */
+       queue_init(&root_clutch->scr_foreign_buckets);
+
+       /* Initialize the bitmap and priority queue of runnable root buckets */
+       sched_clutch_root_bucket_compare = priority_heap_make_comparator(a, b, struct sched_clutch_root_bucket, scrb_pqlink, {
+               return (a->scrb_deadline < b->scrb_deadline) ? 1 : ((a->scrb_deadline == b->scrb_deadline) ? 0 : -1);
+       });
+       priority_queue_init(&root_clutch->scr_root_buckets, PRIORITY_QUEUE_GENERIC_KEY | PRIORITY_QUEUE_MIN_HEAP);
+       bitmap_zero(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX);
+       bitmap_zero(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX);
+
+       /* Initialize all the root buckets */
+       for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
+               sched_clutch_root_bucket_init(&root_clutch->scr_buckets[i], i);
+       }
+}
+
+/*
+ * sched_clutch_root_bucket_init()
+ *
+ * Routine to initialize root buckets.
+ */
+static void
+sched_clutch_root_bucket_init(
+       sched_clutch_root_bucket_t root_bucket,
+       sched_bucket_t bucket)
+{
+       root_bucket->scrb_bucket = bucket;
+       priority_queue_init(&root_bucket->scrb_clutch_buckets, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP);
+       priority_queue_entry_init(&root_bucket->scrb_pqlink);
+       root_bucket->scrb_deadline = SCHED_CLUTCH_INVALID_TIME_64;
+       root_bucket->scrb_warped_deadline = 0;
+       root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
+}
+
+/*
+ * sched_clutch_root_bucket_pri_compare()
+ *
+ * Routine to compare root buckets based on the highest runnable clutch
+ * bucket priorities in the root buckets.
+ */
+static int
+sched_clutch_root_bucket_pri_compare(
+       sched_clutch_root_bucket_t a,
+       sched_clutch_root_bucket_t b)
+{
+       sched_clutch_bucket_t a_highest = sched_clutch_root_bucket_highest_clutch_bucket(a);
+       sched_clutch_bucket_t b_highest = sched_clutch_root_bucket_highest_clutch_bucket(b);
+       return (a_highest->scb_priority > b_highest->scb_priority) ?
+              1 : ((a_highest->scb_priority == b_highest->scb_priority) ? 0 : -1);
+}
+
+/*
+ * sched_clutch_root_select_aboveui()
+ *
+ * Special case scheduling for Above UI bucket.
+ *
+ * AboveUI threads are typically system critical threads that need low latency
+ * which is why they are handled specially.
+ *
+ * Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is
+ * important to maintain some native priority order between those buckets. The policy
+ * implemented here is to compare the highest clutch buckets of both buckets; if the
+ * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
+ * deadline based scheduling, which should pick up the timeshare buckets.
+ *
+ * The implementation allows extremely low latency CPU access for Above UI threads
+ * while supporting the use case of high priority timeshare threads contending with
+ * lower priority fixed priority threads.
+ */
+static boolean_t
+sched_clutch_root_select_aboveui(
+       sched_clutch_root_t root_clutch)
+{
+       if (bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_FIXPRI)) {
+               sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
+               sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_buckets[TH_BUCKET_SHARE_FG];
+
+               if (!bitmap_test(root_clutch->scr_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
+                       /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
+                       return true;
+               }
+               if (sched_clutch_root_bucket_pri_compare(root_bucket_aboveui, root_bucket_sharefg) >= 0) {
+                       /* If the aboveUI bucket has a higher native clutch bucket priority, schedule it */
+                       return true;
+               }
+       }
+       return false;
+}
+
+
+/*
+ * sched_clutch_root_highest_root_bucket()
+ *
+ * Main routine to find the highest runnable root level bucket.
+ * This routine is called from performance-sensitive contexts, so it is
+ * crucial to keep it O(1).
+ */
+static sched_clutch_root_bucket_t
+sched_clutch_root_highest_root_bucket(
+       sched_clutch_root_t root_clutch,
+       uint64_t timestamp)
+{
+       sched_clutch_hierarchy_locked_assert(root_clutch);
+       if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
+               return NULL;
+       }
+
+       if (sched_clutch_root_select_aboveui(root_clutch)) {
+               return &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
+       }
+
+       /*
+        * Above UI bucket is not runnable or has a low priority clutch bucket; use the earliest deadline model
+        * to schedule threads. The idea is that as the timeshare buckets use CPU, they will drop their
+        * interactivity score and allow low priority AboveUI clutch buckets to be scheduled.
+        */
+
+       /* Find the earliest deadline bucket */
+       sched_clutch_root_bucket_t edf_bucket = priority_queue_min(&root_clutch->scr_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
+
+       sched_clutch_root_bucket_t warp_bucket = NULL;
+       int warp_bucket_index = -1;
+evaluate_warp_buckets:
+       /* Check if any higher runnable buckets have warp available */
+       warp_bucket_index = bitmap_lsb_first(root_clutch->scr_warp_available, TH_BUCKET_SCHED_MAX);
+
+       if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
+               /* No higher buckets have warp available; choose the edf bucket and replenish its warp */
+               sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
+               edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
+               edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
+               bitmap_set(root_clutch->scr_warp_available, edf_bucket->scrb_bucket);
+               return edf_bucket;
+       }
+
+       /*
+        * Looks like there is a root bucket which is higher in the natural priority
+        * order than edf_bucket and might have some warp remaining.
+        */
+       warp_bucket = &root_clutch->scr_buckets[warp_bucket_index];
+       if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
+               /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
+               warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
+               sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
+               return warp_bucket;
+       }
+       if (warp_bucket->scrb_warped_deadline > timestamp) {
+               /* Root bucket already has a warp window open with some warp remaining */
+               sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
+               return warp_bucket;
+       }
+
+       /*
+        * For this bucket, the warp window was opened sometime in the past but
+        * has now expired. Mark the bucket as not available for warp anymore and
+        * re-run the warp bucket selection logic.
+        */
+       warp_bucket->scrb_warp_remaining = 0;
+       bitmap_clear(root_clutch->scr_warp_available, warp_bucket->scrb_bucket);
+       goto evaluate_warp_buckets;
+}
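A worked example of the selection logic, with assumed numbers: suppose UT became runnable at time t (deadline t + 150ms) and DF becomes runnable at t + 100ms (deadline t + 175ms). Pure EDF would pick UT, but DF is higher in the natural priority order and still holds its 2ms warp, so DF is returned and its warp window is opened (scrb_warped_deadline = timestamp + 2ms). Once that window expires, DF's warp is exhausted, its bit in scr_warp_available is cleared, and the selection loop re-runs, this time falling through to the EDF choice.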
+
+/*
+ * sched_clutch_root_bucket_deadline_calculate()
+ *
+ * Calculate the deadline for the bucket based on its WCEL
+ */
+static uint64_t
+sched_clutch_root_bucket_deadline_calculate(
+       sched_clutch_root_bucket_t root_bucket,
+       uint64_t timestamp)
+{
+       /* For fixpri AboveUI bucket always return it as the earliest deadline */
+       if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
+               return 0;
+       }
+
+       /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
+       return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
+}
+
+/*
+ * sched_clutch_root_bucket_deadline_update()
+ *
+ * Routine to update the deadline of the root bucket when it is selected.
+ * Updating the deadline also moves the root_bucket in the EDF priority
+ * queue.
+ */
+static void
+sched_clutch_root_bucket_deadline_update(
+       sched_clutch_root_bucket_t root_bucket,
+       sched_clutch_root_t root_clutch,
+       uint64_t timestamp)
+{
+       if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
+               /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
+               return;
+       }
+       uint64_t old_deadline = root_bucket->scrb_deadline;
+       uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
+       assert(old_deadline <= new_deadline);
+       if (old_deadline != new_deadline) {
+               root_bucket->scrb_deadline = new_deadline;
+               /* Since the priority queue is a min-heap, use the decrease routine even though the deadline has a larger value now */
+               priority_queue_entry_decrease(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare);
+       }
+}
+
+/*
+ * sched_clutch_root_bucket_runnable()
+ *
+ * Routine to insert a newly runnable root bucket into the hierarchy.
+ * Also updates the deadline and warp parameters as necessary.
+ */
+static void
+sched_clutch_root_bucket_runnable(
+       sched_clutch_root_bucket_t root_bucket,
+       sched_clutch_root_t root_clutch,
+       uint64_t timestamp)
+{
+       /* Mark the root bucket as runnable */
+       bitmap_set(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket);
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE,
+           root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, 0, 0, 0);
+
+       if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
+               /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
+               return;
+       }
+
+       root_bucket->scrb_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
+       priority_queue_insert(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, PRIORITY_QUEUE_KEY_NONE, sched_clutch_root_bucket_compare);
+
+       if (root_bucket->scrb_warp_remaining) {
+               /* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
+               bitmap_set(root_clutch->scr_warp_available, root_bucket->scrb_bucket);
+       }
+}
+
+/*
+ * sched_clutch_root_bucket_empty()
+ *
+ * Routine to remove an empty root bucket from the hierarchy.
+ * Also updates the deadline and warp parameters as necessary.
+ */
+static void
+sched_clutch_root_bucket_empty(
+       sched_clutch_root_bucket_t root_bucket,
+       sched_clutch_root_t root_clutch,
+       uint64_t timestamp)
+{
+       bitmap_clear(root_clutch->scr_runnable_bitmap, root_bucket->scrb_bucket);
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE) | DBG_FUNC_NONE,
+           root_bucket->scrb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0, 0);
+
+       if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
+               /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
+               return;
+       }
+
+       priority_queue_remove(&root_clutch->scr_root_buckets, &root_bucket->scrb_pqlink, sched_clutch_root_bucket_compare);
+
+       bitmap_clear(root_clutch->scr_warp_available, root_bucket->scrb_bucket);
+       if (root_bucket->scrb_warped_deadline > timestamp) {
+               /*
+                * For root buckets that were using their warp, check if the warp
+                * deadline is in the future. If so, deduct the wall time for which
+                * the warp window was open and save what remains. This allows
+                * the root bucket to use the remaining warp the next time it
+                * becomes runnable.
+                */
+               root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
+       } else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
+               /*
+                * If the root bucket's warped deadline is in the past, it has used up
+                * all the warp it was assigned. Empty out its warp remaining.
+                */
+               root_bucket->scrb_warp_remaining = 0;
+       }
+}
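For example, assume a root bucket with the default 4ms IN warp opens its warp window at time t (scrb_warped_deadline = t + 4ms) and goes empty at t + 1ms: the first branch above saves the unused 3ms in scrb_warp_remaining for the bucket's next runnable period. Had it gone empty at t + 5ms instead, the window would already have expired and the second branch zeroes the remaining warp.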
+
+/*
+ * sched_clutch_root_pri_update()
+ *
+ * The root level priority is used for thread selection and preemption
+ * logic.
+ */
+static void
+sched_clutch_root_pri_update(
+       sched_clutch_root_t root_clutch)
+{
+       sched_clutch_hierarchy_locked_assert(root_clutch);
+       if (bitmap_lsb_first(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
+               /* No runnable root buckets */
+               root_clutch->scr_priority = NOPRI;
+               assert(root_clutch->scr_urgency == 0);
+               return;
+       }
+       sched_clutch_root_bucket_t root_bucket = NULL;
+       /* Special case for AboveUI (uses same logic as thread selection) */
+       if (sched_clutch_root_select_aboveui(root_clutch)) {
+               root_bucket = &root_clutch->scr_buckets[TH_BUCKET_FIXPRI];
+       } else {
+               /*
+                * AboveUI bucket is not runnable or has a low clutch bucket priority,
+                * select the next runnable root bucket in natural priority order. This logic
+                * is slightly different from thread selection, because thread selection
+                * considers deadlines, warps etc. to decide the most optimal bucket at a
+                * given timestamp. Since the priority value is used for preemption decisions
+                * only, it needs to be based on the highest runnable thread available in
+                * the timeshare domain.
+                */
+               int root_bucket_index = bitmap_lsb_next(root_clutch->scr_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
+               assert(root_bucket_index != -1);
+               root_bucket = &root_clutch->scr_buckets[root_bucket_index];
+       }
+       /* For the selected root bucket, find the highest priority clutch bucket */
+       sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
+       root_clutch->scr_priority = priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq);
+}
+
+/*
+ * sched_clutch_root_urgency_inc()
+ *
+ * Routine to increment the urgency at the root level based on the thread
+ * priority that is being inserted into the hierarchy. The root urgency
+ * counter is updated based on the urgency of threads in any of the
+ * clutch buckets which are part of the hierarchy.
+ *
+ * Always called with the pset lock held.
+ */
+static void
+sched_clutch_root_urgency_inc(
+       sched_clutch_root_t root_clutch,
+       thread_t thread)
+{
+       if (SCHED(priority_is_urgent)(thread->sched_pri)) {
+               root_clutch->scr_urgency++;
+       }
+}
+
+/*
+ * sched_clutch_root_urgency_dec()
+ *
+ * Routine to decrement the urgency at the root level based on the thread
+ * priority that is being removed from the hierarchy. The root urgency
+ * counter is updated based on the urgency of threads in any of the
+ * clutch buckets which are part of the hierarchy.
+ *
+ * Always called with the pset lock held.
+ */
+static void
+sched_clutch_root_urgency_dec(
+       sched_clutch_root_t root_clutch,
+       thread_t thread)
+{
+       if (SCHED(priority_is_urgent)(thread->sched_pri)) {
+               root_clutch->scr_urgency--;
+       }
+}
+
+/*
+ * Clutch bucket level scheduling
+ *
+ * The second level of scheduling is the clutch bucket level scheduling
+ * which tries to schedule thread groups within root_buckets. Each
+ * clutch represents a thread group and a clutch_bucket represents
+ * threads at a particular sched_bucket within that thread group. The
+ * goal of this level of scheduling is to allow interactive thread
+ * groups low latency access to the CPU. It also provides slight
+ * scheduling preference for App and unrestricted thread groups.
+ *
+ * The clutch bucket scheduling algorithm measures an interactivity
+ * score for all clutch buckets. The interactivity score is based
+ * on the ratio of the CPU used and the voluntary blocking of threads
+ * within the clutch bucket. The algorithm is very close to the ULE
+ * scheduler on FreeBSD in terms of calculations. The interactivity
+ * score provides an interactivity boost in the range of
+ * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
+ * thread groups to win over CPU spinners.
+ */
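To make the boost range concrete, assume the default SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI of 8 (defined just below). A clutch bucket that blocked for 30ms while using 10ms of CPU is interactive (blocked > used) and scores 8 + (8 * (30 - 10)) / 30 = 13; one that used 30ms while blocking 10ms scores (8 * 10) / 30 = 2. A pure spinner (no blocked time) scores 0, and a mostly-blocked bucket approaches 16. The exact integer arithmetic is in sched_clutch_bucket_interactivity_score_calculate() below.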
+
+/* Priority boost range for interactivity */
+#define SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT     (8)
+uint8_t sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT;
+
+/* Window over which the CPU usage and blocked values are scaled (currently 500ms); it is the threshold for used + blocked */
+uint64_t sched_clutch_bucket_adjust_threshold = 0;
+#define SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS      (500000)
+
+/* The ratio to scale the cpu/blocked time per window */
+#define SCHED_CLUTCH_BUCKET_ADJUST_RATIO                (10)
+
+/* Rate at which the interactivity score is recalculated; this keeps the score smooth even for extremely bursty behavior */
+uint64_t sched_clutch_bucket_interactivity_delta = 0;
+#define SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT   (25000)
+
+/*
+ * In order to allow App thread groups some preference over daemon thread
+ * groups, the App clutch_buckets get an 8-point boost. The boost value should
+ * be chosen such that badly behaved apps are still penalized over well
+ * behaved interactive daemon clutch_buckets.
+ */
+#define SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT           (8)
+uint8_t sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT;
+
+/* Initial value for voluntary blocking time for the clutch_bucket */
+#define SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID  (uint32_t)(~0)
+
+/*
+ * sched_clutch_bucket_init()
+ *
+ * Initializer for clutch buckets.
+ */
+static void
+sched_clutch_bucket_init(
+       sched_clutch_bucket_t clutch_bucket,
+       sched_clutch_t clutch,
+       sched_bucket_t bucket)
+{
+       bzero(clutch_bucket, sizeof(struct sched_clutch_bucket));
+
+       clutch_bucket->scb_bucket = bucket;
+       /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
+       clutch_bucket->scb_priority = 0;
+       /*
+        * All thread groups should be initialized to be interactive; this allows the newly launched
+        * thread groups to fairly compete with already running thread groups.
+        */
+       clutch_bucket->scb_interactivity_score = (sched_clutch_bucket_interactive_pri * 2);
+       clutch_bucket->scb_foreign = false;
+
+       os_atomic_store(&clutch_bucket->scb_timeshare_tick, 0, relaxed);
+       os_atomic_store(&clutch_bucket->scb_pri_shift, INT8_MAX, relaxed);
+
+       clutch_bucket->scb_interactivity_ts = 0;
+       clutch_bucket->scb_blocked_ts = SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID;
+       priority_queue_entry_init(&clutch_bucket->scb_pqlink);
+       clutch_bucket->scb_clutch = clutch;
+       clutch_bucket->scb_root = NULL;
+       priority_queue_init(&clutch_bucket->scb_clutchpri_prioq, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP);
+       run_queue_init(&clutch_bucket->scb_runq);
+}
+
+/*
+ * sched_clutch_init_with_thread_group()
+ *
+ * Initialize the sched_clutch when the thread group is being created
+ */
+void
+sched_clutch_init_with_thread_group(
+       sched_clutch_t clutch,
+       struct thread_group *tg)
+{
+       os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
+
+       /* Initialize all the clutch buckets */
+       for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
+               sched_clutch_bucket_init(&(clutch->sc_clutch_buckets[i]), clutch, i);
+       }
+
+       /* Grouping specific fields */
+       clutch->sc_tg = tg;
+       os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
+}
+
+/*
+ * sched_clutch_destroy()
+ *
+ * Destructor for clutch; called from thread group release code.
+ */
+void
+sched_clutch_destroy(
+       __unused sched_clutch_t clutch)
+{
+       assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
+}
+
+
+/*
+ * sched_clutch_bucket_hierarchy_insert()
+ *
+ * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
+ */
+static void
+sched_clutch_bucket_hierarchy_insert(
+       sched_clutch_root_t root_clutch,
+       sched_clutch_bucket_t clutch_bucket,
+       sched_bucket_t bucket,
+       uint64_t timestamp)
+{
+       sched_clutch_hierarchy_locked_assert(root_clutch);
+       if (bucket > TH_BUCKET_FIXPRI) {
+               /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
+               enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
+       }
+       sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
+
+       /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
+       if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
+               sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
+       }
+
+       /* Insert the clutch bucket into the root bucket priority queue */
+       priority_queue_insert(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+       os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE,
+           thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, clutch_bucket->scb_priority, 0);
+}
+
+/*
+ * sched_clutch_bucket_hierarchy_remove()
+ *
+ * Routine to remove an empty clutch bucket from the root hierarchy.
+ */
+static void
+sched_clutch_bucket_hierarchy_remove(
+       sched_clutch_root_t root_clutch,
+       sched_clutch_bucket_t clutch_bucket,
+       sched_bucket_t bucket,
+       uint64_t timestamp)
+{
+       sched_clutch_hierarchy_locked_assert(root_clutch);
+       if (bucket > TH_BUCKET_FIXPRI) {
+               /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
+               remqueue(&clutch_bucket->scb_listlink);
+       }
+
+       sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
+
+       /* Remove the clutch bucket from the root bucket priority queue */
+       priority_queue_remove(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+       os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
+       clutch_bucket->scb_blocked_ts = timestamp;
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE,
+           thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0);
+
+       /* If the root bucket priority queue is now empty, remove it from the root priority queue */
+       if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
+               sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
+       }
+}
+
+/*
+ * sched_clutch_bucket_base_pri()
+ *
+ * Calculates the "base" priority of the clutch bucket. The base
+ * priority of the clutch bucket is the sum of the max of highest
+ * base_pri and highest sched_pri in the clutch bucket and any
+ * grouping specific (App/Daemon...) boosts applicable to the
+ * clutch_bucket.
+ */
+static uint8_t
+sched_clutch_bucket_base_pri(
+       sched_clutch_bucket_t clutch_bucket)
+{
+       uint8_t clutch_boost = 0;
+       assert(clutch_bucket->scb_runq.count != 0);
+
+       sched_clutch_t clutch = clutch_bucket->scb_clutch;
+
+       /*
+        * Since the clutch bucket can contain threads that are members of the group due
+        * to the sched_pri being promoted or due to their base pri, the base priority of
+        * the entire clutch bucket should be based on the highest thread (promoted or base)
+        * in the clutch bucket.
+        */
+       uint8_t max_pri = priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq) ? 0 : priority_queue_max_key(&clutch_bucket->scb_clutchpri_prioq);
+
+       /*
+        * For all AboveUI clutch buckets and clutch buckets for thread groups that
+        * haven't been specified as SCHED_CLUTCH_TG_PRI_LOW, give a priority boost.
+        */
+       if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) ||
+           (os_atomic_load(&clutch->sc_tg_priority, relaxed) != SCHED_CLUTCH_TG_PRI_LOW)) {
+               clutch_boost = sched_clutch_bucket_pri_boost;
+       }
+       return max_pri + clutch_boost;
+}
+
+/*
+ * sched_clutch_bucket_interactivity_score_calculate()
+ *
+ * Routine to calculate the interactivity score for the clutch bucket. The
+ * interactivity score is based on the ratio of CPU used by all threads in
+ * the bucket and the blocked time of the bucket as a whole.
+ */
+static uint8_t
+sched_clutch_bucket_interactivity_score_calculate(
+       sched_clutch_bucket_t clutch_bucket,
+       uint64_t timestamp)
+{
+       if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) {
+               /*
+                * Since the root bucket selection algorithm for Above UI looks at clutch bucket
+                * priorities, make sure all AboveUI buckets are marked interactive.
+                */
+               assert(clutch_bucket->scb_interactivity_score == (2 * sched_clutch_bucket_interactive_pri));
+               return clutch_bucket->scb_interactivity_score;
+       }
+
+       if (clutch_bucket->scb_interactivity_ts == 0) {
+               /*
+                * This indicates a newly initialized clutch bucket; return the default interactivity score
+                * and update timestamp.
+                */
+               clutch_bucket->scb_interactivity_ts = timestamp;
+               return clutch_bucket->scb_interactivity_score;
+       }
+
+       if (timestamp < (clutch_bucket->scb_interactivity_ts + sched_clutch_bucket_interactivity_delta)) {
+               return clutch_bucket->scb_interactivity_score;
+       }
+
+       /* Check if the clutch bucket accounting needs to be scaled */
+       sched_clutch_bucket_cpu_adjust(clutch_bucket);
+       clutch_bucket->scb_interactivity_ts = timestamp;
+
+       sched_clutch_bucket_cpu_data_t scb_cpu_data;
+       scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed);
+       clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
+       clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
+
+       /*
+        * In extremely CPU contended cases, it is possible that the clutch bucket has been runnable
+        * for a long time but none of its threads have been picked up for execution. In that case, both
+        * the CPU used and blocked would be 0.
+        */
+       if ((cpu_blocked == 0) && (cpu_used == 0)) {
+               return clutch_bucket->scb_interactivity_score;
+       }
+
+       /*
+        * For all timeshare buckets, calculate the interactivity score of the bucket
+        * and add it to the base priority
+        */
+       uint8_t interactive_score = 0;
+       if (cpu_blocked > cpu_used) {
+               /* Interactive clutch_bucket case */
+               interactive_score = sched_clutch_bucket_interactive_pri +
+                   ((sched_clutch_bucket_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
+       } else {
+               /* Non-interactive clutch_bucket case */
+               interactive_score = ((sched_clutch_bucket_interactive_pri * cpu_blocked) / cpu_used);
+       }
+       clutch_bucket->scb_interactivity_score = interactive_score;
+       return interactive_score;
+}
+
+/*
+ * sched_clutch_bucket_pri_calculate()
+ *
+ * The priority calculation algorithm for the clutch_bucket is a slight
+ * modification on the ULE interactivity score. It uses the base priority
+ * of the clutch bucket and applies an interactivity score boost to the
+ * highly responsive clutch buckets.
+ */
+
+static uint8_t
+sched_clutch_bucket_pri_calculate(
+       sched_clutch_bucket_t clutch_bucket,
+       uint64_t timestamp)
+{
+       /* For empty clutch buckets, return priority 0 */
+       if (clutch_bucket->scb_thr_count == 0) {
+               return 0;
+       }
+
+       uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
+       uint8_t interactive_score = sched_clutch_bucket_interactivity_score_calculate(clutch_bucket, timestamp);
+
+       assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
+       uint8_t pri = base_pri + interactive_score;
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
+           thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
+       return pri;
+}
+
+/*
+ * sched_clutch_root_bucket_highest_clutch_bucket()
+ *
+ * Routine to find the highest priority clutch bucket
+ * within the root bucket.
+ */
+static sched_clutch_bucket_t
+sched_clutch_root_bucket_highest_clutch_bucket(
+       sched_clutch_root_bucket_t root_bucket)
+{
+       if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
+               return NULL;
+       }
+       return priority_queue_max(&root_bucket->scrb_clutch_buckets, struct sched_clutch_bucket, scb_pqlink);
+}
+
+/*
+ * sched_clutch_bucket_runnable()
+ *
+ * Perform all operations needed when a new clutch bucket becomes runnable.
+ * It involves inserting the clutch_bucket into the hierarchy and updating the
+ * root priority appropriately.
+ */
+static boolean_t
+sched_clutch_bucket_runnable(
+       sched_clutch_bucket_t clutch_bucket,
+       sched_clutch_root_t root_clutch,
+       uint64_t timestamp)
+{
+       sched_clutch_hierarchy_locked_assert(root_clutch);
+       sched_clutch_bucket_cpu_blocked_update(clutch_bucket, timestamp);
+       clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
+       sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp);
+       /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
+       sched_clutch_bucket_timeshare_update(clutch_bucket);
+       int16_t root_old_pri = root_clutch->scr_priority;
+       sched_clutch_root_pri_update(root_clutch);
+       return root_clutch->scr_priority > root_old_pri;
+}
+
+/*
+ * sched_clutch_bucket_update()
+ *
+ * Update the clutch_bucket's position in the hierarchy when a newly
+ * runnable thread changes the clutch bucket's priority. Also update the root
+ * priority accordingly.
+ */
+static boolean_t
+sched_clutch_bucket_update(
+       sched_clutch_bucket_t clutch_bucket,
+       sched_clutch_root_t root_clutch,
+       uint64_t timestamp)
+{
+       sched_clutch_hierarchy_locked_assert(root_clutch);
+       uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
+       if (new_pri == clutch_bucket->scb_priority) {
+               return false;
+       }
+       struct priority_queue *bucket_prioq = &root_clutch->scr_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
+
+       if (new_pri < clutch_bucket->scb_priority) {
+               clutch_bucket->scb_priority = new_pri;
+               priority_queue_entry_decrease(bucket_prioq, &clutch_bucket->scb_pqlink,
+                   clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+       } else {
+               clutch_bucket->scb_priority = new_pri;
+               priority_queue_entry_increase(bucket_prioq, &clutch_bucket->scb_pqlink,
+                   clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+       }
+
+       int16_t root_old_pri = root_clutch->scr_priority;
+       sched_clutch_root_pri_update(root_clutch);
+       return root_clutch->scr_priority > root_old_pri;
+}
+
+/*
+ * sched_clutch_bucket_empty()
+ *
+ * Perform all the operations needed when a clutch_bucket is no longer runnable.
+ * It involves removing the clutch bucket from the hierarchy and updating the root
+ * priority appropriately.
+ */
+static void
+sched_clutch_bucket_empty(
+       sched_clutch_bucket_t clutch_bucket,
+       sched_clutch_root_t root_clutch,
+       uint64_t timestamp)
+{
+       sched_clutch_hierarchy_locked_assert(root_clutch);
+       sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp);
+       clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
+       sched_clutch_root_pri_update(root_clutch);
+}
+
+/*
+ * sched_clutch_cpu_usage_update()
+ *
+ * Routine to update CPU usage of the thread in the hierarchy.
+ */
+void
+sched_clutch_cpu_usage_update(
+       thread_t thread,
+       uint64_t delta)
+{
+       if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
+               return;
+       }
+       sched_clutch_t clutch = sched_clutch_for_thread(thread);
+       sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
+       sched_clutch_bucket_cpu_usage_update(clutch_bucket, delta);
+}
+
+/*
+ * sched_clutch_bucket_cpu_usage_update()
+ *
+ * Routine to update the CPU usage of the clutch_bucket.
+ */
+static void
+sched_clutch_bucket_cpu_usage_update(
+       sched_clutch_bucket_t clutch_bucket,
+       uint64_t delta)
+{
+       if (clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) {
+               /* Since Above UI bucket has maximum interactivity score always, nothing to do here */
+               return;
+       }
+
+       /*
+        * The CPU usage should not overflow the clutch_cpu_data_t type. Since the usage is used to
+        * calculate interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX.
+        */
+       delta = MIN(delta, CLUTCH_CPU_DATA_MAX);
+       os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
+}
+
+/*
+ * sched_clutch_bucket_cpu_blocked_update()
+ *
+ * Routine to update CPU blocked time for clutch_bucket.
+ */
+static void
+sched_clutch_bucket_cpu_blocked_update(
+       sched_clutch_bucket_t clutch_bucket,
+       uint64_t timestamp)
+{
+       if ((clutch_bucket->scb_bucket == TH_BUCKET_FIXPRI) ||
+           (clutch_bucket->scb_blocked_ts == SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID)) {
+               /* For Above UI bucket and a newly initialized clutch bucket, nothing to do here */
+               return;
+       }
+
+       uint64_t blocked_time = timestamp - clutch_bucket->scb_blocked_ts;
+       if (blocked_time > sched_clutch_bucket_adjust_threshold) {
+               blocked_time = sched_clutch_bucket_adjust_threshold;
+       }
+
+       /*
+        * The blocked time should not overflow the clutch_cpu_data_t type. Since it is used to
+        * calculate the interactivity score, it is safe to restrict it to CLUTCH_CPU_DATA_MAX.
+        */
+       blocked_time = MIN(blocked_time, CLUTCH_CPU_DATA_MAX);
+       clutch_cpu_data_t __assert_only cpu_blocked_orig = os_atomic_add_orig(&(clutch_bucket->scb_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
+       /* The blocked time is scaled every so often; it should never overflow */
+       assert(blocked_time <= (CLUTCH_CPU_DATA_MAX - cpu_blocked_orig));
+}
+
+/*
+ * sched_clutch_bucket_cpu_adjust()
+ *
+ * Routine to scale the cpu usage and blocked time once the sum gets bigger
+ * than sched_clutch_bucket_adjust_threshold. Allows the values to remain
+ * manageable and maintain the same ratio while allowing clutch buckets to
+ * adjust behavior and reflect in the interactivity score in a reasonable
+ * amount of time.
+ */
+static void
+sched_clutch_bucket_cpu_adjust(
+       sched_clutch_bucket_t clutch_bucket)
+{
+       sched_clutch_bucket_cpu_data_t old_cpu_data = {};
+       sched_clutch_bucket_cpu_data_t new_cpu_data = {};
+       do {
+               old_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, relaxed);
+               clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
+               clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
+               if ((cpu_used + cpu_blocked) < sched_clutch_bucket_adjust_threshold) {
+                       return;
+               }
+
+               /*
+                * The accumulation of CPU used and blocked is past the threshold; scale it
+                * down to lose old history.
+                */
+               new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
+               new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_ADJUST_RATIO;
+       } while (!os_atomic_cmpxchg(&clutch_bucket->scb_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed));
+}
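As an assumed numeric example: with the 500ms threshold and the adjust ratio of 10, a clutch bucket that has accumulated 400ms of CPU used and 150ms blocked crosses the threshold and is rescaled to 40ms used and 15ms blocked. The used:blocked ratio (and therefore the interactivity score) is preserved, while the smaller absolute values let new samples shift the balance quickly.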
+
+/*
+ * Thread level scheduling algorithm
+ *
+ * The thread level scheduling algorithm uses the mach timeshare
+ * decay based algorithm to achieve sharing between threads within the
+ * same clutch bucket. The load/priority shifts etc. are all maintained
+ * at the clutch bucket level and used for decay calculation of the
+ * threads. The load sampling is still driven off the scheduler tick
+ * for runnable clutch buckets (it does not use the new higher frequency
+ * EWMA based load calculation). The idea is that the contention and load
+ * within clutch_buckets should be limited enough that threads do not see
+ * heavy decay and can timeshare effectively.
+ */
+
+/*
+ * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
+ *
+ * Increment the run count for the clutch bucket associated with the
+ * thread.
+ */
+uint32_t
+sched_clutch_thread_run_bucket_incr(
+       thread_t thread,
+       sched_bucket_t bucket)
+{
+       if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
+               return 0;
+       }
+       sched_clutch_t clutch = sched_clutch_for_thread(thread);
+       return sched_clutch_run_bucket_incr(clutch, bucket);
+}
+
+static uint32_t
+sched_clutch_run_bucket_incr(
+       sched_clutch_t clutch,
+       sched_bucket_t bucket)
+{
+       assert(bucket != TH_BUCKET_RUN);
+       sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
+       uint32_t result = os_atomic_inc(&(clutch_bucket->scb_run_count), relaxed);
+       return result;
+}
+
+/*
+ * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
+ *
+ * Decrement the run count for the clutch bucket associated with the
+ * thread.
+ */
+uint32_t
+sched_clutch_thread_run_bucket_decr(
+       thread_t thread,
+       sched_bucket_t bucket)
+{
+       if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
+               return 0;
+       }
+       sched_clutch_t clutch = sched_clutch_for_thread(thread);
+       return sched_clutch_run_bucket_decr(clutch, bucket);
+}
+
+static uint32_t
+sched_clutch_run_bucket_decr(
+       sched_clutch_t clutch,
+       sched_bucket_t bucket)
+{
+       assert(bucket != TH_BUCKET_RUN);
+       sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
+       uint32_t result = os_atomic_dec(&(clutch_bucket->scb_run_count), relaxed);
+       return result;
+}
+
+/*
+ * sched_clutch_bucket_timeshare_update()
+ *
+ * Routine to update the load and priority shift for the clutch_bucket every
+ * sched_tick. For runnable clutch_buckets, the sched tick handling code
+ * iterates the clutch buckets and calls this routine. For all others, the
+ * clutch_bucket maintains a "last updated schedtick" parameter. As threads
+ * become runnable in the clutch bucket, if this value is outdated, the load
+ * and shifts are updated.
+ *
+ * Possible optimization:
+ * - The current algorithm samples the load every sched tick (125ms).
+ *   This is prone to spikes in runnable counts; if that turns out to be
+ *   a problem, a simple solution would be to do the EWMA trick to sample
+ *   load at every load_tick (30ms) and use the averaged value for the pri
+ *   shift calculation.
+ */
+static void
+sched_clutch_bucket_timeshare_update(
+       sched_clutch_bucket_t clutch_bucket)
+{
+       if (clutch_bucket->scb_bucket < TH_BUCKET_SHARE_FG) {
+               return;
+       }
+
+       /*
+        * Update the timeshare parameters for the clutch bucket if they haven't been updated
+        * in this tick.
+        */
+       uint32_t bucket_sched_ts = os_atomic_load(&clutch_bucket->scb_timeshare_tick, relaxed);
+       uint32_t current_sched_ts = sched_tick;
+       if (bucket_sched_ts != current_sched_ts) {
+               os_atomic_store(&clutch_bucket->scb_timeshare_tick, current_sched_ts, relaxed);
+               uint32_t bucket_load = (os_atomic_load(&clutch_bucket->scb_run_count, relaxed) / processor_avail_count);
+               bucket_load = MIN(bucket_load, NRQS - 1);
+               uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_load];
+               os_atomic_store(&clutch_bucket->scb_pri_shift, pri_shift, relaxed);
+       }
+}
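As a worked example with assumed values: a clutch bucket with an scb_run_count of 8 on a 4-CPU pset has a bucket_load of 2, so its scb_pri_shift becomes sched_fixed_shift - sched_load_shifts[2]. Threads in the bucket then decay as in the classic timeshare scheduler, roughly sched_pri = base_pri - (sched_usage >> pri_shift), so a more loaded bucket gets a smaller shift and its threads decay faster.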
+
+/*
+ * sched_clutch_thread_clutch_update()
+ *
+ * Routine called when the thread changes its thread group. The current
+ * implementation relies on the fact that the thread group is changed only
+ * from the context of the thread itself. Due to this fact, the thread
+ * group change causes only counter updates in the old & new clutch
+ * buckets and no hierarchy changes. The routine also attributes the CPU
+ * used so far to the old clutch.
+ */
+void
+sched_clutch_thread_clutch_update(
+       thread_t thread,
+       sched_clutch_t old_clutch,
+       sched_clutch_t new_clutch)
+{
+       uint32_t cpu_delta;
+       assert(current_thread() == thread);
+
+       if (old_clutch) {
+               sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
+               /*
+                * Calculate the CPU used by this thread in the old bucket and
+                * add it to the old clutch bucket. This uses the same CPU usage
+                * logic as update_priority etc.
+                */
+               thread_timer_delta(thread, cpu_delta);
+               if (thread->pri_shift < INT8_MAX) {
+                       thread->sched_usage += cpu_delta;
+               }
+               thread->cpu_delta += cpu_delta;
+               sched_clutch_bucket_cpu_usage_update(&(old_clutch->sc_clutch_buckets[thread->th_sched_bucket]), cpu_delta);
+       }
+
+       if (new_clutch) {
+               sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
+       }
+}
+
+/* Thread Insertion/Removal/Selection routines */
+
+/*
+ * sched_clutch_thread_insert()
+ *
+ * Routine to insert a thread into the sched clutch hierarchy.
+ * Update the counts at all levels of the hierarchy and insert the nodes
+ * as they become runnable. Always called with the pset lock held.
+ */
+static boolean_t
+sched_clutch_thread_insert(
+       sched_clutch_root_t root_clutch,
+       thread_t thread,
+       integer_t options)
+{
+       boolean_t result = FALSE;
+
+       sched_clutch_hierarchy_locked_assert(root_clutch);
+       sched_clutch_t clutch = sched_clutch_for_thread(thread);
+       assert(thread->thread_group == clutch->sc_tg);
+
+       uint64_t current_timestamp = mach_absolute_time();
+       sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
+       assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
+
+       /* Insert thread into the clutch_bucket runq using sched_pri */
+       run_queue_enqueue(&clutch_bucket->scb_runq, thread, options);
+       /* Increment the urgency counter for the root if necessary */
+       sched_clutch_root_urgency_inc(root_clutch, thread);
+
+       /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
+       priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link,
+           sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri,
+           PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+       os_atomic_inc(&clutch->sc_thr_count, relaxed);
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE,
+           thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_RUNNABLE, 0);
+
+       /* Enqueue the clutch into the hierarchy (if needed) and update properties */
+       if (clutch_bucket->scb_thr_count == 0) {
+               sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
+               sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
+               /* Insert the newly runnable clutch bucket into the hierarchy */
+               result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp);
+       } else {
+               sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
+               sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
+               /* Update the position of the clutch bucket in the hierarchy */
+               result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp);
+       }
+       return result;
+}
+
+/*
+ * sched_clutch_thread_remove()
+ *
+ * Routine to remove a thread from the sched clutch hierarchy.
+ * Update the counts at all levels of the hierarchy and remove the nodes
+ * as they become empty. Always called with the pset lock held.
+ */
+static void
+sched_clutch_thread_remove(
+       sched_clutch_root_t root_clutch,
+       thread_t thread,
+       uint64_t current_timestamp)
+{
+       sched_clutch_hierarchy_locked_assert(root_clutch);
+       sched_clutch_t clutch = sched_clutch_for_thread(thread);
+       assert(thread->thread_group == clutch->sc_tg);
+       assert(thread->runq != PROCESSOR_NULL);
+
+       sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
+       assert(clutch_bucket->scb_root == root_clutch);
+
+       /* Decrement the urgency counter for the root if necessary */
+       sched_clutch_root_urgency_dec(root_clutch, thread);
+       /* Remove thread from the clutch_bucket */
+       run_queue_remove(&clutch_bucket->scb_runq, thread);
+
+       priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->sched_clutchpri_link,
+           PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE,
+           thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_EMPTY, 0);
+
+       /* Update counts at various levels of the hierarchy */
+       os_atomic_dec(&clutch->sc_thr_count, relaxed);
+       sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
+       sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
+
+       /* Remove the clutch from hierarchy (if needed) and update properties */
+       if (clutch_bucket->scb_thr_count == 0) {
+               sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp);
+       } else {
+               sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp);
+       }
+}
+
+/*
+ * sched_clutch_thread_highest()
+ *
+ * Routine to find and remove the highest priority thread
+ * from the sched clutch hierarchy. The algorithm looks at the
+ * hierarchy for the most eligible runnable thread and calls
+ * sched_clutch_thread_remove(). Always called with the
+ * pset lock held.
+ */
+static thread_t
+sched_clutch_thread_highest(
+       sched_clutch_root_t root_clutch)
+{
+       sched_clutch_hierarchy_locked_assert(root_clutch);
+       uint64_t current_timestamp = mach_absolute_time();
+
+       /* Select the highest priority root bucket */
+       sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp);
+       if (root_bucket == NULL) {
+               return THREAD_NULL;
+       }
+       /* Since a thread is being picked from this root bucket, update its deadline */
+       sched_clutch_root_bucket_deadline_update(root_bucket, root_clutch, current_timestamp);
+
+       /* Find the highest priority clutch bucket in this root bucket */
+       sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
+       assert(clutch_bucket != NULL);
+
+       /* Find the highest priority runnable thread in this clutch bucket */
+       thread_t thread = run_queue_peek(&clutch_bucket->scb_runq);
+       assert(thread != NULL);
+
+       /* Remove and return the thread from the hierarchy */
+       sched_clutch_thread_remove(root_clutch, thread, current_timestamp);
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
+           thread_tid(thread), thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
+       return thread;
+}
+
+
+/* High level global accessor routines */
+
+/*
+ * sched_clutch_root_urgency()
+ *
+ * Routine to get the urgency of the highest runnable
+ * thread in the hierarchy.
+ */
+static uint32_t
+sched_clutch_root_urgency(
+       sched_clutch_root_t root_clutch)
+{
+       return root_clutch->scr_urgency;
+}
+
+/*
+ * sched_clutch_root_count_sum()
+ *
+ * The count_sum mechanism is used for scheduler runq
+ * statistics calculation. It's only useful for debugging
+ * purposes; since maintaining it takes a mach_absolute_time()
+ * call (as it does in other scheduler implementations), it's
+ * better to avoid populating it until absolutely necessary.
+ */
+static uint32_t
+sched_clutch_root_count_sum(
+       __unused sched_clutch_root_t root_clutch)
+{
+       return 0;
+}
+
+/*
+ * sched_clutch_root_priority()
+ *
+ * Routine to get the priority of the highest runnable
+ * thread in the hierarchy.
+ */
+static int
+sched_clutch_root_priority(
+       sched_clutch_root_t root_clutch)
+{
+       return root_clutch->scr_priority;
+}
+
+/*
+ * sched_clutch_root_count()
+ *
+ * Returns total number of runnable threads in the hierarchy.
+ */
+uint32_t
+sched_clutch_root_count(
+       sched_clutch_root_t root_clutch)
+{
+       return root_clutch->scr_thr_count;
+}
+
+/*
+ * sched_clutch_thread_pri_shift()
+ *
+ * Routine to get the priority shift value for a thread.
+ * Since the timesharing is done at the clutch_bucket level,
+ * this routine gets the clutch_bucket and retrieves the
+ * values from there.
+ */
+uint32_t
+sched_clutch_thread_pri_shift(
+       thread_t thread,
+       sched_bucket_t bucket)
+{
+       if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
+               return UINT8_MAX;
+       }
+       assert(bucket != TH_BUCKET_RUN);
+       sched_clutch_t clutch = sched_clutch_for_thread(thread);
+       sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[bucket]);
+       return os_atomic_load(&clutch_bucket->scb_pri_shift, relaxed);
+}
+
+#pragma mark -- Clutch Scheduler Algorithm
+
+static void
+sched_clutch_init(void);
+
+static void
+sched_clutch_timebase_init(void);
+
+static thread_t
+sched_clutch_steal_thread(processor_set_t pset);
+
+static void
+sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
+
+static boolean_t
+sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
+    sched_options_t options);
+
+static boolean_t
+sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
+
+static ast_t
+sched_clutch_processor_csw_check(processor_t processor);
+
+static boolean_t
+sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
+
+static int
+sched_clutch_runq_count(processor_t processor);
+
+static boolean_t
+sched_clutch_processor_queue_empty(processor_t processor);
+
+static uint64_t
+sched_clutch_runq_stats_count_sum(processor_t processor);
+
+static int
+sched_clutch_processor_bound_count(processor_t processor);
+
+static void
+sched_clutch_pset_init(processor_set_t pset);
+
+static void
+sched_clutch_processor_init(processor_t processor);
+
+static thread_t
+sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
+
+static void
+sched_clutch_processor_queue_shutdown(processor_t processor);
+
+static sched_mode_t
+sched_clutch_initial_thread_sched_mode(task_t parent_task);
+
+static uint32_t
+sched_clutch_initial_quantum_size(thread_t thread);
+
+static bool
+sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread);
+
+static uint32_t
+sched_clutch_run_incr(thread_t thread);
+
+static uint32_t
+sched_clutch_run_decr(thread_t thread);
+
+static void
+sched_clutch_update_thread_bucket(thread_t thread);
+
+const struct sched_dispatch_table sched_clutch_dispatch = {
+       .sched_name                                     = "clutch",
+       .init                                           = sched_clutch_init,
+       .timebase_init                                  = sched_clutch_timebase_init,
+       .processor_init                                 = sched_clutch_processor_init,
+       .pset_init                                      = sched_clutch_pset_init,
+       .maintenance_continuation                       = sched_timeshare_maintenance_continue,
+       .choose_thread                                  = sched_clutch_choose_thread,
+       .steal_thread_enabled                           = sched_steal_thread_enabled,
+       .steal_thread                                   = sched_clutch_steal_thread,
+       .compute_timeshare_priority                     = sched_compute_timeshare_priority,
+       .choose_processor                               = choose_processor,
+       .processor_enqueue                              = sched_clutch_processor_enqueue,
+       .processor_queue_shutdown                       = sched_clutch_processor_queue_shutdown,
+       .processor_queue_remove                         = sched_clutch_processor_queue_remove,
+       .processor_queue_empty                          = sched_clutch_processor_queue_empty,
+       .priority_is_urgent                             = priority_is_urgent,
+       .processor_csw_check                            = sched_clutch_processor_csw_check,
+       .processor_queue_has_priority                   = sched_clutch_processor_queue_has_priority,
+       .initial_quantum_size                           = sched_clutch_initial_quantum_size,
+       .initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
+       .can_update_priority                            = can_update_priority,
+       .update_priority                                = update_priority,
+       .lightweight_update_priority                    = lightweight_update_priority,
+       .quantum_expire                                 = sched_default_quantum_expire,
+       .processor_runq_count                           = sched_clutch_runq_count,
+       .processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
+       .processor_bound_count                          = sched_clutch_processor_bound_count,
+       .thread_update_scan                             = sched_clutch_thread_update_scan,
+       .multiple_psets_enabled                         = TRUE,
+       .sched_groups_enabled                           = FALSE,
+       .avoid_processor_enabled                        = TRUE,
+       .thread_avoid_processor                         = sched_clutch_thread_avoid_processor,
+       .processor_balance                              = sched_SMT_balance,
+
+       .rt_runq                                        = sched_rtglobal_runq,
+       .rt_init                                        = sched_rtglobal_init,
+       .rt_queue_shutdown                              = sched_rtglobal_queue_shutdown,
+       .rt_runq_scan                                   = sched_rtglobal_runq_scan,
+       .rt_runq_count_sum                              = sched_rtglobal_runq_count_sum,
+
+       .qos_max_parallelism                            = sched_qos_max_parallelism,
+       .check_spill                                    = sched_check_spill,
+       .ipi_policy                                     = sched_ipi_policy,
+       .thread_should_yield                            = sched_thread_should_yield,
+       .run_count_incr                                 = sched_clutch_run_incr,
+       .run_count_decr                                 = sched_clutch_run_decr,
+       .update_thread_bucket                           = sched_clutch_update_thread_bucket,
+       .pset_made_schedulable                          = sched_pset_made_schedulable,
+};
+
+__attribute__((always_inline))
+static inline run_queue_t
+sched_clutch_bound_runq(processor_t processor)
+{
+       return &processor->runq;
+}
+
+__attribute__((always_inline))
+static inline sched_clutch_root_t
+sched_clutch_processor_root_clutch(processor_t processor)
+{
+       return &processor->processor_set->pset_clutch_root;
+}
+
+__attribute__((always_inline))
+static inline run_queue_t
+sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
+{
+       assert(thread->bound_processor == processor);
+       return sched_clutch_bound_runq(processor);
+}
+
+static uint32_t
+sched_clutch_initial_quantum_size(thread_t thread)
+{
+       if (thread == THREAD_NULL) {
+               return std_quantum;
+       }
+       assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
+       return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
+}
+
+static sched_mode_t
+sched_clutch_initial_thread_sched_mode(task_t parent_task)
+{
+       if (parent_task == kernel_task) {
+               return TH_MODE_FIXED;
+       } else {
+               return TH_MODE_TIMESHARE;
+       }
+}
+
+static void
+sched_clutch_processor_init(processor_t processor)
+{
+       run_queue_init(&processor->runq);
+}
+
+static void
+sched_clutch_pset_init(processor_set_t pset)
+{
+       sched_clutch_root_init(&pset->pset_clutch_root, pset);
+}
+
+static void
+sched_clutch_init(void)
+{
+       if (!PE_parse_boot_argn("sched_clutch_bucket_interactive_pri", &sched_clutch_bucket_interactive_pri, sizeof(sched_clutch_bucket_interactive_pri))) {
+               sched_clutch_bucket_interactive_pri = SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI_DEFAULT;
+       }
+       if (!PE_parse_boot_argn("sched_clutch_bucket_pri_boost", &sched_clutch_bucket_pri_boost, sizeof(sched_clutch_bucket_pri_boost))) {
+               sched_clutch_bucket_pri_boost = SCHED_CLUTCH_BUCKET_PRI_BOOST_DEFAULT;
+       }
+       sched_timeshare_init();
+}
+
+static void
+sched_clutch_timebase_init(void)
+{
+       sched_timeshare_timebase_init();
+       sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
+       sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
+       sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
+       clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_ADJUST_THRESHOLD_USECS,
+           NSEC_PER_USEC, &sched_clutch_bucket_adjust_threshold);
+
+       uint32_t interactivity_delta = 0;
+       if (!PE_parse_boot_argn("sched_clutch_bucket_interactivity_delta_usecs", &interactivity_delta, sizeof(interactivity_delta))) {
+               interactivity_delta = SCHED_CLUTCH_BUCKET_INTERACTIVITY_DELTA_USECS_DEFAULT;
+       }
+       clock_interval_to_absolutetime_interval(interactivity_delta, NSEC_PER_USEC, &sched_clutch_bucket_interactivity_delta);
+}
+
+static thread_t
+sched_clutch_choose_thread(
+       processor_t      processor,
+       int              priority,
+       __unused ast_t            reason)
+{
+       int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
+       uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
+       run_queue_t bound_runq = sched_clutch_bound_runq(processor);
+       boolean_t choose_from_boundq = false;
+
+       if (bound_runq->highq < priority &&
+           clutch_pri < priority) {
+               return THREAD_NULL;
+       }
+
+       if (bound_runq->count && clutch_count) {
+               if (bound_runq->highq >= clutch_pri) {
+                       choose_from_boundq = true;
+               }
+       } else if (bound_runq->count) {
+               choose_from_boundq = true;
+       } else if (clutch_count) {
+               choose_from_boundq = false;
+       } else {
+               return THREAD_NULL;
+       }
+
+       thread_t thread = THREAD_NULL;
+       if (choose_from_boundq == false) {
+               sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
+               thread = sched_clutch_thread_highest(pset_clutch_root);
+       } else {
+               thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
+       }
+       return thread;
+}
+
+static boolean_t
+sched_clutch_processor_enqueue(
+       processor_t       processor,
+       thread_t          thread,
+       sched_options_t   options)
+{
+       boolean_t       result;
+
+       thread->runq = processor;
+       if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
+               sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
+               result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
+       } else {
+               run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
+               result = run_queue_enqueue(rq, thread, options);
+       }
+       return result;
+}
+
+static boolean_t
+sched_clutch_processor_queue_empty(processor_t processor)
+{
+       return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
+              sched_clutch_bound_runq(processor)->count == 0;
+}
+
+static ast_t
+sched_clutch_processor_csw_check(processor_t processor)
+{
+       boolean_t       has_higher;
+       int             pri;
+
+       if (sched_clutch_thread_avoid_processor(processor, current_thread())) {
+               return AST_PREEMPT | AST_URGENT;
+       }
+
+       run_queue_t bound_runq = sched_clutch_bound_runq(processor);
+       int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
+
+       assert(processor->active_thread != NULL);
+
+       pri = MAX(clutch_pri, bound_runq->highq);
+
+       if (processor->first_timeslice) {
+               has_higher = (pri > processor->current_pri);
+       } else {
+               has_higher = (pri >= processor->current_pri);
+       }
+
+       if (has_higher) {
+               if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
+                       return AST_PREEMPT | AST_URGENT;
+               }
+
+               if (bound_runq->urgency > 0) {
+                       return AST_PREEMPT | AST_URGENT;
+               }
+
+               return AST_PREEMPT;
+       }
+
+       return AST_NONE;
+}
+
+static boolean_t
+sched_clutch_processor_queue_has_priority(processor_t    processor,
+    int            priority,
+    boolean_t      gte)
+{
+       run_queue_t bound_runq = sched_clutch_bound_runq(processor);
+
+       int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
+
+       if (gte) {
+               return qpri >= priority;
+       } else {
+               return qpri > priority;
+       }
+}
+
+static int
+sched_clutch_runq_count(processor_t processor)
+{
+       return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
+}
+
+static uint64_t
+sched_clutch_runq_stats_count_sum(processor_t processor)
+{
+       uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
+
+       if (processor->cpu_id == processor->processor_set->cpu_set_low) {
+               return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
+       } else {
+               return bound_sum;
+       }
+}
+
+static int
+sched_clutch_processor_bound_count(processor_t processor)
+{
+       return sched_clutch_bound_runq(processor)->count;
+}
+
+static void
+sched_clutch_processor_queue_shutdown(processor_t processor)
+{
+       processor_set_t pset = processor->processor_set;
+       sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
+       thread_t        thread;
+       queue_head_t    tqueue;
+
+       /* We only need to migrate threads if this is the last active processor in the pset */
+       if (pset->online_processor_count > 0) {
+               pset_unlock(pset);
+               return;
+       }
+
+       queue_init(&tqueue);
+       while (sched_clutch_root_count(pset_clutch_root) > 0) {
+               thread = sched_clutch_thread_highest(pset_clutch_root);
+               enqueue_tail(&tqueue, &thread->runq_links);
+       }
+
+       pset_unlock(pset);
+
+       qe_foreach_element_safe(thread, &tqueue, runq_links) {
+               remqueue(&thread->runq_links);
+
+               thread_lock(thread);
+
+               thread_setrun(thread, SCHED_TAILQ);
+
+               thread_unlock(thread);
+       }
+}
+
+static boolean_t
+sched_clutch_processor_queue_remove(
+       processor_t processor,
+       thread_t    thread)
+{
+       run_queue_t             rq;
+       processor_set_t         pset = processor->processor_set;
+
+       pset_lock(pset);
+
+       if (processor == thread->runq) {
+               /*
+                * Thread is on a run queue and we have a lock on
+                * that run queue.
+                */
+               if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
+                       sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
+                       sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time());
+               } else {
+                       rq = sched_clutch_thread_bound_runq(processor, thread);
+                       run_queue_remove(rq, thread);
+               }
+       } else {
+               /*
+                * The thread left the run queue before we could
+                * lock the run queue.
+                */
+               assert(thread->runq == PROCESSOR_NULL);
+               processor = PROCESSOR_NULL;
+       }
+
+       pset_unlock(pset);
+
+       return processor != PROCESSOR_NULL;
+}
+
+static thread_t
+sched_clutch_steal_thread(processor_set_t pset)
+{
+       processor_set_t nset, cset = pset;
+       thread_t        thread;
+
+       do {
+               sched_clutch_root_t pset_clutch_root = &cset->pset_clutch_root;
+               if (sched_clutch_root_count(pset_clutch_root) > 0) {
+                       thread = sched_clutch_thread_highest(pset_clutch_root);
+                       pset_unlock(cset);
+                       return thread;
+               }
+
+               nset = next_pset(cset);
+
+               if (nset != pset) {
+                       pset_unlock(cset);
+
+                       cset = nset;
+                       pset_lock(cset);
+               }
+       } while (nset != pset);
+
+       pset_unlock(cset);
+
+       return THREAD_NULL;
+}
+
+static void
+sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
+{
+       boolean_t               restart_needed = FALSE;
+       processor_t             processor = processor_list;
+       processor_set_t         pset;
+       thread_t                thread;
+       spl_t                   s;
+
+       /*
+        *  We update the threads associated with each processor (bound and idle threads)
+        *  and then update the threads in each pset runqueue.
+        */
+
+       do {
+               do {
+                       pset = processor->processor_set;
+
+                       s = splsched();
+                       pset_lock(pset);
+
+                       restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
+
+                       pset_unlock(pset);
+                       splx(s);
+
+                       if (restart_needed) {
+                               break;
+                       }
+
+                       thread = processor->idle_thread;
+                       if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
+                               if (thread_update_add_thread(thread) == FALSE) {
+                                       restart_needed = TRUE;
+                                       break;
+                               }
+                       }
+               } while ((processor = processor->processor_list) != NULL);
+
+               /* Ok, we now have a collection of candidates -- fix them. */
+               thread_update_process_threads();
+       } while (restart_needed);
+
+       pset = &pset0;
+
+       do {
+               do {
+                       s = splsched();
+                       pset_lock(pset);
+
+                       if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
+                               queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
+                               sched_clutch_bucket_t clutch_bucket;
+                               qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
+                                       sched_clutch_bucket_timeshare_update(clutch_bucket);
+                                       restart_needed = runq_scan(&clutch_bucket->scb_runq, scan_context);
+                                       if (restart_needed) {
+                                               break;
+                                       }
+                               }
+                       }
+
+                       pset_unlock(pset);
+                       splx(s);
+                       if (restart_needed) {
+                               break;
+                       }
+               } while ((pset = pset->pset_list) != NULL);
+
+               /* Ok, we now have a collection of candidates -- fix them. */
+               thread_update_process_threads();
+       } while (restart_needed);
+}
+
+extern int sched_allow_rt_smt;
+
+/* Return true if this thread should not continue running on this processor */
+static bool
+sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
+{
+       if (processor->processor_primary != processor) {
+               /*
+                * This is a secondary SMT processor.  If the primary is running
+                * a realtime thread, only allow realtime threads on the secondary.
+                */
+               if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * For the clutch scheduler, the run counts are maintained in the clutch
+ * buckets (i.e. the thread group scheduling structure).
+ */
+static uint32_t
+sched_clutch_run_incr(thread_t thread)
+{
+       assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
+       uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
+       sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
+       return new_count;
+}
+
+static uint32_t
+sched_clutch_run_decr(thread_t thread)
+{
+       assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
+       uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
+       sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
+       return new_count;
+}
+
+static sched_bucket_t
+sched_convert_pri_to_bucket(uint8_t priority)
+{
+       sched_bucket_t bucket = TH_BUCKET_RUN;
+
+       if (priority > BASEPRI_USER_INITIATED) {
+               bucket = TH_BUCKET_SHARE_FG;
+       } else if (priority > BASEPRI_DEFAULT) {
+               bucket = TH_BUCKET_SHARE_IN;
+       } else if (priority > BASEPRI_UTILITY) {
+               bucket = TH_BUCKET_SHARE_DF;
+       } else if (priority > MAXPRI_THROTTLE) {
+               bucket = TH_BUCKET_SHARE_UT;
+       } else {
+               bucket = TH_BUCKET_SHARE_BG;
+       }
+       return bucket;
+}
+
+/*
+ * For threads that have changed sched_pri without changing the
+ * base_pri for any reason other than decay, use the sched_pri
+ * as the bucketizing priority instead of base_pri. All such
+ * changes are typically due to boosts or demotions by kernel
+ * locking primitives.
+ */
+static boolean_t
+sched_thread_sched_pri_promoted(thread_t thread)
+{
+       return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
+              (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
+              (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
+              (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
+              (thread->kern_promotion_schedpri != 0);
+}
+
+/*
+ * Routine to update the scheduling bucket for the thread.
+ *
+ * In the clutch scheduler implementation, the thread's bucket
+ * is based on sched_pri if it was promoted due to a kernel
+ * primitive; otherwise it's based on the thread's base_pri. This
+ * enhancement allows promoted threads to reach a higher priority
+ * bucket and potentially get selected sooner for scheduling.
+ *
+ * Also, the clutch scheduler does not honor fixed priority below
+ * FG priority. It simply puts those threads in the corresponding
+ * timeshare bucket. This is because it is extremely hard to
+ * define the scheduling properties of such threads, and they
+ * typically lead to performance issues.
+ */
+
+void
+sched_clutch_update_thread_bucket(thread_t thread)
+{
+       sched_bucket_t old_bucket = thread->th_sched_bucket;
+       sched_bucket_t new_bucket = TH_BUCKET_RUN;
+       assert(thread->runq == PROCESSOR_NULL);
+
+       int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
+
+       switch (thread->sched_mode) {
+       case TH_MODE_FIXED:
+               if (pri >= BASEPRI_FOREGROUND) {
+                       new_bucket = TH_BUCKET_FIXPRI;
+               } else {
+                       new_bucket = sched_convert_pri_to_bucket(pri);
+               }
+               break;
+
+       case TH_MODE_REALTIME:
+               new_bucket = TH_BUCKET_FIXPRI;
+               break;
+
+       case TH_MODE_TIMESHARE:
+               new_bucket = sched_convert_pri_to_bucket(pri);
+               break;
+
+       default:
+               panic("unexpected mode: %d", thread->sched_mode);
+               break;
+       }
+
+       if (old_bucket == new_bucket) {
+               return;
+       }
+
+       thread->th_sched_bucket = new_bucket;
+       thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
+
+       /*
+        * Since this is called after the thread has been removed from the runq,
+        * only the run counts need to be updated. The re-insert into the runq
+        * would put the thread into the correct new bucket's runq.
+        */
+       if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
+               sched_clutch_thread_run_bucket_decr(thread, old_bucket);
+               sched_clutch_thread_run_bucket_incr(thread, new_bucket);
+       }
+}
+
+
+#endif /* CONFIG_SCHED_CLUTCH */
diff --git a/osfmk/kern/sched_clutch.h b/osfmk/kern/sched_clutch.h
new file mode 100644 (file)
index 0000000..4cfad12
--- /dev/null
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_SCHED_CLUTCH_H_
+#define _KERN_SCHED_CLUTCH_H_
+
+#include <kern/sched.h>
+#include <machine/atomic.h>
+#include <kern/priority_queue.h>
+#include <kern/thread_group.h>
+#include <kern/bits.h>
+
+#if CONFIG_SCHED_CLUTCH
+
+/*
+ * Clutch ordering based on thread group flags (specified
+ * by the thread grouping mechanism). These properties
+ * define a thread group specific priority boost.
+ *
+ * The current implementation gives a slight boost to
+ * HIGH & MED thread groups which effectively deprioritizes
+ * daemon thread groups which are marked "Efficient" on AMP
+ * systems.
+ */
+#define SCHED_CLUTCH_TG_PRI_LOW           0x0
+#define SCHED_CLUTCH_TG_PRI_MED           0x1
+#define SCHED_CLUTCH_TG_PRI_HIGH          0x2
+
+/*
+ * For the current implementation, bound threads are not managed
+ * in the clutch hierarchy. This helper macro is used to indicate
+ * if the thread should be in the hierarchy.
+ */
+#define SCHED_CLUTCH_THREAD_ELIGIBLE(thread)    ((thread->bound_processor) == PROCESSOR_NULL)
+
+/*
+ *
+ * Clutch hierarchy locking protocol
+ *
+ * The scheduler clutch hierarchy is protected by a combination of
+ * atomics and pset lock.
+ * - All fields protected by the pset lock are annotated with (P)
+ * - All fields updated using atomics are annotated with (A)
+ * - All fields that are unprotected and are not updated after
+ *   initialization are annotated with (I)
+ */
+
+/*
+ * struct sched_clutch_root_bucket
+ *
+ * A clutch_root_bucket represents all threads across all thread groups
+ * that are in the same scheduler bucket (FG/IN/...). The clutch_root_bucket
+ * is selected for execution by the root level bucket selection algorithm
+ * which bases the decision on the clutch_root_bucket's deadline (EDF). The
+ * deadline for a root bucket is calculated based on its runnable timestamp
+ * and the worst-case-execution-latency values specified in sched_clutch_root_bucket_wcel[]
+ */
+struct sched_clutch_root_bucket {
+       /* (I) sched bucket represented by this root bucket */
+       uint8_t                         scrb_bucket;
+       /* (P) priority queue for all clutch buckets in this sched bucket */
+       struct priority_queue           scrb_clutch_buckets;
+       /* (P) priority queue entry to use for enqueueing root bucket into root prioq */
+       struct priority_queue_entry     scrb_pqlink;
+       /* (P) ageout deadline for this root bucket */
+       uint64_t                        scrb_deadline;
+       /* (P) warped deadline for root bucket */
+       uint64_t                        scrb_warped_deadline;
+       /* (P) warp remaining for root bucket */
+       uint64_t                        scrb_warp_remaining;
+};
+typedef struct sched_clutch_root_bucket *sched_clutch_root_bucket_t;
+
+/*
+ * struct sched_clutch_root
+ *
+ * A clutch_root represents the root of the hierarchy. It maintains a
+ * priority queue of all runnable root buckets. The clutch_root also
+ * maintains the information about the last clutch_root_bucket scheduled
+ * in order to implement the bucket level quantum. The bucket level quanta
+ * allow low priority buckets to get a "fair" chance of using the CPU even
+ * if they contain a bunch of short executing threads. The bucket quanta
+ * are configured using sched_clutch_root_bucket_quantum[]
+ */
+struct sched_clutch_root {
+       /* (P) root level priority; represents the highest runnable thread in the hierarchy */
+       int16_t                         scr_priority;
+       /* (P) total number of runnable threads in the hierarchy */
+       uint16_t                        scr_thr_count;
+       /* (P) root level urgency; represents the urgency of the whole hierarchy for pre-emption purposes */
+       int16_t                         scr_urgency;
+
+       /* (I) processor set this hierarchy belongs to */
+       processor_set_t                 scr_pset;
+       /*
+        * (P) list of all runnable clutch buckets across the system;
+        * allows easy iteration in the sched tick based timesharing code
+        */
+       queue_head_t                    scr_clutch_buckets;
+       /*
+        * (P) list of all runnable foreign buckets in this hierarchy;
+        * used for tracking thread groups which need to be migrated when
+        * psets are available
+        */
+       queue_head_t                    scr_foreign_buckets;
+
+       /* Root level bucket management */
+
+       /* (P) bitmap of all runnable clutch_root_buckets; used for root pri calculation */
+       bitmap_t                        scr_runnable_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)];
+       /* (P) bitmap of all runnable root buckets which have warps remaining */
+       bitmap_t                        scr_warp_available[BITMAP_LEN(TH_BUCKET_SCHED_MAX)];
+       /* (P) priority queue of all runnable clutch_root_buckets */
+       struct priority_queue           scr_root_buckets;
+       /* (P) storage for all possible clutch_root_buckets */
+       struct sched_clutch_root_bucket scr_buckets[TH_BUCKET_SCHED_MAX];
+};
+typedef struct sched_clutch_root *sched_clutch_root_t;
+
+/* forward declaration for sched_clutch */
+struct sched_clutch;
+
+/*
+ * sched_clutch_bucket_cpu_data_t
+ *
+ * Used for maintaining clutch bucket used and blocked time. The
+ * values are used for calculating the interactivity score for the
+ * clutch bucket.
+ *
+ * Since the CPU used/blocked calculation uses wide atomics, the data
+ * types used are different based on the platform.
+ */
+
+#if __arm64__
+
+#define CLUTCH_CPU_DATA_MAX             (UINT64_MAX)
+typedef uint64_t                        clutch_cpu_data_t;
+typedef unsigned __int128               clutch_cpu_data_wide_t;
+
+#else /* __arm64__ */
+
+#define CLUTCH_CPU_DATA_MAX             (UINT32_MAX)
+typedef uint32_t                        clutch_cpu_data_t;
+typedef uint64_t                        clutch_cpu_data_wide_t;
+
+#endif /* __arm64__ */
+
+typedef union sched_clutch_bucket_cpu_data {
+       struct {
+               /* Clutch bucket CPU used across all threads */
+               clutch_cpu_data_t       scbcd_cpu_used;
+               /* Clutch bucket voluntary blocked time */
+               clutch_cpu_data_t       scbcd_cpu_blocked;
+       } cpu_data;
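+       /* view of both counters as one value for wide atomic read-modify-write */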
+       clutch_cpu_data_wide_t          scbcd_cpu_data_packed;
+} sched_clutch_bucket_cpu_data_t;
+
+/*
+ * struct sched_clutch_bucket
+ *
+ * A sched_clutch_bucket represents the set of threads for a thread
+ * group at a particular scheduling bucket. It maintains information
+ * about the CPU usage & blocking behavior of all threads part of
+ * the clutch_bucket and maintains the timesharing attributes for
+ * threads in its runq. It uses the decay based algorithm to timeshare
+ * among threads in the runq.
+ */
+struct sched_clutch_bucket {
+       /* (I) bucket for the clutch_bucket */
+       uint8_t                         scb_bucket;
+       /* (P) priority of the clutch bucket */
+       uint8_t                         scb_priority;
+       /* (P) interactivity score of the clutch bucket */
+       uint8_t                         scb_interactivity_score;
+       /* (P) flag to indicate if the bucket is a foreign bucket */
+       bool                            scb_foreign;
+
+       /* Properties used for timesharing threads in this clutch_bucket */
+
+       /* (P) number of threads in this clutch_bucket; should match runq.count */
+       uint16_t                        scb_thr_count;
+       /* (A) run count (running + runnable) for this clutch_bucket */
+       uint16_t _Atomic                scb_run_count;
+       /* (A) sched tick when the clutch bucket load/shifts were updated */
+       uint32_t _Atomic                scb_timeshare_tick;
+       /* (A) priority shifts for threads in the clutch_bucket */
+       uint32_t _Atomic                scb_pri_shift;
+       /* (P) linkage for all clutch_buckets in a root bucket; used for tick operations */
+       queue_chain_t                   scb_listlink;
+
+
+       /* (P) timestamp for the last time the interactivity score was updated */
+       uint64_t                        scb_interactivity_ts;
+       /* (P) timestamp for the last time the clutch_bucket blocked */
+       uint64_t                        scb_blocked_ts;
+
+       /* (A) CPU usage information for the clutch bucket */
+       sched_clutch_bucket_cpu_data_t  scb_cpu_data;
+
+       /* (P) linkage for clutch_bucket in root_bucket priority queue */
+       struct priority_queue_entry     scb_pqlink;
+       /* (I) clutch to which this clutch bucket belongs */
+       struct sched_clutch             *scb_clutch;
+       /* (A) pointer to the root of the hierarchy this bucket is in */
+       struct sched_clutch_root        *scb_root;
+       /* (P) priority queue of threads based on their promoted/base priority */
+       struct priority_queue           scb_clutchpri_prioq;
+       /* (P) runq of threads in clutch_bucket */
+       struct run_queue                scb_runq;
+};
+typedef struct sched_clutch_bucket *sched_clutch_bucket_t;
+
+
+/*
+ * struct sched_clutch
+ *
+ * A sched_clutch is a 1:1 mapping to a thread group. It maintains the
+ * storage for all clutch buckets for this thread group and some properties
+ * of the thread group (such as flags etc.)
+ */
+struct sched_clutch {
+       /*
+        * (A) number of runnable threads in sched_clutch; needs to be atomic
+        * to support cross cluster sched_clutch migrations.
+        */
+       uint16_t _Atomic                sc_thr_count;
+       /*
+        * Grouping specific parameters. Currently the implementation only
+        * supports thread_group based grouping.
+        */
+       union {
+               /* (A) priority specified by the thread grouping mechanism */
+               uint8_t _Atomic         sc_tg_priority;
+       };
+       union {
+               /* (I) Pointer to thread group */
+               struct thread_group     *sc_tg;
+       };
+       /* (I) storage for all clutch_buckets for this clutch */
+       struct sched_clutch_bucket      sc_clutch_buckets[TH_BUCKET_SCHED_MAX];
+};
+typedef struct sched_clutch *sched_clutch_t;
+
+
+/* Clutch lifecycle management */
+void sched_clutch_init_with_thread_group(sched_clutch_t, struct thread_group *);
+void sched_clutch_destroy(sched_clutch_t);
+
+/* Clutch thread membership management */
+void sched_clutch_thread_clutch_update(thread_t, sched_clutch_t, sched_clutch_t);
+
+/* Clutch timesharing stats management */
+uint32_t sched_clutch_thread_run_bucket_incr(thread_t, sched_bucket_t);
+uint32_t sched_clutch_thread_run_bucket_decr(thread_t, sched_bucket_t);
+void sched_clutch_cpu_usage_update(thread_t, uint64_t);
+uint32_t sched_clutch_thread_pri_shift(thread_t, sched_bucket_t);
+
+/* Clutch properties accessors */
+uint32_t sched_clutch_root_count(sched_clutch_root_t);
+
+/* Grouping specific external routines */
+extern sched_clutch_t sched_clutch_for_thread(thread_t);
+
+#endif /* CONFIG_SCHED_CLUTCH */
+
+#endif /* _KERN_SCHED_CLUTCH_H_ */
diff --git a/osfmk/kern/sched_clutch.md b/osfmk/kern/sched_clutch.md
new file mode 100644 (file)
index 0000000..64da1a5
--- /dev/null
@@ -0,0 +1,151 @@
+# Clutch Scheduler
+
+## Background
+
+The XNU kernel runs on a variety of platforms with strong requirements for being dynamic and efficient. It needs to deliver on a wide range of requirements, from quick access to CPU for latency sensitive workloads (e.g. UI interactions, multimedia recording/playback) to starvation avoidance for lower priority batch workloads (e.g. photos sync, source compilation). The traditional Mach scheduler attempts to achieve these goals by expecting all threads in the system to be tagged with a priority number, treating high priority threads as interactive threads and low priority threads as batch threads. It then uses a timesharing model based on priority decay to penalize threads as they use CPU in order to achieve fairshare and starvation avoidance. This approach, however, loses the relationship between threads and higher level user workloads, making it impossible for the scheduler to reason about the workload as a whole, which is what the end user cares about. One artifact of this thread based timesharing approach is that threads at the same priority level are treated similarly irrespective of which user workload they are servicing, which often leads to non-optimal decisions. It ultimately leads to priority inflation across the platform, with individual subsystems raising their priority to avoid starvation and timesharing with other unrelated threads. The traditional thread level scheduling model also suffers from the following issues:
+
+* **Inaccurate accounting**: CPU accounting at the thread level incentivizes creating more threads on the system. Also, in the world of GCD and workqueues, where threads are created and destroyed rapidly, thread level accounting is inaccurate and allows excessive CPU usage.
+* **Poor isolation**: In the Mach scheduler, timesharing is achieved by decaying the priority of threads depending on global system load. As a result, a burst of activity at the same or a lower priority band can cause decay for the App/UI thread, leading to poor performance and responsiveness. The scheduler offers very limited isolation between threads working on latency sensitive UI workloads and threads performing bulk non-latency sensitive operations.
+
+## Clutch Scheduler Design
+
+In order to reason about higher level user workloads, the clutch scheduler schedules groups of threads instead of individual threads. Breaking away from the traditional single-tier scheduling model, it implements a hierarchical scheduler which makes optimal decisions at various thread grouping levels. The hierarchical scheduler, as it's implemented today, has 3 levels:
+
+* Scheduling Bucket Level
+* Thread Group Level
+* Thread Level
+
+### Scheduling Bucket Level
+
+The highest level is the scheduling bucket level which decides which class of threads should be picked for execution. The kernel maintains a scheduling bucket per thread, defined based on the base/scheduling priority of the thread. These scheduling buckets roughly map to the QoS classes used by the OS runtime to define performance expectations for various pieces of work. All runnable threads with the same scheduling bucket are represented by a single entry at this level. These entries are known as *root buckets* throughout the implementation. The goal of this level is to provide low latency access to the CPU for high QoS classes while ensuring starvation avoidance for the low QoS classes.
+
+**Implementation**
+
+The scheduling bucket level uses an Earliest Deadline First (EDF) algorithm to decide which root bucket should be selected next for execution. Each root bucket with runnable threads is represented as an entry in a priority queue which is ordered by the bucket's deadline. The bucket selection algorithm simply selects the root bucket with the earliest deadline in the priority queue. The deadline for a root bucket is calculated based on its first-runnable timestamp and its **Worst Case Execution Latency (WCEL)** value which is pre-defined for each bucket. The WCEL values are picked based on the decay curve followed by the Mach timesharing algorithm to allow the system to function similarly to the existing scheduler from a higher level perspective.
+
+```
+static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
+        SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
+        0,                                              /* FG */
+        37500,                                          /* IN (37.5ms) */
+        75000,                                          /* DF (75ms) */
+        150000,                                         /* UT (150ms) */
+        250000                                          /* BG (250ms) */
+};
+```
+
+Whenever a root bucket transitions from non-runnable to runnable, its deadline is set to (now + WCEL[bucket]). This ensures that the bucket would be scheduled within WCEL[bucket] even in a heavily loaded system. Once the root bucket is picked for execution, its deadline is pushed by WCEL[bucket] into the future. This basic implementation of EDF suffers from one major issue. In a heavily loaded system, it is possible that the higher buckets have used up enough CPU in the recent past that they fall behind the lower buckets in deadline order. Now, if a small burst of user-critical workload shows up, the high bucket has to wait for the lower buckets to run before it can get CPU, which might lead to performance issues. In order to address that, the bucket level scheduler implements a root bucket warp mechanism. Each bucket is provided a warp value which is refreshed whenever the bucket is selected due to its deadline expiring.
+
+```
+static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
+        SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
+        8000,                                           /* FG (8ms)*/
+        4000,                                           /* IN (4ms) */
+        2000,                                           /* DF (2ms) */
+        1000,                                           /* UT (1ms) */
+        0                                               /* BG (0ms) */
+};
+```
+The root bucket selection logic finds the earliest deadline bucket and then checks if there are any higher (in natural priority order) buckets that have warp remaining. If there is such a higher bucket, it would select that bucket and effectively open a warp window. During this warp window the scheduler would continue to select this warping bucket over lower priority buckets. Once the warping bucket is drained or the warp window expires, the scheduler goes back to scheduling buckets in deadline order. This mechanism provides a bounded advantage to higher level buckets to allow them to remain responsive in the presence of bursty workloads.
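+
+The selection rule can be summarized with a small self-contained sketch (illustrative names and structure only; the real logic lives in the root bucket selection path and additionally handles runnable bitmaps and the FIXPRI special case described below):
+
+```
+#include <stdint.h>
+
+#define TOY_NBUCKETS      6
+#define TOY_WARP_UNOPENED UINT64_MAX   /* no warp window currently open */
+
+struct toy_root_bucket {
+        int      runnable;             /* bucket has runnable threads */
+        uint64_t deadline;             /* first-runnable timestamp + WCEL[bucket] */
+        uint64_t warp_remaining;       /* warp budget for this bucket */
+        uint64_t warp_window_end;      /* end of the currently open warp window */
+};
+
+/* Per-bucket warp budgets, analogous to sched_clutch_root_bucket_warp[]. */
+static uint64_t toy_warp_budget[TOY_NBUCKETS];
+
+/* Index 0 is the highest natural priority; returns the selected index or -1. */
+static int
+toy_root_bucket_select(struct toy_root_bucket b[TOY_NBUCKETS], uint64_t now)
+{
+        int edf = -1;
+
+        /* EDF pass: find the runnable bucket with the earliest deadline. */
+        for (int i = 0; i < TOY_NBUCKETS; i++) {
+                if (b[i].runnable && (edf < 0 || b[i].deadline < b[edf].deadline)) {
+                        edf = i;
+                }
+        }
+        if (edf < 0) {
+                return -1;             /* hierarchy is empty */
+        }
+
+        /* Warp pass: a higher bucket with warp remaining beats EDF order. */
+        for (int i = 0; i < edf; i++) {
+                if (!b[i].runnable || b[i].warp_remaining == 0) {
+                        continue;
+                }
+                if (b[i].warp_window_end == TOY_WARP_UNOPENED) {
+                        b[i].warp_window_end = now + b[i].warp_remaining;
+                }
+                if (b[i].warp_window_end > now) {
+                        return i;      /* still inside the warp window */
+                }
+                b[i].warp_remaining = 0;       /* window expired: budget drained */
+        }
+
+        /* Deadline-order selection refreshes the bucket's warp budget; the
+         * caller then pushes b[edf].deadline by WCEL[edf] into the future. */
+        b[edf].warp_remaining = toy_warp_budget[edf];
+        b[edf].warp_window_end = TOY_WARP_UNOPENED;
+        return edf;
+}
+```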
+
+The FIXPRI bucket is special-cased since it contains extremely latency sensitive threads. Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is important to maintain some native priority order between those buckets. The policy implemented here is to compare the highest clutch buckets of the two root buckets; if the AboveUI bucket is higher, schedule it immediately. Otherwise fall through to the deadline based scheduling as described above. The implementation allows extremely low latency CPU access for AboveUI threads while supporting the use case of high priority timeshare threads contending with lower priority fixed priority threads, which is observed in some media workloads. Since the timeshare bucket will eventually drop in priority as it consumes CPU, this model provides the desired behavior for timeshare threads above UI.
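+
+A minimal sketch of that tie-break, under assumed names (the real check happens inside root bucket selection before the deadline pass):
+
+```
+#include <stdbool.h>
+
+/* above_ui_pri / fg_pri: priority of the highest clutch bucket in the
+ * FIXPRI and FG root buckets respectively (illustrative parameters). */
+static bool
+toy_pick_fixpri_first(bool fixpri_runnable, int above_ui_pri, int fg_pri)
+{
+        if (!fixpri_runnable) {
+                return false;
+        }
+        /* AboveUI wins outright when its highest clutch bucket is higher;
+         * otherwise the caller falls through to deadline order. */
+        return above_ui_pri > fg_pri;
+}
+```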
+
+The scheduling bucket level also maintains a bitmap of runnable root buckets to allow quick checks for empty hierarchy and root level priority calculation. 
+
+The EDF algorithm is the best choice for this level due to the following reasons:
+
+* Deadline based scheduling allows the scheduler to define strict bounds on worst case execution latencies for all scheduling buckets.
+* The EDF algorithm is dynamic based on bucket runnability and selection. Since all deadline updates are computationally cheap, the algorithm can maintain up-to-date information without measurable overhead.
+* It achieves the goals of maintaining low scheduling latency for high buckets and starvation avoidance for low buckets efficiently.
+* Since the bucket level scheduler deals with a fixed small number of runnable buckets in the worst case, it is easy to configure in terms of defining deadlines, warps etc.
+
+### Thread Group Level
+
+The second level is the “thread group” level which decides which thread group within a bucket should be selected next for execution. Thread groups are a mechanism introduced with the AMP scheduler which represent a collection of threads working on behalf of a specific workload. Each thread group with runnable threads within a bucket is represented as an entry at this level. These entries are known as *clutch buckets* throughout the implementation. The goal of this level is to share the CPU among various user workloads with preference to interactive applications over compute-intensive batch workloads.
+
+**Implementation**
+
+The thread group level implements a variation of the FreeBSD ULE scheduler to decide which clutch bucket should be selected next for execution. Each clutch bucket with runnable threads is represented as an entry in a priority queue which is ordered by clutch bucket priorities. The clutch bucket selection algorithm simply selects the clutch bucket with the highest priority in the priority queue. The priority calculation for the clutch buckets is based on the following factors:
+
+* **Highest runnable thread in the clutch bucket**: The clutch bucket maintains a priority queue containing its threads, ordered by their promoted or base priority (whichever property made the thread eligible to be part of that clutch bucket). The priority of the highest such thread is used to calculate the base priority of the clutch bucket. Using both base and sched priority allows the scheduler to honor priority differences specified from userspace via SPIs, priority boosts due to priority inheritance mechanisms like turnstiles, and other priority affecting mechanisms outside the core scheduler.
+* **Interactivity score**: The scheduler calculates an interactivity score based on the ratio of voluntary blocking time to CPU usage time for the clutch bucket as a whole. This score allows the scheduler to prefer highly interactive thread groups over batch processing compute intensive thread groups.
+* **Thread Group Type**: In order to improve battery life on AMP devices, the OS marks daemon thread groups as “Efficient”. These thread groups typically represent work that is not directly related to the user requested workload. The scheduler de-prioritizes these thread groups over others by factoring this into the priority calculation.
+
+The interactivity score based algorithm is well suited for this level for the following reasons:
+
+* It allows for a fair sharing of CPU among thread groups based on their recent behavior. Since the algorithm only looks at recent CPU usage history, it also adapts to changing behavior quickly.
+* Since the priority calculation is fairly cheap, the scheduler is able to maintain up-to-date information about all thread groups, which leads to better scheduling decisions.
+* Thread groups provide a convenient abstraction for groups of threads working together on a user workload. Basing scheduling decisions on this abstraction allows the system to make interesting choices, such as preferring apps over daemons, which is typically better for system responsiveness.
+
+### Thread Level
+
+At the lowest level the scheduler decides which thread within a clutch bucket should be selected next for execution. Each runnable thread in the clutch bucket is represented as an entry in a runqueue organized by the schedpri of the threads. The thread selection algorithm simply selects the highest priority thread in the runqueue. The schedpri calculation for the threads is based on the traditional Mach scheduling algorithm, which uses load and CPU usage to decay a thread's priority. The thread decay model is better suited to this level than to the global scheduler because the load calculation only accounts for threads in the same clutch bucket. Since all threads in the same clutch bucket belong to the same thread group and scheduling bucket, this algorithm provides quick CPU access for latency sensitive threads within the clutch bucket without impacting other non-related threads in the system.
+
+**Implementation**
+
+The thread level scheduler implements the Mach timesharing algorithm to decide which thread within the clutch bucket should be selected next for execution. All runnable threads in a clutch bucket are inserted into the runqueue based on the schedpri. The scheduler calculates the schedpri of the threads in a clutch bucket based on the number of runnable threads in the clutch bucket and the CPU usage of individual threads. The load information is updated every scheduler tick and the threads use this information for priority decay calculation as they use CPU. The priority decay algorithm attempts to reward bursty interactive threads and penalize CPU intensive threads. Once a thread is selected for running, it is assigned a quantum which is based on the scheduling bucket it belongs to. The quanta for various buckets are defined statically as:
+
+```
+static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
+        10000,                                          /* FIXPRI (10ms) */
+        10000,                                          /* FG (10ms) */
+        8000,                                           /* IN (8ms) */
+        6000,                                           /* DF (6ms) */
+        4000,                                           /* UT (4ms) */
+        2000                                            /* BG (2ms) */
+};
+```
+
+The per-bucket thread quantum allows the scheduler to bound the worst case execution latency for a low priority thread that has been starved by higher priority threads.
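+
+As a sketch, assigning the quantum at selection time might look like the following; the `toy_thread` type is a stand-in for the kernel thread structure and the microsecond-to-absolute-time conversion is omitted, both assumptions for illustration:
+
+```
+#include <stdint.h>
+
+enum { TH_BUCKET_SCHED_MAX = 6 };
+
+static const uint32_t thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
+        10000, 10000, 8000, 6000, 4000, 2000,
+};
+
+struct toy_thread {
+        int      th_sched_bucket;    /* index into the quantum table */
+        uint64_t quantum_remaining;  /* microseconds, in this sketch */
+};
+
+/* Hand the selected thread the quantum of its scheduling bucket */
+static void
+toy_thread_quantum_init(struct toy_thread *t)
+{
+        t->quantum_remaining = thread_quantum_us[t->th_sched_bucket];
+}
+```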
+
+## Scheduler Priority Calculations
+
+### Root Priority Calculation
+
+The scheduler maintains a root level priority for the hierarchy in order to make decisions regarding pre-emptions and thread selection. The root priority is updated as threads are inserted/removed from the hierarchy. The root level also maintains the urgency bits to help with pre-emption decisions. Since the root level priority/urgency is used for pre-emption decisions, it is based on the threads in the hierarchy and is calculated as follows:
+
+```
+Root Priority Calculation:
+* If AboveUI bucket is runnable, 
+*     Compare priority of AboveUI highest clutch bucket (CBUI) with Timeshare FG highest clutch bucket (CBFG)
+*     If pri(CBUI) >= pri(CBFG), select CBUI
+* Otherwise find the (non-AboveUI) highest priority root bucket that is runnable and select its highest clutch bucket
+* Find the highest priority (promoted or base pri) thread within that clutch bucket and assign that as root priority
+
+Root Urgency Calculation:
+* On thread insertion into the hierarchy, increment the root level urgency based on thread's sched_pri
+* On thread removal from the hierarchy, decrement the root level urgency based on thread's sched_pri
+```
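+
+The urgency bookkeeping can be sketched as follows; the `root_urgency` counter, the `priority_is_urgent()` predicate and the `BASEPRI_PREEMPT` cutoff value are assumptions for illustration:
+
+```
+#include <stdbool.h>
+
+#define BASEPRI_PREEMPT 92      /* assumed urgency cutoff */
+
+static int root_urgency;        /* protected by the pset lock in practice */
+
+static bool
+priority_is_urgent(int sched_pri)
+{
+        return sched_pri >= BASEPRI_PREEMPT;
+}
+
+/* Called on thread insertion into / removal from the hierarchy */
+static void
+root_urgency_update(int sched_pri, bool inserting)
+{
+        if (priority_is_urgent(sched_pri)) {
+                root_urgency += inserting ? 1 : -1;
+        }
+}
+```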
+
+### Root Bucket Priority Calculation
+
+The root bucket priority is simply the deadline of the root bucket, which is calculated by adding the WCEL of the bucket to the timestamp at which the root bucket became runnable.
+
+```
+root-bucket priority = deadline = runnable-timestamp + WCEL[bucket]
+```
+
+### Clutch Bucket Priority Calculation
+
+As mentioned earlier, the priority value of a clutch bucket is calculated based on the highest runnable thread, the interactivity score and the thread group type. The actual calculation algorithm is as follows:
+
+```
+* Find the highest runnable thread (promoted or basepri) in the clutch bucket (maxpri)
+* Check if the thread group for this clutch bucket is marked Efficient. 
+*      If not, assign a positive boost value (clutch_boost)
+* Calculate the ratio of CPU blocked and CPU used for the clutch bucket.
+*      If blocked > used, assign a score (interactivity_score) in the higher range.
+*      Else, assign a score (interactivity_score) in the lower range.
+* clutch-bucket priority = maxpri + clutch_boost + interactivity_score
+```
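+
+A compilable sketch of this calculation is shown below; the field names, the boost value and the score range are illustrative assumptions rather than the scheduler's actual tunables:
+
+```
+#include <stdbool.h>
+#include <stdint.h>
+
+#define CLUTCH_BOOST       16   /* assumed boost for non-Efficient groups */
+#define INTERACTIVITY_MAX   8   /* assumed score range: 0..8 */
+
+struct clutch_bucket {
+        int      maxpri;        /* highest runnable thread (promoted/basepri) */
+        bool     tg_efficient;  /* thread group marked "Efficient" */
+        uint64_t cpu_blocked;   /* voluntary blocking time */
+        uint64_t cpu_used;      /* CPU usage time */
+};
+
+static int
+interactivity_score(uint64_t blocked, uint64_t used)
+{
+        if (blocked > used) {
+                /* higher range [MAX/2 .. MAX]: more blocking, higher score */
+                return (int)((INTERACTIVITY_MAX / 2) +
+                    ((INTERACTIVITY_MAX / 2) * (blocked - used)) / blocked);
+        }
+        /* lower range [0 .. MAX/2]: more CPU use, lower score */
+        return used ? (int)(((INTERACTIVITY_MAX / 2) * blocked) / used) : 0;
+}
+
+static int
+clutch_bucket_priority(const struct clutch_bucket *cb)
+{
+        int boost = cb->tg_efficient ? 0 : CLUTCH_BOOST;
+        return cb->maxpri + boost +
+            interactivity_score(cb->cpu_blocked, cb->cpu_used);
+}
+```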
+
+### Thread Priority Calculation
+
+The thread priority calculation is based on the Mach timesharing algorithm. It is calculated in the following manner:
+
+```
+* Every scheduler tick, snapshot the load for the clutch bucket
+* Use the load value to calculate the priority shift values for all threads in the clutch bucket
+* thread priority = base priority - (thread CPU usage >> priority shift)
+```
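+
+A minimal sketch of the decay step, assuming a toy thread type and a precomputed `pri_shift` (in the real scheduler the shift is looked up from a load-indexed table every scheduler tick):
+
+```
+#include <stdint.h>
+
+struct toy_thread {
+        int      base_pri;      /* base priority of the thread */
+        uint32_t cpu_usage;     /* decayed CPU usage accumulator */
+};
+
+/* pri_shift is derived from the clutch bucket load every scheduler tick */
+static int
+toy_thread_sched_pri(const struct toy_thread *t, int pri_shift)
+{
+        int pri = t->base_pri - (int)(t->cpu_usage >> pri_shift);
+        return (pri < 0) ? 0 : pri;     /* clamp to the valid priority range */
+}
+```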
index e7f506c06d70d6d1f337ec549721095c971d5be4..0f04cd427de14db056a4fda5f995f5922fcb5515 100644 (file)
@@ -56,7 +56,8 @@ static void
 sched_dualq_thread_update_scan(sched_update_scan_context_t scan_context);
 
 static boolean_t
-sched_dualq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
+sched_dualq_processor_enqueue(processor_t processor, thread_t thread,
+    sched_options_t options);
 
 static boolean_t
 sched_dualq_processor_queue_remove(processor_t processor, thread_t thread);
@@ -126,7 +127,6 @@ const struct sched_dispatch_table sched_dualq_dispatch = {
        .processor_runq_stats_count_sum                 = sched_dualq_runq_stats_count_sum,
        .processor_bound_count                          = sched_dualq_processor_bound_count,
        .thread_update_scan                             = sched_dualq_thread_update_scan,
-       .direct_dispatch_to_idle_processors             = FALSE,
        .multiple_psets_enabled                         = TRUE,
        .sched_groups_enabled                           = FALSE,
        .avoid_processor_enabled                        = TRUE,
@@ -143,6 +143,10 @@ const struct sched_dispatch_table sched_dualq_dispatch = {
        .check_spill                                    = sched_check_spill,
        .ipi_policy                                     = sched_ipi_policy,
        .thread_should_yield                            = sched_thread_should_yield,
+       .run_count_incr                                 = sched_run_incr,
+       .run_count_decr                                 = sched_run_decr,
+       .update_thread_bucket                           = sched_update_thread_bucket,
+       .pset_made_schedulable                          = sched_pset_made_schedulable,
 };
 
 __attribute__((always_inline))
@@ -238,7 +242,7 @@ sched_dualq_choose_thread(
        }
 
        if (processor->is_SMT) {
-               thread_t potential_thread = run_queue_dequeue(chosen_runq, SCHED_PEEK | SCHED_HEADQ);
+               thread_t potential_thread = run_queue_peek(chosen_runq);
                if (potential_thread == THREAD_NULL) {
                        return THREAD_NULL;
                }
@@ -280,7 +284,7 @@ static boolean_t
 sched_dualq_processor_enqueue(
        processor_t       processor,
        thread_t          thread,
-       integer_t         options)
+       sched_options_t   options)
 {
        run_queue_t     rq = dualq_runq_for_thread(processor, thread);
        boolean_t       result;
index af61fd5524f76ebe3aac8598f7e4261a1510c1f5..5f663aba77b28e92765b6ed471056e1c2dd55e6b 100644 (file)
@@ -138,7 +138,7 @@ static boolean_t
 sched_grrr_processor_enqueue(
        processor_t                    processor,
        thread_t                       thread,
-       integer_t                      options);
+       sched_options_t                options);
 
 static void
 sched_grrr_processor_queue_shutdown(
@@ -219,7 +219,6 @@ const struct sched_dispatch_table sched_grrr_dispatch = {
        .processor_runq_stats_count_sum                 = sched_grrr_processor_runq_stats_count_sum,
        .processor_bound_count                          = sched_grrr_processor_bound_count,
        .thread_update_scan                             = sched_grrr_thread_update_scan,
-       .direct_dispatch_to_idle_processors             = TRUE,
        .multiple_psets_enabled                         = TRUE,
        .sched_groups_enabled                           = FALSE,
        .avoid_processor_enabled                        = FALSE,
@@ -236,6 +235,10 @@ const struct sched_dispatch_table sched_grrr_dispatch = {
        .check_spill                                    = sched_check_spill,
        .ipi_policy                                     = sched_ipi_policy,
        .thread_should_yield                            = sched_thread_should_yield,
+       .run_count_incr                                 = sched_run_incr,
+       .run_count_decr                                 = sched_run_decr,
+       .update_thread_bucket                           = sched_update_thread_bucket,
+       .pset_made_schedulable                          = sched_pset_made_schedulable,
 };
 
 extern int      max_unsafe_quanta;
@@ -348,7 +351,7 @@ static boolean_t
 sched_grrr_processor_enqueue(
        processor_t                    processor,
        thread_t                       thread,
-       integer_t                      options __unused)
+       sched_options_t                options __unused)
 {
        grrr_run_queue_t                rq = &processor->grrr_runq;
        boolean_t                               result;
index 9b40847776c0825631e87aecc695e6d6ca939546..a96f7bd639e6172b6a66918d58099c40a52f3a41 100644 (file)
@@ -246,7 +246,8 @@ static void
 sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);
 
 static boolean_t
-sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
+sched_multiq_processor_enqueue(processor_t processor, thread_t thread,
+    sched_options_t options);
 
 static boolean_t
 sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);
@@ -319,7 +320,6 @@ const struct sched_dispatch_table sched_multiq_dispatch = {
        .processor_runq_stats_count_sum                 = sched_multiq_runq_stats_count_sum,
        .processor_bound_count                          = sched_multiq_processor_bound_count,
        .thread_update_scan                             = sched_multiq_thread_update_scan,
-       .direct_dispatch_to_idle_processors             = FALSE,
        .multiple_psets_enabled                         = FALSE,
        .sched_groups_enabled                           = TRUE,
        .avoid_processor_enabled                        = TRUE,
@@ -336,6 +336,10 @@ const struct sched_dispatch_table sched_multiq_dispatch = {
        .check_spill                                    = sched_check_spill,
        .ipi_policy                                     = sched_ipi_policy,
        .thread_should_yield                            = sched_thread_should_yield,
+       .run_count_incr                                 = sched_run_incr,
+       .run_count_decr                                 = sched_run_decr,
+       .update_thread_bucket                           = sched_update_thread_bucket,
+       .pset_made_schedulable                          = sched_pset_made_schedulable,
 };
 
 
@@ -494,9 +498,9 @@ entry_queue_first_entry(entry_queue_t rq)
 {
        assert(rq->count != 0);
 
-       queue_t queue = &rq->queues[rq->highq];
+       circle_queue_t queue = &rq->queues[rq->highq];
 
-       sched_entry_t entry = qe_queue_first(queue, struct sched_entry, entry_links);
+       sched_entry_t entry = cqe_queue_first(queue, struct sched_entry, entry_links);
 
        assert(entry->sched_pri == rq->highq);
 
@@ -527,9 +531,9 @@ group_first_thread(sched_group_t group)
 
        assert(rq->count != 0);
 
-       queue_t queue = &rq->queues[rq->highq];
+       circle_queue_t queue = &rq->queues[rq->highq];
 
-       thread_t thread = qe_queue_first(queue, struct thread, runq_links);
+       thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
 
        assert(thread != THREAD_NULL);
        assert_thread_magic(thread);
@@ -546,7 +550,7 @@ group_first_thread(sched_group_t group)
 static void
 entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
 {
-       queue_t q;
+       circle_queue_t q;
        sched_entry_t elem;
 
        assert(queue_chain_linked(&entry->entry_links));
@@ -554,7 +558,7 @@ entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pr
 
        q = &runq->queues[expected_pri];
 
-       qe_foreach_element(elem, q, entry_links) {
+       cqe_foreach_element(elem, q, entry_links) {
                if (elem == entry) {
                        return;
                }
@@ -567,7 +571,7 @@ entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pr
 static void
 sched_group_check_thread(sched_group_t group, thread_t thread)
 {
-       queue_t q;
+       circle_queue_t q;
        thread_t elem;
        int pri = thread->sched_pri;
 
@@ -575,7 +579,7 @@ sched_group_check_thread(sched_group_t group, thread_t thread)
 
        q = &group->runq.queues[pri];
 
-       qe_foreach_element(elem, q, runq_links) {
+       cqe_foreach_element(elem, q, runq_links) {
                if (elem == thread) {
                        return;
                }
@@ -635,19 +639,19 @@ static sched_entry_t
 entry_queue_dequeue_entry(entry_queue_t rq)
 {
        sched_entry_t   sched_entry;
-       queue_t         queue = &rq->queues[rq->highq];
+       circle_queue_t  queue = &rq->queues[rq->highq];
 
        assert(rq->count > 0);
-       assert(!queue_empty(queue));
+       assert(!circle_queue_empty(queue));
 
-       sched_entry = qe_dequeue_head(queue, struct sched_entry, entry_links);
+       sched_entry = cqe_dequeue_head(queue, struct sched_entry, entry_links);
 
        SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
        rq->count--;
        if (SCHED(priority_is_urgent)(rq->highq)) {
                rq->urgency--; assert(rq->urgency >= 0);
        }
-       if (queue_empty(queue)) {
+       if (circle_queue_empty(queue)) {
                rq_bitmap_clear(rq->bitmap, rq->highq);
                rq->highq = bitmap_first(rq->bitmap, NRQS);
        }
@@ -667,13 +671,13 @@ entry_queue_enqueue_entry(
        integer_t     options)
 {
        int             sched_pri = entry->sched_pri;
-       queue_t         queue = &rq->queues[sched_pri];
+       circle_queue_t  queue = &rq->queues[sched_pri];
        boolean_t       result = FALSE;
 
        assert(entry->runq == 0);
 
-       if (queue_empty(queue)) {
-               enqueue_tail(queue, &entry->entry_links);
+       if (circle_queue_empty(queue)) {
+               circle_enqueue_tail(queue, &entry->entry_links);
 
                rq_bitmap_set(rq->bitmap, sched_pri);
                if (sched_pri > rq->highq) {
@@ -682,9 +686,9 @@ entry_queue_enqueue_entry(
                }
        } else {
                if (options & SCHED_TAILQ) {
-                       enqueue_tail(queue, &entry->entry_links);
+                       circle_enqueue_tail(queue, &entry->entry_links);
                } else {
-                       enqueue_head(queue, &entry->entry_links);
+                       circle_enqueue_head(queue, &entry->entry_links);
                }
        }
        if (SCHED(priority_is_urgent)(sched_pri)) {
@@ -722,7 +726,7 @@ entry_queue_remove_entry(
                rq->urgency--; assert(rq->urgency >= 0);
        }
 
-       if (queue_empty(&rq->queues[sched_pri])) {
+       if (circle_queue_empty(&rq->queues[sched_pri])) {
                /* update run queue status */
                rq_bitmap_clear(rq->bitmap, sched_pri);
                rq->highq = bitmap_first(rq->bitmap, NRQS);
@@ -737,8 +741,8 @@ entry_queue_change_entry(
        sched_entry_t entry,
        integer_t     options)
 {
-       int     sched_pri   = entry->sched_pri;
-       queue_t queue       = &rq->queues[sched_pri];
+       int            sched_pri   = entry->sched_pri;
+       circle_queue_t queue       = &rq->queues[sched_pri];
 
 #if defined(MULTIQ_SANITY_CHECK)
        if (multiq_sanity_check) {
@@ -746,10 +750,11 @@ entry_queue_change_entry(
        }
 #endif
 
+       circle_dequeue(queue, &entry->entry_links);
        if (options & SCHED_TAILQ) {
-               re_queue_tail(queue, &entry->entry_links);
+               circle_enqueue_tail(queue, &entry->entry_links);
        } else {
-               re_queue_head(queue, &entry->entry_links);
+               circle_enqueue_head(queue, &entry->entry_links);
        }
 }
 /*
@@ -764,14 +769,14 @@ group_run_queue_dequeue_thread(
        boolean_t     *queue_empty)
 {
        thread_t        thread;
-       queue_t         queue = &rq->queues[rq->highq];
+       circle_queue_t  queue = &rq->queues[rq->highq];
 
        assert(rq->count > 0);
-       assert(!queue_empty(queue));
+       assert(!circle_queue_empty(queue));
 
        *thread_pri = rq->highq;
 
-       thread = qe_dequeue_head(queue, struct thread, runq_links);
+       thread = cqe_dequeue_head(queue, struct thread, runq_links);
        assert_thread_magic(thread);
 
        SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
@@ -779,7 +784,7 @@ group_run_queue_dequeue_thread(
        if (SCHED(priority_is_urgent)(rq->highq)) {
                rq->urgency--; assert(rq->urgency >= 0);
        }
-       if (queue_empty(queue)) {
+       if (circle_queue_empty(queue)) {
                rq_bitmap_clear(rq->bitmap, rq->highq);
                rq->highq = bitmap_first(rq->bitmap, NRQS);
                *queue_empty = TRUE;
@@ -801,14 +806,14 @@ group_run_queue_enqueue_thread(
        integer_t      thread_pri,
        integer_t      options)
 {
-       queue_t         queue = &rq->queues[thread_pri];
+       circle_queue_t  queue = &rq->queues[thread_pri];
        boolean_t       result = FALSE;
 
        assert(thread->runq == PROCESSOR_NULL);
        assert_thread_magic(thread);
 
-       if (queue_empty(queue)) {
-               enqueue_tail(queue, &thread->runq_links);
+       if (circle_queue_empty(queue)) {
+               circle_enqueue_tail(queue, &thread->runq_links);
 
                rq_bitmap_set(rq->bitmap, thread_pri);
                if (thread_pri > rq->highq) {
@@ -817,9 +822,9 @@ group_run_queue_enqueue_thread(
                result = TRUE;
        } else {
                if (options & SCHED_TAILQ) {
-                       enqueue_tail(queue, &thread->runq_links);
+                       circle_enqueue_tail(queue, &thread->runq_links);
                } else {
-                       enqueue_head(queue, &thread->runq_links);
+                       circle_enqueue_head(queue, &thread->runq_links);
                }
        }
        if (SCHED(priority_is_urgent)(thread_pri)) {
@@ -841,12 +846,13 @@ group_run_queue_remove_thread(
        thread_t        thread,
        integer_t       thread_pri)
 {
+       circle_queue_t  queue = &rq->queues[thread_pri];
        boolean_t       result = FALSE;
 
        assert_thread_magic(thread);
        assert(thread->runq != PROCESSOR_NULL);
 
-       remqueue(&thread->runq_links);
+       circle_dequeue(queue, &thread->runq_links);
 
        SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
        rq->count--;
@@ -854,7 +860,7 @@ group_run_queue_remove_thread(
                rq->urgency--; assert(rq->urgency >= 0);
        }
 
-       if (queue_empty(&rq->queues[thread_pri])) {
+       if (circle_queue_empty(queue)) {
                /* update run queue status */
                rq_bitmap_clear(rq->bitmap, thread_pri);
                rq->highq = bitmap_first(rq->bitmap, NRQS);
@@ -1148,7 +1154,7 @@ static boolean_t
 sched_multiq_processor_enqueue(
        processor_t      processor,
        thread_t         thread,
-       integer_t        options)
+       sched_options_t  options)
 {
        boolean_t       result;
 
@@ -1415,7 +1421,7 @@ group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context)
            queue_index = bitmap_next(runq->bitmap, queue_index)) {
                sched_entry_t entry;
 
-               qe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
+               cqe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
                        assert(count > 0);
 
                        sched_group_t group = group_for_entry(entry);
index e5a3d2e2eac35616ee345d60d62e0a4e2ac3a951..c59175da0edf2cceab9a7d32bd5c4ce73b7f3cb5 100644 (file)
@@ -75,7 +75,7 @@
 #include <machine/machine_routines.h>
 #include <machine/sched_param.h>
 #include <machine/machine_cpu.h>
-#include <machine/machlimits.h>
+#include <machine/limits.h>
 #include <machine/atomic.h>
 
 #include <machine/commpage.h>
@@ -235,12 +235,6 @@ static void preempt_pri_init(void);
 
 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 
-#if CONFIG_SCHED_IDLE_IN_PLACE
-static thread_t thread_select_idle(
-       thread_t                        thread,
-       processor_t                     processor);
-#endif
-
 thread_t        processor_idle(
        thread_t                        thread,
        processor_t                     processor);
@@ -280,11 +274,9 @@ sched_vm_group_maintenance(void);
 
 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 int8_t          sched_load_shifts[NRQS];
-bitmap_t        sched_preempt_pri[BITMAP_LEN(NRQS)];
+bitmap_t        sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 
-const struct sched_dispatch_table *sched_current_dispatch = NULL;
-
 /*
  * Statically allocate a buffer to hold the longest possible
  * scheduler description string, as currently implemented.
@@ -309,67 +301,10 @@ uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
 /* Global flag which indicates whether Background Stepper Context is enabled */
 static int cpu_throttle_enabled = 1;
 
-#if DEBUG
-
-/* Since using the indirect function dispatch table has a negative impact on
- * context switch performance, only allow DEBUG kernels to use that mechanism.
- */
-static void
-sched_init_override(void)
-{
-       char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
-
-       /* Check for runtime selection of the scheduler algorithm */
-       if (!PE_parse_boot_argn("sched", sched_arg, sizeof(sched_arg))) {
-               sched_arg[0] = '\0';
-       }
-       if (strlen(sched_arg) > 0) {
-               if (0) {
-                       /* Allow pattern below */
-#if defined(CONFIG_SCHED_TRADITIONAL)
-               } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
-                       sched_current_dispatch = &sched_traditional_dispatch;
-               } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
-                       sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
-#endif
-#if defined(CONFIG_SCHED_MULTIQ)
-               } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
-                       sched_current_dispatch = &sched_multiq_dispatch;
-               } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
-                       sched_current_dispatch = &sched_dualq_dispatch;
-#endif
-               } else {
-#if defined(CONFIG_SCHED_TRADITIONAL)
-                       printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
-                       printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
-                       sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
-#else
-                       panic("Unrecognized scheduler algorithm: %s", sched_arg);
-#endif
-               }
-               kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
-       } else {
-#if   defined(CONFIG_SCHED_MULTIQ)
-               sched_current_dispatch = &sched_dualq_dispatch;
-#elif defined(CONFIG_SCHED_TRADITIONAL)
-               sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
-#else
-#error No default scheduler implementation
-#endif
-               kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
-       }
-}
-
-#endif /* DEBUG */
-
 void
 sched_init(void)
 {
-#if DEBUG
-       sched_init_override();
-#else /* DEBUG */
        kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
-#endif /* DEBUG */
 
        if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
                /* No boot-args, check in device tree */
@@ -505,7 +440,7 @@ pset_rt_init(processor_set_t pset)
 {
        rt_lock_init(pset);
 
-       pset->rt_runq.count = 0;
+       os_atomic_init(&pset->rt_runq.count, 0);
        queue_init(&pset->rt_runq.queue);
        memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
 }
@@ -734,28 +669,18 @@ thread_unblock(
                }
 
                /* Update the runnable thread count */
-               new_run_count = sched_run_incr(thread);
+               new_run_count = SCHED(run_count_incr)(thread);
        } else {
                /*
                 * Either the thread is idling in place on another processor,
                 * or it hasn't finished context switching yet.
                 */
-#if CONFIG_SCHED_IDLE_IN_PLACE
-               if (thread->state & TH_IDLE) {
-                       processor_t             processor = thread->last_processor;
-
-                       if (processor != current_processor()) {
-                               machine_signal_idle(processor);
-                       }
-               }
-#else
                assert((thread->state & TH_IDLE) == 0);
-#endif
                /*
                 * The run count is only dropped after the context switch completes
                 * and the thread is still waiting, so we should not run_incr here
                 */
-               new_run_count = sched_run_buckets[TH_BUCKET_RUN];
+               new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
        }
 
 
@@ -875,7 +800,7 @@ thread_go(
        if (thread_unblock(thread, wresult)) {
 #if     SCHED_TRACE_THREAD_WAKEUPS
                backtrace(&thread->thread_wakeup_bt[0],
-                   (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)));
+                   (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL);
 #endif
                thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
        }
@@ -901,6 +826,10 @@ thread_mark_wait_locked(
        boolean_t                       at_safe_point;
        wait_interrupt_t        interruptible = interruptible_orig;
 
+       if (thread->state & TH_IDLE) {
+               panic("Invalid attempt to wait while running the idle thread");
+       }
+
        assert(!(thread->state & (TH_WAIT | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));
 
        /*
@@ -2237,139 +2166,12 @@ idle:
                pset_unlock(pset);
 #endif
 
-#if CONFIG_SCHED_IDLE_IN_PLACE
-               /*
-                *      Choose idle thread if fast idle is not possible.
-                */
-               if (processor->processor_primary != processor) {
-                       return processor->idle_thread;
-               }
-
-               if ((thread->state & (TH_IDLE | TH_TERMINATE | TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES) {
-                       return processor->idle_thread;
-               }
-
-               /*
-                *      Perform idling activities directly without a
-                *      context switch.  Return dispatched thread,
-                *      else check again for a runnable thread.
-                */
-               new_thread = thread_select_idle(thread, processor);
-
-#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
-
-               /*
-                * Do a full context switch to idle so that the current
-                * thread can start running on another processor without
-                * waiting for the fast-idled processor to wake up.
-                */
                new_thread = processor->idle_thread;
-
-#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
        } while (new_thread == THREAD_NULL);
 
        return new_thread;
 }
 
-#if CONFIG_SCHED_IDLE_IN_PLACE
-/*
- *     thread_select_idle:
- *
- *     Idle the processor using the current thread context.
- *
- *     Called with thread locked, then dropped and relocked.
- */
-static thread_t
-thread_select_idle(
-       thread_t                thread,
-       processor_t             processor)
-{
-       thread_t                new_thread;
-       uint64_t                arg1, arg2;
-       int                     urgency;
-
-       sched_run_decr(thread);
-
-       thread->state |= TH_IDLE;
-       processor_state_update_idle(processor);
-
-       /* Reload precise timing global policy to thread-local policy */
-       thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
-
-       thread_unlock(thread);
-
-       /*
-        *      Switch execution timing to processor idle thread.
-        */
-       processor->last_dispatch = mach_absolute_time();
-
-#ifdef CONFIG_MACH_APPROXIMATE_TIME
-       commpage_update_mach_approximate_time(processor->last_dispatch);
-#endif
-
-       thread->last_run_time = processor->last_dispatch;
-       processor_timer_switch_thread(processor->last_dispatch,
-           &processor->idle_thread->system_timer);
-       PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
-
-
-       /*
-        *      Cancel the quantum timer while idling.
-        */
-       timer_call_quantum_timer_cancel(&processor->quantum_timer);
-       processor->first_timeslice = FALSE;
-
-       if (thread->sched_call) {
-               (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
-       }
-
-       thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
-
-       /*
-        *      Enable interrupts and perform idling activities.  No
-        *      preemption due to TH_IDLE being set.
-        */
-       spllo(); new_thread = processor_idle(thread, processor);
-
-       /*
-        *      Return at splsched.
-        */
-       if (thread->sched_call) {
-               (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
-       }
-
-       thread_lock(thread);
-
-       /*
-        *      If awakened, switch to thread timer and start a new quantum.
-        *      Otherwise skip; we will context switch to another thread or return here.
-        */
-       if (!(thread->state & TH_WAIT)) {
-               uint64_t time_now = processor->last_dispatch = mach_absolute_time();
-               processor_timer_switch_thread(time_now, &thread->system_timer);
-               timer_update(&thread->runnable_timer, time_now);
-               PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
-               thread_quantum_init(thread);
-               processor->quantum_end = time_now + thread->quantum_remaining;
-               timer_call_quantum_timer_enter(&processor->quantum_timer,
-                   thread, processor->quantum_end, time_now);
-               processor->first_timeslice = TRUE;
-
-               thread->computation_epoch = time_now;
-       }
-
-       thread->state &= ~TH_IDLE;
-
-       urgency = thread_get_urgency(thread, &arg1, &arg2);
-
-       thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
-
-       sched_run_incr(thread);
-
-       return new_thread;
-}
-#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
-
 /*
  * thread_invoke
  *
@@ -2468,6 +2270,7 @@ thread_invoke(
 
                        /*
                         * Context switch by performing a stack handoff.
+                        * Requires both threads to be parked in a continuation.
                         */
                        continuation = thread->continuation;
                        parameter = thread->parameter;
@@ -2521,6 +2324,12 @@ thread_invoke(
                        kperf_off_cpu(self);
 #endif /* KPERF */
 
+                       /*
+                        * This is where we actually switch thread identity,
+                        * and address space if required.  However, register
+                        * state is not switched - this routine leaves the
+                        * stack and register state active on the current CPU.
+                        */
                        TLOG(1, "thread_invoke: calling stack_handoff\n");
                        stack_handoff(self, thread);
 
@@ -2545,8 +2354,16 @@ thread_invoke(
 
                        counter(c_thread_invoke_hits++);
 
+                       boolean_t enable_interrupts = TRUE;
+
+                       /* idle thread needs to stay interrupts-disabled */
+                       if ((thread->state & TH_IDLE)) {
+                               enable_interrupts = FALSE;
+                       }
+
                        assert(continuation);
-                       call_continuation(continuation, parameter, thread->wait_result, TRUE);
+                       call_continuation(continuation, parameter,
+                           thread->wait_result, enable_interrupts);
                        /*NOTREACHED*/
                } else if (thread == self) {
                        /* same thread but with continuation */
@@ -2573,7 +2390,15 @@ thread_invoke(
 
                        self->continuation = self->parameter = NULL;
 
-                       call_continuation(continuation, parameter, self->wait_result, TRUE);
+                       boolean_t enable_interrupts = TRUE;
+
+                       /* idle thread needs to stay interrupts-disabled */
+                       if ((self->state & TH_IDLE)) {
+                               enable_interrupts = FALSE;
+                       }
+
+                       call_continuation(continuation, parameter,
+                           self->wait_result, enable_interrupts);
                        /*NOTREACHED*/
                }
        } else {
@@ -2669,30 +2494,33 @@ need_stack:
         * been stored on the stack or a non-volatile register, but a stale idea of
         * what was on the CPU is newly-accurate because that thread is again
         * running on the CPU.
+        *
+        * If one of the threads is using a continuation, thread_continue
+        * is used to stitch up its context.
+        *
+        * If we are invoking a thread which is resuming from a continuation,
+        * the CPU will invoke thread_continue next.
+        *
+        * If the current thread is parking in a continuation, then its state
+        * won't be saved and the stack will be discarded. When the stack is
+        * re-allocated, it will be configured to resume from thread_continue.
         */
        assert(continuation == self->continuation);
        thread = machine_switch_context(self, continuation, thread);
        assert(self == current_thread_volatile());
        TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
 
+       assert(continuation == NULL && self->continuation == NULL);
+
        DTRACE_SCHED(on__cpu);
 
 #if KPERF
        kperf_on_cpu(self, NULL, __builtin_frame_address(0));
 #endif /* KPERF */
 
-       /*
-        * We have been resumed and are set to run.
-        */
+       /* We have been resumed and are set to run. */
        thread_dispatch(thread, self);
 
-       if (continuation) {
-               self->continuation = self->parameter = NULL;
-
-               call_continuation(continuation, parameter, self->wait_result, TRUE);
-               /*NOTREACHED*/
-       }
-
        return TRUE;
 }
 
@@ -2716,7 +2544,7 @@ pset_cancel_deferred_dispatch(
        uint32_t                sampled_sched_run_count;
 
        pset_lock(pset);
-       sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN];
+       sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
 
        /*
         * If we have emptied the run queue, and our current thread is runnable, we
@@ -2768,7 +2596,6 @@ pset_cancel_deferred_dispatch(
                                 * reasonable facsimile of PROCESSOR_IDLE.
                                 */
 
-                               assert(active_processor->next_thread == THREAD_NULL);
                                processor_state_update_idle(active_processor);
                                active_processor->deadline = UINT64_MAX;
                                pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
@@ -2809,6 +2636,7 @@ thread_csw_callout(
  *      "self" is the new current thread that we have context switched to
  *
  *     Called at splsched.
+ *
  */
 void
 thread_dispatch(
@@ -3048,7 +2876,7 @@ thread_dispatch(
                                thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
                                thread->chosen_processor = PROCESSOR_NULL;
 
-                               new_run_count = sched_run_decr(thread);
+                               new_run_count = SCHED(run_count_decr)(thread);
 
 #if CONFIG_SCHED_SFI
                                if (thread->reason & AST_SFI) {
@@ -3153,8 +2981,7 @@ thread_dispatch(
         * TODO: Can we state that redispatching our old thread is also
         * uninteresting?
         */
-       if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) &&
-           !(self->state & TH_IDLE)) {
+       if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
                pset_cancel_deferred_dispatch(processor->processor_set, processor);
        }
 #endif
@@ -3291,6 +3118,9 @@ thread_run(
  *
  *     Called at splsched when a thread first receives
  *     a new stack after a continuation.
+ *
+ *     Called with THREAD_NULL as the old thread when
+ *     invoked by machine_load_context.
  */
 void
 thread_continue(
@@ -3305,6 +3135,8 @@ thread_continue(
        continuation = self->continuation;
        parameter = self->parameter;
 
+       assert(continuation != NULL);
+
 #if KPERF
        kperf_on_cpu(self, continuation, NULL);
 #endif
@@ -3320,7 +3152,13 @@ thread_continue(
 
        TLOG(1, "thread_continue: calling call_continuation\n");
 
-       boolean_t enable_interrupts = thread != THREAD_NULL;
+       boolean_t enable_interrupts = TRUE;
+
+       /* bootstrap thread, idle thread need to stay interrupts-disabled */
+       if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
+               enable_interrupts = FALSE;
+       }
+
        call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
        /*NOTREACHED*/
 }
@@ -3360,7 +3198,7 @@ run_queue_init(
        }
        rq->urgency = rq->count = 0;
        for (int i = 0; i < NRQS; i++) {
-               queue_init(&rq->queues[i]);
+               circle_queue_init(&rq->queues[i]);
        }
 }
 
@@ -3375,25 +3213,16 @@ run_queue_init(
  */
 thread_t
 run_queue_dequeue(
-       run_queue_t   rq,
-       integer_t     options)
+       run_queue_t     rq,
+       sched_options_t options)
 {
-       thread_t    thread;
-       queue_t     queue = &rq->queues[rq->highq];
-
-       if (options & SCHED_PEEK) {
-               if (options & SCHED_HEADQ) {
-                       thread = qe_queue_first(queue, struct thread, runq_links);
-               } else {
-                       thread = qe_queue_last(queue, struct thread, runq_links);
-               }
-               return thread;
-       }
+       thread_t        thread;
+       circle_queue_t  queue = &rq->queues[rq->highq];
 
        if (options & SCHED_HEADQ) {
-               thread = qe_dequeue_head(queue, struct thread, runq_links);
+               thread = cqe_dequeue_head(queue, struct thread, runq_links);
        } else {
-               thread = qe_dequeue_tail(queue, struct thread, runq_links);
+               thread = cqe_dequeue_tail(queue, struct thread, runq_links);
        }
 
        assert(thread != THREAD_NULL);
@@ -3405,7 +3234,7 @@ run_queue_dequeue(
        if (SCHED(priority_is_urgent)(rq->highq)) {
                rq->urgency--; assert(rq->urgency >= 0);
        }
-       if (queue_empty(queue)) {
+       if (circle_queue_empty(queue)) {
                bitmap_clear(rq->bitmap, rq->highq);
                rq->highq = bitmap_first(rq->bitmap, NRQS);
        }
@@ -3423,17 +3252,17 @@ run_queue_dequeue(
  */
 boolean_t
 run_queue_enqueue(
-       run_queue_t   rq,
-       thread_t      thread,
-       integer_t     options)
+       run_queue_t      rq,
+       thread_t         thread,
+       sched_options_t  options)
 {
-       queue_t     queue = &rq->queues[thread->sched_pri];
-       boolean_t   result = FALSE;
+       circle_queue_t  queue = &rq->queues[thread->sched_pri];
+       boolean_t       result = FALSE;
 
        assert_thread_magic(thread);
 
-       if (queue_empty(queue)) {
-               enqueue_tail(queue, &thread->runq_links);
+       if (circle_queue_empty(queue)) {
+               circle_enqueue_tail(queue, &thread->runq_links);
 
                rq_bitmap_set(rq->bitmap, thread->sched_pri);
                if (thread->sched_pri > rq->highq) {
@@ -3442,9 +3271,9 @@ run_queue_enqueue(
                }
        } else {
                if (options & SCHED_TAILQ) {
-                       enqueue_tail(queue, &thread->runq_links);
+                       circle_enqueue_tail(queue, &thread->runq_links);
                } else {
-                       enqueue_head(queue, &thread->runq_links);
+                       circle_enqueue_head(queue, &thread->runq_links);
                }
        }
        if (SCHED(priority_is_urgent)(thread->sched_pri)) {
@@ -3468,17 +3297,19 @@ run_queue_remove(
        run_queue_t    rq,
        thread_t       thread)
 {
+       circle_queue_t  queue = &rq->queues[thread->sched_pri];
+
        assert(thread->runq != PROCESSOR_NULL);
        assert_thread_magic(thread);
 
-       remqueue(&thread->runq_links);
+       circle_dequeue(queue, &thread->runq_links);
        SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
        rq->count--;
        if (SCHED(priority_is_urgent)(thread->sched_pri)) {
                rq->urgency--; assert(rq->urgency >= 0);
        }
 
-       if (queue_empty(&rq->queues[thread->sched_pri])) {
+       if (circle_queue_empty(queue)) {
                /* update run queue status */
                bitmap_clear(rq->bitmap, thread->sched_pri);
                rq->highq = bitmap_first(rq->bitmap, NRQS);
@@ -3487,6 +3318,28 @@ run_queue_remove(
        thread->runq = PROCESSOR_NULL;
 }
 
+/*
+ *      run_queue_peek
+ *
+ *      Peek at the runq and return the highest
+ *      priority thread from the runq.
+ *
+ *     The run queue must be locked.
+ */
+thread_t
+run_queue_peek(
+       run_queue_t    rq)
+{
+       if (rq->count > 0) {
+               circle_queue_t queue = &rq->queues[rq->highq];
+               thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
+               assert_thread_magic(thread);
+               return thread;
+       } else {
+               return THREAD_NULL;
+       }
+}
+
 /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
 void
 sched_rtglobal_runq_scan(sched_update_scan_context_t scan_context)
@@ -3585,22 +3438,6 @@ realtime_setrun(
        /* <rdar://problem/15102234> */
        assert(thread->bound_processor == PROCESSOR_NULL);
 
-       /*
-        *      Dispatch directly onto idle processor.
-        */
-       if ((thread->bound_processor == processor)
-           && processor->state == PROCESSOR_IDLE) {
-               processor->next_thread = thread;
-               processor_state_update_from_thread(processor, thread);
-               processor->deadline = thread->realtime.deadline;
-               pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
-
-               ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR);
-               pset_unlock(pset);
-               sched_ipi_perform(processor, ipi_type);
-               return;
-       }
-
        if (processor->current_pri < BASEPRI_RTQUEUES) {
                preempt = (AST_PREEMPT | AST_URGENT);
        } else if (thread->realtime.deadline < processor->deadline) {
@@ -3614,7 +3451,6 @@ realtime_setrun(
        ipi_type = SCHED_IPI_NONE;
        if (preempt != AST_NONE) {
                if (processor->state == PROCESSOR_IDLE) {
-                       processor->next_thread = THREAD_NULL;
                        processor_state_update_from_thread(processor, thread);
                        processor->deadline = thread->realtime.deadline;
                        pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
@@ -3624,7 +3460,7 @@ realtime_setrun(
                                ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_PREEMPT);
                        }
                } else if (processor->state == PROCESSOR_DISPATCHING) {
-                       if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
+                       if ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline)) {
                                processor_state_update_from_thread(processor, thread);
                                processor->deadline = thread->realtime.deadline;
                        }
@@ -3833,13 +3669,12 @@ processor_setrun(
 
        if (preempt != AST_NONE) {
                if (processor->state == PROCESSOR_IDLE) {
-                       processor->next_thread = THREAD_NULL;
                        processor_state_update_from_thread(processor, thread);
                        processor->deadline = UINT64_MAX;
                        pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
                        ipi_action = eExitIdle;
                } else if (processor->state == PROCESSOR_DISPATCHING) {
-                       if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
+                       if (processor->current_pri < thread->sched_pri) {
                                processor_state_update_from_thread(processor, thread);
                                processor->deadline = UINT64_MAX;
                        }
@@ -3857,7 +3692,6 @@ processor_setrun(
                    thread->sched_pri >= processor->current_pri) {
                        ipi_action = eInterruptRunning;
                } else if (processor->state == PROCESSOR_IDLE) {
-                       processor->next_thread = THREAD_NULL;
                        processor_state_update_from_thread(processor, thread);
                        processor->deadline = UINT64_MAX;
                        pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
@@ -4237,6 +4071,25 @@ choose_processor(
                }
 
                /*
+                * lc_processor is used to indicate the best processor set run queue
+                * on which to enqueue a thread when all available CPUs are busy with
+                * higher priority threads, so try to make sure it is initialized.
+                */
+               if (lc_processor == PROCESSOR_NULL) {
+                       cpumap_t available_map = ((pset->cpu_state_map[PROCESSOR_IDLE] |
+                           pset->cpu_state_map[PROCESSOR_RUNNING] |
+                           pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
+                           pset->recommended_bitmask);
+                       cpuid = lsb_first(available_map);
+                       if (cpuid >= 0) {
+                               lc_processor = processor_array[cpuid];
+                               lowest_count = SCHED(processor_runq_count)(lc_processor);
+                       }
+               }
+
+               /*
+                * Move onto the next processor set.
+                *
                 * If all primary processors in this pset are running a higher
                 * priority thread, move on to next pset. Only when we have
                 * exhausted the search for primary processors do we
@@ -4263,6 +4116,7 @@ choose_processor(
         * the secondary processor that would perturb the least priority
         * primary, or the least busy primary.
         */
+       boolean_t fallback_processor = false;
        do {
                /* lowest_priority is evaluated in the main loops above */
                if (lp_idle_secondary_processor != PROCESSOR_NULL) {
@@ -4276,14 +4130,15 @@ choose_processor(
                        lc_processor = PROCESSOR_NULL;
                } else {
                        /*
-                        * All processors are executing higher
-                        * priority threads, and the lowest_count
-                        * candidate was not usable, so we pick a processor
-                        * to give this thread somewhere to be enqueued.
+                        * All processors are executing higher priority threads, and
+                        * the lowest_count candidate was not usable.
                         *
-                        *  TODO: Need tracepoint or something to show when this happens
-                        *  TODO: Prefer a processor in the original pset
+                        * For AMP platforms running the clutch scheduler always
+                        * return a processor from the requested pset to allow the
+                        * thread to be enqueued in the correct runq. For non-AMP
+                        * platforms, simply return the master_processor.
                         */
+                       fallback_processor = true;
                        processor = master_processor;
                }
 
@@ -4299,12 +4154,16 @@ choose_processor(
 
                /*
                 * We must verify that the chosen processor is still available.
-                * master_processor is an exception, since we may need to preempt
-                * a running thread on it during processor shutdown (for sleep),
-                * and that thread needs to be enqueued on its runqueue to run
-                * when the processor is restarted.
+                * The cases where we pick the master_processor or the fallback
+                * processor are exceptions, since we may need to enqueue a thread
+                * on its runqueue if this is the last remaining processor
+                * during pset shutdown.
+                *
+                * <rdar://problem/47559304> would really help here since it
+                * gets rid of the weird last processor SHUTDOWN case where
+                * the pset is still schedulable.
                 */
-               if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)) {
+               if (processor != master_processor && (fallback_processor == false) && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)) {
                        processor = PROCESSOR_NULL;
                }
        } while (processor == PROCESSOR_NULL);
@@ -4325,7 +4184,7 @@ choose_processor(
 void
 thread_setrun(
        thread_t                        thread,
-       integer_t                       options)
+       sched_options_t                 options)
 {
        processor_t                     processor;
        processor_set_t         pset;
@@ -4656,6 +4515,18 @@ set_sched_pri(
 
        /* If we're already at this priority, no need to mess with the runqueue */
        if (new_priority == old_priority) {
+#if CONFIG_SCHED_CLUTCH
+               /* For the first thread in the system, the priority is correct but
+                * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
+                * scheduler relies on the bucket being set for all threads, update
+                * its bucket here.
+                */
+               if (thread->th_sched_bucket == TH_BUCKET_RUN) {
+                       assert(is_current_thread);
+                       SCHED(update_thread_bucket)(thread);
+               }
+#endif /* CONFIG_SCHED_CLUTCH */
+
                return;
        }
 
@@ -4668,6 +4539,16 @@ set_sched_pri(
 
        thread->sched_pri = new_priority;
 
+#if CONFIG_SCHED_CLUTCH
+       /*
+        * Since for the clutch scheduler, the thread's bucket determines its runq
+        * in the hierarchy it is important to update the bucket when the thread
+        * lock is held and the thread has been removed from the runq hierarchy.
+        */
+       SCHED(update_thread_bucket)(thread);
+
+#endif /* CONFIG_SCHED_CLUTCH */
+
        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
            (uintptr_t)thread_tid(thread),
            thread->base_pri,
@@ -4853,7 +4734,7 @@ thread_run_queue_remove(
  * thread locked, at splsched
  */
 void
-thread_run_queue_reinsert(thread_t thread, integer_t options)
+thread_run_queue_reinsert(thread_t thread, sched_options_t options)
 {
        assert(thread->runq == PROCESSOR_NULL);
        assert(thread->state & (TH_RUN));
@@ -5026,21 +4907,13 @@ processor_idle(
                        }
                }
 
-#if CONFIG_SCHED_IDLE_IN_PLACE
-               if (thread != THREAD_NULL) {
-                       /* Did idle-in-place thread wake up */
-                       if ((thread->state & (TH_WAIT | TH_SUSP)) != TH_WAIT || thread->wake_active) {
-                               break;
-                       }
-               }
-#endif
-
                IDLE_KERNEL_DEBUG_CONSTANT(
                        MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);
 
                machine_track_platform_idle(TRUE);
 
                machine_idle();
+               /* returns with interrupts enabled */
 
                machine_track_platform_idle(FALSE);
 
@@ -5077,8 +4950,6 @@ processor_idle(
 
        cpu_quiescent_counter_join(ctime);
 
-       assert(processor->next_thread == NULL);
-
        ast_t reason = AST_NONE;
 
        /* We're handling all scheduling AST's */
@@ -5105,20 +4976,42 @@ processor_idle(
  *     Each processor has a dedicated thread which
  *     executes the idle loop when there is no suitable
  *     previous context.
+ *
+ *     This continuation is entered with interrupts disabled.
  */
 void
-idle_thread(void)
+idle_thread(__assert_only void* parameter,
+    __unused wait_result_t result)
 {
-       processor_t             processor = current_processor();
-       thread_t                new_thread;
+       assert(ml_get_interrupts_enabled() == FALSE);
+       assert(parameter == NULL);
+
+       processor_t processor = current_processor();
+
+       /*
+        * Ensure that anything running in idle context triggers
+        * preemption-disabled checks.
+        */
+       disable_preemption();
+
+       /*
+        * Enable interrupts temporarily to handle any pending interrupts
+        * or IPIs before deciding to sleep.
+        */
+       spllo();
+
+       thread_t new_thread = processor_idle(THREAD_NULL, processor);
+       /* returns with interrupts disabled */
+
+       enable_preemption();
 
-       new_thread = processor_idle(THREAD_NULL, processor);
        if (new_thread != THREAD_NULL) {
-               thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
+               thread_run(processor->idle_thread,
+                   idle_thread, NULL, new_thread);
                /*NOTREACHED*/
        }
 
-       thread_block((thread_continue_t)idle_thread);
+       thread_block(idle_thread);
        /*NOTREACHED*/
 }
 
@@ -5131,7 +5024,7 @@ idle_thread_create(
        spl_t                   s;
        char                    name[MAXTHREADNAMESIZE];
 
-       result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
+       result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
        if (result != KERN_SUCCESS) {
                return result;
        }
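
The two hunks above convert idle_thread() from a bare void (void) entry point into a true thread continuation, which is why the (thread_continue_t) casts disappear from the thread_run(), thread_block(), and kernel_thread_create() call sites. A minimal user-space sketch of the continuation shape, assuming a simplified wait_result_t (the trampoline below is illustrative, not XNU's call_continuation()):

    #include <stdio.h>
    #include <stdlib.h>

    typedef int wait_result_t;          /* stand-in for the kernel type */
    typedef void (*thread_continue_t)(void *parameter, wait_result_t result);

    /*
     * A continuation never returns; it either blocks again or hands off.
     * Casting a void (void) function to thread_continue_t compiled, but the
     * callee silently ignored the parameter and wait-result arguments.
     */
    static _Noreturn void
    idle_continuation(void *parameter, wait_result_t result)
    {
        (void)parameter;
        printf("idle continuation entered, wait result = %d\n", result);
        exit(0);
    }

    /* Illustrative stand-in for thread_block()/call_continuation(). */
    static _Noreturn void
    run_continuation(thread_continue_t cont, void *parameter,
        wait_result_t wresult)
    {
        cont(parameter, wresult);
        abort();                        /* continuations must not return */
    }

    int
    main(void)
    {
        run_continuation(idle_continuation, NULL, 0);
    }
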
@@ -5345,31 +5238,35 @@ sched_timeshare_consider_maintenance(uint64_t ctime)
 
                uint64_t ndeadline = ctime + sched_tick_interval;
 
-               if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
+               if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
                        thread_wakeup((event_t)sched_timeshare_maintenance_continue);
                        sched_maintenance_wakeups++;
                }
        }
 
-       uint64_t load_compute_deadline = __c11_atomic_load(&sched_load_compute_deadline, memory_order_relaxed);
+#if !CONFIG_SCHED_CLUTCH
+       /*
+        * Only non-clutch schedulers use the global load calculation EWMA algorithm. For the
+        * clutch scheduler, the load is maintained at the thread group and bucket level.
+        */
+       uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);
 
        if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
                uint64_t new_deadline = 0;
-               if (__c11_atomic_compare_exchange_strong(&sched_load_compute_deadline, &load_compute_deadline, new_deadline,
-                   memory_order_relaxed, memory_order_relaxed)) {
+               if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
                        compute_sched_load();
                        new_deadline = ctime + sched_load_compute_interval_abs;
-                       __c11_atomic_store(&sched_load_compute_deadline, new_deadline, memory_order_relaxed);
+                       os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
                }
        }
+#endif /* !CONFIG_SCHED_CLUTCH */
 
 #if __arm64__
-       uint64_t perf_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, memory_order_relaxed);
+       uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);
 
        if (__improbable(perf_deadline && ctime >= perf_deadline)) {
                /* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
-               if (__c11_atomic_compare_exchange_strong(&sched_perfcontrol_callback_deadline, &perf_deadline, 0,
-                   memory_order_relaxed, memory_order_relaxed)) {
+               if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
                        machine_perfcontrol_deadline_passed(perf_deadline);
                }
        }
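
This hunk belongs to a file-wide migration from raw __c11_atomic_* builtins to the os_atomic_* wrappers in <machine/atomic.h>: the wrappers take a bare ordering token (relaxed, seq_cst), collapse a compare-exchange's success/failure orderings into that single token, and provide _wide variants for 64-bit values on 32-bit targets. A user-space approximation of the cmpxchg calling convention over C11 <stdatomic.h> (a sketch, not the kernel's definition; it uses GCC/Clang statement expressions, as XNU's own headers do):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Sketch: like os_atomic_cmpxchg, return true on success and do not
     * report the observed value back to the caller.
     */
    #define my_atomic_cmpxchg(p, expected, desired, ord)                  \
        ({ __typeof__(expected) _e = (expected);                          \
           atomic_compare_exchange_strong_explicit((p), &_e, (desired),   \
               memory_order_##ord, memory_order_relaxed); })

    static _Atomic uint64_t deadline = 100;

    int
    main(void)
    {
        uint64_t seen = atomic_load_explicit(&deadline, memory_order_relaxed);

        /* Claim the deadline by zeroing it; only one racer can win. */
        if (my_atomic_cmpxchg(&deadline, seen, (uint64_t)0, relaxed)) {
            printf("won the race, old deadline was %llu\n",
                (unsigned long long)seen);
        }
        return 0;
    }
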
@@ -5477,9 +5374,9 @@ runq_scan(
            queue_index >= 0;
            queue_index = bitmap_next(runq->bitmap, queue_index)) {
                thread_t thread;
-               queue_t  queue = &runq->queues[queue_index];
+               circle_queue_t queue = &runq->queues[queue_index];
 
-               qe_foreach_element(thread, queue, runq_links) {
+               cqe_foreach_element(thread, queue, runq_links) {
                        assert(count > 0);
                        assert_thread_magic(thread);
 
@@ -5983,6 +5880,8 @@ sched_update_recommended_cores(uint64_t recommended_cores)
                bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
        }
 
+       boolean_t pset_newly_recommended = false;
+
        /* First set recommended cores */
        pset_lock(pset);
        avail_count = 0;
@@ -5991,11 +5890,15 @@ sched_update_recommended_cores(uint64_t recommended_cores)
                if (nset != pset) {
                        pset_unlock(pset);
                        pset = nset;
+                       pset_newly_recommended = false;
                        pset_lock(pset);
                }
 
                if (bit_test(recommended_cores, processor->cpu_id)) {
                        processor->is_recommended = TRUE;
+                       if (bit_first(pset->recommended_bitmask) == -1) {
+                               pset_newly_recommended = true;
+                       }
                        bit_set(pset->recommended_bitmask, processor->cpu_id);
 
                        if (processor->state == PROCESSOR_IDLE) {
@@ -6006,6 +5909,9 @@ sched_update_recommended_cores(uint64_t recommended_cores)
                        if (processor->state != PROCESSOR_OFF_LINE) {
                                avail_count++;
                        }
+                       if (pset_newly_recommended) {
+                               SCHED(pset_made_schedulable)(processor, pset, false);
+                       }
                }
        } while ((processor = processor->processor_list) != NULL);
        pset_unlock(pset);
@@ -6103,13 +6009,6 @@ sched_qos_max_parallelism(__unused int qos, uint64_t options)
            (host_info_t)&hinfo, &count);
        assert(kret == KERN_SUCCESS);
 
-       /* We would not want multiple realtime threads running on the
-        * same physical core; even for SMT capable machines.
-        */
-       if (options & QOS_PARALLELISM_REALTIME) {
-               return hinfo.physical_cpu;
-       }
-
        if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
                return hinfo.logical_cpu;
        } else {
@@ -6149,20 +6048,8 @@ sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
         * then I cancelled the callback, otherwise I didn't
         */
 
-       uint64_t old_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline,
-           memory_order_relaxed);
-
-
-       while (!__c11_atomic_compare_exchange_weak(&sched_perfcontrol_callback_deadline,
-           &old_deadline, new_deadline,
-           memory_order_relaxed, memory_order_relaxed)) {
-               ;
-       }
-
-
-       /* now old_deadline contains previous value, which might not be the same if it raced */
-
-       return (old_deadline != 0) ? TRUE : FALSE;
+       return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
+                  relaxed) != 0;
 }
 
 #endif /* __arm64__ */
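
The loop deleted above spun on a weak compare-exchange solely to learn the previous value; because the store is unconditional, a single atomic exchange is equivalent, shorter, and immune to spurious CAS failure. The same arm/cancel semantics in portable C11 (names are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint64_t callback_deadline;

    /* Returns true if a previously armed deadline was replaced or cancelled. */
    static bool
    update_callback_deadline(uint64_t new_deadline)
    {
        uint64_t old = atomic_exchange_explicit(&callback_deadline,
            new_deadline, memory_order_relaxed);
        return old != 0;
    }

    int
    main(void)
    {
        atomic_store(&callback_deadline, 42);
        printf("pending cancelled? %d\n", update_callback_deadline(0)); /* 1 */
        printf("pending cancelled? %d\n", update_callback_deadline(0)); /* 0 */
        return 0;
    }
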
@@ -6170,7 +6057,13 @@ sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
 void
 sched_update_pset_load_average(processor_set_t pset)
 {
-       int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
+#if CONFIG_SCHED_CLUTCH
+       int non_rt_load = sched_clutch_root_count(&pset->pset_clutch_root);
+#else /* CONFIG_SCHED_CLUTCH */
+       int non_rt_load = pset->pset_runq.count;
+#endif /* CONFIG_SCHED_CLUTCH */
+
+       int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
        int new_load_average = (pset->load_average + load) >> 1;
 
        pset->load_average = new_load_average;
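
The pset load average is a fixed-point exponentially weighted moving average with alpha = 1/2: the instantaneous count of running plus runnable threads is scaled up by PSET_LOAD_NUMERATOR_SHIFT, then folded into the previous average with one add and one right shift. A tiny demonstration of the recurrence (the shift value here is illustrative):

    #include <stdio.h>

    #define LOAD_NUMERATOR_SHIFT 16     /* illustrative fixed-point scale */

    int
    main(void)
    {
        int load_average = 0;
        int sample = 3 << LOAD_NUMERATOR_SHIFT;   /* constant load of 3 */

        /*
         * new = (old + sample) / 2: every tick halves the distance to the
         * sample, so a steady load converges to it within rounding error.
         */
        for (int tick = 0; tick < 20; tick++) {
            load_average = (load_average + sample) >> 1;
        }
        printf("average = %d, sample = %d\n", load_average, sample);
        return 0;
    }
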
@@ -6316,6 +6209,14 @@ sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor)
        return ok_to_run_realtime_thread;
 }
 
+void
+sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
+{
+       if (drop_lock) {
+               pset_unlock(pset);
+       }
+}
+
 void
 thread_set_no_smt(bool set)
 {
@@ -6365,4 +6266,4 @@ sysctl_task_get_no_smt(void)
        }
        return '0';
 }
-#endif
+#endif /* DEVELOPMENT || DEBUG */
index 12776a617eebdc64faa3119e878cd57dd80ab0c7..2f806bdd0ce1c01b4cc78a2431efa3bc47548f4c 100644 (file)
@@ -78,6 +78,7 @@
 #ifdef  MACH_KERNEL_PRIVATE
 
 #include <kern/sched_urgency.h>
+#include <kern/thread_group.h>
 
 /* Initialization */
 extern void             sched_init(void);
@@ -140,24 +141,23 @@ extern int                      thread_run(
        thread_t                        new_thread);
 
 /* Resume thread with new stack */
-extern void                     thread_continue(
-       thread_t                old_thread);
+extern __dead2 void     thread_continue(thread_t old_thread);
 
 /* Invoke continuation */
-extern void             call_continuation(
+extern __dead2 void     call_continuation(
        thread_continue_t       continuation,
-       void                            *parameter,
+       void                    *parameter,
        wait_result_t           wresult,
-       boolean_t           enable_interrupts);
+       boolean_t               enable_interrupts);
 
 /*
  * Flags that can be passed to set_sched_pri
  * to skip side effects
  */
-typedef enum {
+__options_decl(set_sched_pri_options_t, uint32_t, {
        SETPRI_DEFAULT  = 0x0,
        SETPRI_LAZY     = 0x1,  /* Avoid setting AST flags or sending IPIs */
-} set_sched_pri_options_t;
+});
 
 /* Set the current scheduled priority */
 extern void set_sched_pri(
@@ -170,6 +170,12 @@ extern void             sched_set_thread_base_priority(
        thread_t                thread,
        int                             priority);
 
+/* Set absolute base priority of the specified thread */
+extern void             sched_set_kernel_thread_priority(
+       thread_t                thread,
+       int                             priority);
+
+
 /* Set the thread's true scheduling mode */
 extern void             sched_set_thread_mode(thread_t thread,
     sched_mode_t mode);
@@ -180,12 +186,6 @@ extern void             sched_thread_mode_demote(thread_t thread,
 extern void             sched_thread_mode_undemote(thread_t thread,
     uint32_t reason);
 
-extern void sched_thread_promote_to_pri(thread_t thread, int priority, uintptr_t trace_obj);
-extern void sched_thread_update_promotion_to_pri(thread_t thread, int priority, uintptr_t trace_obj);
-extern void sched_thread_unpromote(thread_t thread, uintptr_t trace_obj);
-
-extern void assert_promotions_invariant(thread_t thread);
-
 extern void sched_thread_promote_reason(thread_t thread, uint32_t reason, uintptr_t trace_obj);
 extern void sched_thread_unpromote_reason(thread_t thread, uint32_t reason, uintptr_t trace_obj);
 
@@ -212,8 +212,10 @@ extern void             lightweight_update_priority(
 
 extern void             sched_default_quantum_expire(thread_t thread);
 
-/* Idle processor thread */
-extern void             idle_thread(void);
+/* Idle processor thread continuation */
+extern void             idle_thread(
+       void*           parameter,
+       wait_result_t   result);
 
 extern kern_return_t    idle_thread_create(
        processor_t             processor);
@@ -228,19 +230,18 @@ extern wait_result_t    thread_block_reason(
        void                            *parameter,
        ast_t                           reason);
 
-/* Reschedule thread for execution */
-extern void             thread_setrun(
-       thread_t        thread,
-       integer_t       options);
-
-typedef enum {
+__options_decl(sched_options_t, uint32_t, {
        SCHED_NONE      = 0x0,
        SCHED_TAILQ     = 0x1,
        SCHED_HEADQ     = 0x2,
        SCHED_PREEMPT   = 0x4,
        SCHED_REBALANCE = 0x8,
-       SCHED_PEEK      = 0x10,
-} sched_options_t;
+});
+
+/* Reschedule thread for execution */
+extern void             thread_setrun(
+       thread_t        thread,
+       sched_options_t options);
 
 extern processor_set_t  task_choose_pset(
        task_t                  task);
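
thread_setrun() and the run-queue helpers now take the self-describing sched_options_t instead of a bare integer_t, and the flag set moves from a plain typedef enum to __options_decl, which pins the underlying type (uint32_t here) so the width and signedness stay stable across compilers and in C++ builds. A rough user-space equivalent of what such a macro can expand to (a simplification, not the actual <os/base.h> definition):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Simplified stand-in for __options_decl(name, type, {...}): a
     * fixed-width typedef plus an enum supplying the flag constants.
     */
    #define options_decl(name, underlying, ...) \
        typedef underlying name;                \
        enum __VA_ARGS__

    options_decl(sched_options_t, uint32_t, {
        SCHED_NONE      = 0x0,
        SCHED_TAILQ     = 0x1,
        SCHED_HEADQ     = 0x2,
        SCHED_PREEMPT   = 0x4,
        SCHED_REBALANCE = 0x8,
    });

    int
    main(void)
    {
        sched_options_t options = SCHED_TAILQ | SCHED_PREEMPT;

        printf("width = %zu bytes, preempt = %d\n",
            sizeof(options), !!(options & SCHED_PREEMPT));
        return 0;
    }
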
@@ -267,17 +268,20 @@ extern void             run_queue_init(
 
 extern thread_t run_queue_dequeue(
        run_queue_t           runq,
-       integer_t             options);
+       sched_options_t       options);
 
 extern boolean_t        run_queue_enqueue(
        run_queue_t           runq,
-       thread_t                      thread,
-       integer_t             options);
+       thread_t              thread,
+       sched_options_t       options);
 
 extern void     run_queue_remove(
        run_queue_t            runq,
        thread_t                       thread);
 
+extern thread_t run_queue_peek(
+       run_queue_t            runq);
+
 struct sched_update_scan_context {
        uint64_t        earliest_bg_make_runnable_time;
        uint64_t        earliest_normal_make_runnable_time;
@@ -287,6 +291,11 @@ typedef struct sched_update_scan_context *sched_update_scan_context_t;
 
 extern void             sched_rtglobal_runq_scan(sched_update_scan_context_t scan_context);
 
+extern void sched_pset_made_schedulable(
+       processor_t processor,
+       processor_set_t pset,
+       boolean_t drop_lock);
+
 /*
  * Enum to define various events which need IPIs. The IPI policy
  * engine decides what kind of IPI to use based on destination
@@ -351,7 +360,7 @@ extern boolean_t        thread_run_queue_remove(thread_t thread);
 thread_t thread_run_queue_remove_for_handoff(thread_t thread);
 
 /* Put a thread back in the run queue after being yanked */
-extern void thread_run_queue_reinsert(thread_t thread, integer_t options);
+extern void thread_run_queue_reinsert(thread_t thread, sched_options_t options);
 
 extern void             thread_timer_expire(
        void                    *thread,
@@ -469,8 +478,6 @@ extern void             thread_exception_return(void) __dead2;
 /* String declaring the name of the current scheduler */
 extern char sched_string[SCHED_STRING_MAX_LENGTH];
 
-extern thread_t port_name_to_thread_for_ulock(mach_port_name_t  thread_name);
-
 /* Attempt to context switch to a specific runnable thread */
 extern wait_result_t thread_handoff_deallocate(thread_t thread);
 
@@ -572,22 +579,19 @@ extern boolean_t preemption_enabled(void);
  * a function pointer table.
  */
 
-#if   !defined(CONFIG_SCHED_TRADITIONAL) && !defined(CONFIG_SCHED_PROTO) && !defined(CONFIG_SCHED_GRRR) && !defined(CONFIG_SCHED_MULTIQ)
+#if   !defined(CONFIG_SCHED_TRADITIONAL) && !defined(CONFIG_SCHED_PROTO) && !defined(CONFIG_SCHED_GRRR) && !defined(CONFIG_SCHED_MULTIQ) && !defined(CONFIG_SCHED_CLUTCH)
 #error Enable at least one scheduler algorithm in osfmk/conf/MASTER.XXX
 #endif
 
-#if DEBUG
-#define SCHED(f) (sched_current_dispatch->f)
-#else /* DEBUG */
 
-/*
- * For DEV & REL kernels, use a static dispatch table instead of
- * using the indirect function table.
- */
+#if CONFIG_SCHED_CLUTCH
+extern const struct sched_dispatch_table sched_clutch_dispatch;
+#define SCHED(f) (sched_clutch_dispatch.f)
+#else /* CONFIG_SCHED_CLUTCH */
 extern const struct sched_dispatch_table sched_dualq_dispatch;
 #define SCHED(f) (sched_dualq_dispatch.f)
+#endif /* CONFIG_SCHED_CLUTCH */
 
-#endif /* DEBUG */
 
 struct sched_dispatch_table {
        const char *sched_name;
@@ -636,7 +640,7 @@ struct sched_dispatch_table {
        boolean_t (*processor_enqueue)(
                processor_t                    processor,
                thread_t                       thread,
-               integer_t                      options);
+               sched_options_t                options);
 
        /* Migrate threads away in preparation for processor shutdown */
        void (*processor_queue_shutdown)(
@@ -713,13 +717,6 @@ struct sched_dispatch_table {
 
        void            (*thread_update_scan)(sched_update_scan_context_t scan_context);
 
-       /*
-        * Use processor->next_thread to pin a thread to an idle
-        * processor. If FALSE, threads are enqueued and can
-        * be stolen by other processors.
-        */
-       boolean_t   direct_dispatch_to_idle_processors;
-
        /* Supports more than one pset */
        boolean_t   multiple_psets_enabled;
        /* Supports scheduler groups */
@@ -747,6 +744,16 @@ struct sched_dispatch_table {
        void    (*check_spill)(processor_set_t pset, thread_t thread);
        sched_ipi_type_t (*ipi_policy)(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
        bool    (*thread_should_yield)(processor_t processor, thread_t thread);
+
+       /* Routine to update run counts */
+       uint32_t (*run_count_incr)(thread_t thread);
+       uint32_t (*run_count_decr)(thread_t thread);
+
+       /* Routine to update scheduling bucket for a thread */
+       void (*update_thread_bucket)(thread_t thread);
+
+       /* Routine to inform the scheduler when a new pset becomes schedulable */
+       void (*pset_made_schedulable)(processor_t processor, processor_set_t pset, boolean_t drop_lock);
 };
 
 #if defined(CONFIG_SCHED_TRADITIONAL)
@@ -767,11 +774,9 @@ extern const struct sched_dispatch_table sched_proto_dispatch;
 extern const struct sched_dispatch_table sched_grrr_dispatch;
 #endif
 
-/*
- * It is an error to invoke any scheduler-related code
- * before this is set up
- */
-extern const struct sched_dispatch_table *sched_current_dispatch;
+#if defined(CONFIG_SCHED_CLUTCH)
+extern const struct sched_dispatch_table sched_clutch_dispatch;
+#endif
 
 #endif  /* MACH_KERNEL_PRIVATE */
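
With sched_current_dispatch gone and the DEBUG-only indirection removed, every SCHED(f) call in the kernel resolves through one const dispatch table chosen by the preprocessor, so the table can live in read-only memory and the compiler can see the exact callee. A compact model of the pattern, with the table trimmed to a sketch:

    #include <stdio.h>

    struct sched_dispatch_table {
        const char *sched_name;
        int (*processor_runq_count)(int cpu_id);
    };

    static int
    dualq_runq_count(int cpu_id)
    {
        return cpu_id ? 2 : 0;      /* canned answer for the demo */
    }

    /* One const table per scheduler; the build selects exactly one. */
    static const struct sched_dispatch_table sched_dualq_dispatch = {
        .sched_name           = "dualq",
        .processor_runq_count = dualq_runq_count,
    };

    #define CONFIG_SCHED_CLUTCH 0
    #if CONFIG_SCHED_CLUTCH
    #define SCHED(f) (sched_clutch_dispatch.f)
    #else
    #define SCHED(f) (sched_dualq_dispatch.f)
    #endif

    int
    main(void)
    {
        /*
         * The table is a compile-time constant, so this call can be
         * devirtualized, unlike a load through a mutable pointer such as
         * the old sched_current_dispatch.
         */
        printf("%s: %d\n", SCHED(sched_name), SCHED(processor_runq_count)(1));
        return 0;
    }
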
 
index f1297189ddd4ccf9079158ba1fd3f1a02f6ddc40..ce5d226f15ea75577a57334bfb2665893c46d5d7 100644 (file)
@@ -97,7 +97,7 @@ static boolean_t
 sched_proto_processor_enqueue(
        processor_t                    processor,
        thread_t                       thread,
-       integer_t                      options);
+       sched_options_t                options);
 
 static void
 sched_proto_processor_queue_shutdown(
@@ -182,7 +182,6 @@ const struct sched_dispatch_table sched_proto_dispatch = {
        .processor_runq_stats_count_sum                 = sched_proto_processor_runq_stats_count_sum,
        .processor_bound_count                          = sched_proto_processor_bound_count,
        .thread_update_scan                             = sched_proto_thread_update_scan,
-       .direct_dispatch_to_idle_processors             = TRUE,
        .multiple_psets_enabled                         = TRUE,
        .sched_groups_enabled                           = FALSE,
        .avoid_processor_enabled                        = FALSE,
@@ -199,6 +198,10 @@ const struct sched_dispatch_table sched_proto_dispatch = {
        .check_spill                                    = sched_check_spill,
        .ipi_policy                                     = sched_ipi_policy,
        .thread_should_yield                            = sched_thread_should_yield,
+       .run_count_incr                                 = sched_run_incr,
+       .run_count_decr                                 = sched_run_decr,
+       .update_thread_bucket                           = sched_update_thread_bucket,
+       .pset_made_schedulable                          = sched_pset_made_schedulable,
 };
 
 static struct run_queue *global_runq;
@@ -307,7 +310,7 @@ sched_proto_choose_thread(processor_t           processor,
     ast_t                         reason __unused)
 {
        run_queue_t             rq = global_runq;
-       queue_t                 queue;
+       circle_queue_t          queue;
        int                             pri, count;
        thread_t                thread;
 
@@ -329,18 +332,17 @@ sched_proto_choose_thread(processor_t           processor,
         */
 
        while (count > 0 && pri >= priority) {
-               thread = (thread_t)queue_first(queue);
-               while (!queue_end(queue, (queue_entry_t)thread)) {
+               cqe_foreach_element_safe(thread, queue, runq_links) {
                        if ((thread->bound_processor == PROCESSOR_NULL ||
                            thread->bound_processor == processor) &&
                            runqueue_generation != thread->runqueue_generation) {
-                               remqueue((queue_entry_t)thread);
+                               circle_dequeue(queue, &thread->runq_links);
 
                                thread->runq = PROCESSOR_NULL;
                                thread->runqueue_generation = runqueue_generation;
                                SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
                                rq->count--;
-                               if (queue_empty(queue)) {
+                               if (circle_queue_empty(queue)) {
                                        bitmap_clear(rq->bitmap, pri);
                                        rq->highq = bitmap_first(rq->bitmap, NRQS);
                                }
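
This hunk, and the matching ones in the traditional scheduler further down, replace open-coded queue_first()/queue_next() walks over queue_t with circle queues and cqe_foreach_element_safe(), whose _safe form captures the next link before the loop body runs so the current element can be dequeued mid-scan. A self-contained sketch of the mechanism, assuming simplified queue types (XNU's real macros live in kern/circle_queue.h):

    #include <stddef.h>
    #include <stdio.h>

    struct cqe { struct cqe *next, *prev; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* "Safe" iteration: save next before the body may unlink the element. */
    #define cqe_foreach_safe(e, head)                        \
        for (struct cqe *_n, *e = (head)->next;              \
            e != (head) && (_n = e->next, 1); e = _n)

    static void
    cqe_init(struct cqe *h)
    {
        h->next = h->prev = h;
    }

    static void
    cqe_enqueue_tail(struct cqe *h, struct cqe *e)
    {
        e->prev = h->prev; e->next = h;
        h->prev->next = e; h->prev = e;
    }

    static void
    cqe_dequeue(struct cqe *e)
    {
        e->prev->next = e->next; e->next->prev = e->prev;
        e->next = e->prev = e;
    }

    struct thread_stub { int pri; struct cqe runq_links; };

    int
    main(void)
    {
        struct cqe runq;
        struct thread_stub t[3] = {{ .pri = 31 }, { .pri = 4 }, { .pri = 31 }};

        cqe_init(&runq);
        for (int i = 0; i < 3; i++) {
            cqe_enqueue_tail(&runq, &t[i].runq_links);
        }

        cqe_foreach_safe(e, &runq) {
            struct thread_stub *th =
                container_of(e, struct thread_stub, runq_links);
            if (th->pri == 31) {
                cqe_dequeue(e);         /* safe: next was already saved */
            }
        }

        cqe_foreach_safe(e, &runq) {
            printf("still queued: pri %d\n",
                container_of(e, struct thread_stub, runq_links)->pri);
        }
        return 0;
    }
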
@@ -401,7 +403,7 @@ static boolean_t
 sched_proto_processor_enqueue(
        processor_t                    processor __unused,
        thread_t                       thread,
-       integer_t                      options)
+       sched_options_t                options)
 {
        run_queue_t             rq = global_runq;
        boolean_t               result;
@@ -439,20 +441,7 @@ sched_proto_processor_queue_remove(
                 *      Thread is on a run queue and we have a lock on
                 *      that run queue.
                 */
-               remqueue((queue_entry_t)thread);
-               SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
-               rq->count--;
-               if (SCHED(priority_is_urgent)(thread->sched_pri)) {
-                       rq->urgency--; assert(rq->urgency >= 0);
-               }
-
-               if (queue_empty(rq->queues + thread->sched_pri)) {
-                       /* update run queue status */
-                       bitmap_clear(rq->bitmap, thread->sched_pri);
-                       rq->highq = bitmap_first(rq->bitmap, NRQS);
-               }
-
-               thread->runq = PROCESSOR_NULL;
+               run_queue_remove(rq, thread);
        } else {
                /*
                 *      The thread left the run queue before we could
index e91504583c7547ee6022d600067b7d8d50af551f..3297904d074f9774075f3f55105158e40b26ce05 100644 (file)
@@ -81,7 +81,8 @@ static void
 sched_traditional_processor_queue_shutdown(processor_t processor);
 
 static boolean_t
-sched_traditional_processor_enqueue(processor_t processor, thread_t thread, integer_t options);
+sched_traditional_processor_enqueue(processor_t processor, thread_t thread,
+    sched_options_t options);
 
 static boolean_t
 sched_traditional_processor_queue_remove(processor_t processor, thread_t thread);
@@ -160,7 +161,6 @@ const struct sched_dispatch_table sched_traditional_dispatch = {
        .processor_runq_stats_count_sum                 = sched_traditional_processor_runq_stats_count_sum,
        .processor_bound_count                          = sched_traditional_processor_bound_count,
        .thread_update_scan                             = sched_traditional_thread_update_scan,
-       .direct_dispatch_to_idle_processors             = TRUE,
        .multiple_psets_enabled                         = TRUE,
        .sched_groups_enabled                           = FALSE,
        .avoid_processor_enabled                        = FALSE,
@@ -177,6 +177,10 @@ const struct sched_dispatch_table sched_traditional_dispatch = {
        .check_spill                                    = sched_check_spill,
        .ipi_policy                                     = sched_ipi_policy,
        .thread_should_yield                            = sched_thread_should_yield,
+       .run_count_incr                                 = sched_run_incr,
+       .run_count_decr                                 = sched_run_decr,
+       .update_thread_bucket                           = sched_update_thread_bucket,
+       .pset_made_schedulable                          = sched_pset_made_schedulable,
 };
 
 const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch = {
@@ -208,7 +212,6 @@ const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch
        .processor_runq_stats_count_sum                 = sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum,
        .processor_bound_count                          = sched_traditional_processor_bound_count,
        .thread_update_scan                             = sched_traditional_thread_update_scan,
-       .direct_dispatch_to_idle_processors             = FALSE,
        .multiple_psets_enabled                         = TRUE,
        .sched_groups_enabled                           = FALSE,
        .avoid_processor_enabled                        = FALSE,
@@ -225,6 +228,10 @@ const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch
        .check_spill                                    = sched_check_spill,
        .ipi_policy                                     = sched_ipi_policy,
        .thread_should_yield                            = sched_thread_should_yield,
+       .run_count_incr                                 = sched_run_incr,
+       .run_count_decr                                 = sched_run_decr,
+       .update_thread_bucket                           = sched_update_thread_bucket,
+       .pset_made_schedulable                          = sched_pset_made_schedulable,
 };
 
 static void
@@ -337,17 +344,16 @@ sched_traditional_choose_thread_from_runq(
        run_queue_t     rq,
        int             priority)
 {
-       queue_t         queue   = rq->queues + rq->highq;
+       circle_queue_t  queue   = rq->queues + rq->highq;
        int             pri     = rq->highq;
        int             count   = rq->count;
        thread_t        thread;
 
        while (count > 0 && pri >= priority) {
-               thread = (thread_t)(uintptr_t)queue_first(queue);
-               while (!queue_end(queue, (queue_entry_t)thread)) {
+               cqe_foreach_element_safe(thread, queue, runq_links) {
                        if (thread->bound_processor == PROCESSOR_NULL ||
                            thread->bound_processor == processor) {
-                               remqueue((queue_entry_t)thread);
+                               circle_dequeue(queue, &thread->runq_links);
 
                                thread->runq = PROCESSOR_NULL;
                                SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
@@ -355,16 +361,13 @@ sched_traditional_choose_thread_from_runq(
                                if (SCHED(priority_is_urgent)(pri)) {
                                        rq->urgency--; assert(rq->urgency >= 0);
                                }
-                               if (queue_empty(queue)) {
+                               if (circle_queue_empty(queue)) {
                                        bitmap_clear(rq->bitmap, pri);
                                        rq->highq = bitmap_first(rq->bitmap, NRQS);
                                }
-
                                return thread;
                        }
                        count--;
-
-                       thread = (thread_t)(uintptr_t)queue_next((queue_entry_t)thread);
                }
 
                queue--; pri--;
@@ -397,8 +400,8 @@ sched_traditional_initial_thread_sched_mode(task_t parent_task)
  */
 static boolean_t
 sched_traditional_processor_enqueue(processor_t   processor,
-    thread_t      thread,
-    integer_t     options)
+    thread_t        thread,
+    sched_options_t options)
 {
        run_queue_t     rq = runq_for_processor(processor);
        boolean_t       result;
@@ -521,21 +524,18 @@ sched_traditional_processor_queue_shutdown(processor_t processor)
 {
        processor_set_t         pset    = processor->processor_set;
        run_queue_t             rq      = runq_for_processor(processor);
-       queue_t                 queue   = rq->queues + rq->highq;
+       circle_queue_t          queue   = rq->queues + rq->highq;
        int                     pri     = rq->highq;
        int                     count   = rq->count;
-       thread_t                next, thread;
-       queue_head_t            tqueue;
+       thread_t                thread;
+       circle_queue_head_t     tqueue;
 
-       queue_init(&tqueue);
+       circle_queue_init(&tqueue);
 
        while (count > 0) {
-               thread = (thread_t)(uintptr_t)queue_first(queue);
-               while (!queue_end(queue, (queue_entry_t)thread)) {
-                       next = (thread_t)(uintptr_t)queue_next((queue_entry_t)thread);
-
+               cqe_foreach_element_safe(thread, queue, runq_links) {
                        if (thread->bound_processor == PROCESSOR_NULL) {
-                               remqueue((queue_entry_t)thread);
+                               circle_dequeue(queue, &thread->runq_links);
 
                                thread->runq = PROCESSOR_NULL;
                                SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
@@ -544,16 +544,14 @@ sched_traditional_processor_queue_shutdown(processor_t processor)
                                if (SCHED(priority_is_urgent)(pri)) {
                                        rq->urgency--; assert(rq->urgency >= 0);
                                }
-                               if (queue_empty(queue)) {
+                               if (circle_queue_empty(queue)) {
                                        bitmap_clear(rq->bitmap, pri);
                                        rq->highq = bitmap_first(rq->bitmap, NRQS);
                                }
 
-                               enqueue_tail(&tqueue, (queue_entry_t)thread);
+                               circle_enqueue_tail(&tqueue, &thread->runq_links);
                        }
                        count--;
-
-                       thread = next;
                }
 
                queue--; pri--;
@@ -561,7 +559,7 @@ sched_traditional_processor_queue_shutdown(processor_t processor)
 
        pset_unlock(pset);
 
-       while ((thread = (thread_t)(uintptr_t)dequeue_head(&tqueue)) != THREAD_NULL) {
+       while ((thread = cqe_dequeue_head(&tqueue, struct thread, runq_links)) != THREAD_NULL) {
                thread_lock(thread);
 
                thread_setrun(thread, SCHED_TAILQ);
@@ -652,16 +650,15 @@ static thread_t
 sched_traditional_steal_processor_thread(processor_t processor)
 {
        run_queue_t     rq      = runq_for_processor(processor);
-       queue_t         queue   = rq->queues + rq->highq;
+       circle_queue_t  queue   = rq->queues + rq->highq;
        int             pri     = rq->highq;
        int             count   = rq->count;
        thread_t        thread;
 
        while (count > 0) {
-               thread = (thread_t)(uintptr_t)queue_first(queue);
-               while (!queue_end(queue, (queue_entry_t)thread)) {
+               cqe_foreach_element_safe(thread, queue, runq_links) {
                        if (thread->bound_processor == PROCESSOR_NULL) {
-                               remqueue((queue_entry_t)thread);
+                               circle_dequeue(queue, &thread->runq_links);
 
                                thread->runq = PROCESSOR_NULL;
                                SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
@@ -670,7 +667,7 @@ sched_traditional_steal_processor_thread(processor_t processor)
                                if (SCHED(priority_is_urgent)(pri)) {
                                        rq->urgency--; assert(rq->urgency >= 0);
                                }
-                               if (queue_empty(queue)) {
+                               if (circle_queue_empty(queue)) {
                                        bitmap_clear(rq->bitmap, pri);
                                        rq->highq = bitmap_first(rq->bitmap, NRQS);
                                }
@@ -678,8 +675,6 @@ sched_traditional_steal_processor_thread(processor_t processor)
                                return thread;
                        }
                        count--;
-
-                       thread = (thread_t)(uintptr_t)queue_next((queue_entry_t)thread);
                }
 
                queue--; pri--;
index 258d323dbf8a5bdefeaef814c1fb6684e42afca7..d791567b0004072aa31198312bc6c6c6ddd34467 100644 (file)
@@ -77,6 +77,7 @@
 #include <machine/simple_lock.h>
 
 #ifdef  MACH_KERNEL_PRIVATE
+#include <machine/atomic.h>
 #include <mach_ldebug.h>
 
 extern void                     hw_lock_init(
@@ -141,61 +142,14 @@ extern void                     hw_lock_unlock_nopreempt(
 extern unsigned int             hw_lock_held(
        hw_lock_t);
 
+extern boolean_t hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait);
 #endif  /* MACH_KERNEL_PRIVATE */
 
 __BEGIN_DECLS
 
-extern uint32_t                 hw_atomic_add(
-       volatile uint32_t       *dest,
-       uint32_t        delt);
-
-extern uint32_t                 hw_atomic_sub(
-       volatile uint32_t       *dest,
-       uint32_t        delt);
-
-extern uint32_t                 hw_atomic_or(
-       volatile uint32_t       *dest,
-       uint32_t        mask);
-
-extern uint32_t                 hw_atomic_and(
-       volatile uint32_t       *dest,
-       uint32_t        mask);
-
-/*
- * Variant of hw_atomic_or which doesn't return a value; potentially
- * more efficient on some platforms.
- */
-extern void                     hw_atomic_or_noret(
-       volatile uint32_t       *dest,
-       uint32_t        mask);
-/*
- * Variant of hw_atomic_and which doesn't return a value; potentially
- * more efficient on some platforms.
- */
-
-extern void                     hw_atomic_and_noret(
-       volatile uint32_t       *dest,
-       uint32_t        mask);
-
-extern uint32_t                 hw_compare_and_store(
-       uint32_t        oldval,
-       uint32_t        newval,
-       volatile uint32_t       *dest);
-
-extern void                     hw_queue_atomic(
-       unsigned int *anchor,
-       unsigned int *elem,
-       unsigned int disp);
-
-extern void                     hw_queue_atomic_list(
-       unsigned int *anchor,
-       unsigned int *first,
-       unsigned int *last,
-       unsigned int disp);
-
-extern unsigned int             *hw_dequeue_atomic(
-       unsigned int *anchor,
-       unsigned int disp);
+extern void *                   hw_wait_while_equals(
+       void    **address,
+       void    *current);
 
 extern void                     usimple_lock_init(
        usimple_lock_t,
@@ -213,6 +167,19 @@ extern unsigned int             usimple_lock_try(
 extern void             usimple_lock_try_lock_loop(
        usimple_lock_t,
        lck_grp_t*);
+
+#if defined(__x86_64__)
+extern unsigned int     usimple_lock_try_lock_mp_signal_safe_loop_deadline(
+       usimple_lock_t,
+       uint64_t,
+       lck_grp_t*);
+
+extern unsigned int     usimple_lock_try_lock_mp_signal_safe_loop_duration(
+       usimple_lock_t,
+       uint64_t,
+       lck_grp_t*);
+#endif
+
 #else
 extern void                     usimple_lock(
        usimple_lock_t);
@@ -228,6 +195,18 @@ extern void             usimple_lock_try_lock_loop(
        usimple_lock_t);
 #define usimple_lock_try_lock_loop(lck, grp) usimple_lock_try_lock_loop(lck)
 
+#if defined(__x86_64__)
+extern unsigned int     usimple_lock_try_lock_mp_signal_safe_loop_deadline(
+       usimple_lock_t,
+       uint64_t);
+#define usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl, grp) usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl)
+
+extern unsigned int     usimple_lock_try_lock_mp_signal_safe_loop_duration(
+       usimple_lock_t,
+       uint64_t);
+#define usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur, grp) usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur)
+#endif
+
 #endif /* LOCK_STATS */
 
 extern void                     usimple_unlock(
@@ -250,9 +229,73 @@ __END_DECLS
 #define simple_unlock(l)        usimple_unlock(l)
 #define simple_lock_try(l, grp)      usimple_lock_try(l, grp)
 #define simple_lock_try_lock_loop(l, grp)    usimple_lock_try_lock_loop(l, grp)
+#define simple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp)    usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp)
+#define simple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp)    usimple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp)
 #define simple_lock_addr(l)     (&(l))
 #endif /* !defined(simple_lock_init) */
 
+#ifdef MACH_KERNEL_PRIVATE
+
+typedef uint32_t hw_lock_bit_t;
+
+#if LOCK_STATS
+extern void     hw_lock_bit(
+       hw_lock_bit_t *,
+       unsigned int,
+       lck_grp_t*);
+
+extern void     hw_lock_bit_nopreempt(
+       hw_lock_bit_t *,
+       unsigned int,
+       lck_grp_t*);
+
+extern unsigned int hw_lock_bit_try(
+       hw_lock_bit_t *,
+       unsigned int,
+       lck_grp_t*);
+
+extern unsigned int hw_lock_bit_to(
+       hw_lock_bit_t *,
+       unsigned int,
+       uint32_t,
+       lck_grp_t*);
+
+#else
+extern void     hw_lock_bit(
+       hw_lock_bit_t *,
+       unsigned int);
+#define hw_lock_bit(lck, bit, grp) hw_lock_bit(lck, bit)
+
+extern void     hw_lock_bit_nopreempt(
+       hw_lock_bit_t *,
+       unsigned int);
+#define hw_lock_bit_nopreempt(lck, bit, grp) hw_lock_bit_nopreempt(lck, bit)
+
+extern unsigned int hw_lock_bit_try(
+       hw_lock_bit_t *,
+       unsigned int);
+#define hw_lock_bit_try(lck, bit, grp) hw_lock_bit_try(lck, bit)
+
+extern unsigned int hw_lock_bit_to(
+       hw_lock_bit_t *,
+       unsigned int,
+       uint32_t);
+#define hw_lock_bit_to(lck, bit, timeout, grp) hw_lock_bit_to(lck, bit, timeout)
+
+#endif /* LOCK_STATS */
+
+extern void     hw_unlock_bit(
+       hw_lock_bit_t *,
+       unsigned int);
+
+extern void     hw_unlock_bit_nopreempt(
+       hw_lock_bit_t *,
+       unsigned int);
+
+#define hw_lock_bit_held(l, b) (((*(l))&(1<<b))!=0)
+
+#endif  /* MACH_KERNEL_PRIVATE */
+
 #endif /*!_KERN_SIMPLE_LOCK_H_*/
 
 #endif  /* KERNEL_PRIVATE */
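
The new hw_lock_bit*() and usimple_lock_try_lock_mp_signal_safe_loop_*() declarations follow this header's existing LOCK_STATS idiom: when stats are off, the function is declared without the lck_grp_t * parameter and a same-named function-like macro rewrites each call site to drop the group argument, so callers compile unchanged under either configuration. A condensed user-space model of the trick (names are illustrative):

    #include <stdio.h>

    typedef struct { const char *name; } lck_grp_t;

    #define LOCK_STATS 0

    #if LOCK_STATS
    void hw_lock_bit(unsigned *lck, unsigned bit, lck_grp_t *grp);
    #else
    void hw_lock_bit(unsigned *lck, unsigned bit);
    /* Callers still pass a group; the macro discards it when stats are off. */
    #define hw_lock_bit(lck, bit, grp) hw_lock_bit(lck, bit)
    #endif

    /* Parenthesizing the name keeps the macro from expanding here. */
    void
    (hw_lock_bit)(unsigned *lck, unsigned bit)
    {
        *lck |= 1u << bit;
    }

    int
    main(void)
    {
        unsigned lock_word = 0;
        lck_grp_t grp = { "demo" };

        hw_lock_bit(&lock_word, 3, &grp);  /* becomes hw_lock_bit(&lock_word, 3) */
        printf("lock word: 0x%x\n", lock_word);
        return 0;
    }
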
index 2874583643d8c1462f701c45ca7aa9ad48d31305..0481ddfdbf8f65681b257c07b408e90227abab68 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -55,7 +55,7 @@
  *     because stack_alloc_try/thread_invoke operate at splsched.
  */
 
-decl_simple_lock_data(static, stack_lock_data)
+decl_simple_lock_data(static, stack_lock_data);
 #define stack_lock()            simple_lock(&stack_lock_data, LCK_GRP_NULL)
 #define stack_unlock()          simple_unlock(&stack_lock_data)
 
index 199c83afc0381538cb26e421e3deb804613f545f..d01037df8ba9f2e4794892bb49fa0e91f5f215ca 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -67,7 +67,6 @@
  */
 
 #include <debug.h>
-#include <xpr_debug.h>
 #include <mach_kdp.h>
 
 #include <mach/boolean.h>
@@ -86,6 +85,7 @@
 #include <kern/ledger.h>
 #include <kern/machine.h>
 #include <kern/processor.h>
+#include <kern/restartable.h>
 #include <kern/sched_prim.h>
 #include <kern/turnstile.h>
 #if CONFIG_SCHED_SFI
@@ -98,7 +98,6 @@
 #if CONFIG_TELEMETRY
 #include <kern/telemetry.h>
 #endif
-#include <kern/xpr.h>
 #include <kern/zalloc.h>
 #include <kern/locks.h>
 #include <kern/debug.h>
@@ -220,11 +219,6 @@ unsigned int trace_wrap = 0;
 boolean_t trace_serial = FALSE;
 boolean_t early_boot_complete = FALSE;
 
-/* physically contiguous carveouts */
-SECURITY_READ_ONLY_LATE(uintptr_t) phys_carveout = 0;
-SECURITY_READ_ONLY_LATE(uintptr_t) phys_carveout_pa = 0;
-SECURITY_READ_ONLY_LATE(size_t) phys_carveout_size = 0;
-
 /* mach leak logging */
 int log_leaks = 0;
 
@@ -250,13 +244,6 @@ kernel_early_bootstrap(void)
                serverperfmode = 1;
        }
 
-       lck_mod_init();
-
-       /*
-        * Initialize the timer callout world
-        */
-       timer_call_init();
-
 #if CONFIG_SCHED_SFI
        /*
         * Configure SFI classes
@@ -387,6 +374,9 @@ kernel_bootstrap(void)
        kernel_bootstrap_log("thread_init");
        thread_init();
 
+       kernel_bootstrap_log("restartable_init");
+       restartable_init();
+
        kernel_bootstrap_log("workq_init");
        workq_init();
 
@@ -414,6 +404,9 @@ kernel_bootstrap(void)
        /* initialize host_statistics */
        host_statistics_init();
 
+       /* initialize exceptions */
+       exception_init();
+
        /*
         *      Create a kernel thread to execute the kernel bootstrap.
         */
@@ -431,6 +424,7 @@ kernel_bootstrap(void)
        /* TODO: do a proper thread_start() (without the thread_setrun()) */
        thread->state = TH_RUN;
        thread->last_made_runnable_time = mach_absolute_time();
+       thread_set_thread_name(thread, "kernel_bootstrap_thread");
 
        thread_deallocate(thread);
 
@@ -522,28 +516,12 @@ kernel_bootstrap_thread(void)
 #if (defined(__i386__) || defined(__x86_64__)) && NCOPY_WINDOWS > 0
        /*
         * Create and initialize the physical copy window for processor 0
-        * This is required before starting kicking off  IOKit.
+        * This is required before kicking off IOKit.
         */
        cpu_physwindow_init(0);
 #endif
 
-       if (PE_i_can_has_debugger(NULL)) {
-               unsigned int phys_carveout_mb = 0;
-               if (PE_parse_boot_argn("phys_carveout_mb", &phys_carveout_mb,
-                   sizeof(phys_carveout_mb)) && phys_carveout_mb > 0) {
-                       phys_carveout_size = phys_carveout_mb * 1024 * 1024;
-                       kern_return_t kr = kmem_alloc_contig(kernel_map,
-                           (vm_offset_t *)&phys_carveout, phys_carveout_size,
-                           VM_MAP_PAGE_MASK(kernel_map), 0, 0, KMA_NOPAGEWAIT,
-                           VM_KERN_MEMORY_DIAG);
-                       if (kr != KERN_SUCCESS) {
-                               kprintf("failed to allocate %uMB for phys_carveout_mb: %u\n",
-                                   phys_carveout_mb, (unsigned int)kr);
-                       } else {
-                               phys_carveout_pa = kvtophys((vm_offset_t)phys_carveout);
-                       }
-               }
-       }
+       phys_carveout_init();
 
 #if MACH_KDP
        kernel_bootstrap_log("kdp_init");
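
The debugger-gated carveout setup deleted above is consolidated behind the single phys_carveout_init() call (the phys_carveout globals removed from this file move with it). Reassembled from the deleted lines, the routine plausibly has this shape; XNU's actual implementation may differ, but every call below appears verbatim in the removed block:

    /* Sketch reconstructed from the deleted inline code. */
    static void
    phys_carveout_init(void)
    {
            if (!PE_i_can_has_debugger(NULL)) {
                    return;
            }

            unsigned int phys_carveout_mb = 0;
            if (!PE_parse_boot_argn("phys_carveout_mb", &phys_carveout_mb,
                sizeof(phys_carveout_mb)) || phys_carveout_mb == 0) {
                    return;
            }

            phys_carveout_size = phys_carveout_mb * 1024 * 1024;
            kern_return_t kr = kmem_alloc_contig(kernel_map,
                (vm_offset_t *)&phys_carveout, phys_carveout_size,
                VM_MAP_PAGE_MASK(kernel_map), 0, 0, KMA_NOPAGEWAIT,
                VM_KERN_MEMORY_DIAG);
            if (kr != KERN_SUCCESS) {
                    kprintf("failed to allocate %uMB for phys_carveout_mb: %u\n",
                        phys_carveout_mb, (unsigned int)kr);
                    return;
            }

            phys_carveout_pa = kvtophys((vm_offset_t)phys_carveout);
    }
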
@@ -700,6 +678,10 @@ kernel_bootstrap_thread(void)
        bsd_init();
 #endif
 
+#if defined (__x86_64__)
+       x86_64_protect_data_const();
+#endif
+
 
        /*
         * Get rid of pages used for early boot tracing.
@@ -730,6 +712,8 @@ kernel_bootstrap_thread(void)
  *     slave_main:
  *
  *     Load the first thread to start a processor.
+ *     This path will also be used by the master processor
+ *     after being offlined.
  */
 void
 slave_main(void *machine_param)
@@ -741,13 +725,19 @@ slave_main(void *machine_param)
         *      Use the idle processor thread if there
         *      is no dedicated start up thread.
         */
-       if (processor->next_thread == THREAD_NULL) {
+       if (processor->processor_offlined == true) {
+               /* Return to the saved processor_offline context */
+               assert(processor->startup_thread == THREAD_NULL);
+
                thread = processor->idle_thread;
-               thread->continuation = (thread_continue_t)processor_start_thread;
                thread->parameter = machine_param;
+       } else if (processor->startup_thread) {
+               thread = processor->startup_thread;
+               processor->startup_thread = THREAD_NULL;
        } else {
-               thread = processor->next_thread;
-               processor->next_thread = THREAD_NULL;
+               thread = processor->idle_thread;
+               thread->continuation = processor_start_thread;
+               thread->parameter = machine_param;
        }
 
        load_context(thread);
@@ -762,7 +752,8 @@ slave_main(void *machine_param)
  *     Called at splsched.
  */
 void
-processor_start_thread(void *machine_param)
+processor_start_thread(void *machine_param,
+    __unused wait_result_t result)
 {
        processor_t             processor = current_processor();
        thread_t                self = current_thread();
@@ -774,7 +765,7 @@ processor_start_thread(void *machine_param)
         *      reenter the idle loop, else terminate.
         */
        if (self == processor->idle_thread) {
-               thread_block((thread_continue_t)idle_thread);
+               thread_block(idle_thread);
        }
 
        thread_terminate(self);
@@ -785,6 +776,8 @@ processor_start_thread(void *machine_param)
  *     load_context:
  *
  *     Start the first thread on a processor.
+ *     This may be the first thread ever run on a processor, or
+ *     the first run after the processor was previously offlined.
  */
 static void __attribute__((noreturn))
 load_context(
@@ -799,7 +792,6 @@ load_context(
        machine_set_current_thread(thread);
 
        load_context_kprintf("processor_up\n");
-       processor_up(processor);
 
        PMAP_ACTIVATE_KERNEL(processor->cpu_id);
 
@@ -822,7 +814,7 @@ load_context(
         * running for load calculations.
         */
        if (!(thread->state & TH_IDLE)) {
-               sched_run_incr(thread);
+               SCHED(run_count_incr)(thread);
        }
 
        processor->active_thread = thread;
@@ -834,6 +826,8 @@ load_context(
        processor->deadline = UINT64_MAX;
        thread->last_processor = processor;
 
+       processor_up(processor);
+
        processor->last_dispatch = mach_absolute_time();
        timer_start(&thread->system_timer, processor->last_dispatch);
        PROCESSOR_DATA(processor, thread_timer) = PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
index 898a5e8bb3e87326b0a500c3778f5244cb5fdaa2..062c9cb4beb28398db6bc0dd177c406ae3eb9824 100644 (file)
@@ -480,7 +480,7 @@ semaphore_signal_thread_trap(
         * pre-post the semaphore.
         */
        if (thread_name != MACH_PORT_NULL) {
-               thread = port_name_to_thread(thread_name);
+               thread = port_name_to_thread(thread_name, PORT_TO_THREAD_NONE);
                if (thread == THREAD_NULL) {
                        return KERN_INVALID_ARGUMENT;
                }
index 0bb52436d73abc386a749aa05255dc199f7906ec..ef23168835391a01e8c081191870736faf2bac36 100644 (file)
@@ -205,15 +205,20 @@ thread_switch(
        thread_t                        thread = THREAD_NULL;
        thread_t                        self = current_thread();
        mach_port_name_t                thread_name = args->thread_name;
-       int                                             option = args->option;
+       int                             option = args->option;
        mach_msg_timeout_t              option_time = args->option_time;
-       uint32_t                                scale_factor = NSEC_PER_MSEC;
-       boolean_t                               depress_option = FALSE;
-       boolean_t                               wait_option = FALSE;
+       uint32_t                        scale_factor = NSEC_PER_MSEC;
+       boolean_t                       depress_option = FALSE;
+       boolean_t                       wait_option = FALSE;
        wait_interrupt_t                interruptible = THREAD_ABORTSAFE;
+       port_to_thread_options_t        ptt_options = PORT_TO_THREAD_NOT_CURRENT_THREAD;
 
        /*
         *      Validate and process option.
+        *
+        * OSLock boosting only applies to other threads
+        * in your same task (even if you have a port for
+        * a thread in another task)
         */
        switch (option) {
        case SWITCH_OPTION_NONE:
@@ -232,10 +237,12 @@ thread_switch(
        case SWITCH_OPTION_OSLOCK_DEPRESS:
                depress_option = TRUE;
                interruptible |= THREAD_WAIT_NOREPORT;
+               ptt_options |= PORT_TO_THREAD_IN_CURRENT_TASK;
                break;
        case SWITCH_OPTION_OSLOCK_WAIT:
                wait_option = TRUE;
                interruptible |= THREAD_WAIT_NOREPORT;
+               ptt_options |= PORT_TO_THREAD_IN_CURRENT_TASK;
                break;
        default:
                return KERN_INVALID_ARGUMENT;
@@ -245,46 +252,21 @@ thread_switch(
         * Translate the port name if supplied.
         */
        if (thread_name != MACH_PORT_NULL) {
-               ipc_port_t port;
-
-               if (ipc_port_translate_send(self->task->itk_space,
-                   thread_name, &port) == KERN_SUCCESS) {
-                       ip_reference(port);
-                       ip_unlock(port);
-
-                       thread = convert_port_to_thread(port);
-                       ip_release(port);
-
-                       if (thread == self) {
-                               thread_deallocate(thread);
-                               thread = THREAD_NULL;
-                       }
-               }
+               thread = port_name_to_thread(thread_name, ptt_options);
        }
 
        if (option == SWITCH_OPTION_OSLOCK_DEPRESS || option == SWITCH_OPTION_OSLOCK_WAIT) {
                if (thread != THREAD_NULL) {
-                       if (thread->task != self->task) {
-                               /*
-                                * OSLock boosting only applies to other threads
-                                * in your same task (even if you have a port for
-                                * a thread in another task)
-                                */
-
-                               thread_deallocate(thread);
-                               thread = THREAD_NULL;
-                       } else {
-                               /*
-                                * Attempt to kick the lock owner up to our same IO throttling tier.
-                                * If the thread is currently blocked in throttle_lowpri_io(),
-                                * it will immediately break out.
-                                *
-                                * TODO: SFI break out?
-                                */
-                               int new_policy = proc_get_effective_thread_policy(self, TASK_POLICY_IO);
-
-                               set_thread_iotier_override(thread, new_policy);
-                       }
+                       /*
+                        * Attempt to kick the lock owner up to our same IO throttling tier.
+                        * If the thread is currently blocked in throttle_lowpri_io(),
+                        * it will immediately break out.
+                        *
+                        * TODO: SFI break out?
+                        */
+                       int new_policy = proc_get_effective_thread_policy(self, TASK_POLICY_IO);
+
+                       set_thread_iotier_override(thread, new_policy);
                }
        }
 
@@ -353,41 +335,6 @@ thread_yield_with_continuation(
 }
 
 
-/* Returns a +1 thread reference */
-thread_t
-port_name_to_thread_for_ulock(mach_port_name_t thread_name)
-{
-       thread_t thread = THREAD_NULL;
-       thread_t self = current_thread();
-
-       /*
-        * Translate the port name if supplied.
-        */
-       if (thread_name != MACH_PORT_NULL) {
-               ipc_port_t port;
-
-               if (ipc_port_translate_send(self->task->itk_space,
-                   thread_name, &port) == KERN_SUCCESS) {
-                       ip_reference(port);
-                       ip_unlock(port);
-
-                       thread = convert_port_to_thread(port);
-                       ip_release(port);
-
-                       if (thread == THREAD_NULL) {
-                               return thread;
-                       }
-
-                       if ((thread == self) || (thread->task != self->task)) {
-                               thread_deallocate(thread);
-                               thread = THREAD_NULL;
-                       }
-               }
-       }
-
-       return thread;
-}
-
 /* This function is called after an assert_wait(), therefore it must not
  * cause another wait until after the thread_run() or thread_block()
  *
@@ -531,6 +478,9 @@ thread_depress_expire(void      *p0,
 
        if (--thread->depress_timer_active == 0) {
                thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
+               if ((thread->state & TH_RUN) == TH_RUN) {
+                       thread->last_basepri_change_time = mach_absolute_time();
+               }
                thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
        }
 
@@ -579,6 +529,9 @@ thread_depress_abort_locked(thread_t thread)
        assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);
 
        thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
+       if ((thread->state & TH_RUN) == TH_RUN) {
+               thread->last_basepri_change_time = mach_absolute_time();
+       }
 
        thread_recompute_sched_pri(thread, SETPRI_LAZY);
 
index b6dee96ef5d40baf122cb1a4a95e3bbc1fcb2b5b..571530ed4bfba7b3af7968fa16bf2cfbc56109fa 100644 (file)
@@ -179,8 +179,8 @@ const mach_trap_t       mach_trap_table[MACH_TRAP_TABLE_COUNT] = {
 /* 73 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
 /* 74 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
 /* 75 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
-/* 76 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
-/* 77 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
+/* 76 */ MACH_TRAP(_kernelrpc_mach_port_type_trap, 3, 3, munge_wwww),
+/* 77 */ MACH_TRAP(_kernelrpc_mach_port_request_notification_trap, 7, 7, munge_wwwwwww),
 /* 78 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
 /* 79 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
 /* 80 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
@@ -200,7 +200,7 @@ const mach_trap_t       mach_trap_table[MACH_TRAP_TABLE_COUNT] = {
 /* 94 */ MACH_TRAP(mk_timer_cancel_trap, 2, 2, munge_ww),
 /* 95 */ MACH_TRAP(mk_timer_arm_leeway_trap, 4, 6, munge_wlll),
 /* traps 64 - 95 reserved (debo) */
-/* 96 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
+/* 96 */ MACH_TRAP(debug_control_port_for_pid, 3, 3, munge_www),
 /* 97 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
 /* 98 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
 /* 99 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
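
Traps 76, 77, and 96 come online in this table. Each MACH_TRAP() entry pairs the trap routine with argument counts and a munger that widens a packed 32-bit user argument list into the kernel's 64-bit slots (munge_www for three 32-bit words, munge_wwwwwww for seven, and so on). A toy model of the munge-then-dispatch flow, heavily simplified relative to the real table:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef void (*munge_fn)(uint64_t *args);

    /*
     * Toy "www" munger: widen three packed 32-bit words in place, last
     * argument first so earlier words are not clobbered.
     */
    static void
    munge_www(uint64_t *args)
    {
        for (int i = 2; i >= 0; i--) {
            uint32_t word;
            memcpy(&word, (char *)args + i * sizeof(uint32_t), sizeof(word));
            args[i] = word;
        }
    }

    struct mach_trap {
        int       arg_count;
        int       (*function)(uint64_t *args);
        munge_fn  munge;
    };

    static int
    demo_trap(uint64_t *args)
    {
        printf("trap(%llu, %llu, %llu)\n", (unsigned long long)args[0],
            (unsigned long long)args[1], (unsigned long long)args[2]);
        return 0;
    }

    /* Sparse table in the spirit of mach_trap_table; index = trap number. */
    static const struct mach_trap trap_table[] = {
        [96] = { 3, demo_trap, munge_www },
    };

    int
    main(void)
    {
        uint32_t user32_args[3] = { 7, 8, 9 };
        uint64_t kernel_args[3];

        memcpy(kernel_args, user32_args, sizeof(user32_args));
        trap_table[96].munge(kernel_args);
        return trap_table[96].function(kernel_args);
    }
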
@@ -315,8 +315,8 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = {
 /* 74 */ "kern_invalid",
 /* 75 */ "kern_invalid",
 /* 76 */ "kern_invalid",
-/* 77 */ "kern_invalid",
-/* 78 */ "kern_invalid",
+/* 77 */ "_kernelrpc_mach_port_type_trap",
+/* 78 */ "_kernelrpc_mach_port_request_notification_trap",
 /* 79 */ "kern_invalid",
 /* 80 */ "kern_invalid",
 /* 81 */ "kern_invalid",
@@ -335,7 +335,7 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = {
 /* 94 */ "mk_timer_cancel_trap",
 /* 95 */ "kern_invalid",
 /* traps 64 - 95 reserved (debo) */
-/* 96 */ "kern_invalid",
+/* 96 */ "debug_control_port_for_pid",
 /* 97 */ "kern_invalid",
 /* 98 */ "kern_invalid",
 /* 99 */ "kern_invalid",
index 708ef1787cdc8b522c091a6a5465b1b019d394e3..0374456e1090a163788a54542d0993dd1650b755 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <kern/exc_resource.h>
 #include <kern/machine.h>
 #include <kern/policy_internal.h>
+#include <kern/restartable.h>
 
 #include <corpses/task_corpse.h>
 #if CONFIG_TELEMETRY
 #include <vm/vm_pageout.h>
 #include <vm/vm_protos.h>
 #include <vm/vm_purgeable_internal.h>
+#include <vm/vm_compressor_pager.h>
 
 #include <sys/resource.h>
 #include <sys/signalvar.h> /* for coredump */
-
+#include <sys/bsdtask_info.h>
 /*
  * Exported interfaces
  */
 #include <libkern/OSAtomic.h>
 #include <libkern/section_keywords.h>
 
+#include <mach-o/loader.h>
+
 #if CONFIG_ATM
 #include <atm/atm_internal.h>
 #endif
@@ -216,10 +220,26 @@ SECURITY_READ_ONLY_LATE(struct _task_ledger_indices) task_ledgers __attribute__(
  .purgeable_nonvolatile = -1,
  .purgeable_volatile_compressed = -1,
  .purgeable_nonvolatile_compressed = -1,
+ .tagged_nofootprint = -1,
+ .tagged_footprint = -1,
+ .tagged_nofootprint_compressed = -1,
+ .tagged_footprint_compressed = -1,
  .network_volatile = -1,
  .network_nonvolatile = -1,
  .network_volatile_compressed = -1,
  .network_nonvolatile_compressed = -1,
+ .media_nofootprint = -1,
+ .media_footprint = -1,
+ .media_nofootprint_compressed = -1,
+ .media_footprint_compressed = -1,
+ .graphics_nofootprint = -1,
+ .graphics_footprint = -1,
+ .graphics_nofootprint_compressed = -1,
+ .graphics_footprint_compressed = -1,
+ .neural_nofootprint = -1,
+ .neural_footprint = -1,
+ .neural_nofootprint_compressed = -1,
+ .neural_footprint_compressed = -1,
  .platform_idle_wakeups = -1,
  .interrupt_wakeups = -1,
 #if !CONFIG_EMBEDDED
@@ -229,12 +249,15 @@ SECURITY_READ_ONLY_LATE(struct _task_ledger_indices) task_ledgers __attribute__(
  .cpu_time_billed_to_others = -1,
  .physical_writes = -1,
  .logical_writes = -1,
- .energy_billed_to_me = -1,
- .energy_billed_to_others = -1,
+ .logical_writes_to_external = -1,
+#if DEBUG || DEVELOPMENT
  .pages_grabbed = -1,
  .pages_grabbed_kern = -1,
  .pages_grabbed_iopl = -1,
- .pages_grabbed_upl = -1};
+ .pages_grabbed_upl = -1,
+#endif
+ .energy_billed_to_me = -1,
+ .energy_billed_to_others = -1};
 
 /* System sleep state */
 boolean_t tasks_suspend_state;
@@ -253,6 +276,7 @@ kern_return_t task_resume_internal(task_t);
 static kern_return_t task_start_halt_locked(task_t task, boolean_t should_mark_corpse);
 
 extern kern_return_t iokit_task_terminate(task_t task);
+extern void          iokit_task_app_suspended_changed(task_t task);
 
 extern kern_return_t exception_deliver(thread_t, exception_type_t, mach_exception_data_t, mach_msg_type_number_t, struct exception_action *, lck_mtx_t *);
 extern void bsd_copythreadname(void *dst_uth, void *src_uth);
@@ -293,7 +317,8 @@ uint64_t task_iomon_interval_secs;      /* Per-task I/O monitor interval in secs
 #define IO_TELEMETRY_DEFAULT_LIMIT              (10ll * 1024ll * 1024ll)
 int64_t io_telemetry_limit;                     /* Threshold to take a microstackshot (0 indicates I/O telemetry is turned off) */
 int64_t global_logical_writes_count = 0;        /* Global count for logical writes */
-static boolean_t global_update_logical_writes(int64_t);
+int64_t global_logical_writes_to_external_count = 0;        /* Global count for logical writes to external storage */
+static boolean_t global_update_logical_writes(int64_t, int64_t*);
 
 #define TASK_MAX_THREAD_LIMIT 256
 
@@ -309,6 +334,8 @@ int hwm_user_cores = 0; /* high watermark violations generate user core files */
 #endif
 
 #ifdef MACH_BSD
+extern uint32_t proc_platform(struct proc *);
+extern uint32_t proc_sdk(struct proc *);
 extern void     proc_getexecutableuuid(void *, unsigned char *, unsigned long);
 extern int      proc_pid(struct proc *p);
 extern int      proc_selfpid(void);
@@ -324,6 +351,7 @@ extern void     proc_memstat_terminated(struct proc* p, boolean_t set);
 extern void     memorystatus_on_ledger_footprint_exceeded(int warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal);
 extern void     memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal);
 extern boolean_t memorystatus_allowed_vm_map_fork(task_t task);
+extern uint64_t  memorystatus_available_memory_internal(proc_t p);
 
 #if DEVELOPMENT || DEBUG
 extern void memorystatus_abort_vm_map_fork(task_t);
@@ -337,9 +365,9 @@ extern void memorystatus_abort_vm_map_fork(task_t);
 int exc_resource_threads_enabled;
 #endif /* DEVELOPMENT || DEBUG */
 
-#if (DEVELOPMENT || DEBUG) && TASK_EXC_GUARD_DELIVER_CORPSE
-uint32_t task_exc_guard_default = TASK_EXC_GUARD_MP_DELIVER | TASK_EXC_GUARD_MP_CORPSE |
-    TASK_EXC_GUARD_VM_DELIVER | TASK_EXC_GUARD_VM_CORPSE;
+#if (DEVELOPMENT || DEBUG)
+uint32_t task_exc_guard_default = TASK_EXC_GUARD_MP_DELIVER | TASK_EXC_GUARD_MP_ONCE | TASK_EXC_GUARD_MP_CORPSE |
+    TASK_EXC_GUARD_VM_DELIVER | TASK_EXC_GUARD_VM_ONCE | TASK_EXC_GUARD_VM_CORPSE;
 #else
 uint32_t task_exc_guard_default = 0;
 #endif
@@ -351,7 +379,29 @@ static void task_wait_locked(task_t task, boolean_t until_not_runnable);
 static void task_release_locked(task_t task);
 
 static void task_synchronizer_destroy_all(task_t task);
+static os_ref_count_t
+task_add_turnstile_watchports_locked(
+       task_t                      task,
+       struct task_watchports      *watchports,
+       struct task_watchport_elem  **previous_elem_array,
+       ipc_port_t                  *portwatch_ports,
+       uint32_t                    portwatch_count);
+
+static os_ref_count_t
+task_remove_turnstile_watchports_locked(
+       task_t                 task,
+       struct task_watchports *watchports,
+       ipc_port_t             *port_freelist);
+
+static struct task_watchports *
+task_watchports_alloc_init(
+       task_t        task,
+       thread_t      thread,
+       uint32_t      count);
 
+static void
+task_watchports_deallocate(
+       struct task_watchports *watchports);
 
 void
 task_set_64bit(
@@ -453,8 +503,12 @@ task_set_platform_binary(
        task_lock(task);
        if (is_platform) {
                task->t_flags |= TF_PLATFORM;
+               /* set exc guard default behavior for first-party code */
+               task->task_exc_guard = (task_exc_guard_default & TASK_EXC_GUARD_ALL);
        } else {
                task->t_flags &= ~(TF_PLATFORM);
+               /* set exc guard default behavior for third-party code */
+               task->task_exc_guard = ((task_exc_guard_default >> TASK_EXC_GUARD_THIRD_PARTY_DEFAULT_SHIFT) & TASK_EXC_GUARD_ALL);
        }
        task_unlock(task);
 }
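task_exc_guard_default now packs two policies into one word: the low bits carry the default for platform (first-party) binaries, and a copy shifted up by TASK_EXC_GUARD_THIRD_PARTY_DEFAULT_SHIFT carries the third-party default. A sketch of the unpacking, with assumed constant values (the real definitions live elsewhere in the kernel headers):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical values for illustration only. */
#define TASK_EXC_GUARD_ALL                        0x3F
#define TASK_EXC_GUARD_THIRD_PARTY_DEFAULT_SHIFT  8

static uint32_t task_exc_guard_default = 0x1B1B; /* assumed packed default */

int main(void) {
    uint32_t first_party = task_exc_guard_default & TASK_EXC_GUARD_ALL;
    uint32_t third_party =
        (task_exc_guard_default >> TASK_EXC_GUARD_THIRD_PARTY_DEFAULT_SHIFT) &
        TASK_EXC_GUARD_ALL;
    printf("platform: 0x%x, third-party: 0x%x\n", first_party, third_party);
    return 0;
}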
@@ -496,6 +550,16 @@ task_set_dyld_info(
        task_unlock(task);
 }
 
+void
+task_set_mach_header_address(
+       task_t task,
+       mach_vm_address_t addr)
+{
+       task_lock(task);
+       task->mach_header_vm_address = addr;
+       task_unlock(task);
+}
+
 void
 task_atm_reset(__unused task_t task)
 {
@@ -541,54 +605,80 @@ task_clear_exec_copy_flag(task_t task)
        task->t_procflags &= ~TPF_EXEC_COPY;
 }
 
-/*
- * This wait event is t_procflags instead of t_flags because t_flags is volatile
- *
- * TODO: store the flags in the same place as the event
- * rdar://problem/28501994
- */
 event_t
 task_get_return_wait_event(task_t task)
 {
-       return (event_t)&task->t_procflags;
+       return (event_t)&task->returnwait_inheritor;
 }
 
 void
-task_clear_return_wait(task_t task)
+task_clear_return_wait(task_t task, uint32_t flags)
 {
-       task_lock(task);
-
-       task->t_flags &= ~TF_LRETURNWAIT;
-
-       if (task->t_flags & TF_LRETURNWAITER) {
+       if (flags & TCRW_CLEAR_INITIAL_WAIT) {
                thread_wakeup(task_get_return_wait_event(task));
-               task->t_flags &= ~TF_LRETURNWAITER;
        }
 
-       task_unlock(task);
+       if (flags & TCRW_CLEAR_FINAL_WAIT) {
+               is_write_lock(task->itk_space);
+
+               task->t_returnwaitflags &= ~TRW_LRETURNWAIT;
+               task->returnwait_inheritor = NULL;
+
+               if (task->t_returnwaitflags & TRW_LRETURNWAITER) {
+                       struct turnstile *turnstile = turnstile_prepare((uintptr_t) task_get_return_wait_event(task),
+                           NULL, TURNSTILE_NULL, TURNSTILE_ULOCK);
+
+                       waitq_wakeup64_all(&turnstile->ts_waitq,
+                           CAST_EVENT64_T(task_get_return_wait_event(task)),
+                           THREAD_AWAKENED, 0);
+
+                       turnstile_update_inheritor(turnstile, NULL,
+                           TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
+                       turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_HELD);
+
+                       turnstile_complete((uintptr_t) task_get_return_wait_event(task), NULL, NULL, TURNSTILE_ULOCK);
+                       turnstile_cleanup();
+                       task->t_returnwaitflags &= ~TRW_LRETURNWAITER;
+               }
+               is_write_unlock(task->itk_space);
+       }
 }
 
 void __attribute__((noreturn))
 task_wait_to_return(void)
 {
-       task_t task;
+       task_t task = current_task();
 
-       task = current_task();
-       task_lock(task);
+       is_write_lock(task->itk_space);
+
+       if (task->t_returnwaitflags & TRW_LRETURNWAIT) {
+               struct turnstile *turnstile = turnstile_prepare((uintptr_t) task_get_return_wait_event(task),
+                   NULL, TURNSTILE_NULL, TURNSTILE_ULOCK);
 
-       if (task->t_flags & TF_LRETURNWAIT) {
                do {
-                       task->t_flags |= TF_LRETURNWAITER;
-                       assert_wait(task_get_return_wait_event(task), THREAD_UNINT);
-                       task_unlock(task);
+                       task->t_returnwaitflags |= TRW_LRETURNWAITER;
+                       turnstile_update_inheritor(turnstile, task->returnwait_inheritor,
+                           (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
+
+                       waitq_assert_wait64(&turnstile->ts_waitq,
+                           CAST_EVENT64_T(task_get_return_wait_event(task)),
+                           THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
+
+                       is_write_unlock(task->itk_space);
+
+                       turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
 
                        thread_block(THREAD_CONTINUE_NULL);
 
-                       task_lock(task);
-               } while (task->t_flags & TF_LRETURNWAIT);
+                       is_write_lock(task->itk_space);
+               } while (task->t_returnwaitflags & TRW_LRETURNWAIT);
+
+               turnstile_complete((uintptr_t) task_get_return_wait_event(task), NULL, NULL, TURNSTILE_ULOCK);
        }
 
-       task_unlock(task);
+       is_write_unlock(task->itk_space);
+       turnstile_cleanup();
+
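The rewrite moves the return-wait from a plain assert_wait() keyed off t_procflags to a turnstile whose inheritor is task->returnwait_inheritor, so the waiting thread's priority can be pushed to the thread expected to clear the wait. Below is a user-space analogue of just the waiter/waker handshake (pthread condvar, no priority inheritance), where the two booleans mirror TRW_LRETURNWAIT and TRW_LRETURNWAITER under the itk_space lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t space_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  return_wait = PTHREAD_COND_INITIALIZER;
static bool lreturnwait = true;     /* task must wait before returning */
static bool lreturnwaiter = false;  /* someone is parked on the event */

static void *task_wait_to_return(void *arg) {
    (void)arg;
    pthread_mutex_lock(&space_lock);
    while (lreturnwait) {
        lreturnwaiter = true;
        pthread_cond_wait(&return_wait, &space_lock); /* drops the lock */
    }
    pthread_mutex_unlock(&space_lock);
    printf("waiter released\n");
    return NULL;
}

static void task_clear_return_wait(void) {
    pthread_mutex_lock(&space_lock);
    lreturnwait = false;
    if (lreturnwaiter) {
        pthread_cond_broadcast(&return_wait); /* waitq_wakeup64_all analogue */
        lreturnwaiter = false;
    }
    pthread_mutex_unlock(&space_lock);
}

int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, task_wait_to_return, NULL);
    task_clear_return_wait();
    pthread_join(t, NULL);
    return 0;
}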
 
 #if CONFIG_MACF
        /*
@@ -843,12 +933,18 @@ task_init(void)
         * Create the kernel task as the first task.
         */
 #ifdef __LP64__
-       if (task_create_internal(TASK_NULL, NULL, FALSE, TRUE, TRUE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS)
+       if (task_create_internal(TASK_NULL, NULL, FALSE, TRUE, TRUE, TF_NONE, TPF_NONE, TWF_NONE, &kernel_task) != KERN_SUCCESS)
 #else
-       if (task_create_internal(TASK_NULL, NULL, FALSE, FALSE, FALSE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS)
+       if (task_create_internal(TASK_NULL, NULL, FALSE, FALSE, FALSE, TF_NONE, TPF_NONE, TWF_NONE, &kernel_task) != KERN_SUCCESS)
 #endif
        { panic("task_init\n");}
 
+#if defined(HAS_APPLE_PAC)
+       kernel_task->rop_pid = KERNEL_ROP_ID;
+       // kernel_task never runs at EL0, but machine_thread_state_convert_from/to_user() relies on
+       // disable_user_jop to be false for kernel threads (e.g. in exception delivery on thread_exception_daemon)
+       ml_task_set_disable_user_jop(kernel_task, FALSE);
+#endif
 
        vm_map_deallocate(kernel_task->map);
        kernel_task->map = kernel_map;
@@ -997,15 +1093,33 @@ init_task_ledgers(void)
        task_ledgers.purgeable_nonvolatile = ledger_entry_add(t, "purgeable_nonvolatile", "physmem", "bytes");
        task_ledgers.purgeable_volatile_compressed = ledger_entry_add(t, "purgeable_volatile_compress", "physmem", "bytes");
        task_ledgers.purgeable_nonvolatile_compressed = ledger_entry_add(t, "purgeable_nonvolatile_compress", "physmem", "bytes");
+#if DEBUG || DEVELOPMENT
        task_ledgers.pages_grabbed = ledger_entry_add(t, "pages_grabbed", "physmem", "count");
        task_ledgers.pages_grabbed_kern = ledger_entry_add(t, "pages_grabbed_kern", "physmem", "count");
        task_ledgers.pages_grabbed_iopl = ledger_entry_add(t, "pages_grabbed_iopl", "physmem", "count");
        task_ledgers.pages_grabbed_upl = ledger_entry_add(t, "pages_grabbed_upl", "physmem", "count");
-
+#endif
+       task_ledgers.tagged_nofootprint = ledger_entry_add(t, "tagged_nofootprint", "physmem", "bytes");
+       task_ledgers.tagged_footprint = ledger_entry_add(t, "tagged_footprint", "physmem", "bytes");
+       task_ledgers.tagged_nofootprint_compressed = ledger_entry_add(t, "tagged_nofootprint_compressed", "physmem", "bytes");
+       task_ledgers.tagged_footprint_compressed = ledger_entry_add(t, "tagged_footprint_compressed", "physmem", "bytes");
        task_ledgers.network_volatile = ledger_entry_add(t, "network_volatile", "physmem", "bytes");
        task_ledgers.network_nonvolatile = ledger_entry_add(t, "network_nonvolatile", "physmem", "bytes");
        task_ledgers.network_volatile_compressed = ledger_entry_add(t, "network_volatile_compressed", "physmem", "bytes");
        task_ledgers.network_nonvolatile_compressed = ledger_entry_add(t, "network_nonvolatile_compressed", "physmem", "bytes");
+       task_ledgers.media_nofootprint = ledger_entry_add(t, "media_nofootprint", "physmem", "bytes");
+       task_ledgers.media_footprint = ledger_entry_add(t, "media_footprint", "physmem", "bytes");
+       task_ledgers.media_nofootprint_compressed = ledger_entry_add(t, "media_nofootprint_compressed", "physmem", "bytes");
+       task_ledgers.media_footprint_compressed = ledger_entry_add(t, "media_footprint_compressed", "physmem", "bytes");
+       task_ledgers.graphics_nofootprint = ledger_entry_add(t, "graphics_nofootprint", "physmem", "bytes");
+       task_ledgers.graphics_footprint = ledger_entry_add(t, "graphics_footprint", "physmem", "bytes");
+       task_ledgers.graphics_nofootprint_compressed = ledger_entry_add(t, "graphics_nofootprint_compressed", "physmem", "bytes");
+       task_ledgers.graphics_footprint_compressed = ledger_entry_add(t, "graphics_footprint_compressed", "physmem", "bytes");
+       task_ledgers.neural_nofootprint = ledger_entry_add(t, "neural_nofootprint", "physmem", "bytes");
+       task_ledgers.neural_footprint = ledger_entry_add(t, "neural_footprint", "physmem", "bytes");
+       task_ledgers.neural_nofootprint_compressed = ledger_entry_add(t, "neural_nofootprint_compressed", "physmem", "bytes");
+       task_ledgers.neural_footprint_compressed = ledger_entry_add(t, "neural_footprint_compressed", "physmem", "bytes");
+
 
        task_ledgers.platform_idle_wakeups = ledger_entry_add(t, "platform_idle_wakeups", "power",
            "count");
@@ -1045,6 +1159,7 @@ init_task_ledgers(void)
        task_ledgers.cpu_time_billed_to_others = ledger_entry_add(t, "cpu_time_billed_to_others", "sched", "ns");
        task_ledgers.physical_writes = ledger_entry_add(t, "physical_writes", "res", "bytes");
        task_ledgers.logical_writes = ledger_entry_add(t, "logical_writes", "res", "bytes");
+       task_ledgers.logical_writes_to_external = ledger_entry_add(t, "logical_writes_to_external", "res", "bytes");
        task_ledgers.energy_billed_to_me = ledger_entry_add(t, "energy_billed_to_me", "power", "nj");
        task_ledgers.energy_billed_to_others = ledger_entry_add(t, "energy_billed_to_others", "power", "nj");
 
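Each ledger_entry_add() call returns an index into the template, or a negative value on failure; the task_ledgers fields are pre-seeded to -1 in the initializer earlier in the diff and validated in the large "< 0" block that follows. A toy sketch of that registration convention:

#include <stdio.h>

#define MAX_ENTRIES 8

struct ledger_template {
    const char *names[MAX_ENTRIES];
    int         count;
};

/* Returns the new entry's index, or -1 on failure, matching the
 * convention the kernel checks after registration. */
static int ledger_entry_add(struct ledger_template *t, const char *name) {
    if (t->count == MAX_ENTRIES) {
        return -1;
    }
    t->names[t->count] = name;
    return t->count++;
}

int main(void) {
    struct ledger_template t = { .count = 0 };
    int neural_footprint = ledger_entry_add(&t, "neural_footprint");
    int logical_writes_to_external =
        ledger_entry_add(&t, "logical_writes_to_external");
    if (neural_footprint < 0 || logical_writes_to_external < 0) {
        fprintf(stderr, "couldn't register ledger entries\n");
        return 1;
    }
    printf("registered %d entries\n", t.count);
    return 0;
}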
@@ -1064,15 +1179,32 @@ init_task_ledgers(void)
            (task_ledgers.purgeable_nonvolatile < 0) ||
            (task_ledgers.purgeable_volatile_compressed < 0) ||
            (task_ledgers.purgeable_nonvolatile_compressed < 0) ||
+           (task_ledgers.tagged_nofootprint < 0) ||
+           (task_ledgers.tagged_footprint < 0) ||
+           (task_ledgers.tagged_nofootprint_compressed < 0) ||
+           (task_ledgers.tagged_footprint_compressed < 0) ||
            (task_ledgers.network_volatile < 0) ||
            (task_ledgers.network_nonvolatile < 0) ||
            (task_ledgers.network_volatile_compressed < 0) ||
            (task_ledgers.network_nonvolatile_compressed < 0) ||
+           (task_ledgers.media_nofootprint < 0) ||
+           (task_ledgers.media_footprint < 0) ||
+           (task_ledgers.media_nofootprint_compressed < 0) ||
+           (task_ledgers.media_footprint_compressed < 0) ||
+           (task_ledgers.graphics_nofootprint < 0) ||
+           (task_ledgers.graphics_footprint < 0) ||
+           (task_ledgers.graphics_nofootprint_compressed < 0) ||
+           (task_ledgers.graphics_footprint_compressed < 0) ||
+           (task_ledgers.neural_nofootprint < 0) ||
+           (task_ledgers.neural_footprint < 0) ||
+           (task_ledgers.neural_nofootprint_compressed < 0) ||
+           (task_ledgers.neural_footprint_compressed < 0) ||
            (task_ledgers.platform_idle_wakeups < 0) ||
            (task_ledgers.interrupt_wakeups < 0) ||
            (task_ledgers.cpu_time_billed_to_me < 0) || (task_ledgers.cpu_time_billed_to_others < 0) ||
            (task_ledgers.physical_writes < 0) ||
            (task_ledgers.logical_writes < 0) ||
+           (task_ledgers.logical_writes_to_external < 0) ||
            (task_ledgers.energy_billed_to_me < 0) ||
            (task_ledgers.energy_billed_to_others < 0)
            ) {
@@ -1090,15 +1222,32 @@ init_task_ledgers(void)
        ledger_track_credit_only(t, task_ledgers.purgeable_nonvolatile);
        ledger_track_credit_only(t, task_ledgers.purgeable_volatile_compressed);
        ledger_track_credit_only(t, task_ledgers.purgeable_nonvolatile_compressed);
+#if DEBUG || DEVELOPMENT
        ledger_track_credit_only(t, task_ledgers.pages_grabbed);
        ledger_track_credit_only(t, task_ledgers.pages_grabbed_kern);
        ledger_track_credit_only(t, task_ledgers.pages_grabbed_iopl);
        ledger_track_credit_only(t, task_ledgers.pages_grabbed_upl);
-
+#endif
+       ledger_track_credit_only(t, task_ledgers.tagged_nofootprint);
+       ledger_track_credit_only(t, task_ledgers.tagged_footprint);
+       ledger_track_credit_only(t, task_ledgers.tagged_nofootprint_compressed);
+       ledger_track_credit_only(t, task_ledgers.tagged_footprint_compressed);
        ledger_track_credit_only(t, task_ledgers.network_volatile);
        ledger_track_credit_only(t, task_ledgers.network_nonvolatile);
        ledger_track_credit_only(t, task_ledgers.network_volatile_compressed);
        ledger_track_credit_only(t, task_ledgers.network_nonvolatile_compressed);
+       ledger_track_credit_only(t, task_ledgers.media_nofootprint);
+       ledger_track_credit_only(t, task_ledgers.media_footprint);
+       ledger_track_credit_only(t, task_ledgers.media_nofootprint_compressed);
+       ledger_track_credit_only(t, task_ledgers.media_footprint_compressed);
+       ledger_track_credit_only(t, task_ledgers.graphics_nofootprint);
+       ledger_track_credit_only(t, task_ledgers.graphics_footprint);
+       ledger_track_credit_only(t, task_ledgers.graphics_nofootprint_compressed);
+       ledger_track_credit_only(t, task_ledgers.graphics_footprint_compressed);
+       ledger_track_credit_only(t, task_ledgers.neural_nofootprint);
+       ledger_track_credit_only(t, task_ledgers.neural_footprint);
+       ledger_track_credit_only(t, task_ledgers.neural_nofootprint_compressed);
+       ledger_track_credit_only(t, task_ledgers.neural_footprint_compressed);
 
        ledger_track_maximum(t, task_ledgers.phys_footprint, 60);
 #if MACH_ASSERT
@@ -1115,10 +1264,26 @@ init_task_ledgers(void)
                ledger_panic_on_negative(t, task_ledgers.purgeable_volatile_compressed);
                ledger_panic_on_negative(t, task_ledgers.purgeable_nonvolatile_compressed);
 
+               ledger_panic_on_negative(t, task_ledgers.tagged_nofootprint);
+               ledger_panic_on_negative(t, task_ledgers.tagged_footprint);
+               ledger_panic_on_negative(t, task_ledgers.tagged_nofootprint_compressed);
+               ledger_panic_on_negative(t, task_ledgers.tagged_footprint_compressed);
                ledger_panic_on_negative(t, task_ledgers.network_volatile);
                ledger_panic_on_negative(t, task_ledgers.network_nonvolatile);
                ledger_panic_on_negative(t, task_ledgers.network_volatile_compressed);
                ledger_panic_on_negative(t, task_ledgers.network_nonvolatile_compressed);
+               ledger_panic_on_negative(t, task_ledgers.media_nofootprint);
+               ledger_panic_on_negative(t, task_ledgers.media_footprint);
+               ledger_panic_on_negative(t, task_ledgers.media_nofootprint_compressed);
+               ledger_panic_on_negative(t, task_ledgers.media_footprint_compressed);
+               ledger_panic_on_negative(t, task_ledgers.graphics_nofootprint);
+               ledger_panic_on_negative(t, task_ledgers.graphics_footprint);
+               ledger_panic_on_negative(t, task_ledgers.graphics_nofootprint_compressed);
+               ledger_panic_on_negative(t, task_ledgers.graphics_footprint_compressed);
+               ledger_panic_on_negative(t, task_ledgers.neural_nofootprint);
+               ledger_panic_on_negative(t, task_ledgers.neural_footprint);
+               ledger_panic_on_negative(t, task_ledgers.neural_nofootprint_compressed);
+               ledger_panic_on_negative(t, task_ledgers.neural_footprint_compressed);
        }
 #endif /* MACH_ASSERT */
 
@@ -1129,7 +1294,6 @@ init_task_ledgers(void)
        ledger_set_callback(t, task_ledgers.interrupt_wakeups,
            task_wakeups_rate_exceeded, NULL, NULL);
        ledger_set_callback(t, task_ledgers.physical_writes, task_io_rate_exceeded, (void *)FLAVOR_IO_PHYSICAL_WRITES, NULL);
-       ledger_set_callback(t, task_ledgers.logical_writes, task_io_rate_exceeded, (void *)FLAVOR_IO_LOGICAL_WRITES, NULL);
 
        ledger_template_complete(t);
        task_ledger_template = t;
@@ -1146,6 +1310,7 @@ task_create_internal(
        boolean_t is_64bit_data,
        uint32_t        t_flags,
        uint32_t        t_procflags,
+       uint8_t         t_returnwaitflags,
        task_t          *child_task)            /* OUT */
 {
        task_t                  new_task;
@@ -1169,6 +1334,10 @@ task_create_internal(
                return KERN_RESOURCE_SHORTAGE;
        }
 
+#if defined(HAS_APPLE_PAC)
+       ml_task_set_rop_pid(new_task, parent_task, inherit_memory);
+       ml_task_set_disable_user_jop(new_task, inherit_memory ? parent_task->disable_user_jop : FALSE);
+#endif
 
        new_task->ledger = ledger;
 
@@ -1180,7 +1349,8 @@ task_create_internal(
        if (!(t_flags & TF_CORPSE_FORK) && inherit_memory) {
                new_task->map = vm_map_fork(ledger, parent_task->map, 0);
        } else {
-               new_task->map = vm_map_create(pmap_create(ledger, 0, is_64bit),
+               unsigned int pmap_flags = is_64bit ? PMAP_CREATE_64BIT : 0;
+               new_task->map = vm_map_create(pmap_create_options(ledger, 0, pmap_flags),
                    (vm_map_offset_t)(VM_MIN_ADDRESS),
                    (vm_map_offset_t)(VM_MAX_ADDRESS), TRUE);
        }
@@ -1202,11 +1372,14 @@ task_create_internal(
        new_task->priv_flags = 0;
        new_task->t_flags = t_flags;
        new_task->t_procflags = t_procflags;
+       new_task->t_returnwaitflags = t_returnwaitflags;
+       new_task->returnwait_inheritor = current_thread();
        new_task->importance = 0;
        new_task->crashed_thread_id = 0;
        new_task->exec_token = 0;
-
-       new_task->task_exc_guard = task_exc_guard_default;
+       new_task->watchports = NULL;
+       new_task->restartable_ranges = NULL;
+       new_task->task_exc_guard = 0;
 
 #if CONFIG_ATM
        new_task->atm_context = NULL;
@@ -1308,6 +1481,7 @@ task_create_internal(
 
                new_task->all_image_info_addr = parent_task->all_image_info_addr;
                new_task->all_image_info_size = parent_task->all_image_info_size;
+               new_task->mach_header_vm_address = 0;
 
                if (inherit_memory && parent_task->affinity_space) {
                        task_affinity_create(parent_task, new_task);
@@ -1386,6 +1560,7 @@ task_create_internal(
                new_task->c_switch = 0;
                new_task->p_switch = 0;
                new_task->ps_switch = 0;
+               new_task->decompressions = 0;
                new_task->low_mem_notified_warn = 0;
                new_task->low_mem_notified_critical = 0;
                new_task->purged_memory_warn = 0;
@@ -1398,10 +1573,15 @@ task_create_internal(
                new_task->task_timer_wakeups_bin_1 = 0;
                new_task->task_timer_wakeups_bin_2 = 0;
                new_task->task_gpu_ns = 0;
-               new_task->task_immediate_writes = 0;
-               new_task->task_deferred_writes = 0;
-               new_task->task_invalidated_writes = 0;
-               new_task->task_metadata_writes = 0;
+               new_task->task_writes_counters_internal.task_immediate_writes = 0;
+               new_task->task_writes_counters_internal.task_deferred_writes = 0;
+               new_task->task_writes_counters_internal.task_invalidated_writes = 0;
+               new_task->task_writes_counters_internal.task_metadata_writes = 0;
+               new_task->task_writes_counters_external.task_immediate_writes = 0;
+               new_task->task_writes_counters_external.task_deferred_writes = 0;
+               new_task->task_writes_counters_external.task_invalidated_writes = 0;
+               new_task->task_writes_counters_external.task_metadata_writes = 0;
+
                new_task->task_energy = 0;
 #if MONOTONIC
                memset(&new_task->task_monotonic, 0, sizeof(new_task->task_monotonic));
@@ -1448,15 +1628,18 @@ task_create_internal(
                new_task->dispatchqueue_offset = parent_task->dispatchqueue_offset;
        }
 
+       new_task->task_can_transfer_memory_ownership = FALSE;
        new_task->task_volatile_objects = 0;
        new_task->task_nonvolatile_objects = 0;
-       new_task->task_purgeable_disowning = FALSE;
-       new_task->task_purgeable_disowned = FALSE;
+       new_task->task_objects_disowning = FALSE;
+       new_task->task_objects_disowned = FALSE;
+       new_task->task_owned_objects = 0;
        queue_init(&new_task->task_objq);
        task_objq_lock_init(new_task);
 
 #if __arm64__
        new_task->task_legacy_footprint = FALSE;
+       new_task->task_extra_footprint_limit = FALSE;
 #endif /* __arm64__ */
        new_task->task_region_footprint = FALSE;
        new_task->task_has_crossed_thread_limit = FALSE;
@@ -1476,6 +1659,7 @@ task_create_internal(
        new_task->t_flags &= ~(TF_DARKWAKE_MODE);
 
        queue_init(&new_task->io_user_clients);
+       new_task->loadTag = 0;
 
        ipc_task_enable(new_task);
 
@@ -1509,6 +1693,7 @@ task_rollup_accounting_info(task_t to_task, task_t from_task)
        to_task->faults = from_task->faults;
        to_task->pageins = from_task->pageins;
        to_task->cow_faults = from_task->cow_faults;
+       to_task->decompressions = from_task->decompressions;
        to_task->messages_sent = from_task->messages_sent;
        to_task->messages_received = from_task->messages_received;
        to_task->syscalls_mach = from_task->syscalls_mach;
@@ -1528,10 +1713,14 @@ task_rollup_accounting_info(task_t to_task, task_t from_task)
        to_task->task_timer_wakeups_bin_1 = from_task->task_timer_wakeups_bin_1;
        to_task->task_timer_wakeups_bin_2 = from_task->task_timer_wakeups_bin_2;
        to_task->task_gpu_ns = from_task->task_gpu_ns;
-       to_task->task_immediate_writes = from_task->task_immediate_writes;
-       to_task->task_deferred_writes = from_task->task_deferred_writes;
-       to_task->task_invalidated_writes = from_task->task_invalidated_writes;
-       to_task->task_metadata_writes = from_task->task_metadata_writes;
+       to_task->task_writes_counters_internal.task_immediate_writes = from_task->task_writes_counters_internal.task_immediate_writes;
+       to_task->task_writes_counters_internal.task_deferred_writes = from_task->task_writes_counters_internal.task_deferred_writes;
+       to_task->task_writes_counters_internal.task_invalidated_writes = from_task->task_writes_counters_internal.task_invalidated_writes;
+       to_task->task_writes_counters_internal.task_metadata_writes = from_task->task_writes_counters_internal.task_metadata_writes;
+       to_task->task_writes_counters_external.task_immediate_writes = from_task->task_writes_counters_external.task_immediate_writes;
+       to_task->task_writes_counters_external.task_deferred_writes = from_task->task_writes_counters_external.task_deferred_writes;
+       to_task->task_writes_counters_external.task_invalidated_writes = from_task->task_writes_counters_external.task_invalidated_writes;
+       to_task->task_writes_counters_external.task_metadata_writes = from_task->task_writes_counters_external.task_metadata_writes;
        to_task->task_energy = from_task->task_energy;
 
        /* Skip ledger roll up for memory accounting entries */
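The four per-task write counters are now grouped in a struct and duplicated for internal and external storage, so both sets go through the same accounting code. The rollup above copies field by field; a sketch of the layout using a struct assignment instead (fake task type, for illustration):

#include <stdint.h>
#include <stdio.h>

struct task_writes_counters {
    uint64_t task_immediate_writes;
    uint64_t task_deferred_writes;
    uint64_t task_invalidated_writes;
    uint64_t task_metadata_writes;
};

struct fake_task {
    struct task_writes_counters task_writes_counters_internal;
    struct task_writes_counters task_writes_counters_external;
};

int main(void) {
    struct fake_task from = {
        .task_writes_counters_internal = { .task_immediate_writes = 42 },
    };
    struct fake_task to = { 0 };
    /* one assignment per counter group, equivalent to the field copies */
    to.task_writes_counters_internal = from.task_writes_counters_internal;
    to.task_writes_counters_external = from.task_writes_counters_external;
    printf("%llu\n", (unsigned long long)
        to.task_writes_counters_internal.task_immediate_writes);
    return 0;
}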
@@ -1591,7 +1780,18 @@ task_deallocate(
                return;
        }
 
+       /*
+        * The task should be dead at this point. Ensure other resources
+        * like threads, are gone before we trash the world.
+        */
+       assert(queue_empty(&task->threads));
+       assert(task->bsd_info == NULL);
+       assert(!is_active(task->itk_space));
+       assert(!task->active);
+       assert(task->active_thread_count == 0);
+
        lck_mtx_lock(&tasks_threads_lock);
+       assert(terminated_tasks_count > 0);
        queue_remove(&terminated_tasks, task, task_t, tasks);
        terminated_tasks_count--;
        lck_mtx_unlock(&tasks_threads_lock);
@@ -1635,19 +1835,24 @@ task_deallocate(
        }
 #endif /* MACH_ASSERT */
 
-       vm_purgeable_disown(task);
-       assert(task->task_purgeable_disowned);
+       vm_owned_objects_disown(task);
+       assert(task->task_objects_disowned);
        if (task->task_volatile_objects != 0 ||
-           task->task_nonvolatile_objects != 0) {
+           task->task_nonvolatile_objects != 0 ||
+           task->task_owned_objects != 0) {
                panic("task_deallocate(%p): "
-                   "volatile_objects=%d nonvolatile_objects=%d\n",
+                   "volatile_objects=%d nonvolatile_objects=%d owned=%d\n",
                    task,
                    task->task_volatile_objects,
-                   task->task_nonvolatile_objects);
+                   task->task_nonvolatile_objects,
+                   task->task_owned_objects);
        }
 
        vm_map_deallocate(task->map);
        is_release(task->itk_space);
+       if (task->restartable_ranges) {
+               restartable_ranges_release(task->restartable_ranges);
+       }
 
        ledger_get_entries(task->ledger, task_ledgers.interrupt_wakeups,
            &interrupt_wakeups, &debit);
@@ -1898,7 +2103,7 @@ task_deliver_crash_notification(
        task_reference(task);
        task_port = convert_task_to_port(task);
        ip_lock(task_port);
-       assert(ip_active(task_port));
+       require_ip_active(task_port);
        ipc_port_nsrequest(task_port, task_port->ip_mscount, ipc_port_make_sonce_locked(task_port), &old_notify);
        /* port unlocked */
        assert(IP_NULL == old_notify);
@@ -2070,7 +2275,7 @@ task_port_notify(mach_msg_header_t *msg)
        ipc_port_t port = notification->not_header.msgh_remote_port;
        task_t task;
 
-       assert(ip_active(port));
+       require_ip_active(port);
        assert(IKOT_TASK == ip_kotype(port));
        task = (task_t) port->ip_kobject;
 
@@ -2417,6 +2622,11 @@ task_terminate_internal(
         */
        task_synchronizer_destroy_all(task);
 
+       /*
+        *      Clear the watchport boost on the task.
+        */
+       task_remove_turnstile_watchports(task);
+
        /*
         *      Destroy the IPC space, leaving just a reference for it.
         */
@@ -2806,6 +3016,12 @@ task_wait_locked(
        }
 }
 
+boolean_t
+task_is_app_suspended(task_t task)
+{
+       return task->pidsuspended;
+}
+
 /*
  *     task_release_locked:
  *
@@ -3103,6 +3319,11 @@ release_task_hold(
        return KERN_SUCCESS;
 }
 
+boolean_t
+get_task_suspended(task_t task)
+{
+       return 0 != task->user_stop_count;
+}
 
 /*
  *     task_suspend:
@@ -3124,7 +3345,7 @@ task_suspend(
        task_t          task)
 {
        kern_return_t                   kr;
-       mach_port_t                     port, send, old_notify;
+       mach_port_t                     port;
        mach_port_name_t                name;
 
        if (task == TASK_NULL || task == kernel_task) {
@@ -3133,43 +3354,23 @@ task_suspend(
 
        task_lock(task);
 
-       /*
-        * Claim a send right on the task resume port, and request a no-senders
-        * notification on that port (if none outstanding).
-        */
-       if (task->itk_resume == IP_NULL) {
-               task->itk_resume = ipc_port_alloc_kernel();
-               if (!IP_VALID(task->itk_resume)) {
-                       panic("failed to create resume port");
-               }
-               ipc_kobject_set(task->itk_resume, (ipc_kobject_t)task, IKOT_TASK_RESUME);
-       }
-
-       port = task->itk_resume;
-       ip_lock(port);
-       assert(ip_active(port));
-
-       send = ipc_port_make_send_locked(port);
-       assert(IP_VALID(send));
-
-       if (port->ip_nsrequest == IP_NULL) {
-               ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify);
-               assert(old_notify == IP_NULL);
-               /* port unlocked */
-       } else {
-               ip_unlock(port);
-       }
-
        /*
         * place a legacy hold on the task.
         */
        kr = place_task_hold(task, TASK_HOLD_LEGACY);
        if (kr != KERN_SUCCESS) {
                task_unlock(task);
-               ipc_port_release_send(send);
                return kr;
        }
 
+       /*
+        * Claim a send right on the task resume port, and request a no-senders
+        * notification on that port (if none outstanding).
+        */
+       (void)ipc_kobject_make_send_lazy_alloc_port(&task->itk_resume,
+           (ipc_kobject_t)task, IKOT_TASK_RESUME);
+       port = task->itk_resume;
+
        task_unlock(task);
 
        /*
@@ -3177,8 +3378,8 @@ task_suspend(
         * but we'll look it up when calling a traditional resume.  Any IPC operations that
         * deallocate the send right will auto-release the suspension.
         */
-       if ((kr = ipc_kmsg_copyout_object(current_task()->itk_space, (ipc_object_t)send,
-           MACH_MSG_TYPE_MOVE_SEND, &name)) != KERN_SUCCESS) {
+       if ((kr = ipc_kmsg_copyout_object(current_task()->itk_space, ip_to_object(port),
+           MACH_MSG_TYPE_MOVE_SEND, NULL, NULL, &name)) != KERN_SUCCESS) {
                printf("warning: %s(%d) failed to copyout suspension token for pid %d with error: %d\n",
                    proc_name_address(current_task()->bsd_info), proc_pid(current_task()->bsd_info),
                    task_pid(task), kr);
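Two things change in task_suspend(): the open-coded resume-port allocation plus no-senders request collapses into ipc_kobject_make_send_lazy_alloc_port(), and that call now happens after place_task_hold() succeeds, so a failed hold no longer has to release a freshly made send right. A user-space analogue of the lazy-allocation half (toy port type, a mutex standing in for the port and space locks):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct port { int srights; };   /* toy stand-in for a kobject port */

static pthread_mutex_t task_lock = PTHREAD_MUTEX_INITIALIZER;
static struct port *itk_resume = NULL;

/* Allocate the port the first time it is needed, then hand out one
 * more send right; later calls reuse the cached port. */
static struct port *make_send_lazy_alloc_port(struct port **slot) {
    pthread_mutex_lock(&task_lock);
    if (*slot == NULL) {
        *slot = calloc(1, sizeof(**slot));
    }
    (*slot)->srights++;
    pthread_mutex_unlock(&task_lock);
    return *slot;
}

int main(void) {
    struct port *p = make_send_lazy_alloc_port(&itk_resume);
    printf("send rights on resume port: %d\n", p->srights);
    free(p);
    return 0;
}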
@@ -3215,7 +3416,7 @@ task_resume(
 
        is_write_lock(space);
        if (is_active(space) && IP_VALID(task->itk_resume) &&
-           ipc_hash_lookup(space, (ipc_object_t)task->itk_resume, &resume_port_name, &resume_port_entry) == TRUE) {
+           ipc_hash_lookup(space, ip_to_object(task->itk_resume), &resume_port_name, &resume_port_entry) == TRUE) {
                /*
                 * We found a suspension token in the caller's IPC space. Release a send right to indicate that
                 * we are holding one less legacy hold on the task from this caller.  If the release failed,
@@ -3325,7 +3526,7 @@ task_resume2(
 boolean_t
 task_suspension_notify(mach_msg_header_t *request_header)
 {
-       ipc_port_t port = (ipc_port_t) request_header->msgh_remote_port;
+       ipc_port_t port = request_header->msgh_remote_port;
        task_t task = convert_port_to_task_suspension_token(port);
        mach_msg_type_number_t not_count;
 
@@ -3373,7 +3574,7 @@ task_suspension_notify(mach_msg_header_t *request_header)
        return TRUE;
 }
 
-kern_return_t
+static kern_return_t
 task_pidsuspend_locked(task_t task)
 {
        kern_return_t kr;
@@ -3418,6 +3619,10 @@ task_pidsuspend(
 
        task_unlock(task);
 
+       if ((KERN_SUCCESS == kr) && task->message_app_suspended) {
+               iokit_task_app_suspended_changed(task);
+       }
+
        return kr;
 }
 
@@ -3456,6 +3661,10 @@ task_pidresume(
 
        task_unlock(task);
 
+       if ((KERN_SUCCESS == kr) && task->message_app_suspended) {
+               iokit_task_app_suspended_changed(task);
+       }
+
 #if CONFIG_FREEZE
 
        task_lock(task);
@@ -3472,6 +3681,436 @@ task_pidresume(
        return kr;
 }
 
+os_refgrp_decl(static, task_watchports_refgrp, "task_watchports", NULL);
+
+/*
+ *     task_add_turnstile_watchports:
+ *             Setup watchports to boost the main thread of the task.
+ *
+ *     Arguments:
+ *             task: task being spawned
+ *             thread: main thread of task
+ *             portwatch_ports: array of watchports
+ *             portwatch_count: number of watchports
+ *
+ *     Conditions:
+ *             Nothing locked.
+ */
+void
+task_add_turnstile_watchports(
+       task_t          task,
+       thread_t        thread,
+       ipc_port_t      *portwatch_ports,
+       uint32_t        portwatch_count)
+{
+       struct task_watchports *watchports = NULL;
+       struct task_watchport_elem *previous_elem_array[TASK_MAX_WATCHPORT_COUNT] = {};
+       os_ref_count_t refs;
+
+       /* Check if the task has terminated */
+       if (!task->active) {
+               return;
+       }
+
+       assert(portwatch_count <= TASK_MAX_WATCHPORT_COUNT);
+
+       watchports = task_watchports_alloc_init(task, thread, portwatch_count);
+
+       /* Lock the ipc space */
+       is_write_lock(task->itk_space);
+
+       /* Setup watchports to boost the main thread */
+       refs = task_add_turnstile_watchports_locked(task,
+           watchports, previous_elem_array, portwatch_ports,
+           portwatch_count);
+
+       /* Drop the space lock */
+       is_write_unlock(task->itk_space);
+
+       if (refs == 0) {
+               task_watchports_deallocate(watchports);
+       }
+
+       /* Drop the ref on previous_elem_array */
+       for (uint32_t i = 0; i < portwatch_count && previous_elem_array[i] != NULL; i++) {
+               task_watchport_elem_deallocate(previous_elem_array[i]);
+       }
+}
+
+/*
+ *     task_remove_turnstile_watchports:
+ *             Clear all turnstile boost on the task from watchports.
+ *
+ *     Arguments:
+ *             task: task being terminated
+ *
+ *     Conditions:
+ *             Nothing locked.
+ */
+void
+task_remove_turnstile_watchports(
+       task_t          task)
+{
+       os_ref_count_t refs = TASK_MAX_WATCHPORT_COUNT;
+       struct task_watchports *watchports = NULL;
+       ipc_port_t port_freelist[TASK_MAX_WATCHPORT_COUNT] = {};
+       uint32_t portwatch_count;
+
+       /* Lock the ipc space */
+       is_write_lock(task->itk_space);
+
+       /* Check if a watchport boost exists */
+       if (task->watchports == NULL) {
+               is_write_unlock(task->itk_space);
+               return;
+       }
+       watchports = task->watchports;
+       portwatch_count = watchports->tw_elem_array_count;
+
+       refs = task_remove_turnstile_watchports_locked(task, watchports,
+           port_freelist);
+
+       is_write_unlock(task->itk_space);
+
+       /* Drop all the port references */
+       for (uint32_t i = 0; i < portwatch_count && port_freelist[i] != NULL; i++) {
+               ip_release(port_freelist[i]);
+       }
+
+       /* Clear the task and thread references for task_watchport */
+       if (refs == 0) {
+               task_watchports_deallocate(watchports);
+       }
+}
+
+/*
+ *     task_transfer_turnstile_watchports:
+ *             Transfer all watchport turnstile boost from old task to new task.
+ *
+ *     Arguments:
+ *             old_task: task calling exec
+ *             new_task: new exec'ed task
+ *             thread: main thread of new task
+ *
+ *     Conditions:
+ *             Nothing locked.
+ */
+void
+task_transfer_turnstile_watchports(
+       task_t   old_task,
+       task_t   new_task,
+       thread_t new_thread)
+{
+       struct task_watchports *old_watchports = NULL;
+       struct task_watchports *new_watchports = NULL;
+       os_ref_count_t old_refs = TASK_MAX_WATCHPORT_COUNT;
+       os_ref_count_t new_refs = TASK_MAX_WATCHPORT_COUNT;
+       uint32_t portwatch_count;
+
+       if (old_task->watchports == NULL || !new_task->active) {
+               return;
+       }
+
+       /* Get the watch port count from the old task */
+       is_write_lock(old_task->itk_space);
+       if (old_task->watchports == NULL) {
+               is_write_unlock(old_task->itk_space);
+               return;
+       }
+
+       portwatch_count = old_task->watchports->tw_elem_array_count;
+       is_write_unlock(old_task->itk_space);
+
+       new_watchports = task_watchports_alloc_init(new_task, new_thread, portwatch_count);
+
+       /* Lock the ipc space for old task */
+       is_write_lock(old_task->itk_space);
+
+       /* Lock the ipc space for new task */
+       is_write_lock(new_task->itk_space);
+
+       /* Check if a watchport boost exists */
+       if (old_task->watchports == NULL || !new_task->active) {
+               is_write_unlock(new_task->itk_space);
+               is_write_unlock(old_task->itk_space);
+               (void)task_watchports_release(new_watchports);
+               task_watchports_deallocate(new_watchports);
+               return;
+       }
+
+       old_watchports = old_task->watchports;
+       assert(portwatch_count == old_task->watchports->tw_elem_array_count);
+
+       /* Setup new task watchports */
+       new_task->watchports = new_watchports;
+
+       for (uint32_t i = 0; i < portwatch_count; i++) {
+               ipc_port_t port = old_watchports->tw_elem[i].twe_port;
+
+               if (port == NULL) {
+                       task_watchport_elem_clear(&new_watchports->tw_elem[i]);
+                       continue;
+               }
+
+               /* Lock the port and check if it has the entry */
+               ip_lock(port);
+               imq_lock(&port->ip_messages);
+
+               task_watchport_elem_init(&new_watchports->tw_elem[i], new_task, port);
+
+               if (ipc_port_replace_watchport_elem_conditional_locked(port,
+                   &old_watchports->tw_elem[i], &new_watchports->tw_elem[i]) == KERN_SUCCESS) {
+                       task_watchport_elem_clear(&old_watchports->tw_elem[i]);
+
+                       task_watchports_retain(new_watchports);
+                       old_refs = task_watchports_release(old_watchports);
+
+                       /* Check if all ports are cleaned */
+                       if (old_refs == 0) {
+                               old_task->watchports = NULL;
+                       }
+               } else {
+                       task_watchport_elem_clear(&new_watchports->tw_elem[i]);
+               }
+               /* mqueue and port unlocked by ipc_port_replace_watchport_elem_conditional_locked */
+       }
+
+       /* Drop the reference on new task_watchports struct returned by task_watchports_alloc_init */
+       new_refs = task_watchports_release(new_watchports);
+       if (new_refs == 0) {
+               new_task->watchports = NULL;
+       }
+
+       is_write_unlock(new_task->itk_space);
+       is_write_unlock(old_task->itk_space);
+
+       /* Clear the task and thread references for old_watchport */
+       if (old_refs == 0) {
+               task_watchports_deallocate(old_watchports);
+       }
+
+       /* Clear the task and thread references for new_watchport */
+       if (new_refs == 0) {
+               task_watchports_deallocate(new_watchports);
+       }
+}
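Each watchport element holds one reference on its task_watchports struct. The transfer loop above retains the new struct before releasing the old one for every successfully replaced element, so neither refcount can reach zero mid-transfer; whichever struct drains to zero is detached from its task and freed. A simplified analogue of that per-element handoff, without the locking:

#include <stdio.h>

struct container { int refs; };

static void retain(struct container *c)  { c->refs++; }
static int  release(struct container *c) { return --c->refs; }

/* Retain the destination before releasing the source, mirroring
 * task_watchports_retain(new) preceding task_watchports_release(old). */
static void transfer_one(struct container *oldc, struct container *newc) {
    retain(newc);
    if (release(oldc) == 0) {
        printf("old container fully drained\n");
    }
}

int main(void) {
    struct container oldc = { .refs = 2 }, newc = { .refs = 1 };
    transfer_one(&oldc, &newc);
    transfer_one(&oldc, &newc);
    printf("old=%d new=%d\n", oldc.refs, newc.refs);
    return 0;
}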
+
+/*
+ *     task_add_turnstile_watchports_locked:
+ *             Setup watchports to boost the main thread of the task.
+ *
+ *     Arguments:
+ *             task: task to boost
+ *             watchports: watchport structure to be attached to the task
+ *             previous_elem_array: an array of old watchport_elem to be returned to caller
+ *             portwatch_ports: array of watchports
+ *             portwatch_count: number of watchports
+ *
+ *     Conditions:
+ *             ipc space of the task locked.
+ *             returns array of old watchport_elem in previous_elem_array
+ */
+static os_ref_count_t
+task_add_turnstile_watchports_locked(
+       task_t                      task,
+       struct task_watchports      *watchports,
+       struct task_watchport_elem  **previous_elem_array,
+       ipc_port_t                  *portwatch_ports,
+       uint32_t                    portwatch_count)
+{
+       os_ref_count_t refs = TASK_MAX_WATCHPORT_COUNT;
+
+       /* Check if the task is still active */
+       if (!task->active) {
+               refs = task_watchports_release(watchports);
+               return refs;
+       }
+
+       assert(task->watchports == NULL);
+       task->watchports = watchports;
+
+       for (uint32_t i = 0, j = 0; i < portwatch_count; i++) {
+               ipc_port_t port = portwatch_ports[i];
+
+               task_watchport_elem_init(&watchports->tw_elem[i], task, port);
+               if (port == NULL) {
+                       task_watchport_elem_clear(&watchports->tw_elem[i]);
+                       continue;
+               }
+
+               ip_lock(port);
+               imq_lock(&port->ip_messages);
+
+               /* Check if the port is in a valid state to be set up as a watchport */
+               if (ipc_port_add_watchport_elem_locked(port, &watchports->tw_elem[i],
+                   &previous_elem_array[j]) != KERN_SUCCESS) {
+                       task_watchport_elem_clear(&watchports->tw_elem[i]);
+                       continue;
+               }
+               /* port and mqueue unlocked on return */
+
+               ip_reference(port);
+               task_watchports_retain(watchports);
+               if (previous_elem_array[j] != NULL) {
+                       j++;
+               }
+       }
+
+       /* Drop the reference on task_watchport struct returned by os_ref_init */
+       refs = task_watchports_release(watchports);
+       if (refs == 0) {
+               task->watchports = NULL;
+       }
+
+       return refs;
+}
+
+/*
+ *     task_remove_turnstile_watchports_locked:
+ *             Clear all turnstile boost on the task from watchports.
+ *
+ *     Arguments:
+ *             task: task to remove watchports from
+ *             watchports: watchports structure for the task
+ *             port_freelist: array of ports returned with ref to caller
+ *
+ *
+ *     Conditions:
+ *             ipc space of the task locked.
+ *             array of ports with refs are returned in port_freelist
+ */
+static os_ref_count_t
+task_remove_turnstile_watchports_locked(
+       task_t                 task,
+       struct task_watchports *watchports,
+       ipc_port_t             *port_freelist)
+{
+       os_ref_count_t refs = TASK_MAX_WATCHPORT_COUNT;
+
+       for (uint32_t i = 0, j = 0; i < watchports->tw_elem_array_count; i++) {
+               ipc_port_t port = watchports->tw_elem[i].twe_port;
+               if (port == NULL) {
+                       continue;
+               }
+
+               /* Lock the port and check if it has the entry */
+               ip_lock(port);
+               imq_lock(&port->ip_messages);
+               if (ipc_port_clear_watchport_elem_internal_conditional_locked(port,
+                   &watchports->tw_elem[i]) == KERN_SUCCESS) {
+                       task_watchport_elem_clear(&watchports->tw_elem[i]);
+                       port_freelist[j++] = port;
+                       refs = task_watchports_release(watchports);
+
+                       /* Check if all ports are cleaned */
+                       if (refs == 0) {
+                               task->watchports = NULL;
+                               break;
+                       }
+               }
+               /* mqueue and port unlocked by ipc_port_clear_watchport_elem_internal_conditional_locked */
+       }
+       return refs;
+}
+
+/*
+ *     task_watchports_alloc_init:
+ *             Allocate and initialize task watchport struct.
+ *
+ *     Conditions:
+ *             Nothing locked.
+ */
+static struct task_watchports *
+task_watchports_alloc_init(
+       task_t        task,
+       thread_t      thread,
+       uint32_t      count)
+{
+       struct task_watchports *watchports = kalloc(sizeof(struct task_watchports) +
+           count * sizeof(struct task_watchport_elem));
+
+       task_reference(task);
+       thread_reference(thread);
+       watchports->tw_task = task;
+       watchports->tw_thread = thread;
+       watchports->tw_elem_array_count = count;
+       os_ref_init(&watchports->tw_refcount, &task_watchports_refgrp);
+
+       return watchports;
+}
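task_watchports lives in a single allocation: the fixed header plus count elements, sized here and freed with the same arithmetic in task_watchports_deallocate() below. A standalone sketch of the layout using a C flexible array member (calloc standing in for kalloc, and without the task and thread references the kernel version takes):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct elem { void *port; };

struct watchports {
    uint32_t    refcount;
    uint32_t    count;
    struct elem tw_elem[];   /* flexible array member */
};

static struct watchports *watchports_alloc(uint32_t count) {
    struct watchports *w =
        calloc(1, sizeof(struct watchports) + count * sizeof(struct elem));
    if (w != NULL) {
        w->count = count;
        w->refcount = 1;     /* analogue of os_ref_init() */
    }
    return w;
}

int main(void) {
    struct watchports *w = watchports_alloc(4);
    if (w == NULL) {
        return 1;
    }
    printf("allocated %u elems in one block\n", w->count);
    free(w);
    return 0;
}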
+
+/*
+ *     task_watchports_deallocate:
+ *             Deallocate task watchport struct.
+ *
+ *     Conditions:
+ *             Nothing locked.
+ */
+static void
+task_watchports_deallocate(
+       struct task_watchports *watchports)
+{
+       uint32_t portwatch_count = watchports->tw_elem_array_count;
+
+       task_deallocate(watchports->tw_task);
+       thread_deallocate(watchports->tw_thread);
+       kfree(watchports, sizeof(struct task_watchports) + portwatch_count * sizeof(struct task_watchport_elem));
+}
+
+/*
+ *     task_watchport_elem_deallocate:
+ *             Deallocate task watchport element and release its ref on task_watchport.
+ *
+ *     Conditions:
+ *             Nothing locked.
+ */
+void
+task_watchport_elem_deallocate(
+       struct task_watchport_elem *watchport_elem)
+{
+       os_ref_count_t refs = TASK_MAX_WATCHPORT_COUNT;
+       task_t task = watchport_elem->twe_task;
+       struct task_watchports *watchports = NULL;
+       ipc_port_t port = NULL;
+
+       assert(task != NULL);
+
+       /* Take the space lock to modify the element */
+       is_write_lock(task->itk_space);
+
+       watchports = task->watchports;
+       assert(watchports != NULL);
+
+       port = watchport_elem->twe_port;
+       assert(port != NULL);
+
+       task_watchport_elem_clear(watchport_elem);
+       refs = task_watchports_release(watchports);
+
+       if (refs == 0) {
+               task->watchports = NULL;
+       }
+
+       is_write_unlock(task->itk_space);
+
+       ip_release(port);
+       if (refs == 0) {
+               task_watchports_deallocate(watchports);
+       }
+}
+
+/*
+ *     task_has_watchports:
+ *             Return TRUE if task has watchport boosts.
+ *
+ *     Conditions:
+ *             Nothing locked.
+ */
+boolean_t
+task_has_watchports(task_t task)
+{
+       return task->watchports != NULL;
+}
 
 #if DEVELOPMENT || DEBUG
 
@@ -3601,7 +4240,7 @@ task_freeze(
 
        task_unlock(task);
 
-       kr = vm_map_freeze(task->map,
+       kr = vm_map_freeze(task,
            purgeable_count,
            wired_count,
            clean_count,
@@ -4322,7 +4961,7 @@ task_info(
                        break;
                }
 
-               task_power_info_locked(task, (task_power_info_t)task_info_out, NULL, NULL);
+               task_power_info_locked(task, (task_power_info_t)task_info_out, NULL, NULL, NULL);
                break;
        }
 
@@ -4333,7 +4972,7 @@ task_info(
                        break;
                }
                task_power_info_v2_t tpiv2 = (task_power_info_v2_t) task_info_out;
-               task_power_info_locked(task, &tpiv2->cpu_energy, &tpiv2->gpu_energy, tpiv2);
+               task_power_info_locked(task, &tpiv2->cpu_energy, &tpiv2->gpu_energy, tpiv2, NULL);
                break;
        }
 
@@ -4343,6 +4982,39 @@ task_info(
                task_vm_info_t          vm_info;
                vm_map_t                map;
 
+#if __arm64__
+               struct proc *p;
+               uint32_t platform, sdk;
+               p = current_proc();
+               platform = proc_platform(p);
+               sdk = proc_sdk(p);
+               if (original_task_info_count > TASK_VM_INFO_REV2_COUNT &&
+                   platform == PLATFORM_IOS &&
+                   sdk != 0 &&
+                   (sdk >> 16) <= 12) {
+                       /*
+                        * Some iOS apps pass an incorrect value for
+                        * task_info_count, expressed in number of bytes
+                        * instead of number of "natural_t" elements.
+                        * For the sake of backwards binary compatibility
+                        * for apps built with an iOS12 or older SDK and using
+                        * the "rev2" data structure, let's fix task_info_count
+                        * for them, to avoid stomping past the actual end
+                        * of their buffer.
+                        */
+#if DEVELOPMENT || DEBUG
+                       printf("%s:%d %d[%s] rdar://49484582 task_info_count %d -> %d platform %d sdk %d.%d.%d\n", __FUNCTION__, __LINE__, proc_pid(p), proc_name_address(p), original_task_info_count, TASK_VM_INFO_REV2_COUNT, platform, (sdk >> 16), ((sdk >> 8) & 0xff), (sdk & 0xff));
+#endif /* DEVELOPMENT || DEBUG */
+                       DTRACE_VM4(workaround_task_vm_info_count,
+                           mach_msg_type_number_t, original_task_info_count,
+                           mach_msg_type_number_t, TASK_VM_INFO_REV2_COUNT,
+                           uint32_t, platform,
+                           uint32_t, sdk);
+                       original_task_info_count = TASK_VM_INFO_REV2_COUNT;
+                       *task_info_count = original_task_info_count;
+               }
+#endif /* __arm64__ */
+
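The workaround above caps an oversized task_info_count for old iOS binaries that passed sizeof(struct) in bytes where a natural_t element count was expected. A correct user-space caller on Apple platforms computes the count with the provided macro, as in this sketch:

#include <mach/mach.h>
#include <stdio.h>

int main(void) {
    task_vm_info_data_t info;
    /* Correct: the count is in natural_t units... */
    mach_msg_type_number_t count = TASK_VM_INFO_COUNT;
    /* ...the bug being papered over: passing a byte count instead. */
    mach_msg_type_number_t wrong = sizeof(task_vm_info_data_t);

    kern_return_t kr = task_info(mach_task_self(), TASK_VM_INFO,
        (task_info_t)&info, &count);
    printf("kr=%d count=%u (byte count would be %u)\n", kr, count, wrong);
    return 0;
}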
                if (*task_info_count < TASK_VM_INFO_REV0_COUNT) {
                        error = KERN_INVALID_ARGUMENT;
                        break;
@@ -4445,6 +5117,90 @@ task_info(
                        vm_info->max_address = map->max_offset;
                        *task_info_count = TASK_VM_INFO_REV2_COUNT;
                }
+               if (original_task_info_count >= TASK_VM_INFO_REV3_COUNT) {
+                       ledger_get_lifetime_max(task->ledger,
+                           task_ledgers.phys_footprint,
+                           &vm_info->ledger_phys_footprint_peak);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.purgeable_nonvolatile,
+                           &vm_info->ledger_purgeable_nonvolatile);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.purgeable_nonvolatile_compressed,
+                           &vm_info->ledger_purgeable_novolatile_compressed);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.purgeable_volatile,
+                           &vm_info->ledger_purgeable_volatile);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.purgeable_volatile_compressed,
+                           &vm_info->ledger_purgeable_volatile_compressed);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.network_nonvolatile,
+                           &vm_info->ledger_tag_network_nonvolatile);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.network_nonvolatile_compressed,
+                           &vm_info->ledger_tag_network_nonvolatile_compressed);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.network_volatile,
+                           &vm_info->ledger_tag_network_volatile);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.network_volatile_compressed,
+                           &vm_info->ledger_tag_network_volatile_compressed);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.media_footprint,
+                           &vm_info->ledger_tag_media_footprint);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.media_footprint_compressed,
+                           &vm_info->ledger_tag_media_footprint_compressed);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.media_nofootprint,
+                           &vm_info->ledger_tag_media_nofootprint);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.media_nofootprint_compressed,
+                           &vm_info->ledger_tag_media_nofootprint_compressed);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.graphics_footprint,
+                           &vm_info->ledger_tag_graphics_footprint);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.graphics_footprint_compressed,
+                           &vm_info->ledger_tag_graphics_footprint_compressed);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.graphics_nofootprint,
+                           &vm_info->ledger_tag_graphics_nofootprint);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.graphics_nofootprint_compressed,
+                           &vm_info->ledger_tag_graphics_nofootprint_compressed);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.neural_footprint,
+                           &vm_info->ledger_tag_neural_footprint);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.neural_footprint_compressed,
+                           &vm_info->ledger_tag_neural_footprint_compressed);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.neural_nofootprint,
+                           &vm_info->ledger_tag_neural_nofootprint);
+                       ledger_get_balance(task->ledger,
+                           task_ledgers.neural_nofootprint_compressed,
+                           &vm_info->ledger_tag_neural_nofootprint_compressed);
+                       *task_info_count = TASK_VM_INFO_REV3_COUNT;
+               }
+               if (original_task_info_count >= TASK_VM_INFO_REV4_COUNT) {
+                       if (task->bsd_info) {
+                               vm_info->limit_bytes_remaining =
+                                   memorystatus_available_memory_internal(task->bsd_info);
+                       } else {
+                               vm_info->limit_bytes_remaining = 0;
+                       }
+                       *task_info_count = TASK_VM_INFO_REV4_COUNT;
+               }
+               if (original_task_info_count >= TASK_VM_INFO_REV5_COUNT) {
+                       thread_t thread;
+                       integer_t total = task->decompressions;
+                       queue_iterate(&task->threads, thread, thread_t, task_threads) {
+                               total += thread->decompressions;
+                       }
+                       vm_info->decompressions = total;
+                       *task_info_count = TASK_VM_INFO_REV5_COUNT;
+               }
 
                if (task != kernel_task) {
                        vm_map_unlock_read(map);
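
[Editor's note: the REV2..REV5 blocks above implement TASK_VM_INFO's in/out size protocol: the caller advertises how many natural_t-sized integers its buffer holds, the kernel fills in as many revisions' worth of fields as both sides support, and writes the actual size back. A minimal user-space sketch of the calling side — ordinary Mach API usage, not code from this commit:]

        #include <mach/mach.h>
        #include <stdio.h>

        int
        main(void)
        {
                task_vm_info_data_t info;
                /* capacity in, number of integers actually filled out */
                mach_msg_type_number_t count = TASK_VM_INFO_COUNT;
                kern_return_t kr = task_info(mach_task_self(), TASK_VM_INFO,
                    (task_info_t)&info, &count);
                if (kr != KERN_SUCCESS) {
                        fprintf(stderr, "task_info: %s\n", mach_error_string(kr));
                        return 1;
                }
                printf("phys_footprint=%llu, %u ints returned\n",
                    (unsigned long long)info.phys_footprint, count);
                return 0;
        }
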
@@ -4546,6 +5302,7 @@ task_info(
        {
 #if DEVELOPMENT || DEBUG
                task_debug_info_internal_t dbg_info;
+               ipc_space_t space = task->itk_space;
                if (*task_info_count < TASK_DEBUG_INFO_INTERNAL_COUNT) {
                        error = KERN_NOT_SUPPORTED;
                        break;
@@ -4557,8 +5314,11 @@ task_info(
                }
                dbg_info = (task_debug_info_internal_t) task_info_out;
                dbg_info->ipc_space_size = 0;
-               if (task->itk_space) {
-                       dbg_info->ipc_space_size = task->itk_space->is_table_size;
+
+               if (space) {
+                       is_read_lock(space);
+                       dbg_info->ipc_space_size = space->is_table_size;
+                       is_read_unlock(space);
                }
 
                dbg_info->suspend_count = task->suspend_count;
@@ -4626,11 +5386,14 @@ task_power_info_locked(
        task_t                  task,
        task_power_info_t       info,
        gpu_energy_data_t       ginfo,
-       task_power_info_v2_t    infov2)
+       task_power_info_v2_t    infov2,
+       uint64_t                *runnable_time)
 {
        thread_t                thread;
        ledger_amount_t         tmp;
 
+       uint64_t                runnable_time_sum = 0;
+
        task_lock_assert_owned(task);
 
        ledger_get_entries(task->ledger, task_ledgers.interrupt_wakeups,
@@ -4643,6 +5406,7 @@ task_power_info_locked(
 
        info->total_user = task->total_user_time;
        info->total_system = task->total_system_time;
+       runnable_time_sum = task->total_runnable_time;
 
 #if CONFIG_EMBEDDED
        if (infov2) {
@@ -4696,12 +5460,20 @@ task_power_info_locked(
                        info->total_user += tval;
                }
 
+               tval = timer_grab(&thread->runnable_timer);
+
+               runnable_time_sum += tval;
+
                if (ginfo) {
                        ginfo->task_gpu_utilisation += ml_gpu_stat(thread);
                }
                thread_unlock(thread);
                splx(x);
        }
+
+       if (runnable_time) {
+               *runnable_time = runnable_time_sum;
+       }
 }
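
[Editor's note: task_power_info_locked() now folds each live thread's runnable timer into the task-wide total accumulated from already-terminated threads. A user-space analogue of that aggregation pattern, with illustrative names rather than xnu types:]

        #include <stdint.h>
        #include <stddef.h>

        /* A container-wide base counter plus per-element live counters,
         * summed on demand (mirrors total_runnable_time + per-thread timers). */
        struct worker { uint64_t runnable; struct worker *next; };
        struct pool   { uint64_t total_runnable; struct worker *workers; };

        static uint64_t
        pool_runnable_time(const struct pool *p)
        {
                uint64_t sum = p->total_runnable;   /* time from retired workers */
                for (const struct worker *w = p->workers; w != NULL; w = w->next) {
                        sum += w->runnable;         /* live workers' contribution */
                }
                return sum;
        }
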
 
 /*
@@ -5482,8 +6254,13 @@ task_set_phys_footprint_limit_internal(
        boolean_t memlimit_is_fatal)
 {
        ledger_amount_t old;
+       kern_return_t ret;
 
-       ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &old);
+       ret = ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &old);
+
+       if (ret != KERN_SUCCESS) {
+               return ret;
+       }
 
        /*
         * Check that limit >> 20 will not give an "unexpected" 32-bit
@@ -5550,8 +6327,13 @@ task_get_phys_footprint_limit(
        int *limit_mb)
 {
        ledger_amount_t limit;
+       kern_return_t ret;
+
+       ret = ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &limit);
+       if (ret != KERN_SUCCESS) {
+               return ret;
+       }
 
-       ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &limit);
        /*
         * Check that limit >> 20 will not give an "unexpected" signed, 32-bit
         * result. There are, however, implicit assumptions that -1 mb limit
@@ -5902,13 +6684,13 @@ SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS(void)
 }
 
 static boolean_t
-global_update_logical_writes(int64_t io_delta)
+global_update_logical_writes(int64_t io_delta, int64_t *global_write_count)
 {
        int64_t old_count, new_count;
        boolean_t needs_telemetry;
 
        do {
-               new_count = old_count = global_logical_writes_count;
+               new_count = old_count = *global_write_count;
                new_count += io_delta;
                if (new_count >= io_telemetry_limit) {
                        new_count = 0;
@@ -5916,7 +6698,7 @@ global_update_logical_writes(int64_t io_delta)
                } else {
                        needs_telemetry = FALSE;
                }
-       } while (!OSCompareAndSwap64(old_count, new_count, &global_logical_writes_count));
+       } while (!OSCompareAndSwap64(old_count, new_count, global_write_count));
        return needs_telemetry;
 }
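
[Editor's note: global_update_logical_writes() keeps its lock-free accumulate-and-reset loop, now parameterized by which global counter to update. The same pattern in portable C11 atomics, as a standalone sketch with illustrative names:]

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdint.h>

        /* Accumulate a delta into a shared counter; report (and reset) when
         * the running total crosses `limit`, mirroring the telemetry trigger. */
        static bool
        update_shared_count(_Atomic int64_t *counter, int64_t delta, int64_t limit)
        {
                int64_t old_count, new_count;
                bool needs_telemetry;

                do {
                        old_count = atomic_load(counter);
                        new_count = old_count + delta;
                        if (new_count >= limit) {
                                new_count = 0;
                                needs_telemetry = true;
                        } else {
                                needs_telemetry = false;
                        }
                } while (!atomic_compare_exchange_weak(counter, &old_count, new_count));
                return needs_telemetry;
        }
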
 
@@ -5924,7 +6706,10 @@ void
 task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp)
 {
        int64_t io_delta = 0;
+       int64_t * global_counter_to_update;
        boolean_t needs_telemetry = FALSE;
+       int ledger_to_update = 0;
+       struct task_writes_counters * writes_counters_to_update;
 
        if ((!task) || (!io_size) || (!vp)) {
                return;
@@ -5933,29 +6718,45 @@ task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp)
        KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_DATA_WRITE)) | DBG_FUNC_NONE,
            task_pid(task), io_size, flags, (uintptr_t)VM_KERNEL_ADDRPERM(vp), 0);
        DTRACE_IO4(logical_writes, struct task *, task, uint32_t, io_size, int, flags, vnode *, vp);
+
+       // Is the drive backing this vnode internal or external to the system?
+       if (vnode_isonexternalstorage(vp) == false) {
+               global_counter_to_update = &global_logical_writes_count;
+               ledger_to_update = task_ledgers.logical_writes;
+               writes_counters_to_update = &task->task_writes_counters_internal;
+       } else {
+               global_counter_to_update = &global_logical_writes_to_external_count;
+               ledger_to_update = task_ledgers.logical_writes_to_external;
+               writes_counters_to_update = &task->task_writes_counters_external;
+       }
+
        switch (flags) {
        case TASK_WRITE_IMMEDIATE:
-               OSAddAtomic64(io_size, (SInt64 *)&(task->task_immediate_writes));
-               ledger_credit(task->ledger, task_ledgers.logical_writes, io_size);
+               OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_immediate_writes));
+               ledger_credit(task->ledger, ledger_to_update, io_size);
+               coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size);
                break;
        case TASK_WRITE_DEFERRED:
-               OSAddAtomic64(io_size, (SInt64 *)&(task->task_deferred_writes));
-               ledger_credit(task->ledger, task_ledgers.logical_writes, io_size);
+               OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_deferred_writes));
+               ledger_credit(task->ledger, ledger_to_update, io_size);
+               coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size);
                break;
        case TASK_WRITE_INVALIDATED:
-               OSAddAtomic64(io_size, (SInt64 *)&(task->task_invalidated_writes));
-               ledger_debit(task->ledger, task_ledgers.logical_writes, io_size);
+               OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_invalidated_writes));
+               ledger_debit(task->ledger, ledger_to_update, io_size);
+               coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, FALSE, io_size);
                break;
        case TASK_WRITE_METADATA:
-               OSAddAtomic64(io_size, (SInt64 *)&(task->task_metadata_writes));
-               ledger_credit(task->ledger, task_ledgers.logical_writes, io_size);
+               OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_metadata_writes));
+               ledger_credit(task->ledger, ledger_to_update, io_size);
+               coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size);
                break;
        }
 
        io_delta = (flags == TASK_WRITE_INVALIDATED) ? ((int64_t)io_size * -1ll) : ((int64_t)io_size);
        if (io_telemetry_limit != 0) {
                /* If io_telemetry_limit is 0, disable global updates and I/O telemetry */
-               needs_telemetry = global_update_logical_writes(io_delta);
+               needs_telemetry = global_update_logical_writes(io_delta, global_counter_to_update);
                if (needs_telemetry) {
                        act_set_io_telemetry_ast(current_thread());
                }
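
[Editor's note: the routing logic above picks the counter set, ledger index, and global counter once, up front, so the switch body stays identical for internal and external storage. A minimal sketch of that select-then-act pattern, with hypothetical names:]

        #include <stdint.h>
        #include <stdbool.h>

        struct write_counters { uint64_t immediate, deferred; };

        /* Choose the destination once, then apply the update uniformly. */
        static void
        account_write(struct write_counters *internal,
            struct write_counters *external, bool is_external, uint64_t bytes)
        {
                struct write_counters *dst = is_external ? external : internal;
                dst->immediate += bytes;
        }
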
@@ -5975,18 +6776,12 @@ task_io_monitor_ctl(task_t task, uint32_t *flags)
                /* Configure the physical I/O ledger */
                ledger_set_limit(ledger, task_ledgers.physical_writes, (task_iomon_limit_mb * 1024 * 1024), 0);
                ledger_set_period(ledger, task_ledgers.physical_writes, (task_iomon_interval_secs * NSEC_PER_SEC));
-
-               /* Configure the logical I/O ledger */
-               ledger_set_limit(ledger, task_ledgers.logical_writes, (task_iomon_limit_mb * 1024 * 1024), 0);
-               ledger_set_period(ledger, task_ledgers.logical_writes, (task_iomon_interval_secs * NSEC_PER_SEC));
        } else if (*flags & IOMON_DISABLE) {
                /*
                 * Caller wishes to disable I/O monitor on the task.
                 */
                ledger_disable_refill(ledger, task_ledgers.physical_writes);
                ledger_disable_callback(ledger, task_ledgers.physical_writes);
-               ledger_disable_refill(ledger, task_ledgers.logical_writes);
-               ledger_disable_callback(ledger, task_ledgers.logical_writes);
        }
 
        task_unlock(task);
@@ -6023,9 +6818,6 @@ SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MUCH_IO(int flavor)
        case FLAVOR_IO_PHYSICAL_WRITES:
                ledger_get_entry_info(task->ledger, task_ledgers.physical_writes, &lei);
                break;
-       case FLAVOR_IO_LOGICAL_WRITES:
-               ledger_get_entry_info(task->ledger, task_ledgers.logical_writes, &lei);
-               break;
        }
 
 
@@ -6403,6 +7195,13 @@ task_could_use_secluded_mem(
 {
        return task->task_could_use_secluded_mem;
 }
+
+boolean_t
+task_could_also_use_secluded_mem(
+       task_t  task)
+{
+       return task->task_could_also_use_secluded_mem;
+}
 #endif /* CONFIG_SECLUDED_MEMORY */
 
 queue_head_t *
@@ -6411,6 +7210,12 @@ task_io_user_clients(task_t task)
        return &task->io_user_clients;
 }
 
+void
+task_set_message_app_suspended(task_t task, boolean_t enable)
+{
+       task->message_app_suspended = enable;
+}
+
 void
 task_copy_fields_for_exec(task_t dst_task, task_t src_task)
 {
@@ -6472,14 +7277,162 @@ task_get_darkwake_mode(task_t task)
        return (task->t_flags & TF_DARKWAKE_MODE) != 0;
 }
 
+kern_return_t
+task_get_exc_guard_behavior(
+       task_t task,
+       task_exc_guard_behavior_t *behaviorp)
+{
+       if (task == TASK_NULL) {
+               return KERN_INVALID_TASK;
+       }
+       *behaviorp = task->task_exc_guard;
+       return KERN_SUCCESS;
+}
+
+#ifndef TASK_EXC_GUARD_ALL
+/* Temporary define until two branches are merged */
+#define TASK_EXC_GUARD_ALL (TASK_EXC_GUARD_VM_ALL | 0xf0)
+#endif
+
+kern_return_t
+task_set_exc_guard_behavior(
+       task_t task,
+       task_exc_guard_behavior_t behavior)
+{
+       if (task == TASK_NULL) {
+               return KERN_INVALID_TASK;
+       }
+       if (behavior & ~TASK_EXC_GUARD_ALL) {
+               return KERN_INVALID_VALUE;
+       }
+       task->task_exc_guard = behavior;
+       return KERN_SUCCESS;
+}
+
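+[Editor's note: these kernel routines back the identically named MIG calls, so the EXC_GUARD behavior is also reachable from user space. A minimal sketch, assuming the routines are exported as in this release:]
+
+        #include <mach/mach.h>
+        #include <stdio.h>
+
+        int
+        main(void)
+        {
+                task_exc_guard_behavior_t behavior = 0;
+                kern_return_t kr;
+
+                /* Read, then echo back, the current EXC_GUARD delivery behavior. */
+                kr = task_get_exc_guard_behavior(mach_task_self(), &behavior);
+                if (kr != KERN_SUCCESS) {
+                        return 1;
+                }
+                printf("EXC_GUARD behavior: 0x%x\n", behavior);
+                kr = task_set_exc_guard_behavior(mach_task_self(), behavior);
+                return kr == KERN_SUCCESS ? 0 : 1;
+        }
+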
 #if __arm64__
+extern int legacy_footprint_entitlement_mode;
+extern void memorystatus_act_on_legacy_footprint_entitlement(proc_t, boolean_t);
+
 void
 task_set_legacy_footprint(
-       task_t          task,
-       boolean_t       new_val)
+       task_t task)
 {
        task_lock(task);
-       task->task_legacy_footprint = new_val;
+       task->task_legacy_footprint = TRUE;
+       task_unlock(task);
+}
+
+void
+task_set_extra_footprint_limit(
+       task_t task)
+{
+       if (task->task_extra_footprint_limit) {
+               return;
+       }
+       task_lock(task);
+       if (!task->task_extra_footprint_limit) {
+               memorystatus_act_on_legacy_footprint_entitlement(task->bsd_info, TRUE);
+               task->task_extra_footprint_limit = TRUE;
+       }
        task_unlock(task);
 }
 #endif /* __arm64__ */
+
+static inline ledger_amount_t
+task_ledger_get_balance(
+       ledger_t        ledger,
+       int             ledger_idx)
+{
+       ledger_amount_t amount;
+       amount = 0;
+       ledger_get_balance(ledger, ledger_idx, &amount);
+       return amount;
+}
+
+/*
+ * Gather the amount of memory counted in a task's footprint due to
+ * being in a specific set of ledgers.
+ */
+void
+task_ledgers_footprint(
+       ledger_t        ledger,
+       ledger_amount_t *ledger_resident,
+       ledger_amount_t *ledger_compressed)
+{
+       *ledger_resident = 0;
+       *ledger_compressed = 0;
+
+       /* purgeable non-volatile memory */
+       *ledger_resident += task_ledger_get_balance(ledger, task_ledgers.purgeable_nonvolatile);
+       *ledger_compressed += task_ledger_get_balance(ledger, task_ledgers.purgeable_nonvolatile_compressed);
+
+       /* "default" tagged memory */
+       *ledger_resident += task_ledger_get_balance(ledger, task_ledgers.tagged_footprint);
+       *ledger_compressed += task_ledger_get_balance(ledger, task_ledgers.tagged_footprint_compressed);
+
+       /* "network" currently never counts in the footprint... */
+
+       /* "media" tagged memory */
+       *ledger_resident += task_ledger_get_balance(ledger, task_ledgers.media_footprint);
+       *ledger_compressed += task_ledger_get_balance(ledger, task_ledgers.media_footprint_compressed);
+
+       /* "graphics" tagged memory */
+       *ledger_resident += task_ledger_get_balance(ledger, task_ledgers.graphics_footprint);
+       *ledger_compressed += task_ledger_get_balance(ledger, task_ledgers.graphics_footprint_compressed);
+
+       /* "neural" tagged memory */
+       *ledger_resident += task_ledger_get_balance(ledger, task_ledgers.neural_footprint);
+       *ledger_compressed += task_ledger_get_balance(ledger, task_ledgers.neural_footprint_compressed);
+}
+
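+[Editor's note: task_ledgers_footprint() deliberately returns the two components separately; callers that want the total contribution simply add them. A minimal in-kernel sketch — the helper name is hypothetical, the types are xnu's ledger types:]
+
+        static ledger_amount_t
+        task_footprint_total(ledger_t ledger)
+        {
+                ledger_amount_t resident = 0, compressed = 0;
+
+                task_ledgers_footprint(ledger, &resident, &compressed);
+                /* the charge is resident pages plus their compressed form */
+                return resident + compressed;
+        }
+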
+void
+task_set_memory_ownership_transfer(
+       task_t    task,
+       boolean_t value)
+{
+       task_lock(task);
+       task->task_can_transfer_memory_ownership = value;
+       task_unlock(task);
+}
+
+void
+task_copy_vmobjects(task_t task, vm_object_query_t query, int len, int64_t* num)
+{
+       vm_object_t find_vmo;
+       int64_t size = 0;
+
+       task_objq_lock(task);
+       if (query != NULL) {
+               queue_iterate(&task->task_objq, find_vmo, vm_object_t, task_objq)
+               {
+                       int byte_size;
+                       vm_object_query_t p = &query[size++];
+
+                       p->object_id = (vm_object_id_t) VM_KERNEL_ADDRPERM(find_vmo);
+                       p->virtual_size = find_vmo->internal ? find_vmo->vo_size : 0;
+                       p->resident_size = find_vmo->resident_page_count * PAGE_SIZE;
+                       p->wired_size = find_vmo->wired_page_count * PAGE_SIZE;
+                       p->reusable_size = find_vmo->reusable_page_count * PAGE_SIZE;
+                       p->vo_no_footprint = find_vmo->vo_no_footprint;
+                       p->vo_ledger_tag = find_vmo->vo_ledger_tag;
+                       p->purgable = find_vmo->purgable;
+
+                       if (find_vmo->internal && find_vmo->pager_created && find_vmo->pager != NULL) {
+                               p->compressed_size = vm_compressor_pager_get_count(find_vmo->pager) * PAGE_SIZE;
+                       } else {
+                               p->compressed_size = 0;
+                       }
+
+                       /* make sure not to overrun the caller's buffer */
+                       byte_size = (int) size * sizeof(vm_object_query_data_t);
+                       if ((int)(byte_size + sizeof(vm_object_query_data_t)) > len) {
+                               break;
+                       }
+               }
+       } else {
+               size = task->task_owned_objects;
+       }
+       task_objq_unlock(task);
+
+       *num = size;
+}
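+
+[Editor's note: task_copy_vmobjects() has two modes: with query == NULL it only reports task_owned_objects; otherwise it fills at most `len` bytes of entries. A hypothetical in-kernel caller using a small fixed buffer — a sketch, not code from this commit:]
+
+        static void
+        log_task_objects(task_t task)
+        {
+                vm_object_query_data_t sample[8];
+                int64_t total = 0, written = 0;
+
+                task_copy_vmobjects(task, NULL, 0, &total);   /* count only */
+                task_copy_vmobjects(task, sample, (int)sizeof(sample), &written);
+
+                printf("task owns %lld VM objects, sampled %lld\n",
+                    (long long)total, (long long)written);
+        }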
index 00f16e0e4576c30884a26ac8b8714b5eb6d9cf5c..1db8fb09d4bf01fd4767d227312532f933c44098 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -99,6 +99,7 @@
 #include <kern/kern_cdata.h>
 #include <mach/sfi_class.h>
 #include <kern/queue.h>
+#include <sys/kern_sysctl.h>
 #endif /* XNU_KERNEL_PRIVATE */
 
 #ifdef  MACH_KERNEL_PRIVATE
@@ -143,20 +144,32 @@ struct _cpu_time_qos_stats {
        uint64_t cpu_time_qos_user_interactive;
 };
 
+struct task_writes_counters {
+       uint64_t task_immediate_writes;
+       uint64_t task_deferred_writes;
+       uint64_t task_invalidated_writes;
+       uint64_t task_metadata_writes;
+};
+
+struct task_watchports;
 #include <bank/bank_internal.h>
 
 struct task {
        /* Synchronization/destruction information */
-       decl_lck_mtx_data(, lock)                /* Task's lock */
+       decl_lck_mtx_data(, lock);               /* Task's lock */
        os_refcnt_t     ref_count;      /* Number of references to me */
        boolean_t       active;         /* Task has not been terminated */
        boolean_t       halting;        /* Task is being halted */
+       boolean_t       message_app_suspended;  /* Let iokit know when pidsuspended */
+
        /* Virtual timers */
        uint32_t                vtimers;
 
        /* Miscellaneous */
        vm_map_t        map;            /* Address space description */
        queue_chain_t   tasks;  /* global list of tasks */
+       struct task_watchports *watchports; /* watchports passed in spawn */
+       turnstile_inheritor_t returnwait_inheritor; /* inheritor for task_wait */
 
 #if defined(CONFIG_SCHED_MULTIQ)
        sched_group_t sched_group;
@@ -164,6 +177,7 @@ struct task {
 
        /* Threads in this task */
        queue_head_t            threads;
+       struct restartable_ranges *restartable_ranges;
 
        processor_set_t         pset_hint;
        struct affinity_space   *affinity_space;
@@ -192,7 +206,7 @@ struct task {
        uint64_t                total_runnable_time;
 
        /* IPC structures */
-       decl_lck_mtx_data(, itk_lock_data)
+       decl_lck_mtx_data(, itk_lock_data);
        struct ipc_port *itk_self;      /* not a right, doesn't hold ref */
        struct ipc_port *itk_nself;     /* not a right, doesn't hold ref */
        struct ipc_port *itk_sself;     /* a send right */
@@ -221,6 +235,7 @@ struct task {
        MACHINE_TASK
 
        integer_t faults;              /* faults counter */
+       integer_t decompressions;      /* decompression counter */
        integer_t pageins;             /* pageins counter */
        integer_t cow_faults;          /* copy on write fault counter */
        integer_t messages_sent;       /* messages sent counter */
@@ -252,8 +267,6 @@ struct task {
 #define TF_CORPSE               0x00000020                              /* task is a corpse */
 #define TF_PENDING_CORPSE       0x00000040                              /* task corpse has not been reported yet */
 #define TF_CORPSE_FORK          0x00000080                              /* task is a forked corpse */
-#define TF_LRETURNWAIT          0x00000100                              /* task is waiting for fork/posix_spawn/exec to complete */
-#define TF_LRETURNWAITER        0x00000200                              /* task is waiting for TF_LRETURNWAIT to get cleared */
 #define TF_PLATFORM             0x00000400                              /* task is a platform binary */
 #define TF_CA_CLIENT_WI         0x00000800                              /* task has CA_CLIENT work interval */
 #define TF_DARKWAKE_MODE        0x00001000                              /* task is in darkwake mode */
@@ -311,6 +324,11 @@ struct task {
 #define task_is_exec_copy_internal(task)        \
        (((task)->t_procflags & TPF_EXEC_COPY) != 0)
 
+       uint8_t                  t_returnwaitflags;
+#define TWF_NONE                 0
+#define TRW_LRETURNWAIT          0x01           /* task is waiting for fork/posix_spawn/exec to complete */
+#define TRW_LRETURNWAITER        0x02           /* task is waiting for TRW_LRETURNWAIT to get cleared */
+
        mach_vm_address_t       all_image_info_addr; /* dyld __all_image_info     */
        mach_vm_size_t          all_image_info_size; /* section location and size */
 
@@ -373,10 +391,9 @@ struct task {
            memlimit_attrs_reserved             :28;     /* reserved for future use */
 
        io_stat_info_t          task_io_stats;
-       uint64_t                task_immediate_writes __attribute__((aligned(8)));
-       uint64_t                task_deferred_writes __attribute__((aligned(8)));
-       uint64_t                task_invalidated_writes __attribute__((aligned(8)));
-       uint64_t                task_metadata_writes __attribute__((aligned(8)));
+
+       struct task_writes_counters task_writes_counters_internal;
+       struct task_writes_counters task_writes_counters_external;
 
        /*
         * The cpu_time_qos_stats fields are protected by the task lock
@@ -395,18 +412,21 @@ struct task {
        struct mt_task task_monotonic;
 #endif /* MONOTONIC */
 
+       uint8_t         task_can_transfer_memory_ownership;
+       uint8_t         task_objects_disowning;
+       uint8_t         task_objects_disowned;
        /* # of purgeable volatile VM objects owned by this task: */
        int             task_volatile_objects;
        /* # of purgeable but not volatile VM objects owned by this task: */
        int             task_nonvolatile_objects;
-       boolean_t       task_purgeable_disowning;
-       boolean_t       task_purgeable_disowned;
+       int             task_owned_objects;
        queue_head_t    task_objq;
-       decl_lck_mtx_data(, task_objq_lock) /* protects "task_objq" */
+       decl_lck_mtx_data(, task_objq_lock); /* protects "task_objq" */
 
        unsigned int    task_thread_limit:16;
 #if __arm64__
        unsigned int    task_legacy_footprint:1;
+       unsigned int    task_extra_footprint_limit:1;
 #endif /* __arm64__ */
        unsigned int    task_region_footprint:1;
        unsigned int    task_has_crossed_thread_limit:1;
@@ -438,23 +458,20 @@ struct task {
        uint8_t task_suppressed_secluded;
 #endif /* CONFIG_SECLUDED_MEMORY */
 
-       uint32_t task_exc_guard;
+       task_exc_guard_behavior_t task_exc_guard;
 
        queue_head_t    io_user_clients;
-};
 
-#define TASK_EXC_GUARD_VM_DELIVER            0x01 /* Deliver virtual memory EXC_GUARD exceptions */
-#define TASK_EXC_GUARD_VM_ONCE               0x02 /* Deliver them only once */
-#define TASK_EXC_GUARD_VM_CORPSE             0x04 /* Deliver them via a forked corpse */
-#define TASK_EXC_GUARD_VM_FATAL              0x08 /* Virtual Memory EXC_GUARD delivery is fatal */
-#define TASK_EXC_GUARD_VM_ALL                0x0f
+       mach_vm_address_t mach_header_vm_address;
 
-#define TASK_EXC_GUARD_MP_DELIVER            0x10 /* Deliver mach port EXC_GUARD exceptions */
-#define TASK_EXC_GUARD_MP_ONCE               0x20 /* Deliver them only once */
-#define TASK_EXC_GUARD_MP_CORPSE             0x04 /* Deliver them via a forked corpse */
-#define TASK_EXC_GUARD_MP_FATAL              0x80 /* mach port EXC_GUARD delivery is fatal */
+       uint32_t loadTag; /* dext ID used for logging identity */
+};
 
-extern uint32_t task_exc_guard_default;
+/*
+ * EXC_GUARD default delivery behavior for optional Mach port and VM guards.
+ * Applied to new tasks at creation time.
+ */
+extern task_exc_guard_behavior_t task_exc_guard_default;
 
 extern kern_return_t
     task_violated_guard(mach_exception_code_t, mach_exception_subcode_t, void *);
@@ -475,6 +492,11 @@ extern kern_return_t
 #define itk_lock(task)          lck_mtx_lock(&(task)->itk_lock_data)
 #define itk_unlock(task)        lck_mtx_unlock(&(task)->itk_lock_data)
 
+/* task clear return wait flags */
+#define TCRW_CLEAR_INITIAL_WAIT   0x1
+#define TCRW_CLEAR_FINAL_WAIT     0x2
+#define TCRW_CLEAR_ALL_WAIT       (TCRW_CLEAR_INITIAL_WAIT | TCRW_CLEAR_FINAL_WAIT)
+
 #define TASK_REFERENCE_LEAK_DEBUG 0
 
 #if TASK_REFERENCE_LEAK_DEBUG
@@ -506,9 +528,49 @@ extern void             init_task_ledgers(void);
 #define current_task_fast()     (current_thread()->task)
 #define current_task()          current_task_fast()
 
+extern bool task_is_driver(task_t task);
+
 extern lck_attr_t      task_lck_attr;
 extern lck_grp_t       task_lck_grp;
 
+struct task_watchport_elem {
+       task_t                          twe_task;
+       ipc_port_t                      twe_port;     /* (Space lock) */
+};
+
+struct task_watchports {
+       os_refcnt_t                     tw_refcount;           /* (Space lock) */
+       task_t                          tw_task;               /* (Space lock) & tw_refcount == 0 */
+       thread_t                        tw_thread;             /* (Space lock) & tw_refcount == 0 */
+       uint32_t                        tw_elem_array_count;   /* (Space lock) */
+       struct task_watchport_elem      tw_elem[];             /* (Space lock) & (Portlock) & (mq lock) */
+};
+
+#define task_watchports_retain(x)   (os_ref_retain(&(x)->tw_refcount))
+#define task_watchports_release(x)  (os_ref_release(&(x)->tw_refcount))
+
+#define task_watchport_elem_init(elem, task, port) \
+do {                                               \
+       (elem)->twe_task = (task);                 \
+       (elem)->twe_port = (port);                 \
+} while(0)
+
+#define task_watchport_elem_clear(elem) task_watchport_elem_init((elem), NULL, NULL)
+
+extern void
+task_add_turnstile_watchports(
+       task_t          task,
+       thread_t        thread,
+       ipc_port_t      *portwatch_ports,
+       uint32_t        portwatch_count);
+
+extern void
+task_watchport_elem_deallocate(
+       struct          task_watchport_elem *watchport_elem);
+
+extern boolean_t
+task_has_watchports(task_t task);
+
 #else   /* MACH_KERNEL_PRIVATE */
 
 __BEGIN_DECLS
@@ -516,10 +578,19 @@ __BEGIN_DECLS
 extern task_t   current_task(void);
 
 extern void             task_reference(task_t   task);
+extern bool task_is_driver(task_t task);
 
 #define TF_NONE                 0
-#define TF_LRETURNWAIT          0x00000100                              /* task is waiting for fork/posix_spawn/exec to complete */
-#define TF_LRETURNWAITER        0x00000200                              /* task is waiting for TF_LRETURNWAIT to get cleared */
+
+#define TWF_NONE                 0
+#define TRW_LRETURNWAIT          0x01           /* task is waiting for fork/posix_spawn/exec to complete */
+#define TRW_LRETURNWAITER        0x02           /* task is waiting for TRW_LRETURNWAIT to get cleared */
+
+/* task clear return wait flags */
+#define TCRW_CLEAR_INITIAL_WAIT   0x1
+#define TCRW_CLEAR_FINAL_WAIT     0x2
+#define TCRW_CLEAR_ALL_WAIT       (TCRW_CLEAR_INITIAL_WAIT | TCRW_CLEAR_FINAL_WAIT)
+
 
 #define TPF_NONE                0
 #define TPF_EXEC_COPY           0x00000002                              /* task is the new copy of an exec */
@@ -531,6 +602,10 @@ __END_DECLS
 
 __BEGIN_DECLS
 
+#ifdef KERNEL_PRIVATE
+extern boolean_t                task_is_app_suspended(task_t task);
+#endif
+
 #ifdef  XNU_KERNEL_PRIVATE
 
 /* Hold all threads in a task */
@@ -553,8 +628,6 @@ extern kern_return_t    task_resume_internal(           task_t          task);
 /* Suspends a task by placing a hold on its threads */
 extern kern_return_t    task_pidsuspend(
        task_t          task);
-extern kern_return_t    task_pidsuspend_locked(
-       task_t          task);
 
 /* Resumes a previously paused task */
 extern kern_return_t    task_pidresume(
@@ -565,6 +638,14 @@ extern kern_return_t    task_send_trace_memory(
        uint32_t        pid,
        uint64_t        uniqueid);
 
+extern void             task_remove_turnstile_watchports(
+       task_t          task);
+
+extern void             task_transfer_turnstile_watchports(
+       task_t          old_task,
+       task_t          new_task,
+       thread_t        new_thread);
+
 #if DEVELOPMENT || DEBUG
 
 extern kern_return_t    task_disconnect_page_mappings(
@@ -612,6 +693,7 @@ extern kern_return_t    task_create_internal(
        boolean_t       is_64bit_data,
        uint32_t        flags,
        uint32_t        procflags,
+       uint8_t         t_returnwaitflags,
        task_t          *child_task);                                                   /* OUT */
 
 extern kern_return_t    task_info(
@@ -624,7 +706,8 @@ extern void             task_power_info_locked(
        task_t                  task,
        task_power_info_t       info,
        gpu_energy_data_t       gpu_energy,
-       task_power_info_v2_t    infov2);
+       task_power_info_v2_t    infov2,
+       uint64_t                *runnable_time);
 
 extern uint64_t         task_gpu_utilisation(
        task_t   task);
@@ -676,12 +759,14 @@ extern void             task_set_dyld_info(
        mach_vm_address_t addr,
        mach_vm_size_t size);
 
+extern void task_set_mach_header_address(
+       task_t task,
+       mach_vm_address_t addr);
+
 /* Get number of activations in a task */
 extern int              get_task_numacts(
        task_t          task);
 
-extern int get_task_numactivethreads(task_t task);
-
 struct label;
 extern kern_return_t task_collect_crash_info(
        task_t task,
@@ -694,6 +779,7 @@ void task_wait_till_threads_terminate_locked(task_t task);
 
 /* JMM - should just be temporary (implementation in bsd_kern still) */
 extern void     set_bsdtask_info(task_t, void *);
+extern uint32_t set_task_loadTag(task_t task, uint32_t loadTag);
 extern vm_map_t get_task_map_reference(task_t);
 extern vm_map_t swap_task_map(task_t, thread_t, vm_map_t);
 extern pmap_t   get_task_pmap(task_t);
@@ -710,6 +796,7 @@ extern uint64_t get_task_purgeable_size(task_t);
 extern uint64_t get_task_cpu_time(task_t);
 extern uint64_t get_task_dispatchqueue_offset(task_t);
 extern uint64_t get_task_dispatchqueue_serialno_offset(task_t);
+extern uint64_t get_task_dispatchqueue_label_offset(task_t);
 extern uint64_t get_task_uniqueid(task_t task);
 extern int      get_task_version(task_t task);
 
@@ -725,6 +812,7 @@ extern uint64_t get_task_page_table(task_t);
 extern uint64_t get_task_network_nonvolatile(task_t);
 extern uint64_t get_task_network_nonvolatile_compressed(task_t);
 extern uint64_t get_task_wired_mem(task_t);
+extern uint32_t get_task_loadTag(task_t task);
 
 extern kern_return_t task_convert_phys_footprint_limit(int, int *);
 extern kern_return_t task_set_phys_footprint_limit_internal(task_t, int, int *, boolean_t, boolean_t);
@@ -777,10 +865,26 @@ struct _task_ledger_indices {
        int purgeable_nonvolatile;
        int purgeable_volatile_compressed;
        int purgeable_nonvolatile_compressed;
+       int tagged_nofootprint;
+       int tagged_footprint;
+       int tagged_nofootprint_compressed;
+       int tagged_footprint_compressed;
        int network_volatile;
        int network_nonvolatile;
        int network_volatile_compressed;
        int network_nonvolatile_compressed;
+       int media_nofootprint;
+       int media_footprint;
+       int media_nofootprint_compressed;
+       int media_footprint_compressed;
+       int graphics_nofootprint;
+       int graphics_footprint;
+       int graphics_nofootprint_compressed;
+       int graphics_footprint_compressed;
+       int neural_nofootprint;
+       int neural_footprint;
+       int neural_nofootprint_compressed;
+       int neural_footprint_compressed;
        int platform_idle_wakeups;
        int interrupt_wakeups;
 #if CONFIG_SCHED_SFI
@@ -790,12 +894,15 @@ struct _task_ledger_indices {
        int cpu_time_billed_to_others;
        int physical_writes;
        int logical_writes;
+       int logical_writes_to_external;
        int energy_billed_to_me;
        int energy_billed_to_others;
+#if DEBUG || DEVELOPMENT
        int pages_grabbed;
        int pages_grabbed_kern;
        int pages_grabbed_iopl;
        int pages_grabbed_upl;
+#endif
 };
 extern struct _task_ledger_indices task_ledgers;
 
@@ -817,7 +924,7 @@ extern void task_set_32bit_log_flag(task_t task);
 #endif /* CONFIG_32BIT_TELEMETRY */
 extern boolean_t task_is_active(task_t task);
 extern boolean_t task_is_halting(task_t task);
-extern void task_clear_return_wait(task_t task);
+extern void task_clear_return_wait(task_t task, uint32_t flags);
 extern void task_wait_to_return(void) __attribute__((noreturn));
 extern event_t task_get_return_wait_event(task_t task);
 
@@ -825,6 +932,10 @@ extern void task_atm_reset(task_t task);
 extern void task_bank_reset(task_t task);
 extern void task_bank_init(task_t task);
 
+#if CONFIG_ARCADE
+extern void task_prep_arcade(task_t task, thread_t thread);
+#endif /* CONFIG_ARCADE */
+
 extern int task_pid(task_t task);
 extern boolean_t task_has_assertions(task_t task);
 /* End task_policy */
@@ -833,9 +944,12 @@ extern void      task_set_gpu_denied(task_t task, boolean_t denied);
 extern boolean_t task_is_gpu_denied(task_t task);
 
 extern queue_head_t * task_io_user_clients(task_t task);
+extern void     task_set_message_app_suspended(task_t task, boolean_t enable);
 
 extern void task_copy_fields_for_exec(task_t dst_task, task_t src_task);
 
+extern void task_copy_vmobjects(task_t task, vm_object_query_t query, int len, int64_t* num);
+
 #endif  /* XNU_KERNEL_PRIVATE */
 
 #ifdef  KERNEL_PRIVATE
@@ -847,6 +961,7 @@ extern vm_map_t get_task_map(task_t);
 extern ledger_t get_task_ledger(task_t);
 
 extern boolean_t get_task_pidsuspended(task_t);
+extern boolean_t get_task_suspended(task_t);
 extern boolean_t get_task_frozen(task_t);
 
 /* Convert from a task to a port */
@@ -860,10 +975,10 @@ extern task_suspension_token_t convert_port_to_task_suspension_token(ipc_port_t
 
 extern boolean_t task_suspension_notify(mach_msg_header_t *);
 
-#define TASK_WRITE_IMMEDIATE            0x1
-#define TASK_WRITE_DEFERRED             0x2
-#define TASK_WRITE_INVALIDATED          0x4
-#define TASK_WRITE_METADATA             0x8
+#define TASK_WRITE_IMMEDIATE                 0x1
+#define TASK_WRITE_DEFERRED                  0x2
+#define TASK_WRITE_INVALIDATED               0x4
+#define TASK_WRITE_METADATA                  0x8
 extern void     task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp);
 
 #if CONFIG_SECLUDED_MEMORY
@@ -880,19 +995,23 @@ extern boolean_t task_can_use_secluded_mem(
        task_t task,
        boolean_t is_allocate);
 extern boolean_t task_could_use_secluded_mem(task_t task);
+extern boolean_t task_could_also_use_secluded_mem(task_t task);
 #endif /* CONFIG_SECLUDED_MEMORY */
 
 extern void task_set_darkwake_mode(task_t, boolean_t);
 extern boolean_t task_get_darkwake_mode(task_t);
 
 #if __arm64__
-extern void task_set_legacy_footprint(task_t task, boolean_t new_val);
+extern void task_set_legacy_footprint(task_t task);
+extern void task_set_extra_footprint_limit(task_t task);
 #endif /* __arm64__ */
 
 #if CONFIG_MACF
 extern struct label *get_task_crash_label(task_t task);
 #endif /* CONFIG_MACF */
 
+extern int get_task_cdhash(task_t task, char cdhash[]);
+
 #endif  /* KERNEL_PRIVATE */
 
 extern task_t   kernel_task;
@@ -911,6 +1030,12 @@ extern void             task_suspension_token_deallocate(
 
 extern boolean_t task_self_region_footprint(void);
 extern void task_self_region_footprint_set(boolean_t newval);
+extern void task_ledgers_footprint(ledger_t ledger,
+    ledger_amount_t *ledger_resident,
+    ledger_amount_t *ledger_compressed);
+extern void task_set_memory_ownership_transfer(
+       task_t task,
+       boolean_t value);
 
 __END_DECLS
 
index 2faf0f7cd11db54be8e3dc72a13727f30169590d..e51b5b7bc90e49793a86652144ae1c706170e63a 100644 (file)
@@ -865,7 +865,7 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t
                        break;
                }
        } else {
-               /* Daemons get USER_INTERACTIVE squashed to USER_INITIATED */
+               /* Daemons and dexts get USER_INTERACTIVE squashed to USER_INITIATED */
                next.tep_qos_ceiling = THREAD_QOS_USER_INITIATED;
        }
 
@@ -1077,6 +1077,7 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t
        case TASK_APPTYPE_DAEMON_STANDARD:
        case TASK_APPTYPE_DAEMON_ADAPTIVE:
        case TASK_APPTYPE_DAEMON_BACKGROUND:
+       case TASK_APPTYPE_DRIVER:
        default:
                next.tep_live_donor = 0;
                break;
@@ -1907,8 +1908,8 @@ extern boolean_t ipc_importance_interactive_receiver;
  * TODO: Make this function more table-driven instead of ad-hoc
  */
 void
-proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role,
-    ipc_port_t * portwatch_ports, int portwatch_count)
+proc_set_task_spawnpolicy(task_t task, thread_t thread, int apptype, int qos_clamp, int role,
+    ipc_port_t * portwatch_ports, uint32_t portwatch_count)
 {
        struct task_pend_token pend_token = {};
 
@@ -1968,6 +1969,13 @@ proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role,
                task_importance_mark_denap_receiver(task, FALSE);
                break;
 
+       case TASK_APPTYPE_DRIVER:
+               task_importance_mark_donor(task, FALSE);
+               task_importance_mark_live_donor(task, FALSE);
+               task_importance_mark_receiver(task, FALSE);
+               task_importance_mark_denap_receiver(task, FALSE);
+               break;
+
        case TASK_APPTYPE_NONE:
                break;
        }
@@ -1975,10 +1983,10 @@ proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role,
        if (portwatch_ports != NULL && apptype == TASK_APPTYPE_DAEMON_ADAPTIVE) {
                int portwatch_boosts = 0;
 
-               for (int i = 0; i < portwatch_count; i++) {
+               for (uint32_t i = 0; i < portwatch_count; i++) {
                        ipc_port_t port = NULL;
 
-                       if ((port = portwatch_ports[i]) != NULL) {
+                       if (IP_VALID(port = portwatch_ports[i])) {
                                int boost = 0;
                                task_add_importance_watchport(task, port, &boost);
                                portwatch_boosts += boost;
@@ -1990,6 +1998,11 @@ proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp, int role,
                }
        }
 
+       /* Redirect the turnstile push of watchports to task */
+       if (portwatch_count && portwatch_ports != NULL) {
+               task_add_turnstile_watchports(task, thread, portwatch_ports, portwatch_count);
+       }
+
        task_lock(task);
 
        if (apptype == TASK_APPTYPE_APP_TAL) {
@@ -2065,6 +2078,7 @@ task_compute_main_thread_qos(task_t task)
        case TASK_APPTYPE_DAEMON_INTERACTIVE:
        case TASK_APPTYPE_DAEMON_STANDARD:
        case TASK_APPTYPE_DAEMON_ADAPTIVE:
+       case TASK_APPTYPE_DRIVER:
                primordial_qos = THREAD_QOS_LEGACY;
                break;
 
@@ -2117,6 +2131,15 @@ task_is_daemon(task_t task)
        }
 }
 
+bool
+task_is_driver(task_t task)
+{
+       if (!task) {
+               return FALSE;
+       }
+       return task->requested_policy.trp_apptype == TASK_APPTYPE_DRIVER;
+}
+
 boolean_t
 task_is_app(task_t task)
 {
index 7df3ce15b1e869122abbb1336b86202726fa982b..b723f0b7c4c7eb46c51e38e9760a05430333a628 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -106,7 +106,12 @@ uint32_t telemetry_timestamp = 0;
  * compute_averages().  It will notify its client (if one
  * exists) when it has enough data to be worth flushing.
  */
-struct micro_snapshot_buffer telemetry_buffer = {0, 0, 0, 0};
+struct micro_snapshot_buffer telemetry_buffer = {
+       .buffer = 0,
+       .size = 0,
+       .current_position = 0,
+       .end_point = 0
+};
 
 int                                     telemetry_bytes_since_last_mark = -1; // How much data since buf was last marked?
 int                                     telemetry_buffer_notify_at = 0;
@@ -478,20 +483,23 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro
         * buffer with the global telemetry lock held -- so we must do our (possibly faulting)
         * copies from userland here, before taking the lock.
         */
-       uintptr_t frames[MAX_CALLSTACK_FRAMES] = {};
-       bool user64;
-       int backtrace_error = backtrace_user(frames, MAX_CALLSTACK_FRAMES, &btcount, &user64);
+
+       uintptr_t frames[128];
+       bool user64_regs = false;
+       int backtrace_error = backtrace_user(frames,
+           sizeof(frames) / sizeof(frames[0]), &btcount, &user64_regs, NULL);
        if (backtrace_error) {
                return;
        }
+       bool user64_va = task_has_64Bit_addr(task);
 
        /*
         * Find the actual [slid] address of the shared cache's UUID, and copy it in from userland.
         */
-       int                                                     shared_cache_uuid_valid = 0;
-       uint64_t                                        shared_cache_base_address;
-       struct _dyld_cache_header       shared_cache_header;
-       uint64_t                                        shared_cache_slide;
+       int shared_cache_uuid_valid = 0;
+       uint64_t shared_cache_base_address = 0;
+       struct _dyld_cache_header shared_cache_header = {};
+       uint64_t shared_cache_slide = 0;
 
        /*
         * Don't copy in the entire shared cache header; we only need the UUID. Calculate the
@@ -516,15 +524,18 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro
         *
         * XXX - make this common with kdp?
         */
-       uint32_t                        uuid_info_count = 0;
-       mach_vm_address_t       uuid_info_addr = 0;
-       if (task_has_64Bit_addr(task)) {
+       uint32_t uuid_info_count = 0;
+       mach_vm_address_t uuid_info_addr = 0;
+       uint32_t uuid_info_size = 0;
+       if (user64_va) {
+               uuid_info_size = sizeof(struct user64_dyld_uuid_info);
                struct user64_dyld_all_image_infos task_image_infos;
                if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
                        uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
                        uuid_info_addr = task_image_infos.uuidArray;
                }
        } else {
+               uuid_info_size = sizeof(struct user32_dyld_uuid_info);
                struct user32_dyld_all_image_infos task_image_infos;
                if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
                        uuid_info_count = task_image_infos.uuidArrayCount;
@@ -549,7 +560,6 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro
                uuid_info_count = TELEMETRY_MAX_UUID_COUNT;
        }
 
-       uint32_t uuid_info_size = (uint32_t)(task_has_64Bit_addr(thread->task) ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
        uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
        char     *uuid_info_array = NULL;
 
@@ -579,10 +589,10 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro
        if (dqkeyaddr != 0) {
                uint64_t dqaddr = 0;
                uint64_t dq_serialno_offset = get_task_dispatchqueue_serialno_offset(task);
-               if ((copyin(dqkeyaddr, (char *)&dqaddr, (task_has_64Bit_addr(task) ? 8 : 4)) == 0) &&
+               if ((copyin(dqkeyaddr, (char *)&dqaddr, (user64_va ? 8 : 4)) == 0) &&
                    (dqaddr != 0) && (dq_serialno_offset != 0)) {
                        uint64_t dqserialnumaddr = dqaddr + dq_serialno_offset;
-                       if (copyin(dqserialnumaddr, (char *)&dqserialnum, (task_has_64Bit_addr(task) ? 8 : 4)) == 0) {
+                       if (copyin(dqserialnumaddr, (char *)&dqserialnum, (user64_va ? 8 : 4)) == 0) {
                                dqserialnum_valid = 1;
                        }
                }
@@ -694,7 +704,7 @@ copytobuffer:
        tsnap->latency_qos = task_grab_latency_qos(task);
 
        strlcpy(tsnap->p_comm, proc_name_address(p), sizeof(tsnap->p_comm));
-       if (task_has_64Bit_addr(thread->task)) {
+       if (user64_va) {
                tsnap->ss_flags |= kUser64_p;
        }
 
@@ -796,7 +806,7 @@ copytobuffer:
                current_buffer->current_position += sizeof(dqserialnum);
        }
 
-       if (user64) {
+       if (user64_regs) {
                framesize = 8;
                thsnap->ss_flags |= kUser64_p;
        } else {
@@ -1182,9 +1192,9 @@ bootprofile_timer_call(
        if (bootprofile_buffer_current_position < bootprofile_buffer_size) {
                uint32_t flags = STACKSHOT_KCDATA_FORMAT | STACKSHOT_TRYLOCK | STACKSHOT_SAVE_LOADINFO
                    | STACKSHOT_GET_GLOBAL_MEM_STATS;
-#if __x86_64__
+#if !CONFIG_EMBEDDED
                flags |= STACKSHOT_SAVE_KEXT_LOADINFO;
-#endif /* __x86_64__ */
+#endif
 
 
                /* OR on flags specified in boot-args */
index 2e1069bb900bf8cc5f655269756892d031ca0927..9f88ebefa21ba6bd6e54796d09a8d07188b28566 100644 (file)
@@ -648,6 +648,7 @@ struct lck_mtx_thread_arg {
        int my_locked;
        int* other_locked;
        thread_t other_thread;
+       int type;
 };
 
 static void
@@ -660,6 +661,8 @@ test_mtx_lock_unlock_contended_thread(
        thread_t other_thread;
        int* my_locked;
        int* other_locked;
+       int type;
+       uint64_t start, stop;
 
        printf("Starting thread %p\n", current_thread());
 
@@ -672,6 +675,7 @@ test_mtx_lock_unlock_contended_thread(
 
        my_locked = &info->my_locked;
        other_locked = info->other_locked;
+       type = info->type;
 
        *my_locked = 0;
        val = os_atomic_inc(&synch, relaxed);
@@ -682,19 +686,26 @@ test_mtx_lock_unlock_contended_thread(
        //warming up the test
        for (i = 0; i < WARMUP_ITER; i++) {
                lck_mtx_test_lock();
-
-               os_atomic_xchg(my_locked, 1, relaxed);
+               int prev = os_atomic_load(other_locked, relaxed);
+               os_atomic_add(my_locked, 1, relaxed);
                if (i != WARMUP_ITER - 1) {
-                       while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) {
-                               ;
+                       if (type == FULL_CONTENDED) {
+                               while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) {
+                                       ;
+                               }
+                       } else {
+                               start = mach_absolute_time();
+                               stop = start + (MutexSpin / 2);
+                               while (mach_absolute_time() < stop) {
+                                       ;
+                               }
                        }
-                       os_atomic_xchg(my_locked, 0, relaxed);
                }
 
                lck_mtx_test_unlock();
 
                if (i != WARMUP_ITER - 1) {
-                       while (os_atomic_load(other_locked, relaxed) == 0) {
+                       while (os_atomic_load(other_locked, relaxed) == prev) {
                                ;
                        }
                }
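
[Editor's note: the reworked handshake above snapshots the peer's progress counter while holding the lock, then spins until it advances — which works for both the fully and the partially contended variants, whereas the old 0/1 flag relied on the peer being descheduled. A pthread analogue of one iteration, with illustrative names:]

        #include <pthread.h>
        #include <stdatomic.h>

        static _Atomic int my_progress, peer_progress;

        static void
        contended_iteration(pthread_mutex_t *mtx)
        {
                pthread_mutex_lock(mtx);
                int prev = atomic_load(&peer_progress);  /* peer's count so far */
                atomic_fetch_add(&my_progress, 1);       /* publish my progress */
                pthread_mutex_unlock(mtx);

                /* wait until the peer completes one more critical section */
                while (atomic_load(&peer_progress) == prev) {
                        ;
                }
        }
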
@@ -723,18 +734,25 @@ test_mtx_lock_unlock_contended_thread(
 
        for (i = 0; i < iterations; i++) {
                lck_mtx_test_lock();
-
-               os_atomic_xchg(my_locked, 1, relaxed);
+               int prev = os_atomic_load(other_locked, relaxed);
+               os_atomic_add(my_locked, 1, relaxed);
                if (i != iterations - 1) {
-                       while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) {
-                               ;
+                       if (type == FULL_CONTENDED) {
+                               while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) {
+                                       ;
+                               }
+                       } else {
+                               start = mach_absolute_time();
+                               stop = start + (MutexSpin / 2);
+                               while (mach_absolute_time() < stop) {
+                                       ;
+                               }
                        }
-                       os_atomic_xchg(my_locked, 0, relaxed);
                }
                lck_mtx_test_unlock_mtx();
 
                if (i != iterations - 1) {
-                       while (os_atomic_load(other_locked, relaxed) == 0) {
+                       while (os_atomic_load(other_locked, relaxed) == prev) {
                                ;
                        }
                }
@@ -750,7 +768,8 @@ kern_return_t
 lck_mtx_test_mtx_contended(
        int iter,
        char* buffer,
-       int buffer_size)
+       int buffer_size,
+       int type)
 {
        thread_t thread1, thread2;
        kern_return_t result;
@@ -759,10 +778,17 @@ lck_mtx_test_mtx_contended(
        wait_barrier = 0;
        iterations = iter;
 
+       if (type < 0 || type > MAX_CONDENDED) {
+               printf("%s invalid type %d\n", __func__, type);
+               return 0;
+       }
+
        erase_all_test_mtx_stats();
 
        targs[0].other_thread = NULL;
        targs[1].other_thread = NULL;
+       targs[0].type = type;
+       targs[1].type = type;
 
        result = kernel_thread_start((thread_continue_t)test_mtx_lock_unlock_contended_thread, &targs[0], &thread1);
        if (result != KERN_SUCCESS) {
@@ -812,6 +838,8 @@ test_mtx_lck_unlock_contended_loop_time_thread(
        thread_t other_thread;
        int* my_locked;
        int* other_locked;
+       int type;
+       uint64_t start, stop;
 
        printf("Starting thread %p\n", current_thread());
 
@@ -824,6 +852,7 @@ test_mtx_lck_unlock_contended_loop_time_thread(
 
        my_locked = &info->my_locked;
        other_locked = info->other_locked;
+       type = info->type;
 
        *my_locked = 0;
        val = os_atomic_inc(&synch, relaxed);
@@ -835,18 +864,26 @@ test_mtx_lck_unlock_contended_loop_time_thread(
        for (i = 0; i < WARMUP_ITER; i++) {
                lck_mtx_lock(&test_mtx);
 
-               os_atomic_xchg(my_locked, 1, relaxed);
+               int prev = os_atomic_load(other_locked, relaxed);
+               os_atomic_add(my_locked, 1, relaxed);
                if (i != WARMUP_ITER - 1) {
-                       while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) {
-                               ;
+                       if (type == FULL_CONTENDED) {
+                               while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) {
+                                       ;
+                               }
+                       } else {
+                               start = mach_absolute_time();
+                               stop = start + (MutexSpin / 2);
+                               while (mach_absolute_time() < stop) {
+                                       ;
+                               }
                        }
-                       os_atomic_xchg(my_locked, 0, relaxed);
                }
 
                lck_mtx_unlock(&test_mtx);
 
                if (i != WARMUP_ITER - 1) {
-                       while (os_atomic_load(other_locked, relaxed) == 0) {
+                       while (os_atomic_load(other_locked, relaxed) == prev) {
                                ;
                        }
                }
@@ -878,18 +915,26 @@ test_mtx_lck_unlock_contended_loop_time_thread(
        for (i = 0; i < iterations; i++) {
                lck_mtx_lock(&test_mtx);
 
-               os_atomic_xchg(my_locked, 1, relaxed);
+               int prev = os_atomic_load(other_locked, relaxed);
+               os_atomic_add(my_locked, 1, relaxed);
                if (i != iterations - 1) {
-                       while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) {
-                               ;
+                       if (type == FULL_CONTENDED) {
+                               while (os_atomic_load(&other_thread->state, relaxed) & TH_RUN) {
+                                       ;
+                               }
+                       } else {
+                               start = mach_absolute_time();
+                               stop = start + (MutexSpin / 2);
+                               while (mach_absolute_time() < stop) {
+                                       ;
+                               }
                        }
-                       os_atomic_xchg(my_locked, 0, relaxed);
                }
 
                lck_mtx_unlock(&test_mtx);
 
                if (i != iterations - 1) {
-                       while (os_atomic_load(other_locked, relaxed) == 0) {
+                       while (os_atomic_load(other_locked, relaxed) == prev) {
                                ;
                        }
                }
@@ -910,7 +955,8 @@ int
 lck_mtx_test_mtx_contended_loop_time(
        int iter,
        char *buffer,
-       int buffer_size)
+       int buffer_size,
+       int type)
 {
        thread_t thread1, thread2;
        kern_return_t result;
@@ -921,6 +967,11 @@ lck_mtx_test_mtx_contended_loop_time(
        iterations = iter;
        uint64_t time, time_run;
 
+       if (type < 0 || type > MAX_CONDENDED) {
+               printf("%s invalid type %d\n", __func__, type);
+               return 0;
+       }
+
        targs[0].other_thread = NULL;
        targs[1].other_thread = NULL;
 
@@ -938,6 +989,8 @@ lck_mtx_test_mtx_contended_loop_time(
        /* these are t1 args */
        targs[0].my_locked = 0;
        targs[0].other_locked = &targs[1].my_locked;
+       targs[0].type = type;
+       targs[1].type = type;
 
        os_atomic_xchg(&targs[0].other_thread, thread2, release);
 
diff --git a/osfmk/kern/test_mpsc_queue.c b/osfmk/kern/test_mpsc_queue.c
new file mode 100644
index 0000000..d369ed1
--- /dev/null
+++ b/osfmk/kern/test_mpsc_queue.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <machine/machine_cpu.h>
+#include <kern/locks.h>
+#include <kern/mpsc_queue.h>
+#include <kern/thread.h>
+
+#if !DEBUG && !DEVELOPMENT
+#error "Test only file"
+#endif
+
+#include <sys/errno.h>
+
+struct mpsc_test_pingpong_queue {
+       struct mpsc_daemon_queue queue;
+       struct mpsc_queue_chain link;
+       struct mpsc_test_pingpong_queue *other;
+       uint64_t *count, *end;
+};
+
+static void
+mpsc_test_pingpong_invoke(mpsc_queue_chain_t elm, mpsc_daemon_queue_t dq)
+{
+       struct mpsc_test_pingpong_queue *q;
+       q = mpsc_queue_element(elm, struct mpsc_test_pingpong_queue, link);
+       assert(&q->queue == dq);
+
+       if (*q->count % 10000 == 0) {
+               printf("mpsc_test_pingpong: %lld asyncs left\n", *q->count);
+       }
+       if ((*q->count)-- > 0) {
+               mpsc_daemon_enqueue(&q->other->queue, &q->other->link,
+                   MPSC_QUEUE_DISABLE_PREEMPTION);
+       } else {
+               *q->end = mach_absolute_time();
+               thread_wakeup(&mpsc_test_pingpong_invoke);
+       }
+}
+
+/*
+ * The point of this test is to exercise the enqueue/unlock-drain race
+ * since the MPSC queue tries to minimize wakeups when it knows they would be useless.
+ *
+ * It also validates basic enqueue properties,
+ * and will panic if anything goes wrong, to aid debugging.
+ *
+ * Performance-wise, we will always go through the wakeup codepath,
+ * hence this is mostly a benchmark of
+ * assert_wait()/clear_wait()/thread_block()/thread_wakeup()
+ * rather than a benchmark of the MPSC queues.
+ */
+int
+mpsc_test_pingpong(uint64_t count, uint64_t *out)
+{
+       struct mpsc_test_pingpong_queue ping, pong;
+       kern_return_t kr;
+       wait_result_t wr;
+
+       if (count < 1000 || count > 1000 * 1000) {
+               return EINVAL;
+       }
+
+       printf("mpsc_test_pingpong: START\n");
+
+       kr = mpsc_daemon_queue_init_with_thread(&ping.queue,
+           mpsc_test_pingpong_invoke, MINPRI_KERNEL, "ping");
+       if (kr != KERN_SUCCESS) {
+               panic("mpsc_test_pingpong: unable to create ping: %x", kr);
+       }
+
+       kr = mpsc_daemon_queue_init_with_thread(&pong.queue,
+           mpsc_test_pingpong_invoke, MINPRI_KERNEL, "pong");
+       if (kr != KERN_SUCCESS) {
+               panic("mpsc_test_pingpong: unable to create pong: %x", kr);
+       }
+
+       uint64_t n = count, start, end;
+       ping.count = pong.count = &n;
+       ping.end   = pong.end   = &end;
+       ping.other = &pong;
+       pong.other = &ping;
+
+       assert_wait_timeout(&mpsc_test_pingpong_invoke, THREAD_UNINT,
+           5000, 1000 * NSEC_PER_USEC);
+       start = mach_absolute_time();
+       mpsc_daemon_enqueue(&ping.queue, &ping.link, MPSC_QUEUE_DISABLE_PREEMPTION);
+
+       wr = thread_block(THREAD_CONTINUE_NULL);
+       if (wr == THREAD_TIMED_OUT) {
+               panic("mpsc_test_pingpong: timed out: ping:%p pong:%p", &ping, &pong);
+       }
+
+       printf("mpsc_test_pingpong: CLEANUP\n");
+
+       mpsc_daemon_queue_cancel_and_wait(&ping.queue);
+       mpsc_daemon_queue_cancel_and_wait(&pong.queue);
+       absolutetime_to_nanoseconds(end - start, out);
+
+       printf("mpsc_test_pingpong: %lld ping-pongs in %lld ns (%lld.%03lld us/async)\n",
+           count, *out, (*out / count) / 1000, (*out / count) % 1000);
+       return 0;
+}
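
For reference, the mpsc_daemon_queue surface this test exercises reduces to four calls. A minimal sketch of the pattern for a hypothetical element type follows (kernel context assumed; the struct, callback, and queue names are invented for illustration, while the four mpsc_* calls and their signatures are exactly those used above):

struct my_elem {
	struct mpsc_queue_chain link;
	int payload;
};

static struct mpsc_daemon_queue my_dq;

static void
my_dq_invoke(mpsc_queue_chain_t elm, mpsc_daemon_queue_t dq)
{
	/* recover the containing element from its embedded chain link */
	struct my_elem *e = mpsc_queue_element(elm, struct my_elem, link);
	assert(dq == &my_dq);
	/* ... consume e->payload; the element may be freed or re-enqueued here ... */
}

/* setup: a dedicated kernel thread drains the queue, one invoke per element */
kern_return_t kr = mpsc_daemon_queue_init_with_thread(&my_dq,
    my_dq_invoke, MINPRI_KERNEL, "my-dq");

/* producers, from any thread, lock-free: */
mpsc_daemon_enqueue(&my_dq, &e->link, MPSC_QUEUE_DISABLE_PREEMPTION);

/* teardown: */
mpsc_daemon_queue_cancel_and_wait(&my_dq);
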
diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c
index 7dfcb02b3e0864d0866e956aa67fdde63d4b4023..f554222b137269053ffc505e5dfddfa2d3cbae52 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <kern/telemetry.h>
 #include <kern/policy_internal.h>
 #include <kern/turnstile.h>
+#include <kern/sched_clutch.h>
 
 #include <corpses/task_corpse.h>
 #if KPC
 #include <sys/bsdtask_info.h>
 #include <mach/sdt.h>
 #include <san/kasan.h>
+#if CONFIG_KSANCOV
+#include <san/ksancov.h>
+#endif
 
 #include <stdatomic.h>
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#include <arm64/proc_reg.h>
+#endif /* defined(HAS_APPLE_PAC) */
 
 /*
  * Exported interfaces
@@ -165,25 +173,16 @@ lck_grp_t                                       thread_lck_grp;
 
 struct zone                                     *thread_qos_override_zone;
 
-decl_simple_lock_data(static, thread_stack_lock)
-static queue_head_t             thread_stack_queue;
-
-decl_simple_lock_data(static, thread_terminate_lock)
-static queue_head_t             thread_terminate_queue;
-
-static queue_head_t             thread_deallocate_queue;
-
-static queue_head_t             turnstile_deallocate_queue;
+static struct mpsc_daemon_queue thread_stack_queue;
+static struct mpsc_daemon_queue thread_terminate_queue;
+static struct mpsc_daemon_queue thread_deallocate_queue;
+static struct mpsc_daemon_queue thread_exception_queue;
 
+decl_simple_lock_data(static, crashed_threads_lock);
 static queue_head_t             crashed_threads_queue;
 
-static queue_head_t             workq_deallocate_queue;
-
-decl_simple_lock_data(static, thread_exception_lock)
-static queue_head_t             thread_exception_queue;
-
 struct thread_exception_elt {
-       queue_chain_t           elt;
+       struct mpsc_queue_chain link;
        exception_type_t        exception_type;
        task_t                  exception_task;
        thread_t                exception_thread;
@@ -211,7 +210,7 @@ int task_threadmax = CONFIG_THREAD_MAX;
 
 static uint64_t         thread_unique_id = 100;
 
-struct _thread_ledger_indices thread_ledgers = { -1 };
+struct _thread_ledger_indices thread_ledgers = { .cpu_time = -1 };
 static ledger_template_t thread_ledger_template = NULL;
 static void init_thread_ledgers(void);
 
@@ -290,7 +289,6 @@ thread_bootstrap(void)
        thread_template.sched_pri = 0;
        thread_template.max_priority = 0;
        thread_template.task_priority = 0;
-       thread_template.promotions = 0;
        thread_template.rwlock_count = 0;
        thread_template.waiting_for_mutex = NULL;
 
@@ -394,7 +392,7 @@ thread_bootstrap(void)
        thread_template.effective_policy = (struct thread_effective_policy) {};
 
        bzero(&thread_template.overrides, sizeof(thread_template.overrides));
-       thread_template.sync_ipc_overrides = 0;
+       thread_template.kevent_overrides = 0;
 
        thread_template.iotier_override = THROTTLE_LEVEL_NONE;
        thread_template.thread_io_stats = NULL;
@@ -405,6 +403,7 @@ thread_bootstrap(void)
 
        thread_template.thread_timer_wakeups_bin_1 = thread_template.thread_timer_wakeups_bin_2 = 0;
        thread_template.callout_woken_from_icontext = thread_template.callout_woken_from_platform_idle = 0;
+       thread_template.guard_exc_fatal = 0;
 
        thread_template.thread_tag = 0;
 
@@ -413,6 +412,7 @@ thread_bootstrap(void)
 
        thread_template.th_work_interval = NULL;
 
+       thread_template.decompressions = 0;
        init_thread = thread_template;
 
        /* fiddle with init thread to skip asserts in set_sched_pri */
@@ -491,6 +491,7 @@ thread_corpse_continue(void)
        /*NOTREACHED*/
 }
 
+__dead2
 static void
 thread_terminate_continue(void)
 {
@@ -562,12 +563,12 @@ thread_terminate_self(void)
 
        /*
         * After this subtraction, this thread should never access
-        * task->bsd_info unless it got 0 back from the hw_atomic_sub.  It
+        * task->bsd_info unless it got 0 back from the os_atomic_dec.  It
         * could be racing with other threads to be the last thread in the
         * process, and the last thread in the process will tear down the proc
         * structure and zero-out task->bsd_info.
         */
-       threadcnt = hw_atomic_sub(&task->active_thread_count, 1);
+       threadcnt = os_atomic_dec(&task->active_thread_count, relaxed);
 
        /*
         * If we are the last thread to terminate and the task is
@@ -683,8 +684,7 @@ thread_terminate_self(void)
        assert((thread->sched_flags & TH_SFLAG_RW_PROMOTED) == 0);
        assert((thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) == 0);
        assert((thread->sched_flags & TH_SFLAG_PROMOTED) == 0);
-       assert(thread->promotions == 0);
-       assert(thread->was_promoted_on_wakeup == 0);
+       assert(thread->kern_promotion_schedpri == 0);
        assert(thread->waiting_for_mutex == NULL);
        assert(thread->rwlock_count == 0);
 
@@ -735,8 +735,6 @@ thread_deallocate_complete(
 
        assert(os_ref_get_count(&thread->ref_count) == 0);
 
-       assert(thread_owned_workloops_count(thread) == 0);
-
        if (!(thread->state & TH_TERMINATE2)) {
                panic("thread_deallocate: thread not properly terminated\n");
        }
@@ -799,29 +797,6 @@ thread_deallocate_complete(
        zfree(thread_zone, thread);
 }
 
-void
-thread_starts_owning_workloop(thread_t thread)
-{
-       atomic_fetch_add_explicit(&thread->kqwl_owning_count, 1,
-           memory_order_relaxed);
-}
-
-void
-thread_ends_owning_workloop(thread_t thread)
-{
-       __assert_only uint32_t count;
-       count = atomic_fetch_sub_explicit(&thread->kqwl_owning_count, 1,
-           memory_order_relaxed);
-       assert(count > 0);
-}
-
-uint32_t
-thread_owned_workloops_count(thread_t thread)
-{
-       return atomic_load_explicit(&thread->kqwl_owning_count,
-                  memory_order_relaxed);
-}
-
 /*
  *     thread_inspect_deallocate:
  *
@@ -835,49 +810,41 @@ thread_inspect_deallocate(
 }
 
 /*
- *     thread_exception_daemon:
+ *     thread_exception_queue_invoke:
  *
  *     Deliver EXC_{RESOURCE,GUARD} exception
  */
 static void
-thread_exception_daemon(void)
+thread_exception_queue_invoke(mpsc_queue_chain_t elm,
+    __assert_only mpsc_daemon_queue_t dq)
 {
        struct thread_exception_elt *elt;
        task_t task;
        thread_t thread;
        exception_type_t etype;
 
-       simple_lock(&thread_exception_lock, LCK_GRP_NULL);
-       while ((elt = (struct thread_exception_elt *)dequeue_head(&thread_exception_queue)) != NULL) {
-               simple_unlock(&thread_exception_lock);
-
-               etype = elt->exception_type;
-               task = elt->exception_task;
-               thread = elt->exception_thread;
-               assert_thread_magic(thread);
+       assert(dq == &thread_exception_queue);
+       elt = mpsc_queue_element(elm, struct thread_exception_elt, link);
 
-               kfree(elt, sizeof(*elt));
-
-               /* wait for all the threads in the task to terminate */
-               task_lock(task);
-               task_wait_till_threads_terminate_locked(task);
-               task_unlock(task);
-
-               /* Consumes the task ref returned by task_generate_corpse_internal */
-               task_deallocate(task);
-               /* Consumes the thread ref returned by task_generate_corpse_internal */
-               thread_deallocate(thread);
+       etype = elt->exception_type;
+       task = elt->exception_task;
+       thread = elt->exception_thread;
+       assert_thread_magic(thread);
 
-               /* Deliver the notification, also clears the corpse. */
-               task_deliver_crash_notification(task, thread, etype, 0);
+       kfree(elt, sizeof(*elt));
 
-               simple_lock(&thread_exception_lock, LCK_GRP_NULL);
-       }
+       /* wait for all the threads in the task to terminate */
+       task_lock(task);
+       task_wait_till_threads_terminate_locked(task);
+       task_unlock(task);
 
-       assert_wait((event_t)&thread_exception_queue, THREAD_UNINT);
-       simple_unlock(&thread_exception_lock);
+       /* Consumes the task ref returned by task_generate_corpse_internal */
+       task_deallocate(task);
+       /* Consumes the thread ref returned by task_generate_corpse_internal */
+       thread_deallocate(thread);
 
-       thread_block((thread_continue_t)thread_exception_daemon);
+       /* Deliver the notification, also clears the corpse. */
+       task_deliver_crash_notification(task, thread, etype, 0);
 }
 
 /*
@@ -897,11 +864,8 @@ thread_exception_enqueue(
        elt->exception_task = task;
        elt->exception_thread = thread;
 
-       simple_lock(&thread_exception_lock, LCK_GRP_NULL);
-       enqueue_tail(&thread_exception_queue, (queue_entry_t)elt);
-       simple_unlock(&thread_exception_lock);
-
-       thread_wakeup((event_t)&thread_exception_queue);
+       mpsc_daemon_enqueue(&thread_exception_queue, &elt->link,
+           MPSC_QUEUE_DISABLE_PREEMPTION);
 }
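
The hunk above is the template for every queue converted in this commit: the element grows an embedded struct mpsc_queue_chain, and the producer's lock/enqueue/unlock/thread_wakeup() sequence collapses to a single lock-free call. In sketch form (fragments, names generic):

/* before: serialized enqueue plus an unconditional wakeup of the daemon */
simple_lock(&q_lock, LCK_GRP_NULL);
enqueue_tail(&q, (queue_entry_t)elt);
simple_unlock(&q_lock);
thread_wakeup((event_t)&q);

/* after: the mpsc layer orders the enqueue and wakes the daemon only when needed */
mpsc_daemon_enqueue(&dq, &elt->link, MPSC_QUEUE_DISABLE_PREEMPTION);
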
 
 /*
@@ -934,150 +898,94 @@ thread_copy_resource_info(
        *dst_thread->thread_io_stats = *src_thread->thread_io_stats;
 }
 
-/*
- *     thread_terminate_daemon:
- *
- *     Perform final clean up for terminating threads.
- */
 static void
-thread_terminate_daemon(void)
+thread_terminate_queue_invoke(mpsc_queue_chain_t e,
+    __assert_only mpsc_daemon_queue_t dq)
 {
-       thread_t        self, thread;
-       task_t          task;
-
-       self = current_thread();
-       self->options |= TH_OPT_SYSTEM_CRITICAL;
-
-       (void)splsched();
-       simple_lock(&thread_terminate_lock, LCK_GRP_NULL);
-
-thread_terminate_start:
-       while ((thread = qe_dequeue_head(&thread_terminate_queue, struct thread, runq_links)) != THREAD_NULL) {
-               assert_thread_magic(thread);
-
-               /*
-                * if marked for crash reporting, skip reaping.
-                * The corpse delivery thread will clear bit and enqueue
-                * for reaping when done
-                */
-               if (thread->inspection) {
-                       enqueue_tail(&crashed_threads_queue, &thread->runq_links);
-                       continue;
-               }
-
-               simple_unlock(&thread_terminate_lock);
-               (void)spllo();
-
-               task = thread->task;
-
-               task_lock(task);
-               task->total_user_time += timer_grab(&thread->user_timer);
-               task->total_ptime += timer_grab(&thread->ptime);
-               task->total_runnable_time += timer_grab(&thread->runnable_timer);
-               if (thread->precise_user_kernel_time) {
-                       task->total_system_time += timer_grab(&thread->system_timer);
-               } else {
-                       task->total_user_time += timer_grab(&thread->system_timer);
-               }
-
-               task->c_switch += thread->c_switch;
-               task->p_switch += thread->p_switch;
-               task->ps_switch += thread->ps_switch;
-
-               task->syscalls_unix += thread->syscalls_unix;
-               task->syscalls_mach += thread->syscalls_mach;
-
-               task->task_timer_wakeups_bin_1 += thread->thread_timer_wakeups_bin_1;
-               task->task_timer_wakeups_bin_2 += thread->thread_timer_wakeups_bin_2;
-               task->task_gpu_ns += ml_gpu_stat(thread);
-               task->task_energy += ml_energy_stat(thread);
-
-#if MONOTONIC
-               mt_terminate_update(task, thread);
-#endif /* MONOTONIC */
+       thread_t thread = mpsc_queue_element(e, struct thread, mpsc_links);
+       task_t task = thread->task;
 
-               thread_update_qos_cpu_time(thread);
+       assert(dq == &thread_terminate_queue);
 
-               queue_remove(&task->threads, thread, thread_t, task_threads);
-               task->thread_count--;
-
-               /*
-                * If the task is being halted, and there is only one thread
-                * left in the task after this one, then wakeup that thread.
-                */
-               if (task->thread_count == 1 && task->halting) {
-                       thread_wakeup((event_t)&task->halting);
-               }
+       task_lock(task);
 
+       /*
+        * If marked for crash reporting, skip reaping.
+        * The corpse delivery thread will clear the bit and enqueue
+        * for reaping when done.
+        *
+        * Note: the inspection field is set under the task lock.
+        *
+        * FIXME[mad]: why enqueue for termination before `inspection` is false?
+        */
+       if (__improbable(thread->inspection)) {
+               simple_lock(&crashed_threads_lock, &thread_lck_grp);
                task_unlock(task);
 
-               lck_mtx_lock(&tasks_threads_lock);
-               queue_remove(&threads, thread, thread_t, threads);
-               threads_count--;
-               lck_mtx_unlock(&tasks_threads_lock);
-
-               thread_deallocate(thread);
-
-               (void)splsched();
-               simple_lock(&thread_terminate_lock, LCK_GRP_NULL);
+               enqueue_tail(&crashed_threads_queue, &thread->runq_links);
+               simple_unlock(&crashed_threads_lock);
+               return;
        }
 
-       while ((thread = qe_dequeue_head(&thread_deallocate_queue, struct thread, runq_links)) != THREAD_NULL) {
-               assert_thread_magic(thread);
 
-               simple_unlock(&thread_terminate_lock);
-               (void)spllo();
+       task->total_user_time += timer_grab(&thread->user_timer);
+       task->total_ptime += timer_grab(&thread->ptime);
+       task->total_runnable_time += timer_grab(&thread->runnable_timer);
+       if (thread->precise_user_kernel_time) {
+               task->total_system_time += timer_grab(&thread->system_timer);
+       } else {
+               task->total_user_time += timer_grab(&thread->system_timer);
+       }
 
-               thread_deallocate_complete(thread);
+       task->c_switch += thread->c_switch;
+       task->p_switch += thread->p_switch;
+       task->ps_switch += thread->ps_switch;
 
-               (void)splsched();
-               simple_lock(&thread_terminate_lock, LCK_GRP_NULL);
-       }
+       task->syscalls_unix += thread->syscalls_unix;
+       task->syscalls_mach += thread->syscalls_mach;
 
-       struct turnstile *turnstile;
-       while ((turnstile = qe_dequeue_head(&turnstile_deallocate_queue, struct turnstile, ts_deallocate_link)) != TURNSTILE_NULL) {
-               simple_unlock(&thread_terminate_lock);
-               (void)spllo();
+       task->task_timer_wakeups_bin_1 += thread->thread_timer_wakeups_bin_1;
+       task->task_timer_wakeups_bin_2 += thread->thread_timer_wakeups_bin_2;
+       task->task_gpu_ns += ml_gpu_stat(thread);
+       task->task_energy += ml_energy_stat(thread);
+       task->decompressions += thread->decompressions;
 
-               turnstile_destroy(turnstile);
+#if MONOTONIC
+       mt_terminate_update(task, thread);
+#endif /* MONOTONIC */
 
-               (void)splsched();
-               simple_lock(&thread_terminate_lock, LCK_GRP_NULL);
-       }
+       thread_update_qos_cpu_time(thread);
 
-       queue_entry_t qe;
+       queue_remove(&task->threads, thread, thread_t, task_threads);
+       task->thread_count--;
 
        /*
-        * see workq_deallocate_enqueue: struct workqueue is opaque to thread.c and
-        * we just link pieces of memory here
+        * If the task is being halted, and there is only one thread
+        * left in the task after this one, then wake up that thread.
         */
-       while ((qe = dequeue_head(&workq_deallocate_queue))) {
-               simple_unlock(&thread_terminate_lock);
-               (void)spllo();
+       if (task->thread_count == 1 && task->halting) {
+               thread_wakeup((event_t)&task->halting);
+       }
+
+       task_unlock(task);
 
-               workq_destroy((struct workqueue *)qe);
+       lck_mtx_lock(&tasks_threads_lock);
+       queue_remove(&threads, thread, thread_t, threads);
+       threads_count--;
+       lck_mtx_unlock(&tasks_threads_lock);
 
-               (void)splsched();
-               simple_lock(&thread_terminate_lock, LCK_GRP_NULL);
-       }
+       thread_deallocate(thread);
+}
 
-       /*
-        * Check if something enqueued in thread terminate/deallocate queue
-        * while processing workq deallocate queue
-        */
-       if (!queue_empty(&thread_terminate_queue) ||
-           !queue_empty(&thread_deallocate_queue) ||
-           !queue_empty(&turnstile_deallocate_queue)) {
-               goto thread_terminate_start;
-       }
+static void
+thread_deallocate_queue_invoke(mpsc_queue_chain_t e,
+    __assert_only mpsc_daemon_queue_t dq)
+{
+       thread_t thread = mpsc_queue_element(e, struct thread, mpsc_links);
 
-       assert_wait((event_t)&thread_terminate_queue, THREAD_UNINT);
-       simple_unlock(&thread_terminate_lock);
-       /* splsched */
+       assert(dq == &thread_deallocate_queue);
 
-       self->options &= ~TH_OPT_SYSTEM_CRITICAL;
-       thread_block((thread_continue_t)thread_terminate_daemon);
-       /*NOTREACHED*/
+       thread_deallocate_complete(thread);
 }
 
 /*
@@ -1093,11 +1001,8 @@ thread_terminate_enqueue(
 {
        KDBG_RELEASE(TRACE_DATA_THREAD_TERMINATE, thread->thread_id);
 
-       simple_lock(&thread_terminate_lock, LCK_GRP_NULL);
-       enqueue_tail(&thread_terminate_queue, &thread->runq_links);
-       simple_unlock(&thread_terminate_lock);
-
-       thread_wakeup((event_t)&thread_terminate_queue);
+       mpsc_daemon_enqueue(&thread_terminate_queue, &thread->mpsc_links,
+           MPSC_QUEUE_DISABLE_PREEMPTION);
 }
 
 /*
@@ -1109,56 +1014,8 @@ static void
 thread_deallocate_enqueue(
        thread_t                thread)
 {
-       spl_t s = splsched();
-
-       simple_lock(&thread_terminate_lock, LCK_GRP_NULL);
-       enqueue_tail(&thread_deallocate_queue, &thread->runq_links);
-       simple_unlock(&thread_terminate_lock);
-
-       thread_wakeup((event_t)&thread_terminate_queue);
-       splx(s);
-}
-
-/*
- *     turnstile_deallocate_enqueue:
- *
- *     Enqueue a turnstile for final deallocation.
- */
-void
-turnstile_deallocate_enqueue(
-       struct turnstile *turnstile)
-{
-       spl_t s = splsched();
-
-       simple_lock(&thread_terminate_lock, LCK_GRP_NULL);
-       enqueue_tail(&turnstile_deallocate_queue, &turnstile->ts_deallocate_link);
-       simple_unlock(&thread_terminate_lock);
-
-       thread_wakeup((event_t)&thread_terminate_queue);
-       splx(s);
-}
-
-/*
- *     workq_deallocate_enqueue:
- *
- *     Enqueue a workqueue for final deallocation.
- */
-void
-workq_deallocate_enqueue(
-       struct workqueue *wq)
-{
-       spl_t s = splsched();
-
-       simple_lock(&thread_terminate_lock, LCK_GRP_NULL);
-       /*
-        * this is just to delay a zfree(), so we link the memory with no regards
-        * for how the struct looks like.
-        */
-       enqueue_tail(&workq_deallocate_queue, (queue_entry_t)wq);
-       simple_unlock(&thread_terminate_lock);
-
-       thread_wakeup((event_t)&thread_terminate_queue);
-       splx(s);
+       mpsc_daemon_enqueue(&thread_deallocate_queue, &thread->mpsc_links,
+           MPSC_QUEUE_DISABLE_PREEMPTION);
 }
 
 /*
@@ -1167,13 +1024,11 @@ workq_deallocate_enqueue(
  * who are no longer being inspected.
  */
 void
-thread_terminate_crashed_threads()
+thread_terminate_crashed_threads(void)
 {
        thread_t th_remove;
-       boolean_t should_wake_terminate_queue = FALSE;
-       spl_t s = splsched();
 
-       simple_lock(&thread_terminate_lock, LCK_GRP_NULL);
+       simple_lock(&crashed_threads_lock, &thread_lck_grp);
        /*
         * loop through the crashed threads queue
         * to put any threads that are not being inspected anymore
@@ -1184,58 +1039,39 @@ thread_terminate_crashed_threads()
                assert(th_remove != current_thread());
 
                if (th_remove->inspection == FALSE) {
-                       re_queue_tail(&thread_terminate_queue, &th_remove->runq_links);
-                       should_wake_terminate_queue = TRUE;
+                       remqueue(&th_remove->runq_links);
+                       mpsc_daemon_enqueue(&thread_terminate_queue, &th_remove->mpsc_links,
+                           MPSC_QUEUE_NONE);
                }
        }
 
-       simple_unlock(&thread_terminate_lock);
-       splx(s);
-       if (should_wake_terminate_queue == TRUE) {
-               thread_wakeup((event_t)&thread_terminate_queue);
-       }
+       simple_unlock(&crashed_threads_lock);
 }
 
 /*
- *     thread_stack_daemon:
+ *     thread_stack_queue_invoke:
  *
  *     Perform stack allocation as required due to
  *     invoke failures.
  */
 static void
-thread_stack_daemon(void)
+thread_stack_queue_invoke(mpsc_queue_chain_t elm,
+    __assert_only mpsc_daemon_queue_t dq)
 {
-       thread_t                thread;
-       spl_t                   s;
-
-       s = splsched();
-       simple_lock(&thread_stack_lock, LCK_GRP_NULL);
-
-       while ((thread = qe_dequeue_head(&thread_stack_queue, struct thread, runq_links)) != THREAD_NULL) {
-               assert_thread_magic(thread);
-
-               simple_unlock(&thread_stack_lock);
-               splx(s);
-
-               /* allocate stack with interrupts enabled so that we can call into VM */
-               stack_alloc(thread);
+       thread_t thread = mpsc_queue_element(elm, struct thread, mpsc_links);
 
-               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_WAIT) | DBG_FUNC_END, thread_tid(thread), 0, 0, 0, 0);
+       assert(dq == &thread_stack_queue);
 
-               s = splsched();
-               thread_lock(thread);
-               thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
-               thread_unlock(thread);
+       /* allocate stack with interrupts enabled so that we can call into VM */
+       stack_alloc(thread);
 
-               simple_lock(&thread_stack_lock, LCK_GRP_NULL);
-       }
+       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_WAIT) | DBG_FUNC_END, thread_tid(thread), 0, 0, 0, 0);
 
-       assert_wait((event_t)&thread_stack_queue, THREAD_UNINT);
-       simple_unlock(&thread_stack_lock);
+       spl_t s = splsched();
+       thread_lock(thread);
+       thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
+       thread_unlock(thread);
        splx(s);
-
-       thread_block((thread_continue_t)thread_stack_daemon);
-       /*NOTREACHED*/
 }
 
 /*
@@ -1252,52 +1088,39 @@ thread_stack_enqueue(
        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_WAIT) | DBG_FUNC_START, thread_tid(thread), 0, 0, 0, 0);
        assert_thread_magic(thread);
 
-       simple_lock(&thread_stack_lock, LCK_GRP_NULL);
-       enqueue_tail(&thread_stack_queue, &thread->runq_links);
-       simple_unlock(&thread_stack_lock);
-
-       thread_wakeup((event_t)&thread_stack_queue);
+       mpsc_daemon_enqueue(&thread_stack_queue, &thread->mpsc_links,
+           MPSC_QUEUE_DISABLE_PREEMPTION);
 }
 
 void
 thread_daemon_init(void)
 {
        kern_return_t   result;
-       thread_t        thread = NULL;
 
-       simple_lock_init(&thread_terminate_lock, 0);
-       queue_init(&thread_terminate_queue);
-       queue_init(&thread_deallocate_queue);
-       queue_init(&workq_deallocate_queue);
-       queue_init(&turnstile_deallocate_queue);
-       queue_init(&crashed_threads_queue);
+       thread_deallocate_daemon_init();
 
-       result = kernel_thread_start_priority((thread_continue_t)thread_terminate_daemon, NULL, MINPRI_KERNEL, &thread);
-       if (result != KERN_SUCCESS) {
-               panic("thread_daemon_init: thread_terminate_daemon");
-       }
+       thread_deallocate_daemon_register_queue(&thread_terminate_queue,
+           thread_terminate_queue_invoke);
 
-       thread_deallocate(thread);
+       thread_deallocate_daemon_register_queue(&thread_deallocate_queue,
+           thread_deallocate_queue_invoke);
 
-       simple_lock_init(&thread_stack_lock, 0);
-       queue_init(&thread_stack_queue);
+       simple_lock_init(&crashed_threads_lock, 0);
+       queue_init(&crashed_threads_queue);
 
-       result = kernel_thread_start_priority((thread_continue_t)thread_stack_daemon, NULL, BASEPRI_PREEMPT_HIGH, &thread);
+       result = mpsc_daemon_queue_init_with_thread(&thread_stack_queue,
+           thread_stack_queue_invoke, BASEPRI_PREEMPT_HIGH,
+           "daemon.thread-stack");
        if (result != KERN_SUCCESS) {
                panic("thread_daemon_init: thread_stack_daemon");
        }
 
-       thread_deallocate(thread);
-
-       simple_lock_init(&thread_exception_lock, 0);
-       queue_init(&thread_exception_queue);
-
-       result = kernel_thread_start_priority((thread_continue_t)thread_exception_daemon, NULL, MINPRI_KERNEL, &thread);
+       result = mpsc_daemon_queue_init_with_thread(&thread_exception_queue,
+           thread_exception_queue_invoke, MINPRI_KERNEL,
+           "daemon.thread-exception");
        if (result != KERN_SUCCESS) {
                panic("thread_daemon_init: thread_exception_daemon");
        }
-
-       thread_deallocate(thread);
 }
 
 #define TH_OPTION_NONE          0x00
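
Note the two registration styles above: thread_stack_queue and thread_exception_queue each get a dedicated thread via mpsc_daemon_queue_init_with_thread(), while the terminate and deallocate queues share one daemon. Hanging a further queue off that shared daemon would look roughly like this (the queue, callback, element type, and free routine here are hypothetical; only the registration calls come from this diff):

static struct mpsc_daemon_queue my_cleanup_queue;

static void
my_cleanup_invoke(mpsc_queue_chain_t e, __assert_only mpsc_daemon_queue_t dq)
{
	assert(dq == &my_cleanup_queue);
	my_object_free(mpsc_queue_element(e, struct my_object, link));
}

/* in an init path, after thread_deallocate_daemon_init() has run: */
thread_deallocate_daemon_register_queue(&my_cleanup_queue, my_cleanup_invoke);
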
@@ -1384,19 +1207,27 @@ thread_create_internal(
        new_thread->continuation = continuation;
        new_thread->parameter = parameter;
        new_thread->inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE;
-       priority_queue_init(&new_thread->inheritor_queue,
+       priority_queue_init(&new_thread->sched_inheritor_queue,
+           PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
+       priority_queue_init(&new_thread->base_inheritor_queue,
            PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
+#if CONFIG_SCHED_CLUTCH
+       priority_queue_entry_init(&new_thread->sched_clutchpri_link);
+#endif /* CONFIG_SCHED_CLUTCH */
 
        /* Allocate I/O Statistics structure */
        new_thread->thread_io_stats = (io_stat_info_t)kalloc(sizeof(struct io_stat_info));
        assert(new_thread->thread_io_stats != NULL);
        bzero(new_thread->thread_io_stats, sizeof(struct io_stat_info));
-       new_thread->sync_ipc_overrides = 0;
 
 #if KASAN
        kasan_init_thread(&new_thread->kasan_data);
 #endif
 
+#if CONFIG_KSANCOV
+       new_thread->ksancov_data = NULL;
+#endif
+
 #if CONFIG_IOSCHED
        /* Clear out the I/O Scheduling info for AppleFSCompression */
        new_thread->decmp_upl = NULL;
@@ -1503,6 +1334,7 @@ thread_create_internal(
        new_thread->max_priority = parent_task->max_priority;
        new_thread->task_priority = parent_task->priority;
 
+
        int new_priority = (priority < 0) ? parent_task->priority: priority;
        new_priority = (priority < 0)? parent_task->priority: priority;
        if (new_priority > new_thread->max_priority) {
@@ -1520,7 +1352,11 @@ thread_create_internal(
 
 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
        new_thread->sched_stamp = sched_tick;
+#if CONFIG_SCHED_CLUTCH
+       new_thread->pri_shift = sched_clutch_thread_pri_shift(new_thread, new_thread->th_sched_bucket);
+#else /* CONFIG_SCHED_CLUTCH */
        new_thread->pri_shift = sched_pri_shifts[new_thread->th_sched_bucket];
+#endif /* CONFIG_SCHED_CLUTCH */
 #endif /* defined(CONFIG_SCHED_TIMESHARE_CORE) */
 
 #if CONFIG_EMBEDDED
@@ -1536,8 +1372,7 @@ thread_create_internal(
        parent_task->thread_count++;
 
        /* So terminating threads don't need to take the task lock to decrement */
-       hw_atomic_add(&parent_task->active_thread_count, 1);
-
+       os_atomic_inc(&parent_task->active_thread_count, relaxed);
 
        queue_enter(&threads, new_thread, thread_t, threads);
        threads_count++;
@@ -2119,8 +1954,8 @@ thread_info_internal(
                 * the PROC_PIDTHREADINFO flavor (which can't be used on corpses)
                 */
                retrieve_thread_basic_info(thread, &basic_info);
-               extended_info->pth_user_time = ((basic_info.user_time.seconds * (integer_t)NSEC_PER_SEC) + (basic_info.user_time.microseconds * (integer_t)NSEC_PER_USEC));
-               extended_info->pth_system_time = ((basic_info.system_time.seconds * (integer_t)NSEC_PER_SEC) + (basic_info.system_time.microseconds * (integer_t)NSEC_PER_USEC));
+               extended_info->pth_user_time = (((uint64_t)basic_info.user_time.seconds * NSEC_PER_SEC) + ((uint64_t)basic_info.user_time.microseconds * NSEC_PER_USEC));
+               extended_info->pth_system_time = (((uint64_t)basic_info.system_time.seconds * NSEC_PER_SEC) + ((uint64_t)basic_info.system_time.microseconds * NSEC_PER_USEC));
 
                extended_info->pth_cpu_usage = basic_info.cpu_usage;
                extended_info->pth_policy = basic_info.policy;
@@ -2359,33 +2194,43 @@ clear_thread_rwlock_boost(void)
        }
 }
 
-
 /*
  * XXX assuming current thread only, for now...
  */
 void
 thread_guard_violation(thread_t thread,
-    mach_exception_data_type_t code, mach_exception_data_type_t subcode)
+    mach_exception_data_type_t code, mach_exception_data_type_t subcode, boolean_t fatal)
 {
        assert(thread == current_thread());
 
-       /* don't set up the AST for kernel threads */
+       /* Don't set up the AST for kernel threads; this check is needed to ensure
+        * that the guard_exc_* fields in the thread structure are set only by the
+        * current thread and therefore, don't require a lock.
+        * current thread and therefore don't require a lock.
        if (thread->task == kernel_task) {
                return;
        }
 
-       spl_t s = splsched();
+       assert(EXC_GUARD_DECODE_GUARD_TYPE(code));
+
        /*
         * Use the saved state area of the thread structure
         * to store all info required to handle the AST when
-        * returning to userspace
+        * returning to userspace. It's possible that there is
+        * already a pending guard exception. If it's non-fatal,
+        * it can only be overwritten by a fatal exception code.
         */
-       assert(EXC_GUARD_DECODE_GUARD_TYPE(code));
+       if (thread->guard_exc_info.code && (thread->guard_exc_fatal || !fatal)) {
+               return;
+       }
+
        thread->guard_exc_info.code = code;
        thread->guard_exc_info.subcode = subcode;
+       thread->guard_exc_fatal = fatal ? 1 : 0;
+
+       spl_t s = splsched();
        thread_ast_set(thread, AST_GUARD);
        ast_propagate(thread);
-
        splx(s);
 }
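
The replacement rule encoded above can be stated as a single predicate: a new guard exception is dropped if and only if one is already pending and either the pending one is fatal or the new one is not, so a fatal exception may displace a pending non-fatal one but never vice versa. An illustrative restatement (helper name invented; logic copied from the early return above):

static bool
guard_exc_should_drop(mach_exception_data_type_t pending_code,
    boolean_t pending_fatal, boolean_t new_fatal)
{
	/* drop iff something is pending and it cannot be displaced */
	return pending_code != 0 && (pending_fatal || !new_fatal);
}
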
 
@@ -2407,6 +2252,7 @@ guard_ast(thread_t t)
 
        t->guard_exc_info.code = 0;
        t->guard_exc_info.subcode = 0;
+       t->guard_exc_fatal = 0;
 
        switch (EXC_GUARD_DECODE_GUARD_TYPE(code)) {
        case GUARD_TYPE_NONE:
@@ -2534,20 +2380,17 @@ SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void)
        }
 
        /* TODO: show task total runtime (via TASK_ABSOLUTETIME_INFO)? */
-       printf("process %s[%d] thread %llu caught burning CPU! "
-           "It used more than %d%% CPU over %u seconds "
-           "(actual recent usage: %d%% over ~%llu seconds).  "
-           "Thread lifetime cpu usage %d.%06ds, (%d.%06d user, %d.%06d sys) "
-           "ledger balance: %lld mabs credit: %lld mabs debit: %lld mabs "
-           "limit: %llu mabs period: %llu ns last refill: %llu ns%s.\n",
-           procname, pid, tid,
-           percentage, interval_sec,
-           usage_percent,
-           (lei.lei_last_refill + NSEC_PER_SEC / 2) / NSEC_PER_SEC,
+       printf("process %s[%d] thread %llu caught burning CPU! It used more than %d%% CPU over %u seconds\n",
+           procname, pid, tid, percentage, interval_sec);
+       printf("  (actual recent usage: %d%% over ~%llu seconds)\n",
+           usage_percent, (lei.lei_last_refill + NSEC_PER_SEC / 2) / NSEC_PER_SEC);
+       printf("  Thread lifetime cpu usage %d.%06ds, (%d.%06d user, %d.%06d sys)\n",
            thread_total_time.seconds, thread_total_time.microseconds,
            thread_user_time.seconds, thread_user_time.microseconds,
-           thread_system_time.seconds, thread_system_time.microseconds,
-           lei.lei_balance, lei.lei_credit, lei.lei_debit,
+           thread_system_time.seconds, thread_system_time.microseconds);
+       printf("  Ledger balance: %lld; mabs credit: %lld; mabs debit: %lld\n",
+           lei.lei_balance, lei.lei_credit, lei.lei_debit);
+       printf("  mabs limit: %llu; mabs period: %llu ns; last refill: %llu ns%s.\n",
            lei.lei_limit, lei.lei_refill_period, lei.lei_last_refill,
            (fatal ? " [fatal violation]" : ""));
 
@@ -3008,10 +2851,6 @@ thread_should_halt(
  * thread_set_voucher_name - reset the voucher port name bound to this thread
  *
  * Conditions:  nothing locked
- *
- *     If we already converted the previous name to a cached voucher
- *     reference, then we discard that reference here.  The next lookup
- *     will cache it again.
  */
 
 kern_return_t
@@ -3022,6 +2861,7 @@ thread_set_voucher_name(mach_port_name_t voucher_name)
        ipc_voucher_t voucher;
        ledger_t bankledger = NULL;
        struct thread_group *banktg = NULL;
+       uint32_t persona_id = 0;
 
        if (MACH_PORT_DEAD == voucher_name) {
                return KERN_INVALID_RIGHT;
@@ -3036,7 +2876,7 @@ thread_set_voucher_name(mach_port_name_t voucher_name)
                        return KERN_INVALID_ARGUMENT;
                }
        }
-       bank_get_bank_ledger_and_thread_group(new_voucher, &bankledger, &banktg);
+       bank_get_bank_ledger_thread_group_and_persona(new_voucher, &bankledger, &banktg, &persona_id);
 
        thread_mtx_lock(thread);
        voucher = thread->ith_voucher;
@@ -3051,7 +2891,7 @@ thread_set_voucher_name(mach_port_name_t voucher_name)
            (uintptr_t)thread_tid(thread),
            (uintptr_t)voucher_name,
            VM_KERNEL_ADDRPERM((uintptr_t)new_voucher),
-           1, 0);
+           persona_id, 0);
 
        if (IPC_VOUCHER_NULL != voucher) {
                ipc_voucher_release(voucher);
@@ -3065,10 +2905,6 @@ thread_set_voucher_name(mach_port_name_t voucher_name)
  *
  *  Conditions:  nothing locked
  *
- *  A reference to the voucher may be lazily pending, if someone set the voucher name
- *  but nobody has done a lookup yet.  In that case, we'll have to do the equivalent
- *  lookup here.
- *
  *  NOTE:       At the moment, there is no distinction between the current and effective
  *             vouchers because we only set them at the thread level currently.
  */
@@ -3079,7 +2915,6 @@ thread_get_mach_voucher(
        ipc_voucher_t           *voucherp)
 {
        ipc_voucher_t           voucher;
-       mach_port_name_t        voucher_name;
 
        if (THREAD_NULL == thread) {
                return KERN_INVALID_ARGUMENT;
@@ -3088,7 +2923,6 @@ thread_get_mach_voucher(
        thread_mtx_lock(thread);
        voucher = thread->ith_voucher;
 
-       /* if already cached, just return a ref */
        if (IPC_VOUCHER_NULL != voucher) {
                ipc_voucher_reference(voucher);
                thread_mtx_unlock(thread);
@@ -3096,41 +2930,9 @@ thread_get_mach_voucher(
                return KERN_SUCCESS;
        }
 
-       voucher_name = thread->ith_voucher_name;
-
-       /* convert the name to a port, then voucher reference */
-       if (MACH_PORT_VALID(voucher_name)) {
-               ipc_port_t port;
-
-               if (KERN_SUCCESS !=
-                   ipc_object_copyin(thread->task->itk_space, voucher_name,
-                   MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&port)) {
-                       thread->ith_voucher_name = MACH_PORT_NULL;
-                       thread_mtx_unlock(thread);
-                       *voucherp = IPC_VOUCHER_NULL;
-                       return KERN_SUCCESS;
-               }
-
-               /* convert to a voucher ref to return, and cache a ref on thread */
-               voucher = convert_port_to_voucher(port);
-               ipc_voucher_reference(voucher);
-               thread->ith_voucher = voucher;
-               thread_mtx_unlock(thread);
-
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                   MACHDBG_CODE(DBG_MACH_IPC, MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE,
-                   (uintptr_t)thread_tid(thread),
-                   (uintptr_t)port,
-                   VM_KERNEL_ADDRPERM((uintptr_t)voucher),
-                   2, 0);
-
-
-               ipc_port_release_send(port);
-       } else {
-               thread_mtx_unlock(thread);
-       }
+       thread_mtx_unlock(thread);
 
-       *voucherp = voucher;
+       *voucherp = IPC_VOUCHER_NULL;
        return KERN_SUCCESS;
 }
 
@@ -3140,8 +2942,8 @@ thread_get_mach_voucher(
  *  Conditions: callers holds a reference on the voucher.
  *             nothing locked.
  *
- *  We grab another reference to the voucher and bind it to the thread.  Any lazy
- *  binding is erased.  The old voucher reference associated with the thread is
+ *  We grab another reference to the voucher and bind it to the thread.
+ *  The old voucher reference associated with the thread is
  *  discarded.
  */
 kern_return_t
@@ -3152,6 +2954,7 @@ thread_set_mach_voucher(
        ipc_voucher_t old_voucher;
        ledger_t bankledger = NULL;
        struct thread_group *banktg = NULL;
+       uint32_t persona_id = 0;
 
        if (THREAD_NULL == thread) {
                return KERN_INVALID_ARGUMENT;
@@ -3162,7 +2965,7 @@ thread_set_mach_voucher(
        }
 
        ipc_voucher_reference(voucher);
-       bank_get_bank_ledger_and_thread_group(voucher, &bankledger, &banktg);
+       bank_get_bank_ledger_thread_group_and_persona(voucher, &bankledger, &banktg, &persona_id);
 
        thread_mtx_lock(thread);
        old_voucher = thread->ith_voucher;
@@ -3177,7 +2980,7 @@ thread_set_mach_voucher(
            (uintptr_t)thread_tid(thread),
            (uintptr_t)MACH_PORT_NULL,
            VM_KERNEL_ADDRPERM((uintptr_t)voucher),
-           3, 0);
+           persona_id, 0);
 
        ipc_voucher_release(old_voucher);
 
@@ -3296,12 +3099,44 @@ thread_set_allocation_name(kern_allocation_name_t new_name)
        return ret;
 }
 
+void *
+thread_iokit_tls_get(uint32_t index)
+{
+       assert(index < THREAD_SAVE_IOKIT_TLS_COUNT);
+       return current_thread()->saved.iokit.tls[index];
+}
+
+void
+thread_iokit_tls_set(uint32_t index, void * data)
+{
+       assert(index < THREAD_SAVE_IOKIT_TLS_COUNT);
+       current_thread()->saved.iokit.tls[index] = data;
+}
+
 uint64_t
 thread_get_last_wait_duration(thread_t thread)
 {
        return thread->last_made_runnable_time - thread->last_run_time;
 }
 
+integer_t
+thread_kern_get_pri(thread_t thr)
+{
+       return thr->base_pri;
+}
+
+void
+thread_kern_set_pri(thread_t thr, integer_t pri)
+{
+       sched_set_kernel_thread_priority(thr, pri);
+}
+
+integer_t
+thread_kern_get_kernel_maxpri(void)
+{
+       return MAXPRI_KERNEL;
+}
+
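
The new thread_iokit_tls_* and thread_kern_* accessors let callers outside osfmk read a slot or adjust a kernel thread's priority without reaching into struct thread. A hypothetical caller, bumping a thread one level while clamping to the kernel ceiling:

/* hypothetical: raise `thr` one priority level, clamped to the kernel max */
integer_t pri = thread_kern_get_pri(thr) + 1;
if (pri > thread_kern_get_kernel_maxpri()) {
	pri = thread_kern_get_kernel_maxpri();
}
thread_kern_set_pri(thr, pri);
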
 #if CONFIG_DTRACE
 uint32_t
 dtrace_get_thread_predcache(thread_t thread)
@@ -3343,11 +3178,11 @@ dtrace_get_thread_tracing(thread_t thread)
        }
 }
 
-boolean_t
-dtrace_get_thread_reentering(thread_t thread)
+uint16_t
+dtrace_get_thread_inprobe(thread_t thread)
 {
        if (thread != THREAD_NULL) {
-               return (thread->options & TH_OPT_DTRACE) ? TRUE : FALSE;
+               return thread->t_dtrace_inprobe;
        } else {
                return 0;
        }
@@ -3371,6 +3206,14 @@ kasan_get_thread_data(thread_t thread)
 }
 #endif
 
+#if CONFIG_KSANCOV
+void **
+__sanitizer_get_thread_data(thread_t thread)
+{
+       return &thread->ksancov_data;
+}
+#endif
+
 int64_t
 dtrace_calc_thread_recent_vtime(thread_t thread)
 {
@@ -3413,14 +3256,10 @@ dtrace_set_thread_tracing(thread_t thread, int64_t accum)
 }
 
 void
-dtrace_set_thread_reentering(thread_t thread, boolean_t vbool)
+dtrace_set_thread_inprobe(thread_t thread, uint16_t inprobe)
 {
        if (thread != THREAD_NULL) {
-               if (vbool) {
-                       thread->options |= TH_OPT_DTRACE;
-               } else {
-                       thread->options &= (~TH_OPT_DTRACE);
-               }
+               thread->t_dtrace_inprobe = inprobe;
        }
 }
 
@@ -3439,7 +3278,14 @@ dtrace_set_thread_recover(thread_t thread, vm_offset_t recover)
 vm_offset_t
 dtrace_sign_and_set_thread_recover(thread_t thread, vm_offset_t recover)
 {
+#if defined(HAS_APPLE_PAC)
+       return dtrace_set_thread_recover(thread,
+                  (vm_address_t)ptrauth_sign_unauthenticated((void *)recover,
+                  ptrauth_key_function_pointer,
+                  ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER)));
+#else /* defined(HAS_APPLE_PAC) */
        return dtrace_set_thread_recover(thread, recover);
+#endif /* defined(HAS_APPLE_PAC) */
 }
 
 void
diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h
index 39b9c4fcfb8d4454ec647a4f517090b8a3327e20..83346637674f16dcdce945d6f1b05f8499c45be4 100644
 #include <kern/debug.h>
 #include <kern/block_hint.h>
 #include <kern/turnstile.h>
+#include <kern/mpsc_queue.h>
 
 #include <kern/waitq.h>
 #include <san/kasan.h>
 
 #include <ipc/ipc_kmsg.h>
 
+#include <machine/atomic.h>
 #include <machine/cpu_data.h>
 #include <machine/thread.h>
 
@@ -172,6 +174,7 @@ struct thread {
        union {
                queue_chain_t                   runq_links;             /* run queue links */
                queue_chain_t                   wait_links;             /* wait queue links */
+               struct mpsc_queue_chain         mpsc_links;             /* thread daemon mpsc links */
                struct priority_queue_entry     wait_prioq_links;       /* priority ordered waitq links */
        };
 
@@ -181,17 +184,29 @@ struct thread {
        struct waitq           *waitq;          /* wait queue this thread is enqueued on */
        struct turnstile       *turnstile;      /* thread's turnstile, protected by primitives interlock */
        void                   *inheritor;      /* inheritor of the primitive the thread will block on */
-       struct priority_queue  inheritor_queue; /* Inheritor queue */
+       struct priority_queue  sched_inheritor_queue; /* Inheritor queue for kernel promotion */
+       struct priority_queue  base_inheritor_queue; /* Inheritor queue for user promotion */
+
+#if CONFIG_SCHED_CLUTCH
+       /*
+        * In the clutch scheduler, the threads are maintained in runqs at the clutch_bucket
+        * level (clutch_bucket defines a unique thread group and scheduling bucket pair). In
+        * order to determine the priority of the clutch bucket as a whole, it is necessary to
+        * find the highest thread in it. The thread could be present in the clutch bucket due
+        * to its base_pri or its promoted pri. This link is used to maintain that queue.
+        */
+       struct priority_queue_entry sched_clutchpri_link;
+
+#endif /* CONFIG_SCHED_CLUTCH */
 
        /* Data updated during assert_wait/thread_wakeup */
 #if __SMP__
-       decl_simple_lock_data(, sched_lock)      /* scheduling lock (thread_lock()) */
-       decl_simple_lock_data(, wake_lock)       /* for thread stop / wait (wake_lock()) */
+       decl_simple_lock_data(, sched_lock);     /* scheduling lock (thread_lock()) */
+       decl_simple_lock_data(, wake_lock);      /* for thread stop / wait (wake_lock()) */
 #endif
        integer_t               options;                        /* options set by thread itself */
 #define TH_OPT_INTMASK          0x0003          /* interrupt / abort level */
 #define TH_OPT_VMPRIV           0x0004          /* may allocate reserved memory */
-#define TH_OPT_DTRACE           0x0008          /* executing under dtrace_probe */
 #define TH_OPT_SYSTEM_CRITICAL  0x0010          /* Thread must always be allowed to run - even under heavy load */
 #define TH_OPT_PROC_CPULIMIT    0x0020          /* Thread has a task-wide CPU limit applied to it */
 #define TH_OPT_PRVT_CPULIMIT    0x0040          /* Thread has a thread-private CPU limit applied to it */
@@ -220,6 +235,10 @@ struct thread {
        struct kasan_thread_data kasan_data;
 #endif
 
+#if CONFIG_KSANCOV
+       void *ksancov_data;
+#endif
+
        /* Thread state: */
        int                                     state;
 /*
@@ -262,7 +281,7 @@ struct thread {
 /* unused TH_SFLAG_PRI_UPDATE           0x0100 */
 #define TH_SFLAG_EAGERPREEMPT           0x0200          /* Any preemption of this thread should be treated as if AST_URGENT applied */
 #define TH_SFLAG_RW_PROMOTED            0x0400          /* promote reason: blocking with RW lock held */
-/* unused TH_SFLAG_THROTTLE_DEMOTED     0x0800 */
+#define TH_SFLAG_BASE_PRI_FROZEN        0x0800          /* (effective) base_pri is frozen */
 #define TH_SFLAG_WAITQ_PROMOTED         0x1000          /* promote reason: waitq wakeup (generally for IPC receive) */
 
 
@@ -274,7 +293,8 @@ struct thread {
 #define TH_SFLAG_RW_PROMOTED_BIT        (10)    /* 0x400 */
 
        int16_t                         sched_pri;              /* scheduled (current) priority */
-       int16_t                         base_pri;               /* base priority */
+       int16_t                         base_pri;               /* effective base priority (equal to req_base_pri unless TH_SFLAG_BASE_PRI_FROZEN) */
+       int16_t                         req_base_pri;           /* requested base priority */
        int16_t                         max_priority;           /* copy of max base priority */
        int16_t                         task_priority;          /* copy of task base priority */
        int16_t                         promotion_priority;     /* priority thread is currently promoted to */
@@ -285,16 +305,14 @@ struct thread {
 #endif
 #endif
 
-       int16_t                         promotions;                     /* level of promotion */
        int                             iotier_override; /* atomic operations to set, cleared on ret to user */
-       struct os_refcnt                ref_count;              /* number of references to me */
+       os_refcnt_t                     ref_count;              /* number of references to me */
 
        lck_mtx_t*                      waiting_for_mutex;      /* points to mutex we're waiting for until we acquire it */
 
        uint32_t                        rwlock_count;   /* Number of lck_rw_t locks held by thread */
 
        integer_t                       importance;                     /* task-relative importance */
-       uint32_t                        was_promoted_on_wakeup;         /* thread promoted on wakeup to acquire mutex */
 
        /* Priority depression expiration */
        integer_t                       depress_timer_active;
@@ -412,6 +430,10 @@ struct thread {
                        kern_return_t           result;                         /* primary result */
                        mach_msg_continue_t continuation;
                } sema;
+               struct {
+#define THREAD_SAVE_IOKIT_TLS_COUNT     8
+                       void                    *tls[THREAD_SAVE_IOKIT_TLS_COUNT];
+               } iokit;
        } saved;
 
        /* Only user threads can cause guard exceptions, only kernel threads can be thread call threads */
@@ -456,7 +478,7 @@ struct thread {
        boolean_t pmap_footprint_suspended;
 #endif /* DEVELOPMENT || DEBUG */
 
-       decl_lck_mtx_data(, mutex)
+       decl_lck_mtx_data(, mutex);
 
 
        /* Pending thread ast(s) */
@@ -484,8 +506,9 @@ struct thread {
 #endif
 
 #if CONFIG_DTRACE
-       uint32_t t_dtrace_flags;                /* DTrace thread states */
+       uint16_t t_dtrace_flags;                /* DTrace thread states */
 #define TH_DTRACE_EXECSUCCESS   0x01
+       uint16_t t_dtrace_inprobe;          /* Executing under dtrace_probe */
        uint32_t t_dtrace_predcache;        /* DTrace per thread predicate value hint */
        int64_t t_dtrace_tracing;               /* Thread time under dtrace_probe() */
        int64_t t_dtrace_vtime;
@@ -498,10 +521,11 @@ struct thread {
        uint64_t    t_page_creation_throttled_hard;
        uint64_t    t_page_creation_throttled_soft;
 #endif /* DEVELOPMENT || DEBUG */
+       int         t_pagein_error;            /* for vm_fault(), holds error from vnop_pagein() */
 
 #ifdef KPERF
-/* The high 7 bits are the number of frames to sample of a user callstack. */
-#define T_KPERF_CALLSTACK_DEPTH_OFFSET     (25)
+/* The high 8 bits are the number of frames to sample from a user callstack. */
+#define T_KPERF_CALLSTACK_DEPTH_OFFSET     (24)
 #define T_KPERF_SET_CALLSTACK_DEPTH(DEPTH) (((uint32_t)(DEPTH)) << T_KPERF_CALLSTACK_DEPTH_OFFSET)
 #define T_KPERF_GET_CALLSTACK_DEPTH(FLAGS) ((FLAGS) >> T_KPERF_CALLSTACK_DEPTH_OFFSET)
 #endif
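
Worked example of the widened encoding: moving the offset from 25 to 24 gives the depth the top 8 bits of the 32-bit flags word, raising the maximum recordable depth from 127 to 255.

uint32_t flags = T_KPERF_SET_CALLSTACK_DEPTH(64);  /* 64 << 24 == 0x40000000 */
assert(T_KPERF_GET_CALLSTACK_DEPTH(flags) == 64);  /* 0x40000000 >> 24 == 64 */
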
@@ -559,10 +583,9 @@ struct thread {
                user_addr_t     override_resource;
        } *overrides;
 
-       uint32_t        ipc_overrides;
-       _Atomic uint32_t kqwl_owning_count;
-       uint32_t        sync_ipc_overrides;
+       uint32_t        kevent_overrides;
        uint16_t        user_promotion_basepri;
+       uint16_t        kern_promotion_schedpri;
        _Atomic uint16_t kevent_ast_bits;
 
        io_stat_info_t                  thread_io_stats; /* per-thread I/O statistics */
@@ -576,10 +599,16 @@ struct thread {
        uint32_t                        thread_timer_wakeups_bin_1;
        uint32_t                        thread_timer_wakeups_bin_2;
        uint16_t                        thread_tag;
+       /*
+        * callout_* fields are only set for thread call threads whereas guard_exc_fatal is set
+        * by user threads on themselves while taking a guard exception. So it's okay for them to
+        * share this bitfield.
+        */
        uint16_t                        callout_woken_from_icontext:1,
            callout_woken_from_platform_idle:1,
            callout_woke_thread:1,
-           thread_bitfield_unused:13;
+           guard_exc_fatal:1,
+           thread_bitfield_unused:12;
 
        mach_port_name_t                ith_voucher_name;
        ipc_voucher_t                   ith_voucher;
@@ -596,6 +625,7 @@ struct thread {
        turnstile_update_flags_t inheritor_flags; /* inheritor flags for inheritor field */
        block_hint_t    pending_block_hint;
        block_hint_t    block_hint;      /* What type of primitive last caused us to block. */
+       integer_t       decompressions;  /* Per-thread decompressions counter to be added to per-task decompressions counter */
 };
 
 #define ith_state           saved.receive.state
@@ -657,9 +687,6 @@ MACRO_END
 extern void                     thread_deallocate(
        thread_t                thread);
 
-extern void                     thread_deallocate_safe(
-       thread_t                thread);
-
 extern void                     thread_inspect_deallocate(
        thread_inspect_t        thread);
 
@@ -690,9 +717,6 @@ extern void                     thread_copy_resource_info(
 
 extern void                     thread_terminate_crashed_threads(void);
 
-extern void                     turnstile_deallocate_enqueue(
-       struct turnstile *turnstile);
-
 extern void                     thread_stack_enqueue(
        thread_t                thread);
 
@@ -702,7 +726,7 @@ extern void                     thread_hold(
 extern void                     thread_release(
        thread_t        thread);
 
-extern void                     thread_corpse_continue(void);
+extern void                     thread_corpse_continue(void) __dead2;
 
 extern boolean_t                thread_is_active(thread_t thread);
 
@@ -789,7 +813,6 @@ extern thread_t                 machine_switch_context(
 extern void                             machine_load_context(
        thread_t                thread) __attribute__((noreturn));
 
-
 extern kern_return_t    machine_thread_state_initialize(
        thread_t                                thread);
 
@@ -799,6 +822,16 @@ extern kern_return_t    machine_thread_set_state(
        thread_state_t                  state,
        mach_msg_type_number_t  count);
 
+extern mach_vm_address_t machine_thread_pc(
+       thread_t                thread);
+
+extern void machine_thread_reset_pc(
+       thread_t                thread,
+       mach_vm_address_t       pc);
+
+extern boolean_t        machine_thread_on_core(
+       thread_t                thread);
+
 extern kern_return_t    machine_thread_get_state(
        thread_t                                thread,
        thread_flavor_t                 flavor,
@@ -866,7 +899,7 @@ vm_offset_t                     max_valid_stack_address(void);
 static inline uint16_t
 thread_set_tag_internal(thread_t        thread, uint16_t tag)
 {
-       return __sync_fetch_and_or(&thread->thread_tag, tag);
+       return os_atomic_or_orig(&thread->thread_tag, tag, relaxed);
 }
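
The os_atomic_or_orig conversion above preserves fetch-then-OR semantics, i.e. the pre-OR value is returned; a sketch of a caller relying on that (THREAD_TAG_CALLOUT is assumed to be one of the thread_tag bits defined elsewhere):

    uint16_t old_tag = thread_set_tag_internal(thread, THREAD_TAG_CALLOUT);
    if ((old_tag & THREAD_TAG_CALLOUT) == 0) {
            /* this caller was the first to set the tag */
    }
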
 
 static inline uint16_t
@@ -889,7 +922,7 @@ extern void thread_mtx_lock(thread_t thread);
 
 extern void thread_mtx_unlock(thread_t thread);
 
-extern thread_t         current_thread(void);
+extern thread_t         current_thread(void) __attribute__((const));
 
 extern void                     thread_reference(
        thread_t        thread);
@@ -897,6 +930,19 @@ extern void                     thread_reference(
 extern void                     thread_deallocate(
        thread_t        thread);
 
+#if BSD_KERNEL_PRIVATE
+/* Duplicated from osfmk/kern/ipc_tt.h */
+__options_decl(port_to_thread_options_t, uint32_t, {
+       PORT_TO_THREAD_NONE               = 0x0000,
+       PORT_TO_THREAD_IN_CURRENT_TASK    = 0x0001,
+       PORT_TO_THREAD_NOT_CURRENT_THREAD = 0x0002,
+});
+
+extern thread_t port_name_to_thread(
+       mach_port_name_t            port_name,
+       port_to_thread_options_t    options);
+#endif /* BSD_KERNEL_PRIVATE */
+
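
A sketch of how a BSD-side caller might combine these options; port_name_to_thread returns a referenced thread that the caller must release (usage assumed, not taken from this diff):

    thread_t t = port_name_to_thread(name,
        PORT_TO_THREAD_IN_CURRENT_TASK | PORT_TO_THREAD_NOT_CURRENT_THREAD);
    if (t != THREAD_NULL) {
            /* operate on a thread of the current task, never the caller itself */
            thread_deallocate(t);
    }
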
 __END_DECLS
 
 #endif  /* MACH_KERNEL_PRIVATE */
@@ -905,22 +951,21 @@ __END_DECLS
 
 __BEGIN_DECLS
 
-extern void                     thread_starts_owning_workloop(
-       thread_t                thread);
-
-extern void                     thread_ends_owning_workloop(
-       thread_t                thread);
-
-extern uint32_t         thread_owned_workloops_count(
+extern void                     thread_deallocate_safe(
        thread_t                thread);
 
-
 extern uint64_t                 thread_dispatchqaddr(
        thread_t thread);
 
 extern uint64_t                 thread_rettokern_addr(
        thread_t thread);
 
+extern integer_t        thread_kern_get_pri(thread_t thr) __attribute__((const));
+
+extern void             thread_kern_set_pri(thread_t thr, integer_t pri);
+
+extern integer_t        thread_kern_get_kernel_maxpri(void) __attribute__((const));
+
 __END_DECLS
 
 #endif  /* KERNEL_PRIVATE */
@@ -1114,6 +1159,7 @@ extern int              thread_task_has_ldt(thread_t);
 #endif
 extern void             *get_bsdthread_info(thread_t);
 extern void             set_bsdthread_info(thread_t, void *);
+extern void             set_thread_pagein_error(thread_t, int);
 extern void             *uthread_alloc(task_t, thread_t, int);
 extern event_t  workq_thread_init_and_wq_lock(task_t, thread_t); // bsd/pthread/
 extern void             uthread_cleanup_name(void *uthread);
@@ -1143,13 +1189,13 @@ extern void act_set_io_telemetry_ast(thread_t);
 extern uint32_t dtrace_get_thread_predcache(thread_t);
 extern int64_t dtrace_get_thread_vtime(thread_t);
 extern int64_t dtrace_get_thread_tracing(thread_t);
-extern boolean_t dtrace_get_thread_reentering(thread_t);
+extern uint16_t dtrace_get_thread_inprobe(thread_t);
 extern int dtrace_get_thread_last_cpu_id(thread_t);
 extern vm_offset_t dtrace_get_kernel_stack(thread_t);
 extern void dtrace_set_thread_predcache(thread_t, uint32_t);
 extern void dtrace_set_thread_vtime(thread_t, int64_t);
 extern void dtrace_set_thread_tracing(thread_t, int64_t);
-extern void dtrace_set_thread_reentering(thread_t, boolean_t);
+extern void dtrace_set_thread_inprobe(thread_t, uint16_t);
 extern vm_offset_t dtrace_set_thread_recover(thread_t, vm_offset_t);
 extern vm_offset_t dtrace_sign_and_set_thread_recover(thread_t, vm_offset_t);
 extern void dtrace_thread_bootstrap(void);
@@ -1182,7 +1228,7 @@ extern void mach_port_guard_ast(thread_t,
 extern void virt_memory_guard_ast(thread_t,
     mach_exception_code_t, mach_exception_subcode_t);
 extern void thread_guard_violation(thread_t,
-    mach_exception_code_t, mach_exception_subcode_t);
+    mach_exception_code_t, mach_exception_subcode_t, boolean_t);
 extern void thread_update_io_stats(thread_t, int size, int io_flags);
 
 extern kern_return_t    thread_set_voucher_name(mach_port_name_t name);
@@ -1191,22 +1237,6 @@ extern kern_return_t thread_get_current_voucher_origin_pid(int32_t *pid);
 extern void set_thread_rwlock_boost(void);
 extern void clear_thread_rwlock_boost(void);
 
-/*! @function thread_has_thread_name
- *   @abstract Checks if a thread has a name.
- *   @discussion This function takes one input, a thread, and returns a boolean value indicating if that thread already has a name associated with it.
- *   @param th The thread to inspect.
- *   @result TRUE if the thread has a name, FALSE otherwise.
- */
-extern boolean_t thread_has_thread_name(thread_t th);
-
-/*! @function thread_set_thread_name
- *   @abstract Set a thread's name.
- *   @discussion This function takes two input parameters: a thread to name, and the name to apply to the thread.  The name will be attached to the thread in order to better identify the thread.
- *   @param th The thread to be named.
- *   @param name The name to apply to the thread.
- */
-extern void thread_set_thread_name(thread_t th, const char* name);
-
 extern void thread_enable_send_importance(thread_t thread, boolean_t enable);
 
 /*
@@ -1268,6 +1298,21 @@ extern bool thread_get_no_smt(void);
 
 #endif  /* XNU_KERNEL_PRIVATE */
 
+/*! @function thread_has_thread_name
+ *   @abstract Checks if a thread has a name.
+ *   @discussion This function takes one input, a thread, and returns a boolean value indicating if that thread already has a name associated with it.
+ *   @param th The thread to inspect.
+ *   @result TRUE if the thread has a name, FALSE otherwise.
+ */
+extern boolean_t thread_has_thread_name(thread_t th);
+
+/*! @function thread_set_thread_name
+ *   @abstract Set a thread's name.
+ *   @discussion This function takes two input parameters: a thread to name, and the name to apply to the thread.  The name will be copied over to the thread in order to better identify the thread.  If the name is longer than MAXTHREADNAMESIZE - 1, it will be truncated.
+ *   @param th The thread to be named.
+ *   @param name The name to apply to the thread.
+ */
+extern void thread_set_thread_name(thread_t th, const char* name);
 
 /*! @function kernel_thread_start
  *   @abstract Create a kernel thread.
@@ -1293,6 +1338,8 @@ extern ipc_port_t convert_thread_inspect_to_port(thread_inspect_t);
 extern boolean_t is_vm_privileged(void);
 extern boolean_t set_vm_privilege(boolean_t);
 extern kern_allocation_name_t thread_set_allocation_name(kern_allocation_name_t new_name);
+extern void *thread_iokit_tls_get(uint32_t index);
+extern void thread_iokit_tls_set(uint32_t index, void * data);
 #endif /* KERNEL_PRIVATE */
 
 __END_DECLS
index c93dda8e30ddad37935ad1d6390f52e418ac5fe8..944d61d991f1993014d7d898d2ea83e96c3672ff 100644 (file)
@@ -1132,12 +1132,32 @@ act_set_astbsd(
 void
 act_set_astkevent(thread_t thread, uint16_t bits)
 {
-       atomic_fetch_or(&thread->kevent_ast_bits, bits);
+       os_atomic_or(&thread->kevent_ast_bits, bits, relaxed);
 
        /* kevent AST shouldn't send immediate IPIs */
        act_set_ast_async(thread, AST_KEVENT);
 }
 
+uint16_t
+act_clear_astkevent(thread_t thread, uint16_t bits)
+{
+       /*
+        * avoid the atomic operation if none of the bits is set,
+        * which will be the common case.
+        */
+       uint16_t cur = os_atomic_load(&thread->kevent_ast_bits, relaxed);
+       if (cur & bits) {
+               cur = os_atomic_andnot_orig(&thread->kevent_ast_bits, bits, relaxed);
+       }
+       return cur & bits;
+}
+
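
act_clear_astkevent pays for the atomic RMW only when one of the requested bits is actually pending; a usage sketch (the bit constant is hypothetical):

    if (act_clear_astkevent(thread, AST_KEVENT_EXAMPLE_BIT) != 0) {
            /* the bit was pending and this caller consumed it */
    }
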
+void
+act_set_ast_reset_pcs(thread_t thread)
+{
+       act_set_ast(thread, AST_RESET_PCS);
+}
+
 void
 act_set_kperf(
        thread_t        thread)
index 5f2676de75aa92367f10ba7d12d023c6db8af3ac..7c8be9695e9edbad5c5a5e4ecbab480005d16fed 100644 (file)
@@ -63,11 +63,11 @@ typedef enum {
        TCF_COUNT       = 2,
 } thread_call_flavor_t;
 
-typedef enum {
+__options_decl(thread_call_group_flags_t, uint32_t, {
        TCG_NONE                = 0x0,
        TCG_PARALLEL            = 0x1,
        TCG_DEALLOC_ACTIVE      = 0x2,
-} thread_call_group_flags_t;
+});
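
__options_decl pins the enum's underlying type; the declaration above is roughly equivalent to this plain-C sketch (the real macro also attaches flag-enum attributes where the compiler supports them):

    typedef uint32_t thread_call_group_flags_t;
    enum {
            TCG_NONE           = 0x0,
            TCG_PARALLEL       = 0x1,
            TCG_DEALLOC_ACTIVE = 0x2,
    };
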
 
 static struct thread_call_group {
        const char *            tcg_name;
index f67111223b9d795a9f03eaa7a746579806b16f1c..49f212298fc6b77c008fd09c55b27331d5b663d1 100644 (file)
@@ -40,6 +40,7 @@
 #include <kern/queue.h>
 #include <kern/locks.h>
 #include <kern/thread_group.h>
+#include <kern/sched_clutch.h>
 
 
 #if CONFIG_EMBEDDED
index 75f81a456e0626ed60daea1338baf43addd401b5..3ba515bef0387a9a8980d659e2f92e2ce26c464b 100644 (file)
@@ -128,7 +128,7 @@ static void
 proc_set_thread_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token);
 
 static void
-thread_set_requested_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2);
+thread_set_requested_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token);
 
 static int
 thread_get_requested_policy_spinlocked(thread_t thread, int category, int flavor, int* value2);
@@ -644,6 +644,50 @@ unlock:
        return kr;
 }
 
+void
+thread_freeze_base_pri(thread_t thread)
+{
+       assert(thread == current_thread());
+
+       spl_t s = splsched();
+       thread_lock(thread);
+
+       assert((thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN) == 0);
+       thread->sched_flags |= TH_SFLAG_BASE_PRI_FROZEN;
+
+       thread_unlock(thread);
+       splx(s);
+}
+
+bool
+thread_unfreeze_base_pri(thread_t thread)
+{
+       assert(thread == current_thread());
+       integer_t base_pri;
+       ast_t ast = 0;
+
+       spl_t s = splsched();
+       thread_lock(thread);
+
+       assert(thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN);
+       thread->sched_flags &= ~TH_SFLAG_BASE_PRI_FROZEN;
+
+       base_pri = thread->req_base_pri;
+       if (base_pri != thread->base_pri) {
+               /*
+                * This function returns "true" if the base pri change
+                * is the most likely cause for the preemption.
+                */
+               sched_set_thread_base_priority(thread, base_pri);
+               ast = ast_peek(AST_PREEMPT);
+       }
+
+       thread_unlock(thread);
+       splx(s);
+
+       return ast != 0;
+}
+
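
A sketch of the intended freeze/unfreeze pairing, valid only on the current thread; the return value reports whether a deferred base-pri change is the likely cause of a pending preemption:

    thread_freeze_base_pri(current_thread());
    /* window during which base_pri must not move */
    if (thread_unfreeze_base_pri(current_thread())) {
            /* the deferred base-pri change likely raised AST_PREEMPT */
    }
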
 uint8_t
 thread_workq_pri_for_qos(thread_qos_t qos)
 {
@@ -938,6 +982,9 @@ thread_update_qos_cpu_time(thread_t thread)
  *
  * Called with thread_lock and thread mutex held.
  */
+extern thread_t vm_pageout_scan_thread;
+extern boolean_t vps_dynamic_priority_enabled;
+
 void
 thread_recompute_priority(
        thread_t                thread)
@@ -1301,7 +1348,7 @@ thread_policy_get(
 
                        info->thps_user_promotions          = 0;
                        info->thps_user_promotion_basepri   = thread->user_promotion_basepri;
-                       info->thps_ipc_overrides            = thread->ipc_overrides;
+                       info->thps_ipc_overrides            = thread->kevent_overrides;
 
                        proc_get_thread_policy_bitfield(thread, info);
 
@@ -1464,7 +1511,8 @@ thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_pr
        if (requested.thrp_qos != THREAD_QOS_UNSPECIFIED) {
                next_qos = MAX(requested.thrp_qos_override, next_qos);
                next_qos = MAX(requested.thrp_qos_promote, next_qos);
-               next_qos = MAX(requested.thrp_qos_ipc_override, next_qos);
+               next_qos = MAX(requested.thrp_qos_kevent_override, next_qos);
+               next_qos = MAX(requested.thrp_qos_wlsvc_override, next_qos);
                next_qos = MAX(requested.thrp_qos_workq_override, next_qos);
        }
 
@@ -1658,6 +1706,8 @@ thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_pr
                pend_token->tpt_update_thread_sfi = 1;
        }
 
+       integer_t old_base_pri = thread->base_pri;
+
        /*
         * Step 5:
         *  Update other subsystems as necessary if something has changed
@@ -1672,6 +1722,20 @@ thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_pr
            recompute_priority) {
                thread_recompute_priority(thread);
        }
+
+       /*
+        * Check if the thread is waiting on a turnstile and needs priority propagation.
+        */
+       if (pend_token->tpt_update_turnstile &&
+           ((old_base_pri == thread->base_pri) ||
+           !thread_get_waiting_turnstile(thread))) {
+               /*
+                * Reset update turnstile pend token since either
+                * the thread priority did not change or thread is
+                * not blocked on a turnstile.
+                */
+               pend_token->tpt_update_turnstile = 0;
+       }
 }
 
 
@@ -1750,6 +1814,10 @@ thread_policy_update_complete_unlocked(thread_t thread, task_pend_token_t pend_t
        if (pend_token->tpt_update_thread_sfi) {
                sfi_reevaluate(thread);
        }
+
+       if (pend_token->tpt_update_turnstile) {
+               turnstile_update_thread_priority_chain(thread);
+       }
 }
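
The pend-token pattern used here records deferred side effects while the scheduler spinlock is held and acts on them only once every lock is dropped; a minimal sketch using field names from this diff:

    struct task_pend_token pend_token = {};

    spl_t s = splsched();
    thread_lock(thread);
    /* mutate requested policy; setters flag deferred work, e.g.: */
    pend_token.tpt_update_turnstile = 1;
    thread_unlock(thread);
    splx(s);

    /* now safe to take other locks */
    thread_policy_update_complete_unlocked(thread, &pend_token);
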
 
 /*
@@ -1790,7 +1858,7 @@ proc_set_thread_policy_spinlocked(thread_t          thread,
            thread_tid(thread), threquested_0(thread),
            threquested_1(thread), value, 0);
 
-       thread_set_requested_policy_spinlocked(thread, category, flavor, value, value2);
+       thread_set_requested_policy_spinlocked(thread, category, flavor, value, value2, pend_token);
 
        thread_policy_update_spinlocked(thread, FALSE, pend_token);
 
@@ -1805,10 +1873,11 @@ proc_set_thread_policy_spinlocked(thread_t          thread,
  */
 static void
 thread_set_requested_policy_spinlocked(thread_t     thread,
-    int          category,
-    int          flavor,
-    int          value,
-    int          value2)
+    int               category,
+    int               flavor,
+    int               value,
+    int               value2,
+    task_pend_token_t pend_token)
 {
        int tier, passive;
 
@@ -1869,26 +1938,24 @@ thread_set_requested_policy_spinlocked(thread_t     thread,
                requested.thrp_through_qos = value;
                break;
 
-       case TASK_POLICY_QOS:
-               assert(category == TASK_POLICY_ATTRIBUTE);
-               requested.thrp_qos = value;
-               break;
-
        case TASK_POLICY_QOS_OVERRIDE:
                assert(category == TASK_POLICY_ATTRIBUTE);
                requested.thrp_qos_override = value;
+               pend_token->tpt_update_turnstile = 1;
                break;
 
        case TASK_POLICY_QOS_AND_RELPRIO:
                assert(category == TASK_POLICY_ATTRIBUTE);
                requested.thrp_qos = value;
                requested.thrp_qos_relprio = value2;
+               pend_token->tpt_update_turnstile = 1;
                DTRACE_BOOST3(qos_set, uint64_t, thread->thread_id, int, requested.thrp_qos, int, requested.thrp_qos_relprio);
                break;
 
        case TASK_POLICY_QOS_WORKQ_OVERRIDE:
                assert(category == TASK_POLICY_ATTRIBUTE);
                requested.thrp_qos_workq_override = value;
+               pend_token->tpt_update_turnstile = 1;
                break;
 
        case TASK_POLICY_QOS_PROMOTE:
@@ -1896,9 +1963,16 @@ thread_set_requested_policy_spinlocked(thread_t     thread,
                requested.thrp_qos_promote = value;
                break;
 
-       case TASK_POLICY_QOS_IPC_OVERRIDE:
+       case TASK_POLICY_QOS_KEVENT_OVERRIDE:
                assert(category == TASK_POLICY_ATTRIBUTE);
-               requested.thrp_qos_ipc_override = value;
+               requested.thrp_qos_kevent_override = value;
+               pend_token->tpt_update_turnstile = 1;
+               break;
+
+       case TASK_POLICY_QOS_SERVICER_OVERRIDE:
+               assert(category == TASK_POLICY_ATTRIBUTE);
+               requested.thrp_qos_wlsvc_override = value;
+               pend_token->tpt_update_turnstile = 1;
                break;
 
        case TASK_POLICY_TERMINATED:
@@ -2023,9 +2097,13 @@ thread_get_requested_policy_spinlocked(thread_t thread,
                assert(category == TASK_POLICY_ATTRIBUTE);
                value = requested.thrp_qos_promote;
                break;
-       case TASK_POLICY_QOS_IPC_OVERRIDE:
+       case TASK_POLICY_QOS_KEVENT_OVERRIDE:
+               assert(category == TASK_POLICY_ATTRIBUTE);
+               value = requested.thrp_qos_kevent_override;
+               break;
+       case TASK_POLICY_QOS_SERVICER_OVERRIDE:
                assert(category == TASK_POLICY_ATTRIBUTE);
-               value = requested.thrp_qos_ipc_override;
+               value = requested.thrp_qos_wlsvc_override;
                break;
        case TASK_POLICY_TERMINATED:
                assert(category == TASK_POLICY_ATTRIBUTE);
@@ -2644,10 +2722,9 @@ void
 proc_thread_qos_deallocate(thread_t thread)
 {
        /* This thread must have no more IPC overrides. */
-       assert(thread->ipc_overrides == 0);
-       assert(thread->requested_policy.thrp_qos_ipc_override == THREAD_QOS_UNSPECIFIED);
-       assert(thread->sync_ipc_overrides == 0);
-       assert(thread->requested_policy.thrp_qos_sync_ipc_override == THREAD_QOS_UNSPECIFIED);
+       assert(thread->kevent_overrides == 0);
+       assert(thread->requested_policy.thrp_qos_kevent_override == THREAD_QOS_UNSPECIFIED);
+       assert(thread->requested_policy.thrp_qos_wlsvc_override == THREAD_QOS_UNSPECIFIED);
 
        /*
         * Clear out any lingering override objects.
@@ -2688,7 +2765,7 @@ task_set_main_thread_qos(task_t task, thread_t thread)
 
        int primordial_qos = task_compute_main_thread_qos(task);
 
-       proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS,
+       proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO,
            primordial_qos, 0, &pend_token);
 
        thread_mtx_unlock(thread);
@@ -2719,6 +2796,46 @@ task_get_default_manager_qos(task_t task)
        return primordial_qos;
 }
 
+/*
+ * Check if the kernel promotion on thread has changed
+ * and apply it.
+ *
+ * thread locked on entry and exit
+ */
+boolean_t
+thread_recompute_kernel_promotion_locked(thread_t thread)
+{
+       boolean_t needs_update = FALSE;
+       int kern_promotion_schedpri = thread_get_inheritor_turnstile_sched_priority(thread);
+
+       /*
+        * For now just assert that kern_promotion_schedpri <= MAXPRI_PROMOTE.
+        * TURNSTILE_KERNEL_PROMOTE adds threads to the waitq already capped to MAXPRI_PROMOTE
+        * and propagates the priority through the chain with the same cap, because as of now
+        * it does not differentiate on the kernel primitive.
+        *
+        * If this assumption changes with the adoption of a kernel primitive that does not
+        * cap the priority when adding/propagating, then here is the place to put the
+        * generic cap for all kernel primitives
+        * (convert the assert into kern_promotion_schedpri = MIN(priority, MAXPRI_PROMOTE)).
+        */
+       assert(kern_promotion_schedpri <= MAXPRI_PROMOTE);
+
+       if (kern_promotion_schedpri != thread->kern_promotion_schedpri) {
+               KDBG(MACHDBG_CODE(
+                           DBG_MACH_SCHED, MACH_TURNSTILE_KERNEL_CHANGE) | DBG_FUNC_NONE,
+                   thread_tid(thread),
+                   kern_promotion_schedpri,
+                   thread->kern_promotion_schedpri);
+
+               needs_update = TRUE;
+               thread->kern_promotion_schedpri = kern_promotion_schedpri;
+               thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
+       }
+
+       return needs_update;
+}
+
 /*
  * Check if the user promotion on thread has changed
  * and apply it.
@@ -2731,7 +2848,7 @@ thread_recompute_user_promotion_locked(thread_t thread)
 {
        boolean_t needs_update = FALSE;
        struct task_pend_token pend_token = {};
-       int user_promotion_basepri = MIN(thread_get_inheritor_turnstile_priority(thread), MAXPRI_USER);
+       int user_promotion_basepri = MIN(thread_get_inheritor_turnstile_base_priority(thread), MAXPRI_USER);
        int old_base_pri = thread->base_pri;
        thread_qos_t qos_promotion;
 
@@ -2745,6 +2862,11 @@ thread_recompute_user_promotion_locked(thread_t thread)
                    user_promotion_basepri,
                    thread->user_promotion_basepri,
                    0, 0);
+               KDBG(MACHDBG_CODE(
+                           DBG_MACH_SCHED, MACH_TURNSTILE_USER_CHANGE) | DBG_FUNC_NONE,
+                   thread_tid(thread),
+                   user_promotion_basepri,
+                   thread->user_promotion_basepri);
        }
 
        /* Update the user promotion base pri */
@@ -2791,8 +2913,8 @@ thread_user_promotion_qos_for_pri(int priority)
 }
 
 /*
- * Set the thread's QoS IPC override
- * Owned by the IPC subsystem
+ * Set the thread's QoS Kevent override
+ * Owned by the Kevent subsystem
  *
  * May be called with spinlocks held, but not spinlocks
  * that may deadlock against the thread lock, the throttle lock, or the SFI lock.
@@ -2802,7 +2924,7 @@ thread_user_promotion_qos_for_pri(int priority)
  * Before the thread is deallocated, there must be 0 remaining overrides.
  */
 static void
-thread_ipc_override(thread_t    thread,
+thread_kevent_override(thread_t    thread,
     uint32_t    qos_override,
     boolean_t   is_new_override)
 {
@@ -2812,13 +2934,13 @@ thread_ipc_override(thread_t    thread,
        spl_t s = splsched();
        thread_lock(thread);
 
-       uint32_t old_override = thread->requested_policy.thrp_qos_ipc_override;
+       uint32_t old_override = thread->requested_policy.thrp_qos_kevent_override;
 
        assert(qos_override > THREAD_QOS_UNSPECIFIED);
        assert(qos_override < THREAD_QOS_LAST);
 
        if (is_new_override) {
-               if (thread->ipc_overrides++ == 0) {
+               if (thread->kevent_overrides++ == 0) {
                        /* This add is the first override for this thread */
                        assert(old_override == THREAD_QOS_UNSPECIFIED);
                } else {
@@ -2827,7 +2949,7 @@ thread_ipc_override(thread_t    thread,
                }
        } else {
                /* There must be at least one override (the previous add call) in effect */
-               assert(thread->ipc_overrides > 0);
+               assert(thread->kevent_overrides > 0);
                assert(old_override > THREAD_QOS_UNSPECIFIED);
        }
 
@@ -2835,7 +2957,7 @@ thread_ipc_override(thread_t    thread,
         * We can't allow lowering if there are several IPC overrides because
         * the caller can't possibly know the whole truth
         */
-       if (thread->ipc_overrides == 1) {
+       if (thread->kevent_overrides == 1) {
                needs_update = qos_override != old_override;
        } else {
                needs_update = qos_override > old_override;
@@ -2843,7 +2965,7 @@ thread_ipc_override(thread_t    thread,
 
        if (needs_update) {
                proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
-                   TASK_POLICY_QOS_IPC_OVERRIDE,
+                   TASK_POLICY_QOS_KEVENT_OVERRIDE,
                    qos_override, 0, &pend_token);
                assert(pend_token.tpt_update_sockets == 0);
        }
@@ -2855,37 +2977,35 @@ thread_ipc_override(thread_t    thread,
 }
 
 void
-thread_add_ipc_override(thread_t    thread,
-    uint32_t    qos_override)
+thread_add_kevent_override(thread_t thread, uint32_t qos_override)
 {
-       thread_ipc_override(thread, qos_override, TRUE);
+       thread_kevent_override(thread, qos_override, TRUE);
 }
 
 void
-thread_update_ipc_override(thread_t     thread,
-    uint32_t     qos_override)
+thread_update_kevent_override(thread_t thread, uint32_t qos_override)
 {
-       thread_ipc_override(thread, qos_override, FALSE);
+       thread_kevent_override(thread, qos_override, FALSE);
 }
 
 void
-thread_drop_ipc_override(thread_t thread)
+thread_drop_kevent_override(thread_t thread)
 {
        struct task_pend_token pend_token = {};
 
        spl_t s = splsched();
        thread_lock(thread);
 
-       assert(thread->ipc_overrides > 0);
+       assert(thread->kevent_overrides > 0);
 
-       if (--thread->ipc_overrides == 0) {
+       if (--thread->kevent_overrides == 0) {
                /*
                 * There are no more overrides for this thread, so we should
                 * clear out the saturated override value
                 */
 
                proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
-                   TASK_POLICY_QOS_IPC_OVERRIDE, THREAD_QOS_UNSPECIFIED,
+                   TASK_POLICY_QOS_KEVENT_OVERRIDE, THREAD_QOS_UNSPECIFIED,
                    0, &pend_token);
        }
 
@@ -2895,6 +3015,69 @@ thread_drop_ipc_override(thread_t thread)
        thread_policy_update_complete_unlocked(thread, &pend_token);
 }
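
A sketch of the kevent override lifecycle after the rename; overrides are counted, so every add must be balanced by exactly one drop, with optional updates in between:

    thread_add_kevent_override(thread, THREAD_QOS_USER_INTERACTIVE);
    /* ... */
    thread_update_kevent_override(thread, THREAD_QOS_USER_INITIATED);
    /* ... */
    thread_drop_kevent_override(thread);
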
 
+/*
+ * Set the thread's QoS Workloop Servicer override
+ * Owned by the Kevent subsystem
+ *
+ * May be called with spinlocks held, but not spinlocks
+ * that may deadlock against the thread lock, the throttle lock, or the SFI lock.
+ *
+ * One 'add' must be balanced by one 'drop'.
+ * Between 'add' and 'drop', the override QoS value may be updated with an 'update'.
+ * Before the thread is deallocated, there must be 0 remaining overrides.
+ */
+static void
+thread_servicer_override(thread_t    thread,
+    uint32_t    qos_override,
+    boolean_t   is_new_override)
+{
+       struct task_pend_token pend_token = {};
+
+       spl_t s = splsched();
+       thread_lock(thread);
+
+       if (is_new_override) {
+               assert(!thread->requested_policy.thrp_qos_wlsvc_override);
+       } else {
+               assert(thread->requested_policy.thrp_qos_wlsvc_override);
+       }
+
+       proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
+           TASK_POLICY_QOS_SERVICER_OVERRIDE,
+           qos_override, 0, &pend_token);
+
+       thread_unlock(thread);
+       splx(s);
+
+       assert(pend_token.tpt_update_sockets == 0);
+       thread_policy_update_complete_unlocked(thread, &pend_token);
+}
+
+void
+thread_add_servicer_override(thread_t thread, uint32_t qos_override)
+{
+       assert(qos_override > THREAD_QOS_UNSPECIFIED);
+       assert(qos_override < THREAD_QOS_LAST);
+
+       thread_servicer_override(thread, qos_override, TRUE);
+}
+
+void
+thread_update_servicer_override(thread_t thread, uint32_t qos_override)
+{
+       assert(qos_override > THREAD_QOS_UNSPECIFIED);
+       assert(qos_override < THREAD_QOS_LAST);
+
+       thread_servicer_override(thread, qos_override, FALSE);
+}
+
+void
+thread_drop_servicer_override(thread_t thread)
+{
+       thread_servicer_override(thread, THREAD_QOS_UNSPECIFIED, FALSE);
+}
+
+
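
Unlike the counted kevent overrides, the servicer override is a single slot per thread; a sketch of its lifecycle:

    thread_add_servicer_override(thread, THREAD_QOS_DEFAULT);
    thread_update_servicer_override(thread, THREAD_QOS_UTILITY);  /* optional */
    thread_drop_servicer_override(thread);
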
 /* Get current requested qos / relpri, may be called from spinlock context */
 thread_qos_t
 thread_get_requested_qos(thread_t thread, int *relpri)
index 08c841925b39a23c5d9233b97bfa5a61433d323d..cba8edd6d1ed1e36e402ee263b414f3653fd0cf2 100644 (file)
@@ -176,6 +176,8 @@ extern int              setPop(uint64_t time);
 
 extern void             timer_resync_deadlines(void);
 
+extern void             timer_queue_expire_local(void *arg);
+
 extern void             timer_set_deadline(uint64_t deadline);
 
 extern void             quantum_timer_set_deadline(uint64_t deadline);
index 22c57cc56054a023f4ad45510772e69c3345ceef..1c75a2ecc5f148c8b398f0876d28fd6717fefc2b 100644 (file)
@@ -95,6 +95,23 @@ tlock_mark_owned(lck_ticket_t *tlock, thread_t cthread)
        __c11_atomic_store((_Atomic thread_t *)&tlock->lck_owner, cthread, __ATOMIC_RELAXED);
 }
 
+#if __arm__ || __arm64__
+__unused static uint8_t
+load_exclusive_acquire8(uint8_t *target)
+{
+       uint8_t value;
+#if __arm__
+       value = __builtin_arm_ldrex(target);
+       __c11_atomic_thread_fence(__ATOMIC_ACQUIRE);
+#else
+       value = __builtin_arm_ldaex(target);    // ldaxr
+       /* "Compiler barrier", no barrier instructions are emitted */
+       atomic_signal_fence(memory_order_acquire);
+#endif
+       return value;
+}
+#endif
+
 /* On contention, poll for ownership
  * Returns when the current ticket is observed equal to "mt"
  */
@@ -117,7 +134,7 @@ tlock_contended(uint8_t *tp, uint8_t mt, lck_ticket_t *tlock, thread_t cthread)
                                 * TODO: determine specific micro-architectures
                                 * which benefit, modern CPUs may not
                                 */
-                               clear_exclusive();
+                               os_atomic_clear_exclusive();
                                tlock_mark_owned(tlock, cthread);
                                return;
                        }
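
For context, a sketch of the monitor-assisted polling idiom this hunk converts to os_atomic_clear_exclusive(), assuming arm64 and the tp/mt arguments of tlock_contended:

    while (load_exclusive_acquire8(tp) != mt) {
            __builtin_arm_wfe();        /* sleep until the exclusive monitor clears */
    }
    os_atomic_clear_exclusive();        /* drop the monitor before taking ownership */
    tlock_mark_owned(tlock, cthread);
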
index 355039d84c613797dab1cd0659a20c2f2b0d1053..7017c7fa804d846b9a08ab1e0d1053ac400e425a 100644 (file)
@@ -35,6 +35,7 @@
 
 #include <uuid/uuid.h>
 
+#ifdef PLATFORM_BridgeOS
 /* Version 0 trust caches: No defined sorting order (thus only suitable for small trust caches).
  * Used for loadable trust caches only, until phasing out support. */
 typedef uint8_t trust_cache_hash0[CS_CDHASH_LEN];
@@ -44,6 +45,7 @@ struct trust_cache_module0 {
        uint32_t num_hashes;
        trust_cache_hash0 hashes[];
 } __attribute__((__packed__));
+#endif
 
 
 /* Version 1 trust caches: Always sorted by cdhash, added hash type and flags field.
@@ -65,6 +67,22 @@ struct trust_cache_module1 {
 // Trust Cache Entry Flags
 #define CS_TRUST_CACHE_AMFID    0x1                     // valid cdhash for amfid
 
+/* Trust Cache lookup functions return their result as a 32-bit value
+ * composed of subfields, for straightforward passing through layers.
+ *
+ * Format:
+ *
+ * 0xXXCCBBAA
+ *
+ * AA:  0-7: lookup result
+ *  bit  0: TC_LOOKUP_FOUND: set if any entry found
+ *  bit  1: (obsolete) TC_LOOKUP_FALLBACK: set if found in legacy static trust cache
+ *  bits 2-7: reserved
+ * BB:  8-15: entry flags pass-through, see "Trust Cache Entry Flags" above
+ * CC: 16-23: code directory hash type of entry, see CS_HASHTYPE_* in cs_blobs.h
+ * XX: 24-31: reserved
+ */
+
 #define TC_LOOKUP_HASH_TYPE_SHIFT               16
 #define TC_LOOKUP_HASH_TYPE_MASK                0xff0000L
 #define TC_LOOKUP_FLAGS_SHIFT                   8
@@ -73,7 +91,6 @@ struct trust_cache_module1 {
 #define TC_LOOKUP_RESULT_MASK                   0xffL
 
 #define TC_LOOKUP_FOUND         1
-// #define TC_LOOKUP_FALLBACK      2 /* obsolete with removal of legacy static trust caches */
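
A sketch of decoding a lookup result per the layout documented above (masking via the shift constants; the result variable is hypothetical):

    uint32_t result = 0; /* value returned by a trust cache lookup */
    if ((result & TC_LOOKUP_RESULT_MASK) & TC_LOOKUP_FOUND) {
            uint8_t entry_flags = (result >> TC_LOOKUP_FLAGS_SHIFT) & 0xff;
            uint8_t hash_type   = (result >> TC_LOOKUP_HASH_TYPE_SHIFT) & 0xff;
            /* ... */
    }
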
 
 #ifdef XNU_KERNEL_PRIVATE
 
index ea58c5477c985e1d7b96c9b92eda60af8751100e..6375a3704da93dc0209f16689d87a0fa6588aa7c 100644 (file)
@@ -40,7 +40,7 @@
 #include <kern/sched_prim.h>
 #include <kern/zalloc.h>
 #include <kern/debug.h>
-#include <machine/machlimits.h>
+#include <machine/limits.h>
 #include <machine/atomic.h>
 
 #include <pexpert/pexpert.h>
 
 static zone_t turnstiles_zone;
 static int turnstile_max_hop;
+static struct mpsc_daemon_queue turnstile_deallocate_queue;
 #define MAX_TURNSTILES (thread_max)
 #define TURNSTILES_CHUNK (THREAD_CHUNK)
 
 /* Global table for turnstile promote policy for all types of turnstiles */
-turnstile_promote_policy_t turnstile_promote_policy[TURNSTILE_TOTAL_TYPES] = {
+static const turnstile_promote_policy_t turnstile_promote_policy[TURNSTILE_TOTAL_TYPES] = {
        [TURNSTILE_NONE]          = TURNSTILE_PROMOTE_NONE,
        [TURNSTILE_KERNEL_MUTEX]  = TURNSTILE_KERNEL_PROMOTE,
        [TURNSTILE_ULOCK]         = TURNSTILE_USER_PROMOTE,
@@ -62,6 +63,20 @@ turnstile_promote_policy_t turnstile_promote_policy[TURNSTILE_TOTAL_TYPES] = {
        [TURNSTILE_WORKLOOPS]     = TURNSTILE_USER_IPC_PROMOTE,
        [TURNSTILE_WORKQS]        = TURNSTILE_USER_IPC_PROMOTE,
        [TURNSTILE_KNOTE]         = TURNSTILE_USER_IPC_PROMOTE,
+       [TURNSTILE_SLEEP_INHERITOR] = TURNSTILE_KERNEL_PROMOTE,
+};
+
+/* Global table for turnstile hash lock policy for all types of turnstiles */
+static const turnstile_hash_lock_policy_t turnstile_hash_lock_policy[TURNSTILE_TOTAL_TYPES] = {
+       [TURNSTILE_NONE]          = TURNSTILE_HASH_LOCK_POLICY_NONE,
+       [TURNSTILE_KERNEL_MUTEX]  = TURNSTILE_HASH_LOCK_POLICY_NONE,
+       [TURNSTILE_ULOCK]         = TURNSTILE_HASH_LOCK_POLICY_NONE,
+       [TURNSTILE_PTHREAD_MUTEX] = TURNSTILE_HASH_LOCK_POLICY_NONE,
+       [TURNSTILE_SYNC_IPC]      = TURNSTILE_HASH_LOCK_POLICY_NONE,
+       [TURNSTILE_WORKLOOPS]     = TURNSTILE_HASH_LOCK_POLICY_NONE,
+       [TURNSTILE_WORKQS]        = TURNSTILE_HASH_LOCK_POLICY_NONE,
+       [TURNSTILE_KNOTE]         = TURNSTILE_HASH_LOCK_POLICY_NONE,
+       [TURNSTILE_SLEEP_INHERITOR] = (TURNSTILE_IRQ_UNSAFE_HASH | TURNSTILE_LOCKED_HASH),
 };
 
 os_refgrp_decl(static, turnstile_refgrp, "turnstile", NULL);
@@ -90,8 +105,7 @@ static struct turnstile_stats turnstile_boost_stats[TURNSTILE_MAX_HOP_DEFAULT] =
 static struct turnstile_stats turnstile_unboost_stats[TURNSTILE_MAX_HOP_DEFAULT] = {};
 uint64_t thread_block_on_turnstile_count;
 uint64_t thread_block_on_regular_waitq_count;
-
-#endif
+#endif /* DEVELOPMENT || DEBUG */
 
 #ifndef max
 #define max(a, b)        (((a) > (b)) ? (a) : (b))
@@ -161,6 +175,9 @@ static turnstile_stats_update_flags_t
 thread_get_update_flags_for_turnstile_propagation_stoppage(thread_t thread);
 static turnstile_stats_update_flags_t
 turnstile_get_update_flags_for_above_UI_pri_change(struct turnstile *turnstile);
+static void turnstile_stash_inheritor(turnstile_inheritor_t new_inheritor,
+    turnstile_update_flags_t flags);
+static int turnstile_compute_thread_push(struct turnstile *turnstile, thread_t thread);
 
 #if DEVELOPMENT || DEBUG
 /* Test primitives and interfaces for testing turnstiles */
@@ -173,6 +190,9 @@ struct tstile_test_prim {
 
 struct tstile_test_prim *test_prim_ts_inline;
 struct tstile_test_prim *test_prim_global_htable;
+struct tstile_test_prim *test_prim_global_ts_kernel;
+struct tstile_test_prim *test_prim_global_ts_kernel_hash;
+
 static void
 tstile_test_prim_init(struct tstile_test_prim **test_prim_ptr);
 #endif
@@ -235,9 +255,12 @@ struct turnstile_htable_bucket {
 };
 
 SECURITY_READ_ONLY_LATE(static uint32_t) ts_htable_buckets;
-/* Global hashtable for turnstiles */
+/* Global hashtable for turnstiles managed with interrupts disabled */
+SECURITY_READ_ONLY_LATE(static struct turnstile_htable_bucket *)turnstile_htable_irq_safe;
+/* Global hashtable for turnstiles managed with interrupts enabled */
 SECURITY_READ_ONLY_LATE(static struct turnstile_htable_bucket *)turnstile_htable;
 
+
 /* Bucket locks for turnstile hashtable */
 lck_grp_t               turnstiles_htable_lock_grp;
 lck_attr_t              turnstiles_htable_lock_attr;
@@ -250,6 +273,9 @@ lck_grp_attr_t          turnstiles_htable_lock_grp_attr;
 #define turnstile_bucket_unlock(bucket) \
        lck_spin_unlock(&bucket->ts_ht_bucket_lock)
 
+#define kdp_turnstile_bucket_is_locked(bucket) \
+       kdp_lck_spin_is_acquired(&bucket->ts_ht_bucket_lock)
+
 /*
  * Name: turnstiles_hashtable_init
  *
@@ -271,18 +297,26 @@ turnstiles_hashtable_init(void)
 
        assert(ts_htable_buckets <= TURNSTILE_HTABLE_BUCKETS_MAX);
        uint32_t ts_htable_size = ts_htable_buckets * sizeof(struct turnstile_htable_bucket);
+       turnstile_htable_irq_safe = (struct turnstile_htable_bucket *)kalloc(ts_htable_size);
+       if (turnstile_htable_irq_safe == NULL) {
+               panic("Turnstiles hash table memory allocation failed!");
+       }
+
        turnstile_htable = (struct turnstile_htable_bucket *)kalloc(ts_htable_size);
        if (turnstile_htable == NULL) {
                panic("Turnstiles hash table memory allocation failed!");
        }
-
        lck_grp_attr_setdefault(&turnstiles_htable_lock_grp_attr);
        lck_grp_init(&turnstiles_htable_lock_grp, "turnstiles_htable_locks", &turnstiles_htable_lock_grp_attr);
        lck_attr_setdefault(&turnstiles_htable_lock_attr);
 
-       /* Initialize all the buckets of the hashtable */
+       /* Initialize all the buckets of the hashtables */
        for (uint32_t i = 0; i < ts_htable_buckets; i++) {
-               struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[i]);
+               struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable_irq_safe[i]);
+               turnstile_bucket_lock_init(ts_bucket);
+               SLIST_INIT(&ts_bucket->ts_ht_bucket_list);
+
+               ts_bucket = &(turnstile_htable[i]);
                turnstile_bucket_lock_init(ts_bucket);
                SLIST_INIT(&ts_bucket->ts_ht_bucket_list);
        }
@@ -377,6 +411,110 @@ turnstile_hash(uintptr_t proprietor)
        return hash & (ts_htable_buckets - 1);
 }
 
+static inline struct turnstile_htable_bucket *
+turnstile_get_bucket(uint32_t index, turnstile_type_t type)
+{
+       struct turnstile_htable_bucket *ts_bucket;
+       int hash_policy = turnstile_hash_lock_policy[type];
+
+       if (hash_policy & TURNSTILE_IRQ_UNSAFE_HASH) {
+               ts_bucket = &(turnstile_htable[index]);
+       } else {
+               ts_bucket = &(turnstile_htable_irq_safe[index]);
+       }
+
+       return ts_bucket;
+}
+
+/*
+ * Name: turnstile_hash_bucket_lock
+ *
+ * Description: locks the spinlock associated with proprietor's bucket.
+ *              If proprietor is specified, the index for the hash will be
+ *              recomputed and returned in index_proprietor;
+ *              otherwise the value saved in index_proprietor is used as the index.
+ *
+ * Args:
+ *   Arg1: proprietor (key) for hashing
+ *   Arg2: index for proprietor in the hash
+ *   Arg3: turnstile type
+ *
+ * Returns: the old irq state, if irqs were disabled before acquiring the lock.
+ */
+unsigned
+turnstile_hash_bucket_lock(uintptr_t proprietor, uint32_t *index_proprietor, turnstile_type_t type)
+{
+       struct turnstile_htable_bucket *ts_bucket;
+       int hash_policy = turnstile_hash_lock_policy[type];
+       bool irq_safe = !(hash_policy & TURNSTILE_IRQ_UNSAFE_HASH);
+       spl_t ret = 0;
+       uint32_t index;
+
+       /*
+        * If the proprietor is specified, the caller doesn't know
+        * the index in the hash, so compute it.
+        * Otherwise use the value of index provided.
+        */
+       if (proprietor) {
+               index = turnstile_hash(proprietor);
+               *index_proprietor = index;
+       } else {
+               index = *index_proprietor;
+       }
+
+       ts_bucket = turnstile_get_bucket(index, type);
+
+       if (irq_safe) {
+               ret = splsched();
+       }
+
+       turnstile_bucket_lock(ts_bucket);
+
+       return ret;
+}
+
+/*
+ * Name: turnstile_hash_bucket_unlock
+ *
+ * Description: unlocks the spinlock associated with proprietor's bucket.
+ *              If proprietor is specified, the index for the hash will be
+ *              recomputed and returned in index_proprietor;
+ *              otherwise the value saved in index_proprietor is used as the index.
+ *
+ * Args:
+ *   Arg1: proprietor (key) for hashing
+ *   Arg2: index for proprietor in the hash
+ *   Arg3: turnstile type
+ *   Arg4: irq value returned by turnstile_hash_bucket_lock
+ *
+ */
+void
+turnstile_hash_bucket_unlock(uintptr_t proprietor, uint32_t *index_proprietor, turnstile_type_t type, unsigned s)
+{
+       struct turnstile_htable_bucket *ts_bucket;
+       int hash_policy = turnstile_hash_lock_policy[type];
+       bool irq_safe = !(hash_policy & TURNSTILE_IRQ_UNSAFE_HASH);
+       uint32_t index;
+
+       /*
+        * If the proprietor is specified, the caller doesn't know
+        * the index in the hash, so compute it.
+        * Otherwise use the value of index provided.
+        */
+       if (proprietor) {
+               index = turnstile_hash(proprietor);
+               *index_proprietor = index;
+       } else {
+               index = *index_proprietor;
+       }
+       ts_bucket = turnstile_get_bucket(index, type);
+
+       turnstile_bucket_unlock(ts_bucket);
+       if (irq_safe) {
+               splx(s);
+       }
+}
+
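
A sketch of the lock/unlock contract: pass the proprietor on the first lock to compute and stash the hash index, then pass 0 to reuse the stashed index without rehashing (prop is hypothetical):

    uint32_t index;
    unsigned s = turnstile_hash_bucket_lock((uintptr_t)prop, &index, TURNSTILE_SLEEP_INHERITOR);
    /* inspect or modify the bucket */
    turnstile_hash_bucket_unlock((uintptr_t)prop, &index, TURNSTILE_SLEEP_INHERITOR, s);

    s = turnstile_hash_bucket_lock(0, &index, TURNSTILE_SLEEP_INHERITOR);
    /* ... */
    turnstile_hash_bucket_unlock(0, &index, TURNSTILE_SLEEP_INHERITOR, s);
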
 /*
  * Name: turnstile_htable_lookup_add
  *
@@ -389,6 +527,7 @@ turnstile_hash(uintptr_t proprietor)
  * Args:
  *   Arg1: proprietor
  *   Arg2: new turnstile for primitive
+ *   Arg3: turnstile_type_t type
  *
  * Returns:
  *   Previous turnstile for proprietor in the hash table
@@ -396,15 +535,26 @@ turnstile_hash(uintptr_t proprietor)
 static struct turnstile *
 turnstile_htable_lookup_add(
        uintptr_t proprietor,
-       struct turnstile *new_turnstile)
+       struct turnstile *new_turnstile,
+       turnstile_type_t type)
 {
        uint32_t index = turnstile_hash(proprietor);
        assert(index < ts_htable_buckets);
-       struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]);
+       struct turnstile_htable_bucket *ts_bucket;
+       int hash_policy = turnstile_hash_lock_policy[type];
+       bool needs_lock = !(hash_policy & TURNSTILE_LOCKED_HASH);
+       bool irq_safe = !(hash_policy & TURNSTILE_IRQ_UNSAFE_HASH);
        spl_t s;
 
-       s = splsched();
-       turnstile_bucket_lock(ts_bucket);
+       ts_bucket = turnstile_get_bucket(index, type);
+
+       if (needs_lock) {
+               if (irq_safe) {
+                       s = splsched();
+               }
+               turnstile_bucket_lock(ts_bucket);
+       }
+
        struct turnstile *ts;
 
        SLIST_FOREACH(ts, &ts_bucket->ts_ht_bucket_list, ts_htable_link) {
@@ -413,8 +563,12 @@ turnstile_htable_lookup_add(
                         * Found an entry in the hashtable for this proprietor; add thread turnstile to freelist
                         * and return this turnstile
                         */
-                       turnstile_bucket_unlock(ts_bucket);
-                       splx(s);
+                       if (needs_lock) {
+                               turnstile_bucket_unlock(ts_bucket);
+                               if (irq_safe) {
+                                       splx(s);
+                               }
+                       }
                        turnstile_freelist_insert(ts, new_turnstile);
                        return ts;
                }
@@ -423,8 +577,12 @@ turnstile_htable_lookup_add(
        /* No entry for this proprietor; add the new turnstile in the hash table */
        SLIST_INSERT_HEAD(&ts_bucket->ts_ht_bucket_list, new_turnstile, ts_htable_link);
        turnstile_state_add(new_turnstile, TURNSTILE_STATE_HASHTABLE);
-       turnstile_bucket_unlock(ts_bucket);
-       splx(s);
+       if (needs_lock) {
+               turnstile_bucket_unlock(ts_bucket);
+               if (irq_safe) {
+                       splx(s);
+               }
+       }
        /* Since there was no previous entry for this proprietor, return TURNSTILE_NULL */
        return TURNSTILE_NULL;
 }
@@ -442,6 +600,7 @@ turnstile_htable_lookup_add(
  * Args:
  *   Arg1: proprietor
  *   Arg2: free turnstile to be returned
+ *   Arg3: turnstile_type_t type
  *
  * Returns:
  *   turnstile for this proprietor in the hashtable after the removal
@@ -449,16 +608,27 @@ turnstile_htable_lookup_add(
 static struct turnstile *
 turnstable_htable_lookup_remove(
        uintptr_t proprietor,
-       struct turnstile **free_turnstile)
+       struct turnstile **free_turnstile,
+       turnstile_type_t type)
 {
        uint32_t index = turnstile_hash(proprietor);
        assert(index < ts_htable_buckets);
-       struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]);
+       struct turnstile_htable_bucket *ts_bucket;
        struct turnstile *ret_turnstile = TURNSTILE_NULL;
+       int hash_policy = turnstile_hash_lock_policy[type];
+       bool needs_lock = !(hash_policy & TURNSTILE_LOCKED_HASH);
+       bool irq_safe = !(hash_policy & TURNSTILE_IRQ_UNSAFE_HASH);
        spl_t s;
 
-       s = splsched();
-       turnstile_bucket_lock(ts_bucket);
+       ts_bucket = turnstile_get_bucket(index, type);
+
+       if (needs_lock) {
+               if (irq_safe) {
+                       s = splsched();
+               }
+               turnstile_bucket_lock(ts_bucket);
+       }
+
        struct turnstile *ts, **prev_tslink;
        /* Find the turnstile for the given proprietor in the hashtable */
        SLIST_FOREACH_PREVPTR(ts, prev_tslink, &ts_bucket->ts_ht_bucket_list, ts_htable_link) {
@@ -474,8 +644,12 @@ turnstable_htable_lookup_remove(
                /* No turnstiles on the freelist; remove the turnstile from the hashtable and mark it freed */
                *prev_tslink = SLIST_NEXT(ret_turnstile, ts_htable_link);
                turnstile_state_remove(ret_turnstile, TURNSTILE_STATE_HASHTABLE);
-               turnstile_bucket_unlock(ts_bucket);
-               splx(s);
+               if (needs_lock) {
+                       turnstile_bucket_unlock(ts_bucket);
+                       if (irq_safe) {
+                               splx(s);
+                       }
+               }
                *free_turnstile = ret_turnstile;
                return TURNSTILE_NULL;
        } else {
@@ -483,8 +657,12 @@ turnstable_htable_lookup_remove(
                 * Turnstile has free turnstiles on its list; leave the hashtable unchanged
                 * and return the first turnstile in the freelist as the free turnstile
                 */
-               turnstile_bucket_unlock(ts_bucket);
-               splx(s);
+               if (needs_lock) {
+                       turnstile_bucket_unlock(ts_bucket);
+                       if (irq_safe) {
+                               splx(s);
+                       }
+               }
                *free_turnstile = turnstile_freelist_remove(ret_turnstile);
                return ret_turnstile;
        }
@@ -499,21 +677,39 @@ turnstable_htable_lookup_remove(
  *
  * Args:
  *   Arg1: proprietor
+ *   Arg2: turnstile_type_t type
  *
  * Returns:
  *   Turnstile for proprietor in the hash table
  */
 static struct turnstile *
 turnstile_htable_lookup(
-       uintptr_t proprietor)
+       uintptr_t proprietor,
+       turnstile_type_t type)
 {
        uint32_t index = turnstile_hash(proprietor);
        assert(index < ts_htable_buckets);
-       struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]);
+       bool kdp_ctx = !not_in_kdp;
+       struct turnstile_htable_bucket *ts_bucket = turnstile_get_bucket(index, type);
+       int hash_policy = turnstile_hash_lock_policy[type];
+       bool needs_lock = !(hash_policy & TURNSTILE_LOCKED_HASH);
+       bool irq_safe = !(hash_policy & TURNSTILE_IRQ_UNSAFE_HASH);
        spl_t s;
 
-       s = splsched();
-       turnstile_bucket_lock(ts_bucket);
+       if (needs_lock) {
+               if (irq_safe && !kdp_ctx) {
+                       s = splsched();
+               }
+
+               if (kdp_ctx) {
+                       if (kdp_turnstile_bucket_is_locked(ts_bucket)) {
+                               /* This should move to TURNSTILE_BUSY once 51725781 is in the build */
+                               return TURNSTILE_NULL;
+                       }
+               } else {
+                       turnstile_bucket_lock(ts_bucket);
+               }
+       }
        struct turnstile *ts = TURNSTILE_NULL;
        struct turnstile *ret_turnstile = TURNSTILE_NULL;
 
@@ -525,11 +721,39 @@ turnstile_htable_lookup(
                }
        }
 
-       turnstile_bucket_unlock(ts_bucket);
-       splx(s);
+       if (needs_lock && !kdp_ctx) {
+               turnstile_bucket_unlock(ts_bucket);
+               if (irq_safe) {
+                       splx(s);
+               }
+       }
+
        return ret_turnstile;
 }
 
+/*
+ * Name: turnstile_deallocate_queue_invoke
+ *
+ * Description: invoke function for the asynchronous turnstile deallocation
+ *              queue
+ *
+ * Arg1: &turnstile_deallocate_queue
+ * Arg2: a pointer to the ts_deallocate_link member of a turnstile to
+ *       destroy.
+ *
+ * Returns: None.
+ */
+static void
+turnstile_deallocate_queue_invoke(mpsc_queue_chain_t e,
+    __assert_only mpsc_daemon_queue_t dq)
+{
+       struct turnstile *ts;
+
+       ts = mpsc_queue_element(e, struct turnstile, ts_deallocate_link);
+       assert(dq == &turnstile_deallocate_queue);
+       turnstile_destroy(ts);
+}
+
 /*
  * Name: turnstiles_init
  *
@@ -553,6 +777,9 @@ turnstiles_init(void)
 
        turnstiles_hashtable_init();
 
+       thread_deallocate_daemon_register_queue(&turnstile_deallocate_queue,
+           turnstile_deallocate_queue_invoke);
+
 #if DEVELOPMENT || DEBUG
        /* Initialize the global turnstile locks and lock group */
 
@@ -566,6 +793,8 @@ turnstiles_init(void)
        /* Initialize turnstile test primitive */
        tstile_test_prim_init(&test_prim_ts_inline);
        tstile_test_prim_init(&test_prim_global_htable);
+       tstile_test_prim_init(&test_prim_global_ts_kernel);
+       tstile_test_prim_init(&test_prim_global_ts_kernel_hash);
 #endif
        return;
 }
@@ -620,12 +849,12 @@ turnstile_init(struct turnstile *turnstile)
 
        turnstile->ts_inheritor = TURNSTILE_INHERITOR_NULL;
        SLIST_INIT(&turnstile->ts_free_turnstiles);
-       turnstile->ts_type_gencount = 0;
+       os_atomic_init(&turnstile->ts_type_gencount, 0);
        turnstile_set_type_and_increment_gencount(turnstile, TURNSTILE_NONE);
        turnstile_state_init(turnstile, TURNSTILE_STATE_THREAD);
        os_ref_init_count(&turnstile->ts_refcount, &turnstile_refgrp, 1);
        turnstile->ts_proprietor = TURNSTILE_PROPRIETOR_NULL;
-       turnstile->ts_priority = MAXPRI_THROTTLE;
+       turnstile->ts_priority = 0;
        turnstile->ts_inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE;
        turnstile->ts_port_ref = 0;
        priority_queue_init(&turnstile->ts_inheritor_queue,
@@ -694,8 +923,8 @@ turnstile_deallocate_safe(struct turnstile *turnstile)
        }
 
        if (__improbable(os_ref_release(&turnstile->ts_refcount) == 0)) {
-               /* enqueue the turnstile for thread deallocate deamon to call turnstile_destroy */
-               turnstile_deallocate_enqueue(turnstile);
+               mpsc_daemon_enqueue(&turnstile_deallocate_queue,
+                   &turnstile->ts_deallocate_link, MPSC_QUEUE_DISABLE_PREEMPTION);
        }
 }
 
@@ -772,7 +1001,7 @@ turnstile_prepare(
        thread_turnstile->ts_proprietor = proprietor;
        turnstile_state_remove(thread_turnstile, TURNSTILE_STATE_THREAD);
 
-       thread_turnstile->ts_priority = MAXPRI_THROTTLE;
+       thread_turnstile->ts_priority = 0;
 #if DEVELOPMENT || DEBUG
        thread_turnstile->ts_prev_thread = thread_turnstile->ts_thread;
        thread_turnstile->ts_thread = NULL;
@@ -802,7 +1031,7 @@ turnstile_prepare(
                /*
                 * Lookup the primitive in the turnstile hash table and see if it already has an entry.
                 */
-               ret_turnstile = turnstile_htable_lookup_add(proprietor, thread_turnstile);
+               ret_turnstile = turnstile_htable_lookup_add(proprietor, thread_turnstile, type);
                if (ret_turnstile == NULL) {
                        ret_turnstile = thread_turnstile;
                        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
@@ -820,13 +1049,13 @@ turnstile_prepare(
  * Name: turnstile_complete
  *
  * Description: Transfer the primitive's turnstile, or one from its freelist, to the current thread.
- *              Function is called holding the interlock (spinlock) of the primitive.
  *              Current thread will have a turnstile attached to it after this call.
  *
  * Args:
  *   Arg1: proprietor
  *   Arg2: pointer in primitive struct to update turnstile
  *   Arg3: pointer to store the returned turnstile instead of attaching it to thread
+ *   Arg4: type of primitive
  *
  * Returns:
  *   None.
@@ -835,7 +1064,8 @@ void
 turnstile_complete(
        uintptr_t proprietor,
        struct turnstile **tstore,
-       struct turnstile **out_turnstile)
+       struct turnstile **out_turnstile,
+       turnstile_type_t type)
 {
        thread_t thread = current_thread();
        struct turnstile *primitive_turnstile = TURNSTILE_NULL;
@@ -861,7 +1091,7 @@ turnstile_complete(
                primitive_turnstile = *tstore;
        } else {
                /* Use the global hash to find and remove a turnstile */
-               primitive_turnstile = turnstable_htable_lookup_remove(proprietor, &thread_turnstile);
+               primitive_turnstile = turnstable_htable_lookup_remove(proprietor, &thread_turnstile, type);
        }
        if (primitive_turnstile == NULL) {
                /*
@@ -910,6 +1140,42 @@ turnstile_complete(
        return;
 }
 
+/*
+ * Name: turnstile_kernel_update_inheritor_on_wake_locked
+ *
+ * Description: Set thread as the inheritor of the turnstile and
+ *             boost the inheritor.
+ * Args:
+ *   Arg1: turnstile
+ *   Arg2: new_inheritor
+ *   Arg3: flags
+ *
+ * Called with turnstile locked
+ */
+void
+turnstile_kernel_update_inheritor_on_wake_locked(
+       struct turnstile *turnstile,
+       turnstile_inheritor_t new_inheritor,
+       turnstile_update_flags_t flags __assert_only)
+{
+       /* for now only kernel primitives are allowed to call this function */
+       __assert_only turnstile_promote_policy_t policy =
+           turnstile_promote_policy[turnstile_get_type(turnstile)];
+
+       assert(flags & TURNSTILE_INHERITOR_THREAD);
+       assert(policy == TURNSTILE_KERNEL_PROMOTE || policy == TURNSTILE_USER_PROMOTE);
+
+       turnstile_stash_inheritor((thread_t)new_inheritor, TURNSTILE_INHERITOR_THREAD);
+       /*
+        * new_inheritor has just been removed from the turnstile waitq, so the
+        * turnstile's priority needs to be recomputed; that way, once new_inheritor
+        * becomes this turnstile's inheritor, it can inherit the correct priority.
+        */
+       turnstile_recompute_priority_locked(turnstile);
+       turnstile_update_inheritor_locked(turnstile);
+}
+
 /*
  * Name: turnstile_update_inheritor_locked
  *
@@ -947,116 +1213,124 @@ turnstile_update_inheritor_locked(
        switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) {
        case TURNSTILE_USER_PROMOTE:
        case TURNSTILE_USER_IPC_PROMOTE:
+               break;
+       case TURNSTILE_KERNEL_PROMOTE:
+               /* some sanity checks: kernel turnstiles can push only between threads */
+               if (old_inheritor) {
+                       assert(old_inheritor_flags & TURNSTILE_INHERITOR_THREAD);
+               }
 
-               /* Check if update is needed */
-               if (old_inheritor == new_inheritor && old_inheritor == NULL) {
-                       break;
+               if (new_inheritor) {
+                       assert(new_inheritor_flags & TURNSTILE_INHERITOR_THREAD);
                }
 
-               if (old_inheritor == new_inheritor) {
-                       if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
-                               thread_t thread_inheritor = (thread_t)new_inheritor;
+               break;
+       default:
+               panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile));
+       }
 
-                               assert(old_inheritor_flags & TURNSTILE_INHERITOR_THREAD);
+       /* Check if update is needed */
+       if (old_inheritor == new_inheritor && old_inheritor == NULL) {
+               goto done;
+       }
 
-                               /* adjust turnstile position in the thread's inheritor list */
-                               new_inheritor_needs_update = thread_update_turnstile_promotion(
-                                       thread_inheritor, turnstile);
-                       } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
-                               struct turnstile *inheritor_turnstile = new_inheritor;
+       if (old_inheritor == new_inheritor) {
+               if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
+                       thread_t thread_inheritor = (thread_t)new_inheritor;
 
-                               assert(old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE);
+                       assert(old_inheritor_flags & TURNSTILE_INHERITOR_THREAD);
 
-                               new_inheritor_needs_update = turnstile_update_turnstile_promotion(
-                                       inheritor_turnstile, turnstile);
-                       } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
-                               /*
-                                * When we are still picking "WORKQ" then possible racing
-                                * updates will call redrive through their own propagation
-                                * and we don't need to update anything here.
-                                */
-                               turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
-                                   TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile);
-                       } else {
-                               panic("Inheritor flags lost along the way");
-                       }
+                       /* adjust turnstile position in the thread's inheritor list */
+                       new_inheritor_needs_update = thread_update_turnstile_promotion(
+                               thread_inheritor, turnstile);
+               } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+                       struct turnstile *inheritor_turnstile = new_inheritor;
 
-                       /* Update turnstile stats */
-                       if (!new_inheritor_needs_update) {
-                               turnstile_stats_update(1, TSU_PRI_PROPAGATION |
-                                   TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile);
-                       }
-                       break;
+                       assert(old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE);
+
+                       new_inheritor_needs_update = turnstile_update_turnstile_promotion(
+                               inheritor_turnstile, turnstile);
+               } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
+                       /*
+                        * When we are still picking "WORKQ" then possible racing
+                        * updates will call redrive through their own propagation
+                        * and we don't need to update anything here.
+                        */
+                       turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
+                           TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile);
+               } else {
+                       panic("Inheritor flags lost along the way");
                }
 
-               if (old_inheritor != NULL) {
-                       if (old_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
-                               thread_t thread_inheritor = (thread_t)old_inheritor;
-
-                               /* remove turnstile from thread's inheritor list */
-                               old_inheritor_needs_update = thread_remove_turnstile_promotion(thread_inheritor, turnstile);
-                       } else if (old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
-                               struct turnstile *old_turnstile = old_inheritor;
-
-                               old_inheritor_needs_update = turnstile_remove_turnstile_promotion(
-                                       old_turnstile, turnstile);
-                       } else if (old_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
-                               /*
-                                * We don't need to do anything when the push was WORKQ
-                                * because nothing is pushed on in the first place.
-                                */
-                               turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
-                                   TSU_TURNSTILE_ARG, turnstile);
-                       } else {
-                               panic("Inheritor flags lost along the way");
-                       }
-                       /* Update turnstile stats */
-                       if (!old_inheritor_needs_update) {
-                               turnstile_stats_update(1, TSU_PRI_PROPAGATION | TSU_TURNSTILE_ARG,
-                                   turnstile);
-                       }
+               /* Update turnstile stats */
+               if (!new_inheritor_needs_update) {
+                       turnstile_stats_update(1, TSU_PRI_PROPAGATION |
+                           TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile);
                }
+               goto done;
+       }
 
-               if (new_inheritor != NULL) {
-                       if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
-                               thread_t thread_inheritor = (thread_t)new_inheritor;
-
-                               assert(new_inheritor_flags & TURNSTILE_INHERITOR_THREAD);
-                               /* add turnstile to thread's inheritor list */
-                               new_inheritor_needs_update = thread_add_turnstile_promotion(
-                                       thread_inheritor, turnstile);
-                       } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
-                               struct turnstile *new_turnstile = new_inheritor;
-
-                               new_inheritor_needs_update = turnstile_add_turnstile_promotion(
-                                       new_turnstile, turnstile);
-                       } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
-                               struct workqueue *wq_inheritor = new_inheritor;
-
-                               new_inheritor_needs_update = workq_add_turnstile_promotion(
-                                       wq_inheritor, turnstile);
-                               if (!new_inheritor_needs_update) {
-                                       turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
-                                           TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile);
-                               }
-                       } else {
-                               panic("Inheritor flags lost along the way");
-                       }
-                       /* Update turnstile stats */
-                       if (!new_inheritor_needs_update) {
-                               turnstile_stats_update(1, TSU_PRI_PROPAGATION |
-                                   TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile);
-                       }
+       if (old_inheritor != NULL) {
+               if (old_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
+                       thread_t thread_inheritor = (thread_t)old_inheritor;
+
+                       /* remove turnstile from thread's inheritor list */
+                       old_inheritor_needs_update = thread_remove_turnstile_promotion(thread_inheritor, turnstile);
+               } else if (old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+                       struct turnstile *old_turnstile = old_inheritor;
+
+                       old_inheritor_needs_update = turnstile_remove_turnstile_promotion(
+                               old_turnstile, turnstile);
+               } else if (old_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
+                       /*
+                        * We don't need to do anything when the push was WORKQ
+                        * because nothing is pushed on in the first place.
+                        */
+                       turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
+                           TSU_TURNSTILE_ARG, turnstile);
+               } else {
+                       panic("Inheritor flags lost along the way");
+               }
+               /* Update turnstile stats */
+               if (!old_inheritor_needs_update) {
+                       turnstile_stats_update(1, TSU_PRI_PROPAGATION | TSU_TURNSTILE_ARG,
+                           turnstile);
                }
+       }
 
-               break;
+       if (new_inheritor != NULL) {
+               if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
+                       thread_t thread_inheritor = (thread_t)new_inheritor;
 
-       case TURNSTILE_KERNEL_PROMOTE:
-               break;
-       default:
-               panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile));
+                       assert(new_inheritor_flags & TURNSTILE_INHERITOR_THREAD);
+                       /* add turnstile to thread's inheritor list */
+                       new_inheritor_needs_update = thread_add_turnstile_promotion(
+                               thread_inheritor, turnstile);
+               } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+                       struct turnstile *new_turnstile = new_inheritor;
+
+                       new_inheritor_needs_update = turnstile_add_turnstile_promotion(
+                               new_turnstile, turnstile);
+               } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
+                       struct workqueue *wq_inheritor = new_inheritor;
+
+                       new_inheritor_needs_update = workq_add_turnstile_promotion(
+                               wq_inheritor, turnstile);
+                       if (!new_inheritor_needs_update) {
+                               turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
+                                   TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile);
+                       }
+               } else {
+                       panic("Inheritor flags lost along the way");
+               }
+               /* Update turnstile stats */
+               if (!new_inheritor_needs_update) {
+                       turnstile_stats_update(1, TSU_PRI_PROPAGATION |
+                           TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile);
+               }
        }
 
+done:
        if (old_inheritor_needs_update) {
                old_inheritor_flags |= TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE;
        }
@@ -1077,28 +1351,25 @@ turnstile_update_inheritor_locked(
 }
 
 /*
- * Name: turnstile_update_inheritor
+ * Name: turnstile_stash_inheritor
  *
- * Description: Update the inheritor of the turnstile and boost the
- *              inheritor. It will take a thread reference on the inheritor.
+ * Description: Save the new inheritor reference of the turnstile on the
+ *              current thread. It will take a thread reference on the inheritor.
  *              Called with the interlock of the primitive held.
  *
  * Args:
- *   Arg1: turnstile
- *   Arg2: inheritor
- *   Arg3: flags - TURNSTILE_DELAYED_UPDATE - update will happen later in assert_wait
+ *   Arg1: inheritor
+ *   Arg2: flags
  *
  * Returns:
  *   old inheritor reference is stashed on current thread's struct.
  */
-void
-turnstile_update_inheritor(
-       struct turnstile *turnstile,
+static void
+turnstile_stash_inheritor(
        turnstile_inheritor_t new_inheritor,
        turnstile_update_flags_t flags)
 {
        thread_t thread = current_thread();
-       spl_t spl;
 
        /*
         * Set the inheritor on calling thread struct, no need
@@ -1123,6 +1394,32 @@ turnstile_update_inheritor(
                panic("Missing type in flags (%x) for inheritor (%p)", flags,
                    new_inheritor);
        }
+}
+
+/*
+ * Name: turnstile_update_inheritor
+ *
+ * Description: Update the inheritor of the turnstile and boost the
+ *              inheritor. It will take a thread reference on the inheritor.
+ *              Called with the interlock of the primitive held.
+ *
+ * Args:
+ *   Arg1: turnstile
+ *   Arg2: inheritor
+ *   Arg3: flags - TURNSTILE_DELAYED_UPDATE - update will happen later in assert_wait
+ *
+ * Returns:
+ *   old inheritor reference is stashed on current thread's struct.
+ */
+void
+turnstile_update_inheritor(
+       struct turnstile *turnstile,
+       turnstile_inheritor_t new_inheritor,
+       turnstile_update_flags_t flags)
+{
+       spl_t spl;
+
+       turnstile_stash_inheritor(new_inheritor, flags);
 
        /* Do not perform the update if delayed update is specified */
        if (flags & TURNSTILE_DELAYED_UPDATE) {
@@ -1157,7 +1454,7 @@ turnstile_update_inheritor(
  */
 static boolean_t
 turnstile_need_thread_promotion_update(
-       struct turnstile *dst_turnstile __assert_only,
+       struct turnstile *dst_turnstile,
        thread_t thread)
 {
        int thread_link_priority;
@@ -1166,7 +1463,10 @@ turnstile_need_thread_promotion_update(
        thread_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_waitq.waitq_prio_queue),
            &(thread->wait_prioq_links));
 
-       needs_update = (thread_link_priority == thread->base_pri) ? FALSE : TRUE;
+       int priority = turnstile_compute_thread_push(dst_turnstile, thread);
+
+       needs_update = (thread_link_priority == priority) ? FALSE : TRUE;
+
        return needs_update;
 }
 
@@ -1221,21 +1521,25 @@ turnstile_update_thread_promotion_locked(
        struct turnstile *dst_turnstile,
        thread_t thread)
 {
-       int thread_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_waitq.waitq_prio_queue),
+       int thread_link_priority;
+
+       int priority = turnstile_compute_thread_push(dst_turnstile, thread);
+
+       thread_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_waitq.waitq_prio_queue),
            &(thread->wait_prioq_links));
 
-       if (thread->base_pri != thread_link_priority) {
+       if (priority != thread_link_priority) {
                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                    (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_MOVED_IN_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
                    VM_KERNEL_UNSLIDE_OR_PERM(dst_turnstile),
                    thread_tid(thread),
-                   thread->base_pri,
+                   priority,
                    thread_link_priority, 0);
        }
 
        if (!turnstile_priority_queue_update_entry_key(
                    &dst_turnstile->ts_waitq.waitq_prio_queue,
-                   &thread->wait_prioq_links, thread->base_pri)) {
+                   &thread->wait_prioq_links, priority)) {
                return FALSE;
        }
 
@@ -1243,7 +1547,6 @@ turnstile_update_thread_promotion_locked(
        return turnstile_recompute_priority_locked(dst_turnstile);
 }
 
-
 /*
  * Name: thread_add_turnstile_promotion
  *
@@ -1273,12 +1576,30 @@ thread_add_turnstile_promotion(
            VM_KERNEL_UNSLIDE_OR_PERM(turnstile),
            turnstile->ts_priority, 0, 0);
 
-       priority_queue_entry_init(&(turnstile->ts_inheritor_links));
-       if (priority_queue_insert(&thread->inheritor_queue,
-           &turnstile->ts_inheritor_links, turnstile->ts_priority,
-           PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
-               /* Update thread priority */
-               needs_update = thread_recompute_user_promotion_locked(thread);
+       priority_queue_entry_init(&turnstile->ts_inheritor_links);
+
+       switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) {
+       case TURNSTILE_USER_PROMOTE:
+       case TURNSTILE_USER_IPC_PROMOTE:
+
+               if (priority_queue_insert(&(thread->base_inheritor_queue),
+                   &turnstile->ts_inheritor_links, turnstile->ts_priority,
+                   PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+                       needs_update = thread_recompute_user_promotion_locked(thread);
+               }
+
+               break;
+       case TURNSTILE_KERNEL_PROMOTE:
+
+               if (priority_queue_insert(&(thread->sched_inheritor_queue),
+                   &turnstile->ts_inheritor_links, turnstile->ts_priority,
+                   PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+                       needs_update = thread_recompute_kernel_promotion_locked(thread);
+               }
+
+               break;
+       default:
+               panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile));
        }
 
        /* Update turnstile stats */
@@ -1290,10 +1611,10 @@ thread_add_turnstile_promotion(
        }
 
        thread_unlock(thread);
+
        return needs_update;
 }
 
-
 /*
  * Name: thread_remove_turnstile_promotion
  *
@@ -1314,7 +1635,6 @@ thread_remove_turnstile_promotion(
 {
        boolean_t needs_update = FALSE;
 
-       /* Update the pairing heap */
        thread_lock(thread);
 
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
@@ -1323,11 +1643,26 @@ thread_remove_turnstile_promotion(
            VM_KERNEL_UNSLIDE_OR_PERM(turnstile),
            0, 0, 0);
 
-       if (priority_queue_remove(&thread->inheritor_queue,
-           &turnstile->ts_inheritor_links,
-           PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
-               /* Update thread priority */
-               needs_update = thread_recompute_user_promotion_locked(thread);
+       /* Update the pairing heap */
+
+       switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) {
+       case TURNSTILE_USER_PROMOTE:
+       case TURNSTILE_USER_IPC_PROMOTE:
+               if (priority_queue_remove(&(thread->base_inheritor_queue),
+                   &turnstile->ts_inheritor_links,
+                   PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+                       needs_update = thread_recompute_user_promotion_locked(thread);
+               }
+               break;
+       case TURNSTILE_KERNEL_PROMOTE:
+               if (priority_queue_remove(&(thread->sched_inheritor_queue),
+                   &turnstile->ts_inheritor_links,
+                   PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+                       needs_update = thread_recompute_kernel_promotion_locked(thread);
+               }
+               break;
+       default:
+               panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile));
        }
 
        /* Update turnstile stats */
@@ -1338,6 +1673,7 @@ thread_remove_turnstile_promotion(
        }
 
        thread_unlock(thread);
+
        return needs_update;
 }
 
@@ -1360,11 +1696,21 @@ thread_needs_turnstile_promotion_update(
        struct turnstile *turnstile)
 {
        boolean_t needs_update = FALSE;
-       int turnstile_link_priority;
+       int turnstile_link_priority = 0;
 
-       /* Update the pairing heap */
-       turnstile_link_priority = priority_queue_entry_key(&(thread->inheritor_queue),
-           &(turnstile->ts_inheritor_links));
+       switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) {
+       case TURNSTILE_USER_PROMOTE:
+       case TURNSTILE_USER_IPC_PROMOTE:
+               turnstile_link_priority = priority_queue_entry_key(&(thread->base_inheritor_queue),
+                   &(turnstile->ts_inheritor_links));
+               break;
+       case TURNSTILE_KERNEL_PROMOTE:
+               turnstile_link_priority = priority_queue_entry_key(&(thread->sched_inheritor_queue),
+                   &(turnstile->ts_inheritor_links));
+               break;
+       default:
+               panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile));
+       }
 
        needs_update = (turnstile_link_priority == turnstile->ts_priority) ? FALSE : TRUE;
        return needs_update;
@@ -1388,8 +1734,30 @@ thread_update_turnstile_promotion_locked(
        thread_t thread,
        struct turnstile *turnstile)
 {
-       int turnstile_link_priority = priority_queue_entry_key(&(thread->inheritor_queue),
-           &(turnstile->ts_inheritor_links));
+       boolean_t needs_update = FALSE;
+       int turnstile_link_priority = 0;
+
+       switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) {
+       case TURNSTILE_USER_PROMOTE:
+       case TURNSTILE_USER_IPC_PROMOTE:
+               turnstile_link_priority = priority_queue_entry_key(&(thread->base_inheritor_queue), &turnstile->ts_inheritor_links);
+
+               if (turnstile_priority_queue_update_entry_key(&(thread->base_inheritor_queue),
+                   &turnstile->ts_inheritor_links, turnstile->ts_priority)) {
+                       needs_update = thread_recompute_user_promotion_locked(thread);
+               }
+               break;
+       case TURNSTILE_KERNEL_PROMOTE:
+               turnstile_link_priority = priority_queue_entry_key(&(thread->sched_inheritor_queue), &turnstile->ts_inheritor_links);
+
+               if (turnstile_priority_queue_update_entry_key(&(thread->sched_inheritor_queue),
+                   &turnstile->ts_inheritor_links, turnstile->ts_priority)) {
+                       needs_update = thread_recompute_kernel_promotion_locked(thread);
+               }
+               break;
+       default:
+               panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile));
+       }
 
        if (turnstile->ts_priority != turnstile_link_priority) {
                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
@@ -1400,13 +1768,7 @@ thread_update_turnstile_promotion_locked(
                    turnstile_link_priority, 0);
        }
 
-       if (!turnstile_priority_queue_update_entry_key(&thread->inheritor_queue,
-           &turnstile->ts_inheritor_links, turnstile->ts_priority)) {
-               return FALSE;
-       }
-
-       /* Update thread priority */
-       return thread_recompute_user_promotion_locked(thread);
+       return needs_update;
 }
 
 
@@ -1437,8 +1799,9 @@ thread_update_turnstile_promotion(
                return needs_update;
        }
 
-       /* Update the pairing heap */
        thread_lock(thread);
+
+       /* Update the pairing heap */
        needs_update = thread_update_turnstile_promotion_locked(thread, turnstile);
 
        /* Update turnstile stats */
@@ -1448,36 +1811,65 @@ thread_update_turnstile_promotion(
                    TSU_TURNSTILE_ARG | TSU_BOOST_ARG,
                    turnstile);
        }
+
        thread_unlock(thread);
+
        return needs_update;
 }
 
 
 /*
- * Name: thread_get_inheritor_turnstile_priority
+ * Name: thread_get_inheritor_turnstile_sched_priority
+ *
+ * Description: Get the max sched priority of all the inheritor turnstiles
+ *
+ * Arg1: thread
+ *
+ * Returns: Max sched priority of all the inheritor turnstiles.
+ *
+ * Condition: thread locked
+ */
+int
+thread_get_inheritor_turnstile_sched_priority(thread_t thread)
+{
+       struct turnstile *max_turnstile;
+
+       max_turnstile = priority_queue_max(&thread->sched_inheritor_queue,
+           struct turnstile, ts_inheritor_links);
+
+       if (max_turnstile) {
+               return priority_queue_entry_key(&thread->sched_inheritor_queue,
+                          &max_turnstile->ts_inheritor_links);
+       }
+
+       return 0;
+}
+
+/*
+ * Name: thread_get_inheritor_turnstile_base_priority
  *
- * Description: Get the max priority of all the inheritor turnstiles
+ * Description: Get the max base priority of all the inheritor turnstiles
  *
  * Arg1: thread
  *
- * Returns: Max priority of all the inheritor turnstiles.
+ * Returns: Max base priority of all the inheritor turnstiles.
  *
  * Condition: thread locked
  */
 int
-thread_get_inheritor_turnstile_priority(thread_t thread)
+thread_get_inheritor_turnstile_base_priority(thread_t thread)
 {
        struct turnstile *max_turnstile;
 
-       max_turnstile = priority_queue_max(&thread->inheritor_queue,
+       max_turnstile = priority_queue_max(&thread->base_inheritor_queue,
            struct turnstile, ts_inheritor_links);
 
        if (max_turnstile) {
-               return priority_queue_entry_key(&thread->inheritor_queue,
+               return priority_queue_entry_key(&thread->base_inheritor_queue,
                           &max_turnstile->ts_inheritor_links);
        }
 
-       return MAXPRI_THROTTLE;
+       return 0;
 }
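
Both getters now return 0, rather than MAXPRI_THROTTLE, when no turnstile is pushing, which lets callers fold the two queues together with a plain MAX(). A minimal sketch (thread locked; only the two getters come from this diff):

    /* 0 means "no push" on either inheritor queue */
    int user_push   = thread_get_inheritor_turnstile_base_priority(thread);
    int kernel_push = thread_get_inheritor_turnstile_sched_priority(thread);
    int push_floor  = MAX(user_push, kernel_push);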
 
 
@@ -1516,7 +1908,6 @@ thread_get_waiting_turnstile(thread_t thread)
        return turnstile;
 }
 
-
 /*
  * Name: turnstile_lookup_by_proprietor
  *
@@ -1524,6 +1915,7 @@ thread_get_waiting_turnstile(thread_t thread)
  *              turnstile hash.
  *
  * Arg1: port
+ * Arg2: turnstile_type_t type
  *
  * Returns: turnstile: if the proprietor has a turnstile.
  *          TURNSTILE_NULL: otherwise.
@@ -1531,12 +1923,11 @@ thread_get_waiting_turnstile(thread_t thread)
  * Condition: proprietor interlock held.
  */
 struct turnstile *
-turnstile_lookup_by_proprietor(uintptr_t proprietor)
+turnstile_lookup_by_proprietor(uintptr_t proprietor, turnstile_type_t type)
 {
-       return turnstile_htable_lookup(proprietor);
+       return turnstile_htable_lookup(proprietor, type);
 }
 
-
 /*
  * Name: thread_get_update_flags_for_turnstile_propagation_stoppage
  *
@@ -1831,6 +2222,88 @@ turnstile_remove_turnstile_promotion(
        return needs_update;
 }
 
+/*
+ * Name: turnstile_compute_thread_push
+ *
+ * Description: Compute the priority at which the thread will push
+ *       on the turnstile.
+ *
+ * Arg1: turnstile
+ * Arg2: thread
+ *
+ * Condition: wq locked
+ */
+static int
+turnstile_compute_thread_push(
+       struct turnstile *turnstile,
+       thread_t thread)
+{
+       int priority = 0;
+       switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) {
+       case TURNSTILE_USER_PROMOTE:
+       case TURNSTILE_USER_IPC_PROMOTE:
+               priority = thread->base_pri;
+               break;
+       case TURNSTILE_KERNEL_PROMOTE:
+               /*
+                * Ideally this should be policy based
+                * according to the turnstile type.
+                *
+                * The priority with which each thread pushes on
+                * a primitive should be primitive dependent.
+                */
+               priority = thread->sched_pri;
+               priority = MAX(priority, thread->base_pri);
+               priority = MAX(priority, BASEPRI_DEFAULT);
+               priority = MIN(priority, MAXPRI_PROMOTE);
+               break;
+       default:
+               panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile));
+       }
+
+       return priority;
+}
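
For TURNSTILE_KERNEL_PROMOTE the push is max(sched_pri, base_pri) clamped into the [BASEPRI_DEFAULT, MAXPRI_PROMOTE] band. A worked example, assuming the usual XNU values BASEPRI_DEFAULT == 31 and MAXPRI_PROMOTE == 95:

    /* a thread depressed to sched_pri 4 with base_pri 20 still
     * pushes at the kernel floor */
    int push = MAX(4, 20);    /* 20                              */
    push = MAX(push, 31);     /* 31: floored at BASEPRI_DEFAULT  */
    push = MIN(push, 95);     /* 31: capped at MAXPRI_PROMOTE    */

    /* a thread already promoted to sched_pri 80 would push at 80 */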
+
+/*
+ * Name: turnstile_waitq_add_thread_priority_queue
+ *
+ * Description: add thread to the turnstile wq
+ *
+ * Arg1: turnstile wq
+ * Arg2: thread to add
+ *
+ * Condition: wq locked
+ */
+void
+turnstile_waitq_add_thread_priority_queue(
+       struct waitq *wq,
+       thread_t thread)
+{
+       struct turnstile *turnstile = waitq_to_turnstile(wq);
+       int priority = turnstile_compute_thread_push(turnstile, thread);
+
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+           (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_ADDED_TO_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
+           VM_KERNEL_UNSLIDE_OR_PERM(turnstile),
+           thread_tid(thread),
+           priority, 0, 0);
+       /*
+        * For turnstile queues (which use priority queues),
+        * insert the thread in the heap based on its priority.
+        * Note that the priority queue implementation
+        * is currently not stable, so does not maintain fifo for
+        * threads at the same pri. Also, if the pri
+        * of the thread changes while its blocked in the waitq,
+        * the thread position should be updated in the priority
+        * queue by calling priority queue increase/decrease
+        * operations.
+        */
+       priority_queue_entry_init(&(thread->wait_prioq_links));
+       priority_queue_insert(&wq->waitq_prio_queue,
+           &thread->wait_prioq_links, priority,
+           PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+}
+
 /*
  * Name: turnstile_recompute_priority_locked
  *
@@ -1854,12 +2327,13 @@ turnstile_recompute_priority_locked(
        boolean_t needs_priority_update = FALSE;
        thread_t max_thread = THREAD_NULL;
        struct turnstile *max_turnstile;
-       int thread_max_pri = MAXPRI_THROTTLE;
-       int turnstile_max_pri = MAXPRI_THROTTLE;
+       int thread_max_pri = 0;
+       int turnstile_max_pri = 0;
 
        switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) {
        case TURNSTILE_USER_PROMOTE:
        case TURNSTILE_USER_IPC_PROMOTE:
+       case TURNSTILE_KERNEL_PROMOTE:
 
                old_priority = turnstile->ts_priority;
 
@@ -1875,6 +2349,7 @@ turnstile_recompute_priority_locked(
                    struct turnstile, ts_inheritor_links);
 
                if (max_turnstile) {
+                       assert(turnstile_promote_policy[turnstile_get_type(turnstile)] != TURNSTILE_KERNEL_PROMOTE);
                        turnstile_max_pri = priority_queue_entry_key(&turnstile->ts_inheritor_queue,
                            &max_turnstile->ts_inheritor_links);
                }
@@ -1896,8 +2371,6 @@ turnstile_recompute_priority_locked(
                break;
 
        case TURNSTILE_PROMOTE_NONE:
-       case TURNSTILE_KERNEL_PROMOTE:
-
                /* The turnstile was repurposed, do nothing */
                break;
 
@@ -1990,6 +2463,139 @@ turnstile_workq_proprietor_of_max_turnstile(
        return max_priority;
 }
 
+/*
+ * Name: turnstile_workloop_pusher_info
+ *
+ * Description: Returns the priority of the turnstile push for a workloop,
+ *              and the thread, port, or knote responsible for this push.
+ *
+ * Args: workloop turnstile
+ *
+ * Returns:
+ *    Priority of the push or 0
+ *    Thread (with a +1 reference) with that push or THREAD_NULL.
+ *    Port (with a +1 reference) with that push, or IP_NULL.
+ *    Sync IPC knote with the highest push (or NULL)
+ */
+int
+turnstile_workloop_pusher_info(
+       struct turnstile *turnstile,
+       thread_t *thread_out,
+       ipc_port_t *port_out,
+       struct knote **knote_out)
+{
+       struct turnstile *max_ts;
+       thread_t max_thread;
+       int max_thread_pri = 0;
+       int max_ts_pri = 0;
+       ipc_port_t port;
+
+       assert(turnstile_get_type(turnstile) == TURNSTILE_WORKLOOPS);
+
+       spl_t s = splsched();
+       waitq_lock(&turnstile->ts_waitq);
+
+       max_thread = priority_queue_max(&turnstile->ts_waitq.waitq_prio_queue,
+           struct thread, wait_prioq_links);
+       if (max_thread) {
+               max_thread_pri = priority_queue_entry_key(
+                       &turnstile->ts_waitq.waitq_prio_queue,
+                       &max_thread->wait_prioq_links);
+       }
+
+       max_ts = priority_queue_max(&turnstile->ts_inheritor_queue,
+           struct turnstile, ts_inheritor_links);
+       if (max_ts) {
+               max_ts_pri = priority_queue_entry_key(&turnstile->ts_inheritor_queue,
+                   &max_ts->ts_inheritor_links);
+       }
+
+       /*
+        * Reasons to push on a workloop turnstile are:
+        *
+        * 1. threads in dispatch sync
+        *
+        * 2. sync IPC pushes, which in turn have 4 sub-cases:
+        *
+        *   2.a. special reply port or receive right pushing through a knote
+        *        turnstile,
+        *
+        *   2.b. special reply port stashed on a knote, pushing on the workloop
+        *        directly,
+        *
+        *   2.c. receive right stashed on a knote, pushing on the workloop
+        *        directly,
+        *
+        *   2.d. a receive right monitored by a knote, pushing on the workloop
+        *        directly.
+        *
+        * See ipc_port_send_update_inheritor(), ipc_port_recv_update_inheritor().
+        *
+        * Note: dereferencing the knote in the caller is safe provided this
+        * function is called under the proper interlocks (the filt_wllock + req
+        * lock) which serializes with the knote going away.
+        */
+       if (max_thread_pri > max_ts_pri) {
+               thread_reference(max_thread);
+               *thread_out = max_thread;
+               *port_out = NULL;
+               *knote_out = NULL;
+       } else if (max_ts_pri) {
+               switch (turnstile_get_type(max_ts)) {
+               case TURNSTILE_KNOTE:
+                       /* 2.a. */
+                       *thread_out = THREAD_NULL;
+                       *port_out = IP_NULL;
+                       *knote_out = (struct knote *)max_ts->ts_proprietor;
+                       break;
+
+               case TURNSTILE_SYNC_IPC:
+                       /* 2.[bcd] */
+                       port = (ipc_port_t)max_ts->ts_proprietor;
+                       ip_reference(port);
+                       *thread_out = THREAD_NULL;
+                       *port_out = port;
+                       *knote_out = NULL;
+                       break;
+
+               default:
+                       panic("Unexpected type for turnstile %p", max_ts);
+               }
+       } else {
+               *thread_out = THREAD_NULL;
+               *port_out = IP_NULL;
+               *knote_out = NULL;
+       }
+
+       waitq_unlock(&turnstile->ts_waitq);
+       splx(s);
+
+       return max(max_thread_pri, max_ts_pri);
+}
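
At most one of the three out-parameters comes back populated, and the caller owns the +1 references. A minimal consumption sketch, where wl_ts is an assumed workloop turnstile:

    thread_t th;
    ipc_port_t port;
    struct knote *kn;

    int pri = turnstile_workloop_pusher_info(wl_ts, &th, &port, &kn);
    /* pri is the highest push, or 0 when nothing pushes */

    if (th != THREAD_NULL) {
            thread_deallocate(th);  /* case 1: drop the +1 thread ref */
    } else if (port != IP_NULL) {
            ip_release(port);       /* case 2.[bcd]: drop the +1 port ref */
    } else if (kn != NULL) {
            /* case 2.a: only dereference under filt_wllock + req lock */
    }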
+
+/*
+ * Name: turnstile_has_waiters
+ *
+ * Description: returns whether there are waiters on the turnstile
+ *
+ * Arg1: turnstile: turnstile
+ *
+ * Returns: TRUE if there are waiters, FALSE otherwise.
+ */
+
+boolean_t
+turnstile_has_waiters(struct turnstile *turnstile)
+{
+       boolean_t ret;
+
+       spl_t s = splsched();
+       waitq_lock(&turnstile->ts_waitq);
+       ret = !priority_queue_empty(&turnstile->ts_waitq.waitq_prio_queue);
+       waitq_unlock(&turnstile->ts_waitq);
+       splx(s);
+
+       return ret;
+}
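
Because the waitq lock is taken and dropped inside the call, the answer is only a hint by the time the caller acts on it; it suits fast-path checks, not synchronization. A sketch of that use, with ts an assumed prepared turnstile:

    if (!turnstile_has_waiters(ts)) {
            return;   /* uncontended: skip the wakeup machinery */
    }
    /* contended: take the primitive's slow wake path */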
 
 /*
  * Name: turnstile_update_inheritor_priority_chain
@@ -2023,8 +2629,8 @@ turnstile_update_inheritor_priority_chain(
        if (turnstile_flags & TURNSTILE_INHERITOR_THREAD) {
                thread = inheritor;
                thread_lock(thread);
-               //TODO: Need to call sched promotion for kernel mutex.
                thread_recompute_user_promotion_locked(thread);
+               thread_recompute_kernel_promotion_locked(thread);
        } else if (turnstile_flags & TURNSTILE_INHERITOR_TURNSTILE) {
                turnstile = inheritor;
                waitq_lock(&turnstile->ts_waitq);
@@ -2151,6 +2757,23 @@ turnstile_cleanup(void)
        }
 }
 
+/*
+ * Name: turnstile_update_thread_priority_chain
+ *
+ * Description: The priority of a thread blocked on a turnstile
+ *              has changed; update the turnstile priority.
+ *
+ * Arg1: thread: thread whose priority has changed.
+ *
+ * Returns: None.
+ */
+void
+turnstile_update_thread_priority_chain(thread_t thread)
+{
+       turnstile_update_inheritor_priority_chain(thread,
+           TURNSTILE_INHERITOR_THREAD | TURNSTILE_UPDATE_BOOST);
+}
+
 /*
  * Name: turnstile_update_inheritor_workq_priority_chain
  *
@@ -2177,6 +2800,7 @@ turnstile_update_inheritor_workq_priority_chain(struct turnstile *turnstile, spl
 
        if (!workq_lock_held) {
                workq_reference(wq);
+               disable_preemption();
        }
        waitq_unlock(&turnstile->ts_waitq);
        splx(s);
@@ -2184,6 +2808,7 @@ turnstile_update_inheritor_workq_priority_chain(struct turnstile *turnstile, spl
        workq_schedule_creator_turnstile_redrive(wq, workq_lock_held);
 
        if (!workq_lock_held) {
+               enable_preemption();
                workq_deallocate_safe(wq);
        }
 }
@@ -2478,14 +3103,14 @@ turnstile_stats_update(
        /*
         * Check if turnstile stats needs to be updated.
         * Bail out if the turnstile or thread does not
-        * have any user promotion, i.e. pri 4.
+        * have any user promotion.
         * Bail out if it is the first hop of WQ turnstile
         * since WQ's use of a turnstile for the admission check
         * introduces a lot of noise due to state changes.
         */
        if (flags & TSU_TURNSTILE_ARG) {
                struct turnstile *ts = (struct turnstile *)inheritor;
-               if (ts->ts_priority <= MAXPRI_THROTTLE) {
+               if (ts->ts_priority == 0) {
                        return;
                }
 
@@ -2494,7 +3119,7 @@ turnstile_stats_update(
                }
        } else if (flags & TSU_THREAD_ARG) {
                thread_t thread = (thread_t)inheritor;
-               if (thread->user_promotion_basepri <= MAXPRI_THROTTLE) {
+               if (thread->user_promotion_basepri == 0) {
                        return;
                }
        } else {
@@ -2534,6 +3159,60 @@ turnstile_stats_update(
 #endif
 }
 
+static uint64_t
+kdp_turnstile_traverse_inheritor_chain(struct turnstile *ts, uint64_t *flags, uint8_t *hops)
+{
+       if (waitq_held(&ts->ts_waitq)) {
+               *flags |= STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ;
+               return 0;
+       }
+
+       *hops = *hops + 1;
+
+       if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+               return kdp_turnstile_traverse_inheritor_chain(ts->ts_inheritor, flags, hops);
+       }
+
+       if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
+               *flags |= STACKSHOT_TURNSTILE_STATUS_THREAD;
+               return (uint64_t) thread_tid(ts->ts_inheritor);
+       }
+
+       if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
+               *flags |= STACKSHOT_TURNSTILE_STATUS_WORKQUEUE;
+               return VM_KERNEL_UNSLIDE_OR_PERM(ts->ts_inheritor);
+       }
+
+       *flags |= STACKSHOT_TURNSTILE_STATUS_UNKNOWN;
+       return 0;
+}
+
+void
+kdp_turnstile_fill_tsinfo(struct turnstile *ts, thread_turnstileinfo_t *tsinfo)
+{
+       uint64_t final_inheritor;
+       uint64_t flags = 0;
+       uint8_t hops = 0;
+
+       tsinfo->turnstile_context  = 0;
+       tsinfo->number_of_hops     = 0;
+       tsinfo->turnstile_priority = 0;
+
+       assert(ts != TURNSTILE_NULL);
+
+       if (waitq_held(&ts->ts_waitq)) {
+               tsinfo->turnstile_flags |= STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ;
+               return;
+       }
+
+       final_inheritor = kdp_turnstile_traverse_inheritor_chain(ts, &flags, &hops);
+
+       /* store some metadata about the turnstile itself */
+       tsinfo->turnstile_flags = flags;
+       tsinfo->number_of_hops = hops;
+       tsinfo->turnstile_priority = ts->ts_priority;
+       tsinfo->turnstile_context = final_inheritor;
+}
 
 #if DEVELOPMENT || DEBUG
 
@@ -2592,10 +3271,45 @@ tstile_test_prim_init(struct tstile_test_prim **test_prim_ptr)
 }
 
 int
-tstile_test_prim_lock(boolean_t use_hashtable)
+tstile_test_prim_lock(int val)
 {
-       struct tstile_test_prim *test_prim = use_hashtable ? test_prim_global_htable : test_prim_ts_inline;
+       struct tstile_test_prim *test_prim;
+       boolean_t use_hashtable;
+       turnstile_type_t type;
+       wait_interrupt_t wait_type;
+
+       switch (val) {
+       case SYSCTL_TURNSTILE_TEST_USER_DEFAULT:
+               test_prim = test_prim_ts_inline;
+               use_hashtable = FALSE;
+               wait_type = THREAD_ABORTSAFE;
+               type = TURNSTILE_ULOCK;
+               break;
+       case SYSCTL_TURNSTILE_TEST_USER_HASHTABLE:
+               test_prim = test_prim_global_htable;
+               use_hashtable = TRUE;
+               wait_type = THREAD_ABORTSAFE;
+               type = TURNSTILE_ULOCK;
+               break;
+       case SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT:
+               test_prim = test_prim_global_ts_kernel;
+               use_hashtable = FALSE;
+               wait_type = THREAD_UNINT | THREAD_WAIT_NOREPORT_USER;
+               type = TURNSTILE_KERNEL_MUTEX;
+               break;
+       case SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE:
+               test_prim = test_prim_global_ts_kernel_hash;
+               use_hashtable = TRUE;
+               wait_type = THREAD_UNINT | THREAD_WAIT_NOREPORT_USER;
+               type = TURNSTILE_KERNEL_MUTEX;
+               break;
+
+       default:
+               return -1;
+       }
+
 lock_start:
+
        /* take the interlock of the primitive */
        tstile_test_prim_lock_interlock(test_prim);
 
@@ -2612,7 +3326,7 @@ lock_start:
        /* primitive locked, get a turnstile */
        prim_turnstile = turnstile_prepare((uintptr_t)test_prim,
            use_hashtable ? NULL : &test_prim->ttprim_turnstile,
-           TURNSTILE_NULL, TURNSTILE_ULOCK);
+           TURNSTILE_NULL, type);
 
        assert(prim_turnstile != TURNSTILE_NULL);
 
@@ -2629,12 +3343,11 @@ lock_start:
                turnstile_update_inheritor_complete(prim_turnstile, TURNSTILE_INTERLOCK_HELD);
 
                turnstile_complete((uintptr_t)test_prim,
-                   use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL);
+                   use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL, type);
 
                tstile_test_prim_unlock_interlock(test_prim);
 
                turnstile_cleanup();
-
                return 0;
        }
 
@@ -2644,7 +3357,7 @@ lock_start:
            (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
 
        waitq_assert_wait64(&prim_turnstile->ts_waitq,
-           CAST_EVENT64_T(test_prim), THREAD_ABORTSAFE,
+           CAST_EVENT64_T(test_prim), wait_type,
            TIMEOUT_WAIT_FOREVER);
 
        /* drop the interlock */
@@ -2659,7 +3372,7 @@ lock_start:
        tstile_test_prim_lock_interlock(test_prim);
        test_prim->tt_prim_waiters--;
        turnstile_complete((uintptr_t)test_prim,
-           use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL);
+           use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL, type);
 
        tstile_test_prim_unlock_interlock(test_prim);
 
@@ -2674,9 +3387,37 @@ lock_start:
 }
 
 int
-tstile_test_prim_unlock(boolean_t use_hashtable)
+tstile_test_prim_unlock(int val)
 {
-       struct tstile_test_prim *test_prim = use_hashtable ? test_prim_global_htable : test_prim_ts_inline;
+       struct tstile_test_prim *test_prim;
+       boolean_t use_hashtable;
+       turnstile_type_t type;
+
+       switch (val) {
+       case SYSCTL_TURNSTILE_TEST_USER_DEFAULT:
+               test_prim = test_prim_ts_inline;
+               use_hashtable = FALSE;
+               type = TURNSTILE_ULOCK;
+               break;
+       case SYSCTL_TURNSTILE_TEST_USER_HASHTABLE:
+               test_prim = test_prim_global_htable;
+               use_hashtable = TRUE;
+               type = TURNSTILE_ULOCK;
+               break;
+       case SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT:
+               test_prim = test_prim_global_ts_kernel;
+               use_hashtable = FALSE;
+               type = TURNSTILE_KERNEL_MUTEX;
+               break;
+       case SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE:
+               test_prim = test_prim_global_ts_kernel_hash;
+               use_hashtable = TRUE;
+               type = TURNSTILE_KERNEL_MUTEX;
+               break;
+       default:
+               return -1;
+       }
+
        /* take the interlock of the primitive */
        tstile_test_prim_lock_interlock(test_prim);
 
@@ -2704,7 +3445,7 @@ tstile_test_prim_unlock(boolean_t use_hashtable)
        /* primitive locked, get a turnstile */
        prim_turnstile = turnstile_prepare((uintptr_t)test_prim,
            use_hashtable ? NULL : &test_prim->ttprim_turnstile,
-           TURNSTILE_NULL, TURNSTILE_ULOCK);
+           TURNSTILE_NULL, type);
 
        assert(prim_turnstile != TURNSTILE_NULL);
 
@@ -2715,12 +3456,12 @@ tstile_test_prim_unlock(boolean_t use_hashtable)
 
        waitq_wakeup64_one(&prim_turnstile->ts_waitq,
            CAST_EVENT64_T(test_prim),
-           THREAD_AWAKENED, WAITQ_SELECT_MAX_PRI);
+           THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
 
        turnstile_update_inheritor_complete(prim_turnstile, TURNSTILE_INTERLOCK_HELD);
 
        turnstile_complete((uintptr_t)test_prim,
-           use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL);
+           use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL, type);
 
        tstile_test_prim_unlock_interlock(test_prim);
 
index b6749758146fcfde3d47a31d14b0c271e9244922..6050fa3d9c36c40acb36df7ebf15aaa1113266ea 100644 (file)
@@ -53,6 +53,8 @@ struct turnstile_stats {
 #include <os/refcnt.h>
 #include <kern/assert.h>
 #include <kern/kern_types.h>
+#include <kern/mpsc_queue.h>
+#include <kern/locks.h>
 
 /*
  * turnstile_type_t : Indicates the type of primitive the turnstile is associated with
@@ -67,7 +69,8 @@ typedef enum __attribute__((packed)) turnstile_type {
        TURNSTILE_WORKLOOPS = 5,
        TURNSTILE_WORKQS = 6,
        TURNSTILE_KNOTE = 7,
-       TURNSTILE_TOTAL_TYPES = 8,
+       TURNSTILE_SLEEP_INHERITOR = 8,
+       TURNSTILE_TOTAL_TYPES = 9,
 } turnstile_type_t;
 
 /*
@@ -112,6 +115,12 @@ typedef enum __attribute__((packed)) turnstile_type {
  * TURNSTILE_KNOTE
  *    Interlock: the knote lock
  *    Inheritor: WL turnstile
+ *
+ * TURNSTILE_SLEEP_INHERITOR
+ *    Interlock: turnstile_htable bucket spinlock.
+ *    Inheritor: threads.
+ *    Lock order: turnstile lock, thread lock.
+ *
  */
 
 typedef enum __attribute__((flag_enum)) turnstile_promote_policy {
@@ -121,6 +130,12 @@ typedef enum __attribute__((flag_enum)) turnstile_promote_policy {
        TURNSTILE_USER_IPC_PROMOTE = 0x4,
 } turnstile_promote_policy_t;
 
+typedef enum __attribute__((flag_enum)) turnstile_hash_lock_policy {
+       TURNSTILE_HASH_LOCK_POLICY_NONE = 0,
+       TURNSTILE_IRQ_UNSAFE_HASH = 0x1,
+       TURNSTILE_LOCKED_HASH = 0x2,
+} turnstile_hash_lock_policy_t;
+
 /*
  * Turnstile state flags
  *
@@ -178,7 +193,7 @@ MACRO_END
 
 #endif /* DEVELOPMENT || DEBUG */
 
-/* Foward declaration of turnstile */
+struct knote;
 struct turnstile;
 
 /*
@@ -311,7 +326,7 @@ struct turnstile {
        struct priority_queue         ts_inheritor_queue;    /* Queue of turnstile with us as an inheritor (WL) */
        union {
                struct priority_queue_entry ts_inheritor_links;    /* Inheritor queue links */
-               queue_chain_t         ts_deallocate_link;    /* thread deallocate link */
+               struct mpsc_queue_chain   ts_deallocate_link;    /* thread deallocate link */
        };
        SLIST_ENTRY(turnstile)        ts_htable_link;        /* linkage for turnstile in global hash table */
        uintptr_t                     ts_proprietor;         /* hash key lookup turnstile (IL) */
@@ -333,7 +348,7 @@ struct turnstile {
 
 /* IL - interlock, WL - turnstile lock i.e. waitq lock */
 
-#define TURNSTILE_PROPRIETOR_NULL 0
+#define TURNSTILE_PROPRIETOR_NULL 0ul
 
 /*
  * Name: turnstiles_init
@@ -398,6 +413,21 @@ turnstile_reference(struct turnstile *turnstile);
 void
 turnstile_deallocate(struct turnstile *turnstile);
 
+/*
+ * Name: turnstile_waitq_add_thread_priority_queue
+ *
+ * Description: add thread to the turnstile waitq
+ *
+ * Arg1: waitq
+ * Arg2: thread
+ *
+ * Conditions: waitq locked
+ */
+void
+turnstile_waitq_add_thread_priority_queue(
+       struct waitq* wq,
+       thread_t thread);
+
 /*
  * Name: turnstile_deallocate_safe
  *
@@ -464,6 +494,27 @@ turnstile_workq_proprietor_of_max_turnstile(
        struct turnstile *turnstile,
        uintptr_t *proprietor);
 
+/*
+ * Name: turnstile_workloop_pusher_info
+ *
+ * Description: Returns the priority of the turnstile push for a workloop,
+ *              and the thread, port, or knote responsible for this push.
+ *
+ * Args: workloop turnstile
+ *
+ * Returns:
+ *    Priority of the push or 0
+ *    Thread (with a +1 reference) with that push or THREAD_NULL.
+ *    Port (with a +1 reference) with that push, or IP_NULL.
+ *    Sync IPC knote with the highest push (or NULL)
+ */
+int
+turnstile_workloop_pusher_info(
+       struct turnstile *turnstile,
+       thread_t *thread,
+       ipc_port_t *port,
+       struct knote **knote_out);
+
 /*
  * Name: turnstile_cleanup
  *
@@ -477,6 +528,19 @@ turnstile_workq_proprietor_of_max_turnstile(
 void
 turnstile_cleanup(void);
 
+/*
+ * Name: turnstile_update_thread_priority_chain
+ *
+ * Description: The priority of a thread blocked on a turnstile
+ *              has changed; update the turnstile priority.
+ *
+ * Arg1: thread: thread whose priority has changed.
+ *
+ * Returns: None.
+ */
+void
+turnstile_update_thread_priority_chain(thread_t thread);
+
 /*
  * Name: turnstile_update_inheritor_locked
  *
@@ -494,18 +558,32 @@ void
 turnstile_update_inheritor_locked(struct turnstile *turnstile);
 
 /*
- * Name: thread_get_inheritor_turnstile_priority
+ * Name: thread_get_inheritor_turnstile_base_priority
  *
- * Description: Get the max priority of all the inheritor turnstiles
+ * Description: Get the max base priority of all the inheritor turnstiles
  *
  * Arg1: thread
  *
- * Returns: Max priority of all the inheritor turnstiles.
+ * Returns: Max base priority of all the inheritor turnstiles.
  *
  * Condition: thread locked
  */
 int
-thread_get_inheritor_turnstile_priority(thread_t thread);
+thread_get_inheritor_turnstile_base_priority(thread_t thread);
+
+/*
+ * Name: thread_get_inheritor_turnstile_sched_priority
+ *
+ * Description: Get the max sched priority of all the inheritor turnstiles
+ *
+ * Arg1: thread
+ *
+ * Returns: Max sched priority of all the inheritor turnstiles.
+ *
+ * Condition: thread locked
+ */
+int
+thread_get_inheritor_turnstile_sched_priority(thread_t thread);
 
 /*
  * Name: thread_get_waiting_turnstile
@@ -529,6 +607,7 @@ thread_get_waiting_turnstile(thread_t thread);
  *              turnstile hash.
  *
  * Arg1: port
+ * Arg2: turnstile_type_t type
  *
  * Returns: turnstile: if the proprietor has a turnstile.
  *          TURNSTILE_NULL: otherwise.
@@ -536,7 +615,20 @@ thread_get_waiting_turnstile(thread_t thread);
  * Condition: proprietor interlock held.
  */
 struct turnstile *
-turnstile_lookup_by_proprietor(uintptr_t proprietor);
+turnstile_lookup_by_proprietor(uintptr_t proprietor, turnstile_type_t type);
+
+/*
+ * Name: turnstile_has_waiters
+ *
+ * Description: returns whether there are waiters on the turnstile
+ *
+ * Arg1: turnstile: turnstile
+ *
+ * Returns: TRUE if there are waiters, FALSE otherwise.
+ */
+
+boolean_t
+turnstile_has_waiters(struct turnstile *turnstile);
 
 /*
  * Name: turnstile_stats_update
@@ -557,12 +649,17 @@ turnstile_stats_update(
 
 #if DEVELOPMENT || DEBUG
 
+#define SYSCTL_TURNSTILE_TEST_USER_DEFAULT              1
+#define SYSCTL_TURNSTILE_TEST_USER_HASHTABLE            2
+#define SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT            3
+#define SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE          4
+
 /* Functions used by debug test primitive exported by sysctls */
 int
-tstile_test_prim_lock(boolean_t use_hashtable);
+tstile_test_prim_lock(int val);
 
 int
-tstile_test_prim_unlock(boolean_t use_hashtable);
+tstile_test_prim_unlock(int val);
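
Each selector picks both the test-primitive instance and the turnstile flavor it exercises. A minimal round trip through the kernel-mutex flavor (a sketch; the sysctl handler is assumed to forward its value unchanged):

    if (tstile_test_prim_lock(SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT) == 0) {
            /* ... critical section under the test primitive ... */
            tstile_test_prim_unlock(SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT);
    }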
 
 int
 turnstile_get_boost_stats_sysctl(void *req);
@@ -573,6 +670,42 @@ turnstile_get_unboost_stats_sysctl(void *req);
 
 /* Interface */
 
+/*
+ * Name: turnstile_hash_bucket_lock
+ *
+ * Description: locks the spinlock associated with the proprietor's bucket.
+ *              If proprietor is specified, the index for the hash will be
+ *              recomputed and returned in index_proprietor;
+ *              otherwise the value saved in index_proprietor is used as the index.
+ *
+ * Args:
+ *   Arg1: proprietor (key) for hashing
+ *   Arg2: index for proprietor in the hash
+ *   Arg3: turnstile type
+ *
+ * Returns: the old interrupt state, i.e. whether IRQs were disabled before acquiring the lock.
+ */
+unsigned
+turnstile_hash_bucket_lock(uintptr_t proprietor, uint32_t *index_proprietor, turnstile_type_t type);
+
+/*
+ * Name: turnstile_hash_bucket_unlock
+ *
+ * Description: unlocks the spinlock associated with the proprietor's bucket.
+ *              If proprietor is specified, the index for the hash will be
+ *              recomputed and returned in index_proprietor;
+ *              otherwise the value saved in index_proprietor is used as the index.
+ *
+ * Args:
+ *   Arg1: proprietor (key) for hashing
+ *   Arg2: index for proprietor in the hash
+ *   Arg3: turnstile type
+ *   Arg4: irq value returned by turnstile_hash_bucket_lock
+ *
+ */
+void
+turnstile_hash_bucket_unlock(uintptr_t proprietor, uint32_t *index_proprietor, turnstile_type_t type, unsigned s);
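
The two calls pair as a bucket-granular spinlock with the hash index cached across the critical section: hash once on lock, then pass TURNSTILE_PROPRIETOR_NULL on unlock to reuse the saved index. A minimal sketch, where prim is an assumed proprietor:

    uint32_t idx;
    unsigned s;

    /* hash prim, save the bucket index in idx, and record the
     * pre-lock interrupt state in s */
    s = turnstile_hash_bucket_lock((uintptr_t)prim, &idx,
        TURNSTILE_SLEEP_INHERITOR);

    /* ... lookup or insert under the bucket spinlock ... */

    /* proprietor not specified: reuse idx instead of rehashing */
    turnstile_hash_bucket_unlock(TURNSTILE_PROPRIETOR_NULL, &idx,
        TURNSTILE_SLEEP_INHERITOR, s);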
+
 /*
  * Name: turnstile_prepare
  *
@@ -609,6 +742,7 @@ turnstile_prepare(
  *   Arg1: proprietor
  *   Arg2: pointer in primitive struct to update turnstile
  *   Arg3: pointer to store the returned turnstile instead of attaching it to thread
+ *   Arg4: type of primitive
  *
  * Returns:
  *   None.
@@ -617,7 +751,8 @@ void
 turnstile_complete(
        uintptr_t proprietor,
        struct turnstile **tstore,
-       struct turnstile **turnstile);
+       struct turnstile **turnstile,
+       turnstile_type_t type);
 
 /*
  * Name: turnstile_update_inheritor
@@ -665,6 +800,46 @@ turnstile_update_inheritor_complete(
        struct turnstile *turnstile,
        turnstile_update_complete_flags_t flags);
 
+
+/*
+ * Name: turnstile_kernel_update_inheritor_on_wake_locked
+ *
+ * Description: Set thread as the inheritor of the turnstile and
+ *             boost the inheritor.
+ * Args:
+ *   Arg1: turnstile
+ *   Arg2: new_inheritor
+ *   Arg3: flags
+ *
+ * Called with turnstile locked
+ */
+void
+turnstile_kernel_update_inheritor_on_wake_locked(
+       struct turnstile *turnstile,
+       turnstile_inheritor_t new_inheritor,
+       turnstile_update_flags_t flags);
+
+/*
+ * Internal KPI for sleep_with_inheritor, wakeup_with_inheritor, change_sleep_inheritor
+ * meant to allow specifing the turnstile type to use to have different policy
+ * on how to push on the inheritor.
+ *
+ * Differently from the "standard" KPI in locks.h these are meant to be used only
+ * if you know what you are doing with turnstile.
+ */
+
+extern wait_result_t
+lck_mtx_sleep_with_inheritor_and_turnstile_type(lck_mtx_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline, turnstile_type_t type);
+
+extern wait_result_t
+lck_rw_sleep_with_inheritor_and_turnstile_type(lck_rw_t *lock, lck_sleep_action_t lck_sleep_action, event_t event, thread_t inheritor, wait_interrupt_t interruptible, uint64_t deadline, turnstile_type_t type);
+
+extern kern_return_t
+wakeup_with_inheritor_and_turnstile_type(event_t event, turnstile_type_t type, wait_result_t result, bool wake_one, lck_wake_action_t action, thread_t *thread_wokenup);
+
+extern kern_return_t
+change_sleep_inheritor_and_turnstile_type(event_t event, thread_t inheritor, turnstile_type_t type);
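
A minimal sketch of the typed sleep/wakeup pair; mtx, event, and owner are assumptions, and the standard locks.h KPI would hide the type argument:

    /* waiter: block on event, pushing on owner through a
     * TURNSTILE_SLEEP_INHERITOR turnstile; mtx is dropped while asleep */
    wait_result_t wr = lck_mtx_sleep_with_inheritor_and_turnstile_type(
        &mtx, LCK_SLEEP_DEFAULT, event, owner,
        THREAD_UNINT, TIMEOUT_WAIT_FOREVER, TURNSTILE_SLEEP_INHERITOR);

    /* waker: wake one waiter and transfer the push to it */
    thread_t woken = THREAD_NULL;
    wakeup_with_inheritor_and_turnstile_type(event,
        TURNSTILE_SLEEP_INHERITOR, THREAD_AWAKENED, true,
        LCK_WAKE_DEFAULT, &woken);
    if (woken != THREAD_NULL) {
            thread_deallocate(woken);  /* woken comes back with a +1 ref */
    }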
+
 #endif /* KERNEL_PRIVATE */
 #if XNU_KERNEL_PRIVATE
 
@@ -673,14 +848,10 @@ struct workqueue;
 /* pthread_workqueue.c */
 extern void workq_reference(struct workqueue *wq);
 extern void workq_deallocate_safe(struct workqueue *wq);
-extern void workq_destroy(struct workqueue *wq);
 extern bool workq_is_current_thread_updating_turnstile(struct workqueue *wq);
 extern void workq_schedule_creator_turnstile_redrive(struct workqueue *wq,
     bool locked);
 
-/* thread.c */
-extern void     workq_deallocate_enqueue(struct workqueue *wq);
-
 #endif /* XNU_KERNEL_PRIVATE */
 
 #endif /* _TURNSTILE_H_ */
index a20237379e3430244fd2a01c2aa7cae582ad9689..0329eeea676839bbbac9cdc1648c2f2915e258ec 100644 (file)
@@ -68,13 +68,8 @@ SECURITY_READ_ONLY_LATE(ipc_port_t)     ux_handler_port       = IP_NULL;
 void
 ux_handler_init(void)
 {
-       ux_handler_port = ipc_port_alloc_kernel();
-
-       if (ux_handler_port == IP_NULL) {
-               panic("can't allocate unix exception port");
-       }
-
-       ipc_kobject_set(ux_handler_port, (ipc_kobject_t)&ux_handler_kobject, IKOT_UX_HANDLER);
+       ux_handler_port = ipc_kobject_alloc_port((ipc_kobject_t)&ux_handler_kobject,
+           IKOT_UX_HANDLER, IPC_KOBJECT_ALLOC_NONE);
 }
 
 /*
index 1a38d76feb9a34676279de9f07859cbc6ddba3e5..2348ef57209aa03d5f16ebe6649bccca03be47b3 100644 (file)
@@ -164,7 +164,7 @@ lck_grp_t waitq_lck_grp;
  * Prepost callback function for specially marked waitq sets
  * (prepost alternative)
  */
-extern void waitq_set__CALLING_PREPOST_HOOK__(void *ctx, void *memberctx, int priority);
+extern void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *ctx);
 
 #define DEFAULT_MIN_FREE_TABLE_ELEM    100
 static uint32_t g_min_free_table_elem;
@@ -1706,7 +1706,7 @@ waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip)
                skip = 0;
        }
        memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
-       backtrace(buf, g_nwaitq_btframes + skip);
+       backtrace(buf, g_nwaitq_btframes + skip, NULL);
        memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
 }
 #else /* no stats */
@@ -1850,29 +1850,8 @@ waitq_thread_insert(struct waitq *wq,
     thread_t thread, boolean_t fifo)
 {
        if (waitq_is_turnstile_queue(wq)) {
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                   (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_ADDED_TO_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
-                   VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
-                   thread_tid(thread),
-                   thread->base_pri, 0, 0);
-
                turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);
-
-               /*
-                * For turnstile queues (which use priority queues),
-                * insert the thread in the heap based on its current
-                * base_pri. Note that the priority queue implementation
-                * is currently not stable, so does not maintain fifo for
-                * threads at the same base_pri. Also, if the base_pri
-                * of the thread changes while its blocked in the waitq,
-                * the thread position should be updated in the priority
-                * queue by calling priority queue increase/decrease
-                * operations.
-                */
-               priority_queue_entry_init(&(thread->wait_prioq_links));
-               priority_queue_insert(&wq->waitq_prio_queue,
-                   &thread->wait_prioq_links, thread->base_pri,
-                   PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+               turnstile_waitq_add_thread_priority_queue(wq, thread);
        } else {
                turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
                if (fifo) {
@@ -2059,6 +2038,7 @@ struct waitq_select_args {
        event64_t        event;
        waitq_select_cb  select_cb;
        void            *select_ctx;
+       int             priority;
 
        uint64_t        *reserved_preposts;
 
@@ -2119,16 +2099,13 @@ waitq_select_walk_cb(struct waitq *waitq, void *ctx,
         */
        do_waitq_select_n_locked(&args);
 
-       if (*(args.nthreads) > 0 ||
-           (args.threadq && !queue_empty(args.threadq))) {
+       if (*args.nthreads > 0 || (args.threadq && !queue_empty(args.threadq))) {
                /* at least 1 thread was selected and returned: don't prepost */
-               if (args.max_threads > 0 &&
-                   *(args.nthreads) >= args.max_threads) {
+               if (args.max_threads > 0 && *args.nthreads >= args.max_threads) {
                        /* break out of the setid walk */
                        ret = WQ_ITERATE_FOUND;
                }
-               goto out_unlock;
-       } else {
+       } else if (args.event == NO_EVENT64) {
                /*
                 * No thread selected: prepost 'waitq' to 'wqset'
                 * if wqset can handle preposts and the event is set to 0.
@@ -2139,14 +2116,39 @@ waitq_select_walk_cb(struct waitq *waitq, void *ctx,
                 * callout function and pass the set's 'prepost_hook.' This
                 * could potentially release another thread to handle events.
                 */
-               if (args.event == NO_EVENT64) {
-                       if (waitq_set_can_prepost(wqset)) {
-                               wq_prepost_do_post_locked(
-                                       wqset, waitq, args.reserved_preposts);
-                       } else if (waitq_set_has_prepost_hook(wqset)) {
-                               waitq_set__CALLING_PREPOST_HOOK__(
-                                       wqset->wqset_prepost_hook, waitq, 0);
-                       }
+               if (waitq_set_can_prepost(wqset)) {
+                       wq_prepost_do_post_locked(
+                               wqset, waitq, args.reserved_preposts);
+               } else if (waitq_set_has_prepost_hook(wqset)) {
+                       waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
+
+                       /*
+                        * When calling out to the prepost hook,
+                        * we drop the waitq lock to allow the kevent
+                        * subsystem to call into the waitq subsystem again
+                        * without risking a deadlock.
+                        *
+                        * However, we need to guard against wqset going away,
+                        * so we increment the prepost hook use count
+                        * while the lock is dropped.
+                        *
+                        * This lets waitq_set_deinit() know to wait for the
+                        * prepost hook call to be done before it can proceed.
+                        *
+                        * Note: we need to keep preemption disabled the whole
+                        * time as waitq_set_deinit will spin on this.
+                        */
+
+                       disable_preemption();
+                       os_atomic_inc(hook, relaxed);
+                       waitq_set_unlock(wqset);
+
+                       waitq_set__CALLING_PREPOST_HOOK__(hook);
+
+                       /* Note: after this decrement, the wqset may be deallocated */
+                       os_atomic_dec(hook, relaxed);
+                       enable_preemption();
+                       return ret;
                }
        }
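A condensed view of the protocol the comment above describes, paired with the waitq_set_deinit() side from later in this diff (both halves are lifted from the surrounding hunks and simplified):

/* Poster: pin the set with a use count while the lock is dropped. */
disable_preemption();                    /* deinit spins on the count */
os_atomic_inc(hook, relaxed);
waitq_set_unlock(wqset);
waitq_set__CALLING_PREPOST_HOOK__(hook);
os_atomic_dec(hook, relaxed);            /* wqset may be freed after this */
enable_preemption();

/* waitq_set_deinit(): drain any in-flight hook callout first. */
while (os_atomic_load(hook, relaxed) != 0) {
        waitq_set_unlock(wqset);
        delay(1);
        waitq_set_lock(wqset);
}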
 
@@ -2324,6 +2326,13 @@ waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
 
                if (first_thread == THREAD_NULL) {
                        first_thread = thread;
+                       /*
+                        * turnstile_kernel_update_inheritor_on_wake_locked will lock
+                        * first_thread, so it must be called before we lock the thread ourselves.
+                        */
+                       if (args->priority == WAITQ_PROMOTE_ON_WAKE && first_thread != THREAD_NULL && waitq_is_turnstile_queue(safeq)) {
+                               turnstile_kernel_update_inheritor_on_wake_locked(waitq_to_turnstile(safeq), (turnstile_inheritor_t)first_thread, TURNSTILE_INHERITOR_THREAD);
+                       }
                }
 
                /* For the peek operation, break out early */
@@ -2431,6 +2440,7 @@ do_waitq_select_n_locked(struct waitq_select_args *args)
                /* we know this is the first (and only) thread */
                ++(*nthreads);
                *(args->spl) = (safeq != waitq) ? spl : splsched();
+
                thread_lock(first_thread);
                thread_clear_waitq_state(first_thread);
                waitq_thread_remove(safeq, first_thread);
@@ -2510,7 +2520,8 @@ waitq_select_n_locked(struct waitq *waitq,
     void *select_ctx,
     uint64_t *reserved_preposts,
     queue_t threadq,
-    int max_threads, spl_t *spl)
+    int max_threads, spl_t *spl,
+    int priority)
 {
        int nthreads = 0;
 
@@ -2520,6 +2531,7 @@ waitq_select_n_locked(struct waitq *waitq,
                .event = event,
                .select_cb = select_cb,
                .select_ctx = select_ctx,
+               .priority = priority,
                .reserved_preposts = reserved_preposts,
                .threadq = threadq,
                .max_threads = max_threads,
@@ -2547,14 +2559,13 @@ waitq_select_one_locked(struct waitq *waitq, event64_t event,
     uint64_t *reserved_preposts,
     int priority, spl_t *spl)
 {
-       (void)priority;
        int nthreads;
        queue_head_t threadq;
 
        queue_init(&threadq);
 
        nthreads = waitq_select_n_locked(waitq, event, NULL, NULL,
-           reserved_preposts, &threadq, 1, spl);
+           reserved_preposts, &threadq, 1, spl, priority);
 
        /* if we selected a thread, return it (still locked) */
        if (!queue_empty(&threadq)) {
@@ -2569,96 +2580,6 @@ waitq_select_one_locked(struct waitq *waitq, event64_t event,
        return THREAD_NULL;
 }
 
-struct find_max_pri_ctx {
-       integer_t max_sched_pri;
-       integer_t max_base_pri;
-       thread_t highest_thread;
-};
-
-/**
- * callback function that finds the max priority thread
- *
- * Conditions:
- *      'waitq' is locked
- *      'thread' is not locked
- */
-static thread_t
-waitq_find_max_pri_cb(void         *ctx_in,
-    __unused struct waitq *waitq,
-    __unused int           is_global,
-    thread_t      thread)
-{
-       struct find_max_pri_ctx *ctx = (struct find_max_pri_ctx *)ctx_in;
-
-       /*
-        * thread is not locked, use pri as a hint only
-        * wake up the highest base pri, and find the highest sched pri at that base pri
-        */
-       integer_t sched_pri = *(volatile int16_t *)&thread->sched_pri;
-       integer_t base_pri  = *(volatile int16_t *)&thread->base_pri;
-
-       if (ctx->highest_thread == THREAD_NULL ||
-           (base_pri > ctx->max_base_pri) ||
-           (base_pri == ctx->max_base_pri && sched_pri > ctx->max_sched_pri)) {
-               /* don't select the thread, just update ctx */
-
-               ctx->max_sched_pri  = sched_pri;
-               ctx->max_base_pri   = base_pri;
-               ctx->highest_thread = thread;
-       }
-
-       return THREAD_NULL;
-}
-
-/**
- * select from a waitq the highest priority thread waiting for a given event
- *
- * Conditions:
- *     'waitq' is locked
- *
- * Returns:
- *     A locked thread that's been removed from the waitq, but has not
- *     yet been put on a run queue. Caller is responsible to call splx
- *     with the '*spl' value.
- */
-static thread_t
-waitq_select_max_locked(struct waitq *waitq, event64_t event,
-    uint64_t *reserved_preposts,
-    spl_t *spl)
-{
-       __assert_only int nthreads;
-       assert(!waitq->waitq_set_id); /* doesn't support recursive sets */
-
-       struct find_max_pri_ctx ctx = {
-               .max_sched_pri = 0,
-               .max_base_pri = 0,
-               .highest_thread = THREAD_NULL,
-       };
-
-       /*
-        * Scan the waitq to find the highest priority thread.
-        * This doesn't remove any thread from the queue
-        */
-       nthreads = waitq_select_n_locked(waitq, event,
-           waitq_find_max_pri_cb,
-           &ctx, reserved_preposts, NULL, 1, spl);
-
-       assert(nthreads == 0);
-
-       if (ctx.highest_thread != THREAD_NULL) {
-               __assert_only kern_return_t ret;
-
-               /* Remove only the thread we just found */
-               ret = waitq_select_thread_locked(waitq, event, ctx.highest_thread, spl);
-
-               assert(ret == KERN_SUCCESS);
-               return ctx.highest_thread;
-       }
-
-       return THREAD_NULL;
-}
-
-
 struct select_thread_ctx {
        thread_t      thread;
        event64_t     event;
@@ -3051,9 +2972,6 @@ maybe_adjust_thread_pri(thread_t   thread,
                }
 
                sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);
-       } else if (priority > 0) {
-               /* Mutex subsystem wants to see this thread before we 'go' it */
-               lck_mtx_wakeup_adjust_pri(thread, priority);
        }
 }
 
@@ -3123,7 +3041,7 @@ waitq_wakeup64_all_locked(struct waitq *waitq,
 
        nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL,
            reserved_preposts,
-           &wakeup_queue, -1, &th_spl);
+           &wakeup_queue, -1, &th_spl, priority);
 
        /* set each thread running */
        ret = KERN_NOT_WAITING;
@@ -3175,16 +3093,9 @@ waitq_wakeup64_one_locked(struct waitq *waitq,
 
        assert(waitq_held(waitq));
 
-       if (priority == WAITQ_SELECT_MAX_PRI) {
-               thread = waitq_select_max_locked(waitq, wake_event,
-                   reserved_preposts,
-                   &th_spl);
-       } else {
-               thread = waitq_select_one_locked(waitq, wake_event,
-                   reserved_preposts,
-                   priority, &th_spl);
-       }
-
+       thread = waitq_select_one_locked(waitq, wake_event,
+           reserved_preposts,
+           priority, &th_spl);
 
        if (thread != THREAD_NULL) {
                waitq_stats_count_wakeup(waitq);
@@ -3233,15 +3144,9 @@ waitq_wakeup64_identify_locked(struct waitq     *waitq,
 
        assert(waitq_held(waitq));
 
-       if (priority == WAITQ_SELECT_MAX_PRI) {
-               thread = waitq_select_max_locked(waitq, wake_event,
-                   reserved_preposts,
-                   spl);
-       } else {
-               thread = waitq_select_one_locked(waitq, wake_event,
-                   reserved_preposts,
-                   priority, spl);
-       }
+       thread = waitq_select_one_locked(waitq, wake_event,
+           reserved_preposts,
+           priority, spl);
 
        if (thread != THREAD_NULL) {
                waitq_stats_count_wakeup(waitq);
@@ -3508,7 +3413,7 @@ wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,
  *     NULL on failure
  */
 struct waitq_set *
-waitq_set_alloc(int policy, void *prepost_hook)
+waitq_set_alloc(int policy, waitq_set_prepost_hook_t *prepost_hook)
 {
        struct waitq_set *wqset;
 
@@ -3537,7 +3442,7 @@ waitq_set_alloc(int policy, void *prepost_hook)
 kern_return_t
 waitq_set_init(struct waitq_set *wqset,
     int policy, uint64_t *reserved_link,
-    void *prepost_hook)
+    waitq_set_prepost_hook_t *prepost_hook)
 {
        struct waitq_link *link;
        kern_return_t ret;
@@ -3677,6 +3582,20 @@ waitq_set_deinit(struct waitq_set *wqset)
 
        waitq_set_lock(wqset);
 
+       if (waitq_set_has_prepost_hook(wqset)) {
+               waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
+               /*
+                * If the wqset_prepost_hook value is non-zero,
+                * then another core is currently posting to this waitq set
+                * and we need to wait for it to finish what it's doing.
+                */
+               while (os_atomic_load(hook, relaxed) != 0) {
+                       waitq_set_unlock(wqset);
+                       delay(1);
+                       waitq_set_lock(wqset);
+               }
+       }
+
        set_id = wqset->wqset_id;
 
        if (waitqs_is_linked(wqset) || set_id == 0) {
index 9eb863a7b9e152d0851f420ca8ab24c7dc9d0ed6..2d897573373730e0ca0ef4b116dd1e04ea68b890 100644 (file)
@@ -49,7 +49,7 @@
  */
 #define WAITQ_ALL_PRIORITIES   (-1)
 #define WAITQ_PROMOTE_PRIORITY (-2)
-#define WAITQ_SELECT_MAX_PRI   (-3)
+#define WAITQ_PROMOTE_ON_WAKE  (-3)
 
 typedef enum e_waitq_lock_state {
        WAITQ_KEEP_LOCKED    = 0x01,
@@ -175,7 +175,7 @@ struct waitq_set {
        };
 };
 
-#define WQSET_NOT_LINKED ((uint64_t)(~0))
+#define WQSET_NOT_LINKED       ((uint64_t)(~0))
 static_assert(sizeof(struct waitq_set) == WQS_OPAQUE_SIZE, "waitq_set structure size mismatch");
 static_assert(__alignof(struct waitq_set) == WQS_OPAQUE_ALIGN, "waitq_set structure alignment mismatch");
 
@@ -388,14 +388,17 @@ extern struct waitq *_global_eventq(char *event, size_t event_length);
 
 extern struct waitq *global_waitq(int index);
 
+typedef uint16_t waitq_set_prepost_hook_t;
+
 /*
  * set alloc/init/free
  */
-extern struct waitq_set *waitq_set_alloc(int policy, void *prepost_hook);
+extern struct waitq_set *waitq_set_alloc(int policy,
+    waitq_set_prepost_hook_t *prepost_hook);
 
 extern kern_return_t waitq_set_init(struct waitq_set *wqset,
     int policy, uint64_t *reserved_link,
-    void *prepost_hook);
+    waitq_set_prepost_hook_t *prepost_hook);
 
 extern void waitq_set_deinit(struct waitq_set *wqset);
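waitq_set_prepost_hook_t is now a bare 16-bit use counter rather than an opaque void * context. A hypothetical sketch of an owning structure (the struct and field names are illustrative, not from this diff); the counter must start at zero:

struct my_wqset_owner {
        struct waitq_set         wqset;
        waitq_set_prepost_hook_t prepost_uses;  /* in-flight hook callouts, init to 0 */
};

/* Hand the counter's address to the set; NULL reserved_link for simplicity. */
waitq_set_init(&owner->wqset, SYNC_POLICY_FIFO, NULL, &owner->prepost_uses);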
 
index 4c1d4cbdae9fb5dec3ccc31b8b9ee6f876bb46c6..5986b975dda25ec882a1f3d5318d42c886d70c05 100644 (file)
@@ -106,49 +106,6 @@ wi_release(struct work_interval *work_interval)
        }
 }
 
-/*
- * work_interval_port_alloc
- *
- * Description: Obtain a send right for the given work interval struct.
- *
- * Parameters:  work_interval - A work_interval struct
- *              Consumes a +1 ref count on work_interval, now owned by the port.
- *
- * Returns:     Port of type IKOT_WORK_INTERVAL with work_interval set as its kobject.
- *              Returned with a +1 send right and no-senders notification armed.
- *              Work interval struct reference is held by the port.
- */
-static ipc_port_t
-work_interval_port_alloc(struct work_interval *work_interval)
-{
-       ipc_port_t work_interval_port = ipc_port_alloc_kernel();
-
-       if (work_interval_port == IP_NULL) {
-               panic("failed to allocate work interval port");
-       }
-
-       assert(work_interval->wi_port == IP_NULL);
-
-       ip_lock(work_interval_port);
-       ipc_kobject_set_atomically(work_interval_port, (ipc_kobject_t)work_interval,
-           IKOT_WORK_INTERVAL);
-
-       ipc_port_t notify_port = ipc_port_make_sonce_locked(work_interval_port);
-       ipc_port_t old_notify_port = IP_NULL;
-       ipc_port_nsrequest(work_interval_port, 1, notify_port, &old_notify_port);
-       /* port unlocked */
-
-       assert(old_notify_port == IP_NULL);
-
-       /* This is the only make-send that will happen on this port */
-       ipc_port_t send_port = ipc_port_make_send(work_interval_port);
-       assert(IP_VALID(send_port));
-
-       work_interval->wi_port = work_interval_port;
-
-       return send_port;
-}
-
 /*
  * work_interval_port_convert
  *
@@ -390,12 +347,11 @@ kern_work_interval_create(thread_t thread,
        task_t creating_task = current_task();
        if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
                /*
-                * CA_CLIENT work intervals do not create new thread groups
-                * and are non-joinable.
-                * There can only be one CA_CLIENT work interval (created by UIKit)
+                * CA_CLIENT work intervals do not create new thread groups.
+                * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
                 * per application task.
                 */
-               if (create_flags & (WORK_INTERVAL_FLAG_JOINABLE | WORK_INTERVAL_FLAG_GROUP)) {
+               if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
                        return KERN_FAILURE;
                }
                if (!task_is_app(creating_task)) {
@@ -417,11 +373,14 @@ kern_work_interval_create(thread_t thread,
 
 
        if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
-               /* work_interval has a +1 ref, moves to the port */
-               ipc_port_t port = work_interval_port_alloc(work_interval);
                mach_port_name_t name = MACH_PORT_NULL;
 
-               name = ipc_port_copyout_send(port, current_space());
+               /* work_interval has a +1 ref, moves to the port */
+               work_interval->wi_port = ipc_kobject_alloc_port(
+                       (ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
+                       IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
+
+               name = ipc_port_copyout_send(work_interval->wi_port, current_space());
 
                if (!MACH_PORT_VALID(name)) {
                        /*
diff --git a/osfmk/kern/xpr.c b/osfmk/kern/xpr.c
deleted file mode 100644 (file)
index 0c28eab..0000000
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * Mach Operating System
- * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
- * All Rights Reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
- * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
- * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- *  School of Computer Science
- *  Carnegie Mellon University
- *  Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- * xpr silent tracing circular buffer.
- */
-
-#include <mach/machine/vm_types.h>
-#include <kern/xpr.h>
-#include <kern/spl.h>
-#include <kern/cpu_number.h>
-#include <kern/misc_protos.h>
-#include <kern/thread.h>
-#include <vm/vm_kern.h>
-#include <string.h>
-
-/*
- *     After a spontaneous reboot, it is desirable to look
- *     at the old xpr buffer.  Assuming xprbootstrap allocates
- *     the buffer in the same place in physical memory and
- *     the reboot doesn't clear memory, this should work.
- *     xprptr will be reset, but the saved value should be OK.
- *     Just set xprenable false so the buffer isn't overwritten.
- */
-
-decl_simple_lock_data(, xprlock)
-boolean_t xprenable = TRUE;     /* Enable xpr tracing */
-int nxprbufs = 0;       /* Number of contiguous xprbufs allocated */
-int xprflags = 0;       /* Bit mask of xpr flags enabled */
-struct xprbuf *xprbase; /* Pointer to circular buffer nxprbufs*sizeof(xprbuf)*/
-struct xprbuf *xprptr;  /* Currently allocated xprbuf */
-struct xprbuf *xprlast; /* Pointer to end of circular buffer */
-
-void
-xpr(
-       const char      *msg,
-       long            arg1,
-       long            arg2,
-       long            arg3,
-       long            arg4,
-       long            arg5)
-{
-       spl_t s;
-       struct xprbuf *x;
-
-       /* If we aren't initialized, ignore trace request */
-       if (!xprenable || (xprptr == 0)) {
-               return;
-       }
-       /* Guard against all interrupts and allocate next buffer. */
-
-       s = splhigh();
-       simple_lock(&xprlock, LCK_GRP_NULL);
-       x = xprptr++;
-       if (xprptr >= xprlast) {
-               /* wrap around */
-               xprptr = xprbase;
-       }
-       /* Save xprptr in allocated memory. */
-       *(struct xprbuf **)xprlast = xprptr;
-       simple_unlock(&xprlock);
-       x->timestamp = XPR_TIMESTAMP;
-       splx(s);
-       x->msg = msg;
-       x->arg1 = arg1;
-       x->arg2 = arg2;
-       x->arg3 = arg3;
-       x->arg4 = arg4;
-       x->arg5 = arg5;
-       mp_disable_preemption();
-       x->cpuinfo = cpu_number();
-       mp_enable_preemption();
-}
diff --git a/osfmk/kern/xpr.h b/osfmk/kern/xpr.h
deleted file mode 100644 (file)
index e63d9e6..0000000
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * Mach Operating System
- * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
- * All Rights Reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
- * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
- * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- *  School of Computer Science
- *  Carnegie Mellon University
- *  Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- */
-/*
- * Include file for xpr circular buffer silent tracing.
- *
- */
-/*
- * If the kernel flag XPRDEBUG is set, the XPR macro is enabled.  The
- * macro should be invoked something like the following:
- *     XPR(XPR_SYSCALLS, "syscall: %d, 0x%x\n", syscallno, arg1, 0,0,0);
- * which will expand into the following code:
- *     if (xprflags & XPR_SYSCALLS)
- *             xpr("syscall: %d, 0x%x\n", syscallno, arg1, 0,0,0);
- * Xpr will log the pointer to the printf string and up to 5 arguements,
- * along with a timestamp and cpuinfo (for multi-processor systems), into
- * a circular buffer.  The actual printf processing is delayed until after
- * the buffer has been collected.  It is assumed that the text/data segments
- * of the kernel can easily be reconstructed in a post-processor which
- * performs the printf processing.
- *
- * If the XPRDEBUG compilation switch is not set, the XPR macro expands
- * to nothing.
- */
-
-#ifndef _KERN_XPR_H_
-#define _KERN_XPR_H_
-
-#ifdef  MACH_KERNEL
-#include <xpr_debug.h>
-#else   /* MACH_KERNEL */
-#include <sys/features.h>
-#endif  /* MACH_KERNEL */
-
-#include <machine/xpr.h>
-
-#if     XPR_DEBUG
-
-#define XPR(flags, msg, arg1, arg2, arg3, arg4, arg5)           \
-MACRO_BEGIN                                                     \
-       if (xprflags & (flags)) {                               \
-               xpr((msg), (long)(arg1), (long)(arg2),          \
-                   (long)(arg3), (long)(arg4), (long)(arg5));  \
-       }                                                       \
-MACRO_END
-
-extern int xprflags;
-
-/*
- * flags for message types.
- */
-#define XPR_TRAPS               (1 << 1)
-#define XPR_SCHED               (1 << 2)
-#define XPR_LOCK                (1 << 3)
-#define XPR_SLOCK               (1 << 4)
-#define XPR_PMAP                (1 << 6)
-#define XPR_VM_MAP              (1 << 7)
-#define XPR_VM_OBJECT           (1 << 8)
-#define XPR_VM_OBJECT_CACHE     (1 << 9)
-#define XPR_VM_PAGE             (1 << 10)
-#define XPR_VM_PAGEOUT          (1 << 11)
-#define XPR_MEMORY_OBJECT       (1 << 12)
-#define XPR_VM_FAULT            (1 << 13)
-#define XPR_VM_OBJECT_REP       (1 << 14)
-#define XPR_DEFAULT_PAGER       (1 << 15)
-#define XPR_INODE_PAGER         (1 << 16)
-#define XPR_INODE_PAGER_DATA    (1 << 17)
-#define XPR_XMM                 (1 << 18)
-
-#else   /* XPR_DEBUG */
-#define XPR(flags, msg, arg1, arg2, arg3, arg4, arg5)
-#endif  /* XPR_DEBUG */
-
-struct xprbuf {
-       const char      *msg;
-       long            arg1, arg2, arg3, arg4, arg5;
-       int             timestamp;
-       int             cpuinfo;
-};
-
-/* Bootstrap XPR facility */
-extern void xprbootstrap(void);
-
-/* Enable XPR facility */
-extern void xprinit(void);
-
-/* Log an XPR message */
-extern void xpr(
-       const char      *msg,
-       long            arg1,
-       long            arg2,
-       long            arg3,
-       long            arg4,
-       long            arg5);
-
-#endif /* _KERN_XPR_H_ */
index 19562002c19dfae953a22a7e5975c5490d122ea6..f25e4040768d77eda5cf7fb7878e7148ba494457 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -370,7 +370,7 @@ vm_offset_t     zone_map_max_address = 0;
 /* VM region for all metadata structures */
 vm_offset_t     zone_metadata_region_min = 0;
 vm_offset_t     zone_metadata_region_max = 0;
-decl_lck_mtx_data(static, zone_metadata_region_lck)
+decl_lck_mtx_data(static, zone_metadata_region_lck);
 lck_attr_t      zone_metadata_lock_attr;
 lck_mtx_ext_t   zone_metadata_region_lck_ext;
 
@@ -383,12 +383,6 @@ struct zone_free_element {
 
 #if CONFIG_ZCACHE
 
-#if !CONFIG_GZALLOC
-bool use_caching = TRUE;
-#else
-bool use_caching = FALSE;
-#endif /* !CONFIG_GZALLOC */
-
 /*
  * Decides whether per-cpu zone caching is to be enabled for all zones.
  * Can be set to TRUE via the boot-arg '-zcache_all'.
@@ -412,11 +406,15 @@ zone_caching_enabled(zone_t z)
 /*
  *      Protects zone_array, num_zones, num_zones_in_use, and zone_empty_bitmap
  */
-decl_simple_lock_data(, all_zones_lock)
+decl_simple_lock_data(, all_zones_lock);
 unsigned int            num_zones_in_use;
 unsigned int            num_zones;
 
+#if KASAN
+#define MAX_ZONES       512
+#else /* !KASAN */
 #define MAX_ZONES       320
+#endif /* !KASAN */
 struct zone             zone_array[MAX_ZONES];
 
 /* Used to keep track of empty slots in the zone_array */
@@ -428,7 +426,7 @@ bitmap_t zone_empty_bitmap[BITMAP_LEN(MAX_ZONES)];
  * Or we can end up with multiple test zones (if a second zinit() comes through before zdestroy()),  which could lead us to
  * run out of zones.
  */
-decl_simple_lock_data(, zone_test_lock)
+decl_simple_lock_data(, zone_test_lock);
 static boolean_t zone_test_running = FALSE;
 static zone_t test_zone_ptr = NULL;
 #endif /* DEBUG || DEVELOPMENT */
@@ -636,6 +634,31 @@ get_zone_page(struct zone_page_metadata *page_meta)
        }
 }
 
+/*
+ * Routine to panic if a pointer is not mapped to the expected zone.
+ * This can be used as a means of pinning an object to the zone it is expected
+ * to be a part of.  Causes a panic if the address does not belong to the
+ * specified zone, does not belong to any zone at all, has been freed and is
+ * therefore unmapped from the zone, or contains an uninitialized value that
+ * does not belong to any zone.
+ */
+
+void
+zone_require(void *addr, zone_t expected_zone)
+{
+       struct zone *src_zone = NULL;
+       struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
+
+       src_zone = PAGE_METADATA_GET_ZONE(page_meta);
+       if (__improbable(src_zone == NULL)) {
+               panic("Address not in a zone for zone_require check (addr: %p)", addr);
+       }
+
+       if (__improbable(src_zone != expected_zone)) {
+               panic("Address not in expected zone for zone_require check (addr: %p, zone: %s)", addr, src_zone->zone_name);
+       }
+}
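A hypothetical call site (the zone name is illustrative, not from this diff): before trusting a pointer received from elsewhere, pin it to the zone it must have come from.

/* Panics unless 'task' is a live element backed by task_zone. */
zone_require(task, task_zone);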
+
 /*
  * ZTAGS
  */
@@ -677,7 +700,7 @@ static vm_map_t     zone_tags_map;
 
 // simple heap allocator for allocating the tags for new memory
 
-decl_lck_mtx_data(, ztLock)    /* heap lock */
+decl_lck_mtx_data(, ztLock);    /* heap lock */
 enum{
        ztFreeIndexCount = 8,
        ztFreeIndexMax   = (ztFreeIndexCount - 1),
@@ -1186,8 +1209,8 @@ is_sane_zone_element(zone_t      zone,
 }
 
 /* Someone wrote to freed memory. */
+__dead2
 static inline void
-/* noreturn */
 zone_element_was_modified_panic(zone_t        zone,
     vm_offset_t   element,
     vm_offset_t   found,
@@ -1210,10 +1233,9 @@ zone_element_was_modified_panic(zone_t        zone,
  * The primary and backup pointers don't match.
  * Determine which one was likely the corrupted pointer, find out what it
  * probably should have been, and panic.
- * I would like to mark this as noreturn, but panic() isn't marked noreturn.
  */
+__dead2
 static void
-/* noreturn */
 backup_ptr_mismatch_panic(zone_t        zone,
     vm_offset_t   element,
     vm_offset_t   primary,
@@ -1517,7 +1539,7 @@ MACRO_END
 /*
  *     Exclude more than one concurrent garbage collection
  */
-decl_lck_mtx_data(, zone_gc_lock)
+decl_lck_mtx_data(, zone_gc_lock);
 
 lck_attr_t      zone_gc_lck_attr;
 lck_grp_t       zone_gc_lck_grp;
@@ -1532,7 +1554,10 @@ vm_size_t panic_kext_memory_size = 0;
 
 #define ZALLOC_DEBUG_ZONEGC             0x00000001
 #define ZALLOC_DEBUG_ZCRAM              0x00000002
-uint32_t zalloc_debug = 0;
+
+#if DEBUG || DEVELOPMENT
+static uint32_t zalloc_debug = 0;
+#endif
 
 /*
  * Zone leak debugging code
@@ -2294,6 +2319,7 @@ zinit(
                                bitmap_clear(zone_empty_bitmap, index);
                                num_zones_in_use++;
                                z->zone_valid = TRUE;
+                               z->zone_destruction = FALSE;
 
                                /* All other state is already set up since the zone was previously in use. Return early. */
                                simple_unlock(&all_zones_lock);
@@ -2380,7 +2406,9 @@ zinit(
        z->zp_count = 0;
        z->kasan_quarantine = TRUE;
        z->zone_valid = TRUE;
+       z->zone_destruction = FALSE;
        z->cpu_cache_enabled = FALSE;
+       z->clear_memory = FALSE;
 
 #if CONFIG_ZLEAKS
        z->zleak_capture = 0;
@@ -2544,7 +2572,7 @@ static void zone_replenish_thread(zone_t);
 /* High priority VM privileged thread used to asynchronously refill a designated
  * zone, such as the reserved VM map entry zone.
  */
-__attribute__((noreturn))
+__dead2
 static void
 zone_replenish_thread(zone_t z)
 {
@@ -2576,6 +2604,10 @@ zone_replenish_thread(zone_t z)
                                zflags |= KMA_NOENCRYPT;
                        }
 
+                       if (z->clear_memory) {
+                               zflags |= KMA_ZERO;
+                       }
+
                        /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
                        if (is_zone_map_nearing_exhaustion()) {
                                thread_wakeup((event_t) &vm_pageout_garbage_collect);
@@ -2655,6 +2687,7 @@ zdestroy(zone_t z)
         */
        z->zone_valid = FALSE;
 #endif
+       z->zone_destruction = TRUE;
        unlock_zone(z);
 
 #if CONFIG_ZCACHE
@@ -2796,10 +2829,12 @@ zcram(
                assert((zone->allows_foreign == TRUE) && (zone->elem_size <= (PAGE_SIZE - sizeof(struct zone_page_metadata))));
        }
 
+#if DEBUG || DEVELOPMENT
        if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) {
                kprintf("zcram(%p[%s], 0x%lx%s, 0x%lx)\n", zone, zone->zone_name,
                    (unsigned long)newmem, from_zm ? "" : "[F]", (unsigned long)size);
        }
+#endif /* DEBUG || DEVELOPMENT */
 
        ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE));
 
@@ -2878,6 +2913,11 @@ zfill(
        vm_size_t alloc_size = zone->alloc_size;
        vm_size_t elem_per_alloc = alloc_size / zone->elem_size;
        vm_size_t nalloc = (nelem + elem_per_alloc - 1) / elem_per_alloc;
+       int zflags = KMA_KOBJECT;
+
+       if (zone->clear_memory) {
+               zflags |= KMA_ZERO;
+       }
 
        /* Don't mix-and-match zfill with foreign memory */
        assert(!zone->allows_foreign);
@@ -2887,7 +2927,7 @@ zfill(
                thread_wakeup((event_t) &vm_pageout_garbage_collect);
        }
 
-       kr = kernel_memory_allocate(zone_map, &memory, nalloc * alloc_size, 0, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
+       kr = kernel_memory_allocate(zone_map, &memory, nalloc * alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
        if (kr != KERN_SUCCESS) {
                printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
                    __func__, (unsigned long)(nalloc * alloc_size));
@@ -2911,9 +2951,11 @@ zone_bootstrap(void)
 {
        char temp_buf[16];
 
+#if DEBUG || DEVELOPMENT
        if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug))) {
                zalloc_debug = 0;
        }
+#endif /* DEBUG || DEVELOPMENT */
 
        /* Set up zone element poisoning */
        zp_init();
@@ -3204,6 +3246,18 @@ zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr)
        }
 }
 
+/*
+ * When deleting page mappings from the kernel map, it might be necessary to split
+ * apart an existing vm_map_entry. That means that a "free" operation will need to
+ * *allocate* new vm_map_entry structures before it can free a page.
+ *
+ * This reserve is the number of elements held back from everyone except the
+ * zone_gc thread, so that the zone_gc thread never has to wait on the zone
+ * replenish thread for vm_map_entry structs. If it did, the two could wind up
+ * deadlocked.
+ */
+#define VM_MAP_ENTRY_RESERVE_CNT 8
+
 /*
  *     zalloc returns an element from the specified zone.
  */
@@ -3222,9 +3276,8 @@ zalloc_internal(
        vm_offset_t     addr = 0;
        kern_return_t   retval;
        uintptr_t       zbt[MAX_ZTRACE_DEPTH];  /* used in zone leak logging and zone leak detection */
-       unsigned int            numsaved = 0;
-       boolean_t       zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE;
-       thread_t thr = current_thread();
+       unsigned int    numsaved = 0;
+       thread_t        thr = current_thread();
        boolean_t       check_poison = FALSE;
        boolean_t       set_doing_alloc_with_vm_priv = FALSE;
 
@@ -3268,7 +3321,7 @@ zalloc_internal(
        if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) {
                /* Avoid backtracing twice if zone logging is on */
                if (numsaved == 0) {
-                       zleak_tracedepth = backtrace(zbt, MAX_ZTRACE_DEPTH);
+                       zleak_tracedepth = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
                } else {
                        zleak_tracedepth = numsaved;
                }
@@ -3289,6 +3342,10 @@ zalloc_internal(
 #if KASAN_ZALLOC
                                addr = kasan_fixup_allocated_element_address(zone, addr);
 #endif
+                               if (__improbable(DO_LOGGING(zone) && addr)) {
+                                       btlog_add_entry(zone->zlog_btlog, (void *)addr,
+                                           ZOP_ALLOC, (void **)zbt, numsaved);
+                               }
                                DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
                                return (void *)addr;
                        }
@@ -3299,48 +3356,56 @@ zalloc_internal(
        lock_zone(zone);
        assert(zone->zone_valid);
 
+       /*
+        * Check if we need another thread to replenish the zone.
+        * This is used for elements, like vm_map_entry, which are
+        * needed themselves to implement zalloc().
+        */
        if (zone->async_prio_refill && zone->zone_replenish_thread) {
-               vm_size_t zfreec = (zone->cur_size - (zone->count * zone->elem_size));
-               vm_size_t zrefillwm = zone->prio_refill_watermark * zone->elem_size;
-               zone_replenish_wakeup = (zfreec < zrefillwm);
-               zone_alloc_throttle = (((zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0)) || (zfreec == 0));
-
-               do {
-                       if (zone_replenish_wakeup) {
-                               zone_replenish_wakeups_initiated++;
-                               /* Signal the potentially waiting
-                                * refill thread.
-                                */
-                               thread_wakeup(&zone->zone_replenish_thread);
+               vm_size_t curr_free;
+               vm_size_t refill_level;
+               const vm_size_t reserved_min = VM_MAP_ENTRY_RESERVE_CNT * zone->elem_size;
 
-                               /* We don't want to wait around for zone_replenish_thread to bump up the free count
-                                * if we're in zone_gc(). This keeps us from deadlocking with zone_replenish_thread.
-                                */
-                               if (thr->options & TH_OPT_ZONE_GC) {
-                                       break;
-                               }
+               for (;;) {
+                       curr_free = (zone->cur_size - (zone->count * zone->elem_size));
+                       refill_level = zone->prio_refill_watermark * zone->elem_size;
 
-                               unlock_zone(zone);
-                               /* Scheduling latencies etc. may prevent
-                                * the refill thread from keeping up
-                                * with demand. Throttle consumers
-                                * when we fall below half the
-                                * watermark, unless VM privileged
-                                */
-                               if (zone_alloc_throttle) {
-                                       zone_replenish_throttle_count++;
-                                       assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
-                                       thread_block(THREAD_CONTINUE_NULL);
-                               }
-                               lock_zone(zone);
-                               assert(zone->zone_valid);
+                       /*
+                        * Nothing to do if there are plenty of elements.
+                        */
+                       if (curr_free > refill_level) {
+                               break;
+                       }
+
+                       /*
+                        * Wakeup the replenish thread.
+                        */
+                       zone_replenish_wakeups_initiated++;
+                       thread_wakeup(&zone->zone_replenish_thread);
+
+                       /*
+                        * - still have headroom (more than half the refill amount), or
+                        * - this is a VMPRIV thread and we're still above the reserve, or
+                        * - this is the zone garbage collection thread, which may use the reserve,
+                        * - this is the zone garbage collection thread which may use the reserve
+                        * then we don't have to wait for the replenish thread.
+                        *
+                        * The reserve for the garbage collection thread is to avoid a deadlock
+                        * on the zone_map_lock between the replenish thread and GC thread.
+                        */
+                       if (curr_free > refill_level / 2 ||
+                           ((thr->options & TH_OPT_VMPRIV) && curr_free > reserved_min) ||
+                           (thr->options & TH_OPT_ZONE_GC)) {
+                               break;
                        }
+                       zone_replenish_throttle_count++;
+                       unlock_zone(zone);
+                       assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
+                       thread_block(THREAD_CONTINUE_NULL);
+                       lock_zone(zone);
 
-                       zfreec = (zone->cur_size - (zone->count * zone->elem_size));
-                       zrefillwm = zone->prio_refill_watermark * zone->elem_size;
-                       zone_replenish_wakeup = (zfreec < zrefillwm);
-                       zone_alloc_throttle = (((zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0)) || (zfreec == 0));
-               } while (zone_alloc_throttle == TRUE);
+                       assert(zone->zone_valid);
+               }
        }
 
        if (__probable(addr == 0)) {
@@ -3350,9 +3415,10 @@ zalloc_internal(
        /* If we're here because of zone_gc(), we didn't wait for zone_replenish_thread to finish.
         * So we need to ensure that we did successfully grab an element. And we only need to assert
         * this for zones that have a replenish thread configured (in this case, the Reserved VM map
-        * entries zone).
+        * entries zone). The reserved_min check in the refill loop above should have left us
+        * headroom even though the GC thread didn't wait.
         */
-       if (thr->options & TH_OPT_ZONE_GC && zone->async_prio_refill) {
+       if ((thr->options & TH_OPT_ZONE_GC) && zone->async_prio_refill) {
                assert(addr != 0);
        }
 
@@ -3444,6 +3510,10 @@ zalloc_internal(
                                        zflags |= KMA_NOENCRYPT;
                                }
 
+                               if (zone->clear_memory) {
+                                       zflags |= KMA_ZERO;
+                               }
+
                                /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
                                if (is_zone_map_nearing_exhaustion()) {
                                        thread_wakeup((event_t) &vm_pageout_garbage_collect);
@@ -3573,7 +3643,7 @@ zalloc_internal(
                        unsigned int count, idx;
                        /* Fill element, from tail, with backtrace in reverse order */
                        if (numsaved == 0) {
-                               numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH);
+                               numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
                        }
                        count = (unsigned int)(zone->elem_size / sizeof(uintptr_t));
                        if (count >= numsaved) {
@@ -3976,7 +4046,16 @@ zone_change(
                break;
        case Z_CACHING_ENABLED:
 #if     CONFIG_ZCACHE
-               if (value == TRUE && use_caching) {
+               if (value == TRUE) {
+#if     CONFIG_GZALLOC
+                       /*
+                        * Per cpu zone caching should be
+                        * disabled if gzalloc is enabled.
+                        */
+                       if (gzalloc_enabled()) {
+                               break;
+                       }
+#endif
                        if (zcache_ready()) {
                                zcache_init(zone);
                        } else {
@@ -3985,6 +4064,9 @@ zone_change(
                }
 #endif
                break;
+       case Z_CLEARMEMORY:
+               zone->clear_memory = value;
+               break;
        default:
                panic("Zone_change: Wrong Item Type!");
                /* break; */
@@ -4012,72 +4094,81 @@ zone_free_count(zone_t zone)
        return free_count;
 }
 
-/* Drops the elements in the free queue of a zone. Called by zone_gc() on each zone, and when a zone is zdestroy'ed. */
+/*
+ * Drops (i.e. frees) the elements in the all free pages queue of a zone.
+ * Called by zone_gc() on each zone and when a zone is zdestroy()ed.
+ */
 void
 drop_free_elements(zone_t z)
 {
-       vm_size_t                                       elt_size, size_freed;
-       unsigned int                                                    total_freed_pages = 0;
-       uint64_t                                        old_all_free_count;
-       struct zone_page_metadata       *page_meta;
-       queue_head_t                            page_meta_head;
+       vm_size_t                 elt_size;
+       unsigned int              total_freed_pages = 0;
+       struct zone_page_metadata *page_meta;
+       vm_address_t              free_page_address;
+       vm_size_t                 size_to_free;
 
        lock_zone(z);
-       if (queue_empty(&z->pages.all_free)) {
-               unlock_zone(z);
-               return;
-       }
 
-       /*
-        * Snatch all of the free elements away from the zone.
-        */
        elt_size = z->elem_size;
-       old_all_free_count = z->count_all_free_pages;
-       queue_new_head(&z->pages.all_free, &page_meta_head, struct zone_page_metadata *, pages);
-       queue_init(&z->pages.all_free);
-       z->count_all_free_pages = 0;
-       unlock_zone(z);
 
-       /* Iterate through all elements to find out size and count of elements we snatched */
-       size_freed = 0;
-       queue_iterate(&page_meta_head, page_meta, struct zone_page_metadata *, pages) {
+       while (!queue_empty(&z->pages.all_free)) {
+               page_meta = (struct zone_page_metadata *)queue_first(&z->pages.all_free);
                assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */
-               size_freed += elt_size * page_meta->free_count;
-       }
+               /*
+                * Don't drain zones with async refill to below the refill threshold,
+                * as they need some reserve to function properly.
+                */
+               if (!z->zone_destruction &&
+                   z->async_prio_refill && z->zone_replenish_thread &&
+                   (vm_size_t)(page_meta->free_count - z->countfree) < z->prio_refill_watermark) {
+                       break;
+               }
 
-       /* Update the zone size and free element count */
-       lock_zone(z);
-       z->cur_size -= size_freed;
-       z->countfree -= size_freed / elt_size;
-       unlock_zone(z);
+               (void)dequeue_head(&z->pages.all_free);
+
+               assert(z->countfree >= page_meta->free_count);
+               z->countfree -= page_meta->free_count;
+
+               assert(z->count_all_free_pages >= page_meta->page_count);
+               z->count_all_free_pages -= page_meta->page_count;
+
+               assert(z->cur_size >= page_meta->free_count * elt_size);
+               z->cur_size -= page_meta->free_count * elt_size;
+
+               ZONE_PAGE_COUNT_DECR(z, page_meta->page_count);
+               unlock_zone(z);
 
-       while ((page_meta = (struct zone_page_metadata *)dequeue_head(&page_meta_head)) != NULL) {
-               vm_address_t        free_page_address;
                /* Free the pages for metadata and account for them */
                free_page_address = get_zone_page(page_meta);
-               ZONE_PAGE_COUNT_DECR(z, page_meta->page_count);
                total_freed_pages += page_meta->page_count;
-               old_all_free_count -= page_meta->page_count;
+               size_to_free = page_meta->page_count * PAGE_SIZE;
 #if KASAN_ZALLOC
-               kasan_poison_range(free_page_address, page_meta->page_count * PAGE_SIZE, ASAN_VALID);
+               kasan_poison_range(free_page_address, size_to_free, ASAN_VALID);
 #endif
 #if VM_MAX_TAG_ZONES
                if (z->tags) {
-                       ztMemoryRemove(z, free_page_address, (page_meta->page_count * PAGE_SIZE));
+                       ztMemoryRemove(z, free_page_address, size_to_free);
                }
 #endif /* VM_MAX_TAG_ZONES */
-               kmem_free(zone_map, free_page_address, (page_meta->page_count * PAGE_SIZE));
+               kmem_free(zone_map, free_page_address, size_to_free);
                if (current_thread()->options & TH_OPT_ZONE_GC) {
                        thread_yield_to_preemption();
                }
+               lock_zone(z);
+       }
+       if (z->zone_destruction) {
+               assert(queue_empty(&z->pages.all_free));
+               assert(z->count_all_free_pages == 0);
        }
+       unlock_zone(z);
 
-       /* We freed all the pages from the all_free list for this zone */
-       assert(old_all_free_count == 0);
 
+#if DEBUG || DEVELOPMENT
        if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
-               kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed / elt_size, total_freed_pages);
+               kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name,
+                   (unsigned long)((total_freed_pages * PAGE_SIZE) / elt_size), total_freed_pages);
        }
+#endif /* DEBUG || DEVELOPMENT */
 }
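The locking skeleton of the rewritten loop, with the counter bookkeeping elided: the zone lock is held only long enough to detach one page run and fix up the counters, and is dropped around the slow kmem_free():

lock_zone(z);
while (!queue_empty(&z->pages.all_free)) {
        /* Detach one run; adjust countfree/cur_size/... under the lock. */
        (void)dequeue_head(&z->pages.all_free);
        unlock_zone(z);
        kmem_free(zone_map, free_page_address, size_to_free);
        lock_zone(z);    /* re-acquire and re-check the queue */
}
unlock_zone(z);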
 
 /*     Zone garbage collection
@@ -4113,9 +4204,11 @@ zone_gc(boolean_t consider_jetsams)
        max_zones = num_zones;
        simple_unlock(&all_zones_lock);
 
+#if DEBUG || DEVELOPMENT
        if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
                kprintf("zone_gc() starting...\n");
        }
+#endif /* DEBUG || DEVELOPMENT */
 
        for (i = 0; i < max_zones; i++) {
                z = &(zone_array[i]);
@@ -4710,11 +4803,6 @@ mach_zone_force_gc(
 extern unsigned int stack_total;
 extern unsigned long long stack_allocs;
 
-#if defined(__i386__) || defined (__x86_64__)
-extern unsigned int inuse_ptepages_count;
-extern long long alloc_ptepages_count;
-#endif
-
 zone_t
 zone_find_largest(void)
 {
index 412390316b02eb205d11a5ef35fabd2f931de75a..c5f356ff960ebbec4406ea56754131f933e9e2af 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -122,7 +122,7 @@ struct zone {
        int             countfree;      /* Number of free elements */
        int     count_all_free_pages;  /* Number of pages collectable by GC */
        lck_attr_t      lock_attr;      /* zone lock attribute */
-       decl_lck_mtx_data(, lock)        /* zone lock */
+       decl_lck_mtx_data(, lock);       /* zone lock */
        lck_mtx_ext_t   lock_ext;       /* placeholder for indirect mutex */
        vm_size_t       cur_size;       /* current memory utilization */
        vm_size_t       max_size;       /* how large can this zone grow */
@@ -130,7 +130,7 @@ struct zone {
        vm_size_t       alloc_size;     /* size used for more memory */
        uint64_t        page_count __attribute__((aligned(8)));   /* number of pages used by this zone */
        uint64_t        sum_count;      /* count of allocs (life of zone) */
-       uint32_t
+       uint64_t
        /* boolean_t */ exhaustible        :1,  /* (F) merely return if empty? */
        /* boolean_t */ collectable        :1,  /* (F) garbage collect empty pages */
        /* boolean_t */ expandable         :1,  /* (T) expand zone (with message)? */
@@ -155,7 +155,9 @@ struct zone {
        /* boolean_t */ zone_valid         :1,
        /* boolean_t */ cpu_cache_enable_when_ready  :1,
        /* boolean_t */ cpu_cache_enabled  :1,
-       /* future    */ _reserved          :3;
+       /* boolean_t */ clear_memory       :1,
+       /* boolean_t */ zone_destruction   :1,
+       /* future    */ _reserved          :33;
 
        int             index;          /* index into zone_info arrays for this zone */
        const char      *zone_name;     /* a name for the zone */
@@ -278,6 +280,7 @@ __BEGIN_DECLS
 #define Z_TAGS_ENABLED  11      /* Store tags */
 #endif  /* XNU_KERNEL_PRIVATE */
 #define Z_CACHING_ENABLED 12    /* enable and initialize per-cpu caches for the zone */
+#define Z_CLEARMEMORY 13        /* Use KMA_ZERO on new allocations */
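Hypothetical use of the new flag (the zone name is illustrative): request zeroed backing pages whenever the zone grows.

zone_change(my_zone, Z_CLEARMEMORY, TRUE);  /* new pages allocated with KMA_ZERO */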
 
 #ifdef  XNU_KERNEL_PRIVATE
 
@@ -463,6 +466,15 @@ extern void             zone_change(
 extern void             zdestroy(
        zone_t          zone);
 
+#ifdef XNU_KERNEL_PRIVATE
+
+/* Panic if a pointer is not mapped to the zone specified */
+extern void             zone_require(
+       void *addr,
+       zone_t expected_zone);
+
+#endif /* XNU_KERNEL_PRIVATE */
+
 __END_DECLS
 
 #endif  /* _KERN_ZALLOC_H_ */
index 0ca209fe2dc7315e56748d144ffcd83e8f41dbca..bd0a50dc89927ec1bd76c96ce2f3287642f65883 100644 (file)
@@ -654,13 +654,13 @@ zcache_canary_validate(zone_t zone, void *element)
 
        vm_offset_t primary_value = (*primary ^ (uintptr_t)element);
        if (primary_value != zcache_canary) {
-               panic("Zone cache element was used after free! Element %p was corrupted at beginning; Expected %p but found %p; canary %p",
-                   element, (void *)(zcache_canary ^ (uintptr_t)element), (void *)(*primary), (void *)zcache_canary);
+               panic("Zone cache element was used after free! Element %p was corrupted at beginning; Expected %p but found %p; canary %p; zone %p (%s)",
+                   element, (void *)(zcache_canary ^ (uintptr_t)element), (void *)(*primary), (void *)zcache_canary, zone, zone->zone_name);
        }
 
        vm_offset_t backup_value = (*backup ^ (uintptr_t)element);
        if (backup_value != zcache_canary) {
-               panic("Zone cache element was used after free! Element %p was corrupted at end; Expected %p but found %p; canary %p",
-                   element, (void *)(zcache_canary ^ (uintptr_t)element), (void *)(*backup), (void *)zcache_canary);
+               panic("Zone cache element was used after free! Element %p was corrupted at end; Expected %p but found %p; canary %p; zone %p (%s)",
+                   element, (void *)(zcache_canary ^ (uintptr_t)element), (void *)(*backup), (void *)zcache_canary, zone, zone->zone_name);
        }
 }
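
The check above works because the canary is stored XOR-ed with the element's own address, so both a scribble over the freed element and a canary copied to a different address break the equality. The same scheme as a standalone user-space sketch (the kernel randomizes zcache_canary; the constant here is illustrative):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    static const uintptr_t zcache_canary = 0x5ca1ab1e;  /* illustrative */

    /* On free: stash canary ^ address at the start of the element. */
    static void
    canary_arm(void *element)
    {
            uintptr_t *primary = element;
            *primary = zcache_canary ^ (uintptr_t)element;
    }

    /* On reuse: any write to the freed element breaks the equality. */
    static void
    canary_validate(void *element)
    {
            uintptr_t *primary = element;
            assert((*primary ^ (uintptr_t)element) == zcache_canary);
    }

    int
    main(void)
    {
            void *e = malloc(64);
            canary_arm(e);
            canary_validate(e);     /* passes */
            *(uintptr_t *)e = 0;    /* simulated use-after-free scribble */
            canary_validate(e);     /* trips the assert, as the panic would */
            return 0;
    }
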
index a49df09c0256e282fab85c31da340b762de8580c..624f8c42d47447bc18abdb0cae9e3c89c52b9fee 100644 (file)
@@ -41,7 +41,7 @@ COMP_FILES = ${MIG_KUSRC}
 do_build_all:: $(COMP_FILES)
 
 ${MIG_KUSRC} : kextd_mach.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS}   \
                -user    kextd_mach.c           \
                -header  kextd_mach.h           \
index 2ff723f937b328f7846cef30d30411d638b97f08..90d8e341f36f56242dcc28190150d12d5e4707c6 100644 (file)
@@ -124,7 +124,7 @@ static kern_return_t
 kperf_sample_internal(struct kperf_sample *sbuf,
     struct kperf_context *context,
     unsigned sample_what, unsigned sample_flags,
-    unsigned actionid, uint32_t ucallstack_depth)
+    unsigned actionid, unsigned ucallstack_depth)
 {
        int pended_ucallstack = 0;
        int pended_th_dispatch = 0;
@@ -164,29 +164,24 @@ kperf_sample_internal(struct kperf_sample *sbuf,
        bool is_kernel = (context->cur_pid == 0);
 
        if (actionid && actionid <= actionc) {
-               sbuf->kcallstack.nframes = actionv[actionid - 1].kcallstack_depth;
+               sbuf->kcallstack.kpkc_nframes =
+                   actionv[actionid - 1].kcallstack_depth;
        } else {
-               sbuf->kcallstack.nframes = MAX_CALLSTACK_FRAMES;
+               sbuf->kcallstack.kpkc_nframes = MAX_KCALLSTACK_FRAMES;
        }
 
        if (ucallstack_depth) {
-               sbuf->ucallstack.nframes = ucallstack_depth;
+               sbuf->ucallstack.kpuc_nframes = ucallstack_depth;
        } else {
-               sbuf->ucallstack.nframes = MAX_CALLSTACK_FRAMES;
+               sbuf->ucallstack.kpuc_nframes = MAX_UCALLSTACK_FRAMES;
        }
 
-       sbuf->kcallstack.flags = CALLSTACK_VALID;
-       sbuf->ucallstack.flags = CALLSTACK_VALID;
+       sbuf->kcallstack.kpkc_flags = 0;
+       sbuf->ucallstack.kpuc_flags = 0;
 
-       /* an event occurred. Sample everything and dump it in a
-        * buffer.
-        */
-
-       /* collect data from samplers */
        if (sample_what & SAMPLER_TH_INFO) {
                kperf_thread_info_sample(&sbuf->th_info, context);
 
-               /* See if we should drop idle thread samples */
                if (!(sample_flags & SAMPLE_FLAG_IDLE_THREADS)) {
                        if (sbuf->th_info.kpthi_runmode & 0x40) {
                                on_idle_thread = true;
@@ -223,7 +218,7 @@ kperf_sample_internal(struct kperf_sample *sbuf,
 
                if (sample_flags & SAMPLE_FLAG_PEND_USER) {
                        if (sample_what & SAMPLER_USTACK) {
-                               pended_ucallstack = kperf_ucallstack_pend(context, sbuf->ucallstack.nframes);
+                               pended_ucallstack = kperf_ucallstack_pend(context, sbuf->ucallstack.kpuc_nframes);
                        }
 
                        if (sample_what & SAMPLER_TH_DISPATCH) {
@@ -323,6 +318,9 @@ log_sample:
                }
        }
 
+       if (sample_what & SAMPLER_PMC_CONFIG) {
+               kperf_kpc_config_log(&(sbuf->kpcdata));
+       }
        if (sample_what & SAMPLER_PMC_THREAD) {
                kperf_kpc_thread_log(&(sbuf->kpcdata));
        } else if (sample_what & SAMPLER_PMC_CPU) {
@@ -483,12 +481,12 @@ void
 kperf_ast_set_callstack_depth(thread_t thread, uint32_t depth)
 {
        uint32_t ast_flags = kperf_get_thread_flags(thread);
-       uint32_t existing_callstack_depth = T_KPERF_GET_CALLSTACK_DEPTH(ast_flags);
+       uint32_t existing_callstack_depth =
+           T_KPERF_GET_CALLSTACK_DEPTH(ast_flags);
 
-       if (existing_callstack_depth != depth) {
+       if (existing_callstack_depth < depth) {
                ast_flags &= ~T_KPERF_SET_CALLSTACK_DEPTH(depth);
                ast_flags |= T_KPERF_SET_CALLSTACK_DEPTH(depth);
-
                kperf_set_thread_flags(thread, ast_flags);
        }
 }
@@ -614,8 +612,8 @@ kperf_action_reset(void)
                kperf_action_set_samplers(i + 1, 0);
                kperf_action_set_userdata(i + 1, 0);
                kperf_action_set_filter(i + 1, -1);
-               kperf_action_set_ucallstack_depth(i + 1, MAX_CALLSTACK_FRAMES);
-               kperf_action_set_kcallstack_depth(i + 1, MAX_CALLSTACK_FRAMES);
+               kperf_action_set_ucallstack_depth(i + 1, MAX_UCALLSTACK_FRAMES);
+               kperf_action_set_kcallstack_depth(i + 1, MAX_KCALLSTACK_FRAMES);
        }
 }
 
@@ -667,8 +665,8 @@ kperf_action_set_count(unsigned count)
 
        for (unsigned int i = old_count; i < count; i++) {
                new_actionv[i].pid_filter = -1;
-               new_actionv[i].ucallstack_depth = MAX_CALLSTACK_FRAMES;
-               new_actionv[i].kcallstack_depth = MAX_CALLSTACK_FRAMES;
+               new_actionv[i].ucallstack_depth = MAX_UCALLSTACK_FRAMES;
+               new_actionv[i].kcallstack_depth = MAX_KCALLSTACK_FRAMES;
        }
 
        actionv = new_actionv;
@@ -688,7 +686,7 @@ kperf_action_set_ucallstack_depth(unsigned action_id, uint32_t depth)
                return EINVAL;
        }
 
-       if (depth > MAX_CALLSTACK_FRAMES) {
+       if (depth > MAX_UCALLSTACK_FRAMES) {
                return EINVAL;
        }
 
@@ -704,7 +702,7 @@ kperf_action_set_kcallstack_depth(unsigned action_id, uint32_t depth)
                return EINVAL;
        }
 
-       if (depth > MAX_CALLSTACK_FRAMES) {
+       if (depth > MAX_KCALLSTACK_FRAMES) {
                return EINVAL;
        }
 
@@ -723,7 +721,7 @@ kperf_action_get_ucallstack_depth(unsigned action_id, uint32_t * depth_out)
        assert(depth_out);
 
        if (action_id == 0) {
-               *depth_out = MAX_CALLSTACK_FRAMES;
+               *depth_out = MAX_UCALLSTACK_FRAMES;
        } else {
                *depth_out = actionv[action_id - 1].ucallstack_depth;
        }
@@ -741,7 +739,7 @@ kperf_action_get_kcallstack_depth(unsigned action_id, uint32_t * depth_out)
        assert(depth_out);
 
        if (action_id == 0) {
-               *depth_out = MAX_CALLSTACK_FRAMES;
+               *depth_out = MAX_KCALLSTACK_FRAMES;
        } else {
                *depth_out = actionv[action_id - 1].kcallstack_depth;
        }
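
Two behavioral points in this file beyond the renames: user and kernel stacks now have separate caps (MAX_UCALLSTACK_FRAMES vs. MAX_KCALLSTACK_FRAMES), and kperf_ast_set_callstack_depth() changed its guard from != to <, so a shallower concurrent request no longer clobbers a deeper depth already pended on the thread. The new rule, distilled into a sketch:

    /* Sketch of the "keep the deepest request" rule introduced above. */
    static unsigned int
    merge_callstack_depth(unsigned int existing, unsigned int requested)
    {
            /* the old code updated whenever existing != requested; the
             * new code only ever grows the pended depth */
            return existing < requested ? requested : existing;
    }
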
index 228cd9fe00f9fb2480f7040aed7d7c9d65a4769f..d6f0fb9a9deaab402ea4904e78de49581a32453d 100644 (file)
 #endif
 
 static void
-callstack_fixup_user(struct callstack *cs, thread_t thread)
+callstack_fixup_user(struct kp_ucallstack *cs, thread_t thread)
 {
        uint64_t fixup_val = 0;
-       assert(cs->nframes < MAX_CALLSTACK_FRAMES);
+       assert(cs->kpuc_nframes < MAX_UCALLSTACK_FRAMES);
 
 #if defined(__x86_64__)
        user_addr_t sp_user;
@@ -83,7 +83,7 @@ callstack_fixup_user(struct callstack *cs, thread_t thread)
 
        /* encode thumb mode into low bit of PC */
        if (get_saved_state_cpsr(state) & PSR_TF) {
-               cs->frames[0] |= 1ULL;
+               cs->kpuc_frames[0] |= 1ULL;
        }
 
        fixup_val = get_saved_state_lr(state);
@@ -93,7 +93,7 @@ callstack_fixup_user(struct callstack *cs, thread_t thread)
 #endif
 
 out:
-       cs->frames[cs->nframes++] = fixup_val;
+       cs->kpuc_frames[cs->kpuc_nframes++] = fixup_val;
 }
 
 #if defined(__x86_64__)
@@ -186,10 +186,10 @@ interrupted_kernel_lr(uintptr_t *lr)
 
 
 static void
-callstack_fixup_interrupted(struct callstack *cs)
+callstack_fixup_interrupted(struct kp_kcallstack *cs)
 {
        uintptr_t fixup_val = 0;
-       assert(cs->nframes < MAX_CALLSTACK_FRAMES);
+       assert(cs->kpkc_nframes < MAX_KCALLSTACK_FRAMES);
 
        /*
         * Only provide arbitrary data on development or debug kernels.
@@ -202,12 +202,12 @@ callstack_fixup_interrupted(struct callstack *cs)
 #endif /* defined(__x86_64__) */
 #endif /* DEVELOPMENT || DEBUG */
 
-       assert(cs->flags & CALLSTACK_KERNEL);
-       cs->frames[cs->nframes++] = fixup_val;
+       assert(cs->kpkc_flags & CALLSTACK_KERNEL);
+       cs->kpkc_frames[cs->kpkc_nframes++] = fixup_val;
 }
 
 void
-kperf_continuation_sample(struct callstack *cs, struct kperf_context *context)
+kperf_continuation_sample(struct kp_kcallstack *cs, struct kperf_context *context)
 {
        thread_t thread;
 
@@ -218,42 +218,46 @@ kperf_continuation_sample(struct callstack *cs, struct kperf_context *context)
        assert(thread != NULL);
        assert(thread->continuation != NULL);
 
-       cs->flags = CALLSTACK_CONTINUATION | CALLSTACK_VALID | CALLSTACK_KERNEL;
+       cs->kpkc_flags = CALLSTACK_CONTINUATION | CALLSTACK_VALID | CALLSTACK_KERNEL;
 #ifdef __LP64__
-       cs->flags |= CALLSTACK_64BIT;
+       cs->kpkc_flags |= CALLSTACK_64BIT;
 #endif
 
-       cs->nframes = 1;
-       cs->frames[0] = VM_KERNEL_UNSLIDE(thread->continuation);
+       cs->kpkc_nframes = 1;
+       cs->kpkc_frames[0] = VM_KERNEL_UNSLIDE(thread->continuation);
 }
 
 void
-kperf_backtrace_sample(struct callstack *cs, struct kperf_context *context)
+kperf_backtrace_sample(struct kp_kcallstack *cs, struct kperf_context *context)
 {
        assert(cs != NULL);
        assert(context != NULL);
        assert(context->cur_thread == current_thread());
 
-       cs->flags = CALLSTACK_KERNEL | CALLSTACK_KERNEL_WORDS;
+       cs->kpkc_flags = CALLSTACK_KERNEL | CALLSTACK_KERNEL_WORDS;
 #ifdef __LP64__
-       cs->flags |= CALLSTACK_64BIT;
+       cs->kpkc_flags |= CALLSTACK_64BIT;
 #endif
 
        BUF_VERB(PERF_CS_BACKTRACE | DBG_FUNC_START, 1);
 
-       cs->nframes = backtrace_frame((uintptr_t *)&(cs->frames), cs->nframes - 1,
-           context->starting_fp);
-       if (cs->nframes > 0) {
-               cs->flags |= CALLSTACK_VALID;
+       bool trunc = false;
+       cs->kpkc_nframes = backtrace_frame(cs->kpkc_word_frames,
+           cs->kpkc_nframes - 1, context->starting_fp, &trunc);
+       if (cs->kpkc_nframes > 0) {
+               cs->kpkc_flags |= CALLSTACK_VALID;
                /*
                 * Fake the value pointed to by the stack pointer or the link
                 * register for symbolicators.
                 */
-               cs->frames[cs->nframes + 1] = 0;
-               cs->nframes += 1;
+               cs->kpkc_word_frames[cs->kpkc_nframes + 1] = 0;
+               cs->kpkc_nframes += 1;
+       }
+       if (trunc) {
+               cs->kpkc_flags |= CALLSTACK_TRUNCATED;
        }
 
-       BUF_VERB(PERF_CS_BACKTRACE | DBG_FUNC_END, cs->nframes);
+       BUF_VERB(PERF_CS_BACKTRACE | DBG_FUNC_END, cs->kpkc_nframes);
 }
 
 kern_return_t chudxnu_thread_get_callstack64_kperf(thread_t thread,
@@ -261,96 +265,96 @@ kern_return_t chudxnu_thread_get_callstack64_kperf(thread_t thread,
     boolean_t user_only);
 
 void
-kperf_kcallstack_sample(struct callstack *cs, struct kperf_context *context)
+kperf_kcallstack_sample(struct kp_kcallstack *cs, struct kperf_context *context)
 {
        thread_t thread;
 
        assert(cs != NULL);
        assert(context != NULL);
-       assert(cs->nframes <= MAX_CALLSTACK_FRAMES);
+       assert(cs->kpkc_nframes <= MAX_KCALLSTACK_FRAMES);
 
        thread = context->cur_thread;
        assert(thread != NULL);
 
        BUF_INFO(PERF_CS_KSAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread),
-           cs->nframes);
-
-       cs->flags = CALLSTACK_KERNEL;
+           cs->kpkc_nframes);
 
+       cs->kpkc_flags = CALLSTACK_KERNEL;
 #ifdef __LP64__
-       cs->flags |= CALLSTACK_64BIT;
+       cs->kpkc_flags |= CALLSTACK_64BIT;
 #endif
 
        if (ml_at_interrupt_context()) {
                assert(thread == current_thread());
-               cs->flags |= CALLSTACK_KERNEL_WORDS;
-               cs->nframes = backtrace_interrupted((uintptr_t *)cs->frames,
-                   cs->nframes - 1);
-               if (cs->nframes != 0) {
+               cs->kpkc_flags |= CALLSTACK_KERNEL_WORDS;
+               bool trunc = false;
+               cs->kpkc_nframes = backtrace_interrupted(
+                   cs->kpkc_word_frames, cs->kpkc_nframes - 1, &trunc);
+               if (cs->kpkc_nframes != 0) {
                        callstack_fixup_interrupted(cs);
                }
+               if (trunc) {
+                       cs->kpkc_flags |= CALLSTACK_TRUNCATED;
+               }
        } else {
                /*
                 * Rely on legacy CHUD backtracer to backtrace kernel stacks on
                 * other threads.
                 */
                kern_return_t kr;
-               kr = chudxnu_thread_get_callstack64_kperf(thread, cs->frames,
-                   &cs->nframes, FALSE);
+               kr = chudxnu_thread_get_callstack64_kperf(thread,
+                   cs->kpkc_frames, &cs->kpkc_nframes, FALSE);
                if (kr == KERN_SUCCESS) {
-                       cs->flags |= CALLSTACK_VALID;
+                       cs->kpkc_flags |= CALLSTACK_VALID;
                } else if (kr == KERN_RESOURCE_SHORTAGE) {
-                       cs->flags |= CALLSTACK_VALID;
-                       cs->flags |= CALLSTACK_TRUNCATED;
+                       cs->kpkc_flags |= CALLSTACK_VALID;
+                       cs->kpkc_flags |= CALLSTACK_TRUNCATED;
                } else {
-                       cs->nframes = 0;
+                       cs->kpkc_nframes = 0;
                }
        }
 
-       if (cs->nframes == 0) {
+       if (!(cs->kpkc_flags & CALLSTACK_VALID)) {
                BUF_INFO(PERF_CS_ERROR, ERR_GETSTACK);
        }
 
-       BUF_INFO(PERF_CS_KSAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread), cs->flags, cs->nframes);
+       BUF_INFO(PERF_CS_KSAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread),
+           cs->kpkc_flags, cs->kpkc_nframes);
 }
 
 void
-kperf_ucallstack_sample(struct callstack *cs, struct kperf_context *context)
+kperf_ucallstack_sample(struct kp_ucallstack *cs, struct kperf_context *context)
 {
-       thread_t thread;
-       bool user_64 = false;
-       int err;
-
-       assert(cs != NULL);
-       assert(context != NULL);
-       assert(cs->nframes <= MAX_CALLSTACK_FRAMES);
        assert(ml_get_interrupts_enabled() == TRUE);
 
-       thread = context->cur_thread;
+       thread_t thread = context->cur_thread;
        assert(thread != NULL);
 
-       BUF_INFO(PERF_CS_USAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread),
-           cs->nframes);
+       BUF_INFO(PERF_CS_USAMPLE | DBG_FUNC_START,
+           (uintptr_t)thread_tid(thread), cs->kpuc_nframes);
 
-       cs->flags = 0;
-
-       err = backtrace_thread_user(thread, (uintptr_t *)cs->frames,
-           cs->nframes - 1, &cs->nframes, &user_64);
-       cs->flags |= CALLSTACK_KERNEL_WORDS;
-       if (user_64) {
-               cs->flags |= CALLSTACK_64BIT;
+       bool user64 = false;
+       bool trunc = false;
+       int err = backtrace_thread_user(thread, cs->kpuc_frames,
+           cs->kpuc_nframes - 1, &cs->kpuc_nframes, &user64, &trunc);
+       cs->kpuc_flags = CALLSTACK_KERNEL_WORDS;
+       if (user64) {
+               cs->kpuc_flags |= CALLSTACK_64BIT;
+       }
+       if (trunc) {
+               cs->kpuc_flags |= CALLSTACK_TRUNCATED;
        }
 
        if (!err || err == EFAULT) {
                callstack_fixup_user(cs, thread);
-               cs->flags |= CALLSTACK_VALID;
+               cs->kpuc_flags |= CALLSTACK_VALID;
        } else {
-               cs->nframes = 0;
+               cs->kpuc_nframes = 0;
                BUF_INFO(PERF_CS_ERROR, ERR_GETSTACK, err);
        }
 
        BUF_INFO(PERF_CS_USAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread),
-           cs->flags, cs->nframes);
+           cs->kpuc_flags, cs->kpuc_nframes);
 }
 
 static inline uintptr_t
@@ -378,38 +382,36 @@ scrub_frame(uint64_t *bt, int n_frames, int frame)
 }
 
 static void
-callstack_log(struct callstack *cs, uint32_t hcode, uint32_t dcode)
+callstack_log(uint32_t hdrid, uint32_t dataid, void *vframes,
+    unsigned int nframes, unsigned int flags)
 {
-       BUF_VERB(PERF_CS_LOG | DBG_FUNC_START, cs->flags, cs->nframes);
+       BUF_VERB(PERF_CS_LOG | DBG_FUNC_START, flags, nframes);
 
-       /* framing information for the stack */
-       BUF_DATA(hcode, cs->flags, cs->nframes);
+       BUF_DATA(hdrid, flags, nframes);
 
-       /* how many batches of 4 */
-       unsigned int nframes = cs->nframes;
-       unsigned int n = nframes / 4;
+       unsigned int nevts = nframes / 4;
        unsigned int ovf = nframes % 4;
        if (ovf != 0) {
-               n++;
+               nevts++;
        }
 
-       bool kern = cs->flags & CALLSTACK_KERNEL;
+       bool kern = flags & CALLSTACK_KERNEL;
 
-       if (cs->flags & CALLSTACK_KERNEL_WORDS) {
-               uintptr_t *frames = (uintptr_t *)cs->frames;
-               for (unsigned int i = 0; i < n; i++) {
+       if (flags & CALLSTACK_KERNEL_WORDS) {
+               uintptr_t *frames = vframes;
+               for (unsigned int i = 0; i < nevts; i++) {
                        unsigned int j = i * 4;
-                       BUF_DATA(dcode,
+                       BUF_DATA(dataid,
                            scrub_word(frames, nframes, j + 0, kern),
                            scrub_word(frames, nframes, j + 1, kern),
                            scrub_word(frames, nframes, j + 2, kern),
                            scrub_word(frames, nframes, j + 3, kern));
                }
        } else {
-               for (unsigned int i = 0; i < n; i++) {
-                       uint64_t *frames = cs->frames;
+               for (unsigned int i = 0; i < nevts; i++) {
+                       uint64_t *frames = vframes;
                        unsigned int j = i * 4;
-                       BUF_DATA(dcode,
+                       BUF_DATA(dataid,
                            scrub_frame(frames, nframes, j + 0),
                            scrub_frame(frames, nframes, j + 1),
                            scrub_frame(frames, nframes, j + 2),
@@ -417,19 +419,21 @@ callstack_log(struct callstack *cs, uint32_t hcode, uint32_t dcode)
                }
        }
 
-       BUF_VERB(PERF_CS_LOG | DBG_FUNC_END, cs->flags, cs->nframes);
+       BUF_VERB(PERF_CS_LOG | DBG_FUNC_END, flags, nframes);
 }
 
 void
-kperf_kcallstack_log( struct callstack *cs )
+kperf_kcallstack_log(struct kp_kcallstack *cs)
 {
-       callstack_log(cs, PERF_CS_KHDR, PERF_CS_KDATA);
+       callstack_log(PERF_CS_KHDR, PERF_CS_KDATA, cs->kpkc_frames,
+           cs->kpkc_nframes, cs->kpkc_flags);
 }
 
 void
-kperf_ucallstack_log( struct callstack *cs )
+kperf_ucallstack_log(struct kp_ucallstack *cs)
 {
-       callstack_log(cs, PERF_CS_UHDR, PERF_CS_UDATA);
+       callstack_log(PERF_CS_UHDR, PERF_CS_UDATA, cs->kpuc_frames,
+           cs->kpuc_nframes, cs->kpuc_flags);
 }
 
 int
@@ -662,6 +666,9 @@ chudxnu_thread_get_callstack64_kperf(
 }
 #elif __arm64__
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif
 
 // chudxnu_thread_get_callstack gathers a raw callstack along with any information needed to
 // fix it up later (in case we stopped program as it was saving values into prev stack frame, etc.)
@@ -789,7 +796,12 @@ chudxnu_thread_get_callstack64_internal(
                                            (vm_offset_t)fp,
                                            (vm_size_t)sizeof(frame));
                                        if (kr == KERN_SUCCESS) {
+#if defined(HAS_APPLE_PAC)
+                                               /* return addresses on stack will be signed by arm64e ABI */
+                                               pc = (uint64_t)ptrauth_strip((void *)frame[1], ptrauth_key_return_address);
+#else
                                                pc = frame[1];
+#endif
                                                nextFramePointer = (uint64_t *)frame[0];
                                        } else {
                                                pc = 0ULL;
@@ -803,7 +815,12 @@ chudxnu_thread_get_callstack64_internal(
                                    (vm_offset_t)fp,
                                    (vm_size_t)sizeof(frame));
                                if (kr == KERN_SUCCESS) {
+#if defined(HAS_APPLE_PAC)
+                                       /* return addresses on stack will be signed by arm64e ABI */
+                                       pc = (uint64_t)ptrauth_strip((void *)frame[1], ptrauth_key_return_address);
+#else
                                        pc = frame[1];
+#endif
                                        nextFramePointer = (uint64_t *)(frame[0]);
                                } else {
                                        pc = 0ULL;
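
callstack_log(), in the generic form it takes above, emits frames four per trace event and relies on the scrub helpers to bound-check each index so the final, partial event is padded; the kernel-side scrubbers also unslide addresses. The batching arithmetic as a standalone sketch, with printf standing in for BUF_DATA:

    #include <stdio.h>

    /* Bound-check like scrub_word()/scrub_frame(): pad past nframes. */
    static unsigned long
    scrub(const unsigned long *frames, unsigned int nframes, unsigned int i)
    {
            return i < nframes ? frames[i] : 0;
    }

    static void
    log_frames(const unsigned long *frames, unsigned int nframes)
    {
            unsigned int nevts = nframes / 4 + (nframes % 4 != 0);
            for (unsigned int i = 0; i < nevts; i++) {
                    unsigned int j = i * 4;
                    printf("event: %#lx %#lx %#lx %#lx\n",
                        scrub(frames, nframes, j + 0),
                        scrub(frames, nframes, j + 1),
                        scrub(frames, nframes, j + 2),
                        scrub(frames, nframes, j + 3));
            }
    }

    int
    main(void)
    {
            unsigned long bt[6] = { 1, 2, 3, 4, 5, 6 };
            log_frames(bt, 6);  /* two events; the second padded with zeros */
            return 0;
    }
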
index 76d442ced52b7810145e06a3d683b56e4a18edb5..a144a8b952bcfa1cd0890ae19370f67bc0f88cd4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2011-2019 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -29,7 +29,8 @@
 #ifndef KPERF_CALLSTACK_H
 #define KPERF_CALLSTACK_H
 
-#define MAX_CALLSTACK_FRAMES (128)
+#define MAX_KCALLSTACK_FRAMES (128)
+#define MAX_UCALLSTACK_FRAMES (256)
 
 /* the callstack contains valid data */
 #define CALLSTACK_VALID        (1U << 0)
 /* the frames field is filled with uintptr_t, not uint64_t */
 #define CALLSTACK_KERNEL_WORDS (1U << 6)
 
-struct callstack {
-       uint32_t flags;
-       uint32_t nframes;
-       /* WARNING this can be uintptr_t instead if CALLSTACK_KERNEL_WORDS is set */
-       uint64_t frames[MAX_CALLSTACK_FRAMES];
+struct kp_ucallstack {
+       uint32_t kpuc_flags;
+       uint32_t kpuc_nframes;
+       uintptr_t kpuc_frames[MAX_UCALLSTACK_FRAMES];
+};
+
+struct kp_kcallstack {
+       uint32_t kpkc_flags;
+       uint32_t kpkc_nframes;
+       union {
+               uintptr_t kpkc_word_frames[MAX_KCALLSTACK_FRAMES];
+               uint64_t kpkc_frames[MAX_KCALLSTACK_FRAMES];
+       };
 };
 
 struct kperf_context;
 
-void kperf_kcallstack_sample(struct callstack *cs, struct kperf_context *);
-void kperf_kcallstack_log(struct callstack *cs);
-void kperf_continuation_sample(struct callstack *cs, struct kperf_context *);
-void kperf_backtrace_sample(struct callstack *cs, struct kperf_context *context);
+void kperf_kcallstack_sample(struct kp_kcallstack *cs, struct kperf_context *);
+void kperf_kcallstack_log(struct kp_kcallstack *cs);
+void kperf_continuation_sample(struct kp_kcallstack *cs, struct kperf_context *);
+void kperf_backtrace_sample(struct kp_kcallstack *cs, struct kperf_context *context);
 
-void kperf_ucallstack_sample(struct callstack *cs, struct kperf_context *);
+void kperf_ucallstack_sample(struct kp_ucallstack *cs, struct kperf_context *);
 int kperf_ucallstack_pend(struct kperf_context *, uint32_t depth);
-void kperf_ucallstack_log(struct callstack *cs);
+void kperf_ucallstack_log(struct kp_ucallstack *cs);
 
 #endif /* !defined(KPERF_CALLSTACK_H) */
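
The union in kp_kcallstack means one buffer holds either machine words (when CALLSTACK_KERNEL_WORDS is set, as backtrace_frame() fills them) or uint64_t entries from the legacy CHUD path. A consumer-side sketch of dispatching on the flag; the struct is a local copy for illustration:

    #include <stdint.h>

    #define CALLSTACK_KERNEL_WORDS (1U << 6)    /* from this header */

    struct kcallstack_like {
            uint32_t flags;
            uint32_t nframes;
            union {
                    uintptr_t word_frames[128];
                    uint64_t  frames[128];
            };
    };

    static uint64_t
    frame_at(const struct kcallstack_like *cs, uint32_t i)
    {
            return (cs->flags & CALLSTACK_KERNEL_WORDS)
                ? (uint64_t)cs->word_frames[i]  /* native-word backtraces */
                : cs->frames[i];                /* legacy 64-bit entries */
    }
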
index bd5c582f06cb16ef43d0ca309885c0811ccd4846..17a94be8edb844bc5abeb78f4720514ff0ab6b33 100644 (file)
@@ -342,9 +342,11 @@ kperf_port_to_pid(mach_port_name_t portname)
        if (task == TASK_NULL) {
                return -1;
        }
+
        pid_t pid = task_pid(task);
-       /* drop the ref taken by port_name_to_task */
-       (void)task_deallocate_internal(task);
+
+       os_ref_count_t __assert_only count = task_deallocate_internal(task);
+       assert(count != 0);
 
        return pid;
 }
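
The rewritten drop keeps the count returned by task_deallocate_internal() and asserts it is nonzero: the reference taken by port_name_to_task() can never be the last one while the task is still reachable by name. The pattern as a sketch (os_ref_count_t redefined locally for illustration):

    typedef unsigned int os_ref_count_t;

    /* Release one reference and return the new count, in the style of
     * task_deallocate_internal(); callers assert(count != 0) when they
     * know they cannot be the final holder. */
    static os_ref_count_t
    ref_release(os_ref_count_t *refcnt)
    {
            return --*refcnt;
    }
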
index 26b7b777e02e41565237bf929003a76ca7165d7d..43df937a282071449f217d04c792e2fdf33bcd0b 100644 (file)
@@ -90,7 +90,7 @@ kperf_kpc_cpu_sample(struct kpcdata *kpcd, int sample_config)
        BUF_INFO(PERF_KPC_CPU_SAMPLE | DBG_FUNC_END, kpcd->running, kpcd->counterc);
 }
 
-static void
+void
 kperf_kpc_config_log(const struct kpcdata *kpcd)
 {
        BUF_DATA(PERF_KPC_CONFIG,
@@ -98,64 +98,66 @@ kperf_kpc_config_log(const struct kpcdata *kpcd)
            kpcd->counterc,
            kpc_get_counter_count(KPC_CLASS_FIXED_MASK),
            kpcd->configc);
+
+#if __LP64__
+       unsigned int max = (kpcd->configc + 3) / 4;
+       for (unsigned int i = 0; i < max; i++) {
+               uint32_t flag = (i == 0) ? DBG_FUNC_START : ((i == (max - 1)) ? DBG_FUNC_END : DBG_FUNC_NONE);
+               BUF_DATA(PERF_KPC_CFG_REG | flag,
+                   kpcd->configv[0 + i * 4], kpcd->configv[1 + i * 4],
+                   kpcd->configv[2 + i * 4], kpcd->configv[3 + i * 4]);
+       }
+#else /* __LP64__ */
+       unsigned int max = (kpcd->configc + 1) / 2;
+       for (unsigned int i = 0; i < max; i++) {
+               uint32_t flag = (i == 0) ? DBG_FUNC_START : ((i == (max - 1)) ? DBG_FUNC_END : DBG_FUNC_NONE);
+               BUF_DATA(PERF_KPC_CFG_REG32 | flag,
+                   kpcd->configv[i * 2] >> 32ULL,
+                   kpcd->configv[i * 2] & 0xffffffffULL,
+                   kpcd->configv[i * 2 + 1] >> 32ULL,
+                   kpcd->configv[i * 2 + 1] & 0xffffffffULL);
+       }
+#endif /* !__LP64__ */
 }
 
 static void
 kperf_kpc_log(uint32_t code, uint32_t code32, const struct kpcdata *kpcd)
 {
-       unsigned i;
-
 #if __LP64__
-       (void)code32;
-       /* config registers */
-       for (i = 0; i < ((kpcd->configc + 3) / 4); i++) {
-               BUF_DATA(PERF_KPC_CFG_REG,
-                   kpcd->configv[0 + i * 4],
-                   kpcd->configv[1 + i * 4],
-                   kpcd->configv[2 + i * 4],
-                   kpcd->configv[3 + i * 4]);
-       }
-
+#pragma unused(code32)
+       unsigned int max = (kpcd->counterc + 3) / 4;
        /* and the actual counts with one 64-bit argument each */
-       for (i = 0; i < ((kpcd->counterc + 3) / 4); i++) {
-               BUF_DATA(code,
+       for (unsigned int i = 0; i < max; i++) {
+               uint32_t flag = (i == 0) ? DBG_FUNC_START : ((i == (max - 1)) ? DBG_FUNC_END : DBG_FUNC_NONE);
+               BUF_DATA(code | flag,
                    kpcd->counterv[0 + i * 4],
                    kpcd->counterv[1 + i * 4],
                    kpcd->counterv[2 + i * 4],
                    kpcd->counterv[3 + i * 4]);
        }
-#else
-       (void)code;
-       /* config registers */
-       for (i = 0; i < ((kpcd->configc + 1) / 2); i++) {
-               BUF_DATA(PERF_KPC_CFG_REG32,
-                   (kpcd->configv[0 + i * 2] >> 32ULL),
-                   kpcd->configv[0 + i * 2] & 0xffffffffULL,
-                   (kpcd->configv[1 + i * 2] >> 32ULL),
-                   kpcd->configv[1 + i * 2] & 0xffffffffULL);
-       }
-
+#else /* __LP64__ */
+#pragma unused(code)
+       unsigned int max = (kpcd->counterc + 1) / 2;
        /* and the actual counts with two 32-bit trace arguments each */
-       for (i = 0; i < ((kpcd->counterc + 1) / 2); i++) {
-               BUF_DATA(code32,
+       for (unsigned int i = 0; i < max; i++) {
+               uint32_t flag = (i == 0) ? DBG_FUNC_START : ((i == (max - 1)) ? DBG_FUNC_END : DBG_FUNC_NONE);
+               BUF_DATA(code32 | flag,
                    (kpcd->counterv[0 + i * 2] >> 32ULL),
                    kpcd->counterv[0 + i * 2] & 0xffffffffULL,
                    (kpcd->counterv[1 + i * 2] >> 32ULL),
                    kpcd->counterv[1 + i * 2] & 0xffffffffULL);
        }
-#endif
+#endif /* !__LP64__ */
 }
 
 void
 kperf_kpc_cpu_log(const struct kpcdata *kpcd)
 {
-       kperf_kpc_config_log(kpcd);
        kperf_kpc_log(PERF_KPC_DATA, PERF_KPC_DATA32, kpcd);
 }
 
 void
 kperf_kpc_thread_log(const struct kpcdata *kpcd)
 {
-       kperf_kpc_config_log(kpcd);
        kperf_kpc_log(PERF_KPC_DATA_THREAD, PERF_KPC_DATA_THREAD32, kpcd);
 }
index 9b5d58b71b91e1c3bef65645c1e1871e3f4a15e7..a65fe14fdd1bd854de1a20f3383ef0d5e309b443 100644 (file)
@@ -49,5 +49,6 @@ void kperf_kpc_thread_sample(struct kpcdata *, int);
 void kperf_kpc_cpu_sample(struct kpcdata *, int);
 void kperf_kpc_thread_log(const struct kpcdata *);
 void kperf_kpc_cpu_log(const struct kpcdata *);
+void kperf_kpc_config_log(const struct kpcdata *);
 
 #endif /* __KPERF_KPC_H__ */
index 7d1c478baf0bdf77ed8ce52ff65c6741fc8af90b..a6287f39c7eafcec13f800f8218eded62d6acbe2 100644 (file)
@@ -177,7 +177,7 @@ kperf_timer_handler(void *param0, __unused void *param1)
 
        uint32_t actionid = KPERF_TMR_ACTION(action_state);
        if (actionid == 0) {
-               return;
+               goto deactivate;
        }
 
 #if DEVELOPMENT || DEBUG
index 35e186ebbd94189d1935c2c126514416c1fa1492..9af5ba5b55d67255e0d8d365a680a7b3bab64b10 100644 (file)
@@ -43,8 +43,8 @@ struct kperf_sample {
 
        struct kperf_task_snapshot tk_snapshot;
 
-       struct callstack   kcallstack;
-       struct callstack   ucallstack;
+       struct kp_kcallstack kcallstack;
+       struct kp_ucallstack ucallstack;
        struct meminfo     meminfo;
 
 #if KPC
index 91ebb502648df36a9d7a4dcaafa4ba804ccd6c34..901e500f750dd8b775e1c45e95970f08db7ff551 100644 (file)
@@ -159,8 +159,10 @@ kperf_thread_scheduling_sample(struct kperf_thread_scheduling *thsc,
        thsc->kpthsc_requested_qos_override = MAX(thread->requested_policy.thrp_qos_override,
            thread->requested_policy.thrp_qos_workq_override);
        thsc->kpthsc_requested_qos_promote = thread->requested_policy.thrp_qos_promote;
-       thsc->kpthsc_requested_qos_ipc_override = thread->requested_policy.thrp_qos_ipc_override;
-       thsc->kpthsc_requested_qos_sync_ipc_override = thread->requested_policy.thrp_qos_sync_ipc_override;
+       thsc->kpthsc_requested_qos_kevent_override = MAX(
+               thread->requested_policy.thrp_qos_kevent_override,
+               thread->requested_policy.thrp_qos_wlsvc_override);
+       thsc->kpthsc_requested_qos_sync_ipc_override = THREAD_QOS_UNSPECIFIED;
        thsc->kpthsc_effective_latency_qos = thread->effective_policy.thep_latency_qos;
 
        BUF_INFO(PERF_TI_SCHEDSAMPLE | DBG_FUNC_END);
@@ -182,8 +184,7 @@ kperf_thread_scheduling_log(struct kperf_thread_scheduling *thsc)
            | thsc->kpthsc_requested_qos_override,
            ((uint64_t)thsc->kpthsc_effective_latency_qos << 61)
            | ((uint64_t)thsc->kpthsc_requested_qos_promote << 58)
-           | ((uint64_t)thsc->kpthsc_requested_qos_ipc_override << 55)
-           | ((uint64_t)thsc->kpthsc_requested_qos_sync_ipc_override << 52)
+           | ((uint64_t)thsc->kpthsc_requested_qos_kevent_override << 55)
            );
        BUF_DATA(PERF_TI_SCHEDDATA_3, thsc->kpthsc_runnable_time);
 #else
@@ -200,8 +201,7 @@ kperf_thread_scheduling_log(struct kperf_thread_scheduling *thsc)
            | thsc->kpthsc_requested_qos_override,
            ((uint32_t)thsc->kpthsc_effective_latency_qos << 29)
            | ((uint32_t)thsc->kpthsc_requested_qos_promote << 26)
-           | ((uint32_t)thsc->kpthsc_requested_qos_ipc_override << 23)
-           | ((uint32_t)thsc->kpthsc_requested_qos_sync_ipc_override << 20)
+           | ((uint32_t)thsc->kpthsc_requested_qos_kevent_override << 23)
            );
        BUF_DATA(PERF_TI_SCHEDDATA3_32, UPPER_32(thsc->kpthsc_runnable_time),
            LOWER_32(thsc->kpthsc_runnable_time));
@@ -353,12 +353,8 @@ kperf_thread_inscyc_log(struct kperf_context *context)
                return;
        }
 
-       uint64_t counts[MT_CORE_NFIXED];
-
-       int ret = mt_fixed_thread_counts(cur_thread, counts);
-       if (ret) {
-               return;
-       }
+       uint64_t counts[MT_CORE_NFIXED] = { 0 };
+       mt_cur_thread_fixed_counts(counts);
 
 #if defined(__LP64__)
        BUF_DATA(PERF_TI_INSCYCDATA, counts[MT_CORE_INSTRS], counts[MT_CORE_CYCLES]);
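
The 64-bit trace word above packs several 3-bit QoS fields by shifting: effective latency QoS at bit 61, the promote value at 58, and the new kevent override at 55. A pack/unpack sketch with illustrative values:

    #include <stdint.h>
    #include <stdio.h>

    /* Shifts mirror PERF_TI_SCHEDDATA_2 above; inputs must fit in 3 bits. */
    static uint64_t
    pack_qos(uint64_t latency, uint64_t promote, uint64_t kevent_override)
    {
            return (latency << 61) | (promote << 58) | (kevent_override << 55);
    }

    static unsigned int
    unpack3(uint64_t word, unsigned int shift)
    {
            return (unsigned int)((word >> shift) & 0x7);
    }

    int
    main(void)
    {
            uint64_t w = pack_qos(2, 5, 3);
            printf("%u %u %u\n", unpack3(w, 61), unpack3(w, 58), unpack3(w, 55));
            return 0;
    }
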
index 9696dfec5995d62846e61a7635238c0c95c36cf1..09a188554d59e2d2b62ca790c0793bab4f549f1c 100644 (file)
@@ -43,6 +43,9 @@ void kperf_thread_info_sample(struct kperf_thread_info *,
     struct kperf_context *);
 void kperf_thread_info_log(struct kperf_thread_info *);
 
+// legacy names
+#define kpthsc_requested_qos_ipc_override kpthsc_requested_qos_kevent_override
+
 /* scheduling information */
 struct kperf_thread_scheduling {
        uint64_t kpthsc_user_time;
@@ -55,8 +58,8 @@ struct kperf_thread_scheduling {
            kpthsc_requested_qos :3,
            kpthsc_requested_qos_override :3,
            kpthsc_requested_qos_promote :3,
-           kpthsc_requested_qos_ipc_override :3,
-           kpthsc_requested_qos_sync_ipc_override :3,
+           kpthsc_requested_qos_kevent_override :3,
+           kpthsc_requested_qos_sync_ipc_override :3,             /* obsolete */
            kpthsc_effective_latency_qos :3;
 };
 
index 2a1b677460dba174f8d07b3f741c718b2bf63f2d..1bcf828cc8959f6175a7ef9cc1abd18716bd868b 100644 (file)
@@ -50,7 +50,11 @@ extern "C" {
 
 #ifndef NULL
 #if defined (__cplusplus)
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#else
 #define NULL 0
+#endif
 #else
 #define NULL ((void *)0)
 #endif
@@ -93,50 +97,85 @@ extern int      strprefix(const char *s1, const char *s2);
 extern int      bcmp(const void *, const void *, size_t);
 extern void     bcopy(const void *, void *, size_t);
 extern void     bzero(void *, size_t);
+extern int      timingsafe_bcmp(const void *b1, const void *b2, size_t n);
 
 #ifdef PRIVATE
 #include <san/memintrinsics.h>
 #endif
 
+#if __has_builtin(__builtin_dynamic_object_size)
+#define XNU_BOS __builtin_dynamic_object_size
+#else
+#define XNU_BOS __builtin_object_size
+#endif
+
+
+/* __nochk_ functions for opting out of type 1 bounds checking */
+__attribute__((always_inline)) static inline void *
+__nochk_memcpy(void *dest, const void *src, size_t len)
+{
+       return __builtin___memcpy_chk(dest, src, len, XNU_BOS(dest, 0));
+}
+__attribute__((always_inline)) static inline void *
+__nochk_memmove(void *dest, const void *src, size_t len)
+{
+       return __builtin___memmove_chk(dest, src, len, XNU_BOS(dest, 0));
+}
+__attribute__((always_inline)) static inline void
+__nochk_bcopy(const void *src, void *dest, size_t len)
+{
+       __builtin___memmove_chk(dest, src, len, XNU_BOS(dest, 0));
+}
+
 #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_13
 /* older deployment target */
 #elif defined(KASAN) || (defined (_FORTIFY_SOURCE) && _FORTIFY_SOURCE == 0)
-/* FORTIFY_SOURCE disabled */
+/* _FORTIFY_SOURCE disabled */
 #else /* _chk macros */
+
+#ifdef XNU_KERNEL_PRIVATE
+/* Stricter checking in xnu than kexts. When type is set to 1, __builtin_object_size
+ * returns the size of the closest surrounding sub-object, which would detect copying past
+ * the end of a struct member. */
+#define BOS_COPY_TYPE 1
+#else
+#define BOS_COPY_TYPE 0
+#endif
+
 #if __has_builtin(__builtin___memcpy_chk)
-#define memcpy(dest, src, len) __builtin___memcpy_chk(dest, src, len, __builtin_object_size(dest, 0))
+#define memcpy(dest, src, len) __builtin___memcpy_chk(dest, src, len, XNU_BOS(dest, BOS_COPY_TYPE))
 #endif
 
 #if __has_builtin(__builtin___memmove_chk)
-#define memmove(dest, src, len) __builtin___memmove_chk(dest, src, len, __builtin_object_size(dest, 0))
+#define memmove(dest, src, len) __builtin___memmove_chk(dest, src, len, XNU_BOS(dest, BOS_COPY_TYPE))
 #endif
 
 #if __has_builtin(__builtin___strncpy_chk)
-#define strncpy(dest, src, len) __builtin___strncpy_chk(dest, src, len, __builtin_object_size(dest, 1))
+#define strncpy(dest, src, len) __builtin___strncpy_chk(dest, src, len, XNU_BOS(dest, 1))
 #endif
 
 #if __has_builtin(__builtin___strncat_chk)
-#define strncat(dest, src, len) __builtin___strncat_chk(dest, src, len, __builtin_object_size(dest, 1))
+#define strncat(dest, src, len) __builtin___strncat_chk(dest, src, len, XNU_BOS(dest, 1))
 #endif
 
 #if __has_builtin(__builtin___strlcat_chk)
-#define strlcat(dest, src, len) __builtin___strlcat_chk(dest, src, len, __builtin_object_size(dest, 1))
+#define strlcat(dest, src, len) __builtin___strlcat_chk(dest, src, len, XNU_BOS(dest, 1))
 #endif
 
 #if __has_builtin(__builtin___strlcpy_chk)
-#define strlcpy(dest, src, len) __builtin___strlcpy_chk(dest, src, len, __builtin_object_size(dest, 1))
+#define strlcpy(dest, src, len) __builtin___strlcpy_chk(dest, src, len, XNU_BOS(dest, 1))
 #endif
 
 #if __has_builtin(__builtin___strcpy_chk)
-#define strcpy(dest, src, len) __builtin___strcpy_chk(dest, src, __builtin_object_size(dest, 1))
+#define strcpy(dest, src, len) __builtin___strcpy_chk(dest, src, XNU_BOS(dest, 1))
 #endif
 
 #if __has_builtin(__builtin___strcat_chk)
-#define strcat(dest, src) __builtin___strcat_chk(dest, src, __builtin_object_size(dest, 1))
+#define strcat(dest, src) __builtin___strcat_chk(dest, src, XNU_BOS(dest, 1))
 #endif
 
 #if __has_builtin(__builtin___memmove_chk)
-#define bcopy(src, dest, len) __builtin___memmove_chk(dest, src, len, __builtin_object_size(dest, 0))
+#define bcopy(src, dest, len) __builtin___memmove_chk(dest, src, len, XNU_BOS(dest, BOS_COPY_TYPE))
 #endif
 
 #endif /* _chk macros */
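
Switching the copy checks to XNU_BOS with BOS_COPY_TYPE == 1 inside xnu means the bound is the closest enclosing sub-object rather than the whole allocation, so a copy running off the end of a struct member is caught even when it stays inside the struct; __builtin_dynamic_object_size additionally folds in sizes only known at run time where the compiler supports it. The type-0 vs. type-1 difference in a standalone program (build with optimizations so the builtin can resolve):

    #include <stdio.h>

    struct pkt {
            char hdr[8];
            char body[56];
    };

    int
    main(void)
    {
            struct pkt p;
            /* A 16-byte memcpy into p.hdr passes a type-0 check (64 >= 16)
             * but fails a type-1 check (8 < 16). */
            printf("bos0=%zu bos1=%zu\n",
                __builtin_object_size(p.hdr, 0),
                __builtin_object_size(p.hdr, 1));   /* prints bos0=64 bos1=8 */
            return 0;
    }
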
index 737206913704b03d47fccdec566302c4c8b21a59..9a9326fc7e7e9882ee3d3204eeefacf4d247e77e 100644 (file)
@@ -99,7 +99,7 @@ typedef volatile unsigned long  vulong_t;
  * Deprecation macro
  */
 #if __GNUC__ >= 3
-#define __deprecated __attribute__((deprecated))
+#define __deprecated    __attribute__((__deprecated__))
 #else
 #define __deprecated /* nothing */
 #endif
index 8ad03c5ebda6fa42bec1646bf7961315edce5dce..a2591e477d70f2e41673e65d27e3493adca38440 100644 (file)
@@ -42,7 +42,7 @@ COMP_FILES = ${MIG_KUSRC}
 do_build_all:: $(COMP_FILES)
 
 ${MIG_KUSRC} : lockd_mach.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS}   \
                -user    lockd_mach.c           \
                -header  lockd_mach.h           \
index e728f0a4f4c0b879dd5ba8d6fdc6f8dc9e1b71d9..310027b65b6c16bce1a9e3575e7c54a297de2dec 100644 (file)
@@ -49,14 +49,20 @@ MIG_DEFS =  \
        thread_act.defs \
        vm_map.defs
 
+MIG_PRIVATE_DEFS = \
+       restartable.defs
+
 MACH_PRIVATE_DEFS = \
        coalition_notification.defs \
+       fairplayd_notification.defs \
+       arcade_upcall.defs \
        ktrace_background.defs \
        mach_notify.defs \
        memory_object_control.defs \
        memory_object_default.defs \
        sysdiagnose_notification.defs \
        upl.defs \
+       vfs_nspace.defs \
        vm32_map.defs
 
 #
@@ -68,12 +74,15 @@ MIG_USHDRS = \
        clock_reply_server.h \
        coalition_notification_server.h \
        exc_server.h \
+       fairplayd_notification_server.h \
+       arcade_upcall_server.h \
        mach_exc_server.h \
        memory_object_default_server.h \
        notify_server.h \
        task_access_server.h \
        telemetry_notification_server.h \
-       sysdiagnose_notification_server.h
+       sysdiagnose_notification_server.h \
+       vfs_nspace_server.h
 
 MIG_UUHDRS = \
        clock.h \
@@ -95,7 +104,8 @@ MIG_UUHDRS = \
        task_access.h \
        thread_act.h \
        upl.h \
-       vm_map.h
+       vm_map.h \
+       vfs_nspace.h
 
 MIGINCLUDES = ${MIG_UUHDRS} ${MIG_USHDRS}
 
@@ -170,6 +180,8 @@ PRIVATE_DATAFILES = \
        bootstrap.h \
        coalition.h \
        coalition_notification.defs \
+       fairplayd_notification.defs \
+       arcade_upcall.defs \
        host_info.h \
        ktrace_background.defs \
        mach_host.defs \
@@ -189,7 +201,9 @@ PRIVATE_DATAFILES = \
        task_policy.h \
        thread_policy.h \
        thread_switch.h \
-       vm_prot.h
+       vfs_nspace.defs \
+       vm_prot.h \
+       ${MIG_PRIVATE_DEFS}
 
 INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES}
 
@@ -222,7 +236,7 @@ ${MIGINCLUDES} : ${MIG_TYPES}
 
 ${MIG_UUHDRS} : \
        %.h : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)$(MIG) $(MIGFLAGS)         \
                -server /dev/null       \
                -user /dev/null         \
@@ -231,7 +245,7 @@ ${MIG_UUHDRS} : \
 
 ${MIG_USHDRS} : \
        %_server.h : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)$(MIG) $(MIGFLAGS)         \
                -server /dev/null       \
                -user /dev/null         \
@@ -275,6 +289,8 @@ MIG_KUSRC = \
        clock_reply_user.c \
        coalition_notification_user.c \
        exc_user.c \
+       fairplayd_notification_user.c \
+       arcade_upcall_user.c \
        host_notify_reply_user.c \
        ktrace_background_user.c \
        mach_exc_user.c \
@@ -286,10 +302,12 @@ MIG_KUSRC = \
        task_access_user.c \
        telemetry_notification_user.c \
        upl_user.c \
+       vfs_nspace_user.c \
        vm_map_user.c \
        sysdiagnose_notification_user.c
 
 MIG_KSHDRS = \
+       arcade_register_server.h \
        clock_server.h \
        clock_priv_server.h \
        exc_server.h \
@@ -308,6 +326,7 @@ MIG_KSHDRS = \
        memory_object_default_server.h \
        processor_server.h \
        processor_set_server.h \
+       restartable_server.h \
        task_server.h \
        thread_act_server.h \
        upl_server.h \
@@ -315,6 +334,7 @@ MIG_KSHDRS = \
        vm32_map_server.h
 
 MIG_KSSRC = \
+       arcade_register_server.c \
        clock_server.c \
        clock_priv_server.c \
        exc_server.c \
@@ -333,6 +353,7 @@ MIG_KSSRC = \
        memory_object_default_server.c \
        processor_server.c \
        processor_set_server.c \
+       restartable_server.c \
        task_server.c \
        thread_act_server.c \
        upl_server.c \
@@ -363,7 +384,7 @@ ${COMP_FILES} : ${MIG_TYPES}
 
 ${MIG_KUSRC} : \
        %_user.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS}        \
                -user    $*_user.c              \
                -header  $*.h              \
@@ -373,7 +394,7 @@ ${MIG_KUSRC} : \
 
 ${MIG_KSSRC}: \
        %_server.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS}        \
                -user    /dev/null              \
                -header  /dev/null              \
diff --git a/osfmk/mach/arcade_register.defs b/osfmk/mach/arcade_register.defs
new file mode 100644 (file)
index 0000000..78f4eec
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ *  Interface definition for the fairplay upcall mechanism.
+ */
+
+subsystem
+#if KERNEL_SERVER
+    KernelServer
+#endif /* KERNEL_SERVER */
+    arcade_register 51471;
+
+#include <mach/std_types.defs>
+#include <mach/mach_types.defs>
+
+routine arcade_register_new_upcall(
+                         arcade_register       : arcade_register_t;
+                         arcade_upcall     : mach_port_t);
+
diff --git a/osfmk/mach/arcade_upcall.defs b/osfmk/mach/arcade_upcall.defs
new file mode 100644 (file)
index 0000000..db724f0
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ *  Interface definition for the fairplay upcall mechanism.
+ */
+
+subsystem
+#if KERNEL_USER
+    KernelUser
+#endif /* KERNEL_USER */
+    arcade_upcall 61471;
+
+#include <mach/std_types.defs>
+#include <mach/mach_types.defs>
+
+routine arcade_upcall(
+                 arcade_upcall     : mach_port_t;
+                 path              : pointer_t;
+                 offset            : uint64_t;
+       out       should_kill       : boolean_t); 
+
+/* vim: set ft=c : */
+
index cc815f80bed536e9e1afbe1a43f3880c2aa749b1..d5f4d864d60dd5114f0ce9983aabbe4d7e01195d 100644 (file)
 #include <machine/types.h> /* __uint32_t */
 
 #if __DARWIN_UNIX03
-#define _STRUCT_ARM_EXCEPTION_STATE     struct __darwin_arm_exception_state
+#define _STRUCT_ARM_EXCEPTION_STATE struct __darwin_arm_exception_state
 _STRUCT_ARM_EXCEPTION_STATE
 {
-       __uint32_t      __exception; /* number of arm exception taken */
-       __uint32_t      __fsr; /* Fault status */
-       __uint32_t      __far; /* Virtual Fault Address */
+       __uint32_t __exception; /* number of arm exception taken */
+       __uint32_t __fsr;       /* Fault status */
+       __uint32_t __far;       /* Virtual Fault Address */
 };
 #else /* !__DARWIN_UNIX03 */
-#define _STRUCT_ARM_EXCEPTION_STATE     struct arm_exception_state
+#define _STRUCT_ARM_EXCEPTION_STATE struct arm_exception_state
 _STRUCT_ARM_EXCEPTION_STATE
 {
-       __uint32_t      exception; /* number of arm exception taken */
-       __uint32_t      fsr; /* Fault status */
-       __uint32_t      far; /* Virtual Fault Address */
+       __uint32_t exception;   /* number of arm exception taken */
+       __uint32_t fsr;         /* Fault status */
+       __uint32_t far;         /* Virtual Fault Address */
 };
 #endif /* __DARWIN_UNIX03 */
 
 #if __DARWIN_UNIX03
-#define _STRUCT_ARM_EXCEPTION_STATE64   struct __darwin_arm_exception_state64
+#define _STRUCT_ARM_EXCEPTION_STATE64 struct __darwin_arm_exception_state64
 _STRUCT_ARM_EXCEPTION_STATE64
 {
-       __uint64_t      __far; /* Virtual Fault Address */
-       __uint32_t      __esr; /* Exception syndrome */
-       __uint32_t      __exception; /* number of arm exception taken */
+       __uint64_t __far;       /* Virtual Fault Address */
+       __uint32_t __esr;       /* Exception syndrome */
+       __uint32_t __exception; /* number of arm exception taken */
 };
 #else /* !__DARWIN_UNIX03 */
-#define _STRUCT_ARM_EXCEPTION_STATE64   struct arm_exception_state64
+#define _STRUCT_ARM_EXCEPTION_STATE64 struct arm_exception_state64
 _STRUCT_ARM_EXCEPTION_STATE64
 {
-       __uint64_t      far; /* Virtual Fault Address */
-       __uint32_t      esr; /* Exception syndrome */
-       __uint32_t      exception; /* number of arm exception taken */
+       __uint64_t far;         /* Virtual Fault Address */
+       __uint32_t esr;         /* Exception syndrome */
+       __uint32_t exception;   /* number of arm exception taken */
 };
 #endif /* __DARWIN_UNIX03 */
 
 #if __DARWIN_UNIX03
-#define _STRUCT_ARM_THREAD_STATE        struct __darwin_arm_thread_state
+#define _STRUCT_ARM_THREAD_STATE struct __darwin_arm_thread_state
 _STRUCT_ARM_THREAD_STATE
 {
-       __uint32_t      __r[13];        /* General purpose register r0-r12 */
-       __uint32_t      __sp;           /* Stack pointer r13 */
-       __uint32_t      __lr;           /* Link register r14 */
-       __uint32_t      __pc;           /* Program counter r15 */
-       __uint32_t      __cpsr;         /* Current program status register */
+       __uint32_t __r[13]; /* General purpose register r0-r12 */
+       __uint32_t __sp;    /* Stack pointer r13 */
+       __uint32_t __lr;    /* Link register r14 */
+       __uint32_t __pc;    /* Program counter r15 */
+       __uint32_t __cpsr;  /* Current program status register */
 };
 #else /* !__DARWIN_UNIX03 */
-#define _STRUCT_ARM_THREAD_STATE        struct arm_thread_state
+#define _STRUCT_ARM_THREAD_STATE struct arm_thread_state
 _STRUCT_ARM_THREAD_STATE
 {
-       __uint32_t      r[13];  /* General purpose register r0-r12 */
-       __uint32_t      sp;             /* Stack pointer r13 */
-       __uint32_t      lr;             /* Link register r14 */
-       __uint32_t      pc;             /* Program counter r15 */
-       __uint32_t      cpsr;           /* Current program status register */
+       __uint32_t r[13];   /* General purpose register r0-r12 */
+       __uint32_t sp;      /* Stack pointer r13 */
+       __uint32_t lr;      /* Link register r14 */
+       __uint32_t pc;      /* Program counter r15 */
+       __uint32_t cpsr;    /* Current program status register */
 };
 #endif /* __DARWIN_UNIX03 */
 
-#if __DARWIN_UNIX03
-#define _STRUCT_ARM_THREAD_STATE64      struct __darwin_arm_thread_state64
-_STRUCT_ARM_THREAD_STATE64
-{
-       __uint64_t    __x[29];  /* General purpose registers x0-x28 */
-       __uint64_t    __fp;             /* Frame pointer x29 */
-       __uint64_t    __lr;             /* Link register x30 */
-       __uint64_t    __sp;             /* Stack pointer x31 */
-       __uint64_t    __pc;             /* Program counter */
-       __uint32_t    __cpsr;   /* Current program status register */
-       __uint32_t    __pad;    /* Same size for 32-bit or 64-bit clients */
-};
-#else /* !__DARWIN_UNIX03 */
+#if defined(KERNEL)
+
+#define __DARWIN_OPAQUE_ARM_THREAD_STATE64 0
+#define __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH 0x1
+#define __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR 0x2
+
 #define _STRUCT_ARM_THREAD_STATE64      struct arm_thread_state64
 _STRUCT_ARM_THREAD_STATE64
 {
@@ -114,92 +107,342 @@ _STRUCT_ARM_THREAD_STATE64
        __uint64_t    sp;               /* Stack pointer x31 */
        __uint64_t    pc;               /* Program counter */
        __uint32_t    cpsr;             /* Current program status register */
-       __uint32_t    __pad;    /* Same size for 32-bit or 64-bit clients */
+       __uint32_t    flags;    /* Flags describing structure format */
 };
+
+#else /* defined(KERNEL) */
+
+/*
+ * By default, the pointer fields in the arm_thread_state64_t structure are
+ * opaque on the arm64e architecture and require the use of accessor macros.
+ * This mode can also be enabled on the arm64 architecture by building with
+ * -D__DARWIN_OPAQUE_ARM_THREAD_STATE64=1.
+ */
+#if defined(__arm64__) && defined(__LP64__)
+
+#if __has_feature(ptrauth_calls)
+#define __DARWIN_OPAQUE_ARM_THREAD_STATE64 1
+#define __DARWIN_PTRAUTH_ARM_THREAD_STATE64 1
+#endif /* __has_feature(ptrauth_calls) */
+
+#ifndef __DARWIN_OPAQUE_ARM_THREAD_STATE64
+#define __DARWIN_OPAQUE_ARM_THREAD_STATE64 0
+#endif
+
+#else /* defined(__arm64__) && defined(__LP64__) */
+
+#undef __DARWIN_OPAQUE_ARM_THREAD_STATE64
+#define __DARWIN_OPAQUE_ARM_THREAD_STATE64 0
+
+#endif /* defined(__arm64__) && defined(__LP64__) */
+
+#if __DARWIN_UNIX03
+#define _STRUCT_ARM_THREAD_STATE64 struct __darwin_arm_thread_state64
+#if __DARWIN_OPAQUE_ARM_THREAD_STATE64
+_STRUCT_ARM_THREAD_STATE64
+{
+       __uint64_t __x[29];     /* General purpose registers x0-x28 */
+       void*      __opaque_fp; /* Frame pointer x29 */
+       void*      __opaque_lr; /* Link register x30 */
+       void*      __opaque_sp; /* Stack pointer x31 */
+       void*      __opaque_pc; /* Program counter */
+       __uint32_t __cpsr;      /* Current program status register */
+       __uint32_t __opaque_flags; /* Flags describing structure format */
+};
+#else /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */
+_STRUCT_ARM_THREAD_STATE64
+{
+       __uint64_t __x[29]; /* General purpose registers x0-x28 */
+       __uint64_t __fp;    /* Frame pointer x29 */
+       __uint64_t __lr;    /* Link register x30 */
+       __uint64_t __sp;    /* Stack pointer x31 */
+       __uint64_t __pc;    /* Program counter */
+       __uint32_t __cpsr;  /* Current program status register */
+       __uint32_t __pad;   /* Same size for 32-bit or 64-bit clients */
+};
+#endif /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */
+#else /* !__DARWIN_UNIX03 */
+#define _STRUCT_ARM_THREAD_STATE64 struct arm_thread_state64
+#if __DARWIN_OPAQUE_ARM_THREAD_STATE64
+_STRUCT_ARM_THREAD_STATE64
+{
+       __uint64_t x[29];       /* General purpose registers x0-x28 */
+       void*      __opaque_fp; /* Frame pointer x29 */
+       void*      __opaque_lr; /* Link register x30 */
+       void*      __opaque_sp; /* Stack pointer x31 */
+       void*      __opaque_pc; /* Program counter */
+       __uint32_t cpsr;        /* Current program status register */
+       __uint32_t __opaque_flags; /* Flags describing structure format */
+};
+#else /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */
+_STRUCT_ARM_THREAD_STATE64
+{
+       __uint64_t x[29]; /* General purpose registers x0-x28 */
+       __uint64_t fp;    /* Frame pointer x29 */
+       __uint64_t lr;    /* Link register x30 */
+       __uint64_t sp;    /* Stack pointer x31 */
+       __uint64_t pc;    /* Program counter */
+       __uint32_t cpsr;  /* Current program status register */
+       __uint32_t __pad; /* Same size for 32-bit or 64-bit clients */
+};
+#endif /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */
 #endif /* __DARWIN_UNIX03 */
-#if !defined(KERNEL)
+
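
Because the opaque fields may hold ptrauth-signed pointers, callers are expected to go through the accessor macros that follow instead of loading __opaque_pc and friends directly. A hedged sketch of reading a thread's PC on arm64; thread_get_state() and the ARM_THREAD_STATE64 flavor are standard Mach API, the helper name is illustrative:

    #include <mach/mach.h>
    #include <stdint.h>

    /* Sketch: fetch and decode the PC of a (suspended) thread. */
    static uintptr_t
    read_thread_pc(thread_act_t th)
    {
            arm_thread_state64_t ts;
            mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT;

            if (thread_get_state(th, ARM_THREAD_STATE64,
                (thread_state_t)&ts, &count) != KERN_SUCCESS) {
                    return 0;
            }
            /* authenticates/strips __opaque_pc when ptrauth is in play;
             * a raw load of the field would yield a signed pointer */
            return __darwin_arm_thread_state64_get_pc(ts);
    }
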
 #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__)
+
+/* Accessor macros for arm_thread_state64_t pointer fields */
+
+#if __has_feature(ptrauth_calls) && defined(__LP64__)
+#include <ptrauth.h>
+
+#if !__DARWIN_OPAQUE_ARM_THREAD_STATE64 || !__DARWIN_PTRAUTH_ARM_THREAD_STATE64
+#error "Invalid configuration"
+#endif
+
+#define __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH 0x1
+#define __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR 0x2
+
+/* Return pc field of arm_thread_state64_t as a data pointer value */
+#define __darwin_arm_thread_state64_get_pc(ts) \
+       __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \
+       (uintptr_t)(__tsp->__opaque_pc && !(__tsp->__opaque_flags &       \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ?                   \
+       ptrauth_auth_data(__tsp->__opaque_pc,                             \
+       ptrauth_key_process_independent_code,                             \
+       ptrauth_string_discriminator("pc")) : __tsp->__opaque_pc); })
+/* Return pc field of arm_thread_state64_t as a function pointer. May return
+ * NULL if a valid function pointer cannot be constructed, the caller should
+ * fall back to the __darwin_arm_thread_state64_get_pc() macro in that case. */
+#define __darwin_arm_thread_state64_get_pc_fptr(ts) \
+       __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \
+       (__tsp->__opaque_pc && !(__tsp->__opaque_flags &                  \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ?                   \
+       ptrauth_auth_function(__tsp->__opaque_pc,                         \
+       ptrauth_key_process_independent_code,                             \
+       ptrauth_string_discriminator("pc")) : NULL); })
+/* Set pc field of arm_thread_state64_t to a function pointer */
+#define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \
+       __extension__ ({ _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts);   \
+       __typeof__(fptr) __f = (fptr); __tsp->__opaque_pc =           \
+       (__f ? (!(__tsp->__opaque_flags &                             \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ?               \
+       ptrauth_auth_and_resign(__f, ptrauth_key_function_pointer, 0, \
+       ptrauth_key_process_independent_code,                         \
+       ptrauth_string_discriminator("pc")) : ptrauth_auth_data(__f,  \
+       ptrauth_key_function_pointer, 0)) : __f); })
+/* Return lr field of arm_thread_state64_t as a data pointer value */
+#define __darwin_arm_thread_state64_get_lr(ts) \
+       __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \
+       (uintptr_t)(__tsp->__opaque_lr && !(__tsp->__opaque_flags & (     \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH |                    \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR)) ?                \
+       ptrauth_auth_data(__tsp->__opaque_lr,                             \
+       ptrauth_key_process_independent_code,                             \
+       ptrauth_string_discriminator("lr")) : __tsp->__opaque_lr); })
+/* Return lr field of arm_thread_state64_t as a function pointer. May return
+ * NULL if a valid function pointer cannot be constructed; the caller should
+ * fall back to the __darwin_arm_thread_state64_get_lr() macro in that case. */
+#define __darwin_arm_thread_state64_get_lr_fptr(ts) \
+       __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \
+       (__tsp->__opaque_lr && !(__tsp->__opaque_flags & (                \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH |                    \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR)) ?                \
+       ptrauth_auth_function(__tsp->__opaque_lr,                         \
+       ptrauth_key_process_independent_code,                             \
+       ptrauth_string_discriminator("lr")) : NULL); })
+/* Set lr field of arm_thread_state64_t to a function pointer */
+#define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \
+       __extension__ ({ _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts);            \
+       __typeof__(fptr) __f = (fptr); __tsp->__opaque_lr =                    \
+       (__f ? (!(__tsp->__opaque_flags &                                      \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ? (__tsp->__opaque_flags \
+       &= ~__DARWIN_ARM_THREAD_STATE64_FLAGS_IB_SIGNED_LR ,                   \
+       ptrauth_auth_and_resign(__f, ptrauth_key_function_pointer, 0,          \
+       ptrauth_key_process_independent_code,                                  \
+       ptrauth_string_discriminator("lr"))) : ptrauth_auth_data(__f,          \
+       ptrauth_key_function_pointer, 0)) : __f); })
+/* Return sp field of arm_thread_state64_t as a data pointer value */
+#define __darwin_arm_thread_state64_get_sp(ts) \
+       __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \
+       (uintptr_t)(__tsp->__opaque_sp && !(__tsp->__opaque_flags &       \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ?                   \
+       ptrauth_auth_data(__tsp->__opaque_sp,                             \
+       ptrauth_key_process_independent_data,                             \
+       ptrauth_string_discriminator("sp")) : __tsp->__opaque_sp); })
+/* Set sp field of arm_thread_state64_t to a data pointer value */
+#define __darwin_arm_thread_state64_set_sp(ts, ptr) \
+       __extension__ ({ _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \
+       void *__p = (void*)(uintptr_t)(ptr); __tsp->__opaque_sp =   \
+       (__p && !(__tsp->__opaque_flags &                           \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ?             \
+       ptrauth_sign_unauthenticated(__p,                           \
+       ptrauth_key_process_independent_data,                       \
+       ptrauth_string_discriminator("sp")) : __p); })
+/* Return fp field of arm_thread_state64_t as a data pointer value */
+#define __darwin_arm_thread_state64_get_fp(ts) \
+       __extension__ ({ const _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \
+       (uintptr_t)(__tsp->__opaque_fp && !(__tsp->__opaque_flags &       \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ?                   \
+       ptrauth_auth_data(__tsp->__opaque_fp,                             \
+       ptrauth_key_process_independent_data,                             \
+       ptrauth_string_discriminator("fp")) : __tsp->__opaque_fp); })
+/* Set fp field of arm_thread_state64_t to a data pointer value */
+#define __darwin_arm_thread_state64_set_fp(ts, ptr) \
+       __extension__ ({ _STRUCT_ARM_THREAD_STATE64 *__tsp = &(ts); \
+       void *__p = (void*)(uintptr_t)(ptr); __tsp->__opaque_fp =   \
+       (__p && !(__tsp->__opaque_flags &                           \
+       __DARWIN_ARM_THREAD_STATE64_FLAGS_NO_PTRAUTH) ?             \
+       ptrauth_sign_unauthenticated(__p,                           \
+       ptrauth_key_process_independent_data,                       \
+       ptrauth_string_discriminator("fp")) : __p); })
+
+#else /* __has_feature(ptrauth_calls) && defined(__LP64__) */
+
+#if __DARWIN_OPAQUE_ARM_THREAD_STATE64
+
+#ifndef __LP64__
+#error "Invalid configuration"
+#endif
+
+/* Return pc field of arm_thread_state64_t as a data pointer value */
+#define __darwin_arm_thread_state64_get_pc(ts) \
+       ((uintptr_t)((ts).__opaque_pc))
+/* Return pc field of arm_thread_state64_t as a function pointer */
+#define __darwin_arm_thread_state64_get_pc_fptr(ts) \
+       ((ts).__opaque_pc)
+/* Set pc field of arm_thread_state64_t to a function pointer */
+#define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \
+       ((ts).__opaque_pc = (fptr))
+/* Return lr field of arm_thread_state64_t as a data pointer value */
+#define __darwin_arm_thread_state64_get_lr(ts) \
+       ((uintptr_t)((ts).__opaque_lr))
+/* Return lr field of arm_thread_state64_t as a function pointer */
+#define __darwin_arm_thread_state64_get_lr_fptr(ts) \
+       ((ts).__opaque_lr)
+/* Set lr field of arm_thread_state64_t to a function pointer */
+#define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \
+       ((ts).__opaque_lr = (fptr))
+/* Return sp field of arm_thread_state64_t as a data pointer value */
+#define __darwin_arm_thread_state64_get_sp(ts) \
+       ((uintptr_t)((ts).__opaque_sp))
+/* Set sp field of arm_thread_state64_t to a data pointer value */
+#define __darwin_arm_thread_state64_set_sp(ts, ptr) \
+       ((ts).__opaque_sp = (void*)(uintptr_t)(ptr))
+/* Return fp field of arm_thread_state64_t as a data pointer value */
+#define __darwin_arm_thread_state64_get_fp(ts) \
+       ((uintptr_t)((ts).__opaque_fp))
+/* Set fp field of arm_thread_state64_t to a data pointer value */
+#define __darwin_arm_thread_state64_set_fp(ts, ptr) \
+       ((ts).__opaque_fp = (void*)(uintptr_t)(ptr))
+
+#else /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */
 #if __DARWIN_UNIX03
+
+/* Return pc field of arm_thread_state64_t as a data pointer value */
 #define __darwin_arm_thread_state64_get_pc(ts) \
-               ((ts).__pc)
+       ((ts).__pc)
+/* Return pc field of arm_thread_state64_t as a function pointer */
 #define __darwin_arm_thread_state64_get_pc_fptr(ts) \
-               ((void*)(uintptr_t)((ts).__pc))
+       ((void*)(uintptr_t)((ts).__pc))
+/* Set pc field of arm_thread_state64_t to a function pointer */
 #define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \
-               ((ts).__pc = (uintptr_t)(fptr))
+       ((ts).__pc = (uintptr_t)(fptr))
+/* Return lr field of arm_thread_state64_t as a data pointer value */
 #define __darwin_arm_thread_state64_get_lr(ts) \
-               ((ts).__lr)
+       ((ts).__lr)
+/* Return lr field of arm_thread_state64_t as a function pointer */
 #define __darwin_arm_thread_state64_get_lr_fptr(ts) \
-               ((void*)(uintptr_t)((ts).__lr))
+       ((void*)(uintptr_t)((ts).__lr))
+/* Set lr field of arm_thread_state64_t to a function pointer */
 #define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \
-               ((ts).__lr = (uintptr_t)(fptr))
+       ((ts).__lr = (uintptr_t)(fptr))
+/* Return sp field of arm_thread_state64_t as a data pointer value */
 #define __darwin_arm_thread_state64_get_sp(ts) \
-               ((ts).__sp)
+       ((ts).__sp)
+/* Set sp field of arm_thread_state64_t to a data pointer value */
 #define __darwin_arm_thread_state64_set_sp(ts, ptr) \
-               ((ts).__sp = (uintptr_t)(ptr))
+       ((ts).__sp = (uintptr_t)(ptr))
+/* Return fp field of arm_thread_state64_t as a data pointer value */
 #define __darwin_arm_thread_state64_get_fp(ts) \
-               ((ts).__fp)
+       ((ts).__fp)
+/* Set fp field of arm_thread_state64_t to a data pointer value */
 #define __darwin_arm_thread_state64_set_fp(ts, ptr) \
-               ((ts).__fp = (uintptr_t)(ptr))
-#else /* !__DARWIN_UNIX03 */
+       ((ts).__fp = (uintptr_t)(ptr))
+
+#else /* __DARWIN_UNIX03 */
+
+/* Return pc field of arm_thread_state64_t as a data pointer value */
 #define __darwin_arm_thread_state64_get_pc(ts) \
-               ((ts).pc)
+       ((ts).pc)
+/* Return pc field of arm_thread_state64_t as a function pointer */
 #define __darwin_arm_thread_state64_get_pc_fptr(ts) \
-               ((void*)(uintptr_t)((ts).pc))
+       ((void*)(uintptr_t)((ts).pc))
+/* Set pc field of arm_thread_state64_t to a function pointer */
 #define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \
-               ((ts).pc = (uintptr_t)(fptr))
+       ((ts).pc = (uintptr_t)(fptr))
+/* Return lr field of arm_thread_state64_t as a data pointer value */
 #define __darwin_arm_thread_state64_get_lr(ts) \
-               ((ts).lr)
+       ((ts).lr)
+/* Return lr field of arm_thread_state64_t as a function pointer */
 #define __darwin_arm_thread_state64_get_lr_fptr(ts) \
-               ((void*)(uintptr_t)((ts).lr))
+       ((void*)(uintptr_t)((ts).lr))
+/* Set lr field of arm_thread_state64_t to a function pointer */
 #define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \
-               ((ts).lr = (uintptr_t)(fptr))
+       ((ts).lr = (uintptr_t)(fptr))
+/* Return sp field of arm_thread_state64_t as a data pointer value */
 #define __darwin_arm_thread_state64_get_sp(ts) \
-               ((ts).sp)
+       ((ts).sp)
+/* Set sp field of arm_thread_state64_t to a data pointer value */
 #define __darwin_arm_thread_state64_set_sp(ts, ptr) \
-               ((ts).sp = (uintptr_t)(ptr))
+       ((ts).sp = (uintptr_t)(ptr))
+/* Return fp field of arm_thread_state64_t as a data pointer value */
 #define __darwin_arm_thread_state64_get_fp(ts) \
-               ((ts).fp)
+       ((ts).fp)
+/* Set fp field of arm_thread_state64_t to a data pointer value */
 #define __darwin_arm_thread_state64_set_fp(ts, ptr) \
-               ((ts).fp = (uintptr_t)(ptr))
+       ((ts).fp = (uintptr_t)(ptr))
+
 #endif /* __DARWIN_UNIX03 */
+#endif /* __DARWIN_OPAQUE_ARM_THREAD_STATE64 */
+
+#endif /* __has_feature(ptrauth_calls) && defined(__LP64__) */
 #endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) */
 #endif /* !defined(KERNEL) */
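
To illustrate how the opaque accessors above are meant to be consumed, here is a
minimal userland sketch (illustrative, not part of this diff): it fetches
another thread's state via the standard Mach thread_get_state() call and reads
the program counter through the macro instead of touching __opaque_pc directly.
The thread port is assumed to be held by the caller.

    #include <mach/mach.h>

    static uintptr_t
    read_thread_pc(thread_act_t thread)
    {
        arm_thread_state64_t ts;
        mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT;

        if (thread_get_state(thread, ARM_THREAD_STATE64,
            (thread_state_t)&ts, &count) != KERN_SUCCESS) {
            return 0;
        }
        /* On arm64e this authenticates __opaque_pc; elsewhere it is a plain load. */
        return __darwin_arm_thread_state64_get_pc(ts);
    }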
 
 #if __DARWIN_UNIX03
-#define _STRUCT_ARM_VFP_STATE           struct __darwin_arm_vfp_state
+#define _STRUCT_ARM_VFP_STATE struct __darwin_arm_vfp_state
 _STRUCT_ARM_VFP_STATE
 {
-       __uint32_t        __r[64];
-       __uint32_t        __fpscr;
+       __uint32_t __r[64];
+       __uint32_t __fpscr;
 };
 #else /* !__DARWIN_UNIX03 */
-#define _STRUCT_ARM_VFP_STATE   struct arm_vfp_state
+#define _STRUCT_ARM_VFP_STATE struct arm_vfp_state
 _STRUCT_ARM_VFP_STATE
 {
-       __uint32_t        r[64];
-       __uint32_t        fpscr;
+       __uint32_t r[64];
+       __uint32_t fpscr;
 };
 #endif /* __DARWIN_UNIX03 */
 
 #if __DARWIN_UNIX03
-#define _STRUCT_ARM_NEON_STATE64        struct __darwin_arm_neon_state64
-#define _STRUCT_ARM_NEON_STATE          struct __darwin_arm_neon_state
+#define _STRUCT_ARM_NEON_STATE64 struct __darwin_arm_neon_state64
+#define _STRUCT_ARM_NEON_STATE   struct __darwin_arm_neon_state
 
 #if defined(__arm64__)
 _STRUCT_ARM_NEON_STATE64
 {
-       __uint128_t       __v[32];
-       __uint32_t        __fpsr;
-       __uint32_t        __fpcr;
+       __uint128_t __v[32];
+       __uint32_t  __fpsr;
+       __uint32_t  __fpcr;
 };
 
 _STRUCT_ARM_NEON_STATE
 {
-       __uint128_t       __v[16];
-       __uint32_t        __fpsr;
-       __uint32_t        __fpcr;
+       __uint128_t __v[16];
+       __uint32_t  __fpsr;
+       __uint32_t  __fpcr;
 };
-
 #elif defined(__arm__)
 /*
  * No 128-bit intrinsic for ARM; leave it opaque for now.
@@ -225,15 +468,16 @@ _STRUCT_ARM_NEON_STATE
 #if defined(__arm64__)
 _STRUCT_ARM_NEON_STATE64
 {
-       __uint128_t             q[32];
-       uint32_t                fpsr;
-       uint32_t                fpcr;
+       __uint128_t q[32];
+       uint32_t    fpsr;
+       uint32_t    fpcr;
 };
+
 _STRUCT_ARM_NEON_STATE
 {
-       __uint128_t             q[16];
-       uint32_t                fpsr;
-       uint32_t                fpcr;
+       __uint128_t q[16];
+       uint32_t    fpsr;
+       uint32_t    fpcr;
 };
 #elif defined(__arm__)
 /*
@@ -255,6 +499,13 @@ _STRUCT_ARM_NEON_STATE
 
 #endif /* __DARWIN_UNIX03 */
 
+
+#define _STRUCT_ARM_PAGEIN_STATE struct __arm_pagein_state
+_STRUCT_ARM_PAGEIN_STATE
+{
+       int __pagein_error;
+};
+
 /*
  * Debug State
  */
@@ -265,19 +516,19 @@ _STRUCT_ARM_NEON_STATE
 #define _STRUCT_ARM_DEBUG_STATE struct __darwin_arm_debug_state
 _STRUCT_ARM_DEBUG_STATE
 {
-       __uint32_t        __bvr[16];
-       __uint32_t        __bcr[16];
-       __uint32_t        __wvr[16];
-       __uint32_t        __wcr[16];
+       __uint32_t __bvr[16];
+       __uint32_t __bcr[16];
+       __uint32_t __wvr[16];
+       __uint32_t __wcr[16];
 };
 #else /* !__DARWIN_UNIX03 */
 #define _STRUCT_ARM_DEBUG_STATE struct arm_debug_state
 _STRUCT_ARM_DEBUG_STATE
 {
-       __uint32_t        bvr[16];
-       __uint32_t        bcr[16];
-       __uint32_t        wvr[16];
-       __uint32_t        wcr[16];
+       __uint32_t bvr[16];
+       __uint32_t bcr[16];
+       __uint32_t wvr[16];
+       __uint32_t wcr[16];
 };
 #endif /* __DARWIN_UNIX03 */
 
@@ -286,22 +537,22 @@ _STRUCT_ARM_DEBUG_STATE
 /* ARM's arm_debug_state is ARM64's arm_legacy_debug_state */
 
 #if __DARWIN_UNIX03
-#define _STRUCT_ARM_LEGACY_DEBUG_STATE  struct arm_legacy_debug_state
+#define _STRUCT_ARM_LEGACY_DEBUG_STATE struct arm_legacy_debug_state
 _STRUCT_ARM_LEGACY_DEBUG_STATE
 {
-       __uint32_t        __bvr[16];
-       __uint32_t        __bcr[16];
-       __uint32_t        __wvr[16];
-       __uint32_t        __wcr[16];
+       __uint32_t __bvr[16];
+       __uint32_t __bcr[16];
+       __uint32_t __wvr[16];
+       __uint32_t __wcr[16];
 };
 #else /* __DARWIN_UNIX03 */
-#define _STRUCT_ARM_LEGACY_DEBUG_STATE  struct arm_legacy_debug_state
+#define _STRUCT_ARM_LEGACY_DEBUG_STATE struct arm_legacy_debug_state
 _STRUCT_ARM_LEGACY_DEBUG_STATE
 {
-       __uint32_t        bvr[16];
-       __uint32_t        bcr[16];
-       __uint32_t        wvr[16];
-       __uint32_t        wcr[16];
+       __uint32_t bvr[16];
+       __uint32_t bcr[16];
+       __uint32_t wvr[16];
+       __uint32_t wcr[16];
 };
 #endif /* __DARWIN_UNIX03 */
 #else
@@ -309,55 +560,55 @@ _STRUCT_ARM_LEGACY_DEBUG_STATE
 #endif
 
 #if __DARWIN_UNIX03
-#define _STRUCT_ARM_DEBUG_STATE32       struct __darwin_arm_debug_state32
+#define _STRUCT_ARM_DEBUG_STATE32 struct __darwin_arm_debug_state32
 _STRUCT_ARM_DEBUG_STATE32
 {
-       __uint32_t        __bvr[16];
-       __uint32_t        __bcr[16];
-       __uint32_t        __wvr[16];
-       __uint32_t        __wcr[16];
-       __uint64_t        __mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */
+       __uint32_t __bvr[16];
+       __uint32_t __bcr[16];
+       __uint32_t __wvr[16];
+       __uint32_t __wcr[16];
+       __uint64_t __mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */
 };
 
-#define _STRUCT_ARM_DEBUG_STATE64       struct __darwin_arm_debug_state64
+#define _STRUCT_ARM_DEBUG_STATE64 struct __darwin_arm_debug_state64
 _STRUCT_ARM_DEBUG_STATE64
 {
-       __uint64_t        __bvr[16];
-       __uint64_t        __bcr[16];
-       __uint64_t        __wvr[16];
-       __uint64_t        __wcr[16];
-       __uint64_t        __mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */
+       __uint64_t __bvr[16];
+       __uint64_t __bcr[16];
+       __uint64_t __wvr[16];
+       __uint64_t __wcr[16];
+       __uint64_t __mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */
 };
 #else /* !__DARWIN_UNIX03 */
-#define _STRUCT_ARM_DEBUG_STATE32       struct arm_debug_state32
+#define _STRUCT_ARM_DEBUG_STATE32 struct arm_debug_state32
 _STRUCT_ARM_DEBUG_STATE32
 {
-       __uint32_t        bvr[16];
-       __uint32_t        bcr[16];
-       __uint32_t        wvr[16];
-       __uint32_t        wcr[16];
-       __uint64_t        mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */
+       __uint32_t bvr[16];
+       __uint32_t bcr[16];
+       __uint32_t wvr[16];
+       __uint32_t wcr[16];
+       __uint64_t mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */
 };
 
-#define _STRUCT_ARM_DEBUG_STATE64       struct arm_debug_state64
+#define _STRUCT_ARM_DEBUG_STATE64 struct arm_debug_state64
 _STRUCT_ARM_DEBUG_STATE64
 {
-       __uint64_t        bvr[16];
-       __uint64_t        bcr[16];
-       __uint64_t        wvr[16];
-       __uint64_t        wcr[16];
-       __uint64_t        mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */
+       __uint64_t bvr[16];
+       __uint64_t bcr[16];
+       __uint64_t wvr[16];
+       __uint64_t wcr[16];
+       __uint64_t mdscr_el1; /* Bit 0 is SS (Hardware Single Step) */
 };
 #endif /* __DARWIN_UNIX03 */
 
 #if __DARWIN_UNIX03
-#define _STRUCT_ARM_CPMU_STATE64        struct __darwin_arm_cpmu_state64
+#define _STRUCT_ARM_CPMU_STATE64 struct __darwin_arm_cpmu_state64
 _STRUCT_ARM_CPMU_STATE64
 {
        __uint64_t __ctrs[16];
 };
 #else /* __DARWIN_UNIX03 */
-#define _STRUCT_ARM_CPMU_STATE64        struct arm_cpmu_state64
+#define _STRUCT_ARM_CPMU_STATE64 struct arm_cpmu_state64
 _STRUCT_ARM_CPMU_STATE64
 {
        __uint64_t ctrs[16];
index 14478d091cbfcfe6f65aa32f4b7a779e21073f57..06658bc1ea03000db61c0f1c298d76a03ab94363 100644
 
 #define EXC_ARM_UNDEFINED       1       /* Undefined */
 
+/*
+ *      EXC_ARITHMETIC
+ */
+
+#define EXC_ARM_FP_UNDEFINED    0       /* Undefined Floating Point Exception */
+#define EXC_ARM_FP_IO           1       /* Invalid Floating Point Operation */
+#define EXC_ARM_FP_DZ           2       /* Floating Point Divide by Zero */
+#define EXC_ARM_FP_OF           3       /* Floating Point Overflow */
+#define EXC_ARM_FP_UF           4       /* Floating Point Underflow */
+#define EXC_ARM_FP_IX           5       /* Inexact Floating Point Result */
+#define EXC_ARM_FP_ID           6       /* Floating Point Denormal Input */
 
 /*
  *      EXC_BAD_ACCESS
@@ -54,7 +65,7 @@
 #define EXC_ARM_DA_ALIGN        0x101   /* Alignment Fault */
 #define EXC_ARM_DA_DEBUG        0x102   /* Debug (watch/break) Fault */
 #define EXC_ARM_SP_ALIGN        0x103   /* SP Alignment Fault */
-#define EXC_ARM_SWP                     0x104   /* SWP instruction */
+#define EXC_ARM_SWP             0x104   /* SWP instruction */
 
 /*
  *     EXC_BREAKPOINT
index 0751024a2945c7679ff2a534ee8f7e0b45857a3f..519e1d935ef0b2912aa5e832a8297ee6bb7bb8b3 100644
@@ -30,8 +30,6 @@
 #ifndef _MACH_ARM_SDT_ISA_H
 #define        _MACH_ARM_SDT_ISA_H
 
-/* #pragma ident       "@(#)sdt.h      1.7     05/06/08 SMI" */
-
 /*
  * Only define when testing.  This makes the calls into actual calls to
  * test functions.
index 3a6369f8f1402b1ae49fda08deb5a847c5476dbe..bedc4090ad8b915fd28e079aea2911b64cadb6f7 100644
@@ -33,7 +33,7 @@
 #define _MACH_ARM_THREAD_STATE_H_
 
 /* Size of maximum exported thread state in words */
-#define ARM_THREAD_STATE_MAX    (144)    /* Size of biggest state possible */
+#define ARM_THREAD_STATE_MAX    (1296)    /* Size of biggest state possible */
 
 #if defined (__arm__) || defined(__arm64__)
 #define THREAD_STATE_MAX        ARM_THREAD_STATE_MAX
index 27a5441d8fa26c43800c7f43a82546934b18f978..b12c02b5bbfbcb05ab99655489e7ae3ad1450000 100644
  *  Flavors
  */
 
-#define ARM_THREAD_STATE                1
+#define ARM_THREAD_STATE         1
 #define ARM_UNIFIED_THREAD_STATE ARM_THREAD_STATE
-#define ARM_VFP_STATE                   2
-#define ARM_EXCEPTION_STATE             3
-#define ARM_DEBUG_STATE                 4 /* pre-armv8 */
-#define THREAD_STATE_NONE               5
-#define ARM_THREAD_STATE64              6
-#define ARM_EXCEPTION_STATE64   7
-// ARM_THREAD_STATE_LAST (legacy) 8
-#define ARM_THREAD_STATE32              9
+#define ARM_VFP_STATE            2
+#define ARM_EXCEPTION_STATE      3
+#define ARM_DEBUG_STATE          4 /* pre-armv8 */
+#define THREAD_STATE_NONE        5
+#define ARM_THREAD_STATE64       6
+#define ARM_EXCEPTION_STATE64    7
+//      ARM_THREAD_STATE_LAST    8 /* legacy */
+#define ARM_THREAD_STATE32       9
 
 /* API */
-#define ARM_DEBUG_STATE32               14
-#define ARM_DEBUG_STATE64               15
-#define ARM_NEON_STATE                  16
-#define ARM_NEON_STATE64                17
-#define ARM_CPMU_STATE64                18
+#define ARM_DEBUG_STATE32        14
+#define ARM_DEBUG_STATE64        15
+#define ARM_NEON_STATE           16
+#define ARM_NEON_STATE64         17
+#define ARM_CPMU_STATE64         18
 
 #ifdef XNU_KERNEL_PRIVATE
 /* For kernel use */
-#define ARM_SAVED_STATE32               20
-#define ARM_SAVED_STATE64               21
-#define ARM_NEON_SAVED_STATE32  22
-#define ARM_NEON_SAVED_STATE64  23
+#define ARM_SAVED_STATE32        20
+#define ARM_SAVED_STATE64        21
+#define ARM_NEON_SAVED_STATE32   22
+#define ARM_NEON_SAVED_STATE64   23
 #endif /* XNU_KERNEL_PRIVATE */
 
+
+/* Hook for additional platform-specific flavors; none are defined here. */
+#define ARM_STATE_FLAVOR_IS_OTHER_VALID(_flavor_) 0
+
+#define ARM_PAGEIN_STATE         27
+
 #define VALID_THREAD_STATE_FLAVOR(x) \
-((x == ARM_THREAD_STATE)                ||      \
- (x == ARM_VFP_STATE)                   ||      \
- (x == ARM_EXCEPTION_STATE)     ||      \
- (x == ARM_DEBUG_STATE)                 ||      \
- (x == THREAD_STATE_NONE)              ||  \
- (x == ARM_THREAD_STATE32)             ||      \
- (x == ARM_THREAD_STATE64)             ||      \
- (x == ARM_EXCEPTION_STATE64)  ||      \
- (x == ARM_NEON_STATE)         ||      \
- (x == ARM_NEON_STATE64)               ||      \
- (x == ARM_DEBUG_STATE32)               ||      \
- (x == ARM_DEBUG_STATE64))
+       ((x == ARM_THREAD_STATE) ||           \
+        (x == ARM_VFP_STATE) ||              \
+        (x == ARM_EXCEPTION_STATE) ||        \
+        (x == ARM_DEBUG_STATE) ||            \
+        (x == THREAD_STATE_NONE) ||          \
+        (x == ARM_THREAD_STATE32) ||         \
+        (x == ARM_THREAD_STATE64) ||         \
+        (x == ARM_EXCEPTION_STATE64) ||      \
+        (x == ARM_NEON_STATE) ||             \
+        (x == ARM_NEON_STATE64) ||           \
+        (x == ARM_DEBUG_STATE32) ||          \
+        (x == ARM_DEBUG_STATE64) ||          \
+        (x == ARM_PAGEIN_STATE) ||           \
+        (ARM_STATE_FLAVOR_IS_OTHER_VALID(x)))
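
For context, the usual consumption pattern (an assumed sketch, not taken from
this diff) is a whitelist check at the top of the thread-state handlers, so any
flavor the macro does not accept is rejected before the state buffer is touched:

    if (!VALID_THREAD_STATE_FLAVOR(flavor)) {
        return KERN_INVALID_ARGUMENT;
    }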
 
 struct arm_state_hdr {
        uint32_t flavor;
@@ -92,32 +99,50 @@ struct arm_state_hdr {
 };
 typedef struct arm_state_hdr arm_state_hdr_t;
 
-typedef _STRUCT_ARM_THREAD_STATE                arm_thread_state_t;
-typedef _STRUCT_ARM_THREAD_STATE                arm_thread_state32_t;
-typedef _STRUCT_ARM_THREAD_STATE64              arm_thread_state64_t;
+typedef _STRUCT_ARM_THREAD_STATE   arm_thread_state_t;
+typedef _STRUCT_ARM_THREAD_STATE   arm_thread_state32_t;
+typedef _STRUCT_ARM_THREAD_STATE64 arm_thread_state64_t;
 
 #if !defined(KERNEL)
 #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__)
+
+/* Accessor macros for arm_thread_state64_t pointer fields */
+
+/* Return pc field of arm_thread_state64_t as a data pointer value */
 #define arm_thread_state64_get_pc(ts) \
                __darwin_arm_thread_state64_get_pc(ts)
+/* Return pc field of arm_thread_state64_t as a function pointer. May return
+ * NULL if a valid function pointer cannot be constructed; the caller should
+ * fall back to the arm_thread_state64_get_pc() macro in that case. */
 #define arm_thread_state64_get_pc_fptr(ts) \
                __darwin_arm_thread_state64_get_pc_fptr(ts)
+/* Set pc field of arm_thread_state64_t to a function pointer */
 #define arm_thread_state64_set_pc_fptr(ts, fptr) \
                __darwin_arm_thread_state64_set_pc_fptr(ts, fptr)
+/* Return lr field of arm_thread_state64_t as a data pointer value */
 #define arm_thread_state64_get_lr(ts) \
                __darwin_arm_thread_state64_get_lr(ts)
+/* Return lr field of arm_thread_state64_t as a function pointer. May return
+ * NULL if a valid function pointer cannot be constructed; the caller should
+ * fall back to the arm_thread_state64_get_lr() macro in that case. */
 #define arm_thread_state64_get_lr_fptr(ts) \
                __darwin_arm_thread_state64_get_lr_fptr(ts)
+/* Set lr field of arm_thread_state64_t to a function pointer */
 #define arm_thread_state64_set_lr_fptr(ts, fptr) \
                __darwin_arm_thread_state64_set_lr_fptr(ts, fptr)
+/* Return sp field of arm_thread_state64_t as a data pointer value */
 #define arm_thread_state64_get_sp(ts) \
                __darwin_arm_thread_state64_get_sp(ts)
+/* Set sp field of arm_thread_state64_t to a data pointer value */
 #define arm_thread_state64_set_sp(ts, ptr) \
                __darwin_arm_thread_state64_set_sp(ts, ptr)
+/* Return fp field of arm_thread_state64_t as a data pointer value */
 #define arm_thread_state64_get_fp(ts) \
                __darwin_arm_thread_state64_get_fp(ts)
+/* Set fp field of arm_thread_state64_t to a data pointer value */
 #define arm_thread_state64_set_fp(ts, ptr) \
                __darwin_arm_thread_state64_set_fp(ts, ptr)
+
 #endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) */
 #endif /* !defined(KERNEL) */
 
@@ -128,79 +153,86 @@ struct arm_unified_thread_state {
                arm_thread_state64_t ts_64;
        } uts;
 };
-#define ts_32   uts.ts_32
-#define ts_64   uts.ts_64
+#define ts_32 uts.ts_32
+#define ts_64 uts.ts_64
 typedef struct arm_unified_thread_state arm_unified_thread_state_t;
 
 #define ARM_THREAD_STATE_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_thread_state_t)/sizeof(uint32_t)))
+       (sizeof (arm_thread_state_t)/sizeof(uint32_t)))
 #define ARM_THREAD_STATE32_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_thread_state32_t)/sizeof(uint32_t)))
+       (sizeof (arm_thread_state32_t)/sizeof(uint32_t)))
 #define ARM_THREAD_STATE64_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_thread_state64_t)/sizeof(uint32_t)))
+       (sizeof (arm_thread_state64_t)/sizeof(uint32_t)))
 #define ARM_UNIFIED_THREAD_STATE_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_unified_thread_state_t)/sizeof(uint32_t)))
+       (sizeof (arm_unified_thread_state_t)/sizeof(uint32_t)))
+
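A short usage sketch (illustrative, not part of this diff): requesting
ARM_THREAD_STATE yields the unified structure, so callers dispatch on the
flavor header (the ash field in the full definition of
struct arm_unified_thread_state, elided by this hunk) to find which union
member is populated.

    arm_unified_thread_state_t uts;
    mach_msg_type_number_t count = ARM_UNIFIED_THREAD_STATE_COUNT;

    if (thread_get_state(thread, ARM_THREAD_STATE,
        (thread_state_t)&uts, &count) == KERN_SUCCESS) {
        if (uts.ash.flavor == ARM_THREAD_STATE64) {
            /* 64-bit registers live in uts.ts_64 */
        } else if (uts.ash.flavor == ARM_THREAD_STATE32) {
            /* 32-bit registers live in uts.ts_32 */
        }
    }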
 
+typedef _STRUCT_ARM_VFP_STATE         arm_vfp_state_t;
+typedef _STRUCT_ARM_NEON_STATE        arm_neon_state_t;
+typedef _STRUCT_ARM_NEON_STATE        arm_neon_state32_t;
+typedef _STRUCT_ARM_NEON_STATE64      arm_neon_state64_t;
 
-typedef _STRUCT_ARM_VFP_STATE                   arm_vfp_state_t;
-typedef _STRUCT_ARM_NEON_STATE                  arm_neon_state_t;
-typedef _STRUCT_ARM_NEON_STATE                  arm_neon_state32_t;
-typedef _STRUCT_ARM_NEON_STATE64                arm_neon_state64_t;
 
-typedef _STRUCT_ARM_EXCEPTION_STATE             arm_exception_state_t;
-typedef _STRUCT_ARM_EXCEPTION_STATE             arm_exception_state32_t;
-typedef _STRUCT_ARM_EXCEPTION_STATE64   arm_exception_state64_t;
+typedef _STRUCT_ARM_EXCEPTION_STATE   arm_exception_state_t;
+typedef _STRUCT_ARM_EXCEPTION_STATE   arm_exception_state32_t;
+typedef _STRUCT_ARM_EXCEPTION_STATE64 arm_exception_state64_t;
 
-typedef _STRUCT_ARM_DEBUG_STATE32               arm_debug_state32_t;
-typedef _STRUCT_ARM_DEBUG_STATE64               arm_debug_state64_t;
+typedef _STRUCT_ARM_DEBUG_STATE32     arm_debug_state32_t;
+typedef _STRUCT_ARM_DEBUG_STATE64     arm_debug_state64_t;
+
+typedef _STRUCT_ARM_PAGEIN_STATE      arm_pagein_state_t;
 
 #if defined(XNU_KERNEL_PRIVATE) && defined(__arm64__)
 /* See below for ARM64 kernel structure definition for arm_debug_state. */
-#else
+#else /* defined(XNU_KERNEL_PRIVATE) && defined(__arm64__) */
 /*
  * Otherwise not ARM64 kernel and we must preserve legacy ARM definitions of
 * arm_debug_state for binary compatibility of userland consumers of this file.
  */
 #if defined(__arm__)
-typedef _STRUCT_ARM_DEBUG_STATE                 arm_debug_state_t;
+typedef _STRUCT_ARM_DEBUG_STATE        arm_debug_state_t;
 #elif defined(__arm64__)
-typedef _STRUCT_ARM_LEGACY_DEBUG_STATE          arm_debug_state_t;
-#else
+typedef _STRUCT_ARM_LEGACY_DEBUG_STATE arm_debug_state_t;
+#else /* defined(__arm__) */
 #error Undefined architecture
-#endif
-#endif
+#endif /* defined(__arm__) */
+#endif /* defined(XNU_KERNEL_PRIVATE) && defined(__arm64__) */
 
 #define ARM_VFP_STATE_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_vfp_state_t)/sizeof(uint32_t)))
+       (sizeof (arm_vfp_state_t)/sizeof(uint32_t)))
 
 #define ARM_EXCEPTION_STATE_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_exception_state_t)/sizeof(uint32_t)))
+       (sizeof (arm_exception_state_t)/sizeof(uint32_t)))
 
 #define ARM_EXCEPTION_STATE64_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_exception_state64_t)/sizeof(uint32_t)))
+       (sizeof (arm_exception_state64_t)/sizeof(uint32_t)))
 
 #define ARM_DEBUG_STATE_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_debug_state_t)/sizeof(uint32_t)))
+       (sizeof (arm_debug_state_t)/sizeof(uint32_t)))
 
 #define ARM_DEBUG_STATE32_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_debug_state32_t)/sizeof(uint32_t)))
+       (sizeof (arm_debug_state32_t)/sizeof(uint32_t)))
+
+#define ARM_PAGEIN_STATE_COUNT ((mach_msg_type_number_t) \
+       (sizeof (arm_pagein_state_t)/sizeof(uint32_t)))
 
 #define ARM_DEBUG_STATE64_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_debug_state64_t)/sizeof(uint32_t)))
+       (sizeof (arm_debug_state64_t)/sizeof(uint32_t)))
 
 #define ARM_NEON_STATE_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_neon_state_t)/sizeof(uint32_t)))
+       (sizeof (arm_neon_state_t)/sizeof(uint32_t)))
 
 #define ARM_NEON_STATE64_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_neon_state64_t)/sizeof(uint32_t)))
+       (sizeof (arm_neon_state64_t)/sizeof(uint32_t)))
+
+#define MACHINE_THREAD_STATE       ARM_THREAD_STATE
+#define MACHINE_THREAD_STATE_COUNT ARM_UNIFIED_THREAD_STATE_COUNT
 
-#define MACHINE_THREAD_STATE            ARM_THREAD_STATE
-#define MACHINE_THREAD_STATE_COUNT      ARM_UNIFIED_THREAD_STATE_COUNT
 
 /*
  * Largest state on this machine:
  */
-#define THREAD_MACHINE_STATE_MAX        THREAD_STATE_MAX
+#define THREAD_MACHINE_STATE_MAX THREAD_STATE_MAX
 
 #ifdef XNU_KERNEL_PRIVATE
 
@@ -243,17 +275,17 @@ const_thread_state64(const arm_unified_thread_state_t *its)
 #if defined(__arm__)
 #include <arm/proc_reg.h>
 
-#define ARM_SAVED_STATE                 THREAD_STATE_NONE + 1
+#define ARM_SAVED_STATE (THREAD_STATE_NONE + 1)
 
 struct arm_saved_state {
-       uint32_t    r[13];  /* General purpose register r0-r12 */
-       uint32_t    sp; /* Stack pointer r13 */
-       uint32_t    lr; /* Link register r14 */
-       uint32_t    pc; /* Program counter r15 */
-       uint32_t    cpsr;   /* Current program status register */
-       uint32_t    fsr;    /* Fault status */
-       uint32_t    far;    /* Virtual Fault Address */
-       uint32_t    exception;/* exception number */
+       uint32_t r[13];     /* General purpose register r0-r12 */
+       uint32_t sp;        /* Stack pointer r13 */
+       uint32_t lr;        /* Link register r14 */
+       uint32_t pc;        /* Program counter r15 */
+       uint32_t cpsr;      /* Current program status register */
+       uint32_t fsr;       /* Fault status */
+       uint32_t far;       /* Virtual Fault Address */
+       uint32_t exception; /* exception number */
 };
 typedef struct arm_saved_state arm_saved_state_t;
 
@@ -262,6 +294,12 @@ typedef struct arm_saved_state arm_saved_state_t;
  */
 typedef struct arm_saved_state arm_saved_state32_t;
 
+static inline void
+copy_signed_thread_state(arm_saved_state_t *dst, const arm_saved_state_t *src)
+{
+       *dst = *src;
+}
+
 static inline arm_saved_state32_t*
 saved_state32(arm_saved_state_t *iss)
 {
@@ -276,13 +314,13 @@ is_saved_state32(const arm_saved_state_t *iss __unused)
 
 
 struct arm_saved_state_tagged {
-       uint32_t                                        tag;
-       struct arm_saved_state          state;
+       uint32_t               tag;
+       struct arm_saved_state state;
 };
 typedef struct arm_saved_state_tagged arm_saved_state_tagged_t;
 
 #define ARM_SAVED_STATE32_COUNT ((mach_msg_type_number_t) \
-               (sizeof (arm_saved_state_t)/sizeof(unsigned int)))
+       (sizeof (arm_saved_state_t)/sizeof(unsigned int)))
 
 
 static inline register_t
@@ -291,6 +329,12 @@ get_saved_state_pc(const arm_saved_state_t *iss)
        return iss->pc;
 }
 
+static inline void
+add_saved_state_pc(arm_saved_state_t *iss, int diff)
+{
+       iss->pc += diff;
+}
+
 static inline void
 set_saved_state_pc(arm_saved_state_t *iss, register_t pc)
 {
@@ -339,6 +383,13 @@ get_saved_state_cpsr(const arm_saved_state_t *iss)
        return iss->cpsr;
 }
 
+static inline void
+mask_saved_state_cpsr(arm_saved_state_t *iss, uint32_t set_bits, uint32_t clear_bits)
+{
+       iss->cpsr |= set_bits;
+       iss->cpsr &= ~clear_bits;
+}
+
 static inline void
 set_saved_state_cpsr(arm_saved_state_t *iss, register_t cpsr)
 {
@@ -368,46 +419,49 @@ set_saved_state_reg(arm_saved_state_t *iss, unsigned regno, register_t val)
  */
 
 struct arm_saved_state32 {
-       uint32_t        r[13];          /* General purpose register r0-r12 */
-       uint32_t        sp;                     /* Stack pointer r13 */
-       uint32_t        lr;                     /* Link register r14 */
-       uint32_t        pc;                     /* Program counter r15 */
-       uint32_t        cpsr;           /* Current program status register */
-       uint32_t        far;            /* Virtual fault address */
-       uint32_t        esr;            /* Exception syndrome register */
-       uint32_t        exception;      /* Exception number */
+       uint32_t r[13];     /* General purpose register r0-r12 */
+       uint32_t sp;        /* Stack pointer r13 */
+       uint32_t lr;        /* Link register r14 */
+       uint32_t pc;        /* Program counter r15 */
+       uint32_t cpsr;      /* Current program status register */
+       uint32_t far;       /* Virtual fault address */
+       uint32_t esr;       /* Exception syndrome register */
+       uint32_t exception; /* Exception number */
 };
 typedef struct arm_saved_state32 arm_saved_state32_t;
 
 struct arm_saved_state32_tagged {
-       uint32_t                                        tag;
-       struct arm_saved_state32        state;
+       uint32_t                 tag;
+       struct arm_saved_state32 state;
 };
 typedef struct arm_saved_state32_tagged arm_saved_state32_tagged_t;
 
 #define ARM_SAVED_STATE32_COUNT ((mach_msg_type_number_t) \
-               (sizeof (arm_saved_state32_t)/sizeof(unsigned int)))
+               (sizeof(arm_saved_state32_t)/sizeof(unsigned int)))
 
 struct arm_saved_state64 {
-       uint64_t        x[29];          /* General purpose registers x0-x28 */
-       uint64_t        fp;                     /* Frame pointer x29 */
-       uint64_t        lr;                     /* Link register x30 */
-       uint64_t        sp;                     /* Stack pointer x31 */
-       uint64_t        pc;                     /* Program counter */
-       uint32_t        cpsr;           /* Current program status register */
-       uint32_t        reserved;       /* Reserved padding */
-       uint64_t        far;            /* Virtual fault address */
-       uint32_t        esr;            /* Exception syndrome register */
-       uint32_t        exception;      /* Exception number */
+       uint64_t x[29];     /* General purpose registers x0-x28 */
+       uint64_t fp;        /* Frame pointer x29 */
+       uint64_t lr;        /* Link register x30 */
+       uint64_t sp;        /* Stack pointer x31 */
+       uint64_t pc;        /* Program counter */
+       uint32_t cpsr;      /* Current program status register */
+       uint32_t reserved;  /* Reserved padding */
+       uint64_t far;       /* Virtual fault address */
+       uint32_t esr;       /* Exception syndrome register */
+       uint32_t exception; /* Exception number */
+#if defined(HAS_APPLE_PAC)
+       uint64_t jophash;
+#endif /* defined(HAS_APPLE_PAC) */
 };
 typedef struct arm_saved_state64 arm_saved_state64_t;
 
 #define ARM_SAVED_STATE64_COUNT ((mach_msg_type_number_t) \
-               (sizeof (arm_saved_state64_t)/sizeof(unsigned int)))
+       (sizeof(arm_saved_state64_t)/sizeof(unsigned int)))
 
 struct arm_saved_state64_tagged {
-       uint32_t                                        tag;
-       struct arm_saved_state64        state;
+       uint32_t                 tag;
+       struct arm_saved_state64 state;
 };
 typedef struct arm_saved_state64_tagged arm_saved_state64_tagged_t;
 
@@ -418,11 +472,85 @@ struct arm_saved_state {
                struct arm_saved_state64 ss_64;
        } uss;
 } __attribute__((aligned(16)));
-#define ss_32   uss.ss_32
-#define ss_64   uss.ss_64
+#define ss_32 uss.ss_32
+#define ss_64 uss.ss_64
 
 typedef struct arm_saved_state arm_saved_state_t;
 
+#if defined(XNU_KERNEL_PRIVATE)
+#if defined(HAS_APPLE_PAC)
+/*
+ * Methods used to sign and check thread state to detect corruptions of saved
+ * thread state across exceptions and context switches.
+ */
+/* Both take the saved state plus the pc, cpsr, lr, x16 and x17 values, in
+ * the order used by MANIPULATE_SIGNED_THREAD_STATE() below. */
+extern void ml_sign_thread_state(arm_saved_state_t *, uint64_t, uint32_t, uint64_t, uint64_t, uint64_t);
+
+extern void ml_check_signed_state(const arm_saved_state_t *, uint64_t, uint32_t, uint64_t, uint64_t, uint64_t);
+
+/* XXX: including stddef.h here breaks ctfmerge on some builds, so use __builtin_offsetof() instead of offsetof() */
+#define ss64_offsetof(x) __builtin_offsetof(struct arm_saved_state, ss_64.x)
+
+/**
+ * Verify the signed thread state in _iss, execute the assembly instructions
+ * _instr, and re-sign the modified thread state.  Varargs specify additional
+ * inputs.
+ *
+ * _instr may read or modify the thread state in the following registers:
+ *
+ * x0: _iss
+ * x1: authed _iss->ss_64.pc
+ * w2: authed _iss->ss_64.cpsr
+ * x3: authed _iss->ss_64.lr
+ * x4: authed _iss->ss_64.x16
+ * x5: authed _iss->ss_64.x17
+ * x6: scratch register
+ * x7: scratch register
+ */
+#define MANIPULATE_SIGNED_THREAD_STATE(_iss, _instr, ...)               \
+       asm volatile (                                                  \
+               "mov    x8, lr"                         "\n"            \
+               "mov    x0, %[iss]"                     "\n"            \
+               "ldp    x4, x5, [x0, %[SS64_X16]]"      "\n"            \
+               "ldr    x6, [x0, %[SS64_PC]]"           "\n"            \
+               "ldr    w7, [x0, %[SS64_CPSR]]"         "\n"            \
+               "ldr    x3, [x0, %[SS64_LR]]"           "\n"            \
+               "mov    x1, x6"                         "\n"            \
+               "mov    w2, w7"                         "\n"            \
+               "bl     _ml_check_signed_state"         "\n"            \
+               "mov    x1, x6"                         "\n"            \
+               "mov    w2, w7"                         "\n"            \
+               _instr                                  "\n"            \
+               "bl     _ml_sign_thread_state"          "\n"            \
+               "mov    lr, x8"                         "\n"            \
+               :                                                       \
+               : [iss]         "r"(_iss),                              \
+                 [SS64_X16]    "i"(ss64_offsetof(x[16])),              \
+                 [SS64_PC]     "i"(ss64_offsetof(pc)),                 \
+                 [SS64_CPSR]   "i"(ss64_offsetof(cpsr)),               \
+                 [SS64_LR]     "i"(ss64_offsetof(lr)),##__VA_ARGS__    \
+               : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8"  \
+       )
+
+static inline void
+check_and_sign_copied_thread_state(arm_saved_state_t *dst, const arm_saved_state_t *src)
+{
+       MANIPULATE_SIGNED_THREAD_STATE(src,
+           "mov        x0, %[dst]",
+           [dst] "r"(dst)
+           );
+}
+#endif /* defined(HAS_APPLE_PAC) */
+
+static inline void
+copy_signed_thread_state(arm_saved_state_t *dst, const arm_saved_state_t *src)
+{
+       *dst = *src;
+#if defined(HAS_APPLE_PAC)
+       check_and_sign_copied_thread_state(dst, src);
+#endif /* defined(HAS_APPLE_PAC) */
+}
+
+#endif /* defined(XNU_KERNEL_PRIVATE) */
 
 static inline boolean_t
 is_saved_state32(const arm_saved_state_t *iss)
@@ -466,13 +594,41 @@ get_saved_state_pc(const arm_saved_state_t *iss)
        return is_saved_state32(iss) ? const_saved_state32(iss)->pc : const_saved_state64(iss)->pc;
 }
 
+static inline void
+add_saved_state_pc(arm_saved_state_t *iss, int diff)
+{
+       if (is_saved_state32(iss)) {
+               uint64_t pc = saved_state32(iss)->pc + diff;
+               saved_state32(iss)->pc = CAST_ASSERT_SAFE(uint32_t, pc);
+       } else {
+#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC)
+               MANIPULATE_SIGNED_THREAD_STATE(iss,
+                   "mov        w6, %w[diff]            \n"
+                   "add        x1, x1, w6, sxtw        \n"
+                   "str        x1, [x0, %[SS64_PC]]    \n",
+                   [diff] "r"(diff)
+                   );
+#else
+               saved_state64(iss)->pc += diff;
+#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */
+       }
+}
+
 static inline void
 set_saved_state_pc(arm_saved_state_t *iss, register_t pc)
 {
        if (is_saved_state32(iss)) {
                saved_state32(iss)->pc = CAST_ASSERT_SAFE(uint32_t, pc);
        } else {
+#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC)
+               MANIPULATE_SIGNED_THREAD_STATE(iss,
+                   "mov        x1, %[pc]               \n"
+                   "str        x1, [x0, %[SS64_PC]]    \n",
+                   [pc] "r"(pc)
+                   );
+#else
                saved_state64(iss)->pc = pc;
+#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */
        }
 }
 
@@ -504,7 +660,15 @@ set_saved_state_lr(arm_saved_state_t *iss, register_t lr)
        if (is_saved_state32(iss)) {
                saved_state32(iss)->lr = CAST_ASSERT_SAFE(uint32_t, lr);
        } else {
+#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC)
+               MANIPULATE_SIGNED_THREAD_STATE(iss,
+                   "mov        x3, %[lr]               \n"
+                   "str        x3, [x0, %[SS64_LR]]    \n",
+                   [lr] "r"(lr)
+                   );
+#else
                saved_state64(iss)->lr = lr;
+#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */
        }
 }
 
@@ -550,6 +714,25 @@ set_saved_state_reg(arm_saved_state_t *iss, unsigned reg, register_t value)
        if (is_saved_state32(iss)) {
                saved_state32(iss)->r[reg] = CAST_ASSERT_SAFE(uint32_t, value);
        } else {
+#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC)
+               /* x16 and x17 are part of the jophash */
+               if (reg == 16) {
+                       MANIPULATE_SIGNED_THREAD_STATE(iss,
+                           "mov        x4, %[value]            \n"
+                           "str        x4, [x0, %[SS64_X16]]   \n",
+                           [value] "r"(value)
+                           );
+                       return;
+               } else if (reg == 17) {
+                       MANIPULATE_SIGNED_THREAD_STATE(iss,
+                           "mov        x5, %[value]            \n"
+                           "str        x5, [x0, %[SS64_X17]]   \n",
+                           [value] "r"(value),
+                           [SS64_X17] "i"(ss64_offsetof(x[17]))
+                           );
+                       return;
+               }
+#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */
                saved_state64(iss)->x[reg] = value;
        }
 }
@@ -560,13 +743,45 @@ get_saved_state_cpsr(const arm_saved_state_t *iss)
        return is_saved_state32(iss) ? const_saved_state32(iss)->cpsr : const_saved_state64(iss)->cpsr;
 }
 
+static inline void
+mask_saved_state_cpsr(arm_saved_state_t *iss, uint32_t set_bits, uint32_t clear_bits)
+{
+       if (is_saved_state32(iss)) {
+               saved_state32(iss)->cpsr |= set_bits;
+               saved_state32(iss)->cpsr &= ~clear_bits;
+       } else {
+#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC)
+               MANIPULATE_SIGNED_THREAD_STATE(iss,
+                   "mov        w6, %w[set_bits]        \n"
+                   "orr        w2, w2, w6, lsl #0      \n"
+                   "mov        w6, %w[clear_bits]      \n"
+                   "bic        w2, w2, w6, lsl #0      \n"
+                   "str        w2, [x0, %[SS64_CPSR]]  \n",
+                   [set_bits] "r"(set_bits),
+                   [clear_bits] "r"(clear_bits)
+                   );
+#else
+               saved_state64(iss)->cpsr |= set_bits;
+               saved_state64(iss)->cpsr &= ~clear_bits;
+#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */
+       }
+}
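
As a usage illustration: single-step support can arm and disarm the PSTATE
software-step bit through this helper while keeping the thread-state signature
valid on PAC hardware. PSR64_SS is assumed here to come from
osfmk/arm64/proc_reg.h; it does not appear in this diff.

    mask_saved_state_cpsr(iss, PSR64_SS, 0);   /* arm software single step */
    mask_saved_state_cpsr(iss, 0, PSR64_SS);   /* disarm it again */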
+
 static inline void
 set_saved_state_cpsr(arm_saved_state_t *iss, uint32_t cpsr)
 {
        if (is_saved_state32(iss)) {
                saved_state32(iss)->cpsr = cpsr;
        } else {
+#if defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC)
+               MANIPULATE_SIGNED_THREAD_STATE(iss,
+                   "mov        w2, %w[cpsr]            \n"
+                   "str        w2, [x0, %[SS64_CPSR]]  \n",
+                   [cpsr] "r"(cpsr)
+                   );
+#else
                saved_state64(iss)->cpsr = cpsr;
+#endif /* defined(XNU_KERNEL_PRIVATE) && defined(HAS_APPLE_PAC) */
        }
 }
 
@@ -626,10 +841,10 @@ get_saved_state_svc_number(const arm_saved_state_t *iss)
        return is_saved_state32(iss) ? (int)const_saved_state32(iss)->r[12] : (int)const_saved_state64(iss)->x[ARM64_SYSCALL_CODE_REG_NUM]; /* Only first word counts here */
 }
 
-typedef _STRUCT_ARM_LEGACY_DEBUG_STATE          arm_legacy_debug_state_t;
+typedef _STRUCT_ARM_LEGACY_DEBUG_STATE arm_legacy_debug_state_t;
 
 struct arm_debug_aggregate_state {
-       arm_state_hdr_t         dsh;
+       arm_state_hdr_t dsh;
        union {
                arm_debug_state32_t ds32;
                arm_debug_state64_t ds64;
@@ -639,7 +854,7 @@ struct arm_debug_aggregate_state {
 typedef struct arm_debug_aggregate_state arm_debug_state_t;
 
 #define ARM_LEGACY_DEBUG_STATE_COUNT ((mach_msg_type_number_t) \
-   (sizeof (arm_legacy_debug_state_t)/sizeof(uint32_t)))
+       (sizeof (arm_legacy_debug_state_t)/sizeof(uint32_t)))
 
 /*
  * NEON context
@@ -650,31 +865,31 @@ typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
 
 struct arm_neon_saved_state32 {
        union {
-               uint128_t       q[16];
-               uint64_t        d[32];
-               uint32_t        s[32];
+               uint128_t q[16];
+               uint64_t  d[32];
+               uint32_t  s[32];
        } v;
-       uint32_t                fpsr;
-       uint32_t                fpcr;
+       uint32_t fpsr;
+       uint32_t fpcr;
 };
 typedef struct arm_neon_saved_state32 arm_neon_saved_state32_t;
 
 #define ARM_NEON_SAVED_STATE32_COUNT ((mach_msg_type_number_t) \
-               (sizeof (arm_neon_saved_state32_t)/sizeof(unsigned int)))
+       (sizeof (arm_neon_saved_state32_t)/sizeof(unsigned int)))
 
 struct arm_neon_saved_state64 {
        union {
-               uint128_t               q[32];
-               uint64x2_t              d[32];
-               uint32x4_t              s[32];
+               uint128_t  q[32];
+               uint64x2_t d[32];
+               uint32x4_t s[32];
        } v;
-       uint32_t                fpsr;
-       uint32_t                fpcr;
+       uint32_t fpsr;
+       uint32_t fpcr;
 };
 typedef struct arm_neon_saved_state64 arm_neon_saved_state64_t;
 
 #define ARM_NEON_SAVED_STATE64_COUNT ((mach_msg_type_number_t) \
-               (sizeof (arm_neon_saved_state64_t)/sizeof(unsigned int)))
+       (sizeof (arm_neon_saved_state64_t)/sizeof(unsigned int)))
 
 struct arm_neon_saved_state {
        arm_state_hdr_t nsh;
@@ -684,8 +899,8 @@ struct arm_neon_saved_state {
        } uns;
 };
 typedef struct arm_neon_saved_state arm_neon_saved_state_t;
-#define ns_32   uns.ns_32
-#define ns_64   uns.ns_64
+#define ns_32 uns.ns_32
+#define ns_64 uns.ns_64
 
 static inline boolean_t
 is_neon_saved_state32(const arm_neon_saved_state_t *state)
@@ -725,13 +940,13 @@ typedef struct arm_context arm_context_t;
 extern void saved_state_to_thread_state64(const arm_saved_state_t*, arm_thread_state64_t*);
 extern void thread_state64_to_saved_state(const arm_thread_state64_t*, arm_saved_state_t*);
 
-#else
+#else /* defined(__arm__) */
 #error Unknown arch
-#endif
+#endif /* defined(__arm__) */
 
 extern void saved_state_to_thread_state32(const arm_saved_state_t*, arm_thread_state32_t*);
 extern void thread_state32_to_saved_state(const arm_thread_state32_t*, arm_saved_state_t*);
 
 #endif /* XNU_KERNEL_PRIVATE */
 
-#endif    /* _ARM_THREAD_STATUS_H_ */
+#endif /* _ARM_THREAD_STATUS_H_ */
index 8f43cebb48371e3842b6417e19c04200e65a117d..12939b6eebef61af4a1802fadf748e9dfc728700 100644
@@ -153,8 +153,11 @@ extern unsigned         PAGE_SHIFT_CONST;
 #define VM_MAX_ADDRESS          ((vm_address_t) 0x0000000080000000ULL)
 
 /* system-wide values */
-#define MACH_VM_MIN_ADDRESS     ((mach_vm_offset_t) 0x0ULL)
-#define MACH_VM_MAX_ADDRESS     ((mach_vm_offset_t) 0x0000000FC0000000ULL)
+#define MACH_VM_MIN_ADDRESS_RAW 0x0ULL
+#define MACH_VM_MAX_ADDRESS_RAW 0x0000000FC0000000ULL
+#define MACH_VM_MIN_ADDRESS     ((mach_vm_offset_t) MACH_VM_MIN_ADDRESS_RAW)
+#define MACH_VM_MAX_ADDRESS     ((mach_vm_offset_t) MACH_VM_MAX_ADDRESS_RAW)
+
 
 #else
 #error architecture not supported
@@ -177,7 +180,7 @@ extern unsigned         PAGE_SHIFT_CONST;
  */
 #define VM_KERNEL_POINTER_SIGNIFICANT_BITS  37
 #define VM_MIN_KERNEL_ADDRESS   ((vm_address_t) 0xffffffe000000000ULL)
-#define VM_MAX_KERNEL_ADDRESS   ((vm_address_t) 0xfffffff3ffffffffULL)
+#define VM_MAX_KERNEL_ADDRESS   ((vm_address_t) 0xfffffffbffffffffULL)
 #else
 #error architecture not supported
 #endif
@@ -185,7 +188,12 @@ extern unsigned         PAGE_SHIFT_CONST;
 #define VM_MIN_KERNEL_AND_KEXT_ADDRESS  \
                                VM_MIN_KERNEL_ADDRESS
 
+#if __has_feature(ptrauth_calls)
+#include <ptrauth.h>
+#define VM_KERNEL_STRIP_PTR(_v) (ptrauth_strip((void *)(uintptr_t)(_v), ptrauth_key_asia))
+#else /* !ptrauth_calls */
 #define VM_KERNEL_STRIP_PTR(_v) (_v)
+#endif /* ptrauth_calls */
 
 #define VM_KERNEL_ADDRESS(_va)  \
        ((((vm_address_t)VM_KERNEL_STRIP_PTR(_va)) >= VM_MIN_KERNEL_ADDRESS) && \
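
A brief sketch of the distinction this macro encodes (generic ptrauth usage,
not from this diff): ptrauth_strip() drops the PAC bits without authenticating,
which is exactly what an address-range check such as VM_KERNEL_ADDRESS() needs;
the stripped value must never be called through. some_kernel_function is a
hypothetical symbol.

    #include <ptrauth.h>

    void *signed_fn = (void *)&some_kernel_function;
    void *raw = ptrauth_strip(signed_fn, ptrauth_key_asia);
    /* raw is fine for comparisons against VM_MIN/MAX_KERNEL_ADDRESS,
     * but an authenticated call through it would fail. */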
@@ -198,11 +206,18 @@ extern unsigned         PAGE_SHIFT_CONST;
 extern unsigned long            gVirtBase, gPhysBase, gPhysSize;
 
 #define isphysmem(a)            (((vm_address_t)(a) - gPhysBase) < gPhysSize)
+#define physmap_enclosed(a)     isphysmem(a)
 
 #if KASAN
 /* Increase the stack sizes to account for the redzones that get added to every
  * stack object. */
 # define KERNEL_STACK_SIZE      (4*4*4096)
+#elif DEBUG
+/**
+ * Increase the stack size to account for less efficient use of stack space when
+ * compiling with -O0.
+ */
+# define KERNEL_STACK_SIZE      (2*4*4096)
 #else
 # define KERNEL_STACK_SIZE      (4*4096)
 #endif
index 2974440ffc8659042c3adc08e4d046fcbe56d62a..82be1cf5a252048332c14ac577b0f5c4fd291742 100644
@@ -138,11 +138,17 @@ struct coalition_resource_usage {
        uint64_t logical_deferred_writes;
        uint64_t logical_invalidated_writes;
        uint64_t logical_metadata_writes;
+       /* As above, but accounted separately for writes to external storage. */
+       uint64_t logical_immediate_writes_to_external;
+       uint64_t logical_deferred_writes_to_external;
+       uint64_t logical_invalidated_writes_to_external;
+       uint64_t logical_metadata_writes_to_external;
        uint64_t energy_billed_to_me;
        uint64_t energy_billed_to_others;
        uint64_t cpu_ptime;
        uint64_t cpu_time_eqos_len;     /* Stores the number of thread QoS types */
        uint64_t cpu_time_eqos[COALITION_NUM_THREAD_QOS_TYPES];
+       uint64_t cpu_instructions;      /* CPU instructions retired by the coalition */
+       uint64_t cpu_cycles;            /* CPU cycles consumed by the coalition */
 };
 
 #ifdef PRIVATE
@@ -158,6 +164,9 @@ struct coalition_resource_usage {
 #define COALITION_INFO_SET_NAME 2
 #define COALITION_INFO_SET_EFFICIENCY 3
 
+/* coalition_ledger_set operations */
+#define COALITION_LEDGER_SET_LOGICAL_WRITES_LIMIT 1
+
 #define COALITION_EFFICIENCY_VALID_FLAGS    (COALITION_FLAGS_EFFICIENT)
 
 /* structure returned from libproc coalition listing interface */
index 83c8c90e7720f84300083195613cecb90673cfee..31ee691b747c9c42a121d44aef7453fa4375e485 100644
  *     the thread identity and state.
  */
 
+#define MACH_EXCEPTION_ERRORS           0x40000000
+/*     Include additional exception-specific errors; not used yet.  */
+
 #define MACH_EXCEPTION_CODES            0x80000000
 /*     Send 64-bit code and subcode in the exception header */
 
+#define MACH_EXCEPTION_MASK             (MACH_EXCEPTION_CODES | MACH_EXCEPTION_ERRORS)
 /*
  * Masks for exception definitions, above
  * bit zero is unused, therefore 1 word = 31 exception types
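
For reference, a hedged sketch of how this flag is consumed (standard Mach
exception-port registration, not shown in this diff): a handler that wants
64-bit codes ORs MACH_EXCEPTION_CODES into the behavior. handler_port is a
caller-supplied receive right with a send right inserted.

    kern_return_t kr = task_set_exception_ports(mach_task_self(),
        EXC_MASK_BAD_ACCESS, handler_port,
        EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES,
        THREAD_STATE_NONE);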
diff --git a/osfmk/mach/fairplayd_notification.defs b/osfmk/mach/fairplayd_notification.defs
new file mode 100644
index 0000000..31250ed
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ *  Interface definition for the fairplay upcall mechanism.
+ */
+
+subsystem
+#if KERNEL_USER
+    KernelUser
+#endif /* KERNEL_USER */
+    fairplay 41471;
+
+#include <mach/std_types.defs>
+#include <mach/mach_types.defs>
+
+/*
+ * Notification from the kernel requesting a new arcade service
+ * port from fairplayd. Instead of replying with a port, the
+ * new port is passed to the arcade_provider port sent here via
+ * arcade_set_upcall_port().
+ */
+simpleroutine fairplayd_arcade_request(
+              fairplayd_port    : mach_port_t;
+              arcade_reg_port   : mach_port_t);
+
index 2bca65e0d02f991bca61c531d4ce3010d181d4b2..12d8b3e454c43691c1de2a70a46e5b8bca653007 100644
@@ -125,7 +125,7 @@ typedef struct host_can_has_debugger_info       *host_can_has_debugger_info_t;
 #define HOST_CAN_HAS_DEBUGGER_COUNT ((mach_msg_type_number_t) \
                (sizeof(host_can_has_debugger_info_data_t)/sizeof(integer_t)))
 
-#pragma pack(4)
+#pragma pack(push, 4)
 
 struct host_basic_info {
        integer_t               max_cpus;               /* max number of CPUs possible */
@@ -141,7 +141,7 @@ struct host_basic_info {
        uint64_t                max_mem;                /* actual size of physical memory */
 };
 
-#pragma pack()
+#pragma pack(pop)
 
 typedef struct host_basic_info  host_basic_info_data_t;
 typedef struct host_basic_info  *host_basic_info_t;
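The push/pop change is ABI-neutral; callers use the flavor exactly as before. A minimal sketch:

    #include <mach/mach.h>

    host_basic_info_data_t info;
    mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;

    kern_return_t kr = host_info(mach_host_self(), HOST_BASIC_INFO,
        (host_info_t)&info, &count);
    if (kr == KERN_SUCCESS) {
            /* info.max_cpus, info.max_mem, ... as declared above */
    }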
index f4632ed1386c08842ec52ff6789edcdf60c73392..d09b44b6b624b9fe1dcf928953ddd0819390130d 100644 (file)
 #define HOST_RESOURCE_NOTIFY_PORT       (20 + HOST_MAX_SPECIAL_KERNEL_PORT)
 #define HOST_CLOSURED_PORT              (21 + HOST_MAX_SPECIAL_KERNEL_PORT)
 #define HOST_SYSPOLICYD_PORT            (22 + HOST_MAX_SPECIAL_KERNEL_PORT)
+#define HOST_FILECOORDINATIOND_PORT     (23 + HOST_MAX_SPECIAL_KERNEL_PORT)
+#define HOST_FAIRPLAYD_PORT             (24 + HOST_MAX_SPECIAL_KERNEL_PORT)
 
-#define HOST_MAX_SPECIAL_PORT           HOST_SYSPOLICYD_PORT
+#define HOST_MAX_SPECIAL_PORT           HOST_FAIRPLAYD_PORT
 /* MAX = last since rdar://35861175 */
 
 /* obsolete name */
 #define host_set_syspolicyd_port(host, port)    \
        (host_set_special_port((host), HOST_SYSPOLICYD_PORT, (port)))
 
+#define host_get_filecoordinationd_port(host, port)     \
+       (host_get_special_port((host),                  \
+       HOST_LOCAL_NODE, HOST_FILECOORDINATIOND_PORT, (port)))
+#define host_set_filecoordinationd_port(host, port)     \
+       (host_set_special_port((host), HOST_FILECOORDINATIOND_PORT, (port)))
+
+#define host_get_fairplayd_port(host, port)     \
+       (host_get_special_port((host),                  \
+       HOST_LOCAL_NODE, HOST_FAIRPLAYD_PORT, (port)))
+#define host_set_fairplayd_port(host, port)     \
+       (host_set_special_port((host), HOST_FAIRPLAYD_PORT, (port)))
+
 /* HOST_RESOURCE_NOTIFY_PORT doesn't #define these conveniences.
  *  All lookups go through send_resource_violation()
  */
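The new wrappers follow the existing pattern; setting a host special port is a privileged operation on the host port. A sketch (the daemon-side checkin shown is an assumption):

    /* Sketch: a daemon hands its service port to the kernel at checkin. */
    kern_return_t kr = host_set_fairplayd_port(host_priv, service_port);

    /* Privileged lookups can later fetch it: */
    mach_port_t p = MACH_PORT_NULL;
    kr = host_get_fairplayd_port(mach_host_self(), &p);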
index 3f63a005815c61fc424ea37055aa1bd9397720b6..b998ba05605c101f627e745fe12c9c7748855c61 100644 (file)
@@ -624,6 +624,12 @@ _STRUCT_X86_DEBUG_STATE32
 };
 #endif /* !__DARWIN_UNIX03 */
 
+#define        _STRUCT_X86_PAGEIN_STATE        struct __x86_pagein_state
+_STRUCT_X86_PAGEIN_STATE
+{
+       int __pagein_error;
+};
+
 /*
  * 64 bit versions of the above
  */
@@ -690,7 +696,7 @@ _STRUCT_X86_THREAD_STATE64
 #define        _STRUCT_X86_THREAD_FULL_STATE64 struct __darwin_x86_thread_full_state64
 _STRUCT_X86_THREAD_FULL_STATE64
 {
-       _STRUCT_X86_THREAD_STATE64      ss64;
+       _STRUCT_X86_THREAD_STATE64      __ss64;
        __uint64_t                      __ds;
        __uint64_t                      __es;
        __uint64_t                      __ss;
index 324ac645bc0fa6511077cc260e7e6452c566fb3e..2744c0be6ee0422b2966569867debc635ed28ab2 100644 (file)
 #define x86_AVX_STATE32                 16
 #define x86_AVX_STATE64                 (x86_AVX_STATE32 + 1)
 #define x86_AVX_STATE                   (x86_AVX_STATE32 + 2)
-#if !defined(RC_HIDE_XNU_J137)
 #define x86_AVX512_STATE32              19
 #define x86_AVX512_STATE64              (x86_AVX512_STATE32 + 1)
 #define x86_AVX512_STATE                (x86_AVX512_STATE32 + 2)
-#endif /* not RC_HIDE_XNU_J137 */
+#define x86_PAGEIN_STATE                22
 #define x86_THREAD_FULL_STATE64         23
 
 /*
  * platform. The macro must be manually updated to include all of the valid
  * exception flavors as defined above.
  */
-#if !defined(RC_HIDE_XNU_J137)
-#define VALID_THREAD_STATE_FLAVOR(x)       \
-        ((x == x86_THREAD_STATE32)      || \
-         (x == x86_FLOAT_STATE32)       || \
-         (x == x86_EXCEPTION_STATE32)   || \
-         (x == x86_DEBUG_STATE32)       || \
-         (x == x86_THREAD_STATE64)      || \
-         (x == x86_THREAD_FULL_STATE64) || \
-         (x == x86_FLOAT_STATE64)       || \
-         (x == x86_EXCEPTION_STATE64)   || \
-         (x == x86_DEBUG_STATE64)       || \
-         (x == x86_THREAD_STATE)        || \
-         (x == x86_FLOAT_STATE)         || \
-         (x == x86_EXCEPTION_STATE)     || \
-         (x == x86_DEBUG_STATE)         || \
-         (x == x86_AVX_STATE32)         || \
-         (x == x86_AVX_STATE64)         || \
-         (x == x86_AVX_STATE)           || \
-         (x == x86_AVX512_STATE32)      || \
-         (x == x86_AVX512_STATE64)      || \
-         (x == x86_AVX512_STATE)        || \
+#define VALID_THREAD_STATE_FLAVOR(x)            \
+        ((x == x86_THREAD_STATE32)             || \
+         (x == x86_FLOAT_STATE32)              || \
+         (x == x86_EXCEPTION_STATE32)          || \
+         (x == x86_DEBUG_STATE32)              || \
+         (x == x86_THREAD_STATE64)             || \
+         (x == x86_THREAD_FULL_STATE64)        || \
+         (x == x86_FLOAT_STATE64)              || \
+         (x == x86_EXCEPTION_STATE64)          || \
+         (x == x86_DEBUG_STATE64)              || \
+         (x == x86_THREAD_STATE)               || \
+         (x == x86_FLOAT_STATE)                || \
+         (x == x86_EXCEPTION_STATE)            || \
+         (x == x86_DEBUG_STATE)                || \
+         (x == x86_AVX_STATE32)                || \
+         (x == x86_AVX_STATE64)                || \
+         (x == x86_AVX_STATE)                  || \
+         (x == x86_AVX512_STATE32)             || \
+         (x == x86_AVX512_STATE64)             || \
+         (x == x86_AVX512_STATE)               || \
+         (x == x86_PAGEIN_STATE)               || \
          (x == THREAD_STATE_NONE))
-#else
-#define VALID_THREAD_STATE_FLAVOR(x)       \
-        ((x == x86_THREAD_STATE32)     || \
-         (x == x86_FLOAT_STATE32)      || \
-         (x == x86_EXCEPTION_STATE32)  || \
-         (x == x86_DEBUG_STATE32)      || \
-         (x == x86_THREAD_STATE64)     || \
-         (x == x86_FLOAT_STATE64)      || \
-         (x == x86_EXCEPTION_STATE64)  || \
-         (x == x86_DEBUG_STATE64)      || \
-         (x == x86_THREAD_STATE)       || \
-         (x == x86_FLOAT_STATE)        || \
-         (x == x86_EXCEPTION_STATE)    || \
-         (x == x86_DEBUG_STATE)        || \
-         (x == x86_AVX_STATE32)        || \
-         (x == x86_AVX_STATE64)        || \
-         (x == x86_AVX_STATE)          || \
-         (x == THREAD_STATE_NONE))
-#endif /* not RC_HIDE_XNU_J137 */
 
 struct x86_state_hdr {
        uint32_t        flavor;
@@ -221,11 +201,9 @@ typedef _STRUCT_X86_AVX_STATE32 x86_avx_state32_t;
 #define x86_AVX_STATE32_COUNT ((mach_msg_type_number_t) \
                (sizeof(x86_avx_state32_t)/sizeof(unsigned int)))
 
-#if !defined(RC_HIDE_XNU_J137)
 typedef _STRUCT_X86_AVX512_STATE32 x86_avx512_state32_t;
 #define x86_AVX512_STATE32_COUNT ((mach_msg_type_number_t) \
                (sizeof(x86_avx512_state32_t)/sizeof(unsigned int)))
-#endif /* not RC_HIDE_XNU_J137 */
 
 /*
  * to be deprecated in the future
@@ -262,11 +240,9 @@ typedef _STRUCT_X86_AVX_STATE64 x86_avx_state64_t;
 #define x86_AVX_STATE64_COUNT ((mach_msg_type_number_t) \
                (sizeof(x86_avx_state64_t)/sizeof(unsigned int)))
 
-#if !defined(RC_HIDE_XNU_J137)
 typedef _STRUCT_X86_AVX512_STATE64 x86_avx512_state64_t;
 #define x86_AVX512_STATE64_COUNT ((mach_msg_type_number_t) \
                (sizeof(x86_avx512_state64_t)/sizeof(unsigned int)))
-#endif /* not RC_HIDE_XNU_J137 */
 
 typedef _STRUCT_X86_EXCEPTION_STATE64 x86_exception_state64_t;
 #define x86_EXCEPTION_STATE64_COUNT     ((mach_msg_type_number_t) \
@@ -280,6 +256,12 @@ typedef _STRUCT_X86_DEBUG_STATE64 x86_debug_state64_t;
 
 #define X86_DEBUG_STATE64_COUNT x86_DEBUG_STATE64_COUNT
 
+typedef _STRUCT_X86_PAGEIN_STATE x86_pagein_state_t;
+#define x86_PAGEIN_STATE_COUNT \
+    ((mach_msg_type_number_t)(sizeof(x86_pagein_state_t) / sizeof(int)))
+
+#define X86_PAGEIN_STATE_COUNT x86_PAGEIN_STATE_COUNT
+
 /*
  * Combined thread, float and exception states
  */
@@ -323,7 +305,6 @@ struct x86_avx_state {
        } ufs;
 };
 
-#if !defined(RC_HIDE_XNU_J137)
 struct x86_avx512_state {
        x86_state_hdr_t                 ash;
        union {
@@ -331,7 +312,6 @@ struct x86_avx512_state {
                x86_avx512_state64_t    as64;
        } ufs;
 };
-#endif /* not RC_HIDE_XNU_J137 */
 
 typedef struct x86_thread_state x86_thread_state_t;
 #define x86_THREAD_STATE_COUNT  ((mach_msg_type_number_t) \
@@ -353,11 +333,9 @@ typedef struct x86_avx_state x86_avx_state_t;
 #define x86_AVX_STATE_COUNT ((mach_msg_type_number_t) \
                (sizeof(x86_avx_state_t)/sizeof(unsigned int)))
 
-#if !defined(RC_HIDE_XNU_J137)
 typedef struct x86_avx512_state x86_avx512_state_t;
 #define x86_AVX512_STATE_COUNT ((mach_msg_type_number_t) \
                (sizeof(x86_avx512_state_t)/sizeof(unsigned int)))
-#endif /* not RC_HIDE_XNU_J137 */
 
 /*
  * Machine-independent way for servers and Mach's exception mechanism to
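A sketch of reading the new flavor through the generic thread_get_state() call; whether a given thread has page-in state to report is a runtime question:

    #include <mach/mach.h>

    x86_pagein_state_t ps;
    mach_msg_type_number_t cnt = x86_PAGEIN_STATE_COUNT;

    kern_return_t kr = thread_get_state(thread, x86_PAGEIN_STATE,
        (thread_state_t)&ps, &cnt);
    if (kr == KERN_SUCCESS && ps.__pagein_error != 0) {
            /* a page-in error was recorded for this thread */
    }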
index 361cfdfd25efd34b8272688ca9f4b315762a904a..a4fba6d3ac6471b2f465636237b2c00735d9dc92 100644 (file)
@@ -67,7 +67,7 @@ typedef kern_return_t kmod_stop_func_t(struct kmod_info * ki, void * data);
 *
 * All structures must be #pragma pack(4).
 ***********************************************************************/
-#pragma pack(4)
+#pragma pack(push, 4)
 
 /* Run-time struct only; never saved to a file */
 typedef struct kmod_reference {
@@ -133,7 +133,7 @@ typedef struct kmod_info_64_v1 {
        uint64_t            stop_addr;
 } kmod_info_64_v1_t;
 
-#pragma pack()
+#pragma pack(pop)
 
 #if PRAGMA_MARK
 #pragma mark Kmod structure declaration macros
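The motivation for the push/pop form: a bare #pragma pack() resets packing to the compiler default, silently clobbering whatever packing an includer had in effect, while pack(pop) restores it. A small illustration:

    #pragma pack(push, 1)        /* includer wants 1-byte packing */
    #include <mach/kmod.h>       /* header now does push(4) ... pop */
    struct wire { char tag; uint32_t value; };  /* still 1-byte packed */
    #pragma pack(pop)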
index 18e2cb68bb337477f9dcc47bea9fa6d7e523d609..fb426130163ba48b4099d22436a8098be42fde0a 100644 (file)
@@ -70,4 +70,7 @@
 
 #define TASK_PORT_REGISTER_MAX  3
 
+/* Maximum number of watchports per task */
+#define TASK_MAX_WATCHPORT_COUNT 32
+
 #endif  /* _MACH_MACH_PARAM_H_ */
index 5bc503421dfbbd14c92f51293ed3f6c7e668d3ef..ea3328933de73c099e427beef7a7de4b9b0de3f5 100644 (file)
@@ -623,4 +623,39 @@ routine mach_port_special_reply_port_reset_link(
 #else
 skip;
 #endif
+
+/*
+ *     Guard an already existing port. Allows guarding
+ *     receive rights only. Uses the context field in the
+ *     port structure to store the guard.
+ */
+routine mach_port_guard_with_flags(
+               task            : ipc_space_t;
+               name            : mach_port_name_t;
+#ifdef LIBSYSCALL_INTERFACE
+               guard           : mach_port_context_t;
+#else
+               guard           : uint64_t;
+#endif
+               flags           : uint64_t);
+
+/*
+ *     Swap guard value of an existing guarded port. Works
+ *     only if it is not a strict guard.
+ */
+routine mach_port_swap_guard(
+               task            : ipc_space_t;
+               name            : mach_port_name_t;
+#ifdef LIBSYSCALL_INTERFACE
+               old_guard       : mach_port_context_t;
+#else
+               old_guard       : uint64_t;
+#endif
+
+#ifdef LIBSYSCALL_INTERFACE
+               new_guard       : mach_port_context_t);
+#else
+               new_guard       : uint64_t);
+#endif
+
 /* vim: set ft=c : */
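A user-space sketch of the two calls via the generated MIG stubs (stub availability in Libsyscall is an assumption; guard flag values live alongside the existing mach_port_guard() definitions):

    mach_port_context_t guard = (mach_port_context_t)0x1234;

    /* Guard an existing receive right. */
    kern_return_t kr = mach_port_guard_with_flags(mach_task_self(), port,
        guard, /* flags */ 0);

    /* Rotate the guard value -- only legal when the guard is not strict. */
    if (kr == KERN_SUCCESS) {
            kr = mach_port_swap_guard(mach_task_self(), port,
                guard, (mach_port_context_t)0x5678);
    }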
index b41206e1f8838e03dd789062f564087cfe3aa915..f2601ade4141ea9b0fb28d9c733062d9b49a7b15 100644 (file)
@@ -54,7 +54,7 @@ kern_return_t           mach_wait_until(
 
 uint64_t                        mach_absolute_time(void);
 
-__OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_8_0)
+__OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0)
 uint64_t                        mach_approximate_time(void);
 
 /*
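Both functions return ticks in mach timebase units; mach_approximate_time() trades resolution for a cheaper read. Conversion sketch:

    #include <mach/mach_time.h>

    mach_timebase_info_data_t tb;
    mach_timebase_info(&tb);

    uint64_t t  = mach_approximate_time();      /* low-cost, coarse */
    uint64_t ns = t * tb.numer / tb.denom;      /* beware overflow for large t */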
index 2639dbfb1d475944afc46d59807d9f59abafa926..064514ebc111483d5b4d772023be2d45e407ded0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -320,6 +320,20 @@ extern kern_return_t mach_voucher_extract_attr_recipe_trap(
        mach_voucher_attr_raw_recipe_t recipe,
        mach_msg_type_number_t *recipe_size);
 
+extern kern_return_t _kernelrpc_mach_port_type_trap(
+       ipc_space_t task,
+       mach_port_name_t name,
+       mach_port_type_t *ptype);
+
+extern kern_return_t _kernelrpc_mach_port_request_notification_trap(
+       ipc_space_t task,
+       mach_port_name_t name,
+       mach_msg_id_t msgid,
+       mach_port_mscount_t sync,
+       mach_port_name_t notify,
+       mach_msg_type_name_t notifyPoly,
+       mach_port_name_t *previous);
+
 /*
  *     Obsolete interfaces.
  */
@@ -338,6 +352,11 @@ extern kern_return_t pid_for_task(
        mach_port_name_t t,
        int *x);
 
+extern kern_return_t debug_control_port_for_pid(
+       mach_port_name_t target_tport,
+       int pid,
+       mach_port_name_t *t);
+
 #else   /* KERNEL */
 
 #ifdef  XNU_KERNEL_PRIVATE
@@ -370,7 +389,7 @@ extern kern_return_t pid_for_task(
 #endif
 
 #define PAD_ARG_(arg_type, arg_name) \
-  char arg_name##_l_[PADL_(arg_type)]; arg_type arg_name; char arg_name##_r_[PADR_(arg_type)];
+  char arg_name##_l_[PADL_(arg_type)]; arg_type arg_name; char arg_name##_r_[PADR_(arg_type)]
 
 /*
  * To support 32-bit clients as well as 64-bit clients, argument
@@ -503,6 +522,14 @@ struct pid_for_task_args {
 extern kern_return_t pid_for_task(
        struct pid_for_task_args *args);
 
+struct debug_control_port_for_pid_args {
+       PAD_ARG_(mach_port_name_t, target_tport);
+       PAD_ARG_(int, pid);
+       PAD_ARG_(user_addr_t, t);
+};
+extern kern_return_t debug_control_port_for_pid(
+       struct debug_control_port_for_pid_args *args);
+
 struct macx_swapon_args {
        PAD_ARG_(uint64_t, filename);
        PAD_ARG_(int, flags);
@@ -814,6 +841,26 @@ struct mach_voucher_extract_attr_recipe_args {
 extern kern_return_t mach_voucher_extract_attr_recipe_trap(
        struct mach_voucher_extract_attr_recipe_args *args);
 
+struct _kernelrpc_mach_port_type_args {
+       PAD_ARG_(mach_port_name_t, target);
+       PAD_ARG_(mach_port_right_t, name);
+       PAD_ARG_(user_addr_t, ptype);
+};
+extern kern_return_t _kernelrpc_mach_port_type_trap(
+       struct _kernelrpc_mach_port_type_args *args);
+
+struct _kernelrpc_mach_port_request_notification_args {
+       PAD_ARG_(mach_port_name_t, target);
+       PAD_ARG_(mach_port_name_t, name);
+       PAD_ARG_(mach_msg_id_t, msgid);
+       PAD_ARG_(mach_port_mscount_t, sync);
+       PAD_ARG_(mach_port_name_t, notify);
+       PAD_ARG_(mach_msg_type_name_t, notifyPoly);
+       PAD_ARG_(user_addr_t, previous);
+};
+extern kern_return_t _kernelrpc_mach_port_request_notification_trap(
+       struct _kernelrpc_mach_port_request_notification_args *args);
+
 
 /* not published to LP64 clients yet */
 struct iokit_user_client_trap_args {
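The new traps fast-path existing MIG routines, so callers keep using the routine-level API; e.g. the long-standing wrapper that _kernelrpc_mach_port_type_trap can now back:

    mach_port_type_t ptype;
    kern_return_t kr = mach_port_type(mach_task_self(), name, &ptype);
    if (kr == KERN_SUCCESS && (ptype & MACH_PORT_TYPE_RECEIVE)) {
            /* name denotes a receive right in this task */
    }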
index 27dbd26a6115443b565e554f684324350ade66ca..d2e9fb0b4ecae222c9a9610a6e91450ab83be731 100644 (file)
@@ -218,6 +218,12 @@ type ipc_space_inspect_t = mach_port_t
 #endif /* KERNEL_SERVER */
                ;
 
+type arcade_register_t = mach_port_t
+#if    KERNEL_SERVER
+               intran: arcade_register_t convert_port_to_arcade_register(mach_port_t)
+#endif /* KERNEL_SERVER */
+               ;
+
 type vm_prot_t = int;
 type vm_inherit_t = int;
 type vm_purgable_t = int;
@@ -258,11 +264,12 @@ type thread_policy_t              = array[*:16] of integer_t;
                 * task_basic_info_64_2_t
                 * mach_task_basic_info_t (12 ints)
                 * task_power_info_t (18 ints)
+                * task_vm_info_t (87 ints)
                 * If other task_info flavors are added, this
                 * definition may need to be changed. (See
                 * mach/task_info.h and mach/policy.h) */
 type task_flavor_t             = int;
-type task_info_t               = array[*:52] of integer_t;
+type task_info_t               = array[*:87] of integer_t;
 
 type task_purgable_info_t      = struct[68] of integer_t;
 
@@ -272,6 +279,8 @@ type task_policy_t          = array[*:16] of integer_t;
 type task_inspect_flavor_t = natural_t;
 type task_inspect_info_t = array[*:4] of integer_t;
 
+type task_exc_guard_behavior_t = uint32_t;
+
 type mem_entry_name_port_t = mach_port_t
 #if     KERNEL_SERVER
                intran: mem_entry_name_port_t null_conversion(mach_port_t)
@@ -549,6 +558,8 @@ type task_suspension_token_t = mach_port_move_send_once_t
 #endif /* KERNEL_SERVER */
                ;
 
+type vfs_path_t = c_string[4096];
+type nspace_path_t = c_string[1024];   /* 1024 == PATH_MAX */
 
 /* public voucher types */
 
@@ -624,6 +635,9 @@ simport <kern/sync_lock.h>; /* for lock-set conversions */
 simport <kern/sync_sema.h>;    /* for semaphore conversions */
 simport <vm/memory_object.h>;  /* for memory object type conversions */
 simport <vm/vm_map.h>;         /* for vm_map conversions */
+#if CONFIG_ARCADE
+simport <kern/arcade.h>;    /* for arcade_register conversions */
+#endif
 #endif /* MACH_KERNEL_PRIVATE */
 
 simport <kern/ipc_mig.h>;      /* pick up kernel-specific MIG things */
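The wider task_info_t array is sized for TASK_VM_INFO replies (87 ints per the note above). Caller sketch:

    #include <mach/mach.h>

    task_vm_info_data_t vmi;
    mach_msg_type_number_t count = TASK_VM_INFO_COUNT;

    kern_return_t kr = task_info(mach_task_self(), TASK_VM_INFO,
        (task_info_t)&vmi, &count);
    if (kr == KERN_SUCCESS) {
            /* vmi.virtual_size, vmi.phys_footprint, ... */
    }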
index 480c607683fec74a91ba3e73192063818ad821a3..5430caaebb343b85e534f952609318724875eb76 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -134,6 +134,7 @@ typedef struct alarm                    *alarm_t;
 typedef struct clock                    *clock_serv_t;
 typedef struct clock                    *clock_ctrl_t;
 
+typedef struct arcade_register          *arcade_register_t;
 
 /*
  * OBSOLETE: lock_set interfaces are obsolete.
@@ -155,6 +156,8 @@ struct ledger;
 struct alarm;
 struct clock;
 
+struct arcade_register;
+
 __END_DECLS
 
 #endif  /* MACH_KERNEL_PRIVATE */
@@ -188,6 +191,7 @@ typedef mach_port_t             alarm_t;
 typedef mach_port_t             clock_serv_t;
 typedef mach_port_t             clock_ctrl_t;
 
+typedef mach_port_t             arcade_register_t;
 #endif  /* KERNEL */
 
 /*
@@ -254,21 +258,46 @@ typedef clock_serv_t            clock_serv_port_t;
 typedef clock_ctrl_t            clock_ctrl_port_t;
 typedef exception_handler_t     exception_port_t;
 typedef exception_handler_array_t exception_port_arrary_t;
+typedef char vfs_path_t[4096];
+typedef char nspace_path_t[1024]; /* 1024 == PATH_MAX */
 
-
+#ifdef KERNEL
+#define TASK_NULL               ((task_t) NULL)
+#define TASK_NAME_NULL          ((task_name_t) NULL)
+#define TASK_INSPECT_NULL               ((task_inspect_t) NULL)
+#define THREAD_NULL             ((thread_t) NULL)
+#define THREAD_INSPECT_NULL     ((thread_inspect_t)NULL)
+#define TID_NULL                ((uint64_t) NULL)
+#define THR_ACT_NULL            ((thread_act_t) NULL)
+#define IPC_SPACE_NULL          ((ipc_space_t) NULL)
+#define IPC_SPACE_INSPECT_NULL  ((ipc_space_inspect_t) NULL)
+#define COALITION_NULL          ((coalition_t) NULL)
+#define HOST_NULL               ((host_t) NULL)
+#define HOST_PRIV_NULL          ((host_priv_t)NULL)
+#define HOST_SECURITY_NULL      ((host_security_t)NULL)
+#define PROCESSOR_SET_NULL      ((processor_set_t) NULL)
+#define PROCESSOR_NULL          ((processor_t) NULL)
+#define SEMAPHORE_NULL          ((semaphore_t) NULL)
+#define LOCK_SET_NULL           ((lock_set_t) NULL)
+#define LEDGER_NULL             ((ledger_t) NULL)
+#define ALARM_NULL              ((alarm_t) NULL)
+#define CLOCK_NULL              ((clock_t) NULL)
+#define UND_SERVER_NULL         ((UNDServerRef) NULL)
+#define ARCADE_REG_NULL         ((arcade_register_t) NULL)
+#else
 #define TASK_NULL               ((task_t) 0)
 #define TASK_NAME_NULL          ((task_name_t) 0)
 #define TASK_INSPECT_NULL               ((task_inspect_t) 0)
 #define THREAD_NULL             ((thread_t) 0)
-#define THREAD_INSPECT_NULL     ((thread_inspect_t)0)
+#define THREAD_INSPECT_NULL     ((thread_inspect_t) 0)
 #define TID_NULL                ((uint64_t) 0)
 #define THR_ACT_NULL            ((thread_act_t) 0)
 #define IPC_SPACE_NULL          ((ipc_space_t) 0)
 #define IPC_SPACE_INSPECT_NULL  ((ipc_space_inspect_t) 0)
 #define COALITION_NULL          ((coalition_t) 0)
 #define HOST_NULL               ((host_t) 0)
-#define HOST_PRIV_NULL          ((host_priv_t)0)
-#define HOST_SECURITY_NULL      ((host_security_t)0)
+#define HOST_PRIV_NULL          ((host_priv_t) 0)
+#define HOST_SECURITY_NULL      ((host_security_t) 0)
 #define PROCESSOR_SET_NULL      ((processor_set_t) 0)
 #define PROCESSOR_NULL          ((processor_t) 0)
 #define SEMAPHORE_NULL          ((semaphore_t) 0)
@@ -277,6 +306,8 @@ typedef exception_handler_array_t exception_port_arrary_t;
 #define ALARM_NULL              ((alarm_t) 0)
 #define CLOCK_NULL              ((clock_t) 0)
 #define UND_SERVER_NULL         ((UNDServerRef) 0)
+#define ARCADE_REG_NULL         ((arcade_register_t) 0)
+#endif
 
 /* DEPRECATED */
 typedef natural_t       ledger_item_t;
index 6181a64e45c13d4df7e539cf61bf7e388235d9e6..f7a7afcbd7790a5e9c9764b6d33148d0131bd7f4 100644 (file)
@@ -158,7 +158,7 @@ typedef mach_voucher_attr_recipe_command_t *mach_voucher_attr_recipe_command_arr
  *
  * An element in a recipe list to create a voucher.
  */
-#pragma pack(1)
+#pragma pack(push, 1)
 
 typedef struct mach_voucher_attr_recipe_data {
        mach_voucher_attr_key_t                 key;
@@ -179,7 +179,7 @@ typedef mach_msg_type_number_t mach_voucher_attr_raw_recipe_array_size_t;
 #define MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE   5120
 #define MACH_VOUCHER_TRAP_STACK_LIMIT                 256
 
-#pragma pack()
+#pragma pack(pop)
 
 /*
  * VOUCHER ATTRIBUTE MANAGER Writer types
@@ -216,7 +216,7 @@ typedef mach_port_t             ipc_voucher_attr_control_t;
 struct ipc_voucher_attr_manager;
 struct ipc_voucher_attr_control;
 #endif
-typedef struct ipc_voucher_attr_manager *ipc_voucher_attr_manager_t;
+typedef const struct ipc_voucher_attr_manager *ipc_voucher_attr_manager_t;
 typedef struct ipc_voucher_attr_control *ipc_voucher_attr_control_t;
 #endif
 #define IPC_VOUCHER_ATTR_MANAGER_NULL ((ipc_voucher_attr_manager_t) 0)
index 672bd17cbfb14e8e570c8b5435b6fbed1779a784..654bfc30dbde79c96db74f23525b3d75d223ca0c 100644 (file)
@@ -129,6 +129,7 @@ __END_DECLS
  */
 #define CPU_ARCH_MASK           0xff000000      /* mask for architecture bits */
 #define CPU_ARCH_ABI64          0x01000000      /* 64 bit ABI */
+#define CPU_ARCH_ABI64_32       0x02000000      /* ABI for 64-bit hardware with 32-bit types; LP32 */
 
 /*
  *     Machine types known by all.
@@ -152,6 +153,7 @@ __END_DECLS
 #define CPU_TYPE_HPPA           ((cpu_type_t) 11)
 #define CPU_TYPE_ARM            ((cpu_type_t) 12)
 #define CPU_TYPE_ARM64          (CPU_TYPE_ARM | CPU_ARCH_ABI64)
+#define CPU_TYPE_ARM64_32       (CPU_TYPE_ARM | CPU_ARCH_ABI64_32)
 #define CPU_TYPE_MC88000        ((cpu_type_t) 13)
 #define CPU_TYPE_SPARC          ((cpu_type_t) 14)
 #define CPU_TYPE_I860           ((cpu_type_t) 15)
@@ -159,6 +161,7 @@ __END_DECLS
 /* skip                                ((cpu_type_t) 17)       */
 #define CPU_TYPE_POWERPC                ((cpu_type_t) 18)
 #define CPU_TYPE_POWERPC64              (CPU_TYPE_POWERPC | CPU_ARCH_ABI64)
+/* skip                                ((cpu_type_t) 19)       */
 
 /*
  *     Machine subtypes (these are defined here, instead of in a machine
@@ -352,22 +355,32 @@ __END_DECLS
 #define CPU_SUBTYPE_ARM_V6              ((cpu_subtype_t) 6)
 #define CPU_SUBTYPE_ARM_V5TEJ           ((cpu_subtype_t) 7)
 #define CPU_SUBTYPE_ARM_XSCALE          ((cpu_subtype_t) 8)
-#define CPU_SUBTYPE_ARM_V7              ((cpu_subtype_t) 9)
+#define CPU_SUBTYPE_ARM_V7              ((cpu_subtype_t) 9)  /* ARMv7-A and ARMv7-R */
 #define CPU_SUBTYPE_ARM_V7F             ((cpu_subtype_t) 10) /* Cortex A9 */
 #define CPU_SUBTYPE_ARM_V7S             ((cpu_subtype_t) 11) /* Swift */
 #define CPU_SUBTYPE_ARM_V7K             ((cpu_subtype_t) 12)
+#define CPU_SUBTYPE_ARM_V8              ((cpu_subtype_t) 13)
 #define CPU_SUBTYPE_ARM_V6M             ((cpu_subtype_t) 14) /* Not meant to be run under xnu */
 #define CPU_SUBTYPE_ARM_V7M             ((cpu_subtype_t) 15) /* Not meant to be run under xnu */
 #define CPU_SUBTYPE_ARM_V7EM            ((cpu_subtype_t) 16) /* Not meant to be run under xnu */
-
-#define CPU_SUBTYPE_ARM_V8              ((cpu_subtype_t) 13)
+#define CPU_SUBTYPE_ARM_V8M             ((cpu_subtype_t) 17) /* Not meant to be run under xnu */
 
 /*
  *  ARM64 subtypes
  */
 #define CPU_SUBTYPE_ARM64_ALL           ((cpu_subtype_t) 0)
 #define CPU_SUBTYPE_ARM64_V8            ((cpu_subtype_t) 1)
+#define CPU_SUBTYPE_ARM64E              ((cpu_subtype_t) 2)
 
+/* CPU subtype feature flags for ptrauth on arm64e platforms */
+#define CPU_SUBTYPE_ARM64_PTR_AUTH_MASK 0x0f000000
+#define CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(x) (((x) & CPU_SUBTYPE_ARM64_PTR_AUTH_MASK) >> 24)
+
+/*
+ *  ARM64_32 subtypes
+ */
+#define CPU_SUBTYPE_ARM64_32_ALL        ((cpu_subtype_t) 0)
+#define CPU_SUBTYPE_ARM64_32_V8         ((cpu_subtype_t) 1)
 
 #endif /* !__ASSEMBLER__ */
 
@@ -409,6 +422,7 @@ __END_DECLS
 #define CPUFAMILY_ARM_TWISTER           0x92fb37c8
 #define CPUFAMILY_ARM_HURRICANE         0x67ceee93
 #define CPUFAMILY_ARM_MONSOON_MISTRAL   0xe81e7ef6
+#define CPUFAMILY_ARM_VORTEX_TEMPEST    0x07d34b9f
 
 /* The following synonyms are deprecated: */
 #define CPUFAMILY_INTEL_6_23    CPUFAMILY_INTEL_PENRYN
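Decoding these at run time: the top byte of cpu_type_t carries the ABI bits, so arm64 and arm64_32 share the base CPU_TYPE_ARM value. A sketch using the hw.cputype sysctl:

    #include <sys/sysctl.h>
    #include <mach/machine.h>

    cpu_type_t cputype;
    size_t len = sizeof(cputype);
    if (sysctlbyname("hw.cputype", &cputype, &len, NULL, 0) == 0) {
            if ((cputype & ~CPU_ARCH_MASK) == CPU_TYPE_ARM) {
                    if (cputype & CPU_ARCH_ABI64) {
                            /* arm64; check subtype for CPU_SUBTYPE_ARM64E */
                    } else if (cputype & CPU_ARCH_ABI64_32) {
                            /* arm64_32: 64-bit hardware, 32-bit pointers */
                    }
            }
    }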
index cceb7b41968f3bd7892a7fc8e5ac031c5a5a56a1..9c24d5db7bbde841e3b84e30f9fe07d38e00d40f 100644 (file)
 
 #define DTRACE_PROBE1(provider, name, arg0) {                                                   \
        uintptr_t __dtrace_args[ARG1_EXTENT] __attribute__ ((aligned (16)));                    \
-       __dtrace_args[0] = (uintptr_t)arg0;                                                             \
+       __dtrace_args[0] = (uintptr_t)(arg0);                                                             \
        DTRACE_CALL1ARG(provider, name)                                                         \
 }
 
 #define DTRACE_PROBE2(provider, name, arg0, arg1) {                                             \
        uintptr_t __dtrace_args[ARGS2_EXTENT] __attribute__ ((aligned (16)));                   \
-       __dtrace_args[0] = (uintptr_t)arg0;                                                             \
-       __dtrace_args[1] = (uintptr_t)arg1;                                                             \
+       __dtrace_args[0] = (uintptr_t)(arg0);                                                             \
+       __dtrace_args[1] = (uintptr_t)(arg1);                                                             \
        DTRACE_CALL2ARGS(provider, name)                                                        \
 }
 
 #define DTRACE_PROBE3(provider, name, arg0, arg1, arg2) {                                       \
        uintptr_t __dtrace_args[ARGS3_EXTENT] __attribute__ ((aligned (16)));                   \
-       __dtrace_args[0] = (uintptr_t)arg0;                                                             \
-       __dtrace_args[1] = (uintptr_t)arg1;                                                             \
-       __dtrace_args[2] = (uintptr_t)arg2;                                                             \
+       __dtrace_args[0] = (uintptr_t)(arg0);                                                             \
+       __dtrace_args[1] = (uintptr_t)(arg1);                                                             \
+       __dtrace_args[2] = (uintptr_t)(arg2);                                                             \
        DTRACE_CALL3ARGS(provider, name)                                                        \
 }
 
 #define DTRACE_PROBE4(provider, name, arg0, arg1, arg2, arg3) {                                 \
        uintptr_t __dtrace_args[ARGS4_EXTENT] __attribute__ ((aligned (16)));                   \
-       __dtrace_args[0] = (uintptr_t)arg0;                                                             \
-       __dtrace_args[1] = (uintptr_t)arg1;                                                             \
-       __dtrace_args[2] = (uintptr_t)arg2;                                                             \
-       __dtrace_args[3] = (uintptr_t)arg3;                                                             \
+       __dtrace_args[0] = (uintptr_t)(arg0);                                                             \
+       __dtrace_args[1] = (uintptr_t)(arg1);                                                             \
+       __dtrace_args[2] = (uintptr_t)(arg2);                                                             \
+       __dtrace_args[3] = (uintptr_t)(arg3);                                                             \
        DTRACE_CALL4ARGS(provider, name)                                                        \
 }
 
 #define DTRACE_PROBE5(provider, name, arg0, arg1, arg2, arg3, arg4) {                           \
        uintptr_t __dtrace_args[ARGS5_EXTENT] __attribute__ ((aligned (16)));                   \
-       __dtrace_args[0] = (uintptr_t)arg0;                                                             \
-       __dtrace_args[1] = (uintptr_t)arg1;                                                             \
-       __dtrace_args[2] = (uintptr_t)arg2;                                                             \
-       __dtrace_args[3] = (uintptr_t)arg3;                                                             \
-       __dtrace_args[4] = (uintptr_t)arg4;                                                             \
+       __dtrace_args[0] = (uintptr_t)(arg0);                                                             \
+       __dtrace_args[1] = (uintptr_t)(arg1);                                                             \
+       __dtrace_args[2] = (uintptr_t)(arg2);                                                             \
+       __dtrace_args[3] = (uintptr_t)(arg3);                                                             \
+       __dtrace_args[4] = (uintptr_t)(arg4);                                                             \
        DTRACE_CALL5ARGS(provider, name)                                                        \
 }
 
 #define DTRACE_PROBE6(provider, name, arg0, arg1, arg2, arg3, arg4, arg5) {                     \
        uintptr_t __dtrace_args[ARGS6_EXTENT] __attribute__ ((aligned (16)));                   \
-       __dtrace_args[0] = (uintptr_t)arg0;                                                             \
-       __dtrace_args[1] = (uintptr_t)arg1;                                                             \
-       __dtrace_args[2] = (uintptr_t)arg2;                                                             \
-       __dtrace_args[3] = (uintptr_t)arg3;                                                             \
-       __dtrace_args[4] = (uintptr_t)arg4;                                                             \
-       __dtrace_args[5] = (uintptr_t)arg5;                                                             \
+       __dtrace_args[0] = (uintptr_t)(arg0);                                                             \
+       __dtrace_args[1] = (uintptr_t)(arg1);                                                             \
+       __dtrace_args[2] = (uintptr_t)(arg2);                                                             \
+       __dtrace_args[3] = (uintptr_t)(arg3);                                                             \
+       __dtrace_args[4] = (uintptr_t)(arg4);                                                             \
+       __dtrace_args[5] = (uintptr_t)(arg5);                                                             \
        DTRACE_CALL6ARGS(provider, name)                                                        \
 }
 
 #define DTRACE_PROBE7(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6) {               \
        uintptr_t __dtrace_args[ARGS7_EXTENT] __attribute__ ((aligned (16)));                   \
-       __dtrace_args[0] = (uintptr_t)arg0;                                                             \
-       __dtrace_args[1] = (uintptr_t)arg1;                                                             \
-       __dtrace_args[2] = (uintptr_t)arg2;                                                             \
-       __dtrace_args[3] = (uintptr_t)arg3;                                                             \
-       __dtrace_args[4] = (uintptr_t)arg4;                                                             \
-       __dtrace_args[5] = (uintptr_t)arg5;                                                             \
-       __dtrace_args[6] = (uintptr_t)arg6;                                                             \
+       __dtrace_args[0] = (uintptr_t)(arg0);                                                             \
+       __dtrace_args[1] = (uintptr_t)(arg1);                                                             \
+       __dtrace_args[2] = (uintptr_t)(arg2);                                                             \
+       __dtrace_args[3] = (uintptr_t)(arg3);                                                             \
+       __dtrace_args[4] = (uintptr_t)(arg4);                                                             \
+       __dtrace_args[5] = (uintptr_t)(arg5);                                                             \
+       __dtrace_args[6] = (uintptr_t)(arg6);                                                             \
        DTRACE_CALL7ARGS(provider, name)                                                        \
 }
 
 #define DTRACE_PROBE8(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7) {         \
        uintptr_t __dtrace_args[ARGS8_EXTENT] __attribute__ ((aligned (16)));                   \
-       __dtrace_args[0] = (uintptr_t)arg0;                                                             \
-       __dtrace_args[1] = (uintptr_t)arg1;                                                             \
-       __dtrace_args[2] = (uintptr_t)arg2;                                                             \
-       __dtrace_args[3] = (uintptr_t)arg3;                                                             \
-       __dtrace_args[4] = (uintptr_t)arg4;                                                             \
-       __dtrace_args[5] = (uintptr_t)arg5;                                                             \
-       __dtrace_args[6] = (uintptr_t)arg6;                                                             \
-       __dtrace_args[7] = (uintptr_t)arg7;                                                             \
+       __dtrace_args[0] = (uintptr_t)(arg0);                                                             \
+       __dtrace_args[1] = (uintptr_t)(arg1);                                                             \
+       __dtrace_args[2] = (uintptr_t)(arg2);                                                             \
+       __dtrace_args[3] = (uintptr_t)(arg3);                                                             \
+       __dtrace_args[4] = (uintptr_t)(arg4);                                                             \
+       __dtrace_args[5] = (uintptr_t)(arg5);                                                             \
+       __dtrace_args[6] = (uintptr_t)(arg6);                                                             \
+       __dtrace_args[7] = (uintptr_t)(arg7);                                                             \
        DTRACE_CALL8ARGS(provider, name)                                                        \
 }
 
 #define DTRACE_PROBE9(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8) {   \
        uintptr_t __dtrace_args[ARGS9_EXTENT] __attribute__ ((aligned (16)));                   \
-       __dtrace_args[0] = (uintptr_t)arg0;                                                             \
-       __dtrace_args[1] = (uintptr_t)arg1;                                                             \
-       __dtrace_args[2] = (uintptr_t)arg2;                                                             \
-       __dtrace_args[3] = (uintptr_t)arg3;                                                             \
-       __dtrace_args[4] = (uintptr_t)arg4;                                                             \
-       __dtrace_args[5] = (uintptr_t)arg5;                                                             \
-       __dtrace_args[6] = (uintptr_t)arg6;                                                             \
-       __dtrace_args[7] = (uintptr_t)arg7;                                                             \
-       __dtrace_args[8] = (uintptr_t)arg8;                                                             \
+       __dtrace_args[0] = (uintptr_t)(arg0);                                                             \
+       __dtrace_args[1] = (uintptr_t)(arg1);                                                             \
+       __dtrace_args[2] = (uintptr_t)(arg2);                                                             \
+       __dtrace_args[3] = (uintptr_t)(arg3);                                                             \
+       __dtrace_args[4] = (uintptr_t)(arg4);                                                             \
+       __dtrace_args[5] = (uintptr_t)(arg5);                                                             \
+       __dtrace_args[6] = (uintptr_t)(arg6);                                                             \
+       __dtrace_args[7] = (uintptr_t)(arg7);                                                             \
+       __dtrace_args[8] = (uintptr_t)(arg8);                                                             \
        DTRACE_CALL9ARGS(provider, name)                                                        \
 }
 
 #define DTRACE_PROBE10(provider, name, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9) {    \
        uintptr_t __dtrace_args[ARGS10_EXTENT] __attribute__ ((aligned (16)));                          \
-       __dtrace_args[0] = (uintptr_t)arg0;                                                                     \
-       __dtrace_args[1] = (uintptr_t)arg1;                                                                     \
-       __dtrace_args[2] = (uintptr_t)arg2;                                                                     \
-       __dtrace_args[3] = (uintptr_t)arg3;                                                                     \
-       __dtrace_args[4] = (uintptr_t)arg4;                                                                     \
-       __dtrace_args[5] = (uintptr_t)arg5;                                                                     \
-       __dtrace_args[6] = (uintptr_t)arg6;                                                                     \
-       __dtrace_args[7] = (uintptr_t)arg7;                                                                     \
-       __dtrace_args[8] = (uintptr_t)arg8;                                                                     \
-       __dtrace_args[9] = (uintptr_t)arg9;                                                                     \
+       __dtrace_args[0] = (uintptr_t)(arg0);                                                                     \
+       __dtrace_args[1] = (uintptr_t)(arg1);                                                                     \
+       __dtrace_args[2] = (uintptr_t)(arg2);                                                                     \
+       __dtrace_args[3] = (uintptr_t)(arg3);                                                                     \
+       __dtrace_args[4] = (uintptr_t)(arg4);                                                                     \
+       __dtrace_args[5] = (uintptr_t)(arg5);                                                                     \
+       __dtrace_args[6] = (uintptr_t)(arg6);                                                                     \
+       __dtrace_args[7] = (uintptr_t)(arg7);                                                                     \
+       __dtrace_args[8] = (uintptr_t)(arg8);                                                                     \
+       __dtrace_args[9] = (uintptr_t)(arg9);                                                                     \
        DTRACE_CALL10ARGS(provider, name)                                                               \
 }
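The reason each argument gains parentheses: a cast binds tighter than ?: and most binary operators, so the old expansion could apply the cast to only part of an argument expression. For example:

    /* Old expansion of DTRACE_PROBE1(prov, name, flag ? a : b):
     *     __dtrace_args[0] = (uintptr_t)flag ? a : b;
     * parses as ((uintptr_t)flag) ? a : b -- the cast grabs `flag` alone.
     * With the argument parenthesized it becomes
     *     __dtrace_args[0] = (uintptr_t)(flag ? a : b);
     * which converts the whole conditional, as intended. */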
 
 #define DTRACE_MEMORYSTATUS3(name, type1, arg1, type2, arg2, type3, arg3)               \
        DTRACE_PROBE3(__sdt_, name, arg1, arg2, arg3);
 
+#define DTRACE_MEMORYSTATUS4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4)  \
+       DTRACE_PROBE4(__sdt_, name, arg1, arg2, arg3, arg4);
+
 #define DTRACE_MEMORYSTATUS6(name, type1, arg1, type2, arg2,                    \
            type3, arg3, type4, arg4, type5, arg5, type6, arg6) \
        DTRACE_PROBE6(__vminfo_, name, arg1, arg2, arg3, arg4, arg5, arg6)
            type3, arg3, type4, arg4, type5, arg5, type6, arg6) \
        DTRACE_PROBE6(__vminfo_, name, arg1, arg2, arg3, arg4, arg5, arg6)
 
+#define DTRACE_VM7(name, type1, arg1, type2, arg2,                      \
+           type3, arg3, type4, arg4, type5, arg5, type6, arg6, type7, arg7) \
+       DTRACE_PROBE7(__vminfo_, name, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
+
 #define DTRACE_IP(name)                                                 \
        DTRACE_PROBE(__ip_, name)
 
            type4, arg4, type5, arg5, type6, arg6, type7, arg7)                 \
        DTRACE_PROBE7(__ip_, name, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
 
+#define DTRACE_ROUTE(name)                                              \
+       DTRACE_PROBE(__route_, name)
+
+#define DTRACE_ROUTE1(name, type1, arg1)                                \
+       DTRACE_PROBE1(__route_, name, arg1)
+
+#define DTRACE_ROUTE2(name, type1, arg1, type2, arg2)                   \
+       DTRACE_PROBE2(__route_, name, arg1, arg2)
+
+#define DTRACE_ROUTE3(name, type1, arg1, type2, arg2, type3, arg3)      \
+       DTRACE_PROBE3(__route_, name, arg1, arg2, arg3)
+
+#define DTRACE_ROUTE4(name, type1, arg1, type2, arg2,                   \
+           type3, arg3, type4, arg4)                                           \
+       DTRACE_PROBE4(__route_, name, arg1, arg2, arg3, arg4)
+
+#define DTRACE_ROUTE5(name, typ1, arg1, type2, arg2, type3, arg3,       \
+           type4, arg4, type5, arg5)                                           \
+       DTRACE_PROBE5(__route_, name, arg1, arg2, arg3, arg4, arg5)
+
+#define DTRACE_ROUTE6(name, type1, arg1, type2, arg2, type3, arg3,      \
+           type4, arg4, type5, arg5, type6, arg6)                               \
+       DTRACE_PROBE6(__route_, name, arg1, arg2, arg3, arg4, arg5, arg6)
+
+#define DTRACE_ROUTE7(name, type1, arg1, type2, arg2, type3, arg3,      \
+           type4, arg4, type5, arg5, type6, arg6, type7, arg7)                 \
+       DTRACE_PROBE7(__route_, name, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
+
 #define DTRACE_TCP(name)                                                 \
        DTRACE_PROBE(__tcp_, name)
 
            type3, arg3, type4, arg4, type5, arg5, type6, arg6)                 \
        DTRACE_PROBE6(__boost_, name, arg1, arg2, arg3, arg4, arg5, arg6);
 
+#if KASAN
+#define DTRACE_KASAN(name)                                              \
+       DTRACE_PROBE(__kasan_, name);
+
+#define DTRACE_KASAN1(name, type1, arg1)                                \
+       DTRACE_PROBE1(__kasan_, name, arg1);
+
+#define DTRACE_KASAN2(name, type1, arg1, type2, arg2)                   \
+       DTRACE_PROBE2(__kasan_, name, arg1, arg2);
+
+#define DTRACE_KASAN3(name, type1, arg1, type2, arg2, type3, arg3)      \
+       DTRACE_PROBE3(__kasan_, name, arg1, arg2, arg3);
+
+#define DTRACE_KASAN4(name, type1, arg1, type2, arg2,                   \
+           type3, arg3, type4, arg4)                                       \
+       DTRACE_PROBE4(__kasan_, name, arg1, arg2, arg3, arg4);
+
+#define DTRACE_KASAN5(name, type1, arg1, type2, arg2,                   \
+           type3, arg3, type4, arg4, type5, arg5)                          \
+       DTRACE_PROBE5(__kasan_, name, arg1, arg2, arg3, arg4, arg5);
+#endif /* KASAN */
+
 #if PRIVATE
 #endif /* PRIVATE */
 
index 07e8fa454d6869cf89cac61ac5922473b6b4668c..bcc83f26c2a3f5bf6dd9d478928ffd0d0cf11e5a 100644 (file)
@@ -32,10 +32,6 @@ subsystem
 #endif /* KERNEL_SERVER */
        memory_entry 4900;
 
-#if !KERNEL && !LIBSYSCALL_INTERFACE
-    UserPrefix _kernelrpc_;
-#endif
-
 #include <mach/std_types.defs>
 #include <mach/mach_types.defs>
 #include <mach_debug/mach_debug_types.defs>
@@ -50,3 +46,9 @@ routine mach_memory_entry_access_tracking(
        inout   access_tracking         : int;
        out     access_tracking_reads   : uint32_t;
        out     access_tracking_writes  : uint32_t);
+
+routine mach_memory_entry_ownership(
+               mem_entry       : mem_entry_name_port_t;
+               owner           : task_t;
+               ledger_tag      : int;
+               ledger_flags    : int);
index fec1df84a187923585e4749ab928d7dde06af57e..6c5cdd9411507d5e0cf1377d6627f494a15fb9f6 100644 (file)
@@ -369,14 +369,17 @@ typedef struct memory_object_attr_info  memory_object_attr_info_data_t;
 
 /* named entry processor mapping options */
 /* enumerated */
-#define MAP_MEM_NOOP            0
-#define MAP_MEM_COPYBACK        1
-#define MAP_MEM_IO              2
-#define MAP_MEM_WTHRU           3
-#define MAP_MEM_WCOMB           4       /* Write combining mode */
-                                        /* aka store gather     */
-#define MAP_MEM_INNERWBACK      5
-#define MAP_MEM_POSTED          6
+#define MAP_MEM_NOOP                      0
+#define MAP_MEM_COPYBACK                  1
+#define MAP_MEM_IO                        2
+#define MAP_MEM_WTHRU                     3
+#define MAP_MEM_WCOMB                     4       /* Write combining mode */
+                                                  /* aka store gather     */
+#define MAP_MEM_INNERWBACK                5
+#define MAP_MEM_POSTED                    6
+#define MAP_MEM_RT                        7
+#define MAP_MEM_POSTED_REORDERED          8
+#define MAP_MEM_POSTED_COMBINED_REORDERED 9
 
 #define GET_MAP_MEM(flags)      \
        ((((unsigned int)(flags)) >> 24) & 0xFF)
@@ -386,7 +389,7 @@ typedef struct memory_object_attr_info  memory_object_attr_info_data_t;
                        & 0xFF000000) | ((flags) & 0xFFFFFF));
 
 /* leave room for vm_prot bits (0xFF ?) */
-#define MAP_MEM_LEDGER_TAG_NETWORK 0x002000 /* charge to "network" ledger */
+#define MAP_MEM_LEDGER_TAGGED        0x002000 /* object owned by a specific task and ledger */
 #define MAP_MEM_PURGABLE_KERNEL_ONLY 0x004000 /* volatility controlled by kernel */
 #define MAP_MEM_GRAB_SECLUDED   0x008000 /* can grab secluded pages */
 #define MAP_MEM_ONLY            0x010000 /* change processor caching  */
@@ -409,9 +412,9 @@ typedef struct memory_object_attr_info  memory_object_attr_info_data_t;
        MAP_MEM_USE_DATA_ADDR |                            \
        MAP_MEM_VM_COPY |                                  \
        MAP_MEM_VM_SHARE |                                 \
+       MAP_MEM_LEDGER_TAGGED |                            \
        MAP_MEM_4K_DATA_ADDR)
 #define MAP_MEM_FLAGS_ALL (                     \
-       MAP_MEM_LEDGER_TAG_NETWORK |            \
        MAP_MEM_FLAGS_USER)
 
 #ifdef KERNEL
index ceb069a6a5e2662cb84fd7916221f4a4a99282de..a1a3a032599a2a049184672e9b3f0481526d6bf5 100644 (file)
@@ -251,6 +251,12 @@ typedef unsigned int mach_msg_copy_options_t;
 #define MACH_MSG_KALLOC_COPY_T          4
 #endif  /* MACH_KERNEL */
 
+#define MACH_MSG_GUARD_FLAGS_NONE                   0x0000
+#define MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE      0x0001    /* Move the receive right and mark it as immovable */
+#define MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND      0x0002    /* Verify that the port is unguarded */
+#define MACH_MSG_GUARD_FLAGS_MASK                   0x0003    /* Valid flag bits */
+typedef unsigned int mach_msg_guard_flags_t;
+
 /*
  * In a complex mach message, the mach_msg_header_t is followed by
  * a descriptor count, then an array of that number of descriptors
@@ -269,8 +275,9 @@ typedef unsigned int mach_msg_descriptor_type_t;
 #define MACH_MSG_OOL_DESCRIPTOR                 1
 #define MACH_MSG_OOL_PORTS_DESCRIPTOR           2
 #define MACH_MSG_OOL_VOLATILE_DESCRIPTOR        3
+#define MACH_MSG_GUARDED_PORT_DESCRIPTOR        4
 
-#pragma pack(4)
+#pragma pack(push, 4)
 
 typedef struct{
        natural_t                     pad1;
@@ -363,6 +370,48 @@ typedef struct{
 #endif
 } mach_msg_ool_ports_descriptor_t;
 
+typedef struct{
+       uint32_t                      context;
+       mach_port_name_t              name;
+       mach_msg_guard_flags_t        flags : 16;
+       mach_msg_type_name_t          disposition : 8;
+       mach_msg_descriptor_type_t    type : 8;
+} mach_msg_guarded_port_descriptor32_t;
+
+typedef struct{
+       uint64_t                      context;
+       mach_msg_guard_flags_t        flags : 16;
+       mach_msg_type_name_t          disposition : 8;
+       mach_msg_descriptor_type_t    type : 8;
+       mach_port_name_t              name;
+} mach_msg_guarded_port_descriptor64_t;
+
+typedef struct{
+#if defined(KERNEL)
+       mach_port_t                   name;
+#if !defined(__LP64__)
+       uint32_t                      pad1;
+#endif
+       mach_msg_guard_flags_t        flags : 16;
+       mach_msg_type_name_t          disposition : 8;
+       mach_msg_descriptor_type_t    type : 8;
+#if defined(__LP64__)
+       uint32_t                      pad_end;
+#endif /* defined(__LP64__) */
+#else
+       mach_port_context_t           context;
+#if !defined(__LP64__)
+       mach_port_name_t              name;
+#endif
+       mach_msg_guard_flags_t        flags : 16;
+       mach_msg_type_name_t          disposition : 8;
+       mach_msg_descriptor_type_t    type : 8;
+#if defined(__LP64__)
+       mach_port_name_t              name;
+#endif /* defined(__LP64__) */
+#endif /* defined(KERNEL) */
+} mach_msg_guarded_port_descriptor_t;
+
 /*
  * LP64support - This union definition is not really
  * appropriate in LP64 mode because not all descriptors
@@ -374,6 +423,7 @@ typedef union{
        mach_msg_ool_descriptor32_t           out_of_line;
        mach_msg_ool_ports_descriptor32_t     ool_ports;
        mach_msg_type_descriptor_t            type;
+       mach_msg_guarded_port_descriptor32_t  guarded_port;
 } mach_msg_descriptor_t;
 #else
 typedef union{
@@ -381,6 +431,7 @@ typedef union{
        mach_msg_ool_descriptor_t             out_of_line;
        mach_msg_ool_ports_descriptor_t       ool_ports;
        mach_msg_type_descriptor_t            type;
+       mach_msg_guarded_port_descriptor_t    guarded_port;
 } mach_msg_descriptor_t;
 #endif
 
@@ -576,10 +627,10 @@ typedef mach_msg_security_trailer_t mach_msg_format_0_trailer_t;
 #define MACH_MSG_TRAILER_FORMAT_0_SIZE sizeof(mach_msg_format_0_trailer_t)
 
 #define   KERNEL_SECURITY_TOKEN_VALUE  { {0, 1} }
-extern security_token_t KERNEL_SECURITY_TOKEN;
+extern const security_token_t KERNEL_SECURITY_TOKEN;
 
 #define   KERNEL_AUDIT_TOKEN_VALUE  { {0, 0, 0, 0, 0, 0, 0, 0} }
-extern audit_token_t KERNEL_AUDIT_TOKEN;
+extern const audit_token_t KERNEL_AUDIT_TOKEN;
 
 typedef integer_t mach_msg_options_t;
 
@@ -597,7 +648,7 @@ typedef union{
        mach_msg_empty_rcv_t  rcv;
 } mach_msg_empty_t;
 
-#pragma pack()
+#pragma pack(pop)
 
 /* utility to round the message size - will become machine dependent */
 #define round_msg(x)    (((mach_msg_size_t)(x) + sizeof (natural_t) - 1) & \
@@ -683,7 +734,7 @@ typedef integer_t mach_msg_option_t;
 #define MACH_RCV_LARGE_IDENTITY 0x00000008      /* identify source of large messages */
 
 #define MACH_SEND_TIMEOUT       0x00000010      /* timeout value applies to send */
-#define MACH_SEND_OVERRIDE  0x00000020  /* priority override for send */
+#define MACH_SEND_OVERRIDE      0x00000020      /* priority override for send */
 #define MACH_SEND_INTERRUPT     0x00000040      /* don't restart interrupted sends */
 #define MACH_SEND_NOTIFY        0x00000080      /* arm send-possible notify */
 #define MACH_SEND_ALWAYS        0x00010000      /* ignore qlimits - kernel only */
@@ -692,16 +743,23 @@ typedef integer_t mach_msg_option_t;
 #define MACH_SEND_NODENAP       MACH_SEND_NOIMPORTANCE
 #define MACH_SEND_IMPORTANCE    0x00080000      /* msg carries importance - kernel only */
 #define MACH_SEND_SYNC_OVERRIDE 0x00100000      /* msg should do sync ipc override */
-#define MACH_SEND_PROPAGATE_QOS  0x00200000     /* IPC should propagate the caller's QoS */
+#define MACH_SEND_PROPAGATE_QOS 0x00200000      /* IPC should propagate the caller's QoS */
 #define MACH_SEND_SYNC_USE_THRPRI       MACH_SEND_PROPAGATE_QOS /* obsolete name */
-#define MACH_SEND_KERNEL    0x00400000  /* full send from kernel space - kernel only */
+#define MACH_SEND_KERNEL        0x00400000      /* full send from kernel space - kernel only */
+#define MACH_SEND_SYNC_BOOTSTRAP_CHECKIN  0x00800000      /* special reply port should boost thread doing sync bootstrap checkin */
 
 #define MACH_RCV_TIMEOUT        0x00000100      /* timeout value applies to receive */
-#define MACH_RCV_NOTIFY         0x00000200      /* reserved - legacy */
+#define MACH_RCV_NOTIFY         0x00000000      /* legacy name (value was: 0x00000200) */
 #define MACH_RCV_INTERRUPT      0x00000400      /* don't restart interrupted receive */
 #define MACH_RCV_VOUCHER        0x00000800      /* willing to receive voucher port */
-#define MACH_RCV_OVERWRITE      0x00001000      /* scatter receive (deprecated) */
+#define MACH_RCV_OVERWRITE      0x00000000      /* scatter receive (deprecated) */
+#define MACH_RCV_GUARDED_DESC   0x00001000      /* Can receive new guarded descriptor */
 #define MACH_RCV_SYNC_WAIT      0x00004000      /* sync waiter waiting for rcv */
+#define MACH_RCV_SYNC_PEEK      0x00008000      /* sync waiter waiting to peek */
+
+#define MACH_MSG_STRICT_REPLY   0x00000200      /* Enforce specific properties about the reply port, and
+                                                * the context in which a thread replies to a message.
+                                                * This flag must be passed on both the SEND and RCV sides. */
 
 #ifdef XNU_KERNEL_PRIVATE
 
@@ -745,12 +803,15 @@ typedef integer_t mach_msg_option_t;
 #define MACH_SEND_USER (MACH_SEND_MSG | MACH_SEND_TIMEOUT | \
                                                MACH_SEND_NOTIFY | MACH_SEND_OVERRIDE | \
                                                MACH_SEND_TRAILER | MACH_SEND_NOIMPORTANCE | \
-                                               MACH_SEND_SYNC_OVERRIDE | MACH_SEND_PROPAGATE_QOS)
+                                               MACH_SEND_SYNC_OVERRIDE | MACH_SEND_PROPAGATE_QOS | \
+                                               MACH_SEND_SYNC_BOOTSTRAP_CHECKIN | \
+                                               MACH_MSG_STRICT_REPLY | MACH_RCV_GUARDED_DESC)
 
 #define MACH_RCV_USER (MACH_RCV_MSG | MACH_RCV_TIMEOUT | \
                                           MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | \
                                           MACH_RCV_VOUCHER | MACH_RCV_TRAILER_MASK | \
-                                          MACH_RCV_SYNC_WAIT)
+                                          MACH_RCV_SYNC_WAIT | MACH_RCV_SYNC_PEEK  | \
+                                          MACH_RCV_GUARDED_DESC | MACH_MSG_STRICT_REPLY)
 
 #define MACH_MSG_OPTION_USER     (MACH_SEND_USER | MACH_RCV_USER)
 
@@ -768,6 +829,21 @@ typedef integer_t mach_msg_option_t;
 #define MACH_SEND_KERNEL_DEFAULT (MACH_SEND_MSG | \
                                  MACH_SEND_ALWAYS | MACH_SEND_NOIMPORTANCE)
 
+#define MACH_SEND_WITH_STRICT_REPLY(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | MACH_SEND_MSG)) == \
+                                           (MACH_MSG_STRICT_REPLY | MACH_SEND_MSG))
+
+#define MACH_SEND_REPLY_IS_IMMOVABLE(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | \
+                                                        MACH_SEND_MSG | MACH_RCV_MSG | \
+                                                        MACH_RCV_GUARDED_DESC)) == \
+                                            (MACH_MSG_STRICT_REPLY | MACH_SEND_MSG | MACH_RCV_GUARDED_DESC))
+
+#define MACH_RCV_WITH_STRICT_REPLY(_opts)  (((_opts) & (MACH_MSG_STRICT_REPLY | MACH_RCV_MSG)) == \
+                                           (MACH_MSG_STRICT_REPLY | MACH_RCV_MSG))
+
+#define MACH_RCV_WITH_IMMOVABLE_REPLY(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | \
+                                                         MACH_RCV_MSG | MACH_RCV_GUARDED_DESC)) == \
+                                             (MACH_MSG_STRICT_REPLY | MACH_RCV_MSG | MACH_RCV_GUARDED_DESC))
+
 #endif /* MACH_KERNEL_PRIVATE */
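
As a sketch of how these predicates compose, consider a hypothetical option word for a combined send/receive (values illustrative only):

    /* Sketch only: a hypothetical option word for a combined send/receive. */
    mach_msg_option_t opts = MACH_SEND_MSG | MACH_RCV_MSG |
        MACH_MSG_STRICT_REPLY | MACH_RCV_GUARDED_DESC;

    assert(MACH_SEND_WITH_STRICT_REPLY(opts));   /* strict reply enforced on send */
    assert(MACH_RCV_WITH_IMMOVABLE_REPLY(opts)); /* strict + guarded desc on receive */
    assert(!MACH_SEND_REPLY_IS_IMMOVABLE(opts)); /* MACH_RCV_MSG is set, so this is false */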
 
 /*
@@ -881,8 +957,12 @@ typedef kern_return_t mach_msg_return_t;
 /* A field in the header had a bad value. */
 #define MACH_SEND_INVALID_TRAILER       0x10000011
 /* The trailer to be sent does not match kernel format. */
+#define MACH_SEND_INVALID_CONTEXT       0x10000012
+/* The sending thread context did not match the context on the dest port */
 #define MACH_SEND_INVALID_RT_OOL_SIZE   0x10000015
 /* compatibility: no longer a returned error */
+#define MACH_SEND_NO_GRANT_DEST         0x10000016
+/* The destination port doesn't accept ports in body */
 
 #define MACH_RCV_IN_PROGRESS            0x10004001
 /* Thread is waiting for receive.  (Internal use only.) */
@@ -916,6 +996,8 @@ typedef kern_return_t mach_msg_return_t;
 /* trailer type or number of trailer elements not supported */
 #define MACH_RCV_IN_PROGRESS_TIMED      0x10004011
 /* Waiting for receive with timeout. (Internal use only.) */
+#define MACH_RCV_INVALID_REPLY          0x10004012
+/* invalid reply port used in a STRICT_REPLY message */
 
 #ifdef XNU_KERNEL_PRIVATE
 #define MACH_PEEK_IN_PROGRESS           0x10008001
index ee94955bc0b7bb089d066f6644c0faf7b75d1ba9..74d2d01096fc18ed164e4f8b0b9577d9cbc2300c 100644 (file)
@@ -140,6 +140,17 @@ typedef struct mig_symtab {
                                                                 */
 } mig_symtab_t;
 
+/*
+ * A compiler attribute for annotating all MIG server routines and other
+ * functions that should behave similarly.  Allows the compiler to perform
+ * additional static bug-finding over them.
+ */
+#if __has_attribute(mig_server_routine)
+#define MIG_SERVER_ROUTINE __attribute__((mig_server_routine))
+#else
+#define MIG_SERVER_ROUTINE
+#endif
+
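As a sketch, a hand-written demux routine would adopt the annotation the same way MIG-generated stubs do (`my_server_routine` is a hypothetical name):

    /* Sketch: annotate a hand-written server routine like a MIG stub. */
    MIG_SERVER_ROUTINE kern_return_t
    my_server_routine(mach_msg_header_t *request, mach_msg_header_t *reply);
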
 #ifdef  PRIVATE
 
 /* MIG object runtime - not ready for public consumption */
index db15ea83fa980b1c3851803a33ba77e5169ede15..2eb09af2eeb83d83057800d14754e01037af6710 100644 (file)
@@ -140,7 +140,7 @@ struct ipc_port;
 
 typedef struct ipc_port         *ipc_port_t;
 
-#define IPC_PORT_NULL           ((ipc_port_t) 0UL)
+#define IPC_PORT_NULL           ((ipc_port_t) NULL)
 #define IPC_PORT_DEAD           ((ipc_port_t)~0UL)
 #define IPC_PORT_VALID(port) \
        ((port) != IPC_PORT_NULL && (port) != IPC_PORT_DEAD)
@@ -189,7 +189,11 @@ typedef mach_port_t                     *mach_port_array_t;
  *  that a port right was present, but it died.
  */
 
+#if defined(XNU_KERNEL_PRIVATE) && defined(__cplusplus)
+#define MACH_PORT_NULL          NULL
+#else
 #define MACH_PORT_NULL          0  /* intentional loose typing */
+#endif
 #define MACH_PORT_DEAD          ((mach_port_name_t) ~0)
 #define MACH_PORT_VALID(name)                           \
                (((name) != MACH_PORT_NULL) &&          \
@@ -243,8 +247,13 @@ typedef natural_t mach_port_right_t;
 #define MACH_PORT_RIGHT_SEND_ONCE       ((mach_port_right_t) 2)
 #define MACH_PORT_RIGHT_PORT_SET        ((mach_port_right_t) 3)
 #define MACH_PORT_RIGHT_DEAD_NAME       ((mach_port_right_t) 4)
-#define MACH_PORT_RIGHT_LABELH          ((mach_port_right_t) 5)
-#define MACH_PORT_RIGHT_NUMBER          ((mach_port_right_t) 6)
+#define MACH_PORT_RIGHT_LABELH          ((mach_port_right_t) 5) /* obsolete right */
+#define MACH_PORT_RIGHT_NUMBER          ((mach_port_right_t) 6) /* right not implemented */
+
+#ifdef MACH_KERNEL_PRIVATE
+#define MACH_PORT_RIGHT_VALID_TRANSLATE(right) \
+       ((right) >= MACH_PORT_RIGHT_SEND && (right) <= MACH_PORT_RIGHT_DEAD_NAME)
+#endif
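
A sketch of what the predicate accepts and rejects:

    /* Sketch: only the classic rights (send through dead-name) translate. */
    assert(MACH_PORT_RIGHT_VALID_TRANSLATE(MACH_PORT_RIGHT_RECEIVE));
    assert(!MACH_PORT_RIGHT_VALID_TRANSLATE(MACH_PORT_RIGHT_LABELH)); /* obsolete */
    assert(!MACH_PORT_RIGHT_VALID_TRANSLATE(MACH_PORT_RIGHT_NUMBER)); /* not implemented */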
 
 typedef natural_t mach_port_type_t;
 typedef mach_port_type_t *mach_port_type_array_t;
@@ -258,7 +267,13 @@ typedef mach_port_type_t *mach_port_type_array_t;
 #define MACH_PORT_TYPE_SEND_ONCE    MACH_PORT_TYPE(MACH_PORT_RIGHT_SEND_ONCE)
 #define MACH_PORT_TYPE_PORT_SET     MACH_PORT_TYPE(MACH_PORT_RIGHT_PORT_SET)
 #define MACH_PORT_TYPE_DEAD_NAME    MACH_PORT_TYPE(MACH_PORT_RIGHT_DEAD_NAME)
-#define MACH_PORT_TYPE_LABELH       MACH_PORT_TYPE(MACH_PORT_RIGHT_LABELH)
+#define MACH_PORT_TYPE_LABELH       MACH_PORT_TYPE(MACH_PORT_RIGHT_LABELH) /* obsolete */
+
+
+#ifdef MACH_KERNEL_PRIVATE
+/* Holder used to have a receive right - remembered to filter exceptions */
+#define MACH_PORT_TYPE_EX_RECEIVE   MACH_PORT_TYPE_LABELH
+#endif
 
 /* Convenient combinations. */
 
@@ -332,6 +347,8 @@ typedef struct mach_port_limits {
 #define MACH_PORT_STATUS_FLAG_IMP_DONATION      0x08
 #define MACH_PORT_STATUS_FLAG_REVIVE            0x10
 #define MACH_PORT_STATUS_FLAG_TASKPTR           0x20
+#define MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE 0x40
+#define MACH_PORT_STATUS_FLAG_NO_GRANT          0x80
 
 typedef struct mach_port_info_ext {
        mach_port_status_t      mpie_status;
@@ -384,6 +401,7 @@ typedef struct mach_port_qos {
 #define MPO_INSERT_SEND_RIGHT   0x10    /* Insert a send right for the port */
 #define MPO_STRICT              0x20    /* Apply strict guarding for port */
 #define MPO_DENAP_RECEIVER      0x40    /* Mark the port as App de-nap receiver */
+#define MPO_IMMOVABLE_RECEIVE   0x80    /* Mark the port as immovable; protected by the guard context */
 /*
  * Structure to define optional attributes for a newly
  * constructed port.
@@ -410,7 +428,9 @@ enum mach_port_guard_exception_codes {
        kGUARD_EXC_SET_CONTEXT               = 1u << 2,
        kGUARD_EXC_UNGUARDED                 = 1u << 3,
        kGUARD_EXC_INCORRECT_GUARD           = 1u << 4,
-       /* start of non-fatal guards */
+       kGUARD_EXC_IMMOVABLE                 = 1u << 5,
+       kGUARD_EXC_STRICT_REPLY              = 1u << 6,
+       /* start of [optionally] non-fatal guards */
        kGUARD_EXC_INVALID_RIGHT         = 1u << 8,
        kGUARD_EXC_INVALID_NAME          = 1u << 9,
        kGUARD_EXC_INVALID_VALUE         = 1u << 10,
@@ -420,12 +440,31 @@ enum mach_port_guard_exception_codes {
        kGUARD_EXC_KERN_FAILURE          = 1u << 14,
        kGUARD_EXC_KERN_RESOURCE         = 1u << 15,
        kGUARD_EXC_SEND_INVALID_REPLY    = 1u << 16,
-       kGUARD_EXC_SEND_INVALID_VOUCHER  = 1u << 16,
-       kGUARD_EXC_SEND_INVALID_RIGHT    = 1u << 17,
-       kGUARD_EXC_RCV_INVALID_NAME      = 1u << 18,
-       kGUARD_EXC_RCV_INVALID_NOTIFY    = 1u << 19
+       kGUARD_EXC_SEND_INVALID_VOUCHER  = 1u << 17,
+       kGUARD_EXC_SEND_INVALID_RIGHT    = 1u << 18,
+       kGUARD_EXC_RCV_INVALID_NAME      = 1u << 19,
+       kGUARD_EXC_RCV_GUARDED_DESC      = 1u << 20, /* should never be fatal; for development only */
 };
 
+#define MAX_FATAL_kGUARD_EXC_CODE (1u << 6)
+
+/*
+ * These flags are used as bits in the subcode of kGUARD_EXC_STRICT_REPLY exceptions.
+ */
+#define MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_DISP  (0x01ull << 56)
+#define MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_PORT  (0x02ull << 56)
+#define MPG_FLAGS_STRICT_REPLY_INVALID_VOUCHER     (0x04ull << 56)
+#define MPG_FLAGS_STRICT_REPLY_NO_BANK_ATTR        (0x08ull << 56)
+#define MPG_FLAGS_STRICT_REPLY_MISMATCHED_PERSONA  (0x10ull << 56)
+#define MPG_FLAGS_STRICT_REPLY_MASK                (0xffull << 56)
+
+/*
+ * Flags for mach_port_guard_with_flags. These flags extend
+ * the attributes associated with a guarded port.
+ */
+#define MPG_STRICT              0x01    /* Apply strict guarding for a port */
+#define MPG_IMMOVABLE_RECEIVE   0x02    /* Receive right cannot be moved out of the space */
+
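A user-space sketch of creating a guarded, immovable receive right at construction time (the guard value is hypothetical); the MPG_* flags above express the same attributes for ports guarded after creation:

    /* Sketch: construct a port whose receive right cannot be moved out of
     * this task; 0xfeedface is a hypothetical guard value. */
    mach_port_options_t opts = {
        .flags = MPO_CONTEXT_AS_GUARD | MPO_STRICT | MPO_IMMOVABLE_RECEIVE,
    };
    mach_port_name_t name = MACH_PORT_NULL;
    kern_return_t kr = mach_port_construct(mach_task_self(), &opts,
        (mach_port_context_t)0xfeedface, &name);
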
 #if     !__DARWIN_UNIX03 && !defined(_NO_PORT_T_FROM_MACH)
 /*
  *  Mach 3.0 renamed everything to have mach_ in front of it.
diff --git a/osfmk/mach/restartable.defs b/osfmk/mach/restartable.defs
new file mode 100644 (file)
index 0000000..74c1125
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+subsystem
+#if    KERNEL_SERVER
+         KernelServer
+#endif /* KERNEL_SERVER */
+         task_restartable 8000;
+
+#include <mach/std_types.defs>
+#include <mach/mach_types.defs>
+#include <mach_debug/mach_debug_types.defs>
+
+import <kern/restartable.h>;
+
+type task_restartable_range_t = array[2] of uint64_t;
+type task_restartable_range_array_t = array[*:64] of task_restartable_range_t;
+
+routine task_restartable_ranges_register(
+               target_task     : task_t;
+               ranges          : task_restartable_range_array_t);
+
+routine task_restartable_ranges_synchronize(
+               target_task : task_t);
+
+/* vim: set ft=c : */
+
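A user-space sketch of the resulting calls (field names per osfmk/kern/restartable.h; addresses and sizes hypothetical):

    /* Sketch: register one restartable range, then force any thread currently
     * inside it to restart at location + recovery_offs. */
    task_restartable_range_t range = {
        .location      = 0x100000000ULL, /* hypothetical start of critical section */
        .length        = 0x40,           /* bytes covered by the range */
        .recovery_offs = 0x80,           /* hypothetical restart offset from location */
        .flags         = 0,
    };
    kern_return_t kr;
    kr = task_restartable_ranges_register(mach_task_self(), &range, 1);
    kr = task_restartable_ranges_synchronize(mach_task_self());
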
index be70167dc41578f0438826437103759c99625328..0faf73ee82b4ef539bac4abc9179737189a46b4b 100644 (file)
 #define SHARED_REGION_NESTING_MIN_ARM           ?
 #define SHARED_REGION_NESTING_MAX_ARM           ?
 
+#define SHARED_REGION_BASE_ARM64_32             0x1A000000ULL
+#define SHARED_REGION_SIZE_ARM64_32             0x40000000ULL
+#define SHARED_REGION_NESTING_BASE_ARM64_32     0x1A000000ULL
+#define SHARED_REGION_NESTING_SIZE_ARM64_32     0x40000000ULL
+#define SHARED_REGION_NESTING_MIN_ARM64_32      ?
+#define SHARED_REGION_NESTING_MAX_ARM64_32      ?
 
 #ifdef XNU_KERNEL_PRIVATE
 /* ARM64_TODO: move to higher memory */
 #define SHARED_REGION_NESTING_SIZE              SHARED_REGION_NESTING_SIZE_ARM
 #define SHARED_REGION_NESTING_MIN               SHARED_REGION_NESTING_MIN_ARM
 #define SHARED_REGION_NESTING_MAX               SHARED_REGION_NESTING_MAX_ARM
+#elif defined(__arm64__) && !defined(__LP64__)
+#define SHARED_REGION_BASE                      SHARED_REGION_BASE_ARM64_32
+#define SHARED_REGION_SIZE                      SHARED_REGION_SIZE_ARM64_32
+#define SHARED_REGION_NESTING_BASE              SHARED_REGION_NESTING_BASE_ARM64_32
+#define SHARED_REGION_NESTING_SIZE              SHARED_REGION_NESTING_SIZE_ARM64_32
+#define SHARED_REGION_NESTING_MIN               SHARED_REGION_NESTING_MIN_ARM64_32
+#define SHARED_REGION_NESTING_MAX               SHARED_REGION_NESTING_MAX_ARM64_32
 #elif defined(__arm64__) && defined(__LP64__)
 #define SHARED_REGION_BASE                      SHARED_REGION_BASE_ARM64
 #define SHARED_REGION_SIZE                      SHARED_REGION_SIZE_ARM64
index b8c1c4d3224f5f816bfec47fe2e3be05c85fc9d1..f1e419809969fce217334631108e2f7c03953589 100644 (file)
@@ -151,6 +151,10 @@ kernel_trap(mach_voucher_extract_attr_recipe_trap,-72,4)
 /* mach_voucher_attr_command */
 /* mach_voucher_debug_info */
 
+/* more mach_port traps */
+kernel_trap(_kernelrpc_mach_port_type_trap,-76,3)
+kernel_trap(_kernelrpc_mach_port_request_notification_trap,-77,7)
+
 kernel_trap(mach_timebase_info_trap,-89,1)
 
 #if            defined(__LP64__)
@@ -176,6 +180,8 @@ kernel_trap(mk_timer_arm_leeway,-95,4)
 #else
 kernel_trap(mk_timer_arm_leeway,-95,7)
 #endif
+kernel_trap(debug_control_port_for_pid,-96,3)
+
 /*
  * N.B: Trap #-100 is in use by IOTrap.s in the IOKit Framework
  * (iokit_user_client_trap)
index 5ac64e7d5281097ec0ebad2e29a101a7780e306a..378fe2039e949bdfa4415b600edce5b13aaf4511 100644 (file)
@@ -504,5 +504,13 @@ routine task_inspect(
            flavor   : task_inspect_flavor_t;
        out info_out : task_inspect_info_t, CountInOut);
 
+routine task_get_exc_guard_behavior(
+               task     : task_inspect_t;
+       out behavior : task_exc_guard_behavior_t);
+
+routine task_set_exc_guard_behavior(
+               task     : task_t;
+               behavior : task_exc_guard_behavior_t);
+
 /* vim: set ft=c : */
 
index 1248749f147e811a8def35c12ead874c4a083c45..93fa357c96553498eef291f569f6ab928e58723b 100644 (file)
@@ -88,7 +88,7 @@ typedef integer_t       task_info_data_t[TASK_INFO_MAX];
  *     Currently defined information structures.
  */
 
-#pragma pack(4)
+#pragma pack(push, 4)
 
 /* Don't use this, use MACH_TASK_BASIC_INFO instead */
 #define TASK_BASIC_INFO_32      4       /* basic information */
@@ -390,12 +390,47 @@ struct task_vm_info {
        /* added for rev2 */
        mach_vm_address_t       min_address;
        mach_vm_address_t       max_address;
+
+       /* added for rev3 */
+       int64_t ledger_phys_footprint_peak;
+       int64_t ledger_purgeable_nonvolatile;
+       int64_t ledger_purgeable_novolatile_compressed;
+       int64_t ledger_purgeable_volatile;
+       int64_t ledger_purgeable_volatile_compressed;
+       int64_t ledger_tag_network_nonvolatile;
+       int64_t ledger_tag_network_nonvolatile_compressed;
+       int64_t ledger_tag_network_volatile;
+       int64_t ledger_tag_network_volatile_compressed;
+       int64_t ledger_tag_media_footprint;
+       int64_t ledger_tag_media_footprint_compressed;
+       int64_t ledger_tag_media_nofootprint;
+       int64_t ledger_tag_media_nofootprint_compressed;
+       int64_t ledger_tag_graphics_footprint;
+       int64_t ledger_tag_graphics_footprint_compressed;
+       int64_t ledger_tag_graphics_nofootprint;
+       int64_t ledger_tag_graphics_nofootprint_compressed;
+       int64_t ledger_tag_neural_footprint;
+       int64_t ledger_tag_neural_footprint_compressed;
+       int64_t ledger_tag_neural_nofootprint;
+       int64_t ledger_tag_neural_nofootprint_compressed;
+
+       /* added for rev4 */
+       uint64_t limit_bytes_remaining;
+
+       /* added for rev5 */
+       integer_t decompressions;
 };
 typedef struct task_vm_info     task_vm_info_data_t;
 typedef struct task_vm_info     *task_vm_info_t;
 #define TASK_VM_INFO_COUNT      ((mach_msg_type_number_t) \
                (sizeof (task_vm_info_data_t) / sizeof (natural_t)))
-#define TASK_VM_INFO_REV2_COUNT TASK_VM_INFO_COUNT
+#define TASK_VM_INFO_REV5_COUNT TASK_VM_INFO_COUNT
+#define TASK_VM_INFO_REV4_COUNT /* doesn't include decompressions */ \
+       ((mach_msg_type_number_t) (TASK_VM_INFO_REV5_COUNT - 1))
+#define TASK_VM_INFO_REV3_COUNT /* doesn't include limit bytes */ \
+       ((mach_msg_type_number_t) (TASK_VM_INFO_REV4_COUNT - 2))
+#define TASK_VM_INFO_REV2_COUNT /* doesn't include extra ledgers info */ \
+       ((mach_msg_type_number_t) (TASK_VM_INFO_REV3_COUNT - 42))
 #define TASK_VM_INFO_REV1_COUNT /* doesn't include min and max address */ \
        ((mach_msg_type_number_t) (TASK_VM_INFO_REV2_COUNT - 4))
 #define TASK_VM_INFO_REV0_COUNT /* doesn't include phys_footprint */ \
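
A user-space sketch of consuming the revisioned reply: pass the latest count in, then gate newer fields on what the kernel actually filled:

    /* Sketch: older kernels return a smaller count; gate rev5 fields on it. */
    task_vm_info_data_t info;
    mach_msg_type_number_t count = TASK_VM_INFO_COUNT;
    kern_return_t kr = task_info(mach_task_self(), TASK_VM_INFO,
        (task_info_t)&info, &count);
    if (kr == KERN_SUCCESS && count >= TASK_VM_INFO_REV5_COUNT) {
        /* info.decompressions is only meaningful for rev5-sized replies */
    }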
@@ -496,6 +531,35 @@ typedef struct task_debug_info_internal task_debug_info_internal_data_t;
 
 #endif /* PRIVATE */
 
+/*
+ * Type to control EXC_GUARD delivery options for a task
+ * via task_get/set_exc_guard_behavior interface(s).
+ */
+typedef uint32_t task_exc_guard_behavior_t;
+
+/* EXC_GUARD optional delivery settings on a per-task basis */
+#define TASK_EXC_GUARD_VM_DELIVER            0x01 /* Deliver virtual memory EXC_GUARD exceptions */
+#define TASK_EXC_GUARD_VM_ONCE               0x02 /* Deliver them only once */
+#define TASK_EXC_GUARD_VM_CORPSE             0x04 /* Deliver them via a forked corpse */
+#define TASK_EXC_GUARD_VM_FATAL              0x08 /* Virtual Memory EXC_GUARD delivery is fatal */
+#define TASK_EXC_GUARD_VM_ALL                0x0f
+
+#define TASK_EXC_GUARD_MP_DELIVER            0x10 /* Deliver mach port EXC_GUARD exceptions */
+#define TASK_EXC_GUARD_MP_ONCE               0x20 /* Deliver them only once */
+#define TASK_EXC_GUARD_MP_CORPSE             0x40 /* Deliver them via a forked corpse */
+#define TASK_EXC_GUARD_MP_FATAL              0x80 /* mach port EXC_GUARD delivery is fatal */
+#define TASK_EXC_GUARD_MP_ALL                0xf0
+
+#define TASK_EXC_GUARD_ALL                   0xff /* All optional deliver settings */
+
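A sketch of toggling delivery with the new task.defs routines:

    /* Sketch: deliver mach-port guard exceptions once, via corpse, non-fatally. */
    task_exc_guard_behavior_t behavior = 0;
    kern_return_t kr = task_get_exc_guard_behavior(mach_task_self(), &behavior);
    if (kr == KERN_SUCCESS) {
        behavior |= TASK_EXC_GUARD_MP_DELIVER | TASK_EXC_GUARD_MP_ONCE |
            TASK_EXC_GUARD_MP_CORPSE;
        behavior &= ~(task_exc_guard_behavior_t)TASK_EXC_GUARD_MP_FATAL;
        kr = task_set_exc_guard_behavior(mach_task_self(), behavior);
    }
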
+#ifdef PRIVATE
+/*
+ * Experimental mode for setting default guard behavior for non-Apple processes.
+ * The default for 3rd-party guards is shifted up 8 bits, but otherwise uses the
+ * same values as above.
+ */
+#define TASK_EXC_GUARD_THIRD_PARTY_DEFAULT_SHIFT 0x8 /* 3rd party default shifted up in boot-arg */
+#endif
+
 /*
  * Obsolete interfaces.
  */
@@ -506,6 +570,6 @@ typedef struct task_debug_info_internal task_debug_info_internal_data_t;
 
 #define TASK_SCHED_INFO                 14
 
-#pragma pack()
+#pragma pack(pop)
 
 #endif  /* _MACH_TASK_INFO_H_ */
index f1b7cc0c583349d010fc0e2b8450d95fef1d2d1c..6aa6e91803e0678aac09a4275df0eaffe7ed3aaf 100644 (file)
@@ -337,6 +337,7 @@ typedef struct task_policy_state *task_policy_state_t;
 #define TASK_APPTYPE_DAEMON_BACKGROUND   4
 #define TASK_APPTYPE_APP_DEFAULT         5
 #define TASK_APPTYPE_APP_TAL             6
+#define TASK_APPTYPE_DRIVER              7
 
 /* task policy state flags */
 #define TASK_IMP_RECEIVER                    0x00000001
index 779071686b76dedd694d1fd17ed52e62c7322b49..d1e5ec465ab063ee3c449fd47d86220706db0547 100644 (file)
@@ -128,4 +128,8 @@ typedef int     task_special_port_t;
 #define task_set_task_debug_control_port(task, port) \
                (task_set_special_port((task), TASK_DEBUG_CONTROL_PORT, (port)))
 
+#ifdef XNU_KERNEL_PRIVATE
+#define DEBUG_PORT_ENTITLEMENT "com.apple.private.debug_port"
+#endif /* XNU_KERNEL_PRIVATE */
+
 #endif  /* _MACH_TASK_SPECIAL_PORTS_H_ */
index 7f6ac49ffcf05fad36db53a9c94dc93fa0183f26..b0c82bdc5500f961e033f626e82dbaf5af8685e9 100644 (file)
@@ -375,6 +375,9 @@ typedef struct thread_qos_policy      *thread_qos_policy_t;
 
 #define THREAD_POLICY_INTERNAL_STRUCT_VERSION 5
 
+// legacy names
+#define thrp_qos_ipc_override   thrp_qos_kevent_override
+
 struct thread_requested_policy {
        uint64_t        thrp_int_darwinbg       :1,     /* marked as darwinbg via setpriority */
            thrp_ext_darwinbg       :1,
@@ -390,12 +393,13 @@ struct thread_requested_policy {
            thrp_qos_relprio        :4,                 /* thread qos relative priority (store as inverse, -10 -> 0xA) */
            thrp_qos_override       :3,                 /* thread qos class override */
            thrp_qos_promote        :3,                 /* thread qos class from promotion */
-           thrp_qos_ipc_override   :3,                 /* thread qos class from ipc override */
+           thrp_qos_kevent_override:3,                 /* thread qos class from kevent override */
            thrp_terminated         :1,                 /* heading for termination */
            thrp_qos_sync_ipc_override:3,               /* now unused */
            thrp_qos_workq_override :3,                 /* thread qos class override (workq) */
+           thrp_qos_wlsvc_override :3,                 /* workloop servicer qos class override */
 
-           thrp_reserved           :26;
+           thrp_reserved           :23;
 };
 
 struct thread_effective_policy {
index 90ff1e0cb74817d5eee674817e2c19767f5e77dc..a91b936ebade78aae44ee2e826661685d59dfef9 100644 (file)
@@ -89,6 +89,7 @@ typedef natural_t       thread_state_data_t[THREAD_STATE_MAX];
 #define THREAD_STATE_FLAVOR_LIST_NEW    128
 #define THREAD_STATE_FLAVOR_LIST_10_9   129
 #define THREAD_STATE_FLAVOR_LIST_10_13  130
+#define THREAD_STATE_FLAVOR_LIST_10_15  131
 
 typedef int                     thread_state_flavor_t;
 typedef thread_state_flavor_t   *thread_state_flavor_array_t;
diff --git a/osfmk/mach/vfs_nspace.defs b/osfmk/mach/vfs_nspace.defs
new file mode 100644 (file)
index 0000000..aaca5bf
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2013 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ *  Interface definition for the namespace facility.
+ */
+
+subsystem
+#if KERNEL_USER
+    KernelUser
+#endif /* KERNEL_USER */
+    vfs_nspace 867800; /* 'V''N'00 */
+
+#include <mach/std_types.defs>
+#include <mach/mach_types.defs>
+#include <atm/atm_types.defs>
+
+ServerPrefix receive_;
+UserPrefix   send_;
+
+/* DEPRECATED */
+routine nspace_handle(
+              nspace_handler_port : mach_port_t;
+              pid                 : uint32_t;
+           in path                : vfs_path_t;
+          out handler_error       : int
+);
+
+routine nspace_resolve_cancel(
+              nspace_handler_port : mach_port_t;
+              req_id              : uint32_t
+);
+
+routine nspace_resolve_path(
+              nspace_handler_port : mach_port_t;
+              req_id              : uint32_t;
+              pid                 : uint32_t;
+              op                  : uint32_t;
+           in path                : nspace_path_t;
+          out xxx_rdar44371223    : int
+);
+
+/* vim: set ft=c : */
index 2bb038e212c6fe4ecf1332334961290df6d63b59..deef9ffd87fbf2ff871c6e98ff4feb90ee7f9e83 100644 (file)
@@ -327,7 +327,12 @@ extern vm_offset_t              vm_kernel_builtinkmod_text_end;
  */
 
 __BEGIN_DECLS
+#if XNU_KERNEL_PRIVATE
+extern vm_offset_t vm_kernel_addrhash(vm_offset_t addr)
+__XNU_INTERNAL(vm_kernel_addrhash);
+#else
 extern vm_offset_t vm_kernel_addrhash(vm_offset_t addr);
+#endif
 __END_DECLS
 
 #define __DO_UNSLIDE(_v) ((vm_offset_t)VM_KERNEL_STRIP_PTR(_v) - vm_kernel_slide)
index 416699cb82bc3a56eadbe70d398970a34b520270..21abcad6e55e1388a3370a0f5a3927bd13636793 100644 (file)
@@ -50,7 +50,7 @@
 
 #include <sys/cdefs.h>
 
-#pragma pack(4)
+#pragma pack(push, 4)
 
 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
 // so probably should be a real 32b ID vs. ptr.
@@ -270,17 +270,24 @@ struct vm_region_submap_info_64 {
        vm32_object_id_t        object_id;      /* obj/map name, not a handle */
        unsigned short          user_wired_count;
        unsigned int            pages_reusable;
+       vm_object_id_t          object_id_full;
 };
 
 typedef struct vm_region_submap_info_64         *vm_region_submap_info_64_t;
 typedef struct vm_region_submap_info_64          vm_region_submap_info_data_64_t;
 
-#define VM_REGION_SUBMAP_INFO_V1_SIZE   \
+#define VM_REGION_SUBMAP_INFO_V2_SIZE   \
        (sizeof (vm_region_submap_info_data_64_t))
+#define VM_REGION_SUBMAP_INFO_V1_SIZE   \
+       (VM_REGION_SUBMAP_INFO_V2_SIZE - \
+        sizeof (vm_object_id_t) /* object_id_full */ )
 #define VM_REGION_SUBMAP_INFO_V0_SIZE   \
        (VM_REGION_SUBMAP_INFO_V1_SIZE - \
         sizeof (unsigned int) /* pages_reusable */ )
 
+#define VM_REGION_SUBMAP_INFO_V2_COUNT_64 \
+       ((mach_msg_type_number_t) \
+        (VM_REGION_SUBMAP_INFO_V2_SIZE / sizeof (natural_t)))
 #define VM_REGION_SUBMAP_INFO_V1_COUNT_64 \
        ((mach_msg_type_number_t) \
         (VM_REGION_SUBMAP_INFO_V1_SIZE / sizeof (natural_t)))
@@ -289,7 +296,7 @@ typedef struct vm_region_submap_info_64          vm_region_submap_info_data_64_t
         (VM_REGION_SUBMAP_INFO_V0_SIZE / sizeof (natural_t)))
 
 /* set this to the latest version */
-#define VM_REGION_SUBMAP_INFO_COUNT_64          VM_REGION_SUBMAP_INFO_V1_COUNT_64
+#define VM_REGION_SUBMAP_INFO_COUNT_64          VM_REGION_SUBMAP_INFO_V2_COUNT_64
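
A user-space sketch: callers keep passing the latest count and gate V2-only fields on what comes back:

    /* Sketch: object_id_full is only valid when a V2-sized reply comes back. */
    vm_region_submap_info_data_64_t info;
    mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
    mach_vm_address_t addr = 0;
    mach_vm_size_t size = 0;
    natural_t depth = 0;
    kern_return_t kr = mach_vm_region_recurse(mach_task_self(), &addr, &size,
        &depth, (vm_region_recurse_info_t)&info, &count);
    if (kr == KERN_SUCCESS && count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
        /* info.object_id_full is populated */
    }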
 
 struct vm_region_submap_short_info_64 {
        vm_prot_t               protection;     /* present access protection */
@@ -314,8 +321,6 @@ typedef struct vm_region_submap_short_info_64    vm_region_submap_short_info_dat
        ((mach_msg_type_number_t)                                       \
         (sizeof (vm_region_submap_short_info_data_64_t) / sizeof (natural_t)))
 
-
-
 struct mach_vm_read_entry {
        mach_vm_address_t address;
        mach_vm_size_t size;
@@ -342,7 +347,7 @@ typedef struct vm_read_entry            vm_read_entry_t[VM_MAP_ENTRY_MAX];
 typedef struct vm32_read_entry          vm32_read_entry_t[VM_MAP_ENTRY_MAX];
 #endif
 
-#pragma pack()
+#pragma pack(pop)
 
 
 #define VM_PAGE_INFO_MAX
index 9e72c81d2128d07b930a87c700aa4eab1c77f313..267d5df2fbe45f2f4c083848670d2124ef80706f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -223,6 +223,7 @@ typedef struct vm_purgeable_info        *vm_purgeable_info_t;
 #define VM_PAGE_QUERY_PAGE_CS_VALIDATED 0x100
 #define VM_PAGE_QUERY_PAGE_CS_TAINTED   0x200
 #define VM_PAGE_QUERY_PAGE_CS_NX        0x400
+#define VM_PAGE_QUERY_PAGE_REUSABLE     0x800
 
 #ifdef  MACH_KERNEL_PRIVATE
 
@@ -333,12 +334,13 @@ typedef struct pmap_statistics  *pmap_statistics_t;
 #define VM_FLAGS_USER_MAP       (VM_FLAGS_USER_ALLOCATE |       \
                                 VM_FLAGS_RETURN_4K_DATA_ADDR | \
                                 VM_FLAGS_RETURN_DATA_ADDR)
-#define VM_FLAGS_USER_REMAP     (VM_FLAGS_FIXED |    \
-                                VM_FLAGS_ANYWHERE | \
-                                VM_FLAGS_RANDOM_ADDR | \
-                                VM_FLAGS_OVERWRITE| \
-                                VM_FLAGS_RETURN_DATA_ADDR |\
-                                VM_FLAGS_RESILIENT_CODESIGN)
+#define VM_FLAGS_USER_REMAP     (VM_FLAGS_FIXED |               \
+                                VM_FLAGS_ANYWHERE |            \
+                                VM_FLAGS_RANDOM_ADDR |         \
+                                VM_FLAGS_OVERWRITE|            \
+                                VM_FLAGS_RETURN_DATA_ADDR |    \
+                                VM_FLAGS_RESILIENT_CODESIGN |  \
+                                VM_FLAGS_RESILIENT_MEDIA)
 
 #define VM_FLAGS_SUPERPAGE_SHIFT 16
 #define SUPERPAGE_NONE                  0       /* no superpages, if all bits are 0 */
@@ -379,7 +381,14 @@ typedef struct {
            vmkf_remap_prot_copy:1,
            vmkf_cs_enforcement_override:1,
            vmkf_cs_enforcement:1,
-           __vmkf_unused:16;
+           vmkf_nested_pmap:1,
+           vmkf_no_copy_on_read:1,
+#if !defined(CONFIG_EMBEDDED)
+           vmkf_32bit_map_va:1,
+           __vmkf_unused:13;
+#else
+           __vmkf_unused:14;
+#endif
 } vm_map_kernel_flags_t;
 #define VM_MAP_KERNEL_FLAGS_NONE (vm_map_kernel_flags_t) {              \
        .vmkf_atomic_entry = 0, /* keep entry atomic (no coalescing) */ \
@@ -398,10 +407,39 @@ typedef struct {
        .vmkf_remap_prot_copy = 0, /* vm_remap for VM_PROT_COPY */      \
        .vmkf_cs_enforcement_override = 0, /* override CS_ENFORCEMENT */ \
        .vmkf_cs_enforcement = 0,  /* new value for CS_ENFORCEMENT */   \
+       .vmkf_nested_pmap = 0, /* use a nested pmap */                  \
+       .vmkf_no_copy_on_read = 0, /* do not use copy_on_read */        \
        .__vmkf_unused = 0                                              \
 }
+
+typedef struct {
+       unsigned int
+           vmnekf_ledger_tag:3,
+           vmnekf_ledger_no_footprint:1,
+           __vmnekf_unused:28;
+} vm_named_entry_kernel_flags_t;
+#define VM_NAMED_ENTRY_KERNEL_FLAGS_NONE (vm_named_entry_kernel_flags_t) {    \
+       .vmnekf_ledger_tag = 0,                                                \
+       .vmnekf_ledger_no_footprint = 0,                                       \
+       .__vmnekf_unused = 0                                                   \
+}
+
 #endif /* KERNEL_PRIVATE */
 
+/* current accounting postmark */
+#define __VM_LEDGER_ACCOUNTING_POSTMARK 2019032600
+
+/* discrete values: */
+#define VM_LEDGER_TAG_NONE      0x00000000
+#define VM_LEDGER_TAG_DEFAULT   0x00000001
+#define VM_LEDGER_TAG_NETWORK   0x00000002
+#define VM_LEDGER_TAG_MEDIA     0x00000003
+#define VM_LEDGER_TAG_GRAPHICS  0x00000004
+#define VM_LEDGER_TAG_NEURAL    0x00000005
+#define VM_LEDGER_TAG_MAX       0x00000005
+/* individual bits: */
+#define VM_LEDGER_FLAG_NO_FOOTPRINT     0x00000001
+#define VM_LEDGER_FLAGS (VM_LEDGER_FLAG_NO_FOOTPRINT)
 
 
 #define VM_MEMORY_MALLOC 1
@@ -569,6 +607,15 @@ typedef struct {
 /* memory allocated by Accounts framework */
 #define VM_MEMORY_ACCOUNTS 98
 
+/* memory allocated by Sanitizer runtime libraries */
+#define VM_MEMORY_SANITIZER 99
+
+/* Differentiate memory needed by GPU drivers and frameworks from generic IOKit allocations */
+#define VM_MEMORY_IOACCELERATOR 100
+
+/* memory allocated by CoreMedia for global image registration of frames */
+#define VM_MEMORY_CM_REGWARP 101
+
 /* Reserve 240-255 for application */
 #define VM_MEMORY_APPLICATION_SPECIFIC_1 240
 #define VM_MEMORY_APPLICATION_SPECIFIC_16 255
index 057c00533811c1ee94a36364e74479cd449d04e3..95eaafd5ee202b0cbad35783b734fb17d69a2395 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -95,8 +95,8 @@ typedef struct _vm_map          *vm_map_t;
 typedef struct vm_object        *vm_object_t;
 typedef struct vm_object_fault_info     *vm_object_fault_info_t;
 
-#define PMAP_NULL               ((pmap_t) 0)
-#define VM_OBJECT_NULL  ((vm_object_t) 0)
+#define PMAP_NULL               ((pmap_t) NULL)
+#define VM_OBJECT_NULL  ((vm_object_t) NULL)
 
 #else   /* KERNEL_PRIVATE */
 
@@ -104,7 +104,11 @@ typedef mach_port_t             vm_map_t;
 
 #endif  /* KERNEL_PRIVATE */
 
+#ifdef KERNEL
+#define VM_MAP_NULL             ((vm_map_t) NULL)
+#else
 #define VM_MAP_NULL             ((vm_map_t) 0)
+#endif
 
 /*
  * Evolving definitions, likely to change.
@@ -166,7 +170,7 @@ struct vm_allocation_site {
        uint16_t  flags;
        uint16_t  subtotalscount;
        struct vm_allocation_total subtotals[0];
-       char      name[0];
+       /* char      name[0]; -- this is placed after subtotals, see KA_NAME() */
 };
 typedef struct vm_allocation_site vm_allocation_site_t;
 
@@ -197,7 +201,7 @@ typedef struct upl              *upl_t;
 typedef struct vm_map_copy      *vm_map_copy_t;
 typedef struct vm_named_entry   *vm_named_entry_t;
 
-#define VM_MAP_COPY_NULL        ((vm_map_copy_t) 0)
+#define VM_MAP_COPY_NULL        ((vm_map_copy_t) NULL)
 
 #else   /* KERNEL_PRIVATE */
 
@@ -206,8 +210,14 @@ typedef mach_port_t             vm_named_entry_t;
 
 #endif  /* KERNEL_PRIVATE */
 
+#ifdef KERNEL
+#define UPL_NULL                ((upl_t) NULL)
+#define VM_NAMED_ENTRY_NULL     ((vm_named_entry_t) NULL)
+#else
 #define UPL_NULL                ((upl_t) 0)
 #define VM_NAMED_ENTRY_NULL     ((vm_named_entry_t) 0)
+#endif
+
 #ifdef PRIVATE
 typedef struct {
        uint64_t rtfabstime; // mach_continuous_time at start of fault
index 8542493e91b0ba608684f5d754e9633efc66a081..0e31820d1e2ee0874ccb753920421fc0b8f88a35 100644 (file)
@@ -10,7 +10,9 @@ PRIVATE_DATAFILES = \
        cpu_capabilities.h
 
 KERNELFILES = \
+       atomic_impl.h   \
        atomic.h        \
+       config.h                \
        cpu_capabilities.h      \
        cpu_number.h    \
        io_map_entries.h \
@@ -20,10 +22,12 @@ KERNELFILES = \
        machine_remote_time.h \
        machine_routines.h      \
        machine_kpc.h           \
+       memory_types.h          \
        monotonic.h \
        pal_routines.h          \
        pal_hibernate.h         \
-       simple_lock.h
+       simple_lock.h           \
+       smp.h
 
 EXPORT_FILES = \
        machine_remote_time.h
index 3c367624803c6013f312e517727a477c9b95048d..ab11c70046f62ac848b0fa66b5af0f00d338e763 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #ifndef _MACHINE_ATOMIC_H
 #define _MACHINE_ATOMIC_H
 
-#include <stdatomic.h>
+/*
+ * Internal implementation details are in a separate header
+ */
+#include <machine/atomic_impl.h>
+
+/*!
+ * @file <machine/atomic.h>
+ *
+ * @brief
+ * This file defines nicer (terser and safer) wrappers for C11's <stdatomic.h>.
+ *
+ * @discussion
+ * @see xnu.git::doc/atomics.md which provides more extensive documentation
+ * about this header.
+ *
+ * Note that some of the macros defined in this file may be overridden by
+ * architecture specific headers.
+ *
+ * All the os_atomic* functions take an operation ordering argument that can be:
+ * - C11 memory orders: relaxed, acquire, release, acq_rel or seq_cst which
+ *   imply a memory fence on SMP machines, and always carry the matching
+ *   compiler barrier semantics.
+ *
+ * - the os_atomic-specific `dependency` memory ordering that is used to
+ *   document intent to carry a data or address dependency.
+ *   See doc/atomics.md for more information.
+ *
+ * - a compiler barrier: compiler_acquire, compiler_release, compiler_acq_rel
+ *   without a corresponding memory fence.
+ */
+
+/*!
+ * @function os_compiler_barrier
+ *
+ * @brief
+ * Provide a compiler barrier according to the specified ordering.
+ *
+ * @param b
+ * An optional ordering among `acquire`, `release` or `acq_rel`, which defaults
+ * to `acq_rel` when not specified.
+ * These are equivalent to the `compiler_acquire`, `compiler_release` and
+ * `compiler_acq_rel` orderings taken by the os_atomic* functions
+ */
+#define os_compiler_barrier(b...) \
+               atomic_signal_fence(_os_compiler_barrier_##b)
+
+/*!
+ * @function os_atomic_thread_fence
+ *
+ * @brief
+ * Memory fence which is elided in non-SMP mode, but always carries the
+ * corresponding compiler barrier.
+ *
+ * @param m
+ * The ordering for this fence.
+ */
+#define os_atomic_thread_fence(m)  ({ \
+               atomic_thread_fence(memory_order_##m##_smp); \
+               atomic_signal_fence(memory_order_##m); \
+})
 
-#define _os_atomic_c11_atomic(p) \
-               ((typeof(*(p)) _Atomic *)(p))
+/*!
+ * @function os_atomic_init
+ *
+ * @brief
+ * Wrapper for C11 atomic_init()
+ *
+ * @discussion
+ * This initialization is not performed atomically, and so must only be used as
+ * part of object initialization before the object is made visible to other
+ * threads/cores.
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to initialize the variable with.
+ *
+ * @returns
+ * The value loaded from @a p.
+ */
+#define os_atomic_init(p, v) \
+               atomic_init(_os_atomic_c11_atomic(p), v)
 
-#define _os_atomic_basetypeof(p) \
-               typeof(atomic_load(((typeof(*(p)) _Atomic *)(p))))
+/*!
+ * @function os_atomic_load_is_plain, os_atomic_store_is_plain
+ *
+ * @brief
+ * Return whether a relaxed atomic load (resp. store) to an atomic variable
+ * is implemented as a single plain load (resp. store) instruction.
+ *
+ * @discussion
+ * Non-relaxed loads/stores may involve additional memory fence instructions
+ * or more complex atomic instructions.
+ *
+ * This is a construct that can safely be used in static asserts.
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @returns
+ * True when relaxed atomic loads (resp. stores) compile to a plain load
+ * (resp. store) instruction, false otherwise.
+ */
+#define os_atomic_load_is_plain(p)  (sizeof(*(p)) <= sizeof(void *))
+#define os_atomic_store_is_plain(p) os_atomic_load_is_plain(p)
 
-#define _os_atomic_c11_op_orig(p, v, m, o) \
-               atomic_##o##_explicit(_os_atomic_c11_atomic(p), v, \
-               memory_order_##m)
+/*!
+ * @function os_atomic_load
+ *
+ * @brief
+ * Wrapper for C11 atomic_load_explicit(), guaranteed to compile to a single
+ * plain load instruction (when @a m is `relaxed`).
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * The value loaded from @a p.
+ */
+#define os_atomic_load(p, m)  ({ \
+               _Static_assert(os_atomic_load_is_plain(p), "Load is wide"); \
+               _os_atomic_basetypeof(p) _r; \
+               _os_compiler_barrier_before_atomic(m); \
+               _r = atomic_load_explicit(_os_atomic_c11_atomic(p), \
+                               memory_order_##m##_smp); \
+               _os_compiler_barrier_after_atomic(m); \
+               _r; \
+})
 
-#define _os_atomic_c11_op(p, v, m, o, op) \
-               ({ typeof(v) _v = (v); _os_atomic_c11_op_orig(p, v, m, o) op _v; })
+/*!
+ * @function os_atomic_load_wide
+ *
+ * @brief
+ * Wrapper for C11 atomic_load_explicit(), which may be implemented by a
+ * compare-exchange loop for double-wide variables.
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * The value loaded from @a p.
+ */
+#define os_atomic_load_wide(p, m)  ({ \
+               _os_atomic_basetypeof(p) _r; \
+               _os_compiler_barrier_before_atomic(m); \
+               _r = atomic_load_explicit(_os_atomic_c11_atomic(p), \
+                               memory_order_##m##_smp); \
+               _os_compiler_barrier_after_atomic(m); \
+               _r; \
+})
 
-#define os_atomic_thread_fence(m)  atomic_thread_fence(memory_order_##m)
+/*!
+ * @function os_atomic_store
+ *
+ * @brief
+ * Wrapper for C11 atomic_store_explicit(), guaranteed to compile to a single
+ * plain store instruction (when @a m is `relaxed`).
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to store.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * The value stored at @a p.
+ */
+#define os_atomic_store(p, v, m)  ({ \
+               _Static_assert(os_atomic_store_is_plain(p), "Store is wide"); \
+               _os_atomic_basetypeof(p) _v = (v); \
+               _os_compiler_barrier_before_atomic(m); \
+               atomic_store_explicit(_os_atomic_c11_atomic(p), _v, \
+                               memory_order_##m##_smp); \
+               _os_compiler_barrier_after_atomic(m); \
+               _v; \
+})
 
-#define os_atomic_load(p, m) \
-               atomic_load_explicit(_os_atomic_c11_atomic(p), memory_order_##m)
-#define os_atomic_store(p, v, m)    _os_atomic_c11_op_orig(p, v, m, store)
+/*!
+ * @function os_atomic_store_wide
+ *
+ * @brief
+ * Wrapper for C11 atomic_store_explicit(), which may be implemented by a
+ * compare-exchange loop for double-wide variables.
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to store.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * The value stored at @a p.
+ */
+#define os_atomic_store_wide(p, v, m)  ({ \
+               _os_atomic_basetypeof(p) _v = (v); \
+               _os_compiler_barrier_before_atomic(m); \
+               atomic_store_explicit(_os_atomic_c11_atomic(p), _v, \
+                               memory_order_##m##_smp); \
+               _os_compiler_barrier_after_atomic(m); \
+               _v; \
+})
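
A sketch of the classic publish/consume pairing built on these wrappers:

    /* Sketch: release-store publishes `data`; acquire-load pairs with it. */
    static int data;
    static int ready;

    /* producer */
    os_atomic_store(&data, 42, relaxed);
    os_atomic_store(&ready, 1, release);   /* orders the data store before the flag */

    /* consumer */
    if (os_atomic_load(&ready, acquire)) {
        int v = os_atomic_load(&data, relaxed); /* observes 42 */
    }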
 
+/*!
+ * @function os_atomic_add, os_atomic_add_orig
+ *
+ * @brief
+ * Wrappers for C11 atomic_fetch_add_explicit().
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to add.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * os_atomic_add_orig returns the value of the variable before the atomic add,
+ * os_atomic_add returns the value of the variable after the atomic add.
+ */
 #define os_atomic_add_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_add)
 #define os_atomic_add(p, v, m)      _os_atomic_c11_op(p, v, m, fetch_add, +)
 
+/*!
+ * @function os_atomic_inc, os_atomic_inc_orig
+ *
+ * @brief
+ * Perform an atomic increment.
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * os_atomic_inc_orig returns the value of the variable before the atomic increment,
+ * os_atomic_inc returns the value of the variable after the atomic increment.
+ */
 #define os_atomic_inc_orig(p, m)    _os_atomic_c11_op_orig(p, 1, m, fetch_add)
 #define os_atomic_inc(p, m)         _os_atomic_c11_op(p, 1, m, fetch_add, +)
 
+/*!
+ * @function os_atomic_sub, os_atomic_sub_orig
+ *
+ * @brief
+ * Wrappers for C11 atomic_fetch_sub_explicit().
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to subtract.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * os_atomic_sub_orig returns the value of the variable before the atomic subtract,
+ * os_atomic_sub returns the value of the variable after the atomic subtract.
+ */
 #define os_atomic_sub_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_sub)
 #define os_atomic_sub(p, v, m)      _os_atomic_c11_op(p, v, m, fetch_sub, -)
 
+/*!
+ * @function os_atomic_dec, os_atomic_dec_orig
+ *
+ * @brief
+ * Perform an atomic decrement.
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * os_atomic_dec_orig returns the value of the variable before the atomic decrement,
+ * os_atomic_dec returns the value of the variable after the atomic decrement.
+ */
 #define os_atomic_dec_orig(p, m)    _os_atomic_c11_op_orig(p, 1, m, fetch_sub)
 #define os_atomic_dec(p, m)         _os_atomic_c11_op(p, 1, m, fetch_sub, -)
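
A sketch of the usual reference-count idiom built on these:

    /* Sketch: retain needs no ordering; the final release must order prior
     * writes before the object is freed. */
    static unsigned int refcnt;

    os_atomic_inc(&refcnt, relaxed);
    if (os_atomic_dec(&refcnt, release) == 0) {
        os_atomic_thread_fence(acquire); /* pair with releases on other CPUs */
        /* ... safe to free the object ... */
    }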
 
+/*!
+ * @function os_atomic_and, os_atomic_and_orig
+ *
+ * @brief
+ * Wrappers for C11 atomic_fetch_and_explicit().
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to and.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * os_atomic_and_orig returns the value of the variable before the atomic and,
+ * os_atomic_and returns the value of the variable after the atomic and.
+ */
 #define os_atomic_and_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_and)
 #define os_atomic_and(p, v, m)      _os_atomic_c11_op(p, v, m, fetch_and, &)
 
+/*!
+ * @function os_atomic_andnot, os_atomic_andnot_orig
+ *
+ * @brief
+ * Wrappers for C11 atomic_fetch_and_explicit(p, ~value).
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value whose complement to and.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * os_atomic_andnot_orig returns the value of the variable before the atomic andnot,
+ * os_atomic_andnot returns the value of the variable after the atomic andnot.
+ */
+#define os_atomic_andnot_orig(p, v, m) _os_atomic_c11_op_orig(p, ~(v), m, fetch_and)
+#define os_atomic_andnot(p, v, m)      _os_atomic_c11_op(p, ~(v), m, fetch_and, &)
+
+/*!
+ * @function os_atomic_or, os_atomic_or_orig
+ *
+ * @brief
+ * Wrappers for C11 atomic_fetch_or_explicit().
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to or.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * os_atomic_or_orig returns the value of the variable before the atomic or,
+ * os_atomic_or returns the value of the variable after the atomic or.
+ */
 #define os_atomic_or_orig(p, v, m)  _os_atomic_c11_op_orig(p, v, m, fetch_or)
 #define os_atomic_or(p, v, m)       _os_atomic_c11_op(p, v, m, fetch_or, |)
 
+/*!
+ * @function os_atomic_xor, os_atomic_xor_orig
+ *
+ * @brief
+ * Wrappers for C11 atomic_fetch_xor_explicit().
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to xor.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * os_atomic_xor_orig returns the value of the variable before the atomic xor,
+ * os_atomic_xor returns the value of the variable after the atomic xor.
+ */
 #define os_atomic_xor_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_xor)
 #define os_atomic_xor(p, v, m)      _os_atomic_c11_op(p, v, m, fetch_xor, ^)
 
+/*!
+ * @function os_atomic_min, os_atomic_min_orig
+ *
+ * @brief
+ * Wrappers for Clang's __atomic_fetch_min()
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to minimize.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * os_atomic_min_orig returns the value of the variable before the atomic min,
+ * os_atomic_min returns the value of the variable after the atomic min.
+ */
+#define os_atomic_min_orig(p, v, m) _os_atomic_clang_op_orig(p, v, m, fetch_min)
+#define os_atomic_min(p, v, m)      _os_atomic_clang_op(p, v, m, fetch_min, MIN)
+
+/*!
+ * @function os_atomic_max, os_atomic_max_orig
+ *
+ * @brief
+ * Wrappers for Clang's __atomic_fetch_max()
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to maximize.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * os_atomic_max_orig returns the value of the variable before the atomic max,
+ * os_atomic_max returns the value of the variable after the atomic max.
+ */
+#define os_atomic_max_orig(p, v, m) _os_atomic_clang_op_orig(p, v, m, fetch_max)
+#define os_atomic_max(p, v, m)      _os_atomic_clang_op(p, v, m, fetch_max, MAX)
+
+/*!
+ * @function os_atomic_xchg
+ *
+ * @brief
+ * Wrapper for C11 atomic_exchange_explicit().
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param v
+ * The value to exchange with.
+ *
+ * @param m
+ * The ordering to use.
+ *
+ * @returns
+ * The value of the variable before the exchange.
+ */
 #define os_atomic_xchg(p, v, m)     _os_atomic_c11_op_orig(p, v, m, exchange)
 
-#define os_atomic_cmpxchg(p, e, v, m) \
-               ({ _os_atomic_basetypeof(p) _r = (e); \
-               atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \
-               &_r, v, memory_order_##m, memory_order_relaxed); })
-#define os_atomic_cmpxchgv(p, e, v, g, m) \
-               ({ _os_atomic_basetypeof(p) _r = (e); int _b = \
-               atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \
-               &_r, v, memory_order_##m, memory_order_relaxed); *(g) = _r; _b; })
-#define os_atomic_cmpxchgvw(p, e, v, g, m) \
-               ({ _os_atomic_basetypeof(p) _r = (e); int _b = \
-               atomic_compare_exchange_weak_explicit(_os_atomic_c11_atomic(p), \
-               &_r, v, memory_order_##m, memory_order_relaxed); *(g) = _r;  _b; })
+/*!
+ * @function os_atomic_cmpxchg
+ *
+ * @brief
+ * Wrapper for C11 atomic_compare_exchange_strong_explicit().
+ *
+ * @discussion
+ * Loops around os_atomic_cmpxchg() may want to consider using the
+ * os_atomic_rmw_loop() construct instead to take advantage of the C11 weak
+ * compare-exchange operation.
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param e
+ * The value expected in the atomic variable.
+ *
+ * @param v
+ * The value to store if the atomic variable has the expected value @a e.
+ *
+ * @param m
+ * The ordering to use in case of success.
+ * The ordering in case of failure is always `relaxed`.
+ *
+ * @returns
+ * 0 if the compare-exchange failed.
+ * 1 if the compare-exchange succeeded.
+ */
+#define os_atomic_cmpxchg(p, e, v, m)  ({ \
+               _os_atomic_basetypeof(p) _r = (e); int _b; \
+               _os_compiler_barrier_before_atomic(m); \
+               _b = atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \
+                               &_r, v, memory_order_##m##_smp, memory_order_relaxed); \
+               _os_compiler_barrier_after_atomic(m); \
+               _b; \
+})
+
+/*!
+ * @function os_atomic_cmpxchgv
+ *
+ * @brief
+ * Wrapper for C11 atomic_compare_exchange_strong_explicit().
+ *
+ * @discussion
+ * Loops around os_atomic_cmpxchgv() may want to consider using the
+ * os_atomic_rmw_loop() construct instead to take advantage of the C11 weak
+ * compare-exchange operation.
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param e
+ * The value expected in the atomic variable.
+ *
+ * @param v
+ * The value to store if the atomic variable has the expected value @a e.
+ *
+ * @param g
+ * A pointer to a location that is filled with the value that was present in
+ * the atomic variable before the compare-exchange (whether successful or not).
+ * This can be used to redrive compare-exchange loops.
+ *
+ * @param m
+ * The ordering to use in case of success.
+ * The ordering in case of failure is always `relaxed`.
+ *
+ * @returns
+ * 0 if the compare-exchange failed.
+ * 1 if the compare-exchange succeeded.
+ */
+#define os_atomic_cmpxchgv(p, e, v, g, m)  ({ \
+               _os_atomic_basetypeof(p) _r = (e); int _b; \
+               _os_compiler_barrier_before_atomic(m); \
+               _b = atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \
+                               &_r, v, memory_order_##m##_smp, memory_order_relaxed); \
+               _os_compiler_barrier_after_atomic(m); \
+               *(g) = _r; _b; \
+})
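A sketch of the redrive pattern the @a g out-parameter enables:

    /* Sketch: set a flag bit, redriving from the value cmpxchgv hands back. */
    static uint32_t flags;

    uint32_t ov = os_atomic_load(&flags, relaxed);
    while (!os_atomic_cmpxchgv(&flags, ov, ov | 0x1, &ov, acquire)) {
        /* ov now holds the current value; the loop retries the exchange */
    }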
 
+/*!
+ * @function os_atomic_rmw_loop
+ *
+ * @brief
+ * Advanced read-modify-write construct to wrap compare-exchange loops.
+ *
+ * @param p
+ * A pointer to an atomic variable to be modified.
+ *
+ * @param ov
+ * The name of the variable that will contain the original value of the atomic
+ * variable (reloaded every iteration of the loop).
+ *
+ * @param nv
+ * The name of the variable that will contain the new value to compare-exchange
+ * the atomic variable to (typically computed from @a ov every iteration of the
+ * loop).
+ *
+ * @param m
+ * The ordering to use in case of success.
+ * The ordering in case of failure is always `relaxed`.
+ *
+ * @param ...
+ * Code block that validates the value of @p ov and computes the new value of
+ * @p nv that the atomic variable will be compare-exchanged to in an iteration
+ * of the loop.
+ *
+ * The loop can be aborted using os_atomic_rmw_loop_give_up(), e.g. when the
+ * value of @p ov is found to be "invalid" for the overall operation.
+ * `continue` cannot be used in this context.
+ *
+ * No stores to memory should be performed within the code block, as they may
+ * cause the LL/SC transactions used to implement compare-exchange to fail
+ * persistently.
+ *
+ * @returns
+ * 0 if the loop was aborted with os_atomic_rmw_loop_give_up().
+ * 1 if the loop completed.
+ */
 #define os_atomic_rmw_loop(p, ov, nv, m, ...)  ({ \
-               bool _result = false; \
+               int _result = 0; \
                typeof(p) _p = (p); \
-               ov = os_atomic_load(_p, relaxed); \
+               _os_compiler_barrier_before_atomic(m); \
+               ov = atomic_load_explicit(_os_atomic_c11_atomic(_p), \
+                               memory_order_relaxed); \
                do { \
                        __VA_ARGS__; \
-                       _result = os_atomic_cmpxchgvw(_p, ov, nv, &ov, m); \
-               } while (!_result); \
+                       _result = atomic_compare_exchange_weak_explicit( \
+                                       _os_atomic_c11_atomic(_p), &ov, nv, \
+                                       memory_order_##m##_smp, memory_order_relaxed); \
+               } while (__builtin_expect(!_result, 0)); \
+               _os_compiler_barrier_after_atomic(m); \
                _result; \
        })
 
-#define os_atomic_rmw_loop_give_up_with_fence(m, expr) \
-               ({ os_atomic_thread_fence(m); expr; __builtin_unreachable(); })
-#define os_atomic_rmw_loop_give_up(expr) \
-               os_atomic_rmw_loop_give_up_with_fence(relaxed, expr)
+/*!
+ * @function os_atomic_rmw_loop_give_up
+ *
+ * @brief
+ * Abort an os_atomic_rmw_loop() loop.
+ *
+ * @param ...
+ * Optional code block to execute before the `break` out of the loop. May
+ * further alter the control flow (e.g. using `return`, `goto`, ...).
+ */
+#define os_atomic_rmw_loop_give_up(...) ({ __VA_ARGS__; break; })
+
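A sketch of the construct, here as a saturating increment:

    /* Sketch: saturating increment; give_up leaves the value untouched. */
    static uint32_t count;

    uint32_t ov, nv;
    int ok = os_atomic_rmw_loop(&count, ov, nv, relaxed, {
        if (ov == UINT32_MAX) {
            os_atomic_rmw_loop_give_up(); /* aborts: ok will be 0 */
        }
        nv = ov + 1;
    });
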
+/*!
+ * @typedef os_atomic_dependency_t
+ *
+ * @brief
+ * Type for dependency tokens that can be derived from loads with dependency
+ * and injected into various expressions.
+ *
+ * @warning
+ * The implementation of atomic dependencies goes to great lengths to ensure
+ * that the compiler cannot know that os_atomic_dependency_t::__opaque_zero is
+ * always 0.
+ *
+ * Users of os_atomic_dependency_t MUST NOT test its value (even with an
+ * assert), as doing so would allow the compiler to reason about the value and
+ * elide its use to inject hardware dependencies (thwarting the entire purpose
+ * of the construct).
+ */
+typedef struct { unsigned long __opaque_zero; } os_atomic_dependency_t;
+
+/*!
+ * @const OS_ATOMIC_DEPENDENCY_NONE
+ *
+ * @brief
+ * A value to pass to functions that can carry dependencies, to indicate that
+ * no dependency should be carried.
+ */
+#define OS_ATOMIC_DEPENDENCY_NONE \
+               ((os_atomic_dependency_t){ 0UL })
+
+/*!
+ * @function os_atomic_make_dependency
+ *
+ * @brief
+ * Create a dependency token that can be injected into expressions to force a
+ * hardware dependency.
+ *
+ * @discussion
+ * This function is only useful for cases where the dependency needs to be used
+ * several times.
+ *
+ * os_atomic_load_with_dependency_on() and os_atomic_inject_dependency() are
+ * otherwise capable of automatically creating dependency tokens.
+ *
+ * @param v
+ * The result of:
+ * - an os_atomic_load(..., dependency),
+ * - an os_atomic_inject_dependency(),
+ * - an os_atomic_load_with_dependency_on().
+ *
+ * Note that due to implementation limitations, the type of @p v must be
+ * register-sized; add an explicit cast if necessary.
+ *
+ * @returns
+ * An os_atomic_dependency_t token that can be used to prolongate dependency
+ * chains.
+ *
+ * The token value is always 0, but the compiler must never be able to reason
+ * about that fact (c.f. os_atomic_dependency_t)
+ */
+#define os_atomic_make_dependency(v) \
+               ((void)(v), OS_ATOMIC_DEPENDENCY_NONE)
+
+/*!
+ * @function os_atomic_inject_dependency
+ *
+ * @brief
+ * Inject a hardware dependency resulting from a `dependency` load into a
+ * specified pointer.
+ *
+ * @param p
+ * A pointer to inject the dependency into.
+ *
+ * @param e
+ * - a dependency token returned from os_atomic_make_dependency(),
+ *
+ * - OS_ATOMIC_DEPENDENCY_NONE, which turns this operation into a no-op,
+ *
+ * - any value accepted by os_atomic_make_dependency().
+ *
+ * @returns
+ * A value equal to @a p but that prolongates the dependency chain rooted at
+ * @a e.
+ */
+#define os_atomic_inject_dependency(p, e) \
+               ((typeof(*(p)) *)((p) + _os_atomic_auto_dependency(e).__opaque_zero))
 
-#define os_atomic_force_dependency_on(p, e) (p)
+/*!
+ * @function os_atomic_load_with_dependency_on
+ *
+ * @brief
+ * Load that prolongates the dependency chain rooted at `v`.
+ *
+ * @discussion
+ * This is shorthand for:
+ *
+ * <code>
+ *   os_atomic_load(os_atomic_inject_dependency(p, e), dependency)
+ * </code>
+ *
+ * @param p
+ * A pointer to an atomic variable.
+ *
+ * @param e
+ * - a dependency token returned from os_atomic_make_dependency(),
+ *
+ * - OS_ATOMIC_DEPENDENCY_NONE, which turns this operation into a no-op,
+ *
+ * - any value accepted by os_atomic_make_dependency().
+ *
+ * @returns
+ * The value loaded from @a p.
+ */
 #define os_atomic_load_with_dependency_on(p, e) \
-               os_atomic_load(os_atomic_force_dependency_on(p, e), relaxed)
+               os_atomic_load(os_atomic_inject_dependency(p, e), dependency)
+
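A sketch of chaining a dependency through a pointer dereference:

    /* Sketch: the dependency rooted at the `head` load forces the load of
     * n->value to be ordered after it, without requiring a full fence on
     * architectures that honor address dependencies. */
    static struct node { int value; } *head;

    struct node *n = os_atomic_load(&head, dependency);
    os_atomic_dependency_t dep = os_atomic_make_dependency(n);
    int v = os_atomic_load_with_dependency_on(&n->value, dep);
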
+/*!
+ * @const OS_ATOMIC_HAS_LLSC
+ *
+ * @brief
+ * Whether the platform has LL/SC features.
+ *
+ * @discussion
+ * When set, the os_atomic_*_exclusive() macros are defined.
+ */
+#define OS_ATOMIC_HAS_LLSC  0
+
+/*!
+ * @const OS_ATOMIC_USE_LLSC
+ *
+ * @brief
+ * Whether os_atomic* use LL/SC internally.
+ *
+ * @discussion
+ * OS_ATOMIC_USE_LLSC implies OS_ATOMIC_HAS_LLSC.
+ */
+#define OS_ATOMIC_USE_LLSC  0
 
 #if defined (__x86_64__)
 #include "i386/atomic.h"
diff --git a/osfmk/machine/atomic_impl.h b/osfmk/machine/atomic_impl.h
new file mode 100644 (file)
index 0000000..9e646f8
--- /dev/null
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * This header provides some gory details to implement the <machine/atomic.h>
+ * interfaces. Nothing in this header should be called directly; no promise is
+ * made to keep this interface stable.
+ */
+
+#ifndef _MACHINE_ATOMIC_H
+#error "Do not include <machine/atomic_impl.h> directly, use <machine/atomic.h>"
+#endif
+
+#ifndef _MACHINE_ATOMIC_IMPL_H
+#define _MACHINE_ATOMIC_IMPL_H
+
+#include <stdatomic.h>
+#include <machine/smp.h>
+
+static inline int
+memory_order_has_acquire(enum memory_order ord)
+{
+       switch (ord) {
+       case memory_order_consume:
+       case memory_order_acquire:
+       case memory_order_acq_rel:
+       case memory_order_seq_cst:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+static inline int
+memory_order_has_release(enum memory_order ord)
+{
+       switch (ord) {
+       case memory_order_release:
+       case memory_order_acq_rel:
+       case memory_order_seq_cst:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+#if __SMP__
+
+#define memory_order_relaxed_smp            memory_order_relaxed
+#define memory_order_compiler_acquire_smp   memory_order_relaxed
+#define memory_order_compiler_release_smp   memory_order_relaxed
+#define memory_order_compiler_acq_rel_smp   memory_order_relaxed
+#define memory_order_consume_smp            memory_order_consume
+#define memory_order_dependency_smp         memory_order_acquire
+#define memory_order_acquire_smp            memory_order_acquire
+#define memory_order_release_smp            memory_order_release
+#define memory_order_acq_rel_smp            memory_order_acq_rel
+#define memory_order_seq_cst_smp            memory_order_seq_cst
+
+#else
+
+#define memory_order_relaxed_smp            memory_order_relaxed
+#define memory_order_compiler_acquire_smp   memory_order_relaxed
+#define memory_order_compiler_release_smp   memory_order_relaxed
+#define memory_order_compiler_acq_rel_smp   memory_order_relaxed
+#define memory_order_consume_smp            memory_order_relaxed
+#define memory_order_dependency_smp         memory_order_relaxed
+#define memory_order_acquire_smp            memory_order_relaxed
+#define memory_order_release_smp            memory_order_relaxed
+#define memory_order_acq_rel_smp            memory_order_relaxed
+#define memory_order_seq_cst_smp            memory_order_relaxed
+
+#endif
+
+/*
+ * Hack needed for os_compiler_barrier() to work (including with empty argument)
+ */
+#define _os_compiler_barrier_relaxed        memory_order_relaxed
+#define _os_compiler_barrier_acquire        memory_order_acquire
+#define _os_compiler_barrier_release        memory_order_release
+#define _os_compiler_barrier_acq_rel        memory_order_acq_rel
+#define _os_compiler_barrier_               memory_order_acq_rel
+
+/*
+ * Mapping between compiler barrier/memory orders and:
+ * - compiler barriers before atomics ("rel_barrier")
+ * - compiler barriers after atomics ("acq_barrier")
+ */
+#define _os_rel_barrier_relaxed             memory_order_relaxed
+#define _os_rel_barrier_compiler_acquire    memory_order_relaxed
+#define _os_rel_barrier_compiler_release    memory_order_release
+#define _os_rel_barrier_compiler_acq_rel    memory_order_release
+#define _os_rel_barrier_consume             memory_order_relaxed
+#define _os_rel_barrier_dependency          memory_order_relaxed
+#define _os_rel_barrier_acquire             memory_order_relaxed
+#define _os_rel_barrier_release             memory_order_release
+#define _os_rel_barrier_acq_rel             memory_order_release
+#define _os_rel_barrier_seq_cst             memory_order_release
+
+#define _os_acq_barrier_relaxed             memory_order_relaxed
+#define _os_acq_barrier_compiler_acquire    memory_order_acquire
+#define _os_acq_barrier_compiler_release    memory_order_relaxed
+#define _os_acq_barrier_compiler_acq_rel    memory_order_acquire
+#define _os_acq_barrier_consume             memory_order_acquire
+#define _os_acq_barrier_dependency          memory_order_acquire
+#define _os_acq_barrier_acquire             memory_order_acquire
+#define _os_acq_barrier_release             memory_order_relaxed
+#define _os_acq_barrier_acq_rel             memory_order_acquire
+#define _os_acq_barrier_seq_cst             memory_order_acquire
+
+#define _os_compiler_barrier_before_atomic(m) \
+               atomic_signal_fence(_os_rel_barrier_##m)
+#define _os_compiler_barrier_after_atomic(m) \
+               atomic_signal_fence(_os_acq_barrier_##m)
+
+/*
+ * Mapping between compiler barrier/memory orders and:
+ * - memory fences before atomics ("rel_fence")
+ * - memory fences after atomics ("acq_fence")
+ */
+#define _os_rel_fence_relaxed               memory_order_relaxed
+#define _os_rel_fence_compiler_acquire      memory_order_relaxed
+#define _os_rel_fence_compiler_release      memory_order_release
+#define _os_rel_fence_compiler_acq_rel      memory_order_release
+#define _os_rel_fence_consume               memory_order_relaxed_smp
+#define _os_rel_fence_dependency            memory_order_relaxed_smp
+#define _os_rel_fence_acquire               memory_order_relaxed_smp
+#define _os_rel_fence_release               memory_order_release_smp
+#define _os_rel_fence_acq_rel               memory_order_release_smp
+#define _os_rel_fence_seq_cst               memory_order_release_smp
+
+#define _os_acq_fence_relaxed               memory_order_relaxed
+#define _os_acq_fence_compiler_acquire      memory_order_relaxed
+#define _os_acq_fence_compiler_release      memory_order_relaxed
+#define _os_acq_fence_compiler_acq_rel      memory_order_relaxed
+#define _os_acq_fence_consume               memory_order_acquire_smp
+#define _os_acq_fence_dependency            memory_order_dependency_smp
+#define _os_acq_fence_acquire               memory_order_acquire_smp
+#define _os_acq_fence_release               memory_order_relaxed_smp
+#define _os_acq_fence_acq_rel               memory_order_acquire_smp
+#define _os_acq_fence_seq_cst               memory_order_acquire_smp
+
+#define _os_memory_fence_before_atomic(m) \
+               atomic_thread_fence(_os_rel_fence_##m)
+#define _os_memory_fence_after_atomic(m) \
+               atomic_thread_fence(_os_acq_fence_##m)
+
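To make the fence mapping concrete, here is a worked expansion for an `acq_rel` operation (editorial illustration derived from the definitions above, not additional code in the diff):

	_os_memory_fence_before_atomic(acq_rel)
	  => atomic_thread_fence(_os_rel_fence_acq_rel)
	  => atomic_thread_fence(memory_order_release_smp)
	  => atomic_thread_fence(memory_order_release)      /* __SMP__ build */

	_os_memory_fence_after_atomic(acq_rel)
	  => atomic_thread_fence(_os_acq_fence_acq_rel)
	  => atomic_thread_fence(memory_order_acquire_smp)
	  => atomic_thread_fence(memory_order_acquire)      /* __SMP__ build */

On a non-SMP build every *_smp order collapses to memory_order_relaxed, so both fences compile away.
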
+/*
+ * Misc. helpers
+ */
+
+/*
+ * For this implementation, we make sure the compiler cannot coalesce any of the
+ * os_atomic calls by casting all atomic variables to `volatile _Atomic`.
+ *
+ * At the time this decision was taken, clang has been treating all `_Atomic`
+ * accesses as if qualified `volatile _Atomic`, so the cast below freezes that
+ * aspect of the codegen in time.
+ *
+ * When/if clang starts coalescing non-volatile _Atomics, we may decide to add
+ * coalescing orderings, e.g. {relaxed,acquire,release,acq_rel,seq_cst}_nv.
+ */
+#define _os_atomic_c11_atomic(p) \
+               ((typeof(*(p)) volatile _Atomic *)(p))
+
+#define _os_atomic_basetypeof(p) \
+               typeof(atomic_load(_os_atomic_c11_atomic(p)))
+
+#define _os_atomic_op_orig(p, v, m, o)  ({ \
+               _os_atomic_basetypeof(p) _r; \
+               _os_compiler_barrier_before_atomic(m); \
+               _r = o(_os_atomic_c11_atomic(p), v, memory_order_##m##_smp); \
+               _os_compiler_barrier_after_atomic(m); \
+               _r; \
+})
+
+#define _os_atomic_c11_op_orig(p, v, m, o) \
+               _os_atomic_op_orig(p, v, m, atomic_##o##_explicit)
+
+#define _os_atomic_c11_op(p, v, m, o, op) \
+               ({ typeof(v) _v = (v); _os_atomic_c11_op_orig(p, _v, m, o) op _v; })
+
+#define _os_atomic_clang_op_orig(p, v, m, o) \
+               _os_atomic_op_orig(p, v, m, __atomic_##o)
+
+#define _os_atomic_clang_op(p, v, m, o, op) \
+               ({ typeof(v) _v = (v); _os_atomic_basetypeof(p) _r = \
+                       _os_atomic_clang_op_orig(p, _v, m, o); op(_r, _v); })
+
+#define _os_atomic_auto_dependency(e) \
+               _Generic(e, \
+                       os_atomic_dependency_t: (e), \
+                       default: os_atomic_make_dependency(e))
+
+#endif /* _MACHINE_ATOMIC_IMPL_H */
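
As a worked illustration of how these helpers compose (editorial sketch, not part of the diff; `counter` is an invented variable), a relaxed fetch-add built on _os_atomic_c11_op would expand roughly to:

	/* _os_atomic_c11_op(&counter, 1, relaxed, fetch_add, +) */
	({
		typeof(1) _v = (1);
		_os_atomic_basetypeof(&counter) _r;
		_os_compiler_barrier_before_atomic(relaxed);  /* signal fence; no-op for relaxed */
		_r = atomic_fetch_add_explicit(
			(typeof(counter) volatile _Atomic *)&counter,
			_v, memory_order_relaxed_smp);
		_os_compiler_barrier_after_atomic(relaxed);   /* likewise a no-op */
		_r + _v;	/* post-add value; the _orig variant yields _r instead */
	})
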
diff --git a/osfmk/machine/memory_types.h b/osfmk/machine/memory_types.h
new file mode 100644 (file)
index 0000000..bb0d637
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _MACHINE_MEMORY_TYPES_H
+#define _MACHINE_MEMORY_TYPES_H
+
+#if defined (__i386__) || defined(__x86_64__)
+#include "i386/memory_types.h"
+#elif defined (__arm__) || defined (__arm64__)
+#include "arm/memory_types.h"
+#else
+#error architecture not supported
+#endif
+
+#endif /* _MACHINE_MEMORY_TYPES_H */
index 4b3d9df3169932164a19d910b1acd23bd5575b7b..5ee28781ef6b662fe85b9be9956e5b04fd77979c 100644 (file)
@@ -48,6 +48,11 @@ struct mt_cpu {
        uint64_t mtc_snaps[MT_CORE_NFIXED];
        uint64_t mtc_counts[MT_CORE_NFIXED];
        uint64_t mtc_counts_last[MT_CORE_NFIXED];
+       uint64_t mtc_npmis;
+       /*
+        * Whether this CPU should be using PMCs.
+        */
+       bool mtc_active;
 };
 
 struct mt_thread {
@@ -60,6 +65,8 @@ struct mt_task {
 };
 
 struct mt_cpu *mt_cur_cpu(void);
+
+uint64_t mt_count_pmis(void);
 void mt_mtc_update_fixed_counts(struct mt_cpu *mtc, uint64_t *counts,
     uint64_t *counts_since);
 uint64_t mt_mtc_update_count(struct mt_cpu *mtc, unsigned int ctr);
diff --git a/osfmk/machine/xpr.h b/osfmk/machine/xpr.h
deleted file mode 100644 (file)
index ee3be2d..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-#ifndef _MACHINE_XPR_H
-#define _MACHINE_XPR_H
-
-#if defined (__i386__) || defined (__x86_64__)
-#include "i386/xpr.h"
-#elif defined (__arm__) || defined (__arm64__)
-#include "arm/xpr.h"
-#else
-#error architecture not supported
-#endif
-
-#endif /* _MACHINE_XPR_H */
index 44c865a17dff55903cf936ea5dc9b682f6c223b6..4e000828b737d9b7f7149a29d7e29765f9567121 100644 (file)
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#include <kern/cpu_data.h>
+#include <kern/locks.h>
 #include <kern/cpu_number.h>
-#include <kern/kalloc.h>
-#include <kern/machine.h>
-#include <kern/misc_protos.h>
-#include <kern/processor.h>
-#include <kern/sched.h>
-#include <kern/startup.h>
-#include <kern/thread.h>
-#include <kern/thread_call.h>
-#include <mach/machine.h>
-#include <mach/processor.h>
-#include <machine/cpu_data.h>
-#include <machine/simple_lock.h>
-#include <sys/errno.h>
-#include <sys/kdebug.h>
+#include <libkern/section_keywords.h>
+#include <libkern/crypto/sha2.h>
+#include <machine/machine_cpu.h>
+#include <machine/machine_routines.h>
+#include <pexpert/pexpert.h>
 #include <sys/random.h>
-#include <vm/pmap.h>
-#include <vm/vm_page.h>
-
+#include <prng/random.h>
 #include <corecrypto/ccdigest.h>
 #include <corecrypto/ccdrbg.h>
 #include <corecrypto/cckprng.h>
-#include <corecrypto/ccsha1.h>
 #include <corecrypto/ccsha2.h>
-#include <prng/random.h>
 
-#include <IOKit/IOPlatformExpert.h>
-#include <console/serial_protos.h>
-#include <pexpert/pexpert.h>
+static struct cckprng_ctx *prng_ctx;
 
-#include <libkern/section_keywords.h>
+static SECURITY_READ_ONLY_LATE(struct cckprng_funcs) prng_funcs;
+static SECURITY_READ_ONLY_LATE(int) prng_ready;
 
-#if defined(__arm__) || defined(__arm64__)
-#include <arm/cpu_data_internal.h> // For MAX_CPUS
-#endif
-
-#if defined(__x86_64__)
-#include <i386/cpuid.h>
+entropy_data_t EntropyData = {};
 
-static int
-rdseed_step(uint64_t * seed)
-{
-       uint8_t ok;
+#define SEED_SIZE (SHA256_DIGEST_LENGTH)
+static uint8_t bootseed[SEED_SIZE];
 
-       asm volatile ("rdseed %0; setc %1" : "=r"(*seed), "=qm"(ok));
-
-       return (int)ok;
-}
-
-static int
-rdseed_retry(uint64_t * seed, size_t nretries)
+static void
+bootseed_init_bootloader(const struct ccdigest_info * di, ccdigest_ctx_t ctx)
 {
-       size_t i;
+       uint8_t seed[64];
+       uint32_t n;
 
-       for (i = 0; i < nretries; i += 1) {
-               if (rdseed_step(seed)) {
-                       return 1;
-               } else {
-                       asm volatile ("pause");
-               }
+       n = PE_get_random_seed(seed, sizeof(seed));
+       if (n < sizeof(seed)) {
+               /*
+                * Insufficient entropy is fatal.  We must fill the
+                * entire entropy buffer during initialization.
+                */
+               panic("Expected %lu seed bytes from bootloader, but got %u.\n", sizeof(seed), n);
        }
 
-       return 0;
+       ccdigest_update(di, ctx, sizeof(seed), seed);
+       cc_clear(sizeof(seed), seed);
 }
 
-static size_t
-rdseed_seed(void * buf, size_t nwords)
-{
-       uint64_t * buf_words;
-       size_t i;
-
-       if (nwords > 8) {
-               nwords = 8;
-       }
-
-       buf_words = buf;
-       for (i = 0; i < nwords; i += 1) {
-               if (!rdseed_retry(buf_words + i, 10)) {
-                       return i;
-               }
-       }
-
-       return nwords;
-}
+#if defined(__x86_64__)
+#include <i386/cpuid.h>
 
-static int
-rdrand_step(uint64_t * rand)
+static void
+bootseed_init_native(const struct ccdigest_info * di, ccdigest_ctx_t ctx)
 {
+       uint64_t x;
        uint8_t ok;
+       size_t i = 0;
+       size_t n;
 
-       asm volatile ("rdrand %0; setc %1" : "=r"(*rand), "=qm"(ok));
-
-       return (int)ok;
-}
-
-static int
-rdrand_retry(uint64_t * rand, size_t nretries)
-{
-       size_t i;
-
-       for (i = 0; i < nretries; i += 1) {
-               if (rdrand_step(rand)) {
-                       return 1;
+       if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_RDSEED) {
+               n = SEED_SIZE / sizeof(x);
+
+               while (i < n) {
+                       asm volatile ("rdseed %0; setc %1" : "=r"(x), "=qm"(ok) : : "cc");
+                       if (ok) {
+                               ccdigest_update(di, ctx, sizeof(x), &x);
+                               i += 1;
+                       } else {
+                               // Intel recommends pausing between unsuccessful rdseed attempts.
+                               cpu_pause();
+                       }
                }
-       }
-
-       return 0;
-}
-
-static size_t
-rdrand_seed(void * buf, size_t nwords)
-{
-       size_t i;
-       uint64_t w;
-       uint8_t hash[CCSHA256_OUTPUT_SIZE];
-       const struct ccdigest_info * di = &ccsha256_ltc_di;
-
-       ccdigest_di_decl(di, ctx);
-       ccdigest_init(di, ctx);
-
-       for (i = 0; i < 1023; i += 1) {
-               if (!rdrand_retry(&w, 10)) {
-                       nwords = 0;
-                       goto out;
+       } else if (cpuid_features() & CPUID_FEATURE_RDRAND) {
+               // The Intel documentation guarantees a reseed every 512 rdrand calls.
+               n = (SEED_SIZE / sizeof(x)) * 512;
+
+               while (i < n) {
+                       asm volatile ("rdrand %0; setc %1" : "=r"(x), "=qm"(ok) : : "cc");
+                       if (ok) {
+                               ccdigest_update(di, ctx, sizeof(x), &x);
+                               i += 1;
+                       } else {
+                               // Intel does not recommend pausing between unsuccessful rdrand attempts.
+                       }
                }
-               ccdigest_update(di, ctx, sizeof w, &w);
        }
 
-       ccdigest_final(di, ctx, hash);
-
-       if (nwords > 2) {
-               nwords = 2;
-       }
-
-       memcpy(buf, hash, nwords * sizeof(uint64_t));
-
-out:
-       ccdigest_di_clear(di, ctx);
-       bzero(hash, sizeof hash);
-       bzero(&w, sizeof w);
-
-       return nwords;
+       cc_clear(sizeof(x), &x);
 }
 
+#else
+
 static void
-intel_entropysource(void * buf, size_t * nbytes)
+bootseed_init_native(__unused const struct ccdigest_info * di, __unused ccdigest_ctx_t ctx)
 {
-       size_t nwords;
-
-       /* only handle complete words */
-       assert(*nbytes % sizeof(uint64_t) == 0);
-
-       nwords = (*nbytes) / sizeof(uint64_t);
-       if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_RDSEED) {
-               nwords  = rdseed_seed(buf, nwords);
-               *nbytes = nwords * sizeof(uint64_t);
-       } else if (cpuid_features() & CPUID_FEATURE_RDRAND) {
-               nwords  = rdrand_seed(buf, nwords);
-               *nbytes = nwords * sizeof(uint64_t);
-       } else {
-               *nbytes = 0;
-       }
 }
 
-#endif /* defined(__x86_64__) */
-
-void entropy_buffer_read(void * buffer, size_t * count);
-
-typedef void (*entropysource)(void * buf, size_t * nbytes);
-
-static const entropysource entropysources[] = {
-       entropy_buffer_read,
-#if defined(__x86_64__)
-       intel_entropysource,
 #endif
-};
-
-static const size_t nsources = sizeof entropysources / sizeof entropysources[0];
-
-static size_t
-entropy_readall(void * buf, size_t nbytes_persource)
-{
-       uint8_t * buf_bytes = buf;
-       size_t i;
-       size_t nbytes_total = 0;
-
-       for (i = 0; i < nsources; i += 1) {
-               size_t nbytes = nbytes_persource;
-               entropysources[i](buf_bytes, &nbytes);
-               bzero(buf_bytes + nbytes, nbytes_persource - nbytes);
-               nbytes_total += nbytes;
-               buf_bytes += nbytes_persource;
-       }
 
-       return nbytes_total;
-}
-
-static struct {
-       struct cckprng_ctx ctx;
-       struct {
-               lck_grp_t * group;
-               lck_attr_t * attrs;
-               lck_grp_attr_t * group_attrs;
-               lck_mtx_t * mutex;
-       } lock;
-} prng;
-
-static SECURITY_READ_ONLY_LATE(prng_fns_t) prng_fns = NULL;
-
-static int
-prng_init(cckprng_ctx_t ctx, size_t nbytes, const void * seed)
+static void
+bootseed_init(void)
 {
-       int err = prng_fns->init(ctx, nbytes, seed);
-       if (err == CCKPRNG_ABORT) {
-               panic("prng_init");
-       }
-       return err;
-}
-
-#define PERMIT_WRITE_RANDOM 0
+       const struct ccdigest_info * di = &ccsha256_ltc_di;
 
-#if PERMIT_WRITE_RANDOM
-static int
-prng_reseed(cckprng_ctx_t ctx, size_t nbytes, const void * seed)
-{
-       int err = prng_fns->reseed(ctx, nbytes, seed);
-       if (err == CCKPRNG_ABORT) {
-               panic("prng_reseed");
-       }
-       return err;
-}
-#endif
+       ccdigest_di_decl(di, ctx);
+       ccdigest_init(di, ctx);
 
-static int
-prng_addentropy(cckprng_ctx_t ctx, size_t nbytes, const void * entropy)
-{
-       int err = prng_fns->addentropy(ctx, nbytes, entropy);
-       if (err == CCKPRNG_ABORT) {
-               panic("prng_addentropy");
-       }
-       return err;
-}
+       bootseed_init_bootloader(di, ctx);
+       bootseed_init_native(di, ctx);
 
-static int
-prng_generate(cckprng_ctx_t ctx, size_t nbytes, void * out)
-{
-       int err = prng_fns->generate(ctx, nbytes, out);
-       if (err == CCKPRNG_ABORT) {
-               panic("prng_generate");
-       }
-       return err;
+       ccdigest_final(di, ctx, bootseed);
+       ccdigest_di_clear(di, ctx);
 }
 
-entropy_data_t EntropyData = {.index_ptr = EntropyData.buffer};
+#define EARLY_RANDOM_STATE_STATIC_SIZE (264)
 
 static struct {
-       uint8_t seed[nsources][EARLY_RANDOM_SEED_SIZE];
-       int seedset;
-       uint8_t master_drbg_state[EARLY_RANDOM_STATE_STATIC_SIZE];
-       struct ccdrbg_state * drbg_states[MAX_CPUS];
+       uint8_t drbg_state[EARLY_RANDOM_STATE_STATIC_SIZE];
        struct ccdrbg_info drbg_info;
        const struct ccdrbg_nisthmac_custom drbg_custom;
 } erandom = {.drbg_custom = {
-                    .di         = &ccsha1_eay_di,
+                    .di         = &ccsha256_ltc_di,
                     .strictFIPS = 0,
             }};
 
 static void read_erandom(void * buf, uint32_t nbytes);
 
-void
-entropy_buffer_read(void * buffer, size_t * count)
-{
-       boolean_t current_state;
-       unsigned int i, j;
-
-       if (!erandom.seedset) {
-               panic("early_random was never invoked");
-       }
-
-       if (*count > ENTROPY_BUFFER_BYTE_SIZE) {
-               *count = ENTROPY_BUFFER_BYTE_SIZE;
-       }
-
-       current_state = ml_early_set_interrupts_enabled(FALSE);
-
-       memcpy(buffer, EntropyData.buffer, *count);
-
-       /* Consider removing this mixing step rdar://problem/31668239 */
-       for (i = 0, j = (ENTROPY_BUFFER_SIZE - 1); i < ENTROPY_BUFFER_SIZE; j = i, i++) {
-               EntropyData.buffer[i] = EntropyData.buffer[i] ^ EntropyData.buffer[j];
-       }
-
-       (void) ml_early_set_interrupts_enabled(current_state);
-
-#if DEVELOPMENT || DEBUG
-       uint32_t * word = buffer;
-       /* Good for both 32-bit and 64-bit kernels. */
-       for (i = 0; i < ENTROPY_BUFFER_SIZE; i += 4) {
-               /*
-                * We use "EARLY" here so that we can grab early entropy on
-                * ARM, where tracing is not started until after PRNG is
-                * initialized.
-                */
-               KERNEL_DEBUG_EARLY(ENTROPY_READ(i / 4), word[i + 0], word[i + 1], word[i + 2], word[i + 3]);
-       }
-#endif
-}
-
 /*
  * Return a uniformly distributed 64-bit random number.
  *
- * This interface should have minimal dependencies on kernel
- * services, and thus be available very early in the life
- * of the kernel.
- * This provides cryptographically secure randomness.
- * Each processor has its own generator instance.
- * It is seeded (lazily) with entropy provided by the Booter.
+ * This interface should have minimal dependencies on kernel services,
+ * and thus be available very early in the life of the kernel.
  *
- * For <rdar://problem/17292592> the algorithm switched from LCG to
- * NIST HMAC DBRG as follows:
- *  - When first called (on OSX this is very early while page tables are being
- *    built) early_random() calls ccdrbg_factory_hmac() to set-up a ccdbrg info
- *    structure.
- *  - The boot processor's ccdrbg state structure is a statically allocated area
- *    which is then initialized by calling the ccdbrg_init method.
- *    The initial entropy is 16 bytes of boot entropy.
- *    The nonce is the first 8 bytes of entropy xor'ed with a timestamp
- *    from ml_get_timebase().
- *    The personalization data provided is null.
- *  - The first 64-bit random value is returned on the boot processor from
- *    an invocation of the ccdbrg_generate method.
- *  - Non-boot processor's DRBG state structures are allocated dynamically
- *    from prng_init(). Each is initialized with the same 16 bytes of entropy
- *    but with a different timestamped nonce and cpu number as personalization.
- *  - Subsequent calls to early_random() pass to read_erandom() to generate
- *    an 8-byte random value.  read_erandom() ensures that pre-emption is
- *    disabled and selects the DBRG state from the current processor.
- *    The ccdbrg_generate method is called for the required random output.
- *    If this method returns CCDRBG_STATUS_NEED_RESEED, the erandom.seed buffer
- *    is re-filled with kernel-harvested entropy and the ccdbrg_reseed method is
- *    called with this new entropy. The kernel panics if a reseed fails.
+ * This provides cryptographically secure randomness contingent on the
+ * quality of the seed. It is seeded (lazily) with entropy provided by
+ * the Booter.
+ *
+ * The implementation is a NIST HMAC-SHA256 DRBG instance used as
+ * follows:
+ *
+ *  - When first called (on macOS this is very early while page tables
+ *    are being built) early_random() calls ccdrbg_factory_nisthmac() to
+ *    set up a ccdrbg info structure.
+ *
+ *  - The boot seed (64 bytes) is hashed with SHA256. Where available,
+ *    hardware RNG outputs are mixed into the seed. (See
+ *    bootseed_init.) The resulting seed is 32 bytes.
+ *
+ *  - The ccdrbg state structure is a statically allocated area which
+ *    is then initialized by calling the ccdrbg_init method. The
+ *    initial entropy is the 32-byte seed described above. The nonce
+ *    is an 8-byte timestamp from ml_get_timebase(). The
+ *    personalization data provided is a fixed string.
+ *
+ *  - 64-bit outputs are generated via read_erandom, a wrapper around
+ *    the ccdrbg_generate method. (Since "strict FIPS" is disabled,
+ *    the DRBG will never request a reseed.)
+ *
+ *  - After the kernel PRNG is initialized, read_erandom defers
+ *    generation to it via read_random_generate. (Note that this
+ *    function acquires a per-processor mutex.)
  */
 uint64_t
 early_random(void)
 {
-       uint32_t cnt = 0;
        uint64_t result;
        uint64_t nonce;
        int rc;
-       int ps;
-       struct ccdrbg_state * state;
-
-       if (!erandom.seedset) {
-               erandom.seedset = 1;
-               cnt             = PE_get_random_seed((unsigned char *)EntropyData.buffer, sizeof(EntropyData.buffer));
-
-               if (cnt < sizeof(EntropyData.buffer)) {
-                       /*
-                        * Insufficient entropy is fatal.  We must fill the
-                        * entire entropy buffer during initializaton.
-                        */
-                       panic("EntropyData needed %lu bytes, but got %u.\n", sizeof(EntropyData.buffer), cnt);
-               }
+       const char ps[] = "xnu early random";
+       static int init = 0;
 
-               entropy_readall(&erandom.seed, EARLY_RANDOM_SEED_SIZE);
+       if (init == 0) {
+               bootseed_init();
 
                /* Init DRBG for NIST HMAC */
                ccdrbg_factory_nisthmac(&erandom.drbg_info, &erandom.drbg_custom);
-               assert(erandom.drbg_info.size <= sizeof(erandom.master_drbg_state));
-               state                           = (struct ccdrbg_state *)erandom.master_drbg_state;
-               erandom.drbg_states[master_cpu] = state;
+               assert(erandom.drbg_info.size <= sizeof(erandom.drbg_state));
 
                /*
                 * Init our DRBG from the boot seed, a timestamp as nonce, and
                 * a fixed string as personalization.
                 */
-               assert(sizeof(erandom.seed) > sizeof(nonce));
+               assert(sizeof(bootseed) > sizeof(nonce));
                nonce = ml_get_timebase();
-               ps    = 0; /* boot cpu */
-               rc    = ccdrbg_init(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, sizeof(nonce), &nonce, sizeof(ps), &ps);
-               cc_clear(sizeof(nonce), &nonce);
+               rc = ccdrbg_init(&erandom.drbg_info, (struct ccdrbg_state *)erandom.drbg_state, sizeof(bootseed), bootseed, sizeof(nonce), &nonce, sizeof(ps) - 1, ps);
                if (rc != CCDRBG_STATUS_OK) {
                        panic("ccdrbg_init() returned %d", rc);
                }
 
-               /* Generate output */
-               rc = ccdrbg_generate(&erandom.drbg_info, state, sizeof(result), &result, 0, NULL);
-               if (rc != CCDRBG_STATUS_OK) {
-                       panic("ccdrbg_generate() returned %d", rc);
-               }
+               cc_clear(sizeof(nonce), &nonce);
 
-               return result;
+               init = 1;
        }
-       ;
 
-#if defined(__x86_64__)
-       /*
-        * Calling read_erandom() before gsbase is initialized is potentially
-        * catastrophic, so assert that it's not set to the magic value set
-        * in i386_init.c before proceeding with the call.  We cannot use
-        * assert here because it ultimately calls panic, which executes
-        * operations that involve accessing %gs-relative data (and additionally
-        * causes a debug trap which will not work properly this early in boot.)
-        */
-       if (rdmsr64(MSR_IA32_GS_BASE) == EARLY_GSBASE_MAGIC) {
-               kprintf("[early_random] Cannot proceed: GSBASE is not initialized\n");
-               hlt();
-               /*NOTREACHED*/
-       }
-#endif
        read_erandom(&result, sizeof(result));
 
        return result;
 }
 
 static void
-read_erandom(void * buffer, u_int numBytes)
+read_random_generate(uint8_t *buffer, u_int numbytes);
+
+static void
+read_erandom(void * buf, uint32_t nbytes)
 {
-       int cpu;
+       uint8_t * buffer_bytes = buf;
+       size_t n;
        int rc;
-       size_t nbytes;
-       struct ccdrbg_state * state;
-
-       mp_disable_preemption();
-       cpu   = cpu_number();
-       state = erandom.drbg_states[cpu];
-       assert(state);
-       for (;;) {
-               /* Generate output */
-               rc = ccdrbg_generate(&erandom.drbg_info, state, numBytes, buffer, 0, NULL);
-               if (rc == CCDRBG_STATUS_OK) {
-                       break;
-               }
-               if (rc == CCDRBG_STATUS_NEED_RESEED) {
-                       /* It's time to reseed. Get more entropy */
-                       nbytes = entropy_readall(erandom.seed, EARLY_RANDOM_SEED_SIZE);
-                       assert(nbytes >= EARLY_RANDOM_SEED_SIZE);
-                       rc = ccdrbg_reseed(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, 0, NULL);
-                       cc_clear(sizeof(erandom.seed), erandom.seed);
-                       if (rc == CCDRBG_STATUS_OK) {
-                               continue;
-                       }
-                       panic("read_erandom reseed error %d\n", rc);
-               }
-               panic("read_erandom ccdrbg error %d\n", rc);
+
+       // We defer to the kernel PRNG after it has been installed and
+       // initialized. This happens during corecrypto kext
+       // initialization.
+       if (prng_ready) {
+               read_random_generate(buf, nbytes);
+               return;
        }
-       mp_enable_preemption();
-}
 
-void
-read_frandom(void * buffer, u_int numBytes)
-{
-       uint8_t * buffer_bytes = buffer;
-       int nbytes;
+       // The DRBG request size is limited, so we break the request into
+       // chunks.
+       while (nbytes > 0) {
+               n = MIN(nbytes, PAGE_SIZE);
 
-       /*
-        * Split up into requests for blocks smaller than
-        * than the DBRG request limit. iThis limit is private but
-        * for NISTHMAC it's known to be greater then 4096.
-        */
-       while (numBytes) {
-               nbytes = MIN(numBytes, PAGE_SIZE);
-               read_erandom(buffer_bytes, nbytes);
-               buffer_bytes += nbytes;
-               numBytes -= nbytes;
+               // Since "strict FIPS" is disabled, the DRBG will never
+               // request a reseed; therefore, we panic on any error.
+               rc = ccdrbg_generate(&erandom.drbg_info, (struct ccdrbg_state *)erandom.drbg_state, n, buffer_bytes, 0, NULL);
+               if (rc != CCDRBG_STATUS_OK) {
+                       panic("read_erandom ccdrbg error %d\n", rc);
+               }
+
+               buffer_bytes += n;
+               nbytes -= n;
        }
 }
 
 void
-early_random_cpu_init(int cpu)
+read_frandom(void * buffer, u_int numBytes)
 {
-       uint64_t nonce;
-       int rc;
-       struct ccdrbg_state * state;
-
-       /*
-        * Allocate state and initialize DBRG state for early_random()
-        * for this processor.
-        */
-       assert(cpu != master_cpu);
-       assert(erandom.drbg_states[cpu] == NULL);
-
-       state = kalloc(erandom.drbg_info.size);
-       if (state == NULL) {
-               panic("prng_init kalloc failed\n");
-       }
-       erandom.drbg_states[cpu] = state;
-
-       /*
-        * Init our DBRG from boot entropy, nonce as timestamp
-        * and use the cpu number as the personalization parameter.
-        */
-       nonce = ml_get_timebase();
-       rc    = ccdrbg_init(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, sizeof(nonce), &nonce, sizeof(cpu), &cpu);
-       cc_clear(sizeof(nonce), &nonce);
-       if (rc != CCDRBG_STATUS_OK) {
-               panic("ccdrbg_init() returned %d", rc);
-       }
+       read_erandom(buffer, numBytes);
 }
 
 void
-register_and_init_prng(prng_fns_t fns)
+register_and_init_prng(struct cckprng_ctx *ctx, const struct cckprng_funcs *funcs)
 {
-       uint8_t buf[nsources][ENTROPY_BUFFER_BYTE_SIZE];
-       size_t nbytes;
-
        assert(cpu_number() == master_cpu);
-       assert(prng_fns == NULL);
+       assert(!prng_ready);
 
-       prng_fns = fns;
+       prng_ctx = ctx;
+       prng_funcs = *funcs;
 
-       /* make a mutex to control access */
-       prng.lock.group_attrs = lck_grp_attr_alloc_init();
-       prng.lock.group       = lck_grp_alloc_init("random", prng.lock.group_attrs);
-       prng.lock.attrs       = lck_attr_alloc_init();
-       prng.lock.mutex       = lck_mtx_alloc_init(prng.lock.group, prng.lock.attrs);
+       uint64_t nonce = ml_get_timebase();
+       prng_funcs.init(prng_ctx, MAX_CPUS, sizeof(EntropyData.buffer), EntropyData.buffer, &EntropyData.sample_count, sizeof(bootseed), bootseed, sizeof(nonce), &nonce);
+       prng_funcs.initgen(prng_ctx, master_cpu);
+       prng_ready = 1;
 
-       nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE);
-       (void)prng_init(&prng.ctx, nbytes, buf);
-       cc_clear(sizeof(buf), buf);
+       cc_clear(sizeof(bootseed), bootseed);
+       cc_clear(sizeof(erandom), &erandom);
 }
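
For context, a registration sketch from the caller's side (hypothetical; the corecrypto kext code is not part of this diff, and the cckprng_* entry points are assumed to be those declared in <corecrypto/cckprng.h>). The function-table members mirror how they are invoked above: init, initgen, refresh, generate and reseed.

	static struct cckprng_ctx kprng_ctx;

	static const struct cckprng_funcs kprng_funcs = {
		.init     = cckprng_init,	/* assumed corecrypto entry points */
		.initgen  = cckprng_initgen,
		.reseed   = cckprng_reseed,
		.refresh  = cckprng_refresh,
		.generate = cckprng_generate,
	};

	/* Called once on the master CPU during corecrypto kext initialization. */
	register_and_init_prng(&kprng_ctx, &kprng_funcs);
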
 
-static void
-Reseed(void)
+void
+random_cpu_init(int cpu)
 {
-       uint8_t buf[nsources][ENTROPY_BUFFER_BYTE_SIZE];
-       size_t nbytes;
+       assert(cpu != master_cpu);
 
-       lck_mtx_assert(prng.lock.mutex, LCK_MTX_ASSERT_OWNED);
+       if (!prng_ready) {
+               panic("random_cpu_init: kernel prng has not been installed");
+       }
 
-       nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE);
-       PRNG_CCKPRNG((void)prng_addentropy(&prng.ctx, nbytes, buf));
-       cc_clear(sizeof(buf), buf);
+       prng_funcs.initgen(prng_ctx, cpu);
 }
 
 /* export good random numbers to the rest of the kernel */
 void
 read_random(void * buffer, u_int numbytes)
 {
-       int err;
-
-       lck_mtx_lock(prng.lock.mutex);
+       prng_funcs.refresh(prng_ctx);
+       read_random_generate(buffer, numbytes);
+}
 
+static void
+ensure_gsbase(void)
+{
+#if defined(__x86_64__) && (DEVELOPMENT || DEBUG)
        /*
-        * Call PRNG, reseeding and retrying if requested.
+        * Calling cpu_number() before gsbase is initialized is potentially
+        * catastrophic, so assert that it's not set to the magic value set
+        * in i386_init.c before proceeding with the call.  We cannot use
+        * assert here because it ultimately calls panic, which executes
+        * operations that involve accessing %gs-relative data (and additionally
+        * causes a debug trap which will not work properly this early in boot.)
         */
-       for (;;) {
-               PRNG_CCKPRNG(err = prng_generate(&prng.ctx, numbytes, buffer));
-               if (err == CCKPRNG_OK) {
-                       break;
-               }
-               if (err == CCKPRNG_NEED_ENTROPY) {
-                       Reseed();
-                       continue;
-               }
-               panic("read_random() error %d\n", err);
+       if (rdmsr64(MSR_IA32_GS_BASE) == EARLY_GSBASE_MAGIC) {
+               kprintf("[early_random] Cannot proceed: GSBASE is not initialized\n");
+               hlt();
+               /*NOTREACHED*/
        }
+#endif
+}
+
+static void
+read_random_generate(uint8_t *buffer, u_int numbytes)
+{
+       ensure_gsbase();
+
+       while (numbytes > 0) {
+               size_t n = MIN(numbytes, CCKPRNG_GENERATE_MAX_NBYTES);
 
-       lck_mtx_unlock(prng.lock.mutex);
+               prng_funcs.generate(prng_ctx, cpu_number(), n, buffer);
+
+               buffer += n;
+               numbytes -= n;
+       }
 }
 
 int
 write_random(void * buffer, u_int numbytes)
 {
-#if PERMIT_WRITE_RANDOM
-       int err;
+       uint8_t seed[SHA256_DIGEST_LENGTH];
+       SHA256_CTX ctx;
 
-       lck_mtx_lock(prng.lock.mutex);
-       err = prng_reseed(&prng.ctx, numbytes, buffer);
-       lck_mtx_unlock(prng.lock.mutex);
+       /* hash the input to minimize the time we need to hold the lock */
+       SHA256_Init(&ctx);
+       SHA256_Update(&ctx, buffer, numbytes);
+       SHA256_Final(seed, &ctx);
+
+       prng_funcs.reseed(prng_ctx, sizeof(seed), seed);
+       cc_clear(sizeof(seed), seed);
 
-       return err ? EIO : 0;
-#else
-#pragma unused(buffer, numbytes)
        return 0;
-#endif
 }
 
 /*
@@ -620,9 +366,7 @@ void
 random_bool_init(struct bool_gen * bg)
 {
        /* Seed the random boolean generator */
-       for (int i = 0; i < RANDOM_BOOL_GEN_SEED_COUNT; i++) {
-               bg->seed[i] = (unsigned int)early_random();
-       }
+       read_frandom(bg->seed, sizeof(bg->seed));
        bg->state = 0;
        simple_lock_init(&bg->lock, 0);
 }
index a49b6c7301dc3c91d6c6e9c158cf1be81a0d48b8..61432793bbee18d7f58b15496424ffb5f8385d68 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 __BEGIN_DECLS
 
+#include <corecrypto/cckprng.h>
+
 #ifdef XNU_KERNEL_PRIVATE
 
-#define ENTROPY_BUFFER_BYTE_SIZE 64
+#define ENTROPY_BUFFER_BYTE_SIZE 32
 
 #define ENTROPY_BUFFER_SIZE (ENTROPY_BUFFER_BYTE_SIZE / sizeof(uint32_t))
 
+// This mask can be applied to EntropyData.sample_count to get an
+// index suitable for storing the next sample in
+// EntropyData.buffer. Note that ENTROPY_BUFFER_SIZE must be a power
+// of two for the following mask calculation to be valid.
+#define ENTROPY_BUFFER_INDEX_MASK (ENTROPY_BUFFER_SIZE - 1)
+
 typedef struct entropy_data {
        /*
-        * TODO: Should index_ptr be volatile?  Are we exposed to any races that
+        * TODO: Should sample_count be volatile?  Are we exposed to any races that
         * we care about if it is not?
         */
-       uint32_t * index_ptr;
+
+       // At 32 bits, this counter can overflow. Since we're primarily
+       // interested in the delta from one read to the next, we don't
+       // worry about this too much.
+       uint32_t sample_count;
        uint32_t buffer[ENTROPY_BUFFER_SIZE];
 } entropy_data_t;
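
A sketch of how the index mask is meant to be used when storing a sample (editorial snippet, not in this diff; `sample` is an invented variable):

	/* The free-running counter doubles as a ring-buffer cursor. */
	uint32_t i = EntropyData.sample_count & ENTROPY_BUFFER_INDEX_MASK;
	EntropyData.buffer[i] ^= (uint32_t)sample;
	EntropyData.sample_count += 1;
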
 
@@ -51,55 +63,12 @@ extern entropy_data_t EntropyData;
 /* Trace codes for DBG_SEC_KERNEL: */
 #define ENTROPY_READ(n) SECURITYDBG_CODE(DBG_SEC_KERNEL, n) /* n: 0 .. 3 */
 
-/*
- * Early_random implementation params: */
-#define EARLY_RANDOM_SEED_SIZE (16)
-#define EARLY_RANDOM_STATE_STATIC_SIZE (264)
+void random_cpu_init(int cpu);
 
-void early_random_cpu_init(int cpu);
-
-/*
- * Wrapper for requesting a CCKPRNG operation.
- * This macro makes the DRBG call with pre-emption disabled to ensure that
- * any attempt to block will cause a panic. And the operation is timed and
- * cannot exceed 10msec (for development kernels).
- * But skip this while we retain Yarrow.
- */
-#define YARROW 1
-#if YARROW
-#define PRNG_CCKPRNG(op) \
-       MACRO_BEGIN          \
-       op;                  \
-       MACRO_END
-#else
-#define PRNG_CCKPRNG(op)                                                      \
-       MACRO_BEGIN                                                               \
-       uint64_t start;                                                           \
-       uint64_t stop;                                                            \
-       disable_preemption();                                                     \
-       start = mach_absolute_time();                                             \
-       op;                                                                       \
-       stop = mach_absolute_time();                                              \
-       enable_preemption();                                                      \
-       assert(stop - start < 10 * NSEC_PER_MSEC || machine_timeout_suspended()); \
-       (void)start;                                                              \
-       (void)stop;                                                               \
-       MACRO_END
-#endif
 
 #endif /* XNU_KERNEL_PRIVATE */
 
-#include <corecrypto/cckprng.h>
-
-/* kernel prng */
-typedef const struct prng_fns {
-       int (*init)(cckprng_ctx_t ctx, size_t nbytes, const void * seed);
-       int (*reseed)(cckprng_ctx_t ctx, size_t nbytes, const void * seed);
-       int (*addentropy)(cckprng_ctx_t ctx, size_t nbytes, const void * entropy);
-       int (*generate)(cckprng_ctx_t ctx, size_t nbytes, void * out);
-} * prng_fns_t;
-
-void register_and_init_prng(prng_fns_t fns);
+void register_and_init_prng(struct cckprng_ctx *ctx, const struct cckprng_funcs *funcs);
 
 #include <kern/simple_lock.h>
 /* Definitions for boolean PRNG */
@@ -107,7 +76,7 @@ void register_and_init_prng(prng_fns_t fns);
 struct bool_gen {
        unsigned int seed[RANDOM_BOOL_GEN_SEED_COUNT];
        unsigned int state;
-       decl_simple_lock_data(, lock)
+       decl_simple_lock_data(, lock);
 };
 
 extern void random_bool_init(struct bool_gen * bg);
diff --git a/osfmk/profiling/Makefile b/osfmk/profiling/Makefile
deleted file mode 100644 (file)
index b7dc252..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
-export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
-export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
-export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
-
-include $(MakeInc_cmd)
-include $(MakeInc_def)
-
-INSTINC_SUBDIRS = \
-       machine
-
-INSTINC_SUBDIRS_X86_64 = \
-       x86_64
-
-INSTINC_SUBDIRS_X86_64H = \
-       x86_64
-
-INSTINC_SUBDIRS_ARM = \
-       arm
-
-INSTINC_SUBDIRS_ARM64 = \
-       arm
-
-EXPINC_SUBDIRS = \
-       machine
-
-EXPINC_SUBDIRS_ARM = \
-       arm
-
-EXPINC_SUBDIRS_ARM64 = \
-       arm
-
-EXPINC_SUBDIRS_X86_64 = \
-       x86_64
-
-EXPINC_SUBDIRS_X86_64H = \
-       x86_64
-
-DATAFILES = \
-       profile-internal.h profile-mk.h profile-kgmon.c
-
-MIGINCLUDES = \
-
-INSTALL_MI_LIST        = ${DATAFILES} ${_MIG_HDRS_} ${MIGINCLUDES}
-
-INSTALL_MI_DIR = profile
-
-EXPORT_MI_LIST = ${DATAFILES} ${_MIG_HDRS_} ${MIGINCLUDES}
-
-EXPORT_MI_DIR = profile
-
-.ORDER: ${_MIG_HDRS_} ${MIGINCLUDES}
-
-include $(MakeInc_rule)
-include $(MakeInc_dir)
diff --git a/osfmk/profiling/i386/Makefile b/osfmk/profiling/i386/Makefile
deleted file mode 100644 (file)
index 1253a00..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
-export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
-export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
-export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
-
-include $(MakeInc_cmd)
-include $(MakeInc_def)
-
-DATAFILES = \
-       profile-md.h profile-md.c profile-asm.s
-
-INSTALL_MD_LIST = ${DATAFILES}
-
-INSTALL_MD_DIR = profile/i386
-
-EXPORT_MD_LIST = ${DATAFILES}
-
-EXPORT_MD_DIR = profile/i386
-
-include $(MakeInc_rule)
-include $(MakeInc_dir)
diff --git a/osfmk/profiling/i386/profile-md.h b/osfmk/profiling/i386/profile-md.h
deleted file mode 100644 (file)
index 942a543..0000000
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * HISTORY
- *
- * Revision 1.1.1.1  1998/09/22 21:05:49  wsanchez
- * Import of Mac OS X kernel (~semeria)
- *
- * Revision 1.1.1.1  1998/03/07 02:26:08  wsanchez
- * Import of OSF Mach kernel (~mburg)
- *
- * Revision 1.1.5.2  1996/07/31  09:57:36  paire
- *      Added some more constraints to __asm__ functions for compilation
- *      under gcc2.7.1 for PROF_CNT_[L]{ADD|SUB} macros
- *      [96/06/14            paire]
- *
- * Revision 1.1.5.1  1995/01/06  19:53:52  devrcs
- *      mk6 CR668 - 1.3b26 merge
- *      new file for mk6
- *      [1994/10/12  22:25:27  dwm]
- *
- * Revision 1.1.2.2  1994/05/16  19:19:26  meissner
- *      Add {,L}PROF_CNT_{SUB,LSUB,OVERFLOW} macros for gprof command.
- *      [1994/05/10  10:36:06  meissner]
- *
- *      Correct 64-bit integer asms to specify result values as inputs, and use =g instead of =m.
- *      Cast the integer argument to PROF_CNT_ADD to unsigned long, so a short register is widened.
- *      Add more support for writing the gprof command.
- *      PROF_CNT_{EQ,NE} should not use ^=, it just uses ^.
- *      Round PROF_CNT_DIGITS up to 24 bytes so it is word aligned.
- *      _profile_cnt_to_decimal now takes the low/high values as separate arguments.
- *      Delete _profile_cnt_to_hex.
- *      [1994/04/28  21:45:07  meissner]
- *
- *      Add more 64 bit arithmetic macros to support writing gprof.
- *      [1994/04/20  15:47:05  meissner]
- *
- * Revision 1.1.2.1  1994/04/08  17:51:56  meissner
- *      Correct spelling on LPROF_CNT_TO_LDOUBLE macro.
- *      [1994/04/08  16:18:06  meissner]
- *
- *      Make LHISTCOUNTER be 64 bits.
- *      Define LPROF_CNT_INC to increment LHISTCOUNTER.
- *      [1994/04/08  12:40:32  meissner]
- *
- *      Make most stats 64 bits, except for things like memory allocation.
- *      [1994/04/02  14:58:34  meissner]
- *
- *      Add overflow support for {gprof,prof,old,dummy}_mcount counters.
- *      [1994/03/17  20:13:37  meissner]
- *
- *      Add gprof/prof overflow support
- *      [1994/03/17  14:56:56  meissner]
- *
- *      Define LHISTCOUNTER.
- *      [1994/02/28  12:05:16  meissner]
- *
- *      Set HISTFRACTION to 4, so new lprofil call takes the same space.
- *      [1994/02/24  16:15:34  meissner]
- *
- *      Add too_low/too_high to profile_stats.
- *      [1994/02/16  22:38:23  meissner]
- *
- *      Make prof_cnt_t unsigned long.
- *      [1994/02/11  16:52:09  meissner]
- *
- *      Remember function unique ptr in gfuncs structure to reset profiling.
- *      Add support for range checking gprof arc {from,self}pc addresses.
- *      Add counter for # times acontext was locked.
- *      Expand copyright.
- *      [1994/02/07  12:41:08  meissner]
- *
- *      Keep track of the number of times the kernel overflows the HISTCOUNTER counter.
- *      [1994/02/03  20:13:31  meissner]
- *
- *      Add stats for {user,kernel,idle} mode in the kernel.
- *      [1994/02/03  15:17:36  meissner]
- *
- *      No change.
- *      [1994/02/03  00:58:59  meissner]
- *
- *      Combine _profile_{vars,stats,md}; Allow more than one _profile_vars.
- *      [1994/02/01  12:04:04  meissner]
- *
- *      Split # records to # gprof and # prof records.
- *      Add my_cpu/max_cpu fields.
- *      [1994/01/28  23:33:30  meissner]
- *
- *      Eliminate hash_{size,mask} from gfuncs structure.
- *      [1994/01/26  20:23:41  meissner]
- *
- *      Add structure size fields to _profile_{vars,stats,md}.
- *      Add major/minor version number to _profile_md.
- *      Move allocation context block pointer to main structure.
- *      Delete shift count for allocation contexts.
- *      [1994/01/25  01:46:08  meissner]
- *
- *      Add HASHFRACTION
- *      [1994/01/22  01:14:02  meissner]
- *
- *      Split profile-md.h into profile-internal.h and profile-md.
- *      [1994/01/20  20:57:18  meissner]
- *
- *      Fixup copyright.
- *      [1994/01/18  23:08:14  meissner]
- *
- *      Make flags byte-sized.
- *      Add have_bb flag.
- *      Add init_format flag.
- *      [1994/01/18  21:57:18  meissner]
- *
- *      CR 10198 - Initial version.
- *      [1994/01/18  19:44:59  meissner]
- *
- * $EndLog$
- */
-
-#ifndef _PROFILE_MD_H
-#define _PROFILE_MD_H
-
-#include <types.h>
-
-/*
- * Define the interfaces between the assembly language profiling support
- * that is common between the kernel, mach servers, and user space library.
- */
-
-/*
- * Integer types used.
- */
-
-/*
- * These hold either a pointer or a signed/unsigned int.
- * They are 32 bit on i386 and 64 bit on x86_64.
- */
-typedef long            prof_ptrint_t;
-typedef unsigned long   prof_uptrint_t;
-
-typedef long            prof_lock_t;    /* lock word type */
-typedef unsigned char   prof_flag_t;    /* type for boolean flags */
-
-/*
- * Double precision counter.
- */
-
-/* These are 64 bit on both i386 and x86_64 */
-typedef unsigned long prof_cnt_t;
-
-/* x86_64 */
-#define PROF_CNT_INC(cnt)       (cnt++)
-#define PROF_CNT_ADD(cnt, val)   (cnt+=val)
-#define PROF_CNT_LADD(cnt, val)  (cnt+=val)
-#define PROF_CNT_SUB(cnt, val)   (cnt-=val)
-#define PROF_CNT_LSUB(cnt, val)  (cnt-=val)
-
-#define PROF_ULONG_TO_CNT(cnt, val)      (((cnt).high = 0), ((cnt).low = val))
-#define PROF_CNT_OVERFLOW(cnt, high, low) (((high) = (cnt).high), ((low) = (cnt).low))
-#define PROF_CNT_TO_ULONG(cnt)          (((cnt).high == 0) ? (cnt).low : 0xffffffffu)
-#define PROF_CNT_TO_LDOUBLE(cnt)        ((((long double)(cnt).high) * 4294967296.0L) + (long double)(cnt).low)
-#define PROF_CNT_TO_DECIMAL(buf, cnt)    _profile_cnt_to_decimal(buf, (cnt).low, (cnt).high)
-#define PROF_CNT_EQ_0(cnt)              (((cnt).high | (cnt).low) == 0)
-#define PROF_CNT_NE_0(cnt)              (((cnt).high | (cnt).low) != 0)
-#define PROF_CNT_EQ(cnt1, cnt2)          ((((cnt1).high ^ (cnt2).high) | ((cnt1).low ^ (cnt2).low)) == 0)
-#define PROF_CNT_NE(cnt1, cnt2)          ((((cnt1).high ^ (cnt2).high) | ((cnt1).low ^ (cnt2).low)) != 0)
-#define PROF_CNT_GT(cnt1, cnt2)          (((cnt1).high > (cnt2).high) || ((cnt1).low > (cnt2).low))
-#define PROF_CNT_LT(cnt1, cnt2)          (((cnt1).high < (cnt2).high) || ((cnt1).low < (cnt2).low))
-
-/* max # digits + null to hold prof_cnt_t values (round up to multiple of 4) */
-#define PROF_CNT_DIGITS                 24
-
-/*
- * Types of the profil counter.
- */
-
-typedef unsigned short  HISTCOUNTER;            /* profil */
-typedef prof_cnt_t      LHISTCOUNTER;           /* lprofil */
-
-#define LPROF_ULONG_TO_CNT(cnt, val)     PROF_ULONG_TO_CNT(cnt,val)
-#define LPROF_CNT_INC(lp)               PROF_CNT_INC(lp)
-#define LPROF_CNT_ADD(lp, val)           PROF_CNT_ADD(lp,val)
-#define LPROF_CNT_LADD(lp, val)          PROF_CNT_LADD(lp,val)
-#define LPROF_CNT_SUB(lp, val)           PROF_CNT_SUB(lp,val)
-#define LPROF_CNT_LSUB(lp, val)          PROF_CNT_LSUB(lp,val)
-#define LPROF_CNT_OVERFLOW(lp, high, low) PROF_CNT_OVERFLOW(lp,high,low)
-#define LPROF_CNT_TO_ULONG(lp)          PROF_CNT_TO_ULONG(lp)
-#define LPROF_CNT_TO_LDOUBLE(lp)        PROF_CNT_TO_LDOUBLE(lp)
-#define LPROF_CNT_TO_DECIMAL(buf, cnt)   PROF_CNT_TO_DECIMAL(buf,cnt)
-#define LPROF_CNT_EQ_0(cnt)             PROF_CNT_EQ_0(cnt)
-#define LPROF_CNT_NE_0(cnt)             PROF_CNT_NE_0(cnt)
-#define LPROF_CNT_EQ(cnt1, cnt2)         PROF_CNT_EQ(cnt1,cnt2)
-#define LPROF_CNT_NE(cnt1, cnt2)         PROF_CNT_NE(cnt1,cnt2)
-#define LPROF_CNT_GT(cnt1, cnt2)         PROF_CNT_GT(cnt1,cnt2)
-#define LPROF_CNT_LT(cnt1, cnt2)         PROF_CNT_LT(cnt1,cnt2)
-#define LPROF_CNT_DIGITS                PROF_CNT_DIGITS
-
-/*
- *  fraction of text space to allocate for histogram counters
- */
-
-#define HISTFRACTION    4
-
-/*
- * Fraction of text space to allocate for from hash buckets.
- */
-
-#define HASHFRACTION    HISTFRACTION
-
-/*
- * Prof call count, external format.
- */
-
-struct prof_ext {
-       prof_uptrint_t  cvalue;         /* caller address */
-       prof_uptrint_t  cncall;         /* # of calls */
-};
-
-/*
- * Prof call count, internal format.
- */
-
-struct prof_int {
-       struct prof_ext prof;           /* external prof struct */
-       prof_uptrint_t  overflow;       /* # times prof counter overflowed */
-};
-
-/*
- * Gprof arc, external format.
- */
-
-struct gprof_arc {
-       prof_uptrint_t   frompc;        /* caller's caller */
-       prof_uptrint_t   selfpc;        /* caller's address */
-       prof_uptrint_t   count;         /* # times arc traversed */
-};
-
-/*
- * Gprof arc, internal format.
- */
-
-struct hasharc {
-       struct hasharc  *next;          /* next gprof record */
-       struct gprof_arc arc;           /* gprof record */
-       prof_uptrint_t   overflow;      /* # times counter overflowed */
-};
-
-/*
- * Linked list of all function profile blocks.
- */
-
-#define MAX_CACHE       3               /* # cache table entries */
-
-struct gfuncs {
-       struct hasharc **hash_ptr;              /* gprof hash table */
-       struct hasharc **unique_ptr;            /* function unique pointer */
-       struct prof_int prof;                   /* -p stats for elf */
-       struct hasharc *cache_ptr[MAX_CACHE];   /* cache element pointers */
-};
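
Each hasharc chains a gprof record off a hash bucket keyed by the arc's caller/callee pair, and gfuncs additionally keeps a small MAX_CACHE array of recently hit entries. A minimal sketch of the chained lookup; the hash function and table size here are assumptions, not taken from the deleted sources:

#include <stddef.h>

#define HASH_SIZE 1024 /* assumed table size */

struct arc_rec  { unsigned long frompc, selfpc, count; };
struct arc_node { struct arc_node *next; struct arc_rec arc; };

/* Assumed hash: mix the two pc values into a bucket index. */
static unsigned long
arc_hash(unsigned long frompc, unsigned long selfpc)
{
	return (frompc ^ (selfpc >> 4)) % HASH_SIZE;
}

/* Walk one bucket's chain looking for an existing from/self pair. */
static struct arc_node *
arc_lookup(struct arc_node **table, unsigned long frompc, unsigned long selfpc)
{
	struct arc_node *p = table[arc_hash(frompc, selfpc)];

	for (; p != NULL; p = p->next) {
		if (p->arc.frompc == frompc && p->arc.selfpc == selfpc) {
			return p; /* hit: caller bumps p->arc.count */
		}
	}
	return NULL; /* miss: caller allocates and links a new record */
}
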
-
-/*
- * Profile information which might be written out in ELF {,g}mon.out files.
- */
-
-#define MAX_BUCKETS 9                   /* max bucket chain to print out */
-
-struct profile_stats {                  /* Debugging counters */
-       prof_uptrint_t major_version;   /* major version number */
-       prof_uptrint_t minor_version;   /* minor version number */
-       prof_uptrint_t stats_size;      /* size of profile_vars structure */
-       prof_uptrint_t profil_buckets;  /* # profil buckets */
-       prof_uptrint_t my_cpu;          /* identify current cpu/thread */
-       prof_uptrint_t max_cpu;         /* identify max cpu/thread */
-       prof_uptrint_t prof_records;    /* # of functions profiled */
-       prof_uptrint_t gprof_records;   /* # of gprof arcs */
-       prof_uptrint_t hash_buckets;    /* # gprof hash buckets */
-       prof_uptrint_t bogus_count;     /* # of bogus functions found in gprof */
-
-       prof_cnt_t cnt;                 /* # of calls to _{,g}prof_mcount */
-       prof_cnt_t dummy;               /* # of calls to _dummy_mcount */
-       prof_cnt_t old_mcount;          /* # of calls to old mcount */
-       prof_cnt_t hash_search;         /* # hash buckets searched */
-       prof_cnt_t hash_num;            /* # times hash table searched */
-       prof_cnt_t user_ticks;          /* # ticks in user space */
-       prof_cnt_t kernel_ticks;        /* # ticks in kernel space */
-       prof_cnt_t idle_ticks;          /* # ticks in idle mode */
-       prof_cnt_t overflow_ticks;      /* # ticks where HISTCOUNTER overflowed */
-       prof_cnt_t acontext_locked;     /* # times an acontext was locked */
-       prof_cnt_t too_low;             /* # times a histogram tick was too low */
-       prof_cnt_t too_high;            /* # times a histogram tick was too high */
-       prof_cnt_t prof_overflow;       /* # times a prof count field overflowed */
-       prof_cnt_t gprof_overflow;      /* # times a gprof count field overflowed */
-
-       /* allocation statistics */
-       prof_uptrint_t num_alloc[(int)ACONTEXT_MAX];    /* # allocations */
-       prof_uptrint_t bytes_alloc[(int)ACONTEXT_MAX];  /* bytes allocated */
-       prof_uptrint_t num_context[(int)ACONTEXT_MAX];  /* # contexts */
-       prof_uptrint_t wasted[(int)ACONTEXT_MAX];       /* wasted bytes */
-       prof_uptrint_t overhead[(int)ACONTEXT_MAX];     /* overhead bytes */
-
-       prof_uptrint_t buckets[MAX_BUCKETS + 1]; /* # hash indexes that have n buckets */
-       prof_cnt_t     cache_hits[MAX_CACHE];  /* # times nth cache entry matched */
-
-       prof_cnt_t stats_unused[64];    /* reserved for future use */
-};
-
-#define PROFILE_MAJOR_VERSION 1
-#define PROFILE_MINOR_VERSION 1
-
-/*
- * Machine dependent fields.
- */
-
-struct profile_md {
-       int major_version;              /* major version number */
-       int minor_version;              /* minor version number */
-       size_t md_size;                 /* size of profile_md structure */
-       struct hasharc **hash_ptr;      /* gprof hash table */
-       size_t hash_size;               /* size of hash table */
-       prof_uptrint_t num_cache;       /* # of cache entries */
-       void (*save_mcount_ptr)(void);  /* save for _mcount_ptr */
-       void(**mcount_ptr_ptr)(void);   /* pointer to _mcount_ptr */
-       struct hasharc *dummy_ptr;      /* pointer to dummy gprof record */
-       void *(*alloc_pages)(size_t);   /* pointer to _profile_alloc_pages */
-       char num_buffer[PROF_CNT_DIGITS]; /* convert 64 bit ints to string */
-       long md_unused[58];             /* add unused fields */
-};
-
-/*
- * Record information about each function call.  Specify
- * caller, caller's caller, and a unique label for use by
- * the profiling routines.
- */
-extern void _prof_mcount(void);
-extern void _gprof_mcount(void);
-extern void _dummy_mcount(void);
-extern void (*_mcount_ptr)(void);
-
-/*
- * Function in profile-md.c to convert prof_cnt_t to string format (decimal & hex).
- */
-extern char *_profile_cnt_to_decimal(char *, prof_uptrint_t, prof_uptrint_t);
-
-#endif /* _PROFILE_MD_H */
diff --git a/osfmk/profiling/machine/Makefile b/osfmk/profiling/machine/Makefile
deleted file mode 100644 (file)
index 3ee9858..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
-export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
-export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
-export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
-
-include $(MakeInc_cmd)
-include $(MakeInc_def)
-
-DATAFILES = \
-       profile-md.h
-
-INSTALL_MI_LIST = ${DATAFILES}
-
-INSTALL_MI_DIR = profile/machine
-
-EXPORT_MI_LIST = ${DATAFILES}
-
-EXPORT_MI_DIR = profile/machine
-
-include $(MakeInc_rule)
-include $(MakeInc_dir)
diff --git a/osfmk/profiling/machine/profile-md.h b/osfmk/profiling/machine/profile-md.h
deleted file mode 100644 (file)
index 11a08f9..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-#ifndef _MACH_MACHINE_PROFILE_MD_H_
-#define _MACH_MACHINE_PROFILE_MD_H_
-
-#if defined (__i386__) || defined (__x86_64__)
-#include "profiling/i386/profile-md.h"
-#elif defined (__arm__) || defined (__arm64__)
-#include "profiling/arm/profile-md.h"
-#else
-#error architecture not supported
-#endif
-
-#endif /* _MACH_MACHINE_PROFILE_MD_H_ */
diff --git a/osfmk/profiling/profile-internal.h b/osfmk/profiling/profile-internal.h
deleted file mode 100644 (file)
index 8f6cdfe..0000000
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * Define the internal interfaces between the profiling support that is
- * common between the kernel, mach servers, and user space library.
- */
-
-#ifndef _PROFILE_INTERNAL_H
-#define _PROFILE_INTERNAL_H
-
-/*
- * Allow us not to require stdio.h in kernel/server space, but
- * use it in user space.
- */
-
-#if !defined(MACH_KERNEL) && !defined(_KERNEL)
-#include <stdio.h>
-#endif
-
-/*
- * Scaling factor for the profil system call.
- */
-
-#define SCALE_1_TO_1    0x10000L
-
-
-/*
- * Forward reference to structures used.
- */
-
-struct profile_vars;
-struct profile_stats;
-struct profile_md;
-struct profile_dci;
-struct profile_profil;
-struct callback;
-struct gprof_arc;
-struct prof_ext;
-
-/*
- * Profiling type
- */
-
-typedef enum profile_type {
-       PROFILE_NONE,
-       PROFILE_GPROF,
-       PROFILE_PROF
-} profile_type_t;
-
-/*
- * Whether to allocate memory in _profile_md_init.
- */
-
-typedef enum profile_alloc_mem {
-       PROFILE_ALLOC_MEM_NO,
-       PROFILE_ALLOC_MEM_YES
-} profile_alloc_mem_t;
-
-/*
- * Allocation context block types.
- */
-
-typedef enum acontext_type {
-       ACONTEXT_PROF,                  /* 0: prof records */
-       ACONTEXT_GPROF,                 /* 1: gprof arcs */
-       ACONTEXT_GFUNC,                 /* 2: gprof function headers */
-       ACONTEXT_MISC,                  /* 3: misc. allocations */
-       ACONTEXT_PROFIL,                /* 4: profil based allocations */
-       ACONTEXT_DCI,                   /* 5: dci based allocations */
-       ACONTEXT_BASIC_BLOCK,           /* 6: basic block allocations */
-       ACONTEXT_CALLBACK,              /* 7: callback structures */
-       ACONTEXT_MAX = 32               /* # allocation contexts */
-} acontext_type_t;
-
-#define ACONTEXT_FIRST ACONTEXT_PROF
-
-#define ACONTEXT_NAMES {                                                \
-                "prof",                                                \
-                "gprof",                                               \
-                "gfunc",                                               \
-                "misc",                                                \
-                "profil",                                              \
-                "dci",                                                 \
-                "bb",                                                  \
-                "callback",                                            \
-                "#8",                                                  \
-                "#9",                                                  \
-                "#10",                                                 \
-                "#11",                                                 \
-                "#12",                                                 \
-                "#13",                                                 \
-                "#14",                                                 \
-                "#15",                                                 \
-                "#16",                                                 \
-                "#17",                                                 \
-                "#18",                                                 \
-                "#19",                                                 \
-                "#20",                                                 \
-                "#21",                                                 \
-                "#22",                                                 \
-                "#23",                                                 \
-                "#24",                                                 \
-                "#25",                                                 \
-                "#26",                                                 \
-                "#27",                                                 \
-                "#28",                                                 \
-                "#29",                                                 \
-                "#30",                                                 \
-                "#31",                                                 \
-        }
-
-/*
- * Kgmon control codes
- */
-
-typedef enum kgmon_control {
-       KGMON_UNUSED,                   /* insure no 0 is ever used */
-       KGMON_GET_STATUS,               /* return whether or not profiling is active */
-       KGMON_GET_PROFILE_VARS,         /* return the _profile_vars structure */
-       KGMON_GET_PROFILE_STATS,        /* return the _profile_stats structure */
-       KGMON_GET_DEBUG,                /* return whether or not debugging is on */
-
-       KGMON_SET_PROFILE_ON    = 50,   /* turn on profiling */
-       KGMON_SET_PROFILE_OFF,          /* turn off profiling */
-       KGMON_SET_PROFILE_RESET,        /* reset profiling tables */
-       KGMON_SET_DEBUG_ON,             /* turn on debugging */
-       KGMON_SET_DEBUG_OFF             /* turn off debugging */
-} kgmon_control_t;
-
-#define KGMON_GET_MIN   KGMON_GET_STATUS
-#define KGMON_GET_MAX   KGMON_GET_DEBUG
-#define KGMON_SET_MIN   KGMON_SET_PROFILE_ON
-#define KGMON_SET_MAX   KGMON_SET_DEBUG_OFF
-
-#define ENCODE_KGMON(num, control, cpu_thread)                          \
-  ((num) = ((cpu_thread) << 8) | (control))
-
-#define DECODE_KGMON(num, control, cpu_thread)                          \
-do {                                                                    \
-       control = (num) & 0xff;                                         \
-       cpu_thread = (num) >> 8;                                        \
-} while (0)
-
-#define LEGAL_KGMON(num) (((unsigned long)(num)) <= 0xffff)
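
ENCODE_KGMON packs the cpu/thread number into the bits above the low byte and the control code into the low byte; DECODE_KGMON reverses it, and LEGAL_KGMON bounds the whole value to 16 bits. A quick worked example using the same macro definitions:

#include <stdio.h>

#define ENCODE_KGMON(num, control, cpu_thread) \
	((num) = ((cpu_thread) << 8) | (control))

#define DECODE_KGMON(num, control, cpu_thread) \
do {                                           \
	control = (num) & 0xff;                \
	cpu_thread = (num) >> 8;               \
} while (0)

int
main(void)
{
	long num;
	int control, cpu;

	ENCODE_KGMON(num, 50, 3); /* KGMON_SET_PROFILE_ON (50) on cpu 3 */
	DECODE_KGMON(num, control, cpu);

	printf("num=0x%lx control=%d cpu=%d\n", num, control, cpu); /* 0x332 50 3 */
	return 0;
}
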
-
-/*
- * Pull in all of the machine dependent types now after defining the enums.
- */
-
-#include <profiling/machine/profile-md.h>
-
-/*
- *  general rounding functions.
- */
-
-#define ROUNDDOWN(x, y)  (((x)/(y))*(y))
-#define ROUNDUP(x, y)    ((((x)+(y)-1)/(y))*(y))
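
Both helpers rely on truncating integer division, so they assume positive integral operands and a nonzero y. A quick self-contained check of the arithmetic:

#include <stdio.h>

#define ROUNDDOWN(x, y)  (((x)/(y))*(y))
#define ROUNDUP(x, y)    ((((x)+(y)-1)/(y))*(y))

int
main(void)
{
	/* 4100 rounded to a 4096-byte boundary, down and up. */
	printf("%d %d\n", ROUNDDOWN(4100, 4096), ROUNDUP(4100, 4096)); /* 4096 8192 */
	return 0;
}
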
-
-/*
- * Linked list of pages allocated for a particular allocation context block.
- */
-
-struct page_list {
-       void *first;                    /* pointer to first byte available */
-       void *ptr;                      /* pointer to next available byte */
-       struct page_list *next;         /* next page allocated */
-       size_t bytes_free;              /* # bytes available */
-       size_t bytes_allocated;         /* # bytes allocated so far */
-       size_t num_allocations;         /* # of allocations */
-};
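
page_list describes a simple bump allocator: ptr marks the next free byte and bytes_free what remains in the current run. A hedged sketch of one allocation step over this structure (the real _profile_alloc also handles page refill and the acontext lock, both omitted here):

#include <stddef.h>

struct page_run {
	void *first;            /* first byte of the run */
	void *ptr;              /* next available byte */
	struct page_run *next;  /* next run */
	size_t bytes_free;      /* bytes left in this run */
	size_t bytes_allocated; /* bytes handed out so far */
	size_t num_allocations; /* allocation count */
};

/* Carve size bytes from the current run, or fail if it does not fit. */
static void *
run_alloc(struct page_run *pl, size_t size)
{
	void *p;

	if (pl == NULL || pl->bytes_free < size) {
		return NULL; /* real code would grab a fresh page run here */
	}

	p = pl->ptr;
	pl->ptr = (char *)pl->ptr + size;
	pl->bytes_free -= size;
	pl->bytes_allocated += size;
	pl->num_allocations++;
	return p;
}
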
-
-/*
- * Allocation context block.
- */
-
-struct alloc_context {
-       struct alloc_context *next;     /* next allocation context block */
-       struct page_list *plist;        /* head of page list */
-       prof_lock_t lock;               /* lock field available to asm */
-};
-
-
-/*
- * Callback structure that records information for one record in the
- * profiling output.
- */
-
-#define STR_MAX 32
-
-struct callback {
-       void    *sec_ptr;               /* callback user data */
-                                       /* callback function */
-       size_t (*callback)(struct profile_vars *, struct callback *);
-       long     sec_val1;              /* section specific value */
-       long     sec_val2;              /* section specific value */
-       size_t   sec_recsize;           /* record size */
-       size_t   sec_length;            /* total length */
-       char     sec_name[STR_MAX];     /* section name */
-};
-
-/*
- * Basic profil information (except for the profil buffer).
- */
-
-struct profile_profil {
-       prof_uptrint_t lowpc;           /* lowest address */
-       prof_uptrint_t highpc;          /* highest address */
-       size_t text_len;                /* highpc-lowpc */
-       size_t profil_len;              /* length of the profil buffer */
-       size_t counter_size;            /* size of individual counters (HISTCOUNTER) */
-       unsigned long scale;            /* scaling factor (65536 / scale) */
-       unsigned long profil_unused[8]; /* currently unused */
-};
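
The scale field follows the classic profil(2) convention, a 16.16 fixed-point fraction, and kmstartup() below sets it to 0x10000 / HISTFRACTION. A sketch of the pc-to-bucket mapping those fields imply; the formula is assumed from the standard profil behavior, not taken from the deleted sources:

#include <stdio.h>

#define HISTFRACTION 4

/* Assumed mapping: one bucket per HISTFRACTION bytes of text, with the
 * 16.16 scale folded back out of the arithmetic. */
static unsigned long
pc_to_bucket(unsigned long pc, unsigned long lowpc)
{
	unsigned long scale = 0x10000 / HISTFRACTION;

	return ((pc - lowpc) * scale) >> 16; /* == (pc - lowpc) / HISTFRACTION */
}

int
main(void)
{
	unsigned long lowpc = 0x1000; /* assumed rounded text start */

	printf("%lu\n", pc_to_bucket(0x1023, lowpc)); /* 0x23 / 4 = 8 */
	return 0;
}
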
-
-/*
- * Profiling internal variables.  This structure is intended to be machine independent.
- */
-
-struct profile_vars {
-       int major_version;              /* major version number */
-       int minor_version;              /* minor version number */
-       size_t vars_size;               /* size of profile_vars structure */
-       size_t plist_size;              /* size of page_list structure */
-       size_t acontext_size;           /* size of allocation context struct */
-       size_t callback_size;           /* size of callback structure */
-       profile_type_t type;            /* profile type */
-       const char *error_msg;          /* error message for perror */
-       const char *filename;           /* filename to write to */
-       char *str_ptr;                  /* string table */
-
-#if !defined(MACH_KERNEL) && !defined(_KERNEL)
-       FILE *stream;                   /* stdio stream to write to */
-       FILE *diag_stream;              /* stdio stream to write diagnostics to */
-                                       /* function to write out some bytes */
-       size_t (*fwrite_func)(const void *, size_t, size_t, FILE *);
-#else
-       void *stream;                   /* pointer passed to fwrite_func */
-       void *diag_stream;              /* stdio stream to write diagnostics to */
-                                       /* function to write out some bytes */
-       size_t (*fwrite_func)(const void *, size_t, size_t, void *);
-#endif
-
-       size_t page_size;               /* machine pagesize */
-       size_t str_bytes;               /* # bytes in string table */
-       size_t str_total;               /* # bytes allocated total for string table */
-       long clock_ticks;               /* # clock ticks per second */
-
-       /* profil related variables */
-       struct profile_profil profil_info; /* profil information */
-       HISTCOUNTER *profil_buf;        /* profil buffer */
-
-       /* Profiling output selection */
-       void (*output_init)(struct profile_vars *);     /* output init function */
-       void (*output)(struct profile_vars *);          /* output function */
-       void *output_ptr;                               /* output specific info */
-
-       /* allocation contexts */
-       struct alloc_context *acontext[(int)ACONTEXT_MAX];
-
-       void (*bogus_func)(void);       /* Function to use if address out of bounds */
-       prof_uptrint_t vars_unused[63]; /* future growth */
-
-       /* Various flags */
-       prof_flag_t init;               /* != 0 if initialized */
-       prof_flag_t active;             /* != 0 if profiling is active */
-       prof_flag_t do_profile;         /* != 0 if profiling is being done */
-       prof_flag_t use_dci;            /* != 0 if using DCI */
-
-       prof_flag_t use_profil;         /* != 0 if using profil */
-       prof_flag_t recursive_alloc;    /* != 0 if alloc taking place */
-       prof_flag_t output_uarea;       /* != 0 if output the uarea */
-       prof_flag_t output_stats;       /* != 0 if output the stats */
-
-       prof_flag_t output_clock;       /* != 0 if output the clock ticks */
-       prof_flag_t multiple_sections;  /* != 0 if output allows multiple sections */
-       prof_flag_t have_bb;            /* != 0 if we have basic block data */
-       prof_flag_t init_format;        /* != 0 if output format has been chosen */
-
-       prof_flag_t debug;              /* != 0 if debugging */
-       prof_flag_t check_funcs;        /* != 0 if check gprof arcs for being in range */
-       prof_flag_t flag_unused[62];    /* space for more flags */
-
-       struct profile_stats stats;     /* profiling statistics */
-       struct profile_md md;           /* machine dependent info */
-};
-
-/*
- * Profiling static data.
- */
-
-extern struct profile_vars  _profile_vars;
-
-/*
- * Functions called by the machine dependent routines, and provided by
- * specific routines to the kernel, server, and user space library.
- */
-
-#if (__GNUC__ < 2) || (__GNUC__ == 2 && __GNUC_MINOR__ < 5) || defined(lint)
-#define __attribute__(arg)
-#endif
-
-#if defined(_KERNEL) || defined(MACH_KERNEL)
-#define _profile_printf printf
-#else
-extern int _profile_printf(const char *, ...) __attribute__((format(printf, 1, 2)));
-#endif
-
-extern void *_profile_alloc_pages(size_t);
-extern void _profile_free_pages(void *, size_t);
-extern void _profile_error(struct profile_vars *);
-
-/*
- * Functions provided by the machine dependent files.
- */
-
-extern void _profile_md_init(struct profile_vars *, profile_type_t, profile_alloc_mem_t);
-extern int _profile_md_start(void);
-extern int _profile_md_stop(void);
-extern void *_profile_alloc(struct profile_vars *, size_t, acontext_type_t);
-extern size_t _gprof_write(struct profile_vars *, struct callback *);
-extern size_t _prof_write(struct profile_vars *, struct callback *);
-extern void _profile_update_stats(struct profile_vars *);
-extern void _profile_reset(struct profile_vars *);
-
-#if !defined(_KERNEL) && !defined(MACH_KERNEL)
-extern void _profile_print_stats(FILE *, const struct profile_stats *, const struct profile_profil *);
-extern void _profile_merge_stats(struct profile_stats *, const struct profile_stats *);
-#else
-
-/*
- * Functions defined in profile-kgmon.c
- */
-
-extern long _profile_kgmon(int,
-    size_t,
-    long,
-    int,
-    void **,
-    void (*)(kgmon_control_t));
-#ifdef _KERNEL
-extern void kgmon_server_control(kgmon_control_t);
-
-#endif /* _KERNEL */
-#endif /* _KERNEL or MACH_KERNEL */
-
-#endif /* _PROFILE_INTERNAL_H */
diff --git a/osfmk/profiling/profile-kgmon.c b/osfmk/profiling/profile-kgmon.c
deleted file mode 100644 (file)
index c29f610..0000000
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * HISTORY
- *
- * Revision 1.1.1.1  1998/09/22 21:05:49  wsanchez
- * Import of Mac OS X kernel (~semeria)
- *
- * Revision 1.1.1.1  1998/03/07 02:26:08  wsanchez
- * Import of OSF Mach kernel (~mburg)
- *
- * Revision 1.1.5.1  1995/01/06  19:54:04  devrcs
- *      mk6 CR668 - 1.3b26 merge
- *      new file for mk6
- *      [1994/10/12  22:25:34  dwm]
- *
- * Revision 1.1.2.1  1994/04/08  17:52:05  meissner
- *      Add callback function to _profile_kgmon.
- *      [1994/02/16  22:38:31  meissner]
- *
- *      _profile_kgmon now returns pointer to area, doesn't do move itself.
- *      [1994/02/11  16:52:17  meissner]
- *
- *      Move all printfs into if (pv->debug) { ... } blocks.
- *      Add debug printfs protected by if (pv->debug) for all error conditions.
- *      Add code to reset profiling information.
- *      Add code to get/set debug flag.
- *      Expand copyright.
- *      [1994/02/07  12:41:14  meissner]
- *
- *      Add support to copy arbitrary regions.
- *      Delete several of the KGMON_GET commands, now that arb. regions are supported.
- *      Explicitly call _profile_update_stats before dumping vars or stats.
- *      [1994/02/03  00:59:05  meissner]
- *
- *      Combine _profile_{vars,stats,md}; Allow more than one _profile_vars.
- *      [1994/02/01  12:04:09  meissner]
- *
- *      CR 10198 - Initial version.
- *      [1994/01/28  23:33:37  meissner]
- *
- * $EndLog$
- */
-
-#include <profiling/profile-internal.h>
-
-#ifdef MACH_KERNEL
-#include <profiling/machine/profile-md.h>
-#endif
-
-#ifndef PROFILE_VARS
-#define PROFILE_VARS(cpu) (&_profile_vars)
-#endif
-
-/*
- * Kgmon interface.  This returns the count of bytes moved if everything was ok,
- * or -1 if there were errors.
- */
-
-long
-_profile_kgmon(int write,
-    size_t count,
-    long indx,
-    int max_cpus,
-    void **p_ptr,
-    void (*control_func)(kgmon_control_t))
-{
-       kgmon_control_t kgmon;
-       int cpu;
-       int error = 0;
-       int i;
-       struct profile_vars *pv;
-       static struct callback dummy_callback;
-
-       *p_ptr = (void *)0;
-
-       /*
-        * If the number passed is not within bounds, just copy the data directly.
-        */
-
-       if (!LEGAL_KGMON(indx)) {
-               *p_ptr = (void *)indx;
-               if (!write) {
-                       if (PROFILE_VARS(0)->debug) {
-                               printf("_profile_kgmon: copy %5ld bytes, from 0x%lx\n",
-                                   (long)count,
-                                   (long)indx);
-                       }
-               } else {
-                       if (PROFILE_VARS(0)->debug) {
-                               printf("_profile_kgmon: copy %5ld bytes, to 0x%lx\n",
-                                   (long)count,
-                                   (long)indx);
-                       }
-               }
-
-               return count;
-       }
-
-       /*
-        * Decode the record number into the component pieces.
-        */
-
-       DECODE_KGMON(indx, kgmon, cpu);
-
-       if (PROFILE_VARS(0)->debug) {
-               printf("_profile_kgmon: start: kgmon control = %2d, cpu = %d, count = %ld\n",
-                   kgmon, cpu, (long)count);
-       }
-
-       /* Validate the CPU number */
-       if (cpu < 0 || cpu >= max_cpus) {
-               if (PROFILE_VARS(0)->debug) {
-                       printf("KGMON, bad cpu %d\n", cpu);
-               }
-
-               return -1;
-       } else {
-               pv = PROFILE_VARS(cpu);
-
-               if (!write) {
-                       switch (kgmon) {
-                       default:
-                               if (PROFILE_VARS(0)->debug) {
-                                       printf("Unknown KGMON read command\n");
-                               }
-
-                               error = -1;
-                               break;
-
-                       case KGMON_GET_STATUS:          /* return whether or not profiling is active */
-                               if (cpu != 0) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_GET_STATUS: cpu = %d\n", cpu);
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               if (count != sizeof(pv->active)) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_GET_STATUS: count = %ld, should be %ld\n",
-                                                   (long)count,
-                                                   (long)sizeof(pv->active));
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               *p_ptr = (void *)&pv->active;
-                               break;
-
-                       case KGMON_GET_DEBUG:           /* return whether or not debugging is active */
-                               if (cpu != 0) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_GET_DEBUG: cpu = %d\n", cpu);
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               if (count != sizeof(pv->debug)) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_GET_DEBUG: count = %ld, should be %ld\n",
-                                                   (long)count,
-                                                   (long)sizeof(pv->debug));
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               *p_ptr = (void *)&pv->debug;
-                               break;
-
-                       case KGMON_GET_PROFILE_VARS:    /* return the _profile_vars structure */
-                               if (count != sizeof(struct profile_vars)) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_GET_PROFILE_VARS: count = %ld, should be %ld\n",
-                                                   (long)count,
-                                                   (long)sizeof(struct profile_vars));
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               _profile_update_stats(pv);
-                               *p_ptr = (void *)pv;
-                               break;
-
-                       case KGMON_GET_PROFILE_STATS:   /* return the _profile_stats structure */
-                               if (count != sizeof(struct profile_stats)) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_GET_PROFILE_STATS: count = %ld, should be = %ld\n",
-                                                   (long)count,
-                                                   (long)sizeof(struct profile_stats));
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               _profile_update_stats(pv);
-                               *p_ptr = (void *)&pv->stats;
-                               break;
-                       }
-               } else {
-                       switch (kgmon) {
-                       default:
-                               if (PROFILE_VARS(0)->debug) {
-                                       printf("Unknown KGMON write command\n");
-                               }
-
-                               error = -1;
-                               break;
-
-                       case KGMON_SET_PROFILE_ON:      /* turn on profiling */
-                               if (cpu != 0) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_SET_PROFILE_ON, cpu = %d\n", cpu);
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               if (!PROFILE_VARS(0)->active) {
-                                       for (i = 0; i < max_cpus; i++) {
-                                               PROFILE_VARS(i)->active = 1;
-                                       }
-
-                                       if (control_func) {
-                                               (*control_func)(kgmon);
-                                       }
-
-                                       _profile_md_start();
-                               }
-
-                               count = 0;
-                               break;
-
-                       case KGMON_SET_PROFILE_OFF:     /* turn off profiling */
-                               if (cpu != 0) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_SET_PROFILE_OFF, cpu = %d\n", cpu);
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               if (PROFILE_VARS(0)->active) {
-                                       for (i = 0; i < max_cpus; i++) {
-                                               PROFILE_VARS(i)->active = 0;
-                                       }
-
-                                       _profile_md_stop();
-
-                                       if (control_func) {
-                                               (*control_func)(kgmon);
-                                       }
-                               }
-
-                               count = 0;
-                               break;
-
-                       case KGMON_SET_PROFILE_RESET:   /* reset profiling */
-                               if (cpu != 0) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_SET_PROFILE_RESET, cpu = %d\n", cpu);
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               for (i = 0; i < max_cpus; i++) {
-                                       _profile_reset(PROFILE_VARS(i));
-                               }
-
-                               if (control_func) {
-                                       (*control_func)(kgmon);
-                               }
-
-                               count = 0;
-                               break;
-
-                       case KGMON_SET_DEBUG_ON:        /* turn on debugging */
-                               if (cpu != 0) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_SET_DEBUG_ON, cpu = %d\n", cpu);
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               if (!PROFILE_VARS(0)->debug) {
-                                       for (i = 0; i < max_cpus; i++) {
-                                               PROFILE_VARS(i)->debug = 1;
-                                       }
-
-                                       if (control_func) {
-                                               (*control_func)(kgmon);
-                                       }
-                               }
-
-                               count = 0;
-                               break;
-
-                       case KGMON_SET_DEBUG_OFF:       /* turn off debugging */
-                               if (cpu != 0) {
-                                       if (PROFILE_VARS(0)->debug) {
-                                               printf("KGMON_SET_DEBUG_OFF, cpu = %d\n", cpu);
-                                       }
-
-                                       error = -1;
-                                       break;
-                               }
-
-                               if (PROFILE_VARS(0)->debug) {
-                                       for (i = 0; i < max_cpus; i++) {
-                                               PROFILE_VARS(i)->debug = 0;
-                                       }
-
-                                       if (control_func) {
-                                               (*control_func)(kgmon);
-                                       }
-                               }
-
-                               count = 0;
-                               break;
-                       }
-               }
-       }
-
-       if (error) {
-               if (PROFILE_VARS(0)->debug) {
-                       printf("_profile_kgmon: done:  kgmon control = %2d, cpu = %d, error = %d\n",
-                           kgmon, cpu, error);
-               }
-
-               return -1;
-       }
-
-       if (PROFILE_VARS(0)->debug) {
-               printf("_profile_kgmon: done:  kgmon control = %2d, cpu = %d, count = %ld\n",
-                   kgmon, cpu, (long)count);
-       }
-
-       return count;
-}
diff --git a/osfmk/profiling/profile-mk.c b/osfmk/profiling/profile-mk.c
deleted file mode 100644 (file)
index 4111735..0000000
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * Microkernel interface to common profiling.
- */
-
-#include <profiling/profile-mk.h>
-#include <string.h>
-#include <kern/cpu_number.h>
-#include <kern/processor.h>
-#include <kern/spl.h>
-#include <kern/misc_protos.h>
-#include <vm/vm_kern.h>
-#include <mach/vm_param.h>
-
-#include <device/ds_routines.h>
-#include <device/io_req.h>
-#include <device/buf.h>
-
-extern char etext[], pstart[];
-
-void *
-_profile_alloc_pages(size_t size)
-{
-       vm_offset_t addr;
-
-       /*
-        * For the MK, we can't support allocating pages at runtime, because we
-        * might be at interrupt level, so abort if we didn't size the table
-        * properly.
-        */
-
-       if (PROFILE_VARS(0)->active) {
-               panic("Call to _profile_alloc_pages while profiling is running.");
-       }
-
-       if (kmem_alloc(kernel_map, &addr, size)) {
-               panic("Could not allocate memory for profiling");
-       }
-
-       memset((void *)addr, '\0', size);
-       if (PROFILE_VARS(0)->debug) {
-               printf("Allocated %d bytes for profiling, address 0x%x\n", (int)size, (int)addr);
-       }
-
-       return (caddr_t)addr;
-}
-
-void
-_profile_free_pages(void *addr, size_t size)
-{
-       if (PROFILE_VARS(0)->debug) {
-               printf("Freed %d bytes for profiling, address 0x%x\n", (int)size, (int)addr);
-       }
-
-       kmem_free(kernel_map, (vm_offset_t)addr, size);
-       return;
-}
-
-void
-_profile_error(struct profile_vars *pv)
-{
-       panic("Fatal error in profiling");
-}
-
-void
-kmstartup(void)
-{
-       prof_uptrint_t textsize;
-       prof_uptrint_t monsize;
-       prof_uptrint_t lowpc;
-       prof_uptrint_t highpc;
-       struct profile_vars *pv;
-
-       /*
-        * round lowpc and highpc to multiples of the density we're using
-        * so the rest of the scaling (here and in gprof) stays in ints.
-        */
-
-       lowpc = ROUNDDOWN((prof_uptrint_t)&pstart[0], HISTFRACTION * sizeof(LHISTCOUNTER));
-       highpc = ROUNDUP((prof_uptrint_t)&etext[0], HISTFRACTION * sizeof(LHISTCOUNTER));
-       textsize = highpc - lowpc;
-       monsize = (textsize / HISTFRACTION) * sizeof(LHISTCOUNTER);
-
-       pv = PROFILE_VARS(0);
-
-#ifdef DEBUG_PROFILE
-       pv->debug = 1;
-#endif
-       pv->page_size = PAGE_SIZE;
-       _profile_md_init(pv, PROFILE_GPROF, PROFILE_ALLOC_MEM_YES);
-
-       /* Profil related variables */
-       pv->profil_buf = _profile_alloc(pv, monsize, ACONTEXT_PROFIL);
-       pv->profil_info.highpc = highpc;
-       pv->profil_info.lowpc = lowpc;
-       pv->profil_info.text_len = textsize;
-       pv->profil_info.profil_len = monsize;
-       pv->profil_info.counter_size = sizeof(LHISTCOUNTER);
-       pv->profil_info.scale = 0x10000 / HISTFRACTION;
-       pv->stats.profil_buckets = monsize / sizeof(LHISTCOUNTER);
-
-       /* Other gprof variables */
-       pv->stats.my_cpu = 0;
-       pv->stats.max_cpu = 1;  /* initial number of cpus */
-       pv->init = 1;
-       pv->active = 1;
-       pv->use_dci = 0;
-       pv->use_profil = 1;
-       pv->check_funcs = 1;            /* for now */
-
-       if (pv->debug) {
-               printf("Profiling kernel, s_textsize=%ld, monsize=%ld [0x%lx..0x%lx], cpu = %d\n",
-                   (long)textsize,
-                   (long)monsize,
-                   (long)lowpc,
-                   (long)highpc,
-                   0);
-       }
-
-       _profile_md_start();
-}
-
-/* driver component */
-
-int
-gprofprobe(caddr_t port, void *ctlr)
-{
-       return 1;
-}
-
-void
-gprofattach(void)
-{
-       kmstartup();
-       return;
-}
-
-/* struct bus_device *gprofinfo[NGPROF]; */
-struct bus_device *gprofinfo[1];
-
-struct  bus_driver      gprof_driver = {
-       gprofprobe, 0, gprofattach, 0, 0, "gprof", gprofinfo, "gprofc", 0, 0
-};
-
-
-io_return_t
-gprofopen(dev_t dev,
-    int flags,
-    io_req_t ior)
-{
-       ior->io_error = D_SUCCESS;
-       return 0;
-}
-
-void
-gprofclose(dev_t dev)
-{
-       return;
-}
-
-void
-gprofstrategy(io_req_t ior)
-{
-       void *sys_ptr = (void *)0;
-
-       long count = _profile_kgmon(!(ior->io_op & IO_READ),
-           ior->io_count,
-           ior->io_recnum,
-           1,
-           &sys_ptr,
-           (void (*)(kgmon_control_t))0);
-
-       if (count < 0) {
-               ior->io_error = D_INVALID_RECNUM;
-       } else {
-               if (count > 0 && sys_ptr != (void *)0) {
-                       if (ior->io_op & IO_READ) {
-                               memcpy((void *)ior->io_data, sys_ptr, count);
-                       } else {
-                               memcpy(sys_ptr, (void *)ior->io_data, count);
-                       }
-               }
-
-               ior->io_error = D_SUCCESS;
-               ior->io_residual = ior->io_count - count;
-       }
-
-       iodone(ior);
-}
-
-io_return_t
-gprofread(dev_t dev,
-    io_req_t ior)
-{
-       return block_io(gprofstrategy, minphys, ior);
-}
-
-io_return_t
-gprofwrite(dev_t dev,
-    io_req_t ior)
-{
-       return block_io(gprofstrategy, minphys, ior);
-}
diff --git a/osfmk/profiling/profile-mk.h b/osfmk/profiling/profile-mk.h
deleted file mode 100644 (file)
index 8e36900..0000000
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * Microkernel interface to common profiling.
- */
-
-#include <profiling/profile-internal.h>
-#include <mach/std_types.h>
-#include <types.h>
-#include <device/device_types.h>
-
-/*
- * JMM - We don't use these, just the BSD interfaces.
- */
-#if 0
-extern void kmstartup(void);
-extern int gprofprobe(caddr_t, void *);
-extern void gprofattach(void);
-extern int gprofopen(dev_t, int, io_req_t);
-extern void gprofclose(dev_t);
-extern void gprofstrategy(io_req_t);
-extern int gprofread(dev_t, io_req_t);
-extern int gprofwrite(dev_t, io_req_t);
-#endif
-
-/*
- * Macros to access the nth cpu's profile variable structures.
- */
-
-#define PROFILE_VARS(cpu) (&_profile_vars)
index 01669bf9ca2ec6e6574a2083fde09d1f7e5add6d..4748deb1cc5f60c23badd5fcf7b11a99c3623f7b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -45,6 +45,7 @@
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <kern/priority_queue.h>
+#include <string.h>
 
 #if !(DEVELOPMENT || DEBUG)
 #error "Testing is not enabled on RELEASE configurations"
@@ -63,6 +64,13 @@ kern_return_t zalloc_test(void);
 kern_return_t RandomULong_test(void);
 kern_return_t kcdata_api_test(void);
 kern_return_t priority_queue_test(void);
+kern_return_t ts_kernel_primitive_test(void);
+kern_return_t ts_kernel_sleep_inheritor_test(void);
+kern_return_t ts_kernel_gate_test(void);
+kern_return_t ts_kernel_turnstile_chain_test(void);
+kern_return_t ts_kernel_timingsafe_bcmp_test(void);
+
+extern kern_return_t kprintf_hhx_test(void);
 
 #if defined(__arm__) || defined(__arm64__)
 kern_return_t pmap_coredump_test(void);
@@ -81,12 +89,18 @@ extern kern_return_t ex_cb_test(void);
 #if __ARM_PAN_AVAILABLE__
 extern kern_return_t arm64_pan_test(void);
 #endif
+#if defined(HAS_APPLE_PAC)
+extern kern_return_t arm64_ropjop_test(void);
+#endif /* defined(HAS_APPLE_PAC) */
 #endif /* __arm64__ */
 
 extern kern_return_t test_thread_call(void);
 
 
-struct xnupost_panic_widget xt_panic_widgets = {NULL, NULL, NULL, NULL};
+struct xnupost_panic_widget xt_panic_widgets = {.xtp_context_p = NULL,
+                                               .xtp_outval_p = NULL,
+                                               .xtp_func_name = NULL,
+                                               .xtp_func = NULL};
 
 struct xnupost_test kernel_post_tests[] = {XNUPOST_TEST_CONFIG_BASIC(zalloc_test),
                                           XNUPOST_TEST_CONFIG_BASIC(RandomULong_test),
@@ -98,6 +112,9 @@ struct xnupost_test kernel_post_tests[] = {XNUPOST_TEST_CONFIG_BASIC(zalloc_test
 #if __ARM_PAN_AVAILABLE__
                                           XNUPOST_TEST_CONFIG_BASIC(arm64_pan_test),
 #endif
+#if defined(HAS_APPLE_PAC)
+                                          XNUPOST_TEST_CONFIG_BASIC(arm64_ropjop_test),
+#endif /* defined(HAS_APPLE_PAC) */
 #endif /* __arm64__ */
                                           XNUPOST_TEST_CONFIG_BASIC(kcdata_api_test),
                                           XNUPOST_TEST_CONFIG_BASIC(console_serial_test),
@@ -109,7 +126,13 @@ struct xnupost_test kernel_post_tests[] = {XNUPOST_TEST_CONFIG_BASIC(zalloc_test
                                           XNUPOST_TEST_CONFIG_BASIC(bitmap_post_test),
                                           //XNUPOST_TEST_CONFIG_TEST_PANIC(kcdata_api_assert_tests)
                                           XNUPOST_TEST_CONFIG_BASIC(test_thread_call),
-                                          XNUPOST_TEST_CONFIG_BASIC(priority_queue_test), };
+                                          XNUPOST_TEST_CONFIG_BASIC(priority_queue_test),
+                                          XNUPOST_TEST_CONFIG_BASIC(ts_kernel_primitive_test),
+                                          XNUPOST_TEST_CONFIG_BASIC(ts_kernel_sleep_inheritor_test),
+                                          XNUPOST_TEST_CONFIG_BASIC(ts_kernel_gate_test),
+                                          XNUPOST_TEST_CONFIG_BASIC(ts_kernel_turnstile_chain_test),
+                                          XNUPOST_TEST_CONFIG_BASIC(ts_kernel_timingsafe_bcmp_test),
+                                          XNUPOST_TEST_CONFIG_BASIC(kprintf_hhx_test), };
 
 uint32_t kernel_post_tests_count = sizeof(kernel_post_tests) / sizeof(xnupost_test_data_t);
 
@@ -685,10 +708,34 @@ struct sample_disk_io_stats {
 } __attribute__((packed));
 
 struct kcdata_subtype_descriptor test_disk_io_stats_def[] = {
-       {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 0 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_count"},
-       {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 1 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_size"},
-       {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, 2 * sizeof(uint64_t), KCS_SUBTYPE_PACK_SIZE(4, sizeof(uint64_t)), "io_priority_count"},
-       {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, (2 + 4) * sizeof(uint64_t), sizeof(uint64_t), "io_priority_size"},
+       {
+               .kcs_flags = KCS_SUBTYPE_FLAGS_NONE,
+               .kcs_elem_type = KC_ST_UINT64,
+               .kcs_elem_offset = 0 * sizeof(uint64_t),
+               .kcs_elem_size = sizeof(uint64_t),
+               .kcs_name = "disk_reads_count"
+       },
+       {
+               .kcs_flags = KCS_SUBTYPE_FLAGS_NONE,
+               .kcs_elem_type = KC_ST_UINT64,
+               .kcs_elem_offset = 1 * sizeof(uint64_t),
+               .kcs_elem_size = sizeof(uint64_t),
+               .kcs_name = "disk_reads_size"
+       },
+       {
+               .kcs_flags = KCS_SUBTYPE_FLAGS_ARRAY,
+               .kcs_elem_type = KC_ST_UINT64,
+               .kcs_elem_offset = 2 * sizeof(uint64_t),
+               .kcs_elem_size = KCS_SUBTYPE_PACK_SIZE(4, sizeof(uint64_t)),
+               .kcs_name = "io_priority_count"
+       },
+       {
+               .kcs_flags = KCS_SUBTYPE_FLAGS_ARRAY,
+               .kcs_elem_type = KC_ST_UINT64,
+               .kcs_elem_offset = (2 + 4) * sizeof(uint64_t),
+               .kcs_elem_size = sizeof(uint64_t),
+               .kcs_name = "io_priority_size"
+       },
 };
 
 kern_return_t
@@ -926,3 +973,1679 @@ pmap_coredump_test(void)
        return KERN_SUCCESS;
 }
 #endif
+
+struct ts_kern_prim_test_args {
+       int *end_barrier;
+       int *notify_b;
+       int *wait_event_b;
+       int before_num;
+       int *notify_a;
+       int *wait_event_a;
+       int after_num;
+       int priority_to_check;
+};
+
+static void
+wait_threads(
+       int* var,
+       int num)
+{
+       if (var != NULL) {
+               while (os_atomic_load(var, acquire) != num) {
+                       assert_wait((event_t) var, THREAD_UNINT);
+                       if (os_atomic_load(var, acquire) != num) {
+                               (void) thread_block(THREAD_CONTINUE_NULL);
+                       } else {
+                               clear_wait(current_thread(), THREAD_AWAKENED);
+                       }
+               }
+       }
+}
+
+static void
+wake_threads(
+       int* var)
+{
+       if (var) {
+               os_atomic_inc(var, relaxed);
+               thread_wakeup((event_t) var);
+       }
+}
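+
+/*
+ * wait_threads() and wake_threads() form a tiny counting barrier over
+ * assert_wait()/thread_block(): the waiter re-checks the counter after
+ * asserting the wait, and clears the wait instead of blocking if the
+ * target was reached in between, so a concurrent wakeup is not lost.
+ * A hypothetical rendezvous using the pair (illustrative only, mirroring
+ * how ts_kernel_primitive_test() below drives its barriers):
+ *
+ *     static int rendezvous_counter = 0;
+ *
+ *     static void
+ *     rendezvous_step(void)
+ *     {
+ *             wake_threads(&rendezvous_counter);    // increment and wake sleepers
+ *             wait_threads(&rendezvous_counter, 3); // return once the count hits 3
+ *     }
+ */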
+
+extern void IOSleep(int);
+
+static void
+thread_lock_unlock_kernel_primitive(
+       void *args,
+       __unused wait_result_t wr)
+{
+       thread_t thread = current_thread();
+       struct ts_kern_prim_test_args *info = (struct ts_kern_prim_test_args*) args;
+       int pri;
+
+       thread_lock(thread);
+       pri = thread->sched_pri;
+       thread_unlock(thread);
+
+       wait_threads(info->wait_event_b, info->before_num);
+       wake_threads(info->notify_b);
+
+       tstile_test_prim_lock(SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT);
+
+       wake_threads(info->notify_a);
+       wait_threads(info->wait_event_a, info->after_num);
+
+       IOSleep(100);
+
+       if (info->priority_to_check) {
+               thread_lock(thread);
+               pri = thread->sched_pri;
+               thread_unlock(thread);
+               T_ASSERT(pri == info->priority_to_check, "Priority thread: current sched %d sched wanted %d", pri, info->priority_to_check);
+       }
+
+       tstile_test_prim_unlock(SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT);
+
+       wake_threads(info->end_barrier);
+       thread_terminate_self();
+}
+
+kern_return_t
+ts_kernel_primitive_test(void)
+{
+       thread_t owner, thread1, thread2;
+       struct ts_kern_prim_test_args targs[2] = {};
+       kern_return_t result;
+       int end_barrier = 0;
+       int owner_locked = 0;
+       int waiters_ready = 0;
+
+       T_LOG("Testing turnstile kernel primitive");
+
+       targs[0].notify_b = NULL;
+       targs[0].wait_event_b = NULL;
+       targs[0].before_num = 0;
+       targs[0].notify_a = &owner_locked;
+       targs[0].wait_event_a = &waiters_ready;
+       targs[0].after_num = 2;
+       targs[0].priority_to_check = 90;
+       targs[0].end_barrier = &end_barrier;
+
+       // Start owner with priority 80
+       result = kernel_thread_start_priority((thread_continue_t)thread_lock_unlock_kernel_primitive, &targs[0], 80, &owner);
+       T_ASSERT(result == KERN_SUCCESS, "Starting owner");
+
+       targs[1].notify_b = &waiters_ready;
+       targs[1].wait_event_b = &owner_locked;
+       targs[1].before_num = 1;
+       targs[1].notify_a = NULL;
+       targs[1].wait_event_a = NULL;
+       targs[1].after_num = 0;
+       targs[1].priority_to_check = 0;
+       targs[1].end_barrier = &end_barrier;
+
+       // Start waiters with priority 85 and 90
+       result = kernel_thread_start_priority((thread_continue_t)thread_lock_unlock_kernel_primitive, &targs[1], 85, &thread1);
+       T_ASSERT(result == KERN_SUCCESS, "Starting thread1");
+
+       result = kernel_thread_start_priority((thread_continue_t)thread_lock_unlock_kernel_primitive, &targs[1], 90, &thread2);
+       T_ASSERT(result == KERN_SUCCESS, "Starting thread2");
+
+       wait_threads(&end_barrier, 3);
+
+       return KERN_SUCCESS;
+}
+
+#define MTX_LOCK 0
+#define RW_LOCK 1
+
+#define NUM_THREADS 4
+
+struct synch_test_common {
+       unsigned int nthreads;
+       thread_t *threads;
+       int max_pri;
+       int test_done;
+};
+
+static kern_return_t
+init_synch_test_common(struct synch_test_common *info, unsigned int nthreads)
+{
+       info->nthreads = nthreads;
+       info->threads = kalloc(sizeof(thread_t) * nthreads);
+       if (!info->threads) {
+               return ENOMEM;
+       }
+
+       return KERN_SUCCESS;
+}
+
+static void
+destroy_synch_test_common(struct synch_test_common *info)
+{
+       kfree(info->threads, sizeof(thread_t) * info->nthreads);
+}
+
+static void
+start_threads(thread_continue_t func, struct synch_test_common *info, bool sleep_after_first)
+{
+       thread_t thread;
+       kern_return_t result;
+       uint i;
+       int priority = 75;
+
+       info->test_done = 0;
+
+       for (i = 0; i < info->nthreads; i++) {
+               info->threads[i] = NULL;
+       }
+
+       info->max_pri = priority + (info->nthreads - 1) * 5;
+       if (info->max_pri > 95) {
+               info->max_pri = 95;
+       }
+
+       for (i = 0; i < info->nthreads; i++) {
+               result = kernel_thread_start_priority((thread_continue_t)func, info, priority, &thread);
+               os_atomic_store(&info->threads[i], thread, release);
+               T_ASSERT(result == KERN_SUCCESS, "Starting thread %d, priority %d, %p", i, priority, thread);
+
+               priority += 5;
+
+               if (i == 0 && sleep_after_first) {
+                       IOSleep(100);
+               }
+       }
+}
+
+static unsigned int
+get_max_pri(struct synch_test_common * info)
+{
+       return info->max_pri;
+}
+
+static void
+wait_all_thread(struct synch_test_common * info)
+{
+       wait_threads(&info->test_done, info->nthreads);
+}
+
+static void
+notify_waiter(struct synch_test_common * info)
+{
+       wake_threads(&info->test_done);
+}
+
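+/*
+ * Spin until every other worker is registered and no longer runnable,
+ * i.e. blocked on the primitive under test, so the inheritor's promoted
+ * priority can be checked deterministically.
+ */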
+static void
+wait_for_waiters(struct synch_test_common *info)
+{
+       uint i, j;
+       thread_t thread;
+
+       for (i = 0; i < info->nthreads; i++) {
+               j = 0;
+               while (os_atomic_load(&info->threads[i], acquire) == NULL) {
+                       if (j % 100 == 0) {
+                               IOSleep(10);
+                       }
+                       j++;
+               }
+
+               if (info->threads[i] != current_thread()) {
+                       j = 0;
+                       do {
+                               thread = os_atomic_load(&info->threads[i], relaxed);
+                               if (thread == (thread_t) 1) {
+                                       break;
+                               }
+
+                               if (!(thread->state & TH_RUN)) {
+                                       break;
+                               }
+
+                               if (j % 100 == 0) {
+                                       IOSleep(100);
+                               }
+                               j++;
+
+                               if (thread->started == FALSE) {
+                                       continue;
+                               }
+                       } while (thread->state & TH_RUN);
+               }
+       }
+}
+
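+/*
+ * Replace the calling thread's slot with the (thread_t)1 sentinel so
+ * wait_for_waiters() stops tracking it.
+ */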
+static void
+exclude_current_waiter(struct synch_test_common *info)
+{
+       uint i, j;
+
+       for (i = 0; i < info->nthreads; i++) {
+               j = 0;
+               while (os_atomic_load(&info->threads[i], acquire) == NULL) {
+                       if (j % 100 == 0) {
+                               IOSleep(10);
+                       }
+                       j++;
+               }
+
+               if (os_atomic_load(&info->threads[i], acquire) == current_thread()) {
+                       os_atomic_store(&info->threads[i], (thread_t)1, release);
+                       return;
+               }
+       }
+}
+
+struct info_sleep_inheritor_test {
+       struct synch_test_common head;
+       lck_mtx_t mtx_lock;
+       lck_rw_t rw_lock;
+       decl_lck_mtx_gate_data(, gate);
+       boolean_t gate_closed;
+       int prim_type;
+       boolean_t work_to_do;
+       unsigned int max_pri;
+       unsigned int steal_pri;
+       int synch_value;
+       int synch;
+       int value;
+       int handoff_failure;
+       thread_t thread_inheritor;
+};
+
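+/*
+ * The primitive_* wrappers below dispatch on prim_type so that each
+ * scenario runs over both the lck_mtx and the lck_rw flavor of the
+ * sleep_with_inheritor and gate interfaces.
+ */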
+static void
+primitive_lock(struct info_sleep_inheritor_test *info)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               lck_mtx_lock(&info->mtx_lock);
+               break;
+       case RW_LOCK:
+               lck_rw_lock(&info->rw_lock, LCK_RW_TYPE_EXCLUSIVE);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+}
+
+static void
+primitive_unlock(struct info_sleep_inheritor_test *info)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               lck_mtx_unlock(&info->mtx_lock);
+               break;
+       case RW_LOCK:
+               lck_rw_unlock(&info->rw_lock, LCK_RW_TYPE_EXCLUSIVE);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+}
+
+static wait_result_t
+primitive_sleep_with_inheritor(struct info_sleep_inheritor_test *info)
+{
+       wait_result_t ret = THREAD_AWAKENED;
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               ret = lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_DEFAULT, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+               break;
+       case RW_LOCK:
+               ret = lck_rw_sleep_with_inheritor(&info->rw_lock, LCK_SLEEP_DEFAULT, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+
+       return ret;
+}
+
+static void
+primitive_wakeup_one_with_inheritor(struct info_sleep_inheritor_test *info)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+       case RW_LOCK:
+               wakeup_one_with_inheritor((event_t) &info->thread_inheritor, THREAD_AWAKENED, LCK_WAKE_DEFAULT, &info->thread_inheritor);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+}
+
+static void
+primitive_wakeup_all_with_inheritor(struct info_sleep_inheritor_test *info)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+       case RW_LOCK:
+               wakeup_all_with_inheritor((event_t) &info->thread_inheritor, THREAD_AWAKENED);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+       return;
+}
+
+static void
+primitive_change_sleep_inheritor(struct info_sleep_inheritor_test *info)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+       case RW_LOCK:
+               change_sleep_inheritor((event_t) &info->thread_inheritor, info->thread_inheritor);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+       return;
+}
+
+static kern_return_t
+primitive_gate_try_close(struct info_sleep_inheritor_test *info)
+{
+       kern_return_t ret = KERN_SUCCESS;
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               ret = lck_mtx_gate_try_close(&info->mtx_lock, &info->gate);
+               break;
+       case RW_LOCK:
+               ret = lck_rw_gate_try_close(&info->rw_lock, &info->gate);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+       return ret;
+}
+
+static gate_wait_result_t
+primitive_gate_wait(struct info_sleep_inheritor_test *info)
+{
+       gate_wait_result_t ret = GATE_OPENED;
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               ret = lck_mtx_gate_wait(&info->mtx_lock, &info->gate, LCK_SLEEP_DEFAULT, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+               break;
+       case RW_LOCK:
+               ret = lck_rw_gate_wait(&info->rw_lock, &info->gate, LCK_SLEEP_DEFAULT, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+       return ret;
+}
+
+static void
+primitive_gate_open(struct info_sleep_inheritor_test *info)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               lck_mtx_gate_open(&info->mtx_lock, &info->gate);
+               break;
+       case RW_LOCK:
+               lck_rw_gate_open(&info->rw_lock, &info->gate);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+}
+
+static void
+primitive_gate_close(struct info_sleep_inheritor_test *info)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               lck_mtx_gate_close(&info->mtx_lock, &info->gate);
+               break;
+       case RW_LOCK:
+               lck_rw_gate_close(&info->rw_lock, &info->gate);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+}
+
+static void
+primitive_gate_steal(struct info_sleep_inheritor_test *info)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               lck_mtx_gate_steal(&info->mtx_lock, &info->gate);
+               break;
+       case RW_LOCK:
+               lck_rw_gate_steal(&info->rw_lock, &info->gate);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+}
+
+static kern_return_t
+primitive_gate_handoff(struct info_sleep_inheritor_test *info, int flags)
+{
+       kern_return_t ret = KERN_SUCCESS;
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               ret = lck_mtx_gate_handoff(&info->mtx_lock, &info->gate, flags);
+               break;
+       case RW_LOCK:
+               ret = lck_rw_gate_handoff(&info->rw_lock, &info->gate, flags);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+       return ret;
+}
+
+static void
+primitive_gate_assert(struct info_sleep_inheritor_test *info, int type)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               lck_mtx_gate_assert(&info->mtx_lock, &info->gate, type);
+               break;
+       case RW_LOCK:
+               lck_rw_gate_assert(&info->rw_lock, &info->gate, type);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+}
+
+static void
+primitive_gate_init(struct info_sleep_inheritor_test *info)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               lck_mtx_gate_init(&info->mtx_lock, &info->gate);
+               break;
+       case RW_LOCK:
+               lck_rw_gate_init(&info->rw_lock, &info->gate);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+}
+
+static void
+primitive_gate_destroy(struct info_sleep_inheritor_test *info)
+{
+       switch (info->prim_type) {
+       case MTX_LOCK:
+               lck_mtx_gate_destroy(&info->mtx_lock, &info->gate);
+               break;
+       case RW_LOCK:
+               lck_rw_gate_destroy(&info->rw_lock, &info->gate);
+               break;
+       default:
+               panic("invalid type %d", info->prim_type);
+       }
+}
+
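+/*
+ * Use sleep_with_inheritor like a mutex: the first thread through
+ * becomes the inheritor and later threads sleep declaring it as such.
+ * Each wakeup_one transfers the push to the woken thread and returns it
+ * with a reference (released via thread_deallocate); exactly one wakeup
+ * should find no waiters, counted in handoff_failure.
+ */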
+static void
+thread_inheritor_like_mutex(
+       void *args,
+       __unused wait_result_t wr)
+{
+       wait_result_t wait;
+
+       struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args;
+       uint my_pri = current_thread()->sched_pri;
+
+       T_LOG("Started thread pri %d %p", my_pri, current_thread());
+
+       /*
+        * rendezvous here so that all threads start concurrently
+        */
+       wake_threads(&info->synch);
+       wait_threads(&info->synch, info->synch_value);
+
+       primitive_lock(info);
+
+       if (info->thread_inheritor == NULL) {
+               info->thread_inheritor = current_thread();
+       } else {
+               wait = primitive_sleep_with_inheritor(info);
+               T_ASSERT(wait == THREAD_AWAKENED || wait == THREAD_NOT_WAITING, "sleep_with_inheritor return");
+       }
+       primitive_unlock(info);
+
+       IOSleep(100);
+       info->value++;
+
+       primitive_lock(info);
+
+       T_ASSERT(info->thread_inheritor == current_thread(), "thread_inheritor is %p", info->thread_inheritor);
+       primitive_wakeup_one_with_inheritor(info);
+       T_LOG("woken up %p", info->thread_inheritor);
+
+       if (info->thread_inheritor == NULL) {
+               T_ASSERT(info->handoff_failure == 0, "handoff failures");
+               info->handoff_failure++;
+       } else {
+               T_ASSERT(info->thread_inheritor != current_thread(), "thread_inheritor is %p", info->thread_inheritor);
+               thread_deallocate(info->thread_inheritor);
+       }
+
+       primitive_unlock(info);
+
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
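+/*
+ * A single thread becomes the inheritor while all the others sleep
+ * pushing on it; it should reach the max priority of the set before
+ * waking everybody up.
+ */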
+static void
+thread_just_inheritor_do_work(
+       void *args,
+       __unused wait_result_t wr)
+{
+       struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args;
+       uint my_pri = current_thread()->sched_pri;
+       uint max_pri;
+
+       T_LOG("Started thread pri %d %p", my_pri, current_thread());
+       primitive_lock(info);
+
+       if (info->thread_inheritor == NULL) {
+               info->thread_inheritor = current_thread();
+               primitive_unlock(info);
+               T_LOG("Thread pri %d first to run %p", my_pri, current_thread());
+
+               wait_threads(&info->synch, info->synch_value - 1);
+
+               wait_for_waiters((struct synch_test_common *)info);
+
+               max_pri = get_max_pri((struct synch_test_common *) info);
+               T_ASSERT((uint) current_thread()->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", current_thread()->sched_pri, max_pri);
+
+               os_atomic_store(&info->synch, 0, relaxed);
+               primitive_lock(info);
+               primitive_wakeup_all_with_inheritor(info);
+       } else {
+               wake_threads(&info->synch);
+               primitive_sleep_with_inheritor(info);
+       }
+
+       primitive_unlock(info);
+
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
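+/*
+ * Exercises change_sleep_inheritor(): the second thread to arrive
+ * re-targets the push to itself and should inherit the priority of the
+ * remaining sleepers (tracked in steal_pri).
+ */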
+static void
+thread_steal_work(
+       void *args,
+       __unused wait_result_t wr)
+{
+       struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args;
+       uint my_pri = current_thread()->sched_pri;
+
+       T_LOG("Started thread pri %d %p", my_pri, current_thread());
+       primitive_lock(info);
+
+       if (info->thread_inheritor == NULL) {
+               info->thread_inheritor = current_thread();
+               exclude_current_waiter((struct synch_test_common *)info);
+
+               T_LOG("Thread pri %d first to run %p", my_pri, current_thread());
+               primitive_unlock(info);
+
+               wait_threads(&info->synch, info->synch_value - 2);
+
+               wait_for_waiters((struct synch_test_common *)info);
+               T_LOG("Thread pri %d first to run %p", my_pri, current_thread());
+               primitive_lock(info);
+               if (info->thread_inheritor == current_thread()) {
+                       primitive_wakeup_all_with_inheritor(info);
+               }
+       } else {
+               if (info->steal_pri == 0) {
+                       info->steal_pri = my_pri;
+                       info->thread_inheritor = current_thread();
+                       primitive_change_sleep_inheritor(info);
+                       exclude_current_waiter((struct synch_test_common *)info);
+
+                       primitive_unlock(info);
+
+                       wait_threads(&info->synch, info->synch_value - 2);
+
+                       T_LOG("Thread pri %d stole push %p", my_pri, current_thread());
+                       wait_for_waiters((struct synch_test_common *)info);
+
+                       T_ASSERT((uint) current_thread()->sched_pri == info->steal_pri, "sleep_inheritor inheritor priority current is %d, should be %d", current_thread()->sched_pri, info->steal_pri);
+
+                       primitive_lock(info);
+                       primitive_wakeup_all_with_inheritor(info);
+               } else {
+                       if (my_pri > info->steal_pri) {
+                               info->steal_pri = my_pri;
+                       }
+                       wake_threads(&info->synch);
+                       primitive_sleep_with_inheritor(info);
+                       exclude_current_waiter((struct synch_test_common *)info);
+               }
+       }
+       primitive_unlock(info);
+
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
+static void
+thread_no_inheritor_work(
+       void *args,
+       __unused wait_result_t wr)
+{
+       struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args;
+       uint my_pri = current_thread()->sched_pri;
+
+       T_LOG("Started thread pri %d %p", my_pri, current_thread());
+       primitive_lock(info);
+
+       info->value--;
+       if (info->value == 0) {
+               primitive_wakeup_all_with_inheritor(info);
+       } else {
+               info->thread_inheritor = NULL;
+               primitive_sleep_with_inheritor(info);
+       }
+
+       primitive_unlock(info);
+
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
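+/*
+ * Mixes the LCK_SLEEP_DEFAULT/UNLOCK sleep variants with the mutex held
+ * normally, then the SPIN/SPIN_ALWAYS variants with the mutex held as a
+ * spin lock, picking a variant at random on every iteration.
+ */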
+static void
+thread_mtx_work(
+       void *args,
+       __unused wait_result_t wr)
+{
+       struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args;
+       uint my_pri = current_thread()->sched_pri;
+       int i;
+       u_int8_t rand;
+       unsigned int mod_rand;
+       uint max_pri;
+
+       T_LOG("Started thread pri %d %p", my_pri, current_thread());
+
+       for (i = 0; i < 10; i++) {
+               lck_mtx_lock(&info->mtx_lock);
+               if (info->thread_inheritor == NULL) {
+                       info->thread_inheritor = current_thread();
+                       lck_mtx_unlock(&info->mtx_lock);
+
+                       T_LOG("Thread pri %d first to run %p", my_pri, current_thread());
+
+                       wait_threads(&info->synch, info->synch_value - 1);
+                       wait_for_waiters((struct synch_test_common *)info);
+                       max_pri = get_max_pri((struct synch_test_common *) info);
+                       T_ASSERT((uint) current_thread()->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", current_thread()->sched_pri, max_pri);
+
+                       os_atomic_store(&info->synch, 0, relaxed);
+
+                       lck_mtx_lock(&info->mtx_lock);
+                       info->thread_inheritor = NULL;
+                       wakeup_all_with_inheritor((event_t) &info->thread_inheritor, THREAD_AWAKENED);
+                       lck_mtx_unlock(&info->mtx_lock);
+                       continue;
+               }
+
+               read_random(&rand, sizeof(rand));
+               mod_rand = rand % 2;
+
+               wake_threads(&info->synch);
+               switch (mod_rand) {
+               case 0:
+                       lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_DEFAULT, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+                       lck_mtx_unlock(&info->mtx_lock);
+                       break;
+               case 1:
+                       lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_UNLOCK, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+                       break;
+               default:
+                       panic("rand() mod 2 returned %u (random %u)", mod_rand, rand);
+               }
+       }
+
+       /*
+        * barrier: wait until every thread is done using the lock as a
+        * mutex before switching to the spin-lock variants below
+        */
+       wake_threads(&info->synch);
+       wait_threads(&info->synch, info->synch_value);
+
+       for (i = 0; i < 10; i++) {
+               /* read_random() might sleep, so call it before taking the mtx in spin mode */
+               read_random(&rand, sizeof(rand));
+
+               lck_mtx_lock_spin(&info->mtx_lock);
+               if (info->thread_inheritor == NULL) {
+                       info->thread_inheritor = current_thread();
+                       lck_mtx_unlock(&info->mtx_lock);
+
+                       T_LOG("Thread pri %d first to run %p", my_pri, current_thread());
+                       wait_for_waiters((struct synch_test_common *)info);
+                       max_pri = get_max_pri((struct synch_test_common *) info);
+                       T_ASSERT((uint) current_thread()->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", current_thread()->sched_pri, max_pri);
+
+                       lck_mtx_lock_spin(&info->mtx_lock);
+                       info->thread_inheritor = NULL;
+                       wakeup_all_with_inheritor((event_t) &info->thread_inheritor, THREAD_AWAKENED);
+                       lck_mtx_unlock(&info->mtx_lock);
+                       continue;
+               }
+
+               mod_rand = rand % 2;
+               switch (mod_rand) {
+               case 0:
+                       lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_SPIN, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+                       lck_mtx_unlock(&info->mtx_lock);
+                       break;
+               case 1:
+                       lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_SPIN_ALWAYS, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+                       lck_mtx_unlock(&info->mtx_lock);
+                       break;
+               default:
+                       panic("rand() mod 2 returned %u (random %u)", mod_rand, rand);
+               }
+       }
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
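+/*
+ * Same shape as thread_mtx_work() but over the rw lock, choosing at
+ * random among the DEFAULT/UNLOCK/SHARED/EXCLUSIVE sleep variants.
+ */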
+static void
+thread_rw_work(
+       void *args,
+       __unused wait_result_t wr)
+{
+       struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args;
+       uint my_pri = current_thread()->sched_pri;
+       int i;
+       lck_rw_type_t type;
+       u_int8_t rand;
+       unsigned int mod_rand;
+       uint max_pri;
+
+       T_LOG("Started thread pri %d %p", my_pri, current_thread());
+
+       for (i = 0; i < 10; i++) {
+try_again:
+               type = LCK_RW_TYPE_SHARED;
+               lck_rw_lock(&info->rw_lock, type);
+               if (info->thread_inheritor == NULL) {
+                       type = LCK_RW_TYPE_EXCLUSIVE;
+
+                       if (lck_rw_lock_shared_to_exclusive(&info->rw_lock)) {
+                               if (info->thread_inheritor == NULL) {
+                                       info->thread_inheritor = current_thread();
+                                       lck_rw_unlock(&info->rw_lock, type);
+                                       wait_threads(&info->synch, info->synch_value - 1);
+
+                                       T_LOG("Thread pri %d first to run %p", my_pri, current_thread());
+                                       wait_for_waiters((struct synch_test_common *)info);
+                                       max_pri = get_max_pri((struct synch_test_common *) info);
+                                       T_ASSERT((uint) current_thread()->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", current_thread()->sched_pri, max_pri);
+
+                                       os_atomic_store(&info->synch, 0, relaxed);
+
+                                       lck_rw_lock(&info->rw_lock, type);
+                                       info->thread_inheritor = NULL;
+                                       wakeup_all_with_inheritor((event_t) &info->thread_inheritor, THREAD_AWAKENED);
+                                       lck_rw_unlock(&info->rw_lock, type);
+                                       continue;
+                               }
+                       } else {
+                               goto try_again;
+                       }
+               }
+
+               read_random(&rand, sizeof(rand));
+               mod_rand = rand % 4;
+
+               wake_threads(&info->synch);
+               switch (mod_rand) {
+               case 0:
+                       lck_rw_sleep_with_inheritor(&info->rw_lock, LCK_SLEEP_DEFAULT, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+                       lck_rw_unlock(&info->rw_lock, type);
+                       break;
+               case 1:
+                       lck_rw_sleep_with_inheritor(&info->rw_lock, LCK_SLEEP_UNLOCK, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+                       break;
+               case 2:
+                       lck_rw_sleep_with_inheritor(&info->rw_lock, LCK_SLEEP_SHARED, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+                       lck_rw_unlock(&info->rw_lock, LCK_RW_TYPE_SHARED);
+                       break;
+               case 3:
+                       lck_rw_sleep_with_inheritor(&info->rw_lock, LCK_SLEEP_EXCLUSIVE, (event_t) &info->thread_inheritor, info->thread_inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+                       lck_rw_unlock(&info->rw_lock, LCK_RW_TYPE_EXCLUSIVE);
+                       break;
+               default:
+                       panic("rand()mod4 returned %u (random %u)", mod_rand, rand);
+               }
+       }
+
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
+static void
+test_sleep_with_wake_all(struct info_sleep_inheritor_test *info, int prim_type)
+{
+       info->prim_type = prim_type;
+       info->synch = 0;
+       info->synch_value = info->head.nthreads;
+
+       info->thread_inheritor = NULL;
+
+       start_threads((thread_continue_t)thread_just_inheritor_do_work, (struct synch_test_common *)info, TRUE);
+       wait_all_thread((struct synch_test_common *)info);
+}
+
+static void
+test_sleep_with_wake_one(struct info_sleep_inheritor_test *info, int prim_type)
+{
+       info->prim_type = prim_type;
+
+       info->synch = 0;
+       info->synch_value = info->head.nthreads;
+       info->value = 0;
+       info->handoff_failure = 0;
+       info->thread_inheritor = NULL;
+
+       start_threads((thread_continue_t)thread_inheritor_like_mutex, (struct synch_test_common *)info, FALSE);
+       wait_all_thread((struct synch_test_common *)info);
+
+       T_ASSERT(info->value == (int)info->head.nthreads, "value protected by sleep");
+       T_ASSERT(info->handoff_failure == 1, "handoff failures");
+}
+
+static void
+test_change_sleep_inheritor(struct info_sleep_inheritor_test *info, int prim_type)
+{
+       info->prim_type = prim_type;
+
+       info->thread_inheritor = NULL;
+       info->steal_pri = 0;
+       info->synch = 0;
+       info->synch_value = info->head.nthreads;
+
+       start_threads((thread_continue_t)thread_steal_work, (struct synch_test_common *)info, FALSE);
+       wait_all_thread((struct synch_test_common *)info);
+}
+
+static void
+test_no_inheritor(struct info_sleep_inheritor_test *info, int prim_type)
+{
+       info->prim_type = prim_type;
+       info->synch = 0;
+       info->synch_value = info->head.nthreads;
+
+       info->thread_inheritor = NULL;
+       info->value = info->head.nthreads;
+
+       start_threads((thread_continue_t)thread_no_inheritor_work, (struct synch_test_common *)info, FALSE);
+       wait_all_thread((struct synch_test_common *)info);
+}
+
+static void
+test_rw_lock(struct info_sleep_inheritor_test *info)
+{
+       info->thread_inheritor = NULL;
+       info->value = info->head.nthreads;
+       info->synch = 0;
+       info->synch_value = info->head.nthreads;
+
+       start_threads((thread_continue_t)thread_rw_work, (struct synch_test_common *)info, FALSE);
+       wait_all_thread((struct synch_test_common *)info);
+}
+
+static void
+test_mtx_lock(struct info_sleep_inheritor_test *info)
+{
+       info->thread_inheritor = NULL;
+       info->value = info->head.nthreads;
+       info->synch = 0;
+       info->synch_value = info->head.nthreads;
+
+       start_threads((thread_continue_t)thread_mtx_work, (struct synch_test_common *)info, FALSE);
+       wait_all_thread((struct synch_test_common *)info);
+}
+
+kern_return_t
+ts_kernel_sleep_inheritor_test(void)
+{
+       struct info_sleep_inheritor_test info = {};
+
+       init_synch_test_common((struct synch_test_common *)&info, NUM_THREADS);
+
+       lck_attr_t* lck_attr = lck_attr_alloc_init();
+       lck_grp_attr_t* lck_grp_attr = lck_grp_attr_alloc_init();
+       lck_grp_t* lck_grp = lck_grp_alloc_init("test sleep_inheritor", lck_grp_attr);
+
+       lck_mtx_init(&info.mtx_lock, lck_grp, lck_attr);
+       lck_rw_init(&info.rw_lock, lck_grp, lck_attr);
+
+       /*
+        * Testing lck_mtx_sleep_with_inheritor and wakeup_all_with_inheritor
+        */
+       T_LOG("Testing mtx sleep with inheritor and wake_all_with_inheritor");
+       test_sleep_with_wake_all(&info, MTX_LOCK);
+
+       /*
+        * Testing lck_rw_sleep_with_inheritor and wakeup_all_with_inheritor
+        */
+       T_LOG("Testing rw sleep with inheritor and wake_all_with_inheritor");
+       test_sleep_with_wake_all(&info, RW_LOCK);
+
+       /*
+        * Testing lck_mtx_sleep_with_inheritor and wakeup_one_with_inheritor
+        */
+       T_LOG("Testing mtx sleep with inheritor and wake_one_with_inheritor");
+       test_sleep_with_wake_one(&info, MTX_LOCK);
+
+       /*
+        * Testing lck_rw_sleep_with_inheritor and wakeup_one_with_inheritor
+        */
+       T_LOG("Testing rw sleep with inheritor and wake_one_with_inheritor");
+       test_sleep_with_wake_one(&info, RW_LOCK);
+
+       /*
+        * Testing lck_mtx_sleep_with_inheritor and wakeup_all_with_inheritor
+        * and change_sleep_inheritor
+        */
+       T_LOG("Testing change_sleep_inheritor with mtx sleep");
+       test_change_sleep_inheritor(&info, MTX_LOCK);
+
+       /*
+        * Testing lck_rw_sleep_with_inheritor and wakeup_all_with_inheritor
+        * and change_sleep_inheritor
+        */
+       T_LOG("Testing change_sleep_inheritor with rw sleep");
+       test_change_sleep_inheritor(&info, RW_LOCK);
+
+       /*
+        * Testing lck_mtx_sleep_with_inheritor and wakeup_all_with_inheritor
+        * with inheritor NULL
+        */
+       T_LOG("Testing inheritor NULL, mtx");
+       test_no_inheritor(&info, MTX_LOCK);
+
+       /*
+        * Testing lck_rw_sleep_with_inheritor and wakeup_all_with_inheritor
+        * with inheritor NULL
+        */
+       T_LOG("Testing inheritor NULL, rw");
+       test_no_inheritor(&info, RW_LOCK);
+
+       /*
+        * Testing mtx locking combinations
+        */
+       T_LOG("Testing mtx locking combinations");
+       test_mtx_lock(&info);
+
+       /*
+        * Testing rw locking combinations
+        */
+       T_LOG("Testing rw locking combinations");
+       test_rw_lock(&info);
+
+       destroy_synch_test_common((struct synch_test_common *)&info);
+
+       lck_attr_free(lck_attr);
+       lck_grp_attr_free(lck_grp_attr);
+       lck_rw_destroy(&info.rw_lock, lck_grp);
+       lck_mtx_destroy(&info.mtx_lock, lck_grp);
+       lck_grp_free(lck_grp);
+
+       return KERN_SUCCESS;
+}
+
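+/*
+ * Gate steal scenario: the first thread closes the gate, a second one
+ * forcibly takes ownership with gate_steal and should inherit the push
+ * of the remaining waiters before reopening the gate.
+ */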
+static void
+thread_gate_aggressive(
+       void *args,
+       __unused wait_result_t wr)
+{
+       struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args;
+       uint my_pri = current_thread()->sched_pri;
+
+       T_LOG("Started thread pri %d %p", my_pri, current_thread());
+
+       primitive_lock(info);
+       if (info->thread_inheritor == NULL) {
+               info->thread_inheritor = current_thread();
+               primitive_gate_assert(info, GATE_ASSERT_OPEN);
+               primitive_gate_close(info);
+               exclude_current_waiter((struct synch_test_common *)info);
+
+               primitive_unlock(info);
+
+               wait_threads(&info->synch, info->synch_value - 2);
+               wait_for_waiters((struct synch_test_common *)info);
+               T_LOG("Thread pri %d first to run %p", my_pri, current_thread());
+
+               primitive_lock(info);
+               if (info->thread_inheritor == current_thread()) {
+                       primitive_gate_open(info);
+               }
+       } else {
+               if (info->steal_pri == 0) {
+                       info->steal_pri = my_pri;
+                       info->thread_inheritor = current_thread();
+                       primitive_gate_steal(info);
+                       exclude_current_waiter((struct synch_test_common *)info);
+
+                       primitive_unlock(info);
+                       wait_threads(&info->synch, info->synch_value - 2);
+
+                       T_LOG("Thread pri %d stole push %p", my_pri, current_thread());
+                       wait_for_waiters((struct synch_test_common *)info);
+                       T_ASSERT((uint) current_thread()->sched_pri == info->steal_pri, "gate keeper priority current is %d, should be %d", current_thread()->sched_pri, info->steal_pri);
+
+                       primitive_lock(info);
+                       primitive_gate_open(info);
+               } else {
+                       if (my_pri > info->steal_pri) {
+                               info->steal_pri = my_pri;
+                       }
+                       wake_threads(&info->synch);
+                       primitive_gate_wait(info);
+                       exclude_current_waiter((struct synch_test_common *)info);
+               }
+       }
+       primitive_unlock(info);
+
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
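+/*
+ * Use the gate like a mutex: close it with try_close or wait for a
+ * GATE_HANDOFF, then hand ownership to the next waiter; exactly one
+ * handoff should find no waiters and open the gate instead.
+ */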
+static void
+thread_gate_like_mutex(
+       void *args,
+       __unused wait_result_t wr)
+{
+       gate_wait_result_t wait;
+       kern_return_t ret;
+       uint my_pri = current_thread()->sched_pri;
+
+       struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args;
+
+       T_LOG("Started thread pri %d %p", my_pri, current_thread());
+
+       /*
+        * rendezvous here so that all threads start concurrently
+        */
+       wake_threads(&info->synch);
+       wait_threads(&info->synch, info->synch_value);
+
+       primitive_lock(info);
+
+       if (primitive_gate_try_close(info) != KERN_SUCCESS) {
+               wait = primitive_gate_wait(info);
+               T_ASSERT(wait == GATE_HANDOFF, "gate_wait return");
+       }
+
+       primitive_gate_assert(info, GATE_ASSERT_HELD);
+
+       primitive_unlock(info);
+
+       IOSleep(100);
+       info->value++;
+
+       primitive_lock(info);
+
+       ret = primitive_gate_handoff(info, GATE_HANDOFF_DEFAULT);
+       if (ret == KERN_NOT_WAITING) {
+               T_ASSERT(info->handoff_failure == 0, "handoff failures");
+               primitive_gate_handoff(info, GATE_HANDOFF_OPEN_IF_NO_WAITERS);
+               info->handoff_failure++;
+       }
+
+       primitive_unlock(info);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
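+/*
+ * Only the thread that manages to close the gate does the work; while
+ * the others wait on it, the gate keeper should be pushed to the max
+ * priority of the set.
+ */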
+static void
+thread_just_one_do_work(
+       void *args,
+       __unused wait_result_t wr)
+{
+       struct info_sleep_inheritor_test *info = (struct info_sleep_inheritor_test*) args;
+       uint my_pri = current_thread()->sched_pri;
+       uint max_pri;
+
+       T_LOG("Started thread pri %d %p", my_pri, current_thread());
+
+       primitive_lock(info);
+check_again:
+       if (info->work_to_do) {
+               if (primitive_gate_try_close(info) == KERN_SUCCESS) {
+                       primitive_gate_assert(info, GATE_ASSERT_HELD);
+                       primitive_unlock(info);
+
+                       T_LOG("Thread pri %d acquired the gate %p", my_pri, current_thread());
+                       wait_threads(&info->synch, info->synch_value - 1);
+                       wait_for_waiters((struct synch_test_common *)info);
+                       max_pri = get_max_pri((struct synch_test_common *) info);
+                       T_ASSERT((uint) current_thread()->sched_pri == max_pri, "gate owner priority current is %d, should be %d", current_thread()->sched_pri, max_pri);
+                       os_atomic_store(&info->synch, 0, relaxed);
+
+                       primitive_lock(info);
+                       info->work_to_do = FALSE;
+                       primitive_gate_open(info);
+               } else {
+                       primitive_gate_assert(info, GATE_ASSERT_CLOSED);
+                       wake_threads(&info->synch);
+                       primitive_gate_wait(info);
+                       goto check_again;
+               }
+       }
+       primitive_unlock(info);
+
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+       thread_terminate_self();
+}
+
+static void
+test_gate_push(struct info_sleep_inheritor_test *info, int prim_type)
+{
+       info->prim_type = prim_type;
+
+       primitive_gate_init(info);
+       info->work_to_do = TRUE;
+       info->synch = 0;
+       info->synch_value = NUM_THREADS;
+
+       start_threads((thread_continue_t)thread_just_one_do_work, (struct synch_test_common *) info, TRUE);
+       wait_all_thread((struct synch_test_common *)info);
+
+       primitive_gate_destroy(info);
+}
+
+static void
+test_gate_handoff(struct info_sleep_inheritor_test *info, int prim_type)
+{
+       info->prim_type = prim_type;
+
+       primitive_gate_init(info);
+
+       info->synch = 0;
+       info->synch_value = NUM_THREADS;
+       info->value = 0;
+       info->handoff_failure = 0;
+
+       start_threads((thread_continue_t)thread_gate_like_mutex, (struct synch_test_common *)info, false);
+       wait_all_thread((struct synch_test_common *)info);
+
+       T_ASSERT(info->value == NUM_THREADS, "value protected by gate");
+       T_ASSERT(info->handoff_failure == 1, "handoff failures");
+
+       primitive_gate_destroy(info);
+}
+
+static void
+test_gate_steal(struct info_sleep_inheritor_test *info, int prim_type)
+{
+       info->prim_type = prim_type;
+
+       primitive_gate_init(info);
+
+       info->synch = 0;
+       info->synch_value = NUM_THREADS;
+       info->thread_inheritor = NULL;
+       info->steal_pri = 0;
+
+       start_threads((thread_continue_t)thread_gate_aggressive, (struct synch_test_common *)info, FALSE);
+       wait_all_thread((struct synch_test_common *)info);
+
+       primitive_gate_destroy(info);
+}
+
+kern_return_t
+ts_kernel_gate_test(void)
+{
+       struct info_sleep_inheritor_test info = {};
+
+       T_LOG("Testing gate primitive");
+
+       init_synch_test_common((struct synch_test_common *)&info, NUM_THREADS);
+
+       lck_attr_t* lck_attr = lck_attr_alloc_init();
+       lck_grp_attr_t* lck_grp_attr = lck_grp_attr_alloc_init();
+       lck_grp_t* lck_grp = lck_grp_alloc_init("test gate", lck_grp_attr);
+
+       lck_mtx_init(&info.mtx_lock, lck_grp, lck_attr);
+       lck_rw_init(&info.rw_lock, lck_grp, lck_attr);
+
+       /*
+        * Testing the priority inherited by the keeper
+        * lck_mtx_gate_try_close, lck_mtx_gate_open, lck_mtx_gate_wait
+        */
+       T_LOG("Testing gate push, lck");
+       test_gate_push(&info, MTX_LOCK);
+
+       T_LOG("Testing gate push, rw");
+       test_gate_push(&info, RW_LOCK);
+
+       /*
+        * Testing the handoff
+        * lck_mtx_gate_wait, lck_mtx_gate_handoff
+        */
+       T_LOG("Testing gate handoff, lck");
+       test_gate_handoff(&info, MTX_LOCK);
+
+       T_LOG("Testing gate handoff, rw");
+       test_gate_handoff(&info, RW_LOCK);
+
+       /*
+        * Testing the steal
+        * lck_mtx_gate_close, lck_mtx_gate_wait, lck_mtx_gate_steal, lck_mtx_gate_handoff
+        */
+       T_LOG("Testing gate steal, lck");
+       test_gate_steal(&info, MTX_LOCK);
+
+       T_LOG("Testing gate steal, rw");
+       test_gate_steal(&info, RW_LOCK);
+
+       destroy_synch_test_common((struct synch_test_common *)&info);
+
+       lck_attr_free(lck_attr);
+       lck_grp_attr_free(lck_grp_attr);
+       lck_rw_destroy(&info.rw_lock, lck_grp);
+       lck_mtx_destroy(&info.mtx_lock, lck_grp);
+       lck_grp_free(lck_grp);
+
+       return KERN_SUCCESS;
+}
+
+#define NUM_THREAD_CHAIN 6
+
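+/*
+ * Chain tests: thread i blocks on thread i-1 (through a gate, a sleep
+ * event, or a mix of both), building a turnstile chain so the push of
+ * the highest priority thread propagates all the way down to
+ * threads[0].
+ */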
+struct turnstile_chain_test {
+       struct synch_test_common head;
+       lck_mtx_t mtx_lock;
+       int synch_value;
+       int synch;
+       int synch2;
+       gate_t gates[NUM_THREAD_CHAIN];
+};
+
+static void
+thread_sleep_gate_chain_work(
+       void *args,
+       __unused wait_result_t wr)
+{
+       struct turnstile_chain_test *info = (struct turnstile_chain_test*) args;
+       thread_t self = current_thread();
+       uint my_pri = self->sched_pri;
+       uint max_pri;
+       uint i;
+       thread_t inheritor = NULL, woken_up;
+       event_t wait_event, wake_event;
+       kern_return_t ret;
+
+       T_LOG("Started thread pri %d %p", my_pri, self);
+
+       /*
+        * We need the thread ids below, so wait for all of them to be populated
+        */
+
+       while (os_atomic_load(&info->head.threads[info->head.nthreads - 1], acquire) == NULL) {
+               IOSleep(10);
+       }
+
+       max_pri = get_max_pri((struct synch_test_common *) info);
+
+       for (i = 0; i < info->head.nthreads; i = i + 2) {
+               // even threads will close a gate
+               if (info->head.threads[i] == self) {
+                       lck_mtx_lock(&info->mtx_lock);
+                       lck_mtx_gate_close(&info->mtx_lock, &info->gates[i]);
+                       lck_mtx_unlock(&info->mtx_lock);
+                       break;
+               }
+       }
+
+       wake_threads(&info->synch2);
+       wait_threads(&info->synch2, info->synch_value);
+
+       if (self == os_atomic_load(&info->head.threads[0], acquire)) {
+               wait_threads(&info->synch, info->synch_value - 1);
+               wait_for_waiters((struct synch_test_common *)info);
+
+               T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri);
+
+               lck_mtx_lock(&info->mtx_lock);
+               lck_mtx_gate_open(&info->mtx_lock, &info->gates[0]);
+               lck_mtx_unlock(&info->mtx_lock);
+       } else {
+               wait_event = NULL;
+               wake_event = NULL;
+               for (i = 0; i < info->head.nthreads; i++) {
+                       if (info->head.threads[i] == self) {
+                               inheritor = info->head.threads[i - 1];
+                               wait_event = (event_t) &info->head.threads[i - 1];
+                               wake_event = (event_t) &info->head.threads[i];
+                               break;
+                       }
+               }
+               assert(wait_event != NULL);
+
+               lck_mtx_lock(&info->mtx_lock);
+               wake_threads(&info->synch);
+
+               if (i % 2 != 0) {
+                       lck_mtx_gate_wait(&info->mtx_lock, &info->gates[i - 1], LCK_SLEEP_UNLOCK, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+                       T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri);
+
+                       ret = wakeup_one_with_inheritor(wake_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &woken_up);
+                       if (ret == KERN_SUCCESS) {
+                               T_ASSERT(i != (info->head.nthreads - 1), "thread id");
+                               T_ASSERT(woken_up == info->head.threads[i + 1], "wakeup_one_with_inheritor woke next");
+                       } else {
+                               T_ASSERT(i == (info->head.nthreads - 1), "thread id");
+                       }
+
+                       // i am still the inheritor, wake all to drop inheritership
+                       ret = wakeup_all_with_inheritor(wake_event, LCK_WAKE_DEFAULT);
+                       T_ASSERT(ret == KERN_NOT_WAITING, "no more waiters on event");
+               } else {
+                       // I previously closed a gate
+                       lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_UNLOCK, wait_event, inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+                       T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri);
+
+                       lck_mtx_lock(&info->mtx_lock);
+                       lck_mtx_gate_open(&info->mtx_lock, &info->gates[i]);
+                       lck_mtx_unlock(&info->mtx_lock);
+               }
+       }
+
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
+static void
+thread_gate_chain_work(
+       void *args,
+       __unused wait_result_t wr)
+{
+       struct turnstile_chain_test *info = (struct turnstile_chain_test*) args;
+       thread_t self = current_thread();
+       uint my_pri = self->sched_pri;
+       uint max_pri;
+       uint i;
+       T_LOG("Started thread pri %d %p", my_pri, self);
+
+       /*
+        * We need the thread ids below, so wait for all of them to be populated
+        */
+       while (os_atomic_load(&info->head.threads[info->head.nthreads - 1], acquire) == NULL) {
+               IOSleep(10);
+       }
+
+       max_pri = get_max_pri((struct synch_test_common *) info);
+
+       for (i = 0; i < info->head.nthreads; i++) {
+               if (info->head.threads[i] == self) {
+                       lck_mtx_lock(&info->mtx_lock);
+                       lck_mtx_gate_close(&info->mtx_lock, &info->gates[i]);
+                       lck_mtx_unlock(&info->mtx_lock);
+                       break;
+               }
+       }
+       assert(i != info->head.nthreads);
+
+       wake_threads(&info->synch2);
+       wait_threads(&info->synch2, info->synch_value);
+
+       if (self == os_atomic_load(&info->head.threads[0], acquire)) {
+               wait_threads(&info->synch, info->synch_value - 1);
+
+               wait_for_waiters((struct synch_test_common *)info);
+
+               T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri);
+
+               lck_mtx_lock(&info->mtx_lock);
+               lck_mtx_gate_open(&info->mtx_lock, &info->gates[0]);
+               lck_mtx_unlock(&info->mtx_lock);
+       } else {
+               lck_mtx_lock(&info->mtx_lock);
+               wake_threads(&info->synch);
+               lck_mtx_gate_wait(&info->mtx_lock, &info->gates[i - 1], LCK_SLEEP_UNLOCK, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+
+               T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri);
+
+               lck_mtx_lock(&info->mtx_lock);
+               lck_mtx_gate_open(&info->mtx_lock, &info->gates[i]);
+               lck_mtx_unlock(&info->mtx_lock);
+       }
+
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
+static void
+thread_sleep_chain_work(
+       void *args,
+       __unused wait_result_t wr)
+{
+       struct turnstile_chain_test *info = (struct turnstile_chain_test*) args;
+       thread_t self = current_thread();
+       uint my_pri = self->sched_pri;
+       uint max_pri;
+       event_t wait_event, wake_event;
+       uint i;
+       thread_t inheritor = NULL, woken_up = NULL;
+       kern_return_t ret;
+
+       T_LOG("Started thread pri %d %p", my_pri, self);
+
+       /*
+        * We need the thread ids below, so wait for all of them to be populated
+        */
+       while (os_atomic_load(&info->head.threads[info->head.nthreads - 1], acquire) == NULL) {
+               IOSleep(10);
+       }
+
+       max_pri = get_max_pri((struct synch_test_common *) info);
+
+       if (self == os_atomic_load(&info->head.threads[0], acquire)) {
+               wait_threads(&info->synch, info->synch_value - 1);
+
+               wait_for_waiters((struct synch_test_common *)info);
+
+               T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri);
+
+               ret = wakeup_one_with_inheritor((event_t) &info->head.threads[0], THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &woken_up);
+               T_ASSERT(ret == KERN_SUCCESS, "wakeup_one_with_inheritor woke next");
+               T_ASSERT(woken_up == info->head.threads[1], "thread woken up");
+
+               // i am still the inheritor, wake all to drop inheritership
+               ret = wakeup_all_with_inheritor((event_t) &info->head.threads[0], LCK_WAKE_DEFAULT);
+               T_ASSERT(ret == KERN_NOT_WAITING, "no more waiters on event");
+       } else {
+               wait_event = NULL;
+               wake_event = NULL;
+               for (i = 0; i < info->head.nthreads; i++) {
+                       if (info->head.threads[i] == self) {
+                               inheritor = info->head.threads[i - 1];
+                               wait_event = (event_t) &info->head.threads[i - 1];
+                               wake_event = (event_t) &info->head.threads[i];
+                               break;
+                       }
+               }
+
+               assert(wait_event != NULL);
+               lck_mtx_lock(&info->mtx_lock);
+               wake_threads(&info->synch);
+
+               lck_mtx_sleep_with_inheritor(&info->mtx_lock, LCK_SLEEP_UNLOCK, wait_event, inheritor, THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
+
+               T_ASSERT((uint) self->sched_pri == max_pri, "sleep_inheritor inheritor priority current is %d, should be %d", self->sched_pri, max_pri);
+
+               ret = wakeup_one_with_inheritor(wake_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &woken_up);
+               if (ret == KERN_SUCCESS) {
+                       T_ASSERT(i != (info->head.nthreads - 1), "thread id");
+                       T_ASSERT(woken_up == info->head.threads[i + 1], "wakeup_one_with_inheritor woke next");
+               } else {
+                       T_ASSERT(i == (info->head.nthreads - 1), "thread id");
+               }
+
+               // i am still the inheritor, wake all to drop inheritership
+               ret = wakeup_all_with_inheritor(wake_event, LCK_WAKE_DEFAULT);
+               T_ASSERT(ret == KERN_NOT_WAITING, "no more waiters on event");
+       }
+
+       assert(current_thread()->kern_promotion_schedpri == 0);
+       notify_waiter((struct synch_test_common *)info);
+
+       thread_terminate_self();
+}
+
+static void
+test_sleep_chain(struct turnstile_chain_test *info)
+{
+       info->synch = 0;
+       info->synch_value = info->head.nthreads;
+
+       start_threads((thread_continue_t)thread_sleep_chain_work, (struct synch_test_common *)info, FALSE);
+       wait_all_thread((struct synch_test_common *)info);
+}
+
+static void
+test_gate_chain(struct turnstile_chain_test *info)
+{
+       info->synch = 0;
+       info->synch2 = 0;
+       info->synch_value = info->head.nthreads;
+
+       start_threads((thread_continue_t)thread_gate_chain_work, (struct synch_test_common *)info, FALSE);
+       wait_all_thread((struct synch_test_common *)info);
+}
+
+static void
+test_sleep_gate_chain(struct turnstile_chain_test *info)
+{
+       info->synch = 0;
+       info->synch2 = 0;
+       info->synch_value = info->head.nthreads;
+
+       start_threads((thread_continue_t)thread_sleep_gate_chain_work, (struct synch_test_common *)info, FALSE);
+       wait_all_thread((struct synch_test_common *)info);
+}
+
+kern_return_t
+ts_kernel_turnstile_chain_test(void)
+{
+       struct turnstile_chain_test info = {};
+       int i;
+
+       init_synch_test_common((struct synch_test_common *)&info, NUM_THREAD_CHAIN);
+       lck_attr_t* lck_attr = lck_attr_alloc_init();
+       lck_grp_attr_t* lck_grp_attr = lck_grp_attr_alloc_init();
+       lck_grp_t* lck_grp = lck_grp_alloc_init("test gate", lck_grp_attr);
+
+       lck_mtx_init(&info.mtx_lock, lck_grp, lck_attr);
+       for (i = 0; i < NUM_THREAD_CHAIN; i++) {
+               lck_mtx_gate_init(&info.mtx_lock, &info.gates[i]);
+       }
+
+       T_LOG("Testing sleep chain, lck");
+       test_sleep_chain(&info);
+
+       T_LOG("Testing gate chain, lck");
+       test_gate_chain(&info);
+
+       T_LOG("Testing sleep and gate chain, lck");
+       test_sleep_gate_chain(&info);
+
+       destroy_synch_test_common((struct synch_test_common *)&info);
+       for (i = 0; i < NUM_THREAD_CHAIN; i++) {
+               lck_mtx_gate_destroy(&info.mtx_lock, &info.gates[i]);
+       }
+       lck_attr_free(lck_attr);
+       lck_grp_attr_free(lck_grp_attr);
+       lck_mtx_destroy(&info.mtx_lock, lck_grp);
+       lck_grp_free(lck_grp);
+
+       return KERN_SUCCESS;
+}
+
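+/*
+ * timingsafe_bcmp() returns 0 when the buffers match and 1 otherwise,
+ * and is expected to take time that depends only on the length, not on
+ * where the buffers first differ.
+ */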
+kern_return_t
+ts_kernel_timingsafe_bcmp_test(void)
+{
+       int i, buf_size;
+       char *buf = NULL;
+
+       // empty
+       T_ASSERT(timingsafe_bcmp(NULL, NULL, 0) == 0, NULL);
+       T_ASSERT(timingsafe_bcmp("foo", "foo", 0) == 0, NULL);
+       T_ASSERT(timingsafe_bcmp("foo", "bar", 0) == 0, NULL);
+
+       // equal
+       T_ASSERT(timingsafe_bcmp("foo", "foo", strlen("foo")) == 0, NULL);
+
+       // unequal
+       T_ASSERT(timingsafe_bcmp("foo", "bar", strlen("foo")) == 1, NULL);
+       T_ASSERT(timingsafe_bcmp("foo", "goo", strlen("foo")) == 1, NULL);
+       T_ASSERT(timingsafe_bcmp("foo", "fpo", strlen("foo")) == 1, NULL);
+       T_ASSERT(timingsafe_bcmp("foo", "fop", strlen("foo")) == 1, NULL);
+
+       // all possible bitwise differences
+       for (i = 1; i < 256; i += 1) {
+               unsigned char a = 0;
+               unsigned char b = (unsigned char)i;
+
+               T_ASSERT(timingsafe_bcmp(&a, &b, sizeof(a)) == 1, NULL);
+       }
+
+       // large
+       buf_size = 1024 * 16;
+       buf = kalloc(buf_size);
+       T_EXPECT_NOTNULL(buf, "kalloc of buf");
+
+       read_random(buf, buf_size);
+       T_ASSERT(timingsafe_bcmp(buf, buf, buf_size) == 0, NULL);
+       T_ASSERT(timingsafe_bcmp(buf, buf + 1, buf_size - 1) == 1, NULL);
+       T_ASSERT(timingsafe_bcmp(buf, buf + 128, 128) == 1, NULL);
+
+       memcpy(buf + 128, buf, 128);
+       T_ASSERT(timingsafe_bcmp(buf, buf + 128, 128) == 0, NULL);
+
+       kfree(buf, buf_size);
+
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+kprintf_hhx_test(void)
+{
+       printf("POST hhx test %hx%hx%hx%hx %hhx%hhx%hhx%hhx - %llx",
+           (unsigned short)0xfeed, (unsigned short)0xface,
+           (unsigned short)0xabad, (unsigned short)0xcafe,
+           (unsigned char)'h', (unsigned char)'h', (unsigned char)'x',
+           (unsigned char)'!',
+           0xfeedfaceULL);
+       return KERN_SUCCESS;
+}
index ee73016ad5e4b81d48b78174d130b313c1b8967e..99624e77e5867eafedeec9766a7c07422df66233 100644 (file)
 #include <vm/pmap.h>
 #include <kern/ledger.h>
 #include <kern/thread.h>
-
+#if defined(__arm64__)
+#include <pexpert/arm64/board_config.h>
+#endif
 
 extern ledger_template_t task_ledger_template;
 
+extern boolean_t arm_force_fast_fault(ppnum_t, vm_prot_t, int, void*);
+extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, bool, bool);
+
 kern_return_t test_pmap_enter_disconnect(unsigned int num_loops);
 kern_return_t test_pmap_iommu_disconnect(void);
+kern_return_t test_pmap_extended(void);
 
 #define PMAP_TEST_VA (0xDEAD << PAGE_SHIFT)
 
@@ -46,7 +52,7 @@ typedef struct {
 } pmap_test_thread_args;
 
 static pmap_t
-pmap_create_wrapper()
+pmap_create_wrapper(unsigned int flags)
 {
        pmap_t new_pmap = NULL;
        ledger_t ledger;
@@ -54,7 +60,7 @@ pmap_create_wrapper()
        if ((ledger = ledger_instantiate(task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES)) == NULL) {
                return NULL;
        }
-       new_pmap = pmap_create(ledger, 0, FALSE);
+       new_pmap = pmap_create_options(ledger, 0, flags);
        ledger_dereference(ledger);
        return new_pmap;
 }
@@ -74,7 +80,7 @@ test_pmap_enter_disconnect(unsigned int num_loops)
 {
        kern_return_t kr = KERN_SUCCESS;
        thread_t disconnect_thread;
-       pmap_t new_pmap = pmap_create_wrapper();
+       pmap_t new_pmap = pmap_create_wrapper(0);
        if (new_pmap == NULL) {
                return KERN_FAILURE;
        }
@@ -118,3 +124,9 @@ test_pmap_iommu_disconnect(void)
 {
        return KERN_SUCCESS;
 }
+
+kern_return_t
+test_pmap_extended(void)
+{
+       return KERN_SUCCESS;
+}
index cee9312b9e907444b41fa9ff2df2f1673d6cf2f9..4d22f2639ba0991aad840d0a1941316a55e116d1 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -90,13 +90,29 @@ extern uint32_t kernel_post_tests_count;
 extern uint32_t total_post_tests_count;
 
 #define XNUPOST_TEST_CONFIG_BASIC(func)                   \
-       {                                                     \
-               XT_CONFIG_RUN, 0, -1, T_STATE_PASS, 0, 0, 0, (func), "xnu."#func \
+       {                                                 \
+               .xt_config = XT_CONFIG_RUN,               \
+               .xt_test_num = 0,                         \
+               .xt_retval = -1,                          \
+               .xt_expected_retval = T_STATE_PASS,       \
+               .xt_begin_time = 0,                       \
+               .xt_end_time = 0,                         \
+               .xt_test_actions = 0,                     \
+               .xt_func = (func),                        \
+               .xt_name = "xnu."#func                    \
        }
 
 #define XNUPOST_TEST_CONFIG_TEST_PANIC(func)                       \
-       {                                                              \
-               XT_CONFIG_EXPECT_PANIC, 0, -1, T_STATE_PASS, 0, 0, 0, (func), "xnu."#func \
+       {                                                          \
+               .xt_config = XT_CONFIG_EXPECT_PANIC,               \
+               .xt_test_num = 0,                                  \
+               .xt_retval = -1,                                   \
+               .xt_expected_retval = T_STATE_PASS,                \
+               .xt_begin_time = 0,                                \
+               .xt_end_time = 0,                                  \
+               .xt_test_actions = 0,                              \
+               .xt_func = (func),                                 \
+               .xt_name = "xnu."#func                             \
        }
 
 void xnupost_init(void);
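
Registration is unchanged by the initializer rework; each macro still expands to one table entry. A hypothetical table using two tests from this commit (the array name is illustrative; the real tables live in the xnupost test sources, not this header):

    kern_return_t ts_kernel_timingsafe_bcmp_test(void);
    kern_return_t kprintf_hhx_test(void);

    struct xnupost_test sample_post_tests[] = {
            XNUPOST_TEST_CONFIG_BASIC(ts_kernel_timingsafe_bcmp_test),
            XNUPOST_TEST_CONFIG_BASIC(kprintf_hhx_test),
    };

The designated form also means any field later added to the test struct is zero-initialized here instead of silently shifting the positional arguments.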
index 0453c363ea62e4183516ee97f4138038de657eb0..d9f9cc2094325e9088355f4a5588ff2a57e710cf 100644
@@ -9,6 +9,7 @@ include $(MakeInc_def)
 DATAFILES =
 
 EXPORT_ONLY_FILES = \
+       memory_types.h \
        pmap.h \
        vm_fault.h \
        vm_kern.h \
index 707f58aa92d336c096e46e264da7f5641381c706..cedf45dbcee7c4867b83b6f2b7c01fcc01c60962 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -97,19 +97,19 @@ mach_get_vm_end(vm_map_t map)
  */
 
 const struct memory_object_pager_ops vnode_pager_ops = {
-       vnode_pager_reference,
-       vnode_pager_deallocate,
-       vnode_pager_init,
-       vnode_pager_terminate,
-       vnode_pager_data_request,
-       vnode_pager_data_return,
-       vnode_pager_data_initialize,
-       vnode_pager_data_unlock,
-       vnode_pager_synchronize,
-       vnode_pager_map,
-       vnode_pager_last_unmap,
-       NULL, /* data_reclaim */
-       "vnode pager"
+       .memory_object_reference = vnode_pager_reference,
+       .memory_object_deallocate = vnode_pager_deallocate,
+       .memory_object_init = vnode_pager_init,
+       .memory_object_terminate = vnode_pager_terminate,
+       .memory_object_data_request = vnode_pager_data_request,
+       .memory_object_data_return = vnode_pager_data_return,
+       .memory_object_data_initialize = vnode_pager_data_initialize,
+       .memory_object_data_unlock = vnode_pager_data_unlock,
+       .memory_object_synchronize = vnode_pager_synchronize,
+       .memory_object_map = vnode_pager_map,
+       .memory_object_last_unmap = vnode_pager_last_unmap,
+       .memory_object_data_reclaim = NULL,
+       .memory_object_pager_name = "vnode pager"
 };
 
 typedef struct vnode_pager {
@@ -985,7 +985,6 @@ vnode_pager_lookup_vnode(
 
 static int fill_vnodeinfoforaddr( vm_map_entry_t entry, uintptr_t * vnodeaddr, uint32_t * vid);
 
-
 int
 fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vnodeaddr, uint32_t  *vid)
 {
@@ -1017,30 +1016,27 @@ fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal *
                if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
                        if (do_region_footprint &&
                            address == tmp_entry->vme_end) {
-                               ledger_amount_t nonvol, nonvol_compressed;
+                               ledger_amount_t ledger_resident;
+                               ledger_amount_t ledger_compressed;
 
                                /*
                                 * This request is right after the last valid
                                 * memory region;  instead of reporting the
                                 * end of the address space, report a fake
                                 * memory region to account for non-volatile
-                                * purgeable memory owned by this task.
+                                * purgeable and/or ledger-tagged memory
+                                * owned by this task.
                                 */
-
-                               ledger_get_balance(
-                                       task->ledger,
-                                       task_ledgers.purgeable_nonvolatile,
-                                       &nonvol);
-                               ledger_get_balance(
-                                       task->ledger,
-                                       task_ledgers.purgeable_nonvolatile_compressed,
-                                       &nonvol_compressed);
-                               if (nonvol + nonvol_compressed == 0) {
+                               task_ledgers_footprint(task->ledger,
+                                   &ledger_resident,
+                                   &ledger_compressed);
+                               if (ledger_resident + ledger_compressed == 0) {
                                        /* nothing to report */
                                        vm_map_unlock_read(map);
                                        vm_map_deallocate(map);
                                        return 0;
                                }
+
                                /* provide fake region for purgeable */
                                pinfo->pri_offset = address;
                                pinfo->pri_protection = VM_PROT_DEFAULT;
@@ -1050,22 +1046,22 @@ fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal *
                                pinfo->pri_user_wired_count = 0;
                                pinfo->pri_user_tag = -1;
                                pinfo->pri_pages_resident =
-                                   (uint32_t) (nonvol / PAGE_SIZE);
+                                   (uint32_t) (ledger_resident / PAGE_SIZE);
                                pinfo->pri_pages_shared_now_private = 0;
                                pinfo->pri_pages_swapped_out =
-                                   (uint32_t) (nonvol_compressed / PAGE_SIZE);
+                                   (uint32_t) (ledger_compressed / PAGE_SIZE);
                                pinfo->pri_pages_dirtied =
-                                   (uint32_t) (nonvol / PAGE_SIZE);
+                                   (uint32_t) (ledger_resident / PAGE_SIZE);
                                pinfo->pri_ref_count = 1;
                                pinfo->pri_shadow_depth = 0;
                                pinfo->pri_share_mode = SM_PRIVATE;
                                pinfo->pri_private_pages_resident =
-                                   (uint32_t) (nonvol / PAGE_SIZE);
+                                   (uint32_t) (ledger_resident / PAGE_SIZE);
                                pinfo->pri_shared_pages_resident = 0;
                                pinfo->pri_obj_id = INFO_MAKE_FAKE_OBJECT_ID(map, task_ledgers.purgeable_nonvolatile);
                                pinfo->pri_address = address;
                                pinfo->pri_size =
-                                   (uint64_t) (nonvol + nonvol_compressed);
+                                   (uint64_t) (ledger_resident + ledger_compressed);
                                pinfo->pri_depth = 0;
 
                                vm_map_unlock_read(map);
@@ -1228,6 +1224,58 @@ fill_procregioninfo_onlymappedvnodes(task_t task, uint64_t arg, struct proc_regi
        return 0;
 }
 
+int
+find_region_details(task_t task, vm_map_offset_t offset,
+    uintptr_t *vnodeaddr, uint32_t *vid,
+    uint64_t *start, uint64_t *len)
+{
+       vm_map_t        map;
+       vm_map_entry_t  tmp_entry, entry;
+       int             rc = 0;
+
+       task_lock(task);
+       map = task->map;
+       if (map == VM_MAP_NULL) {
+               task_unlock(task);
+               return 0;
+       }
+       vm_map_reference(map);
+       task_unlock(task);
+
+       vm_map_lock_read(map);
+       if (!vm_map_lookup_entry(map, offset, &tmp_entry)) {
+               if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
+                       rc = 0;
+                       goto ret;
+               }
+       } else {
+               entry = tmp_entry;
+       }
+
+       while (entry != vm_map_to_entry(map)) {
+               *vnodeaddr = 0;
+               *vid = 0;
+               *start = 0;
+               *len = 0;
+
+               if (entry->is_sub_map == 0) {
+                       if (fill_vnodeinfoforaddr(entry, vnodeaddr, vid)) {
+                               *start = entry->vme_start;
+                               *len = entry->vme_end - entry->vme_start;
+                               rc = 1;
+                               goto ret;
+                       }
+               }
+
+               entry = entry->vme_next;
+       }
+
+ret:
+       vm_map_unlock_read(map);
+       vm_map_deallocate(map);
+       return rc;
+}
+
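A hedged sketch of how find_region_details() would be called (the task, address, and error handling here are assumptions for illustration, not part of this diff):

    uintptr_t vp = 0;
    uint32_t vid = 0;
    uint64_t start = 0, len = 0;

    if (find_region_details(task, (vm_map_offset_t)addr,
        &vp, &vid, &start, &len)) {
            /* vp/vid identify the backing vnode; [start, start + len)
             * is the first vnode-backed, non-submap mapping found at
             * or after addr. */
    }

Note that the walk deliberately skips submap entries and keeps scanning until fill_vnodeinfoforaddr() succeeds, so a hit may lie well past the requested offset.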
 static int
 fill_vnodeinfoforaddr(
        vm_map_entry_t                  entry,
index 6b478027ad49aa2079ebfe4cd4970fba5b28a9db..377d1aacccb9a7c75e9e96223e86b929ae4403f1 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 /* until component support available */
 const struct memory_object_pager_ops device_pager_ops = {
-       device_pager_reference,
-       device_pager_deallocate,
-       device_pager_init,
-       device_pager_terminate,
-       device_pager_data_request,
-       device_pager_data_return,
-       device_pager_data_initialize,
-       device_pager_data_unlock,
-       device_pager_synchronize,
-       device_pager_map,
-       device_pager_last_unmap,
-       NULL, /* data_reclaim */
-       "device pager"
+       .memory_object_reference = device_pager_reference,
+       .memory_object_deallocate = device_pager_deallocate,
+       .memory_object_init = device_pager_init,
+       .memory_object_terminate = device_pager_terminate,
+       .memory_object_data_request = device_pager_data_request,
+       .memory_object_data_return = device_pager_data_return,
+       .memory_object_data_initialize = device_pager_data_initialize,
+       .memory_object_data_unlock = device_pager_data_unlock,
+       .memory_object_synchronize = device_pager_synchronize,
+       .memory_object_map = device_pager_map,
+       .memory_object_last_unmap = device_pager_last_unmap,
+       .memory_object_data_reclaim = NULL,
+       .memory_object_pager_name = "device pager"
 };
 
 typedef uintptr_t device_port_t;
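
The motivation for converting these ops tables (vnode and device here, apple_protect and compressor below) to designated initializers: with positional initialization, adding or reordering a hook in struct memory_object_pager_ops would silently bind every later function pointer to the wrong slot. By-name binding removes that hazard, and omitted members default to zero/NULL. A reduced illustration (the struct and my_* functions are invented for the example):

    struct hooks {
            void (*reference)(void *);
            void (*deallocate)(void *);
            void (*data_reclaim)(void *);   /* optional hook */
            const char *name;
    };

    static void my_reference(void *o)  { (void)o; }
    static void my_deallocate(void *o) { (void)o; }

    static const struct hooks example = {
            .reference  = my_reference,
            .deallocate = my_deallocate,
            /* .data_reclaim omitted: implicitly NULL, matching the
             * pagers above that set it to NULL explicitly */
            .name       = "example pager",
    };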
@@ -179,6 +179,8 @@ device_pager_setup(
            &control);
        object = memory_object_control_to_vm_object(control);
 
+       memory_object_mark_trusted(control);
+
        assert(object != VM_OBJECT_NULL);
        vm_object_lock(object);
        object->true_share = TRUE;
@@ -383,7 +385,6 @@ device_pager_reference(
 
        device_object = device_pager_lookup(mem_obj);
        os_ref_retain(&device_object->ref_count);
-
        DTRACE_VM2(device_pager_reference,
            device_pager_t, device_object,
            unsigned int, os_ref_get_count(&device_object->ref_count));
index 190f4a26fb26cc6cde2eeb644bceaf93994b8924..512efd04bea6e7055df839867b2af0e274ac0b91 100644
@@ -30,7 +30,7 @@
 #include <stdint.h>
 #include <stddef.h>
 #include <kern/assert.h>
-#include <machine/machlimits.h>
+#include <machine/limits.h>
 #include "lz4_assembly_select.h"
 #include "lz4_constants.h"
 
index 9a35734bc82282b30c724e58beb0b46007e74dc1..db1574d06032d37300a1c4047971b5d9c66ae24b 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -84,7 +84,6 @@
  */
 #include <string.h>             /* For memcpy() */
 
-#include <kern/xpr.h>
 #include <kern/host.h>
 #include <kern/thread.h>        /* For current_thread() */
 #include <kern/ipc_mig.h>
 #include <vm/vm_protos.h>
 
 memory_object_default_t memory_manager_default = MEMORY_OBJECT_DEFAULT_NULL;
-decl_lck_mtx_data(, memory_manager_default_lock)
+decl_lck_mtx_data(, memory_manager_default_lock);
 
 
 /*
@@ -166,11 +165,6 @@ memory_object_lock_page(
        boolean_t               should_flush,
        vm_prot_t               prot)
 {
-       XPR(XPR_MEMORY_OBJECT,
-           "m_o_lock_page, page 0x%X rtn %d flush %d prot %d\n",
-           m, should_return, should_flush, prot, 0);
-
-
        if (m->vmp_busy || m->vmp_cleaning) {
                return MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK;
        }
@@ -447,10 +441,6 @@ vm_object_sync(
        boolean_t       rv;
        int             flags;
 
-       XPR(XPR_VM_OBJECT,
-           "vm_o_sync, object 0x%X, offset 0x%X size 0x%x flush %d rtn %d\n",
-           object, offset, size, should_flush, should_return);
-
        /*
         * Lock the object, and acquire a paging reference to
         * prevent the memory_object and control ports from
@@ -1058,10 +1048,6 @@ vm_object_set_attributes_common(
 {
        boolean_t       object_became_ready;
 
-       XPR(XPR_MEMORY_OBJECT,
-           "m_o_set_attr_com, object 0x%X flg %x strat %d\n",
-           object, (may_cache & 1), copy_strategy, 0, 0);
-
        if (object == VM_OBJECT_NULL) {
                return KERN_INVALID_ARGUMENT;
        }
@@ -1879,6 +1865,24 @@ memory_object_mark_io_tracking(
        }
 }
 
+void
+memory_object_mark_trusted(
+       memory_object_control_t control)
+{
+       vm_object_t             object;
+
+       if (control == NULL) {
+               return;
+       }
+       object = memory_object_control_to_vm_object(control);
+
+       if (object != VM_OBJECT_NULL) {
+               vm_object_lock(object);
+               object->pager_trusted = TRUE;
+               vm_object_unlock(object);
+       }
+}
+
 #if CONFIG_SECLUDED_MEMORY
 void
 memory_object_mark_eligible_for_secluded(
index e70c96b0279b573582b77b40ddd03a223907f1aa..cc4eba042385502624fb47f0660005ffb7f4fe50 100644
@@ -145,6 +145,9 @@ extern void             memory_object_mark_unused(
 extern void             memory_object_mark_io_tracking(
        memory_object_control_t         control);
 
+extern void             memory_object_mark_trusted(
+       memory_object_control_t         control);
+
 #if CONFIG_SECLUDED_MEMORY
 extern void             memory_object_mark_eligible_for_secluded(
        memory_object_control_t         control,
diff --git a/osfmk/vm/memory_types.h b/osfmk/vm/memory_types.h
new file mode 100644
index 0000000..a918462
--- /dev/null
+++ b/osfmk/vm/memory_types.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/* machine independent WIMG bits */
+
+#ifndef _VM_MEMORY_TYPES_H_
+#define _VM_MEMORY_TYPES_H_
+
+#include <machine/memory_types.h>
+
+#define VM_MEM_GUARDED          0x1             /* (G) Guarded Storage */
+#define VM_MEM_COHERENT         0x2             /* (M) Memory Coherency */
+#define VM_MEM_NOT_CACHEABLE    0x4             /* (I) Cache Inhibit */
+#define VM_MEM_WRITE_THROUGH    0x8             /* (W) Write-Through */
+
+#define VM_WIMG_USE_DEFAULT     0x80
+#define VM_WIMG_MASK            0xFF
+
+#endif /* _VM_MEMORY_TYPES_H_ */
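
These bits compose into the single cache-attribute encoding the pmap layer consumes: the low four bits carry the WIMG attributes, VM_WIMG_MASK covers the whole encoding byte, and VM_WIMG_USE_DEFAULT means "let the platform choose". A small illustrative check (not a real XNU call site):

    unsigned int wimg = VM_MEM_GUARDED | VM_MEM_NOT_CACHEABLE;   /* 0x5 */

    if (wimg == VM_WIMG_USE_DEFAULT) {
            /* caller wants the platform default attributes */
    } else if (wimg & VM_MEM_NOT_CACHEABLE) {
            /* establish the mapping with caching inhibited */
    }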
index af183829672088f0db28763f30ca1fb1cb5ce0eb..873bae99888fc081f9cf0f3c0afb94ddd5fa005f 100644
@@ -119,6 +119,7 @@ extern boolean_t pmap_has_managed_page(ppnum_t first, ppnum_t last);
 #include <mach_assert.h>
 
 #include <machine/pmap.h>
+#include <vm/memory_types.h>
 
 /*
  *     Routines used for initialization.
@@ -133,6 +134,7 @@ extern boolean_t pmap_has_managed_page(ppnum_t first, ppnum_t last);
  */
 
 extern void *pmap_steal_memory(vm_size_t size); /* Early memory allocation */
+extern void *pmap_steal_freeable_memory(vm_size_t size); /* Early memory allocation */
 
 extern uint_t pmap_free_pages(void); /* report remaining unused physical pages */
 
@@ -140,8 +142,6 @@ extern void pmap_startup(vm_offset_t *startp, vm_offset_t *endp); /* allocate vm
 
 extern void pmap_init(void); /* Initialization, once we have kernel virtual memory.  */
 
-extern void pmap_pv_fixup(vm_offset_t start, vm_size_t size);
-
 extern void mapping_adjust(void); /* Adjust free mapping count */
 
 extern void mapping_free_prime(void); /* Primes the mapping block release list */
@@ -150,7 +150,7 @@ extern void mapping_free_prime(void); /* Primes the mapping block release list *
 /*
  *     If machine/pmap.h defines MACHINE_PAGES, it must implement
  *     the above functions.  The pmap module has complete control.
- *     Otherwise, it must implement
+ *     Otherwise, it must implement the following functions:
  *             pmap_free_pages
  *             pmap_virtual_space
  *             pmap_next_page
@@ -163,34 +163,31 @@ extern void mapping_free_prime(void); /* Primes the mapping block release list *
  *     However, for best performance pmap_free_pages should be accurate.
  */
 
-extern boolean_t        pmap_next_page(ppnum_t *pnum);
-extern boolean_t        pmap_next_page_hi(ppnum_t *pnum);
-/* During VM initialization,
- * return the next unused
- * physical page.
+/*
+ * Routines to return the next unused physical page.
+ */
+extern boolean_t pmap_next_page(ppnum_t *pnum);
+extern boolean_t pmap_next_page_hi(ppnum_t *pnum, boolean_t might_free);
+#ifdef __x86_64__
+extern kern_return_t pmap_next_page_large(ppnum_t *pnum);
+extern void pmap_hi_pages_done(void);
+#endif
+
+/*
+ * Report virtual space available for the kernel.
  */
-extern void             pmap_virtual_space(
+extern void pmap_virtual_space(
        vm_offset_t     *virtual_start,
        vm_offset_t     *virtual_end);
-/* During VM initialization,
- * report virtual space
- * available for the kernel.
- */
 #endif  /* MACHINE_PAGES */
 
 /*
- *     Routines to manage the physical map data structure.
+ * Routines to manage the physical map data structure.
  */
-extern pmap_t           pmap_create(    /* Create a pmap_t. */
+extern pmap_t           pmap_create_options(    /* Create a pmap_t. */
        ledger_t        ledger,
        vm_map_size_t   size,
-       boolean_t       is_64bit);
-#if __x86_64__
-extern pmap_t           pmap_create_options(
-       ledger_t        ledger,
-       vm_map_size_t   size,
-       int             flags);
-#endif
+       unsigned int    flags);
 
 extern pmap_t(pmap_kernel)(void);               /* Return the kernel's pmap */
 extern void             pmap_reference(pmap_t pmap);    /* Gain a reference. */
@@ -330,9 +327,9 @@ extern void pmap_sync_page_attributes_phys(ppnum_t pa);
  * the given physical page is mapped into no pmap.
  * pmap_assert_free() will panic() if pn is not free.
  */
-extern boolean_t pmap_verify_free(ppnum_t pn);
+extern boolean_t        pmap_verify_free(ppnum_t pn);
 #if MACH_ASSERT
-extern void      pmap_assert_free(ppnum_t pn);
+extern void pmap_assert_free(ppnum_t pn);
 #endif
 
 /*
@@ -649,24 +646,14 @@ extern void             pmap_clear_noencrypt(ppnum_t pn);
 extern pmap_t   kernel_pmap;                    /* The kernel's map */
 #define         pmap_kernel()   (kernel_pmap)
 
-/* machine independent WIMG bits */
-
-#define VM_MEM_GUARDED          0x1             /* (G) Guarded Storage */
-#define VM_MEM_COHERENT         0x2             /* (M) Memory Coherency */
-#define VM_MEM_NOT_CACHEABLE    0x4             /* (I) Cache Inhibit */
-#define VM_MEM_WRITE_THROUGH    0x8             /* (W) Write-Through */
-
-#define VM_WIMG_USE_DEFAULT     0x80
-#define VM_WIMG_MASK            0xFF
-
 #define VM_MEM_SUPERPAGE        0x100           /* map a superpage instead of a base page */
 #define VM_MEM_STACK            0x200
 
-#if __x86_64__
 /* N.B. These use the same numerical space as the PMAP_EXPAND_OPTIONS
  * definitions in i386/pmap_internal.h
  */
 #define PMAP_CREATE_64BIT       0x1
+#if __x86_64__
 #define PMAP_CREATE_EPT         0x2
 #define PMAP_CREATE_KNOWN_FLAGS (PMAP_CREATE_64BIT | PMAP_CREATE_EPT)
 #endif
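
All pmap creation now goes through pmap_create_options(); the old is_64bit boolean maps onto PMAP_CREATE_64BIT, and PMAP_CREATE_EPT stays x86-only. Schematically, with ledger handling as in pmap_create_wrapper() earlier in this commit:

    /* before: new_pmap = pmap_create(ledger, 0, TRUE);  (is_64bit) */
    new_pmap = pmap_create_options(ledger, 0, PMAP_CREATE_64BIT);
    if (new_pmap == PMAP_NULL) {
            /* creation failed */
    }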
@@ -718,7 +705,9 @@ extern void             pmap_remove_options(    /* Remove mappings. */
 extern void             fillPage(ppnum_t pa, unsigned int fill);
 
 #if defined(__LP64__)
-void pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr);
+extern void pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr);
+extern kern_return_t pmap_pre_expand_large(pmap_t pmap, vm_map_offset_t vaddr);
+extern vm_size_t pmap_query_pagesize(pmap_t map, vm_map_offset_t vaddr);
 #endif
 
 mach_vm_size_t pmap_query_resident(pmap_t pmap,
@@ -774,12 +763,16 @@ kern_return_t pmap_pgtrace_fault(pmap_t pmap, vm_map_offset_t va, arm_saved_stat
 #endif
 
 
+#ifdef PLATFORM_BridgeOS
 struct pmap_legacy_trust_cache {
        struct pmap_legacy_trust_cache *next;
        uuid_t uuid;
        uint32_t num_hashes;
        uint8_t hashes[][CS_CDHASH_LEN];
 };
+#else
+struct pmap_legacy_trust_cache;
+#endif
 
 extern kern_return_t pmap_load_legacy_trust_cache(struct pmap_legacy_trust_cache *trust_cache,
     const vm_size_t trust_cache_len);
@@ -815,6 +808,15 @@ extern pmap_tc_ret_t pmap_load_image4_trust_cache(
        vm_size_t img4_manifest_actual_len,
        bool dry_run);
 
+extern bool pmap_is_trust_cache_loaded(const uuid_t uuid);
+extern uint32_t pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN]);
+extern bool pmap_lookup_in_loaded_trust_caches(const uint8_t cdhash[CS_CDHASH_LEN]);
+
+extern bool pmap_in_ppl(void);
+
+extern void *pmap_claim_reserved_ppl_page(void);
+extern void pmap_free_reserved_ppl_page(void *kva);
+
 extern void pmap_ledger_alloc_init(size_t);
 extern ledger_t pmap_ledger_alloc(void);
 extern void pmap_ledger_free(ledger_t);
index 416e90fa259aa7a7a3aff16145c3b554f8f9e528..1b79225742f5e97d9c86b3ce07a55a380f032d20 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -123,19 +123,19 @@ void crypt_info_deallocate(struct pager_crypt_info *crypt_info);
  * These routines are invoked by VM via the memory_object_*() interfaces.
  */
 const struct memory_object_pager_ops apple_protect_pager_ops = {
-       apple_protect_pager_reference,
-       apple_protect_pager_deallocate,
-       apple_protect_pager_init,
-       apple_protect_pager_terminate,
-       apple_protect_pager_data_request,
-       apple_protect_pager_data_return,
-       apple_protect_pager_data_initialize,
-       apple_protect_pager_data_unlock,
-       apple_protect_pager_synchronize,
-       apple_protect_pager_map,
-       apple_protect_pager_last_unmap,
-       NULL, /* data_reclaim */
-       "apple_protect"
+       .memory_object_reference = apple_protect_pager_reference,
+       .memory_object_deallocate = apple_protect_pager_deallocate,
+       .memory_object_init = apple_protect_pager_init,
+       .memory_object_terminate = apple_protect_pager_terminate,
+       .memory_object_data_request = apple_protect_pager_data_request,
+       .memory_object_data_return = apple_protect_pager_data_return,
+       .memory_object_data_initialize = apple_protect_pager_data_initialize,
+       .memory_object_data_unlock = apple_protect_pager_data_unlock,
+       .memory_object_synchronize = apple_protect_pager_synchronize,
+       .memory_object_map = apple_protect_pager_map,
+       .memory_object_last_unmap = apple_protect_pager_last_unmap,
+       .memory_object_data_reclaim = NULL,
+       .memory_object_pager_name = "apple_protect"
 };
 
 /*
@@ -167,7 +167,7 @@ typedef struct apple_protect_pager {
 int apple_protect_pager_count = 0;              /* number of pagers */
 int apple_protect_pager_count_mapped = 0;       /* number of unmapped pagers */
 queue_head_t apple_protect_pager_queue;
-decl_lck_mtx_data(, apple_protect_pager_lock)
+decl_lck_mtx_data(, apple_protect_pager_lock);
 
 /*
  * Maximum number of unmapped pagers we're willing to keep around.
@@ -511,24 +511,13 @@ retry_src_fault:
                dst_pnum = (ppnum_t)
                    upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE));
                assert(dst_pnum != 0);
-#if __x86_64__
-               src_vaddr = (vm_map_offset_t)
-                   PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page)
-                       << PAGE_SHIFT);
-               dst_vaddr = (vm_map_offset_t)
-                   PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
 
-#elif __arm__ || __arm64__
                src_vaddr = (vm_map_offset_t)
                    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page)
                        << PAGE_SHIFT);
                dst_vaddr = (vm_map_offset_t)
                    phystokv((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
-#else
-#error "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
-               src_vaddr = 0;
-               dst_vaddr = 0;
-#endif
+
                src_page_object = VM_PAGE_OBJECT(src_page);
 
                /*
@@ -1164,6 +1153,8 @@ apple_protect_pager_create(
            &control);
        assert(kr == KERN_SUCCESS);
 
+       memory_object_mark_trusted(control);
+
        lck_mtx_lock(&apple_protect_pager_lock);
        /* the new pager is now ready to be used */
        pager->is_ready = TRUE;
index ee77679da7e55f4ae2de1d7c31c7e4ef42db9b77..6c5a42214b4e350824cd78201d2ee2f384b12980 100644
 #include <kern/thread_group.h>
 #include <san/kasan.h>
 
-#if !CONFIG_EMBEDDED
+#if defined(__x86_64__)
 #include <i386/misc_protos.h>
 #endif
+#if defined(__arm64__)
+#include <arm/machine_routines.h>
+#endif
 
 #include <IOKit/IOHibernatePrivate.h>
 
@@ -595,7 +598,12 @@ vm_compressor_init(void)
        PE_parse_boot_argn("vm_compression_limit", &vm_compression_limit, sizeof(vm_compression_limit));
 
 #ifdef CONFIG_EMBEDDED
+#if XNU_TARGET_OS_WATCH
+       // rdar://problem/51012698
+       vm_compressor_minorcompact_threshold_divisor = 40;
+#else
        vm_compressor_minorcompact_threshold_divisor = 20;
+#endif
        vm_compressor_majorcompact_threshold_divisor = 30;
        vm_compressor_unthrottle_threshold_divisor = 40;
        vm_compressor_catchup_threshold_divisor = 60;
@@ -641,7 +649,7 @@ vm_compressor_init(void)
        compressor_pool_max_size = C_SEG_MAX_LIMIT;
        compressor_pool_max_size *= C_SEG_BUFSIZE;
 
-#if defined(__x86_64__)
+#if !CONFIG_EMBEDDED
 
        if (vm_compression_limit == 0) {
                if (max_mem <= (4ULL * 1024ULL * 1024ULL * 1024ULL)) {
@@ -873,7 +881,7 @@ c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact)
                if (c_size) {
                        uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
                        if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
-                               panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
+                               panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, (uint64_t)cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
                        }
                }
 #endif
@@ -1088,12 +1096,14 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
 {
        int     old_state = c_seg->c_state;
 
-#if __i386__ || __x86_64__
+#if !CONFIG_EMBEDDED
+#if     DEVELOPMENT || DEBUG
        if (new_state != C_IS_FILLING) {
                LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
        }
        LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
 #endif
+#endif /* !CONFIG_EMBEDDED */
        switch (old_state) {
        case C_IS_EMPTY:
                assert(new_state == C_IS_FILLING || new_state == C_IS_FREE);
@@ -3048,16 +3058,6 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead)
 
        unused_bytes = trunc_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset));
 
-#ifndef _OPEN_SOURCE
-       /* TODO: The HW codec can generate, lazily, a '2nd page not mapped'
-        * exception. So on such a platform, or platforms where we're confident
-        * the codec does not require a buffer page to absorb trailing writes,
-        * we can create an unmapped hole at the tail of the segment, rather
-        * than a populated mapping. This will also guarantee that the codec
-        * does not overwrite valid data past the edge of the segment and
-        * thus eliminate the depopulation overhead.
-        */
-#endif
        if (unused_bytes) {
                offset_to_depopulate = C_SEG_BYTES_TO_OFFSET(round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_nextoffset)));
 
@@ -3561,8 +3561,16 @@ sv_compression:
 static inline void
 sv_decompress(int32_t *ddst, int32_t pattern)
 {
-#if __x86_64__
+//     assert(__builtin_constant_p(PAGE_SIZE) != 0);
+#if defined(__x86_64__)
        memset_word(ddst, pattern, PAGE_SIZE / sizeof(int32_t));
+#elif defined(__arm64__)
+       assert((PAGE_SIZE % 128) == 0);
+       if (pattern == 0) {
+               fill32_dczva((addr64_t)ddst, PAGE_SIZE);
+       } else {
+               fill32_nt((addr64_t)ddst, PAGE_SIZE, pattern);
+       }
 #else
        size_t          i;
 
@@ -3570,9 +3578,10 @@ sv_decompress(int32_t *ddst, int32_t pattern)
         * compiler to emit NEON stores, cf.
         * <rdar://problem/25839866> Loop autovectorization
         * anomalies.
-        * We use separate loops for each PAGE_SIZE
+        */
+       /* * We use separate loops for each PAGE_SIZE
         * to allow the autovectorizer to engage, as PAGE_SIZE
-        * is currently not a constant.
+        * may not be a constant.
         */
 
        __unreachable_ok_push
@@ -3758,7 +3767,7 @@ bypass_busy_check:
                unsigned csvpop;
                uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
                if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
-                       panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
+                       panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%x 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
                }
 #endif
 
@@ -3987,17 +3996,7 @@ vm_compressor_get(ppnum_t pn, int *slot, int flags)
                 */
                dptr = (int32_t *)(uintptr_t)dst;
                data = c_segment_sv_hash_table[slot_ptr->s_cindx].he_data;
-#if __x86_64__
-               memset_word(dptr, data, PAGE_SIZE / sizeof(int32_t));
-#else
-               {
-                       int             i;
-
-                       for (i = 0; i < (int)(PAGE_SIZE / sizeof(int32_t)); i++) {
-                               *dptr++ = data;
-                       }
-               }
-#endif
+               sv_decompress(dptr, data);
                if (!(flags & C_KEEP)) {
                        c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
 
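Both compressor hunks above serve the single-value (SV) path: a page whose 32-bit words are all identical is stored in c_segment_sv_hash_table as just that one word, and decompression is a page-sized fill, now routed through sv_decompress() so each architecture gets its fastest fill (memset_word on x86_64, fill32_dczva/fill32_nt on arm64). A portable sketch of the round trip, assuming a 4 KB page for illustration:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    #define SKETCH_PAGE_WORDS (4096 / sizeof(int32_t))

    /* SV-compressible iff every 32-bit word equals the first one. */
    static bool
    sv_compress_sketch(const int32_t *page, int32_t *pattern)
    {
            for (size_t i = 1; i < SKETCH_PAGE_WORDS; i++) {
                    if (page[i] != page[0]) {
                            return false;
                    }
            }
            *pattern = page[0];
            return true;
    }

    /* Portable equivalent of the sv_decompress() fill. */
    static void
    sv_decompress_sketch(int32_t *dst, int32_t pattern)
    {
            for (size_t i = 0; i < SKETCH_PAGE_WORDS; i++) {
                    dst[i] = pattern;
            }
    }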
index 8f6971fb4ae17b1d24823074df1ab8accc5a338a..4874789d5596a92778c20d58b131708bd7661638 100644
@@ -425,6 +425,16 @@ vm_compressor_swap_init()
                panic("vm_swapfile_gc_thread: create failed");
        }
        thread_set_thread_name(thread, "VM_swapfile_gc");
+
+       /*
+        * Swapfile garbage collection will need to allocate memory
+        * to complete its swap reclaim and in-memory compaction.
+        * So allow it to dip into the reserved VM page pool.
+        */
+       thread_lock(thread);
+       thread->options |= TH_OPT_VMPRIV;
+       thread_unlock(thread);
+
        thread_deallocate(thread);
 
        proc_set_thread_policy_with_tid(kernel_task, thread->thread_id,
@@ -679,6 +689,10 @@ vm_swapfile_create_thread(void)
                        break;
                }
 
+               if (compressor_store_stop_compaction == TRUE) {
+                       break;
+               }
+
                clock_get_system_nanotime(&sec, &nsec);
 
                if (VM_SWAP_SHOULD_CREATE(sec) == 0) {
@@ -700,6 +714,10 @@ vm_swapfile_create_thread(void)
                thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap);
        }
 
+       if (compressor_store_stop_compaction == TRUE) {
+               thread_wakeup((event_t)&compressor_store_stop_compaction);
+       }
+
        assert_wait((event_t)&vm_swapfile_create_needed, THREAD_UNINT);
 
        lck_mtx_unlock(&vm_swap_data_lock);
@@ -813,6 +831,10 @@ vm_swapfile_gc_thread(void)
                thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap);
        }
 
+       if (compressor_store_stop_compaction == TRUE) {
+               thread_wakeup((event_t)&compressor_store_stop_compaction);
+       }
+
        assert_wait((event_t)&vm_swapfile_gc_needed, THREAD_UNINT);
 
        lck_mtx_unlock(&vm_swap_data_lock);
@@ -1110,8 +1132,6 @@ again:
                        soc->swp_io_busy = 1;
                        vm_swapout_soc_busy++;
                }
-               vm_swapout_thread_throttle_adjust();
-               vm_pageout_io_throttle();
 
 c_seg_is_empty:
                if (c_swapout_count == 0) {
@@ -1123,6 +1143,12 @@ c_seg_is_empty:
                if ((soc = vm_swapout_find_done_soc())) {
                        vm_swapout_complete_soc(soc);
                }
+               lck_mtx_unlock_always(c_list_lock);
+
+               vm_swapout_thread_throttle_adjust();
+               vm_pageout_io_throttle();
+
+               lck_mtx_lock_spin_always(c_list_lock);
        }
        if ((soc = vm_swapout_find_done_soc())) {
                vm_swapout_complete_soc(soc);
index 6eda976840f17a04fa91e3dd0d40504da5ec12d2..a0a93f882407d92b72dc636388518ed131203989 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -63,6 +63,8 @@
 #include <kern/kalloc.h>
 #include <kern/ipc_kobject.h>
 
+#include <machine/atomic.h>
+
 #include <mach/memory_object_control.h>
 #include <mach/memory_object_types.h>
 #include <mach/upl.h>
@@ -119,19 +121,19 @@ kern_return_t compressor_memory_object_data_reclaim(
        __unused boolean_t              reclaim_backing_store);
 
 const struct memory_object_pager_ops compressor_pager_ops = {
-       compressor_memory_object_reference,
-       compressor_memory_object_deallocate,
-       compressor_memory_object_init,
-       compressor_memory_object_terminate,
-       compressor_memory_object_data_request,
-       compressor_memory_object_data_return,
-       compressor_memory_object_data_initialize,
-       compressor_memory_object_data_unlock,
-       compressor_memory_object_synchronize,
-       compressor_memory_object_map,
-       compressor_memory_object_last_unmap,
-       compressor_memory_object_data_reclaim,
-       "compressor pager"
+       .memory_object_reference = compressor_memory_object_reference,
+       .memory_object_deallocate = compressor_memory_object_deallocate,
+       .memory_object_init = compressor_memory_object_init,
+       .memory_object_terminate = compressor_memory_object_terminate,
+       .memory_object_data_request = compressor_memory_object_data_request,
+       .memory_object_data_return = compressor_memory_object_data_return,
+       .memory_object_data_initialize = compressor_memory_object_data_initialize,
+       .memory_object_data_unlock = compressor_memory_object_data_unlock,
+       .memory_object_synchronize = compressor_memory_object_synchronize,
+       .memory_object_map = compressor_memory_object_map,
+       .memory_object_last_unmap = compressor_memory_object_last_unmap,
+       .memory_object_data_reclaim = compressor_memory_object_data_reclaim,
+       .memory_object_pager_name = "compressor pager"
 };
 
 /* internal data structures */
@@ -662,7 +664,7 @@ compressor_pager_slot_lookup(
                                 * This memory barrier should take care of this
                                 * according to the platform requirements.
                                 */
-                               __c11_atomic_thread_fence(memory_order_release);
+                               os_atomic_thread_fence(release);
 
                                chunk = pager->cpgr_slots.cpgr_islots[chunk_idx] = t_chunk;
                                t_chunk = NULL;
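
os_atomic_thread_fence() is XNU's spelling (from <machine/atomic.h>, newly included above) of the standard fence, so this change is cosmetic with the same release semantics. Roughly, as a sketch of the mapping rather than the exact macro expansion:

    #include <stdatomic.h>

    #define os_atomic_thread_fence_sketch(m) \
            atomic_thread_fence(memory_order_##m)

    /* so the replaced line is equivalent to: */
    atomic_thread_fence(memory_order_release);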
index 331777917f8151114d0dd54f79a6766a7bdc55b0..277c964870313d0ec8237db8d6335d8742c8f23a 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -82,7 +82,6 @@
 #include <kern/thread.h>
 #include <kern/sched_prim.h>
 #include <kern/host.h>
-#include <kern/xpr.h>
 #include <kern/mach_param.h>
 #include <kern/macro_help.h>
 #include <kern/zalloc.h>
 
 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
 
+int vm_protect_privileged_from_untrusted = 1;
+
 unsigned int    vm_object_pagein_throttle = 16;
 
 /*
@@ -151,6 +152,12 @@ uint64_t vm_hard_throttle_threshold;
 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
 
 
+#define VM_STAT_DECOMPRESSIONS()        \
+MACRO_BEGIN                             \
+       VM_STAT_INCR(decompressions);       \
+       current_thread()->decompressions++; \
+MACRO_END
+
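MACRO_BEGIN/MACRO_END come from kern/macro_help.h and are the classic do { } while (0) wrappers, so VM_STAT_DECOMPRESSIONS() behaves as a single statement. Expanded, the macro body is roughly:

    do {
            VM_STAT_INCR(decompressions);
            current_thread()->decompressions++;
    } while (0)

which is what keeps it safe as the unbraced body of an if/else, unlike a bare { ... } block followed by a semicolon.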
 boolean_t current_thread_aborted(void);
 
 /* Forward declarations of internal routines. */
@@ -203,7 +210,7 @@ uint64_t vm_cs_defer_to_pmap_cs = 0;
 uint64_t vm_cs_defer_to_pmap_cs_not = 0;
 #endif /* PMAP_CS */
 
-void vm_pre_fault(vm_map_offset_t);
+void vm_pre_fault(vm_map_offset_t, vm_prot_t);
 
 extern char *kdp_compressor_decompressed_page;
 extern addr64_t kdp_compressor_decompressed_page_paddr;
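
vm_pre_fault() now takes the protection to fault for instead of hard-coding VM_PROT_READ, so a caller that knows it is about to store can warm the mapping in one pass rather than taking a second write fault. Hypothetical call (the address variable is an assumption for illustration):

    vm_pre_fault(user_addr, VM_PROT_READ | VM_PROT_WRITE);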
@@ -270,6 +277,8 @@ vm_fault_init(void)
                PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
        }
        printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
+
+       PE_parse_boot_argn("vm_protect_privileged_from_untrusted", &vm_protect_privileged_from_untrusted, sizeof(vm_protect_privileged_from_untrusted));
 }
 
 void
@@ -987,11 +996,6 @@ vm_fault_page(
        first_m = VM_PAGE_NULL;
        access_required = fault_type;
 
-
-       XPR(XPR_VM_FAULT,
-           "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
-           object, offset, fault_type, *protection, 0);
-
        /*
         * default type of fault
         */
@@ -1081,10 +1085,6 @@ vm_fault_page(
 #endif
                                wait_result = PAGE_SLEEP(object, m, interruptible);
 
-                               XPR(XPR_VM_FAULT,
-                                   "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
-                                   object, offset,
-                                   m, 0, 0);
                                counter(c_vm_fault_page_block_busy_kernel++);
 
                                if (wait_result != THREAD_AWAKENED) {
@@ -1207,12 +1207,6 @@ vm_fault_page(
                                                return error;
                                        }
 
-                                       XPR(XPR_VM_FAULT,
-                                           "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
-                                           object, offset,
-                                           m,
-                                           first_object, 0);
-
                                        if (object != first_object) {
                                                /*
                                                 * free the absent page we just found
@@ -1270,11 +1264,6 @@ vm_fault_page(
                                                vm_page_queues_remove(m, FALSE);
                                                vm_page_unlock_queues();
                                        }
-                                       XPR(XPR_VM_FAULT,
-                                           "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
-                                           object, offset,
-                                           next_object,
-                                           offset + object->vo_shadow_offset, 0);
 
                                        offset += object->vo_shadow_offset;
                                        fault_info->lo_offset += object->vo_shadow_offset;
@@ -1310,10 +1299,6 @@ vm_fault_page(
 #if TRACEFAULTPAGE
                                dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
 #endif
-                               XPR(XPR_VM_FAULT,
-                                   "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
-                                   object, offset,
-                                   m, 0, 0);
                                /*
                                 * take an extra ref so that object won't die
                                 */
@@ -1391,9 +1376,6 @@ vm_fault_page(
 #if TRACEFAULTPAGE
                        dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 #endif
-                       XPR(XPR_VM_FAULT,
-                           "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
-                           object, offset, m, 0, 0);
                        assert(!m->vmp_busy);
                        assert(!m->vmp_absent);
 
@@ -1476,10 +1458,6 @@ vm_fault_page(
                                        VM_PAGE_FREE(m);
                                }
 
-                               XPR(XPR_VM_FAULT,
-                                   "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
-                                   object, offset, 0, 0, 0);
-
                                /*
                                 * take an extra ref so object won't die
                                 */
@@ -1729,11 +1707,6 @@ vm_fault_page(
                                wants_copy_flag = VM_PROT_NONE;
                        }
 
-                       XPR(XPR_VM_FAULT,
-                           "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
-                           object, offset, m,
-                           access_required | wants_copy_flag, 0);
-
                        if (object->copy == first_object) {
                                /*
                                 * if we issue the memory_object_data_request in
@@ -1878,11 +1851,6 @@ dont_look_for_page:
                        assert(m == VM_PAGE_NULL);
                }
 
-               XPR(XPR_VM_FAULT,
-                   "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
-                   object, offset, m,
-                   object->shadow, 0);
-
                next_object = object->shadow;
 
                if (next_object == VM_OBJECT_NULL) {
@@ -1985,11 +1953,6 @@ dont_look_for_page:
            !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
 #endif  /* EXTRA_ASSERTIONS */
 
-       XPR(XPR_VM_FAULT,
-           "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
-           object, offset, m,
-           first_object, first_m);
-
        /*
         * If the page is being written, but isn't
         * already owned by the top-level object,
@@ -2038,10 +2001,6 @@ dont_look_for_page:
 
                                return VM_FAULT_MEMORY_SHORTAGE;
                        }
-                       XPR(XPR_VM_FAULT,
-                           "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
-                           object, offset,
-                           m, copy_m, 0);
 
                        vm_page_copy(m, copy_m);
 
@@ -2373,10 +2332,6 @@ done:
        *result_page = m;
        *top_page = first_m;
 
-       XPR(XPR_VM_FAULT,
-           "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
-           object, offset, m, first_m, 0);
-
        if (m != VM_PAGE_NULL) {
                assert(VM_PAGE_OBJECT(m) == object);
 
@@ -2410,7 +2365,7 @@ done:
                        vm_fault_is_sequential(object, offset, fault_info->behavior);
                        vm_fault_deactivate_behind(object, offset, fault_info->behavior);
                } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
-                       VM_STAT_INCR(decompressions);
+                       VM_STAT_DECOMPRESSIONS();
                }
                if (type_of_fault) {
                        *type_of_fault = my_fault;
@@ -2778,21 +2733,29 @@ vm_fault_enter(vm_page_t m,
                                        pathname_len = __PATH_MAX;
                                        filename = pathname + pathname_len;
                                        filename_len = __PATH_MAX;
+
+                                       if (vnode_pager_get_object_name(file_object->pager,
+                                           pathname,
+                                           pathname_len,
+                                           filename,
+                                           filename_len,
+                                           &truncated_path) == KERN_SUCCESS) {
+                                               /* safety first... */
+                                               pathname[__PATH_MAX - 1] = '\0';
+                                               filename[__PATH_MAX - 1] = '\0';
+
+                                               vnode_pager_get_object_mtime(file_object->pager,
+                                                   &mtime,
+                                                   &cs_mtime);
+                                       } else {
+                                               kfree(pathname, __PATH_MAX * 2);
+                                               pathname = NULL;
+                                               filename = NULL;
+                                               pathname_len = 0;
+                                               filename_len = 0;
+                                               truncated_path = FALSE;
+                                       }
                                }
-                               vnode_pager_get_object_name(file_object->pager,
-                                   pathname,
-                                   pathname_len,
-                                   filename,
-                                   filename_len,
-                                   &truncated_path);
-                               if (pathname) {
-                                       /* safety first... */
-                                       pathname[__PATH_MAX - 1] = '\0';
-                                       filename[__PATH_MAX - 1] = '\0';
-                               }
-                               vnode_pager_get_object_mtime(file_object->pager,
-                                   &mtime,
-                                   &cs_mtime);
                        }
                        printf("CODE SIGNING: process %d[%s]: "
                            "rejecting invalid page at address 0x%llx "
@@ -2886,13 +2849,21 @@ vm_fault_enter(vm_page_t m,
                        }
                        if (panic_on_cs_killed &&
                            object->object_is_shared_cache) {
+                               char *tainted_contents;
+                               vm_map_offset_t src_vaddr;
+                               src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
+                               tainted_contents = kalloc(PAGE_SIZE);
+                               bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
+                               printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
                                panic("CODE SIGNING: process %d[%s]: "
-                                   "rejecting invalid page at address 0x%llx "
+                                   "rejecting invalid page (phys#0x%x) at address 0x%llx "
                                    "from offset 0x%llx in file \"%s%s%s\" "
                                    "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
                                    "(signed:%d validated:%d tainted:%d nx:%d"
                                    "wpmapped:%d dirty:%d depth:%d)\n",
-                                   pid, procname, (addr64_t) vaddr,
+                                   pid, procname,
+                                   VM_PAGE_GET_PHYS_PAGE(m),
+                                   (addr64_t) vaddr,
                                    file_offset,
                                    (pathname ? pathname : "<nil>"),
                                    (truncated_path ? "/.../" : ""),
@@ -3261,20 +3232,10 @@ MACRO_END
                }
 #endif /* VM_OBJECT_ACCESS_TRACKING */
 
+
 #if PMAP_CS
-               /*
-                * If CS enforcement is on, we don't ask for an executable page if the
-                * fault does not call for execution, because that can fail in
-                * situations where the caller only actually wanted read access.
-                * However, it may be better to instead retry without execute on
-                * failure, or pass a flag into pmap_enter to do the right thing.
-                */
-               // TODO: <rdar://problem/30997388> maybe do something better than masking out VM_PROT_EXECUTE on non-execute faults
-               if (pmap_cs_enforced(pmap) && !(caller_prot & VM_PROT_EXECUTE)) {
-                       prot &= ~VM_PROT_EXECUTE;
-               }
+pmap_enter_retry:
 #endif
-
                /* Prevent a deadlock by not
                 * holding the object lock if we need to wait for a page in
                 * pmap_enter() - <rdar://problem/7138958> */
@@ -3282,6 +3243,18 @@ MACRO_END
                    wired,
                    pmap_options | PMAP_OPTIONS_NOWAIT,
                    pe_result);
+#if PMAP_CS
+               /*
+                * Retry without execute permission if we encountered a codesigning
+                * failure on a non-execute fault.  This allows applications which
+                * don't actually need to execute code to still map it for read access.
+                */
+               if ((pe_result == KERN_CODESIGN_ERROR) && pmap_cs_enforced(pmap) &&
+                   (prot & VM_PROT_EXECUTE) && !(caller_prot & VM_PROT_EXECUTE)) {
+                       prot &= ~VM_PROT_EXECUTE;
+                       goto pmap_enter_retry;
+               }
+#endif
 #if __x86_64__
                if (pe_result == KERN_INVALID_ARGUMENT &&
                    pmap == PMAP_NULL &&
@@ -3351,12 +3324,12 @@ after_the_pmap_enter:
 }
 
 void
-vm_pre_fault(vm_map_offset_t vaddr)
+vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
 {
        if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
                vm_fault(current_map(),      /* map */
                    vaddr,                   /* vaddr */
-                   VM_PROT_READ,            /* fault_type */
+                   prot,                    /* fault_type */
                    FALSE,                   /* change_wiring */
                    VM_KERN_MEMORY_NONE,     /* tag - not wiring */
                    THREAD_UNINT,            /* interruptible */
@@ -3418,6 +3391,14 @@ vm_fault(
                   NULL);
 }
 
+static boolean_t
+current_proc_is_privileged(void)
+{
+       return csproc_get_platform_binary(current_proc());
+}
+
+uint64_t vm_copied_on_read = 0;
+
 kern_return_t
 vm_fault_internal(
        vm_map_t        map,
@@ -3468,13 +3449,16 @@ vm_fault_internal(
        int                     throttle_delay;
        int                     compressed_count_delta;
        int                     grab_options;
+       boolean_t               need_copy;
+       boolean_t               need_copy_on_read;
        vm_map_offset_t         trace_vaddr;
        vm_map_offset_t         trace_real_vaddr;
-#if DEVELOPMENT || DEBUG
        vm_map_offset_t         real_vaddr;
+       boolean_t               resilient_media_retry = FALSE;
+       vm_object_t             resilient_media_object = VM_OBJECT_NULL;
+       vm_object_offset_t      resilient_media_offset = (vm_object_offset_t)-1;
 
        real_vaddr = vaddr;
-#endif /* DEVELOPMENT || DEBUG */
        trace_real_vaddr = vaddr;
        vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
 
@@ -3521,7 +3505,12 @@ vm_fault_internal(
        current_task()->faults++;
        original_fault_type = fault_type;
 
+       need_copy = FALSE;
        if (fault_type & VM_PROT_WRITE) {
+               need_copy = TRUE;
+       }
+
+       if (need_copy) {
                object_lock_type = OBJECT_LOCK_EXCLUSIVE;
        } else {
                object_lock_type = OBJECT_LOCK_SHARED;
@@ -3554,7 +3543,20 @@ RetryFault:
        map = original_map;
        vm_map_lock_read(map);
 
-       kr = vm_map_lookup_locked(&map, vaddr, fault_type,
+       if (resilient_media_retry) {
+               /*
+                * If we have to insert a fake zero-filled page to hide
+                * a media failure to provide the real page, we need to
+                * resolve any pending copy-on-write on this mapping.
+                * VM_PROT_COPY tells vm_map_lookup_locked() to deal
+                * with that even if this is not a "write" fault.
+                */
+               need_copy = TRUE;
+               object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+       }
+
+       kr = vm_map_lookup_locked(&map, vaddr,
+           (fault_type | (need_copy ? VM_PROT_COPY : 0)),
            object_lock_type, &version,
            &object, &offset, &prot, &wired,
            &fault_info,
@@ -3571,12 +3573,49 @@ RetryFault:
        fault_info.mark_zf_absent = FALSE;
        fault_info.batch_pmap_op = FALSE;
 
+       if (resilient_media_retry) {
+               /*
+                * We're retrying this fault after having detected a media
+                * failure from a "resilient_media" mapping.
+                * Check that the mapping is still pointing at the object
+                * that just failed to provide a page.
+                */
+               assert(resilient_media_object != VM_OBJECT_NULL);
+               assert(resilient_media_offset != (vm_object_offset_t)-1);
+               if (object != VM_OBJECT_NULL &&
+                   object == resilient_media_object &&
+                   offset == resilient_media_offset &&
+                   fault_info.resilient_media) {
+                       /*
+                        * This mapping still points at the same object
+                        * and is still "resilient_media": proceed in
+                        * "recovery-from-media-failure" mode, where we'll
+                        * insert a zero-filled page in the top object.
+                        */
+//                     printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
+               } else {
+                       /* not recovering: reset state */
+//                     printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
+                       resilient_media_retry = FALSE;
+                       /* release our extra reference on failed object */
+//                     printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
+                       vm_object_deallocate(resilient_media_object);
+                       resilient_media_object = VM_OBJECT_NULL;
+                       resilient_media_offset = (vm_object_offset_t)-1;
+               }
+       } else {
+               assert(resilient_media_object == VM_OBJECT_NULL);
+               resilient_media_offset = (vm_object_offset_t)-1;
+       }
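
The retry protocol added here hinges on one invariant, which the asserts encode: the (object, offset) pair saved when the media failure was detected must still be what the map resolves to on the retry; otherwise the saved reference is dropped and the fault proceeds normally. A standalone model with hypothetical types, exercising both the recovery and the abandon paths:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct obj { int id; };

static void
retry_fault(struct obj *looked_up, long long offset, bool still_resilient,
    struct obj **saved_obj, long long *saved_off, bool *retrying)
{
        if (!*retrying) {
                return;
        }
        assert(*saved_obj != NULL);
        assert(*saved_off != -1);
        if (looked_up == *saved_obj && offset == *saved_off &&
            still_resilient) {
                puts("recovering: zero-fill page in the top object");
        } else {
                puts("mapping changed: abandon recovery, drop saved ref");
                *retrying = false;
                *saved_obj = NULL;
                *saved_off = -1;
        }
}

int
main(void)
{
        struct obj file_obj = { 1 }, other_obj = { 2 };
        struct obj *saved_obj = &file_obj;  /* saved at failure time */
        long long saved_off = 0x4000;
        bool retrying = true;

        retry_fault(&file_obj, 0x4000, true,
            &saved_obj, &saved_off, &retrying);   /* recovers */
        retrying = true;
        saved_obj = &file_obj;
        saved_off = 0x4000;
        retry_fault(&other_obj, 0x4000, true,
            &saved_obj, &saved_off, &retrying);   /* abandons */
        return 0;
}
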
+
        /*
         * If the page is wired, we must fault for the current protection
         * value, to avoid further faults.
         */
        if (wired) {
                fault_type = prot | VM_PROT_WRITE;
+       }
+       if (wired || need_copy) {
                /*
                 * since we're treating this fault as a 'write'
                 * we must hold the top object lock exclusively
@@ -3851,7 +3890,7 @@ reclaimed_from_pageout:
 
                        if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) ||
                            (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
-upgrade_for_validation:
+upgrade_lock_and_retry:
                                /*
                                 * We might need to validate this page
                                 * against its code signature, so we
@@ -3901,7 +3940,58 @@ upgrade_for_validation:
                                goto FastPmapEnter;
                        }
 
-                       if ((fault_type & VM_PROT_WRITE) == 0) {
+                       if (!need_copy &&
+                           !fault_info.no_copy_on_read &&
+                           cur_object != object &&
+                           !cur_object->internal &&
+                           !cur_object->pager_trusted &&
+                           vm_protect_privileged_from_untrusted &&
+                           !((prot & VM_PROT_EXECUTE) &&
+                           cur_object->code_signed &&
+                           cs_process_enforcement(NULL)) &&
+                           current_proc_is_privileged()) {
+                               /*
+                                * We're faulting on a page in "object" and
+                                * went down the shadow chain to "cur_object"
+                                * to find out that "cur_object"'s pager
+                                * is not "trusted", i.e. we can not trust it
+                                * to always return the same contents.
+                                * Since the target is a "privileged" process,
+                                * let's treat this as a copy-on-read fault, as
+                                * if it was a copy-on-write fault.
+                                * Once "object" gets a copy of this page, it
+                                * won't have to rely on "cur_object" to
+                                * provide the contents again.
+                                *
+                                * This is done by setting "need_copy" and
+                                * retrying the fault from the top with the
+                                * appropriate locking.
+                                *
+                                * Special case: if the mapping is executable
+                                * and the untrusted object is code-signed and
+                                * the process is "cs_enforced", we do not
+                                * copy-on-read because that would break
+                                * code-signing enforcement expectations (an
+                                * executable page must belong to a code-signed
+                                * object) and we can rely on code-signing
+                                * to re-validate the page if it gets evicted
+                                * and paged back in.
+                                */
+//                             printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
+                               vm_copied_on_read++;
+                               need_copy = TRUE;
+
+                               vm_object_unlock(object);
+                               vm_object_unlock(cur_object);
+                               object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+                               vm_map_unlock_read(map);
+                               if (real_map != map) {
+                                       vm_map_unlock(real_map);
+                               }
+                               goto RetryFault;
+                       }
+
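
The same eligibility test appears twice in this commit (here in the fast path and again in the slow path further down), so it distills into a predicate. A hedged restatement with simplified parameters; the global vm_protect_privileged_from_untrusted switch is folded into proc_privileged for brevity:

#include <stdbool.h>
#include <stdio.h>

struct src_object {
        bool internal;       /* anonymous memory, no external pager */
        bool pager_trusted;  /* pager always returns identical contents */
        bool code_signed;
};

/* sketch of the copy-on-read trigger for a privileged process */
static bool
want_copy_on_read(bool need_copy, bool no_copy_on_read_mapping,
    bool found_in_shadow_object, const struct src_object *src,
    bool exec_mapping, bool cs_enforced, bool proc_privileged)
{
        if (need_copy || no_copy_on_read_mapping || !found_in_shadow_object) {
                return false;
        }
        if (src->internal || src->pager_trusted) {
                return false;  /* source already trusted to be stable */
        }
        if (exec_mapping && src->code_signed && cs_enforced) {
                return false;  /* executable pages must stay in the
                                * code-signed object */
        }
        return proc_privileged;
}

int
main(void)
{
        struct src_object vnode = { false, false, false };
        printf("%d\n", want_copy_on_read(false, false, true, &vnode,
            false, false, true));   /* 1: privileged reader copies */
        return 0;
}
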
+                       if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
                                if (!pmap_has_prot_policy(prot)) {
                                        prot &= ~VM_PROT_WRITE;
                                } else {
@@ -3986,7 +4076,6 @@ FastPmapEnter:
                                            need_retry_ptr,
                                            &type_of_fault);
                                }
-#if DEVELOPMENT || DEBUG
                                {
                                        int     event_code = 0;
 
@@ -4002,7 +4091,6 @@ FastPmapEnter:
 
                                        DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
                                }
-#endif
                                if (kr == KERN_SUCCESS &&
                                    physpage_p != NULL) {
                                        /* for vm_map_wire_and_extract() */
@@ -4111,7 +4199,7 @@ FastPmapEnter:
 
                        if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
                            VM_FAULT_NEED_CS_VALIDATION(NULL, m, m_object)) {
-                               goto upgrade_for_validation;
+                               goto upgrade_lock_and_retry;
                        }
 
                        /*
@@ -4417,7 +4505,7 @@ FastPmapEnter:
 
                                        type_of_fault = my_fault_type;
 
-                                       VM_STAT_INCR(decompressions);
+                                       VM_STAT_DECOMPRESSIONS();
 
                                        if (cur_object != object) {
                                                if (insert_cur_object) {
@@ -4439,7 +4527,8 @@ FastPmapEnter:
                                 * that the pager doesn't have this page
                                 */
                        }
-                       if (cur_object->shadow == VM_OBJECT_NULL) {
+                       if (cur_object->shadow == VM_OBJECT_NULL ||
+                           resilient_media_retry) {
                                /*
                                 * Zero fill fault.  Page gets
                                 * inserted into the original object.
@@ -4485,6 +4574,9 @@ FastPmapEnter:
                                                goto RetryFault;
                                        }
                                }
+                               if (!object->internal) {
+                                       panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
+                               }
                                m = vm_page_alloc(object, offset);
                                m_object = NULL;
 
@@ -4588,6 +4680,22 @@ handle_copy_delay:
        assert(object != kernel_object);
        assert(object != vm_submap_object);
 
+       if (resilient_media_retry) {
+               /*
+                * We could get here if we failed to get a free page
+                * to zero-fill and had to take the slow path again.
+                * Reset our "recovery-from-failed-media" state.
+                */
+               assert(resilient_media_object != VM_OBJECT_NULL);
+               assert(resilient_media_offset != (vm_object_offset_t)-1);
+               /* release our extra reference on failed object */
+//             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
+               vm_object_deallocate(resilient_media_object);
+               resilient_media_object = VM_OBJECT_NULL;
+               resilient_media_offset = (vm_object_offset_t)-1;
+               resilient_media_retry = FALSE;
+       }
+
        /*
         * Make a reference to this object to
         * prevent its disposal while we are messing with
@@ -4598,8 +4706,7 @@ handle_copy_delay:
        vm_object_reference_locked(object);
        vm_object_paging_begin(object);
 
-       XPR(XPR_VM_FAULT, "vm_fault -> vm_fault_page\n", 0, 0, 0, 0, 0);
-
+       set_thread_pagein_error(cthread, 0);
        error_code = 0;
 
        result_page = VM_PAGE_NULL;
@@ -4627,10 +4734,35 @@ handle_copy_delay:
         */
        if (kr != VM_FAULT_SUCCESS &&
            kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
-               /*
-                * we didn't succeed, lose the object reference immediately.
-                */
-               vm_object_deallocate(object);
+               if (kr == VM_FAULT_MEMORY_ERROR &&
+                   fault_info.resilient_media) {
+                       assertf(object->internal, "object %p", object);
+                       /*
+                        * This fault failed but the mapping was
+                        * "media resilient", so we'll retry the fault in
+                        * recovery mode to get a zero-filled page in the
+                        * top object.
+                        * Keep the reference on the failing object so
+                        * that we can check that the mapping is still
+                        * pointing to it when we retry the fault.
+                        */
+//                     printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
+                       assert(!resilient_media_retry); /* no double retry */
+                       assert(resilient_media_object == VM_OBJECT_NULL);
+                       assert(resilient_media_offset == (vm_object_offset_t)-1);
+                       resilient_media_retry = TRUE;
+                       resilient_media_object = object;
+                       resilient_media_offset = offset;
+//                     printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
+                       goto RetryFault;
+               } else {
+                       /*
+                        * we didn't succeed, lose the object reference
+                        * immediately.
+                        */
+                       vm_object_deallocate(object);
+                       object = VM_OBJECT_NULL; /* no longer valid */
+               }
 
                /*
                 * See why we failed, and take corrective action.
@@ -4848,11 +4980,47 @@ handle_copy_delay:
                object_locks_dropped = FALSE;
        }
 
+       if (!need_copy &&
+           !fault_info.no_copy_on_read &&
+           m != VM_PAGE_NULL &&
+           VM_PAGE_OBJECT(m) != object &&
+           !VM_PAGE_OBJECT(m)->pager_trusted &&
+           vm_protect_privileged_from_untrusted &&
+           !((prot & VM_PROT_EXECUTE) &&
+           VM_PAGE_OBJECT(m)->code_signed &&
+           cs_process_enforcement(NULL)) &&
+           current_proc_is_privileged()) {
+               /*
+                * We found the page we want in an "untrusted" VM object
+                * down the shadow chain.  Since the target is "privileged"
+                * we want to perform a copy-on-read of that page, so that the
+                * mapped object gets a stable copy and does not have to
+                * rely on the "untrusted" object to provide the same
+                * contents if the page gets reclaimed and has to be paged
+                * in again later on.
+                *
+                * Special case: if the mapping is executable and the untrusted
+                * object is code-signed and the process is "cs_enforced", we
+                * do not copy-on-read because that would break code-signing
+                * enforcement expectations (an executable page must belong
+                * to a code-signed object) and we can rely on code-signing
+                * to re-validate the page if it gets evicted and paged back in.
+                */
+//             printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
+               vm_copied_on_read++;
+               need_copy_on_read = TRUE;
+               need_copy = TRUE;
+       } else {
+               need_copy_on_read = FALSE;
+       }
+
        /*
         * If we want to wire down this page, but no longer have
         * adequate permissions, we must start all over.
+        * If we decided to copy-on-read, we must also start all over.
         */
-       if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
+       if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
+           need_copy_on_read) {
                vm_map_unlock_read(map);
                if (real_map != map) {
                        vm_map_unlock(real_map);
@@ -4907,7 +5075,6 @@ handle_copy_delay:
                }
                assert(VM_PAGE_OBJECT(m) == m_object);
 
-#if DEVELOPMENT || DEBUG
                {
                        int     event_code = 0;
 
@@ -4923,7 +5090,6 @@ handle_copy_delay:
 
                        DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
                }
-#endif
                if (kr != KERN_SUCCESS) {
                        /* abort this page fault */
                        vm_map_unlock_read(map);
@@ -5094,6 +5260,18 @@ cleanup:
 done:
        thread_interrupt_level(interruptible_state);
 
+       if (resilient_media_object != VM_OBJECT_NULL) {
+               assert(resilient_media_retry);
+               assert(resilient_media_offset != (vm_object_offset_t)-1);
+               /* release extra reference on failed object */
+//             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
+               vm_object_deallocate(resilient_media_object);
+               resilient_media_object = VM_OBJECT_NULL;
+               resilient_media_offset = (vm_object_offset_t)-1;
+               resilient_media_retry = FALSE;
+       }
+       assert(!resilient_media_retry);
+
        /*
         * Only I/O throttle on faults which cause a pagein/swapin.
         */
@@ -5289,9 +5467,6 @@ vm_fault_unwire(
 
                                vm_object_lock(object);
                                vm_object_paging_begin(object);
-                               XPR(XPR_VM_FAULT,
-                                   "vm_fault_unwire -> vm_fault_page\n",
-                                   0, 0, 0, 0, 0);
                                result_page = VM_PAGE_NULL;
                                result = vm_fault_page(
                                        object,
@@ -5744,7 +5919,6 @@ RetryDestinationFault:;
                }
                fault_info_dst.cluster_size = cluster_size;
 
-               XPR(XPR_VM_FAULT, "vm_fault_copy -> vm_fault_page\n", 0, 0, 0, 0, 0);
                dst_page = VM_PAGE_NULL;
                result = vm_fault_page(dst_object,
                    vm_object_trunc_page(dst_offset),
@@ -5839,9 +6013,6 @@ RetrySourceFault:;
                                }
                                fault_info_src.cluster_size = cluster_size;
 
-                               XPR(XPR_VM_FAULT,
-                                   "vm_fault_copy(2) -> vm_fault_page\n",
-                                   0, 0, 0, 0, 0);
                                result_page = VM_PAGE_NULL;
                                result = vm_fault_page(
                                        src_object,
@@ -6516,7 +6687,7 @@ vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr
         * further user stack traversals, thus avoiding copyin()s and further
         * faults.
         */
-       int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64);
+       int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64, NULL);
 
        if ((btr == 0) && (bfrs > 0)) {
                cfpc = bpc;
index 973851680586b1db557a423253ce606220cfb60a..8fe4c76d860324a1dcd2b0911516071e08799389 100644 (file)
@@ -96,9 +96,13 @@ extern kern_return_t vm_fault(
 #endif
        int             interruptible,
        pmap_t          pmap,
-       vm_map_offset_t pmap_addr);
+       vm_map_offset_t pmap_addr)
+#if XNU_KERNEL_PRIVATE
+__XNU_INTERNAL(vm_fault)
+#endif
+;
 
-extern void vm_pre_fault(vm_map_offset_t);
+extern void vm_pre_fault(vm_map_offset_t, vm_prot_t);
 
 #ifdef  MACH_KERNEL_PRIVATE
 
index cdc379909f7f07d2eadc5101f4071b9c20535e76..4a9e7a43e670bc32a7df852c33d15901e19eff8b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -118,19 +118,19 @@ kern_return_t fourk_pager_last_unmap(memory_object_t mem_obj);
  * These routines are invoked by VM via the memory_object_*() interfaces.
  */
 const struct memory_object_pager_ops fourk_pager_ops = {
-       fourk_pager_reference,
-       fourk_pager_deallocate,
-       fourk_pager_init,
-       fourk_pager_terminate,
-       fourk_pager_data_request,
-       fourk_pager_data_return,
-       fourk_pager_data_initialize,
-       fourk_pager_data_unlock,
-       fourk_pager_synchronize,
-       fourk_pager_map,
-       fourk_pager_last_unmap,
-       NULL, /* data_reclaim */
-       "fourk_pager"
+       .memory_object_reference = fourk_pager_reference,
+       .memory_object_deallocate = fourk_pager_deallocate,
+       .memory_object_init = fourk_pager_init,
+       .memory_object_terminate = fourk_pager_terminate,
+       .memory_object_data_request = fourk_pager_data_request,
+       .memory_object_data_return = fourk_pager_data_return,
+       .memory_object_data_initialize = fourk_pager_data_initialize,
+       .memory_object_data_unlock = fourk_pager_data_unlock,
+       .memory_object_synchronize = fourk_pager_synchronize,
+       .memory_object_map = fourk_pager_map,
+       .memory_object_last_unmap = fourk_pager_last_unmap,
+       .memory_object_data_reclaim = NULL,
+       .memory_object_pager_name = "fourk_pager"
 };
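
Switching the ops table to C99 designated initializers means the table no longer depends on field order in struct memory_object_pager_ops: adding or reordering callbacks cannot silently shift every entry onto the wrong slot. The same idiom in standalone form, with a made-up ops struct:

#include <stdio.h>

struct ops {
        void (*open)(void);
        void (*close)(void);
        const char *name;
};

static void my_open(void)  { puts("open");  }
static void my_close(void) { puts("close"); }

/* fields may now be reordered or added without breaking this table */
static const struct ops my_ops = {
        .open  = my_open,
        .close = my_close,
        .name  = "my_pager",
};

int
main(void)
{
        my_ops.open();
        printf("%s\n", my_ops.name);
        my_ops.close();
        return 0;
}
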
 
 /*
@@ -163,7 +163,7 @@ typedef struct fourk_pager {
 int fourk_pager_count = 0;              /* number of pagers */
 int fourk_pager_count_mapped = 0;       /* number of mapped pagers */
 queue_head_t fourk_pager_queue;
-decl_lck_mtx_data(, fourk_pager_lock)
+decl_lck_mtx_data(, fourk_pager_lock);
 
 /*
  * Maximum number of unmapped pagers we're willing to keep around.
@@ -759,6 +759,8 @@ fourk_pager_create(void)
            &control);
        assert(kr == KERN_SUCCESS);
 
+       memory_object_mark_trusted(control);
+
        lck_mtx_lock(&fourk_pager_lock);
        /* the new pager is now ready to be used */
        pager->is_ready = TRUE;
@@ -892,23 +894,8 @@ fourk_pager_data_request(
                dst_pnum = (ppnum_t)
                    upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE));
                assert(dst_pnum != 0);
-#if __x86_64__
-               dst_vaddr = (vm_map_offset_t)
-                   PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
-#elif __arm__ || __arm64__
                dst_vaddr = (vm_map_offset_t)
                    phystokv((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
-#else
-               kr = pmap_enter(kernel_pmap,
-                   dst_vaddr,
-                   dst_pnum,
-                   VM_PROT_READ | VM_PROT_WRITE,
-                   VM_PROT_NONE,
-                   0,
-                   TRUE);
-
-               assert(kr == KERN_SUCCESS);
-#endif
 
                /* retrieve appropriate data for each 4K-page in this page */
                if (PAGE_SHIFT == FOURK_PAGE_SHIFT &&
@@ -1084,29 +1071,9 @@ retry_src_fault:
                                vm_page_unlock_queues();
                        }
 
-#if __x86_64__
-                       src_vaddr = (vm_map_offset_t)
-                           PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page)
-                               << PAGE_SHIFT);
-#elif __arm__ || __arm64__
                        src_vaddr = (vm_map_offset_t)
                            phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page)
                                << PAGE_SHIFT);
-#else
-                       /*
-                        * Establish an explicit mapping of the source
-                        * physical page.
-                        */
-                       kr = pmap_enter(kernel_pmap,
-                           src_vaddr,
-                           VM_PAGE_GET_PHYS_PAGE(src_page),
-                           VM_PROT_READ,
-                           VM_PROT_NONE,
-                           0,
-                           TRUE);
-
-                       assert(kr == KERN_SUCCESS);
-#endif
 
                        /*
                         * Validate the 4K page we want from
index 2c24548215715e228704bd65fd2302db83f9f3c6..e20fd75a08ec7f7acb5628dce3e2602bdd0b9ee4 100644 (file)
@@ -93,6 +93,7 @@ boolean_t vm_kernel_ready = FALSE;
 boolean_t kmem_ready = FALSE;
 boolean_t kmem_alloc_ready = FALSE;
 boolean_t zlog_ready = FALSE;
+boolean_t iokit_iomd_setownership_enabled = TRUE;
 
 vm_offset_t kmapoff_kaddr;
 unsigned int kmapoff_pgcnt;
@@ -180,7 +181,7 @@ vm_mem_bootstrap(void)
                zsize += zsize >> 1;
 #endif /* __LP64__ */
 
-#if defined(__x86_64__)
+#if !CONFIG_EMBEDDED
                /*
                 * The max_zonemap_size was based on physical memory and might make the
                 * end of the zone go beyond what vm_page_[un]pack_ptr() can handle.
@@ -249,6 +250,13 @@ vm_mem_bootstrap(void)
        zcache_bootstrap();
 #endif
        vm_rtfault_record_init();
+
+       PE_parse_boot_argn("iokit_iomd_setownership_enabled", &iokit_iomd_setownership_enabled, sizeof(iokit_iomd_setownership_enabled));
+       if (!iokit_iomd_setownership_enabled) {
+               kprintf("IOKit IOMD setownership DISABLED\n");
+       } else {
+               kprintf("IOKit IOMD setownership ENABLED\n");
+       }
 }
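
PE_parse_boot_argn() writes to the destination only when the boot-arg is present, so the static initializer above doubles as the default. A hedged kernel-style sketch of the same pattern for a hypothetical tunable; booting with my_feature_enabled=0 would then take the disabled branch:

boolean_t my_feature_enabled = TRUE;    /* default when boot-arg is absent */

void
my_feature_init(void)
{
        PE_parse_boot_argn("my_feature_enabled", &my_feature_enabled,
            sizeof(my_feature_enabled));
        kprintf("my_feature %s\n",
            my_feature_enabled ? "ENABLED" : "DISABLED");
}
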
 
 void
index a0ba80a9bdec1a6f415eba5448ae9e0ed0831ac8..86b1c0128f875f617b5f486d75c4e85ab6073e51 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -34,6 +34,6 @@
 
 extern void vm_mem_bootstrap(void);
 extern void vm_mem_init(void);
-extern void vm_map_steal_memory(void);;
+extern void vm_map_steal_memory(void);
 
 #endif /* VM_INIT_H */
index 29f44cd6d4146f674c4aff9e6cf5073910aa6fed..d20642916da4e5849dd1fe15a54c78999a575c07 100644 (file)
@@ -81,6 +81,7 @@
 
 #include <libkern/OSDebug.h>
 #include <libkern/crypto/sha2.h>
+#include <libkern/section_keywords.h>
 #include <sys/kdebug.h>
 
 #include <san/kasan.h>
@@ -89,8 +90,8 @@
  *     Variables exported by this module.
  */
 
-vm_map_t        kernel_map;
-vm_map_t        kernel_pageable_map;
+SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
+vm_map_t         kernel_pageable_map;
 
 extern boolean_t vm_kernel_ready;
 
@@ -370,8 +371,6 @@ kernel_memory_allocate(
 
        if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
                for (i = 0; i < wired_page_count; i++) {
-                       uint64_t        unavailable;
-
                        for (;;) {
                                if (flags & KMA_LOMEM) {
                                        mem = vm_page_grablo();
@@ -391,8 +390,11 @@ kernel_memory_allocate(
                                        kr = KERN_RESOURCE_SHORTAGE;
                                        goto out;
                                }
-                               unavailable = (vm_page_wire_count + vm_page_free_target) * PAGE_SIZE;
 
+                               /* VM privileged threads should have waited in vm_page_grab() and not get here. */
+                               assert(!(current_thread()->options & TH_OPT_VMPRIV));
+
+                               uint64_t unavailable = (vm_page_wire_count + vm_page_free_target) * PAGE_SIZE;
                                if (unavailable > max_mem || map_size > (max_mem - unavailable)) {
                                        kr = KERN_RESOURCE_SHORTAGE;
                                        goto out;
index 7d78e27cdc1a483af1c201f86aa8d295f40e42b3..9d25f2777b192ef0755bd39d83f83bbf68f26ed2 100644 (file)
@@ -108,7 +108,7 @@ extern kern_return_t kmem_alloc(
        vm_map_t    map,
        vm_offset_t *addrp,
        vm_size_t   size,
-       vm_tag_t    tag);
+       vm_tag_t    tag) __XNU_INTERNAL(kmem_alloc);
 
 extern kern_return_t kmem_alloc_contig(
        vm_map_t        map,
@@ -131,7 +131,7 @@ extern kern_return_t    kmem_alloc_pageable(
        vm_map_t        map,
        vm_offset_t     *addrp,
        vm_size_t       size,
-       vm_tag_t        tag);
+       vm_tag_t        tag) __XNU_INTERNAL(kmem_alloc_pageable);
 
 extern kern_return_t    kmem_alloc_aligned(
        vm_map_t        map,
@@ -166,7 +166,7 @@ extern kern_return_t    kmem_alloc_kobject(
        vm_map_t        map,
        vm_offset_t     *addrp,
        vm_size_t       size,
-       vm_tag_t        tag);
+       vm_tag_t        tag) __XNU_INTERNAL(kmem_alloc_kobject);
 
 extern kern_return_t kernel_memory_populate(
        vm_map_t        map,
@@ -434,7 +434,12 @@ extern vm_map_t ipc_kernel_map;
 #ifdef KERNEL
 
 __BEGIN_DECLS
+#if MACH_KERNEL_PRIVATE
+extern vm_offset_t vm_kernel_addrhash(vm_offset_t addr)
+__XNU_INTERNAL(vm_kernel_addrhash);
+#else
 extern vm_offset_t vm_kernel_addrhash(vm_offset_t addr);
+#endif
 __END_DECLS
 
 extern void vm_kernel_addrhide(
index bc5c093d2700b36b81985564d8728c090e65a024..031cb82981cd4a1633ca656efc20d5d9f0896625 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <ipc/ipc_port.h>
 #include <kern/sched_prim.h>
 #include <kern/misc_protos.h>
-#include <kern/xpr.h>
 
 #include <mach/vm_map_server.h>
 #include <mach/mach_host_server.h>
@@ -141,6 +140,8 @@ int vm_map_debug_fourk = 0;
 SECURITY_READ_ONLY_LATE(int) vm_map_executable_immutable = 1;
 int vm_map_executable_immutable_verbose = 0;
 
+os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
+
 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
 /* Internal prototypes
  */
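
This commit migrates map_refcnt from a bare int to the os_refcnt API (visible below in the os_ref_init_count(), os_ref_retain_locked() and os_ref_get_count() calls), which adds underflow/overflow assertions and attributes references to the named group declared above. A hedged kernel-style sketch of the life cycle on a hypothetical object:

os_refgrp_decl(static, demo_refgrp, "demo_object", NULL);

struct demo_object {
        os_refcnt_t ref;
};

static void
demo_init(struct demo_object *o)
{
        os_ref_init_count(&o->ref, &demo_refgrp, 1);  /* born with one ref */
}

static void
demo_retain(struct demo_object *o)
{
        os_ref_retain(&o->ref);
}

static void
demo_release(struct demo_object *o)
{
        if (os_ref_release(&o->ref) == 0) {
                /* last reference: safe to tear the object down here */
        }
}
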
@@ -397,6 +398,7 @@ boolean_t _vmec_reserved = (NEW)->from_reserved_zone;   \
        (NEW)->vme_resilient_codesign = FALSE; \
        (NEW)->vme_resilient_media = FALSE;     \
        (NEW)->vme_atomic = FALSE;      \
+       (NEW)->vme_no_copy_on_read = FALSE;     \
 MACRO_END
 
 #define vm_map_entry_copy_full(NEW, OLD)                 \
@@ -406,6 +408,43 @@ boolean_t _vmecf_reserved = (NEW)->from_reserved_zone;  \
 (NEW)->from_reserved_zone = _vmecf_reserved;                    \
 MACRO_END
 
+/*
+ * Normal lock_read_to_write() returns FALSE/0 on failure.
+ * These functions evaluate to zero on success and non-zero value on failure.
+ */
+__attribute__((always_inline))
+int
+vm_map_lock_read_to_write(vm_map_t map)
+{
+       if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
+               DTRACE_VM(vm_map_lock_upgrade);
+               return 0;
+       }
+       return 1;
+}
+
+__attribute__((always_inline))
+boolean_t
+vm_map_try_lock(vm_map_t map)
+{
+       if (lck_rw_try_lock_exclusive(&(map)->lock)) {
+               DTRACE_VM(vm_map_lock_w);
+               return TRUE;
+       }
+       return FALSE;
+}
+
+__attribute__((always_inline))
+boolean_t
+vm_map_try_lock_read(vm_map_t map)
+{
+       if (lck_rw_try_lock_shared(&(map)->lock)) {
+               DTRACE_VM(vm_map_lock_r);
+               return TRUE;
+       }
+       return FALSE;
+}
+
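
Note the inverted convention called out in the comment: vm_map_lock_read_to_write() returns 0 on success, so call sites read as "if (upgrade failed)". When the underlying lck_rw_lock_shared_to_exclusive() fails it also drops the shared lock entirely, so the caller must re-acquire and re-validate. A hedged usage fragment:

restart:
        vm_map_lock_read(map);
        /* ... inspect entries, decide the map must be mutated ... */
        if (vm_map_lock_read_to_write(map)) {
                /* upgrade failed and the read lock is gone: start over */
                goto restart;
        }
        /* exclusive here: safe to modify, then drop the write lock */
        vm_map_unlock(map);
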
 /*
  *     Decide if we want to allow processes to execute from their data or stack areas.
  *     override_nx() returns true if we do.  Data/stack execution can be enabled independently
@@ -640,9 +679,6 @@ vm_map_apple_protected(
         * properly page-aligned) or a "fourk_pager", itself backed by a
         * vnode pager (if 4K-aligned but not page-aligned).
         */
-#else /* __arm64__ */
-       assert(start_aligned == start);
-       assert(end_aligned == end);
 #endif /* __arm64__ */
 
        map_addr = start_aligned;
@@ -1129,10 +1165,10 @@ vm_map_create_options(
        result->size = 0;
        result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
        result->user_wire_size  = 0;
-#if __x86_64__
+#if !CONFIG_EMBEDDED
        result->vmmap_high_start = 0;
-#endif /* __x86_64__ */
-       result->map_refcnt = 1;
+#endif
+       os_ref_init_count(&result->map_refcnt, &map_refgrp, 1);
 #if     TASK_SWAPPER
        result->res_count = 1;
        result->sw_state = MAP_SW_IN;
@@ -1230,7 +1266,7 @@ _vm_map_entry_create(
 #if     MAP_ENTRY_CREATION_DEBUG
        entry->vme_creation_maphdr = map_header;
        backtrace(&entry->vme_creation_bt[0],
-           (sizeof(entry->vme_creation_bt) / sizeof(uintptr_t)));
+           (sizeof(entry->vme_creation_bt) / sizeof(uintptr_t)), NULL);
 #endif
        return entry;
 }
@@ -1310,7 +1346,7 @@ vm_map_res_reference(vm_map_t map)
 {
        /* assert map is locked */
        assert(map->res_count >= 0);
-       assert(map->map_refcnt >= map->res_count);
+       assert(os_ref_get_count(&map->map_refcnt) >= map->res_count);
        if (map->res_count == 0) {
                lck_mtx_unlock(&map->s_lock);
                vm_map_lock(map);
@@ -1337,8 +1373,8 @@ vm_map_reference_swap(vm_map_t map)
        assert(map != VM_MAP_NULL);
        lck_mtx_lock(&map->s_lock);
        assert(map->res_count >= 0);
-       assert(map->map_refcnt >= map->res_count);
-       map->map_refcnt++;
+       assert(os_ref_get_count(&map->map_refcnt) >= map->res_count);
+       os_ref_retain_locked(&map->map_refcnt);
        vm_map_res_reference(map);
        lck_mtx_unlock(&map->s_lock);
 }
@@ -1364,7 +1400,7 @@ vm_map_res_deallocate(vm_map_t map)
                vm_map_unlock(map);
                lck_mtx_lock(&map->s_lock);
        }
-       assert(map->map_refcnt >= map->res_count);
+       assert(os_ref_get_count(&map->map_refcnt) >= map->res_count);
 }
 #endif  /* MACH_ASSERT && TASK_SWAPPER */
 
@@ -2093,6 +2129,10 @@ vm_memory_malloc_no_cow(
 {
        uint64_t alias_mask;
 
+       if (alias > 63) {
+               return FALSE;
+       }
+
        alias_mask = 1ULL << alias;
        if (alias_mask & vm_memory_malloc_no_cow_mask) {
                return TRUE;
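
The new alias > 63 guard is not just range hygiene: 1ULL << alias is undefined behavior whenever the shift count reaches the width of the operand. A standalone demonstration of the guarded pattern:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
bit_is_set(uint64_t mask, unsigned bit)
{
        if (bit > 63) {
                return false;           /* out of range: never set */
        }
        return (mask & (1ULL << bit)) != 0;
}

int
main(void)
{
        uint64_t mask = (1ULL << 5) | (1ULL << 63);
        printf("%d %d %d\n", bit_is_set(mask, 5),
            bit_is_set(mask, 63), bit_is_set(mask, 64));
        return 0;
}
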
@@ -2148,6 +2188,7 @@ vm_map_enter(
        boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
        boolean_t               is_submap = vmk_flags.vmkf_submap;
        boolean_t               permanent = vmk_flags.vmkf_permanent;
+       boolean_t               no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
        boolean_t               entry_for_jit = vmk_flags.vmkf_map_jit;
        boolean_t               iokit_acct = vmk_flags.vmkf_iokit_acct;
        boolean_t               resilient_codesign = ((flags & VM_FLAGS_RESILIENT_CODESIGN) != 0);
@@ -2241,13 +2282,32 @@ vm_map_enter(
                }
        }
 
-       if (resilient_codesign || resilient_media) {
+       if (resilient_codesign) {
+               assert(!is_submap);
                if ((cur_protection & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ||
                    (max_protection & (VM_PROT_WRITE | VM_PROT_EXECUTE))) {
                        return KERN_PROTECTION_FAILURE;
                }
        }
 
+       if (resilient_media) {
+               assert(!is_submap);
+//             assert(!needs_copy);
+               if (object != VM_OBJECT_NULL &&
+                   !object->internal) {
+                       /*
+                        * This mapping is directly backed by an external
+                        * memory manager (e.g. a vnode pager for a file):
+                        * we would not have any safe place to inject
+                        * a zero-filled page if an actual page is not
+                        * available, without possibly impacting the actual
+                        * contents of the mapped object (e.g. the file),
+                        * so we can't provide any media resiliency here.
+                        */
+                       return KERN_INVALID_ARGUMENT;
+               }
+       }
+
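
Userspace reaches this validation through mmap(2) with MAP_RESILIENT_MEDIA (declared in <sys/mman.h> on Darwin). The kernel only honors the flag where it can inject zero-filled pages without touching the mapped object's real contents, so the hedged sketch below asks for a private, read-only mapping and tolerates media failures by reading zeros instead of dying on SIGBUS:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
        if (argc < 2) {
                return 1;
        }
        int fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                return 1;
        }
        off_t len = lseek(fd, 0, SEEK_END);
        char *p = mmap(NULL, (size_t)len, PROT_READ,
            MAP_PRIVATE | MAP_RESILIENT_MEDIA, fd, 0);
        if (p == MAP_FAILED) {
                return 1;
        }
        /* a failing page reads back as zeros instead of raising SIGBUS */
        unsigned long sum = 0;
        for (off_t i = 0; i < len; i++) {
                sum += (unsigned char)p[i];
        }
        printf("checksum %lu over %lld bytes\n", sum, (long long)len);
        munmap(p, (size_t)len);
        close(fd);
        return 0;
}
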
        if (is_submap) {
                if (purgable) {
                        /* submaps can not be purgeable */
@@ -2285,7 +2345,15 @@ vm_map_enter(
 #endif  /* __arm__ */
                effective_max_offset = 0x00000000FFFFF000ULL;
        } else {
+#if     !defined(CONFIG_EMBEDDED)
+               if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
+                       effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
+               } else {
+                       effective_max_offset = map->max_offset;
+               }
+#else
                effective_max_offset = map->max_offset;
+#endif
        }
 
        if (size == 0 ||
@@ -2392,13 +2460,13 @@ StartAgain:;
                        }
                        start = *address;
                }
-#if __x86_64__
+#if !CONFIG_EMBEDDED
                else if ((start == 0 || start == vm_map_min(map)) &&
                    !map->disable_vmentry_reuse &&
                    map->vmmap_high_start != 0) {
                        start = map->vmmap_high_start;
                }
-#endif /* __x86_64__ */
+#endif
 
 
                /*
@@ -2815,6 +2883,7 @@ StartAgain:;
            (!entry->vme_resilient_codesign) &&
            (!entry->vme_resilient_media) &&
            (!entry->vme_atomic) &&
+           (entry->vme_no_copy_on_read == no_copy_on_read) &&
 
            ((entry->vme_end - entry->vme_start) + size <=
            (user_alias == VM_MEMORY_REALLOC ?
@@ -2888,6 +2957,7 @@ StartAgain:;
                                0,
                                no_cache,
                                permanent,
+                               no_copy_on_read,
                                superpage_size,
                                clear_map_aligned,
                                is_submap,
@@ -2903,8 +2973,8 @@ StartAgain:;
                        }
 
                        if (resilient_media &&
-                           !((cur_protection | max_protection) &
-                           (VM_PROT_WRITE | VM_PROT_EXECUTE))) {
+                           (object == VM_OBJECT_NULL ||
+                           object->internal)) {
                                new_entry->vme_resilient_media = TRUE;
                        }
 
@@ -2955,13 +3025,13 @@ StartAgain:;
                                assert(!new_entry->iokit_acct);
                                submap = (vm_map_t) object;
                                submap_is_64bit = vm_map_is_64bit(submap);
-                               use_pmap = (user_alias == VM_MEMORY_SHARED_PMAP);
+                               use_pmap = vmk_flags.vmkf_nested_pmap;
 #ifndef NO_NESTED_PMAP
                                if (use_pmap && submap->pmap == NULL) {
                                        ledger_t ledger = map->pmap->ledger;
                                        /* we need a sub pmap to nest... */
-                                       submap->pmap = pmap_create(ledger, 0,
-                                           submap_is_64bit);
+                                       submap->pmap = pmap_create_options(ledger, 0,
+                                           submap_is_64bit ? PMAP_CREATE_64BIT : 0);
                                        if (submap->pmap == NULL) {
                                                /* let's proceed without nesting... */
                                        }
@@ -3264,6 +3334,7 @@ vm_map_enter_fourk(
        boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
        boolean_t               is_submap = vmk_flags.vmkf_submap;
        boolean_t               permanent = vmk_flags.vmkf_permanent;
+       boolean_t               no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
        boolean_t               entry_for_jit = vmk_flags.vmkf_map_jit;
 //     boolean_t               iokit_acct = vmk_flags.vmkf_iokit_acct;
        unsigned int            superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
@@ -3532,7 +3603,8 @@ vm_map_enter_fourk(
            copy_object,
            0,                         /* offset */
            FALSE,                         /* needs_copy */
-           FALSE, FALSE,
+           FALSE,
+           FALSE,
            cur_protection, max_protection,
            VM_BEHAVIOR_DEFAULT,
            ((entry_for_jit)
@@ -3541,6 +3613,7 @@ vm_map_enter_fourk(
            0,
            no_cache,
            permanent,
+           no_copy_on_read,
            superpage_size,
            clear_map_aligned,
            is_submap,
@@ -5194,7 +5267,7 @@ vm_map_clip_unnest(
        pmap_unnest(map->pmap,
            entry->vme_start,
            entry->vme_end - entry->vme_start);
-       if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) {
+       if ((map->mapped_in_other_pmaps) && os_ref_get_count(&map->map_refcnt) != 0) {
                /* clean up parent map/maps */
                vm_map_submap_pmap_clean(
                        map, entry->vme_start,
@@ -5599,8 +5672,8 @@ vm_map_submap(
                        /* nest if platform code will allow */
                        if (submap->pmap == NULL) {
                                ledger_t ledger = map->pmap->ledger;
-                               submap->pmap = pmap_create(ledger,
-                                   (vm_map_size_t) 0, FALSE);
+                               submap->pmap = pmap_create_options(ledger,
+                                   (vm_map_size_t) 0, 0);
                                if (submap->pmap == PMAP_NULL) {
                                        vm_map_unlock(map);
                                        return KERN_NO_SPACE;
@@ -5652,10 +5725,6 @@ vm_map_protect(
        int                             pmap_options = 0;
        kern_return_t                   kr;
 
-       XPR(XPR_VM_MAP,
-           "vm_map_protect, 0x%X start 0x%X end 0x%X, new 0x%X %d",
-           map, start, end, new_prot, set_max);
-
        if (new_prot & VM_PROT_COPY) {
                vm_map_offset_t         new_start;
                vm_prot_t               cur_prot, max_prot;
@@ -7349,8 +7418,9 @@ vm_map_submap_pmap_clean(
                                VME_SUBMAP(entry),
                                VME_OFFSET(entry));
                } else {
-                       if ((map->mapped_in_other_pmaps) && (map->map_refcnt)
-                           && (VME_OBJECT(entry) != NULL)) {
+                       if (map->mapped_in_other_pmaps &&
+                           os_ref_get_count(&map->map_refcnt) != 0 &&
+                           VME_OBJECT(entry) != NULL) {
                                vm_object_pmap_protect_options(
                                        VME_OBJECT(entry),
                                        (VME_OFFSET(entry) +
@@ -7385,8 +7455,9 @@ vm_map_submap_pmap_clean(
                                VME_SUBMAP(entry),
                                VME_OFFSET(entry));
                } else {
-                       if ((map->mapped_in_other_pmaps) && (map->map_refcnt)
-                           && (VME_OBJECT(entry) != NULL)) {
+                       if (map->mapped_in_other_pmaps &&
+                           os_ref_get_count(&map->map_refcnt) != 0 &&
+                           VME_OBJECT(entry) != NULL) {
                                vm_object_pmap_protect_options(
                                        VME_OBJECT(entry),
                                        VME_OFFSET(entry),
@@ -7479,16 +7550,23 @@ vm_map_guard_exception(
        unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
        unsigned int target = 0; /* should we pass in pid associated with map? */
        mach_exception_data_type_t subcode = (uint64_t)gap_start;
+       boolean_t fatal = FALSE;
+
+       task_t task = current_task();
 
        /* Can't deliver exceptions to kernel task */
-       if (current_task() == kernel_task) {
+       if (task == kernel_task) {
                return;
        }
 
        EXC_GUARD_ENCODE_TYPE(code, guard_type);
        EXC_GUARD_ENCODE_FLAVOR(code, reason);
        EXC_GUARD_ENCODE_TARGET(code, target);
-       thread_guard_violation(current_thread(), code, subcode);
+
+       if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
+               fatal = TRUE;
+       }
+       thread_guard_violation(current_thread(), code, subcode, fatal);
 }
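
The guard code is a packed 64-bit value; this change only adds the per-task fatality decision and leaves delivery in place. A standalone model of the packing and the new decision, with an illustrative bit layout and flag value (the kernel's actual field widths live in the EXC_GUARD_ENCODE_* macros):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TASK_EXC_GUARD_VM_FATAL  0x40      /* illustrative flag bit */

/* illustrative layout: 3-bit type, 29-bit flavor, 32-bit target */
static uint64_t
encode_guard(uint64_t type, uint64_t flavor, uint64_t target)
{
        uint64_t code = 0;
        code |= (type & 0x7ULL) << 61;
        code |= (flavor & 0x1fffffffULL) << 32;
        code |= target & 0xffffffffULL;
        return code;
}

int
main(void)
{
        unsigned task_exc_guard = TASK_EXC_GUARD_VM_FATAL; /* per-task policy */
        bool fatal = (task_exc_guard & TASK_EXC_GUARD_VM_FATAL) != 0;
        uint64_t code = encode_guard(5 /* illustrative type */,
            1 /* illustrative flavor */, 0);
        printf("code 0x%016llx %s\n", (unsigned long long)code,
            fatal ? "fatal" : "non-fatal");
        return 0;
}
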
 
 /*
@@ -7518,8 +7596,8 @@ vm_map_delete(
        unsigned int            last_timestamp = ~0; /* unlikely value */
        int                     interruptible;
        vm_map_offset_t         gap_start;
-       vm_map_offset_t         save_start = start;
-       vm_map_offset_t         save_end = end;
+       __unused vm_map_offset_t save_start = start;
+       __unused vm_map_offset_t save_end = end;
        const vm_map_offset_t   FIND_GAP = 1;   /* a not page aligned value */
        const vm_map_offset_t   GAPS_OK = 2;    /* a different not page aligned value */
 
@@ -7609,7 +7687,7 @@ vm_map_delete(
                        SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
                } else {
                        if (map->pmap == kernel_pmap &&
-                           map->map_refcnt != 0) {
+                           os_ref_get_count(&map->map_refcnt) != 0) {
                                panic("vm_map_delete(%p,0x%llx,0x%llx): "
                                    "no map entry at 0x%llx\n",
                                    map,
@@ -8041,7 +8119,8 @@ vm_map_delete(
                                        entry->vme_end - entry->vme_start,
                                        pmap_flags);
 #endif  /* NO_NESTED_PMAP */
-                               if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) {
+                               if (map->mapped_in_other_pmaps &&
+                                   os_ref_get_count(&map->map_refcnt) != 0) {
                                        /* clean up parent map/maps */
                                        vm_map_submap_pmap_clean(
                                                map, entry->vme_start,
@@ -8058,7 +8137,8 @@ vm_map_delete(
                } else if (VME_OBJECT(entry) != kernel_object &&
                    VME_OBJECT(entry) != compressor_object) {
                        object = VME_OBJECT(entry);
-                       if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) {
+                       if (map->mapped_in_other_pmaps &&
+                           os_ref_get_count(&map->map_refcnt) != 0) {
                                vm_object_pmap_protect_options(
                                        object, VME_OFFSET(entry),
                                        entry->vme_end - entry->vme_start,
@@ -8113,7 +8193,7 @@ vm_map_delete(
                next = entry->vme_next;
 
                if (map->pmap == kernel_pmap &&
-                   map->map_refcnt != 0 &&
+                   os_ref_get_count(&map->map_refcnt) != 0 &&
                    entry->vme_end < end &&
                    (next == vm_map_to_entry(map) ||
                    next->vme_start != entry->vme_end)) {
@@ -8229,18 +8309,6 @@ vm_map_delete(
                    vm_map_offset_t, save_start,
                    vm_map_offset_t, save_end);
                if (!(flags & VM_MAP_REMOVE_GAPS_OK)) {
-#if defined(DEVELOPMENT) || defined(DEBUG)
-                       /* log just once if not checking, otherwise log each one */
-                       if (!map->warned_delete_gap ||
-                           (task_exc_guard_default & TASK_EXC_GUARD_VM_ALL) != 0) {
-                               printf("vm_map_delete: map %p [%p...%p] nothing at %p\n",
-                                   (void *)map, (void *)save_start, (void *)save_end,
-                                   (void *)gap_start);
-                               if (!map->warned_delete_gap) {
-                                       map->warned_delete_gap = 1;
-                               }
-                       }
-#endif
                        vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
                }
        }
@@ -8931,7 +8999,7 @@ start_overwrite:
                                        entry->is_sub_map = FALSE;
                                        vm_map_deallocate(
                                                VME_SUBMAP(entry));
-                                       VME_OBJECT_SET(entry, NULL);
+                                       VME_OBJECT_SET(entry, VM_OBJECT_NULL);
                                        VME_OFFSET_SET(entry, 0);
                                        entry->is_shared = FALSE;
                                        entry->needs_copy = FALSE;
@@ -9611,7 +9679,7 @@ vm_map_copy_overwrite_unaligned(
                        }
                        dst_object = vm_object_allocate((vm_map_size_t)
                            entry->vme_end - entry->vme_start);
-                       VME_OBJECT(entry) = dst_object;
+                       VME_OBJECT_SET(entry, dst_object);
                        VME_OFFSET_SET(entry, 0);
                        assert(entry->use_pmap);
                        vm_map_lock_write_to_read(dst_map);
@@ -10735,6 +10803,7 @@ StartAgain:;
                while (entry != vm_map_copy_to_entry(copy)) {
                        new = vm_map_copy_entry_create(copy, !copy->cpy_hdr.entries_pageable);
                        vm_map_entry_copy_full(new, entry);
+                       new->vme_no_copy_on_read = FALSE;
                        assert(!new->iokit_acct);
                        if (new->is_sub_map) {
                                /* clr address space specifics */
@@ -11080,8 +11149,6 @@ vm_map_copyin_internal(
                           src_destroy, copy_result);
        }
 
-       XPR(XPR_VM_MAP, "vm_map_copyin_common map 0x%x addr 0x%x len 0x%x dest %d\n", src_map, src_addr, len, src_destroy, 0);
-
        /*
         *      Allocate a header element for the list.
         *
@@ -11342,13 +11409,10 @@ vm_map_copyin_internal(
 
 
 RestartCopy:
-               XPR(XPR_VM_MAP, "vm_map_copyin_common src_obj 0x%x ent 0x%x obj 0x%x was_wired %d\n",
-                   src_object, new_entry, VME_OBJECT(new_entry),
-                   was_wired, 0);
                if ((src_object == VM_OBJECT_NULL ||
                    (!was_wired && !map_share && !tmp_entry->is_shared)) &&
                    vm_object_copy_quickly(
-                           &VME_OBJECT(new_entry),
+                           VME_OBJECT_PTR(new_entry),
                            src_offset,
                            src_size,
                            &src_needs_copy,
@@ -11425,7 +11489,7 @@ CopySlowly:
                                src_offset,
                                src_size,
                                THREAD_UNINT,
-                               &VME_OBJECT(new_entry));
+                               VME_OBJECT_PTR(new_entry));
                        VME_OFFSET_SET(new_entry, 0);
                        new_entry->needs_copy = FALSE;
                } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
@@ -11455,7 +11519,7 @@ CopySlowly:
                        result = vm_object_copy_strategically(src_object,
                            src_offset,
                            src_size,
-                           &VME_OBJECT(new_entry),
+                           VME_OBJECT_PTR(new_entry),
                            &new_offset,
                            &new_entry_needs_copy);
                        if (new_offset != VME_OFFSET(new_entry)) {
@@ -12368,7 +12432,12 @@ vm_map_fork(
 #error Unknown architecture.
 #endif
 
-       new_pmap = pmap_create(ledger, (vm_map_size_t) 0, pmap_is64bit);
+       unsigned int pmap_flags = 0;
+       pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
+#if defined(HAS_APPLE_PAC)
+       pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
+#endif
+       new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
 
        vm_map_reference_swap(old_map);
        vm_map_lock(old_map);
@@ -12473,7 +12542,7 @@ vm_map_fork(
                        }
 
                        if (!vm_object_copy_quickly(
-                                   &VME_OBJECT(new_entry),
+                                   VME_OBJECT_PTR(new_entry),
                                    VME_OFFSET(old_entry),
                                    (old_entry->vme_end -
                                    old_entry->vme_start),
@@ -12711,6 +12780,7 @@ submap_recurse:
                vm_map_entry_t          submap_entry;
                vm_prot_t               subentry_protection;
                vm_prot_t               subentry_max_protection;
+               boolean_t               subentry_no_copy_on_read;
                boolean_t               mapped_needs_copy = FALSE;
 
                local_vaddr = vaddr;
@@ -12920,6 +12990,7 @@ RetrySubMap:
 
                        subentry_protection = submap_entry->protection;
                        subentry_max_protection = submap_entry->max_protection;
+                       subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
                        vm_map_unlock(map);
                        submap_entry = NULL; /* not valid after map unlock */
 
@@ -12996,6 +13067,8 @@ RetrySubMap:
                                entry->protection |= subentry_protection;
                        }
                        entry->max_protection |= subentry_max_protection;
+                       /* propagate no_copy_on_read */
+                       entry->vme_no_copy_on_read = subentry_no_copy_on_read;
 
                        if ((entry->protection & VM_PROT_WRITE) &&
                            (entry->protection & VM_PROT_EXECUTE) &&
@@ -13209,6 +13282,8 @@ protection_failure:
 #endif /* CONFIG_PMAP_CS */
                fault_info->mark_zf_absent = FALSE;
                fault_info->batch_pmap_op = FALSE;
+               fault_info->resilient_media = entry->vme_resilient_media;
+               fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
        }
 
        /*
@@ -13348,6 +13423,9 @@ vm_map_region_recurse_64(
                if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
                        *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
                }
+               if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
+                       *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
+               }
        }
 
        user_address = *address;
@@ -13534,23 +13612,19 @@ recurse_again:
                    next_entry == NULL && /* & there are no more regions */
                    /* & we haven't already provided our fake region: */
                    user_address <= vm_map_last_entry(map)->vme_end) {
-                       ledger_amount_t nonvol, nonvol_compressed;
+                       ledger_amount_t ledger_resident, ledger_compressed;
+
                        /*
                         * Add a fake memory region to account for
-                        * purgeable memory that counts towards this
-                        * task's memory footprint, i.e. the resident
-                        * compressed pages of non-volatile objects
-                        * owned by that task.
+                        * purgeable and/or ledger-tagged memory that
+                        * counts towards this task's memory footprint,
+                        * i.e. the resident/compressed pages of non-volatile
+                        * objects owned by that task.
                         */
-                       ledger_get_balance(
-                               map->pmap->ledger,
-                               task_ledgers.purgeable_nonvolatile,
-                               &nonvol);
-                       ledger_get_balance(
-                               map->pmap->ledger,
-                               task_ledgers.purgeable_nonvolatile_compressed,
-                               &nonvol_compressed);
-                       if (nonvol + nonvol_compressed == 0) {
+                       task_ledgers_footprint(map->pmap->ledger,
+                           &ledger_resident,
+                           &ledger_compressed);
+                       if (ledger_resident + ledger_compressed == 0) {
                                /* no purgeable memory usage to report */
                                return KERN_INVALID_ADDRESS;
                        }
@@ -13561,9 +13635,9 @@ recurse_again:
                                submap_info->inheritance = VM_INHERIT_DEFAULT;
                                submap_info->offset = 0;
                                submap_info->user_tag = -1;
-                               submap_info->pages_resident = (unsigned int) (nonvol / PAGE_SIZE);
+                               submap_info->pages_resident = (unsigned int) (ledger_resident / PAGE_SIZE);
                                submap_info->pages_shared_now_private = 0;
-                               submap_info->pages_swapped_out = (unsigned int) (nonvol_compressed / PAGE_SIZE);
+                               submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / PAGE_SIZE);
                                submap_info->pages_dirtied = submap_info->pages_resident;
                                submap_info->ref_count = 1;
                                submap_info->shadow_depth = 0;
@@ -13590,7 +13664,7 @@ recurse_again:
                                short_info->ref_count = 1;
                        }
                        *nesting_depth = 0;
-                       *size = (vm_map_size_t) (nonvol + nonvol_compressed);
+                       *size = (vm_map_size_t) (ledger_resident + ledger_compressed);
 //                     *address = user_address;
                        *address = vm_map_last_entry(map)->vme_end;
                        return KERN_SUCCESS;
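
For illustration, the byte-to-page conversion behind this fake region, as a standalone sketch; PAGE_SIZE and the ledger balances are assumptions, not kernel code:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096              /* assumed page size for the sketch */

    typedef int64_t ledger_amount_t;

    /* Mirrors how the fake region turns ledger byte balances into the
     * page counts and region size reported to the caller. */
    static void
    report_fake_region(ledger_amount_t ledger_resident,
        ledger_amount_t ledger_compressed)
    {
            unsigned int pages_resident = (unsigned int)(ledger_resident / PAGE_SIZE);
            unsigned int pages_swapped_out = (unsigned int)(ledger_compressed / PAGE_SIZE);
            uint64_t region_size = (uint64_t)(ledger_resident + ledger_compressed);

            if (region_size == 0) {
                    printf("no footprint to report (KERN_INVALID_ADDRESS)\n");
                    return;
            }
            printf("fake region: %u resident, %u swapped out, %llu bytes\n",
                pages_resident, pages_swapped_out,
                (unsigned long long)region_size);
    }

    int
    main(void)
    {
            report_fake_region(8 * PAGE_SIZE, 3 * PAGE_SIZE);
            return 0;
    }
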
@@ -13706,7 +13780,7 @@ recurse_again:
                        } else {
                                extended.share_mode = SM_PRIVATE;
                        }
-                       extended.ref_count = VME_SUBMAP(curr_entry)->map_refcnt;
+                       extended.ref_count = os_ref_get_count(&VME_SUBMAP(curr_entry)->map_refcnt);
                }
        }
 
@@ -13724,6 +13798,9 @@ recurse_again:
                if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
                        submap_info->pages_reusable = extended.pages_reusable;
                }
+               if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
+                       submap_info->object_id_full = (vm_object_id_t) (VME_OBJECT(curr_entry) != NULL) ? VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry)) : 0ULL;
+               }
        } else {
                short_info->external_pager = extended.external_pager;
                short_info->shadow_depth = extended.shadow_depth;
@@ -14039,7 +14116,7 @@ vm_map_region_top_walk(
                                    OBJ_RESIDENT_COUNT(obj, entry_size);
                        } else {
                                if (ref_count == 1 ||
-                                   (ref_count == 2 && !(obj->pager_trusted) && !(obj->internal))) {
+                                   (ref_count == 2 && obj->named)) {
                                        top->share_mode = SM_PRIVATE;
                                        top->private_pages_resident =
                                            OBJ_RESIDENT_COUNT(obj,
@@ -14235,7 +14312,7 @@ collect_object_info:
                shadow_object = obj->shadow;
                shadow_depth = 0;
 
-               if (!(obj->pager_trusted) && !(obj->internal)) {
+               if (!(obj->internal)) {
                        extended->external_pager = 1;
                }
 
@@ -14246,8 +14323,7 @@ collect_object_info:
                            shadow_depth++) {
                                vm_object_t     next_shadow;
 
-                               if (!(shadow_object->pager_trusted) &&
-                                   !(shadow_object->internal)) {
+                               if (!(shadow_object->internal)) {
                                        extended->external_pager = 1;
                                }
 
@@ -14342,7 +14418,7 @@ vm_map_region_look_for_page(
 
 
        while (TRUE) {
-               if (!(object->pager_trusted) && !(object->internal)) {
+               if (!(object->internal)) {
                        extended->external_pager = 1;
                }
 
@@ -14506,6 +14582,7 @@ vm_map_simplify_entry(
            this_entry->vme_resilient_codesign) &&
            (prev_entry->vme_resilient_media ==
            this_entry->vme_resilient_media) &&
+           (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
 
            (prev_entry->wired_count == this_entry->wired_count) &&
            (prev_entry->user_wired_count == this_entry->user_wired_count) &&
@@ -14751,10 +14828,6 @@ vm_map_behavior_set(
        vm_map_entry_t  entry;
        vm_map_entry_t  temp_entry;
 
-       XPR(XPR_VM_MAP,
-           "vm_map_behavior_set, 0x%X start 0x%X end 0x%X behavior %d",
-           map, start, end, new_behavior, 0);
-
        if (start > end ||
            start < vm_map_min(map) ||
            end > vm_map_max(map)) {
@@ -14847,9 +14920,9 @@ vm_map_behavior_set(
 /*
  * Internals for madvise(MADV_WILLNEED) system call.
  *
- * The present implementation is to do a read-ahead if the mapping corresponds
- * to a mapped regular file.  If it's an anonymous mapping, then we do nothing
- * and basically ignore the "advice" (which we are always free to do).
+ * The implementation does one of two things:
+ * a) a read-ahead if the mapping corresponds to a mapped regular file, or
+ * b) fault in the pages (zero-fill, decompress, etc.) if it's an anonymous mapping.
  */
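
For context, this path is reachable from user space through madvise(2). A minimal sketch of both cases (file-backed read-ahead, anonymous pre-fault), assuming a POSIX environment; it is not part of this patch:

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int
    main(void)
    {
            size_t len = 4 * 4096;

            /* Case (a): file-backed mapping -> kernel issues a read-ahead. */
            int fd = open("/etc/services", O_RDONLY);
            if (fd < 0) {
                    return 1;
            }
            void *file_map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
            if (file_map != MAP_FAILED) {
                    madvise(file_map, len, MADV_WILLNEED);
            }

            /* Case (b): anonymous mapping -> kernel now pre-faults the
             * pages (zero-fill or decompress) instead of ignoring the hint. */
            void *anon_map = mmap(NULL, len, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANON, -1, 0);
            if (anon_map != MAP_FAILED) {
                    madvise(anon_map, len, MADV_WILLNEED);
            }

            close(fd);
            return 0;
    }
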
 
 
@@ -14929,69 +15002,98 @@ vm_map_willneed(
                }
 
                /*
-                * If there's no read permission to this mapping, then just
-                * skip it.
+                * If the entry is a submap OR there's no read permission
+                * to this mapping, then just skip it.
                 */
-               if ((entry->protection & VM_PROT_READ) == 0) {
+               if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
                        entry = entry->vme_next;
                        start = entry->vme_start;
                        continue;
                }
 
-               /*
-                * Find the file object backing this map entry.  If there is
-                * none, then we simply ignore the "will need" advice for this
-                * entry and go on to the next one.
-                */
-               if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
-                       entry = entry->vme_next;
-                       start = entry->vme_start;
-                       continue;
-               }
+               object = VME_OBJECT(entry);
 
-               /*
-                * The data_request() could take a long time, so let's
-                * release the map lock to avoid blocking other threads.
-                */
-               vm_map_unlock_read(map);
+               if (object == NULL ||
+                   (object && object->internal)) {
+                       /*
+                        * Memory range backed by anonymous memory.
+                        */
+                       vm_size_t region_size = 0, effective_page_size = 0;
+                       vm_map_offset_t addr = 0, effective_page_mask = 0;
 
-               vm_object_paging_begin(object);
-               pager = object->pager;
-               vm_object_unlock(object);
+                       region_size = len;
+                       addr = start;
 
-               /*
-                * Get the data from the object asynchronously.
-                *
-                * Note that memory_object_data_request() places limits on the
-                * amount of I/O it will do.  Regardless of the len we
-                * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
-                * silently truncates the len to that size.  This isn't
-                * necessarily bad since madvise shouldn't really be used to
-                * page in unlimited amounts of data.  Other Unix variants
-                * limit the willneed case as well.  If this turns out to be an
-                * issue for developers, then we can always adjust the policy
-                * here and still be backwards compatible since this is all
-                * just "advice".
-                */
-               kr = memory_object_data_request(
-                       pager,
-                       offset + object->paging_offset,
-                       0,      /* ignored */
-                       VM_PROT_READ,
-                       (memory_object_fault_info_t)&fault_info);
+                       effective_page_mask = MAX(vm_map_page_mask(current_map()), PAGE_MASK);
+                       effective_page_size = effective_page_mask + 1;
 
-               vm_object_lock(object);
-               vm_object_paging_end(object);
-               vm_object_unlock(object);
+                       vm_map_unlock_read(map);
 
-               /*
-                * If we couldn't do the I/O for some reason, just give up on
-                * the madvise.  We still return success to the user since
-                * madvise isn't supposed to fail when the advice can't be
-                * taken.
-                */
-               if (kr != KERN_SUCCESS) {
-                       return KERN_SUCCESS;
+                       while (region_size) {
+                               vm_pre_fault(
+                                       vm_map_trunc_page(addr, effective_page_mask),
+                                       VM_PROT_READ | VM_PROT_WRITE);
+
+                               region_size -= effective_page_size;
+                               addr += effective_page_size;
+                       }
+               } else {
+                       /*
+                        * Find the file object backing this map entry.  If there is
+                        * none, then we simply ignore the "will need" advice for this
+                        * entry and go on to the next one.
+                        */
+                       if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
+                               entry = entry->vme_next;
+                               start = entry->vme_start;
+                               continue;
+                       }
+
+                       vm_object_paging_begin(object);
+                       pager = object->pager;
+                       vm_object_unlock(object);
+
+                       /*
+                        * The data_request() could take a long time, so let's
+                        * release the map lock to avoid blocking other threads.
+                        */
+                       vm_map_unlock_read(map);
+
+                       /*
+                        * Get the data from the object asynchronously.
+                        *
+                        * Note that memory_object_data_request() places limits on the
+                        * amount of I/O it will do.  Regardless of the len we
+                        * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
+                        * silently truncates the len to that size.  This isn't
+                        * necessarily bad since madvise shouldn't really be used to
+                        * page in unlimited amounts of data.  Other Unix variants
+                        * limit the willneed case as well.  If this turns out to be an
+                        * issue for developers, then we can always adjust the policy
+                        * here and still be backwards compatible since this is all
+                        * just "advice".
+                        */
+                       kr = memory_object_data_request(
+                               pager,
+                               offset + object->paging_offset,
+                               0,      /* ignored */
+                               VM_PROT_READ,
+                               (memory_object_fault_info_t)&fault_info);
+
+                       vm_object_lock(object);
+                       vm_object_paging_end(object);
+                       vm_object_unlock(object);
+
+                       /*
+                        * If we couldn't do the I/O for some reason, just give up on
+                        * the madvise.  We still return success to the user since
+                        * madvise isn't supposed to fail when the advice can't be
+                        * taken.
+                        */
+
+                       if (kr != KERN_SUCCESS) {
+                               return KERN_SUCCESS;
+                       }
                }
 
                start += len;
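
The page-rounding arithmetic above deserves spelling out. A self-contained sketch of the same truncate-and-step loop, assuming 16 KiB map pages and 4 KiB kernel pages for concreteness:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* vm_map_trunc_page(addr, mask) is (addr & ~mask) in the kernel. */
    #define TRUNC_PAGE(addr, mask) ((addr) & ~((uint64_t)(mask)))

    int
    main(void)
    {
            uint64_t map_page_mask = 0x3fff;   /* 16 KiB map pages (assumed) */
            uint64_t kern_page_mask = 0xfff;   /* 4 KiB kernel pages (assumed) */

            uint64_t effective_page_mask = MAX(map_page_mask, kern_page_mask);
            uint64_t effective_page_size = effective_page_mask + 1;

            uint64_t addr = 0x100003210;       /* unaligned start */
            uint64_t region_size = 3 * effective_page_size;

            while (region_size) {
                    /* each step touches one page, starting at its base */
                    printf("pre-fault page at 0x%llx\n",
                        (unsigned long long)TRUNC_PAGE(addr, effective_page_mask));
                    region_size -= effective_page_size;
                    addr += effective_page_size;
            }
            return 0;
    }
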
@@ -15480,6 +15582,7 @@ vm_map_entry_insert(
        unsigned                wired_count,
        boolean_t               no_cache,
        boolean_t               permanent,
+       boolean_t               no_copy_on_read,
        unsigned int            superpage_size,
        boolean_t               clear_map_aligned,
        boolean_t               is_submap,
@@ -15563,9 +15666,6 @@ vm_map_entry_insert(
                {
                        new_entry->used_for_jit = TRUE;
                        map->jit_entry_exists = TRUE;
-
-                       /* Tell the pmap that it supports JIT. */
-                       pmap_set_jit_entitled(map->pmap);
                }
        } else {
                new_entry->used_for_jit = FALSE;
@@ -15575,6 +15675,7 @@ vm_map_entry_insert(
        new_entry->vme_resilient_codesign = FALSE;
        new_entry->vme_resilient_media = FALSE;
        new_entry->vme_atomic = FALSE;
+       new_entry->vme_no_copy_on_read = no_copy_on_read;
 
        /*
         *      Insert the new entry into the list.
@@ -15706,7 +15807,8 @@ vm_map_remap_extract(
                                 * This entry uses "IOKit accounting".
                                 */
                        } else if (object != VM_OBJECT_NULL &&
-                           object->purgable != VM_PURGABLE_DENY) {
+                           (object->purgable != VM_PURGABLE_DENY ||
+                           object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
                                /*
                                 * Purgeable objects have their own accounting:
                                 * no pmap accounting for them.
@@ -15852,16 +15954,20 @@ vm_map_remap_extract(
                 */
 RestartCopy:
                if (!copy) {
-                       /*
-                        * Cannot allow an entry describing a JIT
-                        * region to be shared across address spaces.
-                        */
-                       if (src_entry->used_for_jit == TRUE && !same_map) {
+                       if (src_entry->used_for_jit == TRUE) {
+                               if (same_map) {
+                               } else {
 #if CONFIG_EMBEDDED
-                               result = KERN_INVALID_ARGUMENT;
-                               break;
+                                       /*
+                                        * Cannot allow an entry describing a JIT
+                                        * region to be shared across address spaces.
+                                        */
+                                       result = KERN_INVALID_ARGUMENT;
+                                       break;
 #endif /* CONFIG_EMBEDDED */
+                               }
                        }
+
                        src_entry->is_shared = TRUE;
                        new_entry->is_shared = TRUE;
                        if (!(new_entry->is_sub_map)) {
@@ -15873,7 +15979,7 @@ RestartCopy:
                        new_entry->needs_copy = TRUE;
                        object = VM_OBJECT_NULL;
                } else if (src_entry->wired_count == 0 &&
-                   vm_object_copy_quickly(&VME_OBJECT(new_entry),
+                   vm_object_copy_quickly(VME_OBJECT_PTR(new_entry),
                    VME_OFFSET(new_entry),
                    (new_entry->vme_end -
                    new_entry->vme_start),
@@ -15946,7 +16052,7 @@ RestartCopy:
                                        (new_entry->vme_end -
                                        new_entry->vme_start),
                                        THREAD_UNINT,
-                                       &VME_OBJECT(new_entry));
+                                       VME_OBJECT_PTR(new_entry));
 
                                VME_OFFSET_SET(new_entry, 0);
                                new_entry->needs_copy = FALSE;
@@ -15959,7 +16065,7 @@ RestartCopy:
                                        offset,
                                        (new_entry->vme_end -
                                        new_entry->vme_start),
-                                       &VME_OBJECT(new_entry),
+                                       VME_OBJECT_PTR(new_entry),
                                        &new_offset,
                                        &new_entry_needs_copy);
                                if (new_offset != VME_OFFSET(new_entry)) {
@@ -16126,6 +16232,13 @@ vm_map_remap(
                return KERN_INVALID_ARGUMENT;
        }
 
+       if (flags & VM_FLAGS_RESILIENT_MEDIA) {
+               /* must be copy-on-write to be "media resilient" */
+               if (!copy) {
+                       return KERN_INVALID_ARGUMENT;
+               }
+       }
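
A hedged user-space sketch of how this flag would be passed in: mach_vm_remap() with copy=TRUE, since a non-copy remap is now rejected. The flag's availability in mach/vm_statistics.h and the source range are assumptions, not part of this patch:

    #include <mach/mach.h>
    #include <mach/mach_vm.h>
    #include <stdio.h>

    int
    main(void)
    {
            mach_vm_address_t src = 0, dst = 0;
            mach_vm_size_t size = 4 * 16384;
            vm_prot_t cur = VM_PROT_NONE, max = VM_PROT_NONE;
            kern_return_t kr;

            /* Source: an anonymous region in our own task. */
            kr = mach_vm_allocate(mach_task_self(), &src, size, VM_FLAGS_ANYWHERE);
            if (kr != KERN_SUCCESS) {
                    return 1;
            }

            /* copy=TRUE is mandatory: a shared (non-copy) remap with
             * VM_FLAGS_RESILIENT_MEDIA returns KERN_INVALID_ARGUMENT. */
            kr = mach_vm_remap(mach_task_self(), &dst, size, 0,
                VM_FLAGS_ANYWHERE | VM_FLAGS_RESILIENT_MEDIA,
                mach_task_self(), src, TRUE /* copy */,
                &cur, &max, VM_INHERIT_DEFAULT);
            printf("remap: kr=%d dst=0x%llx\n", kr, (unsigned long long)dst);
            return 0;
    }
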
+
        result = vm_map_remap_extract(src_map, memory_address,
            size, copy, &map_header,
            cur_protection,
@@ -16165,6 +16278,12 @@ vm_map_remap(
                        entry->vme_start += *address;
                        entry->vme_end += *address;
                        assert(!entry->map_aligned);
+                       if ((flags & VM_FLAGS_RESILIENT_MEDIA) &&
+                           !entry->is_sub_map &&
+                           (VME_OBJECT(entry) == VM_OBJECT_NULL ||
+                           VME_OBJECT(entry)->internal)) {
+                               entry->vme_resilient_media = TRUE;
+                       }
                        vm_map_store_entry_link(target_map, insp_entry, entry,
                            vmk_flags);
                        insp_entry = entry;
@@ -16876,6 +16995,7 @@ vm_map_page_range_info_internal(
        vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
        vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
        boolean_t               do_region_footprint;
+       ledger_amount_t         ledger_resident, ledger_compressed;
 
        switch (flavor) {
        case VM_PAGE_INFO_BASIC:
@@ -16913,6 +17033,8 @@ vm_map_page_range_info_internal(
 
        vm_map_lock_read(map);
 
+       task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
+
        for (curr_s_offset = start; curr_s_offset < end;) {
                /*
                 * New lookup needs reset of these variables.
@@ -16924,8 +17046,6 @@ vm_map_page_range_info_internal(
 
                if (do_region_footprint &&
                    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
-                       ledger_amount_t nonvol_compressed;
-
                        /*
                         * Request for "footprint" info about a page beyond
                         * the end of address space: this must be for
@@ -16934,13 +17054,9 @@ vm_map_page_range_info_internal(
                         * memory owned by this task.
                         */
                        disposition = 0;
-                       nonvol_compressed = 0;
-                       ledger_get_balance(
-                               map->pmap->ledger,
-                               task_ledgers.purgeable_nonvolatile_compressed,
-                               &nonvol_compressed);
+
                        if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
-                           (unsigned) nonvol_compressed) {
+                           (unsigned) ledger_compressed) {
                                /*
                                 * We haven't reported all the "non-volatile
                                 * compressed" pages yet, so report this fake
@@ -17214,6 +17330,9 @@ vm_map_page_range_info_internal(
                                        } else {
                                                disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
                                        }
+                                       if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
+                                               disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
+                                       }
                                } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
                                        assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
                                        disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
@@ -17344,6 +17463,9 @@ vm_map_page_range_info_internal(
                                        if (m->vmp_cs_nx) {
                                                disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
                                        }
+                                       if (m->vmp_reusable || curr_object->all_reusable) {
+                                               disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
+                                       }
                                }
                        }
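
The new VM_PAGE_QUERY_PAGE_REUSABLE bit surfaces through the page-query interface. A sketch of checking it from user space, assuming mach_vm_page_query() is available and the updated headers export the flag:

    #include <mach/mach.h>
    #include <mach/mach_vm.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>

    int
    main(void)
    {
            size_t len = 16384;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANON, -1, 0);
            if (p == MAP_FAILED) {
                    return 1;
            }
            p[0] = 1;                            /* fault the page in */
            madvise(p, len, MADV_FREE_REUSABLE); /* mark it reusable */

            integer_t disp = 0, refs = 0;
            kern_return_t kr = mach_vm_page_query(mach_task_self(),
                (mach_vm_offset_t)(uintptr_t)p, &disp, &refs);
            if (kr == KERN_SUCCESS) {
                    printf("reusable: %s\n",
                        (disp & VM_PAGE_QUERY_PAGE_REUSABLE) ? "yes" : "no");
            }
            return 0;
    }
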
 
@@ -17794,10 +17916,10 @@ vm_map_reference(
        lck_mtx_lock(&map->s_lock);
 #if     TASK_SWAPPER
        assert(map->res_count > 0);
-       assert(map->map_refcnt >= map->res_count);
+       assert(os_ref_get_count(&map->map_refcnt) >= map->res_count);
        map->res_count++;
 #endif
-       map->map_refcnt++;
+       os_ref_retain_locked(&map->map_refcnt);
        lck_mtx_unlock(&map->s_lock);
 }
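
The raw integer retain count becomes an os_refcnt, which checks for over-release and use-after-free. A toy model of the three operations used here (retain, release returning the remaining count, get_count); the real os/refcnt.h versions trap instead of assert():

    #include <assert.h>
    #include <stdio.h>

    /* Toy model of XNU's os_refcnt (os/refcnt.h). */
    struct os_refcnt {
            unsigned int ref_count;
    };

    static void
    os_ref_init(struct os_refcnt *rc)
    {
            rc->ref_count = 1;
    }

    static void
    os_ref_retain_locked(struct os_refcnt *rc)
    {
            assert(rc->ref_count > 0);   /* retaining a dead object is a bug */
            rc->ref_count++;
    }

    /* Returns the count left after the release, like the kernel version. */
    static unsigned int
    os_ref_release_locked(struct os_refcnt *rc)
    {
            assert(rc->ref_count > 0);   /* over-release is a bug */
            return --rc->ref_count;
    }

    static unsigned int
    os_ref_get_count(struct os_refcnt *rc)
    {
            return rc->ref_count;
    }

    int
    main(void)
    {
            struct os_refcnt map_refcnt;
            os_ref_init(&map_refcnt);
            os_ref_retain_locked(&map_refcnt);
            if (os_ref_release_locked(&map_refcnt) > 0) {
                    printf("still referenced: %u\n", os_ref_get_count(&map_refcnt));
            }
            if (os_ref_release_locked(&map_refcnt) == 0) {
                    printf("last reference dropped; tear down the map\n");
            }
            return 0;
    }
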
 
@@ -17819,13 +17941,13 @@ vm_map_deallocate(
        }
 
        lck_mtx_lock(&map->s_lock);
-       ref = --map->map_refcnt;
+       ref = os_ref_release_locked(&map->map_refcnt);
        if (ref > 0) {
                vm_map_res_deallocate(map);
                lck_mtx_unlock(&map->s_lock);
                return;
        }
-       assert(map->map_refcnt == 0);
+       assert(os_ref_get_count(&map->map_refcnt) == 0);
        lck_mtx_unlock(&map->s_lock);
 
 #if     TASK_SWAPPER
@@ -17901,6 +18023,19 @@ vm_map_set_jumbo(vm_map_t map)
 #endif
 }
 
+/*
+ * This map has a JIT entitlement
+ */
+void
+vm_map_set_jit_entitled(vm_map_t map)
+{
+#if defined (__arm64__)
+       pmap_set_jit_entitled(map->pmap);
+#else /* arm64 */
+       (void) map;
+#endif
+}
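
vm_map_set_jit_entitled() is the kernel half of the MAP_JIT story. From user space the entitlement is exercised via mmap(2); a sketch, assuming a suitably entitled process on a platform that enforces W^X:

    #include <stdio.h>
    #include <sys/mman.h>

    int
    main(void)
    {
            size_t len = 16384;

            /* Requires the JIT entitlement; on arm64 the kernel is
             * presumed to have called vm_map_set_jit_entitled() ->
             * pmap_set_jit_entitled() for this task's map. */
            void *rwx = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC,
                MAP_PRIVATE | MAP_ANON | MAP_JIT, -1, 0);
            if (rwx == MAP_FAILED) {
                    perror("mmap(MAP_JIT)");
                    return 1;
            }
            printf("JIT region at %p\n", rwx);
            munmap(rwx, len);
            return 0;
    }
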
+
 /*
  * Expand the maximum size of an existing map.
  */
@@ -18384,12 +18519,12 @@ extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
 
 kern_return_t
 vm_map_freeze(
-       vm_map_t map,
+       task_t       task,
        unsigned int *purgeable_count,
        unsigned int *wired_count,
        unsigned int *clean_count,
        unsigned int *dirty_count,
-       __unused unsigned int dirty_budget,
+       unsigned int dirty_budget,
        unsigned int *shared_count,
        int          *freezer_error_code,
        boolean_t    eval_only)
@@ -18408,6 +18543,8 @@ vm_map_freeze(
         * block any page faults or lookups while we are
         * in the middle of freezing this vm map.
         */
+       vm_map_t map = task->map;
+
        vm_map_lock(map);
 
        assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
@@ -18459,6 +18596,30 @@ again:
 
                        if (src_object->internal == TRUE) {
                                if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                                       /*
+                                        * We skip purgeable objects during evaluation phase only.
+                                        * If we decide to freeze this process, we'll explicitly
+                                        * purge these objects before we go around again with
+                                        * 'evaluation_phase' set to FALSE.
+                                        */
+
+                                       if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
+                                               /*
+                                                * We want to purge objects that may not belong to this task but are mapped
+                                                * in this task alone. Since we already purged this task's purgeable memory
+                                                * at the end of a successful evaluation phase, we want to avoid doing no-op calls
+                                                * on this task's purgeable objects. Hence the check for only volatile objects.
+                                                */
+                                               if (evaluation_phase == FALSE &&
+                                                   (src_object->purgable == VM_PURGABLE_VOLATILE) &&
+                                                   (src_object->ref_count == 1)) {
+                                                       vm_object_lock(src_object);
+                                                       vm_object_purge(src_object, 0);
+                                                       vm_object_unlock(src_object);
+                                               }
+                                               continue;
+                                       }
+
                                        /*
                                         * Pages belonging to this object could be swapped to disk.
                                         * Make sure it's not a shared object because we could end
@@ -18468,6 +18629,7 @@ again:
                                         * more than once within our own map. But we don't do full searches,
                                         * we just look at the entries following our current entry.
                                         */
+
                                        if (src_object->ref_count > 1) {
                                                if (src_object != cur_shared_object) {
                                                        obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
@@ -18503,8 +18665,7 @@ again:
                                        }
                                }
 
-                               vm_object_compressed_freezer_pageout(src_object);
-
+                               uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
                                *wired_count += src_object->wired_page_count;
 
                                if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
@@ -18519,6 +18680,10 @@ again:
                                        kr = KERN_NO_SPACE;
                                        break;
                                }
+                               if (paged_out_count >= dirty_budget) {
+                                       break;
+                               }
+                               dirty_budget -= paged_out_count;
                        }
                }
        }
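
The budget bookkeeping above follows a simple pattern: each object consumes from a fixed page budget and the walk stops at exhaustion. A distilled sketch; the object list and page counts are made up:

    #include <stdint.h>
    #include <stdio.h>

    /* Distilled form of the dirty_budget accounting in vm_map_freeze():
     * each object pages out at most the remaining budget, and the walk
     * stops once the budget is spent. */
    static uint32_t
    freezer_pageout(uint32_t object_dirty_pages, uint32_t budget)
    {
            return object_dirty_pages < budget ? object_dirty_pages : budget;
    }

    int
    main(void)
    {
            uint32_t dirty_budget = 100;                 /* pages we may move */
            uint32_t objects[] = { 30, 50, 40, 25 };     /* dirty pages each */

            for (unsigned i = 0; i < sizeof(objects) / sizeof(objects[0]); i++) {
                    uint32_t paged_out = freezer_pageout(objects[i], dirty_budget);
                    printf("object %u: paged out %u\n", i, paged_out);
                    if (paged_out >= dirty_budget) {
                            break;                       /* budget exhausted */
                    }
                    dirty_budget -= paged_out;
            }
            return 0;
    }
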
@@ -18550,6 +18715,8 @@ again:
                        goto done;
                }
 
+               vm_purgeable_purge_task_owned(task);
+
                goto again;
        } else {
                kr = KERN_SUCCESS;
@@ -18923,7 +19090,7 @@ vm_commit_pagezero_status(vm_map_t lmap)
        pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
 }
 
-#if __x86_64__
+#if !CONFIG_EMBEDDED
 void
 vm_map_set_high_start(
        vm_map_t        map,
@@ -18931,7 +19098,7 @@ vm_map_set_high_start(
 {
        map->vmmap_high_start = high_start;
 }
-#endif /* __x86_64__ */
+#endif
 
 #if PMAP_CS
 kern_return_t
@@ -19722,8 +19889,16 @@ vm_map_copy_footprint_ledgers(
        vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
        vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
        vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
        vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
        vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
        vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
 }
 
@@ -19771,3 +19946,146 @@ vm_map_copy_ledger(
                    delta);
        }
 }
+
+#if MACH_ASSERT
+
+extern int pmap_ledgers_panic;
+extern int pmap_ledgers_panic_leeway;
+
+#define LEDGER_DRIFT(__LEDGER)                    \
+       int             __LEDGER##_over;          \
+       ledger_amount_t __LEDGER##_over_total;    \
+       ledger_amount_t __LEDGER##_over_max;      \
+       int             __LEDGER##_under;         \
+       ledger_amount_t __LEDGER##_under_total;   \
+       ledger_amount_t __LEDGER##_under_max
+
+struct {
+       uint64_t        num_pmaps_checked;
+
+       LEDGER_DRIFT(phys_footprint);
+       LEDGER_DRIFT(internal);
+       LEDGER_DRIFT(internal_compressed);
+       LEDGER_DRIFT(iokit_mapped);
+       LEDGER_DRIFT(alternate_accounting);
+       LEDGER_DRIFT(alternate_accounting_compressed);
+       LEDGER_DRIFT(page_table);
+       LEDGER_DRIFT(purgeable_volatile);
+       LEDGER_DRIFT(purgeable_nonvolatile);
+       LEDGER_DRIFT(purgeable_volatile_compressed);
+       LEDGER_DRIFT(purgeable_nonvolatile_compressed);
+       LEDGER_DRIFT(tagged_nofootprint);
+       LEDGER_DRIFT(tagged_footprint);
+       LEDGER_DRIFT(tagged_nofootprint_compressed);
+       LEDGER_DRIFT(tagged_footprint_compressed);
+       LEDGER_DRIFT(network_volatile);
+       LEDGER_DRIFT(network_nonvolatile);
+       LEDGER_DRIFT(network_volatile_compressed);
+       LEDGER_DRIFT(network_nonvolatile_compressed);
+       LEDGER_DRIFT(media_nofootprint);
+       LEDGER_DRIFT(media_footprint);
+       LEDGER_DRIFT(media_nofootprint_compressed);
+       LEDGER_DRIFT(media_footprint_compressed);
+       LEDGER_DRIFT(graphics_nofootprint);
+       LEDGER_DRIFT(graphics_footprint);
+       LEDGER_DRIFT(graphics_nofootprint_compressed);
+       LEDGER_DRIFT(graphics_footprint_compressed);
+       LEDGER_DRIFT(neural_nofootprint);
+       LEDGER_DRIFT(neural_footprint);
+       LEDGER_DRIFT(neural_nofootprint_compressed);
+       LEDGER_DRIFT(neural_footprint_compressed);
+} pmap_ledgers_drift;
+
+void
+vm_map_pmap_check_ledgers(
+       pmap_t          pmap,
+       ledger_t        ledger,
+       int             pid,
+       char            *procname)
+{
+       ledger_amount_t bal;
+       boolean_t       do_panic;
+
+       do_panic = FALSE;
+
+       pmap_ledgers_drift.num_pmaps_checked++;
+
+#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
+MACRO_BEGIN                                                             \
+       int panic_on_negative = TRUE;                                   \
+       ledger_get_balance(ledger,                                      \
+                          task_ledgers.__LEDGER,                       \
+                          &bal);                                       \
+       ledger_get_panic_on_negative(ledger,                            \
+                                    task_ledgers.__LEDGER,             \
+                                    &panic_on_negative);               \
+       if (bal != 0) {                                                 \
+               if (panic_on_negative ||                                \
+                   (pmap_ledgers_panic &&                              \
+                    pmap_ledgers_panic_leeway > 0 &&                   \
+                    (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
+                     bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
+                       do_panic = TRUE;                                \
+               }                                                       \
+               printf("LEDGER BALANCE proc %d (%s) "                   \
+                      "\"%s\" = %lld\n",                               \
+                      pid, procname, #__LEDGER, bal);                  \
+               if (bal > 0) {                                          \
+                       pmap_ledgers_drift.__LEDGER##_over++;           \
+                       pmap_ledgers_drift.__LEDGER##_over_total += bal; \
+                       if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
+                               pmap_ledgers_drift.__LEDGER##_over_max = bal; \
+                       }                                               \
+               } else if (bal < 0) {                                   \
+                       pmap_ledgers_drift.__LEDGER##_under++;          \
+                       pmap_ledgers_drift.__LEDGER##_under_total += bal; \
+                       if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
+                               pmap_ledgers_drift.__LEDGER##_under_max = bal; \
+                       }                                               \
+               }                                                       \
+       }                                                               \
+MACRO_END
+
+       LEDGER_CHECK_BALANCE(phys_footprint);
+       LEDGER_CHECK_BALANCE(internal);
+       LEDGER_CHECK_BALANCE(internal_compressed);
+       LEDGER_CHECK_BALANCE(iokit_mapped);
+       LEDGER_CHECK_BALANCE(alternate_accounting);
+       LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
+       LEDGER_CHECK_BALANCE(page_table);
+       LEDGER_CHECK_BALANCE(purgeable_volatile);
+       LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
+       LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
+       LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
+       LEDGER_CHECK_BALANCE(tagged_nofootprint);
+       LEDGER_CHECK_BALANCE(tagged_footprint);
+       LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
+       LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
+       LEDGER_CHECK_BALANCE(network_volatile);
+       LEDGER_CHECK_BALANCE(network_nonvolatile);
+       LEDGER_CHECK_BALANCE(network_volatile_compressed);
+       LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
+       LEDGER_CHECK_BALANCE(media_nofootprint);
+       LEDGER_CHECK_BALANCE(media_footprint);
+       LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
+       LEDGER_CHECK_BALANCE(media_footprint_compressed);
+       LEDGER_CHECK_BALANCE(graphics_nofootprint);
+       LEDGER_CHECK_BALANCE(graphics_footprint);
+       LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
+       LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
+       LEDGER_CHECK_BALANCE(neural_nofootprint);
+       LEDGER_CHECK_BALANCE(neural_footprint);
+       LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
+       LEDGER_CHECK_BALANCE(neural_footprint_compressed);
+
+       if (do_panic) {
+               if (pmap_ledgers_panic) {
+                       panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
+                           pmap, pid, procname);
+               } else {
+                       printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
+                           pmap, pid, procname);
+               }
+       }
+}
+#endif /* MACH_ASSERT */
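
LEDGER_DRIFT and LEDGER_CHECK_BALANCE lean on ## token pasting so that one invocation per ledger generates both the fields and the bookkeeping. A trimmed, compilable illustration of the same pattern:

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t ledger_amount_t;

    /* Same shape as the kernel's LEDGER_DRIFT(): one invocation declares
     * all six drift-tracking fields for a ledger. */
    #define LEDGER_DRIFT(L)                  \
            int             L##_over;        \
            ledger_amount_t L##_over_total;  \
            ledger_amount_t L##_over_max;    \
            int             L##_under;       \
            ledger_amount_t L##_under_total; \
            ledger_amount_t L##_under_max

    static struct {
            LEDGER_DRIFT(phys_footprint);
    } drift;

    /* Same shape as LEDGER_CHECK_BALANCE(): classify a balance and fold
     * it into the matching over/under counters. */
    #define LEDGER_CHECK_BALANCE(L, bal)                        \
            do {                                                \
                    if ((bal) > 0) {                            \
                            drift.L##_over++;                   \
                            drift.L##_over_total += (bal);      \
                            if ((bal) > drift.L##_over_max) {   \
                                    drift.L##_over_max = (bal); \
                            }                                   \
                    } else if ((bal) < 0) {                     \
                            drift.L##_under++;                  \
                            drift.L##_under_total += (bal);     \
                            if ((bal) < drift.L##_under_max) {  \
                                    drift.L##_under_max = (bal);\
                            }                                   \
                    }                                           \
            } while (0)

    int
    main(void)
    {
            LEDGER_CHECK_BALANCE(phys_footprint, 4096);
            LEDGER_CHECK_BALANCE(phys_footprint, -8192);
            printf("over=%d under=%d\n",
                drift.phys_footprint_over, drift.phys_footprint_under);
            return 0;
    }
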
index 533b8d78cde8cf8a5950818471ceae03bf72297f..3360cdfb4f137fddfe45b5a787b2a6537e581b99 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -80,6 +80,7 @@
 #include <mach/vm_param.h>
 #include <mach/sdt.h>
 #include <vm/pmap.h>
+#include <os/overflow.h>
 
 #ifdef  KERNEL_PRIVATE
 
@@ -113,6 +114,7 @@ __END_DECLS
 #include <kern/macro_help.h>
 
 #include <kern/thread.h>
+#include <os/refcnt.h>
 
 #define current_map_fast()      (current_thread()->map)
 #define current_map()           (current_map_fast())
@@ -130,7 +132,7 @@ __END_DECLS
  *                              used for inter-map copy operations
  */
 typedef struct vm_map_entry     *vm_map_entry_t;
-#define VM_MAP_ENTRY_NULL       ((vm_map_entry_t) 0)
+#define VM_MAP_ENTRY_NULL       ((vm_map_entry_t) NULL)
 
 
 /*
@@ -172,7 +174,7 @@ extern queue_head_t vm_named_entry_list;
  */
 
 struct vm_named_entry {
-       decl_lck_mtx_data(, Lock)               /* Synchronization */
+       decl_lck_mtx_data(, Lock);              /* Synchronization */
        union {
                vm_object_t     object;         /* object I point to */
                vm_map_t        map;            /* map backing submap */
@@ -217,55 +219,6 @@ struct vm_map_links {
        vm_map_offset_t         end;            /* end address */
 };
 
-/*
- * IMPORTANT:
- * The "alias" field can be updated while holding the VM map lock
- * "shared".  It's OK as along as it's the only field that can be
- * updated without the VM map "exclusive" lock.
- */
-#define VME_OBJECT(entry) ((entry)->vme_object.vmo_object)
-#define VME_OBJECT_SET(entry, object)                           \
-       MACRO_BEGIN                                             \
-       (entry)->vme_object.vmo_object = (object);              \
-       MACRO_END
-#define VME_SUBMAP(entry) ((entry)->vme_object.vmo_submap)
-#define VME_SUBMAP_SET(entry, submap)                           \
-       MACRO_BEGIN                                             \
-       (entry)->vme_object.vmo_submap = (submap);              \
-       MACRO_END
-#define VME_OFFSET(entry) ((entry)->vme_offset & ~PAGE_MASK)
-#define VME_OFFSET_SET(entry, offset)           \
-       MACRO_BEGIN                             \
-       int __alias;                            \
-       __alias = VME_ALIAS((entry));           \
-       assert((offset & PAGE_MASK) == 0);      \
-       (entry)->vme_offset = offset | __alias; \
-       MACRO_END
-#define VME_OBJECT_SHADOW(entry, length)                        \
-       MACRO_BEGIN                                             \
-       vm_object_t             __object;                       \
-       vm_object_offset_t      __offset;                       \
-       __object = VME_OBJECT((entry));                         \
-       __offset = VME_OFFSET((entry));                         \
-       vm_object_shadow(&__object, &__offset, (length));       \
-       if (__object != VME_OBJECT((entry))) {                  \
-               VME_OBJECT_SET((entry), __object);              \
-               (entry)->use_pmap = TRUE;                       \
-       }                                                       \
-       if (__offset != VME_OFFSET((entry))) {                  \
-               VME_OFFSET_SET((entry), __offset);              \
-       }                                                       \
-       MACRO_END
-
-#define VME_ALIAS_MASK (PAGE_MASK)
-#define VME_ALIAS(entry) ((unsigned int)((entry)->vme_offset & VME_ALIAS_MASK))
-#define VME_ALIAS_SET(entry, alias) \
-       MACRO_BEGIN                                                     \
-       vm_map_offset_t __offset;                                       \
-       __offset = VME_OFFSET((entry));                                 \
-       (entry)->vme_offset = __offset | ((alias) & VME_ALIAS_MASK);    \
-       MACRO_END
-
 /*
  * FOOTPRINT ACCOUNTING:
  * The "memory footprint" is better described in the pmap layer.
@@ -344,8 +297,8 @@ struct vm_map_entry {
        /* boolean_t */ vme_resilient_codesign:1,
        /* boolean_t */ vme_resilient_media:1,
        /* boolean_t */ vme_atomic:1, /* entry cannot be split/coalesced */
-       __unused:4;
-       ;
+       /* boolean_t */ vme_no_copy_on_read:1,
+       __unused:3;
 
        unsigned short          wired_count;    /* can be paged if = 0 */
        unsigned short          user_wired_count; /* for vm_wire */
@@ -362,6 +315,86 @@ struct vm_map_entry {
 #endif
 };
 
+#define VME_SUBMAP_PTR(entry)                   \
+       (&((entry)->vme_object.vmo_submap))
+#define VME_SUBMAP(entry)                                       \
+       ((vm_map_t)((uintptr_t)0 + *VME_SUBMAP_PTR(entry)))
+#define VME_OBJECT_PTR(entry)                   \
+       (&((entry)->vme_object.vmo_object))
+#define VME_OBJECT(entry)                                       \
+       ((vm_object_t)((uintptr_t)0 + *VME_OBJECT_PTR(entry)))
+#define VME_OFFSET(entry)                       \
+       ((entry)->vme_offset & ~PAGE_MASK)
+#define VME_ALIAS_MASK (PAGE_MASK)
+#define VME_ALIAS(entry)                                        \
+       ((unsigned int)((entry)->vme_offset & VME_ALIAS_MASK))
+
+static inline void
+VME_OBJECT_SET(
+       vm_map_entry_t entry,
+       vm_object_t object)
+{
+       entry->vme_object.vmo_object = object;
+       if (object != VM_OBJECT_NULL && !object->internal) {
+               entry->vme_resilient_media = FALSE;
+       }
+       entry->vme_resilient_codesign = FALSE;
+       entry->used_for_jit = FALSE;
+}
+static inline void
+VME_SUBMAP_SET(
+       vm_map_entry_t entry,
+       vm_map_t submap)
+{
+       entry->vme_object.vmo_submap = submap;
+}
+static inline void
+VME_OFFSET_SET(
+       vm_map_entry_t entry,
+       vm_map_offset_t offset)
+{
+       int alias;
+       alias = VME_ALIAS(entry);
+       assert((offset & PAGE_MASK) == 0);
+       entry->vme_offset = offset | alias;
+}
+/*
+ * IMPORTANT:
+ * The "alias" field can be updated while holding the VM map lock
+ * "shared".  It's OK as long as it's the only field that can be
+ * updated without the VM map "exclusive" lock.
+ */
+static inline void
+VME_ALIAS_SET(
+       vm_map_entry_t entry,
+       int alias)
+{
+       vm_map_offset_t offset;
+       offset = VME_OFFSET(entry);
+       entry->vme_offset = offset | (alias & VME_ALIAS_MASK);
+}
+
+static inline void
+VME_OBJECT_SHADOW(
+       vm_map_entry_t entry,
+       vm_object_size_t length)
+{
+       vm_object_t object;
+       vm_object_offset_t offset;
+
+       object = VME_OBJECT(entry);
+       offset = VME_OFFSET(entry);
+       vm_object_shadow(&object, &offset, length);
+       if (object != VME_OBJECT(entry)) {
+               VME_OBJECT_SET(entry, object);
+               entry->use_pmap = TRUE;
+       }
+       if (offset != VME_OFFSET(entry)) {
+               VME_OFFSET_SET(entry, offset);
+       }
+}
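
VME_OFFSET and VME_ALIAS share one word: the map offset is page-aligned, so its low PAGE_MASK bits are free to carry the alias (user tag). A standalone sketch of that packing, with 4 KiB pages assumed:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_MASK 0xfffULL   /* 4 KiB pages assumed */

    /* One word carries both a page-aligned offset (high bits) and a
     * small alias/tag (low bits), exactly like vme_offset. */
    static uint64_t
    pack_offset(uint64_t word, uint64_t offset)
    {
            assert((offset & PAGE_MASK) == 0);  /* must be page-aligned */
            return offset | (word & PAGE_MASK); /* keep existing alias */
    }

    static uint64_t
    pack_alias(uint64_t word, unsigned int alias)
    {
            return (word & ~PAGE_MASK) | (alias & PAGE_MASK);
    }

    int
    main(void)
    {
            uint64_t vme_offset = 0;
            vme_offset = pack_offset(vme_offset, 0x7f000ULL); /* set offset */
            vme_offset = pack_alias(vme_offset, 42);          /* set tag    */

            printf("offset=0x%llx alias=%u\n",
                (unsigned long long)(vme_offset & ~PAGE_MASK),
                (unsigned int)(vme_offset & PAGE_MASK));
            return 0;
    }
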
+
+
 /*
  * Convenience macros for dealing with superpages
  * SUPERPAGE_NBASEPAGES is architecture dependent and defined in pmap.h
@@ -426,9 +459,9 @@ struct _vm_map {
        vm_map_size_t           size;           /* virtual size */
        vm_map_size_t           user_wire_limit;/* rlimit on user locked memory */
        vm_map_size_t           user_wire_size; /* current size of user locked memory in this map */
-#if __x86_64__
+#if !CONFIG_EMBEDDED
        vm_map_offset_t         vmmap_high_start;
-#endif /* __x86_64__ */
+#endif
 
        union {
                /*
@@ -446,7 +479,7 @@ struct _vm_map {
        } vmu1;
 #define highest_entry_end       vmu1.vmu1_highest_entry_end
 #define lowest_unnestable_start vmu1.vmu1_lowest_unnestable_start
-       decl_lck_mtx_data(, s_lock)             /* Lock ref, res fields */
+       decl_lck_mtx_data(, s_lock);                    /* Lock ref, res fields */
        lck_mtx_ext_t           s_lock_ext;
        vm_map_entry_t          hint;           /* hint for quick lookups */
        union {
@@ -455,7 +488,7 @@ struct _vm_map {
        } vmmap_u_1;
 #define hole_hint vmmap_u_1.vmmap_hole_hint
 #define vmmap_corpse_footprint vmmap_u_1.vmmap_corpse_footprint
-       union{
+       union {
                vm_map_entry_t          _first_free;    /* First free space hint */
                struct vm_map_links*    _holes;         /* links all holes between entries */
        } f_s;                                          /* Union for free space data structures being used */
@@ -463,7 +496,7 @@ struct _vm_map {
 #define first_free              f_s._first_free
 #define holes_list              f_s._holes
 
-       int                     map_refcnt;     /* Reference count */
+       struct os_refcnt        map_refcnt;     /* Reference count */
 
 #if     TASK_SWAPPER
        int                     res_count;      /* Residence count (swap) */
@@ -483,8 +516,7 @@ struct _vm_map {
        /* boolean_t */ map_disallow_new_exec:1,         /* Disallow new executable code */
        /* boolean_t */ jit_entry_exists:1,
        /* boolean_t */ has_corpse_footprint:1,
-       /* boolean_t */ warned_delete_gap:1,
-       /* reserved */ pad:19;
+       /* reserved */ pad:20;
        unsigned int            timestamp;      /* Version number */
 };
 
@@ -633,39 +665,14 @@ struct vm_map_copy {
        lck_rw_lock_exclusive_to_shared(&(map)->lock); \
        MACRO_END
 
-/*
- * lock_read_to_write() returns FALSE on failure.  This function evaluates to
- * zero on success and non-zero value on failure.
- */
-static inline int
-vm_map_lock_read_to_write(vm_map_t map)
-{
-       if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
-               DTRACE_VM(vm_map_lock_upgrade);
-               return 0;
-       }
-       return 1;
-}
+__attribute__((always_inline))
+int vm_map_lock_read_to_write(vm_map_t map);
 
-static inline boolean_t
-vm_map_try_lock(vm_map_t map)
-{
-       if (lck_rw_try_lock_exclusive(&(map)->lock)) {
-               DTRACE_VM(vm_map_lock_w);
-               return TRUE;
-       }
-       return FALSE;
-}
+__attribute__((always_inline))
+boolean_t vm_map_try_lock(vm_map_t map);
 
-static inline boolean_t
-vm_map_try_lock_read(vm_map_t map)
-{
-       if (lck_rw_try_lock_shared(&(map)->lock)) {
-               DTRACE_VM(vm_map_lock_r);
-               return TRUE;
-       }
-       return FALSE;
-}
+__attribute__((always_inline))
+boolean_t vm_map_try_lock_read(vm_map_t map);
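
The shared-to-exclusive upgrade can fail, and on failure the reader hold is dropped (the lck_rw_lock_shared_to_exclusive contract), so callers must restart their lookup. A toy single-threaded model of that convention; the real primitive is the kernel rwlock:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model of the upgrade contract: on failure the shared hold is
     * DROPPED, so the caller owns nothing and must start over. */
    struct toy_rwlock {
            int  readers;
            bool writer;
    };

    static void
    lock_shared(struct toy_rwlock *l)
    {
            l->readers++;       /* single-threaded model: no waiting */
    }

    static void
    unlock_shared(struct toy_rwlock *l)
    {
            l->readers--;
    }

    /* Returns 0 on success (now exclusive), 1 on failure (lock dropped):
     * the same convention as vm_map_lock_read_to_write(). */
    static int
    lock_shared_to_exclusive(struct toy_rwlock *l)
    {
            if (l->readers == 1 && !l->writer) {
                    l->readers = 0;
                    l->writer = true;
                    return 0;
            }
            l->readers--;       /* failure path drops our shared hold */
            return 1;
    }

    int
    main(void)
    {
            struct toy_rwlock map_lock = { 0, false };

            lock_shared(&map_lock);
            lock_shared(&map_lock);             /* a second reader... */
            if (lock_shared_to_exclusive(&map_lock)) {
                    printf("upgrade failed, read lock dropped: retry lookup\n");
            }
            unlock_shared(&map_lock);           /* the other reader */
            return 0;
    }
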
 
 #if MACH_ASSERT || DEBUG
 #define vm_map_lock_assert_held(map) \
@@ -767,6 +774,7 @@ extern vm_map_entry_t   vm_map_entry_insert(
        unsigned                wired_count,
        boolean_t               no_cache,
        boolean_t               permanent,
+       boolean_t               no_copy_on_read,
        unsigned int            superpage_size,
        boolean_t               clear_map_aligned,
        boolean_t               is_submap,
@@ -810,14 +818,14 @@ extern void             vm_map_reference_swap(
 #else   /* MACH_ASSERT */
 
 #define vm_map_reference(map)           \
-MACRO_BEGIN                                     \
-       vm_map_t Map = (map);           \
-       if (Map) {                              \
-               lck_mtx_lock(&Map->s_lock);     \
-               Map->res_count++;               \
-               Map->map_refcnt++;              \
-               lck_mtx_unlock(&Map->s_lock);   \
-       }                                       \
+MACRO_BEGIN                                      \
+       vm_map_t Map = (map);                    \
+       if (Map) {                               \
+               lck_mtx_lock(&Map->s_lock);      \
+               Map->res_count++;                \
+               os_ref_retain(&Map->map_refcnt); \
+               lck_mtx_unlock(&Map->s_lock);    \
+       }                                        \
 MACRO_END
 
 #define vm_map_res_reference(map)               \
@@ -850,7 +858,7 @@ MACRO_END
 MACRO_BEGIN                             \
        vm_map_t Map = (map);           \
        lck_mtx_lock(&Map->s_lock);     \
-       ++Map->map_refcnt;              \
+       os_ref_retain(&Map->map_refcnt);\
        vm_map_res_reference(Map);      \
        lck_mtx_unlock(&Map->s_lock);   \
 MACRO_END
@@ -869,7 +877,7 @@ MACRO_BEGIN                                     \
        vm_map_t Map = (map);                   \
        if (Map) {                              \
                lck_mtx_lock(&Map->s_lock);     \
-               Map->map_refcnt++;              \
+               os_ref_retain(&Map->map_refcnt);\
                lck_mtx_unlock(&Map->s_lock);   \
        }                                       \
 MACRO_END
@@ -1447,6 +1455,9 @@ extern void             vm_map_set_32bit(
 extern void             vm_map_set_jumbo(
        vm_map_t                map);
 
+extern void             vm_map_set_jit_entitled(
+       vm_map_t                map);
+
 extern void             vm_map_set_max_addr(
        vm_map_t                map, vm_map_offset_t new_max_offset);
 
@@ -1474,11 +1485,11 @@ extern kern_return_t    vm_map_raise_max_offset(
 extern kern_return_t    vm_map_raise_min_offset(
        vm_map_t        map,
        vm_map_offset_t new_min_offset);
-#if __x86_64__
+#if !CONFIG_EMBEDDED
 extern void vm_map_set_high_start(
        vm_map_t        map,
        vm_map_offset_t high_start);
-#endif /* __x86_64__ */
+#endif
 
 extern vm_map_offset_t  vm_compute_max_offset(
        boolean_t               is64);
@@ -1534,6 +1545,20 @@ extern boolean_t        vm_map_page_aligned(
        vm_map_offset_t         offset,
        vm_map_offset_t         mask);
 
+static inline int
+vm_map_range_overflows(vm_map_offset_t addr, vm_map_size_t size)
+{
+       vm_map_offset_t sum;
+       return os_add_overflow(addr, size, &sum);
+}
+
+static inline int
+mach_vm_range_overflows(mach_vm_offset_t addr, mach_vm_size_t size)
+{
+       mach_vm_offset_t sum;
+       return os_add_overflow(addr, size, &sum);
+}
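
os_add_overflow, from the newly included <os/overflow.h>, wraps __builtin_add_overflow. A user-space sketch of the same range validation these helpers provide:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t vm_map_offset_t;
    typedef uint64_t vm_map_size_t;

    /* Same check as vm_map_range_overflows(): does addr + size wrap? */
    static bool
    range_overflows(vm_map_offset_t addr, vm_map_size_t size)
    {
            vm_map_offset_t sum;
            return __builtin_add_overflow(addr, size, &sum);
    }

    int
    main(void)
    {
            printf("%d\n", range_overflows(0x1000, 0x1000));          /* 0 */
            printf("%d\n", range_overflows(UINT64_MAX - 1, 0x1000));  /* 1 */
            return 0;
    }
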
+
 #ifdef XNU_KERNEL_PRIVATE
 extern kern_return_t vm_map_page_info(
        vm_map_t                map,
@@ -1590,13 +1615,16 @@ static inline void
 vm_prot_to_wimg(unsigned int prot, unsigned int *wimg)
 {
        switch (prot) {
-       case MAP_MEM_NOOP:              break;
-       case MAP_MEM_IO:                *wimg = VM_WIMG_IO; break;
-       case MAP_MEM_COPYBACK:          *wimg = VM_WIMG_USE_DEFAULT; break;
-       case MAP_MEM_INNERWBACK:        *wimg = VM_WIMG_INNERWBACK; break;
-       case MAP_MEM_POSTED:            *wimg = VM_WIMG_POSTED; break;
-       case MAP_MEM_WTHRU:             *wimg = VM_WIMG_WTHRU; break;
-       case MAP_MEM_WCOMB:             *wimg = VM_WIMG_WCOMB; break;
+       case MAP_MEM_NOOP:                      break;
+       case MAP_MEM_IO:                        *wimg = VM_WIMG_IO; break;
+       case MAP_MEM_COPYBACK:                  *wimg = VM_WIMG_USE_DEFAULT; break;
+       case MAP_MEM_INNERWBACK:                *wimg = VM_WIMG_INNERWBACK; break;
+       case MAP_MEM_POSTED:                    *wimg = VM_WIMG_POSTED; break;
+       case MAP_MEM_POSTED_REORDERED:          *wimg = VM_WIMG_POSTED_REORDERED; break;
+       case MAP_MEM_POSTED_COMBINED_REORDERED: *wimg = VM_WIMG_POSTED_COMBINED_REORDERED; break;
+       case MAP_MEM_WTHRU:                     *wimg = VM_WIMG_WTHRU; break;
+       case MAP_MEM_WCOMB:                     *wimg = VM_WIMG_WCOMB; break;
+       case MAP_MEM_RT:                        *wimg = VM_WIMG_RT; break;
        default:
                panic("Unrecognized mapping type %u\n", prot);
        }
@@ -1671,7 +1699,7 @@ extern int vm_map_disconnect_page_mappings(
 #if CONFIG_FREEZE
 
 extern kern_return_t vm_map_freeze(
-       vm_map_t     map,
+       task_t       task,
        unsigned int *purgeable_count,
        unsigned int *wired_count,
        unsigned int *clean_count,
index e4782aedbf7860313ee8f1119a5f3737bf3600a9..df03e1ca0c2818249299c9f8a48104e24850b9e7 100644 (file)
@@ -125,7 +125,7 @@ _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_wh
 #endif
 #if MAP_ENTRY_INSERTION_DEBUG
        backtrace(&entry->vme_insertion_bt[0],
-           (sizeof(entry->vme_insertion_bt) / sizeof(uintptr_t)));
+           (sizeof(entry->vme_insertion_bt) / sizeof(uintptr_t)), NULL);
 #endif
 }
 
index c66a1446ae9da6e7a6efee87ee114f9c696823d2..b036575e71b9ffb050d97ed0e267ec1afb327c84 100644 (file)
@@ -57,14 +57,15 @@ rb_node_compare(struct vm_map_store *node, struct vm_map_store *parent)
        return 0;
 }
 
+__dead2
 void
-vm_map_store_walk_rb( vm_map_t map, vm_map_entry_t *wrong_vme, vm_map_entry_t *vm_entry)
+vm_map_store_walk_rb(vm_map_t map, vm_map_entry_t *wrong_vme, vm_map_entry_t *vm_entry)
 {
-       struct vm_map_header hdr = map->hdr;
-       struct vm_map_store *rb_entry = RB_ROOT(&(hdr.rb_head_store));
-       vm_map_entry_t cur = *vm_entry;
+       struct vm_map_header *hdr = &map->hdr;
+       struct vm_map_store  *rb_entry = RB_ROOT(&hdr->rb_head_store);
+       vm_map_entry_t       cur = *vm_entry;
 
-       rb_entry = RB_FIND( rb_head, &(hdr.rb_head_store), &(cur->store));
+       rb_entry = RB_FIND(rb_head, &hdr->rb_head_store, &(cur->store));
        if (rb_entry == NULL) {
                panic("NO SUCH ENTRY %p. Gave back %p", *vm_entry, *wrong_vme);
        } else {
@@ -74,12 +75,12 @@ vm_map_store_walk_rb( vm_map_t map, vm_map_entry_t *wrong_vme, vm_map_entry_t *v
 
 
 boolean_t
-vm_map_store_lookup_entry_rb( vm_map_t map, vm_map_offset_t address, vm_map_entry_t *vm_entry)
+vm_map_store_lookup_entry_rb(vm_map_t map, vm_map_offset_t address, vm_map_entry_t *vm_entry)
 {
-       struct vm_map_header hdr = map->hdr;
-       struct vm_map_store *rb_entry = RB_ROOT(&(hdr.rb_head_store));
-       vm_map_entry_t cur = vm_map_to_entry(map);
-       vm_map_entry_t prev = VM_MAP_ENTRY_NULL;
+       struct vm_map_header *hdr = &map->hdr;
+       struct vm_map_store  *rb_entry = RB_ROOT(&hdr->rb_head_store);
+       vm_map_entry_t       cur = vm_map_to_entry(map);
+       vm_map_entry_t       prev = VM_MAP_ENTRY_NULL;
 
        while (rb_entry != (struct vm_map_store*)NULL) {
                cur =  VME_FOR_STORE(rb_entry);
@@ -226,7 +227,7 @@ check_map_sanity(vm_map_t map, vm_map_entry_t old_hole_entry)
                return;
        }
 
-       hole_entry = (vm_map_entry_t) map->holes_list;
+       hole_entry = CAST_DOWN(vm_map_entry_t, map->holes_list);
        next_hole_entry = hole_entry->vme_next;
 
        map_entry = vm_map_first_entry(map);
@@ -236,7 +237,7 @@ check_map_sanity(vm_map_t map, vm_map_entry_t old_hole_entry)
                hole_entry = next_hole_entry;
                next_hole_entry = hole_entry->vme_next;
 
-               if (hole_entry == (vm_map_entry_t)map->holes_list) {
+               if (hole_entry == CAST_DOWN(vm_map_entry_t, map->holes_list)) {
                        break;
                }
        }
@@ -264,7 +265,7 @@ check_map_sanity(vm_map_t map, vm_map_entry_t old_hole_entry)
                        hole_entry = next_hole_entry;
                        next_hole_entry = hole_entry->vme_next;
 
-                       if (hole_entry == (vm_map_entry_t)map->holes_list) {
+                       if (hole_entry == CAST_DOWN(vm_map_entry_t, map->holes_list)) {
                                break;
                        }
                }
index 099fdea8d63ed150e48c86393c0552ca70fcf5a9..5b6250afc8d1d9277af3d8ed012b62ca8949c3ed 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -80,7 +80,6 @@
 #include <kern/kern_types.h>
 #include <kern/assert.h>
 #include <kern/queue.h>
-#include <kern/xpr.h>
 #include <kern/kalloc.h>
 #include <kern/zalloc.h>
 #include <kern/host.h>
@@ -353,10 +352,6 @@ _vm_object_allocate(
        vm_object_size_t        size,
        vm_object_t             object)
 {
-       XPR(XPR_VM_OBJECT,
-           "vm_object_allocate, object 0x%X size 0x%X\n",
-           object, size, 0, 0, 0);
-
        *object = vm_object_template;
        vm_page_queue_init(&object->memq);
 #if UPL_DEBUG || CONFIG_IOSCHED
@@ -539,8 +534,8 @@ vm_object_bootstrap(void)
        vm_object_template.volatile_fault = FALSE;
        vm_object_template.all_reusable = FALSE;
        vm_object_template.blocked_access = FALSE;
-       vm_object_template.vo_ledger_tag = VM_OBJECT_LEDGER_TAG_NONE;
-       vm_object_template.__object2_unused_bits = 0;
+       vm_object_template.vo_ledger_tag = VM_LEDGER_TAG_NONE;
+       vm_object_template.vo_no_footprint = FALSE;
 #if CONFIG_IOSCHED || UPL_DEBUG
        vm_object_template.uplq.prev = NULL;
        vm_object_template.uplq.next = NULL;
@@ -650,6 +645,7 @@ vm_io_reprioritize_init(void)
 
        result = kernel_thread_start_priority(io_reprioritize_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
        if (result == KERN_SUCCESS) {
+               thread_set_thread_name(thread, "VM_io_reprioritize_thread");
                thread_deallocate(thread);
        } else {
                panic("Could not create io_reprioritize_thread");
@@ -671,6 +667,7 @@ vm_object_reaper_init(void)
        if (kr != KERN_SUCCESS) {
                panic("failed to launch vm_object_reaper_thread kr=0x%x", kr);
        }
+       thread_set_thread_name(thread, "VM_object_reaper_thread");
        thread_deallocate(thread);
 }
 
@@ -909,12 +906,6 @@ vm_object_deallocate(
                        continue;
                }
 
-               XPR(XPR_VM_OBJECT,
-                   "vm_o_deallocate: 0x%X res %d paging_ops %d thread 0x%p ref %d\n",
-                   object, object->resident_page_count,
-                   object->paging_in_progress,
-                   (void *)current_thread(), object->ref_count);
-
                VM_OBJ_RES_DECR(object);        /* XXX ? */
                /*
                 *      Terminate this object. If it had a shadow,
@@ -1333,9 +1324,6 @@ vm_object_terminate(
 {
        vm_object_t     shadow_object;
 
-       XPR(XPR_VM_OBJECT, "vm_object_terminate, object 0x%X ref %d\n",
-           object, object->ref_count, 0, 0, 0);
-
        vm_object_lock_assert_exclusive(object);
 
        if (!object->pageout && (!object->internal && object->can_persist) &&
@@ -1484,12 +1472,21 @@ vm_object_reap(
        if (object->internal &&
            (object->purgable != VM_PURGABLE_DENY ||
            object->vo_ledger_tag)) {
+               int ledger_flags;
+               kern_return_t kr;
+
+               ledger_flags = 0;
+               if (object->vo_no_footprint) {
+                       ledger_flags |= VM_LEDGER_FLAG_NO_FOOTPRINT;
+               }
                assert(!object->alive);
                assert(object->terminating);
-               vm_object_ownership_change(object,
-                   object->vo_ledger_tag,                        /* unchanged */
-                   NULL,                        /* no owner */
-                   FALSE);                        /* task_objq not locked */
+               kr = vm_object_ownership_change(object,
+                   object->vo_ledger_tag,   /* unchanged */
+                   NULL,                    /* no owner */
+                   ledger_flags,
+                   FALSE);                  /* task_objq not locked */
+               assert(kr == KERN_SUCCESS);
                assert(object->vo_owner == NULL);
        }
 
@@ -2109,7 +2106,7 @@ typedef uint64_t        chunk_state_t;
  * while processing a higher level object in the shadow chain.
  */
 
-#define PAGE_ALREADY_HANDLED(c, p)      (((c) & (1LL << (p))) == 0)
+#define PAGE_ALREADY_HANDLED(c, p)      (((c) & (1ULL << (p))) == 0)
 
 /*
  * Mark the page at offset 'p' in the bit map as having been processed.
@@ -2117,7 +2114,7 @@ typedef uint64_t        chunk_state_t;
 
 #define MARK_PAGE_HANDLED(c, p) \
 MACRO_BEGIN \
-       (c) = (c) & ~(1LL << (p)); \
+       (c) = (c) & ~(1ULL << (p)); \
 MACRO_END
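
The 1LL to 1ULL change is not cosmetic: chunk_state_t is a uint64_t bit map with one bit per page of a chunk, and for the top bit (p == 63) shifting a signed 1LL left by 63 is undefined behavior in C, while 1ULL keeps the shift and the mask unsigned for all 64 positions. A standalone illustration of the corrected macros:

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t chunk_state_t;

    #define MARK_PAGE_HANDLED(c, p)    ((c) &= ~(1ULL << (p)))
    #define PAGE_ALREADY_HANDLED(c, p) (((c) & (1ULL << (p))) == 0)

    int main(void) {
        chunk_state_t c = ~0ULL;       /* every page still pending */
        MARK_PAGE_HANDLED(c, 63);      /* well-defined; 1LL << 63 is not */
        assert(PAGE_ALREADY_HANDLED(c, 63));
        return 0;
    }
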
 
 
@@ -2875,9 +2872,6 @@ vm_object_copy_slowly(
 
        struct vm_object_fault_info fault_info = {};
 
-       XPR(XPR_VM_OBJECT, "v_o_c_slowly obj 0x%x off 0x%x size 0x%x\n",
-           src_object, src_offset, size, 0, 0);
-
        if (size == 0) {
                vm_object_unlock(src_object);
                *_result_object = VM_OBJECT_NULL;
@@ -3018,7 +3012,6 @@ vm_object_copy_slowly(
                        }
                        fault_info.cluster_size = cluster_size;
 
-                       XPR(XPR_VM_FAULT, "vm_object_copy_slowly -> vm_fault_page", 0, 0, 0, 0, 0);
                        _result_page = VM_PAGE_NULL;
                        result = vm_fault_page(src_object, src_offset,
                            VM_PROT_READ, FALSE,
@@ -3161,8 +3154,6 @@ vm_object_copy_quickly(
        vm_object_t     object = *_object;
        memory_object_copy_strategy_t copy_strategy;
 
-       XPR(XPR_VM_OBJECT, "v_o_c_quickly obj 0x%x off 0x%x size 0x%x\n",
-           *_object, offset, size, 0, 0);
        if (object == VM_OBJECT_NULL) {
                *_src_needs_copy = FALSE;
                *_dst_needs_copy = FALSE;
@@ -3674,10 +3665,6 @@ Retry:
        vm_object_unlock(src_object);
        vm_object_unlock(new_copy);
 
-       XPR(XPR_VM_OBJECT,
-           "vm_object_copy_delayed: used copy object %X for source %X\n",
-           new_copy, src_object, 0, 0, 0);
-
        return new_copy;
 }
 
@@ -3776,7 +3763,6 @@ vm_object_copy_strategically(
                break;
 
        case MEMORY_OBJECT_COPY_SYMMETRIC:
-               XPR(XPR_VM_OBJECT, "v_o_c_strategically obj 0x%x off 0x%x size 0x%x\n", src_object, src_offset, size, 0, 0);
                vm_object_unlock(src_object);
                result = KERN_MEMORY_RESTART_COPY;
                break;
@@ -4003,6 +3989,7 @@ vm_object_memory_object_associate(
                assert(object->pager_created);
                assert(!object->pager_initialized);
                assert(!object->pager_ready);
+               assert(object->pager_trusted);
        } else {
                object = vm_object_allocate(size);
                assert(object != VM_OBJECT_NULL);
@@ -4124,6 +4111,7 @@ vm_object_compressor_pager_create(
         */
 
        object->pager_created = TRUE;
+       object->pager_trusted = TRUE;
        object->paging_offset = 0;
 
        vm_object_unlock(object);
@@ -4444,9 +4432,6 @@ vm_object_do_collapse(
        backing_object->alive = FALSE;
        vm_object_unlock(backing_object);
 
-       XPR(XPR_VM_OBJECT, "vm_object_collapse, collapsed 0x%X\n",
-           backing_object, 0, 0, 0, 0);
-
 #if VM_OBJECT_TRACKING
        if (vm_object_tracking_inited) {
                btlog_remove_entries_for_element(vm_object_tracking_btlog,
@@ -4624,9 +4609,6 @@ vm_object_collapse(
                return;
        }
 
-       XPR(XPR_VM_OBJECT, "vm_object_collapse, obj 0x%X\n",
-           object, 0, 0, 0, 0);
-
        if (object == VM_OBJECT_NULL) {
                return;
        }
@@ -4794,12 +4776,6 @@ retry:
                                goto retry;
                        }
 
-                       XPR(XPR_VM_OBJECT,
-                           "vm_object_collapse: %x to %x, pager %x, pager_control %x\n",
-                           backing_object, object,
-                           backing_object->pager,
-                           backing_object->pager_control, 0);
-
                        /*
                         *      Collapse the object with its backing
                         *      object, and try again with the object's
@@ -5153,10 +5129,6 @@ vm_object_coalesce(
                return TRUE;
        }
 
-       XPR(XPR_VM_OBJECT,
-           "vm_object_coalesce: 0x%X prev_off 0x%X prev_size 0x%X next_size 0x%X\n",
-           prev_object, prev_offset, prev_size, next_size, 0);
-
        vm_object_lock(prev_object);
 
        /*
@@ -5516,11 +5488,6 @@ vm_object_lock_request(
 
        should_flush = flags & MEMORY_OBJECT_DATA_FLUSH;
 
-       XPR(XPR_MEMORY_OBJECT,
-           "vm_o_lock_request, obj 0x%X off 0x%X size 0x%X flags %X prot %X\n",
-           object, offset, size,
-           (((should_return & 1) << 1) | should_flush), prot);
-
        /*
         *      Check for bogus arguments.
         */
@@ -6579,8 +6546,6 @@ MACRO_END
        assert(object1->__object3_unused_bits == 0);
        assert(object2->__object3_unused_bits == 0);
 #endif /* CONFIG_SECLUDED_MEMORY */
-       assert(object1->__object2_unused_bits == 0);
-       assert(object2->__object2_unused_bits == 0);
 #if UPL_DEBUG
        /* "uplq" refers to the object not its contents (see upl_transpose()) */
 #endif
@@ -7491,15 +7456,16 @@ vm_object_compressed_freezer_done()
 }
 
 
-void
+uint32_t
 vm_object_compressed_freezer_pageout(
-       vm_object_t object)
+       vm_object_t object, uint32_t dirty_budget)
 {
        vm_page_t                       p;
        vm_page_t                       local_freeq = NULL;
        int                             local_freed = 0;
        kern_return_t                   retval = KERN_SUCCESS;
        int                             obj_resident_page_count_snapshot = 0;
+       uint32_t                        paged_out_count = 0;
 
        assert(object != VM_OBJECT_NULL);
        assert(object->internal);
@@ -7517,7 +7483,7 @@ vm_object_compressed_freezer_pageout(
 
                if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) {
                        vm_object_unlock(object);
-                       return;
+                       return paged_out_count;
                }
        }
 
@@ -7563,7 +7529,7 @@ vm_object_compressed_freezer_pageout(
 
        vm_object_activity_begin(object);
 
-       while ((obj_resident_page_count_snapshot--) && !vm_page_queue_empty(&object->memq)) {
+       while ((obj_resident_page_count_snapshot--) && !vm_page_queue_empty(&object->memq) && paged_out_count < dirty_budget) {
                p = (vm_page_t)vm_page_queue_first(&object->memq);
 
                KERNEL_DEBUG(0xe0430004 | DBG_FUNC_START, object, local_freed, 0, 0, 0);
@@ -7643,6 +7609,7 @@ vm_object_compressed_freezer_pageout(
                        p->vmp_snext = local_freeq;
                        local_freeq = p;
                        local_freed++;
+                       paged_out_count++;
 
                        if (local_freed >= MAX_FREE_BATCH) {
                                OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
@@ -7681,6 +7648,7 @@ vm_object_compressed_freezer_pageout(
                thread_yield_internal(FREEZER_DUTY_CYCLE_OFF_MS);
                clock_get_uptime(&c_freezer_last_yield_ts);
        }
+       return paged_out_count;
 }
 
 #endif /* CONFIG_FREEZE */
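
With the new dirty_budget parameter, vm_object_compressed_freezer_pageout() stops after paging out at most dirty_budget pages and returns the count it actually moved, which lets a caller spread one task-wide budget across several objects. A toy userspace sketch of that contract, with an invented stand-in function and numbers:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in with the same contract: page out at most 'budget' pages,
     * return the number actually paged out. */
    static uint32_t freezer_pageout(uint32_t *resident, uint32_t budget) {
        uint32_t done = (*resident < budget) ? *resident : budget;
        *resident -= done;
        return done;
    }

    int main(void) {
        uint32_t objects[3] = { 40, 10, 25 };
        uint32_t budget = 60;
        for (int i = 0; i < 3 && budget > 0; i++)
            budget -= freezer_pageout(&objects[i], budget);
        printf("left in last object: %u, budget left: %u\n",
               objects[2], budget);    /* 15 and 0 */
        return 0;
    }
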
@@ -8110,24 +8078,96 @@ vm_object_ledger_tag_ledgers(
 {
        assert(object->shadow == VM_OBJECT_NULL);
 
+       *do_footprint = !object->vo_no_footprint;
+
        switch (object->vo_ledger_tag) {
-       case VM_OBJECT_LEDGER_TAG_NONE:
-               /* regular purgeable memory */
+       case VM_LEDGER_TAG_NONE:
+               /*
+                * Regular purgeable memory:
+                * counts in footprint only when nonvolatile.
+                */
+               *do_footprint = TRUE;
                assert(object->purgable != VM_PURGABLE_DENY);
                *ledger_idx_volatile = task_ledgers.purgeable_volatile;
                *ledger_idx_nonvolatile = task_ledgers.purgeable_nonvolatile;
                *ledger_idx_volatile_compressed = task_ledgers.purgeable_volatile_compressed;
                *ledger_idx_nonvolatile_compressed = task_ledgers.purgeable_nonvolatile_compressed;
-               *do_footprint = TRUE;
                break;
-       case VM_OBJECT_LEDGER_TAG_NETWORK:
+       case VM_LEDGER_TAG_DEFAULT:
+               /*
+                * "default" tagged memory:
+                * counts in footprint only when nonvolatile and not marked
+                * as "no_footprint".
+                */
+               *ledger_idx_volatile = task_ledgers.tagged_nofootprint;
+               *ledger_idx_volatile_compressed = task_ledgers.tagged_nofootprint_compressed;
+               if (*do_footprint) {
+                       *ledger_idx_nonvolatile = task_ledgers.tagged_footprint;
+                       *ledger_idx_nonvolatile_compressed = task_ledgers.tagged_footprint_compressed;
+               } else {
+                       *ledger_idx_nonvolatile = task_ledgers.tagged_nofootprint;
+                       *ledger_idx_nonvolatile_compressed = task_ledgers.tagged_nofootprint_compressed;
+               }
+               break;
+       case VM_LEDGER_TAG_NETWORK:
+               /*
+                * "network" tagged memory:
+                * never counts in footprint.
+                */
+               *do_footprint = FALSE;
                *ledger_idx_volatile = task_ledgers.network_volatile;
                *ledger_idx_volatile_compressed = task_ledgers.network_volatile_compressed;
                *ledger_idx_nonvolatile = task_ledgers.network_nonvolatile;
                *ledger_idx_nonvolatile_compressed = task_ledgers.network_nonvolatile_compressed;
-               *do_footprint = FALSE;
                break;
-       case VM_OBJECT_LEDGER_TAG_MEDIA:
+       case VM_LEDGER_TAG_MEDIA:
+               /*
+                * "media" tagged memory:
+                * counts in footprint only when nonvolatile and not marked
+                * as "no footprint".
+                */
+               *ledger_idx_volatile = task_ledgers.media_nofootprint;
+               *ledger_idx_volatile_compressed = task_ledgers.media_nofootprint_compressed;
+               if (*do_footprint) {
+                       *ledger_idx_nonvolatile = task_ledgers.media_footprint;
+                       *ledger_idx_nonvolatile_compressed = task_ledgers.media_footprint_compressed;
+               } else {
+                       *ledger_idx_nonvolatile = task_ledgers.media_nofootprint;
+                       *ledger_idx_nonvolatile_compressed = task_ledgers.media_nofootprint_compressed;
+               }
+               break;
+       case VM_LEDGER_TAG_GRAPHICS:
+               /*
+                * "graphics" tagged memory:
+                * counts in footprint only when nonvolatile and not marked
+                * as "no footprint".
+                */
+               *ledger_idx_volatile = task_ledgers.graphics_nofootprint;
+               *ledger_idx_volatile_compressed = task_ledgers.graphics_nofootprint_compressed;
+               if (*do_footprint) {
+                       *ledger_idx_nonvolatile = task_ledgers.graphics_footprint;
+                       *ledger_idx_nonvolatile_compressed = task_ledgers.graphics_footprint_compressed;
+               } else {
+                       *ledger_idx_nonvolatile = task_ledgers.graphics_nofootprint;
+                       *ledger_idx_nonvolatile_compressed = task_ledgers.graphics_nofootprint_compressed;
+               }
+               break;
+       case VM_LEDGER_TAG_NEURAL:
+               /*
+                * "neural" tagged memory:
+                * counts in footprint only when nonvolatile and not marked
+                * as "no footprint".
+                */
+               *ledger_idx_volatile = task_ledgers.neural_nofootprint;
+               *ledger_idx_volatile_compressed = task_ledgers.neural_nofootprint_compressed;
+               if (*do_footprint) {
+                       *ledger_idx_nonvolatile = task_ledgers.neural_footprint;
+                       *ledger_idx_nonvolatile_compressed = task_ledgers.neural_footprint_compressed;
+               } else {
+                       *ledger_idx_nonvolatile = task_ledgers.neural_nofootprint;
+                       *ledger_idx_nonvolatile_compressed = task_ledgers.neural_nofootprint_compressed;
+               }
+               break;
        default:
                panic("%s: object %p has unsupported ledger_tag %d\n",
                    __FUNCTION__, object, object->vo_ledger_tag);
@@ -8139,7 +8179,8 @@ vm_object_ownership_change(
        vm_object_t     object,
        int             new_ledger_tag,
        task_t          new_owner,
-       boolean_t       task_objq_locked)
+       int             new_ledger_flags,
+       boolean_t       old_task_objq_locked)
 {
        int             old_ledger_tag;
        task_t          old_owner;
@@ -8151,14 +8192,84 @@ vm_object_ownership_change(
        int             ledger_idx_nonvolatile_compressed;
        int             ledger_idx;
        int             ledger_idx_compressed;
-       boolean_t       do_footprint;
+       boolean_t       do_footprint, old_no_footprint, new_no_footprint;
+       boolean_t       new_task_objq_locked;
 
        vm_object_lock_assert_exclusive(object);
-       assert(object->internal);
+
+       if (!object->internal) {
+               return KERN_INVALID_ARGUMENT;
+       }
+       if (new_ledger_tag == VM_LEDGER_TAG_NONE &&
+           object->purgable == VM_PURGABLE_DENY) {
+               /* non-purgeable memory must have a valid non-zero ledger tag */
+               return KERN_INVALID_ARGUMENT;
+       }
+       if (new_ledger_tag < 0 ||
+           new_ledger_tag > VM_LEDGER_TAG_MAX) {
+               return KERN_INVALID_ARGUMENT;
+       }
+       if (new_ledger_flags & ~VM_LEDGER_FLAGS) {
+               return KERN_INVALID_ARGUMENT;
+       }
+       if (object->vo_ledger_tag == VM_LEDGER_TAG_NONE &&
+           object->purgable == VM_PURGABLE_DENY) {
+               /*
+                * This VM object is neither ledger-tagged nor purgeable.
+                * We can convert it to "ledger tag" ownership iff it
+                * has not been used at all yet (no resident pages and
+                * no pager) and it's going to be assigned to a valid task.
+                */
+               if (object->resident_page_count != 0 ||
+                   object->pager != NULL ||
+                   object->pager_created ||
+                   object->ref_count != 1 ||
+                   object->vo_owner != TASK_NULL ||
+                   object->copy_strategy != MEMORY_OBJECT_COPY_NONE ||
+                   new_owner == TASK_NULL) {
+                       return KERN_FAILURE;
+               }
+       }
+
+       if (new_ledger_flags & VM_LEDGER_FLAG_NO_FOOTPRINT) {
+               new_no_footprint = TRUE;
+       } else {
+               new_no_footprint = FALSE;
+       }
+#if __arm64__
+       if (!new_no_footprint &&
+           object->purgable != VM_PURGABLE_DENY &&
+           new_owner != TASK_NULL &&
+           new_owner != VM_OBJECT_OWNER_DISOWNED &&
+           new_owner->task_legacy_footprint) {
+               /*
+                * This task has been granted "legacy footprint" and should
+                * not be charged for its IOKit purgeable memory.  Since we
+                * might now change the accounting of such memory to the
+                * "graphics" ledger, for example, give it the "no footprint"
+                * option.
+                */
+               new_no_footprint = TRUE;
+       }
+#endif /* __arm64__ */
+       assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
+       assert(object->shadow == VM_OBJECT_NULL);
+       assert(object->copy == VM_OBJECT_NULL);
 
        old_ledger_tag = object->vo_ledger_tag;
+       old_no_footprint = object->vo_no_footprint;
        old_owner = VM_OBJECT_OWNER(object);
 
+       DTRACE_VM7(object_ownership_change,
+           vm_object_t, object,
+           task_t, old_owner,
+           int, old_ledger_tag,
+           int, old_no_footprint,
+           task_t, new_owner,
+           int, new_ledger_tag,
+           int, new_no_footprint);
+
+       assert(object->internal);
        resident_count = object->resident_page_count - object->wired_page_count;
        wired_count = object->wired_page_count;
        compressed_count = vm_compressor_pager_get_count(object->pager);
@@ -8169,8 +8280,9 @@ vm_object_ownership_change(
        if (old_owner != TASK_NULL &&
            ((old_owner != new_owner)           /* new owner ... */
            ||                                  /* ... or ... */
-           (old_ledger_tag &&                  /* ... new ledger */
-           old_ledger_tag != new_ledger_tag))) {
+           (old_no_footprint != new_no_footprint) /* new "no_footprint" */
+           ||                                  /* ... or ... */
+           old_ledger_tag != new_ledger_tag)) { /* ... new ledger */
                /*
                 * Take this object off of the old owner's ledgers.
                 */
@@ -8236,10 +8348,11 @@ vm_object_ownership_change(
                        /* remove object from old_owner's list of owned objects */
                        DTRACE_VM2(object_owner_remove,
                            vm_object_t, object,
-                           task_t, new_owner);
-                       if (!task_objq_locked) {
+                           task_t, old_owner);
+                       if (!old_task_objq_locked) {
                                task_objq_lock(old_owner);
                        }
+                       old_owner->task_owned_objects--;
                        queue_remove(&old_owner->task_objq, object,
                            vm_object_t, task_objq);
                        switch (object->purgable) {
@@ -8255,7 +8368,7 @@ vm_object_ownership_change(
                        default:
                                break;
                        }
-                       if (!task_objq_locked) {
+                       if (!old_task_objq_locked) {
                                task_objq_unlock(old_owner);
                        }
                }
@@ -8264,12 +8377,49 @@ vm_object_ownership_change(
        /*
         * Switch to new ledger tag and/or owner.
         */
+
+       new_task_objq_locked = FALSE;
+       if (new_owner != old_owner &&
+           new_owner != TASK_NULL &&
+           new_owner != VM_OBJECT_OWNER_DISOWNED) {
+               /*
+                * If the new owner is not accepting new objects ("disowning"),
+                * the object becomes "disowned" and will be added to
+                * the kernel's task_objq.
+                *
+                * Check first without locking, to avoid blocking while the
+                * task is disowning its objects.
+                */
+               if (new_owner->task_objects_disowning) {
+                       new_owner = VM_OBJECT_OWNER_DISOWNED;
+               } else {
+                       task_objq_lock(new_owner);
+                       /* check again now that we have the lock */
+                       if (new_owner->task_objects_disowning) {
+                               new_owner = VM_OBJECT_OWNER_DISOWNED;
+                               task_objq_unlock(new_owner);
+                       } else {
+                               new_task_objq_locked = TRUE;
+                       }
+               }
+       }
+
        object->vo_ledger_tag = new_ledger_tag;
        object->vo_owner = new_owner;
+       object->vo_no_footprint = new_no_footprint;
 
        if (new_owner == VM_OBJECT_OWNER_DISOWNED) {
+               /*
+                * Disowned objects are added to the kernel's task_objq but
+                * are marked as owned by "VM_OBJECT_OWNER_DISOWNED" to
+                * differentiate them from objects intentionally owned by
+                * the kernel.
+                */
                assert(old_owner != kernel_task);
                new_owner = kernel_task;
+               assert(!new_task_objq_locked);
+               task_objq_lock(new_owner);
+               new_task_objq_locked = TRUE;
        }
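
The check-then-lock-then-recheck sequence above exists because task_objects_disowning can be set while we sleep on task_objq_lock; the unlocked first read is only an optimization to avoid blocking against a task that is mid-disown. The same idiom in a standalone pthread sketch (struct and names hypothetical; the unlocked read is advisory, as in the kernel code):

    #include <pthread.h>
    #include <stdbool.h>

    struct owner {
        pthread_mutex_t objq_lock;
        bool            disowning;
    };

    /* Returns true with objq_lock held; false if the owner won't accept. */
    static bool lock_owner_if_accepting(struct owner *o) {
        if (o->disowning)                  /* cheap unlocked check first */
            return false;
        pthread_mutex_lock(&o->objq_lock);
        if (o->disowning) {                /* authoritative re-check under the lock */
            pthread_mutex_unlock(&o->objq_lock);
            return false;
        }
        return true;                       /* caller must unlock later */
    }

    int main(void) {
        struct owner o = { PTHREAD_MUTEX_INITIALIZER, false };
        if (lock_owner_if_accepting(&o))
            pthread_mutex_unlock(&o.objq_lock);
        return 0;
    }
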
 
        /*
@@ -8278,8 +8428,9 @@ vm_object_ownership_change(
        if (new_owner != TASK_NULL &&
            ((new_owner != old_owner)           /* new owner ... */
            ||                                  /* ... or ... */
-           (new_ledger_tag &&                  /* ... new ledger */
-           new_ledger_tag != old_ledger_tag))) {
+           (new_no_footprint != old_no_footprint) /* ... new "no_footprint" */
+           ||                                  /* ... or ... */
+           new_ledger_tag != old_ledger_tag)) { /* ... new ledger */
                /*
                 * Add this object to the new owner's ledgers.
                 */
@@ -8346,7 +8497,8 @@ vm_object_ownership_change(
                        DTRACE_VM2(object_owner_add,
                            vm_object_t, object,
                            task_t, new_owner);
-                       task_objq_lock(new_owner);
+                       assert(new_task_objq_locked);
+                       new_owner->task_owned_objects++;
                        queue_enter(&new_owner->task_objq, object,
                            vm_object_t, task_objq);
                        switch (object->purgable) {
@@ -8362,9 +8514,100 @@ vm_object_ownership_change(
                        default:
                                break;
                        }
-                       task_objq_unlock(new_owner);
                }
        }
 
+       if (new_task_objq_locked) {
+               task_objq_unlock(new_owner);
+       }
+
        return KERN_SUCCESS;
 }
+
+void
+vm_owned_objects_disown(
+       task_t  task)
+{
+       vm_object_t     next_object;
+       vm_object_t     object;
+       int             collisions;
+       kern_return_t   kr;
+
+       if (task == NULL) {
+               return;
+       }
+
+       collisions = 0;
+
+again:
+       if (task->task_objects_disowned) {
+               /* task has already disowned its owned objects */
+               assert(task->task_volatile_objects == 0);
+               assert(task->task_nonvolatile_objects == 0);
+               assert(task->task_owned_objects == 0);
+               return;
+       }
+
+       task_objq_lock(task);
+
+       task->task_objects_disowning = TRUE;
+
+       for (object = (vm_object_t) queue_first(&task->task_objq);
+           !queue_end(&task->task_objq, (queue_entry_t) object);
+           object = next_object) {
+               if (task->task_nonvolatile_objects == 0 &&
+                   task->task_volatile_objects == 0 &&
+                   task->task_owned_objects == 0) {
+                       /* no more objects owned by "task" */
+                       break;
+               }
+
+               next_object = (vm_object_t) queue_next(&object->task_objq);
+
+#if DEBUG
+               assert(object->vo_purgeable_volatilizer == NULL);
+#endif /* DEBUG */
+               assert(object->vo_owner == task);
+               if (!vm_object_lock_try(object)) {
+                       task_objq_unlock(task);
+                       mutex_pause(collisions++);
+                       goto again;
+               }
+               /* transfer ownership to the kernel */
+               assert(VM_OBJECT_OWNER(object) != kernel_task);
+               kr = vm_object_ownership_change(
+                       object,
+                       object->vo_ledger_tag, /* unchanged */
+                       VM_OBJECT_OWNER_DISOWNED, /* new owner */
+                       0, /* new_ledger_flags */
+                       TRUE);  /* old_owner->task_objq locked */
+               assert(kr == KERN_SUCCESS);
+               assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED);
+               vm_object_unlock(object);
+       }
+
+       if (__improbable(task->task_volatile_objects != 0 ||
+           task->task_nonvolatile_objects != 0 ||
+           task->task_owned_objects != 0)) {
+               panic("%s(%p): volatile=%d nonvolatile=%d owned=%d q=%p q_first=%p q_last=%p",
+                   __FUNCTION__,
+                   task,
+                   task->task_volatile_objects,
+                   task->task_nonvolatile_objects,
+                   task->task_owned_objects,
+                   &task->task_objq,
+                   queue_first(&task->task_objq),
+                   queue_last(&task->task_objq));
+       }
+
+       /* there shouldn't be any objects owned by task now */
+       assert(task->task_volatile_objects == 0);
+       assert(task->task_nonvolatile_objects == 0);
+       assert(task->task_owned_objects == 0);
+       assert(task->task_objects_disowning);
+
+       /* and we don't need to try and disown again */
+       task->task_objects_disowned = TRUE;
+
+       task_objq_unlock(task);
+}
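
Because vm_owned_objects_disown() walks the queue while holding the task's queue lock, it can only try the object lock, never wait on it; on contention it drops the queue lock, backs off via mutex_pause(collisions++), and restarts the walk from scratch. A minimal pthread rendition of that shape (all names hypothetical, sched_yield standing in for mutex_pause):

    #include <pthread.h>
    #include <sched.h>

    static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t obj_lock   = PTHREAD_MUTEX_INITIALIZER;

    static void drain_one(void) {
        int collisions = 0;
    again:
        pthread_mutex_lock(&queue_lock);
        if (pthread_mutex_trylock(&obj_lock) != 0) {
            /* never sleep on obj_lock while holding queue_lock */
            pthread_mutex_unlock(&queue_lock);
            for (int i = 0; i <= collisions; i++)
                sched_yield();             /* crude stand-in for mutex_pause() */
            collisions++;
            goto again;
        }
        /* ... transfer ownership here, as the kernel code does ... */
        pthread_mutex_unlock(&obj_lock);
        pthread_mutex_unlock(&queue_lock);
    }

    int main(void) {
        drain_one();
        return 0;
    }
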
diff --git a/osfmk/vm/vm_object.h b/osfmk/vm/vm_object.h
index 3fef567bb0679504b12fb528a1d8ca9571a48a96..eedfb09e4fae6cb5bcbaecacf20b1a7f7ad4ba93 100644 (file)
@@ -128,7 +128,9 @@ struct vm_object_fault_info {
        /* boolean_t */ pmap_cs_associated:1,
        /* boolean_t */ mark_zf_absent:1,
        /* boolean_t */ batch_pmap_op:1,
-           __vm_object_fault_info_unused_bits:25;
+       /* boolean_t */ resilient_media:1,
+       /* boolean_t */ no_copy_on_read:1,
+           __vm_object_fault_info_unused_bits:23;
        int             pmap_options;
 };
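
Note the bookkeeping in this hunk: two new 1-bit flags are paid for by shrinking __vm_object_fault_info_unused_bits from 25 to 23, so the flag word stays exactly 32 bits. The same invariant in a toy struct (illustrative, not the kernel's; the size assertion holds on common ABIs):

    #include <assert.h>
    #include <stdint.h>

    struct toy_flags {
        uint32_t a:1, b:1, c:1,
                 resilient_media:1,    /* new bit ... */
                 no_copy_on_read:1,    /* ... and another */
                 unused:27;            /* 32 - 5 named bits */
    };

    int main(void) {
        assert(sizeof(struct toy_flags) == 4);   /* still one 32-bit word */
        return 0;
    }
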
 
@@ -362,8 +364,8 @@ struct vm_object {
 #else /* VM_OBJECT_ACCESS_TRACKING */
        __unused_access_tracking:1,
 #endif /* VM_OBJECT_ACCESS_TRACKING */
-       vo_ledger_tag:2,
-           __object2_unused_bits:2;            /* for expansion */
+       vo_ledger_tag:3,
+           vo_no_footprint:1;
 
 #if VM_OBJECT_ACCESS_TRACKING
        uint32_t        access_tracking_reads;
@@ -407,12 +409,6 @@ struct vm_object {
 #endif /* DEBUG */
 };
 
-/* values for object->vo_ledger_tag */
-#define VM_OBJECT_LEDGER_TAG_NONE       0
-#define VM_OBJECT_LEDGER_TAG_NETWORK    1
-#define VM_OBJECT_LEDGER_TAG_MEDIA      2
-#define VM_OBJECT_LEDGER_TAG_RESERVED   3
-
 #define VM_OBJECT_PURGEABLE_FAULT_ERROR(object)                         \
        ((object)->volatile_fault &&                                    \
         ((object)->purgable == VM_PURGABLE_VOLATILE ||                 \
@@ -892,9 +888,9 @@ __private_extern__ void         vm_object_reap_pages(
 
 #if CONFIG_FREEZE
 
-__private_extern__ void
+__private_extern__ uint32_t
 vm_object_compressed_freezer_pageout(
-       vm_object_t     object);
+       vm_object_t     object, uint32_t dirty_budget);
 
 __private_extern__ void
 vm_object_compressed_freezer_done(
@@ -1203,8 +1199,9 @@ extern void     vm_object_ledger_tag_ledgers(
        boolean_t *do_footprint);
 extern kern_return_t vm_object_ownership_change(
        vm_object_t object,
-       int ledger_tag,
-       task_t owner,
+       int new_ledger_tag,
+       task_t new_owner,
+       int new_ledger_flags,
        boolean_t task_objq_locked);
 
 #endif  /* _VM_VM_OBJECT_H_ */
diff --git a/osfmk/vm/vm_page.h b/osfmk/vm/vm_page.h
index 9e0304dbfb22b1873094c458ecd1722105aaa857..e9a3fbdf8ee7cac32ed7a654de8b6498e3522af6 100644 (file)
@@ -373,11 +373,14 @@ vm_page_pack_ptr(uintptr_t p)
 static inline uintptr_t
 vm_page_unpack_ptr(uintptr_t p)
 {
+       extern unsigned int vm_pages_count;
+
        if (!p) {
                return (uintptr_t)0;
        }
 
        if (p & VM_PACKED_FROM_VM_PAGES_ARRAY) {
+               assert((uint32_t)(p & ~VM_PACKED_FROM_VM_PAGES_ARRAY) < vm_pages_count);
                return (uintptr_t)(&vm_pages[(uint32_t)(p & ~VM_PACKED_FROM_VM_PAGES_ARRAY)]);
        }
        return (p << VM_PACKED_POINTER_SHIFT) + (uintptr_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS;
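
A packed page pointer fits in 32 bits in one of two encodings: an index into the global vm_pages array when VM_PACKED_FROM_VM_PAGES_ARRAY is set, or a kernel address stored right-shifted by VM_PACKED_POINTER_SHIFT relative to VM_MIN_KERNEL_AND_KEXT_ADDRESS. The added assert bounds-checks the index form against the newly-externed vm_pages_count. A toy model with invented constants and sizes:

    #include <assert.h>
    #include <stdint.h>

    #define FROM_ARRAY 0x80000000u   /* stand-in for VM_PACKED_FROM_VM_PAGES_ARRAY */
    #define PTR_SHIFT  6             /* stand-in for VM_PACKED_POINTER_SHIFT */

    static struct page { int dummy; } pages[1024];
    static uint32_t  pages_count = 1024;
    static uintptr_t kernel_base;    /* stand-in for VM_MIN_KERNEL_AND_KEXT_ADDRESS */

    static struct page *unpack(uint32_t p) {
        if (p == 0)
            return 0;
        if (p & FROM_ARRAY) {
            assert((p & ~FROM_ARRAY) < pages_count);   /* the new sanity check */
            return &pages[p & ~FROM_ARRAY];
        }
        return (struct page *)(((uintptr_t)p << PTR_SHIFT) + kernel_base);
    }

    int main(void) { return unpack(FROM_ARRAY | 7) == &pages[7] ? 0 : 1; }
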
@@ -1151,9 +1154,36 @@ unsigned int    vm_page_inactive_count; /* How many pages are inactive? */
 extern
 unsigned int    vm_page_secluded_count; /* How many pages are secluded? */
 extern
-unsigned int    vm_page_secluded_count_free;
+unsigned int    vm_page_secluded_count_free; /* how many of them are free? */
 extern
-unsigned int    vm_page_secluded_count_inuse;
+unsigned int    vm_page_secluded_count_inuse; /* how many of them are in use? */
+/*
+ * We keep filling the secluded pool with new eligible pages and
+ * we can overshoot our target by a lot.
+ * When there's memory pressure, vm_pageout_scan() will re-balance the queues,
+ * pushing the extra secluded pages to the active or free queue.
+ * Since these "over target" secluded pages are actually "available", jetsam
+ * should consider them as such, so make them visible to jetsam via the
+ * "vm_page_secluded_count_over_target" counter and update it whenever we
+ * update vm_page_secluded_count or vm_page_secluded_target.
+ */
+extern
+unsigned int    vm_page_secluded_count_over_target;
+#define VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE()                     \
+       MACRO_BEGIN                                                     \
+       if (vm_page_secluded_count > vm_page_secluded_target) {         \
+               vm_page_secluded_count_over_target =                    \
+                       (vm_page_secluded_count - vm_page_secluded_target); \
+       } else {                                                        \
+               vm_page_secluded_count_over_target = 0;                 \
+       }                                                               \
+       MACRO_END
+#define VM_PAGE_SECLUDED_COUNT_OVER_TARGET() vm_page_secluded_count_over_target
+#else /* CONFIG_SECLUDED_MEMORY */
+#define VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE() \
+       MACRO_BEGIN                                 \
+       MACRO_END
+#define VM_PAGE_SECLUDED_COUNT_OVER_TARGET() 0
 #endif /* CONFIG_SECLUDED_MEMORY */
 extern
 unsigned int    vm_page_cleaned_count; /* How many pages are in the clean queue? */
@@ -1195,6 +1225,8 @@ extern
 unsigned int    vm_page_gobble_count;
 extern
 unsigned int    vm_page_stolen_count;   /* Count of stolen pages not accounted in zones */
+extern
+unsigned int    vm_page_kern_lpage_count;   /* Count of large pages used in early boot */
 
 
 #if DEVELOPMENT || DEBUG
@@ -1453,6 +1485,7 @@ extern void memorystatus_pages_update(unsigned int pages_avail);
        memorystatus_pages_update(              \
                vm_page_pageable_external_count + \
                vm_page_free_count +            \
+               VM_PAGE_SECLUDED_COUNT_OVER_TARGET() + \
                (VM_DYNAMIC_PAGING_ENABLED() ? 0 : vm_page_purgeable_count) \
                ); \
        } while(0)
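
Taken together, the two hunks above make secluded pages beyond the target count toward the figure memorystatus_pages_update() reports, since those pages are reclaimable on demand. A worked example of the arithmetic with invented numbers:

    #include <stdio.h>

    int main(void) {
        unsigned secluded_count = 500, secluded_target = 300;
        unsigned over_target = secluded_count > secluded_target
                             ? secluded_count - secluded_target : 0;  /* 200 */
        unsigned pageable_external = 10000, free_count = 2000;
        printf("pages_avail = %u\n",
               pageable_external + free_count + over_target);        /* 12200 */
        return 0;
    }
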
diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c
index 6b6e3d04d6796dff5f35142bcf8b61a0012aacc8..21b7d3951387548463a4c6b08c533cb342c2ce29 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -87,7 +87,6 @@
 #include <kern/misc_protos.h>
 #include <kern/sched.h>
 #include <kern/thread.h>
-#include <kern/xpr.h>
 #include <kern/kalloc.h>
 #include <kern/policy_internal.h>
 #include <kern/thread_group.h>
@@ -137,12 +136,17 @@ extern unsigned int memorystatus_frozen_count;
 extern unsigned int memorystatus_suspended_count;
 extern vm_pressure_level_t memorystatus_vm_pressure_level;
 
+extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
+extern uint32_t memorystatus_jetsam_fg_band_waiters;
+
 void vm_pressure_response(void);
 extern void consider_vm_pressure_events(void);
 
 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
 #endif /* VM_PRESSURE_EVENTS */
 
+thread_t  vm_pageout_scan_thread = THREAD_NULL;
+boolean_t vps_dynamic_priority_enabled = FALSE;
 
 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
 #ifdef  CONFIG_EMBEDDED
@@ -306,9 +310,13 @@ extern void vm_pageout_scan(void);
 
 void vm_tests(void); /* forward */
 
+boolean_t vm_pageout_running = FALSE;
+
+uint32_t vm_page_upl_tainted = 0;
+uint32_t vm_page_iopl_tainted = 0;
+
 #if !CONFIG_EMBEDDED
 static boolean_t vm_pageout_waiter  = FALSE;
-static boolean_t vm_pageout_running = FALSE;
 #endif /* !CONFIG_EMBEDDED */
 
 
@@ -529,11 +537,6 @@ vm_pageclean_setup(
        assert(!m->vmp_cleaning);
 #endif
 
-       XPR(XPR_VM_PAGEOUT,
-           "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
-           VM_PAGE_OBJECT(m), m->vmp_offset, m,
-           new_m, new_offset);
-
        pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
 
        /*
@@ -589,10 +592,6 @@ vm_pageout_initialize_page(
        vm_object_offset_t      paging_offset;
        memory_object_t         pager;
 
-       XPR(XPR_VM_PAGEOUT,
-           "vm_pageout_initialize_page, page 0x%X\n",
-           m, 0, 0, 0, 0);
-
        assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
 
        object = VM_PAGE_OBJECT(m);
@@ -699,11 +698,6 @@ vm_pageout_cluster(vm_page_t m)
        vm_object_t     object = VM_PAGE_OBJECT(m);
        struct          vm_pageout_queue *q;
 
-
-       XPR(XPR_VM_PAGEOUT,
-           "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
-           object, m->vmp_offset, m, 0, 0);
-
        VM_PAGE_CHECK(m);
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
        vm_object_lock_assert_exclusive(object);
@@ -1741,694 +1735,1356 @@ update_vm_info(void)
 
 extern boolean_t hibernation_vmqueues_inspection;
 
-void
-vm_page_balance_inactive(int max_to_move)
-{
-       vm_page_t m;
+/*
+ * Return values for functions called by vm_pageout_scan
+ * that control its flow.
+ *
+ * PROCEED -- vm_pageout_scan will keep making forward progress.
+ * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
+ * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
+ */
 
-       LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
+#define VM_PAGEOUT_SCAN_PROCEED                 (0)
+#define VM_PAGEOUT_SCAN_DONE_RETURN             (1)
+#define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)
+
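
These three codes are how the newly extracted vps_* helpers steer the main loop of vm_pageout_scan() from outside it. A toy of the dispatch shape, with everything invented:

    #include <stdio.h>

    enum { PROCEED, DONE_RETURN, NEXT_ITERATION };

    static int toy_phase(int i) { return i == 2 ? NEXT_ITERATION : PROCEED; }

    int main(void) {
        for (int i = 0; i < 5; i++) {
            switch (toy_phase(i)) {
            case NEXT_ITERATION: continue;   /* restart the big loop */
            case DONE_RETURN:    return 0;   /* demand satisfied */
            case PROCEED:        break;      /* fall through to the next phase */
            }
            printf("phase work on iteration %d\n", i);
        }
        return 0;
    }
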
+/*
+ * This function is called only from vm_pageout_scan and
+ * it moves overflow secluded pages (one at a time) to the
+ * batched 'local' free Q or active Q.
+ */
+static void
+vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
+{
+#if CONFIG_SECLUDED_MEMORY
+       /*
+        * Deal with secluded_q overflow.
+        */
+       if (vm_page_secluded_count > vm_page_secluded_target) {
+               vm_page_t secluded_page;
 
-       if (hibernation_vmqueues_inspection == TRUE) {
                /*
-                * It is likely that the hibernation code path is
-                * dealing with these very queues as we are about
-                * to move pages around in/from them and completely
-                * change the linkage of the pages.
-                *
-                * And so we skip the rebalancing of these queues.
+                * SECLUDED_AGING_BEFORE_ACTIVE:
+                * Excess secluded pages go to the active queue and
+                * will later go to the inactive queue.
                 */
-               return;
+               assert((vm_page_secluded_count_free +
+                   vm_page_secluded_count_inuse) ==
+                   vm_page_secluded_count);
+               secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
+               assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
+
+               vm_page_queues_remove(secluded_page, FALSE);
+               assert(!secluded_page->vmp_fictitious);
+               assert(!VM_PAGE_WIRED(secluded_page));
+
+               if (secluded_page->vmp_object == 0) {
+                       /* transfer to free queue */
+                       assert(secluded_page->vmp_busy);
+                       secluded_page->vmp_snext = *local_freeq;
+                       *local_freeq = secluded_page;
+                       *local_freed += 1;
+               } else {
+                       /* transfer to head of active queue */
+                       vm_page_enqueue_active(secluded_page, FALSE);
+                       secluded_page = VM_PAGE_NULL;
+               }
        }
-       vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
-           vm_page_inactive_count +
-           vm_page_speculative_count);
+#else /* CONFIG_SECLUDED_MEMORY */
 
-       while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
-               VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
-
-               m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
+#pragma unused(local_freeq)
+#pragma unused(local_freed)
 
-               assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
-               assert(!m->vmp_laundry);
-               assert(VM_PAGE_OBJECT(m) != kernel_object);
-               assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
+       return;
 
-               DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
+#endif /* CONFIG_SECLUDED_MEMORY */
+}
 
-               /*
-                * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
-                *
-                * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
-                * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
-                * new reference happens. If no futher references happen on the page after that remote TLB flushes
-                * new reference happens. If no further references happen on the page after that remote TLB flushes
-                * by pageout_scan, which is just fine since the last reference would have happened quite far
-                * in the past (TLB caches don't hang around for very long), and of course could just as easily
-                * have happened before we moved the page
-                */
-               if (m->vmp_pmapped == TRUE) {
-                       pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
-               }
+/*
+ * This function is called only from vm_pageout_scan and
+ * it initializes the loop targets for vm_pageout_scan().
+ */
+static void
+vps_init_page_targets(void)
+{
+       /*
+        * LD TODO: Other page targets should be calculated here too.
+        */
+       vm_page_anonymous_min = vm_page_inactive_target / 20;
 
-               /*
-                * The page might be absent or busy,
-                * but vm_page_deactivate can handle that.
-                * FALSE indicates that we don't want a H/W clear reference
-                */
-               vm_page_deactivate_internal(m, FALSE);
+       if (vm_pageout_state.vm_page_speculative_percentage > 50) {
+               vm_pageout_state.vm_page_speculative_percentage = 50;
+       } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
+               vm_pageout_state.vm_page_speculative_percentage = 1;
        }
-}
 
+       vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
+           vm_page_inactive_count);
+}
 
 /*
- *     vm_pageout_scan does the dirty work for the pageout daemon.
- *     It returns with both vm_page_queue_free_lock and vm_page_queue_lock
- *     held and vm_page_free_wanted == 0.
+ * This function is called only from vm_pageout_scan and
+ * it purges a single VM object at a time and will either
+ * make vm_pageout_scan() restart the loop or keep moving forward.
  */
-void
-vm_pageout_scan(void)
+static int
+vps_purge_object()
 {
-       unsigned int loop_count = 0;
-       unsigned int inactive_burst_count = 0;
-       unsigned int reactivated_this_call;
-       unsigned int reactivate_limit;
-       vm_page_t   local_freeq = NULL;
-       int         local_freed = 0;
-       int         delayed_unlock;
-       int         delayed_unlock_limit = 0;
-       int         refmod_state = 0;
-       int     vm_pageout_deadlock_target = 0;
-       struct  vm_pageout_queue *iq;
-       struct  vm_pageout_queue *eq;
-       struct  vm_speculative_age_q *sq;
-       struct  flow_control    flow_control = { 0, { 0, 0 } };
-       boolean_t inactive_throttled = FALSE;
-       mach_timespec_t ts;
-       unsigned        int msecs = 0;
-       vm_object_t     object = NULL;
-       uint32_t        inactive_reclaim_run;
-       boolean_t       exceeded_burst_throttle;
-       boolean_t       grab_anonymous = FALSE;
-       boolean_t       force_anonymous = FALSE;
-       boolean_t       force_speculative_aging = FALSE;
-       int             anons_grabbed = 0;
-       int             page_prev_q_state = 0;
-#if CONFIG_BACKGROUND_QUEUE
-       boolean_t       page_from_bg_q = FALSE;
-#endif
-       int             cache_evict_throttle = 0;
-       uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
-       uint32_t        inactive_external_count;
-       int             force_purge = 0;
-       int             divisor;
-#define DELAY_SPECULATIVE_AGE   1000
-       int             delay_speculative_age = 0;
-       vm_object_t     m_object = VM_OBJECT_NULL;
+       int             force_purge;
+
+       assert(available_for_purge >= 0);
+       force_purge = 0; /* no force-purging */
 
 #if VM_PRESSURE_EVENTS
        vm_pressure_level_t pressure_level;
-#endif /* VM_PRESSURE_EVENTS */
-
-       VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
-           vm_pageout_vminfo.vm_pageout_freed_speculative,
-           vm_pageout_state.vm_pageout_inactive_clean,
-           vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
-           vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
 
-       flow_control.state = FCS_IDLE;
-       iq = &vm_pageout_queue_internal;
-       eq = &vm_pageout_queue_external;
-       sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
+       pressure_level = memorystatus_vm_pressure_level;
 
+       if (pressure_level > kVMPressureNormal) {
+               if (pressure_level >= kVMPressureCritical) {
+                       force_purge = vm_pageout_state.memorystatus_purge_on_critical;
+               } else if (pressure_level >= kVMPressureUrgent) {
+                       force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
+               } else if (pressure_level >= kVMPressureWarning) {
+                       force_purge = vm_pageout_state.memorystatus_purge_on_warning;
+               }
+       }
+#endif /* VM_PRESSURE_EVENTS */
 
-       XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
+       if (available_for_purge || force_purge) {
+               memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
 
-       /* Ask the pmap layer to return any pages it no longer needs. */
-       uint64_t pmap_wired_pages_freed = pmap_release_pages_fast();
+               VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
+               if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
+                       VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
+                       VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
+                       memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
 
-       vm_page_lock_queues();
+                       return VM_PAGEOUT_SCAN_NEXT_ITERATION;
+               }
+               VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
+               memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
+       }
 
-       vm_page_wire_count -= pmap_wired_pages_freed;
+       return VM_PAGEOUT_SCAN_PROCEED;
+}
 
-       delayed_unlock = 1;
+/*
+ * This function is called only from vm_pageout_scan and
+ * it will try to age the next speculative Q if the oldest
+ * one is empty.
+ */
+static int
+vps_age_speculative_queue(boolean_t force_speculative_aging)
+{
+#define DELAY_SPECULATIVE_AGE   1000
 
        /*
-        *      Calculate the max number of referenced pages on the inactive
-        *      queue that we will reactivate.
+        * try to pull pages from the aging bins...
+        * see vm_page.h for an explanation of how
+        * this mechanism works
         */
-       reactivated_this_call = 0;
-       reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
-           vm_page_inactive_count);
-       inactive_reclaim_run = 0;
+       boolean_t                       can_steal = FALSE;
+       int                             num_scanned_queues;
+       static int                      delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop. */
+       mach_timespec_t                 ts;
+       struct vm_speculative_age_q     *aq;
+       struct vm_speculative_age_q     *sq;
 
-       vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
+       sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
 
-       /*
-        *      We must limit the rate at which we send pages to the pagers
-        *      so that we don't tie up too many pages in the I/O queues.
-        *      We implement a throttling mechanism using the laundry count
-        *      to limit the number of pages outstanding to the default
-        *      and external pagers.  We can bypass the throttles and look
-        *      for clean pages if the pageout queues don't drain in a timely
-        *      fashion since this may indicate that the pageout paths are
-        *      stalled waiting for memory, which only we can provide.
-        */
+       aq = &vm_page_queue_speculative[speculative_steal_index];
 
-Restart:
+       num_scanned_queues = 0;
+       while (vm_page_queue_empty(&aq->age_q) &&
+           num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
+               speculative_steal_index++;
 
-       assert(object == NULL);
-       assert(delayed_unlock != 0);
+               if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
+                       speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
+               }
 
-       vm_page_anonymous_min = vm_page_inactive_target / 20;
+               aq = &vm_page_queue_speculative[speculative_steal_index];
+       }
 
-       if (vm_pageout_state.vm_page_speculative_percentage > 50) {
-               vm_pageout_state.vm_page_speculative_percentage = 50;
-       } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
-               vm_pageout_state.vm_page_speculative_percentage = 1;
+       if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
+               /*
+                * XXX We've scanned all the speculative
+                * queues but still haven't found one
+                * that is not empty, even though
+                * vm_page_speculative_count is not 0.
+                */
+               if (!vm_page_queue_empty(&sq->age_q)) {
+                       return VM_PAGEOUT_SCAN_NEXT_ITERATION;
+               }
+#if DEVELOPMENT || DEBUG
+               panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
+#endif
+               /* readjust... */
+               vm_page_speculative_count = 0;
+               /* ... and continue */
+               return VM_PAGEOUT_SCAN_NEXT_ITERATION;
        }
 
-       vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
-           vm_page_inactive_count);
+       if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
+               can_steal = TRUE;
+       } else {
+               if (!delay_speculative_age) {
+                       mach_timespec_t ts_fully_aged;
 
-       for (;;) {
-               vm_page_t m;
+                       ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
+                       ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
+                           * 1000 * NSEC_PER_USEC;
 
-               DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
+                       ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
 
-               if (vm_upl_wait_for_pages < 0) {
-                       vm_upl_wait_for_pages = 0;
+                       clock_sec_t sec;
+                       clock_nsec_t nsec;
+                       clock_get_system_nanotime(&sec, &nsec);
+                       ts.tv_sec = (unsigned int) sec;
+                       ts.tv_nsec = nsec;
+
+                       if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
+                               can_steal = TRUE;
+                       } else {
+                               delay_speculative_age++;
+                       }
+               } else {
+                       delay_speculative_age++;
+                       if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
+                               delay_speculative_age = 0;
+                       }
                }
+       }
+       if (can_steal == TRUE) {
+               vm_page_speculate_ageit(aq);
+       }
 
-               delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
+       return VM_PAGEOUT_SCAN_PROCEED;
+}
 
-               if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
-                       delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
+/*
+ * This function is called only from vm_pageout_scan and
+ * it evicts a single VM object from the cache.
+ */
+static inline int
+vps_object_cache_evict(vm_object_t *object_to_unlock)
+{
+       static int                      cache_evict_throttle = 0;
+       struct vm_speculative_age_q     *sq;
+
+       sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
+
+       if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
+               int     pages_evicted;
+
+               if (*object_to_unlock != NULL) {
+                       vm_object_unlock(*object_to_unlock);
+                       *object_to_unlock = NULL;
                }
+               KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
 
-#if CONFIG_SECLUDED_MEMORY
-               /*
-                * Deal with secluded_q overflow.
-                */
-               if (vm_page_secluded_count > vm_page_secluded_target) {
-                       vm_page_t secluded_page;
+               pages_evicted = vm_object_cache_evict(100, 10);
+
+               KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
+
+               if (pages_evicted) {
+                       vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
+
+                       VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
+                           vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
+                       memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
 
                        /*
-                        * SECLUDED_AGING_BEFORE_ACTIVE:
-                        * Excess secluded pages go to the active queue and
-                        * will later go to the inactive queue.
+                        * we just freed up to 100 pages,
+                        * so go back to the top of the main loop
+                        * and re-evaluate the memory situation
                         */
-                       assert((vm_page_secluded_count_free +
-                           vm_page_secluded_count_inuse) ==
-                           vm_page_secluded_count);
-                       secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
-                       assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
-
-                       vm_page_queues_remove(secluded_page, FALSE);
-                       assert(!secluded_page->vmp_fictitious);
-                       assert(!VM_PAGE_WIRED(secluded_page));
-
-                       if (secluded_page->vmp_object == 0) {
-                               /* transfer to free queue */
-                               assert(secluded_page->vmp_busy);
-                               secluded_page->vmp_snext = local_freeq;
-                               local_freeq = secluded_page;
-                               local_freed++;
-                       } else {
-                               /* transfer to head of active queue */
-                               vm_page_enqueue_active(secluded_page, FALSE);
-                               secluded_page = VM_PAGE_NULL;
-                       }
+                       return VM_PAGEOUT_SCAN_NEXT_ITERATION;
+               } else {
+                       cache_evict_throttle = 1000;
                }
-#endif /* CONFIG_SECLUDED_MEMORY */
+       }
+       if (cache_evict_throttle) {
+               cache_evict_throttle--;
+       }
 
-               assert(delayed_unlock);
+       return VM_PAGEOUT_SCAN_PROCEED;
+}
+
+
+/*
+ * This function is called only from vm_pageout_scan and
+ * it calculates the minimum filecache size that needs to be maintained
+ * as we start to steal pages.
+ */
+static void
+vps_calculate_filecache_min(void)
+{
+       int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
 
+#if CONFIG_JETSAM
+       /*
+        * don't let the filecache_min fall below 15% of available memory
+        * on systems with an active compressor that isn't nearing its
+        * limits w/r to accepting new data
+        *
+        * on systems w/o the compressor/swapper, the filecache is always
+        * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
+        * since most (if not all) of the anonymous pages are in the
+        * throttled queue (which isn't counted as available) which
+        * effectively disables this filter
+        */
+       if (vm_compressor_low_on_space() || divisor == 0) {
+               vm_pageout_state.vm_page_filecache_min = 0;
+       } else {
+               vm_pageout_state.vm_page_filecache_min =
+                   ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
+       }
+#else
+       if (vm_compressor_out_of_space() || divisor == 0) {
+               vm_pageout_state.vm_page_filecache_min = 0;
+       } else {
                /*
-                * maintain our balance
+                * don't let the filecache_min fall below the specified critical level
                 */
-               vm_page_balance_inactive(1);
+               vm_pageout_state.vm_page_filecache_min =
+                   ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
+       }
+#endif
+       if (vm_page_free_count < (vm_page_free_reserved / 4)) {
+               vm_pageout_state.vm_page_filecache_min = 0;
+       }
+}
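+
+/*
+ * Illustrative arithmetic for the formula above (the divisor value is
+ * assumed here, not taken from this change): with a divisor of 66,
+ *
+ *   vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY * 10) / 66
+ *
+ * which is roughly 15% of available memory, matching the floor described
+ * in the CONFIG_JETSAM comment; a larger divisor lowers the floor, and a
+ * divisor of 0 disables it entirely.
+ */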
 
+/*
+ * This function is called only from vm_pageout_scan and
+ * it updates the flow control time used to detect if VM pageout
+ * scan isn't making progress.
+ */
+static void
+vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
+{
+       mach_timespec_t ts;
+       clock_sec_t sec;
+       clock_nsec_t nsec;
 
-               /**********************************************************************
-               * above this point we're playing with the active and secluded queues
-               * below this point we're playing with the throttling mechanisms
-               * and the inactive queue
-               **********************************************************************/
+       ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
+       ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
+       clock_get_system_nanotime(&sec, &nsec);
+       flow_control->ts.tv_sec = (unsigned int) sec;
+       flow_control->ts.tv_nsec = nsec;
+       ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
 
-               if (vm_page_free_count + local_freed >= vm_page_free_target) {
-                       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
+       flow_control->state = FCS_DELAYED;
 
-                       vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
-                           VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
-                       /*
-                        * make sure the pageout I/O threads are running
-                        * throttled in case there are still requests
-                        * in the laundry... since we have met our targets
-                        * we don't need the laundry to be cleaned in a timely
-                        * fashion... so let's avoid interfering with foreground
-                        * activity
-                        */
-                       vm_pageout_adjust_eq_iothrottle(eq, TRUE);
+       vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
+}
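+
+/*
+ * Example of the ms-to-timespec conversion above (the wait value is
+ * assumed for illustration): with vm_pageout_deadlock_wait = 1500 ms,
+ *
+ *   ts.tv_sec  = 1500 / 1000 = 1
+ *   ts.tv_nsec = (1500 % 1000) * 1000 * NSEC_PER_USEC = 500,000,000
+ *
+ * i.e. the deadline is set 1.5 seconds past the current system time.
+ */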
 
-                       lck_mtx_lock(&vm_page_queue_free_lock);
+/*
+ * This function is called only from vm_pageout_scan and
+ * it implements the flow control logic of VM pageout scan, which
+ * controls whether it should block and for how long.
+ * Any blocking of vm_pageout_scan happens ONLY in this function.
+ */
+static int
+vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
+    vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
+{
+       boolean_t       exceeded_burst_throttle = FALSE;
+       unsigned int    msecs = 0;
+       uint32_t        inactive_external_count;
+       mach_timespec_t ts;
+       struct  vm_pageout_queue *iq;
+       struct  vm_pageout_queue *eq;
+       struct  vm_speculative_age_q *sq;
 
-                       if ((vm_page_free_count >= vm_page_free_target) &&
-                           (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
+       iq = &vm_pageout_queue_internal;
+       eq = &vm_pageout_queue_external;
+       sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
+
+       /*
+        * Sometimes we have to pause:
+        *      1) No inactive pages - nothing to do.
+        *      2) Loop control - no acceptable pages found on the inactive queue
+        *         within the last vm_pageout_burst_inactive_throttle iterations
+        *      3) Flow control - default pageout queue is full
+        */
+       if (vm_page_queue_empty(&vm_page_queue_inactive) &&
+           vm_page_queue_empty(&vm_page_queue_anonymous) &&
+           vm_page_queue_empty(&vm_page_queue_cleaned) &&
+           vm_page_queue_empty(&sq->age_q)) {
+               VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
+               msecs = vm_pageout_state.vm_pageout_empty_wait;
+       } else if (inactive_burst_count >=
+           MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
+           (vm_page_inactive_count +
+           vm_page_speculative_count))) {
+               VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
+               msecs = vm_pageout_state.vm_pageout_burst_wait;
+
+               exceeded_burst_throttle = TRUE;
+       } else if (VM_PAGE_Q_THROTTLED(iq) &&
+           VM_DYNAMIC_PAGING_ENABLED()) {
+               clock_sec_t sec;
+               clock_nsec_t nsec;
+
+               switch (flow_control->state) {
+               case FCS_IDLE:
+                       if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
+                           vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
                                /*
-                                * done - we have met our target *and*
-                                * there is no one waiting for a page.
+                                * since the compressor is running independently of vm_pageout_scan
+                                * let's not wait for it just yet... as long as we have a healthy supply
+                                * of filecache pages to work with, let's keep stealing those.
                                 */
-return_from_scan:
-                               assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
-
-                               VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
-                                   vm_pageout_state.vm_pageout_inactive,
-                                   vm_pageout_state.vm_pageout_inactive_used, 0, 0);
-                               VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
-                                   vm_pageout_vminfo.vm_pageout_freed_speculative,
-                                   vm_pageout_state.vm_pageout_inactive_clean,
-                                   vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
-                                   vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
+                               inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
 
-                               return;
+                               if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
+                                   (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
+                                       *anons_grabbed = ANONS_GRABBED_LIMIT;
+                                       VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
+                                       return VM_PAGEOUT_SCAN_PROCEED;
+                               }
                        }
-                       lck_mtx_unlock(&vm_page_queue_free_lock);
-               }
 
-               /*
-                * Before anything, we check if we have any ripe volatile
-                * objects around. If so, try to purge the first object.
-                * If the purge fails, fall through to reclaim a page instead.
-                * If the purge succeeds, go back to the top and reevalute
-                * the new memory situation.
-                */
+                       vps_flow_control_reset_deadlock_timer(flow_control);
+                       msecs = vm_pageout_state.vm_pageout_deadlock_wait;
 
-               assert(available_for_purge >= 0);
-               force_purge = 0; /* no force-purging */
+                       break;
 
-#if VM_PRESSURE_EVENTS
-               pressure_level = memorystatus_vm_pressure_level;
+               case FCS_DELAYED:
+                       clock_get_system_nanotime(&sec, &nsec);
+                       ts.tv_sec = (unsigned int) sec;
+                       ts.tv_nsec = nsec;
 
-               if (pressure_level > kVMPressureNormal) {
-                       if (pressure_level >= kVMPressureCritical) {
-                               force_purge = vm_pageout_state.memorystatus_purge_on_critical;
-                       } else if (pressure_level >= kVMPressureUrgent) {
-                               force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
-                       } else if (pressure_level >= kVMPressureWarning) {
-                               force_purge = vm_pageout_state.memorystatus_purge_on_warning;
+                       if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
+                               /*
+                                * the pageout thread for the default pager is potentially
+                                * deadlocked since the
+                                * default pager queue has been throttled for more than the
+                                * allowable time... we need to move some clean pages or dirty
+                                * pages belonging to the external pagers if they aren't throttled
+                                * vm_page_free_wanted represents the number of threads currently
+                                * blocked waiting for pages... we'll move one page for each of
+                                * these plus a fixed amount to break the logjam... once we're done
+                                * moving this number of pages, we'll re-enter the FCS_DELAYED state
+                                * with a new timeout target since we have no way of knowing
+                                * whether we've broken the deadlock except through observation
+                                * of the queue associated with the default pager... we need to
+                                * stop moving pages and allow the system to run to see what
+                                * state it settles into.
+                                */
+
+                               *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
+                                   vm_page_free_wanted + vm_page_free_wanted_privileged;
+                               VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
+                               flow_control->state = FCS_DEADLOCK_DETECTED;
+                               thread_wakeup((event_t) &vm_pageout_garbage_collect);
+                               return VM_PAGEOUT_SCAN_PROCEED;
                        }
-               }
-#endif /* VM_PRESSURE_EVENTS */
+                       /*
+                        * just resniff instead of trying
+                        * to compute a new delay time... we're going to be
+                        * awakened immediately upon a laundry completion,
+                        * so we won't wait any longer than necessary
+                        */
+                       msecs = vm_pageout_state.vm_pageout_idle_wait;
+                       break;
 
-               if (available_for_purge || force_purge) {
-                       if (object != NULL) {
-                               vm_object_unlock(object);
-                               object = NULL;
+               case FCS_DEADLOCK_DETECTED:
+                       if (*vm_pageout_deadlock_target) {
+                               return VM_PAGEOUT_SCAN_PROCEED;
                        }
 
-                       memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
+                       vps_flow_control_reset_deadlock_timer(flow_control);
+                       msecs = vm_pageout_state.vm_pageout_deadlock_wait;
 
-                       VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
-                       if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
-                               VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
-                               VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
-                               memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
-                               continue;
-                       }
-                       VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
-                       memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
+                       break;
                }
+       } else {
+               /*
+                * No need to pause...
+                */
+               return VM_PAGEOUT_SCAN_PROCEED;
+       }
 
-               if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
-                       /*
-                        * try to pull pages from the aging bins...
-                        * see vm_page.h for an explanation of how
-                        * this mechanism works
-                        */
-                       struct vm_speculative_age_q     *aq;
-                       boolean_t       can_steal = FALSE;
-                       int num_scanned_queues;
+       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
 
-                       aq = &vm_page_queue_speculative[speculative_steal_index];
+       vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
+           VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
 
-                       num_scanned_queues = 0;
-                       while (vm_page_queue_empty(&aq->age_q) &&
-                           num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
-                               speculative_steal_index++;
+       if (vm_page_free_count >= vm_page_free_target) {
+               /*
+                * we're here because
+                * we're here because someone else freed up some pages
+                * while we had the queues unlocked above, and we've hit
+                * one of the 3 conditions that cause us to pause the
+                * pageout scan thread
+                * since we already have enough free pages,
+                * let's avoid stalling and return normally
+                *
+                * before we return, make sure the pageout I/O threads
+                * are running throttled in case there are still requests
+                * in the laundry... since we have enough free pages
+                * we don't need the laundry to be cleaned in a timely
+                * fashion... so let's avoid interfering with foreground
+                * activity
+                *
+                * we don't want to hold vm_page_queue_free_lock when
+                * calling vm_pageout_adjust_eq_iothrottle (since it
+                * may cause other locks to be taken), we do the initial
+                * check outside of the lock.  Once we take the lock,
+                * we recheck the condition since it may have changed.
+                * if it has, no problem, we will make the threads
+                * non-throttled before actually blocking
+                */
+               vm_pageout_adjust_eq_iothrottle(eq, TRUE);
+       }
+       lck_mtx_lock(&vm_page_queue_free_lock);
 
-                               if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
-                                       speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
-                               }
+       if (vm_page_free_count >= vm_page_free_target &&
+           (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
+               return VM_PAGEOUT_SCAN_DONE_RETURN;
+       }
+       lck_mtx_unlock(&vm_page_queue_free_lock);
 
-                               aq = &vm_page_queue_speculative[speculative_steal_index];
-                       }
+       if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
+               /*
+                * we're most likely about to block due to one of
+                * the 3 conditions that cause vm_pageout_scan to
+                * not be able to make forward progress w/r
+                * to providing new pages to the free queue,
+                * so unthrottle the I/O threads in case we
+                * have laundry to be cleaned... it needs
+                * to be completed ASAP.
+                *
+                * even if we don't block, we want the io threads
+                * running unthrottled since the sum of free +
+                * clean pages is still under our free target
+                */
+               vm_pageout_adjust_eq_iothrottle(eq, FALSE);
+       }
+       if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
+               /*
+                * if we get here we're below our free target and
+                * we're stalling due to a full laundry queue or
+                * we don't have any inactive pages other than
+                * those in the clean queue...
+                * however, we have pages on the clean queue that
+                * can be moved to the free queue, so let's not
+                * stall the pageout scan
+                */
+               flow_control->state = FCS_IDLE;
+               return VM_PAGEOUT_SCAN_PROCEED;
+       }
+       if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
+               flow_control->state = FCS_IDLE;
+               return VM_PAGEOUT_SCAN_PROCEED;
+       }
 
-                       if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
-                               /*
-                                * XXX We've scanned all the speculative
-                                * queues but still haven't found one
-                                * that is not empty, even though
-                                * vm_page_speculative_count is not 0.
-                                */
-                               if (!vm_page_queue_empty(&sq->age_q)) {
-                                       continue;
-                               }
-#if DEVELOPMENT || DEBUG
-                               panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
-#endif
-                               /* readjust... */
-                               vm_page_speculative_count = 0;
-                               /* ... and continue */
-                               continue;
-                       }
+       VM_CHECK_MEMORYSTATUS;
 
-                       if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
-                               can_steal = TRUE;
-                       } else {
-                               if (!delay_speculative_age) {
-                                       mach_timespec_t ts_fully_aged;
+       if (flow_control->state != FCS_IDLE) {
+               VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
+       }
 
-                                       ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
-                                       ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
-                                           * 1000 * NSEC_PER_USEC;
+       iq->pgo_throttled = TRUE;
+       assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
 
-                                       ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
+       counter(c_vm_pageout_scan_block++);
 
-                                       clock_sec_t sec;
-                                       clock_nsec_t nsec;
-                                       clock_get_system_nanotime(&sec, &nsec);
-                                       ts.tv_sec = (unsigned int) sec;
-                                       ts.tv_nsec = nsec;
+       vm_page_unlock_queues();
 
-                                       if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
-                                               can_steal = TRUE;
-                                       } else {
-                                               delay_speculative_age++;
-                                       }
-                               } else {
-                                       delay_speculative_age++;
-                                       if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
-                                               delay_speculative_age = 0;
-                                       }
-                               }
-                       }
-                       if (can_steal == TRUE) {
-                               vm_page_speculate_ageit(aq);
-                       }
-               }
-               force_speculative_aging = FALSE;
+       assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
 
-               if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
-                       int     pages_evicted;
+       VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
+           iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
+       memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
 
-                       if (object != NULL) {
-                               vm_object_unlock(object);
-                               object = NULL;
-                       }
-                       KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
+       thread_block(THREAD_CONTINUE_NULL);
 
-                       pages_evicted = vm_object_cache_evict(100, 10);
+       VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
+           iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
+       memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
 
-                       KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
+       vm_page_lock_queues();
 
-                       if (pages_evicted) {
-                               vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
+       iq->pgo_throttled = FALSE;
 
-                               VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
-                                   vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
-                               memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
+       vps_init_page_targets();
 
-                               /*
-                                * we just freed up to 100 pages,
-                                * so go back to the top of the main loop
-                                * and re-evaulate the memory situation
-                                */
-                               continue;
-                       } else {
-                               cache_evict_throttle = 1000;
-                       }
-               }
-               if (cache_evict_throttle) {
-                       cache_evict_throttle--;
-               }
+       return VM_PAGEOUT_SCAN_NEXT_ITERATION;
+}
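+
+/*
+ * Informal sketch of the flow-control state machine implemented above
+ * (derived from the code in this function, not normative):
+ *
+ *   FCS_IDLE --(internal queue throttled)----------> FCS_DELAYED
+ *   FCS_DELAYED --(deadlock timeout expired)-------> FCS_DEADLOCK_DETECTED
+ *   FCS_DEADLOCK_DETECTED --(relief target drained)-> FCS_DELAYED
+ *   FCS_DELAYED --(iq unthrottled or clean pages)---> FCS_IDLE
+ */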
 
-               divisor = vm_pageout_state.vm_page_filecache_min_divisor;
+/*
+ * This function is called only from vm_pageout_scan and
+ * it will find and return the most appropriate page to be
+ * reclaimed.
+ */
+static int
+vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
+    boolean_t *is_page_from_bg_q, unsigned int reactivated_this_call)
+{
+       vm_page_t                       m = NULL;
+       vm_object_t                     m_object = VM_OBJECT_NULL;
+       uint32_t                        inactive_external_count;
+       struct vm_speculative_age_q     *sq;
+       struct vm_pageout_queue         *iq;
+       int                             retval = VM_PAGEOUT_SCAN_PROCEED;
+
+       sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
+       iq = &vm_pageout_queue_internal;
+
+       while (1) {
+               *is_page_from_bg_q = FALSE;
+
+               m = NULL;
+               m_object = VM_OBJECT_NULL;
+
+               if (VM_DYNAMIC_PAGING_ENABLED()) {
+                       assert(vm_page_throttled_count == 0);
+                       assert(vm_page_queue_empty(&vm_page_queue_throttled));
+               }
 
-#if CONFIG_JETSAM
                /*
-                * don't let the filecache_min fall below 15% of available memory
-                * on systems with an active compressor that isn't nearing its
-                * limits w/r to accepting new data
-                *
-                * on systems w/o the compressor/swapper, the filecache is always
-                * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
-                * since most (if not all) of the anonymous pages are in the
-                * throttled queue (which isn't counted as available) which
-                * effectively disables this filter
+                * Try for a clean-queue inactive page.
+                * These are pages that vm_pageout_scan tried to steal earlier, but
+                * were dirty and had to be cleaned.  Pick them up now that they are clean.
                 */
-               if (vm_compressor_low_on_space() || divisor == 0) {
-                       vm_pageout_state.vm_page_filecache_min = 0;
-               } else {
-                       vm_pageout_state.vm_page_filecache_min =
-                           ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
-               }
-#else
-               if (vm_compressor_out_of_space() || divisor == 0) {
-                       vm_pageout_state.vm_page_filecache_min = 0;
-               } else {
-                       /*
-                        * don't let the filecache_min fall below the specified critical level
-                        */
-                       vm_pageout_state.vm_page_filecache_min =
-                           ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
-               }
-#endif
-               if (vm_page_free_count < (vm_page_free_reserved / 4)) {
-                       vm_pageout_state.vm_page_filecache_min = 0;
+               if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
+                       m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
+
+                       assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
+
+                       break;
                }
 
-               exceeded_burst_throttle = FALSE;
                /*
-                * Sometimes we have to pause:
-                *      1) No inactive pages - nothing to do.
-                *      2) Loop control - no acceptable pages found on the inactive queue
-                *         within the last vm_pageout_burst_inactive_throttle iterations
-                *      3) Flow control - default pageout queue is full
+                * The next most eligible pages are ones we paged in speculatively,
+                * but which have not yet been touched and have been aged out.
                 */
-               if (vm_page_queue_empty(&vm_page_queue_inactive) &&
-                   vm_page_queue_empty(&vm_page_queue_anonymous) &&
-                   vm_page_queue_empty(&vm_page_queue_cleaned) &&
-                   vm_page_queue_empty(&sq->age_q)) {
-                       VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
-                       msecs = vm_pageout_state.vm_pageout_empty_wait;
-                       goto vm_pageout_scan_delay;
-               } else if (inactive_burst_count >=
-                   MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
-                   (vm_page_inactive_count +
-                   vm_page_speculative_count))) {
-                       VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
-                       msecs = vm_pageout_state.vm_pageout_burst_wait;
-
-                       exceeded_burst_throttle = TRUE;
-                       goto vm_pageout_scan_delay;
-               } else if (VM_PAGE_Q_THROTTLED(iq) &&
-                   VM_DYNAMIC_PAGING_ENABLED()) {
-                       clock_sec_t sec;
-                       clock_nsec_t nsec;
+               if (!vm_page_queue_empty(&sq->age_q)) {
+                       m = (vm_page_t) vm_page_queue_first(&sq->age_q);
 
-                       switch (flow_control.state) {
-                       case FCS_IDLE:
-                               if ((vm_page_free_count + local_freed) < vm_page_free_target &&
-                                   vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
-                                       /*
-                                        * since the compressor is running independently of vm_pageout_scan
-                                        * let's not wait for it just yet... as long as we have a healthy supply
-                                        * of filecache pages to work with, let's keep stealing those.
-                                        */
-                                       inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
+                       assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
 
-                                       if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
-                                           (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
-                                               anons_grabbed = ANONS_GRABBED_LIMIT;
-                                               VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
-                                               goto consider_inactive;
-                                       }
-                               }
-reset_deadlock_timer:
-                               ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
-                               ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
-                               clock_get_system_nanotime(&sec, &nsec);
-                               flow_control.ts.tv_sec = (unsigned int) sec;
-                               flow_control.ts.tv_nsec = nsec;
-                               ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
-
-                               flow_control.state = FCS_DELAYED;
-                               msecs = vm_pageout_state.vm_pageout_deadlock_wait;
-
-                               vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
+                       if (!m->vmp_dirty || force_anonymous == FALSE) {
                                break;
+                       } else {
+                               m = NULL;
+                       }
+               }
 
-                       case FCS_DELAYED:
-                               clock_get_system_nanotime(&sec, &nsec);
-                               ts.tv_sec = (unsigned int) sec;
-                               ts.tv_nsec = nsec;
+#if CONFIG_BACKGROUND_QUEUE
+               if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
+                       vm_object_t     bg_m_object = NULL;
 
-                               if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
-                                       /*
-                                        * the pageout thread for the default pager is potentially
-                                        * deadlocked since the
-                                        * default pager queue has been throttled for more than the
-                                        * allowable time... we need to move some clean pages or dirty
-                                        * pages belonging to the external pagers if they aren't throttled
-                                        * vm_page_free_wanted represents the number of threads currently
-                                        * blocked waiting for pages... we'll move one page for each of
-                                        * these plus a fixed amount to break the logjam... once we're done
-                                        * moving this number of pages, we'll re-enter the FSC_DELAYED state
-                                        * with a new timeout target since we have no way of knowing
-                                        * whether we've broken the deadlock except through observation
-                                        * of the queue associated with the default pager... we need to
-                                        * stop moving pages and allow the system to run to see what
-                                        * state it settles into.
-                                        */
-                                       vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
-                                           vm_page_free_wanted + vm_page_free_wanted_privileged;
-                                       VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
-                                       flow_control.state = FCS_DEADLOCK_DETECTED;
-                                       thread_wakeup((event_t) &vm_pageout_garbage_collect);
-                                       goto consider_inactive;
-                               }
+                       m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
+
+                       bg_m_object = VM_PAGE_OBJECT(m);
+
+                       if (!VM_PAGE_PAGEABLE(m)) {
                                /*
-                                * just resniff instead of trying
-                                * to compute a new delay time... we're going to be
-                                * awakened immediately upon a laundry completion,
-                                * so we won't wait any longer than necessary
+                                * This page is on the background queue
+                                * but not on a pageable queue.  This is
+                                * likely a transient state and whoever
+                                * took it out of its pageable queue
+                                * will likely put it back on a pageable
+                                * queue soon but we can't deal with it
+                                * at this point, so let's ignore this
+                                * page.
                                 */
-                               msecs = vm_pageout_state.vm_pageout_idle_wait;
-                               break;
+                       } else if (force_anonymous == FALSE || bg_m_object->internal) {
+                               if (bg_m_object->internal &&
+                                   (VM_PAGE_Q_THROTTLED(iq) ||
+                                   vm_compressor_out_of_space() == TRUE ||
+                                   vm_page_free_count < (vm_page_free_reserved / 4))) {
+                                       vm_pageout_skipped_bq_internal++;
+                               } else {
+                                       *is_page_from_bg_q = TRUE;
 
-                       case FCS_DEADLOCK_DETECTED:
-                               if (vm_pageout_deadlock_target) {
-                                       goto consider_inactive;
+                                       if (bg_m_object->internal) {
+                                               vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
+                                       } else {
+                                               vm_pageout_vminfo.vm_pageout_considered_bq_external++;
+                                       }
+                                       break;
                                }
-                               goto reset_deadlock_timer;
                        }
-vm_pageout_scan_delay:
-                       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
+               }
+#endif /* CONFIG_BACKGROUND_QUEUE */
 
-                       vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
-                           VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
+               inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
 
-                       if (vm_page_free_count >= vm_page_free_target) {
-                               /*
-                                * we're here because
-                                *  1) someone else freed up some pages while we had
-                                *     the queues unlocked above
-                                * and we've hit one of the 3 conditions that
-                                * cause us to pause the pageout scan thread
-                                *
-                                * since we already have enough free pages,
-                                * let's avoid stalling and return normally
-                                *
-                                * before we return, make sure the pageout I/O threads
-                                * are running throttled in case there are still requests
-                                * in the laundry... since we have enough free pages
-                                * we don't need the laundry to be cleaned in a timely
-                                * fashion... so let's avoid interfering with foreground
-                                * activity
-                                *
-                                * we don't want to hold vm_page_queue_free_lock when
-                                * calling vm_pageout_adjust_eq_iothrottle (since it
-                                * may cause other locks to be taken), we do the intitial
-                                * check outside of the lock.  Once we take the lock,
-                                * we recheck the condition since it may have changed.
-                                * if it has, no problem, we will make the threads
-                                * non-throttled before actually blocking
-                                */
-                               vm_pageout_adjust_eq_iothrottle(eq, TRUE);
-                       }
-                       lck_mtx_lock(&vm_page_queue_free_lock);
+               if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
+                   (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
+                       *grab_anonymous = TRUE;
+                       *anons_grabbed = 0;
 
-                       if (vm_page_free_count >= vm_page_free_target &&
-                           (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
-                               goto return_from_scan;
-                       }
-                       lck_mtx_unlock(&vm_page_queue_free_lock);
+                       vm_pageout_vminfo.vm_pageout_skipped_external++;
+                       goto want_anonymous;
+               }
+               *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
 
-                       if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
-                               /*
-                                * we're most likely about to block due to one of
-                                * the 3 conditions that cause vm_pageout_scan to
-                                * not be able to make forward progress w/r
-                                * to providing new pages to the free queue,
-                                * so unthrottle the I/O threads in case we
-                                * have laundry to be cleaned... it needs
-                                * to be completed ASAP.
-                                *
-                                * even if we don't block, we want the io threads
-                                * running unthrottled since the sum of free +
-                                * clean pages is still under our free target
-                                */
-                               vm_pageout_adjust_eq_iothrottle(eq, FALSE);
-                       }
-                       if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
-                               /*
-                                * if we get here we're below our free target and
-                                * we're stalling due to a full laundry queue or
-                                * we don't have any inactive pages other then
-                                * those in the clean queue...
-                                * however, we have pages on the clean queue that
-                                * can be moved to the free queue, so let's not
-                                * stall the pageout scan
-                                */
-                               flow_control.state = FCS_IDLE;
-                               goto consider_inactive;
+#if CONFIG_JETSAM
+               /* If the file-backed pool has accumulated
+                * significantly more pages than the jetsam
+                * threshold, prefer to reclaim those
+                * inline to minimise compute overhead of reclaiming
+                * anonymous pages.
+                * This calculation does not account for the CPU local
+                * external page queues, as those are expected to be
+                * much smaller relative to the global pools.
+                */
+
+               struct vm_pageout_queue *eq = &vm_pageout_queue_external;
+
+               if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
+                       if (vm_page_pageable_external_count >
+                           vm_pageout_state.vm_page_filecache_min) {
+                               if ((vm_page_pageable_external_count *
+                                   vm_pageout_memorystatus_fb_factor_dr) >
+                                   (memorystatus_available_pages_critical *
+                                   vm_pageout_memorystatus_fb_factor_nr)) {
+                                       *grab_anonymous = FALSE;
+
+                                       VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
+                               }
                        }
-                       if (flow_control.state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
-                               flow_control.state = FCS_IDLE;
-                               goto consider_inactive;
+                       if (*grab_anonymous) {
+                               VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
                        }
+               }
+#endif /* CONFIG_JETSAM */
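+
+               /*
+                * Illustrative reading of the check above (the factor
+                * values are assumed here, not taken from this change):
+                * with vm_pageout_memorystatus_fb_factor_dr = 2 and
+                * vm_pageout_memorystatus_fb_factor_nr = 5, anonymous
+                * stealing is overridden in favor of file-backed pages
+                * whenever vm_page_pageable_external_count exceeds 2.5x
+                * memorystatus_available_pages_critical.
+                */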
 
-                       VM_CHECK_MEMORYSTATUS;
+want_anonymous:
+               if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
+                       if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
+                               m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
+
+                               assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
+                               *anons_grabbed = 0;
+
+                               if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
+                                       if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
+                                               if ((++reactivated_this_call % 100)) {
+                                                       vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
+
+                                                       vm_page_activate(m);
+                                                       VM_STAT_INCR(reactivations);
+#if CONFIG_BACKGROUND_QUEUE
+#if DEVELOPMENT || DEBUG
+                                                       if (*is_page_from_bg_q == TRUE) {
+                                                               if (m_object->internal) {
+                                                                       vm_pageout_rejected_bq_internal++;
+                                                               } else {
+                                                                       vm_pageout_rejected_bq_external++;
+                                                               }
+                                                       }
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_BACKGROUND_QUEUE */
+                                                       vm_pageout_state.vm_pageout_inactive_used++;
+
+                                                       m = NULL;
+                                                       retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
+
+                                                       break;
+                                               }
 
-                       if (flow_control.state != FCS_IDLE) {
-                               VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
+                                               /*
+                                                * steal 1% of the file backed pages even if
+                                                * we are under the limit that has been set
+                                                * for a healthy filecache
+                                                */
+                                       }
+                               }
+                               break;
                        }
+               }
+               if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
+                       m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
+
+                       assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
+                       *anons_grabbed += 1;
+
+                       break;
+               }
 
-                       iq->pgo_throttled = TRUE;
-                       assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
+               m = NULL;
+       }
 
-                       counter(c_vm_pageout_scan_block++);
+       *victim_page = m;
 
-                       vm_page_unlock_queues();
+       return retval;
+}
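+
+/*
+ * Summary of the selection order implemented above (informal, for
+ * reference): cleaned-queue pages are taken first, then aged speculative
+ * pages, then background-queue pages (when that queue is enabled and
+ * over target), then file-backed inactive pages, and finally anonymous
+ * pages, with grab_anonymous/anons_grabbed biasing the file-backed vs.
+ * anonymous choice.
+ */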
 
-                       assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
+/*
+ * This function is called only from vm_pageout_scan and
+ * it will put a page back on the active/inactive queue
+ * if we can't reclaim it for some reason.
+ */
+static void
+vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
+{
+       if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
+               vm_page_enqueue_inactive(m, FALSE);
+       } else {
+               vm_page_activate(m);
+       }
 
-                       VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
-                           iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
-                       memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
+#if CONFIG_BACKGROUND_QUEUE
+#if DEVELOPMENT || DEBUG
+       vm_object_t m_object = VM_PAGE_OBJECT(m);
 
-                       thread_block(THREAD_CONTINUE_NULL);
+       if (page_from_bg_q == TRUE) {
+               if (m_object->internal) {
+                       vm_pageout_rejected_bq_internal++;
+               } else {
+                       vm_pageout_rejected_bq_external++;
+               }
+       }
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_BACKGROUND_QUEUE */
+}
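+
+/*
+ * Note (inferred from the queue semantics above): a speculative page
+ * has, by definition, not been referenced yet, so it is returned to the
+ * inactive queue rather than promoted; any other page is reactivated.
+ */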
 
-                       VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
-                           iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
-                       memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
+/*
+ * This function is called only from vm_pageout_scan and
+ * it will try to grab the victim page's VM object (m_object)
+ * which differs from the previous victim page's object (object).
+ */
+static int
+vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
+{
+       struct vm_speculative_age_q *sq;
 
-                       vm_page_lock_queues();
+       sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
 
-                       iq->pgo_throttled = FALSE;
+       /*
+        * the object associated with the candidate page is
+        * different from the one we were just working
+        * with... dump the lock if we still own it
+        */
+       if (*object != NULL) {
+               vm_object_unlock(*object);
+               *object = NULL;
+       }
+       /*
+        * Try to lock the object; since we've already got the
+        * page queues lock, we can only 'try' for this one.
+        * if the 'try' fails, we need to do a mutex_pause
+        * to allow the owner of the object lock a chance to
+        * run... otherwise, we're likely to trip over this
+        * object in the same state as we work our way through
+        * the queue... clumps of pages associated with the same
+        * object are fairly typical on the inactive and active queues
+        */
+       if (!vm_object_lock_try_scan(m_object)) {
+               vm_page_t m_want = NULL;
+
+               vm_pageout_vminfo.vm_pageout_inactive_nolock++;
+
+               if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
+                       VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
+               }
+
+               pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
+
+               m->vmp_reference = FALSE;
+
+               if (!m_object->object_is_shared_cache) {
+                       /*
+                        * don't apply this optimization if this is the shared cache
+                        * object, it's too easy to get rid of very hot and important
+                        * pages...
+                        * m->vmp_object must be stable since we hold the page queues lock...
+                        * we can update the scan_collisions field sans the object lock
+                        * since it is a separate field and this is the only spot that does
+                        * a read-modify-write operation and it is never executed concurrently...
+                        * we can asynchronously set this field to 0 when creating a UPL, so it
+                        * is possible for the value to be a bit non-deterministic, but that's ok
+                        * since it's only used as a hint
+                        */
+                       m_object->scan_collisions = 1;
+               }
+               if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
+                       m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
+               } else if (!vm_page_queue_empty(&sq->age_q)) {
+                       m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
+               } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
+                   !vm_page_queue_empty(&vm_page_queue_inactive)) {
+                       m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
+               } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
+                       m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
+               }
+
+               /*
+                * this is the next object we're going to be interested in
+                * try to make sure it's available after the mutex_pause
+                * returns control
+                */
+               if (m_want) {
+                       vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
+               }
+
+               vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
+
+               return VM_PAGEOUT_SCAN_NEXT_ITERATION;
+       } else {
+               *object = m_object;
+               vm_pageout_scan_wants_object = VM_OBJECT_NULL;
+       }
+
+       return VM_PAGEOUT_SCAN_PROCEED;
+}
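+
+/*
+ * Sketch of the try-lock fallback implemented above (informal): when
+ * vm_object_lock_try_scan() fails, the page's reference hint is cleared,
+ * the next likely victim's object is published through
+ * vm_pageout_scan_wants_object, the page is requeued, and the caller is
+ * told to start a new iteration (after which, per the comments above, it
+ * can mutex_pause() to let the lock owner run).
+ */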
+
+/*
+ * This function is called only from vm_pageout_scan and
+ * it detects that pageout scan may be rendered ineffective
+ * due to an FS deadlock and will jetsam a process if possible.
+ * If jetsam isn't supported, it'll move the page to the active
+ * queue to try to push some different pages onward so we can
+ * get out of this scenario.
+ */
+static void
+vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
+    int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
+{
+       struct  vm_pageout_queue *eq;
+       vm_object_t cur_object = VM_OBJECT_NULL;
+
+       cur_object = *object;
+
+       eq = &vm_pageout_queue_external;
+
+       if (cur_object->internal == FALSE) {
+               /*
+                * we need to break up the following potential deadlock case...
+                *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
+                *  b) The thread doing the writing is waiting for pages while holding the truncate lock
+                *  c) Most of the pages in the inactive queue belong to this file.
+                *
+                * we are potentially in this deadlock because...
+                *  a) the external pageout queue is throttled
+                *  b) we're done with the active queue and moved on to the inactive queue
+                *  c) we've got a dirty external page
+                *
+                * since we don't know the reason for the external pageout queue being throttled we
+                * must suspect that we are deadlocked, so move the current page onto the active queue
+                * in an effort to cause a page from the active queue to 'age' to the inactive queue
+                *
+                * if we don't have jetsam configured (i.e. we have a dynamic pager), set
+                * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
+                * pool the next time we select a victim page... if we can make enough new free pages,
+                * the deadlock will break, the external pageout queue will empty and it will no longer
+                * be throttled
+                *
+                * if we have jetsam configured, keep a count of the pages reactivated this way so
+                * that we can try to find clean pages in the active/inactive queues before
+                * deciding to jetsam a process
+                */
+               vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
+
+               vm_page_check_pageable_safe(m);
+               assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
+               vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
+               m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
+               vm_page_active_count++;
+               vm_page_pageable_external_count++;
+
+               vm_pageout_adjust_eq_iothrottle(eq, FALSE);
+
+#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
+
+#pragma unused(force_anonymous)
+
+               *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
+
+               if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
+                       *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
+                       /*
+                        * Possible deadlock scenario so request jetsam action
+                        */
+
+                       assert(cur_object);
+                       vm_object_unlock(cur_object);
+
+                       cur_object = VM_OBJECT_NULL;
+
+                       /*
+                        * VM pageout scan needs to know we have dropped this lock, so set the
+                        * object variable we were passed to NULL.
+                        */
+                       *object = VM_OBJECT_NULL;
+
+                       vm_page_unlock_queues();
+
+                       VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
+                           vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
+
+                       /* Kill the first suitable process. If this call returns FALSE, we might have simply purged a process instead. */
+                       if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) {
+                               VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
+                       }
+
+                       VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
+                           vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
+
+                       vm_page_lock_queues();
+                       *delayed_unlock = 1;
+               }
+#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
+
+#pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
+#pragma unused(delayed_unlock)
+
+               *force_anonymous = TRUE;
+#endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
+       } else {
+               vm_page_activate(m);
+               VM_STAT_INCR(reactivations);
+
+#if CONFIG_BACKGROUND_QUEUE
+#if DEVELOPMENT || DEBUG
+               if (is_page_from_bg_q == TRUE) {
+                       if (cur_object->internal) {
+                               vm_pageout_rejected_bq_internal++;
+                       } else {
+                               vm_pageout_rejected_bq_external++;
+                       }
+               }
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_BACKGROUND_QUEUE */
+
+               vm_pageout_state.vm_pageout_inactive_used++;
+       }
+}
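
The routine above encodes the deadlock-breaking policy spelled out in its header
comment. As a rough, self-contained illustration (not part of this commit; every
name below is a hypothetical stand-in for the kernel state it mimics), the
decision reduces to:

    #include <stdbool.h>
    #include <stdio.h>

    #define HAVE_JETSAM 1                  /* stand-in for CONFIG_MEMORYSTATUS && CONFIG_JETSAM */

    static unsigned reactivate_budget = 2; /* stand-in for ..._forced_reactivate_limit */
    static bool     force_anonymous = false;

    /*
     * Called when the external pageout queue is throttled and we hold a dirty
     * external page: reactivate it so an active page can age out instead.
     */
    static void
    handle_throttled_external_page(unsigned active_count, unsigned inactive_count)
    {
            printf("reactivating dirty external page onto the active queue\n");
    #if HAVE_JETSAM
            if (--reactivate_budget == 0) {
                    /* budget exhausted: suspect a real deadlock, ask jetsam for memory */
                    reactivate_budget = active_count + inactive_count;
                    printf("requesting a jetsam kill\n");
            }
    #else
            /* no jetsam (dynamic pager): steal anonymous pages to build free memory */
            force_anonymous = true;
    #endif
    }

    int
    main(void)
    {
            (void)force_anonymous;
            for (int i = 0; i < 3; i++) {
                    handle_throttled_external_page(100, 50);
            }
            return 0;
    }
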
+
+
+void
+vm_page_balance_inactive(int max_to_move)
+{
+       vm_page_t m;
+
+       LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
+
+       if (hibernation_vmqueues_inspection == TRUE) {
+               /*
+                * It is likely that the hibernation code path is
+                * dealing with these very queues as we are about
+                * to move pages around in/from them and completely
+                * change the linkage of the pages.
+                *
+                * And so we skip the rebalancing of these queues.
+                */
+               return;
+       }
+       vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
+           vm_page_inactive_count +
+           vm_page_speculative_count);
+
+       while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
+               VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
+
+               m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
+
+               assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
+               assert(!m->vmp_laundry);
+               assert(VM_PAGE_OBJECT(m) != kernel_object);
+               assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
+
+               DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
+
+               /*
+                * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
+                *
+                * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
+                * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
+                * new reference happens. If no further references happen on the page after that remote TLB flushes,
+                * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
+                * by pageout_scan, which is just fine since the last reference would have happened quite far
+                * in the past (TLB caches don't hang around for very long), and of course could just as easily
+                * have happened before we moved the page
+                */
+               if (m->vmp_pmapped == TRUE) {
+                       pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
+               }
+
+               /*
+                * The page might be absent or busy,
+                * but vm_page_deactivate can handle that.
+                * FALSE indicates that we don't want the H/W reference bit cleared
+                */
+               vm_page_deactivate_internal(m, FALSE);
+       }
+}
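
vm_page_balance_inactive() trims the active queue toward a target derived from
the total pageable population. A minimal sketch of that loop, assuming a 1/2
ratio purely for illustration (the real fraction comes from the
VM_PAGE_INACTIVE_TARGET macro and is tuned per platform):

    #include <stdio.h>

    /* Toy stand-in for VM_PAGE_INACTIVE_TARGET; the 1/2 ratio is an assumption. */
    static unsigned
    inactive_target(unsigned active, unsigned inactive, unsigned speculative)
    {
            return (active + inactive + speculative) / 2;
    }

    static void
    balance_inactive(unsigned *active, unsigned *inactive, unsigned speculative,
        int max_to_move)
    {
            unsigned target = inactive_target(*active, *inactive, speculative);

            /* move the oldest active pages until the inactive side meets the
             * target, clearing only reference state and never flushing TLBs */
            while (max_to_move-- > 0 && *active > 0 &&
                *inactive + speculative < target) {
                    (*active)--;
                    (*inactive)++;
            }
    }

    int
    main(void)
    {
            unsigned active = 90, inactive = 10;

            balance_inactive(&active, &inactive, 0, 100);
            printf("active=%u inactive=%u\n", active, inactive); /* active=50 inactive=50 */
            return 0;
    }
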
+
+
+/*
+ *     vm_pageout_scan does the dirty work for the pageout daemon.
+ *     It returns with both vm_page_queue_free_lock and vm_page_queue_lock
+ *     held and vm_page_free_wanted == 0.
+ */
+void
+vm_pageout_scan(void)
+{
+       unsigned int loop_count = 0;
+       unsigned int inactive_burst_count = 0;
+       unsigned int reactivated_this_call;
+       unsigned int reactivate_limit;
+       vm_page_t   local_freeq = NULL;
+       int         local_freed = 0;
+       int         delayed_unlock;
+       int         delayed_unlock_limit = 0;
+       int         refmod_state = 0;
+       int     vm_pageout_deadlock_target = 0;
+       struct  vm_pageout_queue *iq;
+       struct  vm_pageout_queue *eq;
+       struct  vm_speculative_age_q *sq;
+       struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
+       boolean_t inactive_throttled = FALSE;
+       vm_object_t     object = NULL;
+       uint32_t        inactive_reclaim_run;
+       boolean_t       grab_anonymous = FALSE;
+       boolean_t       force_anonymous = FALSE;
+       boolean_t       force_speculative_aging = FALSE;
+       int             anons_grabbed = 0;
+       int             page_prev_q_state = 0;
+       boolean_t       page_from_bg_q = FALSE;
+       uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
+       vm_object_t     m_object = VM_OBJECT_NULL;
+       int             retval = 0;
+       boolean_t       lock_yield_check = FALSE;
+
+
+       VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
+           vm_pageout_vminfo.vm_pageout_freed_speculative,
+           vm_pageout_state.vm_pageout_inactive_clean,
+           vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
+           vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
+
+       flow_control.state = FCS_IDLE;
+       iq = &vm_pageout_queue_internal;
+       eq = &vm_pageout_queue_external;
+       sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
+
+       /* Ask the pmap layer to return any pages it no longer needs. */
+       uint64_t pmap_wired_pages_freed = pmap_release_pages_fast();
+
+       vm_page_lock_queues();
+
+       vm_page_wire_count -= pmap_wired_pages_freed;
+
+       delayed_unlock = 1;
+
+       /*
+        *      Calculate the max number of referenced pages on the inactive
+        *      queue that we will reactivate.
+        */
+       reactivated_this_call = 0;
+       reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
+           vm_page_inactive_count);
+       inactive_reclaim_run = 0;
+
+       vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
+
+       /*
+        *      We must limit the rate at which we send pages to the pagers
+        *      so that we don't tie up too many pages in the I/O queues.
+        *      We implement a throttling mechanism using the laundry count
+        *      to limit the number of pages outstanding to the default
+        *      and external pagers.  We can bypass the throttles and look
+        *      for clean pages if the pageout queues don't drain in a timely
+        *      fashion since this may indicate that the pageout paths are
+        *      stalled waiting for memory, which only we can provide.
+        */
+
+       vps_init_page_targets();
+       assert(object == NULL);
+       assert(delayed_unlock != 0);
+
+       for (;;) {
+               vm_page_t m;
+
+               DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
+
+               if (lock_yield_check) {
+                       lock_yield_check = FALSE;
+
+                       if (delayed_unlock++ > delayed_unlock_limit) {
+                               int freed = local_freed;
+
+                               vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
+                                   VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
+                               if (freed == 0) {
+                                       lck_mtx_yield(&vm_page_queue_lock);
+                               }
+                       } else if (vm_pageout_scan_wants_object) {
+                               vm_page_unlock_queues();
+                               mutex_pause(0);
+                               vm_page_lock_queues();
+                       }
+               }
+
+               if (vm_upl_wait_for_pages < 0) {
+                       vm_upl_wait_for_pages = 0;
+               }
+
+               delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
+
+               if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
+                       delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
+               }
+
+               vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
+
+               assert(delayed_unlock);
+
+               /*
+                * maintain our balance
+                */
+               vm_page_balance_inactive(1);
+
+
+               /**********************************************************************
+                * above this point we're playing with the active and secluded queues
+                * below this point we're playing with the throttling mechanisms
+                * and the inactive queue
+                **********************************************************************/
+
+               if (vm_page_free_count + local_freed >= vm_page_free_target) {
+                       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
+
+                       vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
+                           VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
+                       /*
+                        * make sure the pageout I/O threads are running
+                        * throttled in case there are still requests
+                        * in the laundry... since we have met our targets
+                        * we don't need the laundry to be cleaned in a timely
+                        * fashion... so let's avoid interfering with foreground
+                        * activity
+                        */
+                       vm_pageout_adjust_eq_iothrottle(eq, TRUE);
+
+                       lck_mtx_lock(&vm_page_queue_free_lock);
+
+                       if ((vm_page_free_count >= vm_page_free_target) &&
+                           (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
+                               /*
+                                * done - we have met our target *and*
+                                * there is no one waiting for a page.
+                                */
+return_from_scan:
+                               assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
+
+                               VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
+                                   vm_pageout_state.vm_pageout_inactive,
+                                   vm_pageout_state.vm_pageout_inactive_used, 0, 0);
+                               VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
+                                   vm_pageout_vminfo.vm_pageout_freed_speculative,
+                                   vm_pageout_state.vm_pageout_inactive_clean,
+                                   vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
+                                   vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
+
+                               return;
+                       }
+                       lck_mtx_unlock(&vm_page_queue_free_lock);
+               }
+
+               /*
+                * Before anything, we check if we have any ripe volatile
+                * objects around. If so, try to purge the first object.
+                * If the purge fails, fall through to reclaim a page instead.
+                * If the purge succeeds, go back to the top and reevaluate
+                * the new memory situation.
+                */
+               retval = vps_purge_object();
+
+               if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
+                       /*
+                        * Success
+                        */
+                       if (object != NULL) {
+                               vm_object_unlock(object);
+                               object = NULL;
+                       }
+
+                       lock_yield_check = FALSE;
+                       continue;
+               }
+
+               /*
+                * If our 'aged' queue is empty and we have some speculative pages
+                * in the other queues, let's go through and see if we need to age
+                * them.
+                *
+                * If we succeed in aging a speculative queue, or everything
+                * looks normal w.r.t. queue age and queue counts, we keep going.
+                *
+                * If, for some reason, we seem to have a mismatch between the spec.
+                * page count and the page queues, we reset those variables and
+                * restart the loop (LD TODO: Track this better?).
+                */
+               if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
+                       retval = vps_age_speculative_queue(force_speculative_aging);
+
+                       if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
+                               lock_yield_check = FALSE;
+                               continue;
+                       }
+               }
+               force_speculative_aging = FALSE;
+
+               /*
+                * Check to see if we need to evict objects from the cache.
+                *
+                * Note: 'object' here doesn't have anything to do with
+                * the eviction part. We just need to make sure we have dropped
+                * any object lock we might be holding if we need to go down
+                * into the eviction logic.
+                */
+               retval = vps_object_cache_evict(&object);
+
+               if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
+                       lock_yield_check = FALSE;
+                       continue;
+               }
+
+
+               /*
+                * Calculate our filecache_min that will affect the loop
+                * going forward.
+                */
+               vps_calculate_filecache_min();
+
+               /*
+                * LD TODO: Use a structure to hold all state variables for a single
+                * vm_pageout_scan iteration and pass that structure to this function instead.
+                */
+               retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
+                   &delayed_unlock, &local_freeq, &local_freed,
+                   &vm_pageout_deadlock_target, inactive_burst_count);
 
+               if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
                        if (loop_count >= vm_page_inactive_count) {
                                loop_count = 0;
                        }
+
                        inactive_burst_count = 0;
 
-                       goto Restart;
-                       /*NOTREACHED*/
-               }
+                       assert(object == NULL);
+                       assert(delayed_unlock != 0);
 
+                       lock_yield_check = FALSE;
+                       continue;
+               } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
+                       goto return_from_scan;
+               }
 
                flow_control.state = FCS_IDLE;
-consider_inactive:
+
                vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
                    vm_pageout_inactive_external_forced_reactivate_limit);
                loop_count++;
@@ -2438,157 +3094,22 @@ consider_inactive:
                /*
                 * Choose a victim.
                 */
-               while (1) {
-#if CONFIG_BACKGROUND_QUEUE
-                       page_from_bg_q = FALSE;
-#endif /* CONFIG_BACKGROUND_QUEUE */
-
-                       m = NULL;
-                       m_object = VM_OBJECT_NULL;
-
-                       if (VM_DYNAMIC_PAGING_ENABLED()) {
-                               assert(vm_page_throttled_count == 0);
-                               assert(vm_page_queue_empty(&vm_page_queue_throttled));
-                       }
-
-                       /*
-                        * Try for a clean-queue inactive page.
-                        * These are pages that vm_pageout_scan tried to steal earlier, but
-                        * were dirty and had to be cleaned.  Pick them up now that they are clean.
-                        */
-                       if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
-                               m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
-
-                               assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
-
-                               break;
-                       }
-
-                       /*
-                        * The next most eligible pages are ones we paged in speculatively,
-                        * but which have not yet been touched and have been aged out.
-                        */
-                       if (!vm_page_queue_empty(&sq->age_q)) {
-                               m = (vm_page_t) vm_page_queue_first(&sq->age_q);
-
-                               assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
-
-                               if (!m->vmp_dirty || force_anonymous == FALSE) {
-                                       break;
-                               } else {
-                                       m = NULL;
-                               }
-                       }
-
-#if CONFIG_BACKGROUND_QUEUE
-                       if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
-                               vm_object_t     bg_m_object = NULL;
-
-                               m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
-
-                               bg_m_object = VM_PAGE_OBJECT(m);
 
-                               if (!VM_PAGE_PAGEABLE(m)) {
-                                       /*
-                                        * This page is on the background queue
-                                        * but not on a pageable queue.  This is
-                                        * likely a transient state and whoever
-                                        * took it out of its pageable queue
-                                        * will likely put it back on a pageable
-                                        * queue soon but we can't deal with it
-                                        * at this point, so let's ignore this
-                                        * page.
-                                        */
-                               } else if (force_anonymous == FALSE || bg_m_object->internal) {
-                                       if (bg_m_object->internal &&
-                                           (VM_PAGE_Q_THROTTLED(iq) ||
-                                           vm_compressor_out_of_space() == TRUE ||
-                                           vm_page_free_count < (vm_page_free_reserved / 4))) {
-                                               vm_pageout_skipped_bq_internal++;
-                                       } else {
-                                               page_from_bg_q = TRUE;
-
-                                               if (bg_m_object->internal) {
-                                                       vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
-                                               } else {
-                                                       vm_pageout_vminfo.vm_pageout_considered_bq_external++;
-                                               }
-                                               break;
-                                       }
-                               }
-                       }
-#endif
-                       inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
-
-                       if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
-                           (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
-                               grab_anonymous = TRUE;
-                               anons_grabbed = 0;
+               m = NULL;
+               retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, reactivated_this_call);
 
-                               vm_pageout_vminfo.vm_pageout_skipped_external++;
-                               goto want_anonymous;
-                       }
-                       grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
+               if (m == NULL) {
+                       if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
+                               reactivated_this_call++;
 
-#if CONFIG_JETSAM
-                       /* If the file-backed pool has accumulated
-                        * significantly more pages than the jetsam
-                        * threshold, prefer to reclaim those
-                        * inline to minimise compute overhead of reclaiming
-                        * anonymous pages.
-                        * This calculation does not account for the CPU local
-                        * external page queues, as those are expected to be
-                        * much smaller relative to the global pools.
-                        */
-                       if (grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
-                               if (vm_page_pageable_external_count >
-                                   vm_pageout_state.vm_page_filecache_min) {
-                                       if ((vm_page_pageable_external_count *
-                                           vm_pageout_memorystatus_fb_factor_dr) >
-                                           (memorystatus_available_pages_critical *
-                                           vm_pageout_memorystatus_fb_factor_nr)) {
-                                               grab_anonymous = FALSE;
-
-                                               VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
-                                       }
-                               }
-                               if (grab_anonymous) {
-                                       VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
-                               }
-                       }
-#endif /* CONFIG_JETSAM */
+                               inactive_burst_count = 0;
 
-want_anonymous:
-                       if (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
-                               if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
-                                       m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
-
-                                       assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
-                                       anons_grabbed = 0;
-
-                                       if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
-                                               if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
-                                                       if ((++reactivated_this_call % 100)) {
-                                                               vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
-                                                               goto must_activate_page;
-                                                       }
-                                                       /*
-                                                        * steal 1% of the file backed pages even if
-                                                        * we are under the limit that has been set
-                                                        * for a healthy filecache
-                                                        */
-                                               }
-                                       }
-                                       break;
+                               if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
+                                       VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
                                }
-                       }
-                       if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
-                               m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
 
-                               assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
-                               anons_grabbed++;
-
-                               break;
+                               lock_yield_check = TRUE;
+                               continue;
                        }
 
                        /*
@@ -2603,17 +3124,20 @@ want_anonymous:
                        VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
 
                        if (!vm_page_queue_empty(&sq->age_q)) {
-                               goto done_with_inactivepage;
+                               lock_yield_check = TRUE;
+                               continue;
                        }
 
                        if (vm_page_speculative_count) {
                                force_speculative_aging = TRUE;
-                               goto done_with_inactivepage;
+                               lock_yield_check = TRUE;
+                               continue;
                        }
                        panic("vm_pageout: no victim");
 
                        /* NOTREACHED */
                }
+
                assert(VM_PAGE_PAGEABLE(m));
                m_object = VM_PAGE_OBJECT(m);
                force_anonymous = FALSE;
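
vps_choose_victim_page() replaces the inline while (1) above, but both walk the
queues in the same fixed priority order. A condensed, editorial approximation of
that order (names hypothetical; refinements such as the CONFIG_JETSAM filecache
override and the anonymous-pool watermarks are omitted):

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum {
            Q_CLEANED, Q_SPECULATIVE_AGED, Q_BACKGROUND, Q_FILE_BACKED, Q_ANONYMOUS
    } victim_queue_t;

    struct scan_state {                   /* hypothetical condensed scan state */
            bool cleaned_nonempty;
            bool spec_aged_available;     /* aged page, not (dirty && force_anonymous) */
            bool background_over_target;
            bool filecache_below_min;     /* external count below vm_page_filecache_min */
            bool force_anonymous;
            int  anons_grabbed, anons_grabbed_limit;
    };

    static victim_queue_t
    choose_victim_queue(const struct scan_state *s)
    {
            if (s->cleaned_nonempty) {
                    return Q_CLEANED;          /* already laundered: cheapest to reclaim */
            }
            if (s->spec_aged_available) {
                    return Q_SPECULATIVE_AGED; /* prefetched but never referenced */
            }
            if (s->background_over_target) {
                    return Q_BACKGROUND;
            }
            bool grab_anonymous = s->filecache_below_min || s->force_anonymous;
            if (!grab_anonymous || s->anons_grabbed >= s->anons_grabbed_limit) {
                    return Q_FILE_BACKED;      /* protect a minimum healthy filecache */
            }
            return Q_ANONYMOUS;
    }

    int
    main(void)
    {
            struct scan_state s = { .filecache_below_min = true, .anons_grabbed_limit = 64 };
            printf("victim queue = %d\n", choose_victim_queue(&s)); /* Q_ANONYMOUS */
            return 0;
    }
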
@@ -2642,78 +3166,19 @@ want_anonymous:
                 * already got the lock
                 */
                if (m_object != object) {
+                       boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
+
                        /*
-                        * the object associated with candidate page is
-                        * different from the one we were just working
-                        * with... dump the lock if we still own it
-                        */
-                       if (object != NULL) {
-                               vm_object_unlock(object);
-                               object = NULL;
-                       }
-                       /*
-                        * Try to lock object; since we've already got the
-                        * page queues lock, we can only 'try' for this one.
-                        * if the 'try' fails, we need to do a mutex_pause
-                        * to allow the owner of the object lock a chance to
-                        * run... otherwise, we're likely to trip over this
-                        * object in the same state as we work our way through
-                        * the queue... clumps of pages associated with the same
-                        * object are fairly typical on the inactive and active queues
+                        * vps_switch_object() will always drop the 'object' lock first
+                        * and then try to acquire the 'm_object' lock. So 'object' has to point to
+                        * either 'm_object' or NULL.
                         */
-                       if (!vm_object_lock_try_scan(m_object)) {
-                               vm_page_t m_want = NULL;
-
-                               vm_pageout_vminfo.vm_pageout_inactive_nolock++;
-
-                               if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
-                                       VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
-                               }
-
-                               pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
-
-                               m->vmp_reference = FALSE;
-
-                               if (!m_object->object_is_shared_cache) {
-                                       /*
-                                        * don't apply this optimization if this is the shared cache
-                                        * object, it's too easy to get rid of very hot and important
-                                        * pages...
-                                        * m->vmp_object must be stable since we hold the page queues lock...
-                                        * we can update the scan_collisions field sans the object lock
-                                        * since it is a separate field and this is the only spot that does
-                                        * a read-modify-write operation and it is never executed concurrently...
-                                        * we can asynchronously set this field to 0 when creating a UPL, so it
-                                        * is possible for the value to be a bit non-deterministic, but that's ok
-                                        * since it's only used as a hint
-                                        */
-                                       m_object->scan_collisions = 1;
-                               }
-                               if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
-                                       m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
-                               } else if (!vm_page_queue_empty(&sq->age_q)) {
-                                       m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
-                               } else if ((grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT ||
-                                   vm_page_queue_empty(&vm_page_queue_anonymous)) &&
-                                   !vm_page_queue_empty(&vm_page_queue_inactive)) {
-                                       m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
-                               } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
-                                       m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
-                               }
+                       retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
 
-                               /*
-                                * this is the next object we're going to be interested in
-                                * try to make sure its available after the mutex_pause
-                                * returns control
-                                */
-                               if (m_want) {
-                                       vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
-                               }
-
-                               goto requeue_page;
+                       if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
+                               lock_yield_check = TRUE;
+                               continue;
                        }
-                       object = m_object;
-                       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
                }
                assert(m_object == object);
                assert(VM_PAGE_OBJECT(m) == m_object);
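
The removed block documents the lock discipline that vps_switch_object() now
owns: with the page-queues lock held, the scan may only try-lock a candidate
object, and on failure it records the object it wants and pauses so the lock
owner can run. The same try-lock-then-back-off shape, sketched with pthreads
standing in for the VM object lock (all names hypothetical):

    #include <pthread.h>
    #include <stdbool.h>
    #include <time.h>

    static pthread_mutex_t object_lock = PTHREAD_MUTEX_INITIALIZER;
    static void *wanted_object;       /* stand-in for vm_pageout_scan_wants_object */

    /*
     * Try to take the object lock without blocking; on contention, advertise
     * interest and yield briefly (the analog of mutex_pause()).
     */
    static bool
    try_lock_object_or_back_off(void *obj)
    {
            if (pthread_mutex_trylock(&object_lock) == 0) {
                    wanted_object = NULL;
                    return true;      /* caller now owns the lock */
            }
            wanted_object = obj;      /* hint to the owner: release it to us soon */
            struct timespec pause = { .tv_sec = 0, .tv_nsec = 100000 };  /* 100us */
            nanosleep(&pause, NULL);
            return false;             /* caller requeues the page and retries */
    }

    int
    main(void)
    {
            int obj;
            if (try_lock_object_or_back_off(&obj)) {
                    pthread_mutex_unlock(&object_lock);
            }
            return 0;
    }
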
@@ -2729,24 +3194,11 @@ want_anonymous:
                        if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
                                VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
                        }
-requeue_page:
-                       if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
-                               vm_page_enqueue_inactive(m, FALSE);
-                       } else {
-                               vm_page_activate(m);
-                       }
-#if CONFIG_BACKGROUND_QUEUE
-#if DEVELOPMENT || DEBUG
-                       if (page_from_bg_q == TRUE) {
-                               if (m_object->internal) {
-                                       vm_pageout_rejected_bq_internal++;
-                               } else {
-                                       vm_pageout_rejected_bq_external++;
-                               }
-                       }
-#endif
-#endif
-                       goto done_with_inactivepage;
+
+                       vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
+
+                       lock_yield_check = TRUE;
+                       continue;
                }
 
                /*
@@ -2770,7 +3222,8 @@ requeue_page:
                 *      just leave it off the paging queues
                 */
                if (m->vmp_free_when_done || m->vmp_cleaning) {
-                       goto done_with_inactivepage;
+                       lock_yield_check = TRUE;
+                       continue;
                }
 
 
@@ -2839,7 +3292,9 @@ reclaim_page:
                        }
 
                        inactive_burst_count = 0;
-                       goto done_with_inactivepage;
+
+                       lock_yield_check = TRUE;
+                       continue;
                }
                if (object->copy == VM_OBJECT_NULL) {
                        /*
@@ -2915,18 +3370,15 @@ reclaim_page:
                        /* deal with a rogue "reusable" page */
                        VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
                }
-               divisor = vm_pageout_state.vm_page_xpmapped_min_divisor;
 
-               if (divisor == 0) {
+               if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
                        vm_pageout_state.vm_page_xpmapped_min = 0;
                } else {
-                       vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / divisor;
+                       vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
                }
 
                if (!m->vmp_no_cache &&
-#if CONFIG_BACKGROUND_QUEUE
                    page_from_bg_q == FALSE &&
-#endif
                    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
                    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
                        /*
@@ -2959,7 +3411,6 @@ reactivate_page:
                                        vm_page_deactivate(m);
                                        VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
                                } else {
-must_activate_page:
                                        /*
                                         * The page was/is being used, so put back on active list.
                                         */
@@ -2976,14 +3427,16 @@ must_activate_page:
                                                vm_pageout_rejected_bq_external++;
                                        }
                                }
-#endif
-#endif
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_BACKGROUND_QUEUE */
+
                                if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
                                        VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
                                }
                                vm_pageout_state.vm_pageout_inactive_used++;
 
-                               goto done_with_inactivepage;
+                               lock_yield_check = TRUE;
+                               continue;
                        }
                        /*
                         * Make sure we call pmap_get_refmod() if it
@@ -2998,10 +3451,6 @@ must_activate_page:
                        }
                }
 
-               XPR(XPR_VM_PAGEOUT,
-                   "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
-                   object, m->vmp_offset, m, 0, 0);
-
                /*
                 * we've got a candidate page to steal...
                 *
@@ -3045,81 +3494,22 @@ throttle_inactive:
                        VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
 
                        inactive_burst_count = 0;
-                       goto done_with_inactivepage;
+
+                       lock_yield_check = TRUE;
+                       continue;
                }
                if (inactive_throttled == TRUE) {
-                       if (object->internal == FALSE) {
-                               /*
-                                * we need to break up the following potential deadlock case...
-                                *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
-                                *  b) The thread doing the writing is waiting for pages while holding the truncate lock
-                                *  c) Most of the pages in the inactive queue belong to this file.
-                                *
-                                * we are potentially in this deadlock because...
-                                *  a) the external pageout queue is throttled
-                                *  b) we're done with the active queue and moved on to the inactive queue
-                                *  c) we've got a dirty external page
-                                *
-                                * since we don't know the reason for the external pageout queue being throttled we
-                                * must suspect that we are deadlocked, so move the current page onto the active queue
-                                * in an effort to cause a page from the active queue to 'age' to the inactive queue
-                                *
-                                * if we don't have jetsam configured (i.e. we have a dynamic pager), set
-                                * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
-                                * pool the next time we select a victim page... if we can make enough new free pages,
-                                * the deadlock will break, the external pageout queue will empty and it will no longer
-                                * be throttled
-                                *
-                                * if we have jetsam configured, keep a count of the pages reactivated this way so
-                                * that we can try to find clean pages in the active/inactive queues before
-                                * deciding to jetsam a process
-                                */
-                               vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
-
-                               vm_page_check_pageable_safe(m);
-                               assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
-                               vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
-                               m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
-                               vm_page_active_count++;
-                               vm_page_pageable_external_count++;
-
-                               vm_pageout_adjust_eq_iothrottle(eq, FALSE);
-
-#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
-                               vm_pageout_inactive_external_forced_reactivate_limit--;
-
-                               if (vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
-                                       vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
-                                       /*
-                                        * Possible deadlock scenario so request jetsam action
-                                        */
-                                       assert(object);
-                                       vm_object_unlock(object);
-                                       object = VM_OBJECT_NULL;
-                                       vm_page_unlock_queues();
-
-                                       VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
-                                           vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
-
-                                       /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
-                                       if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) {
-                                               VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
-                                       }
+                       vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
+                           &delayed_unlock, &force_anonymous, page_from_bg_q);
 
-                                       VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
-                                           vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
+                       inactive_burst_count = 0;
 
-                                       vm_page_lock_queues();
-                                       delayed_unlock = 1;
-                               }
-#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
-                               force_anonymous = TRUE;
-#endif
-                               inactive_burst_count = 0;
-                               goto done_with_inactivepage;
-                       } else {
-                               goto must_activate_page;
+                       if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
+                               VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
                        }
+
+                       lock_yield_check = TRUE;
+                       continue;
                }
 
                /*
@@ -3261,21 +3651,6 @@ throttle_inactive:
                vm_pageout_cluster(m);
                inactive_burst_count = 0;
 
-done_with_inactivepage:
-
-               if (delayed_unlock++ > delayed_unlock_limit) {
-                       int freed = local_freed;
-
-                       vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
-                           VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
-                       if (freed == 0) {
-                               lck_mtx_yield(&vm_page_queue_lock);
-                       }
-               } else if (vm_pageout_scan_wants_object) {
-                       vm_page_unlock_queues();
-                       mutex_pause(0);
-                       vm_page_lock_queues();
-               }
                /*
                 * back to top of pageout scan loop
                 */
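
A recurring shape in this refactor: code that previously jumped to labels such
as done_with_inactivepage now calls a vps_* helper and dispatches on a returned
action code. Reduced to a toy (the NEXT_ITERATION and DONE_RETURN codes appear
in the diff; the PROCEED default and the helper here are assumptions):

    #include <stdio.h>

    enum {
            VM_PAGEOUT_SCAN_PROCEED,
            VM_PAGEOUT_SCAN_NEXT_ITERATION,
            VM_PAGEOUT_SCAN_DONE_RETURN,
    };

    static int
    vps_step(int i)                   /* hypothetical helper */
    {
            if (i == 2) {
                    return VM_PAGEOUT_SCAN_NEXT_ITERATION;
            }
            if (i == 5) {
                    return VM_PAGEOUT_SCAN_DONE_RETURN;
            }
            return VM_PAGEOUT_SCAN_PROCEED;
    }

    int
    main(void)
    {
            for (int i = 0;; i++) {
                    int retval = vps_step(i);

                    if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
                            continue; /* replaces a goto back to the loop top */
                    }
                    if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
                            break;    /* replaces goto return_from_scan */
                    }
                    printf("processing page %d\n", i);
            }
            return 0;
    }
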
@@ -3335,11 +3710,9 @@ vm_pageout_continue(void)
        DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
        VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
 
-#if !CONFIG_EMBEDDED
        lck_mtx_lock(&vm_page_queue_free_lock);
        vm_pageout_running = TRUE;
        lck_mtx_unlock(&vm_page_queue_free_lock);
-#endif /* CONFIG_EMBEDDED */
 
        vm_pageout_scan();
        /*
@@ -3350,8 +3723,8 @@ vm_pageout_continue(void)
        assert(vm_page_free_wanted_privileged == 0);
        assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
 
-#if !CONFIG_EMBEDDED
        vm_pageout_running = FALSE;
+#if !CONFIG_EMBEDDED
        if (vm_pageout_waiter) {
                vm_pageout_waiter = FALSE;
                thread_wakeup((event_t)&vm_pageout_waiter);
@@ -3944,6 +4317,7 @@ vm_pageout_iothread_internal(struct cq *cq)
        }
 
 
+
        thread_set_thread_name(current_thread(), "VM_compressor");
 #if DEVELOPMENT || DEBUG
        vmct_stats.vmct_minpages[cq->id] = INT32_MAX;
@@ -4063,53 +4437,67 @@ vm_pressure_response(void)
 }
 #endif /* VM_PRESSURE_EVENTS */
 
+/*
+ * Function called by a kernel thread to either get the current pressure level or
+ * wait until memory pressure changes from a given level.
+ */
 kern_return_t
 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
 {
-#if CONFIG_EMBEDDED
-
-       return KERN_FAILURE;
-
-#elif !VM_PRESSURE_EVENTS
+#if !VM_PRESSURE_EVENTS
 
        return KERN_FAILURE;
 
 #else /* VM_PRESSURE_EVENTS */
 
-       kern_return_t   kr = KERN_SUCCESS;
+       wait_result_t       wr = 0;
+       vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
 
-       if (pressure_level != NULL) {
-               vm_pressure_level_t     old_level = memorystatus_vm_pressure_level;
+       if (pressure_level == NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
 
-               if (wait_for_pressure == TRUE) {
-                       wait_result_t           wr = 0;
+       if (*pressure_level == kVMPressureJetsam) {
+               if (!wait_for_pressure) {
+                       return KERN_INVALID_ARGUMENT;
+               }
 
-                       while (old_level == *pressure_level) {
-                               wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
-                                   THREAD_INTERRUPTIBLE);
-                               if (wr == THREAD_WAITING) {
-                                       wr = thread_block(THREAD_CONTINUE_NULL);
-                               }
-                               if (wr == THREAD_INTERRUPTED) {
-                                       return KERN_ABORTED;
-                               }
-                               if (wr == THREAD_AWAKENED) {
-                                       old_level = memorystatus_vm_pressure_level;
+               lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
+               wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
+                   THREAD_INTERRUPTIBLE);
+               if (wr == THREAD_WAITING) {
+                       ++memorystatus_jetsam_fg_band_waiters;
+                       lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
+                       wr = thread_block(THREAD_CONTINUE_NULL);
+               } else {
+                       lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
+               }
+               if (wr != THREAD_AWAKENED) {
+                       return KERN_ABORTED;
+               }
+               *pressure_level = kVMPressureJetsam;
+               return KERN_SUCCESS;
+       }
 
-                                       if (old_level != *pressure_level) {
-                                               break;
-                                       }
-                               }
+       if (wait_for_pressure == TRUE) {
+               while (old_level == *pressure_level) {
+                       wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
+                           THREAD_INTERRUPTIBLE);
+                       if (wr == THREAD_WAITING) {
+                               wr = thread_block(THREAD_CONTINUE_NULL);
+                       }
+                       if (wr == THREAD_INTERRUPTED) {
+                               return KERN_ABORTED;
                        }
-               }
 
-               *pressure_level = old_level;
-               kr = KERN_SUCCESS;
-       } else {
-               kr = KERN_INVALID_ARGUMENT;
+                       if (wr == THREAD_AWAKENED) {
+                               old_level = memorystatus_vm_pressure_level;
+                       }
+               }
        }
 
-       return kr;
+       *pressure_level = old_level;
+       return KERN_SUCCESS;
 #endif /* VM_PRESSURE_EVENTS */
 }
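
Restated, the reworked contract: the caller passes in the last level it
observed; with wait_for_pressure the call blocks until the level differs (or,
for the kVMPressureJetsam sentinel, until a foreground-band jetsam wakes the
waiter) and writes the new level back. A hedged caller-side sketch under that
contract; it presumes xnu kernel headers and is not standalone:

    /*
     * Sketch of a pressure-level watcher; error handling condensed.
     */
    static void
    watch_pressure_levels(void)
    {
            unsigned int level = 0;   /* last level observed */

            /* wait_for_pressure == FALSE just samples the current level */
            if (mach_vm_pressure_level_monitor(FALSE, &level) != KERN_SUCCESS) {
                    return;
            }
            for (;;) {
                    /* blocks until the level differs from 'level'; KERN_ABORTED
                     * means the wait was interrupted and can simply be retried */
                    kern_return_t kr = mach_vm_pressure_level_monitor(TRUE, &level);

                    if (kr == KERN_ABORTED) {
                            continue;
                    }
                    if (kr != KERN_SUCCESS) {
                            break;
                    }
                    /* 'level' now holds the new memorystatus_vm_pressure_level */
            }
    }
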
 
@@ -4238,34 +4626,41 @@ extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
 void
 vm_set_restrictions()
 {
-       host_basic_info_data_t hinfo;
-       mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
+       int vm_restricted_to_single_processor = 0;
+
+       if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
+               kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
+               vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
+       } else {
+               host_basic_info_data_t hinfo;
+               mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
 
 #define BSD_HOST 1
-       host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
+               host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
 
-       assert(hinfo.max_cpus > 0);
+               assert(hinfo.max_cpus > 0);
 
-       if (hinfo.max_cpus <= 3) {
-               /*
-                * on systems with a limited number of CPUS, bind the
-                * 4 major threads that can free memory and that tend to use
-                * a fair bit of CPU under pressured conditions to a single processor.
-                * This ensures that these threads don't hog all of the available CPUs
-                * (important for camera launch), while allowing them to run independently
-                * w/r to locks... the 4 threads are
-                * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
-                * vm_compressor_swap_trigger_thread (minor and major compactions),
-                * memorystatus_thread (jetsams).
-                *
-                * the first time the thread is run, it is responsible for checking the
-                * state of vm_restricted_to_single_processor, and if TRUE it calls
-                * thread_bind_master...  someday this should be replaced with a group
-                * scheduling mechanism and KPI.
-                */
-               vm_pageout_state.vm_restricted_to_single_processor = TRUE;
-       } else {
-               vm_pageout_state.vm_restricted_to_single_processor = FALSE;
+               if (hinfo.max_cpus <= 3) {
+                       /*
+                        * on systems with a limited number of CPUS, bind the
+                        * 4 major threads that can free memory and that tend to use
+                        * a fair bit of CPU under pressured conditions to a single processor.
+                        * This ensures that these threads don't hog all of the available CPUs
+                        * (important for camera launch), while allowing them to run independently
+                        * w/r to locks... the 4 threads are
+                        * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
+                        * vm_compressor_swap_trigger_thread (minor and major compactions),
+                        * memorystatus_thread (jetsams).
+                        *
+                        * the first time the thread is run, it is responsible for checking the
+                        * state of vm_restricted_to_single_processor, and if TRUE it calls
+                        * thread_bind_master...  someday this should be replaced with a group
+                        * scheduling mechanism and KPI.
+                        */
+                       vm_pageout_state.vm_restricted_to_single_processor = TRUE;
+               } else {
+                       vm_pageout_state.vm_restricted_to_single_processor = FALSE;
+               }
        }
 }
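
vm_set_restrictions() now follows a common xnu pattern: an explicit boot-arg
wins, otherwise the value is derived from the hardware. The same shape as a
tiny userspace analog, with getenv standing in for PE_parse_boot_argn (all
names hypothetical):

    #include <stdio.h>
    #include <stdlib.h>

    static int
    detect_cpu_count(void)            /* stand-in for host_info(HOST_BASIC_INFO) */
    {
            return 8;
    }

    int
    main(void)
    {
            int restricted;
            const char *override = getenv("VM_RESTRICTED");  /* stand-in boot-arg */

            if (override != NULL) {
                    restricted = atoi(override) ? 1 : 0;     /* explicit override wins */
                    printf("overriding restricted to %d\n", restricted);
            } else {
                    restricted = (detect_cpu_count() <= 3);  /* derive from hardware */
            }
            printf("restricted=%d\n", restricted);
            return 0;
    }
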
 
@@ -4282,19 +4677,53 @@ vm_pageout(void)
         */
        s = splsched();
 
+       vm_pageout_scan_thread = self;
+
+#if CONFIG_VPS_DYNAMIC_PRIO
+
+       int             vps_dynprio_bootarg = 0;
+
+       if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
+               vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
+               kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
+       } else {
+               if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
+                       vps_dynamic_priority_enabled = TRUE;
+               } else {
+                       vps_dynamic_priority_enabled = FALSE;
+               }
+       }
+
+       if (vps_dynamic_priority_enabled) {
+               sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
+               thread_set_eager_preempt(self);
+       } else {
+               sched_set_kernel_thread_priority(self, BASEPRI_VM);
+       }
+
+#else /* CONFIG_VPS_DYNAMIC_PRIO */
+
+       vps_dynamic_priority_enabled = FALSE;
+       sched_set_kernel_thread_priority(self, BASEPRI_VM);
+
+#endif /* CONFIG_VPS_DYNAMIC_PRIO */
+
        thread_lock(self);
        self->options |= TH_OPT_VMPRIV;
-       sched_set_thread_base_priority(self, BASEPRI_VM);
        thread_unlock(self);
 
        if (!self->reserved_stack) {
                self->reserved_stack = self->kernel_stack;
        }
 
-       if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
+       if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
+           vps_dynamic_priority_enabled == FALSE) {
                thread_vm_bind_group_add();
        }
 
+
+
+
        splx(s);
 
        thread_set_thread_name(current_thread(), "VM_pageout_scan");
@@ -4412,7 +4841,7 @@ vm_pageout(void)
        if (result != KERN_SUCCESS) {
                panic("vm_pageout_iothread_external: create failed");
        }
-
+       thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread");
        thread_deallocate(vm_pageout_state.vm_pageout_external_iothread);
 
        result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
@@ -4421,7 +4850,7 @@ vm_pageout(void)
        if (result != KERN_SUCCESS) {
                panic("vm_pageout_garbage_collect: create failed");
        }
-
+       thread_set_thread_name(thread, "VM_pageout_garbage_collect");
        thread_deallocate(thread);
 
 #if VM_PRESSURE_EVENTS
@@ -5267,7 +5696,7 @@ check_busy:
 
                                pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
                                assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
-                               lite_list[pg_num >> 5] |= 1 << (pg_num & 31);
+                               lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
 
                                if (hw_dirty) {
                                        if (pmap_flushes_delayed == FALSE) {
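
This hunk and several below change 1 << (pg_num & 31) to 1U << (pg_num & 31).
The expression indexes the lite_list bitmap: pg_num >> 5 selects a 32-bit word
and pg_num & 31 a bit within it. With a signed 1, setting bit 31 shifts into
the sign bit, which is undefined behavior in C; the unsigned literal keeps the
shift well defined. A self-contained demonstration:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t lite_list[2] = { 0, 0 };  /* 64-page toy bitmap */
            unsigned pg_num = 31;              /* the problematic bit position */

            /* pg_num >> 5 selects the 32-bit word, pg_num & 31 the bit in it */
            lite_list[pg_num >> 5] |= 1U << (pg_num & 31);   /* well-defined */
            /* with a signed literal, 1 << 31 overflows int: undefined behavior */

            printf("word 0 = 0x%08x, bit set: %d\n",
                lite_list[0], (lite_list[0] & (1U << 31)) != 0);

            lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31)); /* clear it again */
            printf("word 0 = 0x%08x\n", lite_list[0]);
            return 0;
    }
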
@@ -5512,7 +5941,7 @@ check_busy:
 
                                pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
                                assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
-                               lite_list[pg_num >> 5] |= 1 << (pg_num & 31);
+                               lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
 
                                if (hw_dirty) {
                                        pmap_clear_modify(phys_page);
@@ -5542,7 +5971,22 @@ check_busy:
                                upl->flags &= ~UPL_CLEAR_DIRTY;
                                upl->flags |= UPL_SET_DIRTY;
                                dirty = TRUE;
-                               upl->flags |= UPL_SET_DIRTY;
+                               /*
+                                * Page belonging to a code-signed object is about to
+                                * be written. Mark it tainted and disconnect it from
+                                * all pmaps so processes have to fault it back in and
+                                * deal with the tainted bit.
+                                */
+                               if (object->code_signed && dst_page->vmp_cs_tainted == FALSE) {
+                                       dst_page->vmp_cs_tainted = TRUE;
+                                       vm_page_upl_tainted++;
+                                       if (dst_page->vmp_pmapped) {
+                                               refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
+                                               if (refmod_state & VM_MEM_REFERENCED) {
+                                                       dst_page->vmp_reference = TRUE;
+                                               }
+                                       }
+                               }
                        } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
                                /*
                                 * clean in place for read implies
@@ -6343,7 +6787,7 @@ process_upl_to_enter:
                        pg_num = (unsigned int) (new_offset / PAGE_SIZE);
                        assert(pg_num == new_offset / PAGE_SIZE);
 
-                       if (lite_list[pg_num >> 5] & (1 << (pg_num & 31))) {
+                       if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
                                VM_PAGE_GRAB_FICTITIOUS(alias_page);
 
                                vm_object_lock(object);
@@ -6773,8 +7217,8 @@ process_upl_to_commit:
                        pg_num = (unsigned int) (target_offset / PAGE_SIZE);
                        assert(pg_num == target_offset / PAGE_SIZE);
 
-                       if (lite_list[pg_num >> 5] & (1 << (pg_num & 31))) {
-                               lite_list[pg_num >> 5] &= ~(1 << (pg_num & 31));
+                       if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
+                               lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
 
                                if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
                                        m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
@@ -7009,10 +7453,17 @@ process_upl_to_commit:
                if (m->vmp_free_when_done) {
                        /*
                         * With the clean queue enabled, UPL_PAGEOUT should
-                        * no longer set the pageout bit. It's pages now go
+                        * no longer set the pageout bit. Its pages now go
                         * to the clean queue.
+                        *
+                        * We don't use the cleaned Q anymore and so this
+                        * assert isn't correct. The code for the clean Q
+                        * still exists and might be used in the future. If we
+                        * go back to the cleaned Q, we will re-enable this
+                        * assert.
+                        *
+                        * assert(!(upl->flags & UPL_PAGEOUT));
                         */
-                       assert(!(flags & UPL_PAGEOUT));
                        assert(!m_object->internal);
 
                        m->vmp_free_when_done = FALSE;
@@ -7454,8 +7905,8 @@ process_upl_to_abort:
                m = VM_PAGE_NULL;
 
                if (upl->flags & UPL_LITE) {
-                       if (lite_list[pg_num >> 5] & (1 << (pg_num & 31))) {
-                               lite_list[pg_num >> 5] &= ~(1 << (pg_num & 31));
+                       if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
+                               lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
 
                                if (!(upl->flags & UPL_KERNEL_OBJECT)) {
                                        m = vm_page_lookup(shadow_object, target_offset +
@@ -7914,7 +8365,7 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us
                }
                entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
                assert(entry >= 0 && entry < object->resident_page_count);
-               lite_list[entry >> 5] |= 1 << (entry & 31);
+               lite_list[entry >> 5] |= 1U << (entry & 31);
 
                phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
 
@@ -8039,7 +8490,7 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u
 
                vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
 
-               lite_list[entry >> 5] |= 1 << (entry & 31);
+               lite_list[entry >> 5] |= 1U << (entry & 31);
 
                phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
 
@@ -8719,6 +9170,22 @@ memory_error:
 
                if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
                        SET_PAGE_DIRTY(dst_page, TRUE);
+                       /*
+                        * Page belonging to a code-signed object is about to
+                        * be written. Mark it tainted and disconnect it from
+                        * all pmaps so processes have to fault it back in and
+                        * deal with the tainted bit.
+                        */
+                       if (object->code_signed && dst_page->vmp_cs_tainted == FALSE) {
+                               dst_page->vmp_cs_tainted = TRUE;
+                               vm_page_iopl_tainted++;
+                               if (dst_page->vmp_pmapped) {
+                                       int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
+                                       if (refmod & VM_MEM_REFERENCED) {
+                                               dst_page->vmp_reference = TRUE;
+                                       }
+                               }
+                       }
                }
                if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
                        pmap_sync_page_attributes_phys(phys_page);
@@ -8730,7 +9197,7 @@ record_phys_addr:
                        upl->flags |= UPL_HAS_BUSY;
                }
 
-               lite_list[entry >> 5] |= 1 << (entry & 31);
+               lite_list[entry >> 5] |= 1U << (entry & 31);
 
                if (phys_page > upl->highest_page) {
                        upl->highest_page = phys_page;
@@ -9023,7 +9490,7 @@ upl_range_needed(
 * virtual address space each time we need to work with
  * a physical page.
  */
-decl_simple_lock_data(, vm_paging_lock)
+decl_simple_lock_data(, vm_paging_lock);
 #define VM_PAGING_NUM_PAGES     64
 vm_map_offset_t vm_paging_base_address = 0;
 boolean_t       vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
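
The trailing-semicolon additions to decl_simple_lock_data() and
decl_lck_mtx_data() throughout this commit track a macro cleanup: the decl_*
macros now expand to a bare declaration, so each use site must supply its own
';'. A simplified sketch of the shape (assumed, not xnu's exact definition):

    typedef struct { int lock_data; } simple_lock_data_t;  /* stand-in type */

    /* new style: bare declaration, the use site supplies the ';' */
    #define decl_simple_lock_data(class, name) class simple_lock_data_t name

    decl_simple_lock_data(static, my_lock);         /* ';' now required */
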
@@ -9107,20 +9574,10 @@ vm_paging_map_object(
 
        if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
                /* use permanent 1-to-1 kernel mapping of physical memory ? */
-#if __x86_64__
-               *address = (vm_map_offset_t)
-                   PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) <<
-                       PAGE_SHIFT);
-               *need_unmap = FALSE;
-               return KERN_SUCCESS;
-#elif __arm__ || __arm64__
                *address = (vm_map_offset_t)
                    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
                *need_unmap = FALSE;
                return KERN_SUCCESS;
-#else
-#warn "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
-#endif
 
                assert(page->vmp_busy);
                /*
@@ -9492,7 +9949,8 @@ vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
                                        }
 
                                        vector_upl->upl_elems[i] = NULL;
-                                       invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
+                                       invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
+                                           relaxed);
                                        if (invalid_upls == vector_upl->num_upls) {
                                                return TRUE;
                                        } else {
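
hw_atomic_add(&x, 1) returned the new value; os_atomic_inc(&x, relaxed)
keeps that semantic while making the memory ordering explicit. A rough
user-space equivalent in C11 atomics (illustration only; os_atomic_* are
xnu macros over the same compiler builtins):

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint32_t invalid_upls;

    static uint32_t note_invalid_upl(void)
    {
            /* fetch_add returns the old value; add 1 to match os_atomic_inc */
            return atomic_fetch_add_explicit(&invalid_upls, 1,
                memory_order_relaxed) + 1;
    }
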
@@ -10339,7 +10797,7 @@ vm_test_wire_and_extract(void)
 
        ledger = ledger_instantiate(task_ledger_template,
            LEDGER_CREATE_ACTIVE_ENTRIES);
-       user_map = vm_map_create(pmap_create(ledger, 0, PMAP_CREATE_64BIT),
+       user_map = vm_map_create(pmap_create_options(ledger, 0, PMAP_CREATE_64BIT),
            0x100000000ULL,
            0x200000000ULL,
            TRUE);
index b0608aef57c2a94474e7657e8c9d9883e3917c85..378c4765ca12fbf7d74e53b270b22f4826cd2ca8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -347,7 +347,7 @@ struct upl_io_completion {
 
 
 struct upl {
-       decl_lck_mtx_data(, Lock)       /* Synchronization */
+       decl_lck_mtx_data(, Lock);      /* Synchronization */
        int             ref_count;
        int             ext_ref_count;
        int             flags;
@@ -481,7 +481,7 @@ extern void vm_paging_unmap_object(
        vm_object_t             object,
        vm_map_offset_t         start,
        vm_map_offset_t         end);
-decl_simple_lock_data(extern, vm_paging_lock)
+decl_simple_lock_data(extern, vm_paging_lock);
 
 /*
  * Backing store throttle when BS is exhausted
@@ -644,7 +644,7 @@ struct vm_pageout_vminfo {
        unsigned long vm_pageout_skipped_external;
 
        unsigned long vm_pageout_pages_evicted;
-       unsigned long vm_pageout_pages_purged;;
+       unsigned long vm_pageout_pages_purged;
        unsigned long vm_pageout_freed_cleaned;
        unsigned long vm_pageout_freed_speculative;
        unsigned long vm_pageout_freed_external;
index 43d45dbbe7afa44bfa9bcdc3ecf5f49895e6f534..66dbe7ce75722817d8e29a4ff3a8f28d9396703d 100644 (file)
@@ -109,6 +109,14 @@ extern kern_return_t vm_map_purgable_control(
        vm_purgable_t           control,
        int                     *state);
 
+#if MACH_ASSERT
+extern void vm_map_pmap_check_ledgers(
+       pmap_t          pmap,
+       ledger_t        ledger,
+       int             pid,
+       char            *procname);
+#endif /* MACH_ASSERT */
+
 extern kern_return_t
 vnode_pager_get_object_vnode(
        memory_object_t mem_obj,
@@ -191,11 +199,11 @@ extern void swapfile_pager_bootstrap(void);
 extern memory_object_t swapfile_pager_setup(struct vnode *vp);
 extern memory_object_control_t swapfile_pager_control(memory_object_t mem_obj);
 
-#if __arm64__ || ((__ARM_ARCH_7K__ >= 2) && defined(PLATFORM_WatchOS))
+#if __arm64__ || (__ARM_ARCH_7K__ >= 2)
 #define SIXTEENK_PAGE_SIZE      0x4000
 #define SIXTEENK_PAGE_MASK      0x3FFF
 #define SIXTEENK_PAGE_SHIFT     14
-#endif /* __arm64__ || ((__ARM_ARCH_7K__ >= 2) && defined(PLATFORM_WatchOS)) */
+#endif /* __arm64__ || (__ARM_ARCH_7K__ >= 2) */
 
 #if __arm64__
 #define FOURK_PAGE_SIZE         0x1000
@@ -473,6 +481,7 @@ extern void log_unnest_badness(
        vm_map_offset_t lowest_unnestable_addr);
 
 struct proc;
+struct proc *current_proc(void);
 extern int cs_allow_invalid(struct proc *p);
 extern int cs_invalid_page(addr64_t vaddr, boolean_t *cs_killed);
 
@@ -566,11 +575,11 @@ extern int proc_get_memstat_priority(struct proc*, boolean_t);
 /* the object purger. purges the next eligible object from memory. */
 /* returns TRUE if an object was purged, otherwise FALSE. */
 boolean_t vm_purgeable_object_purge_one_unlocked(int force_purge_below_group);
-void vm_purgeable_disown(task_t task);
 void vm_purgeable_nonvolatile_owner_update(task_t       owner,
     int          delta);
 void vm_purgeable_volatile_owner_update(task_t          owner,
     int             delta);
+void vm_owned_objects_disown(task_t task);
 
 
 struct trim_list {
@@ -622,6 +631,7 @@ extern int secluded_for_filecache;
 extern int secluded_for_fbdp;
 #endif
 
+extern uint64_t vm_page_secluded_drain(void);
 extern void             memory_object_mark_eligible_for_secluded(
        memory_object_control_t         control,
        boolean_t                       eligible_for_secluded);
@@ -635,6 +645,7 @@ extern kern_return_t mach_make_memory_entry_internal(
        memory_object_size_t    *size,
        memory_object_offset_t offset,
        vm_prot_t               permission,
+       vm_named_entry_kernel_flags_t vmne_kflags,
        ipc_port_t              *object_handle,
        ipc_port_t              parent_handle);
 
@@ -655,6 +666,17 @@ extern kern_return_t mach_make_memory_entry_internal(
 #define VM_SWAP_FLAGS_FORCE_DEFRAG     1
 #define VM_SWAP_FLAGS_FORCE_RECLAIM    2
 
+#if __arm64__
+/*
+ * Flags to control the behavior of
+ * the legacy footprint entitlement.
+ */
+#define LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE             (1)
+#define LEGACY_FOOTPRINT_ENTITLEMENT_IOS11_ACCT         (2)
+#define LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE     (3)
+
+#endif /* __arm64__ */
+
 #endif  /* _VM_VM_PROTOS_H_ */
 
 #endif  /* XNU_KERNEL_PRIVATE */
index 6ebfaf77ac89afc85cef62e3fee9312019de5afb..17350f63b9b84521a0efeb6e534332c2622b8751 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -84,7 +84,7 @@ struct purgeable_q purgeable_queues[PURGEABLE_Q_TYPE_MAX];
 queue_head_t purgeable_nonvolatile_queue;
 int purgeable_nonvolatile_count;
 
-decl_lck_mtx_data(, vm_purgeable_queue_lock)
+decl_lck_mtx_data(, vm_purgeable_queue_lock);
 
 static token_idx_t vm_purgeable_token_remove_first(purgeable_q_t queue);
 
@@ -100,14 +100,16 @@ vm_purgeable_token_check_queue(purgeable_q_t queue)
        token_idx_t     unripe = 0;
        int             our_inactive_count;
 
+
 #if DEVELOPMENT
-       static unsigned lightweight_check = 0;
+       static int lightweight_check = 0;
 
        /*
-        * Due to performance impact, only perform this check
-        * every 100 times on DEVELOPMENT kernels.
+        * Due to performance impact, perform this check only periodically
+        * on DEVELOPMENT kernels. Checking the queue scales linearly with
+        * its length, so we compensate by checking less frequently as the
+        * queue grows.
         */
-       if (lightweight_check++ < 100) {
+       if (lightweight_check++ < (100 + queue->debug_count_tokens / 512)) {
                return;
        }
 
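As a worked example of the new throttle: with 25,600 tokens queued, the full
walk runs roughly once every 100 + 25,600/512 = 150 calls instead of every
100, so the cost of the O(n) queue check stays roughly constant per call as
the queue grows.
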
@@ -1287,105 +1289,6 @@ vm_purgeable_account(
 }
 #endif /* DEVELOPMENT || DEBUG */
 
-void
-vm_purgeable_disown(
-       task_t  task)
-{
-       vm_object_t     next_object;
-       vm_object_t     object;
-       int             collisions;
-
-       if (task == NULL) {
-               return;
-       }
-
-       /*
-        * Scan the purgeable objects queues for objects owned by "task".
-        * This has to be done "atomically" under the "vm_purgeable_queue"
-        * lock, to ensure that no new purgeable object get associated
-        * with this task or moved between queues while we're scanning.
-        */
-
-       /*
-        * Scan non-volatile queue for objects owned by "task".
-        */
-
-       collisions = 0;
-
-again:
-       if (task->task_purgeable_disowned) {
-               /* task has already disowned its purgeable memory */
-               assert(task->task_volatile_objects == 0);
-               assert(task->task_nonvolatile_objects == 0);
-               return;
-       }
-
-       lck_mtx_lock(&vm_purgeable_queue_lock);
-       task_objq_lock(task);
-
-       task->task_purgeable_disowning = TRUE;
-
-       for (object = (vm_object_t) queue_first(&task->task_objq);
-           !queue_end(&task->task_objq, (queue_entry_t) object);
-           object = next_object) {
-               if (task->task_nonvolatile_objects == 0 &&
-                   task->task_volatile_objects == 0) {
-                       /* no more purgeable objects owned by "task" */
-                       break;
-               }
-
-               next_object = (vm_object_t) queue_next(&object->task_objq);
-               if (object->purgable == VM_PURGABLE_DENY) {
-                       /* not a purgeable object: skip */
-                       continue;
-               }
-
-#if DEBUG
-               assert(object->vo_purgeable_volatilizer == NULL);
-#endif /* DEBUG */
-               assert(object->vo_owner == task);
-               if (!vm_object_lock_try(object)) {
-                       lck_mtx_unlock(&vm_purgeable_queue_lock);
-                       task_objq_unlock(task);
-                       mutex_pause(collisions++);
-                       goto again;
-               }
-               /* transfer ownership to the kernel */
-               assert(VM_OBJECT_OWNER(object) != kernel_task);
-               vm_object_ownership_change(
-                       object,
-                       object->vo_ledger_tag, /* unchanged */
-                       VM_OBJECT_OWNER_DISOWNED, /* new owner */
-                       TRUE);  /* old_owner->task_objq locked */
-               assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED);
-               vm_object_unlock(object);
-       }
-
-       if (__improbable(task->task_volatile_objects != 0 ||
-           task->task_nonvolatile_objects != 0)) {
-               panic("%s(%p): volatile=%d nonvolatile=%d q=%p q_first=%p q_last=%p",
-                   __FUNCTION__,
-                   task,
-                   task->task_volatile_objects,
-                   task->task_nonvolatile_objects,
-                   &task->task_objq,
-                   queue_first(&task->task_objq),
-                   queue_last(&task->task_objq));
-       }
-
-       /* there shouldn't be any purgeable objects owned by task now */
-       assert(task->task_volatile_objects == 0);
-       assert(task->task_nonvolatile_objects == 0);
-       assert(task->task_purgeable_disowning);
-
-       /* and we don't need to try and disown again */
-       task->task_purgeable_disowned = TRUE;
-
-       lck_mtx_unlock(&vm_purgeable_queue_lock);
-       task_objq_unlock(task);
-}
-
-
 static uint64_t
 vm_purgeable_queue_purge_task_owned(
        purgeable_q_t   queue,
@@ -1505,6 +1408,9 @@ vm_purgeable_nonvolatile_enqueue(
        vm_object_t     object,
        task_t          owner)
 {
+       int ledger_flags;
+       kern_return_t kr;
+
        vm_object_lock_assert_exclusive(object);
 
        assert(object->purgable == VM_PURGABLE_NONVOLATILE);
@@ -1513,7 +1419,7 @@ vm_purgeable_nonvolatile_enqueue(
        lck_mtx_lock(&vm_purgeable_queue_lock);
 
        if (owner != NULL &&
-           owner->task_purgeable_disowning) {
+           owner->task_objects_disowning) {
                /* task is exiting and no longer tracking purgeable objects */
                owner = VM_OBJECT_OWNER_DISOWNED;
        }
@@ -1526,10 +1432,16 @@ vm_purgeable_nonvolatile_enqueue(
        object->vo_purgeable_volatilizer = NULL;
 #endif /* DEBUG */
 
-       vm_object_ownership_change(object,
-           object->vo_ledger_tag,                        /* tag unchanged */
+       ledger_flags = 0;
+       if (object->vo_no_footprint) {
+               ledger_flags |= VM_LEDGER_FLAG_NO_FOOTPRINT;
+       }
+       kr = vm_object_ownership_change(object,
+           object->vo_ledger_tag,                             /* tag unchanged */
            owner,
+           ledger_flags,
            FALSE);                             /* task_objq_locked */
+       assert(kr == KERN_SUCCESS);
 
        assert(object->objq.next == NULL);
        assert(object->objq.prev == NULL);
@@ -1549,6 +1461,7 @@ vm_purgeable_nonvolatile_dequeue(
        vm_object_t     object)
 {
        task_t  owner;
+       kern_return_t kr;
 
        vm_object_lock_assert_exclusive(object);
 
@@ -1563,11 +1476,13 @@ vm_purgeable_nonvolatile_dequeue(
                 */
                /* transfer ownership to the kernel */
                assert(VM_OBJECT_OWNER(object) != kernel_task);
-               vm_object_ownership_change(
+               kr = vm_object_ownership_change(
                        object,
                        object->vo_ledger_tag,  /* unchanged */
                        VM_OBJECT_OWNER_DISOWNED, /* new owner */
+                       0, /* ledger_flags */
                        FALSE); /* old_owner->task_objq locked */
+               assert(kr == KERN_SUCCESS);
                assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED);
        }
 
@@ -1763,7 +1678,7 @@ vm_object_owner_compressed_update(
        switch (object->purgable) {
        case VM_PURGABLE_DENY:
                /* not purgeable: must be ledger-tagged */
-               assert(object->vo_ledger_tag != VM_OBJECT_LEDGER_TAG_NONE);
+               assert(object->vo_ledger_tag != VM_LEDGER_TAG_NONE);
        /* fallthru */
        case VM_PURGABLE_NONVOLATILE:
                if (delta > 0) {
index f2599e77156151969e42003632fbfc8857e6d41f..fb0a7d4735eeaa489608226a09a88eebad3a600e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -82,7 +82,7 @@ extern int available_for_purge;
  * mostly used on a user context and we don't want any contention with the
  * pageout daemon.
  */
-decl_lck_mtx_data(extern, vm_purgeable_queue_lock)
+decl_lck_mtx_data(extern, vm_purgeable_queue_lock);
 
 /* add a new token to queue. called by vm_object_purgeable_control */
 /* enter with page queue locked */
index 4cdb916929675c9d5152ac0131f070a0c2759843..bf3b6d3a899527241086dfe2cab902b692043c34 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -77,7 +77,6 @@
 #include <kern/thread.h>
 #include <kern/kalloc.h>
 #include <kern/zalloc.h>
-#include <kern/xpr.h>
 #include <kern/ledger.h>
 #include <vm/pmap.h>
 #include <vm/vm_init.h>
@@ -96,6 +95,9 @@
 #include <vm/memory_object.h>
 #include <vm/vm_purgeable_internal.h>
 #include <vm/vm_compressor.h>
+#if defined (__x86_64__)
+#include <i386/misc_protos.h>
+#endif
 
 #if CONFIG_PHANTOM_CACHE
 #include <vm/vm_phantom_cache.h>
 
 #include <sys/kdebug.h>
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif
 
 #if MACH_ASSERT
 
 
 #endif /* MACH_ASSERT */
 
+extern boolean_t vm_pageout_running;
+extern thread_t  vm_pageout_scan_thread;
+extern boolean_t vps_dynamic_priority_enabled;
+
 char    vm_page_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
 char    vm_page_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
 char    vm_page_non_speculative_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
@@ -328,7 +337,7 @@ unsigned int    vm_page_free_count;
 zone_t  vm_page_array_zone;
 zone_t  vm_page_zone;
 vm_locks_array_t vm_page_locks;
-decl_lck_mtx_data(, vm_page_alloc_lock)
+decl_lck_mtx_data(, vm_page_alloc_lock);
 lck_mtx_ext_t vm_page_alloc_lock_ext;
 
 unsigned int    vm_page_local_q_count = 0;
@@ -399,6 +408,7 @@ unsigned int    vm_page_inactive_count;
 unsigned int    vm_page_secluded_count;
 unsigned int    vm_page_secluded_count_free;
 unsigned int    vm_page_secluded_count_inuse;
+unsigned int    vm_page_secluded_count_over_target;
 #endif /* CONFIG_SECLUDED_MEMORY */
 unsigned int    vm_page_anonymous_count;
 unsigned int    vm_page_throttled_count;
@@ -409,6 +419,9 @@ unsigned int    vm_page_wire_count_on_boot = 0;
 unsigned int    vm_page_stolen_count = 0;
 unsigned int    vm_page_wire_count_initial;
 unsigned int    vm_page_gobble_count = 0;
+unsigned int    vm_page_kern_lpage_count = 0;
+
+uint64_t        booter_size;  /* external so it can be found in core dumps */
 
 #define VM_PAGE_WIRE_COUNT_WARNING      0
 #define VM_PAGE_GOBBLE_COUNT_WARNING    0
@@ -644,6 +657,12 @@ vm_get_delayed_page(int grab_options)
        assert(vm_delayed_count > 0);
        --vm_delayed_count;
 
+#if defined(__x86_64__)
+       /* x86 cluster code requires increasing phys_page in vm_pages[] */
+       if (vm_pages_count > 0) {
+               assert(pnum > vm_pages[vm_pages_count - 1].vmp_phys_page);
+       }
+#endif
        p = &vm_pages[vm_pages_count];
        assert(p < vm_page_array_ending_addr);
        vm_page_init(p, pnum, FALSE);
@@ -687,8 +706,8 @@ vm_free_delayed_pages(void)
        vm_page_t   p;
        vm_page_t   list = NULL;
        uint_t      cnt = 0;
-       vm_offset_t start_free_page;
-       vm_size_t   free_size;
+       vm_offset_t start_free_va;
+       int64_t     free_size;
 
        while ((p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE)) != NULL) {
                if (vm_himemory_mode) {
@@ -711,29 +730,39 @@ vm_free_delayed_pages(void)
                vm_page_release(p, FALSE);
        }
 #if DEVELOPMENT || DEBUG
-       kprintf("vm_free_delayed_pages: freed %d pages\n", cnt);
+       kprintf("vm_free_delayed_pages: initialized %d free pages\n", cnt);
 #endif
 
        /*
         * Free up any unused full pages at the end of the vm_pages[] array
         */
-       start_free_page = round_page((vm_offset_t)&vm_pages[vm_pages_count]);
-       if (start_free_page < (vm_offset_t)vm_page_array_ending_addr) {
-               free_size = trunc_page((vm_offset_t)vm_page_array_ending_addr - start_free_page);
-               if (free_size > 0) {
-#if DEVELOPMENT || DEBUG
-                       kprintf("Freeing final unused %ld bytes from vm_pages[] at 0x%lx\n",
-                           (long)free_size, (long)start_free_page);
+       start_free_va = round_page((vm_offset_t)&vm_pages[vm_pages_count]);
+
+#if defined(__x86_64__)
+       /*
+        * Since x86 might have used large pages for vm_pages[], we can't
+        * free starting in the middle of a partially used large page.
+        */
+       if (pmap_query_pagesize(kernel_pmap, start_free_va) == I386_LPGBYTES) {
+               start_free_va = ((start_free_va + I386_LPGMASK) & ~I386_LPGMASK);
+       }
 #endif
-                       pmap_pv_fixup(start_free_page, free_size);
-                       ml_static_mfree(start_free_page, free_size);
-                       vm_page_array_ending_addr = (void *)start_free_page;
+       if (start_free_va < (vm_offset_t)vm_page_array_ending_addr) {
+               free_size = trunc_page((vm_offset_t)vm_page_array_ending_addr - start_free_va);
+               if (free_size > 0) {
+                       ml_static_mfree(start_free_va, (vm_offset_t)free_size);
+                       vm_page_array_ending_addr = (void *)start_free_va;
 
                        /*
                         * Note there's no locking here, as only this thread will ever change this value.
                         * The reader, vm_page_diagnose, doesn't grab any locks for the counts it looks at.
                         */
-                       --vm_page_stolen_count;
+                       vm_page_stolen_count -= (free_size >> PAGE_SHIFT);
+
+#if DEVELOPMENT || DEBUG
+                       kprintf("Freeing final unused %ld bytes from vm_pages[] at 0x%lx\n",
+                           (long)free_size, (long)start_free_va);
+#endif
                }
        }
 
@@ -1183,6 +1212,9 @@ vm_page_bootstrap(
 #endif
        vm_page_wire_count_initial = vm_page_wire_count;
 
+       /* capture this for later use */
+       booter_size = ml_get_booter_memory_size();
+
        printf("vm_page_bootstrap: %d free pages, %d wired pages, (up to %d of which are delayed free)\n",
            vm_page_free_count, vm_page_wire_count, vm_delayed_count);
 
@@ -1192,81 +1224,103 @@ vm_page_bootstrap(
 
 #ifndef MACHINE_PAGES
 /*
- *     We implement pmap_steal_memory and pmap_startup with the help
- *     of two simpler functions, pmap_virtual_space and pmap_next_page.
+ * This is the early boot time allocator for data structures needed to bootstrap the VM system.
+ * On x86 it will allocate large pages if size is sufficiently large. We don't need to do this
+ * on ARM yet, due to the combination of a large base page size and smaller RAM devices.
  */
-
-void *
-pmap_steal_memory(
-       vm_size_t size)
+static void *
+pmap_steal_memory_internal(
+       vm_size_t size,
+       boolean_t might_free)
 {
        kern_return_t kr;
-       vm_offset_t addr, vaddr;
+       vm_offset_t addr;
+       vm_offset_t map_addr;
        ppnum_t phys_page;
 
        /*
-        *      We round the size to a round multiple.
+        * Round the size up to a multiple of the word size.
         */
-
        size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
 
        /*
-        *      If this is the first call to pmap_steal_memory,
-        *      we have to initialize ourself.
+        * On the first call, get the initial values for virtual address space
+        * and page align them.
         */
-
        if (virtual_space_start == virtual_space_end) {
                pmap_virtual_space(&virtual_space_start, &virtual_space_end);
+               virtual_space_start = round_page(virtual_space_start);
+               virtual_space_end = trunc_page(virtual_space_end);
 
+#if defined(__x86_64__)
                /*
-                *      The initial values must be aligned properly, and
-                *      we don't trust the pmap module to do it right.
+                * Release remaining unused section of preallocated KVA and the 4K page tables
+                * that map it. This makes the VA available for large page mappings.
                 */
-
-               virtual_space_start = round_page(virtual_space_start);
-               virtual_space_end = trunc_page(virtual_space_end);
+               Idle_PTs_release(virtual_space_start, virtual_space_end);
+#endif
        }
 
        /*
-        *      Allocate virtual memory for this request.
+        * Allocate the virtual space for this request. On x86, we'll align to a large page
+        * address if the size is big enough to be backed by at least one large page.
         */
-
+#if defined(__x86_64__)
+       if (size >= I386_LPGBYTES) {
+               virtual_space_start = ((virtual_space_start + I386_LPGMASK) & ~I386_LPGMASK);
+       }
+#endif
        addr = virtual_space_start;
        virtual_space_start += size;
 
        //kprintf("pmap_steal_memory: %08lX - %08lX; size=%08lX\n", (long)addr, (long)virtual_space_start, (long)size); /* (TEST/DEBUG) */
 
        /*
-        *      Allocate and map physical pages to back new virtual pages.
+        * Allocate and map physical pages to back the new virtual space.
         */
+       map_addr = round_page(addr);
+       while (map_addr < addr + size) {
+#if defined(__x86_64__)
+               /*
+                * Back with a large page if properly aligned on x86
+                */
+               if ((map_addr & I386_LPGMASK) == 0 &&
+                   map_addr + I386_LPGBYTES <= addr + size &&
+                   pmap_pre_expand_large(kernel_pmap, map_addr) == KERN_SUCCESS &&
+                   pmap_next_page_large(&phys_page) == KERN_SUCCESS) {
+                       kr = pmap_enter(kernel_pmap, map_addr, phys_page,
+                           VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
+                           VM_WIMG_USE_DEFAULT | VM_MEM_SUPERPAGE, FALSE);
+
+                       if (kr != KERN_SUCCESS) {
+                               panic("pmap_steal_memory: pmap_enter() large failed, new_addr=%#lx, phys_page=%u",
+                                   (unsigned long)map_addr, phys_page);
+                       }
+                       map_addr += I386_LPGBYTES;
+                       vm_page_wire_count += I386_LPGBYTES >> PAGE_SHIFT;
+                       vm_page_stolen_count += I386_LPGBYTES >> PAGE_SHIFT;
+                       vm_page_kern_lpage_count++;
+                       continue;
+               }
+#endif
 
-       for (vaddr = round_page(addr);
-           vaddr < addr + size;
-           vaddr += PAGE_SIZE) {
-               if (!pmap_next_page_hi(&phys_page)) {
+               if (!pmap_next_page_hi(&phys_page, might_free)) {
                        panic("pmap_steal_memory() size: 0x%llx\n", (uint64_t)size);
                }
 
-               /*
-                *      XXX Logically, these mappings should be wired,
-                *      but some pmap modules barf if they are.
-                */
-#if defined(__LP64__)
-#ifdef  __arm64__
-               /* ARM64_TODO: verify that we really don't need this */
-#else
-               pmap_pre_expand(kernel_pmap, vaddr);
-#endif
+#if defined(__x86_64__)
+               pmap_pre_expand(kernel_pmap, map_addr);
 #endif
 
-               kr = pmap_enter(kernel_pmap, vaddr, phys_page,
+               kr = pmap_enter(kernel_pmap, map_addr, phys_page,
                    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
                    VM_WIMG_USE_DEFAULT, FALSE);
 
                if (kr != KERN_SUCCESS) {
-                       panic("pmap_steal_memory() pmap_enter failed, vaddr=%#lx, phys_page=%u",
-                           (unsigned long)vaddr, phys_page);
+                       panic("pmap_steal_memory() pmap_enter failed, map_addr=%#lx, phys_page=%u",
+                           (unsigned long)map_addr, phys_page);
                }
+               map_addr += PAGE_SIZE;
 
                /*
                 * Account for newly stolen memory
@@ -1275,12 +1329,35 @@ pmap_steal_memory(
                vm_page_stolen_count++;
        }
 
+#if defined(__x86_64__)
+       /*
+        * The call with might_free is currently the last use of pmap_steal_memory*().
+        * Notify the pmap layer to record which high pages were allocated so far.
+        */
+       if (might_free) {
+               pmap_hi_pages_done();
+       }
+#endif
 #if KASAN
        kasan_notify_address(round_page(addr), size);
 #endif
        return (void *) addr;
 }
 
+void *
+pmap_steal_memory(
+       vm_size_t size)
+{
+       return pmap_steal_memory_internal(size, FALSE);
+}
+
+void *
+pmap_steal_freeable_memory(
+       vm_size_t size)
+{
+       return pmap_steal_memory_internal(size, TRUE);
+}
+
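The large-page path above relies on mask arithmetic to round a boot-time VA
up to the next 2 MB boundary. A standalone sketch of that math (in the real
code I386_LPGBYTES is 2 MB and I386_LPGMASK is I386_LPGBYTES - 1):

    #include <assert.h>
    #include <stdint.h>

    #define LPGBYTES ((uintptr_t)2 << 20)   /* 2 MB large page */
    #define LPGMASK  (LPGBYTES - 1)

    static uintptr_t lpg_round_up(uintptr_t va)
    {
            return (va + LPGMASK) & ~LPGMASK;
    }

    int main(void)
    {
            assert(lpg_round_up(0x200000) == 0x200000); /* already aligned */
            assert(lpg_round_up(0x200001) == 0x400000); /* next 2 MB boundary */
            return 0;
    }
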
 #if CONFIG_SECLUDED_MEMORY
 /* boot-args to control secluded memory */
 unsigned int secluded_mem_mb = 0;       /* # of MBs of RAM to seclude */
@@ -1336,7 +1413,7 @@ pmap_startup(
        mem_sz += round_page(virtual_space_start) - virtual_space_start;        /* Account for any slop */
        npages = (uint_t)(mem_sz / (PAGE_SIZE + sizeof(*vm_pages)));    /* scaled to include the vm_page_ts */
 
-       vm_pages = (vm_page_t) pmap_steal_memory(npages * sizeof *vm_pages);
+       vm_pages = (vm_page_t) pmap_steal_freeable_memory(npages * sizeof *vm_pages);
 
        /*
         * Check if we want to initialize pages to a known value
@@ -1483,6 +1560,12 @@ pmap_startup(
                assert((i + vm_first_phys_ppnum) == phys_page);
 #endif
 
+#if defined(__x86_64__)
+               /* The x86 clump freeing code requires increasing ppns to work correctly */
+               if (i > 0) {
+                       assert(phys_page > vm_pages[i - 1].vmp_phys_page);
+               }
+#endif
                ++vm_pages_count;
                vm_page_init(&vm_pages[i], phys_page, FALSE);
                if (fill) {
@@ -1684,9 +1767,6 @@ vm_page_insert_internal(
        int                     ledger_idx_nonvolatile_compressed;
        boolean_t               do_footprint;
 
-       XPR(XPR_VM_PAGE,
-           "vm_page_insert, object 0x%X offset 0x%X page 0x%X\n",
-           object, offset, mem, 0, 0);
 #if 0
        /*
         * we may not hold the page queue lock
@@ -2028,11 +2108,6 @@ vm_page_remove(
 
        m_object = VM_PAGE_OBJECT(mem);
 
-       XPR(XPR_VM_PAGE,
-           "vm_page_remove, object 0x%X offset 0x%X page 0x%X\n",
-           m_object, mem->vmp_offset,
-           mem, 0, 0);
-
        vm_object_lock_assert_exclusive(m_object);
        assert(mem->vmp_tabled);
        assert(!mem->vmp_cleaning);
@@ -2434,11 +2509,6 @@ vm_page_rename(
        assert(m_object != new_object);
        assert(m_object);
 
-       XPR(XPR_VM_PAGE,
-           "vm_page_rename, new object 0x%X, offset 0x%X page 0x%X\n",
-           new_object, new_offset,
-           mem, 0, 0);
-
        /*
         *      Changes to mem->vmp_object require the page lock because
         *      the pageout daemon uses that lock to get the object.
@@ -3272,15 +3342,22 @@ return_page_from_cpu_list:
        /*
         *      Decide if we should poke the pageout daemon.
         *      We do this if the free count is less than the low
-        *      water mark, or if the free count is less than the high
-        *      water mark (but above the low water mark) and the inactive
-        *      count is less than its target.
+        *      water mark. VM Pageout Scan will keep running until
+        *      free_count > free_target (and hence above free_min).
+        *      This wakeup catches the possibility of the counts
+        *      dropping between VM Pageout Scan parking and this check.
         *
         *      We don't have the counts locked ... if they change a little,
         *      it doesn't really matter.
         */
        if (vm_page_free_count < vm_page_free_min) {
-               thread_wakeup((event_t) &vm_page_free_wanted);
+               lck_mtx_lock(&vm_page_queue_free_lock);
+               if (vm_pageout_running == FALSE) {
+                       lck_mtx_unlock(&vm_page_queue_free_lock);
+                       thread_wakeup((event_t) &vm_page_free_wanted);
+               } else {
+                       lck_mtx_unlock(&vm_page_queue_free_lock);
+               }
        }
 
        VM_CHECK_MEMORYSTATUS;
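
The wakeup gate above (mirrored later in cpm_allocate) takes
vm_page_queue_free_lock purely to get a consistent read of
vm_pageout_running: if the scan thread is already running there is no point
issuing another wakeup, and holding the free lock keeps the flag from
changing between the check and the decision.
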
@@ -3434,6 +3511,66 @@ reactivate_secluded_page:
 
        return mem;
 }
+
+uint64_t
+vm_page_secluded_drain(void)
+{
+       vm_page_t local_freeq;
+       int local_freed;
+       uint64_t num_reclaimed;
+       unsigned int saved_secluded_count, saved_secluded_target;
+
+       num_reclaimed = 0;
+       local_freeq = NULL;
+       local_freed = 0;
+
+       vm_page_lock_queues();
+
+       saved_secluded_count = vm_page_secluded_count;
+       saved_secluded_target = vm_page_secluded_target;
+       vm_page_secluded_target = 0;
+       VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
+       while (vm_page_secluded_count) {
+               vm_page_t secluded_page;
+
+               assert((vm_page_secluded_count_free +
+                   vm_page_secluded_count_inuse) ==
+                   vm_page_secluded_count);
+               secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
+               assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
+
+               vm_page_queues_remove(secluded_page, FALSE);
+               assert(!secluded_page->vmp_fictitious);
+               assert(!VM_PAGE_WIRED(secluded_page));
+
+               if (secluded_page->vmp_object == 0) {
+                       /* transfer to free queue */
+                       assert(secluded_page->vmp_busy);
+                       secluded_page->vmp_snext = local_freeq;
+                       local_freeq = secluded_page;
+                       local_freed += 1;
+               } else {
+                       /* transfer to head of active queue */
+                       vm_page_enqueue_active(secluded_page, FALSE);
+                       secluded_page = VM_PAGE_NULL;
+               }
+               num_reclaimed++;
+       }
+       vm_page_secluded_target = saved_secluded_target;
+       VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
+
+//     printf("FBDP %s:%d secluded_count %d->%d, target %d, reclaimed %lld\n", __FUNCTION__, __LINE__, saved_secluded_count, vm_page_secluded_count, vm_page_secluded_target, num_reclaimed);
+
+       vm_page_unlock_queues();
+
+       if (local_freed) {
+               vm_page_free_list(local_freeq, TRUE);
+               local_freeq = NULL;
+               local_freed = 0;
+       }
+
+       return num_reclaimed;
+}
 #endif /* CONFIG_SECLUDED_MEMORY */
 
 
@@ -3467,6 +3604,7 @@ vm_page_release(
 #if CONFIG_SECLUDED_MEMORY
        int     need_secluded_wakeup = 0;
 #endif /* CONFIG_SECLUDED_MEMORY */
+       event_t wakeup_event = NULL;
 
        if (page_queues_locked) {
                LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
@@ -3533,6 +3671,7 @@ vm_page_release(
                vm_page_queue_enter_first(&vm_page_queue_secluded, mem, vmp_pageq);
                mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
                vm_page_secluded_count++;
+               VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
                vm_page_secluded_count_free++;
                if (!page_queues_locked) {
                        vm_page_unlock_queues();
@@ -3597,15 +3736,25 @@ vm_page_release(
        lck_mtx_unlock(&vm_page_queue_free_lock);
 
        if (need_priv_wakeup) {
-               thread_wakeup_one((event_t) &vm_page_free_wanted_privileged);
+               wakeup_event = &vm_page_free_wanted_privileged;
        }
 #if CONFIG_SECLUDED_MEMORY
        else if (need_secluded_wakeup) {
-               thread_wakeup_one((event_t) &vm_page_free_wanted_secluded);
+               wakeup_event = &vm_page_free_wanted_secluded;
        }
 #endif /* CONFIG_SECLUDED_MEMORY */
        else if (need_wakeup) {
-               thread_wakeup_one((event_t) &vm_page_free_count);
+               wakeup_event = &vm_page_free_count;
+       }
+
+       if (wakeup_event) {
+               if (vps_dynamic_priority_enabled == TRUE) {
+                       thread_t thread_woken = NULL;
+                       wakeup_one_with_inheritor((event_t) wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken);
+                       thread_deallocate(thread_woken);
+               } else {
+                       thread_wakeup_one((event_t) wakeup_event);
+               }
        }
 
        VM_CHECK_MEMORYSTATUS;
@@ -3634,6 +3783,7 @@ vm_page_release_startup(
                mem->vmp_lopage = FALSE;
                mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
                vm_page_secluded_count++;
+               VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
                vm_page_secluded_count_free++;
                queue_free = &vm_page_queue_secluded;
 #endif /* CONFIG_SECLUDED_MEMORY */
@@ -3679,6 +3829,7 @@ vm_page_wait(
        kern_return_t   wait_result;
        int             need_wakeup = 0;
        int             is_privileged = current_thread()->options & TH_OPT_VMPRIV;
+       event_t         wait_event = NULL;
 
        lck_mtx_lock_spin(&vm_page_queue_free_lock);
 
@@ -3696,7 +3847,7 @@ vm_page_wait(
                if (vm_page_free_wanted_privileged++ == 0) {
                        need_wakeup = 1;
                }
-               wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, interruptible);
+               wait_event = (event_t)&vm_page_free_wanted_privileged;
 #if CONFIG_SECLUDED_MEMORY
        } else if (secluded_for_apps &&
            task_can_use_secluded_mem(current_task(), FALSE)) {
@@ -3712,25 +3863,41 @@ vm_page_wait(
                if (vm_page_free_wanted_secluded++ == 0) {
                        need_wakeup = 1;
                }
-               wait_result = assert_wait(
-                       (event_t)&vm_page_free_wanted_secluded,
-                       interruptible);
+               wait_event = (event_t)&vm_page_free_wanted_secluded;
 #endif /* CONFIG_SECLUDED_MEMORY */
        } else {
                if (vm_page_free_wanted++ == 0) {
                        need_wakeup = 1;
                }
-               wait_result = assert_wait((event_t)&vm_page_free_count,
-                   interruptible);
+               wait_event = (event_t)&vm_page_free_count;
        }
-       lck_mtx_unlock(&vm_page_queue_free_lock);
-       counter(c_vm_page_wait_block++);
 
-       if (need_wakeup) {
-               thread_wakeup((event_t)&vm_page_free_wanted);
-       }
+       /*
+        * We don't do a vm_pageout_scan wakeup if we already have
+        * some waiters because vm_pageout_scan checks for waiters
+        * before it returns and does so behind the vm_page_queue_free_lock,
+        * which we own when we bump the waiter counts.
+        */
+
+       if (vps_dynamic_priority_enabled == TRUE) {
+               /*
+                * We are waking up vm_pageout_scan here. If it needs
+                * the vm_page_queue_free_lock before we unlock it
+                * we'll end up just blocking and incur an extra
+                * context switch. Could be a perf. issue.
+                */
+
+               counter(c_vm_page_wait_block++);
 
-       if (wait_result == THREAD_WAITING) {
+               if (need_wakeup) {
+                       thread_wakeup((event_t)&vm_page_free_wanted);
+               }
+
+               /*
+                * LD: This event is going to get recorded every time because
+                * we don't get back THREAD_WAITING from lck_mtx_sleep_with_inheritor.
+                * We just block in that routine.
+                */
                VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
                    vm_page_free_wanted_privileged,
                    vm_page_free_wanted,
@@ -3740,12 +3907,39 @@ vm_page_wait(
                    0,
 #endif /* CONFIG_SECLUDED_MEMORY */
                    0);
-               wait_result = thread_block(THREAD_CONTINUE_NULL);
-               VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
-                   VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
+               wait_result = lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
+                   LCK_SLEEP_UNLOCK,
+                   wait_event,
+                   vm_pageout_scan_thread,
+                   interruptible,
+                   0);
+       } else {
+               wait_result = assert_wait(wait_event, interruptible);
+
+               lck_mtx_unlock(&vm_page_queue_free_lock);
+               counter(c_vm_page_wait_block++);
+
+               if (need_wakeup) {
+                       thread_wakeup((event_t)&vm_page_free_wanted);
+               }
+
+               if (wait_result == THREAD_WAITING) {
+                       VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
+                           vm_page_free_wanted_privileged,
+                           vm_page_free_wanted,
+#if CONFIG_SECLUDED_MEMORY
+                           vm_page_free_wanted_secluded,
+#else /* CONFIG_SECLUDED_MEMORY */
+                           0,
+#endif /* CONFIG_SECLUDED_MEMORY */
+                           0);
+                       wait_result = thread_block(THREAD_CONTINUE_NULL);
+                       VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
+                           VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
+               }
        }
 
-       return wait_result == THREAD_AWAKENED;
+       return (wait_result == THREAD_AWAKENED) || (wait_result == THREAD_NOT_WAITING);
 }
 
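The net effect of the vm_page_wait() rework is a single choice between two
blocking primitives; condensed (names as in the diff above):

    if (vps_dynamic_priority_enabled == TRUE) {
            /*
             * Block on the event and donate this thread's priority to
             * vm_pageout_scan through a turnstile; the free lock is
             * dropped by the sleep call itself.
             */
            wait_result = lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
                LCK_SLEEP_UNLOCK, wait_event, vm_pageout_scan_thread,
                interruptible, 0);
    } else {
            /* classic path: assert_wait / thread_block */
            wait_result = assert_wait(wait_event, interruptible);
            lck_mtx_unlock(&vm_page_queue_free_lock);
            if (wait_result == THREAD_WAITING) {
                    wait_result = thread_block(THREAD_CONTINUE_NULL);
            }
    }

The inheritor path never reports THREAD_WAITING, which is why the function's
return expression now also treats THREAD_NOT_WAITING as a successful wait.
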
 /*
@@ -4099,6 +4293,8 @@ vm_page_free_list(
 #if CONFIG_SECLUDED_MEMORY
                        unsigned int    need_wakeup_secluded = 0;
 #endif /* CONFIG_SECLUDED_MEMORY */
+                       event_t         priv_wakeup_event, secluded_wakeup_event, normal_wakeup_event;
+                       boolean_t       priv_wakeup_all, secluded_wakeup_all, normal_wakeup_all;
 
                        lck_mtx_lock_spin(&vm_page_queue_free_lock);
 
@@ -4174,27 +4370,32 @@ vm_page_free_list(
                        }
                        lck_mtx_unlock(&vm_page_queue_free_lock);
 
+                       priv_wakeup_event = NULL;
+                       secluded_wakeup_event = NULL;
+                       normal_wakeup_event = NULL;
+
+                       priv_wakeup_all = FALSE;
+                       secluded_wakeup_all = FALSE;
+                       normal_wakeup_all = FALSE;
+
+
                        if (need_priv_wakeup != 0) {
                                /*
                                 * There shouldn't be that many VM-privileged threads,
                                 * so let's wake them all up, even if we don't quite
                                 * have enough pages to satisfy them all.
                                 */
-                               thread_wakeup((event_t)&vm_page_free_wanted_privileged);
+                               priv_wakeup_event = (event_t)&vm_page_free_wanted_privileged;
+                               priv_wakeup_all = TRUE;
                        }
 #if CONFIG_SECLUDED_MEMORY
                        if (need_wakeup_secluded != 0 &&
                            vm_page_free_wanted_secluded == 0) {
-                               thread_wakeup((event_t)
-                                   &vm_page_free_wanted_secluded);
+                               secluded_wakeup_event = (event_t)&vm_page_free_wanted_secluded;
+                               secluded_wakeup_all = TRUE;
+                               need_wakeup_secluded = 0;
                        } else {
-                               for (;
-                                   need_wakeup_secluded != 0;
-                                   need_wakeup_secluded--) {
-                                       thread_wakeup_one(
-                                               (event_t)
-                                               &vm_page_free_wanted_secluded);
-                               }
+                               secluded_wakeup_event = (event_t)&vm_page_free_wanted_secluded;
                        }
 #endif /* CONFIG_SECLUDED_MEMORY */
                        if (need_wakeup != 0 && vm_page_free_wanted == 0) {
@@ -4203,13 +4404,82 @@ vm_page_free_list(
                                 * after this, so let's wake them all up at
                                 * once.
                                 */
-                               thread_wakeup((event_t) &vm_page_free_count);
+                               normal_wakeup_event = (event_t) &vm_page_free_count;
+                               normal_wakeup_all = TRUE;
+                               need_wakeup = 0;
                        } else {
-                               for (; need_wakeup != 0; need_wakeup--) {
+                               normal_wakeup_event = (event_t) &vm_page_free_count;
+                       }
+
+                       if (priv_wakeup_event ||
+#if CONFIG_SECLUDED_MEMORY
+                           secluded_wakeup_event ||
+#endif /* CONFIG_SECLUDED_MEMORY */
+                           normal_wakeup_event) {
+                               if (vps_dynamic_priority_enabled == TRUE) {
+                                       thread_t thread_woken = NULL;
+
+                                       if (priv_wakeup_all == TRUE) {
+                                               wakeup_all_with_inheritor(priv_wakeup_event, THREAD_AWAKENED);
+                                       }
+
+#if CONFIG_SECLUDED_MEMORY
+                                       if (secluded_wakeup_all == TRUE) {
+                                               wakeup_all_with_inheritor(secluded_wakeup_event, THREAD_AWAKENED);
+                                       }
+
+                                       while (need_wakeup_secluded-- != 0) {
+                                               /*
+                                                * Wake up one waiter per page we just released.
+                                                */
+                                               wakeup_one_with_inheritor(secluded_wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken);
+                                               thread_deallocate(thread_woken);
+                                       }
+#endif /* CONFIG_SECLUDED_MEMORY */
+
+                                       if (normal_wakeup_all == TRUE) {
+                                               wakeup_all_with_inheritor(normal_wakeup_event, THREAD_AWAKENED);
+                                       }
+
+                                       while (need_wakeup-- != 0) {
+                                               /*
+                                                * Wake up one waiter per page we just released.
+                                                */
+                                               wakeup_one_with_inheritor(normal_wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken);
+                                               thread_deallocate(thread_woken);
+                                       }
+                               } else {
                                        /*
-                                        * Wake up one waiter per page we just released.
+                                        * Non-priority-aware wakeups.
                                         */
-                                       thread_wakeup_one((event_t) &vm_page_free_count);
+
+                                       if (priv_wakeup_all == TRUE) {
+                                               thread_wakeup(priv_wakeup_event);
+                                       }
+
+#if CONFIG_SECLUDED_MEMORY
+                                       if (secluded_wakeup_all == TRUE) {
+                                               thread_wakeup(secluded_wakeup_event);
+                                       }
+
+                                       while (need_wakeup_secluded-- != 0) {
+                                               /*
+                                                * Wake up one waiter per page we just released.
+                                                */
+                                               thread_wakeup_one(secluded_wakeup_event);
+                                       }
+
+#endif /* CONFIG_SECLUDED_MEMORY */
+                                       if (normal_wakeup_all == TRUE) {
+                                               thread_wakeup(normal_wakeup_event);
+                                       }
+
+                                       while (need_wakeup-- != 0) {
+                                               /*
+                                                * Wake up one waiter per page we just released.
+                                                */
+                                               thread_wakeup_one(normal_wakeup_event);
+                                       }
                                }
                        }
 
@@ -4685,6 +4955,7 @@ vm_page_activate(
                                vm_page_queue_enter(&vm_page_queue_secluded, m, vmp_pageq);
                                m->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
                                vm_page_secluded_count++;
+                               VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
                                vm_page_secluded_count_inuse++;
                                assert(!m_object->internal);
 //                             vm_page_pageable_external_count++;
@@ -4760,6 +5031,7 @@ vm_page_speculate(
                         */
                        aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
                        aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
+
                        ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
                } else {
                        aq = &vm_page_queue_speculative[speculative_age_index];
@@ -4785,6 +5057,7 @@ vm_page_speculate(
 
                                aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
                                aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
+
                                ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
                        }
                }
@@ -5128,9 +5401,6 @@ void
 vm_page_zero_fill(
        vm_page_t       m)
 {
-       XPR(XPR_VM_PAGE,
-           "vm_page_zero_fill, object 0x%X offset 0x%X page 0x%X\n",
-           VM_PAGE_OBJECT(m), m->vmp_offset, m, 0, 0);
 #if 0
        /*
         * we don't hold the page queue lock
@@ -5187,11 +5457,6 @@ vm_page_copy(
 
        src_m_object = VM_PAGE_OBJECT(src_m);
 
-       XPR(XPR_VM_PAGE,
-           "vm_page_copy, object 0x%X offset 0x%X to object 0x%X offset 0x%X\n",
-           src_m_object, src_m->vmp_offset,
-           VM_PAGE_OBJECT(dest_m), dest_m->vmp_offset,
-           0);
 #if 0
        /*
         * we don't hold the page queue lock
@@ -6234,7 +6499,13 @@ cpm_allocate(
         * determine need for wakeups
         */
        if (vm_page_free_count < vm_page_free_min) {
-               thread_wakeup((event_t) &vm_page_free_wanted);
+               lck_mtx_lock(&vm_page_queue_free_lock);
+               if (vm_pageout_running == FALSE) {
+                       lck_mtx_unlock(&vm_page_queue_free_lock);
+                       thread_wakeup((event_t) &vm_page_free_wanted);
+               } else {
+                       lck_mtx_unlock(&vm_page_queue_free_lock);
+               }
        }
 
        VM_CHECK_MEMORYSTATUS;
@@ -6298,10 +6569,22 @@ vm_page_do_delayed_work(
        if (!vm_page_trylockspin_queues()) {
                vm_object_unlock(object);
 
+               /*
+                * "Turnstile enabled vm_pageout_scan" can be runnable
+                * for a very long time without getting on a core.
+                * If this is a higher-priority thread, it could end up
+                * waiting here for a very long time, deferring to
+                * pageout_scan, which will want this object again after
+                * VPS does a mutex_pause(0).
+                * So we cap the number of yields in the vm_object_lock_avoid()
+                * case to a single mutex_pause(0), which gives vm_pageout_scan
+                * 10us to run and grab the object if needed.
+                */
                vm_page_lockspin_queues();
 
                for (j = 0;; j++) {
-                       if (!vm_object_lock_avoid(object) &&
+                       if ((!vm_object_lock_avoid(object) ||
+                           (vps_dynamic_priority_enabled && (j > 0))) &&
                            _vm_object_lock_try(object)) {
                                break;
                        }
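
The comment above explains the cap: once the turnstile-based (dynamic-priority) pageout scan is enabled, a high-priority thread must not keep yielding forever out of deference. A sketch of the capped retry loop, treating vm_object_lock_avoid(), _vm_object_lock_try() and mutex_pause() as given primitives and dynamic_priority_enabled as the assumed policy flag:

    extern int  vm_object_lock_avoid(void *object);  /* pageout scan wants it */
    extern int  _vm_object_lock_try(void *object);
    extern void mutex_pause(unsigned int collisions);

    static void
    lock_object_with_capped_yields(void *object, int dynamic_priority_enabled)
    {
            for (unsigned int j = 0;; j++) {
                    /* After one pause (j > 0), ignore the avoid hint when
                     * the dynamic-priority scan is enabled: a single
                     * mutex_pause(0) already gave it ~10us to take the
                     * object. */
                    if ((!vm_object_lock_avoid(object) ||
                        (dynamic_priority_enabled && j > 0)) &&
                        _vm_object_lock_try(object)) {
                            break;
                    }
                    mutex_pause(j);  /* brief yield between attempts */
            }
    }
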
@@ -8394,6 +8677,7 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
        {
                vm_page_queue_remove(&vm_page_queue_secluded, mem, vmp_pageq);
                vm_page_secluded_count--;
+               VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
                if (m_object == VM_OBJECT_NULL) {
                        vm_page_secluded_count_free--;
                        was_pageable = FALSE;
@@ -8628,6 +8912,9 @@ vm_tag_bt(void)
                /* Pull return address from one spot above the frame pointer */
                retaddr = *(frameptr + 1);
 
+#if defined(HAS_APPLE_PAC)
+               retaddr = (uintptr_t) ptrauth_strip((void *)retaddr, ptrauth_key_return_address);
+#endif
 
                if (((retaddr < vm_kernel_builtinkmod_text_end) && (retaddr >= vm_kernel_builtinkmod_text))
                    || (retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top)) {
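
On arm64e the saved return addresses carry PAC signature bits in their high bits, so a frame-pointer walk has to strip the signature before range-checking the address against kernel text. The same step in isolation, using the <ptrauth.h> intrinsics with a fallback so it also compiles where pointer authentication is unavailable:

    #include <stdint.h>

    #ifndef __has_feature
    #define __has_feature(x) 0
    #endif
    #if __has_feature(ptrauth_calls)
    #include <ptrauth.h>
    #endif

    /* Hypothetical text-segment bounds, for illustration only. */
    extern uintptr_t text_start, text_end;

    static int
    return_address_in_text(uintptr_t retaddr)
    {
    #if __has_feature(ptrauth_calls)
            /* Drop the signature bits; range checks need the raw address. */
            retaddr = (uintptr_t)ptrauth_strip((void *)retaddr,
                ptrauth_key_return_address);
    #endif
            return retaddr >= text_start && retaddr < text_end;
    }
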
@@ -8937,6 +9224,7 @@ kern_allocation_update_subtotal(kern_allocation_name_t allocation, uint32_t subt
 
        subidx = 0;
        assert(VM_KERN_MEMORY_NONE != subtag);
+       lck_spin_lock(&vm_allocation_sites_lock);
        for (; subidx < allocation->subtotalscount; subidx++) {
                if (VM_KERN_MEMORY_NONE == allocation->subtotals[subidx].tag) {
                        allocation->subtotals[subidx].tag = subtag;
@@ -8946,6 +9234,7 @@ kern_allocation_update_subtotal(kern_allocation_name_t allocation, uint32_t subt
                        break;
                }
        }
+       lck_spin_unlock(&vm_allocation_sites_lock);
        assert(subidx < allocation->subtotalscount);
        if (subidx >= allocation->subtotalscount) {
                return;
@@ -8957,13 +9246,10 @@ kern_allocation_update_subtotal(kern_allocation_name_t allocation, uint32_t subt
 
        if (delta < 0) {
                assertf(total->total >= ((uint64_t)-delta), "name %p", allocation);
-               OSAddAtomic64(delta, &total->total);
                assertf(other->mapped >= ((uint64_t)-delta), "other %p", other);
-               OSAddAtomic64(delta, &other->mapped);
-       } else {
-               OSAddAtomic64(delta, &other->mapped);
-               OSAddAtomic64(delta, &total->total);
        }
+       OSAddAtomic64(delta, &other->mapped);
+       OSAddAtomic64(delta, &total->total);
 }
 
 const char *
@@ -9215,14 +9501,12 @@ vm_page_diagnose_estimate(void)
        return count;
 }
 
-
 kern_return_t
 vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zones_collectable_bytes)
 {
        uint64_t                 wired_size;
        uint64_t                 wired_managed_size;
        uint64_t                 wired_reserved_size;
-       uint64_t                 booter_size;
        boolean_t                iterate;
        mach_memory_info_t     * counts;
 
@@ -9241,7 +9525,6 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone
 #endif
        wired_managed_size  = ptoa_64(vm_page_wire_count - vm_page_wire_count_initial);
 
-       booter_size = ml_get_booter_memory_size();
        wired_size += booter_size;
 
        assert(num_info >= VM_KERN_COUNTER_COUNT);
@@ -9455,6 +9738,7 @@ start_secluded_suppression(task_t task)
                task->task_suppressed_secluded = TRUE;
                vm_page_secluded_save_target = vm_page_secluded_target;
                vm_page_secluded_target = 0;
+               VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
        }
        lck_spin_unlock(&secluded_suppress_slock);
 }
@@ -9466,6 +9750,7 @@ stop_secluded_suppression(task_t task)
        if (task->task_suppressed_secluded && --vm_page_secluded_suppress_cnt == 0) {
                task->task_suppressed_secluded = FALSE;
                vm_page_secluded_target = vm_page_secluded_save_target;
+               VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
        }
        lck_spin_unlock(&secluded_suppress_slock);
 }
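
Every hunk that changes vm_page_secluded_count or vm_page_secluded_target now also invokes VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(). The macro body is not part of this diff; a plausible expansion, offered purely as an assumption, is that it refreshes a cached "pages over target" figure so readers never compute it racily from the two globals:

    #include <stdint.h>

    /* Globals as named in the hunks above. */
    extern uint32_t vm_page_secluded_count;
    extern uint32_t vm_page_secluded_target;
    uint32_t vm_page_secluded_count_over_target;

    /* Hypothetical expansion -- the real macro body lives outside this diff. */
    #define VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE()                     \
            do {                                                            \
                    if (vm_page_secluded_count > vm_page_secluded_target) { \
                            vm_page_secluded_count_over_target =            \
                                vm_page_secluded_count -                    \
                                vm_page_secluded_target;                    \
                    } else {                                                \
                            vm_page_secluded_count_over_target = 0;         \
                    }                                                       \
            } while (0)
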
index 03624ce0737265ef2f6a1f857922e227aead7c9c..d8befa53c08641435f975281544e278337ffb967 100644 (file)
  */
 #define PROCESS_SHARED_CACHE_LAYOUT 0x00
 
+#if defined(HAS_APPLE_PAC)
+#include <ptrauth.h>
+#endif /* HAS_APPLE_PAC */
 
 /* "dyld" uses this to figure out what the kernel supports */
 int shared_region_version = 3;
@@ -181,10 +184,10 @@ kern_return_t vm_shared_region_slide_mapping(
        memory_object_control_t); /* forward */
 
 static int __commpage_setup = 0;
-#if defined(__i386__) || defined(__x86_64__)
+#if !CONFIG_EMBEDDED
 static int __system_power_source = 1;   /* init to external power source */
 static void post_sys_powersource_internal(int i, int internal);
-#endif /* __i386__ || __x86_64__ */
+#endif
 
 
 /*
@@ -729,7 +732,6 @@ vm_shared_region_create(
                switch (cputype) {
 #if defined(__arm__) || defined(__arm64__)
                case CPU_TYPE_ARM:
-               case CPU_TYPE_ARM64:
                        base_address = SHARED_REGION_BASE_ARM;
                        size = SHARED_REGION_SIZE_ARM;
                        pmap_nesting_start = SHARED_REGION_NESTING_BASE_ARM;
@@ -775,7 +777,7 @@ vm_shared_region_create(
        {
                struct pmap *pmap_nested;
 
-               pmap_nested = pmap_create(NULL, 0, is_64bit);
+               pmap_nested = pmap_create_options(NULL, 0, is_64bit ? PMAP_CREATE_64BIT : 0);
                if (pmap_nested != PMAP_NULL) {
                        pmap_set_nested(pmap_nested);
                        sub_map = vm_map_create(pmap_nested, 0, size, TRUE);
@@ -786,7 +788,7 @@ vm_shared_region_create(
                                vm_map_set_page_shift(sub_map,
                                    SIXTEENK_PAGE_SHIFT);
                        }
-#elif (__ARM_ARCH_7K__ >= 2) && defined(PLATFORM_WatchOS)
+#elif (__ARM_ARCH_7K__ >= 2)
                        /* enforce 16KB alignment for watch targets with new ABI */
                        vm_map_set_page_shift(sub_map, SIXTEENK_PAGE_SHIFT);
 #endif /* __arm64__ */
@@ -796,7 +798,7 @@ vm_shared_region_create(
        }
 #else
        /* create a VM sub map and its pmap */
-       sub_map = vm_map_create(pmap_create(NULL, 0, is_64bit),
+       sub_map = vm_map_create(pmap_create_options(NULL, 0, is_64bit),
            0, size,
            TRUE);
 #endif
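
All three pmap creation sites in this file migrate from pmap_create(), which took a bare is_64bit boolean, to pmap_create_options(), which takes a flags word (PMAP_CREATE_64BIT here). A sketch of the call-site change, with the prototype and flag value assumed from the hunks above:

    #include <stdbool.h>
    #include <stdint.h>

    #define PMAP_CREATE_64BIT 0x1  /* assumed value, for illustration */

    typedef struct pmap *pmap_t;

    /* Assumed shape of the new constructor; `ledger` elided to void *. */
    extern pmap_t pmap_create_options(void *ledger, uint64_t size,
        unsigned int flags);

    static pmap_t
    make_task_pmap(bool is_64bit)
    {
            /* Old: pmap_create(NULL, 0, is_64bit);
             * New: express the request as an option flag, leaving room
             * for additional PMAP_CREATE_* options later. */
            return pmap_create_options(NULL, 0,
                is_64bit ? PMAP_CREATE_64BIT : 0);
    }
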
@@ -848,6 +850,9 @@ vm_shared_region_create(
        si->start = 0;
        si->end = 0;
        si->slide = 0;
+#if defined(HAS_APPLE_PAC)
+       si->si_ptrauth = FALSE; /* no pointer authentication by default */
+#endif /* HAS_APPLE_PAC */
        si->slide_object = NULL;
        si->slide_info_size = 0;
        si->slide_info_entry = NULL;
@@ -1153,6 +1158,7 @@ vm_shared_region_map_file(
        vm_map_kernel_flags_t   vmk_flags;
        mach_vm_offset_t        sfm_min_address = ~0;
        mach_vm_offset_t        sfm_max_address = 0;
+       mach_vm_offset_t        sfm_end;
        struct _dyld_cache_header sr_cache_header;
 
 #if __arm64__
@@ -1234,8 +1240,17 @@ vm_shared_region_map_file(
                        sfm_min_address = mappings[i].sfm_address;
                }
 
-               if ((mappings[i].sfm_address + mappings[i].sfm_size) > sfm_max_address) {
-                       sfm_max_address = mappings[i].sfm_address + mappings[i].sfm_size;
+               if (os_add_overflow(mappings[i].sfm_address,
+                   mappings[i].sfm_size,
+                   &sfm_end) ||
+                   (vm_map_round_page(sfm_end, VM_MAP_PAGE_MASK(sr_map)) <
+                   mappings[i].sfm_address)) {
+                       /* overflow */
+                       kr = KERN_INVALID_ARGUMENT;
+                       break;
+               }
+               if (sfm_end > sfm_max_address) {
+                       sfm_max_address = sfm_end;
                }
 
                if (mappings[i].sfm_init_prot & VM_PROT_ZF) {
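
The mapping loop no longer trusts sfm_address + sfm_size: the sum is computed with os_add_overflow(), and the page-rounded end must not wrap back below the start. The same guard in isolation, using the compiler's checked-add builtin that os_add_overflow() wraps:

    #include <stdbool.h>
    #include <stdint.h>

    /* Returns 0 and sets *end on success; -1 if address+size (or its
     * page rounding) wraps around the 64-bit address space. */
    static int
    mapping_end(uint64_t address, uint64_t size, uint64_t page_mask,
        uint64_t *end)
    {
            uint64_t e;

            if (__builtin_add_overflow(address, size, &e)) {
                    return -1;  /* raw end overflows */
            }
            /* Rounding the end up to a page boundary can also wrap;
             * a rounded end below the start betrays that. */
            if (((e + page_mask) & ~page_mask) < address) {
                    return -1;
            }
            *end = e;
            return 0;
    }
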
@@ -1274,6 +1289,8 @@ vm_shared_region_map_file(
 
                vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
                vmk_flags.vmkf_already = TRUE;
+               /* no copy-on-read for mapped binaries */
+               vmk_flags.vmkf_no_copy_on_read = 1;
 
                /* establish that mapping, OK if it's "already" there */
                if (map_port == MACH_PORT_NULL) {
@@ -1335,6 +1352,12 @@ vm_shared_region_map_file(
                                first_mapping = target_address;
                        }
 
+#if defined(HAS_APPLE_PAC)
+                       /*
+                        * Set "sr_slid_mapping";
+                        * it is used to get the userland address for address authentication.
+                        */
+#endif
                        if ((slid_mapping == (mach_vm_offset_t) -1) &&
                            (mapping_to_slide == &mappings[i])) {
                                slid_mapping = target_address;
@@ -1385,26 +1408,30 @@ vm_shared_region_map_file(
                                mappings[i].sfm_size = 0;
                                kr = KERN_SUCCESS;
                        } else {
-                               /* this mapping failed ! */
-                               SHARED_REGION_TRACE_ERROR(
-                                       ("shared_region: mapping[%d]: "
-                                       "address:0x%016llx size:0x%016llx "
-                                       "offset:0x%016llx "
-                                       "maxprot:0x%x prot:0x%x failed 0x%x\n",
-                                       i,
-                                       (long long)mappings[i].sfm_address,
-                                       (long long)mappings[i].sfm_size,
-                                       (long long)mappings[i].sfm_file_offset,
-                                       mappings[i].sfm_max_prot,
-                                       mappings[i].sfm_init_prot,
-                                       kr));
-
-                               vm_shared_region_undo_mappings(sr_map, sr_base_address, mappings, i);
                                break;
                        }
                }
        }
 
+       if (kr != KERN_SUCCESS) {
+               /* the last mapping we tried (mappings[i]) failed ! */
+               assert(i < mappings_count);
+               SHARED_REGION_TRACE_ERROR(
+                       ("shared_region: mapping[%d]: "
+                       "address:0x%016llx size:0x%016llx "
+                       "offset:0x%016llx "
+                       "maxprot:0x%x prot:0x%x failed 0x%x\n",
+                       i,
+                       (long long)mappings[i].sfm_address,
+                       (long long)mappings[i].sfm_size,
+                       (long long)mappings[i].sfm_file_offset,
+                       mappings[i].sfm_max_prot,
+                       mappings[i].sfm_init_prot,
+                       kr));
+               /* undo all the previous mappings */
+               vm_shared_region_undo_mappings(sr_map, sr_base_address, mappings, i);
+       }
+
        if (kr == KERN_SUCCESS &&
            slide_size != 0 &&
            mapping_to_slide != NULL) {
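
The failure trace and vm_shared_region_undo_mappings() call move out of the loop body: every early break now funnels through a single block that logs the failing entry and unwinds the mappings established so far. The control-flow shape, with hypothetical map_one() and undo_first_n() helpers:

    #include <stdio.h>

    extern int  map_one(int i);       /* hypothetical: 0 on success */
    extern void undo_first_n(int n);  /* hypothetical: unwind [0, n) */

    static int
    map_all(int count)
    {
            int i, kr = 0;

            for (i = 0; i < count; i++) {
                    kr = map_one(i);
                    if (kr != 0) {
                            break;  /* handled once, below */
                    }
            }
            if (kr != 0) {
                    /* report the entry that failed... */
                    fprintf(stderr, "mapping[%d] failed: %d\n", i, kr);
                    /* ...and make sure earlier mappings don't leak */
                    undo_first_n(i);
            }
            return kr;
    }
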
@@ -1694,25 +1721,29 @@ vm_shared_region_enter(
        /*
         * We may need to map several pmap-nested portions, due to platform
         * specific restrictions on pmap nesting.
-        * The pmap-nesting is triggered by the "VM_MEMORY_SHARED_PMAP" alias...
+        * The pmap-nesting is triggered by the "vmkf_nested_pmap" flag...
         */
        for (;
            sr_pmap_nesting_size > 0;
            sr_offset += mapping_size,
            sr_size -= mapping_size,
            sr_pmap_nesting_size -= mapping_size) {
+               vm_map_kernel_flags_t vmk_flags;
+
                target_address = sr_address + sr_offset;
                mapping_size = sr_pmap_nesting_size;
                if (mapping_size > pmap_nesting_size_max) {
                        mapping_size = (vm_map_offset_t) pmap_nesting_size_max;
                }
+               vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
+               vmk_flags.vmkf_nested_pmap = TRUE;
                kr = vm_map_enter_mem_object(
                        map,
                        &target_address,
                        mapping_size,
                        0,
                        VM_FLAGS_FIXED,
-                       VM_MAP_KERNEL_FLAGS_NONE,
+                       vmk_flags,
                        VM_MEMORY_SHARED_PMAP,
                        sr_handle,
                        sr_offset,
@@ -1911,6 +1942,13 @@ vm_shared_region_slide_mapping(
        si->start = start;
        si->end = si->start + size;
        si->slide = slide;
+#if defined(HAS_APPLE_PAC)
+       if (sr->sr_cpu_type == CPU_TYPE_ARM64 &&
+           sr->sr_cpu_subtype == CPU_SUBTYPE_ARM64E) {
+               /* arm64e has pointer authentication */
+               si->si_ptrauth = TRUE;
+       }
+#endif /* HAS_APPLE_PAC */
 
        /* find the shared region's map entry to slide */
        sr_map = vm_shared_region_vm_map(sr);
@@ -2465,6 +2503,11 @@ vm_shared_region_slide_page_v3(vm_shared_region_slide_info_t si, vm_offset_t vad
                        return KERN_FAILURE;
                }
 
+#if defined(HAS_APPLE_PAC)
+               uint16_t diversity_data = (uint16_t)(value >> 32);
+               bool hasAddressDiversity = (value & (1ULL << 48)) != 0;
+               ptrauth_key key = (ptrauth_key)((value >> 49) & 0x3);
+#endif /* HAS_APPLE_PAC */
                bool isAuthenticated = (value & (1ULL << 63)) != 0;
 
                if (isAuthenticated) {
@@ -2474,6 +2517,23 @@ vm_shared_region_slide_page_v3(vm_shared_region_slide_info_t si, vm_offset_t vad
                        const uint64_t value_add = s_info->value_add;
                        value += value_add;
 
+#if defined(HAS_APPLE_PAC)
+                       uint64_t discriminator = diversity_data;
+                       if (hasAddressDiversity) {
+                               // First calculate a new discriminator using the address of where we are trying to store the value
+                               uintptr_t pageOffset = rebaseLocation - page_content;
+                               discriminator = __builtin_ptrauth_blend_discriminator((void*)(((uintptr_t)uservaddr) + pageOffset), discriminator);
+                       }
+
+                       if (si->si_ptrauth &&
+                           !(BootArgs->bootFlags & kBootFlagsDisableUserJOP)) {
+                               /*
+                                * these pointers are used in user mode. disable the kernel key diversification
+                                * so we can sign them for use in user mode.
+                                */
+                               value = (uintptr_t)pmap_sign_user_ptr((void *)value, key, discriminator);
+                       }
+#endif /* HAS_APPLE_PAC */
                } else {
                        // The new value for a rebase is the low 51-bits of the threaded value plus the slide.
                        // Regular pointer which needs to fit in 51-bits of value.
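
The v3 slide format packs the PAC metadata into the threaded 64-bit value itself: bits 32-47 carry the diversity data, bit 48 selects address diversity, bits 49-50 select the key, and bit 63 marks the pointer as authenticated. A decoder for just that layout (the signing step, pmap_sign_user_ptr(), is kernel-internal and not sketched here):

    #include <stdbool.h>
    #include <stdint.h>

    struct pac_rebase {
            uint16_t diversity;       /* bits 32..47 */
            bool     addr_diversity;  /* bit 48 */
            unsigned key;             /* bits 49..50 (IA/IB/DA/DB) */
            bool     authenticated;   /* bit 63 */
    };

    static struct pac_rebase
    decode_pac_rebase(uint64_t value)
    {
            struct pac_rebase r;

            r.diversity      = (uint16_t)(value >> 32);
            r.addr_diversity = (value & (1ULL << 48)) != 0;
            r.key            = (unsigned)((value >> 49) & 0x3);
            r.authenticated  = (value & (1ULL << 63)) != 0;
            return r;
    }

When addr_diversity is set, the hunk blends the diversity data with the pointer's user virtual address via __builtin_ptrauth_blend_discriminator() before signing, so the same value stored at two different addresses yields two different signatures.
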
@@ -2658,7 +2718,7 @@ _vm_commpage_init(
        if (kr != KERN_SUCCESS) {
                panic("_vm_commpage_init: could not allocate mem_entry");
        }
-       new_map = vm_map_create(pmap_create(NULL, 0, 0), 0, size, TRUE);
+       new_map = vm_map_create(pmap_create_options(NULL, 0, 0), 0, size, PMAP_CREATE_64BIT);
        if (new_map == VM_MAP_NULL) {
                panic("_vm_commpage_init: could not allocate VM map");
        }
@@ -2736,11 +2796,11 @@ vm_commpage_init(void)
        /* populate them according to this specific platform */
        commpage_populate();
        __commpage_setup = 1;
-#if defined(__i386__) || defined(__x86_64__)
+#if !CONFIG_EMBEDDED
        if (__system_power_source == 0) {
                post_sys_powersource_internal(0, 1);
        }
-#endif /* __i386__ || __x86_64__ */
+#endif
 
        SHARED_REGION_TRACE_DEBUG(
                ("commpage: init() <-\n"));
@@ -2812,6 +2872,7 @@ vm_commpage_enter(
            (commpage_size & (pmap_nesting_size_min - 1)) == 0) {
                /* the commpage is properly aligned or sized for pmap-nesting */
                tag = VM_MEMORY_SHARED_PMAP;
+               vmk_flags.vmkf_nested_pmap = TRUE;
        }
        /* map the comm page in the task's address space */
        assert(commpage_handle != IPC_PORT_NULL);
@@ -3030,19 +3091,19 @@ done:
  * 1 if it is internal power source, i.e. battery
  */
 void
-#if defined(__i386__) || defined(__x86_64__)
+#if !CONFIG_EMBEDDED
 post_sys_powersource(int i)
 #else
 post_sys_powersource(__unused int i)
 #endif
 {
-#if defined(__i386__) || defined(__x86_64__)
+#if !CONFIG_EMBEDDED
        post_sys_powersource_internal(i, 0);
-#endif /* __i386__ || __x86_64__ */
+#endif
 }
 
 
-#if defined(__i386__) || defined(__x86_64__)
+#if !CONFIG_EMBEDDED
 static void
 post_sys_powersource_internal(int i, int internal)
 {
@@ -3058,4 +3119,4 @@ post_sys_powersource_internal(int i, int internal)
                }
        }
 }
-#endif /* __i386__ || __x86_64__ */
+#endif
index bfe7f518baf8fc6182f85e5e2a186d48b7b890c8..95fe9fa544e1d2e64a356c5a11f5fe1371e42704 100644 (file)
@@ -187,6 +187,9 @@ struct vm_shared_region_slide_info {
        mach_vm_offset_t        start;
        mach_vm_offset_t        end;
        uint32_t                slide;
+#if defined(HAS_APPLE_PAC)
+       boolean_t               si_ptrauth;
+#endif /* HAS_APPLE_PAC */
        vm_object_t             slide_object;
        mach_vm_size_t          slide_info_size;
        vm_shared_region_slide_info_entry_t     slide_info_entry;
index a4d1fc46f6c62321cfc972f6b8a93751d2dc6b9e..35cd0e817823e4881b8c5cde9f3e05c6c065672f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -118,19 +118,19 @@ kern_return_t shared_region_pager_last_unmap(memory_object_t mem_obj);
  * These routines are invoked by VM via the memory_object_*() interfaces.
  */
 const struct memory_object_pager_ops shared_region_pager_ops = {
-       shared_region_pager_reference,
-       shared_region_pager_deallocate,
-       shared_region_pager_init,
-       shared_region_pager_terminate,
-       shared_region_pager_data_request,
-       shared_region_pager_data_return,
-       shared_region_pager_data_initialize,
-       shared_region_pager_data_unlock,
-       shared_region_pager_synchronize,
-       shared_region_pager_map,
-       shared_region_pager_last_unmap,
-       NULL, /* data_reclaim */
-       "shared_region"
+       .memory_object_reference = shared_region_pager_reference,
+       .memory_object_deallocate = shared_region_pager_deallocate,
+       .memory_object_init = shared_region_pager_init,
+       .memory_object_terminate = shared_region_pager_terminate,
+       .memory_object_data_request = shared_region_pager_data_request,
+       .memory_object_data_return = shared_region_pager_data_return,
+       .memory_object_data_initialize = shared_region_pager_data_initialize,
+       .memory_object_data_unlock = shared_region_pager_data_unlock,
+       .memory_object_synchronize = shared_region_pager_synchronize,
+       .memory_object_map = shared_region_pager_map,
+       .memory_object_last_unmap = shared_region_pager_last_unmap,
+       .memory_object_data_reclaim = NULL,
+       .memory_object_pager_name = "shared_region"
 };
 
 /*
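
Both pager-ops tables in this commit switch from positional to designated initializers, which keeps each slot self-describing and immune to struct field reordering. A reduced example of the idiom:

    struct ops {
            void        (*reference)(void *obj);
            void        (*deallocate)(void *obj);
            const char  *name;
    };

    static void obj_reference(void *obj)  { (void)obj; }
    static void obj_deallocate(void *obj) { (void)obj; }

    /* The positional form breaks silently if fields are ever reordered
     * or inserted; the designated form keeps binding by name. */
    static const struct ops example_ops = {
            .reference  = obj_reference,
            .deallocate = obj_deallocate,
            .name       = "example",
    };
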
@@ -159,7 +159,7 @@ typedef struct shared_region_pager {
 int shared_region_pager_count = 0;              /* number of pagers */
 int shared_region_pager_count_mapped = 0;       /* number of mapped pagers */
 queue_head_t shared_region_pager_queue;
-decl_lck_mtx_data(, shared_region_pager_lock)
+decl_lck_mtx_data(, shared_region_pager_lock);
 
 /*
  * Maximum number of unmapped pagers we're willing to keep around.
@@ -513,24 +513,12 @@ retry_src_fault:
                dst_pnum = (ppnum_t)
                    upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE));
                assert(dst_pnum != 0);
-#if __x86_64__
-               src_vaddr = (vm_map_offset_t)
-                   PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page)
-                       << PAGE_SHIFT);
-               dst_vaddr = (vm_map_offset_t)
-                   PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
 
-#elif __arm__ || __arm64__
                src_vaddr = (vm_map_offset_t)
                    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page)
                        << PAGE_SHIFT);
                dst_vaddr = (vm_map_offset_t)
                    phystokv((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
-#else
-#error "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
-               src_vaddr = 0;
-               dst_vaddr = 0;
-#endif
                src_page_object = VM_PAGE_OBJECT(src_page);
 
                /*
@@ -983,6 +971,7 @@ shared_region_pager_create(
        shared_region_pager_t   pager;
        memory_object_control_t control;
        kern_return_t           kr;
+       vm_object_t             object;
 
        pager = (shared_region_pager_t) kalloc(sizeof(*pager));
        if (pager == SHARED_REGION_PAGER_NULL) {
@@ -1027,9 +1016,19 @@ shared_region_pager_create(
            &control);
        assert(kr == KERN_SUCCESS);
 
+       memory_object_mark_trusted(control);
+
        lck_mtx_lock(&shared_region_pager_lock);
        /* the new pager is now ready to be used */
        pager->is_ready = TRUE;
+       object = memory_object_to_vm_object((memory_object_t) pager);
+       assert(object);
+       /*
+        * No one else knows about this object yet, so we get away without the object lock.
+        * This object is _eventually_ backed by the dyld shared cache and so we want
+        * to benefit from the lock priority boosting.
+        */
+       object->object_is_shared_cache = TRUE;
        lck_mtx_unlock(&shared_region_pager_lock);
 
        /* wakeup anyone waiting for this pager to be ready */
index a8b27af225c93daa77162bc9c243e1f3b8ccb332..8ebc2d3e1e1dc24d4b34ad677a86c7beaedada98 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -33,8 +33,8 @@
 #include <kern/ipc_kobject.h>
 #include <kern/kalloc.h>
 #include <kern/queue.h>
-#include <os/refcnt.h>
 
+#include <vm/memory_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
@@ -115,19 +115,19 @@ kern_return_t swapfile_pager_last_unmap(memory_object_t mem_obj);
  * These routines are invoked by VM via the memory_object_*() interfaces.
  */
 const struct memory_object_pager_ops swapfile_pager_ops = {
-       swapfile_pager_reference,
-       swapfile_pager_deallocate,
-       swapfile_pager_init,
-       swapfile_pager_terminate,
-       swapfile_pager_data_request,
-       swapfile_pager_data_return,
-       swapfile_pager_data_initialize,
-       swapfile_pager_data_unlock,
-       swapfile_pager_synchronize,
-       swapfile_pager_map,
-       swapfile_pager_last_unmap,
-       NULL, /* data_reclaim */
-       "swapfile pager"
+       .memory_object_reference = swapfile_pager_reference,
+       .memory_object_deallocate = swapfile_pager_deallocate,
+       .memory_object_init = swapfile_pager_init,
+       .memory_object_terminate = swapfile_pager_terminate,
+       .memory_object_data_request = swapfile_pager_data_request,
+       .memory_object_data_return = swapfile_pager_data_return,
+       .memory_object_data_initialize = swapfile_pager_data_initialize,
+       .memory_object_data_unlock = swapfile_pager_data_unlock,
+       .memory_object_synchronize = swapfile_pager_synchronize,
+       .memory_object_map = swapfile_pager_map,
+       .memory_object_last_unmap = swapfile_pager_last_unmap,
+       .memory_object_data_reclaim = NULL,
+       .memory_object_pager_name = "swapfile pager"
 };
 
 /*
@@ -140,7 +140,7 @@ typedef struct swapfile_pager {
 
        /* pager-specific data */
        queue_chain_t           pager_queue;    /* next & prev pagers */
-       struct os_refcnt        ref_count;      /* reference count */
+       unsigned int            ref_count;      /* reference count */
        boolean_t               is_ready;       /* is this pager ready ? */
        boolean_t               is_mapped;      /* is this pager mapped ? */
        struct vnode            *swapfile_vnode;/* the swapfile's vnode */
@@ -153,7 +153,7 @@ typedef struct swapfile_pager {
  */
 int swapfile_pager_count = 0;           /* number of pagers */
 queue_head_t swapfile_pager_queue;
-decl_lck_mtx_data(, swapfile_pager_lock)
+decl_lck_mtx_data(, swapfile_pager_lock);
 
 /*
  * Statistics & counters.
@@ -334,7 +334,7 @@ swapfile_pager_data_request(
 
        pager = swapfile_pager_lookup(mem_obj);
        assert(pager->is_ready);
-       assert(os_ref_get_count(&pager->ref_count) > 1); /* pager is alive and mapped */
+       assert(pager->ref_count > 1); /* pager is alive and mapped */
 
        PAGER_DEBUG(PAGER_PAGEIN, ("swapfile_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager));
 
@@ -493,7 +493,8 @@ swapfile_pager_reference(
        pager = swapfile_pager_lookup(mem_obj);
 
        lck_mtx_lock(&swapfile_pager_lock);
-       os_ref_retain_locked(&pager->ref_count);
+       assert(pager->ref_count > 0);
+       pager->ref_count++;
        lck_mtx_unlock(&swapfile_pager_lock);
 }
 
@@ -567,9 +568,9 @@ swapfile_pager_deallocate_internal(
        }
 
        /* drop a reference on this pager */
-       os_ref_count_t refcount = os_ref_release_locked(&pager->ref_count);
+       pager->ref_count--;
 
-       if (refcount == 1) {
+       if (pager->ref_count == 1) {
                /*
                 * Only the "named" reference is left, which means that
                 * no one is really holding on to this pager anymore.
@@ -579,7 +580,7 @@ swapfile_pager_deallocate_internal(
                /* the pager is all ours: no need for the lock now */
                lck_mtx_unlock(&swapfile_pager_lock);
                swapfile_pager_terminate_internal(pager);
-       } else if (refcount == 0) {
+       } else if (pager->ref_count == 0) {
                /*
                 * Dropped the existence reference;  the memory object has
                 * been terminated.  Do some final cleanup and release the
@@ -667,7 +668,7 @@ swapfile_pager_map(
 
        lck_mtx_lock(&swapfile_pager_lock);
        assert(pager->is_ready);
-       assert(os_ref_get_count(&pager->ref_count) > 0); /* pager is alive */
+       assert(pager->ref_count > 0); /* pager is alive */
        if (pager->is_mapped == FALSE) {
                /*
                 * First mapping of this pager:  take an extra reference
@@ -675,7 +676,7 @@ swapfile_pager_map(
                 * are removed.
                 */
                pager->is_mapped = TRUE;
-               os_ref_retain_locked(&pager->ref_count);
+               pager->ref_count++;
        }
        lck_mtx_unlock(&swapfile_pager_lock);
 
@@ -726,7 +727,7 @@ swapfile_pager_lookup(
 
        assert(mem_obj->mo_pager_ops == &swapfile_pager_ops);
        __IGNORE_WCASTALIGN(pager = (swapfile_pager_t) mem_obj);
-       assert(os_ref_get_count(&pager->ref_count) > 0);
+       assert(pager->ref_count > 0);
        return pager;
 }
 
@@ -755,7 +756,7 @@ swapfile_pager_create(
        pager->swp_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL;
 
        pager->is_ready = FALSE;/* not ready until it has a "name" */
-       os_ref_init(&pager->ref_count, NULL); /* setup reference */
+       pager->ref_count = 1;   /* setup reference */
        pager->is_mapped = FALSE;
        pager->swapfile_vnode = vp;
 
@@ -772,7 +773,7 @@ swapfile_pager_create(
        if (!queue_end(&swapfile_pager_queue,
            (queue_entry_t) pager2)) {
                /* while we hold the lock, transfer our setup ref to winner */
-               os_ref_retain_locked(&pager2->ref_count);
+               pager2->ref_count++;
                /* we lost the race, down with the loser... */
                lck_mtx_unlock(&swapfile_pager_lock);
                pager->swapfile_vnode = NULL;
@@ -799,6 +800,8 @@ swapfile_pager_create(
            &control);
        assert(kr == KERN_SUCCESS);
 
+       memory_object_mark_trusted(control);
+
        lck_mtx_lock(&swapfile_pager_lock);
        /* the new pager is now ready to be used */
        pager->is_ready = TRUE;
@@ -839,7 +842,7 @@ swapfile_pager_setup(
                pager = SWAPFILE_PAGER_NULL;
        } else {
                /* make sure pager doesn't disappear */
-               os_ref_retain_locked(&pager->ref_count);
+               pager->ref_count++;
        }
 
        lck_mtx_unlock(&swapfile_pager_lock);
index 92df956137d494c0269c3522a46d87a4f26d0a27..ab106cb5a0f5f7f4f8fcf8b0f0641cbd7da5ad49 100644 (file)
 #include <san/kasan.h>
 
 #include <libkern/OSDebug.h>
+#include <IOKit/IOBSD.h>
 
 vm_size_t        upl_offset_to_pagelist = 0;
 
@@ -2286,6 +2287,8 @@ mach_make_memory_entry_64(
        ipc_port_t              *object_handle,
        ipc_port_t              parent_handle)
 {
+       vm_named_entry_kernel_flags_t   vmne_kflags;
+
        if ((permission & MAP_MEM_FLAGS_MASK) & ~MAP_MEM_FLAGS_USER) {
                /*
                 * Unknown flag: reject for forward compatibility.
@@ -2293,10 +2296,15 @@ mach_make_memory_entry_64(
                return KERN_INVALID_VALUE;
        }
 
+       vmne_kflags = VM_NAMED_ENTRY_KERNEL_FLAGS_NONE;
+       if (permission & MAP_MEM_LEDGER_TAGGED) {
+               vmne_kflags.vmnekf_ledger_tag = VM_LEDGER_TAG_DEFAULT;
+       }
        return mach_make_memory_entry_internal(target_map,
                   size,
                   offset,
                   permission,
+                  vmne_kflags,
                   object_handle,
                   parent_handle);
 }
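
With MAP_MEM_LEDGER_TAGGED set in the permission word, the wrapper now pre-fills vmnekf_ledger_tag with VM_LEDGER_TAG_DEFAULT, so the resulting object is billed to the caller's ledgers. A hedged userspace-style sketch of requesting such an entry; the MAP_MEM_* flag names are taken from this diff, and their visibility in any given SDK is an assumption:

    #include <mach/mach.h>

    static kern_return_t
    make_ledger_tagged_entry(memory_object_size_t *size, mach_port_t *entry)
    {
            /* MAP_MEM_NAMED_CREATE: allocate fresh memory for the entry;
             * MAP_MEM_LEDGER_TAGGED: bill it to this task's ledgers. */
            return mach_make_memory_entry_64(
                    mach_task_self(),      /* target map */
                    size,                  /* in/out: rounded size */
                    0,                     /* offset */
                    MAP_MEM_NAMED_CREATE | MAP_MEM_LEDGER_TAGGED |
                    VM_PROT_READ | VM_PROT_WRITE,
                    entry,                 /* out: named entry port */
                    MACH_PORT_NULL);       /* no parent entry */
    }
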
@@ -2305,8 +2313,9 @@ kern_return_t
 mach_make_memory_entry_internal(
        vm_map_t                target_map,
        memory_object_size_t    *size,
-       memory_object_offset_t offset,
+       memory_object_offset_t  offset,
        vm_prot_t               permission,
+       vm_named_entry_kernel_flags_t   vmne_kflags,
        ipc_port_t              *object_handle,
        ipc_port_t              parent_handle)
 {
@@ -2423,6 +2432,9 @@ mach_make_memory_entry_internal(
                }
                return KERN_SUCCESS;
        } else if (permission & MAP_MEM_NAMED_CREATE) {
+               int     ledger_flags = 0;
+               task_t  owner;
+
                map_end = vm_map_round_page(offset + *size, PAGE_MASK);
                map_size = map_end - map_start;
 
@@ -2451,48 +2463,78 @@ mach_make_memory_entry_internal(
                object = vm_object_allocate(map_size);
                assert(object != VM_OBJECT_NULL);
 
-               if (permission & MAP_MEM_PURGABLE) {
-                       task_t owner;
+               /*
+                * XXX
+                * We use this path when we want to make sure that
+                * nobody messes with the object (coalesce, for
+                * example) before we map it.
+                * We might want to use these objects for transposition via
+                * vm_object_transpose() too, so we don't want any copy or
+                * shadow objects either...
+                */
+               object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
+               object->true_share = TRUE;
 
-                       if (!(permission & VM_PROT_WRITE)) {
-                               /* if we can't write, we can't purge */
-                               vm_object_deallocate(object);
-                               kr = KERN_INVALID_ARGUMENT;
-                               goto make_mem_done;
-                       }
-                       object->purgable = VM_PURGABLE_NONVOLATILE;
-                       if (permission & MAP_MEM_PURGABLE_KERNEL_ONLY) {
-                               object->purgeable_only_by_kernel = TRUE;
-                       }
+               owner = current_task();
+               if ((permission & MAP_MEM_PURGABLE) ||
+                   vmne_kflags.vmnekf_ledger_tag) {
                        assert(object->vo_owner == NULL);
                        assert(object->resident_page_count == 0);
                        assert(object->wired_page_count == 0);
-                       vm_object_lock(object);
-                       owner = current_task();
-#if __arm64__
-                       if (owner->task_legacy_footprint) {
-                               /*
-                                * For ios11, we failed to account for
-                                * this memory.  Keep doing that for
-                                * legacy apps (built before ios12),
-                                * for backwards compatibility's sake...
-                                */
-                               owner = kernel_task;
+                       assert(owner != TASK_NULL);
+                       if (vmne_kflags.vmnekf_ledger_no_footprint) {
+                               ledger_flags |= VM_LEDGER_FLAG_NO_FOOTPRINT;
+                               object->vo_no_footprint = TRUE;
                        }
+                       if (permission & MAP_MEM_PURGABLE) {
+                               if (!(permission & VM_PROT_WRITE)) {
+                                       /* if we can't write, we can't purge */
+                                       vm_object_deallocate(object);
+                                       kr = KERN_INVALID_ARGUMENT;
+                                       goto make_mem_done;
+                               }
+                               object->purgable = VM_PURGABLE_NONVOLATILE;
+                               if (permission & MAP_MEM_PURGABLE_KERNEL_ONLY) {
+                                       object->purgeable_only_by_kernel = TRUE;
+                               }
+#if __arm64__
+                               if (owner->task_legacy_footprint) {
+                                       /*
+                                        * For ios11, we failed to account for
+                                        * this memory.  Keep doing that for
+                                        * legacy apps (built before ios12),
+                                        * for backwards compatibility's sake...
+                                        */
+                                       owner = kernel_task;
+                               }
 #endif /* __arm64__ */
-                       vm_purgeable_nonvolatile_enqueue(object, owner);
-                       vm_object_unlock(object);
+                               vm_object_lock(object);
+                               vm_purgeable_nonvolatile_enqueue(object, owner);
+                               vm_object_unlock(object);
+                       }
                }
 
-               if (permission & MAP_MEM_LEDGER_TAG_NETWORK) {
-                       /* make this object owned by the calling task */
+               if (vmne_kflags.vmnekf_ledger_tag) {
+                       /*
+                        * Bill this object to the current task's
+                        * ledgers for the given tag.
+                        */
+                       if (vmne_kflags.vmnekf_ledger_no_footprint) {
+                               ledger_flags |= VM_LEDGER_FLAG_NO_FOOTPRINT;
+                       }
                        vm_object_lock(object);
-                       vm_object_ownership_change(
+                       object->vo_ledger_tag = vmne_kflags.vmnekf_ledger_tag;
+                       kr = vm_object_ownership_change(
                                object,
-                               VM_OBJECT_LEDGER_TAG_NETWORK,
-                               current_task(), /* new owner */
+                               vmne_kflags.vmnekf_ledger_tag,
+                               owner, /* new owner */
+                               ledger_flags,
                                FALSE); /* task_objq locked? */
                        vm_object_unlock(object);
+                       if (kr != KERN_SUCCESS) {
+                               vm_object_deallocate(object);
+                               goto make_mem_done;
+                       }
                }
 
 #if CONFIG_SECLUDED_MEMORY
@@ -2527,18 +2569,6 @@ mach_make_memory_entry_internal(
 
                /* the object has no pages, so no WIMG bits to update here */
 
-               /*
-                * XXX
-                * We use this path when we want to make sure that
-                * nobody messes with the object (coalesce, for
-                * example) before we map it.
-                * We might want to use these objects for transposition via
-                * vm_object_transpose() too, so we don't want any copy or
-                * shadow objects either...
-                */
-               object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
-               object->true_share = TRUE;
-
                user_entry->backing.object = object;
                user_entry->internal = TRUE;
                user_entry->is_sub_map = FALSE;
@@ -3297,10 +3327,11 @@ redo_lookup:
                }
 
                if (parent_entry->is_sub_map) {
-                       user_entry->backing.map = parent_entry->backing.map;
-                       vm_map_lock(user_entry->backing.map);
-                       user_entry->backing.map->map_refcnt++;
-                       vm_map_unlock(user_entry->backing.map);
+                       vm_map_t map = parent_entry->backing.map;
+                       user_entry->backing.map = map;
+                       lck_mtx_lock(&map->s_lock);
+                       os_ref_retain_locked(&map->map_refcnt);
+                       lck_mtx_unlock(&map->s_lock);
                } else {
                        object = parent_entry->backing.object;
                        assert(object != VM_OBJECT_NULL);
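
The submap path now takes its reference with the checked os_refcnt API (os_ref_retain_locked() under the map's s_lock) instead of incrementing a bare map_refcnt under the full map lock. A stand-in capturing what the checked API adds over a plain integer, namely underflow and resurrection assertions:

    #include <assert.h>

    /* Minimal model of os_refcnt's "_locked" operations: the caller
     * must hold the lock that protects the counter. */
    struct refcnt { unsigned count; };

    static void
    ref_retain_locked(struct refcnt *rc)
    {
            assert(rc->count > 0);  /* retaining a dead object is a bug */
            rc->count++;
    }

    static unsigned
    ref_release_locked(struct refcnt *rc)
    {
            assert(rc->count > 0);  /* over-release is a bug */
            return --rc->count;     /* 0 => caller may free the object */
    }
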
@@ -3455,7 +3486,6 @@ mach_memory_entry_allocate(
 {
        vm_named_entry_t        user_entry;
        ipc_port_t              user_handle;
-       ipc_port_t              previous;
 
        user_entry = (vm_named_entry_t) kalloc(sizeof *user_entry);
        if (user_entry == NULL) {
@@ -3465,25 +3495,6 @@ mach_memory_entry_allocate(
 
        named_entry_lock_init(user_entry);
 
-       user_handle = ipc_port_alloc_kernel();
-       if (user_handle == IP_NULL) {
-               kfree(user_entry, sizeof *user_entry);
-               return KERN_FAILURE;
-       }
-       ip_lock(user_handle);
-
-       /* make a sonce right */
-       user_handle->ip_sorights++;
-       ip_reference(user_handle);
-
-       /* make a send right */
-       user_handle->ip_mscount++;
-       user_handle->ip_srights++;
-       ip_reference(user_handle);
-
-       ipc_port_nsrequest(user_handle, 1, user_handle, &previous);
-       /* nsrequest unlocks user_handle */
-
        user_entry->backing.object = NULL;
        user_entry->is_sub_map = FALSE;
        user_entry->is_copy = FALSE;
@@ -3494,8 +3505,9 @@ mach_memory_entry_allocate(
        user_entry->protection = VM_PROT_NONE;
        user_entry->ref_count = 1;
 
-       ipc_kobject_set(user_handle, (ipc_kobject_t) user_entry,
-           IKOT_NAMED_ENTRY);
+       user_handle = ipc_kobject_alloc_port((ipc_kobject_t)user_entry,
+           IKOT_NAMED_ENTRY,
+           IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
 
        *user_entry_p = user_entry;
        *user_handle_p = user_handle;
@@ -3731,6 +3743,88 @@ memory_entry_access_tracking_internal(
        return kr;
 }
 
+kern_return_t
+mach_memory_entry_ownership(
+       ipc_port_t      entry_port,
+       task_t          owner,
+       int             ledger_tag,
+       int             ledger_flags)
+{
+       task_t                  cur_task;
+       kern_return_t           kr;
+       vm_named_entry_t        mem_entry;
+       vm_object_t             object;
+
+       cur_task = current_task();
+       if (cur_task != kernel_task &&
+           (owner != cur_task ||
+           (ledger_flags & VM_LEDGER_FLAG_NO_FOOTPRINT) ||
+           ledger_tag == VM_LEDGER_TAG_NETWORK)) {
+               /*
+                * An entitlement is required to:
+                * + transfer memory ownership to someone else,
+                * + request that the memory not count against the footprint,
+                * + tag as "network" (since that implies "no footprint")
+                */
+               if (!cur_task->task_can_transfer_memory_ownership &&
+                   IOTaskHasEntitlement(cur_task,
+                   "com.apple.private.memory.ownership_transfer")) {
+                       cur_task->task_can_transfer_memory_ownership = TRUE;
+               }
+               if (!cur_task->task_can_transfer_memory_ownership) {
+                       return KERN_NO_ACCESS;
+               }
+       }
+
+       if (ledger_flags & ~VM_LEDGER_FLAGS) {
+               return KERN_INVALID_ARGUMENT;
+       }
+       if (ledger_tag <= 0 ||
+           ledger_tag > VM_LEDGER_TAG_MAX) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       if (!IP_VALID(entry_port) ||
+           ip_kotype(entry_port) != IKOT_NAMED_ENTRY) {
+               return KERN_INVALID_ARGUMENT;
+       }
+       mem_entry = (vm_named_entry_t) entry_port->ip_kobject;
+
+       named_entry_lock(mem_entry);
+
+       if (mem_entry->is_sub_map ||
+           mem_entry->is_copy) {
+               named_entry_unlock(mem_entry);
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       object = mem_entry->backing.object;
+       if (object == VM_OBJECT_NULL) {
+               named_entry_unlock(mem_entry);
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       vm_object_lock(object);
+
+       /* check that named entry covers entire object ? */
+       if (mem_entry->offset != 0 || object->vo_size != mem_entry->size) {
+               vm_object_unlock(object);
+               named_entry_unlock(mem_entry);
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       named_entry_unlock(mem_entry);
+
+       kr = vm_object_ownership_change(object,
+           ledger_tag,
+           owner,
+           ledger_flags,
+           FALSE);                             /* task_objq_locked */
+       vm_object_unlock(object);
+
+       return kr;
+}
+
 kern_return_t
 mach_memory_entry_get_page_counts(
        ipc_port_t      entry_port,
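
mach_memory_entry_ownership() validates the owner, ledger tag and ledger flags, then forwards them to vm_object_ownership_change() on the entry's backing object; per the check above, re-tagging your own memory with an ordinary tag needs no entitlement. A hedged sketch of that self-transfer; the prototype is mirrored from this hunk, and its userspace availability is an assumption:

    #include <mach/mach.h>

    /* Assumed prototype, mirrored from the kernel routine above. */
    extern kern_return_t mach_memory_entry_ownership(
            mach_port_t entry_port,
            task_t      owner,
            int         ledger_tag,
            int         ledger_flags);

    static kern_return_t
    tag_own_entry(mach_port_t entry_port, int ledger_tag)
    {
            /* owner == self, no VM_LEDGER_FLAG_NO_FOOTPRINT, ordinary
             * tag: the combination the entitlement check waves through. */
            return mach_memory_entry_ownership(entry_port,
                mach_task_self(),
                ledger_tag,   /* e.g. VM_LEDGER_TAG_DEFAULT */
                0);           /* footprint accounting stays on */
    }
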
index 534b780efc4b487d8cc16499f2f9d2730fef8f07..46c5051e70671bec676a2db6dfb1a38fd6c9c798 100644 (file)
@@ -52,7 +52,7 @@ ${MIGINCLUDES} : ${MIG_TYPES}
 
 ${MIG_UUHDRS} : \
        %.h : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)$(MIG) $(MIGFLAGS)         \
                -server /dev/null       \
                -user /dev/null         \
@@ -61,7 +61,7 @@ ${MIG_UUHDRS} : \
 
 ${MIG_USHDRS} : \
        %_server.h : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)$(MIG) $(MIGFLAGS)         \
                -server /dev/null       \
                -user /dev/null         \
@@ -97,7 +97,7 @@ ${COMP_FILES} : ${MIG_TYPES}
 
 ${MIG_KUSRC} : \
        %_user.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKUFLAGS}        \
                -user    $*_user.c              \
                -header  $*.h              \
@@ -107,7 +107,7 @@ ${MIG_KUSRC} : \
 
 ${MIG_KSSRC}: \
        %_server.c : %.defs
-       @echo "$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0)"
+       $(call makelog,$(ColorM)MIG$(Color0) $(ColorF)$@$(Color0))
        $(_v)${MIG} ${MIGFLAGS} ${MIGKSFLAGS}        \
                -user    /dev/null              \
                -header  /dev/null              \
index 28d3f54b3e0403f79681b989f0fc1bc335904ff1..22118c32592d17b9fb0a530bcfebe7c805fa023b 100644 (file)
@@ -97,7 +97,7 @@ ipc_pthread_priority_release(ipc_voucher_attr_manager_t __assert_only manager);
 /*
  * communication channel from voucher system to IPC_PTHREAD_PRIORITY
  */
-struct ipc_voucher_attr_manager ipc_pthread_priority_manager = {
+const struct ipc_voucher_attr_manager ipc_pthread_priority_manager = {
        .ivam_release_value    = ipc_pthread_priority_release_value,
        .ivam_get_value        = ipc_pthread_priority_get_value,
        .ivam_extract_content  = ipc_pthread_priority_extract_content,
index 557fae0ecbbb76c00f779ee148c69874241c2781..5482ff39be6431f91415508bf64b70954be304ff 100644 (file)
@@ -31,6 +31,7 @@
 #include <i386/param.h>
 #include <i386/misc_protos.h>
 #include <i386/cpu_data.h>
+#include <i386/machine_cpu.h>
 #include <i386/machine_routines.h>
 #include <i386/cpuid.h>
 #include <i386/vmx.h>
@@ -79,7 +80,10 @@ const int copysize_limit_panic = (64 * MB);
  */
 extern int _bcopy(const void *, void *, vm_size_t);
 extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *);
-extern int _copyin_word(const char *src, uint64_t *dst, vm_size_t len);
+extern int _copyin_atomic32(const char *src, uint32_t *dst);
+extern int _copyin_atomic64(const char *src, uint64_t *dst);
+extern int _copyout_atomic32(const uint32_t *u32, char *src);
+extern int _copyout_atomic64(const uint64_t *u64, char *src);
 
 /* On by default, optionally disabled by boot-arg */
 extern boolean_t copyio_zalloc_check;
@@ -92,7 +96,10 @@ extern boolean_t copyio_zalloc_check;
 #define COPYINSTR       2       /* string variant of copyout */
 #define COPYINPHYS      3       /* from user virtual to kernel physical */
 #define COPYOUTPHYS     4       /* from kernel physical to user virtual */
-#define COPYINWORD      5       /* from user virtual to kernel virtual */
+#define COPYINATOMIC32  5       /* from user virtual to kernel virtual */
+#define COPYINATOMIC64  6       /* from user virtual to kernel virtual */
+#define COPYOUTATOMIC32 7       /* from user virtual to kernel virtual */
+#define COPYOUTATOMIC64 8       /* from user virtual to kernel virtual */
 
 #if ENABLE_SMAPLOG
 typedef struct {
@@ -210,11 +217,27 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
                goto out;
        }
 
+       if (copy_type >= COPYINATOMIC32 && copy_type <= COPYOUTATOMIC64) {
+               if (__improbable(pmap == kernel_pmap)) {
+                       error = EFAULT;
+                       goto out;
+               }
+       }
+
 #if KASAN
-       if (copy_type == COPYIN || copy_type == COPYINSTR || copy_type == COPYINWORD) {
+       switch (copy_type) {
+       case COPYIN:
+       case COPYINSTR:
+       case COPYINATOMIC32:
+       case COPYINATOMIC64:
                __asan_storeN((uptr)kernel_addr, nbytes);
-       } else if (copy_type == COPYOUT) {
+               break;
+       case COPYOUT:
+       case COPYOUTATOMIC32:
+       case COPYOUTATOMIC64:
                __asan_loadN((uptr)kernel_addr, nbytes);
+               kasan_check_uninitialized((vm_address_t)kernel_addr, nbytes);
+               break;
        }
 #endif
 
@@ -288,10 +311,24 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
                    nbytes);
                break;
 
-       case COPYINWORD:
-               error = _copyin_word((const void *) user_addr,
-                   (void *) kernel_addr,
-                   nbytes);
+       case COPYINATOMIC32:
+               error = _copyin_atomic32((const void *) user_addr,
+                   (void *) kernel_addr);
+               break;
+
+       case COPYINATOMIC64:
+               error = _copyin_atomic64((const void *) user_addr,
+                   (void *) kernel_addr);
+               break;
+
+       case COPYOUTATOMIC32:
+               error = _copyout_atomic32((const void *) kernel_addr,
+                   (void *) user_addr);
+               break;
+
+       case COPYOUTATOMIC64:
+               error = _copyout_atomic64((const void *) kernel_addr,
+                   (void *) user_addr);
                break;
 
        case COPYINSTR:
@@ -395,23 +432,63 @@ copyin(const user_addr_t user_addr, void *kernel_addr, vm_size_t nbytes)
 }
 
 /*
- * copyin_word
- * Read an aligned value from userspace as a single memory transaction.
- * This function supports userspace synchronization features
+ * copy{in,out}_atomic{32,64}
+ * Read or store an aligned value from userspace as a single memory transaction.
+ * These functions support userspace synchronization features
  */
 int
-copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes)
+copyin_atomic32(const user_addr_t user_addr, uint32_t *kernel_addr)
+{
+       /* Test alignment */
+       if (user_addr & 3) {
+               return EINVAL;
+       }
+       return copyio(COPYINATOMIC32, user_addr, (char *)(uintptr_t)kernel_addr, 4, NULL, 0);
+}
+
+int
+copyin_atomic32_wait_if_equals(const user_addr_t user_addr, uint32_t value)
+{
+       uint32_t u32;
+       int result = copyin_atomic32(user_addr, &u32);
+       if (__improbable(result)) {
+               return result;
+       }
+       if (u32 != value) {
+               return ESTALE;
+       }
+       cpu_pause();
+       return 0;
+}
+
+int
+copyin_atomic64(const user_addr_t user_addr, uint64_t *kernel_addr)
+{
+       /* Test alignment */
+       if (user_addr & 7) {
+               return EINVAL;
+       }
+       return copyio(COPYINATOMIC64, user_addr, (char *)(uintptr_t)kernel_addr, 8, NULL, 0);
+}
+
+int
+copyout_atomic32(uint32_t value, user_addr_t user_addr)
 {
-       /* Verify sizes */
-       if ((nbytes != 4) && (nbytes != 8)) {
+       /* Test alignment */
+       if (user_addr & 3) {
                return EINVAL;
        }
+       return copyio(COPYOUTATOMIC32, user_addr, (char *)&value, 4, NULL, 0);
+}
 
+int
+copyout_atomic64(uint64_t value, user_addr_t user_addr)
+{
        /* Test alignment */
-       if (user_addr & (nbytes - 1)) {
+       if (user_addr & 7) {
                return EINVAL;
        }
-       return copyio(COPYINWORD, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0);
+       return copyio(COPYOUTATOMIC64, user_addr, (char *)&value, 8, NULL, 0);
 }
 
 int
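
copyin_atomic32_wait_if_equals() folds the common "reload the word, bail if it moved, spin briefly" step of user-level wait loops into one call: 0 means the value is still equal (and a cpu_pause() already happened), ESTALE means it changed. A sketch of the calling loop, with a hypothetical deadline check standing in for whatever cutoff the real caller uses:

    #include <errno.h>
    #include <stdint.h>

    typedef uint64_t user_addr_t;

    extern int copyin_atomic32_wait_if_equals(user_addr_t uaddr,
        uint32_t value);
    extern int deadline_expired(void);  /* hypothetical cutoff */

    /* Spin while *uaddr still equals `expected`; 0 = it changed. */
    static int
    wait_for_change(user_addr_t uaddr, uint32_t expected)
    {
            for (;;) {
                    int err = copyin_atomic32_wait_if_equals(uaddr, expected);
                    if (err == ESTALE) {
                            return 0;    /* the word changed under us */
                    }
                    if (err != 0) {
                            return err;  /* EFAULT/EINVAL from the copy */
                    }
                    if (deadline_expired()) {
                            return ETIMEDOUT;
                    }
                    /* still equal; the helper already cpu_pause()d */
            }
    }
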
index cb72f459e30f897538cb9abbfde84d929f6e18c5..e09e1e1793906686bf94078317b582f2ed1a66bd 100644 (file)
 #include <i386/proc_reg.h>
 #include <assym.s>
 
+/*
+ * void Load_context(
+ *                   thread_t thread)    // %rdi
+ *
+ * Loads the first thread context to run on a CPU,
+ * i.e. without switching from a previous thread.
+ *
+ * returns 'old' thread in %rax (which is always NULL)
+ */
 Entry(Load_context)
-       movq    TH_KERNEL_STACK(%rdi),%rcx      /* get kernel stack */
-       leaq    -IKS_SIZE(%rcx),%rdx
-       addq    EXT(kernel_stack_size)(%rip),%rdx /* point to stack top */
-       movq    %rcx,%gs:CPU_ACTIVE_STACK       /* store stack address */
-       movq    %rdx,%gs:CPU_KERNEL_STACK       /* store stack top */
+       movq    %rdi, %rdx                      /* move thread arg to rdx */
 
-       movq    %rdx,%rsp
-       xorl    %ebp, %ebp
+       movq    %rdx,%gs:CPU_ACTIVE_THREAD      /* new thread is active */
+       movq    TH_KERNEL_STACK(%rdx),%rdx      /* get its kernel stack */
+       lea     -IKS_SIZE(%rdx),%rcx
+       add     EXT(kernel_stack_size)(%rip),%rcx /* point to stack top */
+
+       movq    %rdx,%gs:CPU_ACTIVE_STACK       /* set current stack */
+       movq    %rcx,%gs:CPU_KERNEL_STACK       /* set stack top */
 
-       xorl    %edi,%edi                       /* return zero (no old thread) */
-       call    EXT(thread_continue)
+       movq    KSS_RSP(%rcx),%rsp              /* switch stacks */
+       movq    KSS_RBX(%rcx),%rbx              /* restore registers */
+       movq    KSS_RBP(%rcx),%rbp
+       movq    KSS_R12(%rcx),%r12
+       movq    KSS_R13(%rcx),%r13
+       movq    KSS_R14(%rcx),%r14
+       movq    KSS_R15(%rcx),%r15
 
+       xorl    %eax, %eax                      /* set return value to zero (no old thread) */
+
+       jmp    *KSS_RIP(%rcx)                   /* return old thread */
 
 /*
  * thread_t Switch_context(
- *             thread_t old,                           // %rsi
- *             thread_continue_t continuation,         // %rdi
+ *             thread_t old,                           // %rdi
+ *             thread_continue_t continuation,         // %rsi
  *             thread_t new)                           // %rdx
+ *
+ * returns 'old' thread in %rax
  */
 Entry(Switch_context)
        popq    %rax                            /* pop return PC */
@@ -114,14 +134,21 @@ Entry(Switch_context)
        movq    KSS_R13(%rcx),%r13
        movq    KSS_R14(%rcx),%r14
        movq    KSS_R15(%rcx),%r15
-       jmp     *KSS_RIP(%rcx)                  /* return old thread */
-
+       jmp     *KSS_RIP(%rcx)                  /* return old thread in %rax */
 
+/*
+ * machine_stack_attach sets this as the RIP of newly-attached stacks
+ * %rbx is the C routine to call
+ * %rax is the parameter to pass to the C routine
+ *
+ * This stub is needed to convert the return value of the old thread from Switch_context
+ * in %rax into a parameter to thread_continue passed in %rdi, because using the
+ * same register for the first argument and first retval makes too much sense for the SysV ABI.
+ */
 Entry(Thread_continue)
        movq    %rax, %rdi                      /* this is the old thread from Switch_context */
-       xorq    %rbp,%rbp                       /* zero frame pointer */
        call    *%rbx                           /* call real continuation */
-
+       int3                                    /* (should never return) */
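
Thread_continue exists because Switch_context hands back the old thread in the return-value register (%rax) while the C continuation expects it as its first argument (%rdi). Expressed in C, the contract the stub implements is simply:

    typedef struct thread *thread_t;

    /* The continuation receives the thread we switched away from as
     * its first (and only) argument. */
    typedef void (*thread_continue_t)(thread_t old_thread);

    static void
    thread_continue_shim(thread_t old_thread,       /* arrived in %rax */
        thread_continue_t continuation)             /* stashed in %rbx */
    {
            continuation(old_thread);  /* the %rax -> %rdi move */
            __builtin_trap();          /* should never return (int3) */
    }
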
 
 /*
  * thread_t Shutdown_context(
@@ -131,9 +158,7 @@ Entry(Thread_continue)
  *
  * saves the kernel context of the thread,
  * switches to the interrupt stack,
- * continues the thread (with thread_continue),
  * then runs routine on the interrupt stack.
- *
  */
 Entry(Shutdown_context)
        movq    %gs:CPU_KERNEL_STACK,%rcx       /* get old kernel stack top */
@@ -143,7 +168,8 @@ Entry(Shutdown_context)
        movq    %r13,KSS_R13(%rcx)
        movq    %r14,KSS_R14(%rcx)
        movq    %r15,KSS_R15(%rcx)
-       popq    KSS_RIP(%rcx)                   /* save return PC */
+       popq    %r8                             /* extract return PC */
+       movq    %r8,KSS_RIP(%rcx)               /* save return PC */
        movq    %rsp,KSS_RSP(%rcx)              /* save SP */
 
        movq    %gs:CPU_ACTIVE_STACK,%rcx       /* get old kernel stack */
@@ -155,7 +181,12 @@ Entry(Shutdown_context)
        movq    %rsp, %gs:CPU_ACTIVE_STACK
        movq    EXT(kernel_stack_size)(%rip),%rcx /* point to stack top */
        subq    %rcx, %gs:CPU_ACTIVE_STACK
+
+       pushq   %r8                             /* set up a call frame on new stack */
+       pushq   %rbp
+       movq    %rsp, %rbp
+
        movq    %rdx,%rdi                       /* processor arg to routine */
        call    *%rsi                           /* call routine to run */
-       hlt                                     /* (should never return) */
+       int3                                    /* (should never return) */
 
index d17bb5bb7b2ca8ab4fa3596f42791dbe65fbcaa3..d54c1c095b99f6ed4fc4374cb746e1540cd491ac 100644 (file)
@@ -87,7 +87,7 @@ EXT(idt64_hndl_table0):
 /* 0x00 */     .quad EXT(ks_dispatch)
 /* 0x08 */     .quad EXT(ks_64bit_return)
 /* 0x10 */     .quad 0 /* Populated with CPU shadow displacement*/
-/* 0x18 */     .quad EXT(ks_return)
+/* 0x18 */     .quad EXT(ks_32bit_return)
 #define        TBL0_OFF_DISP_USER_WITH_POPRAX  0x20
 /* 0x20 */     .quad EXT(ks_dispatch_user_with_pop_rax)
 #define        TBL0_OFF_DISP_KERN_WITH_POPRAX  0x28
@@ -244,9 +244,14 @@ Entry(idt64_mdep_scall)
  * PCB stack and then dispatch as normal.
  * For faults in kernel-space, we need to scrub for kernel exit faults and
  * treat these as user-space faults. But for all other kernel-space faults
- * we continue to run on the IST1 stack and we dispatch to handle the fault
+ * we continue to run on the IST1 stack as we dispatch to handle the fault
  * as fatal.
  */
+Entry(idt64_segnp)
+       pushq   $(HNDL_ALLTRAPS)
+       pushq   $(T_SEGMENT_NOT_PRESENT)
+       jmp     L_check_for_kern_flt
+
 Entry(idt64_gen_prot)
        pushq   $(HNDL_ALLTRAPS)
        pushq   $(T_GENERAL_PROTECTION)
@@ -267,6 +272,16 @@ L_check_for_kern_flt:
        pushq   %rax
        testb   $3, 8+ISF64_CS(%rsp)
        jnz     L_dispatch_from_user_no_push_rax                /* Fault from user, go straight to dispatch */
+
+       /* Check if the fault occurred in the 32-bit segment restoration window (which executes with user gsb) */
+       leaq    L_32bit_seg_restore_begin(%rip), %rax
+       cmpq    %rax, 8+ISF64_RIP(%rsp)
+       jb      L_not_32bit_segrestores
+       leaq    L_32bit_seg_restore_done(%rip), %rax
+       cmpq    %rax, 8+ISF64_RIP(%rsp)
+       jae     L_not_32bit_segrestores
+       jmp     1f
+L_not_32bit_segrestores:
        leaq    EXT(ret32_iret)(%rip), %rax
        cmpq    %rax, 8+ISF64_RIP(%rsp)
        je      1f
@@ -309,6 +324,7 @@ L_check_for_kern_flt:
        /*
         * Fix the stack so the original trap frame is current, then jump to dispatch
         */
+
        movq    %rax, 16+ISF64_CS(%rsp)
 
        movq    ISF64_RSP-24(%rbx), %rax
@@ -326,10 +342,6 @@ L_check_for_kern_flt:
        popq    %rbx
        jmp     L_dispatch_from_user_no_push_rax
 
-Entry(idt64_segnp)
-       pushq   $(HNDL_ALLTRAPS)
-       pushq   $(T_SEGMENT_NOT_PRESENT)
-       jmp     L_dispatch
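A hedged C rendering of the leaq/cmpq window test added above: a kernel-mode fault whose RIP lies in [L_32bit_seg_restore_begin, L_32bit_seg_restore_done) executed with the user %gs base, so it is routed down the user-fault path. The symbol and helper types here are illustrative.

    static bool
    rip_in_seg_restore_window(uintptr_t rip)
    {
            extern const char L_32bit_seg_restore_begin[], L_32bit_seg_restore_done[];

            return rip >= (uintptr_t)L_32bit_seg_restore_begin &&
                   rip < (uintptr_t)L_32bit_seg_restore_done;
    }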
 
 /*
  * Fatal exception handlers:
@@ -551,6 +563,7 @@ L_dispatch_from_kernel_no_push_rax:
        jmp *(%rax)
 /* User return: register restoration and address space switch sequence */
 Entry(ks_64bit_return)
+
        mov     R64_R14(%r15), %r14
        mov     R64_R13(%r15), %r13
        mov     R64_R12(%r15), %r12
@@ -810,6 +823,176 @@ L_64bit_entry_reject:
        movq    $(T_INVALID_OPCODE), 8+ISF64_TRAPNO(%rsp)
        jmp     L_dispatch_kgsb
 
+Entry(ks_32bit_return)
+
+       /* Validate CS/DS/ES/FS/GS segment selectors with the Load Access Rights instruction prior to restoration */
+       /* Exempt "known good" statically configured selectors, e.g. USER_CS, USER_DS and 0 */
+       cmpl    $(USER_CS), R32_CS(%r15)
+       jz      11f
+       larw    R32_CS(%r15), %ax
+       jnz     L_32_reset_cs
+       /* Ensure that the segment referenced by CS in the saved state is a code segment (bit 11 == 1) */
+       testw   $0x800, %ax
+       jz      L_32_reset_cs           /* Update stored %cs with known-good selector if ZF == 1 */
+       jmp     11f
+L_32_reset_cs:
+       movl    $(USER_CS), R32_CS(%r15)
+11:
+       cmpl    $(USER_DS), R32_DS(%r15)
+       jz      22f
+       cmpl    $0, R32_DS(%r15)
+       jz      22f
+       larw    R32_DS(%r15), %ax
+       jz      22f
+       movl    $(USER_DS), R32_DS(%r15)
+22:
+       cmpl    $(USER_DS), R32_ES(%r15)
+       jz      33f
+       cmpl    $0, R32_ES(%r15)
+       jz      33f
+       larw    R32_ES(%r15), %ax
+       jz      33f
+       movl    $(USER_DS), R32_ES(%r15)
+33:
+       cmpl    $(USER_DS), R32_FS(%r15)
+       jz      44f
+       cmpl    $0, R32_FS(%r15)
+       jz      44f
+       larw    R32_FS(%r15), %ax
+       jz      44f
+       movl    $(USER_DS), R32_FS(%r15)
+44:
+       cmpl    $(USER_CTHREAD), R32_GS(%r15)
+       jz      55f
+       cmpl    $0, R32_GS(%r15)
+       jz      55f
+       larw    R32_GS(%r15), %ax
+       jz      55f
+       movl    $(USER_CTHREAD), R32_GS(%r15)
+55:
+
+       /*
+        * Restore general 32-bit registers
+        */
+       movl    R32_EAX(%r15), %eax
+       movl    R32_EBX(%r15), %ebx
+       movl    R32_ECX(%r15), %ecx
+       movl    R32_EDX(%r15), %edx
+       movl    R32_EBP(%r15), %ebp
+       movl    R32_ESI(%r15), %esi
+       movl    R32_EDI(%r15), %edi
+       movl    R32_DS(%r15), %r8d
+       movl    R32_ES(%r15), %r9d
+       movl    R32_FS(%r15), %r10d
+       movl    R32_GS(%r15), %r11d
+
+       /* Switch to the per-cpu (doublemapped) exception stack */
+       mov     %gs:CPU_ESTACK, %rsp
+
+       /* Now transfer the ISF to the exception stack in preparation for iret, below */
+       movl    R32_SS(%r15), %r12d
+       push    %r12
+       movl    R32_UESP(%r15), %r12d
+       push    %r12
+       movl    R32_EFLAGS(%r15), %r12d
+       push    %r12
+       movl    R32_CS(%r15), %r12d
+       push    %r12
+       movl    R32_EIP(%r15), %r12d
+       push    %r12
+
+       movl    %gs:CPU_NEED_SEGCHK, %r14d      /* %r14 will be zeroed just before we return */
+
+       /*
+        * Finally, switch to the user pagetables.  After this, all %gs-relative
+        * accesses MUST be to cpu shadow data ONLY.  Note that after we restore %gs
+        * (after the swapgs), no %gs-relative accesses should be performed.
+        */
+       /* Discover user cr3/ASID */
+       mov     %gs:CPU_UCR3, %r13
+#if    DEBUG
+       mov     %r13, %gs:CPU_EXIT_CR3
+#endif
+       mov     %r13, %cr3
+
+       swapgs
+
+       /*
+        * Restore segment registers. A #GP taken here will push state onto IST1,
+        * not the exception stack.  Note that the placement of the labels here
+        * corresponds to the fault address-detection logic (so do not change them
+        * without also changing that code).
+        */
+L_32bit_seg_restore_begin:
+       mov     %r8, %ds
+       mov     %r9, %es
+       mov     %r10, %fs
+       mov     %r11, %gs
+L_32bit_seg_restore_done:
+
+       /* Zero 64-bit-exclusive GPRs to prevent data leaks */
+       xor     %r8, %r8
+       xor     %r9, %r9
+       xor     %r10, %r10
+       xor     %r11, %r11
+       xor     %r12, %r12
+       xor     %r13, %r13
+       xor     %r15, %r15
+
+       /*
+        * At this point, the stack contains:
+        *
+        * +--------------+
+        * |  Return SS   | +32
+        * |  Return RSP  | +24
+        * |  Return RFL  | +16
+        * |  Return CS   | +8
+        * |  Return RIP  | <-- rsp
+        * +--------------+
+        */
+
+       cmpl    $(SYSENTER_CS), 8(%rsp)
+                                       /* test for sysexit */
+       je      L_rtu_via_sysexit
+
+       cmpl    $1, %r14d
+       je      L_verw_island
+
+L_after_verw:
+       xor     %r14, %r14
+
+.globl EXT(ret32_iret)
+EXT(ret32_iret):
+       iretq                           /* return from interrupt */
+
+L_verw_island:
+       verw    32(%rsp)
+       jmp     L_after_verw
+
+L_verw_island_1:
+       verw    16(%rsp)
+       jmp     L_after_verw_1
+
+L_rtu_via_sysexit:
+       pop     %rdx                    /* user return eip */
+       pop     %rcx                    /* pop and toss cs */
+       andl    $(~EFL_IF), (%rsp)      /* clear interrupts enable, sti below */
+
+       /*
+        * %ss is now at 16(%rsp)
+        */
+       cmpl    $1, %r14d
+       je      L_verw_island_1
+L_after_verw_1:
+       xor     %r14, %r14
+
+       popf                            /* flags - carry denotes failure */
+       pop     %rcx                    /* user return esp */
+
+
+       sti                             /* interrupts enabled after sysexit */
+       sysexitl                        /* 32-bit sysexit */
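A condensed C sketch of the selector scrubbing performed at labels 11: through 55: above; lar_readable() is a hypothetical stand-in for the larw instruction, and %cs additionally demands the code-segment bit (bit 11) of the returned access rights.

    static uint32_t
    sanitize_data_selector(uint32_t sel, uint32_t known_good)
    {
            uint16_t access_rights;

            if (sel == known_good || sel == 0) {
                    return sel;                 /* statically trusted */
            }
            if (!lar_readable(sel, &access_rights)) {
                    return known_good;          /* LAR failed: reset to safe value */
            }
            return sel;
    }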
+
 /* End of double-mapped TEXT */
 .text
 
@@ -842,9 +1025,6 @@ Entry(ks_dispatch_user_with_pop_rax)
        pop     %rax
        jmp     EXT(ks_dispatch_user)
 
-Entry (ks_return)
-       jmp     .
-
 Entry(ks_dispatch_user)
        cmpl    $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP
        je      L_dispatch_U32          /* 32-bit user task */
@@ -1086,7 +1266,15 @@ L_cr3_switch_return:
        movq    $0, %gs:CPU_DR7
 4:
        cmpl    $(SS_64), SS_FLAVOR(%r15)       /* 64-bit state? */
-       je      L_64bit_return
+       jne     L_32bit_return
+
+       /*
+        * Restore general 64-bit registers.
+        * Here on fault stack and PCB address in R15.
+        */
+       leaq    EXT(idt64_hndl_table0)(%rip), %rax
+       jmp     *8(%rax)
+
 
 L_32bit_return:
 #if DEBUG_IDT64
@@ -1098,155 +1286,9 @@ L_32bit_return:
 1:
 #endif /* DEBUG_IDT64 */
 
-       /*
-        * Restore registers into the machine state for iret.
-        * Here on fault stack and PCB address in R11.
-        */
-       movl    R32_EIP(%r15), %eax
-       movl    %eax, R64_RIP(%r15)
-       movl    R32_EFLAGS(%r15), %eax
-       movl    %eax, R64_RFLAGS(%r15)
-       movl    R32_CS(%r15), %eax
-       movl    %eax, R64_CS(%r15)
-       movl    R32_UESP(%r15), %eax
-       movl    %eax, R64_RSP(%r15)
-       movl    R32_SS(%r15), %eax
-       movl    %eax, R64_SS(%r15)
-
-       /* Validate CS/DS/ES/FS/GS segment selectors with the Load Access Rights instruction prior to restoration */
-       /* Exempt "known good" statically configured selectors, e.g. USER_CS, USER_DS and 0 */
-       cmpl    $(USER_CS), R32_CS(%r15)
-       jz      11f
-       larw    R32_CS(%r15), %ax
-       jnz     L_32_reset_cs
-       /* Ensure that the segment referenced by CS in the saved state is a code segment (bit 11 == 1) */
-       testw   $0x800, %ax
-       jz      L_32_reset_cs           /* Update stored %cs with known-good selector if ZF == 1 */
-       jmp     11f
-L_32_reset_cs:
-       movl    $(USER_CS), R32_CS(%r15)
-11:
-       cmpl    $(USER_DS), R32_DS(%r15)
-       jz      22f
-       cmpl    $0, R32_DS(%r15)
-       jz      22f
-       larw    R32_DS(%r15), %ax
-       jz      22f
-       movl    $(USER_DS), R32_DS(%r15)
-22:
-       cmpl    $(USER_DS), R32_ES(%r15)
-       jz      33f
-       cmpl    $0, R32_ES(%r15)
-       jz      33f
-       larw    R32_ES(%r15), %ax
-       jz      33f
-       movl    $(USER_DS), R32_ES(%r15)
-33:
-       cmpl    $(USER_DS), R32_FS(%r15)
-       jz      44f
-       cmpl    $0, R32_FS(%r15)
-       jz      44f
-       larw    R32_FS(%r15), %ax
-       jz      44f
-       movl    $(USER_DS), R32_FS(%r15)
-44:
-       cmpl    $(USER_CTHREAD), R32_GS(%r15)
-       jz      55f
-       cmpl    $0, R32_GS(%r15)
-       jz      55f
-       larw    R32_GS(%r15), %ax
-       jz      55f
-       movl    $(USER_CTHREAD), R32_GS(%r15)
-55:
-       /*
-        * Restore general 32-bit registers
-        */
-       movl    R32_EAX(%r15), %eax
-       movl    R32_EBX(%r15), %ebx
-       movl    R32_ECX(%r15), %ecx
-       movl    R32_EDX(%r15), %edx
-       movl    R32_EBP(%r15), %ebp
-       movl    R32_ESI(%r15), %esi
-       movl    R32_EDI(%r15), %edi
-
-       /*
-        * Restore segment registers. A segment exception taken here will
-        * push state on the IST1 stack and will not affect the "PCB stack".
-        */
-       mov     %r15, %rsp              /* Set the PCB as the stack */
-       movl    %gs:CPU_NEED_SEGCHK, %r14d      /* %r14 will be restored below */
-       swapgs
-
-       /* Zero 64-bit-exclusive GPRs to prevent data leaks */
-       xor     %r8, %r8
-       xor     %r9, %r9
-       xor     %r10, %r10
-       xor     %r11, %r11
-       xor     %r12, %r12
-       xor     %r13, %r13
-       xor     %r15, %r15
-
-       movw    R32_DS(%rsp), %ds
-       movw    R32_ES(%rsp), %es
-       movw    R32_FS(%rsp), %fs
-       movw    R32_GS(%rsp), %gs
-
-       /* pop compat frame + trapno, trapfn and error */       
-       add     $(ISS64_OFFSET)+8+8+8, %rsp
-
-       /*
-        * At this point, the stack contains:
-        *
-        * +--------------+
-        * |  Return SS   | +32
-        * |  Return RSP  | +24
-        * |  Return RFL  | +16
-        * |  Return CS   | +8
-        * |  Return RIP  | <-- rsp
-        * +--------------+
-        */
-
-       cmpl    $(SYSENTER_CS), 8(%rsp)
-                                       /* test for sysexit */
-       je      L_rtu_via_sysexit
-
-       cmpl    $1, %r14d
-       je      L_verw_island
-
-L_after_verw:
-       xor     %r14, %r14
-
-.globl EXT(ret32_iret)
-EXT(ret32_iret):
-       iretq                           /* return from interrupt */
-
-L_verw_island:
-       verw    32(%rsp)
-       jmp     L_after_verw
-
-L_verw_island_1:
-       verw    16(%rsp)
-       jmp     L_after_verw_1
-
-L_rtu_via_sysexit:
-       pop     %rdx                    /* user return eip */
-       pop     %rcx                    /* pop and toss cs */
-       andl    $(~EFL_IF), (%rsp)      /* clear interrupts enable, sti below */
-
-       /*
-        * %ss is now at 16(%rsp)
-        */
-       cmpl    $1, %r14d
-       je      L_verw_island_1
-L_after_verw_1:
-       xor     %r14, %r14
-
-       popf                            /* flags - carry denotes failure */
-       pop     %rcx                    /* user return esp */
-
+       leaq    EXT(idt64_hndl_table0)(%rip), %rax
+       jmp     *0x18(%rax)
 
-       sti                             /* interrupts enabled after sysexit */
-       sysexitl                        /* 32-bit sysexit */
 
 L_dr_restore_island:
        movq    TH_PCB_IDS(%rdx),%rax   /* Obtain this thread's debug state */
@@ -1298,8 +1340,6 @@ ret_to_kernel:
        hlt
 2:
 #endif
-
-L_64bit_return:
        /*
         * Restore general 64-bit registers.
         * Here on fault stack and PCB address in R15.
index da2ccc40dfa999544761d6e343fd67c0cd433482..ce27db8dd3fce41a7cb3a883fc11a1f4ae4a95b9 100644 (file)
@@ -227,7 +227,7 @@ kpc_reload_configurable(int ctr)
        return old;
 }
 
-void kpc_pmi_handler(x86_saved_state_t *state);
+void kpc_pmi_handler(void);
 
 static void
 set_running_fixed(boolean_t on)
@@ -470,7 +470,7 @@ kpc_get_curcpu_counters_mp_call(void *args)
        r = kpc_get_curcpu_counters(handler->classes, NULL, &handler->buf[offset]);
 
        /* number of counters added by this CPU, needs to be atomic  */
-       hw_atomic_add(&(handler->nb_counters), r);
+       os_atomic_add(&(handler->nb_counters), r, relaxed);
 }
 
 int
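A usage sketch for the os_atomic form adopted above, assuming xnu's os_atomic interface where the last argument names the memory order; 'relaxed' supplies atomicity only, which suffices because the cross-CPU rendezvous provides the ordering. (That os_atomic_add returns the updated value is an assumption.)

    uint32_t total = os_atomic_add(&handler->nb_counters, r, relaxed);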
@@ -632,7 +632,7 @@ kpc_set_config_arch(struct kpc_config_remote *mp_config)
 
 /* PMI stuff */
 void
-kpc_pmi_handler(__unused x86_saved_state_t *state)
+kpc_pmi_handler(void)
 {
        uint64_t status, extra;
        uint32_t ctr;
index d88f2a08a32dd77eb9631712b9a2ef0ad68f9923..4c71fd46186194cd2ebbe8f2896b94be30a5a86c 100644 (file)
@@ -319,35 +319,85 @@ _bcopystr_fail:
        ret
 
 /*
- * Copyin 32 or 64 bit aligned word as a single transaction
+ * Copyin 32 bit aligned word as a single transaction
  * rdi: source address (user)
  * rsi: destination address (kernel)
- * rdx: size (4 or 8)
  */
-Entry(_copyin_word)
+Entry(_copyin_atomic32)
        pushq   %rbp                    /* Save registers */
        movq    %rsp, %rbp
-       cmpl    $0x4, %edx              /* If size = 4 */
-       je      L_copyin_word_4         /*      handle 32-bit load */
-       movl    $(EINVAL), %eax         /* Set up error status */
-       cmpl    $0x8, %edx              /* If size != 8 */
-       jne     L_copyin_word_exit      /*      exit with error */
        RECOVERY_SECTION
-       RECOVER(L_copyin_word_fail)     /* Set up recovery handler for next instruction*/
-       movq    (%rdi), %rax            /* Load quad from user */
-       jmp     L_copyin_word_store
-L_copyin_word_4:
-       RECOVERY_SECTION
-       RECOVER(L_copyin_word_fail)     /* Set up recovery handler for next instruction */
+       RECOVER(L_copyin_atomic32_fail) /* Set up recovery handler for next instruction */
        movl    (%rdi), %eax            /* Load long from user */
-L_copyin_word_store:
+       movl    %eax, (%rsi)            /* Store to kernel */
+       xorl    %eax, %eax              /* Return success */
+       popq    %rbp                    /* Restore registers */
+       retq                            /* Return */
+
+L_copyin_atomic32_fail:
+       movl    $(EFAULT), %eax         /* Return error for failure */
+       popq    %rbp                    /* Restore registers */
+       retq                            /* Return */
+
+/*
+ * Copyin 64 bit aligned word as a single transaction
+ * rdi: source address (user)
+ * rsi: destination address (kernel)
+ */
+Entry(_copyin_atomic64)
+       pushq   %rbp                    /* Save registers */
+       movq    %rsp, %rbp
+       RECOVERY_SECTION
+       RECOVER(L_copyin_atomic64_fail) /* Set up recovery handler for next instruction*/
+       movq    (%rdi), %rax            /* Load quad from user */
        movq    %rax, (%rsi)            /* Store to kernel */
        xorl    %eax, %eax              /* Return success */
-L_copyin_word_exit:
        popq    %rbp                    /* Restore registers */
        retq                            /* Return */
 
-L_copyin_word_fail:
+L_copyin_atomic64_fail:
+       movl    $(EFAULT), %eax         /* Return error for failure */
+       popq    %rbp                    /* Restore registers */
+       retq                            /* Return */
+
+/*
+ * Copyout 32 bit aligned word as a single transaction
+ * rdi: source address (kernel)
+ * rsi: destination address (user)
+ */
+Entry(_copyout_atomic32)
+       pushq   %rbp                    /* Save registers */
+       movq    %rsp, %rbp
+       movl    (%rdi), %eax            /* Load long from kernel */
+       RECOVERY_SECTION
+       RECOVER(L_copyout_atomic32_fail)        /* Set up recovery handler for next instruction*/
+       movl    %eax, (%rsi)            /* Store long to user */
+       xorl    %eax, %eax              /* Return success */
+       popq    %rbp                    /* Restore registers */
+       retq                            /* Return */
+
+L_copyout_atomic32_fail:
+       movl    $(EFAULT), %eax         /* Return error for failure */
+       popq    %rbp                    /* Restore registers */
+       retq                            /* Return */
+
+/*
+ * Copyout 64 bit aligned word as a single transaction
+ * rdi: source address (kernel)
+ * rsi: destination address (user)
+ */
+Entry(_copyout_atomic64)
+       pushq   %rbp                    /* Save registers */
+       movq    %rsp, %rbp
+       movq    (%rdi), %rax            /* Load quad from kernel */
+       RECOVERY_SECTION
+       RECOVER(L_copyout_atomic64_fail)        /* Set up recovery handler for next instruction*/
+       movq    %rax, (%rsi)            /* Store quad to user */
+       xorl    %eax, %eax              /* Return success */
+       popq    %rbp                    /* Restore registers */
+       retq                            /* Return */
+
+L_copyout_atomic64_fail:
        movl    $(EFAULT), %eax         /* Return error for failure */
        popq    %rbp                    /* Restore registers */
        retq                            /* Return */
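Hedged C prototypes matching the register conventions in the comments above; each returns 0 on success or EFAULT if the access faults. The exact kernel-side declarations may differ (for example, user_addr_t parameters).

    extern int copyin_atomic32(const char *user_src, uint32_t *kernel_dst);
    extern int copyin_atomic64(const char *user_src, uint64_t *kernel_dst);
    extern int copyout_atomic32(const uint32_t *kernel_src, char *user_dst);
    extern int copyout_atomic64(const uint64_t *kernel_src, char *user_dst);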
index cb63ffcad1b3a32b2ad4f575aae5e75918ce46db..807ecfc52c652b7f0b048e567d16fa36c669015a 100644 (file)
@@ -912,14 +912,10 @@ void
 fillPage(ppnum_t pa, unsigned int fill)
 {
        uint64_t        src;
-       int             i;
        int             cnt = PAGE_SIZE / sizeof(unsigned int);
-       unsigned int   *addr;
 
        src = i386_ptob(pa);
-       for (i = 0, addr = (unsigned int *)PHYSMAP_PTOV(src); i < cnt; i++) {
-               *addr++ = fill;
-       }
+       memset_word((int *)PHYSMAP_PTOV(src), fill, cnt);
 }
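The reference semantics assumed for memset_word() in the rewrite above: store 'value' into 'count' consecutive 32-bit words, exactly what the removed open-coded loop did.

    static void
    memset_word_reference(int *dst, int value, int count)
    {
            for (int i = 0; i < count; i++) {
                    dst[i] = value;
            }
    }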
 
 static inline void
index 3c834c04142979bc4cc6867a1b1b1ba29226e6e4..6172f3181186a720ceaae26a8feb00011851cbad 100644 (file)
@@ -27,7 +27,7 @@
  */
 #include <kern/misc_protos.h>
 #include <x86_64/machine_remote_time.h>
-#include <stdatomic.h>
+#include <machine/atomic.h>
 #include <kern/locks.h>
 #include <kern/clock.h>
 
@@ -55,10 +55,10 @@ mach_bridge_register_regwrite_timestamp_callback(mach_bridge_regwrite_timestamp_
 {
        static uint64_t delay_amount = 0;
 
-       if (!atomic_load(&bt_init_flag)) {
+       if (!os_atomic_load(&bt_init_flag, relaxed)) {
                mach_bridge_timer_init();
                nanoseconds_to_absolutetime(DELAY_INTERVAL_NS, &delay_amount);
-               bt_init_flag = 1;
+               os_atomic_store(&bt_init_flag, 1, release);
        }
 
        lck_spin_lock(bt_maintenance_lock);
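The release store above publishes the timer state initialized before it; a hedged sketch of the reader side it presumably pairs with (the acquire ordering is an assumption about callers, not shown in this diff):

    if (os_atomic_load(&bt_init_flag, acquire)) {
            /* everything written before the release store is now visible */
    }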
index 9a69f0805997bf11f12568b491aa538bea375abf..b182f653a3dc002af0ffd1aea03642794d6ef06d 100644 (file)
@@ -158,17 +158,33 @@ mt_core_set_snap(unsigned int ctr, uint64_t count)
 
 #define GLOBAL_OVF 0x390
 
+static void mt_check_for_pmi(struct mt_cpu *mtc, x86_saved_state_t *state);
+
+static void
+enable_counters(void)
+{
+       wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT | FIXED_CTR_CTRL_ENABLE);
+       wrmsr64(GLOBAL_CTRL, GLOBAL_CTRL_FIXED_EN);
+}
+
+static void
+disable_counters(void)
+{
+       wrmsr64(GLOBAL_CTRL, 0);
+}
+
 static void
 core_down(cpu_data_t *cpu)
 {
        if (!mt_core_supported) {
                return;
        }
-
        assert(ml_get_interrupts_enabled() == FALSE);
+       struct mt_cpu *mtc = &cpu->cpu_monotonic;
 
-       wrmsr64(GLOBAL_CTRL, 0);
-       mt_mtc_update_fixed_counts(&cpu->cpu_monotonic, NULL, NULL);
+       disable_counters();
+       mt_mtc_update_fixed_counts(mtc, NULL, NULL);
+       mtc->mtc_active = false;
 }
 
 static void
@@ -187,8 +203,8 @@ core_up(cpu_data_t *cpu)
        for (int i = 0; i < MT_CORE_NFIXED; i++) {
                mt_core_set_snap(i, mtc->mtc_snaps[i]);
        }
-       wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT | FIXED_CTR_CTRL_ENABLE);
-       wrmsr64(GLOBAL_CTRL, GLOBAL_CTRL_FIXED_EN);
+       enable_counters();
+       mtc->mtc_active = true;
 }
 
 void
@@ -206,17 +222,27 @@ mt_cpu_up(cpu_data_t *cpu)
        ml_set_interrupts_enabled(intrs_en);
 }
 
-static int
-mt_pmi_x86_64(x86_saved_state_t *state)
+uint64_t
+mt_count_pmis(void)
 {
-       uint64_t status;
-       struct mt_cpu *mtc;
+       uint64_t npmis = 0;
+       for (unsigned int i = 0; i < real_ncpus; i++) {
+               cpu_data_t *cpu = cpu_data_ptr[i];
+               npmis += cpu->cpu_monotonic.mtc_npmis;
+       }
+       return npmis;
+}
 
-       assert(ml_get_interrupts_enabled() == FALSE);
-       mtc = mt_cur_cpu();
-       status = rdmsr64(GLOBAL_STATUS);
+static void
+mt_check_for_pmi(struct mt_cpu *mtc, x86_saved_state_t *state)
+{
+       uint64_t status = rdmsr64(GLOBAL_STATUS);
+
+       mtc->mtc_npmis += 1;
 
-       (void)atomic_fetch_add_explicit(&mt_pmis, 1, memory_order_relaxed);
+       if (mtc->mtc_active) {
+               disable_counters();
+       }
 
        for (unsigned int i = 0; i < MT_CORE_NFIXED; i++) {
                if (status & CTR_FIX_POS(i)) {
@@ -228,8 +254,11 @@ mt_pmi_x86_64(x86_saved_state_t *state)
                        mtc->mtc_counts[i] += delta;
 
                        if (mt_microstackshots && mt_microstackshot_ctr == i) {
-                               x86_saved_state64_t *state64 = saved_state64(state);
-                               bool user_mode = (state64->isf.cs & 0x3) ? true : false;
+                               bool user_mode = false;
+                               if (state) {
+                                       x86_saved_state64_t *state64 = saved_state64(state);
+                                       user_mode = (state64->isf.cs & 0x3) != 0;
+                               }
                                KDBG_RELEASE(KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_DEBUG, 1),
                                    mt_microstackshot_ctr, user_mode);
                                mt_microstackshot_pmi_handler(user_mode, mt_microstackshot_ctx);
@@ -245,9 +274,20 @@ mt_pmi_x86_64(x86_saved_state_t *state)
 
        /* if any of the configurable counters overflowed, tell kpc */
        if (status & ((UINT64_C(1) << 4) - 1)) {
-               extern void kpc_pmi_handler(x86_saved_state_t *state);
-               kpc_pmi_handler(state);
+               extern void kpc_pmi_handler(void);
+               kpc_pmi_handler();
+       }
+
+       if (mtc->mtc_active) {
+               enable_counters();
        }
+}
+
+static int
+mt_pmi_x86_64(x86_saved_state_t *state)
+{
+       assert(ml_get_interrupts_enabled() == FALSE);
+       mt_check_for_pmi(mt_cur_cpu(), state);
        return 0;
 }
 
@@ -290,6 +330,9 @@ mt_microstackshot_start_arch(uint64_t period)
 void
 mt_early_init(void)
 {
+       if (PE_parse_boot_argn("-nomt_core", NULL, 0)) {
+               return;
+       }
        i386_cpu_info_t *info = cpuid_info();
        if (info->cpuid_arch_perf_leaf.version >= 2) {
                lapic_set_pmi_func((i386_intr_func_t)mt_pmi_x86_64);
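A worked sketch of the overflow-status decoding in mt_check_for_pmi() above, under the bit layout the code assumes: bits 0..3 of GLOBAL_STATUS flag the configurable counters, hence the ((1 << 4) - 1) mask that gates the call to kpc_pmi_handler(), while CTR_FIX_POS(i) selects fixed counter i's overflow bit.

    uint64_t status = rdmsr64(GLOBAL_STATUS);
    bool fixed0_overflowed     = (status & CTR_FIX_POS(0)) != 0;
    bool configurable_overflow = (status & ((UINT64_C(1) << 4) - 1)) != 0;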
index 50557b01077a0d434a606b6296b069efe0a790b3..87298757b174eacc92f7274de743d11972286acf 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <libkern/kernel_mach_header.h>
 
 #include <pexpert/i386/efi.h>
-
+#include <libkern/section_keywords.h>
 #if MACH_ASSERT
 int pmap_stats_assert = 1;
 #endif /* MACH_ASSERT */
@@ -192,11 +192,11 @@ uint32_t npvhashmask = 0, npvhashbuckets = 0;
 
 pv_hashed_entry_t       pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
 pv_hashed_entry_t       pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
-decl_simple_lock_data(, pv_hashed_free_list_lock)
-decl_simple_lock_data(, pv_hashed_kern_free_list_lock)
-decl_simple_lock_data(, pv_hash_table_lock)
+decl_simple_lock_data(, pv_hashed_free_list_lock);
+decl_simple_lock_data(, pv_hashed_kern_free_list_lock);
+decl_simple_lock_data(, pv_hash_table_lock);
 
-decl_simple_lock_data(, phys_backup_lock)
+decl_simple_lock_data(, phys_backup_lock);
 
 zone_t          pv_hashed_list_zone;    /* zone of pv_hashed_entry structures */
 
@@ -229,7 +229,7 @@ pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
 #define current_pmap()          (vm_map_pmap(current_thread()->map))
 
 struct pmap     kernel_pmap_store;
-pmap_t          kernel_pmap;
+SECURITY_READ_ONLY_LATE(pmap_t)          kernel_pmap = NULL;
 
 struct zone     *pmap_zone;             /* zone of pmap structures */
 
@@ -244,16 +244,16 @@ int             pt_fake_zone_index = -1;
 
 extern  long    NMIPI_acks;
 
-boolean_t       kernel_text_ps_4K = TRUE;
+SECURITY_READ_ONLY_LATE(boolean_t)       kernel_text_ps_4K = TRUE;
 
 extern char     end;
 
 static int      nkpt;
 
 #if DEVELOPMENT || DEBUG
-boolean_t       pmap_disable_kheap_nx = FALSE;
-boolean_t       pmap_disable_kstack_nx = FALSE;
-boolean_t       wpkernel = TRUE;
+SECURITY_READ_ONLY_LATE(boolean_t)       pmap_disable_kheap_nx = FALSE;
+SECURITY_READ_ONLY_LATE(boolean_t)       pmap_disable_kstack_nx = FALSE;
+SECURITY_READ_ONLY_LATE(boolean_t)       wpkernel = TRUE;
 #else
 const boolean_t wpkernel = TRUE;
 #endif
@@ -410,7 +410,7 @@ pmap_bootstrap(
         */
 
        kernel_pmap = &kernel_pmap_store;
-       kernel_pmap->ref_count = 1;
+       os_ref_init(&kernel_pmap->ref_count, NULL);
 #if DEVELOPMENT || DEBUG
        kernel_pmap->nx_enabled = TRUE;
 #endif
@@ -699,6 +699,37 @@ hibernate_rebuild_pmap_structs(void)
 
 #endif
 
+/*
+ * Create pv entries for kernel pages mapped by early startup code.
+ * These have to exist so we can ml_static_mfree() them later.
+ */
+static void
+pmap_pv_fixup(vm_offset_t start_va, vm_offset_t end_va)
+{
+       ppnum_t           ppn;
+       pv_rooted_entry_t pv_h;
+       uint32_t          pgsz;
+
+       start_va = round_page(start_va);
+       end_va = trunc_page(end_va);
+       while (start_va < end_va) {
+               pgsz = PAGE_SIZE;
+               ppn = pmap_find_phys(kernel_pmap, start_va);
+               if (ppn != 0 && IS_MANAGED_PAGE(ppn)) {
+                       pv_h = pai_to_pvh(ppn);
+                       assert(pv_h->qlink.next == 0);           /* shouldn't be init'd yet */
+                       assert(pv_h->pmap == 0);
+                       pv_h->va_and_flags = start_va;
+                       pv_h->pmap = kernel_pmap;
+                       queue_init(&pv_h->qlink);
+                       if (pmap_query_pagesize(kernel_pmap, start_va) == I386_LPGBYTES) {
+                               pgsz = I386_LPGBYTES;
+                       }
+               }
+               start_va += pgsz;
+       }
+}
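A worked example of the edge handling above, with illustrative addresses and 4K pages (PAGE_SIZE == 0x1000): the range is shrunk inward, so pages only partially covered at either end get no pv entry.

    vm_offset_t lo = round_page(0x1234);    /* -> 0x2000, rounds up   */
    vm_offset_t hi = trunc_page(0xf800);    /* -> 0xf000, rounds down */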
+
 /*
  *     Initialize the pmap module.
  *     Called by vm_init, to initialize any structures that the pmap
@@ -793,7 +824,8 @@ pmap_init(void)
                                        last_managed_page = pn;
                                }
 
-                               if (pn >= lowest_hi && pn <= highest_hi) {
+                               if ((pmap_high_used_bottom <= pn && pn <= pmap_high_used_top) ||
+                                   (pmap_middle_used_bottom <= pn && pn <= pmap_middle_used_top)) {
                                        pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
                                }
                        }
@@ -843,19 +875,16 @@ pmap_init(void)
        zone_change(pv_hashed_list_zone, Z_NOENCRYPT, TRUE);
        zone_change(pv_hashed_list_zone, Z_GZALLOC_EXEMPT, TRUE);
 
-       /* create pv entries for kernel pages that might get pmap_remove()ed */
-       vaddr = (vm_map_offset_t) VM_MIN_KERNEL_ADDRESS;
-       for (ppn = VM_MIN_KERNEL_PAGE; ppn < i386_btop(avail_start); ppn++) {
-               pv_rooted_entry_t pv_h;
+       /*
+        * Create pv entries for kernel pages that might get pmap_remove()ed.
+        *
+        * - very low pages that were identity mapped.
+        * - vm_pages[] entries that might be unused and reclaimed.
+        */
+       assert((uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start <= (uintptr_t)vm_page_array_beginning_addr);
+       pmap_pv_fixup((uintptr_t)VM_MIN_KERNEL_ADDRESS, (uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start);
+       pmap_pv_fixup((uintptr_t)vm_page_array_beginning_addr, (uintptr_t)vm_page_array_ending_addr);
 
-               pv_h = pai_to_pvh(ppn);
-               assert(pv_h->qlink.next == 0);           /* shouldn't be init'd yet */
-               assert(pv_h->pmap == NULL);
-               pv_h->va_and_flags = vaddr;
-               vaddr += PAGE_SIZE;
-               pv_h->pmap = kernel_pmap;
-               queue_init(&pv_h->qlink);
-       }
        pmap_initialized = TRUE;
 
        max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
@@ -872,31 +901,6 @@ pmap_init(void)
 #endif /* CONFIG_VMX */
 }
 
-/*
- * Create pv entries for kernel pages mapped by low level
- * startup code.  These have to exist so we can pmap_remove() them.
- */
-void
-pmap_pv_fixup(vm_offset_t start, vm_size_t length)
-{
-       ppnum_t           ppn;
-       pv_rooted_entry_t pv_h;
-
-       while (length != 0) {
-               ppn = pmap_find_phys(kernel_pmap, start);
-               if (ppn != 0) {
-                       pv_h = pai_to_pvh(ppn);
-                       assert(pv_h->qlink.next == 0);           /* shouldn't be init'd yet */
-                       assert(pv_h->pmap == 0);
-                       pv_h->va_and_flags = start;
-                       pv_h->pmap = kernel_pmap;
-                       queue_init(&pv_h->qlink);
-               }
-               start += PAGE_SIZE;
-               length -= PAGE_SIZE;
-       }
-}
-
 static
 void
 pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro)
@@ -939,6 +943,24 @@ pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolea
        DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
 }
 
+/*
+ * Reclaim memory for early boot 4K page tables that were converted to large page mappings.
+ * We know this memory is part of the KPTphys[] array that was allocated in Idle_PTs_init(),
+ * so we can free it using its address in that array.
+ */
+static void
+pmap_free_early_PT(ppnum_t ppn, uint32_t cnt)
+{
+       ppnum_t KPTphys_ppn;
+       vm_offset_t offset;
+
+       KPTphys_ppn = pmap_find_phys(kernel_pmap, (uintptr_t)KPTphys);
+       assert(ppn >= KPTphys_ppn);
+       assert(ppn + cnt <= KPTphys_ppn + NKPT);
+       offset = (ppn - KPTphys_ppn) << PAGE_SHIFT;
+       ml_static_mfree((uintptr_t)KPTphys + offset, PAGE_SIZE * cnt);
+}
+
 /*
  * Called once VM is fully initialized so that we can release unused
  * sections of low memory to the general pool.
@@ -985,7 +1007,7 @@ pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolea
  * The now unused level-1 PTE pages are also freed.
  */
 extern ppnum_t  vm_kernel_base_page;
-static uint32_t constptes = 0, dataptes = 0;
+static uint32_t dataptes = 0;
 
 void
 pmap_lowmem_finalize(void)
@@ -1059,6 +1081,14 @@ pmap_lowmem_finalize(void)
         */
        pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);
 
+       /*
+        * Release any memory for early boot 4K page table pages that got replaced
+        * with large page mappings for vm_pages[]. We know this memory is part of
+        * the KPTphys[] array that was allocated in Idle_PTs_init(), so we can free
+        * it using that address.
+        */
+       pmap_free_early_PT(released_PT_ppn, released_PT_cnt);
+
        /*
         * If text and data are both 2MB-aligned,
         * we can map text with large-pages,
@@ -1123,8 +1153,10 @@ pmap_lowmem_finalize(void)
                        vm_offset_t     pte_phys;
                        pt_entry_t      *pdep;
                        pt_entry_t      pde;
+                       ppnum_t         KPT_ppn;
 
                        pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
+                       KPT_ppn = (ppnum_t)((*pdep & PG_FRAME) >> PAGE_SHIFT);
                        ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
                        DBG("myva: %p pdep: %p ptep: %p\n",
                            (void *) myva, (void *) pdep, (void *) ptep);
@@ -1145,34 +1177,14 @@ pmap_lowmem_finalize(void)
 
                        /*
                         * Free the now-unused level-1 pte.
-                        * Note: ptep is a virtual address to the pte in the
-                        *   recursive map. We can't use this address to free
-                        *   the page. Instead we need to compute its address
-                        *   in the Idle PTEs in "low memory".
                         */
-                       vm_offset_t vm_ptep = (vm_offset_t) KPTphys
-                           + (pte_phys >> PTPGSHIFT);
-                       DBG("ml_static_mfree(%p,0x%x) for pte\n",
-                           (void *) vm_ptep, PAGE_SIZE);
-                       ml_static_mfree(vm_ptep, PAGE_SIZE);
+                       pmap_free_early_PT(KPT_ppn, 1);
                }
 
                /* Change variable read by sysctl machdep.pmap */
                pmap_kernel_text_ps = I386_LPGBYTES;
        }
 
-       boolean_t doconstro = TRUE;
-#if DEVELOPMENT || DEBUG
-       (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
-#endif
-       if (doconstro) {
-               if (sconst & PAGE_MASK) {
-                       panic("CONST segment misaligned 0x%lx 0x%lx\n",
-                           sconst, econst);
-               }
-               kprintf("Marking const DATA read-only\n");
-       }
-
        vm_offset_t dva;
 
        for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
@@ -1187,20 +1199,6 @@ pmap_lowmem_finalize(void)
        }
        assert(dataptes > 0);
 
-       for (dva = sconst; dva < econst; dva += I386_PGBYTES) {
-               pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
-
-               dpte = *dptep;
-
-               assert((dpte & INTEL_PTE_VALID));
-               dpte |= INTEL_PTE_NX;
-               dpte &= ~INTEL_PTE_WRITE;
-               constptes++;
-               pmap_store_pte(dptep, dpte);
-       }
-
-       assert(constptes > 0);
-
        kernel_segment_command_t * seg;
        kernel_section_t         * sec;
 
@@ -1255,6 +1253,25 @@ pmap_lowmem_finalize(void)
        splx(spl);
 }
 
+/*
+ *     Mark the const data segment as read-only, non-executable.
+ */
+void
+x86_64_protect_data_const()
+{
+       boolean_t doconstro = TRUE;
+#if DEVELOPMENT || DEBUG
+       (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
+#endif
+       if (doconstro) {
+               if (sconst & PAGE_MASK) {
+                       panic("CONST segment misaligned 0x%lx 0x%lx\n",
+                           sconst, econst);
+               }
+               kprintf("Marking const DATA read-only\n");
+               pmap_protect(kernel_pmap, sconst, econst, VM_PROT_READ);
+       }
+}
 /*
  * this function is only used for debugging from the vm layer
  */
@@ -1285,7 +1302,6 @@ pmap_verify_free(
        return result;
 }
 
-
 #if MACH_ASSERT
 void
 pmap_assert_free(ppnum_t pn)
@@ -1401,6 +1417,22 @@ hv_ept_pmap_create(void **ept_pmap, void **eptp)
        return;
 }
 
+/*
+ * pmap_create() is used by some special, legacy 3rd party kexts.
+ * In our kernel code, always use pmap_create_options().
+ */
+extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit);
+
+__attribute__((used))
+pmap_t
+pmap_create(
+       ledger_t      ledger,
+       vm_map_size_t sz,
+       boolean_t     is_64bit)
+{
+       return pmap_create_options(ledger, sz, is_64bit ? PMAP_CREATE_64BIT : 0);
+}
+
 /*
  *     Create and return a physical map.
  *
@@ -1418,7 +1450,7 @@ pmap_t
 pmap_create_options(
        ledger_t        ledger,
        vm_map_size_t   sz,
-       int             flags)
+       unsigned int    flags)
 {
        pmap_t          p;
        vm_size_t       size;
@@ -1457,8 +1489,7 @@ pmap_create_options(
        p->pmap_rwl.lck_rw_can_sleep = FALSE;
 
        bzero(&p->stats, sizeof(p->stats));
-
-       p->ref_count = 1;
+       os_ref_init(&p->ref_count, NULL);
 #if DEVELOPMENT || DEBUG
        p->nx_enabled = 1;
 #endif
@@ -1542,15 +1573,6 @@ pmap_create_options(
        return p;
 }
 
-pmap_t
-pmap_create(
-       ledger_t        ledger,
-       vm_map_size_t   sz,
-       boolean_t       is_64bit)
-{
-       return pmap_create_options(ledger, sz, ((is_64bit) ? PMAP_CREATE_64BIT : 0));
-}
-
 /*
  * We maintain stats and ledgers so that a task's physical footprint is:
  * phys_footprint = ((internal - alternate_accounting)
@@ -1563,114 +1585,6 @@ pmap_create(
  */
 
 #if MACH_ASSERT
-struct {
-       uint64_t        num_pmaps_checked;
-
-       int             phys_footprint_over;
-       ledger_amount_t phys_footprint_over_total;
-       ledger_amount_t phys_footprint_over_max;
-       int             phys_footprint_under;
-       ledger_amount_t phys_footprint_under_total;
-       ledger_amount_t phys_footprint_under_max;
-
-       int             internal_over;
-       ledger_amount_t internal_over_total;
-       ledger_amount_t internal_over_max;
-       int             internal_under;
-       ledger_amount_t internal_under_total;
-       ledger_amount_t internal_under_max;
-
-       int             internal_compressed_over;
-       ledger_amount_t internal_compressed_over_total;
-       ledger_amount_t internal_compressed_over_max;
-       int             internal_compressed_under;
-       ledger_amount_t internal_compressed_under_total;
-       ledger_amount_t internal_compressed_under_max;
-
-       int             iokit_mapped_over;
-       ledger_amount_t iokit_mapped_over_total;
-       ledger_amount_t iokit_mapped_over_max;
-       int             iokit_mapped_under;
-       ledger_amount_t iokit_mapped_under_total;
-       ledger_amount_t iokit_mapped_under_max;
-
-       int             alternate_accounting_over;
-       ledger_amount_t alternate_accounting_over_total;
-       ledger_amount_t alternate_accounting_over_max;
-       int             alternate_accounting_under;
-       ledger_amount_t alternate_accounting_under_total;
-       ledger_amount_t alternate_accounting_under_max;
-
-       int             alternate_accounting_compressed_over;
-       ledger_amount_t alternate_accounting_compressed_over_total;
-       ledger_amount_t alternate_accounting_compressed_over_max;
-       int             alternate_accounting_compressed_under;
-       ledger_amount_t alternate_accounting_compressed_under_total;
-       ledger_amount_t alternate_accounting_compressed_under_max;
-
-       int             page_table_over;
-       ledger_amount_t page_table_over_total;
-       ledger_amount_t page_table_over_max;
-       int             page_table_under;
-       ledger_amount_t page_table_under_total;
-       ledger_amount_t page_table_under_max;
-
-       int             purgeable_volatile_over;
-       ledger_amount_t purgeable_volatile_over_total;
-       ledger_amount_t purgeable_volatile_over_max;
-       int             purgeable_volatile_under;
-       ledger_amount_t purgeable_volatile_under_total;
-       ledger_amount_t purgeable_volatile_under_max;
-
-       int             purgeable_nonvolatile_over;
-       ledger_amount_t purgeable_nonvolatile_over_total;
-       ledger_amount_t purgeable_nonvolatile_over_max;
-       int             purgeable_nonvolatile_under;
-       ledger_amount_t purgeable_nonvolatile_under_total;
-       ledger_amount_t purgeable_nonvolatile_under_max;
-
-       int             purgeable_volatile_compressed_over;
-       ledger_amount_t purgeable_volatile_compressed_over_total;
-       ledger_amount_t purgeable_volatile_compressed_over_max;
-       int             purgeable_volatile_compressed_under;
-       ledger_amount_t purgeable_volatile_compressed_under_total;
-       ledger_amount_t purgeable_volatile_compressed_under_max;
-
-       int             purgeable_nonvolatile_compressed_over;
-       ledger_amount_t purgeable_nonvolatile_compressed_over_total;
-       ledger_amount_t purgeable_nonvolatile_compressed_over_max;
-       int             purgeable_nonvolatile_compressed_under;
-       ledger_amount_t purgeable_nonvolatile_compressed_under_total;
-       ledger_amount_t purgeable_nonvolatile_compressed_under_max;
-
-       int             network_volatile_over;
-       ledger_amount_t network_volatile_over_total;
-       ledger_amount_t network_volatile_over_max;
-       int             network_volatile_under;
-       ledger_amount_t network_volatile_under_total;
-       ledger_amount_t network_volatile_under_max;
-
-       int             network_nonvolatile_over;
-       ledger_amount_t network_nonvolatile_over_total;
-       ledger_amount_t network_nonvolatile_over_max;
-       int             network_nonvolatile_under;
-       ledger_amount_t network_nonvolatile_under_total;
-       ledger_amount_t network_nonvolatile_under_max;
-
-       int             network_volatile_compressed_over;
-       ledger_amount_t network_volatile_compressed_over_total;
-       ledger_amount_t network_volatile_compressed_over_max;
-       int             network_volatile_compressed_under;
-       ledger_amount_t network_volatile_compressed_under_total;
-       ledger_amount_t network_volatile_compressed_under_max;
-
-       int             network_nonvolatile_compressed_over;
-       ledger_amount_t network_nonvolatile_compressed_over_total;
-       ledger_amount_t network_nonvolatile_compressed_over_max;
-       int             network_nonvolatile_compressed_under;
-       ledger_amount_t network_nonvolatile_compressed_under_total;
-       ledger_amount_t network_nonvolatile_compressed_under_max;
-} pmap_ledgers_drift;
 static void pmap_check_ledgers(pmap_t pmap);
 #else /* MACH_ASSERT */
 static inline void
@@ -1689,7 +1603,7 @@ extern int vm_wired_objects_page_count;
 void
 pmap_destroy(pmap_t     p)
 {
-       int             c;
+       os_ref_count_t c;
 
        if (p == PMAP_NULL) {
                return;
@@ -1700,7 +1614,7 @@ pmap_destroy(pmap_t     p)
 
        PMAP_LOCK_EXCLUSIVE(p);
 
-       c = --p->ref_count;
+       c = os_ref_release_locked(&p->ref_count);
 
        pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
 
@@ -1762,7 +1676,7 @@ pmap_reference(pmap_t   p)
 {
        if (p != PMAP_NULL) {
                PMAP_LOCK_EXCLUSIVE(p);
-               p->ref_count++;
+               os_ref_retain_locked(&p->ref_count);
               PMAP_UNLOCK_EXCLUSIVE(p);
        }
 }
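The os_refcnt lifecycle adopted above, as a hedged sketch assuming the libkern os_refcnt API (the NULL grouping argument matches the calls in this diff):

    os_ref_init(&p->ref_count, NULL);           /* count starts at 1 */
    os_ref_retain_locked(&p->ref_count);        /* +1, pmap lock held */
    if (os_ref_release_locked(&p->ref_count) == 0) {
            /* last reference dropped: tear the pmap down */
    }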
@@ -2273,73 +2187,148 @@ pmap_expand(
 
        return KERN_SUCCESS;
 }
-
-/* On K64 machines with more than 32GB of memory, pmap_steal_memory
- * will allocate past the 1GB of pre-expanded virtual kernel area. This
- * function allocates all the page tables using memory from the same pool
- * that pmap_steal_memory uses, rather than calling vm_page_grab (which
- * isn't available yet). */
-void
-pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr)
+/*
+ * Query a pmap to see what size a given virtual address is mapped with.
+ * If the vaddr is not mapped, returns 0.
+ */
+vm_size_t
+pmap_query_pagesize(
+       pmap_t          pmap,
+       vm_map_offset_t vaddr)
 {
-       ppnum_t pn;
-       pt_entry_t              *pte;
-       boolean_t               is_ept = is_ept_pmap(pmap);
+       pd_entry_t      *pdep;
+       vm_size_t       size = 0;
 
+       assert(!is_ept_pmap(pmap));
        PMAP_LOCK_EXCLUSIVE(pmap);
 
+       pdep = pmap_pde(pmap, vaddr);
+       if (pdep != PD_ENTRY_NULL) {
+               if (*pdep & INTEL_PTE_PS) {
+                       size = I386_LPGBYTES;
+               } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
+                       size = I386_PGBYTES;
+               }
+       }
+
+       PMAP_UNLOCK_EXCLUSIVE(pmap);
+
+       return size;
+}
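A hedged usage sketch, mirroring how pmap_pv_fixup() above steps through the kernel map: 0 means unmapped, I386_PGBYTES a 4K mapping, and I386_LPGBYTES a 2MB large-page mapping.

    vm_size_t   sz   = pmap_query_pagesize(kernel_pmap, start_va);
    vm_offset_t step = (sz == I386_LPGBYTES) ? I386_LPGBYTES : PAGE_SIZE;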
+
+/*
+ * Ensure the page table hierarchy is filled in down to
+ * the large page level. Additionally returns FAILURE if
+ * a lower page table already exists.
+ */
+static kern_return_t
+pmap_pre_expand_large_internal(
+       pmap_t          pmap,
+       vm_map_offset_t vaddr)
+{
+       ppnum_t         pn;
+       pt_entry_t      *pte;
+       boolean_t       is_ept = is_ept_pmap(pmap);
+       kern_return_t   kr = KERN_SUCCESS;
+
        if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
-               if (!pmap_next_page_hi(&pn)) {
-                       panic("pmap_pre_expand");
+               if (!pmap_next_page_hi(&pn, FALSE)) {
+                       panic("pmap_pre_expand_large no PDPT");
                }
 
                pmap_zero_page(pn);
 
                pte = pmap64_pml4(pmap, vaddr);
 
-               pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
-                   | PTE_READ(is_ept)
-                   | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
-                   PTE_WRITE(is_ept));
+               pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
+                   PTE_READ(is_ept) |
+                   (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
+                   PTE_WRITE(is_ept));
 
                pte = pmap64_user_pml4(pmap, vaddr);
 
-               pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
-                   | PTE_READ(is_ept)
-                   | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
-                   PTE_WRITE(is_ept));
+               pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
+                   PTE_READ(is_ept) |
+                   (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
+                   PTE_WRITE(is_ept));
        }
 
        if (pmap_pde(pmap, vaddr) == PD_ENTRY_NULL) {
-               if (!pmap_next_page_hi(&pn)) {
-                       panic("pmap_pre_expand");
+               if (!pmap_next_page_hi(&pn, FALSE)) {
+                       panic("pmap_pre_expand_large no PDE");
                }
 
                pmap_zero_page(pn);
 
                pte = pmap64_pdpt(pmap, vaddr);
 
-               pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
-                   | PTE_READ(is_ept)
-                   | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
-                   | PTE_WRITE(is_ept));
+               pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
+                   PTE_READ(is_ept) |
+                   (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
+                   PTE_WRITE(is_ept));
+       } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
+               kr = KERN_FAILURE;
        }
 
-       if (pmap_pte(pmap, vaddr) == PT_ENTRY_NULL) {
-               if (!pmap_next_page_hi(&pn)) {
-                       panic("pmap_pre_expand");
-               }
+       return kr;
+}
 
-               pmap_zero_page(pn);
+/*
+ * Wrapper that locks the pmap.
+ */
+kern_return_t
+pmap_pre_expand_large(
+       pmap_t          pmap,
+       vm_map_offset_t vaddr)
+{
+       kern_return_t   kr;
+
+       PMAP_LOCK_EXCLUSIVE(pmap);
+       kr = pmap_pre_expand_large_internal(pmap, vaddr);
+       PMAP_UNLOCK_EXCLUSIVE(pmap);
+       return kr;
+}
+
+/*
+ * On large memory machines, pmap_steal_memory() will allocate past
+ * the 1GB of pre-allocated/mapped virtual kernel area. This function
+ * expands the kernel page tables to cover a given vaddr. It uses pages
+ * from the same pool that pmap_steal_memory() uses, since vm_page_grab()
+ * isn't available yet.
+ */
+void
+pmap_pre_expand(
+       pmap_t          pmap,
+       vm_map_offset_t vaddr)
+{
+       ppnum_t         pn;
+       pt_entry_t      *pte;
+       boolean_t       is_ept = is_ept_pmap(pmap);
 
-               pte = pmap_pde(pmap, vaddr);
+       /*
+        * This returns failure if a 4K page table already exists.
+        * Otherwise it fills in the page table hierarchy down
+        * to that level.
+        */
+       PMAP_LOCK_EXCLUSIVE(pmap);
+       if (pmap_pre_expand_large_internal(pmap, vaddr) == KERN_FAILURE) {
+               PMAP_UNLOCK_EXCLUSIVE(pmap);
+               return;
+       }
 
-               pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
-                   | PTE_READ(is_ept)
-                   | (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER)
-                   | PTE_WRITE(is_ept));
+       /* Add the lowest table */
+       if (!pmap_next_page_hi(&pn, FALSE)) {
+               panic("pmap_pre_expand");
        }
 
+       pmap_zero_page(pn);
+
+       pte = pmap_pde(pmap, vaddr);
+
+       pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) |
+           PTE_READ(is_ept) |
+           (is_ept ? INTEL_EPT_EX : INTEL_PTE_USER) |
+           PTE_WRITE(is_ept));
        PMAP_UNLOCK_EXCLUSIVE(pmap);
 }
 
@@ -2367,124 +2356,6 @@ pmap_sync_page_attributes_phys(ppnum_t pa)
        cache_flush_page_phys(pa);
 }
 
-
-
-#ifdef CURRENTLY_UNUSED_AND_UNTESTED
-
-int     collect_ref;
-int     collect_unref;
-
-/*
- *     Routine:        pmap_collect
- *     Function:
- *             Garbage collects the physical map system for
- *             pages which are no longer used.
- *             Success need not be guaranteed -- that is, there
- *             may well be pages which are not referenced, but
- *             others may be collected.
- *     Usage:
- *             Called by the pageout daemon when pages are scarce.
- */
-void
-pmap_collect(
-       pmap_t          p)
-{
-       pt_entry_t              *pdp, *ptp;
-       pt_entry_t              *eptp;
-       int                     wired;
-       boolean_t               is_ept;
-
-       if (p == PMAP_NULL) {
-               return;
-       }
-
-       if (p == kernel_pmap) {
-               return;
-       }
-
-       is_ept = is_ept_pmap(p);
-
-       /*
-        *      Garbage collect map.
-        */
-       PMAP_LOCK(p);
-
-       for (pdp = (pt_entry_t *)p->dirbase;
-           pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI + 1)];
-           pdp++) {
-               if (*pdp & PTE_VALID_MASK(is_ept)) {
-                       if (*pdp & PTE_REF(is_ept)) {
-                               pmap_store_pte(pdp, *pdp & ~PTE_REF(is_ept));
-                               collect_ref++;
-                       } else {
-                               collect_unref++;
-                               ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
-                               eptp = ptp + NPTEPG;
-
-                               /*
-                                * If the pte page has any wired mappings, we cannot
-                                * free it.
-                                */
-                               wired = 0;
-                               {
-                                       pt_entry_t *ptep;
-                                       for (ptep = ptp; ptep < eptp; ptep++) {
-                                               if (iswired(*ptep)) {
-                                                       wired = 1;
-                                                       break;
-                                               }
-                                       }
-                               }
-                               if (!wired) {
-                                       /*
-                                        * Remove the virtual addresses mapped by this pte page.
-                                        */
-                                       pmap_remove_range(p,
-                                           pdetova(pdp - (pt_entry_t *)p->dirbase),
-                                           ptp,
-                                           eptp);
-
-                                       /*
-                                        * Invalidate the page directory pointer.
-                                        */
-                                       pmap_store_pte(pdp, 0x0);
-
-                                       PMAP_UNLOCK(p);
-
-                                       /*
-                                        * And free the pte page itself.
-                                        */
-                                       {
-                                               vm_page_t m;
-
-                                               vm_object_lock(p->pm_obj);
-
-                                               m = vm_page_lookup(p->pm_obj, (vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]) * PAGE_SIZE);
-                                               if (m == VM_PAGE_NULL) {
-                                                       panic("pmap_collect: pte page not in object");
-                                               }
-
-                                               vm_object_unlock(p->pm_obj);
-
-                                               VM_PAGE_FREE(m);
-
-                                               OSAddAtomic(-1, &inuse_ptepages_count);
-                                               PMAP_ZINFO_PFREE(p, PAGE_SIZE);
-                                       }
-
-                                       PMAP_LOCK(p);
-                               }
-                       }
-               }
-       }
-
-       PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
-       PMAP_UNLOCK(p);
-       return;
-}
-#endif
-
-
 void
 pmap_copy_page(ppnum_t src, ppnum_t dst)
 {
@@ -3224,10 +3095,8 @@ static void
 pmap_check_ledgers(
        pmap_t pmap)
 {
-       ledger_amount_t bal;
-       int             pid;
-       char            *procname;
-       boolean_t       do_panic;
+       int     pid;
+       char    *procname;
 
        if (pmap->pmap_pid == 0) {
                /*
@@ -3245,73 +3114,10 @@ pmap_check_ledgers(
                return;
        }
 
-       do_panic = FALSE;
        pid = pmap->pmap_pid;
        procname = pmap->pmap_procname;
 
-       pmap_ledgers_drift.num_pmaps_checked++;
-
-#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
-MACRO_BEGIN                                                             \
-       int panic_on_negative = TRUE;                                   \
-       ledger_get_balance(pmap->ledger,                                \
-                          task_ledgers.__LEDGER,                       \
-                          &bal);                                       \
-       ledger_get_panic_on_negative(pmap->ledger,                      \
-                                    task_ledgers.__LEDGER,             \
-                                    &panic_on_negative);               \
-       if (bal != 0) {                                                 \
-               if (panic_on_negative ||                                \
-                   (pmap_ledgers_panic &&                              \
-                    pmap_ledgers_panic_leeway > 0 &&                   \
-                    (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
-                     bal < (pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
-                       do_panic = TRUE;                                \
-               }                                                       \
-               printf("LEDGER BALANCE proc %d (%s) "                   \
-                      "\"%s\" = %lld\n",                               \
-                      pid, procname, #__LEDGER, bal);                  \
-               if (bal > 0) {                                          \
-                       pmap_ledgers_drift.__LEDGER##_over++;           \
-                       pmap_ledgers_drift.__LEDGER##_over_total += bal; \
-                       if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
-                               pmap_ledgers_drift.__LEDGER##_over_max = bal; \
-                       }                                               \
-               } else if (bal < 0) {                                   \
-                       pmap_ledgers_drift.__LEDGER##_under++;          \
-                       pmap_ledgers_drift.__LEDGER##_under_total += bal; \
-                       if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
-                               pmap_ledgers_drift.__LEDGER##_under_max = bal; \
-                       }                                               \
-               }                                                       \
-       }                                                               \
-MACRO_END
-
-       LEDGER_CHECK_BALANCE(phys_footprint);
-       LEDGER_CHECK_BALANCE(internal);
-       LEDGER_CHECK_BALANCE(internal_compressed);
-       LEDGER_CHECK_BALANCE(iokit_mapped);
-       LEDGER_CHECK_BALANCE(alternate_accounting);
-       LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
-       LEDGER_CHECK_BALANCE(page_table);
-       LEDGER_CHECK_BALANCE(purgeable_volatile);
-       LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
-       LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
-       LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
-       LEDGER_CHECK_BALANCE(network_volatile);
-       LEDGER_CHECK_BALANCE(network_nonvolatile);
-       LEDGER_CHECK_BALANCE(network_volatile_compressed);
-       LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
-
-       if (do_panic) {
-               if (pmap_ledgers_panic) {
-                       panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
-                           pmap, pid, procname);
-               } else {
-                       printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
-                           pmap, pid, procname);
-               }
-       }
+       vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
 
        if (pmap->stats.resident_count != 0 ||
 #if 35156815
@@ -3464,3 +3270,44 @@ pmap_load_image4_trust_cache(struct pmap_image4_trust_cache __unused *trust_cach
        return PMAP_TC_UNKNOWN_FORMAT;
 }
 
+
+bool
+pmap_is_trust_cache_loaded(const uuid_t __unused uuid)
+{
+       // Unsupported on this architecture.
+       return false;
+}
+
+bool
+pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])
+{
+       // Unsupported on this architecture.
+       return false;
+}
+
+uint32_t
+pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])
+{
+       // Unsupported on this architecture.
+       return 0;
+}
+
+bool
+pmap_in_ppl(void)
+{
+       // Nonexistent on this architecture.
+       return false;
+}
+
+void *
+pmap_claim_reserved_ppl_page(void)
+{
+       // Unsupported on this architecture.
+       return NULL;
+}
+
+void
+pmap_free_reserved_ppl_page(void __unused *kva)
+{
+       // Unsupported on this architecture.
+}
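
These x86 stubs pin down the contract of the arm64 PPL/trust-cache interface:
callers may probe unconditionally and simply get constant "absent" answers on
this architecture. A minimal caller sketch, assuming a hypothetical
check_cdhash() helper (illustrative only, not part of this commit):

	// Try the runtime-loaded trust caches first, then the static one.
	// The uint32_t return of the static lookup leaves room for per-entry
	// flags on arm64; here it is always 0, i.e. not found.
	static bool
	check_cdhash(const uint8_t cdhash[20])
	{
		if (pmap_lookup_in_loaded_trust_caches(cdhash)) {
			return true;
		}
		return pmap_lookup_in_static_trust_cache(cdhash) != 0;
	}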
index e569811d6d0c8454f59b58525d266d5e6833717b..1fe7142ac664f8b9757713d163a5c19fce424599 100644 (file)
@@ -52,6 +52,24 @@ consistent_debug_allocate_entry(void)
        return NULL;
 }
 
+boolean_t
+PE_consistent_debug_lookup_entry(uint64_t record_id, uint64_t *phys_addr, uint64_t *length)
+{
+       assert(phys_addr != NULL);
+       assert(length != NULL);
+
+       for (unsigned int i = 0; i < consistent_debug_registry->top_level_header.num_records; i++) {
+               if (consistent_debug_registry->records[i].record_id == record_id) {
+                       *phys_addr = consistent_debug_registry->records[i].physaddr;
+                       *length = consistent_debug_registry->records[i].length;
+
+                       return true;
+               }
+       }
+
+       return false;
+}
+
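PE_consistent_debug_lookup_entry() is the read-side counterpart of
PE_consistent_debug_register(): a linear scan of the registry that returns the
physical address and length recorded for a given id. Usage sketch, modeled on
the bridgeOS panic-region lookup later in this change:

	uint64_t pa = 0, len = 0;
	if (PE_consistent_debug_lookup_entry(kDbgIdMacOSPanicRegion, &pa, &len)) {
		/* Map read-only before dereferencing; the region belongs to
		 * the other OS side. (Guard with PE_consistent_debug_enabled()
		 * first, as the real caller does.) */
		uint64_t va = ml_io_map_with_prot(pa, len, VM_PROT_READ);
	}
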
 int
 PE_consistent_debug_inherit(void)
 {
index 4328c7f2fd268b16e8f4ce096076a334e337ec09..34ec23be76a04290cb10264e771543a16c91a20f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2019 Apple Inc. All rights reserved.
  * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
  */
 #include <pexpert/pexpert.h>
@@ -13,6 +13,7 @@
 #include <pexpert/arm64/board_config.h>
 #endif
 
+#include <kern/clock.h>
 #include <machine/machine_routines.h>
 #if DEVELOPMENT || DEBUG
 #include <kern/simple_lock.h>
@@ -33,7 +34,7 @@ static uint32_t gTCFG0Value;
 static uint32_t pe_arm_init_timer(void *args);
 
 #if DEVELOPMENT || DEBUG
-decl_simple_lock_data(, panic_trace_lock; )
+decl_simple_lock_data(, panic_hook_lock);
 #endif
 /*
  * pe_identify_machine:
@@ -96,11 +97,6 @@ pe_identify_machine(boot_args * bootArgs)
                hclk = mclk / 4;
                pclk = hclk / 2;
                tclk = 100000;  /* timer is at 100khz */
-       } else if (!strcmp(gPESoCDeviceType, "bcm2837-io")) {
-               mclk = 1200000000;
-               hclk = mclk / 4;
-               pclk = hclk / 2;
-               tclk = 1000000;
        } else {
                use_dt = 1;
        }
@@ -278,9 +274,6 @@ pe_arm_get_soc_revision(void)
 
 extern void     fleh_fiq_generic(void);
 
-#if defined(ARM_BOARD_CLASS_S5L8960X)
-static struct tbd_ops    s5l8960x_funcs = {NULL, NULL, NULL};
-#endif /* defined(ARM_BOARD_CLASS_S5L8960X) */
 
 #if defined(ARM_BOARD_CLASS_T7000)
 static struct tbd_ops    t7000_funcs = {NULL, NULL, NULL};
@@ -321,6 +314,9 @@ static struct tbd_ops    t8015_funcs = {NULL, NULL, NULL};
 
 
 
+
+
+
 #if defined(ARM_BOARD_CLASS_BCM2837)
 static struct tbd_ops    bcm2837_funcs = {NULL, NULL, NULL};
 #endif /* defined(ARM_BOARD_CLASS_BCM2837) */
@@ -341,23 +337,31 @@ typedef enum{
 } panic_trace_t;
 static panic_trace_t bootarg_panic_trace;
 
+static int bootarg_stop_clocks;
+
 // The command buffer contains the converted commands from the device tree for commanding cpu_halt, enable_trace, etc.
 #define DEBUG_COMMAND_BUFFER_SIZE 256
 typedef struct command_buffer_element {
        uintptr_t address;
-       uint16_t destination_cpu_selector;
        uintptr_t value;
+       uint16_t destination_cpu_selector;
+       uint16_t delay_us;
+       bool is_32bit;
 } command_buffer_element_t;
 static command_buffer_element_t debug_command_buffer[DEBUG_COMMAND_BUFFER_SIZE];                // statically allocate to prevent needing alloc at runtime
-static uint32_t  next_command_bufffer_entry = 0;                                                                                // index of next unused slot in debug_command_buffer
+static uint32_t  next_command_buffer_entry = 0;                                                                                // index of next unused slot in debug_command_buffer
 
-#define CPU_SELECTOR_SHIFT                              ((sizeof(int)-2)*8)
-#define CPU_SELECTOR_MASK                               (0xFFFF << CPU_SELECTOR_SHIFT)
-#define REGISTER_OFFSET_MASK                    (~CPU_SELECTOR_MASK)
+#define CPU_SELECTOR_SHIFT              (16)
+#define CPU_SELECTOR_MASK               (0xFFFF << CPU_SELECTOR_SHIFT)
+#define REGISTER_OFFSET_MASK            ((1 << CPU_SELECTOR_SHIFT) - 1)
 #define REGISTER_OFFSET(register_prop)  (register_prop & REGISTER_OFFSET_MASK)
-#define CPU_SELECTOR(register_offset)   (register_offset >> CPU_SELECTOR_SHIFT) // Upper 16bits holds the cpu selector
-#define MAX_WINDOW_SIZE                                 0xFFFF
-#define PE_ISSPACE(c)                                   (c == ' ' || c == '\t' || c == '\n' || c == '\12')
+#define CPU_SELECTOR(register_offset)   ((register_offset & CPU_SELECTOR_MASK) >> CPU_SELECTOR_SHIFT) // Upper 16bits holds the cpu selector
+#define MAX_WINDOW_SIZE                 0xFFFF
+#define PE_ISSPACE(c)                   (c == ' ' || c == '\t' || c == '\n' || c == '\12')
+#define DELAY_SHIFT                     (32)
+#define DELAY_MASK                      (0xFFFFULL << DELAY_SHIFT)
+#define DELAY_US(register_offset)       ((register_offset & DELAY_MASK) >> DELAY_SHIFT)
+#define REGISTER_32BIT_MASK             (1ULL << 63)
 /*
  *  0x0000 - all cpus
  *  0x0001 - cpu 0
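
Each 64-bit register property now packs four fields rather than two: the low
16 bits hold the register offset, the next 16 the cpu selector, bits 32-47 an
optional post-write delay in microseconds, and bit 63 a flag forcing a 32-bit
store. The delay and 32-bit fields are only decoded on __arm64__ builds. A
worked example (the value is illustrative):

	uint64_t reg_prop = 0x8000002500010040ULL;
	// bits  0-15: register offset        -> REGISTER_OFFSET() == 0x40
	// bits 16-31: cpu selector           -> 0x0001, i.e. cpu 0 only
	// bits 32-47: post-write delay in us -> DELAY_US() == 0x25 == 37
	// bit     63: force a 32-bit store   -> REGISTER_32BIT_MASK is set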
@@ -376,6 +380,8 @@ static command_buffer_element_t *cpu_halt;
 static command_buffer_element_t *enable_trace;
 static command_buffer_element_t *enable_alt_trace;
 static command_buffer_element_t *trace_halt;
+static command_buffer_element_t *enable_stop_clocks;
+static command_buffer_element_t *stop_clocks;
 
 // Record which CPU is currently running one of our debug commands, so we can trap panic reentrancy to PE_arm_debug_panic_hook.
 static int running_debug_command_on_cpu_number = -1;
@@ -396,13 +402,13 @@ pe_init_debug_command(DTEntry entryP, command_buffer_element_t **command_buffer,
        }
 
        // make sure command will fit
-       if (next_command_bufffer_entry + prop_size / sizeof(uintptr_t) > DEBUG_COMMAND_BUFFER_SIZE - 1) {
+       if (next_command_buffer_entry + prop_size / sizeof(uintptr_t) > DEBUG_COMMAND_BUFFER_SIZE - 1) {
                panic("pe_init_debug_command: property %s is %u bytes, command buffer only has %lu bytes remaining\n",
-                   entry_name, prop_size, ((DEBUG_COMMAND_BUFFER_SIZE - 1) - next_command_bufffer_entry) * sizeof(uintptr_t));
+                   entry_name, prop_size, ((DEBUG_COMMAND_BUFFER_SIZE - 1) - next_command_buffer_entry) * sizeof(uintptr_t));
        }
 
        // Hold the pointer in a temp variable and later assign it to command buffer, in case we panic while half-initialized
-       command_starting_index = next_command_bufffer_entry;
+       command_starting_index = next_command_buffer_entry;
 
        // convert to real virt addresses and stuff commands into debug_command_buffer
        for (; prop_size; reg_prop += 2, prop_size -= 2 * sizeof(uintptr_t)) {
@@ -420,14 +426,21 @@ pe_init_debug_command(DTEntry entryP, command_buffer_element_t **command_buffer,
                        if ((REGISTER_OFFSET(*reg_prop) + sizeof(uintptr_t)) >= reg_window_size) {
                                panic("pe_init_debug_command: Command Offset is %lx, exceeds allocated size of %x\n", REGISTER_OFFSET(*reg_prop), reg_window_size );
                        }
-                       debug_command_buffer[next_command_bufffer_entry].address = debug_reg_window + REGISTER_OFFSET(*reg_prop);
-                       debug_command_buffer[next_command_bufffer_entry].destination_cpu_selector = CPU_SELECTOR(*reg_prop);
-                       debug_command_buffer[next_command_bufffer_entry++].value = *(reg_prop + 1);
+                       debug_command_buffer[next_command_buffer_entry].address = debug_reg_window + REGISTER_OFFSET(*reg_prop);
+                       debug_command_buffer[next_command_buffer_entry].destination_cpu_selector = CPU_SELECTOR(*reg_prop);
+#if defined(__arm64__)
+                       debug_command_buffer[next_command_buffer_entry].delay_us = DELAY_US(*reg_prop);
+                       debug_command_buffer[next_command_buffer_entry].is_32bit = ((*reg_prop & REGISTER_32BIT_MASK) != 0);
+#else
+                       debug_command_buffer[next_command_buffer_entry].delay_us = 0;
+                       debug_command_buffer[next_command_buffer_entry].is_32bit = false;
+#endif
+                       debug_command_buffer[next_command_buffer_entry++].value = *(reg_prop + 1);
                }
        }
 
        // null terminate the address field of the command to end it
-       debug_command_buffer[next_command_bufffer_entry++].address = 0;
+       debug_command_buffer[next_command_buffer_entry++].address = 0;
 
        // save pointer into table for this command
        *command_buffer = &debug_command_buffer[command_starting_index];
@@ -437,18 +450,31 @@ static void
 pe_run_debug_command(command_buffer_element_t *command_buffer)
 {
 	// When both CPUs panic, one will get stuck on the lock and will be halted when the other CPU executes the debug command
-       simple_lock(&panic_trace_lock, LCK_GRP_NULL);
+       simple_lock(&panic_hook_lock, LCK_GRP_NULL);
+
        running_debug_command_on_cpu_number = cpu_number();
 
        while (command_buffer && command_buffer->address) {
                if (IS_CPU_SELECTED(running_debug_command_on_cpu_number, command_buffer->destination_cpu_selector)) {
-                       *((volatile uintptr_t*)(command_buffer->address)) = command_buffer->value;      // register = value;
+                       if (command_buffer->is_32bit) {
+                               *((volatile uint32_t*)(command_buffer->address)) = (uint32_t)(command_buffer->value);
+                       } else {
+                               *((volatile uintptr_t*)(command_buffer->address)) = command_buffer->value;      // register = value;
+                       }
+                       if (command_buffer->delay_us != 0) {
+                               uint64_t deadline;
+                               nanoseconds_to_absolutetime(command_buffer->delay_us * NSEC_PER_USEC, &deadline);
+                               deadline += ml_get_timebase();
+                               while (ml_get_timebase() < deadline) {
+                                       ;
+                               }
+                       }
                }
                command_buffer++;
        }
 
        running_debug_command_on_cpu_number = -1;
-       simple_unlock(&panic_trace_lock);
+       simple_unlock(&panic_hook_lock);
 }
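
The optional per-command delay spins on the timebase instead of sleeping,
since pe_run_debug_command() runs with panic_hook_lock held and possibly with
interrupts masked, where blocking primitives are off limits. The conversion
pattern in isolation (delay_us is the 16-bit field decoded above):

	uint64_t deadline;
	nanoseconds_to_absolutetime((uint64_t)delay_us * NSEC_PER_USEC, &deadline);
	deadline += ml_get_timebase();   // deadline is in timebase ticks, not ns
	while (ml_get_timebase() < deadline) {
		;                        // bounded busy-wait; no blocking here
	}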
 
 
@@ -470,10 +496,12 @@ PE_arm_debug_enable_trace(void)
 }
 
 static void
-PEARMDebugPanicHook(const char *str)
+PE_arm_panic_hook(const char *str __unused)
 {
        (void)str; // not used
-
+       if (bootarg_stop_clocks != 0) {
+               pe_run_debug_command(stop_clocks);
+       }
        // if panic trace is enabled
        if (bootarg_panic_trace != 0) {
                if (running_debug_command_on_cpu_number == cpu_number()) {
@@ -482,19 +510,40 @@ PEARMDebugPanicHook(const char *str)
                        return;  // allow the normal panic operation to occur.
                }
 
-               // Stop tracing to freze the buffer and return to normal panic processing.
+               // Stop tracing to freeze the buffer and return to normal panic processing.
                pe_run_debug_command(trace_halt);
        }
 }
 
-void (*PE_arm_debug_panic_hook)(const char *str) = PEARMDebugPanicHook;
+void (*PE_arm_debug_panic_hook)(const char *str) = PE_arm_panic_hook;
+
+void
+PE_init_cpu(void)
+{
+       if (bootarg_stop_clocks != 0) {
+               pe_run_debug_command(enable_stop_clocks);
+       }
+}
 
 #else
 
-void (*PE_arm_debug_panic_hook)(const char *str) = NULL;
+void(*const PE_arm_debug_panic_hook)(const char *str) = NULL;
+
+void
+PE_init_cpu(void)
+{
+}
 
 #endif  // DEVELOPMENT || DEBUG
 
+void
+PE_panic_hook(const char *str __unused)
+{
+       if (PE_arm_debug_panic_hook != NULL) {
+               PE_arm_debug_panic_hook(str);
+       }
+}
+
 void
 pe_arm_init_debug(void *args)
 {
@@ -516,7 +565,7 @@ pe_arm_init_debug(void *args)
                        // When args != NULL, this means we're being called from arm_init on the boot CPU.
                        // This controls one-time initialization of the Panic Trace infrastructure
 
-                       simple_lock_init(&panic_trace_lock, 0); //assuming single threaded mode
+                       simple_lock_init(&panic_hook_lock, 0); //assuming single threaded mode
 
                        // Panic_halt is deprecated. Please use panic_trace instead.
                        unsigned int temp_bootarg_panic_trace;
@@ -536,6 +585,12 @@ pe_arm_init_debug(void *args)
                                // start tracing now if enabled
                                PE_arm_debug_enable_trace();
                        }
+                       unsigned int temp_bootarg_stop_clocks;
+                       if (PE_parse_boot_argn("stop_clocks", &temp_bootarg_stop_clocks, sizeof(temp_bootarg_stop_clocks))) {
+                               pe_init_debug_command(entryP, &enable_stop_clocks, "enable_stop_clocks");
+                               pe_init_debug_command(entryP, &stop_clocks, "stop_clocks");
+                               bootarg_stop_clocks = temp_bootarg_stop_clocks;
+                       }
 #endif
                }
        } else {
@@ -615,11 +670,6 @@ pe_arm_init_timer(void *args)
        timer_base = gTimerBase;
        soc_phys = gSocPhys;
 
-#if defined(ARM_BOARD_CLASS_S5L8960X)
-       if (!strcmp(gPESoCDeviceType, "s5l8960x-io")) {
-               tbd_funcs = &s5l8960x_funcs;
-       } else
-#endif
 #if defined(ARM_BOARD_CLASS_T7000)
        if (!strcmp(gPESoCDeviceType, "t7000-io") ||
            !strcmp(gPESoCDeviceType, "t7001-io")) {
index 1113d5a5cd4307bf50f378ef38d287a26e577f22..fda4a4dbbe3eb3df1f2a01ced8fa3536171f602e 100644 (file)
@@ -59,6 +59,17 @@ vm_offset_t gPanicBase;
 unsigned int gPanicSize;
 struct embedded_panic_header *panic_info = NULL;
 
+#if (DEVELOPMENT || DEBUG) && defined(XNU_TARGET_OS_BRIDGE)
+/*
+ * On DEVELOPMENT bridgeOS, we map the x86 panic region
+ * so we can include this data in bridgeOS corefiles
+ */
+uint64_t macos_panic_base = 0;
+unsigned int macos_panic_size = 0;
+
+struct macos_panic_header *mac_panic_header = NULL;
+#endif
+
 /* Maximum size of panic log excluding headers, in bytes */
 static unsigned int panic_text_len;
 
@@ -83,7 +94,20 @@ check_for_panic_log(void)
        uint32_t *panic_region_length;
 
        /*
-        * Find the vram node in the device tree
+        * DT properties for the panic region are populated by UpdateDeviceTree() in iBoot:
+        *
+        * chosen {
+        *   embedded-panic-log-size = <0x00080000>;
+        *   [a bunch of other stuff]
+        * };
+        *
+        * pram {
+        *   reg = <0x00000008_fbc48000 0x00000000_000b4000>;
+        * };
+        *
+        * reg[0] is the physical address
+        * reg[1] is the size of iBoot's kMemoryRegion_Panic (not used)
+        * embedded-panic-log-size is the maximum amount of data to store in the buffer
         */
        if (kSuccess != DTLookupEntry(0, "pram", &entry)) {
                return;
@@ -101,16 +125,25 @@ check_for_panic_log(void)
                return;
        }
 
-       /*
-        * Map the first page of VRAM into the kernel for use in case of
-        * panic
-        */
-       /* Note: map as normal memory. */
        gPanicBase = ml_io_map_wcomb(reg_prop[0], panic_region_length[0]);
 
        /* Deduct the size of the panic header from the panic region size */
        panic_text_len = panic_region_length[0] - sizeof(struct embedded_panic_header);
        gPanicSize = panic_region_length[0];
+
+#if DEVELOPMENT && defined(XNU_TARGET_OS_BRIDGE)
+       if (PE_consistent_debug_enabled()) {
+               uint64_t macos_panic_physbase = 0;
+               uint64_t macos_panic_physlen = 0;
+               /* Populate the macOS panic region data if it's present in consistent debug */
+               if (PE_consistent_debug_lookup_entry(kDbgIdMacOSPanicRegion, &macos_panic_physbase, &macos_panic_physlen)) {
+                       macos_panic_base = ml_io_map_with_prot(macos_panic_physbase, macos_panic_physlen, VM_PROT_READ);
+                       mac_panic_header = (struct macos_panic_header *) ((void *) macos_panic_base);
+                       macos_panic_size = macos_panic_physlen;
+               }
+       }
+#endif /* DEVELOPMENT && defined(XNU_TARGET_OS_BRIDGE) */
+
 #endif
        panic_info = (struct embedded_panic_header *)gPanicBase;
 
@@ -476,20 +509,13 @@ PE_call_timebase_callback(void)
 /*
  * The default PE_poll_input handler.
  */
-static int
+int
 PE_stub_poll_input(__unused unsigned int options, char *c)
 {
        *c = uart_getc();
        return 0;               /* 0 for success, 1 for unsupported */
 }
 
-/*
- * Called by the kernel debugger to poll for keyboard input.
- * Keyboard drivers may replace the default stub function
- * with their polled-mode input function.
- */
-int             (*PE_poll_input) (unsigned int options, char *c) = PE_stub_poll_input;
-
 /*
  * This routine will return 1 if you are running on a device with a variant
  * of iBoot that allows debugging. This is typically not the case on production
index 5287e5c86d2abf7f75cedecca361506566d44941..fdac3ab1ad6325a9ed1f255c2eed2e90ec1272f5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  */
 /*
  * file: pe_kprintf.c
@@ -18,7 +18,9 @@ void            (*PE_kputc)(char c) = 0;
 
 SECURITY_READ_ONLY_LATE(unsigned int)    disable_serial_output = TRUE;
 
-decl_simple_lock_data(static, kprintf_lock)
+decl_simple_lock_data(static, kprintf_lock);
+
+static void serial_putc_crlf(char c);
 
 void
 PE_init_kprintf(boolean_t vm_initialized)
@@ -39,7 +41,7 @@ PE_init_kprintf(boolean_t vm_initialized)
                }
 
                if (serial_init()) {
-                       PE_kputc = serial_putc;
+                       PE_kputc = serial_putc_crlf;
                } else {
                        PE_kputc = cnputc;
                }
@@ -131,6 +133,15 @@ kprintf(const char *fmt, ...)
        }
 }
 
+static void
+serial_putc_crlf(char c)
+{
+       if (c == '\n') {
+               uart_putc('\r');
+       }
+       uart_putc(c);
+}
+
 void
 serial_putc(char c)
 {
index 3e70e3f2dae6c0c22785f9853a7890e6e56a40be..0d8ffefe2e2833ae1014084cc27c647137966716 100644 (file)
@@ -10,6 +10,7 @@
 #include <kern/clock.h>
 #include <kern/debug.h>
 #include <libkern/OSBase.h>
+#include <libkern/section_keywords.h>
 #include <mach/mach_time.h>
 #include <machine/atomic.h>
 #include <machine/machine_routines.h>
@@ -35,13 +36,13 @@ struct pe_serial_functions {
        void            (*td0) (int c);
        int             (*rr0) (void);
        int             (*rd0) (void);
+       struct pe_serial_functions *next;
 };
 
-static struct pe_serial_functions *gPESF;
+SECURITY_READ_ONLY_LATE(static struct pe_serial_functions*) gPESF = NULL;
 
-static int      uart_initted = 0;       /* 1 if init'ed */
-
-static vm_offset_t      uart_base;
+static int         uart_initted = 0;    /* 1 if init'ed */
+static vm_offset_t uart_base = 0;
 
 /*****************************************************************************/
 
@@ -51,6 +52,8 @@ static int32_t dt_pclk      = -1;
 static int32_t dt_sampling  = -1;
 static int32_t dt_ubrdiv    = -1;
 
+static void ln2410_uart_set_baud_rate(__unused int unit, uint32_t baud_rate);
+
 static void
 ln2410_uart_init(void)
 {
@@ -66,7 +69,7 @@ ln2410_uart_init(void)
        rUCON0 = ucon0;
        rUMCON0 = 0x00;         /* Clear Flow Control */
 
-       gPESF->uart_set_baud_rate(0, 115200);
+       ln2410_uart_set_baud_rate(0, 115200);
 
        rUFCON0 = 0x03;         /* Clear & Enable FIFOs */
        rUMCON0 = 0x01;         /* Assert RTS on UART0 */
@@ -137,15 +140,24 @@ ln2410_rd0(void)
        return (int)rURXH0;
 }
 
-static struct pe_serial_functions ln2410_serial_functions = {
-       ln2410_uart_init, ln2410_uart_set_baud_rate,
-       ln2410_tr0, ln2410_td0, ln2410_rr0, ln2410_rd0
+SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) ln2410_serial_functions =
+{
+       .uart_init = ln2410_uart_init,
+       .uart_set_baud_rate = ln2410_uart_set_baud_rate,
+       .tr0 = ln2410_tr0,
+       .td0 = ln2410_td0,
+       .rr0 = ln2410_rr0,
+       .rd0 = ln2410_rd0
 };
 
 #endif  /* S3CUART */
 
 /*****************************************************************************/
 
+static void
+dcc_uart_init(void)
+{
+}
 
 static unsigned int
 read_dtr(void)
@@ -213,9 +225,14 @@ dcc_rd0(void)
        return read_dtr();
 }
 
-static struct pe_serial_functions dcc_serial_functions = {
-       NULL, NULL,
-       dcc_tr0, dcc_td0, dcc_rr0, dcc_rd0
+SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) dcc_serial_functions =
+{
+       .uart_init = dcc_uart_init,
+       .uart_set_baud_rate = NULL,
+       .tr0 = dcc_tr0,
+       .td0 = dcc_td0,
+       .rr0 = dcc_rr0,
+       .rd0 = dcc_rd0
 };
 
 /*****************************************************************************/
@@ -465,7 +482,7 @@ validation_failure:
        PE_consistent_debug_register(kDbgIdConsoleHeaderAP, pa_panic_base, panic_size);
 }
 
-static struct pe_serial_functions shmcon_serial_functions =
+SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) shmcon_serial_functions =
 {
        .uart_init = shmcon_init,
        .uart_set_baud_rate = shmcon_set_baud_rate,
@@ -505,6 +522,7 @@ static uint64_t prev_dockfifo_spaces;       // Previous w_stat level of the Dock
 static uint32_t dockfifo_capacity;
 static uint64_t dockfifo_stall_grace;
 
+static vm_offset_t dockfifo_uart_base = 0;
 
 //=======================
 // Local functions
@@ -521,7 +539,7 @@ dockfifo_drain_on_stall()
                // It's been more than DOCKFIFO_WR_MAX_STALL_US and nobody read from the FIFO
                // Drop a character.
                (void)rDOCKFIFO_R_DATA(DOCKFIFO_UART_READ, 1);
-               prev_dockfifo_spaces++;
+               os_atomic_inc(&prev_dockfifo_spaces, relaxed);
                return 1;
        }
        return 0;
@@ -548,7 +566,7 @@ static void
 dockfifo_uart_td0(int c)
 {
        rDOCKFIFO_W_DATA(DOCKFIFO_UART_WRITE, 1) = (unsigned)(c & 0xff);
-       prev_dockfifo_spaces--; // After writing a byte we have one fewer space than previously expected.
+       os_atomic_dec(&prev_dockfifo_spaces, relaxed); // After writing a byte we have one fewer space than previously expected.
 }
 
 static int
@@ -578,7 +596,7 @@ dockfifo_uart_init(void)
        dockfifo_capacity = rDOCKFIFO_W_STAT(DOCKFIFO_UART_WRITE) & 0xffff;
 }
 
-static struct pe_serial_functions dockfifo_uart_serial_functions =
+SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) dockfifo_uart_serial_functions =
 {
        .uart_init = dockfifo_uart_init,
        .uart_set_baud_rate = NULL,
@@ -601,6 +619,7 @@ static bool             use_sw_drain;
 static uint64_t         prev_dockchannel_drained_time;  // Last time we've seen the DockChannel drained by an external agent
 static uint64_t         prev_dockchannel_spaces;        // Previous w_stat level of the DockChannel.
 static uint64_t         dockchannel_stall_grace;
+static vm_offset_t      dockchannel_uart_base = 0;
 
 //=======================
 // Local functions
@@ -617,7 +636,7 @@ dockchannel_drain_on_stall()
                // It's been more than DOCKCHANNEL_WR_MAX_STALL_US and nobody read from the FIFO
                // Drop a character.
                (void)rDOCKCHANNELS_DEV_RDATA1(DOCKCHANNEL_UART_CHANNEL);
-               prev_dockchannel_spaces++;
+               os_atomic_inc(&prev_dockchannel_spaces, relaxed);
                return 1;
        }
        return 0;
@@ -648,7 +667,7 @@ dockchannel_uart_td0(int c)
 {
        rDOCKCHANNELS_DEV_WDATA1(DOCKCHANNEL_UART_CHANNEL) = (unsigned)(c & 0xff);
        if (use_sw_drain) {
-               prev_dockchannel_spaces--; // After writing a byte we have one fewer space than previously expected.
+               os_atomic_dec(&prev_dockchannel_spaces, relaxed); // After writing a byte we have one fewer space than previously expected.
        }
 }
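
Both FIFO space counters (dockfifo and dockchannel) are now maintained with
relaxed atomics, since the stall-drain path and the transmit path can run
concurrently and a plain ++/-- compiles to a non-atomic read-modify-write that
can lose updates. The pattern, in isolation:

	os_atomic_inc(&prev_dockchannel_spaces, relaxed); // drain dropped a byte
	os_atomic_dec(&prev_dockchannel_spaces, relaxed); // td0 queued a byte

Relaxed ordering is sufficient here: the counter is a heuristic for whether
anyone has drained the FIFO recently, not a synchronization point.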
 
@@ -664,6 +683,15 @@ dockchannel_uart_rd0(void)
        return (int)((rDOCKCHANNELS_DEV_RDATA1(DOCKCHANNEL_UART_CHANNEL) >> 8) & 0xff);
 }
 
+static void
+dockchannel_uart_clear_intr(void)
+{
+       rDOCKCHANNELS_AGENT_AP_INTR_CTRL &= ~(0x3);
+       rDOCKCHANNELS_AGENT_AP_INTR_STATUS |= 0x3;
+       rDOCKCHANNELS_AGENT_AP_ERR_INTR_CTRL &= ~(0x3);
+       rDOCKCHANNELS_AGENT_AP_ERR_INTR_STATUS |= 0x3;
+}
+
 static void
 dockchannel_uart_init(void)
 {
@@ -672,10 +700,7 @@ dockchannel_uart_init(void)
        }
 
        // Clear all interrupt enable and status bits
-       rDOCKCHANNELS_AGENT_AP_INTR_CTRL &= ~(0x3);
-       rDOCKCHANNELS_AGENT_AP_INTR_STATUS |= 0x3;
-       rDOCKCHANNELS_AGENT_AP_ERR_INTR_CTRL &= ~(0x3);
-       rDOCKCHANNELS_AGENT_AP_ERR_INTR_STATUS |= 0x3;
+       dockchannel_uart_clear_intr();
 
        // Setup DRAIN timer
        rDOCKCHANNELS_DEV_DRAIN_CFG(DOCKCHANNEL_UART_CHANNEL) = max_dockchannel_drain_period;
@@ -685,7 +710,7 @@ dockchannel_uart_init(void)
        rDOCKCHANNELS_DOCK_RDATA1(DOCKCHANNEL_UART_CHANNEL);
 }
 
-static struct pe_serial_functions dockchannel_uart_serial_functions =
+SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) dockchannel_uart_serial_functions =
 {
        .uart_init = dockchannel_uart_init,
        .uart_set_baud_rate = NULL,
@@ -699,8 +724,8 @@ static struct pe_serial_functions dockchannel_uart_serial_functions =
 
 /****************************************************************************/
 #ifdef  PI3_UART
-vm_offset_t pi3_gpio_base_vaddr;
-vm_offset_t pi3_aux_base_vaddr;
+vm_offset_t pi3_gpio_base_vaddr = 0;
+vm_offset_t pi3_aux_base_vaddr = 0;
 static int
 pi3_uart_tr0(void)
 {
@@ -775,7 +800,7 @@ pi3_uart_init(void)
        BCM2837_PUT32(BCM2837_AUX_MU_CNTL_REG_V, 3);
 }
 
-static struct pe_serial_functions pi3_uart_serial_functions =
+SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) pi3_uart_serial_functions =
 {
        .uart_init = pi3_uart_init,
        .uart_set_baud_rate = NULL,
@@ -787,111 +812,104 @@ static struct pe_serial_functions pi3_uart_serial_functions =
 
 #endif /* PI3_UART */
 /*****************************************************************************/
+
+static void
+register_serial_functions(struct pe_serial_functions *fns)
+{
+       fns->next = gPESF;
+       gPESF = fns;
+}
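+
register_serial_functions() replaces the single gPESF pointer with a
head-insertion linked list, so serial_init() can now stack several consoles
(dcc, shmcon, dockchannel, a SoC UART) instead of picking exactly one and
returning early. Registration order sketch, assuming both the dcc and jcon
boot-args are set:

	register_serial_functions(&dcc_serial_functions);    // gPESF: dcc
	register_serial_functions(&shmcon_serial_functions); // gPESF: shmcon -> dcc

uart_putc() then writes each character to every registered console in list
order (most recently registered first), and uart_getc() returns input from the
first registered console that has data pending.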
+
 int
 serial_init(void)
 {
        DTEntry         entryP = NULL;
-       uint32_t        prop_size, dccmode;
+       uint32_t        prop_size;
        vm_offset_t     soc_base;
        uintptr_t       *reg_prop;
-       uint32_t        *prop_value = NULL;
-       char            *serial_compat = 0;
-#ifdef SHMCON
-       uint32_t        jconmode;
-#endif
-#ifdef DOCKFIFO_UART
-       uint32_t        no_dockfifo_uart;
-#endif
-#ifdef DOCKCHANNEL_UART
-       uint32_t        no_dockchannel_uart;
-#endif
-#ifdef PI3_UART
-       uint32_t        is_pi3;
-#endif
+       uint32_t        *prop_value __unused = NULL;
+       char            *serial_compat __unused = 0;
+       uint32_t        dccmode;
 
-       if (uart_initted && gPESF) {
-               gPESF->uart_init();
+       struct pe_serial_functions *fns = gPESF;
+
+       if (uart_initted) {
+               while (fns != NULL) {
+                       fns->uart_init();
+                       fns = fns->next;
+               }
                kprintf("reinit serial\n");
                return 1;
        }
 
        dccmode = 0;
        if (PE_parse_boot_argn("dcc", &dccmode, sizeof(dccmode))) {
-               gPESF = &dcc_serial_functions;
-               uart_initted = 1;
-               return 1;
+               register_serial_functions(&dcc_serial_functions);
        }
 #ifdef SHMCON
-       jconmode = 0;
+       uint32_t jconmode = 0;
        if (PE_parse_boot_argn("jcon", &jconmode, sizeof jconmode)) {
-               gPESF = &shmcon_serial_functions;
-               gPESF->uart_init();
-               uart_initted = 1;
-               return 1;
+               register_serial_functions(&shmcon_serial_functions);
        }
 #endif /* SHMCON */
 
-#ifdef PI3_UART
-#pragma unused(prop_value)
-       is_pi3 = 0;
-       if (PE_parse_boot_argn("-pi3", &is_pi3, sizeof(is_pi3))) { // FIXME: remove the not operator after boot args are set up.
-               pi3_gpio_base_vaddr = ml_io_map((vm_offset_t)BCM2837_GPIO_BASE, BCM2837_GPIO_SIZE);
-               pi3_aux_base_vaddr = ml_io_map((vm_offset_t)BCM2837_AUX_BASE, BCM2837_AUX_SIZE);
-               gPESF = &pi3_uart_serial_functions;
-               gPESF->uart_init();
-               uart_initted = 1;
-               return 1;
-       }
-#endif /* PI3_UART */
-
        soc_base = pe_arm_get_soc_base_phys();
 
        if (soc_base == 0) {
                return 0;
        }
 
+#ifdef PI3_UART
+       if (DTFindEntry("name", "gpio", &entryP) == kSuccess) {
+               DTGetProperty(entryP, "reg", (void **)&reg_prop, &prop_size);
+               pi3_gpio_base_vaddr = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1));
+       }
+       if (DTFindEntry("name", "aux", &entryP) == kSuccess) {
+               DTGetProperty(entryP, "reg", (void **)&reg_prop, &prop_size);
+               pi3_aux_base_vaddr = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1));
+       }
+       if ((pi3_gpio_base_vaddr != 0) && (pi3_aux_base_vaddr != 0)) {
+               register_serial_functions(&pi3_uart_serial_functions);
+       }
+#endif /* PI3_UART */
+
 #ifdef DOCKFIFO_UART
-       no_dockfifo_uart = 0;
+       uint32_t no_dockfifo_uart = 0;
        PE_parse_boot_argn("no-dockfifo-uart", &no_dockfifo_uart, sizeof(no_dockfifo_uart));
        if (no_dockfifo_uart == 0) {
                if (DTFindEntry("name", "dockfifo-uart", &entryP) == kSuccess) {
                        DTGetProperty(entryP, "reg", (void **)&reg_prop, &prop_size);
-                       uart_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1));
-               } else {
-                       return 0;
+                       dockfifo_uart_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1));
+                       register_serial_functions(&dockfifo_uart_serial_functions);
                }
-               gPESF = &dockfifo_uart_serial_functions;
-               gPESF->uart_init();
-               uart_initted = 1;
-               return 1;
        }
 #endif /* DOCKFIFO_UART */
 
 #ifdef DOCKCHANNEL_UART
-       no_dockchannel_uart = 0;
-       // Keep the old name for boot-arg
-       PE_parse_boot_argn("no-dockfifo-uart", &no_dockchannel_uart, sizeof(no_dockchannel_uart));
-       if (no_dockchannel_uart == 0) {
-               if (DTFindEntry("name", "dockchannel-uart", &entryP) == kSuccess) {
-                       DTGetProperty(entryP, "reg", (void **)&reg_prop, &prop_size);
-                       // Should be two reg entries
-                       if (prop_size / sizeof(uintptr_t) != 4) {
-                               panic("Malformed dockchannel-uart property");
-                       }
-                       uart_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1));
-                       dock_agent_base = ml_io_map(soc_base + *(reg_prop + 2), *(reg_prop + 3));
-                       gPESF = &dockchannel_uart_serial_functions;
+       uint32_t no_dockchannel_uart = 0;
+       if (DTFindEntry("name", "dockchannel-uart", &entryP) == kSuccess) {
+               DTGetProperty(entryP, "reg", (void **)&reg_prop, &prop_size);
+               // Should be two reg entries
+               if (prop_size / sizeof(uintptr_t) != 4) {
+                       panic("Malformed dockchannel-uart property");
+               }
+               dockchannel_uart_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1));
+               dock_agent_base = ml_io_map(soc_base + *(reg_prop + 2), *(reg_prop + 3));
+               PE_parse_boot_argn("no-dockfifo-uart", &no_dockchannel_uart, sizeof(no_dockchannel_uart));
+               // Keep the old name for boot-arg
+               if (no_dockchannel_uart == 0) {
+                       register_serial_functions(&dockchannel_uart_serial_functions);
                        DTGetProperty(entryP, "max-aop-clk", (void **)&prop_value, &prop_size);
                        max_dockchannel_drain_period = (uint32_t)((prop_value)?  (*prop_value * 0.03) : DOCKCHANNEL_DRAIN_PERIOD);
                        DTGetProperty(entryP, "enable-sw-drain", (void **)&prop_value, &prop_size);
                        use_sw_drain = (prop_value)?  *prop_value : 0;
-                       gPESF->uart_init();
-                       uart_initted = 1;
-                       return 1;
+               } else {
+                       dockchannel_uart_clear_intr();
                }
                // If no dockchannel-uart is found in the device tree, fall back
                // to looking for the traditional UART serial console.
        }
+
 #endif /* DOCKCHANNEL_UART */
 
        /*
@@ -938,24 +956,25 @@ serial_init(void)
                }
        }
        if (!strcmp(serial_compat, "uart,16550")) {
-               gPESF = &ln2410_serial_functions;
+               register_serial_functions(&ln2410_serial_functions);
        } else if (!strcmp(serial_compat, "uart-16550")) {
-               gPESF = &ln2410_serial_functions;
+               register_serial_functions(&ln2410_serial_functions);
        } else if (!strcmp(serial_compat, "uart,s5i3000")) {
-               gPESF = &ln2410_serial_functions;
+               register_serial_functions(&ln2410_serial_functions);
        } else if (!strcmp(serial_compat, "uart-1,samsung")) {
-               gPESF = &ln2410_serial_functions;
+               register_serial_functions(&ln2410_serial_functions);
        }
-#elif   defined (ARM_BOARD_CONFIG_MV88F6710)
-       if (!strcmp(serial_compat, "uart16x50,mmio")) {
-               gPESF = &uart16x50_serial_functions;
-       }
-#endif
-       else {
+#endif /* S3CUART */
+
+       if (gPESF == NULL) {
                return 0;
        }
 
-       gPESF->uart_init();
+       fns = gPESF;
+       while (fns != NULL) {
+               fns->uart_init();
+               fns = fns->next;
+       }
 
        uart_initted = 1;
 
@@ -965,22 +984,25 @@ serial_init(void)
 void
 uart_putc(char c)
 {
-       if (uart_initted) {
-               while (!gPESF->tr0()) {
+       struct pe_serial_functions *fns = gPESF;
+       while (fns != NULL) {
+               while (!fns->tr0()) {
                        ;               /* Wait until THR is empty. */
                }
-               gPESF->td0(c);
+               fns->td0(c);
+               fns = fns->next;
        }
 }
 
 int
 uart_getc(void)
 {                               /* returns -1 if no data available */
-       if (uart_initted) {
-               if (!gPESF->rr0()) {
-                       return -1;      /* Receive data read */
+       struct pe_serial_functions *fns = gPESF;
+       while (fns != NULL) {
+               if (fns->rr0()) {
+                       return fns->rd0();
                }
-               return gPESF->rd0();
+               fns = fns->next;
        }
        return -1;
 }
index 7bd79d9ae78b20528784ccb5d5684d4a6bcb579f..05c4b79cf0cdde381cf245f419ccc25af4fec458 100644 (file)
@@ -23,7 +23,7 @@ endif
 
 $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS)
        $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG)
-       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG);
+       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG)
 
 do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
        $(_v)${MAKE} \
@@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
                SOURCE=$(subst conf/,,$(SOURCE))                        \
                TARGET=${TARGET}                                        \
                OBJPATH=${OBJPATH}                                      \
-               build_all;
+               build_all
 
 do_build_all:: do_all
 
index b9962d602e2877badc595cd0a1299dd0c9865d9d..b5357650d14543d2ec50ff217f7ee2e9e37393b1 100644 (file)
@@ -66,9 +66,9 @@ $(SOBJS): .SFLAGS
        $(_v)$(REPLACECONTENTS) $@ $(S_KCC) $(SFLAGS) $(INCFLAGS)
 
 $(COMPONENT).filelist: $(OBJS)
-       @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)"
+       $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0))
        $(_v)for obj in ${OBJS}; do     \
-                echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
+                $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
        done > $(COMPONENT).filelist
 
 do_all: $(COMPONENT).filelist
index b6b86ef4aa8528a3cb1811b518ff3924def18d16..f687f7c30f9643c6e3e89f7374a9f9889f3b6cc3 100644 (file)
@@ -1,5 +1,3 @@
-OPTIONS/gprof                    optional gprof
-
 pexpert/arm/pe_bootargs.c            standard
 pexpert/arm/pe_identify_machine.c    standard
 pexpert/arm/pe_init.c                standard
index aada62e1e7e845b649120e2f7311fa422dbcb6fa..b2ab3b4d4a96d2853a18e1c6b2fc8a2c0aad7c16 100644 (file)
@@ -1,5 +1,3 @@
-OPTIONS/gprof                    optional gprof
-
 pexpert/arm/pe_bootargs.c              standard
 pexpert/arm/pe_consistent_debug.c      standard
 pexpert/arm/pe_identify_machine.c      standard
index 0ba9ffc1892614388607b843dd572f41ad020c72..d0c246e7c7082cf14029e70df037b017c5ff0cb0 100644 (file)
@@ -1,5 +1,3 @@
-OPTIONS/gprof                    optional gprof
-
 pexpert/i386/pe_init.c                standard
 pexpert/i386/pe_bootargs.c            standard
 pexpert/i386/pe_identify_machine.c    standard
index 5bf70059cf5f11a22b5170188c859efdbc0fbf79..fddac8d3e1d67497f2b4acf51354db4cd5f9c015 100644 (file)
@@ -51,23 +51,6 @@ struct i24 {
 #define NUM     0
 #define STR     1
 
-#if !defined(__LP64__) && !defined(__arm__)
-boolean_t
-PE_parse_boot_arg(
-       const char  *arg_string,
-       void            *arg_ptr)
-{
-       int max_len = -1;
-
-#if CONFIG_EMBEDDED
-       /* Limit arg size to 4 byte when no size is given */
-       max_len = 4;
-#endif
-
-       return PE_parse_boot_argn(arg_string, arg_ptr, max_len);
-}
-#endif
-
 static boolean_t
 PE_parse_boot_argn_internal(
        const char *arg_string,
@@ -393,7 +376,16 @@ getval(
 boolean_t
 PE_imgsrc_mount_supported()
 {
+#if CONFIG_LOCKERBOOT
+       /*
+        * Booting from a locker requires that we be able to mount the containing
+        * volume inside the locker. This looks redundant, but this is here in case
+        * the other conditional needs to be modified for some reason.
+        */
+       return TRUE;
+#else
        return TRUE;
+#endif
 }
 
 boolean_t
index 71e26b08fdcfd8c032c95b8c451a2e617605cb97..7d8d0bdcbb9b119e1638cef2f18836d5b9a7f499 100644 (file)
@@ -71,3 +71,8 @@ pe_identify_machine(__unused boot_args *args)
        gPEClockFrequencyInfo.bus_to_dec_rate_den =
            gPEClockFrequencyInfo.bus_clock_rate_hz / gPEClockFrequencyInfo.dec_clock_rate_hz;
 }
+
+void
+PE_panic_hook(const char *str __unused)
+{
+}
index c2debbbd182c5bddc22ae87886ce161932083cb2..4892e95c7f19dadfb41efff55ff6c56bf61ff316 100644 (file)
@@ -336,21 +336,13 @@ PE_call_timebase_callback(void)
 /*
  * The default (non-functional) PE_poll_input handler.
  */
-static int
+int
 PE_stub_poll_input(__unused unsigned int options, char * c)
 {
        *c = 0xff;
        return 1; /* 0 for success, 1 for unsupported */
 }
 
-/*
- * Called by the kernel debugger to poll for keyboard input.
- * Keyboard drivers may replace the default stub function
- * with their polled-mode input function.
- */
-int (*PE_poll_input)(unsigned int options, char * c)
-        = PE_stub_poll_input;
-
 boolean_t
 PE_reboot_on_panic(void)
 {
index 63e10d9f8e92d7671ce8a0e5a2facd7a65a48181..ce2ff230e865e80d1c9593efbabf52362faabed0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <i386/proc_reg.h>
 #include <os/log_private.h>
 #include <libkern/section_keywords.h>
+#include <kern/processor.h>
+#include <kern/clock.h>
+#include <mach/clock_types.h>
+
+extern uint64_t LockTimeOut;
+extern processor_t      current_processor(void);
 
 /* Globals */
 void (*PE_kputc)(char c);
@@ -53,7 +59,7 @@ SECURITY_READ_ONLY_LATE(unsigned int) disable_serial_output = FALSE;
 SECURITY_READ_ONLY_LATE(unsigned int) disable_serial_output = TRUE;
 #endif
 
-decl_simple_lock_data(static, kprintf_lock)
+decl_simple_lock_data(static, kprintf_lock);
 
 void
 PE_init_kprintf(boolean_t vm_initialized)
@@ -110,6 +116,9 @@ _kprintf(const char *format, ...)
 
 static int cpu_last_locked = 0;
 
+#define KPRINTF_LOCKWAIT_PATIENT (LockTimeOut)
+#define KPRINTF_LOCKWAIT_IMPATIENT (LockTimeOut >> 4)
+
 __attribute__((noinline, not_tail_called))
 void
 kprintf(const char *fmt, ...)
@@ -117,6 +126,8 @@ kprintf(const char *fmt, ...)
        va_list    listp;
        va_list    listp2;
        boolean_t  state;
+       boolean_t  in_panic_context = FALSE;
+       unsigned int kprintf_lock_grabbed;
        void      *caller = __builtin_return_address(0);
 
        if (!disable_serial_output) {
@@ -142,17 +153,16 @@ kprintf(const char *fmt, ...)
                        return;
                }
 
-               /*
-                * Spin to get kprintf lock but poll for incoming signals
-                * while interrupts are masked.
-                */
                state = ml_set_interrupts_enabled(FALSE);
 
                pal_preemption_assert();
 
-               while (!simple_lock_try(&kprintf_lock, LCK_GRP_NULL)) {
-                       (void) cpu_signal_handler(NULL);
-               }
+               in_panic_context = processor_in_panic_context(current_processor());
+
+               // If current CPU is in panic context, be a little more impatient.
+               kprintf_lock_grabbed = simple_lock_try_lock_mp_signal_safe_loop_duration(&kprintf_lock,
+                   in_panic_context ? KPRINTF_LOCKWAIT_IMPATIENT : KPRINTF_LOCKWAIT_PATIENT,
+                   LCK_GRP_NULL);
 
                if (cpu_number() != cpu_last_locked) {
                        MP_DEBUG_KPRINTF("[cpu%d...]\n", cpu_number());
@@ -164,7 +174,10 @@ kprintf(const char *fmt, ...)
                _doprnt(fmt, &listp, PE_kputc, 16);
                va_end(listp);
 
-               simple_unlock(&kprintf_lock);
+               if (kprintf_lock_grabbed) {
+                       simple_unlock(&kprintf_lock);
+               }
+
                ml_set_interrupts_enabled(state);
 
                // If interrupts are enabled
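
The lock acquisition change bounds how long kprintf() will wait for a CPU that
died holding kprintf_lock, and waits less patiently when the caller itself is
in panic context. The key consequence is the conditional unlock:

	kprintf_lock_grabbed = simple_lock_try_lock_mp_signal_safe_loop_duration(
	    &kprintf_lock,
	    in_panic_context ? KPRINTF_LOCKWAIT_IMPATIENT : KPRINTF_LOCKWAIT_PATIENT,
	    LCK_GRP_NULL);
	/* On timeout we print anyway (interleaved output beats a hung panic
	 * path), so the lock is released only if it was actually taken. */
	if (kprintf_lock_grabbed) {
		simple_unlock(&kprintf_lock);
	}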
index d5b46d1a95f83d0c37ec78403883bab1e2bd0487..86386e4628d06ee434d9f71f1ae5daa79e0cb220 100644 (file)
@@ -11,6 +11,7 @@ DATAFILES = \
        board_config.h \
        boot.h \
        consistent_debug.h \
+       dockchannel.h \
        PL192_VIC.h \
        protos.h \
        S3cUART.h \
index a39829ba3225b2db9a68d0fc3bf5a2b7ede9e163..6c6d2e07cafbc421c2916adbf67f344e3613891f 100644 (file)
 #define DOCKFIFO_W_SPACING              (0x1000)
 #define DOCKFIFO_SPACING                (0x3000)
 
-#define rDOCKFIFO_R_DATA(_f, _n)        (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + ((_n) * 4)))
-#define rDOCKFIFO_R_STAT(_f)            (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + 0x14))
-#define rDOCKFIFO_W_DATA(_f, _n)        (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + DOCKFIFO_W_SPACING + ((_n) * 4)))
-#define rDOCKFIFO_W_STAT(_f)            (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + DOCKFIFO_W_SPACING + 0x14))
-#define rDOCKFIFO_CNFG(_f)              (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2000))
-#define rDOCKFIFO_DRAIN(_f)             (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2004))
-#define rDOCKFIFO_INTMASK(_f)           (*(volatile uint32_t *)(uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2008))
+#define rDOCKFIFO_R_DATA(_f, _n)        (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + ((_n) * 4)))
+#define rDOCKFIFO_R_STAT(_f)            (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + 0x14))
+#define rDOCKFIFO_W_DATA(_f, _n)        (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + DOCKFIFO_W_SPACING + ((_n) * 4)))
+#define rDOCKFIFO_W_STAT(_f)            (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + DOCKFIFO_W_SPACING + 0x14))
+#define rDOCKFIFO_CNFG(_f)              (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2000))
+#define rDOCKFIFO_DRAIN(_f)             (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2004))
+#define rDOCKFIFO_INTMASK(_f)           (*(volatile uint32_t *)(dockfifo_uart_base + ((_f) * DOCKFIFO_SPACING) + 0x2008))
 
 #endif
 
index 9f90baead955e5bd4db41ae561a61b59e08a07de..e231d30be1205d03fdd5d96e77157a0a18e416d3 100644 (file)
 
 #include <pexpert/arm/S3cUART.h>
 
-#define rPMGR_EVENT_TMR                         (*(volatile uint32_t *) (timer_base + 0x00000))
-#define rPMGR_EVENT_TMR_PERIOD                  (*(volatile uint32_t *) (timer_base + 0x00004))
-#define rPMGR_EVENT_TMR_CTL                     (*(volatile uint32_t *) (timer_base + 0x00008))
-
-#define PMGR_EVENT_TMR_CTL_EN                   (1 << 0)
-
-#define DOCKCHANNEL_UART                        (1)
-#define DOCKCHANNEL_STRIDE                      (0x10000)
-
-// Channel index
-#define DOCKCHANNEL_UART_CHANNEL                (0)
+#include <pexpert/arm/dockchannel.h>
 
 // AOP_CLOCK frequency * 30 ms
 #define DOCKCHANNEL_DRAIN_PERIOD                (96000000 * 0.03)
 
-#define rDOCKCHANNELS_AGENT_AP_INTR_CTRL        (*(volatile uint32_t *) (dock_agent_base + 0x00))
-#define rDOCKCHANNELS_AGENT_AP_INTR_STATUS      (*(volatile uint32_t *) (dock_agent_base + 0x04))
-#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_CTRL    (*(volatile uint32_t *) (dock_agent_base + 0x08))
-#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_STATUS  (*(volatile uint32_t *) (dock_agent_base + 0x0c))
-
-#define rDOCKCHANNELS_DEV_DRAIN_CFG(_ch)        (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x0008))
+#define rPMGR_EVENT_TMR                         (*(volatile uint32_t *) (timer_base + 0x00000))
+#define rPMGR_EVENT_TMR_PERIOD                  (*(volatile uint32_t *) (timer_base + 0x00004))
+#define rPMGR_EVENT_TMR_CTL                     (*(volatile uint32_t *) (timer_base + 0x00008))
 
-#define rDOCKCHANNELS_DEV_WDATA1(_ch)           (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4004))
-#define rDOCKCHANNELS_DEV_WSTAT(_ch)            (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4014))
-#define rDOCKCHANNELS_DEV_RDATA0(_ch)           (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4018))
-#define rDOCKCHANNELS_DEV_RDATA1(_ch)           (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x401c))
+#define PMGR_EVENT_TMR_CTL_EN                   (1 << 0)
 
-#define rDOCKCHANNELS_DOCK_RDATA1(_ch)          (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc01c))
-#define rDOCKCHANNELS_DOCK_RDATA3(_ch)          (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc024))
 #endif
 
 #endif /* ! _PEXPERT_ARM_T8002_H */
index ceb3175947442eeb1000d98c7c61ee4b06e4b15b..62549b8687ef221061914feb25ea07715458f16f 100644 (file)
@@ -69,6 +69,8 @@ typedef enum {
 #define kDbgIdAstrisConnection          DEBUG_RECORD_ID_LONG('A','S','T','R','C','N','X','N')
 #define kDbgIdAstrisConnectionVers      DEBUG_RECORD_ID_LONG('A','S','T','R','C','V','E','R')
 
+#define kDbgIdMacOSPanicRegion          DEBUG_RECORD_ID_LONG('M','A','C','P','A','N','I','C')
+
 #define kDbgIdUnusedEntry       0x0ULL
 #define kDbgIdReservedEntry     DEBUG_RECORD_ID_LONG('R','E','S','E','R','V','E', 'D')
 #define kDbgIdFreeReqEntry      DEBUG_RECORD_ID_LONG('F','R','E','E','-','R','E','Q')
@@ -126,6 +128,12 @@ int PE_consistent_debug_inherit(void);
  */
 int PE_consistent_debug_register(uint64_t record_id, uint64_t physaddr, uint64_t length);
 
+/*
+ * Look up an existing entry in the consistent debug structure and populate the attributes
+ * if it exists.
+ */
+boolean_t PE_consistent_debug_lookup_entry(uint64_t record_id, uint64_t *phys_addr, uint64_t *length);
+
 /*
  * Returns whether consistent debug is enabled on the current device.
  */
diff --git a/pexpert/pexpert/arm/dockchannel.h b/pexpert/pexpert/arm/dockchannel.h
new file mode 100644 (file)
index 0000000..0d012dd
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _PEXPERT_ARM_DOCKCHANNEL_H
+#define _PEXPERT_ARM_DOCKCHANNEL_H
+
+#define DOCKCHANNEL_UART                        (1)
+#define DOCKCHANNEL_STRIDE                      (0x10000)
+
+// Channel index
+#define DOCKCHANNEL_UART_CHANNEL                (0)
+
+#define rDOCKCHANNELS_AGENT_AP_INTR_CTRL        (*(volatile uint32_t *) (dock_agent_base + 0x00))
+#define rDOCKCHANNELS_AGENT_AP_INTR_STATUS      (*(volatile uint32_t *) (dock_agent_base + 0x04))
+#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_CTRL    (*(volatile uint32_t *) (dock_agent_base + 0x08))
+#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_STATUS  (*(volatile uint32_t *) (dock_agent_base + 0x0c))
+
+#define rDOCKCHANNELS_DEV_DRAIN_CFG(_ch)        (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x0008))
+
+#define rDOCKCHANNELS_DEV_WDATA1(_ch)           (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4004))
+#define rDOCKCHANNELS_DEV_WSTAT(_ch)            (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4014))
+#define rDOCKCHANNELS_DEV_RDATA0(_ch)           (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4018))
+#define rDOCKCHANNELS_DEV_RDATA1(_ch)           (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x401c))
+
+#define rDOCKCHANNELS_DOCK_RDATA1(_ch)          (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc01c))
+#define rDOCKCHANNELS_DOCK_RDATA3(_ch)          (*(volatile uint32_t *) (dockchannel_uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc024))
+
+#endif  /* !_PEXPERT_ARM_DOCKCHANNEL_H */
index 59148fd86f5a4ac7ddafada29869ecfdd19f8cbe..cc3a2147fc0d85b5317b757c40372647cc7b71f8 100644 (file)
 
 #define PI3_BREAK                               asm volatile("brk #0");
 
-#define BCM2837_GPIO_BASE       0x3F200000
-#define BCM2837_GPIO_SIZE       0xA0
-#define BCM2837_GPFSEL0         0x3F200000
-#define BCM2837_GPSET0          0x3F20001C
-#define BCM2837_GPCLR0          0x3F200028
-#define BCM2837_GPPUD           0x3F200094
-#define BCM2837_GPPUDCLK0       0x3F200098
-
-#define BCM2837_AUX_BASE        0x3F215000
-#define BCM2837_AUX_SIZE        0x70
-#define BCM2837_AUX_ENABLES     0x3F215004
-#define BCM2837_AUX_MU_IO_REG   0x3F215040
-#define BCM2837_AUX_MU_IER_REG  0x3F215044
-#define BCM2837_AUX_MU_IIR_REG  0x3F215048
-#define BCM2837_AUX_MU_LCR_REG  0x3F21504C
-#define BCM2837_AUX_MU_MCR_REG  0x3F215050
-#define BCM2837_AUX_MU_LSR_REG  0x3F215054
-#define BCM2837_AUX_MU_MSR_REG  0x3F215058
-#define BCM2837_AUX_MU_SCRATCH  0x3F21505C
-#define BCM2837_AUX_MU_CNTL_REG 0x3F215060
-#define BCM2837_AUX_MU_STAT_REG 0x3F215064
-#define BCM2837_AUX_MU_BAUD_REG 0x3F215068
-
 #define BCM2837_GPFSEL0_V               (pi3_gpio_base_vaddr + 0x0)
 #define BCM2837_GPSET0_V                (pi3_gpio_base_vaddr + 0x1C)
 #define BCM2837_GPCLR0_V                (pi3_gpio_base_vaddr + 0x28)
index 49f2b889e51e2c5e509109bcc7814e188a92c7dc..059b64ee8192a49aece162711e300e258f95f668 100644 (file)
@@ -13,15 +13,14 @@ DATAFILES = \
        board_config.h \
        boot.h \
        S3c2410x.h \
-       S5L8960X.h \
        T7000.h \
        S8000.h \
        T8010.h \
-       cyclone.h \
        typhoon.h \
        twister.h \
        hurricane.h \
-       BCM2837.h
+       BCM2837.h \
+       spr_locks.h
 
 
 INSTALL_MD_LIST        = ${DATAFILES}
diff --git a/pexpert/pexpert/arm64/S5L8960X.h b/pexpert/pexpert/arm64/S5L8960X.h
deleted file mode 100644 (file)
index 82e1403..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2011 Apple Inc. All rights reserved.
- */
-
-#ifndef _PEXPERT_ARM_S5L8960X_H
-#define _PEXPERT_ARM_S5L8960X_H
-
-#include <pexpert/arm64/AIC.h>
-#include <pexpert/arm64/cyclone.h>
-
-#define WITH_CLASSIC_S2R        1
-
-#ifndef ASSEMBLER
-
-#include <pexpert/arm/S3cUART.h>
-
-#endif
-
-#endif /* ! _PEXPERT_ARM_S5L8960X_H */
index ed1ecbb116a98b0cbf5e65359e953fd4bdd6d430..826414b54c32d6b9c5c548df9206d9763119356e 100644 (file)
 #ifndef ASSEMBLER
 
 #include <pexpert/arm/S3cUART.h>
+#include <pexpert/arm/dockchannel.h>
 #include <pexpert/arm64/AMCC.h>
 
-#define DOCKCHANNEL_UART                        (1)
-#define DOCKCHANNEL_STRIDE                      (0x10000)
-
-// Channel index
-#define DOCKCHANNEL_UART_CHANNEL                (0)
-
 // AOP_CLOCK frequency * 30 ms
 #define DOCKCHANNEL_DRAIN_PERIOD                (192000000 * 0.03)
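+// i.e. 192,000,000 Hz x 0.03 s = 5,760,000 AOP clock ticks per 30 ms drain window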
 
-#define rDOCKCHANNELS_AGENT_AP_INTR_CTRL        (*(volatile uint32_t *) (dock_agent_base + 0x00))
-#define rDOCKCHANNELS_AGENT_AP_INTR_STATUS      (*(volatile uint32_t *) (dock_agent_base + 0x04))
-#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_CTRL    (*(volatile uint32_t *) (dock_agent_base + 0x08))
-#define rDOCKCHANNELS_AGENT_AP_ERR_INTR_STATUS  (*(volatile uint32_t *) (dock_agent_base + 0x0c))
-
-#define rDOCKCHANNELS_DEV_DRAIN_CFG(_ch)        (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x0008))
-
-#define rDOCKCHANNELS_DEV_WDATA1(_ch)           (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4004))
-#define rDOCKCHANNELS_DEV_WSTAT(_ch)            (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4014))
-#define rDOCKCHANNELS_DEV_RDATA0(_ch)           (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x4018))
-#define rDOCKCHANNELS_DEV_RDATA1(_ch)           (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0x401c))
-
-#define rDOCKCHANNELS_DOCK_RDATA1(_ch)          (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc01c))
-#define rDOCKCHANNELS_DOCK_RDATA3(_ch)          (*(volatile uint32_t *) (uart_base + ((_ch) * DOCKCHANNEL_STRIDE) + 0xc024))
-
 #endif
 
 #endif /* ! _PEXPERT_ARM_T8010_H */
index ac3c6d32073bb49eebf97f132b258b6f556a9f9f..3d32aca8b6624d47d6b18f80f2c0827392961bb1 100644 (file)
 
 #ifdef APPLE_ARM64_ARCH_FAMILY
 
-#define ARM64_REG_HID0                                         S3_0_c15_c0_0
-#define ARM64_REG_HID0_LoopBuffDisb                            (1<<20)
-#define ARM64_REG_HID0_ICPrefLimitOneBrn                       (1<<25) 
-#define ARM64_REG_HID0_PMULLFuseDisable                                (1ULL<<33)
-#define ARM64_REG_HID0_ICPrefDepth_bshift                      60
-#define ARM64_REG_HID0_ICPrefDepth_bmsk                                (7ULL <<ARM64_REG_HID0_ICPrefDepth_bshift)
-
-#define ARM64_REG_EHID0                                                S3_0_c15_c0_1
-#define ARM64_REG_EHID0_nfpRetFwdDisb                          (1ULL<<45)
-
-#define ARM64_REG_HID1                                         S3_0_c15_c1_0
-#define ARM64_REG_HID1_disCmpBrFusion                          (1<<14)
-#define ARM64_REG_HID1_rccForceAllIexL3ClksOn                  (1<<23)
-#define ARM64_REG_HID1_rccDisStallInactiveIexCtl               (1<<24)
-#define ARM64_REG_HID1_disLspFlushWithContextSwitch            (1<<25)
-#define ARM64_REG_HID1_disAESFuseAcrossGrp                     (1<<44)
-#define ARM64_REG_HID1_enaBrKillLimit                          (1ULL << 60)
-
-#define ARM64_REG_HID2                                         S3_0_c15_c2_0
-#define ARM64_REG_HID2_disMMUmtlbPrefetch                      (1<<13)
-
-#define ARM64_REG_HID3                                         S3_0_c15_c3_0
-#define ARM64_REG_HID3_DisDcZvaCmdOnly                         (1<<25)
-#define ARM64_REG_HID3_DisXmonSnpEvictTriggerL2StarvationMode  (1<<54)
-#define ARM64_REG_HID3_DisColorOpt                             (1<<2)
-
-#define ARM64_REG_EHID3                                                S3_0_c15_c3_1
-#define ARM64_REG_EHID3_DisColorOpt                            (1<<2)
-#define ARM64_REG_EHID3_DisDcZvaCmdOnly                                (1<<25)
-
-#define ARM64_REG_HID4                                         S3_0_c15_c4_0
-#define ARM64_REG_EHID4                                                S3_0_c15_c4_1
-
-#define ARM64_REG_HID4_DisDcMVAOps                             (1<<11)
-#define ARM64_REG_HID4_DisSpecLnchRead                 (1<<33)
-#define ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd                        (1<<39)
-#define ARM64_REG_HID4_DisDcSWL2Ops                            (1<<44)
-
-#define ARM64_REG_HID5                                         S3_0_c15_c5_0
-#define ARM64_REG_HID5_DisHwpLd                                        (1<<44)
-#define ARM64_REG_HID5_DisHwpSt                                        (1<<45)
-#define ARM64_REG_HID5_DisFullLineWr                           (1ULL << 57)
-#define ARM64_REG_HID5_EnableDnFIFORdStall                     (1ULL << 54)
-#define ARM64_REG_HID5_CrdEdbSnpRsvd_mask                      (3ULL << 14)
-#define ARM64_REG_HID5_CrdEdbSnpRsvd_VALUE                     (2ULL << 14)
-
-#define ARM64_REG_EHID5                                                S3_0_c15_c5_1
-#define ARM64_REG_EHID5_DisFillByp                     (1 << 35)
-
-#define ARM64_REG_HID6                                         S3_0_c15_c6_0
-#define ARM64_REG_HID6_DisClkDivGating                         (1ULL << 55)
-
-#define ARM64_REG_HID7                                         S3_0_c15_c7_0
-#define ARM64_REG_HID7_disNexFastFmul                          (1 << 10)
-#define ARM64_REG_HID7_disCrossPick2                           (1ULL << 7)
-
-#define ARM64_REG_HID8                                         S3_0_c15_c8_0
-#define ARM64_REG_HID8_DataSetID0_VALUE                                (0xF << 4)
-#define ARM64_REG_HID8_DataSetID1_VALUE                                (0xF << 8)
-#define ARM64_REG_HID8_WkeForceStrictOrder                     (0x1ULL << 35)
-#define ARM64_REG_HID8_DataSetID2_VALUE                                (0xF << 56)
-#define ARM64_REG_HID8_DataSetID3_VALUE                                (0xF << 60)
-
-#define ARM64_REG_HID9                                         S3_0_c15_c9_0
-
-#define ARM64_REG_HID10                                                S3_0_c15_c10_0
-#define ARM64_REG_HID10_DisHwpGups                             (1ULL << 0)
-
-#define ARM64_REG_EHID10                                               S3_0_c15_c10_1
-#define ARM64_REG_EHID10_rccDisPwrSavePrfClkOff        (1ULL << 19)
-
-#if defined(APPLECYCLONE) || defined(APPLETYPHOON) || defined(APPLETWISTER)
-#define ARM64_REG_HID11                                                S3_0_c15_c13_0
-#else
-#define ARM64_REG_HID11                                                S3_0_c15_c11_0
-#endif
-#define ARM64_REG_HID11_DisX64NTLnchOpt                                (1ULL << 1)
-#define ARM64_REG_HID11_DisFillC1BubOpt                                (1<<7)
-#define ARM64_REG_HID11_DisFastDrainOpt                                (1ULL << 23)
-
-#define ARM64_REG_EHID11                                       S3_0_c15_c11_1
-#define ARM64_REG_EHID11_SmbDrainThresh_mask                   (3ULL << 40)
-
-#if defined(APPLECYCLONE) || defined(APPLETYPHOON) || defined(APPLETWISTER)
-#define ARM64_REG_CYC_CFG                                      S3_5_c15_c4_0
-#define ARM64_REG_CYC_CFG_skipInit                             (1ULL<<30)
-#define ARM64_REG_CYC_CFG_deepSleep                            (1ULL<<24)
-#else
-#define ARM64_REG_ACC_OVRD                                     S3_5_c15_c6_0
+#define ARM64_REG_HID0                    S3_0_c15_c0_0
+#define ARM64_REG_HID0_LoopBuffDisb       (1<<20)
+#define ARM64_REG_HID0_ICPrefLimitOneBrn  (1<<25)
+#define ARM64_REG_HID0_PMULLFuseDisable   (1ULL<<33)
+#define ARM64_REG_HID0_CacheFusionDisable (1ULL<<36)
+#define ARM64_REG_HID0_ICPrefDepth_bshift 60
+#define ARM64_REG_HID0_ICPrefDepth_bmsk   (7ULL << ARM64_REG_HID0_ICPrefDepth_bshift)
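+
+/*
+ * Illustrative sketch (an assumption, not from this change): HID "chicken
+ * bits" like these are normally applied read-modify-write during early CPU
+ * init, e.g.
+ *
+ *     uint64_t hid0 = __builtin_arm_rsr64("S3_0_c15_c0_0");
+ *     hid0 &= ~ARM64_REG_HID0_ICPrefDepth_bmsk;
+ *     hid0 |= (2ULL << ARM64_REG_HID0_ICPrefDepth_bshift);  // example depth only
+ *     __builtin_arm_wsr64("S3_0_c15_c0_0", hid0);
+ */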
+
+#define ARM64_REG_EHID0               S3_0_c15_c0_1
+#define ARM64_REG_EHID0_nfpRetFwdDisb (1ULL<<45)
+
+#define ARM64_REG_HID1                              S3_0_c15_c1_0
+#define ARM64_REG_HID1_disCmpBrFusion               (1<<14)
+#define ARM64_REG_HID1_forceNexL3ClkOn              (1<<15)
+#define ARM64_REG_HID1_rccForceAllIexL3ClksOn       (1<<23)
+#define ARM64_REG_HID1_rccDisStallInactiveIexCtl    (1<<24)
+#define ARM64_REG_HID1_disLspFlushWithContextSwitch (1<<25)
+#define ARM64_REG_HID1_disAESFuseAcrossGrp          (1<<44)
+#define ARM64_REG_HID1_disMSRSpecDAIF               (1ULL << 49)
+#define ARM64_REG_HID1_enaBrKillLimit               (1ULL << 60)
+
+#define ARM64_REG_EHID1                             S3_0_c15_c1_1
+#define ARM64_REG_EHID1_disMSRSpecDAIF              (1ULL << 30)
+
+#define ARM64_REG_HID2                    S3_0_c15_c2_0
+#define ARM64_REG_HID2_disMMUmtlbPrefetch (1<<13)
+
+#define ARM64_REG_HID3                                        S3_0_c15_c3_0
+#define ARM64_REG_HID3_DisColorOpt                            (1<<2)
+#define ARM64_REG_HID3_DisDcZvaCmdOnly                        (1<<25)
+#define ARM64_REG_HID3_DisXmonSnpEvictTriggerL2StarvationMode (1<<54)
+
+#define ARM64_REG_EHID3                 S3_0_c15_c3_1
+#define ARM64_REG_EHID3_DisColorOpt     (1<<2)
+#define ARM64_REG_EHID3_DisDcZvaCmdOnly (1<<25)
+
+#define ARM64_REG_HID4                          S3_0_c15_c4_0
+#define ARM64_REG_EHID4                         S3_0_c15_c4_1
+
+#define ARM64_REG_HID4_DisDcMVAOps              (1<<11)
+#define ARM64_REG_HID4_DisSpecLnchRead          (1<<33)
+#define ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd (1<<39)
+#define ARM64_REG_HID4_CnfCntrThresh_shift      (40)
+#define ARM64_REG_HID4_CnfCntrThresh_mask       (0x3ULL << ARM64_REG_HID4_CnfCntrThresh_shift)
+#define ARM64_REG_HID4_DisDcSWL2Ops             (1<<44)
+#define ARM64_REG_HID4_disSpecLSRedirect        (1<<9)
+#define ARM64_REG_HID4_DisSTNTWidget            (1<<1)
+
+#define ARM64_REG_HID5                     S3_0_c15_c5_0
+#define ARM64_REG_HID5_DisHwpLd            (1<<44)
+#define ARM64_REG_HID5_DisHwpSt            (1<<45)
+#define ARM64_REG_HID5_DisFill2cMerge      (1ULL << 61)
+#define ARM64_REG_HID5_EnableDnFIFORdStall (1ULL << 54)
+#define ARM64_REG_HID5_DisFullLineWr       (1ULL << 57)
+#define ARM64_REG_HID5_CrdEdbSnpRsvd_mask  (3ULL << 14)
+#define ARM64_REG_HID5_CrdEdbSnpRsvd_VALUE (2ULL << 14)
+
+#define ARM64_REG_EHID5            S3_0_c15_c5_1
+#define ARM64_REG_EHID5_DisFillByp (1 << 35)
+
+#define ARM64_REG_HID6                 S3_0_c15_c6_0
+#define ARM64_REG_HID6_DisClkDivGating (1ULL << 55)
+
+#define ARM64_REG_HID7                S3_0_c15_c7_0
+#define ARM64_REG_HID7_disNexFastFmul (1 << 10)
+#define ARM64_REG_HID7_disCrossPick2  (1ULL << 7)
+
+#define ARM64_REG_HID8                     S3_0_c15_c8_0
+#define ARM64_REG_HID8_DataSetID0_VALUE    (0xF << 4)
+#define ARM64_REG_HID8_DataSetID1_VALUE    (0xF << 8)
+#define ARM64_REG_HID8_WkeForceStrictOrder (0x1ULL << 35)
+#define ARM64_REG_HID8_DataSetID2_VALUE    (0xF << 56)
+#define ARM64_REG_HID8_DataSetID3_VALUE    (0xF << 60)
+
+#define ARM64_REG_HID9                         S3_0_c15_c9_0
+#define ARM64_REG_HID9_DisSTNTWidgetForUnalign (1ULL << 52)
+#define ARM64_REG_HID9_EnableFixBug47221499    (1ULL << 54)
+
+#define ARM64_REG_HID10            S3_0_c15_c10_0
+#define ARM64_REG_HID10_DisHwpGups (1ULL << 0)
+
+#define ARM64_REG_EHID10                        S3_0_c15_c10_1
+#define ARM64_REG_EHID10_rccDisPwrSavePrfClkOff (1ULL << 19)
+#define ARM64_REG_EHID10_ForceWStDrainUc        (1ULL << 32)
+
+#if defined(APPLETYPHOON) || defined(APPLETWISTER)
+#define ARM64_REG_HID11                 S3_0_c15_c13_0
+#else /* defined(APPLETYPHOON) || defined(APPLETWISTER) */
+#define ARM64_REG_HID11                 S3_0_c15_c11_0
+#endif /* defined(APPLETYPHOON) || defined(APPLETWISTER) */
+#define ARM64_REG_HID11_DisX64NTLnchOpt (1ULL << 1)
+#define ARM64_REG_HID11_DisFillC1BubOpt (1ULL << 7)
+#define ARM64_REG_HID11_DisFastDrainOpt (1ULL << 23)
+
+#define ARM64_REG_EHID11                     S3_0_c15_c11_1
+#define ARM64_REG_EHID11_SmbDrainThresh_mask (3ULL << 40)
+
+#define ARM64_REG_HID13                      S3_0_c15_c14_0
+#define ARM64_REG_HID13_PreCyc_shift         (14)
+#define ARM64_REG_HID13_PreCyc_mask          (0xFULL << ARM64_REG_HID13_PreCyc_shift)
+
+#define ARM64_REG_HID16                      S3_0_c15_c15_2
+#define ARM64_REG_HID16_leqThrottleAggr      (1ULL << 18)
+#define ARM64_REG_HID16_EnRs4Sec             (1ULL << 57)
+#define ARM64_REG_HID16_DisxPickRs45         (1ULL << 60)
+#define ARM64_REG_HID16_EnMPxPick45          (1ULL << 61)
+#define ARM64_REG_HID16_EnMPCyc7             (1ULL << 62)
+
+#if defined(APPLETYPHOON) || defined(APPLETWISTER)
+#define ARM64_REG_CYC_CFG              S3_5_c15_c4_0
+#define ARM64_REG_CYC_CFG_skipInit     (1ULL<<30)
+#define ARM64_REG_CYC_CFG_deepSleep    (1ULL<<24)
+#else /* defined(APPLETYPHOON) || defined(APPLETWISTER) */
+#define ARM64_REG_ACC_OVRD             S3_5_c15_c6_0
 #if defined(APPLEMONSOON)
-#define ARM64_REG_ACC_EBLK_OVRD                                        S3_5_c15_c6_1   // EBLK_OVRD on Zephyr
-#endif
-#define ARM64_REG_ACC_OVRD_enDeepSleep                         (1ULL << 34)
-#define ARM64_REG_ACC_OVRD_disPioOnWfiCpu                      (1ULL << 32)
-#define ARM64_REG_ACC_OVRD_dsblClkDtr                          (1ULL << 29)
-#define ARM64_REG_ACC_OVRD_cpmWakeUp_mask                      (3ULL << 27)
-#define ARM64_REG_ACC_OVRD_cpmWakeUp_force                     (3ULL << 27)
-#define ARM64_REG_ACC_OVRD_ok2PwrDnCPM_mask                    (3ULL << 25)
-#define ARM64_REG_ACC_OVRD_ok2PwrDnCPM_deny                    (2ULL << 25)
-#define ARM64_REG_ACC_OVRD_ok2PwrDnCPM_deepsleep               (3ULL << 25)
-#define ARM64_REG_ACC_OVRD_ok2TrDnLnk_mask                     (3ULL << 17)
-#define ARM64_REG_ACC_OVRD_ok2TrDnLnk_deepsleep                        (3ULL << 17)
-#define ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_mask              (3ULL << 15)
-#define ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_deepsleep         (2ULL << 15)
-#define ARM64_REG_ACC_OVRD_ok2PwrDnSRM_mask                    (3ULL << 13)
-#define ARM64_REG_ACC_OVRD_ok2PwrDnSRM_deepsleep               (3ULL << 13)
-#endif
-
-#define ARM64_REG_CYC_OVRD                                     S3_5_c15_c5_0
-#define ARM64_REG_CYC_OVRD_ok2pwrdn_force_up                   (2<<24)
-#define ARM64_REG_CYC_OVRD_ok2pwrdn_force_down                 (3<<24)
-#define ARM64_REG_CYC_OVRD_disWfiRetn                          (1<<0)
+#define ARM64_REG_ACC_EBLK_OVRD        S3_5_c15_c6_1 // EBLK_OVRD on Zephyr
+#endif /* defined(APPLEMONSOON) */
+
+#define ARM64_REG_ACC_OVRD_enDeepSleep                 (1ULL << 34)
+#define ARM64_REG_ACC_OVRD_disPioOnWfiCpu              (1ULL << 32)
+#define ARM64_REG_ACC_OVRD_dsblClkDtr                  (1ULL << 29)
+#define ARM64_REG_ACC_OVRD_cpmWakeUp_mask              (3ULL << 27)
+#define ARM64_REG_ACC_OVRD_cpmWakeUp_force             (3ULL << 27)
+#define ARM64_REG_ACC_OVRD_ok2PwrDnCPM_mask            (3ULL << 25)
+#define ARM64_REG_ACC_OVRD_ok2PwrDnCPM_deny            (2ULL << 25)
+#define ARM64_REG_ACC_OVRD_ok2PwrDnCPM_deepsleep       (3ULL << 25)
+#define ARM64_REG_ACC_OVRD_ok2TrDnLnk_mask             (3ULL << 17)
+#define ARM64_REG_ACC_OVRD_ok2TrDnLnk_deepsleep        (3ULL << 17)
+#define ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_mask      (3ULL << 15)
+#define ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_deepsleep (2ULL << 15)
+#define ARM64_REG_ACC_OVRD_ok2PwrDnSRM_mask            (3ULL << 13)
+#define ARM64_REG_ACC_OVRD_ok2PwrDnSRM_deepsleep       (3ULL << 13)
+
+#endif /* defined(APPLETYPHOON) || defined(APPLETWISTER) */
+
+#define ARM64_REG_CYC_OVRD                     S3_5_c15_c5_0
+#define ARM64_REG_CYC_OVRD_ok2pwrdn_force_up   (2<<24)
+#define ARM64_REG_CYC_OVRD_ok2pwrdn_force_down (3<<24)
+#define ARM64_REG_CYC_OVRD_disWfiRetn          (1<<0)
 
 #if defined(APPLEMONSOON)
-#define ARM64_REG_CYC_OVRD_dsblSnoopTime_mask                  (3ULL << 30)
-#define ARM64_REG_CYC_OVRD_dsblSnoopPTime                      (1ULL << 31)    /// Don't fetch the timebase from the P-block
+#define ARM64_REG_CYC_OVRD_dsblSnoopTime_mask  (3ULL << 30)
+#define ARM64_REG_CYC_OVRD_dsblSnoopPTime      (1ULL << 31)  /// Don't fetch the timebase from the P-block
 #endif /* APPLEMONSOON */
 
-#define ARM64_REG_LSU_ERR_STS                          S3_3_c15_c0_0
-#define ARM64_REG_LSU_ERR_STS_L1DTlbMultiHitEN (1ULL<<54)
+#define ARM64_REG_LSU_ERR_STS                  S3_3_c15_c0_0
+#define ARM64_REG_LSU_ERR_STS_L1DTlbMultiHitEN (1ULL<<54)
 
-#define ARM64_REG_E_LSU_ERR_STS                                S3_3_c15_c2_0
+#define ARM64_REG_E_LSU_ERR_STS                S3_3_c15_c2_0
 
-#define ARM64_REG_LSU_ERR_CTL                          S3_3_c15_c1_0
-#define ARM64_REG_LSU_ERR_CTL_L1DTlbMultiHitEN (1ULL<<3)
+#define ARM64_REG_LSU_ERR_CTL                  S3_3_c15_c1_0
+#define ARM64_REG_LSU_ERR_CTL_L1DTlbMultiHitEN (1ULL<<3)
 
-#define ARM64_REG_FED_ERR_STS                          S3_4_C15_C0_0
+#define ARM64_REG_FED_ERR_STS                  S3_4_C15_C0_0
 
-#define ARM64_REG_E_FED_ERR_STS                                S3_4_C15_C0_2
+#define ARM64_REG_E_FED_ERR_STS                S3_4_C15_C0_2
 
-#define ARM64_REG_MMU_ERR_STS                          S3_6_c15_c0_0
+#define ARM64_REG_MMU_ERR_STS                  S3_6_c15_c0_0
 
-#define ARM64_REG_E_MMU_ERR_STS                                s3_6_c15_c2_0
+#define ARM64_REG_E_MMU_ERR_STS                s3_6_c15_c2_0
 
-#define ARM64_REG_L2C_ERR_STS                          S3_3_c15_c8_0
+#define ARM64_REG_L2C_ERR_STS                  S3_3_c15_c8_0
 
-#define ARM64_REG_L2C_ERR_ADR                          S3_3_c15_c9_0
+#define ARM64_REG_L2C_ERR_ADR                  S3_3_c15_c9_0
 
-#define ARM64_REG_L2C_ERR_INF                          S3_3_c15_c10_0
+#define ARM64_REG_L2C_ERR_INF                  S3_3_c15_c10_0
 
-#define ARM64_REG_MIGSTS_EL1                           S3_4_c15_c0_4
+#define ARM64_REG_MIGSTS_EL1                   S3_4_c15_c0_4
+
+#define ARM64_REG_DPC_ERR_STS                  S3_5_c15_c0_5
 
 #if defined(HAS_KTRR)
 
 #ifdef ASSEMBLER
-#define ARM64_REG_KTRR_LOWER_EL1                        S3_4_c15_c2_3
-#define ARM64_REG_KTRR_UPPER_EL1                        S3_4_c15_c2_4
-#define ARM64_REG_KTRR_LOCK_EL1                         S3_4_c15_c2_2
-#else
-#define ARM64_REG_KTRR_LOWER_EL1                        "S3_4_c15_c2_3"
-#define ARM64_REG_KTRR_UPPER_EL1                        "S3_4_c15_c2_4"
-#define ARM64_REG_KTRR_LOCK_EL1                         "S3_4_c15_c2_2"
+#define ARM64_REG_KTRR_LOWER_EL1 S3_4_c15_c2_3
+#define ARM64_REG_KTRR_UPPER_EL1 S3_4_c15_c2_4
+#define ARM64_REG_KTRR_LOCK_EL1  S3_4_c15_c2_2
+#else /* ASSEMBLER */
+#define ARM64_REG_KTRR_LOWER_EL1 "S3_4_c15_c2_3"
+#define ARM64_REG_KTRR_UPPER_EL1 "S3_4_c15_c2_4"
+#define ARM64_REG_KTRR_LOCK_EL1  "S3_4_c15_c2_2"
 #endif /* ASSEMBLER */
 
 #endif /* defined (HAS_KTRR) */
 
 
 
-#endif /* APPLE_ARM64_ARCH_FAMILY */
+#endif /* APPLE_ARM64_ARCH_FAMILY */
+
+
+
+#if defined(HAS_APPLE_PAC)
+
+#ifdef ASSEMBLER
+#define ARM64_REG_APCTL_EL1            S3_4_c15_c0_4
+#define ARM64_REG_APSTS_EL1            S3_6_c15_c12_4
+#else /* ASSEMBLER */
+#define ARM64_REG_APCTL_EL1            "S3_4_c15_c0_4"
+#define ARM64_REG_APSTS_EL1            "S3_6_c15_c12_4"
+#endif /* ASSEMBLER */
+
+#ifdef ASSEMBLER
+#define ARM64_REG_KERNELKEYLO_EL1      S3_4_c15_c1_0
+#define ARM64_REG_KERNELKEYHI_EL1      S3_4_c15_c1_1
+
+#define ARM64_REG_APIAKEYLO_EL1        S3_0_c2_c1_0
+#define ARM64_REG_APIAKEYHI_EL1        S3_0_c2_c1_1
+#define ARM64_REG_APIBKEYLO_EL1        S3_0_c2_c1_2
+#define ARM64_REG_APIBKEYHI_EL1        S3_0_c2_c1_3
+
+#define ARM64_REG_APDAKEYLO_EL1        S3_0_c2_c2_0
+#define ARM64_REG_APDAKEYHI_EL1        S3_0_c2_c2_1
+#define ARM64_REG_APDBKEYLO_EL1        S3_0_c2_c2_2
+#define ARM64_REG_APDBKEYHI_EL1        S3_0_c2_c2_3
+
+#define ARM64_REG_APGAKEYLO_EL1        S3_0_c2_c3_0
+#define ARM64_REG_APGAKEYHI_EL1        S3_0_c2_c3_1
+#else /* ASSEMBLER */
+#define ARM64_REG_KERNELKEYLO_EL1      "S3_4_c15_c1_0"
+#define ARM64_REG_KERNELKEYHI_EL1      "S3_4_c15_c1_1"
+
+#define ARM64_REG_APIAKEYLO_EL1        "S3_0_c2_c1_0"
+#define ARM64_REG_APIAKEYHI_EL1        "S3_0_c2_c1_1"
+#define ARM64_REG_APIBKEYLO_EL1        "S3_0_c2_c1_2"
+#define ARM64_REG_APIBKEYHI_EL1        "S3_0_c2_c1_3"
+
+#define ARM64_REG_APDAKEYLO_EL1        "S3_0_c2_c2_0"
+#define ARM64_REG_APDAKEYHI_EL1        "S3_0_c2_c2_1"
+#define ARM64_REG_APDBKEYLO_EL1        "S3_0_c2_c2_2"
+#define ARM64_REG_APDBKEYHI_EL1        "S3_0_c2_c2_3"
+
+#define ARM64_REG_APGAKEYLO_EL1        "S3_0_c2_c3_0"
+#define ARM64_REG_APGAKEYHI_EL1        "S3_0_c2_c3_1"
+#endif /* ASSEMBLER */
+#endif /* HAS_APPLE_PAC */
+
 
 
 
+#define MPIDR_PNE_SHIFT 16 // pcore not ecore
+#define MPIDR_PNE       (1 << MPIDR_PNE_SHIFT)
 
 
-#define MPIDR_PNE_SHIFT                       16       // pcore not ecore
-#define MPIDR_PNE                      (1 << MPIDR_PNE_SHIFT)
 
 #ifdef ASSEMBLER
 
 /*
- *  arg0: register in which to store result
- *      0=>not a p-core, non-zero=>p-core
+ * arg0: register in which to store result
+ *   0=>not a p-core, non-zero=>p-core
  */
 .macro ARM64_IS_PCORE
 #if defined(APPLEMONSOON) || HAS_CLUSTER
-       mrs             $0, MPIDR_EL1
-       and             $0, $0, #(MPIDR_PNE)
-#endif
+       mrs $0, MPIDR_EL1
+       and $0, $0, #(MPIDR_PNE)
+#endif /* defined(APPLEMONSOON) || HAS_CLUSTER */
 .endmacro
 
 /*
  * reads a special purpose register, using a different msr for e- vs. p-cores
- * arg0: register indicating the current core type, see ARM64_IS_PCORE
- * arg1: register in which to store the result of the read
- * arg2: SPR to use for e-core
- * arg3: SPR to use for p-core or non-AMP architecture
+ *   arg0: register indicating the current core type, see ARM64_IS_PCORE
+ *   arg1: register in which to store the result of the read
+ *   arg2: SPR to use for e-core
+ *   arg3: SPR to use for p-core or non-AMP architecture
  */
 .macro ARM64_READ_EP_SPR
 #if defined(APPLEMONSOON) || HAS_CLUSTER
-       cbnz            $0, 1f
+       cbnz $0, 1f
 // e-core
-       mrs             $1, $2
-       b               2f
+       mrs  $1, $2
+       b    2f
 // p-core
 1:
-#endif
-       mrs             $1, $3
+#endif /* defined(APPLEMONSOON) || HAS_CLUSTER */
+       mrs  $1, $3
 2:
 .endmacro
 
  */
 .macro ARM64_WRITE_EP_SPR
 #if defined(APPLEMONSOON) || HAS_CLUSTER
-       cbnz            $0, 1f
+       cbnz $0, 1f
 // e-core
-       msr             $2, $1
-       b               2f
+       msr  $2, $1
+       b    2f
 // p-core
 1:
-#endif
-       msr             $3, $1
+#endif /* defined(APPLEMONSOON) || HAS_CLUSTER */
+       msr  $3, $1
 2:
 .endmacro
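 
 // Illustrative use of the macros above (a sketch, not part of this change):
 //     ARM64_IS_PCORE    x15                // x15 != 0 iff running on a p-core
 //     ARM64_READ_EP_SPR x15, x12, ARM64_REG_EHID10, ARM64_REG_HID10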
 
index c4a0edd1d83316d64d282affaec6e17392d503ed..bad75685783cfbbbea7a031ee248d34e7875d796 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2019 Apple Inc. All rights reserved.
  * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
  */
 #ifndef _PEXPERT_ARM_BOARD_CONFIG_H
@@ -7,19 +7,6 @@
 
 #include <mach/machine.h>
 
-#ifdef ARM64_BOARD_CONFIG_S5L8960X
-#define APPLE_ARM64_ARCH_FAMILY  1
-#define APPLECYCLONE
-#define ARM_ARCH_TIMER
-#include <pexpert/arm64/S5L8960X.h>
-#define __ARM_L2CACHE_SIZE_LOG__ 20
-#define ARM_BOARD_WFE_TIMEOUT_NS 1000
-#define ARM_BOARD_CLASS_S5L8960X
-#define KERNEL_INTEGRITY_WT 1
-#define PEXPERT_NO_3X_IMAGES    1
-#define CORE_NCTRS 8
-#define CPMU_AIC_PMI 1
-#endif  /* ARM64_BOARD_CONFIG_S5L8960X */
 
 #ifdef ARM64_BOARD_CONFIG_T7000
 #define APPLE_ARM64_ARCH_FAMILY  1
 
 
 
+
+
+
 #ifdef ARM64_BOARD_CONFIG_BCM2837
 #define BCM2837
 #define BCM2837_BRINGUP
 #define __ARM_L2CACHE_SIZE_LOG__ 19
 #define ARM_BOARD_CLASS_BCM2837
 #define CPU_COUNT 4
+#define CORE_NCTRS 8 /* Placeholder; KPC is not enabled for this target */
 #endif  /* ARM64_BOARD_CONFIG_BCM2837 */
 
 #endif /* ! _PEXPERT_ARM_BOARD_CONFIG_H */
index 1bb95329701fd24cd0b8dcad4bcb47474171079c..1bcf4990e4fad01622e643eebd7fa501aedf07cd 100644 (file)
@@ -9,11 +9,21 @@
 #ifndef _PEXPERT_ARM64_BOOT_H_
 #define _PEXPERT_ARM64_BOOT_H_
 
+#ifdef KERNEL
 #include <kern/kern_types.h>
+#endif
 #include <pexpert/arm/consistent_debug.h>
 #include <pexpert/arm/protos.h>
 
-#define BOOT_LINE_LENGTH        256
+/*
+ * Maximum size of an environment variable value. This particular value is
+ * chosen to accommodate the maximum encoded size of the system token as
+ * computed in https://confluence.sd.apple.com/display/TK/System+Token.
+ *
+ * This value matches iBoot's IBOOT_MAX_ENV_VAR_DATA_SIZE.
+ * There are no iBoot headers, so the value has to be duplicated here for now.
+ */
+#define BOOT_LINE_LENGTH        608
 
 /*
  * Video information.
diff --git a/pexpert/pexpert/arm64/cyclone.h b/pexpert/pexpert/arm64/cyclone.h
deleted file mode 100644 (file)
index 6d5d900..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
- */
-
-#ifndef _PEXPERT_ARM_CYCLONE_H
-#define _PEXPERT_ARM_CYCLONE_H
-
-#ifdef APPLECYCLONE
-#include "arm64_common.h"
-
-#define MONITOR                 1 /* Use EL3 monitor */
-#define NO_ECORE                1
-#define HAS_32BIT_DBGWRAP       1
-
-/*
- * Determined by experiment (not described in manual):
- * A0 is variant 0, B0 is variant 1.  See arm64/proc_reg.h
- * for how these values are constructed from the MIDR.
- */
-#define CYCLONE_CPU_VERSION_A0                  0x00
-#define CYCLONE_CPU_VERSION_B0                  0x10
-
-#endif
-
-#endif /* ! _PEXPERT_ARM_CYCLONE_H */
index dea4c8f3ee24415182d79ce38f6d70f0b246749b..bf1b181d20d061b7bee940cd26b9a0b9749bf464 100644 (file)
@@ -1,13 +1,14 @@
 /*
- * Copyright (c) 2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2014-2018 Apple Inc. All rights reserved.
  */
 
 #ifndef _PEXPERT_ARM_HURRICANE_H
 #define _PEXPERT_ARM_HURRICANE_H
 
-#define NO_MONITOR      1 /* No EL3 for this CPU -- ever */
-#define HAS_MIGSTS      1 /* Has MIGSTS register, and supports migration between p-core and e-core */
-#define HAS_KTRR        1 /* Has KTRR registers */
+#define NO_MONITOR          1 /* No EL3 for this CPU -- ever */
+#define HAS_MIGSTS          1 /* Has MIGSTS register, and supports migration between p-core and e-core */
+#define HAS_KTRR            1 /* Has KTRR registers */
+#define HAS_CPMU_L2C_EVENTS 1 /* Has L2 cache events in CPMU */
 
 #ifdef APPLEHURRICANE
 #include "arm64_common.h"
diff --git a/pexpert/pexpert/arm64/spr_locks.h b/pexpert/pexpert/arm64/spr_locks.h
new file mode 100644 (file)
index 0000000..5d42a95
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _PEXPERT_ARM64_SPR_LOCKS_H
+#define _PEXPERT_ARM64_SPR_LOCKS_H
+
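+/*
+ * Inferred from the names (an assumption, not stated in this file): bits set
+ * in a *_RO_CTL_VAL mask are locked read-only, so each value is all-ones
+ * minus the few control bits that must remain writable.
+ */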
+#define MSR_RO_CTL_HID4                 (1ULL << 4)
+#define MSR_RO_CTL_CYC_OVRD             (1ULL << 27)
+#define MSR_RO_CTL_ACC_OVRD             (1ULL << 47)
+
+#define MSR_RO_CTL_VAL                  (~0ULL & ~(MSR_RO_CTL_HID4 | MSR_RO_CTL_CYC_OVRD | MSR_RO_CTL_ACC_OVRD))
+#define MSR_LOCK_VAL                    (1ULL << 0)
+
+#define CPU_PIO_RO_CTL_DBG_WRAP         (1ULL << 49)
+#define CPU_PIO_RO_CTL_TRACE_CORE_CFG   (1ULL << 54)
+
+#define CPU_PIO_RO_CTL_VAL              (~0ULL & ~(CPU_PIO_RO_CTL_DBG_WRAP | CPU_PIO_RO_CTL_TRACE_CORE_CFG))
+#define CPU_PIO_LOCK_VAL                (1ULL << 0)
+
+#define ACC_PIO_RO_CTL_PBLK_OVRD        (1ULL << 47)
+#define ACC_PIO_RO_CTL_DBG_CTL          (1ULL << 48)
+#define ACC_PIO_RO_CTL_DBG_PMGR         (1ULL << 50)
+#define ACC_PIO_RO_CTL_DBG_WRAP_GLB     (1ULL << 51)
+#define ACC_PIO_RO_CTL_TRACE_CTL        (1ULL << 53)
+#define ACC_PIO_RO_CTL_TRC_UT_CTL       (1ULL << 55)
+#define ACC_PIO_RO_CTL_OCLA_CTL         (1ULL << 56)
+
+#define ACC_PIO_RO_CTL_VAL              (~0ULL & ~(ACC_PIO_RO_CTL_PBLK_OVRD | ACC_PIO_RO_CTL_DBG_CTL | ACC_PIO_RO_CTL_DBG_PMGR |        \
+                                                  ACC_PIO_RO_CTL_DBG_WRAP_GLB | ACC_PIO_RO_CTL_TRACE_CTL |                             \
+                                                  ACC_PIO_RO_CTL_TRC_UT_CTL | ACC_PIO_RO_CTL_OCLA_CTL))
+#define ACC_PIO_LOCK_VAL                (1ULL << 0)
+
+#endif /* _PEXPERT_ARM64_SPR_LOCKS_H */
index 0a17b3f220fd0a1135096f96c19a70bff2f71f4e..4fc2b84808c7ee7a961cf8181bfa8ae2c6dddbe1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2014-2018 Apple Inc. All rights reserved.
  */
 
 #ifndef _PEXPERT_ARM_TWISTER_H
@@ -8,6 +8,7 @@
 #define MONITOR                 1 /* Use EL3 monitor */
 #define NO_ECORE                1
 #define HAS_32BIT_DBGWRAP       1
+#define HAS_CPMU_L2C_EVENTS     1 /* Has L2 cache events in CPMU */
 
 #ifdef APPLETWISTER
 #include "arm64_common.h"
index e91c1faa513d906abb02ef8b4333f41c0b30d44f..dba7d43621426e7cd101ddb1f87137905ab68bbc 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
  */
 
 #ifndef _PEXPERT_ARM_TYPHOON_H
@@ -8,6 +8,8 @@
 #define MONITOR                 1 /* Use EL3 monitor */
 #define NO_ECORE                1
 #define HAS_32BIT_DBGWRAP       1
+#define HAS_CPMU_BIU_EVENTS     1 /* Has BIU events in CPMU */
+#define HAS_CPMU_L2C_EVENTS     1 /* Has L2 cache events in CPMU */
 
 #ifdef APPLETYPHOON
 #include "arm64_common.h"
index d01a6ee19a55efa0e8334856f5204ee53f12bb40..c721ce842bbc30970ba0187ad08b28b5458af6c7 100644 (file)
@@ -91,6 +91,8 @@ uint32_t PE_get_random_seed(
 uint32_t PE_i_can_has_debugger(
        uint32_t *);
 
+int PE_stub_poll_input(unsigned int options, char *c);
+
 #if defined(__arm__) || defined(__arm64__)
 boolean_t PE_panic_debugging_enabled(void);
 
@@ -230,10 +232,6 @@ enum {
        kPEWaitForInput     = 0x00000001,
        kPERawInput         = 0x00000002
 };
-extern int (*PE_poll_input)(
-       unsigned int options,
-       char * c);
-
 extern int (*PE_write_IIC)(
        unsigned char addr,
        unsigned char reg,
@@ -314,12 +312,6 @@ extern PE_state_t PE_state;
 extern char * PE_boot_args(
        void);
 
-#if !defined(__LP64__) && !defined(__arm__)
-extern boolean_t PE_parse_boot_arg(
-       const char      *arg_string,
-       void            *arg_ptr) __deprecated;
-#endif
-
 extern boolean_t PE_parse_boot_argn(
        const char      *arg_string,
        void            *arg_ptr,
@@ -384,14 +376,20 @@ extern void pe_init_debug(void);
 
 extern boolean_t PE_imgsrc_mount_supported(void);
 
+extern void PE_panic_hook(const char *str);
+
+extern void PE_init_cpu(void);
+
 #if defined(__arm__) || defined(__arm64__)
 typedef void (*perfmon_interrupt_handler_func)(cpu_id_t source);
 extern kern_return_t PE_cpu_perfmon_interrupt_install_handler(perfmon_interrupt_handler_func handler);
 extern void PE_cpu_perfmon_interrupt_enable(cpu_id_t target, boolean_t enable);
 
-extern void (*PE_arm_debug_panic_hook)(const char *str);
 #if DEVELOPMENT || DEBUG
 extern void PE_arm_debug_enable_trace(void);
+extern void (*PE_arm_debug_panic_hook)(const char *str);
+#else
+extern void (*const PE_arm_debug_panic_hook)(const char *str);
 #endif
 #endif
 
index 4911193fb20146ce56f260aacc8ec96c43f23f79..fdca03b8c64f6891fa7a7d537208fbd3c13281a0 100644 (file)
@@ -101,6 +101,7 @@ ___asan_version_mismatch_check_apple_900
 ___asan_version_mismatch_check_apple_902
 ___asan_version_mismatch_check_apple_1000
 ___asan_version_mismatch_check_apple_1001
+___asan_version_mismatch_check_apple_clang_1100
 ___asan_init
 ___asan_memcpy
 ___asan_memmove
@@ -115,3 +116,48 @@ ___asan_strlcat
 ___asan_strncat
 ___asan_strlen
 ___asan_strnlen
+___ubsan_handle_add_overflow
+___ubsan_handle_add_overflow_abort
+___ubsan_handle_builtin_unreachable
+___ubsan_handle_divrem_overflow
+___ubsan_handle_divrem_overflow_abort
+___ubsan_handle_float_cast_overflow
+___ubsan_handle_float_cast_overflow_abort
+___ubsan_handle_function_type_mismatch
+___ubsan_handle_function_type_mismatch_abort
+___ubsan_handle_implicit_conversion
+___ubsan_handle_implicit_conversion_abort
+___ubsan_handle_invalid_builtin
+___ubsan_handle_invalid_builtin_abort
+___ubsan_handle_load_invalid_value
+___ubsan_handle_load_invalid_value_abort
+___ubsan_handle_missing_return
+___ubsan_handle_mul_overflow
+___ubsan_handle_mul_overflow_abort
+___ubsan_handle_negate_overflow
+___ubsan_handle_negate_overflow_abort
+___ubsan_handle_nonnull_arg
+___ubsan_handle_nonnull_arg_abort
+___ubsan_handle_nonnull_return
+___ubsan_handle_nonnull_return_abort
+___ubsan_handle_nullability_arg
+___ubsan_handle_nullability_arg_abort
+___ubsan_handle_nullability_return
+___ubsan_handle_nullability_return_abort
+___ubsan_handle_out_of_bounds
+___ubsan_handle_out_of_bounds_abort
+___ubsan_handle_pointer_overflow
+___ubsan_handle_pointer_overflow_abort
+___ubsan_handle_shift_out_of_bounds
+___ubsan_handle_shift_out_of_bounds_abort
+___ubsan_handle_sub_overflow
+___ubsan_handle_sub_overflow_abort
+___ubsan_handle_type_mismatch_v1
+___ubsan_handle_type_mismatch_v1_abort
+___ubsan_handle_vla_bound_not_positive
+___ubsan_handle_vla_bound_not_positive_abort
+___sanitizer_cov_trace_pc
+___sanitizer_cov_trace_pc_guard
+___sanitizer_cov_trace_pc_guard_init
+___sanitizer_cov_trace_pc_indirect
+___sanitizer_cov_pcs_init
index e8c09216728238a988b5c3c496088b6b0b417357..81639099434ed554b352f66bfabba88df0edcfa8 100644 (file)
@@ -17,6 +17,7 @@ PRIVATE_KERNELFILES = \
 
 # Available only in xnu proper
 PRIVATE_XNUFILES = \
+       ksancov.h \
        kasan.h
 
 INSTALL_MI_LIST = ${DATAFILES}
@@ -32,7 +33,7 @@ COMP_SUBDIRS = conf
 
 .DELETE_ON_ERROR:
 $(OBJROOT)/san/kasan-blacklist-%: $(SOURCE)/kasan-blacklist $(SOURCE)/ubsan-blacklist $(SOURCE)/kasan-blacklist-%
-       @echo "$(ColorH)GENERATING$(Color0)    $(ColorLF)$(notdir $@)$(Color0)"
+       $(call makelog,$(ColorH)GENERATING$(Color0)    $(ColorLF)$(notdir $@)$(Color0))
        $(_v)sed -e 's,^src:\./,src:'"$(SRCROOT)/," $^ > $@
        $(_v)$(SOURCE)/tools/validate_blacklist.sh "$@"
 
@@ -57,7 +58,7 @@ endif
 
 # Our external dependency on allsymbols is fine because this runs in a later phase (config_install vs. config_all)
 $(OBJPATH)/%.symbolset: $(SOURCE)/%.exports
-       @echo "$(ColorH)SYMBOLSET$(Color0)  $(ColorF)$*$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
+       $(call makelog,$(ColorH)SYMBOLSET$(Color0)     $(ColorF)$*$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)$(KEXT_CREATE_SYMBOL_SET)                  \
                $(ARCH_FLAGS_$(CURRENT_ARCH_CONFIG))    \
                -import $(OBJPATH)/allsymbols           \
@@ -66,12 +67,12 @@ $(OBJPATH)/%.symbolset: $(SOURCE)/%.exports
 
 $(DSTROOT_KEXT): $(DSTROOT_KEXT_PATH)/% : $(OBJPATH)/%.symbolset
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorF)INSTALL$(Color0)    $(ColorF)$(notdir $@)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
+       $(call makelog,$(ColorF)INSTALL$(Color0)    $(ColorF)$(notdir $@)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@
 
 $(SYMROOT_KEXT): $(SYMROOT_KEXT_PATH)/% : $(DSTROOT_KEXT_PATH)/%
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorF)INSTALL$(Color0)    $(ColorF)$(notdir $@)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
+       $(call makelog,$(ColorF)INSTALL$(Color0)    $(ColorF)$(notdir $@)$(Color0) "($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))")
        $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@
 
 do_config_install:: $(DSTROOT_KEXT) $(SYMROOT_KEXT)
@@ -85,7 +86,7 @@ endif
 
 $(KASAN_HELPER_SCRIPTS): $(DSTROOT)/$(DEVELOPER_EXTRAS_DIR)/% : $(SOURCE)/tools/%
        $(_v)$(MKDIR) $(dir $@)
-       @echo "$(ColorH)INSTALL$(Color0)    $(ColorF)$(@F)$(Color0)"
+       $(call makelog,$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0))
        $(_v)$(INSTALL) $(EXEC_INSTALL_FLAGS) $< $@
 
 do_config_install:: $(KASAN_HELPER_SCRIPTS)
index 7bd79d9ae78b20528784ccb5d5684d4a6bcb579f..05c4b79cf0cdde381cf245f419ccc25af4fec458 100644 (file)
@@ -23,7 +23,7 @@ endif
 
 $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS)
        $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG)
-       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG);
+       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG)
 
 do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
        $(_v)${MAKE} \
@@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
                SOURCE=$(subst conf/,,$(SOURCE))                        \
                TARGET=${TARGET}                                        \
                OBJPATH=${OBJPATH}                                      \
-               build_all;
+               build_all
 
 do_build_all:: do_all
 
index 8c60bc15bd5ba9c4efffaf2b948c8c1c0c4b140d..42c6ee1edb1e768a4cad3306ff055c74c0eb8f11 100644 (file)
@@ -47,19 +47,6 @@ COMP_SUBDIRS =
 # Rebuild if per-file overrides change
 ${OBJS}: $(firstword $(MAKEFILE_LIST))
 
-# set file list manually
-OBJS =
-
-ifeq ($(KASAN),1)
-OBJS += kasan.o kasan-fakestack.o kasan-memintrinsics.o kasan_dynamic_blacklist.o
-OBJS += kasan-$(CURRENT_ARCH_CONFIG_LC).o
-OBJS += kasan-test.o kasan-test-$(CURRENT_ARCH_CONFIG_LC).o
-endif
-
-ifeq ($(UBSAN),1)
-OBJS += ubsan.o ubsan_log.o
-endif
-
 # Rebuild if global compile flags change
 $(COBJS): .CFLAGS
 .CFLAGS: ALWAYS
@@ -76,13 +63,13 @@ $(SOBJS): .SFLAGS
        $(_v)$(REPLACECONTENTS) $@ $(KASAN)
 
 $(COMPONENT).filelist: $(OBJS) .KASANFLAGS
-       @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)"
+       $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0))
        $(_v)for obj in ${OBJS}; do     \
-                echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
+                $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
        done > $(COMPONENT).filelist
 
 $(TARGET)/$(CURRENT_KERNEL_CONFIG)/kasan_blacklist_dynamic.h: $(SRCROOT)/$(COMPONENT)/kasan-blacklist-dynamic
-       @echo "$(ColorH)GENERATING$(Color0)    $(ColorLF)$(notdir $@)$(Color0)"
+       $(call makelog,$(ColorH)GENERATING$(Color0)    $(ColorLF)$(notdir $@)$(Color0))
        @$(SRCROOT)/$(COMPONENT)/tools/generate_dynamic_blacklist.py "$<" > "$@"
 
 $(SRCROOT)/$(COMPONENT)/kasan_dynamic_blacklist.c: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/kasan_blacklist_dynamic.h
index 0c312a11ff87b04e0bb97b8588b73bf1207b5e8e..e4388a3df3d882ced979e9227c8f2d5e4295e368 100644 (file)
@@ -1,7 +1,14 @@
-san/kasan.c standard
-san/kasan-fakestack.c standard
-san/kasan-test.c standard
-san/kasan-memintrinsics.c standard
-san/kasan_dynamic_blacklist.c standard
-san/ubsan.c standard
-san/ubsan_log.c standard
+OPTIONS/CONFIG_KASAN            optional config_kasan
+OPTIONS/CONFIG_UBSAN            optional config_ubsan
+OPTIONS/CONFIG_KSANCOV          optional config_ksancov
+
+san/kasan.c                     optional config_kasan
+san/kasan-fakestack.c           optional config_kasan
+san/kasan-test.c                optional config_kasan
+san/kasan-memintrinsics.c       optional config_kasan
+san/kasan_dynamic_blacklist.c   optional config_kasan
+
+san/ubsan.c                     optional config_ubsan
+san/ubsan_log.c                 optional config_ubsan
+
+san/ksancov.c                   optional config_ksancov
index 4303b854de8fd94945cf4573b8c643552de6d9ce..d43c7ffabe94ff6db7f962f14fada35767d33732 100644 (file)
@@ -1,3 +1,2 @@
-# KASAN
-san/kasan-arm64.c standard
-san/kasan-test-arm64.s standard
+san/kasan-arm64.c       optional config_kasan
+san/kasan-test-arm64.s  optional config_kasan
index bd884e798036cec53163485827ce3b51a56a694c..91b496f474a7f222f25ec45fee3b54317771ee4f 100644 (file)
@@ -1,5 +1,2 @@
-# options
-
-# KASAN
-san/kasan-x86_64.c standard
-san/kasan-test-x86_64.s standard
+san/kasan-x86_64.c          optional config_kasan
+san/kasan-test-x86_64.s     optional config_kasan
index 3d3a2336482872c8802c3add79294f0d7d1342da..909a075efde4bc14f2d6b0bc4c26281aa92b4234 100644 (file)
@@ -45,7 +45,7 @@
 #include <memintrinsics.h>
 
 #include <pexpert/arm64/boot.h>
-#include <arm64/proc_reg.h>
+#include <arm64/tlb.h>
 
 #include <libkern/kernel_mach_header.h>
 
@@ -69,11 +69,10 @@ extern vm_offset_t intstack, intstack_top;
 extern vm_offset_t excepstack, excepstack_top;
 
 void kasan_bootstrap(boot_args *, vm_offset_t pgtable);
-void flush_mmu_tlb(void);
 
-#define KASAN_SHIFT_ARM64 0xdffffff800000000ULL /* Defined in makedefs/MakeInc.def */
-#define KASAN_SHADOW_MIN  0xfffffff400000000ULL
-#define KASAN_SHADOW_MAX  0xfffffff680000000ULL
+#define KASAN_SHIFT_ARM64 0xe000000000000000ULL /* Defined in makedefs/MakeInc.def */
+#define KASAN_SHADOW_MIN  0xfffffffc00000000ULL
+#define KASAN_SHADOW_MAX  0xffffffff80000000ULL
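+/* Standard ASan 1:8 mapping: one shadow byte covers eight bytes of kernel VA,
+ * i.e. shadow(addr) = (addr >> 3) + KASAN_SHIFT. */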
 
 _Static_assert(KASAN_SHIFT == KASAN_SHIFT_ARM64, "KASan inconsistent shadow shift");
 _Static_assert(VM_MAX_KERNEL_ADDRESS < KASAN_SHADOW_MIN, "KASan shadow overlaps with kernel VM");
@@ -124,7 +123,6 @@ kasan_map_shadow_internal(vm_offset_t address, vm_size_t size, bool is_zero, boo
                uint64_t *base = cpu_tte;
                uint64_t *pte;
 
-#if !__ARM64_TWO_LEVEL_PMAP__
                /* lookup L1 entry */
                pte = base + ((shadow_base & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
                if (*pte & ARM_TTE_VALID) {
@@ -134,7 +132,6 @@ kasan_map_shadow_internal(vm_offset_t address, vm_size_t size, bool is_zero, boo
                        *pte = ((uint64_t)alloc_zero_page() & ARM_TTE_TABLE_MASK) | ARM_TTE_VALID | ARM_TTE_TYPE_TABLE;
                }
                base = (uint64_t *)phystokv(*pte & ARM_TTE_TABLE_MASK);
-#endif
 
                /* lookup L2 entry */
                pte = base + ((shadow_base & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
@@ -204,7 +201,6 @@ kasan_map_shadow_early(vm_offset_t address, vm_size_t size, bool is_zero)
 
                uint64_t *base = (uint64_t *)bootstrap_pgtable_phys;
 
-#if !__ARM64_TWO_LEVEL_PMAP__
                /* lookup L1 entry */
                pte = base + ((virt_shadow_target & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
                if (*pte & ARM_TTE_VALID) {
@@ -216,7 +212,6 @@ kasan_map_shadow_early(vm_offset_t address, vm_size_t size, bool is_zero)
                        *pte = ((uint64_t)pg & ARM_TTE_TABLE_MASK) | ARM_TTE_VALID | ARM_TTE_TYPE_TABLE;
                }
                base = (uint64_t *)(*pte & ARM_TTE_TABLE_MASK);
-#endif
 
                /* lookup L2 entry */
                pte = base + ((virt_shadow_target & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
@@ -332,14 +327,12 @@ kasan_is_shadow_mapped(uintptr_t shadowp)
        assert(shadowp >= KASAN_SHADOW_MIN);
        assert(shadowp < KASAN_SHADOW_MAX);
 
-#if !__ARM64_TWO_LEVEL_PMAP__
        /* lookup L1 entry */
        pte = base + ((shadowp & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
        if (!(*pte & ARM_TTE_VALID)) {
                return false;
        }
        base = (uint64_t *)phystokv(*pte & ARM_TTE_TABLE_MASK);
-#endif
 
        /* lookup L2 entry */
        pte = base + ((shadowp & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
index 48ce86d46c0dccb2aa04344f649e31c124c810ac..38df385ad6bfda4d83c808d606fe3b77fb9665e4 100644 (file)
@@ -25,6 +25,9 @@ src:./san/kasan-x86_64.c
 src:./san/kasan-memintrinsics.c
 src:./san/kasan_dynamic_blacklist.c
 
+# Exclude other sanitizers
+src:./san/ksancov.c
+
 # Exclude dtrace function that does weird stack manipulations
 fun:fbt_perfCallback
 
index 517bce143eb906abecae06c5903e99e94dc2b4ec..69a8dc15fd2b1ec36e80d437e272149453dc66d0 100644 (file)
@@ -66,7 +66,6 @@ src:./osfmk/i386/pmap_x86_common.c
 src:./osfmk/i386/pmCPU.c
 src:./osfmk/i386/startup64.c
 src:./osfmk/i386/lapic_native.c
-src:./osfmk/i386/fpu.c
 src:./osfmk/vm/vm_compressor.c
 fun:doublemap_init
 fun:getsegbynamefromheader
index 9ec9433dff3fbfc9ed3e9ab5b3e026ef0df7aa02..cec75e459be6af135216980993f5c1efcfe8d00a 100644 (file)
@@ -43,6 +43,7 @@
 #include <mach/mach_types.h>
 #include <mach/vm_param.h>
 #include <mach/machine/vm_param.h>
+#include <mach/sdt.h>
 #include <libkern/libkern.h>
 #include <libkern/OSAtomic.h>
 #include <libkern/kernel_mach_header.h>
@@ -66,9 +67,11 @@ vm_offset_t kernel_vtop;
 
 static unsigned kasan_enabled;
 static unsigned quarantine_enabled;
-static unsigned enabled_checks = TYPE_ALL; /* bitmask of enabled checks */
-static unsigned report_ignored;            /* issue non-fatal report for disabled/blacklisted checks */
-static unsigned free_yield = 0;            /* ms yield after each free */
+static unsigned enabled_checks = TYPE_ALL & ~TYPE_LEAK; /* bitmask of enabled checks */
+static unsigned report_ignored;                         /* issue non-fatal report for disabled/blacklisted checks */
+static unsigned free_yield = 0;                         /* ms yield after each free */
+static unsigned leak_threshold = 3;                     /* threshold for uninitialized memory leak detection */
+static unsigned leak_fatal_threshold = 0;               /* threshold for treating leaks as fatal errors (0 means never) */
 
 /* forward decls */
 static void kasan_crash_report(uptr p, uptr width, access_t access, violation_t reason);
@@ -91,6 +94,8 @@ extern vm_size_t ml_stack_size(void);
 static const size_t BACKTRACE_BITS       = 4;
 static const size_t BACKTRACE_MAXFRAMES  = (1UL << BACKTRACE_BITS) - 1;
 
+static vm_size_t kasan_alloc_retrieve_bt(vm_address_t addr, uintptr_t frames[static BACKTRACE_MAXFRAMES]);
+
 decl_simple_lock_data(, kasan_vm_lock);
 static thread_t kasan_lock_holder;
 
@@ -317,6 +322,67 @@ kasan_check_shadow(vm_address_t base, vm_size_t sz, uint8_t shadow)
        return true;
 }
 
+static void
+kasan_report_leak(vm_address_t base, vm_size_t sz, vm_offset_t offset, vm_size_t leak_sz)
+{
+       if (leak_fatal_threshold > leak_threshold && leak_sz >= leak_fatal_threshold) {
+               kasan_violation(base + offset, leak_sz, TYPE_LEAK, REASON_UNINITIALIZED);
+       }
+
+       char string_rep[BACKTRACE_MAXFRAMES * 20] = {};
+       vm_offset_t stack_base = dtrace_get_kernel_stack(current_thread());
+       bool is_stack = (base >= stack_base && base < (stack_base + kernel_stack_size));
+
+       if (!is_stack) {
+               uintptr_t alloc_bt[BACKTRACE_MAXFRAMES] = {};
+               vm_size_t num_frames = 0;
+               size_t l = 0;
+               num_frames = kasan_alloc_retrieve_bt(base, alloc_bt);
+               for (vm_size_t i = 0; i < num_frames; i++) {
+                       l += snprintf(string_rep + l, sizeof(string_rep) - l, " %lx", alloc_bt[i]);
+               }
+       }
+
+       DTRACE_KASAN5(leak_detected,
+                                 vm_address_t, base,
+                                 vm_size_t, sz,
+                                 vm_offset_t, offset,
+                                 vm_size_t, leak_sz,
+                                 char *, string_rep);
+}
+
+/*
+ * Check for possible uninitialized memory contained in [base, base+sz).
+ */
+void
+kasan_check_uninitialized(vm_address_t base, vm_size_t sz)
+{
+       if (!(enabled_checks & TYPE_LEAK) || sz < leak_threshold) {
+               return;
+       }
+
+       vm_address_t cur = base;
+       vm_address_t end = base + sz;
+       vm_size_t count = 0;
+       vm_size_t max_count = 0;
+       vm_address_t leak_offset = 0;
+       uint8_t byte = 0;
+
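+       /*
+        * Find the longest run of bytes still holding the
+        * KASAN_UNINITIALIZED_HEAP pattern written at allocation time.
+        */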
+       while (cur < end) {
+               byte = *(uint8_t *)cur;
+               count = (byte == KASAN_UNINITIALIZED_HEAP) ? (count + 1) : 0;
+               if (count > max_count) {
+                       max_count = count;
+                       leak_offset = cur - (count - 1) - base;
+               }
+               cur += 1;
+       }
+
+       if (max_count >= leak_threshold) {
+               kasan_report_leak(base, sz, leak_offset, max_count);
+       }
+}
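+
+/*
+ * A natural call site (an assumption; callers are outside this hunk) is a
+ * copy-out path, just before kernel bytes become visible to user space:
+ *
+ *     kasan_check_uninitialized((vm_address_t)kaddr, nbytes);
+ *     error = copyout(kaddr, uaddr, nbytes);
+ */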
+
 /*
  *
  * KASAN violation reporting
@@ -332,6 +398,8 @@ access_str(access_t type)
                return "store to";
        } else if (type & TYPE_FREE) {
                return "free of";
+       } else if (type & TYPE_LEAK) {
+               return "leak from";
        } else {
                return "access of";
        }
@@ -468,7 +536,8 @@ kasan_log_report(uptr p, uptr width, access_t access, violation_t reason)
         * print a backtrace
         */
 
-       nframes = backtrace_frame(bt, nframes, __builtin_frame_address(0)); /* ignore current frame */
+       nframes = backtrace_frame(bt, nframes, __builtin_frame_address(0),
+           NULL); /* ignore current frame */
 
        buf[0] = '\0';
        l += snprintf(buf+l, len-l, "Backtrace: ");
@@ -483,8 +552,8 @@ kasan_log_report(uptr p, uptr width, access_t access, violation_t reason)
 #define REPORT_DECLARE(n) \
        void OS_NORETURN __asan_report_load##n(uptr p)  { kasan_crash_report(p, n, TYPE_LOAD,  0); } \
        void OS_NORETURN __asan_report_store##n(uptr p) { kasan_crash_report(p, n, TYPE_STORE, 0); } \
-       void UNSUPPORTED_API(__asan_report_exp_load##n, uptr a, int32_t b); \
-       void UNSUPPORTED_API(__asan_report_exp_store##n, uptr a, int32_t b);
+       void OS_NORETURN UNSUPPORTED_API(__asan_report_exp_load##n, uptr a, int32_t b); \
+       void OS_NORETURN UNSUPPORTED_API(__asan_report_exp_store##n, uptr a, int32_t b);
 
 REPORT_DECLARE(1)
 REPORT_DECLARE(2)
@@ -731,12 +800,23 @@ kasan_init(void)
                if (arg & KASAN_ARGS_NOPOISON_GLOBAL) {
                        enabled_checks &= ~TYPE_POISON_GLOBAL;
                }
+               if (arg & KASAN_ARGS_CHECK_LEAKS) {
+                       enabled_checks |= TYPE_LEAK;
+               }
        }
 
        if (PE_parse_boot_argn("kasan.free_yield_ms", &arg, sizeof(arg))) {
                free_yield = arg;
        }
 
+       if (PE_parse_boot_argn("kasan.leak_threshold", &arg, sizeof(arg))) {
+               leak_threshold = arg;
+       }
+
+       if (PE_parse_boot_argn("kasan.leak_fatal_threshold", &arg, sizeof(arg))) {
+               leak_fatal_threshold = arg;
+       }
+
        /* kasan.bl boot-arg handled in kasan_init_dybl() */
 
        quarantine_enabled = 1;
@@ -870,7 +950,7 @@ kasan_alloc_bt(uint32_t *ptr, vm_size_t sz, vm_size_t skip)
 
        if (frames > 0) {
                frames = min(frames + skip, BACKTRACE_MAXFRAMES);
-               frames = backtrace(bt, frames);
+               frames = backtrace(bt, frames, NULL);
 
                while (frames > sz && skip > 0) {
                        bt++;
@@ -906,6 +986,40 @@ kasan_alloc_crc(vm_offset_t addr)
        return crc;
 }
 
+static vm_size_t
+kasan_alloc_retrieve_bt(vm_address_t addr, uintptr_t frames[static BACKTRACE_MAXFRAMES])
+{
+       vm_size_t num_frames = 0;
+       uptr shadow = (uptr)SHADOW_FOR_ADDRESS(addr);
+       uptr max_search = shadow - 4096;
+       vm_address_t alloc_base = 0;
+       size_t fsize = 0;
+
+       /* walk the shadow backwards to find the allocation base */
+       while (shadow >= max_search) {
+               if (*(uint8_t *)shadow == ASAN_HEAP_LEFT_RZ) {
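+                       /* one shadow byte spans 8 bytes: step past the last left-redzone granule to the user base */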
+                       alloc_base = ADDRESS_FOR_SHADOW(shadow) + 8;
+                       break;
+               }
+               shadow--;
+       }
+
+       if (alloc_base) {
+               struct kasan_alloc_header *header = header_for_user_addr(alloc_base);
+               if (magic_for_addr(alloc_base, LIVE_XOR) == header->magic) {
+                       struct kasan_alloc_footer *footer = footer_for_user_addr(alloc_base, &fsize);
+                       if ((fsize/sizeof(footer->backtrace[0])) >= header->frames) {
+                               num_frames = header->frames;
+                               for (size_t i = 0; i < num_frames; i++) {
+                                       frames[i] = footer->backtrace[i] + vm_kernel_slid_base;
+                               }
+                       }
+               }
+       }
+
+       return num_frames;
+}
+
 /*
  * addr: base address of full allocation (including redzones)
  * size: total size of allocation (include redzones)
@@ -930,6 +1044,10 @@ kasan_alloc(vm_offset_t addr, vm_size_t size, vm_size_t req, vm_size_t leftrz)
 
        addr += leftrz;
 
+       if (enabled_checks & TYPE_LEAK) {
+               __nosan_memset((void *)addr, KASAN_UNINITIALIZED_HEAP, req);
+       }
+
        /* stash the allocation sizes in the left redzone */
        struct kasan_alloc_header *h = header_for_user_addr(addr);
        h->magic = magic_for_addr(addr, LIVE_XOR);
@@ -1206,11 +1324,32 @@ __asan_poison_cxx_array_cookie(uptr p)
        *shadow = ASAN_ARRAY_COOKIE;
 }
 
+/*
+ * Unpoison the C++ array cookie (if it exists). We don't know exactly where it
+ * lives relative to the start of the buffer, but it's always the word immediately
+ * before the start of the array data, so for naturally-aligned objects we need to
+ * search at most 2 shadow bytes.
+ */
+void
+kasan_unpoison_cxx_array_cookie(void *ptr)
+{
+       uint8_t *shadow = SHADOW_FOR_ADDRESS((uptr)ptr);
+       for (size_t i = 0; i < 2; i++) {
+               if (shadow[i] == ASAN_ARRAY_COOKIE) {
+                       shadow[i] = ASAN_VALID;
+                       return;
+               } else if (shadow[i] != ASAN_VALID) {
+                       /* must have seen the cookie by now */
+                       return;
+               }
+       }
+}
+
 #define ACCESS_CHECK_DECLARE(type, sz, access) \
        void __asan_##type##sz(uptr addr) { \
                kasan_check_range((const void *)addr, sz, access); \
        } \
-       void UNSUPPORTED_API(__asan_exp_##type##sz, uptr a, int32_t b);
+       void OS_NORETURN UNSUPPORTED_API(__asan_exp_##type##sz, uptr a, int32_t b);
 
 ACCESS_CHECK_DECLARE(load,  1,  TYPE_LOAD);
 ACCESS_CHECK_DECLARE(load,  2,  TYPE_LOAD);
@@ -1314,17 +1453,18 @@ UNUSED_ABI(__asan_version_mismatch_check_apple_900, void);
 UNUSED_ABI(__asan_version_mismatch_check_apple_902, void);
 UNUSED_ABI(__asan_version_mismatch_check_apple_1000, void);
 UNUSED_ABI(__asan_version_mismatch_check_apple_1001, void);
+UNUSED_ABI(__asan_version_mismatch_check_apple_clang_1100, void);
 
-void UNSUPPORTED_API(__asan_init_v5, void);
-void UNSUPPORTED_API(__asan_register_globals, uptr a, uptr b);
-void UNSUPPORTED_API(__asan_unregister_globals, uptr a, uptr b);
-void UNSUPPORTED_API(__asan_register_elf_globals, uptr a, uptr b, uptr c);
-void UNSUPPORTED_API(__asan_unregister_elf_globals, uptr a, uptr b, uptr c);
+void OS_NORETURN UNSUPPORTED_API(__asan_init_v5, void);
+void OS_NORETURN UNSUPPORTED_API(__asan_register_globals, uptr a, uptr b);
+void OS_NORETURN UNSUPPORTED_API(__asan_unregister_globals, uptr a, uptr b);
+void OS_NORETURN UNSUPPORTED_API(__asan_register_elf_globals, uptr a, uptr b, uptr c);
+void OS_NORETURN UNSUPPORTED_API(__asan_unregister_elf_globals, uptr a, uptr b, uptr c);
 
-void UNSUPPORTED_API(__asan_exp_loadN, uptr addr, size_t sz, int32_t e);
-void UNSUPPORTED_API(__asan_exp_storeN, uptr addr, size_t sz, int32_t e);
-void UNSUPPORTED_API(__asan_report_exp_load_n, uptr addr, unsigned long b, int32_t c);
-void UNSUPPORTED_API(__asan_report_exp_store_n, uptr addr, unsigned long b, int32_t c);
+void OS_NORETURN UNSUPPORTED_API(__asan_exp_loadN, uptr addr, size_t sz, int32_t e);
+void OS_NORETURN UNSUPPORTED_API(__asan_exp_storeN, uptr addr, size_t sz, int32_t e);
+void OS_NORETURN UNSUPPORTED_API(__asan_report_exp_load_n, uptr addr, unsigned long b, int32_t c);
+void OS_NORETURN UNSUPPORTED_API(__asan_report_exp_store_n, uptr addr, unsigned long b, int32_t c);
 
 /*
  *
@@ -1370,6 +1510,8 @@ SYSCTL_UINT(_kern_kasan, OID_AUTO, checks, CTLFLAG_RW, &enabled_checks, 0, "");
 SYSCTL_UINT(_kern_kasan, OID_AUTO, quarantine, CTLFLAG_RW, &quarantine_enabled, 0, "");
 SYSCTL_UINT(_kern_kasan, OID_AUTO, report_ignored, CTLFLAG_RW, &report_ignored, 0, "");
 SYSCTL_UINT(_kern_kasan, OID_AUTO, free_yield_ms, CTLFLAG_RW, &free_yield, 0, "");
+SYSCTL_UINT(_kern_kasan, OID_AUTO, leak_threshold, CTLFLAG_RW, &leak_threshold, 0, "");
+SYSCTL_UINT(_kern_kasan, OID_AUTO, leak_fatal_threshold, CTLFLAG_RW, &leak_fatal_threshold, 0, "");
 SYSCTL_UINT(_kern_kasan, OID_AUTO, memused, CTLFLAG_RD, &shadow_pages_used, 0, "");
 SYSCTL_UINT(_kern_kasan, OID_AUTO, memtotal, CTLFLAG_RD, &shadow_pages_total, 0, "");
 SYSCTL_UINT(_kern_kasan, OID_AUTO, kexts, CTLFLAG_RD, &kexts_loaded, 0, "");
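Both new nodes are ordinary read-write integers under kern.kasan, so they can be inspected and tuned at runtime like the existing ones. A small user-space sketch (only meaningful on a KASAN-enabled kernel; the 4096 value is an arbitrary example, and writing requires root):

    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            unsigned int val = 0;
            size_t len = sizeof(val);

            /* read the current threshold */
            if (sysctlbyname("kern.kasan.leak_threshold", &val, &len, NULL, 0) == 0) {
                    printf("leak_threshold = %u\n", val);
            }

            unsigned int newval = 4096; /* arbitrary example value */
            if (sysctlbyname("kern.kasan.leak_threshold", NULL, NULL, &newval, sizeof(newval)) != 0) {
                    perror("sysctlbyname");
            }
            return 0;
    }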
index 308efa2e90d22afef7e40a90b5df13bb2abbad54..fcfc444629914a24dfd0030f26a7670b94f6c0f2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -103,12 +103,16 @@ void kasan_notify_address_nopoison(vm_offset_t address, vm_size_t size);
 void kasan_unpoison_stack(vm_offset_t stack, vm_size_t size);
 void kasan_unpoison_curstack(bool whole_stack);
 bool kasan_check_shadow(vm_address_t base, vm_size_t sz, uint8_t shadow);
+void kasan_unpoison_cxx_array_cookie(void *ptr);
 
 void kasan_fakestack_drop(thread_t thread); /* mark all fakestack entries for thread as unused */
 void kasan_fakestack_gc(thread_t thread);   /* free and poison all unused fakestack objects for thread */
 void kasan_fakestack_suspend(void);
 void kasan_fakestack_resume(void);
 
+/* check for uninitialized memory */
+void kasan_check_uninitialized(vm_address_t base, vm_size_t sz);
+
 struct kasan_test;
 void __kasan_runtests(struct kasan_test *, int numtests);
 
@@ -172,7 +176,7 @@ extern const uintptr_t __asan_shadow_memory_dynamic_address;
        ret func ## 2(__VA_ARGS__); \
        ret func ## 4(__VA_ARGS__); \
        ret func ## 8(__VA_ARGS__); \
-       ret func ## 16(__VA_ARGS__); \
+       ret func ## 16(__VA_ARGS__)
 
 __BEGIN_DECLS
 
index 04d0973dca80b5564321d5eeaeb2d216743af241..cb661abd0a56baa10bab246475ee9a04d2cf87d5 100644 (file)
@@ -363,7 +363,8 @@ kasan_is_blacklisted(access_t type)
                return false;
        }
 
-       nframes = backtrace_frame(bt, MAX_FRAMES, __builtin_frame_address(0));
+       nframes = backtrace_frame(bt, MAX_FRAMES, __builtin_frame_address(0),
+           NULL);
        boolean_t flag;
 
        if (nframes >= 1) {
index 7a920961ebfd7035f8f6ed8f6515887a76d68d45..f0565a0043c5e45b9d965ebc4b77c72db39a8229 100644 (file)
@@ -71,6 +71,10 @@ typedef uintptr_t uptr;
 #define KASAN_ARGS_NODYCHECKS      0x0100U
 #define KASAN_ARGS_NOPOISON_HEAP   0x0200U
 #define KASAN_ARGS_NOPOISON_GLOBAL 0x0400U
+#define KASAN_ARGS_CHECK_LEAKS     0x0800U
+
+/* uninitialized memory detection */
+#define KASAN_UNINITIALIZED_HEAP   0xbe
 
 #ifndef KASAN
 # error KASAN undefined
@@ -110,6 +114,7 @@ enum __attribute__((flag_enum)) kasan_access_types {
        TYPE_POISON_HEAP   = BIT(14),
        /* no TYPE_POISON_STACK, because the runtime does not control stack poisoning */
        TYPE_TEST          = BIT(15),
+       TYPE_LEAK          = BIT(16),
 
        /* masks */
        TYPE_MEM     = TYPE_MEMR | TYPE_MEMW,            /* memory intrinsics */
@@ -130,6 +135,7 @@ enum kasan_violation_types {
        REASON_INVALID_SIZE =   2, /* free size did not match alloc size */
        REASON_MOD_AFTER_FREE = 3, /* object modified after free */
        REASON_MOD_OOB =        4, /* out of bounds modification of object */
+       REASON_UNINITIALIZED =  5, /* leak of uninitialized kernel memory */
 };
 
 typedef enum kasan_access_types access_t;
diff --git a/san/ksancov.c b/san/ksancov.c
new file mode 100644 (file)
index 0000000..a8d7c81
--- /dev/null
@@ -0,0 +1,769 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <string.h>
+#include <stdbool.h>
+#include <stdatomic.h>
+
+#include <kern/assert.h>
+#include <kern/cpu_data.h>
+#include <kern/locks.h>
+#include <kern/debug.h>
+#include <kern/kalloc.h>
+#include <kern/zalloc.h>
+#include <kern/task.h>
+#include <kern/thread.h>
+
+#include <vm/vm_kern.h>
+#include <vm/vm_protos.h>
+
+#include <mach/mach_vm.h>
+#include <mach/mach_types.h>
+#include <mach/mach_port.h>
+#include <mach/vm_map.h>
+#include <mach/vm_param.h>
+#include <mach/machine/vm_param.h>
+#include <machine/atomic.h>
+
+#include <sys/stat.h> /* dev_t */
+#include <miscfs/devfs/devfs.h> /* must come after sys/stat.h */
+#include <sys/conf.h> /* must come after sys/stat.h */
+
+#include <libkern/libkern.h>
+#include <libkern/OSAtomic.h>
+#include <os/overflow.h>
+
+#include <san/ksancov.h>
+
+/* header mess... */
+struct uthread;
+typedef struct uthread * uthread_t;
+
+#include <sys/sysproto.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+
+#define USE_PC_TABLE 0
+#define KSANCOV_MAX_DEV 64
+
+extern boolean_t ml_at_interrupt_context(void);
+extern boolean_t ml_get_interrupts_enabled(void);
+
+static int ksancov_detach(dev_t dev);
+
+static int dev_major;
+static size_t nedges = 0;
+static uint32_t __unused npcs = 0;
+
+static _Atomic unsigned active_devs;
+
+enum {
+       KS_MODE_NONE,
+       KS_MODE_TRACE,
+       KS_MODE_COUNTERS,
+       KS_MODE_MAX
+};
+
+struct ksancov_dev {
+       unsigned mode;
+
+       union {
+               struct ksancov_trace *trace;
+               struct ksancov_counters *counters;
+       };
+       size_t sz;     /* size of allocated trace/counters buffer */
+
+       size_t maxpcs;
+
+       thread_t thread;
+       dev_t dev;
+};
+
+/* array of devices indexed by devnode minor */
+static struct ksancov_dev *ksancov_devs[KSANCOV_MAX_DEV];
+
+static struct ksancov_edgemap *ksancov_edgemap;
+
+static inline struct ksancov_dev *
+get_dev(dev_t dev)
+{
+       int mn = minor(dev);
+       return ksancov_devs[mn];
+}
+
+void
+__sanitizer_cov_trace_pc_indirect(void * __unused callee)
+{
+       return;
+}
+
+#define GUARD_SEEN     (uint32_t)0x80000000
+#define GUARD_IDX_MASK (uint32_t)0x0fffffff
+
+static inline void __attribute__((always_inline))
+trace_pc_guard(uint32_t *guardp, void *caller)
+{
+       /* record the pc for this guard */
+       if (guardp) {
+               uint32_t gd = *guardp;
+               if (__improbable(gd && !(gd & GUARD_SEEN) && ksancov_edgemap)) {
+                       size_t idx = gd & GUARD_IDX_MASK;
+                       if (idx < ksancov_edgemap->nedges) {
+                               ksancov_edgemap->addrs[idx] = (uint32_t)(VM_KERNEL_UNSLIDE(caller) - VM_MIN_KERNEL_ADDRESS - 1);
+                               *guardp |= GUARD_SEEN;
+                       }
+               }
+       }
+
+       if (__probable(os_atomic_load(&active_devs, relaxed) == 0)) {
+               /* early exit when nothing is active */
+               return;
+       }
+
+       if (ml_at_interrupt_context()) {
+               return;
+       }
+
+       uint32_t pc = (uint32_t)(VM_KERNEL_UNSLIDE(caller) - VM_MIN_KERNEL_ADDRESS - 1);
+
+       thread_t th = current_thread();
+       if (__improbable(th == THREAD_NULL)) {
+               return;
+       }
+
+       struct ksancov_dev *dev = *(struct ksancov_dev **)__sanitizer_get_thread_data(th);
+       if (__probable(dev == NULL)) {
+               return;
+       }
+
+       if (dev->mode == KS_MODE_TRACE) {
+               struct ksancov_trace *trace = dev->trace;
+               if (os_atomic_load(&trace->enabled, relaxed) == 0) {
+                       return;
+               }
+
+               if (os_atomic_load(&trace->head, relaxed) >= dev->maxpcs) {
+                       return; /* overflow */
+               }
+
+               uint32_t idx = os_atomic_inc_orig(&trace->head, relaxed);
+               if (__improbable(idx >= dev->maxpcs)) {
+                       return;
+               }
+
+               trace->pcs[idx] = pc;
+       } else {
+               size_t idx = *guardp & GUARD_IDX_MASK;
+
+               struct ksancov_counters *counters = dev->counters;
+               if (os_atomic_load(&counters->enabled, relaxed) == 0) {
+                       return;
+               }
+
+               /* saturating 8bit add */
+               if (counters->hits[idx] < KSANCOV_MAX_HITS) {
+                       counters->hits[idx]++;
+               }
+       }
+}
+
+void __attribute__((noinline))
+__sanitizer_cov_trace_pc(void)
+{
+       trace_pc_guard(NULL, __builtin_return_address(0));
+}
+
+void __attribute__((noinline))
+__sanitizer_cov_trace_pc_guard(uint32_t *guardp)
+{
+       trace_pc_guard(guardp, __builtin_return_address(0));
+}
+
+void
+__sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop)
+{
+       /* assign a unique number to each guard */
+       for (; start != stop; start++) {
+               if (*start == 0) {
+                       if (nedges < KSANCOV_MAX_EDGES) {
+                               *start = ++nedges;
+                       }
+               }
+       }
+}
+
+void
+__sanitizer_cov_pcs_init(uintptr_t *start, uintptr_t *stop)
+{
+#if USE_PC_TABLE
+       static const uintptr_t pc_table_seen_flag = 0x100;
+
+       for (; start < stop; start += 2) {
+               uintptr_t pc = start[0];
+               uintptr_t flags = start[1];
+
+               /*
+                * This function gets called multiple times on the same range, so mark the
+                * ones we've seen using unused bits in the flags field.
+                */
+               if (flags & pc_table_seen_flag) {
+                       continue;
+               }
+
+               start[1] |= pc_table_seen_flag;
+               assert(npcs < KSANCOV_MAX_EDGES - 1);
+               edge_addrs[++npcs] = pc;
+       }
+#else
+       (void)start;
+       (void)stop;
+#endif
+}
+
+static void *
+ksancov_do_map(uintptr_t base, size_t sz, vm_prot_t prot)
+{
+       kern_return_t kr;
+       mach_port_t mem_entry = MACH_PORT_NULL;
+       mach_vm_address_t user_addr = 0;
+       memory_object_size_t size = sz;
+
+       kr = mach_make_memory_entry_64(kernel_map,
+           &size,
+           (mach_vm_offset_t)base,
+           MAP_MEM_VM_SHARE | prot,
+           &mem_entry,
+           MACH_PORT_NULL);
+       if (kr != KERN_SUCCESS) {
+               return NULL;
+       }
+
+       kr = mach_vm_map_kernel(get_task_map(current_task()),
+           &user_addr,
+           size,
+           0,
+           VM_FLAGS_ANYWHERE,
+           VM_MAP_KERNEL_FLAGS_NONE,
+           VM_KERN_MEMORY_NONE,
+           mem_entry,
+           0,
+           FALSE,
+           prot,
+           prot,
+           VM_INHERIT_SHARE);
+
+       /*
+        * At this point, either vm_map() has taken a reference on the memory entry
+        * and we can release our local reference, or the map failed and the entry
+        * needs to be freed.
+        */
+       mach_memory_entry_port_release(mem_entry);
+
+       if (kr != KERN_SUCCESS) {
+               return NULL;
+       }
+
+       return (void *)user_addr;
+}
+
+/*
+ * map the sancov buffer into the current process
+ */
+static int
+ksancov_map(dev_t dev, void **bufp, size_t *sizep)
+{
+       struct ksancov_dev *d = get_dev(dev);
+       if (!d) {
+               return EINVAL;
+       }
+
+       uintptr_t addr;
+       size_t size = d->sz;
+
+       if (d->mode == KS_MODE_TRACE) {
+               if (!d->trace) {
+                       return EINVAL;
+               }
+               addr = (uintptr_t)d->trace;
+       } else if (d->mode == KS_MODE_COUNTERS) {
+               if (!d->counters) {
+                       return EINVAL;
+               }
+               addr = (uintptr_t)d->counters;
+       } else {
+               return EINVAL; /* not configured */
+       }
+
+       void *buf = ksancov_do_map(addr, size, VM_PROT_READ | VM_PROT_WRITE);
+       if (buf == NULL) {
+               return ENOMEM;
+       }
+
+       *bufp = buf;
+       *sizep = size;
+       return 0;
+}
+
+/*
+ * map the edge -> pc mapping as read-only
+ */
+static int
+ksancov_map_edgemap(dev_t dev, void **bufp, size_t *sizep)
+{
+       struct ksancov_dev *d = get_dev(dev);
+       if (!d) {
+               return EINVAL;
+       }
+
+       uintptr_t addr = (uintptr_t)ksancov_edgemap;
+       size_t size = sizeof(struct ksancov_edgemap) + ksancov_edgemap->nedges * sizeof(uint32_t);
+
+       void *buf = ksancov_do_map(addr, size, VM_PROT_READ);
+       if (buf == NULL) {
+               return ENOMEM;
+       }
+
+       *bufp = buf;
+       *sizep = size;
+       return 0;
+}
+
+
+/*
+ * Device node management
+ */
+
+static int
+ksancov_open(dev_t dev, int flags, int devtype, proc_t p)
+{
+#pragma unused(flags,devtype,p)
+       if (minor(dev) >= KSANCOV_MAX_DEV) {
+               return EBUSY;
+       }
+
+       /* allocate a device entry */
+       struct ksancov_dev *d = kalloc_tag(sizeof(struct ksancov_dev), VM_KERN_MEMORY_DIAG);
+       if (!d) {
+               return ENOMEM;
+       }
+
+       d->mode = KS_MODE_NONE;
+       d->trace = NULL;
+       d->maxpcs = 1024U * 64; /* default to 256k buffer => 64k pcs */
+       d->dev = dev;
+       d->thread = THREAD_NULL;
+
+       ksancov_devs[minor(dev)] = d;
+
+       return 0;
+}
+
+static int
+ksancov_trace_alloc(dev_t dev, size_t maxpcs)
+{
+       struct ksancov_dev *d = get_dev(dev);
+       if (!d) {
+               return EINVAL;
+       }
+
+       if (d->mode != KS_MODE_NONE) {
+               return EBUSY; /* trace/counters already created */
+       }
+       assert(d->trace == NULL);
+
+       uintptr_t buf;
+       size_t sz;
+       if (os_mul_and_add_overflow(maxpcs, sizeof(uint32_t), sizeof(struct ksancov_trace), &sz)) {
+               return EINVAL;
+       }
+
+       /* allocate the shared memory buffer */
+       kern_return_t kr = kmem_alloc_flags(kernel_map, &buf, sz, VM_KERN_MEMORY_DIAG, KMA_ZERO);
+       if (kr != KERN_SUCCESS) {
+               return ENOMEM;
+       }
+
+       struct ksancov_trace *trace = (struct ksancov_trace *)buf;
+       trace->magic = KSANCOV_TRACE_MAGIC;
+       trace->offset = VM_MIN_KERNEL_ADDRESS;
+       trace->head = 0;
+       trace->enabled = 0;
+       trace->maxpcs = maxpcs;
+
+       d->trace = trace;
+       d->sz = sz;
+       d->maxpcs = maxpcs;
+       d->mode = KS_MODE_TRACE;
+
+       return 0;
+}
+
+static int
+ksancov_counters_alloc(dev_t dev)
+{
+       struct ksancov_dev *d = get_dev(dev);
+       if (!d) {
+               return EINVAL;
+       }
+
+       if (d->mode != KS_MODE_NONE) {
+               return EBUSY; /* trace/counters already created */
+       }
+       assert(d->counters == NULL);
+
+       uintptr_t buf;
+       size_t sz = sizeof(struct ksancov_counters) + ksancov_edgemap->nedges * sizeof(uint8_t);
+
+       /* allocate the shared memory buffer */
+       kern_return_t kr = kmem_alloc_flags(kernel_map, &buf, sz, VM_KERN_MEMORY_DIAG, KMA_ZERO);
+       if (kr != KERN_SUCCESS) {
+               return ENOMEM;
+       }
+
+       struct ksancov_counters *counters = (struct ksancov_counters *)buf;
+       counters->magic = KSANCOV_COUNTERS_MAGIC;
+       counters->nedges = ksancov_edgemap->nedges;
+       counters->enabled = 0;
+
+       d->counters = counters;
+       d->sz = sz;
+       d->mode = KS_MODE_COUNTERS;
+
+       return 0;
+}
+
+/*
+ * attach a thread to a ksancov dev instance
+ */
+static int
+ksancov_attach(dev_t dev, thread_t th)
+{
+       struct ksancov_dev *d = get_dev(dev);
+       if (!d) {
+               return EINVAL;
+       }
+
+       if (d->thread != THREAD_NULL) {
+               int ret = ksancov_detach(dev);
+               if (ret) {
+                       return ret;
+               }
+       }
+
+       if (th != current_thread()) {
+               /* can only attach to self presently */
+               return EINVAL;
+       }
+
+       struct ksancov_dev **devp = (void *)__sanitizer_get_thread_data(th);
+       if (*devp) {
+               return EBUSY; /* one dev per thread */
+       }
+
+       d->thread = th;
+       thread_reference(d->thread);
+
+       os_atomic_store(devp, d, relaxed);
+       os_atomic_add(&active_devs, 1, relaxed);
+
+       return 0;
+}
+
+extern void
+thread_wait(
+       thread_t        thread,
+       boolean_t       until_not_runnable);
+
+
+/*
+ * disconnect thread from ksancov dev
+ */
+static int
+ksancov_detach(dev_t dev)
+{
+       struct ksancov_dev *d = get_dev(dev);
+       if (!d) {
+               return EINVAL;
+       }
+
+       if (d->thread == THREAD_NULL) {
+               /* no thread attached */
+               return 0;
+       }
+
+       /* disconnect dev from thread */
+       struct ksancov_dev **devp = (void *)__sanitizer_get_thread_data(d->thread);
+       if (*devp != NULL) {
+               assert(*devp == d);
+               os_atomic_store(devp, NULL, relaxed);
+       }
+
+       if (d->thread != current_thread()) {
+               /* wait until it's safe to yank */
+               thread_wait(d->thread, TRUE);
+       }
+
+       /* drop our thread reference */
+       thread_deallocate(d->thread);
+       d->thread = THREAD_NULL;
+
+       return 0;
+}
+
+static int
+ksancov_close(dev_t dev, int flags, int devtype, proc_t p)
+{
+#pragma unused(flags,devtype,p)
+       struct ksancov_dev *d = get_dev(dev);
+       if (!d) {
+               return EINVAL;
+       }
+
+       if (d->mode == KS_MODE_TRACE) {
+               struct ksancov_trace *trace = d->trace;
+               if (trace) {
+                       /* trace allocated - delete it */
+
+                       os_atomic_sub(&active_devs, 1, relaxed);
+                       os_atomic_store(&trace->enabled, 0, relaxed); /* stop tracing */
+
+                       ksancov_detach(dev);
+
+                       /* free trace */
+                       kmem_free(kernel_map, (uintptr_t)d->trace, d->sz);
+                       d->trace = NULL;
+                       d->sz = 0;
+               }
+       } else if (d->mode == KS_MODE_COUNTERS) {
+               struct ksancov_counters *counters = d->counters;
+               if (counters) {
+                       os_atomic_sub(&active_devs, 1, relaxed);
+                       os_atomic_store(&counters->enabled, 0, relaxed); /* stop tracing */
+
+                       ksancov_detach(dev);
+
+                       /* free counters */
+                       kmem_free(kernel_map, (uintptr_t)d->counters, d->sz);
+                       d->counters = NULL;
+                       d->sz = 0;
+               }
+       }
+
+       ksancov_devs[minor(dev)] = NULL; /* dev no longer discoverable */
+
+       /* free the ksancov device instance */
+       kfree(d, sizeof(struct ksancov_dev));
+
+       return 0;
+}
+
+static void
+ksancov_testpanic(volatile uint64_t guess)
+{
+       const uint64_t tgt = 0xf85de3b12891c817UL;
+
+#define X(n) ((tgt & (0xfUL << (4*n))) == (guess & (0xfUL << (4*n))))
+
+       if (X(0)) {
+               if (X(1)) {
+                       if (X(2)) {
+                               if (X(3)) {
+                                       if (X(4)) {
+                                               if (X(5)) {
+                                                       if (X(6)) {
+                                                               if (X(7)) {
+                                                                       if (X(8)) {
+                                                                               if (X(9)) {
+                                                                                       if (X(10)) {
+                                                                                               if (X(11)) {
+                                                                                                       if (X(12)) {
+                                                                                                               if (X(13)) {
+                                                                                                                       if (X(14)) {
+                                                                                                                               if (X(15)) {
+                                                                                                                                       panic("ksancov: found test value\n");
+                                                                                                                               }
+                                                                                                                       }
+                                                                                                               }
+                                                                                                       }
+                                                                                               }
+                                                                                       }
+                                                                               }
+                                                                       }
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+}
+
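The nibble-by-nibble ladder above is deliberate: each matched nibble of 0xf85de3b12891c817 adds one more covered branch, so a coverage-guided fuzzer watching the trace or counters buffer can solve the constant four bits at a time, and reaching the panic proves the whole coverage pipeline end to end. Triggering it directly looks roughly like this (note that this panics the machine; the local header path is an assumption):

    #include <fcntl.h>
    #include <stdint.h>
    #include <sys/ioctl.h>

    #include "ksancov.h" /* assumed: a copy of san/ksancov.h */

    int
    main(void)
    {
            int fd = open(KSANCOV_PATH, 0);
            if (fd < 0) {
                    return 1;
            }
            uint64_t guess = 0xf85de3b12891c817ULL; /* full match => kernel panic */
            return ioctl(fd, KSANCOV_IOC_TESTPANIC, &guess);
    }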
+static int
+ksancov_ioctl(dev_t dev, unsigned long cmd, caddr_t _data, int fflag, proc_t p)
+{
+#pragma unused(fflag,p)
+       int ret = 0;
+       void *data = (void *)_data;
+
+       struct ksancov_dev *d = get_dev(dev);
+       if (!d) {
+               return EINVAL; /* dev not open */
+       }
+
+       if (cmd == KSANCOV_IOC_TRACE) {
+               size_t maxpcs = *(size_t *)data;
+               ret = ksancov_trace_alloc(dev, maxpcs);
+               if (ret) {
+                       return ret;
+               }
+       } else if (cmd == KSANCOV_IOC_COUNTERS) {
+               ret = ksancov_counters_alloc(dev);
+               if (ret) {
+                       return ret;
+               }
+       } else if (cmd == KSANCOV_IOC_MAP) {
+               struct ksancov_buf_desc *mcmd = (struct ksancov_buf_desc *)data;
+
+               if (d->mode == KS_MODE_NONE) {
+                       return EINVAL; /* mode not configured */
+               }
+
+               /* map buffer into the userspace VA space */
+               void *buf;
+               size_t size;
+               ret = ksancov_map(dev, &buf, &size);
+               if (ret) {
+                       return ret;
+               }
+
+               mcmd->ptr = (uintptr_t)buf;
+               mcmd->sz = size;
+       } else if (cmd == KSANCOV_IOC_MAP_EDGEMAP) {
+               struct ksancov_buf_desc *mcmd = (struct ksancov_buf_desc *)data;
+
+               /* map buffer into the userspace VA space */
+               void *buf;
+               size_t size;
+               ret = ksancov_map_edgemap(dev, &buf, &size);
+               if (ret) {
+                       return ret;
+               }
+
+               mcmd->ptr = (uintptr_t)buf;
+               mcmd->sz = size;
+       } else if (cmd == KSANCOV_IOC_START) {
+               if (d->mode == KS_MODE_NONE) {
+                       return EINVAL; /* not configured */
+               }
+
+               ret = ksancov_attach(dev, current_thread());
+               if (ret) {
+                       return ret;
+               }
+       } else if (cmd == KSANCOV_IOC_NEDGES) {
+               size_t *nptr = (size_t *)data;
+               *nptr = nedges;
+       } else if (cmd == KSANCOV_IOC_TESTPANIC) {
+               uint64_t guess = *(uint64_t *)data;
+               ksancov_testpanic(guess);
+       } else {
+               /* unknown ioctl */
+               return ENODEV;
+       }
+
+       return ret;
+}
+
+static int
+ksancov_dev_clone(dev_t dev, int action)
+{
+#pragma unused(dev)
+       if (action == DEVFS_CLONE_ALLOC) {
+               for (size_t i = 0; i < KSANCOV_MAX_DEV; i++) {
+                       if (ksancov_devs[i] == NULL) {
+                               return i;
+                       }
+               }
+       } else if (action == DEVFS_CLONE_FREE) {
+               return 0;
+       }
+
+       return -1;
+}
+
+static struct cdevsw
+    ksancov_cdev = {
+       .d_open =  ksancov_open,
+       .d_close = ksancov_close,
+       .d_ioctl = ksancov_ioctl,
+
+       .d_read = eno_rdwrt,
+       .d_write = eno_rdwrt,
+       .d_stop = eno_stop,
+       .d_reset = eno_reset,
+       .d_select = eno_select,
+       .d_mmap = eno_mmap,
+       .d_strategy = eno_strat,
+       .d_type = 0
+};
+
+int
+ksancov_init_dev(void)
+{
+       dev_major = cdevsw_add(-1, &ksancov_cdev);
+       if (dev_major < 0) {
+               printf("ksancov: failed to allocate major device node\n");
+               return -1;
+       }
+
+       dev_t dev = makedev(dev_major, 0);
+       void *node = devfs_make_node_clone(dev, DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
+           ksancov_dev_clone, KSANCOV_DEVNODE);
+       if (!node) {
+               printf("ksancov: failed to create device node\n");
+               return -1;
+       }
+
+       /* This could be moved to the first use of /dev/ksancov to save memory */
+       uintptr_t buf;
+       size_t sz = sizeof(struct ksancov_edgemap) + KSANCOV_MAX_EDGES * sizeof(uint32_t);
+
+       kern_return_t kr = kmem_alloc_flags(kernel_map, &buf, sz, VM_KERN_MEMORY_DIAG, KMA_ZERO);
+       if (kr) {
+               printf("ksancov: failed to allocate edge addr map\n");
+               return -1;
+       }
+
+       ksancov_edgemap = (void *)buf;
+       ksancov_edgemap->magic = KSANCOV_EDGEMAP_MAGIC;
+       ksancov_edgemap->nedges = nedges;
+       ksancov_edgemap->offset = VM_MIN_KERNEL_ADDRESS;
+
+       return 0;
+}
diff --git a/san/ksancov.h b/san/ksancov.h
new file mode 100644 (file)
index 0000000..80936b4
--- /dev/null
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KSANCOV_H_
+#define _KSANCOV_H_
+
+#include <stdint.h>
+#include <stdatomic.h>
+#include <sys/ioccom.h>
+
+#define KSANCOV_DEVNODE "ksancov"
+#define KSANCOV_PATH "/dev/" KSANCOV_DEVNODE
+
+/*
+ * ioctl
+ */
+
+struct ksancov_buf_desc {
+       uintptr_t ptr;  /* ptr to shared buffer [out] */
+       size_t sz;      /* size of shared buffer [out] */
+};
+
+/* Set mode */
+#define KSANCOV_IOC_TRACE        _IOW('K', 1, size_t) /* number of pcs */
+#define KSANCOV_IOC_COUNTERS     _IO('K', 2)
+
+/* Establish a shared mapping of the coverage buffer. */
+#define KSANCOV_IOC_MAP          _IOWR('K', 8, struct ksancov_buf_desc)
+
+/* Establish a shared mapping of the edge address buffer. */
+#define KSANCOV_IOC_MAP_EDGEMAP  _IOWR('K', 9, struct ksancov_buf_desc)
+
+/* Log the current thread */
+#define KSANCOV_IOC_START        _IOW('K', 10, uintptr_t)
+
+#define KSANCOV_IOC_NEDGES       _IOR('K', 50, size_t)
+
+#define KSANCOV_IOC_TESTPANIC    _IOW('K', 20, uint64_t)
+
+
+/*
+ * shared kernel-user mapping
+ */
+
+#define KSANCOV_MAX_EDGES       512UL*1024
+#define KSANCOV_MAX_HITS        UINT8_MAX
+#define KSANCOV_TRACE_MAGIC     (uint32_t)0x5AD17F5BU
+#define KSANCOV_COUNTERS_MAGIC  (uint32_t)0x5AD27F6BU
+#define KSANCOV_EDGEMAP_MAGIC   (uint32_t)0x5AD37F7BU
+
+struct ksancov_header {
+       uint32_t magic;
+       _Atomic uint32_t enabled;
+};
+
+struct ksancov_trace {
+       /* userspace R/O fields */
+       union {
+               struct ksancov_header hdr;
+               struct {
+                       uint32_t magic;
+                       _Atomic uint32_t enabled;
+               };
+       };
+
+       uintptr_t offset; /* pc entries relative to this */
+       uint32_t maxpcs;
+       _Atomic uint32_t head;
+       uint32_t pcs[];
+};
+
+struct ksancov_counters {
+       union {
+               struct ksancov_header hdr;
+               struct {
+                       uint32_t magic;
+                       _Atomic uint32_t enabled;
+               };
+       };
+
+       uint32_t nedges; /* total number of edges */
+       uint8_t hits[];  /* hits on each edge (8bit saturating) */
+};
+
+struct ksancov_edgemap {
+       uint32_t magic;
+       uint32_t nedges;
+       uintptr_t offset; /* edge addrs relative to this */
+       uint32_t addrs[]; /* address of each edge relative to 'offset' */
+};
+
+#if XNU_KERNEL_PRIVATE
+int ksancov_init_dev(void);
+void **__sanitizer_get_thread_data(thread_t);
+
+/*
+ * SanitizerCoverage ABI
+ */
+extern void __sanitizer_cov_trace_pc_guard(uint32_t *guard);
+extern void __sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop);
+extern void __sanitizer_cov_pcs_init(uintptr_t *start, uintptr_t *stop);
+extern void __sanitizer_cov_trace_pc(void);
+extern void __sanitizer_cov_trace_pc_indirect(void *callee);
+#endif
+
+#ifndef KERNEL
+
+#include <strings.h>
+#include <assert.h>
+#include <unistd.h>
+
+/*
+ * ksancov userspace API
+ *
+ * Usage:
+ * 1) open the ksancov device
+ * 2) set the coverage mode (trace or edge counters)
+ * 3) map the coverage buffer
+ * 4) start the trace on a thread
+ * 5) flip the enable bit
+ */
+
+static inline int
+ksancov_open(void)
+{
+       return open(KSANCOV_PATH, 0);
+}
+
+static inline int
+ksancov_map(int fd, uintptr_t *buf, size_t *sz)
+{
+       int ret;
+       struct ksancov_buf_desc mc = {0};
+
+       ret = ioctl(fd, KSANCOV_IOC_MAP, &mc);
+       if (ret == -1) {
+               return errno;
+       }
+
+       *buf = mc.ptr;
+       if (sz) {
+               *sz = mc.sz;
+       }
+
+       struct ksancov_trace *trace = (void *)mc.ptr;
+       assert(trace->magic == KSANCOV_TRACE_MAGIC ||
+           trace->magic == KSANCOV_COUNTERS_MAGIC);
+
+       return 0;
+}
+
+static inline int
+ksancov_map_edgemap(int fd, uintptr_t *buf, size_t *sz)
+{
+       int ret;
+       struct ksancov_buf_desc mc = {0};
+
+       ret = ioctl(fd, KSANCOV_IOC_MAP_EDGEMAP, &mc);
+       if (ret == -1) {
+               return errno;
+       }
+
+       *buf = mc.ptr;
+       if (sz) {
+               *sz = mc.sz;
+       }
+
+       struct ksancov_trace *trace = (void *)mc.ptr;
+       assert(trace->magic == KSANCOV_EDGEMAP_MAGIC);
+
+       return 0;
+}
+
+static inline size_t
+ksancov_nedges(int fd)
+{
+       size_t nedges;
+       int ret = ioctl(fd, KSANCOV_IOC_NEDGES, &nedges);
+       if (ret == -1) {
+               return SIZE_MAX;
+       }
+       return nedges;
+}
+
+static inline int
+ksancov_mode_trace(int fd, size_t entries)
+{
+       int ret;
+       ret = ioctl(fd, KSANCOV_IOC_TRACE, &entries);
+       if (ret == -1) {
+               return errno;
+       }
+       return 0;
+}
+
+static inline int
+ksancov_mode_counters(int fd)
+{
+       int ret;
+       ret = ioctl(fd, KSANCOV_IOC_COUNTERS);
+       if (ret == -1) {
+               return errno;
+       }
+       return 0;
+}
+
+static inline int
+ksancov_thread_self(int fd)
+{
+       int ret;
+       uintptr_t th = 0;
+       ret = ioctl(fd, KSANCOV_IOC_START, &th);
+       if (ret == -1) {
+               return errno;
+       }
+       return 0;
+}
+
+static inline int
+ksancov_start(void *buf)
+{
+       struct ksancov_header *hdr = (struct ksancov_header *)buf;
+       atomic_store_explicit(&hdr->enabled, 1, memory_order_relaxed);
+       return 0;
+}
+
+static inline int
+ksancov_stop(void *buf)
+{
+       struct ksancov_header *hdr = (struct ksancov_header *)buf;
+       atomic_store_explicit(&hdr->enabled, 0, memory_order_relaxed);
+       return 0;
+}
+
+static inline int
+ksancov_reset(void *buf)
+{
+       struct ksancov_header *hdr = (struct ksancov_header *)buf;
+       if (hdr->magic == KSANCOV_TRACE_MAGIC) {
+               struct ksancov_trace *trace = (struct ksancov_trace *)buf;
+               atomic_store_explicit(&trace->head, 0, memory_order_relaxed);
+       } else if (hdr->magic == KSANCOV_COUNTERS_MAGIC) {
+               struct ksancov_counters *counters = (struct ksancov_counters *)buf;
+               bzero(counters->hits, counters->nedges);
+       } else {
+               return EINVAL;
+       }
+       return 0;
+}
+
+static inline uintptr_t
+ksancov_edge_addr(struct ksancov_edgemap *addrs, size_t idx)
+{
+       assert(addrs);
+       if (idx >= addrs->nedges) {
+               return 0;
+       }
+       return addrs->addrs[idx] + addrs->offset;
+}
+
+static inline size_t
+ksancov_trace_max_pcs(struct ksancov_trace *trace)
+{
+       return trace->maxpcs;
+}
+
+static inline uintptr_t
+ksancov_trace_offset(struct ksancov_trace *trace)
+{
+       assert(trace);
+       return trace->offset;
+}
+
+static inline size_t
+ksancov_trace_head(struct ksancov_trace *trace)
+{
+       size_t maxlen = trace->maxpcs;
+       size_t head = atomic_load_explicit(&trace->head, memory_order_acquire);
+       return head < maxlen ? head : maxlen;
+}
+
+static inline uintptr_t
+ksancov_trace_entry(struct ksancov_trace *trace, size_t i)
+{
+       if (i >= trace->head) {
+               return 0;
+       }
+
+       return trace->pcs[i] + trace->offset;
+}
+
+#endif
+
+#endif /* _KSANCOV_H_ */
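Putting the five steps from the usage comment together, a minimal trace session built on the inline helpers above looks roughly like this (error handling trimmed; the bundled san/tools/ksancov.c that follows is the complete version):

    #include <stdio.h>
    #include <unistd.h>

    #include "ksancov.h" /* assumed: a copy of this header */

    int
    main(void)
    {
            uintptr_t addr;
            size_t sz;

            int fd = ksancov_open();                /* 1) open the device */
            if (fd < 0) {
                    return 1;
            }
            if (ksancov_mode_trace(fd, 4096) ||     /* 2) pick trace mode */
                ksancov_map(fd, &addr, &sz) ||      /* 3) map the buffer */
                ksancov_thread_self(fd)) {          /* 4) attach this thread */
                    return 1;
            }

            struct ksancov_trace *trace = (struct ksancov_trace *)addr;
            ksancov_start(trace);                   /* 5) flip the enable bit */
            getpid();                               /* any syscall gets traced */
            ksancov_stop(trace);

            size_t head = ksancov_trace_head(trace);
            for (size_t i = 0; i < head; i++) {
                    printf("0x%lx\n", (unsigned long)ksancov_trace_entry(trace, i));
            }
            return 0;
    }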
index 9e5f2eda27d110f7ab173a9e0baa25fb1c3d1b8b..0c0a11ecebd55993695419dcec0b83b71e6d01c3 100644 (file)
@@ -54,7 +54,7 @@ __nosan_bcmp(const void *a, const void *b, size_t sz)
 static inline void
 __nosan_bcopy(const void *src, void *dst, size_t sz)
 {
-       return bcopy(src, dst, sz);
+       bcopy(src, dst, sz);
 }
 static inline int
 __nosan_memcmp(const void *a, const void *b, size_t sz)
@@ -64,7 +64,7 @@ __nosan_memcmp(const void *a, const void *b, size_t sz)
 static inline void
 __nosan_bzero(void *dst, size_t sz)
 {
-       return bzero(dst, sz);
+       bzero(dst, sz);
 }
 
 static inline size_t
diff --git a/san/tools/ksancov.c b/san/tools/ksancov.c
new file mode 100644 (file)
index 0000000..0a35fc0
--- /dev/null
@@ -0,0 +1,217 @@
+#if 0
+CC = clang
+CFLAGS = -O3
+$(MAKEFILE_LIST:.c=):
+
+ifeq (0,1)
+*/
+#endif
+
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdatomic.h>
+#include <errno.h>
+
+#include <fcntl.h>
+#include <sys/ioctl.h>
+
+#include <getopt.h>
+
+#include "../ksancov.h"
+
+static void
+usage(void)
+{
+       fprintf(stderr,
+           "usage: ./ksancov [OPTIONS]\n\n"
+           "  -t | --trace        use trace (PC log) mode [default]\n"
+           "  -c | --counters     use edge counter mode\n"
+           "  -n | --entries <n>  override max entries in trace log\n"
+           "  -x | --exec <path>  instrument execution of binary at <path>\n");
+       exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+       struct ksancov_trace *trace = NULL;
+       struct ksancov_counters *counters = NULL;
+       struct ksancov_header *header = NULL;
+
+       int ret;
+       size_t max_entries = 64UL * 1024;
+       char *path = NULL;
+       bool docounters = false;
+
+       struct option opts[] = {
+               { "entries", required_argument, NULL, 'n' },
+               { "exec", required_argument, NULL, 'x' },
+
+               { "trace", no_argument, NULL, 't' },
+               { "counters", no_argument, NULL, 'c' },
+
+               { NULL, 0, NULL, 0 }
+       };
+
+       int ch;
+       while ((ch = getopt_long(argc, argv, "tn:x:c", opts, NULL)) != -1) {
+               switch (ch) {
+               case 'n':
+                       max_entries = strtoul(optarg, NULL, 0);
+                       break;
+               case 'x':
+                       path = optarg;
+                       break;
+               case 't':
+                       docounters = false;
+                       break;
+               case 'c':
+                       docounters = true;
+                       break;
+               default:
+                       usage();
+               }
+       }
+
+       int fd;
+       uintptr_t addr;
+       size_t sz;
+
+       fd = ksancov_open();
+       if (fd < 0) {
+               perror("ksancov_open");
+               return errno;
+       }
+       fprintf(stderr, "opened ksancov on fd %i\n", fd);
+
+       uintptr_t e;
+       ret = ksancov_map_edgemap(fd, &e, NULL);
+       if (ret) {
+               perror("ksancov map edgemap");
+               return ret;
+       }
+       struct ksancov_edgemap *map = (void *)e;
+       fprintf(stderr, "nedges (edgemap) = %u\n", map->nedges);
+
+       if (docounters) {
+               ret = ksancov_mode_counters(fd);
+       } else {
+               ret = ksancov_mode_trace(fd, max_entries);
+       }
+       if (ret) {
+               perror("ksancov set mode");
+               return ret;
+       }
+
+       ret = ksancov_map(fd, &addr, &sz);
+       if (ret) {
+               perror("ksancov map");
+               return ret;
+       }
+       fprintf(stderr, "mapped to 0x%lx + %lu\n", addr, sz);
+
+       if (docounters) {
+               counters = (void *)addr;
+               fprintf(stderr, "nedges (counters) = %u\n", counters->nedges);
+       } else {
+               trace = (void *)addr;
+               fprintf(stderr, "maxpcs = %lu\n", ksancov_trace_max_pcs(trace));
+       }
+       header = (void *)addr;
+
+       if (path) {
+               int pid = fork();
+               if (pid == 0) {
+                       /* child */
+                       ret = ksancov_thread_self(fd);
+                       if (ret) {
+                               perror("ksancov thread");
+                               return ret;
+                       }
+
+                       ksancov_reset(header);
+                       ksancov_start(header);
+                       ret = execl(path, path, (char *)NULL);
+                       perror("execl");
+
+                       exit(1);
+               } else {
+                       /* parent */
+                       waitpid(pid, NULL, 0);
+                       ksancov_stop(header);
+               }
+       } else {
+               ret = ksancov_thread_self(fd);
+               if (ret) {
+                       perror("ksancov thread");
+                       return ret;
+               }
+
+               ksancov_reset(header);
+               ksancov_start(header);
+               int ppid = getppid();
+               ksancov_stop(header);
+               fprintf(stderr, "ppid = %i\n", ppid);
+       }
+
+       if (docounters) {
+               for (size_t i = 0; i < counters->nedges; i++) {
+                       size_t hits = counters->hits[i];
+                       if (hits) {
+                               fprintf(stderr, "0x%lx: %lu hits [idx %lu]\n", ksancov_edge_addr(map, i), hits, i);
+                       }
+               }
+       } else {
+               size_t head = ksancov_trace_head(trace);
+               fprintf(stderr, "head = %lu\n", head);
+               for (size_t i = 0; i < head; i++) {
+                       uintptr_t pc = ksancov_trace_entry(trace, i);
+                       fprintf(stderr, "0x%lx\n", pc);
+               }
+       }
+
+       ret = close(fd);
+       fprintf(stderr, "close = %i\n", ret);
+
+       return 0;
+}
+
+/*
+endif
+# */
index 2e48edff58e0af8a4494ab290c21d1ebf7c549e4..1104fdd17b44c0ac0c00fc56dbe117813a92426f 100644 (file)
@@ -1,9 +1,15 @@
 [.*]
 src:./san/ubsan*
 
+[bounds]
+src:./osfmk/corecrypto/*
+
 [alignment]
 
 src:./libsa/bootstrap.cpp
 src:./bsd/net/necp_client.c
 src:./pexpert/arm/pe_identify_machine.c
 
+[object-size]
+src:./osfmk/i386/locks_i386.c
+
index 86b6d293b94ada9d09e38e5e8fd21f38da783234..04eea747f6eec8ebba3d4a0ace13e6f3dac385d3 100644 (file)
@@ -33,7 +33,7 @@
 
 static const bool ubsan_print = false;
 static const uint32_t line_acquired = 0x80000000UL;
-
+static const char *get_type_check_kind(uint8_t kind);
 static size_t
 format_loc(struct san_src_loc *loc, char *dst, size_t sz)
 {
@@ -98,24 +98,41 @@ format_shift(struct ubsan_violation *v, char *buf, size_t sz)
        return n;
 }
 
-static const char *const
-align_kinds[] = {
-       "load",
-       "store",
-       "<unknown>",
-       "member access",
-       "<unknown>",
+static const char * const
+type_check_kinds[] = {
+       "load of", "store to", "reference binding to", "member access within",
+       "member call on", "constructor call on", "downcast of", "downcast of",
+       "upcast of", "cast to virtual base of", "_Nonnull binding to"
 };
 
+static const char *
+get_type_check_kind(uint8_t kind)
+{
+       return (kind < (sizeof(type_check_kinds) / sizeof(type_check_kinds[0])))
+              ? type_check_kinds[kind]
+              : "some";
+}
+
 static size_t
-format_alignment(struct ubsan_violation *v, char *buf, size_t sz)
+format_type_mismatch(struct ubsan_violation *v, char *buf, size_t sz)
 {
        size_t n = 0;
-       struct san_type_desc *ty = v->align->ty;
+       size_t alignment = 1 << v->align->align;
+       void *ptr = (void *)v->lhs;
+       const char *kind = get_type_check_kind(v->align->kind);
+       if (NULL == ptr) {
+               /* null pointer use */
+               n += snprintf(buf + n, sz - n, "%s NULL pointer of type %s\n", kind, v->align->ty->name);
+       } else if (alignment && ((uintptr_t)ptr & (alignment - 1))) {
+               /* misaligned pointer use */
+               n += snprintf(buf + n, sz - n, "%s mis-aligned address %p for type %s ", kind, ptr, v->align->ty->name);
+               n += snprintf(buf + n, sz - n, "which requires %lu byte alignment\n", alignment);
+       } else {
+               /* insufficient object size */
+               n += snprintf(buf + n, sz - n, "%s address %p with insufficient space for an object of type %s\n",
+                   kind, ptr, v->align->ty->name);
+       }
 
-       n += snprintf(buf + n, sz - n, "mis-aligned %s of 0x%llx\n", align_kinds[v->align->kind], v->lhs);
-       n += snprintf(buf + n, sz - n, "  expected %d-byte alignment, type = %s\n",
-           1 << v->align->align, ty->name);
        return n;
 }
 
@@ -150,8 +167,8 @@ ubsan_format(struct ubsan_violation *v, char *buf, size_t sz)
        case UBSAN_SHIFT:
                n += format_shift(v, buf + n, sz - n);
                break;
-       case UBSAN_ALIGN:
-               n += format_alignment(v, buf + n, sz - n);
+       case UBSAN_TYPE_MISMATCH:
+               n += format_type_mismatch(v, buf + n, sz - n);
                break;
        case UBSAN_POINTER_OVERFLOW:
                n += snprintf(buf + n, sz - n, "pointer overflow, before = 0x%llx, after = 0x%llx\n", v->lhs, v->rhs);
@@ -159,6 +176,9 @@ ubsan_format(struct ubsan_violation *v, char *buf, size_t sz)
        case UBSAN_OOB:
                n += format_oob(v, buf + n, sz - n);
                break;
+       case UBSAN_GENERIC:
+               n += snprintf(buf + n, sz - n, "%s\n", v->func);
+               break;
        default:
                panic("unknown violation");
        }
@@ -236,14 +256,14 @@ DEFINE_OVERFLOW(negate)
 void
 __ubsan_handle_type_mismatch_v1(struct ubsan_align_desc *desc, uint64_t val)
 {
-       struct ubsan_violation v = { UBSAN_ALIGN, val, 0, .align = desc, &desc->loc };
+       struct ubsan_violation v = { UBSAN_TYPE_MISMATCH, val, 0, .align = desc, &desc->loc };
        ubsan_handle(&v, false);
 }
 
 void
 __ubsan_handle_type_mismatch_v1_abort(struct ubsan_align_desc *desc, uint64_t val)
 {
-       struct ubsan_violation v = { UBSAN_ALIGN, val, 0, .align = desc, &desc->loc };
+       struct ubsan_violation v = { UBSAN_TYPE_MISMATCH, val, 0, .align = desc, &desc->loc };
        ubsan_handle(&v, true);
 }
 
@@ -274,3 +294,27 @@ __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *desc, uint64_t idx)
        struct ubsan_violation v = { UBSAN_OOB, idx, 0, .oob = desc, &desc->loc };
        ubsan_handle(&v, true);
 }
+
+#define DEFINE_GENERIC(check) \
+       void __ubsan_handle_##check (struct san_src_loc* loc) \
+       { \
+               struct ubsan_violation v = { UBSAN_GENERIC, 0, 0, .func = __func__, loc }; \
+               ubsan_handle(&v, false); \
+       } \
+       void __ubsan_handle_##check##_abort(struct san_src_loc* loc) \
+       { \
+               struct ubsan_violation v = { UBSAN_GENERIC, 0, 0, .func = __func__, loc }; \
+               ubsan_handle(&v, true); \
+       }
+
+DEFINE_GENERIC(invalid_builtin)
+DEFINE_GENERIC(load_invalid_value)
+DEFINE_GENERIC(nonnull_arg)
+DEFINE_GENERIC(vla_bound_not_positive)
+DEFINE_GENERIC(float_cast_overflow)
+DEFINE_GENERIC(function_type_mismatch)
+DEFINE_GENERIC(missing_return)
+DEFINE_GENERIC(nonnull_return)
+DEFINE_GENERIC(nullability_arg)
+DEFINE_GENERIC(nullability_return)
+DEFINE_GENERIC(implicit_conversion)
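Each DEFINE_GENERIC line above produces a non-aborting and an aborting handler. Because __func__ is evaluated inside the expanded function, the violation records the handler's own name -- exactly the string the UBSAN_GENERIC case in ubsan_format() prints. Written out for missing_return, the pair is roughly:

    void
    __ubsan_handle_missing_return(struct san_src_loc *loc)
    {
            struct ubsan_violation v = { UBSAN_GENERIC, 0, 0, .func = __func__, loc };
            ubsan_handle(&v, false);
    }

    void
    __ubsan_handle_missing_return_abort(struct san_src_loc *loc)
    {
            struct ubsan_violation v = { UBSAN_GENERIC, 0, 0, .func = __func__, loc };
            ubsan_handle(&v, true);
    }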
index e78dacefc3cec887722040dab69e8491bd514f8d..9dff870a241200816af7d397c27493b339df51ec 100644 (file)
@@ -95,6 +95,8 @@ enum {
        UBSAN_ALIGN,
        UBSAN_POINTER_OVERFLOW,
        UBSAN_OOB,
+       UBSAN_GENERIC,
+       UBSAN_TYPE_MISMATCH,
        UBSAN_VIOLATION_MAX,
 };
 
@@ -109,6 +111,7 @@ struct ubsan_violation {
                struct ubsan_align_desc *align;
                struct ubsan_ptroverflow_desc *ptroverflow;
                struct ubsan_oob_desc *oob;
+               const char *func;
        };
        struct san_src_loc *loc;
 };
@@ -121,23 +124,47 @@ size_t ubsan_format(struct ubsan_violation *, char *buf, size_t sz);
  */
 
 void __ubsan_handle_add_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
-void __ubsan_handle_sub_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
-void __ubsan_handle_mul_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
-void __ubsan_handle_divrem_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
-void __ubsan_handle_negate_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_add_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
-void __ubsan_handle_sub_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
-void __ubsan_handle_mul_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_builtin_unreachable(struct ubsan_unreachable_desc *);
+void __ubsan_handle_divrem_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_divrem_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_mul_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_mul_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_negate_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_negate_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
-void __ubsan_handle_builtin_unreachable(struct ubsan_unreachable_desc *);
+void __ubsan_handle_out_of_bounds(struct ubsan_oob_desc *, uint64_t idx);
+void __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *, uint64_t idx);
+void __ubsan_handle_pointer_overflow(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_pointer_overflow_abort(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_shift_out_of_bounds(struct ubsan_shift_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_shift_out_of_bounds_abort(struct ubsan_shift_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_sub_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_sub_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_type_mismatch_v1(struct ubsan_align_desc *, uint64_t val);
 void __ubsan_handle_type_mismatch_v1_abort(struct ubsan_align_desc *, uint64_t val);
-void __ubsan_handle_pointer_overflow(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs);
-void __ubsan_handle_pointer_overflow_abort(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs);
-void __ubsan_handle_out_of_bounds(struct ubsan_oob_desc *, uint64_t idx);
-void __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *, uint64_t idx);
+
+/* currently unimplemented */
+void __ubsan_handle_float_cast_overflow(struct san_src_loc *);
+void __ubsan_handle_float_cast_overflow_abort(struct san_src_loc *);
+void __ubsan_handle_function_type_mismatch(struct san_src_loc *);
+void __ubsan_handle_function_type_mismatch_abort(struct san_src_loc *);
+void __ubsan_handle_implicit_conversion(struct san_src_loc *);
+void __ubsan_handle_implicit_conversion_abort(struct san_src_loc *);
+void __ubsan_handle_invalid_builtin(struct san_src_loc *);
+void __ubsan_handle_invalid_builtin_abort(struct san_src_loc *);
+void __ubsan_handle_load_invalid_value(struct san_src_loc *);
+void __ubsan_handle_load_invalid_value_abort(struct san_src_loc *);
+void __ubsan_handle_missing_return(struct san_src_loc *);
+void __ubsan_handle_missing_return_abort(struct san_src_loc *);
+void __ubsan_handle_nonnull_arg(struct san_src_loc *);
+void __ubsan_handle_nonnull_arg_abort(struct san_src_loc *);
+void __ubsan_handle_nonnull_return(struct san_src_loc *);
+void __ubsan_handle_nonnull_return_abort(struct san_src_loc *);
+void __ubsan_handle_nullability_arg(struct san_src_loc *);
+void __ubsan_handle_nullability_arg_abort(struct san_src_loc *);
+void __ubsan_handle_nullability_return(struct san_src_loc *);
+void __ubsan_handle_nullability_return_abort(struct san_src_loc *);
+void __ubsan_handle_vla_bound_not_positive(struct san_src_loc *);
+void __ubsan_handle_vla_bound_not_positive_abort(struct san_src_loc *);
 
 #endif /* _UBSAN_H_ */
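
These declarations are the entry points that clang's UBSan instrumentation calls at runtime; xnu ships its own minimal runtime rather than linking compiler-rt, which is why the second group is stubbed as "currently unimplemented". A userspace sketch of how such a call gets triggered (only the handler names above are the real ABI; everything else is illustrative):

    #include <limits.h>

    /* Built with -fsanitize=signed-integer-overflow, the addition below
     * becomes an overflow-checked add; on wraparound the generated code
     * calls __ubsan_handle_add_overflow() (or the _abort variant under
     * -fno-sanitize-recover). */
    int
    trigger_add_overflow(int x)
    {
            return x + 1;   /* passing INT_MAX lands in the UBSan runtime */
    }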
index a02bf51dfe412b014dec93376eb3cb07af139fcf..d0a3fcc6931339425700c2ba3e018c96840b7958 100644 (file)
@@ -101,7 +101,7 @@ sysctl_ubsan_log_dump SYSCTL_HANDLER_ARGS
        if (!buf) {
                return 0;
        }
-       buf[0] = '\0';
+       bzero(buf, sz);
 
        for (size_t i = start; i != end; i = next_entry(i)) {
                n += ubsan_format(&ubsan_log[i], buf + n, sz - n);
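
The switch from terminating only the first byte to bzero()ing the whole buffer matters here because the loop appends formatted entries at buf + n; a fully zeroed buffer stays NUL-terminated no matter where a formatter stops writing. A minimal sketch of the pattern (buffer name and strings illustrative):

    #include <string.h>
    #include <strings.h>

    static void
    format_example(void)
    {
            char buf[256];
            size_t n = 0;

            bzero(buf, sizeof(buf));        /* zero every byte, not just buf[0] */
            n += strlcpy(buf + n, "ubsan: ", sizeof(buf) - n);
            if (n < sizeof(buf)) {          /* guard: strlcpy may report truncation */
                    n += strlcpy(buf + n, "add overflow", sizeof(buf) - n);
            }
            /* buf remains NUL-terminated even if an append truncates */
    }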
index 1917d6e86ab07ef9b76328c0c32e13962412d514..77eb0bf7c301a0d0ea1fa4e3b796cf79598c49d4 100644 (file)
@@ -6,7 +6,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
-INCDIR=/usr/local/include
+INCDIR=$(SDKHEADERSROOT)/usr/local/include
 
 # Installs header file for user level -
 #         $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
index 7bd79d9ae78b20528784ccb5d5684d4a6bcb579f..05c4b79cf0cdde381cf245f419ccc25af4fec458 100644 (file)
@@ -23,7 +23,7 @@ endif
 
 $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile: $(SRCROOT)/SETUP/config/doconf $(OBJROOT)/SETUP/config $(DOCONFDEPS)
        $(_v)$(MKDIR) $(TARGET)/$(CURRENT_KERNEL_CONFIG)
-       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG);
+       $(_v)$(SRCROOT)/SETUP/config/doconf -c -cpu $(DOCONF_ARCH_CONFIG_LC) -soc $(CURRENT_MACHINE_CONFIG_LC) -d $(TARGET)/$(CURRENT_KERNEL_CONFIG) -s $(SOURCE) -m $(MASTERCONFDIR) $(CURRENT_KERNEL_CONFIG)
 
 do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
        $(_v)${MAKE} \
@@ -35,7 +35,7 @@ do_all: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/Makefile
                SOURCE=$(subst conf/,,$(SOURCE))                        \
                TARGET=${TARGET}                                        \
                OBJPATH=${OBJPATH}                                      \
-               build_all;
+               build_all
 
 do_build_all:: do_all
 
index 8330c0a5f0d43f18d20b6925e4f3d8011e3e78b3..2d75b556cbc9242639f88e69af2bbd2052fae513 100644 (file)
@@ -75,9 +75,9 @@ $(SOBJS): .SFLAGS
        $(_v)$(REPLACECONTENTS) $@ $(S_KCC) $(SFLAGS) $(INCFLAGS)
 
 $(COMPONENT).filelist: $(OBJS)
-       @echo "$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0)"
+       $(call makelog,$(ColorL)LDFILELIST$(Color0) $(ColorLF)$(COMPONENT)$(Color0))
        $(_v)for obj in ${OBJS}; do     \
-                echo $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
+                $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
        done > $(COMPONENT).filelist
 
 do_all: $(COMPONENT).filelist
index 504cf4b9f854d048e7701ea693ca88b3423caaef..44b591fd4ddc1f0cefd19c248d496aeb0b8c08fe 100644 (file)
@@ -236,12 +236,12 @@ mac_audit_text(char *text, mac_policy_handle_t handle)
 {
        char *sanitized;
        const char *name;
-       int i, size, plen, len;
+       size_t i, size, plen, text_len;
 
        name = mac_get_mpc(handle)->mpc_name;
-       len = strlen(text);
+       text_len = strlen(text);
        plen = 2 + strlen(name);
-       if (plen + len >= MAC_AUDIT_DATA_LIMIT) {
+       if (plen + text_len >= MAC_AUDIT_DATA_LIMIT) {
                return EINVAL;
        }
 
@@ -249,18 +249,18 @@ mac_audit_text(char *text, mac_policy_handle_t handle)
         * Make sure the text is composed only of ASCII printable
         * characters.
         */
-       for (i = 0; i < len; i++) {
+       for (i = 0; i < text_len; i++) {
                if (text[i] < (char) 32 || text[i] > (char) 126) {
                        return EINVAL;
                }
        }
 
-       size = len + plen + 1;
+       size = text_len + plen + 1;
        sanitized = (char *)zalloc(mac_audit_data_zone);
 
        strlcpy(sanitized, name, MAC_AUDIT_DATA_LIMIT);
-       strncat(sanitized, ": ", MAC_AUDIT_DATA_LIMIT - plen + 2);
-       strncat(sanitized, text, MAC_AUDIT_DATA_LIMIT - plen);
+       strlcat(sanitized, ": ", MAC_AUDIT_DATA_LIMIT);
+       strlcat(sanitized, text, MAC_AUDIT_DATA_LIMIT);
 
        return audit_mac_data(MAC_AUDIT_TEXT_TYPE, size, (u_char *)sanitized);
 }
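
The strncat() calls replaced above were easy to get wrong: strncat's size argument is the maximum number of bytes to *append*, not the total size of the destination, and the removed `MAC_AUDIT_DATA_LIMIT - plen + 2` arithmetic reflected that confusion. strlcat() takes the full destination size and always NUL-terminates. A contrast sketch (buffer contents illustrative):

    #include <string.h>

    static void
    append_contrast(void)
    {
            char dst[16];

            strlcpy(dst, "mpc_name", sizeof(dst));
            /* strlcat: pass the TOTAL size of dst; output truncates safely */
            strlcat(dst, ": ", sizeof(dst));
            strlcat(dst, "a long audit message", sizeof(dst));

            /* strncat would need the REMAINING space minus 1 for the NUL;
             * miscomputing it (as the removed code did) can overflow:
             * strncat(dst, text, sizeof(dst) - strlen(dst) - 1); */
    }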
index c31e5fdc631676ec3a8c29738bf6346b2b70d27f..6a99e38fd91ad95c31e8f22397f556875a9e6bbe 100644 (file)
@@ -181,9 +181,20 @@ SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, SECURITY_MAC_CTLFLAGS,
  * already has to deal with uninitialized labels, this probably won't
  * be a problem.
  */
+#if CONFIG_MACF_LAZY_VNODE_LABELS
+unsigned int    mac_label_vnodes = 1;
+#else
 unsigned int    mac_label_vnodes = 0;
-SYSCTL_UINT(_security_mac, OID_AUTO, labelvnodes, SECURITY_MAC_CTLFLAGS,
-    &mac_label_vnodes, 0, "Label all vnodes");
+#endif /* CONFIG_MACF_LAZY_VNODE_LABELS */
+SYSCTL_UINT(_security_mac, OID_AUTO, labelvnodes, SECURITY_MAC_CTLFLAGS
+#if CONFIG_MACF_LAZY_VNODE_LABELS
+    | CTLFLAG_RD
+#endif
+    , &mac_label_vnodes, 0, "Label all vnodes");
+
+unsigned int mac_vnode_label_count = 0;
+SYSCTL_UINT(_security_mac, OID_AUTO, vnode_label_count, SECURITY_MAC_CTLFLAGS | CTLFLAG_RD,
+    &mac_vnode_label_count, 0, "Count of vnode labels");
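
A userspace sketch for reading the counter added above (assumes only the sysctl names introduced in this hunk):

    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            unsigned int count = 0;
            size_t len = sizeof(count);

            /* vnode_label_count is CTLFLAG_RD; labelvnodes also becomes
             * read-only when CONFIG_MACF_LAZY_VNODE_LABELS is set */
            if (sysctlbyname("security.mac.vnode_label_count",
                &count, &len, NULL, 0) == 0) {
                    printf("vnode labels in use: %u\n", count);
            }
            return 0;
    }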
 
 unsigned int mac_device_enforce = 1;
 SYSCTL_UINT(_security_mac, OID_AUTO, device_enforce, SECURITY_MAC_CTLFLAGS,
index b40928bb0b6f0eb4fa2ba975156b9983b1e2e76b..3f9b671989d6caa8faf7f457ebae5fcf2cacfe11 100644 (file)
@@ -266,6 +266,7 @@ int     mac_mount_check_getattr(vfs_context_t ctx, struct mount *mp,
 int     mac_mount_check_label_update(vfs_context_t ctx, struct mount *mp);
 int     mac_mount_check_mount(vfs_context_t ctx, struct vnode *vp,
     struct componentname *cnp, const char *vfc_name);
+int     mac_mount_check_mount_late(vfs_context_t ctx, struct mount *mp);
 int     mac_mount_check_snapshot_create(vfs_context_t ctx, struct mount *mp,
     const char *name);
 int     mac_mount_check_snapshot_delete(vfs_context_t ctx, struct mount *mp,
@@ -364,6 +365,7 @@ int     mac_proc_check_setlcid(proc_t proc1, proc_t proc2,
     pid_t pid1, pid_t pid2);
 int     mac_proc_check_signal(proc_t proc1, proc_t proc2,
     int signum);
+int     mac_proc_check_syscall_unix(proc_t proc, int scnum);
 int     mac_proc_check_wait(proc_t proc1, proc_t proc2);
 void    mac_proc_notify_exit(proc_t proc);
 int     mac_setsockopt_label(kauth_cred_t cred, struct socket *so,
@@ -411,7 +413,6 @@ int     mac_system_check_acct(kauth_cred_t cred, struct vnode *vp);
 int     mac_system_check_audit(kauth_cred_t cred, void *record, int length);
 int     mac_system_check_auditctl(kauth_cred_t cred, struct vnode *vp);
 int     mac_system_check_auditon(kauth_cred_t cred, int cmd);
-int     mac_system_check_chud(kauth_cred_t cred);
 int     mac_system_check_host_priv(kauth_cred_t cred);
 int     mac_system_check_info(kauth_cred_t, const char *info_type);
 int     mac_system_check_nfsd(kauth_cred_t cred);
@@ -563,6 +564,7 @@ int     mac_vnode_label_externalize_audit(struct vnode *vp, struct mac *mac);
 void    mac_vnode_label_free(struct label *label);
 void    mac_vnode_label_init(struct vnode *vp);
 int     mac_vnode_label_init_needed(struct vnode *vp);
+struct label *mac_vnode_label_allocate(vnode_t vp);
 void    mac_vnode_label_recycle(struct vnode *vp);
 void    mac_vnode_label_update(vfs_context_t ctx, struct vnode *vp,
     struct label *newlabel);
index ec457f0cb909ad7b455fbfa26923dae15833d755..503b2ea4b553dc1761260b0a81baee4dd511a63d 100644 (file)
@@ -169,6 +169,7 @@ extern unsigned int mac_label_mbufs;
 #endif
 
 extern unsigned int mac_label_vnodes;
+extern unsigned int mac_vnode_label_count;
 
 static bool mac_proc_check_enforce(proc_t p);
 
index 09a8bec123d451c0891e0b276d7f1dc4d3d51505..9baaa2df9a337297221c6eb46201830c7a7d3193 100644 (file)
@@ -1804,6 +1804,22 @@ typedef int mpo_mount_check_mount_t(
        struct componentname *cnp,
        const char *vfc_name
        );
+/**
+ *  @brief Access control check for mounting a file system (late)
+ *  @param cred Subject credential
+ *  @param mp Mount point
+ *
+ *  Similar to mpo_mount_check_mount, but occurs after VFS_MOUNT has been
+ *  called, making it possible to access mnt_vfsstat.f_mntfromname and other
+ *  fields.
+ *
+ *  @return Return 0 if access is granted, otherwise an appropriate value for
+ *  errno should be returned.
+ */
+typedef int mpo_mount_check_mount_late_t(
+       kauth_cred_t cred,
+       struct mount *mp
+       );
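
A hypothetical policy implementation of this hook (a sketch only; the /dev-prefix rule is illustrative, not part of this commit):

    static int
    example_mount_check_mount_late(kauth_cred_t cred, struct mount *mp)
    {
    #pragma unused(cred)
            /* f_mntfromname is only trustworthy here because VFS_MOUNT
             * has already run, per the documentation above */
            const char *from = mp->mnt_vfsstat.f_mntfromname;

            if (strncmp(from, "/dev/", 5) != 0) {
                    return EPERM;
            }
            return 0;
    }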
 /**
  *  @brief Access control check for fs_snapshot_create
  *  @param cred Subject credential
@@ -3073,6 +3089,24 @@ typedef int mpo_proc_check_signal_t(
        struct proc *proc,
        int signum
        );
+/**
+ *  @brief Access control check for Unix syscalls.
+ *  @param proc Subject process
+ *  @param scnum Syscall number; see bsd/kern/syscalls.master.
+ *
+ *  Determine whether the subject process is allowed to perform the syscall with the given number.
+ *
+ *  @warning Programs typically expect to be able to make syscalls as part of
+ *  their normal process lifecycle; caution should be exercised when restricting
+ *  which syscalls a process can perform.
+ *
+ *  @return Return 0 if access is granted, otherwise an appropriate value for
+ *  errno should be returned. Suggested failure: EPERM for lack of privilege.
+ */
+typedef int mpo_proc_check_syscall_unix_t(
+       struct proc *proc,
+       int scnum
+       );
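
A hypothetical hook matching this typedef (a sketch; the syscall number check and the privilege helper are illustrative):

    static bool example_proc_is_privileged(struct proc *p); /* hypothetical helper */

    static int
    example_proc_check_syscall_unix(struct proc *proc, int scnum)
    {
            /* 55 is reboot in bsd/kern/syscalls.master (illustrative choice) */
            if (scnum == 55 && !example_proc_is_privileged(proc)) {
                    return EPERM;   /* suggested failure per the docs above */
            }
            return 0;
    }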
 /**
  *  @brief Access control check for wait
  *  @param cred Subject credential
@@ -3106,32 +3140,6 @@ typedef int mpo_proc_check_wait_t(
 typedef void mpo_proc_notify_exit_t(
        struct proc *proc
        );
-/**
- *  @brief Destroy process label
- *  @param label The label to be destroyed
- *
- *  Destroy a process label.  Since the object is going
- *  out of scope, policy modules should free any internal storage
- *  associated with the label so that it may be destroyed.
- */
-typedef void mpo_proc_label_destroy_t(
-       struct label *label
-       );
-/**
- *  @brief Initialize process label
- *  @param label New label to initialize
- *  @see mpo_cred_label_init_t
- *
- *  Initialize the label for a newly instantiated BSD process structure.
- *  Normally, security policies will store the process label in the user
- *  credential rather than here in the process structure.  However,
- *  there are some floating label policies that may need to temporarily
- *  store a label in the process structure until it is safe to update
- *  the user credential label.  Sleeping is permitted.
- */
-typedef void mpo_proc_label_init_t(
-       struct label *label
-       );
 /**
  *  @brief Access control check for skywalk flow connect
  *  @param cred Subject credential
@@ -3836,20 +3844,6 @@ typedef int mpo_system_check_auditon_t(
        kauth_cred_t cred,
        int cmd
        );
-/**
- *  @brief Access control check for using CHUD facilities
- *  @param cred Subject credential
- *
- *  Determine whether the subject identified by the credential can perform
- *  performance-related tasks using the CHUD system call.  This interface is
- *  deprecated.
- *
- *  @return Return 0 if access is granted, otherwise an appropriate value for
- *  errno should be returned.
- */
-typedef int mpo_system_check_chud_t(
-       kauth_cred_t cred
-       );
 /**
  *  @brief Access control check for obtaining the host control port
  *  @param cred Subject credential
@@ -5859,11 +5853,12 @@ typedef int mpo_vnode_label_internalize_t(
        );
 /**
  *  @brief Clean up a vnode label
- *  @param label The label to be cleaned for re-use
+ *  @param label The label to be cleaned or purged
  *
  *  Clean up a vnode label.  Darwin (Tiger, 8.x) allocates vnodes on demand, but
  *  typically never frees them.  Before vnodes are placed back on free lists for
- *  re-use, policies can cleanup or overwrite any information present in the label.
+ *  re-use, policies can clean up or overwrite any information present in the label,
+ *  or free any internal resources used for the label.
  */
 typedef void mpo_vnode_label_recycle_t(
        struct label *label
@@ -6288,7 +6283,7 @@ typedef void mpo_reserved_hook_t(void);
  * Please note that this should be kept in sync with the check assumptions
  * policy in bsd/kern/policy_check.c (policy_ops struct).
  */
-#define MAC_POLICY_OPS_VERSION 55 /* inc when new reserved slots are taken */
+#define MAC_POLICY_OPS_VERSION 58 /* inc when new reserved slots are taken */
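
A sketch of how a policy built against ops version 58 would wire up the hooks introduced in this commit (field and flag names per mac_policy.h; the two hook functions are the hypothetical ones sketched earlier, and registration error handling is elided):

    static struct mac_policy_ops example_ops = {
            .mpo_proc_check_syscall_unix = example_proc_check_syscall_unix,
            .mpo_mount_check_mount_late  = example_mount_check_mount_late,
    };

    static struct mac_policy_conf example_conf = {
            .mpc_name           = "example",
            .mpc_fullname       = "Example policy",
            .mpc_ops            = &example_ops,
            .mpc_loadtime_flags = MPC_LOADTIME_FLAG_UNLOADOK,
    };

    static mac_policy_handle_t example_handle;

    /* from the kext's start routine:
     * error = mac_policy_register(&example_conf, &example_handle, NULL); */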
 struct mac_policy_ops {
        mpo_audit_check_postselect_t            *mpo_audit_check_postselect;
        mpo_audit_check_preselect_t             *mpo_audit_check_preselect;
@@ -6428,8 +6423,8 @@ struct mac_policy_ops {
        mpo_vnode_check_rename_t                *mpo_vnode_check_rename;
        mpo_kext_check_query_t                  *mpo_kext_check_query;
        mpo_proc_notify_exec_complete_t         *mpo_proc_notify_exec_complete;
-       mpo_reserved_hook_t                     *mpo_reserved5;
-       mpo_reserved_hook_t                     *mpo_reserved6;
+       mpo_reserved_hook_t                     *mpo_reserved4;
+       mpo_proc_check_syscall_unix_t           *mpo_proc_check_syscall_unix;
        mpo_proc_check_expose_task_t            *mpo_proc_check_expose_task;
        mpo_proc_check_set_host_special_port_t  *mpo_proc_check_set_host_special_port;
        mpo_proc_check_set_host_exception_port_t *mpo_proc_check_set_host_exception_port;
@@ -6441,9 +6436,9 @@ struct mac_policy_ops {
        mpo_exc_action_label_update_t           *mpo_exc_action_label_update;
 
        mpo_vnode_check_trigger_resolve_t       *mpo_vnode_check_trigger_resolve;
+       mpo_mount_check_mount_late_t            *mpo_mount_check_mount_late;
        mpo_reserved_hook_t                     *mpo_reserved1;
        mpo_reserved_hook_t                     *mpo_reserved2;
-       mpo_reserved_hook_t                     *mpo_reserved3;
        mpo_skywalk_flow_check_connect_t        *mpo_skywalk_flow_check_connect;
        mpo_skywalk_flow_check_listen_t         *mpo_skywalk_flow_check_listen;
 
@@ -6479,8 +6474,8 @@ struct mac_policy_ops {
        mpo_proc_check_setlcid_t                *mpo_proc_check_setlcid;
        mpo_proc_check_signal_t                 *mpo_proc_check_signal;
        mpo_proc_check_wait_t                   *mpo_proc_check_wait;
-       mpo_proc_label_destroy_t                *mpo_proc_label_destroy;
-       mpo_proc_label_init_t                   *mpo_proc_label_init;
+       mpo_reserved_hook_t                     *mpo_reserved5;
+       mpo_reserved_hook_t                     *mpo_reserved6;
 
        mpo_socket_check_accept_t               *mpo_socket_check_accept;
        mpo_socket_check_accepted_t             *mpo_socket_check_accepted;
@@ -6630,7 +6625,7 @@ struct mac_policy_ops {
 
        mpo_iokit_check_set_properties_t        *mpo_iokit_check_set_properties;
 
-       mpo_system_check_chud_t                 *mpo_system_check_chud;
+       mpo_reserved_hook_t                     *mpo_reserved3;
 
        mpo_vnode_check_searchfs_t              *mpo_vnode_check_searchfs;
 
@@ -6922,6 +6917,8 @@ int     mac_file_removexattr(struct fileglob *fg, const char *name);
  */
 intptr_t        mac_label_get(struct label *l, int slot);
 void            mac_label_set(struct label *l, int slot, intptr_t v);
+intptr_t        mac_vnode_label_get(struct vnode *vp, int slot, intptr_t sentinel);
+void            mac_vnode_label_set(struct vnode *vp, int slot, intptr_t v);
 
 #define mac_get_mpc(h)          (mac_policy_list.entries[h].mpc)
 
index 3552fe991a1699fbfb9f885d79a2bdaf07410571..603b7499c70bc577ad7ad9a496d3dc86d5f9e762 100644 (file)
@@ -506,6 +506,26 @@ mac_proc_check_signal(proc_t curp, struct proc *proc, int signum)
        return error;
 }
 
+int
+mac_proc_check_syscall_unix(proc_t curp, int scnum)
+{
+       int error;
+
+#if SECURITY_MAC_CHECK_ENFORCE
+       /* 21167099 - only check if we allow write */
+       if (!mac_proc_enforce) {
+               return 0;
+       }
+#endif
+       if (!mac_proc_check_enforce(curp)) {
+               return 0;
+       }
+
+       MAC_CHECK(proc_check_syscall_unix, curp, scnum);
+
+       return error;
+}
+
 int
 mac_proc_check_wait(proc_t curp, struct proc *proc)
 {
index a0c00105bc41d5a4dcc80815bef62ce6e9d2a137..f34eee62e3c55b33187eae065cdd1d2bafc75360 100644 (file)
@@ -87,23 +87,6 @@ mac_system_check_acct(kauth_cred_t cred, struct vnode *vp)
        return error;
 }
 
-int
-mac_system_check_chud(kauth_cred_t cred)
-{
-       int error;
-
-#if SECURITY_MAC_CHECK_ENFORCE
-       /* 21167099 - only check if we allow write */
-       if (!mac_system_enforce) {
-               return 0;
-       }
-#endif
-
-       MAC_CHECK(system_check_chud, cred);
-
-       return error;
-}
-
 int
 mac_system_check_host_priv(kauth_cred_t cred)
 {
index 81b311012fb7a1abbc5b9821b3c904b6c6b7470a..95afa830b40c0e0c9e5bc2bf33c95072de63294f 100644 (file)
@@ -63,6 +63,7 @@
  */
 
 #include <kern/kalloc.h>
+#include <libkern/OSAtomic.h>
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -80,6 +81,8 @@
 #include <sys/reason.h>
 #include <sys/uio_internal.h>
 #include <sys/vnode_internal.h>
+#include <sys/kdebug.h>
+
 
 #include <miscfs/devfs/devfsdefs.h>
 #include <miscfs/devfs/fdesc.h>
 /* convert {R,W,X}_OK values to V{READ,WRITE,EXEC} */
 #define ACCESS_MODE_TO_VNODE_MASK(m)    (m << 6)
 
+
+/*
+ * Optional tracing of policy operations. Define VFS_TRACE_POLICY_OPS to enable tracing of the operations.
+ *
+ * Along with DBG_FSYSTEM and DBG_VFS, the dcode in the macros below is used to construct the
+ * KDBG_EVENTID(DBG_FSYSTEM, DBG_VFS, dcode) global event id; see bsd/sys/kdebug.h.
+ * Note that dcode is multiplied by 4 and ORed in as part of the construction. See bsd/kern/trace_codes
+ * for the list of system-wide {global event id, name} pairs. Currently DBG_VFS event ids are in the range
+ * [0x3130000, 0x313016C].
+ */
+
+//#define VFS_TRACE_POLICY_OPS
+
+#ifdef VFS_TRACE_POLICY_OPS
+#define DBG_VFS_CODE(dcode)                     FSDBG_CODE(DBG_VFS, dcode)
+#define VFS_KERNEL_DEBUG_START0(dcode)          KERNEL_DEBUG_CONSTANT(DBG_VFS_CODE(dcode) | DBG_FUNC_START, 0, 0, 0, 0, 0)
+#define VFS_KERNEL_DEBUG_END0(dcode)            KERNEL_DEBUG_CONSTANT(DBG_VFS_CODE(dcode) | DBG_FUNC_END, 0, 0, 0, 0, 0)
+#define VFS_KERNEL_DEBUG_START1(dcode, darg)    KERNEL_DEBUG_CONSTANT(DBG_VFS_CODE(dcode) | DBG_FUNC_START, darg, 0, 0, 0, 0)
+#define VFS_KERNEL_DEBUG_END1(dcode, darg)      KERNEL_DEBUG_CONSTANT(DBG_VFS_CODE(dcode) | DBG_FUNC_END, darg, 0, 0, 0, 0)
+#else
+#define VFS_KERNEL_DEBUG_START0(dcode)          do {} while (0)
+#define VFS_KERNEL_DEBUG_END0(dcode)            do {} while (0)
+#define VFS_KERNEL_DEBUG_START1(dcode, darg)    do {} while (0)
+#define VFS_KERNEL_DEBUG_END1(dcode, darg)      do {} while (0)
+#endif
+
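
The quoted range follows from the comment's own construction rule: the dcodes used in this file run from 0 up to 91 (the fdesc association path below), and 0x3130000 | (91 << 2) is indeed 0x313016C. A compile-time spot check (the base constant is taken from the comment; the class<<24 | subclass<<16 | code<<2 layout of KDBG_EVENTID is assumed, see bsd/sys/kdebug.h):

    #define EXAMPLE_DBG_VFS_BASE 0x3130000u   /* DBG_FSYSTEM/DBG_VFS bits */

    _Static_assert((EXAMPLE_DBG_VFS_BASE | (91u << 2)) == 0x313016Cu,
        "dcode 91 maps to the last event id in the quoted range");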
 static struct label *
 mac_devfsdirent_label_alloc(void)
 {
@@ -98,7 +127,9 @@ mac_devfsdirent_label_alloc(void)
        if (label == NULL) {
                return NULL;
        }
+       VFS_KERNEL_DEBUG_START0(0);
        MAC_PERFORM(devfs_label_init, label);
+       VFS_KERNEL_DEBUG_END0(0);
        return label;
 }
 
@@ -117,7 +148,9 @@ mac_mount_label_alloc(void)
        if (label == NULL) {
                return NULL;
        }
+       VFS_KERNEL_DEBUG_START0(1);
        MAC_PERFORM(mount_label_init, label);
+       VFS_KERNEL_DEBUG_END0(1);
        return label;
 }
 
@@ -136,7 +169,10 @@ mac_vnode_label_alloc(void)
        if (label == NULL) {
                return NULL;
        }
+       VFS_KERNEL_DEBUG_START0(2);
        MAC_PERFORM(vnode_label_init, label);
+       VFS_KERNEL_DEBUG_END0(2);
+       OSIncrementAtomic(&mac_vnode_label_count);
        return label;
 }
 
@@ -149,7 +185,21 @@ mac_vnode_label_init(vnode_t vp)
 int
 mac_vnode_label_init_needed(vnode_t vp)
 {
+#if CONFIG_MACF_LAZY_VNODE_LABELS
+       (void)vp;
+       return false;
+#else
        return mac_label_vnodes != 0 && vp->v_label == NULL;
+#endif
+}
+
+struct label *
+mac_vnode_label_allocate(vnode_t vp)
+{
+       if (mac_vnode_label_init_needed(vp)) {
+               vp->v_label = mac_vnode_label_alloc();
+       }
+       return vp->v_label;
 }
 
 /*
@@ -161,12 +211,21 @@ void
 mac_vnode_label_recycle(vnode_t vp)
 {
        MAC_PERFORM(vnode_label_recycle, vp->v_label);
+#if CONFIG_MACF_LAZY_VNODE_LABELS
+       if (vp->v_label) {
+               mac_vnode_label_destroy(vp);
+               vp->v_label = NULL;
+               vp->v_lflag &= ~VL_LABELED;
+       }
+#endif
 }
 
 static void
 mac_devfs_label_free(struct label *label)
 {
+       VFS_KERNEL_DEBUG_START1(3, label);
        MAC_PERFORM(devfs_label_destroy, label);
+       VFS_KERNEL_DEBUG_END1(3, label);
        mac_labelzone_free(label);
 }
 
@@ -182,7 +241,9 @@ mac_devfs_label_destroy(struct devnode *de)
 static void
 mac_mount_label_free(struct label *label)
 {
+       VFS_KERNEL_DEBUG_START1(4, label);
        MAC_PERFORM(mount_label_destroy, label);
+       VFS_KERNEL_DEBUG_END1(4, label);
        mac_labelzone_free(label);
 }
 
@@ -198,11 +259,15 @@ mac_mount_label_destroy(struct mount *mp)
 void
 mac_vnode_label_free(struct label *label)
 {
-       MAC_PERFORM(vnode_label_destroy, label);
-       mac_labelzone_free(label);
+       if (label != NULL) {
+               VFS_KERNEL_DEBUG_START1(5, label);
+               MAC_PERFORM(vnode_label_destroy, label);
+               VFS_KERNEL_DEBUG_END1(5, label);
+               mac_labelzone_free(label);
+               OSDecrementAtomic(&mac_vnode_label_count);
+       }
 }
 
-#ifndef __APPLE__
 void
 mac_vnode_label_destroy(struct vnode *vp)
 {
@@ -211,16 +276,17 @@ mac_vnode_label_destroy(struct vnode *vp)
                vp->v_label = NULL;
        }
 }
-#endif
 
 void
 mac_vnode_label_copy(struct label *src, struct label *dest)
 {
+       VFS_KERNEL_DEBUG_START1(6, src);
        if (src == NULL) {
                MAC_PERFORM(vnode_label_init, dest);
        } else {
                MAC_PERFORM(vnode_label_copy, src, dest);
        }
+       VFS_KERNEL_DEBUG_END1(6, src);
 }
 
 int
@@ -287,7 +353,9 @@ mac_devfs_label_copy(struct label *src, struct label *dest)
        }
 #endif
 
+       VFS_KERNEL_DEBUG_START1(7, src);
        MAC_PERFORM(devfs_label_copy, src, dest);
+       VFS_KERNEL_DEBUG_END1(7, src);
 }
 
 void
@@ -301,8 +369,10 @@ mac_devfs_label_update(struct mount *mp, struct devnode *de,
        }
 #endif
 
+       VFS_KERNEL_DEBUG_START1(8, vp);
        MAC_PERFORM(devfs_label_update, mp, de, de->dn_label, vp,
            vp->v_label);
+       VFS_KERNEL_DEBUG_END1(8, vp);
 }
 
 int
@@ -348,10 +418,12 @@ mac_vnode_label_associate_devfs(struct mount *mp, struct devnode *de,
        }
 #endif
 
+       VFS_KERNEL_DEBUG_START1(9, vp);
        MAC_PERFORM(vnode_label_associate_devfs,
            mp, mp ? mp->mnt_mntlabel : NULL,
            de, de->dn_label,
            vp, vp->v_label);
+       VFS_KERNEL_DEBUG_END1(9, vp);
 }
 
 int
@@ -359,8 +431,10 @@ mac_vnode_label_associate_extattr(struct mount *mp, struct vnode *vp)
 {
        int error;
 
+       VFS_KERNEL_DEBUG_START1(10, vp);
        MAC_CHECK(vnode_label_associate_extattr, mp, mp->mnt_mntlabel, vp,
            vp->v_label);
+       VFS_KERNEL_DEBUG_END1(10, vp);
 
        return error;
 }
@@ -378,8 +452,10 @@ mac_vnode_label_associate_singlelabel(struct mount *mp, struct vnode *vp)
                return;
        }
 
+       VFS_KERNEL_DEBUG_START1(11, vp);
        MAC_PERFORM(vnode_label_associate_singlelabel, mp,
            mp ? mp->mnt_mntlabel : NULL, vp, vp->v_label);
+       VFS_KERNEL_DEBUG_END1(11, vp);
 }
 
 int
@@ -399,8 +475,10 @@ mac_vnode_notify_create(vfs_context_t ctx, struct mount *mp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(12, vp);
        MAC_CHECK(vnode_notify_create, cred, mp, mp->mnt_mntlabel,
            dvp, dvp->v_label, vp, vp->v_label, cnp);
+       VFS_KERNEL_DEBUG_END1(12, vp);
 
        return error;
 }
@@ -421,8 +499,10 @@ mac_vnode_notify_rename(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(13, vp);
        MAC_PERFORM(vnode_notify_rename, cred, vp, vp->v_label,
            dvp, dvp->v_label, cnp);
+       VFS_KERNEL_DEBUG_END1(13, vp);
 }
 
 void
@@ -440,7 +520,9 @@ mac_vnode_notify_open(vfs_context_t ctx, struct vnode *vp, int acc_flags)
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(14, vp);
        MAC_PERFORM(vnode_notify_open, cred, vp, vp->v_label, acc_flags);
+       VFS_KERNEL_DEBUG_END1(14, vp);
 }
 
 void
@@ -459,7 +541,9 @@ mac_vnode_notify_link(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(15, vp);
        MAC_PERFORM(vnode_notify_link, cred, dvp, dvp->v_label, vp, vp->v_label, cnp);
+       VFS_KERNEL_DEBUG_END1(15, vp);
 }
 
 void
@@ -477,7 +561,9 @@ mac_vnode_notify_deleteextattr(vfs_context_t ctx, struct vnode *vp, const char *
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(16, vp);
        MAC_PERFORM(vnode_notify_deleteextattr, cred, vp, vp->v_label, name);
+       VFS_KERNEL_DEBUG_END1(16, vp);
 }
 
 void
@@ -495,7 +581,9 @@ mac_vnode_notify_setacl(vfs_context_t ctx, struct vnode *vp, struct kauth_acl *a
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(17, vp);
        MAC_PERFORM(vnode_notify_setacl, cred, vp, vp->v_label, acl);
+       VFS_KERNEL_DEBUG_END1(17, vp);
 }
 
 void
@@ -513,7 +601,9 @@ mac_vnode_notify_setattrlist(vfs_context_t ctx, struct vnode *vp, struct attrlis
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(18, vp);
        MAC_PERFORM(vnode_notify_setattrlist, cred, vp, vp->v_label, alist);
+       VFS_KERNEL_DEBUG_END1(18, vp);
 }
 
 void
@@ -531,7 +621,9 @@ mac_vnode_notify_setextattr(vfs_context_t ctx, struct vnode *vp, const char *nam
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(19, vp);
        MAC_PERFORM(vnode_notify_setextattr, cred, vp, vp->v_label, name, uio);
+       VFS_KERNEL_DEBUG_END1(19, vp);
 }
 
 void
@@ -549,7 +641,9 @@ mac_vnode_notify_setflags(vfs_context_t ctx, struct vnode *vp, u_long flags)
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(20, vp);
        MAC_PERFORM(vnode_notify_setflags, cred, vp, vp->v_label, flags);
+       VFS_KERNEL_DEBUG_END1(20, vp);
 }
 
 void
@@ -567,7 +661,9 @@ mac_vnode_notify_setmode(vfs_context_t ctx, struct vnode *vp, mode_t mode)
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(21, vp);
        MAC_PERFORM(vnode_notify_setmode, cred, vp, vp->v_label, mode);
+       VFS_KERNEL_DEBUG_END1(21, vp);
 }
 
 void
@@ -585,7 +681,9 @@ mac_vnode_notify_setowner(vfs_context_t ctx, struct vnode *vp, uid_t uid, gid_t
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(22, vp);
        MAC_PERFORM(vnode_notify_setowner, cred, vp, vp->v_label, uid, gid);
+       VFS_KERNEL_DEBUG_END1(22, vp);
 }
 
 void
@@ -603,7 +701,9 @@ mac_vnode_notify_setutimes(vfs_context_t ctx, struct vnode *vp, struct timespec
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(23, vp);
        MAC_PERFORM(vnode_notify_setutimes, cred, vp, vp->v_label, atime, mtime);
+       VFS_KERNEL_DEBUG_END1(23, vp);
 }
 
 void
@@ -621,7 +721,9 @@ mac_vnode_notify_truncate(vfs_context_t ctx, kauth_cred_t file_cred, struct vnod
        if (!mac_cred_check_enforce(cred)) {
                return;
        }
+       VFS_KERNEL_DEBUG_START1(24, vp);
        MAC_PERFORM(vnode_notify_truncate, cred, file_cred, vp, vp->v_label);
+       VFS_KERNEL_DEBUG_END1(24, vp);
 }
 
 /*
@@ -645,8 +747,10 @@ mac_vnode_label_update_extattr(struct mount *mp, struct vnode *vp,
                return;
        }
 
+       VFS_KERNEL_DEBUG_START1(25, vp);
        MAC_PERFORM(vnode_label_update_extattr, mp, mp->mnt_mntlabel, vp,
            vp->v_label, name);
+       VFS_KERNEL_DEBUG_END1(25, vp);
        if (error == 0) {
                return;
        }
@@ -678,7 +782,9 @@ mac_vnode_label_store(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(26, vp);
        MAC_CHECK(vnode_label_store, cred, vp, vp->v_label, intlabel);
+       VFS_KERNEL_DEBUG_END1(26, vp);
 
        return error;
 }
@@ -710,6 +816,7 @@ mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t new, struct vnode *
         *     calling exec_spawnattr_getmacpolicyinfo() and before passing the
         *     spawnattrlen as an argument to the hook.
         */
+       VFS_KERNEL_DEBUG_START1(27, vp);
        {
                struct mac_policy_conf *mpc;
                u_int i;
@@ -756,6 +863,7 @@ mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t new, struct vnode *
                }
        }
        *labelupdateerror = error;
+       VFS_KERNEL_DEBUG_END1(27, vp);
 }
 
 int
@@ -775,6 +883,7 @@ mac_cred_check_label_update_execve(vfs_context_t ctx, struct vnode *vp, off_t of
 
        cred = vfs_context_ucred(ctx);
 
+       VFS_KERNEL_DEBUG_START1(28, vp);
        /*
         * NB: Cannot use MAC_BOOLEAN macro because we need a sequence point after
         *     calling exec_spawnattr_getmacpolicyinfo() and before passing the
@@ -820,6 +929,7 @@ mac_cred_check_label_update_execve(vfs_context_t ctx, struct vnode *vp, off_t of
                        mac_policy_list_unbusy();
                }
        }
+       VFS_KERNEL_DEBUG_END1(28, vp);
 
        return result;
 }
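
On the dispatch macros seen throughout these hunks: MAC_CHECK iterates every registered policy that implements the hook and folds the verdicts into `error`, while MAC_PERFORM invokes the hooks and discards return values (both live in security/mac_internal.h). A self-contained analogue of the folding, with a deliberately simplified precedence rule (the real one is mac_error_select()):

    #include <stddef.h>

    typedef int (*check_fn)(void *arg);

    static int
    fold_checks(check_fn *hooks, size_t n, void *arg)
    {
            int error = 0;

            for (size_t i = 0; i < n; i++) {
                    if (hooks[i] == NULL) {
                            continue;       /* policy doesn't implement the hook */
                    }
                    int e = hooks[i](arg);
                    if (e != 0 && error == 0) {
                            error = e;      /* simplified: keep the first failure */
                    }
            }
            return error;
    }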
@@ -844,7 +954,9 @@ mac_vnode_check_access(vfs_context_t ctx, struct vnode *vp,
        }
        /* Convert {R,W,X}_OK values to V{READ,WRITE,EXEC} for entry points */
        mask = ACCESS_MODE_TO_VNODE_MASK(acc_mode);
+       VFS_KERNEL_DEBUG_START1(29, vp);
        MAC_CHECK(vnode_check_access, cred, vp, vp->v_label, mask);
+       VFS_KERNEL_DEBUG_END1(29, vp);
        return error;
 }
 
@@ -864,7 +976,9 @@ mac_vnode_check_chdir(vfs_context_t ctx, struct vnode *dvp)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(30, dvp);
        MAC_CHECK(vnode_check_chdir, cred, dvp, dvp->v_label);
+       VFS_KERNEL_DEBUG_END1(30, dvp);
        return error;
 }
 
@@ -885,7 +999,9 @@ mac_vnode_check_chroot(vfs_context_t ctx, struct vnode *dvp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(31, dvp);
        MAC_CHECK(vnode_check_chroot, cred, dvp, dvp->v_label, cnp);
+       VFS_KERNEL_DEBUG_END1(31, dvp);
        return error;
 }
 
@@ -906,8 +1022,10 @@ mac_vnode_check_clone(vfs_context_t ctx, struct vnode *dvp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(32, dvp);
        MAC_CHECK(vnode_check_clone, cred, dvp, dvp->v_label, vp,
            vp->v_label, cnp);
+       VFS_KERNEL_DEBUG_END1(32, dvp);
        return error;
 }
 int
@@ -927,7 +1045,9 @@ mac_vnode_check_create(vfs_context_t ctx, struct vnode *dvp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(33, dvp);
        MAC_CHECK(vnode_check_create, cred, dvp, dvp->v_label, cnp, vap);
+       VFS_KERNEL_DEBUG_END1(33, dvp);
        return error;
 }
 
@@ -948,8 +1068,10 @@ mac_vnode_check_unlink(vfs_context_t ctx, struct vnode *dvp, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(34, dvp);
        MAC_CHECK(vnode_check_unlink, cred, dvp, dvp->v_label, vp,
            vp->v_label, cnp);
+       VFS_KERNEL_DEBUG_END1(34, dvp);
        return error;
 }
 #if 0
@@ -970,7 +1092,9 @@ mac_vnode_check_deleteacl(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(35, dvp);
        MAC_CHECK(vnode_check_deleteacl, cred, vp, vp->v_label, type);
+       VFS_KERNEL_DEBUG_END1(35, dvp);
        return error;
 }
 #endif
@@ -992,7 +1116,9 @@ mac_vnode_check_deleteextattr(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(36, vp);
        MAC_CHECK(vnode_check_deleteextattr, cred, vp, vp->v_label, name);
+       VFS_KERNEL_DEBUG_END1(36, vp);
        return error;
 }
 int
@@ -1012,8 +1138,10 @@ mac_vnode_check_exchangedata(vfs_context_t ctx,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(37, v1);
        MAC_CHECK(vnode_check_exchangedata, cred, v1, v1->v_label,
            v2, v2->v_label);
+       VFS_KERNEL_DEBUG_END1(37, v1);
 
        return error;
 }
@@ -1035,7 +1163,9 @@ mac_vnode_check_getacl(vfs_context_t ctx, struct vnode *vp, acl_type_t type)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(38, vp);
        MAC_CHECK(vnode_check_getacl, cred, vp, vp->v_label, type);
+       VFS_KERNEL_DEBUG_END1(38, vp);
        return error;
 }
 #endif
@@ -1057,7 +1187,9 @@ mac_vnode_check_getattr(vfs_context_t ctx, struct ucred *file_cred,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(39, vp);
        MAC_CHECK(vnode_check_getattr, cred, file_cred, vp, vp->v_label, va);
+       VFS_KERNEL_DEBUG_END1(39, vp);
        return error;
 }
 
@@ -1078,7 +1210,9 @@ mac_vnode_check_getattrlist(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(40, vp);
        MAC_CHECK(vnode_check_getattrlist, cred, vp, vp->v_label, alist);
+       VFS_KERNEL_DEBUG_END1(40, vp);
 
        /* Falsify results instead of returning error? */
        return error;
@@ -1105,6 +1239,7 @@ mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp,
         *     calling exec_spawnattr_getmacpolicyinfo() and before passing the
         *     spawnattrlen as an argument to the hook.
         */
+       VFS_KERNEL_DEBUG_START1(41, vp);
        {
                struct mac_policy_conf *mpc;
                u_int i;
@@ -1153,6 +1288,7 @@ mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp,
                        mac_policy_list_unbusy();
                }
        }
+       VFS_KERNEL_DEBUG_END1(41, vp);
 
        return error;
 }
@@ -1173,7 +1309,9 @@ mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(42, vp);
        MAC_CHECK(vnode_check_fsgetpath, cred, vp, vp->v_label);
+       VFS_KERNEL_DEBUG_END1(42, vp);
        return error;
 }
 
@@ -1199,8 +1337,10 @@ mac_vnode_check_signature(struct vnode *vp, struct cs_blob *cs_blob,
        }
 #endif
 
+       VFS_KERNEL_DEBUG_START1(43, vp);
        MAC_CHECK(vnode_check_signature, vp, vp->v_label, cpu_type, cs_blob,
            cs_flags, signer_type, flags, &fatal_failure_desc, &fatal_failure_desc_len);
+       VFS_KERNEL_DEBUG_END1(43, vp);
 
        if (fatal_failure_desc_len) {
                // A fatal code signature validation failure occurred, formulate a crash
@@ -1305,7 +1445,9 @@ mac_vnode_check_getacl(vfs_context_t ctx, struct vnode *vp, acl_type_t type)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(44, vp);
        MAC_CHECK(vnode_check_getacl, cred, vp, vp->v_label, type);
+       VFS_KERNEL_DEBUG_END1(44, vp);
        return error;
 }
 #endif
@@ -1327,8 +1469,10 @@ mac_vnode_check_getextattr(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(45, vp);
        MAC_CHECK(vnode_check_getextattr, cred, vp, vp->v_label,
            name, uio);
+       VFS_KERNEL_DEBUG_END1(45, vp);
        return error;
 }
 
@@ -1348,7 +1492,9 @@ mac_vnode_check_ioctl(vfs_context_t ctx, struct vnode *vp, u_int cmd)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(46, vp);
        MAC_CHECK(vnode_check_ioctl, cred, vp, vp->v_label, cmd);
+       VFS_KERNEL_DEBUG_END1(46, vp);
        return error;
 }
 
@@ -1369,8 +1515,10 @@ mac_vnode_check_kqfilter(vfs_context_t ctx, kauth_cred_t file_cred,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(47, vp);
        MAC_CHECK(vnode_check_kqfilter, cred, file_cred, kn, vp,
            vp->v_label);
+       VFS_KERNEL_DEBUG_END1(47, vp);
 
        return error;
 }
@@ -1392,8 +1540,10 @@ mac_vnode_check_link(vfs_context_t ctx, struct vnode *dvp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(48, vp);
        MAC_CHECK(vnode_check_link, cred, dvp, dvp->v_label, vp,
            vp->v_label, cnp);
+       VFS_KERNEL_DEBUG_END1(48, vp);
        return error;
 }
 
@@ -1413,7 +1563,9 @@ mac_vnode_check_listextattr(vfs_context_t ctx, struct vnode *vp)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(49, vp);
        MAC_CHECK(vnode_check_listextattr, cred, vp, vp->v_label);
+       VFS_KERNEL_DEBUG_END1(49, vp);
        return error;
 }
 
@@ -1434,7 +1586,9 @@ mac_vnode_check_lookup_preflight(vfs_context_t ctx, struct vnode *dvp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(50, dvp);
        MAC_CHECK(vnode_check_lookup_preflight, cred, dvp, dvp->v_label, path, pathlen);
+       VFS_KERNEL_DEBUG_END1(50, dvp);
        return error;
 }
 
@@ -1455,7 +1609,9 @@ mac_vnode_check_lookup(vfs_context_t ctx, struct vnode *dvp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(51, dvp);
        MAC_CHECK(vnode_check_lookup, cred, dvp, dvp->v_label, cnp);
+       VFS_KERNEL_DEBUG_END1(51, dvp);
        return error;
 }
 
@@ -1475,7 +1631,9 @@ mac_vnode_check_open(vfs_context_t ctx, struct vnode *vp, int acc_mode)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(52, vp);
        MAC_CHECK(vnode_check_open, cred, vp, vp->v_label, acc_mode);
+       VFS_KERNEL_DEBUG_END1(52, vp);
        return error;
 }
 
@@ -1496,8 +1654,10 @@ mac_vnode_check_read(vfs_context_t ctx, struct ucred *file_cred,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(53, vp);
        MAC_CHECK(vnode_check_read, cred, file_cred, vp,
            vp->v_label);
+       VFS_KERNEL_DEBUG_END1(53, vp);
 
        return error;
 }
@@ -1518,7 +1678,9 @@ mac_vnode_check_readdir(vfs_context_t ctx, struct vnode *dvp)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(54, dvp);
        MAC_CHECK(vnode_check_readdir, cred, dvp, dvp->v_label);
+       VFS_KERNEL_DEBUG_END1(54, dvp);
        return error;
 }
 
@@ -1538,7 +1700,9 @@ mac_vnode_check_readlink(vfs_context_t ctx, struct vnode *vp)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(55, vp);
        MAC_CHECK(vnode_check_readlink, cred, vp, vp->v_label);
+       VFS_KERNEL_DEBUG_END1(55, vp);
        return error;
 }
 
@@ -1559,7 +1723,9 @@ mac_vnode_check_label_update(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(56, vp);
        MAC_CHECK(vnode_check_label_update, cred, vp, vp->v_label, newlabel);
+       VFS_KERNEL_DEBUG_END1(56, vp);
 
        return error;
 }
@@ -1583,21 +1749,25 @@ mac_vnode_check_rename(vfs_context_t ctx, struct vnode *dvp,
                return 0;
        }
 
+       VFS_KERNEL_DEBUG_START1(57, vp);
        MAC_CHECK(vnode_check_rename_from, cred, dvp, dvp->v_label, vp,
            vp->v_label, cnp);
        if (error) {
+               VFS_KERNEL_DEBUG_END1(57, vp);
                return error;
        }
 
        MAC_CHECK(vnode_check_rename_to, cred, tdvp, tdvp->v_label, tvp,
            tvp != NULL ? tvp->v_label : NULL, dvp == tdvp, tcnp);
        if (error) {
+               VFS_KERNEL_DEBUG_END1(57, vp);
                return error;
        }
 
        MAC_CHECK(vnode_check_rename, cred, dvp, dvp->v_label, vp,
            vp->v_label, cnp, tdvp, tdvp->v_label, tvp,
            tvp != NULL ? tvp->v_label : NULL, tcnp);
+       VFS_KERNEL_DEBUG_END1(57, vp);
        return error;
 }
 
@@ -1617,7 +1787,9 @@ mac_vnode_check_revoke(vfs_context_t ctx, struct vnode *vp)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(58, vp);
        MAC_CHECK(vnode_check_revoke, cred, vp, vp->v_label);
+       VFS_KERNEL_DEBUG_END1(58, vp);
        return error;
 }
 
@@ -1637,7 +1809,9 @@ mac_vnode_check_searchfs(vfs_context_t ctx, struct vnode *vp, struct attrlist *a
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(59, vp);
        MAC_CHECK(vnode_check_searchfs, cred, vp, vp->v_label, alist);
+       VFS_KERNEL_DEBUG_END1(59, vp);
        return error;
 }
 
@@ -1657,7 +1831,9 @@ mac_vnode_check_select(vfs_context_t ctx, struct vnode *vp, int which)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(60, vp);
        MAC_CHECK(vnode_check_select, cred, vp, vp->v_label, which);
+       VFS_KERNEL_DEBUG_END1(60, vp);
        return error;
 }
 
@@ -1678,7 +1854,9 @@ mac_vnode_check_setacl(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(61, vp);
        MAC_CHECK(vnode_check_setacl, cred, vp, vp->v_label, acl);
+       VFS_KERNEL_DEBUG_END1(61, vp);
        return error;
 }
 
@@ -1699,7 +1877,9 @@ mac_vnode_check_setattrlist(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(62, vp);
        MAC_CHECK(vnode_check_setattrlist, cred, vp, vp->v_label, alist);
+       VFS_KERNEL_DEBUG_END1(62, vp);
        return error;
 }
 
@@ -1720,8 +1900,10 @@ mac_vnode_check_setextattr(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(63, vp);
        MAC_CHECK(vnode_check_setextattr, cred, vp, vp->v_label,
            name, uio);
+       VFS_KERNEL_DEBUG_END1(63, vp);
        return error;
 }
 
@@ -1741,7 +1923,9 @@ mac_vnode_check_setflags(vfs_context_t ctx, struct vnode *vp, u_long flags)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(64, vp);
        MAC_CHECK(vnode_check_setflags, cred, vp, vp->v_label, flags);
+       VFS_KERNEL_DEBUG_END1(64, vp);
        return error;
 }
 
@@ -1761,7 +1945,9 @@ mac_vnode_check_setmode(vfs_context_t ctx, struct vnode *vp, mode_t mode)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(65, vp);
        MAC_CHECK(vnode_check_setmode, cred, vp, vp->v_label, mode);
+       VFS_KERNEL_DEBUG_END1(65, vp);
        return error;
 }
 
@@ -1782,7 +1968,9 @@ mac_vnode_check_setowner(vfs_context_t ctx, struct vnode *vp, uid_t uid,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(66, vp);
        MAC_CHECK(vnode_check_setowner, cred, vp, vp->v_label, uid, gid);
+       VFS_KERNEL_DEBUG_END1(66, vp);
        return error;
 }
 
@@ -1803,8 +1991,10 @@ mac_vnode_check_setutimes(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(67, vp);
        MAC_CHECK(vnode_check_setutimes, cred, vp, vp->v_label, atime,
            mtime);
+       VFS_KERNEL_DEBUG_END1(67, vp);
        return error;
 }
 
@@ -1825,8 +2015,10 @@ mac_vnode_check_stat(vfs_context_t ctx, struct ucred *file_cred,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(68, vp);
        MAC_CHECK(vnode_check_stat, cred, file_cred, vp,
            vp->v_label);
+       VFS_KERNEL_DEBUG_END1(68, vp);
        return error;
 }
 
@@ -1847,7 +2039,9 @@ mac_vnode_check_trigger_resolve(vfs_context_t ctx, struct vnode *dvp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(69, dvp);
        MAC_CHECK(vnode_check_trigger_resolve, cred, dvp, dvp->v_label, cnp);
+       VFS_KERNEL_DEBUG_END1(69, dvp);
        return error;
 }
 
@@ -1868,8 +2062,10 @@ mac_vnode_check_truncate(vfs_context_t ctx, struct ucred *file_cred,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(70, vp);
        MAC_CHECK(vnode_check_truncate, cred, file_cred, vp,
            vp->v_label);
+       VFS_KERNEL_DEBUG_END1(70, vp);
 
        return error;
 }
@@ -1891,7 +2087,9 @@ mac_vnode_check_write(vfs_context_t ctx, struct ucred *file_cred,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(71, vp);
        MAC_CHECK(vnode_check_write, cred, file_cred, vp, vp->v_label);
+       VFS_KERNEL_DEBUG_END1(71, vp);
 
        return error;
 }
@@ -1913,7 +2111,9 @@ mac_vnode_check_uipc_bind(vfs_context_t ctx, struct vnode *dvp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(72, dvp);
        MAC_CHECK(vnode_check_uipc_bind, cred, dvp, dvp->v_label, cnp, vap);
+       VFS_KERNEL_DEBUG_END1(72, dvp);
        return error;
 }
 
@@ -1933,7 +2133,9 @@ mac_vnode_check_uipc_connect(vfs_context_t ctx, struct vnode *vp, struct socket
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(73, vp);
        MAC_CHECK(vnode_check_uipc_connect, cred, vp, vp->v_label, (socket_t) so);
+       VFS_KERNEL_DEBUG_END1(73, vp);
        return error;
 }
 
@@ -1955,7 +2157,9 @@ mac_vnode_label_update(vfs_context_t ctx, struct vnode *vp, struct label *newlab
                tmpl = NULL;
        }
 
+       VFS_KERNEL_DEBUG_START1(74, vp);
        MAC_PERFORM(vnode_label_update, cred, vp, vp->v_label, newlabel);
+       VFS_KERNEL_DEBUG_END1(74, vp);
        vnode_unlock(vp);
 
        if (tmpl != NULL) {
@@ -1975,7 +2179,9 @@ mac_vnode_find_sigs(struct proc *p, struct vnode *vp, off_t offset)
        }
 #endif
 
+       VFS_KERNEL_DEBUG_START1(75, vp);
        MAC_CHECK(vnode_find_sigs, p, vp, offset, vp->v_label);
+       VFS_KERNEL_DEBUG_END1(75, vp);
 
        return error;
 }
@@ -2026,7 +2232,9 @@ mac_mount_label_associate(vfs_context_t ctx, struct mount *mp)
                }
        }
 
+       VFS_KERNEL_DEBUG_START1(76, mp);
        MAC_PERFORM(mount_label_associate, cred, mp, mp->mnt_mntlabel);
+       VFS_KERNEL_DEBUG_END1(76, mp);
 #if DEBUG
        printf("MAC Framework enabling %s support: %s -> %s (%s)\n",
            mp->mnt_flag & MNT_MULTILABEL ? "multilabel" : "singlelabel",
@@ -2053,7 +2261,32 @@ mac_mount_check_mount(vfs_context_t ctx, struct vnode *vp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(77, vp);
        MAC_CHECK(mount_check_mount, cred, vp, vp->v_label, cnp, vfc_name);
+       VFS_KERNEL_DEBUG_END1(77, vp);
+
+       return error;
+}
+
+int
+mac_mount_check_mount_late(vfs_context_t ctx, struct mount *mp)
+{
+       kauth_cred_t cred;
+       int error;
+
+#if SECURITY_MAC_CHECK_ENFORCE
+       /* 21167099 - only check if we allow write */
+       if (!mac_vnode_enforce) {
+               return 0;
+       }
+#endif
+       cred = vfs_context_ucred(ctx);
+       if (!mac_cred_check_enforce(cred)) {
+               return 0;
+       }
+       VFS_KERNEL_DEBUG_START1(78, mp);
+       MAC_CHECK(mount_check_mount_late, cred, mp);
+       VFS_KERNEL_DEBUG_END1(78, mp);
 
        return error;
 }
@@ -2075,7 +2308,9 @@ mac_mount_check_snapshot_create(vfs_context_t ctx, struct mount *mp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(79, mp);
        MAC_CHECK(mount_check_snapshot_create, cred, mp, name);
+       VFS_KERNEL_DEBUG_END1(79, mp);
        return error;
 }
 
@@ -2096,7 +2331,9 @@ mac_mount_check_snapshot_delete(vfs_context_t ctx, struct mount *mp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(80, mp);
        MAC_CHECK(mount_check_snapshot_delete, cred, mp, name);
+       VFS_KERNEL_DEBUG_END1(80, mp);
        return error;
 }
 
@@ -2117,7 +2354,9 @@ mac_mount_check_snapshot_revert(vfs_context_t ctx, struct mount *mp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(81, mp);
        MAC_CHECK(mount_check_snapshot_revert, cred, mp, name);
+       VFS_KERNEL_DEBUG_END1(81, mp);
        return error;
 }
 
@@ -2137,7 +2376,9 @@ mac_mount_check_remount(vfs_context_t ctx, struct mount *mp)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(82, mp);
        MAC_CHECK(mount_check_remount, cred, mp, mp->mnt_mntlabel);
+       VFS_KERNEL_DEBUG_END1(82, mp);
 
        return error;
 }
@@ -2158,7 +2399,9 @@ mac_mount_check_umount(vfs_context_t ctx, struct mount *mp)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(83, mp);
        MAC_CHECK(mount_check_umount, cred, mp, mp->mnt_mntlabel);
+       VFS_KERNEL_DEBUG_END1(83, mp);
 
        return error;
 }
@@ -2180,7 +2423,9 @@ mac_mount_check_getattr(vfs_context_t ctx, struct mount *mp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(84, mp);
        MAC_CHECK(mount_check_getattr, cred, mp, mp->mnt_mntlabel, vfa);
+       VFS_KERNEL_DEBUG_END1(84, mp);
        return error;
 }
 
@@ -2201,7 +2446,9 @@ mac_mount_check_setattr(vfs_context_t ctx, struct mount *mp,
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(85, mp);
        MAC_CHECK(mount_check_setattr, cred, mp, mp->mnt_mntlabel, vfa);
+       VFS_KERNEL_DEBUG_END1(85, mp);
        return error;
 }
 
@@ -2221,7 +2468,9 @@ mac_mount_check_stat(vfs_context_t ctx, struct mount *mount)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(86, mount);
        MAC_CHECK(mount_check_stat, cred, mount, mount->mnt_mntlabel);
+       VFS_KERNEL_DEBUG_END1(86, mount);
 
        return error;
 }
@@ -2242,7 +2491,9 @@ mac_mount_check_label_update(vfs_context_t ctx, struct mount *mount)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(87, mount);
        MAC_CHECK(mount_check_label_update, cred, mount, mount->mnt_mntlabel);
+       VFS_KERNEL_DEBUG_END1(87, mount);
 
        return error;
 }
@@ -2263,7 +2514,9 @@ mac_mount_check_fsctl(vfs_context_t ctx, struct mount *mp, u_int cmd)
        if (!mac_cred_check_enforce(cred)) {
                return 0;
        }
+       VFS_KERNEL_DEBUG_START1(88, mp);
        MAC_CHECK(mount_check_fsctl, cred, mp, mp->mnt_mntlabel, cmd);
+       VFS_KERNEL_DEBUG_END1(88, mp);
 
        return error;
 }
@@ -2279,8 +2532,10 @@ mac_devfs_label_associate_device(dev_t dev, struct devnode *de,
        }
 #endif
 
+       VFS_KERNEL_DEBUG_START1(89, de);
        MAC_PERFORM(devfs_label_associate_device, dev, de, de->dn_label,
            fullpath);
+       VFS_KERNEL_DEBUG_END1(89, de);
 }
 
 void
@@ -2294,8 +2549,10 @@ mac_devfs_label_associate_directory(const char *dirname, int dirnamelen,
        }
 #endif
 
+       VFS_KERNEL_DEBUG_START1(90, de);
        MAC_PERFORM(devfs_label_associate_directory, dirname, dirnamelen, de,
            de->dn_label, fullpath);
+       VFS_KERNEL_DEBUG_END1(90, de);
 }
 
 int
@@ -2369,18 +2626,21 @@ mac_vnode_label_associate_fdesc(struct mount *mp, struct fdescnode *fnp,
 
        error = 0;
 
+       VFS_KERNEL_DEBUG_START1(91, vp);
        /*
         * If no backing file, let the policy choose which label to use.
         */
        if (fnp->fd_fd == -1) {
                MAC_PERFORM(vnode_label_associate_file, vfs_context_ucred(ctx),
                    mp, mp->mnt_mntlabel, NULL, NULL, vp, vp->v_label);
+               VFS_KERNEL_DEBUG_END1(91, vp);
                return 0;
        }
 
        p = vfs_context_proc(ctx);
        error = fp_lookup(p, fnp->fd_fd, &fp, 0);
        if (error) {
+               VFS_KERNEL_DEBUG_END1(91, vp);
                return error;
        }
 
@@ -2395,7 +2655,12 @@ mac_vnode_label_associate_fdesc(struct mount *mp, struct fdescnode *fnp,
                if ((error = vnode_getwithref(fvp))) {
                        goto out;
                }
-               MAC_PERFORM(vnode_label_copy, fvp->v_label, vp->v_label);
+               if (fvp->v_label != NULL) {
+                       if (mac_label_vnodes != 0 && vp->v_label == NULL) {
+                               mac_vnode_label_init(vp); /* init dst label */
+                       }
+                       MAC_PERFORM(vnode_label_copy, fvp->v_label, vp->v_label);
+               }
                (void)vnode_put(fvp);
                break;
 #if CONFIG_MACF_SOCKET_SUBSET
@@ -2437,6 +2702,34 @@ mac_vnode_label_associate_fdesc(struct mount *mp, struct fdescnode *fnp,
                break;
        }
 out:
+       VFS_KERNEL_DEBUG_END1(91, vp);
        fp_drop(p, fnp->fd_fd, fp, 0);
        return error;
 }
+
+intptr_t
+mac_vnode_label_get(struct vnode *vp, int slot, intptr_t sentinel)
+{
+       struct label *l;
+
+       KASSERT(vp != NULL, ("mac_vnode_label_get: NULL vnode"));
+       l = vp->v_label;
+       if (l != NULL) {
+               return mac_label_get(l, slot);
+       } else {
+               return sentinel;
+       }
+}
+
+void
+mac_vnode_label_set(struct vnode *vp, int slot, intptr_t v)
+{
+       struct label *l;
+       KASSERT(vp != NULL, ("mac_vnode_label_set: NULL vnode"));
+       l = vp->v_label;
+       if (l == NULL) {
+               mac_vnode_label_init(vp);
+               l = vp->v_label;
+       }
+       mac_label_set(l, slot, v);
+}
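
A hypothetical consumer of the new accessors (a sketch; the cookie and slot handling are illustrative). Note that mac_vnode_label_set() materializes v_label on first use, while mac_vnode_label_get() falls back to the caller-supplied sentinel:

    static void
    example_tag_vnode(struct vnode *vp, int slot, intptr_t cookie)
    {
            mac_vnode_label_set(vp, slot, cookie);  /* allocates v_label if absent */
    }

    static intptr_t
    example_read_tag(struct vnode *vp, int slot)
    {
            /* -1 distinguishes "never labeled" from a stored 0 */
            return mac_vnode_label_get(vp, slot, (intptr_t)-1);
    }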
index 6f4c096c70fb83b37d260b33475b2932eb728a66..3fb5132d588b7b0b2f1c027756caaab01fa20411 100644 (file)
@@ -46,12 +46,17 @@ vnode_label(struct mount *mp, struct vnode *dvp, struct vnode *vp,
     struct componentname *cnp, int flags, vfs_context_t ctx)
 {
        int error = 0;
-
+       bool exit_fast;
 
        /* fast path checks... */
 
        /* are we labeling vnodes? If not still notify of create */
-       if (mac_label_vnodes == 0) {
+#if CONFIG_MACF_LAZY_VNODE_LABELS
+       exit_fast = true;
+#else
+       exit_fast = (mac_label_vnodes == 0);
+#endif
+       if (exit_fast) {
                if (flags & VNODE_LABEL_CREATE) {
                        error = mac_vnode_notify_create(ctx,
                            mp, dvp, vp, cnp);
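
With CONFIG_MACF_LAZY_VNODE_LABELS the fast path above always exits early: labels are no longer attached during lookup but materialized on first use via mac_vnode_label_allocate(). A sketch of the on-demand idiom (function name illustrative):

    static bool
    example_ensure_label(vnode_t vp)
    {
            /* returns the existing v_label, or allocates one on demand */
            return mac_vnode_label_allocate(vp) != NULL;
    }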
index 5f165b8b764ac5e3f8c32ac949212ebccc345b1e..78b2cfa4f8ba2b4cb822dc35b86d7f522944a528 100644 (file)
@@ -11,6 +11,7 @@ ENABLE_LTE_TESTS=YES
 OTHER_LTE_INCLUDE_FILES += \
        /System/Library/PrivateFrameworks/LoggingSupport.framework, \
        /System/Library/PrivateFrameworks/MobileKeyBag.framework, \
+       /System/Library/Frameworks/IOSurface.framework, \
        /usr/local/lib/libdarwintest_utils.dylib, \
        /usr/lib/libapple_crypto.dylib,
 
@@ -24,7 +25,7 @@ include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.common
 OTHER_CFLAGS  = -Weverything -Wno-gnu-union-cast -Wno-missing-field-initializers -Wno-partial-availability
 OTHER_CFLAGS += -Wno-missing-noreturn -Wno-vla -Wno-reserved-id-macro -Wno-documentation-unknown-command
 OTHER_CFLAGS += -Wno-padded -Wno-used-but-marked-unused -Wno-covered-switch-default -Wno-nullability-extension
-OTHER_CFLAGS += -Wno-gnu-empty-initializer -Wno-unused-macros -Wno-undef
+OTHER_CFLAGS += -Wno-gnu-empty-initializer -Wno-unused-macros -Wno-undef -Wno-fixed-enum-extension
 OTHER_CFLAGS += --std=gnu11 -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
 OTHER_CFLAGS += -UT_NAMESPACE_PREFIX -DT_NAMESPACE_PREFIX=xnu
 OTHER_CFLAGS += -F $(SDKROOT)/System/Library/PrivateFrameworks
@@ -56,10 +57,24 @@ backtracing: OTHER_LDFLAGS += -framework CoreSymbolication
 
 data_protection: OTHER_LDFLAGS += -ldarwintest_utils -framework IOKit
 
+immovable_send: excserver
+immovable_send: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT)
+immovable_send: OTHER_LDFLAGS += -ldarwintest_utils -lpthread -framework IOKit
+
+CUSTOM_TARGETS += immovable_send_client
+immovable_send: immovable_send_client
+
+immovable_send_client: immovable_send_client.c
+       $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) immovable_send_client.c -o $(SYMROOT)/immovable_send_client
+
+install-immovable_send_client: immovable_send_client
+       mkdir -p $(INSTALLDIR)
+       cp $(SYMROOT)/immovable_send_client $(INSTALLDIR)/
+
 kdebug: INVALID_ARCHS = i386
 kdebug: OTHER_LDFLAGS = -framework ktrace -ldarwintest_utils -framework kperf
 
-EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c
+EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c memorystatus_assertion_helpers.c
 
 ifneq ($(PLATFORM),iPhoneOS)
 EXCLUDED_SOURCES += jumbo_va_spaces_28530648.c perf_compressor.c memorystatus_freeze_test.c
@@ -68,11 +83,19 @@ endif
 perf_compressor: OTHER_LDFLAGS += -ldarwintest_utils
 perf_compressor: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist
 
+memorystatus_freeze_test: CODE_SIGN_ENTITLEMENTS=./task_for_pid_entitlement.plist
 memorystatus_freeze_test: OTHER_LDFLAGS += -ldarwintest_utils
+memorystatus_freeze_test: OTHER_CFLAGS += -ldarwintest_utils memorystatus_assertion_helpers.c
 
-stackshot: OTHER_CFLAGS += -Wno-objc-messaging-id
-stackshot: OTHER_LDFLAGS += -lkdd -framework Foundation
-stackshot: INVALID_ARCHS = i386
+memorystatus_is_assertion: OTHER_LDFLAGS += -ldarwintest_utils
+memorystatus_is_assertion: OTHER_CFLAGS += memorystatus_assertion_helpers.c
+
+shared_cache_tests: INVALID_ARCHS = i386
+shared_cache_tests: OTHER_LDFLAGS += -ldarwintest_utils
+
+stackshot_tests: OTHER_CFLAGS += -Wno-objc-messaging-id
+stackshot_tests: OTHER_LDFLAGS += -lkdd -ldarwintest_utils -framework Foundation
+stackshot_tests: INVALID_ARCHS = i386
 
 telemetry: OTHER_LDFLAGS = -framework ktrace -framework CoreFoundation
 telemetry: INVALID_ARCHS = i386
@@ -106,11 +129,45 @@ perf_exit: OTHER_LDFLAGS = -framework ktrace -ldarwintest_utils
 perf_exit: INVALID_ARCHS = i386
 perf_exit: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist
 
+CUSTOM_TARGETS += prioritize_process_launch_helper
+prioritize_process_launch: prioritize_process_launch_helper
+
+prioritize_process_launch_helper: prioritize_process_launch_helper.c
+       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) prioritize_process_launch_helper.c -o $(SYMROOT)/prioritize_process_launch_helper
+       echo $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; \
+       env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@;
+
+install-prioritize_process_launch_helper: prioritize_process_launch_helper
+       mkdir -p $(INSTALLDIR)
+       cp $(SYMROOT)/prioritize_process_launch_helper $(INSTALLDIR)/
+
+
 perf_spawn_fork: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist
 
+mach_exception_reply: OTHER_CFLAGS += -Wno-cast-align
+
 os_thread_self_restrict: os_thread_self_restrict.c os_thread_self_restrict-entitlements.plist
 os_thread_self_restrict: CODE_SIGN_ENTITLEMENTS=os_thread_self_restrict-entitlements.plist
 
+osptr: OTHER_CXXFLAGS += -I$(SRCROOT)/../libkern -std=c++98
+osptr: OTHER_CXXFLAGS += osptr_helper.cpp
+
+osptr_dumb: OTHER_CXXFLAGS += -I$(SRCROOT)/../libkern -std=c++17
+
+osptr_11: OTHER_CXXFLAGS += -I$(SRCROOT)/../libkern -std=c++11
+osptr_11: OTHER_CXXFLAGS += osptr_helper.cpp
+osptr_11: osptr.cpp
+       $(CXX) $(DT_CXXFLAGS) $(OTHER_CXXFLAGS) $(CXXFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
+
+osptr_17: OTHER_CXXFLAGS += -I$(SRCROOT)/../libkern -std=c++17
+osptr_17: OTHER_CXXFLAGS += osptr_helper.cpp
+osptr_17: osptr.cpp
+       $(CXX) $(DT_CXXFLAGS) $(OTHER_CXXFLAGS) $(CXXFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
+
+EXCLUDED_SOURCES += osptr_helper.cpp
+
+os_refcnt: OTHER_CFLAGS += -I$(SRCROOT)/../libkern/ -Wno-gcc-compat -Wno-undef -O3 -flto
+
 task_inspect: CODE_SIGN_ENTITLEMENTS = task_inspect.entitlements
 task_inspect: OTHER_CFLAGS += -DENTITLED=1
 
@@ -126,8 +183,6 @@ install-perf_exit_proc: perf_exit_proc
        mkdir -p $(INSTALLDIR)
        cp $(SYMROOT)/perf_exit_proc $(INSTALLDIR)/
 
-perf_kdebug: INVALID_ARCHS = i386
-
 stackshot_idle_25570396: INVALID_ARCHS = i386
 stackshot_idle_25570396: OTHER_LDFLAGS += -lkdd -framework Foundation
 
@@ -160,7 +215,7 @@ install-vm_set_max_addr_helper: vm_set_max_addr_helper
        cp $(SYMROOT)/vm_set_max_addr_helper $(INSTALLDIR)/
 
 ifeq ($(PLATFORM),iPhoneOS)
-OTHER_TEST_TARGETS += jumbo_va_spaces_28530648_unentitled
+OTHER_TEST_TARGETS += jumbo_va_spaces_28530648_unentitled vm_phys_footprint_legacy
 jumbo_va_spaces_28530648: CODE_SIGN_ENTITLEMENTS = jumbo_va_spaces_28530648.entitlements
 jumbo_va_spaces_28530648: OTHER_CFLAGS += -DENTITLED=1
 jumbo_va_spaces_28530648: OTHER_LDFLAGS += -ldarwintest_utils
@@ -168,6 +223,13 @@ jumbo_va_spaces_28530648: OTHER_LDFLAGS += -ldarwintest_utils
 jumbo_va_spaces_28530648_unentitled: OTHER_LDFLAGS += -ldarwintest_utils
 jumbo_va_spaces_28530648_unentitled: jumbo_va_spaces_28530648.c
        $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
+
+vm_phys_footprint_legacy: OTHER_LDFLAGS += -framework CoreFoundation -framework IOSurface
+vm_phys_footprint_legacy: OTHER_CFLAGS += -DLEGACY_FOOTPRINT_ENTITLED=1
+vm_phys_footprint_legacy: CODE_SIGN_ENTITLEMENTS=./legacy_footprint.entitlement
+vm_phys_footprint_legacy: vm_phys_footprint.c
+       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
+
 endif
 
 task_info_28439149: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist
@@ -177,11 +239,13 @@ proc_info: OTHER_LDFLAGS += -ldarwintest_utils
 
 proc_info_list_kthreads: CODE_SIGN_ENTITLEMENTS = ./proc_info_list_kthreads.entitlements
 
+proc_info_44873309: CODE_SIGN_ENTITLEMENTS = ./proc_info_44873309.entitlements
+
 disk_mount_conditioner: disk_mount_conditioner*
 disk_mount_conditioner: CODE_SIGN_ENTITLEMENTS=./disk_mount_conditioner-entitlements.plist
 disk_mount_conditioner: OTHER_LDFLAGS += -ldarwintest_utils
 
-OTHER_TEST_TARGETS += disk_mount_conditioner_unentitled
+disk_mount_conditioner: OTHER_TEST_TARGETS += disk_mount_conditioner_unentitled
 disk_mount_conditioner_unentitled: OTHER_CFLAGS += -DTEST_UNENTITLED
 disk_mount_conditioner_unentitled: OTHER_LDFLAGS += -ldarwintest_utils
 disk_mount_conditioner_unentitled: disk_mount_conditioner.c
@@ -200,6 +264,8 @@ thread_group_set_32261625: INVALID_ARCHS = i386
 
 task_info: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist
 
+task_vm_info_decompressions: INVALID_ARCHS = x86_64 i386
+
 socket_bind_35243417: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist
 socket_bind_35685803: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist
 
@@ -209,8 +275,14 @@ ifneq (osx,$(TARGET_NAME))
 EXCLUDED_SOURCES += no32exec_35914211.c no32exec_35914211_helper.c
 endif
 
-no32exec_35914211_helper:  INVALID_ARCHS = x86_64
-no32exec_35914211:  INVALID_ARCHS = i386
+no32exec_35914211_helper: INVALID_ARCHS = x86_64
+no32exec_35914211_helper_binprefs:
+       $(CC) $(OTHER_CFLAGS) $(CFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) -ldarwintest -arch i386 -arch x86_64 \
+               no32exec_35914211_helper_binprefs.c -o $(SYMROOT)/no32exec_35914211_helper_binprefs
+
+no32exec_35914211: INVALID_ARCHS = i386
+no32exec_35914211: no32exec_35914211_helper
+no32exec_35914211: no32exec_35914211_helper_binprefs
 
 MIG:=SDKROOT=$(SDKROOT) $(shell xcrun -sdk "$(TARGETSDK)" -find mig)
 
@@ -227,6 +299,9 @@ install-excserver: ;
 exc_resource_threads: excserver
 exc_resource_threads: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT)
 
+fp_exception: excserver
+fp_exception: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT)
+
 ifneq (osx,$(TARGET_NAME))
 EXCLUDED_SOURCES += ldt_code32.s ldt.c
 else
@@ -239,7 +314,8 @@ $(OBJROOT)/ldt_mach_exc_server.c:
 
 ldt: INVALID_ARCHS = i386
 ldt: $(OBJROOT)/ldt_mach_exc_server.c
-ldt: OTHER_CFLAGS += -I $(OBJROOT) $(SRCROOT)/ldt_code32.s -Wl,-pagezero_size,0x1000
+ldt: OTHER_CFLAGS += -I $(OBJROOT) $(SRCROOT)/ldt_code32.s -Wl,-pagezero_size,0x1000 -Wno-missing-variable-declarations
+ldt: CODE_SIGN_ENTITLEMENTS=ldt_entitlement.plist
 endif
 
 ifneq ($(PLATFORM),BridgeOS)
@@ -249,8 +325,9 @@ remote_time: INVALID_ARCHS = armv7 armv7s arm64_32
 endif
 
 vm_phys_footprint: OTHER_LDFLAGS += -framework CoreFoundation -framework IOSurface
-vm_phys_footprint_legacy: legacy_footprint.entitlement
-vm_phys_footprint_legacy: OTHER_LDFLAGS += -framework CoreFoundation -framework IOSurface
-vm_phys_footprint_legacy: CODE_SIGN_ENTITLEMENTS=./legacy_footprint.entitlement
+
+debug_control_port_for_pid: CODE_SIGN_ENTITLEMENTS = ./debug_control_port_for_pid_entitlement.plist
+
+prng: OTHER_LDFLAGS += -ldarwintest_utils
 
 include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets
index 345361957acc08746f5484856dd88eebc65e88c1..44c0a9ed143864e1b17118f9ef9869b77c1b379c 100644 (file)
@@ -16,7 +16,8 @@
 
 T_GLOBAL_META(
        T_META_NAMESPACE("xnu.intel"),
-       T_META_CHECK_LEAKS(false)
+       T_META_CHECK_LEAKS(false),
+       T_META_RUN_CONCURRENTLY(true)
        );
 
 #define NORMAL_RUN_TIME  (10)
index 379960766a58e8c49ce727b4ab6fdb439a29c4ce..f0af5447daf208709afdda6ed447952159dee507 100644 (file)
@@ -9,15 +9,17 @@
 #include <sys/mman.h>
 #include <sys/sysctl.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define USER_FRAMES (12)
 
-#define NON_RECURSE_FRAMES (5)
+#define NON_RECURSE_FRAMES (4)
 
 static const char *user_bt[USER_FRAMES] = {
-       NULL, NULL,
+       NULL,
        "backtrace_thread",
        "recurse_a", "recurse_b", "recurse_a", "recurse_b",
-       "recurse_a", "recurse_b", "recurse_a",
+       "recurse_a", "recurse_b", "recurse_a", "recurse_b",
        "expect_stack", NULL
 };
 
@@ -28,13 +30,15 @@ expect_frame(const char **bt, unsigned int bt_len, CSSymbolRef symbol,
        const char *name;
        unsigned int frame_idx = max_frames - bt_idx - 1;
 
-       if (bt[frame_idx] == NULL) {
-               T_LOG("frame %2u: skipping system frame", frame_idx);
+       if (CSIsNull(symbol)) {
+               T_FAIL("invalid symbol for address %#lx at frame %d", addr,
+                   frame_idx);
                return;
        }
 
-       if (CSIsNull(symbol)) {
-               T_FAIL("invalid symbol for address %#lx at frame %d", addr, frame_idx);
+       if (bt[frame_idx] == NULL) {
+               T_LOG("frame %2u: skipping system frame %s", frame_idx,
+                   CSSymbolGetName(symbol));
                return;
        }
 
diff --git a/tests/coalition_info.c b/tests/coalition_info.c
new file mode 100644 (file)
index 0000000..14ce533
--- /dev/null
@@ -0,0 +1,64 @@
+#include <darwintest.h>
+#include <inttypes.h>
+#include <mach/coalition.h>
+#include <stdint.h>
+#include <sys/coalition.h>
+#include <sys/sysctl.h>
+#include <libproc.h>
+#include <unistd.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+static void
+skip_if_monotonic_unsupported(void)
+{
+       int r;
+       int supported = 0;
+       size_t supported_size = sizeof(supported);
+
+       r = sysctlbyname("kern.monotonic.supported", &supported, &supported_size,
+           NULL, 0);
+       if (r < 0) {
+               T_WITH_ERRNO;
+               T_SKIP("could not find \"kern.monotonic.supported\" sysctl");
+       }
+
+       if (!supported) {
+               T_SKIP("monotonic is not supported on this platform");
+       }
+}
+
+T_DECL(coalition_resource_info_counters,
+    "ensure that coalition resource info produces valid counter data")
+{
+       skip_if_monotonic_unsupported();
+
+       struct proc_pidcoalitioninfo idinfo = {};
+       int ret = proc_pidinfo(getpid(), PROC_PIDCOALITIONINFO, 0,
+           &idinfo, sizeof(idinfo));
+       T_ASSERT_POSIX_SUCCESS(ret, "proc_pidinfo(... PROC_PIDCOALITIONINFO ...)");
+
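+	/* A task belongs to one coalition of each type; index the returned IDs
+	 * by COALITION_TYPE_RESOURCE to get the resource coalition's ID. */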
+       uint64_t resid = idinfo.coalition_id[COALITION_TYPE_RESOURCE];
+
+       struct coalition_resource_usage coalusage[2] = {};
+       ret = coalition_info_resource_usage(resid, &coalusage[0],
+           sizeof(coalusage[0]));
+       T_ASSERT_POSIX_SUCCESS(ret, "coalition_info_resource_usage()");
+       T_EXPECT_GT(coalusage[0].cpu_instructions, UINT64_C(0),
+           "instruction count is non-zero");
+       T_EXPECT_GT(coalusage[0].cpu_cycles, UINT64_C(0),
+           "cycle count is non-zero");
+
+       sleep(1);
+
+       ret = coalition_info_resource_usage(resid, &coalusage[1],
+           sizeof(coalusage[1]));
+       T_ASSERT_POSIX_SUCCESS(ret, "coalition_info_resource_usage()");
+
+       T_EXPECT_GE(coalusage[1].cpu_instructions, coalusage[0].cpu_instructions,
+           "instruction count is monotonically increasing (+%" PRIu64 ")",
+           coalusage[1].cpu_instructions - coalusage[0].cpu_instructions);
+       T_EXPECT_GE(coalusage[1].cpu_cycles, coalusage[0].cpu_cycles,
+           "cycle count is monotonically increasing (+%" PRIu64 ")",
+           coalusage[1].cpu_cycles - coalusage[0].cpu_cycles);
+}
index a3641bdbe7ff6d2317f80d6d250ec351a793a4ea..24a2c156cb99d6c42262c66884d07c1f686d213c 100644 (file)
@@ -27,6 +27,8 @@
 
 #include <os/tsd.h> /* private header for _os_cpu_number */
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 /* const variables aren't constants, but enums are */
 enum { max_threads = 40 };
 
index 7a7e4dc8a8f5fbd45607ef5a1e4e409f6c8a473e..bb0411decd154ef6e604e8bff78c277f10ff4696 100644 (file)
@@ -750,8 +750,8 @@ set_passcode(char * new_passcode, char * old_passcode)
        }
 
        char * const keystorectl_args[] = {
-               KEYSTORECTL_PATH,
-               "change-password",
+               KEYBAGDTEST_PATH,
+               "syspass",
                old_passcode,
                new_passcode,
                NULL
diff --git a/tests/debug_control_port_for_pid.c b/tests/debug_control_port_for_pid.c
new file mode 100644 (file)
index 0000000..6985908
--- /dev/null
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <mach/mach.h>
+#include <mach/mach_types.h>
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+T_DECL(debug_control_port_for_pid_success,
+    "Verify that with debug_port entitlement you can call debug_control_port_for_pid",
+    T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
+{
+       if (geteuid() != 0) {
+               T_SKIP("test requires root privileges to run.");
+       }
+
+       mach_port_t port = MACH_PORT_NULL;
+       T_ASSERT_MACH_SUCCESS(debug_control_port_for_pid(mach_task_self(), 1, &port), "debug_control_port_for_pid");
+       T_EXPECT_NE(port, MACH_PORT_NULL, "debug_port");
+       mach_port_deallocate(mach_task_self(), port);
+}
diff --git a/tests/debug_control_port_for_pid_entitlement.plist b/tests/debug_control_port_for_pid_entitlement.plist
new file mode 100644 (file)
index 0000000..c1cadea
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+        <key>com.apple.private.debug_port</key>
+        <true/>
+</dict>
+</plist>
index 6c733f4518b9a384f90488140bc4cf241d8c76c7..4cc70598db30d924ed0ad49fd6c7031c3f618b7b 100644 (file)
@@ -24,7 +24,8 @@ static void perf_setup(char **path, int *fd);
 
 T_GLOBAL_META(
        T_META_NAMESPACE("xnu.vfs.dmc"),
-       T_META_ASROOT(true)
+       T_META_ASROOT(true),
+       T_META_RUN_CONCURRENTLY(true)
        );
 
 #pragma mark Entitled Tests
@@ -271,7 +272,8 @@ T_DECL(fsctl_set_nonroot,
 }
 
 T_DECL(fsctl_delays,
-    "Validate I/O delays when DMC is enabled")
+    "Validate I/O delays when DMC is enabled",
+    T_META_RUN_CONCURRENTLY(false))
 {
        char *path;
        int fd;
index 173a8ef825d589179692228ac2fb8f5782febc77..09caf8cda1127624adab223b149130c82b073ccf 100644 (file)
@@ -19,6 +19,8 @@
 
 #include <excserver.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 static dispatch_semaphore_t sync_sema;
 
 kern_return_t
diff --git a/tests/extended_getdirentries64.c b/tests/extended_getdirentries64.c
new file mode 100644 (file)
index 0000000..f30652a
--- /dev/null
@@ -0,0 +1,45 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <darwintest_multiprocess.h>
+
+#define PRIVATE 1
+#include "../bsd/sys/dirent.h"
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+ssize_t __getdirentries64(int fd, void *buf, size_t bufsize, off_t *basep);
+
+T_DECL(getdirentries64_extended, "check for GETDIRENTRIES64_EOF")
+{
+       char buf[GETDIRENTRIES64_EXTENDED_BUFSIZE];
+       getdirentries64_flags_t *flags;
+       ssize_t result;
+       off_t offset;
+       int fd;
+       bool eof = false;
+
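+	/* Per the test's contract: when the buffer is at least
+	 * GETDIRENTRIES64_EXTENDED_BUFSIZE bytes, the kernel reports status in
+	 * the final getdirentries64_flags_t word of the caller's buffer. */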
+       flags = (getdirentries64_flags_t *)(uintptr_t)(buf + sizeof(buf) -
+           sizeof(getdirentries64_flags_t));
+       fd = open("/", O_DIRECTORY | O_RDONLY);
+       T_ASSERT_POSIX_SUCCESS(fd, "open(/)");
+
+       for (;;) {
+               *flags = (getdirentries64_flags_t)~0;
+               result = __getdirentries64(fd, buf, sizeof(buf), &offset);
+               T_ASSERT_POSIX_SUCCESS(result, "__getdirentries64()");
+               T_ASSERT_LE((size_t)result, sizeof(buf) - sizeof(getdirentries64_flags_t),
+                   "The kernel should have left space for the flags");
+               T_ASSERT_NE(*flags, (getdirentries64_flags_t)~0,
+                   "The kernel should have returned status");
+               if (eof) {
+                       T_ASSERT_EQ(result, 0l, "At EOF, we really should be done");
+                       T_ASSERT_TRUE(*flags & GETDIRENTRIES64_EOF, "And EOF should still be set");
+                       T_END;
+               }
+               T_ASSERT_NE(result, 0l, "We're not at EOF, we should have an entry");
+               eof = (*flags & GETDIRENTRIES64_EOF);
+       }
+}
diff --git a/tests/fp_exception.c b/tests/fp_exception.c
new file mode 100644 (file)
index 0000000..5010d9f
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2019 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/**
+ * On devices that support it, this test ensures that a mach exception is
+ * generated when an ARMv8 floating point exception is triggered.
+ */
+#include <darwintest.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <mach/mach.h>
+#include <mach/thread_status.h>
+#include <mach/exception.h>
+#include <pthread.h>
+
+#if __has_feature(ptrauth_calls)
+#include <ptrauth.h>
+#endif
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+/* The bit to set in FPCR to enable the divide-by-zero floating point exception. */
+#define FPCR_DIV_EXC 0x200
+
+/* Whether we caught the EXC_ARITHMETIC mach exception or not. */
+static volatile bool mach_exc_caught = false;
+
+/**
+ * mach_exc_server() is a MIG-generated function that verifies the message
+ * that was received is indeed a mach exception and then calls
+ * catch_mach_exception_raise_state() to handle the exception.
+ */
+extern boolean_t mach_exc_server(mach_msg_header_t *, mach_msg_header_t *);
+
+/**
+ * This has to be defined for linking purposes, but it's unused in this test.
+ */
+kern_return_t
+catch_mach_exception_raise(
+       mach_port_t exception_port,
+       mach_port_t thread,
+       mach_port_t task,
+       exception_type_t type,
+       exception_data_t codes,
+       mach_msg_type_number_t code_count)
+{
+#pragma unused(exception_port, thread, task, type, codes, code_count)
+       T_FAIL("Triggered catch_mach_exception_raise() which shouldn't happen...");
+       __builtin_unreachable();
+}
+
+/**
+ * Called by mach_exc_server() to handle the exception. This will verify the
+ * exception is a floating point divide-by-zero exception and will then modify
+ * the thread state to move to the next instruction.
+ */
+kern_return_t
+catch_mach_exception_raise_state(
+       mach_port_t exception_port,
+       exception_type_t type,
+       exception_data_t codes,
+       mach_msg_type_number_t code_count,
+       int *flavor,
+       thread_state_t in_state,
+       mach_msg_type_number_t in_state_count,
+       thread_state_t out_state,
+       mach_msg_type_number_t *out_state_count)
+{
+#pragma unused(exception_port, type, codes, code_count, flavor, in_state, in_state_count, out_state, out_state_count)
+#ifdef __arm64__
+       T_LOG("Caught a mach exception!\n");
+
+       /* Floating point divide by zero should cause an EXC_ARITHMETIC exception. */
+       T_ASSERT_EQ(type, EXC_ARITHMETIC, "Caught an EXC_ARITHMETIC exception");
+
+	/* There should only be two code values. */
+       T_ASSERT_EQ(code_count, 2, "Two code values were provided with the mach exception");
+
+       /**
+        * The code values should be 64-bit since MACH_EXCEPTION_CODES was specified
+        * when setting the exception port.
+        */
+       uint64_t *codes_64 = (uint64_t*)codes;
+       T_LOG("Mach exception codes[0]: %#llx, codes[1]: %#llx\n", codes_64[0], codes_64[1]);
+
+       /* Verify that we're receiving 64-bit ARM thread state values. */
+       T_ASSERT_EQ(*flavor, ARM_THREAD_STATE64, "The thread state flavor is ARM_THREAD_STATE64");
+       T_ASSERT_EQ(in_state_count, ARM_THREAD_STATE64_COUNT, "The thread state count is ARM_THREAD_STATE64_COUNT");
+
+       /* Verify the exception is a floating point divide-by-zero exception. */
+       T_ASSERT_EQ(codes_64[0], EXC_ARM_FP_DZ, "The subcode is EXC_ARM_FP_DZ (floating point divide-by-zero)");
+
+       /**
+        * Increment the PC to the next instruction so the thread doesn't cause
+        * another exception when it resumes.
+        */
+       *out_state_count = in_state_count; /* size of state object in 32-bit words */
+       memcpy((void*)out_state, (void*)in_state, in_state_count * 4);
+       arm_thread_state64_t *state = (arm_thread_state64_t*)out_state;
+
+       void *pc = (void*)(arm_thread_state64_get_pc(*state) + 4);
+#if __has_feature(ptrauth_calls)
+       /* Have to sign the new PC value when pointer authentication is enabled. */
+       pc = ptrauth_sign_unauthenticated(pc, ptrauth_key_function_pointer, 0);
+#endif
+       arm_thread_state64_set_pc_fptr(*state, pc);
+
+       mach_exc_caught = true;
+#endif /* __arm64__ */
+
+       /* Return KERN_SUCCESS to tell the kernel to keep running the victim thread. */
+       return KERN_SUCCESS;
+}
+
+/**
+ * This has to be defined for linking purposes, but it's unused in this test.
+ */
+kern_return_t
+catch_mach_exception_raise_state_identity(
+       mach_port_t exception_port,
+       mach_port_t thread,
+       mach_port_t task,
+       exception_type_t type,
+       exception_data_t codes,
+       mach_msg_type_number_t code_count,
+       int *flavor,
+       thread_state_t in_state,
+       mach_msg_type_number_t in_state_count,
+       thread_state_t out_state,
+       mach_msg_type_number_t *out_state_count)
+{
+#pragma unused(exception_port, thread, task, type, codes, code_count, flavor, in_state, in_state_count, out_state, out_state_count)
+       T_FAIL("Triggered catch_mach_exception_raise_state_identity() which shouldn't happen...");
+       __builtin_unreachable();
+}
+
+/**
+ * Thread to handle the mach exception generated by the floating point exception.
+ *
+ * @param arg The exception port to wait for a message on.
+ */
+void *
+exc_server_thread(void *arg)
+{
+       mach_port_t exc_port = *(mach_port_t*)arg;
+
+       /**
+        * mach_msg_server_once is a helper function provided by libsyscall that
+        * handles creating mach messages, blocks waiting for a message on the
+        * exception port, calls mach_exc_server() to handle the exception, and
+        * sends a reply based on the return value of mach_exc_server().
+        */
+#define MACH_MSG_REPLY_SIZE 4096
+       kern_return_t kr = mach_msg_server_once(mach_exc_server, MACH_MSG_REPLY_SIZE, exc_port, 0);
+       T_ASSERT_MACH_SUCCESS(kr, "Received mach exception message");
+
+       pthread_exit((void*)0);
+       __builtin_unreachable();
+}
+
+T_DECL(armv8_fp_exception,
+    "Test that ARMv8 floating point exceptions generate mach exceptions.")
+{
+#ifndef __arm64__
+       T_SKIP("Running on non-arm64 target, skipping...");
+#else
+       pthread_t exc_thread;
+       mach_port_t exc_port = MACH_PORT_NULL;
+       mach_port_t task = mach_task_self();
+       mach_port_t thread = mach_thread_self();
+       kern_return_t kr = KERN_SUCCESS;
+
+       /* Attempt to enable Divide-by-Zero floating point exceptions in hardware. */
+       uint64_t fpcr = __builtin_arm_rsr64("FPCR") | FPCR_DIV_EXC;
+       __builtin_arm_wsr64("FPCR", fpcr);
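+	/* Barrier after the FPCR write so the new trap setting is in effect
+	 * before the read-back check and the faulting fdiv below. */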
+#define DSB_ISH 0xb
+       __builtin_arm_dsb(DSB_ISH);
+
+       /* Devices that don't support floating point exceptions have FPCR as RAZ/WI. */
+       if (__builtin_arm_rsr64("FPCR") != fpcr) {
+               T_SKIP("Running on a device that doesn't support floating point exceptions, skipping...");
+       }
+
+       /* Create the mach port the exception messages will be sent to. */
+       kr = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &exc_port);
+       T_ASSERT_MACH_SUCCESS(kr, "Allocated mach exception port");
+
+       /**
+        * Insert a send right into the exception port that the kernel will use to
+        * send the exception thread the exception messages.
+        */
+       kr = mach_port_insert_right(task, exc_port, exc_port, MACH_MSG_TYPE_MAKE_SEND);
+       T_ASSERT_MACH_SUCCESS(kr, "Inserted a SEND right into the exception port");
+
+       /* Tell the kernel what port to send EXC_ARITHMETIC exceptions to. */
+       kr = thread_set_exception_ports(
+               thread,
+               EXC_MASK_ARITHMETIC,
+               exc_port,
+               EXCEPTION_STATE | MACH_EXCEPTION_CODES,
+               ARM_THREAD_STATE64);
+       T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler");
+
+       /* Spawn the exception server's thread. */
+       int err = pthread_create(&exc_thread, (pthread_attr_t*)0, exc_server_thread, (void*)&exc_port);
+       T_ASSERT_POSIX_ZERO(err, "Spawned exception server thread");
+
+       /* No need to wait for the exception server to be joined when it exits. */
+       pthread_detach(exc_thread);
+
+       /**
+        * This should cause a floating point divide-by-zero exception to get triggered.
+        *
+        * The kernel shouldn't resume this thread until the mach exception is handled
+        * by the exception server that was just spawned. The exception handler will
+        * explicitly increment the PC += 4 to move to the next instruction.
+        */
+       float a = 6.5f;
+       float b = 0.0f;
+       __asm volatile ("fdiv %s0, %s1, %s2" : "=w" (a) : "w" (a), "w" (b));
+
+       if (mach_exc_caught) {
+               T_PASS("The expected floating point divide-by-zero exception was caught!");
+       } else {
+               T_FAIL("The floating point divide-by-zero exception was not captured :(");
+       }
+#endif /* __arm64__ */
+}
index 815abe79e5a986cfd059c158a8353c6bb5e440a6..872585d26f7b3fbe9061939c5c10d8a01a9d0b66 100644 (file)
@@ -31,6 +31,8 @@
 #include <signal.h>
 #include <unistd.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 T_DECL(waitpid_nohang, "FreeBSDarwin--waitpid_nohang")
 {
        pid_t child, pid;
index e2f792b4cbcbcae83e4d4701be89d29131ab0694..e10e939ef1fb5b9cc3e04edb3032e97bac99efda 100644 (file)
@@ -4,6 +4,8 @@
 
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 extern int __gettimeofday(struct timeval *, struct timezone *);
 
 T_DECL(gettimeofday, "gettimeofday()",
index 27809e7478c0ca4a1c174e452eabb8d445664293..b26ade3a031b80de2016b2f21280d81b6fe3d984 100644 (file)
@@ -7,6 +7,8 @@
 #include <darwintest.h>
 #include <stdlib.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #if !defined(CS_OPS_CLEARPLATFORM)
 #define CS_OPS_CLEARPLATFORM 13
 #endif
diff --git a/tests/immovable_rights.c b/tests/immovable_rights.c
new file mode 100644 (file)
index 0000000..bc484d8
--- /dev/null
@@ -0,0 +1,51 @@
+#include <darwintest.h>
+#include <servers/bootstrap.h>
+#include <mach/mach.h>
+#include <mach/message.h>
+#include <stdlib.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+#include <mach/port.h>
+#include <mach/mach_port.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+T_DECL(immovable_rights, "Create a port with immovable receive rights") {
+       mach_port_t imm_port;
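+	/* MPO_IMMOVABLE_RECEIVE pins the receive right to this task at creation;
+	 * MPO_CONTEXT_AS_GUARD turns the context value (0x10 below) into a guard. */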
+       mach_port_options_t opts = {
+               .flags = MPO_CONTEXT_AS_GUARD | MPO_IMMOVABLE_RECEIVE
+       };
+       kern_return_t kr;
+
+       kr = mach_port_construct(mach_task_self(), &opts, 0x10, &imm_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct");
+
+       mach_port_status_t status;
+       mach_msg_type_number_t status_size = MACH_PORT_RECEIVE_STATUS_COUNT;
+       kr = mach_port_get_attributes(mach_task_self(), imm_port,
+           MACH_PORT_RECEIVE_STATUS, (mach_port_info_t)&status, &status_size);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_get_attributes");
+       T_LOG("Status flags %d", status.mps_flags);
+       T_ASSERT_NE(0, (status.mps_flags & MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE), "Imm rcv bit is set");
+
+       mach_port_t imm_port2;
+       mach_port_options_t opts2 = {};
+
+       kr = mach_port_construct(mach_task_self(), &opts2, 0, &imm_port2);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct");
+
+       kr = mach_port_guard_with_flags(mach_task_self(), imm_port2, 0x11, (uint64_t)MPG_IMMOVABLE_RECEIVE);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_guard_with_flags");
+
+       kr = mach_port_get_attributes(mach_task_self(), imm_port2,
+           MACH_PORT_RECEIVE_STATUS, (mach_port_info_t)&status, &status_size);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_get_attributes");
+       T_LOG("Status flags %d", status.mps_flags);
+       T_ASSERT_NE(0, (status.mps_flags & MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE), "Imm rcv bit is set");
+
+       kr = mach_port_swap_guard(mach_task_self(), imm_port2, 0x11, 0xde18);
+       T_ASSERT_MACH_SUCCESS(kr, "mach_port_swap_guard");
+
+       kr = mach_port_unguard(mach_task_self(), imm_port2, 0xde18);
+       T_ASSERT_MACH_SUCCESS(kr, "mach_port_unguard");
+}
diff --git a/tests/immovable_send.c b/tests/immovable_send.c
new file mode 100644 (file)
index 0000000..2e1f90d
--- /dev/null
@@ -0,0 +1,215 @@
+#include <darwintest.h>
+#include <servers/bootstrap.h>
+#include <mach/mach.h>
+#include <mach/message.h>
+#include <stdlib.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+#include <darwintest_multiprocess.h>
+#include <excserver.h>
+#include <spawn.h>
+#include <spawn_private.h>
+#include <libproc_internal.h>
+#include <signal.h>
+
+#include <IOKit/IOKitLib.h>
+
+#define TASK_EXC_GUARD_MP_DELIVER 0x10
+#define MAX_ARGV 2
+
+extern char **environ;
+
+kern_return_t
+catch_mach_exception_raise_state(mach_port_t exception_port,
+    exception_type_t exception,
+    const mach_exception_data_t code,
+    mach_msg_type_number_t code_count,
+    int * flavor,
+    const thread_state_t old_state,
+    mach_msg_type_number_t old_state_count,
+    thread_state_t new_state,
+    mach_msg_type_number_t * new_state_count)
+{
+#pragma unused(exception_port, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count)
+       T_FAIL("Unsupported catch_mach_exception_raise_state");
+       return KERN_NOT_SUPPORTED;
+}
+
+kern_return_t
+catch_mach_exception_raise_state_identity(mach_port_t exception_port,
+    mach_port_t thread,
+    mach_port_t task,
+    exception_type_t exception,
+    mach_exception_data_t code,
+    mach_msg_type_number_t code_count,
+    int * flavor,
+    thread_state_t old_state,
+    mach_msg_type_number_t old_state_count,
+    thread_state_t new_state,
+    mach_msg_type_number_t * new_state_count)
+{
+#pragma unused(exception_port, thread, task, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count)
+       T_FAIL("Unsupported catch_mach_exception_raise_state_identity");
+       return KERN_NOT_SUPPORTED;
+}
+
+kern_return_t
+catch_mach_exception_raise(mach_port_t exception_port,
+    mach_port_t thread,
+    mach_port_t task,
+    exception_type_t exception,
+    mach_exception_data_t code,
+    mach_msg_type_number_t code_count)
+{
+#pragma unused(exception_port, task, thread, code_count)
+       T_ASSERT_EQ(exception, EXC_GUARD, "exception type");
+	T_LOG("Exception raised with exception code: %llx\n", *code);
+       T_END;
+       return KERN_SUCCESS;
+}
+
+typedef struct {
+       mach_msg_header_t   header;
+       mach_msg_body_t     body;
+       mach_msg_port_descriptor_t port_descriptor;
+       mach_msg_trailer_t  trailer;            // subtract this when sending
+} ipc_complex_message;
+
+struct args {
+       char *server_port_name;
+       mach_port_t server_port;
+};
+
+void parse_args(struct args *args);
+void server_setup(struct args* args);
+void* exception_server_thread(void *arg);
+mach_port_t create_exception_port(void);
+
+#define TEST_TIMEOUT    10
+
+void
+parse_args(struct args *args)
+{
+       args->server_port_name = "TEST_IMMOVABLE_SEND";
+       args->server_port = MACH_PORT_NULL;
+}
+
+/* Create a mach IPC listener which will respond to the client's message */
+void
+server_setup(struct args *args)
+{
+       kern_return_t ret;
+       mach_port_t bsport;
+
+       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE,
+           &args->server_port);
+       T_ASSERT_MACH_SUCCESS(ret, "server: mach_port_allocate()");
+
+       ret = mach_port_insert_right(mach_task_self(), args->server_port, args->server_port,
+           MACH_MSG_TYPE_MAKE_SEND);
+       T_ASSERT_MACH_SUCCESS(ret, "server: mach_port_insert_right()");
+
+       ret = task_get_bootstrap_port(mach_task_self(), &bsport);
+       T_ASSERT_MACH_SUCCESS(ret, "server: task_get_bootstrap_port()");
+
+       ret = bootstrap_register(bsport, args->server_port_name, args->server_port);
+       T_ASSERT_MACH_SUCCESS(ret, "server: bootstrap_register()");
+
+       T_LOG("server: waiting for IPC messages from client on port '%s'.\n",
+           args->server_port_name);
+}
+
+mach_port_t
+create_exception_port()
+{
+       kern_return_t kret;
+       mach_port_t exc_port = MACH_PORT_NULL;
+       mach_port_t task = mach_task_self();
+
+       kret = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &exc_port);
+       T_EXPECT_MACH_SUCCESS(kret, "mach_port_allocate exc_port");
+
+       kret = mach_port_insert_right(task, exc_port, exc_port, MACH_MSG_TYPE_MAKE_SEND);
+       T_EXPECT_MACH_SUCCESS(kret, "mach_port_insert_right exc_port");
+
+       return exc_port;
+}
+
+void *
+exception_server_thread(void *arg)
+{
+       kern_return_t kr;
+       mach_port_t exc_port = *(mach_port_t *)arg;
+       T_EXPECT_NE(exc_port, MACH_PORT_NULL, "exception port is not null");
+
+       /* Handle exceptions on exc_port */
+       kr = mach_msg_server(mach_exc_server, 4096, exc_port, 0);
+       T_EXPECT_MACH_SUCCESS(kr, "mach_msg_server");
+
+       return NULL;
+}
+
+T_DECL(catch_exception, "Send guard port descriptor to another process", T_META_IGNORECRASHES(".*immovable_send_client.*"))
+{
+       uint32_t task_exc_guard = 0;
+	size_t te_size = sizeof(task_exc_guard);
+       kern_return_t kr;
+       mach_msg_type_number_t  maskCount = 1;
+       exception_mask_t        mask;
+       exception_handler_t     handler;
+       exception_behavior_t    behavior;
+       thread_state_flavor_t   flavor;
+       mach_port_t             task = mach_task_self();
+       struct args*            server_args = (struct args*)malloc(sizeof(struct args));
+       posix_spawnattr_t       attrs;
+       char *test_prog_name = "./immovable_send_client";
+       char *child_args[MAX_ARGV];
+
+       T_LOG("Check if task_exc_guard exception has been enabled\n");
+	int sysctl_ret = sysctlbyname("kern.task_exc_guard_default", &task_exc_guard, &te_size, NULL, 0);
+	T_QUIET; T_EXPECT_POSIX_SUCCESS(sysctl_ret, "sysctlbyname(kern.task_exc_guard_default)");
+
+       /* Create the bootstrap port */
+       parse_args(server_args);
+       server_setup(server_args);
+
+       /* Create the exception port for the server */
+       mach_port_t exc_port = create_exception_port();
+       T_EXPECT_NOTNULL(exc_port, "Create a new exception port");
+
+       pthread_t s_exc_thread;
+
+       /* Create exception serving thread */
+       int ret = pthread_create(&s_exc_thread, NULL, exception_server_thread, &exc_port);
+       T_EXPECT_POSIX_SUCCESS(ret, "pthread_create exception_server_thread");
+
+       /* Get current exception ports */
+       kr = task_get_exception_ports(task, EXC_MASK_GUARD, &mask,
+           &maskCount, &handler, &behavior, &flavor);
+       T_EXPECT_MACH_SUCCESS(kr, "task_get_exception_ports");
+
+       /* Initialize posix_spawn attributes */
+       posix_spawnattr_init(&attrs);
+
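+	/* Route the child's EXC_GUARD exceptions to exc_port; the guard should
+	 * fire when the child tries to move an immovable send right. */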
+       int err = posix_spawnattr_setexceptionports_np(&attrs, EXC_MASK_GUARD, exc_port,
+           (exception_behavior_t) (EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES), 0);
+	T_EXPECT_POSIX_SUCCESS(err, "posix_spawnattr_setexceptionports_np");
+
+       child_args[0] = test_prog_name;
+       child_args[1] = NULL;
+
+       err = posix_spawn(NULL, child_args[0], NULL, &attrs, &child_args[0], environ);
+       T_EXPECT_POSIX_SUCCESS(err, "posix_spawn immovable_send_client");
+
+       int child_status;
+       /* Wait for child and check for exception */
+       if (-1 == wait4(-1, &child_status, 0, NULL)) {
+               T_FAIL("wait4: child mia");
+       }
+
+       if (WIFEXITED(child_status) && WEXITSTATUS(child_status)) {
+               T_LOG("Child exited with status = %x", child_status);
+       }
+
+       sigsuspend(0);
+}
diff --git a/tests/immovable_send_client.c b/tests/immovable_send_client.c
new file mode 100644 (file)
index 0000000..682cdcf
--- /dev/null
@@ -0,0 +1,130 @@
+#include <darwintest.h>
+#include <servers/bootstrap.h>
+#include <mach/mach.h>
+#include <mach/message.h>
+#include <stdlib.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+#include <darwintest_multiprocess.h>
+#include <IOKit/IOKitLib.h>
+
+typedef struct {
+       mach_msg_header_t   header;
+       mach_msg_body_t     body;
+       mach_msg_port_descriptor_t port_descriptor;
+       mach_msg_trailer_t  trailer;            // subtract this when sending
+} ipc_complex_message;
+
+static ipc_complex_message icm_request = {};
+
+struct args {
+       const char *progname;
+       int verbose;
+       int voucher;
+       int num_msgs;
+       const char *server_port_name;
+       mach_port_t server_port;
+       mach_port_t reply_port;
+       mach_port_t voucher_port;
+       int request_msg_size;
+       void *request_msg;
+       int reply_msg_size;
+       void *reply_msg;
+       mach_port_t sp_voucher_port;
+       uint32_t persona_id;
+       long client_pid;
+};
+
+static void
+parse_args(struct args *args)
+{
+       args->verbose = 0;
+       args->voucher = 0;
+       args->server_port_name = "TEST_IMMOVABLE_SEND";
+       args->server_port = MACH_PORT_NULL;
+       args->reply_port = MACH_PORT_NULL;
+       args->voucher_port = MACH_PORT_NULL;
+       args->num_msgs = 1;
+       args->request_msg_size = sizeof(ipc_complex_message) - sizeof(mach_msg_trailer_t);
+       //args->reply_msg_size = sizeof(ipc_complex_message2) - sizeof(mach_msg_trailer_t);
+       args->request_msg = &icm_request;
+       args->reply_msg = NULL;
+       args->client_pid = getpid();
+}
+
+int
+main()
+{
+       struct args client_args = {};
+       parse_args(&client_args);
+
+       /* Find the bootstrap port */
+       mach_port_t bsport;
+       kern_return_t ret = task_get_bootstrap_port(mach_task_self(), &bsport);
+       if (ret) {
+               mach_error("client: task_get_bootstrap_port()", ret);
+               exit(1);
+       }
+
+       printf("client: Look up bootstrap service port\n");
+       ret = bootstrap_look_up(bsport, client_args.server_port_name,
+           &client_args.server_port);
+       if (ret) {
+               mach_error("client: bootstrap_look_up()", ret);
+               exit(1);
+       }
+
+       printf("client: Look up the ioconnect service port to be sent\n");
+       io_service_t amfi = IO_OBJECT_NULL;
+       io_connect_t connect = IO_OBJECT_NULL;
+       IOReturn ioret;
+
+       amfi = IOServiceGetMatchingService(kIOMasterPortDefault, IOServiceMatching("AppleMobileFileIntegrity"));
+       if (amfi == IO_OBJECT_NULL) {
+               fprintf(stderr, "client: unable to find AppleMobileFileIntegrity service\n");
+               exit(1);
+       }
+       ioret = IOServiceOpen(amfi, mach_task_self(), 0, &connect);
+       if (ioret != kIOReturnSuccess) {
+               fprintf(stderr, "client: unable to open user client: 0x%x\n", ret);
+               exit(1);
+       }
+
+       printf("client: Found the matching io_connect port = %d\n", connect);
+
+       /* Construct the message */
+       mach_msg_header_t *request = (mach_msg_header_t *)client_args.request_msg;
+       request->msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, 0,
+           0, 0) | MACH_MSGH_BITS_COMPLEX;
+       request->msgh_size = (mach_msg_size_t)client_args.request_msg_size;
+       request->msgh_remote_port = client_args.server_port;
+       request->msgh_local_port = MACH_PORT_NULL;
+       request->msgh_id = 1;
+
+       ipc_complex_message *complexmsg = (ipc_complex_message *)request;
+       complexmsg->body.msgh_descriptor_count = 1;
+       complexmsg->port_descriptor.name = connect;
+       complexmsg->port_descriptor.disposition = MACH_MSG_TYPE_MOVE_SEND;
+       complexmsg->port_descriptor.type = MACH_MSG_PORT_DESCRIPTOR;
+
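+	/* The io_connect send right is not movable out of this task, so the
+	 * MOVE_SEND disposition above should fail the send with
+	 * MACH_SEND_INVALID_RIGHT (checked below). */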
+       mach_msg_option_t option = MACH_SEND_MSG;
+
+       printf("client: Sending request (expecting it to fail) \n");
+       mach_msg_return_t mret = mach_msg(request,
+           option,
+           (mach_msg_size_t)client_args.request_msg_size,
+           0,
+           MACH_PORT_NULL,
+           MACH_MSG_TIMEOUT_NONE,
+           MACH_PORT_NULL);
+
+       printf("client: mach_msg returned %x\n", mret);
+       if (mret != MACH_SEND_INVALID_RIGHT) {
+               mach_error("client: mach_msg", mret);
+               exit(1);
+       }
+
+       printf("It should never reach here\n");
+
+       return 0;
+}
diff --git a/tests/in_cksum_test.c b/tests/in_cksum_test.c
new file mode 100644 (file)
index 0000000..5731727
--- /dev/null
@@ -0,0 +1,235 @@
+/* <rdar://problem/49479689> arm64 os_cpu_in_cksum_mbuf sometimes incorrect with unaligned input buffer */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include <darwintest.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+extern uint32_t os_cpu_in_cksum(const void *, uint32_t, uint32_t);
+
+/****************************************************************/
+static void
+log_hexdump(const void *inp, size_t len)
+{
+       unsigned i, off = 0;
+       char buf[9 + 16 * 3 + 1];
+       for (i = 0; i < len; i++) {
+               if (i % 16 == 0) {
+                       off = (unsigned)snprintf(buf, sizeof(buf), "%08x:", i);
+               }
+               off += (unsigned)snprintf(buf + off, sizeof(buf) - off, " %02x", (((const uint8_t *)inp)[i]) & 0xff);
+               if (i % 16 == 15) {
+                       T_LOG("%s", buf);
+               }
+       }
+       if (len % 16) {
+               T_LOG("%s", buf);
+       }
+}
+
+/* I was going to use the reference implementation from RFC 1071 section 4.1,
+ * but then I saw the errata.  Hopefully this is dumb but correct, even
+ * though it is not particularly efficient.
+ */
+static uint16_t
+dumb_in_cksum(const uint8_t *buf, size_t len)
+{
+       uint32_t partial = 0;
+       while (len > 1) {
+               uint16_t val = buf[1];
+               val <<= 8;
+               val |= buf[0];
+               len -= 2;
+               buf += 2;
+               partial += val;
+               while ((val = partial >> 16)) {
+                       partial &= 0xffff;
+                       partial += val;
+               }
+       }
+       if (len) {
+               uint16_t val = buf[0];
+               partial += val;
+               while ((val = partial >> 16)) {
+                       partial &= 0xffff;
+                       partial += val;
+               }
+       }
+       return ~partial & 0xffff;
+}
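+/*
+ * Worked example of the fold above: 0xffff + 0x0001 gives the 32-bit partial
+ * 0x10000; adding the carry (partial >> 16) back in yields 0x0001, and the
+ * final one's-complement is ~0x0001 & 0xffff = 0xfffe.
+ */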
+
+/* Calculate a checksum divided into partial checksums */
+static uint16_t
+split_in_cksum(const uint8_t *buf, int nsegs, const uint32_t *seglens, const uint8_t *aligns, uint8_t *tmpbuf)
+{
+       uint32_t partial = 0;
+
+       for (int i = 0; i < nsegs; i++) {
+               /* Only the last segment can have an odd length */
+               assert((i + 1 == nsegs) || seglens[i] % 2 == 0);
+
+               /* Copy a segment into the tmpbuf with the requested alignment */
+               memcpy(tmpbuf + aligns[i], buf, seglens[i]);
+
+               partial = os_cpu_in_cksum(tmpbuf + aligns[i], seglens[i], partial);
+               buf += seglens[i];
+       }
+
+       return ~partial & 0xffff;
+}
+
+static void
+test_checksum(const uint8_t *data, uint32_t len)
+{
+       uint16_t dsum = dumb_in_cksum(data, len);
+
+       const uint8_t MAXALIGN = 8;
+
+       uint8_t tmpbuf[len + MAXALIGN];
+       uint32_t seglens[2];
+       uint8_t aligns[2];
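+	/* Exhaustively try every even split point and every pairwise alignment
+	 * in [0, MAXALIGN), comparing against the reference checksum. */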
+       for (uint16_t split = 0; split < len; split += 2) {
+               seglens[0] = split;
+               seglens[1] = len - split;
+               for (aligns[0] = 0; aligns[0] < MAXALIGN; aligns[0]++) {
+                       for (aligns[1] = 0; aligns[1] < MAXALIGN; aligns[1]++) {
+                               uint16_t osum = split_in_cksum(data, 2, seglens, aligns, tmpbuf);
+                               if (osum != dsum) {
+                                       /* hexdump packet and alignments for debugging */
+                                       log_hexdump(data, len);
+                                       T_LOG("len %d seg[0] %d seg[1] %d align[0] %d align[1] %d\n", len, seglens[0], seglens[1], aligns[0], aligns[1]);
+                               }
+                               T_QUIET; T_ASSERT_EQ(osum, dsum, "checksum mismatch got 0x%04x expecting 0x%04x", htons(osum), htons(dsum));
+                       }
+               }
+       }
+       T_PASS("OK len %d", len);
+}
+
+static void
+test_one_random_packet(uint32_t maxlen)
+{
+       /* Pick a packet length */
+       uint32_t len = arc4random_uniform(maxlen);
+       uint8_t data[len];
+       arc4random_buf(data, len);
+       test_checksum(data, len);
+}
+
+/*
+ * This is the checksummed portion of the first packet in checksum_error.pcap
+ * It is known to cause a problem at splits 44 and 46 with second alignment of 1 or 3
+ */
+static uint8_t pkt49479689[] = {
+/*00000000*/ 0xc0, 0xa8, 0x01, 0x06, 0xc0, 0xa8, 0x01, 0x07, 0x00, 0x06, 0x05, 0xc8, 0xcb, 0xf1, 0xc0, 0x24,  // |...............$|
+/*00000010*/ 0x2d, 0x23, 0x48, 0xd6, 0x3b, 0x44, 0x96, 0x7f, 0x80, 0x10, 0x20, 0x86, 0x00, 0x00, 0x00, 0x00,  // |-#H.;D.... ..,..|
+/*00000020*/ 0x01, 0x01, 0x08, 0x0a, 0x0c, 0xc4, 0x69, 0x3a, 0x31, 0x63, 0xb3, 0x37, 0x55, 0xe1, 0x62, 0x48,  // |......i:1c.7U.bH|
+/*00000030*/ 0xa4, 0xff, 0xff, 0xa0, 0xc5, 0xd9, 0x5d, 0xd2, 0x4d, 0xe4, 0xca, 0xd7, 0x83, 0x27, 0xcc, 0x90,  // |......].M....'..|
+/*00000040*/ 0x02, 0x26, 0x63, 0xd3, 0x02, 0x3c, 0xf1, 0x20, 0x15, 0xa6, 0x8b, 0xff, 0x98, 0x8d, 0x57, 0x2a,  // |.&c..<. ......W*|
+/*00000050*/ 0x06, 0x4b, 0x06, 0x49, 0x5d, 0x8a, 0x28, 0x66, 0xe6, 0x57, 0x71, 0xd9, 0x27, 0xd1, 0xb9, 0xd6,  // |.K.I].(f.Wq.'...|
+/*00000060*/ 0x20, 0x48, 0x13, 0x2e, 0xbf, 0x30, 0x8c, 0xce, 0x49, 0x99, 0x2a, 0xb7, 0x94, 0xa4, 0x3a, 0x8e,  // | H...0..I.*...:.|
+/*00000070*/ 0x35, 0xcc, 0x48, 0xb2, 0x7f, 0xe1, 0xca, 0x2f, 0x08, 0x49, 0x7f, 0x35, 0x61, 0xcf, 0x59, 0xa2,  // |5.H..../.I.5a.Y.|
+/*00000080*/ 0x3a, 0x5e, 0x10, 0x5a, 0x0a, 0xd7, 0xa2, 0x38, 0x64, 0xe1, 0x7c, 0x5d, 0xbd, 0x29, 0x65, 0x5a,  // |:^.Z...8d.|].)eZ|
+/*00000090*/ 0xf2, 0x14, 0x30, 0x51, 0x9b, 0x56, 0xbb, 0xe2, 0x04, 0x48, 0x04, 0x23, 0x53, 0x30, 0x3a, 0x0a,  // |..0Q.V...H.#S0:.|
+/*000000a0*/ 0x48, 0x5a, 0xdd, 0xe4, 0xd7, 0x5e, 0x5b, 0x5d, 0x90, 0x89, 0x7d, 0xf0, 0xad, 0x24, 0x1a, 0xa8,  // |HZ...^[]..}..$..|
+/*000000b0*/ 0x81, 0xc1, 0x6b, 0x11, 0x97, 0x68, 0xc0, 0xbb, 0xe4, 0x5c, 0xba, 0x1a, 0xe8, 0x9c, 0xc9, 0x8b,  // |..k..h...\......|
+/*000000c0*/ 0xb8, 0x2b, 0x11, 0x85, 0x7f, 0xbf, 0x19, 0x81, 0xb0, 0xfc, 0xfd, 0x4a, 0xac, 0x7b, 0xd3, 0x60,  // |.+.........J.{.`|
+/*000000d0*/ 0x44, 0x1f, 0x5e, 0x8d, 0x05, 0x6e, 0xd7, 0xd1, 0xef, 0x11, 0x84, 0xd3, 0x0d, 0x63, 0xcf, 0x56,  // |D.^..n.......c.V|
+/*000000e0*/ 0xf9, 0x27, 0xc4, 0xd0, 0x39, 0x0e, 0xac, 0x7e, 0xba, 0xb3, 0xb8, 0x9c, 0x21, 0x21, 0xc8, 0xa0,  // |.'..9..~....!!..|
+/*000000f0*/ 0xbc, 0xd8, 0x82, 0x6f, 0x81, 0xa6, 0xc2, 0xf5, 0xe0, 0xdb, 0x41, 0xd0, 0xd4, 0x18, 0x2a, 0x5b,  // |...o......A...*[|
+/*00000100*/ 0x93, 0x3d, 0x5a, 0x08, 0xe2, 0xac, 0x8d, 0xd3, 0x7d, 0xcc, 0x49, 0x33, 0xc9, 0xb8, 0x9e, 0x12,  // |.=Z.....}.I3....|
+/*00000110*/ 0x86, 0x63, 0x38, 0x9c, 0xce, 0x4a, 0xb7, 0xcc, 0xe9, 0x4b, 0x5e, 0xb5, 0x24, 0x42, 0x47, 0x28,  // |.c8..J...K^.$BG(|
+/*00000120*/ 0x1c, 0x09, 0xe8, 0x84, 0xa6, 0xf0, 0x5f, 0x03, 0x94, 0x6f, 0x6a, 0x18, 0x60, 0xc3, 0x12, 0x58,  // |......_..oj.`..X|
+/*00000130*/ 0x6c, 0xbe, 0x13, 0x85, 0xa4, 0xdf, 0xe1, 0x8c, 0x3a, 0x04, 0xe9, 0x56, 0xa3, 0x09, 0x41, 0xf1,  // |l.......:..V..A.|
+/*00000140*/ 0x70, 0xf5, 0xc4, 0x27, 0x8e, 0x18, 0x09, 0x56, 0x5f, 0x82, 0x08, 0xec, 0x84, 0x55, 0x3b, 0x58,  // |p..'...V_....U;X|
+/*00000150*/ 0x84, 0x7b, 0xc8, 0x63, 0x70, 0x6a, 0x83, 0x04, 0xc8, 0xff, 0xe7, 0x6a, 0xbc, 0xee, 0xc0, 0xfe,  // |.{.cpj.....j....|
+/*00000160*/ 0xef, 0x60, 0xb7, 0x04, 0xb5, 0x57, 0x53, 0x5b, 0xeb, 0x4d, 0xec, 0x22, 0xe8, 0x59, 0x22, 0x64,  // |.`...WS[.M.".Y"d|
+/*00000170*/ 0x20, 0x5a, 0x61, 0x7d, 0x92, 0x02, 0x80, 0xd0, 0x85, 0x56, 0x98, 0x75, 0xbe, 0x35, 0xaf, 0xe4,  // | Za}.....V.u.5..|
+/*00000180*/ 0xc3, 0x06, 0xfa, 0xc2, 0x29, 0xce, 0x80, 0xe2, 0x68, 0xf3, 0xd8, 0x4b, 0x72, 0x46, 0x6e, 0xa3,  // |....)...h..KrFn.|
+/*00000190*/ 0x88, 0x57, 0xfb, 0x08, 0xec, 0x60, 0x2f, 0x3c, 0xa4, 0xaf, 0x08, 0x64, 0x45, 0x16, 0xba, 0x7b,  // |.W...`/<...dE..{|
+/*000001a0*/ 0xad, 0x24, 0x7a, 0x1f, 0x53, 0x46, 0x0c, 0xe6, 0xe9, 0x99, 0xd7, 0x2b, 0x9d, 0x62, 0xd9, 0x4a,  // |.$z.SF.....+.b.J|
+/*000001b0*/ 0x80, 0x2a, 0x43, 0xc2, 0x78, 0xa6, 0x6b, 0x38, 0x8e, 0xc8, 0x40, 0x6b, 0x03, 0xe2, 0x47, 0x04,  // |.*C.x.k8..@k..G.|
+/*000001c0*/ 0xda, 0x08, 0x72, 0xf5, 0xbc, 0x66, 0x3f, 0x33, 0x4d, 0xb6, 0x26, 0xd0, 0x66, 0x8c, 0xa0, 0x70,  // |..r..f?3M.&.f..p|
+/*000001d0*/ 0x25, 0xbc, 0x68, 0xda, 0x02, 0x79, 0x89, 0xed, 0x0c, 0xfc, 0xe7, 0x3d, 0x15, 0xcf, 0x5e, 0xc9,  // |%.h..y.....=..^.|
+/*000001e0*/ 0x63, 0xe0, 0x64, 0xb1, 0xfb, 0x28, 0xf7, 0x29, 0x52, 0xcf, 0x7a, 0xe3, 0x6d, 0x46, 0xc5, 0x1a,  // |c.d..(.)R.z.mF..|
+/*000001f0*/ 0x71, 0x24, 0x4e, 0x12, 0x56, 0x86, 0xc7, 0xf5, 0x98, 0x3e, 0xa9, 0xbc, 0x5d, 0xe9, 0x22, 0x88,  // |q$N.V....>..].".|
+/*00000200*/ 0x9b, 0x61, 0xc4, 0xa2, 0xcc, 0x27, 0x54, 0x07, 0x88, 0xeb, 0xe1, 0x4e, 0xaa, 0x0a, 0xd6, 0x94,  // |.a...'T....N....|
+/*00000210*/ 0x83, 0x32, 0xf8, 0x1d, 0xff, 0x67, 0xe5, 0x63, 0x78, 0x04, 0x11, 0x24, 0x25, 0xd7, 0x22, 0x54,  // |.2...g.cx..$%."T|
+/*00000220*/ 0x73, 0x87, 0xc9, 0x53, 0x72, 0x51, 0xda, 0x24, 0x33, 0xd7, 0x5c, 0x40, 0x86, 0x77, 0xf9, 0xc2,  // |s..SrQ.$3.\@.w..|
+/*00000230*/ 0xeb, 0x7d, 0x4c, 0x72, 0xeb, 0xc9, 0x8b, 0xcc, 0x79, 0xcd, 0x4a, 0x5a, 0x9e, 0xe2, 0x83, 0x20,  // |.}Lr....y.JZ... |
+/*00000240*/ 0x19, 0x5b, 0x4b, 0xe6, 0x5c, 0xe2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x7b, 0x80, 0x69,  // |.[K.\........{.i|
+/*00000250*/ 0x29, 0x53, 0x97, 0xc2, 0xc9, 0x4c, 0x00, 0x00, 0x00, 0x00, 0x67, 0x75, 0x81, 0x80, 0x12, 0x6e,  // |)S...L....gu...n|
+/*00000260*/ 0x50, 0x66, 0xe9, 0x0a, 0x28, 0x3b, 0x1a, 0xf1, 0xcb, 0x46, 0x72, 0xf7, 0xe9, 0x9f, 0x84, 0x29,  // |Pf..(;...Fr....)|
+/*00000270*/ 0xb9, 0x95, 0xf9, 0x6d, 0x5d, 0x04, 0x51, 0x7f, 0x0e, 0xf0, 0xe4, 0x3d, 0x4b, 0xd2, 0xb2, 0xb5,  // |...m].Q....=K...|
+/*00000280*/ 0x51, 0xf0, 0x31, 0x8e, 0x55, 0x18, 0x54, 0xf7, 0xee, 0x03, 0x37, 0x07, 0x33, 0x43, 0x8b, 0x5a,  // |Q.1.U.T...7.3C.Z|
+/*00000290*/ 0x1d, 0x16, 0xe8, 0xc4, 0x8b, 0x2c, 0x8a, 0x01, 0x5c, 0x45, 0xc6, 0xd1, 0x9d, 0xa9, 0x0a, 0xe2,  // |.....,..\E......|
+/*000002a0*/ 0x15, 0x4b, 0x8b, 0x00, 0x84, 0xbf, 0x3d, 0xad, 0xed, 0x86, 0x8e, 0x5c, 0x76, 0xe9, 0xbe, 0x4b,  // |.K....=....\v..K|
+/*000002b0*/ 0xd5, 0xb5, 0xb0, 0x08, 0x7f, 0xd7, 0x71, 0x57, 0x44, 0x67, 0x31, 0x8b, 0x43, 0x7d, 0xf8, 0x5a,  // |......qWDg1.C}.Z|
+/*000002c0*/ 0xcd, 0xe6, 0x4c, 0xec, 0x89, 0xa5, 0xd1, 0x03, 0x86, 0xfd, 0x01, 0x7d, 0x22, 0x32, 0xf0, 0xc3,  // |..L........}"2..|
+/*000002d0*/ 0x23, 0x99, 0x8e, 0x69, 0x14, 0x54, 0x54, 0x03, 0xec, 0x27, 0x6a, 0x7d, 0x13, 0xc7, 0xe2, 0x39,  // |#..i.TT..'j}...9|
+/*000002e0*/ 0x2b, 0xc0, 0x1a, 0x70, 0x82, 0xe9, 0x80, 0x73, 0xf4, 0x27, 0x26, 0xca, 0x5c, 0xf6, 0x7f, 0x46,  // |+..p...s.'&.\..F|
+/*000002f0*/ 0xf7, 0x00, 0x58, 0x3c, 0x3a, 0xcc, 0x1e, 0x9b, 0xd2, 0x22, 0x78, 0x04, 0x23, 0xc6, 0xfb, 0xdf,  // |..X<:...."x.#...|
+/*00000300*/ 0x8b, 0x36, 0xd6, 0xfa, 0xd8, 0x53, 0xbd, 0x0e, 0xaf, 0x1a, 0x04, 0xd1, 0x81, 0xd6, 0x1f, 0x1a,  // |.6...S..........|
+/*00000310*/ 0x74, 0x4d, 0xcf, 0xf6, 0xcf, 0x61, 0x6c, 0xd9, 0x7f, 0x1e, 0xb3, 0x1c, 0x2e, 0x74, 0x1a, 0x37,  // |tM...al......t.7|
+/*00000320*/ 0xfa, 0x2a, 0x24, 0x6d, 0xc2, 0x6d, 0x54, 0xfb, 0xd7, 0x9b, 0x34, 0x87, 0xeb, 0xac, 0x38, 0xc7,  // |.*$m.mT...4...8.|
+/*00000330*/ 0xe3, 0xc9, 0x6a, 0x98, 0x04, 0x2b, 0x33, 0x2d, 0x87, 0xf4, 0x25, 0xd6, 0x64, 0x14, 0xe8, 0xd0,  // |..j..+3-..%.d...|
+/*00000340*/ 0x84, 0x18, 0xc0, 0x39, 0x4d, 0xb5, 0xe5, 0xe2, 0xdb, 0x74, 0x59, 0x52, 0xad, 0x91, 0x1a, 0x55,  // |...9M....tYR...U|
+/*00000350*/ 0xae, 0xa3, 0xe1, 0x73, 0x4e, 0x76, 0x14, 0x94, 0xab, 0xec, 0x69, 0xb7, 0x0c, 0xa3, 0x71, 0x14,  // |...sNv....i...q.|
+/*00000360*/ 0x04, 0xbf, 0xf9, 0x75, 0xca, 0x2b, 0x8a, 0xa4, 0x5b, 0xe6, 0xe8, 0x61, 0x8d, 0xad, 0x1a, 0x62,  // |...u.+..[..a...b|
+/*00000370*/ 0x97, 0xaa, 0xfa, 0x3f, 0x88, 0x75, 0xcd, 0xe7, 0x29, 0x66, 0xbd, 0xcf, 0x50, 0xfd, 0x10, 0x09,  // |...?.u..)f..P...|
+/*00000380*/ 0x45, 0x2e, 0x97, 0xd5, 0x7c, 0xb4, 0x12, 0x7a, 0x5f, 0xfc, 0x1c, 0x74, 0x02, 0xf0, 0xa7, 0x98,  // |E...|..z_..t....|
+/*00000390*/ 0xd2, 0x03, 0x86, 0x19, 0x08, 0x54, 0x3d, 0x4d, 0x88, 0x13, 0x88, 0x87, 0x26, 0x61, 0x3e, 0x88,  // |.....T=M....&a>.|
+/*000003a0*/ 0xf8, 0x18, 0xcc, 0xac, 0x6f, 0xec, 0x12, 0x57, 0xfe, 0x80, 0xa3, 0xbe, 0x04, 0x39, 0x52, 0xe0,  // |....o..W.....9R.|
+/*000003b0*/ 0xc3, 0xfa, 0xed, 0x4f, 0xf5, 0x07, 0x59, 0x7e, 0xfa, 0xb9, 0x35, 0x36, 0xf2, 0x55, 0x23, 0xab,  // |...O..Y~..56.U#.|
+/*000003c0*/ 0x15, 0x65, 0x57, 0xb2, 0xce, 0xdb, 0x63, 0xe0, 0x1f, 0x1f, 0xa5, 0xfa, 0x70, 0x2e, 0x53, 0x76,  // |.eW...c.....p.Sv|
+/*000003d0*/ 0x20, 0x5b, 0x54, 0xc2, 0x0f, 0xe9, 0xca, 0x2c, 0x82, 0xf1, 0x30, 0x61, 0xbb, 0x99, 0x1e, 0x2a,  // | [T....,..0a...*|
+/*000003e0*/ 0xa2, 0x71, 0x91, 0x39, 0x07, 0xda, 0xcd, 0x50, 0xbb, 0x73, 0x5b, 0xa4, 0x05, 0x26, 0xee, 0x9f,  // |.q.9...P.s[..&..|
+/*000003f0*/ 0x5e, 0x88, 0x72, 0x92, 0xc9, 0x60, 0x2b, 0xd7, 0x6a, 0x91, 0x40, 0x52, 0x6b, 0xd1, 0xab, 0x00,  // |^.r..`+.j.@Rk...|
+/*00000400*/ 0xcc, 0x60, 0x53, 0x9b, 0x36, 0x40, 0x3b, 0x60, 0x18, 0x7f, 0x5f, 0xc2, 0x8c, 0x44, 0x08, 0xae,  // |.`S.6@;`.._..D..|
+/*00000410*/ 0x95, 0xae, 0x8c, 0xd7, 0x8d, 0x68, 0x4a, 0x42, 0x64, 0x1d, 0xdf, 0xdc, 0x17, 0x1a, 0x28, 0xe0,  // |.....hJBd.....(.|
+/*00000420*/ 0x55, 0x35, 0x00, 0x65, 0xe4, 0xd4, 0xd7, 0x3e, 0x1c, 0x6a, 0xa1, 0xbf, 0xba, 0xd8, 0x29, 0xce,  // |U5.e...>.j....).|
+/*00000430*/ 0xa6, 0x1f, 0xf9, 0x06, 0xff, 0x70, 0x43, 0xc8, 0xa0, 0x49, 0x03, 0xcd, 0x19, 0xf2, 0x16, 0x01,  // |.....pC..I......|
+/*00000440*/ 0x46, 0xf0, 0x29, 0xdb, 0xc2, 0x85, 0x89, 0x20, 0x37, 0x91, 0xd3, 0x74, 0x1c, 0x38, 0x08, 0xb3,  // |F.).... 7..t.8..|
+/*00000450*/ 0xd5, 0xa3, 0x4c, 0x52, 0x6e, 0xb3, 0x24, 0xc0, 0xbc, 0xd6, 0xc6, 0x64, 0x0b, 0x40, 0x44, 0xc4,  // |..LRn.$....d.@D.|
+/*00000460*/ 0xb9, 0x11, 0x10, 0x2a, 0xcd, 0x43, 0x99, 0x47, 0xe9, 0xfb, 0xf0, 0xe0, 0x56, 0x13, 0x40, 0x41,  // |...*.C.G....V.@A|
+/*00000470*/ 0x8a, 0x41, 0xcc, 0x92, 0x8d, 0xd5, 0xb9, 0x47, 0x05, 0xc7, 0x72, 0x76, 0x02, 0x09, 0x05, 0xd9,  // |.A.....G..rv....|
+/*00000480*/ 0x12, 0xb6, 0xa8, 0x0a, 0x86, 0x28, 0x5c, 0x41, 0x7e, 0xf1, 0xbc, 0xa9, 0x93, 0xae, 0xdf, 0x0b,  // |.....(\A~.......|
+/*00000490*/ 0xa1, 0xfc, 0x47, 0xb5, 0xde, 0x1c, 0x25, 0xe9, 0x8b, 0xb2, 0x03, 0x3a, 0xa7, 0x36, 0x4e, 0xcb,  // |..G...%....:.6N.|
+/*000004a0*/ 0xfa, 0xcd, 0xe6, 0x4f, 0x67, 0x3f, 0xe2, 0xa3, 0x3d, 0xdb, 0x61, 0x0d, 0x99, 0x05, 0x15, 0x96,  // |...Og?..=.a.....|
+/*000004b0*/ 0x14, 0x4e, 0x89, 0xf7, 0x8b, 0xdd, 0x84, 0x48, 0x35, 0xa8, 0x5c, 0x73, 0x67, 0x5d, 0x55, 0x5d,  // |.N.....H5.\sg]U]|
+/*000004c0*/ 0xe2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x82, 0x80, 0x69, 0x29, 0x54, 0x97, 0xc2, 0xcd,  // |..........i)T...|
+/*000004d0*/ 0x4c, 0x00, 0x00, 0x00, 0x00, 0x44, 0x07, 0x64, 0xa1, 0x66, 0xe3, 0x3c, 0x6e, 0x51, 0x96, 0x6a,  // |L....D.d.f.<nQ.j|
+/*000004e0*/ 0x06, 0x8c, 0x08, 0x92, 0x24, 0x03, 0xe1, 0xc2, 0xce, 0x80, 0x56, 0x75, 0x78, 0xd3, 0xc8, 0x1d,  // |....$.....Vux...|
+/*000004f0*/ 0x52, 0xc6, 0x32, 0xbf, 0x89, 0x91, 0x1a, 0x81, 0x9f, 0x11, 0x69, 0xd6, 0x9b, 0x27, 0x20, 0x19,  // |R.2.......i..' .|
+/*00000500*/ 0x59, 0x12, 0x2d, 0x85, 0x7e, 0x3a, 0xed, 0xa9, 0xd7, 0x92, 0xa4, 0x2d, 0xce, 0x2f, 0xf0, 0xd4,  // |Y.-.~:.....-./..|
+/*00000510*/ 0x0e, 0xec, 0xe4, 0xd8, 0x0c, 0xaf, 0x1c, 0x28, 0xe8, 0x47, 0xef, 0x04, 0x61, 0x2a, 0x38, 0x94,  // |.......(.G..a*8.|
+/*00000520*/ 0x40, 0x2f, 0x92, 0x3e, 0x8a, 0xcd, 0x24, 0xfc, 0xba, 0xa6, 0x68, 0xa7, 0x2c, 0xbb, 0xc1, 0x67,  // |@/.>..$...h.,..g|
+/*00000530*/ 0x5f, 0x0b, 0x85, 0x75, 0x70, 0xa5, 0x03, 0x0e, 0x25, 0xe2, 0x09, 0x34, 0x78, 0x66, 0x6f, 0xe0,  // |_..up...%..4xfo.|
+/*00000540*/ 0xf6, 0xac, 0xaf, 0xc6, 0x4a, 0xbc, 0xda, 0xc5, 0x06, 0x9e, 0x53, 0xe8, 0x75, 0x0b, 0x50, 0xde,  // |....J.....S.u.P.|
+/*00000550*/ 0xf7, 0xc0, 0x7f, 0x78, 0x97, 0x13, 0x22, 0x76, 0x18, 0x88, 0xf9, 0x99, 0xa1, 0x05, 0x42, 0xee,  // |...x.."v......B.|
+/*00000560*/ 0x40, 0xf0, 0xb7, 0x00, 0x0e, 0xf5, 0xac, 0x7c, 0xe5, 0x8b, 0x1f, 0x05, 0xe3, 0xd1, 0x9d, 0x6b,  // |@......|.......k|
+/*00000570*/ 0xd4, 0x9c, 0x3d, 0x14, 0x08, 0x21, 0xce, 0x72, 0x8f, 0x91, 0x9c, 0xba, 0xdd, 0x46, 0xcd, 0xef,  // |..=..!.r.....F..|
+/*00000580*/ 0x6d, 0x7b, 0x0d, 0x7d, 0x59, 0x91, 0x05, 0xc2, 0xde, 0x6c, 0x8a, 0x65, 0xd0, 0x97, 0xb1, 0x93,  // |m{.}Y....l.e....|
+/*00000590*/ 0x9f, 0x51, 0xec, 0x79, 0x30, 0x44, 0xbd, 0xe5, 0xdf, 0x94, 0xed, 0xad, 0x18, 0xd7, 0x24, 0x89,  // |.Q.y0D........$.|
+/*000005a0*/ 0x36, 0x65, 0xc5, 0x88, 0xc0, 0x9a, 0xb7, 0xaa, 0x58, 0x60, 0xfe, 0x6c, 0xe8, 0xf3, 0x39, 0x6b,  // |6e......X`.l..9k|
+/*000005b0*/ 0x45, 0xe6, 0x34, 0xbc, 0x61, 0x68, 0xa2, 0x70, 0x16, 0x49, 0x8b, 0x7d, 0x78, 0x09, 0x99, 0x21,  // |E.4.ah.p.I.}x..!|
+/*000005c0*/ 0x5a, 0xea, 0xfd, 0xbc, 0x69, 0x23, 0xd5, 0x15, 0xd1, 0x5c, 0x32, 0x8b, 0xc0, 0x7b, 0xb2, 0x1e,  // |Z...i#...\2..{..|
+/*000005d0*/ 0x56, 0xf1, 0x6b, 0xd0,                                                                          // |V.k.|
+};
+
+T_DECL(in_cksum_49479689a, "tests os_cpu_in_cksum with a known problem packet under various random segmentations and memory alignments")
+{
+       uint16_t dsum = dumb_in_cksum(pkt49479689, sizeof(pkt49479689));
+       T_ASSERT_EQ(ntohs(dsum), (uint16_t)0xa32b, "verifying dumb checksum");
+       test_checksum(pkt49479689, sizeof(pkt49479689));
+}
+
+T_DECL(in_cksum_49479689b, "tests os_cpu_in_cksum with many random packets under various random segmentations and memory alignments")
+{
+       for (int i = 0; i < 100; i++) {
+               test_one_random_packet(4096);
+       }
+}
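
For reference: dumb_in_cksum is defined earlier in this test source, outside the excerpt above. It is a deliberately naive RFC 1071 ones'-complement checksum used as ground truth for os_cpu_in_cksum; the assertion above expects 0xa32b for the embedded packet. A minimal sketch of such a reference routine, assuming 16-bit words are summed in memory order and an odd trailing byte is zero-padded (details may differ from the real helper), looks like:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Naive ones'-complement checksum (RFC 1071): sum 16-bit words as they
 * sit in memory, fold the carries back in, and complement the result.
 * Sketch only; the actual helper in this test file may differ.
 */
static uint16_t
dumb_in_cksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len > 0) {
		uint16_t word = 0;
		size_t n = len >= 2 ? 2 : 1;    /* zero-pad an odd trailing byte */
		memcpy(&word, buf, n);
		sum += word;
		buf += n;
		len -= n;
	}
	while (sum >> 16) {                     /* fold carries into the low 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	}
	return (uint16_t)~sum;                  /* result in memory (network) byte order */
}

The tests then feed the same bytes to os_cpu_in_cksum under randomized segmentation and alignment and expect the optimized result to agree with this slow one.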
index 6f76a7a73deeeec972e9ddcde540f71aa5532af6..33f9faa24311c08a1bf68fca4d58c704f3a91916 100644 (file)
@@ -7,6 +7,7 @@
 #include <darwintest.h>
 #include <darwintest_utils.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
 
 #define GB (1ULL * 1024 * 1024 * 1024)
 
index cd9b6c776335d2b2926df63ea5fcaf62d86bfca3..d8a400c021727dca2b28d9c1e70a988810390996 100644 (file)
 #include <sys/kdebug.h>
 #include <sys/kdebug_signpost.h>
 #include <sys/sysctl.h>
+#include <stdint.h>
+
+#include "ktrace_helpers.h"
 
 T_GLOBAL_META(
        T_META_NAMESPACE("xnu.ktrace"),
        T_META_ASROOT(true));
 
-#define KDBG_TEST_MACROS    1
-#define KDBG_TEST_OLD_TIMES 2
+#define KDBG_TEST_MACROS         1
+#define KDBG_TEST_OLD_TIMES      2
+#define KDBG_TEST_FUTURE_TIMES   3
+#define KDBG_TEST_IOP_SYNC_FLUSH 4
 
 static void
 assert_kdebug_test(unsigned int flavor)
@@ -39,6 +44,8 @@ assert_kdebug_test(unsigned int flavor)
 
 T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events")
 {
+       start_controlling_ktrace();
+
        ktrace_session_t s = ktrace_session_create();
        T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
 
@@ -49,10 +56,16 @@ T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events")
                events_seen++;
                T_PASS("saw traced event");
 
-               T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of traced event is correct");
-               T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of traced event is correct");
-               T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of traced event is correct");
-               T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of traced event is correct");
+               if (ktrace_is_kernel_64_bit(s)) {
+                       T_EXPECT_EQ(tp->arg1, UINT64_C(0xfeedfacefeedface),
+                                       "argument 1 of traced event is correct");
+               } else {
+                       T_EXPECT_EQ(tp->arg1, UINT64_C(0xfeedface),
+                                       "argument 1 of traced event is correct");
+               }
+               T_EXPECT_EQ(tp->arg2, 2ULL, "argument 2 of traced event is correct");
+               T_EXPECT_EQ(tp->arg3, 3ULL, "argument 3 of traced event is correct");
+               T_EXPECT_EQ(tp->arg4, 4ULL, "argument 4 of traced event is correct");
 
                ktrace_end(s, 1);
        });
@@ -66,7 +79,8 @@ T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events")
        ktrace_filter_pid(s, getpid());
 
        T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-       T_ASSERT_POSIX_SUCCESS(kdebug_trace(TRACE_DEBUGID, 1, 2, 3, 4), NULL);
+       T_ASSERT_POSIX_SUCCESS(kdebug_trace(TRACE_DEBUGID, 0xfeedfacefeedface, 2,
+                       3, 4), NULL);
        ktrace_end(s, 0);
 
        dispatch_main();
@@ -78,6 +92,8 @@ T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events")
 T_DECL(kdebug_signpost_syscall,
     "test that kdebug_signpost(2) emits correct events")
 {
+       start_controlling_ktrace();
+
        ktrace_session_t s = ktrace_session_create();
        T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
 
@@ -92,10 +108,10 @@ T_DECL(kdebug_signpost_syscall,
                single_seen++;
                T_PASS("single signpost is traced");
 
-               T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of single signpost is correct");
-               T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of single signpost is correct");
-               T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of single signpost is correct");
-               T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of single signpost is correct");
+               T_EXPECT_EQ(tp->arg1, 1ULL, "argument 1 of single signpost is correct");
+               T_EXPECT_EQ(tp->arg2, 2ULL, "argument 2 of single signpost is correct");
+               T_EXPECT_EQ(tp->arg3, 3ULL, "argument 3 of single signpost is correct");
+               T_EXPECT_EQ(tp->arg4, 4ULL, "argument 4 of single signpost is correct");
        });
 
        ktrace_events_single_paired(s,
@@ -104,18 +120,17 @@ T_DECL(kdebug_signpost_syscall,
                paired_seen++;
                T_PASS("paired signposts are traced");
 
-               T_EXPECT_EQ(start->arg1, 5UL, "argument 1 of start signpost is correct");
-               T_EXPECT_EQ(start->arg2, 6UL, "argument 2 of start signpost is correct");
-               T_EXPECT_EQ(start->arg3, 7UL, "argument 3 of start signpost is correct");
-               T_EXPECT_EQ(start->arg4, 8UL, "argument 4 of start signpost is correct");
+               T_EXPECT_EQ(start->arg1, 5ULL, "argument 1 of start signpost is correct");
+               T_EXPECT_EQ(start->arg2, 6ULL, "argument 2 of start signpost is correct");
+               T_EXPECT_EQ(start->arg3, 7ULL, "argument 3 of start signpost is correct");
+               T_EXPECT_EQ(start->arg4, 8ULL, "argument 4 of start signpost is correct");
 
-               T_EXPECT_EQ(end->arg1, 9UL, "argument 1 of end signpost is correct");
-               T_EXPECT_EQ(end->arg2, 10UL, "argument 2 of end signpost is correct");
-               T_EXPECT_EQ(end->arg3, 11UL, "argument 3 of end signpost is correct");
-               T_EXPECT_EQ(end->arg4, 12UL, "argument 4 of end signpost is correct");
+               T_EXPECT_EQ(end->arg1, 9ULL, "argument 1 of end signpost is correct");
+               T_EXPECT_EQ(end->arg2, 10ULL, "argument 2 of end signpost is correct");
+               T_EXPECT_EQ(end->arg3, 11ULL, "argument 3 of end signpost is correct");
+               T_EXPECT_EQ(end->arg4, 12ULL, "argument 4 of end signpost is correct");
 
-               T_EXPECT_EQ(single_seen, 1,
-               "signposts are traced in the correct order");
+               T_EXPECT_EQ(single_seen, 1, "signposts are traced in the correct order");
 
                ktrace_end(s, 1);
        });
@@ -134,6 +149,8 @@ T_DECL(kdebug_signpost_syscall,
        T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()),
            "started tracing");
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
        T_EXPECT_POSIX_SUCCESS(kdebug_signpost(SIGNPOST_SINGLE_CODE, 1, 2, 3, 4),
            "emitted single signpost");
        T_EXPECT_POSIX_SUCCESS(
@@ -142,11 +159,70 @@ T_DECL(kdebug_signpost_syscall,
        T_EXPECT_POSIX_SUCCESS(
                kdebug_signpost_end(SIGNPOST_PAIRED_CODE, 9, 10, 11, 12),
                "emitted end signpost");
+#pragma clang diagnostic pop
        ktrace_end(s, 0);
 
        dispatch_main();
 }
 
+T_DECL(syscall_tracing,
+               "ensure that syscall arguments are traced properly")
+{
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       __block bool seen = false;
+
+       ktrace_filter_pid(s, getpid());
+
+       static const int telemetry_syscall_no = 451;
+       static const uint64_t arg1 = 0xfeedfacefeedface;
+
+       ktrace_events_single(s, BSDDBG_CODE(DBG_BSD_EXCP_SC, telemetry_syscall_no),
+                       ^(struct trace_point *evt){
+               if (KDBG_EXTRACT_CODE(evt->debugid) != telemetry_syscall_no || seen) {
+                       return;
+               }
+
+               seen = true;
+               if (ktrace_is_kernel_64_bit(s)) {
+                       T_EXPECT_EQ(evt->arg1, arg1,
+                                       "argument 1 of syscall event is correct");
+               } else {
+                       T_EXPECT_EQ(evt->arg1, (uint64_t)(uint32_t)(arg1),
+                                       "argument 1 of syscall event is correct");
+               }
+
+               ktrace_end(s, 1);
+       });
+
+       ktrace_set_completion_handler(s, ^{
+               T_ASSERT_TRUE(seen,
+                               "should have seen a syscall event for kevent_id(2)");
+               ktrace_session_destroy(s);
+               T_END;
+       });
+
+       int error = ktrace_start(s, dispatch_get_main_queue());
+       T_ASSERT_POSIX_ZERO(error, "started tracing");
+
+       /*
+        * telemetry(2) has a 64-bit argument that will definitely be traced, and
+        * is unlikely to be used elsewhere by this process.
+        */
+       extern int __telemetry(uint64_t cmd, uint64_t deadline, uint64_t interval,
+                       uint64_t leeway, uint64_t arg4, uint64_t arg5);
+       (void)__telemetry(arg1, 0, 0, 0, 0, 0);
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 5 * NSEC_PER_SEC),
+                       dispatch_get_main_queue(), ^{
+               T_LOG("ending test due to timeout");
+               ktrace_end(s, 0);
+       });
+
+       dispatch_main();
+}
+
 #pragma mark kdebug behaviors
 
 #define WRAPPING_EVENTS_COUNT     (150000)
@@ -161,6 +237,8 @@ T_DECL(wrapping,
        int wait_wrapping_secs = (WRAPPING_EVENTS_COUNT / TRACE_ITERATIONS) + 5;
        int current_secs = wait_wrapping_secs;
 
+       start_controlling_ktrace();
+
        /* use sysctls manually to bypass libktrace assumptions */
 
        int mib[4] = { CTL_KERN, KERN_KDEBUG };
@@ -239,12 +317,14 @@ T_DECL(reject_old_events,
 {
        __block uint64_t event_horizon_ts;
 
+       start_controlling_ktrace();
+
        ktrace_session_t s = ktrace_session_create();
        T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
 
        __block int events = 0;
-       ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0),
-           KDBG_EVENTID(DBG_BSD + 1, 0, 0), ^(struct trace_point *tp) {
+       ktrace_events_single(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 1),
+           ^(struct trace_point *tp) {
                events++;
                T_EXPECT_GT(tp->timestamp, event_horizon_ts,
                "events in trace should be from after tracing began");
@@ -279,6 +359,8 @@ T_DECL(ascending_time_order,
        __block unsigned int prev_cpu = 0;
        __block bool in_order = true;
 
+       start_controlling_ktrace();
+
        ktrace_session_t s = ktrace_session_create();
        T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
 
@@ -451,6 +533,8 @@ T_DECL(dyld_events, "test that dyld registering libraries emits events")
        uint8_t *saw_unmapping = &(saw_events[1]);
        uint8_t *saw_shared_cache = &(saw_events[2]);
 
+       start_controlling_ktrace();
+
        ktrace_session_t s = ktrace_session_create();
        T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
 
@@ -651,6 +735,8 @@ expect_kdbg_test_events(ktrace_session_t s, bool use_all_callback,
 
 T_DECL(kernel_events, "ensure kernel macros work")
 {
+       start_controlling_ktrace();
+
        ktrace_session_t s = ktrace_session_create();
        T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
 
@@ -665,7 +751,7 @@ T_DECL(kernel_events, "ensure kernel macros work")
                 * OS.
                 */
                unsigned int dev_exp;
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
                dev_exp = is_development_kernel() ? EXP_KERNEL_EVENTS : 0U;
 #else
                dev_exp = EXP_KERNEL_EVENTS;
@@ -685,6 +771,8 @@ T_DECL(kernel_events, "ensure kernel macros work")
 
 T_DECL(kernel_events_filtered, "ensure that the filtered kernel macros work")
 {
+       start_controlling_ktrace();
+
        ktrace_session_t s = ktrace_session_create();
        T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
 
@@ -713,6 +801,8 @@ T_DECL(kernel_events_filtered, "ensure that the filtered kernel macros work")
 T_DECL(kernel_events_noprocfilt,
     "ensure that the no process filter kernel macros work")
 {
+       start_controlling_ktrace();
+
        ktrace_session_t s = ktrace_session_create();
        T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
 
@@ -744,7 +834,7 @@ T_DECL(kernel_events_noprocfilt,
 static volatile bool continue_abuse = true;
 
 #define STRESS_DEBUGID (0xfeedfac0)
-#define ABUSE_SECS (10)
+#define ABUSE_SECS (2)
 #define TIMER_NS (100 * NSEC_PER_USEC)
 /*
  * Use the quantum as the gap threshold.
@@ -767,6 +857,8 @@ kdebug_abuser_thread(void *ctx)
 T_DECL(stress, "emit events on all but one CPU with a small buffer",
     T_META_CHECK_LEAKS(false))
 {
+       start_controlling_ktrace();
+
        T_SETUPBEGIN;
        ktrace_session_t s = ktrace_session_create();
        T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
@@ -875,7 +967,7 @@ T_DECL(stress, "emit events on all but one CPU with a small buffer",
                        prev_timestamp = tp->timestamp;
                });
                ktrace_events_single(sread, TRACE_LOST_EVENTS, ^(struct trace_point *tp){
-                       T_LOG("lost: %llu on %d (%lu)", tp->timestamp, tp->cpuid, tp->arg1);
+                       T_LOG("lost: %llu on %d (%llu)", tp->timestamp, tp->cpuid, tp->arg1);
                });
 
                __block uint64_t last_write = 0;
@@ -891,7 +983,7 @@ T_DECL(stress, "emit events on all but one CPU with a small buffer",
                        end->timestamp - start->timestamp, &dur_ns);
                        T_QUIET; T_ASSERT_POSIX_ZERO(converror, "convert timestamp to ns");
 
-                       T_LOG("write: %llu (+%gs): %gus on %d: %lu events", start->timestamp,
+                       T_LOG("write: %llu (+%gs): %gus on %d: %llu events", start->timestamp,
                        (double)delta_ns / 1e9, (double)dur_ns / 1e3, end->cpuid, end->arg1);
                        last_write = end->timestamp;
                });
@@ -974,6 +1066,8 @@ T_DECL(stress, "emit events on all but one CPU with a small buffer",
 T_DECL(round_trips,
     "test sustained tracing with multiple round-trips through the kernel")
 {
+       start_controlling_ktrace();
+
        ktrace_session_t s = ktrace_session_create();
        T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
 
@@ -1037,6 +1131,8 @@ T_DECL(round_trips,
  */
 T_DECL(event_coverage, "ensure events appear up to the end of tracing")
 {
+       start_controlling_ktrace();
+
        ktrace_session_t s = ktrace_session_create();
        T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
 
@@ -1131,6 +1227,8 @@ set_nevents(unsigned int nevents)
 
 T_DECL(set_buffer_size, "ensure large buffer sizes can be set")
 {
+       start_controlling_ktrace();
+
        uint64_t memsize = 0;
        T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("hw.memsize", &memsize,
            &(size_t){ sizeof(memsize) }, NULL, 0), "get memory size");
@@ -1158,3 +1256,205 @@ T_DECL(set_buffer_size, "ensure large buffer sizes can be set")
                    "%u events in kernel when %u requested", actualevents, i);
        }
 }
+
+static void *
+donothing(__unused void *arg)
+{
+       return NULL;
+}
+
+T_DECL(long_names, "ensure long command names are reported")
+{
+       start_controlling_ktrace();
+
+       char longname[] = "thisisaverylongprocessname!";
+       char *longname_ptr = longname;
+       static_assert(sizeof(longname) > 16,
+           "the name should be longer than MAXCOMLEN");
+
+       int ret = sysctlbyname("kern.procname", NULL, NULL, longname,
+           sizeof(longname));
+       T_ASSERT_POSIX_SUCCESS(ret,
+           "use sysctl kern.procname to lengthen the name");
+
+       ktrace_session_t ktsess = ktrace_session_create();
+
+       /*
+        * 32-bit kernels can only trace 16 bytes of the string in their event
+        * arguments.
+        */
+       if (!ktrace_is_kernel_64_bit(ktsess)) {
+               longname[16] = '\0';
+       }
+
+       ktrace_filter_pid(ktsess, getpid());
+
+       __block bool saw_newthread = false;
+       ktrace_events_single(ktsess, TRACE_STRING_NEWTHREAD,
+           ^(struct trace_point *tp) {
+               if (ktrace_get_pid_for_thread(ktsess, tp->threadid) ==
+                   getpid()) {
+                       saw_newthread = true;
+
+                       char argname[32] = {};
+                       strncat(argname, (char *)&tp->arg1, sizeof(tp->arg1));
+                       strncat(argname, (char *)&tp->arg2, sizeof(tp->arg2));
+                       strncat(argname, (char *)&tp->arg3, sizeof(tp->arg3));
+                       strncat(argname, (char *)&tp->arg4, sizeof(tp->arg4));
+
+                       T_EXPECT_EQ_STR((char *)argname, longname_ptr,
+                           "process name of new thread should be long");
+
+                       ktrace_end(ktsess, 1);
+               }
+       });
+
+       ktrace_set_completion_handler(ktsess, ^{
+               ktrace_session_destroy(ktsess);
+               T_EXPECT_TRUE(saw_newthread,
+                   "should have seen the new thread");
+               T_END;
+       });
+
+       int error = ktrace_start(ktsess, dispatch_get_main_queue());
+       T_ASSERT_POSIX_ZERO(error, "started tracing");
+
+       pthread_t thread = NULL;
+       error = pthread_create(&thread, NULL, donothing, NULL);
+       T_ASSERT_POSIX_ZERO(error, "create new thread");
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 5 * NSEC_PER_SEC),
+           dispatch_get_main_queue(), ^{
+               ktrace_end(ktsess, 0);
+       });
+
+       error = pthread_join(thread, NULL);
+       T_ASSERT_POSIX_ZERO(error, "join to thread");
+
+       dispatch_main();
+}
+
+T_DECL(continuous_time, "make sure continuous time status can be queried",
+       T_META_RUN_CONCURRENTLY(true))
+{
+       bool cont_time = kdebug_using_continuous_time();
+       T_ASSERT_FALSE(cont_time, "should not be using continuous time yet");
+}
+
+static const uint32_t frame_eventid = KDBG_EVENTID(DBG_BSD,
+    DBG_BSD_KDEBUG_TEST, 1);
+
+static ktrace_session_t
+future_events_session(void)
+{
+       ktrace_session_t ktsess = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(ktsess, "failed to create session");
+
+       ktrace_events_single(ktsess, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0),
+           ^(struct trace_point *tp __unused) {
+               T_FAIL("saw future test event from IOP");
+       });
+       ktrace_events_single(ktsess, frame_eventid, ^(struct trace_point *tp) {
+               if (tp->debugid & DBG_FUNC_START) {
+                       T_LOG("saw start event");
+               } else {
+                       T_LOG("saw event traced after trying to trace future event, ending");
+                       ktrace_end(ktsess, 1);
+               }
+       });
+
+       ktrace_set_collection_interval(ktsess, 100);
+       return ktsess;
+}
+
+T_DECL(future_iop_events,
+    "make sure IOPs cannot trace events in the future while live tracing")
+{
+       start_controlling_ktrace();
+       ktrace_session_t ktsess = future_events_session();
+       ktrace_set_completion_handler(ktsess, ^{
+               ktrace_session_destroy(ktsess);
+               T_END;
+       });
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(ktsess, dispatch_get_main_queue()),
+           "start tracing");
+       kdebug_trace(frame_eventid | DBG_FUNC_START, 0, 0, 0, 0);
+       assert_kdebug_test(KDBG_TEST_FUTURE_TIMES);
+       kdebug_trace(frame_eventid | DBG_FUNC_END, 0, 0, 0, 0);
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 5 * NSEC_PER_SEC),
+           dispatch_get_main_queue(), ^{
+               T_FAIL("ending tracing after timeout");
+               ktrace_end(ktsess, 0);
+       });
+
+       dispatch_main();
+}
+
+T_DECL(future_iop_events_disabled,
+    "make sure IOPs cannot trace events in the future after disabling tracing")
+{
+       start_controlling_ktrace();
+       ktrace_session_t ktsess = future_events_session();
+       T_ASSERT_POSIX_ZERO(ktrace_configure(ktsess), "configure tracing");
+
+       kdebug_trace(frame_eventid | DBG_FUNC_START, 0, 0, 0, 0);
+       assert_kdebug_test(KDBG_TEST_FUTURE_TIMES);
+       kdebug_trace(frame_eventid | DBG_FUNC_END, 0, 0, 0, 0);
+
+       T_ASSERT_POSIX_ZERO(ktrace_disable_configured(ktsess),
+           "disable tracing");
+       ktrace_session_destroy(ktsess);
+
+       ktsess = future_events_session();
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(ktrace_set_use_existing(ktsess), "use existing trace");
+       ktrace_set_completion_handler(ktsess, ^{
+               ktrace_session_destroy(ktsess);
+               T_END;
+       });
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(ktsess, dispatch_get_main_queue()),
+           "start tracing existing session");
+
+       dispatch_main();
+}
+
+T_DECL(iop_events_disable,
+    "make sure IOP events are flushed before disabling trace")
+{
+       start_controlling_ktrace();
+       ktrace_session_t ktsess = future_events_session();
+
+       assert_kdebug_test(KDBG_TEST_IOP_SYNC_FLUSH);
+       T_ASSERT_POSIX_ZERO(ktrace_configure(ktsess), "configure tracing");
+
+       kdebug_trace(frame_eventid | DBG_FUNC_START, 0, 0, 0, 0);
+
+       T_ASSERT_POSIX_ZERO(ktrace_disable_configured(ktsess),
+           "disable tracing");
+       ktrace_session_destroy(ktsess);
+
+       ktsess = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO;
+       T_ASSERT_NOTNULL(ktsess, "create session");
+
+       ktrace_events_single(ktsess,
+           KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0xff),
+           ^(struct trace_point *tp __unused) {
+               T_PASS("saw IOP event from sync flush");
+       });
+
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(ktrace_set_use_existing(ktsess), "use existing trace");
+       ktrace_set_completion_handler(ktsess, ^{
+               ktrace_session_destroy(ktsess);
+               T_END;
+       });
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(ktsess, dispatch_get_main_queue()),
+           "start tracing existing session");
+
+       dispatch_main();
+}
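
A note on the test hook used above: the new KDBG_TEST_FUTURE_TIMES and KDBG_TEST_IOP_SYNC_FLUSH flavors are all driven through assert_kdebug_test(), whose body sits above the first hunk of this file. A sketch of how such a helper plausibly reaches the kernel-side test code, assuming the KERN_KDTEST sysctl selector and the flavor-carried-in-size convention (both are assumptions here, not confirmed by this diff):

#include <stddef.h>
#include <sys/sysctl.h>
#include <darwintest.h>

/*
 * Sketch only: hand the requested test flavor to the kernel's kdebug
 * test sysctl. KERN_KDTEST and the use of the size argument to carry
 * the flavor are assumptions, not confirmed by this diff.
 */
static void
assert_kdebug_test(unsigned int flavor)
{
	size_t size = flavor;
	int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDTEST };

	T_QUIET;
	T_ASSERT_POSIX_SUCCESS(
	    sysctl(mib, sizeof(mib) / sizeof(mib[0]), NULL, &size, NULL, 0),
	    "KERN_KDTEST sysctl");
}

The kernel then emits (or refuses to emit) the corresponding test events, which the sessions built by future_events_session() watch for.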
index 76af0603a67d331f9bb71494d0670ca01d4de869..39a73a070d29ce96a58b390f0700dfd4e7e2437b 100644 (file)
@@ -21,6 +21,10 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.kernel_mtx_perf_test"));
 
 #define ITER 100000
 #define TEST_MTX_MAX_STATS              8
+#define FULL_CONTENDED 0
+#define HALF_CONTENDED 1
+#define MAX_CONTENDED  2
+
 
 #define TEST_MTX_LOCK_STATS             0
 #define TEST_MTX_UNLOCK_MTX_STATS       6
@@ -28,7 +32,8 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.kernel_mtx_perf_test"));
 static void
 test_from_kernel_lock_unlock_contended(void)
 {
-       int i, ret, name_size;
+       int i, ret;
+       unsigned long name_size;
        uint64_t avg, run, tot;
        size_t size;
        char iter[35];
@@ -37,7 +42,7 @@ test_from_kernel_lock_unlock_contended(void)
        T_LOG("Testing locking/unlocking mutex from kernel with contention.\n");
        T_LOG("Requesting test with %d iterations\n", ITER);
 
-       size = 1000;
+       size = 2000;
        buff = calloc(size, sizeof(char));
        T_QUIET; T_ASSERT_NOTNULL(buff, "Allocating buffer for sysctl");
 
@@ -45,85 +50,95 @@ test_from_kernel_lock_unlock_contended(void)
        ret = sysctlbyname("kern.test_mtx_contended", buff, &size, iter, sizeof(iter));
        T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname kern.test_mtx_contended");
 
-       T_LOG("%s stats:\n%s\n", __func__, buff);
+       T_LOG("\n%s stats:\n%s\n", __func__, buff);
 
-       /* first line is "STATS INNER LOOP" */
        buff_p = buff;
-       while (*buff_p != '\n') {
-               buff_p++;
-       }
-       buff_p++;
-
-       /*
-        * Sequence of statistic lines like
-        * { samples 100000, tot 3586175 ns, avg 35 ns, max 3997 ns, min 33 ns } TEST_MTX_LOCK_STATS
-        * for all TEST_MTX_MAX_STATS statistics
-        */
-       for (i = 0; i < TEST_MTX_MAX_STATS; i++) {
-               avg_p = strstr(buff_p, "avg ");
-
-               /* contended test records statistics only for lock/unlock for now */
-               if (i == TEST_MTX_LOCK_STATS || i == TEST_MTX_UNLOCK_MTX_STATS) {
-                       T_QUIET; T_ASSERT_NOTNULL(avg_p, "contended %i average not found", i);
-                       sscanf(avg_p, "avg %llu", &avg);
-
-                       name = strstr(buff_p, "TEST_MTX_");
-                       end_name = strstr(buff_p, "_STATS");
-                       name_size = end_name - name - strlen("TEST_MTX_") + 1;
-
-                       char name_string[40];
-                       char avg_name_string[50];
-                       char *pre_string = "contended ";
-                       snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
-                       pre_string = "avg contended ";
-                       snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
-                       T_PERF(name_string, avg, "ns", avg_name_string);
+       int t;
+       for (t = 0; t < MAX_CONTENDED; t++) {
+               char *type;
+               if (t == FULL_CONTENDED) {
+                       type = "FULL_CONTENDED ";
+               } else {
+                       type = "HALF_CONTENDED ";
                }
 
-               buff_p = avg_p;
+               /* first line is "STATS INNER LOOP" */
                while (*buff_p != '\n') {
                        buff_p++;
                }
                buff_p++;
-       }
 
-       while (*buff_p != '\n') {
+               /*
+                * Sequence of statistic lines like
+                * { samples 100000, tot 3586175 ns, avg 35 ns, max 3997 ns, min 33 ns } TEST_MTX_LOCK_STATS
+                * for all TEST_MTX_MAX_STATS statistics
+                */
+               for (i = 0; i < TEST_MTX_MAX_STATS; i++) {
+                       avg_p = strstr(buff_p, "avg ");
+
+                       /* contended test records statistics only for lock/unlock for now */
+                       if (i == TEST_MTX_LOCK_STATS || i == TEST_MTX_UNLOCK_MTX_STATS) {
+                               T_QUIET; T_ASSERT_NOTNULL(avg_p, "contended %i average not found", i);
+                               sscanf(avg_p, "avg %llu", &avg);
+
+                               name = strstr(buff_p, "TEST_MTX_");
+                               end_name = strstr(buff_p, "_STATS");
+                               name_size = (unsigned long) end_name - (unsigned long) name - strlen("TEST_MTX_") + 1;
+
+                               char name_string[40];
+                               char avg_name_string[50];
+                               char *pre_string = "contended ";
+                               snprintf(name_string, name_size + strlen(pre_string) + strlen(type), "%s%s%s", pre_string, type, &name[strlen("TEST_MTX_")]);
+                               pre_string = "avg contended ";
+                               snprintf(avg_name_string, name_size + strlen(pre_string) + strlen(type), "%s%s%s", pre_string, type, &name[strlen("TEST_MTX_")]);
+                               T_PERF(name_string, avg, "ns", avg_name_string);
+                       }
+
+                       buff_p = avg_p;
+                       while (*buff_p != '\n') {
+                               buff_p++;
+                       }
+                       buff_p++;
+               }
+
+               while (*buff_p != '\n') {
+                       buff_p++;
+               }
                buff_p++;
-       }
-       buff_p++;
 
-       /* next line is "STATS OUTER LOOP" */
-       while (*buff_p != '\n') {
+               /* next line is "STATS OUTER LOOP" */
+               while (*buff_p != '\n') {
+                       buff_p++;
+               }
                buff_p++;
-       }
-       buff_p++;
 
-       /* contended test records statistics only for lock/unlock for now */
-       avg_p = strstr(buff_p, "run time ");
-       T_QUIET; T_ASSERT_NOTNULL(avg_p, "contended %d loop run time not found", 0);
-       sscanf(avg_p, "run time %llu", &run);
+               /* contended test records statistics only for lock/unlock for now */
+               avg_p = strstr(buff_p, "run time ");
+               T_QUIET; T_ASSERT_NOTNULL(avg_p, "contended %d loop run time not found", 0);
+               sscanf(avg_p, "run time %llu", &run);
 
-       avg_p = strstr(buff_p, "total time ");
-       T_QUIET; T_ASSERT_NOTNULL(avg_p, "uncontended %d loop total time not found", 0);
-       sscanf(avg_p, "total time %llu", &tot);
+               avg_p = strstr(buff_p, "total time ");
+               T_QUIET; T_ASSERT_NOTNULL(avg_p, "uncontended %d loop total time not found", 0);
+               sscanf(avg_p, "total time %llu", &tot);
 
-       if (run < tot) {
-               avg = run;
-       } else {
-               avg = tot;
-       }
+               if (run < tot) {
+                       avg = run;
+               } else {
+                       avg = tot;
+               }
 
-       name = strstr(buff_p, "TEST_MTX_");
-       end_name = strstr(buff_p, "_STATS");
-       name_size = end_name - name - strlen("TEST_MTX_") + 1;
+               name = strstr(buff_p, "TEST_MTX_");
+               end_name = strstr(buff_p, "_STATS");
+               name_size = (unsigned long) end_name - (unsigned long) name - strlen("TEST_MTX_") + 1;
 
-       char name_string[50];
-       char avg_name_string[60];
-       char *pre_string = "contended loop ";
-       snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
-       pre_string = "avg time contended loop ";
-       snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
-       T_PERF(name_string, avg / ITER, "ns", avg_name_string);
+               char name_string[50];
+               char avg_name_string[60];
+               char *pre_string = "contended loop ";
+               snprintf(name_string, name_size + strlen(pre_string) + strlen(type), "%s%s%s", pre_string, type, &name[strlen("TEST_MTX_")]);
+               pre_string = "avg time contended loop ";
+               snprintf(avg_name_string, name_size + strlen(pre_string) + strlen(type), "%s%s%s", pre_string, type, &name[strlen("TEST_MTX_")]);
+               T_PERF(name_string, avg / ITER, "ns", avg_name_string);
+       }
 
        free(buff);
 }
@@ -131,7 +146,8 @@ test_from_kernel_lock_unlock_contended(void)
 static void
 test_from_kernel_lock_unlock_uncontended(void)
 {
-       int i, ret, name_size;
+       int i, ret;
+       unsigned long name_size;
        uint64_t avg, run, tot;
        size_t size;
        char iter[35];
@@ -169,7 +185,7 @@ test_from_kernel_lock_unlock_uncontended(void)
 
                name = strstr(buff_p, "TEST_MTX_");
                end_name = strstr(buff_p, "_STATS");
-               name_size = end_name - name - strlen("TEST_MTX_") + 1;
+               name_size = (unsigned long) end_name - (unsigned long) name - strlen("TEST_MTX_") + 1;
 
                char name_string[40];
                char avg_name_string[50];
@@ -219,7 +235,7 @@ test_from_kernel_lock_unlock_uncontended(void)
 
                name = strstr(buff_p, "TEST_MTX_");
                end_name = strstr(buff_p, "_STATS");
-               name_size = end_name - name - strlen("TEST_MTX_") + 1;
+               name_size = (unsigned long) end_name - (unsigned long) name - strlen("TEST_MTX_") + 1;
 
                char name_string[50];
                char avg_name_string[60];
@@ -238,78 +254,175 @@ test_from_kernel_lock_unlock_uncontended(void)
        free(buff);
 }
 
-extern char **environ;
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+static bool
+get_freq(float val, char scale, int *int_val)
+{
+       switch (scale) {
+       case 'M':
+       case 'm':
+               *int_val = (int) val;
+               break;
+       case 'G':
+       case 'g':
+               *int_val = (int) (val * 1000);
+               break;
+       default:
+               return FALSE;
+       }
+       return TRUE;
+}
+
+static bool
+parse_freq(char* buff, int buff_size, const char* string_start, int string_start_size, char* to_parse)
+{
+       char* start;
+       float val;
+       char scale;
+       int int_val;
+
+       start = strstr(to_parse, string_start);
+       if (start == NULL) {
+               return FALSE;
+       }
+
+       if (strstr(start, "Hz") != NULL) {
+               sscanf(start + string_start_size, "%f%cHz", &val, &scale);
+       } else {
+               if (strstr(start, "hz") != NULL) {
+                       sscanf(start + string_start_size, "%f%chz", &val, &scale);
+               } else {
+                       return FALSE;
+               }
+       }
+
+       if (!get_freq(val, scale, &int_val)) {
+               return FALSE;
+       }
+
+       snprintf(buff, buff_size, "%d", int_val);
+
+       return TRUE;
+}
+
+static bool freq_fixed = FALSE;
+static char str_val_min[10];
+static char str_val_max[10];
+
+static bool
+get_previous_freq_values(void)
+{
+       FILE *fp;
+       char out_xcpm[1035];
+       bool min_scan = FALSE;
+       bool max_scan = FALSE;
+
+       memset(str_val_min, 0, sizeof(str_val_min));
+       memset(str_val_max, 0, sizeof(str_val_max));
+
+       fp = popen("/usr/local/bin/xcpm limits", "r");
+       if (fp == NULL) {
+               return FALSE;
+       }
+
+       while (fgets(out_xcpm, sizeof(out_xcpm) - 1, fp) != NULL && (!max_scan || !min_scan)) {
+               if (!max_scan) {
+                       max_scan = parse_freq(str_val_max, sizeof(str_val_max), "Max frequency:", sizeof("Max frequency:"), out_xcpm);
+               }
+               if (!min_scan) {
+                       min_scan = parse_freq(str_val_min, sizeof(str_val_min), "Min frequency:", sizeof("Min frequency:"), out_xcpm);
+               }
+       }
+
+       pclose(fp);
+
+       if (!max_scan || !min_scan) {
+               return FALSE;
+       }
+
+       return TRUE;
+}
+#endif
+
 static void
 fix_cpu_frequency(void)
 {
-#if CONFIG_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
        int spawn_ret, pid;
        char *const clpcctrl_args[] = {"/usr/local/bin/clpcctrl", "-f", "5000", NULL};
 
        T_LOG("Setting cpu frequency to %d\n", 5000);
 
-       spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, environ);
-       waitpid(pid, &spawn_ret, 0);
+       spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, *_NSGetEnviron());
+       T_QUIET; T_ASSERT_POSIX_ZERO(spawn_ret, "posix_spawn");
+       T_QUIET; T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed");
+       T_QUIET; T_ASSERT_EQ(spawn_ret, 0, "clpcctrl failed");
 
-#else /*CONFIG_EMBEDDED*/
+#else /*(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)*/
 
        int spawn_ret, pid;
-       int ret, nom_freq;
+       int ret;
        size_t len;
-       float val;
-       char scale;
-       char *buffer, *cpu_freq;
+       char *buffer;
        char str_val[10];
 
+       if (!get_previous_freq_values()) {
+               T_LOG("Unable to parse frequency values from xcpm");
+               freq_fixed = FALSE;
+               return;
+       }
+
        ret = sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0);
        T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname machdep.cpu.brand_string");
 
-       buffer = malloc(len + 2);
+       buffer = calloc(len + 2, sizeof(char));
        ret = sysctlbyname("machdep.cpu.brand_string", buffer, &len, NULL, 0);
        T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname machdep.cpu.brand_string");
        buffer[len + 1] = '\0';
 
-       cpu_freq = strstr(buffer, "CPU @ ");
-       if (cpu_freq == NULL) {
-               T_LOG("Could not fix frequency, %s field not present\n", "CPU @ ");
-               goto out;
-       }
-
-       if (strstr(cpu_freq, "Hz") != NULL) {
-               sscanf(cpu_freq, "CPU @ %f%cHz", &val, &scale);
-       } else {
-               if (strstr(cpu_freq, "hz") != NULL) {
-                       sscanf(cpu_freq, "CPU @ %f%chz", &val, &scale);
-               } else {
-                       T_LOG("Could not fix frequency, %s field not present\n", "Hz");
-                       goto out;
-               }
+       memset(str_val, 0, sizeof(str_val));
+       if (!parse_freq(str_val, sizeof(str_val), "CPU @", sizeof("CPU @"), buffer)) {
+               T_LOG("Unable to parse frequency values from machdep.cpu.brand_string (string was %s)", buffer);
+               freq_fixed = FALSE;
+               return;
        }
 
-       switch (scale) {
-       case 'M':
-       case 'm':
-               nom_freq = (int) val;
-               break;
-       case 'G':
-       case 'g':
-               nom_freq = (int) (val * 1000);
-               break;
-       default:
-               T_LOG("Could not fix frequency, scale field is %c\n", scale);
-               goto out;
-       }
-
-       snprintf(str_val, 10, "%d", nom_freq);
-       T_LOG("Setting min and max cpu frequency to %d (%s)\n", nom_freq, str_val);
+       T_LOG("Previous min and max cpu frequency (%s) (%s)\n", str_val_min, str_val_max);
+       T_LOG("Setting min and max cpu frequency to (%s)\n", str_val);
        char *xcpm_args[] = {"/usr/local/bin/xcpm", "limits", str_val, str_val, NULL};
-       spawn_ret = posix_spawn(&pid, xcpm_args[0], NULL, NULL, xcpm_args, environ);
-       waitpid(pid, &spawn_ret, 0);
+       spawn_ret = posix_spawn(&pid, xcpm_args[0], NULL, NULL, xcpm_args, *_NSGetEnviron());
+       T_QUIET; T_ASSERT_POSIX_ZERO(spawn_ret, "posix_spawn");
+       T_QUIET; T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed");
+       T_QUIET; T_ASSERT_EQ(spawn_ret, 0, "xcpm limits failed");
+
+       freq_fixed = TRUE;
 
-out:
        free(buffer);
        return;
-#endif /*CONFIG_EMBEDDED*/
+#endif /*(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)*/
+}
+
+static void
+cleanup_cpu_freq(void)
+{
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+       int spawn_ret, pid;
+       char *const clpcctrl_args[] = {"/usr/local/bin/clpcctrl", "-d", NULL};
+       spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, *_NSGetEnviron());
+       T_QUIET; T_ASSERT_POSIX_ZERO(spawn_ret, "posix_spawn");
+       T_QUIET; T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed");
+       T_QUIET; T_ASSERT_EQ(spawn_ret, 0, "clpcctrl failed");
+
+#else
+       if (freq_fixed) {
+               int spawn_ret, pid;
+               char *xcpm_args[] = {"/usr/local/bin/xcpm", "limits", str_val_min, str_val_max, NULL};
+               spawn_ret = posix_spawn(&pid, xcpm_args[0], NULL, NULL, xcpm_args, *_NSGetEnviron());
+               T_QUIET; T_ASSERT_POSIX_ZERO(spawn_ret, "posix_spawn");
+               T_QUIET; T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed");
+               T_QUIET; T_ASSERT_EQ(spawn_ret, 0, "xcpm limits failed");
+       }
+#endif
 }
 
 T_DECL(kernel_mtx_perf_test,
@@ -318,6 +431,8 @@ T_DECL(kernel_mtx_perf_test,
 {
        fix_cpu_frequency();
 
+       T_ATEND(cleanup_cpu_freq);
+
        test_from_kernel_lock_unlock_uncontended();
        test_from_kernel_lock_unlock_contended();
 }
index f5f32d45b4b6f835edf7e1982b8c72fdfda2a753..29099bae662b3ae4bed0eae638f2da41a8e095cd 100644 (file)
 #include <mach-o/swap.h>
 #include <libkern/OSByteOrder.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define MAX_LEN 1024
 
-#if TARGET_OS_MAC && !TARGET_OS_EMBEDDED
+#if TARGET_OS_MAC && !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
        // running on macOS
        #define KERNEL_SEARCH_DIR "/System/Library/Kernels/*"
 #else
index 734de790267153be3069bad36e5ee58d345d4842..d84d65fa56303258ed74d92f50404959ff4a268c 100644 (file)
@@ -17,7 +17,8 @@
 
 T_GLOBAL_META(
        T_META_NAMESPACE("xnu.kevent"),
-       T_META_CHECK_LEAKS(false));
+       T_META_CHECK_LEAKS(false),
+       T_META_RUN_CONCURRENTLY(true));
 
 #define TIMEOUT_SECS 10
 
@@ -147,7 +148,8 @@ reader_thread(void *arg)
                        if (errno == EINTR) {
                                continue;
                        } else if (errno == EBADF) {
-                               T_LOG("reader got an error (%s), shutting down", strerror(errno));
+                               T_LOG("reader got an error (%s), shutting down",
+                                   strerror(errno));
                                return NULL;
                        } else {
                                T_ASSERT_POSIX_SUCCESS(rdsize, "read on PTY");
@@ -178,7 +180,8 @@ writer_thread(void *arg)
                        if (errno == EINTR) {
                                continue;
                        } else {
-                               T_LOG("writer got an error (%s), shutting down", strerror(errno));
+                               T_LOG("writer got an error (%s), shutting down",
+                                   strerror(errno));
                                return NULL;
                        }
                }
@@ -192,16 +195,6 @@ writer_thread(void *arg)
 static int attach_master, attach_slave;
 static pthread_t reader, writer;
 
-static void
-join_threads(void)
-{
-       close(attach_slave);
-       close(attach_master);
-       writing = false;
-       pthread_join(reader, NULL);
-       pthread_join(writer, NULL);
-}
-
 static void
 redispatch(dispatch_group_t grp, dispatch_source_type_t type, int fd)
 {
@@ -246,7 +239,6 @@ T_DECL(attach_while_tty_wakeups,
            (void *)(uintptr_t)attach_master), NULL);
        T_ASSERT_POSIX_ZERO(pthread_create(&writer, NULL, writer_thread,
            (void *)(uintptr_t)attach_slave), NULL);
-       T_ATEND(join_threads);
        T_SETUPEND;
 
        redispatch(grp, DISPATCH_SOURCE_TYPE_READ, attach_master);
index 9bbb7d62ecb85f5100b01e618832ecfe398b8ae1..4039173064af5c0b05f142c3015bdd55edf0c93d 100644 (file)
@@ -58,7 +58,7 @@ struct test_msg {
 
 #pragma mark pthread callbacks
 
-static void
+static pthread_t
 thread_create_at_qos(qos_class_t qos, void * (*function)(void *));
 static void
 send(mach_port_t send_port, mach_port_t reply_port, mach_port_t msg_port, mach_msg_priority_t qos, mach_msg_option_t options);
@@ -207,6 +207,62 @@ workloop_cb_test_sync_send_and_enable(uint64_t *workloop_id, struct kevent_qos_s
        T_END;
 }
 
+/*
+ * WL handler which checks the overridden QoS, hands off the IPC, enables
+ * the knote, and then checks the QoS again to make sure the sync IPC
+ * override has not been dropped.
+ */
+static void
+workloop_cb_test_sync_send_and_enable_handoff(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
+{
+       unsigned override_priority;
+       int error;
+
+       T_LOG("Workloop handler workloop_cb_test_sync_send_and_enable_handoff called");
+
+       EXPECT_TEST_MSG(*eventslist);
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       /* The effective Qos should be the one expected after override */
+       EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
+           "dispatch_source event handler QoS should be %s",
+           g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
+
+       /* Snapshot the current override priority */
+       override_priority = get_user_promotion_basepri();
+
+       struct kevent_qos_s *kev = *eventslist;
+       mach_msg_header_t *hdr = (mach_msg_header_t *)kev->ext[0];
+
+       /* handoff the IPC */
+       struct kevent_qos_s handoff_kev = {
+               .filter = EVFILT_WORKLOOP,
+               .ident = hdr->msgh_remote_port,
+               .flags = EV_ADD | EV_DISABLE,
+               .fflags = 0x80000000,
+       };
+
+       error = kevent_id(*workloop_id, &handoff_kev, 1, &handoff_kev, 1, NULL,
+           NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS | KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(error, "kevent_id");
+       T_ASSERT_EQ(0, error, "Handed off the sync IPC");
+
+       /* Enable the knote */
+       enable_kevent(workloop_id, kev->ident);
+
+       /*
+        * Check that the override has not been dropped.
+        */
+       EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
+           "dispatch_source event handler QoS should still be %s",
+           g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
+
+       *events = 0;
+       T_END;
+}
+
 /*
  * WL handler receives the first message and checks sync ipc override, then enables the knote
  * and receives 2nd message and checks it sync ipc override.
@@ -346,7 +402,7 @@ populate_kevent(struct kevent_qos_s *kev, unsigned long long port)
        kev->ident = port;
        kev->filter = EVFILT_MACHPORT;
        kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED;
-       kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
+       kev->fflags = (MACH_RCV_MSG | MACH_RCV_VOUCHER | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
            MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
            MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0));
        kev->data = 1;
@@ -355,15 +411,15 @@ populate_kevent(struct kevent_qos_s *kev, unsigned long long port)
 static void
 enable_kevent(uint64_t *workloop_id, unsigned long long port)
 {
-       kern_return_t kr;
        struct kevent_qos_s kev;
+       int error;
 
        populate_kevent(&kev, port);
        struct kevent_qos_s kev_err[] = {{ 0 }};
 
-       kr = kevent_id(*workloop_id, &kev, 1, kev_err, 1, NULL,
+       error = kevent_id(*workloop_id, &kev, 1, kev_err, 1, NULL,
            NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS | KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(kr, "kevent_id");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(error, "kevent_id");
 }
 
 /*
@@ -847,16 +903,14 @@ send(
                send_msg.qos = (uint32_t)_pthread_qos_class_encode(qc, relpri, 0);
        }
 
-       ret = mach_msg(&(send_msg.header),
-           MACH_SEND_MSG |
-           MACH_SEND_TIMEOUT |
-           MACH_SEND_OVERRIDE |
-           ((reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) | options),
-           send_msg.header.msgh_size,
-           0,
-           MACH_PORT_NULL,
-           10000,
-           0);
+       mach_msg_option_t send_opts = options;
+       if (reply_port) {
+               send_opts |= MACH_SEND_SYNC_OVERRIDE;
+       }
+       send_opts |= MACH_SEND_MSG | MACH_SEND_TIMEOUT | MACH_SEND_OVERRIDE;
+
+       ret = mach_msg(&send_msg.header, send_opts, send_msg.header.msgh_size,
+           0, MACH_PORT_NULL, 10000, qos);
 
        T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client mach_msg");
 }
@@ -957,7 +1011,7 @@ qos_client_send_to_intransit(void *arg __unused)
                    (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0);
        }
 
-       T_LOG("Sent 5 msgs, now trying to send sync ipc messgae, which will block with a timeout\n");
+       T_LOG("Sent 5 msgs, now trying to send sync ipc message, which will block with a timeout\n");
        /* Send the message to the in-transit port, it should block and override the rcv's workloop */
        send(msg_port, special_reply_port, MACH_PORT_NULL,
            (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0);
@@ -974,7 +1028,7 @@ T_HELPER_DECL(qos_client_send_to_intransit_with_thr_pri,
        sleep(HELPER_TIMEOUT_SECS);
 }
 
-static void
+static pthread_t
 thread_create_at_qos(qos_class_t qos, void * (*function)(void *))
 {
        qos_class_t qos_thread;
@@ -994,6 +1048,7 @@ thread_create_at_qos(qos_class_t qos, void * (*function)(void *))
        T_LOG("pthread created\n");
        pthread_get_qos_class_np(thread, &qos_thread, NULL);
        T_EXPECT_EQ(qos_thread, (qos_class_t)qos, NULL);
+       return thread;
 }
 
 static void *
@@ -1032,6 +1087,58 @@ qos_send_and_sync_rcv(void *arg __unused)
        return NULL;
 }
 
+static void *
+qos_sync_rcv(void *arg __unused)
+{
+       mach_port_t qos_send_port;
+       mach_port_t special_reply_port;
+
+       T_LOG("Client: from created thread\n");
+
+       kern_return_t kr = bootstrap_look_up(bootstrap_port,
+           KEVENT_QOS_SERVICE_NAME, &qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "thread_get_special_reply_port");
+
+       /* enqueue two messages to make sure that mqueue is not empty */
+       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
+           (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0), 0);
+
+       sleep(RECV_TIMEOUT_SECS);
+
+       /* sync wait on msg port */
+       receive(special_reply_port, qos_send_port);
+
+       T_LOG("Client done doing sync rcv, now waiting for server to end the test");
+       sleep(SEND_TIMEOUT_SECS);
+
+       T_ASSERT_FAIL("client timed out");
+       return NULL;
+}
+
+static void
+thread_wait_to_block(mach_port_t thread_port)
+{
+       thread_extended_info_data_t extended_info;
+       kern_return_t kr;
+
+       while (1) {
+               mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT;
+               kr = thread_info(thread_port, THREAD_EXTENDED_INFO,
+                   (thread_info_t)&extended_info, &count);
+
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info");
+
+               if (extended_info.pth_run_state == TH_STATE_WAITING) {
+                       T_LOG("Target thread blocked\n");
+                       break;
+               }
+               thread_switch(thread_port, SWITCH_OPTION_DEPRESS, 0);
+       }
+}
+
 T_HELPER_DECL(qos_client_send_sync_and_sync_rcv,
     "Send messages and syncronously wait for rcv")
 {
@@ -1039,6 +1146,25 @@ T_HELPER_DECL(qos_client_send_sync_and_sync_rcv,
        sleep(HELPER_TIMEOUT_SECS);
 }
 
+T_HELPER_DECL(qos_client_sync_rcv_qos_change,
+    "Send messages and synchronously wait for rcv and change qos of waiting thread")
+{
+       pthread_t rcv_thread;
+
+       rcv_thread = thread_create_at_qos(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], qos_sync_rcv);
+
+       T_LOG("Waiting for %d seconds before changing qos of rcv thread", SEND_TIMEOUT_SECS);
+       sleep(SEND_TIMEOUT_SECS);
+
+       /* Wait for the thread to block */
+       thread_wait_to_block(pthread_mach_thread_np(rcv_thread));
+
+       /* Update the rcv thread's qos */
+       pthread_override_qos_class_start_np(rcv_thread, g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0);
+
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
 static void *
 qos_client_send_sync_msg_and_test_link(void *arg)
 {
@@ -1327,7 +1453,7 @@ qos_client_create_sepcial_reply_and_spawn_thread(void *arg __unused)
        /* Create a new thread to send the sync message on our special reply port */
        thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_destroy_other_threads_port);
 
-       /* Client starting to receive messgae */
+       /* Client starting to receive message */
        receive(special_reply_port, qos_send_port);
 
        sleep(3 * SEND_TIMEOUT_SECS);
@@ -1457,6 +1583,10 @@ expect_kevent_id_recv(mach_port_t port, qos_class_t qos[], const char *qos_name[
                T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
                            worker_cb, event_cb,
                            (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_and_enable, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_and_enable_handoff") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                           worker_cb, event_cb,
+                           (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_and_enable_handoff, 0, 0), NULL);
        } else if (strcmp(wl_function, "workloop_cb_test_send_two_sync") == 0) {
                T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
                            worker_cb, event_cb,
@@ -1521,7 +1651,7 @@ expect_kevent_id_recv(mach_port_t port, qos_class_t qos[], const char *qos_name[
                                             .ident = port,
                                             .filter = EVFILT_MACHPORT,
                                             .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
-                                            .fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
+                                            .fflags = (MACH_RCV_MSG | MACH_RCV_VOUCHER | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
            MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
            MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0)),
                                             .data = 1,
@@ -1553,6 +1683,51 @@ T_HELPER_DECL(server_kevent_id,
            RECV_TIMEOUT_SECS);
 }
 
+static void *
+special_reply_port_thread(void *ctxt)
+{
+       kern_return_t ret;
+       mach_port_t rcv_port = *(mach_port_t *)ctxt;
+       struct test_msg rcv_msg = {
+               .header = {
+                       .msgh_remote_port = MACH_PORT_NULL,
+                       .msgh_local_port  = rcv_port,
+                       .msgh_size        = sizeof(rcv_msg),
+               },
+       };
+
+       ret = mach_msg(&rcv_msg.header, MACH_RCV_MSG | MACH_RCV_TIMEOUT, 0,
+           rcv_msg.header.msgh_size, rcv_port, 1000, MACH_PORT_NULL);
+
+       T_EXPECT_EQ(ret, MACH_RCV_TIMED_OUT, "receive should not panic");
+
+       *(mach_port_t *)ctxt = MACH_PORT_NULL;
+
+       sleep(1); /* give pthread_exit() some time to run */
+
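+       /*
+        * Receive again after the owning thread has exited and its special
+        * reply port has been destroyed; this should time out rather than
+        * panic the kernel.
+        */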
+       ret = mach_msg(&rcv_msg.header, MACH_RCV_MSG | MACH_RCV_TIMEOUT, 0,
+           rcv_msg.header.msgh_size, rcv_port, 1000, MACH_PORT_NULL);
+
+       T_EXPECT_EQ(ret, MACH_RCV_TIMED_OUT, "receive should not panic");
+
+       T_END;
+}
+
+T_DECL(special_reply_port, "basic special reply port robustness checks",
+    T_META_RUN_CONCURRENTLY(true))
+{
+       pthread_t thread;
+       mach_port_t srp = thread_get_special_reply_port();
+
+       pthread_create(&thread, NULL, special_reply_port_thread, &srp);
+
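+       /*
+        * Wait for the helper's first receive to time out, then exit this
+        * thread so its special reply port is destroyed while the helper is
+        * still using it.
+        */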
+       while (srp) {
+               usleep(1000);
+       }
+
+       pthread_exit(NULL);
+}
+
 #define TEST_QOS(server_name, client_name, name, wl_function_name, qos_bo, qos_bo_name, qos_qo, qos_qo_name, qos_ao, qos_ao_name) \
        T_DECL(server_kevent_id_##name, \
                        "Event delivery at " qos_ao_name " QoS using a kevent_id", \
@@ -1677,6 +1852,7 @@ TEST_QOS("server_kevent_id", "qos_client_send_two_msg_and_destroy", send_two_UI_
     QOS_CLASS_BACKGROUND, "background",
     QOS_CLASS_MAINTENANCE, "maintenance",
     QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion")
+
 /*
  * Test 11: test sending two ports with chaining
  *
@@ -1689,7 +1865,29 @@ TEST_QOS("server_kevent_id", "qos_client_send_complex_msg_with_pri", send_comple
     QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion")
 
 /*
- * Test 12 - 19
+ * Test 12: test sending two ports with chaining and handoff
+ *
+ * Send a sync IPC to a connection port, which itself is embedded in a message
+ * sent as a sync IPC to a service port.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_complex_msg_with_pri", send_complex_sync_UI_and_enable_and_handoff, "workloop_cb_test_sync_send_and_enable_handoff",
+    QOS_CLASS_USER_INITIATED, "user initiated",
+    QOS_CLASS_USER_INITIATED, "user initiated",
+    QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion")
+
+/*
+ * Test 13: test changing qos of a thread to trigger turnstile push
+ *
+ * Send a sync IPC to a service port and change the qos of the blocked thread
+ * to verify that changing qos triggers a turnstile push.
+ */
+TEST_QOS("server_kevent_id", "qos_client_sync_rcv_qos_change", qos_change_to_IN, "workloop_cb_test_intransit",
+    QOS_CLASS_DEFAULT, "default",
+    QOS_CLASS_MAINTENANCE, "maintenance",
+    QOS_CLASS_USER_INITIATED, "user initiated")
+
+/*
+ * Test 14 - 21
  *
  * Test single sync ipc link with server that breaks/preserves the link in different ways.
  */
@@ -1732,8 +1930,9 @@ TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_s
     QOS_CLASS_DEFAULT, "default",
     QOS_CLASS_DEFAULT, "default",
     QOS_CLASS_DEFAULT, "default")
+
 /*
- * Test 20 - 23
+ * Test 22 - 25
  *
  * Test sequential sync ipc link with server that breaks/preserves the link.
  */
index 1f74ada51b34c15eb3c2b6b0839f7855db0a4ca8..62b87e68e9ae85be9a3335d670b66c2cd5bc9ed2 100644 (file)
@@ -1,20 +1,21 @@
+/* Copyright (c) 2018 Apple Inc.  All rights reserved. */
+
 #include <darwintest.h>
 #include <inttypes.h>
 #include <stdint.h>
+#include <sys/sysctl.h>
 
 #include <kperf/kpc.h>
 
-T_DECL(fixed_counters,
-    "test that fixed counters return monotonically increasing values",
-    T_META_ASROOT(YES))
-{
-       T_SKIP("unimplemented");
-}
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.ktrace"),
+       T_META_ASROOT(true),
+       T_META_CHECK_LEAKS(false));
 
 T_DECL(fixed_thread_counters,
-    "test that fixed thread counters return monotonically increasing values",
-    T_META_ASROOT(YES))
+    "test that fixed thread counters return monotonically increasing values")
 {
        int err;
        uint32_t ctrs_cnt;
        uint64_t *ctrs_a;
@@ -66,3 +67,66 @@ T_DECL(fixed_thread_counters,
        free(ctrs_a);
        free(ctrs_b);
 }
+
+#if defined(__arm64__)
+/*
+ * This policy only applies to arm64 devices.
+ */
+
+static int g_prev_disablewl = 0;
+
+static void
+whitelist_atend(void)
+{
+       int ret = sysctlbyname("kpc.disable_whitelist", NULL, NULL,
+           &g_prev_disablewl, sizeof(g_prev_disablewl));
+       if (ret < 0) {
+               T_LOG("failed to reset whitelist: %d (%s)", errno, strerror(errno));
+       }
+}
+
+T_DECL(whitelist, "ensure kpc's whitelist is filled out")
+{
+       /* Start enforcing the whitelist. */
+       int set = 0;
+       size_t getsz = sizeof(g_prev_disablewl);
+       int ret = sysctlbyname("kpc.disable_whitelist", &g_prev_disablewl, &getsz,
+           &set, sizeof(set));
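+       /* Save the previous value so whitelist_atend() can restore it on exit. */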
+       if (ret < 0 && errno == ENOENT) {
+               T_SKIP("kpc not running with a whitelist, or RELEASE kernel");
+       }
+
+       T_ASSERT_POSIX_SUCCESS(ret, "started enforcing the event whitelist");
+       T_ATEND(whitelist_atend);
+
+       uint32_t nconfigs = kpc_get_config_count(KPC_CLASS_CONFIGURABLE_MASK);
+       uint64_t *config = calloc(nconfigs, sizeof(*config));
+
+       /*
+        * Check that events in the whitelist are allowed.  CORE_CYCLE (0x2) is
+        * always present in the whitelist.
+        */
+       config[0] = 0x02;
+       ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config);
+       T_ASSERT_POSIX_SUCCESS(ret, "configured kpc to count cycles");
+
+       /* Check that non-event bits are ignored by the whitelist. */
+       config[0] = 0x102;
+       ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config);
+       T_ASSERT_POSIX_SUCCESS(ret,
+           "configured kpc to count cycles with non-event bits set");
+
+       /* Check that configurations of non-whitelisted events fail. */
+       config[0] = 0xfe;
+       ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config);
+       T_ASSERT_POSIX_FAILURE(ret, EPERM,
+           "shouldn't allow arbitrary events with whitelist enabled");
+
+       /* Clean up the configuration. */
+       config[0] = 0;
+       (void)kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config);
+
+       free(config);
+}
+
+#endif /* defined(__arm64__) */
index 0c6684ae291e12a6e2ca52c13d5da52dae12ad5d..29ceeab7d2788f5d77bb8778b44027b4504a8323 100644 (file)
@@ -9,6 +9,7 @@
 #include <ktrace/session.h>
 #include <ktrace/private.h>
 #include <System/sys/kdebug.h>
+#include <kperf/kpc.h>
 #include <kperf/kperf.h>
 #include <kperfdata/kpdecode.h>
 #include <os/assumes.h>
@@ -16,6 +17,7 @@
 #include <sys/sysctl.h>
 
 #include "kperf_helpers.h"
+#include "ktrace_helpers.h"
 
 T_GLOBAL_META(
        T_META_NAMESPACE("xnu.kperf"),
@@ -40,12 +42,16 @@ spinning_thread(void *semp)
        return NULL;
 }
 
-#define PERF_STK_KHDR  UINT32_C(0x25020014)
-#define PERF_STK_UHDR  UINT32_C(0x25020018)
-#define PERF_TMR_FIRE  KDBG_EVENTID(DBG_PERF, 3, 0)
-#define PERF_TMR_HNDLR KDBG_EVENTID(DBG_PERF, 3, 2)
-#define PERF_TMR_PEND  KDBG_EVENTID(DBG_PERF, 3, 3)
-#define PERF_TMR_SKIP  KDBG_EVENTID(DBG_PERF, 3, 4)
+#define PERF_STK_KHDR   UINT32_C(0x25020014)
+#define PERF_STK_UHDR   UINT32_C(0x25020018)
+#define PERF_TMR_FIRE   KDBG_EVENTID(DBG_PERF, 3, 0)
+#define PERF_TMR_HNDLR  KDBG_EVENTID(DBG_PERF, 3, 2)
+#define PERF_TMR_PEND   KDBG_EVENTID(DBG_PERF, 3, 3)
+#define PERF_TMR_SKIP   KDBG_EVENTID(DBG_PERF, 3, 4)
+#define PERF_KPC_CONFIG KDBG_EVENTID(DBG_PERF, 6, 4)
+#define PERF_KPC_REG    KDBG_EVENTID(DBG_PERF, 6, 5)
+#define PERF_KPC_REG32  KDBG_EVENTID(DBG_PERF, 6, 7)
+#define PERF_INSTR_DATA KDBG_EVENTID(DBG_PERF, 1, 17)
 
 #define SCHED_HANDOFF KDBG_EVENTID(DBG_MACH, DBG_MACH_SCHED, \
                MACH_STACK_HANDOFF)
@@ -59,12 +65,6 @@ spinning_thread(void *semp)
 
 #define TIMER_PERIOD_NS (1 * NSEC_PER_MSEC)
 
-static void
-reset_ktrace(void)
-{
-       kperf_reset();
-}
-
 /*
  * Ensure that kperf is correctly IPIing CPUs that are actively scheduling by
  * bringing up threads and ensuring that threads on-core are sampled by each
@@ -74,6 +74,8 @@ reset_ktrace(void)
 T_DECL(ipi_active_cpus,
     "make sure that kperf IPIs all active CPUs")
 {
+       start_controlling_ktrace();
+
        int ncpus = dt_ncpu();
        T_QUIET;
        T_ASSERT_LT(ncpus, MAX_CPUS,
@@ -282,7 +284,6 @@ T_DECL(ipi_active_cpus,
        T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL);
 
        T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), "start kperf sampling");
-       T_ATEND(reset_ktrace);
 
        T_ASSERT_POSIX_ZERO(ktrace_start(s,
            dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0)),
@@ -297,9 +298,9 @@ T_DECL(ipi_active_cpus,
 
 #define KDEBUG_TRIGGER_TIMEOUT_NS (10 * NSEC_PER_SEC)
 
-#define NON_TRIGGER_CLASS    UINT8_C(0xfd)
-#define NON_TRIGGER_SUBCLASS UINT8_C(0xff)
-#define NON_TRIGGER_CODE     UINT8_C(0xff)
+#define NON_TRIGGER_CLASS    UINT32_C(0xfd)
+#define NON_TRIGGER_SUBCLASS UINT32_C(0xff)
+#define NON_TRIGGER_CODE     UINT32_C(0xff)
 
 #define NON_TRIGGER_EVENT \
                (KDBG_EVENTID(NON_TRIGGER_CLASS, NON_TRIGGER_SUBCLASS, \
@@ -319,13 +320,13 @@ expect_kdebug_trigger(const char *filter_desc, const uint32_t *debugids,
 
        ktrace_events_single(s, PERF_STK_KHDR, ^(struct trace_point *tp) {
                missing_kernel_stacks--;
-               T_LOG("saw kernel stack with %lu frames, flags = %#lx", tp->arg2,
-               tp->arg1);
+               T_LOG("saw kernel stack with %" PRIu64 " frames, flags = %#"
+               PRIx64, tp->arg2, tp->arg1);
        });
        ktrace_events_single(s, PERF_STK_UHDR, ^(struct trace_point *tp) {
                missing_user_stacks--;
-               T_LOG("saw user stack with %lu frames, flags = %#lx", tp->arg2,
-               tp->arg1);
+               T_LOG("saw user stack with %" PRIu64 " frames, flags = %#"
+               PRIx64, tp->arg2, tp->arg1);
        });
 
        for (unsigned int i = 0; i < n_debugids; i++) {
@@ -386,16 +387,18 @@ expect_kdebug_trigger(const char *filter_desc, const uint32_t *debugids,
        });
 }
 
-#define TRIGGER_CLASS     UINT8_C(0xfe)
-#define TRIGGER_CLASS_END UINT8_C(0xfd)
-#define TRIGGER_SUBCLASS  UINT8_C(0xff)
-#define TRIGGER_CODE      UINT8_C(0)
+#define TRIGGER_CLASS     UINT32_C(0xfe)
+#define TRIGGER_CLASS_END UINT32_C(0xfd)
+#define TRIGGER_SUBCLASS  UINT32_C(0xff)
+#define TRIGGER_CODE      UINT32_C(0)
 #define TRIGGER_DEBUGID \
                (KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, TRIGGER_CODE))
 
 T_DECL(kdebug_trigger_classes,
     "test that kdebug trigger samples on classes")
 {
+       start_controlling_ktrace();
+
        const uint32_t class_debugids[] = {
                KDBG_EVENTID(TRIGGER_CLASS, 1, 1),
                KDBG_EVENTID(TRIGGER_CLASS, 2, 1),
@@ -411,6 +414,8 @@ T_DECL(kdebug_trigger_classes,
 T_DECL(kdebug_trigger_subclasses,
     "test that kdebug trigger samples on subclasses")
 {
+       start_controlling_ktrace();
+
        const uint32_t subclass_debugids[] = {
                KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, 0),
                KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, 1),
@@ -426,6 +431,8 @@ T_DECL(kdebug_trigger_subclasses,
 T_DECL(kdebug_trigger_debugids,
     "test that kdebug trigger samples on debugids")
 {
+       start_controlling_ktrace();
+
        const uint32_t debugids[] = {
                TRIGGER_DEBUGID
        };
@@ -440,9 +447,17 @@ T_DECL(kdebug_trigger_debugids,
  * events from that class.
  */
 
+static void
+reset_kperf(void)
+{
+       (void)kperf_reset();
+}
+
 T_DECL(kdbg_callstacks,
     "test that the kdbg_callstacks samples on syscalls")
 {
+       start_controlling_ktrace();
+
        ktrace_session_t s;
        __block bool saw_user_stack = false;
 
@@ -471,7 +486,7 @@ T_DECL(kdbg_callstacks,
 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
        T_ASSERT_POSIX_SUCCESS(kperf_kdbg_callstacks_set(1), NULL);
 #pragma clang diagnostic pop
-       T_ATEND(kperf_reset);
+       T_ATEND(reset_kperf);
 
        T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
 
@@ -526,6 +541,8 @@ expect_stacks_traced(void (^cb)(void))
 
 T_DECL(pet, "test that PET mode samples kernel and user stacks")
 {
+       start_controlling_ktrace();
+
        configure_kperf_stacks_timer(-1, 10);
        T_ASSERT_POSIX_SUCCESS(kperf_timer_pet_set(0), NULL);
 
@@ -540,6 +557,8 @@ T_DECL(lightweight_pet,
     "test that lightweight PET mode samples kernel and user stacks",
     T_META_ASROOT(true))
 {
+       start_controlling_ktrace();
+
        int set = 1;
 
        configure_kperf_stacks_timer(-1, 10);
@@ -556,6 +575,8 @@ T_DECL(lightweight_pet,
 
 T_DECL(pet_stress, "repeatedly enable and disable PET mode")
 {
+       start_controlling_ktrace();
+
        int niters = 1000;
        while (niters--) {
                configure_kperf_stacks_timer(-1, 10);
@@ -568,6 +589,8 @@ T_DECL(pet_stress, "repeatedly enable and disable PET mode")
 
 T_DECL(timer_stress, "repeatedly enable and disable timers")
 {
+       start_controlling_ktrace();
+
        int niters = 1000;
        while (niters--) {
                configure_kperf_stacks_timer(-1, 1);
@@ -576,3 +599,153 @@ T_DECL(timer_stress, "repeatedly enable and disable timers")
        }
 }
+
+T_DECL(pmc_config_only, "shouldn't show PMC config events unless requested")
+{
+       start_controlling_ktrace();
+
+       __block bool saw_kpc_config = false;
+       __block bool saw_kpc_reg = false;
+
+       ktrace_session_t s = ktrace_session_create();
+       T_ASSERT_NOTNULL(s, "ktrace_session_create");
+
+       /*
+        * Watch for the KPC configuration and register events, which should
+        * only be emitted while the PMC sampler is enabled.
+        */
+       ktrace_events_single(s, PERF_KPC_CONFIG,
+           ^(__unused struct trace_point *tp) {
+               saw_kpc_config = true;
+       });
+       ktrace_events_single(s, PERF_KPC_REG,
+           ^(__unused struct trace_point *tp) {
+               saw_kpc_reg = true;
+       });
+       ktrace_events_single(s, PERF_KPC_REG32,
+           ^(__unused struct trace_point *tp) {
+               saw_kpc_reg = true;
+       });
+
+       ktrace_set_completion_handler(s, ^{
+               ktrace_session_destroy(s);
+               T_EXPECT_FALSE(saw_kpc_config,
+               "should see no KPC configs without sampler enabled");
+               T_EXPECT_FALSE(saw_kpc_reg,
+               "should see no KPC registers without sampler enabled");
+               T_END;
+       });
+
+       uint32_t nconfigs = kpc_get_config_count(KPC_CLASS_CONFIGURABLE_MASK);
+       uint64_t *config = calloc(nconfigs, sizeof(*config));
+       config[0] = 0x02;
+       int ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config);
+       T_ASSERT_POSIX_SUCCESS(ret, "configured kpc");
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kpc_set_counting(KPC_CLASS_CONFIGURABLE_MASK),
+           "kpc_set_counting");
+
+       (void)kperf_action_count_set(1);
+       T_ATEND(reset_kperf);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1, KPERF_SAMPLER_PMC_CPU),
+           NULL);
+
+       (void)kperf_timer_count_set(1);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kperf_timer_period_set(0,
+           kperf_ns_to_ticks(TIMER_PERIOD_NS)), NULL);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL);
+
+       T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), "start kperf sampling");
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 10 * NSEC_PER_SEC),
+           dispatch_get_main_queue(), ^(void) {
+               ktrace_end(s, 1);
+       });
+
+       dispatch_main();
+}
+
+static void
+skip_if_monotonic_unsupported(void)
+{
+       int r;
+       int supported = 0;
+       size_t supported_size = sizeof(supported);
+
+       r = sysctlbyname("kern.monotonic.supported", &supported, &supported_size,
+           NULL, 0);
+       if (r < 0) {
+               T_WITH_ERRNO;
+               T_SKIP("could not find \"kern.monotonic.supported\" sysctl");
+       }
+
+       if (!supported) {
+               T_SKIP("monotonic is not supported on this platform");
+       }
+}
+
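+/*
+ * End the trace once the upper bound of samples is seen; require at least the
+ * lower bound for the test to pass.
+ */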
+#define INSTRS_CYCLES_UPPER 500
+#define INSTRS_CYCLES_LOWER 50
+
+T_DECL(instrs_cycles, "ensure instructions and cycles are sampled")
+{
+       skip_if_monotonic_unsupported();
+
+       start_controlling_ktrace();
+
+       ktrace_session_t sess = ktrace_session_create();
+
+       __block uint64_t ninstrs_cycles = 0;
+       __block uint64_t nzeroes = 0;
+       ktrace_events_single(sess, PERF_INSTR_DATA,
+           ^(__unused struct trace_point *tp) {
+               ninstrs_cycles++;
+               if (tp->arg1 == 0) {
+                       T_LOG("%llx (%s)\n", tp->threadid, tp->command);
+                       nzeroes++;
+               }
+               if (ninstrs_cycles >= INSTRS_CYCLES_UPPER) {
+                       ktrace_end(sess, 1);
+               }
+       });
+
+       ktrace_set_collection_interval(sess, 200);
+
+       ktrace_set_completion_handler(sess, ^{
+               T_EXPECT_GE(ninstrs_cycles, (uint64_t)INSTRS_CYCLES_LOWER,
+                   "saw enough instructions and cycles events");
+               T_EXPECT_EQ(nzeroes, UINT64_C(0),
+                   "saw no events with 0 instructions");
+               T_END;
+       });
+
+       (void)kperf_action_count_set(1);
+       T_ATEND(reset_kperf);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1,
+           KPERF_SAMPLER_TH_INSTRS_CYCLES), NULL);
+
+       (void)kperf_timer_count_set(1);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kperf_timer_period_set(0,
+           kperf_ns_to_ticks(TIMER_PERIOD_NS)), NULL);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL);
+
+       T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), "start kperf sampling");
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(sess, dispatch_get_main_queue()),
+           NULL);
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 10 * NSEC_PER_SEC),
+           dispatch_get_main_queue(), ^(void) {
+               ktrace_end(sess, 1);
+       });
+
+       dispatch_main();
+}
+
index a586569c3ced33196d73227d3f885e5d0427470c..5c5e3dcfcd00e183c6a1cf6821ac9626b17fabe8 100644 (file)
@@ -7,18 +7,23 @@
 #include <dispatch/dispatch.h>
 #include <kperf/kperf.h>
 #include <ktrace/session.h>
+#include <ktrace/private.h>
 #include <System/sys/kdebug.h>
 #include <pthread.h>
 
 #include "kperf_helpers.h"
+#include "ktrace_helpers.h"
 
 #define PERF_STK_KHDR  UINT32_C(0x25020014)
 #define PERF_STK_UHDR  UINT32_C(0x25020018)
 #define PERF_STK_KDATA UINT32_C(0x2502000c)
 #define PERF_STK_UDATA UINT32_C(0x25020010)
 
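+/* Flag bits kperf reports in arg1 of the PERF_STK_{K,U}HDR events. */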
+#define CALLSTACK_VALID 0x1
+#define CALLSTACK_TRUNCATED 0x10
+
 T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.kperf"),
+       T_META_NAMESPACE("xnu.ktrace"),
        T_META_CHECK_LEAKS(false));
 
 static void
@@ -29,12 +34,14 @@ expect_frame(const char **bt, unsigned int bt_len, CSSymbolRef symbol,
        unsigned int frame_idx = max_frames - bt_idx - 1;
 
        if (!bt[frame_idx]) {
-               T_LOG("frame %2u: skipping system frame", frame_idx);
+               T_LOG("frame %2u: skipping system frame '%s'", frame_idx,
+                   CSSymbolGetName(symbol));
                return;
        }
 
        if (CSIsNull(symbol)) {
-               T_FAIL("invalid symbol for address %#lx at frame %d", addr, frame_idx);
+               T_FAIL("invalid symbol for address %#lx at frame %d", addr,
+                   frame_idx);
                return;
        }
 
@@ -105,11 +112,11 @@ expect_backtrace(ktrace_session_t s, uint64_t tid, unsigned int *stacks_seen,
                        return;
                }
 
-               T_LOG("found stack from thread %#lx", tp->threadid);
+               T_LOG("found stack from thread %#" PRIx64, tp->threadid);
                stacks++;
                if (!(tp->arg1 & 1)) {
-                       T_FAIL("invalid %s stack on thread %#lx", kern ? "kernel" : "user",
-                       tp->threadid);
+                       T_FAIL("invalid %s stack on thread %#" PRIx64,
+                       kern ? "kernel" : "user", tp->threadid);
                        return;
                }
 
@@ -209,26 +216,36 @@ recurse_b(dispatch_semaphore_t spinning, unsigned int frames)
        return recurse_a(spinning, frames - 1) + 1;
 }
 
-#define USER_FRAMES       (12)
+#define USER_FRAMES (12)
 
 #if defined(__x86_64__)
-#define RECURSE_START_OFFSET (4)
-#else /* defined(__x86_64__) */
+
 #define RECURSE_START_OFFSET (3)
-#endif /* defined(__x86_64__) */
+
+#else /* defined(__x86_64__) */
+
+#define RECURSE_START_OFFSET (2)
+
+#endif /* !defined(__x86_64__) */
 
 static const char *user_bt[USER_FRAMES] = {
 #if defined(__x86_64__)
+       /*
+        * x86_64 has an extra "thread_start" frame here.
+        */
        NULL,
 #endif /* defined(__x86_64__) */
-       NULL, NULL,
+       NULL,
        "backtrace_thread",
        "recurse_a", "recurse_b", "recurse_a", "recurse_b",
-       "recurse_a", "recurse_b", "recurse_a",
+       "recurse_a", "recurse_b", "recurse_a", "recurse_b",
 #if !defined(__x86_64__)
-       "recurse_b",
+       /*
+        * Pick up the slack to make the number of frames constant.
+        */
+       "recurse_a",
 #endif /* !defined(__x86_64__) */
-       NULL
+       NULL,
 };
 
 #if defined(__arm__)
@@ -300,7 +317,8 @@ backtrace_thread(void *arg)
 }
 
 static uint64_t
-create_backtrace_thread(dispatch_semaphore_t notify_spinning)
+create_backtrace_thread(void *(*thread_fn)(void *),
+    dispatch_semaphore_t notify_spinning)
 {
        pthread_t thread = NULL;
        uint64_t tid;
@@ -315,7 +333,7 @@ create_backtrace_thread(dispatch_semaphore_t notify_spinning)
                }
        });
 
-       T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, backtrace_thread,
+       T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, thread_fn,
            (void *)notify_spinning), NULL);
        T_QUIET; T_ASSERT_NOTNULL(thread, "backtrace thread created");
        dispatch_semaphore_wait(backtrace_started, DISPATCH_TIME_FOREVER);
@@ -343,7 +361,7 @@ start_backtrace_thread(void)
 #define TEST_TIMEOUT_NS (5 * NSEC_PER_SEC)
 #endif /* !TARGET_OS_WATCH */
 
-T_DECL(backtraces_kdebug_trigger,
+T_DECL(kdebug_trigger,
     "test that backtraces from kdebug trigger are correct",
     T_META_ASROOT(true))
 {
@@ -352,12 +370,16 @@ T_DECL(backtraces_kdebug_trigger,
        kperf_kdebug_filter_t filter;
        uint64_t tid;
 
+       start_controlling_ktrace();
+
        s = ktrace_session_create();
        T_ASSERT_NOTNULL(s, "ktrace session was created");
 
+       ktrace_set_collection_interval(s, 100);
+
        T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL);
 
-       tid = create_backtrace_thread(NULL);
+       tid = create_backtrace_thread(backtrace_thread, NULL);
        expect_backtrace(s, tid, &stacks_seen, false, user_bt, USER_FRAMES, 0);
        expect_backtrace(s, tid, &stacks_seen, true, kernel_bt, KERNEL_FRAMES, 0);
 
@@ -403,7 +425,7 @@ T_DECL(backtraces_kdebug_trigger,
        dispatch_main();
 }
 
-T_DECL(backtraces_user_timer,
+T_DECL(user_timer,
     "test that user backtraces on a timer are correct",
     T_META_ASROOT(true))
 {
@@ -412,14 +434,18 @@ T_DECL(backtraces_user_timer,
        uint64_t tid;
        dispatch_semaphore_t wait_for_spinning = dispatch_semaphore_create(0);
 
+       start_controlling_ktrace();
+
        s = ktrace_session_create();
        T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
 
+       ktrace_set_collection_interval(s, 100);
+
        ktrace_filter_pid(s, getpid());
 
        configure_kperf_stacks_timer(getpid(), 10);
 
-       tid = create_backtrace_thread(wait_for_spinning);
+       tid = create_backtrace_thread(backtrace_thread, wait_for_spinning);
        /* potentially calling dispatch function and system call */
        expect_backtrace(s, tid, &stacks_seen, false, user_bt, USER_FRAMES - 1, 2);
 
@@ -447,7 +473,144 @@ T_DECL(backtraces_user_timer,
        dispatch_main();
 }
 
+static volatile bool spin = true;
+
+__attribute__((noinline, not_tail_called))
+static void
+recurse_spin(dispatch_semaphore_t notify_sema, int depth)
+{
+       if (depth > 0) {
+               recurse_spin(notify_sema, depth - 1);
+       } else {
+               dispatch_semaphore_signal(notify_sema);
+               while (spin);
+       }
+}
+
+static void *
+spin_thread(void *arg)
+{
+       dispatch_semaphore_t notify_sema = arg;
+       dispatch_semaphore_signal(backtrace_started);
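+       /* Recurse past the 256-frame capture limit so the callstack truncates. */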
+       recurse_spin(notify_sema, 257);
+       return NULL;
+}
+
+T_DECL(truncated_user_stacks, "ensure stacks are marked as truncated")
+{
+       start_controlling_ktrace();
+
+       ktrace_session_t s = ktrace_session_create();
+       T_ASSERT_NOTNULL(s, "ktrace session was created");
+
+       ktrace_set_collection_interval(s, 100);
+
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL);
+
+       configure_kperf_stacks_timer(getpid(), 10);
+
+       __block bool saw_stack = false;
+       ktrace_set_completion_handler(s, ^{
+           T_EXPECT_TRUE(saw_stack, "saw the user stack");
+           T_END;
+       });
+
+       dispatch_semaphore_t notify_sema = dispatch_semaphore_create(0);
+       uint64_t tid = create_backtrace_thread(spin_thread, notify_sema);
+
+       ktrace_events_single(s, PERF_STK_UHDR, ^(struct trace_point *tp) {
+               if (tp->threadid != tid) {
+                       return;
+               }
+               T_LOG("found %llu frame stack", tp->arg2);
+               T_EXPECT_BITS_SET(tp->arg1, CALLSTACK_VALID,
+                   "found valid callstack");
+               T_EXPECT_BITS_SET(tp->arg1, CALLSTACK_TRUNCATED,
+                   "found truncated callstack");
+               saw_stack = true;
+               ktrace_end(s, 1);
+       });
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()),
+           "start tracing");
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT_NS),
+           dispatch_get_main_queue(), ^(void)
+       {
+               T_LOG("ending test after timeout");
+               ktrace_end(s, 0);
+       });
+
+       dispatch_main();
+}
+
+T_DECL(max_user_stacks, "ensure stacks up to 256 frames can be captured")
+{
+       start_controlling_ktrace();
+
+       ktrace_session_t s = ktrace_session_create();
+       T_ASSERT_NOTNULL(s, "ktrace session was created");
+
+       ktrace_set_collection_interval(s, 100);
+
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL);
+
+       configure_kperf_stacks_timer(getpid(), 10);
+
+       __block bool saw_stack = false;
+       __block bool saw_stack_data = false;
+       __block uint64_t nevents = 0;
+       ktrace_set_completion_handler(s, ^{
+           T_EXPECT_TRUE(saw_stack, "saw the user stack");
+           T_LOG("saw %" PRIu64 " stack data events", nevents);
+           T_EXPECT_TRUE(saw_stack_data, "saw all frames of the user stack");
+           T_END;
+       });
+
+       dispatch_semaphore_t notify_sema = dispatch_semaphore_create(0);
+       uint64_t tid = create_backtrace_thread(spin_thread, notify_sema);
+
+       ktrace_events_single(s, PERF_STK_UHDR, ^(struct trace_point *tp) {
+               if (tp->threadid != tid) {
+                       return;
+               }
+               T_LOG("found %llu frame stack", tp->arg2);
+               T_EXPECT_BITS_SET(tp->arg1, CALLSTACK_VALID,
+                   "found valid callstack");
+               T_EXPECT_EQ(tp->arg2, UINT64_C(256),
+                   "found the correct number of frames");
+               saw_stack = true;
+       });
+
+       ktrace_events_single(s, PERF_STK_UDATA, ^(struct trace_point *tp) {
+               if (tp->threadid != tid && !saw_stack) {
+                       return;
+               }
+               nevents++;
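+               /* Each PERF_STK_UDATA event carries four frame addresses. */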
+               if (nevents == 256 / 4) {
+                       ktrace_end(s, 1);
+               }
+               saw_stack_data = true;
+       });
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()),
+           "start tracing");
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT_NS),
+           dispatch_get_main_queue(), ^(void)
+       {
+               T_LOG("ending test after timeout");
+               ktrace_end(s, 0);
+       });
+
+       dispatch_main();
+}
+
 /* TODO test kernel stacks in all modes */
 /* TODO legacy PET mode backtracing */
-/* TODO test deep stacks, further than 128 frames, make sure they are truncated */
-/* TODO test constrained stacks */
index 8dded5ed37cafce7a377d8331d42baaa5624ab48..ec3bb300376707141b6b9cef10afc680114707d6 100644 (file)
@@ -3,6 +3,8 @@
 #include <sys/event.h>
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 /* <rdar://problem/28139044> EVFILT_USER doesn't properly support add&fire atomic combination
  *
 * Check that using EV_ADD and EV_TRIGGER on an EV_USER actually triggers the event just added.
index 5678d325174ab0253da6be915c69350975b8f6c8..4937898d9a4dfff82defe110551722c7193c5b77 100644 (file)
@@ -6,6 +6,8 @@
 
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 /*
  * <rdar://problem/30231213> close() of kqueue FD races with kqueue_scan park
  *
index d2a285d7c8b6d095a6d24ba03e56cfac18038112..40a1e2719d40eccf6813553c962a76e92b87dbc0 100644 (file)
@@ -13,6 +13,8 @@
 
 #include <TargetConditionals.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define TMP_FILE_PATH "/tmp/test_kqueue_fifo_18776047"
 
 #define READ_BUFFER_LEN 256
diff --git a/tests/ktrace_helpers.h b/tests/ktrace_helpers.h
new file mode 100644 (file)
index 0000000..05191cb
--- /dev/null
@@ -0,0 +1,59 @@
+#ifndef KTRACE_HELPERS_H
+#define KTRACE_HELPERS_H
+
+#include <darwintest.h>
+#include <libproc.h>
+#include <sys/sysctl.h>
+#include <System/sys/kdebug.h>
+
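+/*
+ * Remove any existing kdebug configuration and reset kperf so earlier
+ * sessions cannot interfere with this test.
+ */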
+static inline void
+reset_ktrace(void)
+{
+       (void)sysctl((int[]){ CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE }, 3,
+           NULL, 0, NULL, 0);
+       kperf_reset();
+}
+
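+/*
+ * Take ownership of ktrace for this test, killing any foreground owner
+ * (such as a stray trace(1) session) that currently holds it.
+ */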
+static inline void
+start_controlling_ktrace(void)
+{
+       T_SETUPBEGIN;
+
+       int state = 0;
+       size_t statesz = sizeof(state);
+       int ret = sysctlbyname("ktrace.state", &state, &statesz, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "getting ktrace state");
+
+       if (state == 1) {
+               int ownerpid = 0;
+               size_t pidsz = sizeof(ownerpid);
+               ret = sysctlbyname("ktrace.owning_pid", &ownerpid, &pidsz, NULL, 0);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "getting owning pid");
+
+               if (ownerpid <= 0) {
+                       T_LOG("ktrace is in foreground, but no owner");
+                       goto out;
+               }
+
+               char ownername[1024];
+               ret = proc_name(ownerpid, ownername, sizeof(ownername));
+               if (ret == 0) {
+                       T_LOG("ktrace is in foreground, but owner (%d) has no name", ownerpid);
+                       goto out;
+               }
+
+               T_LOG("ktrace is in foreground, owned by %s, sending SIGKILL", ownername);
+               kill(ownerpid, SIGKILL);
+               usleep(500000);
+
+               ret = proc_name(ownerpid, ownername, sizeof(ownername));
+               T_QUIET; T_ASSERT_EQ(ret, 0, "should have killed ktrace owner");
+       }
+
+out:
+       reset_ktrace();
+       T_ATEND(reset_ktrace);
+       T_SETUPEND;
+}
+
+#endif /* !defined(KTRACE_HELPERS_H) */
index 3f2378e3157237a20f4311b18ed716763e06ad36..e6261ddae850aec86e8270a4a823be4522b7cd31 100644 (file)
@@ -141,13 +141,10 @@ T_GLOBAL_META(
  * a robust implementation should determine the proper range to use via
  * another means.
  */
-#define FIXED_STACK_ADDR ((uintptr_t)0x10000000)        /* must be page-aligned */
 #ifndef STANDALONE
 /* libdarwintest needs LOTs of stack */
 #endif
 #define FIXED_STACK_SIZE (PAGE_SIZE * 16)
-
-#define FIXED_TRAMP_ADDR (FIXED_STACK_ADDR + FIXED_STACK_SIZE + PAGE_SIZE)
 #define FIXED_TRAMP_MAXLEN (PAGE_SIZE * 8)
 
 #pragma pack(1)
@@ -185,7 +182,8 @@ static far_call_t input_desc = { .seg = COMPAT_MODE_CS_SELECTOR, .off = 0 };
 static uint64_t stackAddr = 0;
 static compat_tramp_t thunkit = NULL;
 static uint64_t thunk64_addr;
-static stackaddr_to_gsbase_t stack2gs[] = { { FIXED_STACK_ADDR, FIXED_STACK_ADDR + FIXED_STACK_SIZE, 0 } };
+/* stack2gs[0] is initialized in map_lowmem_stack() */
+static stackaddr_to_gsbase_t stack2gs[] = { { 0 } };
 
 extern int compat_mode_trampoline(far_call_t *, void *, uint64_t);
 extern void long_mode_trampoline(void);
@@ -303,9 +301,9 @@ handle_arithmetic_exception(_STRUCT_X86_THREAD_FULL_STATE64 *xtfs64, uint64_t *i
 {
        fprintf(stderr, "Caught divide-error exception\n");
        fprintf(stderr, "cs=0x%x rip=0x%x gs=0x%x ss=0x%x rsp=0x%llx\n",
-           (unsigned)xtfs64->ss64.__cs,
-           (unsigned)xtfs64->ss64.__rip, (unsigned)xtfs64->ss64.__gs,
-           (unsigned)xtfs64->__ss, xtfs64->ss64.__rsp);
+           (unsigned)xtfs64->__ss64.__cs,
+           (unsigned)xtfs64->__ss64.__rip, (unsigned)xtfs64->__ss64.__gs,
+           (unsigned)xtfs64->__ss, xtfs64->__ss64.__rsp);
        *ip_skip_countp = 2;
 }
 
@@ -320,9 +318,9 @@ handle_badinsn_exception(_STRUCT_X86_THREAD_FULL_STATE64 *xtfs64, uint64_t __unu
 
        fprintf(stderr, "Caught invalid opcode exception\n");
        fprintf(stderr, "cs=%x rip=%x gs=%x ss=0x%x rsp=0x%llx | handling between 0x%llx and 0x%llx\n",
-           (unsigned)xtfs64->ss64.__cs,
-           (unsigned)xtfs64->ss64.__rip, (unsigned)xtfs64->ss64.__gs,
-           (unsigned)xtfs64->__ss, xtfs64->ss64.__rsp,
+           (unsigned)xtfs64->__ss64.__cs,
+           (unsigned)xtfs64->__ss64.__rip, (unsigned)xtfs64->__ss64.__gs,
+           (unsigned)xtfs64->__ss, xtfs64->__ss64.__rsp,
            start_addr, end_addr);
 
        /*
@@ -334,14 +332,14 @@ handle_badinsn_exception(_STRUCT_X86_THREAD_FULL_STATE64 *xtfs64, uint64_t __unu
         * (Note that due to the way the invalid opcode indication was implemented,
         * %rip is already set to the next instruction.)
         */
-       if (xtfs64->ss64.__rip >= start_addr && xtfs64->ss64.__rip <= end_addr) {
+       if (xtfs64->__ss64.__rip >= start_addr && xtfs64->__ss64.__rip <= end_addr) {
                /*
                 * On return from the failed sysenter, %cs is changed to the
                 * sysenter code selector and %ss is set to 0x23, so switch them
                 * back to sane values.
                 */
-               if ((unsigned)xtfs64->ss64.__cs == SYSENTER_SELECTOR) {
-                       xtfs64->ss64.__cs = COMPAT_MODE_CS_SELECTOR;
+               if ((unsigned)xtfs64->__ss64.__cs == SYSENTER_SELECTOR) {
+                       xtfs64->__ss64.__cs = COMPAT_MODE_CS_SELECTOR;
                        xtfs64->__ss = 0x23; /* XXX */
                }
        }
@@ -393,8 +391,8 @@ catch_mach_exception_raise_state_identity(mach_port_t exception_port,
        default:
                fprintf(stderr, "Unsupported catch_mach_exception_raise_state_identity: code 0x%llx sub 0x%llx\n",
                    code[0], codeCnt > 1 ? code[1] : 0LL);
-               fprintf(stderr, "flavor=%d %%cs=0x%x %%rip=0x%llx\n", *flavor, (unsigned)xtfs64->ss64.__cs,
-                   xtfs64->ss64.__rip);
+               fprintf(stderr, "flavor=%d %%cs=0x%x %%rip=0x%llx\n", *flavor, (unsigned)xtfs64->__ss64.__cs,
+                   xtfs64->__ss64.__rip);
        }
 
        /*
@@ -403,12 +401,12 @@ catch_mach_exception_raise_state_identity(mach_port_t exception_port,
         * new state's cs register to just after the div instruction
         * to enable the thread to resume.
         */
-       if ((unsigned)xtfs64->ss64.__cs == COMPAT_MODE_CS_SELECTOR) {
+       if ((unsigned)xtfs64->__ss64.__cs == COMPAT_MODE_CS_SELECTOR) {
                *new_stateCnt = old_stateCnt;
                *new_xtfs64 = *xtfs64;
-               new_xtfs64->ss64.__rip += rip_skip_count;
-               fprintf(stderr, "new cs=0x%x rip=0x%llx\n", (unsigned)new_xtfs64->ss64.__cs,
-                   new_xtfs64->ss64.__rip);
+               new_xtfs64->__ss64.__rip += rip_skip_count;
+               fprintf(stderr, "new cs=0x%x rip=0x%llx\n", (unsigned)new_xtfs64->__ss64.__cs,
+                   new_xtfs64->__ss64.__rip);
                return KERN_SUCCESS;
        } else {
                return KERN_NOT_SUPPORTED;
@@ -500,7 +498,7 @@ signal_handler(int signo, siginfo_t *sinfop, void *ucontext)
 #ifndef STANDALONE
                T_ASSERT_FAIL("Unexpected signal %d\n", signo);
 #else
-               restore_gsbase(mctx.fp_fullp->__ss.ss64.__rsp);
+               restore_gsbase(mctx.fp_fullp->__ss.__ss64.__rsp);
                fprintf(stderr, "Not handling signal %d\n", signo);
                abort();
 #endif
@@ -521,10 +519,10 @@ signal_handler(int signo, siginfo_t *sinfop, void *ucontext)
                        int cnt = i386_set_ldt((int)idx, &descs[idx], 1);
                        if (cnt != (int)idx) {
 #ifdef DEBUG
-                               fprintf(stderr, "i386_set_ldt unexpectedly returned %d\n", cnt);
+                               fprintf(stderr, "i386_set_ldt unexpectedly returned %d (errno = %s)\n", cnt, strerror(errno));
 #endif
 #ifndef STANDALONE
-                               T_LOG("i386_set_ldt unexpectedly returned %d\n", cnt);
+                               T_LOG("i386_set_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno));
                                T_ASSERT_FAIL("i386_set_ldt failure");
 #else
                                exit(1);
@@ -567,9 +565,9 @@ signal_handler(int signo, siginfo_t *sinfop, void *ucontext)
                 * Since we're handing this signal on the same thread, we may need to
                 * restore GSbase.
                 */
-               uint64_t orig_gsbase = stack_range_to_GSbase(ss64->ss64.__rsp, 0);
+               uint64_t orig_gsbase = stack_range_to_GSbase(ss64->__ss64.__rsp, 0);
                if (orig_gsbase != 0 && orig_gsbase != ss64->__gsbase) {
-                       restore_gsbase(ss64->ss64.__rsp);
+                       restore_gsbase(ss64->__ss64.__rsp);
                }
 
                if (signo == SIGFPE) {
@@ -584,10 +582,10 @@ signal_handler(int signo, siginfo_t *sinfop, void *ucontext)
                 * new state's cs register to just after the div instruction
                 * to enable the thread to resume.
                 */
-               if ((unsigned)ss64->ss64.__cs == COMPAT_MODE_CS_SELECTOR) {
-                       ss64->ss64.__rip += rip_skip_count;
-                       fprintf(stderr, "new cs=0x%x rip=0x%llx\n", (unsigned)ss64->ss64.__cs,
-                           ss64->ss64.__rip);
+               if ((unsigned)ss64->__ss64.__cs == COMPAT_MODE_CS_SELECTOR) {
+                       ss64->__ss64.__rip += rip_skip_count;
+                       fprintf(stderr, "new cs=0x%x rip=0x%llx\n", (unsigned)ss64->__ss64.__cs,
+                           ss64->__ss64.__rip);
                }
        } else {
                _STRUCT_X86_THREAD_STATE64 *ss64 = &mctx.fp_basep->__ss;
@@ -675,28 +673,42 @@ dump_desc(union ldt_entry *entp)
 static int
 map_lowmem_stack(void **lowmemstk)
 {
-       void *addr, *redzone;
+       void *addr;
+       int err;
 
-       if ((redzone = mmap((void *)(FIXED_STACK_ADDR - PAGE_SIZE), PAGE_SIZE, PROT_READ,
-           MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) {
+       if ((addr = mmap(0, FIXED_STACK_SIZE + PAGE_SIZE, PROT_READ | PROT_WRITE,
+           MAP_32BIT | MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) {
                return errno;
        }
 
-       if ((addr = mmap((void *)FIXED_STACK_ADDR, FIXED_STACK_SIZE, PROT_READ | PROT_WRITE,
-           MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) {
-               (void)munmap(redzone, PAGE_SIZE);
-               return errno;
+       if ((uintptr_t)addr > 0xFFFFF000ULL) {
+               /* Error: This kernel does not support MAP_32BIT or there's a bug. */
+#ifndef STANDALONE
+               T_ASSERT_FAIL("%s: failed to map a 32-bit-accessible stack", __func__);
+#else
+               fprintf(stderr, "This kernel returned a virtual address > 4G (%p) despite MAP_32BIT.  Aborting.\n", addr);
+               exit(1);
+#endif
+       }
+
+       /* Enforce one page of redzone at the bottom of the stack */
+       if (mprotect(addr, PAGE_SIZE, PROT_NONE) < 0) {
+               err = errno;
+               (void) munmap(addr, FIXED_STACK_SIZE + PAGE_SIZE);
+               return err;
        }
 
        if (lowmemstk) {
-               *lowmemstk = addr;
+               stack2gs[0].stack_base = (uintptr_t)addr + PAGE_SIZE;
+               stack2gs[0].stack_limit = stack2gs[0].stack_base + FIXED_STACK_SIZE;
+               *lowmemstk = (void *)((uintptr_t)addr + PAGE_SIZE);
        }
 
        return 0;
 }
 
 static int
-map_32bit_code_impl(uint8_t *code_src, size_t code_len, void **codeptr, void *baseaddr,
+map_32bit_code_impl(uint8_t *code_src, size_t code_len, void **codeptr,
     size_t szlimit)
 {
        void *addr;
@@ -707,14 +719,24 @@ map_32bit_code_impl(uint8_t *code_src, size_t code_len, void **codeptr, void *ba
        }
 
 #ifdef DEBUG
-       printf("baseaddr = %p, size = %lu, szlimit = %u\n", baseaddr, sz, (unsigned)szlimit);
+       printf("size = %lu, szlimit = %u\n", sz, (unsigned)szlimit);
 #endif
 
-       if ((addr = mmap(baseaddr, sz, PROT_READ | PROT_WRITE | PROT_EXEC,
-           MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) {
+       if ((addr = mmap(0, sz, PROT_READ | PROT_WRITE | PROT_EXEC,
+           MAP_32BIT | MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) {
                return errno;
        }
 
+       if ((uintptr_t)addr > 0xFFFFF000ULL) {
+               /* Error: This kernel does not support MAP_32BIT or there's a bug. */
+#ifndef STANDALONE
+               T_ASSERT_FAIL("%s: failed to map a 32-bit-accessible trampoline", __func__);
+#else
+               fprintf(stderr, "This kernel returned a virtual address > 4G (%p) despite MAP_32BIT.  Aborting.\n", addr);
+               exit(1);
+#endif
+       }
+
 #ifdef DEBUG
        printf("Mapping code @%p..%p => %p..%p\n", (void *)code_src,
            (void *)((uintptr_t)code_src + (unsigned)code_len),
@@ -724,7 +746,9 @@ map_32bit_code_impl(uint8_t *code_src, size_t code_len, void **codeptr, void *ba
        bcopy(code_src, addr, code_len);
 
        /* Fill the rest of the page with NOPs */
-       memset((void *)((uintptr_t)addr + code_len), 0x90, sz - code_len);
+       if ((sz - code_len) > 0) {
+               memset((void *)((uintptr_t)addr + code_len), 0x90, sz - code_len);
+       }
 
        if (codeptr) {
                *codeptr = addr;
@@ -740,31 +764,7 @@ map_32bit_trampoline(compat_tramp_t *lowmemtrampp)
 
        return map_32bit_code_impl((uint8_t *)&compat_mode_trampoline,
                   (size_t)compat_mode_trampoline_len, (void **)lowmemtrampp,
-                  (void *)FIXED_TRAMP_ADDR, FIXED_TRAMP_MAXLEN);
-}
-
-static int
-enable_ldt64(int *val)
-{
-       int ldt64_enable_value = 1;
-       int ldt64_enable_old = 0;
-       size_t ldt64_value_sz = sizeof(ldt64_enable_value);
-       int err;
-
-       /* Enable the feature for this test (development kernels only) */
-       if ((err = sysctlbyname("machdep.ldt64", 0, 0, &ldt64_enable_value,
-           ldt64_value_sz)) != 0) {
-               if (errno == EPERM) {
-                       if ((err = sysctlbyname("machdep.ldt64", &ldt64_enable_old,
-                           &ldt64_value_sz, 0, 0)) == 0) {
-                               *val = ldt64_enable_old;
-                       }
-               }
-               return errno;
-       }
-
-       *val = ldt64_enable_value;
-       return 0;
+                  FIXED_TRAMP_MAXLEN);
 }
 
 static uint64_t
@@ -922,7 +922,6 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in
        void *addr;
        uintptr_t code_addr;
        uintptr_t thunk64_movabs_addr;
-       int enable_status = 0;
 
        descs = malloc(sizeof(union ldt_entry) * 256);
        if (descs == 0) {
@@ -934,29 +933,15 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in
 #endif
        }
 
-       if ((err = enable_ldt64(&enable_status)) != 0 && enable_status == 0) {
-#ifndef STANDALONE
-               T_LOG("Warning: Couldn't set ldt64=1 via sysctl: %s\n",
-                   strerror(err));
-               T_ASSERT_FAIL("Couldn't enable ldt64 feature.\n");
-#else
-               fprintf(stderr, "Warning: Couldn't set ldt64=1 via sysctl: %s\n",
-                   strerror(err));
-               exit(1);
-#endif
-       }
-
 #ifdef DEBUG
        printf("32-bit code is at %p\n", (void *)&code_32);
 #endif
 
        if ((err = map_lowmem_stack(&addr)) != 0) {
-#ifdef DEBUG
-               fprintf(stderr, "Failed to mmap lowmem stack: %s\n", strerror(err));
-#endif
 #ifndef STANDALONE
-               T_ASSERT_FAIL("failed to mmap lowmem stack");
+               T_ASSERT_FAIL("failed to mmap lowmem stack: %s", strerror(err));
 #else
+               fprintf(stderr, "Failed to mmap lowmem stack: %s\n", strerror(err));
                exit(1);
 #endif
        }
@@ -966,28 +951,12 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in
        printf("lowstack addr = %p\n", (void *)stackAddr);
 #endif
 
-       if ((err = create_worker_thread(cmargp, (uint32_t)stackAddr, cmthreadp)) != 0) {
-#ifdef DEBUG
-               fprintf(stderr, "Fatal: Could not create thread: %s\n", strerror(err));
-#endif
-#ifndef STANDALONE
-               T_LOG("Fatal: Could not create thread: %s\n", strerror(err));
-               T_ASSERT_FAIL("Thread creation failure");
-#else
-               exit(1);
-#endif
-       }
-
-
        if ((err = map_32bit_trampoline(&thunkit)) != 0) {
-#ifdef DEBUG
-               fprintf(stderr, "Failed to map trampoline into lowmem: %s\n", strerror(err));
-#endif
-               join_32bit_thread(cmthreadp, cmargp);
 #ifndef STANDALONE
                T_LOG("Failed to map trampoline into lowmem: %s\n", strerror(err));
                T_ASSERT_FAIL("Failed to map trampoline into lowmem");
 #else
+               fprintf(stderr, "Failed to map trampoline into lowmem: %s\n", strerror(err));
                exit(1);
 #endif
        }
@@ -1002,12 +971,11 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in
        bzero(descs, sizeof(union ldt_entry) * 256);
 
        if ((cnt = i386_get_ldt(0, descs, 1)) <= 0) {
-               fprintf(stderr, "i386_get_ldt unexpectedly returned %d\n", cnt);
-               join_32bit_thread(cmthreadp, cmargp);
 #ifndef STANDALONE
-               T_LOG("i386_get_ldt unexpectedly returned %d\n", cnt);
+               T_LOG("i386_get_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno));
                T_ASSERT_FAIL("i386_get_ldt failure");
 #else
+               fprintf(stderr, "i386_get_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno));
                exit(1);
 #endif
        }
@@ -1041,14 +1009,11 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in
                /* Set the LDT: */
                cnt = i386_set_ldt((int)idx, &descs[idx], 1);
                if (cnt != (int)idx) {
-#ifdef DEBUG
-                       fprintf(stderr, "i386_set_ldt unexpectedly returned %d\n", cnt);
-#endif
-                       join_32bit_thread(cmthreadp, cmargp);
 #ifndef STANDALONE
-                       T_LOG("i386_set_ldt unexpectedly returned %d\n", cnt);
+                       T_LOG("i386_set_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno));
                        T_ASSERT_FAIL("i386_set_ldt failure");
 #else
+                       fprintf(stderr, "i386_set_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno));
                        exit(1);
 #endif
                }
@@ -1068,19 +1033,28 @@ ldt64_test_setup(pthread_t *cmthreadp, thread_arg_t *cmargp, boolean_t setldt_in
                }
 #endif
        } else {
-#ifdef DEBUG
-               fprintf(stderr, "i386_get_ldt unexpectedly returned %d\n", cnt);
-#endif
-               join_32bit_thread(cmthreadp, cmargp);
 #ifndef STANDALONE
-               T_LOG("i386_get_ldt unexpectedly returned %d\n", cnt);
+               T_LOG("i386_get_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno));
                T_ASSERT_FAIL("i386_get_ldt failure");
 #else
+               fprintf(stderr, "i386_get_ldt unexpectedly returned %d (errno: %s)\n", cnt, strerror(errno));
                exit(1);
 #endif
        }
 
        free(descs);
+
+       if ((err = create_worker_thread(cmargp, (uint32_t)stackAddr, cmthreadp)) != 0) {
+#ifdef DEBUG
+               fprintf(stderr, "Fatal: Could not create thread: %s\n", strerror(err));
+#endif
+#ifndef STANDALONE
+               T_LOG("Fatal: Could not create thread: %s\n", strerror(err));
+               T_ASSERT_FAIL("Thread creation failure");
+#else
+               exit(1);
+#endif
+       }
 }
 
 #ifdef STANDALONE
diff --git a/tests/ldt_entitlement.plist b/tests/ldt_entitlement.plist
new file mode 100644 (file)
index 0000000..19058c6
--- /dev/null
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.security.ldt-in-64bit-process</key>
+       <true/>
+       <key>com.apple.security.mmap-map-32bit</key>
+       <true/>
+</dict>
+</plist>
index 7a9a472771971bf71c39b79185c8939077699122..23e199e37f908365d6c44ddd3664f2c17d49898c 100644 (file)
@@ -7,6 +7,8 @@
 
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 T_DECL(mach_boottime_usec, "mach_boottime_usec()",
     T_META_ALL_VALID_ARCHS(true), T_META_LTEPHASE(LTE_POSTINIT))
 {
index 0d49b7bdfd9f77e874e21efed6822a0743050a7f..fe782e0f9186fc94fd7332084e83e0ec4a7821c7 100644 (file)
@@ -36,7 +36,7 @@ update(uint64_t *a, uint64_t *c)
 }
 
 T_DECL(mct_monotonic, "Testing mach_continuous_time returns sane, monotonic values",
-    T_META_ALL_VALID_ARCHS(true))
+    T_META_ALL_VALID_ARCHS(true), T_META_RUN_CONCURRENTLY(true))
 {
        mach_timebase_info(&tb_info);
 #ifdef HAS_KERNEL_TIME_TRAPS
@@ -69,7 +69,7 @@ T_DECL(mct_monotonic, "Testing mach_continuous_time returns sane, monotonic valu
 }
 
 T_DECL(mat_monotonic, "Testing mach_absolute_time returns sane, monotonic values",
-    T_META_ALL_VALID_ARCHS(true))
+    T_META_ALL_VALID_ARCHS(true), T_META_RUN_CONCURRENTLY(true))
 {
        mach_timebase_info(&tb_info);
 #ifdef HAS_KERNEL_TIME_TRAPS
@@ -100,7 +100,8 @@ T_DECL(mat_monotonic, "Testing mach_absolute_time returns sane, monotonic values
        }
 }
 
-T_DECL(mct_pause, "Testing mach_continuous_time and mach_absolute_time don't diverge")
+T_DECL(mct_pause, "Testing mach_continuous_time and mach_absolute_time don't diverge",
+    T_META_RUN_CONCURRENTLY(true))
 {
        mach_timebase_info(&tb_info);
 
@@ -136,7 +137,8 @@ update_kern(uint64_t *abs, uint64_t *cont)
 #endif
 
 #ifdef HAS_KERNEL_TIME_TRAPS
-T_DECL(mct_pause_kern, "Testing kernel mach_continuous_time and mach_absolute_time don't diverge")
+T_DECL(mct_pause_kern, "Testing kernel mach_continuous_time and mach_absolute_time don't diverge",
+    T_META_RUN_CONCURRENTLY(true))
 {
        mach_timebase_info(&tb_info);
 
@@ -281,7 +283,7 @@ T_DECL(mct_settimeofday_kern, "Testing kernel mach_continuous_time behavior over
 #endif
 
 T_DECL(mct_aproximate, "Testing mach_continuous_approximate_time()",
-    T_META_ALL_VALID_ARCHS(true))
+    T_META_ALL_VALID_ARCHS(true), T_META_RUN_CONCURRENTLY(true))
 {
        mach_timebase_info(&tb_info);
 
diff --git a/tests/mach_exception_reply.c b/tests/mach_exception_reply.c
new file mode 100644 (file)
index 0000000..d34bc25
--- /dev/null
@@ -0,0 +1,457 @@
+#define T_NAMESPACE "xnu.ipc"
+#include <darwintest.h>
+
+#include <pthread.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <mach/mach.h>
+#include <pthread/qos_private.h>
+#include <voucher/ipc_pthread_priority_types.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+#define MSG      1024
+#define PG_ALLOC 4096
+
+typedef enum {
+       ReplyWithNoError,
+       ReplyWithReplyPort,
+       ReplyWithReplyPortMove,
+       ReplyWithReplyPortCplxBit,
+       ReplyWithReplyPortMoveCplxBit,
+       ReplyWithPortDesc,
+       ReplyWithOOLDesc,
+       ReplyWithVoucher,
+       ReplyWithVoucherGarbage
+} ReplyType;
+
+struct exc_thread_arg {
+       ReplyType    rt;
+       mach_port_t  port;
+};
+
+static const char *
+reply_type_str(ReplyType rt)
+{
+       switch (rt) {
+       case ReplyWithNoError:
+               return "ReplyWithNoError";
+       case ReplyWithReplyPort:
+               return "ReplyWithReplyPort";
+       case ReplyWithReplyPortMove:
+               return "ReplyWithReplyPortMove";
+       case ReplyWithReplyPortCplxBit:
+               return "ReplyWithReplyPortCplxBit";
+       case ReplyWithReplyPortMoveCplxBit:
+               return "ReplyWithReplyPortMoveCplxBit";
+       case ReplyWithPortDesc:
+               return "ReplyWithPortDesc";
+       case ReplyWithOOLDesc:
+               return "ReplyWithOOLDesc";
+       case ReplyWithVoucher:
+               return "ReplyWithVoucher";
+       case ReplyWithVoucherGarbage:
+               return "ReplyWithVoucherGarbage";
+       }
+}
+
+static mach_voucher_t
+create_pthpriority_voucher(void)
+{
+       char voucher_buf[sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t)];
+
+       mach_voucher_t voucher = MACH_PORT_NULL;
+       kern_return_t kr;
+       ipc_pthread_priority_value_t ipc_pthread_priority_value =
+           (ipc_pthread_priority_value_t)_pthread_qos_class_encode(QOS_CLASS_USER_INTERACTIVE, 0, 0);
+
+       mach_voucher_attr_raw_recipe_size_t recipe_size = 0;
+       mach_voucher_attr_recipe_t recipe =
+           (mach_voucher_attr_recipe_t)&voucher_buf[0];
+
+       recipe->key = MACH_VOUCHER_ATTR_KEY_PTHPRIORITY;
+       recipe->command = MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE;
+       recipe->previous_voucher = MACH_VOUCHER_NULL;
+
+       memcpy((char *)&recipe->content[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value));
+       recipe->content_size = sizeof(ipc_pthread_priority_value_t);
+       recipe_size += sizeof(mach_voucher_attr_recipe_data_t) + recipe->content_size;
+
+       kr = host_create_mach_voucher(mach_host_self(),
+           (mach_voucher_attr_raw_recipe_array_t)&voucher_buf[0],
+           recipe_size,
+           &voucher);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_create_mach_voucher");
+       return voucher;
+}
+
+static void *
+handle_exceptions(void *arg)
+{
+       struct exc_thread_arg *ta = (struct exc_thread_arg *)arg;
+       mach_port_t ePort = ta->port;
+       ReplyType reply_type = ta->rt;
+
+       char msg_store[MSG + MAX_TRAILER_SIZE];
+       char reply_store[MSG];
+       mach_msg_header_t *msg = (mach_msg_header_t *)msg_store;
+       vm_address_t page;
+       kern_return_t kr;
+
+       kr = vm_allocate(mach_task_self(), &page, PG_ALLOC, VM_FLAGS_ANYWHERE);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "ool page allocation of %d bytes", PG_ALLOC);
+
+       mach_voucher_t voucher = create_pthpriority_voucher();
+
+       while (1) {
+               bzero(msg, sizeof(msg_store));
+
+               msg->msgh_local_port = ePort;
+               msg->msgh_size = MSG;
+               kr = mach_msg_receive(msg);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "exception msg recv");
+
+               bzero(reply_store, sizeof(reply_store));
+
+               switch (reply_type) {
+               case ReplyWithNoError: {
+#pragma pack(4)
+                       typedef struct {
+                               mach_msg_header_t hdr;
+                               NDR_record_t ndr;
+                               kern_return_t kr;
+                       } reply_fmt_t;
+#pragma pack()
+                       reply_fmt_t *reply = (reply_fmt_t *)reply_store;
+
+                       reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, 0, 0, 0);
+                       reply->hdr.msgh_remote_port = msg->msgh_remote_port;
+                       reply->hdr.msgh_local_port = MACH_PORT_NULL;
+                       reply->hdr.msgh_size = sizeof(*reply);
+                       reply->hdr.msgh_id = msg->msgh_id + 100;
+                       break;
+               }
+
+               case ReplyWithReplyPort: {
+#pragma pack(4)
+                       typedef struct {
+                               mach_msg_header_t hdr;
+                               NDR_record_t ndr;
+                               kern_return_t kr;
+                       } reply_fmt_t;
+#pragma pack()
+                       reply_fmt_t *reply = (reply_fmt_t *)reply_store;
+
+                       reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, MACH_MSG_TYPE_COPY_SEND, 0, 0);
+                       reply->hdr.msgh_remote_port = msg->msgh_remote_port;
+                       reply->hdr.msgh_local_port = ePort; /* Bogus */
+                       reply->hdr.msgh_size = sizeof(*reply);
+                       reply->hdr.msgh_id = msg->msgh_id + 100;
+                       break;
+               }
+
+               case ReplyWithReplyPortMove: {
+#pragma pack(4)
+                       typedef struct {
+                               mach_msg_header_t hdr;
+                               NDR_record_t ndr;
+                               kern_return_t kr;
+                       } reply_fmt_t;
+#pragma pack()
+                       reply_fmt_t *reply = (reply_fmt_t *)reply_store;
+
+                       reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, MACH_MSG_TYPE_MOVE_SEND, 0, 0);
+                       reply->hdr.msgh_remote_port = msg->msgh_remote_port;
+                       reply->hdr.msgh_local_port = ePort; /* Bogus */
+                       reply->hdr.msgh_size = sizeof(*reply);
+                       reply->hdr.msgh_id = msg->msgh_id + 100;
+                       break;
+               }
+
+               case ReplyWithReplyPortCplxBit: {
+#pragma pack(4)
+                       typedef struct {
+                               mach_msg_header_t hdr;
+                               mach_msg_body_t body;
+                       } reply_fmt_t;
+#pragma pack()
+                       reply_fmt_t *reply = (reply_fmt_t *)reply_store;
+
+                       reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, MACH_MSG_TYPE_COPY_SEND, 0, MACH_MSGH_BITS_COMPLEX);
+                       reply->hdr.msgh_remote_port = msg->msgh_remote_port;
+                       reply->hdr.msgh_local_port = ePort; /* Bogus */
+                       reply->hdr.msgh_size = sizeof(*reply);
+                       reply->hdr.msgh_id = msg->msgh_id + 100;
+                       reply->body.msgh_descriptor_count = 0;
+                       break;
+               }
+
+               case ReplyWithReplyPortMoveCplxBit: {
+#pragma pack(4)
+                       typedef struct {
+                               mach_msg_header_t hdr;
+                               mach_msg_body_t body;
+                       } reply_fmt_t;
+#pragma pack()
+                       reply_fmt_t *reply = (reply_fmt_t *)reply_store;
+
+                       reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, MACH_MSG_TYPE_MOVE_SEND, 0, MACH_MSGH_BITS_COMPLEX);
+                       reply->hdr.msgh_remote_port = msg->msgh_remote_port;
+                       reply->hdr.msgh_local_port = ePort; /* Bogus */
+                       reply->hdr.msgh_size = sizeof(*reply);
+                       reply->hdr.msgh_id = msg->msgh_id + 100;
+                       reply->body.msgh_descriptor_count = 0;
+                       break;
+               }
+
+               case ReplyWithPortDesc: {
+#pragma pack(4)
+                       typedef struct {
+                               mach_msg_header_t hdr;
+                               mach_msg_body_t body;
+                               mach_msg_port_descriptor_t port;
+                       } reply_fmt_t;
+#pragma pack()
+                       reply_fmt_t *reply = (reply_fmt_t *)reply_store;
+
+                       reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, 0, 0, MACH_MSGH_BITS_COMPLEX);
+                       reply->hdr.msgh_remote_port = msg->msgh_remote_port;
+                       reply->hdr.msgh_local_port = MACH_PORT_NULL;
+                       reply->hdr.msgh_size = sizeof(*reply);
+                       reply->hdr.msgh_id = msg->msgh_id + 100;
+                       reply->body.msgh_descriptor_count = 1;
+                       reply->port.type = MACH_MSG_PORT_DESCRIPTOR;
+                       reply->port.name = ePort;
+                       reply->port.disposition = MACH_MSG_TYPE_COPY_SEND;
+                       break;
+               }
+
+               case ReplyWithOOLDesc: {
+#pragma pack(4)
+                       typedef struct {
+                               mach_msg_header_t hdr;
+                               mach_msg_body_t body;
+                               mach_msg_ool_descriptor_t ool;
+                       } reply_fmt_t;
+#pragma pack()
+                       reply_fmt_t *reply = (reply_fmt_t *)reply_store;
+
+                       reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, 0, 0, MACH_MSGH_BITS_COMPLEX);
+                       reply->hdr.msgh_remote_port = msg->msgh_remote_port;
+                       reply->hdr.msgh_local_port = MACH_PORT_NULL;
+                       reply->hdr.msgh_size = sizeof(*reply);
+                       reply->hdr.msgh_id = msg->msgh_id + 100;
+                       reply->body.msgh_descriptor_count = 1;
+                       reply->ool.type = MACH_MSG_OOL_DESCRIPTOR;
+                       reply->ool.address = (void *)page;
+                       reply->ool.size = PG_ALLOC;
+                       reply->ool.deallocate = 0;
+                       reply->ool.copy = MACH_MSG_VIRTUAL_COPY;
+                       break;
+               }
+
+               case ReplyWithVoucher: {
+#pragma pack(4)
+                       typedef struct {
+                               mach_msg_header_t hdr;
+                               NDR_record_t ndr;
+                               kern_return_t kr;
+                       } reply_fmt_t;
+#pragma pack()
+                       reply_fmt_t *reply = (reply_fmt_t *)reply_store;
+
+                       reply->hdr.msgh_remote_port = msg->msgh_remote_port;
+                       reply->hdr.msgh_local_port = MACH_PORT_NULL;
+                       reply->hdr.msgh_size = sizeof(*reply);
+                       reply->hdr.msgh_id = msg->msgh_id + 100;
+                       reply->kr = KERN_SUCCESS;
+
+                       /* try to send a voucher */
+                       reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE,
+                           0,
+                           MACH_MSG_TYPE_MOVE_SEND,
+                           0);
+                       reply->hdr.msgh_voucher_port = voucher;
+                       voucher = MACH_VOUCHER_NULL;
+                       break;
+               }
+
+               case ReplyWithVoucherGarbage: {
+#pragma pack(4)
+                       typedef struct {
+                               mach_msg_header_t hdr;
+                               NDR_record_t ndr;
+                               kern_return_t kr;
+                       } reply_fmt_t;
+#pragma pack()
+                       reply_fmt_t *reply = (reply_fmt_t *)reply_store;
+
+                       reply->hdr.msgh_remote_port = msg->msgh_remote_port;
+                       reply->hdr.msgh_local_port = MACH_PORT_NULL;
+                       reply->hdr.msgh_size = sizeof(*reply);
+                       reply->hdr.msgh_id = msg->msgh_id + 100;
+                       reply->kr = KERN_SUCCESS;
+
+                       /* don't claim to send a voucher */
+                       reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE,
+                           0, 0, 0);
+                       /* but put some bits in the field */
+                       reply->hdr.msgh_voucher_port = (mach_voucher_t)0xdead;
+                       break;
+               }
+
+               default:
+                       T_ASSERT_FAIL("Invalid ReplyType: %d", reply_type);
+                       T_END;
+               }
+
+               if (voucher) {
+                       kr = mach_port_mod_refs(mach_task_self(), voucher,
+                           MACH_PORT_RIGHT_SEND, -1);
+                       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "destroy voucher");
+               }
+
+               T_LOG("sending exception reply of type (%s)", reply_type_str(reply_type));
+               kr = mach_msg_send((mach_msg_header_t *)reply_store);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "exception reply msg send");
+
+               T_PASS("Successfully delivered exception reply message of type %s", reply_type_str(reply_type));
+               T_END;
+               return NULL;
+       }
+}
+
+static sigjmp_buf jb;
+static int *bad_pointer = NULL;
+static int s_sigmask = 0;
+
+static void
+signal_handler(int sig, siginfo_t *sip __unused, void *ucontext __unused)
+{
+       if (sigmask(sig) & s_sigmask) { /* TODO: check that the fault was generated by us */
+               siglongjmp(jb, sig);
+       } else {
+               siglongjmp(jb, -sig);
+       }
+}
+
+static int
+handle_signals(void)
+{
+       int mask = 0;
+
+       struct sigaction sa = {
+               .sa_sigaction = signal_handler,
+               .sa_flags = SA_SIGINFO
+       };
+       sigfillset(&sa.sa_mask);
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(sigaction(SIGTRAP, &sa, NULL), NULL);
+       mask |= sigmask(SIGTRAP);
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(sigaction(SIGSEGV, &sa, NULL), NULL);
+       mask |= sigmask(SIGSEGV);
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(sigaction(SIGILL, &sa, NULL), NULL);
+       mask |= sigmask(SIGILL);
+
+       return mask;
+}
+
+static void
+test_exc_reply_type(ReplyType reply_type)
+{
+       kern_return_t kr;
+       task_t me = mach_task_self();
+       thread_t self = mach_thread_self();
+       pthread_t handler_thread;
+       pthread_attr_t  attr;
+       mach_port_t ePort;
+
+       s_sigmask = handle_signals();
+       T_LOG("task self = 0x%x, thread self = 0x%x\n", me, self);
+
+       kr = mach_port_allocate(me, MACH_PORT_RIGHT_RECEIVE, &ePort);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "allocate receive right");
+
+       kr = mach_port_insert_right(me, ePort, ePort, MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "insert right into port=[%d]", ePort);
+
+       kr = thread_set_exception_ports(self, EXC_MASK_ALL, ePort, EXCEPTION_DEFAULT, THREAD_STATE_NONE);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "set exception ports on self=[%d], handler=[%d]", self, ePort);
+
+       pthread_attr_init(&attr);
+       pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+       struct exc_thread_arg *ta = (struct exc_thread_arg *)malloc(sizeof(*ta));
+       T_QUIET; T_ASSERT_NOTNULL(ta, "exception handler thread args allocation");
+       ta->port = ePort;
+       ta->rt = reply_type;
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&handler_thread, &attr, handle_exceptions, (void *)ta),
+           "pthread creation");
+
+       pthread_attr_destroy(&attr);
+
+       /* cause exception! */
+       int x = sigsetjmp(jb, 0); /* savemask 0 (s_sigmask commented out) */
+       if (x == 0) {
+               *bad_pointer = 0;
+       } else if (x < 0) {
+               T_FAIL("Unexpected state on return-from-exception");
+               T_END;
+       } else {
+               T_PASS("Successfully recovered from exception");
+               T_END;
+       }
+       T_FAIL("Unexpected end of test!");
+       T_END;
+}
+
+T_DECL(mach_exc_ReplyNoError, "exception server reply with no error",
+    T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*"))
+{
+       test_exc_reply_type(ReplyWithNoError);
+}
+T_DECL(mach_exc_ReplyWithReplyPort, "exception server reply with reply port",
+    T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*"))
+{
+       test_exc_reply_type(ReplyWithReplyPort);
+}
+T_DECL(mach_exc_ReplyWithReplyPortMove, "exception server reply with reply port as MOVE_SEND",
+    T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*"))
+{
+       test_exc_reply_type(ReplyWithReplyPortMove);
+}
+T_DECL(mach_exc_ReplyWithReplyPortCplxBit, "exception server reply with reply port and complex bit set",
+    T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*"))
+{
+       test_exc_reply_type(ReplyWithReplyPortCplxBit);
+}
+T_DECL(mach_exc_ReplyWithReplyPortMoveCplxBit, "exception server reply with reply port as MOVE_SEND and complex bit set",
+    T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*"))
+{
+       test_exc_reply_type(ReplyWithReplyPortMoveCplxBit);
+}
+T_DECL(mach_exc_ReplyWithOOLPort, "exception server reply with OOL port descriptor",
+    T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*"))
+{
+       test_exc_reply_type(ReplyWithPortDesc);
+}
+T_DECL(mach_exc_ReplyWithOOLDesc, "exception server reply with OOL memory descriptor",
+    T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*"))
+{
+       test_exc_reply_type(ReplyWithOOLDesc);
+}
+T_DECL(mach_exc_ReplyWithVoucher, "exception server reply with a voucher",
+    T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*"))
+{
+       test_exc_reply_type(ReplyWithVoucher);
+}
+T_DECL(mach_exc_ReplyWithVoucherGarbage, "exception server reply with bits in msgh_voucher_port",
+    T_META_CHECK_LEAKS(false), T_META_IGNORECRASHES(".*mach_exception_reply.*"))
+{
+       test_exc_reply_type(ReplyWithVoucherGarbage);
+}
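Each T_DECL above faults on a NULL store, lets the handler thread answer with one of the deliberately malformed replies, and relies on the sigsetjmp/siglongjmp pair to prove that the faulting thread resumed (or that the kernel cleanly rejected the reply). For contrast, a sketch of a fully well-formed EXCEPTION_DEFAULT reply, mirroring the ReplyWithNoError case; fill_default_exc_reply is a hypothetical helper, not part of the commit:

    #include <mach/mach.h>

    #pragma pack(4)
    typedef struct {
            mach_msg_header_t hdr;
            NDR_record_t      ndr;
            kern_return_t     retcode;
    } exc_reply_t;
    #pragma pack()

    static void
    fill_default_exc_reply(const mach_msg_header_t *request, exc_reply_t *reply)
    {
            /* Move the kernel's send-once reply right back; no local or voucher port. */
            reply->hdr.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND_ONCE, 0, 0, 0);
            reply->hdr.msgh_remote_port = request->msgh_remote_port;
            reply->hdr.msgh_local_port = MACH_PORT_NULL;
            reply->hdr.msgh_size = sizeof(*reply);
            reply->hdr.msgh_id = request->msgh_id + 100; /* MIG reply-id convention */
            reply->ndr = NDR_record;                     /* default NDR transfer record */
            reply->retcode = KERN_SUCCESS;               /* resume the faulting thread */
    }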
index d6fe33a7bec4e93af4eae5c36b3efa19f970e52c..057126566c639ca31c8fe4a836787671922d7e5b 100644
@@ -6,6 +6,8 @@
 #include <darwintest.h>
 #include <darwintest_utils.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define T_LOG_VERBOSE(...)
 
 #define timespec2nanosec(ts) ((uint64_t)((ts)->tv_sec) * NSEC_PER_SEC + (uint64_t)((ts)->tv_nsec))
index 0b15106002f47d5dfa8cad8eaf4bc6131d95cfaf..f072b1041786b161964fdc929a6aba2c7d92fa0e 100644
@@ -4,6 +4,8 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define NR_PORTS 4
 
 T_DECL(mach_port_deallocate, "mach_port_deallocate deallocates also PORT_SET"){
index dec8548c143d1699f31cb9d4886f87de1479214a..b0c3d76e031e5a209d3c0e94f819f1e575b40101 100644
@@ -3,6 +3,18 @@
 #include <mach/message.h>
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+static inline mach_port_type_t
+get_port_type(mach_port_t mp)
+{
+       mach_port_type_t type;
+       T_QUIET;
+       T_ASSERT_MACH_SUCCESS(mach_port_type(mach_task_self(), mp, &type),
+           "mach_port_type(mP)");
+       return type;
+}
+
 T_DECL(mach_port_insert_right, "insert send right for an existing right", T_META_CHECK_LEAKS(false))
 {
        mach_port_t port = MACH_PORT_NULL;
@@ -14,6 +26,14 @@ T_DECL(mach_port_insert_right, "insert send right for an existing right", T_META
        retval = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &port);
        T_ASSERT_MACH_SUCCESS(retval, "allocate a port=[%d]", port);
 
+       T_ASSERT_EQ(get_port_type(port), MACH_PORT_TYPE_RECEIVE,
+           "0x%x should be a receive right", port);
+
+       retval = mach_port_insert_right(task, port, port, MACH_MSG_TYPE_MAKE_SEND);
+       T_ASSERT_MACH_SUCCESS(retval, "insert a send right for port=[%d] with name=[%d]", port, port);
+       T_ASSERT_EQ(get_port_type(port), MACH_PORT_TYPE_RECEIVE | MACH_PORT_TYPE_SEND,
+           "0x%x should be a send-receive right", port);
+
        mach_port_name_t name = 123;
 
        retval = mach_port_insert_right(task, name, port, MACH_MSG_TYPE_MAKE_SEND);
@@ -26,6 +46,9 @@ T_DECL(mach_port_insert_right, "insert send right for an existing right", T_META
        retval = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &port2);
        T_ASSERT_MACH_SUCCESS(retval, "allocate a port=[%d]", port2);
 
+       T_ASSERT_EQ(get_port_type(port2), MACH_PORT_TYPE_RECEIVE,
+           "0x%x should be a receive right", port2);
+
        name = port;
        retval = mach_port_insert_right(task, name, port2, MACH_MSG_TYPE_MAKE_SEND);
        T_ASSERT_MACH_ERROR(retval, KERN_RIGHT_EXISTS, "insert a send right for port=[%d] with name=[%d]", port2, name);
index acb7d119aec54da7f90dd373d008409b5e714df4..184d2f62a4522c151e771a51309cee1ab2611039 100644
@@ -7,6 +7,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
 
 T_DECL(mach_port_mod_refs, "mach_port_mod_refs"){
        mach_port_t port_set;
index d58dd85c4824a646c8d6602ac029b0ec3ca5814f..43fb263a02e060dab1cfa5a465838e08642cf64e 100644
@@ -2,6 +2,8 @@
 
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 extern kern_return_t mach_timebase_info_trap(mach_timebase_info_t info);
 
 T_DECL(mach_timebase_info, "mach_timebase_info(_trap)",
diff --git a/tests/memorystatus_assertion_helpers.c b/tests/memorystatus_assertion_helpers.c
new file mode 100644
index 0000000..daa7731
--- /dev/null
+++ b/tests/memorystatus_assertion_helpers.c
@@ -0,0 +1,239 @@
+#include <sys/sysctl.h>
+#include <sys/kern_memorystatus.h>
+
+#include <darwintest.h>
+
+#include "memorystatus_assertion_helpers.h"
+
+static void log_state(uint32_t state);
+
+int
+set_priority(pid_t pid, int32_t priority, uint64_t user_data, boolean_t is_assertion_driven)
+{
+       int err;
+       uint32_t flag = 0;
+       memorystatus_priority_properties_t mjp = { 0 };
+
+       if (is_assertion_driven) {
+               /*
+                * Control over an assertion driven priority will be
+                * relinquished when priority == JETSAM_PRIORITY_IDLE
+                */
+               if (priority == JETSAM_PRIORITY_IDLE) {
+                       T_LOG("Relinquish ...assertion... priority(%d) for pid[%d]", priority, pid);
+               } else {
+                       T_LOG("Setting ...assertion... priority(%d) for pid[%d]", priority, pid);
+               }
+               flag |= MEMORYSTATUS_SET_PRIORITY_ASSERTION;
+       } else {
+               T_LOG("Setting ...requested... priority(%d) for pid[%d]", priority, pid);
+               flag = 0;
+       }
+
+       mjp.priority = priority;
+       mjp.user_data = user_data;
+
+       err = memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, pid, flag, &mjp, sizeof(mjp));
+
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(err, "MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES failed");
+       return err;
+}
+
+boolean_t
+check_properties(pid_t pid, int32_t expected_priority, int32_t expected_limit_mb, uint64_t expected_user_data, boolean_t expected_assertion_state, const char *test)
+{
+       const char *PROP_CHECK_ERROR_STRING = "property mismatch";
+       boolean_t verbose = true;
+       boolean_t ret;
+
+       int32_t  actual_priority = 0;
+       int32_t  actual_limit_mb = 0;
+       uint64_t actual_user_data = 0;
+       uint32_t actual_state = 0;
+
+       verbose = false;
+       (void)get_priority_props(pid, verbose, &actual_priority, &actual_limit_mb, &actual_user_data, &actual_state);
+
+       if (test != NULL) {
+               T_LOG("check_properties: %s", test);
+       }
+
+       ret = verify_assertion_state(actual_state, expected_assertion_state);
+       T_QUIET;
+       T_ASSERT_TRUE(ret, "verify_assertion_state failed");
+
+
+       /*
+        * These tests use well-defined limits, so we don't try to handle defaults like
+        * a limit of <= 0, which typically applies a system-wide per-process limit.
+        */
+
+       if ((actual_priority != expected_priority) || (actual_limit_mb != expected_limit_mb) || (actual_user_data != expected_user_data)) {
+               /* we have a mismatch */
+               T_LOG("%s test failed: %s\n", test, PROP_CHECK_ERROR_STRING);
+
+               if (actual_priority != expected_priority) {
+                       T_LOG("priority mismatch [actual / expected] [%d / %d]", actual_priority, expected_priority);
+               }
+
+               if (actual_limit_mb != expected_limit_mb) {
+                       T_LOG("limit mismatch [actual / expected] [%d / %d]", actual_limit_mb, expected_limit_mb);
+               }
+
+               if (actual_user_data != expected_user_data) {
+                       T_LOG("user data mismatch [actual / expected] [0x%llx / 0x%llx]", actual_user_data, expected_user_data);
+               }
+
+               T_LOG("state is 0x%x\n", actual_state);
+               log_state(actual_state);
+
+               T_ASSERT_FAIL("check_properties: %s", test);
+       } else {
+               T_PASS("check_properties: %s ok", test);
+               return true;
+       }
+       return false;
+}
+
+int
+set_assertion_priority(pid_t pid, int32_t priority, uint64_t user_data)
+{
+       return set_priority(pid, priority, user_data, TRUE);
+}
+
+int
+relinquish_assertion_priority(pid_t pid, uint64_t user_data)
+{
+       return set_assertion_priority(pid, JETSAM_PRIORITY_IDLE, user_data);
+}
+
+int
+set_memlimits(
+       pid_t pid,
+       int32_t active_limit_mb, int32_t inactive_limit_mb,
+       boolean_t active_is_fatal, boolean_t inactive_is_fatal)
+{
+       int err;
+       memorystatus_memlimit_properties_t mmprops;
+
+       memset(&mmprops, 0, sizeof(memorystatus_memlimit_properties_t));
+
+       mmprops.memlimit_active = active_limit_mb;
+       mmprops.memlimit_inactive = inactive_limit_mb;
+
+       if (active_is_fatal) {
+               mmprops.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
+       } else {
+               mmprops.memlimit_active_attr &= ~(uint32_t)MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
+       }
+
+       if (inactive_is_fatal) {
+               mmprops.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
+       } else {
+               mmprops.memlimit_inactive_attr &= ~(uint32_t)MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
+       }
+
+       T_LOG("Setting pid[%d] limits active [%d %s] inactive [%d %s]", pid,
+           mmprops.memlimit_active, (active_is_fatal ? "hard" : "soft"),
+           mmprops.memlimit_inactive, (inactive_is_fatal ? "hard" : "soft"));
+
+       err =  memorystatus_control(MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES, pid, 0, &mmprops, sizeof(mmprops));
+
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(err, "MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES failed");
+       return err;
+}
+
+boolean_t
+get_priority_props(pid_t pid, boolean_t verbose, int32_t *priority, int32_t *limit_mb, uint64_t *user_data, uint32_t *state)
+{
+       memorystatus_priority_entry_t entry = {0};
+
+       int size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, pid, 0, &entry, sizeof(entry));
+
+       /* validate size returned */
+       if (size <= 0) {
+               T_ASSERT_FAIL("get_priority: can't get list size: %d!\n", size);
+       }
+
+       if (size != sizeof(entry)) {
+               T_ASSERT_FAIL("get_priority: returned unexpected entry size\n");
+       }
+
+       if (entry.pid != pid) {
+               T_ASSERT_FAIL("get_priority: returned unexpected entry pid\n");
+       }
+
+       T_LOG("get_priority_props: pid[%d] limit %d, user_data 0x%llx, priority %d, state 0x%x",
+           entry.pid, entry.limit, entry.user_data, entry.priority, entry.state);
+
+
+       if (verbose) {
+               log_state(entry.state);
+       }
+
+       if (priority) {
+               *priority = entry.priority;
+       }
+       if (limit_mb) {
+               *limit_mb = entry.limit;
+       }
+       if (user_data) {
+               *user_data = entry.user_data;
+       }
+       if (state) {
+               *state = entry.state;
+       }
+
+       return true;
+}
+
+boolean_t
+verify_assertion_state(uint32_t state, boolean_t expected_assertion_state)
+{
+       boolean_t actual_assertion_state;
+       char *actual_string;
+       char *expected_string;
+
+       if (expected_assertion_state == ASSERTION_STATE_IS_SET) {
+               expected_string = "ASSERTION_STATE_IS_SET";
+       } else {
+               expected_string = "ASSERTION_STATE_IS_RELINQUISHED";
+       }
+
+       if (state & kMemorystatusAssertion) {
+               /*
+                * An assertion driven jetsam priority is at play.
+                */
+               actual_assertion_state = ASSERTION_STATE_IS_SET;
+               actual_string = "ASSERTION_STATE_IS_SET";
+       } else {
+               /*
+                * There is no assertion driven jetsam priority in place.
+                */
+               actual_assertion_state = ASSERTION_STATE_IS_RELINQUISHED;
+               actual_string = "ASSERTION_STATE_IS_RELINQUISHED";
+       }
+
+       if (actual_assertion_state == expected_assertion_state) {
+               T_PASS("%s as expected", expected_string);
+               return true;
+       } else {
+               T_FAIL("state 0x%x:  %s but expected %s", state, actual_string, expected_string);
+               // log_state(state);
+               return false;   /* failed */
+       }
+}
+
+static void
+log_state(uint32_t state)
+{
+       T_LOG("\t%s kMemorystatusSuspended", ((state & kMemorystatusSuspended)        ? "IS " : "NOT"));
+       T_LOG("\t%s kMemorystatusFrozen", ((state & kMemorystatusFrozen)           ? "IS " : "NOT"));
+       T_LOG("\t%s kMemorystatusWasThawed", ((state & kMemorystatusWasThawed)        ? "IS " : "NOT"));
+       T_LOG("\t%s kMemorystatusTracked", ((state & kMemorystatusTracked)          ? "IS " : "NOT"));
+       T_LOG("\t%s kMemorystatusSupportsIdleExit", ((state & kMemorystatusSupportsIdleExit) ? "IS " : "NOT"));
+       T_LOG("\t%s kMemorystatusDirty", ((state & kMemorystatusDirty)            ? "IS " : "NOT"));
+       T_LOG("\t%s kMemorystatusAssertion", ((state & kMemorystatusAssertion)        ? "IS " : "NOT"));
+}
diff --git a/tests/memorystatus_assertion_helpers.h b/tests/memorystatus_assertion_helpers.h
new file mode 100644
index 0000000..7827743
--- /dev/null
+++ b/tests/memorystatus_assertion_helpers.h
@@ -0,0 +1,92 @@
+#ifndef MEMORYSTATUS_ASSERTION_HELPERS_H
+#define MEMORYSTATUS_ASSERTION_HELPERS_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#define ASSERTION_STATE_IS_SET          true
+#define ASSERTION_STATE_IS_RELINQUISHED false
+
+/* Helper functions for setting and checking memorystatus assertions
+ * on processes.
+ */
+
+/*
+ * Set the jetsam priority and user data for a process.
+ *
+ * If this request is assertion driven, the kernel will
+ * set the process's assertion priority.
+ *
+ * If this request is not assertion driven, the kernel
+ * will set the process's requested priority.
+ *
+ * The kernel will then apply policy and move the process
+ * to the appropriate jetsam priority.
+ *
+ * Returns:    0 on success
+ *        non-0 on failure
+ */
+int
+set_priority(pid_t pid, int32_t priority, uint64_t user_data, boolean_t is_assertion_driven);
+
+/*
+ * Return: true on success
+ *         false on failure  --> this asserts a failure and quits the test
+ */
+boolean_t
+check_properties(pid_t pid, int32_t expected_priority, int32_t expected_limit_mb, uint64_t expected_user_data, boolean_t expected_assertion_state, const char *test);
+
+/*
+ *  Set the active and inactive memlimits for a process.
+ *  Set the fatalness for each limit.
+ *
+ * Returns:     0 on success
+ *              non-zero on failure
+ */
+int
+set_memlimits(
+       pid_t pid,
+       int32_t active_limit_mb, int32_t inactive_limit_mb,
+       boolean_t active_is_fatal, boolean_t inactive_is_fatal);
+
+/*
+ * Returns:    0 on success
+ *        non-0 on failure
+ */
+int
+set_assertion_priority(pid_t pid, int32_t priority, uint64_t user_data);
+
+/*
+ * Returns:    0 on success
+ *        non-0 on failure
+ */
+int
+relinquish_assertion_priority(pid_t pid, uint64_t user_data);
+
+/*
+ * Get the priority properties for a single process.
+ *
+ * This returns the process's effective jetsam priority, jetsam limit,
+ * user_data (not kernel related), and proc's kernel state.
+ * If this call fails, there is no reason to continue the test.
+ *
+ * Return: true on success
+ *        false on failure  --> this asserts a failure and the test quits
+ */
+boolean_t
+get_priority_props(pid_t pid, boolean_t verbose, int32_t *priority, int32_t *limit_mb, uint64_t *user_data, uint32_t *state);
+
+/*
+ * Input:
+ *     state:   kernel state bits from the get_priority_props() call
+ *      expected_assertion_state:
+ *             true if process should be holding an assertion state.
+ *             false if no assertion state is held (e.g., relinquished).
+ *
+ * Return  true:  verification passed
+ *       false:  verification failed
+ */
+boolean_t
+verify_assertion_state(uint32_t state, boolean_t expected_assertion_state);
+
+#endif /* MEMORYSTATUS_ASSERTION_HELPERS_H */
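A sketch of how a test could drive this helper API on itself, per the usage notes above (example_assertion_round_trip is hypothetical; the post-relinquish idle priority and the limit values are assumptions for illustration; requires root):

    #include <unistd.h>
    #include <sys/kern_memorystatus.h>
    #include "memorystatus_assertion_helpers.h"

    static void
    example_assertion_round_trip(void)
    {
            pid_t me = getpid();

            set_memlimits(me, 15, 7, TRUE, TRUE); /* arbitrary active/inactive limits */
            set_assertion_priority(me, JETSAM_PRIORITY_FOREGROUND, 0x1234);
            check_properties(me, JETSAM_PRIORITY_FOREGROUND, 7, 0x1234,
                ASSERTION_STATE_IS_SET, "assertion raised");

            /* Dropping the assertion should let the process age back to idle. */
            relinquish_assertion_priority(me, 0x1234);
            check_properties(me, JETSAM_PRIORITY_IDLE, 7, 0x1234,
                ASSERTION_STATE_IS_RELINQUISHED, "assertion relinquished");
    }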
index abd029e115c7fdcaac07419d0d0b63647842d6ed..c9399519aaccd50b047673a31b85dd1246cd0673 100644
@@ -3,6 +3,10 @@
 #include <sys/sysctl.h>
 #include <sys/kern_memorystatus.h>
 #include <mach-o/dyld.h>
+#include <mach/mach_vm.h>
+#include <mach/vm_page_size.h>  /* Needed for vm_region info */
+#include <mach/shared_region.h>
+#include <mach/mach.h>
 
 #ifdef T_NAMESPACE
 #undef T_NAMESPACE
@@ -10,6 +14,8 @@
 #include <darwintest.h>
 #include <darwintest_utils.h>
 
+#include "memorystatus_assertion_helpers.h"
+
 T_GLOBAL_META(
        T_META_NAMESPACE("xnu.vm"),
        T_META_CHECK_LEAKS(false)
@@ -17,6 +23,7 @@ T_GLOBAL_META(
 
 #define MEM_SIZE_MB                     10
 #define NUM_ITERATIONS          5
+#define FREEZE_PAGES_MAX 256
 
 #define CREATE_LIST(X) \
        X(SUCCESS) \
@@ -29,6 +36,7 @@ T_GLOBAL_META(
        X(MEMORYSTATUS_CONTROL_FAILED) \
        X(IS_FREEZABLE_NOT_AS_EXPECTED) \
        X(MEMSTAT_PRIORITY_CHANGE_FAILED) \
+       X(INVALID_ALLOCATE_PAGES_ARGUMENTS) \
        X(EXIT_CODE_MAX)
 
 #define EXIT_CODES_ENUM(VAR) VAR,
@@ -41,14 +49,199 @@ static const char *exit_codes_str[] = {
        CREATE_LIST(EXIT_CODES_STRING)
 };
 
+static int
+get_vmpage_size()
+{
+       int vmpage_size;
+       size_t size = sizeof(vmpage_size);
+       int ret = sysctlbyname("vm.pagesize", &vmpage_size, &size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "failed to query vm.pagesize");
+       T_QUIET; T_ASSERT_GT(vmpage_size, 0, "vm.pagesize is not > 0");
+       return vmpage_size;
+}
 
-static pid_t pid = -1;
+static pid_t child_pid = -1;
 static int freeze_count = 0;
 
 void move_to_idle_band(void);
-void run_freezer_test(int size_mb);
+void run_freezer_test(int);
 void freeze_helper_process(void);
+/* Gets and optionally sets the freeze pages max threshold */
+int sysctl_freeze_pages_max(int* new_value);
+
+/* NB: in_shared_region and get_rprvt are pulled from the memorystatus unit test.
+ * We're moving away from those unit tests, so they're copied here.
+ */
+
+/* Cribbed from 'top'... */
+static int
+in_shared_region(mach_vm_address_t addr, cpu_type_t type)
+{
+       mach_vm_address_t base = 0, size = 0;
+
+       switch (type) {
+       case CPU_TYPE_ARM:
+               base = SHARED_REGION_BASE_ARM;
+               size = SHARED_REGION_SIZE_ARM;
+               break;
+
+       case CPU_TYPE_ARM64:
+               base = SHARED_REGION_BASE_ARM64;
+               size = SHARED_REGION_SIZE_ARM64;
+               break;
+
+
+       case CPU_TYPE_X86_64:
+               base = SHARED_REGION_BASE_X86_64;
+               size = SHARED_REGION_SIZE_X86_64;
+               break;
+
+       case CPU_TYPE_I386:
+               base = SHARED_REGION_BASE_I386;
+               size = SHARED_REGION_SIZE_I386;
+               break;
+
+       case CPU_TYPE_POWERPC:
+               base = SHARED_REGION_BASE_PPC;
+               size = SHARED_REGION_SIZE_PPC;
+               break;
+
+       case CPU_TYPE_POWERPC64:
+               base = SHARED_REGION_BASE_PPC64;
+               size = SHARED_REGION_SIZE_PPC64;
+               break;
+
+       default: {
+               int t = type;
+
+               fprintf(stderr, "unknown CPU type: 0x%x\n", t);
+               abort();
+       }
+       }
+
+       return addr >= base && addr < (base + size);
+}
+
+/* Get the resident private memory of the given pid */
+static unsigned long long
+get_rprvt(pid_t pid)
+{
+       mach_port_name_t task;
+       kern_return_t kr;
+
+       mach_vm_size_t rprvt = 0;
+       mach_vm_size_t empty = 0;
+       mach_vm_size_t fw_private = 0;
+       mach_vm_size_t pagesize = vm_kernel_page_size;  // The vm_region page info is reported
+                                                       // in terms of vm_kernel_page_size.
+       mach_vm_size_t regs = 0;
+
+       mach_vm_address_t addr;
+       mach_vm_size_t size;
+
+       int split = 0;
+
+       kr = task_for_pid(mach_task_self(), pid, &task);
+       T_QUIET; T_ASSERT_TRUE(kr == KERN_SUCCESS, "Unable to get task_for_pid of child");
+
+       for (addr = 0;; addr += size) {
+               vm_region_top_info_data_t info;
+               mach_msg_type_number_t count = VM_REGION_TOP_INFO_COUNT;
+               mach_port_t object_name;
+
+               kr = mach_vm_region(task, &addr, &size, VM_REGION_TOP_INFO, (vm_region_info_t)&info, &count, &object_name);
+               if (kr != KERN_SUCCESS) {
+                       break;
+               }
+
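+               /* NB: the fallback branch assumes an ARM (not x86_64) target; this test is embedded-focused. */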
+#if   defined (__arm64__)
+               if (in_shared_region(addr, CPU_TYPE_ARM64)) {
+#else
+               if (in_shared_region(addr, CPU_TYPE_ARM)) {
+#endif
+                       // Private Shared
+                       fw_private += info.private_pages_resident * pagesize;
+
+                       /*
+                        * Check if this process has the globally shared
+                        * text and data regions mapped in.  If so, set
+                        * split to TRUE and avoid checking
+                        * again.
+                        */
+                       if (split == FALSE && info.share_mode == SM_EMPTY) {
+                               vm_region_basic_info_data_64_t  b_info;
+                               mach_vm_address_t b_addr = addr;
+                               mach_vm_size_t b_size = size;
+                               count = VM_REGION_BASIC_INFO_COUNT_64;
+
+                               kr = mach_vm_region(task, &b_addr, &b_size, VM_REGION_BASIC_INFO_64, (vm_region_info_t)&b_info, &count, &object_name);
+                               if (kr != KERN_SUCCESS) {
+                                       break;
+                               }
+
+                               if (b_info.reserved) {
+                                       split = TRUE;
+                               }
+                       }
+
+                       /*
+                        * Short circuit the loop if this isn't a shared
+                        * private region, since that's the only region
+                        * type we care about within the current address
+                        * range.
+                        */
+                       if (info.share_mode != SM_PRIVATE) {
+                               continue;
+                       }
+               }
 
+               regs++;
+
+               /*
+                * Update counters according to the region type.
+                */
+
+               if (info.share_mode == SM_COW && info.ref_count == 1) {
+                       // Treat single reference SM_COW as SM_PRIVATE
+                       info.share_mode = SM_PRIVATE;
+               }
+
+               switch (info.share_mode) {
+               case SM_LARGE_PAGE:
+               // Treat SM_LARGE_PAGE the same as SM_PRIVATE
+               // since they are not shareable and are wired.
+               case SM_PRIVATE:
+                       rprvt += info.private_pages_resident * pagesize;
+                       rprvt += info.shared_pages_resident * pagesize;
+                       break;
+
+               case SM_EMPTY:
+                       empty += size;
+                       break;
+
+               case SM_COW:
+               case SM_SHARED:
+                       if (pid == 0) {
+                               // Treat kernel_task specially
+                               if (info.share_mode == SM_COW) {
+                                       rprvt += info.private_pages_resident * pagesize;
+                               }
+                               break;
+                       }
+
+                       if (info.share_mode == SM_COW) {
+                               rprvt += info.private_pages_resident * pagesize;
+                       }
+                       break;
+
+               default:
+                       assert(0);
+                       break;
+               }
+       }
+
+       return rprvt;
+}
 
 void
 move_to_idle_band(void)
@@ -75,9 +268,12 @@ freeze_helper_process(void)
 {
        size_t length;
        int ret, freeze_enabled, errno_freeze_sysctl;
+       uint64_t resident_memory_before, resident_memory_after, vmpage_size;
+       vmpage_size = (uint64_t) get_vmpage_size();
+       resident_memory_before = get_rprvt(child_pid) / vmpage_size;
 
-       T_LOG("Freezing child pid %d", pid);
-       ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &pid, sizeof(pid));
+       T_LOG("Freezing child pid %d", child_pid);
+       ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &child_pid, sizeof(child_pid));
        errno_freeze_sysctl = errno;
        sleep(1);
 
@@ -85,7 +281,7 @@ freeze_helper_process(void)
         * The child process toggles its freezable state on each iteration.
         * So a failure for every alternate freeze is expected.
         */
-       if (freeze_count % 2 == 0) {
+       if (freeze_count % 2) {
                length = sizeof(freeze_enabled);
                T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.freeze_enabled", &freeze_enabled, &length, NULL, 0),
                    "failed to query vm.freeze_enabled");
@@ -97,9 +293,13 @@ freeze_helper_process(void)
                        T_LOG("Freeze has been disabled. Terminating early.");
                        T_END;
                }
-
-               T_LOG("Freeze succeeded. Thawing child pid %d", pid);
-               ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &pid, sizeof(pid));
+               resident_memory_after = get_rprvt(child_pid) / vmpage_size;
+               uint64_t freeze_pages_max = (uint64_t) sysctl_freeze_pages_max(NULL);
+               T_QUIET; T_ASSERT_LT(resident_memory_after, resident_memory_before, "Freeze didn't reduce resident memory set");
+               if (resident_memory_before > freeze_pages_max) {
+                       T_QUIET; T_ASSERT_LE(resident_memory_before - resident_memory_after, freeze_pages_max, "Freeze froze more pages than the freeze_pages_max threshold.");
+               }
+               ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &child_pid, sizeof(child_pid));
                T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_thaw failed");
        } else {
                T_QUIET; T_ASSERT_TRUE(ret != KERN_SUCCESS, "Freeze should have failed");
@@ -108,11 +308,11 @@ freeze_helper_process(void)
 
        freeze_count++;
 
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGUSR1), "failed to send SIGUSR1 to child process");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGUSR1), "failed to send SIGUSR1 to child process");
 }
 
 void
-run_freezer_test(int size_mb)
+run_freezer_test(int num_pages)
 {
        int ret, freeze_enabled;
        char sz_str[50];
@@ -138,7 +338,7 @@ run_freezer_test(int size_mb)
                if (freeze_count < NUM_ITERATIONS) {
                        freeze_helper_process();
                } else {
-                       kill(pid, SIGKILL);
+                       kill(child_pid, SIGKILL);
                        dispatch_source_cancel(ds_freeze);
                }
        });
@@ -149,7 +349,7 @@ run_freezer_test(int size_mb)
        T_QUIET; T_ASSERT_POSIX_ZERO(ret, "_NSGetExecutablePath");
        T_LOG("Executable path: %s", testpath);
 
-       sprintf(sz_str, "%d", size_mb);
+       sprintf(sz_str, "%d", num_pages);
        launch_tool_args = (char *[]){
                testpath,
                "-n",
@@ -160,19 +360,19 @@ run_freezer_test(int size_mb)
        };
 
        /* Spawn the child process. Suspend after launch until the exit proc handler has been set up. */
-       ret = dt_launch_tool(&pid, launch_tool_args, true, NULL, NULL);
+       ret = dt_launch_tool(&child_pid, launch_tool_args, true, NULL, NULL);
        if (ret != 0) {
                T_LOG("dt_launch tool returned %d with error code %d", ret, errno);
        }
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(child_pid, "dt_launch_tool");
 
-       ds_proc = dispatch_source_create(DISPATCH_SOURCE_TYPE_PROC, (uintptr_t)pid, DISPATCH_PROC_EXIT, dispatch_get_main_queue());
+       ds_proc = dispatch_source_create(DISPATCH_SOURCE_TYPE_PROC, (uintptr_t)child_pid, DISPATCH_PROC_EXIT, dispatch_get_main_queue());
        T_QUIET; T_ASSERT_NOTNULL(ds_proc, "dispatch_source_create (ds_proc)");
 
        dispatch_source_set_event_handler(ds_proc, ^{
                int status = 0, code = 0;
-               pid_t rc = waitpid(pid, &status, 0);
-               T_QUIET; T_ASSERT_EQ(rc, pid, "waitpid");
+               pid_t rc = waitpid(child_pid, &status, 0);
+               T_QUIET; T_ASSERT_EQ(rc, child_pid, "waitpid");
                code = WEXITSTATUS(status);
 
                if (code == 0) {
@@ -185,35 +385,24 @@ run_freezer_test(int size_mb)
        });
        dispatch_activate(ds_proc);
 
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGCONT), "failed to send SIGCONT to child process");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGCONT), "failed to send SIGCONT to child process");
        dispatch_main();
 }
 
-T_HELPER_DECL(allocate_pages,
-    "allocates pages to freeze",
-    T_META_ASROOT(true)) {
-       int i, j, ret, size_mb, vmpgsize;
-       size_t len;
+static void
+allocate_pages(int num_pages)
+{
+       int i, j, vmpgsize;
        char val;
-       __block int num_pages, num_iter = 0;
+       __block int num_iter = 0;
        __block char **buf;
        dispatch_source_t ds_signal;
-
-       len = sizeof(vmpgsize);
-       ret = sysctlbyname("vm.pagesize", &vmpgsize, &len, NULL, 0);
-       if (ret != 0) {
-               exit(SYSCTL_VM_PAGESIZE_FAILED);
-       }
-       if (vmpgsize == 0) {
-               exit(VM_PAGESIZE_IS_ZERO);
-       }
-
-       if (argc < 1) {
-               exit(TOO_FEW_ARGUMENTS);
+       vmpgsize = get_vmpage_size();
+       if (num_pages < 1) {
+               printf("Invalid number of pages to allocate: %d\n", num_pages);
+               exit(INVALID_ALLOCATE_PAGES_ARGUMENTS);
        }
 
-       size_mb = atoi(argv[0]);
-       num_pages = size_mb * 1024 * 1024 / vmpgsize;
        buf = (char**)malloc(sizeof(char*) * (size_t)num_pages);
 
        /* Gives us the compression ratio we see in the typical case (~2.7) */
@@ -252,6 +441,10 @@ T_HELPER_DECL(allocate_pages,
                }
 
                current_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0);
+               /* Sysprocs start off as unfreezable. Verify that first. */
+               if (num_iter == 0 && current_state != 0) {
+                       exit(IS_FREEZABLE_NOT_AS_EXPECTED);
+               }
 
                /* Toggle freezable state */
                new_state = (current_state) ? 0: 1;
@@ -278,6 +471,182 @@ T_HELPER_DECL(allocate_pages,
        dispatch_main();
 }
 
-T_DECL(freeze, "VM freezer test") {
-       run_freezer_test(MEM_SIZE_MB);
+T_HELPER_DECL(allocate_pages,
+    "allocates pages to freeze",
+    T_META_ASROOT(true)) {
+       if (argc < 1) {
+               exit(TOO_FEW_ARGUMENTS);
+       }
+
+       int num_pages = atoi(argv[0]);
+       allocate_pages(num_pages);
+}
+
+T_DECL(freeze, "VM freezer test", T_META_ASROOT(true)) {
+       run_freezer_test(
+               (MEM_SIZE_MB << 20) / get_vmpage_size());
+}
+
+static int old_freeze_pages_max = 0;
+static void
+reset_freeze_pages_max()
+{
+       if (old_freeze_pages_max != 0) {
+               sysctl_freeze_pages_max(&old_freeze_pages_max);
+       }
+}
+
+int
+sysctl_freeze_pages_max(int* new_value)
+{
+       static int set_end_handler = false;
+       int freeze_pages_max, ret;
+       size_t size = sizeof(freeze_pages_max);
+       ret = sysctlbyname("kern.memorystatus_freeze_pages_max", &freeze_pages_max, &size, new_value, size);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Unable to query kern.memorystatus_freeze_pages_max");
+       if (!set_end_handler) {
+               // Save the original value and instruct darwintest to restore it after the test completes
+               old_freeze_pages_max = freeze_pages_max;
+               T_ATEND(reset_freeze_pages_max);
+               set_end_handler = true;
+       }
+       return old_freeze_pages_max;
+}
+
+T_DECL(freeze_over_max_threshold, "Max Freeze Threshold is Enforced", T_META_ASROOT(true)) {
+       int freeze_pages_max = FREEZE_PAGES_MAX;
+       sysctl_freeze_pages_max(&freeze_pages_max);
+       run_freezer_test(FREEZE_PAGES_MAX * 2);
+}
+
+T_HELPER_DECL(frozen_background, "Frozen background process", T_META_ASROOT(true)) {
+       kern_return_t kern_ret;
+       /* Set the process to freezable */
+       kern_ret = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), 1, NULL, 0);
+       T_QUIET; T_ASSERT_EQ(kern_ret, KERN_SUCCESS, "set process is freezable");
+       /* Signal to our parent that we can be frozen */
+       if (kill(getppid(), SIGUSR1) != 0) {
+               T_LOG("Unable to signal to parent process!");
+               exit(1);
+       }
+       while (1) {
+               ;
+       }
+}
+
+/* Launches the frozen_background helper as a managed process. */
+static pid_t
+launch_frozen_background_process()
+{
+       pid_t pid;
+       char **launch_tool_args;
+       char testpath[PATH_MAX];
+       uint32_t testpath_buf_size;
+       int ret;
+
+       testpath_buf_size = sizeof(testpath);
+       ret = _NSGetExecutablePath(testpath, &testpath_buf_size);
+       printf("Launching %s\n", testpath);
+       launch_tool_args = (char *[]){
+               testpath,
+               "-n",
+               "frozen_background",
+               NULL
+       };
+       ret = dt_launch_tool(&pid, launch_tool_args, false, NULL, NULL);
+       if (ret != 0) {
+               T_LOG("dt_launch tool returned %d with error code %d", ret, errno);
+       }
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dt_launch_tool");
+       /* Set the process's managed bit, so that the kernel treats this process like an app instead of a sysproc. */
+       ret = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED, pid, 1, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "memorystatus_control");
+       return pid;
+}
+
+static void
+freeze_process(pid_t pid)
+{
+       int ret, freeze_enabled, errno_freeze_sysctl;
+       size_t length;
+       T_LOG("Freezing pid %d", pid);
+
+       ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &pid, sizeof(pid));
+       errno_freeze_sysctl = errno;
+       length = sizeof(freeze_enabled);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.freeze_enabled", &freeze_enabled, &length, NULL, 0),
+           "failed to query vm.freeze_enabled");
+       if (freeze_enabled) {
+               errno = errno_freeze_sysctl;
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_freeze failed");
+       } else {
+               /* If freezer is disabled, skip the test. This can happen due to disk space shortage. */
+               T_LOG("Freeze has been disabled. Terminating early.");
+               T_END;
+       }
+}
+
+static void
+memorystatus_assertion_test_demote_frozen()
+{
+#if !CONFIG_EMBEDDED
+       T_SKIP("Freezing processes is only supported on embedded");
+#endif
+       /*
+        * Test that if we assert a priority on a process, freeze it, and then demote all frozen processes, it does not get demoted below the asserted priority.
+        * Then remove the assertion, and ensure it gets demoted properly.
+        */
+       /* these values will remain fixed during testing */
+       int             active_limit_mb = 15;   /* arbitrary */
+       int             inactive_limit_mb = 7;  /* arbitrary */
+       /* Launch the child process, and elevate its priority */
+       int requestedpriority;
+       dispatch_source_t ds_signal, ds_exit;
+       requestedpriority = JETSAM_PRIORITY_UI_SUPPORT;
+
+       /* Wait for the child process to tell us that it's ready, and then freeze it */
+       signal(SIGUSR1, SIG_IGN);
+       ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
+       T_QUIET; T_ASSERT_NOTNULL(ds_signal, "dispatch_source_create");
+       dispatch_source_set_event_handler(ds_signal, ^{
+               int sysctl_ret;
+               /* Freeze the process, trigger aggressive demotion, and check that it hasn't been demoted. */
+               freeze_process(child_pid);
+               /* Aggressive demotion */
+               sysctl_ret = sysctlbyname("kern.memorystatus_demote_frozen_processes", NULL, NULL, NULL, 0);
+               T_ASSERT_POSIX_SUCCESS(sysctl_ret, "sysctl kern.memorystatus_demote_frozen_processes failed");
+               /* Check */
+               (void)check_properties(child_pid, requestedpriority, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_SET, "Priority was set");
+               T_LOG("Relinquishing our assertion.");
+               /* Relinquish our assertion, and check that it gets demoted. */
+               relinquish_assertion_priority(child_pid, 0x0);
+               (void)check_properties(child_pid, JETSAM_PRIORITY_AGING_BAND2, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_RELINQUISHED, "Assertion was relinquished.");
+               /* Kill the child */
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "Unable to kill child process");
+               T_END;
+       });
+
+       /* Launch the child process and set the initial properties on it. */
+       child_pid = launch_frozen_background_process();
+       set_memlimits(child_pid, active_limit_mb, inactive_limit_mb, false, false);
+       set_assertion_priority(child_pid, requestedpriority, 0x0);
+       (void)check_properties(child_pid, requestedpriority, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_SET, "Priority was set");
+       /* Listen for exit. */
+       ds_exit = dispatch_source_create(DISPATCH_SOURCE_TYPE_PROC, (uintptr_t)child_pid, DISPATCH_PROC_EXIT, dispatch_get_main_queue());
+       dispatch_source_set_event_handler(ds_exit, ^{
+               int status = 0, code = 0;
+               pid_t rc = waitpid(child_pid, &status, 0);
+               T_QUIET; T_ASSERT_EQ(rc, child_pid, "waitpid");
+               code = WEXITSTATUS(status);
+               T_QUIET; T_ASSERT_EQ(code, 0, "Child exited cleanly");
+               T_END;
+       });
+
+       dispatch_activate(ds_exit);
+       dispatch_activate(ds_signal);
+       dispatch_main();
+}
+
+T_DECL(assertion_test_demote_frozen, "demoted frozen process goes to asserted priority.", T_META_ASROOT(true)) {
+       memorystatus_assertion_test_demote_frozen();
 }
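sysctl_freeze_pages_max() above uses a sysctl idiom worth noting: a single sysctlbyname() call can return the previous value while installing a new one. A standalone sketch of the pattern (freeze_pages_max_rw is a hypothetical name, not part of the commit):

    #include <stddef.h>
    #include <sys/sysctl.h>

    /* Read kern.memorystatus_freeze_pages_max; if new_value is non-NULL,
     * replace it in the same call. Returns 0 on success, -1 on error. */
    static int
    freeze_pages_max_rw(int *old_value, int *new_value)
    {
            size_t size = sizeof(*old_value);

            return sysctlbyname("kern.memorystatus_freeze_pages_max",
                       old_value, &size,
                       new_value, new_value ? sizeof(*new_value) : 0);
    }

The committed helper additionally stashes the first value it reads and registers reset_freeze_pages_max() via T_ATEND() so the threshold is restored when the test exits.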
diff --git a/tests/memorystatus_is_assertion.c b/tests/memorystatus_is_assertion.c
new file mode 100644
index 0000000..6475513
--- /dev/null
+++ b/tests/memorystatus_is_assertion.c
@@ -0,0 +1,506 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <signal.h>
+#include <spawn.h>
+#include <spawn_private.h>
+#include <stdint.h>
+#include <sys/sysctl.h>
+#include <sys/spawn_internal.h>
+#include <sys/kern_memorystatus.h>
+#include <mach-o/dyld.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#include "memorystatus_assertion_helpers.h"
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.vm"),
+       T_META_CHECK_LEAKS(false)
+       );
+
+extern char **environ;
+
+/*
+ * This test has multiple sub-tests that set and then verify jetsam priority transitions
+ * as though they were driven by assertions. It uses the MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES
+ * version of the memorystatus_control() system call and specifically tests the use of the
+ * MEMORYSTATUS_SET_PRIORITY_ASSERTION flag.
+ *
+ * The kernel will apply policy that chooses a maximum jetsam priority, resolving conflicts
+ * between an assertion driven priority and clean/dirty transition policy.
+ *
+ * Processes that do not opt into dirty-tracking should behave as they always have.
+ * This is the typical App transition behavior.
+ *
+ * Processes that do opt into dirty-tracking have more complex policy:
+ * For example:
+ * A MAX assertion priority will prevent a dirty process from transitioning to a clean
+ * state if the process opts into idle-exit.
+ *    See: memorystatus_schedule_idle_demotion_locked() where we note that
+ *    the process isn't going to be making the trip to the lower bands.
+ *
+ * But a MAX assertion evaluation will not prevent a clean process from transition to dirty.
+ * Assertion driven priorities should not change memory limits, they are expected to
+ * just change a process's position in the jetsam priority bands.
+ *
+ * MEMORYSTATUS_CMD_xxx requires root (in the absence of entitlement).
+ * Use T_META_ASROOT(true) to accomplish this.
+ *
+ * A note on test strategy.  It is not necessary to spawn a child to test these
+ * assertion calls.   The test can act on itself, that is, it can make calls to
+ * set and relinquish assertion state just like it can make calls to do dirty/clean
+ * transitions.  Of course, in reality, we expect only runningboardd to manipulate
+ * assertion based priorities.
+ */
+
+/*
+ * New flag to tell kernel this is an assertion driven priority update.
+ */
+#ifndef MEMORYSTATUS_SET_PRIORITY_ASSERTION
+#define MEMORYSTATUS_SET_PRIORITY_ASSERTION 0x1
+#endif
+
+static void
+proc_will_set_clean(pid_t pid)
+{
+       proc_set_dirty(pid, false);
+       T_LOG("pid[%d] --> now clean", pid);
+       return;
+}
+
+static void
+proc_will_set_dirty(pid_t pid)
+{
+       proc_set_dirty(pid, true);
+       T_LOG("pid[%d] --> now dirty", pid);
+       return;
+}
+
+#define kJetsamAgingPolicyNone                          (0)
+#define kJetsamAgingPolicyLegacy                        (1)
+#define kJetsamAgingPolicySysProcsReclaimedFirst        (2)
+#define kJetsamAgingPolicyAppsReclaimedFirst            (3)
+#define kJetsamAgingPolicyMax                           kJetsamAgingPolicyAppsReclaimedFirst
+
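+/* State bit expected when an assertion priority is in effect; checked via verify_assertion_state(). */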
+#ifndef kMemorystatusAssertion
+#define kMemorystatusAssertion 0x40
+#endif
+
+/*
+ * Make repetitive (e.g. back-to-back) calls using MEMORYSTATUS_SET_PRIORITY_ASSERTION.
+ * We know that runningboardd may try to relinquish its hold on an assertion priority
+ * when it hasn't first set the assertion priority. The kernel must survive this
+ * pattern even though it might be considered poor behavior on runningboardd's part.
+ * When dirty-tracking processes are involved, we are exercising the kernel's
+ * idle-deferred paths. Only assertion state (whether assertion state is
+ * set or relinquished) is verified in this round of tests.
+ * The test is invoked three times:
+ *     Scenario 1) as a non-dirty-tracking process  (like a typical app)
+ *             relinquish assertion priority multiple times
+ *             set the same assertion priority multiple times.
+ *     Scenario 2) set up a dirty-tracking process that is clean  (like a typical extension)
+ *             relinquish assertion priority multiple times
+ *             set the same assertion priority multiple times.
+ *     Scenario 3) set up a dirty-tracking process that is dirty  (like a typical extension)
+ *             relinquish assertion priority multiple times
+ *             set the same assertion priority multiple times.
+ */
+
+static void
+memorystatus_assertion_test_repetitive(char *test, boolean_t turn_on_dirty_tracking, boolean_t start_clean)
+{
+       int count;
+       int maxcount = 3;
+       boolean_t verbose;
+       uint32_t state;
+       uint64_t user_data = 0;
+       pid_t mypid = getpid();
+
+       /* these values will remain fixed during testing */
+       int             active_limit_mb = 15;   /* arbitrary */
+       int             inactive_limit_mb = 7;  /* arbitrary */
+
+       /* these values may vary during test */
+       int             requestedpriority = 0;
+       int             assertionpriority = 0;
+
+       T_SETUPBEGIN;
+
+       requestedpriority =  JETSAM_PRIORITY_UI_SUPPORT;
+       assertionpriority =  JETSAM_PRIORITY_FOREGROUND;
+       set_memlimits(mypid, active_limit_mb, inactive_limit_mb, true, true);
+       set_priority(mypid, requestedpriority, 0, false);
+
+       if (turn_on_dirty_tracking) {
+               proc_track_dirty(mypid, (PROC_DIRTY_TRACK | PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER));
+
+               if (start_clean) {
+                       proc_will_set_clean(mypid);
+               } else {
+                       proc_will_set_dirty(mypid);
+               }
+       } else {
+               /*
+                * Do nothing.
+                * Acts like an app with no dirty tracking.
+                * By default it launches at the requested priority and is
+                * considered idle because it's below the FG band.
+                */
+       }
+
+
+       verbose = false;
+       (void)get_priority_props(mypid, verbose, NULL, NULL, NULL, NULL);
+
+       /* log current setup state */
+       T_LOG("SETUP STATE COMPLETE: Test %s", test);
+
+       T_SETUPEND;
+
+       int i;
+       boolean_t ret;
+       for (i = 0; i < 2; i++) {
+               if (i == 1 && turn_on_dirty_tracking) {
+                       T_LOG("Avoid idle-deferred - sleeping for 20");
+                       sleep(20);
+
+                       if (start_clean) {
+                               proc_will_set_dirty(mypid);
+                       } else {
+                               proc_will_set_clean(mypid);
+                       }
+
+                       (void)get_priority_props(mypid, verbose, NULL, NULL, NULL, &state);
+               }
+
+               /*
+                * Relinquish assertion priority even though we don't
+                * currently hold an assertion priority.
+                */
+               for (count = 0; count < maxcount; count++) {
+                       if (relinquish_assertion_priority(mypid, user_data)) {
+                               T_ASSERT_FAIL("relinquish_assertion_priority failed");
+                       }
+               }
+
+               /* Verify assertion state is relinquished */
+               (void)get_priority_props(mypid, verbose, NULL, NULL, NULL, &state);
+
+               ret = verify_assertion_state(state, ASSERTION_STATE_IS_RELINQUISHED);
+               T_QUIET;
+               T_ASSERT_TRUE(ret, "verify_assertion_state failed");
+
+               /*
+                * Set an assertion priority multiple times in a row.
+                */
+               for (count = 0; count < maxcount; count++) {
+                       if (set_assertion_priority(mypid, assertionpriority, user_data) != 0) {
+                               T_ASSERT_FAIL("set_assertion_priority failed");
+                       }
+               }
+
+               /* Verify state holds an assertion priority */
+               (void)get_priority_props(mypid, verbose, NULL, NULL, NULL, &state);
+
+               ret = verify_assertion_state(state, ASSERTION_STATE_IS_SET);
+               T_QUIET;
+               T_ASSERT_TRUE(ret, "verify_assertion_state failed");
+       }
+}
+
+/*
+ * Process is dirty tracking and opts into pressured exit.
+ */
+static void
+memorystatus_assertion_test_allow_idle_exit()
+{
+       pid_t mypid = getpid();
+
+       /* these values will remain fixed during testing */
+       int active_limit_mb   = 15; /* arbitrary */
+       int inactive_limit_mb = 7;  /* arbitrary */
+
+       /* these values may vary during test */
+       int requestedpriority = JETSAM_PRIORITY_UI_SUPPORT;
+
+       T_SETUPBEGIN;
+
+       set_memlimits(mypid, active_limit_mb, inactive_limit_mb, true, true);
+       set_priority(mypid, requestedpriority, 0, false);
+
+       proc_track_dirty(mypid, (PROC_DIRTY_TRACK | PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER));
+
+       proc_will_set_clean(mypid);
+
+       (void)check_properties(mypid, JETSAM_PRIORITY_IDLE_DEFERRED, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_RELINQUISHED, "Clean start");
+
+       T_LOG("SETUP STATE COMPLETE");
+
+       int g_jetsam_aging_policy = 0;
+       /*
+        * Jetsam aging policy
+        * Failure to retrieve is not fatal.
+        */
+       size_t size = sizeof(g_jetsam_aging_policy);
+       if (sysctlbyname("kern.jetsam_aging_policy", &g_jetsam_aging_policy, &size, NULL, 0) != 0) {
+               T_LOG("%s: Unable to retrieve jetsam aging policy (not fatal)", __func__);
+       }
+
+       T_SETUPEND;
+
+       /*
+        * Relinquish assertion priority even though we don't hold it.  No change in state expected.
+        */
+       T_LOG("********Test0 clean: no state change on relinquish");
+       relinquish_assertion_priority(mypid, 0xF00D);
+       (void)check_properties(mypid, JETSAM_PRIORITY_IDLE_DEFERRED, inactive_limit_mb, 0xF00D, ASSERTION_STATE_IS_RELINQUISHED, "Test0");
+
+       T_LOG("********Test1 clean: deferred now assertion[10]");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_FOREGROUND, 0xFEED);
+       (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, inactive_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test1");
+
+       /* Test2 */
+       T_LOG("********Test2 clean:  assertion[10 -> 3]");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFACE);
+       (void)check_properties(mypid, JETSAM_PRIORITY_BACKGROUND, inactive_limit_mb, 0xFACE, ASSERTION_STATE_IS_SET, "Test2");
+
+       /* Test3 */
+       T_LOG("********Test3 clean: assertion[3 -> 0], but now deferred");
+       relinquish_assertion_priority(mypid, 0xBEEF);
+       (void)check_properties(mypid, JETSAM_PRIORITY_IDLE_DEFERRED, inactive_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test3");
+
+       /* Test4 */
+       T_LOG("********Test4 clean: deferred now assertion[10]");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_FOREGROUND, 0xFEED);
+       (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, inactive_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test4");
+
+       T_LOG("Avoid idle-deferred moving forward. Sleeping for 20");
+       sleep(20);
+
+       /* Test5 */
+       T_LOG("********Test5 dirty: set dirty priority but assertion[10] prevails");
+       proc_will_set_dirty(mypid);   /* active priority is less than FG*/
+       (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test5");
+
+       /* Test6 */
+       T_LOG("********Test6 dirty: assertion[10 -> 3] but dirty priority prevails");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFEEB);  /* active priority is > BG */
+       (void)check_properties(mypid, JETSAM_PRIORITY_UI_SUPPORT, active_limit_mb, 0xFEEB, ASSERTION_STATE_IS_SET, "Test6");
+
+       /* Test7 */
+       T_LOG("********Test7 dirty: assertion[3 -> 0] but dirty prevails");
+       relinquish_assertion_priority(mypid, 0xBEEF);
+       (void)check_properties(mypid, JETSAM_PRIORITY_UI_SUPPORT, active_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test7");
+
+       /* Test8 */
+       T_LOG("********Test8 dirty: assertion[0 -> 10] overrides dirty");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_FOREGROUND, 0xFEED);
+       (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test8");
+
+       /* Test9 */
+       T_LOG("********Test9 dirty wants to go clean, but clean state is prevented as assertion[10] prevails");
+       proc_will_set_clean(mypid);
+       (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test9");
+
+       /* Test10 */
+       T_LOG("********Test10 dirty goes dirty and stays dirty, and assertion[10] prevails again");
+       proc_will_set_dirty(mypid);
+       (void)check_properties(mypid, JETSAM_PRIORITY_FOREGROUND, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test10");
+
+       /* Test11 */
+       T_LOG("********Test11 dirty: assertion[10 -> 3] but dirty prevails");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFACE);
+       (void)check_properties(mypid, JETSAM_PRIORITY_UI_SUPPORT, active_limit_mb, 0xFACE, ASSERTION_STATE_IS_SET, "Test11");
+
+       /* Test12 */
+       T_LOG("********Test12 dirty: assertion[3 -> 0] but dirty prevails");
+       relinquish_assertion_priority(mypid, 0xBEEF);
+       (void)check_properties(mypid, JETSAM_PRIORITY_UI_SUPPORT, active_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test12");
+
+       /* Test13 */
+       T_LOG("********Test13 dirty goes clean: both assertion[0] and clean");
+       proc_will_set_clean(mypid);
+       if (g_jetsam_aging_policy == kJetsamAgingPolicySysProcsReclaimedFirst) {
+               /* For sysproc aging policy the daemon should be at idle deferred and with an active memory limit */
+               (void)check_properties(mypid, JETSAM_PRIORITY_IDLE_DEFERRED, active_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test13");
+       } else {
+               /* For the legacy aging policy, daemon should be at idle band with inactive memory limit */
+               (void)check_properties(mypid, JETSAM_PRIORITY_IDLE, inactive_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test13");
+       }
+}
+
+/*
+ * Process is dirty tracking and does not opt into pressured exit.
+ * This test lives above Foreground.  Assertions will have no effect
+ * except where the assertion priority bumps it above the requested priority.
+ */
+static void
+memorystatus_assertion_test_do_not_allow_idle_exit()
+{
+       pid_t mypid = getpid();
+
+       /* these values will remain fixed during testing */
+       int             active_limit_mb = 15;   /* arbitrary */
+       int             inactive_limit_mb = 7;  /* arbitrary */
+       int             requestedpriority = JETSAM_PRIORITY_AUDIO_AND_ACCESSORY;
+
+       T_SETUPBEGIN;
+
+       set_memlimits(mypid, active_limit_mb, inactive_limit_mb, true, true);
+       set_priority(mypid, requestedpriority, 0, false);
+       proc_track_dirty(mypid, (PROC_DIRTY_TRACK));
+
+       proc_will_set_dirty(mypid);
+
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, active_limit_mb, 0x0, ASSERTION_STATE_IS_RELINQUISHED, "Dirty start");
+
+       proc_will_set_clean(mypid);
+
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_RELINQUISHED, "Clean transition");
+
+       T_LOG("SETUP STATE COMPLETE");
+
+       T_SETUPEND;
+
+       /*
+        * Relinquish assertion priority even though we don't hold it.  No change in state expected.
+        */
+
+       /* Test0 */
+       T_LOG("********Test0 clean: no state change on relinquish");
+       relinquish_assertion_priority(mypid, 0xF00D);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xF00D, ASSERTION_STATE_IS_RELINQUISHED, "Test0");
+
+       /* Test1 */
+       T_LOG("********Test1 clean: assertion[0 -> 10] but inactive priority prevails");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_FOREGROUND, 0xFEED);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test1");
+
+       /* Test2 */
+       T_LOG("********Test2 clean:  assertion[10 -> 3] but inactive priority prevails");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFACE);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xFACE, ASSERTION_STATE_IS_SET, "Test2");
+
+       /* Test3 */
+       T_LOG("********Test3 clean: assertion[3 -> 0], but inactive priority prevails");
+       relinquish_assertion_priority(mypid, 0xBEEF);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test3");
+
+       /* Test4 */
+       T_LOG("********Test4 go dirty: assertion[0] has no affect, active priority prevails");
+       proc_will_set_dirty(mypid);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, active_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test4");
+
+       /* Test5 */
+       T_LOG("********Test5 dirty: assertion[0 -> 10] active priority prevails");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_FOREGROUND, 0xFEED);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test5");
+
+       /* Test6 */
+       T_LOG("********Test6 dirty:  assertion[10 -> 3] active priority prevails");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFACE);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, active_limit_mb, 0xFACE, ASSERTION_STATE_IS_SET, "Test6");
+
+       /* Test 7 */
+       T_LOG("********Test7 dirty: assertion[3 -> 0], active priority prevails");
+       relinquish_assertion_priority(mypid, 0xBEEF);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, active_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test7");
+
+       /* Test8 */
+       T_LOG("********Test8 dirty: assertion[0 -> 19], dirty but now assertion[19] prevails");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_CRITICAL, 0xFEED);
+       (void)check_properties(mypid, JETSAM_PRIORITY_CRITICAL, active_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test8");
+
+       /* Test9 */
+       T_LOG("********Test9 go clean: inactive priority but assertion[19] prevails");
+       proc_will_set_clean(mypid);
+       (void)check_properties(mypid, JETSAM_PRIORITY_CRITICAL, inactive_limit_mb, 0xFEED, ASSERTION_STATE_IS_SET, "Test9");
+
+       /* Test10 */
+       T_LOG("********Test10 clean:  assertion[19 -> 3] inactive limit prevails");
+       set_assertion_priority(mypid, JETSAM_PRIORITY_BACKGROUND, 0xFACE);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xFACE, ASSERTION_STATE_IS_SET, "Test10");
+
+       /* Test11 */
+       T_LOG("********Test11 clean:  assertion[3 -> 0] inactive priority still prevails");
+       relinquish_assertion_priority(mypid, 0xBEEF);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test11");
+
+       /* Test12 */
+       T_LOG("********Test12 dirty goes clean: both assertion[0] and clean");
+       proc_will_set_clean(mypid);
+       (void)check_properties(mypid, JETSAM_PRIORITY_AUDIO_AND_ACCESSORY, inactive_limit_mb, 0xBEEF, ASSERTION_STATE_IS_RELINQUISHED, "Test12");
+}
+
+T_DECL(assertion_test_bad_flags, "verify bad flag returns an error", T_META_TIMEOUT(30), T_META_ASROOT(true)) {
+       int err;
+       uint32_t flag = 0;
+
+       memorystatus_priority_properties_t mjp = { 0 };
+
+       mjp.priority = JETSAM_PRIORITY_FOREGROUND;
+       mjp.user_data = 0;
+
+       /*
+        * init a bad flag
+        */
+
+       flag = 0xf;
+
+       err = memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, getpid(), flag, &mjp, sizeof(mjp));
+
+       T_QUIET;
+       T_ASSERT_POSIX_FAILURE(err, EINVAL, "MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES should fail with bad flags (err=%d)", err);
+}
+
+T_DECL(assertion_test_repetitive_non_dirty_tracking, "Scenario #1 - repetitive assertion priority on non-dirty-tracking process", T_META_TIMEOUT(60), T_META_ASROOT(true)) {
+       /*
+        * Verify back-to-back assertion calls set assertion state as expected.
+        * false --> non-dirty-tracking process (like a typical app)
+        * false --> clean/dirty does not apply here
+        */
+
+       memorystatus_assertion_test_repetitive("Scenario #1", false, false);
+}
+
+T_DECL(assertion_test_repetitive_dirty_tracking_clean, "Scenario #2 - repetitive assertion priority on clean dirty-tracking process", T_META_TIMEOUT(60), T_META_ASROOT(true)) {
+       /*
+        * Verify back-to-back assertion calls set assertion state as expected.
+        * true --> dirty-tracking process (like a typical extension/widget)
+        * true --> start clean / inactive
+        * This will exercise idle-deferred paths.
+        */
+       memorystatus_assertion_test_repetitive("Scenario #2", true, true);
+}
+
+T_DECL(assertion_test_repetitive_dirty_tracking_dirty, "Scenario #3 - repetitive assertion priority on dirty dirty-tracking processes", T_META_TIMEOUT(60), T_META_ASROOT(true)) {
+       /*
+        * Verify back-to-back assertion calls set assertion state as expected.
+        * true --> dirty-tracking process (like a typical extension/widget)
+        * false --> start dirty / active state
+        * This will exercise idle-deferred paths.
+        */
+       memorystatus_assertion_test_repetitive("Scenario #3", true, false);
+}
+
+T_DECL(assertion_test_allow_idle_exit, "set assertion priorities on a process supporting idle exit", T_META_TIMEOUT(360), T_META_ASROOT(true)) {
+       memorystatus_assertion_test_allow_idle_exit();
+}
+
+T_DECL(assertion_test_do_not_allow_idle_exit, "set assertion priorities on a process that does not allow idle exit", T_META_TIMEOUT(360), T_META_ASROOT(true)) {
+       memorystatus_assertion_test_do_not_allow_idle_exit();
+}
index bc376ee5793f4d48d26ee0ae0d19851d604d04f0..b660e5c6a95c4bb825022d3dba2d040431579a19 100644 (file)
@@ -22,29 +22,41 @@ T_GLOBAL_META(
        T_META_CHECK_LEAKS(false)
        );
 
-#define TIMEOUT_SECS                                    1500
+#define TIMEOUT_SECS                                    (10 * 60) /* abort if test takes > 10 minutes */
 
-#if TARGET_OS_EMBEDDED
-#define ALLOCATION_SIZE_VM_REGION                               (16*1024)               /* 16 KB */
-#define ALLOCATION_SIZE_VM_OBJECT                               ALLOCATION_SIZE_VM_REGION
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+#define ALLOCATION_SIZE_VM_REGION                       (16*1024)               /* 16 KB */
+#define ALLOCATION_SIZE_VM_OBJECT                       ALLOCATION_SIZE_VM_REGION
 #else
-#define ALLOCATION_SIZE_VM_REGION                               (1024*1024*100) /* 100 MB */
-#define ALLOCATION_SIZE_VM_OBJECT                               (16*1024)               /* 16 KB */
+#define ALLOCATION_SIZE_VM_REGION                       (1024*1024*100) /* 100 MB */
+#define ALLOCATION_SIZE_VM_OBJECT                       (16*1024)               /* 16 KB */
 #endif
 #define MAX_CHILD_PROCS                                 100
 
+#define NUM_GIVE_BACK                                   5
+#define NUM_GIVE_BACK_PORTS                             20
+
+/* 60% is too high on bridgeOS to achieve without vm-pageshortage jetsams. Set it to 40%. */
+#if TARGET_OS_BRIDGE
+#define ZONEMAP_JETSAM_LIMIT_SYSCTL                     "kern.zone_map_jetsam_limit=40"
+#else
 #define ZONEMAP_JETSAM_LIMIT_SYSCTL                     "kern.zone_map_jetsam_limit=60"
+#endif
 
 #define VME_ZONE_TEST_OPT                               "allocate_vm_regions"
 #define VM_OBJECTS_ZONE_TEST_OPT                        "allocate_vm_objects"
 #define GENERIC_ZONE_TEST_OPT                           "allocate_from_generic_zone"
 
-#define VME_ZONE                                                                "VM map entries"
-#define VMOBJECTS_ZONE                                                  "vm objects"
-#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO    98
+#define VME_ZONE                                        "VM map entries"
+#define VMOBJECTS_ZONE                                  "vm objects"
+#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO            98
+
+#define VM_TAG1                                         100
+#define VM_TAG2                                         101
 
-#define VM_TAG1                                                                 100
-#define VM_TAG2                                                                 101
+#define LARGE_MEM_GB                                    32
+#define LARGE_MEM_JETSAM_LIMIT                          40
+#define JETSAM_LIMIT_LOWEST                             10
 
 enum {
        VME_ZONE_TEST = 0,
@@ -60,8 +72,6 @@ typedef struct test_config_struct {
 } test_config_struct;
 
 static test_config_struct current_test;
-static int num_children = 0;
-static bool test_ending = false;
 static dispatch_source_t ds_signal = NULL;
 static dispatch_source_t ds_timer = NULL;
 static dispatch_queue_t dq_spawn = NULL;
@@ -71,12 +81,13 @@ static mach_zone_info_array_t zone_info_array = NULL;
 static mach_zone_name_t largest_zone_name;
 static mach_zone_info_t largest_zone_info;
 
-static char testpath[PATH_MAX];
+static pthread_mutex_t test_mtx = PTHREAD_MUTEX_INITIALIZER;   /* protects the next 3 things */
+static bool test_ending = false;
+static int num_children = 0;
 static pid_t child_pids[MAX_CHILD_PROCS];
-static pthread_mutex_t test_ending_mtx;
 
-static void allocate_vm_regions(void);
-static void allocate_vm_objects(void);
+static char testpath[PATH_MAX];
+static void allocate_vm_stuff(int);
 static void allocate_from_generic_zone(void);
 static void begin_test_teardown(void);
 static void cleanup_and_end_test(void);
@@ -85,7 +96,7 @@ static void spawn_child_process(void);
 static void run_test(void);
 static bool verify_generic_jetsam_criteria(void);
 static bool vme_zone_compares_to_vm_objects(void);
-static void print_zone_map_size(void);
+static int query_zone_map_size(void);
 static void query_zone_info(void);
 static void print_zone_info(mach_zone_name_t *zn, mach_zone_info_t *zi);
 
@@ -96,56 +107,70 @@ extern kern_return_t mach_zone_info_for_largest_zone(
        mach_zone_info_t *info
        );
 
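+/* Returns true once 'timeout' seconds have elapsed since 'start'. */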
+static bool
+check_time(time_t start, int timeout)
+{
+       return start + timeout < time(NULL);
+}
+
+/*
+ * flag values for allocate_vm_stuff()
+ */
+#define REGIONS 1
+#define OBJECTS 2
+
 static void
-allocate_vm_regions(void)
+allocate_vm_stuff(int flags)
 {
-       uint64_t alloc_size = ALLOCATION_SIZE_VM_REGION, i = 0;
+       uint64_t alloc_size, i;
+       time_t start = time(NULL);
+       mach_vm_address_t give_back[NUM_GIVE_BACK];
+       char *msg;
+
+       if (flags == REGIONS) {
+               alloc_size = ALLOCATION_SIZE_VM_REGION;
+               msg = "";
+       } else {
+               alloc_size = ALLOCATION_SIZE_VM_OBJECT;
+               msg = " each region backed by a VM object";
+       }
+
+       printf("[%d] Allocating VM regions, each of size %lld KB%s\n", getpid(), (alloc_size >> 10), msg);
 
-       printf("[%d] Allocating VM regions, each of size %lld KB\n", getpid(), (alloc_size >> 10));
        for (i = 0;; i++) {
                mach_vm_address_t addr = (mach_vm_address_t)NULL;
 
                /* Alternate VM tags between consecutive regions to prevent coalescing */
-               int flags = VM_MAKE_TAG((i % 2)? VM_TAG1: VM_TAG2) | VM_FLAGS_ANYWHERE;
+               int vmflags = VM_MAKE_TAG((i % 2)? VM_TAG1: VM_TAG2) | VM_FLAGS_ANYWHERE;
 
-               if ((mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)alloc_size, flags)) != KERN_SUCCESS) {
+               if ((mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)alloc_size, vmflags)) != KERN_SUCCESS) {
                        break;
                }
-       }
-       printf("[%d] Number of allocations: %lld\n", getpid(), i);
 
-       /* Signal to the parent that we're done allocating */
-       kill(getppid(), SIGUSR1);
+               /*
+                * If interested in objects, touch the region so the VM object is created,
+                * then free this page. Keeps us from holding a lot of dirty pages.
+                */
+               if (flags == OBJECTS) {
+                       *((int *)addr) = 0;
+                       madvise((void *)addr, (size_t)alloc_size, MADV_FREE);
+               }
 
-       while (1) {
-               sleep(2);
-               /* Exit if parent has exited. Ensures child processes don't linger around after the test exits */
-               if (getppid() == 1) {
+               if (check_time(start, TIMEOUT_SECS)) {
+                       printf("[%d] child timeout during allocations\n", getpid());
                        exit(0);
                }
-       }
-}
-
-static void
-allocate_vm_objects(void)
-{
-       uint64_t alloc_size = ALLOCATION_SIZE_VM_OBJECT, i = 0;
-
-       printf("[%d] Allocating VM regions, each of size %lld KB, each backed by a VM object\n", getpid(), (alloc_size >> 10));
-       for (i = 0;; i++) {
-               mach_vm_address_t addr = (mach_vm_address_t)NULL;
 
-               /* Alternate VM tags between consecutive regions to prevent coalescing */
-               int flags = VM_MAKE_TAG((i % 2)? VM_TAG1: VM_TAG2) | VM_FLAGS_ANYWHERE;
-
-               if ((mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)alloc_size, flags)) != KERN_SUCCESS) {
-                       break;
+               if (i < NUM_GIVE_BACK) {
+                       give_back[i] = addr;
                }
-               /* Touch the region so the VM object can actually be created */
-               *((int *)addr) = 0;
-               /* OK to free this page. Keeps us from holding a lot of dirty pages */
-               madvise((void *)addr, (size_t)alloc_size, MADV_FREE);
        }
+
+       /* return some of the resource to avoid O-O-M problems */
+       for (uint64_t j = 0; j < NUM_GIVE_BACK && j < i; ++j) {
+               mach_vm_deallocate(mach_task_self(), give_back[j], (mach_vm_size_t)alloc_size);
+       }
+
        printf("[%d] Number of allocations: %lld\n", getpid(), i);
 
        /* Signal to the parent that we're done allocating */
@@ -157,13 +182,21 @@ allocate_vm_objects(void)
                if (getppid() == 1) {
                        exit(0);
                }
+
+               if (check_time(start, TIMEOUT_SECS)) {
+                       printf("[%d] child timeout while waiting\n", getpid());
+                       exit(0);
+               }
        }
 }
 
+
 static void
 allocate_from_generic_zone(void)
 {
        uint64_t i = 0;
+       time_t start = time(NULL);
+       mach_port_t give_back[NUM_GIVE_BACK_PORTS];
 
        printf("[%d] Allocating mach_ports\n", getpid());
        for (i = 0;; i++) {
@@ -172,6 +205,20 @@ allocate_from_generic_zone(void)
                if ((mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port)) != KERN_SUCCESS) {
                        break;
                }
+
+               if (check_time(start, TIMEOUT_SECS)) {
+                       printf("[%d] child timeout during allocations\n", getpid());
+                       exit(0);
+               }
+
+               if (i < NUM_GIVE_BACK_PORTS) {
+                       give_back[i] = port;
+               }
+       }
+
+       /* return some of the resource to avoid O-O-M problems */
+       for (uint64_t j = 0; j < NUM_GIVE_BACK_PORTS && j < i; ++j) {
+               mach_port_mod_refs(mach_task_self(), give_back[j], MACH_PORT_RIGHT_RECEIVE, -1);
        }
        printf("[%d] Number of allocations: %lld\n", getpid(), i);
 
@@ -184,6 +231,11 @@ allocate_from_generic_zone(void)
                if (getppid() == 1) {
                        exit(0);
                }
+
+               if (check_time(start, TIMEOUT_SECS)) {
+                       printf("[%d] child timeout while waiting\n", getpid());
+                       exit(0);
+               }
        }
 }
 
@@ -194,6 +246,8 @@ print_zone_info(mach_zone_name_t *zn, mach_zone_info_t *zi)
            zn->mzn_name, zi->mzi_cur_size, zi->mzi_count);
 }
 
+static time_t main_start;
+
 static void
 query_zone_info(void)
 {
@@ -201,6 +255,9 @@ query_zone_info(void)
        kern_return_t kr;
        static uint64_t num_calls = 0;
 
+       if (check_time(main_start, TIMEOUT_SECS)) {
+               T_ASSERT_FAIL("Global timeout expired");
+       }
        for (i = 0; i < current_test.num_zones; i++) {
                kr = mach_zone_info_for_zone(mach_host_self(), current_test.zone_names[i], &(zone_info_array[i]));
                T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_zone_info_for_zone(%s) returned %d [%s]", current_test.zone_names[i].mzn_name, kr, mach_error_string(kr));
@@ -209,7 +266,7 @@ query_zone_info(void)
        T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_zone_info_for_largest_zone returned %d [%s]", kr, mach_error_string(kr));
 
        num_calls++;
-       if (num_calls % 10 != 0) {
+       if (num_calls % 5 != 0) {
                return;
        }
 
@@ -264,6 +321,19 @@ verify_generic_jetsam_criteria(void)
 static void
 begin_test_teardown(void)
 {
+       int ret, old_limit = 95;
+
+       /*
+        * Restore kern.zone_map_jetsam_limit to the default high value, to prevent further jetsams.
+        * We should change the value of old_limit if ZONE_MAP_JETSAM_LIMIT_DEFAULT changes in the kernel.
+        * We don't have a way to capture what the original value was before the test, because the
+        * T_META_SYSCTL_INT macro will have changed the value before the test starts running.
+        */
+       ret = sysctlbyname("kern.zone_map_jetsam_limit", NULL, NULL, &old_limit, sizeof(old_limit));
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.zone_map_jetsam_limit failed");
+       T_LOG("kern.zone_map_jetsam_limit set to %d%%", old_limit);
+
        /* End ktrace session */
        if (session != NULL) {
                T_LOG("Ending ktrace session...");
@@ -299,13 +369,13 @@ cleanup_and_end_test(void)
         * The atend handler executes on a different dispatch queue.
         * We want to do the cleanup only once.
         */
-       pthread_mutex_lock(&test_ending_mtx);
+       pthread_mutex_lock(&test_mtx);
        if (test_ending) {
-               pthread_mutex_unlock(&test_ending_mtx);
+               pthread_mutex_unlock(&test_mtx);
                return;
        }
-       test_ending = true;
-       pthread_mutex_unlock(&test_ending_mtx);
+       test_ending = TRUE;
+       pthread_mutex_unlock(&test_mtx);
 
        dispatch_async(dq_spawn, ^{
                /*
@@ -325,23 +395,25 @@ cleanup_and_end_test(void)
                }
        });
 
+       pthread_mutex_lock(&test_mtx);
        T_LOG("Number of processes spawned: %d", num_children);
        T_LOG("Killing child processes...");
 
        /* Kill all the child processes that were spawned */
        for (i = 0; i < num_children; i++) {
-               kill(child_pids[i], SIGKILL);
+               pid_t pid = child_pids[i];
+               int status = 0;
+
                /*
-                * Sleep between kills to avoid hogging the VM map entries zone lock (on the task_terminate path).
+                * Kill and wait for each child to exit.
                 * Without this we were seeing hw_lock_bit timeouts in BATS.
                 */
-               sleep(1);
-       }
-       for (i = 0; i < num_children; i++) {
-               int status = 0;
-               if (waitpid(child_pids[i], &status, 0) < 0) {
+               kill(pid, SIGKILL);
+               pthread_mutex_unlock(&test_mtx);
+               if (waitpid(pid, &status, 0) < 0) {
                        T_LOG("waitpid returned status %d", status);
                }
+               pthread_mutex_lock(&test_mtx);
        }
        sleep(1);
 
@@ -382,11 +454,20 @@ setup_ktrace_session(void)
        });
 
        /* Listen for memorystatus_do_kill trace events */
-       ret = ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END, ^(ktrace_event_t event) {
+       ret = ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)), ^(ktrace_event_t event) {
                int i;
                bool received_jetsam_event = false;
 
-               /* We don't care about jetsams for any other reason except zone-map-exhaustion */
+               /*
+                * libktrace does not support DBG_FUNC_START/END in the event filter. It simply ignores it.
+                * So we need to explicitly check for the start of the jetsam kill event here,
+                * instead of passing in ((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START).
+                */
+               if (!(event->debugid & DBG_FUNC_START)) {
+                       return;
+               }
+
+               /* Check for zone-map-exhaustion jetsam. */
                if (event->arg2 == kMemorystatusKilledZoneMapExhaustion) {
                        begin_test_teardown();
                        T_LOG("[memorystatus_do_kill] jetsam reason: zone-map-exhaustion, pid: %d\n\n", (int)event->arg1);
@@ -400,6 +481,7 @@ setup_ktrace_session(void)
                                 * The test simulates this scenario, we should see a targeted jetsam for the
                                 * vm objects zone too.
                                 */
+                               pthread_mutex_lock(&test_mtx);
                                for (i = 0; i < num_children; i++) {
                                        if (child_pids[i] == (pid_t)event->arg1) {
                                                received_jetsam_event = true;
@@ -407,6 +489,7 @@ setup_ktrace_session(void)
                                                break;
                                        }
                                }
+                               pthread_mutex_unlock(&test_mtx);
                                /*
                                 * If we didn't see a targeted jetsam, verify that the largest zone actually
                                 * fulfilled the criteria for generic jetsams.
@@ -421,6 +504,27 @@ setup_ktrace_session(void)
                        }
 
                        T_QUIET; T_ASSERT_TRUE(received_jetsam_event, "Jetsam event not as expected");
+               } else {
+                       /*
+                        * The test relies on the children being able to send a signal to the parent, to continue spawning new processes
+                        * that leak more zone memory. If a child is jetsammed for some other reason, the parent can get stuck waiting for
+                        * a signal from the child, never being able to make progress (we spawn only a single process at a time to rate-limit
+                        * the zone memory bloat). If this happens, the test eventually times out. So if a child is jetsammed for some
+                        * reason other than zone-map-exhaustion, end the test early.
+                        *
+                        * This typically happens when we end up triggering vm-pageshortage jetsams before zone-map-exhaustion jetsams.
+                        * Lowering the zone_map_jetsam_limit if the zone map size was initially low should help with this too.
+                        * See sysctlbyname("kern.zone_map_jetsam_limit"...) in run_test() below.
+                        */
+                       pthread_mutex_lock(&test_mtx);
+                       for (i = 0; i < num_children; i++) {
+                               if (child_pids[i] == (pid_t)event->arg1) {
+                                       begin_test_teardown();
+                                       T_PASS("Child pid %d was jetsammed due to reason %d. Terminating early.",
+                                           (int)event->arg1, (int)event->arg2);
+                               }
+                       }
+                       pthread_mutex_unlock(&test_mtx);
                }
        });
        T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_events_single");
@@ -429,8 +533,8 @@ setup_ktrace_session(void)
        T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_start");
 }
 
-static void
-print_zone_map_size(void)
+static int
+query_zone_map_size(void)
 {
        int ret;
        uint64_t zstats[2];
@@ -440,6 +544,16 @@ print_zone_map_size(void)
        T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.zone_map_size_and_capacity failed");
 
        T_LOG("Zone map capacity: %-30lldZone map size: %lld [%lld%% full]", zstats[1], zstats[0], (zstats[0] * 100) / zstats[1]);
+
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+       int memstat_level;
+       size_t memstat_level_size = sizeof(memstat_level);
+       ret = sysctlbyname("kern.memorystatus_level", &memstat_level, &memstat_level_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_level failed");
+
+       T_LOG("kern.memorystatus_level = %d%%", memstat_level);
+#endif
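+       /* Report zone map occupancy as a percentage of its capacity. */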
+       return (int)(zstats[0] * 100 / zstats[1]);
 }
 
 static void
@@ -449,22 +563,30 @@ spawn_child_process(void)
        char helper_func[50];
        char *launch_tool_args[4];
 
-       T_QUIET; T_ASSERT_LT(num_children, MAX_CHILD_PROCS, "Spawned %d children. Timing out...", MAX_CHILD_PROCS);
+       pthread_mutex_lock(&test_mtx);
+       if (!test_ending) {
+               if (num_children == MAX_CHILD_PROCS) {
+                       pthread_mutex_unlock(&test_mtx);
+                       T_ASSERT_FAIL("Spawned too many children. Aborting test");
+                       /* not reached */
+               }
 
-       strlcpy(helper_func, current_test.helper_func, sizeof(helper_func));
-       launch_tool_args[0] = testpath;
-       launch_tool_args[1] = "-n";
-       launch_tool_args[2] = helper_func;
-       launch_tool_args[3] = NULL;
+               strlcpy(helper_func, current_test.helper_func, sizeof(helper_func));
+               launch_tool_args[0] = testpath;
+               launch_tool_args[1] = "-n";
+               launch_tool_args[2] = helper_func;
+               launch_tool_args[3] = NULL;
 
-       /* Spawn the child process */
-       int rc = dt_launch_tool(&pid, launch_tool_args, false, NULL, NULL);
-       if (rc != 0) {
-               T_LOG("dt_launch tool returned %d with error code %d", rc, errno);
-       }
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool");
+               /* Spawn the child process */
+               int rc = dt_launch_tool(&pid, launch_tool_args, false, NULL, NULL);
+               if (rc != 0) {
+                       T_LOG("dt_launch tool returned %d with error code %d", rc, errno);
+               }
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool");
 
-       child_pids[num_children++] = pid;
+               child_pids[num_children++] = pid;
+       }
+       pthread_mutex_unlock(&test_mtx);
 }
 
 static void
@@ -472,12 +594,13 @@ run_test(void)
 {
        uint64_t mem;
        uint32_t testpath_buf_size, pages;
-       int ret, dev, pgsz;
+       int ret, dev, pgsz, initial_zone_occupancy, old_limit, new_limit = 0;
        size_t sysctl_size;
 
        T_ATEND(cleanup_and_end_test);
        T_SETUPBEGIN;
 
+       main_start = time(NULL);
        dev = 0;
        sysctl_size = sizeof(dev);
        ret = sysctlbyname("kern.development", &dev, &sysctl_size, NULL, 0);
@@ -506,9 +629,41 @@ run_test(void)
        T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl vm.pages failed");
        T_LOG("vm.pages: %d", pages);
 
-       zone_info_array = (mach_zone_info_array_t) calloc((unsigned long)current_test.num_zones, sizeof *zone_info_array);
+       sysctl_size = sizeof(old_limit);
+       ret = sysctlbyname("kern.zone_map_jetsam_limit", &old_limit, &sysctl_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.zone_map_jetsam_limit failed");
+       T_LOG("kern.zone_map_jetsam_limit: %d", old_limit);
 
-       print_zone_map_size();
+       initial_zone_occupancy = query_zone_map_size();
+
+       /* On large memory systems, set the zone_map jetsam limit lower so we can hit it without timing out. */
+       if (mem > (uint64_t)LARGE_MEM_GB * 1024 * 1024 * 1024) {
+               new_limit = LARGE_MEM_JETSAM_LIMIT;
+       }
+
+       /*
+        * If we start out with the zone map < 5% full, aim for 10% as the limit, so we don't time out.
+        * For anything else aim for 2x the initial size, capped by whatever value was set by T_META_SYSCTL_INT,
+        * or LARGE_MEM_JETSAM_LIMIT for large memory systems.
+        */
+       if (initial_zone_occupancy < 5) {
+               new_limit = JETSAM_LIMIT_LOWEST;
+       } else {
+               new_limit = initial_zone_occupancy * 2;
+       }
+
+       if (new_limit > 0 && new_limit < old_limit) {
+               /*
+                * We should be fine messing with the zone_map_jetsam_limit here, i.e. outside of T_META_SYSCTL_INT.
+                * When the test ends, T_META_SYSCTL_INT will restore the zone_map_jetsam_limit to what it was
+                * before the test anyway.
+                */
+               ret = sysctlbyname("kern.zone_map_jetsam_limit", NULL, NULL, &new_limit, sizeof(new_limit));
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.zone_map_jetsam_limit failed");
+               T_LOG("kern.zone_map_jetsam_limit set to %d%%", new_limit);
+       }
+
+       zone_info_array = (mach_zone_info_array_t) calloc((unsigned long)current_test.num_zones, sizeof *zone_info_array);
 
        /*
         * If the timeout specified by T_META_TIMEOUT is hit, the atend handler does not get called.
@@ -529,7 +684,7 @@ run_test(void)
        T_QUIET; T_ASSERT_NOTNULL(ds_signal, "dispatch_source_create: signal");
 
        dispatch_source_set_event_handler(ds_signal, ^{
-               print_zone_map_size();
+               (void)query_zone_map_size();
 
                /* Wait a few seconds before spawning another child. Keeps us from allocating too aggressively */
                sleep(5);
@@ -537,7 +692,7 @@ run_test(void)
        });
        dispatch_activate(ds_signal);
 
-       /* Timer to query jetsam-relevant zone info every second. Print it every 10 seconds. */
+       /* Timer to query jetsam-relevant zone info every second. Print it every 5 seconds. */
        ds_timer = dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, 0, 0, dispatch_queue_create("timer_queue", NULL));
        T_QUIET; T_ASSERT_NOTNULL(ds_timer, "dispatch_source_create: timer");
        dispatch_source_set_timer(ds_timer, dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC), NSEC_PER_SEC, 0);
@@ -582,13 +737,14 @@ move_to_idle_band(void)
 
 T_HELPER_DECL(allocate_vm_regions, "allocates VM regions")
 {
-       allocate_vm_regions();
+       move_to_idle_band();
+       allocate_vm_stuff(REGIONS);
 }
 
 T_HELPER_DECL(allocate_vm_objects, "allocates VM objects and VM regions")
 {
        move_to_idle_band();
-       allocate_vm_objects();
+       allocate_vm_stuff(OBJECTS);
 }
 
 T_HELPER_DECL(allocate_from_generic_zone, "allocates from a generic zone")
index a669863631ec6cf4950d6fb71da0b2f15251de73..4210e0509dd61dec51db53ff97789cfd6de9d6e5 100644 (file)
@@ -7,6 +7,8 @@
 
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 T_DECL(mktimer_kobject, "mktimer_kobject()", T_META_ALL_VALID_ARCHS(true))
 {
        mach_port_t timer_port = MACH_PORT_NULL;
diff --git a/tests/mo_immovable_receive.c b/tests/mo_immovable_receive.c
new file mode 100644 (file)
index 0000000..14b4f0e
--- /dev/null
@@ -0,0 +1,227 @@
+#include <darwintest.h>
+#include <servers/bootstrap.h>
+#include <mach/mach.h>
+#include <mach/message.h>
+#include <stdlib.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+typedef struct {
+       mach_msg_header_t   header;
+       mach_msg_body_t     body;
+       mach_msg_guarded_port_descriptor_t guarded_port_descriptor1;
+       mach_msg_guarded_port_descriptor_t guarded_port_descriptor2;
+       mach_msg_trailer_t  trailer;            // subtract this when sending
+} ipc_complex_message;
+
+static ipc_complex_message icm_request = {};
+
+struct args {
+       const char *progname;
+       int verbose;
+       int voucher;
+       int num_msgs;
+       const char *server_port_name;
+       mach_port_t server_port;
+       mach_port_t reply_port;
+       mach_port_t voucher_port;
+       int request_msg_size;
+       void *request_msg;
+       int reply_msg_size;
+       void *reply_msg;
+       mach_port_t sp_voucher_port;
+       uint32_t persona_id;
+       long client_pid;
+};
+
+void parse_args(struct args *args);
+void* create_buffer(int *buffer_size);
+void client(struct args *args);
+void server_setup(struct args* args);
+void server(struct args *args);
+
+void
+parse_args(struct args *args)
+{
+       args->verbose = 0;
+       args->voucher = 0;
+       args->server_port_name = "TEST";
+       args->server_port = MACH_PORT_NULL;
+       args->reply_port = MACH_PORT_NULL;
+       args->voucher_port = MACH_PORT_NULL;
+       args->num_msgs = 1;
+       args->request_msg_size = sizeof(ipc_complex_message);
+       args->request_msg = &icm_request;
+       args->client_pid = getpid();
+}
+
+/* Create a mach IPC listener which will respond to the client's message */
+void
+server_setup(struct args* args)
+{
+       kern_return_t ret;
+       mach_port_t bsport;
+
+       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE,
+           &args->server_port);
+       T_ASSERT_MACH_SUCCESS(ret, "server: mach_port_allocate()");
+
+       ret = mach_port_insert_right(mach_task_self(), args->server_port, args->server_port,
+           MACH_MSG_TYPE_MAKE_SEND);
+       T_ASSERT_MACH_SUCCESS(ret, "server: mach_port_insert_right()");
+
+       ret = task_get_bootstrap_port(mach_task_self(), &bsport);
+       T_ASSERT_MACH_SUCCESS(ret, "server: task_get_bootstrap_port()");
+
+       ret = bootstrap_register(bsport, args->server_port_name, args->server_port);
+       T_ASSERT_MACH_SUCCESS(ret, "server: bootstrap_register()");
+
+       T_LOG("server: waiting for IPC messages from client on port '%s'.\n",
+           args->server_port_name);
+}
+
+/*
+ * Server process loop: listens for a single message and validates
+ * the guarded port descriptors it carries.
+ */
+void
+server(struct args *args)
+{
+       mach_msg_header_t *request;
+       mach_msg_option_t rcvoption;
+       kern_return_t ret;
+
+       request = (mach_msg_header_t *)args->request_msg;
+
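+       /* MACH_RCV_GUARDED_DESC: deliver incoming port rights as guarded port descriptors. */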
+       rcvoption = MACH_RCV_MSG | MACH_RCV_INTERRUPT | MACH_RCV_GUARDED_DESC;
+
+       T_LOG("server: Awaiting message\n");
+       ret = mach_msg(request,
+           rcvoption,
+           0,
+           sizeof(ipc_complex_message),
+           args->server_port,
+           MACH_MSG_TIMEOUT_NONE,
+           MACH_PORT_NULL);
+
+       T_ASSERT_MACH_SUCCESS(ret, "server: mach_msg receive");
+
+       ipc_complex_message *request_complexmsg = (ipc_complex_message *)request;
+       T_ASSERT_NE(request_complexmsg->guarded_port_descriptor1.name, 0, "server: Should not receive mach_port_null; name = %x", request_complexmsg->guarded_port_descriptor1.name);
+       T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor1.type, MACH_MSG_GUARDED_PORT_DESCRIPTOR, "server: Received a guarded port descriptor");
+       T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor1.disposition, MACH_MSG_TYPE_PORT_RECEIVE, "server: Received a receive right");
+       T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor1.context, (unsigned long)request, "server: Received a port with correct context = %p", request);
+       T_LOG("Guard flags = %d", request_complexmsg->guarded_port_descriptor1.flags);
+
+       T_ASSERT_NE(request_complexmsg->guarded_port_descriptor2.name, 0, "server: Should not receive mach_port_null; name = %x", request_complexmsg->guarded_port_descriptor2.name);
+       T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor2.type, MACH_MSG_GUARDED_PORT_DESCRIPTOR, "server: Received a guarded port descriptor");
+       T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor2.disposition, MACH_MSG_TYPE_PORT_RECEIVE, "server: Received a receive right");
+       T_ASSERT_EQ(request_complexmsg->guarded_port_descriptor2.context, (unsigned long)request, "server: Received a port with correct context = %p", request);
+
+       mach_port_status_t status;
+       mach_msg_type_number_t status_size = MACH_PORT_RECEIVE_STATUS_COUNT;
+
+       kern_return_t kr = mach_port_get_attributes(mach_task_self(), request_complexmsg->guarded_port_descriptor1.name,
+           MACH_PORT_RECEIVE_STATUS, (mach_port_info_t)&status, &status_size);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_get_attributes for descriptor 1");
+       T_LOG("Status flags %d", status.mps_flags);
+       T_ASSERT_NE(0, (status.mps_flags & MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE), "Imm rcv bit is set for descriptor1");
+
+       kr = mach_port_get_attributes(mach_task_self(), request_complexmsg->guarded_port_descriptor2.name,
+           MACH_PORT_RECEIVE_STATUS, (mach_port_info_t)&status, &status_size);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_get_attributes for descriptor 2");
+       T_LOG("Status flags %d", status.mps_flags);
+       T_ASSERT_NE(0, (status.mps_flags & MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE), "Imm rcv bit is set for descriptor2");
+
+       mach_msg_destroy(request);
+}
+
+void
+client(struct args *args)
+{
+       //Find the bootstrap port
+       mach_port_t bsport;
+       mach_port_t guarded_port;
+       mach_port_t unguarded_port;
+
+       kern_return_t ret = task_get_bootstrap_port(mach_task_self(), &bsport);
+       T_ASSERT_MACH_SUCCESS(ret, "client: task_get_bootstrap_port()");
+
+       //Look up the service port
+       ret = bootstrap_look_up(bsport, (char *)args->server_port_name,
+           &args->server_port);
+       T_ASSERT_MACH_SUCCESS(ret, "client: bootstrap_look_up()");
+
+       //Create the unguarded port
+       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE,
+           &unguarded_port);
+       T_ASSERT_MACH_SUCCESS(ret, "client: mach_port_allocate() reply port");
+
+       mach_port_options_t opts = {
+               .flags = MPO_CONTEXT_AS_GUARD
+       };
+
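+       /* Guard the new receive right with context 0x10; the guarded descriptor sent below must carry the same context. */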
+       ret = mach_port_construct(mach_task_self(), &opts, 0x10, &guarded_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "mach_port_construct");
+
+       //Construct the message
+       mach_msg_header_t *request = (mach_msg_header_t *)args->request_msg;
+       request->msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND_ONCE,
+           0, 0) | MACH_MSGH_BITS_COMPLEX;
+       request->msgh_size = (mach_msg_size_t)args->request_msg_size;
+       request->msgh_remote_port = args->server_port;
+       request->msgh_local_port = args->reply_port;
+       request->msgh_id = 1;
+
+       ipc_complex_message *complexmsg = (ipc_complex_message *)request;
+       complexmsg->body.msgh_descriptor_count = 2;
+       complexmsg->guarded_port_descriptor1.name = guarded_port;
+       complexmsg->guarded_port_descriptor1.disposition = MACH_MSG_TYPE_MOVE_RECEIVE;
+       complexmsg->guarded_port_descriptor1.flags = MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE;
+       complexmsg->guarded_port_descriptor1.context = 0x10;
+       complexmsg->guarded_port_descriptor1.type = MACH_MSG_GUARDED_PORT_DESCRIPTOR;
+
+       complexmsg->guarded_port_descriptor2.name = unguarded_port;
+       complexmsg->guarded_port_descriptor2.disposition = MACH_MSG_TYPE_MOVE_RECEIVE;
+       complexmsg->guarded_port_descriptor2.flags = MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE | MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND;
+       complexmsg->guarded_port_descriptor2.context = 0;
+       complexmsg->guarded_port_descriptor2.type = MACH_MSG_GUARDED_PORT_DESCRIPTOR;
+
+       mach_msg_option_t option = MACH_SEND_MSG;
+
+       //Listen for the reply on the reply port
+       T_LOG("client: Sending request\n");
+       ret = mach_msg(request,
+           option,
+           (mach_msg_size_t)args->request_msg_size,
+           0,
+           MACH_PORT_NULL,
+           MACH_MSG_TIMEOUT_NONE,
+           MACH_PORT_NULL);
+       T_ASSERT_MACH_SUCCESS(ret, "client: mach_msg_overwrite()");
+}
+
+T_DECL(mo_immovable_receive, "Send a message containing a guard port descriptor for an immovable receive right")
+{
+       struct args args = {};
+       parse_args(&args);
+       args.request_msg_size -= sizeof(mach_msg_trailer_t);
+       args.reply_msg_size -= sizeof(mach_msg_trailer_t);
+
+       //Create the server
+       pid_t pid = fork();
+       if (pid == 0) {
+               T_LOG("Server is up");
+               server_setup(&args);
+               server(&args);
+               exit(0);
+       }
+
+       sleep(2);
+       T_LOG("Preparing client to send a request");
+       client(&args);
+       T_ASSERT_POSIX_SUCCESS(waitpid(pid, NULL, 0), "waitpid()");
+}
diff --git a/tests/mpsc.c b/tests/mpsc.c
new file mode 100644 (file)
index 0000000..08ce256
--- /dev/null
@@ -0,0 +1,26 @@
+/*
+ * mpsc: test the MPSC interface
+ */
+
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <sys/sysctl.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.mpsc"),
+    T_META_RUN_CONCURRENTLY(true));
+
+T_DECL(pingpong, "mpsc_pingpong")
+{
+       uint64_t count = 100 * 1000, nsecs = 0;
+       size_t nlen = sizeof(nsecs);
+       int error;
+
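+       /* Ask the kernel to run 'count' MPSC ping-pong round trips; total elapsed time comes back in 'nsecs'. */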
+       error = sysctlbyname("kern.mpsc_test_pingpong", &nsecs, &nlen,
+           &count, sizeof(count));
+       T_ASSERT_POSIX_SUCCESS(error, "sysctlbyname");
+       T_LOG("%lld asyncs in %lld ns (%g us/async)", count, nsecs,
+           (nsecs / 1e3) / count);
+}
index c644f2ad853eaa8181de817dc322cb406fd43f69..89b8fc995be1f45fde4a12c04e91dbad8060bb50 100644 (file)
@@ -9,7 +9,8 @@
 #include <darwintest.h>
 #include <darwintest_utils.h>
 
-T_GLOBAL_META(T_META_NAMESPACE("xnu.net"));
+T_GLOBAL_META(T_META_NAMESPACE("xnu.net"),
+    T_META_RUN_CONCURRENTLY(true));
 
 T_DECL(PR_35136664_utun,
     "This bind a utun and close it without connecting")
index d4b2477f57511080251134ddfc94ef8e9d734284..8965080c5a18a0d4f63a9e3e62df906ae5305464 100644 (file)
@@ -1,3 +1,5 @@
+/* -*- compile-command: "xcrun --sdk iphoneos.internal make net_tuntests" -*- */
+
 #include <inttypes.h>
 #include <stdbool.h>
 #include <stdio.h>
@@ -6,10 +8,11 @@
 #include <string.h>
 #include <unistd.h>
 #include <poll.h>
+#include <sys/types.h>
 #include <sys/event.h>
+#include <sys/time.h>
 #include <uuid/uuid.h>
 #include <arpa/inet.h>
-#include <sys/types.h>
 #include <sys/sysctl.h>
 #include <sys/kern_control.h>
 #include <sys/ioctl.h>
 
 T_GLOBAL_META(T_META_NAMESPACE("xnu.net.tun"));
 
+/* Disable all these tests until <rdar://problem/49124468> is fixed */
+T_GLOBAL_META(T_META_ENABLED(false));
+
+#if 0
+#undef T_QUIET
+#define T_QUIET
+#endif
+
 #if 0
 static void
 log_hexdump(const void *inp, size_t len)
@@ -51,17 +62,22 @@ log_hexdump(const void *inp, size_t len)
                T_LOG("%s", buf);
        }
 }
+#else
+static void
+log_hexdump(const void *inp, size_t len)
+{
+#pragma unused(inp, len)
+}
 #endif
 
-static uint64_t
-get_skywalk_features(void)
+static bool
+is_netagent_enabled(void)
 {
-       uint64_t features = 0;
-       size_t len = sizeof(features);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(sysctlbyname("kern.skywalk.features", &features, &len, NULL, 0), NULL);
-       T_QUIET; T_ASSERT_EQ(len, sizeof(features), NULL);
-       T_QUIET; T_ASSERT_TRUE(features & SK_FEATURE_SKYWALK, NULL);
-       return features;
+       int enabled = 0;
+       size_t len = sizeof(enabled);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(sysctlbyname("net.link.generic.system.enable_netagent", &enabled, &len, NULL, 0), NULL);
+       T_QUIET; T_ASSERT_EQ(len, sizeof(enabled), NULL);
+       return enabled == 1;
 }
 
 static bool g_is_ipsec_test;
@@ -73,6 +89,10 @@ static int g_OPT_GET_CHANNEL_UUID = -1;
 static int g_OPT_IFNAME = -1;
 static char *g_CONTROL_NAME = NULL;
 
+static int create_tunsock_old(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]);
+static int create_tunsock_new(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]);
+static int (*create_tunsock)(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]);
+
 static void
 setup_ipsec_test(void)
 {
@@ -83,6 +103,7 @@ setup_ipsec_test(void)
        g_OPT_GET_CHANNEL_UUID = IPSEC_OPT_GET_CHANNEL_UUID;
        g_OPT_IFNAME = IPSEC_OPT_IFNAME;
        g_CONTROL_NAME = IPSEC_CONTROL_NAME;
+       create_tunsock = create_tunsock_new;
        g_is_ipsec_test = true;
 }
 
@@ -96,21 +117,74 @@ setup_utun_test(void)
        g_OPT_GET_CHANNEL_UUID = UTUN_OPT_GET_CHANNEL_UUID;
        g_OPT_IFNAME = UTUN_OPT_IFNAME;
        g_CONTROL_NAME = UTUN_CONTROL_NAME;
+       create_tunsock = create_tunsock_old;
        g_is_utun_test = true;
 }
 
+static bool
+setblocking(int s, bool blocking)
+{
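+       /* Toggle O_NONBLOCK; the return value is the descriptor's previous blocking state so callers can restore it. */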
+       int flags;
+       bool ret;
+
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(flags = fcntl(s, F_GETFL, 0), NULL);
+
+       ret = !(flags & O_NONBLOCK);
+
+       if (blocking) {
+               flags &= ~O_NONBLOCK;
+       } else {
+               flags |= O_NONBLOCK;
+       }
+
+#if 0
+       T_LOG("Setting fd %d from %s to %s\n",
+           s, ret ? "blocking" : "nonblocking",
+           blocking ? "blocking" : "nonblocking");
+#endif
+
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(flags = fcntl(s, F_SETFL, flags), NULL);
+
+       return ret;
+}
+
+
 static void
-check_enables(int tunsock, int enable_netif, int enable_flowswitch, int enable_channel, uuid_t uuid)
+check_enables(int tunsock, int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[])
 {
        int scratch;
        socklen_t scratchlen, uuidlen;
-       uuid_t scratchuuid;
+       uuid_t scratchuuid[channel_count];
        if (!uuid) {
                uuid = scratchuuid;
        }
 
        //T_LOG("checking tunsock %d", tunsock);
 
+       if (g_is_ipsec_test && channel_count && !enable_netif) {
+               /* Unfortunately, connect incorrectly unwinds the bind if it gets an error.
+                * Until that is fixed, expect EINVAL here.
+                */
+               scratchlen = sizeof(scratch);
+               T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
+                   &scratch, &scratchlen), EINVAL, NULL);
+               T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                   &scratch, &scratchlen), EINVAL, NULL);
+               T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
+                   &scratch, &scratchlen), EINVAL, NULL);
+               for (int i = 0; i < channel_count; i++) {
+                       uuid_clear(uuid[i]);
+               }
+               uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
+               T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                   uuid, &uuidlen), EINVAL, NULL);
+               for (int i = 0; i < channel_count; i++) {
+                       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL);
+               }
+               return;
+       }
+
+
        scratchlen = sizeof(scratch);
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
            &scratch, &scratchlen), NULL);
@@ -121,7 +195,7 @@ check_enables(int tunsock, int enable_netif, int enable_flowswitch, int enable_c
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
            &scratch, &scratchlen), NULL);
        T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)scratchlen, sizeof(scratch), NULL);
-       if (get_skywalk_features() & SK_FEATURE_NETNS) {
+       if (is_netagent_enabled()) {
                if (enable_netif) {
                        T_QUIET; T_EXPECT_EQ(scratch, enable_flowswitch, NULL);
                } else {
@@ -138,23 +212,31 @@ check_enables(int tunsock, int enable_netif, int enable_flowswitch, int enable_c
        if (g_is_ipsec_test && !enable_netif) {
                T_QUIET; T_EXPECT_EQ(scratch, 0, NULL);
        } else {
-               T_QUIET; T_EXPECT_EQ(scratch, enable_channel, NULL);
+               T_QUIET; T_EXPECT_EQ(scratch, (int)channel_count, NULL);
        }
 
        if (scratch) {
-               uuid_clear(uuid);
-               uuidlen = sizeof(uuid_t);
+               for (int i = 0; i < channel_count; i++) {
+                       uuid_clear(uuid[i]);
+               }
+               uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
                T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
                    uuid, &uuidlen), NULL);
-               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL);
-               T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid), NULL);
+               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+               for (int i = 0; i < channel_count; i++) {
+                       T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid[i]), NULL);
+               }
        } else {
-               uuid_clear(uuid);
-               uuidlen = sizeof(uuid_t);
+               for (int i = 0; i < channel_count; i++) {
+                       uuid_clear(uuid[i]);
+               }
+               uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
                T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
                    uuid, &uuidlen), ENXIO, NULL);
-               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL);
-               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
+               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+               for (int i = 0; i < channel_count; i++) {
+                       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL);
+               }
        }
 }
 
@@ -309,20 +391,26 @@ create_sa(const char ifname[IFXNAMSIZ], uint8_t type, uint32_t spi, struct in_ad
        addcmd.dst.saddr.sin_port = htons(0);
        addcmd.dst.saddr.sin_addr = *dst;
 
-       //log_hexdump(&addcmd, sizeof(addcmd));
+       log_hexdump(&addcmd, sizeof(addcmd));
 
        ssize_t slen;
        T_QUIET; T_EXPECT_POSIX_SUCCESS(slen = send(g_pfkeyso, &addcmd, sizeof(addcmd), 0), NULL);
        T_QUIET; T_EXPECT_EQ(slen, (ssize_t)sizeof(addcmd), NULL);
 }
 
+/* This version of the test expects channels to be enabled after connect.
+ * Once the utun driver is converted, switch to create_tunsock_new.
+ */
 static int
-create_tunsock(int enable_netif, int enable_flowswitch, int enable_channel)
+create_tunsock_old(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[])
 {
        int tunsock;
        struct ctl_info kernctl_info;
        struct sockaddr_ctl kernctl_addr;
-       uuid_t uuid;
+       uuid_t scratchuuid[channel_count];
+       if (!uuid) {
+               uuid = scratchuuid;
+       }
        socklen_t uuidlen;
 
 startover:
@@ -340,21 +428,25 @@ startover:
        kernctl_addr.sc_id = kernctl_info.ctl_id;
        kernctl_addr.sc_unit = 0;
 
-       //T_LOG("enable_netif = %d, enable_flowswitch = %d, enable_channel = %d",
-       //enable_netif, enable_channel, enable_flowswitch);
+       T_LOG("%s: enable_netif = %d, enable_flowswitch = %d, channel_count = %d",
+           __func__, enable_netif, enable_flowswitch, channel_count);
 
        T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
            &enable_netif, sizeof(enable_netif)), EINVAL, NULL);
        T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
            &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL);
        T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-           &enable_channel, sizeof(enable_channel)), EINVAL, NULL);
-       uuid_clear(uuid);
-       uuidlen = sizeof(uuid_t);
+           &channel_count, sizeof(channel_count)), EINVAL, NULL);
+       for (int i = 0; i < channel_count; i++) {
+               uuid_clear(uuid[i]);
+       }
+       uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
        T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
            uuid, &uuidlen), EINVAL, NULL);
-       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL);
-       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
+       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+       for (int i = 0; i < channel_count; i++) {
+               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL);
+       }
 
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL);
 
@@ -363,13 +455,17 @@ startover:
        T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
            &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL);
        T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-           &enable_channel, sizeof(enable_channel)), EINVAL, NULL);
-       uuid_clear(uuid);
-       uuidlen = sizeof(uuid_t);
+           &channel_count, sizeof(channel_count)), EINVAL, NULL);
+       for (int i = 0; i < channel_count; i++) {
+               uuid_clear(uuid[i]);
+       }
+       uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
        T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
            uuid, &uuidlen), ENXIO, NULL);
-       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL);
-       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
+       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+       for (int i = 0; i < channel_count; i++) {
+               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL);
+       }
 
        int error = connect(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr));
        if (error == -1 && errno == EBUSY) {
@@ -386,7 +482,7 @@ startover:
        T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
            &enable_netif, sizeof(enable_netif)), EINVAL, NULL);
 
-       if (get_skywalk_features() & SK_FEATURE_NETNS) {
+       if (is_netagent_enabled()) {
                if (enable_netif) {
                        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
                            &enable_flowswitch, sizeof(enable_flowswitch)), NULL);
@@ -399,45 +495,208 @@ startover:
                    &enable_flowswitch, sizeof(enable_flowswitch)), ENOTSUP, NULL);
        }
 
-       if (enable_channel) {
+       if (channel_count) {
                if (g_is_ipsec_test && !enable_netif) {
                        /* ipsec doesn't support channels without a netif */
                        T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-                           &enable_channel, sizeof(enable_channel)), EOPNOTSUPP, NULL);
-                       uuid_clear(uuid);
-                       uuidlen = sizeof(uuid_t);
+                           &channel_count, sizeof(channel_count)), EOPNOTSUPP, NULL);
+                       for (int i = 0; i < channel_count; i++) {
+                               uuid_clear(uuid[i]);
+                       }
+                       uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
                        T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
                            uuid, &uuidlen), ENXIO, NULL);
-                       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL);
-                       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
+                       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+                       for (int i = 0; i < channel_count; i++) {
+                               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL);
+                       }
                } else {
                        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-                           &enable_channel, sizeof(enable_channel)), NULL);
-                       uuid_clear(uuid);
-                       uuidlen = sizeof(uuid_t);
+                           &channel_count, sizeof(channel_count)), NULL);
+                       for (int i = 0; i < channel_count; i++) {
+                               uuid_clear(uuid[i]);
+                       }
+                       uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
                        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
                            uuid, &uuidlen), NULL);
-                       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL);
-                       T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid), NULL);
+                       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+                       for (int i = 0; i < channel_count; i++) {
+                               T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid[i]), NULL);
+                       }
                }
        } else {
                T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-                   &enable_channel, sizeof(enable_channel)), ENXIO, NULL);
-               uuid_clear(uuid);
-               uuidlen = sizeof(uuid_t);
+                   &channel_count, sizeof(channel_count)), ENXIO, NULL);
+               for (int i = 0; i < channel_count; i++) {
+                       uuid_clear(uuid[i]);
+               }
+               uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
                T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
                    uuid, &uuidlen), ENXIO, NULL);
-               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t), NULL);
-               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
+               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+               for (int i = 0; i < channel_count; i++) {
+                       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL);
+               }
+       }
+
+       check_enables(tunsock, enable_netif, enable_flowswitch, channel_count, uuid);
+
+       //T_LOG("Returning tunsock %d", tunsock);
+
+       return tunsock;
+}
+
+/* This version of the test expects channels to be enabled before connect.
+ * Once the utun driver is converted, rename this to just create_tunsock.
+ */
+static int
+create_tunsock_new(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[])
+{
+       int tunsock;
+       struct ctl_info kernctl_info;
+       struct sockaddr_ctl kernctl_addr;
+       uuid_t scratchuuid[channel_count];
+       if (!uuid) {
+               uuid = scratchuuid;
+       }
+       socklen_t uuidlen;
+
+startover:
+
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL), NULL);
+
+       memset(&kernctl_info, 0, sizeof(kernctl_info));
+       strlcpy(kernctl_info.ctl_name, g_CONTROL_NAME, sizeof(kernctl_info.ctl_name));
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(ioctl(tunsock, CTLIOCGINFO, &kernctl_info), NULL);
+
+       memset(&kernctl_addr, 0, sizeof(kernctl_addr));
+       kernctl_addr.sc_len = sizeof(kernctl_addr);
+       kernctl_addr.sc_family = AF_SYSTEM;
+       kernctl_addr.ss_sysaddr = AF_SYS_CONTROL;
+       kernctl_addr.sc_id = kernctl_info.ctl_id;
+       kernctl_addr.sc_unit = 0;
+
+       T_LOG("%s: enable_netif = %d, enable_flowswitch = %d, channel_count = %d",
+           __func__, enable_netif, enable_flowswitch, channel_count);
+
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
+           &enable_netif, sizeof(enable_netif)), EINVAL, NULL);
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+           &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL);
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
+           &channel_count, sizeof(channel_count)), EINVAL, NULL);
+       for (int i = 0; i < channel_count; i++) {
+               uuid_clear(uuid[i]);
+       }
+       uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
+       T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+           uuid, &uuidlen), EINVAL, NULL);
+       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+       for (int i = 0; i < channel_count; i++) {
+               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL);
+       }
+
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL);
+
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
+           &enable_netif, sizeof(enable_netif)), NULL);
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+           &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
+           &channel_count, sizeof(channel_count)), NULL);
+
+       for (int i = 0; i < channel_count; i++) {
+               uuid_clear(uuid[i]);
+       }
+       uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
+       T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+           uuid, &uuidlen), ENXIO, NULL);
+       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+       for (int i = 0; i < channel_count; i++) {
+               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL);
+       }
+
+       int error = connect(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr));
+       if (error == -1 && errno == EBUSY) {
+               /* XXX remove this retry nonsense when this is fixed:
+                * <rdar://problem/37340313> creating an interface without specifying specific interface name should not return EBUSY
+                */
+               close(tunsock);
+               T_LOG("connect got EBUSY, sleeping 1 second before retry");
+               sleep(1);
+               goto startover;
+       }
+       if (g_is_ipsec_test && channel_count && !enable_netif) {
+               /* ipsec doesn't support channels without a netif */
+               T_QUIET; T_EXPECT_POSIX_FAILURE(error, ENOTSUP, "connect() == -1 && errno == ENOTSUP");
+       } else {
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(error, "connect() == 0");
+       }
+
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
+           &enable_netif, sizeof(enable_netif)), EINVAL, NULL);
+
+       if (g_is_ipsec_test && channel_count && !enable_netif) {
+               /* Connect failed above, so we get EINVAL */
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                   &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL);
+       } else {
+               if (is_netagent_enabled()) {
+                       if (enable_netif) {
+                               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                                   &enable_flowswitch, sizeof(enable_flowswitch)), NULL);
+                       } else {
+                               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                                   &enable_flowswitch, sizeof(enable_flowswitch)), ENOENT, NULL);
+                       }
+               } else {
+                       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                           &enable_flowswitch, sizeof(enable_flowswitch)), ENOTSUP, NULL);
+               }
+       }
+
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
+           &channel_count, sizeof(channel_count)), EINVAL, NULL);
+
+       for (int i = 0; i < channel_count; i++) {
+               uuid_clear(uuid[i]);
+       }
+       uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
+       if (!channel_count || (g_is_ipsec_test && channel_count && !enable_netif)) {
+               /* ipsec doesn't support channels without a netif */
+               if (g_is_ipsec_test && channel_count && !enable_netif) {
+                       /* Unfortunately, connect incorrectly unwinds the bind if it gets an error.
+                        * Until that is fixed, expect EINVAL here.
+                        */
+                       T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                           uuid, &uuidlen), EINVAL, NULL);
+               } else {
+                       T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                           uuid, &uuidlen), ENXIO, NULL);
+               }
+               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+               for (int i = 0; i < channel_count; i++) {
+                       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL);
+               }
+       } else {
+               uuidlen = sizeof(uuid_t) * (unsigned int)channel_count;
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                   uuid, &uuidlen), NULL);
+               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL);
+               for (int i = 0; i < channel_count; i++) {
+                       T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid[i]), NULL);
+               }
        }
 
-       check_enables(tunsock, enable_netif, enable_flowswitch, enable_channel, uuid);
+       check_enables(tunsock, enable_netif, enable_flowswitch, channel_count, uuid);
 
        //T_LOG("Returning tunsock %d", tunsock);
 
        return tunsock;
 }
 
+static int (*create_tunsock)(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]) = create_tunsock_new;
+
 #if 0
 static void
 ipsec_stats(void)
@@ -458,21 +717,21 @@ static void
 permute_enables(void)
 {
        int tunsock;
-       T_EXPECT_GE(tunsock = create_tunsock(false, false, false), 0, NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(false, false, false, NULL), 0, NULL);
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(false, false, true), 0, NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(false, false, true, NULL), 0, NULL);
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(false, true, false), 0, NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(false, true, false, NULL), 0, NULL);
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(false, true, true), 0, NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(false, true, true, NULL), 0, NULL);
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(true, false, false), 0, NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(true, false, false, NULL), 0, NULL);
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(true, false, true), 0, NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(true, false, true, NULL), 0, NULL);
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(true, true, false), 0, NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(true, true, false, NULL), 0, NULL);
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(true, true, true), 0, NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(true, true, true, NULL), 0, NULL);
        T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
 }
 
@@ -502,9 +761,9 @@ cleanup_tunsock(void)
 }
 
 static void
-setup_tunsock(void)
+setup_tunsock(int channel_count, uuid_t uuids[])
 {
-       T_ASSERT_GE(g_tunsock = create_tunsock(true, false, true), 0, NULL);
+       T_ASSERT_GE(g_tunsock = create_tunsock(true, false, channel_count, uuids), 0, NULL);
        T_ATEND(cleanup_tunsock);
 
        char ifname[IFXNAMSIZ];
@@ -529,11 +788,388 @@ setup_tunsock(void)
 T_DECL(setup_ipsec, "This test sets up an ipsec interface")
 {
        setup_ipsec_test();
-       setup_tunsock();
+       setup_tunsock(1, NULL);
 }
 
 T_DECL(setup_utun, "This test sets up a utun interface")
 {
        setup_utun_test();
-       setup_tunsock();
+       setup_tunsock(1, NULL);
+}
+
+static const int SOCKET_TRAFFIC_CLASSES[] = {
+       SO_TC_BK_SYS, // BK
+       SO_TC_BK,  // BK
+       SO_TC_BE,  // BE
+       SO_TC_RD,  // BE
+       SO_TC_OAM, // BE
+       SO_TC_AV,  // VI
+       SO_TC_RV,  // VI
+       SO_TC_VI,  // VI
+       SO_TC_VO,  // VO
+       SO_TC_CTL, // VO
+};
+
+// this should match ipsec_find_tx_ring_by_svc in ipsec driver
+static const int SOCKET_TC_TO_RING[] = {
+       3,
+       3,
+       2,
+       2,
+       2,
+       1,
+       1,
+       1,
+       0,
+       0,
+};
+
+/* How many sockets map to this ring */
+static const int RING_TO_TC_COUNT[] = {
+       2, 3, 3, 2,
+};
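+/* Worked example: SO_TC_BE, SO_TC_RD and SO_TC_OAM all map to ring 2 in
+ * SOCKET_TC_TO_RING above, hence RING_TO_TC_COUNT[2] == 3. */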
+
+static void
+setup_channels_and_rings(int kq, int channel_count, channel_t channels[], channel_ring_t rxrings[], channel_ring_t txrings[], uuid_t uuids[], int cfds[])
+{
+       setup_tunsock(channel_count, uuids);
+
+#if 0
+       // give time to enable a tcpdump if desired
+       T_LOG("Sleeping 10");
+       sleep(10);
+       T_LOG("Done");
+#endif
+
+       for (int ri = 0; ri < channel_count; ri++) {
+               if (rxrings) {
+                       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(channels[ri] = os_channel_create(uuids[ri], 0), NULL);
+                       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(rxrings[ri] = os_channel_rx_ring(channels[ri],
+                           os_channel_ring_id(channels[ri], CHANNEL_FIRST_RX_RING)), NULL);
+               }
+               if (txrings) {
+                       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(channels[ri] = os_channel_create(uuids[ri], 0), NULL);
+                       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(txrings[ri] = os_channel_tx_ring(channels[ri],
+                           os_channel_ring_id(channels[ri], CHANNEL_FIRST_TX_RING)), NULL);
+               }
+
+               struct kevent kev;
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(cfds[ri] = os_channel_get_fd(channels[ri]), NULL);
+               EV_SET(&kev, cfds[ri], EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, (void *)(uintptr_t)ri);
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(kevent(kq, &kev, 1, NULL, 0, NULL), NULL);
+       }
+}
+
+static void
+cleanup_channels_and_rings(int channel_count, channel_t channels[], channel_ring_t rxrings[], channel_ring_t txrings[], uuid_t uuids[])
+{
+       for (int ri = 0; ri < channel_count; ri++) {
+               if (rxrings) {
+                       rxrings[ri] = NULL;
+               }
+               if (txrings) {
+                       txrings[ri] = NULL;
+               }
+               os_channel_destroy(channels[ri]);
+               channels[ri] = NULL;
+               uuid_clear(uuids[ri]);
+       }
+}
+
+static void
+setup_sockets(int sockets[SO_TC_MAX], int type)
+{
+       for (int si = 0; si < SO_TC_MAX; si++) {
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(sockets[si] = socket(PF_INET, type, 0), NULL);
+
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(sockets[si], SOL_SOCKET,
+                   SO_TRAFFIC_CLASS, &SOCKET_TRAFFIC_CLASSES[si], sizeof(SOCKET_TRAFFIC_CLASSES[si])), NULL);
+
+               // XXX setsockopt(IP_BOUND_IF) here?
+
+               struct sockaddr_in sin;
+               memset(&sin, 0, sizeof(sin));
+               sin.sin_len = sizeof(sin);
+               sin.sin_family = AF_INET;
+               sin.sin_addr = g_addr1;
+
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(bind(sockets[si], (struct sockaddr *)&sin, sizeof(sin)), NULL);
+
+               char sbuf[INET6_ADDRSTRLEN];
+               inet_ntop(sin.sin_family, &sin.sin_addr.s_addr, sbuf, sizeof(sbuf));
+#if 0
+               T_LOG("%s socket %d bound to %s port %d",
+                   type == SOCK_DGRAM ? "udp" : type == SOCK_STREAM ? "tcp" : "???",
+                   sockets[si], sbuf, ntohs(sin.sin_port));
+#endif
+               setblocking(sockets[si], false);
+       }
+}
+
+static void
+cleanup_sockets(int sockets[SO_TC_MAX])
+{
+       for (int si = 0; si < SO_TC_MAX; si++) {
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(sockets[si]), NULL);
+               sockets[si] = -1;
+       }
+}
+
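+/* Consume every available slot on the ring, hexdumping each payload when
+ * logging is compiled in, then advance past the last slot seen. */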
+static void
+drain_ring(channel_ring_t rxring)
+{
+       uint32_t i, sc = os_channel_available_slot_count(rxring);
+       channel_slot_t rxprev = NULL;
+       for (i = 0; i < sc; i++) {
+               slot_prop_t rxprop;
+               channel_slot_t rxslot;
+
+               memset(&rxprop, 0, sizeof(rxprop));
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_NOTNULL(rxslot = os_channel_get_next_slot(rxring, rxprev, &rxprop), NULL);
+               T_QUIET; T_ASSERT_NE_UINT(0, rxprop.sp_len, NULL);
+               T_QUIET; T_ASSERT_NOTNULL((void *)rxprop.sp_buf_ptr, NULL);
+
+               log_hexdump((void *)rxprop.sp_buf_ptr, rxprop.sp_len);
+
+               rxprev = rxslot;
+       }
+       if (sc) {
+               T_QUIET; T_EXPECT_POSIX_ZERO(os_channel_advance_slot(rxring, rxprev), NULL);
+       }
+}
+
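+/* Emit exactly one packet toward g_addr2: for TCP the non-blocking connect()
+ * supplies a SYN, for UDP a small sendto() payload. */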
+static void
+send_one_packet(int s, int type)
+{
+       struct sockaddr_in sin;
+       memset(&sin, 0, sizeof(sin));
+       sin.sin_len = sizeof(sin);
+       sin.sin_family = AF_INET;
+       sin.sin_addr = g_addr2;
+       sin.sin_port = htons(12345);
+
+       if (type == SOCK_STREAM) {
+               T_QUIET; T_EXPECT_POSIX_FAILURE(connect(s, (struct sockaddr *)&sin, sizeof(sin)), EINPROGRESS, NULL);
+       }
+       if (type == SOCK_DGRAM) {
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_EQ_LONG((long)sizeof(s), sendto(s, &s, sizeof(s), 0,
+                   (struct sockaddr *)&sin, sizeof(sin)), NULL);
+       }
+}
+
+static void
+expect_empty_rings(int channel_count, channel_ring_t rings[])
+{
+       /* Check all the rings and make sure there are no packets */
+       for (int ri = 0; ri < channel_count; ri++) {
+               T_QUIET; T_EXPECT_EQ_UINT(0U, os_channel_available_slot_count(rings[ri]), NULL);
+       }
+}
+
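+/* Send one packet per traffic class, one at a time, and verify each lands
+ * alone on the ring predicted by SOCKET_TC_TO_RING (ring 0 when only one
+ * channel exists). */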
+static void
+xfer_1_packet_singly(int channel_count, int type)
+{
+       uuid_t uuids[channel_count];
+       channel_t channels[channel_count];
+       int sockets[SO_TC_MAX];
+       channel_ring_t rxrings[channel_count];
+       int cfds[channel_count];
+       int kq;
+
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(kq = kqueue(), NULL);
+
+       setup_channels_and_rings(kq, channel_count, channels, rxrings, NULL, uuids, cfds);
+
+       setup_sockets(sockets, type);
+
+       for (int si = 0; si < SO_TC_MAX; si++) {
+               expect_empty_rings(channel_count, rxrings);
+
+               send_one_packet(sockets[si], type);
+
+               int expected_ring = channel_count == 1 ? 0 : SOCKET_TC_TO_RING[si];
+
+               /* Wait for the packet delivery and check that it's only one packet and on the correct ring */
+               struct kevent kev[channel_count + 1];
+               int nev;
+               memset(kev, 0, sizeof(kev));
+               struct timespec to = { 0, 100 * NSEC_PER_MSEC }; // 100 ms
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(nev = kevent(kq, NULL, 0, kev, channel_count + 1, &to), NULL);
+               T_QUIET; T_EXPECT_EQ_INT(nev, 1, NULL);
+               T_QUIET; T_EXPECT_EQ_PTR((void *)kev[0].ident, (void *)(uintptr_t)cfds[expected_ring], NULL);
+               T_QUIET; T_EXPECT_EQ_PTR(kev[0].udata, (void *)(uintptr_t)expected_ring, NULL);
+               T_QUIET; T_EXPECT_EQ_SHORT(kev[0].filter, (short)EVFILT_READ, NULL);
+               T_QUIET; T_EXPECT_FALSE(kev[0].flags & EV_ERROR, NULL);
+
+               /* Make sure it comes out the expected interface */
+               for (int ri = 0; ri < channel_count; ri++) {
+                       errno = 0;
+
+                       uint32_t sc = os_channel_available_slot_count(rxrings[ri]);
+
+                       /* Check that the packet appears only on the expected ring and
+                        * is the only packet on the expected ring.
+                        */
+                       T_QUIET; T_EXPECT_EQ_UINT(ri == expected_ring, sc, NULL);
+
+                       if ((ri == expected_ring) == sc) {
+                               T_PASS("tc index %d ring %d expected ring %d slot count %u", si, ri, expected_ring, sc);
+                       } else {
+                               T_FAIL("tc index %d ring %d expected ring %d slot count %u", si, ri, expected_ring, sc);
+                       }
+
+                       drain_ring(rxrings[ri]);
+               }
+       }
+
+       cleanup_sockets(sockets);
+
+       cleanup_channels_and_rings(channel_count, channels, rxrings, NULL, uuids);
+
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(kq), NULL);
+}
+
+T_DECL(ipsec35889979u1s, "transfers 1 packet at a time of each service class over udp to a single ring")
+{
+       setup_ipsec_test();
+       xfer_1_packet_singly(1, SOCK_DGRAM);
+}
+
+T_DECL(ipsec35889979u4s, "transfers 1 packet at a time of each service class over udp to 4 rings")
+{
+       setup_ipsec_test();
+       xfer_1_packet_singly(4, SOCK_DGRAM);
+}
+
+T_DECL(ipsec35889979t1s, "transfers 1 packet at a time of each service class over tcp to a single ring")
+{
+       setup_ipsec_test();
+       xfer_1_packet_singly(1, SOCK_STREAM);
+}
+
+
+T_DECL(ipsec35889979t4s, "transfers 1 packet at a time of each service class over tcp to 4 rings",
+    /* This test will fail because tcp syn packets get elevated
+     * due to ack prioritization
+     */
+    T_META_ENABLED(false))
+{
+       setup_ipsec_test();
+       xfer_1_packet_singly(4, SOCK_STREAM);
+}
+
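+/* Send one packet of every traffic class up front, then verify each ring
+ * raises a single kevent whose data matches the per-ring count in
+ * RING_TO_TC_COUNT. */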
+static void
+xfer_1_packet_together(int channel_count, int type)
+{
+       uuid_t uuids[channel_count];
+       channel_t channels[channel_count];
+       int sockets[SO_TC_MAX];
+       channel_ring_t rxrings[channel_count];
+       int cfds[channel_count];
+       int kq;
+
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(kq = kqueue(), NULL);
+
+       setup_channels_and_rings(kq, channel_count, channels, rxrings, NULL, uuids, cfds);
+
+       setup_sockets(sockets, type);
+
+       for (int si = 0; si < SO_TC_MAX; si++) {
+               expect_empty_rings(channel_count, rxrings);
+
+               send_one_packet(sockets[si], type);
+       }
+
+       /* Sleep to make sure all packets get delivered */
+       struct timespec to = { 0, 100 * NSEC_PER_MSEC }; // 100 ms
+       nanosleep(&to, NULL);
+
+       /* Wait for the packet delivery and check that all rings event */
+       struct kevent kev[channel_count + 1];
+       int nev;
+       memset(kev, 0, sizeof(kev));
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(nev = kevent(kq, NULL, 0, kev, channel_count + 1, &to), NULL);
+       T_QUIET; T_EXPECT_EQ_INT(nev, channel_count, NULL);
+
+       uint32_t found[channel_count];
+       memset(found, 0, sizeof(found));
+       for (int e = 0; e < nev; e++) {
+               T_LOG("kevent %lu filter 0x%4x flags 0x%04x fflags 0x%08x data %"PRIdPTR" udata %p",
+                   kev[e].ident, kev[e].filter, kev[e].flags, kev[e].fflags, kev[e].data, kev[e].udata);
+
+               T_QUIET; T_ASSERT_GE_PTR(kev[e].udata, (void *)0, NULL);
+               T_QUIET; T_ASSERT_LT_PTR(kev[e].udata, (void *)(intptr_t)channel_count, NULL);
+               int ri = (int)(uintptr_t)kev[e].udata;
+               T_QUIET; T_EXPECT_EQ_UINT(found[ri], 0U, NULL);
+
+               T_QUIET; T_EXPECT_EQ_ULONG(kev[e].ident, (uintptr_t)cfds[ri], NULL);
+               T_QUIET; T_EXPECT_EQ_SHORT(kev[e].filter, (short)EVFILT_READ, NULL);
+               T_QUIET; T_EXPECT_FALSE(kev[e].flags & EV_ERROR, NULL);
+
+               if (channel_count == 1) {
+                       T_QUIET; T_EXPECT_EQ_LONG(kev[e].data, (long)SO_TC_MAX, NULL);
+               } else {
+                       T_QUIET; T_EXPECT_EQ_LONG(kev[e].data, (long)RING_TO_TC_COUNT[ri], NULL);
+               }
+
+               found[ri] += (uint32_t)kev[e].data;
+       }
+       /* Check that something came out of all rings */
+       for (int ri = 0; ri < channel_count; ri++) {
+               T_QUIET; T_EXPECT_NE_UINT(found[ri], 0U, NULL);
+       }
+
+       /* Make sure it comes out the expected interface */
+       for (int ri = 0; ri < channel_count; ri++) {
+               uint32_t sc = os_channel_available_slot_count(rxrings[ri]);
+               if (channel_count == 1) {
+                       if (sc == SO_TC_MAX) {
+                               T_PASS("ring %d got %"PRIu32" slots expecting %"PRIu32"", ri, sc, SO_TC_MAX);
+                       } else {
+                               T_FAIL("ring %d got %"PRIu32" slots expecting %"PRIu32"", ri, sc, SO_TC_MAX);
+                       }
+               } else {
+                       if (sc == (uint32_t)RING_TO_TC_COUNT[ri]) {
+                               T_PASS("ring %d got %"PRIu32" slots expecting %"PRIu32"", ri, sc, (uint32_t)RING_TO_TC_COUNT[ri]);
+                       } else {
+                               T_FAIL("ring %d got %"PRIu32" slots expecting %"PRIu32"", ri, sc, (uint32_t)RING_TO_TC_COUNT[ri]);
+                       }
+               }
+
+               drain_ring(rxrings[ri]);
+       }
+
+       cleanup_sockets(sockets);
+
+       cleanup_channels_and_rings(channel_count, channels, rxrings, NULL, uuids);
+
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(kq), NULL);
+}
+
+T_DECL(ipsec35889979u1m, "transfers 1 packet together of each service class over udp to a single ring")
+{
+       setup_ipsec_test();
+       xfer_1_packet_together(1, SOCK_DGRAM);
+}
+
+T_DECL(ipsec35889979u4m, "transfers 1 packet together of each service class over udp to 4 rings")
+{
+       setup_ipsec_test();
+       xfer_1_packet_together(4, SOCK_DGRAM);
+}
+
+T_DECL(ipsec35889979t1m, "transfers 1 packet together of each service class over tcp to a single ring")
+{
+       setup_ipsec_test();
+       xfer_1_packet_together(1, SOCK_STREAM);
+}
+
+T_DECL(ipsec35889979t4m, "transfers 1 packet together of each service class over tcp to 4 rings",
+    /* This test will fail because tcp syn packets get elevated
+     * due to ack prioritization
+     */
+    T_META_ENABLED(false))
+{
+       setup_ipsec_test();
+       xfer_1_packet_together(4, SOCK_STREAM);
 }
index ea36703aa1b20c3c3742d92481033446e31ab07e..b1f87634f9599952a53378cab38a69ade15c165b 100644 (file)
@@ -3,15 +3,27 @@
 #include <darwintest.h>
 #include <mach-o/dyld.h>
 #include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
 
-T_DECL(no32exec_bootarg, "make sure the no32exec boot-arg is honored", T_META_BOOTARGS_SET("-no32exec"))
+static int binprefs_child_is_64 = 0;
+
+static void
+signal_handler(__unused int sig)
+{
+       binprefs_child_is_64 = 1;
+       return;
+}
+
+T_DECL(no32exec_bootarg_with_spawn, "make sure the no32exec boot-arg is honored, using posix_spawn", T_META_BOOTARGS_SET("-no32exec"))
 {
        int spawn_ret, pid;
        char path[1024];
        uint32_t size = sizeof(path);
 
-       T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL);
-       T_ASSERT_LT(strlcat(path, "_helper", size), size, NULL);
+       T_QUIET; T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL);
+       T_QUIET; T_ASSERT_LT(strlcat(path, "_helper", size), size, NULL);
 
        spawn_ret = posix_spawn(&pid, path, NULL, NULL, NULL, NULL);
        if (spawn_ret == 0) {
@@ -21,3 +33,61 @@ T_DECL(no32exec_bootarg, "make sure the no32exec boot-arg is honored", T_META_BO
        }
        T_ASSERT_EQ(spawn_ret, EBADARCH, NULL);
 }
+
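+/* The helper is a fat i386/x86_64 binary and the binpref list below asks for
+ * the 32-bit slice first; spawn succeeding proves the kernel skipped it, and
+ * the SIGUSR1 round trip confirms the child really ran 64-bit. */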
+T_DECL(no32exec_bootarg_with_spawn_binprefs, "make sure the no32exec boot-arg is honored, using posix_spawn "
+    "with binprefs on a fat i386/x86_64 Mach-O", T_META_BOOTARGS_SET("-no32exec"))
+{
+       int pid, ret;
+       posix_spawnattr_t spawnattr;
+       cpu_type_t cpuprefs[] = { CPU_TYPE_X86, CPU_TYPE_X86_64 };
+
+       char path[1024];
+       uint32_t size = sizeof(path);
+       T_QUIET; T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL);
+       T_QUIET; T_ASSERT_LT(strlcat(path, "_helper_binprefs", size), size, NULL);
+
+       T_QUIET; T_ASSERT_NE(signal(SIGUSR1, signal_handler), SIG_ERR, "signal");
+
+       ret = posix_spawnattr_init(&spawnattr);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init");
+
+       ret = posix_spawnattr_setbinpref_np(&spawnattr, sizeof(cpuprefs) / sizeof(cpuprefs[0]), cpuprefs, NULL);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_setbinpref_np");
+
+       ret = posix_spawn(&pid, path, NULL, &spawnattr, NULL, NULL);
+       T_ASSERT_EQ(ret, 0, "posix_spawn should succeed despite 32-bit binpref appearing first");
+
+       sleep(1);
+       ret = kill(pid, SIGUSR1); // ping helper; helper should ping back if running 64-bit
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kill");
+
+       ret = wait(NULL);
+       T_QUIET; T_ASSERT_EQ(ret, pid, "child pid");
+
+       T_ASSERT_EQ(binprefs_child_is_64, 1, "child process should be running in 64-bit mode");
+
+       ret = posix_spawnattr_destroy(&spawnattr);
+       T_QUIET; T_ASSERT_EQ(ret, 0, "posix_spawnattr_destroy");
+}
+
+T_DECL(no32_exec_bootarg_with_exec, "make sure the no32exec boot-arg is honored, using fork and exec", T_META_BOOTARGS_SET("-no32exec"))
+{
+       int pid;
+       char path[1024];
+       uint32_t size = sizeof(path);
+
+       T_QUIET; T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL);
+       T_QUIET; T_ASSERT_LT(strlcat(path, "_helper", size), size, NULL);
+
+       pid = fork();
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "fork");
+
+       if (pid == 0) { /* child */
+               execve(path, NULL, NULL); /* this should fail, resulting in the call to exit below */
+               exit(errno);
+       } else { /* parent */
+               int wait_ret = 0;
+               waitpid(pid, &wait_ret, 0);
+               T_ASSERT_EQ(WEXITSTATUS(wait_ret), EBADARCH, "execve should set errno = EBADARCH");
+       }
+}
diff --git a/tests/no32exec_35914211_helper_binprefs.c b/tests/no32exec_35914211_helper_binprefs.c
new file mode 100644 (file)
index 0000000..0909633
--- /dev/null
@@ -0,0 +1,34 @@
+#include <darwintest.h>
+#include <unistd.h>
+#include <signal.h>
+
+int can_signal_parent = 0;
+
+void
+signal_handler(int sig)
+{
+       if (sig == SIGUSR1) {
+               can_signal_parent = 1;
+       }
+       return;
+}
+
+T_DECL(no32exec_bootarg_with_spawn_binprefs_helper, "helper for no32exec_bootarg_with_spawn_binprefs test")
+{
+       unsigned long ptrSize = sizeof(long);
+       int ppid = getppid();
+
+       signal(SIGUSR1, signal_handler);
+       signal(SIGALRM, signal_handler);
+
+       // the parent signals us if it is running no32exec_bootarg_with_spawn_binprefs; otherwise the alarm below fires
+       alarm(3);
+       pause();
+
+       /* signal to parent process if we are running in 64-bit mode */
+       if (can_signal_parent && ptrSize == 8) {
+               kill(ppid, SIGUSR1);
+       }
+
+       T_SKIP("nothing to see here");
+}
diff --git a/tests/os_proc.c b/tests/os_proc.c
new file mode 100644 (file)
index 0000000..9f2f0ce
--- /dev/null
@@ -0,0 +1,51 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <mach/mach.h>
+#include <mach/task_info.h>
+#include <os/proc.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+#if !TARGET_OS_OSX
+void test_os_proc_available_memory(void);
+extern int getpid(void);
+
+T_DECL(test_os_proc_available_memory, "Basic available memory")
+{
+       kern_return_t err;
+       task_vm_info_data_t vm_info = {};
+       mach_msg_type_number_t count = TASK_VM_INFO_REV4_COUNT;
+       uint64_t remainingBytes;
+
+       err = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count);
+       remainingBytes = os_proc_available_memory();
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+       T_EXPECT_EQ(count, TASK_VM_INFO_REV4_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV4_COUNT (%d)\n", count, TASK_VM_INFO_REV4_COUNT);
+       T_EXPECT_NE(remainingBytes, 0ULL, "os_proc_available_memory() should not return 0");
+       T_EXPECT_NE(vm_info.limit_bytes_remaining, 0ULL, "vm_info.limit_bytes_remaining should not return 0");
+       T_EXPECT_EQ(vm_info.limit_bytes_remaining, remainingBytes,
+           "task_info --rev4 call returned value 0x%llx for vm_info.limit_bytes_remaining. Expected 0x%llx",
+           vm_info.limit_bytes_remaining, remainingBytes);
+
+       /* this should now make the available memory return 0 */
+       proc_track_dirty(getpid(), PROC_DIRTY_TRACK);
+
+       count = TASK_VM_INFO_REV4_COUNT;
+       err = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count);
+       remainingBytes = os_proc_available_memory();
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+       T_EXPECT_EQ(count, TASK_VM_INFO_REV4_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV4_COUNT\n", count);
+       T_EXPECT_EQ(remainingBytes, 0ULL, "os_proc_available_memory() should return 0");
+       T_EXPECT_EQ(vm_info.limit_bytes_remaining, 0ULL, "vm_info.limit_bytes_remaining should return 0");
+       T_EXPECT_EQ(vm_info.limit_bytes_remaining, remainingBytes,
+           "task_info --rev4 call returned value 0x%llx for vm_info.limit_bytes_remaining. Expected 0x%llx",
+           vm_info.limit_bytes_remaining, remainingBytes);
+}
+#else
+T_DECL(test_os_proc_available_memory, "Basic available memory")
+{
+       T_SKIP("Not available on macOS");
+}
+#endif
diff --git a/tests/os_refcnt.c b/tests/os_refcnt.c
new file mode 100644 (file)
index 0000000..36263be
--- /dev/null
@@ -0,0 +1,394 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <stdio.h>
+#include <assert.h>
+#include <setjmp.h>
+
+#define DEVELOPMENT 1
+#define DEBUG 0
+#define XNU_KERNEL_PRIVATE 1
+
+#define OS_REFCNT_DEBUG 1
+#define STRESS_TESTS 0
+
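+/* Compile the kernel refcount implementation directly into this user-space
+ * test, rerouting panic() to a local handler so misuse paths can be asserted
+ * on. */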
+void handle_panic(const char *func, char *str, ...);
+#define panic(...) handle_panic(__func__, __VA_ARGS__)
+
+#include "../libkern/os/refcnt.h"
+#include "../libkern/os/refcnt.c"
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+/* import some of the refcnt internal state for testing */
+extern bool ref_debug_enable;
+os_refgrp_decl_extern(global_ref_group);
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("os_refcnt"),
+       T_META_CHECK_LEAKS(false)
+       );
+
+T_DECL(os_refcnt, "Basic atomic refcount")
+{
+       struct os_refcnt rc;
+       os_ref_init(&rc, NULL);
+       T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 1, "refcount correctly initialized");
+
+       os_ref_retain(&rc);
+       os_ref_retain(&rc);
+       T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 3, "retain increased count");
+
+       os_ref_count_t x = os_ref_release(&rc);
+       T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 2, "release decreased count");
+       T_ASSERT_EQ_UINT(x, 2, "release returned correct count");
+
+       os_ref_release_live(&rc);
+       T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 1, "release_live decreased count");
+
+       x = os_ref_release(&rc);
+       T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 0, "released");
+       T_ASSERT_EQ_UINT(x, 0, "returned released");
+
+       os_ref_init(&rc, NULL);
+       x = os_ref_retain_try(&rc);
+       T_ASSERT_GT_INT(x, 0, "try retained");
+
+       (void)os_ref_release(&rc);
+       (void)os_ref_release(&rc);
+       T_QUIET; T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 0, "release");
+
+       x = os_ref_retain_try(&rc);
+       T_ASSERT_EQ_INT(x, 0, "try failed");
+}
+
+T_DECL(refcnt_raw, "Raw refcount")
+{
+       os_ref_atomic_t rc;
+       os_ref_init_raw(&rc, NULL);
+       T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 1, "refcount correctly initialized");
+
+       os_ref_retain_raw(&rc, NULL);
+       os_ref_retain_raw(&rc, NULL);
+       T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 3, "retain increased count");
+
+       os_ref_count_t x = os_ref_release_raw(&rc, NULL);
+       T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 2, "release decreased count");
+       T_ASSERT_EQ_UINT(x, 2, "release returned correct count");
+
+       os_ref_release_live_raw(&rc, NULL);
+       T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 1, "release_live decreased count");
+
+       x = os_ref_release_raw(&rc, NULL);
+       T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 0, "released");
+       T_ASSERT_EQ_UINT(x, 0, "returned released");
+
+       os_ref_init_raw(&rc, NULL);
+       x = os_ref_retain_try_raw(&rc, NULL);
+       T_ASSERT_GT_INT(x, 0, "try retained");
+
+       (void)os_ref_release_raw(&rc, NULL);
+       (void)os_ref_release_raw(&rc, NULL);
+       T_QUIET; T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 0, "release");
+
+       x = os_ref_retain_try_raw(&rc, NULL);
+       T_ASSERT_EQ_INT(x, 0, "try failed");
+}
+
+T_DECL(refcnt_locked, "Locked refcount")
+{
+       struct os_refcnt rc;
+       os_ref_init(&rc, NULL);
+
+       os_ref_retain_locked(&rc);
+       os_ref_retain_locked(&rc);
+       T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 3, "retain increased count");
+
+       os_ref_count_t x = os_ref_release_locked(&rc);
+       T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 2, "release decreased count");
+       T_ASSERT_EQ_UINT(x, 2, "release returned correct count");
+
+       (void)os_ref_release_locked(&rc);
+       x = os_ref_release_locked(&rc);
+       T_ASSERT_EQ_UINT(os_ref_get_count(&rc), 0, "released");
+       T_ASSERT_EQ_UINT(x, 0, "returned released");
+}
+
+T_DECL(refcnt_raw_locked, "Locked raw refcount")
+{
+       os_ref_atomic_t rc;
+       os_ref_init_raw(&rc, NULL);
+
+       os_ref_retain_locked_raw(&rc, NULL);
+       os_ref_retain_locked_raw(&rc, NULL);
+       T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 3, "retain increased count");
+
+       os_ref_count_t x = os_ref_release_locked_raw(&rc, NULL);
+       T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 2, "release decreased count");
+       T_ASSERT_EQ_UINT(x, 2, "release returned correct count");
+
+       (void)os_ref_release_locked_raw(&rc, NULL);
+       x = os_ref_release_locked_raw(&rc, NULL);
+       T_ASSERT_EQ_UINT(os_ref_get_count_raw(&rc), 0, "released");
+       T_ASSERT_EQ_UINT(x, 0, "returned released");
+}
+
+T_DECL(refcnt_mask_locked, "Locked bitwise refcount")
+{
+       const os_ref_count_t b = 12;
+       os_ref_atomic_t rc;
+       os_ref_count_t reserved = 0xaaa;
+       os_ref_init_count_mask(&rc, NULL, 1, reserved, b);
+
+       os_ref_retain_locked_mask(&rc, NULL, b);
+       os_ref_retain_locked_mask(&rc, NULL, b);
+       T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, b), 3, "retain increased count");
+
+       os_ref_count_t x = os_ref_release_locked_mask(&rc, NULL, b);
+       T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, b), 2, "release decreased count");
+       T_ASSERT_EQ_UINT(x, 2, "release returned correct count");
+       T_ASSERT_EQ_UINT(rc & ((1U << b) - 1), reserved, "Reserved bits not modified");
+
+       (void)os_ref_release_locked_mask(&rc, NULL, b);
+       x = os_ref_release_locked_mask(&rc, NULL, b);
+       T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, b), 0, "released");
+       T_ASSERT_EQ_UINT(x, 0, "returned released");
+       T_ASSERT_EQ_UINT(rc & ((1U << b) - 1), reserved, "Reserved bits not modified");
+}
+
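+/* The mask variants keep the low 'bits' bits for caller-owned data (the
+ * reserved pattern below) and store the count above them; every operation
+ * must leave the reserved bits intact. */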
+static void
+do_bitwise_test(const os_ref_count_t bits)
+{
+       os_ref_atomic_t rc;
+       os_ref_count_t reserved = 0xaaaaaaaaU & ((1U << bits) - 1);
+       os_ref_init_count_mask(&rc, NULL, 1, reserved, bits);
+
+       T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, bits), 1, "[%u bits] refcount initialized", bits);
+
+       os_ref_retain_mask(&rc, NULL, bits);
+       os_ref_retain_mask(&rc, NULL, bits);
+       T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, bits), 3, "retain increased count");
+
+       os_ref_count_t x = os_ref_release_mask(&rc, NULL, bits);
+       T_ASSERT_EQ_UINT(x, 2, "release returned correct count");
+
+       os_ref_release_live_mask(&rc, NULL, bits);
+       T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, bits), 1, "release_live decreased count");
+
+       x = os_ref_release_mask(&rc, NULL, bits);
+       T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, bits), 0, "released");
+       T_ASSERT_EQ_UINT(x, 0, "returned released");
+
+       T_ASSERT_EQ_UINT(rc & ((1U << bits) - 1), reserved, "Reserved bits not modified");
+
+       os_ref_init_count_mask(&rc, NULL, 1, reserved, bits);
+       x = os_ref_retain_try_mask(&rc, NULL, bits);
+       T_ASSERT_GT_INT(x, 0, "try retained");
+
+       (void)os_ref_release_mask(&rc, NULL, bits);
+       (void)os_ref_release_mask(&rc, NULL, bits);
+       T_QUIET; T_ASSERT_EQ_UINT(os_ref_get_count_mask(&rc, bits), 0, "release");
+
+       x = os_ref_retain_try_mask(&rc, NULL, bits);
+       T_ASSERT_EQ_INT(x, 0, "try failed");
+
+       T_ASSERT_EQ_UINT(rc & ((1U << bits) - 1), reserved, "Reserved bits not modified");
+}
+
+T_DECL(refcnt_bitwise, "Bitwise refcount")
+{
+       do_bitwise_test(0);
+       do_bitwise_test(1);
+       do_bitwise_test(8);
+       do_bitwise_test(26);
+
+       os_ref_atomic_t rc = 0xaaaaaaaa;
+
+       const os_ref_count_t nbits = 3;
+       const os_ref_count_t count = 5;
+       const os_ref_count_t bits = 7;
+       os_ref_init_count_mask(&rc, NULL, count, bits, nbits);
+
+       os_ref_count_t mask = (1U << nbits) - 1;
+       T_ASSERT_EQ_UINT(rc & mask, bits, "bits correctly initialized");
+       T_ASSERT_EQ_UINT(rc >> nbits, count, "count correctly initialized");
+}
+
+os_refgrp_decl(static, g1, "test group", NULL);
+os_refgrp_decl_extern(g1);
+
+T_DECL(refcnt_groups, "Group accounting")
+{
+#if OS_REFCNT_DEBUG
+       ref_debug_enable = true;
+
+       struct os_refcnt rc;
+       os_ref_init(&rc, &g1);
+
+       T_ASSERT_EQ_UINT(g1.grp_children, 1, "group attached");
+       T_ASSERT_EQ_UINT(global_ref_group.grp_children, 1, "global group attached");
+       T_ASSERT_EQ_UINT(g1.grp_count, 1, "group count");
+       T_ASSERT_EQ_ULLONG(g1.grp_retain_total, 1ULL, "group retains");
+       T_ASSERT_EQ_ULLONG(g1.grp_release_total, 0ULL, "group releases");
+
+       os_ref_retain(&rc);
+       os_ref_retain(&rc);
+       os_ref_release_live(&rc);
+       os_ref_release_live(&rc);
+
+       T_EXPECT_EQ_ULLONG(g1.grp_retain_total, 3ULL, "group retains");
+       T_EXPECT_EQ_ULLONG(g1.grp_release_total, 2ULL, "group releases");
+
+       os_ref_count_t x = os_ref_release(&rc);
+       T_QUIET; T_ASSERT_EQ_UINT(x, 0, "released");
+
+       T_ASSERT_EQ_UINT(g1.grp_children, 0, "group detached");
+       T_ASSERT_EQ_UINT(g1.grp_count, 0, "group count");
+#else
+       T_SKIP("Refcount debugging disabled");
+#endif
+}
+
+enum {
+       OSREF_UNDERFLOW    = 1,
+       OSREF_OVERFLOW     = 2,
+       OSREF_RESURRECTION = 3,
+       OSREF_DEALLOC_LIVE = 4,
+};
+
+static jmp_buf jb;
+static bool expect_panic = false;
+
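+/*
+ * The test build routes os_refcnt panics to this handler instead of a real
+ * panic, so an expected panic becomes a longjmp back into the test case.
+ */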
+void
+handle_panic(const char *func, char *__unused str, ...)
+{
+       int ret = -1;
+       if (!expect_panic) {
+               T_FAIL("unexpected panic from %s", func);
+               T_LOG("corrupt program state, aborting");
+               abort();
+       }
+       expect_panic = false;
+
+       if (strcmp(func, "os_ref_panic_underflow") == 0) {
+               ret = OSREF_UNDERFLOW;
+       } else if (strcmp(func, "os_ref_panic_overflow") == 0) {
+               ret = OSREF_OVERFLOW;
+       } else if (strcmp(func, "os_ref_panic_resurrection") == 0) {
+               ret = OSREF_RESURRECTION;
+       } else if (strcmp(func, "os_ref_panic_live") == 0) {
+               ret = OSREF_DEALLOC_LIVE;
+       } else {
+               T_LOG("unexpected panic from %s", func);
+       }
+
+       longjmp(jb, ret);
+}
+
+T_DECL(refcnt_underflow, "Underflow")
+{
+       os_ref_atomic_t rc;
+       os_ref_init_raw(&rc, NULL);
+       (void)os_ref_release_raw(&rc, NULL);
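+       /* The count is now zero, so the next release should trip the underflow panic. */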
+
+       int x = setjmp(jb);
+       if (x == 0) {
+               expect_panic = true;
+               (void)os_ref_release_raw(&rc, NULL);
+               T_FAIL("underflow not caught");
+       } else {
+               T_ASSERT_EQ_INT(x, OSREF_UNDERFLOW, "underflow caught");
+       }
+}
+
+T_DECL(refcnt_overflow, "Overflow")
+{
+       os_ref_atomic_t rc;
+       os_ref_init_count_raw(&rc, NULL, 0x0fffffffU);
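+       /* 0x0fffffff is presumably the maximum supported count; one more retain should panic. */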
+
+       int x = setjmp(jb);
+       if (x == 0) {
+               expect_panic = true;
+               (void)os_ref_retain_raw(&rc, NULL);
+               T_FAIL("overflow not caught");
+       } else {
+               T_ASSERT_EQ_INT(x, OSREF_OVERFLOW, "overflow caught");
+       }
+}
+
+T_DECL(refcnt_resurrection, "Resurrection")
+{
+       os_ref_atomic_t rc;
+       os_ref_init_raw(&rc, NULL);
+       os_ref_count_t n = os_ref_release_raw(&rc, NULL);
+
+       T_QUIET; T_EXPECT_EQ_UINT(n, 0, "reference not released");
+
+       int x = setjmp(jb);
+       if (x == 0) {
+               expect_panic = true;
+               (void)os_ref_retain_raw(&rc, NULL);
+               T_FAIL("resurrection not caught");
+       } else {
+               T_ASSERT_EQ_INT(x, OSREF_RESURRECTION, "resurrection caught");
+       }
+}
+
+T_DECL(refcnt_dealloc_live, "Dealloc expected live object")
+{
+       os_ref_atomic_t rc;
+       os_ref_init_raw(&rc, NULL);
+
+       expect_panic = true;
+       int x = setjmp(jb);
+       if (x == 0) {
+               expect_panic = true;
+               os_ref_release_live_raw(&rc, NULL);
+               T_FAIL("dealloc live not caught");
+       } else {
+               T_ASSERT_EQ_INT(x, OSREF_DEALLOC_LIVE, "dealloc live caught");
+       }
+}
+
+T_DECL(refcnt_initializer, "Static initializers")
+{
+       struct os_refcnt rc = OS_REF_INITIALIZER;
+       os_ref_atomic_t rca = OS_REF_ATOMIC_INITIALIZER;
+
+       T_ASSERT_EQ_INT(0, os_ref_retain_try(&rc), NULL);
+       T_ASSERT_EQ_INT(0, os_ref_get_count_raw(&rca), NULL);
+}
+
+#if STRESS_TESTS
+
+static const unsigned long iters = 1024 * 1024 * 32;
+
+static void *
+func(void *_rc)
+{
+       struct os_refcnt *rc = _rc;
+       for (unsigned long i = 0; i < iters; i++) {
+               os_ref_retain(rc);
+               os_ref_release_live(rc);
+       }
+       return NULL;
+}
+
+T_DECL(refcnt_stress, "Stress test")
+{
+       pthread_t th1, th2;
+
+       struct os_refcnt rc;
+       os_ref_init(&rc, NULL);
+
+       T_ASSERT_POSIX_ZERO(pthread_create(&th1, NULL, func, &rc), "pthread_create");
+       T_ASSERT_POSIX_ZERO(pthread_create(&th2, NULL, func, &rc), "pthread_create");
+
+       void *r1, *r2;
+       T_ASSERT_POSIX_ZERO(pthread_join(th1, &r1), "pthread_join");
+       T_ASSERT_POSIX_ZERO(pthread_join(th2, &r2), "pthread_join");
+
+       os_ref_count_t x = os_ref_release(&rc);
+       T_ASSERT_EQ_INT(x, 0, "Consistent refcount");
+}
+
+#endif
diff --git a/tests/os_unaligned.c b/tests/os_unaligned.c
new file mode 100644 (file)
index 0000000..311ecbb
--- /dev/null
@@ -0,0 +1,36 @@
+#include <stdint.h>
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#include "../libkern/os/ptrtools.h"
+
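+/*
+ * os_unaligned_deref is expected to yield an lvalue with alignment 1, so the
+ * compiler must emit loads and stores that are safe at any address.
+ */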
+#define CHECK_ALIGNMENT(T) \
+{ \
+       T *__p; \
+       T_QUIET; T_EXPECT_EQ_ULONG(__alignof__(*__p), sizeof(*__p), #T " native alignment"); \
+       T_ASSERT_EQ_ULONG(__alignof__(os_unaligned_deref(__p)), 1UL, #T " alignment"); \
+}
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+struct A {
+       int a;
+};
+
+T_DECL(os_unaligned, "Unaligned pointer access")
+{
+       int x = 0x914842;
+       int *p = &x;
+
+       T_ASSERT_EQ_INT(os_unaligned_deref(p), x, "load");
+       os_unaligned_deref(&x) = INT_MIN;
+       T_ASSERT_EQ_INT(x, INT_MIN, "store");
+
+       CHECK_ALIGNMENT(unsigned);
+       CHECK_ALIGNMENT(long long);
+       CHECK_ALIGNMENT(uintptr_t);
+       CHECK_ALIGNMENT(int16_t);
+       CHECK_ALIGNMENT(uint64_t);
+       CHECK_ALIGNMENT(struct A);
+       CHECK_ALIGNMENT(void *);
+}
diff --git a/tests/osptr.cpp b/tests/osptr.cpp
new file mode 100644 (file)
index 0000000..054b869
--- /dev/null
@@ -0,0 +1,772 @@
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-extensions"
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <stdio.h>
+#include <assert.h>
+#include <typeinfo>
+
+#if 0
+# define OSPTR_LOG T_LOG
+#elif 0
+# define OSPTR_LOG printf
+#else
+# define OSPTR_LOG(x...)  do { } while(0)
+#endif
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("osptr"),
+       T_META_CHECK_LEAKS(false),
+       T_META_RUN_CONCURRENTLY(true)
+       );
+
+static int num_instances = 0;
+static int num_retains = 0;
+static int num_releases = 0;
+
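+/*
+ * Minimal user-space stand-in for the kernel's OSMetaClassBase: it counts
+ * instances, retains, and releases, and parks zero-refcount objects on a
+ * freelist instead of deleting them so that double-releases can be caught.
+ */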
+class OSMetaClassBase
+{
+       static int id_counter;
+       static OSMetaClassBase *freelist;
+
+public:
+       int inst_id;
+       mutable int refcount;
+       mutable OSMetaClassBase *next;
+       static void *type_id;
+
+       OSMetaClassBase() : refcount(1), next(nullptr)
+       {
+               inst_id = id_counter++;
+               num_instances++;
+               OSPTR_LOG("[%p, %d] constructed\n", this, inst_id);
+       }
+
+       virtual ~OSMetaClassBase()
+       {
+               OSPTR_LOG("[%p, %d] destroyed\n", this, inst_id);
+       }
+
+       virtual void
+       retain() const
+       {
+               T_QUIET; T_EXPECT_GT_INT(refcount, 0, "Instance resurrected");
+               refcount++;
+               num_retains++;
+               OSPTR_LOG("[%p, %d] retain, refcount=%d\n", this, inst_id, refcount);
+       }
+
+       virtual void
+       release() const
+       {
+               T_QUIET; T_EXPECT_GT_INT(refcount, 0, "Double free");
+               refcount--;
+               num_releases++;
+               OSPTR_LOG("[%p, %d] release, refcount=%d\n", this, inst_id, refcount);
+
+               /*
+                * Don't delete the object, but keep it around so that we
+                * can detect double frees
+                */
+               if (refcount == 0) {
+                       num_instances--;
+                       this->next = freelist;
+                       freelist = const_cast<OSMetaClassBase *>(this);
+               }
+       }
+
+       virtual void
+       taggedRetain(void *tag) const
+       {
+               OSPTR_LOG("tag[%p] ", tag);
+               retain();
+       }
+
+       virtual void
+       taggedRelease(void *tag) const
+       {
+               OSPTR_LOG("tag[%p] ", tag);
+               release();
+       }
+};
+
+int OSMetaClassBase::id_counter;
+OSMetaClassBase *OSMetaClassBase::freelist;
+
+void *OSMetaClassBase::type_id;
+
+#define OSTypeID(T) T::type_id
+#define OSTypeAlloc(T) new T
+#define OSDynamicCast(T, p) dynamic_cast<T *>(p)
+
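+/*
+ * Defining LIBKERN_SMART_POINTERS before including OSPtr.h selects the
+ * smart-pointer implementation; osptr_dumb.cpp includes the same header
+ * without it, where OSPtr<T> is expected to be a plain pointer.
+ */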
+#define LIBKERN_SMART_POINTERS
+#include <libkern/c++/OSPtr.h>
+
+class Base : public OSMetaClassBase {
+public:
+       Base() : OSMetaClassBase()
+       {
+       }
+};
+
+class Derived : public Base {
+public:
+       Derived() : Base()
+       {
+       }
+};
+
+class Other : public OSMetaClassBase {
+public:
+       Other() : OSMetaClassBase()
+       {
+       }
+};
+
+typedef OSPtr<Base> BasePtr;
+typedef OSPtr<Derived> DerivedPtr;
+typedef OSPtr<Other> OtherPtr;
+
+static void
+default_constructor()
+{
+       BasePtr a;
+       T_ASSERT_NULL(a.get(), "Default NULL construction");
+       T_ASSERT_EQ_INT(num_instances, 0, "No instances created");
+}
+
+static void
+null_constructor()
+{
+       BasePtr a(nullptr);
+       T_ASSERT_NULL(a.get(), "Default NULL construction");
+       T_ASSERT_EQ_INT(num_instances, 0, "No instances created");
+}
+
+static void
+raw_constructor()
+{
+       Base *a = new Base();
+       T_ASSERT_EQ_INT(num_instances, 1, "Created instance");
+
+       {
+               BasePtr p(a);
+
+               T_ASSERT_EQ_INT(num_instances, 1, "No new instance");
+               T_ASSERT_EQ_PTR(p.get(), a, "osptr bound to correct object");
+               T_ASSERT_EQ_INT(a->refcount, 2, "Object refcount incremented");
+       }
+
+       T_ASSERT_EQ_INT(a->refcount, 1, "Object refcount decremented");
+       a->release();
+       T_ASSERT_EQ_INT(num_instances, 0, "All instances released");
+}
+
+static void
+alloc()
+{
+       BasePtr a = BasePtr::alloc();
+
+       T_ASSERT_NOTNULL(a.get(), "osptr seated");
+       T_ASSERT_EQ_INT(num_instances, 1, "Instance created");
+       T_ASSERT_EQ_INT(a->refcount, 1, "Reference created");
+}
+
+static void
+destroy()
+{
+       {
+               BasePtr a = BasePtr::alloc();
+               T_ASSERT_EQ_INT(num_instances, 1, "Instance created");
+       }
+
+       T_ASSERT_EQ_INT(num_instances, 0, "All instances released");
+}
+
+static void
+copy()
+{
+       BasePtr a = BasePtr::alloc();
+       BasePtr b;
+       int a_id = a->inst_id;
+
+       BasePtr a_copy(a);
+
+       T_ASSERT_EQ_INT(a_copy->inst_id, a_id, NULL);
+       T_ASSERT_EQ_INT(a->refcount, 2, NULL);
+       T_ASSERT_EQ_INT(a_copy->refcount, 2, NULL);
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+       T_EXPECT_EQ_INT(num_retains, 1, NULL);
+
+       BasePtr b_copy(b);
+       T_ASSERT_NULL(b_copy.get(), "Copy null osptr");
+
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+       T_EXPECT_EQ_INT(num_retains, 1, NULL);
+
+       BasePtr a_copy2 = a;
+       T_ASSERT_EQ_PTR(a_copy2.get(), a.get(), NULL);
+
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+       T_EXPECT_EQ_INT(num_retains, 2, NULL);
+       T_EXPECT_EQ_INT(num_releases, 0, NULL);
+}
+
+static void
+copy_subclass()
+{
+       auto a = DerivedPtr::alloc();
+       BasePtr b(a);
+
+       T_ASSERT_EQ_PTR(a.get(), b.get(), NULL);
+       T_ASSERT_EQ_INT(b->refcount, 2, NULL);
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+
+       a = nullptr;
+       T_ASSERT_NOTNULL(b.get(), NULL);
+       T_ASSERT_EQ_INT(b->refcount, 1, NULL);
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+}
+
+static void
+assign()
+{
+       int a_id, b_id;
+
+       BasePtr p;
+       BasePtr a = BasePtr::alloc();
+       BasePtr b = BasePtr::alloc();
+
+       a_id = a->inst_id;
+       b_id = b->inst_id;
+
+       p = a;
+
+       T_ASSERT_EQ_PTR(p.get(), a.get(), "Assigned osptr references same object");
+       T_ASSERT_EQ_INT(p->inst_id, a_id, NULL);
+       T_ASSERT_EQ_INT(a->refcount, 2, "Assigned osptr bumps refcount");
+       T_QUIET; T_ASSERT_TRUE(b->refcount == 1, NULL);
+
+       p = b;
+
+       T_ASSERT_EQ_PTR(p.get(), b.get(), "Assigned osptr references same object");
+       T_ASSERT_EQ_INT(p->inst_id, b_id, NULL);
+       T_ASSERT_EQ_INT(a->refcount, 1, "Previous assignee drops reference");
+       T_ASSERT_EQ_INT(b->refcount, 2, "New assignee bumps reference");
+
+       T_ASSERT_EQ_INT(a->inst_id, a_id, NULL);
+       T_ASSERT_EQ_INT(b->inst_id, b_id, NULL);
+
+       a = nullptr;
+
+       T_ASSERT_EQ_INT(num_instances, 1, "Assignment to null releases object");
+
+       b = nullptr;
+       p = nullptr;
+
+       T_ASSERT_EQ_INT(num_instances, 0, "All instances released");
+}
+
+static void
+assign_raw()
+{
+       Base *a1 = new Base();
+       Base *a2 = new Base();
+
+       {
+               BasePtr p;
+
+               p = a1;
+               T_ASSERT_EQ_PTR(p.get(), a1, NULL);
+               T_ASSERT_EQ_INT(a1->refcount, 2, NULL);
+               T_ASSERT_EQ_INT(a2->refcount, 1, NULL);
+
+               p = a2;
+               T_ASSERT_EQ_PTR(p.get(), a2, NULL);
+               T_ASSERT_EQ_INT(a1->refcount, 1, NULL);
+               T_ASSERT_EQ_INT(a2->refcount, 2, NULL);
+       }
+
+       T_ASSERT_EQ_INT(a1->refcount, 1, NULL);
+       T_ASSERT_EQ_INT(a2->refcount, 1, NULL);
+
+       a1->release();
+       a2->release();
+
+       T_ASSERT_EQ_INT(num_instances, 0, "All instances released");
+}
+
+static void
+assign_null()
+{
+       BasePtr a = BasePtr::alloc();
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+
+       a = nullptr;
+
+       T_ASSERT_NULL(a.get(), NULL);
+       T_ASSERT_EQ_INT(num_instances, 0, "No instances created");
+
+       a = BasePtr::alloc();
+       BasePtr b(a.get());
+
+       T_ASSERT_EQ_INT(a->refcount, 2, NULL);
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+
+       b = nullptr;
+
+       T_ASSERT_EQ_INT(a->refcount, 1, NULL);
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+
+       a = nullptr;
+
+       T_ASSERT_EQ_INT(num_instances, 0, "All instances released");
+}
+
+static void
+assign_subclass()
+{
+       int a_id, b_id;
+
+       OSPtr<OSMetaClassBase> base;
+       BasePtr a = BasePtr::alloc();
+       BasePtr b = BasePtr::alloc();
+
+       a_id = a->inst_id;
+       b_id = b->inst_id;
+
+       base = a;
+
+       T_ASSERT_TRUE(base.get() == static_cast<OSMetaClassBase *>(a.get()), NULL);
+       T_ASSERT_TRUE(base->inst_id == a_id, NULL);
+       T_ASSERT_TRUE(a->refcount == 2, NULL);
+       T_ASSERT_TRUE(b->refcount == 1, NULL);
+
+       base = b;
+
+       T_ASSERT_TRUE(base.get() == static_cast<OSMetaClassBase *>(b.get()), NULL);
+       T_ASSERT_TRUE(base->inst_id == b_id, NULL);
+       T_ASSERT_TRUE(a->refcount == 1, NULL);
+       T_ASSERT_TRUE(b->refcount == 2, NULL);
+
+       T_ASSERT_TRUE(a->inst_id == a_id, NULL);
+       T_ASSERT_TRUE(b->inst_id == b_id, NULL);
+
+       a = nullptr;
+
+       T_ASSERT_TRUE(num_instances == 1, NULL);
+
+       b = nullptr;
+       base = nullptr;
+
+       T_ASSERT_EQ_INT(num_instances, 0, "All instances released");
+}
+
+static void
+assign_compatible()
+{
+       OSPtr<Base> a = OSPtr<Base>::alloc();
+       OSPtr<const Base> b = a;
+       T_ASSERT_EQ_PTR(a.get(), b.get(), NULL);
+
+       OSPtr<Derived> c = OSPtr<Derived>::alloc();
+       OSPtr<Base> d = c;
+       T_ASSERT_EQ_PTR(c.get(), d.get(), NULL);
+}
+
+static void
+move()
+{
+       OSPtr<const Base> a = OSPtr<const Base>::alloc();
+       int a_id = a->inst_id;
+
+       OSPtr<const Base> b(os::move(a));
+
+       T_ASSERT_TRUE(a.get() == NULL, NULL);
+       T_ASSERT_TRUE(b->inst_id == a_id, NULL);
+       T_ASSERT_TRUE(b->refcount == 1, NULL);
+       T_ASSERT_TRUE(num_instances == 1, NULL);
+       T_EXPECT_EQ_INT(num_retains, 0, NULL);
+}
+
+static void
+move_assign()
+{
+       OSPtr<const Base> a = OSPtr<const Base>::alloc();
+       OSPtr<const Base> b = OSPtr<const Base>::alloc();
+       int a_id = a->inst_id;
+       int b_id = b->inst_id;
+
+       OSPtr<const Base> d;
+
+       d = os::move(a);
+
+       T_ASSERT_TRUE(a.get() == NULL, NULL);
+       T_ASSERT_TRUE(d->inst_id == a_id, NULL);
+       T_ASSERT_TRUE(d->refcount == 1, NULL);
+       T_ASSERT_TRUE(num_instances == 2, NULL);
+
+       d = os::move(b);
+       T_ASSERT_TRUE(a.get() == NULL, NULL);
+       T_ASSERT_TRUE(b.get() == NULL, NULL);
+       T_ASSERT_TRUE(d->inst_id == b_id, NULL);
+       T_ASSERT_TRUE(d->refcount == 1, NULL);
+       T_ASSERT_TRUE(num_instances == 1, NULL);
+       T_EXPECT_EQ_INT(num_retains, 0, NULL);
+}
+
+static void
+move_assign_null()
+{
+       BasePtr a = BasePtr::alloc();
+       BasePtr b = a;
+
+       T_EXPECT_EQ_INT(num_retains, 1, NULL);
+
+       a = os::move(nullptr);
+
+       T_ASSERT_TRUE(a.get() == NULL, NULL);
+       T_ASSERT_TRUE(b->refcount == 1, NULL);
+
+       b = os::move(nullptr);
+
+       T_ASSERT_EQ_INT(num_instances, 0, "All instances released");
+       T_EXPECT_EQ_INT(num_retains, 1, NULL);
+}
+
+static void
+move_assign_raw()
+{
+       BasePtr a = BasePtr::alloc();
+       Base *b = new Base;
+       Base *tmp = b;
+
+       T_ASSERT_EQ_INT(num_instances, 2, NULL);
+
+       a = os::move(tmp);
+
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+       T_ASSERT_NULL(tmp, NULL);
+       T_ASSERT_EQ_PTR(a.get(), b, NULL);
+       T_ASSERT_EQ_INT(a->refcount, 2, NULL);
+       b->release();
+       T_ASSERT_EQ_INT(a->refcount, 1, NULL);
+}
+
+static void
+move_assign_subclass()
+{
+       auto a = DerivedPtr::alloc();
+       BasePtr b;
+
+       b = os::move(a);
+
+       T_ASSERT_NULL(a.get(), NULL);
+       T_ASSERT_NOTNULL(b.get(), NULL);
+       T_ASSERT_EQ_INT(b->refcount, 1, NULL);
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+}
+
+static void
+move_assign_self()
+{
+       OSPtr<const Base> a = OSPtr<const Base>::alloc();
+       int a_id = a->inst_id;
+
+       a = os::move(a);
+
+       T_ASSERT_NOTNULL(a.get(), "osptr seated");
+       T_ASSERT_TRUE(a->inst_id == a_id, NULL);
+       T_ASSERT_TRUE(a->refcount == 1, NULL);
+       T_ASSERT_TRUE(num_instances == 1, NULL);
+       T_EXPECT_EQ_INT(num_retains, 0, NULL);
+}
+
+static void
+test_const_cast()
+{
+       OSPtr<const Base> a = OSPtr<const Base>::alloc();
+
+       OSPtr<Base> b;
+
+       b = a.const_pointer_cast<Base>();
+
+       T_ASSERT_TRUE(a.get() == b.get(), NULL);
+       T_ASSERT_TRUE(a->refcount == 2, NULL);
+       T_ASSERT_TRUE(b->refcount == 2, NULL);
+
+       T_ASSERT_TRUE(num_instances == 1, NULL);
+       T_EXPECT_EQ_INT(num_retains, 1, NULL);
+}
+
+static void
+const_cast_move()
+{
+       OSPtr<const Base> a = OSPtr<const Base>::alloc();
+       int a_id = a->inst_id;
+
+       OSPtr<Base> b;
+
+       b = os::move(a).const_pointer_cast<Base>();
+
+       T_ASSERT_TRUE(a.get() == NULL, NULL);
+       T_ASSERT_TRUE(b->inst_id == a_id, NULL);
+       T_ASSERT_TRUE(b->refcount == 1, NULL);
+
+       T_ASSERT_TRUE(num_instances == 1, NULL);
+       T_EXPECT_EQ_INT(num_retains, 0, NULL);
+}
+
+static void
+const_cast_move_self()
+{
+       BasePtr a = BasePtr::alloc();
+       int a_id = a->inst_id;
+
+       a = os::move(a).const_pointer_cast<Base>();
+
+       T_ASSERT_NOTNULL(a.get(), "osptr seated");
+       T_ASSERT_TRUE(a->inst_id == a_id, NULL);
+       T_ASSERT_TRUE(a->refcount == 1, NULL);
+       T_ASSERT_TRUE(num_instances == 1, NULL);
+       T_ASSERT_TRUE(num_retains == 0, NULL);
+}
+
+static void
+test_static_cast()
+{
+       DerivedPtr a = DerivedPtr::alloc();
+
+       BasePtr b;
+
+       b = a.static_pointer_cast<Base>();
+
+       T_ASSERT_TRUE(a.get() == b.get(), NULL);
+       T_ASSERT_TRUE(a->refcount == 2, NULL);
+       T_ASSERT_TRUE(b->refcount == 2, NULL);
+
+       T_ASSERT_TRUE(num_instances == 1, NULL);
+       T_EXPECT_TRUE(num_retains == 1, NULL);
+}
+
+static void
+static_cast_move()
+{
+       DerivedPtr a = DerivedPtr::alloc();
+       int a_id = a->inst_id;
+
+       BasePtr b;
+
+       b = os::move(a).static_pointer_cast<Base>();
+
+       T_ASSERT_NULL(a.get(), NULL);
+       T_ASSERT_EQ_INT(b->inst_id, a_id, NULL);
+       T_ASSERT_EQ_INT(b->refcount, 1, NULL);
+
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+       T_EXPECT_EQ_INT(num_retains, 0, NULL);
+}
+
+static void
+static_cast_move_self()
+{
+       BasePtr a = BasePtr::alloc();
+       int a_id = a->inst_id;
+
+       a = os::move(a).static_pointer_cast<Base>();
+
+       T_ASSERT_NOTNULL(a.get(), "osptr seated");
+       T_ASSERT_TRUE(a->inst_id == a_id, NULL);
+       T_ASSERT_TRUE(a->refcount == 1, NULL);
+       T_ASSERT_TRUE(num_instances == 1, NULL);
+       T_ASSERT_TRUE(num_retains == 0, NULL);
+}
+
+static void
+tagged_ptr()
+{
+       OSTaggedPtr<Base, Derived> a;
+       auto b = OSTaggedPtr<Derived, Base>::alloc();
+
+       T_ASSERT_NULL(a.get(), NULL);
+       T_ASSERT_NOTNULL(b.get(), NULL);
+
+       T_ASSERT_TRUE(typeid(a.get()) == typeid(Base *), NULL);
+       T_ASSERT_TRUE(typeid(b.get()) == typeid(Derived *), NULL);
+}
+
+static void
+attach()
+{
+       Base *a = new Base();
+       BasePtr b;
+       b.attach(os::move(a));
+
+       T_ASSERT_NULL(a, NULL);
+       T_ASSERT_NOTNULL(b.get(), NULL);
+       T_ASSERT_EQ_INT(b->refcount, 1, NULL);
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+       T_ASSERT_EQ_INT(num_retains, 0, NULL);
+
+       b.attach(new Base);
+       T_ASSERT_NOTNULL(b.get(), NULL);
+       T_ASSERT_EQ_INT(b->refcount, 1, NULL);
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+       T_ASSERT_EQ_INT(num_retains, 0, NULL);
+       T_ASSERT_EQ_INT(num_releases, 1, NULL);
+}
+
+static void
+detach()
+{
+       BasePtr a = BasePtr::alloc();
+       Base *p = a.detach();
+
+       T_ASSERT_NULL(a.get(), NULL);
+       T_ASSERT_NOTNULL(p, NULL);
+       T_ASSERT_EQ_INT(p->refcount, 1, NULL);
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+       T_ASSERT_EQ_INT(num_retains, 0, NULL);
+       T_ASSERT_EQ_INT(num_releases, 0, NULL);
+
+       BasePtr b(os::move(p), os::no_retain); // re-seat so that 'p' gets freed
+}
+
+static void
+foreign()
+{
+       auto a = OSPtr<Base>::alloc();
+       auto b = OSTaggedPtr<Base, Derived>::alloc();
+
+       void *a_ptr = a.get();
+       void *b_ptr = b.get();
+
+       a.swap(b);
+
+       T_ASSERT_EQ_PTR(b.get(), a_ptr, NULL);
+       T_ASSERT_EQ_PTR(a.get(), b_ptr, NULL);
+       T_ASSERT_EQ_INT(a->refcount, 1, NULL);
+       T_ASSERT_EQ_INT(b->refcount, 1, NULL);
+       T_ASSERT_EQ_INT(num_instances, 2, NULL);
+       T_ASSERT_GE_INT(num_retains, 2, NULL);
+}
+
+static void
+test_dynamic_cast()
+{
+       auto a = DerivedPtr::alloc();
+       T_ASSERT_NOTNULL(a.get(), NULL);
+       BasePtr b = a;
+
+       auto c = b.dynamic_pointer_cast<Derived>();
+       T_ASSERT_NOTNULL(c.get(), NULL);
+
+       T_ASSERT_EQ_INT(c->refcount, 3, NULL);
+       T_ASSERT_EQ_INT(num_instances, 1, NULL);
+
+       auto d = OtherPtr::alloc();
+       auto e = d.dynamic_pointer_cast<Derived>();
+       auto f = OSDynamicCastPtr<Derived>(OtherPtr::alloc());
+
+       T_ASSERT_NULL(e.get(), NULL);
+       T_ASSERT_NULL(f.get(), NULL);
+
+       T_ASSERT_EQ_INT(num_instances, 2, NULL);
+       T_ASSERT_EQ_INT(d->refcount, 1, NULL);
+
+       auto g = OSDynamicCastPtr<Base>(DerivedPtr::alloc());
+       T_ASSERT_EQ_INT(num_instances, 3, NULL);
+       T_ASSERT_EQ_INT(g->refcount, 1, NULL);
+}
+
+#define OSPTR_TEST_DECL(name) \
+       T_DECL(name, #name) { \
+               num_instances = 0; \
+               num_retains = 0; \
+               num_releases = 0; \
+               name(); \
+               T_QUIET; T_ASSERT_EQ_INT(num_instances, 0, "Instance leak"); \
+       }
+
+OSPTR_TEST_DECL(default_constructor)
+OSPTR_TEST_DECL(null_constructor)
+OSPTR_TEST_DECL(raw_constructor)
+OSPTR_TEST_DECL(alloc)
+OSPTR_TEST_DECL(destroy)
+OSPTR_TEST_DECL(copy)
+OSPTR_TEST_DECL(copy_subclass)
+OSPTR_TEST_DECL(assign)
+OSPTR_TEST_DECL(assign_raw)
+OSPTR_TEST_DECL(assign_null)
+OSPTR_TEST_DECL(assign_subclass)
+OSPTR_TEST_DECL(assign_compatible)
+OSPTR_TEST_DECL(move)
+OSPTR_TEST_DECL(move_assign)
+OSPTR_TEST_DECL(move_assign_null)
+OSPTR_TEST_DECL(move_assign_raw)
+OSPTR_TEST_DECL(move_assign_subclass)
+OSPTR_TEST_DECL(move_assign_self)
+OSPTR_TEST_DECL(test_const_cast)
+OSPTR_TEST_DECL(const_cast_move)
+OSPTR_TEST_DECL(const_cast_move_self)
+OSPTR_TEST_DECL(test_static_cast)
+OSPTR_TEST_DECL(static_cast_move)
+OSPTR_TEST_DECL(static_cast_move_self)
+OSPTR_TEST_DECL(tagged_ptr)
+OSPTR_TEST_DECL(attach)
+OSPTR_TEST_DECL(detach)
+OSPTR_TEST_DECL(foreign)
+OSPTR_TEST_DECL(test_dynamic_cast)
+
+/*
+ * Test that the "trivial_abi" attribute works as expected
+ */
+
+struct Complex {
+       uintptr_t val;
+       Complex() : val(71)
+       {
+       }
+       ~Complex()
+       {
+       }
+};
+
+struct Trivial {
+       uintptr_t val;
+       Trivial() : val(42)
+       {
+       }
+       ~Trivial()
+       {
+       }
+} __attribute__((trivial_abi));
+
+/* defined in osptr_helper.cpp */
+__BEGIN_DECLS
+extern uintptr_t pass_trivial(Trivial);
+extern uintptr_t pass_complex(Complex);
+__END_DECLS
+Trivial return_trivial(uintptr_t);
+Complex return_complex(uintptr_t);
+
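+/*
+ * The helpers take and return uintptr_t on the C side: if the C++ class is
+ * passed in registers (trivial_abi), the helper sees the member value itself;
+ * if it is passed indirectly, as a class with a non-trivial destructor is by
+ * default, the helper sees an address instead.
+ */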
+T_DECL(trivial_abi, "Test trivial_abi classes are passed by value")
+{
+       Trivial a;
+       uintptr_t x = pass_trivial(a);
+       T_EXPECT_EQ_ULONG(a.val, x, "Trivial class argument passed by-value");
+
+       Complex b;
+       uintptr_t y = pass_complex(b);
+       T_EXPECT_NE_ULONG(b.val, y, "Non-trivial class argument passed by-reference");
+
+       Trivial c = return_trivial(55);
+       T_EXPECT_EQ_ULONG(c.val, 55UL, "Trivial class returned by-value");
+
+       Complex d = return_complex(99);
+       T_EXPECT_NE_ULONG(d.val, 99UL, "Non-trivial class returned by-reference");
+}
+
+#pragma clang diagnostic pop
diff --git a/tests/osptr_dumb.cpp b/tests/osptr_dumb.cpp
new file mode 100644 (file)
index 0000000..8cb7e4f
--- /dev/null
@@ -0,0 +1,80 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <stdio.h>
+#include <assert.h>
+#include <typeinfo>
+
+#if 0
+# define OSPTR_LOG T_LOG
+#elif 0
+# define OSPTR_LOG printf
+#else
+# define OSPTR_LOG(x...)  do { } while(0)
+#endif
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("osptr"),
+       T_META_CHECK_LEAKS(false),
+       T_META_RUN_CONCURRENTLY(true)
+       );
+
+class OSMetaClassBase
+{
+public:
+       virtual void
+       retain() const
+       {
+       }
+       virtual void
+       release() const
+       {
+       }
+       virtual void
+       taggedRetain(void *tag) const
+       {
+       }
+       virtual void
+       taggedRelease(void *tag) const
+       {
+       }
+
+       static void *type_id;
+};
+
+void *OSMetaClassBase::type_id;
+
+#define OSTypeAlloc(T) new T
+#define OSTypeID(T) T::type_id
+
+#include <libkern/c++/OSPtr.h>
+
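+/*
+ * Without LIBKERN_SMART_POINTERS, OSPtr<T> is expected to collapse to a
+ * plain T *, which the typeid checks below confirm.
+ */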
+class Base : public OSMetaClassBase {
+public:
+       Base() : OSMetaClassBase()
+       {
+       }
+};
+
+class Derived : public Base {
+public:
+       Derived() : Base()
+       {
+       }
+};
+
+typedef OSPtr<Base> BasePtr;
+typedef OSPtr<Derived> DerivedPtr;
+
+T_DECL(dumb_osptr, "Dumb OSPtrs work")
+{
+       BasePtr x = nullptr;
+       T_ASSERT_EQ_PTR(x, nullptr, NULL);
+       T_ASSERT_TRUE(typeid(BasePtr) == typeid(Base *), NULL);
+       T_ASSERT_TRUE(typeid(DerivedPtr) == typeid(Derived *), NULL);
+
+       OSTaggedPtr<Base, Base> y = nullptr;
+       OSTaggedPtr<Derived, Base> z = nullptr;
+       T_ASSERT_EQ_PTR(y, nullptr, NULL);
+       T_ASSERT_TRUE(typeid(y) == typeid(Base *), NULL);
+       T_ASSERT_TRUE(typeid(z) == typeid(Derived *), NULL);
+}
diff --git a/tests/osptr_helper.cpp b/tests/osptr_helper.cpp
new file mode 100644 (file)
index 0000000..28eef3d
--- /dev/null
@@ -0,0 +1,24 @@
+#include <stdint.h>
+
+extern "C" {
+uintptr_t
+pass_trivial(uintptr_t x)
+{
+       return x;
+}
+uintptr_t
+pass_complex(uintptr_t x)
+{
+       return x;
+}
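+/*
+ * Itanium-mangled names for return_trivial(uintptr_t) and
+ * return_complex(uintptr_t): the C++ test links against these while the
+ * definitions keep a plain integer ABI.
+ */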
+uintptr_t
+_Z14return_trivialm(uintptr_t x)
+{
+       return x;
+}
+uintptr_t
+_Z14return_complexm(uintptr_t x)
+{
+       return x;
+}
+}
index 3e8aa68d8ff1178ae441b851e475dd4de912571e..e30acbb5dfdf2374a491f17c793ef6711dad8060 100644 (file)
@@ -1,6 +1,7 @@
 #include <stdio.h>
 #include <signal.h>
 #include <sys/sysctl.h>
+#include <sys/kern_memorystatus.h>
 #include <mach-o/dyld.h>
 #include <perfcheck_keys.h>
 
@@ -32,6 +33,8 @@ enum {
        X(DISPATCH_SOURCE_CREATE_FAILED) \
        X(INITIAL_SIGNAL_TO_PARENT_FAILED) \
        X(SIGNAL_TO_PARENT_FAILED) \
+       X(MEMORYSTATUS_CONTROL_FAILED) \
+       X(IS_FREEZABLE_NOT_AS_EXPECTED) \
        X(EXIT_CODE_MAX)
 
 #define EXIT_CODES_ENUM(VAR) VAR,
@@ -47,8 +50,9 @@ static const char *exit_codes_str[] = {
 #define SYSCTL_FREEZE_TO_MEMORY         "kern.memorystatus_freeze_to_memory=1"
 
 static pid_t pid = -1;
-static dt_stat_t r;
-static dt_stat_time_t s;
+static dt_stat_t ratio;
+static dt_stat_time_t compr_time;
+static dt_stat_time_t decompr_time;
 
 void allocate_zero_pages(char **buf, int num_pages, int vmpgsize);
 void allocate_mostly_zero_pages(char **buf, int num_pages, int vmpgsize);
@@ -128,7 +132,7 @@ freeze_helper_process(void)
        T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_input_bytes", &input_before, &length, NULL, 0),
            "failed to query vm.compressor_input_bytes");
 
-       T_STAT_MEASURE(s) {
+       T_STAT_MEASURE(compr_time) {
                ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &pid, sizeof(pid));
                errno_sysctl_freeze = errno;
        };
@@ -152,7 +156,7 @@ freeze_helper_process(void)
                T_END;
        }
 
-       dt_stat_add(r, (double)(input_after - input_before) / (double)(compressed_after - compressed_before));
+       dt_stat_add(ratio, (double)(input_after - input_before) / (double)(compressed_after - compressed_before));
 
        ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &pid, sizeof(pid));
        T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_thaw failed");
@@ -163,8 +167,6 @@ freeze_helper_process(void)
 void
 cleanup(void)
 {
-       int status = 0;
-
        /* No helper process. */
        if (pid == -1) {
                return;
@@ -182,9 +184,10 @@ run_compressor_test(int size_mb, int page_type)
        char **launch_tool_args;
        char testpath[PATH_MAX];
        uint32_t testpath_buf_size;
-       dispatch_source_t ds_freeze, ds_proc;
+       dispatch_source_t ds_freeze, ds_proc, ds_decompr;
        int freeze_enabled;
        size_t length;
+       __block bool decompr_latency_is_stable = false;
 
        length = sizeof(freeze_enabled);
        T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.freeze_enabled", &freeze_enabled, &length, NULL, 0),
@@ -196,24 +199,35 @@ run_compressor_test(int size_mb, int page_type)
 
        T_ATEND(cleanup);
 
-       r = dt_stat_create("(input bytes / compressed bytes)", "compression_ratio");
-       s = dt_stat_time_create("compressor_latency");
+       ratio = dt_stat_create("(input bytes / compressed bytes)", "compression_ratio");
+       compr_time = dt_stat_time_create("compressor_latency");
+
        // This sets the A/B failure threshold at 50% of baseline for compressor_latency
-       dt_stat_set_variable(s, kPCFailureThresholdPctVar, 50.0);
+       dt_stat_set_variable((struct dt_stat *)compr_time, kPCFailureThresholdPctVar, 50.0);
+
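+       /* The helper raises SIGUSR2 once its decompression-latency stat is stable. */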
+       signal(SIGUSR2, SIG_IGN);
+       ds_decompr = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR2, 0, dispatch_get_main_queue());
+       T_QUIET; T_ASSERT_NOTNULL(ds_decompr, "dispatch_source_create (ds_decompr)");
+
+       dispatch_source_set_event_handler(ds_decompr, ^{
+               decompr_latency_is_stable = true;
+       });
+       dispatch_activate(ds_decompr);
 
        signal(SIGUSR1, SIG_IGN);
        ds_freeze = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
        T_QUIET; T_ASSERT_NOTNULL(ds_freeze, "dispatch_source_create (ds_freeze)");
 
        dispatch_source_set_event_handler(ds_freeze, ^{
-               if (!dt_stat_stable(s)) {
+               if (!(dt_stat_stable(compr_time) && decompr_latency_is_stable)) {
                        freeze_helper_process();
                } else {
-                       dt_stat_finalize(s);
-                       dt_stat_finalize(r);
+                       dt_stat_finalize(compr_time);
+                       dt_stat_finalize(ratio);
 
                        kill(pid, SIGKILL);
                        dispatch_source_cancel(ds_freeze);
+                       dispatch_source_cancel(ds_decompr);
                }
        });
        dispatch_activate(ds_freeze);
@@ -266,7 +280,7 @@ run_compressor_test(int size_mb, int page_type)
 }
 
 T_HELPER_DECL(allocate_pages, "allocates pages to compress") {
-       int i, j, ret, size_mb, page_type, vmpgsize;
+       int i, j, ret, size_mb, page_type, vmpgsize, freezable_state;
        size_t vmpgsize_length;
        __block int num_pages;
        __block char **buf;
@@ -312,6 +326,20 @@ T_HELPER_DECL(allocate_pages, "allocates pages to compress") {
                i = buf[j][0];
        }
 
+       decompr_time = dt_stat_time_create("decompression_latency");
+
+       /* Opt in to freezing. */
+       printf("[%d] Setting state to freezable\n", getpid());
+       if (memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), 1, NULL, 0) != KERN_SUCCESS) {
+               exit(MEMORYSTATUS_CONTROL_FAILED);
+       }
+
+       /* Verify that the state has been set correctly */
+       freezable_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0);
+       if (freezable_state != 1) {
+               exit(IS_FREEZABLE_NOT_AS_EXPECTED);
+       }
+
        dispatch_after(dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC), dispatch_get_main_queue(), ^{
                /* Signal to the parent that we're done allocating and it's ok to freeze us */
                printf("[%d] Sending initial signal to parent to begin freezing\n", getpid());
@@ -326,13 +354,33 @@ T_HELPER_DECL(allocate_pages, "allocates pages to compress") {
                exit(DISPATCH_SOURCE_CREATE_FAILED);
        }
 
+       __block bool collect_dt_stat_measurements = true;
+
        dispatch_source_set_event_handler(ds_signal, ^{
                volatile int tmp;
+               uint64_t decompr_start_time, decompr_end_time;
+
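+               /*
+                * Touching every page right after a thaw forces decompression, so
+                * the elapsed time below approximates decompression latency.
+                */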
+               decompr_start_time = mach_absolute_time();
 
                /* Make sure all the pages are accessed before trying to freeze again */
                for (int x = 0; x < num_pages; x++) {
                        tmp = buf[x][0];
                }
+
+               decompr_end_time = mach_absolute_time();
+
+               if (collect_dt_stat_measurements) {
+                       if (dt_stat_stable(decompr_time)) {
+                               collect_dt_stat_measurements = false;
+                               dt_stat_finalize(decompr_time);
+                               if (kill(getppid(), SIGUSR2) != 0) {
+                                       exit(SIGNAL_TO_PARENT_FAILED);
+                               }
+                       } else {
+                               dt_stat_mach_time_add(decompr_time, decompr_end_time - decompr_start_time);
+                       }
+               }
+
                if (kill(getppid(), SIGUSR1) != 0) {
                        exit(SIGNAL_TO_PARENT_FAILED);
                }
@@ -348,42 +396,49 @@ T_HELPER_DECL(allocate_pages, "allocates pages to compress") {
 #ifndef DT_IOSMARK
 T_DECL(compr_10MB_zero,
     "Compression latency for 10MB - zero pages",
+    T_META_ASROOT(true),
     T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) {
        run_compressor_test(10, ALL_ZEROS);
 }
 
 T_DECL(compr_10MB_mostly_zero,
     "Compression latency for 10MB - mostly zero pages",
+    T_META_ASROOT(true),
     T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) {
        run_compressor_test(10, MOSTLY_ZEROS);
 }
 
 T_DECL(compr_10MB_random,
     "Compression latency for 10MB - random pages",
+    T_META_ASROOT(true),
     T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) {
        run_compressor_test(10, RANDOM);
 }
 
 T_DECL(compr_10MB_typical,
     "Compression latency for 10MB - typical pages",
+    T_META_ASROOT(true),
     T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) {
        run_compressor_test(10, TYPICAL);
 }
 
 T_DECL(compr_100MB_zero,
     "Compression latency for 100MB - zero pages",
+    T_META_ASROOT(true),
     T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) {
        run_compressor_test(100, ALL_ZEROS);
 }
 
 T_DECL(compr_100MB_mostly_zero,
     "Compression latency for 100MB - mostly zero pages",
+    T_META_ASROOT(true),
     T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) {
        run_compressor_test(100, MOSTLY_ZEROS);
 }
 
 T_DECL(compr_100MB_random,
     "Compression latency for 100MB - random pages",
+    T_META_ASROOT(true),
     T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) {
        run_compressor_test(100, RANDOM);
 }
@@ -391,6 +446,7 @@ T_DECL(compr_100MB_random,
 
 T_DECL(compr_100MB_typical,
     "Compression latency for 100MB - typical pages",
+    T_META_ASROOT(true),
     T_META_SYSCTL_INT(SYSCTL_FREEZE_TO_MEMORY)) {
        run_compressor_test(100, TYPICAL);
 }
diff --git a/tests/perf_kdebug.c b/tests/perf_kdebug.c
deleted file mode 100644 (file)
index d2861ba..0000000
+++ /dev/null
@@ -1,193 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-#include <darwintest.h>
-
-#include <sys/kdebug.h>
-#include <sys/sysctl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.perf.kdebug"),
-       T_META_ASROOT(true),
-       T_META_CHECK_LEAKS(false),
-       T_META_TAG_PERF
-       );
-
-//
-// Helper functions for direct control over the kernel trace facility.
-//
-
-static void
-_sysctl_reset()
-{
-       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE };
-       if (sysctl(mib, 3, NULL, NULL, NULL, 0)) {
-               T_FAIL("KERN_KDREMOVE sysctl failed");
-       }
-}
-
-static void
-_sysctl_setbuf(uint32_t capacity)
-{
-       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, (int)capacity };
-       if (sysctl(mib, 4, NULL, NULL, NULL, 0)) {
-               T_FAIL("KERN_KDSETBUF sysctl failed");
-       }
-}
-
-static void
-_sysctl_setup()
-{
-       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSETUP };
-       if (sysctl(mib, 3, NULL, NULL, NULL, 0)) {
-               T_FAIL("KERN_KDSETUP sysctl failed");
-       }
-}
-
-static void
-_sysctl_enable(int value)
-{
-       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, value };
-       if (sysctl(mib, 4, NULL, NULL, NULL, 0) < 0) {
-               T_FAIL("KERN_KDENABLE sysctl failed");
-       }
-}
-
-static void
-_sysctl_enable_typefilter(uint8_t* type_filter_bitmap)
-{
-       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSET_TYPEFILTER };
-       size_t needed = KDBG_TYPEFILTER_BITMAP_SIZE;
-       if (sysctl(mib, 3, type_filter_bitmap, &needed, NULL, 0)) {
-               T_FAIL("KERN_KDSET_TYPEFILTER sysctl failed");
-       }
-}
-
-static void
-_sysctl_nowrap(bool is_nowrap)
-{
-       int mib[] = { CTL_KERN, KERN_KDEBUG, is_nowrap ? KERN_KDEFLAGS : KERN_KDDFLAGS, KDBG_NOWRAP };
-       if (sysctl(mib, 4, NULL, NULL, NULL, 0)) {
-               T_FAIL("KDBG_NOWRAP sysctl failed");
-       }
-}
-
-static void
-enable_tracing(bool value)
-{
-       _sysctl_enable(value ? KDEBUG_ENABLE_TRACE : 0);
-}
-
-static void
-enable_typefilter_all_reject()
-{
-       uint8_t type_filter_bitmap[KDBG_TYPEFILTER_BITMAP_SIZE];
-       memset(type_filter_bitmap, 0, sizeof(type_filter_bitmap));
-       _sysctl_enable_typefilter(type_filter_bitmap);
-}
-
-static void
-enable_typefilter_all_pass()
-{
-       uint8_t type_filter_bitmap[KDBG_TYPEFILTER_BITMAP_SIZE];
-       memset(type_filter_bitmap, 0xff, sizeof(type_filter_bitmap));
-       _sysctl_enable_typefilter(type_filter_bitmap);
-}
-
-static void
-loop_kdebug_trace(dt_stat_time_t s)
-{
-       do {
-               dt_stat_token start = dt_stat_time_begin(s);
-               for (uint32_t i = 0; i < 100; i++) {
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-               }
-               dt_stat_time_end_batch(s, 1000, start);
-       } while (!dt_stat_stable(s));
-}
-
-static void
-loop_getppid(dt_stat_time_t s)
-{
-       do {
-               dt_stat_token start = dt_stat_time_begin(s);
-               for (uint32_t i = 0; i < 100; i++) {
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-               }
-               dt_stat_time_end_batch(s, 1000, start);
-       } while (!dt_stat_stable(s));
-}
-
-static void
-reset_kdebug_trace(void)
-{
-       _sysctl_reset();
-}
-
-static void
-test(const char* test_name, void (^pretest_setup)(void), void (*test)(dt_stat_time_t s))
-{
-       T_ATEND(reset_kdebug_trace);
-       _sysctl_reset();
-       _sysctl_setbuf(1000000);
-       _sysctl_nowrap(false);
-       _sysctl_setup();
-
-       pretest_setup();
-
-       dt_stat_time_t s = dt_stat_time_create("%s", test_name);
-
-       test(s);
-
-       dt_stat_finalize(s);
-}
-
-//
-// Begin tests...
-//
-
-T_DECL(kdebug_trace_baseline_syscall,
-    "Test the latency of a syscall while kernel tracing is disabled") {
-       test("kdebug_trace_baseline_syscall", ^{ enable_tracing(false); }, loop_getppid);
-}
-
-T_DECL(kdebug_trace_kdbg_disabled,
-    "Test the latency of kdebug_trace while kernel tracing is disabled") {
-       test("kdebug_trace_kdbg_disabled", ^{ enable_tracing(false); }, loop_kdebug_trace);
-}
-
-T_DECL(kdebug_trace_kdbg_enabled,
-    "Test the latency of kdebug_trace while kernel tracing is enabled with no typefilter") {
-       test("kdebug_trace_kdbg_enabled", ^{ enable_tracing(true); }, loop_kdebug_trace);
-}
-
-T_DECL(kdebug_trace_kdbg_enabled_typefilter_pass,
-    "Test the latency of kdebug_trace while kernel tracing is enabled with a typefilter that passes the event") {
-       test("kdebug_trace_kdbg_enabled_typefilter_pass", ^{ enable_tracing(true); enable_typefilter_all_pass(); }, loop_kdebug_trace);
-}
-
-T_DECL(kdebug_trace_kdbg_enabled_typefilter_reject,
-    "Test the latency of kdebug_trace while kernel tracing is enabled with a typefilter that rejects the event") {
-       test("kdebug_trace_kdbg_enabled_typefilter_reject", ^{ enable_tracing(true); enable_typefilter_all_reject(); }, loop_kdebug_trace);
-}
index 384d3586222a9bb5364443758a9625a962511745..d0f64ab0ad287cf53dedd37dfa4c284bae2d01a1 100644 (file)
@@ -55,10 +55,13 @@ static memregion_config *memregion_config_per_thread;
 static size_t pgsize;
 static int num_threads;
 static int ready_thread_count;
+static int finished_thread_count;
 static dt_stat_time_t runtime;
 static pthread_cond_t start_cvar;
 static pthread_cond_t threads_ready_cvar;
+static pthread_cond_t threads_finished_cvar;
 static pthread_mutex_t ready_thread_count_lock;
+static pthread_mutex_t finished_thread_count_lock;
 
 static void map_mem_regions_default(int fault_type, size_t memsize);
 static void map_mem_regions_single(int fault_type, size_t memsize);
@@ -275,6 +278,15 @@ thread_setup(void *arg)
        T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&ready_thread_count_lock), "pthread_mutex_unlock");
 
        fault_pages(my_index);
+
+       /* Up the finished count */
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&finished_thread_count_lock), "pthread_mutex_lock");
+       finished_thread_count++;
+       if (finished_thread_count == num_threads) {
+               /* All the threads are done. Wake up the main thread */
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_signal(&threads_finished_cvar), "pthread_cond_signal");
+       }
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&finished_thread_count_lock), "pthread_mutex_unlock");
        return NULL;
 }
 
@@ -289,7 +301,10 @@ execute_threads(void)
        T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&threads_ready_cvar, NULL), "pthread_cond_init");
        T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&start_cvar, NULL), "pthread_cond_init");
        T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_init(&ready_thread_count_lock, NULL), "pthread_mutex_init");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&threads_finished_cvar, NULL), "pthread_cond_init");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_init(&finished_thread_count_lock, NULL), "pthread_mutex_init");
        ready_thread_count = 0;
+       finished_thread_count = 0;
 
        threads = (pthread_t *)malloc(sizeof(*threads) * (size_t)num_threads);
        thread_indices = (int *)malloc(sizeof(*thread_indices) * (size_t)num_threads);
@@ -300,20 +315,28 @@ execute_threads(void)
        }
 
        T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&ready_thread_count_lock), "pthread_mutex_lock");
-       if (ready_thread_count != num_threads) {
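+       /* Loop under the lock: pthread_cond_wait can wake spuriously. */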
+       while (ready_thread_count != num_threads) {
                T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&threads_ready_cvar, &ready_thread_count_lock),
                    "pthread_cond_wait");
        }
        T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&ready_thread_count_lock), "pthread_mutex_unlock");
 
        T_STAT_MEASURE(runtime) {
+               /* Ungate the threads */
                T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_broadcast(&start_cvar), "pthread_cond_broadcast");
-               for (thread_index = 0; thread_index < num_threads; thread_index++) {
-                       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_join(threads[thread_index], &thread_retval_ptr),
-                           "pthread_join");
+               /* Wait for the threads to finish */
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&finished_thread_count_lock), "pthread_mutex_lock");
+               while (finished_thread_count != num_threads) {
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&threads_finished_cvar, &finished_thread_count_lock), "pthread_cond_wait");
                }
        };
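+       /* Joins happen outside the timed region so teardown cost is not measured. */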
 
+       /* Join the threads */
+       for (thread_index = 0; thread_index < num_threads; thread_index++) {
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_join(threads[thread_index], &thread_retval_ptr),
+                   "pthread_join");
+       }
+
        free(threads);
        free(thread_indices);
 }
@@ -344,8 +367,6 @@ run_test(int fault_type, int mapping_variant, size_t memsize)
        snprintf(metric_str, 32, "Runtime-%s", variant_str[mapping_variant]);
        runtime = dt_stat_time_create(metric_str);
 
-       // This sets the A/B failure threshold at 50% of baseline for Runtime
-       dt_stat_set_variable((dt_stat_t)runtime, kPCFailureThresholdPctVar, 50.0);
        while (!dt_stat_stable(runtime)) {
                map_mem_regions(fault_type, mapping_variant, memsize);
                execute_threads();
@@ -418,6 +439,9 @@ T_DECL(read_soft_fault_multithreaded,
                nthreads = (int)strtol(e, NULL, 0);
        } else {
                nthreads = get_ncpu();
+               if (nthreads == 1) {
+                       T_SKIP("Skipping multi-threaded test on single core device.");
+               }
        }
        setup_and_run_test(SOFT_FAULT, nthreads);
 }
@@ -439,6 +463,9 @@ T_DECL(zero_fill_fault_multithreaded,
                nthreads = (int)strtol(e, NULL, 0);
        } else {
                nthreads = get_ncpu();
+               if (nthreads == 1) {
+                       T_SKIP("Skipping multi-threaded test on single core device.");
+               }
        }
        setup_and_run_test(ZERO_FILL, nthreads);
 }
index 10a64fbe58f842041b70b630228f7318c546b70e..84f45dce0b431ed10e35447209fb6c6f65a91527 100644 (file)
@@ -33,6 +33,8 @@
 #include <libproc_internal.h>
 #include <TargetConditionals.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define ALLOC_SIZE_LARGE 5*1024*1024
 #define ALLOC_SIZE_SMALL 2*1024*1024
 
diff --git a/tests/pipe_drain.c b/tests/pipe_drain.c
new file mode 100644 (file)
index 0000000..4808e0e
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <darwintest.h>
+#include <darwintest_multiprocess.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <signal.h>
+
+static void
+signal_handler(int sig, siginfo_t *sip __unused, void *ucontext __unused)
+{
+       if (sig == SIGPIPE) {
+               T_FAIL("Received SIGPIPE");
+       }
+
+       exit(141);
+}
+
+static void *
+thread_read(void *arg)
+{
+       int fd = (int) (uintptr_t)arg;
+       char buf[10];
+
+       read(fd, buf, 10);
+       T_LOG("thread returned from read");
+       return 0;
+}
+
+T_DECL(pipe_drain,
+    "test that a pipe with multiple read descriptors can have one closed and drained while a reader blocks on it")
+{
+       int pipe_fd[2];
+       int dup_fd;
+       int ret;
+       char buf[10] = "Hello";
+       pthread_t thread;
+
+       /* Install the signal handler for SIGPIPE */
+
+       struct sigaction sa = {
+               .sa_sigaction = signal_handler,
+               .sa_flags = SA_SIGINFO
+       };
+       sigfillset(&sa.sa_mask);
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(sigaction(SIGPIPE, &sa, NULL), NULL);
+
+       ret = pipe(pipe_fd);
+       T_EXPECT_EQ(ret, 0, NULL);
+
+       dup_fd = dup(pipe_fd[0]);
+       T_EXPECT_GE(dup_fd, 0, NULL);
+
+       pthread_create(&thread, NULL, thread_read, (void *) (uintptr_t) pipe_fd[0]);
+
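+       /* Give the reader thread time to block in read() before closing its fd. */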
+       sleep(5);
+
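+       /* dup_fd still holds a read end open, so this write should succeed and
+        * must not raise SIGPIPE even though pipe_fd[0] was just closed. */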
+       close(pipe_fd[0]);
+       ret = (int)write(pipe_fd[1], buf, strlen(buf) + 1);
+       T_EXPECT_EQ(ret, (int)strlen(buf) + 1, NULL);
+}
diff --git a/tests/pipe_kevent.c b/tests/pipe_kevent.c
new file mode 100644 (file)
index 0000000..8a02261
--- /dev/null
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2019 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <darwintest.h>
+#include <darwintest_multiprocess.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <dispatch/dispatch.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+
+T_DECL(pipe_noblock_kevent,
+    "Set up a non-blocking pipe, register an EVFILT_WRITE kevent on it, and make sure it does not fire when the pipe is full")
+{
+       int fd[2], write_fd;
+       dispatch_queue_t dq1 = dispatch_queue_create("com.apple.test.pipe_noblock_kevent.queue", DISPATCH_QUEUE_SERIAL);
+
+       pipe(fd);
+       write_fd = fd[1];
+       __block int iter = 1;
+
+       /* Make sure the pipe is No block */
+       fcntl(write_fd, F_SETFL, (O_NONBLOCK));
+
+       dispatch_source_t write_source = dispatch_source_create(DISPATCH_SOURCE_TYPE_WRITE, (uintptr_t)write_fd, 0, dq1);
+       dispatch_source_set_event_handler(write_source, ^{
+               unsigned long length = dispatch_source_get_data(write_source);
+
+               T_LOG("Iteration: %d, Length available: %lu\n", iter++, length);
+
+               char buf[512] = "deadbeef";
+               ssize_t rv = write(write_fd, buf, 512);
+               T_EXPECT_POSIX_SUCCESS(rv, "write success");
+               if (rv < 0) {
+                       T_FAIL("Write should have succeeded but failed with error %ld", rv);
+                       T_END;
+               }
+       });
+
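+       /*
+        * Each firing writes 512 bytes to the non-blocking pipe; once the buffer
+        * fills, the EVFILT_WRITE source should simply stop firing rather than
+        * deliver an event for which the write would fail with EAGAIN.
+        */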
+       dispatch_resume(write_source);
+
+       T_LOG("Arming a 15-second timer to end the test, assuming the kevent blocks before it fires");
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 15 * NSEC_PER_SEC), dispatch_get_main_queue(), ^{
+               T_LOG("PASS: EVFILT_WRITE kevent stayed blocked as expected");
+               T_END;
+       });
+
+       dispatch_main();
+}
index 49e4be65f38c6961f9c8d752f045cc323a08efb5..c72cf5db808dd5b802b02db81e79c92f399b220f 100644 (file)
@@ -10,7 +10,8 @@
 #include <stdint.h>
 #include <unistd.h>
 
-T_GLOBAL_META(T_META_NAMESPACE("xnu.poll"));
+T_GLOBAL_META(T_META_NAMESPACE("xnu.poll"),
+    T_META_RUN_CONCURRENTLY(true));
 
 #define SLEEP_TIME_SECS 1
 #define POLL_TIMEOUT_MS 1800
@@ -26,7 +27,7 @@ T_DECL(sleep_with_no_fds,
     "poll() called with no fds provided should act like sleep")
 {
        uint64_t begin_time, sleep_time, poll_time;
-       struct pollfd pfd = { 0 };
+       struct pollfd pfd = { .fd = 0, .events = 0, .revents = 0 };
 
        begin_time = mach_absolute_time();
        sleep(SLEEP_TIME_SECS);
index 3f1f96ea389c6a247d5263e714acc9801d9d08ab..55d3c12b1199037d4dc7b39e27f1d0153b33dca7 100644 (file)
 #include <darwintest.h>
 #include <mach/port_descriptions.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 static void
-expect_special_port_description(const char *(*fn)(mach_port_t),
+expect_special_port_description(const char *(*fn)(int),
     mach_port_t port, const char *namestr)
 {
-       const char *desc = fn(port);
+       const char *desc = fn((int)port);
        T_EXPECT_NOTNULL(desc, "%s is %s", namestr, desc);
        if (desc) {
                T_QUIET; T_EXPECT_GT(strlen(desc), strlen(""),
@@ -72,10 +74,12 @@ T_DECL(host_special_port_descriptions,
        TEST_HSP(HOST_RESOURCE_NOTIFY_PORT);
        TEST_HSP(HOST_CLOSURED_PORT);
        TEST_HSP(HOST_SYSPOLICYD_PORT);
+       TEST_HSP(HOST_FILECOORDINATIOND_PORT);
+       TEST_HSP(HOST_FAIRPLAYD_PORT);
 
 #undef TEST_HSP
 
-       T_EXPECT_EQ(HOST_SYSPOLICYD_PORT, HOST_MAX_SPECIAL_PORT,
+       T_EXPECT_EQ(HOST_FAIRPLAYD_PORT, HOST_MAX_SPECIAL_PORT,
            "checked all of the ports");
 
        const char *invalid_hsp =
@@ -151,6 +155,7 @@ T_DECL(host_special_port_mapping,
        TEST_HSP(HOST_RESOURCE_NOTIFY_PORT);
        TEST_HSP(HOST_CLOSURED_PORT);
        TEST_HSP(HOST_SYSPOLICYD_PORT);
+       TEST_HSP(HOST_FILECOORDINATIOND_PORT);
 
 #undef TEST_HSP
 
diff --git a/tests/posix_spawn_file_actions.c b/tests/posix_spawn_file_actions.c
new file mode 100644 (file)
index 0000000..2093069
--- /dev/null
@@ -0,0 +1,156 @@
+#include <darwintest.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <spawn.h>
+#include <spawn_private.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/spawn_internal.h>
+#include <sys/sysctl.h>
+#include <sys/syslimits.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+/* TEST_PATH needs to be something that exists, but is not the cwd */
+#define TEST_PATH "/System/Library/Caches"
+
+T_DECL(posix_spawn_file_actions_addchdir_np, "Check posix_spawn_file_actions_addchdir_np",
+    T_META_ASROOT(true))
+{
+       posix_spawn_file_actions_t file_actions;
+       int ret;
+
+       ret = posix_spawn_file_actions_init(&file_actions);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init");
+
+       ret = posix_spawn_file_actions_addchdir_np(&file_actions, TEST_PATH);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_addchdir_np");
+
+       char * const    prog = "/bin/sh";
+       char * const    argv_child[] = { prog,
+                                        "-c",
+                                        "test $(pwd) = \"" TEST_PATH "\"",
+                                        NULL, };
+       pid_t           child_pid;
+       extern char   **environ;
+
+       ret = posix_spawn(&child_pid, prog, &file_actions, NULL, argv_child, environ);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn");
+
+       T_LOG("parent: spawned child with pid %d\n", child_pid);
+
+       ret = posix_spawn_file_actions_destroy(&file_actions);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy");
+
+       T_LOG("parent: waiting for child process\n");
+
+       int status = 0;
+       int waitpid_result = waitpid(child_pid, &status, 0);
+       T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid");
+       T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned");
+       T_ASSERT_EQ(WIFEXITED(status), 1, "child should have exited normally");
+       T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success");
+}
+
+T_DECL(posix_spawn_file_actions_addchdir_np_errors, "Check posix_spawn_file_actions_addchdir_np errors",
+    T_META_ASROOT(true))
+{
+       char longpath[PATH_MAX + 1];
+       posix_spawn_file_actions_t file_actions;
+       int ret;
+
+       memset(longpath, 'a', PATH_MAX);
+       longpath[PATH_MAX] = '\0';
+
+       ret = posix_spawn_file_actions_init(&file_actions);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init");
+
+       ret = posix_spawn_file_actions_addchdir_np(NULL, "/");
+       T_ASSERT_EQ(ret, EINVAL, "NULL *file_actions returns EINVAL");
+
+       ret = posix_spawn_file_actions_addchdir_np(&file_actions, longpath);
+       T_ASSERT_EQ(ret, ENAMETOOLONG, "Path longer than PATH_MAX returns ENAMETOOLONG");
+
+       ret = posix_spawn_file_actions_destroy(&file_actions);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy");
+}
+
+T_DECL(posix_spawn_file_actions_addfchdir_np, "Check posix_spawn_file_actions_addfchdir_np",
+    T_META_ASROOT(true))
+{
+       posix_spawn_file_actions_t file_actions;
+       int ret;
+       int test_fd;
+
+       ret = posix_spawn_file_actions_init(&file_actions);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init");
+
+       test_fd = open(TEST_PATH, O_RDONLY | O_CLOEXEC);
+       T_ASSERT_POSIX_SUCCESS(test_fd, "open " TEST_PATH);
+
+       ret = posix_spawn_file_actions_addfchdir_np(&file_actions, test_fd);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_addfchdir_np");
+
+       char * const    prog = "/bin/sh";
+       char * const    argv_child[] = { prog,
+                                        "-c",
+                                        "test $(pwd) = \"" TEST_PATH "\"",
+                                        NULL, };
+       pid_t           child_pid;
+       extern char   **environ;
+
+       ret = posix_spawn(&child_pid, prog, &file_actions, NULL, argv_child, environ);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn");
+
+       T_LOG("parent: spawned child with pid %d\n", child_pid);
+
+       ret = posix_spawn_file_actions_destroy(&file_actions);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy");
+
+       T_LOG("parent: waiting for child process\n");
+
+       int status = 0;
+       int waitpid_result = waitpid(child_pid, &status, 0);
+       T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid");
+       T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned");
+       T_ASSERT_EQ(WIFEXITED(status), 1, "child should have exited normally");
+       T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success");
+
+       ret = close(test_fd);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "close test fd");
+}
+
+T_DECL(posix_spawn_file_actions_addfchdir_np_errors, "Check posix_spawn_file_actions_addfchdir_np errors",
+    T_META_ASROOT(true))
+{
+       posix_spawn_file_actions_t file_actions;
+       int ret;
+
+       ret = posix_spawn_file_actions_init(&file_actions);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init");
+
+       ret = posix_spawn_file_actions_addfchdir_np(NULL, 0);
+       T_ASSERT_EQ(ret, EINVAL, "NULL *file_actions returns EINVAL");
+
+       ret = posix_spawn_file_actions_addfchdir_np(&file_actions, -1);
+       T_ASSERT_EQ(ret, EBADF, "-1 file descriptor returns EBADF");
+
+       ret = posix_spawn_file_actions_destroy(&file_actions);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy");
+}
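+
+/*
+ * Editorial sketch, not part of the original tests: file actions apply in
+ * the order they were added, so a chdir action redirects a later
+ * relative-path open. The helper name and the /tmp path are illustrative
+ * assumptions; posix_spawn_file_actions_addopen is the standard POSIX call.
+ */
+static int __attribute__((unused))
+spawn_with_chdir_then_open(pid_t *pid, char *const argv[], char *const envp[])
+{
+       posix_spawn_file_actions_t fa;
+       int err = posix_spawn_file_actions_init(&fa);
+       if (err != 0) {
+               return err;
+       }
+       /* 1: change the child's working directory first... */
+       err = posix_spawn_file_actions_addchdir_np(&fa, "/tmp");
+       if (err == 0) {
+               /* 2: ...so this relative open lands there, as the child's stdout */
+               err = posix_spawn_file_actions_addopen(&fa, STDOUT_FILENO,
+                   "child-out.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
+       }
+       if (err == 0) {
+               err = posix_spawn(pid, argv[0], &fa, NULL, argv, envp);
+       }
+       (void)posix_spawn_file_actions_destroy(&fa);
+       return err;
+}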
diff --git a/tests/posix_spawn_file_actions_add_fileportdup2_np.c b/tests/posix_spawn_file_actions_add_fileportdup2_np.c
new file mode 100644 (file)
index 0000000..e1c8710
--- /dev/null
@@ -0,0 +1,74 @@
+#include <darwintest.h>
+
+#include <errno.h>
+#include <libproc.h>
+#include <signal.h>
+#include <spawn.h>
+#include <spawn_private.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/proc_info.h>
+#include <sys/spawn_internal.h>
+#include <sys/sysctl.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <sys/fileport.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+T_DECL(posix_spawn_file_actions_add_fileportdup2_np,
+    "Check posix_spawnattr for posix_spawn_file_actions_add_fileportdup2_np",
+    T_META_ASROOT(true))
+{
+       posix_spawnattr_t attr;
+       posix_spawn_file_actions_t fact;
+       int ret, pipes[2];
+       mach_port_t mp;
+
+       ret = pipe(pipes);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pipe");
+
+       ret = fileport_makeport(pipes[1], &mp);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "fileport_makefd");
+
+       ret = posix_spawnattr_init(&attr);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init");
+
+       ret = posix_spawn_file_actions_init(&fact);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init");
+
+       ret = posix_spawn_file_actions_add_fileportdup2_np(&fact, mp, STDOUT_FILENO);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_add_fileportdup2_np");
+
+       char * const prog = "/bin/echo";
+       char * const argv_child[] = { prog, "1", NULL };
+       pid_t child_pid;
+       extern char   **environ;
+
+       ret = posix_spawn(&child_pid, prog, &fact, &attr, argv_child, environ);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn");
+
+       ret = posix_spawn_file_actions_destroy(&fact);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy");
+
+       ret = posix_spawnattr_destroy(&attr);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_destroy");
+
+       T_LOG("parent: spawned child with pid %d\n", child_pid);
+
+       int status = 0;
+       int waitpid_result = waitpid(child_pid, &status, 0);
+       T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid");
+       T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned");
+       T_ASSERT_EQ(WIFEXITED(status), 1, "child should have exited normally");
+       T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success");
+
+       char buf[1];
+       ssize_t rc = read(pipes[0], buf, sizeof(buf));
+       T_ASSERT_POSIX_SUCCESS(rc, "read");
+       T_ASSERT_EQ(rc, 1l, "should have read one byte");
+       T_ASSERT_EQ(buf[0], '1', "should have read '1'");
+}
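+
+/*
+ * Editorial note: a fileport is the Mach-port form of a file descriptor, so
+ * the dup2 file action exercised above behaves like recovering the fd and
+ * dup2()ing it by hand in the child before exec, roughly:
+ *
+ *     int fd = fileport_makefd(mp);   // turn the fileport back into an fd
+ *     dup2(fd, STDOUT_FILENO);        // what the file action arranges
+ *
+ * fileport_makefd() is the existing counterpart of fileport_makeport().
+ */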
diff --git a/tests/posix_spawn_posix_cred.c b/tests/posix_spawn_posix_cred.c
new file mode 100644 (file)
index 0000000..c800622
--- /dev/null
@@ -0,0 +1,91 @@
+#include <darwintest.h>
+
+#include <errno.h>
+#include <libproc.h>
+#include <signal.h>
+#include <spawn.h>
+#include <spawn_private.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/kauth.h>
+#include <sys/proc_info.h>
+#include <sys/spawn_internal.h>
+#include <sys/sysctl.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+T_DECL(posix_spawn_posix_cred, "Check posix_spawnattr for POSIX creds",
+    T_META_ASROOT(true))
+{
+       posix_spawnattr_t attr;
+       int ret;
+
+       ret = posix_spawnattr_init(&attr);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init");
+
+       ret = posix_spawnattr_setflags(&attr, POSIX_SPAWN_START_SUSPENDED);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_setflags");
+
+       ret = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSID);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_setflags(POSIX_SPAWN_SETSID)");
+
+       ret = posix_spawnattr_set_uid_np(&attr, 502);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_set_uid_np");
+
+       ret = posix_spawnattr_set_gid_np(&attr, 501);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_set_gid_np");
+
+       gid_t groups[3] = { 501, 250, 299 };
+       ret = posix_spawnattr_set_groups_np(&attr, 3, groups, KAUTH_UID_NONE);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_set_groups_np");
+
+       ret = posix_spawnattr_set_login_np(&attr, "fake-name");
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_set_login_np");
+
+       char * const    prog = "/bin/sh";
+       char * const    argv_child[] = { prog,
+                                        "-c",
+                                        "test $(logname) = \"fake-name\" -a \"$(id -G)\" = \"501 250 299\"",
+                                        NULL, };
+       pid_t           child_pid;
+       extern char   **environ;
+
+       ret = posix_spawn(&child_pid, prog, NULL, &attr, argv_child, environ);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn");
+
+       T_LOG("parent: spawned child with pid %d\n", child_pid);
+
+       ret = posix_spawnattr_destroy(&attr);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_destroy");
+
+       struct proc_bsdinfo info;
+
+       ret = proc_pidinfo(child_pid, PROC_PIDTBSDINFO, 1, &info, sizeof(info));
+       T_QUIET;
+       T_ASSERT_EQ(ret, (int)sizeof(info), "proc_pidinfo(PROC_PIDTBSDINFO)");
+
+       T_EXPECT_TRUE((bool)(info.pbi_flags & PROC_FLAG_SLEADER),
+           "check setsid happened");
+       T_EXPECT_EQ(info.pbi_uid, 502, "UID was set");
+       T_EXPECT_EQ(info.pbi_gid, 501, "GID was set");
+
+       ret = kill(child_pid, SIGCONT);
+       T_ASSERT_POSIX_SUCCESS(ret, "kill(signal)");
+
+       T_LOG("parent: waiting for child process\n");
+
+       int status = 0;
+       int waitpid_result = waitpid(child_pid, &status, 0);
+       T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid");
+       T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned");
+       T_ASSERT_EQ(WIFEXITED(status), 1, "child should have exited normally");
+       T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success");
+}
diff --git a/tests/prioritize_process_launch.c b/tests/prioritize_process_launch.c
new file mode 100644 (file)
index 0000000..8f7ed11
--- /dev/null
@@ -0,0 +1,838 @@
+/*
+ * prioritize process launch: Tests prioritized process launch across posix spawn and exec.
+ */
+
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <darwintest_multiprocess.h>
+
+#include <dispatch/dispatch.h>
+#include <pthread.h>
+#include <launch.h>
+#include <mach/mach.h>
+#include <mach/message.h>
+#include <mach/mach_voucher.h>
+#include <pthread/workqueue_private.h>
+#include <voucher/ipc_pthread_priority_types.h>
+#include <servers/bootstrap.h>
+#include <stdlib.h>
+#include <sys/event.h>
+#include <unistd.h>
+#include <crt_externs.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <libkern/OSAtomic.h>
+#include <sys/wait.h>
+#include <spawn.h>
+#include <spawn_private.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.prioritize_process_launch"),
+    T_META_RUN_CONCURRENTLY(true));
+
+#define HELPER_TIMEOUT_SECS (3000)
+#define MACH_RCV_OPTIONS  (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | \
+                   MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | \
+                   MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0))
+
+static pthread_t
+thread_create_at_qos(qos_class_t qos, void * (*function)(void *), void *arg);
+static mach_port_t sr_port;
+
+
+#pragma mark Mach receive
+
+static mach_voucher_t
+create_pthpriority_voucher(mach_msg_priority_t qos)
+{
+       char voucher_buf[sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t)];
+
+       mach_voucher_t voucher = MACH_PORT_NULL;
+       kern_return_t ret;
+       ipc_pthread_priority_value_t ipc_pthread_priority_value =
+           (ipc_pthread_priority_value_t)qos;
+
+       mach_voucher_attr_raw_recipe_array_t recipes;
+       mach_voucher_attr_raw_recipe_size_t recipe_size = 0;
+       mach_voucher_attr_recipe_t recipe =
+           (mach_voucher_attr_recipe_t)&voucher_buf[recipe_size];
+
+       recipe->key = MACH_VOUCHER_ATTR_KEY_PTHPRIORITY;
+       recipe->command = MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE;
+       recipe->previous_voucher = MACH_VOUCHER_NULL;
+       memcpy((char *)&recipe->content[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value));
+       recipe->content_size = sizeof(ipc_pthread_priority_value_t);
+       recipe_size += sizeof(mach_voucher_attr_recipe_data_t) + recipe->content_size;
+
+       recipes = (mach_voucher_attr_raw_recipe_array_t)&voucher_buf[0];
+
+       ret = host_create_mach_voucher(mach_host_self(),
+           recipes,
+           recipe_size,
+           &voucher);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client host_create_mach_voucher");
+       return voucher;
+}
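+
+/*
+ * Layout note (editorial): a raw recipe array is a packed run of
+ * mach_voucher_attr_recipe_data_t headers, each followed inline by
+ * content_size bytes of payload. Here a single recipe carries an
+ * ipc_pthread_priority_value_t encoding the requested QoS, which
+ * host_create_mach_voucher turns into a voucher.
+ */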
+
+static void
+send(
+       mach_port_t send_port,
+       mach_port_t reply_port,
+       mach_port_t msg_port,
+       mach_msg_priority_t qos,
+       mach_msg_option_t options,
+       int send_disposition)
+{
+       kern_return_t ret = 0;
+
+       struct {
+               mach_msg_header_t header;
+               mach_msg_body_t body;
+               mach_msg_port_descriptor_t port_descriptor;
+       } send_msg = {
+               .header = {
+                       .msgh_remote_port = send_port,
+                       .msgh_local_port  = reply_port,
+                       .msgh_bits        = MACH_MSGH_BITS_SET(send_disposition,
+           reply_port ? MACH_MSG_TYPE_MAKE_SEND_ONCE : 0,
+           MACH_MSG_TYPE_MOVE_SEND,
+           MACH_MSGH_BITS_COMPLEX),
+                       .msgh_id          = 0x100,
+                       .msgh_size        = sizeof(send_msg),
+               },
+               .body = {
+                       .msgh_descriptor_count = 1,
+               },
+               .port_descriptor = {
+                       .name        = msg_port,
+                       .disposition = MACH_MSG_TYPE_MOVE_RECEIVE,
+                       .type        = MACH_MSG_PORT_DESCRIPTOR,
+               },
+       };
+
+       if (options & MACH_SEND_SYNC_USE_THRPRI) {
+               send_msg.header.msgh_voucher_port = create_pthpriority_voucher(qos);
+       }
+
+       if (msg_port == MACH_PORT_NULL) {
+               send_msg.body.msgh_descriptor_count = 0;
+       }
+
+       ret = mach_msg(&(send_msg.header),
+           MACH_SEND_MSG |
+           MACH_SEND_TIMEOUT |
+           MACH_SEND_OVERRIDE |
+           ((reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) | options),
+           send_msg.header.msgh_size,
+           0,
+           MACH_PORT_NULL,
+           10000,
+           0);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client mach_msg");
+}
+
+static void
+receive(
+       mach_port_t rcv_port,
+       mach_port_t notify_port)
+{
+       kern_return_t ret = 0;
+
+       struct {
+               mach_msg_header_t header;
+               mach_msg_body_t body;
+               mach_msg_port_descriptor_t port_descriptor;
+               mach_msg_trailer_t trailer;
+       } rcv_msg = {
+               .header =
+               {
+                       .msgh_remote_port = MACH_PORT_NULL,
+                       .msgh_local_port  = rcv_port,
+                       .msgh_size        = sizeof(rcv_msg),
+               },
+       };
+
+       T_LOG("Client: Starting sync receive\n");
+
+       ret = mach_msg(&(rcv_msg.header),
+           MACH_RCV_MSG |
+           MACH_RCV_SYNC_WAIT,
+           0,
+           rcv_msg.header.msgh_size,
+           rcv_port,
+           0,
+           notify_port);
+}
+
+static int
+get_pri(thread_t thread_port)
+{
+       kern_return_t kr;
+
+       thread_extended_info_data_t extended_info;
+       mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT;
+       kr = thread_info(thread_port, THREAD_EXTENDED_INFO,
+           (thread_info_t)&extended_info, &count);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info");
+
+       return extended_info.pth_curpri;
+}
+
+static void
+set_thread_name(const char *fn_name)
+{
+       char name[50] = "";
+
+       thread_t thread_port = pthread_mach_thread_np(pthread_self());
+
+       int pri = get_pri(thread_port);
+
+       snprintf(name, sizeof(name), "%s at pri %2d", fn_name, pri);
+       pthread_setname_np(name);
+}
+
+static void
+thread_wait_to_block(mach_port_t thread_port)
+{
+       thread_extended_info_data_t extended_info;
+       kern_return_t kr;
+
+       while (1) {
+               mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT;
+               kr = thread_info(thread_port, THREAD_EXTENDED_INFO,
+                   (thread_info_t)&extended_info, &count);
+
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info");
+
+               if (extended_info.pth_run_state == TH_STATE_WAITING) {
+                       T_LOG("Target thread blocked\n");
+                       break;
+               }
+               thread_switch(thread_port, SWITCH_OPTION_DEPRESS, 0);
+       }
+}
+
+static void *
+thread_sync_rcv(void *arg)
+{
+       mach_port_t port = (mach_port_t)arg;
+       mach_port_t special_reply_port;
+
+       set_thread_name(__FUNCTION__);
+       special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port");
+
+       sr_port = special_reply_port;
+       /* Do a sync rcv on special reply port and push on given arg port */
+       receive(special_reply_port, port);
+       return NULL;
+}
+
+static pthread_t
+thread_create_at_qos(qos_class_t qos, void * (*function)(void *), void *arg)
+{
+       qos_class_t qos_thread;
+       pthread_t pthread;
+       pthread_attr_t attr;
+       int ret;
+
+       ret = setpriority(PRIO_DARWIN_ROLE, 0, PRIO_DARWIN_ROLE_UI_FOCAL);
+       if (ret != 0) {
+               T_LOG("set priority failed\n");
+       }
+
+       pthread_attr_init(&attr);
+       pthread_attr_set_qos_class_np(&attr, qos, 0);
+       pthread_create(&pthread, &attr, function, arg);
+
+       T_LOG("pthread created\n");
+       pthread_get_qos_class_np(pthread, &qos_thread, NULL);
+       return pthread;
+}
+
+static mach_port_t
+get_sync_push_port_at_qos(qos_class_t qos)
+{
+       mach_port_t port;
+       kern_return_t kr;
+       pthread_t pthread;
+       thread_t thread;
+
+       /* Create a rcv right to have a sync ipc push from a thread */
+       kr = mach_port_allocate(mach_task_self(),
+           MACH_PORT_RIGHT_RECEIVE,
+           &port);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "sync push port  mach_port_allocate");
+
+       kr = mach_port_insert_right(mach_task_self(),
+           port,
+           port,
+           MACH_MSG_TYPE_MAKE_SEND);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "sync push port mach_port_insert_right");
+
+       /* Create a thread at given qos and start a sync push on given port */
+       pthread = thread_create_at_qos(qos, thread_sync_rcv, (void *)(uintptr_t)port);
+       thread = pthread_mach_thread_np(pthread);
+       thread_wait_to_block(thread);
+
+       return port;
+}
+
+static mach_port_t
+create_port_and_copyin_a_port(mach_port_t port)
+{
+       mach_port_t new_port;
+       kern_return_t kr;
+
+       /* Create a rcv right */
+       kr = mach_port_allocate(mach_task_self(),
+           MACH_PORT_RIGHT_RECEIVE,
+           &new_port);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "copyin  mach_port_allocate");
+
+       kr = mach_port_insert_right(mach_task_self(),
+           new_port,
+           new_port,
+           MACH_MSG_TYPE_MAKE_SEND);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "copyin mach_port_insert_right");
+
+       send(new_port, MACH_PORT_NULL, port, 0, 0, MACH_MSG_TYPE_COPY_SEND);
+       return new_port;
+}
+
+static pid_t
+posix_spawn_child_with_watch_ports(
+       char *binary,
+       char *arg,
+       mach_port_t *port_array,
+       int arrayCnt)
+{
+       pid_t child_pid = 0;
+       char *new_argv[] = { binary, arg, NULL};
+       errno_t ret;
+       posix_spawnattr_t attr;
+
+       ret = posix_spawnattr_init(&attr);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init");
+
+       ret = posix_spawnattr_set_importancewatch_port_np(&attr, arrayCnt, port_array);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_set_importancewatch_port_np");
+
+       ret = posix_spawn(&child_pid, binary, NULL, &attr, new_argv, NULL);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn");
+
+       ret = posix_spawnattr_destroy(&attr);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_destroy");
+
+       return child_pid;
+}
+
+static void
+worker_cb(pthread_priority_t __unused priority)
+{
+       T_FAIL("a worker thread was created");
+}
+
+static void
+event_cb(void ** __unused events, int * __unused nevents)
+{
+       T_FAIL("a kevent routine was called instead of workloop");
+}
+
+static void
+workloop_cb_test_intransit(uint64_t *workloop_id __unused, void **eventslist, int *events)
+{
+       pid_t pid;
+       int stat;
+       int priority;
+       mach_port_t port;
+       struct kevent_qos_s *kev = *eventslist;
+       mach_msg_header_t *hdr = (mach_msg_header_t *)kev->ext[0];
+       port = hdr->msgh_local_port;
+
+       T_LOG("Workloop handler workloop_cb_test_intransit called. ");
+       T_LOG("The total events returned is %d", *events);
+
+       priority = get_pri(mach_thread_self());
+       T_EXPECT_EQ(priority, 47, "Priority of servicer is %d", priority);
+
+       pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "WAIT", &port, 1);
+
+       /* Make sure our priority has dropped */
+       priority = get_pri(mach_thread_self());
+       T_EXPECT_EQ(priority, 31, "Priority of servicer is %d", priority);
+
+       sleep(2);
+
+       /* Enqueue the port to sever the temp owner boost */
+       create_port_and_copyin_a_port(port);
+
+       waitpid(pid, &stat, 0);
+
+       *events = 0;
+
+       T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+       T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost did not work correctly with knotes");
+       T_END;
+}
+
+static void
+workloop_cb_test_knote_kill(uint64_t *workloop_id __unused, void **eventslist, int *events)
+{
+       pid_t pid;
+       int stat;
+       int priority;
+       mach_port_t port;
+       struct kevent_qos_s *kev = *eventslist;
+       mach_msg_header_t *hdr = (mach_msg_header_t *)kev->ext[0];
+       port = hdr->msgh_local_port;
+
+       T_LOG("Workloop handler workloop_cb_test_knote_kill called. ");
+       T_LOG("The total events returned is %d", *events);
+
+       priority = get_pri(mach_thread_self());
+       T_EXPECT_EQ(priority, 47, "Priority of servicer is %d", priority);
+
+       pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "EXIT", &port, 1);
+
+       sleep(2);
+
+       /* Make sure our priority is boosted again */
+       priority = get_pri(mach_thread_self());
+       T_EXPECT_EQ(priority, 47, "Priority of servicer is %d", priority);
+
+       waitpid(pid, &stat, 0);
+
+       *events = 0;
+
+       T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+       T_EXPECT_EQ(WEXITSTATUS(stat), 47, "Temp owner boost did not work correctly with knotes");
+       T_END;
+}
+
+static void
+workloop_cb_test_sync_bootstrap(uint64_t *workloop_id __unused, void **eventslist, int *events)
+{
+       static pid_t pid = 0;
+       int stat;
+       int priority;
+       static mach_port_t port = MACH_PORT_NULL;
+       struct kevent_qos_s *kev = *eventslist;
+       mach_msg_header_t *hdr = (mach_msg_header_t *)kev->ext[0];
+
+       T_LOG("Workloop handler workloop_cb_test_knote_kill called. ");
+       T_LOG("The total events returned is %d", *events);
+
+       /* Check if called for peek */
+       if (hdr == NULL) {
+               priority = get_pri(mach_thread_self());
+               T_EXPECT_EQ(priority, 47, "Priority of servicer is %d", priority);
+
+               port = (mach_port_t)kev->ident;
+               pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "MSGSYNC", &port, 1);
+       } else {
+               /* Wait until the priority of the servicer is 47 */
+               T_LOG("Waiting for the servicer to be boosted");
+               do {
+                       sleep(1);
+                       priority = get_pri(mach_thread_self());
+               } while (priority != 47);
+
+               T_EXPECT_EQ(priority, 47, "Priority of servicer is %d", priority);
+
+               /* Get the reply port and send the receive right in it */
+               mach_port_t reply_port = hdr->msgh_remote_port;
+               T_LOG("The rcv right to send is %d", port);
+               send(reply_port, MACH_PORT_NULL, port, 0, 0, MACH_MSG_TYPE_MOVE_SEND_ONCE);
+
+               waitpid(pid, &stat, 0);
+
+               /* The handler priority should not be boosted anymore */
+               priority = get_pri(mach_thread_self());
+               T_EXPECT_EQ(priority, 31, "Priority of servicer is %d", priority);
+
+               T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+               T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost did not work correctly with knotes");
+               T_END;
+       }
+       *events = 0;
+}
+
+static void
+register_workloop_for_port(
+       mach_port_t port,
+       pthread_workqueue_function_workloop_t func,
+       unsigned int options)
+{
+       int r;
+
+       /* register workloop handler with pthread */
+       if (func != NULL) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                           worker_cb, event_cb,
+                           (pthread_workqueue_function_workloop_t)func, 0, 0), NULL);
+       }
+
+       /* attach port to workloop */
+       struct kevent_qos_s kev[] = {{
+                                            .ident = port,
+                                            .filter = EVFILT_MACHPORT,
+                                            .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
+                                            .fflags = options,
+                                            .data = 1,
+                                            .qos = (int32_t)_pthread_qos_class_encode(QOS_CLASS_DEFAULT, 0, 0)
+                                    }};
+
+       struct kevent_qos_s kev_err[] = {{ 0 }};
+
+       /* Setup workloop for mach msg rcv */
+       r = kevent_id(25, kev, 1, kev_err, 1, NULL,
+           NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS);
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id");
+       T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id");
+}
+
+/*
+ * Test 1: Test turnstile boosting for temp owner ports for posix_spawn.
+ *
+ * Create a port with a sync IPC push, then pass the port to posix_spawn as a watch port and
+ * verify that the spawned binary has the temp owner push of the port.
+ */
+T_DECL(posix_spawn_basic_priority, "Basic posix spawn temp owner priority test", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+       pid_t pid;
+       int stat;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+       pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "EXIT", &port, 1);
+
+       waitpid(pid, &stat, 0);
+
+       T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+       T_EXPECT_EQ(WEXITSTATUS(stat), 47, "spawn did not properly boost main thread");
+       T_END;
+}
+
+/*
+ * Test 2: Test turnstile boosting for temp owner ports for posix_spawn and exec.
+ *
+ * Create a port with a sync IPC push, then pass the port to posix_spawn as a watch port and
+ * verify that the spawned binary has the temp owner push of the port. The spawned binary will
+ * exec and verify that it still has the push.
+ */
+T_DECL(posix_spawn_exec_basic_priority, "Basic posix spawn/exec temp owner priority test", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+       pid_t pid;
+       int stat;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+       pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "EXEC", &port, 1);
+
+       waitpid(pid, &stat, 0);
+
+       T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+       T_EXPECT_EQ(WEXITSTATUS(stat), 47, "spawn/exec did not properly boost main thread");
+       T_END;
+}
+
+/*
+ * Test 3: Test turnstile boosting for temp owner ports for posix_spawn and set exec.
+ *
+ * Create a port with a sync IPC push, then pass the port to posix_spawn as a watch port and
+ * verify that the spawned binary has the temp owner push of the port. The spawned binary will
+ * posix_spawn set exec and verify that it still has the push.
+ */
+T_DECL(posix_spawn_set_exec_basic_priority, "Basic posix spawn set exec temp owner priority test", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+       pid_t pid;
+       int stat;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+       pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "SETEXEC", &port, 1);
+
+       waitpid(pid, &stat, 0);
+
+       T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+       T_EXPECT_EQ(WEXITSTATUS(stat), 47, "spawn set exec did not properly boost main thread");
+       T_END;
+}
+
+/*
+ * Test 4: Test turnstile boosting for temp owner ports for posix_spawn and set exec.
+ *
+ * Create a port with a sync IPC push, then pass the port to posix_spawn as a watch port and
+ * verify that the spawned binary has the temp owner push of the port. The spawned binary,
+ * already holding the temp owner push, will then try a set exec with watchports, which should fail.
+ */
+T_DECL(posix_spawn_set_exec_with_more_ports, "posix spawn set exec with more watch ports", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+       pid_t pid;
+       int stat;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+       pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "SETEXEC_PORTS", &port, 1);
+
+       waitpid(pid, &stat, 0);
+
+       T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+       T_EXPECT_EQ(WEXITSTATUS(stat), EINVAL, "spawn set exec did not error out when watchports were passed to already boosted process");
+       T_END;
+}
+
+/*
+ * Test 5: Test turnstile boosting for temp owner ports for multiple posix_spawns.
+ *
+ * Create a port with a sync IPC push and pass the port to posix_spawn as a watch port, then
+ * pass the same port as a watchport to another posix_spawn and verify that the boost was
+ * transferred to the new process.
+ */
+T_DECL(posix_spawn_multiple, "multiple posix_spawn with same watchport", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+       pid_t pid1, pid2;
+       int stat1, stat2;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+       pid1 = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "WAIT", &port, 1);
+
+       /* Let the child 1 execute a little, the sleep here is optional */
+       sleep(2);
+
+       pid2 = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "EXIT", &port, 1);
+
+       waitpid(pid2, &stat2, 0);
+       waitpid(pid1, &stat1, 0);
+
+       T_QUIET; T_LOG("The return stat for child 1 is is %d", WEXITSTATUS(stat1));
+       T_QUIET; T_LOG("The return stat for child 2 is is %d", WEXITSTATUS(stat2));
+       T_EXPECT_EQ(WEXITSTATUS(stat2), 47, "spawn of multiple processes with same watchport did not transfer the boost correctly");
+       T_EXPECT_EQ(WEXITSTATUS(stat1), 31, "spawn of multiple processes with same watchport did not transfer the boost correctly");
+       T_END;
+}
+
+/*
+ * Test 6: Test turnstile boosting for temp owner ports for posix_spawn for dead port.
+ *
+ * Create a port with a sync IPC push, then pass the port to posix_spawn as a watch port and
+ * verify that the spawned binary has the temp owner push of the port. Destroy the port and
+ * verify that the temp owner push has gone away.
+ */
+T_DECL(posix_spawn_dead_reply_port, "posix spawn with reply port destroy", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+       kern_return_t kr;
+       pid_t pid;
+       int stat;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+       pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "WAIT", &port, 1);
+
+       /* Let the child execute a little, the sleep here is optional */
+       sleep(2);
+
+       /* Destroy the special reply port */
+       kr = mach_port_mod_refs(mach_task_self(), sr_port, MACH_PORT_RIGHT_RECEIVE, -1);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "posix_spaw_dead_port  mach_port_mod_refs");
+
+       waitpid(pid, &stat, 0);
+
+       T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+       T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost was not removed on port death");
+       T_END;
+}
+
+/*
+ * Test 7: Test turnstile boosting for temp owner ports for posix_spawn for dead port.
+ *
+ * Create a port with a sync IPC push, then pass the port to posix_spawn as a watch port and
+ * verify that the spawned binary has the temp owner push of the port. Destroy the port and
+ * verify that the temp owner push has gone.
+ */
+T_DECL(posix_spawn_dead_port, "posix spawn with port destroy", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+       kern_return_t kr;
+       pid_t pid;
+       int stat;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+       pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "WAIT", &port, 1);
+
+       /* Destroy the port */
+       kr = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_RECEIVE, -1);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "posix_spaw_dead_port  mach_port_mod_refs");
+
+       waitpid(pid, &stat, 0);
+
+       T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+       T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost was not removed on port death");
+       T_END;
+}
+
+/*
+ * Test 8: Test turnstile boosting for temp owner ports for posix_spawn when port is copied in.
+ *
+ * Create a port with a sync IPC push, then pass the port to posix_spawn as a watch port and
+ * verify that the spawned binary has the temp owner push of the port. Copy in the port and
+ * verify that the temp owner push has gone.
+ */
+T_DECL(posix_spawn_copyin_port, "posix spawn with copyin port", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+       pid_t pid;
+       int stat;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+       pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "WAIT", &port, 1);
+
+       /* Let the child execute a little, the sleep here is optional */
+       sleep(2);
+
+       /* Copyin the port in another port */
+       create_port_and_copyin_a_port(port);
+
+       waitpid(pid, &stat, 0);
+
+       T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+       T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost was not removed on port copyin");
+       T_END;
+}
+
+/*
+ * Test 9: Test turnstile boosting for temp owner ports for posix_spawn with multiple ports.
+ *
+ * Create multiple ports with sync IPC pushes, then pass them to posix_spawn as watch ports and
+ * verify that the spawned binary has the temp owner push of the ports. Copy in the ports one by
+ * one and verify that the push goes away.
+ */
+T_DECL(posix_spawn_multiple_port, "posix spawn with multiple ports", T_META_ASROOT(YES))
+{
+       mach_port_t port[2];
+       pid_t pid;
+       int stat;
+
+       port[0] = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+       port[1] = get_sync_push_port_at_qos(QOS_CLASS_USER_INITIATED);
+       pid = posix_spawn_child_with_watch_ports("prioritize_process_launch_helper", "MULTIWAIT", port, 2);
+
+       /* Let the child execute a little, the sleep here is optional */
+       sleep(2);
+
+       /* Copyin the port in another port */
+       create_port_and_copyin_a_port(port[0]);
+
+       /* Let the child execute a little, the sleep here is optional */
+       sleep(2);
+
+       /* Copyin the port in another port */
+       create_port_and_copyin_a_port(port[1]);
+
+       waitpid(pid, &stat, 0);
+
+       T_QUIET; T_LOG("The return stat is %d", WEXITSTATUS(stat));
+       T_EXPECT_EQ(WEXITSTATUS(stat), 31, "Temp owner boost did not work correctly with multiple ports");
+       T_END;
+}
+
+/*
+ * Test 10: Test turnstile boosting for temp owner ports for posix_spawn when the port is attached to a knote.
+ *
+ * Create a port with a sync IPC push, attach a workloop knote to it, and send a message on the port;
+ * then, in the servicer, pass the port to posix_spawn as a watch port and verify that the spawned
+ * binary has the temp owner push of the port and the servicer loses the boost.
+ */
+T_DECL(posix_spawn_knote, "posix spawn with temp owner port attached to knote", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+
+       /* attach port to a workloop */
+       register_workloop_for_port(port, workloop_cb_test_intransit, MACH_RCV_OPTIONS);
+
+       /* send a message on port to activate workloop handler */
+       send(port, MACH_PORT_NULL, MACH_PORT_NULL, QOS_CLASS_DEFAULT, 0, MACH_MSG_TYPE_COPY_SEND);
+       sigsuspend(0);
+}
+
+/*
+ * Test 11: Test turnstile boosting for temp owner ports for posix_spawn when the port is attached to a knote.
+ *
+ * Create a port with a sync IPC push, attach a workloop knote to it, and send a message on the port;
+ * then, in the servicer, pass the port to posix_spawn as a watch port and verify that the spawned
+ * binary has the temp owner push of the port and the servicer loses the boost. Verify that once the
+ * spawned binary dies, the servicer gets the push back.
+ */
+T_DECL(posix_spawn_knote_ret, "posix spawn with temp owner port attached to knote with spawned binary dead", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+
+       register_workloop_for_port(port, workloop_cb_test_knote_kill, MACH_RCV_OPTIONS);
+
+       /* send a message on port to activate workloop handler */
+       send(port, MACH_PORT_NULL, MACH_PORT_NULL, QOS_CLASS_DEFAULT, 0, MACH_MSG_TYPE_COPY_SEND);
+       sigsuspend(0);
+}
+
+/*
+ * Test 12: Test turnstile boosting for temp owner ports and mach msg option for sync bootstrap_checkin.
+ *
+ * Create a port with a sync IPC push, attach a workloop knote to it, and send a message on the port;
+ * then, in the servicer, pass the port to posix_spawn as a watch port and verify that the spawned
+ * binary has the temp owner push of the port and the servicer loses the boost. The spawned binary
+ * then does a sync bootstrap_checkin with the test binary to get the receive right and verifies
+ * that it still has the boost.
+ */
+T_DECL(mach_msg_sync_bootstrap_checkin, "test mach msg option for sync bootstrap_checkin", T_META_ASROOT(YES))
+{
+       mach_port_t port;
+       mach_port_t sync_port;
+       kern_return_t kr;
+
+       port = get_sync_push_port_at_qos(QOS_CLASS_USER_INTERACTIVE);
+
+       register_workloop_for_port(port, workloop_cb_test_sync_bootstrap, MACH_RCV_SYNC_PEEK);
+
+       /* Create a mach port for spawned binary to do bootstrap checkin */
+       kr = mach_port_allocate(mach_task_self(),
+           MACH_PORT_RIGHT_RECEIVE,
+           &sync_port);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_msg_sync_boostrap_checkin mach_port_allocate");
+
+       kr = mach_port_insert_right(mach_task_self(),
+           sync_port,
+           sync_port,
+           MACH_MSG_TYPE_MAKE_SEND);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_msg_sync_boostrap_checkin mach_port_insert_right");
+
+       kr = mach_port_mod_refs(mach_task_self(), sync_port, MACH_PORT_RIGHT_SEND, 1);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_msg_sync_boostrap_checkin mach_port_mod_refs");
+
+       register_workloop_for_port(sync_port, NULL, MACH_RCV_OPTIONS);
+
+       /* Stash the port in task to make sure child also gets it */
+       kr = mach_ports_register(mach_task_self(), &sync_port, 1);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_msg_sync_boostrap_checkin mach_ports_register");
+
+       /* send a message on port to activate workloop handler */
+       send(port, MACH_PORT_NULL, MACH_PORT_NULL, QOS_CLASS_DEFAULT, 0, MACH_MSG_TYPE_COPY_SEND);
+       sigsuspend(0);
+}
diff --git a/tests/prioritize_process_launch_helper.c b/tests/prioritize_process_launch_helper.c
new file mode 100644 (file)
index 0000000..f190e62
--- /dev/null
@@ -0,0 +1,335 @@
+/*
+ * prioritize process launch: Tests prioritized process launch across posix spawn and exec.
+ */
+
+#include <dispatch/dispatch.h>
+#include <pthread.h>
+#include <launch.h>
+#include <mach/mach.h>
+#include <mach/message.h>
+#include <mach/mach_voucher.h>
+#include <pthread/workqueue_private.h>
+#include <voucher/ipc_pthread_priority_types.h>
+#include <servers/bootstrap.h>
+#include <stdlib.h>
+#include <sys/event.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <crt_externs.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <libkern/OSAtomic.h>
+#include <sys/wait.h>
+#include <spawn.h>
+#include <spawn_private.h>
+#include <string.h>
+
+
+mach_port_t
+receive(
+       mach_port_t rcv_port,
+       mach_port_t notify_port);
+
+static int
+get_pri(thread_t thread_port)
+{
+       kern_return_t kr;
+
+       thread_extended_info_data_t extended_info;
+       mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT;
+       kr = thread_info(thread_port, THREAD_EXTENDED_INFO,
+           (thread_info_t)&extended_info, &count);
+
+       if (kr != KERN_SUCCESS) {
+               printf("thread info failed to get current priority of the thread\n");
+       }
+       return extended_info.pth_curpri;
+}
+
+static void
+set_thread_name(const char *fn_name)
+{
+       char name[50] = "";
+
+       thread_t thread_port = pthread_mach_thread_np(pthread_self());
+
+       int pri = get_pri(thread_port);
+
+       snprintf(name, sizeof(name), "%s at pri %2d", fn_name, pri);
+       pthread_setname_np(name);
+}
+
+static void
+send(
+       mach_port_t send_port,
+       mach_port_t reply_port,
+       mach_port_t msg_port,
+       mach_msg_option_t options,
+       int send_disposition)
+{
+       kern_return_t ret = 0;
+
+       struct {
+               mach_msg_header_t header;
+               mach_msg_body_t body;
+               mach_msg_port_descriptor_t port_descriptor;
+       } send_msg = {
+               .header = {
+                       .msgh_remote_port = send_port,
+                       .msgh_local_port  = reply_port,
+                       .msgh_bits        = MACH_MSGH_BITS_SET(send_disposition,
+           reply_port ? MACH_MSG_TYPE_MAKE_SEND_ONCE : 0,
+           MACH_MSG_TYPE_MOVE_SEND,
+           MACH_MSGH_BITS_COMPLEX),
+                       .msgh_id          = 0x100,
+                       .msgh_size        = sizeof(send_msg),
+               },
+               .body = {
+                       .msgh_descriptor_count = 1,
+               },
+               .port_descriptor = {
+                       .name        = msg_port,
+                       .disposition = MACH_MSG_TYPE_MOVE_RECEIVE,
+                       .type        = MACH_MSG_PORT_DESCRIPTOR,
+               },
+       };
+
+       if (msg_port == MACH_PORT_NULL) {
+               send_msg.body.msgh_descriptor_count = 0;
+       }
+
+       ret = mach_msg(&(send_msg.header),
+           MACH_SEND_MSG |
+           MACH_SEND_TIMEOUT |
+           MACH_SEND_OVERRIDE |
+           ((reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) | options),
+           send_msg.header.msgh_size,
+           0,
+           MACH_PORT_NULL,
+           10000,
+           0);
+
+       if (ret != KERN_SUCCESS) {
+               printf("mach_msg_send failed with error %d\n", ret);
+       }
+}
+
+mach_port_t
+receive(
+       mach_port_t rcv_port,
+       mach_port_t notify_port)
+{
+       kern_return_t ret = 0;
+       mach_port_t service_port;
+
+       struct {
+               mach_msg_header_t header;
+               mach_msg_body_t body;
+               mach_msg_port_descriptor_t port_descriptor;
+               mach_msg_trailer_t trailer;
+       } rcv_msg = {
+               .header =
+               {
+                       .msgh_remote_port = MACH_PORT_NULL,
+                       .msgh_local_port  = rcv_port,
+                       .msgh_size        = sizeof(rcv_msg),
+               },
+       };
+
+       printf("Client: Starting sync receive\n");
+
+       ret = mach_msg(&(rcv_msg.header),
+           MACH_RCV_MSG | MACH_RCV_LARGE |
+           (notify_port ? MACH_RCV_SYNC_WAIT : 0),
+           0,
+           rcv_msg.header.msgh_size,
+           rcv_port,
+           0,
+           notify_port);
+
+       printf("mach msg rcv returned %d\n", ret);
+
+
+       if (rcv_msg.body.msgh_descriptor_count != 1) {
+               if (notify_port) {
+                       printf("Did not receive a service port in mach msg %d\n", rcv_msg.body.msgh_descriptor_count);
+               }
+               return MACH_PORT_NULL;
+       }
+
+       service_port = rcv_msg.port_descriptor.name;
+       return service_port;
+}
+
+int
+main(int argc __attribute__((unused)), char *argv[])
+{
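+       /*
+        * Modes (argv[1]): EXIT, EXEC, SETEXEC, SETEXEC_PORTS, WAIT,
+        * MULTIWAIT, MSGSYNC. Each path exits with the priority it observed
+        * (or an error code) so the spawning test can assert on the status.
+        */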
+       int priority;
+       set_thread_name(__FUNCTION__);
+
+       /* Check for priority */
+       priority = get_pri(mach_thread_self());
+       printf("The priority of child is %d\n", priority);
+
+       if (strcmp(argv[1], "EXIT") == 0) {
+               printf("Helper process exiting\n");
+               exit(priority);
+       } else if (strcmp(argv[1], "EXEC") == 0) {
+               int ret;
+
+               printf("Helper process execing\n");
+               /* exec the same binary with EXIT arg */
+               char *binary = "prioritize_process_launch_helper";
+               char *new_argv[] = {binary, "EXIT", NULL};
+               ret = execve(binary, new_argv, NULL);
+               exit(ret);
+       } else if (strcmp(argv[1], "SETEXEC") == 0) {
+               int ret;
+               int child_pid;
+               posix_spawnattr_t attr;
+
+               ret = posix_spawnattr_init(&attr);
+               if (ret != 0) {
+                       printf("posix_spawnattr_init failed \n");
+                       exit(ret);
+               }
+               ret = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETEXEC);
+               if (ret != 0) {
+                       printf("posix_spawnattr_setflags failed \n");
+                       exit(ret);
+               }
+
+               printf("Helper process doing posix_spawn set_exec\n");
+               /* set exec the same binary with EXIT arg */
+               char *binary = "prioritize_process_launch_helper";
+               char *new_argv[] = {binary, "EXIT", NULL};
+
+               ret = posix_spawn(&child_pid, binary, NULL, &attr, new_argv, NULL);
+               exit(ret);
+       } else if (strcmp(argv[1], "SETEXEC_PORTS") == 0) {
+               int ret;
+               int child_pid;
+               posix_spawnattr_t attr;
+               mach_port_t port;
+
+               kern_return_t kr =  mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port);
+               if (kr != KERN_SUCCESS) {
+                       printf("mach_port_allocate failed with error %d\n", kr);
+                       exit(kr);
+               }
+
+               kr = mach_port_insert_right(mach_task_self(), port, port, MACH_MSG_TYPE_MAKE_SEND);
+               if (kr != KERN_SUCCESS) {
+                       printf("mach_port_insert_right failed with error %d\n", kr);
+                       exit(kr);
+               }
+
+               ret = posix_spawnattr_init(&attr);
+               if (ret != 0) {
+                       printf("posix_spawnattr_init failed \n");
+                       exit(ret);
+               }
+
+               ret = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETEXEC);
+               if (ret != 0) {
+                       printf("posix_spawnattr_setflags failed \n");
+                       exit(ret);
+               }
+
+               ret = posix_spawnattr_set_importancewatch_port_np(&attr, 1, &port);
+               if (ret != 0) {
+                       printf("posix_spawnattr_set_importance_port_np failed \n");
+                       exit(ret);
+               }
+
+               printf("Helper process doing posix_spawn set_exec\n");
+               /* set exec the same binary with EXIT arg */
+               char *binary = "prioritize_process_launch_helper";
+               char *new_argv[] = {binary, "EXIT", NULL};
+
+               ret = posix_spawn(&child_pid, binary, NULL, &attr, new_argv, NULL);
+               printf("spawned failed with error %d\n", ret);
+               exit(ret);
+       } else if (strcmp(argv[1], "WAIT") == 0) {
+               do {
+                       sleep(1);
+                       priority = get_pri(mach_thread_self());
+               } while (priority == 47);
+               exit(priority);
+       } else if (strcmp(argv[1], "MULTIWAIT") == 0) {
+               do {
+                       sleep(1);
+                       priority = get_pri(mach_thread_self());
+               } while (priority == 47);
+               printf("The priority came down to %d\n", priority);
+               do {
+                       sleep(1);
+                       priority = get_pri(mach_thread_self());
+               } while (priority == 37);
+               printf("The priority came down to %d\n", priority);
+               exit(priority);
+       } else if (strcmp(argv[1], "MSGSYNC") == 0) {
+               int ret_val = 31;
+               mach_port_array_t port_array = NULL;
+               unsigned int portCnt = 0;
+               mach_port_t send_port;
+               mach_port_t special_reply_port;
+               mach_port_t service_port;
+               kern_return_t kr;
+
+               priority = get_pri(mach_thread_self());
+               printf("The priority of spawned binary is  to %d\n", priority);
+               if (priority != 47) {
+                       ret_val = 0;
+               }
+
+               /* Get the stashed send right using mach_ports_lookup */
+               kr = mach_ports_lookup(mach_task_self(), &port_array, &portCnt);
+               if (kr != KERN_SUCCESS) {
+                       printf("mach_ports_lookup failed with return value %d and port count %d\n", kr, portCnt);
+                       exit(0);
+               }
+
+               send_port = port_array[0];
+               special_reply_port = thread_get_special_reply_port();
+               if (!MACH_PORT_VALID(special_reply_port)) {
+                       printf("Failed to special reply port for thread\n");
+                       exit(0);
+               }
+
+               /* Perform a Sync bootstrap checkin */
+               send(send_port, special_reply_port, MACH_PORT_NULL, MACH_SEND_SYNC_BOOTSTRAP_CHECKIN, MACH_MSG_TYPE_COPY_SEND);
+               sleep(2);
+
+               /* Make sure we are still boosted */
+               priority = get_pri(mach_thread_self());
+               printf("The priority of spawned binary is  to %d\n", priority);
+               if (priority != 47) {
+                       ret_val = 0;
+               }
+
+               /* Receive the service port */
+               service_port = receive(special_reply_port, send_port);
+
+               /* Make sure we are still boosted */
+               priority = get_pri(mach_thread_self());
+               printf("The priority of spawned binary is  to %d\n", priority);
+               if (priority != 47) {
+                       ret_val = 0;
+               }
+
+               /* Try to receive on service port */
+               receive(service_port, MACH_PORT_NULL);
+
+               /* Make sure we are no longer boosted */
+               priority = get_pri(mach_thread_self());
+               printf("The priority of spawned binary is  to %d\n", priority);
+               if (priority != 31) {
+                       ret_val = 0;
+               }
+               exit(ret_val);
+       }
+
+       exit(0);
+}
diff --git a/tests/prng.c b/tests/prng.c
new file mode 100644 (file)
index 0000000..18b6ee8
--- /dev/null
@@ -0,0 +1,81 @@
+#include <dispatch/dispatch.h>
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <sys/random.h>
+
+/* Needed for open(2)/O_RDONLY, read(2), malloc/qsort, and memcmp below */
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+#define BUF_SIZE ((size_t)(1 << 25))
+#define BLOCK_SIZE ((size_t)16)
+
+static int
+cmp(const void *a, const void *b)
+{
+       return memcmp(a, b, BLOCK_SIZE);
+}
+
+static void
+prng_sanitycheck(uint8_t *buf, size_t buf_size)
+{
+       size_t nblocks = buf_size / BLOCK_SIZE;
+       qsort(buf, nblocks, BLOCK_SIZE, cmp);
+
+       for (size_t i = 0; i < nblocks - 1; i += 1) {
+               T_QUIET;
+               T_ASSERT_NE(memcmp(buf, buf + BLOCK_SIZE, BLOCK_SIZE), 0, "duplicate block");
+               buf += BLOCK_SIZE;
+       }
+}
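+
+/*
+ * Editorial note on the check above: each 32 MiB buffer holds 2^21 blocks of
+ * 128 bits, so by the birthday bound the chance of even one duplicate among
+ * truly random blocks is about n^2 / 2^129 ~= 2^-87. A repeated block after
+ * sorting therefore indicates a broken generator, not bad luck.
+ */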
+
+static void
+prng_getentropy(void *ctx, size_t i)
+{
+       uint8_t *buf = ((uint8_t *)ctx) + (BUF_SIZE * i);
+
+       for (size_t j = 0; j < BUF_SIZE; j += 256) {
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(getentropy(&buf[j], 256), "getentropy");
+       }
+
+       prng_sanitycheck(buf, BUF_SIZE);
+}
+
+static void
+prng_devrandom(void *ctx, size_t i)
+{
+       uint8_t *buf = ((uint8_t *)ctx) + (BUF_SIZE * i);
+
+       int fd = open("/dev/random", O_RDONLY);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(fd, "open");
+
+       size_t n = BUF_SIZE;
+       while (n > 0) {
+               ssize_t m = read(fd, buf, n);
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(m, "read");
+
+               n -= (size_t)m;
+               buf += m;
+       }
+
+       buf = ((uint8_t *)ctx) + (BUF_SIZE * i);
+       prng_sanitycheck(buf, BUF_SIZE);
+}
+
+T_DECL(prng, "prng test")
+{
+       size_t ncpu = (size_t)dt_ncpu();
+
+       uint8_t *buf = malloc(BUF_SIZE * ncpu);
+       T_QUIET;
+       T_ASSERT_NOTNULL(buf, "malloc");
+
+       dispatch_apply_f(ncpu, DISPATCH_APPLY_AUTO, buf, prng_getentropy);
+
+       dispatch_apply_f(ncpu, DISPATCH_APPLY_AUTO, buf, prng_devrandom);
+
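+       /* Also check for duplicate blocks across the per-CPU buffers. */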
+       prng_sanitycheck(buf, BUF_SIZE * ncpu);
+
+       free(buf);
+}
index 8502f2d87acdd0b84c44c1185d6a249db22bded1..51206a2f42bdb4cddd84232cfb1727715e8fbb92 100644 (file)
@@ -6,6 +6,7 @@
 #include <fcntl.h>
 #include <inttypes.h>
 #include <libproc.h>
+#include <libgen.h>
 #include <limits.h>
 #include <mach/mach.h>
 #include <mach/policy.h>
@@ -28,6 +29,8 @@
 #include <unistd.h>
 #undef PRIVATE
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define ACT_CHANGE_UID 1
 #define ACT_CHANGE_RUID 2
 #define ACT_EXIT 127
@@ -732,8 +735,7 @@ free_proc_info(void ** proc_info, int num)
 
 T_DECL(proc_info_listpids_all_pids,
     "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        /*
         * Get the value of nprocs with no buffer sent in
@@ -800,8 +802,7 @@ T_DECL(proc_info_listpids_all_pids,
 
 T_DECL(proc_info_listpids_pgrp_only,
     "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
        T_LOG("Test to verify PROC_PGRP_ONLY returns correct value");
@@ -823,8 +824,7 @@ T_DECL(proc_info_listpids_pgrp_only,
 
 T_DECL(proc_info_listpids_ppid_only,
     "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
        T_LOG("Test to verify PROC_PPID_ONLY returns correct value");
@@ -844,8 +844,7 @@ T_DECL(proc_info_listpids_ppid_only,
 
 T_DECL(proc_info_listpids_uid_only,
     "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
        T_LOG("Test to verify PROC_UID_ONLY returns correct value");
@@ -864,8 +863,7 @@ T_DECL(proc_info_listpids_uid_only,
 
 T_DECL(proc_info_listpids_ruid_only,
     "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
        T_LOG("Test to verify PROC_RUID_ONLY returns correct value");
@@ -884,8 +882,7 @@ T_DECL(proc_info_listpids_ruid_only,
 
 T_DECL(proc_info_listpids_tty_only,
     "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        int ret = isatty(STDOUT_FILENO);
        if (ret != 1) {
@@ -915,8 +912,7 @@ T_DECL(proc_info_listpids_tty_only,
 
 T_DECL(proc_info_pidinfo_proc_piduniqidentifierinfo,
     "Test to identify PROC_PIDUNIQIDENTIFIERINFO returns correct unique identifiers for process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[2];
        proc_info_caller(P_UNIQIDINFO | C_UNIQIDINFO, proc_info, NULL);
@@ -936,8 +932,7 @@ T_DECL(proc_info_pidinfo_proc_piduniqidentifierinfo,
 
 T_DECL(proc_info_pidinfo_proc_pidtbsdinfo,
     "Test to verify PROC_PIDTBSDINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[2];
        int child_pid = 0;
@@ -969,8 +964,7 @@ T_DECL(proc_info_pidinfo_proc_pidtbsdinfo,
 
 T_DECL(proc_info_pidt_shortbsdinfo,
     "Test to verify PROC_PIDT_SHORTBSDINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[2];
        int child_pid = 0;
@@ -999,8 +993,7 @@ T_DECL(proc_info_pidt_shortbsdinfo,
 
 T_DECL(proc_info_pidt_bsdinfowithuniqid,
     "Test to verify PROC_PIDT_BSDINFOWITHUNIQID returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[4];
        int child_pid = 0;
@@ -1044,8 +1037,7 @@ T_DECL(proc_info_pidt_bsdinfowithuniqid,
 
 T_DECL(proc_info_proc_pidtask_info,
     "Test to verify PROC_PIDTASKINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[2];
        proc_info_caller(P_TASK_INFO | P_TASK_INFO_NEW, proc_info, NULL);
@@ -1102,8 +1094,7 @@ T_DECL(proc_info_proc_pidtask_info,
 
 T_DECL(proc_info_proc_pidtaskallinfo,
     "Test to verify PROC_PIDTASKALLINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[4];
        int child_pid = 0;
@@ -1180,8 +1171,7 @@ T_DECL(proc_info_proc_pidtaskallinfo,
 
 T_DECL(proc_info_proc_pidlistthreads,
     "Test to verify PROC_PIDLISTTHREADS returns valid information about process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[1];
        proc_info_caller(THREAD_ADDR, proc_info, NULL);
@@ -1189,8 +1179,7 @@ T_DECL(proc_info_proc_pidlistthreads,
 
 T_DECL(proc_info_proc_pidthreadinfo,
     "Test to verify PROC_PIDTHREADINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[2];
        int child_pid = 0;
@@ -1228,8 +1217,7 @@ T_DECL(proc_info_proc_pidthreadinfo,
 
 T_DECL(proc_info_proc_threadid64info,
     "Test to verify PROC_PIDTHREADID64INFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[2];
        proc_info_caller(PTHINFO | PTHINFO_64, proc_info, NULL);
@@ -1257,8 +1245,7 @@ T_DECL(proc_info_proc_threadid64info,
 
 T_DECL(proc_info_proc_pidthreadpathinfo,
     "Test to verify PROC_PIDTHREADPATHINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[2];
        proc_info_caller(PTHINFO | PINFO_PATH, proc_info, NULL);
@@ -1289,8 +1276,7 @@ T_DECL(proc_info_proc_pidthreadpathinfo,
 
 T_DECL(proc_info_proc_pidarchinfo,
     "Test to verify PROC_PIDARCHINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[1];
        proc_info_caller(PAI, proc_info, NULL);
@@ -1312,8 +1298,7 @@ T_DECL(proc_info_proc_pidarchinfo,
 
 T_DECL(proc_info_proc_pidregioninfo,
     "Test to verify PROC_PIDREGIONINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[3];
        proc_info_caller(PREGINFO, proc_info, NULL);
@@ -1363,8 +1348,7 @@ T_DECL(proc_info_proc_pidregioninfo,
 
 T_DECL(proc_info_proc_pidregionpathinfo,
     "Test to verify PROC_PIDREGIONPATHINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_INSTALLEDUSEROS))
+    T_META_ASROOT(true))
 {
        void * proc_info[3];
        proc_info_caller(PREGINFO_PATH, proc_info, NULL);
@@ -1451,8 +1435,7 @@ T_DECL(proc_info_proc_pidregionpathinfo,
 
 T_DECL(proc_info_proc_pidregionpathinfo2,
     "Test to verify PROC_PIDREGIONPATHINFO2 returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_INSTALLEDUSEROS))
+    T_META_ASROOT(true))
 {
        void * proc_info[3];
        proc_info_caller(PREGINFO_PATH_2, proc_info, NULL);
@@ -1544,8 +1527,7 @@ T_DECL(proc_info_proc_pidregionpathinfo2,
 
 T_DECL(proc_info_proc_pidregionpathinfo3,
     "Test to verify PROC_PIDREGIONPATHINFO3 returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_INSTALLEDUSEROS))
+    T_META_ASROOT(true))
 {
        void * proc_info[5];
        proc_info_caller(PREGINFO_PATH_3, proc_info, NULL);
@@ -1569,8 +1551,7 @@ T_DECL(proc_info_proc_pidregionpathinfo3,
 
 T_DECL(proc_info_proc_pidvnodepathinfo,
     "Test to verify PROC_PIDVNODEPATHINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        void * proc_info[1];
        proc_info_caller(PVNINFO, proc_info, NULL);
@@ -1605,8 +1586,7 @@ T_DECL(proc_info_proc_pidvnodepathinfo,
 
 T_DECL(proc_info_pidinfo_proc_pidlistfds,
     "proc_info API tests to verify PROC_INFO_CALL_PIDINFO/PROC_PIDLISTFDS",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        int retval;
        int orig_nfiles              = 0;
@@ -1654,8 +1634,7 @@ T_DECL(proc_info_pidinfo_proc_pidlistfds,
 
 T_DECL(proc_info_proc_pidpathinfo,
     "Test to verify PROC_PIDPATHINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        char * pid_path = NULL;
        pid_path        = malloc(sizeof(char) * PROC_PIDPATHINFO_MAXSIZE);
@@ -1671,8 +1650,7 @@ T_DECL(proc_info_proc_pidpathinfo,
 
 T_DECL(proc_info_proc_pidlistfileports,
     "Test to verify PROC_PIDLISTFILEPORTS returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        struct proc_fileportinfo * fileport_info = NULL;
        mach_port_t tmp_file_port                = MACH_PORT_NULL;
@@ -1723,8 +1701,7 @@ T_DECL(proc_info_proc_pidlistfileports,
 
 T_DECL(proc_info_proc_pidcoalitioninfo,
     "Test to verify PROC_PIDCOALITIONINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
        int child_pid             = proc_config->child_pids[0];
@@ -1751,8 +1728,7 @@ T_DECL(proc_info_proc_pidcoalitioninfo,
 
 T_DECL(proc_info_proc_pidworkqueueinfo,
     "Test to verify PROC_PIDWORKQUEUEINFO returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
        int child_pid             = proc_config->child_pids[0];
@@ -1778,8 +1754,7 @@ T_DECL(proc_info_proc_pidworkqueueinfo,
 }
 T_DECL(proc_info_proc_pidnoteexit,
     "Test to verify PROC_PIDNOTEEXIT returns valid information about the process",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        /*
         * Ask the child to close pipe and quit, cleanup pipes for parent
@@ -1800,8 +1775,7 @@ T_DECL(proc_info_proc_pidnoteexit,
 
 T_DECL(proc_info_negative_tests,
     "Test to validate PROC_INFO_CALL_PIDINFO for invalid arguments",
-    T_META_ASROOT(true),
-    T_META_LTEPHASE(LTE_POSTINIT))
+    T_META_ASROOT(true))
 {
        proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
        int child_pid             = proc_config->child_pids[0];
@@ -2073,7 +2047,8 @@ T_DECL(dynamic_kqueue_extended_info, "the kernel should report valid extended dy
 
 #pragma mark proc_listpids
 
-T_DECL(list_kdebug_pids, "the kernel should report processes that are filtered by kdebug", T_META_ASROOT(YES))
+T_DECL(list_kdebug_pids, "the kernel should report processes that are filtered by kdebug",
+    T_META_ASROOT(YES), T_META_RUN_CONCURRENTLY(false))
 {
        int mib[4] = {CTL_KERN, KERN_KDEBUG};
        int npids;
@@ -2118,3 +2093,67 @@ T_DECL(list_kdebug_pids, "the kernel should report processes that are filtered b
        T_QUIET;
        T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDREMOVE sysctl");
 }
+
+#pragma mark misc
+
+static int prf_fd;
+static char prf_path[PATH_MAX];
+static void
+prf_end(void)
+{
+       close(prf_fd);
+       unlink(prf_path);
+}
+
+T_DECL(proc_regionfilename, "proc_regionfilename() should work")
+{
+       static char expected[] = "'very rigorous maritime engineering standards' && the front fell off";
+       static char real[MAXPATHLEN]; /* must match the size passed to proc_regionfilename() below */
+       int rc;
+       void *addr;
+
+       prf_fd = CONF_TMP_FILE_OPEN(prf_path);
+       T_ATEND(prf_end);
+
+       rc = (int) write(prf_fd, expected, sizeof(expected));
+       T_ASSERT_POSIX_SUCCESS(rc, "write to tmpfile");
+
+       addr = mmap(0, 0x1000, PROT_READ, MAP_PRIVATE, prf_fd, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_NE_PTR(addr, MAP_FAILED, "mmap of tmpfile");
+
+       T_WITH_ERRNO;
+       T_ASSERT_GT(proc_regionfilename(getpid(), (uint64_t) addr, real, MAXPATHLEN), 0, "proc_regionfilename");
+       T_EXPECT_EQ_STR(basename(prf_path), basename(real), "filename");
+}
+
+T_DECL(proc_regionpath, "PROC_PIDREGIONPATH should return addr, length and path")
+{
+       int rc;
+       struct proc_regionpath path;
+       static char some_text[] = "'very rigorous maritime engineering standards' && the front fell off";
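+       /* mmap maps whole pages, so the reported region length should be the
+        * text size rounded up to the next page boundary. */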
+       unsigned long rounded_length = (sizeof(some_text) & (unsigned long) ~(PAGE_SIZE - 1)) + PAGE_SIZE;
+       void *addr;
+
+       prf_fd = CONF_TMP_FILE_OPEN(prf_path);
+       T_ATEND(prf_end);
+
+       rc = (int) write(prf_fd, some_text, sizeof(some_text));
+       T_ASSERT_POSIX_SUCCESS(rc, "write to tmpfile");
+
+       addr = mmap(0, PAGE_SIZE, PROT_READ, MAP_PRIVATE, prf_fd, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_NE_PTR(addr, MAP_FAILED, "mmap of tmpfile");
+
+       rc = proc_pidinfo(getpid(), PROC_PIDREGIONPATH, (uint64_t)addr, &path, sizeof(struct proc_regionpath));
+       T_ASSERT_POSIX_SUCCESS(rc, "proc_pidinfo");
+
+       T_ASSERT_EQ((unsigned long) path.prpo_regionlength, rounded_length, "regionlength must match");
+       T_ASSERT_EQ_PTR((void *) path.prpo_addr, addr, "addr must match");
+
+       rc = proc_pidinfo(getpid(), PROC_PIDREGIONPATH, (uint64_t)((char *) addr + 20), &path, sizeof(struct proc_regionpath));
+       T_ASSERT_POSIX_SUCCESS(rc, "proc_pidinfo 20 bytes past the base address");
+
+       T_ASSERT_EQ((unsigned long) path.prpo_regionlength, rounded_length, "regionlength must match, even when 20 bytes past the base address");
+       T_ASSERT_EQ_PTR((void *) path.prpo_addr, addr, "addr must match, even when 20 bytes past the base address");
+}
diff --git a/tests/proc_info_44873309.c b/tests/proc_info_44873309.c
new file mode 100644 (file)
index 0000000..cdd2bfc
--- /dev/null
@@ -0,0 +1,39 @@
+#include <darwintest.h>
+#include <mach/mach.h>
+#include <mach/mach_types.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+
+#include <stdio.h>
+#include <assert.h>
+#include <err.h>
+#include <libproc.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+T_DECL(proc_info_44873309, "ensure new proc_pidinfo flavor returns correct table sizes",
+    T_META_CHECK_LEAKS(false), T_META_ASROOT(true))
+{
+       mach_port_t port;
+       int retval;
+
+       pid_t pid = getpid();
+       struct proc_ipctableinfo table_info = {};
+       retval = proc_pidinfo(pid, PROC_PIDIPCTABLEINFO, 0, (void *)&table_info, (uint32_t)sizeof(table_info));
+       T_WITH_ERRNO; T_EXPECT_GT(retval, 0, "proc_pidinfo(PROC_PIDIPCTABLEINFO) returned %d", retval);
+       T_EXPECT_EQ(retval, (int)sizeof(table_info), "proc_pidinfo(PROC_PIDIPCTABLEINFO) table_size = %u, table_free = %u",
+           table_info.table_size, table_info.table_free);
+
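+       /* Allocating a receive right should consume exactly one entry in the
+        * task's IPC table, so table_free should drop by one below. */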
+       kern_return_t ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port);
+       T_ASSERT_MACH_SUCCESS(ret, "mach_port_allocate MACH_PORT_RIGHT_RECEIVE");
+
+       struct proc_ipctableinfo table_info2 = {};
+       retval = proc_pidinfo(pid, PROC_PIDIPCTABLEINFO, 0, (void *)&table_info2, (uint32_t)sizeof(table_info2));
+       T_WITH_ERRNO; T_EXPECT_GT(retval, 0, "proc_pidinfo(PROC_PIDIPCTABLEINFO) returned %d", retval);
+       T_EXPECT_EQ(retval, (int)sizeof(table_info2), "proc_pidinfo(PROC_PIDIPCTABLEINFO) table_size2 = %u, table_free2 = %u",
+           table_info2.table_size, table_info2.table_free);
+
+       T_EXPECT_EQ(table_info.table_free, table_info2.table_free + 1, "Comparing the table_free values");
+}
diff --git a/tests/proc_info_44873309.entitlements b/tests/proc_info_44873309.entitlements
new file mode 100644 (file)
index 0000000..a333f47
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.private.kernel.global-proc-info</key>
+       <true/>
+</dict>
+</plist>
index 8af5647fe1415d2395f9391220de634da98c458b..9bdba1c3e270df38b6f2332626d9c53a3357facd 100644 (file)
@@ -14,6 +14,8 @@
 #include <darwintest.h>
 #include <TargetConditionals.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define MAX_TRIES 20
 #define EXTRA_THREADS 15
 
index 4482e275ce420249a3ef296093fd1493fdf8a000..e482a848dedffe0bc672b1475acde5d6a6dd603e 100644 (file)
@@ -4,6 +4,8 @@
 #include <stdio.h>
 #include <unistd.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 T_DECL(proc_udata_info, "Get and set a proc udata token"){
        uint64_t token = mach_absolute_time();
        proc_info_udata_t udata;
index 470d5ca4ede4c45f13869a2bcae4182f6cd04623..887573c8e54782266a21f6a375fbde94a9b8ec38 100644 (file)
@@ -3,6 +3,8 @@
 #include <System/sys/proc_uuid_policy.h>
 #include <stdint.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define NUM_PROC_UUID_POLICY_FLAGS 4
 
 T_DECL(proc_uuid_policy_26567533, "Tests passing a NULL uuid in (uap->uuid).", T_META_LTEPHASE(LTE_POSTINIT))
index b8eebdb35acbb5e7105f17b86d738445bd520349..0b94fa9de1f76806b3ee2fcd30f25453fecc517b 100644 (file)
@@ -7,7 +7,8 @@
 #include <stdlib.h>
 #include <unistd.h>
 
-T_GLOBAL_META(T_META_ASROOT(true));
+T_GLOBAL_META(T_META_ASROOT(true),
+    T_META_RUN_CONCURRENTLY(true));
 
 T_DECL(processor_cpu_stat64,
     "ensure 64-bit processor statistics are reported correctly",
@@ -46,7 +47,6 @@ T_DECL(processor_cpu_stat64,
        memset(prestats, 0xff, cpu_count * sizeof(*prestats));
 
        for (int i = 0; i < (int)cpu_count; i++) {
-               printf("%d\n", PROCESSOR_CPU_STAT64_COUNT);
                mach_msg_type_number_t info_count = PROCESSOR_CPU_STAT64_COUNT;
                kr = processor_info(cpu_ports[i], PROCESSOR_CPU_STAT64, &host,
                    (processor_info_t)&prestats[i], &info_count);
index abeff39dcbe8322c5f3a467ae5b779e22d282c4d..63fd25d7452eb35a592fb9399fe26e9d16907d1c 100644 (file)
@@ -11,7 +11,9 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
-#define TMP_FILE_PATH "/tmp/test_pwrite"
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+#define TMP_FILE_PATH "/tmp/test_pwrite_28581610"
 
 static sigjmp_buf xfsz_jmpbuf;
 
index c10df2ad9ae754d8b5c65911760e29d45c7eb4d6..d864d85319665ef4cad611bee8fe680869fb030d 100644 (file)
@@ -41,6 +41,8 @@
 #include <unistd.h>
 #include <sys/sysctl.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #ifndef _COMM_PAGE_CPU_QUIESCENT_COUNTER
 
 T_DECL(test_quiescent_counter, "Validate that _COMM_PAGE_CPU_QUIESCENT_COUNTER increments",
index 1cb3f94cc4a1b2d081879451e13b1db7ca1bdfa0..8a05c73c1f6278bfc009b6034e5e7869b04fbf28 100644 (file)
@@ -4,6 +4,9 @@
 #include <stdint.h>
 #include <sys/sysctl.h>
 #include <TargetConditionals.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 extern uint64_t __mach_bridge_remote_time(uint64_t);
 
 T_DECL(remote_time_syscall, "test mach_bridge_remote_time syscall",
diff --git a/tests/restart.c b/tests/restart.c
new file mode 100644 (file)
index 0000000..e0ea5fd
--- /dev/null
@@ -0,0 +1,149 @@
+#include <mach/task.h>
+#include <mach/mach.h>
+#include <kern/restartable.h>
+#include <stdbool.h>
+#include <darwintest.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <signal.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
+extern task_restartable_range_t range;
+extern void restartable_function(int *);
+static int step = 0;
+
+#if defined(__x86_64__)
+__asm__("    .align 4\n"
+               "    .text\n"
+               "    .private_extern _restartable_function\n"
+               "_restartable_function:\n"
+               // This should load through the first argument register, but the
+               // argument to restartable_function() is ignored here: we already
+               // know what it points to, and x86 PC-relative addressing (unlike
+               // ARM's) keeps the increment readable.
+               "    incl _step(%rip)\n"
+               "1:\n"
+               "    pause\n"
+               "    jmp 1b\n"
+               "LExit_restartable_function:\n"
+               "    ret\n");
+#elif defined(__arm64__)
+__asm__("    .align 4\n"
+               "    .text\n"
+               "    .private_extern _restartable_function\n"
+               "_restartable_function:\n"
+               "    ldr    x11, [x0]\n"
+               "    add    x11, x11, #1\n"
+               "    str    x11, [x0]\n"
+               "1:\n"
+               "    b 1b\n"
+               "LExit_restartable_function:\n"
+               "    ret\n");
+#elif defined(__arm__)
+__asm__("    .align 4\n"
+               "    .text\n"
+               "    .thumb\n"
+               "    .private_extern _restartable_function\n"
+               "    .thumb_func\n"
+               "_restartable_function:\n"
+               "0:\n"
+               "    ldr    r12, [r0]\n"
+               "    add    r12, r12, #1\n"
+               "    str    r12, [r0]\n"
+               "1:\n"
+               "    b 1b\n"
+               "LExit_restartable_function:\n"
+               "    bx lr\n");
+#elif defined(__i386__)
+#define SKIP_TEST 1
+#else
+#error Architecture unsupported
+#endif
+
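+/*
+ * `range` lays out a task_restartable_range_t (<kern/restartable.h>): the
+ * function address, a length, a recovery offset, and flags. Both shorts are
+ * LExit_restartable_function - _restartable_function, so a thread caught
+ * inside the range is resumed at the trailing `ret`, letting the otherwise
+ * infinite function return.
+ */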
+#ifndef SKIP_TEST
+__asm__("    .align 4\n"
+               "    .data\n"
+               "    .private_extern _range\n"
+               "_range:\n"
+#if __LP64__
+               "    .quad _restartable_function\n"
+#else
+               "    .long _restartable_function\n"
+               "    .long 0\n"
+#endif
+               "    .short LExit_restartable_function - _restartable_function\n"
+               "    .short LExit_restartable_function - _restartable_function\n"
+               "    .long 0\n");
+#endif
+
+static void
+noop_signal(int signo __unused)
+{
+}
+
+static void *
+task_restartable_ranges_thread(void *_ctx)
+{
+       int *stepp = _ctx;
+       restartable_function(stepp); // increments step
+       T_PASS("was successfully restarted\n");
+       (*stepp)++;
+       return NULL;
+}
+
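+/* Poll (up to ~1 second) for the worker thread to publish the given step. */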
+static void
+wait_for_step(int which)
+{
+       for (int i = 0; step != which && i < 10; i++) {
+               usleep(100000);
+       }
+}
+
+T_DECL(task_restartable_ranges, "test task_restartable_ranges")
+{
+#ifdef SKIP_TEST
+       T_SKIP("Not supported");
+#else
+       kern_return_t kr;
+       pthread_t th;
+       int rc;
+
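+       /* A no-op handler so SIGUSR1 interrupts the spinning thread and
+        * exercises the restart-on-signal path, rather than killing us. */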
+       signal(SIGUSR1, noop_signal);
+
+       kr = task_restartable_ranges_register(mach_task_self(), &range, 1);
+       T_ASSERT_MACH_SUCCESS(kr, "task_restartable_ranges_register");
+
+       {
+               rc = pthread_create(&th, NULL, &task_restartable_ranges_thread, &step);
+               T_ASSERT_POSIX_SUCCESS(rc, "pthread_create");
+
+               wait_for_step(1);
+               T_ASSERT_EQ(step, 1, "The thread started (sync)");
+
+               kr = task_restartable_ranges_synchronize(mach_task_self());
+               T_ASSERT_MACH_SUCCESS(kr, "task_restartable_ranges_synchronize");
+
+               T_LOG("wait for the function to be restarted (sync)");
+               wait_for_step(2);
+               T_ASSERT_EQ(step, 2, "The thread exited (sync)");
+               pthread_join(th, NULL);
+       }
+
+       {
+               rc = pthread_create(&th, NULL, &task_restartable_ranges_thread, &step);
+               T_ASSERT_POSIX_SUCCESS(rc, "pthread_create");
+
+               wait_for_step(3);
+               T_ASSERT_EQ(step, 3, "The thread started (signal)");
+
+               rc = pthread_kill(th, SIGUSR1);
+               T_ASSERT_POSIX_SUCCESS(rc, "pthread_kill");
+
+               T_LOG("wait for the function to be restarted (signal)");
+               wait_for_step(4);
+               T_ASSERT_EQ(step, 4, "The thread exited (signal)");
+               pthread_join(th, NULL);
+       }
+#endif
+}
index fe04a2ec5c8146389fd794c5364bed843a8eb5d5..5acfb74ee5b34bff742af19b96354cedb41a9ef3 100644 (file)
@@ -30,10 +30,6 @@ T_DECL(settime_32089962_not_entitled_root,
        struct timeval adj_time;
        struct timex ntptime;
 
-       if (geteuid() != 0) {
-               T_SKIP("settimeofday_root_29193041 test requires root privileges to run.");
-       }
-
        /* test settimeofday */
        T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL);
        T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL);
@@ -67,7 +63,7 @@ T_DECL(settime_32089962_not_entitled_not_root,
        T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL);
 
        /* test settimeofday */
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
        T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL);
 #else
        res = settimeofday(&settimeofdaytime, NULL);
@@ -95,10 +91,6 @@ T_DECL(settimeofday_29193041_not_entitled_root,
        struct timeval time;
        long new_time;
 
-       if (geteuid() != 0) {
-               T_SKIP("settimeofday_root_29193041 test requires root privileges to run.");
-       }
-
        T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
 
        /* increment the time of one day */
@@ -137,7 +129,7 @@ T_DECL(settimeofday_29193041_not_entitled_not_root,
        time.tv_sec = new_time;
        time.tv_usec = 0;
 
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
        T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
 #else
        int res = settimeofday(&time, NULL);
@@ -146,7 +138,7 @@ T_DECL(settimeofday_29193041_not_entitled_not_root,
 
        T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
 
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
        /* expect to be past new_time */
        T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time successfully changed without root and without entitlement");
        time.tv_sec -= DAY;
diff --git a/tests/shared_cache_tests.c b/tests/shared_cache_tests.c
new file mode 100644 (file)
index 0000000..572309d
--- /dev/null
@@ -0,0 +1,39 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <mach-o/dyld.h>
+#include <mach-o/dyld_priv.h>
+#include <TargetConditionals.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.shared_cache"));
+
+// Give the test up to two minutes because in the failure case we want to invoke update_dyld_shared_cache, which
+// might take a bit to do.
+T_DECL(present, "tests that the device is running with a shared cache", T_META_ASROOT(true), T_META_TIMEOUT(120))
+{
+       size_t shared_cache_len = 0;
+       const void *cache_header = _dyld_get_shared_cache_range(&shared_cache_len);
+       if ((cache_header == NULL) || (shared_cache_len == 0)) {
+#if TARGET_OS_OSX
+               char *tmp_dir = (char *) dt_tmpdir();
+               T_QUIET; T_ASSERT_NOTNULL(tmp_dir, "darwintest created tmp dir");
+               // Try to invoke update_dyld_shared_cache to gather information on why we're not running with a shared cache
+               char *shared_cache_update_cmd[] = { "/usr/bin/update_dyld_shared_cache", "-debug", "-cache_dir", tmp_dir, NULL };
+               pid_t child1 = dt_launch_tool_pipe(shared_cache_update_cmd, false, NULL, ^bool (char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) {
+                       T_LOG("%s", data);
+                       return false;
+               }, ^bool (__unused char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) {
+                       T_LOG("%s", data);
+                       return false;
+               }, BUFFER_PATTERN_LINE, NULL);
+
+               int status = 0;
+               dt_waitpid(child1, &status, NULL, 0);
+
+               T_LOG("waitpid for %d returned with exit status %d", child1, WEXITSTATUS(status));
+#endif // TARGET_OS_OSX
+               T_ASSERT_NOTNULL(cache_header, "shared cache present");
+               T_ASSERT_GT((int) shared_cache_len, 0, "shared cache has non-zero length");
+       }
+
+       T_PASS("shared cache appears to be present and valid");
+}
index 25ec4f2750afd3eb038d35455ea23dff66531b6b..01080d3b37b711cee65d34403ae8830d96809d46 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
 
 static int exitcode = 0x6789BEEF;
 int should_exit = 0;
index 5e9258923dfbea77576f4855cdf030ead50d63ec..1788cad68b648b4472c44f602095c67ab72ec3f9 100644 (file)
@@ -6,6 +6,8 @@
 
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 T_DECL(sigcontreturn, "checks that a call to waitid() for a child that is stopped and then continued returns correctly")
 {
        pid_t           pid;
index b1173c2e2b6b6095c7980b1ee44de68b2a5530c5..e5dca6a2a0c99ea09d38e90cec798e1c2984b925 100644 (file)
@@ -11,6 +11,9 @@
 #include <errno.h>
 #include <pthread.h>
 #include <stdbool.h>
+#include <TargetConditionals.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
 
 static bool debug;
 
@@ -167,10 +170,14 @@ multithreaded_bind_test(bool v6, int socket_count)
 static void
 run_multithreaded_bind_test(int number_of_runs, bool v6, int socket_count)
 {
+#if TARGET_OS_BRIDGE
+       T_SKIP("Not enough memory to handle this test");
+#else /* TARGET_OS_BRIDGE */
        for (int i = 0; i < number_of_runs; i++) {
                multithreaded_bind_test(v6, socket_count);
        }
        T_PASS("multithreaded_bind_test %s", v6 ? "IPv6" : "IPv4");
+#endif /* TARGET_OS_BRIDGE */
 }
 
 T_DECL(socket_bind_35685803,
index 5454e80b1f6fbe6bf9b9c1339dd80aad91b74460..92993534c8a5c1ce2936f9dcef4c3435970ae0bb 100644 (file)
@@ -3,6 +3,8 @@
 #include <sys/socket.h>
 #include <unistd.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 T_DECL(socket_poll_close_25786011, "Tests an invalid poll call to a socket and then calling close.", T_META_LTEPHASE(LTE_POSTINIT))
 {
        int my_socket, ret;
diff --git a/tests/stackshot.m b/tests/stackshot.m
deleted file mode 100644 (file)
index 7aef17c..0000000
+++ /dev/null
@@ -1,1022 +0,0 @@
-#include <darwintest.h>
-#include <darwintest_utils.h>
-#include <kern/debug.h>
-#include <kern/kern_cdata.h>
-#include <kdd.h>
-#include <libproc.h>
-#include <mach-o/dyld.h>
-#include <mach-o/dyld_priv.h>
-#include <sys/syscall.h>
-#include <sys/stackshot.h>
-
-/*
- * mirrors the dyld_cache_header struct defined in dyld_cache_format.h from dyld source code
- * TODO: remove once rdar://42361850 is in the build
- */
-struct dyld_cache_header
-{
-    char       magic[16];                              // e.g. "dyld_v0    i386"
-    uint32_t   mappingOffset;          // file offset to first dyld_cache_mapping_info
-    uint32_t    mappingCount;           // number of dyld_cache_mapping_info entries
-    uint32_t    imagesOffset;           // file offset to first dyld_cache_image_info
-    uint32_t    imagesCount;            // number of dyld_cache_image_info entries
-    uint64_t    dyldBaseAddress;        // base address of dyld when cache was built
-    uint64_t    codeSignatureOffset;    // file offset of code signature blob
-    uint64_t    codeSignatureSize;             // size of code signature blob (zero means to end of file)
-    uint64_t    slideInfoOffset;        // file offset of kernel slid info
-    uint64_t    slideInfoSize;          // size of kernel slid info
-    uint64_t    localSymbolsOffset;     // file offset of where local symbols are stored
-    uint64_t    localSymbolsSize;       // size of local symbols information
-    uint8_t     uuid[16];               // unique value for each shared cache file
-    uint64_t    cacheType;              // 0 for development, 1 for production
-    uint32_t    branchPoolsOffset;      // file offset to table of uint64_t pool addresses
-    uint32_t    branchPoolsCount;       // number of uint64_t entries
-    uint64_t    accelerateInfoAddr;     // (unslid) address of optimization info
-    uint64_t    accelerateInfoSize;     // size of optimization info
-    uint64_t    imagesTextOffset;       // file offset to first dyld_cache_image_text_info
-    uint64_t    imagesTextCount;        // number of dyld_cache_image_text_info entries
-    uint64_t    dylibsImageGroupAddr;   // (unslid) address of ImageGroup for dylibs in this cache
-    uint64_t    dylibsImageGroupSize;   // size of ImageGroup for dylibs in this cache
-    uint64_t    otherImageGroupAddr;    // (unslid) address of ImageGroup for other OS dylibs
-    uint64_t    otherImageGroupSize;    // size of oImageGroup for other OS dylibs
-    uint64_t    progClosuresAddr;       // (unslid) address of list of program launch closures
-    uint64_t    progClosuresSize;       // size of list of program launch closures
-    uint64_t    progClosuresTrieAddr;   // (unslid) address of trie of indexes into program launch closures
-    uint64_t    progClosuresTrieSize;   // size of trie of indexes into program launch closures
-    uint32_t    platform;               // platform number (macOS=1, etc)
-    uint32_t    formatVersion        : 8,  // dyld3::closure::kFormatVersion
-                dylibsExpectedOnDisk : 1,  // dyld should expect the dylib exists on disk and to compare inode/mtime to see if cache is valid
-                simulator            : 1,  // for simulator of specified platform
-                locallyBuiltCache    : 1,  // 0 for B&I built cache, 1 for locally built cache
-                padding              : 21; // TBD
-};
-
-T_GLOBAL_META(
-               T_META_NAMESPACE("xnu.stackshot"),
-               T_META_CHECK_LEAKS(false),
-               T_META_ASROOT(true)
-               );
-
-static const char *current_process_name(void);
-static void verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count);
-static void parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid);
-static void parse_thread_group_stackshot(void **sbuf, size_t sslen);
-static uint64_t stackshot_timestamp(void *ssbuf, size_t sslen);
-static void initialize_thread(void);
-
-#define DEFAULT_STACKSHOT_BUFFER_SIZE (1024 * 1024)
-#define MAX_STACKSHOT_BUFFER_SIZE     (6 * 1024 * 1024)
-
-/* bit flags for parse_stackshot */
-#define PARSE_STACKSHOT_DELTA 0x1
-#define PARSE_STACKSHOT_ZOMBIE 0x2
-#define PARSE_STACKSHOT_SHAREDCACHE_LAYOUT 0x4
-
-T_DECL(microstackshots, "test the microstackshot syscall")
-{
-       void *buf = NULL;
-       unsigned int size = DEFAULT_STACKSHOT_BUFFER_SIZE;
-
-       while (1) {
-               buf = malloc(size);
-               T_QUIET; T_ASSERT_NOTNULL(buf, "allocated stackshot buffer");
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-               int len = syscall(SYS_microstackshot, buf, size,
-                               STACKSHOT_GET_MICROSTACKSHOT);
-#pragma clang diagnostic pop
-               if (len == ENOSYS) {
-                       T_SKIP("microstackshot syscall failed, likely not compiled with CONFIG_TELEMETRY");
-               }
-               if (len == -1 && errno == ENOSPC) {
-                       /* syscall failed because buffer wasn't large enough, try again */
-                       free(buf);
-                       buf = NULL;
-                       size *= 2;
-                       T_ASSERT_LE(size, (unsigned int)MAX_STACKSHOT_BUFFER_SIZE,
-                                       "growing stackshot buffer to sane size");
-                       continue;
-               }
-               T_ASSERT_POSIX_SUCCESS(len, "called microstackshot syscall");
-               break;
-    }
-
-       T_EXPECT_EQ(*(uint32_t *)buf,
-                       (uint32_t)STACKSHOT_MICRO_SNAPSHOT_MAGIC,
-                       "magic value for microstackshot matches");
-
-       free(buf);
-}
-
-struct scenario {
-       const char *name;
-       uint32_t flags;
-       bool should_fail;
-       bool maybe_unsupported;
-       pid_t target_pid;
-       uint64_t since_timestamp;
-       uint32_t size_hint;
-       dt_stat_time_t timer;
-};
-
-static void
-quiet(struct scenario *scenario)
-{
-       if (scenario->timer) {
-               T_QUIET;
-       }
-}
-
-static void
-take_stackshot(struct scenario *scenario, void (^cb)(void *buf, size_t size))
-{
-       initialize_thread();
-
-       void *config = stackshot_config_create();
-       quiet(scenario);
-       T_ASSERT_NOTNULL(config, "created stackshot config");
-
-       int ret = stackshot_config_set_flags(config, scenario->flags);
-       quiet(scenario);
-       T_ASSERT_POSIX_ZERO(ret, "set flags %#x on stackshot config", scenario->flags);
-
-       if (scenario->size_hint > 0) {
-               ret = stackshot_config_set_size_hint(config, scenario->size_hint);
-               quiet(scenario);
-               T_ASSERT_POSIX_ZERO(ret, "set size hint %" PRIu32 " on stackshot config",
-                               scenario->size_hint);
-       }
-
-       if (scenario->target_pid > 0) {
-               ret = stackshot_config_set_pid(config, scenario->target_pid);
-               quiet(scenario);
-               T_ASSERT_POSIX_ZERO(ret, "set target pid %d on stackshot config",
-                               scenario->target_pid);
-       }
-
-       if (scenario->since_timestamp > 0) {
-               ret = stackshot_config_set_delta_timestamp(config, scenario->since_timestamp);
-               quiet(scenario);
-               T_ASSERT_POSIX_ZERO(ret, "set since timestamp %" PRIu64 " on stackshot config",
-                               scenario->since_timestamp);
-       }
-
-       int retries_remaining = 5;
-
-retry: ;
-       uint64_t start_time = mach_absolute_time();
-       ret = stackshot_capture_with_config(config);
-       uint64_t end_time = mach_absolute_time();
-
-       if (scenario->should_fail) {
-               T_EXPECTFAIL;
-               T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
-               return;
-       }
-
-       if (ret == EBUSY || ret == ETIMEDOUT) {
-               if (retries_remaining > 0) {
-                       if (!scenario->timer) {
-                               T_LOG("stackshot_capture_with_config failed with %s (%d), retrying",
-                                               strerror(ret), ret);
-                       }
-
-                       retries_remaining--;
-                       goto retry;
-               } else {
-                       T_ASSERT_POSIX_ZERO(ret,
-                                       "called stackshot_capture_with_config (no retries remaining)");
-               }
-       } else if ((ret == ENOTSUP) && scenario->maybe_unsupported) {
-               T_SKIP("kernel indicated this stackshot configuration is not supported");
-       } else {
-               quiet(scenario);
-               T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
-       }
-
-       if (scenario->timer) {
-               dt_stat_mach_time_add(scenario->timer, end_time - start_time);
-       }
-       void *buf = stackshot_config_get_stackshot_buffer(config);
-       size_t size = stackshot_config_get_stackshot_size(config);
-       if (scenario->name) {
-               char sspath[MAXPATHLEN];
-               strlcpy(sspath, scenario->name, sizeof(sspath));
-               strlcat(sspath, ".kcdata", sizeof(sspath));
-               T_QUIET; T_ASSERT_POSIX_ZERO(dt_resultfile(sspath, sizeof(sspath)),
-                               "create result file path");
-
-               T_LOG("writing stackshot to %s", sspath);
-
-               FILE *f = fopen(sspath, "w");
-               T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(f,
-                               "open stackshot output file");
-
-               size_t written = fwrite(buf, size, 1, f);
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(written, "wrote stackshot to file");
-
-               fclose(f);
-       }
-       cb(buf, size);
-
-       ret = stackshot_config_dealloc(config);
-       T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config");
-}
-
-T_DECL(kcdata, "test that kcdata stackshots can be taken and parsed")
-{
-       struct scenario scenario = {
-               .name = "kcdata",
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS |
-                               STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
-       };
-
-       T_LOG("taking kcdata stackshot");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               parse_stackshot(0, ssbuf, sslen, -1);
-       });
-}
-
-T_DECL(kcdata_faulting, "test that kcdata stackshots while faulting can be taken and parsed")
-{
-       struct scenario scenario = {
-               .name = "faulting",
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
-                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT
-                               | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING),
-       };
-
-       T_LOG("taking faulting stackshot");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               parse_stackshot(0, ssbuf, sslen, -1);
-       });
-}
-
-T_DECL(bad_flags, "test a poorly-formed stackshot syscall")
-{
-       struct scenario scenario = {
-               .flags = STACKSHOT_SAVE_IN_KERNEL_BUFFER /* not allowed from user space */,
-               .should_fail = true,
-       };
-
-       T_LOG("attempting to take stackshot with kernel-only flag");
-       take_stackshot(&scenario, ^(__unused void *ssbuf, __unused size_t sslen) {
-               T_ASSERT_FAIL("stackshot data callback called");
-       });
-}
-
-T_DECL(delta, "test delta stackshots")
-{
-       struct scenario scenario = {
-               .name = "delta",
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
-                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
-       };
-
-       T_LOG("taking full stackshot");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen);
-
-               T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time);
-
-               parse_stackshot(0, ssbuf, sslen, -1);
-
-               struct scenario delta_scenario = {
-                       .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
-                                       | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT
-                                       | STACKSHOT_COLLECT_DELTA_SNAPSHOT),
-                       .since_timestamp = stackshot_time
-               };
-
-               take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) {
-                       parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1);
-               });
-       });
-}
-
-T_DECL(shared_cache_layout, "test stackshot inclusion of shared cache layout")
-{
-       struct scenario scenario = {
-               .name = "shared_cache_layout",
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
-                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT |
-                               STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT),
-       };
-
-       T_LOG("taking stackshot with STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT set");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               parse_stackshot(PARSE_STACKSHOT_SHAREDCACHE_LAYOUT, ssbuf, sslen, -1);
-       });
-}
-
-static void *stuck_sysctl_thread(void *arg) {
-       int val = 1;
-       dispatch_semaphore_t child_thread_started = *(dispatch_semaphore_t *)arg;
-
-       dispatch_semaphore_signal(child_thread_started);
-       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.wedge_thread", NULL, NULL, &val, sizeof(val)), "wedge child thread");
-
-       return NULL;
-}
-
-T_HELPER_DECL(zombie_child, "child process to sample as a zombie")
-{
-       pthread_t pthread;
-       dispatch_semaphore_t child_thread_started = dispatch_semaphore_create(0);
-       T_QUIET; T_ASSERT_NOTNULL(child_thread_started, "zombie child thread semaphore");
-
-       /* spawn another thread to get stuck in the kernel, then call exit() to become a zombie */
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&pthread, NULL, stuck_sysctl_thread, &child_thread_started), "pthread_create");
-
-       dispatch_semaphore_wait(child_thread_started, DISPATCH_TIME_FOREVER);
-
-       /* sleep for a bit in the hope of ensuring that the other thread has called the sysctl before we signal the parent */
-       usleep(100);
-       T_ASSERT_POSIX_SUCCESS(kill(getppid(), SIGUSR1), "signaled parent to take stackshot");
-
-       exit(0);
-}
-
-T_DECL(zombie, "tests a stackshot of a zombie task with a thread stuck in the kernel")
-{
-       char path[PATH_MAX];
-       uint32_t path_size = sizeof(path);
-       T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath");
-       char *args[] = { path, "-n", "zombie_child", NULL };
-
-       dispatch_source_t child_sig_src;
-       dispatch_semaphore_t child_ready_sem = dispatch_semaphore_create(0);
-       T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "zombie child semaphore");
-
-       dispatch_queue_t signal_processing_q = dispatch_queue_create("signal processing queue", NULL);
-       T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "signal processing queue");
-
-       pid_t pid;
-
-       T_LOG("spawning a child");
-
-       signal(SIGUSR1, SIG_IGN);
-       child_sig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, signal_processing_q);
-       T_QUIET; T_ASSERT_NOTNULL(child_sig_src, "dispatch_source_create (child_sig_src)");
-
-       dispatch_source_set_event_handler(child_sig_src, ^{ dispatch_semaphore_signal(child_ready_sem); });
-       dispatch_activate(child_sig_src);
-
-       int sp_ret = posix_spawn(&pid, args[0], NULL, NULL, args, NULL);
-       T_QUIET; T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", args[0], pid);
-
-       dispatch_semaphore_wait(child_ready_sem, DISPATCH_TIME_FOREVER);
-
-       T_LOG("received signal from child, capturing stackshot");
-
-       struct proc_bsdshortinfo bsdshortinfo;
-       int retval, iterations_to_wait = 10;
-
-       while (iterations_to_wait > 0) {
-               retval = proc_pidinfo(pid, PROC_PIDT_SHORTBSDINFO, 0, &bsdshortinfo, sizeof(bsdshortinfo));
-               if ((retval == 0) && errno == ESRCH) {
-                       T_LOG("unable to find child using proc_pidinfo, assuming zombie");
-                       break;
-               }
-
-               T_QUIET; T_WITH_ERRNO; T_ASSERT_GT(retval, 0, "proc_pidinfo(PROC_PIDT_SHORTBSDINFO) returned a value > 0");
-               T_QUIET; T_ASSERT_EQ(retval, (int)sizeof(bsdshortinfo), "proc_pidinfo call for PROC_PIDT_SHORTBSDINFO returned expected size");
-
-               if (bsdshortinfo.pbsi_flags & PROC_FLAG_INEXIT) {
-                       T_LOG("child proc info marked as in exit");
-                       break;
-               }
-
-               iterations_to_wait--;
-               if (iterations_to_wait == 0) {
-                       /*
-                        * This will mark the test as failed but let it continue so we
-                        * don't leave a process stuck in the kernel.
-                        */
-                       T_FAIL("unable to discover that child is marked as exiting");
-               }
-
-               /* Give the child a few more seconds to make it to exit */
-               sleep(5);
-       }
-
-       /* Give the child some more time to make it through exit */
-       sleep(10);
-
-       struct scenario scenario = {
-               .name = "zombie",
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
-                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
-       };
-
-       take_stackshot(&scenario, ^( void *ssbuf, size_t sslen) {
-               /* First unwedge the child so we can reap it */
-               int val = 1, status;
-               T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.unwedge_thread", NULL, NULL, &val, sizeof(val)), "unwedge child");
-
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on zombie child");
-
-               parse_stackshot(PARSE_STACKSHOT_ZOMBIE, ssbuf, sslen, pid);
-       });
-}
-
-static void
-expect_instrs_cycles_in_stackshot(void *ssbuf, size_t sslen)
-{
-       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
-
-       bool in_task = false;
-       bool in_thread = false;
-       bool saw_instrs_cycles = false;
-       iter = kcdata_iter_next(iter);
-
-       KCDATA_ITER_FOREACH(iter) {
-               switch (kcdata_iter_type(iter)) {
-               case KCDATA_TYPE_CONTAINER_BEGIN:
-                       switch (kcdata_iter_container_type(iter)) {
-                       case STACKSHOT_KCCONTAINER_TASK:
-                               in_task = true;
-                               saw_instrs_cycles = false;
-                               break;
-
-                       case STACKSHOT_KCCONTAINER_THREAD:
-                               in_thread = true;
-                               saw_instrs_cycles = false;
-                               break;
-
-                       default:
-                               break;
-                       }
-                       break;
-
-               case STACKSHOT_KCTYPE_INSTRS_CYCLES:
-                       saw_instrs_cycles = true;
-                       break;
-
-               case KCDATA_TYPE_CONTAINER_END:
-                       if (in_thread) {
-                               T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles,
-                                               "saw instructions and cycles in thread");
-                               in_thread = false;
-                       } else if (in_task) {
-                               T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles,
-                                               "saw instructions and cycles in task");
-                               in_task = false;
-                       }
-
-               default:
-                       break;
-               }
-       }
-}
-
-static void
-skip_if_monotonic_unsupported(void)
-{
-       int supported = 0;
-       size_t supported_size = sizeof(supported);
-       int ret = sysctlbyname("kern.monotonic.supported", &supported,
-                       &supported_size, 0, 0);
-       if (ret < 0 || !supported) {
-               T_SKIP("monotonic is unsupported");
-       }
-}
-
-T_DECL(instrs_cycles, "test a getting instructions and cycles in stackshot")
-{
-       skip_if_monotonic_unsupported();
-
-       struct scenario scenario = {
-               .name = "instrs-cycles",
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
-                               | STACKSHOT_KCDATA_FORMAT),
-       };
-
-       T_LOG("attempting to take stackshot with instructions and cycles");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               parse_stackshot(0, ssbuf, sslen, -1);
-               expect_instrs_cycles_in_stackshot(ssbuf, sslen);
-       });
-}
-
-T_DECL(delta_instrs_cycles,
-               "test delta stackshots with instructions and cycles")
-{
-       skip_if_monotonic_unsupported();
-
-       struct scenario scenario = {
-               .name = "delta-instrs-cycles",
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
-                               | STACKSHOT_KCDATA_FORMAT),
-       };
-
-       T_LOG("taking full stackshot");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen);
-
-               T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time);
-
-               parse_stackshot(0, ssbuf, sslen, -1);
-               expect_instrs_cycles_in_stackshot(ssbuf, sslen);
-
-               struct scenario delta_scenario = {
-                       .name = "delta-instrs-cycles-next",
-                       .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
-                                       | STACKSHOT_KCDATA_FORMAT
-                                       | STACKSHOT_COLLECT_DELTA_SNAPSHOT),
-                       .since_timestamp = stackshot_time,
-               };
-
-               take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) {
-                       parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1);
-                       expect_instrs_cycles_in_stackshot(dssbuf, dsslen);
-               });
-       });
-}
-
-static void
-check_thread_groups_supported()
-{
-       int err;
-       int supported = 0;
-       size_t supported_size = sizeof(supported);
-       err = sysctlbyname("kern.thread_groups_supported", &supported, &supported_size, NULL, 0);
-
-       if (err || !supported)
-               T_SKIP("thread groups not supported on this system");
-}
-
-T_DECL(thread_groups, "test getting thread groups in stackshot")
-{
-       check_thread_groups_supported();
-
-       struct scenario scenario = {
-               .name = "thread-groups",
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_THREAD_GROUP
-                               | STACKSHOT_KCDATA_FORMAT),
-       };
-
-       T_LOG("attempting to take stackshot with thread group flag");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               parse_thread_group_stackshot(ssbuf, sslen);
-       });
-}
-
-static void
-parse_page_table_asid_stackshot(void **ssbuf, size_t sslen)
-{
-       bool seen_asid = false;
-       bool seen_page_table_snapshot = false;
-       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
-       T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
-                       "buffer provided is a stackshot");
-
-       iter = kcdata_iter_next(iter);
-       KCDATA_ITER_FOREACH(iter) {
-               switch (kcdata_iter_type(iter)) {
-               case KCDATA_TYPE_ARRAY: {
-                       T_QUIET;
-                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
-                                       "checked that array is valid");
-
-                       if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_PAGE_TABLES) {
-                               continue;
-                       }
-
-                       T_ASSERT_FALSE(seen_page_table_snapshot, "check that we haven't yet seen a page table snapshot");
-                       seen_page_table_snapshot = true;
-
-                       T_ASSERT_EQ((size_t) kcdata_iter_array_elem_size(iter), sizeof(uint64_t),
-                               "check that each element of the pagetable dump is the expected size");
-
-                       uint64_t *pt_array = kcdata_iter_payload(iter);
-                       uint32_t elem_count = kcdata_iter_array_elem_count(iter);
-                       uint32_t j;
-                       bool nonzero_tte = false;
-                       for (j = 0; j < elem_count;) {
-                               T_QUIET; T_ASSERT_LE(j + 4, elem_count, "check for valid page table segment header");
-                               uint64_t pa = pt_array[j];
-                               uint64_t num_entries = pt_array[j + 1];
-                               uint64_t start_va = pt_array[j + 2];
-                               uint64_t end_va = pt_array[j + 3];
-
-                               T_QUIET; T_ASSERT_NE(pa, (uint64_t) 0, "check that the pagetable physical address is non-zero");
-                               T_QUIET; T_ASSERT_EQ(pa % (num_entries * sizeof(uint64_t)), (uint64_t) 0, "check that the pagetable physical address is correctly aligned");
-                               T_QUIET; T_ASSERT_NE(num_entries, (uint64_t) 0, "check that a pagetable region has more than 0 entries");
-                               T_QUIET; T_ASSERT_LE(j + 4 + num_entries, (uint64_t) elem_count, "check for sufficient space in page table array");
-                               T_QUIET; T_ASSERT_GT(end_va, start_va, "check for valid VA bounds in page table segment header");
-
-                               for (uint32_t k = j + 4; k < (j + 4 + num_entries); ++k) {
-                                       if (pt_array[k] != 0) {
-                                               nonzero_tte = true;
-                                               T_QUIET; T_ASSERT_EQ((pt_array[k] >> 48) & 0xf, (uint64_t) 0, "check that bits[48:51] of arm64 TTE are clear");
-                                               // L0-L2 table and non-compressed L3 block entries should always have bit 1 set; assumes L0-L2 blocks will not be used outside the kernel
-                                               bool table = ((pt_array[k] & 0x2) != 0);
-                                               if (table) {
-                                                       T_QUIET; T_ASSERT_NE(pt_array[k] & ((1ULL << 48) - 1) & ~((1ULL << 12) - 1), (uint64_t) 0, "check that arm64 TTE physical address is non-zero");
-                                               } else { // should be a compressed PTE
-                                                       T_QUIET; T_ASSERT_NE(pt_array[k] & 0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has at least one of bits [63:62] set");
-                                                       T_QUIET; T_ASSERT_EQ(pt_array[k] & ~0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has no other bits besides [63:62] set");
-                                               }
-                                       }
-                               }
-
-                               j += (4 + num_entries);
-                       }
-                       T_ASSERT_TRUE(nonzero_tte, "check that we saw at least one non-empty TTE");
-                       T_ASSERT_EQ(j, elem_count, "check that page table dump size matches extent of last header"); 
-                       break;
-               }
-               case STACKSHOT_KCTYPE_ASID: {
-                       T_ASSERT_FALSE(seen_asid, "check that we haven't yet seen an ASID");
-                       seen_asid = true;
-               }
-               }
-       }
-       T_ASSERT_TRUE(seen_page_table_snapshot, "check that we have seen a page table snapshot");
-       T_ASSERT_TRUE(seen_asid, "check that we have seen an ASID");
-}
-
-T_DECL(dump_page_tables, "test stackshot page table dumping support")
-{
-       struct scenario scenario = {
-               .name = "asid-page-tables",
-               .flags = (STACKSHOT_KCDATA_FORMAT | STACKSHOT_ASID | STACKSHOT_PAGE_TABLES),
-               .size_hint = (1ULL << 23), // 8 MB
-               .target_pid = getpid(),
-               .maybe_unsupported = true,
-       };
-
-       T_LOG("attempting to take stackshot with ASID and page table flags");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               parse_page_table_asid_stackshot(ssbuf, sslen);
-       });
-}
-
-#pragma mark performance tests
-
-#define SHOULD_REUSE_SIZE_HINT 0x01
-#define SHOULD_USE_DELTA       0x02
-#define SHOULD_TARGET_SELF     0x04
-
-static void
-stackshot_perf(unsigned int options)
-{
-       struct scenario scenario = {
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
-                       | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
-       };
-
-       dt_stat_t size = dt_stat_create("bytes", "size");
-       dt_stat_time_t duration = dt_stat_time_create("duration");
-       scenario.timer = duration;
-
-       if (options & SHOULD_TARGET_SELF) {
-               scenario.target_pid = getpid();
-       }
-
-       while (!dt_stat_stable(duration) || !dt_stat_stable(size)) {
-               __block uint64_t last_time = 0;
-               __block uint32_t size_hint = 0;
-               take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-                       dt_stat_add(size, (double)sslen);
-                       last_time = stackshot_timestamp(ssbuf, sslen);
-                       size_hint = (uint32_t)sslen;
-               });
-               if (options & SHOULD_USE_DELTA) {
-                       scenario.since_timestamp = last_time;
-                       scenario.flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT;
-               }
-               if (options & SHOULD_REUSE_SIZE_HINT) {
-                       scenario.size_hint = size_hint;
-               }
-       }
-
-       dt_stat_finalize(duration);
-       dt_stat_finalize(size);
-}
-
-T_DECL(perf_no_size_hint, "test stackshot performance with no size hint",
-               T_META_TAG_PERF)
-{
-       stackshot_perf(0);
-}
-
-T_DECL(perf_size_hint, "test stackshot performance with size hint",
-               T_META_TAG_PERF)
-{
-       stackshot_perf(SHOULD_REUSE_SIZE_HINT);
-}
-
-T_DECL(perf_process, "test stackshot performance targeted at process",
-               T_META_TAG_PERF)
-{
-       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_TARGET_SELF);
-}
-
-T_DECL(perf_delta, "test delta stackshot performance",
-               T_META_TAG_PERF)
-{
-       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA);
-}
-
-T_DECL(perf_delta_process, "test delta stackshot performance targeted at a process",
-               T_META_TAG_PERF)
-{
-       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA | SHOULD_TARGET_SELF);
-}
-
-static uint64_t
-stackshot_timestamp(void *ssbuf, size_t sslen)
-{
-       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
-
-       uint32_t type = kcdata_iter_type(iter);
-       if (type != KCDATA_BUFFER_BEGIN_STACKSHOT && type != KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT) {
-               T_ASSERT_FAIL("invalid kcdata type %u", kcdata_iter_type(iter));
-       }
-
-       iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME);
-       T_QUIET;
-       T_ASSERT_TRUE(kcdata_iter_valid(iter), "timestamp found in stackshot");
-
-       return *(uint64_t *)kcdata_iter_payload(iter);
-}
-
-#define TEST_THREAD_NAME "stackshot_test_thread"
-
-static void
-parse_thread_group_stackshot(void **ssbuf, size_t sslen)
-{
-       bool seen_thread_group_snapshot = false;
-       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
-       T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
-                       "buffer provided is a stackshot");
-
-       NSMutableSet *thread_groups = [[NSMutableSet alloc] init];
-
-       iter = kcdata_iter_next(iter);
-       KCDATA_ITER_FOREACH(iter) {
-               switch (kcdata_iter_type(iter)) {
-               case KCDATA_TYPE_ARRAY: {
-                       T_QUIET;
-                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
-                                       "checked that array is valid");
-
-                       if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT) {
-                               continue;
-                       }
-
-                       seen_thread_group_snapshot = true;
-
-                       if (kcdata_iter_array_elem_size(iter) >= sizeof(struct thread_group_snapshot_v2)) {
-                               struct thread_group_snapshot_v2 *tgs_array = kcdata_iter_payload(iter);
-                               for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) {
-                                       struct thread_group_snapshot_v2 *tgs = tgs_array + j;
-                                       [thread_groups addObject:@(tgs->tgs_id)];
-                               }
-
-                       }
-                       else {
-                               struct thread_group_snapshot *tgs_array = kcdata_iter_payload(iter);
-                               for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) {
-                                       struct thread_group_snapshot *tgs = tgs_array + j;
-                                       [thread_groups addObject:@(tgs->tgs_id)];
-                               }
-                       }
-                       break;
-               }
-               }
-       }
-       KCDATA_ITER_FOREACH(iter) {
-               NSError *error = nil;
-
-               switch (kcdata_iter_type(iter)) {
-
-               case KCDATA_TYPE_CONTAINER_BEGIN: {
-                       T_QUIET;
-                       T_ASSERT_TRUE(kcdata_iter_container_valid(iter),
-                                       "checked that container is valid");
-
-                       if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_THREAD) {
-                               break;
-                       }
-
-                       NSDictionary *container = parseKCDataContainer(&iter, &error);
-                       T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot");
-                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container");
-
-                       int tg = [container[@"thread_snapshots"][@"thread_group"] intValue];
-
-                       T_ASSERT_TRUE([thread_groups containsObject:@(tg)], "check that the thread group the thread is in exists");
-
-                       break;
-               };
-
-               }
-       }
-       T_ASSERT_TRUE(seen_thread_group_snapshot, "check that we have seen a thread group snapshot");
-}
-
-static void
-verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count)
-{
-       uuid_t cur_shared_cache_uuid;
-       __block uint32_t lib_index = 0, libs_found = 0;
-
-       _dyld_get_shared_cache_uuid(cur_shared_cache_uuid);
-       int result = dyld_shared_cache_iterate_text(cur_shared_cache_uuid, ^(const dyld_shared_cache_dylib_text_info* info) {
-                       T_QUIET; T_ASSERT_LT(lib_index, uuid_count, "dyld_shared_cache_iterate_text exceeded number of libraries returned by kernel");
-
-                       libs_found++;
-                       struct dyld_uuid_info_64 *cur_stackshot_uuid_entry = &uuids[lib_index];
-                       T_QUIET; T_ASSERT_EQ(memcmp(info->dylibUuid, cur_stackshot_uuid_entry->imageUUID, sizeof(info->dylibUuid)), 0,
-                                       "dyld returned UUID doesn't match kernel returned UUID");
-                       T_QUIET; T_ASSERT_EQ(info->loadAddressUnslid, cur_stackshot_uuid_entry->imageLoadAddress,
-                                       "dyld returned load address doesn't match kernel returned load address");
-                       lib_index++;
-               });
-
-       T_ASSERT_EQ(result, 0, "iterate shared cache layout");
-       T_ASSERT_EQ(libs_found, uuid_count, "dyld iterator returned same number of libraries as kernel");
-
-       T_LOG("verified %d libraries from dyld shared cache", libs_found);
-}
-
-static void
-parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid)
-{
-       bool delta = (stackshot_parsing_flags & PARSE_STACKSHOT_DELTA);
-       bool expect_zombie_child = (stackshot_parsing_flags & PARSE_STACKSHOT_ZOMBIE);
-       bool expect_shared_cache_layout = false;
-       bool expect_shared_cache_uuid = !delta;
-       bool found_zombie_child = false, found_shared_cache_layout = false, found_shared_cache_uuid = false;
-
-       if (stackshot_parsing_flags & PARSE_STACKSHOT_SHAREDCACHE_LAYOUT) {
-               size_t shared_cache_length = 0;
-               const struct dyld_cache_header *cache_header = NULL;
-               cache_header = _dyld_get_shared_cache_range(&shared_cache_length);
-               T_QUIET; T_ASSERT_NOTNULL(cache_header, "current process running with shared cache");
-               T_QUIET; T_ASSERT_GT(shared_cache_length, sizeof(struct _dyld_cache_header), "valid shared cache length populated by _dyld_get_shared_cache_range");
-
-               if (cache_header->locallyBuiltCache) {
-                       T_LOG("device running with locally built shared cache, expect shared cache layout");
-                       expect_shared_cache_layout = true;
-               } else {
-                       T_LOG("device running with B&I built shared-cache, no shared cache layout expected");
-               }
-       }
-
-       if (expect_zombie_child) {
-               T_QUIET; T_ASSERT_GT(child_pid, 0, "child pid greater than zero");
-       }
-
-       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
-       if (delta) {
-               T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
-                               "buffer provided is a delta stackshot");
-       } else {
-               T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
-                               "buffer provided is a stackshot");
-       }
-
-       iter = kcdata_iter_next(iter);
-       KCDATA_ITER_FOREACH(iter) {
-               NSError *error = nil;
-
-               switch (kcdata_iter_type(iter)) {
-               case KCDATA_TYPE_ARRAY: {
-                       T_QUIET;
-                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
-                                       "checked that array is valid");
-
-                       NSMutableDictionary *array = parseKCDataArray(iter, &error);
-                       T_QUIET; T_ASSERT_NOTNULL(array, "parsed array from stackshot");
-                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing array");
-
-                       if (kcdata_iter_array_elem_type(iter) == STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT) {
-                               struct dyld_uuid_info_64 *shared_cache_uuids = kcdata_iter_payload(iter);
-                               uint32_t uuid_count = kcdata_iter_array_elem_count(iter);
-                               T_ASSERT_NOTNULL(shared_cache_uuids, "parsed shared cache layout array");
-                               T_ASSERT_GT(uuid_count, 0, "returned valid number of UUIDs from shared cache");
-                               verify_stackshot_sharedcache_layout(shared_cache_uuids, uuid_count);
-                               found_shared_cache_layout = true;
-                       }
-
-                       break;
-               }
-
-               case KCDATA_TYPE_CONTAINER_BEGIN: {
-                       T_QUIET;
-                       T_ASSERT_TRUE(kcdata_iter_container_valid(iter),
-                                       "checked that container is valid");
-
-                       if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) {
-                               break;
-                       }
-
-                       NSDictionary *container = parseKCDataContainer(&iter, &error);
-                       T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot");
-                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container");
-
-                       int pid = [container[@"task_snapshots"][@"task_snapshot"][@"ts_pid"] intValue];
-                       if (expect_zombie_child && (pid == child_pid)) {
-                                       found_zombie_child = true;
-
-                                       uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue];
-                                       T_ASSERT_TRUE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "child zombie marked as terminated");
-
-                                       continue;
-                       } else if (pid != getpid()) {
-                               break;
-                       }
-
-                       T_EXPECT_EQ_STR(current_process_name(),
-                                       [container[@"task_snapshots"][@"task_snapshot"][@"ts_p_comm"] UTF8String],
-                                       "current process name matches in stackshot");
-
-                       uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue];
-                       T_ASSERT_FALSE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "current process not marked as terminated");
-
-                       T_QUIET;
-                       T_EXPECT_LE(pid, [container[@"task_snapshots"][@"task_snapshot"][@"ts_unique_pid"] intValue],
-                                       "unique pid is greater than pid");
-
-                       bool found_main_thread = false;
-                       for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) {
-                               NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key];
-                               NSDictionary *thread_snap = thread[@"thread_snapshot"];
-
-                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_thread_id"] intValue], 0,
-                                               "thread ID of thread in current task is valid");
-                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_base_priority"] intValue], 0,
-                                               "base priority of thread in current task is valid");
-                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_sched_priority"] intValue], 0,
-                                               "scheduling priority of thread in current task is valid");
-
-                               NSString *pth_name = thread[@"pth_name"];
-                               if (pth_name != nil && [pth_name isEqualToString:@TEST_THREAD_NAME]) {
-                                       found_main_thread = true;
-
-                                       T_QUIET; T_EXPECT_GT([thread_snap[@"ths_total_syscalls"] intValue], 0,
-                                                       "total syscalls of current thread is valid");
-
-                                       NSDictionary *cpu_times = thread[@"cpu_times"];
-                                       T_EXPECT_GE([cpu_times[@"runnable_time"] intValue],
-                                                       [cpu_times[@"system_time"] intValue] +
-                                                       [cpu_times[@"user_time"] intValue],
-                                                       "runnable time of current thread is valid");
-                               }
-                       }
-                       T_EXPECT_TRUE(found_main_thread, "found main thread for current task in stackshot");
-                       break;
-               }
-               case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: {
-                       struct dyld_uuid_info_64_v2 *shared_cache_info = kcdata_iter_payload(iter);
-                       uuid_t shared_cache_uuid;
-                       T_QUIET; T_ASSERT_TRUE(_dyld_get_shared_cache_uuid(shared_cache_uuid), "retrieve current shared cache UUID");
-                       T_QUIET; T_ASSERT_EQ(memcmp(shared_cache_info->imageUUID, shared_cache_uuid, sizeof(shared_cache_uuid)), 0,
-                                       "dyld returned UUID doesn't match kernel returned UUID for system shared cache");
-                       found_shared_cache_uuid = true;
-                       break;
-               }
-               }
-       }
-
-       if (expect_zombie_child) {
-               T_QUIET; T_ASSERT_TRUE(found_zombie_child, "found zombie child in kcdata");
-       }
-
-       if (expect_shared_cache_layout) {
-               T_QUIET; T_ASSERT_TRUE(found_shared_cache_layout, "shared cache layout found in kcdata");
-       }
-
-       if (expect_shared_cache_uuid) {
-               T_QUIET; T_ASSERT_TRUE(found_shared_cache_uuid, "shared cache UUID found in kcdata");
-       }
-
-       T_ASSERT_FALSE(KCDATA_ITER_FOREACH_FAILED(iter), "successfully iterated kcdata");
-}
-
-static const char *
-current_process_name(void)
-{
-       static char name[64];
-
-       if (!name[0]) {
-               int ret = proc_name(getpid(), name, sizeof(name));
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(ret, "proc_name failed for current process");
-       }
-
-       return name;
-}
-
-static void
-initialize_thread(void)
-{
-       int ret = pthread_setname_np(TEST_THREAD_NAME);
-       T_QUIET;
-       T_ASSERT_POSIX_ZERO(ret, "set thread name to %s", TEST_THREAD_NAME);
-}
index aabe544b818244b35ce6e0f7770e6b9f7b15dbd5..e7f34ea3c1bd117baf90cb1a8b35d7ede0bccd45 100644 (file)
@@ -24,7 +24,7 @@
 #include <unistd.h>
 #include <TargetConditionals.h>
 
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 #include <pcre.h>
 #endif
 
@@ -118,7 +118,7 @@ void check_python(void *stackshot, const char *fmt, ...)
 {
        save_stackshot(stackshot, "/tmp/ss");
 
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
        va_list args;
        va_start(args, fmt);
        char *re_string = NULL;
index 448095598fb567db7acc7b377fbe504e02260dc3..342ea2bc68d541a10a138989b5073ba2f551c035 100644 (file)
@@ -18,13 +18,7 @@ T_GLOBAL_META(
        T_META_ASROOT(true)
        );
 
-#if TARGET_OS_WATCH
-#define SPAWN_ITERATIONS 1999
-#elif TARGET_OS_IPHONE
-#define SPAWN_ITERATIONS 4999
-#else
-#define SPAWN_ITERATIONS 9999
-#endif
+#define TEST_DURATION_NS (60 * NSEC_PER_SEC)
 
 #define REAP_INTERVAL 10
 
@@ -78,13 +72,15 @@ retry:
        T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config");
 }
 
-T_DECL(stackshot_spawn_exit, "tests taking many stackshots while children processes are spawning+exiting")
+T_DECL(stackshot_spawn_exit, "tests taking many stackshots while child processes are spawning+exiting", T_META_TIMEOUT(120))
 {
        char path[PATH_MAX];
        uint32_t path_size = sizeof(path);
        T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath");
        char *args[] = { path, "-n", "spawn_children_helper", NULL };
 
+       uint64_t stop_time = clock_gettime_nsec_np(CLOCK_UPTIME_RAW) + TEST_DURATION_NS;
+
        dispatch_queue_t stackshot_queue = dispatch_queue_create("stackshot_queue", NULL);
        dispatch_async(stackshot_queue, ^(void) {
                int num_stackshots = 0;
@@ -108,7 +104,8 @@ T_DECL(stackshot_spawn_exit, "tests taking many stackshots while children proces
            "set stdout of child to NULL");
 
        int children_unreaped = 0, status;
-       for (int iterations_remaining = SPAWN_ITERATIONS; iterations_remaining > 0; iterations_remaining--) {
+       uint64_t iterations_completed = 0;
+       while (clock_gettime_nsec_np(CLOCK_UPTIME_RAW) < stop_time) {
                pid_t pid;
 
                int sp_ret = posix_spawn(&pid, args[0], &actions, NULL, args, NULL);
@@ -123,9 +120,10 @@ T_DECL(stackshot_spawn_exit, "tests taking many stackshots while children proces
                        }
                }
 
-               if ((iterations_remaining % 100) == 0) {
-                       T_LOG("spawned %d children thus far", (SPAWN_ITERATIONS - iterations_remaining));
+               if ((iterations_completed % 100) == 0) {
+                       T_LOG("spawned %llu children thus far", iterations_completed);
                }
+               iterations_completed++;
        }
 
        while (children_unreaped) {
diff --git a/tests/stackshot_tests.m b/tests/stackshot_tests.m
new file mode 100644 (file)
index 0000000..29fa817
--- /dev/null
@@ -0,0 +1,1302 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <kern/debug.h>
+#include <kern/kern_cdata.h>
+#include <kdd.h>
+#include <libproc.h>
+#include <mach-o/dyld.h>
+#include <mach-o/dyld_images.h>
+#include <mach-o/dyld_priv.h>
+#include <sys/syscall.h>
+#include <sys/stackshot.h>
+
+T_GLOBAL_META(
+               T_META_NAMESPACE("xnu.stackshot"),
+               T_META_CHECK_LEAKS(false),
+               T_META_ASROOT(true)
+               );
+
+static const char *current_process_name(void);
+static void verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count);
+static void parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid);
+static void parse_thread_group_stackshot(void **ssbuf, size_t sslen);
+static uint64_t stackshot_timestamp(void *ssbuf, size_t sslen);
+static void initialize_thread(void);
+
+#define DEFAULT_STACKSHOT_BUFFER_SIZE (1024 * 1024)
+#define MAX_STACKSHOT_BUFFER_SIZE     (6 * 1024 * 1024)
+
+/* bit flags for parse_stackshot */
+#define PARSE_STACKSHOT_DELTA                0x01
+#define PARSE_STACKSHOT_ZOMBIE               0x02
+#define PARSE_STACKSHOT_SHAREDCACHE_LAYOUT   0x04
+#define PARSE_STACKSHOT_DISPATCH_QUEUE_LABEL 0x08
+#define PARSE_STACKSHOT_TURNSTILEINFO        0x10
+
+#define TEST_STACKSHOT_QUEUE_LABEL        "houston.we.had.a.problem"
+#define TEST_STACKSHOT_QUEUE_LABEL_LENGTH sizeof(TEST_STACKSHOT_QUEUE_LABEL)
+
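+/*
+ * Take a microstackshot into a malloc'd buffer, doubling the size whenever
+ * the syscall fails with ENOSPC and giving up once the buffer would grow
+ * past MAX_STACKSHOT_BUFFER_SIZE.
+ */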
+T_DECL(microstackshots, "test the microstackshot syscall")
+{
+       void *buf = NULL;
+       unsigned int size = DEFAULT_STACKSHOT_BUFFER_SIZE;
+
+       while (1) {
+               buf = malloc(size);
+               T_QUIET; T_ASSERT_NOTNULL(buf, "allocated stackshot buffer");
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+               int len = syscall(SYS_microstackshot, buf, size,
+                               STACKSHOT_GET_MICROSTACKSHOT);
+#pragma clang diagnostic pop
+               if (len == -1 && errno == ENOSYS) {
+                       T_SKIP("microstackshot syscall failed, likely not compiled with CONFIG_TELEMETRY");
+               }
+               if (len == -1 && errno == ENOSPC) {
+                       /* syscall failed because buffer wasn't large enough, try again */
+                       free(buf);
+                       buf = NULL;
+                       size *= 2;
+                       T_ASSERT_LE(size, (unsigned int)MAX_STACKSHOT_BUFFER_SIZE,
+                                       "growing stackshot buffer to sane size");
+                       continue;
+               }
+               T_ASSERT_POSIX_SUCCESS(len, "called microstackshot syscall");
+               break;
+       }
+
+       T_EXPECT_EQ(*(uint32_t *)buf,
+                       (uint32_t)STACKSHOT_MICRO_SNAPSHOT_MAGIC,
+                       "magic value for microstackshot matches");
+
+       free(buf);
+}
+
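+/*
+ * Each test drives take_stackshot() with a scenario: the stackshot flags to
+ * set, an optional target pid and delta-snapshot base timestamp, a size
+ * hint, expectations about failure or lack of support, and (for the perf
+ * variants) a dt_stat timer to charge the capture time to.
+ */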
+struct scenario {
+       const char *name;
+       uint32_t flags;
+       bool quiet;
+       bool should_fail;
+       bool maybe_unsupported;
+       pid_t target_pid;
+       uint64_t since_timestamp;
+       uint32_t size_hint;
+       dt_stat_time_t timer;
+};
+
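+/*
+ * Suppress logging of the next passing check when the scenario is marked
+ * quiet or is being timed as part of a perf test.
+ */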
+static void
+quiet(struct scenario *scenario)
+{
+       if (scenario->timer || scenario->quiet) {
+               T_QUIET;
+       }
+}
+
+static void
+take_stackshot(struct scenario *scenario, void (^cb)(void *buf, size_t size))
+{
+       initialize_thread();
+
+       void *config = stackshot_config_create();
+       quiet(scenario);
+       T_ASSERT_NOTNULL(config, "created stackshot config");
+
+       int ret = stackshot_config_set_flags(config, scenario->flags);
+       quiet(scenario);
+       T_ASSERT_POSIX_ZERO(ret, "set flags %#x on stackshot config", scenario->flags);
+
+       if (scenario->size_hint > 0) {
+               ret = stackshot_config_set_size_hint(config, scenario->size_hint);
+               quiet(scenario);
+               T_ASSERT_POSIX_ZERO(ret, "set size hint %" PRIu32 " on stackshot config",
+                               scenario->size_hint);
+       }
+
+       if (scenario->target_pid > 0) {
+               ret = stackshot_config_set_pid(config, scenario->target_pid);
+               quiet(scenario);
+               T_ASSERT_POSIX_ZERO(ret, "set target pid %d on stackshot config",
+                               scenario->target_pid);
+       }
+
+       if (scenario->since_timestamp > 0) {
+               ret = stackshot_config_set_delta_timestamp(config, scenario->since_timestamp);
+               quiet(scenario);
+               T_ASSERT_POSIX_ZERO(ret, "set since timestamp %" PRIu64 " on stackshot config",
+                               scenario->since_timestamp);
+       }
+
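+       /*
+        * Stackshots can transiently fail with EBUSY or ETIMEDOUT (e.g. when
+        * another stackshot is already in progress), so retry a bounded
+        * number of times before treating the error as fatal.
+        */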
+       int retries_remaining = 5;
+
+retry: ;
+       uint64_t start_time = mach_absolute_time();
+       ret = stackshot_capture_with_config(config);
+       uint64_t end_time = mach_absolute_time();
+
+       if (scenario->should_fail) {
+               T_EXPECTFAIL;
+               T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
+               return;
+       }
+
+       if (ret == EBUSY || ret == ETIMEDOUT) {
+               if (retries_remaining > 0) {
+                       if (!scenario->timer) {
+                               T_LOG("stackshot_capture_with_config failed with %s (%d), retrying",
+                                               strerror(ret), ret);
+                       }
+
+                       retries_remaining--;
+                       goto retry;
+               } else {
+                       T_ASSERT_POSIX_ZERO(ret,
+                                       "called stackshot_capture_with_config (no retries remaining)");
+               }
+       } else if ((ret == ENOTSUP) && scenario->maybe_unsupported) {
+               T_SKIP("kernel indicated this stackshot configuration is not supported");
+       } else {
+               quiet(scenario);
+               T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
+       }
+
+       if (scenario->timer) {
+               dt_stat_mach_time_add(scenario->timer, end_time - start_time);
+       }
+       void *buf = stackshot_config_get_stackshot_buffer(config);
+       size_t size = stackshot_config_get_stackshot_size(config);
+       if (scenario->name) {
+               char sspath[MAXPATHLEN];
+               strlcpy(sspath, scenario->name, sizeof(sspath));
+               strlcat(sspath, ".kcdata", sizeof(sspath));
+               T_QUIET; T_ASSERT_POSIX_ZERO(dt_resultfile(sspath, sizeof(sspath)),
+                               "create result file path");
+
+               if (!scenario->quiet) {
+                       T_LOG("writing stackshot to %s", sspath);
+               }
+
+               FILE *f = fopen(sspath, "w");
+               T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(f,
+                               "open stackshot output file");
+
+               size_t written = fwrite(buf, size, 1, f);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(written, "wrote stackshot to file");
+
+               fclose(f);
+       }
+       cb(buf, size);
+
+       ret = stackshot_config_dealloc(config);
+       T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config");
+}
+
+T_DECL(kcdata, "test that kcdata stackshots can be taken and parsed")
+{
+       struct scenario scenario = {
+               .name = "kcdata",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS |
+                               STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       T_LOG("taking kcdata stackshot");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_stackshot(0, ssbuf, sslen, -1);
+       });
+}
+
+T_DECL(kcdata_faulting, "test that kcdata stackshots while faulting can be taken and parsed")
+{
+       struct scenario scenario = {
+               .name = "faulting",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT
+                               | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING),
+       };
+
+       T_LOG("taking faulting stackshot");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_stackshot(0, ssbuf, sslen, -1);
+       });
+}
+
+T_DECL(bad_flags, "test a poorly-formed stackshot syscall")
+{
+       struct scenario scenario = {
+               .flags = STACKSHOT_SAVE_IN_KERNEL_BUFFER /* not allowed from user space */,
+               .should_fail = true,
+       };
+
+       T_LOG("attempting to take stackshot with kernel-only flag");
+       take_stackshot(&scenario, ^(__unused void *ssbuf, __unused size_t sslen) {
+               T_ASSERT_FAIL("stackshot data callback called");
+       });
+}
+
+T_DECL(delta, "test delta stackshots")
+{
+       struct scenario scenario = {
+               .name = "delta",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       T_LOG("taking full stackshot");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen);
+
+               T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time);
+
+               parse_stackshot(0, ssbuf, sslen, -1);
+
+               struct scenario delta_scenario = {
+                       .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                                       | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT
+                                       | STACKSHOT_COLLECT_DELTA_SNAPSHOT),
+                       .since_timestamp = stackshot_time
+               };
+
+               take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) {
+                       parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1);
+               });
+       });
+}
+
+T_DECL(shared_cache_layout, "test stackshot inclusion of shared cache layout")
+{
+       struct scenario scenario = {
+               .name = "shared_cache_layout",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT |
+                               STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT),
+       };
+
+       size_t shared_cache_length;
+       const void *cache_header = _dyld_get_shared_cache_range(&shared_cache_length);
+       if (cache_header == NULL) {
+               T_SKIP("Device not running with shared cache, skipping test...");
+       }
+
+       if (shared_cache_length == 0) {
+               T_SKIP("dyld reports that currently running shared cache has zero length");
+       }
+
+       T_LOG("taking stackshot with STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT set");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_stackshot(PARSE_STACKSHOT_SHAREDCACHE_LAYOUT, ssbuf, sslen, -1);
+       });
+}
+
+T_DECL(stress, "test that taking stackshots for 60 seconds doesn't crash the system")
+{
+       uint64_t max_diff_time = 60ULL /* seconds */ * 1000000000ULL;
+       uint64_t start_time;
+
+       struct scenario scenario = {
+               .name = "stress",
+               .quiet = true,
+               .flags = (STACKSHOT_KCDATA_FORMAT |
+                               STACKSHOT_THREAD_WAITINFO |
+                               STACKSHOT_SAVE_LOADINFO |
+                               STACKSHOT_SAVE_KEXT_LOADINFO |
+                               STACKSHOT_GET_GLOBAL_MEM_STATS |
+                               // STACKSHOT_GET_BOOT_PROFILE |
+                               STACKSHOT_SAVE_IMP_DONATION_PIDS |
+                               STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT |
+                               STACKSHOT_THREAD_GROUP |
+                               STACKSHOT_SAVE_JETSAM_COALITIONS |
+                               STACKSHOT_ASID |
+                               // STACKSHOT_PAGE_TABLES |
+                               0),
+       };
+
+       start_time = clock_gettime_nsec_np(CLOCK_MONOTONIC);
+       while (clock_gettime_nsec_np(CLOCK_MONOTONIC) - start_time < max_diff_time) {
+               take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+                       printf(".");
+                       fflush(stdout);
+               });
+
+               /* Leave some time for the testing infrastructure to catch up */
+               usleep(10000);
+
+       }
+       printf("\n");
+}
+
+T_DECL(dispatch_queue_label, "test that kcdata stackshots contain libdispatch queue labels")
+{
+       struct scenario scenario = {
+               .name = "kcdata",
+               .flags = (STACKSHOT_GET_DQ | STACKSHOT_KCDATA_FORMAT),
+       };
+       dispatch_semaphore_t child_ready_sem, parent_done_sem;
+       dispatch_queue_t dq;
+
+#if TARGET_OS_WATCH
+       T_SKIP("This test is flaky on watches: 51663346");
+#endif
+
+       child_ready_sem = dispatch_semaphore_create(0);
+       T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "dqlabel child semaphore");
+
+       parent_done_sem = dispatch_semaphore_create(0);
+       T_QUIET; T_ASSERT_NOTNULL(parent_done_sem, "dqlabel parent semaphore");
+
+       dq = dispatch_queue_create(TEST_STACKSHOT_QUEUE_LABEL, NULL);
+       T_QUIET; T_ASSERT_NOTNULL(dq, "dispatch queue");
+
+       /* start the helper thread */
+       dispatch_async(dq, ^{
+                       dispatch_semaphore_signal(child_ready_sem);
+
+                       dispatch_semaphore_wait(parent_done_sem, DISPATCH_TIME_FOREVER);
+       });
+
+       /* block behind the child starting up */
+       dispatch_semaphore_wait(child_ready_sem, DISPATCH_TIME_FOREVER);
+
+       T_LOG("taking kcdata stackshot with libdispatch queue labels");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_stackshot(PARSE_STACKSHOT_DISPATCH_QUEUE_LABEL, ssbuf, sslen, -1);
+       });
+
+       dispatch_semaphore_signal(parent_done_sem);
+}
+
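+/*
+ * Zombie test plan: the child wedges one of its threads in the kernel via
+ * the kern.wedge_thread sysctl, signals the parent, and exits.  The parent
+ * waits for the child to become a zombie, takes a stackshot that should
+ * include the terminated task, then unwedges the thread and reaps the child.
+ */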
+static void *stuck_sysctl_thread(void *arg) {
+       int val = 1;
+       dispatch_semaphore_t child_thread_started = *(dispatch_semaphore_t *)arg;
+
+       dispatch_semaphore_signal(child_thread_started);
+       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.wedge_thread", NULL, NULL, &val, sizeof(val)), "wedge child thread");
+
+       return NULL;
+}
+
+T_HELPER_DECL(zombie_child, "child process to sample as a zombie")
+{
+       pthread_t pthread;
+       dispatch_semaphore_t child_thread_started = dispatch_semaphore_create(0);
+       T_QUIET; T_ASSERT_NOTNULL(child_thread_started, "zombie child thread semaphore");
+
+       /* spawn another thread to get stuck in the kernel, then call exit() to become a zombie */
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&pthread, NULL, stuck_sysctl_thread, &child_thread_started), "pthread_create");
+
+       dispatch_semaphore_wait(child_thread_started, DISPATCH_TIME_FOREVER);
+
+       /* sleep for a bit in the hope of ensuring that the other thread has called the sysctl before we signal the parent */
+       usleep(100);
+       T_ASSERT_POSIX_SUCCESS(kill(getppid(), SIGUSR1), "signaled parent to take stackshot");
+
+       exit(0);
+}
+
+T_DECL(zombie, "tests a stackshot of a zombie task with a thread stuck in the kernel")
+{
+       char path[PATH_MAX];
+       uint32_t path_size = sizeof(path);
+       T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath");
+       char *args[] = { path, "-n", "zombie_child", NULL };
+
+       dispatch_source_t child_sig_src;
+       dispatch_semaphore_t child_ready_sem = dispatch_semaphore_create(0);
+       T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "zombie child semaphore");
+
+       dispatch_queue_t signal_processing_q = dispatch_queue_create("signal processing queue", NULL);
+       T_QUIET; T_ASSERT_NOTNULL(signal_processing_q, "signal processing queue");
+
+       pid_t pid;
+
+       T_LOG("spawning a child");
+
+       signal(SIGUSR1, SIG_IGN);
+       child_sig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, signal_processing_q);
+       T_QUIET; T_ASSERT_NOTNULL(child_sig_src, "dispatch_source_create (child_sig_src)");
+
+       dispatch_source_set_event_handler(child_sig_src, ^{ dispatch_semaphore_signal(child_ready_sem); });
+       dispatch_activate(child_sig_src);
+
+       int sp_ret = posix_spawn(&pid, args[0], NULL, NULL, args, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", args[0], pid);
+
+       dispatch_semaphore_wait(child_ready_sem, DISPATCH_TIME_FOREVER);
+
+       T_LOG("received signal from child, capturing stackshot");
+
+       struct proc_bsdshortinfo bsdshortinfo;
+       int retval, iterations_to_wait = 10;
+
+       while (iterations_to_wait > 0) {
+               retval = proc_pidinfo(pid, PROC_PIDT_SHORTBSDINFO, 0, &bsdshortinfo, sizeof(bsdshortinfo));
+               if ((retval == 0) && errno == ESRCH) {
+                       T_LOG("unable to find child using proc_pidinfo, assuming zombie");
+                       break;
+               }
+
+               T_QUIET; T_WITH_ERRNO; T_ASSERT_GT(retval, 0, "proc_pidinfo(PROC_PIDT_SHORTBSDINFO) returned a value > 0");
+               T_QUIET; T_ASSERT_EQ(retval, (int)sizeof(bsdshortinfo), "proc_pidinfo call for PROC_PIDT_SHORTBSDINFO returned expected size");
+
+               if (bsdshortinfo.pbsi_flags & PROC_FLAG_INEXIT) {
+                       T_LOG("child proc info marked as in exit");
+                       break;
+               }
+
+               iterations_to_wait--;
+               if (iterations_to_wait == 0) {
+                       /*
+                        * This will mark the test as failed but let it continue so we
+                        * don't leave a process stuck in the kernel.
+                        */
+                       T_FAIL("unable to discover that child is marked as exiting");
+               }
+
+               /* Give the child a few more seconds to make it to exit */
+               sleep(5);
+       }
+
+       /* Give the child some more time to make it through exit */
+       sleep(10);
+
+       struct scenario scenario = {
+               .name = "zombie",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       take_stackshot(&scenario, ^( void *ssbuf, size_t sslen) {
+               /* First unwedge the child so we can reap it */
+               int val = 1, status;
+               T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.unwedge_thread", NULL, NULL, &val, sizeof(val)), "unwedge child");
+
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on zombie child");
+
+               parse_stackshot(PARSE_STACKSHOT_ZOMBIE, ssbuf, sslen, pid);
+       });
+}
+
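+/*
+ * Return the thread's user promotion base priority, as reported by the
+ * THREAD_POLICY_STATE policy.
+ */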
+static uint32_t
+get_user_promotion_basepri(void)
+{
+       mach_msg_type_number_t count = THREAD_POLICY_STATE_COUNT;
+       struct thread_policy_state thread_policy;
+       boolean_t get_default = FALSE;
+       mach_port_t thread_port = pthread_mach_thread_np(pthread_self());
+
+       kern_return_t kr = thread_policy_get(thread_port, THREAD_POLICY_STATE,
+           (thread_policy_t)&thread_policy, &count, &get_default);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_policy_get");
+       return thread_policy.thps_user_promotion_basepri;
+}
+
+static int
+get_pri(thread_t thread_port)
+{
+       kern_return_t kr;
+
+       thread_extended_info_data_t extended_info;
+       mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT;
+       kr = thread_info(thread_port, THREAD_EXTENDED_INFO,
+           (thread_info_t)&extended_info, &count);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info");
+
+       return extended_info.pth_curpri;
+}
+
+
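+/*
+ * Exercise turnstile promotion: block workers on two serial queues behind
+ * mutexes held across threads, spin until the main thread's user promotion
+ * base priority changes, then take a stackshot with thread wait info.
+ */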
+T_DECL(turnstile_singlehop, "turnstile single hop test")
+{
+       dispatch_queue_t dq1, dq2;
+       dispatch_semaphore_t sema_x;
+       dispatch_queue_attr_t dq1_attr, dq2_attr;
+       qos_class_t main_qos = 0;
+       int main_relpri = 0, main_relpri2 = 0, main_afterpri = 0;
+       struct scenario scenario = {
+               .name = "turnstile_singlehop",
+               .flags = (STACKSHOT_THREAD_WAITINFO | STACKSHOT_KCDATA_FORMAT),
+       };
+       dq1_attr = dispatch_queue_attr_make_with_qos_class(DISPATCH_QUEUE_SERIAL, QOS_CLASS_UTILITY, 0);
+       dq2_attr = dispatch_queue_attr_make_with_qos_class(DISPATCH_QUEUE_SERIAL, QOS_CLASS_USER_INITIATED, 0);
+       pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
+       pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;
+
+       pthread_mutex_t *lockap = &lock_a, *lockbp = &lock_b;
+
+       dq1 = dispatch_queue_create("q1", dq1_attr);
+       dq2 = dispatch_queue_create("q2", dq2_attr);
+       sema_x = dispatch_semaphore_create(0);
+
+       pthread_mutex_lock(lockap);
+       dispatch_async(dq1, ^{
+               pthread_mutex_lock(lockbp);
+               T_ASSERT_POSIX_SUCCESS(pthread_get_qos_class_np(pthread_self(), &main_qos, &main_relpri), "get qos class");
+               T_LOG("The priority of q1 is %d\n", get_pri(mach_thread_self()));
+               dispatch_semaphore_signal(sema_x);
+               pthread_mutex_lock(lockap);
+       });
+       dispatch_semaphore_wait(sema_x, DISPATCH_TIME_FOREVER);
+
+       T_LOG("Async1 completed");
+
+       pthread_set_qos_class_self_np(QOS_CLASS_UTILITY, 0);
+       T_ASSERT_POSIX_SUCCESS(pthread_get_qos_class_np(pthread_self(), &main_qos, &main_relpri), "get qos class");
+       T_LOG("The priority of main is %d\n", get_pri(mach_thread_self()));
+       main_relpri = get_pri(mach_thread_self());
+
+       dispatch_async(dq2, ^{
+               T_ASSERT_POSIX_SUCCESS(pthread_get_qos_class_np(pthread_self(), &main_qos, &main_relpri2), "get qos class");
+               T_LOG("The priority of q2 is %d\n", get_pri(mach_thread_self()));
+               dispatch_semaphore_signal(sema_x);
+               pthread_mutex_lock(lockbp);
+       });
+       dispatch_semaphore_wait(sema_x, DISPATCH_TIME_FOREVER);
+
+       T_LOG("Async2 completed");
+
+       while (1) {
+               main_afterpri = get_user_promotion_basepri();
+               if (main_relpri != main_afterpri) {
+                       T_LOG("Success with promotion pri is %d", main_afterpri);
+                       break;
+               }
+
+               usleep(100);
+       }
+
+       take_stackshot(&scenario, ^( void *ssbuf, size_t sslen) {
+               parse_stackshot(PARSE_STACKSHOT_TURNSTILEINFO, ssbuf, sslen, -1);
+       });
+}
+
+
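+/*
+ * Every task and thread container in an instrs/cycles stackshot should carry
+ * a STACKSHOT_KCTYPE_INSTRS_CYCLES record; track container begin/end events
+ * and check the flag whenever a container is closed.
+ */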
+static void
+expect_instrs_cycles_in_stackshot(void *ssbuf, size_t sslen)
+{
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+
+       bool in_task = false;
+       bool in_thread = false;
+       bool saw_instrs_cycles = false;
+       iter = kcdata_iter_next(iter);
+
+       KCDATA_ITER_FOREACH(iter) {
+               switch (kcdata_iter_type(iter)) {
+               case KCDATA_TYPE_CONTAINER_BEGIN:
+                       switch (kcdata_iter_container_type(iter)) {
+                       case STACKSHOT_KCCONTAINER_TASK:
+                               in_task = true;
+                               saw_instrs_cycles = false;
+                               break;
+
+                       case STACKSHOT_KCCONTAINER_THREAD:
+                               in_thread = true;
+                               saw_instrs_cycles = false;
+                               break;
+
+                       default:
+                               break;
+                       }
+                       break;
+
+               case STACKSHOT_KCTYPE_INSTRS_CYCLES:
+                       saw_instrs_cycles = true;
+                       break;
+
+               case KCDATA_TYPE_CONTAINER_END:
+                       if (in_thread) {
+                               T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles,
+                                               "saw instructions and cycles in thread");
+                               in_thread = false;
+                       } else if (in_task) {
+                               T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles,
+                                               "saw instructions and cycles in task");
+                               in_task = false;
+                       }
+
+               default:
+                       break;
+               }
+       }
+}
+
+static void
+skip_if_monotonic_unsupported(void)
+{
+       int supported = 0;
+       size_t supported_size = sizeof(supported);
+       int ret = sysctlbyname("kern.monotonic.supported", &supported,
+                       &supported_size, 0, 0);
+       if (ret < 0 || !supported) {
+               T_SKIP("monotonic is unsupported");
+       }
+}
+
+T_DECL(instrs_cycles, "test getting instructions and cycles in a stackshot")
+{
+       skip_if_monotonic_unsupported();
+
+       struct scenario scenario = {
+               .name = "instrs-cycles",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
+                               | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       T_LOG("attempting to take stackshot with instructions and cycles");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_stackshot(0, ssbuf, sslen, -1);
+               expect_instrs_cycles_in_stackshot(ssbuf, sslen);
+       });
+}
+
+T_DECL(delta_instrs_cycles,
+               "test delta stackshots with instructions and cycles")
+{
+       skip_if_monotonic_unsupported();
+
+       struct scenario scenario = {
+               .name = "delta-instrs-cycles",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
+                               | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       T_LOG("taking full stackshot");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen);
+
+               T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time);
+
+               parse_stackshot(0, ssbuf, sslen, -1);
+               expect_instrs_cycles_in_stackshot(ssbuf, sslen);
+
+               struct scenario delta_scenario = {
+                       .name = "delta-instrs-cycles-next",
+                       .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
+                                       | STACKSHOT_KCDATA_FORMAT
+                                       | STACKSHOT_COLLECT_DELTA_SNAPSHOT),
+                       .since_timestamp = stackshot_time,
+               };
+
+               take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) {
+                       parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1);
+                       expect_instrs_cycles_in_stackshot(dssbuf, dsslen);
+               });
+       });
+}
+
+static void
+check_thread_groups_supported(void)
+{
+       int err;
+       int supported = 0;
+       size_t supported_size = sizeof(supported);
+       err = sysctlbyname("kern.thread_groups_supported", &supported, &supported_size, NULL, 0);
+
+       if (err || !supported) {
+               T_SKIP("thread groups not supported on this system");
+       }
+}
+
+T_DECL(thread_groups, "test getting thread groups in stackshot")
+{
+       check_thread_groups_supported();
+
+       struct scenario scenario = {
+               .name = "thread-groups",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_THREAD_GROUP
+                               | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       T_LOG("attempting to take stackshot with thread group flag");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_thread_group_stackshot(ssbuf, sslen);
+       });
+}
+
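+/*
+ * STACKSHOT_KCTYPE_PAGE_TABLES is an array of uint64_t records: each region
+ * begins with a four-word header (table physical address, entry count, and
+ * the start/end VAs it maps) followed by that many raw TTE values.
+ */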
+static void
+parse_page_table_asid_stackshot(void **ssbuf, size_t sslen)
+{
+       bool seen_asid = false;
+       bool seen_page_table_snapshot = false;
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+       T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
+                       "buffer provided is a stackshot");
+
+       iter = kcdata_iter_next(iter);
+       KCDATA_ITER_FOREACH(iter) {
+               switch (kcdata_iter_type(iter)) {
+               case KCDATA_TYPE_ARRAY: {
+                       T_QUIET;
+                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
+                                       "checked that array is valid");
+
+                       if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_PAGE_TABLES) {
+                               continue;
+                       }
+
+                       T_ASSERT_FALSE(seen_page_table_snapshot, "check that we haven't yet seen a page table snapshot");
+                       seen_page_table_snapshot = true;
+
+                       T_ASSERT_EQ((size_t) kcdata_iter_array_elem_size(iter), sizeof(uint64_t),
+                               "check that each element of the pagetable dump is the expected size");
+
+                       uint64_t *pt_array = kcdata_iter_payload(iter);
+                       uint32_t elem_count = kcdata_iter_array_elem_count(iter);
+                       uint32_t j;
+                       bool nonzero_tte = false;
+                       for (j = 0; j < elem_count;) {
+                               T_QUIET; T_ASSERT_LE(j + 4, elem_count, "check for valid page table segment header");
+                               uint64_t pa = pt_array[j];
+                               uint64_t num_entries = pt_array[j + 1];
+                               uint64_t start_va = pt_array[j + 2];
+                               uint64_t end_va = pt_array[j + 3];
+
+                               T_QUIET; T_ASSERT_NE(pa, (uint64_t) 0, "check that the pagetable physical address is non-zero");
+                               T_QUIET; T_ASSERT_EQ(pa % (num_entries * sizeof(uint64_t)), (uint64_t) 0, "check that the pagetable physical address is correctly aligned");
+                               T_QUIET; T_ASSERT_NE(num_entries, (uint64_t) 0, "check that a pagetable region has more than 0 entries");
+                               T_QUIET; T_ASSERT_LE(j + 4 + num_entries, (uint64_t) elem_count, "check for sufficient space in page table array");
+                               T_QUIET; T_ASSERT_GT(end_va, start_va, "check for valid VA bounds in page table segment header");
+
+                               for (uint32_t k = j + 4; k < (j + 4 + num_entries); ++k) {
+                                       if (pt_array[k] != 0) {
+                                               nonzero_tte = true;
+                                               T_QUIET; T_ASSERT_EQ((pt_array[k] >> 48) & 0xf, (uint64_t) 0, "check that bits[48:51] of arm64 TTE are clear");
+                                               // L0-L2 table and non-compressed L3 block entries should always have bit 1 set; assumes L0-L2 blocks will not be used outside the kernel
+                                               bool table = ((pt_array[k] & 0x2) != 0);
+                                               if (table) {
+                                                       T_QUIET; T_ASSERT_NE(pt_array[k] & ((1ULL << 48) - 1) & ~((1ULL << 12) - 1), (uint64_t) 0, "check that arm64 TTE physical address is non-zero");
+                                               } else { // should be a compressed PTE
+                                                       T_QUIET; T_ASSERT_NE(pt_array[k] & 0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has at least one of bits [63:62] set");
+                                                       T_QUIET; T_ASSERT_EQ(pt_array[k] & ~0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has no other bits besides [63:62] set");
+                                               }
+                                       }
+                               }
+
+                               j += (4 + num_entries);
+                       }
+                       T_ASSERT_TRUE(nonzero_tte, "check that we saw at least one non-empty TTE");
+                       T_ASSERT_EQ(j, elem_count, "check that page table dump size matches extent of last header"); 
+                       break;
+               }
+               case STACKSHOT_KCTYPE_ASID: {
+                       T_ASSERT_FALSE(seen_asid, "check that we haven't yet seen an ASID");
+                       seen_asid = true;
+               }
+               }
+       }
+       T_ASSERT_TRUE(seen_page_table_snapshot, "check that we have seen a page table snapshot");
+       T_ASSERT_TRUE(seen_asid, "check that we have seen an ASID");
+}
+
+T_DECL(dump_page_tables, "test stackshot page table dumping support")
+{
+       struct scenario scenario = {
+               .name = "asid-page-tables",
+               .flags = (STACKSHOT_KCDATA_FORMAT | STACKSHOT_ASID | STACKSHOT_PAGE_TABLES),
+               .size_hint = (1ULL << 23), // 8 MB
+               .target_pid = getpid(),
+               .maybe_unsupported = true,
+       };
+
+       T_LOG("attempting to take stackshot with ASID and page table flags");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_page_table_asid_stackshot(ssbuf, sslen);
+       });
+}
+
+static void stackshot_verify_current_proc_uuid_info(void **ssbuf, size_t sslen, uint64_t expected_offset, const struct proc_uniqidentifierinfo *proc_info_data)
+{
+       const uuid_t *current_uuid = (const uuid_t *)(&proc_info_data->p_uuid);
+
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+       T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, "buffer provided is a stackshot");
+
+       iter = kcdata_iter_next(iter);
+
+       KCDATA_ITER_FOREACH(iter) {
+               switch (kcdata_iter_type(iter)) {
+                       case KCDATA_TYPE_ARRAY: {
+                               T_QUIET; T_ASSERT_TRUE(kcdata_iter_array_valid(iter), "checked that array is valid");
+                               if (kcdata_iter_array_elem_type(iter) == KCDATA_TYPE_LIBRARY_LOADINFO64) {
+                                       struct user64_dyld_uuid_info *info = (struct user64_dyld_uuid_info *) kcdata_iter_payload(iter);
+                                       if (uuid_compare(*current_uuid, info->imageUUID) == 0) {
+                                               T_ASSERT_EQ(expected_offset, info->imageLoadAddress, "found matching UUID with matching binary offset");
+                                               return;
+                                       }
+                               } else if (kcdata_iter_array_elem_type(iter) == KCDATA_TYPE_LIBRARY_LOADINFO) {
+                                       struct user32_dyld_uuid_info *info = (struct user32_dyld_uuid_info *) kcdata_iter_payload(iter);
+                                       if (uuid_compare(*current_uuid, info->imageUUID) == 0) {
+                                               T_ASSERT_EQ(expected_offset, ((uint64_t) info->imageLoadAddress),  "found matching UUID with matching binary offset");
+                                               return;
+                                       }
+                               }
+                               break;
+                       }
+                       default:
+                               break;
+               }
+       }
+
+       T_FAIL("failed to find matching UUID in stackshot data");
+}
+
+T_DECL(proc_uuid_info, "tests that the main binary UUID for a proc is always populated")
+{
+       struct proc_uniqidentifierinfo proc_info_data = { };
+       mach_msg_type_number_t      count;
+       kern_return_t               kernel_status;
+       task_dyld_info_data_t       task_dyld_info;
+       struct dyld_all_image_infos *target_infos;
+       int retval;
+       bool found_image_in_image_infos = false;
+       uint64_t expected_mach_header_offset = 0;
+
+       /* Find the UUID of our main binary */
+       retval = proc_pidinfo(getpid(), PROC_PIDUNIQIDENTIFIERINFO, 0, &proc_info_data, sizeof(proc_info_data));
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(retval, "proc_pidinfo PROC_PIDUNIQIDENTIFIERINFO");
+       T_QUIET; T_ASSERT_EQ_INT(retval, (int) sizeof(proc_info_data), "proc_pidinfo PROC_PIDUNIQIDENTIFIERINFO returned data");
+
+       uuid_string_t str = {};
+       uuid_unparse(*(uuid_t*)&proc_info_data.p_uuid, str);
+       T_LOG("Current UUID is %s", str);
+
+       /* Find the location of the dyld image info metadata */
+       count = TASK_DYLD_INFO_COUNT;
+       kernel_status = task_info(mach_task_self(), TASK_DYLD_INFO, (task_info_t)&task_dyld_info, &count);
+       T_QUIET; T_ASSERT_EQ(kernel_status, KERN_SUCCESS, "retrieve task_info for TASK_DYLD_INFO");
+
+       target_infos = (struct dyld_all_image_infos *)task_dyld_info.all_image_info_addr;
+
+       /* Find our binary in the dyld image info array */
+       for (int i = 0; i < (int) target_infos->uuidArrayCount; i++) {
+               if (uuid_compare(target_infos->uuidArray[i].imageUUID, *(uuid_t*)&proc_info_data.p_uuid) == 0) {
+                       expected_mach_header_offset = (uint64_t) target_infos->uuidArray[i].imageLoadAddress;
+                       found_image_in_image_infos = true;
+               }
+       }
+
+       T_ASSERT_TRUE(found_image_in_image_infos, "found binary image in dyld image info list");
+
+       /* Overwrite the dyld image info data so the kernel has to fall back to the UUID stored in the proc structure */
+       target_infos->uuidArrayCount = 0;
+
+       struct scenario scenario = {
+               .name = "proc_uuid_info",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_KCDATA_FORMAT),
+               .target_pid = getpid(),
+       };
+
+       T_LOG("attempting to take stackshot for current PID");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               stackshot_verify_current_proc_uuid_info(ssbuf, sslen, expected_mach_header_offset, &proc_info_data);
+       });
+}
+
+#pragma mark performance tests
+
+#define SHOULD_REUSE_SIZE_HINT 0x01
+#define SHOULD_USE_DELTA       0x02
+#define SHOULD_TARGET_SELF     0x04
+
+static void
+stackshot_perf(unsigned int options)
+{
+       struct scenario scenario = {
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                       | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       dt_stat_t size = dt_stat_create("bytes", "size");
+       dt_stat_time_t duration = dt_stat_time_create("duration");
+       scenario.timer = duration;
+
+       if (options & SHOULD_TARGET_SELF) {
+               scenario.target_pid = getpid();
+       }
+
+       while (!dt_stat_stable(duration) || !dt_stat_stable(size)) {
+               __block uint64_t last_time = 0;
+               __block uint32_t size_hint = 0;
+               take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+                       dt_stat_add(size, (double)sslen);
+                       last_time = stackshot_timestamp(ssbuf, sslen);
+                       size_hint = (uint32_t)sslen;
+               });
+               if (options & SHOULD_USE_DELTA) {
+                       scenario.since_timestamp = last_time;
+                       scenario.flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT;
+               }
+               if (options & SHOULD_REUSE_SIZE_HINT) {
+                       scenario.size_hint = size_hint;
+               }
+       }
+
+       dt_stat_finalize(duration);
+       dt_stat_finalize(size);
+}
+
+T_DECL(perf_no_size_hint, "test stackshot performance with no size hint",
+               T_META_TAG_PERF)
+{
+       stackshot_perf(0);
+}
+
+T_DECL(perf_size_hint, "test stackshot performance with size hint",
+               T_META_TAG_PERF)
+{
+       stackshot_perf(SHOULD_REUSE_SIZE_HINT);
+}
+
+T_DECL(perf_process, "test stackshot performance targeted at process",
+               T_META_TAG_PERF)
+{
+       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_TARGET_SELF);
+}
+
+T_DECL(perf_delta, "test delta stackshot performance",
+               T_META_TAG_PERF)
+{
+       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA);
+}
+
+T_DECL(perf_delta_process, "test delta stackshot performance targeted at a process",
+               T_META_TAG_PERF)
+{
+       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA | SHOULD_TARGET_SELF);
+}
+
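+/*
+ * Return the KCDATA_TYPE_MACH_ABSOLUTE_TIME value recorded in a stackshot
+ * or delta stackshot; the perf scenarios above feed it back through
+ * scenario.since_timestamp when requesting STACKSHOT_COLLECT_DELTA_SNAPSHOT.
+ */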
+static uint64_t
+stackshot_timestamp(void *ssbuf, size_t sslen)
+{
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+
+       uint32_t type = kcdata_iter_type(iter);
+       if (type != KCDATA_BUFFER_BEGIN_STACKSHOT && type != KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT) {
+               T_ASSERT_FAIL("invalid kcdata type %u", kcdata_iter_type(iter));
+       }
+
+       iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME);
+       T_QUIET;
+       T_ASSERT_TRUE(kcdata_iter_valid(iter), "timestamp found in stackshot");
+
+       return *(uint64_t *)kcdata_iter_payload(iter);
+}
+
+#define TEST_THREAD_NAME "stackshot_test_thread"
+
+static void
+parse_thread_group_stackshot(void **ssbuf, size_t sslen)
+{
+       bool seen_thread_group_snapshot = false;
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+       T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
+                       "buffer provided is a stackshot");
+
+       NSMutableSet *thread_groups = [[NSMutableSet alloc] init];
+
+       iter = kcdata_iter_next(iter);
+       KCDATA_ITER_FOREACH(iter) {
+               switch (kcdata_iter_type(iter)) {
+               case KCDATA_TYPE_ARRAY: {
+                       T_QUIET;
+                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
+                                       "checked that array is valid");
+
+                       if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT) {
+                               continue;
+                       }
+
+                       seen_thread_group_snapshot = true;
+
+                       if (kcdata_iter_array_elem_size(iter) >= sizeof(struct thread_group_snapshot_v2)) {
+                               struct thread_group_snapshot_v2 *tgs_array = kcdata_iter_payload(iter);
+                               for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) {
+                                       struct thread_group_snapshot_v2 *tgs = tgs_array + j;
+                                       [thread_groups addObject:@(tgs->tgs_id)];
+                               }
+                       } else {
+                               struct thread_group_snapshot *tgs_array = kcdata_iter_payload(iter);
+                               for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) {
+                                       struct thread_group_snapshot *tgs = tgs_array + j;
+                                       [thread_groups addObject:@(tgs->tgs_id)];
+                               }
+                       }
+                       break;
+               }
+               }
+       }
+       /* restart iteration from the top; the first pass consumed the iterator */
+       iter = kcdata_iter_next(kcdata_iter(ssbuf, sslen));
+       KCDATA_ITER_FOREACH(iter) {
+               NSError *error = nil;
+
+               switch (kcdata_iter_type(iter)) {
+
+               case KCDATA_TYPE_CONTAINER_BEGIN: {
+                       T_QUIET;
+                       T_ASSERT_TRUE(kcdata_iter_container_valid(iter),
+                                       "checked that container is valid");
+
+                       if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_THREAD) {
+                               break;
+                       }
+
+                       NSDictionary *container = parseKCDataContainer(&iter, &error);
+                       T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot");
+                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container");
+
+                       int tg = [container[@"thread_snapshots"][@"thread_group"] intValue];
+
+                       T_ASSERT_TRUE([thread_groups containsObject:@(tg)], "check that the thread group the thread is in exists");
+
+                       break;
+               }
+
+               }
+       }
+       T_ASSERT_TRUE(seen_thread_group_snapshot, "check that we have seen a thread group snapshot");
+}
+
+static void
+verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count)
+{
+       uuid_t cur_shared_cache_uuid;
+       __block uint32_t lib_index = 0, libs_found = 0;
+
+       _dyld_get_shared_cache_uuid(cur_shared_cache_uuid);
+       int result = dyld_shared_cache_iterate_text(cur_shared_cache_uuid, ^(const dyld_shared_cache_dylib_text_info* info) {
+                       T_QUIET; T_ASSERT_LT(lib_index, uuid_count, "dyld_shared_cache_iterate_text exceeded number of libraries returned by kernel");
+
+                       libs_found++;
+                       struct dyld_uuid_info_64 *cur_stackshot_uuid_entry = &uuids[lib_index];
+                       T_QUIET; T_ASSERT_EQ(memcmp(info->dylibUuid, cur_stackshot_uuid_entry->imageUUID, sizeof(info->dylibUuid)), 0,
+                                       "dyld returned UUID doesn't match kernel returned UUID");
+                       T_QUIET; T_ASSERT_EQ(info->loadAddressUnslid, cur_stackshot_uuid_entry->imageLoadAddress,
+                                       "dyld returned load address doesn't match kernel returned load address");
+                       lib_index++;
+               });
+
+       T_ASSERT_EQ(result, 0, "iterate shared cache layout");
+       T_ASSERT_EQ(libs_found, uuid_count, "dyld iterator returned same number of libraries as kernel");
+
+       T_LOG("verified %d libraries from dyld shared cache", libs_found);
+}
+
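+/*
+ * Walk a stackshot (or delta stackshot) and verify the optional content
+ * selected by stackshot_parsing_flags (zombie child, shared cache layout
+ * and UUID, dispatch queue label, turnstile info), plus the main-thread
+ * invariants for the current task.
+ */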
+static void
+parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid)
+{
+       bool delta = (stackshot_parsing_flags & PARSE_STACKSHOT_DELTA);
+       bool expect_zombie_child = (stackshot_parsing_flags & PARSE_STACKSHOT_ZOMBIE);
+       bool expect_shared_cache_layout = false;
+       bool expect_shared_cache_uuid = !delta;
+       bool expect_dispatch_queue_label = (stackshot_parsing_flags & PARSE_STACKSHOT_DISPATCH_QUEUE_LABEL);
+       bool expect_turnstile_lock = (stackshot_parsing_flags & PARSE_STACKSHOT_TURNSTILEINFO);
+       bool found_zombie_child = false, found_shared_cache_layout = false, found_shared_cache_uuid = false;
+       bool found_dispatch_queue_label = false, found_turnstile_lock = false;
+
+       if (expect_shared_cache_uuid) {
+               uuid_t shared_cache_uuid;
+               if (!_dyld_get_shared_cache_uuid(shared_cache_uuid)) {
+                       T_LOG("Skipping shared cache UUID verification; process is not running with a shared cache");
+                       expect_shared_cache_uuid = false;
+               }
+       }
+
+       if (stackshot_parsing_flags & PARSE_STACKSHOT_SHAREDCACHE_LAYOUT) {
+               size_t shared_cache_length = 0;
+               const void *cache_header = _dyld_get_shared_cache_range(&shared_cache_length);
+               T_QUIET; T_ASSERT_NOTNULL(cache_header, "current process running with shared cache");
+               T_QUIET; T_ASSERT_GT(shared_cache_length, sizeof(struct _dyld_cache_header), "valid shared cache length populated by _dyld_get_shared_cache_range");
+
+               if (_dyld_shared_cache_is_locally_built()) {
+                       T_LOG("device running with locally built shared cache, expect shared cache layout");
+                       expect_shared_cache_layout = true;
+               } else {
+                       T_LOG("device running with B&I built shared-cache, no shared cache layout expected");
+               }
+       }
+
+       if (expect_zombie_child) {
+               T_QUIET; T_ASSERT_GT(child_pid, 0, "child pid greater than zero");
+       }
+
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+       if (delta) {
+               T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
+                               "buffer provided is a delta stackshot");
+       } else {
+               T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
+                               "buffer provided is a stackshot");
+       }
+
+       iter = kcdata_iter_next(iter);
+       KCDATA_ITER_FOREACH(iter) {
+               NSError *error = nil;
+
+               switch (kcdata_iter_type(iter)) {
+               case KCDATA_TYPE_ARRAY: {
+                       T_QUIET;
+                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
+                                       "checked that array is valid");
+
+                       NSMutableDictionary *array = parseKCDataArray(iter, &error);
+                       T_QUIET; T_ASSERT_NOTNULL(array, "parsed array from stackshot");
+                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing array");
+
+                       if (kcdata_iter_array_elem_type(iter) == STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT) {
+                               struct dyld_uuid_info_64 *shared_cache_uuids = kcdata_iter_payload(iter);
+                               uint32_t uuid_count = kcdata_iter_array_elem_count(iter);
+                               T_ASSERT_NOTNULL(shared_cache_uuids, "parsed shared cache layout array");
+                               T_ASSERT_GT(uuid_count, 0, "returned valid number of UUIDs from shared cache");
+                               verify_stackshot_sharedcache_layout(shared_cache_uuids, uuid_count);
+                               found_shared_cache_layout = true;
+                       }
+
+                       break;
+               }
+
+               case KCDATA_TYPE_CONTAINER_BEGIN: {
+                       T_QUIET;
+                       T_ASSERT_TRUE(kcdata_iter_container_valid(iter),
+                                       "checked that container is valid");
+
+                       if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) {
+                               break;
+                       }
+
+                       NSDictionary *container = parseKCDataContainer(&iter, &error);
+                       T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot");
+                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container");
+
+                       if (expect_dispatch_queue_label && !found_dispatch_queue_label) {
+                               for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) {
+                                       NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key];
+                                       NSString *dql = thread[@"dispatch_queue_label"];
+
+                                       if ([dql isEqualToString:@TEST_STACKSHOT_QUEUE_LABEL]) {
+                                               found_dispatch_queue_label = true;
+                                               break;
+                                       }
+                               }
+                       }
+
+                       int pid = [container[@"task_snapshots"][@"task_snapshot"][@"ts_pid"] intValue];
+                       if (expect_zombie_child && (pid == child_pid)) {
+                               found_zombie_child = true;
+
+                               uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue];
+                               T_ASSERT_TRUE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "child zombie marked as terminated");
+
+                               continue;
+                       } else if (pid != getpid()) {
+                               break;
+                       }
+
+                       T_EXPECT_EQ_STR(current_process_name(),
+                                       [container[@"task_snapshots"][@"task_snapshot"][@"ts_p_comm"] UTF8String],
+                                       "current process name matches in stackshot");
+
+                       uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue];
+                       T_ASSERT_FALSE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "current process not marked as terminated");
+
+                       T_QUIET;
+                       T_EXPECT_LE(pid, [container[@"task_snapshots"][@"task_snapshot"][@"ts_unique_pid"] intValue],
+                                       "unique pid is greater than or equal to pid");
+
+                       bool found_main_thread = false;
+                       uint64_t main_thread_id = -1;
+                       for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) {
+                               NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key];
+                               NSDictionary *thread_snap = thread[@"thread_snapshot"];
+
+                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_thread_id"] intValue], 0,
+                                               "thread ID of thread in current task is valid");
+                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_base_priority"] intValue], 0,
+                                               "base priority of thread in current task is valid");
+                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_sched_priority"] intValue], 0,
+                                               "scheduling priority of thread in current task is valid");
+
+                               NSString *pth_name = thread[@"pth_name"];
+                               if (pth_name != nil && [pth_name isEqualToString:@TEST_THREAD_NAME]) {
+                                       found_main_thread = true;
+                                       main_thread_id = [thread_snap[@"ths_thread_id"] intValue];
+
+                                       T_QUIET; T_EXPECT_GT([thread_snap[@"ths_total_syscalls"] intValue], 0,
+                                                       "total syscalls of current thread is valid");
+
+                                       NSDictionary *cpu_times = thread[@"cpu_times"];
+                                       T_EXPECT_GE([cpu_times[@"runnable_time"] intValue],
+                                                       [cpu_times[@"system_time"] intValue] +
+                                                       [cpu_times[@"user_time"] intValue],
+                                                       "runnable time of current thread is valid");
+                               }
+                       }
+                       T_EXPECT_TRUE(found_main_thread, "found main thread for current task in stackshot");
+
+                       if (expect_turnstile_lock && !found_turnstile_lock) {
+                               NSArray *tsinfos = container[@"task_snapshots"][@"thread_turnstileinfo"];
+
+                               for (id i in tsinfos) {
+                                       if ([i[@"turnstile_context"] intValue] == main_thread_id) {
+                                               found_turnstile_lock = true;
+                                               break;
+                                       }
+                               }
+                       }
+                       break;
+               }
+               case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: {
+                       struct dyld_uuid_info_64_v2 *shared_cache_info = kcdata_iter_payload(iter);
+                       uuid_t shared_cache_uuid;
+                       T_QUIET; T_ASSERT_TRUE(_dyld_get_shared_cache_uuid(shared_cache_uuid), "retrieve current shared cache UUID");
+                       T_QUIET; T_ASSERT_EQ(memcmp(shared_cache_info->imageUUID, shared_cache_uuid, sizeof(shared_cache_uuid)), 0,
+                                       "dyld returned UUID doesn't match kernel returned UUID for system shared cache");
+                       found_shared_cache_uuid = true;
+                       break;
+               }
+               }
+       }
+
+       if (expect_zombie_child) {
+               T_QUIET; T_ASSERT_TRUE(found_zombie_child, "found zombie child in kcdata");
+       }
+
+       if (expect_shared_cache_layout) {
+               T_QUIET; T_ASSERT_TRUE(found_shared_cache_layout, "shared cache layout found in kcdata");
+       }
+
+       if (expect_shared_cache_uuid) {
+               T_QUIET; T_ASSERT_TRUE(found_shared_cache_uuid, "shared cache UUID found in kcdata");
+       }
+
+       if (expect_dispatch_queue_label) {
+               T_QUIET; T_ASSERT_TRUE(found_dispatch_queue_label, "dispatch queue label found in kcdata");
+       }
+
+       if (expect_turnstile_lock) {
+               T_QUIET; T_ASSERT_TRUE(found_turnstile_lock, "found expected deadlock");
+       }
+
+       T_ASSERT_FALSE(KCDATA_ITER_FOREACH_FAILED(iter), "successfully iterated kcdata");
+}
+
+static const char *
+current_process_name(void)
+{
+       static char name[64];
+
+       if (!name[0]) {
+               int ret = proc_name(getpid(), name, sizeof(name));
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(ret, "proc_name failed for current process");
+       }
+
+       return name;
+}
+
+static void
+initialize_thread(void)
+{
+       int ret = pthread_setname_np(TEST_THREAD_NAME);
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(ret, "set thread name to %s", TEST_THREAD_NAME);
+}
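
Every parser in this file follows the same kcdata iteration shape. A minimal
sketch of that pattern, using only the iterator calls exercised above (the
switch cases are placeholders to fill in per test):

    static void
    parse_example(void *ssbuf, size_t sslen)
    {
        kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
        T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
                "buffer provided is a stackshot");

        /* step past the buffer-begin marker before iterating */
        iter = kcdata_iter_next(iter);
        KCDATA_ITER_FOREACH(iter) {
            switch (kcdata_iter_type(iter)) {
            case KCDATA_TYPE_ARRAY:
                /* kcdata_iter_array_elem_type()/_count() describe the payload */
                break;
            case KCDATA_TYPE_CONTAINER_BEGIN:
                /* parseKCDataContainer(&iter, &error) consumes the container */
                break;
            default:
                break;
            }
        }
        T_ASSERT_FALSE(KCDATA_ITER_FOREACH_FAILED(iter), "successfully iterated kcdata");
    }
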
index 6a8977bb529cf9b632730130a174a95a76429d5c..976026d96ebad709c4025860613edeed9762ddc0 100644 (file)
@@ -14,6 +14,8 @@
 #include <sysexits.h>
 #include <err.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 /*
  * Test to validate that suspended-spawn DTRTs when a SIGKILL is received
  * while the process is waiting for SIGCONT.
diff --git a/tests/sysctl_get_owned_vmobjects.c b/tests/sysctl_get_owned_vmobjects.c
new file mode 100644 (file)
index 0000000..f1a1ffb
--- /dev/null
@@ -0,0 +1,140 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <mach/mach.h>
+#include <mach/task_info.h>
+#include <mach/vm_region.h>
+#include <mach/mach_vm.h>
+#include <sys/kern_sysctl.h>
+#include <errno.h>
+
+static const char* g_sysctl_name = "vm.get_owned_vmobjects";
+
+static void
+main_test(void)
+{
+       int ret;
+       mach_port_name_t task_name;
+       vmobject_list_output_t out_buffer;
+       size_t out_size;
+       size_t output_size;
+       const vm_size_t tmp_size = 16 * 1024 * 1024; /* arbitrary size */
+       vm_address_t tmp_buf;
+       vm_address_t tmp_buf2;
+       mach_vm_size_t addr_size;
+       mach_vm_address_t addr;
+       kern_return_t kr;
+       mach_port_t __self = mach_task_self();
+       vm_region_submap_info_data_64_t regionInfo;
+       uint32_t nestingDepth;
+       mach_msg_type_number_t count;
+
+       /* allocate a temporary buffer */
+       kr = vm_allocate(__self, &tmp_buf, tmp_size, VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);
+       T_QUIET;
+       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate(%zu) error 0x%x (%s)",
+           tmp_size, kr, mach_error_string(kr));
+       T_QUIET;
+       T_EXPECT_NE(tmp_buf, 0UL, "failed to allocate temporary purgeable buffer\n");
+
+       kr = vm_allocate(__self, &tmp_buf2, tmp_size, VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);
+       T_QUIET;
+       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate(%zu) error 0x%x (%s)",
+           tmp_size, kr, mach_error_string(kr));
+       T_QUIET;
+       T_EXPECT_NE(tmp_buf2, 0UL, "failed to allocate temporary purgeable buffer\n");
+
+       /* expected failures */
+       out_size = tmp_size;
+       ret = sysctlbyname(g_sysctl_name, NULL, 0, NULL, 0);
+       T_EXPECT_EQ(ret, -1, "expected failure with 0 parameters\n");
+       T_EXPECT_EQ(errno, EINVAL, "expected EINVAL with 0 parameters\n");
+
+       ret = sysctlbyname(g_sysctl_name, (void*) tmp_buf, &out_size, NULL, 0);
+       T_EXPECT_EQ(ret, -1, "expected failure with no new parameters\n");
+       T_EXPECT_EQ(errno, EINVAL, "expected EINVAL with 0 new parameters\n");
+
+       out_size = tmp_size;
+       ret = sysctlbyname(g_sysctl_name, NULL, 0, (void*) tmp_buf, out_size);
+       T_EXPECT_EQ(ret, -1, "expected failure with no old parameters\n");
+       T_EXPECT_EQ(errno, EINVAL, "expected EINVAL with 0 old parameters\n");
+
+       task_name = MACH_PORT_NULL;
+       ret = sysctlbyname(g_sysctl_name, (void*) tmp_buf, &out_size, &task_name, sizeof(task_name));
+       T_EXPECT_EQ(ret, -1, "expected failure with task_name == MACH_PORT_NULL in new parameters\n");
+       T_EXPECT_EQ(errno, ESRCH, "expected ESRCH with invalid task port name\n");
+
+       /* probe with a NULL output buffer to learn how many bytes to allocate */
+       out_size = 0;
+       output_size = 0;
+       task_name = mach_task_self();
+       ret = sysctlbyname(g_sysctl_name, NULL, &out_size, &task_name, sizeof(task_name));
+       T_QUIET;
+       T_EXPECT_EQ(ret, 0, "failed getting the number of entries\n");
+       T_EXPECT_EQ(out_size, 2 * sizeof(vm_object_query_data_t) + sizeof(int64_t), "expected two entries\n");
+
+       /* calculate and allocate the proper sized output buffer */
+       output_size = out_size;
+       out_buffer = (vmobject_list_output_t)calloc(output_size, 1);
+       T_QUIET;
+       T_EXPECT_NE(out_buffer, NULL, "failed to allocate the output buffer for sysctlbyname\n");
+
+       /* get the truncated list for the current process */
+       memset(out_buffer, 0, output_size);
+       out_size = 1 * sizeof(vm_object_query_data_t) + sizeof(int64_t);
+       ret = sysctlbyname(g_sysctl_name, out_buffer, &out_size, &task_name, sizeof(task_name));
+
+       T_QUIET;
+       T_EXPECT_EQ(ret, 0, "sysctlbyname failed\n");
+       T_EXPECT_EQ(out_size, 1 * sizeof(vm_object_query_data_t) + sizeof(int64_t), "sysctl return size is incorrect\n");
+       T_EXPECT_EQ(out_buffer->entries, 1ULL, "should have 1 vm object\n");
+       T_EXPECT_NE(out_buffer->data[0].object_id, 0ULL, "vm_object_id should not be 0\n");
+
+       /* get the list for the current process */
+       out_size = output_size;
+       memset(out_buffer, 0, output_size);
+       ret = sysctlbyname(g_sysctl_name, out_buffer, &out_size, &task_name, sizeof(task_name));
+
+       T_QUIET;
+       T_EXPECT_EQ(ret, 0, "sysctlbyname failed\n");
+       T_EXPECT_EQ(out_size, 2 * sizeof(vm_object_query_data_t) + sizeof(int64_t), "sysctl return size is incorrect\n");
+       T_EXPECT_EQ(out_buffer->entries, 2ULL, "should have 2 vm objects\n");
+       T_EXPECT_NE(out_buffer->data[0].object_id, 0ULL, "vm_object_id should not be 0\n");
+
+       addr = tmp_buf;
+       addr_size = tmp_size;
+       nestingDepth = UINT_MAX;
+       count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
+       kr = mach_vm_region_recurse(__self, &addr, &addr_size, &nestingDepth, (vm_region_info_t)&regionInfo, &count);
+       T_QUIET;
+       T_EXPECT_EQ(kr, KERN_SUCCESS, "mach_vm_region_recurse(%zu) error 0x%x (%s)\n",
+           tmp_size, kr, mach_error_string(kr));
+       T_EXPECT_EQ(regionInfo.object_id_full, out_buffer->data[0].object_id, "object_id_full does not match out_buffer->data[0]\n");
+
+       addr = tmp_buf2;
+       addr_size = tmp_size;
+       nestingDepth = UINT_MAX;
+       count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
+       kr = mach_vm_region_recurse(__self, &addr, &addr_size, &nestingDepth, (vm_region_info_t)&regionInfo, &count);
+       T_QUIET;
+       T_EXPECT_EQ(kr, KERN_SUCCESS, "mach_vm_region_recurse(%zu) error 0x%x (%s)\n",
+           tmp_size, kr, mach_error_string(kr));
+       T_EXPECT_EQ(regionInfo.object_id_full, out_buffer->data[1].object_id, "object_id_full does not match out_buffer->data[1]\n");
+
+       kr = vm_deallocate(__self, tmp_buf, tmp_size);
+       T_QUIET;
+       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate(%zu) error 0x%x (%s)\n",
+           tmp_size, kr, mach_error_string(kr));
+
+       kr = vm_deallocate(__self, tmp_buf2, tmp_size);
+       T_QUIET;
+       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate(%zu) error 0x%x (%s)\n",
+           tmp_size, kr, mach_error_string(kr));
+
+       free(out_buffer);
+       out_buffer = NULL;
+}
+
+T_DECL(test_get_vmobject_list, "Get owned vm_objects for process")
+{
+       main_test();
+}
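
The test above leans on the sysctl's size-probe convention: called with a NULL
output buffer it reports the bytes required (a 64-bit entry count followed by
one vm_object_query_data_t per owned object), and a second call fills the
buffer, truncating to whatever size the caller offers. A minimal sketch of the
two-call pattern, assuming the same headers the test includes:

    mach_port_name_t task = mach_task_self();
    size_t size = 0;

    /* first call: NULL output, the kernel reports the required size */
    if (sysctlbyname("vm.get_owned_vmobjects", NULL, &size, &task, sizeof(task)) == 0 && size > 0) {
        vmobject_list_output_t list = calloc(size, 1);

        /* second call: fetch up to size bytes of entries */
        if (sysctlbyname("vm.get_owned_vmobjects", list, &size, &task, sizeof(task)) == 0) {
            for (uint64_t i = 0; i < list->entries; i++) {
                T_LOG("vm object id: 0x%llx", list->data[i].object_id);
            }
        }
        free(list);
    }
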
index 74ab31f23f5e63a6312cdb6b893b66e2b8e32df8..a40a5d569ed098174dec3f659a94c4222ba49240 100644 (file)
@@ -13,6 +13,8 @@
 #include <sys/sysctl.h>
 #include <unistd.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 /* *************************************************************************************
  * Test the task_info API.
  *
@@ -163,6 +165,42 @@ T_DECL(task_vm_info, "tests task vm info", T_META_ASROOT(true), T_META_LTEPHASE(
            "task_info --rev2 call returned value 0x%llx for vm_info.max_address. Expected anything other than 0x%llx since "
            "this value should be modified by rev2",
            vm_info.max_address, CANARY);
+
+       /*
+        * Test the REV4 version of TASK_VM_INFO.
+        */
+
+       count                         = TASK_VM_INFO_REV4_COUNT;
+       vm_info.phys_footprint        = TESTPHYSFOOTPRINTVAL;
+       vm_info.min_address           = CANARY;
+       vm_info.max_address           = CANARY;
+       vm_info.limit_bytes_remaining = CANARY;
+
+       err = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE, (task_info_t)&vm_info, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       T_EXPECT_EQ(count, TASK_VM_INFO_REV4_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV4_COUNT\n", count);
+
+       T_EXPECT_NE(vm_info.phys_footprint, (unsigned long long)TESTPHYSFOOTPRINTVAL,
+           "task_info --rev4 call returned value %llu for vm_info.phys_footprint.  Expected anything other than %u since this "
+           "value should be modified by rev4",
+           vm_info.phys_footprint, TESTPHYSFOOTPRINTVAL);
+
+       T_EXPECT_NE(vm_info.min_address, CANARY,
+           "task_info --rev4 call returned value 0x%llx for vm_info.min_address. Expected anything other than 0x%llx since "
+           "this value should be modified by rev4",
+           vm_info.min_address, CANARY);
+
+       T_EXPECT_NE(vm_info.max_address, CANARY,
+           "task_info --rev4 call returned value 0x%llx for vm_info.max_address. Expected anything other than 0x%llx since "
+           "this value should be modified by rev4",
+           vm_info.max_address, CANARY);
+
+       T_EXPECT_NE(vm_info.limit_bytes_remaining, CANARY,
+           "task_info --rev4 call returned value 0x%llx for vm_info.limit_bytes_remaining. Expected anything other than 0x%llx since "
+           "this value should be modified by rev4",
+           vm_info.limit_bytes_remaining, CANARY);
 }
 
 T_DECL(host_debug_info, "tests host debug info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
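
The rev2/rev4 checks work because TASK_VM_INFO is versioned by the count the
caller passes in: each TASK_VM_INFO_REV*_COUNT covers more trailing fields of
task_vm_info_data_t, and the kernel fills (and echoes back) only as many
fields as the requested count covers, so limit_bytes_remaining is populated
only when at least TASK_VM_INFO_REV4_COUNT is requested. A sketch of opting
in to a revision, reusing the symbols from the test above:

    task_vm_info_data_t vm_info;
    mach_msg_type_number_t count = TASK_VM_INFO_REV4_COUNT; /* request rev4 fields */

    kern_return_t kr = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE,
        (task_info_t)&vm_info, &count);
    if (kr == KERN_SUCCESS && count >= TASK_VM_INFO_REV4_COUNT) {
        /* only meaningful once the kernel confirms a rev4-sized reply */
        T_LOG("limit_bytes_remaining: 0x%llx", vm_info.limit_bytes_remaining);
    }
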
index f56e38da596cdad4c46e72a29b34410c0d40d84c..a5872b27d7ec49ce450d74f378d16f8c4c57f895 100644 (file)
@@ -12,6 +12,8 @@
 #include <sys/wait.h>
 #include <stdlib.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 static void
 do_child(int *pipefd)
 {
index 5b3dff783e6dfaf6f97e7e151f3d0863259ed2ee..b9fbe2ee7bb1b2e2e45a7ea2f31fd4adb8f4a76a 100644 (file)
@@ -13,7 +13,8 @@
 #include <sys/sysctl.h>
 #include <unistd.h>
 
-T_GLOBAL_META(T_META_NAMESPACE("xnu.ipc"));
+T_GLOBAL_META(T_META_NAMESPACE("xnu.ipc"),
+    T_META_RUN_CONCURRENTLY(true));
 
 /*
  * Attempt to inspect kernel_task using a task_inspect_t.  Interact with the
diff --git a/tests/task_vm_info_decompressions.c b/tests/task_vm_info_decompressions.c
new file mode 100644 (file)
index 0000000..281f967
--- /dev/null
@@ -0,0 +1,230 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <mach/task_info.h>
+#include <mach/mach.h>
+#include <unistd.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/kern_memorystatus.h>
+#include <sys/sysctl.h>
+#include <stdatomic.h>
+
+#include <darwintest.h>
+#include <TargetConditionals.h>
+
+#define KB 1024
+#define MALLOC_SIZE_PER_THREAD (64 * KB)
+#define freezer_path "/usr/local/bin/freeze"
+
+/* BridgeOS could spend more time execv freezer */
+#if TARGET_OS_BRIDGE
+static int timeout = 600;
+#else
+static int timeout = 120;
+#endif
+
+static _Atomic int thread_malloc_count = 0;
+static _Atomic int thread_thawed_count = 0;
+static _Atomic int phase = 0;
+
+struct thread_args {
+       int    id;
+};
+
+static void
+freeze_pid(pid_t pid)
+{
+       char pid_str[12];
+       char *args[3];
+       pid_t child_pid;
+       int status;
+
+       snprintf(pid_str, sizeof(pid_str), "%d", pid);
+       child_pid = fork();
+       if (child_pid == 0) {
+               /* Launch freezer */
+               args[0] = freezer_path;
+               args[1] = pid_str;
+               args[2] = NULL;
+               execv(freezer_path, args);
+               /* execv() does not return on success */
+               perror("execv");
+               T_FAIL("execv() failed");
+       }
+
+       /* Wait for freezer to complete */
+       T_LOG("Waiting for freezer %d to complete", child_pid);
+       while (0 == waitpid(child_pid, &status, WNOHANG)) {
+               if (timeout < 0) {
+                       kill(child_pid, SIGKILL);
+                       T_FAIL("Freezer took too long to freeze the test");
+               }
+               sleep(1);
+               timeout--;
+       }
+       if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+               T_FAIL("Freezer errored out");
+       }
+}
+
+static void *
+worker_thread_function(void *args)
+{
+       struct thread_args *targs = args;
+       int thread_id = targs->id;
+       char *array;
+
+       /* Allocate memory */
+       array = malloc(MALLOC_SIZE_PER_THREAD);
+       T_EXPECT_NOTNULL(array, "thread %d allocated heap memory to be dirtied", thread_id);
+
+       /* Waiting for phase 1 (touch pages) to start */
+       while (atomic_load(&phase) != 1) {
+               ;
+       }
+
+       /* Phase 1: touch pages */
+       T_LOG("thread %d phase 1: dirtying %d heap pages (%d bytes)", thread_id, MALLOC_SIZE_PER_THREAD / (int)PAGE_SIZE, MALLOC_SIZE_PER_THREAD);
+       memset(&array[0], 1, MALLOC_SIZE_PER_THREAD);
+       atomic_fetch_add(&thread_malloc_count, 1);
+
+       /* Wait for process to be frozen */
+       while (atomic_load(&phase) != 2) {
+               ;
+       }
+
+       /* Phase 2, process thawed, trigger decompressions by re-faulting pages */
+       T_LOG("thread %d phase 2: faulting pages back in to trigger decompressions", thread_id);
+       memset(&array[0], 1, MALLOC_SIZE_PER_THREAD);
+
+       /* Main thread will retrieve vm statistics once all threads are thawed */
+       atomic_fetch_add(&thread_thawed_count, 1);
+
+       free(array);
+
+#if 0 /* Test if the thread's decompressions counter was added to the task decompressions counter when a thread terminates */
+       if (thread_id < 2) {
+               sleep(10);
+       }
+#endif
+
+       return NULL;
+}
+
+static pthread_t*
+create_threads(int nthreads, pthread_t *threads, struct thread_args *targs)
+{
+       int i;
+       int err;
+       pthread_attr_t attr;
+
+       err = pthread_attr_init(&attr);
+       T_ASSERT_POSIX_ZERO(err, "pthread_attr_init");
+       for (i = 0; i < nthreads; i++) {
+               targs[i].id = i;
+               err = pthread_create(&threads[i], &attr, worker_thread_function, (void*)&targs[i]);
+               T_QUIET; T_ASSERT_POSIX_ZERO(err, "pthread_create");
+       }
+
+       return threads;
+}
+
+static void
+join_threads(int nthreads, pthread_t *threads)
+{
+       int i;
+       int err;
+
+       for (i = 0; i < nthreads; i++) {
+               err = pthread_join(threads[i], NULL);
+               T_QUIET; T_ASSERT_POSIX_ZERO(err, "pthread_join");
+       }
+}
+
+T_DECL(task_vm_info_decompressions,
+    "Test multithreaded per-task decompressions counter")
+{
+       int     err;
+       int     ncpu;
+       size_t  ncpu_size = sizeof(ncpu);
+       int     npages;
+       int     compressor_mode;
+       size_t  compressor_mode_size = sizeof(compressor_mode);
+       task_vm_info_data_t vm_info;
+       mach_msg_type_number_t count;
+       pthread_t *threads;
+       struct thread_args *targs;
+
+       T_SETUPBEGIN;
+
+       /* Make sure freezer is enabled on target machine */
+       err = sysctlbyname("vm.compressor_mode", &compressor_mode, &compressor_mode_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "sysctlbyname vm.compressor_mode");
+       if (compressor_mode < 8) {
+               T_SKIP("This test requires freezer which is not available on the testing platform (vm.compressor_mode is set to %d)", compressor_mode);
+       }
+#if TARGET_OS_BRIDGE
+       T_SKIP("This test requires freezer which is not available on bridgeOS (vm.compressor_mode is set to %d)", compressor_mode);
+#endif
+
+       /* Set number of threads to ncpu available on testing device */
+       err = sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0);
+       T_EXPECT_EQ_INT(0, err, "Detected %d cpus\n", ncpu);
+
+       /* Set total number of pages to be frozen */
+       npages = ncpu * MALLOC_SIZE_PER_THREAD / (int)PAGE_SIZE;
+       T_LOG("Test will be freezing at least %d heap pages\n", npages);
+
+       /* Change state to freezable */
+       err = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), (uint32_t)1, NULL, 0);
+       T_EXPECT_EQ(KERN_SUCCESS, err, "set pid %d to be freezable", getpid());
+
+       /* Call into kernel to retrieve vm_info and make sure we do not have any decompressions before the test */
+       count = TASK_VM_INFO_COUNT;
+       err = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count);
+       T_EXPECT_EQ(count, TASK_VM_INFO_COUNT, "count == TASK_VM_INFO_COUNT: %d", count);
+       T_EXPECT_EQ_INT(0, err, "task_info(TASK_VM_INFO) returned 0");
+       T_EXPECT_EQ_INT(0, vm_info.decompressions, "Expected 0 decompressions before test starts");
+
+       /* Thread data */
+       threads = malloc(sizeof(pthread_t) * (size_t)ncpu);
+       targs = malloc(sizeof(struct thread_args) * (size_t)ncpu);
+
+       T_SETUPEND;
+
+       /* Phase 1: create threads to write to malloc memory */
+       create_threads(ncpu, threads, targs);
+       atomic_fetch_add(&phase, 1);
+
+       /* Wait for all threads to dirty their malloc pages */
+       while (atomic_load(&thread_malloc_count) != ncpu) {
+               sleep(1);
+       }
+       T_EXPECT_EQ(ncpu, atomic_load(&thread_malloc_count), "%d threads finished writing to malloc pages\n", ncpu);
+
+       /* Launch freezer to compress the dirty pages */
+       T_LOG("Running freezer to compress pages for pid %d", getpid());
+       freeze_pid(getpid());
+
+       /* Phase 2: trigger decompression in threads */
+       atomic_fetch_add(&phase, 1);
+
+       /* Wait for all threads to decompress their malloc pages */
+       while (atomic_load(&thread_thawed_count) != ncpu) {
+               sleep(1);
+       }
+
+       /* Phase 3: Call into kernel to retrieve vm_info and to get the updated decompressions counter */
+       count = TASK_VM_INFO_COUNT;
+       err = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count);
+       T_EXPECT_EQ(count, TASK_VM_INFO_COUNT, "count == TASK_VM_INFO_COUNT: %d", count);
+       T_EXPECT_EQ(0, err, "task_info(TASK_VM_INFO) returned 0");
+
+       /* Make sure this task has decompressed at least all of the dirtied memory */
+       T_EXPECT_GE_INT(vm_info.decompressions, npages, "decompressed %d pages (>= heap pages: %d)", vm_info.decompressions, npages);
+       T_PASS("Correctly retrieve per-task decompressions stats");
+
+       /* Cleanup */
+       join_threads(ncpu, threads);
+       free(threads);
+       free(targs);
+}
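
The T_SETUPBEGIN block above encodes the freezer's preconditions, which apply
beyond this test: vm.compressor_mode must be at least 8 (a freezer-backed
compressor configuration), and the target process has to opt in through
memorystatus_control before /usr/local/bin/freeze can compress its dirty
pages. A condensed sketch under the same assumptions as the test:

    int mode = 0;
    size_t len = sizeof(mode);

    /* the freezer requires compressor mode >= 8 */
    if (sysctlbyname("vm.compressor_mode", &mode, &len, NULL, 0) != 0 || mode < 8) {
        T_SKIP("no freezer on this configuration");
    }

    /* opt this process in so the freezer tool can act on its pid */
    int err = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE,
        getpid(), 1, NULL, 0);
    T_EXPECT_EQ(KERN_SUCCESS, err, "set pid %d to be freezable", getpid());
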
index 810dcf2d8ed4f952e6d803d1f16bd13b3e3159ad..abf66285b60955b0331f7963f484ff5f75bce9fb 100644 (file)
@@ -7,6 +7,7 @@
 #include <kern/debug.h>
 #include <notify.h>
 #include <sys/kdebug.h>
+#include <sys/sysctl.h>
 #include <TargetConditionals.h>
 
 enum telemetry_pmi {
@@ -23,6 +24,26 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging.telemetry"),
 extern int __telemetry(uint64_t cmd, uint64_t deadline, uint64_t interval,
     uint64_t leeway, uint64_t arg4, uint64_t arg5);
 
+/*
+ * Microstackshots based on PMI are only supported on devices with monotonic
+ * support.
+ */
+
+static void
+skip_if_pmi_unsupported(void)
+{
+       int supported = 0;
+       int ret = sysctlbyname("kern.monotonic.supported", &supported,
+           &(size_t){ sizeof(supported), }, NULL, 0);
+       if (ret < 0) {
+               T_SKIP("monotonic sysctl generated an error: %d (%s)", errno,
+                   strerror(errno));
+       }
+       if (!supported) {
+               T_SKIP("monotonic must be supported for microstackshots");
+       }
+}
+
 /*
  * Data Analytics (da) also has a microstackshot configuration -- set a PMI
  * cycle interval of 0 to force it to disable microstackshot on PMI.
@@ -50,7 +71,11 @@ disable_da_microstackshots(void)
        CFNumberRef num = CFNumberCreate(NULL, kCFNumberSInt64Type, &zero);
        set_da_microstackshot_period(num);
        T_LOG("notified da of tasking change, sleeping");
+#if TARGET_OS_WATCH
+       sleep(8);
+#else /* TARGET_OS_WATCH */
        sleep(3);
+#endif /* !TARGET_OS_WATCH */
 }
 
 /*
@@ -68,8 +93,7 @@ reenable_da_microstackshots(void)
 static void
 telemetry_cleanup(void)
 {
-       int ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_NONE, 0, 0, 0, 0);
-       T_EXPECT_POSIX_SUCCESS(ret, "telemetry(... NONE ...)");
+       (void)__telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_NONE, 0, 0, 0, 0);
        reenable_da_microstackshots();
 }
 
@@ -107,9 +131,7 @@ thread_spin(__unused void *arg)
 
 T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI")
 {
-#if TARGET_OS_WATCH
-       T_SKIP("unsupported platform");
-#endif /* TARGET_OS_WATCH */
+       skip_if_pmi_unsupported();
 
        T_SETUPBEGIN;
        ktrace_session_t s = ktrace_session_create();
@@ -122,6 +144,7 @@ T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI")
        __block int interrupt_records = 0;
        __block int timer_arm_records = 0;
        __block int unknown_records = 0;
+       __block int empty_records = 0;
 
        ktrace_events_single(s, MT_MICROSTACKSHOT, ^(__unused struct trace_point *tp) {
                pmi_events++;
@@ -141,6 +164,14 @@ T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI")
                        timer_arm_records++;
                }
 
+               if (start->arg2 == end->arg2) {
+                       /*
+                        * The buffer didn't grow for this record -- there was
+                        * an error.
+                        */
+                       empty_records++;
+               }
+
                const uint8_t any_record = kPMIRecord | kIORecord | kInterruptRecord |
                kTimerArmingRecord;
                if ((start->arg1 & any_record) == 0) {
@@ -158,8 +189,11 @@ T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI")
                pmi_records / (double)SLEEP_SECS);
                T_EXPECT_EQ(unknown_records, 0, "saw zero unknown record events");
                T_EXPECT_GT(microstackshot_record_events, 0,
-               "saw non-zero microstackshot record events (%g/sec)",
+               "saw non-zero microstackshot record events (%d -- %g/sec)",
+               microstackshot_record_events,
                microstackshot_record_events / (double)SLEEP_SECS);
+               T_EXPECT_NE(empty_records, microstackshot_record_events,
+               "saw non-empty records (%d empty)", empty_records);
 
                if (interrupt_records > 0) {
                        T_LOG("saw %g interrupt records per second",
@@ -216,6 +250,8 @@ T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI")
 T_DECL(error_handling,
     "ensure that error conditions for the telemetry syscall are observed")
 {
+       skip_if_pmi_unsupported();
+
        telemetry_init();
 
        int ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_INSTRS,
index e715b428d65180d8001cd32d69ea5941197b6bb7..72a34ad8e524417fe1d37fdd3c3b244b27f8b7c8 100644 (file)
@@ -9,6 +9,8 @@
 
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 static int nthreads = 0;
 static int fd;
 static _Atomic int phase = 0;
index 1c7eb3f6ca56ac4be1cddbeecff9edc1a6323641..507219204331e11064abb6c1a2d88897643c6650 100644 (file)
@@ -2,6 +2,8 @@
 #include <ktrace.h>
 #include <sys/kdebug.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define TEST_EVENTID (0xfedcbb00)
 
 static void*
diff --git a/tests/time.c b/tests/time.c
new file mode 100644 (file)
index 0000000..178b4c6
--- /dev/null
@@ -0,0 +1,99 @@
+#include <darwintest.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/syslimits.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+T_GLOBAL_META(T_META_CHECK_LEAKS(false));
+
+T_DECL(settimeofday, "check setting and getting time of day",
+    T_META_ASROOT(true))
+{
+       struct timeval origtime = {};
+       struct timezone origtz = {};
+       int ret = gettimeofday(&origtime, &origtz);
+       T_ASSERT_POSIX_SUCCESS(ret, "get current time with gettimeofday(2)");
+
+#if TARGET_OS_BRIDGE
+       /*
+        * bridgeOS is not allowed to set the time -- only the macOS side can.
+        */
+       T_SKIP("bridgeOS is not allowed to call settimeofday(2)");
+#endif /* TARGET_OS_BRIDGE */
+
+       struct timeval newtime = {};
+       newtime = origtime;
+       newtime.tv_sec -= 60;
+       ret = settimeofday(&newtime, NULL);
+       T_ASSERT_POSIX_SUCCESS(ret,
+           "set time back 60 seconds with settimeofday(2)");
+
+       ret = gettimeofday(&newtime, NULL);
+       T_ASSERT_POSIX_SUCCESS(ret, "get new time with gettimeofday(2)");
+
+       T_ASSERT_GT(origtime.tv_sec, newtime.tv_sec,
+           "new time should be before original time");
+
+       newtime = origtime;
+       newtime.tv_sec += 1;
+       ret = settimeofday(&newtime, NULL);
+       T_ASSERT_POSIX_SUCCESS(ret,
+           "set time close to original value with settimeofday(2)");
+}
+
+static char tmppath[PATH_MAX] = "";
+
+static void
+cleanup_tmpfile(void)
+{
+       if (tmppath[0] != '\0') {
+               unlink(tmppath);
+       }
+}
+
+static int
+create_tmpfile(void)
+{
+       const char *tmpdir = getenv("TMPDIR");
+       strlcat(tmppath, tmpdir ? tmpdir : "/tmp", sizeof(tmppath));
+       strlcat(tmppath, "/xnu_quick_test.XXXXX", sizeof(tmppath));
+       int fd = mkstemp(tmppath);
+       T_ASSERT_POSIX_SUCCESS(fd, "created temporary file at %s", tmppath);
+       T_ATEND(cleanup_tmpfile);
+       return fd;
+}
+
+T_DECL(futimes, "check that futimes updates file times",
+    T_META_RUN_CONCURRENTLY(true))
+{
+       int tmpfd = create_tmpfile();
+
+       struct stat stbuf = {};
+       int ret = fstat(tmpfd, &stbuf);
+       T_ASSERT_POSIX_SUCCESS(ret, "get file metadata with fstat(2)");
+       struct timeval amtimes[2] = {};
+       TIMESPEC_TO_TIMEVAL(&amtimes[0], &stbuf.st_atimespec);
+       TIMESPEC_TO_TIMEVAL(&amtimes[1], &stbuf.st_mtimespec);
+
+       amtimes[0].tv_sec -= 120;
+       amtimes[1].tv_sec -= 120;
+
+       ret = futimes(tmpfd, amtimes);
+       T_ASSERT_POSIX_SUCCESS(ret, "update file times with futimes(2)");
+
+       ret = fstat(tmpfd, &stbuf);
+       T_ASSERT_POSIX_SUCCESS(ret, "get file metadata after update with fstat(2)");
+       struct timeval newamtimes[2] = {};
+       TIMESPEC_TO_TIMEVAL(&newamtimes[0], &stbuf.st_atimespec);
+       TIMESPEC_TO_TIMEVAL(&newamtimes[1], &stbuf.st_mtimespec);
+
+       /*
+        * Reading the metadata shouldn't count as an access.
+        */
+       T_ASSERT_EQ(amtimes[0].tv_sec, newamtimes[0].tv_sec,
+           "access time matches what was set");
+       T_ASSERT_EQ(amtimes[1].tv_sec, newamtimes[1].tv_sec,
+           "modification time matches what was set");
+}
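
The futimes test depends on TIMESPEC_TO_TIMEVAL to bridge the nanosecond
timestamps in struct stat to the microsecond struct timeval that futimes(2)
accepts; in <sys/time.h> the conversion is roughly:

    #define TIMESPEC_TO_TIMEVAL(tv, ts) {        \
        (tv)->tv_sec = (ts)->tv_sec;             \
        (tv)->tv_usec = (ts)->tv_nsec / 1000;    \
    }

That truncation from nanoseconds to microseconds is one reason the assertions
above compare only the tv_sec fields.
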
index 95dfadeb099e33564d851e539ad9b53e814af888..65fd2db0711ff7d61b9a35833264ea423248f067 100644 (file)
@@ -34,6 +34,12 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.turnstile_multihop"));
 
 #define HELPER_TIMEOUT_SECS (3000)
 
+struct test_msg {
+       mach_msg_header_t header;
+       mach_msg_body_t body;
+       mach_msg_port_descriptor_t port_descriptor;
+};
+
 static boolean_t spin_for_ever = false;
 
 static void
@@ -220,46 +226,106 @@ get_user_promotion_basepri(void)
        return thread_policy.thps_user_promotion_basepri;
 }
 
-static int messages_received = 0;
+#define LISTENER_WLID  0x100
+#define CONN_WLID      0x200
+
+static uint32_t
+register_port_options(void)
+{
+       return MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
+              MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
+              MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) |
+              MACH_RCV_VOUCHER;
+}
+
+static void
+register_port(uint64_t wlid, mach_port_t port)
+{
+       int r;
+
+       struct kevent_qos_s kev = {
+               .ident  = port,
+               .filter = EVFILT_MACHPORT,
+               .flags  = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
+               .fflags = register_port_options(),
+               .data   = 1,
+               .qos    = (int32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0)
+       };
+
+       struct kevent_qos_s kev_err = { 0 };
+
+       /* Setup workloop for mach msg rcv */
+       r = kevent_id(wlid, &kev, 1, &kev_err, 1, NULL,
+           NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS);
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id");
+       T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id");
+}
+
 /*
  * Basic WL handler callback, it checks the
  * effective Qos of the servicer thread.
  */
 static void
-workloop_cb_test_intransit(uint64_t *workloop_id __unused, void **eventslist __unused, int *events)
+workloop_cb_test_intransit(uint64_t *workloop_id, void **eventslist, int *events)
 {
-       messages_received++;
-       T_LOG("Workloop handler workloop_cb_test_intransit called. Received message no %d",
-           messages_received);
+       static bool got_peer;
+
+       struct kevent_qos_s *kev = eventslist[0];
+       mach_msg_header_t *hdr;
+       struct test_msg *tmsg;
 
+       T_LOG("Workloop handler %s called. Received message on 0x%llx",
+           __func__, *workloop_id);
 
        /* Skip the test if we can't check Qos */
        if (geteuid() != 0) {
                T_SKIP("kevent_qos test requires root privileges to run.");
        }
 
-       if (messages_received == 1) {
-               sleep(5);
-               T_LOG("Do some CPU work.");
-               do_work(5000);
+       T_QUIET; T_ASSERT_EQ(*events, 1, "should have one event");
+
+       hdr = (mach_msg_header_t *)kev->ext[0];
+       T_ASSERT_NOTNULL(hdr, "has a message");
+       T_ASSERT_EQ(hdr->msgh_size, (uint32_t)sizeof(struct test_msg), "of the right size");
+       tmsg = (struct test_msg *)hdr;
+
+       switch (*workloop_id) {
+       case LISTENER_WLID:
+               T_LOG("Registering peer connection");
+               T_QUIET; T_ASSERT_FALSE(got_peer, "Should not have seen peer yet");
+               got_peer = true;
+               break;
+
+       case CONN_WLID:
+               T_LOG("Received message on peer");
+               break;
+
+       default:
+               T_FAIL("unexpected workloop id 0x%llx", *workloop_id);
+       }
+
+       sleep(5);
+       T_LOG("Do some CPU work.");
+       do_work(5000);
 
-               /* Check if the override now is IN + 60 boost */
-               T_EXPECT_EFFECTIVE_QOS_EQ(QOS_CLASS_USER_INITIATED,
-                   "dispatch_source event handler QoS should be QOS_CLASS_USER_INITIATED");
-               T_EXPECT_EQ(get_user_promotion_basepri(), 60u,
-                   "dispatch_source event handler should be overridden at 60");
+       /* Check if the override now is IN + 60 boost */
+       T_EXPECT_EFFECTIVE_QOS_EQ(QOS_CLASS_USER_INITIATED,
+           "dispatch_source event handler QoS should be QOS_CLASS_USER_INITIATED");
+       T_EXPECT_EQ(get_user_promotion_basepri(), 60u,
+           "dispatch_source event handler should be overridden at 60");
+
+       if (*workloop_id == LISTENER_WLID) {
+               register_port(CONN_WLID, tmsg->port_descriptor.name);
 
-               /* Enable the knote to get 2nd message */
-               struct kevent_qos_s *kev = *eventslist;
                kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED;
-               kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
-                   MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
-                   MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) |
-                   MACH_RCV_VOUCHER);
+               kev->fflags = register_port_options();
+               kev->ext[0] = kev->ext[1] = kev->ext[2] = kev->ext[3] = 0;
                *events = 1;
        } else {
+               /* this will unblock the waiter */
+               mach_msg_destroy(hdr);
                *events = 0;
-               exit(0);
        }
 }
 
@@ -331,11 +397,7 @@ send(
 {
        kern_return_t ret = 0;
 
-       struct {
-               mach_msg_header_t header;
-               mach_msg_body_t body;
-               mach_msg_port_descriptor_t port_descriptor;
-       } send_msg = {
+       struct test_msg send_msg = {
                .header = {
                        .msgh_remote_port = send_port,
                        .msgh_local_port  = reply_port,
@@ -598,7 +660,7 @@ thread_at_sixty(void *arg __unused)
 
        T_QUIET; T_LOG("The time for priority 60 thread to acquire lock was %llu \n",
            (after_lock_time - before_lock_time));
-       exit(0);
+       T_END;
 }
 
 static void *
@@ -669,35 +731,44 @@ thread_at_default(void *arg __unused)
 static void *
 thread_at_maintenance(void *arg __unused)
 {
-       mach_port_t qos_send_port;
+       mach_port_t service_port;
+       mach_port_t conn_port;
        mach_port_t special_reply_port;
+       mach_port_options_t opts = {
+               .flags = MPO_INSERT_SEND_RIGHT,
+       };
 
        main_thread_port = mach_thread_self();
 
        set_thread_name(__FUNCTION__);
 
        kern_return_t kr = bootstrap_look_up(bootstrap_port,
-           TURNSTILE_MULTIHOP_SERVICE_NAME, &qos_send_port);
+           TURNSTILE_MULTIHOP_SERVICE_NAME, &service_port);
        T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
 
+       kr = mach_port_construct(mach_task_self(), &opts, 0ull, &conn_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct");
+
        special_reply_port = thread_get_special_reply_port();
        T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port");
 
        /* Become the dispatch sync owner, dispatch_sync_owner will be set in dispatch_sync_wait function */
 
-       /* Send an async message */
-       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
+       /* Send a sync message */
+       send(conn_port, special_reply_port, MACH_PORT_NULL,
            (uint32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0), 0);
 
-       /* Send a sync message */
-       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
+       /* Send an async checkin message */
+       send(service_port, MACH_PORT_NULL, conn_port,
            (uint32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0), 0);
 
        /* Create a new thread at QOS_CLASS_DEFAULT qos */
        thread_create_at_qos(QOS_CLASS_DEFAULT, thread_at_default);
 
        /* Block on Sync IPC */
-       receive(special_reply_port, qos_send_port);
+       receive(special_reply_port, service_port);
+
+       T_LOG("received reply");
 
        dispatch_sync_cancel(def_thread_port, QOS_CLASS_DEFAULT);
        return NULL;
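
Worth spelling out, since the two send() calls were reordered above: the client now queues the sync message on the new connection port first, while the server cannot yet see conn_port, and only then checks in asynchronously, handing the server a send right to conn_port. Read together with workloop_cb_test_intransit() earlier in this diff, the flow is (a reading of the change, not additional committed code):

    /*
     * client                            server (workloop_cb_test_intransit)
     * ------                            -----------------------------------
     * send(conn_port, special_reply)    conn_port not registered; message queues
     * send(service_port, 0, conn_port)  LISTENER_WLID handler:
     *                                     register_port(CONN_WLID,
     *                                         tmsg->port_descriptor.name)
     * receive(special_reply, service)   CONN_WLID handler fires for the queued
     *   ... blocks on sync IPC ...        message; mach_msg_destroy(hdr) drops
     *                                     the reply right, unblocking the waiter
     */
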
@@ -706,19 +777,8 @@ thread_at_maintenance(void *arg __unused)
 T_HELPER_DECL(three_ulock_sync_ipc_hop,
     "Create chain of 4 threads with 3 ulocks and 1 sync IPC at different qos")
 {
-       dt_stat_time_t roundtrip_stat = dt_stat_time_create("multihop_lock_acquire");
-
-       T_STAT_MEASURE_LOOP(roundtrip_stat) {
-               if (fork() == 0) {
-                       thread_create_at_qos(QOS_CLASS_MAINTENANCE, thread_at_maintenance);
-                       sigsuspend(0);
-                       exit(0);
-               }
-               wait(NULL);
-       }
-
-       dt_stat_finalize(roundtrip_stat);
-       T_END;
+       thread_create_at_qos(QOS_CLASS_MAINTENANCE, thread_at_maintenance);
+       sigsuspend(0);
 }
 
 static void
@@ -744,41 +804,14 @@ thread_create_at_qos(qos_class_t qos, void * (*function)(void *))
 
 #pragma mark Mach receive - kevent_qos
 
-static void
-expect_kevent_id_recv(mach_port_t port)
+T_HELPER_DECL(server_kevent_id,
+    "Reply with the QoS that a dispatch source event handler ran with")
 {
-       int r;
-
        T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
                    worker_cb, event_cb,
                    (pthread_workqueue_function_workloop_t)workloop_cb_test_intransit, 0, 0), NULL);
 
-       struct kevent_qos_s kev[] = {{
-                                            .ident = port,
-                                            .filter = EVFILT_MACHPORT,
-                                            .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
-                                            .fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
-           MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
-           MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) |
-           MACH_RCV_VOUCHER),
-                                            .data = 1,
-                                            .qos = (int32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0)
-                                    }};
-
-       struct kevent_qos_s kev_err[] = {{ 0 }};
-
-       /* Setup workloop for mach msg rcv */
-       r = kevent_id(25, kev, 1, kev_err, 1, NULL,
-           NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS);
-
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id");
-       T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id");
-}
-
-T_HELPER_DECL(server_kevent_id,
-    "Reply with the QoS that a dispatch source event handler ran with")
-{
-       expect_kevent_id_recv(get_server_port());
+       register_port(LISTENER_WLID, get_server_port());
        sigsuspend(0);
        T_ASSERT_FAIL("should receive a message");
 }
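
register_port() and register_port_options(), called here and in the workloop handler earlier, are defined in a part of the file this excerpt does not show. A plausible minimal sketch, reconstructed from the deleted expect_kevent_id_recv() above (LISTENER_WLID/CONN_WLID stand in for the hard-coded workloop id 25; illustrative, not the committed helpers):

    static uint32_t
    register_port_options(void)
    {
            /* the receive options the deleted code passed in kev.fflags */
            return MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
                MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
                MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) |
                MACH_RCV_VOUCHER;
    }

    static void
    register_port(uint64_t wlid, mach_port_t port)
    {
            struct kevent_qos_s kev = {
                    .ident  = port,
                    .filter = EVFILT_MACHPORT,
                    .flags  = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
                    .fflags = register_port_options(),
                    .data   = 1,
                    .qos    = (int32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0),
            };
            struct kevent_qos_s kev_err = { 0 };

            /* attach the Mach-port knote to the workloop identified by wlid */
            int r = kevent_id(wlid, &kev, 1, &kev_err, 1, NULL, NULL,
                KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS);
            T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id");
    }
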
index 28b5becd86b5b4a3c6b3c3ddc9a82da181e3a515..8ba659d02eaab7d7f07b07fc5acb44131d8db85c 100644 (file)
@@ -168,8 +168,7 @@ ull_unlock(lock_t *lock, int id, uint opcode, uint flags)
 
        if (prev == (ULL_WAITERS | ull_locked)) {
                /* locked with waiters */
-               *lock = 0;
-               __c11_atomic_thread_fence(__ATOMIC_ACQ_REL);
+               __c11_atomic_store(lock, 0, __ATOMIC_SEQ_CST);
 
                if ((flags & ULF_WAKE_THREAD) && (_os_get_self() == main_thread_name)) {
                        flags &= ~(uint)ULF_WAKE_THREAD;
index 34b9667f31c416d0517abc15b5a470ffa7d10532..64636f539a8b7c67f7253a9111a113a39e60d9ff 100644 (file)
 #include <sys/sysctl.h>
 #include <sys/types.h>
 
-#define SYSCTL_TURNSTILE_TEST_DEFAULT                   1
-#define SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE          2
-
+#define SYSCTL_TURNSTILE_TEST_USER_DEFAULT            1
+#define SYSCTL_TURNSTILE_TEST_USER_HASHTABLE          2
+#define SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT          3
+#define SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE        4
 
 T_GLOBAL_META(T_META_NAMESPACE("xnu.turnstiles_test"));
 
@@ -48,7 +49,7 @@ thread_create_at_qos(qos_class_t qos, void * (*function)(void *), int type)
 }
 
 static int
-get_pri(thread_t thread_port)
+get_sched_pri(thread_t thread_port)
 {
        kern_return_t kr;
 
@@ -61,6 +62,20 @@ get_pri(thread_t thread_port)
        return extended_info.pth_curpri;
 }
 
+static int
+get_base_pri(thread_t thread_port)
+{
+       kern_return_t kr;
+
+       thread_extended_info_data_t extended_info;
+       mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT;
+       kr = thread_info(thread_port, THREAD_EXTENDED_INFO,
+           (thread_info_t)&extended_info, &count);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info");
+       return extended_info.pth_priority;
+}
+
 static void
 turnstile_prim_lock(int type)
 {
@@ -68,7 +83,7 @@ turnstile_prim_lock(int type)
        uint64_t tid;
        int in_val = type;
        pthread_threadid_np(NULL, &tid);
-       T_LOG("sysctlbyname lock called from thread %llu \n", tid);
+       T_LOG("sysctlbyname lock type %d called from thread %llu \n", type, tid);
        ret = sysctlbyname("kern.turnstiles_test_lock", NULL, 0, &in_val, sizeof(in_val));
        T_LOG("sysctlbyname lock returned from thread %llu with value %d \n", tid, ret);
 }
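
Both helpers drive a test lock inside the kernel: the lock type goes in sysctlbyname()'s new-value buffer and there is no out buffer; the handler for kern.turnstiles_test_lock blocks until the turnstile-backed lock is acquired, which is what lets a contending thread push priority onto the current holder. Condensed usage (mirroring the helpers above and below):

    int type = SYSCTL_TURNSTILE_TEST_USER_DEFAULT;

    /* blocks in the kernel until the test lock is acquired */
    sysctlbyname("kern.turnstiles_test_lock", NULL, 0, &type, sizeof(type));
    /* ... hold the lock; contenders' pushes land on this thread ... */
    sysctlbyname("kern.turnstiles_test_unlock", NULL, 0, &type, sizeof(type));
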
@@ -80,15 +95,84 @@ turnstile_prim_unlock(int type)
        uint64_t tid;
        int in_val = type;
        pthread_threadid_np(NULL, &tid);
-       T_LOG("sysctlbyname unlock called from thread %llu \n", tid);
+       T_LOG("sysctlbyname unlock type %d called from thread %llu \n", type, tid);
        ret = sysctlbyname("kern.turnstiles_test_unlock", NULL, 0, &in_val, sizeof(in_val));
        T_LOG("sysctlbyname unlock returned from thread %llu with value %d \n", tid, ret);
 }
 
+struct thread_data {
+       int pri_to_set;
+       int lock1;
+       int lock2;
+       unsigned int sleep;
+       int sched_pri_to_check;
+       int base_pri_to_check;
+};
+
+static void *
+chain_locking(void* args)
+{
+       struct thread_data* data = (struct thread_data*) args;
+       int policy, pri;
+       int ret;
+       struct sched_param param;
+
+       /* Change our priority to pri_to_set */
+       ret = pthread_getschedparam(pthread_self(), &policy, &param);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_getschedparam");
+
+       param.sched_priority = data->pri_to_set;
+
+       /* this sets both sched and base pri */
+       ret = pthread_setschedparam(pthread_self(), policy, &param);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_setschedparam");
+
+       pri = get_sched_pri(mach_thread_self());
+
+       T_ASSERT_EQ(pri, data->pri_to_set, "Priority before holding locks");
+
+       /* take lock1 */
+       if (data->lock1) {
+               turnstile_prim_lock(data->lock1);
+       }
+
+       /* take lock2 */
+       if (data->lock2) {
+               turnstile_prim_lock(data->lock2);
+       }
+
+       if (data->sleep) {
+               sleep(data->sleep);
+       }
+
+       if (data->sched_pri_to_check) {
+               pri = get_sched_pri(mach_thread_self());
+               T_ASSERT_EQ(pri, data->sched_pri_to_check, "Sched priority while holding locks");
+       }
+
+       if (data->base_pri_to_check) {
+               pri = get_base_pri(mach_thread_self());
+               T_ASSERT_EQ(pri, data->base_pri_to_check, "Base priority while holding locks");
+       }
+
+       if (data->lock2) {
+               turnstile_prim_unlock(data->lock2);
+       }
+
+       if (data->lock1) {
+               turnstile_prim_unlock(data->lock1);
+       }
+
+       pri = get_sched_pri(mach_thread_self());
+       T_ASSERT_EQ(pri, data->pri_to_set, "Priority after releasing locks");
+
+       return NULL;
+}
+
 static void *
 take_lock_check_priority(void * arg)
 {
-       int old_pri = get_pri(mach_thread_self());
+       int old_pri = get_base_pri(mach_thread_self());
        int unboosted_pri;
        int boosted_pri;
        int after_unlock_pri;
@@ -102,20 +186,20 @@ take_lock_check_priority(void * arg)
        /* Take the test lock */
        turnstile_prim_lock(type);
 
-       unboosted_pri =  get_pri(mach_thread_self());
+       unboosted_pri = get_base_pri(mach_thread_self());
        T_ASSERT_EQ(unboosted_pri, 37, "thread(%llu) priority after acquiring the lock (uncontended) is %d\n", tid, unboosted_pri);
 
        sleep(8);
 
        /* Check for elevated priority */
-       boosted_pri =  get_pri(mach_thread_self());
+       boosted_pri =  get_base_pri(mach_thread_self());
        T_ASSERT_EQ(boosted_pri, 47, "thread(%llu) priority after contention by 47 thread is %d\n", tid, boosted_pri);
 
        /* Drop the lock */
        turnstile_prim_unlock(type);
 
        /* Check for regular priority */
-       after_unlock_pri =  get_pri(mach_thread_self());
+       after_unlock_pri =  get_base_pri(mach_thread_self());
        T_ASSERT_EQ(after_unlock_pri, 37, "thread(%llu) priority after dropping lock is %d\n", tid, after_unlock_pri);
 
        return NULL;
@@ -130,7 +214,7 @@ try_to_take_lock_and_unlock(void *arg)
        pthread_threadid_np(NULL, &tid);
        sleep(4);
 
-       int old_pri = get_pri(mach_thread_self());
+       int old_pri = get_base_pri(mach_thread_self());
        T_ASSERT_EQ(old_pri, 47, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri);
 
        /* Try taking the test lock */
@@ -143,7 +227,7 @@ try_to_take_lock_and_unlock(void *arg)
 static void *
 take_lock_and_exit(void * arg)
 {
-       int old_pri = get_pri(mach_thread_self());
+       int old_pri = get_base_pri(mach_thread_self());
        int unboosted_pri;
        int boosted_pri;
        uint64_t tid;
@@ -156,13 +240,13 @@ take_lock_and_exit(void * arg)
        /* Take the test lock */
        turnstile_prim_lock(type);
 
-       unboosted_pri =  get_pri(mach_thread_self());
+       unboosted_pri =  get_base_pri(mach_thread_self());
        T_ASSERT_EQ(unboosted_pri, 37, "thread(%llu) priority after acquiring the lock (uncontended) is %d\n", tid, unboosted_pri);
 
        sleep(8);
 
        /* Check for elevated priority */
-       boosted_pri =  get_pri(mach_thread_self());
+       boosted_pri =  get_base_pri(mach_thread_self());
        T_ASSERT_EQ(boosted_pri, 47, "thread(%llu) priority after contention by 47 thread is %d\n", tid, boosted_pri);
 
        /* return without unlocking the lock */
@@ -178,7 +262,7 @@ unlock_an_owner_exited_lock(void *arg)
        pthread_threadid_np(NULL, &tid);
        sleep(12);
 
-       int old_pri = get_pri(mach_thread_self());
+       int old_pri = get_base_pri(mach_thread_self());
        T_ASSERT_EQ(old_pri, 47, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri);
 
        /* Unlock the test lock causing the turnstile code to call thread_deallocate_safe */
@@ -246,13 +330,166 @@ test3(int type)
        return;
 }
 
-T_DECL(turnstile_test, "Turnstile test", T_META_ASROOT(YES))
+/*
+ * Test 4: test if a chain of user-space turnstile primitives followed by kernel primitives works correctly.
+ */
+static void
+test4(void)
 {
-       test1(SYSCTL_TURNSTILE_TEST_DEFAULT);
-       test2(SYSCTL_TURNSTILE_TEST_DEFAULT);
-       test3(SYSCTL_TURNSTILE_TEST_DEFAULT);
+       pthread_t threads[5] = {};
+       struct thread_data data[5] = {};
+
+       T_LOG("Test 4: test if a chain of user-space turnstile primitives followed by kernel primitives works correctly");
+
+       /*
+        * Chain: t4->ud->t3->uh->t2->kh->t1->kd->t0
+        * ud and uh (user space turnstiles) will push base pri and sched pri
+        * kd and kh (kernel space turnstiles) will push sched pri
+        * sched pri should be propagated up to the end
+        * kh is the breaking point of the chain for base pri
+        */
+
+
+       /* Create a thread at priority 4 and take SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT lock */
+       data[0].pri_to_set = 4;
+       data[0].lock1 = SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT; /* this should not be locked */
+       data[0].lock2 = 0;
+       data[0].sleep = 10; /* long sleep, nothing is blocking this thread */
+       data[0].sched_pri_to_check = 60;
+       data[0].base_pri_to_check = 4;
+       pthread_create(&threads[0], NULL, chain_locking, (void *)&data[0]);
+       sleep(2); /* give the thread time to acquire the lock */
+
+       /* Create a thread at priority 31 and take SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE lock followed by SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT */
+       data[1].pri_to_set = 31;
+       data[1].lock1 = SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE; /* this should not be locked */
+       data[1].lock2 = SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT; /* this should be locked */
+       data[1].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */
+       data[1].sched_pri_to_check = 60;
+       data[1].base_pri_to_check = 31;
+       pthread_create(&threads[1], NULL, chain_locking, (void *)&data[1]);
+       sleep(2); /* give the thread time to acquire the lock */
+
+       /* Create a thread at priority 40 and take SYSCTL_TURNSTILE_TEST_USER_HASHTABLE lock followed by SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE */
+       data[2].pri_to_set = 40;
+       data[2].lock1 = SYSCTL_TURNSTILE_TEST_USER_HASHTABLE; /* this should not be locked */
+       data[2].lock2 = SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE; /* this should be locked */
+       data[2].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */
+       data[2].sched_pri_to_check = 60;
+       data[2].base_pri_to_check = 60;
+       pthread_create(&threads[2], NULL, chain_locking, (void *)&data[2]);
+       sleep(2); /* give the thread time to acquire the lock */
+
+       /* Create a thread at priority 47 and take SYSCTL_TURNSTILE_TEST_USER_DEFAULT lock followed by SYSCTL_TURNSTILE_TEST_USER_HASHTABLE */
+       data[3].pri_to_set = 47;
+       data[3].lock1 = SYSCTL_TURNSTILE_TEST_USER_DEFAULT; /* this should not be locked */
+       data[3].lock2 = SYSCTL_TURNSTILE_TEST_USER_HASHTABLE; /* this should be locked */
+       data[3].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */
+       data[3].sched_pri_to_check = 60;
+       data[3].base_pri_to_check = 60;
+       pthread_create(&threads[3], NULL, chain_locking, (void *)&data[3]);
+       sleep(2); /* give the thread time to acquire the lock */
+
+       /* Create a thread at priority 60 and take SYSCTL_TURNSTILE_TEST_USER_DEFAULT */
+       data[4].pri_to_set = 60;
+       data[4].lock1 = SYSCTL_TURNSTILE_TEST_USER_DEFAULT; /* this should be locked */
+       data[4].lock2 = 0;
+       data[4].sleep = 0; /* no need to sleep, nothing should be pushing by the time it acquires the lock */
+       data[4].sched_pri_to_check = 60; /* this is its own priority */
+       data[4].base_pri_to_check = 60;
+       pthread_create(&threads[4], NULL, chain_locking, (void *)&data[4]);
 
-       test1(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE);
-       test2(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE);
-       test3(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE);
+       sleep(16);
+       return;
+}
+
+/*
+ * Test 5: test if a chain of user-space turnstile primitives interleaved with kernel primitives works correctly.
+ */
+static void
+test5(void)
+{
+       pthread_t threads[5] = {};
+       struct thread_data data[5] = {};
+
+       T_LOG("Test 5: test if a chain of user-space turnstile primitives interleaved by kernel primitives works correctly");
+
+       /*
+        * Chain: t4->ud->t3->kh->t2->uh->t1->kd->t0
+        * ud and uh (user space turnstiles) will push base pri and sched pri
+        * kd and kh (kernel space turnstiles) will push sched pri
+        * uh is the breaking point of the chain for sched pri
+        */
+
+       /* Create a thread at priority 4 and take SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT lock */
+       data[0].pri_to_set = 4;
+       data[0].lock1 = SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT; /* this should not be locked */
+       data[0].lock2 = 0;
+       data[0].sleep = 10; /* long sleep, nothing is blocking this thread */
+       data[0].sched_pri_to_check = 41;
+       data[0].base_pri_to_check = 4;
+       pthread_create(&threads[0], NULL, chain_locking, (void *)&data[0]);
+       sleep(2); /* give the thread time to acquire the lock */
+
+       /* Create a thread at priority 31 and take SYSCTL_TURNSTILE_TEST_USER_HASHTABLE lock followed by SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT */
+       data[1].pri_to_set = 31;
+       data[1].lock1 = SYSCTL_TURNSTILE_TEST_USER_HASHTABLE; /* this should not be locked */
+       data[1].lock2 = SYSCTL_TURNSTILE_TEST_KERNEL_DEFAULT; /* this should be locked */
+       data[1].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */
+       data[1].sched_pri_to_check = 41;
+       data[1].base_pri_to_check = 41;
+       pthread_create(&threads[1], NULL, chain_locking, (void *)&data[1]);
+       sleep(2); /* give the thread time to acquire the lock */
+
+       /* Create a thread at priority 41 and take SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE lock followed by SYSCTL_TURNSTILE_TEST_USER_HASHTABLE */
+       data[2].pri_to_set = 41;
+       data[2].lock1 = SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE; /* this should not be locked */
+       data[2].lock2 = SYSCTL_TURNSTILE_TEST_USER_HASHTABLE; /* this should be locked */
+       data[2].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */
+       data[2].sched_pri_to_check = 60;
+       data[2].base_pri_to_check = 41;
+       pthread_create(&threads[2], NULL, chain_locking, (void *)&data[2]);
+       sleep(2); /* give the thread time to acquire the lock */
+
+       /* Create a thread at priority 47 and take SYSCTL_TURNSTILE_TEST_USER_DEFAULT lock followed by SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE */
+       data[3].pri_to_set = 47;
+       data[3].lock1 = SYSCTL_TURNSTILE_TEST_USER_DEFAULT; /* this should not be locked */
+       data[3].lock2 = SYSCTL_TURNSTILE_TEST_KERNEL_HASHTABLE; /* this should be locked */
+       data[3].sleep = 0; /* no need to sleep, everything should be pushing by the time it acquires the lock */
+       data[3].sched_pri_to_check = 60;
+       data[3].base_pri_to_check = 60;
+       pthread_create(&threads[3], NULL, chain_locking, (void *)&data[3]);
+       sleep(2); /* give the thread time to acquire the lock */
+
+       /* Create a thread at priority 60 and take SYSCTL_TURNSTILE_TEST_USER_DEFAULT */
+       data[4].pri_to_set = 60;
+       data[4].lock1 = SYSCTL_TURNSTILE_TEST_USER_DEFAULT; /* this should be locked */
+       data[4].lock2 = 0;
+       data[4].sleep = 0; /* no need to sleep, nothing should be pushing by the time it acquires the lock */
+       data[4].sched_pri_to_check = 60; /* this is its own priority */
+       data[4].base_pri_to_check = 60;
+       pthread_create(&threads[4], NULL, chain_locking, (void *)&data[4]);
+
+       sleep(16);
+       return;
+}
+
+T_DECL(turnstile_test, "Turnstile test", T_META_ASROOT(YES))
+{
+       test1(SYSCTL_TURNSTILE_TEST_USER_DEFAULT);
+       test2(SYSCTL_TURNSTILE_TEST_USER_DEFAULT);
+       test3(SYSCTL_TURNSTILE_TEST_USER_DEFAULT);
+
+       test1(SYSCTL_TURNSTILE_TEST_USER_HASHTABLE);
+       test2(SYSCTL_TURNSTILE_TEST_USER_HASHTABLE);
+       test3(SYSCTL_TURNSTILE_TEST_USER_HASHTABLE);
+
+       /*
+        * rdar://problem/46302128
+        * These tests use a sysctl to lock a dummy kernel resource that uses a turnstile.
+        * However, a thread holding a kernel push from a turnstile should never return to
+        * userspace, and rdar://problem/24194397 adds an assert for it.
+        */
+       //test4();
+       //test5();
 }
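
The expected priorities in test4 and test5 follow mechanically from the two rules in their comments (user turnstiles push base and sched pri; kernel turnstiles push sched pri only). A worked trace for test4, derived from the data[] values above, with each thread's own priority in parentheses:

    t4 (60) --ud--> t3 (47): user turnstile, pushes both    => t3 base 60, sched 60
    t3      --uh--> t2 (40): user turnstile, pushes both    => t2 base 60, sched 60
    t2      --kh--> t1 (31): kernel turnstile, sched only   => t1 base 31, sched 60
    t1      --kd--> t0 (4):  kernel turnstile, sched only   => t0 base 4,  sched 60

In test5 the user turnstile uh sits below the kernel turnstile kh; a user turnstile propagates its waiter's base pri, and t2's base pri there is only 41, so the 60 push dies at uh: t1 ends up at base 41/sched 41, and kd then carries sched 41 down to t0.
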
index c534bde51d760f990d8194cee311793ac9d970c3..be5d5121ecb822a54e64e0bc07cac6942514fea5 100644 (file)
@@ -13,6 +13,8 @@
 #include <darwintest.h>
 #include <darwintest_utils.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 #define FILENAME "utimensat"
 
 static const struct timespec tptr[][2] = {
index 64a9f6901f0c881d681d34d3027de283abdfe8d3..2c5c08727f420f4951b48e99f7c026d4e5fa7f52 100644 (file)
@@ -6,7 +6,8 @@
 
 T_GLOBAL_META(
        T_META_NAMESPACE("xnu.vm"),
-       T_META_CHECK_LEAKS(false)
+       T_META_CHECK_LEAKS(false),
+       T_META_RUN_CONCURRENTLY(true)
        );
 
 static void run_test(void);
index 4dbea7be7278a70a072d3d2b42b2ee729a9a91de..44c003118b89b21f54522141970db8093a737a71 100644 (file)
 #include <mach/vm_map.h>
 
 #include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+#include <TargetConditionals.h>
 
 #include <Kernel/kern/ledger.h>
 extern int ledger(int cmd, caddr_t arg1, caddr_t arg2, caddr_t arg3);
 
-#if ENTITLED && defined(__arm64__)
-#define LEGACY_FOOTPRINT 1
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+boolean_t legacy_footprint;
+
+#if LEGACY_FOOTPRINT_ENTITLED && defined(__arm64__)
+#define TEST_VM_NAMESPACE "xnu.vm_legacy"
 #else /* ENTITLED && __arm64__ */
-#define LEGACY_FOOTPRINT 0
+#define TEST_VM_NAMESPACE "xnu.vm"
 #endif /* ENTITLED && __arm64__ */
 
 #define MEM_SIZE (100 * 1024 * 1024) /* 100 MB */
@@ -35,6 +42,8 @@ ledger_init(void)
        struct ledger_template_info     *templateInfo;
        int64_t                         templateCnt;
        int                             i;
+       int                             legacy_footprint_entitlement_mode;
+       size_t                          oldlen;
 
        if (ledger_inited) {
                return;
@@ -42,6 +51,24 @@ ledger_init(void)
        ledger_inited = 1;
 
        T_SETUPBEGIN;
+
+       legacy_footprint = FALSE;
+#if LEGACY_FOOTPRINT_ENTITLED
+       int ret;
+
+       T_QUIET;
+       T_WITH_ERRNO;
+       oldlen = sizeof(legacy_footprint_entitlement_mode);
+       ret = sysctlbyname("kern.legacy_footprint_entitlement_mode",
+           &legacy_footprint_entitlement_mode,
+           &oldlen,
+           NULL,
+           0);
+       if (ret == 0 && legacy_footprint_entitlement_mode == 2) {
+               legacy_footprint = TRUE;
+       }
+#endif /* LEGACY_FOOTPRINT_ENTITLED */
+
        T_QUIET;
        T_WITH_ERRNO;
        T_ASSERT_EQ(ledger(LEDGER_INFO,
@@ -192,7 +219,7 @@ pre_warm(
 
 T_DECL(phys_footprint_anonymous,
     "phys_footprint for anonymous memory",
-    T_META_NAMESPACE("xnu.vm"),
+    T_META_NAMESPACE(TEST_VM_NAMESPACE),
     T_META_LTEPHASE(LTE_POSTINIT))
 {
        uint64_t                footprint_before, pagetable_before;
@@ -265,7 +292,7 @@ T_DECL(phys_footprint_anonymous,
 
 T_DECL(phys_footprint_file,
     "phys_footprint for mapped file",
-    T_META_NAMESPACE("xnu.vm"),
+    T_META_NAMESPACE(TEST_VM_NAMESPACE),
     T_META_LTEPHASE(LTE_POSTINIT))
 {
        uint64_t                footprint_before, pagetable_before;
@@ -365,7 +392,7 @@ T_DECL(phys_footprint_file,
 
 T_DECL(phys_footprint_purgeable,
     "phys_footprint for purgeable memory",
-    T_META_NAMESPACE("xnu.vm"),
+    T_META_NAMESPACE(TEST_VM_NAMESPACE),
     T_META_LTEPHASE(LTE_POSTINIT))
 {
        uint64_t                footprint_before, pagetable_before;
@@ -484,7 +511,7 @@ T_DECL(phys_footprint_purgeable,
 
 T_DECL(phys_footprint_purgeable_ownership,
     "phys_footprint for owned purgeable memory",
-    T_META_NAMESPACE("xnu.vm"),
+    T_META_NAMESPACE(TEST_VM_NAMESPACE),
     T_META_LTEPHASE(LTE_POSTINIT))
 {
        uint64_t                footprint_before, pagetable_before;
@@ -648,7 +675,7 @@ T_DECL(phys_footprint_purgeable_ownership,
 #ifdef MAP_MEM_LEDGER_TAGGED
 T_DECL(phys_footprint_ledger_purgeable_owned,
     "phys_footprint for ledger-tagged purgeable memory ownership",
-    T_META_NAMESPACE("xnu.vm"),
+    T_META_NAMESPACE(TEST_VM_NAMESPACE),
     T_META_LTEPHASE(LTE_POSTINIT))
 {
        uint64_t                footprint_before, pagetable_before;
@@ -821,7 +848,7 @@ T_DECL(phys_footprint_ledger_purgeable_owned,
 
 T_DECL(phys_footprint_ledger_owned,
     "phys_footprint for ledger-tagged memory ownership",
-    T_META_NAMESPACE("xnu.vm"),
+    T_META_NAMESPACE(TEST_VM_NAMESPACE),
     T_META_LTEPHASE(LTE_POSTINIT))
 {
        uint64_t                footprint_before, pagetable_before;
@@ -830,7 +857,6 @@ T_DECL(phys_footprint_ledger_owned,
        kern_return_t           kr;
        mach_vm_address_t       pre_vm_addr, vm_addr;
        mach_vm_size_t          vm_size, dirty_size, me_size;
-       int                     state;
        mach_port_t             me_port;
 
        /* pre-warm to account for page table expansion */
@@ -988,6 +1014,11 @@ setIntValue(CFMutableDictionaryRef dict, const CFStringRef key, int value)
        CFDictionarySetValue(dict, key, number);
        CFRelease(number);
 }
+static inline void
+setBoolValue(CFMutableDictionaryRef dict, const CFStringRef key, bool value)
+{
+       CFDictionarySetValue(dict, key, value ? kCFBooleanTrue : kCFBooleanFalse);
+}
 typedef void (^SurfacePlaneBlock)(void *data, size_t planeIndex, size_t width, size_t height, size_t rowbytes);
 static IOReturn
 SurfaceApplyPlaneBlock(IOSurfaceRef surface, SurfacePlaneBlock block)
@@ -1049,6 +1080,24 @@ ClearSurface(IOSurfaceRef surface)
                }
        });
 }
+static size_t
+SurfaceGetMemorySize(IOSurfaceRef surface)
+{
+       size_t planeCount = IOSurfaceGetPlaneCount(surface);
+
+       if (planeCount == 0) {
+               size_t rb = IOSurfaceGetBytesPerRow(surface);
+               size_t h = IOSurfaceGetHeight(surface);
+               return rb * h;
+       } else if (planeCount == 2) {
+               size_t rb0 = IOSurfaceGetBytesPerRowOfPlane(surface, 0);
+               size_t h0 = IOSurfaceGetHeightOfPlane(surface, 0);
+               size_t rb1 = IOSurfaceGetBytesPerRowOfPlane(surface, 1);
+               size_t h1 = IOSurfaceGetHeightOfPlane(surface, 1);
+               return rb0 * h0 + rb1 * h1;
+       }
+       return 0;
+}
 static IOSurfaceRef
 CreateSurface(uint32_t pixelsWide, uint32_t pixelsHigh, uint32_t rowBytesAlignment, uint32_t fmt, bool purgeable, bool clear)
 {
@@ -1075,11 +1124,11 @@ CreateSurface(uint32_t pixelsWide, uint32_t pixelsHigh, uint32_t rowBytesAlignme
        setIntValue(props, kIOSurfaceWidth, (int)pixelsWide);
        setIntValue(props, kIOSurfaceHeight, (int)pixelsHigh);
        setIntValue(props, kIOSurfacePixelFormat, (int)fmt);
-#if TARGET_OS_IPHONE
-       setIntValue(props, kIOSurfaceNonPurgeable, purgeable);
-#else /* TARGET_OS_IPHONE */
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+       setBoolValue(props, kIOSurfaceNonPurgeable, !purgeable);
+#else /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
        (void)purgeable;
-#endif /* TARGET_OS_IPHONE */
+#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
        {
                if (bpe != bpp) { // i.e. a 422 format such as 'yuvf' etc.
                        setIntValue(props, kIOSurfaceElementWidth, 2);
@@ -1099,17 +1148,19 @@ CreateSurface(uint32_t pixelsWide, uint32_t pixelsHigh, uint32_t rowBytesAlignme
 }
 T_DECL(phys_footprint_purgeable_iokit,
     "phys_footprint for purgeable IOKit memory",
-    T_META_NAMESPACE("xnu.vm"),
+    T_META_NAMESPACE(TEST_VM_NAMESPACE),
     T_META_LTEPHASE(LTE_POSTINIT))
 {
        uint64_t        footprint_before, pagetable_before;
        uint64_t        footprint_after, pagetable_after;
-       uint64_t        footprint_expected;
+       uint64_t        footprint_expected, footprint_delta_slop;
+       int64_t         footprint_delta;
        IOSurfaceRef    surface;
        uint32_t        old_state;
        uint64_t        surface_size;
 
        T_SETUPBEGIN;
+       footprint_delta_slop = 8 * vm_kernel_page_size;
        ledger_init();
        surface = CreateSurface(1024, 1024, 0, 32, true, true);
        IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableVolatile, &old_state);
@@ -1123,92 +1174,227 @@ T_DECL(phys_footprint_purgeable_iokit,
        get_ledger_info(&footprint_before, &pagetable_before);
        surface = CreateSurface(1024, 1024, 0, 32, true, true);
        get_ledger_info(&footprint_after, &pagetable_after);
-#if LEGACY_FOOTPRINT
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("LEGACY FOOTPRINT: creating IOSurface: no footprint impact");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "create IOSurface %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#else /* LEGACY_FOOTPRINT */
+       if (legacy_footprint) {
+               footprint_expected = footprint_before;
+               footprint_expected += (pagetable_after - pagetable_before);
+               footprint_delta = (int64_t)(footprint_after - footprint_expected);
+               T_LOG("LEGACY FOOTPRINT: creating purgeable IOSurface: no footprint impact");
+               T_EXPECT_LE((uint64_t)llabs(footprint_delta), footprint_delta_slop,
+                   "create purgeable IOSurface %lld bytes: "
+                   "footprint %lld -> %lld expected %lld delta %lld",
+                   surface_size, footprint_before, footprint_after,
+                   footprint_expected, footprint_delta);
+       } else {
+               footprint_expected = footprint_before + surface_size;
+               footprint_expected += (pagetable_after - pagetable_before);
+               footprint_delta = (int64_t)(footprint_after - footprint_expected);
+               T_LOG("creating purgeable IOSurface increases phys_footprint");
+               T_EXPECT_LE((uint64_t)llabs(footprint_delta), footprint_delta_slop,
+                   "create purgeable IOSurface %lld bytes: "
+                   "footprint %lld -> %lld expected %lld delta %lld",
+                   surface_size, footprint_before, footprint_after,
+                   footprint_expected, footprint_delta);
+       }
+
+       /* make IOSurface volatile: footprint shrinks */
+       get_ledger_info(&footprint_before, &pagetable_before);
+       IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableVolatile, &old_state);
+       get_ledger_info(&footprint_after, &pagetable_after);
+       if (legacy_footprint) {
+               footprint_expected = footprint_before;
+               footprint_expected += (pagetable_after - pagetable_before);
+               T_LOG("LEGACY FOOTPRINT: volatile IOSurface: no footprint impact");
+               T_EXPECT_EQ(footprint_after, footprint_expected,
+                   "volatile IOSurface %lld bytes: "
+                   "footprint %lld -> %lld expected %lld delta %lld",
+                   surface_size, footprint_before, footprint_after,
+                   footprint_expected, footprint_after - footprint_expected);
+       } else {
+               footprint_expected = footprint_before - surface_size;
+               footprint_expected += (pagetable_after - pagetable_before);
+               T_LOG("making IOSurface volatile decreases phys_footprint");
+               T_EXPECT_EQ(footprint_after, footprint_expected,
+                   "made volatile %lld bytes: "
+                   "footprint %lld -> %lld expected %lld delta %lld",
+                   surface_size, footprint_before, footprint_after,
+                   footprint_expected, footprint_after - footprint_expected);
+       }
+
+       /* make IOSurface non-volatile: footprint grows */
+       get_ledger_info(&footprint_before, &pagetable_before);
+       IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableNonVolatile, &old_state);
+       get_ledger_info(&footprint_after, &pagetable_after);
+       if (legacy_footprint) {
+               footprint_expected = footprint_before;
+               footprint_expected += (pagetable_after - pagetable_before);
+               T_LOG("LEGACY FOOTPRINT: non-volatile IOSurface: no footprint impact");
+               T_EXPECT_EQ(footprint_after, footprint_expected,
+                   "non-volatile IOSurface %lld bytes: "
+                   "footprint %lld -> %lld expected %lld delta %lld",
+                   surface_size, footprint_before, footprint_after,
+                   footprint_expected, footprint_after - footprint_expected);
+       } else {
+               footprint_expected = footprint_before + surface_size;
+               footprint_expected += (pagetable_after - pagetable_before);
+               T_LOG("making IOSurface non-volatile increases phys_footprint");
+               T_EXPECT_EQ(footprint_after, footprint_expected,
+                   "made non-volatile %lld bytes: "
+                   "footprint %lld -> %lld expected %lld delta %lld",
+                   surface_size, footprint_before, footprint_after,
+                   footprint_expected, footprint_after - footprint_expected);
+       }
+
+       /* accessing IOSurface re-mapping: no footprint impact */
+
+       /* deallocating IOSurface re-mapping: no footprint impact */
+
+       /* release IOSurface: footprint shrinks */
+       get_ledger_info(&footprint_before, &pagetable_before);
+       CFRelease(surface);
+       get_ledger_info(&footprint_after, &pagetable_after);
+       if (legacy_footprint) {
+               footprint_expected = footprint_before;
+               footprint_expected += (pagetable_after - pagetable_before);
+               T_LOG("LEGACY FOOTPRINT: release IOSurface: no footprint impact");
+               T_EXPECT_EQ(footprint_after, footprint_expected,
+                   "releasing IOSurface %lld bytes: "
+                   "footprint %lld -> %lld expected %lld delta %lld",
+                   surface_size, footprint_before, footprint_after,
+                   footprint_expected, footprint_after - footprint_expected);
+       } else {
+               footprint_expected = footprint_before - surface_size;
+               footprint_expected += (pagetable_after - pagetable_before);
+               T_LOG("releasing IOSurface decreases phys_footprint");
+               T_EXPECT_EQ(footprint_after, footprint_expected,
+                   "released IOSurface %lld bytes: "
+                   "footprint %lld -> %lld expected %lld delta %lld",
+                   surface_size, footprint_before, footprint_after,
+                   footprint_expected, footprint_after - footprint_expected);
+       }
+}
+
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+T_DECL(phys_footprint_nonpurgeable_iokit,
+    "phys_footprint for non-purgeable IOKit memory",
+    T_META_NAMESPACE(TEST_VM_NAMESPACE),
+    T_META_LTEPHASE(LTE_POSTINIT))
+{
+       uint64_t        footprint_before, pagetable_before;
+       uint64_t        footprint_after, pagetable_after;
+       uint64_t        footprint_expected, footprint_delta_slop;
+       int64_t         footprint_delta;
+       IOSurfaceRef    surface;
+       uint64_t        surface_size;
+       void            *map_base;
+       size_t          map_size;
+       mach_vm_address_t remap_addr;
+       kern_return_t kr;
+       vm_prot_t       cur_prot, max_prot;
+       uint32_t        old_state;
+
+
+       T_SETUPBEGIN;
+       ledger_init();
+       surface = CreateSurface(1024, 1024, 0, 32, false, true);
+       CFRelease(surface);
+       footprint_delta_slop = 8 * vm_kernel_page_size;
+       T_SETUPEND;
+
+       surface_size = 1024 * 1024 * 4;
+
+       /* create IOSurface: footprint grows */
+       get_ledger_info(&footprint_before, &pagetable_before);
+       surface = CreateSurface(1024, 1024, 0, 32, false, true);
+       get_ledger_info(&footprint_after, &pagetable_after);
        footprint_expected = footprint_before + surface_size;
        footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("creating IOSurface increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "create IOSurface %lld bytes: "
+       footprint_delta = (int64_t)(footprint_after - footprint_expected);
+       T_LOG("creating non-purgeable IOSurface increases phys_footprint");
+       T_EXPECT_LE((uint64_t)llabs(footprint_delta), footprint_delta_slop,
+           "create non-purgeable IOSurface %lld bytes: "
            "footprint %lld -> %lld expected %lld delta %lld",
            surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#endif /* LEGACY_FOOTPRINT */
+           footprint_expected, footprint_delta);
 
-       /* make IOSurface volatile: footprint shrinks */
+       /* make IOSurface volatile: fail and no footprint impact */
        get_ledger_info(&footprint_before, &pagetable_before);
        IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableVolatile, &old_state);
        get_ledger_info(&footprint_after, &pagetable_after);
-#if LEGACY_FOOTPRINT
        footprint_expected = footprint_before;
        footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("LEGACY FOOTPRINT: volatile IOSurface: no footprint impact");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "volatile IOSurface %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#else /* LEGACY_FOOTPRINT */
-       footprint_expected = footprint_before - surface_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making IOSurface volatile decreases phys_footprint");
+       T_LOG("making non-purgeable IOSurface volatile: no footprint impact");
        T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made volatile %lld bytes: "
+           "made volatile %lld non-purgeable bytes: "
            "footprint %lld -> %lld expected %lld delta %lld",
            surface_size, footprint_before, footprint_after,
            footprint_expected, footprint_after - footprint_expected);
-#endif /* LEGACY_FOOTPRINT */
 
-       /* make IOSurface non-volatile: footprint grows */
+       /* re-mapping IOSurface: no footprint impact */
        get_ledger_info(&footprint_before, &pagetable_before);
-       IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableNonVolatile, &old_state);
+       map_base = IOSurfaceGetBaseAddress(surface);
+       map_size = SurfaceGetMemorySize(surface);
+//     T_EXPECT_EQ(map_size, surface_size, "map_size %lld surface_size %lld",
+//                 map_size, surface_size);
+       remap_addr = 0;
+       kr = mach_vm_remap(mach_task_self(),
+           &remap_addr,
+           (mach_vm_size_t)surface_size,
+           0,
+           VM_FLAGS_ANYWHERE,
+           mach_task_self(),
+           (mach_vm_address_t)map_base,
+           FALSE,                /* copy */
+           &cur_prot,
+           &max_prot,
+           VM_INHERIT_DEFAULT);
+       T_QUIET;
+       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_remap() error 0x%x (%s)",
+           kr, mach_error_string(kr));
        get_ledger_info(&footprint_after, &pagetable_after);
-#if LEGACY_FOOTPRINT
        footprint_expected = footprint_before;
        footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("LEGACY FOOTPRINT: non-volatile IOSurface: no footprint impact");
+       T_LOG("re-mapping IOSurface does not impact phys_footprint");
        T_EXPECT_EQ(footprint_after, footprint_expected,
-           "non-volatile IOSurface %lld bytes: "
+           "remapping IOSurface %lld bytes: "
            "footprint %lld -> %lld expected %lld delta %lld",
            surface_size, footprint_before, footprint_after,
            footprint_expected, footprint_after - footprint_expected);
-#else /* LEGACY_FOOTPRINT */
+
+       /* accessing IOSurface re-mapping: footprint grows */
+       get_ledger_info(&footprint_before, &pagetable_before);
+       memset((char *)(uintptr_t)remap_addr, 'p', (size_t)surface_size);
+       get_ledger_info(&footprint_after, &pagetable_after);
        footprint_expected = footprint_before + surface_size;
        footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making IOSurface non-volatile increases phys_footprint");
+       T_LOG("accessing re-mapped IOSurface grows phys_footprint");
        T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made non-volatile %lld bytes: "
+           "accessing remapped IOSurface %lld bytes: "
            "footprint %lld -> %lld expected %lld delta %lld",
            surface_size, footprint_before, footprint_after,
            footprint_expected, footprint_after - footprint_expected);
-#endif /* LEGACY_FOOTPRINT */
-
-       /* accessing IOSurface re-mapping: no footprint impact */
-
-       /* deallocating IOSurface re-mapping: no footprint impact */
 
-       /* release IOSurface: footprint shrinks */
+       /* deallocating IOSurface re-mapping: footprint shrinks */
        get_ledger_info(&footprint_before, &pagetable_before);
-       CFRelease(surface);
+       kr = mach_vm_deallocate(mach_task_self(),
+           remap_addr,
+           (mach_vm_size_t)surface_size);
+       T_QUIET;
+       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)",
+           kr, mach_error_string(kr));
        get_ledger_info(&footprint_after, &pagetable_after);
-#if LEGACY_FOOTPRINT
-       footprint_expected = footprint_before;
+       footprint_expected = footprint_before - surface_size;
        footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("LEGACY FOOTPRINT: release IOSurface: no footprint impact");
+       T_LOG("deallocating re-mapping of IOSurface shrinks phys_footprint");
        T_EXPECT_EQ(footprint_after, footprint_expected,
-           "releasing IOSurface %lld bytes: "
+           "deallocating remapped IOSurface %lld bytes: "
            "footprint %lld -> %lld expected %lld delta %lld",
            surface_size, footprint_before, footprint_after,
            footprint_expected, footprint_after - footprint_expected);
-#else /* LEGACY_FOOTPRINT */
+
+       /* release IOSurface: footprint shrinks */
+       get_ledger_info(&footprint_before, &pagetable_before);
+       CFRelease(surface);
+       get_ledger_info(&footprint_after, &pagetable_after);
        footprint_expected = footprint_before - surface_size;
        footprint_expected += (pagetable_after - pagetable_before);
        T_LOG("releasing IOSurface decreases phys_footprint");
@@ -1217,5 +1403,5 @@ T_DECL(phys_footprint_purgeable_iokit,
            "footprint %lld -> %lld expected %lld delta %lld",
            surface_size, footprint_before, footprint_after,
            footprint_expected, footprint_after - footprint_expected);
-#endif /* LEGACY_FOOTPRINT */
 }
+#endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
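
The IOSurface checks above now tolerate a deviation of eight kernel pages instead of demanding exact ledger equality. The arithmetic, worked for the values in the test (assuming 16 KB kernel pages, as on arm64; x86_64 kernels use 4 KB):

    surface_size         = 1024 * 1024 * 4;            /* 1024x1024 at 32 bpp = 4 MiB */
    footprint_delta_slop = 8 * vm_kernel_page_size;    /* 8 * 16384 = 131072 bytes */

    footprint_expected = footprint_before + surface_size
                       + (pagetable_after - pagetable_before);
    /* the T_EXPECT_LE checks pass iff
     *   |footprint_after - footprint_expected| <= footprint_delta_slop */
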
diff --git a/tests/vm_phys_footprint_legacy.c b/tests/vm_phys_footprint_legacy.c
deleted file mode 100644 (file)
index c635779..0000000
+++ /dev/null
@@ -1,1223 +0,0 @@
-#define ENTITLED 1
-
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-#include <mach/mach_error.h>
-#include <mach/mach_init.h>
-#include <mach/mach_port.h>
-#include <mach/mach_vm.h>
-#include <mach/task.h>
-#include <mach/task_info.h>
-#include <mach/vm_map.h>
-
-#include <sys/mman.h>
-
-#include <Kernel/kern/ledger.h>
-extern int ledger(int cmd, caddr_t arg1, caddr_t arg2, caddr_t arg3);
-
-#if ENTITLED && defined(__arm64__)
-#define LEGACY_FOOTPRINT 1
-#else /* ENTITLED && __arm64__ */
-#define LEGACY_FOOTPRINT 0
-#endif /* ENTITLED && __arm64__ */
-
-#define MEM_SIZE (100 * 1024 * 1024) /* 100 MB */
-
-static int64_t ledger_count = -1;
-static int footprint_index = -1;
-static int pagetable_index = -1;
-static struct ledger_entry_info *lei = NULL;
-
-static void
-ledger_init(void)
-{
-       static int                      ledger_inited = 0;
-       struct ledger_info              li;
-       struct ledger_template_info     *templateInfo;
-       int64_t                         templateCnt;
-       int                             i;
-
-       if (ledger_inited) {
-               return;
-       }
-       ledger_inited = 1;
-
-       T_SETUPBEGIN;
-       T_QUIET;
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(ledger(LEDGER_INFO,
-           (caddr_t)(uintptr_t)getpid(),
-           (caddr_t)&li,
-           NULL),
-           0,
-           "ledger(LEDGER_INFO)");
-
-       templateCnt = li.li_entries;
-       templateInfo = malloc((size_t)li.li_entries * sizeof(struct ledger_template_info));
-       T_QUIET;
-       T_WITH_ERRNO;
-       T_ASSERT_NE(templateInfo, NULL, "malloc()");
-
-       ledger_count = li.li_entries;
-       footprint_index = -1;
-       pagetable_index = -1;
-       T_QUIET;
-       T_WITH_ERRNO;
-       T_ASSERT_GE(ledger(LEDGER_TEMPLATE_INFO,
-           (caddr_t)templateInfo,
-           (caddr_t)&templateCnt,
-           NULL),
-           0,
-           "ledger(LEDGER_TEMPLATE_INFO)");
-       for (i = 0; i < templateCnt; i++) {
-               if (!strncmp(templateInfo[i].lti_name,
-                   "phys_footprint",
-                   strlen("phys_footprint"))) {
-                       footprint_index = i;
-               } else if (!strncmp(templateInfo[i].lti_name,
-                   "page_table",
-                   strlen("page_table"))) {
-                       pagetable_index = i;
-               }
-       }
-       free(templateInfo);
-
-       lei = (struct ledger_entry_info *)
-           malloc((size_t)ledger_count * sizeof(*lei));
-       T_QUIET;
-       T_WITH_ERRNO;
-       T_ASSERT_NE(lei, NULL, "malloc(ledger_entry_info)");
-
-       T_QUIET;
-       T_ASSERT_NE(footprint_index, -1, "no footprint_index");
-       T_QUIET;
-       T_ASSERT_NE(pagetable_index, -1, "no pagetable_index");
-
-       T_SETUPEND;
-}
-
-static void
-get_ledger_info(
-       uint64_t        *phys_footprint,
-       uint64_t        *page_table)
-{
-       int64_t count;
-
-       count = ledger_count;
-       T_QUIET;
-       T_WITH_ERRNO;
-       T_ASSERT_GE(ledger(LEDGER_ENTRY_INFO,
-           (caddr_t)(uintptr_t)getpid(),
-           (caddr_t)lei,
-           (caddr_t)&count),
-           0,
-           "ledger(LEDGER_ENTRY_INFO)");
-       T_QUIET;
-       T_ASSERT_GT(count, (int64_t)footprint_index, "no entry for footprint");
-       T_QUIET;
-       T_ASSERT_GT(count, (int64_t)pagetable_index, "no entry for pagetable");
-       if (phys_footprint) {
-               *phys_footprint = (uint64_t)(lei[footprint_index].lei_balance);
-       }
-       if (page_table) {
-               *page_table = (uint64_t)(lei[pagetable_index].lei_balance);
-       }
-}
-
-static mach_vm_address_t
-pre_warm(
-       mach_vm_size_t  vm_size)
-{
-       kern_return_t           kr;
-       mach_vm_address_t       vm_addr;
-       unsigned char           BigBufOnStack[100 * 1024];
-       uint64_t                footprint, page_table;
-
-       /* make sure ledgers are ready to be queried */
-       ledger_init();
-
-       T_SETUPBEGIN;
-
-       /*
-        * Touch a few pages ahead on the stack, to make
-        * sure we don't see a footprint increase due to
-        * an extra stack page later.
-        */
-       memset(BigBufOnStack, 0xb, sizeof(BigBufOnStack));
-       T_QUIET;
-       T_EXPECT_EQ(BigBufOnStack[0], 0xb,
-           "BigBufOnStack[0] == 0x%x",
-           BigBufOnStack[0]);
-       T_QUIET;
-       T_EXPECT_EQ(BigBufOnStack[sizeof(BigBufOnStack) - 1], 0xb,
-           "BigBufOnStack[%lu] == 0x%x",
-           sizeof(BigBufOnStack),
-           BigBufOnStack[sizeof(BigBufOnStack) - 1]);
-
-       /*
-        * Pre-allocate, touch and then release the same amount
-        * of memory we'll be allocating later during the test,
-        * to account for any memory overhead (page tables, global
-        * variables, ...).
-        */
-       vm_addr = 0;
-       kr = mach_vm_allocate(mach_task_self(),
-           &vm_addr,
-           vm_size,
-           VM_FLAGS_ANYWHERE);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate(%lld) error 0x%x (%s)",
-           vm_size, kr, mach_error_string(kr));
-       memset((char *)(uintptr_t)vm_addr, 'p', (size_t)vm_size);
-       kr = mach_vm_deallocate(mach_task_self(),
-           vm_addr,
-           vm_size);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-
-       /*
-        * Exercise the ledger code to make sure it's ready to run
-        * without any extra memory overhead later.
-        */
-       get_ledger_info(&footprint, &page_table);
-
-       T_SETUPEND;
-
-       /*
-        * Return the start of the virtual range we pre-warmed, so that the
-        * test can check that it's using the same range.
-        */
-       return vm_addr;
-}
-
-T_DECL(legacy_phys_footprint_anonymous,
-    "phys_footprint for anonymous memory",
-    T_META_NAMESPACE("xnu.vm"),
-    T_META_LTEPHASE(LTE_POSTINIT))
-{
-       uint64_t                footprint_before, pagetable_before;
-       uint64_t                footprint_after, pagetable_after;
-       uint64_t                footprint_expected;
-       kern_return_t           kr;
-       mach_vm_address_t       pre_vm_addr, vm_addr;
-       mach_vm_size_t          vm_size, dirty_size;
-
-       /* pre-warm to account for page table expansion */
-       pre_vm_addr = pre_warm(MEM_SIZE);
-
-       /* allocating virtual memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       vm_addr = 0;
-       vm_size = MEM_SIZE;
-       kr = mach_vm_allocate(mach_task_self(), &vm_addr, vm_size,
-           VM_FLAGS_ANYWHERE);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       T_QUIET;
-       T_EXPECT_EQ(vm_addr, pre_vm_addr, "pre-warm mishap");
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("virtual allocation does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "virtual allocation of %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           vm_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* touching memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       dirty_size = vm_size / 2;
-       memset((char *)(uintptr_t)vm_addr, 'x', (size_t)dirty_size);
-       /* ... should increase footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before + dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("modifying anonymous memory increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "touched %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* deallocating memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       kr = mach_vm_deallocate(mach_task_self(), vm_addr, vm_size);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       /* ... should decrease footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before - dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("deallocating dirty anonymous memory decreases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "deallocated %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-}
-
-#define TEMP_FILE_TEMPLATE "/tmp/phys_footprint_data.XXXXXXXX"
-#define TEMP_FILE_SIZE  (1 * 1024 * 1024)
-
-T_DECL(legacy_phys_footprint_file,
-    "phys_footprint for mapped file",
-    T_META_NAMESPACE("xnu.vm"),
-    T_META_LTEPHASE(LTE_POSTINIT))
-{
-       uint64_t                footprint_before, pagetable_before;
-       uint64_t                footprint_after, pagetable_after;
-       uint64_t                footprint_expected;
-       mach_vm_address_t       pre_vm_addr;
-       int                     fd;
-       char                    *map_addr;
-       size_t                  map_size, dirty_size;
-       ssize_t                 nbytes;
-       char                    tmp_file_name[PATH_MAX] = TEMP_FILE_TEMPLATE;
-       char                    *buf;
-       size_t                  buf_size;
-
-       T_SETUPBEGIN;
-       buf_size = TEMP_FILE_SIZE;
-       T_QUIET;
-       T_ASSERT_NOTNULL(buf = (char *)malloc(buf_size),
-           "allocate %zu-byte buffer", buf_size);
-       memset(buf, 'f', buf_size);
-       T_WITH_ERRNO;
-       T_QUIET;
-       T_ASSERT_NOTNULL(mktemp(tmp_file_name),
-           "create temporary file name");
-       T_WITH_ERRNO;
-       T_QUIET;
-       T_ASSERT_GE(fd = open(tmp_file_name, O_CREAT | O_RDWR),
-           0,
-           "create temp file");
-       T_WITH_ERRNO;
-       T_QUIET;
-       T_ASSERT_EQ(nbytes = write(fd, buf, buf_size),
-           (ssize_t)buf_size,
-           "write %zu bytes", buf_size);
-       free(buf);
-       T_SETUPEND;
-
-       /* pre-warm to account for page table expansion */
-       pre_vm_addr = pre_warm(TEMP_FILE_SIZE);
-
-       /* mapping a file does not impact footprint... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       map_size = TEMP_FILE_SIZE;
-       T_WITH_ERRNO;
-       T_QUIET;
-       T_ASSERT_NOTNULL(map_addr = (char *)mmap(NULL, map_size,
-           PROT_READ | PROT_WRITE,
-           MAP_FILE | MAP_SHARED, fd, 0),
-           "mmap()");
-       T_QUIET;
-       T_EXPECT_EQ((mach_vm_address_t)map_addr, pre_vm_addr,
-           "pre-warm mishap");
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("mapping file does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "mapping file with %zu bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           map_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* touching file-backed memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       dirty_size = map_size / 2;
-       memset(map_addr, 'F', dirty_size);
-       /* ... should not impact footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("modifying file-backed memory does not impact phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "touched %zu bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* deallocating file-backed memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       T_WITH_ERRNO;
-       T_QUIET;
-       T_ASSERT_EQ(munmap(map_addr, map_size),
-           0,
-           "unmap file");
-       /* ... should not impact footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("unmapping file-backed memory does not impact phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "unmapped %zu dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-}
-
-T_DECL(legacy_phys_footprint_purgeable,
-    "phys_footprint for purgeable memory",
-    T_META_NAMESPACE("xnu.vm"),
-    T_META_LTEPHASE(LTE_POSTINIT))
-{
-       uint64_t                footprint_before, pagetable_before;
-       uint64_t                footprint_after, pagetable_after;
-       uint64_t                footprint_expected;
-       kern_return_t           kr;
-       mach_vm_address_t       pre_vm_addr, vm_addr;
-       mach_vm_size_t          vm_size, dirty_size;
-       int                     state;
-
-       /* pre-warm to account for page table expansion */
-       pre_vm_addr = pre_warm(MEM_SIZE);
-
-       /* allocating purgeable virtual memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       vm_addr = 0;
-       vm_size = MEM_SIZE;
-       kr = mach_vm_allocate(mach_task_self(), &vm_addr, vm_size,
-           VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       T_QUIET;
-       T_EXPECT_EQ(vm_addr, pre_vm_addr, "pre-warm mishap");
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("purgeable virtual allocation does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "purgeable virtual allocation of %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           vm_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* touching memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       dirty_size = vm_size / 2;
-       memset((char *)(uintptr_t)vm_addr, 'x', (size_t)dirty_size);
-       /* ... should increase footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before + dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("modifying anonymous memory increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "touched %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* making it volatile... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       state = VM_PURGABLE_VOLATILE;
-       T_QUIET;
-       T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(),
-           vm_addr,
-           VM_PURGABLE_SET_STATE,
-           &state),
-           KERN_SUCCESS,
-           "vm_purgable_control(VOLATILE)");
-       T_QUIET;
-       T_ASSERT_EQ(state, VM_PURGABLE_NONVOLATILE,
-           "memory was non-volatile");
-       /* ... should decrease footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before - dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making volatile decreases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made volatile %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* making it non-volatile... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       state = VM_PURGABLE_NONVOLATILE;
-       T_QUIET;
-       T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(),
-           vm_addr,
-           VM_PURGABLE_SET_STATE,
-           &state),
-           KERN_SUCCESS,
-           "vm_purgable_control(NONVOLATILE)");
-       T_QUIET;
-       T_ASSERT_EQ(state, VM_PURGABLE_VOLATILE,
-           "memory was volatile");
-       /* ... should increase footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before + dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making non-volatile increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made non-volatile %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* deallocating memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       kr = mach_vm_deallocate(mach_task_self(), vm_addr, vm_size);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       /* ... should decrease footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before - dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("deallocating memory decreases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "deallocated %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-}
-
-T_DECL(legacy_phys_footprint_purgeable_ownership,
-    "phys_footprint for owned purgeable memory",
-    T_META_NAMESPACE("xnu.vm"),
-    T_META_LTEPHASE(LTE_POSTINIT))
-{
-       uint64_t                footprint_before, pagetable_before;
-       uint64_t                footprint_after, pagetable_after;
-       uint64_t                footprint_expected;
-       kern_return_t           kr;
-       mach_vm_address_t       pre_vm_addr, vm_addr;
-       mach_vm_size_t          vm_size, dirty_size, me_size;
-       int                     state;
-       mach_port_t             me_port;
-
-       /* pre-warm to account for page table expansion */
-       pre_vm_addr = pre_warm(MEM_SIZE);
-
-       /* allocating purgeable virtual memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       vm_addr = 0;
-       vm_size = MEM_SIZE;
-       kr = mach_vm_allocate(mach_task_self(), &vm_addr, vm_size,
-           VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_allocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       T_QUIET;
-       T_EXPECT_EQ(vm_addr, pre_vm_addr, "pre-warm mishap");
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("purgeable virtual allocation does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "purgeable virtual allocation of %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           vm_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* touching memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       dirty_size = vm_size / 2;
-       memset((char *)(uintptr_t)vm_addr, 'x', (size_t)dirty_size);
-       /* ... should increase footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before + dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("modifying anonymous memory increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "touched %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* making it volatile... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       state = VM_PURGABLE_VOLATILE;
-       T_QUIET;
-       T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(),
-           vm_addr,
-           VM_PURGABLE_SET_STATE,
-           &state),
-           KERN_SUCCESS,
-           "vm_purgable_control(VOLATILE)");
-       T_QUIET;
-       T_ASSERT_EQ(state, VM_PURGABLE_NONVOLATILE,
-           "memory was non-volatile");
-       /* ... should decrease footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before - dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making volatile decreases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made volatile %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* making it non-volatile... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       state = VM_PURGABLE_NONVOLATILE;
-       T_QUIET;
-       T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(),
-           vm_addr,
-           VM_PURGABLE_SET_STATE,
-           &state),
-           KERN_SUCCESS,
-           "vm_purgable_control(NONVOLATILE)");
-       T_QUIET;
-       T_ASSERT_EQ(state, VM_PURGABLE_VOLATILE,
-           "memory was volatile");
-       /* ... should increase footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before + dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making non-volatile increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made non-volatile %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* making a memory entry... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       me_size = vm_size;
-       me_port = MACH_PORT_NULL;
-       kr = mach_make_memory_entry_64(mach_task_self(),
-           &me_size,
-           vm_addr,
-           VM_PROT_READ | VM_PROT_WRITE,
-           &me_port,
-           MACH_PORT_NULL);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "make_memory_entry() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       T_QUIET;
-       T_EXPECT_EQ(me_size, vm_size, "memory entry size mismatch");
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making a memory entry does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "making a memory entry of %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           vm_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* deallocating memory while holding memory entry... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       kr = mach_vm_deallocate(mach_task_self(), vm_addr, vm_size);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("deallocating owned memory while holding memory entry "
-           "does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "deallocated %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* releasing the memory entry... */
-       kr = mach_port_deallocate(mach_task_self(), me_port);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "mach_port_deallocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       /* ... should decrease footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before - dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("releasing memory entry decreases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made volatile %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-}
-
-#ifdef MAP_MEM_LEDGER_TAGGED
-T_DECL(legacy_phys_footprint_ledger_purgeable_owned,
-    "phys_footprint for ledger-tagged purgeable memory ownership",
-    T_META_NAMESPACE("xnu.vm"),
-    T_META_LTEPHASE(LTE_POSTINIT))
-{
-       uint64_t                footprint_before, pagetable_before;
-       uint64_t                footprint_after, pagetable_after;
-       uint64_t                footprint_expected;
-       kern_return_t           kr;
-       mach_vm_address_t       pre_vm_addr, vm_addr;
-       mach_vm_size_t          vm_size, dirty_size, me_size;
-       int                     state;
-       mach_port_t             me_port;
-
-       /* pre-warm to account for page table expansion */
-       pre_vm_addr = pre_warm(MEM_SIZE);
-
-       /* making a memory entry... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       vm_size = MEM_SIZE;
-       me_size = vm_size;
-       me_port = MACH_PORT_NULL;
-       kr = mach_make_memory_entry_64(mach_task_self(),
-           &me_size,
-           0,
-           (MAP_MEM_NAMED_CREATE |
-           MAP_MEM_LEDGER_TAGGED |
-           MAP_MEM_PURGABLE |
-           VM_PROT_READ | VM_PROT_WRITE),
-           &me_port,
-           MACH_PORT_NULL);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "make_memory_entry() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       T_QUIET;
-       T_EXPECT_EQ(me_size, vm_size, "memory entry size mismatch");
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making a memory entry does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "making a memory entry of %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           vm_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* mapping ledger-tagged virtual memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       vm_addr = 0;
-       kr = mach_vm_map(mach_task_self(), &vm_addr, vm_size,
-           0, /* mask */
-           VM_FLAGS_ANYWHERE,
-           me_port,
-           0, /* offset */
-           FALSE, /* copy */
-           VM_PROT_READ | VM_PROT_WRITE,
-           VM_PROT_READ | VM_PROT_WRITE,
-           VM_INHERIT_DEFAULT);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_map() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       T_QUIET;
-       T_EXPECT_EQ(vm_addr, pre_vm_addr, "pre-warm mishap");
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("mapping ledger-tagged memory does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "ledger-tagged mapping of %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           vm_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* touching memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       dirty_size = vm_size / 2;
-       memset((char *)(uintptr_t)vm_addr, 'x', (size_t)dirty_size);
-       /* ... should increase footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before + dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("modifying ledger-tagged memory increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "touched %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* making it volatile... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       state = VM_PURGABLE_VOLATILE;
-       T_QUIET;
-       T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(),
-           vm_addr,
-           VM_PURGABLE_SET_STATE,
-           &state),
-           KERN_SUCCESS,
-           "vm_purgable_control(VOLATILE)");
-       T_QUIET;
-       T_ASSERT_EQ(state, VM_PURGABLE_NONVOLATILE,
-           "memory was non-volatile");
-       /* ... should decrease footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before - dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making volatile decreases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made volatile %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* making it non-volatile... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       state = VM_PURGABLE_NONVOLATILE;
-       T_QUIET;
-       T_ASSERT_EQ(mach_vm_purgable_control(mach_task_self(),
-           vm_addr,
-           VM_PURGABLE_SET_STATE,
-           &state),
-           KERN_SUCCESS,
-           "vm_purgable_control(NONVOLATILE)");
-       T_QUIET;
-       T_ASSERT_EQ(state, VM_PURGABLE_VOLATILE,
-           "memory was volatile");
-       /* ... should increase footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before + dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making non-volatile increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made non-volatile %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* deallocating memory while holding memory entry... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       kr = mach_vm_deallocate(mach_task_self(), vm_addr, vm_size);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("deallocating owned memory while holding memory entry "
-           "does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "deallocated %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* releasing the memory entry... */
-       kr = mach_port_deallocate(mach_task_self(), me_port);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "mach_port_deallocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       /* ... should decrease footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before - dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("releasing memory entry decreases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made volatile %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-}
-
-T_DECL(legacy_phys_footprint_ledger_owned,
-    "phys_footprint for ledger-tagged memory ownership",
-    T_META_NAMESPACE("xnu.vm"),
-    T_META_LTEPHASE(LTE_POSTINIT))
-{
-       uint64_t                footprint_before, pagetable_before;
-       uint64_t                footprint_after, pagetable_after;
-       uint64_t                footprint_expected;
-       kern_return_t           kr;
-       mach_vm_address_t       pre_vm_addr, vm_addr;
-       mach_vm_size_t          vm_size, dirty_size, me_size;
-       int                     state;
-       mach_port_t             me_port;
-
-       /* pre-warm to account for page table expansion */
-       pre_vm_addr = pre_warm(MEM_SIZE);
-
-       /* making a memory entry... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       vm_size = MEM_SIZE;
-       me_size = vm_size;
-       me_port = MACH_PORT_NULL;
-       kr = mach_make_memory_entry_64(mach_task_self(),
-           &me_size,
-           0,
-           (MAP_MEM_NAMED_CREATE |
-           MAP_MEM_LEDGER_TAGGED |
-           VM_PROT_READ | VM_PROT_WRITE),
-           &me_port,
-           MACH_PORT_NULL);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "make_memory_entry() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       T_QUIET;
-       T_EXPECT_EQ(me_size, vm_size, "memory entry size mismatch");
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making a memory entry does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "making a memory entry of %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           vm_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* mapping ledger-tagged virtual memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       vm_addr = 0;
-       kr = mach_vm_map(mach_task_self(), &vm_addr, vm_size,
-           0, /* mask */
-           VM_FLAGS_ANYWHERE,
-           me_port,
-           0, /* offset */
-           FALSE, /* copy */
-           VM_PROT_READ | VM_PROT_WRITE,
-           VM_PROT_READ | VM_PROT_WRITE,
-           VM_INHERIT_DEFAULT);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_map() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       T_QUIET;
-       T_EXPECT_EQ(vm_addr, pre_vm_addr, "pre-warm mishap");
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("mapping ledger-tagged memory does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "ledger-tagged mapping of %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           vm_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* touching memory... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       dirty_size = vm_size / 2;
-       memset((char *)(uintptr_t)vm_addr, 'x', (size_t)dirty_size);
-       /* ... should increase footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before + dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("modifying ledger-tagged memory increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "touched %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* deallocating memory while holding memory entry... */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       kr = mach_vm_deallocate(mach_task_self(), vm_addr, vm_size);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "vm_deallocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       /* ... should not change footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("deallocating owned memory while holding memory entry "
-           "does not change phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "deallocated %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-
-       /* releasing the memory entry... */
-       kr = mach_port_deallocate(mach_task_self(), me_port);
-       T_QUIET;
-       T_EXPECT_EQ(kr, KERN_SUCCESS, "mach_port_deallocate() error 0x%x (%s)",
-           kr, mach_error_string(kr));
-       /* ... should decrease footprint */
-       get_ledger_info(&footprint_after, &pagetable_after);
-       footprint_expected = footprint_before - dirty_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("releasing memory entry decreases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made volatile %lld dirty bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           dirty_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-}
-#endif /* MAP_MEM_LEDGER_TAGGED */
-
-/* IOSurface code from: CoreImage/CoreImageTests/CIRender/SurfaceUtils.c */
-#include <CoreFoundation/CoreFoundation.h>
-#include <IOSurface/IOSurface.h>
-#include <IOSurface/IOSurfacePrivate.h>
-static size_t
-bytes_per_element(uint32_t format)
-{
-       size_t bpe = 0;
-       switch (format) {
-       case 32:     // kCVPixelFormatType_32ARGB (ARGB8)
-               bpe = 4;
-               break;
-       default:
-               bpe = 0;
-               break;
-       }
-       return bpe;
-}
-static size_t
-bytes_per_pixel(uint32_t format)
-{
-       size_t bpe = 0;
-       switch (format) {
-       case 32:     // kCVPixelFormatType_32ARGB (ARGB8)
-               bpe = 4;
-               break;
-       default:
-               bpe = 0;
-               break;
-       }
-       return bpe;
-}
-static inline size_t
-roundSizeToMultiple(size_t size, size_t mult)
-{
-       return ((size + mult - 1) / mult) * mult;
-}
-static inline void
-setIntValue(CFMutableDictionaryRef dict, const CFStringRef key, int value)
-{
-       CFNumberRef number = CFNumberCreate(0, kCFNumberIntType, &value);
-       CFDictionarySetValue(dict, key, number);
-       CFRelease(number);
-}
-typedef void (^SurfacePlaneBlock)(void *data, size_t planeIndex, size_t width, size_t height, size_t rowbytes);
-static IOReturn
-SurfaceApplyPlaneBlock(IOSurfaceRef surface, SurfacePlaneBlock block)
-{
-       if (surface == nil || block == nil) {
-               return kIOReturnBadArgument;
-       }
-
-       IOReturn result = kIOReturnSuccess;
-       size_t planeCount = IOSurfaceGetPlaneCount(surface);
-
-       if (planeCount == 0) {
-               result = IOSurfaceLock(surface, 0, NULL);
-               if (result != kIOReturnSuccess) {
-                       return result;
-               }
-
-               void* base = IOSurfaceGetBaseAddress(surface);
-               size_t rb = IOSurfaceGetBytesPerRow(surface);
-               size_t w = IOSurfaceGetWidth(surface);
-               size_t h = IOSurfaceGetHeight(surface);
-
-               if (base && rb && w && h) {
-                       block(base, 0, w, h, rb);
-               }
-
-               IOSurfaceUnlock(surface, 0, NULL);
-       } else if (planeCount == 2) {
-               for (size_t i = 0; i < planeCount; i++) {
-                       result = IOSurfaceLock(surface, 0, NULL);
-                       if (result != kIOReturnSuccess) {
-                               return result;
-                       }
-
-                       void* base = IOSurfaceGetBaseAddressOfPlane(surface, i);
-                       size_t rb = IOSurfaceGetBytesPerRowOfPlane(surface, i);
-                       size_t w = IOSurfaceGetWidthOfPlane(surface, i);
-                       size_t h = IOSurfaceGetHeightOfPlane(surface, i);
-
-                       if (base && rb && w && h) {
-                               block(base, i, w, h, rb);
-                       }
-
-                       IOSurfaceUnlock(surface, 0, NULL);
-               }
-       }
-       return result;
-}
-static void
-ClearSurface(IOSurfaceRef surface)
-{
-       const int zero = 0;
-       (void) SurfaceApplyPlaneBlock(surface, ^(void *p, size_t i, __unused size_t w, size_t h, size_t rb)
-       {
-               if (i == 0) {
-                       memset(p, zero, rb * h);
-               } else {
-                       memset(p, 128, rb * h);
-               }
-       });
-}
-static IOSurfaceRef
-CreateSurface(uint32_t pixelsWide, uint32_t pixelsHigh, uint32_t rowBytesAlignment, uint32_t fmt, bool purgeable, bool clear)
-{
-       IOSurfaceRef surface = nil;
-
-       if (pixelsWide < 1 || pixelsHigh < 1 || fmt == 0) {
-               return nil;
-       }
-
-       size_t bpp = bytes_per_pixel(fmt);
-       size_t bpe = bytes_per_element(fmt);
-       if (bpp == 0 || bpe == 0) {
-               return nil;
-       }
-
-       size_t rowbytes = pixelsWide * bpp;
-       if (rowBytesAlignment == 0) {
-               rowBytesAlignment = 16;
-       }
-       rowbytes = roundSizeToMultiple(rowbytes, rowBytesAlignment);
-
-       CFMutableDictionaryRef props = CFDictionaryCreateMutable(0, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
-       setIntValue(props, kIOSurfaceBytesPerRow, (int)rowbytes);
-       setIntValue(props, kIOSurfaceWidth, (int)pixelsWide);
-       setIntValue(props, kIOSurfaceHeight, (int)pixelsHigh);
-       setIntValue(props, kIOSurfacePixelFormat, (int)fmt);
-#if TARGET_OS_IPHONE
-       setIntValue(props, kIOSurfaceNonPurgeable, purgeable);
-#else /* TARGET_OS_IPHONE */
-       (void)purgeable;
-#endif /* TARGET_OS_IPHONE */
-       {
-               if (bpe != bpp) { // i.e. a 422 format such as 'yuvf' etc.
-                       setIntValue(props, kIOSurfaceElementWidth, 2);
-                       setIntValue(props, kIOSurfaceElementHeight, 1);
-               }
-               setIntValue(props, kIOSurfaceBytesPerElement, (int)bpe);
-       }
-
-       surface = IOSurfaceCreate(props);
-
-       if (clear) {
-               ClearSurface(surface);
-       }
-
-       CFRelease(props);
-       return surface;
-}
-T_DECL(legacy_phys_footprint_purgeable_iokit,
-    "phys_footprint for purgeable IOKit memory",
-    T_META_NAMESPACE("xnu.vm"),
-    T_META_LTEPHASE(LTE_POSTINIT))
-{
-       uint64_t        footprint_before, pagetable_before;
-       uint64_t        footprint_after, pagetable_after;
-       uint64_t        footprint_expected;
-       IOSurfaceRef    surface;
-       uint32_t        old_state;
-       uint64_t        surface_size;
-
-       T_SETUPBEGIN;
-       ledger_init();
-       surface = CreateSurface(1024, 1024, 0, 32, true, true);
-       IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableVolatile, &old_state);
-       IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableNonVolatile, &old_state);
-       CFRelease(surface);
-       T_SETUPEND;
-
-       surface_size = 1024 * 1024 * 4;
-
-       /* create IOsurface: footprint grows */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       surface = CreateSurface(1024, 1024, 0, 32, true, true);
-       get_ledger_info(&footprint_after, &pagetable_after);
-#if LEGACY_FOOTPRINT
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("LEGACY FOOTPRINT: creating IOSurface: no footprint impact");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "create IOSurface %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#else /* LEGACY_FOOTPRINT */
-       footprint_expected = footprint_before + surface_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("creating IOSurface increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "create IOSurface %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#endif /* LEGACY_FOOTPRINT */
-
-       /* make IOSurface volatile: footprint shrinks */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableVolatile, &old_state);
-       get_ledger_info(&footprint_after, &pagetable_after);
-#if LEGACY_FOOTPRINT
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("LEGACY FOOTPRINT: volatile IOSurface: no footprint impact");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "volatile IOSurface %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#else /* LEGACY_FOOTPRINT */
-       footprint_expected = footprint_before - surface_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making IOSurface volatile decreases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made volatile %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#endif /* LEGACY_FOOTPRINT */
-
-       /* make IOSurface non-volatile: footprint grows */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       IOSurfaceSetPurgeable(surface, kIOSurfacePurgeableNonVolatile, &old_state);
-       get_ledger_info(&footprint_after, &pagetable_after);
-#if LEGACY_FOOTPRINT
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("LEGACY FOOTPRINT: non-volatile IOSurface: no footprint impact");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "non-volatile IOSurface %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#else /* LEGACY_FOOTPRINT */
-       footprint_expected = footprint_before + surface_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("making IOSurface non-volatile increases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "made non-volatile %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#endif /* LEGACY_FOOTPRINT */
-
-       /* accessing IOSurface re-mapping: no footprint impact */
-
-       /* deallocating IOSurface re-mapping: no footprint impact */
-
-       /* release IOSurface: footprint shrinks */
-       get_ledger_info(&footprint_before, &pagetable_before);
-       CFRelease(surface);
-       get_ledger_info(&footprint_after, &pagetable_after);
-#if LEGACY_FOOTPRINT
-       footprint_expected = footprint_before;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("LEGACY FOOTPRINT: release IOSurface: no footprint impact");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "releasing IOSurface %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#else /* LEGACY_FOOTPRINT */
-       footprint_expected = footprint_before - surface_size;
-       footprint_expected += (pagetable_after - pagetable_before);
-       T_LOG("releasing IOSurface decreases phys_footprint");
-       T_EXPECT_EQ(footprint_after, footprint_expected,
-           "released IOSurface %lld bytes: "
-           "footprint %lld -> %lld expected %lld delta %lld",
-           surface_size, footprint_before, footprint_after,
-           footprint_expected, footprint_after - footprint_expected);
-#endif /* LEGACY_FOOTPRINT */
-}
index ac03b77ff6b9e12cee6ab83ae287608571f185f3..e1c06259f75af35ddd96f74b3bdcf97c5a0b17be 100644 (file)
@@ -8,6 +8,8 @@
 #include <darwintest.h>
 #include <darwintest_utils.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 extern char * testpath;
 
 T_DECL(set_max_addr,
index f5107ff84f759160b18381bfc0d57721e26979d5..9b6ea0837733d79446e0e82d1207e56f064f1606 100644 (file)
@@ -10,6 +10,8 @@
 #include <mach/mach.h>
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 T_DECL(voucher_entry, "voucher_entry", T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
 {
        kern_return_t kr        = KERN_SUCCESS;
index 0b4967720d972d05cd776524bcd3645087136d3c..6731d3bb12f9c9e926496d8fc25be95bf3f73cd0 100644 (file)
@@ -20,6 +20,7 @@
 
 #include <darwintest.h>
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
 
 static mach_port_t
 get_atm_voucher(void)
index c80267309edf2b976a1b26432e9691dac57cf1fd..c46de40691de42dcfb10c232d28f41414a45eacc 100644 (file)
@@ -13,7 +13,8 @@
 
 #include <darwintest.h>
 
-T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"));
+T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"),
+    T_META_RUN_CONCURRENTLY(true));
 
 static mach_port_t port = MACH_PORT_NULL;
 
index 458307962fce582345ba9cbe8b45518d0a3339c1..deb7d37929f19e4cddc9f4f8243a9f3a489f918c 100644 (file)
@@ -12,6 +12,8 @@
 
 #if !TARGET_OS_IPHONE
 
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+
 static pthread_t workq_thread;
 static bool signal_received;
 
index 412f8558a8a9d8119d32ebc2436825aba0ad2082..81b4bdbc7bc59fa968a9747e7b07f2bc326013a7 100644 (file)
 #include <sys/sysctl.h>
 #include <sys/wait.h>
 
-T_GLOBAL_META(T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false));
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.quicktest"),
+       T_META_CHECK_LEAKS(false),
+       T_META_RUN_CONCURRENTLY(true)
+       );
+
 char g_target_path[PATH_MAX];
 
-/*  **************************************************************************************************************
- *     Test the syscall system call.
- *  **************************************************************************************************************
- */
-T_DECL(syscall,
-    "xnu_quick_test for syscall", T_META_CHECK_LEAKS(NO))
+T_DECL(syscall, "xnu_quick_test for syscall")
 {
        int                             my_fd = -1;
        char *                  my_pathp;
@@ -59,12 +59,8 @@ T_DECL(syscall,
        T_ATEND(remove_target_directory);
 }
 
-/*  **************************************************************************************************************
- *     Test fork wait4, and exit system calls.
- *  **************************************************************************************************************
- */
 T_DECL(fork_wait4_exit,
-    "Tests forking off a process and waiting for the child to exit", T_META_CHECK_LEAKS(false))
+    "Tests forking off a process and waiting for the child to exit")
 {
        int                             my_err, my_status;
        pid_t                       my_pid, my_wait_pid;
@@ -104,15 +100,15 @@ T_DECL(fork_wait4_exit,
            "check if wait4 returns right exit status");
 }
 
-T_DECL(getrusage, "Sanity check of getrusage")
+T_DECL(getrusage, "check getrusage works")
 {
-       struct rusage   my_rusage;
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(getrusage( RUSAGE_SELF, &my_rusage ), 0, NULL);
-       T_LOG("Checking that getrusage returned sane values");
-       T_EXPECT_LT(my_rusage.ru_msgrcv, 1000, NULL);
-       T_EXPECT_GE(my_rusage.ru_msgrcv, 0, NULL);
-       T_EXPECT_LT(my_rusage.ru_nsignals, 1000, NULL);
-       T_EXPECT_GE(my_rusage.ru_nsignals, 0, NULL);
+       struct rusage rubuf;
+
+       int ret = getrusage(RUSAGE_SELF, &rubuf);
+       T_ASSERT_POSIX_SUCCESS(ret, "getrusage for self");
+
+       T_EXPECT_LT(rubuf.ru_msgrcv, 1000, "upper bound on messages received");
+       T_EXPECT_GE(rubuf.ru_msgrcv, 0, "lower bound on messages reseived");
+       T_EXPECT_LT(rubuf.ru_nsignals, 1000, "upper bound on signals");
+       T_EXPECT_GE(rubuf.ru_nsignals, 0, "lower bound on signals");
 }
index b3d6a9d4b3590be9c4c50a16d4040b39c48b435e..24c96e43f16bf0348a52cbda5e0157f49ec3b4b4 100644 (file)
@@ -9,11 +9,15 @@
 #include <sys/ioctl.h>
 #include <sys/mount.h>
 
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 #include <sys/csr.h>
 #endif
 
-T_GLOBAL_META(T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false));
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.quicktest"),
+       T_META_CHECK_LEAKS(false),
+       T_META_RUN_CONCURRENTLY(true)
+       );
 
 
 /*  **************************************************************************************************************
@@ -31,7 +35,7 @@ T_DECL(ioctl, "Sanity check of ioctl by exercising DKIOCGETBLOCKCOUNT and DKIOCG
        long long                       my_block_count;
        char                            my_name[MAXPATHLEN];
 
-#if !TARGET_OS_EMBEDDED
+#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
        /*
         * this test won't be able to open the root disk device unless CSR is
         * disabled or in AppleInternal mode
index ec62af5495df256e3c5fe123ed55af448105204f..4772c5912f2f5178890f34d37aae3c863b97bb82 100644 (file)
@@ -5,7 +5,11 @@
 #include <sys/stat.h>
 #include <sys/wait.h>
 
-T_GLOBAL_META(T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false));
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.quicktest"),
+       T_META_CHECK_LEAKS(false),
+       T_META_RUN_CONCURRENTLY(true)
+       );
 
 T_DECL(getpriority_setpriority, "Tests getpriority and setpriority system calls", T_META_ASROOT(true))
 {
diff --git a/tools/cocci/OSAtomic_rewrite.cocci b/tools/cocci/OSAtomic_rewrite.cocci
new file mode 100644 (file)
index 0000000..6c34e2a
--- /dev/null
@@ -0,0 +1,202 @@
+// To apply, at the top of xnu.git:
+// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/OSAtomic_rewrite.cocci -dir .
+//
+// coccinelle insists on adding a space for (void) casts which can be fixed with:
+// $ git grep -l '(void) os_atomic' | xargs -n1 sed -i '' -e 's/(void) os_atomic/(void)os_atomic/'
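+//
+// For illustration only (hypothetical call site, not taken from xnu), the
+// rules below turn a statement such as:
+//   OSIncrementAtomic(&counter);
+// into:
+//   os_atomic_inc_orig(&counter, relaxed);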
+
+@@ expression E; @@
+
+(
+- OSIncrementAtomic(E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSIncrementAtomic8(E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSIncrementAtomic16(E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSIncrementAtomic32(E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSIncrementAtomic64(E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSIncrementAtomicLong(E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSAddAtomic(1, E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSAddAtomic8(1, E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSAddAtomic16(1, E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSAddAtomic32(1, E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSAddAtomic64(1, E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSAddAtomicLong(1, E)
++ os_atomic_inc_orig(E, relaxed)
+|
+- OSDecrementAtomic(E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSDecrementAtomic8(E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSDecrementAtomic16(E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSDecrementAtomic32(E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSDecrementAtomic64(E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSDecrementAtomicLong(E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSAddAtomic(-1, E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSAddAtomic8(-1, E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSAddAtomic16(-1, E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSAddAtomic32(-1, E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSAddAtomic64(-1, E)
++ os_atomic_dec_orig(E, relaxed)
+|
+- OSAddAtomicLong(-1, E)
++ os_atomic_dec_orig(E, relaxed)
+)
+
+@@ expression E, F; @@
+
+(
+- OSAddAtomic(-F, E)
++ os_atomic_sub_orig(E, F, relaxed)
+|
+- OSAddAtomic8(-F, E)
++ os_atomic_sub_orig(E, F, relaxed)
+|
+- OSAddAtomic16(-F, E)
++ os_atomic_sub_orig(E, F, relaxed)
+|
+- OSAddAtomic32(-F, E)
++ os_atomic_sub_orig(E, F, relaxed)
+|
+- OSAddAtomic64(-F, E)
++ os_atomic_sub_orig(E, F, relaxed)
+|
+- OSAddAtomicLong(-F, E)
++ os_atomic_sub_orig(E, F, relaxed)
+|
+- OSAddAtomic(F, E)
++ os_atomic_add_orig(E, F, relaxed)
+|
+- OSAddAtomic8(F, E)
++ os_atomic_add_orig(E, F, relaxed)
+|
+- OSAddAtomic16(F, E)
++ os_atomic_add_orig(E, F, relaxed)
+|
+- OSAddAtomic32(F, E)
++ os_atomic_add_orig(E, F, relaxed)
+|
+- OSAddAtomic64(F, E)
++ os_atomic_add_orig(E, F, relaxed)
+|
+- OSAddAtomicLong(F, E)
++ os_atomic_add_orig(E, F, relaxed)
+|
+- OSBitOrAtomic(F, E)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- OSBitOrAtomic8(F, E)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- OSBitOrAtomic16(F, E)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- OSBitOrAtomic32(F, E)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- OSBitOrAtomic64(F, E)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- OSBitOrAtomicLong(F, E)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- OSBitXorAtomic(F, E)
++ os_atomic_xor_orig(E, F, relaxed)
+|
+- OSBitXorAtomic8(F, E)
++ os_atomic_xor_orig(E, F, relaxed)
+|
+- OSBitXorAtomic16(F, E)
++ os_atomic_xor_orig(E, F, relaxed)
+|
+- OSBitXorAtomic32(F, E)
++ os_atomic_xor_orig(E, F, relaxed)
+|
+- OSBitXorAtomic64(F, E)
++ os_atomic_xor_orig(E, F, relaxed)
+|
+- OSBitXorAtomicLong(F, E)
++ os_atomic_xor_orig(E, F, relaxed)
+|
+- OSBitAndAtomic(F, E)
++ os_atomic_and_orig(E, F, relaxed)
+|
+- OSBitAndAtomic8(F, E)
++ os_atomic_and_orig(E, F, relaxed)
+|
+- OSBitAndAtomic16(F, E)
++ os_atomic_and_orig(E, F, relaxed)
+|
+- OSBitAndAtomic32(F, E)
++ os_atomic_and_orig(E, F, relaxed)
+|
+- OSBitAndAtomic64(F, E)
++ os_atomic_and_orig(E, F, relaxed)
+|
+- OSBitAndAtomicLong(F, E)
++ os_atomic_and_orig(E, F, relaxed)
+)
+
+@@ expression E, F, A; @@
+
+(
+- OSCompareAndSwap(F, E, A)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+|
+- OSCompareAndSwapPtr(F, E, A)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+|
+- OSCompareAndSwap8(F, E, A)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+|
+- OSCompareAndSwap16(F, E, A)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+|
+- OSCompareAndSwap32(F, E, A)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+|
+- OSCompareAndSwap64(F, E, A)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+|
+- OSCompareAndSwapLong(F, E, A)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+)
+
+// vim:ft=diff:
diff --git a/tools/cocci/c11_atomic_builtin_rewrite.cocci b/tools/cocci/c11_atomic_builtin_rewrite.cocci
new file mode 100644 (file)
index 0000000..7072ed0
--- /dev/null
@@ -0,0 +1,162 @@
+// To apply, at the top of xnu.git:
+// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/c11_atomic_builtin_rewrite.cocci -dir .
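+//
+// For illustration only (hypothetical call site, not taken from xnu), the
+// rules below turn:
+//   v = __c11_atomic_load(&p->state, memory_order_acquire);
+// into:
+//   v = os_atomic_load(&p->state, acquire);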
+
+@memory_order@
+identifier m =~ "(memory_order_(relaxed|consume|acquire|release|acq_rel|seq_cst)(|_smp)|__ATOMIC_(RELAXED|CONSUME|ACQUIRE|RELEASE|ACQ_REL|SEQ_CST))";
+@@
+
+m
+
+@script:ocaml os_memory_order@
+m << memory_order.m;
+new_m;
+@@
+
+new_m := make_ident (String.lowercase_ascii (Str.global_replace (Str.regexp "memory_order_\\|__ATOMIC_\\|_smp") "" m))
+
+@fence@
+identifier memory_order.m;
+identifier os_memory_order.new_m;
+@@
+
+- __c11_atomic_thread_fence(m)
++ os_atomic_thread_fence(new_m)
+
+@load@
+expression E;
+type T;
+identifier memory_order.m;
+identifier os_memory_order.new_m;
+@@
+
+- __c11_atomic_load
++ os_atomic_load
+ (
+(
+-((T)E)
++E
+|
+-(T)E
++E
+|
+E
+)
+ ,
+-m
++new_m
+ )
+
+@inc@
+expression E;
+type T;
+identifier memory_order.m;
+identifier os_memory_order.new_m;
+@@
+
+- __c11_atomic_fetch_add
++ os_atomic_inc_orig
+ (
+(
+-((T)E)
++E
+|
+-(T)E
++E
+|
+E
+)
+ ,
+-1, m
++new_m
+ )
+
+@dec@
+expression E;
+type T;
+identifier memory_order.m;
+identifier os_memory_order.new_m;
+@@
+
+- __c11_atomic_fetch_sub
++ os_atomic_dec_orig
+ (
+(
+-((T)E)
++E
+|
+-(T)E
++E
+|
+E
+)
+ ,
+-1, m
++new_m
+ )
+
+@single_arg@
+expression E, F;
+type T;
+identifier memory_order.m;
+identifier os_memory_order.new_m;
+@@
+
+(
+- __c11_atomic_store
++ os_atomic_store
+|
+- __c11_atomic_fetch_add
++ os_atomic_add_orig
+|
+- __c11_atomic_fetch_sub
++ os_atomic_sub_orig
+|
+- __c11_atomic_fetch_and
++ os_atomic_and_orig
+|
+- __c11_atomic_fetch_or
++ os_atomic_or_orig
+|
+- __c11_atomic_fetch_xor
++ os_atomic_xor_orig
+)
+ (
+(
+-((T)E)
++E
+|
+-(T)E
++E
+|
+E
+)
+ , F,
+-m
++new_m
+ )
+
+@cmpxchg@
+expression E, F, G;
+type T;
+identifier memory_order.m;
+identifier os_memory_order.new_m;
+@@
+
+- __c11_atomic_compare_exchange_strong
++ os_atomic_cmpxchgv
+ (
+(
+-((T)E)
++E
+|
+-(T)E
++E
+|
+E
+)
+ ,
+- &F, G, m, memory_order_relaxed
++ F, G, &F, new_m
+ )
+
+// vim:ft=diff:
diff --git a/tools/cocci/hw_atomic_rewrite.cocci b/tools/cocci/hw_atomic_rewrite.cocci
new file mode 100644 (file)
index 0000000..d4e8b2f
--- /dev/null
@@ -0,0 +1,96 @@
+// To apply, at the top of xnu.git:
+// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/hw_atomic_rewrite.cocci -dir .
+//
+// coccinelle insists on adding a space for (void) casts which can be fixed with:
+// $ git grep -l '(void) os_atomic' | xargs -n1 sed -i '' -e 's/(void) os_atomic/(void)os_atomic/'
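+//
+// For illustration only (hypothetical call site, not taken from xnu), the
+// rules below turn:
+//   refs = hw_atomic_add(&obj->refcnt, 1);
+// into:
+//   refs = os_atomic_inc(&obj->refcnt, relaxed);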
+
+@@ expression E, F; @@ // hw_atomic_add -> os_atomic_{inc,dec}
+
+(
+- hw_atomic_add(E, -1) + 1
++ os_atomic_dec_orig(E, relaxed)
+|
+- hw_atomic_add(E, -1)
++ os_atomic_dec(E, relaxed)
+|
+- hw_atomic_add(E, -F) + F
++ os_atomic_sub_orig(E, F, relaxed)
+|
+- hw_atomic_add(E, -F)
++ os_atomic_sub(E, F, relaxed)
+|
+- hw_atomic_add(E, 1) - 1
++ os_atomic_inc_orig(E, relaxed)
+|
+- hw_atomic_add(E, 1)
++ os_atomic_inc(E, relaxed)
+|
+- hw_atomic_add(E, F) - F
++ os_atomic_add_orig(E, F, relaxed)
+|
+- hw_atomic_add(E, F)
++ os_atomic_add(E, F, relaxed)
+)
+
+@@ expression E, F; @@ // hw_atomic_sub -> os_atomic_{inc,dec}
+
+(
+- hw_atomic_sub(E, -1) - 1
++ os_atomic_inc_orig(E, relaxed)
+|
+- hw_atomic_sub(E, -1)
++ os_atomic_inc(E, relaxed)
+|
+- hw_atomic_sub(E, -F) - F
++ os_atomic_add_orig(E, F, relaxed)
+|
+- hw_atomic_sub(E, -F)
++ os_atomic_add(E, F, relaxed)
+|
+- hw_atomic_sub(E, 1) + 1
++ os_atomic_dec_orig(E, relaxed)
+|
+- hw_atomic_sub(E, 1)
++ os_atomic_dec(E, relaxed)
+|
+- hw_atomic_sub(E, F) + F
++ os_atomic_sub_orig(E, F, relaxed)
+|
+- hw_atomic_sub(E, F)
++ os_atomic_sub(E, F, relaxed)
+)
+
+@@ expression E, F; @@ // hw_atomic_and -> os_atomic_and
+
+(
+- hw_atomic_and(E, ~F)
++ os_atomic_andnot(E, F, relaxed)
+|
+- hw_atomic_and(E, F)
++ os_atomic_and(E, F, relaxed)
+|
+- hw_atomic_and_noret(E, ~F)
++ os_atomic_andnot(E, F, relaxed)
+|
+- hw_atomic_and_noret(E, F)
++ os_atomic_and(E, F, relaxed)
+)
+
+@@ expression E, F; @@ // hw_atomic_or -> os_atomic_or
+
+(
+- hw_atomic_or(E, F)
++ os_atomic_or(E, F, relaxed)
+|
+- hw_atomic_or_noret(E, F)
++ os_atomic_or(E, F, relaxed)
+)
+
+@@ expression E, F, A; @@ // hw_compare_and_store
+
+(
+- hw_compare_and_store(E, F, A)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+)
+
+// vim:ft=diff:
diff --git a/tools/cocci/mcache_atomic_rewrite.cocci b/tools/cocci/mcache_atomic_rewrite.cocci
new file mode 100644 (file)
index 0000000..f5f1ec9
--- /dev/null
@@ -0,0 +1,159 @@
+// To apply, at the top of xnu.git:
+// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/mcache_atomic_rewrite.cocci -dir .
+//
+// coccinelle insists on adding a space for (void) casts which can be fixed with:
+// $ git grep -l '(void) os_atomic' | xargs -n1 sed -i '' -e 's/(void) os_atomic/(void)os_atomic/'
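+//
+// For illustration only (hypothetical call site, not taken from xnu), the
+// rules below turn:
+//   atomic_add_32(&stats->drops, 1);
+// into:
+//   os_atomic_inc(&stats->drops, relaxed);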
+
+@@ expression E, F, A; @@
+
+(
+- atomic_add_16_ov(E, 1)
++ os_atomic_inc_orig(E, relaxed)
+|
+- atomic_add_16(E, 1)
++ os_atomic_inc(E, relaxed)
+|
+- atomic_add_32_ov(E, 1)
++ os_atomic_inc_orig(E, relaxed)
+|
+- atomic_add_32(E, 1)
++ os_atomic_inc(E, relaxed)
+|
+- atomic_add_64_ov(E, 1)
++ os_atomic_inc_orig(E, relaxed)
+|
+- atomic_add_64(E, 1)
++ os_atomic_inc(E, relaxed)
+|
+- atomic_add_16_ov(E, -1)
++ os_atomic_dec_orig(E, relaxed)
+|
+- atomic_add_16(E, -1)
++ os_atomic_dec(E, relaxed)
+|
+- atomic_add_32_ov(E, -1)
++ os_atomic_dec_orig(E, relaxed)
+|
+- atomic_add_32(E, -1)
++ os_atomic_dec(E, relaxed)
+|
+- atomic_add_64_ov(E, -1)
++ os_atomic_dec_orig(E, relaxed)
+|
+- atomic_add_64(E, -1)
++ os_atomic_dec(E, relaxed)
+|
+- atomic_add_16_ov(E, F)
++ os_atomic_add_orig(E, F, relaxed)
+|
+- atomic_add_16(E, F)
++ os_atomic_add(E, F, relaxed)
+|
+- atomic_add_32_ov(E, F)
++ os_atomic_add_orig(E, F, relaxed)
+|
+- atomic_add_32(E, F)
++ os_atomic_add(E, F, relaxed)
+|
+- atomic_add_64_ov(E, F)
++ os_atomic_add_orig(E, F, relaxed)
+|
+- atomic_add_64(E, F)
++ os_atomic_add(E, F, relaxed)
+|
+- atomic_test_set_32(A, E, F)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+|
+- atomic_test_set_64(A, E, F)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+|
+- atomic_test_set_ptr(A, E, F)
++ os_atomic_cmpxchg(A, E, F, acq_rel)
+|
+- atomic_set_32(E, F)
++ os_atomic_store(E, F, release)
+|
+- atomic_set_64(E, F)
++ os_atomic_store(E, F, release)
+|
+- atomic_set_ptr(E, F)
++ os_atomic_store(E, F, release)
+|
+- atomic_get_64(E, A)
++ E = os_atomic_load(A, relaxed)
+|
+- membar_sync()
++ os_atomic_thread_fence(seq_cst)
+|
+- atomic_or_8_ov(E, F)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- atomic_or_16_ov(E, F)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- atomic_or_32_ov(E, F)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- atomic_or_8(E, F)
++ os_atomic_or(E, F, relaxed)
+|
+- atomic_or_16(E, F)
++ os_atomic_or(E, F, relaxed)
+|
+- atomic_or_32(E, F)
++ os_atomic_or(E, F, relaxed)
+|
+- atomic_and_8_ov(E, F)
++ os_atomic_and_orig(E, F, relaxed)
+|
+- atomic_and_16_ov(E, F)
++ os_atomic_and_orig(E, F, relaxed)
+|
+- atomic_and_32_ov(E, F)
++ os_atomic_and_orig(E, F, relaxed)
+|
+- atomic_and_8(E, F)
++ os_atomic_and(E, F, relaxed)
+|
+- atomic_and_16(E, F)
++ os_atomic_and(E, F, relaxed)
+|
+- atomic_and_32(E, F)
++ os_atomic_and(E, F, relaxed)
+|
+- atomic_bitset_8_ov(E, F)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- atomic_bitset_16_ov(E, F)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- atomic_bitset_32_ov(E, F)
++ os_atomic_or_orig(E, F, relaxed)
+|
+- atomic_bitset_8(E, F)
++ os_atomic_or(E, F, relaxed)
+|
+- atomic_bitset_16(E, F)
++ os_atomic_or(E, F, relaxed)
+|
+- atomic_bitset_32(E, F)
++ os_atomic_or(E, F, relaxed)
+|
+- atomic_bitclear_8_ov(E, F)
++ os_atomic_andnot_orig(E, F, relaxed)
+|
+- atomic_bitclear_16_ov(E, F)
++ os_atomic_andnot_orig(E, F, relaxed)
+|
+- atomic_bitclear_32_ov(E, F)
++ os_atomic_andnot_orig(E, F, relaxed)
+|
+- atomic_bitclear_8(E, F)
++ os_atomic_andnot(E, F, relaxed)
+|
+- atomic_bitclear_16(E, F)
++ os_atomic_andnot(E, F, relaxed)
+|
+- atomic_bitclear_32(E, F)
++ os_atomic_andnot(E, F, relaxed)
+)
diff --git a/tools/cocci/os_atomic_normalize.cocci b/tools/cocci/os_atomic_normalize.cocci
new file mode 100644 (file)
index 0000000..efc1d46
--- /dev/null
@@ -0,0 +1,94 @@
+// To apply, at the top of xnu.git:
+// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/os_atomic_normalize.cocci -dir .
+//
+// coccinelle insists on adding a space for (void) casts which can be fixed with:
+// $ git grep -l '(void) os_atomic' | xargs -n1 sed -i '' -e 's/(void) os_atomic/(void)os_atomic/'
+
+@os_atomic@
+identifier fn =~ "^os_atomic";
+@@
+
+fn
+
+@script:ocaml unorig@
+fn << os_atomic.fn;
+new_fn;
+@@
+
+new_fn := make_ident (Str.global_replace (Str.regexp "_orig") "" fn)
+
+@@
+identifier os_atomic.fn;
+identifier unorig.new_fn;
+expression A, B, C;
+@@
+
+-(void)fn
++new_fn
+ (...)
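+
+// The script rule above drops a `(void)` cast and, with it, the now-pointless
+// `_orig` suffix, e.g. (hypothetical): `(void)os_atomic_inc_orig(&cnt, relaxed)`
+// becomes `os_atomic_inc(&cnt, relaxed)`.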
+
+@@ expression E, F, m; @@
+
+(
+- os_atomic_add(E, 1, m)
++ os_atomic_inc(E, m)
+|
+- os_atomic_add_orig(E, 1, m)
++ os_atomic_inc_orig(E, m)
+|
+- os_atomic_sub(E, -1, m)
++ os_atomic_inc(E, m)
+|
+- os_atomic_sub_orig(E, -1, m)
++ os_atomic_inc_orig(E, m)
+|
+- os_atomic_add(E, -1, m)
++ os_atomic_dec(E, m)
+|
+- os_atomic_add_orig(E, -1, m)
++ os_atomic_dec_orig(E, m)
+|
+- os_atomic_sub(E, 1, m)
++ os_atomic_dec(E, m)
+|
+- os_atomic_sub_orig(E, 1, m)
++ os_atomic_dec_orig(E, m)
+|
+- os_atomic_add(E, -(F), m)
++ os_atomic_sub(E, F, m)
+|
+- os_atomic_add_orig(E, -(F), m)
++ os_atomic_sub_orig(E, F, m)
+|
+- os_atomic_add(E, -F, m)
++ os_atomic_sub(E, F, m)
+|
+- os_atomic_add_orig(E, -F, m)
++ os_atomic_sub_orig(E, F, m)
+|
+- os_atomic_sub(E, -(F), m)
++ os_atomic_add(E, F, m)
+|
+- os_atomic_sub_orig(E, -(F), m)
++ os_atomic_add_orig(E, F, m)
+|
+- os_atomic_sub(E, -F, m)
++ os_atomic_add(E, F, m)
+|
+- os_atomic_sub_orig(E, -F, m)
++ os_atomic_add_orig(E, F, m)
+|
+- os_atomic_and(E, ~(F), m)
++ os_atomic_andnot(E, F, m)
+|
+- os_atomic_and_orig(E, ~(F), m)
++ os_atomic_andnot_orig(E, F, m)
+|
+- os_atomic_and(E, ~F, m)
++ os_atomic_andnot(E, F, m)
+|
+- os_atomic_and_orig(E, ~F, m)
++ os_atomic_andnot_orig(E, F, m)
+)
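+
+// e.g. (hypothetical): `os_atomic_add(&cnt, -1, acquire)` becomes
+// `os_atomic_dec(&cnt, acquire)`; the memory-order argument is preserved.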
+
+// vim:ft=diff:
index aec946f227246c6f3206cbcd2fe2e1a865907398..17d6b10ee09c282e7a05548086eeaf38ebe1af1b 100644 (file)
@@ -38,11 +38,13 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \
        core/xnu_lldb_init.py \
        plugins/__init__.py \
        plugins/zprint_perf_log.py \
+       sysregdoc/AArch64-esr_el1.xml \
        atm.py \
        bank.py \
        turnstile.py \
        kevent.py \
        workqueue.py \
+       ulock.py \
        xnu.py \
        xnudefines.py \
        ktrace.py \
@@ -75,7 +77,8 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \
        waitq.py \
        pgtrace.py \
        xnutriage.py \
-       zonetriage.py
+       zonetriage.py \
+       sysreg.py
 
 ifneq ($(PLATFORM),MacOSX)
        LLDBMACROS_PYTHON_FILES+= \
index e54ee3487c06cf26b064be0c076f990b55cddc9e..71bd84431283276d2c6ac15732895ff6b8899365 100755 (executable)
@@ -34,7 +34,7 @@ def GetBankTaskSummary(bank_task):
     """
 
     format_str = "{0: <#020x} {1: <16d} {2: <#020x} {3: <16d} {4: <16d} {5: <16d} {6: <16d} {7: <16d}"
-    out_string = format_str.format(bank_task, bank_task.bt_proc_persona.pid, bank_task.bt_creditcard, unsigned(bank_task.bt_elem.be_refs), unsigned(bank_task.bt_elem.be_made), bank_task.bt_proc_persona.persona_id, bank_task.bt_proc_persona.uid, bank_task.bt_proc_persona.gid)
+    out_string = format_str.format(bank_task, bank_task.bt_proc_persona.pid, bank_task.bt_ledger, unsigned(bank_task.bt_elem.be_refs), unsigned(bank_task.bt_elem.be_made), bank_task.bt_proc_persona.persona_id, bank_task.bt_proc_persona.uid, bank_task.bt_proc_persona.gid)
 
     #if DEVELOPMENT
     format_str = "{0: <#020x} {1: <20s}"
index 3b1c4eadd580b657a86f390d32c4fee3d03ba7a1..e58c7752fd4947cb686857d0e41d03ca402d6773 100755 (executable)
@@ -295,15 +295,29 @@ class value(object):
         return content
 
     def _GetValueAsSigned(self):
+        if self._sbval19k84obscure747_is_ptr:
+            print "ERROR: You cannot get 'int' from pointer type %s, please use unsigned(obj) for such purposes." % str(self._sbval19k84obscure747_type)
+            raise ValueError("Cannot get signed int for pointer data.")
         serr = lldb.SBError()
         retval = self._sbval19k84obscure747.GetValueAsSigned(serr)
         if serr.success:
             return retval
         raise ValueError("Failed to read signed data. "+ str(self._sbval19k84obscure747) +"(type =" + str(self._sbval19k84obscure747_type) + ") Error description: " + serr.GetCString())
-    
+
+    def _GetValueAsCast(self, dest_type):
+        if type(dest_type) is not lldb.SBType:
+            raise ValueError("Invalid type for dest_type: {}".format(type(dest_type)))
+        addr = self._GetValueAsUnsigned()
+        sbval = self._sbval19k84obscure747.target.CreateValueFromExpression("newname", "(void *)"+str(addr))
+        val = value(sbval.Cast(dest_type))
+        return val
+
     def _GetValueAsUnsigned(self):
         serr = lldb.SBError()
-        retval = self._sbval19k84obscure747.GetValueAsUnsigned(serr)
+        if self._sbval19k84obscure747_is_ptr:
+            retval = self._sbval19k84obscure747.GetValueAsAddress()
+        else:
+            retval = self._sbval19k84obscure747.GetValueAsUnsigned(serr)
         if serr.success:
             return retval
         raise ValueError("Failed to read unsigned data. "+ str(self._sbval19k84obscure747) +"(type =" + str(self._sbval19k84obscure747_type) + ") Error description: " + serr.GetCString())
@@ -311,7 +325,7 @@ class value(object):
     def _GetValueAsString(self, offset = 0, maxlen = 1024):
         serr = lldb.SBError()
         sbdata = None
-        if self._sbval19k84obscure747.TypeIsPointerType():
+        if self._sbval19k84obscure747_is_ptr:
             sbdata = self._sbval19k84obscure747.GetPointeeData(offset, maxlen)
         else:
             sbdata = self._sbval19k84obscure747.GetData()
@@ -381,7 +395,7 @@ def dereference(val):
             obj_ptr = (int *)0x1234  #C
             val = *obj_ptr           #C
     """
-    if type(val) is value and val.GetSBValue().TypeIsPointerType():
+    if type(val) is value and val._sbval19k84obscure747_is_ptr:
         return value(val.GetSBValue().Dereference())
     raise TypeError('Cannot dereference this type.')
         
@@ -410,8 +424,8 @@ def cast(obj, target_type):
     elif type(target_type) is value:
         dest_type = target_type.GetSBValue().GetType()
 
-    if type(obj) is value :
-        return value(obj.GetSBValue().Cast(dest_type))
+    if type(obj) is value:
+        return obj._GetValueAsCast(dest_type)
     elif type(obj) is int:
         print "ERROR: You cannot cast an 'int' to %s, please use kern.GetValueFromAddress() for such purposes." % str(target_type) 
     raise TypeError("object of type %s cannot be casted to %s" % (str(type(obj)), str(target_type)))
index 43a3bd864b0cb5b1dfebf60e8359b1879ba2f531..ff2376e2ef5afc37a0ba1b5db59736fc555bdfd9 100755 (executable)
@@ -223,7 +223,7 @@ def IterateRBTreeEntry(element, element_type, field_name):
                 elt = cast(elt, element_type)
 
 
-def IteratePriorityQueueEntry(root, element_type, field_name):
+def IteratePriorityQueue(root, element_type, field_name):
     """ iterate over a priority queue as defined with struct priority_queue from osfmk/kern/priority_queue.h
             root         - value : Value object for the priority queue
             element_type - str   : Type of the link element
@@ -246,6 +246,19 @@ def IteratePriorityQueueEntry(root, element_type, field_name):
             if addr: queue.append(addr)
             elt = elt.next
 
+def IterateMPSCQueue(root, element_type, field_name):
+    """ iterate over an MPSC queue as defined with struct mpsc_queue_head from osfmk/kern/mpsc_queue.h
+            root         - value : Value object for the mpsc queue
+            element_type - str   : Type of the link element
+            field_name   - str   : Name of the field in link element's structure
+        returns:
+            A generator does not return. It is used for iterating.
+            value  : an object that's of type (element_type). Always a pointer object
+    """
+    elt = root.mpqh_head.mpqc_next
+    while unsigned(elt):
+        yield containerof(elt, element_type, field_name)
+        elt = elt.mpqc_next
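+
+# Usage sketch (the element type and link-field names are hypothetical):
+#   for node in IterateMPSCQueue(head, 'struct mpsc_test_node *', 'node_link'):
+#       print node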
 
 class KernelTarget(object):
     """ A common kernel object that provides access to kernel objects and information.
@@ -327,6 +340,7 @@ class KernelTarget(object):
                 addr = int(addr, 16)
             else:
                 addr = int(addr)
+        addr = self.StripKernelPAC(addr)
         ret_array = []
         symbolicator = self._GetSymbolicator()
         syms = symbolicator.symbolicate(addr)
@@ -424,6 +438,17 @@ class KernelTarget(object):
         val = ((addr + size) & (unsigned(self.GetGlobalVariable("page_size"))-1))
         return (val < size and val > 0)
 
+    def StripUserPAC(self, addr):
+        if self.arch != 'arm64e':
+            return addr
+        T0Sz = self.GetGlobalVariable('gT0Sz')
+        return StripPAC(addr, T0Sz)
+
+    def StripKernelPAC(self, addr):
+        if self.arch != 'arm64e':
+            return addr
+        T1Sz = self.GetGlobalVariable('gT1Sz')
+        return StripPAC(addr, T1Sz)
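+
+    # Sketch: on arm64e, kernel pointers may carry PAC signature bits above
+    # the virtual-address bits; StripPAC (a helper defined elsewhere in these
+    # macros) masks them off using the T0Sz/T1Sz translation sizes so the
+    # address can be symbolicated, as the symbolication path above now does
+    # with `addr = self.StripKernelPAC(addr)`.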
 
     def PhysToKVARM64(self, addr):
         ptov_table = self.GetGlobalVariable('ptov_table')
index 2e7e21847cbb36c249bf269d3241aff402888c2b..c1fc18cc341788330055e5fad27eadd6751f0307 100755 (executable)
@@ -649,6 +649,32 @@ def IterateQueue(queue_head, element_ptr_type, element_field_name):
         yield elt
         cur_elt = elt.GetChildMemberWithName(element_field_name).GetChildMemberWithName('next')
 
+def IterateCircleQueue(queue_head, element_ptr_type, element_field_name):
+    """ iterate over a circle queue in kernel of type circle_queue_head_t. refer to osfmk/kern/circle_queue.h
+        params:
+            queue_head         - lldb.SBValue : Value object for queue_head.
+            element_ptr_type   - lldb.SBType : a pointer type of the element 'next' points to. Typically these are structs like thread, task etc.
+            element_field_name - str : name of the field in target struct.
+        returns:
+            A generator does not return. It is used for iterating.
+            SBValue  : an object that's of type (element_ptr_type), beginning with queue_head->head. Always a pointer object
+    """
+    head = queue_head.head
+    queue_head_addr = 0x0
+    if head.TypeIsPointerType():
+        queue_head_addr = head.GetValueAsUnsigned()
+    else:
+        queue_head_addr = head.GetAddress().GetLoadAddress(osplugin_target_obj)
+    cur_elt = head
+    while True:
+        if not cur_elt.IsValid() or cur_elt.GetValueAsUnsigned() == 0:
+            break
+        elt = cur_elt.Cast(element_ptr_type)
+        yield elt
+        cur_elt = elt.GetChildMemberWithName(element_field_name).GetChildMemberWithName('next')
+        if cur_elt.GetValueAsUnsigned() == queue_head_addr:
+            break
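+
+# Usage sketch (SBType and field name are hypothetical):
+#   for th in IterateCircleQueue(cq_head, thread_ptr_sbtype, 'circle_link'):
+#       print th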
+
 def GetUniqueSessionID(process_obj):
     """ Create a unique session identifier.
         params:
index bb96a17cd29e9e8db902a7d94920e59d2b26bf9f..9df6b7635a8845f4940f26ce94af3154b27320b7 100755 (executable)
@@ -4,7 +4,7 @@ import sys
 import re
 
 class ArgumentError(Exception):
-    """ Exception class for raising errors in command arguments. The lldb_command framework will catch this 
+    """ Exception class for raising errors in command arguments. The lldb_command framework will catch this
         class of exceptions and print suitable error message to user.
     """
     def __init__(self, msg):
@@ -28,77 +28,199 @@ class RedirectStdStreams(object):
         sys.stdout = self.old_stdout
         sys.stderr = self.old_stderr
 
+class IndentScope(object):
+    def __init__(self, O):
+        self._O = O
+
+    def __enter__(self):
+        self._O._indent += '    '
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._O._indent = self._O._indent[:-4]
+
+class HeaderScope(object):
+    def __init__(self, O, hdr, indent = False):
+        self._O = O
+        self._header = hdr
+        self._indent = indent
+
+    def __enter__(self):
+        self._oldHeader = self._O._header
+        self._oldLastHeader = self._O._lastHeader
+        self._O._header = self._header
+        self._O._lastHeader = None
+        if self._indent:
+            self._O._indent += '    '
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._O._header = self._oldHeader
+        self._O._lastHeader = self._oldLastHeader
+        if self._indent:
+            self._O._indent = self._O._indent[:-4]
+
+class VT(object):
+    Black        = "\033[38;5;0m"
+    DarkRed      = "\033[38;5;1m"
+    DarkGreen    = "\033[38;5;2m"
+    Brown        = "\033[38;5;3m"
+    DarkBlue     = "\033[38;5;4m"
+    DarkMagenta  = "\033[38;5;5m"
+    DarkCyan     = "\033[38;5;6m"
+    Grey         = "\033[38;5;7m"
+
+    DarkGrey     = "\033[38;5;8m"
+    Red          = "\033[38;5;9m"
+    Green        = "\033[38;5;10m"
+    Yellow       = "\033[38;5;11m"
+    Blue         = "\033[38;5;12m"
+    Magenta      = "\033[38;5;13m"
+    Cyan         = "\033[38;5;14m"
+    White        = "\033[38;5;15m"
+
+    Default      = "\033[39m"
+
+    Bold         = "\033[1m"
+    EndBold      = "\033[22m"
+
+    Oblique      = "\033[3m"
+    EndOblique   = "\033[23m"
+
+    Underline    = "\033[4m"
+    EndUnderline = "\033[24m"
+
+    Reset        = "\033[0m"
+
+class NOVT(object):
+    def __getattribute__(self, *args):
+        return ""
+
 class CommandOutput(object):
     """
-    An output handler for all commands. Use Output.print to direct all output of macro via the handler. 
+    An output handler for all commands. Use Output.print to direct all output of macro via the handler.
     These arguments are passed after a "--". eg
     (lldb) zprint -- -o /tmp/zprint.out.txt
-    
-    Currently this provide capabilities 
+
+    Currently this provides the following capabilities:
+    -h show help
     -o path/to/filename
-       The output of this command execution will be saved to file. Parser information or errors will 
+       The output of this command execution will be saved to file. Parser information or errors will
        not be sent to file though. eg /tmp/output.txt
     -s filter_string
-       the "filter_string" param is parsed to python regex expression and each line of output 
-       will be printed/saved only if it matches the expression. 
+       the "filter_string" param is parsed to python regex expression and each line of output
+       will be printed/saved only if it matches the expression.
        The command header will not be filtered in any case.
+    -p <plugin_name>
+       Send the output of the command to the named plugin.
+    -v ...
+       Increase verbosity (can be repeated).
+    -c <always|never|auto>
+       Configure color output (auto follows whether output is a tty).
     """
-    def __init__(self, cmd_name, CommandResult):
+    def __init__(self, cmd_name, CommandResult=None, fhandle=None):
         """ Create a new instance to handle command output.
         params:
-                CommandResult : SBCommandReturnObject result param from lldb's command invocation. 
+                CommandResult : SBCommandReturnObject result param from lldb's command invocation.
         """
         self.fname=None
-        self.fhandle=None
+        self.fhandle=fhandle
         self.FILTER=False
         self.pluginRequired = False
         self.pluginName = None
         self.cmd_name = cmd_name
         self.resultObj = CommandResult
-        self.immediateOutput = False
         self.verbose_level = 0
         self.target_cmd_args = []
         self.target_cmd_options = {}
+        self.color = None
+        self.isatty = os.isatty(sys.__stdout__.fileno())
+        self._indent = ''
+        self._buffer = ''
 
-    def write(self, s):
-        """ Handler for all commands output. By default just print to stdout """
-        if self.FILTER and not self.reg.search(s):
-            return
-        if self.FILTER:
-            s += "\n"
+        self._header = None
+        self._lastHeader = None
+        self._line = 0
 
+    def _write(self, s):
         if self.fhandle != None:
-            self.fhandle.write(s)
+            self.fhandle.write(self._indent + s + "\n")
         else:
-            if self.immediateOutput:
-                sys.__stdout__.write(s)
-            else:
-                res_str = s
-                if s.endswith("\n"):
-                    res_str = s[:-1]
-                if self.resultObj and len(res_str) > 0: self.resultObj.AppendMessage(res_str)
+            self.resultObj.AppendMessage(self._indent + s)
+        self._line += 1
+
+    def _doColor(self):
+        if self.color is True:
+            return True
+        return self.color is None and self.isatty
+
+    def _needsHeader(self):
+        if self._header is None:
+            return False
+        if self._lastHeader is None:
+            return True
+        if not self.isatty:
+            return False
+        return self._line - self._lastHeader > 40
+
+    def indent(self):
+        return IndentScope(self)
+
+    def table(self, header, indent = False):
+        return HeaderScope(self, header, indent)
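+
+    # Usage sketch for "fancy" commands (the pattern the macros below adopt,
+    # e.g. ShowIPCSummary): a table header is emitted once, re-emitted
+    # periodically on a tty, and indentation nests:
+    #   with O.table(GetTaskIPCSummary.header):
+    #       with O.indent():
+    #           print summary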
+
+    def format(self, s, *args, **kwargs):
+        if self._doColor():
+            kwargs['VT'] = VT
+        else:
+            kwargs['VT'] = NOVT()
+
+        return s.format(*args, **kwargs)
+
+    def error(self, s, *args, **kwargs):
+        print self.format("{cmd.cmd_name}: {VT.Red}"+s+"{VT.Default}", cmd=self, *args, **kwargs)
+
+    def write(self, s):
+        """ Handler for all commands output. By default just print to stdout """
+
+        s = self._buffer + s
+
+        while s.find('\n') != -1:
+            l, s = s.split("\n", 1)
+            if self.FILTER:
+                if not self.reg.search(l):
+                    continue
+                if self._doColor():
+                    l = self.reg.sub(VT.Underline + r"\g<0>" + VT.EndUnderline, l)
+
+            if len(l) and self._needsHeader():
+                for hdr in self._header.split("\n"):
+                    self._write(self.format("{VT.Bold}{:s}{VT.EndBold}", hdr))
+                self._lastHeader = self._line
+
+            self._write(l)
+
+        self._buffer = s
 
     def flush(self):
         if self.fhandle != None:
             self.fhandle.flush()
-        
+
     def __del__(self):
         """ closes any open files. report on any errors """
-        if self.fhandle != None :
+        if self.fhandle != None and self.fname != None:
             self.fhandle.close()
-    
+
     def setOptions(self, cmdargs, cmdoptions =''):
-        """ parse the arguments passed to the command 
-            param : 
+        """ parse the arguments passed to the command
+            param :
                 cmdargs => [] of <str> (typically args.split())
-                cmdoptions : str - string of command level options. 
+                cmdoptions : str - string of command level options.
                              These should be CAPITAL LETTER options only.
         """
         opts=()
         args = cmdargs
         cmdoptions = cmdoptions.upper()
         try:
-            opts,args = getopt.gnu_getopt(args,'hvo:s:p:'+ cmdoptions,[])
+            opts,args = getopt.gnu_getopt(args,'hvo:s:p:c:'+ cmdoptions,[])
             self.target_cmd_args = args
         except getopt.GetoptError,err:
             raise ArgumentError(str(err))
@@ -113,6 +235,7 @@ class CommandOutput(object):
                 self.fhandle=open(self.fname,"w")
                 print "saving results in file ",str(a)
                 self.fhandle.write("(lldb)%s %s \n" % (self.cmd_name, " ".join(cmdargs)))
+                self.isatty = os.isatty(self.fhandle.fileno())
             elif o == "-s" and len(a) > 0:
                 self.reg = re.compile(a.strip(),re.MULTILINE|re.DOTALL)
                 self.FILTER=True
@@ -121,12 +244,17 @@ class CommandOutput(object):
                 self.pluginRequired = True
                 self.pluginName = a.strip()
                 #print "passing output to " + a.strip()
-            elif o == "-v" :
+            elif o == "-v":
                 self.verbose_level += 1
+            elif o == "-c":
+                if a in ["always", '1']:
+                    self.color = True
+                elif a in ["never", '0']:
+                    self.color = False
+                else:
+                    self.color = None
             else:
                 o = o.strip()
                 self.target_cmd_options[o] = a
 
-            
-        
 
index e7f494b96e40dd1d4aa3776ad5ef5dac6c15d314..c0f1a8002649dbd9d590cf8388ca712e0c13707a 100755 (executable)
@@ -1,10 +1,15 @@
+from __future__ import absolute_import
+from __future__ import print_function
 import os
+import sys
 import re
 
+PY3 = sys.version_info > (3,)
+
 def GetSettingsValues(debugger, setting_variable_name):
     """ Queries the lldb internal settings
         params:
-            debugger : lldb.SBDebugger instance 
+            debugger : lldb.SBDebugger instance
             setting_variable_name: str - string name of the setting(eg prompt)
         returns:
             [] : Array of strings. Empty array if setting is not found/set
@@ -66,9 +71,16 @@ def __lldb_init_module(debugger, internal_dict):
     if "DEBUG_XNU_LLDBMACROS" in os.environ and len(os.environ['DEBUG_XNU_LLDBMACROS']) > 0:
         debug_session_enabled = True
     prev_os_plugin = "".join(GetSettingsValues(debugger, 'target.process.python-os-plugin-path'))
-    print "Loading kernel debugging from %s" % __file__
-    print "LLDB version %s" % debugger.GetVersionString()
-    self_path = str(__file__)
+    if PY3:
+        print("#" * 30)
+        print("WARNING! Python version 3 is not supported for xnu lldbmacros.")
+        print("Please restart your debugging session with the following workaround")
+        print("\ndefaults write com.apple.dt.lldb DefaultPythonVersion 2\n")
+        print("#" * 30)
+        print("\n")
+    print("Loading kernel debugging from %s" % __file__)
+    print("LLDB version %s" % debugger.GetVersionString())
+    self_path = "{}".format(__file__)
     base_dir_name = self_path[:self_path.rfind("/")]
     core_os_plugin = base_dir_name + "/lldbmacros/core/operating_system.py"
     osplugin_cmd = "settings set target.process.python-os-plugin-path \"%s\"" % core_os_plugin
@@ -86,22 +98,22 @@ def __lldb_init_module(debugger, internal_dict):
         pass
     if debug_session_enabled :
         if len(prev_os_plugin) > 0:
-            print "\nDEBUG_XNU_LLDBMACROS is set. Skipping the setting of OS plugin from dSYM.\nYou can manually set the OS plugin by running\n" + osplugin_cmd
+            print("\nDEBUG_XNU_LLDBMACROS is set. Skipping the setting of OS plugin from dSYM.\nYou can manually set the OS plugin by running\n" + osplugin_cmd)
         else:
-            print osplugin_cmd
+            print(osplugin_cmd)
             debugger.HandleCommand(osplugin_cmd)
-        print "\nDEBUG_XNU_LLDBMACROS is set. Skipping the load of xnu debug framework.\nYou can manually load the framework by running\n" + xnu_load_cmd
+        print("\nDEBUG_XNU_LLDBMACROS is set. Skipping the load of xnu debug framework.\nYou can manually load the framework by running\n" + xnu_load_cmd)
     else:
-        print osplugin_cmd
+        print(osplugin_cmd)
         debugger.HandleCommand(osplugin_cmd)
-        print whitelist_trap_cmd
+        print(whitelist_trap_cmd)
         debugger.HandleCommand(whitelist_trap_cmd)
-        print xnu_load_cmd
+        print(xnu_load_cmd)
         debugger.HandleCommand(xnu_load_cmd)
-        print disable_optimization_warnings_cmd
+        print(disable_optimization_warnings_cmd)
         debugger.HandleCommand(disable_optimization_warnings_cmd)
         if source_map_cmd:
-            print source_map_cmd
+            print(source_map_cmd)
             debugger.HandleCommand(source_map_cmd)
 
         load_kexts = True
@@ -111,15 +123,15 @@ def __lldb_init_module(debugger, internal_dict):
         if os.access(builtinkexts_path, os.F_OK):
             kexts = os.listdir(builtinkexts_path)
             if len(kexts) > 0:
-                print "\nBuiltin kexts: %s\n" % kexts
+                print("\nBuiltin kexts: %s\n" % kexts)
                 if load_kexts == False:
-                    print "XNU_LLDBMACROS_NOBUILTINKEXTS is set, not loading:\n"
+                    print("XNU_LLDBMACROS_NOBUILTINKEXTS is set, not loading:\n")
                 for kextdir in kexts:
                     script = os.path.join(builtinkexts_path, kextdir, kextdir.split('.')[-1] + ".py")
                     import_kext_cmd = "command script import \"%s\"" % script
-                    print "%s" % import_kext_cmd
+                    print("%s" % import_kext_cmd)
                     if load_kexts:
                         debugger.HandleCommand(import_kext_cmd)
 
-    print "\n"
+    print("\n")
 
index e2bdaf20ebc29eca34a2cddeeae3edfd4abd8ad9..1e55dc2198db5170479186f776fe9be146584fdb 100755 (executable)
@@ -34,6 +34,7 @@ def GetObjectSummary(obj):
         return
 
     vt = dereference(Cast(obj, 'uintptr_t *')) - 2 * sizeof('uintptr_t')
+    vt = kern.StripKernelPAC(vt)
     vtype = kern.SymbolicateFromAddress(vt)
     if len(vtype):
         vtype_str = " <" + vtype[0].GetName() + ">"
@@ -91,6 +92,7 @@ def GetObjectTypeStr(obj):
         return None
 
     vt = dereference(Cast(obj, 'uintptr_t *')) - 2 * sizeof('uintptr_t')
+    vt = kern.StripKernelPAC(vt)
     vtype = kern.SymbolicateFromAddress(vt)
     if len(vtype):
         return vtype[0].GetName()
@@ -128,6 +130,7 @@ def GetRegistryEntrySummary(entry):
     
     # I'm using uintptr_t for now to work around <rdar://problem/12749733> FindFirstType & Co. should allow you to make pointer types directly
     vtableAddr = dereference(Cast(entry, 'uintptr_t *')) - 2 * sizeof('uintptr_t *')
+    vtableAddr = kern.StripKernelPAC(vtableAddr)
     vtype = kern.SymbolicateFromAddress(vtableAddr)
     if vtype is None or len(vtype) < 1:
         out_string += "<object 0x{0: <16x}, id 0x{1:x}, vtable 0x{2: <16x}".format(entry, CastIOKitClass(entry, 'IORegistryEntry *').reserved.fRegistryEntryID, vtableAddr)
index 81090bbd8c2615e610c7b9f353577719a59916fe..6ac2b4eccbb8887fe95de2cd9614e65ca7f681aa 100755 (executable)
@@ -478,23 +478,22 @@ def ShowAllIPC(cmd_args=None):
 
 # EndMacro: showallipc
 
-@lldb_command('showipcsummary')
-def ShowIPCSummary(cmd_args=None):
+@lldb_command('showipcsummary', fancy=True)
+def ShowIPCSummary(cmd_args=None, cmd_options={}, O=None):
     """ Summarizes the IPC state of all tasks. 
         This is a convenient way to dump some basic clues about IPC messaging. You can use the output to determine
         tasks that are candidates for further investigation.
     """
-    print GetTaskIPCSummary.header
-    ipc_table_size = 0
-    for t in kern.tasks:
-        (summary, table_size) = GetTaskIPCSummary(t)
-        ipc_table_size += table_size
-        print summary
-    for t in kern.terminated_tasks:
-        (summary, table_size) = GetTaskIPCSummary(t)
-        ipc_table_size += table_size
-    print "Total Table size: {:d}".format(ipc_table_size)
-    return
+    with O.table(GetTaskIPCSummary.header):
+        ipc_table_size = 0
+        for t in kern.tasks:
+            (summary, table_size) = GetTaskIPCSummary(t)
+            ipc_table_size += table_size
+            print summary
+        for t in kern.terminated_tasks:
+            (summary, table_size) = GetTaskIPCSummary(t)
+            ipc_table_size += table_size
+        print "Total Table size: {:d}".format(ipc_table_size)
 
 def GetKObjectFromPort(portval):
     """ Get Kobject description from the port.
@@ -503,7 +502,7 @@ def GetKObjectFromPort(portval):
     """
     kobject_str = "{0: <#020x}".format(portval.kdata.kobject)
     io_bits = unsigned(portval.ip_object.io_bits)
-    objtype_index = io_bits & 0xfff
+    objtype_index = io_bits & 0x7ff
     if objtype_index < len(xnudefines.kobject_types) :
         objtype_str = xnudefines.kobject_types[objtype_index]
         if objtype_str == 'IOKIT_OBJ':
@@ -561,9 +560,8 @@ def GetPortDestinationSummary(port):
     format_string = "{0: <20s} {1: <20s}"
     destname_str = ''
     destination_str = ''
-    ipc_space_kernel = unsigned(kern.globals.ipc_space_kernel)
     target_spaceval = port.data.receiver
-    if unsigned(target_spaceval) == ipc_space_kernel :
+    if int(port.ip_object.io_bits) & 0x800 :
         destname_str = GetKObjectFromPort(port)
     else:
         if int(port.ip_object.io_bits) & 0x80000000 :
@@ -592,6 +590,9 @@ def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0):
             'S'     : Send right
             'R'     : Receive right
             'O'     : Send-once right
+            'm'     : Immovable send port
+            'i'     : Immovable receive port
+            'g'     : No grant port
         types of notifications:
             'd'     : Dead-Name notification requested
             's'     : Send-Possible notification armed
@@ -649,6 +650,12 @@ def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0):
         if portval.ip_nsrequest != 0: right_str += 'n'
         # port-destroy notification requested
         if portval.ip_pdrequest != 0: right_str += 'x'
+        # Immovable receive rights
+        if portval.ip_immovable_receive != 0: right_str += 'i'
+        # Immovable send rights
+        if portval.ip_immovable_send != 0: right_str += 'm'
+        # No-grant Port
+        if portval.ip_no_grant != 0: right_str += 'g'
 
         # early-out if the rights-filter doesn't match
         if rights_filter != 0 and rights_filter != right_str:
@@ -662,7 +669,7 @@ def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0):
         # 1 0     32
         # 1 1     16
         ie_gen_roll = { 0:'.64', 1:'.48', 2:'.32', 3:'.16' }
-        ipc_name = '{:s}{:s}'.format(strip(ipc_name), ie_gen_roll[(ie_bits & 0x00c00000) >> 22])
+        ipc_name = '{:s}{:s}'.format(ipc_name.strip(), ie_gen_roll[(ie_bits & 0x00c00000) >> 22])
 
         # now show the port destination part
         destname_str = GetPortDestinationSummary(Cast(ie_object, 'ipc_port_t'))
@@ -786,6 +793,9 @@ def ShowTaskRights(cmd_args=None, cmd_options={}):
                    'S'     : Send right
                    'R'     : Receive right
                    'O'     : Send-once right
+                   'm'     : Immovable send port
+                   'i'     : Immovable receive port
+                   'g'     : No grant port
                types of notifications:
                    'd'     : Dead-Name notification requested
                    's'     : Send-Possible notification armed
@@ -824,6 +834,9 @@ def ShowTaskRightsBt(cmd_args=None, cmd_options={}):
                    'S'     : Send right
                    'R'     : Receive right
                    'O'     : Send-once right
+                   'm'     : Immovable send port
+                   'i'     : Immovable receive port
+                   'g'     : No grant port
                types of notifications:
                    'd'     : Dead-Name notification requested
                    's'     : Send-Possible notification armed
@@ -864,6 +877,9 @@ def ShowAllRights(cmd_args=None, cmd_options={}):
                     'S'     : Send right
                     'R'     : Receive right
                     'O'     : Send-once right
+                    'm'     : Immovable send port
+                    'i'     : Immovable receive port
+                    'g'     : No grant port
                 types of notifications:
                     'd'     : Dead-Name notification requested
                     's'     : Send-Possible notification armed
index 66fc2e8c4d647e2b80affc36cb09ed29d370affd..5db5554e578e4353b5e0b8cb91117f999c80b4d7 100755 (executable)
@@ -65,6 +65,7 @@ kcdata_type_def = {
     'STACKSHOT_KCCONTAINER_THREAD':     0x904,
     'STACKSHOT_KCTYPE_DONATING_PIDS':   0x907,
     'STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO': 0x908,
+    'STACKSHOT_KCTYPE_THREAD_NAME':     0x909,
     'STACKSHOT_KCTYPE_KERN_STACKFRAME': 0x90A,
     'STACKSHOT_KCTYPE_KERN_STACKFRAME64': 0x90B,
     'STACKSHOT_KCTYPE_USER_STACKFRAME': 0x90C,
@@ -95,6 +96,8 @@ kcdata_type_def = {
     'STACKSHOT_KCTYPE_ASID' : 0x925,
     'STACKSHOT_KCTYPE_PAGE_TABLES' : 0x926,
     'STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT' : 0x927,
+    'STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL' : 0x928,
+    'STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO' : 0x929,
 
     'STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT': 0x940,
     'STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT': 0x941,
@@ -821,6 +824,9 @@ KNOWN_TYPES_COLLECTION[0x906] = KCTypeDescription(0x906, (
     legacy_size = 0x68
 )
 
+KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL')] = KCSubTypeElement('dispatch_queue_label', KCSUBTYPE_TYPE.KC_ST_CHAR,
+                          KCSubTypeElement.GetSizeForArray(64, 1), 0, 1)
+
 KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT'), (
     KCSubTypeElement.FromBasicCtype('tds_thread_id', KCSUBTYPE_TYPE.KC_ST_UINT64, 0),
     KCSubTypeElement.FromBasicCtype('tds_voucher_identifier', KCSUBTYPE_TYPE.KC_ST_UINT64, 8),
@@ -860,7 +866,7 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT')] =
 )
 
 
-KNOWN_TYPES_COLLECTION[0x909] = KCSubTypeElement('pth_name', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(64, 1), 0, 1)
+KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_NAME')] = KCSubTypeElement('pth_name', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(64, 1), 0, 1)
 
 KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT'), (
     KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0),
@@ -997,6 +1003,16 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_WAITINFO')] = KCT
             ),
             'thread_waitinfo')
 
+KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO'),
+            (
+                        KCSubTypeElement.FromBasicCtype('waiter', KCSUBTYPE_TYPE.KC_ST_UINT64, 0),
+                        KCSubTypeElement.FromBasicCtype('turnstile_context', KCSUBTYPE_TYPE.KC_ST_UINT64, 8),
+                        KCSubTypeElement.FromBasicCtype('turnstile_priority', KCSUBTYPE_TYPE.KC_ST_UINT8, 16),
+                        KCSubTypeElement.FromBasicCtype('number_of_hops', KCSUBTYPE_TYPE.KC_ST_UINT8, 17),
+                        KCSubTypeElement.FromBasicCtype('turnstile_flags', KCSUBTYPE_TYPE.KC_ST_UINT64, 18),
+            ),
+            'thread_turnstileinfo')
+
 KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_THREAD_GROUP'),
             (
                         KCSubTypeElement.FromBasicCtype('tgs_id', KCSUBTYPE_TYPE.KC_ST_UINT64, 0),
@@ -1187,7 +1203,7 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_PAGE_TABLES')] = KCTypeD
 )
 
 def GetSecondsFromMATime(mat, tb):
-    return (float(mat) * tb['numer']) / tb['denom']
+    return (float(long(mat) * tb['numer']) / tb['denom']) / 1e9
 
 def FindLibraryForAddress(liblist, address):
     current_lib = None
@@ -1283,6 +1299,11 @@ STACKSHOT_WAITOWNER_MTXSPIN     = (UINT64_MAX - 5)
 STACKSHOT_WAITOWNER_THREQUESTED = (UINT64_MAX - 6)
 STACKSHOT_WAITOWNER_SUSPENDED   = (UINT64_MAX - 7)
 
+STACKSHOT_TURNSTILE_STATUS_UNKNOWN      = 0x01
+STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ = 0x02
+STACKSHOT_TURNSTILE_STATUS_WORKQUEUE    = 0x04
+STACKSHOT_TURNSTILE_STATUS_THREAD       = 0x08
+
 def formatWaitInfo(info):
     s = 'thread %d: ' % info['waiter'];
     type = info['wait_type']
@@ -1370,13 +1391,50 @@ def formatWaitInfo(info):
             s += ", unknown owner"
         s += ", workloop id %x" % context
     elif type == kThreadWaitOnProcess:
-        s += "waitpid, for pid %d" % owner
+        if owner == 2**64-1:
+            s += "waitpid, for any children"
+        elif 2**32 <= owner and owner < 2**64-1:
+            s += "waitpid, for process group %d" % abs(owner - 2**64)
+        else:
+            s += "waitpid, for pid %d" % owner
 
     else:
         s += "unknown type %d (owner %d, context %x)" % (type, owner, context)
 
     return s
+
+def formatTurnstileInfo(ti):
+    if ti is None:
+        return " [no turnstile]"
+
+    ts_flags = int(ti['turnstile_flags'])
+    ctx = int(ti['turnstile_context'])
+    hop = int(ti['number_of_hops'])
+    prio = int(ti['turnstile_priority'])
+    if ts_flags & STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ:
+        return " [turnstile was in process of being updated]"
+    if ts_flags & STACKSHOT_TURNSTILE_STATUS_WORKQUEUE:
+        return " [blocked on workqueue: 0x%x, hops: %x, priority: %d]" % (ctx, hop, prio)
+    if ts_flags & STACKSHOT_TURNSTILE_STATUS_THREAD:
+        return " [blocked on: %d, hops: %x, priority: %d]" % (ctx, hop, prio)
+    if ts_flags & STACKSHOT_TURNSTILE_STATUS_UNKNOWN:
+        return " [turnstile with unknown inheritor]"
+
+    return " [unknown turnstile status!]"
         
+def formatWaitInfoWithTurnstiles(waitinfos, tsinfos):
+    wis_tis = []
+    for w in waitinfos:
+        found_pair = False
+        for t in tsinfos:
+            if int(w['waiter']) == int(t['waiter']):
+                wis_tis.append((w, t))
+                found_pair = True
+                break
+        if not found_pair:
+            wis_tis.append((w, None))
+
+    return map(lambda (wi, ti): formatWaitInfo(wi) + formatTurnstileInfo(ti), wis_tis)
 
 def SaveStackshotReport(j, outfile_name, incomplete):
     import time
@@ -1514,6 +1572,9 @@ def SaveStackshotReport(j, outfile_name, incomplete):
             thsnap["qosEffective"] = threadsnap["ths_eqos"]
             thsnap["qosRequested"] = threadsnap["ths_rqos"]
 
+            if "pth_name" in thdata:
+                thsnap["name"] = thdata["pth_name"];
+
             if threadsnap['ths_continuation']:
                 thsnap["continuation"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, threadsnap['ths_continuation'])
             if "kernel_stack_frames" in thdata:
@@ -1535,7 +1596,9 @@ def SaveStackshotReport(j, outfile_name, incomplete):
             if threadsnap['ths_wait_event']:
                 thsnap["waitEvent"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, threadsnap['ths_wait_event'])
 
-        if 'thread_waitinfo' in piddata:
+        if 'thread_waitinfo' in piddata and 'thread_turnstileinfo' in piddata:
+            tsnap['waitInfo'] = formatWaitInfoWithTurnstiles(piddata['thread_waitinfo'] , piddata['thread_turnstileinfo'])
+        elif 'thread_waitinfo' in piddata:
             tsnap['waitInfo'] = map(formatWaitInfo, piddata['thread_waitinfo'])
 
     obj['binaryImages'] = AllImageCatalog
@@ -1615,6 +1678,9 @@ def iterate_kcdatas(kcdata_file):
     with data_from_stream(kcdata_file) as data:
         iterator = kcdata_item_iterator(data)
         kcdata_buffer = KCObject.FromKCItem(iterator.next())
+        if not isinstance(kcdata_buffer, KCBufferObject):
+            iterator = kcdata_item_iterator(data[16:])
+            kcdata_buffer = KCObject.FromKCItem(iterator.next())
         if not isinstance(kcdata_buffer, KCBufferObject):
             try:
                 decoded = base64.b64decode(data)
@@ -1641,6 +1707,8 @@ def iterate_kcdatas(kcdata_file):
 
         for magic in iterator:
             kcdata_buffer = KCObject.FromKCItem(magic)
+            if kcdata_buffer.i_type == 0:
+                continue
             if not isinstance(kcdata_buffer, KCBufferObject):
                 raise Exception, "unknown file type"
             kcdata_buffer.ReadItems(iterator)
index 1fb875628748e23d261d26061f461639b3539c27..4c4e7d7e82f785dc5a3dbcd5606ee62e7225948f 100755 (executable)
@@ -1,4 +1,5 @@
 from xnu import *
+from workqueue import GetWorkqueueThreadRequestSummary
 
 def IterateProcKqueues(proc):
     """ Iterate through all kqueues in the given process
@@ -57,7 +58,7 @@ def IterateProcKqworkloops(proc):
 
     hash_mask = proc_filedesc.fd_kqhashmask
     for i in xrange(hash_mask + 1):
-        for kqwl in IterateListEntry(proc_filedesc.fd_kqhash[i], 'struct kqworkloop *', 'kqwl_hashlink', list_prefix='s'):
+        for kqwl in IterateListEntry(proc_filedesc.fd_kqhash[i], 'struct kqworkloop *', 'kqwl_hashlink'):
             yield kqwl
 
 def IterateAllKqueues():
@@ -67,9 +68,10 @@ def IterateAllKqueues():
             kq - yields each kqueue in the system
     """
     for t in kern.tasks:
-        if unsigned(t.bsd_info) == 0:
+        proc = unsigned(t.bsd_info)
+        if proc == 0:
             continue
-        proc = kern.GetValueFromAddress(t.bsd_info, 'proc_t')
+        proc = kern.GetValueFromAddress(proc, 'proc_t')
         for kq in IterateProcKqueues(proc):
             yield kq
 
@@ -102,36 +104,41 @@ def GetKnoteKqueue(kn):
     return kern.GetValueFromAddress(int(kn.kn_kq_packed), 'struct kqueue *')
 
 @lldb_type_summary(['knote *'])
-@header('{:<20s} {:<20s} {:<10s} {:<20s} {:<20s} {:<30s} {:<10} {:<10} {:<10} {:<30s}'.format('knote', 'ident', 'kev_flags', 'kqueue', 'udata', 'filtops', 'qos_use', 'qos_req', 'qos_ovr', 'status'))
+@header('{:<20s} {:<20s} {:<10s} {:<20s} {:<20s} {:<30s} {:<10} {:<10} {:<10} {:<20s}'.format('knote', 'ident', 'kev_flags', 'kqueue', 'udata', 'filtops', 'qos_req', 'qos_use', 'qos_ovr', 'status'))
 def GetKnoteSummary(kn):
     """ Summarizes a knote and related information
 
         returns: str - summary of knote
     """
-    format_string = '{o: <#020x} {o.kn_kevent.ident: <#020x} {o.kn_kevent.flags: <#010x} {kq_ptr: <#020x} {o.kn_kevent.udata: <#020x} {ops_str: <30s} {qos_use: <10s} {qos_req: <10s} {qos_ovr: <10s} {st_str: <30s}'
+    format_string = '{o: <#020x} {o.kn_kevent.kei_ident: <#020x} {o.kn_kevent.kei_flags: <#010x} {kq_ptr: <#020x} {o.kn_kevent.kei_udata: <#020x} {ops_str: <30s} {qos_req: <10s} {qos_use: <10s} {qos_ovr: <10s} {st_str: <20s}'
     state = unsigned(kn.kn_status)
-    fops_str = kern.Symbolicate(kern.globals.sysfilt_ops[unsigned(kn.kn_filtid)])
+    fops_str = kern.Symbolicate(kern.globals.sysfilt_ops[unsigned(kn.kn_kevent.kei_filtid)])
+    qos_index = int(kn.kn_qos_index)
+    if qos_index > 6:
+        qos_req = qos_index
+    else:
+        qos_req = int((kn.kn_kevent.kei_qos & 0x003fff00) >> 8).bit_length()
     return format_string.format(
             o=kn,
-            qos_use=xnudefines.thread_qos_short_strings[int(kn.kn_qos_index)],
-            qos_req=xnudefines.thread_qos_short_strings[int(kn.kn_req_index)],
+            qos_req=xnudefines.thread_qos_short_strings[qos_req],
+            qos_use=xnudefines.thread_qos_short_strings[qos_index],
             qos_ovr=xnudefines.thread_qos_short_strings[int(kn.kn_qos_override)],
             st_str=xnudefines.GetStateString(xnudefines.kn_state_strings, state),
             kq_ptr=int(GetKnoteKqueue(kn)),
             ops_str=fops_str)
 
-@lldb_command('showknote')
-def ShowKnote(cmd_args=None):
+@lldb_command('showknote', fancy=True)
+def ShowKnote(cmd_args=None, cmd_options={}, O=None):
     """ Show information about a knote
 
         usage: showknote <struct knote *>
     """
     if not cmd_args:
-        raise ArgumentError('missing struct knote * argument')
+        return O.error('missing struct knote * argument')
 
     kn = kern.GetValueFromAddress(cmd_args[0], 'struct knote *')
-    print GetKnoteSummary.header
-    print GetKnoteSummary(kn)
+    with O.table(GetKnoteSummary.header):
+        print GetKnoteSummary(kn)
 
 def IterateKqueueKnotes(kq):
     """ Iterate through all knotes of a given kqueue
@@ -147,42 +154,15 @@ def IterateKqueueKnotes(kq):
             continue
         yield kn
 
-@lldb_type_summary(['struct kqrequest *'])
-@header('{:<20s} {:<20s} {:<5s} {:<5s} {:<5s} {:s}'.format('kqrequest', 'thread', 'qos', 'ovr_qos', 'sa_qos', 'state'))
-def GetKqrequestSummary(kqr):
-    """ Summarize kqrequest information
-
-        params:
-            kqr - the kqrequest object
-        returns: str - summary of kqrequest
-    """
-    fmt = '{kqrp: <#020x} {kqr.kqr_thread: <#020x} {qos: <5s} {ovr_qos: <5s} {sa_qos: <5s} {state_str:<s}'
-    return fmt.format(kqrp=int(kqr),
-            kqr=kqr,
-            qos=xnudefines.thread_qos_short_strings[int(kqr.kqr_qos_index)],
-            ovr_qos=xnudefines.thread_qos_short_strings[int(kqr.kqr_override_index)],
-            sa_qos=xnudefines.thread_qos_short_strings[int(kqr.kqr_stayactive_qos)],
-            state_str=xnudefines.GetStateString(xnudefines.kqrequest_state_strings, kqr.kqr_state))
-
-@lldb_command('showkqrequest')
-def ShowKqrequest(cmd_args=None):
-    """ Display information about a kqrequest object.
-
-        usage: showkqrequest <struct kqrequest *>
-    """
-    if len(cmd_args) < 1:
-        raise ArgumentError('missing struct kqrequest * argument')
-    kqr = kern.GetValueFromAddress(cmd_args[0], 'struct kqrequest *')
-    print GetKqrequestSummary.header
-    print GetKqrequestSummary(kqr)
-    print GetKnoteSummary.header
-    for kn in IterateTAILQ_HEAD(kqr.kqr_suppressed, 'kn_tqe'):
-        print GetKnoteSummary(kn)
+kqueue_summary_fmt = '{ptr: <#020x} {o.kq_p: <#020x} {dyn_id: <#020x} {servicer: <#20x} {owner: <#20x} {o.kq_count: <6d} {wqs: <#020x} {st_str: <10s}'
 
-kqueue_summary_fmt = '{ptr: <#020x} {o.kq_p: <#020x} {dyn_id: <#020x} {servicer: <#20x} {owner: <#20x} {o.kq_count: <6d} {wqs: <#020x} {kqr_state: <30s} {st_str: <10s}'
+def GetServicer(req):
+    if req.tr_state in [3, 4]: # [ BINDING , BOUND ]
+        return int(req.tr_thread)
+    return 0
 
 @lldb_type_summary(['struct kqueue *'])
-@header('{: <20s} {: <20s} {: <20s} {: <20s} {: <20s} {: <6s} {: <20s} {: <30s} {: <10s}'.format('kqueue', 'process', 'dynamic_id', 'servicer', 'owner', '#evts', 'wqs', 'request', 'state'))
+@header('{: <20s} {: <20s} {: <20s} {: <20s} {: <20s} {: <6s} {: <20s} {: <10s}'.format('kqueue', 'process', 'dynamic_id', 'servicer', 'owner', '#evts', 'wqs', 'state'))
 def GetKqueueSummary(kq):
     """ Summarize kqueue information
 
@@ -206,30 +186,29 @@ def GetKqfileSummary(kqf):
             o=kq,
             ptr=int(kq),
             wqs=int(kq.kq_wqs),
-            kqr_state='',
             dyn_id=0,
             st_str=xnudefines.GetStateString(xnudefines.kq_state_strings, state),
             servicer=0,
             owner=0)
 
-@lldb_command('showkqfile')
-def ShowKqfile(cmd_args=None):
+@lldb_command('showkqfile', fancy=True)
+def ShowKqfile(cmd_args=None, cmd_options={}, O=None):
     """ Display information about a kqfile object.
 
         usage: showkqfile <struct kqfile *>
     """
     if len(cmd_args) < 1:
-        raise ArgumentError('missing struct kqfile * argument')
+        return O.error('missing struct kqfile * argument')
 
     kqf = kern.GetValueFromAddress(cmd_args[0], 'kqfile *')
 
-    print GetKqfileSummary.header
-    print GetKqfileSummary(kqf)
-    print GetKnoteSummary.header
-    for kn in IterateKqueueKnotes(kqf.kqf_kqueue):
-        print GetKnoteSummary(kn)
-    for kn in IterateTAILQ_HEAD(kqf.kqf_suppressed, 'kn_tqe'):
-        print GetKnoteSummary(kn)
+    with O.table(GetKqfileSummary.header):
+        print GetKqfileSummary(kqf)
+    with O.table(GetKnoteSummary.header):
+        for kn in IterateKqueueKnotes(kqf.kqf_kqueue):
+            print GetKnoteSummary(kn)
+        for kn in IterateTAILQ_HEAD(kqf.kqf_suppressed, 'kn_tqe'):
+            print GetKnoteSummary(kn)
 
 @lldb_type_summary(['struct kqworkq *'])
 @header(GetKqueueSummary.header)
@@ -242,25 +221,30 @@ def GetKqworkqSummary(kqwq):
     """
     return GetKqfileSummary(kern.GetValueFromAddress(int(kqwq), 'struct kqfile *'))
 
-@lldb_command('showkqworkq')
-def ShowKqworkq(cmd_args=None):
+@lldb_command('showkqworkq', fancy=True)
+def ShowKqworkq(cmd_args=None, cmd_options={}, O=None):
     """ Display summary and knote information about a kqworkq.
 
         usage: showkqworkq <struct kqworkq *>
     """
     if len(cmd_args) < 1:
-        raise ArgumentError('missing struct kqworkq * argument')
+        return O.error('missing struct kqworkq * argument')
 
     kqwq = kern.GetValueFromAddress(cmd_args[0], 'struct kqworkq *')
     kq = kqwq.kqwq_kqueue
-    print GetKqueueSummary.header
-    print GetKqworkqSummary(kqwq)
-    print GetKnoteSummary.header
-    for kn in IterateKqueueKnotes(kq):
-        print GetKnoteSummary(kn)
-    for i in xrange(0, xnudefines.KQWQ_NBUCKETS):
-        for kn in IterateTAILQ_HEAD(kq.kq_queue[i], 'kn_tqe'):
+    with O.table(GetKqueueSummary.header):
+        print GetKqworkqSummary(kqwq)
+
+    with O.table(GetWorkqueueThreadRequestSummary.header):
+        for i in range(1, 8):
+            print GetWorkqueueThreadRequestSummary(kq.kq_p, kqwq.kqwq_request[i])
+
+    with O.table(GetKnoteSummary.header):
+        for kn in IterateKqueueKnotes(kq):
             print GetKnoteSummary(kn)
+        for i in xrange(0, xnudefines.KQWQ_NBUCKETS):
+            for kn in IterateTAILQ_HEAD(kq.kq_queue[i], 'kn_tqe'):
+                print GetKnoteSummary(kn)
 
 @lldb_type_summary(['struct kqworkloop *'])
 @header(GetKqueueSummary.header)
@@ -277,104 +261,98 @@ def GetKqworkloopSummary(kqwl):
             o=kqwl.kqwl_kqueue,
             wqs=int(kqwl.kqwl_kqueue.kq_wqs),
             dyn_id=kqwl.kqwl_dynamicid,
-            kqr_state=xnudefines.GetStateString(xnudefines.kqrequest_state_strings, kqwl.kqwl_request.kqr_state),
             st_str=xnudefines.GetStateString(xnudefines.kq_state_strings, state),
-            servicer=int(kqwl.kqwl_request.kqr_thread),
+            servicer=GetServicer(kqwl.kqwl_request),
             owner=int(kqwl.kqwl_owner)
             )
 
-@lldb_command('showkqworkloop')
-def ShowKqworkloop(cmd_args=None):
+@lldb_command('showkqworkloop', fancy=True)
+def ShowKqworkloop(cmd_args=None, cmd_options={}, O=None):
     """ Display information about a kqworkloop.
 
         usage: showkqworkloop <struct kqworkloop *>
     """
     if len(cmd_args) < 1:
-        raise ArgumentError('missing struct kqworkloop * argument')
+        return O.error('missing struct kqworkloop * argument')
 
     kqwl = kern.GetValueFromAddress(cmd_args[0], 'struct kqworkloop *')
 
-    print GetKqworkloopSummary.header
-    print GetKqworkloopSummary(kqwl)
+    with O.table(GetKqworkloopSummary.header):
+        print GetKqworkloopSummary(kqwl)
 
-    print GetKqrequestSummary.header
-    kqr = kern.GetValueFromAddress(unsigned(addressof(kqwl.kqwl_request)), 'struct kqrequest *')
-    print GetKqrequestSummary(kqr)
+    with O.table(GetWorkqueueThreadRequestSummary.header):
+        print GetWorkqueueThreadRequestSummary(kqwl.kqwl_kqueue.kq_p, kqwl.kqwl_request)
 
-    print GetKnoteSummary.header
-    for kn in IterateKqueueKnotes(kqwl.kqwl_kqueue):
-        print GetKnoteSummary(kn)
+    with O.table(GetKnoteSummary.header):
+        for kn in IterateKqueueKnotes(kqwl.kqwl_kqueue):
+            print GetKnoteSummary(kn)
 
-@lldb_command('showkqueue')
-def ShowKqueue(cmd_args=None):
+@lldb_command('showkqueue', fancy=True)
+def ShowKqueue(cmd_args=None, cmd_options={}, O=None):
     """ Given a struct kqueue pointer, display the summary of the kqueue
 
         usage: showkqueue <struct kqueue *>
     """
     if not cmd_args:
-        raise ArgumentError('missing struct kqueue * argument')
+        return O.error('missing struct kqueue * argument')
 
     kq = kern.GetValueFromAddress(cmd_args[0], 'struct kqueue *')
     if int(kq.kq_state) & xnudefines.KQ_WORKQ:
-        ShowKqworkq(cmd_args=[str(int(kq))])
+        ShowKqworkq(cmd_args, cmd_options, O)
     elif int(kq.kq_state) & xnudefines.KQ_WORKLOOP:
-        ShowKqworkloop(cmd_args=[str(int(kq))])
+        ShowKqworkloop(cmd_args, cmd_options, O)
     else:
-        print GetKqueueSummary.header
-        print GetKqueueSummary(kq)
-        print GetKnoteSummary.header
-        for kn in IterateKqueueKnotes(kq):
-            print GetKnoteSummary(kn)
+        ShowKqfile(cmd_args, cmd_options, O)
 
-@lldb_command('showprocworkqkqueue')
-def ShowProcWorkqKqueue(cmd_args=None):
+@lldb_command('showprocworkqkqueue', fancy=True)
+def ShowProcWorkqKqueue(cmd_args=None, cmd_options={}, O=None):
     """ Show the workqueue kqueue for a given process.
 
-        usage: showworkqkqueue <proc_t>
+        usage: showprocworkqkqueue <proc_t>
     """
     if not cmd_args:
-        raise ArgumentError('missing struct proc * argument')
+        return O.error('missing struct proc * argument')
 
     proc = kern.GetValueFromAddress(cmd_args[0], 'proc_t')
     ShowKqworkq(cmd_args=[str(int(proc.p_fd.fd_wqkqueue))], cmd_options=cmd_options, O=O)
 
-@lldb_command('showprockqueues')
-def ShowProcKqueues(cmd_args=None):
+@lldb_command('showprockqueues', fancy=True)
+def ShowProcKqueues(cmd_args=None, cmd_options={}, O=None):
     """ Show the kqueues for a given process.
 
         usage: showprockqueues <proc_t>
     """
     if not cmd_args:
-        raise ArgumentError('missing struct proc * argument')
+        return O.error('missing struct proc * argument')
 
     proc = kern.GetValueFromAddress(cmd_args[0], 'proc_t')
 
-    print GetKqueueSummary.header
-    for kq in IterateProcKqueues(proc):
-        print GetKqueueSummary(kq)
+    with O.table(GetKqueueSummary.header):
+        for kq in IterateProcKqueues(proc):
+            print GetKqueueSummary(kq)
 
-@lldb_command('showprocknotes')
-def ShowProcKnotes(cmd_args=None):
+@lldb_command('showprocknotes', fancy=True)
+def ShowProcKnotes(cmd_args=None, cmd_options={}, O=None):
     """ Show the knotes for a given process.
 
         usage: showprocknotes <proc_t>
     """
 
     if not cmd_args:
-        raise ArgumentError('missing struct proc * argument')
+        return O.error('missing struct proc * argument')
 
     proc = kern.GetValueFromAddress(cmd_args[0], 'proc_t')
 
-    print GetKnoteSummary.header
-    for kn in IterateProcKnotes(proc):
-        print GetKnoteSummary(kn)
+    with O.table(GetKnoteSummary.header):
+        for kn in IterateProcKnotes(proc):
+            print GetKnoteSummary(kn)
 
-@lldb_command('showallkqueues')
-def ShowAllKqueues(cmd_args=[], cmd_options={}):
+@lldb_command('showallkqueues', fancy=True)
+def ShowAllKqueues(cmd_args=None, cmd_options={}, O=None):
     """ Display a summary of all the kqueues in the system
 
         usage: showallkqueues
     """
-    print GetKqueueSummary.header
-    for kq in IterateAllKqueues():
-        print GetKqueueSummary(kq)
+    with O.table(GetKqueueSummary.header):
+        for kq in IterateAllKqueues():
+            print GetKqueueSummary(kq)
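
All of the fancy=True conversions above share one contract: the command receives an output helper O, reports bad arguments via O.error() instead of raising, and prints its rows inside O.table(header) so the column header is emitted once per table. A minimal sketch of that contract, in the same Python 2 style (FancyOutput is a hypothetical stand-in; it assumes nothing about the real xnu helper beyond the two methods these call sites use):

import contextlib

class FancyOutput(object):
    # hypothetical stand-in for the O helper handed to fancy commands
    def error(self, msg):
        print "error: " + msg

    @contextlib.contextmanager
    def table(self, header):
        print header    # column header, printed once
        yield           # the with-body prints one row per element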
index e157a5db3db8940c1be3de7259fde328c1fed6d9..a73dc5b8a95021b6f840696e15609e3179e7e2c9 100755 (executable)
@@ -59,13 +59,13 @@ def CalculateLedgerPeak(phys_footprint_entry):
         return: value - representing the ledger peak for the given phys footprint entry
     """
     now = kern.globals.sched_tick / 20
-    ledger_peak = phys_footprint_entry.le_credit - phys_footprint_entry.le_debit
-    if hasattr(phys_footprint_entry._le._le_max, 'le_interval_max') and (phys_footprint_entry._le._le_max.le_interval_max > ledger_peak):
-        ledger_peak = phys_footprint_entry._le._le_max.le_interval_max
+    ledger_peak = long(phys_footprint_entry.le_credit) - long(phys_footprint_entry.le_debit)
+    if hasattr(phys_footprint_entry._le._le_max, 'le_interval_max') and (long(phys_footprint_entry._le._le_max.le_interval_max) > ledger_peak):
+        ledger_peak = long(phys_footprint_entry._le._le_max.le_interval_max)
     return ledger_peak
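
As a worked example of the peak logic above (invented numbers, not real ledger values): a ledger that has credited 300 units and debited 120 has a balance of 180, but if the interval max recorded 260, the larger value is reported.

le_credit, le_debit, le_interval_max = 300, 120, 260
ledger_peak = le_credit - le_debit        # 180
if le_interval_max > ledger_peak:
    ledger_peak = le_interval_max         # 260 wins
assert ledger_peak == 260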
 
-@header("{: >8s} {: >12s} {: >12s} {: >10s} {: >12s} {: >14s} {: >10s} {: >12s} {: >10s} {: >10s} {: >10s}  {: <20s}\n".format(
-'pid', 'effective', 'requested', 'state', 'user_data', 'physical', 'iokit', 'footprint',
+@header("{: >8s} {: >12s} {: >12s} {: >10s} {: >10s} {: >12s} {: >14s} {: >10s} {: >12s} {: >10s} {: >10s} {: >10s}  {: <20s}\n".format(
+'pid', 'effective', 'requested', 'state', 'relaunch', 'user_data', 'physical', 'iokit', 'footprint',
 'recent peak', 'lifemax', 'limit', 'command'))
 def GetMemoryStatusNode(proc_val):
     """ Internal function to get memorystatus information from the given proc
@@ -81,18 +81,18 @@ def GetMemoryStatusNode(proc_val):
     task_phys_footprint_ledger_entry = task_ledgerp.l_entries[kern.globals.task_ledgers.phys_footprint]
     page_size = kern.globals.page_size
 
-    phys_mem_footprint = (task_physmem_footprint_ledger_entry.le_credit - task_physmem_footprint_ledger_entry.le_debit) / page_size
-    iokit_footprint = (task_iokit_footprint_ledger_entry.le_credit - task_iokit_footprint_ledger_entry.le_debit) / page_size
-    phys_footprint = (task_phys_footprint_ledger_entry.le_credit - task_phys_footprint_ledger_entry.le_debit) / page_size
-    phys_footprint_limit = task_phys_footprint_ledger_entry.le_limit / page_size
+    phys_mem_footprint = (long(task_physmem_footprint_ledger_entry.le_credit) - long(task_physmem_footprint_ledger_entry.le_debit)) / page_size
+    iokit_footprint = (long(task_iokit_footprint_ledger_entry.le_credit) - long(task_iokit_footprint_ledger_entry.le_debit)) / page_size
+    phys_footprint = (long(task_phys_footprint_ledger_entry.le_credit) - long(task_phys_footprint_ledger_entry.le_debit)) / page_size
+    phys_footprint_limit = long(task_phys_footprint_ledger_entry.le_limit) / page_size
     ledger_peak = CalculateLedgerPeak(task_phys_footprint_ledger_entry)
     phys_footprint_spike = ledger_peak / page_size
-    phys_footprint_lifetime_max = task_phys_footprint_ledger_entry._le._le_max.le_lifetime_max / page_size
+    phys_footprint_lifetime_max = long(task_phys_footprint_ledger_entry._le._le_max.le_lifetime_max) / page_size
 
-    format_string = '{0: >8d} {1: >12d} {2: >12d} {3: #011x} {4: #011x} {5: >12d} {6: >10d} {7: >13d}'
+    format_string = '{0: >8d} {1: >12d} {2: >12d} {3: #011x} {4: >10d} {5: #011x} {6: >12d} {7: >10d} {8: >13d}'
     out_str += format_string.format(proc_val.p_pid, proc_val.p_memstat_effectivepriority,
-        proc_val.p_memstat_requestedpriority, proc_val.p_memstat_state, proc_val.p_memstat_userdata,
-        phys_mem_footprint, iokit_footprint, phys_footprint)
+        proc_val.p_memstat_requestedpriority, proc_val.p_memstat_state, proc_val.p_memstat_relaunch_flags, 
+        proc_val.p_memstat_userdata, phys_mem_footprint, iokit_footprint, phys_footprint)
     if phys_footprint != phys_footprint_spike:
         out_str += "{: >12d}".format(phys_footprint_spike)
     else:
@@ -333,8 +333,10 @@ def ZcacheCPUPrint(cmd_args=None):
 # Macro: zprint
 
 @lldb_type_summary(['zone','zone_t'])
-@header("{:^18s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:^6s} {:^6s} {:^6s} {:>10s} {:^15s} {:<20s}".format(
-'ZONE', 'TOT_SZ', 'PAGE_COUNT', 'ALLOC_ELTS', 'FREE_ELTS', 'FREE_SZ', 'ALL_FREE_PGS', 'ELT_SZ', 'ALLOC', '(ELTS', 'PGS', 'WASTE)', 'CACHE_ELTS', 'FLAGS', 'NAME'))
+@header(("{:<18s}  {:_^23s}  {:_^24s}  {:_^13s}  {:_^31s}\n"+
+"{:<18s}  {:>11s} {:>11s}  {:>8s} {:>7s} {:>7s}  {:>6s} {:>6s}  {:>7s} {:>5s} {:>3s} {:>5s} {:>7s}   {:<15s} {:<20s}").format(
+'', 'SIZE (bytes)', 'ELEMENTS (#)', 'PAGES', 'ALLOC CHUNK CONFIG',
+'ZONE', 'ALLOC', 'FREE', 'ALLOC', 'FREE', 'CACHE', 'COUNT', 'FREE', 'SIZE', 'ELTS', 'PGS', 'WASTE', 'ELT_SZ', 'FLAGS', 'NAME'))
 def GetZoneSummary(zone):
     """ Summarize a zone with important information. See help zprint for description of each field
         params:
@@ -343,11 +345,10 @@ def GetZoneSummary(zone):
           str - summary of the zone
     """
     out_string = ""
-    format_string = '{:#018x} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:6d} {:6d} {:6d} {:10d}  {markings} {name:s} '
+    format_string = '{zone:#018x}  {zone.cur_size:11,d} {free_size:11,d}  {zone.count:8,d} {zone.countfree:7,d} {cache_elem_count:7,d}  {zone.page_count:6,d} {zone.count_all_free_pages:6,d}  {zone.alloc_size:7,d} {alloc_count:5,d} {alloc_pages:3,d} {alloc_waste:5,d} {zone.elem_size:7,d}   {markings:<15s} {zone.zone_name:<20s} '
     pagesize = kern.globals.page_size
 
-    free_elements = zone.countfree
-    free_size = free_elements * zone.elem_size
+    free_size = zone.countfree * zone.elem_size
     mag_capacity = kern.GetGlobalVariable('magazine_element_count')
 
     alloc_pages = zone.alloc_size / pagesize
@@ -390,18 +391,16 @@ def GetZoneSummary(zone):
         if zone.zcache[0].zcc_depot_index != -1:
             cache_elem_count += zone.zcache[0].zcc_depot_index * mag_capacity
 
-    out_string += format_string.format(zone, zone.cur_size, zone.page_count,
-                    zone.count, free_elements, free_size, zone.count_all_free_pages,
-                    zone.elem_size, zone.alloc_size, alloc_count,
-                    alloc_pages, alloc_waste, cache_elem_count, name = zone.zone_name, markings=markings)
+    out_string += format_string.format(zone=zone, free_size=free_size, alloc_count=alloc_count,
+                    alloc_pages=alloc_pages, alloc_waste=alloc_waste, cache_elem_count=cache_elem_count, markings=markings)
 
     if zone.exhaustible :
             out_string += "(max: {:d})".format(zone.max_size)
 
     return out_string
 
-@lldb_command('zprint')
-def Zprint(cmd_args=None):
+@lldb_command('zprint', fancy=True)
+def Zprint(cmd_args=None, cmd_options={}, O=None):
     """ Routine to print a summary listing of all the kernel zones
     All columns are printed in decimal
     Legend:
@@ -424,9 +423,9 @@ def Zprint(cmd_args=None):
         I - zone was destroyed and is no longer valid
     """
     global kern
-    print GetZoneSummary.header
-    for zval in kern.zones:
-        print GetZoneSummary(zval)
+    with O.table(GetZoneSummary.header):
+        for zval in kern.zones:
+            print GetZoneSummary(zval)
 
 @xnudebug_test('test_zprint')
 def TestZprint(kernel_target, config, lldb_obj, isConnected ):
@@ -2236,12 +2235,12 @@ def ShowProcVnodes(cmd_args=None):
     if int(fdptr.fd_rdir) != 0:
         print '{0: <25s}\n{1: <s}\n{2: <s}'.format('Current Root Directory:', GetVnodeSummary.header, GetVnodeSummary(fdptr.fd_rdir))
     count = 0
-    print '\n' + '{0: <5s} {1: <7s}'.format('fd', 'flags') + GetVnodeSummary.header
+    print '\n' + '{0: <5s} {1: <7s} {2: <20s} '.format('fd', 'flags', 'fileglob') + GetVnodeSummary.header
     # Hack to get around <rdar://problem/12879494> llb fails to cast addresses to double pointers
-    fpptr = Cast(fdptr.fd_ofiles, 'fileproc *')
+    fpptr = Cast(fdptr.fd_ofiles, 'uint64_t *')
     while count < fdptr.fd_nfiles:
         fpp = dereference(fpptr)
-        fproc = Cast(fpp, 'fileproc *')
+        fproc = kern.GetValueFromAddress(int(fpp), 'fileproc *')
         if int(fproc) != 0:
             fglob = dereference(fproc).f_fglob
             flags = ""
@@ -2250,9 +2249,9 @@ def ShowProcVnodes(cmd_args=None):
                 if (fdptr.fd_ofileflags[count] & 2):    flags += 'F'
                 if (fdptr.fd_ofileflags[count] & 4):    flags += 'R'
                 if (fdptr.fd_ofileflags[count] & 8):    flags += 'C'
-                print '{0: <5d} {1: <7s}'.format(count, flags) + GetVnodeSummary(Cast(fglob.fg_data, 'vnode *'))
+                print '{0: <5d} {1: <7s} {2: <#020x} '.format(count, flags, fglob) + GetVnodeSummary(Cast(fglob.fg_data, 'vnode *'))
         count += 1
-        fpptr = kern.GetValueFromAddress(int(fpptr) + kern.ptrsize,'fileproc *')
+        fpptr = kern.GetValueFromAddress(int(fpptr) + kern.ptrsize,'uint64_t *')
 
 @lldb_command('showallprocvnodes')
 def ShowAllProcVnodes(cmd_args=None):
@@ -3082,7 +3081,24 @@ FixedTags = {
 }
 
 def GetVMKernName(tag):
-    return FixedTags[tag]
+    """ returns the formatted name for a vmtag and
+        the sub-tag for kmod tags.
+    """
+    if ((tag <= 27) or (tag == 255)):
+        return (FixedTags[tag], "")
+    site = kern.globals.vm_allocation_sites[tag]
+    if site:
+        if site.flags & 0x007F:
+            cstr = addressof(site.subtotals[site.subtotalscount])
+            return ("{:<50s}".format(str(Cast(cstr, 'char *'))), "")
+        else:
+            if site.flags & 0x0200:
+                xsite = Cast(site,'OSKextAccount *')
+                tagstr = ".{:<3d}".format(xsite.loadTag)
+                return (GetKmodIDName(xsite.loadTag), tagstr)
+            else:
+                return (kern.Symbolicate(site), "")
+    return ("", "")
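
A hedged illustration of the tuple contract above: fixed and named tags come back as (name, ""), while kext-backed sites pair the kmod name with a ".<loadTag>" suffix that showvmtags below appends to the numeric tag column. The values here are invented:

(sitestr, tagstr) = ("com.example.driver", ".12")   # hypothetical kext site
print " {:>3d}{:<4s}".format(245, tagstr)           # -> " 245.12 "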
 
 @lldb_command("showvmtags", "AS")
 def showvmtags(cmd_args=None, cmd_options={}):
@@ -3101,19 +3117,18 @@ def showvmtags(cmd_args=None, cmd_options={}):
     if "-A" in cmd_options:
         all_tags = True
     page_size = unsigned(kern.globals.page_size)
-    tagcounts = []
-    tagpeaks = []
-    for tag in range(256):
-        tagcounts.append(0)
-    for tag in range(256):
-        tagpeaks.append(0)
+    nsites = unsigned(kern.globals.vm_allocation_tag_highest)
+    tagcounts = [0] * nsites
+    tagpeaks = [0] * nsites
+    tagmapped = [0] * nsites
 
     if kern.globals.vm_tag_active_update:
-        for tag in range(256):
+        for tag in range(nsites):
             site = kern.globals.vm_allocation_sites[tag]
             if site:
-                tagcounts[unsigned(tag)] = unsigned(site.total)
-                tagpeaks[unsigned(tag)] = unsigned(site.peak)
+                tagcounts[tag] = unsigned(site.total)
+                tagmapped[tag] = unsigned(site.mapped)
+                tagpeaks[tag] = unsigned(site.peak)
     else:
         queue_head = kern.globals.vm_objects_wired
         for object in IterateQueue(queue_head, 'struct vm_object *', 'wired_objq'):
@@ -3123,29 +3138,30 @@ def showvmtags(cmd_args=None, cmd_options={}):
         CountMapTags(kern.globals.kernel_map, tagcounts, slow)
 
     total = 0
-    print " {:<7s}  {:>7s}   {:>7s}  {:<50s}".format("tag.kmod","peak","size","name")
-    for tag in range(256):
-        if all_tags or tagcounts[tag]:
+    totalmapped = 0
+    print " vm_allocation_tag_highest: {:<7d}  ".format(nsites)
+    print " {:<7s}  {:>7s}   {:>7s}   {:>7s}  {:<50s}".format("tag.kmod", "peak", "size", "mapped", "name")
+    for tag in range(nsites):
+        if all_tags or tagcounts[tag] or tagmapped[tag]:
             total += tagcounts[tag]
-            tagstr = ""
-            sitestr = ""
-            if ((tag <= 27) or (tag == 255)):
-                sitestr = GetVMKernName(tag)
-            else:
-                site = kern.globals.vm_allocation_sites[tag]
-                if site:
-                    if site.flags & 0x007F:
-                        cstr = addressof(site.subtotals[site.subtotalscount])
-                        sitestr = "{:<50s}".format(str(Cast(cstr, 'char *')))
+            totalmapped += tagmapped[tag]
+            (sitestr, tagstr) = GetVMKernName(tag)
+            site = kern.globals.vm_allocation_sites[tag]
+            print " {:>3d}{:<4s}  {:>7d}K  {:>7d}K  {:>7d}K  {:<50s}".format(tag, tagstr, tagpeaks[tag] / 1024, tagcounts[tag] / 1024, tagmapped[tag] / 1024, sitestr)
+
+            for sub in range(site.subtotalscount):
+                alloctag = unsigned(site.subtotals[sub].tag)
+                amount = unsigned(site.subtotals[sub].total)
+                subsite = kern.globals.vm_allocation_sites[alloctag]
+                if alloctag and subsite:
+                    if ((subsite.flags & 0x007f) == 0):
+                        kind_str = "named"
                     else:
-                        if site.flags & 0x0200:
-                            xsite = Cast(site,'OSKextAccount *')
-                            tagstr = ".{:<3d}".format(xsite.loadTag)
-                            sitestr = GetKmodIDName(xsite.loadTag)
-                        else:
-                            sitestr = kern.Symbolicate(site)
-            print " {:>3d}{:<4s}  {:>7d}K  {:>7d}K  {:<50s}".format(tag,tagstr,tagpeaks[tag] / 1024, tagcounts[tag] / 1024,sitestr)
-    print "Total:              {:>7d}K".format(total / 1024)
+                        kind_str = "from"
+                    (sitestr, tagstr) = GetVMKernName(alloctag)
+                    print " {:>7s}  {:>7s}   {:>7s}   {:>7d}K      {:s} {:>3d}{:<4s} {:<50s}".format(" ", " ", " ", amount / 1024, kind_str, alloctag, tagstr, sitestr)
+
+    print "Total:              {:>7d}K  {:>7d}K".format(total / 1024, totalmapped / 1024)
     return None
 
 
@@ -3327,10 +3343,304 @@ def _calc_vm_page_hash(obj, off):
 
     return hash_id
 
+def AddressIsFromZoneMap(addr):
+    zone_map_min_address = kern.GetGlobalVariable('zone_map_min_address')
+    zone_map_max_address = kern.GetGlobalVariable('zone_map_max_address')
+    return unsigned(zone_map_min_address) <= unsigned(addr) < unsigned(zone_map_max_address)
+
+def ElementOffsetInForeignPage():
+    zone_element_alignment = 32 # defined in zalloc.c
+    zone_page_metadata_size = sizeof('struct zone_page_metadata')
+    if zone_page_metadata_size % zone_element_alignment == 0:
+        offset = zone_page_metadata_size
+    else:
+        offset = zone_page_metadata_size + (zone_element_alignment - (zone_page_metadata_size % zone_element_alignment))
+    return unsigned(offset)
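
A quick sanity check of the round-up above, with an assumed 40-byte metadata struct rather than the real sizeof(struct zone_page_metadata):

meta_size, align = 40, 32                       # assumed sizes
if meta_size % align == 0:
    offset = meta_size
else:
    offset = meta_size + (align - meta_size % align)
assert offset == 64                             # next 32-byte boundary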
+
+def ElementStartAddrFromZonePageMetadata(page_metadata):
+    zone_metadata_region_min = kern.GetGlobalVariable('zone_metadata_region_min')
+    zone_map_min_address = kern.GetGlobalVariable('zone_map_min_address')
+    page_size = kern.GetGlobalVariable('page_size')
+    if AddressIsFromZoneMap(page_metadata):
+        page_index = (unsigned(page_metadata) - unsigned(zone_metadata_region_min)) / sizeof('struct zone_page_metadata')
+        element_start_addr = unsigned(zone_map_min_address) + unsigned(page_index * page_size)
+    else:
+        element_start_addr = unsigned(page_metadata) + unsigned(ElementOffsetInForeignPage())
+
+    return element_start_addr
+
+def ZonePageStartAddrFromZonePageMetadata(page_metadata):
+    zone_metadata_region_min = kern.GetGlobalVariable('zone_metadata_region_min')
+    zone_map_min_address = kern.GetGlobalVariable('zone_map_min_address')
+    page_size = kern.GetGlobalVariable('page_size')
+
+    if AddressIsFromZoneMap(page_metadata):
+        page_index = (unsigned(page_metadata) - unsigned(zone_metadata_region_min)) / sizeof('struct zone_page_metadata')
+        zone_page_addr = unsigned(zone_map_min_address) + unsigned(page_index * page_size)
+    else:
+        zone_page_addr = unsigned(page_metadata)
+
+    return unsigned(zone_page_addr)
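
The zone-map branch in both helpers is plain index arithmetic: the metadata region is an array of zone_page_metadata entries, parallel to the pages of the zone map. A hedged example with invented addresses, an assumed 0x30-byte entry size, and 16K pages:

page_size = 0x4000                                 # assumed page size
zone_map_min = 0xffffff8000000000                  # invented
meta_region_min = 0xffffff9000000000               # invented
meta_size = 0x30                                   # assumed entry size
page_metadata = meta_region_min + 3 * meta_size    # fourth entry
page_index = (page_metadata - meta_region_min) / meta_size
assert page_index == 3
assert zone_map_min + page_index * page_size == 0xffffff800000c000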
+
+def CreateFreeElementsList(zone, first_free):
+    free_elements = []
+    if unsigned(first_free) == 0:
+        return free_elements
+    current = first_free
+    while True:
+        free_elements.append(unsigned(current))
+        next = dereference(Cast(current, 'vm_offset_t *'))
+        next = (unsigned(next) ^ unsigned(kern.globals.zp_nopoison_cookie))
+        next = kern.GetValueFromAddress(next, 'vm_offset_t *')
+        if unsigned(next) == 0:
+            break
+        current = Cast(next, 'void *')
+
+    return free_elements
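
The XOR in the walk above undoes zalloc's free-list pointer obfuscation: the stored next pointer is next ^ zp_nopoison_cookie, so XOR-ing with the same cookie recovers it. The values below are invented:

cookie = 0x3f00b12c9d84e7a5        # invented stand-in for zp_nopoison_cookie
raw_next = 0xffffff8012345678      # invented element address
stored = raw_next ^ cookie         # what actually sits in the free element
assert stored ^ cookie == raw_next # the decode performed in the loop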
+
+#Macro: showallocatedzoneelement
+@lldb_command('showallocatedzoneelement')
+def ShowAllocatedElementsInZone(cmd_args=None, cmd_options={}):
+    """ Show all the allocated elements in a zone
+        usage: showallocatedzoneelement <address of zone>
+    """
+    if len(cmd_args) < 1:
+        raise ArgumentError("Please specify a zone")
+
+    zone = kern.GetValueFromAddress(cmd_args[0], 'struct zone *')
+    elements = FindAllocatedElementsInZone(zone)
+    i = 1
+    for elem in elements:
+        print "{0: >10d}/{1:<10d} element: {2: <#20x}".format(i, len(elements), elem)
+        i += 1
+
+#EndMacro: showallocatedzoneelement
+
+def FindAllocatedElementsInZone(zone):
+    page_size = kern.GetGlobalVariable('page_size')
+    elements = []
+    page_queues = ["any_free_foreign", "intermediate", "all_used"]
+    found_total = 0
+
+    for queue in page_queues:
+        found_in_queue = 0
+        if queue == "any_free_foreign" and unsigned(zone.allows_foreign) != 1:
+            continue
+
+        for zone_page_metadata in IterateQueue(zone.pages.__getattr__(queue), 'struct zone_page_metadata *', 'pages'):
+            free_elements = []
+            first_free_element = kern.GetValueFromAddress(GetFreeList(zone_page_metadata))
+            free_elements = CreateFreeElementsList(zone, first_free_element)
+
+            chunk_page_count = zone_page_metadata.page_count
+            element_addr_start = ElementStartAddrFromZonePageMetadata(zone_page_metadata)
+            zone_page_start = ZonePageStartAddrFromZonePageMetadata(zone_page_metadata)
+            next_page = zone_page_start + page_size
+            element_addr_end = zone_page_start + (chunk_page_count * page_size)
+            elem = unsigned(element_addr_start)
+            while elem < element_addr_end:
+                if elem not in free_elements:
+                    elements.append(elem)
+                    found_in_queue += 1
+                elem += zone.elem_size
+
+                if queue == "any_free_foreign":
+                    if (elem + zone.elem_size) >= next_page:
+                        zone_page_start = unsigned((elem + page_size) & ~(page_size - 1))
+                        next_page = zone_page_start + page_size
+                        elem = zone_page_start + unsigned(ElementOffsetInForeignPage())
+
+        found_total += found_in_queue
+#       print "Found {0: <d} allocated elements in the {1: <s} page queue".format(found_in_queue, queue)
+
+#   print "Total number of allocated elements: {0: <d} in zone {1: <s}".format(found_total, zone.zone_name)
+    return elements
+
+def match_vm_page_attributes(page, matching_attributes):
+    page_ptr = addressof(page)
+    unpacked_vm_object = _vm_page_unpack_ptr(page.vmp_object)
+    matched_attributes = 0
+    if "vmp_q_state" in matching_attributes and (page.vmp_q_state == matching_attributes["vmp_q_state"]):
+        matched_attributes += 1
+    if "vm_object" in matching_attributes and (unsigned(unpacked_vm_object) == unsigned(matching_attributes["vm_object"])):
+        matched_attributes += 1
+    if "vmp_offset" in matching_attributes and (unsigned(page.vmp_offset) == unsigned(matching_attributes["vmp_offset"])):
+        matched_attributes += 1
+    if "phys_page" in matching_attributes and (unsigned(_vm_page_get_phys_page(page_ptr)) == unsigned(matching_attributes["phys_page"])):
+        matched_attributes += 1
+    if "bitfield" in matching_attributes and unsigned(page.__getattr__(matching_attributes["bitfield"])) == 1:
+        matched_attributes += 1
+
+    return matched_attributes
+
+#Macro: scan_vm_pages
+@header("{0: >26s}{1: >20s}{2: >10s}{3: >20s}{4: >20s}{5: >16s}".format("vm_pages_index/zone", "vm_page", "q_state", "vm_object", "offset", "ppn"))
+@lldb_command('scan_vm_pages', 'S:O:F:P:B:I:N:ZA')
+def ScanVMPages(cmd_args=None, cmd_options={}):
+    """ Scan the global vm_pages array (-A) and/or the vm.pages zone (-Z) for pages with matching attributes.
+        usage: scan_vm_pages <matching attribute(s)> [-A scan vm_pages array] [-Z scan vm.pages zone] [-I start index] [-N number of pages to scan]
+
+            scan_vm_pages -A: scan vm pages in the global vm_pages array
+            scan_vm_pages -Z: scan vm pages allocated from the vm.pages zone
+            scan_vm_pages <-A/-Z> -S <vm_page_q_state value>: Find vm pages in the specified queue
+            scan_vm_pages <-A/-Z> -O <vm_object>: Find vm pages in the specified vm_object
+            scan_vm_pages <-A/-Z> -F <offset>: Find vm pages with the specified vmp_offset value
+            scan_vm_pages <-A/-Z> -P <phys_page>: Find vm pages with the specified physical page number
+            scan_vm_pages <-A/-Z> -B <bitfield>: Find vm pages with the bitfield set
+            scan_vm_pages <-A> -I <start_index>: Start the scan from start_index
+            scan_vm_pages <-A> -N <npages>: Scan at most npages
+    """
+    if (len(cmd_options) < 1):
+        raise ArgumentError("Please specify at least one matching attribute")
+
+    vm_pages = kern.globals.vm_pages
+    vm_pages_count = kern.globals.vm_pages_count
+
+    start_index = 0
+    npages = vm_pages_count
+    scan_vmpages_array = False
+    scan_vmpages_zone = False
+    attribute_count = 0
+
+    if "-A" in cmd_options:
+        scan_vmpages_array = True
+
+    if "-Z" in cmd_options:
+        scan_vmpages_zone = True
+
+    if scan_vmpages_array == False and scan_vmpages_zone == False:
+        raise ArgumentError("Please specify where to scan (-A: vm_pages array, -Z: vm.pages zone)")
+
+    attribute_values = {}
+    if "-S" in cmd_options:
+        attribute_values["vmp_q_state"] = kern.GetValueFromAddress(cmd_options["-S"], 'int')
+        attribute_count += 1
+
+    if "-O" in cmd_options:
+        attribute_values["vm_object"] = kern.GetValueFromAddress(cmd_options["-O"], 'vm_object_t')
+        attribute_count += 1
+
+    if "-F" in cmd_options:
+        attribute_values["vmp_offset"] = kern.GetValueFromAddress(cmd_options["-F"], 'unsigned long long')
+        attribute_count += 1
+
+    if "-P" in cmd_options:
+        attribute_values["phys_page"] = kern.GetValueFromAddress(cmd_options["-P"], 'unsigned int')
+        attribute_count += 1
+
+    if "-B" in cmd_options:
+        valid_vmp_bitfields = [
+            "vmp_in_background",
+            "vmp_on_backgroundq",
+            "vmp_gobbled",
+            "vmp_laundry",
+            "vmp_no_cache",
+            "vmp_private",
+            "vmp_reference",
+            "vmp_busy",
+            "vmp_wanted",
+            "vmp_tabled",
+            "vmp_hashed",
+            "vmp_fictitious",
+            "vmp_clustered",
+            "vmp_pmapped",
+            "vmp_xpmapped",
+            "vmp_free_when_done",
+            "vmp_absent",
+            "vmp_error",
+            "vmp_dirty",
+            "vmp_cleaning",
+            "vmp_precious",
+            "vmp_overwriting",
+            "vmp_restart",
+            "vmp_unusual",
+            "vmp_cs_validated",
+            "vmp_cs_tainted",
+            "vmp_cs_nx",
+            "vmp_reusable",
+            "vmp_lopage",
+            "vmp_written_by_kernel",
+            "vmp_unused_object_bits"
+            ]
+        attribute_values["bitfield"] = cmd_options["-B"]
+        if attribute_values["bitfield"] in valid_vmp_bitfields:
+            attribute_count += 1
+        else:
+            raise ArgumentError("Unknown bitfield: {0:>20s}".format(attribute_values["bitfield"]))
+
+    if "-I" in cmd_options:
+        start_index = kern.GetValueFromAddress(cmd_options["-I"], 'int')
+        npages = vm_pages_count - start_index
+
+    if "-N" in cmd_options:
+        npages = kern.GetValueFromAddress(cmd_options["-N"], 'int')
+        if npages == 0:
+            raise ArgumentError("You specified -N 0; nothing to scan")
+
+    end_index = start_index + npages - 1
+    if end_index >= vm_pages_count:
+        raise ArgumentError("Index range out of bounds. vm_pages_count: {0:d}".format(vm_pages_count))
+
+    header_after_n_lines = 40
+    format_string = "{0: >26s}{1: >#20x}{2: >10d}{3: >#20x}{4: >#20x}{5: >#16x}"
+
+    found_in_array = 0
+    if scan_vmpages_array:
+        print "Scanning vm_pages[{0:d} to {1:d}] for {2:d} matching attribute(s)......".format(start_index, end_index, attribute_count)
+        i = start_index
+        while i <= end_index:
+            page = vm_pages[i]
+            if match_vm_page_attributes(page, attribute_values) == attribute_count:
+                if found_in_array % header_after_n_lines == 0:
+                    print ScanVMPages.header
+
+                print format_string.format(str(i), addressof(page), page.vmp_q_state, _vm_page_unpack_ptr(page.vmp_object), page.vmp_offset, _vm_page_get_phys_page(addressof(page)))
+                found_in_array += 1
+
+            i += 1
+
+    found_in_zone = 0
+    if scan_vmpages_zone:
+        page_size = kern.GetGlobalVariable('page_size')
+        num_zones = kern.GetGlobalVariable('num_zones')
+        zone_array = kern.GetGlobalVariable('zone_array')
+        print "Scanning vm.pages zone for {0:d} matching attribute(s)......".format(attribute_count)
+        i = 0
+        while i < num_zones:
+            zone = zone_array[i]
+            if str(zone.zone_name) == "vm pages":
+                break
+            i += 1
+
+        if i == num_zones:
+            print "Cannot find the vm.pages zone, skipping the scan"
+        else:
+            print "Scanning page queues in the vm_pages zone..."
+            elements = FindAllocatedElementsInZone(zone)
+            for elem in elements:
+                page = kern.GetValueFromAddress(elem, 'vm_page_t')
+
+                if match_vm_page_attributes(page, attribute_values) == attribute_count:
+                    if found_in_zone % header_after_n_lines == 0:
+                        print ScanVMPages.header
+
+                    vm_object = _vm_page_unpack_ptr(page.vmp_object)
+                    phys_page = _vm_page_get_phys_page(page)
+                    print format_string.format("vm_pages zone", elem, page.vmp_q_state, vm_object, page.vmp_offset, phys_page)
+                    found_in_zone += 1
+
+    total = found_in_array + found_in_zone
+    print "Found {0:d} vm pages ({1:d} in array, {2:d} in zone) matching the requested {3:d} attribute(s)".format(total, found_in_array, found_in_zone, attribute_count)
+
+#EndMacro: scan_vm_pages
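
A hedged usage example for the macro above: scan_vm_pages -A -B vmp_busy -N 1000 walks vm_pages[0..999] and prints every page whose vmp_busy bit is set, re-emitting the header every 40 matches. Internally that invocation reduces to matcher state like the sketch below (variable names taken from the function above):

attribute_values = {"bitfield": "vmp_busy"}   # from -B vmp_busy
start_index, npages = 0, 1000                 # default start plus -N 1000
attribute_count = 1                           # each page must match this many attributes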
+
 VM_PAGE_IS_WIRED = 1
 
 @header("{0: <10s} of {1: <10s} {2: <20s} {3: <20s} {4: <20s} {5: <10s} {6: <5s}\t {7: <28s}\t{8: <50s}".format("index", "total", "vm_page_t", "offset", "next", "phys_page", "wire#", "first bitfield", "second bitfield"))
-@lldb_command('vmobjectwalkpages', 'SBNQP:')
+@lldb_command('vmobjectwalkpages', 'CSBNQP:O:')
 def VMObjectWalkPages(cmd_args=None, cmd_options={}):
     """ Print the resident pages contained in the provided object. If a vm_page_t is provided as well, we
         specifically look for this page, highlighting it in the output or noting if it was not found. For
@@ -3338,11 +3648,13 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}):
         see and compare this to the object's resident page count field.
         Usage:
             vmobjectwalkpages <vm_object_t> : Walk and print all the pages for a given object (up to 4K pages by default)
+            vmobjectwalkpages <vm_object_t> -C : list pages in compressor after processing resident pages
             vmobjectwalkpages <vm_object_t> -B : Walk and print all the pages for a given object (up to 4K pages by default), traversing the memq backwards
             vmobjectwalkpages <vm_object_t> -N : Walk and print all the pages for a given object, ignore the page limit
             vmobjectwalkpages <vm_object_t> -Q : Walk all pages for a given object, looking for known signs of corruption (i.e. q_state == VM_PAGE_IS_WIRED && wire_count == 0)
             vmobjectwalkpages <vm_object_t> -P <vm_page_t> : Walk all the pages for a given object, annotate the specified page in the output with ***
             vmobjectwalkpages <vm_object_t> -P <vm_page_t> -S : Walk all the pages for a given object, stopping when we find the specified page
+            vmobjectwalkpages <vm_object_t> -O <offset> : Like -P, but looks for given offset
 
     """
 
@@ -3357,10 +3669,14 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}):
     if "-P" in cmd_options:
         page = kern.GetValueFromAddress(cmd_options['-P'], 'vm_page_t')
 
+    off = -1
+    if "-O" in cmd_options:
+        off = kern.GetValueFromAddress(cmd_options['-O'], 'vm_offset_t')
+
     stop = 0
     if "-S" in cmd_options:
-        if page == 0:
-            raise ArgumentError("-S can only be passed when a page is specified with -P")
+        if page == 0 and off < 0:
+            raise ArgumentError("-S can only be passed when a page is specified with -P or -O")
         stop = 1
 
     walk_backwards = False
@@ -3385,6 +3701,10 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}):
     if "-N" in cmd_options:
         ignore_limit = 1
 
+    show_compressed = 0
+    if "-C" in cmd_options:
+        show_compressed = 1
+
     page_count = 0
     res_page_count = unsigned(obj.resident_page_count)
     page_found = False
@@ -3397,7 +3717,11 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}):
             out_string += "******"
             page_found = True
 
-        if page != 0 or quiet_mode:
+        if (off > 0 and not(page_found) and vmp.vmp_offset == off):
+            out_string += "******"
+            page_found = True
+
+        if page != 0 or off > 0 or quiet_mode:
              if (page_count % 1000) == 0:
                 print "traversed %d pages ...\n" % (page_count)
         else:
@@ -3457,7 +3781,7 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}):
 
         if (page_count >= limit and not(ignore_limit)):
             print out_string + "Limit reached (%d pages), stopping..." % (limit)
-            return
+            break
 
         print out_string
 
@@ -3468,8 +3792,30 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}):
     if (page != 0):
         print("page found? : %s\n" % page_found)
 
+    if (off > 0):
+        print("page found? : %s\n" % page_found)
+
     print("Object reports resident page count of %d, we saw %d pages when we walked the resident list.\n" % (unsigned(obj.resident_page_count), unsigned(page_count)))
 
+    if show_compressed != 0 and obj.pager != 0 and unsigned(obj.pager.mo_pager_ops) == unsigned(addressof(kern.globals.compressor_pager_ops)):
+        pager = Cast(obj.pager, 'compressor_pager *')
+        chunks = pager.cpgr_num_slots / 128
+        pagesize = kern.globals.page_size
+
+        page_idx = 0
+        while page_idx < pager.cpgr_num_slots:
+            if chunks != 0:
+                chunk = pager.cpgr_slots.cpgr_islots[page_idx / 128]
+                slot = chunk[page_idx % 128]
+            elif pager.cpgr_num_slots > 2:
+                slot = pager.cpgr_slots.cpgr_dslots[page_idx]
+            else:
+                slot = pager.cpgr_slots.cpgr_eslots[page_idx]
+
+            if slot != 0:
+                print("compressed page for offset: %x slot %x\n" % ((page_idx * pagesize) - obj.paging_offset, slot))
+            page_idx = page_idx + 1
+
 
 @lldb_command("show_all_apple_protect_pagers")
 def ShowAllAppleProtectPagers(cmd_args=None):
@@ -3563,23 +3909,23 @@ def ShowJetsamSnapshot(cmd_args=None, cmd_options={}):
     # Dumps the snapshot header info
     print lldb_run_command('p *memorystatus_jetsam_snapshot')
 
-    hdr_format = "{0: >32s} {1: >5s} {2: >4s} {3: >6s} {4: >6s} {5: >20s} {6: >20s} {7: >20s} {8: >5s} {9: >10s} {10: >6s} {11: >6s} {12: >10s} {13: >15s} {14: >15s} {15: >15s} {16: >15s}"
+    hdr_format = "{0: >32s} {1: >5s} {2: >4s} {3: >6s} {4: >6s} {5: >20s} {6: >20s} {7: >20s} {8: >5s} {9: >10s} {10: >6s} {11: >6s} {12: >10s} {13: >15s} {14: >15s} {15: >15s}"
     if (show_footprint_details == True):
-        hdr_format += "{17: >15s} {18: >15s} {19: >12s} {20: >12s} {21: >17s} {22: >10s} {23: >13s} {24: >10s}"
+        hdr_format += "{16: >15s} {17: >15s} {18: >12s} {19: >12s} {20: >17s} {21: >10s} {22: >13s} {23: >10s}"
 
 
     if (show_footprint_details == False):
-        print hdr_format.format('command', 'index', 'pri', 'cid', 'pid', 'starttime', 'killtime', 'idletime', 'kill', '#ents', 'fds', 'gen', 'state', 'footprint', 'max', 'purgeable', 'lifetimeMax')
-        print hdr_format.format('', '', '', '', '', '(abs)', '(abs)', '(abs)', 'cause', '', '', 'Count', '', '(pages)', '(pages)', '(pages)', '(pages)')
+        print hdr_format.format('command', 'index', 'pri', 'cid', 'pid', 'starttime', 'killtime', 'idletime', 'kill', '#ents', 'fds', 'gen', 'state', 'footprint', 'purgeable', 'lifetimeMax')
+        print hdr_format.format('', '', '', '', '', '(abs)', '(abs)', '(abs)', 'cause', '', '', 'Count', '', '(pages)', '(pages)', '(pages)')
     else:
-        print hdr_format.format('command', 'index', 'pri', 'cid', 'pid', 'starttime', 'killtime', 'idletime', 'kill', '#ents', 'fds', 'gen', 'state', 'footprint', 'max', 'purgeable', 'lifetimeMax', '|| internal', 'internal_comp', 'iokit_mapped', 'purge_nonvol', 'purge_nonvol_comp', 'alt_acct', 'alt_acct_comp', 'page_table')
-        print hdr_format.format('', '', '', '', '', '(abs)', '(abs)', '(abs)', 'cause', '', '', 'Count', '', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)')
+        print hdr_format.format('command', 'index', 'pri', 'cid', 'pid', 'starttime', 'killtime', 'idletime', 'kill', '#ents', 'fds', 'gen', 'state', 'footprint', 'purgeable', 'lifetimeMax', '|| internal', 'internal_comp', 'iokit_mapped', 'purge_nonvol', 'purge_nonvol_comp', 'alt_acct', 'alt_acct_comp', 'page_table')
+        print hdr_format.format('', '', '', '', '', '(abs)', '(abs)', '(abs)', 'cause', '', '', 'Count', '', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)')
 
 
     entry_format = "{e.name: >32s} {index: >5d} {e.priority: >4d} {e.jse_coalition_jetsam_id: >6d} {e.pid: >6d} "\
                    "{e.jse_starttime: >20d} {e.jse_killtime: >20d} "\
                    "{e.jse_idle_delta: >20d} {e.killed: >5d} {e.jse_memory_region_count: >10d} "\
-                   "{e.fds: >6d} {e.jse_gencount: >6d} {e.state: >10x} {e.pages: >15d} {e.max_pages: >15d} "\
+                   "{e.fds: >6d} {e.jse_gencount: >6d} {e.state: >10x} {e.pages: >15d} "\
                    "{e.purgeable_pages: >15d} {e.max_pages_lifetime: >15d}"
 
     if (show_footprint_details == True):
@@ -3595,7 +3941,7 @@ def ShowJetsamSnapshot(cmd_args=None, cmd_options={}):
     snapshot_list = kern.globals.memorystatus_jetsam_snapshot.entries
     idx = 0
     while idx < count:
-        current_entry = Cast(snapshot_list[idx], 'jetsam_snapshot_entry')
+        current_entry = dereference(Cast(addressof(snapshot_list[idx]), 'jetsam_snapshot_entry *'))
         print entry_format.format(index=idx, e=current_entry)
         idx +=1
     return
index effb5ea289e72321456faf5dae215b12288498fa..237927c691be99c7ff55a5fb467379c1e6cf1f15 100755 (executable)
@@ -942,10 +942,24 @@ def DumpRawTraceFile(cmd_args=[], cmd_options={}):
             # XXX condition here is on __LP64__
             if lp64 :
                 tempbuf += struct.pack('QQQQQQIIQ', 
-                        e.timestamp, e.arg1, e.arg2, e.arg3, e.arg4, e.arg5, e.debugid, e.cpuid, e.unused)
+                        unsigned(e.timestamp),
+                        unsigned(e.arg1),
+                        unsigned(e.arg2),
+                        unsigned(e.arg3),
+                        unsigned(e.arg4),
+                        unsigned(e.arg5),
+                        unsigned(e.debugid),
+                        unsigned(e.cpuid),
+                        unsigned(e.unused))
             else :
-                tempbuf += struct.pack('QIIIIII', 
-                        e.timestamp, e.arg1, e.arg2, e.arg3, e.arg4, e.arg5, e.debugid)
+                tempbuf += struct.pack('QIIIIII',
+                        unsigned(e.timestamp),
+                        unsigned(e.arg1),
+                        unsigned(e.arg2),
+                        unsigned(e.arg3),
+                        unsigned(e.arg4),
+                        unsigned(e.arg5),
+                        unsigned(e.debugid))
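
As a sanity check on the two pack formats above (assuming a typical LP64 host and native struct alignment):

import struct
assert struct.calcsize('QQQQQQIIQ') == 64   # 64-bit trace records: 6*8 + 2*4 + 8
assert struct.calcsize('QIIIIII') == 32     # 32-bit trace records: 8 + 6*4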
 
             # Watch for out of order timestamps
             if earliest_time < (htab[min_kdbp].kd_prev_timebase & KDBG_TIMESTAMP_MASK) :
index 2c6cfd8766d897bcf19758203b8a815d035e32dd..c7777f86b60c9451c706eb38a1ebb12cf5963c45 100755 (executable)
@@ -1,17 +1,31 @@
 
 """ Please make sure you read the README COMPLETELY BEFORE reading anything below.
-    It is very critical that you read coding guidelines in Section E in README file. 
+    It is critical that you read the coding guidelines in Section E of the README file.
 """
-
 from xnu import *
 from utils import *
 from string import *
 from socket import *
+import tempfile
 
 import xnudefines
 from netdefines import *
 from routedefines import *
 
+def GetDlilIfFlagsAsString(dlil_if_flags):
+    """ Return a formatted string description of the dlil interface flags
+    """
+    out_string = ""
+    flags = unsigned(dlil_if_flags & 0xffff)
+    i = 0
+    num = 1
+    while num <= flags:
+        if flags & num:
+            out_string += dlil_if_flags_strings[i] + ","
+        i += 1
+        num = num << 1
+    return rstrip(out_string, ",")
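
The loop is a standard walk over the low 16 flag bits, emitting one name per set bit. A self-contained illustration with assumed flag names (the real table, dlil_if_flags_strings, comes from the imports above):

names = ["DLIF_INUSE", "DLIF_REUSE", "DLIF_DEBUG"]   # assumed bit order
flags, i, num, out = 0x5, 0, 1, ""
while num <= flags:
    if flags & num:
        out += names[i] + ","
    i += 1
    num = num << 1
print out.rstrip(",")    # -> DLIF_INUSE,DLIF_DEBUG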
+
 def GetIfFlagsAsString(if_flags):
     """ Return a formatted string description of the interface flags
     """
@@ -21,7 +35,7 @@ def GetIfFlagsAsString(if_flags):
     num = 1
     while num <= flags:
         if flags & num:
-            out_string += if_flags_strings[i] + "," 
+            out_string += if_flags_strings[i] + ","
         i += 1
         num = num << 1
     return rstrip(out_string, ",")
@@ -36,6 +50,7 @@ def ShowIfConfiguration(ifnet):
     format_string = "{0: <s}: flags={1: <x} <{2: <s}> index {3: <d} mtu {4: <d}"
     if iface :
         out_string += format_string.format(iface.if_xname, (iface.if_flags & 0xffff), GetIfFlagsAsString(iface.if_flags), iface.if_index, iface.if_data.ifi_mtu)
+        out_string += "\n\tdlil flags=" + hex(dlifnet.dl_if_flags)+ " <" + GetDlilIfFlagsAsString(dlifnet.dl_if_flags) + ">"
         out_string += "\n\t(struct ifnet *)" + hex(ifnet)
         if iface.if_snd.ifcq_len :
             out_string += "\n\t" + str(iface.if_snd.ifcq_len)
@@ -53,6 +68,51 @@ def GetIfConfiguration(ifname):
             return ifnet
     return None
 
+# Macro: net_get_always_on_pktap
+@lldb_command('net_get_always_on_pktap')
+def NetGetAlwaysOnPktap(cmd_args=None):
+    """ Dump the always-on packet capture to a /tmp/dump-*.pktap file
+    """
+    for i in range(0, 10):
+        ifnet = GetIfConfiguration("pktap"+str(i))
+        if not ifnet:
+            continue
+        if ifnet.if_bpf == 0:
+            ifnet = None
+            continue
+        if ifnet.if_bpf.bif_dlist.bd_headdrop == 0:
+            ifnet = None
+            continue
+
+        break
+
+    if not ifnet:
+        print "Could not find a pktap interface"
+        return
+
+    bpf_d = ifnet.if_bpf.bif_dlist
+
+    f = tempfile.NamedTemporaryFile(prefix="dump-", suffix=".pktap", dir="/tmp/", mode="wb", delete=False)
+
+    err = lldb.SBError()
+
+    if bpf_d.bd_hbuf != 0:
+        addr = bpf_d.bd_hbuf[0]._sbval19k84obscure747.AddressOf().GetValueAsUnsigned()
+        buf = LazyTarget.GetProcess().ReadMemory(addr, unsigned(bpf_d.bd_hlen), err)
+        if err.fail:
+            print "Error getting hbuf"
+        f.write(buf)
+
+    addr = bpf_d.bd_sbuf[0]._sbval19k84obscure747.AddressOf().GetValueAsUnsigned()
+    buf = LazyTarget.GetProcess().ReadMemory(addr, unsigned(bpf_d.bd_slen), err)
+    if err.fail:
+        print "Error getting sbuf"
+    f.write(buf)
+
+    print f.name
+    f.close()
+# EndMacro: net_get_always_on_pktap
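
Worth noting for users of the macro above: the temporary file's path is printed just before it is closed, and its contents are the raw bpf hold and store buffers (pktap framing) copied straight out of kernel memory, not a ready-made pcap file.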
+
 # Macro: ifconfig
 @lldb_command('ifconfig')
 def ShowIfconfig(cmd_args=None) :
@@ -70,9 +130,20 @@ def ShowIfconfig(cmd_args=None) :
             print GetIfaddrs(ifnet)
 # EndMacro: ifconfig
 
+#Macro: ifconfig_dlil
+@lldb_command('ifconfig_dlil')
+def ShowIfconfigDlil(cmd_args=None) :
+    """ Display ifconfig-like output for the DLIL interface list; prints the (struct ifnet *) pointer and dlil info for further inspection
+    """
+    dlil_ifnets = kern.globals.dlil_ifnet_head
+    for dlil_ifnet in IterateTAILQ_HEAD(dlil_ifnets, "dl_if_link"):
+        ShowIfConfiguration(dlil_ifnet)
+        print GetIfaddrs(Cast(dlil_ifnet, 'ifnet *'))
+# EndMacro: ifconfig_dlil
+
 def GetAddressAsStringColonHex(addr, count):
     out_string = ""
-    i = 0 
+    i = 0
     addr_format_string = "{0:02x}"
     while (i < count):
         if (i == 0):
@@ -92,7 +163,7 @@ def GetSocketAddrAsStringUnix(sockaddr):
     if (sock_unix == 0):
         return "(null)"
     else:
-        if (len(str(sock_unix.sun_path)) > 0): 
+        if (len(str(sock_unix.sun_path)) > 0):
             return str(sock_unix.sun_path)
         else:
             return "\"\""
@@ -100,7 +171,7 @@ def GetSocketAddrAsStringUnix(sockaddr):
 def GetInAddrAsString(ia):
     out_string = ""
     inaddr = Cast(ia, 'in_addr *')
-    
+
     packed_value = struct.pack('I', unsigned(ia.s_addr))
     out_string = inet_ntoa(packed_value)
     return out_string
@@ -115,7 +186,7 @@ def GetIn6AddrAsString(ia):
 
 def GetSocketAddrAsStringInet(sockaddr):
     sock_in = Cast(sockaddr, 'sockaddr_in *')
-    return GetInAddrAsString(sock_in.sin_addr)
+    return GetInAddrAsString(addressof(sock_in.sin_addr))
 
 def GetSocketAddrAsStringInet6(sockaddr):
     sock_in6 = Cast(sockaddr, 'sockaddr_in6 *')
@@ -132,7 +203,7 @@ def GetSocketAddrAsStringLink(sockaddr):
         else:
             out_string += GetAddressAsStringColonHex(addressof(sock_link.sdl_data[sock_link.sdl_nlen]), sock_link.sdl_alen)
     return out_string
-    
+
 def GetSocketAddrAsStringAT(sockaddr):
     out_string = ""
     sock_addr = Cast(sockaddr, 'sockaddr *')
@@ -206,7 +277,7 @@ def GetCapabilitiesAsString(flags):
     num = 1
     while num <= flags:
         if flags & num:
-            out_string += if_capenable_strings[i] + "," 
+            out_string += if_capenable_strings[i] + ","
         i += 1
         num = num << 1
     return rstrip(out_string, ",")
@@ -260,7 +331,7 @@ def ShowDlilIfnetConfiguration(dlil_ifnet, show_all) :
 @lldb_command('showifnets')
 def ShowIfnets(cmd_args=None) :
     """ Display ifconfig-like output for all attached and detached interfaces
-    """                                      
+    """
     showall = 0
     if cmd_args != None and len(cmd_args) > 0 :
         showall = 1
@@ -394,9 +465,9 @@ def GetSocketProtocolAsString(sock):
 def GetInAddr4to6AsString(inaddr):
     out_string = ""
     if (inaddr is not None):
-        ia = Cast(inaddr, 'char *')
-        inaddr_format_string = "{0: <d}.{1: <d}.{2: <d}.{3: <d}"
-        out_string += inaddr_format_string.format(ia[0], ia[1], ia[2], ia[3])
+        ia = Cast(inaddr, 'unsigned char *')
+        inaddr_format_string = "{0:d}.{1:d}.{2:d}.{3:d}"
+        out_string += inaddr_format_string.format(unsigned(ia[0]), unsigned(ia[1]), unsigned(ia[2]), unsigned(ia[3]))
     return out_string
 
 def GetInPortAsString(port):
@@ -419,11 +490,11 @@ def GetIPv4SocketAsString(sock) :
     else:
         out_string += "inpcb: " + hex(pcb)
         out_string += GetSocketProtocolAsString(sock)
-        
-        out_string += GetInAddr4to6AsString(addressof(pcb.inp_dependladdr.inp46_local))
+
+        out_string += GetInAddr4to6AsString(addressof(pcb.inp_dependladdr.inp46_local.ia46_addr4))
         out_string += GetInPortAsString(addressof(pcb.inp_lport))
         out_string += " -> "
-        out_string += GetInAddr4to6AsString(addressof(pcb.inp_dependfaddr.inp46_foreign))
+        out_string += GetInAddr4to6AsString(addressof(pcb.inp_dependfaddr.inp46_foreign.ia46_addr4))
         out_string += GetInPortAsString(addressof(pcb.inp_fport))
     return out_string
 
@@ -435,7 +506,7 @@ def GetIPv6SocketAsString(sock) :
     else:
         out_string += "inpcb: " + hex(pcb) + " "
         out_string += GetSocketProtocolAsString(sock)
+
         out_string += GetIn6AddrAsString((pcb.inp_dependladdr.inp6_local.__u6_addr.__u6_addr8))
         out_string += GetInPortAsString(addressof(pcb.inp_lport))
         out_string += " -> "
@@ -472,6 +543,7 @@ def GetSocket(socket) :
             out_string += GetIPv4SocketAsString(so)
         if (domain.dom_family == 30):
             out_string += GetIPv6SocketAsString(so)
+        out_string += " s=" + str(int(so.so_snd.sb_cc)) + " r=" + str(int(so.so_rcv.sb_cc)) + " usecnt=" + str(int(so.so_usecount)) + "] "
     else:
         out_string += "(null)"
     return out_string
@@ -506,85 +578,84 @@ def ShowSocket(cmd_args=None) :
         return
 # EndMacro: showsocket
 
+def GetProcSockets(proc, total_snd_cc, total_rcv_cc):
+    """ Given a proc_t pointer, display information about its sockets
+    """
+    out_string = ""
+
+    if proc is None:
+        out_string += "Unknown value passed as argument."
+    else:
+        snd_cc = 0
+        rcv_cc = 0
+        sock_fd_seen = 0
+        count = 0
+        """struct  filedesc *"""
+        proc_filedesc = proc.p_fd
+        """struct  fileproc **"""
+        proc_ofiles = proc_filedesc.fd_ofiles
+        """ high-water mark of fd_ofiles """
+        proc_lastfile = unsigned(proc_filedesc.fd_lastfile)
+        if proc_filedesc.fd_nfiles != 0:
+            while count <= proc_lastfile:
+                if (unsigned(proc_ofiles[count]) != 0 and proc_ofiles[count].f_fglob != 0):
+                        fg = proc_ofiles[count].f_fglob
+                        if (int(fg.fg_ops.fo_type) == 2):
+                            if (proc_filedesc.fd_ofileflags[count] & 4):
+                                out_string += "U: "
+                            else:
+                                out_string += " "
+                            out_string += "fd = " + str(count) + " "
+                            if (fg.fg_data != 0):
+                                out_string += GetSocket(unsigned(fg.fg_data))
+                                out_string += "\n"
+
+                                so = kern.GetValueFromAddress(unsigned(fg.fg_data), 'socket *')
+                                snd_cc += int(so.so_snd.sb_cc)
+                                total_snd_cc[0] += int(so.so_snd.sb_cc)
+                                rcv_cc += int(so.so_rcv.sb_cc)
+                                total_rcv_cc[0] += int(so.so_rcv.sb_cc)
+                                sock_fd_seen += 1
+                            else:
+                                out_string += ""
+                count += 1
+        out_string += "total sockets " + str(int(sock_fd_seen)) + " snd_cc " + str(int(snd_cc)) + " rcv_cc " + str(int(rcv_cc)) + "\n"
+    return out_string
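
The single-element lists (total_snd_cc, total_rcv_cc) stand in for pass-by-reference: Python ints are immutable, so the helper mutates list slot 0 and the caller's running totals survive the call. A minimal sketch of the pattern:

def accumulate(counter, amount):
    counter[0] += amount    # mutates the caller's list in place

total = [0]
accumulate(total, 7)
accumulate(total, 5)
assert total[0] == 12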
+
+
 # Macro: showprocsockets
 @lldb_command('showprocsockets')
 def ShowProcSockets(cmd_args=None):
     """ Given a proc_t pointer, display information about its sockets
     """
+    total_snd_cc = [0]
+    total_rcv_cc = [0]
     out_string = ""
-    
     if cmd_args != None and len(cmd_args) > 0 :
         proc = kern.GetValueFromAddress(cmd_args[0], 'proc *')
-        proc_fd = proc.p_fd
 
         if not proc:
             print "Unknown value passed as argument."
             return
         else:
-            count = 0
-            fpp = Cast(proc_fd.fd_ofiles, 'fileproc **')
-            while (count < proc_fd.fd_nfiles):
-                fp = Cast(dereference(fpp), 'fileproc *')
-                if (fp != 0):
-                    fg = Cast(fp.f_fglob, 'fileglob *')
-                    if (int(fg.fg_ops.fo_type) == 2):
-                        if (proc_fd.fd_ofileflags[count] & 4):
-                            out_string += "U: "
-                        else:
-                            out_string += " "
-                        out_string += "fd = " + str(count) + " " 
-                        if (fg.fg_data != 0):
-                            out_string += GetSocket(unsigned(fg.fg_data))
-                            out_string += "\n"
-                        else:
-                            out_string += ""
-                fpp = kern.GetValueFromAddress(unsigned(fpp + 8), 'fileproc **')
-                count += 1
-        print out_string
+            print GetProcInfo(proc)
+            print GetProcSockets(proc, total_snd_cc, total_rcv_cc)
     else:
         print "Missing argument 0 in user function."
 # EndMacro: showprocsockets
 
-def GetProcSockets(proc):
-    """ Given a proc_t pointer, display information about its sockets
-    """
-    out_string = ""
-    proc_fd = proc.p_fd
-
-    if proc is None:
-        out_string += "Unknown value passed as argument."
-    else:
-        count = 0
-        fpp = Cast(proc_fd.fd_ofiles, 'fileproc **')
-        while (count < proc_fd.fd_nfiles):
-            fp = Cast(dereference(fpp), 'fileproc *')
-            if (fp != 0):
-                fg = Cast(fp.f_fglob, 'fileglob *')
-                if (int(fg.fg_ops.fo_type) == 2):
-                    if (proc_fd.fd_ofileflags[count] & 4):
-                        out_string += "U: "
-                    else:
-                        out_string += " "
-                    out_string += "fd = " + str(count) + " " 
-                    if (fg.fg_data != 0):
-                        out_string += GetSocket(unsigned(fg.fg_data))
-                        out_string += "\n"
-                    else:
-                        out_string += ""
-            fpp = kern.GetValueFromAddress(unsigned(fpp + 8), 'fileproc **')
-            count += 1
-    return out_string
-    
-    
 # Macro: showallprocsockets
 @lldb_command('showallprocsockets')
 def ShowAllProcSockets(cmd_args=None):
     """Display information about the sockets of all the processes
     """
+    total_snd_cc = [0]
+    total_rcv_cc = [0]
     for proc in kern.procs:
         print "================================================================================"
         print GetProcInfo(proc)
-        print GetProcSockets(proc)
+        print GetProcSockets(proc, total_snd_cc, total_rcv_cc)
+    print ("total_snd_cc: " + str(int(total_snd_cc[0])) + " total_rcv_cc: " + str(int(total_rcv_cc[0])) + "\n")
 # EndMacro: showallprocsockets
 
 
@@ -596,7 +667,7 @@ def GetRtEntryPrDetailsAsString(rte):
     dst_string_format = "{0:<18s}"
     if (dst.sa_family == AF_INET):
         out_string += dst_string_format.format(GetSocketAddrAsStringInet(dst)) + " "
-    else: 
+    else:
         if (dst.sa_family == AF_INET6):
             out_string += dst_string_format.format(GetSocketAddrAsStringInet6(dst)) + " "
             isv6 = 1
@@ -696,7 +767,7 @@ def GetRtEntryPrDetailsAsString(rte):
     out_string += str(int(rt.rt_ifp.if_unit))
     out_string += "\n"
     return out_string
-    
+
 
 RNF_ROOT = 2
 def GetRtTableAsString(rt_tables):
@@ -737,26 +808,26 @@ def GetRtInetAsString():
     rt_tables = kern.globals.rt_tables[2]
     if (kern.ptrsize == 8):
         rt_table_header_format_string = "{0:<18s} {1: <16s} {2:<20s} {3:<16s} {4:<8s} {5:<8s} {6:<8s}"
-        print rt_table_header_format_string.format("rtentry", " dst", "gw", "parent", "Refs", "Use", "flags/if") 
+        print rt_table_header_format_string.format("rtentry", " dst", "gw", "parent", "Refs", "Use", "flags/if")
         print rt_table_header_format_string.format("-" * 18, "-" * 16, "-" * 16, "-" * 16, "-" * 8, "-" * 8, "-" * 8)
         print GetRtTableAsString(rt_tables)
     else:
         rt_table_header_format_string = "{0:<8s} {1:<16s} {2:<18s} {3:<8s} {4:<8s} {5:<8s} {6:<8s}"
-        print rt_table_header_format_string.format("rtentry", "dst", "gw", "parent", "Refs", "Use", "flags/if") 
-        print rt_table_header_format_string.format("-" * 8, "-" * 16, "-" * 16, "-" * 8, "-" * 8, "-" * 8, "-" * 8) 
+        print rt_table_header_format_string.format("rtentry", "dst", "gw", "parent", "Refs", "Use", "flags/if")
+        print rt_table_header_format_string.format("-" * 8, "-" * 16, "-" * 16, "-" * 8, "-" * 8, "-" * 8, "-" * 8)
         print GetRtTableAsString(rt_tables)
 
 def GetRtInet6AsString():
     rt_tables = kern.globals.rt_tables[30]
     if (kern.ptrsize == 8):
         rt_table_header_format_string = "{0:<18s} {1: <16s} {2:<20s} {3:<16s} {4:<8s} {5:<8s} {6:<8s}"
-        print rt_table_header_format_string.format("rtentry", " dst", "gw", "parent", "Refs", "Use", "flags/if") 
+        print rt_table_header_format_string.format("rtentry", " dst", "gw", "parent", "Refs", "Use", "flags/if")
         print rt_table_header_format_string.format("-" * 18, "-" * 16, "-" * 16, "-" * 16, "-" * 8, "-" * 8, "-" * 8)
         print GetRtTableAsString(rt_tables)
     else:
         rt_table_header_format_string = "{0:<8s} {1:<16s} {2:<18s} {3:<8s} {4:<8s} {5:<8s} {6:<8s}"
-        print rt_table_header_format_string.format("rtentry", "dst", "gw", "parent", "Refs", "Use", "flags/if") 
-        print rt_table_header_format_string.format("-" * 8, "-" * 16, "-" * 18, "-" * 8, "-" * 8, "-" * 8, "-" * 8) 
+        print rt_table_header_format_string.format("rtentry", "dst", "gw", "parent", "Refs", "Use", "flags/if")
+        print rt_table_header_format_string.format("-" * 8, "-" * 16, "-" * 18, "-" * 8, "-" * 8, "-" * 8, "-" * 8)
         print GetRtTableAsString(rt_tables)
 
 # Macro: show_rt_inet
@@ -764,7 +835,7 @@ def GetRtInet6AsString():
 def ShowRtInet(cmd_args=None):
     """ Display the IPv4 routing table
     """
-    print GetRtInetAsString() 
+    print GetRtInetAsString()
 # EndMacro: show_rt_inet
 
 # Macro: show_rt_inet6
@@ -824,7 +895,7 @@ def ShowRtEntryDebug(cmd_args=None):
                 out_string += "\n"
             ix += 1
         cnt += 1
-    
+
     cnt = 0
     while (cnt < RTD_TRACE_HIST_SIZE):
         ix = 0
@@ -838,7 +909,7 @@ def ShowRtEntryDebug(cmd_args=None):
                 out_string += "\n"
             ix += 1
         cnt += 1
+
     out_string += "\nTotal locks : " + str(int(rtd.rtd_lock_cnt))
     out_string += "\nTotal unlocks : " + str(int(rtd.rtd_unlock_cnt))
 
@@ -855,7 +926,7 @@ def ShowRtEntryDebug(cmd_args=None):
                 out_string += "\n"
             ix += 1
         cnt += 1
+
     cnt = 0
     while (cnt < RTD_TRACE_HIST_SIZE):
         ix = 0
@@ -1474,20 +1545,23 @@ def GetInPcb(pcb, proto):
 
     if (proto == IPPROTO_TCP):
         out_string +=  " tcp"
+    elif (proto == IPPROTO_UDP):
+        out_string += " udp"
+    elif (proto == IPPROTO_RAW):
+        out_string += " raw"
     else:
-        if (proto == IPPROTO_UDP):
-            out_string += " udp"
-        else:
-            out_string += str(proto) +  "."
+        out_string += str(proto) +  "."
+
     if (pcb.inp_vflag & INP_IPV4):
         out_string += "4 "
     if (pcb.inp_vflag & INP_IPV6):
         out_string += "6 "
 
     if (pcb.inp_vflag & INP_IPV4):
-        out_string += "                                      "
+        out_string += "                                       "
         out_string += GetInAddrAsString(addressof(pcb.inp_dependladdr.inp46_local.ia46_addr4))
     else:
+        out_string += "  "
         out_string += GetIn6AddrAsString((pcb.inp_dependladdr.inp6_local.__u6_addr.__u6_addr8))
 
     out_string += " "
@@ -1495,7 +1569,7 @@ def GetInPcb(pcb, proto):
     out_string += " "
 
     if (pcb.inp_vflag & INP_IPV4):
-        out_string += "                                      "
+        out_string += "                                 "
         out_string += GetInAddrAsString(addressof(pcb.inp_dependfaddr.inp46_foreign.ia46_addr4))
     else:
         out_string += GetIn6AddrAsString((pcb.inp_dependfaddr.inp6_foreign.__u6_addr.__u6_addr8))
@@ -1507,6 +1581,7 @@ def GetInPcb(pcb, proto):
     if (proto == IPPROTO_TCP):
         out_string += GetTcpState(pcb.inp_ppcb)
 
+    out_string += "\n\t"
     if (pcb.inp_flags & INP_RECVOPTS):
         out_string += "recvopts "
     if (pcb.inp_flags & INP_RECVRETOPTS):
@@ -1577,27 +1652,58 @@ def GetInPcb(pcb, proto):
         out_string += "in_fctree "
     if (pcb.inp_flags2 & INP2_WANT_APP_POLICY):
         out_string += "want_app_policy "
-          
+
+    out_string += "\n\t"
     so = pcb.inp_socket
     if (so != 0):
-        out_string += "[so=" + str(so) + " s=" + str(int(so.so_snd.sb_cc)) + " r=" + str(int(so.so_rcv.sb_cc)) + " usecnt=" + str(int(so.so_usecount)) + "] "
+        out_string += "so=" + str(so) + " s=" + str(int(so.so_snd.sb_cc)) + " r=" + str(int(so.so_rcv.sb_cc)) + " usecnt=" + str(int(so.so_usecount)) + ", "
 
     if (pcb.inp_state == 0 or pcb.inp_state == INPCB_STATE_INUSE):
-        out_string += "inuse"
+        out_string += "inuse"
     else:
         if (pcb.inp_state == INPCB_STATE_DEAD):
-            out_string += "dead"
+            out_string += "dead"
         else:
-            out_string += "unknown (" + str(int(pcb.inp_state)) + ")"
+            out_string += "unknown (" + str(int(pcb.inp_state)) + ")"
 
     return out_string
 
+def CalcMbufInList(mpkt, pkt_cnt, buf_byte_cnt, mbuf_cnt, mbuf_cluster_cnt):
+    while (mpkt != 0):
+        mp = mpkt
+        mpkt = mpkt.m_hdr.mh_nextpkt
+        pkt_cnt[0] += 1
+        while (mp != 0):
+            mbuf_cnt[0] += 1
+            buf_byte_cnt[int(mp.m_hdr.mh_type)] += 256
+            buf_byte_cnt[Mbuf_Type.MT_LAST] += 256
+            if (mp.m_hdr.mh_flags & 0x01):
+                mbuf_cluster_cnt[0] += 1
+                buf_byte_cnt[int(mp.m_hdr.mh_type)] += mp.M_dat.MH.MH_dat.MH_ext.ext_size
+                buf_byte_cnt[Mbuf_Type.MT_LAST] += mp.M_dat.MH.MH_dat.MH_ext.ext_size
+            mp = mp.m_hdr.mh_next
+
+def CalcMbufInSB(so, snd_cc, snd_buf, rcv_cc, rcv_buf, snd_record_cnt, rcv_record_cnt, snd_mbuf_cnt, rcv_mbuf_cnt, snd_mbuf_cluster_cnt, rcv_mbuf_cluster_cnt):
+    snd_cc[0] += so.so_snd.sb_cc
+    mpkt = so.so_snd.sb_mb
+    CalcMbufInList(mpkt, snd_record_cnt, snd_buf, snd_mbuf_cnt, snd_mbuf_cluster_cnt)
+    rcv_cc[0] += so.so_rcv.sb_cc
+    mpkt = so.so_rcv.sb_mb
+    CalcMbufInList(mpkt, rcv_record_cnt, rcv_buf, rcv_mbuf_cnt, rcv_mbuf_cluster_cnt)
+
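
The tallies threaded through CalcMbufInList and CalcMbufInSB (pkt_cnt, snd_cc, and
friends) are single-element lists rather than plain ints so the helpers can update
the caller's counters in place; Python 2 has no nonlocal, and rebinding an int
argument would be invisible to the caller. A standalone sketch of the idiom:

    def bump(counter):
        # mutating the list cell is visible to the caller;
        # "counter += 1" on a bare int would rebind a local instead
        counter[0] += 1

    total = [0]
    for _ in range(3):
        bump(total)
    assert total[0] == 3
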
 def GetPcbInfo(pcbi, proto):
+    tcp_reassqlen = 0
     out_string = ""
-    snd_cc = 0
-    snd_buf = unsigned(0)
-    rcv_cc = 0
-    rcv_buf = unsigned(0)
+    snd_mbuf_cnt = [0]
+    snd_mbuf_cluster_cnt = [0]
+    snd_record_cnt = [0]
+    snd_cc = [0]
+    snd_buf = [0] * (Mbuf_Type.MT_LAST + 1)
+    rcv_mbuf_cnt = [0]
+    rcv_mbuf_cluster_cnt = [0]
+    rcv_record_cnt = [0]
+    rcv_cc = [0]
+    rcv_buf = [0] * (Mbuf_Type.MT_LAST + 1)
     pcbseen = 0
     out_string += "lastport " + str(int(pcbi.ipi_lastport)) + " lastlow " + str(int(pcbi.ipi_lastlow)) + " lasthi " + str(int(pcbi.ipi_lasthi)) + "\n"
     out_string += "active pcb count is " + str(int(pcbi.ipi_count)) + "\n"
@@ -1605,41 +1711,52 @@ def GetPcbInfo(pcbi, proto):
     out_string += "hash size is " + str(int(hashsize)) + "\n"
     out_string += str(pcbi.ipi_hashbase) + " has the following inpcb(s):\n"
     if (kern.ptrsize == 8):
-        out_string += "pcb            proto  source                     address  port  destination               address  port\n"
+        out_string += "pcb                proto  source                                        port  destination                                 port\n"
     else:
         out_string += "pcb            proto  source           address  port  destination         address  port\n\n"
 
-    i = 0
-    hashbase = pcbi.ipi_hashbase
-    while (i < hashsize):
-        head = hashbase[i]
+    if proto == IPPROTO_RAW:
+        head = cast(pcbi.ipi_listhead, 'inpcbhead *')
         pcb = cast(head.lh_first, 'inpcb *')
         while pcb != 0:
             pcbseen += 1
             out_string += GetInPcb(pcb, proto) + "\n"
             so = pcb.inp_socket
             if so != 0:
-                snd_cc += so.so_snd.sb_cc
-                mp = so.so_snd.sb_mb
-                while mp != 0:
-                    snd_buf += 256
-                    if (mp.m_hdr.mh_flags & 0x01):
-                        snd_buf += mp.M_dat.MH.MH_dat.MH_ext.ext_size
-                    mp = mp.m_hdr.mh_next
-                rcv_cc += so.so_rcv.sb_cc
-                mp = so.so_rcv.sb_mb
-                while mp != 0:
-                    rcv_buf += 256
-                    if (mp.m_hdr.mh_flags & 0x01):
-                        rcv_buf += mp.M_dat.MH.MH_dat.MH_ext.ext_size
-                    mp = mp.m_hdr.mh_next
-            pcb = cast(pcb.inp_hash.le_next, 'inpcb *')
-        i += 1
-    
-    out_string += "total seen " + str(int(pcbseen)) + " snd_cc " + str(int(snd_cc)) + " rcv_cc " + str(int(rcv_cc)) + "\n"
-    out_string += "total snd_buf " + str(int(snd_buf)) + " rcv_buf " + str(int(rcv_buf)) + "\n"
-    out_string  += "port hash base is " + hex(pcbi.ipi_porthashbase) + "\n"
-    
+                CalcMbufInSB(so, snd_cc, snd_buf, rcv_cc, rcv_buf, snd_record_cnt, rcv_record_cnt, snd_mbuf_cnt, rcv_mbuf_cnt, snd_mbuf_cluster_cnt, rcv_mbuf_cluster_cnt)
+            pcb = cast(pcb.inp_list.le_next, 'inpcb *')
+    else:
+        i = 0
+        hashbase = pcbi.ipi_hashbase
+        while (i < hashsize):
+            head = hashbase[i]
+            pcb = cast(head.lh_first, 'inpcb *')
+            while pcb != 0:
+                pcbseen += 1
+                out_string += GetInPcb(pcb, proto) + "\n"
+                so = pcb.inp_socket
+                if so != 0:
+                    CalcMbufInSB(so, snd_cc, snd_buf, rcv_cc, rcv_buf, snd_record_cnt, rcv_record_cnt, snd_mbuf_cnt, rcv_mbuf_cnt, snd_mbuf_cluster_cnt, rcv_mbuf_cluster_cnt)
+                if proto == IPPROTO_TCP and pcb.inp_ppcb:
+                    tcpcb = cast(pcb.inp_ppcb, 'tcpcb *')
+                    tcp_reassqlen += tcpcb.t_reassqlen
+
+                pcb = cast(pcb.inp_hash.le_next, 'inpcb *')
+            i += 1
+
+    out_string += "total pcbs seen: " + str(int(pcbseen)) + "\n"
+    out_string += "total send mbuf count: " + str(int(snd_mbuf_cnt[0])) + " receive mbuf count: " + str(int(rcv_mbuf_cnt[0])) + "\n"
+    out_string += "total send mbuf cluster count: " + str(int(snd_mbuf_cluster_cnt[0])) + " receive mbuf cluster count: " + str(int(rcv_mbuf_cluster_cnt[0])) + "\n"
+    out_string += "total send record count: " + str(int(snd_record_cnt[0])) + " receive record count: " + str(int(rcv_record_cnt[0])) + "\n"
+    out_string += "total snd_cc (total bytes in send buffers): " + str(int(snd_cc[0])) + " rcv_cc (total bytes in receive buffers): " + str(int(rcv_cc[0])) + "\n"
+    out_string += "total snd_buf bytes " + str(int(snd_buf[Mbuf_Type.MT_LAST])) + " rcv_buf bytes " + str(int(rcv_buf[Mbuf_Type.MT_LAST])) + "\n"
+    for x in range(Mbuf_Type.MT_LAST):
+        if (snd_buf[x] != 0 or rcv_buf[x] != 0):
+            out_string += "total snd_buf bytes of type " + Mbuf_Type.reverse_mapping[x] + " : " + str(int(snd_buf[x])) + " total recv_buf bytes of type " + Mbuf_Type.reverse_mapping[x] + " : " + str(int(rcv_buf[x])) + "\n"
+    out_string += "port hash base is " + hex(pcbi.ipi_porthashbase) + "\n"
+    if proto == IPPROTO_TCP:
+        out_string += "TCP reassembly queue length: " + str(tcp_reassqlen) + "\n"
+
     i = 0
     hashbase = pcbi.ipi_porthashbase
     while (i < hashsize):
@@ -1659,7 +1776,7 @@ def GetInPcbPort(ppcb):
     out_string += hex(ppcb) + ": lport "
     out_string += Getntohs(ppcb.phd_port)
     return out_string
-    
+
 
 def Getntohs(port):
     out_string = ""
@@ -1697,23 +1814,26 @@ def ShowKernEventPcbInfo(cmd_args=None):
 def GetKernControlPcbInfo(ctl_head):
     out_string = ""
     kctl = Cast(ctl_head.tqh_first, 'kctl *')
-    if (kern.ptrsize == 8):    
-        kcb_format_string = "0x{0:<16x} {1:4d} {2:10d}\n"
+    if (kern.ptrsize == 8):
+        kcb_format_string = "0x{0:<16x} {1:10d} {2:10d} {3:10d}\n"
     else:
-        kcb_format_string = "0x{0:<8x} {1:4d} {2:10d}\n"
+        kcb_format_string = "0x{0:<8x} {1:10d} {2:10d} {3:10d}\n"
     while unsigned(kctl) != 0:
         kctl_name = "controller: " + str(kctl.name) + "\n"
         out_string += kctl_name
         kcb = Cast(kctl.kcb_head.tqh_first, 'ctl_cb *')
         if unsigned(kcb) != 0:
             if (kern.ptrsize == 8):
-                out_string += "socket               unit       usecount\n"
-                out_string += "------               ----       --------\n"
+                out_string += "socket               usecount     snd_cc     rcv_cc\n"
+                out_string += "------               --------     ------     ------\n"
             else:
-                out_string += "socket       unit       usecount\n"
-                out_string += "------       ----       --------\n"
+                out_string += "socket       usecount     snd_cc     rcv_cc\n"
+                out_string += "------       --------     ------     ------\n"
         while unsigned(kcb) != 0:
-            out_string += kcb_format_string.format(kcb.so, kcb.unit, kcb.usecount)   
+            so = Cast(kcb.so, 'socket *')
+            snd_cc = so.so_snd.sb_cc
+            rcv_cc = so.so_rcv.sb_cc
+            out_string += kcb_format_string.format(kcb.so, kcb.usecount, snd_cc, rcv_cc)
             kcb = kcb.next.tqe_next
         out_string += "\n"
         kctl = kctl.next.tqe_next
@@ -1742,6 +1862,14 @@ def ShowUdpPcbInfo(cmd_args=None):
     print GetPcbInfo(addressof(kern.globals.udbinfo), IPPROTO_UDP)
 # EndMacro:  show_udp_pcbinfo
 
+# Macro: show_rip_pcbinfo
+@lldb_command('show_rip_pcbinfo')
+def ShowRipPcbInfo(cmd_args=None):
+    """ Display the list of Raw IP protocol control block information
+    """
+    print GetPcbInfo(addressof(kern.globals.ripcbinfo), IPPROTO_RAW)
+# EndMacro:  show_rip_pcbinfo
+
 # Macro: show_tcp_timewaitslots
 @lldb_command('show_tcp_timewaitslots')
 def ShowTcpTimeWaitSlots(cmd_args=None):
index 7301b974773baa2f59d2aa32f42736ea24e78024..d894ab049fe6980aebca37556e3407b0bc04da89 100755 (executable)
@@ -1,3 +1,51 @@
+def enum(*sequential, **named):
+    enums = dict(zip(sequential, range(len(sequential))), **named)
+    reverse = dict((value, key) for key, value in enums.iteritems())
+    enums['reverse_mapping'] = reverse
+    return type('Enum', (), enums)
+
+Mbuf_Type = enum(
+    'MT_FREE',
+    'MT_DATA',
+    'MT_HEADER',
+    'MT_SOCKET',
+    'MT_PCB',
+    'MT_RTABLE',
+    'MT_HTABLE',
+    'MT_ATABLE',
+    'MT_SONAME',
+    'MT_SOOPTS',
+    'MT_FTABLE',
+    'MT_RIGHTS',
+    'MT_IFADDR',
+    'MT_CONTROL',
+    'MT_OOBDATA',
+    'MT_TAG',
+    'MT_LAST')
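
The enum helper numbers its arguments from zero and also records a reverse_mapping
dict for looking names up by value, which GetPcbInfo uses to label the per-type
byte counters. A quick illustration with a hypothetical enum:

    Color = enum('RED', 'GREEN', 'BLUE')
    assert Color.GREEN == 1
    assert Color.reverse_mapping[2] == 'BLUE'
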
+
+M_EXT           = 0x0001
+M_PKTHDR        = 0x0002
+M_EOR           = 0x0004
+M_PROTO1        = 0x0008
+M_PROTO2        = 0x0010
+M_PROTO3        = 0x0020
+M_LOOP          = 0x0040
+M_PROTO5        = 0x0080
+
+M_BCAST         = 0x0100
+M_MCAST         = 0x0200
+M_FRAG          = 0x0400
+M_FIRSTFRAG     = 0x0800
+M_LASTFRAG      = 0x1000
+M_PROMISC       = 0x2000
+M_HASFCS        = 0x4000
+M_TAGHDR        = 0x8000
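
These masks mirror the kernel's mbuf flag bits; the 0x01 tested in CalcMbufInList
above is M_EXT (an attached cluster). A small decoding sketch against the masks
just defined (the helper name is illustrative only):

    def mbuf_flags_str(flags):
        # emit a name for each flag bit present in the word
        names = [('EXT', M_EXT), ('PKTHDR', M_PKTHDR), ('EOR', M_EOR),
                 ('BCAST', M_BCAST), ('MCAST', M_MCAST), ('FRAG', M_FRAG)]
        return '|'.join(n for n, m in names if flags & m) or '-'

    print mbuf_flags_str(M_EXT | M_PKTHDR)   # -> EXT|PKTHDR
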
+
+dlil_if_flags_strings = ["DLIF_INUSE",
+                         "DLIF_REUSE",
+                         "DLIF_DEBUG"
+                        ]
+
 if_capenable_strings = ["RXCSUM",
                         "TXCSUM", 
                         "VLAN_MTU", 
@@ -33,6 +81,11 @@ if_flags_strings = ["UP",
                     "MULTICAST"
                     ]
 
+if_refflags_strings = ["IFRF_EMBRYONIC",
+                       "IFRF_ATTACHED",
+                       "IFRF_DETACHING"
+                      ]
+
 if_eflags_strings = ["AUTOCONFIGURING",
                      "unused",
                      "unused",
index 2424d2d9394edd7049c7bacba117780898d713bd..8bff2689cb0f9e418e561b154340a90759f11146 100755 (executable)
@@ -1050,13 +1050,13 @@ def ShowPTEARM(pte):
     else:
         pte_pgoff = pte_pgoff / 4
         nttes = page_size / 4
-    if ptd.pt_cnt[pt_index].refcnt == 0x4000:
+    if ptd.ptd_info[pt_index].refcnt == 0x4000:
         level = 2
         granule = nttes * page_size
     else:
         level = 3
         granule = page_size
-    print "maps VA: {:#x}".format(long(unsigned(ptd.pt_map[pt_index].va)) + (pte_pgoff * granule))
+    print "maps VA: {:#x}".format(long(unsigned(ptd.ptd_info[pt_index].va)) + (pte_pgoff * granule))
     pteval = long(unsigned(dereference(kern.GetValueFromAddress(unsigned(pte), 'pt_entry_t *'))))
     print "value: {:#x}".format(pteval)
     if kern.arch.startswith('arm64'):
index f37169a3b59ae5bda7dfc2ec1c8c4304f13e4c93..cf7afc73c6018d0a831a7f02bb194bb8a299e25d 100755 (executable)
@@ -171,6 +171,7 @@ def GetASTSummary(ast):
         B - AST_BSD
         K - AST_KPERF
         M - AST_MACF
+        r - AST_RESET_PCS
         G - AST_GUARD
         T - AST_TELEMETRY_USER
         T - AST_TELEMETRY_KERNEL
@@ -185,7 +186,7 @@ def GetASTSummary(ast):
     out_string = ""
     state = int(ast)
     thread_state_chars = {0x0:'', 0x1:'P', 0x2:'Q', 0x4:'U', 0x8:'H', 0x10:'Y', 0x20:'A',
-                          0x40:'L', 0x80:'B', 0x100:'K', 0x200:'M',
+                          0x40:'L', 0x80:'B', 0x100:'K', 0x200:'M', 0x400: 'r',
                           0x1000:'G', 0x2000:'T', 0x4000:'T', 0x8000:'T', 0x10000:'S',
                           0x20000: 'D', 0x40000: 'I', 0x80000: 'E', 0x100000: 'R', 0x200000: 'N'}
     state_str = ''
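
GetASTSummary expands the AST word into one letter per set bit via the table
above. The same expansion in miniature, with a trimmed, hypothetical mapping:

    chars = {0x1: 'P', 0x2: 'Q', 0x400: 'r'}

    def ast_letters(state):
        # one character for every bit present in the mask
        return ''.join(c for bit, c in sorted(chars.items()) if state & bit)

    assert ast_letters(0x401) == 'Pr'
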
@@ -605,7 +606,7 @@ def ShowTaskCoalitions(cmd_args=None, cmd_options={}):
 # EndMacro: showtaskcoalitions
 
 @lldb_type_summary(['proc', 'proc *'])
-@header("{0: >6s} {1: ^20s} {2: >14s} {3: ^10s} {4: <20s}".format("pid", "process", "io_policy", "wq_state", "command"))
+@header("{0: >6s}   {1: <18s} {2: >11s} {3: ^10s} {4: <20s}".format("pid", "process", "io_policy", "wq_state", "command"))
 def GetProcSummary(proc):
     """ Summarize the process data. 
         params:
@@ -614,7 +615,7 @@ def GetProcSummary(proc):
           str - string summary of the process.
     """
     out_string = ""
-    format_string= "{0: >6d} {1: >#020x} {2: >14s} {3: >2d} {4: >2d} {5: >2d}    {6: <20s}"
+    format_string= "{0: >6d}   {1: <#018x} {2: >11s} {3: >2d} {4: >2d} {5: >2d}   {6: <20s}"
     pval = proc.GetSBValue()
     #code.interact(local=locals())
     if str(pval.GetType()) != str(gettype('proc *')) :
@@ -915,7 +916,7 @@ def DumpThreadTerminateQueue(cmd_args=None):
     
     count = 0
     print GetThreadSummary.header
-    for th in IterateQueue(addressof(kern.globals.thread_terminate_queue), 'struct thread *',  'q_link'):
+    for th in IterateMPSCQueue(addressof(kern.globals.thread_terminate_queue.mpd_queue), 'struct thread', 'mpsc_links'):
         print GetThreadSummary(th)
         count += 1
     print "{0: <d} entries!".format(count)
@@ -965,19 +966,23 @@ def DumpCallQueue(cmd_args=None):
 def ShowAllTaskIOStats(cmd_args=None):
     """ Commad to print I/O stats for all tasks
     """
-    print "{0: <20s} {1: <20s} {2: <20s} {3: <20s} {4: <20s} {5: <20s}".format("task", "Immediate Writes", "Deferred Writes", "Invalidated Writes", "Metadata Writes", "name")
+    print "{0: <20s} {1: <20s} {2: <20s} {3: <20s} {4: <20s} {5: <20s} {6: <20s} {7: <20s} {8: <20s} {9: <20s}".format("task", "Immediate Writes", "Deferred Writes", "Invalidated Writes", "Metadata Writes", "Immediate Writes to External", "Deferred Writes to External", "Invalidated Writes to External", "Metadata Writes to External", "name")
     for t in kern.tasks:
         pval = Cast(t.bsd_info, 'proc *')
-        print "{0: <#18x} {1: >20d} {2: >20d} {3: >20d} {4: >20d} {5: <20s}".format(t,
-            t.task_immediate_writes, 
-            t.task_deferred_writes,
-            t.task_invalidated_writes,
-            t.task_metadata_writes,
+        print "{0: <#18x} {1: >20d} {2: >20d} {3: >20d} {4: >20d}  {5: <20s} {6: <20s} {7: <20s} {8: <20s} {9: <20s}".format(t,
+            t.task_writes_counters_internal.task_immediate_writes,
+            t.task_writes_counters_internal.task_deferred_writes,
+            t.task_writes_counters_internal.task_invalidated_writes,
+            t.task_writes_counters_internal.task_metadata_writes,
+            t.task_writes_counters_external.task_immediate_writes,
+            t.task_writes_counters_external.task_deferred_writes,
+            t.task_writes_counters_external.task_invalidated_writes,
+            t.task_writes_counters_external.task_metadata_writes,
             str(pval.p_comm)) 
 
 
-@lldb_command('showalltasks','C')
-def ShowAllTasks(cmd_args=None, cmd_options={}):
+@lldb_command('showalltasks','C', fancy=True)
+def ShowAllTasks(cmd_args=None, cmd_options={}, O=None):
     """  Routine to print a summary listing of all the tasks
          wq_state -> reports "number of workq threads", "number of scheduled workq threads", "number of pending work items"
          if "number of pending work items" seems stuck at non-zero, it may indicate that the workqueue mechanism is hung
@@ -994,11 +999,11 @@ def ShowAllTasks(cmd_args=None, cmd_options={}):
         showcorpse = True
         extra_hdr += " " + GetKCDataSummary.header
 
-    print GetTaskSummary.header + extra_hdr + " " + GetProcSummary.header
-    for t in kern.tasks:
-        pval = Cast(t.bsd_info, 'proc *')
-        out_str = GetTaskSummary(t, showcorpse) + " " + GetProcSummary(pval)
-        print out_str
+    with O.table(GetTaskSummary.header + extra_hdr + " " + GetProcSummary.header):
+        for t in kern.tasks:
+            pval = Cast(t.bsd_info, 'proc *')
+            print GetTaskSummary(t, showcorpse) + " " + GetProcSummary(pval)
+
     ZombTasks()
 
 @lldb_command('taskforpmap')
index ed22c6ae807c1812efe9448beb4e2a177095128d..0708c7658e2b23f0f42725e238bbf4d9e21bcac8 100755 (executable)
@@ -43,7 +43,7 @@ def ShowInterrupts(cmd_args=None):
             cpu_data_entry = Cast(element, 'cpu_data_t *')
             print "CPU {} IRQ: {:d}\n".format(y, cpu_data_entry.cpu_stat.irq_ex_cnt)
             print "CPU {} IPI: {:d}\n".format(y, cpu_data_entry.cpu_stat.ipi_cnt)
-            print "CPU {} PMI: {:d}\n".format(y, cpu_data_entry.cpu_stat.pmi_cnt)
+            print "CPU {} PMI: {:d}\n".format(y, cpu_data_entry.cpu_monotonic.mtc_npmis)
             print "CPU {} TMR: {:d}\n".format(y, cpu_data_entry.cpu_stat.timer_cnt)
             x = x + 1
         y = y + 1
@@ -162,6 +162,127 @@ def ShowCurremtAbsTime(cmd_args=None):
 
     print "Last dispatch time known: %d MATUs" % cur_abstime
 
+bucketStr = ["", "FIXPRI (>UI)", "TIMESHARE_FG", "TIMESHARE_IN", "TIMESHARE_DF", "TIMESHARE_UT", "TIMESHARE_BG"]
+
+@header("     {:>18s} | {:>20s} | {:>20s} | {:>10s} | {:>10s}".format('Thread Group', 'Interactivity Score', 'Last Timeshare Tick', 'pri_shift', 'highq'))
+def GetSchedClutchBucketSummary(clutch_bucket):
+    return "     0x{:>16x} | {:>20d} | {:>20d} | {:>10d} | {:>10d}".format(clutch_bucket.scb_clutch.sc_tg, clutch_bucket.scb_interactivity_score, clutch_bucket.scb_timeshare_tick, clutch_bucket.scb_pri_shift, clutch_bucket.scb_runq.highq)
+
+def ShowSchedClutchForPset(pset):
+    root_clutch = pset.pset_clutch_root
+    print "\n{:s} : {:d}\n\n".format("Current Timestamp", GetRecentTimestamp())
+    print "{:>10s} | {:>20s} | {:>30s} | {:>18s} | {:>10s} | {:>10s} | {:>30s} | {:>30s} | {:>15s} | ".format("Root", "Root Buckets", "Clutch Buckets", "Address", "Priority", "Count", "CPU Usage (MATUs)", "CPU Blocked (MATUs)", "Deadline (abs)") + GetSchedClutchBucketSummary.header
+    print "=" * 300
+    print "{:>10s} | {:>20s} | {:>30s} | 0x{:16x} | {:>10d} | {:>10d} | {:>30s} | {:>30s} | {:>15s} | ".format("Root", "*", "*", addressof(root_clutch), root_clutch.scr_priority, root_clutch.scr_thr_count, "*", "*", "*")
+    print "-" * 300
+
+    for i in range(1, 7):
+        root_bucket = root_clutch.scr_buckets[i]
+        print "{:>10s} | {:>20s} | {:>30s} | 0x{:16x} | {:>10s} | {:>10s} | {:>30s} | {:>30s} | {:>15d} | ".format("*", bucketStr[i], "*", addressof(root_bucket), "*", "*", "*", "*", root_bucket.scrb_deadline)
+        prioq = root_bucket.scrb_clutch_buckets
+        clutch_bucket_list = []
+        for clutch_bucket in IteratePriorityQueue(prioq, 'struct sched_clutch_bucket', 'scb_pqlink'):
+            clutch_bucket_list.append(clutch_bucket)
+        if len(clutch_bucket_list) > 0:
+            clutch_bucket_list.sort(key=lambda x: x.scb_priority, reverse=True)
+            for clutch_bucket in clutch_bucket_list:
+                cpu_used = clutch_bucket.scb_cpu_data.cpu_data.scbcd_cpu_used
+                cpu_blocked = clutch_bucket.scb_cpu_data.cpu_data.scbcd_cpu_blocked
+                print "{:>10s} | {:>20s} | {:>30s} | 0x{:16x} | {:>10d} | {:>10d} | {:>30d} | {:>30d} | {:>15s} | ".format("*", "*", clutch_bucket.scb_clutch.sc_tg.tg_name, clutch_bucket, clutch_bucket.scb_priority, clutch_bucket.scb_thr_count, cpu_used, cpu_blocked, "*") + GetSchedClutchBucketSummary(clutch_bucket)
+        print "-" * 300
+
+@lldb_command('showschedclutch')
+def ShowSchedClutch(cmd_args=[]):
+    """ Routine to print the clutch scheduler hierarchy.
+        Usage: showschedclutch <pset>
+    """
+    if not cmd_args:
+        raise ArgumentError("Invalid argument")
+    pset = kern.GetValueFromAddress(cmd_args[0], "processor_set_t")
+    ShowSchedClutchForPset(pset)
+
+@lldb_command('showschedclutchroot')
+def ShowSchedClutchRoot(cmd_args=[]):
+    """ show information about the root of the sched clutch hierarchy
+        Usage: showschedclutchroot <root>
+    """
+    if not cmd_args:
+        raise ArgumentError("Invalid argument")
+    root = kern.GetValueFromAddress(cmd_args[0], "struct sched_clutch_root *")
+    if not root:
+        print "unknown arguments:", str(cmd_args)
+        return False
+    print "{:>30s} : 0x{:16x}".format("Root", root)
+    print "{:>30s} : 0x{:16x}".format("Pset", root.scr_pset)
+    print "{:>30s} : {:d}".format("Priority", root.scr_priority)
+    print "{:>30s} : {:d}".format("Urgency", root.scr_urgency)
+    print "{:>30s} : {:d}".format("Threads", root.scr_thr_count)
+    print "{:>30s} : {:d}".format("Current Timestamp", GetRecentTimestamp())
+    print "{:>30s} : {:b} (BG/UT/DF/IN/FG/FIX/NULL)".format("Runnable Root Buckets Bitmap", int(root.scr_runnable_bitmap[0]))
+
+@lldb_command('showschedclutchrootbucket')
+def ShowSchedClutchRootBucket(cmd_args=[]):
+    """ show information about a root bucket in the sched clutch hierarchy
+        Usage: showschedclutchrootbucket <root_bucket>
+    """
+    if not cmd_args:
+        raise ArgumentError("Invalid argument")
+    root_bucket = kern.GetValueFromAddress(cmd_args[0], "struct sched_clutch_root_bucket *")
+    if not root_bucket:
+        print "unknown arguments:", str(cmd_args)
+        return False
+    print "{:<30s} : 0x{:16x}".format("Root Bucket", root_bucket)
+    print "{:<30s} : {:s}".format("Bucket Name", bucketStr[int(root_bucket.scrb_bucket)])
+    print "{:<30s} : {:d}".format("Deadline", root_bucket.scrb_deadline)
+    print "{:<30s} : {:d}".format("Current Timestamp", GetRecentTimestamp())
+    print "\n"
+    prioq = root_bucket.scrb_clutch_buckets
+    clutch_bucket_list = []
+    for clutch_bucket in IteratePriorityQueue(prioq, 'struct sched_clutch_bucket', 'scb_pqlink'):
+        clutch_bucket_list.append(clutch_bucket)
+    if len(clutch_bucket_list) > 0:
+        print "=" * 240
+        print "{:>30s} | {:>18s} | {:>20s} | {:>20s} | ".format("Name", "Clutch Bucket", "Priority", "Count") + GetSchedClutchBucketSummary.header
+        print "=" * 240
+        clutch_bucket_list.sort(key=lambda x: x.scb_priority, reverse=True)
+        for clutch_bucket in clutch_bucket_list:
+            print "{:>30s} | 0x{:16x} | {:>20d} | {:>20d} | ".format(clutch_bucket.scb_clutch.sc_tg.tg_name, clutch_bucket, clutch_bucket.scb_priority, clutch_bucket.scb_thr_count) + GetSchedClutchBucketSummary(clutch_bucket)
+
+@lldb_command('showschedclutchbucket')
+def ShowSchedClutchBucket(cmd_args=[]):
+    """ show information about a clutch bucket in the sched clutch hierarchy
+        Usage: showschedclutchbucket <clutch_bucket>
+    """
+    if not cmd_args:
+        raise ArgumentError("Invalid argument")
+    clutch_bucket = kern.GetValueFromAddress(cmd_args[0], "struct sched_clutch_bucket *")
+    if not clutch_bucket:
+        print "unknown arguments:", str(cmd_args)
+        return False
+    print "{:<30s} : 0x{:16x}".format("Clutch Bucket", clutch_bucket)
+    print "{:<30s} : {:s}".format("TG Name", clutch_bucket.scb_clutch.sc_tg.tg_name)
+    print "{:<30s} : {:d}".format("Priority", clutch_bucket.scb_priority)
+    print "{:<30s} : {:d}".format("Thread Count", clutch_bucket.scb_thr_count)
+    print "{:<30s} : 0x{:16x}".format("Thread Group", clutch_bucket.scb_clutch.sc_tg)
+    cpu_used = clutch_bucket.scb_cpu_data.cpu_data.scbcd_cpu_used
+    cpu_blocked = clutch_bucket.scb_cpu_data.cpu_data.scbcd_cpu_blocked
+    print "{:<30s} : {:d}".format("CPU Used (MATUs)", cpu_used)
+    print "{:<30s} : {:d}".format("CPU Blocked (MATUs)", cpu_blocked) 
+    print "{:<30s} : {:d}".format("Interactivity Score", clutch_bucket.scb_interactivity_score)
+    print "{:<30s} : {:d}".format("Last Timeshare Update Tick", clutch_bucket.scb_timeshare_tick)
+    print "{:<30s} : {:d}".format("Priority Shift", clutch_bucket.scb_pri_shift) 
+    print "\n"
+    runq = clutch_bucket.scb_clutchpri_prioq
+    thread_list = []
+    for thread in IteratePriorityQueue(runq, 'struct thread', 'sched_clutchpri_link'):
+        thread_list.append(thread)
+    if len(thread_list) > 0:
+        print "=" * 240
+        print GetThreadSummary.header + "{:s}".format("Process Name")
+        print "=" * 240
+        for thread in thread_list:
+            proc = Cast(thread.task.bsd_info, 'proc *')
+            print GetThreadSummary(thread) + "{:s}".format(str(proc.p_comm))
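
Each clutch macro collects IteratePriorityQueue's output into a list and then
sorts by scb_priority, presumably because the underlying priority queue yields
elements in heap order rather than fully sorted order. The same collect-then-sort
shape on a plain binary heap:

    import heapq

    heap = []
    for pri in [12, 47, 31, 4]:
        heapq.heappush(heap, -pri)          # max-heap via negated keys

    # the heap's internal order is only partially sorted; pop for display order
    ordered = [-heapq.heappop(heap) for _ in range(len(heap))]
    assert ordered == [47, 31, 12, 4]
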
 
 @lldb_command('abs2nano')
 def ShowAbstimeToNanoTime(cmd_args=[]):
@@ -436,11 +557,11 @@ def ShowGroupSetSummary(runq, task_map):
         if unsigned(runq_queue_p) != unsigned(runq_queue_head):
             runq_queue_this_count = 0
 
-            for entry in ParanoidIterateLinkageChain(runq_queue_head, "sched_entry_t", "entry_links"):
+            for entry in ParanoidIterateLinkageChain(runq_queue_head, "sched_entry_t", "entry_links", circleQueue=True):
                 runq_queue_this_count += 1
 
             print "      Queue [{: <#012x}] Priority {: <3d} count {:d}\n".format(runq_queue_head, runq_queue_i, runq_queue_this_count)
-            for entry in ParanoidIterateLinkageChain(runq_queue_head, "sched_entry_t", "entry_links"):
+            for entry in ParanoidIterateLinkageChain(runq_queue_head, "sched_entry_t", "entry_links", circleQueue=True):
                 group_addr = unsigned(entry) - (sizeof(dereference(entry)) * unsigned(entry.sched_pri))
                 group = kern.GetValueFromAddress(unsigned(group_addr), 'sched_group_t')
                 task = task_map.get(unsigned(group), 0x0)
@@ -474,17 +595,17 @@ def ShowRunQSummary(runq):
 
     for runq_queue_i in xrange(runq_queue_count) :
         runq_queue_head = addressof(runq.queues[runq_queue_i])
-        runq_queue_p = runq_queue_head.next
+        runq_queue_p = runq_queue_head.head
 
-        if unsigned(runq_queue_p) != unsigned(runq_queue_head):
+        if unsigned(runq_queue_p):
             runq_queue_this_count = 0
 
-            for thread in ParanoidIterateLinkageChain(runq_queue_head, "thread_t", "runq_links"):
+            for thread in ParanoidIterateLinkageChain(runq_queue_head, "thread_t", "runq_links", circleQueue=True):
                 runq_queue_this_count += 1
 
             print "      Queue [{: <#012x}] Priority {: <3d} count {:d}\n".format(runq_queue_head, runq_queue_i, runq_queue_this_count)
             print "\t" + GetThreadSummary.header + "\n"
-            for thread in ParanoidIterateLinkageChain(runq_queue_head, "thread_t", "runq_links"):
+            for thread in ParanoidIterateLinkageChain(runq_queue_head, "thread_t", "runq_links", circleQueue=True):
                 print "\t" + GetThreadSummary(thread) + "\n"
                 if config['verbosity'] > vHUMAN :
                     print "\t" + GetThreadBackTrace(thread, prefix="\t\t") + "\n"
@@ -496,7 +617,7 @@ def ShowRTRunQSummary(rt_runq):
     print "    Realtime Queue ({:<#012x}) Count {:d}\n".format(addressof(rt_runq.queue), rt_runq.count)
     if rt_runq.count != 0:
         print "\t" + GetThreadSummary.header + "\n"
-        for rt_runq_thread in ParanoidIterateLinkageChain(rt_runq.queue, "thread_t", "runq_links"):
+        for rt_runq_thread in ParanoidIterateLinkageChain(rt_runq.queue, "thread_t", "runq_links", circleQueue=True):
             print "\t" + GetThreadSummary(rt_runq_thread) + "\n"
 
 def ShowGrrrSummary(grrr_runq):
@@ -514,17 +635,11 @@ def ShowGrrrSummary(grrr_runq):
             print "Count {:d} Weight {:d}\n".format(grrr_group.count, grrr_group.weight)
             grrr_group_client_head = addressof(grrr_group.clients)
             print GetThreadSummary.header
-            for thread in ParanoidIterateLinkageChain(grrr_group_client_head, "thread_t", "runq_links"):
+            for thread in ParanoidIterateLinkageChain(grrr_group_client_head, "thread_t", "runq_links", circleQueue=True):
                 print "\t" + GetThreadSummary(thread) + "\n"
                 if config['verbosity'] > vHUMAN :
                     print "\t" + GetThreadBackTrace(thread, prefix="\t\t") + "\n"
 
-def ShowNextThread(processor):
-    if (processor.next_thread != 0) :
-        print "      " + "Next thread:\n"
-        print "\t" + GetThreadSummary.header + "\n"
-        print "\t" + GetThreadSummary(processor.next_thread) + "\n"
-
 def ShowActiveThread(processor):
     if (processor.active_thread != 0) :
         print "\t" + GetThreadSummary.header + "\n"
@@ -541,10 +656,8 @@ def ShowScheduler(cmd_args=None):
     show_priority_runq = 0
     show_priority_pset_runq = 0
     show_group_pset_runq = 0
-    if unsigned(kern.globals.sched_current_dispatch) != 0 :
-        sched_string = str(kern.globals.sched_current_dispatch.sched_name)
-    else :
-        sched_string = str(kern.globals.sched_string)
+    show_clutch = 0
+    sched_string = str(kern.globals.sched_string)
 
     if sched_string == "traditional":
         show_priority_runq = 1
@@ -561,24 +674,28 @@ def ShowScheduler(cmd_args=None):
     elif sched_string == "amp":
         show_priority_pset_runq = 1
         show_priority_runq = 1
+    elif sched_string == "clutch":
+        show_clutch = 1
     else :
         print "Unknown sched_string {:s}".format(sched_string)
 
-    if unsigned(kern.globals.sched_current_dispatch) != 0 :
-        print "Scheduler: {:s} ({:s})\n".format(sched_string,
-                kern.Symbolicate(unsigned(kern.globals.sched_current_dispatch)))
-
-    run_buckets = kern.globals.sched_run_buckets
-
-    run_count      = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_RUN')]
-    fixpri_count   = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_FIXPRI')]
-    share_fg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')]
-    share_df_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_DF')]
-    share_ut_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')]
-    share_bg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')]
-
-    print "Processors: {g.processor_avail_count:d} Runnable threads: {:d} Fixpri threads: {:d}\n".format(run_count, fixpri_count, g=kern.globals)
-    print "FG Timeshare threads: {:d} DF Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_df_count, share_ut_count, share_bg_count)
+    print "Scheduler: {:s}\n".format(sched_string)
+
+    if show_clutch == 0:
+        run_buckets = kern.globals.sched_run_buckets
+        run_count      = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_RUN')]
+        fixpri_count   = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_FIXPRI')]
+        share_fg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')]
+        share_df_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_DF')]
+        share_ut_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')]
+        share_bg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')]
+        print "Processors: {g.processor_avail_count:d} Runnable threads: {:d} Fixpri threads: {:d}\n".format(run_count, fixpri_count, g=kern.globals)
+        print "FG Timeshare threads: {:d} DF Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_df_count, share_ut_count, share_bg_count)
+
+    processor_offline     = GetEnumValue('processor_state_t::PROCESSOR_OFF_LINE')
+    processor_idle        = GetEnumValue('processor_state_t::PROCESSOR_IDLE')
+    processor_dispatching = GetEnumValue('processor_state_t::PROCESSOR_DISPATCHING')
+    processor_running     = GetEnumValue('processor_state_t::PROCESSOR_RUNNING')
 
     if show_group_pset_runq:
         if hasattr(kern.globals, "multiq_sanity_check"):
@@ -626,13 +743,12 @@ def ShowScheduler(cmd_args=None):
             processor_array = kern.globals.processor_array
 
             print "Active Processors:\n"
-            active_bitmap = int(pset.cpu_state_map[5]) | int(pset.cpu_state_map[6])
+            active_bitmap = int(pset.cpu_state_map[processor_dispatching]) | int(pset.cpu_state_map[processor_running])
             for cpuid in IterateBitmap(active_bitmap):
                 processor = processor_array[cpuid]
                 if processor != 0:
                     print "    " + GetProcessorSummary(processor)
                     ShowActiveThread(processor)
-                    ShowNextThread(processor)
 
                     if show_priority_runq:
                         runq = processor.runq
@@ -644,13 +760,12 @@ def ShowScheduler(cmd_args=None):
 
 
             print "Idle Processors:\n"
-            idle_bitmap = int(pset.cpu_state_map[4]) & int(pset.primary_map)
+            idle_bitmap = int(pset.cpu_state_map[processor_idle]) & int(pset.primary_map)
             for cpuid in IterateBitmap(idle_bitmap):
                 processor = processor_array[cpuid]
                 if processor != 0:
                     print "    " + GetProcessorSummary(processor)
                     ShowActiveThread(processor)
-                    ShowNextThread(processor)
 
                     if show_priority_runq:
                         ShowRunQSummary(processor.runq)
@@ -658,13 +773,12 @@ def ShowScheduler(cmd_args=None):
 
 
             print "Idle Secondary Processors:\n"
-            idle_bitmap = int(pset.cpu_state_map[4]) & ~(int(pset.primary_map))
+            idle_bitmap = int(pset.cpu_state_map[processor_idle]) & ~(int(pset.primary_map))
             for cpuid in IterateBitmap(idle_bitmap):
                 processor = processor_array[cpuid]
                 if processor != 0:
                     print "    " + GetProcessorSummary(processor)
                     ShowActiveThread(processor)
-                    ShowNextThread(processor)
 
                     if show_priority_runq:
                         print ShowRunQSummary(processor.runq)
@@ -673,7 +787,7 @@ def ShowScheduler(cmd_args=None):
 
             print "Other Processors:\n"
             other_bitmap = 0
-            for i in range(0, 4):
+            for i in range(processor_offline, processor_idle):
                 other_bitmap |= int(pset.cpu_state_map[i])
             other_bitmap &= int(pset.cpu_bitmask)
             for cpuid in IterateBitmap(other_bitmap):
@@ -681,40 +795,41 @@ def ShowScheduler(cmd_args=None):
                 if processor != 0:
                     print "    " + GetProcessorSummary(processor)
                     ShowActiveThread(processor)
-                    ShowNextThread(processor)
 
                     if show_priority_runq:
                         ShowRunQSummary(processor.runq)
             print " \n"
 
+            if show_clutch:
+                print "=== Clutch Scheduler Hierarchy ===\n\n"
+                ShowSchedClutchForPset(pset)
 
             pset = pset.pset_list
 
         node = node.node_list
 
-    print "\nTerminate Queue: ({:<#012x})\n".format(addressof(kern.globals.thread_terminate_queue))
-    first = False
-    for thread in ParanoidIterateLinkageChain(kern.globals.thread_terminate_queue, "thread_t", "runq_links"):
-        if first:
-            print "\t" + GetThreadSummary.header + "\n"
-            first = True
-        print "\t" + GetThreadSummary(thread) + "\n"
-
     print "\nCrashed Threads Queue: ({:<#012x})\n".format(addressof(kern.globals.crashed_threads_queue))
-    first = False
+    first = True
     for thread in ParanoidIterateLinkageChain(kern.globals.crashed_threads_queue, "thread_t", "runq_links"):
         if first:
-            print "\t" + GetThreadSummary.header + "\n"
-            first = True
-        print "\t" + GetThreadSummary(thread) + "\n"
-
-    print "\nWaiting For Kernel Stacks Queue: ({:<#012x})\n".format(addressof(kern.globals.thread_stack_queue))
-    first = False
-    for thread in ParanoidIterateLinkageChain(kern.globals.thread_stack_queue, "thread_t", "runq_links"):
-        if first:
-            print "\t" + GetThreadSummary.header + "\n"
-            first = True
-        print "\t" + GetThreadSummary(thread) + "\n"
+            print "\t" + GetThreadSummary.header
+            first = False
+        print "\t" + GetThreadSummary(thread)
+
+    def dump_mpsc_thread_queue(name, head):
+        head = addressof(head)
+        print "\n{:s}: ({:<#012x})\n".format(name, head)
+        first = True
+        for thread in IterateMPSCQueue(head.mpd_queue, 'struct thread', 'mpsc_links'):
+            if first:
+                print "\t" + GetThreadSummary.header
+                first = False
+            print "\t" + GetThreadSummary(thread)
+
+    dump_mpsc_thread_queue("Terminate Queue", kern.globals.thread_terminate_queue)
+    dump_mpsc_thread_queue("Waiting For Kernel Stacks Queue", kern.globals.thread_stack_queue)
+    dump_mpsc_thread_queue("Thread Exception Queue", kern.globals.thread_exception_queue)
+    dump_mpsc_thread_queue("Thread Deallocate Queue", kern.globals.thread_deallocate_queue)
 
     print "\n"
 
@@ -723,8 +838,8 @@ def ShowScheduler(cmd_args=None):
 # EndMacro: showallprocessors
 
 
-def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst=0):
-    """ Iterate over a Linkage Chain queue in kernel of type queue_head_t. (osfmk/kern/queue.h method 1)
+def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst=0, circleQueue=False):
+    """ Iterate over a Linkage Chain queue in kernel of type queue_head_t or circle_queue_head_t. (osfmk/kern/queue.h method 1 or circle_queue.h)
         This is equivalent to the qe_foreach_element() macro
         Blows up aggressively and descriptively when something goes wrong iterating a queue.
         Prints correctness errors, and throws exceptions on 'cannot proceed' errors
@@ -754,11 +869,15 @@ def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst
     if not queue_head.GetSBValue().GetType().IsPointerType() :
         queue_head = addressof(queue_head)
 
-    # Mosh the value into a brand new value, to really get rid of its old cvalue history
-    queue_head = kern.GetValueFromAddress(unsigned(queue_head), 'struct queue_entry *')
+    if circleQueue:
+        # Mosh the value into a brand new value, to really get rid of its old cvalue history
+        queue_head = kern.GetValueFromAddress(unsigned(queue_head), 'struct circle_queue_head *').head
+    else:
+        # Mosh the value into a brand new value, to really get rid of its old cvalue history
+        queue_head = kern.GetValueFromAddress(unsigned(queue_head), 'struct queue_entry *')
 
     if unsigned(queue_head) == 0:
-        if ParanoidIterateLinkageChain.enable_paranoia:
+        if not circleQueue and ParanoidIterateLinkageChain.enable_paranoia:
             print "bad queue_head_t: {:s}".format(queue_head)
         return
 
@@ -792,7 +911,9 @@ def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst
     obj = 0
 
     try:
-        while (unsigned(queue_head) != unsigned(link)):
+        while True:
+            if not circleQueue and unsigned(queue_head) == unsigned(link):
+                break
             if ParanoidIterateLinkageChain.enable_paranoia:
                 if unsigned(link.next) == 0:
                     raise ValueError("NULL next pointer: queue_head {:>#18x} link: {:>#18x} next: {:>#18x} prev: {:>#18x}".format(queue_head, link, link.next, link.prev))
@@ -809,6 +930,8 @@ def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst
             yield obj
             last_link = link
             link = link.next
+            if circleQueue and unsigned(queue_head) == unsigned(link):
+                break
     except:
         exc_info = sys.exc_info()
         try:
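
The circleQueue flag moves the termination test: a queue_head_t has a sentinel
head, so the walk stops before revisiting it, while a circle_queue's head points
at a real element, so the test must come after advancing. Both loop shapes in
miniature, on hypothetical nodes:

    class _N(object):
        pass

    def iterate_headed(head):
        # sentinel head: test before visiting
        link = head.next
        while link is not head:
            yield link
            link = link.next

    def iterate_circular(first):
        # head is a real element: visit first, test after advancing
        link = first
        while True:
            yield link
            link = link.next
            if link is first:
                break

    h, c = _N(), _N()
    h.next, c.next = c, h                   # sentinel ring with one element
    assert list(iterate_headed(h)) == [c]

    a, b = _N(), _N()
    a.next, b.next = b, a                   # two-element ring, no sentinel
    assert list(iterate_circular(a)) == [a, b]
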
index 2119bc01096274bb1f38c11060c36f152f213029..f0cbae8fae0a6868e5db73f897d550d878707fd0 100755 (executable)
@@ -6,6 +6,7 @@
 from xnu import *
 from utils import *
 from string import *
+from net import *
 
 import xnudefines
 
@@ -564,3 +565,174 @@ def ShowProcNECP(cmd_args=None):
     print GetNECPSummary.header
     for kc in IterateProcNECP(proc):
         print GetNECPSummary(kc)
+
+def NexusTypePtr(nx):
+    if nx.nx_prov.nxprov_params.nxp_type == GetEnumValue("nexus_type_t::NEXUS_TYPE_FLOW_SWITCH"):
+        return "(struct nx_flowswitch *){:18s}".format(hex(nx.nx_arg))
+    elif nx.nx_prov.nxprov_params.nxp_type == GetEnumValue("nexus_type_t::NEXUS_TYPE_NET_IF"):
+        return "     (struct nx_netif *){:18s}".format(hex(nx.nx_arg))
+    elif nx.nx_prov.nxprov_params.nxp_type == GetEnumValue("nexus_type_t::NEXUS_TYPE_USER_PIPE"):
+        return "     (struct nx_upipe *){:18s}".format(hex(nx.nx_arg))
+    elif nx.nx_prov.nxprov_params.nxp_type == GetEnumValue("nexus_type_t::NEXUS_TYPE_KERNEL_PIPE"):
+        return "   (struct kern_nexus *){:18s}".format(hex(nx))
+    else:
+        return "unknown"
+
+def GetStructNexusSummary(nx):
+    nexus_summary_string = ""
+    nexus_summary_string += "{0:s} ".format(NexusTypePtr(nx))
+    nexus_summary_string += "{0:30s} ".format(str(Cast(addressof(nx.nx_prov.nxprov_params.nxp_name), 'char *')))
+    nexus_summary_string += "rings: tx {:2d} rx {:2d} slots: {:4d} rx {:4d} bufsize {:5d} metasize {:5d} mhints {:2d} ".format(
+            nx.nx_prov.nxprov_params.nxp_tx_rings,
+            nx.nx_prov.nxprov_params.nxp_rx_rings,
+            nx.nx_prov.nxprov_params.nxp_rx_slots,
+            nx.nx_prov.nxprov_params.nxp_tx_slots,
+            nx.nx_prov.nxprov_params.nxp_buf_size,
+            nx.nx_prov.nxprov_params.nxp_meta_size,
+            nx.nx_prov.nxprov_params.nxp_mhints)
+
+    return nexus_summary_string
+
+@lldb_command('shownexuses')
+def ShowNexuses(cmd_args=None):
+    """ Show Nexus.
+
+        usage: shownexues
+    """
+    nexus_summaries = []
+    nexuses = kern.globals.nx_head
+    for nx in IterateRBTreeEntry(nexuses, 'struct kern_nexus*', 'nx_link'):
+        nexus_summaries.append(GetStructNexusSummary(nx))
+    nexus_summaries.sort()
+    for nx_str in nexus_summaries:
+        print "{0:s}".format(nx_str)
+
+def GetSockAddr4(sin):
+    return GetInAddrAsString(sin.sin_addr)
+
+def GetSockAddr6(sin6):
+    addr = sin6.sin6_addr.__u6_addr.__u6_addr8
+    addr_raw_string = ":".join(["{0:02x}{0:02x}".format(unsigned(addr[i]),
+        unsigned(addr[i+1])) for i in range(0, 16, 2)])
+    return inet_ntop(AF_INET6, inet_pton(AF_INET6, addr_raw_string))
+
+def GetSockAddr46(sockaddr46):
+    if sockaddr46 is None :
+        raise ArgumentError('sockaddr is None')
+    if (sockaddr46.sa.sa_family == 2):
+        return GetSockAddr4(sockaddr46.sin)
+    elif (sockaddr46.sa.sa_family == 30):
+        return GetSockAddr6(sockaddr46.sin6)
+    else:
+        raise ArgumentError('invalid sockaddr_in_4_6 address family')
+
+def GetSockPort46(sockaddr46):
+    if sockaddr46 is None :
+        raise ArgumentError('sockaddr is None')
+    if (sockaddr46.sa.sa_family == 2):
+        return ntohs(sockaddr46.sin.sin_port)
+    elif (sockaddr46.sa.sa_family == 30):
+        return ntohs(sockaddr46.sin6.sin6_port)
+    else:
+        raise ArgumentError('invalid sockaddr_in_4_6 address family')
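
The literals 2 and 30 compared against sa_family are Darwin's AF_INET and
AF_INET6 values; they can be sanity-checked from Python's socket module
(AF_INET6 varies by platform, 30 being the Darwin value):

    import socket

    assert socket.AF_INET == 2
    print socket.AF_INET6    # 30 on Darwin; other platforms differ
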
+
+def FlowEntryStr(fe):
+    return "(struct flow_entry*){} src={},dst={},proto={},sport={},dport={} ".format(
+            hex(fe), GetSockAddr46(fe.fe_laddr), GetSockAddr46(fe.fe_faddr),
+            unsigned(fe.fe_key.fk_proto), GetSockPort46(fe.fe_laddr),
+            GetSockPort46(fe.fe_faddr), fe.fe_owner_name)
+
+def GetFlowEntryPid(fe):
+    return fe.fe_owner_pid
+
+def GetFlowswitchFlowEntries(fsw):
+    fm = kern.GetValueFromAddress(unsigned(fsw.fsw_flow_mgr), 'struct flow_mgr *')
+    cht = kern.GetValueFromAddress(unsigned(fm.fm_flow_table), 'struct cuckoo_hashtable *')
+
+    flows = []
+    def GetCuckooNodeAsFlowEntry(node, hashValue):
+        fe = containerof(node, 'struct flow_entry', 'fe_cnode')
+        flows.append(fe)
+
+    CuckooHashtableForeach(cht, GetCuckooNodeAsFlowEntry)
+    return flows
+
+def IsNexusAFlowswitch(nx):
+    return nx.nx_prov.nxprov_params.nxp_type == GetEnumValue('nexus_type_t::NEXUS_TYPE_FLOW_SWITCH')
+
+def GetNexusAsFlowswitch(nx):
+    return kern.GetValueFromAddress(unsigned(nx.nx_arg), 'struct nx_flowswitch *')
+
+def FlowswitchStr(fsw):
+    return "{}:\n(struct nx_flowswitch *){}".format(str(fsw.fsw_ifp.if_xname), hex(fsw))
+
+@lldb_command('showflowswitches')
+def ShowFlowswitches(cmd_args=None):
+    """ Show flow switches
+
+        usage: showflowswitches [ifname]
+    """
+    ifname = ""
+    if len(cmd_args) == 1:
+        ifname = cmd_args[0]
+
+    nexuses = kern.globals.nx_head
+    for nx in IterateRBTreeEntry(nexuses, 'struct kern_nexus*', 'nx_link'):
+        if not IsNexusAFlowswitch(nx):
+            continue
+        fsw = GetNexusAsFlowswitch(nx)
+        if ifname not in str(fsw.fsw_ifp.if_xname):
+            continue
+        print "{}".format(FlowswitchStr(fsw))
+        flows = GetFlowswitchFlowEntries(fsw)
+        flows.sort(key=GetFlowEntryPid)
+        for fe in flows:
+            print "    {}".format(FlowEntryStr(fe))
+
+def CuckooHashtableForeachSlot(cht, slotHandler):
+    for i in range(0, cht._n_buckets):
+        b = cht._buckets[i]
+        if unsigned(b._inuse) == 0:
+            continue
+        for j in range(0, kern.globals._CHT_BUCKET_SLOTS):
+            s = b._slots[j]
+            if unsigned(s._node) != 0:
+                slotHandler(s)
+
+def CuckooHashtableForeach(cht, handler):
+    def CuckooHashtableSlotHandler(s):
+        if unsigned(s._node) == 0:
+            return
+        node = s._node
+        while unsigned(node) != 0:
+            handler(node, s._hash)
+            node = node.next
+    CuckooHashtableForeachSlot(cht, CuckooHashtableSlotHandler)
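
CuckooHashtableForeach visits every occupied slot and walks each slot's node
chain, handing (node, hash) pairs to a caller-supplied handler;
GetFlowswitchFlowEntries and ShowCuckooHashtable both plug into it. The visitor
shape reduced to plain data, with a made-up table layout:

    def foreach(table, handler):
        # table: iterable of (hash, chain-of-nodes) pairs
        for h, chain in table:
            for node in chain:
                handler(node, h)

    seen = []
    foreach([(0x1f, ['a', 'b']), (0x2e, ['c'])],
            lambda node, h: seen.append((node, h)))
    assert seen == [('a', 0x1f), ('b', 0x1f), ('c', 0x2e)]
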
+
+@lldb_command('showcuckoohashtable')
+def ShowCuckooHashtable(cmd_args=None):
+    """ Show Cuckoo Hashtable.
+
+        usage: showcuckoohashtable <struct cuckoo_hashtable *>
+    """
+    if not cmd_args:
+        raise ArgumentError('missing struct cuckoo_hashtable * argument')
+
+    cht = kern.GetValueFromAddress(cmd_args[0], 'struct cuckoo_hashtable *')
+
+    print "(struct cuckoo_hashtable *){:18s} capacity {:d} entries {:d}".format(hex(cht), cht._capacity, cht._n_entries)
+    def CuckooHashtablePrintNode(node, hashValue):
+        print "  node {} hash 0x{:08x}".format(hex(node), int(hashValue))
+
+    CuckooHashtableForeach(cht, CuckooHashtablePrintNode)
+
+@lldb_command('showprotons')
+def ShowProtoNS(cmd_args=None):
+    """ Show the protons table
+    """
+
+    protons_tokens = kern.globals.protons_tokens
+    for pt in IterateRBTreeEntry(protons_tokens, 'struct protons_token *', 'pt_link'):
+        print "(struct protons_token *){} protocol {:3} pid {:5} epid {:5} ref {:2} flags {}".format(
+                hex(pt), int(pt.pt_protocol), int(pt.pt_pid), int(pt.pt_epid),
+                int(pt.pt_refcnt.ref_count), hex(pt.pt_flags))
index 467e2018dfb221c707121ab30aec6242ee41c1d7..ba262e329f87fb4c84389643e671741d103e2a0b 100755 (executable)
 import lldb
 from xnu import *
 
-def _showStructPacking(symbol, prefix, begin_offset=0, typedef=None):
-  """
-     recursively parse the field members of structure. 
-     params : symbol (lldb.SBType) reference to symbol in binary
-              prefix (string)      string to be prefixed for each line of output. Useful for recursive struct parsing.
-     returns: string containing lines of output.
-  """
-  ctype = "unknown type"
-  if symbol.GetTypeClass() == lldb.eTypeClassUnion :
-    ctype = "union"
-  if symbol.GetTypeClass() == lldb.eTypeClassStruct :
-    ctype = "struct"
-
-  if typedef:
-    outstr =  "[%4d] (%s) (%s) %s { " % (symbol.GetByteSize(), typedef, ctype, symbol.GetName()) + "\n"
-  else :
-    outstr =  "[%4d] (%s) %s { " % (symbol.GetByteSize(), ctype, symbol.GetName()) + "\n"
-  numFields = symbol.GetNumberOfFields()
-  _has_memory_hole = False
-  _compact_size = 0    # asuming the struct is perfectly packed
-  _compact_offset = begin_offset
-  _previous_bit_offset = 0 
-  for i in range(numFields):
-    member = symbol.GetFieldAtIndex(i)
-    m_offset = member.GetOffsetInBytes() + begin_offset
-    m_offset_bits = member.GetOffsetInBits()
-    m_type = member.GetType()
-    m_name = member.GetName()
-    m_size = m_type.GetByteSize()
-    warningstr = ""
-    debugstr = "" # + str((m_size, m_offset , m_offset_bits, _previous_bit_offset, _compact_offset, begin_offset))
-    if _compact_offset != m_offset and (m_offset_bits -  _previous_bit_offset) > m_size*8 :
-      _has_memory_hole = True
-      warningstr = "   *** Possible memory hole ***" 
-      _compact_offset = m_offset
-    _compact_offset += m_size
-
-    _type_class = m_type.GetTypeClass()
-    _canonical_type = m_type.GetCanonicalType()
-    _canonical_type_class = m_type.GetCanonicalType().GetTypeClass()
-
-    if _type_class == lldb.eTypeClassTypedef and (_canonical_type_class == lldb.eTypeClassStruct or _canonical_type_class == lldb.eTypeClassUnion) :
-      outstr += prefix + ("*%4d," % m_offset) + _showStructPacking(_canonical_type, prefix+"    ", m_offset, str(m_type)) + warningstr + debugstr + "\n"
-    elif _type_class == lldb.eTypeClassStruct or _type_class == lldb.eTypeClassUnion :
-      outstr += prefix + ("*%4d," % m_offset) + _showStructPacking(m_type, prefix+"    ", m_offset) + warningstr + debugstr + "\n"
+_UnionStructClass = [ lldb.eTypeClassStruct, lldb.eTypeClassClass, lldb.eTypeClassUnion ]
+
+def _showStructPacking(O, symbol, begin_offset=0, symsize=0, typedef=None, outerSize=0, memberName=None):
+    """
+       recursively parse the field members of structure.
+       params : O the output formatter (standard.py)
+                symbol (lldb.SBType) reference to symbol in binary
+       returns: string containing lines of output.
+    """
+    ctype = "unknown type"
+    is_union = False
+    is_class = False
+    union_size = None
+    sym_size = symbol.GetByteSize()
+
+    if symbol.GetTypeClass() == lldb.eTypeClassUnion:
+        ctype = "union"
+        is_union = True
+        union_size = sym_size
+    if symbol.GetTypeClass() == lldb.eTypeClassStruct:
+        ctype = "struct"
+    if symbol.GetTypeClass() == lldb.eTypeClassClass:
+        ctype = "class"
+        is_class = True
+
+    if not outerSize or outerSize == sym_size:
+        outstr = O.format("{:04d},[{:4d}]", begin_offset, sym_size)
+    elif outerSize < sym_size: # happens with c++ inheritance
+        outstr = O.format("{:04d},[{:4d}]", begin_offset, outerSize)
+    else:
+        outstr = O.format("{:04d},[{:4d}]{VT.DarkRed}{{{:+d}}}{VT.Default}",
+                begin_offset, sym_size, outerSize - sym_size)
+
+    if typedef:
+        outstr += O.format(" {0}", typedef)
+    if symbol.IsAnonymousType():
+        outstr += O.format(" ({VT.DarkMagenta}anonymous {0}{VT.Default})", ctype)
+    else:
+        outstr += O.format(" ({VT.DarkMagenta}{0} {1}{VT.Default})", ctype, symbol.GetName())
+    if memberName:
+        outstr += O.format(" {0} {{", memberName)
     else:
-      outstr += prefix + ("+%4d,[%4d] (%s) %s" % (m_offset, m_size, m_type.GetName(), m_name)) + warningstr + debugstr + "\n"
-    if i > 0 :
-      _previous_bit_offset = m_offset_bits
-  outstr += prefix + "}"
-  if _has_memory_hole == True :
-    outstr += "   *** Warning: Struct layout leaves memory hole *** "
-  return outstr
-
-@lldb_command('showstructpacking')
-def showStructInfo(cmd_args=None):
-  """Show how a structure is packed in the binary. The format is 
-     +<offset>, [<size_of_member>] (<type>) <name> 
-     For example:
-      (lldb) script lldbmacros.showStructInfo("pollfd")
-      [  8] (struct) pollfd { 
-      +  0,[  4] (int) fd
-      +  4,[  2] (short) events
-      +  6,[  2] (short) revents
-      }
-    syntax: showstructpacking task
-  """
-  if not cmd_args:
-    raise ArgumentError("Please provide a type name.")
-  
-  sym = gettype(cmd_args[0])
-  if sym == None:
-    print "No such struct found"
-  if sym.GetTypeClass() == lldb.eTypeClassTypedef:
-      sym = sym.GetCanonicalType()
-  if sym.GetTypeClass() != lldb.eTypeClassStruct:
-    print "%s is not a structure" % cmd_args[0]
-  else:
-    print _showStructPacking(sym,"", 0)
+        outstr += ") {"
+
+    print outstr
+
+    with O.indent():
+        _previous_size = 0
+        _packed_bit_offset = 0
+        _nfields = symbol.GetNumberOfFields()
+
+        if is_class:
+            _next_offset_in_bits = 0
+            _nclasses = symbol.GetNumberOfDirectBaseClasses()
+
+            for i in range(_nclasses):
+                member = symbol.GetDirectBaseClassAtIndex(i)
+                if i < _nclasses - 1:
+                    m_size_bits = symbol.GetDirectBaseClassAtIndex(i + 1).GetOffsetInBits()
+                elif _nfields:
+                    m_size_bits = symbol.GetFieldAtIndex(0).GetOffsetInBits()
+                else:
+                    m_size_bits = symbol.GetByteSize() * 8
+
+                m_offset = member.GetOffsetInBytes() + begin_offset
+                m_type = member.GetType()
+                m_name = member.GetName()
+                m_size = m_size_bits / 8
+
+                _previous_size = m_size
+                _packed_bit_offset = member.GetOffsetInBits() + m_size_bits
+
+                _showStructPacking(O, m_type, m_offset, str(m_type), outerSize=m_size, memberName=m_name)
+
+        for i in range(_nfields):
+            member = symbol.GetFieldAtIndex(i)
+            m_offset = member.GetOffsetInBytes() + begin_offset
+            m_offset_bits = member.GetOffsetInBits()
+
+            m_type = member.GetType()
+            m_name = member.GetName()
+            m_size = m_type.GetByteSize()
+
+            if member.IsBitfield():
+                m_is_bitfield = True
+                m_size_bits = member.GetBitfieldSizeInBits()
+            else:
+                m_is_bitfield = False
+                m_size_bits = m_size * 8
+
+            if not is_union and _packed_bit_offset < m_offset_bits:
+                m_previous_offset = begin_offset + _packed_bit_offset / 8
+                m_hole_bits = m_offset_bits - _packed_bit_offset
+                if _packed_bit_offset % 8 == 0:
+                    print O.format("{:04d},[{:4d}] ({VT.DarkRed}*** padding ***{VT.Default})",
+                            m_previous_offset, m_hole_bits / 8)
+                else:
+                    print O.format("{:04d},[{:4d}] ({VT.Brown}*** padding : {:d} ***{VT.Default})",
+                            m_previous_offset, _previous_size, m_hole_bits)
+
+            _previous_size = m_size
+            _packed_bit_offset = m_offset_bits + m_size_bits
+
+            _type_class = m_type.GetTypeClass()
+            _canonical_type = m_type.GetCanonicalType()
+            _canonical_type_class = m_type.GetCanonicalType().GetTypeClass()
+
+            if _type_class == lldb.eTypeClassTypedef and _canonical_type_class in _UnionStructClass:
+                _showStructPacking(O, _canonical_type, m_offset, str(m_type), outerSize=union_size, memberName=m_name)
+            elif _type_class in _UnionStructClass:
+                _showStructPacking(O, m_type, m_offset, outerSize=union_size, memberName=m_name)
+            else:
+                outstr = O.format("{:04d},[{:4d}]", m_offset, m_size)
+                if is_union and union_size != m_size_bits / 8:
+                    outstr += O.format("{VT.DarkRed}{{{:+d}}}{VT.Default}",
+                            union_size - m_size_bits / 8)
+                if m_is_bitfield:
+                    outstr += O.format(" ({VT.DarkGreen}{:s} : {:d}{VT.Default}) {:s}",
+                            m_type.GetName(), m_size_bits, m_name)
+                else:
+                    outstr += O.format(" ({VT.DarkGreen}{:s}{VT.Default}) {:s}",
+                            m_type.GetName(), m_name)
+                print outstr
+
+        referenceSize = min(outerSize, sym_size) or sym_size
+        if not is_union and _packed_bit_offset < referenceSize * 8:
+            m_previous_offset = begin_offset + _packed_bit_offset / 8
+            m_hole_bits = referenceSize * 8 - _packed_bit_offset
+            if _packed_bit_offset % 8 == 0:
+                print O.format("{:04d},[{:4d}] ({VT.DarkRed}*** padding ***{VT.Default})",
+                        m_previous_offset, m_hole_bits / 8)
+            else:
+                print O.format("{:04d},[{:4d}] ({VT.Brown}padding : {:d}{VT.Default})",
+                        m_previous_offset, _previous_size, m_hole_bits)
+
+    print "}"
+
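+# Worked example of the hole detection above (hypothetical layout, not from this
+# commit): for a struct containing a char followed by a short, the char ends with
+# _packed_bit_offset == 8 while the short begins at m_offset_bits == 16, so the
+# byte-aligned 8-bit hole prints as a one-byte "*** padding ***" row. Had the
+# first member been a 3-bit bitfield instead, _packed_bit_offset % 8 would be
+# nonzero and the hole would print in bits via the "padding : N" form.
+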
+@lldb_command('showstructpacking', fancy=True)
+def showStructInfo(cmd_args=None, cmd_options={}, O=None):
+    """Show how a structure is packed in the binary. The format is
+       <offset>, [<size_of_member>] (<type>) <name>
+
+       For example:
+          (lldb) showstructpacking pollfd
+             0,[   8] struct pollfd {
+                 0,[   4] (int) fd
+                 4,[   2] (short) events
+                 6,[   2] (short) revents
+          }
+
+      syntax: showstructpacking task
+    """
+    if not cmd_args:
+        raise ArgumentError("Please provide a type name.")
+
+    ty_name = cmd_args[0]
+    try:
+        sym = gettype(ty_name)
+    except NameError:
+        return O.error("Cannot find type named {0}", ty_name)
+
+    if sym.GetTypeClass() == lldb.eTypeClassTypedef:
+        sym = sym.GetCanonicalType()
+
+    if sym.GetTypeClass() not in _UnionStructClass:
+        return O.error("{0} is not a structure/union/class type", ty_name)
+
+    _showStructPacking(O, sym, 0)
 
# EndMacro: showstructinfo
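As a hedged illustration of the padding annotations this macro emits (the struct and its layout here are hypothetical, not taken from this commit), a type with an alignment hole renders roughly as:

    (lldb) showstructpacking my_padded_struct
    0000,[  16] struct my_padded_struct {
        0000,[   4] (int) a
        0004,[   4] (*** padding ***)
        0008,[   8] (void *) p
    }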
diff --git a/tools/lldbmacros/sysreg.py b/tools/lldbmacros/sysreg.py
new file mode 100755 (executable)
index 0000000..376a0e2
--- /dev/null
@@ -0,0 +1,190 @@
+""" Please make sure you read the README file COMPLETELY BEFORE reading anything below.
+    It is very critical that you read coding guidelines in Section E in README file.
+"""
+
+""" Note for adding new register support:
+    
+    1. Add target register to "supported registers" in the docstring of DecodeSysreg
+    2. Populate _SYSREG_TO_DECODE_FUNC_MAP with your implementation, optionally using
+       _SYSREG_TO_DOCNAME_MAP
+    3. Populate _SUPPORTED_SYSREGS list with target register
+    
+"""
+
+from xnu import *
+import os
+import sys
+import xml.etree.ElementTree as ET
+
+GREEN = '\033[0;32m'
+RED   = '\033[0;31m'
+NC    = '\033[0m'
+
+_SUPPORTED_SYSREGS = ['ESR_EL1']
+
+_SYSREG_DOC_PATH = os.path.dirname(os.path.abspath(__file__)) + '/sysregdoc/'
+
+_SYSREG_TO_DOCNAME_MAP = {
+    'ESR_EL1': 'AArch64-esr_el1.xml'
+}
+
+## Actual definition at the bottom of the file
+_SYSREG_TO_DECODE_FUNC_MAP = None
+
+# Macro: decode_sysreg
+@lldb_command('decode_sysreg')
+def DecodeSysreg(cmd_args=None):
+    """ Print out human-understandable explanation of a system register value
+        usage: decode_sysreg <sysreg> <value>
+        example: decode_sysreg esr_el1 0x96000021
+
+        supported registers:
+        ESR_EL1
+    """
+
+    ## For now, require exactly 2 arguments
+    if not cmd_args or len(cmd_args) != 2:
+        raise ArgumentError("Missing arguments.")
+
+    reg_name = cmd_args[0].upper()
+    reg_value = int(cmd_args[1], 0)
+
+    if reg_name not in _SUPPORTED_SYSREGS:
+        raise ArgumentError("{} is not supported".format(reg_name))
+
+    _SYSREG_TO_DECODE_FUNC_MAP[reg_name](reg_value)
+# EndMacro: decode_sysreg
+
+
+lldb_alias('decode_esr', 'decode_sysreg esr_el1')
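+## With the alias above, the docstring's example value decodes as follows: EC is
+## bits [31:26] and ISS is bits [24:0], so 0x96000021 yields EC == 0b100101 (a
+## Data Abort taken without a change in Exception level, per the XML spec below)
+## and ISS == 0x0000021. An abridged session (the exact explanatory text comes
+## from the parsed specification):
+##
+##     (lldb) decode_esr 0x96000021
+##     EC == 0b100101
+##     Data Abort taken without a change in Exception level.
+##     ...
+##     ISS == 0x0000021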
+
+
+def PrintEsrEl1Explanation(regval):
+    """ Print out a detailed explanation of regval regarded as the value of
+        ESR_EL1, by parsing the ARM machine-readable specification
+    """
+    xmlfilename = _SYSREG_DOC_PATH + _SYSREG_TO_DOCNAME_MAP['ESR_EL1']
+    tree = ET.parse(xmlfilename)
+    root = tree.getroot()
+
+    ec = (regval >> 26) & ((1 << 6) - 1)
+    ecstring = '0b{:06b}'.format(ec)
+
+    print _Colorify(VT.Green, 'EC == ' + ecstring)
+
+    ecxpath = './registers/register/reg_fieldsets/fields/field[@id="EC_31_26"]/field_values/field_value_instance[field_value="{}"]/field_value_description//para'.format(ecstring)
+    ec_desc_paras = root.findall(ecxpath)
+
+    ## Without a recognized EC we cannot look up the ISS encoding below
+    if not ec_desc_paras:
+        print 'EC not defined.'
+        print '\r\n'
+        return
+
+    for para in ec_desc_paras:
+        sys.stdout.write(para.text)
+        for child in para:
+            sys.stdout.write(_GetParaChildrenStr(child))
+            sys.stdout.write(child.tail or '')  ## tail may be None
+        print '\r\n'
+        print '\r\n'
+
+    iss = regval & ((1 << 25) - 1)
+    issstring = '0x{:07x}'.format(iss)
+    print _Colorify(VT.Green, 'ISS == ' + issstring)
+    print '\r\n'
+
+    iss_condition_xpath = './registers/register/reg_fieldsets/fields/field[@id="EC_31_26"]/field_values/field_value_instance[field_value="{}"]/field_value_links_to'.format(ecstring)
+    iss_condition = root.find(iss_condition_xpath)
+    iss_condition_str = iss_condition.attrib['linked_field_condition']
+
+    iss_fields_xpath = './registers/register/reg_fieldsets/fields/field[@id="ISS_24_0"]/partial_fieldset/fields[fields_instance="{}"]//field'.format(iss_condition_str)
+    iss_fields = root.findall(iss_fields_xpath)
+    
+    for field in iss_fields:
+        _PrintEsrIssField(field, regval)
+
+
+def _GetParaChildrenStr(elem):
+    """ Convert child tags of <para> element into text for printing
+    """
+
+    if elem.tag == 'binarynumber':
+        return elem.text
+    if elem.tag == 'arm-defined-word':
+        return elem.text
+    elif elem.tag == 'xref':
+        return elem.attrib['browsertext'].encode('utf-8')
+    elif elem.tag == 'register_link':
+        return elem.text
+    else:
+        return _Colorify(VT.Red, '*unsupported text*')
+
+
+def _PrintEsrIssField(elem, regval):
+    """ Print detailed explanation of the ISS field of ESR
+    """
+
+    field_name_str = elem.find('field_name').text
+    field_msb = int(elem.find('field_msb').text)
+    field_lsb = int(elem.find('field_lsb').text)
+    fd_before_paras = elem.findall('./field_description[@order="before"]//para')
+    fd_after_paras = elem.findall('./field_description[@order="after"]//para')
+
+    field_bits = field_msb - field_lsb + 1
+    field_value = (regval >> field_lsb) & ((1 << field_bits) - 1)
+    field_value_string = ('0b{:0' + '{}'.format(field_bits) + 'b}').format(field_value)
+
+    print _Colorify(VT.Green, _GetIndentedString(2, field_name_str) + ' == ' + field_value_string)
+
+    fv_desc_paras = elem.findall('./field_values/field_value_instance[field_value="{}"]/field_value_description//para'.format(field_value_string))
+
+    if fv_desc_paras:
+        for para in fv_desc_paras:
+            sys.stdout.write(_GetIndentedString(2, ''))
+            sys.stdout.write(para.text)
+            for child in para:
+                sys.stdout.write(_GetParaChildrenStr(child))
+                sys.stdout.write(child.tail or '')  ## tail may be None
+        print '\r\n'
+        print '\r\n'
+    else:
+        print _Colorify(VT.Red, _GetIndentedString(2, '(No matching value, dumping out full description)')) 
+        for para in fd_before_paras:
+            sys.stdout.write(_GetIndentedString(2, ''))
+            sys.stdout.write(para.text)
+            for child in para:
+                sys.stdout.write(_GetParaChildrenStr(child))
+                sys.stdout.write(child.tail or '')
+            print '\r\n'
+            print '\r\n'
+
+        ## Dump all possible values
+        all_field_values = elem.findall('./field_values/field_value_instance//field_value')
+        all_field_values_str = [fv.text for fv in all_field_values]
+        if all_field_values_str != []:
+            print _GetIndentedString(2, ', '.join(all_field_values_str))
+
+        for para in fd_after_paras:
+            sys.stdout.write(_GetIndentedString(2, ''))
+            sys.stdout.write(para.text)
+            for child in para:
+                sys.stdout.write(_GetParaChildrenStr(child))
+                sys.stdout.write(child.tail or '')
+            print '\r\n'
+            print '\r\n'
+
+
+def _GetIndentedString(indentation, msg):
+    """ Return `msg` indented by `indentation` number of spaces
+    """
+    return ' ' * indentation + msg
+
+
+def _Colorify(color, msg):
+    """ Return `msg` enclosed by color codes
+    """
+    return color + msg + VT.Reset
+
+
+_SYSREG_TO_DECODE_FUNC_MAP = {
+    'ESR_EL1': PrintEsrEl1Explanation
+}
diff --git a/tools/lldbmacros/sysregdoc/AArch64-esr_el1.xml b/tools/lldbmacros/sysregdoc/AArch64-esr_el1.xml
new file mode 100644 (file)
index 0000000..c24be2d
--- /dev/null
@@ -0,0 +1,6153 @@
+<?xml version='1.0' encoding='utf-8'?>
+<!DOCTYPE register_page SYSTEM "registers.dtd">
+<!-- Copyright (c) 2010-2018 Arm Limited or its affiliates. All rights reserved. -->
+<!-- This document is Non-Confidential. This document may only be used and distributed in accordance with the terms of the agreement entered into by Arm and the party that Arm delivered this document to. -->
+<?xml-stylesheet href="one_register.xsl" type="text/xsl" ?>
+
+
+
+
+
+
+<register_page>
+  <registers>
+    <register execution_state="AArch64" is_register="True" is_internal="True" is_banked="False" is_optional="False" is_stub_entry="False">
+      <reg_short_name>ESR_EL1</reg_short_name>
+      <reg_long_name>Exception Syndrome Register (EL1)</reg_long_name>
+      
+
+          <reg_reset_value></reg_reset_value>
+      <reg_mappings>
+          <reg_mapping>
+              
+            <mapped_name filename="AArch32-dfsr.xml">DFSR</mapped_name>
+            <mapped_type>Architectural</mapped_type>
+              <mapped_execution_state>AArch32</mapped_execution_state>
+              <mapped_from_startbit>31</mapped_from_startbit>
+              <mapped_from_endbit>0</mapped_from_endbit>
+
+              <mapped_to_startbit>31</mapped_to_startbit>
+              <mapped_to_endbit>0</mapped_to_endbit>
+
+          </reg_mapping>
+      </reg_mappings>
+      <reg_purpose>
+        
+    
+      <purpose_text>
+        <para>Holds syndrome information for an exception taken to EL1.</para>
+      </purpose_text>
+
+      </reg_purpose>
+      <reg_groups>
+            <reg_group>Exception and fault handling registers</reg_group>
+      </reg_groups>
+      <reg_usage_constraints>
+        
+
+      </reg_usage_constraints>
+      <reg_configuration>
+        
+
+      </reg_configuration>
+      <reg_attributes>
+          <attributes_text>
+            <para>ESR_EL1 is a 64-bit register.</para>
+          </attributes_text>
+      </reg_attributes>
+      <reg_fieldsets>
+        
+
+
+
+
+
+
+
+
+
+
+
+  <fields length="64">
+    <text_before_fields>
+      
+  <para>ESR_EL1 is made <arm-defined-word>UNKNOWN</arm-defined-word> as a result of an exception return from EL1.</para>
+<para>When an <arm-defined-word>UNPREDICTABLE</arm-defined-word> instruction is treated as <arm-defined-word>UNDEFINED</arm-defined-word>, and the exception is taken to EL1, the value of ESR_EL1 is <arm-defined-word>UNKNOWN</arm-defined-word>. The value written to ESR_EL1 must be consistent with a value that could be created as a result of an exception from the same Exception level that generated the exception as a result of a situation that is not <arm-defined-word>UNPREDICTABLE</arm-defined-word> at that Exception level, in order to avoid the possibility of a privilege violation.</para>
+
+    </text_before_fields>
+    
+        <field 
+           id="0_63_32" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>63</field_msb>
+        <field_lsb>32</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="EC_31_26" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="True" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>EC</field_name>
+        <field_msb>31</field_msb>
+        <field_lsb>26</field_lsb>
+        <field_description order="before">
+          
+  <para>Exception Class. Indicates the reason for the exception that this register holds information about.</para>
+<para>For each EC value, the table references a subsection that gives information about:</para>
+<list type="unordered">
+<listitem><content>The cause of the exception, for example the configuration required to enable the trap.</content>
+</listitem><listitem><content>The encoding of the associated ISS.</content>
+</listitem></list>
+<para>Possible values of the EC field are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+          <field_value>0b000000</field_value>
+        <field_value_description>
+  <para>Unknown reason.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exceptions with an unknown reason"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b000001</field_value>
+        <field_value_description>
+  <para>Trapped WFI or WFE instruction execution.</para>
+<para>Conditional WFE and WFI instructions that fail their condition code check do not cause an exception.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a WFI or WFE instruction"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b000011</field_value>
+        <field_value_description>
+  <para>Trapped MCR or MRC access with (coproc==<binarynumber>0b1111</binarynumber>) that is not reported using EC <binarynumber>0b000000</binarynumber>.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an MCR or MRC access"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b000100</field_value>
+        <field_value_description>
+  <para>Trapped MCRR or MRRC access with (coproc==<binarynumber>0b1111</binarynumber>) that is not reported using EC <binarynumber>0b000000</binarynumber>.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an MCRR or MRRC access"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b000101</field_value>
+        <field_value_description>
+  <para>Trapped MCR or MRC access with (coproc==<binarynumber>0b1110</binarynumber>).</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an MCR or MRC access"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b000110</field_value>
+        <field_value_description>
+  <para>Trapped LDC or STC access.</para>
+<para>The only architected uses of these instruction are:</para>
+<list type="unordered">
+<listitem><content>An STC to write data to memory from <register_link state="AArch32" id="AArch32-dbgdtrrxint.xml">DBGDTRRXint</register_link>.</content>
+</listitem><listitem><content>An LDC to read data from memory to <register_link state="AArch32" id="AArch32-dbgdtrtxint.xml">DBGDTRTXint</register_link>.</content>
+</listitem></list>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an LDC or STC instruction"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b000111</field_value>
+        <field_value_description>
+  <para>Access to SVE, Advanced SIMD, or floating-point functionality trapped by <register_link state="AArch64" id="AArch64-cpacr_el1.xml">CPACR_EL1</register_link>.FPEN, <register_link state="AArch64" id="AArch64-cptr_el2.xml">CPTR_EL2</register_link>.FPEN, <register_link state="AArch64" id="AArch64-cptr_el2.xml">CPTR_EL2</register_link>.TFP, or <register_link state="AArch64" id="AArch64-cptr_el3.xml">CPTR_EL3</register_link>.TFP control.</para>
+<para>Excludes exceptions resulting from <register_link state="AArch64" id="AArch64-cpacr_el1.xml">CPACR_EL1</register_link> when the value of <register_link state="AArch64" id="AArch64-hcr_el2.xml">HCR_EL2</register_link>.TGE is 1, or because SVE or Advanced SIMD and floating-point are not implemented. These are reported with EC value <binarynumber>0b000000</binarynumber> as described in <xref linkend="CHDJCBHE" browsertext="'EC encodings when routing exceptions to EL2' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1.10.4" filename="D_the_aarch64_system_level_programmers_model"/>.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an access to SVE, Advanced SIMD or floating-point functionality, resulting from CPACR_EL1.FPEN, CPTR_EL2.FPEN or CPTR_ELx.TFP"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b001100</field_value>
+        <field_value_description>
+  <para>Trapped MRRC access with (coproc==<binarynumber>0b1110</binarynumber>).</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an MCRR or MRRC access"/>
+    </field_value_instance>
+                  <field_value_instance>
+          <field_value>0b001101</field_value>
+        <field_value_description>
+  <para>Branch Target Exception.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from Branch Target Identification instruction"/>
+            <field_value_condition>When ARMv8.5-BTI is implemented</field_value_condition>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b001110</field_value>
+        <field_value_description>
+  <para>Illegal Execution state.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an Illegal Execution state, or a PC or SP alignment fault"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b010001</field_value>
+        <field_value_description>
+  <para>SVC instruction execution in AArch32 state.</para>
+<para>This is reported in ESR_EL2 only when the exception is generated because the value of <register_link state="AArch64" id="AArch64-hcr_el2.xml">HCR_EL2</register_link>.TGE is 1.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from HVC or SVC instruction execution"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b010101</field_value>
+        <field_value_description>
+  <para>SVC instruction execution in AArch64 state.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from HVC or SVC instruction execution"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b011000</field_value>
+        <field_value_description>
+  <para>Trapped MSR, MRS or System instruction execution in AArch64 state, that is not reported using EC <binarynumber>0b000000</binarynumber>, <binarynumber>0b000001</binarynumber> or <binarynumber>0b000111</binarynumber>.</para>
+<para>If <xref browsertext="ARMv8.4-IDST" filename="A_introduction_to_the_armv8_architecture.fm" linkend="v8.4.IDST"></xref> is implemented, also exceptions generated on a read of an ID register.</para>
+<para>If <xref browsertext="ARMv8.0-CSV2" filename="A_introduction_to_the_armv8_architecture.fm" linkend="v8.0.CSV2"></xref> is implemented, also Cache Speculation Variant exceptions.</para>
+<para>This includes all instructions that cause exceptions that are part of the encoding space defined in <xref linkend="BEIJIEIE" browsertext="'System instruction class encoding overview' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section C5.2.2" filename="C_the_a64_system_instruction_class"/>, except for those exceptions reported using EC values <binarynumber>0b000000</binarynumber>, <binarynumber>0b000001</binarynumber>, or <binarynumber>0b000111</binarynumber>.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from MSR, MRS, or System instruction execution in AArch64 state"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b011001</field_value>
+        <field_value_description>
+  <para>Access to SVE functionality trapped as a result of <register_link state="AArch64" id="AArch64-cpacr_el1.xml">CPACR_EL1</register_link>.ZEN, <register_link state="AArch64" id="AArch64-cptr_el2.xml">CPTR_EL2</register_link>.ZEN, <register_link state="AArch64" id="AArch64-cptr_el2.xml">CPTR_EL2</register_link>.TZ, or <register_link state="AArch64" id="AArch64-cptr_el3.xml">CPTR_EL3</register_link>.EZ, that is not reported using EC <binarynumber>0b000000</binarynumber>.</para>
+<para>This EC is defined only if <xref linkend="SVE" browsertext="SVE" filename="A_introduction_to_the_armv8_architecture.fm"/> is implemented.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an access to SVE functionality, resulting from CPACR_EL1.ZEN, CPTR_EL2.ZEN, CPTR_EL2.TZ, or CPTR_EL3.EZ"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b100000</field_value>
+        <field_value_description>
+  <para>Instruction Abort from a lower Exception level, that might be using AArch32 or AArch64.</para>
+<para>Used for MMU faults generated by instruction accesses and synchronous External aborts, including synchronous parity or ECC errors. Not used for debug related exceptions.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an Instruction Abort"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b100001</field_value>
+        <field_value_description>
+  <para>Instruction Abort taken without a change in Exception level.</para>
+<para>Used for MMU faults generated by instruction accesses and synchronous External aborts, including synchronous parity or ECC errors. Not used for debug related exceptions.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an Instruction Abort"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b100010</field_value>
+        <field_value_description>
+  <para>PC alignment fault exception.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an Illegal Execution state, or a PC or SP alignment fault"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b100100</field_value>
+        <field_value_description>
+  <para>Data Abort from a lower Exception level, that might be using AArch32 or AArch64.</para>
+<para>Used for MMU faults generated by data accesses, alignment faults other than those caused by Stack Pointer misalignment, and synchronous External aborts, including synchronous parity or ECC errors. Not used for debug related exceptions.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a Data Abort"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b100101</field_value>
+        <field_value_description>
+  <para>Data Abort taken without a change in Exception level.</para>
+<para>Used for MMU faults generated by data accesses, alignment faults other than those caused by Stack Pointer misalignment, and synchronous External aborts, including synchronous parity or ECC errors. Not used for debug related exceptions.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a Data Abort"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b100110</field_value>
+        <field_value_description>
+  <para>SP alignment fault exception.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from an Illegal Execution state, or a PC or SP alignment fault"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b101000</field_value>
+        <field_value_description>
+  <para>Trapped floating-point exception taken from AArch32 state.</para>
+<para>This EC value is valid if the implementation supports trapping of floating-point exceptions, otherwise it is reserved. Whether a floating-point implementation supports trapping of floating-point exceptions is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word>.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a trapped floating-point exception"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b101100</field_value>
+        <field_value_description>
+  <para>Trapped floating-point exception taken from AArch64 state.</para>
+<para>This EC value is valid if the implementation supports trapping of floating-point exceptions, otherwise it is reserved. Whether a floating-point implementation supports trapping of floating-point exceptions is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word>.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a trapped floating-point exception"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b101111</field_value>
+        <field_value_description>
+  <para>SError interrupt.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="SError interrupt"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b110000</field_value>
+        <field_value_description>
+  <para>Breakpoint exception from a lower Exception level,  that might be using AArch32 or AArch64.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a Breakpoint or Vector Catch debug exception"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b110001</field_value>
+        <field_value_description>
+  <para>Breakpoint exception taken without a change in Exception level.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a Breakpoint or Vector Catch debug exception"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b110010</field_value>
+        <field_value_description>
+  <para>Software Step exception from a lower Exception level,  that might be using AArch32 or AArch64.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a Software Step exception"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b110011</field_value>
+        <field_value_description>
+  <para>Software Step exception taken without a change in Exception level.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a Software Step exception"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b110100</field_value>
+        <field_value_description>
+  <para>Watchpoint exception from a lower Exception level,  that might be using AArch32 or AArch64.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a Watchpoint exception"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b110101</field_value>
+        <field_value_description>
+  <para>Watchpoint exception taken without a change in Exception level.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from a Watchpoint exception"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b111000</field_value>
+        <field_value_description>
+  <para>BKPT instruction execution in AArch32 state.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from execution of a Breakpoint instruction"/>
+    </field_value_instance>
+                <field_value_instance>
+          <field_value>0b111100</field_value>
+        <field_value_description>
+  <para>BRK instruction execution in AArch64 state.</para>
+<para>This is reported in <register_link state="AArch64" id="AArch64-esr_el3.xml">ESR_EL3</register_link> only if a BRK instruction is executed.</para>
+</field_value_description>
+                <field_value_links_to linked_field_name="ISS" linked_field_condition="Exception from execution of a Breakpoint instruction"/>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>All other EC values are reserved by Arm, and:</para>
+<list type="unordered">
+<listitem><content>Unused values in the range <binarynumber>0b000000</binarynumber> - <binarynumber>0b101100</binarynumber> (<hexnumber>0x00</hexnumber> - <hexnumber>0x2C</hexnumber>) are reserved for future use for synchronous exceptions.</content>
+</listitem><listitem><content>Unused values in the range <binarynumber>0b101101</binarynumber> - <binarynumber>0b111111</binarynumber> (<hexnumber>0x2D</hexnumber> - <hexnumber>0x3F</hexnumber>) are reserved for future use, and might be used for synchronous or asynchronous exceptions.</content>
+</listitem></list>
+<para>The effect of programming this field to a reserved value is that behavior is <arm-defined-word>CONSTRAINED UNPREDICTABLE</arm-defined-word>, as described in <xref linkend="CEGHGHJI" browsertext="'Reserved values in System and memory-mapped registers and translation table entries' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section K1.1.11"/>.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
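+<!-- Editorial worked example (not part of the Arm source XML): with field_msb=31
+     and field_lsb=26 as defined above, the generic extraction in sysreg.py
+     computes, for the example ESR value 0x96000021:
+         field_bits  = 31 - 26 + 1                          (EC is 6 bits wide)
+         field_value = (0x96000021 >> 26) & ((1 << field_bits) - 1)
+                     = 0b100101, i.e. "Data Abort taken without a change in
+                       Exception level" in the value list above. -->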
+        <field 
+           id="IL_25_25" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>IL</field_name>
+        <field_msb>25</field_msb>
+        <field_lsb>25</field_lsb>
+        <field_description order="before">
+          
+  <para>Instruction Length for synchronous exceptions. Possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>16-bit instruction trapped.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <list type="unordered">
+<listitem><content>
+<para>An SError interrupt.</para>
+</content>
+</listitem><listitem><content>
+<para>An Instruction Abort exception.</para>
+</content>
+</listitem><listitem><content>
+<para>A PC alignment fault exception.</para>
+</content>
+</listitem><listitem><content>
+<para>An SP alignment fault exception.</para>
+</content>
+</listitem><listitem><content>
+<para>A Data Abort exception for which the value of the ISV bit is 0.</para>
+</content>
+</listitem><listitem><content>
+<para>An Illegal Execution state exception.</para>
+</content>
+</listitem><listitem><content>
+<para>Any debug exception except for Breakpoint instruction exceptions. For Breakpoint instruction exceptions, this bit has its standard meaning:</para>
+<list type="unordered">
+<listitem><content>
+<para><binarynumber>0b0</binarynumber>: 16-bit T32 BKPT instruction.</para>
+</content>
+</listitem><listitem><content>
+<para><binarynumber>0b1</binarynumber>: 32-bit A32 BKPT instruction or A64 BRK instruction.</para>
+</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>
+<para>An exception reported using EC value <binarynumber>0b000000</binarynumber>.</para>
+</content>
+</listitem></list>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="ISS_24_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="True" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>ISS</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Instruction Specific Syndrome. Architecturally, this field can be defined independently for each defined Exception class. However, in practice, some ISS encodings are used for more than one Exception class.</para>
+<para>Typically, an ISS encoding has a number of subfields. When an ISS subfield holds a register number, the value returned in that field is the AArch64 view of the register number. For an exception taken from AArch32 state, <xref linkend="BEIDFCCE" browsertext="'Mapping of the general-purpose registers between the Execution states' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1.20.1,"/> defines this view of the specified AArch32 register. If the AArch32 register descriptor is <binarynumber>0b1111</binarynumber>, then:</para>
+<list type="unordered">
+<listitem><content>If the instruction that generated the exception was not <arm-defined-word>UNPREDICTABLE</arm-defined-word>, the field takes the value <binarynumber>0b11111</binarynumber>.</content>
+</listitem><listitem><content>If the instruction that generated the exception was <arm-defined-word>UNPREDICTABLE</arm-defined-word>, the field takes an <arm-defined-word>UNKNOWN</arm-defined-word> value that must be either:<list type="unordered">
+<listitem><content>The AArch64 view of the register number of a register that might have been used at the Exception level from which the exception was taken.</content>
+</listitem><listitem><content>The value <binarynumber>0b11111</binarynumber>.</content>
+</listitem></list>
+</content>
+</listitem></list>
+<para>When the EC field is <binarynumber>0b000000</binarynumber>, indicating an exception with an unknown reason, the ISS field is not valid, <arm-defined-word>RES0</arm-defined-word>.</para>
+
+        </field_description>
+        <field_values>
+             
+               <field_value_name>I</field_value_name>
+        </field_values>
+          <field_resets>
+  
+</field_resets>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exceptions with an unknown reason</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+    <text_after_fields>
+    
+  <para>When an exception is reported using this EC code the IL field is set to 1.</para>
+<para>This EC code is used for all exceptions that are not covered by any other EC value. This includes exceptions that are generated in the following situations:</para>
+<list type="unordered">
+<listitem><content>The attempted execution of an instruction bit pattern that has no allocated instruction at the current Exception level and Security state, including:<list type="unordered">
+<listitem><content>A read access using a System register pattern that is not allocated for reads at the current Exception level and Security state.</content>
+</listitem><listitem><content>A write access using a System register pattern that is not allocated for writes at the current Exception level and Security state.</content>
+</listitem><listitem><content>Instruction encodings for instructions not implemented in the implementation.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>In Debug state, the attempted execution of an instruction bit pattern that is unallocated in Debug state.</content>
+</listitem><listitem><content>In Non-debug state, the attempted execution of an instruction bit pattern that is unallocated in Non-debug state.</content>
+</listitem><listitem><content>In AArch32 state, attempted execution of a short vector floating-point instruction.</content>
+</listitem><listitem><content>In an implementation that does not include Advanced SIMD and floating-point functionality, an attempted access to Advanced SIMD or floating-point functionality under conditions where that access would be permitted if that functionality was present. This includes the attempted execution of an Advanced SIMD or floating-point instruction, and attempted accesses to Advanced SIMD and floating-point System registers.</content>
+</listitem><listitem><content>An exception generated because of the value of one of the <register_link state="AArch64" id="AArch64-sctlr_el1.xml">SCTLR_EL1</register_link>.{ITD, SED, CP15BEN} control bits.</content>
+</listitem><listitem><content>Attempted execution of:<list type="unordered">
+<listitem><content>An HVC instruction when disabled by <register_link state="AArch64" id="AArch64-hcr_el2.xml">HCR_EL2</register_link>.HCD or <register_link state="AArch64" id="AArch64-scr_el3.xml">SCR_EL3</register_link>.HCE.</content>
+</listitem><listitem><content>An SMC instruction when disabled by <register_link state="AArch64" id="AArch64-scr_el3.xml">SCR_EL3</register_link>.SMD.</content>
+</listitem><listitem><content>An HLT instruction when disabled by <register_link state="ext" id="ext-edscr.xml">EDSCR</register_link>.HDE.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>Attempted execution of an MSR or MRS instruction to access <register_link state="AArch64" id="AArch64-sp_el0.xml">SP_EL0</register_link> when the value of <register_link state="AArch64" id="AArch64-spsel.xml">SPSel</register_link>.SP is 0.</content>
+</listitem><listitem><content>Attempted execution, in Debug state, of:<list type="unordered">
+<listitem><content>A DCPS1 instruction when the value of <register_link state="AArch64" id="AArch64-hcr_el2.xml">HCR_EL2</register_link>.TGE is 1 and EL2 is disabled or not implemented in the current Security state.</content>
+</listitem><listitem><content>A DCPS2 instruction from EL1 or EL0 when EL2 is disabled or not implemented in the current Security state.</content>
+</listitem><listitem><content>A DCPS3 instruction when the value of <register_link state="ext" id="ext-edscr.xml">EDSCR</register_link>.SDD is 1, or when EL3 is not implemented.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>When EL3 is using AArch64, attempted execution from Secure EL1 of an SRS instruction using R13_mon. See <xref linkend="CHDJIEBG" browsertext="'Traps to EL3 of monitor functionality from Secure EL1 using AArch32' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content>In Debug state when the value of <register_link state="ext" id="ext-edscr.xml">EDSCR</register_link>.SDD is 1, the attempted execution at EL2, EL1, or EL0 of an instruction that is configured to trap to EL3.</content>
+</listitem><listitem><content>In AArch32 state, the attempted execution of an MRS (banked register) or an MSR (banked register) instruction to SPSR_mon, SP_mon, or LR_mon.</content>
+</listitem><listitem><content>An exception that is taken to EL2 because the value of <register_link state="AArch64" id="AArch64-hcr_el2.xml">HCR_EL2</register_link>.TGE is 1 that, if the value of <register_link state="AArch64" id="AArch64-hcr_el2.xml">HCR_EL2</register_link>.TGE was 0 would have been reported with an ESR_ELx.EC value of <binarynumber>0b000111</binarynumber>.</content>
+</listitem><listitem><content>When SVE is not implemented, attempted execution of:<list type="unordered">
+<listitem><content>An SVE instruction.</content>
+</listitem><listitem><content>An MSR or MRS instruction to access <register_link state="AArch64" id="AArch64-zcr_el1.xml">ZCR_EL1</register_link>, <register_link state="AArch64" id="AArch64-zcr_el2.xml">ZCR_EL2</register_link>, or <register_link state="AArch64" id="AArch64-zcr_el3.xml">ZCR_EL3</register_link>.</content>
+</listitem></list>
+</content>
+</listitem></list>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+
+        <fieldat id="0_24_0" msb="24" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from a WFI or WFE instruction</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="CV_24_24" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CV</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>24</field_lsb>
+        <field_description order="before">
+          
+  <para>Condition code valid. Possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The COND field is not valid.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The COND field is valid.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For exceptions taken from AArch64, CV is set to 1.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1.</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether CV is set to 1 or set to 0. See the description of the COND field for more information.</content>
+</listitem></list>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="COND_23_20" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>COND</field_name>
+        <field_msb>23</field_msb>
+        <field_lsb>20</field_lsb>
+        <field_description order="before">
+          
+  <para>The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1.</para>
+<para>For exceptions taken from AArch64, this field is set to <binarynumber>0b1110</binarynumber>.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1 and:<list type="unordered">
+<listitem><content>If the instruction is conditional, COND is set to the condition code field value from the instruction.</content>
+</listitem><listitem><content>If the instruction is unconditional, COND is set to <binarynumber>0b1110</binarynumber>.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>A conditional A32 instruction that is known to pass its condition code check can be presented either:<list type="unordered">
+<listitem><content>With COND set to <binarynumber>0b1110</binarynumber>, the value for unconditional.</content>
+</listitem><listitem><content>With the COND value held in the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether:<list type="unordered">
+<listitem><content>CV is set to 0 and COND is set to an <arm-defined-word>UNKNOWN</arm-defined-word> value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction.</content>
+</listitem><listitem><content>CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether the COND field is set to <binarynumber>0b1110</binarynumber>, or to the value of any condition that applied to the instruction.</content>
+</listitem></list>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_19_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>19</field_msb>
+        <field_lsb>1</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="TI_0_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>TI</field_name>
+        <field_msb>0</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Trapped instruction. Possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>WFI trapped.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>WFE trapped.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>The following sections describe configuration settings for generating this exception:</para>
+<list type="unordered">
+<listitem><content><xref linkend="D1CHDJGAIC" browsertext="'Controls for exceptions taken to EL1 using AArch64' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIBHJCJ" browsertext="'Traps to EL2 of Non-secure EL1 and EL0 execution of WFE and WFI instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDEGCIJ" browsertext="'Traps to EL3 of EL2, EL1, and EL0 execution of WFE and WFI instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem></list>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+        
+        
+        
+        
+        
+        
+
+        <fieldat id="CV_24_24" msb="24" lsb="24"/>
+        <fieldat id="COND_23_20" msb="23" lsb="20"/>
+        <fieldat id="0_19_1" msb="19" lsb="1"/>
+        <fieldat id="TI_0_0" msb="0" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from an MCR or MRC access</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="CV_24_24" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CV</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>24</field_lsb>
+        <field_description order="before">
+          
+  <para>Condition code valid. Possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The COND field is not valid.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The COND field is valid.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For exceptions taken from AArch64, CV is set to 1.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1.</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether CV is set to 1 or set to 0. See the description of the COND field for more information.</content>
+</listitem></list>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="COND_23_20" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>COND</field_name>
+        <field_msb>23</field_msb>
+        <field_lsb>20</field_lsb>
+        <field_description order="before">
+          
+  <para>The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1.</para>
+<para>For exceptions taken from AArch64, this field is set to <binarynumber>0b1110</binarynumber>.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1 and:<list type="unordered">
+<listitem><content>If the instruction is conditional, COND is set to the condition code field value from the instruction.</content>
+</listitem><listitem><content>If the instruction is unconditional, COND is set to <binarynumber>0b1110</binarynumber>.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>A conditional A32 instruction that is known to pass its condition code check can be presented either:<list type="unordered">
+<listitem><content>With COND set to <binarynumber>0b1110</binarynumber>, the value for unconditional.</content>
+</listitem><listitem><content>With the COND value held in the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether:<list type="unordered">
+<listitem><content>CV is set to 0 and COND is set to an <arm-defined-word>UNKNOWN</arm-defined-word> value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction.</content>
+</listitem><listitem><content>CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether the COND field is set to <binarynumber>0b1110</binarynumber>, or to the value of any condition that applied to the instruction.</content>
+</listitem></list>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Opc2_19_17" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Opc2</field_name>
+        <field_msb>19</field_msb>
+        <field_lsb>17</field_lsb>
+        <field_description order="before">
+          
+  <para>The Opc2 value from the issued instruction.</para>
+<para>For a trapped VMRS access, holds the value <binarynumber>0b000</binarynumber>.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Opc1_16_14" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Opc1</field_name>
+        <field_msb>16</field_msb>
+        <field_lsb>14</field_lsb>
+        <field_description order="before">
+          
+  <para>The Opc1 value from the issued instruction.</para>
+<para>For a trapped VMRS access, holds the value <binarynumber>0b111</binarynumber>.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="CRn_13_10" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CRn</field_name>
+        <field_msb>13</field_msb>
+        <field_lsb>10</field_lsb>
+        <field_description order="before">
+          
+  <para>The CRn value from the issued instruction.</para>
+<para>For a trapped VMRS access, holds the reg field from the VMRS instruction encoding.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Rt_9_5" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Rt</field_name>
+        <field_msb>9</field_msb>
+        <field_lsb>5</field_lsb>
+        <field_description order="before">
+          
+  <para>The Rt value from the issued instruction, the general-purpose register used for the transfer. The reported value gives the AArch64 view of the register. See <xref linkend="BEIDFCCE" browsertext="'Mapping of the general-purpose registers between the Execution states' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1.20.1"/>.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="CRm_4_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CRm</field_name>
+        <field_msb>4</field_msb>
+        <field_lsb>1</field_lsb>
+        <field_description order="before">
+          
+  <para>The CRm value from the issued instruction.</para>
+<para>For a trapped VMRS access, holds the value <binarynumber>0b0000</binarynumber>.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Direction_0_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Direction</field_name>
+        <field_msb>0</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Indicates the direction of the trapped instruction. The possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Write to System register space. MCR instruction.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Read from System register space. MRC or VMRS instruction.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>The following sections describe configuration settings for generating exceptions that are reported using EC value <binarynumber>0b000011</binarynumber>:</para>
+<list type="unordered">
+<listitem><content><xref linkend="CHDJFHAI" browsertext="'Traps to EL1 of EL0 accesses to the Generic Timer registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDGDIEA" browsertext="'Traps to EL1 of EL0 accesses to Performance Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEICGACA" browsertext="'Traps to EL1 of EL0 accesses to Activity Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIGHBDF" browsertext="'Traps to EL2 of Non-secure EL1 accesses to virtual memory control registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEICAEBH" browsertext="'Traps to EL2 of Non-secure EL1 execution of TLB maintenance instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEICHIHA" browsertext="'Traps to EL2 of Non-secure EL1 and EL0 execution of cache maintenance instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIFCHFF" browsertext="'Traps to EL2 of Non-secure EL1 and EL0 accesses to the Auxiliary Control Register' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEIDHFBB" browsertext="'Traps to EL2 of Non-secure EL1 and EL0 accesses to lockdown, DMA, and TCM operations' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIEEJIA" browsertext="'Traps to EL2 of Non-secure EL1 and EL0 accesses to the ID registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIFFJII" browsertext="'Trapping to EL2 of Non-secure EL1 accesses to the CPACR_EL1 or CPACR' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIDJJAH" browsertext="'Generic trapping to EL2 of Non-secure EL1 and EL0 accesses to System registers, from AArch32 state only' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/></content>
+</listitem><listitem><content><xref linkend="CHDFFCIB" browsertext="'Traps to EL2 of Non-secure EL0 and EL1 accesses to the Generic Timer registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/></content>
+</listitem><listitem><content><xref linkend="D1BEICIHGI" browsertext="'Traps to EL2 of Non-secure EL1 and EL0 accesses to Performance Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEIGCEGJ" browsertext="'Traps to EL2 of EL1 and EL0 accesses to Activity Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="CHDJIEBG" browsertext="'Traps to EL3 of Secure monitor functionality from Secure EL1 using AArch32' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDJACCE" browsertext="'Trapping to EL3 of EL2 accesses to the CPTR_EL2 or HCPTR, and EL2 and EL1 accesses to the CPACR_EL1 or CPACR' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDGGBGH" browsertext="'Traps to EL3 of EL2, EL1, and EL0 accesses to Performance Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEIHHEIC" browsertext="'Traps to EL3 of EL2, EL1, and EL0 accesses to Activity Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem></list>
+<para>The following sections describe configuration settings for generating exceptions that are reported using EC value <binarynumber>0b000101</binarynumber>:</para>
+<list type="unordered">
+<listitem><content><xref linkend="D1CHDBAICA" browsertext="'Traps to EL1 of EL0 and EL1 System register accesses to the trace registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDIGAJA" browsertext="'Traps to EL1 of EL0 accesses to the Debug Communications Channel (DCC) registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIEEJIA" browsertext="'Traps to EL2 of Non-secure EL1 and EL0 accesses to the ID registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>, for trapped accesses to the JIDR.</content>
+</listitem><listitem><content><xref linkend="D1BEIIHJHA" browsertext="'Traps to EL2 of Non-secure system register accesses to the trace registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIHBBIC" browsertext="'Trapping System register accesses to Debug ROM registers to EL2' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIECEGJ" browsertext="'Trapping System register accesses to OS-related debug registers to EL2' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEICAABI" browsertext="'Trapping general System register accesses to debug registers to EL2' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDJICAB" browsertext="'Traps to EL3 of all System register accesses to the trace registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDDHIIA" browsertext="'Trapping System register accesses to OS-related debug registers to EL3' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDCBEHG" browsertext="'Trapping general System register accesses to debug registers to EL3' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem></list>
+<para><xref linkend="D1BEIEEJIA" browsertext="'Traps to EL2 of Non-secure EL1 and EL0 accesses to the ID registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1," filename="D_the_aarch64_system_level_programmers_model"/> describes configuration settings for generating exceptions that are reported using EC value <binarynumber>0b001000</binarynumber>.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fieldat id="CV_24_24" msb="24" lsb="24"/>
+        <fieldat id="COND_23_20" msb="23" lsb="20"/>
+        <fieldat id="Opc2_19_17" msb="19" lsb="17"/>
+        <fieldat id="Opc1_16_14" msb="16" lsb="14"/>
+        <fieldat id="CRn_13_10" msb="13" lsb="10"/>
+        <fieldat id="Rt_9_5" msb="9" lsb="5"/>
+        <fieldat id="CRm_4_1" msb="4" lsb="1"/>
+        <fieldat id="Direction_0_0" msb="0" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
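+<!--
+  Illustrative sketch, not part of the Arm machine-readable specification:
+  one way trap-handling software might unpack the MCR/MRC ISS layout that
+  the fields above describe (EC 0b000011). All type and function names are
+  hypothetical; only the bit positions are taken from the field definitions.
+
+  #include <stdbool.h>
+  #include <stdint.h>
+
+  struct mcr_mrc_iss {
+      bool    cv;       /* ISS[24]    condition code valid                    */
+      uint8_t cond;     /* ISS[23:20] condition code, meaningful when cv set  */
+      uint8_t opc2;     /* ISS[19:17] Opc2 from the instruction               */
+      uint8_t opc1;     /* ISS[16:14] Opc1 from the instruction               */
+      uint8_t crn;      /* ISS[13:10] CRn from the instruction                */
+      uint8_t rt;       /* ISS[9:5]   transfer register, AArch64 view         */
+      uint8_t crm;      /* ISS[4:1]   CRm from the instruction                */
+      bool    is_read;  /* ISS[0]     0 = MCR (write), 1 = MRC or VMRS (read) */
+  };
+
+  static struct mcr_mrc_iss decode_mcr_mrc_iss(uint32_t iss)
+  {
+      struct mcr_mrc_iss d;
+      d.cv      = (iss >> 24) & 0x1;
+      d.cond    = (iss >> 20) & 0xf;
+      d.opc2    = (iss >> 17) & 0x7;
+      d.opc1    = (iss >> 14) & 0x7;
+      d.crn     = (iss >> 10) & 0xf;
+      d.rt      = (iss >> 5)  & 0x1f;
+      d.crm     = (iss >> 1)  & 0xf;
+      d.is_read = (iss >> 0)  & 0x1;
+      return d;
+  }
+
+  Per the descriptions above, a trapped VMRS decodes with opc1 == 0b111,
+  opc2 == 0b000, crm == 0b0000, crn == the VMRS reg field, and is_read set.
+-->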
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from an MCRR or MRRC access</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="CV_24_24" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CV</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>24</field_lsb>
+        <field_description order="before">
+          
+  <para>Condition code valid. Possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The COND field is not valid.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The COND field is valid.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For exceptions taken from AArch64, CV is set to 1.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1.</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether CV is set to 1 or set to 0. See the description of the COND field for more information.</content>
+</listitem></list>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="COND_23_20" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>COND</field_name>
+        <field_msb>23</field_msb>
+        <field_lsb>20</field_lsb>
+        <field_description order="before">
+          
+  <para>The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1.</para>
+<para>For exceptions taken from AArch64, this field is set to <binarynumber>0b1110</binarynumber>.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1 and:<list type="unordered">
+<listitem><content>If the instruction is conditional, COND is set to the condition code field value from the instruction.</content>
+</listitem><listitem><content>If the instruction is unconditional, COND is set to <binarynumber>0b1110</binarynumber>.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>A conditional A32 instruction that is known to pass its condition code check can be presented either:<list type="unordered">
+<listitem><content>With COND set to <binarynumber>0b1110</binarynumber>, the value for unconditional.</content>
+</listitem><listitem><content>With the COND value held in the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether:<list type="unordered">
+<listitem><content>CV is set to 0 and COND is set to an <arm-defined-word>UNKNOWN</arm-defined-word> value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction.</content>
+</listitem><listitem><content>CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether the COND field is set to <binarynumber>0b1110</binarynumber>, or to the value of any condition that applied to the instruction.</content>
+</listitem></list>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Opc1_19_16" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Opc1</field_name>
+        <field_msb>19</field_msb>
+        <field_lsb>16</field_lsb>
+        <field_description order="before">
+          
+  <para>The Opc1 value from the issued instruction.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_15_15" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>15</field_msb>
+        <field_lsb>15</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="Rt2_14_10" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Rt2</field_name>
+        <field_msb>14</field_msb>
+        <field_lsb>10</field_lsb>
+        <field_description order="before">
+          
+  <para>The Rt2 value from the issued instruction, the second general-purpose register used for the transfer. The reported value gives the AArch64 view of the register. See <xref linkend="BEIDFCCE" browsertext="'Mapping of the general-purpose registers between the Execution states' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1.20.1"/>.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Rt_9_5" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Rt</field_name>
+        <field_msb>9</field_msb>
+        <field_lsb>5</field_lsb>
+        <field_description order="before">
+          
+  <para>The Rt value from the issued instruction, the first general-purpose register used for the transfer. The reported value gives the AArch64 view of the register. See <xref linkend="BEIDFCCE" browsertext="'Mapping of the general-purpose registers between the Execution states' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1.20.1"/>.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="CRm_4_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CRm</field_name>
+        <field_msb>4</field_msb>
+        <field_lsb>1</field_lsb>
+        <field_description order="before">
+          
+  <para>The CRm value from the issued instruction.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Direction_0_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Direction</field_name>
+        <field_msb>0</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Indicates the direction of the trapped instruction. The possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Write to System register space. MCRR instruction.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Read from System register space. MRRC instruction.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>The following sections describe configuration settings for generating exceptions that are reported using EC value <binarynumber>0b000100</binarynumber>:</para>
+<list type="unordered">
+<listitem><content><xref linkend="CHDJFHAI" browsertext="'Traps to EL1 of EL0 accesses to the Generic Timer registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDGDIEA" browsertext="'Traps to EL1 of EL0 accesses to Performance Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEICGACA" browsertext="'Traps to EL1 of EL0 accesses to Activity Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIGHBDF" browsertext="'Traps to EL2 of Non-secure EL1 accesses to virtual memory control registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIDJJAH" browsertext="'General trapping to EL2 of Non-secure EL0 and EL1 accesses to System registers, from AArch32 state only' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="CHDFFCIB" browsertext="'Traps to EL2 of Non-secure EL0 and EL1 accesses to the Generic Timer registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEICIHGI" browsertext="'Traps to EL2 of Non-secure EL0 and EL1 accesses to Performance Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEIGCEGJ" browsertext="'Traps to EL2 of EL1 and EL0 accesses to Activity Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDGGBGH" browsertext="'Traps to EL3 of EL2, EL1, and EL0 accesses to Performance Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEIHHEIC" browsertext="'Traps to EL3 of EL2, EL1, and EL0 accesses to Activity Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem></list>
+<para>The following sections describe configuration settings for generating exceptions that are reported using EC value <binarynumber>0b001100</binarynumber>:</para>
+<list type="unordered">
+<listitem><content><xref linkend="D1CHDBAICA" browsertext="'Traps to EL1 of EL0 and EL1 System register accesses to the trace registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDIGAJA" browsertext="'Traps to EL1 of EL0 accesses to the Debug Communications Channel (DCC) registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIIHJHA" browsertext="'Traps to EL2 of Non-secure system register accesses to the trace registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIHBBIC" browsertext="'Trapping System register accesses to Debug ROM registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDJICAB" browsertext="'Traps to EL3 of all System register accesses to the trace registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDDHIIA" browsertext="'Trapping System register accesses to powerdown debug registers to EL3' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDCBEHG" browsertext="'Trapping general System register accesses to debug registers to EL3' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem></list>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fieldat id="CV_24_24" msb="24" lsb="24"/>
+        <fieldat id="COND_23_20" msb="23" lsb="20"/>
+        <fieldat id="Opc1_19_16" msb="19" lsb="16"/>
+        <fieldat id="0_15_15" msb="15" lsb="15"/>
+        <fieldat id="Rt2_14_10" msb="14" lsb="10"/>
+        <fieldat id="Rt_9_5" msb="9" lsb="5"/>
+        <fieldat id="CRm_4_1" msb="4" lsb="1"/>
+        <fieldat id="Direction_0_0" msb="0" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
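+<!--
+  Illustrative sketch, not part of the Arm machine-readable specification:
+  the MCRR/MRRC ISS layout above (EC 0b000100) decoded the same way as the
+  MCR/MRC sketch earlier. Names are hypothetical; only the bit positions
+  come from the field definitions, including the RES0 bit at ISS[15].
+
+  #include <stdbool.h>
+  #include <stdint.h>
+
+  struct mcrr_mrrc_iss {
+      bool    cv;       /* ISS[24]    condition code valid                     */
+      uint8_t cond;     /* ISS[23:20] condition code, meaningful when cv set   */
+      uint8_t opc1;     /* ISS[19:16] Opc1 from the instruction                */
+      uint8_t rt2;      /* ISS[14:10] second transfer register, AArch64 view   */
+      uint8_t rt;       /* ISS[9:5]   first transfer register, AArch64 view    */
+      uint8_t crm;      /* ISS[4:1]   CRm from the instruction                 */
+      bool    is_read;  /* ISS[0]     0 = MCRR (write), 1 = MRRC (read)        */
+  };
+
+  static struct mcrr_mrrc_iss decode_mcrr_mrrc_iss(uint32_t iss)
+  {
+      struct mcrr_mrrc_iss d;
+      d.cv      = (iss >> 24) & 0x1;
+      d.cond    = (iss >> 20) & 0xf;
+      d.opc1    = (iss >> 16) & 0xf;   /* ISS[15] is RES0 and is skipped */
+      d.rt2     = (iss >> 10) & 0x1f;
+      d.rt      = (iss >> 5)  & 0x1f;
+      d.crm     = (iss >> 1)  & 0xf;
+      d.is_read = (iss >> 0)  & 0x1;
+      return d;
+  }
+-->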
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from an LDC or STC instruction</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="CV_24_24" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CV</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>24</field_lsb>
+        <field_description order="before">
+          
+  <para>Condition code valid. Possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The COND field is not valid.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The COND field is valid.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For exceptions taken from AArch64, CV is set to 1.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1.</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether CV is set to 1 or set to 0. See the description of the COND field for more information.</content>
+</listitem></list>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="COND_23_20" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>COND</field_name>
+        <field_msb>23</field_msb>
+        <field_lsb>20</field_lsb>
+        <field_description order="before">
+          
+  <para>The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1.</para>
+<para>For exceptions taken from AArch64, this field is set to <binarynumber>0b1110</binarynumber>.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1 and:<list type="unordered">
+<listitem><content>If the instruction is conditional, COND is set to the condition code field value from the instruction.</content>
+</listitem><listitem><content>If the instruction is unconditional, COND is set to <binarynumber>0b1110</binarynumber>.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>A conditional A32 instruction that is known to pass its condition code check can be presented either:<list type="unordered">
+<listitem><content>With COND set to <binarynumber>0b1110</binarynumber>, the value for unconditional.</content>
+</listitem><listitem><content>With the COND value held in the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether:<list type="unordered">
+<listitem><content>CV is set to 0 and COND is set to an <arm-defined-word>UNKNOWN</arm-defined-word> value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction.</content>
+</listitem><listitem><content>CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether the COND field is set to <binarynumber>0b1110</binarynumber>, or to the value of any condition that applied to the instruction.</content>
+</listitem></list>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="imm8_19_12" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>imm8</field_name>
+        <field_msb>19</field_msb>
+        <field_lsb>12</field_lsb>
+        <field_description order="before">
+          
+  <para>The immediate value from the issued instruction.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_11_10" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>11</field_msb>
+        <field_lsb>10</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="Rn_9_5" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Rn</field_name>
+        <field_msb>9</field_msb>
+        <field_lsb>5</field_lsb>
+        <field_description order="before">
+          
+  <para>The Rn value from the issued instruction, the general-purpose register used for the transfer. The reported value gives the AArch64 view of the register. See <xref linkend="BEIDFCCE" browsertext="'Mapping of the general-purpose registers between the Execution states' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1.20.1"/>.</para>
+<para>This field is valid only when AM[2] is 0, indicating an immediate form of the LDC or STC instruction. When AM[2] is 1, indicating a literal form of the LDC or STC instruction, this field is <arm-defined-word>UNKNOWN</arm-defined-word>.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Offset_4_4" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Offset</field_name>
+        <field_msb>4</field_msb>
+        <field_lsb>4</field_lsb>
+        <field_description order="before">
+          
+  <para>Indicates whether the offset is added or subtracted:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Subtract offset.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Add offset.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>This bit corresponds to the U bit in the instruction encoding.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="AM_3_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>AM</field_name>
+        <field_msb>3</field_msb>
+        <field_lsb>1</field_lsb>
+        <field_description order="before">
+          
+  <para>Addressing mode. The permitted values of this field are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b000</field_value>
+        <field_value_description>
+  <para>Immediate unindexed.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001</field_value>
+        <field_value_description>
+  <para>Immediate post-indexed.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010</field_value>
+        <field_value_description>
+  <para>Immediate offset.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011</field_value>
+        <field_value_description>
+  <para>Immediate pre-indexed.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b100</field_value>
+        <field_value_description>
+  <para>For a trapped STC instruction or a trapped T32 LDC instruction, this encoding is reserved.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b110</field_value>
+        <field_value_description>
+  <para>For a trapped STC instruction, this encoding is reserved.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>The values <binarynumber>0b101</binarynumber> and <binarynumber>0b111</binarynumber> are reserved. The effect of programming this field to a reserved value is that behavior is <arm-defined-word>CONSTRAINED UNPREDICTABLE</arm-defined-word>, as described in <xref linkend="CEGFJDFD" browsertext="'Reserved values in AArch64 System registers and translation table entries' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section K1.2.2"/>.</para>
+<para>Bit [2] in this subfield indicates the instruction form, immediate or literal.</para>
+<para>Bits [1:0] in this subfield correspond to the bits {P, W} in the instruction encoding.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Direction_0_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Direction</field_name>
+        <field_msb>0</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Indicates the direction of the trapped instruction. The possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Write to memory. STC instruction.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Read from memory. LDC instruction.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>The following sections describe the configuration settings for the traps that are reported using EC value <binarynumber>0b000110</binarynumber>:</para>
+<list type="unordered">
+<listitem><content><xref linkend="D1CHDIGAJA" browsertext="'Traps to EL1 of EL0 accesses to the Debug Communications Channel (DCC) registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEICAABI" browsertext="'Trapping general System register accesses to debug registers to EL2' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDCBEHG" browsertext="'Trapping general System register accesses to debug registers to EL3' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem></list>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fieldat id="CV_24_24" msb="24" lsb="24"/>
+        <fieldat id="COND_23_20" msb="23" lsb="20"/>
+        <fieldat id="imm8_19_12" msb="19" lsb="12"/>
+        <fieldat id="0_11_10" msb="11" lsb="10"/>
+        <fieldat id="Rn_9_5" msb="9" lsb="5"/>
+        <fieldat id="Offset_4_4" msb="4" lsb="4"/>
+        <fieldat id="AM_3_1" msb="3" lsb="1"/>
+        <fieldat id="Direction_0_0" msb="0" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
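+<!--
+  Illustrative sketch, not part of the Arm machine-readable specification:
+  one way software might unpack the LDC/STC ISS layout described above
+  (EC 0b000110). Names are hypothetical; the bit positions and the
+  Offset/AM semantics are taken from the field definitions.
+
+  #include <stdbool.h>
+  #include <stdint.h>
+
+  struct ldc_stc_iss {
+      bool    cv;       /* ISS[24]    condition code valid                     */
+      uint8_t cond;     /* ISS[23:20] condition code, meaningful when cv set   */
+      uint8_t imm8;     /* ISS[19:12] immediate from the instruction           */
+      uint8_t rn;       /* ISS[9:5]   base register, AArch64 view; UNKNOWN
+                           for the literal form, when AM[2] == 1               */
+      bool    add;      /* ISS[4]     Offset: 1 = add, 0 = subtract (U bit)    */
+      uint8_t am;       /* ISS[3:1]   addressing mode; AM[1:0] are {P, W}      */
+      bool    is_read;  /* ISS[0]     0 = STC (write), 1 = LDC (read)          */
+  };
+
+  static struct ldc_stc_iss decode_ldc_stc_iss(uint32_t iss)
+  {
+      struct ldc_stc_iss d;
+      d.cv      = (iss >> 24) & 0x1;
+      d.cond    = (iss >> 20) & 0xf;
+      d.imm8    = (iss >> 12) & 0xff;  /* ISS[11:10] are RES0, skipped */
+      d.rn      = (iss >> 5)  & 0x1f;
+      d.add     = (iss >> 4)  & 0x1;
+      d.am      = (iss >> 1)  & 0x7;
+      d.is_read = (iss >> 0)  & 0x1;
+      return d;
+  }
+
+  AM[2] selects between the immediate form (0, rn is valid) and the literal
+  form (1, rn is UNKNOWN), as the Rn and AM descriptions above state.
+-->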
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from an access to SVE, Advanced SIMD, or floating-point functionality, resulting from CPACR_EL1.FPEN, CPTR_EL2.FPEN, or CPTR_ELx.TFP</fields_instance>
+    <text_before_fields>
+      
+  <para>The accesses covered by this trap include:</para>
+<list type="unordered">
+<listitem><content>Execution of SVE or Advanced SIMD and floating-point instructions.</content>
+</listitem><listitem><content>Accesses to the Advanced SIMD and floating-point System registers.</content>
+</listitem></list>
+<para>For an implementation that does not include either SVE or support for floating-point and Advanced SIMD, the exception is reported using the EC value <binarynumber>0b000000</binarynumber>.</para>
+
+    </text_before_fields>
+    
+        <field 
+           id="CV_24_24" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CV</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>24</field_lsb>
+        <field_description order="before">
+          
+  <para>Condition code valid. Possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The COND field is not valid.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The COND field is valid.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For exceptions taken from AArch64, CV is set to 1.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1.</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether CV is set to 1 or set to 0. See the description of the COND field for more information.</content>
+</listitem></list>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="COND_23_20" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>COND</field_name>
+        <field_msb>23</field_msb>
+        <field_lsb>20</field_lsb>
+        <field_description order="before">
+          
+  <para>The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1.</para>
+<para>For exceptions taken from AArch64, this field is set to <binarynumber>0b1110</binarynumber>.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1 and:<list type="unordered">
+<listitem><content>If the instruction is conditional, COND is set to the condition code field value from the instruction.</content>
+</listitem><listitem><content>If the instruction is unconditional, COND is set to <binarynumber>0b1110</binarynumber>.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>A conditional A32 instruction that is known to pass its condition code check can be presented either:<list type="unordered">
+<listitem><content>With COND set to <binarynumber>0b1110</binarynumber>, the value for unconditional.</content>
+</listitem><listitem><content>With the COND value held in the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether:<list type="unordered">
+<listitem><content>CV is set to 0 and COND is set to an <arm-defined-word>UNKNOWN</arm-defined-word> value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction.</content>
+</listitem><listitem><content>CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether the COND field is set to <binarynumber>0b1110</binarynumber>, or to the value of any condition that applied to the instruction.</content>
+</listitem></list>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_19_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>19</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+    <text_after_fields>
+    
+  <para>The following sections describe the configuration settings for the traps that are reported using EC value <binarynumber>0b000111</binarynumber>:</para>
+<list type="unordered">
+<listitem><content><xref linkend="D1CHDIAIGC" browsertext="'Traps to EL1 of EL0 and EL1 accesses to SIMD and floating-point functionality' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIBCFGC" browsertext="'General trapping to EL2 of Non-secure accesses to the SIMD and floating-point registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDDGEFI" browsertext="'Traps to EL3 of all accesses to the SIMD and floating-point registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/></content>
+</listitem></list>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+        
+        
+        
+        
+
+        <fieldat id="CV_24_24" msb="24" lsb="24"/>
+        <fieldat id="COND_23_20" msb="23" lsb="20"/>
+        <fieldat id="0_19_0" msb="19" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from an access to SVE functionality, resulting from CPACR_EL1.ZEN, CPTR_EL2.ZEN, CPTR_EL2.TZ, or CPTR_EL3.EZ</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_0_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+            <fields_condition>When SVE is implemented</fields_condition>
+      </field>
+        <field 
+           id="0_24_0_2" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+    <text_after_fields>
+    
+  <para>The accesses covered by this trap include:</para>
+<list type="unordered">
+<listitem><content>Execution of SVE instructions.</content>
+</listitem><listitem><content>Accesses to the SVE system registers, ZCR_ELx and ID_AA64ZFR0_EL1.</content>
+</listitem></list>
+<para>For an implementation that does not include SVE, the exception is reported using the EC value <binarynumber>0b000000</binarynumber>.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+
+        <fieldat id="0_24_0_1" msb="24" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
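+<!--
+  Illustrative note (not part of the Arm source): as the text above says,
+  an implementation without SVE reports this exception with EC 0b000000
+  rather than the SVE trap EC. A minimal C sketch of the EC dispatch,
+  assuming "esr" holds ESR_ELx and that 0b011001 is the SVE trap EC per
+  the Armv8 manual; the handler names are hypothetical:
+
+  #include <stdint.h>
+
+  extern void handle_sve_trap(void);
+  extern void handle_unknown(void);
+
+  static void dispatch_ec(uint64_t esr)
+  {
+      switch ((esr >> 26) & 0x3F) {        /* EC, ESR_ELx[31:26] */
+      case 0x19:  /* 0b011001: trapped SVE access; ISS[24:0] is RES0 */
+          handle_sve_trap();
+          break;
+      case 0x00:  /* 0b000000: unknown reason, used when SVE is absent */
+          handle_unknown();
+          break;
+      }
+  }
+-->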
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from an Illegal Execution state, or a PC or SP alignment fault</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+    <text_after_fields>
+    
+  <para>There are no configuration settings for generating Illegal Execution state exceptions and PC alignment fault exceptions. For more information about these exceptions see <xref linkend="CHDGFFFA" browsertext="'The Illegal Execution state exception' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/> and <xref linkend="BEIFHIFH" browsertext="'PC alignment checking' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</para>
+<para><xref linkend="BEIHDCIE" browsertext="'Stack pointer alignment checking' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/> describes the configuration settings for generating SP alignment fault exceptions.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+
+        <fieldat id="0_24_0" msb="24" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from HVC or SVC instruction execution</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_16" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>16</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="imm16_15_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>imm16</field_name>
+        <field_msb>15</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>The value of the immediate field from the HVC or SVC instruction.</para>
+<para>For an HVC instruction, and for an A64 SVC instruction, this is the value of the imm16 field of the issued instruction.</para>
+<para>For an A32 or T32 SVC instruction:</para>
+<list type="unordered">
+<listitem><content>If the instruction is unconditional, then:<list type="unordered">
+<listitem><content>For the T32 instruction, this field is zero-extended from the imm8 field of the instruction.</content>
+</listitem><listitem><content>For the A32 instruction, this field is the bottom 16 bits of the imm24 field of the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>If the instruction is conditional, this field is <arm-defined-word>UNKNOWN</arm-defined-word>.</content>
+</listitem></list>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
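+<!--
+  Illustrative note (not part of the Arm source): a minimal C sketch of
+  recovering the imm16 value described above from ESR_ELx.ISS, assuming
+  "esr" was read on entry to the exception handler; the helper name is
+  hypothetical. For a conditional A32/T32 SVC the field is UNKNOWN, so a
+  kernel supporting AArch32 tasks cannot rely on it in that case:
+
+  #include <stdint.h>
+
+  static inline uint16_t esr_svc_imm16(uint64_t esr)
+  {
+      return (uint16_t)(esr & 0xFFFF);     /* imm16, ISS[15:0] */
+  }
+-->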
+    <text_after_fields>
+    
+  <para>In AArch32 state, the HVC instruction is unconditional, and a conditional SVC instruction generates an exception only if it passes its condition code check. Therefore, the syndrome information for these exceptions does not require conditionality information.</para>
+<para>For T32 and A32 instructions, see <xref linkend="A32T32-base.instructions.SVC" browsertext="'SVC' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section F7 (T32 and A32 Base Instruction Set Instruction Descriptions)" filename="F_t32_a32_base_instruction_descriptions"/> and <xref linkend="A32T32-base.instructions.HVC" browsertext="'HVC' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section F7" filename="F_t32_a32_base_instruction_descriptions"/>.</para>
+<para>For A64 instructions, see <xref linkend="A64.instructions.SVC" browsertext="'SVC' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section C5 (A64 Base Instruction Descriptions)," filename="C_a64_base_instruction_descriptions"/> and <xref linkend="A64.instructions.HVC" browsertext="'HVC' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section C5" filename="C_a64_base_instruction_descriptions"/>.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+        
+        
+
+        <fieldat id="0_24_16" msb="24" lsb="16"/>
+        <fieldat id="imm16_15_0" msb="15" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from SMC instruction execution in AArch32 state</fields_instance>
+    <text_before_fields>
+      
+  <para>For an SMC instruction that completes normally and generates an exception that is taken to EL3, the ISS encoding is <arm-defined-word>RES0</arm-defined-word>.</para>
+<para>For an SMC instruction that is trapped to EL2 from EL1 because <register_link state="AArch64" id="AArch64-hcr_el2.xml">HCR_EL2</register_link>.TSC is 1, the ISS encoding is as shown in the diagram.</para>
+
+    </text_before_fields>
+    
+        <field 
+           id="CV_24_24" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CV</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>24</field_lsb>
+        <field_description order="before">
+          
+  <para>Condition code valid. Possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The COND field is not valid.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The COND field is valid.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For exceptions taken from AArch64, CV is set to 1.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1.</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether CV is set to 1 or set to 0. See the description of the COND field for more information.</content>
+</listitem></list>
+<para>This field is only valid if CCKNOWNPASS is 1, otherwise it is <arm-defined-word>RES0</arm-defined-word>.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="COND_23_20" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>COND</field_name>
+        <field_msb>23</field_msb>
+        <field_lsb>20</field_lsb>
+        <field_description order="before">
+          
+  <para>The condition code for the trapped instruction. This field is valid only for exceptions taken from AArch32, and only when the value of CV is 1.</para>
+<para>For exceptions taken from AArch64, this field is set to <binarynumber>0b1110</binarynumber>.</para>
+<para>For exceptions taken from AArch32:</para>
+<list type="unordered">
+<listitem><content>When an A32 instruction is trapped, CV is set to 1 and:<list type="unordered">
+<listitem><content>If the instruction is conditional, COND is set to the condition code field value from the instruction.</content>
+</listitem><listitem><content>If the instruction is unconditional, COND is set to <binarynumber>0b1110</binarynumber>.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>A conditional A32 instruction that is known to pass its condition code check can be presented either:<list type="unordered">
+<listitem><content>With COND set to <binarynumber>0b1110</binarynumber>, the value for unconditional.</content>
+</listitem><listitem><content>With the COND value held in the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>When a T32 instruction is trapped, it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether:<list type="unordered">
+<listitem><content>CV is set to 0 and COND is set to an <arm-defined-word>UNKNOWN</arm-defined-word> value. Software must examine the SPSR.IT field to determine the condition, if any, of the T32 instruction.</content>
+</listitem><listitem><content>CV is set to 1 and COND is set to the condition code for the condition that applied to the instruction.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>For an implementation that, for both A32 and T32 instructions, takes an exception on a trapped conditional instruction only if the instruction passes its condition code check, these definitions mean that when CV is set to 1 it is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether the COND field is set to <binarynumber>0b1110</binarynumber>, or to the value of any condition that applied to the instruction.</content>
+</listitem></list>
+<para>This field is only valid if CCKNOWNPASS is 1, otherwise it is <arm-defined-word>RES0</arm-defined-word>.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="CCKNOWNPASS_19_19" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CCKNOWNPASS</field_name>
+        <field_msb>19</field_msb>
+        <field_lsb>19</field_lsb>
+        <field_description order="before">
+          
+  <para>Indicates whether the instruction might have failed its condition code check.</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The instruction was unconditional, or was conditional and passed its condition code check.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The instruction was conditional, and might have failed its condition code check.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <note><para>In an implementation in which an SMC instruction that fails its condition code check is not trapped, this field can always return the value 0.</para></note>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
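+<!--
+  Illustrative note (not part of the Arm source): a minimal C sketch of
+  how a hypervisor trapping AArch32 SMC execution via HCR_EL2.TSC might
+  use CCKNOWNPASS, assuming "iss" holds ESR_EL2.ISS; the helper name is
+  hypothetical. When the result is false the hypervisor must evaluate
+  COND (if CV is 1) against the saved PSTATE flags itself:
+
+  #include <stdbool.h>
+  #include <stdint.h>
+
+  static bool smc_known_passed(uint32_t iss)
+  {
+      /* CCKNOWNPASS, ISS[19]: 0 means unconditional or known-passed. */
+      return ((iss >> 19) & 0x1) == 0;
+  }
+-->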
+        <field 
+           id="0_18_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>18</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+    <text_after_fields>
+    
+  <para><xref linkend="BEIGGFEI" browsertext="'Traps to EL2 of Non-secure EL1 execution of SMC instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)," filename="D_the_aarch64_system_level_programmers_model"/> describes the configuration settings for trapping SMC instructions from EL1 modes, and <xref linkend="BEIBEAGE" browsertext="'System calls' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1.16," filename="D_the_aarch64_system_level_programmers_model"/> describes the case where these exceptions are trapped to EL3.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+        
+        
+        
+        
+        
+        
+
+        <fieldat id="CV_24_24" msb="24" lsb="24"/>
+        <fieldat id="COND_23_20" msb="23" lsb="20"/>
+        <fieldat id="CCKNOWNPASS_19_19" msb="19" lsb="19"/>
+        <fieldat id="0_18_0" msb="18" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from SMC instruction execution in AArch64 state</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_16" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>16</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="imm16_15_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>imm16</field_name>
+        <field_msb>15</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>The value of the immediate field from the issued SMC instruction.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
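+<!--
+  Illustrative note (not part of the Arm source): the same ISS[15:0]
+  layout applies whether the SMC was trapped to EL2 or completed and
+  took its exception to EL3, as the text below notes. As one hypothetical
+  use, firmware following the SMC Calling Convention (a separate
+  specification) expects SMC #0, so it might reject a nonzero immediate
+  before decoding the call:
+
+  #include <stdbool.h>
+  #include <stdint.h>
+
+  static bool smc_imm16_is_zero(uint64_t esr)
+  {
+      return (esr & 0xFFFF) == 0;          /* imm16, ISS[15:0] */
+  }
+-->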
+    <text_after_fields>
+    
+  <para>The value of ISS[24:0] described here is used both:</para>
+<list type="unordered">
+<listitem><content>When an SMC instruction is trapped from EL1 modes.</content>
+</listitem><listitem><content>When an SMC instruction is not trapped, so completes normally and generates an exception that is taken to EL3.</content>
+</listitem></list>
+<para><xref linkend="BEIGGFEI" browsertext="'Traps to EL2 of Non-secure EL1 execution of SMC instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)," filename="D_the_aarch64_system_level_programmers_model"/> describes the configuration settings for trapping SMC instructions from Non-secure EL1 modes, and <xref linkend="BEIBEAGE" browsertext="'System calls' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1.16," filename="D_the_aarch64_system_level_programmers_model"/> describes the case where these exceptions are trapped to EL3.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+        
+        
+
+        <fieldat id="0_24_16" msb="24" lsb="16"/>
+        <fieldat id="imm16_15_0" msb="15" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from MSR, MRS, or System instruction execution in AArch64 state</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_22" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>22</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="Op0_21_20" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Op0</field_name>
+        <field_msb>21</field_msb>
+        <field_lsb>20</field_lsb>
+        <field_description order="before">
+          
+  <para>The Op0 value from the issued instruction.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Op2_19_17" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Op2</field_name>
+        <field_msb>19</field_msb>
+        <field_lsb>17</field_lsb>
+        <field_description order="before">
+          
+  <para>The Op2 value from the issued instruction.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Op1_16_14" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Op1</field_name>
+        <field_msb>16</field_msb>
+        <field_lsb>14</field_lsb>
+        <field_description order="before">
+          
+  <para>The Op1 value from the issued instruction.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="CRn_13_10" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CRn</field_name>
+        <field_msb>13</field_msb>
+        <field_lsb>10</field_lsb>
+        <field_description order="before">
+          
+  <para>The CRn value from the issued instruction.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Rt_9_5" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Rt</field_name>
+        <field_msb>9</field_msb>
+        <field_lsb>5</field_lsb>
+        <field_description order="before">
+          
+  <para>The Rt value from the issued instruction, the general-purpose register used for the transfer.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="CRm_4_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CRm</field_name>
+        <field_msb>4</field_msb>
+        <field_lsb>1</field_lsb>
+        <field_description order="before">
+          
+  <para>The CRm value from the issued instruction.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="Direction_0_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Direction</field_name>
+        <field_msb>0</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Indicates the direction of the trapped instruction. The possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Write access, including MSR instructions.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Read access, including MRS instructions.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
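+<!--
+  Illustrative note (not part of the Arm source): a minimal C sketch
+  that unpacks the fields above into the (Op0, Op1, CRn, CRm, Op2)
+  encoding of the trapped System register access, assuming "iss" holds
+  ESR_ELx.ISS for EC 0b011000; the struct and function are hypothetical:
+
+  #include <stdint.h>
+
+  struct sys_trap {
+      unsigned op0, op1, crn, crm, op2;
+      unsigned rt;       /* general-purpose register used for the transfer */
+      int      is_read;  /* Direction: 1 for MRS (read), 0 for MSR (write) */
+  };
+
+  static struct sys_trap decode_sys_trap(uint32_t iss)
+  {
+      struct sys_trap t;
+      t.op0     = (iss >> 20) & 0x3;       /* Op0,       ISS[21:20] */
+      t.op2     = (iss >> 17) & 0x7;       /* Op2,       ISS[19:17] */
+      t.op1     = (iss >> 14) & 0x7;       /* Op1,       ISS[16:14] */
+      t.crn     = (iss >> 10) & 0xF;       /* CRn,       ISS[13:10] */
+      t.rt      = (iss >> 5)  & 0x1F;      /* Rt,        ISS[9:5]   */
+      t.crm     = (iss >> 1)  & 0xF;       /* CRm,       ISS[4:1]   */
+      t.is_read = (int)(iss & 0x1);        /* Direction, ISS[0]     */
+      return t;
+  }
+-->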
+    <text_after_fields>
+    
+  <para>For exceptions caused by System instructions, see <xref linkend="A64.encoding_index.system" browsertext="the 'System' subsection of 'Branches, exception generating and System instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section C3 (A64 Instruction Set Encoding)," filename="C_a64_instruction_set_encoding"/> for the encoding values returned by an instruction.</para>
+<para>The following sections describe configuration settings for generating the exception that is reported using EC value <binarynumber>0b011000</binarynumber>:</para>
+<list type="unordered">
+<listitem><content>In <xref linkend="D1BABDIIDI" browsertext="'EL1 configurable controls' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.<list type="unordered">
+<listitem><content><xref linkend="CHDCDIIJ" browsertext="'Traps to EL1 of EL0 execution of cache maintenance instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDEIDGH" browsertext="'Traps to EL1 of EL0 accesses to the CTR_EL0' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDCDHJA" browsertext="'Traps to EL1 of EL0 execution of DC ZVA instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDHABAB" browsertext="'Traps to EL1 of EL0 accesses to the PSTATE.{D, A, I, F} interrupt masks' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1(The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDBAICA" browsertext="'Traps to EL1 of EL0 and EL1 System register accesses to the trace registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDIGAJA" browsertext="'Traps to EL1 of EL0 accesses to the Debug Communications Channel (DCC) registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="CHDJFHAI" browsertext="'Traps to EL1 of EL0 accesses to the Generic Timer registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDGDIEA" browsertext="'Traps to EL1 of EL0 accesses to Performance Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEICGACA" browsertext="'Traps to EL1 of EL0 accesses to Activity Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>In <xref linkend="D1BABBABAG" browsertext="'EL2 configurable controls' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.<list type="unordered">
+<listitem><content><xref linkend="D1BEIGHBDF" browsertext="'Traps to EL2 of Non-secure EL1 accesses to virtual memory control registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="CHDEHBGG" browsertext="'Traps to EL2 of Non-secure EL0 and EL1 execution of DC ZVA instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEICAEBH" browsertext="'Traps to EL2 of Non-secure EL1 execution of TLB maintenance instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEICHIHA" browsertext="'Traps to EL2 of Non-secure EL0 and EL1 execution of cache maintenance instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIFCHFF" browsertext="'Traps to EL2 of Non-secure EL1 accesses to the Auxiliary Control Register' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEIDHFBB" browsertext="'Traps to EL2 of Non-secure EL0 and EL1 accesses to lockdown, DMA, and TCM operations' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIEEJIA" browsertext="'Traps to EL2 of Non-secure EL0 and EL1 accesses to the ID registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIFFJII" browsertext="'Trapping to EL2 of Non-secure EL1 accesses to the CPACR_EL1 or CPACR' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIIHJHA" browsertext="'Traps to EL2 of Non-secure system register accesses to the trace registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIHBBIC" browsertext="'Trapping System register accesses to Debug ROM registers to EL2' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEIECEGJ" browsertext="'Trapping System register accesses to OS-related debug registers to EL2' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="CHDFFCIB" browsertext="'Traps to EL2 of Non-secure EL0 and EL1 accesses to the Generic Timer registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEICAABI" browsertext="'Trapping general System register accesses to debug registers to EL2' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1BEICIHGI" browsertext="'Traps to EL2 of Non-secure EL0 and EL1 accesses to Performance Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEIGCEGJ" browsertext="'Traps to EL2 of EL1 and EL0 accesses to Activity Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="CHDIAGCG" browsertext="'Trap to EL2 Non-secure EL1 accesses to Pointer authentication key registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="CHDCFJDF" browsertext="'Traps to EL2 for Nested virtualization' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="CHDIJFAH" browsertext="'Trap to EL2 Non-secure EL1 accesses to AT S1E* instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="CHDFBJDH" browsertext="'Trap to EL3 accesses to Pointer authentication key registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem></list>
+</content>
+</listitem><listitem><content>In <xref linkend="D1BABCFDGA" browsertext="'EL3 configurable controls' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.<list type="unordered">
+<listitem><content><xref linkend="CHDHAJBA" browsertext="'Traps to EL3 of Secure EL1 accesses to the Counter-timer Physical Secure timer registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDJACCE" browsertext="'Trapping to EL3 of EL2 accesses to the CPTR_EL2 or HCPTR, and EL2 and EL1 accesses to the CPACR_EL1 or CPACR' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDJICAB" browsertext="'Traps to EL3 of all System register accesses to the trace registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDDHIIA" browsertext="'Trapping System register accesses to OS-related debug registers to EL3' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDCBEHG" browsertext="'Trapping general System register accesses to debug registers to EL3' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="D1CHDGGBGH" browsertext="'Traps to EL3 of EL2, EL1, and EL0 accesses to Performance Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="BEIHHEIC" browsertext="'Traps to EL3 of EL2, EL1, and EL0 accesses to Activity Monitors registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1 (The AArch64 System Level Programmers' Model)" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem></list>
+</content>
+</listitem></list>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+
+        <fieldat id="0_24_22" msb="24" lsb="22"/>
+        <fieldat id="Op0_21_20" msb="21" lsb="20"/>
+        <fieldat id="Op2_19_17" msb="19" lsb="17"/>
+        <fieldat id="Op1_16_14" msb="16" lsb="14"/>
+        <fieldat id="CRn_13_10" msb="13" lsb="10"/>
+        <fieldat id="Rt_9_5" msb="9" lsb="5"/>
+        <fieldat id="CRm_4_1" msb="4" lsb="1"/>
+        <fieldat id="Direction_0_0" msb="0" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>IMPLEMENTATION DEFINED exception to EL3</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="IMPLEMENTATION DEFINED_24_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>IMPLEMENTATION DEFINED</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+            <para><arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word>.</para>
+          
+  
+
+        </field_description>
+        <field_values>
+             
+               <field_value_name>I</field_value_name>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+
+        <fieldat id="IMPLEMENTATION DEFINED_24_0" msb="24" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from an Instruction Abort</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_13" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>13</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="SET_12_11" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>SET</field_name>
+        <field_msb>12</field_msb>
+        <field_lsb>11</field_lsb>
+        <field_description order="before">
+          
+  <para>Synchronous Error Type. When the RAS Extension is implemented and IFSC is <binarynumber>0b010000</binarynumber>, this field describes the state of the PE after taking the Instruction Abort exception. The possible values of this field are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b00</field_value>
+        <field_value_description>
+  <para>Recoverable error (UER).</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b10</field_value>
+        <field_value_description>
+  <para>Uncontainable error (UC).</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b11</field_value>
+        <field_value_description>
+  <para>Restartable error (UEO) or Corrected error (CE).</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>All other values are reserved.</para>
+<note><para>Software can use this information to determine what recovery might be possible. Taking a synchronous External Abort exception might result in an unrecoverable PE state.</para></note><para>This field is <arm-defined-word>RES0</arm-defined-word> if either:</para>
+<list type="unordered">
+<listitem><content>The RAS Extension is not implemented.</content>
+</listitem><listitem><content>The value returned in the IFSC field is not <binarynumber>0b010000</binarynumber>.</content>
+</listitem></list>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="FnV_10_10" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>FnV</field_name>
+        <field_msb>10</field_msb>
+        <field_lsb>10</field_lsb>
+        <field_description order="before">
+          
+  <para>FAR not Valid, for a synchronous External abort other than a synchronous External abort on a translation table walk.</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>FAR is valid.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>FAR is not valid, and holds an <arm-defined-word>UNKNOWN</arm-defined-word> value.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>This field is only valid if the IFSC code is <binarynumber>0b010000</binarynumber>. It is <arm-defined-word>RES0</arm-defined-word> for all other aborts.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="EA_9_9" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>EA</field_name>
+        <field_msb>9</field_msb>
+        <field_lsb>9</field_lsb>
+        <field_description order="before">
+          
+  <para>External abort type. This bit can provide an <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> classification of External aborts.</para>
+<para>For any abort other than an External abort this bit returns a value of 0.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_8_8" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>8</field_msb>
+        <field_lsb>8</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="S1PTW_7_7" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>S1PTW</field_name>
+        <field_msb>7</field_msb>
+        <field_lsb>7</field_lsb>
+        <field_description order="before">
+          
+  <para>For a stage 2 fault, indicates whether the fault was a stage 2 fault on an access made for a stage 1 translation table walk:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Fault not on a stage 2 translation for a stage 1 translation table walk.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Fault on the stage 2 translation of an access for a stage 1 translation table walk.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For any abort other than a stage 2 fault this bit is <arm-defined-word>RES0</arm-defined-word>.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_6_6" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>6</field_msb>
+        <field_lsb>6</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="IFSC_5_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>IFSC</field_name>
+        <field_msb>5</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Instruction Fault Status Code. Possible values of this field are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b000000</field_value>
+        <field_value_description>
+  <para>Address size fault, level 0 of translation or translation table base register</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000001</field_value>
+        <field_value_description>
+  <para>Address size fault, level 1</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000010</field_value>
+        <field_value_description>
+  <para>Address size fault, level 2</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000011</field_value>
+        <field_value_description>
+  <para>Address size fault, level 3</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000100</field_value>
+        <field_value_description>
+  <para>Translation fault, level 0</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000101</field_value>
+        <field_value_description>
+  <para>Translation fault, level 1</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000110</field_value>
+        <field_value_description>
+  <para>Translation fault, level 2</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000111</field_value>
+        <field_value_description>
+  <para>Translation fault, level 3</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001001</field_value>
+        <field_value_description>
+  <para>Access flag fault, level 1</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001010</field_value>
+        <field_value_description>
+  <para>Access flag fault, level 2</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001011</field_value>
+        <field_value_description>
+  <para>Access flag fault, level 3</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001101</field_value>
+        <field_value_description>
+  <para>Permission fault, level 1</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001110</field_value>
+        <field_value_description>
+  <para>Permission fault, level 2</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001111</field_value>
+        <field_value_description>
+  <para>Permission fault, level 3</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010000</field_value>
+        <field_value_description>
+  <para>Synchronous External abort, not on translation table walk</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010100</field_value>
+        <field_value_description>
+  <para>Synchronous External abort, on translation table walk, level 0</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010101</field_value>
+        <field_value_description>
+  <para>Synchronous External abort, on translation table walk, level 1</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010110</field_value>
+        <field_value_description>
+  <para>Synchronous External abort, on translation table walk, level 2</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010111</field_value>
+        <field_value_description>
+  <para>Synchronous External abort, on translation table walk, level 3</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011000</field_value>
+        <field_value_description>
+  <para>Synchronous parity or ECC error on memory access, not on translation table walk</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011100</field_value>
+        <field_value_description>
+  <para>Synchronous parity or ECC error on memory access on translation table walk, level 0</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011101</field_value>
+        <field_value_description>
+  <para>Synchronous parity or ECC error on memory access on translation table walk, level 1</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011110</field_value>
+        <field_value_description>
+  <para>Synchronous parity or ECC error on memory access on translation table walk, level 2</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011111</field_value>
+        <field_value_description>
+  <para>Synchronous parity or ECC error on memory access on translation table walk, level 3</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b110000</field_value>
+        <field_value_description>
+  <para>TLB conflict abort</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b110001</field_value>
+        <field_value_description>
+  <para>Unsupported atomic hardware update fault, if the implementation includes <xref browsertext="ARMv8.1-TTHM" filename="A_introduction_to_the_armv8_architecture.fm" linkend="v8.1.TTHM"></xref>. Otherwise reserved.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>All other values are reserved.</para>
+<para>When the RAS Extension is implemented, <binarynumber>0b011000</binarynumber>, <binarynumber>0b011100</binarynumber>, <binarynumber>0b011101</binarynumber>, <binarynumber>0b011110</binarynumber>, and <binarynumber>0b011111</binarynumber> are reserved.</para>
+<note><para>Armv8.2 requires the implementation of the RAS Extension.</para></note><para>For more information about the lookup level associated with a fault, see <xref linkend="CACDHEEH" browsertext="'The level associated with MMU faults' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile"/>.</para>
+<note><para>Because Access flag faults and Permission faults can only result from a Block or Page translation table descriptor, they cannot occur at level 0.</para></note><para>If the S1PTW bit is set, then the level refers to the level of the stage 2 translation that is translating a stage 1 translation table walk.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+
+        <fieldat id="0_24_13" msb="24" lsb="13"/>
+        <fieldat id="SET_12_11" msb="12" lsb="11"/>
+        <fieldat id="FnV_10_10" msb="10" lsb="10"/>
+        <fieldat id="EA_9_9" msb="9" lsb="9"/>
+        <fieldat id="0_8_8" msb="8" lsb="8"/>
+        <fieldat id="S1PTW_7_7" msb="7" lsb="7"/>
+        <fieldat id="0_6_6" msb="6" lsb="6"/>
+        <fieldat id="IFSC_5_0" msb="5" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
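
The IFSC table above is regular enough to decode mechanically: for the address size, Translation, Access flag, and Permission fault groups, the two low-order bits carry the lookup level and the upper bits select the fault class. Below is a minimal C sketch of that decoding, assuming the Armv8.0 encodings listed above; the helper names are invented for illustration and are not part of this patch or of xnu.

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical helpers for the IFSC/DFSC encodings tabulated above. */
    static inline unsigned fsc_level(uint64_t fsc)
    {
        return (unsigned)(fsc & 0x3);      /* low two bits: lookup level */
    }

    static inline bool fsc_is_translation_fault(uint64_t fsc)
    {
        return (fsc >> 2) == 0x1;          /* 0b0001xx, levels 0-3 */
    }

    static inline bool fsc_is_access_flag_fault(uint64_t fsc)
    {
        /* 0b001001..0b001011; level 0 cannot occur (see the note above). */
        return (fsc >> 2) == 0x2 && fsc_level(fsc) != 0;
    }

    static inline bool fsc_is_permission_fault(uint64_t fsc)
    {
        /* 0b001101..0b001111; likewise no level 0. */
        return (fsc >> 2) == 0x3 && fsc_level(fsc) != 0;
    }
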
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from a Data Abort</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="ISV_24_24" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>ISV</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>24</field_lsb>
+        <field_description order="before">
+          
+  <para>Instruction syndrome valid. Indicates whether the syndrome information in ISS[23:14] is valid.</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>No valid instruction syndrome. ISS[23:14] are <arm-defined-word>RES0</arm-defined-word>.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>ISS[23:14] hold a valid instruction syndrome.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>This bit is 0 for all faults reported in ESR_EL2 except the following stage 2 aborts:</para>
+<list type="unordered">
+<listitem><content>AArch64 loads and stores of a single general-purpose register (including the register specified with <binarynumber>0b11111</binarynumber>, including those with Acquire/Release semantics, but excluding Load Exclusive or Store Exclusive and excluding those with writeback).</content>
+</listitem><listitem><content>AArch32 instructions where the instruction:<list type="unordered">
+<listitem><content>Is an LDR, LDA, LDRT, LDRSH, LDRSHT, LDRH, LDAH, LDRHT, LDRSB, LDRSBT, LDRB, LDAB, LDRBT, STR, STL, STRT, STRH, STLH, STRHT, STRB, STLB, or STRBT instruction.</content>
+</listitem><listitem><content>Is not performing register writeback.</content>
+</listitem><listitem><content>Is not using R15 as a source or destination register.</content>
+</listitem></list>
+</content>
+</listitem></list>
+<para>For these cases, ISV is <arm-defined-word>UNKNOWN</arm-defined-word> if the exception was generated in Debug state in memory access mode, and otherwise indicates whether ISS[23:14] hold a valid syndrome.</para>
+<para>ISV is 0 for all faults reported in ESR_EL1 or ESR_EL3.</para>
+<para>When the RAS Extension is implemented, ISV is 0 for any synchronous External abort.</para>
+<para>For ISS reporting, a stage 2 abort on a stage 1 translation table walk does not return a valid instruction syndrome, and therefore ISV is 0 for these aborts.</para>
+<para>When the RAS Extension is not implemented, the value of ISV on a synchronous External abort on a stage 2 translation table walk is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word>.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="SAS_23_22" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>SAS</field_name>
+        <field_msb>23</field_msb>
+        <field_lsb>22</field_lsb>
+        <field_description order="before">
+          
+  <para>Syndrome Access Size. When ISV is 1, indicates the size of the access attempted by the faulting operation.</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b00</field_value>
+        <field_value_description>
+  <para>Byte</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b01</field_value>
+        <field_value_description>
+  <para>Halfword</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b10</field_value>
+        <field_value_description>
+  <para>Word</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b11</field_value>
+        <field_value_description>
+  <para>Doubleword</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>This field is <arm-defined-word>UNKNOWN</arm-defined-word> when the value of ISV is <arm-defined-word>UNKNOWN</arm-defined-word>.</para>
+<para>This field is <arm-defined-word>RES0</arm-defined-word> when the value of ISV is 0.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="SSE_21_21" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>SSE</field_name>
+        <field_msb>21</field_msb>
+        <field_lsb>21</field_lsb>
+        <field_description order="before">
+          
+  <para>Syndrome Sign Extend. When ISV is 1, for a byte, halfword, or word load operation, indicates whether the data item must be sign extended. For these cases, the possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Sign-extension not required.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Data item must be sign-extended.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For all other operations this bit is 0.</para>
+<para>This field is <arm-defined-word>UNKNOWN</arm-defined-word> when the value of ISV is <arm-defined-word>UNKNOWN</arm-defined-word>.</para>
+<para>This field is <arm-defined-word>RES0</arm-defined-word> when the value of ISV is 0.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="SRT_20_16" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>SRT</field_name>
+        <field_msb>20</field_msb>
+        <field_lsb>16</field_lsb>
+        <field_description order="before">
+          
+  <para>Syndrome Register transfer. When ISV is 1, the register number of the Rt operand of the faulting instruction. If the exception was taken from an Exception level that is using AArch32, then this is the AArch64 view of the register. See <xref linkend="BEIDFCCE" browsertext="'Mapping of the general-purpose registers between the Execution states' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1.20.1"/>.</para>
+<para>This field is <arm-defined-word>UNKNOWN</arm-defined-word> when the value of ISV is <arm-defined-word>UNKNOWN</arm-defined-word>.</para>
+<para>This field is <arm-defined-word>RES0</arm-defined-word> when the value of ISV is 0.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="SF_15_15" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>SF</field_name>
+        <field_msb>15</field_msb>
+        <field_lsb>15</field_lsb>
+        <field_description order="before">
+          
+  <para>Width of the register accessed by the instruction is Sixty-Four. When ISV is 1, the possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Instruction loads/stores a 32-bit wide register.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Instruction loads/stores a 64-bit wide register.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <note><para>This field specifies the register width identified by the instruction, not the Execution state.</para></note><para>This field is <arm-defined-word>UNKNOWN</arm-defined-word> when the value of ISV is <arm-defined-word>UNKNOWN</arm-defined-word>.</para>
+<para>This field is <arm-defined-word>RES0</arm-defined-word> when the value of ISV is 0.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="AR_14_14" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>AR</field_name>
+        <field_msb>14</field_msb>
+        <field_lsb>14</field_lsb>
+        <field_description order="before">
+          
+  <para>Acquire/Release. When ISV is 1, the possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Instruction did not have acquire/release semantics.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Instruction did have acquire/release semantics.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>This field is <arm-defined-word>UNKNOWN</arm-defined-word> when the value of ISV is <arm-defined-word>UNKNOWN</arm-defined-word>.</para>
+<para>This field is <arm-defined-word>RES0</arm-defined-word> when the value of ISV is 0.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
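
Taken together, ISV, SAS, SSE, SRT, SF, and AR are what allow a hypervisor to emulate a trapped guest load or store without fetching and decoding the instruction itself. The fragment below is a hedged sketch of that use, assuming a stage 2 Data Abort with ISV set to 1; the macros and the emulate_mmio_access() hook are invented names, not xnu interfaces.

    #include <stddef.h>
    #include <stdint.h>

    /* Field extractors for ESR_ELx.ISS, using the bit positions above. */
    #define ISS_ISV(iss)  (((iss) >> 24) & 0x1)
    #define ISS_SAS(iss)  (((iss) >> 22) & 0x3)    /* access size */
    #define ISS_SSE(iss)  (((iss) >> 21) & 0x1)    /* sign-extend loads */
    #define ISS_SRT(iss)  (((iss) >> 16) & 0x1f)   /* Rt, AArch64 view */
    #define ISS_SF(iss)   (((iss) >> 15) & 0x1)    /* 64-bit register */
    #define ISS_AR(iss)   (((iss) >> 14) & 0x1)    /* acquire/release */

    /* Hypothetical MMIO emulation hook: only usable when ISV is 1. */
    static int emulate_mmio_access(uint64_t iss)
    {
        if (!ISS_ISV(iss)) {
            return -1;          /* no valid syndrome: must decode the instruction */
        }
        size_t bytes = (size_t)1 << ISS_SAS(iss);  /* 1, 2, 4, or 8 bytes */
        unsigned rt  = (unsigned)ISS_SRT(iss);     /* 31 selects XZR/WZR */
        /*
         * ... perform an access of 'bytes' bytes, write the result to rt,
         * sign-extending when ISS_SSE(iss) is set, to the 32- or 64-bit
         * register view per ISS_SF(iss), and honouring acquire/release
         * ordering when ISS_AR(iss) is set ...
         */
        (void)bytes;
        (void)rt;
        return 0;
    }
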
+        <field 
+           id="VNCR_13_13_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>VNCR</field_name>
+        <field_msb>13</field_msb>
+        <field_lsb>13</field_lsb>
+        <field_description order="before">
+          
+  <para>Indicates that the fault came from the use of the <register_link state="AArch64" id="AArch64-vncr_el2.xml">VNCR_EL2</register_link> register by EL1 code.</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The fault was not generated by the use of <register_link state="AArch64" id="AArch64-vncr_el2.xml">VNCR_EL2</register_link>, by an MRS or MSR instruction executed at EL1.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The fault was generated by the use of <register_link state="AArch64" id="AArch64-vncr_el2.xml">VNCR_EL2</register_link>, by an MRS or MSR instruction executed at EL1.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>This field is 0 in ESR_EL1.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+            <fields_condition>When ARMv8.4-NV is implemented</fields_condition>
+      </field>
+        <field 
+           id="0_13_13_2" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>13</field_msb>
+        <field_lsb>13</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="SET_12_11" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>SET</field_name>
+        <field_msb>12</field_msb>
+        <field_lsb>11</field_lsb>
+        <field_description order="before">
+          
+  <para>Synchronous Error Type. When the RAS Extension is implemented and DFSC is <binarynumber>0b010000</binarynumber>, describes the state of the PE after taking the Data Abort exception. The possible values of this field are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b00</field_value>
+        <field_value_description>
+  <para>Recoverable error (UER).</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b10</field_value>
+        <field_value_description>
+  <para>Uncontainable error (UC).</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b11</field_value>
+        <field_value_description>
+  <para>Restartable error (UEO) or Corrected error (CE).</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>All other values are reserved.</para>
+<note><para>Software can use this information to determine what recovery might be possible. Taking a synchronous External abort exception might result in an unrecoverable PE state.</para></note><para>This field is <arm-defined-word>RES0</arm-defined-word> if either:</para>
+<list type="unordered">
+<listitem><content>The RAS Extension is not implemented.</content>
+</listitem><listitem><content>The value returned in the DFSC field is not <binarynumber>0b010000</binarynumber>.</content>
+</listitem></list>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
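
As the note above says, SET is only a hint about what recovery might be possible, and it is meaningful only when the RAS Extension is implemented and DFSC is 0b010000. A sketch of the encoding follows, with invented enum and function names.

    #include <stdint.h>

    /* Hypothetical severity classification from ESR_ELx.ISS.SET (bits [12:11]). */
    enum ras_severity { RAS_UER, RAS_UC, RAS_UEO_OR_CE, RAS_RESERVED };

    static enum ras_severity classify_set(uint64_t iss)
    {
        switch ((iss >> 11) & 0x3) {
        case 0x0: return RAS_UER;          /* recoverable error */
        case 0x2: return RAS_UC;           /* uncontainable error */
        case 0x3: return RAS_UEO_OR_CE;    /* restartable or corrected */
        default:  return RAS_RESERVED;     /* 0b01 is reserved */
        }
    }
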
+        <field 
+           id="FnV_10_10" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>FnV</field_name>
+        <field_msb>10</field_msb>
+        <field_lsb>10</field_lsb>
+        <field_description order="before">
+          
+  <para>FAR not Valid, for a synchronous External abort other than a synchronous External abort on a translation table walk.</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>FAR is valid.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>FAR is not valid, and holds an <arm-defined-word>UNKNOWN</arm-defined-word> value.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>This field is valid only if the DFSC code is <binarynumber>0b010000</binarynumber>. It is <arm-defined-word>RES0</arm-defined-word> for all other aborts.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
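
A practical consequence of FnV being tied to DFSC 0b010000: a handler can treat FAR_ELx as meaningful unless the abort is a synchronous External abort whose syndrome explicitly marks the address invalid. A minimal sketch under that assumption (macro names invented):

    #include <stdbool.h>
    #include <stdint.h>

    #define ISS_DFSC(iss) ((iss) & 0x3f)
    #define ISS_FNV(iss)  (((iss) >> 10) & 0x1)

    /* FAR holds an UNKNOWN value only when DFSC is 0b010000 and FnV is 1. */
    static bool far_is_valid(uint64_t iss)
    {
        return !(ISS_DFSC(iss) == 0x10 && ISS_FNV(iss));
    }
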
+        <field 
+           id="EA_9_9" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>EA</field_name>
+        <field_msb>9</field_msb>
+        <field_lsb>9</field_lsb>
+        <field_description order="before">
+          
+  <para>External abort type. This bit can provide an <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> classification of External aborts.</para>
+<para>For any abort other than an External abort this bit returns a value of 0.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="CM_8_8" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CM</field_name>
+        <field_msb>8</field_msb>
+        <field_lsb>8</field_lsb>
+        <field_description order="before">
+          
+  <para>Cache maintenance. Indicates whether the Data Abort came from a cache maintenance or address translation instruction:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The Data Abort was not generated by the execution of one of the System instructions identified in the description of value 1.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The Data Abort was generated by either the execution of a cache maintenance instruction or by a synchronous fault on the execution of an address translation instruction. The <register_link id="AArch64-dc-zva.xml" state="AArch64">DC ZVA</register_link> instruction is not classified as a cache maintenance instruction, and therefore its execution cannot cause this field to be set to 1.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="S1PTW_7_7" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>S1PTW</field_name>
+        <field_msb>7</field_msb>
+        <field_lsb>7</field_lsb>
+        <field_description order="before">
+          
+  <para>For a stage 2 fault, indicates whether the fault was a stage 2 fault on an access made for a stage 1 translation table walk:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Fault not on a stage 2 translation for a stage 1 translation table walk.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Fault on the stage 2 translation of an access for a stage 1 translation table walk.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For any abort other than a stage 2 fault this bit is <arm-defined-word>RES0</arm-defined-word>.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="WnR_6_6" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>WnR</field_name>
+        <field_msb>6</field_msb>
+        <field_lsb>6</field_lsb>
+        <field_description order="before">
+          
+  <para>Write not Read. Indicates whether a synchronous abort was caused by an instruction writing to a memory location, or by an instruction reading from a memory location. The possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Abort caused by an instruction reading from a memory location.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Abort caused by an instruction writing to a memory location.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For faults on cache maintenance and address translation instructions, this bit always returns a value of 1.</para>
+<para>For faults from an atomic instruction that both reads from and writes to a memory location, this bit is set to 0 if a read of the address specified by the instruction would have generated the fault which is being reported; otherwise it is set to 1. The architecture permits, but does not require, a relaxation of this requirement such that for all stage 2 aborts on stage 1 translation table walks for atomic instructions, the WnR bit is always 0.</para>
+<para>This field is <arm-defined-word>UNKNOWN</arm-defined-word> for:</para>
+<list type="unordered">
+<listitem><content>An External abort on an Atomic access.</content>
+</listitem><listitem><content>A fault reported using a DFSC value of <binarynumber>0b110101</binarynumber> or <binarynumber>0b110001</binarynumber>, indicating an unsupported Exclusive or atomic access.</content>
+</listitem></list>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
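
WnR and S1PTW are the two bits a fault handler usually consults first to route a Data Abort: write versus read, and whether a stage 2 abort was taken on a stage 1 translation table walk. A brief sketch follows, keeping the caveats from the text above as comments; the helper names are invented.

    #include <stdbool.h>
    #include <stdint.h>

    #define ISS_WNR(iss)   (((iss) >> 6) & 0x1)
    #define ISS_S1PTW(iss) (((iss) >> 7) & 0x1)

    /*
     * Per the description above, WnR reads as 1 for faults on cache
     * maintenance and address translation instructions, and is UNKNOWN
     * for DFSC codes 0b110001 and 0b110101.
     */
    static bool fault_is_write(uint64_t iss)
    {
        return ISS_WNR(iss) != 0;
    }

    /* Stage 2 abort taken on an access made for a stage 1 table walk. */
    static bool fault_on_stage1_walk(uint64_t iss)
    {
        return ISS_S1PTW(iss) != 0;
    }
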
+        <field 
+           id="DFSC_5_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>DFSC</field_name>
+        <field_msb>5</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Data Fault Status Code. Possible values of this field are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b000000</field_value>
+        <field_value_description>
+  <para>Address size fault, level 0 of translation or translation table base register.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000001</field_value>
+        <field_value_description>
+  <para>Address size fault, level 1.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000010</field_value>
+        <field_value_description>
+  <para>Address size fault, level 2.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000011</field_value>
+        <field_value_description>
+  <para>Address size fault, level 3.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000100</field_value>
+        <field_value_description>
+  <para>Translation fault, level 0.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000101</field_value>
+        <field_value_description>
+  <para>Translation fault, level 1.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000110</field_value>
+        <field_value_description>
+  <para>Translation fault, level 2.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b000111</field_value>
+        <field_value_description>
+  <para>Translation fault, level 3.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001001</field_value>
+        <field_value_description>
+  <para>Access flag fault, level 1.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001010</field_value>
+        <field_value_description>
+  <para>Access flag fault, level 2.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001011</field_value>
+        <field_value_description>
+  <para>Access flag fault, level 3.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001101</field_value>
+        <field_value_description>
+  <para>Permission fault, level 1.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001110</field_value>
+        <field_value_description>
+  <para>Permission fault, level 2.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001111</field_value>
+        <field_value_description>
+  <para>Permission fault, level 3.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010000</field_value>
+        <field_value_description>
+  <para>Synchronous External abort, not on translation table walk.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010001</field_value>
+        <field_value_description>
+  <para>Synchronous Tag Check fail.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010100</field_value>
+        <field_value_description>
+  <para>Synchronous External abort, on translation table walk, level 0.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010101</field_value>
+        <field_value_description>
+  <para>Synchronous External abort, on translation table walk, level 1.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010110</field_value>
+        <field_value_description>
+  <para>Synchronous External abort, on translation table walk, level 2.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010111</field_value>
+        <field_value_description>
+  <para>Synchronous External abort, on translation table walk, level 3.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011000</field_value>
+        <field_value_description>
+  <para>Synchronous parity or ECC error on memory access, not on translation table walk.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011100</field_value>
+        <field_value_description>
+  <para>Synchronous parity or ECC error on memory access on translation table walk, level 0.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011101</field_value>
+        <field_value_description>
+  <para>Synchronous parity or ECC error on memory access on translation table walk, level 1.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011110</field_value>
+        <field_value_description>
+  <para>Synchronous parity or ECC error on memory access on translation table walk, level 2.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011111</field_value>
+        <field_value_description>
+  <para>Synchronous parity or ECC error on memory access on translation table walk, level 3.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b100001</field_value>
+        <field_value_description>
+  <para>Alignment fault.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b110000</field_value>
+        <field_value_description>
+  <para>TLB conflict abort.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b110001</field_value>
+        <field_value_description>
+  <para>Unsupported atomic hardware update fault, if the implementation includes <xref browsertext="ARMv8.1-TTHM" filename="A_introduction_to_the_armv8_architecture.fm" linkend="v8.1.TTHM"></xref>. Otherwise reserved.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b110100</field_value>
+        <field_value_description>
+  <para><arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> fault (Lockdown).</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b110101</field_value>
+        <field_value_description>
+  <para><arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> fault (Unsupported Exclusive or Atomic access).</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b111101</field_value>
+        <field_value_description>
+  <para>Section Domain Fault, used only for faults reported in the <register_link state="AArch64" id="AArch64-par_el1.xml">PAR_EL1</register_link>.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b111110</field_value>
+        <field_value_description>
+  <para>Page Domain Fault, used only for faults reported in the <register_link state="AArch64" id="AArch64-par_el1.xml">PAR_EL1</register_link>.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>All other values are reserved.</para>
+<para>When the RAS Extension is implemented, <binarynumber>0b011000</binarynumber>, <binarynumber>0b011100</binarynumber>, <binarynumber>0b011101</binarynumber>, <binarynumber>0b011110</binarynumber>, and <binarynumber>0b011111</binarynumber> are reserved.</para>
+<para>For more information about the lookup level associated with a fault, see <xref linkend="CACDHEEH" browsertext="'The level associated with MMU faults' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile"/>.</para>
+<note><para>Because Access flag faults and Permission faults can only result from a Block or Page translation table descriptor, they cannot occur at level 0.</para></note><para>If the S1PTW bit is set, then the level refers to the level of the stage 2 translation that is translating a stage 1 translation table walk.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+
+        <fieldat id="ISV_24_24" msb="24" lsb="24"/>
+        <fieldat id="SAS_23_22" msb="23" lsb="22"/>
+        <fieldat id="SSE_21_21" msb="21" lsb="21"/>
+        <fieldat id="SRT_20_16" msb="20" lsb="16"/>
+        <fieldat id="SF_15_15" msb="15" lsb="15"/>
+        <fieldat id="AR_14_14" msb="14" lsb="14"/>
+        <fieldat id="VNCR_13_13_1" msb="13" lsb="13"/>
+        <fieldat id="SET_12_11" msb="12" lsb="11"/>
+        <fieldat id="FnV_10_10" msb="10" lsb="10"/>
+        <fieldat id="EA_9_9" msb="9" lsb="9"/>
+        <fieldat id="CM_8_8" msb="8" lsb="8"/>
+        <fieldat id="S1PTW_7_7" msb="7" lsb="7"/>
+        <fieldat id="WnR_6_6" msb="6" lsb="6"/>
+        <fieldat id="DFSC_5_0" msb="5" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
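
Pulling the Data Abort encoding together: a handler typically switches on DFSC first and only then consults the qualifying bits (WnR, FnV, SET, and so on). The classifier below is a condensed, illustrative rendering of the DFSC values tabulated above, not xnu's actual abort handler.

    #include <stdint.h>

    /* Hypothetical coarse classification of ESR_ELx.ISS.DFSC (bits [5:0]). */
    static const char *dfsc_class(uint64_t dfsc)
    {
        unsigned level = (unsigned)(dfsc & 0x3);

        switch (dfsc >> 2) {
        case 0x0: return "address size fault";          /* 0b0000xx */
        case 0x1: return "translation fault";           /* 0b0001xx */
        case 0x2: return level ? "access flag fault" : "reserved";
        case 0x3: return level ? "permission fault" : "reserved";
        }
        switch (dfsc) {
        case 0x10: return "synchronous external abort";
        case 0x11: return "synchronous tag check fail";
        case 0x21: return "alignment fault";
        case 0x30: return "TLB conflict abort";
        case 0x31: return "unsupported atomic hardware update";
        case 0x34: return "lockdown fault (IMPLEMENTATION DEFINED)";
        case 0x35: return "unsupported exclusive/atomic (IMPLEMENTATION DEFINED)";
        case 0x3d: return "section domain fault (PAR_EL1 only)";
        case 0x3e: return "page domain fault (PAR_EL1 only)";
        default:
            if (dfsc >= 0x14 && dfsc <= 0x17)
                return "synchronous external abort on table walk";
            if (dfsc == 0x18 || (dfsc >= 0x1c && dfsc <= 0x1f))
                return "synchronous parity or ECC error";
            return "reserved";
        }
    }
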
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from a trapped floating-point exception</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_24" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>24</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="TFV_23_23" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>TFV</field_name>
+        <field_msb>23</field_msb>
+        <field_lsb>23</field_lsb>
+        <field_description order="before">
+          
+  <para>Trapped Fault Valid bit. Indicates whether the IDF, IXF, UFF, OFF, DZF, and IOF bits hold valid information about trapped floating-point exceptions. The possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The IDF, IXF, UFF, OFF, DZF, and IOF bits do not hold valid information about trapped floating-point exceptions and are <arm-defined-word>UNKNOWN</arm-defined-word>.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>One or more floating-point exceptions occurred during an operation performed while executing the reported instruction. The IDF, IXF, UFF, OFF, DZF, and IOF bits indicate trapped floating-point exceptions that occurred. For more information see <xref linkend="BEIJDDAG" browsertext="'Floating-point exception traps' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1.13.4"/>.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>It is <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> whether this field is set to 0 on an exception generated by a trapped floating-point exception from a vector instruction.</para>
+<note><para>This is not a requirement. Implementations can set this field to 1 on a trapped floating-point exception from a vector instruction and return valid information in the {IDF, IXF, UFF, OFF, DZF, IOF} fields.</para></note>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_22_11" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>22</field_msb>
+        <field_lsb>11</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="VECITR_10_8" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>VECITR</field_name>
+        <field_msb>10</field_msb>
+        <field_lsb>8</field_lsb>
+        <field_description order="before">
+          
+  <para>For a trapped floating-point exception from an instruction executed in AArch32 state this field is <arm-defined-word>RES1</arm-defined-word>.</para>
+<para>For a trapped floating-point exception from an instruction executed in AArch64 state this field is <arm-defined-word>UNKNOWN</arm-defined-word>.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="IDF_7_7" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>IDF</field_name>
+        <field_msb>7</field_msb>
+        <field_lsb>7</field_lsb>
+        <field_description order="before">
+          
+  <para>Input Denormal floating-point exception trapped bit. If the TFV field is 0, this bit is <arm-defined-word>UNKNOWN</arm-defined-word>. Otherwise, the possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Input denormal floating-point exception has not occurred.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Input denormal floating-point exception occurred during execution of the reported instruction.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_6_5" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>6</field_msb>
+        <field_lsb>5</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="IXF_4_4" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>IXF</field_name>
+        <field_msb>4</field_msb>
+        <field_lsb>4</field_lsb>
+        <field_description order="before">
+          
+  <para>Inexact floating-point exception trapped bit. If the TFV field is 0, this bit is <arm-defined-word>UNKNOWN</arm-defined-word>. Otherwise, the possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Inexact floating-point exception has not occurred.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Inexact floating-point exception occurred during execution of the reported instruction.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="UFF_3_3" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>UFF</field_name>
+        <field_msb>3</field_msb>
+        <field_lsb>3</field_lsb>
+        <field_description order="before">
+          
+  <para>Underflow floating-point exception trapped bit. If the TFV field is 0, this bit is <arm-defined-word>UNKNOWN</arm-defined-word>. Otherwise, the possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Underflow floating-point exception has not occurred.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Underflow floating-point exception occurred during execution of the reported instruction.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="OFF_2_2" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>OFF</field_name>
+        <field_msb>2</field_msb>
+        <field_lsb>2</field_lsb>
+        <field_description order="before">
+          
+  <para>Overflow floating-point exception trapped bit. If the TFV field is 0, this bit is <arm-defined-word>UNKNOWN</arm-defined-word>. Otherwise, the possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Overflow floating-point exception has not occurred.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Overflow floating-point exception occurred during execution of the reported instruction.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="DZF_1_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>DZF</field_name>
+        <field_msb>1</field_msb>
+        <field_lsb>1</field_lsb>
+        <field_description order="before">
+          
+  <para>Divide by Zero floating-point exception trapped bit. If the TFV field is 0, this bit is <arm-defined-word>UNKNOWN</arm-defined-word>. Otherwise, the possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Divide by Zero floating-point exception has not occurred.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Divide by Zero floating-point exception occurred during execution of the reported instruction.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="IOF_0_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>IOF</field_name>
+        <field_msb>0</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Invalid Operation floating-point exception trapped bit. If the TFV field is 0, this bit is <arm-defined-word>UNKNOWN</arm-defined-word>. Otherwise, the possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Invalid Operation floating-point exception has not occurred.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Invalid Operation floating-point exception occurred during execution of the reported instruction.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>In an implementation that supports the trapping of floating-point exceptions:</para>
+<list type="unordered">
+<listitem><content>From an Exception level using AArch64, the <register_link state="AArch64" id="AArch64-fpcr.xml">FPCR</register_link>.{IDE, IXE, UFE, OFE, DZE, IOE} bits enable each of the floating-point exception traps.</content>
+</listitem><listitem><content>From an Exception level using AArch32, the <register_link state="AArch32" id="AArch32-fpscr.xml">FPSCR</register_link>.{IDE, IXE, UFE, OFE, DZE, IOE} bits enable each of the floating-point exception traps.</content>
+</listitem></list>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+      
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+
+        <fieldat id="0_24_24" msb="24" lsb="24"/>
+        <fieldat id="TFV_23_23" msb="23" lsb="23"/>
+        <fieldat id="0_22_11" msb="22" lsb="11"/>
+        <fieldat id="VECITR_10_8" msb="10" lsb="8"/>
+        <fieldat id="IDF_7_7" msb="7" lsb="7"/>
+        <fieldat id="0_6_5" msb="6" lsb="5"/>
+        <fieldat id="IXF_4_4" msb="4" lsb="4"/>
+        <fieldat id="UFF_3_3" msb="3" lsb="3"/>
+        <fieldat id="OFF_2_2" msb="2" lsb="2"/>
+        <fieldat id="DZF_1_1" msb="1" lsb="1"/>
+        <fieldat id="IOF_0_0" msb="0" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
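
The trapped floating-point syndrome carries information only when TFV is 1, and each of the six accumulation bits corresponds to one of the FPCR (AArch64) or FPSCR (AArch32) trap-enable bits named in the closing note above. A minimal sketch of the extraction, with invented macro names:

    #include <stdint.h>

    /* Valid only when TFV (bit 23) is 1; UNKNOWN otherwise. */
    #define ISS_TFV(iss) (((iss) >> 23) & 0x1)
    #define ISS_IDF(iss) (((iss) >> 7) & 0x1)   /* input denormal */
    #define ISS_IXF(iss) (((iss) >> 4) & 0x1)   /* inexact */
    #define ISS_UFF(iss) (((iss) >> 3) & 0x1)   /* underflow */
    #define ISS_OFF(iss) (((iss) >> 2) & 0x1)   /* overflow */
    #define ISS_DZF(iss) (((iss) >> 1) & 0x1)   /* divide by zero */
    #define ISS_IOF(iss) ((iss) & 0x1)          /* invalid operation */

    /* Return the six trap bits, or 0 when the syndrome is not valid. */
    static uint64_t trapped_fp_flags(uint64_t iss)
    {
        return ISS_TFV(iss) ? (iss & 0x9f) : 0;  /* bits 7 and 4..0 */
    }
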
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>SError interrupt</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="IDS_24_24" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>IDS</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>24</field_lsb>
+        <field_description order="before">
+          
+  <para><arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> syndrome. Possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Bits[23:0] of the ISS field hold the fields described in this encoding.</para>
+<note><para>If the RAS Extension is not implemented, this means that bits[23:0] of the ISS field are <arm-defined-word>RES0</arm-defined-word>.</para></note>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Bits[23:0] of the ISS field hold <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> syndrome information that can be used to provide additional information about the SError interrupt.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <note><para>This field was previously called ISV.</para></note>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_23_14" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>23</field_msb>
+        <field_lsb>14</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="IESB_13_13_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>IESB</field_name>
+        <field_msb>13</field_msb>
+        <field_lsb>13</field_lsb>
+        <field_description order="before">
+          
+  <para>Implicit error synchronization event.</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The SError interrupt was either not synchronized by the implicit error synchronization event or not taken immediately.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The SError interrupt was synchronized by the implicit error synchronization event and taken immediately.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>This field is <arm-defined-word>RES0</arm-defined-word> if the value returned in the DFSC field is not <binarynumber>0b010001</binarynumber>.</para>
+<note><para>Armv8.2 requires the implementation of the RAS Extension and <xref linkend="v8.2.IESB" browsertext="ARMv8.2-IESB" filename="A_introduction_to_the_armv8_architecture.fm"/>.</para></note>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+            <fields_condition>When ARMv8.2-IESB is implemented</fields_condition>
+      </field>
+        <field 
+           id="0_13_13_2" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>13</field_msb>
+        <field_lsb>13</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="AET_12_10" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>AET</field_name>
+        <field_msb>12</field_msb>
+        <field_lsb>10</field_lsb>
+        <field_description order="before">
+          
+  <para>Asynchronous Error Type.</para>
+<para>When the RAS Extension is implemented and DFSC is <binarynumber>0b010001</binarynumber>, describes the state of the PE after taking the SError interrupt exception. The possible values of this field are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b000</field_value>
+        <field_value_description>
+  <para>Uncontainable error (UC).</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b001</field_value>
+        <field_value_description>
+  <para>Unrecoverable error (UEU).</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010</field_value>
+        <field_value_description>
+  <para>Restartable error (UEO).</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b011</field_value>
+        <field_value_description>
+  <para>Recoverable error (UER).</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b110</field_value>
+        <field_value_description>
+  <para>Corrected error (CE).</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>All other values are reserved.</para>
+<para>If multiple errors are taken as a single SError interrupt exception, the overall state of the PE is reported. For example, if both a Recoverable and Unrecoverable error occurred, the state is Unrecoverable.</para>
+<note><para>Software can use this information to determine what recovery might be possible. The recovery software must also examine any implemented fault records to determine the location and extent of the error.</para></note><para>This field is <arm-defined-word>RES0</arm-defined-word> if either:</para>
+<list type="unordered">
+<listitem><content>The RAS Extension is not implemented.</content>
+</listitem><listitem><content>The value returned in the DFSC field is not <binarynumber>0b010001</binarynumber>.</content>
+</listitem></list>
+<note><para>Armv8.2 requires the implementation of the RAS Extension.</para></note>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="EA_9_9" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>EA</field_name>
+        <field_msb>9</field_msb>
+        <field_lsb>9</field_lsb>
+        <field_description order="before">
+          
+  <para>External abort type. When the RAS Extension is implemented, this bit can provide an <arm-defined-word>IMPLEMENTATION DEFINED</arm-defined-word> classification of External aborts.</para>
+<para>For any abort other than an External abort this bit returns a value of 0.</para>
+<para>This field is <arm-defined-word>RES0</arm-defined-word> if either:</para>
+<list type="unordered">
+<listitem><content>The RAS Extension is not implemented.</content>
+</listitem><listitem><content>The value returned in the DFSC field is not <binarynumber>0b010001</binarynumber>.</content>
+</listitem></list>
+<note><para>Armv8.2 requires the implementation of the RAS Extension.</para></note>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_8_6" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>8</field_msb>
+        <field_lsb>6</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="DFSC_5_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>DFSC</field_name>
+        <field_msb>5</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Data Fault Status Code. When the RAS Extension is implemented, possible values of this field are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b000000</field_value>
+        <field_value_description>
+  <para>Uncategorized.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b010001</field_value>
+        <field_value_description>
+  <para>Asynchronous SError interrupt.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>All other values are reserved.</para>
+<para>If the RAS Extension is not implemented, this field is <arm-defined-word>RES0</arm-defined-word>.</para>
+<note><para>Armv8.2 requires the implementation of the RAS Extension.</para></note>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fieldat id="IDS_24_24" msb="24" lsb="24"/>
+        <fieldat id="0_23_14" msb="23" lsb="14"/>
+        <fieldat id="IESB_13_13_1" msb="13" lsb="13"/>
+        <fieldat id="AET_12_10" msb="12" lsb="10"/>
+        <fieldat id="EA_9_9" msb="9" lsb="9"/>
+        <fieldat id="0_8_6" msb="8" lsb="6"/>
+        <fieldat id="DFSC_5_0" msb="5" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
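Putting the SError ISS fields together, a hedged Python sketch of a decoder; the AET mnemonics come from the value descriptions above, and the function name is hypothetical:

    # Sketch only: unpack the SError interrupt ISS encoding described above.
    AET_NAMES = {0b000: "UC", 0b001: "UEU", 0b010: "UEO", 0b011: "UER", 0b110: "CE"}

    def decode_serror_iss(iss):
        if (iss >> 24) & 1:                # IDS: IMPLEMENTATION DEFINED syndrome
            return {"IDS": 1, "impdef": iss & 0xFFFFFF}
        dfsc = iss & 0x3F                  # 0b010001 = asynchronous SError interrupt
        return {"IDS": 0,
                "IESB": (iss >> 13) & 1,
                "AET": AET_NAMES.get((iss >> 10) & 0x7, "reserved"),
                "EA": (iss >> 9) & 1,
                "DFSC": dfsc}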
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from a Breakpoint or Vector Catch debug exception</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_6" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>6</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="IFSC_5_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>IFSC</field_name>
+        <field_msb>5</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Instruction Fault Status Code. This field is set to <binarynumber>0b100010</binarynumber>, to indicate a Debug exception.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>For more information about generating these exceptions:</para>
+<list type="unordered">
+<listitem><content>For exceptions from AArch64, see <xref linkend="BCGGEABJ" browsertext="'Breakpoint exceptions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D2 (AArch64 Self-hosted Debug)" filename="D_debug_exceptions"/>.</content>
+</listitem><listitem><content>For exceptions from AArch32, see <xref linkend="BGBDJAJB" browsertext="'Breakpoint exceptions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section G2 (AArch32 Self-hosted Debug)" filename="G_aarch32_self_hosted_debug"/> and <xref linkend="G2BCGJGBCC" browsertext="'Vector Catch exceptions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section G2" filename="G_aarch32_self_hosted_debug"/>.</content>
+</listitem></list>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fieldat id="0_24_6" msb="24" lsb="6"/>
+        <fieldat id="IFSC_5_0" msb="5" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from a Software Step exception</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="ISV_24_24" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>ISV</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>24</field_lsb>
+        <field_description order="before">
+          
+  <para>Instruction syndrome valid. Indicates whether the EX bit, ISS[6], is valid, as follows:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>EX bit is <arm-defined-word>RES0</arm-defined-word>.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>EX bit is valid.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>See the EX bit description for more information.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_23_7" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>23</field_msb>
+        <field_lsb>7</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="EX_6_6" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>EX</field_name>
+        <field_msb>6</field_msb>
+        <field_lsb>6</field_lsb>
+        <field_description order="before">
+          
+  <para>Exclusive operation. If the ISV bit is set to 1, this bit indicates whether a Load-Exclusive instruction was stepped.</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>An instruction other than a Load-Exclusive instruction was stepped.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>A Load-Exclusive instruction was stepped.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>If the ISV bit is set to 0, this bit is <arm-defined-word>RES0</arm-defined-word>, indicating no syndrome data is available.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="IFSC_5_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>IFSC</field_name>
+        <field_msb>5</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Instruction Fault Status Code. This field is set to <binarynumber>0b100010</binarynumber>, to indicate a Debug exception.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>For more information about generating these exceptions, see <xref linkend="BCGIIDAJ" browsertext="'Software Step exceptions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D2 (AArch64 Self-hosted Debug)" filename="D_debug_exceptions"/>.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fieldat id="ISV_24_24" msb="24" lsb="24"/>
+        <fieldat id="0_23_7" msb="23" lsb="7"/>
+        <fieldat id="EX_6_6" msb="6" lsb="6"/>
+        <fieldat id="IFSC_5_0" msb="5" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
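A corresponding sketch for the Software Step encoding, where EX is only meaningful when ISV is set (names are the sketch's own):

    # Sketch only: Software Step ISS. IFSC is always 0b100010 for a Debug exception.
    def decode_soft_step_iss(iss):
        isv = (iss >> 24) & 1
        ex = (iss >> 6) & 1
        return {"ISV": isv,
                "stepped_load_exclusive": bool(ex) if isv else None,
                "IFSC": iss & 0x3F}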
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from a Watchpoint exception</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_14" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>14</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="VNCR_13_13_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>VNCR</field_name>
+        <field_msb>13</field_msb>
+        <field_lsb>13</field_lsb>
+        <field_description order="before">
+          
+  <para>Indicates that the watchpoint came from the use of the <register_link state="AArch64" id="AArch64-vncr_el2.xml">VNCR_EL2</register_link> register by EL1 code.</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The watchpoint was not generated by the use of <register_link state="AArch64" id="AArch64-vncr_el2.xml">VNCR_EL2</register_link> by EL1 code.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The watchpoint was generated by the use of <register_link state="AArch64" id="AArch64-vncr_el2.xml">VNCR_EL2</register_link> by EL1 code.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>This field is 0 in ESR_EL1.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+            <fields_condition>When ARMv8.4-NV is implemented</fields_condition>
+      </field>
+        <field 
+           id="0_13_13_2" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>13</field_msb>
+        <field_lsb>13</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="0_12_9" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>12</field_msb>
+        <field_lsb>9</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="CM_8_8" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>CM</field_name>
+        <field_msb>8</field_msb>
+        <field_lsb>8</field_lsb>
+        <field_description order="before">
+          
+  <para>Cache maintenance. Indicates whether the Watchpoint exception came from a cache maintenance or address translation instruction:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>The Watchpoint exception was not generated by the execution of one of the System instructions identified in the description of value 1.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>The Watchpoint exception was generated by either the execution of a cache maintenance instruction or by a synchronous Watchpoint exception on the execution of an address translation instruction. The <register_link id="AArch64-dc-zva.xml" state="AArch64">DC ZVA</register_link> instruction is not classified as a cache maintenance instruction, and therefore its execution cannot cause this field to be set to 1.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="0_7_7" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>7</field_msb>
+        <field_lsb>7</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="WnR_6_6" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>WnR</field_name>
+        <field_msb>6</field_msb>
+        <field_lsb>6</field_lsb>
+        <field_description order="before">
+          
+  <para>Write not Read. Indicates whether the Watchpoint exception was caused by an instruction writing to a memory location, or by an instruction reading from a memory location. The possible values of this bit are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>Watchpoint exception caused by an instruction reading from a memory location.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>Watchpoint exception caused by an instruction writing to a memory location.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>For Watchpoint exceptions on cache maintenance and address translation instructions, this bit always returns a value of 1.</para>
+<para>For Watchpoint exceptions from an atomic instruction, this field is set to 0 if a read of the location would have generated the Watchpoint exception, otherwise it is set to 1.</para>
+<para>If multiple watchpoints match on the same access, it is <arm-defined-word>UNPREDICTABLE</arm-defined-word> which watchpoint generates the Watchpoint exception.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="DFSC_5_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>DFSC</field_name>
+        <field_msb>5</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Data Fault Status Code. This field is set to <binarynumber>0b100010</binarynumber>, to indicate a Debug exception.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>For more information about generating these exceptions, see <xref linkend="BCGGECBJ" browsertext="'Watchpoint exceptions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D2 (AArch64 Self-hosted Debug)" filename="D_debug_exceptions"/>.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fieldat id="0_24_14" msb="24" lsb="14"/>
+        <fieldat id="VNCR_13_13_1" msb="13" lsb="13"/>
+        <fieldat id="0_12_9" msb="12" lsb="9"/>
+        <fieldat id="CM_8_8" msb="8" lsb="8"/>
+        <fieldat id="0_7_7" msb="7" lsb="7"/>
+        <fieldat id="WnR_6_6" msb="6" lsb="6"/>
+        <fieldat id="DFSC_5_0" msb="5" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
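The Watchpoint encoding reduces to four live fields; a hedged sketch (function name hypothetical):

    # Sketch only: Watchpoint ISS fields per the layout above.
    def decode_watchpoint_iss(iss):
        return {"VNCR": (iss >> 13) & 1,   # access made through VNCR_EL2 (ARMv8.4-NV)
                "CM":   (iss >> 8) & 1,    # cache maintenance/address translation instruction
                "WnR":  (iss >> 6) & 1,    # 1 = write access, 0 = read access
                "DFSC": iss & 0x3F}        # 0b100010 for a Debug exception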
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from execution of a Breakpoint instruction</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_16" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>16</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="Comment_15_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>Comment</field_name>
+        <field_msb>15</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Set to the instruction comment field value, zero extended as necessary. For the AArch32 BKPT instructions, the comment field is described as the immediate field.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>For more information about generating these exceptions, see <xref linkend="BCGIEHAG" browsertext="'Breakpoint instruction exceptions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D2 (AArch64 Self-hosted Debug)" filename="D_debug_exceptions"/>.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fieldat id="0_24_16" msb="24" lsb="16"/>
+        <fieldat id="Comment_15_0" msb="15" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
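Since everything above bit 15 is RES0 in this encoding, extracting the BRK/BKPT immediate is a single mask (sketch only):

    # Sketch only: ISS[15:0] carries the zero-extended comment/immediate field.
    def brk_comment(iss):
        return iss & 0xFFFF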
+            <partial_fieldset>
+              <fields length="25">
+      <fields_condition>When ARMv8.3-NV is implemented</fields_condition>
+      <fields_instance>Exception from ERET, ERETAA or ERETAB instruction</fields_instance>
+    <text_before_fields>
+      
+  <para>This EC value only applies when <register_link state="AArch64" id="AArch64-hcr_el2.xml">HCR_EL2</register_link>.NV is 1.</para>
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_2" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>2</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="ERET_1_1" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>ERET</field_name>
+        <field_msb>1</field_msb>
+        <field_lsb>1</field_lsb>
+        <field_description order="before">
+          
+  <para>Indicates whether an ERET or ERETA* instruction was trapped to EL2. Possible values are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>ERET instruction trapped to EL2.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>ERETAA or ERETAB instruction trapped to EL2.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>If this bit is 0, the ERETA field is <arm-defined-word>RES0</arm-defined-word>.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+        <field 
+           id="ERETA_0_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>ERETA</field_name>
+        <field_msb>0</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>Indicates whether an ERETAA or ERETAB instruction was trapped to EL2. Possible values are:</para>
+
+        </field_description>
+        <field_values>
+            
+
+                <field_value_instance>
+            <field_value>0b0</field_value>
+        <field_value_description>
+  <para>ERETAA instruction trapped to EL2.</para>
+</field_value_description>
+    </field_value_instance>
+                <field_value_instance>
+            <field_value>0b1</field_value>
+        <field_value_description>
+  <para>ERETAB instruction trapped to EL2.</para>
+</field_value_description>
+    </field_value_instance>
+        </field_values>
+            <field_description order="after">
+              
+  <para>When the ERET field is 0, this bit is <arm-defined-word>RES0</arm-defined-word>.</para>
+
+            </field_description>
+          <field_resets>
+  
+    <field_reset>
+        
+      <field_reset_standard_text>U</field_reset_standard_text>
+  
+    </field_reset>
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>For more information about generating these exceptions, see <xref linkend="CHDCFJDF" browsertext="'Traps to EL2 for Nested virtualization' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fields_condition>When ARMv8.3-NV is implemented</fields_condition>
+        <fieldat id="0_24_2" msb="24" lsb="2"/>
+        <fieldat id="ERET_1_1" msb="1" lsb="1"/>
+        <fieldat id="ERETA_0_0" msb="0" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
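A sketch classifying the trapped exception-return instruction from the two defined bits (names hypothetical):

    # Sketch only: ERET (bit 1) selects ERET vs ERETA*; ERETA (bit 0) then
    # distinguishes ERETAA from ERETAB and is RES0 when ERET is 0.
    def trapped_eret_kind(iss):
        if not ((iss >> 1) & 1):
            return "ERET"
        return "ERETAB" if iss & 1 else "ERETAA"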
+            <partial_fieldset>
+              <fields length="25">
+      <fields_condition>When ARMv8.5-BTI is implemented</fields_condition>
+      <fields_instance>Exception from Branch Target Identification instruction</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_2" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>2</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+        <field 
+           id="BTYPE_1_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+        >
+          <field_name>BTYPE</field_name>
+        <field_msb>1</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+          
+  <para>This field is set to the PSTATE.BTYPE value that generated the Branch Target Exception.</para>
+
+        </field_description>
+        <field_values>
+            
+
+        </field_values>
+          <field_resets>
+  
+</field_resets>
+      </field>
+    <text_after_fields>
+    
+  <para>For more information about generating these exceptions, see <xref browsertext="'The AArch64 application level programmers' model' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section B1" filename="B_the_aarch64_application_level_programmers_model.fm" linkend="BEIBJCGI"></xref>.</para>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fields_condition>When ARMv8.5-BTI is implemented</fields_condition>
+        <fieldat id="0_24_2" msb="24" lsb="2"/>
+        <fieldat id="BTYPE_1_0" msb="1" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
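Here the whole syndrome is the two BTYPE bits (sketch only):

    # Sketch only: ISS[1:0] is the PSTATE.BTYPE value that generated the
    # Branch Target Exception; ISS[24:2] is RES0.
    def btype_from_iss(iss):
        return iss & 0x3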
+            <partial_fieldset>
+              <fields length="25">
+      <fields_instance>Exception from a Pointer Authentication instruction when HCR_EL2.API == 0 || SCR_EL3.API == 0</fields_instance>
+    <text_before_fields>
+      
+  
+
+    </text_before_fields>
+    
+        <field 
+           id="0_24_0" 
+           is_variable_length="False" 
+           has_partial_fieldset="False" 
+           is_linked_to_partial_fieldset="False" 
+           is_access_restriction_possible="False" 
+           is_constant_value="False" 
+           rwtype="RES0"
+        >
+          <field_name>0</field_name>
+        <field_msb>24</field_msb>
+        <field_lsb>0</field_lsb>
+        <field_description order="before">
+            <para>Reserved, <arm-defined-word>RES0</arm-defined-word>.</para>
+        </field_description>
+        <field_values>
+        </field_values>
+      </field>
+    <text_after_fields>
+    
+  <para>For more information about generating these exceptions, see:</para>
+<list type="unordered">
+<listitem><content><xref linkend="CHDGDDCJ" browsertext="'Trap to EL2 Non-secure EL0 accesses to Pointer authentication key registers' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem><listitem><content><xref linkend="CHDIGBED" browsertext="'Trap to EL3 accesses to Pointer authentication instructions' in the Arm® Architecture Reference Manual, Armv8, for Armv8-A architecture profile, section D1" filename="D_the_aarch64_system_level_programmers_model"/>.</content>
+</listitem></list>
+
+    </text_after_fields>
+  </fields>
+              <reg_fieldset length="25">
+        <fieldat id="0_24_0" msb="24" lsb="0"/>
+    </reg_fieldset>
+            </partial_fieldset>
+      </field>
+    <text_after_fields>
+    
+  
+
+    </text_after_fields>
+  </fields>
+  <reg_fieldset length="64">
+        <fieldat id="0_63_32" msb="63" lsb="32"/>
+        <fieldat id="EC_31_26" msb="31" lsb="26"/>
+        <fieldat id="IL_25_25" msb="25" lsb="25"/>
+        <fieldat id="ISS_24_0" msb="24" lsb="0"/>
+    </reg_fieldset>
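At the top level the register splits into three fields; a hedged Python sketch of that split (helper name hypothetical):

    # Sketch only: top-level ESR_EL1 decomposition per the fieldset above.
    def split_esr(esr):
        return {"EC": (esr >> 26) & 0x3F,   # exception class, selects the ISS encoding
                "IL": (esr >> 25) & 1,      # 32-bit (1) vs 16-bit (0) trapped instruction
                "ISS": esr & 0x1FFFFFF}     # 25-bit instruction-specific syndrome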
+
+      </reg_fieldsets>
+      
+
+
+<access_mechanisms>
+  
+    
+      <access_permission_text>
+        <para>When <register_link state="AArch64" id="AArch64-hcr_el2.xml">HCR_EL2</register_link>.E2H is 1, without explicit synchronization, access from EL3 using the mnemonic ESR_EL1 or ESR_EL12 are not guaranteed to be ordered with respect to accesses using the other mnemonic.</para>
+      </access_permission_text>
+
+
+      <access_mechanism accessor="MRS ESR_EL1">
+        <encoding>
+          
+          <access_instruction>MRS &lt;Xt&gt;, ESR_EL1</access_instruction>
+            
+            <enc n="op0" v="0b11"/>
+            
+            <enc n="op1" v="0b000"/>
+            
+            <enc n="CRn" v="0b0101"/>
+            
+            <enc n="CRm" v="0b0010"/>
+            
+            <enc n="op2" v="0b000"/>
+        </encoding>
+          <access_permission>
+            <ps name="MRS" sections="1" secttype="access_permission">
+              <pstext>
+if PSTATE.EL == EL0 then
+    UNDEFINED;
+elsif PSTATE.EL == EL1 then
+    if EL2Enabled() &amp;&amp; !ELUsingAArch32(EL2) &amp;&amp; HCR_EL2.TRVM == '1' then
+        AArch64.SystemAccessTrap(EL2, 0x18);
+    elsif EL2Enabled() &amp;&amp; !ELUsingAArch32(EL2) &amp;&amp; HCR_EL2.&lt;NV2,NV1,NV&gt; == '111' then
+        return NVMem[0x138];
+    else
+        return ESR_EL1;
+elsif PSTATE.EL == EL2 then
+    if HCR_EL2.E2H == '1' then
+        return ESR_EL2;
+    else
+        return ESR_EL1;
+elsif PSTATE.EL == EL3 then
+    return ESR_EL1;
+              </pstext>
+            </ps>
+          </access_permission>
+      </access_mechanism>
+      <access_mechanism accessor="MSRregister ESR_EL1">
+        <encoding>
+          
+          <access_instruction>MSR ESR_EL1, &lt;Xt&gt;</access_instruction>
+            
+            <enc n="op0" v="0b11"/>
+            
+            <enc n="op1" v="0b000"/>
+            
+            <enc n="CRn" v="0b0101"/>
+            
+            <enc n="CRm" v="0b0010"/>
+            
+            <enc n="op2" v="0b000"/>
+        </encoding>
+          <access_permission>
+            <ps name="MSRregister" sections="1" secttype="access_permission">
+              <pstext>
+if PSTATE.EL == EL0 then
+    UNDEFINED;
+elsif PSTATE.EL == EL1 then
+    if EL2Enabled() &amp;&amp; !ELUsingAArch32(EL2) &amp;&amp; HCR_EL2.TVM == '1' then
+        AArch64.SystemAccessTrap(EL2, 0x18);
+    elsif EL2Enabled() &amp;&amp; !ELUsingAArch32(EL2) &amp;&amp; HCR_EL2.&lt;NV2,NV1,NV&gt; == '111' then
+        NVMem[0x138] = X[t];
+    else
+        ESR_EL1 = X[t];
+elsif PSTATE.EL == EL2 then
+    if HCR_EL2.E2H == '1' then
+        ESR_EL2 = X[t];
+    else
+        ESR_EL1 = X[t];
+elsif PSTATE.EL == EL3 then
+    ESR_EL1 = X[t];
+              </pstext>
+            </ps>
+          </access_permission>
+      </access_mechanism>
+      <access_mechanism accessor="MRS ESR_EL12">
+        <encoding>
+          
+          <access_instruction>MRS &lt;Xt&gt;, ESR_EL12</access_instruction>
+            
+            <enc n="op0" v="0b11"/>
+            
+            <enc n="op1" v="0b101"/>
+            
+            <enc n="CRn" v="0b0101"/>
+            
+            <enc n="CRm" v="0b0010"/>
+            
+            <enc n="op2" v="0b000"/>
+        </encoding>
+          <access_permission>
+            <ps name="MRS" sections="1" secttype="access_permission">
+              <pstext>
+if PSTATE.EL == EL0 then
+    UNDEFINED;
+elsif PSTATE.EL == EL1 then
+    if EL2Enabled() &amp;&amp; HCR_EL2.&lt;NV2,NV1,NV&gt; == '101' then
+        return NVMem[0x138];
+    elsif EL2Enabled() &amp;&amp; HCR_EL2.NV == '1' then
+        AArch64.SystemAccessTrap(EL2, 0x18);
+    else
+        UNDEFINED;
+elsif PSTATE.EL == EL2 then
+    if EL2Enabled() &amp;&amp; HCR_EL2.E2H == '1' then
+        return ESR_EL1;
+    else
+        UNDEFINED;
+elsif PSTATE.EL == EL3 then
+    if EL2Enabled() &amp;&amp; HCR_EL2.E2H == '1' then
+        return ESR_EL1;
+    else
+        UNDEFINED;
+              </pstext>
+            </ps>
+          </access_permission>
+      </access_mechanism>
+      <access_mechanism accessor="MSRregister ESR_EL12">
+        <encoding>
+          
+          <access_instruction>MSR ESR_EL12, &lt;Xt&gt;</access_instruction>
+            
+            <enc n="op0" v="0b11"/>
+            
+            <enc n="op1" v="0b101"/>
+            
+            <enc n="CRn" v="0b0101"/>
+            
+            <enc n="CRm" v="0b0010"/>
+            
+            <enc n="op2" v="0b000"/>
+        </encoding>
+          <access_permission>
+            <ps name="MSRregister" sections="1" secttype="access_permission">
+              <pstext>
+if PSTATE.EL == EL0 then
+    UNDEFINED;
+elsif PSTATE.EL == EL1 then
+    if EL2Enabled() &amp;&amp; HCR_EL2.&lt;NV2,NV1,NV&gt; == '101' then
+        NVMem[0x138] = X[t];
+    elsif EL2Enabled() &amp;&amp; HCR_EL2.NV == '1' then
+        AArch64.SystemAccessTrap(EL2, 0x18);
+    else
+        UNDEFINED;
+elsif PSTATE.EL == EL2 then
+    if EL2Enabled() &amp;&amp; HCR_EL2.E2H == '1' then
+        ESR_EL1 = X[t];
+    else
+        UNDEFINED;
+elsif PSTATE.EL == EL3 then
+    if EL2Enabled() &amp;&amp; HCR_EL2.E2H == '1' then
+        ESR_EL1 = X[t];
+    else
+        UNDEFINED;
+              </pstext>
+            </ps>
+          </access_permission>
+      </access_mechanism>
+      <access_mechanism accessor="MRS ESR_EL2">
+        <encoding>
+          
+          <access_instruction>MRS &lt;Xt&gt;, ESR_EL2</access_instruction>
+            
+            <enc n="op0" v="0b11"/>
+            
+            <enc n="op1" v="0b100"/>
+            
+            <enc n="CRn" v="0b0101"/>
+            
+            <enc n="CRm" v="0b0010"/>
+            
+            <enc n="op2" v="0b000"/>
+        </encoding>
+          <access_permission>
+            <ps name="MRS" sections="1" secttype="access_permission">
+              <pstext>
+if PSTATE.EL == EL0 then
+    UNDEFINED;
+elsif PSTATE.EL == EL1 then
+    if EL2Enabled() &amp;&amp; HCR_EL2.&lt;NV2,NV&gt; == '11' then
+        return ESR_EL1;
+    elsif EL2Enabled() &amp;&amp; HCR_EL2.NV == '1' then
+        AArch64.SystemAccessTrap(EL2, 0x18);
+    else
+        UNDEFINED;
+elsif PSTATE.EL == EL2 then
+    return ESR_EL2;
+elsif PSTATE.EL == EL3 then
+    return ESR_EL2;
+              </pstext>
+            </ps>
+          </access_permission>
+      </access_mechanism>
+      <access_mechanism accessor="MSRregister ESR_EL2">
+        <encoding>
+          
+          <access_instruction>MSR ESR_EL2, &lt;Xt&gt;</access_instruction>
+            
+            <enc n="op0" v="0b11"/>
+            
+            <enc n="op1" v="0b100"/>
+            
+            <enc n="CRn" v="0b0101"/>
+            
+            <enc n="CRm" v="0b0010"/>
+            
+            <enc n="op2" v="0b000"/>
+        </encoding>
+          <access_permission>
+            <ps name="MSRregister" sections="1" secttype="access_permission">
+              <pstext>
+if PSTATE.EL == EL0 then
+    UNDEFINED;
+elsif PSTATE.EL == EL1 then
+    if EL2Enabled() &amp;&amp; HCR_EL2.&lt;NV2,NV&gt; == '11' then
+        ESR_EL1 = X[t];
+    elsif EL2Enabled() &amp;&amp; HCR_EL2.NV == '1' then
+        AArch64.SystemAccessTrap(EL2, 0x18);
+    else
+        UNDEFINED;
+elsif PSTATE.EL == EL2 then
+    ESR_EL2 = X[t];
+elsif PSTATE.EL == EL3 then
+    ESR_EL2 = X[t];
+              </pstext>
+            </ps>
+          </access_permission>
+      </access_mechanism>
+</access_mechanisms>
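The op0/op1/CRn/CRm/op2 values in each encoding block pack into the conventional 16-bit system-register number; a sketch of that packing, where the 0xC290 result for ESR_EL1 follows from the encodings listed above:

    # Sketch only: pack (op0, op1, CRn, CRm, op2) as op0[15:14] op1[13:11]
    # CRn[10:7] CRm[6:3] op2[2:0].
    def sysreg_encoding(op0, op1, crn, crm, op2):
        return (op0 << 14) | (op1 << 11) | (crn << 7) | (crm << 3) | op2

    assert sysreg_encoding(0b11, 0b000, 0b0101, 0b0010, 0b000) == 0xC290  # ESR_EL1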
+
+      <arch_variants>
+      </arch_variants>
+  </register>
+</registers>
+
+    <timestamp>27/03/2019 21:59; e5e4db499bf9867a4b93324c4dbac985d3da9376</timestamp>
+</register_page>
\ No newline at end of file
index 372e13ec4f408b579f6a1c62823925496867646a..1f7c731d6a20c2da0516e4280d3392aa64b2a2f1 100755 (executable)
@@ -15,22 +15,24 @@ def GetTurnstileSummary(turnstile):
     type_and_gencount = Cast(addressof(turnstile.ts_type_gencount), 'union turnstile_type_gencount *')
     turnstile_type = ""
 
-    if type_and_gencount.ts_type == 0:
+    if type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_NONE'):
       turnstile_type = "none   "
-    elif type_and_gencount.ts_type == 1:
+    elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_KERNEL_MUTEX'):
       turnstile_type = "knl_mtx"
-    elif type_and_gencount.ts_type == 2:
+    elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_ULOCK'):
       turnstile_type = "ulock  "
-    elif type_and_gencount.ts_type == 3:
+    elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_PTHREAD_MUTEX'):
       turnstile_type = "pth_mtx"
-    elif type_and_gencount.ts_type == 4:
+    elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_SYNC_IPC'):
       turnstile_type = "syn_ipc"
-    elif type_and_gencount.ts_type == 5:
+    elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_WORKLOOPS'):
       turnstile_type = "kqwl   "
-    elif type_and_gencount.ts_type == 6:
+    elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_WORKQS'):
       turnstile_type = "workq  "
-    elif type_and_gencount.ts_type == 7:
+    elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_KNOTE'):
       turnstile_type = "knote  "
+    elif type_and_gencount.ts_type == GetEnumValue('turnstile_type_t::TURNSTILE_SLEEP_INHERITOR'):
+      turnstile_type = "slp_inh"
 
     turnstile_state = ""
     if turnstile.ts_state & 0x1:
@@ -144,4 +146,32 @@ def ShowAllTurnstiles(cmd_args=None, cmd_options={}):
         PrintTurnstile(turnstile)
     return True
 # EndMacro showallbusyturnstiles
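The chain above maps each turnstile_type_t enumerator to a fixed-width label one case at a time; a table-driven sketch of the same mapping (not the committed macro; it reuses GetEnumValue from these lldb macros):

    # Sketch only: dict-driven equivalent of the elif chain above.
    TURNSTILE_TYPE_LABELS = {
        'TURNSTILE_NONE':            "none   ",
        'TURNSTILE_KERNEL_MUTEX':    "knl_mtx",
        'TURNSTILE_ULOCK':           "ulock  ",
        'TURNSTILE_PTHREAD_MUTEX':   "pth_mtx",
        'TURNSTILE_SYNC_IPC':        "syn_ipc",
        'TURNSTILE_WORKLOOPS':       "kqwl   ",
        'TURNSTILE_WORKQS':          "workq  ",
        'TURNSTILE_KNOTE':           "knote  ",
        'TURNSTILE_SLEEP_INHERITOR': "slp_inh",
    }

    def GetTurnstileTypeLabel(ts_type):
        for name, label in TURNSTILE_TYPE_LABELS.items():
            if ts_type == GetEnumValue('turnstile_type_t::' + name):
                return label
        return ""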
+
+@lldb_command('showthreadbaseturnstiles', fancy=True)
+def ShowThreadInheritorBase(cmd_args=None, cmd_options={}, O=None):
+    """ A DEVELOPMENT macro that walks the list of userspace turnstiles pushing on a thread
+        and prints them.
+        usage: (lldb) showthreadbaseturnstiles thread_pointer
+    """
+    if not cmd_args:
+        return O.error('invalid thread pointer')
+
+    thread = kern.GetValueFromAddress(cmd_args[0], "thread_t")
+    with O.table(GetTurnstileSummary.header):
+        for turnstile in IteratePriorityQueue(thread.base_inheritor_queue, 'struct turnstile', 'ts_inheritor_links'):
+            PrintTurnstile(turnstile)
+
+@lldb_command('showthreadschedturnstiles', fancy=True)
+def ShowThreadInheritorSched(cmd_args=None, cmd_options={}, O=None):
+    """ A DEVELOPMENT macro that walks the list of kernelspace turnstiles pushing on a thread
+        and prints them.
+        usage: (lldb) showthreadschedturnstiles thread_pointer
+    """
+    if not cmd_args:
+        return O.error('invalid thread pointer')
+
+    thread = kern.GetValueFromAddress(cmd_args[0], "thread_t")
+    with O.table(GetTurnstileSummary.header):
+        for turnstile in IteratePriorityQueue(thread.sched_inheritor_queue, 'struct turnstile', 'ts_inheritor_links'):
+            PrintTurnstile(turnstile)
 #endif
diff --git a/tools/lldbmacros/ulock.py b/tools/lldbmacros/ulock.py
new file mode 100755 (executable)
index 0000000..e2f9b45
--- /dev/null
@@ -0,0 +1,45 @@
+from xnu import *
+from scheduler import GetRecentTimestamp
+import xnudefines
+
+ulock_types = {
+    1: "COMPARE_AND_WAIT",
+    2: "UNFAIR_LOCK",
+    3: "UNFAIR_LOCK64_SHARED",
+    4: "COMPARE_AND_WAIT64",
+    5: "COMPARE_AND_WAIT64_SHARED"
+}
+
+@header("{:<20s} {:<20s} {:<20s} {:<10s} {:<20s} {:<20s} {:<20s}".format(
+    'ull_t', 'kind', 'addr/obj', 'pid/offs', 'owner', 'turnstile', 'waiters'))
+def GetUlockSummary(ull):
+    code = int(ull.ull_opcode)
+    ull_type = ulock_types.get(code, "{:#x}".format(code))
+
+    s = "{ull: <#20x} {ull_type: <20s}".format(ull=ull, ull_type=ull_type)
+    ulk = ull.ull_key
+    if int(ulk.ulk_key_type) == 1:
+        s += " {ulk.ulk_addr: <#20x} {ulk.ulk_pid: <10d}".format(ulk=ulk)
+    elif int(ulk.ulk_key_type) == 2:
+        s += " {ulk.ulk_object: <#20x} {ulk.ulk_offset: <10d}".format(ulk=ulk)
+    else:
+        s += " {:<20s} {:<10s}".format("", "")
+
+    return s + " {ull.ull_owner: <#20x} {ull.ull_turnstile: <#20x} {ull.ull_nwaiters: >7d}".format(ull=ull)
+
+@lldb_command('showallulocks', fancy=True)
+def ShowAllUlocks(cmd_args=None, cmd_options={}, O=None):
+    """ Display a summary of all the ulocks in the system
+
+        usage: showallulocks
+    """
+
+    with O.table(GetUlockSummary.header):
+        count = kern.globals.ull_hash_buckets
+        buckets = kern.globals.ull_bucket
+        for i in xrange(0, count):
+            for ull in IterateLinkageChain(addressof(buckets[i].ulb_head), 'ull_t *', 'ull_hash_link'):
+                print GetUlockSummary(ull)
index 3413fff961331a0201d26d132274dc7f76200449..f8844b3dc9583a6fe88145b44c5429d08a99a383 100755 (executable)
@@ -77,6 +77,7 @@ def ShowX86UserStack(thread, user_lib_info = None):
     return
 
 def _PrintARMUserStack(task, cur_pc, cur_fp, framesize, frametype, frameformat, user_lib_info=None):
+    cur_pc = kern.StripUserPAC(cur_pc)
     if cur_pc == 0:
         "No valid user context for this activation."
         return
@@ -87,6 +88,7 @@ def _PrintARMUserStack(task, cur_pc, cur_fp, framesize, frametype, frameformat,
         frame = GetUserDataAsString(task, cur_fp, framesize)
         cur_fp = _ExtractDataFromString(frame, 0, frametype)
         cur_pc = _ExtractDataFromString(frame, (framesize / 2), frametype)
+        cur_pc = kern.StripUserPAC(cur_pc)
         if not cur_fp:
             break
         print frameformat.format(frameno, cur_fp, cur_pc, GetBinaryNameForPC(cur_pc, user_lib_info))
@@ -864,6 +866,38 @@ def ShowOSMalloc(cmd_args=None):
 
 # EndMacro: showosmalloc
 
+def SaveDataToFile(start_addr, length, outputfile, task=None):
+    """ Save the data at the specified address (of the specified length) to the file.
+        params: start_addr : start address of the region of memory to save
+                length : length of the region of memory to save
+                outputfile : file to save the data in
+                task (optional) : task containing the memory region (if from user data)
+        returns: True if we saved the requested data, False otherwise
+    """
+    if task:
+        memory_data = GetUserDataAsString(task, start_addr, length)
+    else:
+        data_ptr = kern.GetValueFromAddress(start_addr, 'uint8_t *')
+        if data_ptr == 0:
+            print "invalid kernel start address specified"
+            return False
+        memory_data = []
+        for i in range(length):
+            memory_data.append(chr(data_ptr[i]))
+            if i % 50000 == 0:
+                print "%d of %d            \r" % (i, length),
+        memory_data = ''.join(memory_data)
+
+    if len(memory_data) != length:
+        print "Failed to read {:d} bytes from address {: <#020x}".format(length, start_addr)
+        return False
+
+    fh = open(outputfile, 'w')
+    fh.write(memory_data)
+    fh.close()
+    print "Saved {:d} bytes to file {:s}".format(length, outputfile)
+    return True
+
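A hypothetical invocation of the new helper (the addresses, length, output paths, and some_task below are made up for illustration):

    # Dump 4 KiB of kernel memory, then 4 KiB from a user task's address space.
    SaveDataToFile(0xfffffff007004000, 0x1000, "/tmp/kernel_bytes.bin")
    SaveDataToFile(0x0000000100e40000, 0x1000, "/tmp/user_bytes.bin", task=some_task)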
 
 @lldb_command('savekcdata', 'T:O:')
 def SaveKCDataToFile(cmd_args=None, cmd_options={}):
@@ -891,28 +925,6 @@ def SaveKCDataToFile(cmd_args=None, cmd_options={}):
     if flags_copyout:
         if not task:
             raise ArgumentError('Invalid task pointer provided.')
-        memory_data = GetUserDataAsString(task, memory_begin_address, memory_size)
+        return SaveDataToFile(memory_begin_address, memory_size, outputfile, task)
     else:
-        data_ptr = kern.GetValueFromAddress(memory_begin_address, 'uint8_t *')
-        if data_ptr == 0:
-            print "Kcdata descriptor is NULL"
-            return False
-        memory_data = []
-        for i in range(memory_size):
-            memory_data.append(chr(data_ptr[i]))
-            if i % 50000 == 0:
-                print "%d of %d            \r" % (i, memory_size),
-        memory_data = ''.join(memory_data)
-
-    if len(memory_data) != memory_size:
-        print "Failed to read {:d} bytes from address {: <#020x}".format(memory_size, memory_begin_address)
-        return False
-
-    fh = open(outputfile, 'w')
-    fh.write(memory_data)
-    fh.close()
-    print "Saved {:d} bytes to file {:s}".format(memory_size, outputfile)
-    return True
-
-
-
+        return SaveDataToFile(memory_begin_address, memory_size, outputfile, None)
index 74e54223e0d49b65278c01464ebfc4730bd16501..a4a9a61b2b441d32c8ec6f9c9ccdcb574ad9d6c6 100755 (executable)
@@ -10,10 +10,13 @@ CPU_TYPE_I386 = 0x00000007
 CPU_TYPE_X86_64 = 0x01000007
 CPU_TYPE_ARM = 0x0000000c
 CPU_TYPE_ARM64 = 0x0100000c
+CPU_TYPE_ARM64_32 = 0x0200000c
 
 def GetRegisterSetForCPU(cputype, subtype):
     if cputype == CPU_TYPE_ARM64:
         retval = Armv8_RegisterSet
+    elif cputype == CPU_TYPE_ARM64_32:
+        retval = Armv8_RegisterSet
     elif cputype == CPU_TYPE_ARM:
         retval = Armv7_RegisterSet
     elif cputype == CPU_TYPE_I386:
@@ -52,6 +55,9 @@ class UserThreadObject(object):
                     self.saved_state = self.thread.machine.PcbData
                 else:
                     self.saved_state = self.thread.machine.contextData.ss.uss.ss_32
+            if cputype == CPU_TYPE_ARM64_32:
+                self.reg_type = "arm64"
+                self.saved_state = self.thread.machine.upcb.uss.ss_64
 
         logging.debug("created thread id 0x%x of type %s, is_kern_64bit 0x%x cputype 0x%x"
                       % (self.thread_id, self.reg_type, is_kern_64bit, cputype))
@@ -101,8 +107,7 @@ class UserProcess(target.Process):
         if task.t_flags & 0x2:
             dataregisters64bit = True
 
-        is_kern_64bit = kern.arch in ['x86_64', 'x86_64h', 'arm64'
-        ]
+        is_kern_64bit = kern.arch in ['x86_64', 'x86_64h', 'arm64', 'arm64e']
 
         self.cputype = unsigned(self.proc.p_cputype)
         self.cpusubtype = unsigned(self.proc.p_cpusubtype)
index 33d601f8d2e2681672f207c58d0221163fa9a966..6039f204833ac91e7824e6ddf58c9a19e8f47338 100755 (executable)
@@ -474,3 +474,12 @@ def print_hex_data(data, begin_offset=0, desc=""):
 def Ones(x):
     return (1 << x)-1
 
+def StripPAC(x, TySz):
+    sign_mask = 1 << 55
+    ptr_mask = Ones(64-TySz)
+    pac_mask = ~ptr_mask
+    sign = x & sign_mask
+    if sign:
+        return (x | pac_mask) + 2**64
+    else:
+        return x & ptr_mask
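
StripPAC leans on Python's arbitrary-precision integers: ~ptr_mask is a negative number, so x | pac_mask sign-extends the value, and adding 2**64 folds it back into an unsigned 64-bit representation. Bit 55 selects which half of the address space the pointer belongs to (per sign_mask above). A worked example with an illustrative 8-bit PAC field in the top byte (TySz = 8); the kern.StripUserPAC call used earlier is presumably a wrapper around this helper:

    # bit 55 set -> kernel-half pointer: PAC bits are replaced with 1s
    assert StripPAC(0x1A80123456789ABC, 8) == 0xFF80123456789ABC
    # bit 55 clear -> user-half pointer: PAC bits are cleared to 0s
    assert StripPAC(0x1A00123456789ABC, 8) == 0x0000123456789ABC
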
index dae699f277f1bd1de9abfbc1a645d8b120e35f61..26bb400f7a566a0a4e66abe04a366bbb82928cdc 100755 (executable)
@@ -68,17 +68,20 @@ def GetWQThreadSummary(th, uth):
     kqr = uth.uu_kqr_bound
     if not kqr:
         kq = 0
-    elif kqr.kqr_state & 0x1: # workloop
+    elif kqr.tr_flags & 0x1: # kevent
+        kq = p.p_fd.fd_wqkqueue
+        kind = "kqwq[%s]" % (xnudefines.thread_qos_short_strings[int(kqr.tr_kq_qos_index)])
+    elif kqr.tr_flags & 0x2: # workloop
         kq = ContainerOf(kqr, 'struct kqworkloop', 'kqwl_request')
         kind = "workloop"
     else:
-        kq = p.p_fd.fd_wqkqueue
-        kind = "kqwq[%s]" % (xnudefines.thread_qos_short_strings[int(kqr.kqr_qos_index)])
+        kq = 0
+        kind = "???"
 
     return "{th: <#020x} {uth: <#020x} {thport: >#010x}  {kind: <9s} {kq: <#020x} {idle: <10s} {uu_workq_flags: <30s}".format(th=th, uth=uth, thport=uth.uu_workq_thport, kind=kind, kq=kq, idle=idle, uu_workq_flags=" ".join(uu_workq_flags))
 
-@header("{:<20s} {:<20s} {:<10s} {:<3s} {:<4s} {:<30s}".format(
-    'request', 'kqueue', 'state', '#', 'qos', 'tr_flags'))
+@header("{:<20s} {:<20s} {:<20s} {:<10s} {:<4s} {:<6s} {:<6s} {:<6s} {:<30s}".format(
+    'request', 'kqueue', 'thread', 'state', '#', 'qos', 'kq_qos', 'kq_ovr', 'tr_flags'))
 def GetWorkqueueThreadRequestSummary(proc, req):
     kq = 0
     tr_flags = []
@@ -88,12 +91,17 @@ def GetWorkqueueThreadRequestSummary(proc, req):
         kq = proc.p_fd.fd_wqkqueue
     if req.tr_flags & 0x02:
         tr_flags.append("WORKLOOP")
-        kq = ContainerOf(req, 'struct kqworkloop', 'kqwl_request.kqr_req')
+        kq = ContainerOf(req, 'struct kqworkloop', 'kqwl_request')
     if req.tr_flags & 0x04: tr_flags.append("OVERCOMMIT")
     if req.tr_flags & 0x08: tr_flags.append("PARAMS")
     if req.tr_flags & 0x10: tr_flags.append("OUTSIDE_QOS")
 
-    state = {0: "IDLE", 1: "NEW", 2: "QUEUED", 4: "BINDING" }[int(req.tr_state)]
+    state = {0: "IDLE", 1: "NEW", 2: "QUEUED", 3: "CANCELED", 4: "BINDING", 5: "BOUND" }[int(req.tr_state)]
+    if req.tr_kq_wakeup: state += "*"
+
+    thread = 0
+    if int(req.tr_state) in [3, 4]:
+        thread = req.tr_thread
 
     qos = int(req.tr_qos)
     if qos == 8:
@@ -103,74 +111,80 @@ def GetWorkqueueThreadRequestSummary(proc, req):
     else:
         qos = xnudefines.thread_qos_short_strings[qos]
 
-    return "{req: <#020x} {kq: <#020x} {state: <10s} {req.tr_count: <3d} {qos: <4s} {tr_flags: <30s}".format(req=req, kq=kq, state=state, qos=qos, tr_flags=" ".join(tr_flags))
+    kq_qos = xnudefines.thread_qos_short_strings[int(req.tr_kq_qos_index)]
+    kq_ovr = xnudefines.thread_qos_short_strings[int(req.tr_kq_override_index)]
+    req_addr = unsigned(addressof(req))
 
-@lldb_command('showwqthread')
-def ShowWQThread(cmd_args=None):
+    return "{req_addr: <#020x} {kq: <#020x} {thread: <#020x} {state: <10s} {req.tr_count: <4d} {qos: <6s} {kq_qos: <6s} {kq_ovr: <6s} {tr_flags: <30s}".format(
+            req_addr=req_addr, req=req, kq=kq, thread=thread, state=state, qos=qos, kq_qos=kq_qos, kq_ovr=kq_ovr, tr_flags=" ".join(tr_flags))
+
+@lldb_command('showwqthread', fancy=True)
+def ShowWQThread(cmd_args=None, cmd_options={}, O=None):
     """ Shows info about a workqueue thread
 
         usage: showworkqthread <thread_t>
     """
 
     if not cmd_args:
-        raise ArgumentError('missing struct proc * argument')
+        return O.error('missing struct proc * argument')
 
     th = kern.GetValueFromAddress(cmd_args[0], "struct thread *")
     if not (th.thread_tag & 0x20):
         raise ArgumentError('not a workqueue thread')
 
-    print GetWQThreadSummary.header
-    print GetWQThreadSummary(th, Cast(th.uthread, 'struct uthread *'))
+    with O.table(GetWQThreadSummary.header):
+        print GetWQThreadSummary(th, Cast(th.uthread, 'struct uthread *'))
 
 
-@lldb_command('showprocworkqueue')
-def ShowProcWorkqueue(cmd_args=None):
+@lldb_command('showprocworkqueue', fancy=True)
+def ShowProcWorkqueue(cmd_args=None, cmd_options={}, O=None):
     """ Shows the process workqueue
 
         usage: showprocworkqueue <proc_t>
     """
 
     if not cmd_args:
-        raise ArgumentError('missing struct proc * argument')
+        return O.error('missing struct proc * argument')
 
     proc = kern.GetValueFromAddress(cmd_args[0], "proc_t")
     wq = Cast(proc.p_wqptr, "struct workqueue *");
-    if wq:
-        print GetWorkqueueSummary.header
+    if not wq:
+        return O.error("{:#x} doesn't have a workqueue", proc)
+
+    with O.table(GetWorkqueueSummary.header):
         print GetWorkqueueSummary(proc, wq)
 
-        if wq.wq_reqcount:
-            print "    "
-            print "    " + GetWorkqueueThreadRequestSummary.header
+        with O.table(GetWorkqueueThreadRequestSummary.header, indent=True):
+            if wq.wq_reqcount:
+                print ""
             if wq.wq_event_manager_threadreq:
-                print "    " + GetWorkqueueThreadRequestSummary(proc, wq.wq_event_manager_threadreq)
-            for req in IteratePriorityQueueEntry(wq.wq_overcommit_queue, 'struct workq_threadreq_s', 'tr_entry'):
-                print "    " + GetWorkqueueThreadRequestSummary(proc, req)
-            for req in IteratePriorityQueueEntry(wq.wq_constrained_queue, 'struct workq_threadreq_s', 'tr_entry'):
-                print "    " + GetWorkqueueThreadRequestSummary(proc, req)
-            for req in IteratePriorityQueueEntry(wq.wq_special_queue, 'struct workq_threadreq_s', 'tr_entry'):
-                print "    " + GetWorkqueueThreadRequestSummary(proc, req)
-
-        print "    "
-        print "    " + GetWQThreadSummary.header
-        for uth in IterateTAILQ_HEAD(wq.wq_thrunlist, "uu_workq_entry"):
-            print "    " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth)
-        for uth in IterateTAILQ_HEAD(wq.wq_thidlelist, "uu_workq_entry"):
-            print "    " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth)
-        for uth in IterateTAILQ_HEAD(wq.wq_thnewlist, "uu_workq_entry"):
-            print "    " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth)
-
-@lldb_command('showallworkqueues')
-def ShowAllWorkqueues(cmd_args=None):
+                print GetWorkqueueThreadRequestSummary(proc, wq.wq_event_manager_threadreq)
+            for req in IteratePriorityQueue(wq.wq_overcommit_queue, 'struct workq_threadreq_s', 'tr_entry'):
+                print GetWorkqueueThreadRequestSummary(proc, req)
+            for req in IteratePriorityQueue(wq.wq_constrained_queue, 'struct workq_threadreq_s', 'tr_entry'):
+                print GetWorkqueueThreadRequestSummary(proc, req)
+            for req in IteratePriorityQueue(wq.wq_special_queue, 'struct workq_threadreq_s', 'tr_entry'):
+                print GetWorkqueueThreadRequestSummary(proc, req)
+
+        with O.table(GetWQThreadSummary.header, indent=True):
+            print ""
+            for uth in IterateTAILQ_HEAD(wq.wq_thrunlist, "uu_workq_entry"):
+                print GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth)
+            for uth in IterateTAILQ_HEAD(wq.wq_thidlelist, "uu_workq_entry"):
+                print GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth)
+            for uth in IterateTAILQ_HEAD(wq.wq_thnewlist, "uu_workq_entry"):
+                print GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth)
+
+@lldb_command('showallworkqueues', fancy=True)
+def ShowAllWorkqueues(cmd_args=None, cmd_options={}, O=None):
     """ Display a summary of all the workqueues in the system
 
         usage: showallworkqueues
     """
 
-    print GetWorkqueueSummary.header
-
-    for t in kern.tasks:
-        proc = Cast(t.bsd_info, 'proc *')
-        wq = Cast(proc.p_wqptr, "struct workqueue *");
-        if wq:
-            print GetWorkqueueSummary(proc, wq)
+    with O.table(GetWorkqueueSummary.header):
+        for t in kern.tasks:
+            proc = Cast(t.bsd_info, 'proc *')
+            wq = Cast(proc.p_wqptr, "struct workqueue *")
+            if wq:
+                print GetWorkqueueSummary(proc, wq)
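
For quick reference when reading these summaries, the tr_flags bits tested above (0x01 kevent per GetWQThreadSummary, 0x02 WORKLOOP, 0x04 OVERCOMMIT, 0x08 PARAMS, 0x10 OUTSIDE_QOS) can be decoded with a small helper along these lines (DecodeTrFlags is hypothetical):

    WORKQ_TR_FLAG_NAMES = {
        0x01: "KEVENT",
        0x02: "WORKLOOP",
        0x04: "OVERCOMMIT",
        0x08: "PARAMS",
        0x10: "OUTSIDE_QOS",
    }

    def DecodeTrFlags(tr_flags):
        # Render a workq_threadreq_s.tr_flags bitmask as readable names.
        names = [s for bit, s in sorted(WORKQ_TR_FLAG_NAMES.items()) if tr_flags & bit]
        return " ".join(names) or "-"
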
index 68800167846d8670beadb68280ca5687d225b06f..7ec9ca7c8c88e76ae5419f73ea85f6b72848dbb9 100755 (executable)
@@ -21,6 +21,8 @@ MODULE_NAME=__name__
 
 COMMON_HELP_STRING = """
     -h  Show the help string for the command.
+    -c [always|auto|never|0|1]
+                            Control the colorized output of certain commands
     -o <path/to/filename>   The output of this command execution will be saved to file. Parser information or errors will
                             not be sent to file though. eg /tmp/output.txt
     -s <filter_string>      The "filter_string" param is parsed to python regex expression and each line of output
@@ -45,11 +47,11 @@ def header(initial_value):
         return obj
     return _set_header
 
-# holds type declarations done by xnu. 
+# holds type declarations done by xnu.
 #DONOTTOUCHME: Exclusive use of lldb_type_summary only.
-lldb_summary_definitions = {} 
+lldb_summary_definitions = {}
 def lldb_type_summary(types_list):
-    """ A function decorator to register a summary for a type in lldb. 
+    """ A function decorator to register a summary for a type in lldb.
         params: types_list - [] an array of types that you wish to register a summary callback function. (ex. ['task *', 'task_t'])
         returns: Nothing. This is a decorator.
     """
@@ -60,13 +62,13 @@ def lldb_type_summary(types_list):
                 out_string += "\n" + obj.header +"\n"
             out_string += obj( core.value(lldbval) )
             return out_string
-        
+
         myglobals = globals()
         summary_function_name = "LLDBSummary" + obj.__name__
         myglobals[summary_function_name] = _internal_summary_function
         summary_function = myglobals[summary_function_name]
         summary_function.__doc__ = obj.__doc__
-        
+
         global lldb_summary_definitions
         for single_type in types_list:
             if config['showTypeSummary']:
@@ -74,19 +76,20 @@ def lldb_type_summary(types_list):
                     lldb.debugger.HandleCommand("type summary delete --category kernel \""+ single_type + "\"")
                 lldb.debugger.HandleCommand("type summary add \""+ single_type +"\" --category kernel --python-function " + MODULE_NAME + "." + summary_function_name)
             lldb_summary_definitions[single_type] = obj
-            
+
         return obj
     return _get_summary
 
-#global cache of documentation for lldb commands exported by this module 
+#global cache of documentation for lldb commands exported by this module
 #DONOTTOUCHME: Exclusive use of lldb_command only.
 lldb_command_documentation = {}
 
-def lldb_command(cmd_name, option_string = ''):
+def lldb_command(cmd_name, option_string = '', fancy=False):
     """ A function decorator to define a command with namd 'cmd_name' in the lldb scope to call python function.
         params: cmd_name - str : name of command to be set in lldb prompt.
-            option_string - str: getopt like option string. Only CAPITAL LETTER options allowed. 
+            option_string - str: getopt like option string. Only CAPITAL LETTER options allowed.
                                  see README on Customizing command options.
+            fancy - bool       : whether the command will receive an 'O' object to do fancy output (tables, indent, color)
     """
     if option_string != option_string.upper():
         raise RuntimeError("Cannot setup command with lowercase option args. %s" % option_string)
@@ -104,16 +107,18 @@ def lldb_command(cmd_name, option_string = ''):
             command_args = shlex.split(command)
             lldb.debugger.HandleCommand('type category disable kernel' )
             def_verbose_level = config['verbosity']
-            
+
             try:
                 stream.setOptions(command_args, option_string)
                 if stream.verbose_level != 0:
-                    config['verbosity'] +=  stream.verbose_level 
+                    config['verbosity'] +=  stream.verbose_level
                 with RedirectStdStreams(stdout=stream) :
+                    args = { 'cmd_args': stream.target_cmd_args }
                     if option_string:
-                        obj(cmd_args=stream.target_cmd_args, cmd_options=stream.target_cmd_options)
-                    else:
-                        obj(cmd_args=stream.target_cmd_args)
+                        args['cmd_options'] = stream.target_cmd_options
+                    if fancy:
+                        args['O'] = stream
+                    obj(**args)
             except KeyboardInterrupt:
                 print "Execution interrupted by user"
             except ArgumentError as arg_error:
@@ -133,7 +138,7 @@ However, it is recommended that you report the exception to lldb/kernel debuggin
 
             if config['showTypeSummary']:
                 lldb.debugger.HandleCommand('type category enable kernel' )
-            
+
             if stream.pluginRequired :
                 plugin = LoadXNUPlugin(stream.pluginName)
                 if plugin == None :
@@ -143,10 +148,10 @@ However, it is recommended that you report the exception to lldb/kernel debuggin
                 return_data = plugin.plugin_execute(cmd_name, result.GetOutput())
                 ProcessXNUPluginResult(return_data)
                 plugin.plugin_cleanup()
-            
+
             #restore the verbose level after command is complete
             config['verbosity'] = def_verbose_level
-            
+
             return
 
         myglobals = globals()
@@ -163,14 +168,24 @@ However, it is recommended that you report the exception to lldb/kernel debuggin
             lldb.debugger.HandleCommand("command script delete "+cmd_name)
         lldb_command_documentation[cmd_name] = (obj.__name__, obj.__doc__.lstrip(), option_string)
         lldb.debugger.HandleCommand("command script add -f " + MODULE_NAME + "." + command_function_name + " " + cmd_name)
+
+        if fancy:
+            def wrapped_fun(cmd_args=None, cmd_options={}, O=None):
+                if O is None:
+                    stream = CommandOutput(cmd_name, fhandle=sys.stdout)
+                    with RedirectStdStreams(stdout=stream):
+                        return obj(cmd_args, cmd_options, stream)
+                else:
+                    return obj(cmd_args, cmd_options, O)
+            return wrapped_fun
         return obj
     return _cmd
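
Taken together, the fancy flag lets a command receive the output stream as O and use its table/indent helpers, as the workqueue and ulock commands above do. A minimal sketch of defining such a command (showfoo, GetFooSummary, and kern.foos are hypothetical):

    @lldb_command('showfoo', fancy=True)
    def ShowFoo(cmd_args=None, cmd_options={}, O=None):
        """ Display a summary of all foo objects (hypothetical example)

            usage: showfoo
        """
        with O.table(GetFooSummary.header):
            for foo in kern.foos:
                print GetFooSummary(foo)
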
 
 def lldb_alias(alias_name, cmd_line):
-    """ define an alias in the lldb command line. 
+    """ define an alias in the lldb command line.
         A programmatic way of registering an alias. This basically does
         (lldb)command alias alias_name "cmd_line"
-        ex. 
+        ex.
         lldb_alias('readphys16', 'readphys 16')
     """
     alias_name = alias_name.strip()
@@ -194,7 +209,7 @@ def SetupLLDBTypeSummaries(reset=False):
     return
 
 def LoadXNUPlugin(name):
-    """ Try to load a plugin from the plugins directory. 
+    """ Try to load a plugin from the plugins directory.
     """
     retval = None
     name=name.strip()
@@ -208,7 +223,7 @@ def LoadXNUPlugin(name):
             print "Plugin is not correctly implemented. Please read documentation on implementing plugins"
     except:
         print "plugin not found :"+name
-         
+
     return retval
 
 def ProcessXNUPluginResult(result_data):
@@ -218,7 +233,7 @@ def ProcessXNUPluginResult(result_data):
     ret_status = result_data[0]
     ret_string = result_data[1]
     ret_commands = result_data[2]
-    
+
     if ret_status == False:
         print "Plugin failed: " + ret_string
         return
@@ -233,15 +248,15 @@ def ProcessXNUPluginResult(result_data):
 #DONOTTOUCHME: Exclusive use of xnudebug_test only
 lldb_command_tests = {}
 def xnudebug_test(test_name):
-    """ A function decoratore to register a test with the framework. Each test is supposed to be of format 
+    """ A function decoratore to register a test with the framework. Each test is supposed to be of format
         def Test<name>(kernel_target, config, lldb_obj, isConnected )
-        
+
         NOTE: The testname should start with "Test" else exception will be raised.
     """
     def _test(obj):
         global lldb_command_tests
         if obj.__name__.find("Test") != 0 :
-            print "Test name ", obj.__name__ , " should start with Test" 
+            print "Test name ", obj.__name__ , " should start with Test"
             raise ValueError
         lldb_command_tests[test_name] = (test_name, obj.__name__, obj, obj.__doc__)
         return obj
@@ -249,14 +264,14 @@ def xnudebug_test(test_name):
 
 
 # End Debugging specific utility functions
-# Kernel Debugging specific classes and accessor methods 
+# Kernel Debugging specific classes and accessor methods
 
 # global access object for target kernel
 
 def GetObjectAtIndexFromArray(array_base, index):
     """ Subscript indexing for arrays that are represented in C as pointers.
         for ex. int *arr = malloc(20*sizeof(int));
-        now to get 3rd int from 'arr' you'd do 
+        now to get 3rd int from 'arr' you'd do
         arr[2] in C
         GetObjectAtIndexFromArray(arr_val,2)
         params:
@@ -278,8 +293,8 @@ kern = None
 def GetLLDBThreadForKernelThread(thread_obj):
     """ Get a reference to lldb.SBThread representation for kernel thread.
         params:
-            thread_obj : core.cvalue - thread object of type thread_t 
-        returns 
+            thread_obj : core.cvalue - thread object of type thread_t
+        returns
             lldb.SBThread - lldb thread object for getting backtrace/registers etc.
     """
     tid = unsigned(thread_obj.thread_id)
@@ -369,10 +384,10 @@ def GetThreadBackTrace(thread_obj, verbosity = vHUMAN, prefix = ""):
 
         if not function:
             # No debug info for 'function'.
-            out_string += prefix 
+            out_string += prefix
             if not is_continuation:
-                out_string += "{fp:#018x} ".format(fp = frame_p) 
-            
+                out_string += "{fp:#018x} ".format(fp = frame_p)
+
             symbol = frame.GetSymbol()
             if not symbol:
                 out_string += GetKextSymbolInfo(load_addr)
@@ -381,7 +396,7 @@ def GetThreadBackTrace(thread_obj, verbosity = vHUMAN, prefix = ""):
                 start_addr = symbol.GetStartAddress().GetFileAddress()
                 symbol_name = symbol.GetName()
                 symbol_offset = file_addr - start_addr
-                out_string += "{addr:#018x} {mod}`{symbol} + {offset:#x} \n".format(addr=load_addr, 
+                out_string += "{addr:#018x} {mod}`{symbol} + {offset:#x} \n".format(addr=load_addr,
                     mod=mod_name, symbol=symbol_name, offset=symbol_offset)
         else:
             # Debug info is available for 'function'.
@@ -391,15 +406,15 @@ def GetThreadBackTrace(thread_obj, verbosity = vHUMAN, prefix = ""):
             func_name = '%s [inlined]' % func_name if frame.IsInlined() else func_name
             if is_continuation and frame.IsInlined():
                 debuglog("Skipping frame for thread {:#018x} since its inlined".format(thread_obj))
-                continue 
-            out_string += prefix 
+                continue
+            out_string += prefix
             if not is_continuation:
                 out_string += "{fp:#018x} ".format(fp=frame_p)
             out_string += "{addr:#018x} {func}{args} \n".format(addr=load_addr,
                                     func=func_name,
                                     file=file_name, line=line_num,
                                     args="(" + (str(frame.arguments).replace("\n", ", ") if len(frame.arguments) > 0 else "void") + ")")
-        iteration += 1 
+        iteration += 1
         if frame_p:
             last_frame_p = frame_p
 
@@ -409,9 +424,9 @@ def GetThreadBackTrace(thread_obj, verbosity = vHUMAN, prefix = ""):
     return out_string
 
 def GetSourceInformationForAddress(addr):
-    """ convert and address to function +offset information. 
+    """ convert and address to function +offset information.
         params: addr - int address in the binary to be symbolicated
-        returns: string of format "0xaddress: function + offset" 
+        returns: string of format "0xaddress: function + offset"
     """
     symbols = kern.SymbolicateFromAddress(addr)
     format_string = "{0:#018x} <{1:s} + {2:#0x}>"
@@ -429,7 +444,7 @@ def GetFrameLocalVariable(variable_name, frame_no=0):
     """ Find a local variable by name
         params:
           variable_name: str - name of variable to search for
-        returns: 
+        returns:
           core.value - if the variable is found.
           None   - if not found or not Valid
     """
@@ -466,16 +481,16 @@ def KernelDebugCommandsHelp(cmd_args=None):
     return None
 
 
-@lldb_command('showraw')    
+@lldb_command('showraw')
 def ShowRawCommand(cmd_args=None):
-    """ A command to disable the kernel summaries and show data as seen by the system. 
+    """ A command to disable the kernel summaries and show data as seen by the system.
         This is useful when trying to read every field of a struct as compared to brief summary
     """
     command = " ".join(cmd_args)
     lldb.debugger.HandleCommand('type category disable kernel' )
     lldb.debugger.HandleCommand( command )
     lldb.debugger.HandleCommand('type category enable kernel' )
+
 
 @lldb_command('xnudebug')
 def XnuDebugCommand(cmd_args=None):
@@ -537,18 +552,18 @@ def XnuDebugCommand(cmd_args=None):
         if test_name in lldb_command_tests:
             test = lldb_command_tests[test_name]
             print "Running test {:s}".format(test[0])
-            if test[2](kern, config, lldb, True) : 
+            if test[2](kern, config, lldb, True) :
                 print "[PASSED] {:s}".format(test[0])
             else:
                 print "[FAILED] {:s}".format(test[0])
-            return ""    
+            return ""
         else:
             print "No such test registered with name: {:s}".format(test_name)
             print "XNUDEBUG Available tests are:"
             for i in lldb_command_tests.keys():
                 print i
         return None
-    
+
     return False
 
 @lldb_command('showversion')
@@ -564,83 +579,68 @@ def ShowVersion(cmd_args=None):
     """
     print kern.version
 
-
-@lldb_command('paniclog', 'S')
-def ShowPanicLog(cmd_args=None, cmd_options={}):
-    """ Display the paniclog information
-        usage: (lldb) paniclog
-        options:
-            -v : increase verbosity
-            -S : parse stackshot data (if panic stackshot available)
+def ProcessPanicStackshot(panic_stackshot_addr, panic_stackshot_len):
+    """ Process the panic stackshot from the panic header, saving it to a file if it is valid
+        params: panic_stackshot_addr : start address of the panic stackshot binary data
+                panic_stackshot_len : length of the stackshot binary data
+        returns: nothing
     """
+    if not panic_stackshot_addr:
+        print "No panic stackshot available (invalid addr)"
+        return
 
-    if "-S" in cmd_options:
-        if hasattr(kern.globals, "kc_panic_data"):
-            stackshot_saved = False
-            # TODO: Update logic to handle "in-memory" panic stackshot on Gibraltar platforms
-            #       once we drop support for the on disk one there.
-            if kern.arch == 'x86_64':
-                if kern.globals.panic_stackshot_len != 0:
-                    stackshot_saved = True
-                else:
-                    print "No panic stackshot available"
-            else:
-                if unsigned(kern.globals.panic_info.eph_panic_flags) & xnudefines.EMBEDDED_PANIC_STACKSHOT_SUCCEEDED_FLAG:
-                    stackshot_saved = True
-                else:
-                    print "No panic stackshot available"
-            if stackshot_saved:
-                kc_data = unsigned(addressof(kern.globals.kc_panic_data))
-                ts = int(time.time())
-                ss_binfile = "/tmp/panic_%d.bin" % ts
-                ss_ipsfile = "/tmp/stacks_%d.ips" % ts
-                print "savekcdata  0x%x -O %s" % (kc_data, ss_binfile)
-                SaveKCDataToFile(["0x%x" % kc_data], {"-O":ss_binfile})
-                self_path = str(__file__)
-                base_dir_name = self_path[:self_path.rfind("/")]
-                print "python %s/kcdata.py %s -s %s" % (base_dir_name, ss_binfile, ss_ipsfile)
-                (c,so,se) = RunShellCommand("python %s/kcdata.py %s -s %s" % (base_dir_name, ss_binfile, ss_ipsfile))
-                if c == 0:
-                    print "Saved ips stackshot file as %s" % ss_ipsfile
-                else:
-                    print "Failed to run command: exit code: %d, SO: %s SE: %s" % (c, so, se)
-        else:
-            print "kc_panic_data is unavailable for this kernel config."
+    if not panic_stackshot_len:
+        print "No panic stackshot available (zero length)"
+        return
 
-    out_str = ""
-    warn_str = ""
+    ts = int(time.time())
+    ss_binfile = "/tmp/panic_%d.bin" % ts
+    ss_ipsfile = "/tmp/stacks_%d.ips" % ts
 
-    if kern.arch == 'x86_64':
-        panic_buf = Cast(kern.globals.panic_info, 'char *')
-        panic_log_magic = unsigned(kern.globals.panic_info.mph_magic)
-        panic_log_begin_offset = unsigned(kern.globals.panic_info.mph_panic_log_offset)
-        panic_log_len = unsigned(kern.globals.panic_info.mph_panic_log_len)
-        other_log_begin_offset = unsigned(kern.globals.panic_info.mph_other_log_offset)
-        other_log_len = unsigned(kern.globals.panic_info.mph_other_log_len)
-        cur_debug_buf_ptr_offset = (unsigned(kern.globals.debug_buf_ptr) - unsigned(kern.globals.panic_info))
-        if other_log_begin_offset != 0 and (other_log_len == 0 or other_log_len < (cur_debug_buf_ptr_offset - other_log_begin_offset)):
-            other_log_len = cur_debug_buf_ptr_offset - other_log_begin_offset
-        expected_panic_magic = xnudefines.MACOS_PANIC_MAGIC
+    if not SaveDataToFile(panic_stackshot_addr, panic_stackshot_len, ss_binfile, None):
+        print "Failed to save stackshot binary data to file"
+        return
+
+    self_path = str(__file__)
+    base_dir_name = self_path[:self_path.rfind("/")]
+    print "python %s/kcdata.py %s -s %s" % (base_dir_name, ss_binfile, ss_ipsfile)
+    (c,so,se) = RunShellCommand("python %s/kcdata.py %s -s %s" % (base_dir_name, ss_binfile, ss_ipsfile))
+    if c == 0:
+        print "Saved ips stackshot file as %s" % ss_ipsfile
+        return
     else:
-        panic_buf = Cast(kern.globals.panic_info, 'char *')
-        panic_log_magic = unsigned(kern.globals.panic_info.eph_magic)
-        panic_log_begin_offset = unsigned(kern.globals.panic_info.eph_panic_log_offset)
-        panic_log_len = unsigned(kern.globals.panic_info.eph_panic_log_len)
-        other_log_begin_offset = unsigned(kern.globals.panic_info.eph_other_log_offset)
-        other_log_len = unsigned(kern.globals.panic_info.eph_other_log_len)
-        expected_panic_magic = xnudefines.EMBEDDED_PANIC_MAGIC
-
-    if panic_log_begin_offset == 0:
+        print "Failed to run command: exit code: %d, SO: %s SE: %s" % (c, so, se)
         return
 
+def ParseEmbeddedPanicLog(panic_header, cmd_options={}):
+    panic_buf = Cast(panic_header, 'char *')
+    panic_log_magic = unsigned(panic_header.eph_magic)
+    panic_log_begin_offset = unsigned(panic_header.eph_panic_log_offset)
+    panic_log_len = unsigned(panic_header.eph_panic_log_len)
+    other_log_begin_offset = unsigned(panic_header.eph_other_log_offset)
+    other_log_len = unsigned(panic_header.eph_other_log_len)
+    expected_panic_magic = xnudefines.EMBEDDED_PANIC_MAGIC
+    panic_stackshot_addr = unsigned(panic_header) + unsigned(panic_header.eph_stackshot_offset)
+    panic_stackshot_len = unsigned(panic_header.eph_stackshot_len)
+    panic_header_flags = unsigned(panic_header.eph_panic_flags)
+
+    warn_str = ""
+    out_str = ""
+
     if panic_log_magic != 0 and panic_log_magic != expected_panic_magic:
-        warn_str += "BAD MAGIC! Found 0x%x expected 0x%x".format(panic_log_magic,
+        warn_str += "BAD MAGIC! Found 0x%x expected 0x%x" % (panic_log_magic,
                     expected_panic_magic)
 
-    if panic_log_begin_offset == 0:
-        if warn_str:
-            print "\n %s" % warn_str
-        return
+    if warn_str:
+        print "\n %s" % warn_str
+        if panic_log_begin_offset == 0:
+            return
+
+    if "-S" in cmd_options:
+        if panic_header_flags & xnudefines.EMBEDDED_PANIC_STACKSHOT_SUCCEEDED_FLAG:
+            ProcessPanicStackshot(panic_stackshot_addr, panic_stackshot_len)
+        else:
+            print "No panic stackshot available"
 
     panic_log_curindex = 0
     while panic_log_curindex < panic_log_len:
@@ -656,12 +656,198 @@ def ShowPanicLog(cmd_args=None, cmd_options={}):
             other_log_curindex += 1
 
     print out_str
+    return
+
+def ParseMacOSPanicLog(panic_header, cmd_options={}):
+    panic_buf = Cast(panic_header, 'char *')
+    panic_log_magic = unsigned(panic_header.mph_magic)
+    panic_log_begin_offset = unsigned(panic_header.mph_panic_log_offset)
+    panic_log_len = unsigned(panic_header.mph_panic_log_len)
+    other_log_begin_offset = unsigned(panic_header.mph_other_log_offset)
+    other_log_len = unsigned(panic_header.mph_other_log_len)
+    cur_debug_buf_ptr_offset = (unsigned(kern.globals.debug_buf_ptr) - unsigned(panic_header))
+    if other_log_begin_offset != 0 and (other_log_len == 0 or other_log_len < (cur_debug_buf_ptr_offset - other_log_begin_offset)):
+        other_log_len = cur_debug_buf_ptr_offset - other_log_begin_offset
+    expected_panic_magic = xnudefines.MACOS_PANIC_MAGIC
+    panic_stackshot_addr = unsigned(panic_header) + unsigned(panic_header.mph_stackshot_offset)
+    panic_stackshot_len = unsigned(panic_header.mph_stackshot_len)
+    panic_header_flags = unsigned(panic_header.mph_panic_flags)
+
+    warn_str = ""
+    out_str = ""
+
+    if panic_log_magic != 0 and panic_log_magic != expected_panic_magic:
+        warn_str += "BAD MAGIC! Found 0x%x expected 0x%x" % (panic_log_magic,
+                    expected_panic_magic)
 
     if warn_str:
         print "\n %s" % warn_str
+        if panic_log_begin_offset == 0:
+            return
+
+    if "-S" in cmd_options:
+        if panic_header_flags & xnudefines.MACOS_PANIC_STACKSHOT_SUCCEEDED_FLAG:
+            ProcessPanicStackshot(panic_stackshot_addr, panic_stackshot_len)
+        else:
+            print "No panic stackshot available"
+
+    panic_log_curindex = 0
+    while panic_log_curindex < panic_log_len:
+        p_char = str(panic_buf[(panic_log_begin_offset + panic_log_curindex)])
+        out_str += p_char
+        panic_log_curindex += 1
+
+    if other_log_begin_offset != 0:
+        other_log_curindex = 0
+        while other_log_curindex < other_log_len:
+            p_char = str(panic_buf[(other_log_begin_offset + other_log_curindex)])
+            out_str += p_char
+            other_log_curindex += 1
+
+    print out_str
+    return
+
+def ParseAURRPanicLog(panic_header, cmd_options={}):
+    reset_cause = {
+        0x0: "OTHER",
+        0x1: "CATERR",
+        0x2: "SWD_TIMEOUT",
+        0x3: "GLOBAL RESET",
+        0x4: "STRAIGHT TO S5",
+    }
+
+    expected_panic_magic = xnudefines.AURR_PANIC_MAGIC
+
+    panic_buf = Cast(panic_header, 'char *')
+
+    try:
+        # This cast will blow up if there's no type info for this struct (older kernel).
+        # In that case we fall back to manual parsing below.
+        aurr_panic_header = Cast(panic_header, 'struct efi_aurr_panic_header *')
+        panic_log_magic = unsigned(aurr_panic_header.efi_aurr_magic)
+        panic_log_version = unsigned(aurr_panic_header.efi_aurr_version)
+        panic_log_reset_cause = unsigned(aurr_panic_header.efi_aurr_reset_cause)
+        panic_log_reset_log_offset = unsigned(aurr_panic_header.efi_aurr_reset_log_offset)
+        panic_log_reset_log_len = unsigned(aurr_panic_header.efi_aurr_reset_log_len)
+    except Exception as e:
+        print "*** Warning: kernel symbol file has no type information for 'struct efi_aurr_panic_header'..."
+        print "*** Warning: trying to manually parse..."
+        aurr_panic_header = Cast(panic_header, "uint32_t *")
+        panic_log_magic = unsigned(aurr_panic_header[0])
+        # panic_log_crc = unsigned(aurr_panic_header[1])
+        panic_log_version = unsigned(aurr_panic_header[2])
+        panic_log_reset_cause = unsigned(aurr_panic_header[3])
+        panic_log_reset_log_offset = unsigned(aurr_panic_header[4])
+        panic_log_reset_log_len = unsigned(aurr_panic_header[5])
+
+    if panic_log_magic != 0 and panic_log_magic != expected_panic_magic:
+        print "BAD MAGIC! Found 0x%x expected 0x%x" % (panic_log_magic,
+                    expected_panic_magic)
+        return
+
+    print "AURR Panic Version: %d" % (panic_log_version)
+
+    # When it comes time to extend this in the future, please follow the
+    # construct used below in ShowPanicLog()
+    if panic_log_version in (xnudefines.AURR_PANIC_VERSION, xnudefines.AURR_CRASHLOG_PANIC_VERSION):
+        # AURR Report Version 1 (AURR/MacEFI) or 2 (Crashlog)
+        # see macefifirmware/Vendor/Apple/EfiPkg/AppleDebugSupport/Library/Debugger.h
+        print "Reset Cause: 0x%x (%s)" % (panic_log_reset_cause, reset_cause.get(panic_log_reset_cause, "UNKNOWN"))
+
+        # Adjust panic log string length (cap to maximum supported values)
+        if panic_log_version == xnudefines.AURR_PANIC_VERSION:
+            max_string_len = panic_log_reset_log_len and min(panic_log_reset_log_len, xnudefines.AURR_PANIC_STRING_LEN) or 0
+        elif panic_log_version == xnudefines.AURR_CRASHLOG_PANIC_VERSION:
+            max_string_len = xnudefines.CRASHLOG_PANIC_STRING_LEN
+
+        panic_str_offset = 0
+        out_str = ""
+
+        while panic_str_offset < max_string_len:
+            p_char = str(panic_buf[panic_log_reset_log_offset + panic_str_offset])
+            out_str += p_char
+            panic_str_offset += 1
+
+        print out_str
 
+        # Save Crashlog Binary Data (if available)
+        if "-S" in cmd_options and panic_log_version == xnudefines.AURR_CRASHLOG_PANIC_VERSION:
+            crashlog_binary_offset = panic_log_reset_log_offset + xnudefines.CRASHLOG_PANIC_STRING_LEN
+            crashlog_binary_size = (panic_log_reset_log_len > xnudefines.CRASHLOG_PANIC_STRING_LEN) and (panic_log_reset_log_len - xnudefines.CRASHLOG_PANIC_STRING_LEN) or 0
+
+            if crashlog_binary_size == 0:
+                print "No crashlog data found..."
+                return
+
+            # Save to file
+            ts = int(time.time())
+            ss_binfile = "/tmp/crashlog_%d.bin" % ts
+
+            if not SaveDataToFile(panic_buf + crashlog_binary_offset, crashlog_binary_size, ss_binfile, None):
+                print "Failed to save crashlog binary data to file"
+                return
+    else:
+        return ParseUnknownPanicLog(panic_header, cmd_options)
+
+    return
+
+def ParseUnknownPanicLog(panic_header, cmd_options={}):
+    magic_ptr = Cast(panic_header, 'uint32_t *')
+    panic_log_magic = dereference(magic_ptr)
+    print "Unrecognized panic header format. Magic: 0x%x..." % unsigned(panic_log_magic)
+    print "Panic region starts at 0x%08x" % int(panic_header)
+    print "Hint: To dump this panic header in order to try manually parsing it, use this command:"
+    print " (lldb) memory read -fx -s4 -c64 0x%08x" % int(panic_header)
+    print " ^ that will dump the first 256 bytes of the panic region"
+    ## TBD: Hexdump some bits here to allow folks to poke at the region manually?
     return
 
+
+@lldb_command('paniclog', 'SM')
+def ShowPanicLog(cmd_args=None, cmd_options={}):
+    """ Display the paniclog information
+        usage: (lldb) paniclog
+        options:
+            -v : increase verbosity
+            -S : parse stackshot data (if panic stackshot available)
+            -M : parse macOS panic area (print panic string (if available), and/or capture crashlog info)
+    """
+
+    if "-M" in cmd_options:
+        if not hasattr(kern.globals, "mac_panic_header"):
+            print "macOS panic data requested but unavailable on this device"
+            return
+        panic_header = kern.globals.mac_panic_header
+        # DEBUG HACK FOR TESTING
+        #panic_header = kern.GetValueFromAddress(0xfffffff054098000, "uint32_t *")
+    else:
+        panic_header = kern.globals.panic_info
+
+    if hasattr(panic_header, "eph_magic"):
+        panic_log_magic = unsigned(panic_header.eph_magic)
+    elif hasattr(panic_header, "mph_magic"):
+        panic_log_magic = unsigned(panic_header.mph_magic)
+    else:
+        print "*** Warning: unsure of panic header format, trying anyway"
+        magic_ptr = Cast(panic_header, 'uint32_t *')
+        panic_log_magic = int(dereference(magic_ptr))
+
+    if panic_log_magic == 0:
+        # No panic here..
+        return
+
+    panic_parsers = {
+        int(xnudefines.AURR_PANIC_MAGIC)     : ParseAURRPanicLog,
+        int(xnudefines.MACOS_PANIC_MAGIC)    : ParseMacOSPanicLog,
+        int(xnudefines.EMBEDDED_PANIC_MAGIC) : ParseEmbeddedPanicLog,
+    }
+
+    # Find the right parser (fall back to unknown parser above)
+    parser = panic_parsers.get(panic_log_magic, ParseUnknownPanicLog)
+
+    # execute it
+    return parser(panic_header, cmd_options)
+
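
Because dispatch happens through the panic_parsers dictionary, supporting a new panic area format only requires a parser with the same signature plus one dictionary entry. A sketch (NEWFMT_PANIC_MAGIC and ParseNewFmtPanicLog are hypothetical):

    def ParseNewFmtPanicLog(panic_header, cmd_options={}):
        # Hypothetical parser; same signature as the parsers above.
        print "new-format panic region at 0x%08x" % int(panic_header)
        return

    # ... then add to the dictionary inside ShowPanicLog:
    #     int(xnudefines.NEWFMT_PANIC_MAGIC): ParseNewFmtPanicLog,
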
 @lldb_command('showbootargs')
 def ShowBootArgs(cmd_args=None):
     """ Display boot arguments passed to the target kernel
@@ -672,7 +858,7 @@ def ShowBootArgs(cmd_args=None):
 
 @static_var("last_process_uniq_id", 1)
 def GetDebuggerStopIDValue():
-    """ Create a unique session identifier. 
+    """ Create a unique session identifier.
         returns:
             int - a unique number identified by processid and stopid.
     """
@@ -689,7 +875,7 @@ def GetDebuggerStopIDValue():
         GetDebuggerStopIDValue.last_process_uniq_id +=1
         proc_uniq_id = GetDebuggerStopIDValue.last_process_uniq_id + 1
 
-    stop_id_str = "{:d}:{:d}".format(proc_uniq_id, stop_id)        
+    stop_id_str = "{:d}:{:d}".format(proc_uniq_id, stop_id)
     return hash(stop_id_str)
 
 # The initialization code to add your commands
@@ -703,6 +889,11 @@ def __lldb_init_module(debugger, internal_dict):
     debugger.HandleCommand('type summary add --regex --summary-string "${var%s}" -C yes -p -v "char \[[0-9]*\]"')
     debugger.HandleCommand('type format add --format hex -C yes uintptr_t')
     kern = KernelTarget(debugger)
+    if not hasattr(lldb.SBValue, 'GetValueAsAddress'):
+        warn_str = "WARNING: lldb version is too old. Some commands may break. Please update to latest lldb."
+        if os.isatty(sys.__stdout__.fileno()):
+            warn_str = VT.DarkRed + warn_str + VT.Default
+        print warn_str
     print "xnu debug macros loaded successfully. Run showlldbtypesummaries to enable type summaries."
 
 __lldb_init_module(lldb.debugger, None)
@@ -729,11 +920,11 @@ def ShowLLDBTypeSummaries(cmd_args=[]):
 
 @lldb_command('walkqueue_head', 'S')
 def WalkQueueHead(cmd_args=[], cmd_options={}):
-    """ walk a queue_head_t and list all members in it. Note this is for queue_head_t. refer to osfmk/kern/queue.h 
+    """ walk a queue_head_t and list all members in it. Note this is for queue_head_t. refer to osfmk/kern/queue.h
         Option: -S - suppress summary output.
         Usage: (lldb) walkqueue_head  <queue_entry *> <struct type> <fieldname>
         ex:    (lldb) walkqueue_head  0x7fffff80 "thread *" "task_threads"
-        
+
     """
     global lldb_summary_definitions
     if not cmd_args:
@@ -754,7 +945,7 @@ def WalkQueueHead(cmd_args=[], cmd_options={}):
             print lldb_summary_definitions[el_type](i)
         else:
             print "{0: <#020x}".format(i)
-    
+
 
 
 @lldb_command('walklist_entry', 'S')
@@ -768,7 +959,7 @@ def WalkList(cmd_args=[], cmd_options={}):
         Option: -S - suppress summary output.
         Usage: (lldb) walklist_entry  <obj with list_entry *> <struct type> <fieldname>
         ex:    (lldb) walklist_entry  0x7fffff80 "struct proc *" "p_sibling"
-        
+
     """
     global lldb_summary_definitions
     if not cmd_args:
@@ -926,7 +1117,7 @@ def IOTrace_cmd(cmd_args=[], cmd_options={}):
 
 from memory import *
 from process import *
-from ipc import * 
+from ipc import *
 from pmap import *
 from ioreg import *
 from mbufs import *
@@ -952,5 +1143,7 @@ from pgtrace import *
 from xnutriage import *
 from kevent import *
 from workqueue import *
+from ulock import *
 from ntstat import *
 from zonetriage import *
+from sysreg import *
index 9ae4701736f9ca4cf8819274021a56a1ad8ea012..a91d0831d51022abc0fc0a52fcf2a7784d6d4993 100755 (executable)
@@ -4,7 +4,7 @@
     The objective is to provide a single place to be the bridge between C code in xnu and the python macros used by lldb.
     If you define a variable which has been copied/referred over from C code and has a high chance of changing over time, it is
     best to define a supporting function of the format "populate_<variable_name>". This makes it easy to re-run those functions and repopulate the values.
-     
+
     Please take a look at the example of kobject_types below before making changes to this file.
     Note: The format of the function has to be populate_<variable_name> so that the automated updating will pick it up.
 """
@@ -70,18 +70,21 @@ arm_level2_access_strings = [ " noaccess",
                               " "
                              ]
 
-kq_state_strings = { 0x000: '',
-                     0x001: 'SEL',
-                     0x002: 'SLEEP',
-                     0x004: 'PROCWAIT',
-                     0x008: 'KEV32',
-                     0x010: 'KEV64',
-                     0x020: 'KEVQOS',
-                     0x040: 'WORKQ',
-                     0x080: 'WORKLOOP',
-                     0x100: 'PROCESS',
-                     0x200: 'DRAIN',
-                     0x400: 'WAKEUP' }
+kq_state_strings = { 0x0000: '',
+                     0x0001: 'SEL',
+                     0x0002: 'SLEEP',
+                     0x0004: 'PROCWAIT',
+                     0x0008: '32',
+                     0x0010: '64',
+                     0x0020: 'QOS',
+                     0x0040: 'WQ',
+                     0x0080: 'WL',
+                     0x0100: 'PROCESS',
+                     0x0200: 'DRAIN',
+                     0x0400: 'WAKEUP',
+                     0x0800: 'DYN',
+                     0x1000: 'R2K',
+                     0x2000: 'TS' }
 
 kn_state_strings = { 0x0000: '',
                      0x0001: 'ACTIVE',
@@ -89,23 +92,14 @@ kn_state_strings = { 0x0000: '',
                      0x0004: 'DISABLED',
                      0x0008: 'DROPPING',
                      0x0010: 'LOCKED',
-                     0x0020: 'ATTACHING',
+                     0x0020: 'POSTING',
                      0x0040: 'STAYACTIVE',
-                     0x0080: 'DEFERDROP',
-                     0x0100: 'ATTACHED',
-                     0x0200: 'DISPATCH',
-                     0x0400: 'UDATASPEC',
-                     0x0800: 'SUPPRESS',
-                     0x1000: 'MERGE_QOS',
-                     0x2000: 'REQVANISH',
-                     0x4000: 'VANISHED' }
-
-kqrequest_state_strings = { 0x01: 'WORKLOOP',
-                            0x02: 'THREQUESTED',
-                            0x04: 'WAKEUP',
-                            0x08: 'THOVERCOMMIT',
-                            0x10: 'R2K_ARMED',
-                            0x20: 'ALLOC_TURNSTILE' }
+                     0x0080: 'DEFERDELETE',
+                     0x0100: 'MERGE_QOS',
+                     0x0200: 'REQVANISH',
+                     0x0400: 'VANISHED',
+                     0x0800: 'SUPPRESS' }
+
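
These tables map single bits to short names, so a state bitmask is rendered by testing each bit in turn; a sketch of such a decoder (GetKqStateString is hypothetical):

    def GetKqStateString(kq_state):
        # Decode a kqueue state bitmask using kq_state_strings.
        return " ".join(s for bit, s in sorted(kq_state_strings.items())
                        if bit and (kq_state & bit))
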
 thread_qos_short_strings = { 0: '--',
                              1: 'MT',
                              2: 'BG',
@@ -179,8 +173,8 @@ proc_flag_explain_strings = ["!0x00000004 - process is 32 bit",  #only exception
                              ]
 #File: xnu/osfmk/kern/ipc_kobject.h
 # string representations for Kobject types
-kobject_types = ['', 'THREAD', 'TASK', 'HOST', 'HOST_PRIV', 'PROCESSOR', 'PSET', 'PSET_NAME', 'TIMER', 'PAGER_REQ', 'DEVICE', 'XMM_OBJECT', 'XMM_PAGER', 'XMM_KERNEL', 'XMM_REPLY', 
-                     'NOTDEF 15', 'NOTDEF 16', 'HOST_SEC', 'LEDGER', 'MASTER_DEV', 'TASK_NAME', 'SUBSYTEM', 'IO_DONE_QUE', 'SEMAPHORE', 'LOCK_SET', 'CLOCK', 'CLOCK_CTRL' , 'IOKIT_SPARE', 
+kobject_types = ['', 'THREAD', 'TASK', 'HOST', 'HOST_PRIV', 'PROCESSOR', 'PSET', 'PSET_NAME', 'TIMER', 'PAGER_REQ', 'DEVICE', 'XMM_OBJECT', 'XMM_PAGER', 'XMM_KERNEL', 'XMM_REPLY',
+                     'NOTDEF 15', 'NOTDEF 16', 'HOST_SEC', 'LEDGER', 'MASTER_DEV', 'TASK_NAME', 'SUBSYTEM', 'IO_DONE_QUE', 'SEMAPHORE', 'LOCK_SET', 'CLOCK', 'CLOCK_CTRL' , 'IOKIT_SPARE',
                       'NAMED_MEM', 'IOKIT_CON', 'IOKIT_OBJ', 'UPL', 'MEM_OBJ_CONTROL', 'AU_SESSIONPORT', 'FILEPORT', 'LABELH', 'TASK_RESUME', 'VOUCHER', 'VOUCHER_ATTR_CONTROL', 'WORK_INTERVAL',
                       'UX_HANDLER']
 
@@ -213,7 +207,15 @@ EMBEDDED_PANIC_MAGIC = 0x46554E4B
 EMBEDDED_PANIC_STACKSHOT_SUCCEEDED_FLAG = 0x02
 
 MACOS_PANIC_MAGIC = 0x44454544
+MACOS_PANIC_STACKSHOT_SUCCEEDED_FLAG = 0x04
+
+AURR_PANIC_MAGIC = 0x41555252
+AURR_PANIC_STRING_LEN = 112
+AURR_PANIC_VERSION = 1
+
+CRASHLOG_PANIC_STRING_LEN = 32
+AURR_CRASHLOG_PANIC_VERSION = 2
 
 if __name__ == "__main__":
     populate_kobject_types("../../")
-    
+
index 15213c0a2ea0c014e9e3cf2d3c575e68d34e380f..49aaa5c96a4e87a15a2701d6e5a0697dbe20f19b 100644 (file)
@@ -70,13 +70,13 @@ $(OBJROOT)/%.o: $(SRCROOT)/%.c
        $(CC) $(CFLAGS) $? -o $@
 
 $(DSTROOT):
-       mkdir -p $(DSTROOT);
+       mkdir -p $(DSTROOT)
 
 $(OBJROOT):
-       mkdir -p $(OBJROOT);
+       mkdir -p $(OBJROOT)
 
 $(SYMROOT):
-       mkdir -p $(SYMROOT);
+       mkdir -p $(SYMROOT)
 
 clean:
        rm -rf $(OBJROOT)
index ab4fbadc78114353186d620765657a959b58e19c..e3f8ee3ef0b557e9b4907534f5b4116102299650 100644 (file)
@@ -6,7 +6,7 @@
 #include <assert.h>
 #include <TargetConditionals.h>
 
-#if TARGET_OS_EMBEDDED
+#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 #define MEMSIZE (1L<<28)
 #else
 #define MEMSIZE (1L<<30)
index d2c718f39379f61390b99549f34912af0fb9b264..ba66220c8303520b4076d9dba5fbfd08eb6d27e7 100644 (file)
@@ -1,6 +1,8 @@
 include ../Makefile.common
 
 CC:=$(shell xcrun -sdk "$(SDKROOT)" -find cc)
+CODESIGN:=$(shell xcrun -sdk "$(SDKROOT)" -find codesign)
+CODESIGN_ALLOCATE:=$(shell xcrun -sdk "$(SDKROOT)" -find codesign_allocate)
 
 SYMROOT?=$(shell /bin/pwd)
 
@@ -30,15 +32,21 @@ ARCH_FLAGS := $(if $(ARCH_64), $(ARCH_64_FLAGS)) $(if $(ARCH_32), $(ARCH_32_FLAG
 
 DSTROOT?=$(shell /bin/pwd)
 
-TARGETS := persona_mgr persona_spawn persona_test_run.sh
+TARGETS := persona_mgr persona_spawn persona_test_run.sh persona_spawn_unentitled
 
 all: $(addprefix $(DSTROOT)/, $(TARGETS))
 
-$(DSTROOT)/persona_%: persona_%.c persona_test.h Makefile
+$(DSTROOT)/persona_%: persona_%.c persona_test.h Makefile persona-entitlements.plist
        ${CC} ${CFLAGS} ${ARCH_FLAGS} -o $(SYMROOT)/$(notdir $@) $<
+       env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) \
+           $(CODESIGN) -s - --entitlements persona-entitlements.plist $(SYMROOT)/$(notdir $@)
        if [ ! -e $@ ]; then ditto $(SYMROOT)/$(notdir $@) $@; fi
 
-$(DSTROOT)/persona_test_run.sh: persona_test_run.sh
+$(DSTROOT)/persona_spawn_unentitled: persona_spawn.c persona_test.h Makefile
+       ${CC} ${CFLAGS} ${ARCH_FLAGS} -o $(SYMROOT)/$(notdir $@) $<
+       if [ ! -e $@ ]; then ditto $(SYMROOT)/$(notdir $@) $@; fi
+
+$(DSTROOT)/persona_test_run.sh: persona_test_run_src.sh
        cp $? $@
        chmod +x $@
 
diff --git a/tools/tests/personas/persona-entitlements.plist b/tools/tests/personas/persona-entitlements.plist
new file mode 100644 (file)
index 0000000..43ddfad
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.private.persona-mgmt</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tools/tests/personas/persona_test_run.sh b/tools/tests/personas/persona_test_run.sh
deleted file mode 100755 (executable)
index b07ec37..0000000
+++ /dev/null
@@ -1,569 +0,0 @@
-#!/bin/bash
-# persona_test_run.sh
-#
-# This file aims to be a comprehensive test suite for the persona subsystem.
-# It uses two tools:
-#   1. persona_mgr - create, destroy, lookup personas
-#   2. persona_spawn - spawn processes into personas with a variety of options
-# The script relies heavily on the particular output of these tools, so if you
-# are modifying / extending those tools, this file also needs to be updated to
-# properly capture the new output. Specifically, the get_persona_info function
-# needs to be maintained / updated.
-#
-# NOTE: the function get_persona_info() also needs to be kept up to date with
-# the types of personas found in bsd/sys/persona.h
-
-# be sure to bail on script errors and unexpected tool failures
-set -e
-
-PERSONA_MGR="${PWD}/persona_mgr"
-PERSONA_SPAWN="${PWD}/persona_spawn"
-
-if [ ! -d "$TMPDIR" ]; then
-       echo "Couldn't find temp directory '$TMPDIR': check permissions/environment?"
-       exit 255
-fi
-
-if [ ! -e "${PERSONA_MGR}" ] ||  [ ! -x "${PERSONA_MGR}" ]; then
-       echo "Can't find '${PERSONA_MGR}': skipping test"
-       exit 0
-fi
-if [ ! -e "${PERSONA_SPAWN}" ] || [ ! -x "${PERSONA_SPAWN}" ]; then
-       echo "Can't find '${PERSONA_SPAWN}': skipping test"
-       exit 0
-fi
-
-function check_for_persona_support() {
-       local errno=0
-       ${PERSONA_MGR} support || errno=$?
-       if [ $errno -eq 78 ]; then
-               echo "Persona subsystem is not supported - skipping tests"
-               exit 0
-       fi
-       return 0
-}
-check_for_persona_support
-
-
-## bail [failure_msg]
-#
-# exit the script with an error code that corresponds to the line number
-# from which this function was invoked. Because we want to exit with a
-# non-zero exit code, we use: 1 + (line % 254).
-#
-function bail() {
-       local msg="$1"
-       local line=$2
-       if [ -z "$line" ]; then
-               line=${BASH_LINENO[0]}
-       fi
-       echo "[$line] ERROR: $msg" 1>&2
-       exit $((1 + $line % 254))
-}
-
-## check_return [message_on_failure]
-#
-# Check the return value of the previous command or script line. If the
-# value of '$?' is not 0, then call bail() with an appropriate message.
-#
-function check_return() {
-       local err=$?
-       local msg=$1
-       local line=$2
-       if [ -z "$line" ]; then
-               line=${BASH_LINENO[0]}
-       fi
-       echo "CHECK: $msg"
-       if [ $err -ne 0 ]; then
-               bail "e=$err: $msg" $line
-       fi
-
-       return 0
-}
-
-## expect_failure [message_on_success]
-#
-# Check the return value of the previous command or script line. If the
-# value of '$?' is 0 (success), then call bail() with a message saying
-# that we expected this previous command/line to fail.
-# 
-function expect_failure() {
-       local err=$?
-       local msg=$1
-       local line=$2
-       if [ -z "$line" ]; then
-               line=${BASH_LINENO[0]}
-       fi
-       if [ $err -eq 0 ]; then
-               bail "found success, expected failure: $msg" $line
-       fi
-
-       echo "EXPECT: failure: $msg"
-       return 0
-}
-
-## test_num [debug_info] [number]
-#
-# Check that a variable value is a number, bail() on error.
-#
-function test_num() {
-       local type=$1
-       local num=$2
-       local line=$3
-       if [ -z "$line" ]; then
-               line=${BASH_LINENO[0]}
-       fi
-       if [ -z "$num" ]; then
-               bail "invalid (NULL) $type" $line
-       fi
-       [ "$num" -eq "$num" ] 2>/dev/null
-       if [ $? -ne 0 ]; then
-               bail "invalid $type: $num" $line
-       fi
-
-       return 0
-}
-
-## global variables used to return values to callers
-_ID=-1
-_TYPE="invalid"
-_LOGIN=""
-_UID=-1
-_GID=-1
-_NGROUPS=-1
-_GROUPS=""
-
-## get_persona_info {persona_id} {persona_login}
-#
-# Lookup persona info for the given ID/login. At least one of the ID/login
-# parameters must be valid
-function get_persona_info() {
-       local pna_id=${1:-1}
-       local pna_login=${2:- }
-       local line=$3
-       if [ -z "$line" ]; then
-               line=${BASH_LINENO[0]}
-       fi
-
-       local largs="-u ${pna_id}"
-       if [ "${pna_login}" != " " ]; then
-               largs+=" -l ${pna_login}"
-       fi
-
-       _ID=-1
-       _TYPE=-1
-       _LOGIN=""
-       _UID=-1
-       _GID=-1
-       _NGROUPS=-1
-       _GROUPS=()
-
-       local file="${TMPDIR}/plookup"
-
-       ${PERSONA_MGR} lookup ${largs} > "${file}"
-       check_return "persona lookup of: ${largs}" $line
-
-       _ID=$(cat "${file}" | grep "+id: " | head -1 | sed 's/.*+id:[ ]*\([0-9][0-9]*\).*/\1/')
-       test_num "Persona ID lookup:${largs}" "$_ID"
-
-       local type=$(cat "${file}" | grep "+type: " | head -1 | sed 's/.*+type:[ ]*\([0-9][0-9]*\).*/\1/')
-       test_num "+type lookup:${largs}" "$type"
-       ##
-       ## NOTE: keep in sync with bsd/sys/persona.h types!
-       ##
-       if [ $type -eq 1 ]; then
-               _TYPE=guest
-       elif [ $type -eq 2 ]; then
-               _TYPE=managed
-       elif [ $type -eq 3 ]; then
-               _TYPE=priv
-       elif [ $type -eq 4 ]; then
-               _TYPE=system
-       else
-               _TYPE=invalid
-       fi
-
-       _LOGIN=$(cat "${file}" | grep "+login: " | head -1 | sed 's/.*+login:[ ]*"\([^"]*\)".*/\1/')
-       if [ -z "$_LOGIN" ]; then
-               bail "invalid login for pna_id:$_ID: '$_LOGIN'" $line
-       fi
-
-       # these are always the same
-       _UID=$_ID
-
-       _GID=$(cat "${file}" | grep "+gid: " | head -1 | sed 's/.*+gid:[ ]*\([0-9][0-9]*\).*/\1/')
-       test_num "GID lookup:${largs}" "$_GID"
-
-       _NGROUPS=$(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*\([0-9][0-9]*\)[ ][ ]*{.*}.*/\1/')
-       test_num "NGROUPS lookup:${largs}" "$_NGROUPS"
-
-       _GROUPS=( $(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*[0-9][0-9]*[ ][ ]*{[ ]*\([^ ].*\)[ ][ ]*}.*/\1/') )
-       if [ $_NGROUPS -gt 0 ]; then
-               if [ -z "${_GROUPS}" ]; then
-                       bail "lookup:${largs}: missing $_NGROUPS groups" $line
-               fi
-               if [ ${#_GROUPS[@]} -ne $_NGROUPS ]; then
-                       bail "lookup:${largs} wrong number of groups ${#_GROUPS[@]} != $_NGROUPS" $line
-               fi
-       fi
-}
-
-## validate_child_info [output_file] [persona_id] {uid} {gid} {groups}
-#
-# Parse the output of the 'persona_spawn' command and validate that
-# the new child process is in the correct persona with the correct
-# process attributes.
-#
-function validate_child_info() {
-       local file=$1
-       local pna_id=$2
-       local uid=${3:--1}
-       local gid=${4:--1}
-       local groups=${5:- }
-       local line=$6
-       if [ -z "$line" ]; then
-               line=${BASH_LINENO[0]}
-       fi
-       local l=( )
-
-       # get the child's PID
-       local cpid="$(cat "$file" | grep "Child: PID:" | sed 's/.*Child: PID:\([0-9][0-9]*\).*/\1/')"
-       test_num "Child PID" "$cpid" $line
-
-       # validate the child's persona
-       l=( $(cat "$file" | grep "Child: Persona:" | sed 's/.*Child: Persona: \([0-9][0-9]*\) (err:\([0-9][0-9]*\))/\1 \2/') )
-       if [ ${#l[@]} -ne 2 ]; then
-               bail "Invalid Child[$cpid] Persona line" $line
-       fi
-       test_num "Child Persona ID" "${l[0]}" $line
-       test_num "kpersona_info retval" "${l[1]}" $line
-
-       if [ ${l[0]} -ne $pna_id ]; then
-               bail "Child[$cpid] persona:${l[0]} != specified persona:$pna_id" $line
-       fi
-
-       # Validate the UID/GID
-       l=( $(cat "$file" | grep "Child: UID:" | sed 's/.*UID:\([0-9][0-9]*\), GID:\([0-9][0-9]*\).*/\1 \2/') )
-       if [ ${#l[@]} -ne 2 ]; then
-               bail "Invalid Child[$cpid] UID/GID output" $line
-       fi
-       if [ $uid -ge 0 ]; then
-               if [ $uid -ne ${l[0]} ]; then
-                       bail "Child[$cpid] UID:${l[0]} != specified UID:$uid" $line
-               fi
-       fi
-       if [ $gid -ge 0 ]; then
-               if [ $gid -ne ${l[1]} ]; then
-                       bail "Child[$cpid] GID:${l[1]} != specified GID:$gid" $line
-               fi
-       fi
-
-       # TODO: validate / verify groups?
-
-       return 0
-}
-
-
-## spawn_child [persona_id] {uid} {gid} {group_spec}
-#
-# Create a child process that is spawned into the persona given by
-# the first argument (pna_id). The new process can have its UID, GID,
-# and group membership properties overridden.
-#
-function spawn_child() {
-       local pna_id=$1
-       local uid=${2:--1}
-       local gid=${3:--1}
-       local groups=${4:- }
-       local line=$5
-       if [ -z "$line" ]; then
-               line=${BASH_LINENO[0]}
-       fi
-
-       local file="child.${pna_id}"
-       local spawn_args="-I $pna_id"
-       if [ $uid -ge 0 ]; then
-               spawn_args+=" -u $uid"
-               file+=".u$uid"
-       fi
-       if [ $gid -ge 0 ]; then
-               spawn_args+=" -g $gid"
-               file+=".g$gid"
-       fi
-       if [ "$groups" != " " ]; then
-               spawn_args+=" -G $groups"
-               file+="._groups"
-       fi
-
-       echo "SPAWN: $file"
-       ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN} child -v -E > "${TMPDIR}/$file"
-       check_return "child info: $file" $line
-
-       # Grab the specified persona's info so we can
-       # verify the child's info against it.
-       # This function puts data into global variables, e.g. _ID, _GID, etc.
-       get_persona_info ${pna_id} " " $line
-       if [ $uid -lt 0 ]; then
-               uid=$_UID
-       fi
-       if [ $gid -lt 0 ]; then
-               gid=$_GID
-       fi
-       if [ "$groups" == " " ]; then
-               # convert a bash array into a comma-separated list for validation
-               local _g="${_GROUPS[@]}"
-               groups="${_g// /,}"
-       fi
-
-       validate_child_info "${TMPDIR}/$file" "$pna_id" "$uid" "$gid" "$groups" $line
-
-       ## TODO: validate that the first child spawned into a persona *cannot* spawn
-       ## into a different persona...
-       ##if [ $uid -eq 0 ]; then
-       ##      ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN} child -v -E -R -v -I 99 /bin/echo "This is running in the system persona"
-       ##      expect_failure "Spawned child that re-execs into non-default persona" $line
-       ##fi
-       return 0
-}
-
-## get_created_id [output_file]
-#
-# Parse the output of the 'persona_mgr' command to determine the ID
-# of the newly created persona.
-#
-function get_created_id() {
-       local file=$1
-       local o=$(cat "$file" | grep "Created persona" | sed 's/.*Created persona \([0-9][0-9]*\):/\1/')
-       echo $o
-       return 0
-}
-
-## create_persona [login_name] [persona_type] {persona_id} {gid} {group_spec}
-#
-# Create a new persona with given parameters.
-#
-# Returns: the newly created persona ID via the global variable $_ID
-#
-function create_persona() {
-       local name=${1}
-       local type=${2}
-       local pna_id=${3:--1}
-       local gid=${4:--1}
-       local groups=${5:- }
-       local line=$6
-       if [ -z "$line" ]; then
-               line=${BASH_LINENO[0]}
-       fi
-
-       if [ -z "$name" -o -z "$type" ]; then
-               bail "Invalid arguments to create_persona '$name' '$type'" $line
-       fi
-
-       local file="persona.at${line}"
-       # persona ID of '-1' is auto-assigned
-       local spawn_args="-v -l $name -i $pna_id"
-       if [ $pna_id -eq -1 ]; then
-               file+=".auto"
-       else
-               file+=".${pna_id}"
-       fi
-
-       spawn_args+=" -t $type"
-       file+=".$type"
-
-       if [ $gid -ge 0 ]; then
-               spawn_args+=" -g $gid"
-               file+=".g$gid"
-       fi
-       if [ "$groups" != " " ]; then
-               spawn_args+=" -G $groups"
-               file+="._groups"
-       fi
-
-       echo "CREATE: $file"
-       ${PERSONA_MGR} create ${spawn_args} > "${TMPDIR}/${file}"
-       check_return "persona creation: ${file}" $line
-       # test output should include persona creation output for later debugging
-       cat "${TMPDIR}/${file}"
-
-       # validate the output of the persona_mgr tool (what we think we created)
-       _ID=`get_created_id "${TMPDIR}/${file}"`
-       test_num "persona_id for $file" "$_ID" $line
-       if [ ${pna_id} -gt 0 ]; then
-               if [ $_ID -ne ${pna_id} ]; then
-                       bail "Created persona doesn't have expected ID $_ID != ${pna_id}" $line
-               fi
-       fi
-
-       # validate the entire persona information (what a kpersona_lookup says we created)
-       # This function puts data into global variables, e.g. _ID, _LOGIN, _GID, etc.
-       echo "VALIDATE: ${file}"
-       get_persona_info ${pna_id} "$name" $line
-       if [ "$name" != "$_LOGIN" ]; then
-               bail "${file}: unexpected login '$_LOGIN' != '$name'" $line
-       fi
-       if [ "$type" != "$_TYPE" ]; then
-               bail "${file}: unexpected type '$_TYPE' != '$type'" $line
-       fi
-       if [ ${pna_id} -gt 0 ]; then
-               if [ ${pna_id} -ne $_ID ]; then
-                       bail "${file}: unexpected ID '$_ID' != '${pna_id}'" $line
-               fi
-       fi
-       if [ $gid -ge 0 ]; then
-               if [ $gid -ne $_GID ]; then
-                       bail "${file}: unexpected GID '$_GID' != '$gid'" $line
-               fi
-       fi
-       if [ "$groups" != " " ]; then
-               local _g="${_GROUPS[@]}"
-               if [ "${_g// /,}" != "$groups" ]; then
-                       bail "${file}: unexpected groups '${_g// /,}' != '$groups'" $line
-               fi
-       fi
-
-       return 0
-}
-
-## destroy_persona [persona_id]
-#
-# Destroy the given persona.
-#
-function destroy_persona() {
-       local pna_id=$1
-       local line=$2
-       if [ -z "$line" ]; then
-               line=${BASH_LINENO[0]}
-       fi
-
-       echo "DESTROY: ${pna_id}"
-       ${PERSONA_MGR} destroy -v -i ${pna_id}
-       check_return "destruction of ${pna_id}" $line
-}
-
-#
-#
-# Begin Tests!
-#
-#
-echo "Running persona tests [$LINENO] ($TMPDIR)"
-
-##
-## Test Group 0: basic creation + spawn tests
-##
-
-# default group, specific ID
-create_persona "test0_1" "guest" 1001
-P0ID=$_ID
-spawn_child $P0ID
-spawn_child $P0ID 1100
-spawn_child $P0ID 0
-spawn_child $P0ID -1 1101
-spawn_child $P0ID 1100 1101
-spawn_child $P0ID 1100 1101 1000,2000,3000
-spawn_child $P0ID 1100 -1 1000,2000,3000
-spawn_child $P0ID -1 -1 1000,2000,3000
-destroy_persona $P0ID
-
-# specific ID, non-default group
-create_persona "test0_2" "guest" 1002 2000
-P0ID=$_ID
-spawn_child $P0ID
-spawn_child $P0ID 1100
-spawn_child $P0ID 0
-spawn_child $P0ID -1 1101
-spawn_child $P0ID 1100 1101
-spawn_child $P0ID 1100 1101 1000,2000,3000
-spawn_child $P0ID 1100 -1 1000,2000,3000
-spawn_child $P0ID -1 -1 1000,2000,3000
-destroy_persona $P0ID
-
-# non-default set of groups
-create_persona "test0_3" "guest" 1003 2000 2000,3000,4000
-P0ID=$_ID
-spawn_child $P0ID
-spawn_child $P0ID 1100
-spawn_child $P0ID 0
-spawn_child $P0ID -1 1101
-spawn_child $P0ID 1100 1101
-spawn_child $P0ID 1100 1101 1111,2222,3333
-spawn_child $P0ID 1100 -1 1111,2222,3333
-spawn_child $P0ID -1 -1 1111,2222,3333
-destroy_persona $P0ID
-
-
-##
-## Test Group 1: persona creation / re-creation
-##
-
-# Create 4 personas with auto-assigned IDs
-create_persona "test1_1" "guest"
-P1ID=$_ID
-create_persona "test1_2" "managed"
-P2ID=$_ID
-create_persona "test1_3" "priv"
-P3ID=$_ID
-create_persona "test1_4" "system"
-P4ID=$_ID
-
-D1=$(($P2ID - $P1ID))
-D2=$(($P3ID - $P2ID))
-D3=$(($P4ID - $P3ID))
-if [ $D1 -ne $D2 -o $D1 -ne $D3 -o $D2 -ne $D3 ]; then
-       bail "inconsistent automatic Persona ID increment: $D1,$D2,$D3 ($P1ID,$P2ID,$P3ID,$P4ID)"
-fi
-
-# make sure we can't re-allocate the same name / ID
-${PERSONA_MGR} create -v -l test1_1 -t guest -i -1 && expect_failure "re-create same name:test1_1 type:guest"
-${PERSONA_MGR} create -v -l test1_1 -t managed -i -1 && expect_failure "re-create same name:test1_1 type:managed"
-${PERSONA_MGR} create -v -l test1_1_new -t managed -i $P1ID && expect_failure "re-create $P1ID with new name:test1_1_new type:managed"
-
-##
-## Test Group 2: auto-assigned ID tricks
-##
-
-# Notice the difference in IDs, then try to create a persona by
-# specifying an ID that will match the next auto-assigned ID
-# (should succeed)
-P5ID_REQ=$(($P4ID + $D2))
-create_persona "test2_1" "guest" ${P5ID_REQ}
-P5ID=$_ID
-if [ ! $P5ID -eq ${P5ID_REQ} ]; then
-       bail "test2_1: ${P5ID_REQ} != $P5ID"
-fi
-
-# try to create a persona with auto-assigned ID
-# (resulting persona should have ID != P5ID)
-create_persona "test2_2" "guest"
-P6ID=$_ID
-if [ $P6ID -eq $P5ID ]; then
-       bail "created duplicate persona IDs: $P6ID == $P5ID"
-fi
-
-##
-## Test Group 3: persona destruction
-##
-
-destroy_persona $P1ID
-destroy_persona $P2ID
-destroy_persona $P3ID
-destroy_persona $P4ID
-destroy_persona $P5ID
-destroy_persona $P6ID
-
-# try to re-destroy the personas
-# (should fail)
-${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (1/2) $P1ID"
-${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (2/2) $P1ID"
-${PERSONA_MGR} destroy -v -i $P2ID && expect_failure "re-destroy $P2ID"
-${PERSONA_MGR} destroy -v -i $P3ID && expect_failure "re-destroy $P3ID"
-${PERSONA_MGR} destroy -v -i $P4ID && expect_failure "re-destroy $P4ID"
-${PERSONA_MGR} destroy -v -i $P5ID && expect_failure "re-destroy $P5ID"
-${PERSONA_MGR} destroy -v -i $P6ID && expect_failure "re-destroy $P6ID"
-
-# cleanup
-rm -rf "${TMPDIR}"
-
-echo ""
-echo "${0##/}: SUCCESS"
-exit 0
diff --git a/tools/tests/personas/persona_test_run_src.sh b/tools/tests/personas/persona_test_run_src.sh
new file mode 100755 (executable)
index 0000000..95e132a
--- /dev/null
@@ -0,0 +1,575 @@
+#!/bin/bash
+# persona_test_run.sh
+#
+# This file aims to be a comprehensive test suite for the persona subsystem.
+# It uses two tools:
+#   1. persona_mgr - create, destroy, lookup personas
+#   2. persona_spawn - spawn processes into personas with a variety of options
+# The script relies heavily on the particular output of these tools, so if you
+# are modifying / extending those tools, this file also needs to be updated to
+# properly capture the new output. Specifically, the get_persona_info function
+# needs to be maintained / updated.
+#
+# NOTE: the function get_persona_info() also needs to be kept up to date with
+# the types of personas found in bsd/sys/persona.h
+
+PERSONA_MGR="${PWD}/persona_mgr"
+PERSONA_SPAWN="${PWD}/persona_spawn"
+PERSONA_SPAWN_UNENTITLED="${PWD}/persona_spawn_unentitled"
+
+TEST_DEFAULT_PERSONA=0
+
+if [ ! -d "$TMPDIR" ]; then
+       echo "Couldn't find temp directory '$TMPDIR': check permissions/environment?"
+       exit 255
+fi
+
+if [ ! -e "${PERSONA_MGR}" ] ||  [ ! -x "${PERSONA_MGR}" ]; then
+       echo "Can't find '${PERSONA_MGR}': skipping test"
+       exit 0
+fi
+if [ ! -e "${PERSONA_SPAWN}" ] || [ ! -x "${PERSONA_SPAWN}" ]; then
+       echo "Can't find '${PERSONA_SPAWN}': skipping test"
+       exit 0
+fi
+
+function check_for_persona_support() {
+       local errno=0
+       ${PERSONA_MGR} support || errno=$?
+       if [ $errno -eq 78 ]; then
+               echo "Persona subsystem is not supported - skipping tests"
+               exit 0
+       fi
+       return 0
+}
+check_for_persona_support
+
+
+## bail [failure_msg]
+#
+# exit the script with an error code that corresponds to the line number
+# from which this function was invoked. Because we want to exit with a
+# non-zero exit code, we use: 1 + (line % 254).
+#
+function bail() {
+       local msg="$1"
+       local line=$2
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+       echo "[$line] ERROR: $msg" 1>&2
+       exit $((1 + $line % 254))
+}
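+# A minimal sketch of the exit-status mapping (line numbers hypothetical):
+# a bail() invoked from script line 300 exits with $((1 + 300 % 254)) = 47,
+# and one invoked from line 254 exits with 1, so the status is always in
+# 1..254 and never 0. Typical call:
+#
+#     [ -d "$TMPDIR" ] || bail "TMPDIR '$TMPDIR' is not a directory"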
+
+## check_return [message_on_failure]
+#
+# Check the return value of the previous command or script line. If the
+# value of '$?' is not 0, then call bail() with an appropriate message.
+#
+function check_return() {
+       local err=$?
+       local msg=$1
+       local line=$2
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+       echo "CHECK: $msg"
+       if [ $err -ne 0 ]; then
+               bail "e=$err: $msg" $line
+       fi
+
+       return 0
+}
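+# Usage sketch (persona ID hypothetical): check_return must run on the
+# line immediately after the command so that $? still holds its status:
+#
+#     ${PERSONA_MGR} lookup -u 1001 > /dev/null
+#     check_return "lookup of persona 1001"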
+
+## expect_failure [message_on_success]
+#
+# Check the return value of the previous command or script line. If the
+# value of '$?' is 0 (success), then call bail() with a message saying
+# that we expected this previous command/line to fail.
+# 
+function expect_failure() {
+       local err=$?
+       local msg=$1
+       local line=$2
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+       if [ $err -eq 0 ]; then
+               bail "found success, expected failure: $msg" $line
+       fi
+
+       echo "EXPECT: failure: $msg"
+       return 0
+}
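+# Note the idiom used by the tests below: 'cmd && expect_failure "msg"'.
+# expect_failure only runs when cmd succeeds ($? is 0), which is exactly
+# the case that should bail; when cmd fails as expected, the '&&'
+# short-circuits and the line is a quiet no-op. For example:
+#
+#     /usr/bin/false && expect_failure "false unexpectedly succeeded"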
+
+## test_num [debug_info] [number]
+#
+# Check that a variable value is a number, bail() on error.
+#
+function test_num() {
+       local type=$1
+       local num=$2
+       local line=$3
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+       if [ -z "$num" ]; then
+               bail "invalid (NULL) $type" $line
+       fi
+       [ "$num" -eq "$num" ] 2>/dev/null
+       if [ $? -ne 0 ]; then
+               bail "invalid $type: $num" $line
+       fi
+
+       return 0
+}
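+# The '[ "$num" -eq "$num" ]' check above is a common integer test:
+# test(1) only evaluates -eq on integers, so '[ "42" -eq "42" ]' returns
+# 0 while '[ "x1" -eq "x1" ]' prints an error (discarded) and fails.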
+
+## global variables used to return values to callers
+_ID=-1
+_TYPE="invalid"
+_LOGIN=""
+_UID=-1
+_GID=-1
+_NGROUPS=-1
+_GROUPS=""
+
+## get_persona_info {persona_id} {persona_login}
+#
+# Look up persona info for the given ID/login. At least one of the ID/login
+# parameters must be valid.
+function get_persona_info() {
+       local pna_id=${1:-1}
+       local pna_login=${2:- }
+       local line=$3
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+
+       local largs="-u ${pna_id}"
+       if [ "${pna_login}" != " " ]; then
+               largs+=" -l ${pna_login}"
+       fi
+
+       _ID=-1
+       _TYPE=-1
+       _LOGIN=""
+       _UID=-1
+       _GID=-1
+       _NGROUPS=-1
+       _GROUPS=()
+
+       local file="${TMPDIR}/plookup"
+
+       ${PERSONA_MGR} lookup ${largs} > "${file}"
+       check_return "persona lookup of: ${largs}" $line
+
+       _ID=$(cat "${file}" | grep "+id: " | head -1 | sed 's/.*+id:[ ]*\([0-9][0-9]*\).*/\1/')
+       test_num "Persona ID lookup:${largs}" "$_ID"
+
+       local type=$(cat "${file}" | grep "+type: " | head -1 | sed 's/.*+type:[ ]*\([0-9][0-9]*\).*/\1/')
+       test_num "+type lookup:${largs}" "$type"
+       ##
+       ## NOTE: keep in sync with bsd/sys/persona.h types!
+       ##
+       if [ $type -eq 1 ]; then
+               _TYPE=guest
+       elif [ $type -eq 2 ]; then
+               _TYPE=managed
+       elif [ $type -eq 3 ]; then
+               _TYPE=priv
+       elif [ $type -eq 4 ]; then
+               _TYPE=system
+       else
+               _TYPE=invalid
+       fi
+
+       _LOGIN=$(cat "${file}" | grep "+login: " | head -1 | sed 's/.*+login:[ ]*"\([^"]*\)".*/\1/')
+       if [ -z "$_LOGIN" ]; then
+               bail "invalid login for pna_id:$_ID: '$_LOGIN'" $line
+       fi
+
+       # these are always the same
+       _UID=$_ID
+
+       _GID=$(cat "${file}" | grep "+gid: " | head -1 | sed 's/.*+gid:[ ]*\([0-9][0-9]*\).*/\1/')
+       test_num "GID lookup:${largs}" "$_GID"
+
+       _NGROUPS=$(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*\([0-9][0-9]*\)[ ][ ]*{.*}.*/\1/')
+       test_num "NGROUPS lookup:${largs}" "$_NGROUPS"
+
+       _GROUPS=( $(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*[0-9][0-9]*[ ][ ]*{[ ]*\([^ ].*\)[ ][ ]*}.*/\1/') )
+       if [ $_NGROUPS -gt 0 ]; then
+               if [ -z "${_GROUPS}" ]; then
+                       bail "lookup:${largs}: missing $_NGROUPS groups" $line
+               fi
+               if [ ${#_GROUPS[@]} -ne $_NGROUPS ]; then
+                       bail "lookup:${largs} wrong number of groups ${#_GROUPS[@]} != $_NGROUPS" $line
+               fi
+       fi
+}
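+# The grep/sed parsing above assumes 'persona_mgr lookup' output shaped
+# roughly like this sketch (spacing and values hypothetical; only the
+# '+id:', '+type:', '+login:', '+gid:' and 'ngroups:' fields are parsed):
+#
+#     +id: 1001
+#     +type: 1
+#     +login: "test0_1"
+#     +gid: 1001
+#     ngroups: 3  { 1000 2000 3000 }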
+
+## validate_child_info [output_file] [persona_id] {uid} {gid} {groups}
+#
+# Parse the output of the 'persona_spawn' command and validate that
+# the new child process is in the correct persona with the correct
+# process attributes.
+#
+function validate_child_info() {
+       local file=$1
+       local pna_id=$2
+       local uid=${3:--1}
+       local gid=${4:--1}
+       local groups=${5:- }
+       local line=$6
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+       local l=( )
+
+       # get the child's PID
+       local cpid="$(cat "$file" | grep "Child: PID:" | sed 's/.*Child: PID:\([0-9][0-9]*\).*/\1/')"
+       test_num "Child PID" "$cpid" $line
+
+       # validate the child's persona
+       l=( $(cat "$file" | grep "Child: Persona:" | sed 's/.*Child: Persona: \([0-9][0-9]*\) (err:\([0-9][0-9]*\))/\1 \2/') )
+       if [ ${#l[@]} -ne 2 ]; then
+               bail "Invalid Child[$cpid] Persona line" $line
+       fi
+       test_num "Child Persona ID" "${l[0]}" $line
+       test_num "kpersona_info retval" "${l[1]}" $line
+
+       if [ ${l[0]} -ne $pna_id ]; then
+               bail "Child[$cpid] persona:${l[0]} != specified persona:$pna_id" $line
+       fi
+
+       # Validate the UID/GID
+       l=( $(cat "$file" | grep "Child: UID:" | sed 's/.*UID:\([0-9][0-9]*\), GID:\([0-9][0-9]*\).*/\1 \2/') )
+       if [ ${#l[@]} -ne 2 ]; then
+               bail "Invalid Child[$cpid] UID/GID output" $line
+       fi
+       if [ $uid -ge 0 ]; then
+               if [ $uid -ne ${l[0]} ]; then
+                       bail "Child[$cpid] UID:${l[0]} != specified UID:$uid" $line
+               fi
+       fi
+       if [ $gid -ge 0 ]; then
+               if [ $gid -ne ${l[1]} ]; then
+                       bail "Child[$cpid] GID:${l[1]} != specified GID:$gid" $line
+               fi
+       fi
+
+       # TODO: validate / verify groups?
+
+       return 0
+}
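+# Parsing sketch: the function above expects 'persona_spawn' output lines
+# of roughly this shape (values hypothetical):
+#
+#     Child: PID:123
+#     Child: Persona: 1001 (err:0)
+#     Child: UID:1100, GID:1101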
+
+
+## spawn_child [persona_id] {uid} {gid} {group_spec}
+#
+# Create a child process that is spawned into the persona given by
+# the first argument (pna_id). The new process can have its UID, GID,
+# and group membership properties overridden.
+#
+function spawn_child() {
+       local pna_id=$1
+       local uid=${2:--1}
+       local gid=${3:--1}
+       local groups=${4:- }
+       local line=$5
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+
+       local file="child.${pna_id}"
+       local spawn_args="-I $pna_id"
+       if [ $uid -ge 0 ]; then
+               spawn_args+=" -u $uid"
+               file+=".u$uid"
+       fi
+       if [ $gid -ge 0 ]; then
+               spawn_args+=" -g $gid"
+               file+=".g$gid"
+       fi
+       if [ "$groups" != " " ]; then
+               spawn_args+=" -G $groups"
+               file+="._groups"
+       fi
+
+       echo "SPAWN: $file"
+       ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN} child -v -E > "${TMPDIR}/$file"
+       check_return "child info: $file" $line
+
+       # Grab the specified persona's info so we can
+       # verify the child's info against it.
+       # This function puts data into global variables, e.g. _ID, _GID, etc.
+       get_persona_info ${pna_id} " " $line
+       if [ $uid -lt 0 ]; then
+               uid=$_UID
+       fi
+       if [ $gid -lt 0 ]; then
+               gid=$_GID
+       fi
+       if [ "$groups" == " " ]; then
+               # convert a bash array into a comma-separated list for validation
+               local _g="${_GROUPS[@]}"
+               groups="${_g// /,}"
+       fi
+
+       validate_child_info "${TMPDIR}/$file" "$pna_id" "$uid" "$gid" "$groups" $line
+
+       ## validate that the first child spawned into a persona *cannot* spawn
+       ## into a different persona...
+       if [ $uid -eq 0 ]; then
+               ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN_UNENTITLED} child -v -E -R spawn -v $spawn_args -I ${TEST_DEFAULT_PERSONA} /bin/echo "This is running in the system persona"
+               expect_failure "Spawned child that re-execs into non-default persona" $line
+       fi
+       return 0
+}
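+# Argument sketch: spawn_child PERSONA_ID [UID] [GID] [GROUP_LIST], where
+# -1 (or omission) means "inherit that attribute from the persona", e.g.:
+#
+#     spawn_child 1001                    # everything from persona 1001
+#     spawn_child 1001 1100 -1 1,2,3      # override UID and groups only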
+
+## get_created_id [output_file]
+#
+# Parse the output of the 'persona_mgr' command to determine the ID
+# of the newly created persona.
+#
+function get_created_id() {
+       local file=$1
+       local o=$(cat "$file" | grep "Created persona" | sed 's/.*Created persona \([0-9][0-9]*\):/\1/')
+       echo $o
+       return 0
+}
+
+## create_persona [login_name] [persona_type] {persona_id} {gid} {group_spec}
+#
+# Create a new persona with given parameters.
+#
+# Returns: the newly created persona ID via the global variable $_ID
+#
+function create_persona() {
+       local name=${1}
+       local type=${2}
+       local pna_id=${3:--1}
+       local gid=${4:--1}
+       local groups=${5:- }
+       local line=$6
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+
+       if [ -z "$name" -o -z "$type" ]; then
+               bail "Invalid arguments to create_persona '$name' '$type'" $line
+       fi
+
+       local file="persona.at${line}"
+       # persona ID of '-1' is auto-assigned
+       local spawn_args="-v -l $name -i $pna_id"
+       if [ $pna_id -eq -1 ]; then
+               file+=".auto"
+       else
+               file+=".${pna_id}"
+       fi
+
+       spawn_args+=" -t $type"
+       file+=".$type"
+
+       if [ $gid -ge 0 ]; then
+               spawn_args+=" -g $gid"
+               file+=".g$gid"
+       fi
+       if [ "$groups" != " " ]; then
+               spawn_args+=" -G $groups"
+               file+="._groups"
+       fi
+
+       echo "CREATE: $file"
+       ${PERSONA_MGR} create ${spawn_args} > "${TMPDIR}/${file}"
+       check_return "persona creation: ${file}" $line
+       # test output should include persona creation output for later debugging
+       cat "${TMPDIR}/${file}"
+
+       # validate the output of the persona_mgr tool (what we think we created)
+       _ID=`get_created_id "${TMPDIR}/${file}"`
+       test_num "persona_id for $file" "$_ID" $line
+       if [ ${pna_id} -gt 0 ]; then
+               if [ $_ID -ne ${pna_id} ]; then
+                       bail "Created persona doesn't have expected ID $_ID != ${pna_id}" $line
+               fi
+       fi
+
+       # validate the entire persona information (what a kpersona_lookup says we created)
+       # This function puts data into global variables, e.g. _ID, _LOGIN, _GID, etc.
+       echo "VALIDATE: ${file}"
+       get_persona_info ${pna_id} "$name" $line
+       if [ "$name" != "$_LOGIN" ]; then
+               bail "${file}: unexpected login '$_LOGIN' != '$name'" $line
+       fi
+       if [ "$type" != "$_TYPE" ]; then
+               bail "${file}: unexpected type '$_TYPE' != '$type'" $line
+       fi
+       if [ ${pna_id} -gt 0 ]; then
+               if [ ${pna_id} -ne $_ID ]; then
+                       bail "${file}: unexpected ID '$_ID' != '${pna_id}'" $line
+               fi
+       fi
+       if [ $gid -ge 0 ]; then
+               if [ $gid -ne $_GID ]; then
+                       bail "${file}: unexpected GID '$_GID' != '$gid'" $line
+               fi
+       fi
+       if [ "$groups" != " " ]; then
+               local _g="${_GROUPS[@]}"
+               if [ "${_g// /,}" != "$groups" ]; then
+                       bail "${file}: unexpected groups '${_g// /,}' != '$groups'" $line
+               fi
+       fi
+
+       return 0
+}
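+# Usage sketch (IDs hypothetical); both calls leave the new persona's ID
+# in $_ID:
+#
+#     create_persona "demo_auto" "guest"          # auto-assigned ID
+#     create_persona "demo_fixed" "managed" 2001  # explicit ID 2001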
+
+## destroy_persona [persona_id]
+#
+# Destroy the given persona.
+#
+function destroy_persona() {
+       local pna_id=$1
+       local line=$2
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+
+       echo "DESTROY: ${pna_id}"
+       ${PERSONA_MGR} destroy -v -i ${pna_id}
+       check_return "destruction of ${pna_id}" $line
+}
+
+#
+#
+# Begin Tests!
+#
+#
+echo "Running persona tests [$LINENO] ($TMPDIR)"
+
+##
+## Test Group 0: basic creation + spawn tests
+##
+
+create_persona "test_default_persona" "guest" 9999
+TEST_DEFAULT_PERSONA=$_ID
+
+# default group, specific ID
+create_persona "test0_1" "guest" 1001
+P0ID=$_ID
+
+spawn_child $P0ID
+spawn_child $P0ID 1100
+spawn_child $P0ID 0
+spawn_child $P0ID -1 1101
+spawn_child $P0ID 1100 1101
+spawn_child $P0ID 1100 1101 1000,2000,3000
+spawn_child $P0ID 1100 -1 1000,2000,3000
+spawn_child $P0ID -1 -1 1000,2000,3000
+destroy_persona $P0ID
+
+# specific ID, non-default group
+create_persona "test0_2" "guest" 1002 2000
+P0ID=$_ID
+spawn_child $P0ID
+spawn_child $P0ID 1100
+spawn_child $P0ID 0
+spawn_child $P0ID -1 1101
+spawn_child $P0ID 1100 1101
+spawn_child $P0ID 1100 1101 1000,2000,3000
+spawn_child $P0ID 1100 -1 1000,2000,3000
+spawn_child $P0ID -1 -1 1000,2000,3000
+destroy_persona $P0ID
+
+# non-default set of groups
+create_persona "test0_3" "guest" 1003 2000 2000,3000,4000
+P0ID=$_ID
+spawn_child $P0ID
+spawn_child $P0ID 1100
+spawn_child $P0ID 0
+spawn_child $P0ID -1 1101
+spawn_child $P0ID 1100 1101
+spawn_child $P0ID 1100 1101 1111,2222,3333
+spawn_child $P0ID 1100 -1 1111,2222,3333
+spawn_child $P0ID -1 -1 1111,2222,3333
+destroy_persona $P0ID
+
+
+##
+## Test Group 1: persona creation / re-creation
+##
+
+# Create 4 personas with auto-assigned IDs
+create_persona "test1_1" "guest"
+P1ID=$_ID
+create_persona "test1_2" "managed"
+P2ID=$_ID
+create_persona "test1_3" "priv"
+P3ID=$_ID
+create_persona "test1_4" "guest"
+P4ID=$_ID
+
+D1=$(($P2ID - $P1ID))
+D2=$(($P3ID - $P2ID))
+D3=$(($P4ID - $P3ID))
+if [ $D1 -ne $D2 -o $D1 -ne $D3 -o $D2 -ne $D3 ]; then
+       bail "inconsistent automatic Persona ID increment: $D1,$D2,$D3 ($P1ID,$P2ID,$P3ID,$P4ID)"
+fi
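+# e.g. if auto-assignment yielded 1000, 1010, 1020, 1030 (hypothetical),
+# then D1 = D2 = D3 = 10 and the next auto-assigned ID should be 1040,
+# which is what Test Group 2 below relies on.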
+
+# make sure we can't re-allocate the same name / ID
+${PERSONA_MGR} create -v -l test1_1 -t guest -i -1 && expect_failure "re-create same name:test1_1 type:guest"
+${PERSONA_MGR} create -v -l test1_1 -t managed -i -1 && expect_failure "re-create same name:test1_1 type:managed"
+${PERSONA_MGR} create -v -l test1_1_new -t managed -i $P1ID && expect_failure "re-create $P1ID with new name:test1_1_new type:managed"
+
+##
+## Test Group 2: auto-assigned ID tricks
+##
+
+# Notice the difference in IDs, then try to create a persona by
+# specifying an ID that will match the next auto-assigned ID
+# (should succeed)
+P5ID_REQ=$(($P4ID + $D2))
+create_persona "test2_1" "guest" ${P5ID_REQ}
+P5ID=$_ID
+if [ ! $P5ID -eq ${P5ID_REQ} ]; then
+       bail "test2_1: ${P5ID_REQ} != $P5ID"
+fi
+
+# try to create a persona with auto-assigned ID
+# (resulting persona should have ID != P5ID)
+create_persona "test2_2" "guest"
+P6ID=$_ID
+if [ $P6ID -eq $P5ID ]; then
+       bail "created duplicate persona IDs: $P6ID == $P5ID"
+fi
+
+##
+## Test Group 3: persona destruction
+##
+
+destroy_persona $P1ID
+destroy_persona $P2ID
+destroy_persona $P3ID
+destroy_persona $P4ID
+destroy_persona $P5ID
+destroy_persona $P6ID
+
+# try to re-destroy the personas
+# (should fail)
+${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (1/2) $P1ID"
+${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (2/2) $P1ID"
+${PERSONA_MGR} destroy -v -i $P2ID && expect_failure "re-destroy $P2ID"
+${PERSONA_MGR} destroy -v -i $P3ID && expect_failure "re-destroy $P3ID"
+${PERSONA_MGR} destroy -v -i $P4ID && expect_failure "re-destroy $P4ID"
+${PERSONA_MGR} destroy -v -i $P5ID && expect_failure "re-destroy $P5ID"
+${PERSONA_MGR} destroy -v -i $P6ID && expect_failure "re-destroy $P6ID"
+
+destroy_persona ${TEST_DEFAULT_PERSONA}
+
+# cleanup
+rm -rf "${TMPDIR}"
+
+echo ""
+echo "${0##/}: SUCCESS"
+exit 0
index 39b7cd915e9a05de26cb17a37ae717a1aa6e42f1..cd1963c5655bd2fe63dbf67149837c8b53ad7fc7 100644 (file)
@@ -67,6 +67,7 @@ typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY
 #define CONSTRAINT_NANOS        (20000000ll)    /* 20 ms */
 #define COMPUTATION_NANOS       (10000000ll)    /* 10 ms */
 #define TRACEWORTHY_NANOS       (10000000ll)    /* 10 ms */
+#define TRACEWORTHY_NANOS_TEST  ( 2000000ll)    /*  2 ms */
 
 #if DEBUG
 #define debug_log(args ...) printf(args)
@@ -131,6 +132,9 @@ static boolean_t                g_test_rt_smt = FALSE;
 /* Test whether realtime threads are successfully avoiding CPU 0 on Intel */
 static boolean_t                g_test_rt_avoid0 = FALSE;
 
+/* Print a histogram showing how many threads ran on each CPU */
+static boolean_t                g_histogram = FALSE;
+
 /* One randomly chosen thread holds up the train for a certain duration. */
 static boolean_t                g_do_one_long_spin = FALSE;
 static uint32_t                 g_one_long_spin_id = 0;
@@ -681,6 +685,11 @@ main(int argc, char **argv)
                }
                g_policy = MY_POLICY_REALTIME;
                g_do_all_spin = TRUE;
+               g_histogram = true;
+               /* Don't change g_traceworthy_latency_ns if it's explicitly been set to something other than the default */
+               if (g_traceworthy_latency_ns == TRACEWORTHY_NANOS) {
+                       g_traceworthy_latency_ns = TRACEWORTHY_NANOS_TEST;
+               }
        } else if (g_test_rt_smt) {
                if (g_nlogicalcpu != 2 * g_nphysicalcpu) {
                        /* Not SMT */
@@ -693,6 +702,7 @@ main(int argc, char **argv)
                }
                g_policy = MY_POLICY_REALTIME;
                g_do_all_spin = TRUE;
+               g_histogram = true;
        } else if (g_test_rt_avoid0) {
 #if defined(__x86_64__) || defined(__i386__)
                if (g_numthreads == 0) {
@@ -704,6 +714,7 @@ main(int argc, char **argv)
                }
                g_policy = MY_POLICY_REALTIME;
                g_do_all_spin = TRUE;
+               g_histogram = true;
 #else
                printf("Attempt to run --test-rt-avoid0 on a non-Intel device\n");
                exit(0);
@@ -948,13 +959,15 @@ main(int argc, char **argv)
        }
 #endif
 
-       if (g_test_rt || g_test_rt_smt || g_test_rt_avoid0) {
+       if (g_histogram) {
                putchar('\n');
 
                for (uint32_t i = 0; i < g_numcpus; i++) {
                        printf("%d\t%d\n", i, g_cpu_histogram[i].accum);
                }
+       }
 
+       if (g_test_rt || g_test_rt_smt || g_test_rt_avoid0) {
 #define PRIMARY   0x5555555555555555ULL
 #define SECONDARY 0xaaaaaaaaaaaaaaaaULL
 
@@ -970,7 +983,7 @@ main(int argc, char **argv)
                                /* Test for threads running on both primary and secondary cpus of the same core (FAIL) */
                                fail = ((map & PRIMARY) & ((map & SECONDARY) >> 1));
                        } else if (g_test_rt) {
-                               fail = __builtin_popcountll(map) != g_numthreads;
+                               fail = (__builtin_popcountll(map) != g_numthreads) && (worst_latencies_ns[i] > g_traceworthy_latency_ns);
                        } else if (g_test_rt_avoid0) {
                                fail = ((map & 0x1) == 0x1);
                        }
@@ -1109,6 +1122,7 @@ parse_args(int argc, char *argv[])
                { "test-rt",            no_argument,            (int*)&g_test_rt,               TRUE },
                { "test-rt-smt",        no_argument,            (int*)&g_test_rt_smt,           TRUE },
                { "test-rt-avoid0",     no_argument,            (int*)&g_test_rt_avoid0,        TRUE },
+               { "histogram",          no_argument,            (int*)&g_histogram,             TRUE },
                { "verbose",            no_argument,            (int*)&g_verbose,               TRUE },
                { "help",               no_argument,            NULL,                           'h' },
                { NULL,                 0,                      NULL,                           0 }
index 07cbd9fbdfe2684061ae28eac8a4fe3c42420306..60c9dc5d92a6da0ef89642f7ada009f2489a9109 100755 (executable)
@@ -75,10 +75,7 @@ function state_string(strings, state)
 end
 
 kqrequest_state_strings = {
-       ['THREQUESTED'] = 0x02,
-       ['WAKEUP'] = 0x04,
-       ['BOUND'] = 0x08,
-       ['DRAIN'] = 0x40,
+       ['WAKEUP'] = 1,
 }
 
 kqueue_state_strings = {
@@ -94,6 +91,8 @@ kqueue_state_strings = {
        ['DRAIN'] = 0x200,
        ['WAKEUP'] = 0x400,
        ['DYNAMIC'] = 0x800,
+       ['R2K'] = 0x1000,
+       ['TURNSTILE'] = 0x2000,
 }
 
 knote_state_strings = {
@@ -102,16 +101,13 @@ knote_state_strings = {
        ['DISABLED'] = 0x0004,
        ['DROPPING'] = 0x0008,
        ['LOCKED'] = 0x0010,
-       ['ATTACHING'] = 0x0020,
+       ['POSTING'] = 0x0020,
        ['STAYACTIVE'] = 0x0040,
        ['DEFERDELETE'] = 0x0080,
-       ['ATTACHED'] = 0x0100,
-       ['DISPATCH'] = 0x0200,
-       ['UDATA_SPECIFIC'] = 0x0400,
+       ['MERGE_QOS'] = 0x0100,
+       ['REQVANISH'] = 0x0200,
+       ['VANISHED'] = 0x0400,
        ['SUPPRESSED'] = 0x0800,
-       ['MERGE_QOS'] = 0x1000,
-       ['REQVANISH'] = 0x2000,
-       ['VANISHED'] = 0x4000,
 }
 
 kevent_flags_strings = {
@@ -159,36 +155,53 @@ function kevent_filter_string(filt)
                return 'SOCK'
        elseif filt == -14 then
                return 'MEMORYSTATUS'
-       elseif filt == 15 then
-               return 'KQREAD'
-       elseif filt == 16 then
-               return 'PIPE_R'
+       elseif filt == -15 then
+               return 'EXCEPT'
+       elseif filt == -16 then
+               return 'NW_CHANNEL'
+       elseif filt == -17 then
+               return 'WORKLOOP'
+
        elseif filt == 17 then
-               return 'PIPE_W'
+               return 'KQREAD'
        elseif filt == 18 then
-               return 'PTSD'
+               return 'PIPE_N'
        elseif filt == 19 then
-               return 'SOWRITE'
+               return 'PIPE_R'
        elseif filt == 20 then
-               return 'SOEXCEPT'
+               return 'PIPE_W'
        elseif filt == 21 then
-               return 'SPEC'
+               return 'PTSD'
        elseif filt == 22 then
-               return 'BPFREAD'
+               return 'SOREAD'
        elseif filt == 23 then
-               return 'NECP_FD'
+               return 'SOWRITE'
        elseif filt == 24 then
-               return 'SKYWALK_CHANNEL_W'
+               return 'SCK'
        elseif filt == 25 then
-               return 'SKYWALK_CHANNEL_R'
+               return 'SOEXCEPT'
        elseif filt == 26 then
-               return 'FSEVENT'
+               return 'SPEC'
        elseif filt == 27 then
-               return 'VN'
+               return 'BPFREAD'
        elseif filt == 28 then
-               return 'SKYWALK_CHANNEL_E'
+               return 'NECP_FD'
        elseif filt == 29 then
+               return 'SKYWALK_CHANNEL_W'
+       elseif filt == 30 then
+               return 'SKYWALK_CHANNEL_R'
+       elseif filt == 31 then
+               return 'SKYWALK_CHANNEL_E'
+       elseif filt == 32 then
+               return 'FSEVENT'
+       elseif filt == 33 then
+               return 'VN'
+       elseif filt == 34 then
                return 'TTY'
+       elseif filt == 35 then
+               return 'PTMX'
+       elseif filt == 36 then
+               return 'DETACHED'
        else
                return string.format('[%d]', filt)
        end
index ae853d43322fae6d96639c560a633d70c9fc4f90..970dde6fe933fcb3dde9a659daecedd5deadd9f8 100755 (executable)
@@ -149,6 +149,8 @@ trace_codename("wq_select_threadreq", function(buf)
                printf("%s\tSelection failed: no request\n", prefix)
        elseif buf[2] == 2 then
                printf("%s\tSelection failed: throttled\n", prefix)
+       elseif buf[2] == 3 then
+               printf("%s\tSelection failed: scheduler would preempt\n", prefix)
        end
 end)